diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 76f6d7aeca0d86f128dc820005166044525547eb..77ee313687fc8a8a1388f90475566fbaea276bba 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 
 
 def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
similarity index 59%
rename from .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
rename to .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
index 1bce7e7fdf146fe6e19f4e5191a0b310364535fa..56ec933c9cc0e5e1fc8041db7f485fa272575d20 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -1,12 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.595
+    value: 0.419
   - name: "exact_match,flexible-extract"
-    value: 0.582
+    value: 0.416
 limit: 1000
 num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ccb4f84201b77437200ae69a0d5a28869a0dfe01
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..46f1a9fbf6ff95465e75b47ee23fcaf077fa8852
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,10 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index a2f235f485815848db1106809d714fd519f24e60..aa4fb9fa03d6d7baa0868bdacb650a82e7e25200 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5f3c31743e75ba0298e75489300a0caa96995dde
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..514c15d6098ed767e2338886b7a9887ee43c56ee
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+  - name: "mmlu_pro"
+    metrics:
+      - name: "exact_match,custom-extract"
+        value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
diff --git a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5552391d9eababaf498e82b245ff297cd0c65e68
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
new file mode 100644
index 0000000000000000000000000000000000000000..91e22b6459c1235d6ed7877618423caa7dc297af
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1097d220245fc42a9ea77bcde11e5b410614b3eb
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
\ No newline at end of file
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c8db951381b0bd8b4c36ffe0b97c2155aea5c52b
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    t ) 
+        TP_SIZE="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
old mode 100644
new mode 100755
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d85a1721db9a59d46ab9a7fdaf52b68c8dc13186
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index ceea01166b7f414c36e11e0d7007bca7b580b740..3627b760eddcf769538db2f8e6ba94746847bb08 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -19,21 +19,35 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
+        batch_size=batch_size,
     )
     return results
 
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
deleted file mode 100644
index 4259514940d3fc7b4e3d3062047267f1a40a1b1a..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ /dev/null
@@ -1,184 +0,0 @@
-steps:
-  - label: "Wait for container to be ready"
-    key: wait-for-container-image
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: badouralix/curl-jq
-            command:
-            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
-  
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md
deleted file mode 100644
index 466def07b6f1f676607c5f47391530ae7454afae..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# Nightly benchmark annotation
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
-- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
-- [benchmarking results](artifact://results.zip)
-- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-## Results reproduction
-
-- Find the docker we use in `benchmarking pipeline`
-- Deploy the docker, and inside the docker:
-    - Download `nightly-benchmarks.zip`.
-    - In the same folder, run the following code:
-
-    ```bash
-    export HF_TOKEN=<your HF token>
-    apt update
-    apt install -y git
-    unzip nightly-benchmarks.zip
-    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-    ```
-
-And the results will be inside `./benchmarks/results`.
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
deleted file mode 100644
index 2ef36089b6afb88a222c286d62e5dd9ed8d2872d..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ /dev/null
@@ -1,39 +0,0 @@
-
-# Nightly benchmark
-
-This benchmark aims to:
-
-- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
-- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
-
-## Setup
-
-- Docker images:
-    - vLLM: `vllm/vllm-openai:v0.6.2`
-    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
-- Hardware
-    - 8x Nvidia A100 GPUs
-- Workload:
-    - Dataset
-        - ShareGPT dataset
-        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-    - Models: llama-3 8B, llama-3 70B.
-        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-## Known issues
-
-- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
-- TGI does not support `ignore-eos` flag.
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
deleted file mode 100644
index 199517e8b067c05df6375a7bcbc35e5a93ddb838..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ /dev/null
@@ -1,196 +0,0 @@
-common_pod_spec: &common_pod_spec
-  priorityClassName: perf-benchmark
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-    - name: devshm
-      emptyDir:
-        medium: Memory
-    - name: hf-cache
-      hostPath:
-        path: /root/.cache/huggingface
-        type: Directory
-
-common_container_settings: &common_container_settings
-  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  resources:
-    limits:
-      nvidia.com/gpu: 8
-  volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-  env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-
-steps:
-  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-
-
-  - label: "A100 vllm step 10"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: vllm/vllm-openai:v0.6.2
-                <<: *common_container_settings
-
-
-
-  - label: "A100 sglang benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: lmsysorg/sglang:v0.3.2-cu121
-                <<: *common_container_settings
-
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: openmmlab/lmdeploy:v0.6.1-cu12
-                <<: *common_container_settings
-
-
-
-
-  - label: "A100 trt llama-8B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama8B"
-
-
-  - label: "A100 trt llama-70B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama70B"
-
-
-  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-  #               <<: *common_container_settings
-
-
-  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
-  #               <<: *common_container_settings
-        
-  - wait
-
-  - label: "Collect the results"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-            - image: vllm/vllm-openai:v0.5.0.post1
-              command:
-              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-              resources:
-                limits:
-                  nvidia.com/gpu: 8
-              volumeMounts:
-              - name: devshm
-                mountPath: /dev/shm
-              env:
-              - name: VLLM_USAGE_SOURCE
-                value: ci-test
-              - name: VLLM_SOURCE_CODE_LOC
-                value: /workspace/build/buildkite/vllm/performance-benchmark
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: token
-
-  - block: ":rocket: check the results!"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
deleted file mode 100644
index 8532ff7ef798cc039926ef8781fccab71f5e17a9..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-
-from transformers import AutoTokenizer
-
-
-def main(model, cachedir):
-    # Load the tokenizer and save it to the specified directory
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    tokenizer.save_pretrained(cachedir)
-    print(f"Tokenizer saved to {cachedir}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
-    )
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument(
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    )
-
-    args = parser.parse_args()
-    main(args.model, args.cachedir)
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
deleted file mode 100644
index 053fd52c35ae906710cbe08579becf0eafc72c4b..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
-    )
-    parser.add_argument(
-        "--results-folder",
-        type=str,
-        required=True,
-        help="The folder where the results are stored.",
-    )
-    parser.add_argument(
-        "--description", type=str, required=True, help="Description of the results."
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-def get_perf(df, method, model, metric):
-    means = []
-
-    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
-        filtered_df = df[target]
-
-        if filtered_df.empty:
-            means.append(0.0)
-        else:
-            means.append(filtered_df[metric].values[0])
-
-    return np.array(means)
-
-
-def get_perf_w_std(df, method, model, metric):
-    if metric in ["TTFT", "ITL"]:
-        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
-        mean = mean.tolist()
-        std = get_perf(df, method, model, "Std " + metric + " (ms)")
-        if std.mean() == 0:
-            std = None
-        success = get_perf(df, method, model, "Successful req.")
-        if std is not None:
-            std = std / np.sqrt(success)
-            std = std.tolist()
-
-    else:
-        assert metric == "Tput"
-        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
-        )
-        mean = mean.tolist()
-        std = None
-
-    return mean, std
-
-
-def main(args):
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file) as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
-
-    with open(args.description) as f:
-        description = f.read()
-
-    description = description.format(nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    main(args)
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
deleted file mode 100644
index ddea1d2b1b1ed5710f5f878c663e83f388a62b73..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from lmdeploy.serve.openai.api_client import APIClient
-
-api_client = APIClient("http://localhost:8000")
-model_name = api_client.available_models[0]
-
-print(model_name)
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
deleted file mode 100644
index 69b6b146b3549bee8d6c3fb336c306de0cf9de55..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-
-main() {
-
-    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-    (which jq) || (apt-get update && apt-get -y install jq)
-    (which zip) || (apt-get install -y zip)
-
-    if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip plotting the results."
-        exit 0
-    fi
-
-    # initial annotation
-    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
-
-    # download results
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    mkdir -p results/
-    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
-    ls
-    ls results/
-
-    # upload benchmark results
-    zip -r results.zip results/
-    /workspace/buildkite-agent artifact upload "results.zip"
-
-    # upload benchmarking scripts
-    cd "$VLLM_SOURCE_CODE_LOC/"
-    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
-    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    # upload benchmarking pipeline
-    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
-    
-
-
-    # The figures should be generated by a separate process outside the CI/CD pipeline
-
-    # # generate figures
-    # python3 -m pip install tabulate pandas matplotlib
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
-    #     --description $description \
-    #     --results-folder results/ 
-
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sharegpt
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_2048_128
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_128_2048
-    
-    # # upload results and figures
-    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
-}
-
-main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
deleted file mode 100644
index a00de940cbbb81bff911996a7ec56d0c83c8470d..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ /dev/null
@@ -1,464 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-get_current_llm_serving_engine() {
-
-  if which lmdeploy >/dev/null; then
-    echo "Container: lmdeploy"
-    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
-    return
-  fi
-
-  if [ -e /tgi-entrypoint.sh ]; then
-    echo "Container: tgi"
-    export CURRENT_LLM_SERVING_ENGINE=tgi
-    return
-  fi
-
-  if which trtllm-build >/dev/null; then
-    echo "Container: tensorrt-llm"
-    export CURRENT_LLM_SERVING_ENGINE=trt
-    return
-  fi
-
-  if [ -e /sgl-workspace ]; then
-    echo "Container: sglang"
-    export CURRENT_LLM_SERVING_ENGINE=sglang
-    return
-  fi
-
-  if [ -e /vllm-workspace ]; then
-    echo "Container: vllm"
-    # move to a completely irrelevant directory, to avoid import vllm from current folder
-    export CURRENT_LLM_SERVING_ENGINE=vllm
-
-    return
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-kill_gpu_processes() {
-  pkill -f '[p]ython'
-  pkill -f '[p]ython3'
-  pkill -f '[t]ritonserver'
-  pkill -f '[p]t_main_thread'
-  pkill -f '[t]ext-generation'
-  pkill -f '[l]mdeploy'
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pkill -f '[V]LLM'
-
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-ensure_installed() {
-  # Ensure that the given command is installed by apt-get
-  local cmd=$1
-  if ! which "$cmd" >/dev/null; then
-    apt-get update && apt-get install -y "$cmd"
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # prepare tokenizer
-    # this is required for lmdeploy.
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-
-
-    # change model name for lmdeploy (it will not follow standard hf name)
-    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
-      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ $backend = "trt" ]]; then
-        backend="tensorrt-llm"
-      fi
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-
-      if [[ "$dataset_name" = "sharegpt" ]]; then
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      elif [[ "$dataset_name" = "sonnet" ]]; then
-
-        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
-        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
-        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --sonnet-input-len $sonnet_input_len \
-          --sonnet-output-len $sonnet_output_len \
-          --sonnet-prefix-len $sonnet_prefix_len \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      else
-
-        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
-        exit 1
-
-      fi
-
-
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      server_command="None"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-  done
-
-  kill_gpu_processes
-}
-
-run_genai_perf_tests() {
-  # run genai-perf tests
-
-  # $1: a json file specifying genai-perf test cases
-  local genai_perf_test_file
-  genai_perf_test_file=$1
-
-  # Iterate over genai-perf tests
-  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps=$num_prompts
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-      #TODO: add output dir.
-      client_command="genai-perf profile \
-        -m $model \
-        --service-kind openai \
-        --backend "$backend" \
-        --endpoint-type chat \
-        --streaming \
-        --url localhost:$port \
-        --request-rate $qps \
-        --num-prompts $num_prompts \
-      "
-
-    echo "Client command: $client_command"
-
-    eval "$client_command"
-
-    #TODO: process/record outputs
-    done
-  done
-
-  kill_gpu_processes
-
-}
-
-prepare_dataset() {
-
-  # download sharegpt dataset
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-
-}
-
-main() {
-
-  # check if the environment variable is successfully injected from yaml
-
-  check_gpus
-  check_hf_token
-  get_current_llm_serving_engine
-
-  pip install -U transformers
-
-  pip install -r requirements/dev.txt
-  which genai-perf
-
-  # check storage
-  df -h
-
-  ensure_installed wget
-  ensure_installed curl
-  ensure_installed jq
-  # genai-perf dependency
-  ensure_installed libb64-0d
-
-  prepare_dataset
-
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-
-  # run the test
-  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
-
-  # run genai-perf tests
-  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
-  mv artifacts/ $RESULTS_FOLDER/
-
-  # upload benchmark results to buildkite
-  python3 -m pip install tabulate pandas
-  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
-  upload_to_buildkite
-
-}
-
-main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
deleted file mode 100644
index fb3b9d5e34e03c7708f560301a7b802f3e3906bc..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import datetime
-import json
-import os
-from pathlib import Path
-
-import pandas as pd
-from tabulate import tabulate
-
-results_folder = Path("results/")
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "completed": "Successful req.",
-    "request_throughput": "Tput (req/s)",
-    "mean_ttft_ms": "Mean TTFT (ms)",
-    "std_ttft_ms": "Std TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "mean_itl_ms": "Mean ITL (ms)",
-    "std_itl_ms": "Std ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "std_tpot_ms": "Std TPOT (ms)",
-    "median_tpot_ms": "Median TPOT (ms)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
-    "total_input_tokens": "Total input tokens",
-    "total_output_tokens": "Total output tokens",
-    "engine": "Engine",
-}
-
-if __name__ == "__main__":
-    # collect results
-    for test_file in results_folder.glob("*.json"):
-        with open(test_file) as f:
-            raw_result = json.loads(f.read())
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands")) as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    serving_results = pd.DataFrame.from_dict(serving_results)
-
-    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
-
-    serving_md_table_with_headers = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
-
-    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
-
-    # document benchmarking results in markdown
-    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        # document results with header.
-        # for those who wants to reproduce our benchmark.
-        f.write(serving_md_table_with_headers)
-        f.write("\n")
-
-    # document benchmarking results in json
-    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
-        f.write(json.dumps(results))
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
deleted file mode 100644
index 50e1ab0242202854c8a978b9133c8de79a98a18b..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
-
-TIMEOUT_SECONDS=10
-
-retries=0
-while [ $retries -lt 1000 ]; do
-    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
-        exit 0
-    fi
-
-    echo "Waiting for image to be available..."
-
-    retries=$((retries + 1))
-    sleep 5
-done
-
-exit 1
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
deleted file mode 100644
index 569117aae852d0c78c64a71a610678584ed62d19..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ /dev/null
@@ -1,30 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama8B_tp4",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
deleted file mode 100644
index 48c015aa8403b1bec6e7a014bc6317feee3e56ad..0000000000000000000000000000000000000000
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ /dev/null
@@ -1,32 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_llama8B_tp4",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
similarity index 69%
rename from .buildkite/nightly-benchmarks/README.md
rename to .buildkite/performance-benchmarks/README.md
index e6f5c8b60f459a1d25d45493807e407ffca5bab3..6d494f64f14fa18086391c35975641b612a3d700 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -2,40 +2,23 @@
 
 ## Introduction
 
-This directory contains two sets of benchmark for vllm.
-
-- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
-- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
 
 **Benchmarking Duration**: about 1hr.
 
 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
 
-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark
 
-Performance benchmark will be triggered when:
-
-- A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
+The benchmark needs to be triggered manually:
 
 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 
 Runtime environment variables:
@@ -47,14 +30,11 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 
-Nightly benchmark will be triggered when:
-
-- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
-
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 ### Latency test
 
@@ -152,26 +132,3 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
-- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
-- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
-- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
similarity index 92%
rename from .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
rename to .buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
index 8bb16bd3cf373add01a8766a36e70fa236824d49..b9437ac5ca99a7364e567a3fcc1171e2939206a6 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
similarity index 58%
rename from .buildkite/nightly-benchmarks/scripts/compare-json-results.py
rename to .buildkite/performance-benchmarks/scripts/compare-json-results.py
index 5ea5a50a258a434cbece3fa82163ce53f4a86e7e..c8bf7b0453662d71dff5a6be0f48d2ceb63785e3 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -7,6 +7,7 @@ from importlib import util
 
 import pandas as pd
 
+pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
 
 
@@ -109,7 +110,10 @@ def compare_data_columns(
         if len(compare_frames) >= 2:
             base = compare_frames[0]
             current = compare_frames[-1]
-            ratio = current / base
+            if "P99" in data_column or "Median" in data_column:
+                ratio = base / current  # for latency
+            else:
+                ratio = current / base
             ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
             ratio.name = f"Ratio 1 vs {len(compare_frames)}"
             frames.append(ratio)
@@ -199,6 +203,71 @@ def split_json_by_tp_pp(
     return saved_paths
 
 
+def _add_limit_line(fig, y_value, label):
+    # Visible dashed line + annotation
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    # Optional: add a legend item (as a transparent helper trace)
+    if plot and plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
+                ),
+                name=f"{label}",
+            )
+        )
+
+
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    # Fallback: guess an integer-like column (harmless if unused)
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> "pd.io.formats.style.Styler":
+    """Highlight numeric per-configuration columns with value <= threshold."""
+    conc_col = _find_concurrency_col(df)
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -220,6 +289,26 @@ if __name__ == "__main__":
         default="# of max concurrency.",
         help="column name to use as X Axis in comparison graph",
     )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="take median|p99 for latency like TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
+
     args = parser.parse_args()
 
     drop_column = "P99"
@@ -234,12 +323,22 @@ if __name__ == "__main__":
         "# of max concurrency.",
         "qps",
     ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
+
+    if "median" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "Median TTFT /n",
+            "Median TPOT /n",
+        ]
+        drop_column = "P99"
+    elif "p99" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "P99 TTFT /n",
+            "P99 TPOT /n",
+        ]
 
     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
@@ -275,33 +374,83 @@ if __name__ == "__main__":
                     f"Expected subset: {filtered_info_cols}, "
                     f"but DataFrame has: {list(output_df.columns)}"
                 )
-            output_df_sorted = output_df.sort_values(by=existing_group_cols)
+            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
+            output_df_sorted = output_df.sort_values(by=args.xaxis)
             output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
             for name, group in output_groups:
-                html = group.to_html()
-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-
-                if plot and plotly_found:
-                    import plotly.express as px
-
-                    df = group[raw_data_cols]
-                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                    # Melt DataFrame for plotting
-                    df_melted = df_sorted.melt(
-                        id_vars=info_cols[y_axis_index],
-                        var_name="Configuration",
-                        value_name=data_cols_to_compare[i],
+                group_name = (
+                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
+                )
+                group_html_name = "perf_comparison_" + group_name + ".html"
+
+                metric_name = str(data_cols_to_compare[i]).lower()
+                if "tok/s" in metric_name:
+                    html = group.to_html()
+                elif "ttft" in metric_name:
+                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
+                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                        na_rep="—",
+                    )
+                    html = styler.to_html(
+                        table_attributes='border="1" class="dataframe"'
+                    )
+                elif (
+                    "tpot" in metric_name
+                    or "median" in metric_name
+                    or "p99" in metric_name
+                ):
+                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
+                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                        na_rep="—",
                     )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                    # Create Plotly line chart
-                    fig = px.line(
-                        df_melted,
-                        x=info_cols[y_axis_index],
-                        y=data_cols_to_compare[i],
-                        color="Configuration",
-                        title=title,
-                        markers=True,
+                    html = styler.to_html(
+                        table_attributes='border="1" class="dataframe"'
                     )
-                    # Export to HTML
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+
+                text_file.write(html_msgs_for_data_cols[i])
+                text_file.write(html)
+                with open(group_html_name, "a+") as sub_text_file:
+                    sub_text_file.write(html_msgs_for_data_cols[i])
+                    sub_text_file.write(html)
+
+                    if plot and plotly_found:
+                        import plotly.express as px
+
+                        df = group[raw_data_cols]
+                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                        # Melt DataFrame for plotting
+                        df_melted = df_sorted.melt(
+                            id_vars=info_cols[y_axis_index],
+                            var_name="Configuration",
+                            value_name=data_cols_to_compare[i],
+                        )
+                        title = (
+                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                        )
+                        # Create Plotly line chart
+                        fig = px.line(
+                            df_melted,
+                            x=info_cols[y_axis_index],
+                            y=data_cols_to_compare[i],
+                            color="Configuration",
+                            title=title,
+                            markers=True,
+                        )
+
+                        # ---- Add threshold lines based on metric name ----
+                        if "ttft" in metric_name:
+                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+                        elif (
+                            "tpot" in metric_name
+                            or "median" in metric_name
+                            or "p99" in metric_name
+                        ):
+                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+                        # Export to HTML
+                        text_file.write(
+                            fig.to_html(full_html=True, include_plotlyjs="cdn")
+                        )
+                        sub_text_file.write(
+                            fig.to_html(full_html=True, include_plotlyjs="cdn")
+                        )
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
similarity index 98%
rename from .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
rename to .buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
index 77047636bb9513a96895c1620700e3f6e7482ee8..80bb4d846a22616c0eef7c832b22ef251aa41da2 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -63,9 +63,11 @@ serving_column_mapping = {
     "mean_ttft_ms": "Mean TTFT (ms)",
     "median_ttft_ms": "Median TTFT (ms)",
     "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
     "mean_tpot_ms": "Mean TPOT (ms)",
     "median_tpot_ms": "Median",
     "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median ITL (ms)",
     "p99_itl_ms": "P99 ITL (ms)",
@@ -368,7 +370,7 @@ if __name__ == "__main__":
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
         )
 
     # get markdown tables
@@ -390,7 +392,7 @@ if __name__ == "__main__":
     json_file = "benchmark_results.json"
     with open(results_folder / md_file, "w") as f:
         results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
             + "performance-benchmarks-descriptions.md"
         )
         results = results.format(
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/performance-benchmarks/scripts/launch-server.sh
similarity index 97%
rename from .buildkite/nightly-benchmarks/scripts/launch-server.sh
rename to .buildkite/performance-benchmarks/scripts/launch-server.sh
index fb5063db86942a15f3ca5cd3d645b7ae03acd468..ebacdcbd6821ba1e369048af7b9d16e49c397b50 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   fi
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
similarity index 95%
rename from .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
rename to .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index b1b7d2d77a44d962fec74cc58680a6c1ed1ade59..99a5a5e334f8ec0f7502bb942671af19863936a9 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi
 
   if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
+  
+  declare -g arch_suffix=''
+  
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@@ -138,6 +146,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
       sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi
 
   # remove vllm config file
@@ -365,8 +377,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
       $server_args"
 
     # run the server
@@ -452,14 +463,10 @@ main() {
      ARCH='-cpu'
   else
      check_gpus
+     ARCH="$arch_suffix"
   fi
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@@ -475,7 +482,12 @@ main() {
   ensure_sharegpt_downloaded
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
+
+  # dump vllm info via vllm collect-env
+  env_output=$(vllm collect-env)
+
+  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
 
   # benchmarking
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/genai-perf-tests.json
rename to .buildkite/performance-benchmarks/tests/genai-perf-tests.json
diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..77d1694ec8641a574bea94c5bc8b092262e7ae3d
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
@@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp2",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..296380f72a668b8ce41dc55379d2841d2fd70744
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/performance-benchmarks/tests/latency-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/latency-tests.json
rename to .buildkite/performance-benchmarks/tests/latency-tests.json
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/performance-benchmarks/tests/nightly-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/nightly-tests.json
rename to .buildkite/performance-benchmarks/tests/nightly-tests.json
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
similarity index 80%
rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
index ce396d6e54f278cd8605a844469828301dff212f..0b1a42e79025555e3e5925612202d6c40feb1c93 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -95,6 +95,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -233,6 +265,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
         "qps_list": ["inf"],
@@ -365,6 +432,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -503,6 +602,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
         "qps_list": ["inf"],
@@ -638,6 +772,39 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -780,6 +947,42 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_int4_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
         "qps_list": ["inf"],
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
similarity index 52%
rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index e21c8df0a9fe955e0f36392bc3a65bcd83de620b..f792956f39472e0e6f232aaff3756d7b0c6837a2 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -2,7 +2,7 @@
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -28,13 +28,13 @@
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
         }
     },
     {
         "test_name": "serving_llama8B_tp2_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -60,13 +60,13 @@
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_tp1_random_128_128",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -76,11 +76,12 @@
         },
         "server_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
+            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
@@ -90,15 +91,122 @@
         "client_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
             "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp1_random_128_2048",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 2048,
+	    "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_128_2048",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 2048,
+	    "ignore-eos": "",
+            "num_prompts": 32
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_random_1024_128",
+        "test_name": "serving_llama8B_tp1_random_2048_128",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -108,7 +216,7 @@
         },
         "server_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
+            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@@ -124,16 +232,16 @@
             "model": "meta-llama/Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "random",
-	    "random-input-len": 1024,
+	    "random-input-len": 2048,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
         }
     },
     {
-        "test_name": "serving_llama8B_pp6_random_1024_128",
+        "test_name": "serving_llama8B_tp2_random_2048_128",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -143,7 +251,7 @@
         },
         "server_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 6,
+            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@@ -159,10 +267,10 @@
             "model": "meta-llama/Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "random",
-	    "random-input-len": 1024,
+	    "random-input-len": 2048,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
         }
     }
 ]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c6b34bd9fa33367a020888fddfe0fc3a5ad2108
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/serving-tests.json
rename to .buildkite/performance-benchmarks/tests/serving-tests.json
diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc214ddfb27e3a4d66456cb856f1ecbdac15babd
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
@@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp2",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
new file mode 100644
index 0000000000000000000000000000000000000000..3127bf2f6bce376906f419b46424134c86bd97ff
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -0,0 +1,61 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    }
+]
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/performance-benchmarks/tests/throughput-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/throughput-tests.json
rename to .buildkite/performance-benchmarks/tests/throughput-tests.json
diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml
deleted file mode 100644
index d5cad1c73c6f895d446dd2b594915d7af8ac9f7a..0000000000000000000000000000000000000000
--- a/.buildkite/pyproject.toml
+++ /dev/null
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 7677d783fabc26ae34fae7b579df3cde123d1cc2..38c400ba1faf5037e7ed0115fc489c1333218f66 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
@@ -8,13 +8,28 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
     depends_on: ~
     id: build-wheel-cuda-12-8
@@ -28,33 +43,33 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.6"
+  - label: "Build wheel - CUDA 12.9"
     depends_on: ~
-    id: build-wheel-cuda-12-6
+    id: build-wheel-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
+  - label: "Build wheel - CUDA 13.0"
     depends_on: ~
-    id: build-wheel-cuda-12-9
+    id: build-wheel-cuda-13-0
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86
@@ -62,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64
@@ -76,7 +90,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
   # Add job to create multi-arch manifest
@@ -102,24 +116,6 @@ steps:
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
 
-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
   - input: "Provide Release version here"
     id: input-release-version
     fields:
@@ -136,12 +132,28 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
@@ -150,11 +162,16 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -163,3 +180,4 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index fde48603ad3cd1dc01b455e291f23badb02037e2..56bb5cedaa0a96259f2b4159f1abeaca8061cf0e 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -2,16 +2,23 @@
 
 set -ex
 
-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
 fi
 
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
+To download the wheel (by commit):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download the wheel (by version):
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
index 1a82f7d085233bdc8262a34201cb610b3ca393b4..f02a128c6772665aa7996f9f4442ebc4f5219e0d 100755
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
 
-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
     echo "Error: DOCKERHUB_TOKEN environment variable is not set"
     exit 1
 fi
 
+if [ -z "$DOCKERHUB_USERNAME" ]; then
+    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+    exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+    "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+    echo "Error: Failed to get DockerHub bearer token"
+    exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
     local page=1
     local all_tags=""
     
     while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        set +x
+        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
             "$REPO_API_URL?page=$page&page_size=100")
+        set -x
         
         # Get both last_updated timestamp and tag name, separated by |
         local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
     echo "Deleting tag: $tag_name"
     
     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    set +x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+    set -x
     
     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index aa4cc7b35a5434013be24a5baecb748e283347a2..864eb470bb0a7e33cd9067ec912a30a3e2722ee3 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -59,7 +59,7 @@ while true; do
         fi
 done
 
-echo "--- Pulling container" 
+echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 
-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s lora"* ]]; then
   commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -173,19 +169,28 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 
-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used   
-  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+  # assign job count as the number of shards used
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
     # assign shard-id for each shard
-    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
     echo "Shard ${GPU} commands:$commands_gpu"
     echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
     docker run \
         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
         --network=host \
         --shm-size=16gb \
+        --group-add "$render_gid" \
         --rm \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
@@ -217,8 +222,8 @@ else
           --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
           --network=host \
           --shm-size=16gb \
+          --group-add "$render_gid" \
           --rm \
-          -e HIP_VISIBLE_DEVICES=0 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 36bcb015d308ebf096662060c4e9726db10e0927..39ea1801730819b713c1d1e868f61bf7af241797 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,25 +25,28 @@ function cpu_tests() {
 
   # offline inference
   podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
   # Run basic model test
   podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
 
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 7512cb1bbed016ea37d5645fdbcb3f20c9cbd540..7479c43977d780af179d0a0fd21ebc7041425988 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -49,6 +49,7 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
     pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test
@@ -70,13 +71,13 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
   # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests
@@ -116,4 +117,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..29c8f5ed5a91aeefb14a74fb722d071741dda971
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script build the Ascend NPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Base ubuntu image with basic ascend development libraries and python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+    cleanup() {
+        rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    }
+    trap cleanup EXIT
+
+    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
+        exit 1
+    fi
+
+    # If the file already exists locally, just overwrite it
+    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
+    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
+    rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    trap - EXIT
+}
+
+# Downloads test run configuration file from a remote URL.
+# Loads the configuration into the current script environment.
+get_config() {
+    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
+        exit 1
+    fi
+    source "${TEST_RUN_CONFIG_FILE}"
+    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
+    return 0
+}
+
+# get test running configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+  exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | DOCKER_BUILDKIT=1 docker build \
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
+    --progress=plain --load -t ${image_name} -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
+    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
+    apt-get update -y && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install for pytest to make the docker build cache layer always valid
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install pytest>=6.0  modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+  docker rm -f "${container_name}" || true;
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+#   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
+#   returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+    local input="$1"
+    local index cards_num
+    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+        index="${BASH_REMATCH[1]}"
+        cards_num="${BASH_REMATCH[2]}"
+    else
+        echo "parse error" >&2
+        return 1
+    fi
+
+    local devices=""
+    local i=0
+    while (( i < cards_num )); do
+        local dev_idx=$(((index - 1)*cards_num + i ))
+        devices="$devices --device /dev/davinci${dev_idx}"
+        ((i++))
+    done
+
+    # trim leading space
+    devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Output devices: assigned to the caller variable
+    printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+    ${devices} \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -v ${model_cache_dir}:/root/.cache/modelscope \
+    --entrypoint="" \
+    --name "${container_name}" \
+    "${image_name}" \
+    bash -c '
+    set -e
+    pytest -v -s tests/e2e/vllm_interface/
+'
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index e76528a1782058d10d8ee9fb62ac8a7b19b96f04..cbb2527a4ff0aa30664f340c61f75833db469e2a 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 69366cd503219864f59a4cacdd3d33417c1cec22..f022fa3672eeba2774d51e34b928ea25c7d4d6e6 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 1fc3dbd8c21f4c46e5e85301d51bd39140a1e72c..d49f3e2f47cf1e3e26df72e1c6bb0bff00b9185b 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -20,7 +20,10 @@ trap remove_docker_container EXIT
 
 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
+    --net=host \
+    --ipc=host \
+    --privileged \
     -v /dev/dri/by-path:/dev/dri/by-path \
     --entrypoint="" \
     -e "HF_TOKEN=${HF_TOKEN}" \
@@ -42,9 +45,7 @@ docker run \
     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
     pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
     pytest -v -s v1/test_serial_utils.py
-    pytest -v -s v1/test_utils.py
-    pytest -v -s v1/test_metrics_reader.py
 '
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 72812218cb668368bfcf5b2811acf4bfb74102a2..51536b36b808de2a2bf6c2977d3990a2d90a115c 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5302f524a0ae445fc3ae957a072c72453e50ddf6
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8010}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a5135299297e25d69e6baf1aaad3d196c533f76a
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env
index bd25c803081a677b04faa7f45acf8629e97eb877..ecb98d4516bd533e02325fb0e68d8a14da7bf48b 100644
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index b1e17b438578d4a071f7b870b6c24ae6dc5d160e..3364fce8e1fdc2ffdf1c215266b6d496356c40b1 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
  --seed 42 \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 43aa8c47be2992975a9264d2da7f0e6d06698d4b..945c5e48c00902c1947dc50b586acf0ae02881b0 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2471b509a9fff761d242080d94f37b9639c46f89
--- /dev/null
+++ b/.buildkite/test-amd.yaml
@@ -0,0 +1,1476 @@
+# In this file, you can add more tests to run either by adding a new step or
+# adding a new command to an existing step. See different options here for examples.
+
+# This script will be feed into Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+
+# Documentation
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on the first host, the second
+#     command runs on the second host.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+#   Note that all steps execute in parallel.
+
+steps:
+##### fast check tests  #####
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
+
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/test_inputs.py
+  - tests/test_outputs.py
+  - tests/multimodal
+  - tests/standalone_tests/lazy_imports.py
+  - tests/transformers_utils
+  no_gpu: true
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s transformers_utils
+
+- label: Python-only Installation Test # 10min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
+- label: Basic Correctness Test # 20min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+
+- label: Entrypoints Unit Tests # 5min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration Test (API Server) # 100min
+  timeout_in_minutes: 130
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+- label: Entrypoints Integration Test (Pooling)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/test_basic_correctness
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - pushd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - popd
+
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  #- export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: EPLB Algorithm Test # 5min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
+
+- label: Metrics, Tracing Test # 12min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  # grade: Blocking
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/tracing
+  commands:
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0' \
+      'opentelemetry-api>=1.26.0' \
+      'opentelemetry-exporter-otlp>=1.26.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1'"
+  - pytest -v -s v1/tracing
+
+##### fast check tests  #####
+#####  1 GPU test  #####
+
+- label: Regression Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: Engine Test # 25min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/tokenization
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization
+
+- label: V1 Test e2e + engine # 30min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints # 35min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1/entrypoints
+
+- label: V1 Test others # 42min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # split the test to avoid interference
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - pytest -v -s -m 'not cpu_test' v1/core
+    - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/logits_processors
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/test_oracle.py
+    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
+    # Integration test for streaming correctness (requires special branch).
+    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+# TODO: Add the "V1 Test attetion (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 Test others (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+
+- label: Examples Test # 30min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - examples/
+  commands:
+    - pip install tensorizer # for tensorizer test
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/chat.py
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_pooling.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+
+- label: Platform Tests (CUDA) # 4min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+    - pytest -v -s cuda/test_cuda_context.py
+
+- label: Samplers Test # 56min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+
+- label: LoRA Test %N # 20min each
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    - pytest -v -s lora \
+      --shard-id=$$BUILDKITE_PARALLEL_JOB \
+      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+      --ignore=lora/test_chatglm3_tp.py \
+      --ignore=lora/test_llama_tp.py \
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss_tp.py \
+      --ignore=lora/test_qwen3moe_tp.py
+  parallelism: 4
+
+- label: PyTorch Compilation Unit Tests # 15min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+    - vllm/
+    - tests/compile
+  commands:
+    - pytest -v -s compile/test_pass_manager.py
+    - pytest -v -s compile/test_fusion.py
+    - pytest -v -s compile/test_fusion_attn.py
+    - pytest -v -s compile/test_functionalization.py
+    - pytest -v -s compile/test_silu_mul_quant_fusion.py
+  #  - pytest -v -s compile/test_sequence_parallelism.py
+  #  - pytest -v -s compile/test_async_tp.py
+    - pytest -v -s compile/test_fusion_all_reduce.py
+    - pytest -v -s compile/test_decorator.py
+    - pytest -v -s compile/test_noop_elimination.py
+    - pytest -v -s compile/test_aot_compile.py
+
+- label: PyTorch Fullgraph Smoke Test # 15min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
+  - pytest -v -s compile/piecewise/
+
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
+
+- label: Kernels Core Operation Test # 48min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
+  commands:
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N # 23min
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N # 64min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test %N # 40min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
+  commands:
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Mamba Test # 31min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
+  commands:
+    - pytest -v -s kernels/mamba
+
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
+  torch_nightly: true
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
+  - vllm/model_executor
+  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+
+- label: Benchmarks # 11min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
+
+- label: Quantization Test # 70min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: LM Eval Small Models # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval
+  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+  - pytest -s entrypoints/openai/correctness/  --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+
+- label: OpenAI-Compatible Tool Use # 23 min
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  fast_check: false
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
+
+#####  models test  #####
+
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_initialization.py
+  commands:
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test.) Also run if model initialization test file is modified
+    - pytest -v -s models/test_initialization.py \
+             -k 'not test_can_initialize_small_subset' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  no_gpu: true
+  commands:
+    - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language
+  commands:
+    # Test standard language models, excluding a subset of slow tests
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation \
+                   -m hybrid_model \
+                   --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+                   --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling
+  commands:
+    - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing
+
+- label: Multi-Modal Models Test (Standard) # 60min
+  timeout_in_minutes: 80
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  timeout_in_minutes: 70
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models Test (Extended) 1
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models Test (Extended) 2
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+- label: Quantized Models Test # 45 min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
+
+- label: Transformers Nightly Models Test
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/"
+  optional: true
+  commands:
+    - pip install --upgrade git+https://github.com/huggingface/transformers
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
+  commands:
+    - nvidia-smi
+    - python3 examples/offline_inference/basic/chat.py
+    # Attention
+    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+    # Quantization
+    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  - tests/compile/test_full_graph.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/test_fusions_e2e.py
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: ROCm GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  agent_pool: mi325_1
+  mirror_hardwares: [amdproduction]
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
+#####  1 GPU test  #####
+#####  multi gpus test  #####
+
+- label: Distributed Comm Ops Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: 2 Node Tests (4 GPUs in total) # 16min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+
+- label: Distributed Tests (2 GPUs) # 68min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+
+- label: Plugin Tests (2 GPUs) # 40min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+
+- label: Pipeline + Context Parallelism Test # 45min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: LoRA TP Test (Distributed) # 17 min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXIT: find out which code initialize cuda before running the test
+    # before the fix, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # There is some Tensor Parallelism related processing logic in LoRA that
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
+
+
+- label: Weight Loading Multiple GPU Test  # 33min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+##### multi gpus test #####
+##### A100 test #####
+
+- label: Distributed Tests (A100) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: LM Eval Large Models # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/compile/test_async_tp.py
+    - pytest -v -s tests/compile/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0914c899aa5b8878af5ba2be2ced9724f98620c0..4ac76aba67b9cde147b55907c702c01cab586954 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,6 +25,7 @@
 #     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
 # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.
 
 # When adding a test
 # - If the test belongs to an existing group, add it there
@@ -38,7 +39,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
   soft_fail: true
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
@@ -50,19 +51,30 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
-  - tests/utils_
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  - tests/config
+  no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s utils_ # Utils
-  - pytest -v -s transformers_utils # transformers_utils
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
 
 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -159,13 +171,12 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2
@@ -180,10 +191,10 @@ steps:
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
@@ -197,6 +208,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -206,8 +235,8 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py
 
-- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -215,6 +244,7 @@ steps:
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
 
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -289,27 +319,56 @@ steps:
     - vllm/
     - tests/v1
   commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     # split the test to avoid interference
-    - pytest -v -s v1/core
+    - pytest -v -s -m 'not cpu_test' v1/core
     - pytest -v -s v1/executor
     - pytest -v -s v1/kv_offload
     - pytest -v -s v1/sample
     - pytest -v -s v1/logits_processors
     - pytest -v -s v1/worker
-    - pytest -v -s v1/structured_output
     - pytest -v -s v1/spec_decode
-    - pytest -v -s v1/kv_connector/unit
-    - pytest -v -s v1/metrics
-    - pytest -v -s v1/test_kv_sharing.py
-    - pytest -v -s v1/test_metrics_reader.py
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
     - pytest -v -s v1/test_oracle.py
     - pytest -v -s v1/test_request.py
-    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/test_outputs.py
     # Integration test for streaming correctness (requires special branch).
     - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
+- label: V1 Test others (CPU) # 5 mins
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+
 - label: Examples Test # 30min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
@@ -334,7 +393,8 @@ steps:
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
     - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -369,7 +429,12 @@ steps:
       --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
       --ignore=lora/test_chatglm3_tp.py \
       --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss_tp.py \
+      --ignore=lora/test_qwen3moe_tp.py
+
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -380,15 +445,18 @@ steps:
     - vllm/
     - tests/compile
   commands:
+    - pytest -v -s compile/test_graph_partition.py
+    - pytest -v -s compile/test_config.py
     - pytest -v -s compile/test_pass_manager.py
     - pytest -v -s compile/test_fusion.py
     - pytest -v -s compile/test_fusion_attn.py
+    - pytest -v -s compile/test_functionalization.py
     - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/test_sequence_parallelism.py
-    - pytest -v -s compile/test_async_tp.py
     - pytest -v -s compile/test_fusion_all_reduce.py
     - pytest -v -s compile/test_decorator.py
     - pytest -v -s compile/test_noop_elimination.py
+    - pytest -v -s compile/test_aot_compile.py
+    - pytest -v -s compile/test_qk_norm_rope_fusion.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -399,17 +467,34 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py
+    # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
@@ -417,8 +502,9 @@ steps:
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
   commands:
-    - pytest -v -s kernels/core
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 
 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
@@ -452,6 +538,8 @@ steps:
   - tests/kernels/moe
   - vllm/model_executor/layers/fused_moe/
   - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
   commands:
     - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
@@ -462,32 +550,25 @@ steps:
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
   commands:
     - pytest -v -s kernels/mamba
 
-- label: Tensorizer Test # 14min
-  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
-  commands:
-    - apt-get update && apt-get install -y curl libsodium23
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s tensorizer_loader
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test # 7min
-  timeout_in_minutes: 20
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
+  torch_nightly: true
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
   - vllm/model_executor
   - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
     - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
 - label: Benchmarks # 11min
   timeout_in_minutes: 20
@@ -521,8 +602,9 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -530,6 +612,7 @@ steps:
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
+  autorun_on_main: true
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
@@ -550,10 +633,17 @@ steps:
   source_file_dependencies:
     - vllm/
     - tests/tool_use
-    - tests/mistral_tool_use
   commands:
-    - pytest -v -s tool_use
-    - pytest -v -s mistral_tool_use
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
 
 #####  models test  #####
 
@@ -593,13 +683,19 @@ steps:
   - vllm/
   - tests/models/test_transformers.py
   - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
   - tests/models/test_utils.py
   - tests/models/test_vision.py
+  no_gpu: true
   commands:
-    - pytest -v -s models/test_transformers.py \
-                   models/test_registry.py \
-                   models/test_utils.py \
-                   models/test_vision.py
+    - pytest -v -s models/test_utils.py models/test_vision.py
 
 - label: Language Models Tests (Standard)
   timeout_in_minutes: 25
@@ -658,8 +754,10 @@ steps:
   - vllm/
   - tests/models/language/generation
   commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
 - label: Language Models Test (PPL)
@@ -714,6 +812,16 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true
@@ -768,16 +876,17 @@ steps:
   optional: true
   commands:
     - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py
-    - pytest -v -s tests/models/multimodal/processing/
-    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+    - pytest -v -s tests/models/test_transformers.py
+    # - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
     - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
     # Whisper needs spawn method to avoid deadlock
     - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true
@@ -790,13 +899,16 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py
     # Attention
     # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
     - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
     - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
     - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -808,19 +920,64 @@ steps:
     - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
     - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
     - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
     - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
-    # Fusion
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
     - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion and Compile Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
 
-- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  - tests/compile/test_full_graph.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/test_fusions_e2e.py
+
+- label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py
@@ -828,7 +985,34 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   commands:
     - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
 
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -889,19 +1073,21 @@ steps:
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -945,6 +1131,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model
@@ -984,6 +1175,8 @@ steps:
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
     - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
 
 
 - label: Weight Loading Multiple GPU Test  # 33min
@@ -1010,6 +1203,17 @@ steps:
   commands:
     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
 
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1040,15 +1244,34 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
+    - pytest -v -s tests/compile/test_async_tp.py
+    - pytest -v -s tests/compile/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
     - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1059,6 +1282,7 @@ steps:
   commands:
     - pytest -v -s tests/distributed/test_context_parallel.py
     - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1071,3 +1295,21 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
     - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
diff --git a/.coveragerc b/.coveragerc
index bc6342956109bae444611e7f26700bf88b711c33..b7a9fdb4e05a872cd7f3e2515b09ddbd6635388e 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
     */tests/*
     */test_*
@@ -12,6 +17,16 @@ omit =
     */benchmarks/*
     */docs/*
 
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000000000000000000000000000000000000..5a601d00cef8b9720cd6b078da6d1a14c5fff072
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 9d749fe8d3238b9c6530ebe5e85d3fa238fd368c..6e178bb690c56f442f875ae9b694e0441f84801f 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,17 +3,13 @@
 
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
-/vllm/v1/attention @LucasWilkinson
-/vllm/v1/sample @22quinn @houseroad
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -24,44 +20,57 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC
 
 # Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin 
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
 /tests/evals @mgoin
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
-/tests/v1/kv_connector/nixl_integration @NickLucche 
+/tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
 
-# Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+# Transformers modeling backend
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 
+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
 # Docs
 /docs/mkdocs @hmellor
 /docs/**/*.yml @hmellor
@@ -106,11 +115,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep
 
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/**/*rocm* @tjtanaa
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa
 
 # TPU
 /vllm/v1/worker/tpu* @NickLucche
@@ -120,3 +139,16 @@ mkdocs.yaml @hmellor
 
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
+
+# Security guide and policies
+/docs/usage/security.md @russellb
+/SECURITY.md @russellb
+/docs/contributing/vulnerability_management.md @russellb
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 75ee3e3c55b4634d8e38ff8e615614af3ddb911d..997a40e18e5885c282984d61997bf1dbef3397f6 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -2,6 +2,7 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
+    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
     label:
       add:
         - documentation
+    comment:
+      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
+    - label != stale
     - or:
       - files~=^\.github/
       - files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
+    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:
@@ -55,6 +61,7 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
+    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py
@@ -94,11 +103,12 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
       - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
   actions:
     label:
       add:
@@ -107,6 +117,7 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py
@@ -121,6 +132,7 @@ pull_request_rules:
 - name: label-gpt-oss
   description: Automatically apply gpt-oss label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*gpt[-_]?oss.*\.py
       - files~=^tests/.*gpt[-_]?oss.*\.py
@@ -139,9 +151,27 @@ pull_request_rules:
       add:
         - gpt-oss
 
+- name: label-nvidia
+  description: Automatically apply nvidia label
+  conditions:
+    - label != stale
+    - or:
+      - files~=cuda
+      - files~=cutlass
+      - files~=flashinfer
+      - files~=trtllm
+      - title~=(?i)NVIDIA
+      - title~=(?i)CUDA
+      - title~=(?i)CUTLASS
+  actions:
+    label:
+      add:
+        - nvidia
+
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
+    - label != stale
     - or:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
@@ -162,6 +192,7 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
@@ -181,6 +212,7 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/
@@ -196,6 +228,7 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/
@@ -208,6 +241,7 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
+    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu
@@ -223,6 +257,7 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
+    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu
@@ -237,9 +272,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
+    - label != stale
     - or:
       - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
@@ -256,8 +291,9 @@ pull_request_rules:
 
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
-      - conflict
-      - -closed
+    - label != stale
+    - conflict
+    - -closed
   actions:
     label:
       add:
@@ -271,10 +307,12 @@ pull_request_rules:
 
 - name: assign reviewer for tensorizer changes
   conditions:
+    - label != stale
+    - or:
       - files~=^vllm/model_executor/model_loader/tensorizer.py
       - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
       - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/tensorizer_loader/
+      - files~=^tests/model_executor/model_loader/tensorizer_loader/
   actions:
     assign:
       users:
@@ -282,6 +320,7 @@ pull_request_rules:
 
 - name: assign reviewer for modelopt changes
   conditions:
+    - label != stale
     - or:
         - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
         - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -296,8 +335,8 @@ pull_request_rules:
 
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
-      - -conflict
-      - -closed
+    - -conflict
+    - -closed
   actions:
     label:
       remove:
@@ -306,6 +345,7 @@ pull_request_rules:
 - name: label-kv-connector
   description: Automatically apply kv-connector label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/online_serving/disaggregated[^/]*/.*
       - files~=^examples/offline_inference/disaggregated[^/]*/.*
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index c2b17abe811cdcd133159dd4349bb8ed9235aede..7d565ef9f2e459531a1a231db9d244dd4f4085be 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,6 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
+        id: label-step
         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
         with:
           script: |
@@ -42,7 +43,6 @@ jobs:
                     searchIn: "body"
                   },
                 ],
-                
                 // Substring search - matches anywhere in text (partial matches)
                 substrings: [
                   {
@@ -89,14 +89,12 @@ jobs:
                     term: "hip_",
                     searchIn: "both"
                   },
-                  
                   // ROCm tools and libraries
                   {
                     term: "hipify",
                     searchIn: "both"
                   },
                 ],
-                
                 // Regex patterns - for complex pattern matching
                 regexPatterns: [
                   {
@@ -107,13 +105,17 @@ jobs:
                   }
                 ],
               },
+              // Add more label configurations here as needed
+              // example: {
+              //   keywords: [...],
+              //   substrings: [...],
+              //   regexPatterns: [...]
+              // },
             };
-            
             // Helper function to create regex based on search type
             function createSearchRegex(term, type) {
               // Escape special regex characters in the term
               const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-              
               switch (type) {
                 case 'keyword':
                   // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
                   throw new Error(`Unknown search type: ${type}`);
               }
             }
-            
             // Helper function to find matching terms in text with line information
             function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
               const matches = [];
               const lines = text.split('\n');
-              
               for (const termConfig of searchTerms) {
                 let regex;
                 let term, searchIn, pattern, description, flags;
-                
                 // Handle different input formats (string or object)
                 if (typeof termConfig === 'string') {
                   term = termConfig;
@@ -146,21 +145,17 @@ jobs:
                   description = termConfig.description;
                   flags = termConfig.flags;
                 }
-                
                 // Skip if this term shouldn't be searched in the current location
                 if (searchIn !== 'both' && searchIn !== searchLocation) {
                   continue;
                 }
-                
                 // Create appropriate regex
                 if (searchType === 'regex') {
                   regex = new RegExp(pattern, flags || "gi");
                 } else {
                   regex = createSearchRegex(term, searchType);
                 }
-                
                 const termMatches = [];
-                
                 // Check each line for matches
                 lines.forEach((line, lineIndex) => {
                   const lineMatches = line.match(regex);
@@ -175,15 +170,14 @@ jobs:
                         originalTerm: term || pattern,
                         description: description,
                         // Show context around the match in the line
-                        context: line.length > 100 ? 
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
+                        context: line.length > 100 ?
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                           : line.trim()
                       });
                     });
                   }
                 });
-                
                 if (termMatches.length > 0) {
                   matches.push({
                     term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
                   });
                 }
               }
-              
               return matches;
             }
-            
             // Helper function to check if label should be added
             async function processLabel(labelName, config) {
               const body = context.payload.issue.body || "";
               const title = context.payload.issue.title || "";
-              
               core.notice(`Processing label: ${labelName}`);
               core.notice(`Issue Title: "${title}"`);
               core.notice(`Issue Body length: ${body.length} characters`);
-              
               let shouldAddLabel = false;
               let allMatches = [];
               let reason = '';
-              
               const keywords = config.keywords || [];
               const substrings = config.substrings || [];
               const regexPatterns = config.regexPatterns || [];
-              
               core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-              
               // Search in title
               if (title.trim()) {
                 core.notice(`Searching in title: "${title}"`);
-                
                 const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                 const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                 const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-                
                 allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
               }
-              
               // Search in body
               if (body.trim()) {
                 core.notice(`Searching in body (${body.length} characters)`);
-                
                 const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                 const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                 const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-                
                 allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
               }
-              
               if (allMatches.length > 0) {
                 core.notice(`Found ${allMatches.length} matching term(s):`);
-                
                 for (const termMatch of allMatches) {
                   const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                   const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-                  
                   if (termMatch.searchType === 'regex') {
                     core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                   } else {
                     core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                   }
-                  
                   // Show details for each match
                   termMatch.matches.forEach((match, index) => {
                     core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@ jobs:
                     }
                   });
                 }
-                
                 shouldAddLabel = true;
                 const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                 const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@ jobs:
                 const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                 const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                 const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-                
                 reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
               }
-              
               core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
               core.notice(`Reason: ${reason || 'No matching terms found'}`);
-              
               if (shouldAddLabel) {
                 const existingLabels = context.payload.issue.labels.map(l => l.name);
                 if (!existingLabels.includes(labelName)) {
@@ -296,14 +270,92 @@ jobs:
                 core.notice(`Label "${labelName}" already present.`);
                 return false;
               }
-              
               core.notice(`No matching terms found for label "${labelName}".`);
               return false;
             }
-            
             // Process all configured labels
-            const processLabels = Object.entries(labelConfig)
-              .map(([labelName, config]) => processLabel(labelName, config));
-            const labelsAdded = await Promise.all(processLabels);
-            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
\ No newline at end of file
+            const labelsAddedResults = await Promise.all(
+              Object.entries(labelConfig).map(([labelName, config]) => 
+                processLabel(labelName, config).then(added => ({ labelName, added }))
+              )
+            );
+            
+            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            
+            // Return which labels were added for the next step
+            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+            core.setOutput('labels_added', JSON.stringify(addedLabels));
+            return addedLabels;
+
+      - name: CC users for labeled issues
+        if: steps.label-step.outputs.labels_added != '[]'
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
+        with:
+          script: |
+            // Configuration: Map labels to GitHub users to CC
+            // You can add multiple users per label, and multiple label configurations
+            const ccConfig = {
+              rocm: {
+                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
+                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
+              },
+              // Add more label -> user mappings here
+              // Example:
+              // cuda: {
+              //   users: ['user1', 'user2'],
+              //   message: 'CC {users} for CUDA-related issue'
+              // },
+              // performance: {
+              //   users: ['perfexpert'],
+              //   message: 'CC {users} for performance issue'
+              // },
+            };
+            
+            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+            
+            // Get existing comments to check for already mentioned users
+            const comments = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            
+            const issueBody = context.payload.issue.body || '';
+            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+            
+            // Process each label that was added
+            for (const label of labelsAdded) {
+              if (ccConfig[label]) {
+                const config = ccConfig[label];
+                const usersToMention = [];
+                
+                // Check which users haven't been mentioned yet
+                for (const user of config.users) {
+                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+                  if (!mentionPattern.test(allExistingText)) {
+                    usersToMention.push(user);
+                  } else {
+                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
+                  }
+                }
+                
+                // Post comment if there are users to mention
+                if (usersToMention.length > 0) {
+                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
+                  const message = config.message.replace('{users}', mentions);
+                  
+                  await github.rest.issues.createComment({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: context.issue.number,
+                    body: message
+                  });
+                  
+                  core.notice(`CC comment added for label "${label}": ${mentions}`);
+                } else {
+                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+                }
+              }
+            }
\ No newline at end of file
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..42b05ecd5ac06ca8e55d36a87c2f628a1d6c3f73
--- /dev/null
+++ b/.github/workflows/macos-smoke-test.yml
@@ -0,0 +1,81 @@
+name: macOS Apple Silicon Smoke Test
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:  # Manual trigger
+
+jobs:
+  macos-m1-smoke-test:
+    runs-on: macos-latest
+    timeout-minutes: 20
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            requirements/**/*.txt
+            pyproject.toml
+          python-version: '3.12'
+
+      - name: Create virtual environment
+        run: |
+          uv venv
+          echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
+
+      - name: Install dependencies and build vLLM
+        run: |
+          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+          uv pip install -e .
+        env:
+          CMAKE_BUILD_PARALLEL_LEVEL: 4
+
+      - name: Verify installation
+        run: |
+          python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
+
+      - name: Smoke test vllm serve
+        timeout-minutes: 10
+        run: |
+          # Start server in background
+          vllm serve Qwen/Qwen3-0.6B \
+            --max-model-len=2048 \
+            --load-format=dummy \
+            --enforce-eager \
+            --port 8000 &
+
+          SERVER_PID=$!
+
+          # Wait for server to start
+          for i in {1..30}; do
+            if curl -s http://localhost:8000/health > /dev/null; then
+              echo "Server started successfully"
+              break
+            fi
+            if [ "$i" -eq 30 ]; then
+              echo "Server failed to start"
+              kill "$SERVER_PID"
+              exit 1
+            fi
+            sleep 2
+          done
+
+          # Test health endpoint
+          curl -f http://localhost:8000/health
+
+          # Test completion
+          curl -f http://localhost:8000/v1/completions \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "Qwen/Qwen3-0.6B",
+              "prompt": "Hello",
+              "max_tokens": 5
+            }'
+
+          # Cleanup
+          kill "$SERVER_PID"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 82844810a633a18a73ddd2a73e12d0cd9073934e..dca3089f496c9639674ab3b87d24c93adfd05c13 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index b1df673e83ca8c1f94fbeb1e1754cd541c39ea26..50070d7898fe6d40cb0e162892b70111291f5fa0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**
 
+# uv
+uv.lock
+
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
@@ -218,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
index c86fed9555d6235fa1e33652256f1609aec18d6a..937487f47364d04a8f8cb02a6bd0b8ac467362ec 100644
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -3,11 +3,9 @@ MD007:
 MD013: false
 MD024:
   siblings_only: true
+MD031:
+  list_items: false
 MD033: false
-MD042: false
-MD045: false
 MD046: false
-MD051: false
 MD052: false
-MD053: false
 MD059: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8ca414ee4269b1e9fcbf38e8505001ea2edfa71e..e034f75a9d3224b7f50030128912d4b18e7cf50e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,30 +6,19 @@ default_stages:
   - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.14.0
   hooks:
-  - id: ruff
+  - id: ruff-check
     args: [--output-format, github, --fix]
   - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.38.1
   hooks:
   - id: typos
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
+    args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v21.1.2
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,32 +35,27 @@ repos:
   hooks:
   - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.9.1
   hooks:
     - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
       files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
   - id: format-torch-nightly-test
     name: reformat nightly_torch_test.txt to be in sync with test.in
     language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
-    name: Run mypy for local Python installation
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    name: Run mypy locally for lowest supported Python version
+    entry: python tools/pre_commit/mypy.py 0 "3.10"
     stages: [pre-commit] # Don't run in CI
     <<: &mypy_common
       language: python
       types_or: [python, pyi]
       require_serial: true
       additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: python tools/pre_commit/mypy.py 1 "3.9"
-    <<: *mypy_common
-    stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
     entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -87,14 +71,19 @@ repos:
     entry: python tools/pre_commit/mypy.py 1 "3.12"
     <<: *mypy_common
     stages: [manual] # Only run in CI
+  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.13
+    entry: python tools/pre_commit/mypy.py 1 "3.13"
+    <<: *mypy_common
+    stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
     language: script
     types: [shell]
   - id: png-lint
     name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
     language: script
     types: [png]
   - id: signoff-commit
@@ -111,12 +100,12 @@ repos:
     stages: [commit-msg]
   - id: check-spdx-header
     name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
     language: python
     types: [python]
   - id: check-root-lazy-imports
     name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
     language: python
     types: [python]
   - id: check-filenames
@@ -130,11 +119,11 @@ repos:
     pass_filenames: false
   - id: update-dockerfile-graph
     name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
     language: script
   - id: enforce-import-regex-instead-of-re
     name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
     language: python
     types: [python]
     pass_filenames: false
@@ -142,7 +131,7 @@ repos:
   # forbid directly import triton
   - id: forbid-direct-triton-import
     name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
     language: python
     types: [python]
     pass_filenames: false
@@ -155,7 +144,7 @@ repos:
     additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
     language: python
     additional_dependencies: [regex]
   # Keep `suggestion` last
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 138537ba37fb02156571936d170f8582992a4d54..4bdaf4181e7c1a4d37c13cd03ad4616966b18b9e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,11 +34,18 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx928;gfx936")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151;gfx928;gfx936")
 
+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+  set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
 #
 # Supported/expected torch versions for CUDA/ROCm.
 #
@@ -49,8 +56,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -86,6 +93,9 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@@ -175,6 +185,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
 #
 # Set CUDA include flags for CXX compiler.
 #
@@ -225,11 +244,28 @@ set_gencode_flags_for_srcs(
   SRCS "${VLLM_CUMEM_EXT_SRC}"
   CUDA_ARCHS "${CUDA_ARCHS}")
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
+    # link against cuda driver library
+    list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  else()
+    # link against rocm driver library. Prefer an absolute path to
+    # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+    # back to linking by name "amdhip64".
+    find_library(AMDHIP64_LIB
+      NAMES amdhip64 libamdhip64.so
+      PATHS ${ROCM_PATH}/lib
+      NO_DEFAULT_PATH)
+    if(AMDHIP64_LIB)
+      message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+      list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+    else()
+      message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+      list(APPEND CUMEM_LIBS amdhip64)
+    endif()
+  endif()
+  define_extension_target(
     cumem_allocator
     DESTINATION vllm
     LANGUAGE CXX
@@ -253,13 +289,13 @@ set(VLLM_EXT_SRC
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
-  "csrc/opt/transpose_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
   # "csrc/layernorm_quant_kernels.cu"
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  # "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  # "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   # "csrc/quantization/activation_kernels.cu"
@@ -271,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -303,13 +339,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
 
   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -319,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
 
     #
@@ -413,11 +449,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -441,12 +477,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -471,12 +511,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -507,7 +551,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -551,7 +595,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -570,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -592,7 +644,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
@@ -618,7 +674,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -636,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -657,9 +717,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # moe_data.cu is used by all CUTLASS MoE kernels.
-  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -676,9 +740,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -793,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if(HADACORE_ARCHS)
     set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
     set_gencode_flags_for_srcs(
@@ -815,7 +883,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
   _C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -840,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -870,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
 
     #
@@ -929,7 +998,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling moe extension.")
-define_gpu_extension_target(
+define_extension_target(
   _moe_C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -951,7 +1020,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")
 
-  define_gpu_extension_target(
+  define_extension_target(
     _rocm_C
     DESTINATION vllm
     LANGUAGE ${VLLM_GPU_LANG}
@@ -966,6 +1035,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
     include(cmake/external_projects/flashmla.cmake)
+    include(cmake/external_projects/qutlass.cmake)
 
     # vllm-flash-attn should be last as it overwrites some CMake functions
     include(cmake/external_projects/vllm_flash_attn.cmake)
diff --git a/README_ORIGIN.md b/README_ORIGIN.md
index 0c6e5aa6b31d215f413a241b37d8f879c17fe979..fc339ceda8839fa6344795f7b3886fcfe32d2d95 100644
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -21,6 +21,10 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
+- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@@ -81,7 +85,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
 
@@ -148,6 +152,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine
 
 Slack Sponsor: Anyscale
 
@@ -178,4 +183,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Media Kit
 
-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
\ No newline at end of file
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index b333ba9cd8e99b85cdcaf5ff7b4bb2d3c0f1e6fd..56b721cbb402192153c3c0c8bafee725a86cdc83 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -74,7 +74,7 @@ start_server() {
     local vllm_log=$4
     local profile_dir=$5
 
-    pkill -if vllm
+    pkill -if "vllm serve" || true
 
     # Define the common arguments as a bash array.
     # Each argument and its value are separate elements.
@@ -96,11 +96,11 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
     local server_pid=$!
@@ -139,7 +139,7 @@ run_benchmark() {
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
-    pkill -if vllm
+    pkill -if "vllm serve" || true
 
     echo "starting server..."
     # Call start_server without a profile_dir to avoid profiling overhead
@@ -232,7 +232,7 @@ run_benchmark() {
 
     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 
-    pkill -if vllm
+    pkill -if "vllm serve" || true
     sleep 10
     echo "===================="
     return 0
@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
     echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if vllm
+pkill -if "vllm serve" || true
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index ba7c733be0b25bdef90e4fba703ecb4640fe1787..4021fede72153ee67b5570529c07e1fbdc750d41 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py
new file mode 100755
index 0000000000000000000000000000000000000000..b5c16c42de467abb26927a017003fc36c5c33e71
--- /dev/null
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
+
+This benchmark runs the same workload twice:
+1. With VLLM_BATCH_INVARIANT=0 (baseline)
+2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
+
+And reports the timing and throughput metrics for comparison.
+
+Environment variables:
+    VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
+    VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
+    VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
+    VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
+    VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
+    VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
+    VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
+    VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
+    VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
+    VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
+    VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
+
+Example usage:
+    # Benchmark qwen3 (default)
+    python benchmarks/benchmark_batch_invariance.py
+
+    # Benchmark deepseek with 8 GPUs
+    VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
+        python benchmarks/benchmark_batch_invariance.py
+
+    # Quick test with fewer trials
+    VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
+        python benchmarks/benchmark_batch_invariance.py
+"""
+
+import contextlib
+import os
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+    """Generate a random prompt for benchmarking."""
+    prompt_templates = [
+        "Question: What is the capital of France?\nAnswer: The capital of France is",
+        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+        "Once upon a time in a distant galaxy, there lived",
+        "The old man walked slowly down the street, remembering",
+        "In the year 2157, humanity finally discovered",
+        "To implement a binary search tree in Python, first we need to",
+        "The algorithm works by iterating through the array and",
+        "Here's how to optimize database queries using indexing:",
+        "The Renaissance was a period in European history that",
+        "Climate change is caused by several factors including",
+        "The human brain contains approximately 86 billion neurons which",
+        "I've been thinking about getting a new laptop because",
+        "Yesterday I went to the store and bought",
+        "My favorite thing about summer is definitely",
+    ]
+
+    base_prompt = random.choice(prompt_templates)
+
+    if max_words < min_words:
+        max_words = min_words
+    target_words = random.randint(min_words, max_words)
+
+    if target_words > 50:
+        padding_text = (
+            " This is an interesting topic that deserves more explanation. "
+            * (target_words // 50)
+        )
+        base_prompt = base_prompt + padding_text
+
+    return base_prompt
+
+
+def run_benchmark_with_batch_invariant(
+    model: str,
+    tp_size: int,
+    max_batch_size: int,
+    num_trials: int,
+    min_prompt: int,
+    max_prompt: int,
+    max_tokens: int,
+    temperature: float,
+    gpu_mem_util: float,
+    max_model_len: int,
+    backend: str,
+    batch_invariant: bool,
+    seed: int = 12345,
+) -> dict:
+    """
+    Run the benchmark with the specified configuration.
+
+    Returns a dict with timing and throughput metrics.
+    """
+    random.seed(seed)
+
+    # Set environment variables
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    if batch_invariant:
+        os.environ["VLLM_BATCH_INVARIANT"] = "1"
+    else:
+        os.environ["VLLM_BATCH_INVARIANT"] = "0"
+
+    print(f"\n{'=' * 80}")
+    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
+    print(f"  Model: {model}")
+    print(f"  TP Size: {tp_size}")
+    print(f"  Backend: {backend}")
+    print(f"  Max Batch Size: {max_batch_size}")
+    print(f"  Trials: {num_trials}")
+    print(f"  Max Tokens: {max_tokens}")
+    print(f"{'=' * 80}\n")
+
+    sampling = SamplingParams(
+        temperature=temperature,
+        top_p=0.95,
+        max_tokens=max_tokens,
+        seed=20240919,
+    )
+
+    needle_prompt = "There once was a "
+
+    llm = None
+    try:
+        # Create LLM engine
+        start_init = time.perf_counter()
+        llm = LLM(
+            model=model,
+            max_num_seqs=max_batch_size,
+            gpu_memory_utilization=gpu_mem_util,
+            max_model_len=max_model_len,
+            dtype="bfloat16",
+            tensor_parallel_size=tp_size,
+            enable_prefix_caching=False,
+        )
+        init_time = time.perf_counter() - start_init
+        print(f"Engine initialization time: {init_time:.2f}s\n")
+
+        # Generate baseline
+        print("Generating baseline (warmup)...")
+        baseline_out = llm.generate([needle_prompt], sampling)
+        assert len(baseline_out) == 1
+        baseline_text = baseline_out[0].outputs[0].text
+        print(f"Baseline output: '{baseline_text[:50]}...'\n")
+
+        # Run trials and measure timing
+        trial_times: list[float] = []
+        total_tokens = 0
+        total_prompts = 0
+
+        for trial in range(num_trials):
+            # Create a batch
+            prompts: list[str] = []
+            batch_size = random.randint(max_batch_size // 2, max_batch_size)
+            needle_pos = random.randint(0, batch_size - 1)
+            for i in range(batch_size):
+                if i == needle_pos:
+                    prompts.append(needle_prompt)
+                else:
+                    prompts.append(_random_prompt(min_prompt, max_prompt))
+
+            # Measure time for this trial
+            start_time = time.perf_counter()
+            outputs = llm.generate(prompts, sampling)
+            trial_time = time.perf_counter() - start_time
+
+            trial_times.append(trial_time)
+            total_prompts += len(prompts)
+
+            # Count tokens
+            for output in outputs:
+                if output.outputs:
+                    total_tokens += len(output.outputs[0].token_ids)
+
+            print(
+                f"Trial {trial + 1}/{num_trials}: "
+                f"batch_size={batch_size}, "
+                f"time={trial_time:.2f}s"
+            )
+
+            # Verify needle output still matches
+            needle_output = outputs[needle_pos]
+            assert needle_output.prompt == needle_prompt
+
+        # Compute statistics
+        avg_time = sum(trial_times) / len(trial_times)
+        min_time = min(trial_times)
+        max_time = max(trial_times)
+        throughput = total_tokens / sum(trial_times)
+        prompts_per_sec = total_prompts / sum(trial_times)
+
+        print(f"\n{'=' * 80}")
+        print("RESULTS:")
+        print(f"  Average time per trial: {avg_time:.2f}s")
+        print(f"  Min time: {min_time:.2f}s")
+        print(f"  Max time: {max_time:.2f}s")
+        print(f"  Total tokens generated: {total_tokens}")
+        print(f"  Total prompts processed: {total_prompts}")
+        print(f"  Throughput: {throughput:.2f} tokens/s")
+        print(f"  Prompts/s: {prompts_per_sec:.2f}")
+        print(f"{'=' * 80}\n")
+
+        return {
+            "init_time": init_time,
+            "avg_time": avg_time,
+            "min_time": min_time,
+            "max_time": max_time,
+            "total_tokens": total_tokens,
+            "total_prompts": total_prompts,
+            "throughput": throughput,
+            "prompts_per_sec": prompts_per_sec,
+            "trial_times": trial_times,
+        }
+
+    finally:
+        # Cleanup
+        if llm is not None:
+            with contextlib.suppress(Exception):
+                llm.shutdown()
+
+
+def main():
+    # Check platform support
+    if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
+        print("ERROR: Requires CUDA and >= Hopper (SM90)")
+        print(f"Current platform: {current_platform.device_type}")
+        if current_platform.is_cuda():
+            print(f"Device capability: {current_platform.get_device_capability()}")
+        return 1
+
+    # Read configuration from environment
+    model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
+    max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
+    num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
+    min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
+    max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
+    max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
+    temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
+    gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
+    max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
+    backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
+
+    print("\n" + "=" * 80)
+    print("VLLM BATCH INVARIANCE BENCHMARK")
+    print("=" * 80)
+    print("\nConfiguration:")
+    print(f"  Model: {model}")
+    print(f"  Tensor Parallel Size: {tp_size}")
+    print(f"  Attention Backend: {backend}")
+    print(f"  Max Batch Size: {max_batch_size}")
+    print(f"  Number of Trials: {num_trials}")
+    print(f"  Prompt Length Range: {min_prompt}-{max_prompt} words")
+    print(f"  Max Tokens to Generate: {max_tokens}")
+    print(f"  Temperature: {temperature}")
+    print(f"  GPU Memory Utilization: {gpu_mem_util}")
+    print(f"  Max Model Length: {max_model_len}")
+    print("=" * 80)
+
+    # Run benchmark WITHOUT batch invariance (baseline)
+    print("\n" + "=" * 80)
+    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
+    print("=" * 80)
+    baseline_results = run_benchmark_with_batch_invariant(
+        model=model,
+        tp_size=tp_size,
+        max_batch_size=max_batch_size,
+        num_trials=num_trials,
+        min_prompt=min_prompt,
+        max_prompt=max_prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        gpu_mem_util=gpu_mem_util,
+        max_model_len=max_model_len,
+        backend=backend,
+        batch_invariant=False,
+    )
+
+    # Run benchmark WITH batch invariance
+    print("\n" + "=" * 80)
+    print("PHASE 2: Running WITH batch invariance")
+    print("=" * 80)
+    batch_inv_results = run_benchmark_with_batch_invariant(
+        model=model,
+        tp_size=tp_size,
+        max_batch_size=max_batch_size,
+        num_trials=num_trials,
+        min_prompt=min_prompt,
+        max_prompt=max_prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        gpu_mem_util=gpu_mem_util,
+        max_model_len=max_model_len,
+        backend=backend,
+        batch_invariant=True,
+    )
+
+    # Compare results
+    print("\n" + "=" * 80)
+    print("COMPARISON: Batch Invariance vs Baseline")
+    print("=" * 80)
+
+    init_overhead_pct = (
+        (batch_inv_results["init_time"] - baseline_results["init_time"])
+        / baseline_results["init_time"]
+        * 100
+    )
+    time_overhead_pct = (
+        (batch_inv_results["avg_time"] - baseline_results["avg_time"])
+        / baseline_results["avg_time"]
+        * 100
+    )
+    throughput_change_pct = (
+        (batch_inv_results["throughput"] - baseline_results["throughput"])
+        / baseline_results["throughput"]
+        * 100
+    )
+
+    print("\nInitialization Time:")
+    print(f"  Baseline:         {baseline_results['init_time']:.2f}s")
+    print(f"  Batch Invariant:  {batch_inv_results['init_time']:.2f}s")
+    print(f"  Overhead:         {init_overhead_pct:+.2f}%")
+
+    print("\nAverage Trial Time:")
+    print(f"  Baseline:         {baseline_results['avg_time']:.2f}s")
+    print(f"  Batch Invariant:  {batch_inv_results['avg_time']:.2f}s")
+    print(f"  Overhead:         {time_overhead_pct:+.2f}%")
+
+    print("\nThroughput (tokens/s):")
+    print(f"  Baseline:         {baseline_results['throughput']:.2f}")
+    print(f"  Batch Invariant:  {batch_inv_results['throughput']:.2f}")
+    print(f"  Change:           {throughput_change_pct:+.2f}%")
+
+    print("\nPrompts/s:")
+    print(f"  Baseline:         {baseline_results['prompts_per_sec']:.2f}")
+    print(f"  Batch Invariant:  {batch_inv_results['prompts_per_sec']:.2f}")
+
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+    if time_overhead_pct > 0:
+        print(
+            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
+            "overhead"
+        )
+    else:
+        print(
+            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
+            "faster (unexpected!)"
+        )
+
+    if abs(throughput_change_pct) < 1.0:
+        print("Throughput difference is negligible (< 1%)")
+    elif throughput_change_pct < 0:
+        print(
+            f"Throughput decreased by {-throughput_change_pct:.1f}% "
+            "with batch invariance"
+        )
+    else:
+        print(
+            f"Throughput increased by {throughput_change_pct:.1f}% "
+            "with batch invariance (unexpected!)"
+        )
+
+    print("=" * 80 + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index eae8d9927ea391d496e2c5582ea72e67391c93d3..20cd26bdddf513c21f31853380d62583ac51980d 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 
+from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from benchmark_utils import TimeCollector
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
 
 
diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index 6e0f3b51c9d284d22dd1acdc1b794b9a5c0f31b1..f64fd09bab9fa7d57dfe5a1312bdcc6eb0f9292f 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -46,7 +46,7 @@ import time
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index d4b83edbd940edfd561df26fe04adcc32d0dca0d..dedb564fffac8eee211577c10f138247629177cb 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -5,9 +5,9 @@ import time
 from unittest import mock
 
 import numpy as np
+from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from benchmark_utils import TimeCollector
 from vllm.config import (
     CacheConfig,
     DeviceConfig,
@@ -19,7 +19,7 @@ from vllm.config import (
     VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -164,7 +164,7 @@ def invoke_main() -> None:
     )
     parser.add_argument(
         "--batched", action="store_true", help="consider time to prepare batch"
-    )  # noqa: E501
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index b5e2613de1cd4a0a2557cf09baa458aa15a6049c..28fc383a318dd16c54dcbc6cca0acdb5ac0b7876 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -32,13 +32,12 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -70,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
 
     # Remove the special tokens.
     return random.choices(
-        [v for k, v in vocab.items() if k not in all_special_ids],
+        [v for v in vocab.values() if v not in all_special_ids],
         k=length,
     )
 
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index bb453791c1862bdd0883f0a47af6fddeef764f8f..a35db0063b0ae245f2022af198f80c673c700512 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -7,12 +7,11 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 # Select a equi-probable random priority
@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index a0350625491f49a5cf2ec6c6e5f128944de504fe..55001cf3722a0b4d4d835f4632d07687c4c3f162 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -31,20 +31,19 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np
 import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-
 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
     RequestFuncInput,
     RequestFuncOutput,
 )
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
 
 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -52,7 +51,7 @@ except ImportError:
     from backend_request_func import get_tokenizer
 
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
@@ -317,7 +316,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -437,9 +436,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -503,15 +502,9 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
 
     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
@@ -910,13 +903,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--num-prompts",
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 98624abdf49fb0dbf75a5c3f58a5fe95469e9b51..f0d661f9d53498f5067bfb0ef7d19d52af84699d 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
 
@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)
 
-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"
 
-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]
 
     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 9ec270bbd2e988b459f99d876215984bf125d0f9..67fccdf4fd07e3aaa0feb29c64c114f7fdbb738f 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -16,7 +15,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index a5a5b52f603977cc0c967e455724bc7c6a2bf585..f7325ddd2cbbfef35bf1ffeb7e9ebb678c7e355b 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -17,9 +16,10 @@ from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import cdiv
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -53,7 +53,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +108,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -158,7 +158,7 @@ def bench_fp8(
         "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
             a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
         ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
             a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
         ),
         "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -183,7 +183,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 2c72941cf7e5112724683a89585276e1bae6e9c0..d683835db96a4d0f720e3d86560694ee6af9b828 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -55,9 +55,7 @@ benchmark() {
   output_len=$2
 
 
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -65,9 +63,7 @@ benchmark() {
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
 
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index 0bbf7cd2b1c81e5d3d6641b705918a11a3feead6..35c86cc845221ae24395f89394c764d78eed5329 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
   model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --enable-chunked-prefill \
     --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
     --port 8200 \
     --max-model-len 10000 \
     --enable-chunked-prefill \
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
   model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index 901524214469e8677cbb58c3078e8b1d724585bb..d809bf1db8cbca52a915051cec26a2d1e24d026a 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -3,10 +3,9 @@
 
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index f1e504499eaf612de382709911f966b0de782f84..11e3ac7f0c1fa6ab4e576ee872a79a0129adc13f 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
+
+# Disable DeepGEMM for this benchmark to use CUTLASS
+os.environ["VLLM_USE_DEEP_GEMM"] = "0"
+
 import torch
 
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear,
+    W8A8BlockFp8LinearOp,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
-    # Create random FP8 tensors
+    # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
     A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
 
+    # Create quantized weight tensor
     B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
     B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
 
-    # Create scales
+    # Create weight scales
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k
@@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
         * factor_for_scale
     )
 
-    # SM90 CUTLASS requires row-major format for scales
-    if use_cutlass and current_platform.is_device_capability(90):
-        Bs = Bs.T.contiguous()
+    # Create W8A8BlockFp8LinearOp instance
+    weight_group_shape = GroupShape(block_n, block_k)
+    act_quant_group_shape = GroupShape(1, block_k)  # Per-token, per-group quantization
+
+    linear_op = W8A8BlockFp8LinearOp(
+        weight_group_shape=weight_group_shape,
+        act_quant_group_shape=act_quant_group_shape,
+        cutlass_block_fp8_supported=use_cutlass,
+        use_aiter_and_is_supported=False,
+    )
 
     def run():
-        if use_cutlass:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
-            )
-        else:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
-            )
+        return linear_op.apply(
+            input=A_ref,
+            weight=B,
+            weight_scale=Bs,
+            input_scale=None,
+            bias=None,
+        )
 
     return run
 
diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/bench_mxfp4_qutlass.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfc7721876a1739055f851f86468ce01fb3fd670
--- /dev/null
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "mxfp4": dict(no_a_quant=False, enabled=True),
+    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_mxfp4(
+    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+        b, forward_hadamard_matrix, method="abs_max"
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+        b, forward_hadamard_matrix, device
+    )
+    alpha = torch.tensor([1.0], device="cuda")
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+        def run():
+            return matmul_mxf4_bf16_tn(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+        return matmul_mxf4_bf16_tn(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs MXFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_mxfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_mxfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/bench_nvfp4_qutlass.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fecc816f9466ce3d2cf7e0f1780da0ea50cf39d
--- /dev/null
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "nvfp4": dict(no_a_quant=False, enabled=True),
+    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_nvfp4(
+    b: torch.Tensor,
+    forward_hadamard_matrix: torch.Tensor,
+    global_scale: torch.Tensor,
+    device: str,
+    M: int,
+    N: int,
+    K: int,
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
+        b, forward_hadamard_matrix, global_scale
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
+        -1, K // 16
+    )
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
+    alpha = torch.tensor([1.0], device="cuda")
+    global_scale = torch.tensor([1.0], device="cuda")
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
+        b, forward_hadamard_matrix, global_scale, device, M, N, K
+    )
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+
+        def run():
+            return ops.cutlass_scaled_fp4_mm(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+                torch.bfloat16,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+        return ops.cutlass_scaled_fp4_mm(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+            torch.bfloat16,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs NVFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_nvfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [16, 32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_nvfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py
index e08e5680c191ed844d5262c48e72a4ca534472dc..7792cfd03b0e49022d7365b9a96e8e41ad236a99 100644
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch
 
 import pandas as pd
@@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 def with_triton_mode(fn):
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
index 93edbcc9391fcf696e7333d7945919c8248cfb75..66268b71b3de644b424429aa66ad7dc6216b3c3a 100644
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py
index 66b44c27d6ee8cdea4a7081c8142b0e037b8fd0c..6bcb1798379576be0e49c7b374da05b5dfc87405 100644
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -28,7 +28,7 @@ except ImportError as e:
 
 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
 
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 parser = FlexibleArgumentParser(
     description="Benchmark BitBLAS int4 on a specific target."
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index 726a2a371d1099d7c260af675703e22c8d4b1f21..7982cbb1422c5e4dda0f8aff07a3bad91f85ad0a 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 WEIGHT_SHAPES_MOE = {
     "nvidia/DeepSeek-R1-FP4": [
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index b419b2fa0e3eb84b4b8a340a3721731f343cdde6..027f67ad4db694896b30cd13fa877d2ce30a95a7 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
index 4cbdde5a5b2cab30db5bda1beec1e1c240335326..b414efa6e330bca5f1f5e5604c60fa357d9803bc 100644
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional
 
 import torch
 import torch.distributed as dist
@@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 )
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 logger = init_logger(__name__)
 
@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)
diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e7fdcf55426f1e50424a2715e1f8323dcff729
--- /dev/null
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -0,0 +1,1129 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark for FlashInfer fused collective operations vs standard operations.
+
+This benchmark compares:
+1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
+2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
+
+Usage with torchrun:
+    torchrun --nproc_per_node=2 benchmark_fused_collective.py
+
+"""
+
+import argparse
+import itertools
+import os
+import time
+
+import pandas as pd
+import torch  # type: ignore
+import torch.distributed as dist  # type: ignore
+
+from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
+from vllm.distributed import (
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.distributed.parallel_state import (
+    graph_capture,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm  # noqa
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8  # noqa
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape  # noqa
+from vllm.platforms import current_platform  # noqa
+
+RMS_NORM_OP = torch.ops._C.rms_norm
+FUSED_ADD_RMS_NORM_OP = torch.ops._C.fused_add_rms_norm
+RMS_NORM_STATIC_FP8_QUANT_OP = torch.ops._C.rms_norm_static_fp8_quant
+FUSED_ADD_RMS_NORM_STATIC_FP8_QUANT_OP = (
+    torch.ops._C.fused_add_rms_norm_static_fp8_quant
+)
+SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
+
+logger = init_logger(__name__)
+
+# Try to import FlashInfer
+try:
+    import flashinfer.comm as flashinfer_comm  # type: ignore
+
+    if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
+        flashinfer_comm = None
+        logger.warning(
+            "FlashInfer comm module found but missing trtllm_allreduce_fusion"
+        )
+except ImportError:
+    flashinfer_comm = None
+    logger.warning("FlashInfer not found, only benchmarking standard operations")
+
+# Constants
+FP8_DTYPE = current_platform.fp8_dtype()
+MiB = 1024 * 1024
+
+# FlashInfer max sizes per world size
+# Enable 64MB for 2, 4, 8 world sizes to verify large input sizes
+# use --disable-oneshot to disable oneshot mode for very large input sizes
+_FI_MAX_SIZES = {
+    2: 64 * MiB,  # 64MB
+    4: 64 * MiB,  # 64MB
+    8: 64 * MiB,  # 64MB
+}
+
+# Global workspace tensor for FlashInfer
+_FI_WORKSPACE_TENSOR = None
+
+
+def setup_flashinfer_workspace(
+    world_size: int,
+    rank: int,
+    hidden_dim: int,
+    max_token_num: int,
+    use_fp32_lamport: bool = False,
+):
+    """Setup FlashInfer workspace for fused allreduce operations."""
+    global _FI_WORKSPACE_TENSOR
+
+    if flashinfer_comm is None:
+        return None, None
+
+    if world_size not in _FI_MAX_SIZES:
+        logger.warning("FlashInfer not supported for world size %s", world_size)
+        return None, None
+
+    try:
+        # Create IPC workspace
+        ipc_handles, workspace_tensor = (
+            flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+                tp_rank=rank,
+                tp_size=world_size,
+                max_token_num=max_token_num,
+                hidden_dim=hidden_dim,
+                group=get_tp_group().device_group,
+                use_fp32_lamport=use_fp32_lamport,
+            )
+        )
+
+        _FI_WORKSPACE_TENSOR = workspace_tensor
+        return ipc_handles, workspace_tensor
+    except Exception as e:
+        logger.error("Failed to setup FlashInfer workspace: %s", e)
+        return None, None
+
+
+def cleanup_flashinfer_workspace(ipc_handles):
+    """Cleanup FlashInfer workspace."""
+    if flashinfer_comm is None or ipc_handles is None:
+        return
+
+    try:
+        group = get_tp_group().device_group
+        flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
+    except Exception as e:
+        logger.error("Failed to cleanup FlashInfer workspace: %s", e)
+
+
+class FlashInferFusedAllReduceParams:
+    """Parameters for FlashInfer fused allreduce operations."""
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        use_fp32_lamport: bool = False,
+        max_token_num: int = 1024,
+    ):
+        self.rank = rank
+        self.world_size = world_size
+        self.use_fp32_lamport = use_fp32_lamport
+        self.trigger_completion_at_end = True
+        self.launch_with_pdl = True
+        self.fp32_acc = True
+        self.max_token_num = max_token_num
+
+    def get_trtllm_fused_allreduce_kwargs(self):
+        return {
+            "world_rank": self.rank,
+            "world_size": self.world_size,
+            "launch_with_pdl": self.launch_with_pdl,
+            "trigger_completion_at_end": self.trigger_completion_at_end,
+            "fp32_acc": self.fp32_acc,
+        }
+
+
+def flashinfer_fused_allreduce_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor | None,
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    allreduce_params: "FlashInferFusedAllReduceParams",
+    use_oneshot: bool,
+    norm_out: torch.Tensor | None = None,
+):
+    """FlashInfer fused allreduce + rmsnorm operation."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
+        allreduce_out=None,
+        quant_out=None,
+        scale_out=None,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=None,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor | None,
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    scale_factor: torch.Tensor,
+    allreduce_params: FlashInferFusedAllReduceParams,
+    use_oneshot: bool = True,
+    norm_out: torch.Tensor | None = None,
+    quant_out: torch.Tensor | None = None,
+):
+    """FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
+        allreduce_out=None,
+        quant_out=quant_out,
+        scale_out=None,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=scale_factor,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor | None,
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    input_global_scale: torch.Tensor,
+    allreduce_params: FlashInferFusedAllReduceParams,
+    quant_out: torch.Tensor,
+    use_oneshot: bool,
+    output_scale: torch.Tensor,
+    norm_out: torch.Tensor | None = None,
+):
+    """FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
+        allreduce_out=None,
+        quant_out=quant_out,
+        scale_out=output_scale,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=input_global_scale,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+class VllmFusedAllreduce:
+    def __init__(self, hidden_dim, dtype):
+        self.rms_eps = 1e-6
+        self.rms_norm = RMSNorm(hidden_dim, eps=self.rms_eps, dtype=dtype)
+        self.fp8_quant = QuantFP8(
+            static=True,
+            group_shape=GroupShape.PER_TENSOR,
+        )
+
+    def allreduce_rmsnorm(
+        self, input_tensor: torch.Tensor, residual: torch.Tensor | None
+    ):
+        allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+        return self.rms_norm(allreduce_out, residual)
+
+    def allreduce_rmsnorm_fp8_quant(
+        self,
+        input_tensor: torch.Tensor,
+        residual: torch.Tensor | None,
+        scale_factor: torch.Tensor,
+    ):
+        allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+        rms_out = self.rms_norm(allreduce_out, residual)
+        if residual is None:
+            quant_out = self.fp8_quant(rms_out, scale_factor)
+            return quant_out
+        else:
+            rms_out, residual_out = rms_out
+            quant_out = self.fp8_quant(rms_out, scale_factor)
+            return quant_out, residual_out
+
+    def allreduce_rmsnorm_fp4_quant(
+        self,
+        input_tensor: torch.Tensor,
+        residual: torch.Tensor | None,
+        input_global_scale: torch.Tensor,
+        quant_out: torch.Tensor,
+        output_scale: torch.Tensor,
+    ):
+        allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+        rms_out = self.rms_norm(allreduce_out, residual)
+        if residual is None:
+            SCALED_FP4_QUANT_OP(quant_out, rms_out, output_scale, input_global_scale)
+            return quant_out, output_scale
+        else:
+            rms_out, residual_out = rms_out
+            SCALED_FP4_QUANT_OP(quant_out, rms_out, output_scale, input_global_scale)
+            return quant_out, residual_out, output_scale
+
+
+def create_test_tensors(
+    num_tokens: int, hidden_dim: int, dtype: torch.dtype, use_residual: bool = True
+):
+    """Create test tensors for benchmarking."""
+    input_tensor = torch.randn(num_tokens, hidden_dim, dtype=dtype)
+    residual = (
+        torch.randn_like(input_tensor)
+        if use_residual
+        else torch.zeros_like(input_tensor)
+    )
+    rms_gamma = torch.ones(hidden_dim, dtype=dtype)
+    norm_out = None if use_residual else torch.empty_like(input_tensor)
+
+    # Quantization scales
+    scale_fp8 = torch.tensor(1.0, dtype=torch.float32)
+    scale_fp4 = torch.tensor(1.0, dtype=torch.float32)
+    quant_out_fp8 = torch.empty_like(input_tensor, dtype=FP8_DTYPE)
+    # Pre-allocate FP4 output tensors (to avoid allocation overhead in benchmarks)
+    fp4_quant_out = torch.empty((num_tokens, hidden_dim // 2), dtype=torch.uint8)
+    fp4_output_scale = torch.empty((128, 4), dtype=torch.int32)
+
+    return (
+        input_tensor,
+        norm_out,
+        residual,
+        rms_gamma,
+        scale_fp8,
+        quant_out_fp8,
+        scale_fp4,
+        fp4_quant_out,
+        fp4_output_scale,
+    )
+
+
+def benchmark_operation(
+    operation_func, *args, warmup: int = 5, trials: int = 20, **kwargs
+):
+    """Benchmark a single operation using CUDA graphs."""
+    # Warmup before graph capture
+    for _ in range(warmup):
+        operation_func(*args, **kwargs)
+    torch.cuda.synchronize()
+
+    # Create CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    num_op_per_cudagraph = 10
+
+    # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
+    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    with graph_capture(device=device), torch.cuda.graph(graph):
+        for _ in range(num_op_per_cudagraph):
+            operation_func(*args, **kwargs)
+
+    # Graph warmup
+    torch.cuda.synchronize()
+    for _ in range(warmup):
+        graph.replay()
+
+    # Benchmark with CUDA graph
+    torch.cuda.synchronize()
+    start_time = time.perf_counter()
+
+    for _ in range(trials // num_op_per_cudagraph):
+        # operation_func(*args, **kwargs)
+        graph.replay()
+
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+
+    avg_time_ms = ((end_time - start_time) / trials) * 1000
+    return avg_time_ms
+
+
+def run_benchmarks(
+    num_tokens: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    use_residual: bool,
+    allreduce_params: FlashInferFusedAllReduceParams | None,
+    quant_modes: set[str],
+    no_oneshot: bool,
+):
+    """Run all benchmarks for given configuration.
+
+    Args:
+        quant_mode: "none", "fp8_only", "fp4_only", or "all"
+    """
+    (
+        input_tensor,
+        norm_out,
+        residual,
+        rms_gamma,
+        scale_fp8,
+        quant_out_fp8,
+        scale_fp4,
+        fp4_quant_out,
+        fp4_output_scale,
+    ) = create_test_tensors(num_tokens, hidden_dim, dtype, use_residual)
+
+    rms_eps = 1e-6
+    results = {}
+    vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
+    use_oneshot_options = [False] if no_oneshot else [True, False]
+
+    # Create RMSNorm and QuantFP8 layers once for native benchmarks
+
+    if "none" in quant_modes:
+        # Standard AllReduce + RMSNorm
+        for custom_op in ["-rms_norm", "+rms_norm"]:
+            with set_current_vllm_config(
+                VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
+            ):
+                try:
+                    suffix = (
+                        "_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
+                    )
+                    time_ms = benchmark_operation(
+                        vllm_fused_allreduce.allreduce_rmsnorm,
+                        input_tensor,
+                        residual=residual,
+                    )
+                    results[f"standard_allreduce_{suffix}"] = time_ms
+                except Exception as e:
+                    logger.error("Standard AllReduce+RMSNorm failed: %s", e)
+                    results[f"standard_allreduce_{suffix}"] = float("inf")
+
+        # Standard AllReduce + RMSNorm Native Compiled
+        with set_current_vllm_config(
+            VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
+        ):
+            try:
+                standard_allreduce_rmsnorm_native_compiled = torch.compile(
+                    vllm_fused_allreduce.allreduce_rmsnorm,
+                    fullgraph=True,
+                    dynamic=False,
+                )
+                time_ms = benchmark_operation(
+                    standard_allreduce_rmsnorm_native_compiled,
+                    input_tensor,
+                    residual=residual,
+                )
+                results["standard_allreduce_rmsnorm_native_compiled"] = time_ms
+            except Exception as e:
+                logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
+                results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
+
+        # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            for use_oneshot in use_oneshot_options:
+                suffix = "_oneshot" if use_oneshot else "_twoshot"
+                try:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm,
+                        input_tensor,
+                        residual=residual,
+                        norm_out=norm_out,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        allreduce_params=allreduce_params,
+                        use_oneshot=use_oneshot,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
+                except Exception as e:
+                    logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
+                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
+                        "inf"
+                    )
+
+    if "fp8" in quant_modes:
+        # Standard AllReduce + RMSNorm + FP8 Quant
+        for rms_norm_custom_op in ["-rms_norm", "+rms_norm"]:
+            suffix = (
+                "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
+            )
+            for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
+                suffix += (
+                    "_custom_quant_fp8"
+                    if "+" in quant_fp8_custom_op
+                    else "_native_quant_fp8"
+                )
+                with set_current_vllm_config(
+                    VllmConfig(
+                        compilation_config=CompilationConfig(
+                            custom_ops=[rms_norm_custom_op, quant_fp8_custom_op]
+                        )
+                    )
+                ):
+                    try:
+                        time_ms = benchmark_operation(
+                            vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
+                            input_tensor,
+                            residual=residual,
+                            scale_factor=scale_fp8,
+                        )
+                        results[f"standard_allreduce{suffix}"] = time_ms
+                    except Exception as e:
+                        logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
+                        results[f"standard_allreduce{suffix}"] = float("inf")
+
+        # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
+        with set_current_vllm_config(
+            VllmConfig(
+                compilation_config=CompilationConfig(
+                    custom_ops=["-rms_norm", "-quant_fp8"]
+                )
+            )
+        ):
+            try:
+                standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
+                    vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
+                    fullgraph=True,
+                    dynamic=False,
+                )
+                time_ms = benchmark_operation(
+                    standard_allreduce_rmsnorm_fp8_quant_native_compiled,
+                    input_tensor,
+                    residual=residual,
+                    scale_factor=scale_fp8,
+                )
+                results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "Standard AllReduce+RMSNorm+FP8 Native Compiled failed: %s", e
+                )
+                results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = float(
+                    "inf"
+                )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            for use_oneshot in use_oneshot_options:
+                suffix = "_oneshot" if use_oneshot else "_twoshot"
+                try:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm_fp8_quant,
+                        input_tensor,
+                        norm_out=norm_out,
+                        residual=residual,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        scale_factor=scale_fp8,
+                        quant_out=quant_out_fp8,
+                        allreduce_params=allreduce_params,
+                        use_oneshot=use_oneshot,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
+                        time_ms
+                    )
+                except Exception as e:
+                    logger.error(
+                        "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
+                        e,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
+                        float("inf")
+                    )
+
+    if "fp4" in quant_modes and current_platform.has_device_capability(100):
+        # Standard AllReduce + RMSNorm + FP4 Quant
+        for rms_norm_custom_op in ["-rms_norm", "+rms_norm"]:
+            suffix = (
+                "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
+            )
+            with set_current_vllm_config(
+                VllmConfig(
+                    compilation_config=CompilationConfig(
+                        custom_ops=[rms_norm_custom_op]
+                    )
+                )
+            ):
+                try:
+                    time_ms = benchmark_operation(
+                        vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
+                        input_tensor,
+                        residual=residual,
+                        input_global_scale=scale_fp4,
+                        quant_out=fp4_quant_out,
+                        output_scale=fp4_output_scale,
+                    )
+                    results[f"standard_allreduce_{suffix}_fp4_quant"] = time_ms
+                except Exception as e:
+                    logger.error("Standard AllReduce+RMSNorm+FP4 failed: %s", e)
+                    results[f"standard_allreduce_{suffix}_fp4_quant"] = float("inf")
+
+        # Standard AllReduce + RMSNorm + FP4 Quant Native Compiled
+        with set_current_vllm_config(
+            VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
+        ):
+            try:
+                standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
+                    vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
+                    fullgraph=True,
+                    dynamic=False,
+                )
+                time_ms = benchmark_operation(
+                    standard_allreduce_rmsnorm_fp4_quant_native_compiled,
+                    input_tensor,
+                    residual=residual,
+                    quant_out=fp4_quant_out,
+                    input_global_scale=scale_fp4,
+                    output_scale=fp4_output_scale,
+                )
+                results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "Standard AllReduce+RMSNorm+FP4 Native Compiled failed: %s", e
+                )
+                results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = float(
+                    "inf"
+                )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            for use_oneshot in use_oneshot_options:
+                suffix = "_oneshot" if use_oneshot else "_twoshot"
+                try:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                        input_tensor,
+                        residual=residual,
+                        norm_out=norm_out,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        input_global_scale=scale_fp4,
+                        allreduce_params=allreduce_params,
+                        quant_out=fp4_quant_out,
+                        output_scale=fp4_output_scale,
+                        use_oneshot=use_oneshot,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
+                        time_ms
+                    )
+                except Exception as e:
+                    logger.error(
+                        "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
+                        e,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
+                        float("inf")
+                    )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            try:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                    input_tensor,
+                    residual=residual,
+                    norm_out=norm_out,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    input_global_scale=scale_fp4,
+                    allreduce_params=allreduce_params,
+                    quant_out=fp4_quant_out,
+                    output_scale=fp4_output_scale,
+                    use_oneshot=False,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
+                    e,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
+                    "inf"
+                )
+
+    return results
+
+
+def prepare_results_with_speedups(results_dict):
+    """Prepare results with speedup calculations based on dynamic baseline selection."""
+    prepared_results = []
+
+    # Determine the fastest baseline for each operation type
+    def get_fastest_baseline(op_name, results_dict):
+        """Get the fastest baseline between standard and native_compiled versions."""
+        if "fp8_quant" in op_name:
+            candidates = [
+                "standard_allreduce_rmsnorm_fp8_quant",
+                "standard_allreduce_rmsnorm_fp8_quant_native_compiled",
+            ]
+        elif "fp4_quant" in op_name:
+            candidates = [
+                "standard_allreduce_rmsnorm_fp4_quant",
+                "standard_allreduce_rmsnorm_fp4_quant_native_compiled",
+            ]
+        else:
+            candidates = [
+                "standard_allreduce_rmsnorm",
+                "standard_allreduce_rmsnorm_native_compiled",
+            ]
+
+        # Find the fastest among available candidates
+        fastest_time = float("inf")
+        fastest_baseline = None
+
+        for candidate in candidates:
+            if (
+                candidate in results_dict
+                and results_dict[candidate] != float("inf")
+                and results_dict[candidate] < fastest_time
+            ):
+                fastest_time = results_dict[candidate]
+                fastest_baseline = candidate
+
+        return fastest_baseline
+
+    # Create dynamic baseline mapping
+    dynamic_baseline_mapping = {}
+    for op_name in results_dict:
+        if (
+            op_name.startswith("flashinfer_")
+            or op_name.startswith("standard_")
+            and not op_name.endswith("_native_compiled")
+        ):
+            dynamic_baseline_mapping[op_name] = get_fastest_baseline(
+                op_name, results_dict
+            )
+
+    for op_name, time_ms in results_dict.items():
+        if time_ms == float("inf"):
+            speedup_str = "FAILED"
+            time_str = "FAILED"
+        else:
+            time_str = f"{time_ms:.3f}"
+            # Find the appropriate baseline for this operation
+            baseline_op = dynamic_baseline_mapping.get(op_name)
+            if baseline_op and baseline_op in results_dict:
+                baseline_time = results_dict[baseline_op]
+                if baseline_time != float("inf") and baseline_time > 0:
+                    speedup = baseline_time / time_ms
+                    speedup_str = f"{speedup:.2f}x"
+                else:
+                    speedup_str = "N/A"
+            else:
+                # For baseline operations, determine if this is the fastest baseline
+                if op_name.endswith("_native_compiled") or (
+                    op_name.startswith("standard_")
+                    and not op_name.endswith("_native_compiled")
+                ):
+                    fastest_baseline = get_fastest_baseline(op_name, results_dict)
+                    if fastest_baseline == op_name:
+                        speedup_str = "baseline"
+                    else:
+                        if fastest_baseline and fastest_baseline in results_dict:
+                            baseline_time = results_dict[fastest_baseline]
+                            if baseline_time != float("inf") and baseline_time > 0:
+                                speedup = baseline_time / time_ms
+                                speedup_str = f"{speedup:.2f}x"
+                            else:
+                                speedup_str = "N/A"
+                        else:
+                            speedup_str = "N/A"
+                else:
+                    speedup_str = "N/A"
+
+        prepared_results.append(
+            {
+                "operation": op_name,
+                "time_ms": time_ms,
+                "time_str": time_str,
+                "speedup_str": speedup_str,
+            }
+        )
+
+    return prepared_results
+
+
+def print_results(
+    results_dict,
+    num_tokens,
+    hidden_dim,
+    dtype,
+    use_residual,
+    quant_modes,
+    input_size_mb,
+):
+    """Print benchmark results in a formatted table."""
+    print(f"\n{'=' * 80}")
+    print(
+        f"Results: num_tokens={num_tokens}, hidden_dim={hidden_dim} "
+        f"(input size: {input_size_mb:.2f} MB)"
+    )
+    print(
+        f"dtype={dtype}, residual={'yes' if use_residual else 'no'}, "
+        f"quant_modes={','.join(sorted(list(quant_modes)))}"
+    )
+    print(f"{'=' * 80}")
+    print(f"{'Operation':<50} {'Time (ms)':<12} {'Speedup':<10}")
+    print(f"{'-' * 80}")
+
+    # Prepare results with speedup calculations
+    prepared_results = prepare_results_with_speedups(results_dict)
+
+    for result in prepared_results:
+        if result["time_ms"] == float("inf"):
+            time_display = result["time_str"]
+        else:
+            time_display = f"{result['time_ms']:.3f}"
+
+        print(
+            f"{result['operation']:<50} {time_display:<12} {result['speedup_str']:<10}"
+        )
+
+
+def format_results_markdown(
+    all_results: list[dict], world_size: int, args: argparse.Namespace
+) -> str:
+    """Format all benchmark results as markdown."""
+    lines: list[str] = []
+    lines.append("# FlashInfer Fused Collective Operations Benchmark Results")
+    lines.append("")
+    lines.append(f"**World Size:** {world_size}  ")
+    lines.append(f"**Hidden Dimension:** {args.hidden_dim}  ")
+    lines.append(f"**Warmup Iterations:** {args.warmup}  ")
+    lines.append(f"**Benchmark Trials:** {args.trials}  ")
+    modes = ",".join(all_results[0]["quant_modes"]) if all_results else "N/A"
+    lines.append(f"**Quantization Modes:** {modes}  ")
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+
+    for entry in all_results:
+        num_tokens = entry["num_tokens"]
+        dtype = entry["dtype"]
+        use_residual = entry["use_residual"]
+        results_dict = entry["results"]
+        input_size_mb = entry["input_size_mb"]
+        residual_str = "with residual" if use_residual else "no residual"
+
+        lines.append(
+            f"## Configuration: num_tokens={num_tokens}, dtype={dtype}, {residual_str}"
+        )
+        lines.append(f"**Input Size:** {input_size_mb:.2f} MB")
+        lines.append("")
+
+        prepared = prepare_results_with_speedups(results_dict)
+        # Build DataFrame for markdown export
+        rows = [
+            {
+                "Operation": r["operation"].replace("_", " ").title(),
+                "Time (ms)": r["time_str"],
+                "Speedup": r["speedup_str"],
+            }
+            for r in prepared
+        ]
+        df = pd.DataFrame(rows)
+        if df.empty:
+            lines.append("No results.")
+        else:
+            lines.append(df.to_markdown(index=False))
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def save_results_to_file(
+    all_results: list[dict], world_size: int, args: argparse.Namespace, rank: int
+):
+    """Save benchmark results to markdown file (only on rank 0)."""
+    if rank != 0:
+        return
+
+    if not all_results:
+        logger.warning("No results to save")
+        return
+
+    output_path = args.output_file
+
+    try:
+        markdown_content = format_results_markdown(all_results, world_size, args)
+
+        with open(output_path, "a") as f:
+            f.write(markdown_content)
+
+    except Exception as e:
+        logger.error("Failed to save results to file: %s", e)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark fused collective operations"
+    )
+    parser.add_argument(
+        "--num-tokens",
+        type=int,
+        nargs="+",
+        default=[128, 512, 1024, 2048],
+        help="Numbers of tokens to test",
+    )
+    parser.add_argument(
+        "--hidden-dim", type=int, default=8192, help="Hidden dimension size"
+    )
+    parser.add_argument(
+        "--dtypes",
+        type=str,
+        nargs="+",
+        default=["bfloat16"],
+        choices=["float16", "bfloat16", "float32"],
+        help="Data types to test",
+    )
+    parser.add_argument(
+        "--no-residual",
+        action="store_true",
+        help="Skip residual connection tests",
+    )
+
+    parser.add_argument(
+        "--quant-modes",
+        type=str,
+        default="none,fp8,fp4",
+        help=(
+            "Comma-separated quantization modes to run: none, fp8, fp4. "
+            "Default: none,fp8,fp4"
+        ),
+    )
+
+    parser.add_argument(
+        "--warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+    parser.add_argument(
+        "--trials", type=int, default=20, help="Number of benchmark trials"
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        help="""Output file path for markdown results 
+                (default: benchmark_results_<timestamp>.md)
+        """,
+    )
+
+    parser.add_argument(
+        "--no-oneshot",
+        action="store_true",
+        help="Skip oneshot benchmarks",
+    )
+
+    args = parser.parse_args()
+
+    # Check if running with torchrun (required for collective operations)
+    if "RANK" not in os.environ or "WORLD_SIZE" not in os.environ:
+        raise RuntimeError(
+            "Must run with torchrun for distributed benchmarking. "
+            "Example: torchrun --nproc_per_node=2 benchmark_fused_collective.py"
+        )
+
+    # Initialize distributed environment
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Validate world size (must be > 1 for collective operations)
+    if world_size <= 1:
+        raise ValueError(
+            "World size must be > 1 for collective operations benchmarking. "
+            f"Current world size: {world_size}. Use torchrun with --nproc_per_node > 1."
+        )
+
+    # Parse quantization modes
+    valid_quant_modes = {"none", "fp8", "fp4"}
+    raw_modes = [
+        m.strip().lower() for m in (args.quant_modes or "").split(",") if m.strip()
+    ]
+    quant_modes = set(raw_modes) if raw_modes else {"none", "fp8", "fp4"}
+    invalid = sorted(list(quant_modes - valid_quant_modes))
+    if invalid:
+        raise ValueError(
+            f"Invalid --quant-modes entries: {','.join(invalid)}. "
+            f"Valid options are: {','.join(sorted(valid_quant_modes))}."
+        )
+
+    if rank == 0:
+        logger.info("Running benchmark with world_size=%s, rank=%s", world_size, rank)
+        logger.info("Quantization modes: %s", ",".join(sorted(list(quant_modes))))
+        if flashinfer_comm is not None:
+            logger.info(
+                "FlashInfer available - will benchmark fused operations",
+            )
+        else:
+            logger.info(
+                "FlashInfer not available - only benchmarking standard operations"
+            )
+
+    # Convert dtype strings to torch dtypes
+    dtype_map = {
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "float32": torch.float32,
+    }
+    dtypes = [dtype_map[dt] for dt in args.dtypes]
+
+    # Test configurations
+    residual_options = [True] if not args.no_residual else [False]
+
+    configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
+
+    # Setup FlashInfer workspace if available
+    ipc_handles = None
+    allreduce_params = None
+
+    if flashinfer_comm is not None:
+        # Use the largest hidden dimension for workspace setup
+        max_num_token = _FI_MAX_SIZES.get(world_size) // (
+            args.hidden_dim * world_size * 2
+        )
+
+        ipc_handles, workspace_tensor = setup_flashinfer_workspace(
+            world_size, rank, args.hidden_dim, max_num_token
+        )
+
+        if workspace_tensor is not None:
+            allreduce_params = FlashInferFusedAllReduceParams(
+                rank=rank,
+                world_size=world_size,
+                max_token_num=max_num_token,
+            )
+
+    # Collect all results for markdown export
+    all_results = []
+
+    try:
+        # Run benchmarks
+        for num_tokens, dtype, use_residual in configs:
+            if rank == 0:
+                logger.info(
+                    "\nTesting:  num_tokens=%s, hidden_dim=%s, dtype=%s, residual=%s",
+                    num_tokens,
+                    args.hidden_dim,
+                    dtype,
+                    use_residual,
+                )
+
+            results = run_benchmarks(
+                num_tokens,
+                args.hidden_dim,
+                dtype,
+                use_residual,
+                allreduce_params,
+                quant_modes=quant_modes,
+                no_oneshot=args.no_oneshot,
+            )
+
+            # Store results for markdown export
+            if rank == 0:
+                # Calculate input size in MB
+                input_size_mb = (
+                    num_tokens * args.hidden_dim * torch.finfo(dtype).bits
+                ) / (8 * 1024 * 1024)
+                all_results.append(
+                    {
+                        "num_tokens": num_tokens,
+                        "hidden_dim": args.hidden_dim,
+                        "dtype": str(dtype).replace("torch.", ""),
+                        "use_residual": use_residual,
+                        "quant_modes": sorted(list(quant_modes)),
+                        "input_size_mb": input_size_mb,
+                        "results": results,
+                    }
+                )
+
+                print_results(
+                    results,
+                    num_tokens,
+                    args.hidden_dim,
+                    dtype,
+                    use_residual,
+                    quant_modes,
+                    input_size_mb,
+                )
+
+        # Save results to markdown file
+        if args.output_file and rank == 0:
+            save_results_to_file(all_results, world_size, args, rank)
+
+    finally:
+        # Cleanup
+        if ipc_handles is not None:
+            cleanup_flashinfer_workspace(ipc_handles)
+
+        dist.barrier()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 14330ae6f03c55bb8bddb42689740b5c72bb324e..9b426d8d5f778a483aab12fa82d451460b940274 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -13,11 +13,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
     fused_topk,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = [
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
-    "nm-testing/deepseekv2-lite",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "deepseek-ai/DeepSeek-V2-Lite",
     "ibm-granite/granite-3.0-1b-a400m",
     "ibm-granite/granite-3.0-3b-a800m",
 ]
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 69978ec6b23e94ce21d8f3dfb633317b3803e6ef..6fa5c248670e32cdd672e45e016d3c47d9689577 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -7,7 +7,8 @@ import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 799b16999873f41ef39d3a302874602201e332e6..6715c9b548aa1adabb372fab1e4d562b48c6e112 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -18,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES
 
-from vllm.triton_utils import HAS_TRITON
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
+from vllm.triton_utils import HAS_TRITON, triton
 
 if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
+    from vllm.lora.ops.triton_ops import (  ## added fused_moe_lora
+        LoRAKernelMeta,
+        fused_moe_lora_expand,
+        fused_moe_lora_shrink,
+        lora_expand,
+        lora_shrink,
+    )
+    from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+        _LORA_PTR_DICT,  ## added _LORA_PTR_DICT for fused_moe_lora
+    )
     from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-
-from vllm.utils import FlexibleArgumentParser
+from vllm import _custom_ops as ops
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import round_up
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
@@ -58,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4]
 DEFAULT_SORT_BY_LORA_IDS = [False, True]
 DEFAULT_SEQ_LENGTHS = [1]
 DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+DEFAULT_TOP_K_NUMS = [1]  # Added for MoE LoRA top_k
+DEFAULT_NUM_EXPERTS = [8]  # Added for MoE LoRA num_experts
 
 
 # Utilities
@@ -158,7 +172,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
     """
     Torch group gemm reference implementation to test correctness of
@@ -190,6 +204,11 @@ class OpType(Enum):
 
     LORA_SHRINK = auto()
     LORA_EXPAND = auto()
+    ## Adding support for fused moe lora
+    FUSED_MOE_LORA_GATE_UP_SHRINK = auto()  ## Gate/Up projection variant with shrink
+    FUSED_MOE_LORA_GATE_UP_EXPAND = auto()  ## Gate/Up projection variant with expand
+    FUSED_MOE_LORA_DOWN_SHRINK = auto()  ## Down projection variant with shrink
+    FUSED_MOE_LORA_DOWN_EXPAND = auto()  ## Down projection variant with expand
 
     @staticmethod
     def from_str(s: str) -> "OpType":
@@ -197,6 +216,15 @@ class OpType(Enum):
             return OpType.LORA_SHRINK
         if s.lower() == "lora_expand":
             return OpType.LORA_EXPAND
+        # Adding support for fused moe lora, both in gate_up and down
+        if s.lower() == "fused_moe_lora_gate_up_shrink":  ## Gate/Up variant with shrink
+            return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK
+        if s.lower() == "fused_moe_lora_gate_up_expand":  ## Gate/Up variant with expand
+            return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND
+        if s.lower() == "fused_moe_lora_down_shrink":  ## Down variant with shrink
+            return OpType.FUSED_MOE_LORA_DOWN_SHRINK
+        if s.lower() == "fused_moe_lora_down_expand":  ## Down variant with expand
+            return OpType.FUSED_MOE_LORA_DOWN_EXPAND
         raise ValueError(f"Unrecognized str {s} to convert to OpType")
 
     def is_shrink_fn(self) -> bool:
@@ -205,19 +233,56 @@ class OpType(Enum):
     def is_expand_fn(self) -> bool:
         return self in [OpType.LORA_EXPAND]
 
+    def is_fused_moe_lora_fn(self) -> bool:  ## adding for fused MoE LoRA
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_gate_up_fn(
+        self,
+    ) -> bool:  ## adding for fused MoE LoRA Gate/Up
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+        ]
+
+    def is_fused_moe_lora_down_fn(self) -> bool:  ## adding for fused MoE LoRA Down
+        return self in [
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_shrink_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]
+
+    def is_fused_moe_lora_expand_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
     def num_slices(self) -> list[int]:
+        if self.is_fused_moe_lora_gate_up_fn():
+            return [2]
+        elif self.is_fused_moe_lora_down_fn():
+            return [1]
         return [1, 2, 3]
 
     def mkn(
         self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
     ) -> tuple[int, int, int]:
         num_tokens = batch_size * seq_length
-        if self.is_shrink_fn():
+        if self.is_shrink_fn() or self.is_fused_moe_lora_fn():
             m = num_tokens
             k = hidden_size
             n = lora_rank
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
             m = num_tokens
             k = lora_rank
             n = hidden_size
@@ -231,9 +296,36 @@ class OpType(Enum):
         """
         if self.is_shrink_fn():
             return op_dtype, op_dtype, torch.float32
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
             return torch.float32, op_dtype, op_dtype
+        else:
+            assert self.is_fused_moe_lora_fn()
+            return op_dtype, op_dtype, op_dtype
+
+    def matmul_shapes_fused_moe_lora(
+        self,
+        m: int,
+        n: int,
+        k: int,
+        num_loras: int,
+        num_slices: int,
+        top_k_num: int,
+        num_experts: int,
+    ) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]:
+        if self.is_fused_moe_lora_shrink_fn():
+            input_shape = (
+                (m * top_k_num, n)
+                if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+                else (m, n)
+            )
+            output_shape = (num_slices, m, top_k_num, k)
+            weight_shape = (num_loras, num_experts, k, n)
+        else:
+            assert self.is_fused_moe_lora_expand_fn()
+            input_shape = (num_slices, m, top_k_num, k)
+            output_shape = (m, top_k_num, n * num_slices)
+            weight_shape = (num_loras, num_experts, n, k)
+        return (input_shape, weight_shape, output_shape)
 
     def matmul_shapes(
         self,
@@ -243,6 +335,8 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
+        top_k_num: int | None = None,
+        num_experts: int | None = None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
@@ -257,6 +351,16 @@ class OpType(Enum):
         if self in [OpType.LORA_EXPAND]:
             # LoRA expand kernels support num_slices inherently in the kernel
             return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self.is_fused_moe_lora_fn():
+            return self.matmul_shapes_fused_moe_lora(
+                m,
+                k,
+                n,
+                num_loras,
+                num_slices,
+                top_k_num,
+                num_experts,
+            )
         raise ValueError(f"Unrecognized op_type {self}")
 
     def bench_fn(self) -> Callable:
@@ -264,6 +368,16 @@ class OpType(Enum):
             return lora_shrink
         if self == OpType.LORA_EXPAND:
             return lora_expand
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]:
+            return fused_moe_lora_shrink
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]:
+            return fused_moe_lora_expand
 
         raise ValueError(f"Unrecognized optype {self}")
 
@@ -316,8 +430,10 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_experts: int | None = None  # num_experts for MoE based ops
+    top_k_num: int | None = None  # top_k for MoE based ops
+    num_slices: int | None = None  # num_slices for slice based ops
 
     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)
@@ -372,6 +488,11 @@ class BenchmarkTensors:
             f"{dtype_to_str(self.output.dtype)}"
         )
 
+    def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType):
+        return (
+            size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size
+        )
+
     @staticmethod
     def make(
         ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
@@ -384,6 +505,8 @@ class BenchmarkTensors:
             ctx.lora_rank,
             ctx.num_loras,
             ctx.num_slices,
+            ctx.top_k_num,
+            ctx.num_experts,
         )
         a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
         input_tensor, lora_weights, output_tensor = make_rand_tensors(
@@ -431,17 +554,27 @@ class BenchmarkTensors:
             prompt_lora_indices_tensor,
         )
 
-    def sanity_check(self) -> None:
+    def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None:
         """
         Fails asserts when non-conformality is detected.
         """
-        num_tokens = self.input.shape[-2]
+        num_tokens = (
+            self.input.shape[1]
+            if op_type.is_fused_moe_lora_expand_fn()
+            else self.input.shape[-2]
+        )
         # check metadata tensors
-        assert torch.sum(self.seq_lens) == num_tokens
+        ## In down shrink case, each token is repeated top_k_num times
+        assert num_tokens == self.get_num_tokens(
+            torch.sum(self.seq_lens), ctx.top_k_num, op_type
+        ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}"
         num_seqs = self.seq_lens.shape[0]
         # assert self.seq_start_loc.shape[0] == num_seqs
+        ## In down shrink case, each prompt corresponds to top_k_num sequences
         assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
+        assert self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )
 
     def to_device(self, device: str):
         """
@@ -470,21 +603,111 @@ class BenchmarkTensors:
                 to_device(field) if field_name != "no_lora_flag_cpu" else field,
             )
 
-    def metadata(self) -> tuple[int, int, int]:
+    def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]:
         """
         Return num_seqs, num_tokens and max_seq_len
         """
         num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
+        num_tokens = self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )
         max_seq_len = torch.max(self.seq_lens).item()
         num_slices = len(self.lora_weights_lst)
         return num_seqs, num_tokens, max_seq_len, num_slices
 
-    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
-        self.sanity_check()
+    def fused_moe_lora_data_prepare(
+        self,
+        block_size: int,
+        token_lora_mapping: torch.Tensor,
+        ctx: BenchmarkContext,
+    ):
+        def moe_lora_align_block_size(
+            topk_ids: torch.Tensor,
+            token_lora_mapping: torch.Tensor,
+            block_size: int,
+            num_experts: int,
+            max_loras: int,
+            expert_map: torch.Tensor | None = None,
+            pad_sorted_ids: bool = False,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            """
+            Aligns tokens and experts into block-sized chunks for LoRA-based
+            mixture-of-experts (MoE) execution.
+            """
+            max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+            if pad_sorted_ids:
+                max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+            sorted_ids = torch.empty(
+                (max_loras * max_num_tokens_padded,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+            # Expert ids must be set default to -1 to prevent a blank block
+            expert_ids = torch.empty(
+                (max_loras * max_num_m_blocks,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            num_tokens_post_pad = torch.empty(
+                (max_loras), dtype=torch.int32, device=topk_ids.device
+            )
+
+            ops.moe_lora_align_block_size(
+                topk_ids,
+                token_lora_mapping,
+                num_experts,
+                block_size,
+                max_loras,
+                max_num_tokens_padded,
+                max_num_m_blocks,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+            if expert_map is not None:
+                expert_ids = expert_map[expert_ids]
+
+            return sorted_ids, expert_ids, num_tokens_post_pad
+
+        num_tokens = ctx.batch_size
+        curr_topk_ids = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+        topk_weights = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+
+        (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = (
+            moe_lora_align_block_size(
+                topk_ids=curr_topk_ids,
+                token_lora_mapping=token_lora_mapping,
+                block_size=block_size,
+                num_experts=ctx.num_experts,
+                max_loras=ctx.num_loras,
+            )
+        )
+
+        sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1)
+        expert_ids = expert_ids_lora.view(ctx.num_loras, -1)
+        num_tokens_post_padded = num_tokens_post_padded_lora
+        return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded)
+
+    def as_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)
 
-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
 
         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
@@ -519,11 +742,13 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
-    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        self.sanity_check()
+    def as_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)
 
-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
 
         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
@@ -560,22 +785,177 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
+    def as_fused_moe_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+        # Expected input shape : [num_tokens, hidden_size] for gate_up
+        # Expected input shape : [top_k_num * num_tokens, hidden_size] for down
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == hidden_size
+        lora_rank = lw_shape[-2]
+        # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(o_shape) == 4
+        assert (
+            o_shape
+            == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank)
+            if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+            else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank)
+        )
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "qcurr_hidden_states": self.input,
+            "lora_a_stacked": self.lora_weights_lst,
+            "a_intermediate_cache1": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "shrink_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "shrink_num_warps": kernel_config["NUM_WARPS"],
+            "shrink_num_stages": kernel_config["NUM_STAGES"],
+            "shrink_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def as_fused_moe_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+
+        # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(i_shape) == 4
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[-1]
+        # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == lora_rank
+        hidden_size = lw_shape[-2]
+        # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices]
+        assert len(o_shape) == 3
+        assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices)
+
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "a_intermediate_cache1": self.input,
+            "lora_b_stacked": self.lora_weights_lst,
+            "output": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "max_lora_rank": lora_rank,
+            "w1_output_dim_size": lw_shape[2],
+            "expand_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "expand_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "expand_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "expand_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "expand_num_warps": kernel_config["NUM_WARPS"],
+            "expand_num_stages": kernel_config["NUM_STAGES"],
+            "expand_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
     def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: Optional[bool] = None
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None
     ) -> dict[str, Any]:
-        if op_type.is_shrink_fn():
+        if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
             assert add_inputs is None
         else:
             assert add_inputs is not None
 
         if op_type == OpType.LORA_SHRINK:
-            return self.as_lora_shrink_kwargs()
+            return self.as_lora_shrink_kwargs(ctx, op_type)
         if op_type == OpType.LORA_EXPAND:
-            return self.as_lora_expand_kwargs(add_inputs)
+            return self.as_lora_expand_kwargs(ctx, op_type, add_inputs)
+        if op_type.is_fused_moe_lora_shrink_fn():
+            return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type)
+        if op_type.is_fused_moe_lora_expand_fn():
+            return self.as_fused_moe_lora_expand_kwargs(ctx, op_type)
         raise ValueError(f"Unrecognized optype {self}")
 
     def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+        self, op_type: OpType, expand_fn_add_inputs: bool | None
     ) -> bool:
         """
         Test correctness of op_type implementation against a grouped gemm
@@ -611,12 +991,12 @@ def bench_optype(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: Optional[bool] = None,
+    cuda_graph_nops: int | None = None,
+    expand_fn_add_inputs: bool | None = None,
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
-    if op_type.is_shrink_fn():
+    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
         assert expand_fn_add_inputs is None
     else:
         assert expand_fn_add_inputs is not None
@@ -626,23 +1006,30 @@ def bench_optype(
         BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
     ]
     for bt in bench_tensors:
-        bt.sanity_check()
+        bt.sanity_check(ctx, op_type)
 
     # Test correctness of our implementation.
     if test_correctness:
+        assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], (
+            f"Correctness testing is not supported for {op_type.name}."
+        )
         assert all(
-            [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
+            [
+                bt.test_correctness(ctx, op_type, expand_fn_add_inputs)
+                for bt in bench_tensors
+            ]
         )
 
     # BenchmarkTensors -> dict (kwargs)
     kwargs_list = [
-        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs)
         for bt in bench_tensors
     ]
 
     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
+    _LORA_PTR_DICT.clear()
     # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
@@ -679,7 +1066,7 @@ def bench_torch_mm(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
+    cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
     """
     Benchmark basic torch.mm as a roofline.
@@ -744,7 +1131,7 @@ def use_cuda_graph_recommendation() -> str:
             """
 
 
-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
     compare = TBenchmark.Compare(timers)
     compare.print()
 
@@ -792,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
 
                     # Benchmark bench_op
                     expand_fn_add_inputs = (
-                        [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+                        [None]
+                        if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn()
+                        else args.expand_fn_add_inputs
                     )
                     for add_input_arg in expand_fn_add_inputs:
                         seq_len_timers.append(
@@ -830,12 +1219,22 @@ def as_benchmark_contexts(
     hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
 ) -> list[BenchmarkContext]:
     ctxs: list[BenchmarkContext] = []
-    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+    for (
+        batch_size,
+        hidden_size,
+        lora_rank,
+        num_loras,
+        sort_by_lora_id,
+        top_k_num,
+        num_experts,
+    ) in product(  # noqa
         args.batch_sizes,
         list(hidden_sizes),
         lora_ranks,
         args.num_loras,
         args.sort_by_lora_id,
+        args.top_k_nums,
+        args.num_experts,
     ):
         ctxs.append(
             BenchmarkContext(
@@ -850,6 +1249,8 @@ def as_benchmark_contexts(
                 seq_length=None,
                 sort_by_lora_id=sort_by_lora_id,
                 dtype=args.dtype,
+                top_k_num=top_k_num,
+                num_experts=num_experts,
                 # To be filled based on the OpType to benchmark
                 num_slices=None,
             )
@@ -1011,6 +1412,22 @@ if __name__ == "__main__":
             ),
         )
 
+        p.add_argument(
+            "--top-k-nums",
+            nargs="+",
+            type=int,
+            default=DEFAULT_TOP_K_NUMS,
+            help="Top-K values for MoE LoRA operations",
+        )
+
+        p.add_argument(
+            "--num-experts",
+            nargs="+",
+            type=int,
+            default=DEFAULT_NUM_EXPERTS,
+            help="Number of experts for MoE LoRA operations",
+        )
+
     parser = FlexibleArgumentParser(
         description=f"""
 Benchmark LoRA kernels:
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 1b1c3b321cce44d61ec6fb3e0f97f22950956978..8787724d77cfb989ef0d401f5dcca58486e02cbe 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import pandas as pd
 import torch
@@ -34,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
@@ -63,23 +62,23 @@ class BenchmarkTensors:
     a: torch.Tensor
 
     w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None
 
 
 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(
 
 
 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
     m, n, k = shape
 
@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res
 
 
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
 
 
 def bench(
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 34cc45e94d76d930013c9d60ca8c5f78dd0b7254..12ca9214b1f9583fb22df68b9fc70f7872a58f23 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     sort_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 02c2db674d4b07f1a8871d1572959e5aac01940c..c99951aa278262e8c4aa1fabf1df2541e6c500cc 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
@@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16):
     num_warps_range = [1, 2, 4, 8]
     group_m_range = [1, 4, 8, 16, 32]
     num_stage_range = [2]
-    waves_per_eu_range = [0]
+    waves_per_eu_range = [0, 1, 2, 4]
     matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
     kpack_range = [1, 2] if use_fp16 else []
 
@@ -579,19 +579,23 @@ def main(args: argparse.Namespace):
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] == "JambaForCausalLM":
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "DeepseekV2ForCausalLM",
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
+        "NemotronHForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "Qwen2MoeForCausalLM",
         "Qwen3MoeForCausalLM",
@@ -600,10 +604,23 @@ def main(args: argparse.Namespace):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
+    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+        text_config = config.get_text_config()
+        E = text_config.num_experts
+        topk = text_config.num_experts_per_tok
+        intermediate_size = text_config.moe_intermediate_size
+        hidden_size = text_config.hidden_size
     elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
         E = config.num_experts
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
+        hidden_size = config.hidden_size
+    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+        E = config.thinker_config.text_config.num_experts
+        topk = config.thinker_config.text_config.num_experts_per_tok
+        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
+        hidden_size = config.thinker_config.text_config.hidden_size
     else:
         # Support for llama4
         config = config.get_text_config()
@@ -611,6 +628,7 @@ def main(args: argparse.Namespace):
         E = config.num_local_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     enable_ep = bool(args.enable_expert_parallel)
     if enable_ep:
         ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -619,8 +637,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 04d2205aa3722c5fe056c204f7cc0c28d46d6d6b..efa5a7386027e0f140dc6c79659c61c86c974f4d 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 )
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
 
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index b9147361708fd09a8d2da852eab08fbc68aa3bfd..cb848d2bf579e8185f2c63192ae64b57204f2a15 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -39,7 +39,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 7e0376c18ecc79690ca49e3995258c5e749bbac7..46ab2a5fe5e98fa05eead6989d924af102e79771 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -3,16 +3,15 @@
 
 import random
 import time
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )
 
@@ -37,7 +36,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
     current_platform.seed_everything(seed)
 
diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py
index 1ccb5e08b3d57b0d66ed72061941830640cb32d8..bdc1eb733084e165ae73c9fb5c6b4d5185056fe2 100644
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -3,8 +3,8 @@
 
 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch
 
 import torch
diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py
deleted file mode 100644
index 9ac8f5e6594e4562981576c41a693d8825f90e77..0000000000000000000000000000000000000000
--- a/benchmarks/kernels/benchmark_polynorm.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 6ab26f5f1adf73ccdc68c0d5f20d622d0c09b7ba..3c2ac9128947af0f33aad790b811bc3f56afae10 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -7,7 +7,8 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d3aef0c630b284feb8a8b4e3e35cdfa62237f39
--- /dev/null
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    create_kv_caches_with_random,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    num_iters: int,
+    benchmark_mode: str,
+    device: str = "cuda",
+) -> float:
+    """Return latency (seconds) for given num_tokens."""
+
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache.
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError("num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+    key_caches, value_caches = create_kv_caches_with_random(
+        num_blocks,
+        block_size,
+        1,  # num_layers
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    function_under_test = lambda: ops.reshape_and_cache(
+        key,  # noqa: F821
+        value,  # noqa: F821
+        key_cache,  # noqa: F821
+        value_cache,  # noqa: F821
+        slot_mapping,  # noqa: F821
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
+
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
+    def run_cuda_benchmark(n_iters: int) -> float:
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        torch.cuda.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            function_under_test()
+            torch.cuda.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up
+    run_cuda_benchmark(3)
+
+    lat = run_cuda_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.cuda.empty_cache()
+
+    return lat
+
+
+def main(args):
+    rows = []
+    for exp in range(1, 17):
+        n_tok = 2**exp
+        lat = run_benchmark(
+            num_tokens=n_tok,
+            num_heads=args.num_heads,
+            head_size=args.head_size,
+            block_size=args.block_size,
+            num_blocks=args.num_blocks,
+            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+            kv_cache_dtype=args.kv_cache_dtype,
+            num_iters=args.iters,
+            benchmark_mode=args.mode,
+            device="cuda",
+        )
+        rows.append([n_tok, lat * 1e6])  # convert to microseconds
+
+    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
+    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+
+    parser.add_argument("--num-heads", type=int, default=128)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--num-blocks", type=int, default=128 * 128)
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["half", "bfloat16", "float"],
+        default="bfloat16",
+    )
+
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+    )
+
+    parser.add_argument("--iters", type=int, default=200)
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index 0aace571064a07b4f347c7158d8a6b4e926aee87..12f17ea575d9448b1cd7ad3e900e4a86a7594703 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
@@ -14,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )
 
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index 4cf633a81358d80202fa383d7312dac4a818ea62..d8d7f5bcf9dada3b499e133ac7d7b262583fb615 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from typing import Optional, Union
 
 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:
@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index b81baf17a8c674edd72fe9a6397a0b027a8d33b9..074b7a440b612215333401ce89701ed24198033c 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,98 +1,76 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from itertools import accumulate
-from typing import Optional
+import itertools
 
-import nvtx
 import torch
 
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
-from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.triton_utils import triton
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+batch_size_range = [2**i for i in range(0, 8, 2)]
+seq_len_range = [2**i for i in range(6, 10, 1)]
+num_heads_range = [32, 48]
+configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))
 
-def benchmark_rope_kernels_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: Optional[int],
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    # silulating serving 4 LoRAs
-    scaling_factors = [1, 2, 4, 8]
-    # batched RoPE can take multiple scaling factors
-    batched_rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-        {"rope_type": "linear", "factor": tuple(scaling_factors)},
+
+def get_benchmark(head_size, rotary_dim, is_neox_style, device):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len", "num_heads"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["torch", "flashinfer", "vllm"],
+            line_names=["PyTorch", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
+            args={},
+        )
     )
-    # non-batched RoPE takes only one scaling factor, we create multiple
-    # instances to simulate the same behavior
-    non_batched_ropes: list[RotaryEmbedding] = []
-    for scaling_factor in scaling_factors:
-        non_batched_ropes.append(
-            get_rope(
-                head_size,
-                rotary_dim,
-                max_position,
-                base,
-                is_neox_style,
-                {"rope_type": "linear", "factor": (scaling_factor,)},
-            )
+    def benchmark(batch_size, seq_len, num_heads, provider):
+        dtype = torch.bfloat16
+        max_position = 8192
+        base = 10000
+        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope = rope.to(dtype=dtype, device=device)
+        cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
+
+        positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
+        query = torch.randn(
+            (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
         )
+        key = torch.randn_like(query)
 
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
-    key = torch.randn_like(query)
+        quantiles = [0.5, 0.2, 0.8]
 
-    # create query offsets for batched RoPE, we concat multiple kv cache
-    # together and each query needs to find the right kv cache of its type
-    offset_map = torch.tensor(
-        list(
-            accumulate(
-                [0]
-                + [
-                    max_position * scaling_factor * 2
-                    for scaling_factor in scaling_factors[:-1]
-                ]
+        if provider == "torch":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_native(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
             )
-        )
-    )
-    query_types = torch.randint(
-        0, len(scaling_factors), (batch_size, seq_len), device=device
-    )
-    # map query types to offsets
-    query_offsets = offset_map[query_types]
-    # the kernel takes flattened offsets
-    flatten_offsets = query_offsets.flatten()
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: torch.ops.vllm.flashinfer_rotary_embedding(
+                    positions,
+                    query.clone(),
+                    key.clone(),
+                    head_size,
+                    cos_sin_cache,
+                    is_neox_style,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
 
-    # batched queries of the same type together for non-batched RoPE
-    queries = [query[query_types == i] for i in range(len(scaling_factors))]
-    keys = [key[query_types == i] for i in range(len(scaling_factors))]
-    packed_qkr = zip(queries, keys, non_batched_ropes)
-    # synchronize before start timing
-    torch.cuda.synchronize()
-    with nvtx.annotate("non-batched", color="yellow"):
-        for q, k, r in packed_qkr:
-            r.forward(positions, q, k)
-    torch.cuda.synchronize()
-    with nvtx.annotate("batched", color="green"):
-        batched_rope.forward(positions, query, key, flatten_offsets)
-    torch.cuda.synchronize()
+    return benchmark
 
 
 if __name__ == "__main__":
@@ -117,17 +95,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
     )
+    parser.add_argument("--save-path", type=str, default="./configs/rope/")
     args = parser.parse_args()
-    print(args)
 
-    benchmark_rope_kernels_multi_lora(
-        is_neox_style=args.is_neox_style,
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        num_heads=args.num_heads,
-        head_size=args.head_size,
-        rotary_dim=args.rotary_dim,
-        dtype=getattr(torch, args.dtype),
-        seed=args.seed,
-        device=args.device,
+    # Get the benchmark function
+    benchmark = get_benchmark(
+        args.head_size, args.rotary_dim, args.is_neox_style, args.device
     )
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
index 18c459c31d3f84d1ca7b7e31d00d50b521177e96..3e23c4cac059c04cf6a4153e9830cbec2ace36f0 100644
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -78,11 +78,11 @@ WEIGHT_SHAPES = {
 }
 
 WEIGHT_SHAPES_MOE = {
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
         [8, 2, 4096, 28672],
         [8, 2, 14336, 4096],
     ],
-    "nm-testing/deepseekv2-lite": [
+    "deepseek-ai/DeepSeek-V2-Lite": [
         [64, 6, 2048, 1408],
     ],
     "ibm-granite/granite-3.0-1b-a400m": [
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index c7a4066b39d70342db4e7bdc957e5616a6f32990..a5887aafd30d6ad9008c565dde608e9ec238fffb 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -1,5 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Comprehensive 3-way SiLU Benchmark Suite
+
+This benchmark compares three SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
 from collections.abc import Callable
 
 import matplotlib.pyplot as plt
@@ -7,7 +21,7 @@ import numpy as np
 import torch
 
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm_cuda,
+    persistent_masked_m_silu_mul_quant,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
@@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
     num_parallel_tokens,
     group_size: int = 128,
     eps: float = 1e-10,
+    expert_offsets: torch.Tensor = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
 
@@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
 
 
 # Parse generation strategies
-strategies = ["uniform", "max_t", "first_t"]
+strategies = ["random_imbalanced", "uniform", "max_t"]
 
 
 def benchmark(
@@ -195,15 +210,27 @@ def benchmark(
         current_platform.seed_everything(42 + seed_offset)
         y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
 
-        if gen_strategy == "uniform":
-            r = torch.rand(size=(E,), device="cuda")
+        if gen_strategy == "random_imbalanced":
+
+            def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
+                mean = total_tokens // n_e
+                min_max = mean // ratio
+                e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
+                e[0] = min_max
+                r = torch.rand(size=(E - 1,))
+                r /= r.sum()
+                r *= total_tokens - min_max
+                r = r.round().long()
+                e[1:] = r.to(device=device)
+                return e
+
+            tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
+        elif gen_strategy == "uniform":
+            r = torch.rand(size=(E,))
             r /= r.sum()
             r *= total_tokens
-            tokens_per_expert = r.int()
-            tokens_per_expert = torch.minimum(
-                tokens_per_expert,
-                torch.ones((E,), device=r.device, dtype=torch.int) * T,
-            )
+            r = r.round().long()
+            tokens_per_expert = r
         elif gen_strategy == "max_t":
             tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
             tokens_per_expert.fill_(total_tokens / E)
@@ -281,40 +308,34 @@ def benchmark(
 
 
 def create_comparison_plot(
-    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+    ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
 ):
-    """Create a comparison plot for a specific generation strategy"""
-    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+    fig, ax = plt.subplots(1, 1, figsize=(18, 6))
 
     # Configure x-axis positions
     x = np.arange(len(config_labels))
-    width = 0.35
+    width = 0.25
 
     # Execution Time plot (lower is better)
+    ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
     ax.bar(
-        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
-    )
-    ax.bar(
-        x + width / 2,
-        baseline_times,
-        width,
-        label="Baseline",
-        alpha=0.8,
-        color="orange",
+        x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
     )
 
-    # Add speedup labels over each bar pair
+    # Add speedup labels over each bar trio
     for i in range(len(x)):
-        speedup = ratio[i]
-        max_height = max(cuda_times[i], baseline_times[i])
+        triton_v2_speedup = ratios[i][1]  # triton/v2
+        max_height = max(silu_v2_times[i], triton_times[i])
+
+        # Triton/V2 speedup
         ax.text(
-            x[i],
+            x[i] + width / 2,
             max_height + max_height * 0.02,
-            f"{speedup:.2f}x",
+            f"{triton_v2_speedup:.2f}x",
             ha="center",
             va="bottom",
             fontweight="bold",
-            fontsize=9,
+            fontsize=8,
         )
 
     ax.set_xlabel("Configuration")
@@ -332,56 +353,75 @@ def create_comparison_plot(
 
 
 def create_combined_plot(all_results):
-    """Create a combined plot with all strategies in one PNG"""
     num_strategies = len(all_results)
-    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
 
     if num_strategies == 1:
         axes = [axes]
 
     for idx, (
         strategy_name,
-        ratio,
-        cuda_times,
-        baseline_times,
+        all_ratios,
+        all_silu_v2_results,
+        all_triton_results,
         config_labels,
+        config_x_axis,
     ) in enumerate(all_results):
         ax = axes[idx]
 
+        # Flatten the nested results to get bandwidth percentages for plotting
+        silu_v2_bandwidths = []
+        triton_bandwidths = []
+        flat_ratios = []
+
+        for config_results in all_silu_v2_results:
+            for result in config_results:
+                silu_v2_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_results in all_triton_results:
+            for result in config_results:
+                triton_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_ratios in all_ratios:
+            for ratio in config_ratios:
+                flat_ratios.append(ratio)
+
         # Configure x-axis positions
         x = np.arange(len(config_labels))
-        width = 0.35
+        width = 0.25
 
-        # Execution Time plot (lower is better)
+        # Bandwidth utilization plot (higher is better)
         ax.bar(
-            x - width / 2,
-            cuda_times,
+            x,
+            silu_v2_bandwidths,
             width,
-            label="CUDA Kernel",
+            label="SiLU V2 (CUDA)",
             alpha=0.8,
             color="blue",
         )
         ax.bar(
-            x + width / 2,
-            baseline_times,
+            x + width,
+            triton_bandwidths,
             width,
-            label="Baseline",
+            label="Triton Kernel",
             alpha=0.8,
-            color="orange",
+            color="green",
         )
 
-        # Add speedup labels over each bar pair
+        # Add speedup labels over each bar trio
         for i in range(len(x)):
-            speedup = ratio[i]
-            max_height = max(cuda_times[i], baseline_times[i])
+            triton_v2_speedup = flat_ratios[i]  # triton/v2
+            max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
+
+            # Triton/V2 speedup
             ax.text(
-                x[i],
+                x[i] + width / 2,
                 max_height + max_height * 0.02,
-                f"{speedup:.2f}x",
+                f"{triton_v2_speedup:.2f}x",
                 ha="center",
                 va="bottom",
                 fontweight="bold",
-                fontsize=9,
+                fontsize=8,
             )
 
         ax.set_xlabel("Configuration")
@@ -395,7 +435,7 @@ def create_combined_plot(all_results):
         ax.grid(True, alpha=0.3)
 
     plt.tight_layout()
-    filename = "../../silu_bench/silu_benchmark_combined.png"
+    filename = "silu_benchmark_combined_3way.png"
     plt.savefig(filename, dpi=300, bbox_inches="tight")
     plt.show()
 
@@ -405,7 +445,9 @@ def create_combined_plot(all_results):
 outer_dim = 7168
 configs = [
     # DeepSeekV3 Configs
+    # (1, 56, 7168),
     (8, 1024, 7168),
+    # (32, 56, 7168),
     # DeepSeekV3 Configs
     (32, 1024, 7168),
     # DeepSeekV3 Configs
@@ -417,6 +459,7 @@ num_warmups = 20
 
 strategy_descriptions = {
     "uniform": "Uniform Random",
+    "random_imbalanced": "Imbalanced Random",
     "max_t": "Even Assignment",
     "first_t": "experts[0] = T, experts[1:] = 0",
 }
@@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
     print(f"Testing strategy: {strategy_descriptions[strategy]}")
     print(f"{'=' * 60}")
 
-    # Collect benchmark data for both algorithms
+    # Collect benchmark data for all three algorithms
     config_labels = []
     config_x_axis = []
-    all_cuda_results = []
-    all_baseline_results = []
+    all_silu_v2_results = []
+    all_triton_results = []
     all_ratios = []
 
     for E, T, H in configs:
-        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+        total_tokens_config = []
+        for i in [8, 16, 32, 64, 128, 256, 512]:
+            if i <= T:
+                total_tokens_config.append(i * E)
         config_x_axis.append(total_tokens_config)
 
-        cuda_results = []
-        baseline_results = []
+        silu_v2_results = []
+        triton_results = []
         ratios = []
 
         for total_tokens in total_tokens_config:
             config_label = f"E={E},T={T},H={H},TT={total_tokens}"
             config_labels.append(config_label)
 
-            # CUDA kernel results
-            time_ms_cuda, gflops, gbps, perc = benchmark(
-                silu_mul_fp8_quant_deep_gemm_cuda,
+            # SiLU V2 (CUDA kernel) results
+            time_ms_silu_v2, gflops, gbps, perc = benchmark(
+                persistent_masked_m_silu_mul_quant,
                 E,
                 T,
                 H,
@@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
                 num_warmups=num_warmups,
                 gen_strategy=strategy,
             )
-            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+            silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
 
-            # Baseline results
+            # Triton kernel results
             time_ms_triton, gflops, gbps, perc = benchmark(
                 silu_mul_fp8_quant_deep_gemm_triton,
                 E,
@@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
                 num_warmups=num_warmups,
                 gen_strategy=strategy,
             )
-            baseline_results.append((time_ms_triton, gflops, gbps, perc))
-            ratios.append(time_ms_triton / time_ms_cuda)
+            triton_results.append((time_ms_triton, gflops, gbps, perc))
 
-            print(f"Completed: {config_label}")
-        all_cuda_results.append(cuda_results)
-        all_baseline_results.append(baseline_results)
+            # Calculate speedup ratios (triton baseline / implementation)
+            triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+            ratios.append(triton_v2_ratio)
+
+            print(
+                f"Completed: {config_label}:"
+                f" V2: {time_ms_silu_v2:.3f}ms,"
+                f" Triton: {time_ms_triton:.3f}ms"
+            )
+
+        all_silu_v2_results.append(silu_v2_results)
+        all_triton_results.append(triton_results)
         all_ratios.append(ratios)
 
     # Store results for combined plotting
@@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
         (
             strategy_descriptions[strategy],
             all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
             config_labels,
             config_x_axis,
         )
@@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):
 
     # Print summary table for this strategy
     print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
-    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
-    print("-" * 60)
+    print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+    print("-" * 90)
 
     for i, (E, T, H) in enumerate(configs):
-        speedup = baseline_results[i][0] / cuda_results[i][0]
+        # Get the first result for each config (simplifying for summary)
+        v2_time = silu_v2_results[i][0]
+        triton_time = triton_results[i][0]
+        triton_v2_speedup = triton_time / v2_time
         config_label = f"E={E:3d},T={T:4d},H={H:4d}"
         print(
-            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
-            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+            f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+            f"{triton_v2_speedup:8.2f}x"
         )
 
 
@@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
     num_strategies = len(all_results)
     num_configs = len(configs)
 
-    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
     fig, axs = plt.subplots(
-        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+        num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
     )
 
     # Add main title to the entire figure
     fig.suptitle(
-        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
-        fontsize=16,
+        "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+        fontsize=18,
         fontweight="bold",
         y=0.98,
     )
@@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
         (
             strategy_name,
             all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
             config_labels,
             config_x_axis,
         ) = result
@@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
             ratios = all_ratios[config_idx]
             total_tokens_values = config_x_axis[config_idx]
 
-            # Extract CUDA and Triton bandwidth percentages
-            cuda_bandwidth_percentages = [
-                result[3] for result in all_cuda_results[config_idx]
+            # Extract speedup ratios
+            triton_v2_ratios = [ratio for ratio in ratios]
+
+            # Extract bandwidth percentages for all implementations
+            v2_bandwidth_percentages = [
+                result[3] for result in all_silu_v2_results[config_idx]
             ]
             triton_bandwidth_percentages = [
-                result[3] for result in all_baseline_results[config_idx]
+                result[3] for result in all_triton_results[config_idx]
             ]
 
             # Plot speedup ratios vs total tokens (left plot)
             ax_speedup.plot(
-                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+                total_tokens_values,
+                triton_v2_ratios,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton/V2 Speedup",
             )
             ax_speedup.set_title(
-                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+                f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
                 fontsize=12,
                 fontweight="bold",
             )
             ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
             ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.legend(prop={"weight": "bold"})
             ax_speedup.grid(True, alpha=0.3)
 
+            # Plot bandwidth utilization (right plot)
             ax_bandwidth.plot(
                 total_tokens_values,
-                cuda_bandwidth_percentages,
-                "ro-",
+                v2_bandwidth_percentages,
+                "o-",
                 linewidth=3,
                 markersize=8,
-                label="CUDA",
+                label="SiLU V2",
+                color="blue",
             )
             ax_bandwidth.plot(
                 total_tokens_values,
                 triton_bandwidth_percentages,
-                "go-",
+                "o-",
                 linewidth=3,
                 markersize=8,
                 label="Triton",
+                color="green",
             )
             ax_bandwidth.set_title(
                 f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
                 for label in ax.get_xticklabels() + ax.get_yticklabels():
                     label.set_fontweight("bold")
 
-            # Add value labels on speedup points
-            for x, y in zip(total_tokens_values, ratios):
+            # Add value labels on Triton/V2 speedup points
+            for x, y in zip(total_tokens_values, triton_v2_ratios):
                 ax_speedup.annotate(
                     f"{y:.2f}x",
                     (x, y),
                     textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=10,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
-                )
-
-            # Add value labels on CUDA bandwidth points
-            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=9,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
-                )
-
-            # Add value labels on Triton bandwidth points
-            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
                     xytext=(0, -15),
                     ha="center",
                     fontsize=9,
@@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):
 
     plt.tight_layout()
     plt.subplots_adjust(top=0.93)  # Make room for main title
-    filename = "silu_benchmark_total_tokens.png"
+    filename = "silu_benchmark_total_tokens_3way.png"
     plt.savefig(filename, dpi=300, bbox_inches="tight")
     plt.show()
 
     return filename
 
 
-# Create combined plot with all strategies
-combined_plot_filename = create_total_tokens_plot(all_results)
+# Create comprehensive 3-way comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)
 
-print(f"\n{'=' * 60}")
-print("Benchmark Complete!")
-print(f"Generated combined plot: {combined_plot_filename}")
-print(f"{'=' * 60}")
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 6ddab46214577d010498cf24f3774f27f7368475..29ce18234dfa03875e32a7644be014ebf0289f0e 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -4,12 +4,11 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch
 
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 131df74c7de1b4c196b75e074cdd3480b978db2f..2a25d03748112ba0d3fa49b7a7449c7fe215049f 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -4,12 +4,11 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch
 
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index c6c8e0b0b936bcc3c78e78cae111421ba0805f4a..ab54f81985bc25ba06d820d486852a5c3b7fe4d1 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -14,11 +14,11 @@ import torch
 from tqdm import tqdm
 
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_block_fp8_matmul,
+    _w8a8_triton_block_scaled_mm,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 mp.set_start_method("spawn", force=True)
 
@@ -83,7 +83,7 @@ def w8a8_block_matmul(
         )
 
     if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_block_fp8_matmul
+        kernel = _w8a8_triton_block_scaled_mm
     else:
         raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
 
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index db2398fc40a49dcf9b590d45e7ae6d4d794f0a0d..ba31bc563829858ac11aad81371133054406ee29 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# fmt: off
 # ruff: noqa: E501
 import time
 
@@ -9,7 +8,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import (
@@ -20,19 +19,21 @@ from vllm.utils.deep_gemm import (
 )
 
 
-def benchmark_shape(m: int,
-                    n: int,
-                    k: int,
-                    warmup: int = 100,
-                    repeat: int = 10000,
-                    verbose: bool = False) -> dict:
+def benchmark_shape(
+    m: int,
+    n: int,
+    k: int,
+    warmup: int = 100,
+    repeat: int = 10000,
+    verbose: bool = False,
+) -> dict:
     """Benchmark all implementations for a specific (m, n, k) shape."""
     if verbose:
         print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
 
     # Create test tensors
-    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
-    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
 
     # Reference result in BF16
     torch.cuda.synchronize()
@@ -49,34 +50,39 @@ def benchmark_shape(m: int,
     # Pre-quantize A for all implementations
     A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
     A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True)
+        A, block_size[1], column_major_scales=True
+    )
 
     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
-                                       (B_deepgemm, B_scale_deepgemm),
-                                       C_deepgemm)
+        fp8_gemm_nt(
+            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+        )
         return C_deepgemm
 
     # === vLLM Triton Implementation ===
     def vllm_triton_gemm():
-        return w8a8_block_fp8_matmul(A_vllm,
-                                     B_vllm,
-                                     A_scale_vllm,
-                                     B_scale_vllm,
-                                     block_size,
-                                     output_dtype=torch.bfloat16)
+        return w8a8_triton_block_scaled_mm(
+            A_vllm,
+            B_vllm,
+            A_scale_vllm,
+            B_scale_vllm,
+            block_size,
+            output_dtype=torch.bfloat16,
+        )
 
     # === vLLM CUTLASS Implementation ===
     def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(A_vllm_cutlass,
-                                     B_vllm.T,
-                                     scale_a=A_scale_vllm_cutlass,
-                                     scale_b=B_scale_vllm.T,
-                                     out_dtype=torch.bfloat16)
+        return ops.cutlass_scaled_mm(
+            A_vllm_cutlass,
+            B_vllm.T,
+            scale_a=A_scale_vllm_cutlass,
+            scale_b=B_scale_vllm.T,
+            out_dtype=torch.bfloat16,
+        )
 
     # Run correctness check first
     if verbose:
@@ -93,26 +99,23 @@ def benchmark_shape(m: int,
         print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
         print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
         print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
-        print("vLLM Triton vs DeepGEMM difference: "
-              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
-        print("vLLM CUTLASS vs DeepGEMM difference: "
-              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
+        print(
+            "vLLM Triton vs DeepGEMM difference: "
+            f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
+        )
+        print(
+            "vLLM CUTLASS vs DeepGEMM difference: "
+            f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
+        )
 
     # Benchmark implementations
     implementations = {
         "DeepGEMM": deepgemm_gemm,
         "vLLM Triton": vllm_triton_gemm,
-        "vLLM CUTLASS": vllm_cutlass_gemm
+        "vLLM CUTLASS": vllm_cutlass_gemm,
     }
 
-    benchmark_results = {
-        "shape": {
-            "m": m,
-            "n": n,
-            "k": k
-        },
-        "implementations": {}
-    }
+    benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
 
     for name, func in implementations.items():
         # Warmup
@@ -140,38 +143,36 @@ def benchmark_shape(m: int,
             "tflops": tflops,
             "gb_s": gb_s,
             "diff": {
-                "DeepGEMM":
-                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
-                "Reference":
-                deepgemm_diff if name == "DeepGEMM" else
-                (vllm_triton_diff
-                 if name == "vLLM Triton" else vllm_cutlass_diff)
-            }
+                "DeepGEMM": 0.0
+                if name == "DeepGEMM"
+                else calc_diff(func(), C_deepgemm),
+                "Reference": deepgemm_diff
+                if name == "DeepGEMM"
+                else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
+            },
         }
 
         if verbose:
-            print(
-                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
-            )
+            print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
 
     # Calculate speedups
     baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
     for name, data in benchmark_results["implementations"].items():
         if name != "DeepGEMM":
             speedup = baseline / data["time_ms"]
-            benchmark_results["implementations"][name][
-                "speedup_vs_deepgemm"] = speedup
+            benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
             if verbose:
-                print(f"DeepGEMM is {1/speedup:.2f}x "
-                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
+                print(
+                    f"DeepGEMM is {1 / speedup:.2f}x "
+                    f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
+                )
 
-    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
-        "time_ms"]
-    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
-        "time_ms"]
+    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
+    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
     cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
-    benchmark_results["implementations"]["vLLM CUTLASS"][
-        "speedup_vs_triton"] = cutlass_vs_triton
+    benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
+        cutlass_vs_triton
+    )
     if verbose:
         print(
             f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@@ -183,8 +184,7 @@ def benchmark_shape(m: int,
 
 def format_table_row(values, widths):
     """Format a row with specified column widths."""
-    return "| " + " | ".join(f"{val:{w}}"
-                             for val, w in zip(values, widths)) + " |"
+    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
 
 
 def print_table(headers, rows, title=None):
@@ -292,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
     for result in all_results:
         shape = result["shape"]
         impl_data = result["implementations"]["DeepGEMM"]
-        deepgemm_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
-        ])
+        deepgemm_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+            ]
+        )
 
-    print_table(deepgemm_headers,
-                deepgemm_rows,
-                title="DeepGEMM Implementation:")
+    print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
 
     # Print vLLM Triton table
-    triton_headers = [
-        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
-    ]
+    triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
     triton_rows = []
     for result in all_results:
         shape = result["shape"]
         impl_data = result["implementations"]["vLLM Triton"]
         speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
-        triton_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-            format_speedup(speedup)
-        ])
+        triton_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+                format_speedup(speedup),
+            ]
+        )
 
-    print_table(triton_headers,
-                triton_rows,
-                title="vLLM Triton Implementation:")
+    print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
 
     # Print vLLM CUTLASS table
     cutlass_headers = [
-        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
-        "vs Triton"
+        "m",
+        "n",
+        "k",
+        "Time (μs)",
+        "TFLOPS",
+        "GB/s",
+        "vs DeepGEMM",
+        "vs Triton",
     ]
     cutlass_rows = []
     for result in all_results:
@@ -331,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
         impl_data = result["implementations"]["vLLM CUTLASS"]
         vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
         vs_triton = impl_data.get("speedup_vs_triton", 1.0)
-        cutlass_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-            format_speedup(vs_deepgemm),
-            format_speedup(vs_triton)
-        ])
+        cutlass_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+                format_speedup(vs_deepgemm),
+                format_speedup(vs_triton),
+            ]
+        )
 
-    print_table(cutlass_headers,
-                cutlass_rows,
-                title="vLLM CUTLASS Implementation:")
+    print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
 
     # Calculate and print averages
     print("\n===== AVERAGE PERFORMANCE =====")
 
     implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
     avg_metrics = {
-        impl: {
-            "tflops": 0,
-            "gb_s": 0,
-            "time_ms": 0
-        }
-        for impl in implementations
+        impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
     }
 
     for result in all_results:
@@ -370,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
         avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
         avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
         avg_time = avg_metrics[impl]["time_ms"] / num_shapes
-        avg_rows.append([
-            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
-        ])
+        avg_rows.append(
+            [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
+        )
 
     print_table(avg_headers, avg_rows)
 
@@ -380,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
     avg_speedups = {
         "DeepGEMM vs vLLM Triton": 0,
         "DeepGEMM vs vLLM CUTLASS": 0,
-        "vLLM CUTLASS vs vLLM Triton": 0
+        "vLLM CUTLASS vs vLLM Triton": 0,
     }
 
     for result in all_results:
         deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
         vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
-        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
-            "time_ms"]
+        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
 
-        avg_speedups[
-            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
-        avg_speedups[
-            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
-        avg_speedups[
-            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
+        avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+        avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+        avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
+            vllm_triton_time / vllm_cutlass_time
+        )
 
     print("\n===== AVERAGE SPEEDUPS =====")
     speedup_headers = ["Comparison", "Speedup"]
@@ -412,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
 
     for result in all_results:
         for impl in implementations:
-            avg_diff[impl] += result["implementations"][impl]["diff"][
-                "Reference"]
+            avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
 
     diff_headers = ["Implementation", "Avg Diff vs Reference"]
     diff_rows = []
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index 9a4da0ef5a85d0df2178d5e990ad177e3f76bf37..6964a3d3e0824d6ec93d6dff012b79cc56f7433e 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -11,7 +11,7 @@ import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement
 
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
index 4bbb36bb4359259944310adb8e1047a02f126dea..a9af811bbe9ca9e0b0f66c493f27bcc890dc3515 100644
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ class Bench:
 
     def __init__(
         self,
-        cuda_graph_params: Optional[CudaGraphBenchParams],
+        cuda_graph_params: CudaGraphBenchParams | None,
         label: str,
         sub_label: str,
         description: str,
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 67b937930d58c2c28043eed0259b7a7b85e3f2d0..8cb8a2f386a9715c06617ec6afafc79dca3cec2f 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -11,6 +11,7 @@ from bench_utils import (
     Color,
     logger,
 )
+from tqdm import tqdm
 from transformers import AutoTokenizer  # type: ignore
 
 # Conversation ID is a string (e.g: "UzTK34D")
@@ -35,8 +36,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
     def __init__(
         self,
-        min_val: Union[int, float],
-        max_val: Union[int, float],
+        min_val: int | float,
+        max_val: int | float,
         is_integer: bool = True,
     ) -> None:
         self.min_val = min_val
@@ -56,7 +57,7 @@ class UniformDistribution(Distribution):
 
 
 class ConstantDistribution(Distribution):
-    def __init__(self, value: Union[int, float]) -> None:
+    def __init__(self, value: int | float) -> None:
         self.value = value
         self.max_val = value
 
@@ -68,7 +69,7 @@ class ConstantDistribution(Distribution):
 
 
 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val
 
@@ -83,7 +84,7 @@ class ZipfDistribution(Distribution):
 
 
 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val
 
@@ -100,11 +101,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
     def __init__(
         self,
-        mean: Optional[float] = None,
-        sigma: Optional[float] = None,
-        average: Optional[int] = None,
-        median_ratio: Optional[float] = None,
-        max_val: Optional[int] = None,
+        mean: float | None = None,
+        sigma: float | None = None,
+        average: int | None = None,
+        median_ratio: float | None = None,
+        max_val: int | None = None,
     ) -> None:
         self.average = average
         self.median_ratio = median_ratio
@@ -417,6 +418,10 @@ def generate_conversations(
             data = file.read()
             tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
             list_of_tokens.extend(tokens_in_file)
+        logger.info(
+            f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
+            f"total tokens so far: {len(list_of_tokens)}"
+        )
 
     conversations: ConversationsMap = {}
     conv_id = 0
@@ -449,18 +454,25 @@ def generate_conversations(
         )
         base_offset += common_prefix_tokens
 
-    for conv_id in range(args.num_conversations):
+    for conv_id in tqdm(
+        range(args.num_conversations),
+        total=args.num_conversations,
+        desc="Generating conversations",
+        unit="conv",
+    ):
         # Generate a single conversation
         messages: MessagesList = []
 
         nturns = turn_count[conv_id]
 
         # User prompt token count per turn (with lower limit)
-        input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
+        input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
         input_token_count = np.maximum(input_token_count, base_prompt_token_count)
 
         # Assistant answer token count per turn (with lower limit)
-        output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
+        output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
+            int
+        )
         output_token_count = np.maximum(output_token_count, 1)
 
         user_turn = True
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 66d85eaf513125cc5431dff9256da8a621a2a7c4..772d685ad90ff8d93822c3fca9b7fc1796a06ab1 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple
 
 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@@ -46,15 +46,16 @@ class ConversationSampling(str, Enum):
 
 class ClientArgs(NamedTuple):
     seed: int
-    max_num_requests: Optional[int]
+    max_num_requests: int | None
     skip_first_turn: bool
-    max_turns: Optional[int]
+    max_turns: int | None
     max_active_conversations: int
     verbose: bool
     print_content: bool
     verify_output: bool
     conversation_sampling: ConversationSampling
     request_rate: float
+    max_retries: int
 
 
 class RequestArgs(NamedTuple):
@@ -63,6 +64,7 @@ class RequestArgs(NamedTuple):
     stream: bool
     limit_min_tokens: int  # Use negative value for no limit
     limit_max_tokens: int  # Use negative value for no limit
+    timeout_sec: int
 
 
 class BenchmarkArgs(NamedTuple):
@@ -109,9 +111,9 @@ class RequestStats(NamedTuple):
 
 class MetricStats:
     def __init__(self) -> None:
-        self.min: Optional[float] = None
-        self.max: Optional[float] = None
-        self.avg: Optional[float] = None
+        self.min: float | None = None
+        self.max: float | None = None
+        self.avg: float | None = None
         self.sum = 0.0
         self.count = 0
 
@@ -143,7 +145,7 @@ class MovingAverage:
         self.index = 0
         self.sum = 0.0
         self.count = 0
-        self.avg: Optional[float] = None
+        self.avg: float | None = None
 
     def update(self, new_value: float) -> None:
         if self.count < self.window_size:
@@ -169,7 +171,7 @@ class MovingAverage:
 class DebugStats:
     def __init__(self, logger: logging.Logger, window_size: int) -> None:
         self.logger = logger
-        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+        self.metrics: dict[str, MovingAverage | MetricStats] = {
             "moving_avg_ttft_ms": MovingAverage(window_size),
             "moving_avg_tpot_ms": MovingAverage(window_size),
             "ttft_ms": MetricStats(),
@@ -198,14 +200,6 @@ class DebugStats:
         self.logger.info("-" * 50)
 
 
-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix) :]
-    return text
-
-
 def nanosec_to_millisec(value: float) -> float:
     return value / 1000000.0
 
@@ -220,8 +214,9 @@ async def send_request(
     chat_url: str,
     model: str,
     stream: bool = True,
-    min_tokens: Optional[int] = None,
-    max_tokens: Optional[int] = None,
+    min_tokens: int | None = None,
+    max_tokens: int | None = None,
+    timeout_sec: int = 120,
 ) -> ServerResponse:
     payload = {
         "model": model,
@@ -243,16 +238,22 @@ async def send_request(
     headers = {"Content-Type": "application/json"}
 
     # Calculate the timeout for the request
-    timeout_sec = 120
     if max_tokens is not None:
         # Assume TPOT of 200ms and use max_tokens to determine timeout
-        timeout_sec = max(timeout_sec, int(max_tokens * 0.2))
+        token_based_timeout = int(max_tokens * 0.2)
+        if token_based_timeout > timeout_sec:
+            timeout_sec = token_based_timeout
+            logger.info(
+                "Using timeout of %ds based on max_tokens %d",
+                timeout_sec,
+                max_tokens,
+            )
     timeout = aiohttp.ClientTimeout(total=timeout_sec)
 
     valid_response = True
-    ttft: Optional[float] = None
+    ttft: float | None = None
     chunk_delay: list[int] = []
-    latency: Optional[float] = None
+    latency: float | None = None
     first_chunk = ""
     generated_text = ""
 
@@ -269,7 +270,7 @@ async def send_request(
                 if not chunk_bytes:
                     continue
 
-                chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                 if chunk == "[DONE]":
                     # End of stream
                     latency = time.perf_counter_ns() - start_time
@@ -364,7 +365,7 @@ async def send_turn(
     req_args: RequestArgs,
     verbose: bool,
     verify_output: bool,
-) -> Optional[RequestStats]:
+) -> RequestStats | None:
     assert messages_to_use > 0
     assert messages_to_use <= len(conversation_messages)
 
@@ -417,6 +418,7 @@ async def send_turn(
         req_args.stream,
         min_tokens,
         max_tokens,
+        req_args.timeout_sec,
     )
 
     if response.valid is False:
@@ -526,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
     await asyncio.sleep(interval)
 
 
+async def exponential_backoff_sleep(
+    attempt_cnt: int,
+    base_rate: float = 1.0,
+    backoff_factor: float = 2.0,
+    jitter_fraction: float = 0.10,
+    verbose: bool = False,
+) -> None:
+    # Sleep with exponential backoff and jitter after a failed request.
+    backoff_delay = base_rate * (backoff_factor**attempt_cnt)
+    jittered_delay = backoff_delay * (
+        1 + np.random.uniform(-jitter_fraction, jitter_fraction)
+    )
+
+    if verbose:
+        logger.info(f"Backoff for {jittered_delay:.3f} seconds...")
+
+    await asyncio.sleep(jittered_delay)
+
+
 async def client_main(
     args: ClientArgs,
     req_args: RequestArgs,
@@ -540,8 +561,11 @@ async def client_main(
         f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}"  # noqa: E501
     )
 
-    random.seed(args.seed)
-    np.random.seed(args.seed)
+    # Set unique seed per client (each client runs in its own process)
+    # Add 1 to ensure no client uses the same seed as the main process
+    client_seed = args.seed + client_id + 1
+    random.seed(client_seed)
+    np.random.seed(client_seed)
 
     # Active conversations
     active_convs: ConversationsMap = {}
@@ -644,7 +668,7 @@ async def client_main(
 
             if args.verbose:
                 curr_time_sec: float = time.perf_counter()
-                time_since_last_turn: Union[str, float] = "N/A"
+                time_since_last_turn: str | float = "N/A"
                 if conv_id in time_of_last_turn:
                     time_since_last_turn = round(
                         curr_time_sec - time_of_last_turn[conv_id], 3
@@ -654,49 +678,62 @@ async def client_main(
                 )
                 time_of_last_turn[conv_id] = curr_time_sec
 
-            success = True
-            try:
-                result = await send_turn(
-                    session,
-                    client_id,
-                    conv_id,
-                    messages,
-                    current_turn,
-                    tokenizer,
-                    req_args,
-                    args.print_content,
-                    args.verify_output,
-                )
-                if result is not None:
-                    result_queue.put(result)
-                else:
-                    # None means that the request failed,
-                    # and should not be added to the statistics.
-                    success = False
-                    num_failures += 1
-
-                    logger.warning(
-                        f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
+            success = False
+            for attempt_cnt in range(args.max_retries + 1):
+                try:
+                    exception = False
+                    result = await send_turn(
+                        session,
+                        client_id,
+                        conv_id,
+                        messages,
+                        current_turn,
+                        tokenizer,
+                        req_args,
+                        args.print_content,
+                        args.verify_output,
+                    )
+                    if result is not None:
+                        result_queue.put(result)
+                        success = True
+                        break
+                    else:
+                        logger.warning(
+                            f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
+                        )
+                except asyncio.exceptions.TimeoutError:
+                    exception = True
+                    logger.error(
+                        "%sClient %d - Timeout during conversation ID %s (turn: %d). "
+                        "Base timeout is %ss (set with --request-timeout-sec), but the "
+                        "effective timeout may be longer based on max_tokens. If this "
+                        "is unexpected, consider increasing the timeout or checking "
+                        "model performance.%s",
+                        Color.RED,
+                        client_id,
+                        conv_id,
+                        current_turn,
+                        req_args.timeout_sec,
+                        Color.RESET,
+                    )
+                except Exception:
+                    exception = True
+                    logger.exception(
+                        f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
                     )
 
-                    # Remove the conversation (should not be used again)
-                    active_convs.pop(conv_id)
-
-            except asyncio.exceptions.TimeoutError:
-                num_failures += 1
-                logger.exception(
-                    f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                )
-                break  # Exit gracefully instead of raising an error
+                # Sleep before retry if not last attempt
+                if not success and attempt_cnt < args.max_retries:
+                    await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)
 
-            except Exception:
+            if not success:
                 num_failures += 1
-                logger.exception(
-                    f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                )
-                break  # Exit gracefully instead of raising an error
+                # Remove the conversation (should not be used again)
+                active_convs.pop(conv_id)
+                if exception:
+                    break  # Exit gracefully instead of raising an error
 
-            if success:
+            else:
                 num_successes += 1
 
                 # Update the turns counter to include the LLM response
@@ -769,7 +806,7 @@ def get_client_config(
             "Number of conversations must be equal or larger than the number of clients"
         )
 
-    max_req_per_client: Optional[int] = None
+    max_req_per_client: int | None = None
     if args.max_num_requests is not None:
         # Max number of requests per client
         req_per_client = args.max_num_requests // args.num_clients
@@ -811,6 +848,7 @@ def get_client_config(
         verify_output=args.verify_output,
         conversation_sampling=args.conversation_sampling,
         request_rate=args.request_rate,
+        max_retries=args.max_retries,
     )
 
     if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
@@ -823,6 +861,9 @@ def get_client_config(
                 "Invalid min/max tokens limits (min should not be larger than max)"
             )
 
+    if args.request_timeout_sec <= 0:
+        raise ValueError("Request timeout must be a positive number")
+
     # Arguments for API requests
     chat_url = f"{args.url}/v1/chat/completions"
     model_name = args.served_model_name if args.served_model_name else args.model
@@ -833,6 +874,7 @@ def get_client_config(
         stream=not args.no_stream,
         limit_min_tokens=args.limit_min_tokens,
         limit_max_tokens=args.limit_max_tokens,
+        timeout_sec=args.request_timeout_sec,
     )
 
     return client_args, req_args
@@ -936,13 +978,13 @@ async def main_mp(
                     f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
                 )
 
-                rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+                rps: str | float = round(len(client_metrics) / runtime_sec, 3)
                 if len(client_metrics) < (5 * bench_args.num_clients):
                     # Do not estimate the RPS if the number of samples is very low
                     # (threshold can be tuned if needed)
                     rps = "N/A"
 
-                runtime_left_sec: Union[str, float] = round(
+                runtime_left_sec: str | float = round(
                     (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
                 )
                 if percent < 0.05:
@@ -976,7 +1018,7 @@ async def main_mp(
             f"(is alive: {client.is_alive()}){Color.RESET}"
         )
 
-        client.join(timeout=120)
+        client.join(timeout=req_args.timeout_sec + 1)
 
         if client.is_alive():
             logger.warning(
@@ -1032,7 +1074,7 @@ def process_statistics(
     warmup_percentages: list[float],
     test_params: dict,
     verbose: bool,
-    gen_conv_args: Optional[GenConvArgs] = None,
+    gen_conv_args: GenConvArgs | None = None,
     excel_output: bool = False,
 ) -> None:
     if len(client_metrics) == 0:
@@ -1259,7 +1301,7 @@ async def main() -> None:
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )
 
     parser.add_argument(
@@ -1342,6 +1384,16 @@ async def main() -> None:
         help="Expected request rate (Poisson process) per client in requests/sec."
         "Set to 0 for no delay between requests.",
     )
+    parser.add_argument(
+        "--max-retries",
+        type=int,
+        default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
+        help="Maximum number of retry attempts for timed-out requests. "
+        "Default is 0 (no retries). "
+        "Set to higher values to retry failed requests and maintain "
+        "fair workload distribution. "
+        "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
+    )
     parser.add_argument(
         "--conversation-sampling",
         type=ConversationSampling,
@@ -1359,6 +1411,13 @@ async def main() -> None:
         action="store_true",
         help="Verify the LLM output (compare to the answers in the input JSON file)",
     )
+    parser.add_argument(
+        "--request-timeout-sec",
+        type=int,
+        default=120,
+        help="Timeout in seconds for each API request (default: 120). "
+        "Automatically increased if max tokens imply longer decoding.",
+    )
 
     parser.add_argument(
         "--no-stream",
@@ -1434,11 +1493,10 @@ async def main() -> None:
             f"Invalid --warmup-percentage={args.warmup_percentage}"
         ) from None
 
+    # Set global seeds for main process
     random.seed(args.seed)
     np.random.seed(args.seed)
 
-    if not os.path.exists(args.model):
-        raise OSError(f"Path does not exist: {args.model}")
     logger.info("Loading tokenizer")
     tokenizer = AutoTokenizer.from_pretrained(args.model)
 
diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
index c3622c99a2e5361c79110cdd3ca7b5bff4d78f22..fccab4d0ce21ad69980736710eee7e9814485974 100644
--- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py
+++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any
 
 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
 
 
 def content_is_valid(
-    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+    content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
     if min_content_len and len(content) < min_content_len:
         return False
@@ -37,7 +37,7 @@ def content_is_valid(
 
 
 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
     # Collect statistics
     stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
     seed: int,
     input_file: str,
     output_file: str,
-    max_items: Optional[int],
-    min_content_len: Optional[int] = None,
-    max_content_len: Optional[int] = None,
-    min_turns: Optional[int] = None,
-    max_turns: Optional[int] = None,
-    model: Optional[str] = None,
+    max_items: int | None,
+    min_content_len: int | None = None,
+    max_content_len: int | None = None,
+    min_turns: int | None = None,
+    max_turns: int | None = None,
+    model: str | None = None,
 ) -> None:
     if min_turns and max_turns:
         assert min_turns <= max_turns
diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt
index f0e1935914a14b72167a6c32f93702530376d0af..bae656a5c5c4bd4f2bb544b32138cd31fd692c87 100644
--- a/benchmarks/multi_turn/requirements.txt
+++ b/benchmarks/multi_turn/requirements.txt
@@ -2,4 +2,5 @@ numpy>=1.24
 pandas>=2.0.0
 aiohttp>=3.10
 transformers>=4.46
-xlsxwriter>=3.2.1
\ No newline at end of file
+xlsxwriter>=3.2.1
+tqdm>=4.66
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index 0957a9c65f06c912ae73588e017602af72ae9772..178599952d5c4e81145c73676b8ab0d4eaef6aa6 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -5,7 +5,7 @@ import cProfile
 import pstats
 
 from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
deleted file mode 100644
index 65b1e09a247e2d74e31b616c38a8544de75ad525..0000000000000000000000000000000000000000
--- a/benchmarks/pyproject.toml
+++ /dev/null
@@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
\ No newline at end of file
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index e6d0012c1a4be6d4a17708f1061e4bb278f68bf1..aa84125818d10d98194e50de7f3c09809093c5fa 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -15,6 +15,7 @@ endif()
 #
 set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
 set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
+set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
 
 include_directories("${CMAKE_SOURCE_DIR}/csrc")
 
@@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
         set(ENABLE_AVX512VNNI OFF)
         message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
     endif()
+
+    find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
+    if (AMXBF16_FOUND OR ENABLE_AMXBF16)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+            list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
+            set(ENABLE_AMXBF16 ON)
+            add_compile_definitions(-DCPU_CAPABILITY_AMXBF16)
+        else()
+            set(ENABLE_AMXBF16 OFF)
+            message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
+        endif()
+    else()
+        set(ENABLE_AMXBF16 OFF)
+        message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
+    endif()
     
 elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
@@ -188,31 +205,115 @@ else()
     message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()
 
-#
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
-    set(USE_ACL ON)
-else()
-    set(USE_ACL OFF)
-endif()
 
+# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
+    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
+    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+    set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
+    if(ASIMD_FOUND)
+        # Set number of parallel build processes
+        include(ProcessorCount)
+        ProcessorCount(NPROC)
+        if(NOT NPROC)
+            set(NPROC 4)
+        endif()
+        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+        # and create a local shim dir with it
+        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+        find_library(OPEN_MP
+            NAMES gomp
+            PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
+            NO_DEFAULT_PATH
+            REQUIRED
+        )
+        # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
+        if (OPEN_MP)
+            set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+        endif()
+
+        # Fetch and populate ACL
+        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
+            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
+        else()
+            message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
+            FetchContent_Populate(arm_compute
+                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
+                SOURCE_DIR   "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
+                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
+                GIT_TAG        v52.6.0
+                GIT_SHALLOW    TRUE
+                GIT_PROGRESS   TRUE
+            )
+            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+            set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
+        endif()
 
-    if(USE_ACL)
-        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-        if(NOT ARM_COMPUTE_LIBRARY)
-            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
+        # Build ACL with CMake
+        set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+        set(CMAKE_BUILD_TYPE "Release")
+        set(ARM_COMPUTE_ARCH "armv8.2-a")
+        set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+        set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+        set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+        set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+        set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+        set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+        set(ARM_COMPUTE_BUILD_TESTING "OFF")
+
+        set(_cmake_config_cmd
+             ${CMAKE_COMMAND} -G Ninja -B build 
+            -DARM_COMPUTE_BUILD_SHARED_LIB=OFF 
+            -DCMAKE_BUILD_TYPE=Release 
+            -DARM_COMPUTE_ARCH=armv8.2-a 
+            -DARM_COMPUTE_ENABLE_ASSERTS=OFF 
+            -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF 
+            -DARM_COMPUTE_ENABLE_OPENMP=ON 
+            -DARM_COMPUTE_ENABLE_WERROR=OFF 
+            -DARM_COMPUTE_BUILD_EXAMPLES=OFF 
+            -DARM_COMPUTE_BUILD_TESTING=OFF)
+        set(_cmake_build_cmd
+            ${CMAKE_COMMAND} --build build -- -j${NPROC}
+        )
+
+        execute_process(
+            COMMAND ${_cmake_config_cmd}
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+        )
+        execute_process(
+            COMMAND ${_cmake_build_cmd}
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+            RESULT_VARIABLE _acl_rc
+        )
+
+        if(NOT _acl_rc EQUAL 0)
+            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
         endif()
-        set(ONEDNN_AARCH64_USE_ACL "ON")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        message(STATUS "Arm Compute Library (ACL) built successfully.")
+
+        # VLLM/oneDNN settings for ACL
+        set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
+        add_compile_definitions(VLLM_USE_ACL)
+    endif()
+
+    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+        message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+        FetchContent_Declare(
+            oneDNN
+            SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+        )
+    else()
+        message(STATUS "Downloading oneDNN from GitHub")
+        FetchContent_Declare(
+            oneDNN
+            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+            GIT_TAG v3.10
+            GIT_PROGRESS TRUE
+            GIT_SHALLOW TRUE
+        )
     endif()
 
     set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -229,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     set(ONEDNN_VERBOSE "OFF")
     set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
+    set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+    set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
     FetchContent_MakeAvailable(oneDNN)
+    set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
     add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
     target_include_directories(
         dnnl_ext
@@ -259,14 +363,14 @@ endif()
 #
 set(VLLM_EXT_SRC
     "csrc/cpu/activation.cpp"
-    "csrc/cpu/attention.cpp"
-    "csrc/cpu/cache.cpp"
     "csrc/cpu/utils.cpp"
     "csrc/cpu/layernorm.cpp"
     "csrc/cpu/mla_decode.cpp"
     "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp"
-    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
+    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
+    "csrc/cpu/cpu_attn.cpp"
+    "csrc/cpu/scratchpad_manager.cpp"
+    "csrc/cpu/torch_bindings.cpp")
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
@@ -297,7 +401,7 @@ message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
 # Define extension targets
 #
 
-define_gpu_extension_target(
+define_extension_target(
     _C
     DESTINATION vllm
     LANGUAGE CXX
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index c9e7aec880b99ab2f4e785b0e94c7167fd820e15..2cf3c1a755d3c0b86623121eefb481644280c8a3 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -19,7 +19,7 @@ else()
   FetchContent_Declare(
         flashmla
         GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
         GIT_PROGRESS TRUE
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""
@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
         ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
     )
 
     set(FlashMLA_INCLUDES
@@ -91,7 +92,7 @@ if(FLASH_MLA_ARCHS)
         SRCS "${FlashMLA_Extension_SOURCES}"
         CUDA_ARCHS "${FLASH_MLA_ARCHS}")
 
-    define_gpu_extension_target(
+    define_extension_target(
         _flashmla_C
         DESTINATION vllm
         LANGUAGE ${VLLM_GPU_LANG}
@@ -108,7 +109,7 @@ if(FLASH_MLA_ARCHS)
         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 
-    define_gpu_extension_target(
+    define_extension_target(
         _flashmla_extension_C
         DESTINATION vllm
         LANGUAGE ${VLLM_GPU_LANG}
diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..5a59a409999ad5d673eea5b3a67648bba2c43c5e
--- /dev/null
+++ b/cmake/external_projects/qutlass.cmake
@@ -0,0 +1,97 @@
+include(FetchContent)
+
+set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
+
+if(DEFINED ENV{QUTLASS_SRC_DIR})
+  set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
+endif()
+
+if(QUTLASS_SRC_DIR)
+  FetchContent_Declare(
+    qutlass
+    SOURCE_DIR ${QUTLASS_SRC_DIR}
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+  )
+else()
+  FetchContent_Declare(
+    qutlass
+    GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
+    GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
+    GIT_PROGRESS TRUE
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+  )
+endif()
+
+FetchContent_Populate(qutlass)
+
+if(NOT qutlass_SOURCE_DIR)
+  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
+endif()
+message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
+
+cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+
+  if(QUTLASS_ARCHS MATCHES "10\\.0a")
+    set(QUTLASS_TARGET_CC 100)
+  elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
+    set(QUTLASS_TARGET_CC 120)
+  else()
+    message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
+  endif()
+
+  set(QUTLASS_SOURCES
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
+  )
+
+  set(QUTLASS_INCLUDES
+    ${qutlass_SOURCE_DIR}
+    ${qutlass_SOURCE_DIR}/qutlass
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/include
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
+  )
+
+  if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
+    list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
+  elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
+    list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
+    message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
+  else()
+    message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
+                        "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
+  endif()
+
+  set_gencode_flags_for_srcs(
+    SRCS "${QUTLASS_SOURCES}"
+    CUDA_ARCHS "${QUTLASS_ARCHS}"
+  )
+
+  target_sources(_C PRIVATE ${QUTLASS_SOURCES})
+  target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
+  target_compile_definitions(_C PRIVATE
+    QUTLASS_DISABLE_PYBIND=1
+    TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
+  )
+
+  set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
+  )
+
+else()
+  if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
+    message(STATUS
+      "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
+  else()
+    message(STATUS
+      "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
+      "CUDA_ARCHS='${CUDA_ARCHS}'.")
+  endif()
+endif()
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 3d32121f13ac2709924cd23e47256e6a1bf820a6..567c8959f0454f7a0f6e0fd11b119acce11165f5 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
+          GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/hipify.py b/cmake/hipify.py
index 55d378f5b11137739d21beb6d80df672040f66a4..8504f9defee96bcd1d3f6eb2698c78061b41bcec 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -16,7 +16,7 @@ import shutil
 
 from torch.utils.hipify.hipify_python import hipify
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
     # Project directory where all the source + include files live.
@@ -34,15 +34,14 @@ if __name__ == '__main__':
     )
 
     # Source files to convert.
-    parser.add_argument("sources",
-                        help="Source files to hipify.",
-                        nargs="*",
-                        default=[])
+    parser.add_argument(
+        "sources", help="Source files to hipify.", nargs="*", default=[]
+    )
 
     args = parser.parse_args()
 
     # Limit include scope to project_dir only
-    includes = [os.path.join(args.project_dir, '*')]
+    includes = [os.path.join(args.project_dir, "*")]
 
     # Get absolute path for all source files.
     extra_files = [os.path.abspath(s) for s in args.sources]
@@ -51,25 +50,31 @@ if __name__ == '__main__':
     # The directory might already exist to hold object files so we ignore that.
     shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
 
-    hipify_result = hipify(project_directory=args.project_dir,
-                           output_directory=args.output_dir,
-                           header_include_dirs=[],
-                           includes=includes,
-                           extra_files=extra_files,
-                           show_detailed=True,
-                           is_pytorch_extension=True,
-                           hipify_extra_files_only=True)
+    hipify_result = hipify(
+        project_directory=args.project_dir,
+        output_directory=args.output_dir,
+        header_include_dirs=[],
+        includes=includes,
+        extra_files=extra_files,
+        show_detailed=True,
+        is_pytorch_extension=True,
+        hipify_extra_files_only=True,
+    )
 
     hipified_sources = []
     for source in args.sources:
         s_abs = os.path.abspath(source)
-        hipified_s_abs = (hipify_result[s_abs].hipified_path if
-                          (s_abs in hipify_result
-                           and hipify_result[s_abs].hipified_path is not None)
-                          else s_abs)
+        hipified_s_abs = (
+            hipify_result[s_abs].hipified_path
+            if (
+                s_abs in hipify_result
+                and hipify_result[s_abs].hipified_path is not None
+            )
+            else s_abs
+        )
         hipified_sources.append(hipified_s_abs)
 
-    assert (len(hipified_sources) == len(args.sources))
+    assert len(hipified_sources) == len(args.sources)
 
     # Print hipified source files.
     print("\n".join(hipified_sources))
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 10170c836594d78391c3a5f40e53b879a06a3544..7d62391469341ced1a0522161cdedbd25f1163e4 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -130,6 +130,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
   set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()
 
+# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
+#   libgomp.so    -> libgomp-<hash>.so...
+#   libgomp.so.1  -> libgomp-<hash>.so...
+# OUTPUT: TORCH_GOMP_SHIM_DIR  ("" if not found)
+function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
+  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
+
+  # Use run_python to locate vendored libgomp; never throw on failure.
+  run_python(_VLLM_TORCH_GOMP_PATH
+    "
+import os, glob
+try:
+  import torch
+  torch_pkg = os.path.dirname(torch.__file__)
+  site_root = os.path.dirname(torch_pkg)
+  torch_libs = os.path.join(site_root, 'torch.libs')
+  print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except:
+  print('')
+"
+    "failed to probe torch.libs for libgomp")
+
+  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
+    return()
+  endif()
+
+  # Create shim under the build tree
+  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
+  file(MAKE_DIRECTORY "${_shim}")
+
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
+
+  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
+endfunction()
+
 # Macro for converting a `gencode` version number to a cmake version number.
 macro(string_to_ver OUT_VER IN_STR)
   string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
@@ -311,13 +349,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
   list(REMOVE_DUPLICATES _PTX_ARCHS)
   list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
 
-  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
-  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
+  # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
   set(_CUDA_ARCHS)
   foreach(_arch ${_SRC_CUDA_ARCHS})
-    if(_arch MATCHES "\\a$")
+    if(_arch MATCHES "[af]$")
       list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      string(REPLACE "a" "" _base "${_arch}")
+      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
       if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
         list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
         list(APPEND _CUDA_ARCHS "${_arch}")
@@ -416,21 +454,20 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 endmacro()
 
 #
-# Define a target named `GPU_MOD_NAME` for a single extension. The
+# Define a target named `MOD_NAME` for a single extension. The
 # arguments are:
 #
 # DESTINATION <dest>         - Module destination directory.
-# LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
-#                              etc.
+# LANGUAGE <lang>            - The language for this module, e.g. CUDA, HIP,
+#                              CXX, etc.
 # SOURCES <sources>          - List of source files relative to CMakeLists.txt
 #                              directory.
 #
 # Optional arguments:
 #
-# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
-#                              format.
-#                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
-#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
+# ARCHITECTURES <arches>     - A list of target architectures in cmake format.
+#                              For GPU, refer to CMAKE_CUDA_ARCHITECTURES and
+#                              CMAKE_HIP_ARCHITECTURES for more info.
 #                              ARCHITECTURES will use cmake's defaults if
 #                              not provided.
 # COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
@@ -441,63 +478,61 @@ endmacro()
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
-function (define_gpu_extension_target GPU_MOD_NAME)
+function (define_extension_target MOD_NAME)
   cmake_parse_arguments(PARSE_ARGV 1
-    GPU
+    ARG
     "WITH_SOABI"
     "DESTINATION;LANGUAGE;USE_SABI"
     "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
 
   # Add hipify preprocessing step when building with HIP/ROCm.
-  if (GPU_LANGUAGE STREQUAL "HIP")
-    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  if (ARG_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(ARG_SOURCES ${MOD_NAME} "${ARG_SOURCES}")
   endif()
 
-  if (GPU_WITH_SOABI)
-    set(GPU_WITH_SOABI WITH_SOABI)
+  if (ARG_WITH_SOABI)
+    set(SOABI_KEYWORD WITH_SOABI)
   else()
-    set(GPU_WITH_SOABI)
+    set(SOABI_KEYWORD "")
   endif()
 
-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  if (ARG_USE_SABI)
+    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
   else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
   endif()
 
-  if (GPU_LANGUAGE STREQUAL "HIP")
+  if (ARG_LANGUAGE STREQUAL "HIP")
     # Make this target dependent on the hipify preprocessor step.
-    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+    add_dependencies(${MOD_NAME} hipify${MOD_NAME})
     # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
-    target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
+      ${ARG_INCLUDE_DIRECTORIES})
   else()
-    target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE csrc
+      ${ARG_INCLUDE_DIRECTORIES})
   endif()
 
-  if (GPU_ARCHITECTURES)
-    set_target_properties(${GPU_MOD_NAME} PROPERTIES
-      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  if (ARG_ARCHITECTURES)
+    set_target_properties(${MOD_NAME} PROPERTIES
+      ${ARG_LANGUAGE}_ARCHITECTURES "${ARG_ARCHITECTURES}")
   endif()
 
+  target_compile_options(${MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${ARG_LANGUAGE}>:${ARG_COMPILE_FLAGS}>)
 
-  target_compile_options(${GPU_MOD_NAME} PRIVATE
-    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
-
-  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
-    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
+  target_compile_definitions(${MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${MOD_NAME}")
 
-
-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+  target_link_libraries(${MOD_NAME} PRIVATE torch ${ARG_LIBRARIES})
 
   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
-  if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
+  if (ARG_LANGUAGE STREQUAL "CUDA")
+    target_link_libraries(${MOD_NAME} PRIVATE torch CUDA::cudart CUDA::cuda_driver ${ARG_LIBRARIES})
   else()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(${MOD_NAME} PRIVATE torch ${TORCH_LIBRARIES} ${ARG_LIBRARIES})
   endif()
 
-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
-endfunction()
\ No newline at end of file
+  install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${ARG_DESTINATION} COMPONENT ${MOD_NAME})
+endfunction()
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000000000000000000000000000000000000..304c0be8105fc2ee3596efd937d505ed9ba7d354
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,12 @@
+codecov:
+  require_ci_to_pass: false
+
+fixes:
+  # Map source code paths to repository root paths
+  # Wildcards match any Python version (python3.*)
+  - "/vllm-workspace/src/vllm/::vllm/"
+  - "/vllm-workspace/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh
index abd38d61f8bf851ca7f151c035a101803d23c03a..35b7a56c55a15186b9276c16a7f9e8c73e5c65a6 100644
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -28,10 +28,10 @@
 
 #ifdef USE_ROCM
   #include <hip/hip_bf16.h>
-  #include "../quantization/fp8/amd/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-  #include "../quantization/fp8/nvidia/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 6bee9e4ce1166cd922b4979d9c9a594ef8e75c44..229d9862fb670338da17a762e9b25be6a389745b 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -46,6 +46,32 @@ __global__ void merge_attn_states_kernel(
   s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
 
   const float max_lse = fmaxf(p_lse, s_lse);
+
+  /* In certain edge cases, MLA can produce p_lse = s_lse = -inf;
+     continuing the pipeline then yields NaN. Root cause: with chunked prefill
+     a batch may be split into two chunks; if a request in that batch has no
+     prefix hit, every LSE entry for that request’s position is -inf, and at
+     this moment we merge cross-attention at first. For now we simply emit
+     prefix_output (expected to be all zeros) and prefix_lse (-inf) to fix
+     this problem.
+  */
+  if (std::isinf(max_lse)) {
+    if (pack_offset < head_size) {
+      // Pack 128b load
+      pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
+          prefix_head_ptr)[pack_offset / pack_size];
+
+      // Pack 128b storage
+      reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
+          p_out_pack;
+    }
+    // We only need to write to output_lse once per head.
+    if (output_lse != nullptr && pack_idx == 0) {
+      output_lse[head_idx * num_tokens + token_idx] = max_lse;
+    }
+    return;
+  }
+
   p_lse = p_lse - max_lse;
   s_lse = s_lse - max_lse;
   const float p_se = expf(p_lse);
diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
index 297d94dcc06314320f4c8ce3f32614a3a76d76f0..2d4b4a67d242168dc36d4da63f56a50bc36cd9c2 100644
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@@ -125,32 +125,37 @@ public:
   }
 
   static void set_split_kv (KernelArguments& args) {
-    // printf("set_split_kv start");
     if (args.split_kv >= 1) return;
     auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";      
     int sm_count = args.hw_info.sm_count;
-    // printf("    sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
-
-    // TODO: This avoids a hang when the batch size larger than 1 and 
-    // there is more than 1 kv_splits. 
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(1, max_splits);
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    int max_splits = 1;
+
+    if (B <= 4 && seq_length_k >= 16) {
+      max_splits = 16;
+    }
+    else if (B <= 8 && seq_length_k >= 4) {
+      max_splits = 8;
+    }
+    else if ((B <= 16 && seq_length_k >= 8) ||
+             (B == 48 && seq_length_k >= 32)) {
+      max_splits = 4;
+    }
+    else if ((B <= 32 && seq_length_k >= 16) ||
+             (B == 96 && seq_length_k >= 16)) {
+      max_splits = 2;
     }
-    
-    // printf("    max_splits = %d\n", max_splits);
+    else {
+      max_splits = 1;
+    }
+
+    // Wave-aware scheduling: ensure integer number of waves in K dimension
     int sms_per_batch = max(1, sm_count / B);
-    // printf("    sms_per_batch = %d\n", sms_per_batch);
     int split_heur = min(max_splits, sms_per_batch);
     int waves = ceil_div(B * split_heur, sm_count);
     int k_waves = ceil_div(max_splits, split_heur);
     int split_wave_aware = ceil_div(max_splits, k_waves);
     args.split_kv = split_wave_aware;
-    // printf("    args.split_kv = %d\n", args.split_kv);
-
   }
 
   /// Determines whether the GEMM can execute the given problem.
diff --git a/csrc/cache.h b/csrc/cache.h
index 427bd0d54fac5cded3659399b51c589b25a8e1b6..10e9bb25cf658b53fe4b0964b1ff8d3523c60fb6 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -64,3 +64,11 @@ void indexer_k_quant_and_cache(
     torch::Tensor& slot_mapping,  // [num_tokens]
     int64_t quant_block_size,     // quantization block size
     const std::string& scale_fmt);
+
+// Extract function to gather quantized K cache
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,   // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 1286f5806d4b64b2381d2c00483480b24815013c..10f31a999e3b40d21dfa6e506863e525111a9d36 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -9,16 +9,17 @@
 #include "quantization/vectorization_utils.cuh"
 
 #ifdef USE_ROCM
-  #include "quantization/fp8/amd/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
 #else
-  #include "quantization/fp8/nvidia/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif
 
 #include <algorithm>
 #include <cassert>
-#include <cfloat>  // FLT_MIN
-#include <map>
-#include <vector>
+
+#include <cfloat>
+// #include <map>
+// #include <vector>
 
 #ifdef USE_ROCM
   #include <hip/hip_bf16.h>
@@ -210,6 +211,20 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
 
 namespace vllm {
 
+// Used to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_kernel(
     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@@ -225,58 +240,50 @@ __global__ void reshape_and_cache_kernel(
   const int64_t token_idx = blockIdx.x;
   const int64_t slot_idx = slot_mapping[token_idx];
   if (slot_idx < 0) {
-    // Padding token that should be ignored.
     return;
   }
 
   const int64_t block_idx = slot_idx / block_size;
   const int64_t block_offset = slot_idx % block_size;
+  const int h_block_count = head_size / x;  // head_size//x
 
-  const int n = num_heads * head_size;
-  for (int i = threadIdx.x; i < n; i += blockDim.x) {
-    const int64_t src_key_idx = token_idx * key_stride + i;
-    const int64_t src_value_idx = token_idx * value_stride + i;
-
-    const int head_idx = i / head_size;
-    const int head_offset = i % head_size;
-    const int x_idx = head_offset / x;
-    const int x_offset = head_offset % x;
-
-    const int64_t tgt_key_idx =
-        block_idx * num_heads * (head_size / x) * block_size * x +
-        head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
-        block_offset * x + x_offset;
-    const int64_t tgt_value_idx =
-        block_idx * num_heads * head_size * block_size +
-        head_idx * head_size * block_size + head_offset * block_size +
-        block_offset;
-    scalar_t tgt_key = key[src_key_idx];
-    scalar_t tgt_value = value[src_value_idx];
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      key_cache[tgt_key_idx] = tgt_key;
-      value_cache[tgt_value_idx] = tgt_value;
-    } else {
-      key_cache[tgt_key_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
-      value_cache[tgt_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
-    }
+  const int h_block_idx = threadIdx.x;
+  if (h_block_idx >= num_heads * h_block_count) {
+    return;
   }
-}
 
-// Used by vectorization_utils to copy/convert one element
-template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
-struct CopyWithScaleOp {
-  float scale;
+  const int head_idx = h_block_idx / h_block_count;
+  const int h_block = h_block_idx % h_block_count;
 
-  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      dst = static_cast<OutT>(src);
-    } else {
-      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
-    }
+  const scalar_t* __restrict__ key_src =
+      key + token_idx * key_stride + head_idx * head_size + h_block * x;
+  const int64_t src_value_start =
+      token_idx * value_stride + head_idx * head_size + h_block * x;
+
+  cache_t* __restrict__ key_dst =
+      key_cache + block_idx * num_heads * h_block_count * block_size * x +
+      head_idx * h_block_count * block_size * x + h_block * block_size * x +
+      block_offset * x;
+  const int64_t tgt_value_start =
+      block_idx * num_heads * h_block_count * x * block_size +
+      head_idx * h_block_count * x * block_size + h_block * x * block_size +
+      block_offset;
+
+  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
+  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
+  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
+
+  vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
+
+  const scalar_t* __restrict__ value_src = value + src_value_start;
+  cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
+#pragma unroll
+  for (int i = 0; i < x; i++) {
+    v_op(value_dst[i * block_size], value_src[i]);
   }
-};
+}
 
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
@@ -424,84 +431,81 @@ __global__ void concat_and_cache_ds_mla_kernel(
   const int64_t dst_idx_start =
       block_idx * block_stride + block_offset * entry_stride;
 
-  // Create 4 tile scales in shared memory
-  __shared__ float smem[20];
-  float* shard_abs_max = smem;
-  float* tile_scales = smem + 16;
-
-  // For the NoPE part, each tile of 128 elements is handled by 4 warps
-  // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
-  // The first thread of the first warp in each tile writes the scale
-  // value for the tile. The RoPE part (last 64 elements) is handled
-  // by another 2 warps (64 threads).
-  // So in total, we use 18 warps (576 threads) per block.
+  // For the NoPE part, each tile of 128 elements is handled by half of one warp
+  // (16 threads). There are 4 total tiles, so 2 warps (64 threads).
+  // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
+  // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
+  // So in total, we use 3 warps (96 threads) per block.
 
   // Cast kv_cache to 16_bit for RoPE values
   scalar_t* kv_cache_16bit =
       reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
 
-  // The last 64 threads handle the RoPE part
-  if (threadIdx.x >= kv_lora_rank) {
-    const int8_t pe_idx = threadIdx.x - kv_lora_rank;
-    const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
+  // The last warp handles the RoPE part
+  if (threadIdx.x >= 64) {
+    // Each thread handles two elements of RoPE
+    const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
+    const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
+    // Vectorized load of two 16-bit values, performed as one 32-bit load
+    const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
     // RoPE values start after the packed 8-bit NoPE values and the
     // 32-bit scales
-    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
-    kv_cache_16bit[dst_idx] = k_pe[src_idx];
+    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
+    // Vectorized store of two 16-bit values, performed as one 32-bit store
+    *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
     return;
   }
 
-  // Determine the scale for each chunk of NoPE
-  const int16_t tile_idx = threadIdx.x >> 7;
-  const int16_t warp_idx = (threadIdx.x & 127) >> 5;
-  const int16_t lane_idx = threadIdx.x & 31;
-
-  // Load the NoPE element for this thread into registers
-  const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
-  const scalar_t src_val = kv_c[src_idx];
-
-  // Warp-level reduction to find the max absolute value in the warp
-  float max_abs = fabsf(src_val);
+  // The first two warps handle the NoPE part
+  const int8_t warp_idx = threadIdx.x >> 5;
+  const int8_t lane_idx = threadIdx.x & 31;
+  const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
+
+  // Each thread handles 8 elements of NoPE
+  // Load the NoPE elements for this thread into registers
+  const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
+  // Vectorized load of eight 16-bit values, performed as an int4 load
+  const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
+  const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
+
+  // Max absolute value of this thread's elements
+  float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
+                              fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
+                        fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
+                              fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
+
+  // Warp-level reduction to find the max absolute value in each half-warp
 #pragma unroll
-  for (int offset = 16; offset > 0; offset /= 2) {
-#ifdef USE_ROCM
-    max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
-#else
-    max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
-#endif
+  for (int offset = 8; offset > 0; offset /= 2) {
+    max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
   }
 
-  // The first lane of each warp in each tile writes the max_abs of this part
-  // of the tile to shared memory
-  if (lane_idx == 0) {
-    shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
-  }
-  __syncthreads();
-
-  // The first lane of the first warp in each tile computes the scale for the
-  // tile and writes it to shared memory and to kv_cache
-  if (warp_idx == 0 && lane_idx == 0) {
-    float4 shard_abs_max_vec =
-        reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
-    float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
-                             fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
-                       448.f;
-
-    // Avoid division by zero in `scaled_convert`
-    tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
+  // Compute the scale for the tile
+  float tile_scale = max_abs / 448.f;
+  tile_scale = fmaxf(tile_scale, FLT_MIN);
+
+  // The first lane of each half-warp writes the scale to kv_cache
+  if ((lane_idx == 0) || (lane_idx == 16)) {
     float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
     const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
-    kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
+    kv_cache_32bit[dst_idx] = tile_scale;
   }
 
-  __syncthreads();
+  // Now all threads in the block scale and write their elements
+  // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
+  const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
+
+  uint8_t result[8];
+#pragma unroll
+  for (int i = 0; i < 8; i++) {
+    result[i] =
+        fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
+            vals[i], tile_scale);
+  }
 
-  // Now all threads in the block scale and write their element
-  const float scale_val = tile_scales[tile_idx];
-  const int64_t dst_idx = dst_idx_start + threadIdx.x;
-  kv_cache[dst_idx] =
-      fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
-          src_val, scale_val);
+  // Store as aligned 64-bit writes
+  *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
+      *reinterpret_cast<const uint64_t*>(result);
 }
 
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@@ -571,6 +575,70 @@ __global__ void indexer_k_quant_and_cache_kernel(
   }
 }
 
+template <int BLOCK_Y_SIZE>
+__global__ void cp_gather_indexer_k_quant_cache_kernel(
+    const char* __restrict__ kv_cache,  // [num_blocks, block_size,
+                                        // cache_stride]
+    char* __restrict__ dst_k,           // [num_tokens, head_dim]
+    char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size *
+                                   // 4]
+    const int* __restrict__ block_table,  // [batch_size, num_blocks]
+    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
+    const int batch_size,                 // batch size
+    const int64_t token_stride,           // stride for each token in dst_k
+    const int64_t head_dim,               // dimension of each head
+    const int64_t block_stride,           // stride for each block in kv_cache
+    const int64_t cache_token_stride,     // stride for each token in kv_cache
+    const int64_t cache_block_size,  // num_tokens for each block in kv_cache
+    const int num_blocks,            // number of blocks
+    const int num_tokens,            // number of tokens
+    const int quant_block_size       // quantization block size
+) {
+  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
+  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
+  // Find batch index within a block
+  __shared__ int batch_idx[BLOCK_Y_SIZE];
+  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
+       iter++) {
+    int tid = iter * blockDim.x + threadIdx.x;
+    if (tid < batch_size) {
+      const int seq_start = cu_seq_lens[tid];
+      const int seq_end = cu_seq_lens[tid + 1];
+      if (token_idx >= seq_start && token_idx < seq_end) {
+        batch_idx[threadIdx.y] = tid;
+      }
+    }
+  }
+
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+
+  if (head_idx >= head_dim || token_idx >= num_tokens) {
+    return;
+  }
+  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
+  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
+                                    inbatch_seq_idx / cache_block_size];
+  const int64_t src_block_offset = block_idx * block_stride;
+  const int64_t cache_inblock_offset =
+      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
+  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
+  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
+
+  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
+      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
+  ;
+  if (threadIdx.x == 0) {
+    const int64_t src_scale_offset =
+        src_block_offset + cache_block_size * head_dim +
+        cache_inblock_offset * 4 / quant_block_size;
+    reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
+        reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
+  }
+}
+
 }  // namespace vllm
 
 // KV_T is the data type of key and value tensors.
@@ -606,9 +674,10 @@ void reshape_and_cache(
 
   int key_stride = key.stride(0);
   int value_stride = value.stride(0);
+  int head_div_x = head_size / x;
 
   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_size, 512));
+  dim3 block(std::min(num_heads * head_div_x, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
@@ -741,13 +810,12 @@ void concat_and_cache_mla(
 
   if (kv_cache_dtype == "fp8_ds_mla") {
     dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by 4 warps
-    // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
-    // The first thread of the first warp in each tile writes the scale
-    // value for the tile. The RoPE part (last 64 elements) is handled
-    // by another 2 warps (64 threads).
-    // So in total, we use 18 warps (576 threads) per block.
-    dim3 block(576);
+    // For the NoPE part, each tile of 128 elements is handled by half of one
+    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
+    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
+    // The RoPE part (last 64 elements) is handled by another 1 warp (32
+    // threads). So in total, we use 3 warps (96 threads) per block.
+    dim3 block(96);
     DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
                                CALL_CONCAT_AND_CACHE_DS_MLA);
   } else {
@@ -1172,3 +1240,59 @@ void indexer_k_quant_and_cache(
   DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
                              CALL_INDEXER_K_QUANT_AND_CACHE);
 }
+
+// Macro to dispatch the kernel based on the data amount.
+#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                  \
+  vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                \
+      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,               \
+              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),              \
+         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                               \
+          reinterpret_cast<char*>(kv_cache.data_ptr()),                     \
+          reinterpret_cast<char*>(dst_k.data_ptr()),                        \
+          reinterpret_cast<char*>(dst_scale.data_ptr()),                    \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
+          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),   \
+          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),        \
+          num_tokens, quant_block_size);
+
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,  // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
+) {
+  int batch_size = block_table.size(0);
+  int num_tokens = dst_k.size(0);
+  int head_dim = dst_k.size(1);
+  int quant_block_size = head_dim * 4 / dst_scale.size(1);
+
+  TORCH_CHECK(kv_cache.device() == dst_k.device(),
+              "kv_cache and dst_k must be on the same device");
+  TORCH_CHECK(kv_cache.device() == dst_scale.device(),
+              "kv_cache and dst_scale must be on the same device");
+  TORCH_CHECK(kv_cache.device() == block_table.device(),
+              "kv_cache and block_table must be on the same device");
+  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
+              "kv_cache and cu_seq_lens must be on the same device");
+  TORCH_CHECK(head_dim % quant_block_size == 0,
+              "head_dim must be divisible by quant_block_size");
+
+  constexpr int vec_size = 16;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (num_tokens < 32) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
+  } else if (num_tokens < 64) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
+  } else if (num_tokens < 128) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
+  } else if (num_tokens < 256) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
+  } else if (num_tokens < 512) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
+  } else {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
+  }
+}
diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fffe96b868575d17b9191ab610a2862ff00b0a43
--- /dev/null
+++ b/csrc/core/batch_invariant.hpp
@@ -0,0 +1,19 @@
+#pragma once
+#include <cstdlib>
+#include <string>
+#include <cctype>
+
+namespace vllm {
+
+// vllm_is_batch_invariant(); returns true
+// if env VLLM_BATCH_INVARIANT=1
+inline bool vllm_is_batch_invariant() {
+  static bool cached = []() {
+    std::string env_key = "VLLM_BATCH_INVARIANT";
+    const char* val = std::getenv(env_key.c_str());
+    return (val && std::atoi(val) != 0) ? 1 : 0;
+  }();
+  return cached;
+}
+
+}  // namespace vllm
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
deleted file mode 100644
index 82862fea7f2be78bb562bfce673b324b3ad97b3d..0000000000000000000000000000000000000000
--- a/csrc/cpu/attention.cpp
+++ /dev/null
@@ -1,798 +0,0 @@
-#include "cpu_types.hpp"
-
-namespace {
-
-template <typename scalar_t>
-struct KernelVecType {
-  using q_load_vec_type = void;
-  using q_vec_type = void;
-  using k_load_vec_type = void;
-  using k_vec_type = void;
-  using qk_acc_vec_type = void;
-  using v_load_vec_type = void;
-};
-
-template <>
-struct KernelVecType<float> {
-  using q_load_vec_type = vec_op::FP32Vec4;
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_load_vec_type = vec_op::FP32Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::FP32Vec16;
-};
-
-template <>
-struct KernelVecType<c10::Half> {
-#if defined(__powerpc64__) || defined(__s390x__)
-  // Power and s390x architecture-specific vector types
-  using q_load_vec_type = vec_op::FP32Vec8;
-  using k_load_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::FP32Vec16;
-#else
-  // Fallback for other architectures, including x86
-  using q_load_vec_type = vec_op::FP16Vec8;
-  using k_load_vec_type = vec_op::FP16Vec16;
-  using v_load_vec_type = vec_op::FP16Vec16;
-#endif
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-};
-
-#ifdef __AVX512BF16__
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using q_load_vec_type = vec_op::BF16Vec8;
-  using q_vec_type = vec_op::BF16Vec32;
-  using k_load_vec_type = vec_op::BF16Vec32;
-  using k_vec_type = vec_op::BF16Vec32;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::BF16Vec16;
-};
-#else
-  #ifdef __aarch64__
-    #ifndef ARM_BF16_SUPPORT
-    // pass
-    #else
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using q_load_vec_type = vec_op::BF16Vec8;
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_load_vec_type = vec_op::BF16Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::BF16Vec16;
-};
-    #endif
-  #else
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using q_load_vec_type = vec_op::BF16Vec8;
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_load_vec_type = vec_op::BF16Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::BF16Vec16;
-};
-  #endif
-#endif
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
-                                           const int capacity) {
-  T max = data[0];
-  for (int i = 1; i < size; ++i) {
-    max = max >= data[i] ? max : data[i];
-  }
-
-  T sum = 0;
-  for (int i = 0; i < size; ++i) {
-    data[i] = std::exp(data[i] - max);
-    sum += data[i];
-  }
-
-  int i = 0;
-  for (; i < size; ++i) {
-    data[i] /= sum;
-  }
-
-  for (; i < capacity; ++i) {
-    data[i] = 0;
-  }
-
-  return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
-                                                const int capacity,
-                                                const float alibi_slope,
-                                                const int start_index,
-                                                const int seq_len) {
-  data[0] += alibi_slope * (start_index - seq_len + 1);
-  T max = data[0];
-  for (int i = 1; i < size; ++i) {
-    T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
-    data[i] = qk;
-    max = max >= qk ? max : qk;
-  }
-
-  T sum = 0;
-  for (int i = 0; i < size; ++i) {
-    data[i] = std::exp(data[i] - max);
-    sum += data[i];
-  }
-
-  int i = 0;
-  for (; i < size; ++i) {
-    data[i] /= sum;
-  }
-
-  for (; i < capacity; ++i) {
-    data[i] = 0;
-  }
-
-  return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
-                                         const int size) {
-  T max = max_data[0];
-  for (int i = 1; i < size; ++i) {
-    max = max >= max_data[i] ? max : max_data[i];
-  }
-
-  T rescaled_sum = 0;
-  for (int i = 0; i < size; ++i) {
-    T rescale_factor = std::exp(max_data[i] - max);
-    rescaled_sum += rescale_factor * sum_data[i];
-    sum_data[i] *= rescale_factor;
-  }
-  for (int i = 0; i < size; ++i) {
-    sum_data[i] /= rescaled_sum + 1e-8;
-  }
-}
-
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
-struct reduceQKBlockKernel {
-  using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
-  using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
-  using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
-  using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
-  using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;
-
-  constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
-  constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
-  constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;
-
-  static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
-  static_assert(k_load_vec_type::get_elem_num() % x == 0);
-  static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
-
-  FORCE_INLINE static void call(const scalar_t* __restrict__ q,
-                                const scalar_t* __restrict__ k_block,
-                                float* __restrict__ logits, float scale,
-                                const int token_num) {
-    const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
-
-    qk_acc_vec_type group_accums[MAX_GROUP_NUM];
-    if (token_num == BLOCK_SIZE) {
-      for (int q_offset = 0; q_offset < HEAD_SIZE;
-           q_offset += x, k_block += x * BLOCK_SIZE) {
-        q_load_vec_type q_load_group_vec(q + q_offset);
-        q_vec_type q_group_vec(q_load_group_vec);
-
-        vec_op::unroll_loop<int, MAX_GROUP_NUM>(
-            [k_block, &q_group_vec, &group_accums](int token_group_idx) {
-              k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
-                                                             TOKEN_PER_GROUP);
-              k_vec_type k_group_vec(k_load_group_vec);
-              vec_op::fma(group_accums[token_group_idx], q_group_vec,
-                          k_group_vec);
-              vec_op::prefetch(k_block + x * BLOCK_SIZE +
-                               token_group_idx * x * TOKEN_PER_GROUP);
-            });
-      }
-    } else {
-      for (int q_offset = 0; q_offset < HEAD_SIZE;
-           q_offset += x, k_block += x * BLOCK_SIZE) {
-        q_load_vec_type q_load_group_vec(q + q_offset);
-        q_vec_type q_group_vec(q_load_group_vec);
-        for (int token_group_start = 0; token_group_start < group_num;
-             token_group_start += UNROLL_GROUP_NUM) {
-          vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
-              [token_group_start, k_block, &q_group_vec,
-               &group_accums](int token_group_idx) {
-                token_group_idx += token_group_start;
-                k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
-                                                               TOKEN_PER_GROUP);
-                k_vec_type k_group_vec(k_load_group_vec);
-                vec_op::fma(group_accums[token_group_idx], q_group_vec,
-                            k_group_vec);
-                vec_op::prefetch(k_block + x * BLOCK_SIZE +
-                                 token_group_idx * x * TOKEN_PER_GROUP);
-              });
-        }
-      }
-    }
-
-    for (int token_group_idx = 0; token_group_idx < group_num;
-         ++token_group_idx) {
-      vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
-          [&group_accums, logits, scale, token_group_idx](int token_idx) {
-            float dot_v =
-                group_accums[token_group_idx]
-                    .template reduce_sub_sum<qk_acc_vec_type::get_elem_num() /
-                                             TOKEN_PER_GROUP>(token_idx);
-            logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
-                dot_v * scale;
-          });
-    }
-  }
-};
-
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
-          int HEAD_PARTITION_SIZE, typename acc_t>
-FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
-                                   acc_t&& acc) {
-  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
-  constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
-  static_assert(BLOCK_SIZE == ELEM_NUM);
-  vec_op::FP32Vec16 prob_vec(prob);
-
-  vec_op::unroll_loop<int, HEAD_PARTITION_SIZE>([&](int head_elem_idx) {
-    v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
-    vec_op::FP32Vec16 fp32_v_vec(v_vec);
-    acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
-  });
-}
-};  // namespace
-
-// Paged attention v1
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
-struct paged_attention_v1_impl {
-  static void call(
-      scalar_t* __restrict__ out,            // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ q,        // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size/x, block_size, x]
-      const scalar_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size, block_size]
-      const int num_kv_heads, const float scale,
-      const int* __restrict__ block_tables,  // [num_seqs,
-                                             // max_num_blocks_per_seq]
-      const int* __restrict__ seq_lens,      // [num_seqs]
-      const int max_num_blocks_per_seq,
-      const float* __restrict__ alibi_slopes,  // [num_heads]
-      const int q_stride, const int kv_block_stride, const int kv_head_stride,
-      const int num_seqs, const int num_heads) {
-    constexpr int x = 16 / sizeof(scalar_t);
-    const int num_queries_per_kv = num_heads / num_kv_heads;
-
-    static_assert(BLOCK_SIZE == 16);
-
-    int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
-    int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
-    TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
-
-    const int parallel_work_item_num = omp_get_max_threads();
-
-    size_t logits_bytes =
-        parallel_work_item_num * max_seq_len_padded * sizeof(float);
-    float* logits = (float*)std::aligned_alloc(
-        64, logits_bytes);  // Cacheline alignment for each context token.
-                            // [parallel_work_item_num, max_seq_len_padded]
-
-#pragma omp parallel for collapse(2) schedule(dynamic, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        int seq_len = seq_lens[seq_idx];
-        const int* seq_block_table =
-            block_tables + max_num_blocks_per_seq * seq_idx;
-        const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        const int64_t kv_head_idx = head_idx / num_queries_per_kv;
-        const scalar_t* __restrict__ q_vec_ptr =
-            q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-        const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
-        float* __restrict__ thread_block_logits =
-            logits + omp_get_thread_num() * max_seq_len_padded;
-
-        // Compute logits
-        for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-          const int64_t physical_block_idx = seq_block_table[block_idx];
-          const scalar_t* __restrict__ k_block_cache_ptr =
-              k_cache + physical_block_idx * kv_block_stride +
-              kv_head_idx * kv_head_stride;
-          float* __restrict__ head_block_logits =
-              thread_block_logits + block_idx * BLOCK_SIZE;
-
-          reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
-              q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
-              block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
-        }
-
-        // Compute softmax
-        if (alibi_slopes) {
-          reduceSoftmaxAlibi(thread_block_logits, seq_len,
-                             block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
-                             seq_len);
-        } else {
-          reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
-        }
-
-        // Compute value
-        constexpr int head_elem_num_per_partition = 16;
-        constexpr int head_partition_num =
-            HEAD_SIZE / head_elem_num_per_partition;
-        for (int head_part_idx = 0; head_part_idx < head_partition_num;
-             ++head_part_idx) {
-          vec_op::FP32Vec16 accums[head_elem_num_per_partition];
-          scalar_t* __restrict__ out_ptr =
-              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
-              head_part_idx * head_elem_num_per_partition;
-          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-            const int64_t physical_block_idx = seq_block_table[block_idx];
-            const float* __restrict__ prob_vec_ptr =
-                thread_block_logits + block_idx * BLOCK_SIZE;
-            const scalar_t* __restrict__ v_block_cache_ptr =
-                v_cache + physical_block_idx * kv_block_stride +
-                kv_head_idx * kv_head_stride +
-                BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-            reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
-                             head_elem_num_per_partition>(
-                prob_vec_ptr, v_block_cache_ptr, accums);
-
-            if (block_idx != block_num - 1) {
-              const int64_t next_physical_block_idx =
-                  seq_block_table[block_idx + 1];
-              const scalar_t* __restrict__ next_v_block_cache_ptr =
-                  v_cache + next_physical_block_idx * kv_block_stride +
-                  kv_head_idx * kv_head_stride +
-                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-              vec_op::unroll_loop<int, head_elem_num_per_partition>(
-                  [&](int head_elem_idx) {
-                    if (head_elem_idx % 2 == 0) {
-                      vec_op::prefetch(next_v_block_cache_ptr +
-                                       BLOCK_SIZE * head_elem_idx);
-                    }
-                  });
-            }
-          }
-
-          vec_op::unroll_loop<int, head_elem_num_per_partition>(
-              [&](int head_elem_idx) {
-                float value = accums[head_elem_idx].reduce_sum();
-                vec_op::storeFP32(value, out_ptr + head_elem_idx);
-              });
-        }
-      }
-    }
-    std::free(logits);
-  }
-};
-
-#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
-  paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call(                     \
-      out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
-      block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,                  \
-      alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs,   \
-      num_heads);
-
-template <typename T, int BLOCK_SIZE>
-void paged_attention_v1_impl_launcher(
-    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const std::optional<torch::Tensor>& alibi_slopes) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
-  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
-  switch (head_size) {
-    case 32:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
-      break;
-    case 64:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
-      break;
-    case 80:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
-      break;
-    case 96:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
-      break;
-    case 112:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
-      break;
-    case 128:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
-      break;
-    case 192:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
-      break;
-    case 256:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE)                               \
-  paged_attention_v1_impl_launcher<T, BLOCK_SIZE>(                           \
-      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
-      seq_lens, max_seq_len, alibi_slopes);
-
-#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T)                     \
-  switch (block_size) {                                           \
-    case 16:                                                      \
-      CALL_V1_KERNEL_LAUNCHER(T, 16);                             \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
-  }
-}  // namespace
-
-void paged_attention_v1(
-    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
-  TORCH_CHECK(blocksparse_vert_stride <= 1,
-              "CPU backend does not support blocksparse attention yet.");
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
-                               [&] {
-                                 CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
-                                 CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
-                                 CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
-                               });
-}
-
-// Paged attention v2
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
-struct paged_attention_v2_impl {
-  static void call(
-      scalar_t* __restrict__ out,            // [num_seqs, num_heads, head_size]
-      float* __restrict__ exp_sums,          // [num_seqs, num_heads,
-                                             // max_num_partitions]
-      float* __restrict__ max_logits,        // [num_seqs, num_heads,
-                                             // max_num_partitions]
-      scalar_t* __restrict__ tmp_out,        // [num_seqs, num_heads,
-                                             // max_num_partitions, head_size]
-      const scalar_t* __restrict__ q,        // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size/x, block_size, x]
-      const scalar_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size, block_size]
-      const int num_kv_heads, const float scale,
-      const int* __restrict__ block_tables,  // [num_seqs,
-                                             // max_num_blocks_per_seq]
-      const int* __restrict__ seq_lens,      // [num_seqs]
-      const int max_num_blocks_per_seq,
-      const float* __restrict__ alibi_slopes,  // [num_heads]
-      const int q_stride, const int kv_block_stride, const int kv_head_stride,
-      const int num_seqs, const int num_heads, const int max_num_partitions) {
-    constexpr int x = 16 / sizeof(scalar_t);
-    const int num_queries_per_kv = num_heads / num_kv_heads;
-
-    static_assert(BLOCK_SIZE == 16);
-    static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
-    static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);
-
-#pragma omp parallel for collapse(3) schedule(static, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int partition_idx = 0; partition_idx < max_num_partitions;
-           ++partition_idx) {
-        for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-          const int seq_len = seq_lens[seq_idx];
-          const int start_token_idx = partition_idx * PARTITION_SIZE;
-
-          if (start_token_idx >= seq_len) continue;
-
-          const int partition_num =
-              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-          const bool no_reduce = (partition_num == 1);
-          const int token_num =
-              (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
-               start_token_idx);
-          const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
-          const int last_block_token_num =
-              token_num - (block_num - 1) * BLOCK_SIZE;
-          const int* seq_block_table = block_tables +
-                                       max_num_blocks_per_seq * seq_idx +
-                                       start_token_idx / BLOCK_SIZE;
-          const int64_t kv_head_idx = head_idx / num_queries_per_kv;
-          const scalar_t* __restrict__ q_vec_ptr =
-              q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-
-          float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
-
-          // Compute logits
-          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-            const int64_t physical_block_idx = seq_block_table[block_idx];
-            const scalar_t* __restrict__ k_block_cache_ptr =
-                k_cache + physical_block_idx * kv_block_stride +
-                kv_head_idx * kv_head_stride;
-            float* __restrict__ head_block_logits =
-                logits + block_idx * BLOCK_SIZE;
-
-            reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
-                q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
-                block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
-          }
-
-          std::pair<float, float> max_and_sum;
-          if (alibi_slopes) {
-            max_and_sum = reduceSoftmaxAlibi(
-                logits, token_num, block_num * BLOCK_SIZE,
-                alibi_slopes[head_idx], start_token_idx, seq_len);
-          } else {
-            max_and_sum =
-                reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
-          }
-
-          auto&& [max_logit, exp_sum] = max_and_sum;
-
-          scalar_t* __restrict__ output_buffer = nullptr;
-          if (!no_reduce) {
-            auto idx = seq_idx * num_heads * max_num_partitions +
-                       head_idx * max_num_partitions + partition_idx;
-            max_logits[idx] = max_logit;
-            exp_sums[idx] = exp_sum;
-            output_buffer =
-                tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
-                head_idx * max_num_partitions * HEAD_SIZE +
-                partition_idx * HEAD_SIZE;
-          } else {
-            output_buffer =
-                out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
-          }
-
-          // Compute value
-          constexpr int head_elem_num_per_partition = 16;
-          constexpr int head_partition_num =
-              HEAD_SIZE / head_elem_num_per_partition;
-          for (int head_part_idx = 0; head_part_idx < head_partition_num;
-               ++head_part_idx) {
-            vec_op::FP32Vec16 accums[head_elem_num_per_partition];
-            scalar_t* __restrict__ out_ptr =
-                output_buffer + head_part_idx * head_elem_num_per_partition;
-            for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-              const int64_t physical_block_idx = seq_block_table[block_idx];
-              const float* __restrict__ prob_vec_ptr =
-                  logits + block_idx * BLOCK_SIZE;
-              const scalar_t* __restrict__ v_block_cache_ptr =
-                  v_cache + physical_block_idx * kv_block_stride +
-                  kv_head_idx * kv_head_stride +
-                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-              reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
-                               head_elem_num_per_partition>(
-                  prob_vec_ptr, v_block_cache_ptr, accums);
-
-              if (block_idx != block_num - 1) {
-                const int64_t next_physical_block_idx =
-                    seq_block_table[block_idx + 1];
-                const scalar_t* __restrict__ next_v_block_cache_ptr =
-                    v_cache + next_physical_block_idx * kv_block_stride +
-                    kv_head_idx * kv_head_stride +
-                    BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-                vec_op::unroll_loop<int, head_elem_num_per_partition>(
-                    [&](int head_elem_idx) {
-                      if (head_elem_idx % 2 == 0) {
-                        vec_op::prefetch(next_v_block_cache_ptr +
-                                         BLOCK_SIZE * head_elem_idx);
-                      }
-                    });
-              }
-            }
-
-            vec_op::unroll_loop<int, head_elem_num_per_partition>(
-                [&](int head_elem_idx) {
-                  float value = accums[head_elem_idx].reduce_sum();
-                  vec_op::storeFP32(value, out_ptr + head_elem_idx);
-                });
-          }
-        }
-      }
-    }
-
-    // Rescale partition softmax and store the factors to exp_sums
-#pragma omp parallel for collapse(2) schedule(static, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        const int seq_len = seq_lens[seq_idx];
-        const int partition_num =
-            (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
-        if (partition_num == 1) continue;
-
-        reducePartitionSoftmax(
-            max_logits + seq_idx * num_heads * max_num_partitions +
-                head_idx * max_num_partitions,
-            exp_sums + seq_idx * num_heads * max_num_partitions +
-                head_idx * max_num_partitions,
-            partition_num);
-      }
-    }
-
-    // Reduce values
-    using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
-    static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
-    constexpr int head_elem_num_per_group =
-        16;  // Note: didn't align with the cacheline size, due to some
-             // HEAD_SIZE didn't align with 64 bytes
-    static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
-    constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
-    const float* __restrict__ rescale_factors = exp_sums;
-#pragma omp parallel for collapse(3) schedule(static, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
-          const int seq_len = seq_lens[seq_idx];
-          const int partition_num =
-              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
-          if (partition_num == 1) continue;
-
-          const float* __restrict__ seq_head_rescale_factors =
-              rescale_factors + seq_idx * num_heads * max_num_partitions +
-              head_idx * max_num_partitions;
-          const scalar_t* __restrict__ seq_head_tmp_out =
-              tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
-              head_idx * max_num_partitions * HEAD_SIZE +
-              group_idx * head_elem_num_per_group;
-          scalar_t* __restrict__ seq_head_output =
-              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
-              group_idx * head_elem_num_per_group;
-
-          vec_op::FP32Vec16 acc;
-          for (int i = 0; i < partition_num; ++i) {
-            vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
-            v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
-            vec_op::FP32Vec16 fp32_value(value);
-            acc = acc + fp32_value * rescale_factor;
-          }
-          v_load_vec_type cast_acc(acc);
-          cast_acc.save(seq_head_output);
-        }
-      }
-    }
-  }
-};
-
-#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                 \
-  paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call(   \
-      out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr,         \
-      key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
-      seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,      \
-      kv_block_stride, kv_head_stride, num_seqs, num_heads,                  \
-      max_num_partitions);
-
-template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
-void paged_attention_v2_impl_launcher(
-    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
-    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-  int max_num_partitions = exp_sums.size(-1);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
-  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
-  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
-  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
-  switch (head_size) {
-    case 32:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
-      break;
-    case 64:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
-      break;
-    case 80:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
-      break;
-    case 96:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
-      break;
-    case 112:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
-      break;
-    case 128:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
-      break;
-    case 192:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
-      break;
-    case 256:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE)                              \
-  paged_attention_v2_impl_launcher<T, BLOCK_SIZE>(                          \
-      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
-      num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
-      alibi_slopes);
-
-#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T)                     \
-  switch (block_size) {                                           \
-    case 16:                                                      \
-      CALL_V2_KERNEL_LAUNCHER(T, 16);                             \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
-  }
-}  // namespace
-
-void paged_attention_v2(
-    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
-    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
-  TORCH_CHECK(blocksparse_vert_stride <= 1,
-              "CPU backend does not support blocksparse attention yet.");
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
-                               [&] {
-                                 CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
-                                 CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
-                                 CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
-                               });
-}
\ No newline at end of file
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
deleted file mode 100644
index 69f6d06e3c9679f28000a262df0520bb2a549c9d..0000000000000000000000000000000000000000
--- a/csrc/cpu/cache.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-#include <map>
-#include <vector>
-
-#include "cpu_types.hpp"
-
-#if defined(__x86_64__)
-  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
-#else
-  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
-#endif
-
-namespace {
-template <typename scalar_t>
-void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
-                          std::vector<torch::Tensor> const& value_caches,
-                          const torch::Tensor& mapping_pairs,
-                          const int element_num_per_block,
-                          const int layer_num) {
-  const size_t pair_num = mapping_pairs.size(0);
-  const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
-#pragma omp parallel for collapse(2)
-  for (int layer = 0; layer < layer_num; ++layer) {
-    for (size_t pair = 0; pair < pair_num; ++pair) {
-      int64_t source_offset =
-          element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
-      int64_t target_offset =
-          element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
-      scalar_t* key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
-      scalar_t* source_ptr = key_cache_ptr + source_offset;
-      scalar_t* target_ptr = key_cache_ptr + target_offset;
-      std::memcpy(target_ptr, source_ptr, block_bytes);
-
-      scalar_t* value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
-      source_ptr = value_cache_ptr + source_offset;
-      target_ptr = value_cache_ptr + target_offset;
-      std::memcpy(target_ptr, source_ptr, block_bytes);
-    }
-  }
-}
-
-template <typename scalar_t>
-void reshape_and_cache_cpu_impl(
-    const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
-    scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
-    const int64_t* __restrict__ slot_mapping, const int num_tokens,
-    const int key_stride, const int value_stride, const int num_heads,
-    const int head_size, const int block_size, const int x) {
-  const int block_elem_num = num_heads * head_size * block_size;
-
-#pragma omp parallel for collapse(2)
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-      const int64_t slot_idx = slot_mapping[token_idx];
-      if (slot_idx >= 0) {
-        int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
-        int src_value_head_idx =
-            token_idx * value_stride + head_idx * head_size;
-        const scalar_t* src_key_head_ptr = key + src_key_head_idx;
-        const scalar_t* src_value_head_ptr = value + src_value_head_idx;
-        const int64_t block_index = slot_idx / block_size;
-        const int64_t block_offset = slot_idx % block_size;
-        scalar_t* target_key_head_ptr = key_cache +
-                                        block_elem_num * block_index +
-                                        head_idx * block_size * head_size;
-        scalar_t* target_value_head_ptr = value_cache +
-                                          block_elem_num * block_index +
-                                          head_idx * block_size * head_size;
-
-        for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) {
-          const int64_t target_offset =
-              src_key_idx * block_size + block_offset * x;
-          for (int i = 0; i < x; ++i) {
-            target_key_head_ptr[target_offset + i] =
-                src_key_head_ptr[src_key_idx + i];
-          }
-        }
-
-        for (int src_value_idx = 0; src_value_idx < head_size;
-             ++src_value_idx) {
-          const int64_t target_offset =
-              src_value_idx * block_size + block_offset;
-          target_value_head_ptr[target_offset] =
-              src_value_head_ptr[src_value_idx];
-        }
-      }
-    }
-  }
-}
-};  // namespace
-
-template <typename scalar_t>
-void concat_and_cache_mla_cpu_impl(
-    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
-    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
-    scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
-                                      // + pe_dim)]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int num_tokens,                      //
-    const int block_stride,                    //
-    const int entry_stride,                    //
-    const int kv_c_stride,                     //
-    const int k_pe_stride,                     //
-    const int kv_lora_rank,                    //
-    const int pe_dim,                          //
-    const int block_size                       //
-) {
-#pragma omp parallel for
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    const int64_t slot_idx = slot_mapping[token_idx];
-    // NOTE: slot_idx can be -1 if the token is padded
-    if (slot_idx < 0) {
-      continue;
-    }
-    const int64_t block_idx = slot_idx / block_size;
-    const int64_t block_offset = slot_idx % block_size;
-
-    auto copy = [&](const scalar_t* __restrict__ src,
-                    scalar_t* __restrict__ dst, int src_stride, int dst_stride,
-                    int size, int offset) {
-      for (int i = 0; i < size; i++) {
-        const int64_t src_idx = token_idx * src_stride + i;
-        const int64_t dst_idx =
-            block_idx * block_stride + block_offset * entry_stride + i + offset;
-        dst[dst_idx] = src[src_idx];
-      }
-    };
-
-    copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
-    copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
-  }
-}
-
-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  unsigned num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
-    return;
-  }
-
-  const int element_num_per_block = key_caches[0][0].numel();
-  DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
-    CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
-    copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
-                                   element_num_per_block, num_layers);
-    CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
-  });
-}
-
-void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
-                       torch::Tensor& key_cache, torch::Tensor& value_cache,
-                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype,
-                       torch::Tensor& k_scale, torch::Tensor& v_scale) {
-  int num_tokens = key.size(0);
-  int num_heads = key.size(1);
-  int head_size = key.size(2);
-  int block_size = key_cache.size(3);
-  int x = key_cache.size(4);
-
-  int key_stride = key.stride(0);
-  int value_stride = value.stride(0);
-
-  DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
-    CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
-    reshape_and_cache_cpu_impl<scalar_t>(
-        key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-        key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
-        slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
-        num_heads, head_size, block_size, x);
-    CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
-  });
-}
-
-void concat_and_cache_mla(
-    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
-    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
-                                  // pe_dim)]
-    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, torch::Tensor& scale) {
-  int num_tokens = slot_mapping.size(0);
-  int kv_lora_rank = kv_c.size(1);
-  int pe_dim = k_pe.size(1);
-  int block_size = kv_cache.size(1);
-
-  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-  TORCH_CHECK(kv_cache_dtype != "fp8");
-
-  int kv_c_stride = kv_c.stride(0);
-  int k_pe_stride = k_pe.stride(0);
-  int block_stride = kv_cache.stride(0);
-  int entry_stride = kv_cache.stride(1);
-
-  VLLM_DISPATCH_FLOATING_TYPES(
-      kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
-        CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
-        concat_and_cache_mla_cpu_impl<scalar_t>(
-            kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
-            kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
-            num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
-            kv_lora_rank, pe_dim, block_size);
-        CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
-      });
-}
-
-void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
-                 const torch::Tensor& block_mapping) {
-  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
-}
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50f17c758c148774fe331c30200232f18c35b7bb
--- /dev/null
+++ b/csrc/cpu/cpu_attn.cpp
@@ -0,0 +1,249 @@
+#include "cpu_attn_vec.hpp"
+#include "cpu_attn_vec16.hpp"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu_attn_amx.hpp"
+  #define AMX_DISPATCH(...)                                                   \
+    case cpu_attention::ISA::AMX: {                                           \
+      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
+                                                     scalar_t, head_dim>;     \
+      return __VA_ARGS__();                                                   \
+    }
+#else
+  #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
+#endif
+
+#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
+  case HEAD_DIM: {                            \
+    constexpr size_t head_dim = HEAD_DIM;     \
+    return __VA_ARGS__();                     \
+  }
+
+#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)           \
+  [&] {                                                         \
+    switch (HEAD_DIM) {                                         \
+      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                  \
+      default: {                                                \
+        TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
+                               std::to_string(HEAD_DIM));       \
+      }                                                         \
+    }                                                           \
+  }()
+
+#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                 \
+  [&] {                                                                       \
+    switch (ISA_TYPE) {                                                       \
+      AMX_DISPATCH(__VA_ARGS__)                                               \
+      case cpu_attention::ISA::VEC: {                                         \
+        using attn_impl =                                                     \
+            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,   \
+                                         head_dim>;                           \
+        return __VA_ARGS__();                                                 \
+      }                                                                       \
+      case cpu_attention::ISA::VEC16: {                                       \
+        using attn_impl =                                                     \
+            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
+                                         head_dim>;                           \
+        return __VA_ARGS__();                                                 \
+      }                                                                       \
+      default: {                                                              \
+        TORCH_CHECK(false, "Invalid CPU attention ISA type.");                \
+      }                                                                       \
+    }                                                                         \
+  }()
+
+torch::Tensor get_scheduler_metadata(
+    const int64_t num_req, const int64_t num_heads_q,
+    const int64_t num_heads_kv, const int64_t head_dim,
+    const torch::Tensor& seq_lens, at::ScalarType dtype,
+    const torch::Tensor& query_start_loc, const bool casual,
+    const int64_t window_size, const std::string& isa_hint,
+    const bool enable_kv_split) {
+  cpu_attention::ISA isa;
+  if (isa_hint == "amx") {
+    isa = cpu_attention::ISA::AMX;
+  } else if (isa_hint == "vec") {
+    isa = cpu_attention::ISA::VEC;
+  } else if (isa_hint == "vec16") {
+    isa = cpu_attention::ISA::VEC16;
+  } else {
+    TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
+  }
+
+  cpu_attention::AttentionScheduler::ScheduleInput input;
+  input.num_reqs = num_req;
+  input.num_heads_q = num_heads_q;
+  input.num_heads_kv = num_heads_kv;
+  input.head_dim = head_dim;
+  input.query_start_loc = query_start_loc.data_ptr<int32_t>();
+  input.seq_lens = seq_lens.data_ptr<int32_t>();
+  if (window_size != -1) {
+    input.left_sliding_window_size = window_size - 1;
+    if (casual) {
+      input.right_sliding_window_size = 0;
+    } else {
+      input.right_sliding_window_size = window_size - 1;
+    }
+  } else {
+    input.left_sliding_window_size = -1;
+    if (casual) {
+      input.right_sliding_window_size = 0;
+    } else {
+      input.right_sliding_window_size = -1;
+    }
+  }
+  input.casual = casual;
+  input.isa = isa;
+  input.enable_kv_split = enable_kv_split;
+  TORCH_CHECK(casual, "Only supports casual mask for now.");
+
+  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
+    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
+      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
+        input.elem_size = sizeof(scalar_t);
+        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
+        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
+        input.output_buffer_elem_size =
+            sizeof(attn_impl::partial_output_buffer_t);
+        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
+        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
+      });
+    });
+  });
+
+  cpu_attention::AttentionScheduler scheduler;
+  torch::Tensor metadata = scheduler.schedule(input);
+  return metadata;
+}
+
+void cpu_attn_reshape_and_cache(
+    const torch::Tensor& key,    // [token_num, head_num, head_size]
+    const torch::Tensor& value,  // [token_num, head_num, head_size]
+    torch::Tensor&
+        key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
+    torch::Tensor&
+        value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
+    const torch::Tensor& slot_mapping, const std::string& isa) {
+  TORCH_CHECK_EQ(key.dim(), 3);
+  TORCH_CHECK_EQ(value.dim(), 3);
+  TORCH_CHECK_EQ(key_cache.dim(), 4);
+  TORCH_CHECK_EQ(value_cache.dim(), 4);
+  TORCH_CHECK_EQ(key.stride(2), 1);
+  TORCH_CHECK_EQ(value.stride(2), 1);
+
+  const int64_t token_num = key.size(0);
+  const int64_t key_token_num_stride = key.stride(0);
+  const int64_t value_token_num_stride = value.stride(0);
+  const int64_t head_num = value.size(1);
+  const int64_t key_head_num_stride = key.stride(1);
+  const int64_t value_head_num_stride = value.stride(1);
+  const int64_t num_blocks = key_cache.size(0);
+  const int64_t num_blocks_stride = key_cache.stride(0);
+  const int64_t cache_head_num_stride = key_cache.stride(1);
+  const int64_t block_size = key_cache.size(2);
+  const int64_t block_size_stride = key_cache.stride(2);
+  const int64_t head_dim = key.size(-1);
+
+  cpu_attention::ISA isa_tag = [&]() {
+    if (isa == "amx") {
+      return cpu_attention::ISA::AMX;
+    } else if (isa == "vec") {
+      return cpu_attention::ISA::VEC;
+    } else if (isa == "vec16") {
+      return cpu_attention::ISA::VEC16;
+    } else {
+      TORCH_CHECK(false, "Invalid ISA type: " + isa);
+    }
+  }();
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
+        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
+          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
+            attn_impl::reshape_and_cache(
+                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+                key_cache.data_ptr<scalar_t>(),
+                value_cache.data_ptr<scalar_t>(),
+                slot_mapping.data_ptr<int64_t>(), token_num,
+                key_token_num_stride, value_token_num_stride, head_num,
+                key_head_num_stride, value_head_num_stride, num_blocks,
+                num_blocks_stride, cache_head_num_stride, block_size,
+                block_size_stride);
+          });
+        });
+      });
+}
+
+void cpu_attention_with_kv_cache(
+    const torch::Tensor& query,  // [num_tokens, num_heads, head_size]
+    const torch::Tensor&
+        key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
+    const torch::Tensor&
+        value_cache,        // [num_blocks, num_kv_heads, block_size, head_size]
+    torch::Tensor& output,  // [num_tokens, num_heads, head_size]
+    const torch::Tensor& query_start_loc,  // [num_tokens + 1]
+    const torch::Tensor& seq_lens,         // [num_tokens]
+    const double scale, const bool causal,
+    const std::optional<torch::Tensor>& alibi_slopes,  // [num_heads]
+    const int64_t sliding_window_left, const int64_t sliding_window_right,
+    const torch::Tensor& block_table,  // [num_tokens, max_block_num]
+    const double softcap, const torch::Tensor& scheduler_metadata,
+    const std::optional<torch::Tensor>& s_aux  // [num_heads]
+) {
+  TORCH_CHECK_EQ(query.dim(), 3);
+  TORCH_CHECK_EQ(query.stride(2), 1);
+  TORCH_CHECK_EQ(key_cache.dim(), 4);
+  TORCH_CHECK_EQ(value_cache.dim(), 4);
+
+  cpu_attention::AttentionInput input;
+  input.metadata = reinterpret_cast<cpu_attention::AttentionMetadata*>(
+      scheduler_metadata.data_ptr());
+  input.num_tokens = query.size(0);
+  input.num_heads = query.size(1);
+  input.num_kv_heads = key_cache.size(1);
+  input.block_size = key_cache.size(2);
+  input.query = query.data_ptr();
+  input.query_num_tokens_stride = query.stride(0);
+  input.query_num_heads_stride = query.stride(1);
+  input.cache_num_blocks_stride = key_cache.stride(0);
+  input.cache_num_kv_heads_stride = key_cache.stride(1);
+  input.blt_num_tokens_stride = block_table.stride(0);
+  input.key_cache = key_cache.data_ptr();
+  input.value_cache = value_cache.data_ptr();
+  input.output = output.data_ptr();
+  input.query_start_loc = query_start_loc.data_ptr<int32_t>();
+  input.seq_lens = seq_lens.data_ptr<int32_t>();
+  input.block_table = block_table.data_ptr<int32_t>();
+  input.alibi_slopes =
+      alibi_slopes.has_value() ? alibi_slopes->data_ptr<float>() : nullptr;
+  // For now sink must be bf16
+  input.s_aux = s_aux.has_value() ? s_aux->data_ptr<c10::BFloat16>() : nullptr;
+  input.scale = scale;
+  input.causal = causal;
+  input.sliding_window_left = sliding_window_left;
+  input.sliding_window_right = sliding_window_right;
+  if (input.causal) {
+    // to make boundary calculation easier
+    input.sliding_window_right = 0;
+  }
+  float softcap_fp32 = softcap;
+  input.softcap = softcap_fp32;
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
+        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
+          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
+            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
+            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
+            mainloop(&input);
+          });
+        });
+      });
+}
diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8da458b99119c31667ff875eeb947e5979f65968
--- /dev/null
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -0,0 +1,511 @@
+#ifndef CPU_ATTN_AMX_HPP
+#define CPU_ATTN_AMX_HPP
+
+#include "cpu_attn_impl.hpp"
+
+namespace cpu_attention {
+namespace {
+// AMX specific
+constexpr static int64_t AMX_TILE_ROW_BYTES = 64;
+constexpr static int64_t AMX_TILE_ROW_NUM = 16;
+constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM;
+
+typedef struct __tile_config {
+  uint8_t palette_id = 1;
+  uint8_t start_row = 0;
+  uint8_t reserved_0[14] = {0};
+  uint16_t colsb[16] = {0};
+  uint8_t rows[16] = {0};
+} __tilecfg;
+
+// 2-2-4 pattern, for 16 < m <= 32
+// TILE 0, 1: load A matrix, row num should be 16, m - 16
+// TILE 2, 3: load B matrix, row num should be 16
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
+// - 16
+template <typename kv_cache_t>
+class TileGemm224 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
+                                void* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224");
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224");
+  }
+};
+
+template <>
+class TileGemm224<c10::BFloat16> {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                c10::BFloat16* __restrict__ a_tile,
+                                c10::BFloat16* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    const int32_t k_times =
+        dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+    c10::BFloat16* __restrict__ a_tile_0 = a_tile;
+    c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
+    const int64_t a_tile_stride = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // q_buffer is prepacked
+        return AMX_TILE_ROW_BYTES;
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // logits_buffer is row-major
+        return lda * sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+
+    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
+    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // k_cache is prepacked
+        return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // v_cache is prepacked
+        return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+    // k_cache, v_cache are prepacked
+    const int32_t b_tile_stride = AMX_TILE_ROW_BYTES;
+
+    // logits_buffer, output_buffer are not prepacked
+    float* __restrict__ c_tile_4 = c_tile;
+    float* __restrict__ c_tile_5 =
+        c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float);
+    float* __restrict__ c_tile_6 = c_tile + AMX_TILE_ROW_NUM * ldc;
+    float* __restrict__ c_tile_7 =
+        c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float);
+    const int32_t c_tile_stride = ldc * sizeof(float);
+
+    if (accum_c) {
+      _tile_loadd(4, c_tile_4, c_tile_stride);
+      _tile_loadd(5, c_tile_5, c_tile_stride);
+      _tile_loadd(6, c_tile_6, c_tile_stride);
+      _tile_loadd(7, c_tile_7, c_tile_stride);
+    } else {
+      _tile_zero(4);
+      _tile_zero(5);
+      _tile_zero(6);
+      _tile_zero(7);
+    }
+
+    for (int32_t k = 0; k < k_times; ++k) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+      _tile_dpbf16ps(4, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+      _tile_dpbf16ps(5, 0, 3);
+      _tile_loadd(1, a_tile_1, a_tile_stride);
+      _tile_dpbf16ps(6, 1, 2);
+      _tile_dpbf16ps(7, 1, 3);
+
+      // update ptrs
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // Q buffer is prepacked
+        a_tile_0 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // P buffer is not prepacked
+        a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+      b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    }
+
+    _tile_stored(4, c_tile_4, c_tile_stride);
+    _tile_stored(5, c_tile_5, c_tile_stride);
+    _tile_stored(6, c_tile_6, c_tile_stride);
+    _tile_stored(7, c_tile_7, c_tile_stride);
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    const int32_t m_0 = AMX_TILE_ROW_NUM;
+    const int32_t m_1 = m - AMX_TILE_ROW_NUM;
+    config.rows[0] = m_0;
+    config.rows[1] = m_1;
+    config.rows[2] = AMX_TILE_ROW_NUM;
+    config.rows[3] = AMX_TILE_ROW_NUM;
+    config.rows[4] = m_0;
+    config.rows[5] = m_0;
+    config.rows[6] = m_1;
+    config.rows[7] = m_1;
+    _tile_loadconfig(&config);
+  }
+};
+
+// 1-2-2 pattern, for 0 < m <= 16
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
+// m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
+// num should be 16
+// TILE 6, 7, (6, 7): store results C matrix, row num should be
+// m
+template <typename kv_cache_t>
+class TileGemm122 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
+                                void* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122");
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122");
+  }
+};
+
+template <>
+class TileGemm122<c10::BFloat16> {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                c10::BFloat16* __restrict__ a_tile,
+                                c10::BFloat16* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    c10::BFloat16* __restrict__ a_tile_0 = a_tile;
+    c10::BFloat16* __restrict__ a_tile_1 = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // q_buffer is prepacked
+        return a_tile + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // logits_buffer is row-major
+        return a_tile + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+    const int64_t a_tile_stride = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // q_buffer is prepacked
+        return AMX_TILE_ROW_BYTES;
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // logits_buffer is row-major
+        return lda * sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+
+    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
+    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // k_cache is prepacked
+        return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // v_cache is prepacked
+        return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+    c10::BFloat16* __restrict__ b_tile_4 =
+        b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    c10::BFloat16* __restrict__ b_tile_5 =
+        b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    int64_t b_stride = AMX_TILE_ROW_BYTES;
+
+    float* __restrict__ c_tile_6 = c_tile;
+    float* __restrict__ c_tile_7 = c_tile + AMX_TILE_ROW_BYTES / sizeof(float);
+    int64_t c_stride = ldc * sizeof(float);
+
+    const int32_t k_times =
+        dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+    const int32_t k_group_times = k_times / 2;
+    const bool has_tail = (k_times % 2 == 1);
+
+    if (accum_c) {
+      _tile_loadd(6, c_tile_6, c_stride);
+      _tile_loadd(7, c_tile_7, c_stride);
+    } else {
+      _tile_zero(6);
+      _tile_zero(7);
+    }
+
+    for (int32_t k = 0; k < k_group_times; ++k) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_dpbf16ps(6, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_dpbf16ps(7, 0, 3);
+      _tile_loadd(1, a_tile_1, a_tile_stride);
+      _tile_stream_loadd(4, b_tile_4, b_stride);
+      _tile_dpbf16ps(6, 1, 4);
+      _tile_stream_loadd(5, b_tile_5, b_stride);
+      _tile_dpbf16ps(7, 1, 5);
+
+      // update ptrs
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // Q buffer is prepacked
+        a_tile_0 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // P buffer is not prepacked
+        a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      }
+      b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    }
+
+    if (has_tail) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_dpbf16ps(6, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_dpbf16ps(7, 0, 3);
+    }
+
+    _tile_stored(6, c_tile_6, c_stride);
+    _tile_stored(7, c_tile_7, c_stride);
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    config.rows[0] = m;
+    config.rows[1] = m;
+    config.rows[2] = AMX_TILE_ROW_NUM;
+    config.rows[3] = AMX_TILE_ROW_NUM;
+    config.rows[4] = AMX_TILE_ROW_NUM;
+    config.rows[5] = AMX_TILE_ROW_NUM;
+    config.rows[6] = m;
+    config.rows[7] = m;
+    _tile_loadconfig(&config);
+  }
+};
+}  // namespace
+
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = scalar_t;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = scalar_t;
+
+  constexpr static int64_t BlockSizeAlignment =
+      AMX_TILE_ROW_BYTES /
+      sizeof(kv_cache_t);  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      2 * (AMX_TILE_ROW_BYTES / 4);  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = 32;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::AMX;
+  constexpr static bool scale_on_logits = true;
+
+ public:
+  AttentionImpl() : current_q_head_num_(0) {
+    // Use all columns in AMX tiles
+    vec_op::unroll_loop<int, 8>([&](int i) { amx_tile_config_.colsb[i] = 64; });
+  }
+
+  ~AttentionImpl() { _tile_release(); }
+
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    if (q_head_num > AMX_TILE_ROW_NUM) {
+      if (q_head_num != current_q_head_num_) {
+        current_q_head_num_ = q_head_num;
+        TileGemm224<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+      }
+      attention<TileGemm224<kv_cache_t>> attention_iteration;
+      attention_iteration(CPU_ATTENTION_PARAMS);
+    } else {
+      if (q_head_num != current_q_head_num_) {
+        current_q_head_num_ = q_head_num;
+        TileGemm122<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+      }
+      attention<TileGemm122<kv_cache_t>> attention_iteration;
+      attention_iteration(CPU_ATTENTION_PARAMS);
+    }
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment * head_dim;
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment * (AMX_TILE_ROW_BYTES / 4);
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return block_size * HeadDimAlignment;
+  }
+
+  static void copy_q_heads_tile(
+      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
+      scalar_t* __restrict__ q_buffer, const int32_t q_num,
+      const int32_t q_heads_per_kv, const int64_t q_num_stride,
+      const int64_t q_head_stride, const float scale) {
+    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
+    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
+    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
+    constexpr int64_t head_elem_num_pre_block =
+        AMX_TILE_ROW_BYTES / sizeof(scalar_t);
+
+    int32_t idx = 0;
+    int8_t* __restrict__ q_buffer_iter = reinterpret_cast<int8_t*>(q_buffer);
+    for (int32_t q_num_idx = 0; q_num_idx < q_num;
+         ++q_num_idx, src += q_num_stride) {
+      scalar_t* __restrict__ src_iter = src;
+      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv;
+           ++q_head_idx, src_iter += q_head_stride) {
+        vec_op::unroll_loop<int32_t, head_size_block_num>(
+            [&](int32_t head_size_block_idx) {
+              // Use INT8Vec64 for 64 bytes block
+              vec_op::INT8Vec64 vec(src_iter + head_size_block_idx *
+                                                   head_elem_num_pre_block);
+              vec.save(q_buffer_iter + head_size_block_idx * AMX_TILE_BYTES);
+            });
+
+        ++idx;
+        q_buffer_iter += AMX_TILE_ROW_BYTES;
+        if ((idx & (AMX_TILE_ROW_NUM - 1)) == 0) {
+          // head is in another amx tile
+          q_buffer_iter -= AMX_TILE_ROW_NUM * AMX_TILE_ROW_BYTES;
+          q_buffer_iter += head_size_block_num * AMX_TILE_BYTES;
+        }
+      }
+    }
+  }
+
+  // reshape KV to AMX friendly layout
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride) {
+    // For AMX 2D tiles, size of each line is 64 bytes
+    constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
+    // For AMX B martix, N always is 16
+    constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
+    constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
+    // For now suppose block_size is divisible by amx_tile_column_num
+    TORCH_CHECK_EQ(block_size % amx_b_tile_k_size, 0);
+
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) {
+          // skip
+          continue;
+        }
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+        {
+          // Write Key
+          // Head elements should be packed as quand-words and stored in token
+          // groups with (quadword_stride/4) tokens
+          constexpr int64_t token_num_per_group = amx_tile_row_size / 4;
+          static_assert(head_dim % (4 / sizeof(scalar_t)) == 0);
+          constexpr int64_t quadword_num = head_dim / (4 / sizeof(scalar_t));
+          const int32_t* key_start_quadword_ptr =
+              reinterpret_cast<const int32_t*>(
+                  key + token_idx * key_token_num_stride +
+                  head_idx * key_head_num_stride);
+          const int64_t group_idx = block_offset / token_num_per_group;
+          const int64_t group_offset = block_offset % token_num_per_group;
+          constexpr int64_t quadword_num_per_group =
+              token_num_per_group * quadword_num;
+          int32_t* key_cache_start_ptr =
+              reinterpret_cast<int32_t*>(key_cache +
+                                         block_idx * num_blocks_stride +
+                                         head_idx * cache_head_num_stride) +
+              group_idx * quadword_num_per_group + group_offset;
+
+#pragma GCC unroll 8
+          for (int64_t i = 0, j = 0; j < quadword_num;
+               i += token_num_per_group, ++j) {
+            key_cache_start_ptr[i] = key_start_quadword_ptr[j];
+          }
+        }
+        {
+          // Write Value
+          // Different from Key, block_size dimension is packed rather than
+          // head_size dimension block_size dimension is packed as quand-words;
+          constexpr int64_t token_num_per_sub_group = 4 / sizeof(scalar_t);
+          const int64_t token_num_per_group = block_size;
+          constexpr int64_t head_elems_per_group = amx_b_tile_n_size;
+          const int64_t group_size = token_num_per_group * head_elems_per_group;
+          // For now suppose head_dim is divisible by amx_b_tile_n_size
+          static_assert(head_dim % head_elems_per_group == 0);
+          constexpr int64_t group_num = head_dim / head_elems_per_group;
+          const int64_t sub_group_idx = block_offset / token_num_per_sub_group;
+          const int64_t sub_group_offset =
+              block_offset % token_num_per_sub_group;
+
+          const scalar_t* value_start_ptr = value +
+                                            token_idx * value_token_num_stride +
+                                            head_idx * value_head_num_stride;
+          scalar_t* value_cache_start_ptr =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride +
+              sub_group_idx * token_num_per_sub_group * amx_b_tile_n_size +
+              sub_group_offset;
+
+          for (int64_t i = 0; i < group_num; ++i) {
+#pragma GCC unroll head_elems_per_group
+            for (int64_t j = 0, k = 0; j < head_elems_per_group;
+                 ++j, k += token_num_per_sub_group) {
+              value_cache_start_ptr[k] = value_start_ptr[j];
+            }
+            value_start_ptr += head_elems_per_group;
+            value_cache_start_ptr += group_size;
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  alignas(64) __tilecfg amx_tile_config_;
+  int32_t current_q_head_num_;
+};
+}  // namespace cpu_attention
+
+#endif
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..344296528b652d56e6c0f876ef37287891e6c51d
--- /dev/null
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -0,0 +1,2007 @@
+#ifndef CPU_ATTN_HPP
+#define CPU_ATTN_HPP
+
+#include <unistd.h>
+#include <type_traits>
+#include <cstddef>
+
+#if defined(__APPLE__)
+  #include <sys/sysctl.h>
+#endif
+
+#include "cpu_types.hpp"
+#include "scratchpad_manager.h"
+#include "cpu_attn_macros.h"
+
+namespace cpu_attention {
+enum class ISA { AMX, VEC, VEC16 };
+
+template <ISA isa, typename scalar_t, int64_t head_dim>
+class AttentionImpl {};
+
+struct AttentionWorkItemGroup {
+  int32_t req_id;
+  int32_t q_token_id_start;
+  int32_t q_token_num;
+  int32_t kv_split_pos_start;
+  int32_t kv_split_pos_end;
+
+  int64_t total_kv_len;
+  int32_t split_id;
+  int32_t local_split_id;
+
+  AttentionWorkItemGroup(const int32_t req_id, const int32_t q_token_id_start,
+                         const int32_t kv_split_pos_start,
+                         const int32_t kv_split_pos_end)
+      : req_id(req_id),
+        q_token_id_start(q_token_id_start),
+        q_token_num(0),
+        kv_split_pos_start(kv_split_pos_start),
+        kv_split_pos_end(kv_split_pos_end),
+        total_kv_len(0),
+        split_id(-1),
+        local_split_id(0) {}
+
+  std::string to_string() const {
+    std::stringstream ss;
+    ss << '[' << "req_id: " << req_id << ",\n";
+    ss << "q_token_id_start: " << q_token_id_start << ",\n";
+    ss << "q_token_num: " << q_token_num << ",\n";
+    ss << "kv_split_pos_start: " << kv_split_pos_start << ",\n";
+    ss << "kv_split_pos_end: " << kv_split_pos_end << ",\n";
+    ss << "total_kv_len: " << total_kv_len << ",\n";
+    ss << "split_id: " << split_id << ",\n";
+    ss << "local_split_id: " << local_split_id << ",\n";
+    ss << ']';
+
+    return ss.str();
+  }
+};
+
+struct ReductionWorkItemGroup {
+  int32_t req_id;
+  int32_t q_token_id_start;
+  int32_t q_token_id_num;
+  int32_t split_start_id;
+  int32_t split_num;
+
+  ReductionWorkItemGroup(const int32_t req_id, const int32_t q_token_id_start,
+                         const int32_t q_token_id_num,
+                         const int32_t split_start_id)
+      : req_id(req_id),
+        q_token_id_start(q_token_id_start),
+        q_token_id_num(q_token_id_num),
+        split_start_id(split_start_id),
+        split_num(0) {}
+
+  std::string to_string() const {
+    std::stringstream ss;
+    ss << '[' << "req_id: " << req_id << ",\n";
+    ss << "q_token_id_start: " << q_token_id_start << ",\n";
+    ss << "q_token_id_num: " << q_token_id_num << ",\n";
+    ss << "split_start_id: " << split_start_id << ",\n";
+    ss << "split_num: " << split_num << ",\n";
+    ss << ']';
+
+    return ss.str();
+  }
+};
+
+struct AttentionMetadata {
+  std::atomic_int64_t counter;
+  char _padding1[56];
+  ISA isa;
+  int32_t workitem_group_num;
+  int32_t reduction_item_num;
+  int32_t reduction_split_num;
+  int32_t thread_num;
+  int32_t effective_thread_num;  // non-zero item num in workitem_num_per_thread
+  int32_t split_kv_q_token_num_threshold;
+  int64_t attention_scratchpad_size_per_thread;
+  int64_t reduction_scratchpad_size_per_kv_head;
+  AttentionWorkItemGroup* workitem_groups_ptr;
+  ReductionWorkItemGroup* reduction_items_ptr;
+  int32_t cu_workitem_num_per_thread[1025] = {
+      0};  // prefix sum of workitem_num_per_thread
+  char _padding2[56];
+
+  AttentionMetadata(ISA isa, int32_t workitem_group_num,
+                    int32_t reduction_item_num, int32_t reduction_split_num,
+                    int32_t split_kv_q_token_num_threshold)
+      : isa(isa),
+        workitem_group_num(workitem_group_num),
+        reduction_item_num(reduction_item_num),
+        reduction_split_num(reduction_split_num),
+        thread_num(omp_get_max_threads()),
+        effective_thread_num(thread_num),
+        split_kv_q_token_num_threshold(split_kv_q_token_num_threshold),
+        attention_scratchpad_size_per_thread(0),
+        reduction_scratchpad_size_per_kv_head(0),
+        workitem_groups_ptr(
+            (AttentionWorkItemGroup*)((char*)this + sizeof(AttentionMetadata))),
+        reduction_items_ptr(
+            (ReductionWorkItemGroup*)((char*)this + sizeof(AttentionMetadata) +
+                                      workitem_group_num *
+                                          sizeof(AttentionWorkItemGroup))),
+        counter(0) {
+    TORCH_CHECK_LE(thread_num, 1024);
+    static_assert(sizeof(AttentionMetadata) % 64 == 0);
+    TORCH_CHECK(reinterpret_cast<size_t>(this) % 64 == 0);
+  }
+
+  void reset_counter() { counter.store(0); }
+
+  int64_t acquire_counter() { return counter++; }
+
+  void print() const {
+    std::stringstream ss;
+    ss << "ISA: ";
+    switch (isa) {
+      case ISA::AMX:
+        ss << "AMX, ";
+        break;
+      case ISA::VEC:
+        ss << "VEC, ";
+        break;
+    }
+    ss << "workitem_group_num: " << workitem_group_num
+       << ", reduction_item_num: " << reduction_item_num
+       << ", reduction_split_num: " << reduction_split_num
+       << ", thread_num: " << thread_num
+       << ", effective_thread_num: " << effective_thread_num
+       << ", attention_scratchpad_size_per_thread: "
+       << attention_scratchpad_size_per_thread
+       << ", reduction_scratchpad_size_per_kv_head: "
+       << reduction_scratchpad_size_per_kv_head << ", workitem groups:\n";
+    for (int32_t i = 0; i < workitem_group_num; ++i) {
+      ss << (workitem_groups_ptr + i)->to_string() << ",\n";
+    }
+
+    ss << "cu_workitem_num_per_thread: [";
+    for (int32_t i = 0; i < thread_num + 1; ++i) {
+      ss << cu_workitem_num_per_thread[i] << ", ";
+    }
+    ss << "]\n";
+
+    ss << "reduction items: \n";
+
+    for (int32_t i = 0; i < reduction_item_num; ++i) {
+      ss << (reduction_items_ptr + i)->to_string() << ",\n";
+    }
+
+    std::printf("%s", ss.str().c_str());
+  }
+};
+
+// Thread attention scratchpad contains:
+//  - Q: q_tile_size * head_dim * q_buffer_elem_size, gather Q heads, especially
+//  for GQA
+//  - Q@K^T: max_num_q_per_iter * k_tile_size * logits_buffer_elem_size, logits
+//  - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
+//  * q_tile_size * 4, partial output, max + sum (float)
+// Reduction scratchpad contains:
+//  - flags: bool array to indicate wether the split is finished
+//  - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
+//  - max, sum: 2 * split_num * q_tile_size * 4
+class AttentionScratchPad {
+ public:
+  AttentionScratchPad(int64_t thread_id,
+                      const AttentionMetadata& attention_metadata,
+                      void* scratchpad_ptr)
+      : thread_scratchpad_ptr(
+            static_cast<int8_t*>(scratchpad_ptr) +
+            thread_id *
+                attention_metadata.attention_scratchpad_size_per_thread),
+        reduction_scratchpad_ptr(
+            static_cast<int8_t*>(scratchpad_ptr) +
+            attention_metadata.thread_num *
+                attention_metadata.attention_scratchpad_size_per_thread),
+        reduction_scratchpad_size_per_kv_head(
+            attention_metadata.reduction_scratchpad_size_per_kv_head) {}
+
+  // for attention
+  void update(const int64_t head_dim, const int64_t q_buffer_elem_size,
+              const int64_t logits_buffer_elem_size,
+              const int64_t output_buffer_elem_size,
+              const int64_t max_num_q_per_iter, const int64_t q_head_tile_size,
+              const int64_t kv_tile_size) {
+    int64_t buffer_offset = 0;
+    q_buffer_offset_ = buffer_offset;
+    buffer_offset +=
+        calcu_q_buffer_size(q_head_tile_size, head_dim, q_buffer_elem_size);
+    logits_buffer_offset_ = buffer_offset;
+    buffer_offset += calcu_logits_buffer_size(max_num_q_per_iter, kv_tile_size,
+                                              logits_buffer_elem_size);
+    output_buffer_offset_ = buffer_offset;
+    buffer_offset += calcu_partial_output_buffer_size(
+        q_head_tile_size, head_dim, output_buffer_elem_size);
+    max_buffer_offset_ = buffer_offset;
+    buffer_offset += calcu_partial_output_max_sum_buffer_size(q_head_tile_size);
+    sum_buffer_offset_ = buffer_offset;
+  }
+
+  // for reduction
+  void update(const int32_t kv_head_idx, const int32_t total_split_num,
+              const int64_t head_dim, const int64_t q_head_tile_size,
+              const int64_t output_buffer_elem_size) {
+    int64_t buffer_offset = kv_head_idx * reduction_scratchpad_size_per_kv_head;
+    reduce_flag_buffer_offset_ = buffer_offset;
+    buffer_offset += calcu_reduce_flag_buffer_size(total_split_num);
+    reduce_output_buffer_offset_ = buffer_offset;
+    buffer_offset += calcu_reduce_output_buffer_size(
+        total_split_num, q_head_tile_size, head_dim, output_buffer_elem_size);
+    reduce_max_buffer_offset_ = buffer_offset;
+    buffer_offset +=
+        calcu_reduce_max_sum_buffer_size(total_split_num, q_head_tile_size);
+    reduce_sum_buffer_offset_ = buffer_offset;
+  }
+
+  template <typename T>
+  T* get_q_buffer() {
+    return reinterpret_cast<T*>(thread_scratchpad_ptr + q_buffer_offset_);
+  }
+
+  float* get_logits_buffer() {
+    return reinterpret_cast<float*>(thread_scratchpad_ptr +
+                                    logits_buffer_offset_);
+  }
+
+  float* get_output_buffer() {
+    return reinterpret_cast<float*>(thread_scratchpad_ptr +
+                                    output_buffer_offset_);
+  }
+
+  float* get_max_buffer() {
+    return reinterpret_cast<float*>(thread_scratchpad_ptr + max_buffer_offset_);
+  }
+
+  float* get_sum_buffer() {
+    return reinterpret_cast<float*>(thread_scratchpad_ptr + sum_buffer_offset_);
+  }
+
+  volatile bool* get_reduce_flag_buffer() {
+    return reinterpret_cast<volatile bool*>(reduction_scratchpad_ptr +
+                                            reduce_flag_buffer_offset_);
+  }
+
+  float* get_reduce_output_buffer() {
+    return reinterpret_cast<float*>(reduction_scratchpad_ptr +
+                                    reduce_output_buffer_offset_);
+  }
+
+  float* get_reduce_max_buffer() {
+    return reinterpret_cast<float*>(reduction_scratchpad_ptr +
+                                    reduce_max_buffer_offset_);
+  }
+
+  float* get_reduce_sum_buffer() {
+    return reinterpret_cast<float*>(reduction_scratchpad_ptr +
+                                    reduce_sum_buffer_offset_);
+  }
+
+  int64_t get_thread_scratchpad_size() const {
+    return 2 * sum_buffer_offset_ - max_buffer_offset_;
+  }
+
+  int64_t get_reduction_scratchpad_size() const {
+    return 2 * reduce_sum_buffer_offset_ - reduce_max_buffer_offset_;
+  }
+
+ private:
+  static int64_t round_to_64(const int64_t num) {
+    return ((num + 63) >> 6) << 6;
+  }
+
+  static int64_t calcu_q_buffer_size(const int64_t q_tile_size,
+                                     const int64_t head_dim,
+                                     const int64_t elem_size) {
+    return round_to_64(q_tile_size * head_dim * elem_size);
+  }
+
+  static int64_t calcu_logits_buffer_size(const int64_t max_num_q_per_iter,
+                                          const int64_t k_tile_size,
+                                          const int64_t elem_size) {
+    return round_to_64(elem_size * max_num_q_per_iter * k_tile_size);
+  }
+
+  static int64_t calcu_partial_output_buffer_size(const int64_t q_tile_size,
+                                                  const int64_t head_dim,
+                                                  const int64_t elem_size) {
+    return round_to_64(q_tile_size * head_dim * elem_size);
+  }
+
+  static int64_t calcu_partial_output_max_sum_buffer_size(
+      const int64_t q_tile_size) {
+    return round_to_64(q_tile_size * sizeof(float));
+  }
+
+  static int64_t calcu_reduce_flag_buffer_size(const int64_t total_split_num) {
+    return round_to_64(total_split_num * sizeof(bool));
+  }
+
+  static int64_t calcu_reduce_max_sum_buffer_size(
+      const int64_t total_split_num, const int32_t q_head_tile_size) {
+    return round_to_64(total_split_num * q_head_tile_size * sizeof(float));
+  }
+
+  static int64_t calcu_reduce_output_buffer_size(
+      const int64_t total_split_num, const int64_t q_head_tile_size,
+      const int64_t head_dim, const int64_t output_buffer_elem_size) {
+    return round_to_64(total_split_num * q_head_tile_size * head_dim *
+                       output_buffer_elem_size);
+  }
+
+ private:
+  int8_t* thread_scratchpad_ptr;
+  int8_t* reduction_scratchpad_ptr;
+  int64_t reduction_scratchpad_size_per_kv_head;
+  // attention buffers
+  int64_t q_buffer_offset_;
+  int64_t logits_buffer_offset_;
+  int64_t output_buffer_offset_;
+  int64_t max_buffer_offset_;
+  int64_t sum_buffer_offset_;
+  // reduction buffers
+  int64_t reduce_flag_buffer_offset_;
+  int64_t reduce_output_buffer_offset_;
+  int64_t reduce_max_buffer_offset_;
+  int64_t reduce_sum_buffer_offset_;
+};
+
+class AttentionScheduler {
+ public:
+  struct ScheduleInput {
+    int32_t num_reqs;
+    int32_t elem_size;
+    int32_t q_buffer_elem_size;
+    int32_t logits_buffer_elem_size;
+    int32_t output_buffer_elem_size;
+    int32_t num_heads_q;
+    int32_t num_heads_kv;
+    int32_t head_dim;
+    int32_t* query_start_loc;
+    int32_t* seq_lens;
+    int32_t left_sliding_window_size;
+    int32_t right_sliding_window_size;
+    bool casual;
+    cpu_attention::ISA isa;
+    int32_t max_num_q_per_iter;  // max Q head num can be hold in registers
+    int32_t kv_block_alignment;  // context length alignment requirement
+    bool enable_kv_split;
+  };
+
+  static constexpr int32_t MaxQTileIterNum = 128;
+
+  AttentionScheduler() : available_cache_size_(get_available_l2_size()) {}
+
+  torch::Tensor schedule(const ScheduleInput& input) const {
+    const bool casual = input.casual;
+    const int32_t thread_num = omp_get_max_threads();
+    const int64_t cache_size = get_available_l2_size();
+    const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
+    const int32_t kv_len_alignment = input.kv_block_alignment;
+    int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
+    const bool use_gqa = (max_num_q_per_iter % q_head_per_kv == 0);
+    if (!use_gqa) {
+      q_head_per_kv = 1;  // fallback to MHA
+    }
+    const int32_t min_split_kv_len =
+        ((max_num_q_per_iter * 4 + kv_len_alignment - 1) / kv_len_alignment) *
+        kv_len_alignment;
+    const int32_t max_num_q_token_per_iter = max_num_q_per_iter / q_head_per_kv;
+    const int64_t default_tile_size = calcu_default_tile_size(
+        cache_size, input.head_dim, input.elem_size, input.q_buffer_elem_size,
+        input.logits_buffer_elem_size, input.output_buffer_elem_size,
+        max_num_q_per_iter, max_num_q_per_iter);
+    const int32_t default_tile_token_num = default_tile_size / q_head_per_kv;
+    const int32_t split_kv_q_token_num_threshold =
+        input.enable_kv_split ? 1 : 0;
+    const int32_t left_sliding_window_size = input.left_sliding_window_size;
+    const int32_t right_sliding_window_size = input.right_sliding_window_size;
+    TORCH_CHECK_LE(split_kv_q_token_num_threshold * q_head_per_kv, 16);
+
+    // get total kv len
+    int64_t total_kv_len = 0;
+    for (int32_t req_id = 0; req_id < input.num_reqs; ++req_id) {
+      const int32_t seq_len = input.seq_lens[req_id];
+      const int32_t q_token_num =
+          input.query_start_loc[req_id + 1] - input.query_start_loc[req_id];
+      const int32_t q_start_pos = (casual ? (seq_len - q_token_num) : 0);
+      const int32_t kv_start_pos = 0;
+      const int32_t kv_end_pos = seq_len;
+
+      for (int32_t token_id = 0; token_id < q_token_num;
+           token_id += max_num_q_token_per_iter) {
+        const int32_t q_tile_token_num =
+            std::min(max_num_q_token_per_iter, q_token_num - token_id);
+        const int32_t q_tile_pos_left = q_start_pos + token_id;
+        const int32_t q_tile_pos_right = q_tile_pos_left + q_tile_token_num;
+        const auto [kv_tile_pos_left, kv_tile_pos_right] = calcu_kv_tile_pos(
+            kv_start_pos, kv_end_pos, q_tile_pos_left, q_tile_pos_right,
+            left_sliding_window_size, right_sliding_window_size);
+        const auto [aligned_kv_tile_pos_left, aligned_kv_tile_pos_right] =
+            align_kv_tile_pos(kv_tile_pos_left, kv_tile_pos_right,
+                              kv_len_alignment);
+
+        int32_t curr_kv_len =
+            aligned_kv_tile_pos_right - aligned_kv_tile_pos_left;
+        total_kv_len += curr_kv_len;
+      }
+    }
+    const int64_t kv_len_per_thread =
+        (((total_kv_len / thread_num) + kv_len_alignment - 1) /
+         kv_len_alignment) *
+        kv_len_alignment * (use_gqa ? input.num_heads_kv : input.num_heads_q);
+    std::vector<AttentionWorkItemGroup> workitems;
+    std::vector<ReductionWorkItemGroup> reduce_workitems;
+    workitems.reserve(1024);
+    reduce_workitems.reserve(1024);
+    std::vector<int32_t> workitem_num_per_thread(thread_num, 0);
+
+    // split tasks
+    int32_t curr_thread_id = 0;
+    int64_t remaining_kv_len = kv_len_per_thread;
+    int32_t cum_split_num = 0;
+    for (int32_t req_id = 0; req_id < input.num_reqs; ++req_id) {
+      const int32_t seq_len = input.seq_lens[req_id];
+      const int32_t q_token_num =
+          input.query_start_loc[req_id + 1] - input.query_start_loc[req_id];
+      const int32_t q_start_pos = (casual ? (seq_len - q_token_num) : 0);
+      const int32_t kv_start_pos = 0;
+      const int32_t kv_end_pos = seq_len;
+      int32_t local_split_id = 0;
+
+      AttentionWorkItemGroup curr_workitem(req_id, 0, 0, seq_len);
+      for (int32_t token_id = 0; token_id < q_token_num;
+           token_id += max_num_q_token_per_iter) {
+        const int32_t q_tile_token_num =
+            std::min(max_num_q_token_per_iter, q_token_num - token_id);
+        const int32_t q_tile_pos_left = q_start_pos + token_id;
+        const int32_t q_tile_pos_right = q_tile_pos_left + q_tile_token_num;
+        const auto [kv_tile_pos_left, kv_tile_pos_right] = calcu_kv_tile_pos(
+            kv_start_pos, kv_end_pos, q_tile_pos_left, q_tile_pos_right,
+            left_sliding_window_size, right_sliding_window_size);
+        const auto [aligned_kv_tile_pos_left, aligned_kv_tile_pos_right] =
+            align_kv_tile_pos(kv_tile_pos_left, kv_tile_pos_right,
+                              kv_len_alignment);
+        int32_t curr_kv_len =
+            aligned_kv_tile_pos_right - aligned_kv_tile_pos_left;
+        int32_t kv_token_pos_start = aligned_kv_tile_pos_left;
+
+        while (curr_kv_len > 0) {
+          if (curr_kv_len <= (remaining_kv_len + min_split_kv_len) ||
+              curr_thread_id == (thread_num - 1)) {
+            curr_workitem.q_token_num += q_tile_token_num;
+            curr_workitem.total_kv_len += curr_kv_len;
+            remaining_kv_len -= curr_kv_len;
+            curr_kv_len = 0;
+
+            if (remaining_kv_len < 0) {
+              // stop to accept more workitems
+              remaining_kv_len -= min_split_kv_len;
+            }
+
+            if (curr_workitem.kv_split_pos_start != 0) {
+              // got a partial kv spilt, need to create a single workitem
+              curr_workitem.split_id = cum_split_num;
+              curr_workitem.local_split_id = local_split_id;
+              workitems.emplace_back(curr_workitem);
+              ++workitem_num_per_thread[curr_thread_id];
+              ++reduce_workitems.back().split_num;
+              ++cum_split_num;
+
+              curr_workitem = AttentionWorkItemGroup(
+                  req_id, token_id + max_num_q_token_per_iter, 0, seq_len);
+            }
+
+            break;
+          }
+
+          if (remaining_kv_len < min_split_kv_len &&
+              (curr_workitem.total_kv_len > 0 ||
+               workitem_num_per_thread[curr_thread_id] > 0)) {
+            // remaining_kv_len is too short, and have allocated workitems, just
+            // leave to next thread
+            if (curr_workitem.total_kv_len > 0) {
+              workitems.emplace_back(curr_workitem);
+              ++workitem_num_per_thread[curr_thread_id];
+              curr_workitem =
+                  AttentionWorkItemGroup(req_id, token_id, 0, seq_len);
+            }
+
+            // switch to next thread
+            ++curr_thread_id;
+            remaining_kv_len = kv_len_per_thread;
+
+            // retry this iteration
+            continue;
+          }
+
+          // only split tail splits with q_tile_token_num <=
+          // split_kv_q_token_num_threshold
+          if (token_id + max_num_q_token_per_iter < q_token_num ||
+              q_tile_token_num > split_kv_q_token_num_threshold) {
+            // if requires a new q tile iteration and already has workitems,
+            // leave this workitem to next thread
+            if (curr_workitem.q_token_num % default_tile_token_num == 0 &&
+                (curr_workitem.total_kv_len > 0 ||
+                 workitem_num_per_thread[curr_thread_id] > 0)) {
+              if (curr_workitem.total_kv_len > 0) {
+                workitems.emplace_back(curr_workitem);
+                ++workitem_num_per_thread[curr_thread_id];
+              }
+              curr_workitem =
+                  AttentionWorkItemGroup(req_id, token_id, 0, seq_len);
+
+              // switch to next thread
+              ++curr_thread_id;
+              remaining_kv_len = kv_len_per_thread;
+            }
+
+            curr_workitem.q_token_num += q_tile_token_num;
+            curr_workitem.total_kv_len += curr_kv_len;
+            remaining_kv_len -= curr_kv_len;
+            curr_kv_len = 0;
+            break;
+          }
+
+          // split kv
+          if (curr_workitem.total_kv_len > 0) {
+            // write back curr workitem
+            workitems.emplace_back(curr_workitem);
+            ++workitem_num_per_thread[curr_thread_id];
+          }
+
+          if (kv_token_pos_start == aligned_kv_tile_pos_left) {
+            // first split, init the workitem
+            reduce_workitems.emplace_back(ReductionWorkItemGroup(
+                req_id, token_id, q_tile_token_num, cum_split_num));
+          }
+
+          int32_t spilt_size =
+              std::min(std::max(remaining_kv_len, (int64_t)min_split_kv_len),
+                       (int64_t)curr_kv_len);
+          curr_workitem =
+              AttentionWorkItemGroup(req_id, token_id, kv_token_pos_start,
+                                     kv_token_pos_start + spilt_size);
+          curr_workitem.q_token_num += q_tile_token_num;
+          curr_workitem.total_kv_len += spilt_size;
+          curr_workitem.split_id = cum_split_num;
+          curr_workitem.local_split_id = local_split_id;
+          workitems.emplace_back(curr_workitem);
+          ++workitem_num_per_thread[curr_thread_id];
+          ++reduce_workitems.back().split_num;
+          ++cum_split_num;
+          ++local_split_id;
+
+          kv_token_pos_start += spilt_size;
+          curr_kv_len -= spilt_size;
+          curr_workitem = AttentionWorkItemGroup(req_id, token_id,
+                                                 kv_token_pos_start, seq_len);
+
+          // switch to next thread
+          ++curr_thread_id;
+          remaining_kv_len = kv_len_per_thread;
+        }
+      }
+
+      if (curr_workitem.total_kv_len > 0) {
+        // write back curr workitem
+        workitems.emplace_back(curr_workitem);
+        ++workitem_num_per_thread[curr_thread_id];
+      }
+    }
+
+    int64_t metadata_tensor_size =
+        sizeof(AttentionMetadata) +
+        workitems.size() * sizeof(AttentionWorkItemGroup) +
+        reduce_workitems.size() * sizeof(ReductionWorkItemGroup);
+    auto options =
+        torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
+    torch::Tensor metadata_tensor =
+        torch::empty({metadata_tensor_size}, options);
+    AttentionMetadata* metadata_ptr = new (metadata_tensor.data_ptr())
+        AttentionMetadata(input.isa, workitems.size(), reduce_workitems.size(),
+                          cum_split_num, split_kv_q_token_num_threshold);
+    AttentionWorkItemGroup* workitem_groups_ptr =
+        metadata_ptr->workitem_groups_ptr;
+    ReductionWorkItemGroup* reduction_items_ptr =
+        metadata_ptr->reduction_items_ptr;
+    std::memcpy(workitem_groups_ptr, workitems.data(),
+                workitems.size() * sizeof(AttentionWorkItemGroup));
+    std::memcpy(reduction_items_ptr, reduce_workitems.data(),
+                reduce_workitems.size() * sizeof(ReductionWorkItemGroup));
+
+    int32_t effective_thread_num = 0;
+    for (; effective_thread_num < thread_num; ++effective_thread_num) {
+      if (workitem_num_per_thread[effective_thread_num] == 0) {
+        break;
+      }
+    }
+
+    std::memcpy(metadata_ptr->cu_workitem_num_per_thread + 1,
+                workitem_num_per_thread.data(),
+                workitem_num_per_thread.size() * sizeof(int32_t));
+    for (int32_t i = 1; i <= thread_num; ++i) {
+      metadata_ptr->cu_workitem_num_per_thread[i] +=
+          metadata_ptr->cu_workitem_num_per_thread[i - 1];
+    }
+    metadata_ptr->effective_thread_num = effective_thread_num;
+
+    {
+      // when q_tile_size = max_num_q_per_iter, requires max
+      // attention_scratchpad_size
+      AttentionScratchPad sc(0, *metadata_ptr, 0x0);
+      int64_t n = AttentionScheduler::calcu_tile_size_with_constant_q(
+          cache_size, input.head_dim, input.elem_size, input.q_buffer_elem_size,
+          input.logits_buffer_elem_size, input.output_buffer_elem_size,
+          max_num_q_per_iter, kv_len_alignment, max_num_q_per_iter, true);
+      sc.update(input.head_dim, input.q_buffer_elem_size,
+                input.logits_buffer_elem_size, input.output_buffer_elem_size,
+                max_num_q_per_iter, max_num_q_per_iter, n);
+      metadata_ptr->attention_scratchpad_size_per_thread =
+          ((sc.get_thread_scratchpad_size() + 63) / 64) * 64;
+
+      sc.update(0, metadata_ptr->reduction_split_num, input.head_dim,
+                q_head_per_kv * split_kv_q_token_num_threshold,
+                input.output_buffer_elem_size);
+      metadata_ptr->reduction_scratchpad_size_per_kv_head =
+          ((sc.get_reduction_scratchpad_size() + 63) / 64) * 64;
+    }
+    int64_t scratchpad_size =
+        metadata_ptr->attention_scratchpad_size_per_thread *
+            metadata_ptr->thread_num +
+        metadata_ptr->reduction_scratchpad_size_per_kv_head *
+            (use_gqa ? input.num_heads_kv : input.num_heads_q);
+    DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
+        scratchpad_size);
+
+    // metadata_ptr->print();
+
+    // test out of boundary access
+    // {
+    //     float* cache_ptr =
+    //     DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<float>();
+    //     for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
+    //         cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
+    //     }
+    // }
+
+    return metadata_tensor;
+  }
+
+  FORCE_INLINE static std::pair<int32_t, int32_t> calcu_kv_tile_pos(
+      int32_t kv_left_pos, int32_t kv_right_pos, int32_t q_left_pos,
+      int32_t q_right_pos, int32_t sliding_window_left,
+      int32_t sliding_window_right) {
+    if (sliding_window_left != -1) {
+      kv_left_pos = std::max(kv_left_pos, q_left_pos - sliding_window_left);
+    }
+    if (sliding_window_right != -1) {
+      kv_right_pos = std::min(kv_right_pos, q_right_pos + sliding_window_right);
+    }
+    return {kv_left_pos, kv_right_pos};
+  }
+
+  FORCE_INLINE static std::pair<int32_t, int32_t> align_kv_tile_pos(
+      int32_t kv_left_pos, int32_t kv_right_pos, int32_t align_factor) {
+    kv_left_pos = (kv_left_pos / align_factor) * align_factor;
+    kv_right_pos =
+        ((kv_right_pos + align_factor - 1) / align_factor) * align_factor;
+    return {kv_left_pos, kv_right_pos};
+  }
+
+  static int64_t calcu_default_tile_size(int64_t cache_size, int64_t head_dim,
+                                         int64_t elem_size,
+                                         int64_t q_buffer_elem_size,
+                                         int64_t logits_buffer_elem_size,
+                                         int64_t output_buffer_elem_size,
+                                         int64_t max_num_q_per_iter,
+                                         int64_t round_size) {
+    // For CPU, different from CUDA, Q@K^T results should also be hold in cache,
+    // using float32. Intermediate outputs should be float32 to be compatible
+    // with AMX Then the cache includes:
+    //  - Q: q_tile_size * head_dim * q_buffer_elem_size
+    //  - K, V: 2 * k_tile_size * head_dim * elem_size
+    //  - Q@K^T: max_num_q_per_iter * k_tile_size * logits_buffer_elem_size
+    //  - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size
+    // By default, let tile_size = q_tile_size = k_tile_size. To record
+    // is_first_iter states in a static array, require the default tile <= 128 *
+    // max_num_q_per_iter
+
+    int64_t tile_size =
+        cache_size / (head_dim * (q_buffer_elem_size + 2 * elem_size +
+                                  output_buffer_elem_size) +
+                      max_num_q_per_iter * logits_buffer_elem_size);
+    tile_size = std::min(tile_size, MaxQTileIterNum * max_num_q_per_iter);
+    int64_t rounded_tile_size = (tile_size / round_size) * round_size;
+    return std::max(rounded_tile_size, round_size);
+  }
+
+  static int64_t calcu_tile_size_with_constant_q(
+      int64_t cache_size, int64_t head_dim, int64_t elem_size,
+      int64_t q_buffer_elem_size, int64_t logits_buffer_elem_size,
+      int64_t output_buffer_elem_size, int64_t max_num_q_per_iter,
+      int64_t round_size, int64_t q_tile_size, bool one_round) {
+    // calculate tile_size with known q_tile_size
+    // If one_round is True, the outer Q tile loop time is 1, then the K,V will
+    // not be included in the cache
+    int64_t tile_size;
+    if (one_round) {
+      tile_size =
+          (cache_size - q_tile_size * head_dim *
+                            (q_buffer_elem_size + output_buffer_elem_size)) /
+          (logits_buffer_elem_size * max_num_q_per_iter);
+    } else {
+      tile_size =
+          (cache_size - q_tile_size * head_dim *
+                            (q_buffer_elem_size + output_buffer_elem_size)) /
+          (logits_buffer_elem_size * max_num_q_per_iter +
+           2 * head_dim * elem_size);
+    }
+    int64_t rounded_tile_size = (tile_size / round_size) * round_size;
+    return std::max(rounded_tile_size, round_size);
+  }
+
+  static int64_t get_available_l2_size() {
+    static int64_t size = []() {
+#if defined(__APPLE__)
+      // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
+      int64_t l2_cache_size = 0;
+      size_t len = sizeof(l2_cache_size);
+      if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
+          l2_cache_size > 0) {
+        return l2_cache_size >> 1;  // use 50% of L2 cache
+      }
+      // Fallback if sysctlbyname fails
+      return 128LL * 1024 >> 1;  // use 50% of 128KB
+#else
+      long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
+      TORCH_CHECK_NE(l2_cache_size, -1);
+      return l2_cache_size >> 1;  // use 50% of L2 cache
+#endif
+    }();
+    return size;
+  }
+
+ private:
+  int64_t available_cache_size_;
+};
+
+struct AttentionInput {
+  AttentionMetadata* metadata;
+  int32_t num_tokens;
+  int32_t num_heads;
+  int32_t num_kv_heads;
+  int32_t block_size;
+  void* query;
+  int64_t query_num_tokens_stride;
+  int64_t query_num_heads_stride;
+  int64_t cache_num_blocks_stride;
+  int64_t cache_num_kv_heads_stride;
+  int64_t blt_num_tokens_stride;
+  void* key_cache;
+  void* value_cache;
+  void* output;
+  int32_t* query_start_loc;
+  int32_t* seq_lens;
+  int32_t* block_table;
+  float* alibi_slopes;
+  c10::BFloat16* s_aux;
+  float scale;
+  bool causal;
+  int32_t sliding_window_left;
+  int32_t sliding_window_right;
+  float softcap;
+};
+
+#define DEFINE_CPU_ATTENTION_PARAMS                                         \
+  q_buffer_t *__restrict__ q_heads_buffer,                                  \
+      kv_cache_t *__restrict__ k_head_cache_ptr,                            \
+      kv_cache_t *__restrict__ v_head_cache_ptr,                            \
+      logits_buffer_t *__restrict__ logits_buffer,                          \
+      float *__restrict__ partial_q_buffer, float *__restrict__ max_buffer, \
+      float *__restrict__ sum_buffer, int32_t *__restrict__ block_table,    \
+      const int32_t kv_tile_start_pos, const int32_t kv_tile_end_pos,       \
+      const int32_t kv_tile_token_num,                                      \
+      const int64_t kv_cache_num_blocks_stride, const int32_t q_head_num,   \
+      const int32_t q_token_num, const int32_t q_tile_start_pos,            \
+      const int32_t q_heads_per_kv, const int32_t block_size,               \
+      const int32_t left_window_size, const int32_t right_window_size,      \
+      float scale, const float softcap_scale,                               \
+      const float *__restrict__ alibi_slopes, const bool is_first_iter,     \
+      const bool use_sink, const bool debug_info
+
+#define CPU_ATTENTION_PARAMS                                                  \
+  q_heads_buffer, k_head_cache_ptr, v_head_cache_ptr, logits_buffer,          \
+      partial_q_buffer, max_buffer, sum_buffer, block_table,                  \
+      kv_tile_start_pos, kv_tile_end_pos, kv_tile_token_num,                  \
+      kv_cache_num_blocks_stride, q_head_num, q_token_num, q_tile_start_pos,  \
+      q_heads_per_kv, block_size, left_window_size, right_window_size, scale, \
+      softcap_scale, alibi_slopes, is_first_iter, use_sink, debug_info
+
+enum class AttentionGemmPhase { QK, PV };
+
+template <typename T>
+struct VecTypeTrait {
+  using vec_t = void;
+};
+
+template <>
+struct VecTypeTrait<float> {
+  using vec_t = vec_op::FP32Vec16;
+};
+
+// ARM only supports BF16 with ARMv8.6-A extension
+#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
+#else
+template <>
+struct VecTypeTrait<c10::BFloat16> {
+  using vec_t = vec_op::BF16Vec16;
+};
+#endif
+
+#if !defined(__powerpc__)
+template <>
+struct VecTypeTrait<c10::Half> {
+  using vec_t = vec_op::FP16Vec16;
+};
+#endif
+
+template <typename T>
+void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
+                  int32_t stride) {
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(5) << name << ": [\n";
+  auto* curr_logits_buffer = ptr;
+  for (int32_t m = 0; m < row; ++m) {
+    for (int32_t n = 0; n < col; ++n) {
+      ss << curr_logits_buffer[n] << ", ";
+    }
+    ss << "\n";
+    curr_logits_buffer += stride;
+  }
+  ss << "]\n";
+  std::printf("%s", ss.str().c_str());
+}
+
+template <typename attention_impl_t>
+class AttentionMainLoop {
+ public:
+  using query_t = typename attention_impl_t::query_t;
+  using q_buffer_t = typename attention_impl_t::q_buffer_t;
+  using kv_cache_t = typename attention_impl_t::kv_cache_t;
+  using logits_buffer_t = typename attention_impl_t::logits_buffer_t;
+  using partial_output_buffer_t =
+      typename attention_impl_t::partial_output_buffer_t;
+  using prob_buffer_t = typename attention_impl_t::prob_buffer_t;
+
+  static constexpr int64_t max_q_head_num_per_iter =
+      attention_impl_t::MaxQHeadNumPerIteration;
+  static constexpr int64_t blocksize_alignment =
+      attention_impl_t::BlockSizeAlignment;
+  static constexpr int64_t headdim_alignment =
+      attention_impl_t::HeadDimAlignment;
+  static constexpr int64_t head_dim = attention_impl_t::HeadDim;
+  static constexpr ISA ISAType = attention_impl_t::ISAType;
+  static constexpr bool scale_on_logits =
+      attention_impl_t::scale_on_logits;  // apply scale on logits, otherwise
+                                          // apply scale on q_buffer
+
+  template <typename tile_gemm_t>
+  class Attention {
+   public:
+    // Args:
+    //  - q_heads_buffer: [MaxQHeadNumPerIteration, head_dim]
+    //  - k_head_cache_ptr: [num_blocks, block_size * head_dim]
+    //  - v_head_cache_ptr: [num_blocks, block_size * head_dim]
+    //  - logits_buffer: [MaxQHeadNumPerIteration, kv_tile_token_num], store Q@K
+    //  - logits partial_q_buffer: [MaxQHeadNumPerIteration, head_dim], store
+    //  partial output
+    //  - max_buffer: [MaxQHeadNumPerIteration, 1], store max logits
+    //  - sum_buffer: [MaxQHeadNumPerIteration, 1], store sum of exp
+    //  - block_table
+    //  - kv_tile_start_pos: start position of KV cache, aligned to
+    //  BlockSizeAlignment
+    //  - kv_tile_end_pos: end position of KV cache, aligned to
+    //  BlockSizeAlignment
+    //  - kv_tile_token_num: KV token num, aligned to BlockSizeAlignment
+    //  - kv_cache_num_blocks_stride
+    //  - q_head_num: head num of q_tile
+    //  - q_token_num: token num of q_tile, should be q_head_num /
+    //  q_heads_per_kv
+    //  - q_tile_start_pos: start pos of the first token in q_heads_buffer
+    //  - q_heads_per_kv
+    //  - block_size
+    //  - left_window_size
+    //  - right_window_size
+    //  - scale
+    //  - softcap_scale
+    //  - alibi_slopes
+    //  - is_first_iter
+    //  - use_sink
+    //  - debug_info
+    void operator()(DEFINE_CPU_ATTENTION_PARAMS) {
+      // k_cache_token_group_stride: stride of K cache when move to next
+      // BlockSizeAlignment tokens in a block
+      const int64_t k_cache_token_group_stride =
+          attention_impl_t::k_cache_token_group_stride(block_size);
+      // v_cache_token_group_stride: stride of V cache when move to next
+      // BlockSizeAlignment tokens in a block
+      const int64_t v_cache_token_group_stride =
+          attention_impl_t::v_cache_token_group_stride(block_size);
+      // v_cache_head_group_stride: stride of V cache when move to next
+      // HeadDimAlignment head dims in a block
+      const int64_t v_cache_head_group_stride =
+          attention_impl_t::v_cache_head_group_stride(block_size);
+      const int32_t token_group_num = kv_tile_token_num / blocksize_alignment;
+      const int32_t token_group_num_per_block =
+          block_size / blocksize_alignment;
+      const int32_t start_block_idx = kv_tile_start_pos / block_size;
+      const int32_t start_block_offset = kv_tile_start_pos % block_size;
+      const int32_t start_block_group_offset =
+          start_block_offset / blocksize_alignment;
+      const int32_t end_block_idx =
+          (kv_tile_start_pos + kv_tile_token_num - 1) / block_size + 1;
+
+      // compute Q@K logits
+      {
+        int32_t curr_group_offset =
+            start_block_group_offset * k_cache_token_group_stride;
+        int32_t curr_group_num_in_block =
+            token_group_num_per_block - start_block_group_offset;
+        int32_t remaining_group_num = token_group_num;
+        logits_buffer_t* curr_logits_buffer = logits_buffer;
+        for (int32_t block_idx = start_block_idx; block_idx < end_block_idx;
+             ++block_idx) {
+          int32_t physical_block_idx = block_table[block_idx];
+          kv_cache_t* k_cache_block_ptr =
+              k_head_cache_ptr +
+              physical_block_idx * kv_cache_num_blocks_stride +
+              curr_group_offset;
+          curr_group_num_in_block =
+              std::min(remaining_group_num, curr_group_num_in_block);
+
+          for (int32_t block_group_idx = 0;
+               block_group_idx < curr_group_num_in_block; ++block_group_idx) {
+            // logits_tile = q_tile @ k_tile, [MaxQHeadNumPerIteration,
+            // BlockSizeAlignment] = [MaxQHeadNumPerIteration, head_dim] @
+            // [head_dim, BlockSizeAlignment]
+
+            // By default, logits_buffer, q_buffer and k_cache are row-major,
+            // but may be packed by ISA implementation.
+            tile_gemm_t::template gemm<AttentionGemmPhase::QK, head_dim>(
+                q_head_num, q_heads_buffer, k_cache_block_ptr,
+                curr_logits_buffer, head_dim, block_size, kv_tile_token_num,
+                block_size, head_dim, false);
+
+            if constexpr (scale_on_logits) {
+              float* __restrict__ scale_curr_logits_buffer = curr_logits_buffer;
+              vec_op::FP32Vec16 scale_vec(scale);
+              for (int32_t i = 0; i < q_head_num; ++i) {
+                static_assert(blocksize_alignment % 16 == 0);
+                constexpr int32_t vec_num = blocksize_alignment / 16;
+                vec_op::unroll_loop<int32_t, vec_num>([&](int32_t vec_idx) {
+                  vec_op::FP32Vec16 vec(scale_curr_logits_buffer +
+                                        vec_idx * 16);
+                  vec = vec * scale_vec;
+                  vec.save(scale_curr_logits_buffer + vec_idx * 16);
+                });
+                scale_curr_logits_buffer += kv_tile_token_num;
+              }
+            }
+
+            // Move buffer ptrs
+            k_cache_block_ptr += k_cache_token_group_stride;
+            curr_logits_buffer += blocksize_alignment;
+          }
+
+          // Update
+          remaining_group_num -= curr_group_num_in_block;
+          curr_group_offset = 0;
+          curr_group_num_in_block = token_group_num_per_block;
+        }
+      }
+
+      // process logits
+      {
+        // if (debug_info){
+        //     print_logits("raw logits", logits_buffer, q_head_num,
+        //     kv_tile_token_num, kv_tile_token_num);
+        // }
+
+        if (softcap_scale != 0.0f) {
+          apply_softcap(logits_buffer, kv_tile_token_num, q_head_num,
+                        kv_tile_token_num, softcap_scale);
+          // print_logits("softcap raw logits", logits_buffer, q_head_num,
+          // kv_tile_token_num, kv_tile_token_num);
+        }
+
+        if (alibi_slopes != nullptr) {
+          apply_alibi_slopes(logits_buffer, alibi_slopes, kv_tile_token_num,
+                             q_tile_start_pos, kv_tile_start_pos, q_token_num,
+                             kv_tile_token_num, q_heads_per_kv);
+
+          // print_logits("alibi raw logits", logits_buffer, q_head_num,
+          // kv_tile_token_num, kv_tile_token_num);
+        }
+
+        apply_mask(logits_buffer, kv_tile_token_num, q_tile_start_pos,
+                   kv_tile_start_pos, kv_tile_end_pos, q_token_num,
+                   q_heads_per_kv, left_window_size, right_window_size);
+
+        // if (debug_info){
+        // print_logits("masked logits", logits_buffer, q_head_num,
+        // kv_tile_token_num, kv_tile_token_num);
+        // print_logits("old_max", max_buffer, 1, q_head_num, q_head_num);
+        // print_logits("old_sum", sum_buffer, 1, q_head_num, q_head_num);
+        // }
+
+        apply_softmax(logits_buffer, partial_q_buffer, max_buffer, sum_buffer,
+                      kv_tile_token_num, q_head_num, kv_tile_token_num,
+                      is_first_iter, use_sink);
+
+        // if (debug_info){
+        //     print_logits("softmax logits",
+        //     reinterpret_cast<prob_buffer_t*>(logits_buffer), q_head_num,
+        //     kv_tile_token_num, kv_tile_token_num * sizeof(logits_buffer_t) /
+        //     sizeof(prob_buffer_t));
+        //     print_logits("new_max", max_buffer, 1, q_head_num, q_head_num);
+        //     print_logits("new_sum", sum_buffer, 1, q_head_num, q_head_num);
+        // }
+      }
+
+      // compute P@V
+      {
+        int32_t curr_group_offset =
+            start_block_group_offset * v_cache_token_group_stride;
+        int32_t curr_group_num_in_block =
+            token_group_num_per_block - start_block_group_offset;
+        int32_t remaining_group_num = token_group_num;
+        int32_t head_dim_group_num = head_dim / headdim_alignment;
+        prob_buffer_t* curr_prob_buffer =
+            reinterpret_cast<prob_buffer_t*>(logits_buffer);
+        int64_t prob_buffer_stride =
+            kv_tile_token_num *
+            (sizeof(logits_buffer_t) / sizeof(prob_buffer_t));
+        partial_output_buffer_t* curr_partial_q_buffer = partial_q_buffer;
+        bool accum_c = !is_first_iter;
+        for (int32_t block_idx = start_block_idx; block_idx < end_block_idx;
+             ++block_idx) {
+          int32_t physical_block_idx = block_table[block_idx];
+          kv_cache_t* v_cache_block_ptr =
+              v_head_cache_ptr +
+              physical_block_idx * kv_cache_num_blocks_stride +
+              curr_group_offset;
+          curr_group_num_in_block =
+              std::min(remaining_group_num, curr_group_num_in_block);
+          int32_t curr_token_num =
+              curr_group_num_in_block * blocksize_alignment;
+
+          for (int32_t head_dim_group_idx = 0;
+               head_dim_group_idx < head_dim_group_num; ++head_dim_group_idx) {
+            // output_tile = p_tile @ v_tile, [MaxQHeadNumPerIteration,
+            // HeadDimAlignment] = [MaxQHeadNumPerIteration, block_size] @
+            // [block_size, HeadDimAlignment]
+            tile_gemm_t::template gemm<AttentionGemmPhase::PV, -1>(
+                q_head_num, curr_prob_buffer, v_cache_block_ptr,
+                curr_partial_q_buffer, prob_buffer_stride, head_dim, head_dim,
+                block_size, curr_token_num, accum_c);
+
+            // Update
+            curr_partial_q_buffer += headdim_alignment;
+            v_cache_block_ptr += v_cache_head_group_stride;
+          }
+
+          // Update
+          remaining_group_num -= curr_group_num_in_block;
+          curr_group_offset = 0;
+          curr_group_num_in_block = token_group_num_per_block;
+          curr_prob_buffer += curr_token_num;
+          curr_partial_q_buffer = partial_q_buffer;
+          accum_c = true;
+        }
+      }
+      //   if (debug_info) {
+      //     print_logits("output", partial_q_buffer, q_head_num, head_dim,
+      //     head_dim);
+      //   }
+    }
+
+    void apply_mask(logits_buffer_t* __restrict__ logits_buffer,
+                    const int64_t logits_buffer_stride,
+                    const int32_t q_tile_start_pos,
+                    const int32_t kv_tile_start_pos,
+                    const int32_t kv_tile_end_pos, const int32_t q_token_num,
+                    const int32_t q_heads_per_kv,
+                    const int32_t sliding_window_left,
+                    const int32_t sliding_window_right) {
+      // Apply mask
+      constexpr logits_buffer_t neg_inf =
+          -std::numeric_limits<logits_buffer_t>::infinity();
+      logits_buffer_t* __restrict__ curr_logits_buffer = logits_buffer;
+      int32_t curr_token_pos = q_tile_start_pos;
+      for (int32_t token_idx = 0; token_idx < q_token_num; ++token_idx) {
+        int32_t left_kv_pos = [&]() {
+          int32_t pos = kv_tile_start_pos;
+          if (sliding_window_left != -1) {
+            pos = std::max(pos, curr_token_pos - sliding_window_left);
+          }
+          return pos;
+        }();
+
+        int32_t right_kv_pos = [&]() {
+          int32_t pos = kv_tile_end_pos;
+          if (sliding_window_right != -1) {
+            pos = std::min(pos,
+                           std::max(kv_tile_start_pos,
+                                    curr_token_pos + sliding_window_right + 1));
+          }
+          return pos;
+        }();
+
+        int32_t left_invalid_token_num = left_kv_pos - kv_tile_start_pos;
+        int32_t right_invalid_token_num = kv_tile_end_pos - right_kv_pos;
+        for (int32_t head_idx = 0; head_idx < q_heads_per_kv; ++head_idx) {
+          logits_buffer_t* __restrict__ curr_logits_buffer_tail =
+              curr_logits_buffer + right_kv_pos - kv_tile_start_pos;
+          for (int32_t i = 0; i < left_invalid_token_num; ++i) {
+            curr_logits_buffer[i] = neg_inf;
+          }
+          for (int32_t i = 0; i < right_invalid_token_num; ++i) {
+            curr_logits_buffer_tail[i] = neg_inf;
+          }
+
+          curr_logits_buffer += logits_buffer_stride;
+        }
+
+        ++curr_token_pos;
+      }
+    }
+
+    void apply_softmax(logits_buffer_t* __restrict__ logits_buffer,
+                       float* __restrict__ partial_q_buffer,
+                       float* __restrict__ max_buffer,
+                       float* __restrict__ sum_buffer,
+                       const int64_t logits_buffer_stride, int32_t q_head_num,
+                       int32_t kv_tile_token_num, bool is_first_iter,
+                       bool use_sink) {
+#ifdef DEFINE_FAST_EXP
+      DEFINE_FAST_EXP
+#endif
+      using prob_buffer_vec_t = typename VecTypeTrait<prob_buffer_t>::vec_t;
+      static_assert(sizeof(prob_buffer_t) <= sizeof(logits_buffer_t));
+
+      logits_buffer_t* __restrict__ curr_logits_buffer = logits_buffer;
+      float* __restrict__ curr_partial_q_buffer = partial_q_buffer;
+      const int32_t vec_num = kv_tile_token_num / 16;
+      const int32_t head_vec_num = head_dim / 16;
+      for (int32_t i = 0; i < q_head_num; ++i) {
+        float init_max_val = max_buffer[i];
+        float init_sum_val = sum_buffer[i];
+
+        // apply scale and compute max
+        vec_op::FP32Vec16 max_vec(init_max_val);
+        {
+          logits_buffer_t* __restrict__ curr_logits_buffer_iter =
+              curr_logits_buffer;
+          for (int32_t j = 0; j < vec_num; ++j) {
+            vec_op::FP32Vec16 vec(curr_logits_buffer_iter);
+            max_vec = vec.max(max_vec);
+
+            curr_logits_buffer_iter += 16;
+          }
+        }
+        float new_max_val = max_vec.reduce_max();
+        float rescale_factor = init_max_val - new_max_val;
+
+        // use same rescale threshold with FA4.
+        // https://github.com/Dao-AILab/flash-attention/blob/1b8e1e641c6a179be9a0538b7f40fd595050b735/flash_attn/cute/flash_fwd_sm100.py#L1271
+        bool need_rescale = rescale_factor < -8.0;
+        if (!need_rescale) {
+          new_max_val = init_max_val;
+        } else {
+          max_buffer[i] = new_max_val;
+        }
+
+        // sub max, compute exp and sum
+        max_vec = vec_op::FP32Vec16(new_max_val);
+        vec_op::FP32Vec16 sum_vec(0.0);
+        {
+          logits_buffer_t* __restrict__ curr_logits_buffer_iter =
+              curr_logits_buffer;
+          prob_buffer_t* __restrict__ curr_prob_buffer_iter =
+              reinterpret_cast<prob_buffer_t*>(curr_logits_buffer);
+          for (int32_t j = 0; j < vec_num; ++j) {
+            vec_op::FP32Vec16 vec(curr_logits_buffer_iter);
+            vec = vec - max_vec;
+
+            // compute exp
+#ifdef DEFINE_FAST_EXP
+            vec = fast_exp(vec);
+            prob_buffer_vec_t output_vec(vec);
+            output_vec.save(curr_prob_buffer_iter);
+#else
+            vec.save(curr_logits_buffer_iter);
+            for (int32_t k = 0; k < 16; ++k) {
+              curr_logits_buffer_iter[k] = std::exp(curr_logits_buffer_iter[k]);
+            }
+            vec = vec_op::FP32Vec16(curr_logits_buffer_iter);
+#endif
+
+            sum_vec = sum_vec + vec;
+
+            curr_logits_buffer_iter += 16;
+            curr_prob_buffer_iter += 16;
+          }
+        }
+        float new_sum_val = sum_vec.reduce_sum();
+
+        // rescale sum and partial outputs
+        if (need_rescale) {
+          // compute rescale factor
+#ifdef DEFINE_FAST_EXP
+          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
+          rescale_factor_vec = fast_exp(rescale_factor_vec);
+          rescale_factor = rescale_factor_vec.get_last_elem();
+#else
+          rescale_factor = std::exp(rescale_factor);
+          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
+#endif
+
+          // rescale sum
+          new_sum_val += rescale_factor * init_sum_val;
+
+          // rescale output
+          if (!is_first_iter) {
+            float* __restrict__ curr_partial_q_buffer_iter =
+                curr_partial_q_buffer;
+            for (int32_t j = 0; j < head_vec_num; ++j) {
+              vec_op::FP32Vec16 vec(curr_partial_q_buffer_iter);
+              vec = vec * rescale_factor_vec;
+              vec.save(curr_partial_q_buffer_iter);
+
+              curr_partial_q_buffer_iter += 16;
+            }
+          }
+        } else {
+          new_sum_val += init_sum_val;
+        }
+
+        sum_buffer[i] = new_sum_val;
+
+        curr_logits_buffer += logits_buffer_stride;
+        curr_partial_q_buffer += head_dim;
+      }
+    }
+
+    void apply_softcap(logits_buffer_t* __restrict__ logits_buffer,
+                       const int64_t logits_buffer_stride, int32_t q_head_num,
+                       int32_t kv_tile_token_num, float softcap_scale) {
+#ifdef DEFINE_FAST_EXP
+      DEFINE_FAST_EXP
+#endif
+      float inv_softcap_scale = 1.0 / softcap_scale;
+      vec_op::FP32Vec16 softcap_scale_vec(softcap_scale);
+      vec_op::FP32Vec16 inv_softcap_scale_vec(inv_softcap_scale);
+      vec_op::FP32Vec16 ones_vec(1.0);
+      logits_buffer_t* __restrict__ curr_logits_buffer = logits_buffer;
+      const int32_t vec_num = kv_tile_token_num / 16;
+      for (int32_t i = 0; i < q_head_num; ++i) {
+        logits_buffer_t* __restrict__ curr_logits_buffer_iter =
+            curr_logits_buffer;
+        for (int32_t j = 0; j < vec_num; ++j) {
+          vec_op::FP32Vec16 vec(curr_logits_buffer_iter);
+          vec = vec * inv_softcap_scale_vec;
+
+#ifdef DEFINE_FAST_EXP
+          vec = fast_exp(vec);
+          vec_op::FP32Vec16 inv_vec = ones_vec / vec;
+          vec = (vec - inv_vec) / (vec + inv_vec);
+#else
+          vec.save(curr_logits_buffer_iter);
+          for (int k = 0; k < 16; ++k) {
+            curr_logits_buffer_iter[k] = std::tanh(curr_logits_buffer_iter[k]);
+          }
+          vec = vec_op::FP32Vec16(curr_logits_buffer_iter);
+#endif
+          vec = vec * softcap_scale_vec;
+          vec.save(curr_logits_buffer_iter);
+
+          curr_logits_buffer_iter += 16;
+        }
+
+        curr_logits_buffer += logits_buffer_stride;
+      }
+    }
+
+    void apply_alibi_slopes(logits_buffer_t* __restrict__ logits_buffer,
+                            const float* __restrict__ alibi_slopes,
+                            const int64_t logits_buffer_stride,
+                            const int32_t q_tile_start_pos,
+                            const int32_t kv_tile_start_pos,
+                            const int32_t q_token_num,
+                            const int32_t kv_tile_token_num,
+                            const int32_t q_heads_per_kv) {
+      alignas(64) constexpr float initial_arange_vals[16] = {
+          0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+          8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+      const int32_t vec_num = kv_tile_token_num / 16;
+
+      vec_op::FP32Vec16 initial_arange_vals_vec(initial_arange_vals);
+      initial_arange_vals_vec =
+          initial_arange_vals_vec + vec_op::FP32Vec16((float)kv_tile_start_pos);
+      vec_op::FP32Vec16 pos_offset_vec(16.0);
+      logits_buffer_t* __restrict__ curr_logits_buffer = logits_buffer;
+      for (int32_t i = 0; i < q_token_num; ++i) {
+        vec_op::FP32Vec16 curr_q_pos_vec((float)(i + q_tile_start_pos));
+        for (int32_t j = 0; j < q_heads_per_kv; ++j) {
+          vec_op::FP32Vec16 alibi_scale_vec(alibi_slopes[j]);
+          vec_op::FP32Vec16 curr_kv_pos_vec(initial_arange_vals_vec);
+          logits_buffer_t* __restrict__ curr_logits_buffer_iter =
+              curr_logits_buffer;
+          for (int32_t k = 0; k < vec_num; ++k) {
+            vec_op::FP32Vec16 alibi_bias_vec =
+                alibi_scale_vec * (curr_kv_pos_vec - curr_q_pos_vec);
+            vec_op::FP32Vec16 vec(curr_logits_buffer_iter);
+            vec = vec + alibi_bias_vec;
+
+            vec.save(curr_logits_buffer_iter);
+
+            curr_kv_pos_vec = curr_kv_pos_vec + pos_offset_vec;
+            curr_logits_buffer_iter += 16;
+          }
+          curr_logits_buffer += logits_buffer_stride;
+        }
+      }
+    }
+  };
+
+ public:
+  void operator()(const AttentionInput* input) {
+    const int thread_num = omp_get_max_threads();
+    TORCH_CHECK_EQ(input->metadata->thread_num, thread_num);
+    std::atomic<int32_t> guard_counter(0);
+    std::atomic<int32_t>* guard_counter_ptr = &guard_counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int thread_id = 0; thread_id < thread_num; ++thread_id) {
+      AttentionMetadata& metadata = *input->metadata;
+      if (metadata.workitem_group_num == 0) {
+        continue;
+      }
+
+      attention_impl_t attn_impl;
+
+      // general information
+      const int32_t q_head_num = input->num_heads;
+      const int32_t kv_head_num = input->num_kv_heads;
+      const int32_t q_heads_per_kv = q_head_num / kv_head_num;
+      const bool use_gqa =
+          (max_q_head_num_per_iter % q_heads_per_kv == 0) ? true : false;
+      const int32_t actual_kv_head_num = use_gqa ? kv_head_num : q_head_num;
+      const int32_t actual_q_heads_per_kv = use_gqa ? q_heads_per_kv : 1;
+      TORCH_CHECK_LE(actual_q_heads_per_kv, max_q_head_num_per_iter);
+      const int32_t max_q_token_num_per_iter =
+          max_q_head_num_per_iter / actual_q_heads_per_kv;
+      const int64_t q_token_num_stride = input->query_num_tokens_stride;
+      const int64_t q_head_num_stride = input->query_num_heads_stride;
+      const int64_t kv_cache_head_num_stride = input->cache_num_kv_heads_stride;
+      const int64_t kv_cache_block_num_stride = input->cache_num_blocks_stride;
+      const int32_t sliding_window_left = input->sliding_window_left;
+      const int32_t sliding_window_right = input->sliding_window_right;
+      const int32_t block_size = input->block_size;
+      const float scale = input->scale;
+      const float softcap_scale = input->softcap;
+      const float* alibi_slopes = input->alibi_slopes;
+      const c10::BFloat16* s_aux = input->s_aux;
+
+      const bool casual = input->causal;
+      int32_t* const block_table = input->block_table;
+      const int64_t block_table_stride = input->blt_num_tokens_stride;
+
+      // init buffers
+      void* scratchpad_ptr =
+          DNNLScratchPadManager::get_dnnl_scratchpad_manager()
+              ->get_data<void>();
+      AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);
+
+      const int32_t total_reduction_split_num = metadata.reduction_split_num;
+      if (metadata.reduction_split_num > 0) {
+        // reset split flag
+        for (int32_t head_idx = thread_id; head_idx < actual_kv_head_num;
+             head_idx += thread_num) {
+          buffer_manager.update(head_idx, total_reduction_split_num, head_dim,
+                                0, sizeof(partial_output_buffer_t));
+          volatile bool* __restrict__ curr_flag_ptr =
+              buffer_manager.get_reduce_flag_buffer();
+          for (int32_t split_idx = 0; split_idx < total_reduction_split_num;
+               ++split_idx) {
+            curr_flag_ptr[split_idx] = false;
+          }
+        }
+      }
+
+      const int64_t available_cache_size =
+          AttentionScheduler::get_available_l2_size();
+      const int32_t default_tile_size =
+          AttentionScheduler::calcu_default_tile_size(
+              available_cache_size, head_dim, sizeof(kv_cache_t),
+              sizeof(q_buffer_t), sizeof(logits_buffer_t),
+              sizeof(partial_output_buffer_t), max_q_head_num_per_iter,
+              max_q_head_num_per_iter);
+      const int32_t default_q_tile_token_num =
+          default_tile_size / actual_q_heads_per_kv;
+
+      AttentionWorkItemGroup* const workitem_groups =
+          metadata.workitem_groups_ptr;
+      const int32_t* cu_workitem_num_per_thread =
+          metadata.cu_workitem_num_per_thread;
+      ReductionWorkItemGroup* const reduction_items =
+          metadata.reduction_items_ptr;
+
+      const int32_t effective_thread_num = metadata.effective_thread_num;
+      const int32_t reduction_item_num = metadata.reduction_item_num;
+      const int32_t split_kv_q_token_num_threshold =
+          metadata.split_kv_q_token_num_threshold;
+      const int32_t workitem_groups_counter_num =
+          actual_kv_head_num * effective_thread_num;
+      const int32_t reduction_items_counter_num =
+          actual_kv_head_num * reduction_item_num;
+      const int32_t total_counter_num =
+          workitem_groups_counter_num + reduction_items_counter_num;
+
+      if (metadata.reduction_split_num > 0) {
+        ++(*guard_counter_ptr);
+        while (guard_counter_ptr->load() != thread_num) {
+#ifdef FAST_SPINNING
+          FAST_SPINNING
+#else
+          std::this_thread::yield();
+#endif
+        }
+      }
+
+      // main loop
+      for (;;) {
+        int64_t task_idx = metadata.acquire_counter();
+
+        if (task_idx >= total_counter_num) {
+          // no more tasks, leave loop
+          break;
+        }
+
+        if (task_idx < workitem_groups_counter_num) {
+          // attention task
+          // map task_idx to workitem_groups
+          const int32_t kv_head_idx = task_idx / effective_thread_num;
+          const int32_t thread_offset = task_idx % effective_thread_num;
+          AttentionWorkItemGroup* const curr_workitem_groups =
+              workitem_groups + cu_workitem_num_per_thread[thread_offset];
+          const int32_t curr_workitem_groups_num =
+              cu_workitem_num_per_thread[thread_offset + 1] -
+              cu_workitem_num_per_thread[thread_offset];
+
+          const int32_t q_head_start_idx = kv_head_idx * actual_q_heads_per_kv;
+
+          for (int32_t workitem_group_idx = 0;
+               workitem_group_idx < curr_workitem_groups_num;
+               ++workitem_group_idx) {
+            AttentionWorkItemGroup* const current_workitem_group =
+                &curr_workitem_groups[workitem_group_idx];
+
+            const int32_t current_group_idx = current_workitem_group->req_id;
+            const int32_t kv_start_pos =
+                current_workitem_group->kv_split_pos_start;
+            const int32_t kv_end_pos = current_workitem_group->kv_split_pos_end;
+            const int32_t curr_spilt_id = current_workitem_group->split_id;
+            const int32_t q_token_id_start =
+                current_workitem_group->q_token_id_start;
+            const int32_t q_token_num = current_workitem_group->q_token_num;
+
+            // taskgroup general information
+            const int32_t q_end = input->query_start_loc[current_group_idx + 1];
+            const int32_t q_start = input->query_start_loc[current_group_idx];
+            const int32_t seq_len = input->seq_lens[current_group_idx];
+            const int32_t q_start_pos =
+                (casual ? seq_len - (q_end - q_start) : 0);
+            const int32_t block_num = (seq_len + block_size - 1) / block_size;
+            // Only apply sink for the first KV split
+            bool use_sink = (s_aux != nullptr &&
+                             current_workitem_group->local_split_id == 0);
+
+            for (int32_t q_token_offset = 0; q_token_offset < q_token_num;
+                 q_token_offset += default_q_tile_token_num) {
+              bool first_iter_flag[AttentionScheduler::MaxQTileIterNum];
+              for (int32_t i = 0; i < AttentionScheduler::MaxQTileIterNum;
+                   ++i) {
+                first_iter_flag[i] = true;
+              }
+
+              const int32_t q_token_start_idx =
+                  q_start + q_token_offset + q_token_id_start;
+              const int32_t actual_q_token_num = std::min(
+                  default_q_tile_token_num, q_token_num - q_token_offset);
+              const int32_t q_head_tile_size =
+                  actual_q_token_num * actual_q_heads_per_kv;
+              const int32_t rounded_q_head_tile_size =
+                  ((q_head_tile_size + max_q_head_num_per_iter - 1) /
+                   max_q_head_num_per_iter) *
+                  max_q_head_num_per_iter;
+              const int32_t kv_tile_size =
+                  AttentionScheduler::calcu_tile_size_with_constant_q(
+                      available_cache_size, head_dim, sizeof(kv_cache_t),
+                      sizeof(q_buffer_t), sizeof(logits_buffer_t),
+                      sizeof(partial_output_buffer_t), max_q_head_num_per_iter,
+                      blocksize_alignment, rounded_q_head_tile_size,
+                      rounded_q_head_tile_size <= max_q_head_num_per_iter);
+
+              // update buffers
+              buffer_manager.update(
+                  head_dim, sizeof(q_buffer_t), sizeof(logits_buffer_t),
+                  sizeof(partial_output_buffer_t), max_q_head_num_per_iter,
+                  rounded_q_head_tile_size, kv_tile_size);
+              q_buffer_t* q_buffer = buffer_manager.get_q_buffer<q_buffer_t>();
+              float* logits_buffer = buffer_manager.get_logits_buffer();
+              float* partial_q_buffer = buffer_manager.get_output_buffer();
+              float* max_buffer = buffer_manager.get_max_buffer();
+              float* sum_buffer = buffer_manager.get_sum_buffer();
+
+              const int32_t q_tile_start_pos =
+                  q_start_pos + q_token_offset + q_token_id_start;
+              const int32_t q_tile_end_pos =
+                  q_tile_start_pos + actual_q_token_num;
+              const auto [kv_tile_start_pos, kv_tile_end_pos] =
+                  AttentionScheduler::calcu_kv_tile_pos(
+                      kv_start_pos, kv_end_pos, q_tile_start_pos,
+                      q_tile_end_pos, sliding_window_left,
+                      sliding_window_right);
+              const auto [rounded_kv_tile_start_pos, rounded_kv_tile_end_pos] =
+                  AttentionScheduler::align_kv_tile_pos(
+                      kv_tile_start_pos, kv_tile_end_pos, blocksize_alignment);
+
+              int32_t curr_kv_head_idx =
+                  use_gqa ? kv_head_idx
+                          : (kv_head_idx /
+                             q_heads_per_kv);  // for GQA disabled case
+
+              // std::printf("thread_id: %d, req_id: %d, q_token_start: %d,
+              // q_token_end: %d, q_head_start: %d, q_head_end: %d, kv_head_idx:
+              // %d, kv_pos_start: %d, kv_pos_end: %d\n",
+              //                 thread_id, current_group_idx,
+              //                 q_token_start_idx, q_token_start_idx +
+              //                 actual_q_token_num, q_head_start_idx,
+              //                 q_head_start_idx + actual_q_heads_per_kv,
+              //                 curr_kv_head_idx, kv_tile_start_pos,
+              //                 kv_tile_end_pos);
+
+              // move buffers
+              kv_cache_t* curr_k_cache =
+                  reinterpret_cast<kv_cache_t*>(input->key_cache) +
+                  curr_kv_head_idx * kv_cache_head_num_stride;
+              kv_cache_t* curr_v_cache =
+                  reinterpret_cast<kv_cache_t*>(input->value_cache) +
+                  curr_kv_head_idx * kv_cache_head_num_stride;
+              query_t* const q_tile_ptr =
+                  reinterpret_cast<query_t*>(input->query) +
+                  q_token_start_idx * q_token_num_stride +
+                  q_head_start_idx * q_head_num_stride;
+              size_t output_buffer_offset =
+                  q_token_start_idx * q_head_num * head_dim +
+                  q_head_start_idx * head_dim;
+              int32_t* curr_block_table =
+                  block_table + current_group_idx * block_table_stride;
+              const float* curr_alibi_slopes =
+                  (alibi_slopes != nullptr ? alibi_slopes + q_head_start_idx
+                                           : nullptr);
+              const c10::BFloat16* curr_s_aux =
+                  (s_aux != nullptr ? s_aux + q_head_start_idx : nullptr);
+
+              // copy the Q tile to q_buffer, the logical layout of q_buffer is
+              // [actual_q_token_num, actual_q_heads_per_kv, head_dim]
+              {
+                attn_impl.copy_q_heads_tile(
+                    q_tile_ptr, q_buffer, actual_q_token_num,
+                    actual_q_heads_per_kv, q_token_num_stride,
+                    q_head_num_stride, scale);
+              }
+
+              if (use_sink) {
+                alignas(64) float s_aux_fp32[16];
+#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
+                // ARM without native BF16 support: manual conversion
+                for (int i = 0; i < 16; ++i) {
+                  s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
+                }
+#else
+                // All other platforms have BF16Vec16 available
+                vec_op::BF16Vec16 vec_bf16(curr_s_aux);
+                vec_op::FP32Vec16 vec_fp32(vec_bf16);
+                vec_fp32.save(s_aux_fp32);
+#endif
+
+                float* __restrict__ curr_sum_buffer = sum_buffer;
+                float* __restrict__ curr_max_buffer = max_buffer;
+                for (int32_t token_idx = 0; token_idx < actual_q_token_num;
+                     ++token_idx) {
+                  for (int32_t head_idx = 0; head_idx < actual_q_heads_per_kv;
+                       ++head_idx) {
+                    curr_sum_buffer[head_idx] = 1.0f;
+                    curr_max_buffer[head_idx] = s_aux_fp32[head_idx];
+                  }
+
+                  curr_sum_buffer += actual_q_heads_per_kv;
+                  curr_max_buffer += actual_q_heads_per_kv;
+                }
+              } else {
+                float* __restrict__ curr_sum_buffer = sum_buffer;
+                float* __restrict__ curr_max_buffer = max_buffer;
+                for (int32_t token_idx = 0; token_idx < actual_q_token_num;
+                     ++token_idx) {
+                  for (int32_t head_idx = 0; head_idx < actual_q_heads_per_kv;
+                       ++head_idx) {
+                    curr_sum_buffer[head_idx] = 0.0f;
+                    curr_max_buffer[head_idx] =
+                        std::numeric_limits<float>::lowest();
+                  }
+
+                  curr_sum_buffer += actual_q_heads_per_kv;
+                  curr_max_buffer += actual_q_heads_per_kv;
+                }
+              }
+
+              // compute loop
+              for (int32_t kv_tile_pos = rounded_kv_tile_start_pos;
+                   kv_tile_pos < rounded_kv_tile_end_pos;
+                   kv_tile_pos += kv_tile_size) {
+                const int32_t kv_tile_pos_left = kv_tile_pos;
+                const int32_t kv_tile_pos_right = std::min(
+                    kv_tile_pos_left + kv_tile_size, rounded_kv_tile_end_pos);
+                for (int32_t q_head_tile_token_offset = 0;
+                     q_head_tile_token_offset < actual_q_token_num;
+                     q_head_tile_token_offset += max_q_token_num_per_iter) {
+                  const int32_t q_tile_pos_left =
+                      q_tile_start_pos + q_head_tile_token_offset;
+                  const int32_t q_tile_token_num =
+                      std::min(max_q_token_num_per_iter,
+                               actual_q_token_num - q_head_tile_token_offset);
+                  const int32_t q_tile_head_offset =
+                      q_head_tile_token_offset * actual_q_heads_per_kv;
+                  const int32_t q_tile_head_num =
+                      q_tile_token_num * actual_q_heads_per_kv;
+                  const int32_t q_tile_pos_right =
+                      q_tile_pos_left + q_tile_token_num;
+                  const auto [actual_kv_tile_pos_left,
+                              actual_kv_tile_pos_right] =
+                      AttentionScheduler::calcu_kv_tile_pos(
+                          kv_tile_pos_left, kv_tile_pos_right, q_tile_pos_left,
+                          q_tile_pos_right, sliding_window_left,
+                          sliding_window_right);
+                  const int32_t q_iter_idx =
+                      q_head_tile_token_offset / max_q_token_num_per_iter;
+
+                  if (actual_kv_tile_pos_right <= actual_kv_tile_pos_left) {
+                    continue;
+                  }
+
+                  // align kv_pos to blocksize_alignment
+                  const auto [aligned_actual_kv_tile_pos_left,
+                              aligned_actual_kv_tile_pos_right] =
+                      AttentionScheduler::align_kv_tile_pos(
+                          actual_kv_tile_pos_left, actual_kv_tile_pos_right,
+                          blocksize_alignment);
+                  const int32_t actual_kv_token_num =
+                      aligned_actual_kv_tile_pos_right -
+                      aligned_actual_kv_tile_pos_left;
+
+                  //   std::printf("\tq_iter_idx: %d, q_token_start: %d,
+                  //   q_token_end: %d, q_token_num: %d, q_head_num: %d,
+                  //   q_pos_start: %d, q_pos_end: %d, kv_pos_start: %d,
+                  //   kv_pos_end: %d\n",
+                  //             q_iter_idx, q_token_start_idx +
+                  //             q_head_tile_token_offset,  q_token_start_idx +
+                  //             q_head_tile_token_offset + q_tile_token_num,
+                  //             q_tile_token_num, q_tile_head_num,
+                  //             q_tile_pos_left, q_tile_pos_right,
+                  //             aligned_actual_kv_tile_pos_left,
+                  //             aligned_actual_kv_tile_pos_right);
+
+                  // Move buffers
+                  q_buffer_t* curr_q_heads_buffer =
+                      q_buffer + q_tile_head_offset * head_dim;
+                  float* curr_partial_q_buffer =
+                      partial_q_buffer + q_tile_head_offset * head_dim;
+                  float* curr_max_buffer = max_buffer + q_tile_head_offset;
+                  float* curr_sum_buffer = sum_buffer + q_tile_head_offset;
+
+                  bool debug_info = false;
+                  //   bool debug_info = (
+                  //     q_head_start_idx == 4 &&
+                  //     (q_token_start_idx + q_head_tile_token_offset) <=
+                  //     4
+                  //     && (q_token_start_idx + q_head_tile_token_offset +
+                  //     q_tile_token_num) > 4
+                  //   );
+                  // if (debug_info) {
+                  //   std::printf("\tq_iter_idx: %d, q_token_start: %d,"
+                  //   "q_token_end: %d, q_token_num: %d, q_head_num: %d,"
+                  //   "q_pos_start: %d, q_pos_end: %d, kv_pos_start: %d,"
+                  //   "kv_pos_end: %d\n",
+                  //             q_iter_idx, q_token_start_idx +
+                  //             q_head_tile_token_offset,  q_token_start_idx
+                  //             + q_head_tile_token_offset +
+                  //             q_tile_token_num, q_tile_token_num,
+                  //             q_tile_head_num, q_tile_pos_left,
+                  //             q_tile_pos_right,
+                  //             aligned_actual_kv_tile_pos_left,
+                  //             aligned_actual_kv_tile_pos_right);
+                  // }
+
+                  attn_impl.template execute_attention<Attention>(
+                      curr_q_heads_buffer, curr_k_cache, curr_v_cache,
+                      logits_buffer, curr_partial_q_buffer, curr_max_buffer,
+                      curr_sum_buffer, curr_block_table,
+                      aligned_actual_kv_tile_pos_left,
+                      aligned_actual_kv_tile_pos_right, actual_kv_token_num,
+                      kv_cache_block_num_stride, q_tile_head_num,
+                      q_tile_token_num, q_tile_pos_left, actual_q_heads_per_kv,
+                      block_size, sliding_window_left, sliding_window_right,
+                      scale, softcap_scale, curr_alibi_slopes,
+                      first_iter_flag[q_iter_idx], use_sink, debug_info);
+                  first_iter_flag[q_iter_idx] = false;
+                }
+              }
+
+              // write back partial results to output buffer or reduction buffer
+              {
+                if (curr_spilt_id == -1) {
+                  final_output(partial_q_buffer,
+                               reinterpret_cast<query_t*>(input->output) +
+                                   output_buffer_offset,
+                               sum_buffer, actual_q_heads_per_kv,
+                               actual_q_token_num, q_head_num);
+                } else {
+                  const int32_t stride =
+                      actual_q_heads_per_kv * split_kv_q_token_num_threshold;
+                  buffer_manager.update(kv_head_idx, total_reduction_split_num,
+                                        head_dim, stride, sizeof(float));
+                  volatile bool* split_flag_buffer =
+                      buffer_manager.get_reduce_flag_buffer() + curr_spilt_id;
+                  float* split_output_buffer =
+                      buffer_manager.get_reduce_output_buffer() +
+                      curr_spilt_id * stride * head_dim;
+                  float* split_max_buffer =
+                      buffer_manager.get_reduce_max_buffer() +
+                      curr_spilt_id * stride;
+                  float* split_sum_buffer =
+                      buffer_manager.get_reduce_sum_buffer() +
+                      curr_spilt_id * stride;
+
+                  partial_output(partial_q_buffer, max_buffer, sum_buffer,
+                                 q_head_tile_size, split_output_buffer,
+                                 split_max_buffer, split_sum_buffer,
+                                 split_flag_buffer);
+                }
+              }
+            }
+          }
+        } else {
+          task_idx -= workitem_groups_counter_num;
+          const int32_t kv_head_idx = task_idx / reduction_item_num;
+          const int32_t item_offset = task_idx % reduction_item_num;
+          ReductionWorkItemGroup* const curr_workitem_groups =
+              reduction_items + item_offset;
+          const int32_t curr_output_token_idx =
+              curr_workitem_groups->q_token_id_start;
+          const int32_t curr_output_token_num =
+              curr_workitem_groups->q_token_id_num;
+          const int32_t curr_split_id = curr_workitem_groups->split_start_id;
+          const int32_t curr_split_num = curr_workitem_groups->split_num;
+          const int32_t current_group_idx = curr_workitem_groups->req_id;
+          const int32_t curr_output_head_num =
+              curr_output_token_num * actual_q_heads_per_kv;
+
+          const int32_t q_start = input->query_start_loc[current_group_idx];
+          const int32_t q_token_start_idx = q_start + curr_output_token_idx;
+          const int32_t q_head_start_idx = kv_head_idx * actual_q_heads_per_kv;
+          size_t output_buffer_offset =
+              q_token_start_idx * q_head_num * head_dim +
+              q_head_start_idx * head_dim;
+
+          const int32_t stride =
+              actual_q_heads_per_kv * split_kv_q_token_num_threshold;
+          buffer_manager.update(kv_head_idx, total_reduction_split_num,
+                                head_dim, stride, sizeof(float));
+          volatile bool* split_flag_buffer =
+              buffer_manager.get_reduce_flag_buffer() + curr_split_id;
+          float* split_output_buffer =
+              buffer_manager.get_reduce_output_buffer() +
+              curr_split_id * stride * head_dim;
+          float* split_max_buffer =
+              buffer_manager.get_reduce_max_buffer() + curr_split_id * stride;
+          float* split_sum_buffer =
+              buffer_manager.get_reduce_sum_buffer() + curr_split_id * stride;
+
+          reduce_splits(split_output_buffer, split_max_buffer, split_sum_buffer,
+                        split_flag_buffer, stride, curr_output_head_num,
+                        curr_split_num);
+          final_output(
+              split_output_buffer,
+              reinterpret_cast<query_t*>(input->output) + output_buffer_offset,
+              split_sum_buffer, actual_q_heads_per_kv, curr_output_token_num,
+              q_head_num);
+        }
+      }
+    }
+    // Reset counter for next call
+    input->metadata->reset_counter();
+  }
+
+  void reduce_splits(float* __restrict__ split_output_buffer,
+                     float* __restrict__ split_max_buffer,
+                     float* __restrict__ split_sum_buffer,
+                     volatile bool* __restrict__ flags,
+                     const int32_t head_num_per_split,
+                     const int32_t curr_head_num, const int32_t split_num) {
+#ifdef DEFINE_FAST_EXP
+    DEFINE_FAST_EXP
+#endif
+    // restrict curr_head_num <= 16 in the scheduler
+    // elems in split_max_buffer, split_sum_buffer are not cache alignment, use
+    // local buffers to reduce false-sharing
+    alignas(64) float local_max[16];
+    alignas(64) float local_sum[16];
+
+    float* __restrict__ curr_split_output_buffer = split_output_buffer;
+    float* __restrict__ curr_split_max_buffer = split_max_buffer;
+    float* __restrict__ curr_split_sum_buffer = split_sum_buffer;
+    constexpr int32_t head_dim_group_num = head_dim / 16;
+    for (int32_t split_idx = 0; split_idx < split_num; ++split_idx) {
+      while (!flags[split_idx]) {
+#ifdef FAST_SPINNING
+        FAST_SPINNING
+#else
+        std::this_thread::yield();
+#endif
+      }
+      std::atomic_thread_fence(std::memory_order_acquire);
+
+      if (split_idx > 0) {
+        float* __restrict__ curr_output_buffer = split_output_buffer;
+        float* __restrict__ curr_split_output_buffer_iter =
+            curr_split_output_buffer;
+        for (int32_t head_idx = 0; head_idx < curr_head_num; ++head_idx) {
+          float final_max = local_max[head_idx];
+          float curr_max = curr_split_max_buffer[head_idx];
+          float final_sum = local_sum[head_idx];
+          float curr_sum = curr_split_sum_buffer[head_idx];
+          float* __restrict__ non_scale_output_iter =
+              final_max > curr_max ? curr_output_buffer
+                                   : curr_split_output_buffer_iter;
+          float* __restrict__ scale_output_iter =
+              final_max > curr_max ? curr_split_output_buffer_iter
+                                   : curr_output_buffer;
+          float rescale_factor = final_max > curr_max ? curr_max - final_max
+                                                      : final_max - curr_max;
+
+#ifdef DEFINE_FAST_EXP
+          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
+          rescale_factor_vec = fast_exp(rescale_factor_vec);
+          rescale_factor = rescale_factor_vec.get_last_elem();
+#else
+          rescale_factor = std::exp(rescale_factor);
+          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
+#endif
+
+          local_sum[head_idx] = final_max > curr_max
+                                    ? final_sum + rescale_factor * curr_sum
+                                    : rescale_factor * final_sum + curr_sum;
+
+          final_max = std::max(final_max, curr_max);
+          local_max[head_idx] = final_max;
+          for (int32_t i = 0; i < head_dim_group_num; ++i) {
+            vec_op::FP32Vec16 non_scale_vec(non_scale_output_iter);
+            vec_op::FP32Vec16 scale_vec(scale_output_iter);
+            vec_op::FP32Vec16 final_vec =
+                non_scale_vec + scale_vec * rescale_factor_vec;
+            final_vec.save(curr_output_buffer);
+
+            non_scale_output_iter += 16;
+            scale_output_iter += 16;
+            curr_output_buffer += 16;
+          }
+          curr_split_output_buffer_iter += head_dim;
+        }
+      } else {
+        vec_op::FP32Vec16 final_max(split_max_buffer);
+        final_max.save(local_max);
+        vec_op::FP32Vec16 final_sum(split_sum_buffer);
+        final_sum.save(local_sum);
+      }
+
+      curr_split_output_buffer += head_num_per_split * head_dim;
+      curr_split_max_buffer += head_num_per_split;
+      curr_split_sum_buffer += head_num_per_split;
+    }
+    // write back final max and sum
+    for (int32_t i = 0; i < curr_head_num; ++i) {
+      split_max_buffer[i] = local_max[i];
+      split_sum_buffer[i] = local_sum[i];
+    }
+  }
+
+  void partial_output(float* __restrict__ partial_output_buffer,
+                      float* __restrict__ partial_max_buffer,
+                      float* __restrict__ partial_sum_buffer,
+                      int32_t curr_head_num,
+                      float* __restrict__ split_output_buffer,
+                      float* __restrict__ split_max_buffer,
+                      float* __restrict__ split_sum_buffer,
+                      volatile bool* __restrict__ flag) {
+    float* __restrict__ curr_partial_output_buffer = partial_output_buffer;
+    float* __restrict__ curr_split_output_buffer = split_output_buffer;
+    constexpr int32_t head_dim_group_num = head_dim / 16;
+    for (int32_t i = 0; i < curr_head_num; ++i) {
+      split_max_buffer[i] = partial_max_buffer[i];
+      split_sum_buffer[i] = partial_sum_buffer[i];
+      for (int32_t j = 0; j < head_dim_group_num; ++j) {
+        vec_op::FP32Vec16 vec(curr_partial_output_buffer);
+        vec.save(curr_split_output_buffer);
+
+        curr_partial_output_buffer += 16;
+        curr_split_output_buffer += 16;
+      }
+    }
+    std::atomic_thread_fence(std::memory_order_release);
+    *flag = true;
+  }
+
+  void final_output(float* __restrict__ partial_q_buffer,
+                    query_t* __restrict__ curr_output_buffer,
+                    float* __restrict__ sum_buffer,
+                    const int32_t q_heads_per_kv,
+                    const int32_t actual_q_token_num,
+                    const int32_t q_head_num) {
+    // final output
+    using output_vec_t = typename VecTypeTrait<query_t>::vec_t;
+
+    float* __restrict__ curr_partial_output_buffer = partial_q_buffer;
+    float* __restrict__ curr_sum_buffer = sum_buffer;
+    constexpr int32_t group_num_per_head = head_dim / 16;
+    const int32_t partial_q_buffer_stride = q_heads_per_kv * head_dim;
+    const int32_t output_buffer_stride = q_head_num * head_dim;
+    for (int32_t token_idx = 0; token_idx < actual_q_token_num; ++token_idx) {
+      float* __restrict__ curr_partial_output_buffer_iter =
+          curr_partial_output_buffer;
+      query_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
+      for (int32_t head_idx = 0; head_idx < q_heads_per_kv; ++head_idx) {
+        vec_op::FP32Vec16 inv_sum_scale_vec(1.0 / *curr_sum_buffer);
+
+        for (int32_t i = 0; i < group_num_per_head; ++i) {
+          vec_op::FP32Vec16 vec(curr_partial_output_buffer_iter);
+          // divide the final sum val of softmax here
+          vec = inv_sum_scale_vec * vec;
+
+          // cast to query type
+          output_vec_t output_vec(vec);
+          output_vec.save(curr_output_buffer_iter);
+
+          // update
+          curr_partial_output_buffer_iter += 16;
+          curr_output_buffer_iter += 16;
+        }
+
+        // update
+        curr_sum_buffer += 1;
+      }
+
+      // update
+      curr_partial_output_buffer += partial_q_buffer_stride;
+      curr_output_buffer += output_buffer_stride;
+    }
+  }
+};
+
+}  // namespace cpu_attention
+
+#endif
diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_attn_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..6458e4341937033fee2cabd26564db44f4570993
--- /dev/null
+++ b/csrc/cpu/cpu_attn_macros.h
@@ -0,0 +1,63 @@
+#ifndef CPU_ATTN_MACROS_H
+#define CPU_ATTN_MACROS_H
+
+// x86_64
+#ifdef __x86_64__
+  #define FAST_SPINNING _mm_pause();
+
+  #ifdef __AVX512F__
+    #define DEFINE_FAST_EXP                                                    \
+      const __m512 vec_factorial_1 = _mm512_set1_ps(0.999999701f);             \
+      const __m512 vec_factorial_2 = _mm512_set1_ps(0.499991506f);             \
+      const __m512 vec_factorial_3 = _mm512_set1_ps(0.166676521f);             \
+      const __m512 vec_factorial_4 = _mm512_set1_ps(0.0418978221f);            \
+      const __m512 vec_factorial_5 = _mm512_set1_ps(0.00828929059f);           \
+      const __m512 vec_exp_log2ef =                                            \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b));                  \
+      const __m512 vec_half = _mm512_set1_ps(0.5f);                            \
+      const __m512 vec_one = _mm512_set1_ps(1.f);                              \
+      const __m512 vec_zero = _mm512_set1_ps(0.f);                             \
+      const __m512 vec_two = _mm512_set1_ps(2.f);                              \
+      const __m512 vec_ln2f =                                                  \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218));                  \
+      const __m512 vec_ln_flt_min =                                            \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));                  \
+      const __m512 vec_ln_flt_max =                                            \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));                  \
+      const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);                   \
+      const int n_mantissa_bits = 23;                                          \
+      auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__((              \
+                          always_inline)) {                                    \
+        __m512 values = vec.reg;                                               \
+        auto less_ln_flt_min_mask =                                            \
+            _mm512_cmp_ps_mask(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/);      \
+        auto vec_src = _mm512_min_ps(values, vec_ln_flt_max);                  \
+        vec_src = _mm512_max_ps(vec_src, vec_ln_flt_min);                      \
+        auto vec_fx = _mm512_fmadd_ps(vec_src, vec_exp_log2ef, vec_half);      \
+        auto vec_fx_i = _mm512_cvt_roundps_epi32(                              \
+            vec_fx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);                \
+        vec_fx = _mm512_cvtepi32_ps(vec_fx_i);                                 \
+        auto vec_exp_poly = _mm512_fnmadd_ps(vec_fx, vec_ln2f, vec_src);       \
+        auto vec_res =                                                         \
+            _mm512_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4);   \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3);     \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2);     \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1);     \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_one);             \
+        auto vec_exp_number = _mm512_sub_ps(vec_fx, vec_one);                  \
+        auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);            \
+        auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);    \
+        vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); \
+        auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);             \
+        vec_two_pow_n = _mm512_mask_blend_ps(less_ln_flt_min_mask,             \
+                                             vec_two_pow_n, vec_zero);         \
+        vec_res = _mm512_mul_ps(vec_res, vec_two_pow_n);                       \
+        vec_res = _mm512_mul_ps(vec_res, vec_two);                             \
+        vec_op::FP32Vec16 res(vec_res);                                        \
+        return res;                                                            \
+      };
+  #endif
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/csrc/cpu/cpu_attn_vec.hpp b/csrc/cpu/cpu_attn_vec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..479313f0e19f4b3285372646ddd69cde05915102
--- /dev/null
+++ b/csrc/cpu/cpu_attn_vec.hpp
@@ -0,0 +1,248 @@
+#ifndef CPU_ATTN_VEC_HPP
+#define CPU_ATTN_VEC_HPP
+
+#include "cpu_attn_impl.hpp"
+
+namespace cpu_attention {
+
+namespace {
+// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32]
+template <typename kv_cache_t>
+class TileGemm82 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    switch (m_size) {
+      case 1:
+        gemm_micro<1>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 2:
+        gemm_micro<2>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 3:
+      case 4:
+        gemm_micro<4>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 5:
+      case 6:
+        gemm_micro<6>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 7:
+      case 8:
+        gemm_micro<8>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+    }
+  }
+
+  template <int32_t M>
+  static void gemm_micro(float* __restrict__ a_tile,
+                         kv_cache_t* __restrict__ b_tile,
+                         float* __restrict__ c_tile, const int64_t lda,
+                         const int64_t ldb, const int64_t ldc,
+                         const int32_t block_size, const int32_t dynamic_k_size,
+                         const bool accum_c) {
+    static_assert(0 < M <= 8);
+    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
+
+    kv_cache_t* __restrict__ curr_b_0 = b_tile;
+    kv_cache_t* __restrict__ curr_b_1 = b_tile + 16;
+    float* __restrict__ curr_c_0 = c_tile;
+    float* __restrict__ curr_c_1 = c_tile + 16;
+
+    vec_op::FP32Vec16 c_regs[M * 2];
+    if (accum_c) {
+      float* __restrict__ curr_m_c_0 = curr_c_0;
+      float* __restrict__ curr_m_c_1 = curr_c_1;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0);
+        c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1);
+
+        // update
+        curr_m_c_0 += ldc;
+        curr_m_c_1 += ldc;
+      });
+    }
+
+    float* __restrict__ curr_a = a_tile;
+    for (int32_t k = 0; k < dynamic_k_size; ++k) {
+      load_vec_t b_0_reg(curr_b_0);
+      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
+      load_vec_t b_1_reg(curr_b_1);
+      vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
+
+      float* __restrict__ curr_m_a = curr_a;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        float v = *curr_m_a;
+        vec_op::FP32Vec16 a_reg(v);
+        c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
+        c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
+
+        // update
+        curr_m_a += lda;
+      });
+
+      // update
+      curr_a += 1;
+      curr_b_0 += ldb;
+      curr_b_1 += ldb;
+    }
+
+    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+      c_regs[i * 2].save(curr_c_0);
+      c_regs[i * 2 + 1].save(curr_c_1);
+
+      // update
+      curr_c_0 += ldc;
+      curr_c_1 += ldc;
+    });
+  }
+};
+}  // namespace
+
+// This is a general but naive implementation based on vector instructions
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      32;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      32;  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = 8;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VEC;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+ public:
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemm82<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
+                                // block_size], row-major
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
+                                           // head_dim], row-major
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
+                              // row-major
+  }
+
+  // Copy q to q_buffer and cast it to fp32
+  static void copy_q_heads_tile(
+      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
+      float* __restrict__ q_buffer, const int32_t q_num,
+      const int32_t q_heads_per_kv, const int64_t q_num_stride,
+      const int64_t q_head_stride, float scale) {
+    static_assert(head_dim % 16 == 0);
+    constexpr int32_t unroll_size = head_dim / 16;
+    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+    vec_op::FP32Vec16 scale_vec(scale);
+    for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
+      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
+        scalar_t* __restrict__ curr_q =
+            src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
+        float* __restrict__ curr_q_buffer =
+            q_buffer + q_num_idx * q_heads_per_kv * head_dim +
+            q_head_idx * head_dim;
+
+        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
+          load_vec_t vec(curr_q);
+          vec_op::FP32Vec16 fp32_vec(vec);
+          fp32_vec = fp32_vec * scale_vec;
+          fp32_vec.save(curr_q_buffer);
+
+          curr_q += 16;
+          curr_q_buffer += 16;
+        });
+      }
+    }
+  }
+
+  // reshape K as column-major and V as row-major
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride) {
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) {
+          // skip
+          continue;
+        }
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+        {
+          // Write Key as column-major
+          const scalar_t* key_start_ptr = key +
+                                          token_idx * key_token_num_stride +
+                                          head_idx * key_head_num_stride;
+          scalar_t* key_cache_start_ptr =
+              key_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset;
+
+#pragma GCC unroll 8
+          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
+            key_cache_start_ptr[j] = key_start_ptr[i];
+          }
+        }
+        {
+          // Write Value as row-major
+          const scalar_t* value_start_ptr = value +
+                                            token_idx * value_token_num_stride +
+                                            head_idx * value_head_num_stride;
+          scalar_t* value_cache_start_ptr =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset * head_dim;
+          std::memcpy(value_cache_start_ptr, value_start_ptr,
+                      sizeof(scalar_t) * head_dim);
+        }
+      }
+    }
+  }
+};
+}  // namespace cpu_attention
+
+#endif
diff --git a/csrc/cpu/cpu_attn_vec16.hpp b/csrc/cpu/cpu_attn_vec16.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7402312c09243021244648cb4415aa40a1064d22
--- /dev/null
+++ b/csrc/cpu/cpu_attn_vec16.hpp
@@ -0,0 +1,171 @@
+#ifndef CPU_ATTN_VEC16_HPP
+#define CPU_ATTN_VEC16_HPP
+
+#include "cpu_attn_vec.hpp"
+
+namespace cpu_attention {
+
+namespace {
+// 16-1-16 pattern, 16 regs for A, 1 regs for B, 16 regs for C, [16, K] @ [k,
+// 16]
+template <typename kv_cache_t>
+class TileGemm161 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    switch (m_size) {
+      case 1:
+        gemm_micro<1>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 2:
+        gemm_micro<2>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 3:
+      case 4:
+        gemm_micro<4>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 5:
+      case 6:
+        gemm_micro<6>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 7:
+      case 8:
+        gemm_micro<8>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 9:
+      case 10:
+      case 11:
+      case 12:
+        gemm_micro<12>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                       dynamic_k_size, accum_c);
+        break;
+      case 13:
+      case 14:
+      case 15:
+      case 16:
+        gemm_micro<16>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                       dynamic_k_size, accum_c);
+        break;
+    }
+  }
+
+  template <int32_t M>
+  static void gemm_micro(float* __restrict__ a_tile,
+                         kv_cache_t* __restrict__ b_tile,
+                         float* __restrict__ c_tile, const int64_t lda,
+                         const int64_t ldb, const int64_t ldc,
+                         const int32_t block_size, const int32_t dynamic_k_size,
+                         const bool accum_c) {
+    static_assert(0 < M <= 16);
+    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
+
+    kv_cache_t* __restrict__ curr_b_0 = b_tile;
+    float* __restrict__ curr_c_0 = c_tile;
+
+    vec_op::FP32Vec16 c_regs[M];
+    if (accum_c) {
+      float* __restrict__ curr_m_c_0 = curr_c_0;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        c_regs[i] = vec_op::FP32Vec16(curr_m_c_0);
+
+        // update
+        curr_m_c_0 += ldc;
+      });
+    }
+
+    float* __restrict__ curr_a = a_tile;
+    for (int32_t k = 0; k < dynamic_k_size; ++k) {
+      load_vec_t b_0_reg(curr_b_0);
+      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
+
+      float* __restrict__ curr_m_a = curr_a;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        float v = *curr_m_a;
+        vec_op::FP32Vec16 a_reg(v);
+        c_regs[i] = c_regs[i] + a_reg * fp32_b_0_reg;
+
+        // update
+        curr_m_a += lda;
+      });
+
+      // update
+      curr_a += 1;
+      curr_b_0 += ldb;
+    }
+
+    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+      c_regs[i].save(curr_c_0);
+
+      // update
+      curr_c_0 += ldc;
+    });
+  }
+};
+}  // namespace
+
+// This is a general but naive implementation based on vector instructions
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::VEC16, scalar_t, head_dim>
+    : public AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      16;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      16;  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = 16;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VEC16;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+ public:
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemm161<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
+                                // block_size], row-major
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
+                                           // head_dim], row-major
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
+                              // row-major
+  }
+};
+}  // namespace cpu_attention
+
+#endif
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 982f7c07a13bd33a1deab10786e58cae527d5c0d..7ddf028e6e13104126c4ce21f35f7aade871cd36 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -40,6 +40,23 @@ namespace vec_op {
 
 #define FORCE_INLINE __attribute__((always_inline)) inline
 
+// Function to get the timestamp using RDTSCP
+FORCE_INLINE uint64_t bench_timestamp() {
+  unsigned int cycles_low, cycles_high;
+  asm volatile(
+      ".intel_syntax noprefix\n\t"
+      "CPUID\n\t"        // Serialize instruction stream to ensure previous
+                         // instructions complete
+      "RDTSCP\n\t"       // Read TSC and core ID
+      "mov %0, edx\n\t"  // Store high 32 bits of TSC
+      "mov %1, eax\n\t"  // Store low 32 bits of TSC
+      ".att_syntax"
+      : "=r"(cycles_high), "=r"(cycles_low)::"rax", "rbx", "rcx",
+        "rdx"  // Clobbered registers
+  );
+  return (uint64_t)cycles_high << 32 | cycles_low;
+}
+
 namespace {
 template <typename T, T... indexes, typename F>
 constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
@@ -407,6 +424,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   float reduce_min() const { return _mm512_reduce_min_ps(reg); }
 
+  float get_last_elem() const { return _mm512_cvtss_f32(reg); }
+
   template <int group_size>
   float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
@@ -446,9 +465,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}
 
-  explicit FP32Vec16(const FP32Vec16& data)
-      : reg_low(data.reg_low), reg_high(data.reg_high) {}
-
   explicit FP32Vec16(const FP32Vec4& data)
       : reg_low((__m256)_mm256_inserti128_si256(
             _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)),
@@ -504,6 +520,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                      _mm256_div_ps(reg_high, b.reg_high));
   }
 
+  FP32Vec16 max(const FP32Vec16& b) const {
+    return FP32Vec16(_mm256_max_ps(reg_low, b.reg_low),
+                     _mm256_max_ps(reg_high, b.reg_high));
+  }
+
+  float reduce_max() const {
+    __m256 v = _mm256_max_ps(reg_low, reg_high);
+    // Permute to compare elements within 128-bit lanes
+    __m256 v_shuffled = _mm256_permute_ps(
+        v, 0b00001011);  // Swap halves within each 128-bit lane
+    __m256 v_max = _mm256_max_ps(v, v_shuffled);
+
+    v_shuffled = _mm256_permute_ps(
+        v_max, 0b00000001);  // Shuffle elements within each 128-bit lane
+    v_max = _mm256_max_ps(v_max, v_shuffled);
+
+    // Permute to compare elements between 128-bit lanes
+    v_shuffled =
+        _mm256_permute2f128_ps(v_max, v_max, 0b00000001);  // Swap 128-bit lanes
+    v_max = _mm256_max_ps(v_max, v_shuffled);
+
+    // At this point, the maximum value is present in all elements of v_max.
+    // Extract the first element for the scalar result.
+    return _mm256_cvtss_f32(v_max);  // Extract the lowest 32-bit float
+  }
+
   float reduce_sum() const {
     FP32Vec8 low = FP32Vec8(reg_low);
     FP32Vec8 high = FP32Vec8(reg_high);
@@ -642,7 +684,7 @@ inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
 inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
     : reg(_mm256_insertf128_si256(
           _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg),
-          FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {}
+          FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {}
 #endif
 
 #ifdef __AVX512BF16__
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
index 6def0e061fa96f16a31dfeeb9079d0b56f24c79f..02a8072ccf306526f65ba0bc6b911bd30bcf530a 100644
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -5,6 +5,7 @@
 #include "common/memory.hpp"
 
 #include "dnnl_helper.h"
+#include "scratchpad_manager.h"
 
 static dnnl::engine& default_engine() {
   static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
@@ -22,23 +23,6 @@ void release_dnnl_matmul_handler(int64_t handler) {
   delete ptr;
 }
 
-DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
-  this->realloc(allocation_unit * 128);
-}
-
-void DNNLScratchPadManager::realloc(size_t new_size) {
-  new_size = round(new_size);
-  if (new_size > size_) {
-    ptr_ = std::aligned_alloc(64, new_size);
-    size_ = new_size;
-  }
-}
-
-DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
-  static DNNLScratchPadManager manager;
-  return &manager;
-}
-
 template <typename KT, typename VT>
 class DNNLPrimitiveCache {
  public:
@@ -137,9 +121,8 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
 }
 
 void DNNLMatMulPrimitiveHandler::prepack_weight(
-    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
+    void* original_b_ptr, dnnl::memory::desc original_b_md,
+    dnnl::memory::desc b_target_mem_desc) {
   dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
   dnnl::memory packed_weight(b_target_mem_desc, default_engine());
   {
@@ -188,7 +171,8 @@ template <>
 struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
   size_t operator()(
       const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
+    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
+           hash<int>()(static_cast<int>(val.b_type));
   }
 };
 
@@ -217,7 +201,8 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
 
 bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                 const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
-  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
+         l.b_type == r.b_type;
 }
 
 bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
@@ -250,7 +235,9 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
   if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
     assert(!use_azp_);
   };
-  prepack_weight(args.b_ptr,
+  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+                                   {b_k_stride_, b_n_stride_});
+  prepack_weight(args.b_ptr, original_b_md,
                  create_primitive_desc(
                      MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                    .use_bias = false,
@@ -412,12 +399,25 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
   assert(ab_type_ == dnnl::memory::data_type::f32 ||
          ab_type_ == dnnl::memory::data_type::bf16 ||
          ab_type_ == dnnl::memory::data_type::f16);
-  prepack_weight(args.b_ptr,
+
+  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+                                   {b_k_stride_, b_n_stride_});
+
+  prepack_weight(args.b_ptr, original_b_md,
                  create_primitive_desc(
-                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
-                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
-                                   .use_bias = false,
-                                   .bias_type = dnnl::memory::data_type::undef},
+                     MSizeCacheKey{
+#ifdef VLLM_USE_ACL
+                         // Arm Compute Library (ACL) backend for oneDNN does
+                         // not support runtime
+                         // dimensions, so we set M to a default value
+                         .a_m_size = 128,
+                         .a_m_stride = b_k_size_,
+#else
+                         .a_m_size = DNNL_RUNTIME_DIM_VAL,
+                         .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+#endif
+                         .use_bias = false,
+                         .bias_type = dnnl::memory::data_type::undef},
                      true)
                      .weights_desc());
   init_runtime_memory_cache(args);
@@ -443,13 +443,31 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
   c_storage->set_data_handle((void*)args.c_ptr);
   c_mem_desc->dims[0] = args.a_m_size;
 
+#ifndef VLLM_USE_ACL
+  // We do not support in ACL backend of oneDNN, we handle bias by:
+  // 1. copying it into the result tensor
+  // 2. attaching a fused-sum post-op to the matmul primitive
   if (args.use_bias) {
     auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
     bias_storage->set_data_handle((void*)args.bias_ptr);
   }
-
+#endif
   dnnl::matmul matmul = get_matmul_cache(args);
 
+// With ACL backend of oneDNN, the required memory format might change when the
+// source tensor dims change. This does not really happen in practice, so isn't
+// a performance hit, but we need to support it because the API allows for it.
+#ifdef VLLM_USE_ACL
+  auto new_expected_wei_desc =
+      dnnl::matmul::primitive_desc(
+          const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
+          .weights_desc();
+  if (new_expected_wei_desc != b_target_mem_desc_) {
+    prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
+                   b_target_mem_desc_, new_expected_wei_desc);
+  }
+#endif
+
   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
   scratchpad_storage->set_data_handle(
       DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
@@ -461,8 +479,10 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
 dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
     const MSizeCacheKey& key) {
   if (m_size_cache_.get() == nullptr) {
-    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
-    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
+    ClassMatmulCacheKey class_key = {
+        .b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
+    m_size_cache_ =
+        get_matul_class_primitive_cache(class_key, primitive_cache_size_);
   }
   return m_size_cache_->get_or_create(key, [&]() {
     dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
@@ -484,7 +504,13 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
   } else {
     a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                               {key.a_m_stride, 1});
+#ifdef VLLM_USE_ACL
+    // ACL's backend of oneDNN always expects the weight format to be "any"
+    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
+                              dnnl::memory::format_tag::any);
+#else
     b_md = b_target_mem_desc_;
+#endif
   }
   dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                           dnnl::memory::format_tag::ab);
@@ -494,8 +520,18 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
 
   if (key.use_bias) {
     dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
+// Since ACL's matmuls don't support passing a bias_md, we apply the bias
+// through a fused-sum post-op
+#ifdef VLLM_USE_ACL
+    dnnl::post_ops post_ops;
+    post_ops.append_sum();
+    attr.set_post_ops(post_ops);
+    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
+                                        attr);
+#else
     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                         c_md, attr);
+#endif
   } else {
     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                         attr);
@@ -511,13 +547,23 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
                    default_engine(), nullptr);
   set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
 
+// ACL matmuls don't support bias_md, so we don't need these
+#ifndef VLLM_USE_ACL
   memory_cache_[DNNL_ARG_BIAS] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
-
+#endif
   memory_cache_[DNNL_ARG_SCRATCHPAD] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
+
+bool is_onednn_acl_supported() {
+#ifdef VLLM_USE_ACL
+  return true;
+#else
+  return false;
+#endif
+}
diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h
index ad6773d2b9fd6d89f9530780810d8912f5076570..b841ed73058e9bda3b3e2b1f89e40dfae9c48c70 100644
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@@ -59,30 +59,6 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
   return DNNLType<std::decay_t<T>>::type;
 }
 
-class DNNLScratchPadManager {
- public:
-  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4KB
-
-  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
-
-  DNNLScratchPadManager();
-
-  template <typename T>
-  T* get_data() {
-    return reinterpret_cast<T*>(ptr_);
-  }
-
-  static size_t round(size_t size) {
-    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
-  }
-
-  void realloc(size_t new_size);
-
- private:
-  size_t size_;
-  void* ptr_;
-};
-
 class DNNLMatMulPrimitiveHandler {
  public:
   virtual ~DNNLMatMulPrimitiveHandler() = default;
@@ -101,7 +77,7 @@ class DNNLMatMulPrimitiveHandler {
  protected:
   DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
 
-  void prepack_weight(void* original_b_ptr,
+  void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
                       dnnl::memory::desc b_target_mem_desc);
 
   void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
@@ -199,6 +175,7 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
   struct ClassMatmulCacheKey {
     dnnl_dim_t b_n_size;
     dnnl_dim_t b_k_size;
+    dnnl::memory::data_type b_type;
 
     friend bool operator==(const ClassMatmulCacheKey& l,
                            const ClassMatmulCacheKey& r);
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index 1c42a75bc2d6189d06f9e871fdd171cff1a1b5d8..6d062c71e7674c20d5df11b0d7f8c9bb492d155a 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -527,21 +527,42 @@ void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
   MatMulPrimitiveHandler* ptr =
       reinterpret_cast<MatMulPrimitiveHandler*>(handler);
 
+// ACL matmuls expect contiguous source tensors
+#ifdef VLLM_USE_ACL
+  torch::Tensor a_contig = a.contiguous();
+#endif
+
   MatMulPrimitiveHandler::ExecArgs exec_args;
+
+#ifdef VLLM_USE_ACL
+  exec_args.a_m_size = a_contig.size(0);
+  exec_args.a_m_stride = a_contig.stride(0);
+#else
   exec_args.a_m_size = a.size(0);
   exec_args.a_m_stride = a.stride(0);
-
+#endif
   VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
     if (bias.has_value()) {
       exec_args.use_bias = true;
       exec_args.bias_type = get_dnnl_type<scalar_t>();
+#ifdef VLLM_USE_ACL
+      // ACL matmuls in oneDNN do not support a bias.
+      // We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
+      c.copy_(bias.value());
+#else
       exec_args.bias_ptr = bias->data_ptr<scalar_t>();
+#endif
     } else {
       exec_args.use_bias = false;
       exec_args.bias_type = get_dnnl_type<void>();
       exec_args.bias_ptr = nullptr;
     }
+#ifdef VLLM_USE_ACL
+    exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
+#else
     exec_args.a_ptr = a.data_ptr<scalar_t>();
+
+#endif
     exec_args.c_ptr = c.data_ptr<scalar_t>();
 
     ptr->execute(exec_args);
diff --git a/csrc/cpu/scratchpad_manager.cpp b/csrc/cpu/scratchpad_manager.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05cd435f34b7a644176ca99cc518cf0f03dc5294
--- /dev/null
+++ b/csrc/cpu/scratchpad_manager.cpp
@@ -0,0 +1,23 @@
+#include <cstdlib>
+
+#include "scratchpad_manager.h"
+
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
+  this->realloc(allocation_unit * 128);
+}
+
+void DNNLScratchPadManager::realloc(size_t new_size) {
+  new_size = round(new_size);
+  if (new_size > size_) {
+    if (ptr_ != nullptr) {
+      std::free(ptr_);
+    }
+    ptr_ = std::aligned_alloc(64, new_size);
+    size_ = new_size;
+  }
+}
+
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
+  static DNNLScratchPadManager manager;
+  return &manager;
+}
diff --git a/csrc/cpu/scratchpad_manager.h b/csrc/cpu/scratchpad_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ecf59192f84532770783abea95dc25586abe049
--- /dev/null
+++ b/csrc/cpu/scratchpad_manager.h
@@ -0,0 +1,31 @@
+#ifndef SCRATCHPAD_MANAGER_H
+#define SCRATCHPAD_MANAGER_H
+
+#include <cstddef>
+#include <cstdio>
+
+class DNNLScratchPadManager {
+ public:
+  static constexpr size_t allocation_unit = 4 * 1024;  // 4KB
+
+  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
+
+  DNNLScratchPadManager();
+
+  template <typename T>
+  T* get_data() {
+    return reinterpret_cast<T*>(ptr_);
+  }
+
+  static size_t round(size_t size) {
+    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+  }
+
+  void realloc(size_t new_size);
+
+ private:
+  size_t size_;
+  void* ptr_;
+};
+
+#endif
diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp
index 7e64e1c5219806ea443df224a626ac3c1000b229..e43aa203777aba291b0afc542a5b6523b1bb6742 100644
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@@ -192,7 +192,7 @@ class SHMManager {
                       const int group_size)
       : _rank(rank),
         _group_size(group_size),
-        _thread_num(torch::get_num_threads()),
+        _thread_num(omp_get_max_threads()),
         _shm_names({""}),
         _shared_mem_ptrs({nullptr}),
         _shm_ctx(nullptr) {
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index d279c03e0b59abcb5561468c93970a41034b3216..9fefd88cd9b08b0bb8c30d2c1302a858727e2530 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -27,6 +27,8 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
                const std::optional<torch::Tensor>& bias, int64_t handler);
 
+bool is_onednn_acl_supported();
+
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                         torch::Tensor& kv_cache, double scale,
                         torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -72,25 +74,38 @@ at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2,
                                      const std::optional<at::Tensor>& bias,
                                      at::ScalarType out_dtype, bool is_vnni);
 
+torch::Tensor get_scheduler_metadata(
+    const int64_t num_req, const int64_t num_heads_q,
+    const int64_t num_heads_kv, const int64_t head_dim,
+    const torch::Tensor& seq_lens, at::ScalarType dtype,
+    const torch::Tensor& query_start_loc, const bool casual,
+    const int64_t window_size, const std::string& isa_hint,
+    const bool enable_kv_split);
+
+void cpu_attn_reshape_and_cache(const torch::Tensor& key,
+                                const torch::Tensor& value,
+                                torch::Tensor& key_cache,
+                                torch::Tensor& value_cache,
+                                const torch::Tensor& slot_mapping,
+                                const std::string& isa);
+
+void cpu_attention_with_kv_cache(
+    const torch::Tensor& query, const torch::Tensor& key_cache,
+    const torch::Tensor& value_cache, torch::Tensor& output,
+    const torch::Tensor& query_start_loc, const torch::Tensor& seq_lens,
+    const double scale, const bool causal,
+    const std::optional<torch::Tensor>& alibi_slopes,
+    const int64_t sliding_window_left, const int64_t sliding_window_right,
+    const torch::Tensor& block_table, const double softcap,
+    const torch::Tensor& scheduler_metadata,
+    const std::optional<torch::Tensor>& s_aux);
+
+// Note: just for avoiding importing errors
+void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
 
-  // Attention ops
-  // Compute the attention between an input query and the cached keys/values
-  // using PagedAttention.
-  ops.def(
-      "paged_attention_v1("
-      "    Tensor! out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
-      "    int tp_rank, int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-
-  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
-
   ops.def(
       "dynamic_4bit_int_moe("
       "Tensor x, Tensor topk_ids, Tensor topk_weights,"
@@ -100,20 +115,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
 
-  // PagedAttention V2.
-  ops.def(
-      "paged_attention_v2("
-      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
-      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
-      "    int tp_rank, int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
-
   // Activation ops
 
   // Activation function used in SwiGLU.
@@ -181,6 +182,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int handler) -> ()");
   ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
 
+  // Check if oneDNN was built with ACL backend
+  ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
+
   // Create oneDNN W8A8 handler
   ops.def(
       "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
@@ -254,37 +258,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("int8_scaled_mm_with_quant", torch::kCPU,
            &int8_scaled_mm_with_quant);
 #endif
-}
 
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
-  // Cache ops
-  // Swap in (out) the cache blocks from src to dst.
-  cache_ops.def(
-      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
-  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
-
-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "Tensor block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
-
-  // Reshape the key and value tensors and cache them.
-  cache_ops.def(
-      "reshape_and_cache(Tensor key, Tensor value,"
-      "                  Tensor! key_cache, Tensor! value_cache,"
-      "                  Tensor slot_mapping,"
-      "                  str kv_cache_dtype,"
-      "                  Tensor k_scale, Tensor v_scale) -> ()");
-  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
-
-  cache_ops.def(
-      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
-      "                     Tensor! kv_cache,"
-      "                     Tensor slot_mapping,"
-      "                     str kv_cache_dtype,"
-      "                     Tensor scale) -> ()");
-  cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
+  // CPU attention kernels
+  ops.def(
+      "get_scheduler_metadata(int num_req, int num_heads_q, int num_heads_kv, "
+      "int head_dim, Tensor seq_lens, ScalarType dtype, Tensor "
+      "query_start_loc, bool casual, int window_size, str isa_hint, bool "
+      "enable_kv_split) -> Tensor",
+      &get_scheduler_metadata);
+  ops.def(
+      "cpu_attn_reshape_and_cache(Tensor key, Tensor value, Tensor(a2!) "
+      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str "
+      "isa) -> ()",
+      &cpu_attn_reshape_and_cache);
+  ops.def(
+      "cpu_attention_with_kv_cache(Tensor query, Tensor key_cache, Tensor "
+      "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
+      "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
+      "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
+      "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
+      &cpu_attention_with_kv_cache);
+
+  // placeholders
+  ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
+  ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
+  ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 02514edce80733f4ebcc24d1be906dfd0c4c574b..c5a48352e30894739d08ac47edff4fd79f2eacf9 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -45,6 +45,16 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
   // Memory node binding
   if (numa_available() != -1) {
     int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
+    // Verify all CPUs are on the same NUMA node
+    for (size_t i = 1; i < omp_cpu_ids.size(); ++i) {
+      int node_id = numa_node_of_cpu(omp_cpu_ids[i]);
+      TORCH_CHECK(node_id == mem_node_id, "CPU ", omp_cpu_ids[i],
+                  " is on NUMA node ", node_id, ", but CPU ",
+                  omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
+                  ". All CPUs should be on the same NUMA node for optimal "
+                  "performance. Memory will be bound to NUMA node ",
+                  mem_node_id, ".");
+    }
     bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str());
     bitmask* src_mask = numa_get_membind();
 
diff --git a/csrc/cub_helpers.h b/csrc/cub_helpers.h
index 470a63a22cab07e98e461a8a4e258465a255957d..18e4e343ad8b7a1b07028326731fdc98484f310d 100644
--- a/csrc/cub_helpers.h
+++ b/csrc/cub_helpers.h
@@ -12,6 +12,7 @@ using CubMaxOp = cub::Max;
   #endif  // CUB_VERSION
 #else
   #include <hipcub/hipcub.hpp>
-using CubAddOp = cub::Sum;
-using CubMaxOp = cub::Max;
+namespace cub = hipcub;
+using CubAddOp = hipcub::Sum;
+using CubMaxOp = hipcub::Max;
 #endif  // USE_ROCM
diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index fab6ca36d422ef21de8c51b51f7b816e53841e63..78dc840a98b677de9555dbd542b22f36d6ed68fd 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -3,14 +3,58 @@
 // need to be unsigned long long
 #include <iostream>
 
+#include "cumem_allocator_compat.h"
+
+#ifndef USE_ROCM
+static const char* PYARGS_PARSE = "KKKK";
+#else
+  #include <cstdlib>
+  #include <cerrno>
+  #include <climits>
+
+// Default chunk size 256MB for ROCm. Can be overridden at runtime by the
+// environment variable VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE, specified in megabytes
+// (MB). The env value is parsed with strtoull as an integer number of MB
+// (decimal or 0x hex). The parsed MB value is converted to bytes. If
+// parsing fails, the value is 0, or the multiplication would overflow,
+// the default (256MB) is used.
+static const unsigned long long DEFAULT_MEMCREATE_CHUNK_SIZE =
+    (256ULL * 1024ULL * 1024ULL);
+
+static unsigned long long get_memcreate_chunk_size() {
+  const char* env = getenv("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE");
+  if (!env) return DEFAULT_MEMCREATE_CHUNK_SIZE;
+  char* endptr = nullptr;
+  errno = 0;
+  unsigned long long val_mb = strtoull(env, &endptr, 0);
+  if (endptr == env || errno != 0) {
+    // parsing failed, fallback to default
+    return DEFAULT_MEMCREATE_CHUNK_SIZE;
+  }
+  if (val_mb == 0) return DEFAULT_MEMCREATE_CHUNK_SIZE;
+
+  const unsigned long long MB = 1024ULL * 1024ULL;
+  // guard against overflow when converting MB -> bytes
+  if (val_mb > (ULLONG_MAX / MB)) {
+    return DEFAULT_MEMCREATE_CHUNK_SIZE;
+  }
+  return val_mb * MB;
+}
+
+static inline unsigned long long my_min(unsigned long long a,
+                                        unsigned long long b) {
+  return a < b ? a : b;
+}
+
+static const char* PYARGS_PARSE = "KKKO";
+#endif
+
 extern "C" {
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
 #include <sys/types.h>
-#include <cuda_runtime_api.h>
-#include <cuda.h>
 
 char error_msg[10240];  // 10KB buffer to store error messages
 CUresult no_error = CUresult(0);
@@ -49,7 +93,12 @@ void ensure_context(unsigned long long device) {
 }
 
 void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
+#ifndef USE_ROCM
                     CUmemGenericAllocationHandle* p_memHandle) {
+#else
+                    CUmemGenericAllocationHandle** p_memHandle,
+                    unsigned long long* chunk_sizes, size_t num_chunks) {
+#endif
   ensure_context(device);
   // Define memory allocation properties
   CUmemAllocationProp prop = {};
@@ -58,6 +107,7 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
   prop.location.id = device;
   prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
 
+#ifndef USE_ROCM
   // Allocate memory using cuMemCreate
   CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
   if (error_code != 0) {
@@ -67,6 +117,39 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
   if (error_code != 0) {
     return;
   }
+#else
+  for (auto i = 0; i < num_chunks; ++i) {
+    CUDA_CHECK(cuMemCreate(p_memHandle[i], chunk_sizes[i], &prop, 0));
+    if (error_code != 0) {
+      // Clean up previously created handles
+      for (auto j = 0; j < i; ++j) {
+        cuMemRelease(*(p_memHandle[j]));
+      }
+      return;
+    }
+  }
+  unsigned long long allocated_size = 0;
+  for (auto i = 0; i < num_chunks; ++i) {
+    void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
+    CUDA_CHECK(cuMemMap(map_addr, chunk_sizes[i], 0, *(p_memHandle[i]), 0));
+    if (error_code != 0) {
+      // unmap previously mapped chunks
+      unsigned long long unmapped_size = 0;
+      for (auto j = 0; j < i; ++j) {
+        void* unmap_addr = (void*)((uintptr_t)d_mem + unmapped_size);
+        cuMemUnmap(unmap_addr, chunk_sizes[j]);
+        unmapped_size += chunk_sizes[j];
+      }
+      // release all created handles
+      for (auto j = 0; j < num_chunks; ++j) {
+        cuMemRelease(*(p_memHandle[j]));
+      }
+      return;
+    }
+    allocated_size += chunk_sizes[i];
+  }
+#endif
+
   CUmemAccessDesc accessDesc = {};
   accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
   accessDesc.location.id = device;
@@ -82,10 +165,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
 
 void unmap_and_release(unsigned long long device, ssize_t size,
                        CUdeviceptr d_mem,
+#ifndef USE_ROCM
                        CUmemGenericAllocationHandle* p_memHandle) {
+#else
+                       CUmemGenericAllocationHandle** p_memHandle,
+                       unsigned long long* chunk_sizes, size_t num_chunks) {
+#endif
   // std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
   // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
   ensure_context(device);
+#ifndef USE_ROCM
   CUDA_CHECK(cuMemUnmap(d_mem, size));
   if (error_code != 0) {
     return;
@@ -94,6 +183,30 @@ void unmap_and_release(unsigned long long device, ssize_t size,
   if (error_code != 0) {
     return;
   }
+#else
+  unsigned long long allocated_size = 0;
+  CUresult first_error = no_error;
+
+  for (auto i = 0; i < num_chunks; ++i) {
+    void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
+    CUresult status = cuMemUnmap(map_addr, chunk_sizes[i]);
+    if (status != no_error && first_error == no_error) {
+      first_error = status;
+    }
+    allocated_size += chunk_sizes[i];
+  }
+
+  for (auto i = 0; i < num_chunks; ++i) {
+    CUresult status = cuMemRelease(*(p_memHandle[i]));
+    if (status != no_error && first_error == no_error) {
+      first_error = status;
+    }
+  }
+
+  if (first_error != no_error) {
+    CUDA_CHECK(first_error);
+  }
+#endif
 }
 
 PyObject* create_tuple_from_c_integers(unsigned long long a,
@@ -120,6 +233,36 @@ PyObject* create_tuple_from_c_integers(unsigned long long a,
   return tuple;  // Return the created tuple
 }
 
+PyObject* create_tuple_from_c_mixed(unsigned long long a, unsigned long long b,
+                                    unsigned long long c,
+                                    CUmemGenericAllocationHandle** vec,
+                                    unsigned long long* chunk_sizes,
+                                    size_t num_chunks) {
+  PyObject* tuple = PyTuple_New(4);
+  if (!tuple) {
+    return NULL;
+  }
+
+  // PyObject* list = PyList_New(vec.size());
+  PyObject* list = PyList_New(num_chunks);
+  for (auto i = 0; i < num_chunks; ++i) {
+    PyObject* addr_size_pair = PyTuple_New(2);
+    PyObject* addr = PyLong_FromUnsignedLongLong((unsigned long long)(vec[i]));
+    PyObject* size =
+        PyLong_FromUnsignedLongLong((unsigned long long)(chunk_sizes[i]));
+    PyTuple_SetItem(addr_size_pair, 0, addr);
+    PyTuple_SetItem(addr_size_pair, 1, size);
+    PyList_SetItem(list, i, addr_size_pair);
+  }
+
+  PyTuple_SetItem(tuple, 0, PyLong_FromUnsignedLongLong(a));
+  PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b));
+  PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c));
+  PyTuple_SetItem(tuple, 3, list);
+
+  return tuple;
+}
+
 // ---------------------------------------------------------------------------
 // Our exported C functions that call Python:
 
@@ -147,14 +290,55 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
   size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
 
   CUdeviceptr d_mem;
+#ifndef USE_ROCM
   CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
   if (error_code != 0) {
     return nullptr;
   }
+#else
+  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, granularity, 0, 0));
+  if (error_code != 0) {
+    return nullptr;
+  }
+#endif
+
+#ifndef USE_ROCM
   // allocate the CUmemGenericAllocationHandle
   CUmemGenericAllocationHandle* p_memHandle =
       (CUmemGenericAllocationHandle*)malloc(
           sizeof(CUmemGenericAllocationHandle));
+#else
+  // Make sure chunk size is aligned with hardware granularity. The base
+  // chunk size can be configured via environment variable
+  // ``VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE``; otherwise
+  // DEFAULT_MEMCREATE_CHUNK_SIZE is used.
+  size_t base_chunk = (size_t)get_memcreate_chunk_size();
+  size_t aligned_chunk_size =
+      ((base_chunk + granularity - 1) / granularity) * granularity;
+  size_t num_chunks =
+      (alignedSize + aligned_chunk_size - 1) / aligned_chunk_size;
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  for (auto i = 0; i < num_chunks; ++i) {
+    p_memHandle[i] = (CUmemGenericAllocationHandle*)malloc(
+        sizeof(CUmemGenericAllocationHandle));
+    if (p_memHandle[i] == nullptr) {
+      std::cerr << "ERROR: malloc failed for p_memHandle[" << i << "].\n";
+      for (auto j = 0; j < i; ++j) {
+        free(p_memHandle[j]);
+      }
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;
+    }
+    chunk_sizes[i] = (unsigned long long)my_min(
+        (unsigned long long)(alignedSize - i * aligned_chunk_size),
+        (unsigned long long)aligned_chunk_size);
+  }
+#endif
 
   if (!g_python_malloc_callback) {
     std::cerr << "ERROR: g_python_malloc_callback not set.\n";
@@ -164,9 +348,15 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
   // Acquire GIL (not in stable ABI officially, but often works)
   PyGILState_STATE gstate = PyGILState_Ensure();
 
+#ifndef USE_ROCM
   PyObject* arg_tuple = create_tuple_from_c_integers(
       (unsigned long long)device, (unsigned long long)alignedSize,
       (unsigned long long)d_mem, (unsigned long long)p_memHandle);
+#else
+  PyObject* arg_tuple = create_tuple_from_c_mixed(
+      (unsigned long long)device, (unsigned long long)alignedSize,
+      (unsigned long long)d_mem, p_memHandle, chunk_sizes, num_chunks);
+#endif
 
   // Call g_python_malloc_callback
   PyObject* py_result =
@@ -182,7 +372,27 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
   PyGILState_Release(gstate);
 
   // do the final mapping
+#ifndef USE_ROCM
   create_and_map(device, alignedSize, d_mem, p_memHandle);
+#else
+  create_and_map(device, alignedSize, d_mem, p_memHandle, chunk_sizes,
+                 num_chunks);
+  free(chunk_sizes);
+#endif
+
+  if (error_code != 0) {
+    // free address and the handle
+    CUDA_CHECK(cuMemAddressFree(d_mem, alignedSize));
+#ifndef USE_ROCM
+    free(p_memHandle);
+#else
+    for (size_t i = 0; i < num_chunks; ++i) {
+      free(p_memHandle[i]);
+    }
+    free(p_memHandle);
+#endif
+    return nullptr;
+  }
 
   return (void*)d_mem;
 }
@@ -206,36 +416,96 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
 
   if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
     PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    Py_XDECREF(py_result);
+    Py_XDECREF(py_ptr);
     return;
   }
 
   unsigned long long recv_device, recv_size;
-  unsigned long long recv_d_mem, recv_p_memHandle;
+  unsigned long long recv_d_mem;
+#ifndef USE_ROCM
+  unsigned long long recv_p_memHandle;
+#else
+  PyObject* recv_p_memHandle;
+#endif
   // Unpack the tuple into four C integers
-  if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
+  if (!PyArg_ParseTuple(py_result, PYARGS_PARSE, &recv_device, &recv_size,
                         &recv_d_mem, &recv_p_memHandle)) {
     // PyArg_ParseTuple sets an error if it fails
+    Py_XDECREF(py_result);
+    Py_XDECREF(py_ptr);
     return;
   }
 
-  PyGILState_Release(gstate);
+  // For ROCm, copy the Python list of (addr,size) pairs into C arrays while
+  // holding the GIL. Then release the GIL and call the unmap/release helper
+  // using the copied arrays. This avoids calling PyList_* APIs without the
+  // GIL (which is undefined behavior and can crash when called from other
+  // threads).
+  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
+#ifdef USE_ROCM
+  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  if (p_memHandle == nullptr) {
+    Py_DECREF(py_ptr);
+    Py_DECREF(py_result);
+    PyGILState_Release(gstate);
+    std::cerr << "ERROR: malloc failed for p_memHandle in my_free."
+              << std::endl;
+    return;
+  }
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  if (chunk_sizes == nullptr) {
+    free(p_memHandle);
+    Py_DECREF(py_ptr);
+    Py_DECREF(py_result);
+    PyGILState_Release(gstate);
+    std::cerr << "ERROR: malloc failed for chunk_sizes in my_free."
+              << std::endl;
+    return;
+  }
+  for (Py_ssize_t i = 0; i < num_chunks; ++i) {
+    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
+    PyObject* addr_py = PyTuple_GetItem(item, 0);
+    PyObject* size_py = PyTuple_GetItem(item, 1);
+    p_memHandle[i] =
+        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
+    chunk_sizes[i] = (unsigned long long)PyLong_AsUnsignedLongLong(size_py);
+  }
 
-  // recv_size == size
-  // recv_device == device
+  // Drop temporary Python refs, then release the GIL before calling into
+  // non-Python APIs.
+  Py_DECREF(py_ptr);
+  Py_DECREF(py_result);
+  PyGILState_Release(gstate);
 
-  // Free memory
+  unmap_and_release(device, size, d_mem, p_memHandle, chunk_sizes, num_chunks);
+#else
+  // Non-ROCm path: simple integer handle already extracted; drop temporary
+  // Python refs while still holding the GIL, then release it.
+  Py_DECREF(py_ptr);
+  Py_DECREF(py_result);
+  PyGILState_Release(gstate);
 
-  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
   CUmemGenericAllocationHandle* p_memHandle =
       (CUmemGenericAllocationHandle*)recv_p_memHandle;
   unmap_and_release(device, size, d_mem, p_memHandle);
+#endif
 
   // free address and the handle
   CUDA_CHECK(cuMemAddressFree(d_mem, size));
-  if (error_code != 0) {
-    return;
+#ifndef USE_ROCM
+  free(p_memHandle);
+#else
+  for (auto i = 0; i < num_chunks; ++i) {
+    free(p_memHandle[i]);
   }
   free(p_memHandle);
+  free(chunk_sizes);
+#endif
 }
 
 // ---------------------------------------------------------------------------
@@ -271,19 +541,87 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
   }
 
   unsigned long long recv_device, recv_size;
-  unsigned long long recv_d_mem, recv_p_memHandle;
+  unsigned long long recv_d_mem;
+#ifndef USE_ROCM
+  unsigned long long recv_p_memHandle;
+#else
+  PyObject* recv_p_memHandle;
+#endif
   // Unpack the tuple into four C integers
-  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
-                        &recv_p_memHandle)) {
+  if (!PyArg_ParseTuple(args, PYARGS_PARSE, &recv_device, &recv_size,
+                        &recv_d_mem, &recv_p_memHandle)) {
     // PyArg_ParseTuple sets an error if it fails
     return nullptr;
   }
 
   CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
+#ifndef USE_ROCM
   CUmemGenericAllocationHandle* p_memHandle =
       (CUmemGenericAllocationHandle*)recv_p_memHandle;
 
   unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
+#else
+  if (!PyList_Check(recv_p_memHandle)) {
+    PyErr_SetString(PyExc_TypeError,
+                    "Expected a list for the 4th argument on ROCm");
+    return nullptr;
+  }
+  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
+  if (num_chunks < 0) {
+    return nullptr;  // PyList_Size sets an exception on error.
+  }
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  if (p_memHandle == nullptr) {
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for p_memHandle");
+    return nullptr;
+  }
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  if (chunk_sizes == nullptr) {
+    free(p_memHandle);
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for chunk_sizes");
+    return nullptr;
+  }
+  for (Py_ssize_t i = 0; i < num_chunks; ++i) {
+    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
+    if (item == nullptr || !PyTuple_Check(item) || PyTuple_Size(item) != 2) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      PyErr_SetString(
+          PyExc_TypeError,
+          "List items must be tuples of size 2 (handle_addr, size)");
+      return nullptr;
+    }
+    PyObject* addr_py = PyTuple_GetItem(item, 0);
+    PyObject* size_py = PyTuple_GetItem(item, 1);
+    if (addr_py == nullptr || size_py == nullptr) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;  // PyTuple_GetItem sets an exception
+    }
+    p_memHandle[i] =
+        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
+    if (PyErr_Occurred()) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;
+    }
+    chunk_sizes[i] = (unsigned long long)PyLong_AsUnsignedLongLong(size_py);
+    if (PyErr_Occurred()) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;
+    }
+  }
+
+  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle, chunk_sizes,
+                    num_chunks);
+
+  free(p_memHandle);
+  free(chunk_sizes);
+#endif
 
   if (error_code != 0) {
     error_code = no_error;
@@ -301,19 +639,56 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
   }
 
   unsigned long long recv_device, recv_size;
-  unsigned long long recv_d_mem, recv_p_memHandle;
+  unsigned long long recv_d_mem;
+#ifndef USE_ROCM
+  unsigned long long recv_p_memHandle;
+#else
+  PyObject* recv_p_memHandle;
+#endif
   // Unpack the tuple into four C integers
-  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
-                        &recv_p_memHandle)) {
+  if (!PyArg_ParseTuple(args, PYARGS_PARSE, &recv_device, &recv_size,
+                        &recv_d_mem, &recv_p_memHandle)) {
     // PyArg_ParseTuple sets an error if it fails
     return nullptr;
   }
 
   CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
+#ifndef USE_ROCM
   CUmemGenericAllocationHandle* p_memHandle =
       (CUmemGenericAllocationHandle*)recv_p_memHandle;
 
   create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
+#else
+  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  if (p_memHandle == nullptr) {
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for p_memHandle");
+    return nullptr;
+  }
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  if (chunk_sizes == nullptr) {
+    free(p_memHandle);
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for chunk_sizes");
+    return nullptr;
+  }
+  for (auto i = 0; i < num_chunks; ++i) {
+    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
+    PyObject* addr_py = PyTuple_GetItem(item, 0);
+    PyObject* size_py = PyTuple_GetItem(item, 1);
+    p_memHandle[i] =
+        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
+    chunk_sizes[i] = PyLong_AsUnsignedLongLong(size_py);
+  }
+
+  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle, chunk_sizes,
+                 num_chunks);
+
+  free(p_memHandle);
+  free(chunk_sizes);
+#endif
 
   if (error_code != 0) {
     error_code = no_error;
diff --git a/csrc/cumem_allocator_compat.h b/csrc/cumem_allocator_compat.h
new file mode 100644
index 0000000000000000000000000000000000000000..74f4bc9eeadf40a816dbe091f9379d43b6f94b58
--- /dev/null
+++ b/csrc/cumem_allocator_compat.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#ifdef USE_ROCM
+////////////////////////////////////////
+// For compatibility with CUDA and ROCm
+////////////////////////////////////////
+  #include <hip/hip_runtime_api.h>
+
+extern "C" {
+  #ifndef CUDA_SUCCESS
+    #define CUDA_SUCCESS hipSuccess
+  #endif  // CUDA_SUCCESS
+
+// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html
+typedef unsigned long long CUdevice;
+typedef hipDeviceptr_t CUdeviceptr;
+typedef hipError_t CUresult;
+typedef hipCtx_t CUcontext;
+typedef hipStream_t CUstream;
+typedef hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle;
+typedef hipMemAllocationGranularity_flags CUmemAllocationGranularity_flags;
+typedef hipMemAllocationProp CUmemAllocationProp;
+typedef hipMemAccessDesc CUmemAccessDesc;
+
+  #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+  #define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+  #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+  #define CU_MEM_ALLOC_GRANULARITY_MINIMUM hipMemAllocationGranularityMinimum
+
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
+  #define CU_MEM_ALLOCATION_COMP_NONE 0x0
+
+// Error Handling
+// https://docs.nvidia.com/cuda/archive/11.4.4/cuda-driver-api/group__CUDA__ERROR.html
+CUresult cuGetErrorString(CUresult hipError, const char** pStr) {
+  *pStr = hipGetErrorString(hipError);
+  return CUDA_SUCCESS;
+}
+
+// Context Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html
+CUresult cuCtxGetCurrent(CUcontext* ctx) {
+  // This API is deprecated on the AMD platform, only for equivalent cuCtx
+  // driver API on the NVIDIA platform.
+  return hipCtxGetCurrent(ctx);
+}
+
+CUresult cuCtxSetCurrent(CUcontext ctx) {
+  // This API is deprecated on the AMD platform, only for equivalent cuCtx
+  // driver API on the NVIDIA platform.
+  return hipCtxSetCurrent(ctx);
+}
+
+// Primary Context Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html
+CUresult cuDevicePrimaryCtxRetain(CUcontext* ctx, CUdevice dev) {
+  return hipDevicePrimaryCtxRetain(ctx, dev);
+}
+
+// Virtual Memory Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html
+CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) {
+  return hipMemAddressFree(ptr, size);
+}
+
+CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment,
+                             CUdeviceptr addr, unsigned long long flags) {
+  return hipMemAddressReserve(ptr, size, alignment, addr, flags);
+}
+
+CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size,
+                     const CUmemAllocationProp* prop,
+                     unsigned long long flags) {
+  return hipMemCreate(handle, size, prop, flags);
+}
+
+CUresult cuMemGetAllocationGranularity(
+    size_t* granularity, const CUmemAllocationProp* prop,
+    CUmemAllocationGranularity_flags option) {
+  return hipMemGetAllocationGranularity(granularity, prop, option);
+}
+
+CUresult cuMemMap(CUdeviceptr dptr, size_t size, size_t offset,
+                  CUmemGenericAllocationHandle handle,
+                  unsigned long long flags) {
+  return hipMemMap(dptr, size, offset, handle, flags);
+}
+
+CUresult cuMemRelease(CUmemGenericAllocationHandle handle) {
+  return hipMemRelease(handle);
+}
+
+CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
+                        const CUmemAccessDesc* desc, size_t count) {
+  return hipMemSetAccess(ptr, size, desc, count);
+}
+
+CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) {
+  return hipMemUnmap(ptr, size);
+}
+}  // extern "C"
+
+#else
+////////////////////////////////////////
+// Import CUDA headers for NVIDIA GPUs
+////////////////////////////////////////
+  #include <cuda_runtime_api.h>
+  #include <cuda.h>
+#endif
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index 1dd7101acc27ded603cd1999104e599571345308..34fb64c413db2bfb07f532b554d98567072391dd 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
-from typing import Union
 
 from cutlass_library import *
 
@@ -22,31 +21,31 @@ class MixedInputKernelScheduleType(enum.Enum):
     TmaWarpSpecializedCooperative = enum_auto()
 
 
-VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
     **DataTypeNames,  # type: ignore
     **{
         VLLMDataType.u4b8: "u4b8",
         VLLMDataType.u8b128: "u8b128",
-    }
+    },
 }
 
-VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
     **DataTypeTag,  # type: ignore
     **{
         VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
         VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
-    }
+    },
 }
 
-VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
     **DataTypeSize,  # type: ignore
     **{
         VLLMDataType.u4b8: 4,
         VLLMDataType.u8b128: 8,
-    }
+    },
 }
 
-VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
     VLLMDataType.u4b8: "vllm::kU4B8",
     VLLMDataType.u8b128: "vllm::kU8B128",
     DataType.u4: "vllm::kU4",
@@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     DataType.bf16: "vllm::kBfloat16",
 }
 
-VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
     DataType.u8: "at::ScalarType::Byte",
     DataType.s8: "at::ScalarType::Char",
     DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@@ -67,15 +66,11 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     DataType.f32: "at::ScalarType::Float",
 }
 
-VLLMKernelScheduleTag: dict[Union[
-    MixedInputKernelScheduleType, KernelScheduleType], str] = {
-        **KernelScheduleTag,  # type: ignore
-        **{
-            MixedInputKernelScheduleType.TmaWarpSpecialized:
-            "cutlass::gemm::KernelTmaWarpSpecialized",
-            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
-            "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
-            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
-            "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
-        }
-    }
+VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
+    **KernelScheduleTag,  # type: ignore
+    **{
+        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
+    },
+}
diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
index 995374a50b037df4bc5a31057fd3ad1857f3201d..9ae0ed975edde1931dc4bac22df41992a4290276 100644
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -88,3 +88,32 @@
 #define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(                                              \
       TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
+
+#define VLLM_DISPATCH_VEC_SIZE(VEC_SIZE, ...) \
+  switch (VEC_SIZE) {                         \
+    case 16: {                                \
+      constexpr int vec_size = 16;            \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    case 8: {                                 \
+      constexpr int vec_size = 8;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    case 4: {                                 \
+      constexpr int vec_size = 4;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    case 2: {                                 \
+      constexpr int vec_size = 2;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    default: {                                \
+      constexpr int vec_size = 1;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+  }
diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..baff8363162efa2864a8ea30ae987da78628d281
--- /dev/null
+++ b/csrc/fused_qknorm_rope_kernel.cu
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cmath>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+#include <torch/cuda.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+#include "type_convert.cuh"
+
+#define CHECK_TYPE(x, st)                                              \
+  TORCH_CHECK(x.scalar_type() == st, #x " dtype is ", x.scalar_type(), \
+              ", while ", st, " is expected")
+#define CHECK_TH_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_TH_CUDA(x);    \
+  CHECK_CONTIGUOUS(x)
+
+#ifdef USE_ROCM
+  #define FINAL_MASK 0xffffffffffffffffULL
+
+  #if defined(HIP_VERSION) && HIP_VERSION < 70000000
+// On ROCm versions before 7.0, __syncwarp isn't defined. The below
+// implementation is copy/pasted from the implementation in ROCm 7.0
+__device__ inline void __syncwarp() {
+  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
+  __builtin_amdgcn_wave_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
+}
+  #endif
+#else
+  #define FINAL_MASK 0xffffffff
+#endif
+
+namespace tensorrt_llm::common {
+template <typename T, int num>
+struct packed_as;
+// Specialization for packed_as used in this kernel.
+template <>
+struct packed_as<uint, 1> {
+  using type = uint;
+};
+
+template <>
+struct packed_as<uint, 2> {
+  using type = uint2;
+};
+
+template <>
+struct packed_as<uint, 4> {
+  using type = uint4;
+};
+
+template <typename T>
+__inline__ __device__ T warpReduceSum(T val) {
+#pragma unroll
+  for (int mask = 16; mask > 0; mask >>= 1)
+    val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
+  return val;
+}
+
+template <typename T>
+inline __device__ __host__ T divUp(T m, T n) {
+  return (m + n - 1) / n;
+}
+
+}  // namespace tensorrt_llm::common
+
+namespace tensorrt_llm::kernels {
+// NOTE(zhuhaoran): This kernel is adapted from TensorRT-LLM implementation,
+// with added support for passing the cos_sin_cache as an input.
+// https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu
+
+// Perform per-head QK Norm and RoPE in a single kernel.
+// scalar_t_in: data type of QKV and RMSNorm weights
+// scalar_t_cache: data type of cos/sin cache
+// head_dim: the dimension of each head
+// interleave: interleave=!is_neox.
+template <typename scalar_t_in, typename scalar_t_cache, int head_dim,
+          bool interleave>
+__global__ void fusedQKNormRopeKernel(
+    void* qkv_void,                  // Combined QKV tensor
+    int const num_heads_q,           // Number of query heads
+    int const num_heads_k,           // Number of key heads
+    int const num_heads_v,           // Number of value heads
+    float const eps,                 // Epsilon for RMS normalization
+    void const* q_weight_void,       // RMSNorm weights for query
+    void const* k_weight_void,       // RMSNorm weights for key
+    void const* cos_sin_cache_void,  // Pre-computed cos/sin cache
+    int64_t const* position_ids,     // Position IDs for RoPE
+    int const num_tokens             // Number of tokens
+) {
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
+  if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
+                std::is_same_v<scalar_t_cache, c10::BFloat16>) {
+    return;
+  } else {
+#endif
+
+    using Converter = vllm::_typeConvert<scalar_t_in>;
+    static_assert(Converter::exists,
+                  "Input QKV data type is not supported for this CUDA "
+                  "architecture or toolkit version.");
+    using T_in = typename Converter::hip_type;
+    using T2_in = typename Converter::packed_hip_type;
+
+    using CacheConverter = vllm::_typeConvert<scalar_t_cache>;
+    static_assert(CacheConverter::exists,
+                  "Cache data type is not supported for this CUDA architecture "
+                  "or toolkit version.");
+    using T_cache = typename CacheConverter::hip_type;
+
+    T_in* qkv = reinterpret_cast<T_in*>(qkv_void);
+    T_in const* q_weight = reinterpret_cast<T_in const*>(q_weight_void);
+    T_in const* k_weight = reinterpret_cast<T_in const*>(k_weight_void);
+    T_cache const* cos_sin_cache =
+        reinterpret_cast<T_cache const*>(cos_sin_cache_void);
+
+    int const warpsPerBlock = blockDim.x / 32;
+    int const warpId = threadIdx.x / 32;
+    int const laneId = threadIdx.x % 32;
+
+    // Calculate global warp index to determine which head/token this warp
+    // processes
+    int const globalWarpIdx = blockIdx.x * warpsPerBlock + warpId;
+
+    // Total number of attention heads (Q and K)
+    int const total_qk_heads = num_heads_q + num_heads_k;
+
+    // Determine which token and head type (Q or K) this warp processes
+    int const tokenIdx = globalWarpIdx / total_qk_heads;
+    int const localHeadIdx = globalWarpIdx % total_qk_heads;
+
+    // Skip if this warp is assigned beyond the number of tokens
+    if (tokenIdx >= num_tokens) return;
+
+    bool const isQ = localHeadIdx < num_heads_q;
+    int const headIdx = isQ ? localHeadIdx : localHeadIdx - num_heads_q;
+
+    int const num_heads = num_heads_q + num_heads_k + num_heads_v;
+
+    static_assert(head_dim % (32 * 2) == 0,
+                  "head_dim must be divisible by 64 (each warp processes one "
+                  "head, and each thread gets even number of "
+                  "elements)");
+    constexpr int numElemsPerThread = head_dim / 32;
+    float elements[numElemsPerThread];
+    constexpr int elemSizeBytes = numElemsPerThread * sizeof(__nv_bfloat16);
+    static_assert(elemSizeBytes % 4 == 0,
+                  "numSizeBytes must be a multiple of 4");
+    constexpr int vecSize =
+        elemSizeBytes /
+        4;  // Use packed_as<uint, vecSize> to perform loading/saving.
+    using vec_T = typename tensorrt_llm::common::packed_as<uint, vecSize>::type;
+
+    int offsetWarp;  // Offset for the warp
+    if (isQ) {
+      // Q segment: token offset + head offset within Q segment
+      offsetWarp = tokenIdx * num_heads * head_dim + headIdx * head_dim;
+    } else {
+      // K segment: token offset + entire Q segment + head offset within K
+      // segment
+      offsetWarp = tokenIdx * num_heads * head_dim + num_heads_q * head_dim +
+                   headIdx * head_dim;
+    }
+    int offsetThread = offsetWarp + laneId * numElemsPerThread;
+
+    // Sum of squares for RMSNorm
+    float sumOfSquares = 0.0f;
+
+    // Load.
+    {
+      vec_T vec = *reinterpret_cast<vec_T const*>(&qkv[offsetThread]);
+      constexpr int num_packed_elems = elemSizeBytes / sizeof(T2_in);
+#pragma unroll
+      for (int i = 0; i < num_packed_elems; i++) {
+        // Interpret the generic vector chunk as the specific packed type
+        T2_in packed_val = *(reinterpret_cast<T2_in*>(&vec) + i);
+        // Convert to float2 for computation
+        float2 vals = Converter::convert(packed_val);
+        sumOfSquares += vals.x * vals.x;
+        sumOfSquares += vals.y * vals.y;
+
+        elements[2 * i] = vals.x;
+        elements[2 * i + 1] = vals.y;
+      }
+    }
+
+    // Reduce sum across warp using the utility function
+    sumOfSquares = tensorrt_llm::common::warpReduceSum(sumOfSquares);
+
+    // Compute RMS normalization factor
+    float rms_rcp = rsqrtf(sumOfSquares / static_cast<float>(head_dim) + eps);
+
+    // Normalize elements
+#pragma unroll
+    for (int i = 0; i < numElemsPerThread; i++) {
+      int dim = laneId * numElemsPerThread + i;
+      float weight = isQ ? Converter::convert(q_weight[dim])
+                         : Converter::convert(k_weight[dim]);
+      elements[i] *= rms_rcp * weight;
+    }
+
+    // Apply RoPE to normalized elements
+    float elements2[numElemsPerThread];  // Additional buffer required for RoPE.
+
+    int64_t pos_id = position_ids[tokenIdx];
+
+    // Calculate cache pointer for this position - similar to
+    // pos_encoding_kernels.cu
+    T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
+    int const embed_dim = head_dim / 2;
+    T_cache const* cos_ptr = cache_ptr;
+    T_cache const* sin_ptr = cache_ptr + embed_dim;
+
+    if constexpr (interleave) {
+      // Perform interleaving. Use pre-computed cos/sin values.
+#pragma unroll
+      for (int i = 0; i < numElemsPerThread / 2; ++i) {
+        int const idx0 = 2 * i;
+        int const idx1 = 2 * i + 1;
+
+        float const val0 = elements[idx0];
+        float const val1 = elements[idx1];
+
+        int const dim_idx = laneId * numElemsPerThread + idx0;
+        int const half_dim = dim_idx / 2;
+        float const cos_val =
+            CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+        float const sin_val =
+            CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+        elements[idx0] = val0 * cos_val - val1 * sin_val;
+        elements[idx1] = val0 * sin_val + val1 * cos_val;
+      }
+    } else {
+      // Before data exchange with in warp, we need to sync.
+      __syncwarp();
+      // Get the data from the other half of the warp. Use pre-computed cos/sin
+      // values.
+#pragma unroll
+      for (int i = 0; i < numElemsPerThread; i++) {
+        elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
+        if (laneId < 16) {
+          elements2[i] = -elements2[i];
+        }
+
+        int dim_idx = laneId * numElemsPerThread + i;
+        dim_idx = (dim_idx * 2) % head_dim;
+        int half_dim = dim_idx / 2;
+        // Use pre-computed cos/sin from cache
+        float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+        float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+        elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
+      }
+      // __shfl_xor_sync does not provide memfence. Need to sync again.
+      __syncwarp();
+    }
+
+    // Store.
+    {
+      vec_T vec;
+      constexpr int num_packed_elems = elemSizeBytes / sizeof(T2_in);
+#pragma unroll
+      for (int i = 0; i < num_packed_elems; i++) {
+        // Convert from float2 back to the specific packed type
+        T2_in packed_val = Converter::convert(
+            make_float2(elements[2 * i], elements[2 * i + 1]));
+        // Place it into the generic vector
+        *(reinterpret_cast<T2_in*>(&vec) + i) = packed_val;
+      }
+      *reinterpret_cast<vec_T*>(&qkv[offsetThread]) = vec;
+    }
+
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
+  }
+#endif
+}
+
+// Borrowed from
+// https://github.com/flashinfer-ai/flashinfer/blob/8125d079a43e9a0ba463a4ed1b639cefd084cec9/include/flashinfer/pos_enc.cuh#L568
+#define DISPATCH_INTERLEAVE(interleave, INTERLEAVE, ...) \
+  if (interleave) {                                      \
+    const bool INTERLEAVE = true;                        \
+    __VA_ARGS__                                          \
+  } else {                                               \
+    const bool INTERLEAVE = false;                       \
+    __VA_ARGS__                                          \
+  }
+
+template <typename scalar_t_in, typename scalar_t_cache>
+void launchFusedQKNormRope(void* qkv, int const num_tokens,
+                           int const num_heads_q, int const num_heads_k,
+                           int const num_heads_v, int const head_dim,
+                           float const eps, void const* q_weight,
+                           void const* k_weight, void const* cos_sin_cache,
+                           bool const interleave, int64_t const* position_ids,
+                           cudaStream_t stream) {
+  constexpr int blockSize = 256;
+
+  int const warpsPerBlock = blockSize / 32;
+  int const totalQKHeads = num_heads_q + num_heads_k;
+  int const totalWarps = num_tokens * totalQKHeads;
+
+  int const gridSize = common::divUp(totalWarps, warpsPerBlock);
+  dim3 gridDim(gridSize);
+  dim3 blockDim(blockSize);
+
+  switch (head_dim) {
+    case 64:
+      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
+            <<<gridDim, blockDim, 0, stream>>>(
+                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
+                k_weight, cos_sin_cache, position_ids, num_tokens);
+      });
+      break;
+    case 128:
+      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
+            <<<gridDim, blockDim, 0, stream>>>(
+                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
+                k_weight, cos_sin_cache, position_ids, num_tokens);
+      });
+      break;
+    case 256:
+      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
+            <<<gridDim, blockDim, 0, stream>>>(
+                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
+                k_weight, cos_sin_cache, position_ids, num_tokens);
+      });
+      break;
+    default:
+      TORCH_CHECK(false,
+                  "Unsupported head dimension for fusedQKNormRope: ", head_dim);
+  }
+}
+}  // namespace tensorrt_llm::kernels
+
+void fused_qk_norm_rope(
+    torch::Tensor& qkv,       // Combined QKV tensor [num_tokens,
+                              // (num_heads_q+num_heads_k+num_heads_v)*head_dim]
+    int64_t num_heads_q,      // Number of query heads
+    int64_t num_heads_k,      // Number of key heads
+    int64_t num_heads_v,      // Number of value heads
+    int64_t head_dim,         // Dimension per head
+    double eps,               // Epsilon for RMS normalization
+    torch::Tensor& q_weight,  // RMSNorm weights for query [head_dim]
+    torch::Tensor& k_weight,  // RMSNorm weights for key [head_dim]
+    torch::Tensor& cos_sin_cache,  // Cos/sin cache [max_position, head_dim]
+    bool is_neox,                  // Whether RoPE is applied in Neox style
+    torch::Tensor& position_ids    // Position IDs for RoPE [num_tokens]
+) {
+  // Input validation
+  CHECK_INPUT(qkv);
+  CHECK_INPUT(position_ids);
+  CHECK_INPUT(q_weight);
+  CHECK_INPUT(k_weight);
+  CHECK_INPUT(cos_sin_cache);
+  CHECK_TYPE(position_ids, torch::kInt64);
+
+  TORCH_CHECK(qkv.dim() == 2,
+              "QKV tensor must be 2D: [num_tokens, "
+              "(num_heads_q+num_heads_k+num_heads_v)*head_dim]");
+  TORCH_CHECK(position_ids.dim() == 1, "Position IDs must be 1D: [num_tokens]");
+  TORCH_CHECK(q_weight.dim() == 1, "Query weights must be 1D: [head_dim]");
+  TORCH_CHECK(k_weight.dim() == 1, "Key weights must be 1D: [head_dim]");
+  TORCH_CHECK(cos_sin_cache.dim() == 2,
+              "Cos/sin cache must be 2D: [max_position, head_dim]");
+  TORCH_CHECK(q_weight.size(0) == head_dim,
+              "Query weights size must match head dimension");
+  TORCH_CHECK(k_weight.size(0) == head_dim,
+              "Key weights size must match head dimension");
+  TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
+              "Cos/sin cache dimension must match head_dim");
+  TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
+                  qkv.scalar_type() == k_weight.scalar_type(),
+              "qkv, q_weight and k_weight must have the same dtype");
+
+  int64_t num_tokens = qkv.size(0);
+  TORCH_CHECK(position_ids.size(0) == num_tokens,
+              "Number of tokens in position_ids must match QKV");
+
+  int64_t total_heads = num_heads_q + num_heads_k + num_heads_v;
+  TORCH_CHECK(
+      qkv.size(1) == total_heads * head_dim,
+      "QKV tensor size must match total number of heads and head dimension");
+
+  auto stream = at::cuda::getCurrentCUDAStream(qkv.get_device());
+
+  VLLM_DISPATCH_HALF_TYPES(qkv.scalar_type(), "fused_qk_norm_rope_kernel", [&] {
+    using qkv_scalar_t = scalar_t;
+    VLLM_DISPATCH_FLOATING_TYPES(
+        cos_sin_cache.scalar_type(), "fused_qk_norm_rope_kernel", [&] {
+          using cache_scalar_t = scalar_t;
+          tensorrt_llm::kernels::launchFusedQKNormRope<qkv_scalar_t,
+                                                       cache_scalar_t>(
+              qkv.data_ptr(), static_cast<int>(num_tokens),
+              static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
+              static_cast<int>(num_heads_v), static_cast<int>(head_dim),
+              static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
+              cos_sin_cache.data_ptr(), !is_neox,
+              reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
+              stream);
+        });
+  });
+}
\ No newline at end of file
diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h
index d5a89690111bc007c093361f93bcd20fd8bbcccc..92d7ef802f97f26e49c41b154732ae8b4208aa05 100644
--- a/csrc/launch_bounds_utils.h
+++ b/csrc/launch_bounds_utils.h
@@ -8,11 +8,37 @@
   #define VLLM_LAUNCH_BLOCKS_CAP 4
 #endif
 
-// compile-time estimate of max threads per SM for launch bounds.
+// Compile-time estimate of max threads per SM for launch bounds.
+// Families: 1024, 1536, 2048 threads/SM.
 #ifndef VLLM_MAX_THREADS_PER_SM
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
-    #define VLLM_MAX_THREADS_PER_SM 1536
+  #ifdef __CUDA_ARCH__
+
+    /* 1024 thr/SM: Turing (sm_75) */
+    #if (__CUDA_ARCH__ == 750)
+      #define VLLM_MAX_THREADS_PER_SM 1024
+
+    /* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
+        GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
+    #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
+        (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) ||  \
+        (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
+        (__CUDA_ARCH__ == 1210)
+      #define VLLM_MAX_THREADS_PER_SM 1536
+
+    /* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
+        Hopper (sm_90), Blackwell (sm_100/103) */
+    #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
+        (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) ||   \
+        (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
+      #define VLLM_MAX_THREADS_PER_SM 2048
+
+    /* Fallback: use 2048 for unknown future CCs */
+    #else
+      #define VLLM_MAX_THREADS_PER_SM 2048
+    #endif
+
   #else
+  /* Host pass (no __CUDA_ARCH__): neutral default */
     #define VLLM_MAX_THREADS_PER_SM 2048
   #endif
 #endif
diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index 93c73d58390e1eda930fd0bac8f7416497d7baf9..48771e4b3aff9def9ff972bc8638fab4b061dcc6 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -1,6 +1,8 @@
 #include "type_convert.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
+#include "core/batch_invariant.hpp"
+#include "quantization/vectorization_utils.cuh"
 
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -8,7 +10,7 @@
 namespace vllm {
 
 // TODO(woosuk): Further optimize this kernel.
-template <typename scalar_t>
+template <typename scalar_t, int VEC_SIZE>
 __global__ void rms_norm_kernel(
     scalar_t* __restrict__ out,          // [..., hidden_size]
     const scalar_t* __restrict__ input,  // [..., hidden_size]
@@ -17,11 +19,21 @@ __global__ void rms_norm_kernel(
     const float epsilon, const int num_tokens, const int hidden_size) {
   __shared__ float s_variance;
   float variance = 0.0f;
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float x = static_cast<float>(vec.val[i]);
+      variance += x * x;
+    }
+  };
+  auto scalar_op = [&variance](const scalar_t& val) {
+    float x = static_cast<float>(val);
     variance += x * x;
-  }
+  };
+  vllm::vectorize_read_with_alignment<VEC_SIZE>(
+      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
@@ -32,10 +44,20 @@ __global__ void rms_norm_kernel(
   }
   __syncthreads();
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * input_stride + idx];
-    out[blockIdx.x * hidden_size + idx] =
-        ((scalar_t)(x * s_variance)) * weight[idx];
+  scalar_t* out_row = out + blockIdx.x * hidden_size;
+  auto* v_in = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(input_row);
+  auto* v_w = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(weight);
+  auto* v_out = reinterpret_cast<vec_n_t<scalar_t, VEC_SIZE>*>(out_row);
+  for (int i = threadIdx.x; i < hidden_size / VEC_SIZE; i += blockDim.x) {
+    vec_n_t<scalar_t, VEC_SIZE> dst;
+    vec_n_t<scalar_t, VEC_SIZE> src1 = v_in[i];
+    vec_n_t<scalar_t, VEC_SIZE> src2 = v_w[i];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; j++) {
+      float x = static_cast<float>(src1.val[j]);
+      dst.val[j] = ((scalar_t)(x * s_variance)) * src2.val[j];
+    }
+    v_out[i] = dst;
   }
 }
 
@@ -135,211 +157,6 @@ fused_add_rms_norm_kernel(
   }
 }
 
-/* Function specialization in the case of FP16/BF16 tensors.
-   Additional optimizations we can make in this case are
-   packed and vectorized operations, which help with the
-   memory latency bottleneck.
-
-   _f16VecPN struct extends _f16Vec to add operations specifically required for
-   polynomial normalization (poly norm).
-   The original _f16Vec does not include the sum-of-powers computation or
-   in-place polynomial normalization logic. */
-template <typename scalar_t, int width>
-struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
-  using Base = _f16Vec<scalar_t, width>;
-  using Converter = typename Base::Converter;
-  using T1 = typename Base::T1;
-  using T2 = typename Base::T2;
-  using Base::data;
-
-  __device__ auto sum_pows() const {
-    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < width; i += 2) {
-      float2 z = Converter::convert(T2{data[i], data[i + 1]});
-      float x2 = z.x * z.x;
-      float x4 = x2 * x2;
-      float x6 = x4 * x2;
-
-      float y2 = z.y * z.y;
-      float y4 = y2 * y2;
-      float y6 = y4 * y2;
-
-      s2 += x2 + y2;
-      s4 += x4 + y4;
-      s6 += x6 + y6;
-    }
-    return std::make_tuple(s2, s4, s6);
-  }
-
-  __device__ void poly_norm_inplace(const float w2_inv_std,
-                                    const float w1_inv_std2,
-                                    const float w0_inv_std3, const float bias) {
-#pragma unroll
-    for (int i = 0; i < width; i += 2) {
-      float2 z = Converter::convert(T2{data[i], data[i + 1]});
-
-      float x2 = z.x * z.x;
-      float x3 = x2 * z.x;
-      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
-
-      float y2 = z.y * z.y;
-      float y3 = y2 * z.y;
-      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
-
-      auto out = Converter::convert(z);
-      data[i] = out.x;
-      data[i + 1] = out.y;
-    }
-  }
-};
-
-template <typename scalar_t, int width>
-__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
-poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 const scalar_t* __restrict__ input,   // [..., hidden_size]
-                 const scalar_t* __restrict__ weight,  // [3]
-                 const scalar_t* __restrict__ bias,    // [1]
-                 const float epsilon, const int hidden_size) {
-  // Sanity checks on our vector struct and type-punned pointer arithmetic
-  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
-  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
-
-  /* These and the argument pointers are all declared `restrict` as they are
-     not aliased in practice. Argument pointers should not be dereferenced
-     in this kernel as that would be undefined behavior */
-  auto* __restrict__ input_v =
-      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
-  const int vec_hidden_size = hidden_size / width;
-  float variance = 0.0f;
-  float variance2 = 0.0f;
-  float variance3 = 0.0f;
-
-  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16VecPN<scalar_t, width> temp = input_v[id];
-    auto [x2, x4, x6] = temp.sum_pows();
-
-    variance += x2;
-    variance2 += x4;
-    variance3 += x6;
-  }
-
-  float3 thread_variances = make_float3(variance, variance2, variance3);
-
-  struct SumOp {
-    __device__ float3 operator()(const float3& a, const float3& b) const {
-      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-  };
-
-  using BlockReduce = cub::BlockReduce<float3, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  float3 block_variances =
-      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
-
-  variance = block_variances.x;
-  variance2 = block_variances.y;
-  variance3 = block_variances.z;
-
-  __shared__ float s_w2_inv_std;
-  __shared__ float s_w1_inv_std2;
-  __shared__ float s_w0_inv_std3;
-  __shared__ float s_bias;
-
-  if (threadIdx.x == 0) {
-    float w0 = (float)weight[0];
-    float w1 = (float)weight[1];
-    float w2 = (float)weight[2];
-    s_bias = (float)bias[0];
-
-    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
-    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
-    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
-
-  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16VecPN<scalar_t, width> temp = input_v[id];
-    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
-    out_v[id] = temp;
-  }
-}
-
-/* Generic poly_norm_kernel
-   The width field is not used here but necessary for other specializations.
- */
-template <typename scalar_t, int width>
-__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
-poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 const scalar_t* __restrict__ input,   // [..., hidden_size]
-                 const scalar_t* __restrict__ weight,  // [3]
-                 const scalar_t* __restrict__ bias,    // [1]
-                 const float epsilon, const int hidden_size) {
-  float variance = 0.0f;
-  float variance2 = 0.0f;
-  float variance3 = 0.0f;
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    float x2 = x * x;
-    float x4 = x2 * x2;
-    float x6 = x4 * x2;
-
-    variance += x2;
-    variance2 += x4;
-    variance3 += x6;
-  }
-
-  float3 thread_variances = make_float3(variance, variance2, variance3);
-
-  struct SumOp {
-    __device__ float3 operator()(const float3& a, const float3& b) const {
-      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-  };
-
-  using BlockReduce = cub::BlockReduce<float3, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  float3 block_variances =
-      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
-
-  variance = block_variances.x;
-  variance2 = block_variances.y;
-  variance3 = block_variances.z;
-
-  __shared__ float s_w2_inv_std;
-  __shared__ float s_w1_inv_std2;
-  __shared__ float s_w0_inv_std3;
-  __shared__ float s_bias;
-
-  if (threadIdx.x == 0) {
-    float w0 = (float)weight[0];
-    float w1 = (float)weight[1];
-    float w2 = (float)weight[2];
-    s_bias = (float)bias[0];
-
-    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
-    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
-    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    float x2 = x * x;
-    float x3 = x2 * x;
-
-    out[blockIdx.x * hidden_size + idx] =
-        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
-                   s_bias);
-  }
-}
-
 }  // namespace vllm
 
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
@@ -351,18 +168,34 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
   TORCH_CHECK(weight.is_contiguous());
 
   int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-  int64_t input_stride = input.stride(-2);
 
+  // We cannot just use `input.stride(-2)` if the tensor is not row-major.
+  // Instead, we use a 2d view to get the second-innermost stride.
+  // That way the dimensions (except the last one) can be arbitrarily permuted.
+  torch::Tensor input_view = input.view({-1, hidden_size});
+
+  int num_tokens = input_view.numel() / hidden_size;
+  int64_t input_stride = input_view.stride(-2);
+
+  // For large num_tokens, use smaller blocks to increase SM concurrency.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
   dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
-    vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
-        weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
-  });
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input_view.scalar_type(), "rms_norm_kernel", [&] {
+        const int calculated_vec_size =
+            std::gcd(16 / sizeof(scalar_t), hidden_size);
+        const int block_size =
+            std::min(hidden_size / calculated_vec_size, max_block_size);
+        dim3 block(block_size);
+        VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] {
+          vllm::rms_norm_kernel<scalar_t, vec_size><<<grid, block, 0, stream>>>(
+              out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
+              input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
+              hidden_size);
+        });
+      });
 }
 
 #define LAUNCH_FUSED_ADD_RMS_NORM(width)                                    \
@@ -379,6 +212,8 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                         torch::Tensor& residual,  // [..., hidden_size]
                         torch::Tensor& weight,    // [hidden_size]
                         double epsilon) {
+  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
+  TORCH_CHECK(input.scalar_type() == residual.scalar_type());
   TORCH_CHECK(residual.is_contiguous());
   TORCH_CHECK(weight.is_contiguous());
   int hidden_size = input.size(-1);
@@ -413,55 +248,11 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                           wt_ptr % req_alignment_bytes == 0;
   bool offsets_are_multiple_of_vector_width =
       hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
+  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
+      !batch_invariant_launch) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
   }
 }
-
-#define LAUNCH_FUSED_POLY_NORM(width)                                         \
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
-    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
-        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
-        hidden_size);                                                         \
-  });
-
-void poly_norm(torch::Tensor& out,     // [..., hidden_size]
-               torch::Tensor& input,   // [..., hidden_size]
-               torch::Tensor& weight,  // [3]
-               torch::Tensor& bias,    // [1]
-               double epsilon) {
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.data_ptr() != input.data_ptr());
-
-  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-
-  dim3 grid(num_tokens);
-  /* This kernel is memory-latency bound in many scenarios.
-     When num_tokens is large, a smaller block size allows
-     for increased block occupancy on CUs and better latency
-     hiding on global mem ops. */
-  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
-  dim3 block(std::min(hidden_size, max_block_size));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  /*If the tensor types are FP16/BF16, try to use the optimized kernel
-    with packed + vectorized ops.
-    Max optimization is achieved with a width-8 vector of FP16/BF16s
-    since we can load at most 128 bits at once in a global memory op.
-    However, this requires each tensor's data to be aligned to 16
-    bytes.
-   */
-  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
-  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
-  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  if (ptrs_are_aligned && hidden_size % 8 == 0) {
-    LAUNCH_FUSED_POLY_NORM(8);
-  } else {
-    LAUNCH_FUSED_POLY_NORM(0);
-  }
-}
diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
index be134089bd6d483e5b57787dd13759abb8e9a005..0880b8d50a79536ae4aa4f1c32d368056b0efe42 100644
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -6,9 +6,11 @@
  */
 
 #include "type_convert.cuh"
-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
+#include "core/batch_invariant.hpp"
+#include "quantization/vectorization_utils.cuh"
 
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -16,7 +18,7 @@
 namespace vllm {
 
 // TODO(woosuk): Further optimize this kernel.
-template <typename scalar_t, typename fp8_type>
+template <typename scalar_t, typename fp8_type, int VEC_SIZE>
 __global__ void rms_norm_static_fp8_quant_kernel(
     fp8_type* __restrict__ out,          // [..., hidden_size]
     const scalar_t* __restrict__ input,  // [..., hidden_size]
@@ -27,10 +29,21 @@ __global__ void rms_norm_static_fp8_quant_kernel(
   __shared__ float s_variance;
   float variance = 0.0f;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
+
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float x = static_cast<float>(vec.val[i]);
+      variance += x * x;
+    }
+  };
+  auto scalar_op = [&variance](const scalar_t& val) {
+    float x = static_cast<float>(val);
     variance += x * x;
-  }
+  };
+  vllm::vectorize_read_with_alignment<VEC_SIZE>(
+      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
@@ -44,11 +57,18 @@ __global__ void rms_norm_static_fp8_quant_kernel(
   // invert scale to avoid division
   float const scale_inv = 1.0f / *scale;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * input_stride + idx];
-    float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
-    out[blockIdx.x * hidden_size + idx] =
-        scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
+  auto* v_in = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(input_row);
+  auto* v_w = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(weight);
+  for (int idx = threadIdx.x; idx < hidden_size / VEC_SIZE; idx += blockDim.x) {
+    vec_n_t<scalar_t, VEC_SIZE> src1 = v_in[idx];
+    vec_n_t<scalar_t, VEC_SIZE> src2 = v_w[idx];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; j++) {
+      float x = static_cast<float>(src1.val[j]);
+      float const out_norm = ((scalar_t)(x * s_variance)) * src2.val[j];
+      out[blockIdx.x * hidden_size + idx * VEC_SIZE + j] =
+          scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
+    }
   }
 }
 
@@ -174,20 +194,29 @@ void rms_norm_static_fp8_quant(torch::Tensor& out,     // [..., hidden_size]
   int input_stride = input.stride(-2);
   int num_tokens = input.numel() / hidden_size;
 
+  // For large num_tokens, use smaller blocks to increase SM concurrency.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
   dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "rms_norm_kernel_scalar_type", [&] {
         VLLM_DISPATCH_FP8_TYPES(
             out.scalar_type(), "rms_norm_kernel_fp8_type", [&] {
-              vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t>
-                  <<<grid, block, 0, stream>>>(
-                      out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
-                      input_stride, weight.data_ptr<scalar_t>(),
-                      scale.data_ptr<float>(), epsilon, num_tokens,
-                      hidden_size);
+              const int calculated_vec_size =
+                  std::gcd(16 / sizeof(scalar_t), hidden_size);
+              const int block_size =
+                  std::min(hidden_size / calculated_vec_size, max_block_size);
+              dim3 block(block_size);
+              VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] {
+                vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t,
+                                                       vec_size>
+                    <<<grid, block, 0, stream>>>(
+                        out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
+                        input_stride, weight.data_ptr<scalar_t>(),
+                        scale.data_ptr<float>(), epsilon, num_tokens,
+                        hidden_size);
+              });
             });
       });
 }
@@ -215,6 +244,8 @@ void fused_add_rms_norm_static_fp8_quant(
     double epsilon) {
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(residual.is_contiguous());
+  TORCH_CHECK(residual.scalar_type() == input.scalar_type());
+  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
   int hidden_size = input.size(-1);
   int input_stride = input.stride(-2);
   int num_tokens = input.numel() / hidden_size;
@@ -240,7 +271,9 @@ void fused_add_rms_norm_static_fp8_quant(
   auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
   bool ptrs_are_aligned =
       inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
+  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
+      !batch_invariant_launch) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h
index 13c6178941cf8422527027eff51f44f1b4969398..7d22dd8b84a39dfd22af737739558ff9a1976d01 100644
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -24,6 +24,8 @@ struct SSMParamsBase {
     int64_t pad_slot_id;
 
     bool delta_softplus;
+    bool cache_enabled;
+    int block_size;
 
     index_t A_d_stride;
     index_t A_dstate_stride;
@@ -46,8 +48,9 @@ struct SSMParamsBase {
     index_t out_z_batch_stride;
     index_t out_z_d_stride;
     index_t ssm_states_batch_stride;
-    index_t ssm_states_dim_stride;  
+    index_t ssm_states_dim_stride;
     index_t ssm_states_dstate_stride;
+    index_t cache_indices_stride;
 
     // Common data pointers.
     void *__restrict__ A_ptr;
@@ -66,6 +69,9 @@ struct SSMParamsBase {
     void *__restrict__ cache_indices_ptr;
     void *__restrict__ has_initial_state_ptr;
 
+    void *__restrict__ block_idx_first_scheduled_token_ptr;  // (batch,) - first block to write
+    void *__restrict__ block_idx_last_scheduled_token_ptr;   // (batch,) - last block to write
+    void *__restrict__ initial_state_idx_ptr;  // (batch,) - index of the initial state to use
 };
 
 
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index d534e138d26d6123f0138902bdcd8508297f16ce..fb2a2e5789999ab532f7b6f9d9357cc563e7be49 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -119,7 +119,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 
     const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
         : reinterpret_cast<int *>(params.cache_indices_ptr);
-    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; 
     // cache_index == params.pad_slot_id is defined as padding, so we exit early
     if (cache_index == params.pad_slot_id){
         return;
@@ -133,9 +133,18 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
-    cache_index * params.ssm_states_batch_stride + 
-    dim_id * kNRows * params.ssm_states_dim_stride;
+
+    typename Ktraits::state_t *ssm_states;
+    if (params.cache_enabled) {
+        // APC mode: ssm_states points to the base, we'll use absolute cache slots later
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    } else {
+        // Non-APC mode: offset by cache_index as before
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            cache_index * params.ssm_states_batch_stride +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    }
     
     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
@@ -159,7 +168,22 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // }
 
     constexpr int kChunkSize = kNThreads * kNItems;
-    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+
+    // Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility
+    const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048;
+    const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size;
+
+    const int* batch_cache_indices = cache_indices != nullptr ?
+                                     cache_indices + batch_id * params.cache_indices_stride : nullptr;
+    const int* block_idx_first_scheduled = params.block_idx_first_scheduled_token_ptr != nullptr ?
+                                           reinterpret_cast<const int*>(params.block_idx_first_scheduled_token_ptr) : nullptr;
+    const int* block_idx_last_scheduled = params.block_idx_last_scheduled_token_ptr != nullptr ?
+                                          reinterpret_cast<const int*>(params.block_idx_last_scheduled_token_ptr) : nullptr;
+    const int* initial_state_idx = params.initial_state_idx_ptr != nullptr ?
+                                   reinterpret_cast<const int*>(params.initial_state_idx_ptr) : nullptr;
+
+    const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? batch_cache_indices[initial_state_idx[batch_id]] : cache_index;
+
     for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
 
@@ -219,7 +243,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (kIsVariableC) {
                 auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                 load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 ));
+                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
                 if constexpr (!kIsVariableB) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -242,7 +266,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 for (int i = 0; i < kNItems; ++i) {
                     thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                  !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-                    
                     if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
                         if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                             thread_data[i] = make_float2(1.f, 0.f);
@@ -250,8 +273,24 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                     }
                 }
                 // Initialize running total
-
-                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0);
+                scan_t running_prefix;
+                if (chunk > 0) {
+                    running_prefix = smem_running_prefix[state_idx + r * MAX_DSTATE];
+                } else {
+                    // Load initial state
+                    if (params.cache_enabled && has_initial_state && batch_cache_indices != nullptr) {
+                        size_t state_offset = load_cache_slot * params.ssm_states_batch_stride +
+                                             r * params.ssm_states_dim_stride +
+                                             state_idx * params.ssm_states_dstate_stride;
+                        running_prefix = make_float2(1.0, float(ssm_states[state_offset]));
+                    } else if (has_initial_state) {
+                        // Non-APC mode: load from current batch position
+                        running_prefix = make_float2(1.0, float(ssm_states[state_idx * params.ssm_states_dstate_stride]));
+                    } else {
+                        // No initial state
+                        running_prefix = make_float2(1.0, 0.0);
+                    }
+                }
 
                 SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                 typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
@@ -260,8 +299,25 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 // There's a syncthreads in the scan op, so we don't need to sync here.
                 // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
                 if (threadIdx.x == 0) {
-                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                    if (chunk == n_chunks - 1) {
+                    smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix;
+
+                    // Store state at the end of each chunk when cache is enabled
+                    if (params.cache_enabled && batch_cache_indices != nullptr) {
+
+                        size_t cache_slot;
+                        if (chunk == n_chunks - 1) {
+                            cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]];
+                        } else {
+                            cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk];
+                        }
+
+                        size_t state_offset = cache_slot * params.ssm_states_batch_stride +
+                                             r * params.ssm_states_dim_stride +
+                                             state_idx * params.ssm_states_dstate_stride;
+
+                        ssm_states[state_offset] = typename Ktraits::state_t(prefix_op.running_prefix.y);
+                    } else if (!params.cache_enabled && chunk == n_chunks - 1) {
+                        // Non-APC mode: store only final state at current batch position
                         ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                     }
                 }
@@ -274,7 +330,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 }
             }
         }
-        
         input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
             + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
         __syncthreads();
@@ -346,7 +401,9 @@ template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        if (params.seqlen <= 128) {           
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 128) {
             selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 256) {
             selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
@@ -358,7 +415,9 @@ void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
             selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
         }
     #else
-        if (params.seqlen <= 256) {
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 256) {
             selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 512) {
             selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
@@ -437,13 +496,17 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         const std::optional<at::Tensor>& D,
                         const std::optional<at::Tensor>& delta_bias,
                         const torch::Tensor ssm_states,
-                        bool has_z, 
+                        bool has_z,
                         bool delta_softplus,
                         const std::optional<at::Tensor>& query_start_loc,
                         const std::optional<at::Tensor>& cache_indices,
                         const std::optional<at::Tensor>& has_initial_state,
                         bool varlen,
-                        int64_t pad_slot_id) {
+                        int64_t pad_slot_id,
+                        int64_t block_size,
+                        const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                        const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                        const std::optional<torch::Tensor> &initial_state_idx) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -477,6 +540,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
     params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
     params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
 
+    // Set cache parameters - cache is enabled if we have direct cache writing params
+    params.cache_enabled = block_idx_first_scheduled_token.has_value();
+    params.block_size = static_cast<int>(block_size);
+
+    // Set direct cache writing pointers
+    params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr;
+    params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr;
+    params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr;
 
     // All stride are in elements, not bytes.
     params.A_d_stride = A.stride(0);
@@ -504,9 +575,11 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         params.out_d_stride = out.stride(0);
 
         params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);  
+        params.ssm_states_dim_stride = ssm_states.stride(1);
         params.ssm_states_dstate_stride = ssm_states.stride(2);
 
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
+
     }
     else{
         if (!is_variable_B) {
@@ -537,8 +610,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         params.out_d_stride = out.stride(1);
         
         params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);  
+        params.ssm_states_dim_stride = ssm_states.stride(1);
         params.ssm_states_dstate_stride = ssm_states.stride(2);
+
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
     }
 }
 
@@ -554,7 +629,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                   const torch::Tensor &ssm_states,
                   // used to identify padding entries if cache_indices provided
                   // in case of padding, the kernel will return early
-                  int64_t pad_slot_id) {
+                  int64_t pad_slot_id,
+                  int64_t block_size,
+                  const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                  const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                  const std::optional<torch::Tensor> &initial_state_idx) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -646,7 +725,16 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
         auto cache_indices_ = cache_indices.value();
         TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
         TORCH_CHECK(cache_indices_.is_cuda());
-        CHECK_SHAPE(cache_indices_, batch_size);
+
+        // cache_indices can be either 1D (batch_size,) for non-APC mode
+        // or 2D (batch_size, max_positions) for APC mode
+        const bool is_apc_mode = block_idx_first_scheduled_token.has_value();
+        if (is_apc_mode) {
+            TORCH_CHECK(cache_indices_.dim() == 2, "cache_indices must be 2D for APC mode");
+            TORCH_CHECK(cache_indices_.size(0) == batch_size, "cache_indices first dimension must match batch_size");
+        } else {
+            CHECK_SHAPE(cache_indices_, batch_size);
+        }
     }
    
 
@@ -686,7 +774,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                        cache_indices,
                        has_initial_state,
                        varlen,
-                       pad_slot_id
+                       pad_slot_id,
+                       block_size,
+                       block_idx_first_scheduled_token,
+                       block_idx_last_scheduled_token,
+                       initial_state_idx
                        );
 
     
diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
index 1d06fc6b5b0a07f2be8ac49b63886f68581b2784..df47bb8dd1d7d817e1378f3f533e091426bb99fc 100644
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -87,30 +87,23 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
   const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
   const int64_t g_eff_2 = (group_size != -1) ? group_size : I;
 
-  // Per-expert outputs filled in parallel
-  std::vector<torch::Tensor> y_list(E);
-  y_list.resize(E);
+  auto X_all = x_c.index_select(/*dim=*/0, expert_tokens);
+  if (apply_router_weight_on_input) {
+    X_all = X_all.mul(expert_gates.unsqueeze(1));
+  }
+  auto Y_all = at::empty({offsets[E], H}, x_c.options());
 
   at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
+    c10::InferenceMode guard;
     for (int64_t e = e_begin; e < e_end; ++e) {
       const int64_t te = counts[e];
       if (te == 0) {
-        y_list[e] = at::empty({0, H}, x_c.options());
         continue;
       }
 
       const int64_t start = offsets[e];
 
-      auto sel_tokens =
-          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-      auto gates_e =
-          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-
-      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);
-
-      if (apply_router_weight_on_input) {
-        x_e = x_e.mul(gates_e.unsqueeze(1));
-      }
+      auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
 
       auto w13_e = w13_packed.select(/*dim=*/0, e);
       auto w2_e = w2_packed.select(/*dim=*/0, e);
@@ -137,17 +130,15 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
       // W2
       auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);
 
-      if (!apply_router_weight_on_input) {
-        y = y.mul(gates_e.unsqueeze(1));
-      }
-
       // Store per-expert result
-      y_list[e] = y;
+      Y_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te).copy_(y);
     }
   });
 
-  // Concatenate all expert outputs to match expert_tokens order
-  auto Y_all = at::cat(y_list, /*dim=*/0);
+  if (!apply_router_weight_on_input) {
+    Y_all = Y_all.mul(expert_gates.unsqueeze(1));
+  }
+
   auto out = at::zeros({T, H}, x.options());
   out =
       at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);
diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index c93f9d54d780c25a4ccaef64c400028dbebed21d..69b4c1fb11d1a8c26ca4ca61bb217d81524159da 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -427,11 +427,29 @@ __device__ inline bool is_finite(const T val) {
 #endif
 }
 
+// Scoring function enums
+enum ScoringFunc {
+  SCORING_NONE = 0,    // no activation function
+  SCORING_SIGMOID = 1  // apply sigmoid
+};
+
+// Efficient sigmoid approximation from TensorRT-LLM
+__device__ inline float sigmoid_accurate(float x) {
+  return 0.5f * tanhf(0.5f * x) + 0.5f;
+}
+
 template <typename T>
-__device__ void topk_with_k2(T* output, T const* input,
+__device__ inline T apply_sigmoid(T val) {
+  float f = cuda_cast<float, T>(val);
+  return cuda_cast<T, float>(sigmoid_accurate(f));
+}
+
+template <typename T>
+__device__ void topk_with_k2(T* output, T const* input, T const* bias,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
-                             int const num_experts_per_group) {
+                             int const num_experts_per_group,
+                             int const scoring_func) {
   // Get the top2 per thread
   T largest = neg_inf<T>();
   T second_largest = neg_inf<T>();
@@ -439,6 +457,12 @@ __device__ void topk_with_k2(T* output, T const* input,
   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
       T value = input[i];
+      // Apply scoring function if needed
+      if (scoring_func == SCORING_SIGMOID) {
+        value = apply_sigmoid(value);
+      }
+      value = value + bias[i];
+
       if (value > largest) {
         second_largest = largest;
         largest = value;
@@ -448,7 +472,13 @@ __device__ void topk_with_k2(T* output, T const* input,
     }
   } else {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      largest = input[i];
+      T value = input[i];
+      // Apply scoring function if needed
+      if (scoring_func == SCORING_SIGMOID) {
+        value = apply_sigmoid(value);
+      }
+      value = value + bias[i];
+      largest = value;
     }
   }
 
@@ -472,17 +502,21 @@ __device__ void topk_with_k2(T* output, T const* input,
 }
 
 template <typename T>
-__global__ void topk_with_k2_kernel(T* output, T* input,
+__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                     int64_t const num_tokens,
                                     int64_t const num_cases,
                                     int64_t const n_group,
-                                    int64_t const num_experts_per_group) {
+                                    int64_t const num_experts_per_group,
+                                    int const scoring_func) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
 
   int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
   if (case_id < num_cases) {
     input += case_id * num_experts_per_group;
+    // bias is per expert group, offset to current group
+    int32_t group_id = case_id % n_group;
+    T const* group_bias = bias + group_id * num_experts_per_group;
     output += case_id;
 
     cg::thread_block block = cg::this_thread_block();
@@ -491,7 +525,8 @@ __global__ void topk_with_k2_kernel(T* output, T* input,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
+    topk_with_k2(output, input, group_bias, tile, lane_id,
+                 num_experts_per_group, scoring_func);
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.launch_dependents;");
@@ -500,16 +535,15 @@ __global__ void topk_with_k2_kernel(T* output, T* input,
 
 template <typename T, typename IdxT>
 __global__ void group_idx_and_topk_idx_kernel(
-    T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
-    T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
+    T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
+    T const* bias, int64_t const num_tokens, int64_t const n_group,
     int64_t const topk_group, int64_t const topk, int64_t const num_experts,
     int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor) {
+    double routed_scaling_factor, int scoring_func) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
   int32_t case_id =
       blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
-  scores_with_bias += case_id * num_experts;
   scores += case_id * num_experts;
   group_scores += case_id * n_group;
   topk_values += case_id * topk;
@@ -577,10 +611,16 @@ __global__ void group_idx_and_topk_idx_kernel(
         int32_t offset = i_group * num_experts_per_group;
         for (int32_t i = lane_id; i < align_num_experts_per_group;
              i += WARP_SIZE) {
-          T candidates = (i < num_experts_per_group) &&
-                                 is_finite(scores_with_bias[offset + i])
-                             ? scores_with_bias[offset + i]
-                             : neg_inf<T>();
+          T candidates = neg_inf<T>();
+          if (i < num_experts_per_group) {
+            // Apply scoring function (if any) and add bias
+            T input = scores[offset + i];
+            if (is_finite(input)) {
+              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
+                                                          : input;
+              candidates = score + bias[offset + i];
+            }
+          }
           queue.add(candidates, offset + i);
         }
         if (group_scores[i_group] == topk_group_value) {
@@ -602,11 +642,12 @@ __global__ void group_idx_and_topk_idx_kernel(
     for (int i = lane_id;
          i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
          i += WARP_SIZE) {
-      T value =
-          i < topk
-              ? scores[s_topk_idx[i]]
-              : cuda_cast<T, float>(0.0f);  // Load the valid value of expert
+      T value = cuda_cast<T, float>(0.0f);
       if (i < topk) {
+        // Load the score value (without bias) for normalization
+        T input = scores[s_topk_idx[i]];
+        value =
+            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
         s_topk_value[i] = value;
       }
       topk_sum +=
@@ -627,12 +668,12 @@ __global__ void group_idx_and_topk_idx_kernel(
           value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
         }
         topk_indices[i] = s_topk_idx[i];
-        topk_values[i] = cuda_cast<T, float>(value);
+        topk_values[i] = value;
       }
     } else {
       for (int i = lane_id; i < topk; i += WARP_SIZE) {
         topk_indices[i] = i;
-        topk_values[i] = cuda_cast<T, float>(1.0f / topk);
+        topk_values[i] = 1.0f / topk;
       }
     }
     // Note: when if_proceed_next_topk==false, choose the first 8 experts as the
@@ -644,12 +685,12 @@ __global__ void group_idx_and_topk_idx_kernel(
 }
 
 template <typename T, typename IdxT>
-void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
-                   IdxT* topk_indices, T* scores_with_bias,
-                   int64_t const num_tokens, int64_t const num_experts,
-                   int64_t const n_group, int64_t const topk_group,
-                   int64_t const topk, bool const renormalize,
-                   double const routed_scaling_factor, bool enable_pdl = false,
+void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
+                   IdxT* topk_indices, T const* bias, int64_t const num_tokens,
+                   int64_t const num_experts, int64_t const n_group,
+                   int64_t const topk_group, int64_t const topk,
+                   bool const renormalize, double const routed_scaling_factor,
+                   int const scoring_func, bool enable_pdl = false,
                    cudaStream_t const stream = 0) {
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -664,8 +705,9 @@ void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
   attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
   config.numAttrs = 1;
   config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group);
+  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+                     num_tokens, num_cases, n_group, num_experts / n_group,
+                     scoring_func);
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -682,19 +724,18 @@ void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
   config.numAttrs = 1;
   config.attrs = attrs;
   cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, scores_with_bias, num_tokens,
-                     n_group, topk_group, topk, num_experts,
-                     num_experts / n_group, renormalize, routed_scaling_factor);
+                     topk_values, topk_indices, bias, num_tokens, n_group,
+                     topk_group, topk, num_experts, num_experts / n_group,
+                     renormalize, routed_scaling_factor, scoring_func);
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
   template void invokeNoAuxTc<T, IdxT>(                                     \
-      T * scores, T * group_scores, T * topk_values, IdxT * topk_indices,   \
-      T * scores_with_bias, int64_t const num_tokens,                       \
-      int64_t const num_experts, int64_t const n_group,                     \
-      int64_t const topk_group, int64_t const topk, bool const renormalize, \
-      double const routed_scaling_factor, bool enable_pdl,                  \
-      cudaStream_t const stream);
+      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
+      T const* bias, int64_t const num_tokens, int64_t const num_experts,   \
+      int64_t const n_group, int64_t const topk_group, int64_t const topk,  \
+      bool const renormalize, double const routed_scaling_factor,           \
+      int const scoring_func, bool enable_pdl, cudaStream_t const stream);
 
 INSTANTIATE_NOAUX_TC(float, int32_t);
 INSTANTIATE_NOAUX_TC(half, int32_t);
@@ -703,28 +744,32 @@ INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
 }  // namespace vllm
 
 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor) {
-  auto data_type = scores_with_bias.scalar_type();
-  auto input_size = scores_with_bias.sizes();
+    torch::Tensor const& scores, int64_t n_group, int64_t topk_group,
+    int64_t topk, bool renormalize, double routed_scaling_factor,
+    torch::Tensor const& bias, int64_t scoring_func = 0) {
+  auto data_type = scores.scalar_type();
+  auto input_size = scores.sizes();
   int64_t num_tokens = input_size[0];
   int64_t num_experts = input_size[1];
-  TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor");
+  TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
   TORCH_CHECK(num_experts % n_group == 0,
               "num_experts should be divisible by n_group");
   TORCH_CHECK(n_group <= 32,
               "n_group should be smaller than or equal to 32 for now");
   TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
+  TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
+                  scoring_func == vllm::moe::SCORING_SIGMOID,
+              "scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");
 
   torch::Tensor group_scores = torch::empty(
       {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
+  // Always output float32 for topk_values (eliminates Python-side conversion)
   torch::Tensor topk_values = torch::empty(
-      {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA));
+      {num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
   torch::Tensor topk_indices = torch::empty(
       {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA));
 
-  auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());
+  auto stream = c10::cuda::getCurrentCUDAStream(scores.get_device());
 
   switch (data_type) {
     case torch::kFloat16:
@@ -732,11 +777,11 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
       vllm::moe::invokeNoAuxTc<half, int32_t>(
           reinterpret_cast<half*>(scores.mutable_data_ptr()),
           reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<half*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
           reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
+          reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,
           num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
       break;
     case torch::kFloat32:
       // Handle Float32
@@ -745,20 +790,20 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
           reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
           reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
           reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
+          reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,
           num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
       break;
     case torch::kBFloat16:
       // Handle BFloat16
       vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
           reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
           reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
           reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
-          num_tokens, num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), num_tokens,
+          num_experts, n_group, topk_group, topk, renormalize,
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
       break;
     default:
       // Handle other data types
diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py
index 698deb107cc06193b1f98de2e7b4676e2a9c4130..be5b68cc53e6f0e870dac99a0c3dad531cf59a7f 100644
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -17,25 +17,30 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()
 
-TEMPLATE = ("template __global__ void Marlin<"
-            "{{scalar_t}}, "
-            "{{w_type_id}}, "
-            "{{s_type_id}}, "
-            "{{threads}}, "
-            "{{thread_m_blocks}}, "
-            "{{thread_n_blocks}}, "
-            "{{thread_k_blocks}}, "
-            "{{'true' if m_block_size_8 else 'false'}}, "
-            "{{stages}}, "
-            "{{group_blocks}}, "
-            "{{'true' if is_zp_float else 'false'}}>"
-            "( MARLIN_KERNEL_PARAMS );")
+TEMPLATE = (
+    "template __global__ void Marlin<"
+    "{{scalar_t}}, "
+    "{{w_type_id}}, "
+    "{{s_type_id}}, "
+    "{{threads}}, "
+    "{{thread_m_blocks}}, "
+    "{{thread_n_blocks}}, "
+    "{{thread_k_blocks}}, "
+    "{{'true' if m_block_size_8 else 'false'}}, "
+    "{{stages}}, "
+    "{{group_blocks}}, "
+    "{{'true' if is_zp_float else 'false'}}>"
+    "( MARLIN_KERNEL_PARAMS );"
+)
 
 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
-    "vllm::kFE2M1f"
+    "vllm::kU4",
+    "vllm::kU4B8",
+    "vllm::kU8B128",
+    "vllm::kFE4M3fn",
+    "vllm::kFE2M1f",
 ]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
 
@@ -58,11 +63,12 @@ def generate_new_kernels():
         all_template_str_list = []
 
         for group_blocks, m_blocks, thread_configs in itertools.product(
-                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
-
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
+        ):
             # act order case only support gptq-int4 and gptq-int8
             if group_blocks == 0 and scalar_type not in [
-                    "vllm::kU4B8", "vllm::kU8B128"
+                "vllm::kU4B8",
+                "vllm::kU8B128",
             ]:
                 continue
             if thread_configs[2] == 256:
diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 629348bf8876439c9c7c8c4abd333ef9cc41ff0d..b3d0c0aa58e9efb188b55ba4e100107490f0fe58 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -8,12 +8,77 @@
 
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
+#include "core/math.hpp"
 
 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))
 
 namespace vllm {
 namespace moe {
 
+namespace batched_moe_align_block_size {
+
+// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
+static constexpr int32_t num_threads = 1024;
+static constexpr int32_t num_blocks = 1;
+__global__ void batched_moe_align_block_size_kernel(
+    int32_t const num_batches, int32_t const max_tokens_per_batch,
+    int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
+    int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
+    int32_t* __restrict__ num_tokens_post_pad) {
+  // TODO(varun): This is a naive implementation. Could be optimized.
+
+  size_t const batch_id = threadIdx.x;
+  size_t const stride = blockDim.x * gridDim.x;
+  int32_t const num_blocks_per_batch =
+      CEILDIV(max_tokens_per_batch, block_size);
+  int32_t const sorted_ids_size =
+      num_blocks_per_batch * num_batches * block_size;
+  int32_t const block_ids_size = sorted_ids_size / block_size;
+  int32_t const SENTINEL =
+      num_batches * max_tokens_per_batch;  // To denote invalid entries.
+  // Intialize sorted_ids
+  for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
+    sorted_ids[i] = SENTINEL;
+  }
+  // Intialize expert_ids with -1
+  for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
+    block_ids[i] = -1;
+  }
+
+  int32_t b_num_tokens = 0;
+  if (batch_id < num_batches) {
+    b_num_tokens = batch_num_tokens[batch_id];
+  }
+  int32_t const ceil_b_num_tokens =
+      CEILDIV(b_num_tokens, block_size) * block_size;
+
+  // Compute prefix sum over token counts per expert
+  using BlockScan = cub::BlockScan<int32_t, 1024>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  int cumsum_val;
+  BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
+  __syncthreads();
+
+  bool const is_last_batch = batch_id == (num_batches - 1);
+  if (is_last_batch) {
+    *num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
+  }
+
+  if (batch_id < num_batches) {
+    int32_t const batch_offset = batch_id * max_tokens_per_batch;
+    for (size_t i = 0; i < b_num_tokens; ++i) {
+      sorted_ids[cumsum_val + i] = batch_offset + i;
+    }
+
+    int32_t const block_start = cumsum_val / block_size;
+    int32_t const num_blocks = ceil_b_num_tokens / block_size;
+    for (size_t i = 0; i < num_blocks; ++i) {
+      block_ids[block_start + i] = batch_id;
+    }
+  }
+}
+}  // namespace batched_moe_align_block_size
+
 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
     const scalar_t* __restrict__ topk_ids,
@@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
       });
 }
 
+void batched_moe_align_block_size(int64_t max_tokens_per_batch,
+                                  int64_t block_size,
+                                  torch::Tensor const& batch_num_tokens,
+                                  torch::Tensor sorted_ids,
+                                  torch::Tensor batch_ids,
+                                  torch::Tensor num_tokens_post_pad) {
+  namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  int32_t const B = batch_num_tokens.size(0);
+  int32_t const num_blocks_per_batch =
+      round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
+  int32_t const num_blocks = num_blocks_per_batch * B;
+  int64_t const sorted_ids_size = num_blocks * block_size;
+
+  TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
+  TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
+  TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
+  TORCH_CHECK(B <= batched_kernel::num_threads);
+
+  batched_kernel::batched_moe_align_block_size_kernel<<<
+      batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
+      B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
+      sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
+      num_tokens_post_pad.data_ptr<int32_t>());
+}
+
 void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
              torch::Tensor& output)  // [num_tokens, hidden_size]
 {
diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..360f1312cf57928bf22c802caa17b22da93c0336
--- /dev/null
+++ b/csrc/moe/moe_lora_align_sum_kernels.cu
@@ -0,0 +1,174 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/Atomic.cuh>
+
+#include "../cuda_compat.h"
+#include "../dispatch_utils.h"
+#include "core/math.hpp"
+
+namespace {
+
+__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
+                                         int32_t col) {
+  return row * total_col + col;
+}
+
+}  // namespace
+
+// TODO: Refactor common parts with moe_align_sum_kernels
+template <typename scalar_t, typename token_cnts_t>
+__global__ void moe_lora_align_sum_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+    int64_t block_size, int num_experts, int max_loras, size_t numel,
+    int max_num_tokens_padded, int max_num_m_blocks,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* lora_ids) {
+  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
+  const size_t start_idx = threadIdx.x * tokens_per_thread;
+
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
+
+  // Initialize sorted_token_ids with numel
+  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
+    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
+  }
+
+  // Initialize expert_ids with -1
+  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
+    expert_ids[lora_id * max_num_m_blocks + it] = -1;
+  }
+
+  // Initialize total_tokens_post_pad with 0
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+  }
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
+  }
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int mask = token_lora_mapping[i / topk_num] == lora_id;
+    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
+    tokens_cnts[idx] += mask;
+  }
+
+  __syncthreads();
+
+  // For each expert we accumulate the token counts from the different threads.
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
+          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+    }
+  }
+
+  __syncthreads();
+
+  // We accumulate the token counts of all experts in thread 0.
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] = cumsum[i - 1] +
+                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
+                           block_size) *
+                      block_size;
+    }
+    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  /**
+   * For each expert, each thread processes the tokens of the corresponding
+   * blocks and stores the corresponding expert_id for each block.
+   */
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+         i += block_size) {
+      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
+          threadIdx.x;
+    }
+  }
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int32_t expert_id = topk_ids[i];
+    /** The cumsum[expert_id] stores the starting index of the tokens that the
+     * expert with expert_id needs to process, and
+     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
+     * processed by the expert with expert_id within the current thread's token
+     * shard.
+     */
+    int32_t rank_post_pad =
+        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
+        cumsum[expert_id];
+
+    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
+    atomicAdd(
+        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
+        (i - numel) * mask);
+    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
+  }
+}
+
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids) {
+  const int topk_num = topk_ids.size(1);
+
+  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
+  TORCH_CHECK(num_thread <= 1024,
+              "num_thread must be less than 1024, "
+              "and fallback is not implemented yet.");
+  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
+                             (num_experts + 1) * sizeof(int32_t);
+
+  if (shared_mem > device_max_shared_mem) {
+    TORCH_CHECK(false,
+                "Shared memory usage exceeds device limit, and global memory "
+                "fallback is not implemented yet.");
+  }
+
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+        dim3 blockDim(num_thread);
+        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
+        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+            (void*)kernel, shared_mem));
+        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
+            topk_ids.data_ptr<scalar_t>(),
+            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
+            max_loras, topk_ids.numel(), max_num_tokens_padded,
+            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
+            expert_ids.data_ptr<int32_t>(), topk_num,
+            num_tokens_post_pad.data_ptr<int32_t>(),
+            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
+      });
+}
\ No newline at end of file
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 92fc280b362b90c31b5c996c66ec440b342f2e35..11c6875f7f1d044e1f777257c3f7f904a3cb7170 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -4,7 +4,7 @@
 
 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                   torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output);
+                  torch::Tensor& gating_output, bool renormalize);
 
 void moe_sum(torch::Tensor& input, torch::Tensor& output);
 
@@ -12,6 +12,21 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad);
+
+void batched_moe_align_block_size(int64_t max_tokens_per_batch,
+                                  int64_t block_size,
+                                  torch::Tensor const& expert_num_tokens,
+                                  torch::Tensor sorted_ids,
+                                  torch::Tensor expert_ids,
+                                  torch::Tensor num_tokens_post_pad);
+
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -24,9 +39,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              int64_t BLOCK_SIZE_K, int64_t bit);
 
 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor);
+    torch::Tensor const& scores, int64_t n_group, int64_t topk_group,
+    int64_t topk, bool renormalize, double routed_scaling_factor,
+    torch::Tensor const& bias, int64_t scoring_func);
 #endif
 
 bool moe_permute_unpermute_supported();
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 53573ada86ba9c926b2822ac10530f46a85667b5..af6e6fcd482c7cbca73093338c08ae9270547293 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -16,12 +16,23 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <type_traits>
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"
 
+#ifndef USE_ROCM
+    #include <cuda_bf16.h>
+    #include <cuda_fp16.h>
+#else
+    #include <hip/hip_bf16.h>
+    #include <hip/hip_fp16.h>
+    typedef __hip_bfloat16 __nv_bfloat16;
+    typedef __hip_bfloat162 __nv_bfloat162;
+#endif
+
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -36,16 +47,27 @@ template <
     /// Alignment requirement in bytes
     int Alignment = sizeof(T) * N
 >
-class alignas(Alignment) AlignedArray {
-    float data[N];
+struct alignas(Alignment) AlignedArray {
+    T data[N];
 };
 
+template <typename T>
+__device__ __forceinline__ float toFloat(T value) {
+    if constexpr (std::is_same_v<T, float>) {
+        return value;
+    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+        return __bfloat162float(value);
+    } else if constexpr (std::is_same_v<T, __half>) {
+        return __half2float(value);
+    }
+}
+
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing the output
 // in the softmax kernel when we extend this module to support expert-choice routing.
-template <int TPB>
+template <int TPB, typename InputType>
 __launch_bounds__(TPB) __global__
-    void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
+    void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
 {
     using BlockReduce = cub::BlockReduce<float, TPB>;
     __shared__ typename BlockReduce::TempStorage tmpStorage;
@@ -66,7 +88,8 @@ __launch_bounds__(TPB) __global__
     for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
     {
         const int idx = thread_row_offset + ii;
-        threadData = max(static_cast<float>(input[idx]), threadData);
+        const float val = toFloat(input[idx]);
+        threadData = max(val, threadData);
     }
 
     const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@@ -81,7 +104,8 @@ __launch_bounds__(TPB) __global__
     for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
     {
         const int idx = thread_row_offset + ii;
-        threadData += exp((static_cast<float>(input[idx]) - float_max));
+        const float val = toFloat(input[idx]);
+        threadData += expf(val - float_max);
     }
 
     const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@@ -95,8 +119,9 @@ __launch_bounds__(TPB) __global__
     for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
     {
         const int idx = thread_row_offset + ii;
-        const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
-        output[idx] = val;
+        const float val = toFloat(input[idx]);
+        const float softmax_val = expf(val - float_max) * normalizing_factor;
+        output[idx] = softmax_val;
     }
 }
 
@@ -110,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
     const int num_experts,
     const int k,
     const int start_expert,
-    const int end_expert)
+    const int end_expert,
+    const bool renormalize)
 {
 
     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -125,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
 
     const bool row_is_active = finished ? !finished[block_row] : true;
     const int thread_read_offset = blockIdx.x * num_experts;
+    float selected_sum = 0.f;
     for (int k_idx = 0; k_idx < k; ++k_idx)
     {
         thread_kvp.key = 0;
@@ -163,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
             indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
             assert(indices[idx] >= 0);
             source_rows[idx] = k_idx * num_rows + block_row;
+            if (renormalize) {
+                selected_sum += result_kvp.value;
+            }
         }
         __syncthreads();
     }
+
+    // Renormalize the k weights for this row to sum to 1, if requested.
+    if (renormalize) {
+        if (threadIdx.x == 0) {
+            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
+            for (int k_idx = 0; k_idx < k; ++k_idx) {
+                const int idx = k * block_row + k_idx;
+                output[idx] = output[idx] / denom;
+            }
+        }
+    }
 }
 
 // ====================== TopK softmax things ===============================
@@ -184,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
   2) This implementation assumes k is small, but will work for any k.
 */
 
-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
-        int* source_rows, const int k, const int start_expert, const int end_expert)
+    void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
+        int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
 {
+    static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
+                      std::is_same_v<InputType, __half>,
+                  "InputType must be float, __nv_bfloat16, or __half");
+
     // We begin by enforcing compile time assertions and setting up compile time constants.
     static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
     static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
 
     // Number of bytes each thread pulls in per load
-    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
+    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
     static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
     static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
     static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
 
+    if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
+        static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
+            "ELTS_PER_LDG must be 1 or even for 16-bit conversion");
+    }
+
     // Restrictions based on previous section.
     static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
     static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@@ -236,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
 
     // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
     // row it will read.
-    const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
+    const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
 
     // Now, we compute the group each thread belong to in order to determine the first column to start loads.
     const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
     const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
-    const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
-
-    // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
-    // this can support all powers of 2 up to 16.
-    // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
-    // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
-    using AccessType = AlignedArray<float, ELTS_PER_LDG>;
+    const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
 
     // Finally, we pull in the data from global mem
     float row_chunk[VPT];
-    AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
-    const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
+
+    // NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
+    if constexpr (std::is_same_v<InputType, float>) {
+        using VecType = AlignedArray<float, ELTS_PER_LDG>;
+        VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
+        const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
 #pragma unroll
-    for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
-    {
-        row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+        for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+            row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+        }
+    } else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
+        if constexpr (ELTS_PER_LDG >= 2) {
+            using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
+            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
+            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
+#pragma unroll
+                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
+                    row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
+                        *reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
+                    );
+                }
+            }
+        } else { // ELTS_PER_LDG == 1
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
+                row_chunk[ii] = __bfloat162float(*scalar_ptr);
+            }
+        }
+    } else if constexpr (std::is_same_v<InputType, __half>) {
+        if constexpr (ELTS_PER_LDG >= 2) {
+            using VecType = AlignedArray<__half, ELTS_PER_LDG>;
+            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
+            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
+#pragma unroll
+                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
+                    row_chunk_f2[base_idx_f2 + jj] = __half22float2(
+                        *reinterpret_cast<const __half2*>(vec.data + jj * 2)
+                    );
+                }
+            }
+        } else { // ELTS_PER_LDG == 1
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
+                row_chunk[ii] = __half2float(*scalar_ptr);
+            }
+        }
     }
 
     // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@@ -310,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
     int start_col = first_elt_read_by_thread;
     static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
 
+    float selected_sum = 0.f;
     for (int k_idx = 0; k_idx < k; ++k_idx)
     {
         // First, each thread does the local argmax
@@ -363,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
             output[idx] = max_val;
             indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
             source_rows[idx] = k_idx * num_rows + thread_row;
+            if (renormalize) {
+                selected_sum += max_val;
+            }
         }
 
         // Finally, we clear the value in the thread with the current max if there is another iteration to run.
@@ -380,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
             }
         }
     }
+
+    // Renormalize the k weights for this row to sum to 1, if requested.
+    if (renormalize) {
+        if (thread_group_idx == 0)
+        {
+            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
+            for (int k_idx = 0; k_idx < k; ++k_idx)
+            {
+                const int idx = k * thread_row + k_idx;
+                output[idx] = output[idx] / denom;
+            }
+        }
+    }
 }
 
 namespace detail
 {
 // Constructs some constants needed to partition the work across threads at compile time.
-template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
+template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
 struct TopkConstants
 {
-    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
+    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
     static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
     static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
     static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@@ -397,20 +508,21 @@ struct TopkConstants
 };
 } // namespace detail
 
-template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
-    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
+template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
+void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
+    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
+    cudaStream_t stream)
 {
-    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
-    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
+    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
+    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
     static constexpr int VPT = Constants::VPT;
     static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
     const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
     const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
 
     dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
-    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
-        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
+    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
+        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
 }
 
 #ifndef USE_ROCM
@@ -418,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
     static_assert(WARP_SIZE == 32,                                                    \
                   "Unsupported warp size. Only 32 is supported for CUDA");            \
     topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
-        gating_output, nullptr, topk_weights, topk_indices,                           \
-        token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+        gating_output, nullptr, topk_weights, topk_indices, token_expert_indices,     \
+        num_tokens, topk, 0, num_experts, renormalize, stream);
 #else
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                             \
     if (WARP_SIZE == 64) {                                                               \
         topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>(       \
-            gating_output, nullptr, topk_weights, topk_indices,                          \
-            token_expert_indices, num_tokens, topk, 0, num_experts, stream);             \
+            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices,    \
+            num_tokens, topk, 0, num_experts, renormalize, stream);                      \
     } else if (WARP_SIZE == 32) {                                                        \
         topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>(       \
-            gating_output, nullptr, topk_weights, topk_indices,                          \
-            token_expert_indices, num_tokens, topk, 0, num_experts, stream);             \
+            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices,    \
+            num_tokens, topk, 0, num_experts, renormalize, stream);                      \
     } else {                                                                             \
         assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
     }
 #endif
 
-template <typename IndType>
+template <typename IndType, typename InputType>
 void topkGatingSoftmaxKernelLauncher(
-    const float* gating_output,
+    const InputType* gating_output,
     float* topk_weights,
     IndType* topk_indices,
     int* token_expert_indices,
@@ -445,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
     const int num_tokens,
     const int num_experts,
     const int topk,
+    const bool renormalize,
     cudaStream_t stream) {
     static constexpr int WARPS_PER_TB = 4;
     static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
 #ifndef USE_ROCM
-    static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+    // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
+    // elements can be loaded by a warp
+    static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
+    (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
 #endif
     switch (num_experts) {
         case 1:
@@ -506,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
             TORCH_CHECK(softmax_workspace != nullptr,
                 "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
             static constexpr int TPB = 256;
-            moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
+            moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
                 gating_output, nullptr, softmax_workspace, num_experts);
             moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
                 softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
-                num_experts, topk, 0, num_experts);
+                num_experts, topk, 0, num_experts, renormalize);
         }
     }
 }
@@ -518,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
 } // namespace moe
 } // namespace vllm
 
+
+template<typename ComputeType>
+void dispatch_topk_softmax_launch(
+    torch::Tensor& gating_output,
+    torch::Tensor& topk_weights,
+    torch::Tensor& topk_indices,
+    torch::Tensor& token_expert_indices,
+    torch::Tensor& softmax_workspace,
+    int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
+{
+    if (topk_indices.scalar_type() == at::ScalarType::Int) {
+        vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
+            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens, num_experts, topk, renormalize, stream);
+    } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
+        vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
+            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<uint32_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens, num_experts, topk, renormalize, stream);
+    } else {
+        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
+        vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
+            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int64_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens, num_experts, topk, renormalize, stream);
+    }
+}
+
 void topk_softmax(
     torch::Tensor& topk_weights,                // [num_tokens, topk]
     torch::Tensor& topk_indices,                // [num_tokens, topk]
     torch::Tensor& token_expert_indices,        // [num_tokens, topk]
-    torch::Tensor& gating_output)               // [num_tokens, num_experts]
+    torch::Tensor& gating_output,               // [num_tokens, num_experts]
+    bool renormalize)
 {
     const int num_experts = gating_output.size(-1);
     const auto num_tokens = gating_output.numel() / num_experts;
@@ -534,45 +689,19 @@ void topk_softmax(
 
     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
-
-    if(topk_indices.scalar_type() == at::ScalarType::Int)
-    {
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
-    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
-    {
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<uint32_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
-    else {
-        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int64_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
+    const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
+    torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
+
+    if (gating_output.scalar_type() == at::ScalarType::Float) {
+        dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices, 
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
+        dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices, 
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
+        dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices, 
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else {
+        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
     }
 }
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 8f33d6cd666faf4d4a4588df3acf1d3a379d9268..bd95ade40a083b98d02f334dc6d01519ce4054fc 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   // Apply topk softmax to the gating outputs.
   m.def(
       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output) -> ()");
+      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
 
   // Calculate the result of moe by summing up the partial results
@@ -22,6 +22,33 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "                     Tensor! num_tokens_post_pad) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
 
+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size, but for the batched case.
+  m.def(
+      "batched_moe_align_block_size(int max_tokens_per_batch,"
+      "                     int block_size, Tensor expert_num_tokens,"
+      "                     Tensor! sorted_token_ids,"
+      "                     Tensor! experts_ids,"
+      "                     Tensor! num_tokens_post_pad) -> ()");
+  m.impl("batched_moe_align_block_size", torch::kCUDA,
+         &batched_moe_align_block_size);
+
+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size.
+  m.def(
+      "moe_lora_align_block_size(Tensor topk_ids,"
+      "                     Tensor token_lora_mapping,"
+      "                     int num_experts,"
+      "                     int block_size, int max_loras, "
+      "                     int max_num_tokens_padded, "
+      "                     int max_num_m_blocks, "
+      "                     Tensor !sorted_token_ids,"
+      "                     Tensor !experts_ids,"
+      "                     Tensor !num_tokens_post_pad,"
+      "                     Tensor !adapter_enabled,"
+      "                     Tensor !lora_ids) -> () ");
+  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
+
 #ifndef USE_ROCM
   m.def(
       "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
@@ -80,9 +107,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Apply grouped topk routing to select experts.
   m.def(
-      "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
+      "grouped_topk(Tensor scores, int n_group, int "
       "topk_group, int topk, bool renormalize, float "
-      "routed_scaling_factor) -> (Tensor, Tensor)");
+      "routed_scaling_factor, Tensor bias, int scoring_func) -> (Tensor, "
+      "Tensor)");
   m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 #endif
 }
diff --git a/csrc/ops.h b/csrc/ops.h
index c108050d6b98172f2e2ce357bec49d53765426fe..864e8aef1d000189925a2528a32370668554f650 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -92,14 +92,25 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
-void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-               torch::Tensor& bias, double epsilon);
+void fused_qk_norm_rope(torch::Tensor& qkv, int64_t num_heads_q,
+                        int64_t num_heads_k, int64_t num_heads_v,
+                        int64_t head_dim, double eps, torch::Tensor& q_weight,
+                        torch::Tensor& k_weight, torch::Tensor& cos_sin_cache,
+                        bool is_neox, torch::Tensor& position_ids);
 
 void apply_repetition_penalties_(torch::Tensor& logits,
                                  const torch::Tensor& prompt_mask,
                                  const torch::Tensor& output_mask,
                                  const torch::Tensor& repetition_penalties);
 
+void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
+                   const torch::Tensor& rowEnds, torch::Tensor& indices,
+                   int64_t numRows, int64_t stride0, int64_t stride1);
+
+void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
+                          const torch::Tensor& seq_lens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1);
+
 // void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
 //                                torch::Tensor& weight, torch::Tensor& scale,
 //                                double epsilon);
@@ -133,12 +144,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                               torch::Tensor& input,
                               torch::Tensor& input_global_scale);
 #endif
-void silu_mul_fp8_quant_deep_gemm_cuda(
-    const at::Tensor& input,   // (E, T, 2*H)
-    const at::Tensor& counts,  // (E)
-    at::Tensor& y_q,           // (E, T, H) [OUT]
-    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
+// void persistent_masked_m_silu_mul_quant(
+//     const at::Tensor& input,   // (E, T, 2*H)
+//     const at::Tensor& counts,  // (E)
+//     at::Tensor& y_q,           // (E, T, H) [OUT]
+//     at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+//     bool use_ue8m0);
 
 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
 
@@ -304,7 +315,7 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int64_t bit);
+                        bool use_exllama, bool use_v2_format, int64_t bit);
 
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
 
@@ -318,17 +329,19 @@ void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
 //     torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
 //     std::optional<torch::Tensor> const& scale_ub);
 
-void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
-                        const torch::Tensor& A, const torch::Tensor& B,
-                        const torch::Tensor& C,
-                        const std::optional<torch::Tensor>& D_,
-                        const std::optional<torch::Tensor>& z_,
-                        const std::optional<torch::Tensor>& delta_bias_,
-                        bool delta_softplus,
-                        const std::optional<torch::Tensor>& query_start_loc,
-                        const std::optional<torch::Tensor>& cache_indices,
-                        const std::optional<torch::Tensor>& has_initial_state,
-                        const torch::Tensor& ssm_states, int64_t pad_slot_id);
+void selective_scan_fwd(
+    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
+    const torch::Tensor& B, const torch::Tensor& C,
+    const std::optional<torch::Tensor>& D_,
+    const std::optional<torch::Tensor>& z_,
+    const std::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
+    const std::optional<torch::Tensor>& query_start_loc,
+    const std::optional<torch::Tensor>& cache_indices,
+    const std::optional<torch::Tensor>& has_initial_state,
+    const torch::Tensor& ssm_states, int64_t pad_slot_id, int64_t block_size,
+    const std::optional<torch::Tensor>& block_idx_first_scheduled_token,
+    const std::optional<torch::Tensor>& block_idx_last_scheduled_token,
+    const std::optional<torch::Tensor>& initial_state_idx);
 
 torch::Tensor dynamic_4bit_int_moe_cpu(
     torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu
index b94cc9ce5086c4c62bb95c416229abc21b52c82c..0c3bcf3b64b268c996dec02152962ff17e802750 100644
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -7,7 +7,7 @@
 #include "../cuda_compat.h"
 #include "dispatch_utils.h"
 
-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"
 
 #include <c10/util/Float8_e4m3fn.h>
 
@@ -114,13 +114,22 @@ __global__ void act_and_mul_quant_kernel(
 }
 
 __device__ __forceinline__ float silu(float x) {
-  return (__fdividef(x, (1.f + expf(-x))));
+  return __fdividef(x, (1.f + expf(-x)));
 }
 
 __device__ __forceinline__ float2 silu2(float2 x) {
   return make_float2(silu(x.x), silu(x.y));
 }
 
+__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
+#ifndef USE_ROCM
+  return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
+                        __float2bfloat16_rn(silu(x.y)));
+#else
+  return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
+#endif
+}
+
 #ifndef USE_ROCM
 __device__ __forceinline__ float warp_max(float v) {
   static constexpr unsigned FULL_MASK = 0xffffffffu;
@@ -223,224 +232,337 @@ constexpr __nv_bfloat16 get_fp8_min() {
     return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
   }
 }
-#ifndef USE_ROCM
-template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
-          int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
-          int NUM_STAGES = 3>
+
+template <typename Idx_t>
+__device__ __forceinline__ int warp_expert_search(
+    int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
+  const Idx_t* input_ptr = input + idx;
+  int base_offset = 0;
+
+  for (;;) {
+    bool move_on = (idx < n && *input_ptr <= val);
+
+    unsigned mask = __ballot_sync(0xffffffff, move_on);
+
+    if (mask != 0xffffffffu) {
+      int last_lane = 31 - __clz(mask);
+      return base_offset + last_lane;
+    }
+
+    input_ptr += 32;
+    base_offset += 32;
+    idx += 32;
+  }
+}
+
+template <int num_parallel_tokens>
+__device__ __forceinline__ void token_bounds(int32_t n_tokens,
+                                             int32_t worker_id,
+                                             int32_t& n_tokens_lower,
+                                             int32_t& n_tokens_upper) {
+  if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
+    if (worker_id >= num_parallel_tokens) return;
+    n_tokens_lower = worker_id;
+    n_tokens_upper = worker_id + 1;
+  } else {
+    int32_t chunk_size = n_tokens / num_parallel_tokens;
+    int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
+    auto calc_id = [&](int32_t id) {
+      if (id < residual)
+        return min(n_tokens, id * (chunk_size + 1));
+      else
+        return min(n_tokens, id * chunk_size + residual);
+    };
+    n_tokens_lower = calc_id(worker_id);
+    n_tokens_upper = calc_id(worker_id + 1);
+  }
+}
+
+template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
+          typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0,
+          int GROUP_SIZE = 128, int NUM_STAGES = 3>
 __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
     const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
-    float* __restrict__ _y_s, const int32_t* __restrict__ counts,
-
+    scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
     // sizes
-    int H, int G,
-
+    Idx_t E, Idx_t T, Idx_t H,
     // strides (in elements)
     Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
     Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
-    Idx_t stride_ys_g, Idx_t stride_counts_e) {
-  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
-  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
-  // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
-  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
+    Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) {
+#ifndef USE_ROCM
+  static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
 
-  // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
-  static constexpr int32_t BFLOAT16_PER_GROUP = 8;
+  static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
+  static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
 
-  // We split the shared memory in half, corresponding to gate and up matrices:
-  // [...gate_i, ...up_i]  where 0 <= i < stages.
-  static constexpr int32_t S_NUM_128 =
-      2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
-  static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
-  static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
-  static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
-  __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
+  static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
+  static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
 
-  const int32_t tid = threadIdx.x;
-  const int32_t warp_id = tid / WARP_SIZE;
-  const int32_t lane_id = tid % WARP_SIZE;
+  extern __shared__ __align__(16) __int128_t smem_128[];
 
-  auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
+  int* s_expert_offsets =
+      reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
 
-  // block handles one (expert e, group g)
-  int32_t pid = blockIdx.x;
-  int32_t e = pid / G;
-  int32_t g = pid % G;
+  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
+  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
+  // We assign EPS with it's 16-bit unsigned counterpart to allow constexpr.
+  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
+  int tid = threadIdx.x;
+  int warp_id = tid >> 5;
+  int lane_id = tid & 0x1f;
+
+  int running_sum{};
+  if (!warp_id) {
+    for (int i = 0; i < E; i += WARP_SIZE) {
+      bool valid = (i + threadIdx.x) < E;
+      int value =
+          (valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
+          (!lane_id ? running_sum : 0);
+
+      for (int offset = 1; offset < 32; offset *= 2) {
+        int n = __shfl_up_sync(0xFFFFFFFFu, value, offset);
+        if (lane_id >= offset) value += n;
+      }
 
-  const int32_t n_tokens = counts[e * stride_counts_e];
+      if (valid) {
+        s_expert_offsets[i + threadIdx.x + 1] = value;
+      }
 
-  if (!n_tokens) {
-    return;  // Exit ASAP.
+      running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1);
+    }
+
+    if (!lane_id) {
+      s_expert_offsets[0] = 0;
+    }
   }
 
-  const Idx_t stride_i_t_128 = stride_i_t / 8u;
+  __syncthreads();
 
-  int32_t n_tokens_lower, n_tokens_upper;
+  int32_t total_tokens = s_expert_offsets[E];
 
+  const int warp_position_yq = warp_id * (H / NUM_WARPS);
+  const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
+
+  // A single block will handle tokens_per_block tokens.
   // Each block i iterates over tokens of a slice of n_tokens =
   // expert_counts[i], with the size of chunk being
   // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
   // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
-  if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
-    // Specialize this, but can be likely fused.
-    if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
-      return;
-    }
-    n_tokens_lower = blockIdx.y;
-    n_tokens_upper = blockIdx.y + 1;
-  } else {
-    auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
-    auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
-    auto calc_id = [&](int32_t id) {
-      if (id < residual) {
-        return min(n_tokens, id * (chunk_size + 1));
-      } else {
-        return min(n_tokens, id * chunk_size + residual);
-      }
-    };
-    n_tokens_lower = calc_id(blockIdx.y);
-    n_tokens_upper = calc_id(blockIdx.y + 1);
-  }
 
-  if (n_tokens_lower >= n_tokens_upper) {
+  // Each warp will get space to store its hidden dim for gate and up.
+  __int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES);
+  __int128_t* smem_load_ptr = s_hidden_load + lane_id;
+
+  const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max);
+
+  int32_t compute_pipeline_offset_64 = 0;
+  int32_t load_stage_offset{};
+  const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
+
+  __int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
+                                warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
+                                lane_id;
+  __int64_t* s_gate64_ptr = smem_compute_ptr;
+  __int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
+
+  int tokens_lower, tokens_upper;
+
+  token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
+                            tokens_upper);
+
+  Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
+  int token_id = tokens_lower;
+  int32_t t_load{};
+
+  if (token_id < tokens_upper) {
+    expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
+    expert_offset = s_expert_offsets[expert_id];
+    next_expert_offset = s_expert_offsets[expert_id + 1];
+  } else {
+    // This thread block has no work to do.
     return;
   }
 
-  // We do calculations here, using constexpr wherever possible.
-  const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
-  const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
-  const Idx_t base_yq =
-      e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
-  Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
-  auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
-  auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
-                      stride_i_t_128 * n_tokens_lower;
-  auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
-  auto y_s_ptr =
-      _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
-  auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
-                 stride_yq_t * n_tokens_lower + 4 * lane_id;
-  int32_t t_load = n_tokens_lower, load_stage_id = 0;
-  auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
-  auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
-  int32_t stage_offset{};
-
-  static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
-  static constexpr int32_t LOAD_STAGE_MOD =
-      NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
-
-  // Two halves of all threads in a block conduct global loads for gate and up,
-  // repsectively.
+  int t_load_bound = H / (GROUP_SIZE * NUM_WARPS);
+
+  Idx_t base_i = ((expert_id * stride_i_e) / 8) +
+                 (token_id - expert_offset) * stride_i_t / 8;
+  const Idx_t gate_warp_offset =
+      warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
+
+  const __int128_t* input_128_ptr =
+      reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
+      ((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
+  __int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
+
+  auto token_offset = token_id - expert_offset;
+
   auto load_and_advance_y_pred = [&] {
-    if (t_load < n_tokens_upper) {
-      auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
-      auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
+    if (t_load < t_load_bound) {
+      // Here we are simply continuing to load data
+      // from the current token.
+      auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
 
       // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
       // unnecessary ALU ops.
-      stage_offset += LOAD_STAGE_SIZE;
-      stage_offset %= LOAD_STAGE_MOD;
+      load_stage_offset += LOAD_STAGE_SIZE;
+      load_stage_offset %= LOAD_STAGE_MOD;
 
-      if (tid < HALF_THREAD_COUNT) {
-        cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
-        gate_128_ptr += stride_i_t_128;
+      cp_async4(smem_load_ptr_staged, load_ptr);
+      load_ptr += GROUP_SIZE / 8;
+      ++t_load;
+    } else if (token_id + 1 < tokens_upper) {
+      // We loaded everything from the current token, let's move on
+      // to the next one, and we checked that we have more tokens to load.
+      ++token_id;
+      t_load = 0;
+      if (token_id >= next_expert_offset) {
+        // We need to find the next expert.
+        do {
+          // This is a loop because it's possible
+          // that some experts are assigned 0 tokens.
+          // NOTE: We are guaranteed that there's at least
+          // one more token left so we don't have to check for
+          // expert_id bounds.
+          ++expert_id;
+          // This skips 1 memory read.
+          expert_offset = next_expert_offset;
+          next_expert_offset = s_expert_offsets[expert_id + 1];
+        } while (next_expert_offset == expert_offset);
+
+        base_i = expert_id * (stride_i_e / 8);
+        token_offset = 0;
+        load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
       } else {
-        cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
-        up_128_ptr += stride_i_t_128;
+        // We remain within the same expert, so just
+        // move by H/4 __int128_t (2 * H/8).
+        base_i += stride_yq_t / 4;
+        token_offset++;
       }
+
+      load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
+
+      auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
+
+      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
+      // unnecessary ALU ops.
+      load_stage_offset += LOAD_STAGE_SIZE;
+      load_stage_offset %= LOAD_STAGE_MOD;
+
+      cp_async4(smem_load_ptr_staged, load_ptr);
+      load_ptr += GROUP_SIZE / 8;
       ++t_load;
-      ++load_stage_id;
     }
     // We fence even if there is nothing to load to simplify pipelining.
     cp_async_fence();
   };
 
+  // We need to warm-up the pipeline.
   #pragma unroll
   for (int i = 0; i < NUM_STAGES - 1; i++) {
     load_and_advance_y_pred();
   }
 
-  __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
-                              s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
-                          lane_id;
-  __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
+  __nv_fp8x4_e4m3* y_q_base_ptr =
+      reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
 
-  static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
-  static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
+  Idx_t scale_group_offset = 0;
+  if constexpr (std::is_same<scale_t, uint8_t>::value) {
+    // packed int32_t format
+    int pack_id = warp_position_scales / 4;
+    int scale_in_pack = warp_position_scales % 4;
+    scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g;
+  } else {
+    scale_group_offset = warp_position_scales * stride_ys_g;
+  }
 
-  int32_t compute_pipeline_offset_64 = 0;
+  scale_t* const y_scale_base_ptr = _y_s + scale_group_offset;
 
-  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
-    __nv_bfloat162 results_bf162[2];
+  for (auto j = tokens_lower; j < tokens_upper; j++) {
+    int current_group_id = warp_position_scales;  // Running count of which
+                                                  // group is being processed
+    const Idx_t base_ys = expert_id * stride_ys_e;
+    auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
+    __nv_fp8x4_e4m3* y_q_ptr =
+        y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
+                        warp_position_yq * stride_yq_h) /
+                           4;
+    const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
 
-    cp_async_wait<NUM_STAGES - 2>();
-    __syncthreads();
+    for (int i = 0; i < COMPUTE_LIMIT; i++) {
+      cp_async_wait<NUM_STAGES - 2>();
+      __syncthreads();
+      load_and_advance_y_pred();
 
-    // We double-buffer pipelined loads so that the next load will
-    // concurrently run with compute without overwrites.
-    load_and_advance_y_pred();
+      __int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64;
+      __int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64;
 
-    auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
-    auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
+      // COMPUTE_STAGE_SIZE/MOD must also be constexpr!
+      compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE;
+      compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
 
-    // STAGE_SIZE must also be constexpr!
-    compute_pipeline_offset_64 += STAGE_SIZE;
-    compute_pipeline_offset_64 %= STAGE_MOD;
+      __int64_t gate64 = *gate64_ptr;
+      __int64_t up64 = *up64_ptr;
 
-    // Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
-    __int64_t gate64 = *s_gate_compute_64;
-    __nv_bfloat162* s_gate_compute_32 =
-        reinterpret_cast<__nv_bfloat162*>(&gate64);
-
-    __int64_t up64 = *s_up_compute_64;
-    __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
+      // Compute
+      __nv_bfloat162 res[2];
+      __nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
+      __nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64);
 
   #pragma unroll
-    for (int i = 0; i < 2; i++) {
-      // For silu, we make sure that div is emitted.
-      float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
-      results_bf162[i] = __float22bfloat162_rn(gate);
-    }
-
-  #pragma unroll
-    for (int i = 0; i < 2; i++) {
-      results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
-    }
+      for (int32_t k = 0; k < 2; ++k) {
+        __nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k]));
+        res[k] = __hmul2(gate, s_up_comp[k]);
+      }
 
-    auto _y_max2 =
-        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
+      auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
 
-    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
+      _y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
 
-    // An entire group is assigned to a single warp, so a simple warp reduce
-    // is used.
-    __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
+      __nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
 
-    if constexpr (USE_UE8M0) {
-      y_s = hexp2(hceil(hlog2(y_s)));
-    }
+      if constexpr (CEIL_UE8M0) {
+        y_s = hexp2(hceil(hlog2(y_s)));
+      }
 
-    auto inv_y = __float2bfloat16_rn(1.f) / y_s;
+      __nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
 
-    auto y_s2 = make_bfloat162(inv_y, inv_y);
+      auto y_s2 = make_bfloat162(inv_y, inv_y);
 
   #pragma unroll
-    for (int32_t i = 0; i < 2; ++i) {
-      results_bf162[i] =
-          clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
-               __bfloat162bfloat162(fp8_max));
-    }
+      for (int32_t k = 0; k < 2; ++k) {
+        res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min),
+                      __bfloat162bfloat162(fp8_max));
+      }
+
+      *y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]);
+      y_q_ptr += WARP_SIZE * stride_yq_h;
+
+      if (!lane_id) {
+        // Store scales.
+        if constexpr (std::is_same<scale_t, uint8_t>::value) {
+          // Packed UE8MO format. Remove Mantissa.
+          *y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
+
+          bool const jump_pack = (current_group_id + 1) % 4 == 0;
+          // Minus 3 because we need to get to the first group in the
+          // next pack.
+          y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g;
 
-    auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
-    *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
-    y_q_ptr += stride_yq_t;
+        } else {
+          // float32 format
+          static_assert(std::is_same<scale_t, float>::value);
+          *y_s_ptr = y_s;
+          y_s_ptr += stride_ys_g;
+        }
 
-    if (lane_id == 0) {
-      *y_s_ptr = y_s;
-      y_s_ptr += stride_ys_t;
+        current_group_id += 1;
+      }
     }
   }
-}
 #endif
+}
 
 }  // namespace vllm
 
@@ -475,25 +597,26 @@ void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
   LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
 
-void silu_mul_fp8_quant_deep_gemm_cuda(
-    const at::Tensor& input,   // (E, T, 2*H)
-    const at::Tensor& counts,  // (E)
-    at::Tensor& y_q,           // (E, T, H) [OUT]
-    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
+void persistent_masked_m_silu_mul_quant(
+    const at::Tensor& input,              // (E, T, 2*H)
+    const at::Tensor& tokens_per_expert,  // (E)
+    at::Tensor& y_q,                      // (E, T, H) [OUT]
+    at::Tensor& y_s,                      // (E, T, H//group_size) [OUT]
+    bool cast_scale_ue8m0) {
 #ifndef USE_ROCM
-  // This kernel relies heavily on cp.async and fp8 support.
+
   // This kernel currently only supports H % 128 == 0 and assumes a
   // fixed GROUP_SIZE of 128.
+  static constexpr int GROUP_SIZE = 128;
+
   TORCH_CHECK(input.dtype() == torch::kBFloat16);
   TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
               y_q.dtype() == torch::kFloat8_e4m3fnuz);
-  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
-  TORCH_CHECK(input.size(-1) % 256 == 0);
+  TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);
 
-  // Check that num_parallel_tokens is of power of 2 and between 1 and 64.
-  TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
-  TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
+  bool const is_packed_ue8m0 =
+      (y_s.dtype() == torch::kInt32 && cast_scale_ue8m0);
+  TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0);
 
   using Idx_t = int64_t;
 
@@ -506,85 +629,107 @@ void silu_mul_fp8_quant_deep_gemm_cuda(
   Idx_t stride_yq_e = y_q.stride(0);
   Idx_t stride_yq_t = y_q.stride(1);
   Idx_t stride_yq_h = y_q.stride(2);
-  Idx_t stride_ys_e = y_s.stride(0);
-  Idx_t stride_ys_t = y_s.stride(1);
-  Idx_t stride_ys_g = y_s.stride(2);
 
-  Idx_t stride_counts_e = counts.stride(0);
+  Idx_t stride_counts_e = tokens_per_expert.stride(0);
 
-  static constexpr int GROUP_SIZE = 128;
+  int const NUM_GROUPS = H / GROUP_SIZE;
 
-  #define KERNEL_FN                                                         \
-    if (use_ue8m0) {                                                        \
-      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
-                                                NUM_PARALLEL_TOKENS, true>  \
-          <<<grid, block, 0, stream>>>(                                     \
-              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
-              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
-              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
-              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
-              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
-              stride_counts_e);                                             \
-    } else {                                                                \
-      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
-                                                NUM_PARALLEL_TOKENS, false> \
-          <<<grid, block, 0, stream>>>(                                     \
-              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
-              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
-              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
-              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
-              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
-              stride_counts_e);                                             \
-    }
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  #define KERNEL_CALL_H                                       \
-    if (H % (4 * GROUP_SIZE) == 0) {                          \
-      static constexpr int NUM_WARPS = 4;                     \
-      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
-      KERNEL_FN                                               \
-    } else {                                                  \
-      static constexpr int NUM_WARPS = 1;                     \
-      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
-      KERNEL_FN                                               \
+  // TODO: Get this from cuda_arch ?
+  static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
+
+  #define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G,  \
+                 STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES)                \
+    static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE;                 \
+    int sms = SILU_V2_BLOCK_COUNT;                                             \
+    static constexpr int max_shared_mem_bytes =                                \
+        GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2;                               \
+    dim3 grid(sms), block(THREAD_COUNT);                                       \
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));          \
+    VLLM_DISPATCH_FP8_TYPES(                                                   \
+        y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] {        \
+          vllm::silu_mul_fp8_quant_deep_gemm_kernel<                           \
+              BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \
+              Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES>                           \
+              <<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>(  \
+                  reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),          \
+                  (fp8_t*)y_q.data_ptr(),                                      \
+                  reinterpret_cast<scale_t*>(y_s.data_ptr()),                  \
+                  reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
+                  T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e,       \
+                  stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T,          \
+                  STRIDE_YS_G, STRIDE_YS_P, stride_counts_e);                  \
+        });
+
+  #define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G,         \
+                      STRIDE_YS_P, CEIL_UE8M0)                                \
+    if (H >= 4096 && (NUM_GROUPS % 8) == 0) {                                 \
+      /* 8 warp config */                                                     \
+      static constexpr int NUM_STAGES = 4;                                    \
+      static constexpr int THREAD_COUNT = 256;                                \
+      KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T,          \
+             STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \
+    } else {                                                                  \
+      /* 1 warp config */                                                     \
+      static constexpr int THREAD_COUNT = 32;                                 \
+      KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T,          \
+             STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2);          \
     }
 
-  #define KERNEL_CALL_TOP_LEVEL                      \
-    if (num_parallel_tokens == 1) {                  \
-      static constexpr int NUM_PARALLEL_TOKENS = 1;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 2) {           \
-      static constexpr int NUM_PARALLEL_TOKENS = 2;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 4) {           \
-      static constexpr int NUM_PARALLEL_TOKENS = 4;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 8) {           \
-      static constexpr int NUM_PARALLEL_TOKENS = 8;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 16) {          \
-      static constexpr int NUM_PARALLEL_TOKENS = 16; \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 32) {          \
-      static constexpr int NUM_PARALLEL_TOKENS = 32; \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 64) {          \
-      static constexpr int NUM_PARALLEL_TOKENS = 64; \
-      KERNEL_CALL_H                                  \
-    }
+  Idx_t stride_ys_e = y_s.stride(0);
+  Idx_t stride_ys_t = y_s.stride(1);
+  Idx_t stride_ys_g = y_s.stride(2);
+  Idx_t stride_ys_p = 0;
+  if (!cast_scale_ue8m0) {
+    TORCH_CHECK(!is_packed_ue8m0);
+    LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
+                false);
+    return;
+  }
 
-  Idx_t G;
-  dim3 block, grid;
-  auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
-    G = H / Idx_t(group_size * num_warps);
-    grid = dim3(E * G, _num_parallel_tokens);
-    block = dim3(num_warps * WARP_SIZE);
-  };
+  if (!is_packed_ue8m0) {
+    // UE8M0 but not packed
+    LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
+                true);
+    return;
+  }
 
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
-                          "silu_mul_fp8_quant_deep_gemm_kernel",
-                          [&] { KERNEL_CALL_TOP_LEVEL });
+  TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0);
+  TORCH_CHECK(y_s.dtype() == torch::kInt32);
+
+  // Int32 packed ue8m0 scales tensor.
+  // Let E, T, G be the number to experts, number of tokens and number of groups
+  // respectively. Let, E = 2, T = 4, G = 6, in this case the int32 scales
+  // tensor are of shape [1, 4, 2] and stride [8, 1, 4]. The scales are expected
+  // to be arranged as follows,
+  // [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X,],
+  //  [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X,]
+  //  [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X,]
+  //  [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X,]]
+  // where, TxGy is the scale ue8m0 scale value of Token x, Group y.
+  //
+  // In memory (in bytes) the scale values are arranged as,
+  //  [T0G0, T0G1, T0G2, T0G3, T1G0, T1G2, T1G3, T1G4, T2G0, T2G1, T2G3, T2G4,
+  //   T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5,
+  //   X, X, T3G4, T3G5, X, X]
+  //
+  // An Int32 tensor of size [1, 4, 2] and stride [8, 1, 4] can be represented
+  // as an uint8 tensor of shape [1, 2, 4, 4] and stride [32, 16, 4, 1]. In
+  // english, ignoring the Experts dimension, the original int32 tensor is
+  // simply treated as two packed [4, 4] uint8 tensor (or two [4, 1] int32
+  // tensor). The following strides setting reflects this change. Caveat: This
+  // means that the G dimension is no longer contiguous. i.e. Note that to move
+  // from G3 to G4, we need to jump along the packing dimension. The kernel
+  // handles this case.
+
+  stride_ys_e *= sizeof(int32_t);
+  stride_ys_p = T * sizeof(int32_t);  // Packing dimension
+  stride_ys_t = sizeof(int32_t);
+  stride_ys_g = 1;
+
+  LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
+              true);
 
 #endif
 }
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
deleted file mode 100644
index 24564efbd21be8db286d58632c1f65c2c79f81ea..0000000000000000000000000000000000000000
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ /dev/null
@@ -1,136 +0,0 @@
-#pragma once
-
-#include "scaled_mm.cuh"
-#include "cutlass_gemm_caller.cuh"
-
-/**
- * This file defines Gemm kernel configurations for SM100 (fp8) based on the
- * Gemm shape.
- */
-
-namespace vllm {
-
-using c3x::cutlass_gemm_caller;
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_default {
-  // M in (256, inf)
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_256, _128, _128>;
-  using ClusterShape = Shape<_2, _2, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M256 {
-  // M in (64, 256]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M64 {
-  // M in (16, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M16 {
-  // M in [1, 16]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _4, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
-                                            torch::Tensor const& a,
-                                            torch::Tensor const& b,
-                                            EpilogueArgs&&... args) {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-  using Cutlass3xGemmDefault =
-      typename sm100_fp8_config_default<InType, OutType,
-                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM16 =
-      typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM256 =
-      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 16) {
-    // m in [1, 16]
-    return cutlass_gemm_caller<Cutlass3xGemmM16>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 64) {
-    // m in (16, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 256) {
-    // m in (64, 256]
-    return cutlass_gemm_caller<Cutlass3xGemmM256>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (256, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
-template <template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
-                                          torch::Tensor const& a,
-                                          torch::Tensor const& b,
-                                          EpilogueArgs&&... epilogue_args) {
-  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-  if (out.dtype() == torch::kBFloat16) {
-    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
-                                           cutlass::bfloat16_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-  } else {
-    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
-                                           cutlass::half_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-  }
-}
-
-}  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
index 2c8df6144bf4d1b4a8324b1ae7d92b2009172dd8..5b007e5ea3283a04f706f78a3c89dbb43b84d8f1 100644
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "core/registration.h"
+
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>
 
@@ -418,3 +420,7 @@ void cutlass_fp4_group_mm(
       "12.8 or above.");
 #endif
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
+}
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
index 5575ee8e4197eab12a31bc37ea872021ecc8b7f0..6d69852bb4e4f9c639fd6ad1ffad197555d9e54c 100644
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -31,6 +31,13 @@
 
 namespace vllm {
 
+template <typename Int>
+__host__ __device__ inline Int round_up(Int x, Int y) {
+  static_assert(std::is_integral_v<Int>,
+                "round_up argument must be integral type");
+  return (x + y - 1) / y * y;
+}
+
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@@ -42,10 +49,21 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  int sf_m = round_up<int>(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    // Each thread writes 4 uint32_t elements.
+    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
+         col += blockDim.x * 4) {
+      SFout[row * sf_n_int + col] = 0x00;
+    }
+  }
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
@@ -64,7 +82,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
               rowIdx, colIdx, numCols, SFout);
 
       out_pos =
-          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
     }
   }
 }
diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
index a4f8a4669f1e39b0caf70b10117cfb7a044ac531..677c25a4315c6a59656bbb4e520d604719402b99 100644
--- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
+++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
@@ -159,7 +159,11 @@ void rms_norm_dynamic_per_token_quant(
   if (scale_ub.has_value()) {
     TORCH_CHECK(out.dtype() == kFp8Type);
   }
+  TORCH_CHECK(weight.dtype() == input.dtype());
   TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }
 
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {
diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh
index 5256d21598a990aec8394f07bccc270757271804..a1f68cd11108903ff4fb2063f8ff27529d8c35cf 100644
--- a/csrc/quantization/fused_kernels/quant_conversions.cuh
+++ b/csrc/quantization/fused_kernels/quant_conversions.cuh
@@ -6,7 +6,7 @@
 
 #include "quantization/vectorization.cuh"
 // TODO(luka/varun):refactor common.cuh to use this file instead
-// #include "quantization/fp8/common.cuh"
+// #include "quantization/w8a8/fp8/common.cuh"
 
 namespace vllm {
 
diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu
index 4a11059f1e98f4e509c4cdecc00b041b8ea96a08..c38228655da0aab2fdb6bfa2def6425a207d18a6 100644
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -189,7 +189,7 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel)(const half*, const uint32_t*,
                                                 const uint32_t*, const half*,
                                                 half*, const int, const int,
                                                 const int, const int,
-                                                const int*);
+                                                const bool, const int*);
 
 template <bool first_block, int m_count>
 __global__ void gemm_half_q_half_gptq_4bit_kernel(
@@ -197,12 +197,15 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, half* __restrict__ c,
     const int size_m, const int size_n, const int size_k, const int groups,
-    const int* __restrict__ b_q_perm) {
+    const bool use_v2_format, const int* __restrict__ b_q_perm) {
   MatrixView_half a_(a, size_m, size_k);
   MatrixView_half_rw c_(c, size_m, size_n);
   MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto t = threadIdx.x;
 
   // Block
@@ -260,10 +263,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
   half2 y1y16[4][2];
   b_gptq_qzeros_.item4(zeros, group, n);
   b_gptq_scales_.item4_f(scales, group, n);
-  dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
-  dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
-  dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
-  dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+  dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
+  dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
+  dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
+  dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
 
   // Column result
   float block_c[m_count][4] = {};
@@ -276,10 +279,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
       nextgroup += groupsize;
       b_gptq_qzeros_.item4(zeros, group, n);
       b_gptq_scales_.item4_f(scales, group, n);
-      dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
-      dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
-      dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
-      dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+      dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
+      dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
+      dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
+      dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
     }
 
 #pragma unroll
@@ -333,12 +336,15 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, half* __restrict__ c,
     const int size_m, const int size_n, const int size_k, const int groups,
-    const int* __restrict__ b_q_perm) {
+    const bool use_v2_format, const int* __restrict__ b_q_perm) {
   MatrixView_half a_(a, size_m, size_k);
   MatrixView_half_rw c_(c, size_m, size_n);
   MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto t = threadIdx.x;
 
   // Block
@@ -413,10 +419,10 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
       int4 load_int4 = *b_ptr4;
 
       half2 dq[4][8];
-      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
-      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
-      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
-      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
+      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
+      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
+      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
+      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
 
 #pragma unroll
       for (int m = 0; m < m_count; m++) {
@@ -452,12 +458,15 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, half* __restrict__ c,
     const int size_m, const int size_n, const int size_k, const int groups,
-    const int* __restrict__ b_q_perm) {
+    const bool use_v2_format, const int* __restrict__ b_q_perm) {
   MatrixView_half a_(a, size_m, size_k);
   MatrixView_half_rw c_(c, size_m, size_n);
   MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto t = threadIdx.x;
 
   // Block
@@ -538,13 +547,13 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
 
       half2 dq[4][16];
       dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
-                      size_n, zeros[0] + 1);
+                      size_n, zeros[0] + zero_offset);
       dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
-                      size_n, zeros[1] + 1);
+                      size_n, zeros[1] + zero_offset);
       dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
-                      size_n, zeros[2] + 1);
+                      size_n, zeros[2] + zero_offset);
       dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
-                      size_n, zeros[3] + 1);
+                      size_n, zeros[3] + zero_offset);
 
 #pragma unroll
       for (int m = 0; m < m_count; m++) {
@@ -578,12 +587,15 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, half* __restrict__ c,
     const int size_m, const int size_n, const int size_k, const int groups,
-    const int* __restrict__ b_q_perm) {
+    const bool use_v2_format, const int* __restrict__ b_q_perm) {
   MatrixView_half a_(a, size_m, size_k);
   MatrixView_half_rw c_(c, size_m, size_n);
   MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto t = threadIdx.x;
 
   // Block
@@ -662,13 +674,13 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
 
       half2 dq[4][4];
       dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
-                     zeros[0] + 1);
+                     zeros[0] + zero_offset);
       dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
-                     zeros[1] + 1);
+                     zeros[1] + zero_offset);
       dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
-                     zeros[2] + 1);
+                     zeros[2] + zero_offset);
       dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
-                     zeros[3] + 1);
+                     zeros[3] + zero_offset);
 
       for (int m = 0; m < m_count; m++) {
         block_c[m][0] =
@@ -734,7 +746,8 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
                                 const uint32_t* b_gptq_qzeros,
                                 const half* b_gptq_scales, const int* b_q_perm,
                                 half* c, int size_m, int size_n, int size_k,
-                                int m_count, int groups, int bit) {
+                                int m_count, int groups, bool use_v2_format,
+                                int bit) {
   dim3 blockDim, gridDim;
   blockDim.x = BLOCK_KN_SIZE;
   blockDim.y = 1;
@@ -747,20 +760,23 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
       pick_gemm_half_q_half_gptq_kernel(true, m_count, bit);
 
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  kernel<<<gridDim, blockDim, 0, stream>>>(a, b_q_weight, b_gptq_qzeros,
-                                           b_gptq_scales, c, size_m, size_n,
-                                           size_k, groups, b_q_perm);
+  kernel<<<gridDim, blockDim, 0, stream>>>(
+      a, b_q_weight, b_gptq_qzeros, b_gptq_scales, c, size_m, size_n, size_k,
+      groups, use_v2_format, b_q_perm);
 }
 
 __global__ void reconstruct_exllama_8bit_kernel(
     const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
-    const int groups, half* __restrict__ b) {
+    const int groups, const bool use_v2_format, half* __restrict__ b) {
   MatrixView_half_rw b_(b, size_k, size_n);
   MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
   auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 
@@ -816,13 +832,13 @@ __global__ void reconstruct_exllama_8bit_kernel(
 
       half2 dq[4][4];
       dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
-                     zeros[0] + 1);
+                     zeros[0] + zero_offset);
       dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
-                     zeros[1] + 1);
+                     zeros[1] + zero_offset);
       dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
-                     zeros[2] + 1);
+                     zeros[2] + zero_offset);
       dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
-                     zeros[3] + 1);
+                     zeros[3] + zero_offset);
 
       // half* dqh = (half*)dq;
       if (b_q_perm) {
@@ -853,11 +869,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
     const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
-    const int groups, half* __restrict__ b) {
+    const int groups, const bool use_v2_format, half* __restrict__ b) {
   MatrixView_half_rw b_(b, size_k, size_n);
   MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
   auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 
@@ -892,10 +911,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
   half2 y1y16[4][2];
   b_gptq_qzeros_.item4(zeros, group, n);
   b_gptq_scales_.item4_h2(scales, group, n);
-  dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
-  dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
-  dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
-  dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+  dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
+  dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
+  dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
+  dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
 
   __syncthreads();
 
@@ -908,10 +927,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
       nextgroup += groupsize;
       b_gptq_qzeros_.item4(zeros, group, n);
       b_gptq_scales_.item4_h2(scales, group, n);
-      dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
-      dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
-      dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
-      dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+      dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
+      dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
+      dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
+      dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
     }
 
     for (int p = 0; p < 4; p++) {
@@ -958,11 +977,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
     const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
-    const int groups, half* __restrict__ b) {
+    const int groups, const bool use_v2_format, half* __restrict__ b) {
   MatrixView_half_rw b_(b, size_k, size_n);
   MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
   auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 
@@ -1020,13 +1042,13 @@ __global__ void reconstruct_exllama_3bit_kernel(
 
       half2 dq[4][16];
       dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
-                      size_n, zeros[0] + 1);
+                      size_n, zeros[0] + zero_offset);
       dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
-                      size_n, zeros[1] + 1);
+                      size_n, zeros[1] + zero_offset);
       dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
-                      size_n, zeros[2] + 1);
+                      size_n, zeros[2] + zero_offset);
       dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
-                      size_n, zeros[3] + 1);
+                      size_n, zeros[3] + zero_offset);
 
       if (b_q_perm) {
         for (int j = 0; j < 16; j++) {
@@ -1056,11 +1078,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
     const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
     const uint32_t* __restrict__ b_gptq_qzeros,
     const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
-    const int groups, half* __restrict__ b) {
+    const int groups, const bool use_v2_format, half* __restrict__ b) {
   MatrixView_half_rw b_(b, size_k, size_n);
   MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
   MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
   auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 
@@ -1112,10 +1137,10 @@ __global__ void reconstruct_exllama_2bit_kernel(
       int4 load_int4 = *b_ptr4;
 
       half2 dq[4][8];
-      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
-      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
-      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
-      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
+      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
+      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
+      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
+      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
 
       b_ptr += size_n;
       // half* dqh = (half*)dq;
@@ -1147,7 +1172,7 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
                          const uint32_t* b_gptq_qzeros,
                          const half* b_gptq_scales, const int* b_q_perm,
                          half* out, int height, int width, int groups,
-                         int bit) {
+                         bool use_v2_format, int bit) {
   dim3 blockDim, gridDim;
   blockDim.x = BLOCK_KN_SIZE;
   blockDim.y = 1;
@@ -1166,14 +1191,14 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>(
       b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups,
-      out);
+      use_v2_format, out);
 }
 
 __global__ void gemm_half_q_half_alt_4bit_kernel(
     const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
     half* __restrict__ mul, const half* __restrict__ scales,
     const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
-    int batch, int height, int width) {
+    int batch, int height, int width, bool use_v2_format) {
   int zero_width = width / 8;
   int vec_height = height * 4;
   const int blockwidth2 = BLOCK_KN_SIZE / 2;
@@ -1183,6 +1208,9 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
   int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
   auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
   if (threadIdx.x < h_end) {
     for (int m = 0; m < b_end; ++m) {
@@ -1227,10 +1255,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
       half2 zero = __halves2half2(
           __hmul(scale_f,
                  __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) -
-                               1)),
-          __hmul(scale_f2,
-                 __int2half_rn(
-                     -((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)));
+                               zero_offset)),
+          __hmul(
+              scale_f2,
+              __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) -
+                            zero_offset)));
       scales_tmp[tmp_k] = scale;
       zeros_tmp[tmp_k] = zero;
     }
@@ -1272,7 +1301,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
     const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
     half* __restrict__ mul, const half* __restrict__ scales,
     const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
-    int batch, int height, int width) {
+    int batch, int height, int width, bool use_v2_format) {
   int zero_width = width / 4;
   int vec_height = height * 2;
   const int blockwidth2 = BLOCK_KN_SIZE / 2;
@@ -1282,6 +1311,9 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
   int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
   auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
   if (threadIdx.x < h_end) {
     for (int m = 0; m < b_end; ++m) {
@@ -1316,12 +1348,13 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
       half scale_f2 = scales[g2 * width + w];
       half2 scale = __halves2half2(scale_f, scale_f2);
       half2 zero = __halves2half2(
-          __hmul(scale_f,
-                 __int2half_rn(
-                     -((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)),
-          __hmul(scale_f2,
-                 __int2half_rn(
-                     -((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)));
+          __hmul(scale_f, __int2half_rn(
+                              -((zeros[g * zero_width + z_w] >> z_mod) & 0xff) -
+                              zero_offset)),
+          __hmul(
+              scale_f2,
+              __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) -
+                            zero_offset)));
       scales_tmp[tmp_k] = scale;
       zeros_tmp[tmp_k] = zero;
     }
@@ -1359,7 +1392,7 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
                           const uint32_t* b_gptq_qzeros,
                           const half* b_gptq_scales, const int* b_g_idx,
                           half* c, int size_m, int size_n, int size_k,
-                          int bit) {
+                          bool use_v2_format, int bit) {
   dim3 blockDim, gridDim;
   blockDim.x = BLOCK_KN_SIZE;
   blockDim.y = 1;
@@ -1376,17 +1409,15 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   kernel<<<gridDim, blockDim, 0, stream>>>(
       (const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx,
-      size_m, size_k / 32 * bit, size_n);
+      size_m, size_k / 32 * bit, size_n, use_v2_format);
 }
 
 template <class T, int bit>
-__global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
-                                        const half* __restrict__ w_scales,
-                                        const uint32_t* __restrict__ w_zeros,
-                                        const int* __restrict__ g_idx,
-                                        const int height, const int width,
-                                        const int group,
-                                        half* __restrict__ out) {
+__global__ void reconstruct_gptq_kernel(
+    const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
+    const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
+    const int height, const int width, const int group,
+    const bool use_v2_format, half* __restrict__ out) {
   // Start of block
 
   auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
@@ -1399,6 +1430,9 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
   MatrixView_half w_scales_(w_scales, group, width);
   T w_zeros_(w_zeros, group, width);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   uint32_t w_read = w[blockIdx.y * width + column];
   half* out_ptr = out_.item_ptr(row, column);
 
@@ -1406,7 +1440,7 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
   for (int s = 0; s < 32; s += bit) {
     int group = g_idx[row + s / bit];
     half w_scale = w_scales_.item(group, column);
-    uint32_t w_zero = w_zeros_.item(group, column) + 1;
+    uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
     half w_item =
         __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero),
                w_scale);
@@ -1419,7 +1453,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
     const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
     const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
     const int height, const int width, const int group,
-    half* __restrict__ out) {
+    const bool use_v2_format, half* __restrict__ out) {
   // Start of block
   auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
   auto row = blockIdx.y * 32;
@@ -1431,6 +1465,9 @@ __global__ void reconstruct_gptq_3bit_kernel(
   MatrixView_half w_scales_(w_scales, group, width);
   MatrixView_q3_row w_zeros_(w_zeros, group, width);
 
+  // GPTQv2 and GPTQv1 handles zero points differently
+  int zero_offset = use_v2_format ? 0 : 1;
+
   uint32_t w1 = w[(blockIdx.y * 3) * width + column];
   uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column];
   uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column];
@@ -1440,7 +1477,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
   for (int i = 0; i < 32; i += 1) {
     int group = g_idx[row + i];
     half w_scale = w_scales_.item(group, column);
-    uint32_t w_zero = w_zeros_.item(group, column) + 1;
+    uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
     int w_item;
     if (i == 10) {
       w_item = (w1 >> 30) | ((w2 << 2) & 0x4);
@@ -1460,7 +1497,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
 
 void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
                       const half* b_gptq_scales, const int* b_g_idx, half* out,
-                      int height, int width, int groups, int bit) {
+                      int height, int width, int groups, bool use_v2_format,
+                      int bit) {
   dim3 blockDim, gridDim;
   blockDim.x = BLOCK_KN_SIZE;
   blockDim.y = 1;
@@ -1480,7 +1518,7 @@ void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   kernel<<<gridDim, blockDim, 0, stream>>>(b_q_weight, b_gptq_scales,
                                            b_gptq_qzeros, b_g_idx, height,
-                                           width, groups, out);
+                                           width, groups, use_v2_format, out);
 }
 
 void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
@@ -1488,7 +1526,8 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
                            const uint32_t* b_gptq_qzeros,
                            const half* b_gptq_scales, const int* b_g_idx,
                            half* c, half* temp_dq, int size_m, int size_n,
-                           int size_k, int groups, bool use_exllama, int bit) {
+                           int size_k, int groups, bool use_exllama,
+                           bool use_v2_format, int bit) {
   bool use_reconstruct;
   if (use_exllama) {
     use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) ||
@@ -1502,10 +1541,10 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
     // Reconstruct FP16 matrix, then cuBLAS
     if (use_exllama) {
       reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
-                          temp_dq, size_k, size_n, groups, bit);
+                          temp_dq, size_k, size_n, groups, use_v2_format, bit);
     } else {
       reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
-                       temp_dq, size_k, size_n, groups, bit);
+                       temp_dq, size_k, size_n, groups, use_v2_format, bit);
     }
 
     const half alpha = __float2half(1.0f);
@@ -1521,18 +1560,18 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
     if (max_chunks) {
       gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
                                  b_g_idx, c, last_chunk, size_n, size_k,
-                                 BLOCK_M_SIZE_MAX, groups, bit);
+                                 BLOCK_M_SIZE_MAX, groups, use_v2_format, bit);
     }
 
     if (last_chunk_size) {
-      gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight,
-                                 b_gptq_qzeros, b_gptq_scales, b_g_idx,
-                                 c + last_chunk * size_n, last_chunk_size,
-                                 size_n, size_k, last_chunk_size, groups, bit);
+      gemm_half_q_half_cuda_part(
+          a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales,
+          b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k,
+          last_chunk_size, groups, use_v2_format, bit);
     }
   } else {
     gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
-                         c, size_m, size_n, size_k, bit);
+                         c, size_m, size_n, size_k, use_v2_format, bit);
   }
 }
 
@@ -1819,7 +1858,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int64_t bit) {
+                        bool use_exllama, bool use_v2_format, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
   auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
   at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
@@ -1837,7 +1876,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
       c.size(1),              // n
       a.size(1),              // k
       b_gptq_qzeros.size(0),  // group number
-      use_exllama, bit);
+      use_exllama, use_v2_format, bit);
   return c;
 }
 
diff --git a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
index 8ba617a9e6555a1bb46ecfd8b26a4ea4512bf952..e607107b3e77c347ee703122a6deac6157522058 100644
--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
   return out;
 }
 
-torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
-                                     c10::SymInt size_k, c10::SymInt size_n,
-                                     int64_t num_bits) {
-  int const pack_factor = 32 / num_bits;
-  auto options = torch::TensorOptions()
-                     .dtype(b_q_weight.dtype())
-                     .device(b_q_weight.device());
-  return torch::empty_symint(
-      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
-      options);
-}
-
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("awq_marlin_repack", &awq_marlin_repack);
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
-  m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
-}
diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py
index 7576e0548abe98466d67cbc531379ba348c769b1..42d3b456096ee6764df8cfda2ebf9680ca3c974c 100644
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -17,28 +17,32 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()
 
-TEMPLATE = ("template __global__ void Marlin<"
-            "{{scalar_t}}, "
-            "{{w_type_id}}, "
-            "{{s_type_id}}, "
-            "{{threads}}, "
-            "{{thread_m_blocks}}, "
-            "{{thread_n_blocks}}, "
-            "{{thread_k_blocks}}, "
-            "{{'true' if m_block_size_8 else 'false'}}, "
-            "{{stages}}, "
-            "{{group_blocks}}, "
-            "{{'true' if is_zp_float else 'false'}}>"
-            "( MARLIN_KERNEL_PARAMS );")
+TEMPLATE = (
+    "template __global__ void Marlin<"
+    "{{scalar_t}}, "
+    "{{w_type_id}}, "
+    "{{s_type_id}}, "
+    "{{threads}}, "
+    "{{thread_m_blocks}}, "
+    "{{thread_n_blocks}}, "
+    "{{thread_k_blocks}}, "
+    "{{'true' if m_block_size_8 else 'false'}}, "
+    "{{stages}}, "
+    "{{group_blocks}}, "
+    "{{'true' if is_zp_float else 'false'}}>"
+    "( MARLIN_KERNEL_PARAMS );"
+)
 
 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
-    "vllm::kFE2M1f"
+    "vllm::kU4",
+    "vllm::kU4B8",
+    "vllm::kU8B128",
+    "vllm::kFE4M3fn",
+    "vllm::kFE2M1f",
 ]
-THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
-                  (128, 64, 128)]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
 
 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
 # group_blocks:
@@ -59,11 +63,12 @@ def generate_new_kernels():
         all_template_str_list = []
 
         for group_blocks, m_blocks, thread_configs in itertools.product(
-                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
-
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
+        ):
             # act order case only support gptq-int4 and gptq-int8
             if group_blocks == 0 and scalar_type not in [
-                    "vllm::kU4B8", "vllm::kU8B128"
+                "vllm::kU4B8",
+                "vllm::kU8B128",
             ]:
                 continue
             if thread_configs[2] == 256:
@@ -93,8 +98,7 @@ def generate_new_kernels():
             c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
 
             is_zp_float_list = [False]
-            if dtype == "fp16" and scalar_type == "vllm::kU4" and \
-                    group_blocks == 4:
+            if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4:
                 # HQQ (is_zp_float = true) only supports
                 # 4bit quantization and fp16
                 is_zp_float_list.append(True)
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
index 7c2d089a70d95a7e35c0239851056a63cc4cb4af..ad80d51ece94e252b1dda02d6b21ee1169831d0b 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
   return out;
 }
 
-torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
-                                      torch::Tensor& perm, c10::SymInt size_k,
-                                      c10::SymInt size_n, int64_t num_bits) {
-  int const pack_factor = 32 / num_bits;
-  auto options = torch::TensorOptions()
-                     .dtype(b_q_weight.dtype())
-                     .device(b_q_weight.device());
-  return torch::empty_symint(
-      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
-      options);
-}
-
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("gptq_marlin_repack", &gptq_marlin_repack);
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
-  m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
-}
diff --git a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
index 5369d409f9b2181b7adcef486c17fe6bcfb1d1f7..aff11326d78e9e381d0c2e2ddf592302a61695bb 100644
--- a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
+++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
@@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
     });
 
     if (numel % 256 != 0) {
-        out = out.index({torch::indexing::Slice(0, numel / had_size)});
+        out = out.narrow(0, 0, numel / had_size);
     }
 
     if (inplace && out.data_ptr() != x.data_ptr()) {
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index 8fd536ef46e3d0970d96e30a1d45aa358811630b..8bd17ba69cec1dc2ed015fcfa175f2eb1dca92eb 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -9,23 +9,23 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Optional, Union
 
 import jinja2
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
-                                            EpilogueScheduleType,
-                                            MixedInputKernelScheduleType,
-                                            TileSchedulerTag,
-                                            TileSchedulerType, VLLMDataType,
-                                            VLLMDataTypeNames,
-                                            VLLMDataTypeSize, VLLMDataTypeTag,
-                                            VLLMDataTypeTorchDataTypeTag,
-                                            VLLMDataTypeVLLMScalarTypeTag,
-                                            VLLMKernelScheduleTag)
-
-# yapf: enable
+from vllm_cutlass_library_extension import (
+    DataType,
+    EpilogueScheduleTag,
+    EpilogueScheduleType,
+    MixedInputKernelScheduleType,
+    TileSchedulerTag,
+    TileSchedulerType,
+    VLLMDataType,
+    VLLMDataTypeNames,
+    VLLMDataTypeSize,
+    VLLMDataTypeTag,
+    VLLMDataTypeTorchDataTypeTag,
+    VLLMDataTypeVLLMScalarTypeTag,
+    VLLMKernelScheduleTag,
+)
 
 #
 #   Generator templating
@@ -258,7 +258,7 @@ class ScheduleConfig:
 @dataclass(frozen=True)
 class TypeConfig:
     a: DataType
-    b: Union[DataType, VLLMDataType]
+    b: DataType | VLLMDataType
     b_group_scale: DataType
     b_group_zeropoint: DataType
     b_channel_scale: DataType
@@ -279,25 +279,30 @@ class PrepackTypeConfig:
 class ImplConfig:
     types: TypeConfig
     schedules: list[ScheduleConfig]
-    heuristic: list[tuple[Optional[str], ScheduleConfig]]
+    heuristic: list[tuple[str | None, ScheduleConfig]]
 
 
 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
     tile_shape = (
         f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
     )
-    cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
-                     f"x{schedule_config.cluster_shape_mnk[1]}" +
-                     f"x{schedule_config.cluster_shape_mnk[2]}")
-    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
-        .split("::")[-1]
-    epilogue_schedule = EpilogueScheduleTag[
-        schedule_config.epilogue_schedule].split("::")[-1]
-    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
-        .split("::")[-1]
-
-    return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
-            f"_{epilogue_schedule}_{tile_scheduler}")
+    cluster_shape = (
+        f"{schedule_config.cluster_shape_mnk[0]}"
+        + f"x{schedule_config.cluster_shape_mnk[1]}"
+        + f"x{schedule_config.cluster_shape_mnk[2]}"
+    )
+    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
+        "::"
+    )[-1]
+    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
+        "::"
+    )[-1]
+    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
+
+    return (
+        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
+        + f"_{epilogue_schedule}_{tile_scheduler}"
+    )
 
 
 # mostly unique shorter sch_sig
@@ -316,18 +321,24 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
 
 # unique type_name
 def generate_type_signature(kernel_types: TypeConfig):
-    return str("".join([
-        VLLMDataTypeNames[getattr(kernel_types, field.name)]
-        for field in fields(TypeConfig)
-    ]))
+    return str(
+        "".join(
+            [
+                VLLMDataTypeNames[getattr(kernel_types, field.name)]
+                for field in fields(TypeConfig)
+            ]
+        )
+    )
 
 
 def generate_type_option_name(kernel_types: TypeConfig):
-    return ", ".join([
-        f"{field.name.replace('b_', 'with_')+'_type'}=" +
-        VLLMDataTypeNames[getattr(kernel_types, field.name)]
-        for field in fields(TypeConfig)
-    ])
+    return ", ".join(
+        [
+            f"{field.name.replace('b_', 'with_') + '_type'}="
+            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
+            for field in fields(TypeConfig)
+        ]
+    )
 
 
 def is_power_of_two(n):
@@ -335,7 +346,6 @@ def is_power_of_two(n):
 
 
 def to_cute_constant(value: list[int]):
-
     def _to_cute_constant(value: int):
         if is_power_of_two(value):
             return f"_{value}"
@@ -350,11 +360,11 @@ def to_cute_constant(value: list[int]):
 
 def unique_schedules(impl_configs: list[ImplConfig]):
     # Use dict over set for deterministic ordering
-    return list({
-        sch: None
-        for impl_config in impl_configs
-        for sch in impl_config.schedules
-    }.keys())
+    return list(
+        {
+            sch: None for impl_config in impl_configs for sch in impl_config.schedules
+        }.keys()
+    )
 
 
 def unsigned_type_with_bitwidth(num_bits):
@@ -380,7 +390,7 @@ template_globals = {
     "gen_type_sig": generate_type_signature,
     "unique_schedules": unique_schedules,
     "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
-    "gen_type_option_name": generate_type_option_name
+    "gen_type_option_name": generate_type_option_name,
 }
 
 
@@ -398,23 +408,28 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
 def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
     sources = []
 
-    sources.append((
-        "machete_mm_dispatch",
-        mm_dispatch_template.render(impl_configs=impl_configs),
-    ))
+    sources.append(
+        (
+            "machete_mm_dispatch",
+            mm_dispatch_template.render(impl_configs=impl_configs),
+        )
+    )
 
     prepack_types = []
     for impl_config in impl_configs:
-        convert_type = impl_config.types.a \
-             if impl_config.types.b_group_scale == DataType.void \
-             else impl_config.types.b_group_scale
+        convert_type = (
+            impl_config.types.a
+            if impl_config.types.b_group_scale == DataType.void
+            else impl_config.types.b_group_scale
+        )
         prepack_types.append(
             PrepackTypeConfig(
                 a=impl_config.types.a,
                 b_num_bits=VLLMDataTypeSize[impl_config.types.b],
                 convert=convert_type,
                 accumulator=impl_config.types.accumulator,
-            ))
+            )
+        )
 
     def prepacked_type_key(prepack_type: PrepackTypeConfig):
         # For now, we can just use the first accumulator type seen since
@@ -430,10 +445,14 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
             unique_prepack_types.append(prepack_type)
             prepack_types_seen.add(key)
 
-    sources.append((
-        "machete_prepack",
-        prepack_dispatch_template.render(types=unique_prepack_types, ),
-    ))
+    sources.append(
+        (
+            "machete_prepack",
+            prepack_dispatch_template.render(
+                types=unique_prepack_types,
+            ),
+        )
+    )
 
     # Split up impls across files
     num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@@ -466,10 +485,12 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
         curr_impl_in_file += len(files_impls[-1][-1].schedules)
 
     for part, file_impls in enumerate(files_impls):
-        sources.append((
-            f"machete_mm_impl_part{part+1}",
-            mm_impl_template.render(impl_configs=file_impls),
-        ))
+        sources.append(
+            (
+                f"machete_mm_impl_part{part + 1}",
+                mm_impl_template.render(impl_configs=file_impls),
+            )
+        )
 
     return sources
 
@@ -514,8 +535,7 @@ def generate():
     # For now we use the same heuristic for all types
     # Heuristic is currently tuned for H100s
     default_heuristic = [
-        (cond, ScheduleConfig(*tile_config,
-                              **sch_common_params))  # type: ignore
+        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
         for cond, tile_config in default_tile_heuristic_config.items()
     ]
 
@@ -541,14 +561,18 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-        for a in (DataType.f16, DataType.bf16))
+        )
+        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
+        for a in (DataType.f16, DataType.bf16)
+    )
 
     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(GPTQ_kernel_type_configs,
-                     itertools.repeat(get_unique_schedules(default_heuristic)),
-                     itertools.repeat(default_heuristic))
+        for x in zip(
+            GPTQ_kernel_type_configs,
+            itertools.repeat(get_unique_schedules(default_heuristic)),
+            itertools.repeat(default_heuristic),
+        )
     ]
 
     AWQ_kernel_type_configs = list(
@@ -561,14 +585,18 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        ) for b in (DataType.u4, DataType.u8)
-        for a in (DataType.f16, DataType.bf16))
+        )
+        for b in (DataType.u4, DataType.u8)
+        for a in (DataType.f16, DataType.bf16)
+    )
 
     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(AWQ_kernel_type_configs,
-                     itertools.repeat(get_unique_schedules(default_heuristic)),
-                     itertools.repeat(default_heuristic))
+        for x in zip(
+            AWQ_kernel_type_configs,
+            itertools.repeat(get_unique_schedules(default_heuristic)),
+            itertools.repeat(default_heuristic),
+        )
     ]
 
     # TODO: Support W4A8 when ready
diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/w8a8/cutlass/Epilogues.md
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/Epilogues.md
rename to csrc/quantization/w8a8/cutlass/Epilogues.md
diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
similarity index 96%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index dbf79a0651159cf0fdac87e331b7b9ef6f03aa6b..e7bb061ba0244a7394a13d8967abd79b4946f711 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
       } else {
         cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
             OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
             cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
             out, a, b, a_scales, b_scales);
       }
@@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
       } else {
         cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
             OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
             cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
             out, a, b, a_scales, b_scales);
       }
@@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
       } else {
           cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
               OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
+            Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
             cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
             out, a, b, a_scales, b_scales);
       }
@@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
     // TMA epilogue isn't compatible with Swap A/B
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
         out, a, b, a_scales, b_scales);
   }
 }
 
-}  // namespace vllm
+}  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
similarity index 97%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
index 147eb8efc07782adcd970e3351cd0d41ce8efeec..c40d49966271466358937e49b9b19c822f982db8 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@@ -48,7 +48,8 @@ struct cutlass_3x_gemm_fp8_blockwise {
   using ElementBlockScale = float;
 
   using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<
-        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::GMMA::Major::MN, cute::GMMA::Major::K>;
 
   using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
   using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
similarity index 90%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
index 3af59267bd60c6d4b105c061d947b610d510ba9e..2204a49257b08651884e636c517b44773210ccf9 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
@@ -25,7 +25,10 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
       if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
         int8_func(c, a, b, a_scales, b_scales, bias);
       } else {
-        TORCH_CHECK(false, "Int8 not supported for this architecture");
+        int32_t version_num = get_sm_version_num();
+        TORCH_CHECK(
+            false, "Int8 not supported on SM", version_num,
+            ". Use FP8 quantization instead, or run on older arch (SM < 100).");
       }
     }
   } else {
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
similarity index 69%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
index cf2cccc913f62d41e3b74e0e0037452faadb01e7..62aeb927ccdcb00569a57e57af77e6b16141e3a9 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
@@ -1,6 +1,5 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm100_fp8_dispatch.cuh"
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 namespace vllm {
 
@@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
   if (bias) {
     TORCH_CHECK(bias->dtype() == out.dtype(),
                 "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<true>(out, a, b, a_scales,
+                                                      b_scales, *bias);
   } else {
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<false>(out, a, b, a_scales,
+                                                       b_scales);
   }
 }
 
diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c950008b4139a276b353e1cf07f2e6a1617888b1
--- /dev/null
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -0,0 +1,318 @@
+#pragma once
+
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+/**
+ * This file defines Gemm kernel configurations for SM100 (fp8) based on the
+ * Gemm shape.
+ */
+
+namespace vllm {
+
+using c3x::cutlass_gemm_caller;
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm100_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_default {
+  // M in (256, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_256, _128, _128>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M256 {
+  // M in (64, 256]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M64_swap_ab {
+  // This config is for M in (16, 64] and K >= 4096
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _64, _256>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M64 {
+  // This config is for M = 64 and K < 4096 (do not enable swap AB in such case)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  using Cutlass3xGemm =
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M16_swap_ab {
+  // M in [1, 16]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _32, _128>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
+              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename InType, typename OutType, bool EnableBias,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            torch::Tensor const& a_scales,
+                                            torch::Tensor const& b_scales,
+                                            EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  using Cutlass3xGemmDefault =
+      typename sm100_fp8_config_default<InType, OutType,
+                                        EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16SwapAB =
+      typename sm100_fp8_config_M16_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64SwapAB =
+      typename sm100_fp8_config_M64_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64<InType, OutType, EnableBias>::Cutlass3xGemm;
+
+  using Cutlass3xGemmM256 =
+      typename sm100_fp8_config_M256<InType, OutType,
+                                     EnableBias>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const k = a.size(1);
+
+  if (m <= 16) {
+    // m in [1, 16]
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM16SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
+    // m in (16, 64]
+    if (m == 64 && k < 4096) {
+      // do not enable swap AB
+      return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64>(
+          out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+
+  } else if (m <= 256) {
+    // m in (64, 256]
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM256>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (256, inf)
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+template <bool EnableBias, typename... EpilogueArgs>
+void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales,
+                                          EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
+                                           cutlass::bfloat16_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
+                                           cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
similarity index 92%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
index 4ff3e65f2b2e143040f31a54f4886b2c1e6ae0da..b8433214be1bab57c4c426339a8edfd8abe82756 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
@@ -116,6 +116,26 @@ struct sm90_fp8_config_default {
                                ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M8192_K6144 {
+  // M >= 8192, K >= 6144
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule =
+      typename cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using TileShape = Shape<_256, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
+};
+
 template <typename InType, typename OutType, bool EnableBias>
 struct sm90_fp8_config_M128 {
   // M in (64, 128]
@@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm90_fp8_config_default<InType, OutType,
                                        EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM8192_K6144 =
+      typename sm90_fp8_config_M8192_K6144<InType, OutType,
+                                           EnableBias>::Cutlass3xGemm;
   using Cutlass3xGemmM128 =
       typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
 
@@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
 
   uint32_t const m = a.size(0);
   uint32_t const n = b.size(1);
+  uint32_t const k = a.size(1);
 
   if (m <= 16) {
     // m in [1, 16]
@@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
     // m in (64, 128]
     return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
         out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m >= 8192 && k >= 6144) {
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM8192_K6144>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   } else {
     // m in (128, inf)
     return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
rename to csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh
rename to csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
rename to csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
rename to csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu
rename to csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/moe_data.cu
rename to csrc/quantization/w8a8/cutlass/moe/moe_data.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
similarity index 98%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
index 84843ee6e094927a9d1a5ad7206b2a9479ffdb48..1001af05ff0031111b67eb81947747020bc82366 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -67,8 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
                              std::optional<torch::Tensor> const& bias);
 #endif
 
-#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
-    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
+#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 ||   \
+    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
+    defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
 void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -253,7 +254,7 @@ void cutlass_moe_mm(
     bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100) {
+  if (version_num >= 100 && version_num < 110) {
     cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                          expert_offsets, problem_sizes, a_strides, b_strides,
                          c_strides, per_act_token, per_out_ch);
@@ -261,7 +262,7 @@ void cutlass_moe_mm(
   }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90) {
+  if (version_num >= 90 && version_num < 100) {
     cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
similarity index 99%
rename from csrc/quantization/fp8/amd/quant_utils.cuh
rename to csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
index 6e3556c311c77a249791ebba094a04e0a33480a5..646c803892de1e7680ca6a06f7c977ddbb782178 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
@@ -7,7 +7,7 @@
 #include <hip/hip_bf16.h>
 #include <hip/hip_bfloat16.h>
 
-#include "../../../attention/attention_dtypes.h"
+#include "../../../../attention/attention_dtypes.h"
 
 namespace vllm {
 #ifdef USE_ROCM
diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/w8a8/fp8/common.cu
similarity index 99%
rename from csrc/quantization/fp8/common.cu
rename to csrc/quantization/w8a8/fp8/common.cu
index 45d6d5082ce4944966e2f72672eb160455e11606..7a822fb8fb8aa8b8492906e4666f6c9402079837 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/w8a8/fp8/common.cu
@@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
-#include "../../cub_helpers.h"
-#include "../vectorization_utils.cuh"
+#include "cub_helpers.h"
+#include "quantization/vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>
 
diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/w8a8/fp8/common.cuh
similarity index 100%
rename from csrc/quantization/fp8/common.cuh
rename to csrc/quantization/w8a8/fp8/common.cuh
diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
similarity index 99%
rename from csrc/quantization/fp8/nvidia/quant_utils.cuh
rename to csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
index 5361a8b1a598c20705c7dacb249a951c7eb6fd98..421e8092474bde9f41d4ea349ef11d5ab56c236b 100644
--- a/csrc/quantization/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../../../attention/attention_dtypes.h"
+#include "../../../../attention/attention_dtypes.h"
 #include <assert.h>
 #include <float.h>
 #include <stdint.h>
diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
similarity index 98%
rename from csrc/quantization/fp8/per_token_group_quant.cu
rename to csrc/quantization/w8a8/fp8/per_token_group_quant.cu
index 91d489fdef862065579de27fe645d1404c41936e..e3ab0676b254ea4afb0bd2b5333da65be3b010f6 100644
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -1,6 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>
 
-#include "../per_token_group_quant_8bit.h"
+#include "quantization/w8a8/per_token_group_quant_8bit.h"
 
 #include <cmath>
 
@@ -8,9 +8,9 @@
 
 #include <torch/all.h>
 
-#include "../vectorization.cuh"
-#include "../vectorization_utils.cuh"
-#include "../../dispatch_utils.h"
+#include "quantization/vectorization.cuh"
+#include "quantization/vectorization_utils.cuh"
+#include "dispatch_utils.h"
 
 __device__ __forceinline__ float GroupReduceMax(float val) {
   unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
@@ -212,4 +212,4 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
                                double fp8_max, bool scale_ue8m0) {
   per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
                              fp8_min, fp8_max, scale_ue8m0);
-}
+}
\ No newline at end of file
diff --git a/csrc/quantization/w8a8/int8/per_token_group_quant.cu b/csrc/quantization/w8a8/int8/per_token_group_quant.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9d808a176f538536989218baa594c742e1ee01ad
--- /dev/null
+++ b/csrc/quantization/w8a8/int8/per_token_group_quant.cu
@@ -0,0 +1,12 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/all.h>
+
+#include "quantization/w8a8/per_token_group_quant_8bit.h"
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
\ No newline at end of file
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/w8a8/int8/scaled_quant.cu
similarity index 94%
rename from csrc/quantization/compressed_tensors/int8_quant_kernels.cu
rename to csrc/quantization/w8a8/int8/scaled_quant.cu
index bcfde9fbcbbefd1f1f0e287531bcb352f4dda9c1..be8ecfeacf8c0b5d0a5dff2d39216d8d25adf1ca 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/w8a8/int8/scaled_quant.cu
@@ -1,15 +1,12 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
-
-#ifndef USE_ROCM
-  #include "../per_token_group_quant_8bit.h"
-#endif
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cmath>
 
-#include "../../cub_helpers.h"
-#include "../../dispatch_utils.h"
-#include "../vectorization_utils.cuh"
+#include "dispatch_utils.h"
+#include "quantization/vectorization_utils.cuh"
+#include "cub_helpers.h"
 
 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@@ -25,7 +22,6 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
   float dst = std::nearbyint(x);
 
   // saturate
-
   // See https://github.com/pytorch/pytorch/issues/127666
   // See https://github.com/llvm/llvm-project/issues/95183
   // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@@ -84,7 +80,6 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
       static_cast<int32_t>(std::numeric_limits<int8_t>::max());
 
   // saturate
-
   // See https://github.com/pytorch/pytorch/issues/127666
   // See https://github.com/llvm/llvm-project/issues/95183
   // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@@ -176,7 +171,6 @@ __global__ void dynamic_scaled_int8_quant_kernel(
 
   float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
 
-  // 2. quantize
   vectorize_with_alignment<16>(
       row_in, row_out, hidden_size, tid, stride,
       [=] __device__(int8_t& dst, const scalar_t& src) {
@@ -194,7 +188,6 @@ struct MinMax {
 
   __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
 
-  // add a value to the MinMax
   __host__ __device__ MinMax& operator+=(float v) {
     min = fminf(min, v);
     max = fmaxf(max, v);
@@ -228,7 +221,6 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   const scalar_t* row_in = input + token_idx * hidden_size;
   int8_t* row_out = output + token_idx * hidden_size;
 
-  // 1. calculate min & max
   MinMax thread_mm;
   vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
                                     [&] __device__(const scalar_t& src) {
@@ -261,7 +253,6 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   const float inv_s = 1.f / scale_sh;
   const azp_t azp = azp_sh;
 
-  // 2. quantize
   vectorize_with_alignment<16>(
       row_in, row_out, hidden_size, tid, stride,
       [=] __device__(int8_t& dst, const scalar_t& src) {
@@ -285,6 +276,7 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
   int const num_tokens = input.numel() / hidden_size;
   dim3 const grid(num_tokens);
   dim3 const block(std::min(hidden_size, 256));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@@ -316,6 +308,7 @@ void dynamic_scaled_int8_quant(
   int const num_tokens = input.numel() / hidden_size;
   dim3 const grid(num_tokens);
   dim3 const block(std::min(hidden_size, 256));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
@@ -332,14 +325,4 @@ void dynamic_scaled_int8_quant(
                   hidden_size);
         }
       });
-}
-
-#ifndef USE_ROCM
-void per_token_group_quant_int8(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double int8_min, double int8_max) {
-  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
-                             int8_min, int8_max);
-}
-#endif
+}
\ No newline at end of file
diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/w8a8/per_token_group_quant_8bit.h
similarity index 73%
rename from csrc/quantization/per_token_group_quant_8bit.h
rename to csrc/quantization/w8a8/per_token_group_quant_8bit.h
index 537b61bc4303f8f2a73d87f8bcbb247b5418348b..25d4ecd1131a1d7994690c67295b63de525b2518 100644
--- a/csrc/quantization/per_token_group_quant_8bit.h
+++ b/csrc/quantization/w8a8/per_token_group_quant_8bit.h
@@ -1,7 +1,6 @@
 #pragma once
 #include <torch/all.h>
 
-// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
 // 8-bit per-token-group quantization helper used by both FP8 and INT8
 void per_token_group_quant_8bit(const torch::Tensor& input,
                                 torch::Tensor& output_q,
diff --git a/csrc/quickreduce/quick_reduce.h b/csrc/quickreduce/quick_reduce.h
index 4fe4c44be7eb9942c7f22850fcd7dc693929dd79..4cc35300bf874a8d55e2962142204909f7cfa0f2 100644
--- a/csrc/quickreduce/quick_reduce.h
+++ b/csrc/quickreduce/quick_reduce.h
@@ -22,13 +22,14 @@ template <typename AllReduceKernel, typename T>
 __global__ __quickreduce_launch_bounds_two_shot__ static void
 allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
                             int rank, uint8_t** dbuffer_list,
-                            uint32_t data_offset, uint32_t flag_color) {
+                            uint32_t data_offset, uint32_t flag_color,
+                            int64_t data_size_per_phase) {
   int block = blockIdx.x;
   int grid = gridDim.x;
 
   while (block < num_blocks) {
     AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
-                         flag_color);
+                         flag_color, data_size_per_phase);
     block += grid;
     flag_color++;
   }
@@ -41,21 +42,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
     hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
                        dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                        num_blocks, rank, dbuffer_list, data_offset,         \
-                       flag_color);                                         \
+                       flag_color, this->kMaxProblemSize);                  \
   } else if (world_size == 4) {                                             \
     using LineCodec = __codec<T, 4>;                                        \
     using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
     hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
                        dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                        num_blocks, rank, dbuffer_list, data_offset,         \
-                       flag_color);                                         \
+                       flag_color, this->kMaxProblemSize);                  \
   } else if (world_size == 8) {                                             \
     using LineCodec = __codec<T, 8>;                                        \
     using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
     hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
                        dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                        num_blocks, rank, dbuffer_list, data_offset,         \
-                       flag_color);                                         \
+                       flag_color, this->kMaxProblemSize);                  \
   }
 
 enum QuickReduceQuantLevel {
diff --git a/csrc/quickreduce/quick_reduce_impl.cuh b/csrc/quickreduce/quick_reduce_impl.cuh
index 17816c552d25b19c2bbf1007ac3cb91208c68ece..38dc9938fc8a4bcb3284cc2f10ba124ba4555d19 100644
--- a/csrc/quickreduce/quick_reduce_impl.cuh
+++ b/csrc/quickreduce/quick_reduce_impl.cuh
@@ -553,13 +553,12 @@ struct AllReduceTwoshot {
       int const rank,                      // rank index
       uint8_t** __restrict__ buffer_list,  // communication buffers
       uint32_t const data_offset,          // offset to start of the data buffer
-      uint32_t flag_color) {
+      uint32_t flag_color, int64_t data_size_per_phase) {
     // Topology
     int thread = threadIdx.x + threadIdx.y * kWavefront;
     uint8_t* rank_buffer = buffer_list[rank];
     Codec codec(thread, rank);
     int block_id = blockIdx.x;
-    int grid_size = gridDim.x;
     // --------------------------------------------------------
     // Read input into registers
     int32x4_t tA[kAtoms];
@@ -588,12 +587,10 @@ struct AllReduceTwoshot {
     // rank responsible for this segment.
     uint32_t comm_data0_offset =
         data_offset + block_id * Codec::kTransmittedTileSize;
-    uint32_t comm_data1_offset =
-        grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
+    uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;
 
     uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
-    uint32_t comm_flags1_offset =
-        grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
+    uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;
 
     for (int r = 0; r < kWorldSize; r++) {
       int32x4_t* send_buffer =
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 133a545045b12473c543748b30db0a8dc0b788b9..a339c5641bb4a30db119cdc74c083cc6adfe2c61 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -23,7 +23,7 @@
 
 #include <algorithm>
 #include "../attention/dtype_fp8.cuh"
-#include "../quantization/fp8/amd/quant_utils.cuh"
+#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
 
 // ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
 #if !defined(HIP_FP8_TYPE_OCP)
@@ -40,7 +40,8 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
   #define __HIP__FP8MFMA__
 #endif
 
-#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
+#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__) || \
+                           defined(__gfx1150__) || defined(__gfx1151__))
   #define __HIP__GFX11__
 #endif
 
diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index e4600350d3ea1fdecc967b34db96b2c970e3f6d9..2ef579a1b7537631198d0bd151b0e2a3eaeb0c2b 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -11,7 +11,7 @@
 
 #include "../cuda_compat.h"
 #include "dispatch_utils.h"
-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"
 
 #if defined(__HIPCC__) && \
     (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
diff --git a/csrc/sampler.cu b/csrc/sampler.cu
index b0cce2e98d22133c2bfc287f2cbba1a485464827..b7e092ce994a8e0229e8ff160a17e779de938c1c 100644
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@@ -44,6 +44,270 @@ __global__ void apply_repetition_penalties_kernel(
   }
 }
 
+static inline __device__ uint16_t extractBinIdx(float x) {
+  union {
+    __half h;
+    uint16_t u16;
+  } tmp;
+  tmp.h = __float2half_rn(x);
+  tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
+  return 511 - (tmp.u16 >> 7);
+}
+
+template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
+__device__ void topKPerRowJob(const float* logits, const int rowStart,
+                              const int rowEnd, const int rowIdx,
+                              int* outIndices, int stride0, int stride1) {
+  // The number of elements per thread for the final top-k sort.
+  static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
+  // The class to sort the elements during the final top-k sort.
+  #ifdef USE_ROCM
+    using TopKSort = hipcub::BlockRadixSort<float, kNumThreadsPerBlock,
+                                        kNumTopKItemsPerThread, int>;
+  #else
+    using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
+                                        kNumTopKItemsPerThread, int>;    
+  #endif   
+
+  // The number of slots for the final pass.
+  static constexpr int kNumFinalItems = 3072;
+  // The number of elements per thread for the final sort.
+  static constexpr int kNumFinalItemsPerThread =
+      kNumFinalItems / kNumThreadsPerBlock;
+  // The class to sort the elements during the final pass.
+  #ifdef USE_ROCM
+    using FinalSort = hipcub::BlockRadixSort<float, kNumThreadsPerBlock,
+                                        kNumFinalItemsPerThread, int>;
+  #else
+    using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
+                                        kNumFinalItemsPerThread, int>;
+  #endif
+
+  // The class to compute the inclusive prefix-sum over the histogram.
+  #ifdef USE_ROCM
+    using Scan = hipcub::BlockScan<int, kNumThreadsPerBlock>;
+  #else
+    using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
+  #endif
+
+  // Shared memory to compute the block scan.
+  __shared__ typename Scan::TempStorage smemScan;
+
+  // The structure to store the final items (for the final pass).
+  struct FinalItems {
+    // Shared memory to store the indices for the final pass.
+    int indices[kNumFinalItems];
+    // Shared memory to store the logits for the final pass.
+    float logits[kNumFinalItems];
+  };
+
+  // Shared memory to compute the block sort.
+  __shared__ union {
+    FinalItems items;
+    typename FinalSort::TempStorage finalSort;
+    typename TopKSort::TempStorage topKSort;
+  } smemFinal;
+
+  // Shared memory to store the histogram.
+  __shared__ int smemHistogram[kNumBins];
+  // Shared memory to store the selected indices.
+  __shared__ int smemIndices[kTopK];
+  // Shared memory to store the threshold bin.
+  __shared__ int smemThresholdBinIdx[1];
+  // Shared memory counter to register the candidates for the final phase.
+  __shared__ int smemFinalDstIdx[1];
+
+  // The length of the row.
+  int rowLen = rowEnd - rowStart;
+
+  // Shortcut if the length of the row is smaller than Top-K. Indices are not
+  // sorted by their corresponding logit.
+  if (rowLen <= kTopK) {
+    for (int rowIt = threadIdx.x; rowIt < rowLen;
+         rowIt += kNumThreadsPerBlock) {
+      int idx = rowStart + rowIt;
+      outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
+    }
+    for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
+         rowIt += kNumThreadsPerBlock) {
+      outIndices[rowIdx * kTopK + rowIt] = -1;
+    }
+    return;
+  }
+
+  // Clear the histogram.
+  if (threadIdx.x < kNumBins) {
+    smemHistogram[threadIdx.x] = 0;
+  }
+
+  // Make sure the histogram is ready.
+  __syncthreads();
+
+  // Fetch elements one-by-one.
+  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
+       rowIt += kNumThreadsPerBlock) {
+    uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
+    atomicAdd(&smemHistogram[idx], 1);
+  }
+
+  // Make sure the histogram is ready.
+  __syncthreads();
+
+  // Read the values from SMEM.
+  int binCount{0};
+  if (threadIdx.x < kNumBins) {
+    binCount = smemHistogram[threadIdx.x];
+  }
+
+  // Make sure each thread has read its value.
+  __syncthreads();
+
+  // Compute the prefix sum.
+  int prefixSum{0}, totalSum{0};
+  Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
+
+  // Update the histogram with the prefix sums.
+  if (threadIdx.x < kNumBins) {
+    smemHistogram[threadIdx.x] = prefixSum;
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+  // Find the last valid bin.
+  if (threadIdx.x < kNumBins) {
+    int nextPrefixSum =
+        threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
+    if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
+      smemThresholdBinIdx[0] = threadIdx.x;
+    }
+  }
+
+  // Clear the counter to store the items for the final phase.
+  if (threadIdx.x == 0) {
+    smemFinalDstIdx[0] = 0;
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+  // The threshold bin.
+  int thresholdBinIdx = smemThresholdBinIdx[0];
+
+  // Fetch elements one-by-one and populate the shared memory buffers.
+  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
+       rowIt += kNumThreadsPerBlock) {
+    float logit = logits[rowIdx * stride0 + rowIt * stride1];
+    uint16_t idx = extractBinIdx(logit);
+    if (idx < thresholdBinIdx) {
+      int dstIdx = atomicAdd(&smemHistogram[idx], 1);
+      smemIndices[dstIdx] = rowIt;
+    } else if (idx == thresholdBinIdx) {
+      int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
+      if (dstIdx < kNumFinalItems) {
+        smemFinal.items.logits[dstIdx] = logit;
+        smemFinal.items.indices[dstIdx] = rowIt;
+      }
+    }
+  }
+
+  // Make sure the elements are in shared memory.
+  __syncthreads();
+
+  // The logits of the elements to be sorted in the final pass.
+  float finalLogits[kNumFinalItemsPerThread];
+  // The indices of the elements to be sorted in the final pass.
+  int finalIndices[kNumFinalItemsPerThread];
+
+// Init.
+#pragma unroll
+  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+    finalLogits[ii] = -FLT_MAX;
+  }
+
+// Read the elements from SMEM.
+#pragma unroll
+  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
+    if (srcIdx < smemFinalDstIdx[0]) {
+      finalLogits[ii] = smemFinal.items.logits[srcIdx];
+      finalIndices[ii] = smemFinal.items.indices[srcIdx];
+    }
+  }
+
+  // Make sure the shared memory has been read.
+  __syncthreads();
+
+  // Sort the elements.
+  FinalSort(smemFinal.finalSort)
+      .SortDescendingBlockedToStriped(finalLogits, finalIndices);
+
+  // Copy the data back to the shared memory storage.
+  int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
+#pragma unroll
+  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
+    int dstIdx = baseIdx + srcIdx;
+    if (dstIdx < kTopK) {
+      smemIndices[dstIdx] = finalIndices[ii];
+    }
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+// Store to global memory.
+#pragma unroll
+  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
+    int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
+    outIndices[offset] =
+        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
+  }
+}
+
+template <int kNumThreadsPerBlock = 512>
+static __global__ void topKPerRow(const float* logits, const int* rowStarts,
+                                  const int* rowEnds, int* outIndices,
+                                  int stride0, int stride1) {
+  // The number of bins in the histogram.
+  static constexpr int kNumBins = 512;
+
+  // The top-k width.
+  static constexpr int kTopK = 2048;
+
+  // The row computed by this block.
+  int rowIdx = blockIdx.x;
+
+  // The range of logits within the row.
+  int rowStart = rowStarts[rowIdx];
+  int rowEnd = rowEnds[rowIdx];
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
+      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+}
+
+template <int kNumThreadsPerBlock = 512>
+static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
+                                        int* outIndices, int stride0,
+                                        int stride1, int next_n) {
+  // The number of bins in the histogram.
+  static constexpr int kNumBins = 512;
+
+  // The top-k width.
+  static constexpr int kTopK = 2048;
+
+  // The row computed by this block.
+  int rowIdx = blockIdx.x;
+
+  // The range of logits within the row.
+  int rowStart = 0;
+  int seq_len = seqLens[rowIdx / next_n];
+  int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
+      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+}
+
 }  // namespace vllm
 
 void apply_repetition_penalties_(
@@ -85,4 +349,32 @@ void apply_repetition_penalties_(
                 repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
                 tile_size);
       });
-}
\ No newline at end of file
+}
+
+void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
+                          const torch::Tensor& seqLens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1) {
+  // Compute the results on the device.
+  constexpr int kNumThreadsPerBlock = 512;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  vllm::topKPerRowDecode<kNumThreadsPerBlock>
+      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
+          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+          indices.data_ptr<int>(), static_cast<int>(stride0),
+          static_cast<int>(stride1), static_cast<int>(next_n));
+}
+
+void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
+                   const torch::Tensor& rowEnds, torch::Tensor& indices,
+                   int64_t numRows, int64_t stride0, int64_t stride1) {
+  // Compute the results on the device.
+  constexpr int kNumThreadsPerBlock = 512;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  vllm::topKPerRow<kNumThreadsPerBlock>
+      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
+          logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
+          rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
+          static_cast<int>(stride0), static_cast<int>(stride1));
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 4f27fd8aecc8f68ecab2dcd212ff72634da6f6fd..43960c3f7ff8d367996332d8589e2a2c40855f00 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -33,11 +33,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif
 
 //   ops.def(
-//       "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
-//       "y_q, Tensor! y_s, int group_size, "
-//       "bool use_ue8m0, int num_parallel_tokens) -> ()");
-//   ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
-//            &silu_mul_fp8_quant_deep_gemm_cuda);
+//       "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! "
+//       "y_q, Tensor! y_s,"
+//       "bool use_ue8m0) -> ()");
+//   ops.impl("persistent_masked_m_silu_mul_quant", torch::kCUDA,
+//            &persistent_masked_m_silu_mul_quant);
 
   ops.def("weak_ref_tensor(Tensor input) -> Tensor");
   ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@@ -175,11 +175,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
 
-  // Polynomial Normalization.
+  // Function for fused QK Norm and RoPE
   ops.def(
-      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
-      "epsilon) -> ()");
-  ops.impl("poly_norm", torch::kCUDA, &poly_norm);
+      "fused_qk_norm_rope(Tensor! qkv, int num_heads_q, "
+      "int num_heads_k, int num_heads_v, int head_dim, float eps, "
+      "Tensor q_weight, Tensor k_weight, Tensor cos_sin_cache, "
+      "bool is_neox, Tensor position_ids) -> ()");
+  ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope);
 
   // Apply repetition penalties to logits in-place
   ops.def(
@@ -188,6 +190,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("apply_repetition_penalties_", torch::kCUDA,
            &apply_repetition_penalties_);
 
+  // Optimized top-k per row operation
+  ops.def(
+      "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
+      "Tensor! indices, int numRows, int stride0, "
+      "int stride1) -> ()");
+  ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
+
+  ops.def(
+      "top_k_per_row_decode(Tensor logits, int next_n, "
+      "Tensor seq_lens, Tensor! indices, int numRows, "
+      "int stride0, int stride1) -> ()");
+  ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
+
   // Layernorm-quant
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
 //   ops.def(
@@ -401,7 +416,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
       " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
       {stride_tag});
-  ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
+  // conditionally compiled so impl registration is in source file
 
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization, as well as bias
@@ -554,7 +569,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // to prevent the meta function registry.
   ops.def(
       "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
-      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
+      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
+      "use_v2_format, int bit) "
       "-> Tensor",
       {stride_tag});
   ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
@@ -607,7 +623,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor? cache_indices,"
       "Tensor? has_initial_state,"
       "Tensor! ssm_states,"
-      "int pad_slot_id) -> ()");
+      "int pad_slot_id,"
+      "int block_size,"
+      "Tensor? block_idx_first_scheduled_token,"
+      "Tensor? block_idx_last_scheduled_token,"
+      "Tensor? initial_state_idx) -> ()");
   ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
 
   // Hadamard transforms
@@ -724,6 +744,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "int quant_block_size, str kv_cache_dtype) -> ()");
   cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
                  &indexer_k_quant_and_cache);
+
+  cache_ops.def(
+      "cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
+      "dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
+  cache_ops.impl("cp_gather_indexer_k_quant_cache", torch::kCUDA,
+                 &cp_gather_indexer_k_quant_cache);
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
diff --git a/csrc/type_convert.cuh b/csrc/type_convert.cuh
index 21b9d0ae515dfe9dffd9311ce3769617bb4acb2c..2678f69e19b6c7ce83b990cf3bf06d5f263ef520 100644
--- a/csrc/type_convert.cuh
+++ b/csrc/type_convert.cuh
@@ -29,6 +29,22 @@ struct _typeConvert {
   static constexpr bool exists = false;
 };
 
+template <>
+struct _typeConvert<float> {
+  static constexpr bool exists = true;
+  using hip_type = float;
+  using packed_hip_type = float2;
+  using packed_hip_type4 = float4;  // For 128-bit vectorization
+
+  __device__ static __forceinline__ float convert(hip_type x) { return x; }
+  __device__ static __forceinline__ float2 convert(packed_hip_type x) {
+    return x;
+  }
+  __device__ static __forceinline__ float4 convert(packed_hip_type4 x) {
+    return x;
+  }
+};
+
 #if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
 // CUDA < 12.0 runs into issues with packed type conversion
 template <>
@@ -37,41 +53,44 @@ struct _typeConvert<c10::Half> {
   using hip_type = __half;
   using packed_hip_type = __half2;
 
-  __device__ static inline float convert(hip_type x) { return __half2float(x); }
-  __device__ static inline float2 convert(packed_hip_type x) {
+  __device__ static __forceinline__ float convert(hip_type x) {
+    return __half2float(x);
+  }
+  __device__ static __forceinline__ float2 convert(packed_hip_type x) {
     return __half22float2(x);
   }
-  __device__ static inline hip_type convert(float x) {
+  __device__ static __forceinline__ hip_type convert(float x) {
     return __float2half_rn(x);
   }
-  __device__ static inline packed_hip_type convert(float2 x) {
+  __device__ static __forceinline__ packed_hip_type convert(float2 x) {
     return __float22half2_rn(x);
   }
 };
 
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) || defined(USE_ROCM)
 // CUDA_ARCH < 800 does not have BF16 support
-// TODO: Add in ROCm support once public headers handle bf16 maturely
+// ROCm 7.0+ supports bfloat16
 template <>
 struct _typeConvert<c10::BFloat16> {
   static constexpr bool exists = true;
   using hip_type = __nv_bfloat16;
   using packed_hip_type = __nv_bfloat162;
 
-  __device__ static inline float convert(hip_type x) {
+  __device__ static __forceinline__ float convert(hip_type x) {
     return __bfloat162float(x);
   }
-  __device__ static inline float2 convert(packed_hip_type x) {
+  __device__ static __forceinline__ float2 convert(packed_hip_type x) {
     return __bfloat1622float2(x);
   }
-  __device__ static inline hip_type convert(float x) {
+  __device__ static __forceinline__ hip_type convert(float x) {
     return __float2bfloat16(x);
   }
-  __device__ static inline packed_hip_type convert(float2 x) {
+  __device__ static __forceinline__ packed_hip_type convert(float2 x) {
     return __float22bfloat162_rn(x);
   }
 };
-  #endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  #endif  // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) ||
+          // defined(USE_ROCM)
 #endif    // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
           // 12000))
 
@@ -95,10 +114,15 @@ struct alignas(16) _f16Vec {
     if constexpr (width % 2 == 0) {
 #pragma unroll
       for (int i = 0; i < width; i += 2) {
-        T2 temp{data[i], data[i + 1]};
-        temp += T2{other.data[i], other.data[i + 1]};
-        data[i] = temp.x;
-        data[i + 1] = temp.y;
+        if constexpr (std::is_same_v<T2, float2>) {
+          data[i] += other.data[i];
+          data[i + 1] += other.data[i + 1];
+        } else {
+          T2 temp{data[i], data[i + 1]};
+          temp += T2{other.data[i], other.data[i + 1]};
+          data[i] = temp.x;
+          data[i + 1] = temp.y;
+        }
       }
     } else {
 #pragma unroll
@@ -111,10 +135,15 @@ struct alignas(16) _f16Vec {
     if constexpr (width % 2 == 0) {
 #pragma unroll
       for (int i = 0; i < width; i += 2) {
-        T2 temp{data[i], data[i + 1]};
-        temp *= T2{other.data[i], other.data[i + 1]};
-        data[i] = temp.x;
-        data[i + 1] = temp.y;
+        if constexpr (std::is_same_v<T2, float2>) {
+          data[i] *= other.data[i];
+          data[i + 1] *= other.data[i + 1];
+        } else {
+          T2 temp{data[i], data[i + 1]};
+          temp *= T2{other.data[i], other.data[i + 1]};
+          data[i] = temp.x;
+          data[i + 1] = temp.y;
+        }
       }
     } else {
 #pragma unroll
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 79d17cd1a0a252d8ffee5b37d425216f6d65a126..964700e2a43accc9b1345de4cfc2ed2b86a5b540 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -5,7 +5,7 @@
 # docs/contributing/dockerfile/dockerfile.md and
 # docs/assets/contributing/dockerfile-stages-dependency.png
 
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12
 
 # By parameterizing the base images, we allow third-party to use their own
@@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 
-# Important: We build with an old version of Ubuntu to maintain broad 
+# Important: We build with an old version of Ubuntu to maintain broad
 # compatibility with other Linux OSes. The main reason for this is that the
 # glibc version is baked into the distro, and binaries built with one glibc
 # version are not backwards compatible with OSes that use an earlier version.
@@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=450
+ARG VLLM_MAX_SIZE_MB=500
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -273,6 +273,7 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
 ARG GDRCOPY_CUDA_VERSION=12.8
 # Keep in line with FINAL_BASE_IMAGE
 ARG GDRCOPY_OS_VERSION=Ubuntu22_04
@@ -356,65 +357,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# If we need to build FlashInfer wheel before its release:
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout v0.2.6.post1
-# $ python -m flashinfer.aot
-# $ python -m build --no-isolation --wheel
-# $ ls -la dist
-# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-
-# Install FlashInfer from source
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.3.1"
-# Flag to control whether to compile FlashInfer AOT kernels
-# Set to "true" to enable AOT compilation:
-# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
-ARG FLASHINFER_AOT_COMPILE=false
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-  . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    pushd flashinfer
-        if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-            # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-            # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-            if [[ "${CUDA_VERSION}" == 11.* ]]; then
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-            elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-            else
-                # CUDA 12.8+ supports 10.0a and 12.0
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-            fi
-            echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-            export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
-            # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
-            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
-            # Build AOT kernels
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer.aot
-            # Install with no-build-isolation since we already built AOT kernels
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                uv pip install --system --no-build-isolation . \
-                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-            # Download pre-compiled cubins
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-        else
-            echo "🏗️  Installing FlashInfer without AOT compilation in JIT mode"
-            uv pip install --system . \
-                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-        fi
-    popd
-    rm -rf flashinfer
-BASH
+# Install FlashInfer pre-compiled kernel cache and binaries
+# https://docs.flashinfer.ai/installation.html
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system flashinfer-cubin==0.5.2 \
+    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+    && flashinfer show-config
+
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
@@ -436,7 +386,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 ARG DEEPGEMM_GIT_REF
 COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} 
+    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
 
 COPY tools/install_gdrcopy.sh install_gdrcopy.sh
 RUN set -eux; \
@@ -451,9 +401,15 @@ RUN set -eux; \
 # Install EP kernels(pplx-kernels and DeepEP)
 COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
 ENV CUDA_HOME=/usr/local/cuda
-RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
+RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
     && bash install_python_libraries.sh
 
+# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
+# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
+# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
+# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
+ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
@@ -467,6 +423,7 @@ ARG PYTHON_VERSION
 
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
 
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -479,7 +436,8 @@ ENV UV_LINK_MODE=copy
 RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt; \
+        uv pip install --system -r requirements/dev.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
     fi
 
 # install development dependencies (for testing)
@@ -526,7 +484,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     else \
         BITSANDBYTES_VERSION="0.46.1"; \
     fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
+    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
@@ -539,5 +497,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
 FROM vllm-openai-base AS vllm-openai
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
 #################### OPENAI API SERVER ####################
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index b80157d2fd75c4467defdcaf88492c6002420d13..4c961defaeda2997166424c559612bb7f45cc7e2 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -13,10 +13,11 @@
 #   vllm-dev: used for development
 #
 # Build arguments:
-#   PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
+#   PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
 #   VLLM_CPU_DISABLE_AVX512=false (default)|true
 #   VLLM_CPU_AVX512BF16=false (default)|true
 #   VLLM_CPU_AVX512VNNI=false (default)|true
+#   VLLM_CPU_AMXBF16=false (default)|true
 #
 
 ######################### COMMON BASE IMAGE #########################
@@ -31,7 +32,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
     apt-get update -y \
-    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
+    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
         gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
     && curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -47,7 +48,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 ENV UV_HTTP_TIMEOUT=500
 
-# Install Python dependencies 
+# Install Python dependencies
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
@@ -79,6 +80,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
 ######################### BUILD IMAGE #########################
 FROM base AS vllm-build
 
+ARG max_jobs=32
+ENV MAX_JOBS=${max_jobs}
+
 ARG GIT_REPO_CHECK=0
 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512=0
@@ -89,6 +93,9 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
 # Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
 ARG VLLM_CPU_AVX512VNNI=0
 ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
+# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
+ARG VLLM_CPU_AMXBF16=0
+ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
 
 WORKDIR /workspace/vllm
 
@@ -104,20 +111,24 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
     --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
 
 ######################### TEST DEPS #########################
 FROM base AS vllm-test-deps
 
 WORKDIR /workspace/vllm
 
+# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
     uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/cpu-test.txt 
+    uv pip install -r requirements/cpu-test.txt
 
 ######################### DEV IMAGE #########################
 FROM vllm-build AS vllm-dev
@@ -130,12 +141,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils 
+    uv pip install -e tests/vllm_test_utils
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py develop 
+    VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 
 COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
 
@@ -160,11 +171,12 @@ ADD ./benchmarks/ ./benchmarks/
 ADD ./vllm/collect_env.py .
 ADD ./.buildkite/ ./.buildkite/
 
+# Create symlink for vllm-workspace to maintain CI compatibility
+RUN ln -sf /workspace /vllm-workspace
+
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils 
-
-ENTRYPOINT ["bash"]
+    uv pip install -e tests/vllm_test_utils
 
 ######################### RELEASE IMAGE #########################
 FROM base AS vllm-openai
@@ -176,4 +188,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
     uv pip install dist/*.whl
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 6a9c3fa7dbed832ecfc07147521cdd77ca601bc2..b88b9c49922000a1692413aa5e6a6aa34613df63 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.3.1
+# release version: v0.5.2
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.3.1 \
+    && git checkout v0.5.2 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le
index aaff240388f2cdc5af70a2b1bf39430fb2ef7cc7..ad9eae94b83dd830e38506703f1ecde17f3ddc7b 100644
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -1,4 +1,4 @@
-ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
+ARG BASE_UBI_IMAGE_TAG=9.6-1754584681
 
 ###############################################################
 # Stage to build openblas
@@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
 
 ARG MAX_JOBS
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
     && source /opt/rh/gcc-toolset-13/enable \
     && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
 FROM centos-deps-builder AS base-builder
 
 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 
 # Set Environment Variables for venv, cargo & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
        pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
        libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
        harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
-       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
+       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
     && dnf clean all \
     && PREFIX=/usr/local make -C /openblas install \
     && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
@@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
 FROM base-builder AS torch-builder
 
 ARG MAX_JOBS
-ARG TORCH_VERSION=2.6.0
+ARG TORCH_VERSION=2.7.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable &&  \
@@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     MAX_JOBS=${MAX_JOBS:-$(nproc)} \
     PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
 
-ARG TORCHVISION_VERSION=0.21.0
+ARG TORCHVISION_VERSION=0.22.0
 ARG TORCHVISION_USE_NVJPEG=0
 ARG TORCHVISION_USE_FFMPEG=0
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     BUILD_VERSION=${TORCHVISION_VERSION} \
     uv build --wheel --out-dir /torchwheels/ --no-build-isolation
 
-ARG TORCHAUDIO_VERSION=2.6.0
+ARG TORCHAUDIO_VERSION=2.7.0
 ARG BUILD_SOX=1
 ARG BUILD_KALDI=1
 ARG BUILD_RNNT=1
@@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder
 
 ARG MAX_JOBS
 ARG PYARROW_PARALLEL
-ARG PYARROW_VERSION=19.0.1
+ARG PYARROW_VERSION=21.0.0
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable && \
     git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     make install -j ${MAX_JOBS:-$(nproc)} && \
     cd ../../python/ && \
     uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
-    pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
     PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
     python setup.py build_ext \
     --build-type=release --bundle-arrow-cpp \
@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
     && make -j ${MAX_JOBS:-$(nproc)}
 
 
+###############################################################
+# Stage to build numba 
+###############################################################
+
+FROM base-builder AS numba-builder
+
+ARG MAX_JOBS
+ARG NUMBA_VERSION=0.61.2
+
+# Clone all required dependencies
+RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
+    git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
+    cd ./numba && \
+    if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
+       sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
+    fi && python -m build --wheel --installer=uv --outdir /numbawheels/
+
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null
 
 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
 # this step installs vllm and populates uv cache
 # with all the transitive dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
+    dnf install llvm15 llvm15-devel -y && \
+    rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
     source /opt/rh/gcc-toolset-13/enable && \
     git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
     uv pip install maturin && \
@@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
     --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
+    --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
     --mount=type=bind,src=.,dst=/src/,rw \
     source /opt/rh/gcc-toolset-13/enable && \
-    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
+    export PATH=$PATH:/usr/lib64/llvm15/bin && \
+    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
     sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
-    uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+    sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
+    uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
     make -C /numactl install && \
     # sentencepiece.pc is in some pkgconfig inside uv cache
     export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
-    uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
+    nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
     cd /src/ && \
     uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
     uv pip install /vllmwheel/*.whl
@@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
 
 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 
 # Set Environment Variables for venv & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
 COPY --from=openblas-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null
 
 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
     rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
     microdnf install --nodocs -y \
-    tar findutils openssl \
+    libomp tar findutils openssl llvm15 llvm15-devel \
     pkgconfig xsimd g++ gcc-fortran libsndfile \
     libtiff libjpeg openjpeg2 zlib zeromq \
     freetype lcms2 libwebp tcl tk utf8proc \
-    harfbuzz fribidi libraqm libimagequant libxcb \
+    harfbuzz fribidi libraqm libimagequant libxcb util-linux \
     python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
-    && microdnf clean all \
+    && export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv --no-cache \
     && make -C /numactl install \
@@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
     --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
     --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
-    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+    --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
+    export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
+    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+
 
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
@@ -314,4 +340,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index c8900212e5a1b857ceddba726e4ca7004dd4b32c..731a97d93da1fd049567cc5d55566fb00df8fdf7 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -12,9 +12,20 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
-# Remove sccache    
+# Remove sccache
 RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+
+# Install UV
+RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}
 
@@ -59,13 +70,15 @@ FROM base AS test
 
 RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 
-# Install vLLM
+# Install vLLM using uv (inherited from base stage)
+# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    --mount=type=cache,target=/root/.cache/uv \
     cd /install \
-    && pip install -U -r requirements/rocm.txt \
-    && pip install -U -r requirements/rocm-test.txt \
+    && uv pip install --system -r requirements/rocm.txt \
+    && uv pip install --system -r requirements/rocm-test.txt \
     && pip uninstall -y vllm \
-    && pip install *.whl
+    && uv pip install --system *.whl
 
 WORKDIR /vllm-workspace
 ARG COMMON_WORKDIR
@@ -75,7 +88,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
     && rm -rf vllm \
     && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install lm-eval[api]==0.4.4 \
     && python3 -m pip install pytest-shard
 
 # -----------------------
@@ -90,14 +102,17 @@ RUN case "$(which python3)" in \
             rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
         *) ;; esac
 
-RUN python3 -m pip install --upgrade huggingface-hub[cli]
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --upgrade huggingface-hub[cli]
 
-# Install vLLM
+# Install vLLM using uv (inherited from base stage)
+# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    --mount=type=cache,target=/root/.cache/uv \
     cd /install \
-    && pip install -U -r requirements/rocm.txt \
+    && uv pip install --system -r requirements/rocm.txt \
     && pip uninstall -y vllm \
-    && pip install *.whl
+    && uv pip install --system *.whl
 
 ARG COMMON_WORKDIR
 
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index 87fcb18b1c037d2a2695368d67ce78e1f4068b4c..df4f9b6c26e7dea0c8d1a208e0c68116d86ce19d 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -1,13 +1,13 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
-ARG TRITON_BRANCH="f9e5bf54"
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
+ARG TRITON_BRANCH="57c693b6"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
-ARG PYTORCH_BRANCH="b2fb6885"
+ARG PYTORCH_BRANCH="1c57644d"
 ARG PYTORCH_VISION_BRANCH="v0.23.0"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="2ab9f4cd"
+ARG AITER_BRANCH="59bd8ff2"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base
@@ -15,10 +15,13 @@ FROM ${BASE_IMAGE} AS base
 ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 ENV AITER_ROCM_ARCH=gfx942;gfx950
 
+# Required for RCCL in ROCm7.1
+ENV HSA_NO_SCRATCH_RECLAIM=1
+
 ARG PYTHON_VERSION=3.12
 
 RUN mkdir -p /app
@@ -141,4 +144,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
     && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
\ No newline at end of file
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x
index 9942b7626f81e77a518e830bce0b4d8e81c6e594..2d2068d3453fef9f6350e2c37b4f41ff51110b02 100644
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -14,7 +14,7 @@ ENV LANG=C.UTF-8 \
 
 # Install development utilities
 RUN microdnf install -y \
-    which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
+    which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
     libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
     openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
     clang llvm-devel llvm-static clang-devel && \
@@ -85,40 +85,15 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
     rustup default stable && \
     rustup show
 
-FROM python-install AS torch
-ARG TORCH_VERSION=2.7.0
-ENV export _GLIBCXX_USE_CXX11_ABI=1
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-
-WORKDIR /tmp
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
-    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    git clone https://github.com/pytorch/pytorch.git && \
-    cd pytorch && \
-    git checkout v2.7.0 && \
-    git submodule sync && \
-    git submodule update --init --recursive && \
-    uv pip install cmake ninja && \
-    uv pip install -r requirements.txt && \
-    python setup.py bdist_wheel
-    
-
 FROM python-install AS torch-vision
 # Install torchvision
-ARG TORCH_VERSION=2.7.0
-ARG TORCH_VISION_VERSION=v0.20.1
+ARG TORCH_VISION_VERSION=v0.23.0
 WORKDIR /tmp
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     git clone https://github.com/pytorch/vision.git && \
     cd vision && \
     git checkout $TORCH_VISION_VERSION && \
-    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
-    uv pip install -v $TORCH_WHL_FILE && \
+    uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
     python setup.py bdist_wheel
 
 FROM python-install AS hf-xet-builder
@@ -199,26 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
        sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
     fi && python setup.py bdist_wheel
-
-# Edit aws-lc-sys to support s390x
-FROM python-install AS aws-lc-sys-editor
-WORKDIR /tmp
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ARG AWS_LC_VERSION=v0.30.0
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
-    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    git clone --recursive https://github.com/aws/aws-lc-rs.git && \
-    cd aws-lc-rs && \
-    git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
-    git submodule sync && \
-    git submodule update --init --recursive && \
-    cd aws-lc-sys && \
-    sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
-    sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
-    sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
     
 # Build Outlines Core
 FROM python-install AS outlines-core-builder
@@ -226,17 +181,17 @@ WORKDIR /tmp
 ENV CARGO_HOME=/root/.cargo
 ENV RUSTUP_HOME=/root/.rustup
 ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ARG OUTLINES_CORE_VERSION=0.2.10
+COPY requirements/common.txt /tmp/requirements/common.txt
+ARG OUTLINES_CORE_VERSION
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
     --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    --mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \
+    OUTLINES_CORE_VERSION=${OUTLINES_CORE_VERSION:-$(grep -E '^outlines_core\s*==\s*[0-9.]+' /tmp/requirements/common.txt | grep -Eo '[0-9.]+')} && \
+    if [ -z "${OUTLINES_CORE_VERSION}" ]; then echo "ERROR: Could not determine outlines_core version"; exit 1; fi && \
     git clone https://github.com/dottxt-ai/outlines-core.git && \
     cd outlines-core && \
     git checkout tags/${OUTLINES_CORE_VERSION} && \
     sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
-    echo '[patch.crates-io]' >> Cargo.toml && \
-    echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
     uv pip install maturin && \
     python -m maturin build --release --out dist
 
@@ -245,13 +200,15 @@ FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION
 
 # Set correct library path for torch and numactl
-ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
 ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
 ENV UV_LINK_MODE=copy
 ENV CARGO_HOME=/root/.cargo
 ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
 ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
+ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
+ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
+ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
 
 COPY . /workspace/vllm
 WORKDIR /workspace/vllm
@@ -266,7 +223,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
     --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
-    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
     --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
     --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
@@ -274,7 +230,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
      ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
      VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
      HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
-     TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
      LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
      NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
      OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
@@ -282,7 +237,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         $ARROW_WHL_FILE  \
         $VISION_WHL_FILE \
         $HF_XET_WHL_FILE \
-        $TORCH_WHL_FILE \
         $LLVM_WHL_FILE \
         $NUMBA_WHL_FILE \
         $OUTLINES_CORE_WHL_FILE \
@@ -309,4 +263,4 @@ USER 2000
 WORKDIR /home/vllm
 
 # Set the default entrypoint
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ef422352509a97e8b93cbe6aa296c97fa961bba1..5d5b82c4fa5af8dbfe75b28f5040d3dd21370333 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
     libxext6 \
     libgl1 \
     lsb-release \
+    libaio-dev \
     numactl \
     wget \
     vim \
@@ -54,7 +55,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
-    python3 setup.py install
+    pip install --no-build-isolation .
 
 CMD ["/bin/bash"]
 
@@ -64,9 +65,14 @@ FROM vllm-base AS vllm-openai
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+# install nixl from source code
+ENV NIXL_VERSION=0.7.0
+RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip uninstall oneccl oneccl-devel -y
 
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docs/.nav.yml b/docs/.nav.yml
index c103ed476d76d68417c62765348088d63ae7cfbe..3151ea0e2ec22983dd61a3dc6657c3d6d1fcbf49 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -46,7 +46,10 @@ nav:
       - contributing/model/multimodal.md
       - contributing/model/transcription.md
     - CI: contributing/ci
-    - Design Documents: design
+    - Design Documents:
+      - Plugins:
+        - design/*plugin*.md
+      - design/*
   - API Reference:
     - api/README.md
     - api/vllm
diff --git a/docs/README.md b/docs/README.md
index ae95717def4cd81f37005601bfecdb7c80461cf1..0c279c19f96ca7da550364899b312f475c264cf7 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
 Where to get started with vLLM depends on the type of user. If you are looking to:
 
 - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
-- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
-- Build vLLM, we recommend starting with [Developer Guide](./contributing)
+- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
+- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)
 
 For information about the development of vLLM, see:
 
@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
 
diff --git a/docs/api/README.md b/docs/api/README.md
index 86e310f567dd3bc05e0e3a9e53419fe81757046c..d3a141f3273082a1c706977dc4d450b419409a8e 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.CompilationConfig][]
 - [vllm.config.VllmConfig][]
 
-[](){ #offline-inference-api }
-
 ## Offline Inference
 
 LLM Class.
@@ -45,18 +43,14 @@ Engine classes for offline and online inference.
 
 Inference parameters for vLLM APIs.
 
-[](){ #sampling-params }
-
 - [vllm.SamplingParams][]
 - [vllm.PoolingParams][]
 
-[](){ #multi-modality }
-
 ## Multi-Modality
 
 vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
 
-Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
+Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
 via the `multi_modal_data` field in [vllm.inputs.PromptType][].
 
 Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
diff --git a/docs/api/vllm/.meta.yml b/docs/api/vllm/.meta.yml
index c15adfec644cfa9a790f3e600441e153c2a81779..d105540fee792cffc6469d93040f61cbf0d76d8c 100644
--- a/docs/api/vllm/.meta.yml
+++ b/docs/api/vllm/.meta.yml
@@ -1,2 +1,2 @@
 search:
-  boost: 0.5
+  exclude: true
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index 0838bfa37fe62d60fba9adcbd18c81de0809f253..f8c104ba14259e820a5b4ddf4d872b8696d82b33 100644
Binary files a/docs/assets/contributing/dockerfile-stages-dependency.png and b/docs/assets/contributing/dockerfile-stages-dependency.png differ
diff --git a/docs/assets/contributing/load-pattern-examples.png b/docs/assets/contributing/load-pattern-examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f356dc24fa3a8e8c2b2609748dfdb8b8926d97c
Binary files /dev/null and b/docs/assets/contributing/load-pattern-examples.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-catalog.png b/docs/assets/deployment/hf-inference-endpoints-catalog.png
new file mode 100644
index 0000000000000000000000000000000000000000..a26681eec7b33bcdfddb335d31e92910062b82c1
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-catalog.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-choose-infra.png b/docs/assets/deployment/hf-inference-endpoints-choose-infra.png
new file mode 100644
index 0000000000000000000000000000000000000000..09e92ad3fc7a097034dd144f6f0c21ea74396307
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-choose-infra.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-click-deploy-button.png b/docs/assets/deployment/hf-inference-endpoints-click-deploy-button.png
new file mode 100644
index 0000000000000000000000000000000000000000..687db6e03212fbe1e7a0125221f9345a88eaa28a
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-click-deploy-button.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-configure-container.png b/docs/assets/deployment/hf-inference-endpoints-configure-container.png
new file mode 100644
index 0000000000000000000000000000000000000000..834d0dda65acccc1f011b77889dfc359ade2922a
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-configure-container.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-create-endpoint.png b/docs/assets/deployment/hf-inference-endpoints-create-endpoint.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b0d12d1caf01b1d1b07cc174fa6538164b8815
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-create-endpoint.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-locate-deploy-button.png b/docs/assets/deployment/hf-inference-endpoints-locate-deploy-button.png
new file mode 100644
index 0000000000000000000000000000000000000000..4fc6fe8eebefdd3bb7c64cc1ee1b10e9c8e9d981
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-locate-deploy-button.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-new-endpoint.png b/docs/assets/deployment/hf-inference-endpoints-new-endpoint.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2e6ad8d78bbdb695d8539ec545a92a10fd80d
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-new-endpoint.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-select-hardware.png b/docs/assets/deployment/hf-inference-endpoints-select-hardware.png
new file mode 100644
index 0000000000000000000000000000000000000000..444863b17c1c0c357f34e9dfb6026c571160351d
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-select-hardware.png differ
diff --git a/docs/assets/deployment/hf-inference-endpoints-select-model.png b/docs/assets/deployment/hf-inference-endpoints-select-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..44f66520fd12d1fbe2b48e9031e9a9c30de97933
Binary files /dev/null and b/docs/assets/deployment/hf-inference-endpoints-select-model.png differ
diff --git a/docs/assets/design/cuda_graphs/current_design.png b/docs/assets/design/cuda_graphs/current_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..045b8bbd6bfd4defb854153204cb76c5b9b1fc31
Binary files /dev/null and b/docs/assets/design/cuda_graphs/current_design.png differ
diff --git a/docs/assets/design/cuda_graphs/executor_runtime.png b/docs/assets/design/cuda_graphs/executor_runtime.png
new file mode 100644
index 0000000000000000000000000000000000000000..f8d8abe43aac11f3806aee1d040ae874c71c4953
Binary files /dev/null and b/docs/assets/design/cuda_graphs/executor_runtime.png differ
diff --git a/docs/assets/design/cuda_graphs/previous_design.png b/docs/assets/design/cuda_graphs/previous_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..db1432288a2feb278f70397691f1553425f29805
Binary files /dev/null and b/docs/assets/design/cuda_graphs/previous_design.png differ
diff --git a/docs/assets/design/cuda_graphs/wrapper_flow.png b/docs/assets/design/cuda_graphs/wrapper_flow.png
new file mode 100644
index 0000000000000000000000000000000000000000..749dc7f8bc5cc016e033f2148f59652250a09142
Binary files /dev/null and b/docs/assets/design/cuda_graphs/wrapper_flow.png differ
diff --git a/docs/assets/design/debug_vllm_compile/design_diagram.png b/docs/assets/design/debug_vllm_compile/design_diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..9dfd45ec1a3152cb28bd1996f30214d0076d9024
Binary files /dev/null and b/docs/assets/design/debug_vllm_compile/design_diagram.png differ
diff --git a/docs/assets/design/debug_vllm_compile/dynamic_shapes.png b/docs/assets/design/debug_vllm_compile/dynamic_shapes.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a018cc79c66404f89f72f13709f7d052ffafab0
Binary files /dev/null and b/docs/assets/design/debug_vllm_compile/dynamic_shapes.png differ
diff --git a/docs/assets/design/debug_vllm_compile/tlparse_inductor.png b/docs/assets/design/debug_vllm_compile/tlparse_inductor.png
new file mode 100644
index 0000000000000000000000000000000000000000..cbef753e6dd698e8f65d874c7b2388630cec5992
Binary files /dev/null and b/docs/assets/design/debug_vllm_compile/tlparse_inductor.png differ
diff --git a/docs/assets/features/disagg_encoder/disagg_encoder_flow.png b/docs/assets/features/disagg_encoder/disagg_encoder_flow.png
new file mode 100644
index 0000000000000000000000000000000000000000..2951468c11d9a55f98af4c26eda60cebebb169f4
Binary files /dev/null and b/docs/assets/features/disagg_encoder/disagg_encoder_flow.png differ
diff --git a/docs/cli/.nav.yml b/docs/cli/.nav.yml
index 6c2c09d566a3ab0209ddf27fc8f3e30adeae18c1..d2d2905703ec570c0971dcc63c649a4175cea7c4 100644
--- a/docs/cli/.nav.yml
+++ b/docs/cli/.nav.yml
@@ -5,4 +5,4 @@ nav:
   - complete.md
   - run-batch.md
   - vllm bench:
-    - bench/*.md
+    - bench/**/*.md
diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md
index 21ab13e63781a8976d9962954df5ccb512deaabe..ea7ea7321ffcdb8acc6c2a6ee97385123a4280bc 100644
--- a/docs/cli/bench/latency.md
+++ b/docs/cli/bench/latency.md
@@ -4,6 +4,6 @@
 
 --8<-- "docs/cli/json_tip.inc.md"
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/bench_latency.md"
+--8<-- "docs/argparse/bench_latency.inc.md"
diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md
index f7c415c6becb553158d89f1b282cbdda5ad98a2a..f7dc8036cc262dcd3308c62980806d68e055bf24 100644
--- a/docs/cli/bench/serve.md
+++ b/docs/cli/bench/serve.md
@@ -4,6 +4,6 @@
 
 --8<-- "docs/cli/json_tip.inc.md"
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/bench_serve.md"
+--8<-- "docs/argparse/bench_serve.inc.md"
diff --git a/docs/cli/bench/sweep/plot.md b/docs/cli/bench/sweep/plot.md
new file mode 100644
index 0000000000000000000000000000000000000000..a101330e093cc4dfd94173b6bb2382c6792542ca
--- /dev/null
+++ b/docs/cli/bench/sweep/plot.md
@@ -0,0 +1,9 @@
+# vllm bench sweep plot
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/argparse/bench_sweep_plot.inc.md"
diff --git a/docs/cli/bench/sweep/serve.md b/docs/cli/bench/sweep/serve.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0468f06fc287014c415b0c1bb3071ae767a6d01
--- /dev/null
+++ b/docs/cli/bench/sweep/serve.md
@@ -0,0 +1,9 @@
+# vllm bench sweep serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/argparse/bench_sweep_serve.inc.md"
diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md
new file mode 100644
index 0000000000000000000000000000000000000000..5642ec67eb0077fe1861cc7dbaf546b9331d94e4
--- /dev/null
+++ b/docs/cli/bench/sweep/serve_sla.md
@@ -0,0 +1,9 @@
+# vllm bench sweep serve_sla
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"
diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md
index e4ff5ce43c9cee195620699cd0776b1327a9bd05..e7f618fb4d14797042b241a77a65d092f1b383c1 100644
--- a/docs/cli/bench/throughput.md
+++ b/docs/cli/bench/throughput.md
@@ -4,6 +4,6 @@
 
 --8<-- "docs/cli/json_tip.inc.md"
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/bench_throughput.md"
+--8<-- "docs/argparse/bench_throughput.inc.md"
diff --git a/docs/cli/chat.md b/docs/cli/chat.md
index b006cb8de60d06aa83b44ff5550017578b924511..0246bd431b10170ca3d3c75f8bff787bbc1f670a 100644
--- a/docs/cli/chat.md
+++ b/docs/cli/chat.md
@@ -1,5 +1,5 @@
 # vllm chat
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/chat.md"
+--8<-- "docs/argparse/chat.inc.md"
diff --git a/docs/cli/complete.md b/docs/cli/complete.md
index 400359acf4fb8304105f54601f929d108d2249aa..eb2ffdaabac25fc97c3dbea0d6696b8b88674b8b 100644
--- a/docs/cli/complete.md
+++ b/docs/cli/complete.md
@@ -1,5 +1,5 @@
 # vllm complete
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/complete.md"
+--8<-- "docs/argparse/complete.inc.md"
diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md
index f7d401b8dad2b5e35d3226b22348c6113f5e08da..758fbda283978596cce5eb75c7406115bc92af84 100644
--- a/docs/cli/run-batch.md
+++ b/docs/cli/run-batch.md
@@ -4,6 +4,6 @@
 
 --8<-- "docs/cli/json_tip.inc.md"
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/run-batch.md"
+--8<-- "docs/argparse/run-batch.inc.md"
diff --git a/docs/cli/serve.md b/docs/cli/serve.md
index 2c8f9d320f5df6db2f4e8d97b1a45ce4e04fe225..35652fec587b3c4e2756cf6791e79a9fa16cd9f7 100644
--- a/docs/cli/serve.md
+++ b/docs/cli/serve.md
@@ -4,6 +4,6 @@
 
 --8<-- "docs/cli/json_tip.inc.md"
 
-## Options
+## Arguments
 
---8<-- "docs/argparse/serve.md"
+--8<-- "docs/argparse/serve.inc.md"
diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index a3004249b758b512fff363307f38776b7940ada5..0735f452df96058e49d174441ba77c283068d430 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -1,7 +1,19 @@
 # Meetups
 
-We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+We host regular meetups around the world. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights.
 
+## Upcoming Meetups
+
+Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_project), join our [Slack](https://slack.vllm.ai), and follow vLLM on [Luma](https://luma.com/vLLM-Meetups) to get notified about new events.
+
+## Past Meetups
+
+Below you'll find slides and recordings from our previous meetups:
+
+- [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
+- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
+- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)
+- [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
 - [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
 - [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
@@ -22,4 +34,12 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share
 - [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg)
 - [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing)
 
-We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
+## Get Involved
+
+**Want to host or speak at a vLLM meetup?** We're always looking for speakers and sponsors for our meetups. Whether you want to:
+
+- Share your vLLM feature, use case, project extension, or deployment experience
+- Host a meetup in your city
+- Sponsor an event
+
+Please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
diff --git a/docs/community/sponsors.md b/docs/community/sponsors.md
index 6ad3a662526641bbb0febbff9db4692977f81c8f..8abb07caaab62deb8b5b65c359d2b2a7a66e2287 100644
--- a/docs/community/sponsors.md
+++ b/docs/community/sponsors.md
@@ -34,6 +34,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine
 
 Slack Sponsor: Anyscale
 
diff --git a/docs/configuration/README.md b/docs/configuration/README.md
index 6a8fbc79f4aff98c5d0786c37c2e4dfbad811086..85ae642ba6dd080a34af7e47c6efb4654d80b291 100644
--- a/docs/configuration/README.md
+++ b/docs/configuration/README.md
@@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
 
 There are three main levels of configuration, from highest priority to lowest priority:
 
-- [Request parameters][completions-api] and [input arguments][sampling-params]
+- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
 - [Engine arguments](./engine_args.md)
 - [Environment variables](./env_vars.md)
diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
index efda9c8e019ebfc1441a3b30e2581aa21695670b..5ce43c7984057e434cb0366a6640fba45306ae52 100644
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
 ```python
 from vllm import LLM
 
-llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
-          tensor_parallel_size=2)
+llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
 ```
 
 !!! warning
@@ -24,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
 !!! note
     With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
 
-    You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+    You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
 ## Quantization
 
@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
 ```python
 from vllm import LLM
 
-llm = LLM(model="adept/fuyu-8b",
-          max_model_len=2048,
-          max_num_seqs=2)
+llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
 ```
 
 ## Reduce CUDA Graphs
@@ -61,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
 
     ```python
     from vllm import LLM
-    from vllm.config import CompilationConfig, CompilationLevel
+    from vllm.config import CompilationConfig, CompilationMode
 
     llm = LLM(
         model="meta-llama/Llama-3.1-8B-Instruct",
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             # By default, it goes up to max_num_seqs
             cudagraph_capture_sizes=[1, 2, 4, 8, 16],
         ),
@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
 ```python
 from vllm import LLM
 
-llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
-          enforce_eager=True)
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
 ```
 
 ## Adjust cache size
@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
 from vllm import LLM
 
 # Accept up to 3 images and 1 video per prompt
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"image": 3, "video": 1})
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"image": 3, "video": 1},
+)
 ```
 
 You can go a step further and disable unused modalities completely by setting its limit to zero.
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
 from vllm import LLM
 
 # Accept any number of images but no videos
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"video": 0})
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"video": 0},
+)
 ```
 
 You can even run a multi-modal model for text-only inference:
@@ -118,10 +118,52 @@ You can even run a multi-modal model for text-only inference:
 from vllm import LLM
 
 # Don't accept images. Just text.
-llm = LLM(model="google/gemma-3-27b-it",
-          limit_mm_per_prompt={"image": 0})
+llm = LLM(
+    model="google/gemma-3-27b-it",
+    limit_mm_per_prompt={"image": 0},
+)
 ```
 
+### Configurable options
+
+`limit_mm_per_prompt` also accepts configurable options per modality. In the configurable form, you still specify `count`, and you may optionally provide size hints that control how vLLM profiles and reserves memory for your multi‑modal inputs. This helps you tune memory for the actual media you expect, instead of the model’s absolute maxima.
+
+Configurable options by modality:
+
+- `image`: `{"count": int, "width": int, "height": int}`
+- `video`: `{"count": int, "num_frames": int, "width": int, "height": int}`
+- `audio`: `{"count": int, "length": int}`
+
+Details could be found in [`ImageDummyOptions`][vllm.config.multimodal.ImageDummyOptions], [`VideoDummyOptions`][vllm.config.multimodal.VideoDummyOptions], and [`AudioDummyOptions`][vllm.config.multimodal.AudioDummyOptions].
+
+Examples:
+
+```python
+from vllm import LLM
+
+# Up to 5 images per prompt, profile with 512x512.
+# Up to 1 video per prompt, profile with 32 frames at 640x640.
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={
+        "image": {"count": 5, "width": 512, "height": 512},
+        "video": {"count": 1, "num_frames": 32, "width": 640, "height": 640},
+    },
+)
+```
+
+For backward compatibility, passing an integer works as before and is interpreted as `{"count": <int>}`. For example:
+
+- `limit_mm_per_prompt={"image": 5}` is equivalent to `limit_mm_per_prompt={"image": {"count": 5}}`
+- You can mix formats: `limit_mm_per_prompt={"image": 5, "video": {"count": 1, "num_frames": 32, "width": 640, "height": 640}}`
+
+!!! note
+    - The size hints affect memory profiling only. They shape the dummy inputs used to compute reserved activation sizes. They do not change how inputs are actually processed at inference time.
+    - If a hint exceeds what the model can accept, vLLM clamps it to the model's effective maximum and may log a warning.
+
+!!! warning
+    These size hints currently only affect activation memory profiling. Encoder cache size is determined by the actual inputs at runtime and is not limited by these hints.
+
 ## Multi-modal processor arguments
 
 For certain models, you can adjust the multi-modal processor arguments to
@@ -133,14 +175,14 @@ Here are some examples:
 from vllm import LLM
 
 # Available for Qwen2-VL series models
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_kwargs={
-              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
-          })
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_kwargs={"max_pixels": 768 * 768},  # Default is 1280 * 28 * 28
+)
 
 # Available for InternVL series models
-llm = LLM(model="OpenGVLab/InternVL2-2B",
-          mm_processor_kwargs={
-              "max_dynamic_patch": 4,  # Default is 12
-          })
+llm = LLM(
+    model="OpenGVLab/InternVL2-2B",
+    mm_processor_kwargs={"max_dynamic_patch": 4},  # Default is 12
+)
 ```
diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index 5c74610ebd290da6f8048f9066ada4b0d22f4c0d..b0d390d7e1cbbff3372756e0568c08e3c44a048f 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -27,8 +27,6 @@ You can monitor the number of preemption requests through Prometheus metrics exp
 
 In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
 
-[](){ #chunked-prefill }
-
 ## Chunked Prefill
 
 Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
@@ -100,7 +98,7 @@ from vllm import LLM
 llm = LLM(
     model="meta-llama/Llama-3.3-70B-Instruct,
     tensor_parallel_size=4,
-    pipeline_parallel_size=2
+    pipeline_parallel_size=2,
 )
 ```
 
@@ -174,14 +172,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
 
 Known supported models (with corresponding benchmarks):
 
-- dots_ocr (<gh-pr:25466>)
-- GLM-4.1V or above (<gh-pr:23168>)
-- InternVL (<gh-pr:23909>)
-- Kimi-VL (<gh-pr:23817>)
-- Llama4 (<gh-pr:18368>)
-- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
-- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
-- Step3 (<gh-pr:22697>)
+- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
+- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
+- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
+- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
+- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
+- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
+- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
+- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)
 
 ## Input Processing
 
@@ -257,18 +255,24 @@ Examples:
 
 ```python
 # Use a larger cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_cache_gb=8)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_cache_gb=8,
+)
 
 # Use a shared-memory based IPC cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          tensor_parallel_size=2,
-          mm_processor_cache_type="shm",
-          mm_processor_cache_gb=8)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    tensor_parallel_size=2,
+    mm_processor_cache_type="shm",
+    mm_processor_cache_gb=8,
+)
 
 # Disable the cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_cache_gb=0)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_cache_gb=0,
+)
 ```
 
 ### Cache Placement
diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md
index c1cc5577bc7abda32c44d85df510d40d8ceceb3c..baaf21f01f066d3c4ddd3c4a99f0a5b8f86b8807 100644
--- a/docs/configuration/serve_args.md
+++ b/docs/configuration/serve_args.md
@@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
 ## CLI Arguments
 
 The `vllm serve` command is used to launch the OpenAI-compatible server.
-To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
+To see the available options, take a look at the [CLI Reference](../cli/README.md)!
 
 ## Configuration file
 
diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md
index e456077e04958a3b177f8d9f602e4ff060dbe6db..2d24c9c6e2e9593c312f6d845cc6aefb9291a1f9 100644
--- a/docs/configuration/tpu.md
+++ b/docs/configuration/tpu.md
@@ -4,7 +4,7 @@ This doc serves as a collection of handy tips for optimizing your vLLM on TPU wo
 
 ## Get started
 
-Looking for setup and installation instructions? Find them [here](../getting_started/installation/google_tpu.md).
+Looking for setup and installation instructions? Find them [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/).
 
 ### TPU workload sizing
 
@@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ
 
 ### Tune your workloads
 
-Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
+Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
 
 ### Future Topics We'll Cover
 
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index b0a95b3b3d3a5f8070797775fcbd72f8f1b7c6b5..9c1b5a3b66d408874e64f640a034125aa2c89b6d 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -16,13 +16,13 @@ Finally, one of the most impactful ways to support us is by raising awareness ab
 Unsure on where to start? Check out the following links for tasks to work on:
 
 - [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
-    - [Selected onboarding tasks](gh-project:6)
+    - [Selected onboarding tasks](https://github.com/orgs/vllm-project/projects/6)
 - [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22)
-    - [Models with multi-modal capabilities](gh-project:10)
+    - [Models with multi-modal capabilities](https://github.com/orgs/vllm-project/projects/10)
 
 ## License
 
-See <gh-file:LICENSE>.
+See [LICENSE](../../LICENSE).
 
 ## Developing
 
@@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
 For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
 
 !!! tip
-    vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
+    vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
 
     Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
 
@@ -83,12 +83,12 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
 
     ```bash
     pre-commit run --hook-stage manual markdownlint
-    pre-commit run --hook-stage manual mypy-3.9
+    pre-commit run --hook-stage manual mypy-3.10
     ```
 
 ### Documentation
 
-MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
+MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).
 
 Get started with:
 
@@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
 If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
 !!! important
-    If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
+    If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).
 
 ## Pull Requests & Code Reviews
 
@@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
 
 ### DCO and Signed-off-by
 
-When contributing changes to this project, you must agree to the <gh-file:DCO>.
+When contributing changes to this project, you must agree to the [DCO](../../DCO).
 Commits must include a `Signed-off-by:` header which certifies agreement with
 the terms of the DCO.
 
diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index a97d1fa6a3a552db4f4da5e86e17e6ea28ffe4d1..c9bc9cfe28a3506c082985af003d959d01ed3351 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -6,11 +6,9 @@ toc_depth: 4
 
 vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
 
-- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
-- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
-- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
-
-[Benchmark CLI]: #benchmark-cli
+- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
+- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
+- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
 
 ## Benchmark CLI
 
@@ -29,12 +27,13 @@ th {
 | Dataset | Online | Offline | Data Path |
 |---------|--------|---------|-----------|
 | ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
-| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
+| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
 | ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
 | BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
 | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
 | Random | ✅ | ✅ | `synthetic` |
 | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
+| RandomForReranking | ✅ | ✅ | `synthetic` |
 | Prefix Repetition | ✅ | ✅ | `synthetic` |
 | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
 | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
@@ -67,13 +66,13 @@ Legend:
 <details class="admonition abstract" markdown="1">
 <summary>Show more</summary>
 
-First start serving your model
+First start serving your model:
 
 ```bash
 vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```
 
-Then run the benchmarking script
+Then run the benchmarking script:
 
 ```bash
 # download dataset
@@ -87,7 +86,7 @@ vllm bench serve \
   --num-prompts 10
 ```
 
-If successful, you will see the following output
+If successful, you will see the following output:
 
 ```text
 ============ Serving Benchmark Result ============
@@ -125,7 +124,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 
 ```bash
 # start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
+vllm serve meta-llama/Llama-3.1-8B-Instruct
 ```
 
 ```bash
@@ -167,7 +166,7 @@ vllm bench serve \
 ##### InstructCoder Benchmark with Speculative Decoding
 
 ``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
     --speculative-config $'{"method": "ngram",
     "num_speculative_tokens": 5, "prompt_lookup_max": 5,
     "prompt_lookup_min": 2}'
@@ -184,7 +183,7 @@ vllm bench serve \
 ##### Spec Bench Benchmark with Speculative Decoding
 
 ``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
     --speculative-config $'{"method": "ngram",
     "num_speculative_tokens": 5, "prompt_lookup_max": 5,
     "prompt_lookup_min": 2}'
@@ -319,6 +318,73 @@ The following arguments can be used to control the ramp-up:
 - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
 - `--ramp-up-end-rps`: The request rate at the end of the benchmark.
 
+##### Load Pattern Configuration
+
+vLLM's benchmark serving script provides sophisticated load pattern simulation capabilities through three key parameters that control request generation and concurrency behavior:
+
+###### Load Pattern Control Parameters
+
+- `--request-rate`: Controls the target request generation rate (requests per second). Set to `inf` for maximum throughput testing or finite values for controlled load simulation.
+- `--burstiness`: Controls traffic variability using a Gamma distribution (range: > 0). Lower values create bursty traffic, higher values create uniform traffic.
+- `--max-concurrency`: Limits concurrent outstanding requests. If this argument is not provided, concurrency is unlimited. Set a value to simulate backpressure.
+
+These parameters work together to create realistic load patterns with carefully chosen defaults. The `--request-rate` parameter defaults to `inf` (infinite), which sends all requests immediately for maximum throughput testing. When set to finite values, it uses either a Poisson process (default `--burstiness=1.0`) or Gamma distribution for realistic request timing. The `--burstiness` parameter only takes effect when `--request-rate` is not infinite - a value of 1.0 creates natural Poisson traffic, while lower values (0.1-0.5) create bursty patterns and higher values (2.0-5.0) create uniform spacing. The `--max-concurrency` parameter defaults to `None` (unlimited) but can be set to simulate real-world constraints where a load balancer or API gateway limits concurrent connections. When combined, these parameters allow you to simulate everything from unrestricted stress testing (`--request-rate=inf`) to production-like scenarios with realistic arrival patterns and resource constraints.
+
+The `--burstiness` parameter mathematically controls request arrival patterns using a Gamma distribution where:
+
+- Shape parameter: `burstiness` value
+- Coefficient of Variation (CV): $\frac{1}{\sqrt{burstiness}}$
+- Traffic characteristics:
+    - `burstiness = 0.1`: Highly bursty traffic (CV ≈ 3.16) - stress testing
+    - `burstiness = 1.0`: Natural Poisson traffic (CV = 1.0) - realistic simulation  
+    - `burstiness = 5.0`: Uniform traffic (CV ≈ 0.45) - controlled load testing
+
+![Load Pattern Examples](../assets/contributing/load-pattern-examples.png)
+
+*Figure: Load pattern examples for each use case. Top row: Request arrival timelines showing cumulative requests over time. Bottom row: Inter-arrival time distributions showing traffic variability patterns. Each column represents a different use case with its specific parameter settings and resulting traffic characteristics.*
+
+Load Pattern Recommendations by Use Case:
+
+| Use Case           | Burstiness   | Request Rate    | Max Concurrency | Description                                               |
+| ---                | ---          | ---             | ---             | ---                                                       |
+| Maximum Throughput | N/A          | Infinite        | Limited         | **Most common**: Simulates load balancer/gateway limits with unlimited user demand |
+| Realistic Testing  | 1.0          | Moderate (5-20) | Infinite        | Natural Poisson traffic patterns for baseline performance |
+| Stress Testing     | 0.1-0.5      | High (20-100)   | Infinite        | Challenging burst patterns to test resilience             |
+| Latency Profiling  | 2.0-5.0      | Low (1-10)      | Infinite        | Uniform load for consistent timing analysis               |
+| Capacity Planning  | 1.0          | Variable        | Limited         | Test resource limits with realistic constraints           |
+| SLA Validation     | 1.0          | Target rate     | SLA limit       | Production-like constraints for compliance testing        |
+
+These load patterns help evaluate different aspects of your vLLM deployment, from basic performance characteristics to resilience under challenging traffic conditions.
+
+The **Maximum Throughput** pattern (`--request-rate=inf --max-concurrency=<limit>`) is the most commonly used configuration for production benchmarking. This simulates real-world deployment architectures where:
+
+- Users send requests as fast as they can (infinite rate)
+- A load balancer or API gateway controls the maximum concurrent connections
+- The system operates at its concurrency limit, revealing true throughput capacity
+- `--burstiness` has no effect since request timing is not controlled when rate is infinite
+
+This pattern helps determine optimal concurrency settings for your production load balancer configuration.
+
+To effectively configure load patterns, especially for **Capacity Planning** and **SLA Validation** use cases, you need to understand your system's resource limits. During startup, vLLM reports KV cache configuration that directly impacts your load testing parameters:
+
+```text
+GPU KV cache size: 15,728,640 tokens
+Maximum concurrency for 8,192 tokens per request: 1920
+```
+
+Where:
+
+- GPU KV cache size: Total tokens that can be cached across all concurrent requests
+- Maximum concurrency: Theoretical maximum concurrent requests for the given `max_model_len`
+- Calculation: `max_concurrency = kv_cache_size / max_model_len`
+
+Using KV cache metrics for load pattern configuration:
+
+- For Capacity Planning: Set `--max-concurrency` to 80-90% of the reported maximum to test realistic resource constraints
+- For SLA Validation: Use the reported maximum as your SLA limit to ensure compliance testing matches production capacity
+- For Realistic Testing: Monitor memory usage when approaching theoretical limits to understand sustainable request rates
+- Request rate guidance: Use the KV cache size to estimate sustainable request rates for your specific workload and sequence lengths
+
 </details>
 
 #### 📈 Offline Throughput Benchmark
@@ -366,7 +432,6 @@ Total num output tokens:  1280
 
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
 vllm bench throughput \
     --dataset-name=hf \
     --dataset-path=likaixin/InstructCoder \
@@ -661,8 +726,7 @@ Benchmark the performance of multi-modal requests in vLLM.
 Start vLLM:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
+vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
   --dtype bfloat16 \
   --limit-mm-per-prompt '{"image": 1}' \
   --allowed-local-media-path /path/to/sharegpt4v/images
@@ -688,8 +752,7 @@ vllm bench serve \
 Start vLLM:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
+vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
   --dtype bfloat16 \
   --limit-mm-per-prompt '{"video": 1}' \
   --allowed-local-media-path /path/to/sharegpt4video/videos
@@ -716,7 +779,7 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
 
 Notes:
 
-- Works only with online benchmark via the OpenAI  backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
+- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
 - Video sampling is not yet implemented.
 
 Start the server (example):
@@ -783,7 +846,305 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
 
 </details>
 
-[](){ #performance-benchmarks }
+#### Embedding Benchmark
+
+Benchmark the performance of embedding requests in vLLM.
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+##### Text Embeddings
+
+Unlike generative models which use Completions API or Chat Completions API,
+you should set `--backend openai-embeddings` and `--endpoint /v1/embeddings` to use the Embeddings API.
+
+You can use any text dataset to benchmark the model, such as ShareGPT.
+
+Start the server:
+
+```bash
+vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
+```
+
+Run the benchmark:
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --model jinaai/jina-embeddings-v3 \
+  --backend openai-embeddings \
+  --endpoint /v1/embeddings \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+##### Multi-modal Embeddings
+
+Unlike generative models which use Completions API or Chat Completions API,
+you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backend to use depends on the model:
+
+- CLIP: `--backend openai-embeddings-clip`
+- VLM2Vec: `--backend openai-embeddings-vlm2vec`
+
+For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.
+
+You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
+For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
+
+Serve and benchmark CLIP:
+
+```bash
+# Run this in another process
+vllm serve openai/clip-vit-base-patch32
+
+# Run these one by one after the server is up
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --model openai/clip-vit-base-patch32 \
+  --backend openai-embeddings-clip \
+  --endpoint /v1/embeddings \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json
+
+vllm bench serve \
+  --model openai/clip-vit-base-patch32 \
+  --backend openai-embeddings-clip \
+  --endpoint /v1/embeddings \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat
+```
+
+Serve and benchmark VLM2Vec:
+
+```bash
+# Run this in another process
+vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
+  --trust-remote-code \
+  --chat-template examples/template_vlm2vec_phi3v.jinja
+
+# Run these one by one after the server is up
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --model TIGER-Lab/VLM2Vec-Full \
+  --backend openai-embeddings-vlm2vec \
+  --endpoint /v1/embeddings \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json
+
+vllm bench serve \
+  --model TIGER-Lab/VLM2Vec-Full \
+  --backend openai-embeddings-vlm2vec \
+  --endpoint /v1/embeddings \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat
+```
+
+</details>
+
+#### Reranker Benchmark
+
+Benchmark the performance of rerank requests in vLLM.
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Unlike generative models which use Completions API or Chat Completions API,
+you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
+
+For reranking, the only supported dataset is `--dataset-name random-rerank`
+
+Start the server:
+
+```bash
+vllm serve BAAI/bge-reranker-v2-m3
+```
+
+Run the benchmark:
+
+```bash
+vllm bench serve \
+  --model BAAI/bge-reranker-v2-m3 \
+  --backend vllm-rerank \
+  --endpoint /v1/rerank \
+  --dataset-name random-rerank \
+  --tokenizer BAAI/bge-reranker-v2-m3 \
+  --random-input-len 512 \
+  --num-prompts 10 \
+  --random-batch-size 5
+```
+
+For reranker models, this will create `num_prompts / random_batch_size` requests with
+`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
+In the example above, this results in 2 rerank requests with 5 "documents" each where
+each document has close to 512 tokens.
+
+Please note that the `/v1/rerank` is also supported by embedding models. So if you're running
+with an embedding model, also set `--no_reranker`. Because in this case the query is
+treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
+to account for the extra prompt which is the query. The token accounting to report the
+throughput numbers correctly is also adjusted.
+
+</details>
+
+## Parameter Sweeps
+
+### Online Benchmark
+
+[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
+
+Follow these steps to run the script:
+
+1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
+2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
+3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
+
+    - Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
+
+    ```json
+    [
+        {
+            "max_num_seqs": 32,
+            "max_num_batched_tokens": 1024
+        },
+        {
+            "max_num_seqs": 64,
+            "max_num_batched_tokens": 1024
+        },
+        {
+            "max_num_seqs": 64,
+            "max_num_batched_tokens": 2048
+        },
+        {
+            "max_num_seqs": 128,
+            "max_num_batched_tokens": 2048
+        },
+        {
+            "max_num_seqs": 128,
+            "max_num_batched_tokens": 4096
+        },
+        {
+            "max_num_seqs": 256,
+            "max_num_batched_tokens": 4096
+        }
+    ]
+    ```
+
+4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
+
+    - Example: Using different input/output lengths for random dataset:
+
+    ```json
+    [
+        {
+            "random_input_len": 128,
+            "random_output_len": 32
+        },
+        {
+            "random_input_len": 256,
+            "random_output_len": 64
+        },
+        {
+            "random_input_len": 512,
+            "random_output_len": 128
+        }
+    ]
+    ```
+
+5. Determine where you want to save the results, and pass that to `--output-dir`.
+
+Example command:
+
+```bash
+vllm bench sweep serve \
+    --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
+    --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
+    --serve-params benchmarks/serve_hparams.json \
+    --bench-params benchmarks/bench_hparams.json \
+    -o benchmarks/results
+```
+
+!!! important
+    If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
+    You can use `--dry-run` to preview the commands to be run.
+
+    We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
+    Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
+    In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
+
+!!! note
+    By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
+
+!!! tip
+    You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
+  
+### SLA Auto-Tuner
+
+[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
+
+For example, to ensure E2E latency within different target values for 99% of requests:
+
+```json
+[
+    {
+        "p99_e2el_ms": "<=200"
+    },
+    {
+        "p99_e2el_ms": "<=500"
+    },
+    {
+        "p99_e2el_ms": "<=1000"
+    },
+    {
+        "p99_e2el_ms": "<=2000"
+    }
+]
+```
+
+Example command:
+
+```bash
+vllm bench sweep serve_sla \
+    --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
+    --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
+    --serve-params benchmarks/serve_hparams.json \
+    --bench-params benchmarks/bench_hparams.json \
+    --sla-params benchmarks/sla_hparams.json \
+    --sla-variable max_concurrency \
+    -o benchmarks/results
+```
+
+The algorithm for adjusting the SLA variable is as follows:
+
+1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
+    - For example, the initial request rate is set to the concurrency under infinite QPS.
+2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
+3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
+
+!!! important
+    SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
+
+    For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
+
+### Visualizer
+
+[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
+
+Example command:
+
+```bash
+vllm bench sweep plot benchmarks/results/<timestamp> \
+    --var-x max_concurrency \
+    --row-by random_input_len \
+    --col-by random_output_len \
+    --curve-by api_server_count,max_num_batched_tokens \
+    --filter-by 'max_concurrency<=1024'
+```
+
+!!! tip
+    You can use `--dry-run` to preview the figures to be plotted.
 
 ## Performance Benchmarks
 
@@ -803,7 +1164,7 @@ docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingf
 Then, run below command inside the docker instance.
 
 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 
 When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
@@ -821,14 +1182,28 @@ For more results visualization, check the [visualizing the results](https://gith
 
 The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
 
-More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
+More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
+
+### Continuous Benchmarking
+
+The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
+
+#### How It Works
+
+The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
+
+- **Serving tests**: Measure request handling and API performance
+- **Throughput tests**: Evaluate token generation rates
+- **Latency tests**: Assess response time characteristics
 
-[](){ #nightly-benchmarks }
+#### Benchmark Configuration
 
-## Nightly Benchmarks
+The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
 
-These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
+1. Navigate to the appropriate GPU directory in the benchmarks configuration
+2. Add your model specifications to the corresponding configuration files
+3. The new models will be included in the next scheduled benchmark run
 
-The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
+#### Viewing Results
 
-More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
+All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
diff --git a/docs/contributing/ci/failures.md b/docs/contributing/ci/failures.md
index d7e2dfbca87609c0bc30086391baf91c8a1fbb4e..dad04e75fbb619f1096fd92f094d5fd0fd143ac5 100644
--- a/docs/contributing/ci/failures.md
+++ b/docs/contributing/ci/failures.md
@@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
 
 Strip timestamps and colorization:
 
-<gh-file:.buildkite/scripts/ci-clean-log.sh>
+[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
 
 ```bash
 ./ci-clean-log.sh ci.log
@@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
 
 CI test failures may be flaky. Use a bash loop to run repeatedly:
 
-<gh-file:.buildkite/scripts/rerun-test.sh>
+[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
 
 ```bash
 ./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md
index 3dae62dd5d944c1450bb227768184af990a78eb3..09fd85a466eed9dc2b9fa5bf4b7c0a437ddf1f48 100644
--- a/docs/contributing/ci/update_pytorch_version.md
+++ b/docs/contributing/ci/update_pytorch_version.md
@@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
 PyTorch version as early as possible when a new [PyTorch stable
 release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
 This process is non-trivial due to the gap between PyTorch
-releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
+releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
 update along with a list of potential issues and how to address them.
 
 ## Test PyTorch release candidates (RCs)
@@ -85,9 +85,9 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
 it doesn't populate the cache, so re-running it to warm up the cache
 is ineffective.
 
-While ongoing efforts like [#17419](gh-issue:17419)
+While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
 address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
-to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
+to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
 when manually triggering a build on Buildkite. This branch accomplishes two things:
 
 1. Increase the timeout limit to 10 hours so that the build doesn't time out.
@@ -95,40 +95,22 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin
 to warm it up so that future builds are faster.
 
 <p align="center" width="100%">
-    <img width="60%" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
+    <img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
 </p>
 
 ## Update dependencies
 
-Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need
+Several vLLM dependencies like xFormers depend on PyTorch and need
 to be updated accordingly. Rather than waiting for all of them to publish new
 releases (which would take too much time), they can be built from
 source to unblock the update process.
 
-### FlashInfer
-
-Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
-
-```bash
-export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
-export FLASHINFER_ENABLE_SM90=1
-uv pip install --system \
-    --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
-```
-
-One caveat is that building FlashInfer from source adds approximately 30
-minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
-public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
-team if you want to get the package published there.
-
 ### xFormers
 
-Similar to FlashInfer, here is how to build and install xFormers from source:
-
 ```bash
-export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
 MAX_JOBS=16 uv pip install --system \
-    --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
+    --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
 ```
 
 ## Update all the different vLLM platforms
@@ -138,5 +120,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
 for different platforms in vLLM CI/CD allows us to selectively choose
 which platforms to update. For instance, updating XPU requires the corresponding
 release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
-While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
-<gh-pr:17444> completed the update for XPU.
+While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
+<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.
diff --git a/docs/contributing/dockerfile/dockerfile.md b/docs/contributing/dockerfile/dockerfile.md
index a7ff99aa26d5409e64000895302eedc7c5540398..14184b96936613d878f0235da3e43d96530eb487 100644
--- a/docs/contributing/dockerfile/dockerfile.md
+++ b/docs/contributing/dockerfile/dockerfile.md
@@ -1,6 +1,6 @@
 # Dockerfile
 
-We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
+We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
 More information about deploying with Docker can be found [here](../../deployment/docker.md).
 
 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md
index 36068bc14876be25667db1c62e752d5c5bc95949..13f3edb7e1af1d9985d02d5a3d0070abe3633bcb 100644
--- a/docs/contributing/model/README.md
+++ b/docs/contributing/model/README.md
@@ -1,7 +1,7 @@
 # Summary
 
 !!! important
-    Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
+    Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
 
 vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
 
diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index aafdb1058e03c087add4811a1ff1e672ef4056f5..a7b54f015c2da8a1ecf043dc7fab365b10846c35 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
 ## 1. Bring your model code
 
 First, clone the PyTorch model code from the source repository.
-For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
+For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
 HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
 
 !!! warning
@@ -56,13 +56,13 @@ The initialization code should look like this:
 
 ### Computation Code
 
-- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model.
+- Add a `embed_input_ids` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model.
 
 ```python
 class MyModel(nn.Module):
         ...
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         ... 
 ```
 
@@ -73,8 +73,8 @@ def forward(
     self,
     input_ids: torch.Tensor,
     positions: torch.Tensor,
-    intermediate_tensors: Optional[IntermediateTensors] = None,
-    inputs_embeds: Optional[torch.Tensor] = None,
+    intermediate_tensors: IntermediateTensors | None = None,
+    inputs_embeds: torch.Tensor | None = None,
 ) -> torch.Tensor:
     ...
 ```
@@ -83,7 +83,7 @@ def forward(
     Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
     If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
 
-For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
+For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
 
 ## 3. (Optional) Implement tensor parallelism and quantization support
 
@@ -130,22 +130,22 @@ We consider 3 different scenarios:
 2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
 3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
 
-For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
+For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
 The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
-For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
+For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
 Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
 V0-only classes and code will be removed in the very near future.
-The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
+The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
 
-For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
+For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
 These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
 
-For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
+For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
 Please follow the same guidelines as case (2) for implementing these models.
 We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
 It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
-Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
+Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
 Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it.
-Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
-The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
+Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
+The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md
index 87d34d207cde36bed3cd6c502db14654081e191b..c2ca199220a1a7947b57ae069585857d59887855 100644
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@@ -16,7 +16,7 @@ Further update the model as follows:
             ...
 
             @classmethod
-            def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+            def get_placeholder_str(cls, modality: str, i: int) -> str | None:
                 if modality.startswith("image"):
                     return "<image>"
 
@@ -36,7 +36,7 @@ Further update the model as follows:
   
   More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it.
 
-- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
+- Implement [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
 
     ??? code
 
@@ -45,14 +45,14 @@ Further update the model as follows:
             ...
 
             def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-
                 assert self.vision_encoder is not None
                 image_features = self.vision_encoder(image_input)
                 return self.multi_modal_projector(image_features)
 
-            def get_multimodal_embeddings(
-                    self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+            def embed_multimodal(
+                self,
+                **kwargs: object,
+            ) -> MultiModalEmbeddings | None:
                 # Validate the multimodal input keyword arguments
                 image_input = self._parse_and_validate_image_input(**kwargs)
                 if image_input is None:
@@ -66,35 +66,12 @@ Further update the model as follows:
 !!! important
     The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
 
-- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
-
-    ??? code
-
-        ```python
-        from .utils import merge_multimodal_embeddings
-
-        class YourModelForImage2Seq(nn.Module):
-            ...
+!!! note
+    By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in
+    [PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing.
+    This logic can be found at [embed_input_ids][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids].
 
-            def get_input_embeddings(
-                self,
-                input_ids: torch.Tensor,
-                multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-            ) -> torch.Tensor:
-
-                # `get_input_embeddings` should already be implemented for the language 
-                # model as one of the requirements of basic vLLM model implementation.
-                inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-                if multimodal_embeddings is not None:
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids=input_ids, 
-                        inputs_embeds=inputs_embeds, 
-                        multimodal_embeddings=multimodal_embeddings,
-                        placeholder_token_id=self.config.image_token_index)
-
-                return inputs_embeds
-        ```
+    You may override this method if additional logic is required for your model when merging embeddings. 
 
 - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
 
@@ -133,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
 For example, if the model supports any number of images but only one video per prompt:
 
 ```python
-def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+def get_supported_mm_limits(self) -> Mapping[str, int | None]:
     return {"image": None, "video": 1}
 ```
 
@@ -281,17 +258,21 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
             self,
             seq_len: int,
             mm_counts: Mapping[str, int],
+            mm_options: Mapping[str, BaseDummyOptions] | None = None,
         ) -> MultiModalDataDict:
             num_images = mm_counts.get("image", 0)
 
             target_width, target_height = \
                 self.info.get_image_size_with_most_features()
 
+            image_overrides = mm_options.get("image") if mm_options else None
+
             return {
                 "image":
                 self._get_dummy_images(width=target_width,
                                     height=target_height,
-                                    num_images=num_images)
+                                    num_images=num_images,
+                                    overrides=image_overrides)
             }
         ```
 
@@ -440,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
     ```python
     def get_image_size_with_most_features(self) -> ImageSize:
         image_processor = self.get_image_processor()
-        return ImageSize(width=image_processor.size["width"],
-                            height=image_processor.size["height"])
+        return ImageSize(
+            width=image_processor.size["width"],
+            height=image_processor.size["height"],
+        )
     ```
 
     Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -461,16 +444,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
             self,
             seq_len: int,
             mm_counts: Mapping[str, int],
+            mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
         ) -> MultiModalDataDict:
             target_width, target_height = \
                 self.info.get_image_size_with_most_features()
             num_images = mm_counts.get("image", 0)
 
+            image_overrides = mm_options.get("image") if mm_options else None
+
             return {
                 "image":
-                self._get_dummy_images(width=target_width,
-                                    height=target_height,
-                                    num_images=num_images)
+                self._get_dummy_images(
+                    width=target_width,
+                    height=target_height,
+                    num_images=num_images,
+                    overrides=image_overrides,
+                )
             }
         ```
 
@@ -518,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
     ```
 
     !!! note
-        Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
+        Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
         pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
 
 === "With postprocessing: Fuyu"
@@ -580,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
         ```
 
     !!! note
-        Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
+        Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
         for text-only inputs to prevent unnecessary warnings from HF processor.
 
     !!! note
@@ -759,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
-            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                            [_NEWLINE_TOKEN_ID]) * nrows
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
             return PromptUpdateDetails.select_token_id(
                 image_tokens + [bos_token_id],
@@ -796,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                     image_width=image_size.width,
                     image_height=image_size.height,
                 )
-                image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                                [_NEWLINE_TOKEN_ID]) * nrows
+                image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
                 return PromptUpdateDetails.select_token_id(
                     image_tokens + [bos_token_id],
@@ -825,9 +812,11 @@ to register them to the multi-modal registry:
   from vllm.model_executor.models.interfaces import SupportsMultiModal
 + from vllm.multimodal import MULTIMODAL_REGISTRY
 
-+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
-+                                         info=YourProcessingInfo,
-+                                         dummy_inputs=YourDummyInputsBuilder)
++ @MULTIMODAL_REGISTRY.register_processor(
++     YourMultiModalProcessor,
++     info=YourProcessingInfo,
++     dummy_inputs=YourDummyInputsBuilder,
++ )
   class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```
 
@@ -839,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in
 
 Examples:
 
-- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
-- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
+- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
+- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
 
 ### Handling prompt updates unrelated to multi-modal data
 
@@ -848,9 +837,9 @@ Examples:
 
 Examples:
 
-- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
-- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
-- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
+- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
+- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
+- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
 
 ### Custom HF processor
 
@@ -858,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
 
 Examples:
 
-- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
-- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
-- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
+- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
+- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
+- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)
diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md
index 35f35ffa4cde6986bf6b34ee0b0b61fd90830d8e..400d0f75caca5cd07c95734895029410cd73334c 100644
--- a/docs/contributing/model/registration.md
+++ b/docs/contributing/model/registration.md
@@ -8,11 +8,11 @@ This page provides detailed instructions on how to do so.
 
 ## Built-in models
 
-To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
+To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
 This gives you the ability to modify the codebase and test your model.
 
-After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
-Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
+After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
+Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
 Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
 
 !!! important
@@ -42,7 +42,7 @@ def register():
 
     ModelRegistry.register_model(
         "YourModelForCausalLM",
-        "your_code:YourModelForCausalLM"
+        "your_code:YourModelForCausalLM",
     )
 ```
 
diff --git a/docs/contributing/model/tests.md b/docs/contributing/model/tests.md
index 1206ad36771eaeefcda31fbffde2e545a2115247..3ccd90cc66f773c1d718f8d28d293ad4d9cf1cb8 100644
--- a/docs/contributing/model/tests.md
+++ b/docs/contributing/model/tests.md
@@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
 
 ### Model loading
 
-Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
+Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
 This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
 
 !!! important
@@ -26,26 +26,24 @@ Passing these tests provides more confidence that your implementation is correct
 
 ### Model correctness
 
-These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
+These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
 
 #### Generative models
 
-For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
+For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
 
 - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
 - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
 
 #### Pooling models
 
-For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
-
-[](){ #mm-processing-tests }
+For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
 
 ### Multi-modal processing
 
 #### Common tests
 
-Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
+Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
 
 - Text + multi-modal data
 - Tokens + multi-modal data
@@ -54,6 +52,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>
 
 #### Model-specific tests
 
-You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
+You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
 
-For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
+For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).
diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index 62e58e5c6ac58dc97b7746b7339e39673d09f763..fca941acd50762375035e7031d7473e270c0f922 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -15,8 +15,9 @@ Declare supported languages and capabilities:
 - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
 
 ??? code "supported_languages and supports_transcription_only"
+
     ```python
-    from typing import ClassVar, Mapping, Optional, Literal
+    from typing import ClassVar, Mapping, Literal
     import numpy as np
     import torch
     from torch import nn
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
 This is for controlling general behavior of the API when serving your model:
 
 ??? code "get_speech_to_text_config()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
 Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
 
 ??? code "get_generation_prompt()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -81,10 +84,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
             model_config: ModelConfig,
-            language: Optional[str],
+            language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
-            to_language: Optional[str],
+            to_language: str | None,
         ) -> PromptType:
             # Example with a free-form instruction prompt
             task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 
 ??? code "get_generation_prompt()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -117,10 +121,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
             model_config: ModelConfig,
-            language: Optional[str],
+            language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
-            to_language: Optional[str],
+            to_language: str | None,
         ) -> PromptType:
             if language is None:
                 raise ValueError("Language must be specified")
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
 If your model requires a language and you want a default, override this method (see Whisper):
 
 ??? code "validate_language()"
+
     ```python
     @classmethod
-    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+    def validate_language(cls, language: str | None) -> str | None:
         if language is None:
             logger.warning(
-                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
+                "Defaulting to language='en'. If you wish to transcribe "
+                "audio in a different language, pass the `language` field "
+                "in the TranscriptionRequest."
+            )
             language = "en"
         return super().validate_language(language)
     ```
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
 Provide a fast duration→token estimate to improve streaming usage statistics:
 
 ??? code "get_num_audio_tokens()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -175,7 +184,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
             audio_duration_s: float,
             stt_config: SpeechToTextConfig,
             model_config: ModelConfig,
-        ) -> Optional[int]:
+        ) -> int | None:
             # Return None if unknown; otherwise return an estimate.
             return int(audio_duration_s * stt_config.sample_rate // 320)  # example
     ```
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
 Relevant server logic:
 
 ??? code "_preprocess_speech_to_text()"
+
     ```python
     # vllm/entrypoints/openai/speech_to_text.py
     async def _preprocess_speech_to_text(...):
@@ -238,9 +248,9 @@ No extra registration is required beyond having your model class available via t
 
 ## Examples in-tree
 
-- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
-- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
-- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
+- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
+- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
+- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
 
 ## Test with the API
 
@@ -268,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
       http://localhost:8000/v1/audio/translations
     ```
 
-Or check out more examples in <gh-file:examples/online_serving>.
+Or check out more examples in [examples/online_serving](../../../examples/online_serving).
 
 !!! note
     - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index 5b83d93274f0def5bd6caba0d671a4075dd90d37..7634cc0859edf3efa6c7f98451f3fc64a69c1521 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -33,14 +33,13 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
 
 #### Offline Inference
 
-Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
+Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
 
 #### OpenAI Server
 
 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-    python -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3-70B
+    vllm serve meta-llama/Llama-3.1-8B-Instruct
 ```
 
 vllm bench command:
@@ -48,7 +47,7 @@ vllm bench command:
 ```bash
 vllm bench serve \
     --backend vllm \
-    --model meta-llama/Meta-Llama-3-70B \
+    --model meta-llama/Llama-3.1-8B-Instruct \
     --dataset-name sharegpt \
     --dataset-path sharegpt.json \
     --profile \
@@ -71,18 +70,21 @@ apt update
 apt install nsight-systems-cli
 ```
 
-### Example commands and usage
+!!! tip
+    When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
 
-When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
+The Nsight Systems profiler can be launched with `nsys profile ...`, with a few recommended flags for vLLM: `--trace-fork-before-exec=true --cuda-graph-trace=node`.
+
+### Example commands and usage
 
 #### Offline Inference
 
-For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
+For basic usage, you can just append the profiling command before any existing script you would run for offline inference.
 
 The following is an example using the `vllm bench latency` script:
 
 ```bash
-nsys profile -o report.nsys-rep \
+nsys profile  \
     --trace-fork-before-exec=true \
     --cuda-graph-trace=node \
 vllm bench latency \
@@ -96,40 +98,29 @@ vllm bench latency \
 
 #### OpenAI Server
 
-To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed.
+To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, but you will need to specify a few other arguments to enable dynamic capture similarly to the Torch Profiler:
 
 ```bash
 # server
-nsys profile -o report.nsys-rep \
+VLLM_TORCH_CUDA_PROFILE=1 \
+nsys profile \
     --trace-fork-before-exec=true \
     --cuda-graph-trace=node \
-    --delay 30 \
-    --duration 60 \
+    --capture-range=cudaProfilerApi \
+    --capture-range-end repeat \
     vllm serve meta-llama/Llama-3.1-8B-Instruct
 
 # client
 vllm bench serve \
     --backend vllm \
     --model meta-llama/Llama-3.1-8B-Instruct \
-    --num-prompts 1 \
-    --dataset-name random \
-    --random-input 1024 \
-    --random-output 512
-```
-
-In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
-
-```bash
-nsys sessions list
-```
-
-to get the session id in the form of `profile-XXXXX`, then run:
-
-```bash
-nsys stop --session=profile-XXXXX
+    --dataset-name sharegpt \
+    --dataset-path sharegpt.json \
+    --profile \
+    --num-prompts 2
 ```
 
-to manually kill the profiler and generate your `nsys-rep` report.
+With `--profile`, vLLM will capture a profile for each run of `vllm bench serve`. Once the server is killed, the profiles will all be saved.
 
 #### Analysis
 
@@ -160,14 +151,34 @@ GUI example:
 
 <img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
 
+## Continuous Profiling
+
+There is a [GitHub CI workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-profiling.yml) in the PyTorch infrastructure repository that provides continuous profiling for different models on vLLM. This automated profiling helps track performance characteristics over time and across different model configurations.
+
+### How It Works
+
+The workflow currently runs weekly profiling sessions for selected models, generating detailed performance traces that can be analyzed using different tools to identify performance regressions or optimization opportunities. But, it can be triggered manually as well, using the Github Action tool.
+
+### Adding New Models
+
+To extend the continuous profiling to additional models, you can modify the [profiling-tests.json](https://github.com/pytorch/pytorch-integration-testing/blob/main/vllm-profiling/cuda/profiling-tests.json) configuration file in the PyTorch integration testing repository. Simply add your model specifications to this file to include them in the automated profiling runs.
+
+### Viewing Profiling Results
+
+The profiling traces generated by the continuous profiling workflow are publicly available on the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). Look for the **Profiling traces** table to access and download the traces for different models and runs.
+
 ## Profiling vLLM Python Code
 
 The Python standard library includes
 [cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
 code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
-Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
+Both the `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` functions can be
 used to profile a section of code.
 
+!!! note
+    The legacy import paths `vllm.utils.cprofile` and `vllm.utils.cprofile_context` are deprecated.
+    Please use `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` instead.
+
 ### Example usage - decorator
 
 The first helper is a Python decorator that can be used to profile a function.
@@ -175,9 +186,9 @@ If a filename is specified, the profile will be saved to that file. If no filena
 specified, profile data will be printed to stdout.
 
 ```python
-import vllm.utils
+from vllm.utils.profiling import cprofile
 
-@vllm.utils.cprofile("expensive_function.prof")
+@cprofile("expensive_function.prof")
 def expensive_function():
     # some expensive code
     pass
@@ -189,13 +200,13 @@ The second helper is a context manager that can be used to profile a block of
 code. Similar to the decorator, the filename is optional.
 
 ```python
-import vllm.utils
+from vllm.utils.profiling import cprofile_context
 
 def another_function():
     # more expensive code
     pass
 
-with vllm.utils.cprofile_context("another_function.prof"):
+with cprofile_context("another_function.prof"):
     another_function()
 ```
 
@@ -208,3 +219,11 @@ One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
 pip install snakeviz
 snakeviz expensive_function.prof
 ```
+
+### Analyzing Garbage Collection Costs
+
+Leverage VLLM_GC_DEBUG environment variable to debug GC costs.
+
+- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times
+- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
+  collected objects for each gc.collect
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
index 1f19f2fecfab16d94f3651dec2a69f6cc12a5e47..1c639f3533d47938fd400bd0e24f5c81d4dc51cc 100644
--- a/docs/deployment/docker.md
+++ b/docs/deployment/docker.md
@@ -1,7 +1,5 @@
 # Using Docker
 
-[](){ #deployment-docker-pre-built-image }
-
 ## Use vLLM's Official Docker Image
 
 vLLM offers an official Docker image for deployment.
@@ -10,7 +8,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
 ```bash
 docker run --runtime nvidia --gpus all \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env "HF_TOKEN=$HF_TOKEN" \
     -p 8000:8000 \
     --ipc=host \
     vllm/vllm-openai:latest \
@@ -22,7 +20,7 @@ This image can also be used with other container engines such as [Podman](https:
 ```bash
 podman run --device nvidia.com/gpu=all \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
-  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+  --env "HF_TOKEN=$HF_TOKEN" \
   -p 8000:8000 \
   --ipc=host \
   docker.io/vllm/vllm-openai:latest \
@@ -37,17 +35,17 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
     memory to share data between processes under the hood, particularly for tensor parallel inference.
 
 !!! note
-    Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
+    Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
 
     If you need to use those dependencies (having accepted the license terms),
     create a custom Dockerfile on top of the base image with an extra layer that installs them:
 
     ```Dockerfile
-    FROM vllm/vllm-openai:v0.9.0
+    FROM vllm/vllm-openai:v0.11.0
 
     # e.g. install the `audio` optional dependencies
     # NOTE: Make sure the version of vLLM matches the base image!
-    RUN uv pip install --system vllm[audio]==0.9.0
+    RUN uv pip install --system vllm[audio]==0.11.0
     ```
 
 !!! tip
@@ -62,11 +60,9 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
     RUN uv pip install --system git+https://github.com/huggingface/transformers.git
     ```
 
-[](){ #deployment-docker-build-image-from-source }
-
 ## Building vLLM's Docker Image from Source
 
-You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
+You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
 
 ```bash
 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
@@ -128,7 +124,7 @@ To run vLLM with the custom-built Docker image:
 docker run --runtime nvidia --gpus all \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     -p 8000:8000 \
-    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+    --env "HF_TOKEN=<secret>" \
     vllm/vllm-openai <args...>
 ```
 
diff --git a/docs/deployment/frameworks/anyscale.md b/docs/deployment/frameworks/anyscale.md
index 9957c5b14134456717f21c7ee094bf033e37f1c6..965742ec072624a92a3756dc3663f3617a2686aa 100644
--- a/docs/deployment/frameworks/anyscale.md
+++ b/docs/deployment/frameworks/anyscale.md
@@ -1,11 +1,9 @@
 # Anyscale
 
-[](){ #deployment-anyscale }
-
 [Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
 
 Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
-without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
+without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).
 
 When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
 
diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md
index 7517ee771c09774cfe5bc914865fe4ed9e79359f..5790087ed5c2739fa4a7e7af19d2c7c26bb889dd 100644
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@@ -19,8 +19,7 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
 1. Start the vLLM server with the supported chat completion model, e.g.
 
     ```bash
-    python -m vllm.entrypoints.openai.api_server \
-        --model mistralai/Mistral-7B-Instruct-v0.2
+    vllm serve mistralai/Mistral-7B-Instruct-v0.2
     ```
 
 1. Call it with AutoGen:
diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md
index 1f233c3204a15bb3504409ded4577c0a49fa1ffa..960347d9525c45ad08d84906689352fe7a364b5b 100644
--- a/docs/deployment/frameworks/cerebrium.md
+++ b/docs/deployment/frameworks/cerebrium.md
@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
 
 ??? console "Command"
 
-    ```python
+    ```bash
     curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
     -H 'Content-Type: application/json' \
     -H 'Authorization: <JWT TOKEN>' \
@@ -81,7 +81,7 @@ You should get a response like:
 
 ??? console "Response"
 
-    ```python
+    ```json
     {
         "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
         "result": {
diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md
index 002935da560094b8f7ebed457bbf09872933b56b..5f7cef1a87dfbeaf9e338c04ae71ac71881b8812 100644
--- a/docs/deployment/frameworks/chatbox.md
+++ b/docs/deployment/frameworks/chatbox.md
@@ -29,8 +29,8 @@ pip install vllm
     - API Path: `/chat/completions`
     - Model: `qwen/Qwen1.5-0.5B-Chat`
 
-    ![](../../assets/deployment/chatbox-settings.png)
+    ![Chatbox settings screen](../../assets/deployment/chatbox-settings.png)
 
 1. Go to `Just chat`, and start to chat:
 
-    ![](../../assets/deployment/chatbox-chat.png)
+    ![Chatbot chat screen](../../assets/deployment/chatbox-chat.png)
diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md
index 820ef0cbed9fafc2fbe955fc5fdffc5922b22631..673cbf4b6a24af48ef319f399ee74b00f5f98c4c 100644
--- a/docs/deployment/frameworks/dify.md
+++ b/docs/deployment/frameworks/dify.md
@@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos
     - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
     - **Completion Mode**: `Completion`
 
-    ![](../../assets/deployment/dify-settings.png)
+    ![Dify settings screen](../../assets/deployment/dify-settings.png)
 
 1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
 
-    ![](../../assets/deployment/dify-create-chatbot.png)
+    ![Dify create chatbot screen](../../assets/deployment/dify-create-chatbot.png)
 
 1. Click the chatbot you just created to open the chat interface and start interacting with the model:
 
-    ![](../../assets/deployment/dify-chat.png)
+    ![Dify chat screen](../../assets/deployment/dify-chat.png)
diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md
index fe4d87f78f2aaf1b8c3e8f2768e83bb3fabb3dbc..9d2c7f5bb565fcb7a9a8feb96165a79205901a1a 100644
--- a/docs/deployment/frameworks/dstack.md
+++ b/docs/deployment/frameworks/dstack.md
@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
 
     client = OpenAI(
         base_url="https://gateway.<gateway domain>",
-        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
+        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
     )
 
     completion = client.chat.completions.create(
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
                 "role": "user",
                 "content": "Compose a poem that explains the concept of recursion in programming.",
             }
-        ]
+        ],
     )
 
     print(completion.choices[0].message.content)
diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md
index 836305cf15c42e5918e89c3486210231b8e01071..b53b829d6d3c0d6b63c911aaf05c1e785ae1b9d7 100644
--- a/docs/deployment/frameworks/haystack.md
+++ b/docs/deployment/frameworks/haystack.md
@@ -34,7 +34,7 @@ pip install vllm haystack-ai
         api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
         model="mistralai/Mistral-7B-Instruct-v0.1",
         api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
-        generation_kwargs = {"max_tokens": 512}
+        generation_kwargs={"max_tokens": 512},
     )
 
     response = generator.run(
diff --git a/docs/deployment/frameworks/helm.md b/docs/deployment/frameworks/helm.md
index e5d44945ba725616e363c400ba61f695963f8149..1d9e3632593ad456b03665d2881224bca6274577 100644
--- a/docs/deployment/frameworks/helm.md
+++ b/docs/deployment/frameworks/helm.md
@@ -13,7 +13,7 @@ Before you begin, ensure that you have the following:
 - A running Kubernetes cluster
 - NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
 - Available GPU resources in your cluster
-- An S3 with the model which will be deployed
+- (Optional) An S3 bucket or other storage with the model weights, if using automatic model download
 
 ## Installing the chart
 
@@ -61,10 +61,16 @@ The following table describes configurable parameters of the chart in `values.ya
 | deploymentStrategy | object | {} | Deployment strategy configuration |
 | externalConfigs | list | [] | External configuration |
 | extraContainers | list | [] | Additional containers configuration |
-| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
-| extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
-| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
-| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
+| extraInit | object | {"modelDownload":{"enabled":true},"initContainers":[],"pvcStorage":"1Gi"} | Additional configuration for init containers |
+| extraInit.modelDownload | object | {"enabled":true} | Model download functionality configuration |
+| extraInit.modelDownload.enabled | bool | true | Enable automatic model download job and wait container |
+| extraInit.modelDownload.image | object | {"repository":"amazon/aws-cli","tag":"2.6.4","pullPolicy":"IfNotPresent"} | Image for model download operations |
+| extraInit.modelDownload.waitContainer | object | {} | Wait container configuration (command, args, env) |
+| extraInit.modelDownload.downloadJob | object | {} | Download job configuration (command, args, env) |
+| extraInit.initContainers | list | [] | Custom init containers (appended after model download if enabled) |
+| extraInit.pvcStorage | string | "1Gi" | Storage size for the PVC |
+| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | (Optional) Path of the model on S3 |
+| extraInit.awsEc2MetadataDisabled | bool | true | (Optional) Disable AWS EC2 metadata service |
 | extraPorts | list | [] | Additional ports configuration |
 | gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
 | image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
@@ -98,3 +104,36 @@ The following table describes configurable parameters of the chart in `values.ya
 | serviceName | string | "" | Service name |
 | servicePort | int | 80 | Service port |
 | labels.environment | string | test | Environment name |
+
+## Configuration Examples
+
+### Using S3 Model Download (Default)
+
+```yaml
+extraInit:
+  modelDownload:
+    enabled: true
+  pvcStorage: "10Gi"
+  s3modelpath: "models/llama-7b"
+```
+
+### Using Custom Init Containers Only
+
+For use cases like llm-d where you need custom sidecars without model download:
+
+```yaml
+extraInit:
+  modelDownload:
+    enabled: false
+  initContainers:
+    - name: llm-d-routing-proxy
+      image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      imagePullPolicy: IfNotPresent
+      ports:
+        - containerPort: 8080
+          name: proxy
+      securityContext:
+        runAsUser: 1000
+      restartPolicy: Always
+  pvcStorage: "10Gi"
+```
diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md
new file mode 100644
index 0000000000000000000000000000000000000000..05df0dacd8f11600e3f98f418f86e875f7d9b0f3
--- /dev/null
+++ b/docs/deployment/frameworks/hf_inference_endpoints.md
@@ -0,0 +1,170 @@
+# Hugging Face Inference Endpoints
+
+## Overview
+
+Models compatible with vLLM can be deployed on Hugging Face Inference Endpoints, either starting from the [Hugging Face Hub](https://huggingface.co) or directly from the [Inference Endpoints](https://endpoints.huggingface.co/) interface. This allows you to serve models in a fully managed environment with GPU acceleration, auto-scaling, and monitoring, without managing the infrastructure manually.
+
+For advanced details on vLLM integration and deployment options, see [Advanced Deployment Details](#advanced-deployment-details).
+
+## Deployment Methods
+
+- [**Method 1: Deploy from the Catalog.**](#method-1-deploy-from-the-catalog) One-click deploy models from the Hugging Face Hub with ready-made optimized configurations.
+- [**Method 2: Guided Deployment (Transformers Models).**](#method-2-guided-deployment-transformers-models) Instantly deploy models tagged with `transformers` from the Hub UI using the **Deploy** button.
+- [**Method 3: Manual Deployment (Advanced Models).**](#method-3-manual-deployment-advanced-models) For models that either use custom code with the `transformers` tag, or don’t run with standard `transformers` but are supported by vLLM. This method requires manual configuration.
+
+### Method 1: Deploy from the Catalog
+
+This is the easiest way to get started with vLLM on Hugging Face Inference Endpoints. You can browse a catalog of models with verified and optimized deployment configuration at [Inference Endpoints](https://endpoints.huggingface.co/catalog) to maximize performance.
+
+1. Go to [Endpoints Catalog](https://endpoints.huggingface.co/catalog) and in the **Inference Server** options, select `vLLM`.This will display the current list of models with optimized preconfigured options.
+
+    ![Endpoints Catalog](../../assets/deployment/hf-inference-endpoints-catalog.png)
+
+1. Select the desired model and click **Create Endpoint**.
+
+    ![Create Endpoint](../../assets/deployment/hf-inference-endpoints-create-endpoint.png)
+
+1. Once the deployment is ready, you can use the endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console, remembering to append `/v1` as required.
+
+    ```python
+    # pip install openai
+    from openai import OpenAI
+    import os
+
+    client = OpenAI(
+        base_url=DEPLOYMENT_URL,
+        api_key=os.environ["HF_TOKEN"],  # https://huggingface.co/settings/tokens
+    )
+
+    chat_completion = client.chat.completions.create(
+        model="HuggingFaceTB/SmolLM3-3B",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Give me a brief explanation of gravity in simple terms.",
+                    }
+                ],
+            }
+        ],
+        stream=True,
+    )
+
+    for message in chat_completion:
+        print(message.choices[0].delta.content, end="")
+    ```
+
+!!! note
+    The catalog provides models optimized for vLLM, including GPU settings and inference engine configurations. You can monitor the endpoint and update the **container or its configuration** from the Inference Endpoints UI.
+
+### Method 2: Guided Deployment (Transformers Models)
+
+This method applies to models with the [`transformers` library tag](https://huggingface.co/models?library=transformers) in their metadata. It allows you to deploy a model directly from the Hub UI without manual configuration.
+
+1. Navigate to a model on [Hugging Face Hub](https://huggingface.co/models).  
+   For this example we will use the [`ibm-granite/granite-docling-258M`](https://huggingface.co/ibm-granite/granite-docling-258M) model. You can verify that the model is compatible by checking the front matter in the [README](https://huggingface.co/ibm-granite/granite-docling-258M/blob/main/README.md), where the library is tagged as `library: transformers`.
+
+2. Locate the **Deploy** button. The button appears for models tagged with `transformers` at the top right of the [model card](https://huggingface.co/ibm-granite/granite-docling-258M).
+
+    ![Locate deploy button](../../assets/deployment/hf-inference-endpoints-locate-deploy-button.png)
+
+3. Click to **Deploy** button > **HF Inference Endpoints**. You will be taken to the Inference Endpoints interface to configure the deployment.
+
+    ![Click deploy button](../../assets/deployment/hf-inference-endpoints-click-deploy-button.png)
+
+4. Select the Hardware (we choose AWS>GPU>T4 for the example) and Container Configuration. Choose `vLLM` as the container type and finalize the deployment pressing **Create Endpoint**.
+
+    ![Select Hardware](../../assets/deployment/hf-inference-endpoints-select-hardware.png)
+
+5. Use the deployed endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console (remember to add `/v1` needed). You can then use your endpoint programmatically or via the SDK.
+
+    ```python
+    # pip install openai
+    from openai import OpenAI
+    import os
+
+    client = OpenAI(
+        base_url=DEPLOYMENT_URL,
+        api_key=os.environ["HF_TOKEN"],  # https://huggingface.co/settings/tokens
+    )
+
+    chat_completion = client.chat.completions.create(
+        model="ibm-granite/granite-docling-258M",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
+                        },
+                    },
+                    {
+                        "type": "text",
+                        "text": "Convert this page to docling.",
+                    },
+                ]
+            }
+        ],
+        stream=True,
+    )
+
+    for message in chat_completion:
+        print(message.choices[0].delta.content, end="")
+    ```
+
+!!! note
+    This method uses best-guess defaults. You may need to adjust the configuration to fit your specific requirements.
+
+### Method 3: Manual Deployment (Advanced Models)
+
+Some models require manual deployment because they:
+
+- Use custom code with the `transformers` tag
+- Don't run with standard `transformers` but are supported by `vLLM`
+
+These models cannot be deployed using the **Deploy** button on the model card.
+
+In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.ocr`](https://huggingface.co/rednote-hilab/dots.ocr) model, an OCR model integrated with vLLM (see vLLM [PR](https://github.com/vllm-project/vllm/pull/24645)).
+
+1. Start a new deployment. Go to [Inference Endpoints](https://endpoints.huggingface.co/) and click `New`.
+
+    ![New Endpoint](../../assets/deployment/hf-inference-endpoints-new-endpoint.png)
+
+2. Search the model in the Hub. In the dialog, switch to **Hub** and search for the desired model.
+
+    ![Select model](../../assets/deployment/hf-inference-endpoints-select-model.png)
+
+3. Choosing infrastructure. On the configuration page, select the cloud provider and hardware from the available options.  
+   For this demo, we choose AWS and L4 GPU. Adjust according to your hardware needs.
+
+    ![Choose Infra](../../assets/deployment/hf-inference-endpoints-choose-infra.png)
+
+4. Configure the container. Scroll to the **Container Configuration** and select `vLLM` as the container type.
+
+    ![Configure Container](../../assets/deployment/hf-inference-endpoints-configure-container.png)
+
+5. Create the endpoint. Click **Create Endpoint** to deploy the model.
+
+    Once the endpoint is ready, you can use it with the OpenAI Completion API, cURL, or other SDKs. Remember to append `/v1` to the deployment URL if needed.
+
+!!! note
+    You can adjust the **container settings** (Container URI, Container Arguments) from the Inference Endpoints UI and press **Update Endpoint**. This redeploys the endpoint with the updated container configuration. Changes to the model itself require creating a new endpoint or redeploying with a different model. For example, for this demo, you may need to update the Container URI to the nightly image (`vllm/vllm-openai:nightly`) and add the `--trust-remote-code` flag in the container arguments.
+
+## Advanced Deployment Details
+
+With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
+
+Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
+
+The platform integrates seamlessly with the Hugging Face Hub, allowing you to deploy any vLLM- or `transformers`-compatible model, track usage, and update the inference engine directly. The vLLM engine comes preconfigured, enabling optimized inference and easy switching between models or engines without modifying your code. This setup simplifies production deployment: endpoints are ready in minutes, include monitoring and logging, and let you focus on serving models rather than maintaining infrastructure.
+
+## Next Steps
+
+- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
+- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
+- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
+- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md
index 0d6c3729911adc56527da6b868086cabd4b2f2f6..9ea7c0373d2a1c6e0f7159a378e45c4c44d24ac9 100644
--- a/docs/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@@ -36,15 +36,16 @@ pip install vllm litellm
     ```python
     import litellm 
 
-    messages = [{ "content": "Hello, how are you?","role": "user"}]
+    messages = [{"content": "Hello, how are you?", "role": "user"}]
 
     # hosted_vllm is prefix key word and necessary
     response = litellm.completion(
-                model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
-                messages=messages,
-                api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
-                temperature=0.2,
-                max_tokens=80)
+        model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
+        messages=messages,
+        api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
+        temperature=0.2,
+        max_tokens=80,
+    )
 
     print(response)
     ```
diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md
index 3b9fa3ea43d642e67ea1cd8c8d8ccbd790a36473..14710a8dc3334f7d24d48b57becf73246761f6a8 100644
--- a/docs/deployment/frameworks/lws.md
+++ b/docs/deployment/frameworks/lws.md
@@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
               - name: vllm-leader
                 image: docker.io/vllm/vllm-openai:latest
                 env:
-                  - name: HUGGING_FACE_HUB_TOKEN
+                  - name: HF_TOKEN
                     value: <your-hf-token>
                 command:
                   - sh
@@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
                     ephemeral-storage: 800Gi
                     cpu: 125
                 env:
-                  - name: HUGGING_FACE_HUB_TOKEN
+                  - name: HF_TOKEN
                     value: <your-hf-token>
                 volumeMounts:
                   - mountPath: /dev/shm
diff --git a/docs/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md
index eaa51bb613287334e62717baa2e65dd0115d1b0a..505c129613dea5a982c141e3eba5b45928a32fc0 100644
--- a/docs/deployment/frameworks/open-webui.md
+++ b/docs/deployment/frameworks/open-webui.md
@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
         For example:
 
         ```console
-        python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
+        vllm serve <model> --host 0.0.0.0 --port 8000
         ```
 
 3. Start the Open WebUI Docker container:
diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md
index d86ab1600f1269646cf968cd5a3f12c8b7ce9b8a..8a5d18807d06d46bf0a5d36f07a2186e69a31ee2 100644
--- a/docs/deployment/frameworks/retrieval_augmented_generation.md
+++ b/docs/deployment/frameworks/retrieval_augmented_generation.md
@@ -36,11 +36,11 @@ pip install -U vllm \
     vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
     ```
 
-1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
+1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
 
 1. Run the script
 
-    ```python
+    ```bash
     python retrieval_augmented_generation_with_langchain.py
     ```
 
@@ -74,10 +74,10 @@ pip install vllm \
     vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
     ```
 
-1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
+1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
 
 1. Run the script:
 
-    ```python
+    ```bash
     python retrieval_augmented_generation_with_llamaindex.py
     ```
diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md
index 06e2fed38f0561d1388a511f0b63ec13073017a1..f4a984a6433e29b2543f36e593be549b766fa38d 100644
--- a/docs/deployment/frameworks/skypilot.md
+++ b/docs/deployment/frameworks/skypilot.md
@@ -32,6 +32,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
       ports: 8081  # Expose to internet traffic.
 
     envs:
+      PYTHONUNBUFFERED: 1
       MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
       HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
 
@@ -47,9 +48,8 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
     run: |
       conda activate vllm
       echo 'Starting vllm api server...'
-      python -u -m vllm.entrypoints.openai.api_server \
+      vllm serve $MODEL_NAME \
         --port 8081 \
-        --model $MODEL_NAME \
         --trust-remote-code \
         --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
         2>&1 | tee api_server.log &
@@ -131,6 +131,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
       ports: 8081  # Expose to internet traffic.
 
     envs:
+      PYTHONUNBUFFERED: 1
       MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
       HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
 
@@ -146,9 +147,8 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
     run: |
       conda activate vllm
       echo 'Starting vllm api server...'
-      python -u -m vllm.entrypoints.openai.api_server \
+      vllm serve $MODEL_NAME \
         --port 8081 \
-        --model $MODEL_NAME \
         --trust-remote-code \
         --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
         2>&1 | tee api_server.log
@@ -243,6 +243,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
       ports: 8081  # Expose to internet traffic.
 
     envs:
+      PYTHONUNBUFFERED: 1
       MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
       HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
 
@@ -258,9 +259,8 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
     run: |
       conda activate vllm
       echo 'Starting vllm api server...'
-      python -u -m vllm.entrypoints.openai.api_server \
+      vllm serve $MODEL_NAME \
         --port 8081 \
-        --model $MODEL_NAME \
         --trust-remote-code \
         --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
         2>&1 | tee api_server.log
diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md
index c119878f137a4945882e7570dde6e94a25a554d3..1b214e1a32aab57ea10ceade20e3eab0d81b84e7 100644
--- a/docs/deployment/frameworks/streamlit.md
+++ b/docs/deployment/frameworks/streamlit.md
@@ -20,7 +20,7 @@ pip install vllm streamlit openai
     vllm serve Qwen/Qwen1.5-0.5B-Chat
     ```
 
-1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
+1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
 
 1. Start the streamlit web UI and start to chat:
 
diff --git a/docs/deployment/integrations/kaito.md b/docs/deployment/integrations/kaito.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff050d3eeaf475582ab91873a33756b685178ba5
--- /dev/null
+++ b/docs/deployment/integrations/kaito.md
@@ -0,0 +1,5 @@
+# KAITO
+
+[KAITO](https://kaito-project.github.io/kaito/docs/) is a Kubernetes operator that supports deploying and serving LLMs with vLLM. It offers managing large models via container images with built-in OpenAI-compatible inference, auto-provisioning GPU nodes and curated model presets.
+
+Please refer to [quick start](https://kaito-project.github.io/kaito/docs/quick-start) for more details.
diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md
index fae392589c06024b784cb2685053c079bf9c7034..2f1894ccf0022a835fbc02b80fda79b97a73df17 100644
--- a/docs/deployment/integrations/production-stack.md
+++ b/docs/deployment/integrations/production-stack.md
@@ -55,7 +55,7 @@ sudo kubectl port-forward svc/vllm-router-service 30080:80
 And then you can send out a query to the OpenAI-compatible API to check the available models:
 
 ```bash
-curl -o- http://localhost:30080/models
+curl -o- http://localhost:30080/v1/models
 ```
 
 ??? console "Output"
@@ -78,7 +78,7 @@ curl -o- http://localhost:30080/models
 To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint:
 
 ```bash
-curl -X POST http://localhost:30080/completions \
+curl -X POST http://localhost:30080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "facebook/opt-125m",
diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md
index ca23e0b9fd8af1fa209540aef059985e9ef9d5f2..abffb7bc5f948a608ab0bb8f74f61bce97cc3f9e 100644
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 
 - [Helm](frameworks/helm.md)
 - [InftyAI/llmaz](integrations/llmaz.md)
+- [KAITO](integrations/kaito.md)
 - [KServe](integrations/kserve.md)
 - [KubeRay](integrations/kuberay.md)
 - [kubernetes-sigs/lws](frameworks/lws.md)
@@ -48,11 +49,14 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa
     metadata:
       name: hf-token-secret
     type: Opaque
-    data:
-      token: $(HF_TOKEN)
+    stringData:
+      token: "REPLACE_WITH_TOKEN"
     EOF
     ```
 
+Here, the `token` field stores your **Hugging Face access token**. For details on how to generate a token,
+see the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens).
+
 Next, start the vLLM server as a Kubernetes Deployment and Service:
 
 ??? console "Config"
@@ -81,7 +85,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
               "vllm serve meta-llama/Llama-3.2-1B-Instruct"
             ]
             env:
-            - name: HUGGING_FACE_HUB_TOKEN
+            - name: HF_TOKEN
               valueFrom:
                 secretKeyRef:
                   name: hf-token-secret
@@ -208,7 +212,7 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
                 "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
               ]
               env:
-              - name: HUGGING_FACE_HUB_TOKEN
+              - name: HF_TOKEN
                 valueFrom:
                   secretKeyRef:
                     name: hf-token-secret
@@ -297,7 +301,7 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
                 "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
               ]
               env:
-              - name: HUGGING_FACE_HUB_TOKEN
+              - name: HF_TOKEN
                 valueFrom:
                   secretKeyRef:
                     name: hf-token-secret
diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md
index b3178e77f845cb5f3591429e396d785dde9fadc5..034068cddac3966f70e00e8946c35d579ba7d1d9 100644
--- a/docs/deployment/nginx.md
+++ b/docs/deployment/nginx.md
@@ -2,8 +2,6 @@
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
-[](){ #nginxloadbalancer-nginx-build }
-
 ## Build Nginx Container
 
 This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
@@ -27,8 +25,6 @@ Build the container:
 docker build . -f Dockerfile.nginx --tag nginx-lb
 ```
 
-[](){ #nginxloadbalancer-nginx-conf }
-
 ## Create Simple Nginx Config file
 
 Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
@@ -53,8 +49,6 @@ Create a file named `nginx_conf/nginx.conf`. Note that you can add as many serve
     }
     ```
 
-[](){ #nginxloadbalancer-nginx-vllm-container }
-
 ## Build vLLM Container
 
 ```bash
@@ -73,16 +67,12 @@ docker build \
     --build-arg https_proxy=$https_proxy
 ```
 
-[](){ #nginxloadbalancer-nginx-docker-network }
-
 ## Create Docker Network
 
 ```bash
 docker network create vllm_nginx
 ```
 
-[](){ #nginxloadbalancer-nginx-launch-container }
-
 ## Launch vLLM Containers
 
 Notes:
@@ -122,8 +112,6 @@ Notes:
 !!! note
     If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
 
-[](){ #nginxloadbalancer-nginx-launch-nginx }
-
 ## Launch Nginx
 
 ```bash
@@ -135,8 +123,6 @@ docker run \
     --name nginx-lb nginx-lb:latest
 ```
 
-[](){ #nginxloadbalancer-nginx-verify-nginx }
-
 ## Verify That vLLM Servers Are Ready
 
 ```bash
diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md
index 6b708677602593a02e8decb164eedf66a543dc14..b67b084a851a82bdce71eb101baad33e2428cd33 100644
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -47,9 +47,9 @@ Here is a sample of `LLM` class usage:
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
     ```
 
-More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
+More API details can be found in the [Offline Inference](../api/README.md#offline-inference) section of the API docs.
 
-The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
+The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
 
 ### OpenAI-Compatible API Server
 
@@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.
 vllm serve <model>
 ```
 
-The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>.
+The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
 
 Sometimes you may see the API server entrypoint used directly instead of via the
 `vllm` CLI command. For example:
@@ -69,7 +69,12 @@ Sometimes you may see the API server entrypoint used directly instead of via the
 python -m vllm.entrypoints.openai.api_server --model <model>
 ```
 
-That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
+!!! warning
+
+    `python -m vllm.entrypoints.openai.api_server` is deprecated
+    and may become unsupported in a future release.
+
+That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).
 
 More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
 
@@ -96,7 +101,7 @@ processing.
 - **Output Processing**: Processes the outputs generated by the model, decoding the
   token IDs from a language model into human-readable text.
 
-The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
+The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).
 
 ### AsyncLLMEngine
 
@@ -106,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
 can handle multiple concurrent requests and stream outputs to clients.
 
 The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
-API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
+API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).
 
-The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
+The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).
 
 ## Worker
 
diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md
new file mode 100644
index 0000000000000000000000000000000000000000..66bf3b27d1f52d4b4dea2f8f7bdfc01469a40637
--- /dev/null
+++ b/docs/design/cuda_graphs.md
@@ -0,0 +1,234 @@
+# CUDA Graphs
+
+This write-up introduces the new CUDA Graphs modes in vLLM v1 beyond previous [torch.compile integration](torch_compile.md). To summarize, we:
+
+1. Added flexible `cudagraph_mode` configuration
+2. Made full CUDA Graphs support orthogonal to compilation
+3. Introduced a CUDA Graphs dispatcher as a central controller that picks the desired runtime mode and CUDA Graphs per batch automatically
+
+In this document we will discuss the:
+
+* [Motivation](#motivation)
+* [CUDA Graphs modes](#cudagraphmodes)
+* [Detailed design](#detailed-design)
+* [Example usage of the different CUDA Graphs modes](#usage-guide)
+
+!!! note
+    In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len =1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
+
+!!! note
+    The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.
+
+## Motivation
+
+Initial piecewise compilation was built to allow piecewise cudagraph capture, excluding cudagraph-unsupported operations (mainly attention). This allowed some speedup from cudagraphs while maintaining compatibility with all attention backends. We later added support for "full cudagraphs" by not compiling piecewise, so that we could further reduce the latency in cases where attention supported cudagraphs. However, this tight coupling between compilation and cudagraph capture led to an all-or-nothing experience with little flexibility. Many attention backends also weren’t ready for unified "full" CUDA Graphs capture (e.g., only FlashAttention 3 supports it currently) or only support CUDA Graphs for pure decode batches (e.g., Flashinfer, FlashMLA, and Mamba, etc.). That led to confusing performance/compatibility tradeoffs, inconsistent CUDA Graphs support, and increasingly complex code structure.
+
+This led us to seek a more fine-grained CUDA Graphs solution with the following features:
+
+* Explicitly aware of CUDA Graphs for prefill/mixed or (uniform-)decode batch and capture them separately.
+* Separate CUDAGraph capture logic from compilation (as much as feasible) for feature orthogonality, which suggest:
+    * Capturing piecewise and full cudagraphs using the same compiled graph, and
+    * Full cudagraph capture without compilation.
+* Dispatch between full and piecewise cudagraph at runtime depending on batch composition.
+* Centralized control of CUDAGraph behavior for reduced code complexity and allowed more extendibility.
+
+These features allow the most flexibility for cudagraph capture and compilation for all kinds of startup/performance tradeoffs and feature support.
+
+## `CudagraphModes`
+
+[CUDAGraphMode][vllm.config.compilation.CUDAGraphMode] is the single knob you tune in `CompilationConfig.cudagraph_mode`:
+
+* `NONE` — turn CUDA Graphs off. Good for debugging.
+* `PIECEWISE` —  a single-mode strategy (and past default). It is the most flexible: attention or other CUDA Graphs-incompatible operations stay eager, everything else goes into CUDA Graphs. Requires piecewise compilation.
+* `FULL` — a single-mode strategy, which only captures full CUDA Graphs for non-uniform batches, then uniform-decode batches reuse the CUDA Graph of non-uniform batch of the same batch_size, since they are compatible; can be good for small models or workloads with small prompts.
+* `FULL_DECODE_ONLY` — full CUDA Graph for uniform decode, no cudagraph for prefill/mixed etc; suitable for decode instances in a P/D setup where prefill is not as important, this way we can save the memory needed for `PIECEWISE` CUDA Graphs.
+* `FULL_AND_PIECEWISE` — (default mode) full CUDA Graph for uniform decode, piecewise CUDA Graphs for others; generally the most performant setting, especially for low latency with small models or MoEs, but also requires the most memory and takes the longest to capture.
+
+Defaults: If you’re on v1 with piecewise compilation, we default to `FULL_AND_PIECEWISE` for better performance, (for pooling models, it's still `PIECEWISE`). Otherwise, e.g. if piecewise compilation unavailable, we default to `NONE`.
+
+While `NONE` , `PIECEWISE`, and `FULL` are single-mode configurations and simply equivalent to past implementations of eager execution, piecewise CUDA Graphs, and full CUDA Graphs respectively, `FULL_DECODE_ONLY` and `FULL_AND_PIECEWISE` are newly appended dual-mode configurations, which require dispatching to switch between concrete runtime modes according to runtime batches dynamically.
+
+!!! note
+    Here, the single-modes `NONE`, `PIECEWISE`, and `FULL` are treated as the runtime modes for CUDA Graphs dispatching. If using a dual-mode, the dispatcher will always dispatch to one of its member modes (plus a potantial `NONE` if no suitable CUDA Graph available), depending on the batch composition.
+
+While cascade attention is not cudagraph compatible, it is now compatible with all possible cudagraph mode configurations. If a batch uses cascade attention, it always gets dispatched to `PIECEWISE` mode if available (otherwise `NONE`).
+
+!!! note
+    Not all CUDA Graph modes are compatible with every attention backend. We automatically "downgrade" modes to the closest supported mode. For example, if a backend only supports CUDA Graphs for pure decode/uniform batches, we convert `FULL` to `FULL_AND_PIECEWISE` if piecewise compilation is enabled, and `FULL_DECODE_ONLY` otherwise.
+
+## Detailed Design
+
+### Overview
+
+The new CUDA Graphs logic is built on top of piecewise compilation and supports dual CUDA Graphs runtime mode switching. The system contains the following core components:
+
+* [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper]: wrapper that handles CUDAGraph capture & replay on the wrapped callable
+* [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher]: the central controller that contains the single source of truth about CUDA Graphs and handles dispatching between them.
+* [CUDAGraphMode][vllm.config.compilation.CUDAGraphMode]: enum describing the supported and runtime modes (introduced above).
+* [BatchDescriptor][vllm.forward_context.BatchDescriptor], serving as a unique representation of the runtime batch used for dispatching.
+
+See the following figures for a quick comparison between the previous and current design patterns of CUDA Graphs with inductor compilation. We can see that previously the CUDA Graphs logic and compilation logic were tightly coupled into the vllm `PiecewiseBackend`, and CUDA Graphs was implicitly dispatched by `batch_size` idly. Now the CUDA Graphs logic is separated into the `CUDAGraphWrapper` class, responsible for both full and piecewise CUDA Graphs abilities, and dispatching is **explicitly** done via **runtime mode** plus the `BatchDescriptor` as the **dispatch key** via `CudagraphDispatcher`.
+
+**Before:**
+
+![previous_design](../assets/design/cuda_graphs/previous_design.png)
+
+**After:**
+
+![new_design](../assets/design/cuda_graphs/current_design.png)
+
+### `BatchDescriptor`
+
+[BatchDescriptor][vllm.forward_context.BatchDescriptor] is a component within `ForwardContext`, alongside the CUDA Graphs runtime modes, serving as the core structure for dispatching keys at runtime. The prototype is:
+
+```python
+class BatchDescriptor(NamedTuple):
+    num_tokens: int
+    uniform_decode: bool = False
+```
+
+where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`.
+
+The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
+
+!!! note
+    The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
+
+### `CudagraphDispatcher`
+
+The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
+
+The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
+
+The dispatch code looks like:
+
+```python
+batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
+runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
+# execution
+with set_forward_context(
+    ..., 
+    cudagraph_runtime_mode=runtime_mode, 
+    batch_descriptor=batch_descriptor,
+):
+     output = self.model(...)
+```
+
+Inside the `dispatch()` method, the dispatcher will search the proper CUDA Graphs runtime mode and existing dispatching keys for a return. We basically search the existing keys following the priority: `FULL`>`PIECEWISE`>`None`. If the dispatching key does not exist, default to return `NONE` mode for eager execution. The implementations can be found [here](https://github.com/vllm-project/vllm/blob/main/vllm/v1/cudagraph_dispatcher.py#L91).
+
+Here is a simplified illustration of the workflow at runtime in the model executor:
+![executor_runtime](../assets/design/cuda_graphs/executor_runtime.png)
+
+### `CUDAGraphWrapper`
+
+A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wraps a runnable and simply mimics the runnable with appended CUDA Graphs abilities. Each wrapper instance is bound to a specific `runtime_mode`, which is restricted to `PIECEWISE` and `FULL` mode, and takes responsibility for capturing/replaying and passing through (directly calling) the runnable.  At runtime, each wrapper would:
+
+1. inspect the runtime_mode and batch_descriptor(dispatching key) from the global forward context.
+2. If runtime_mode is `NONE` or runtime_mode does not match the mode of the wrapper, just call the runnable directly.
+3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create
+a new entry and cache it) or replay (if key exists in the cache).
+
+The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
+
+#### Nested Wrapper design
+
+The core mechanism of making a full CUDA Graphs and piecewise CUDA Graphs coexist and compatible is the nested CUDA Graphs wrapper design, building on top of piecewise compilation with only a single piecewise FX graph.  We wrap a FULL mode wrapper outside the entire model for the full CUDA Graphs functionality; meanwhile, each piecewise backend is wrapped via a `PIECEWISE` mode wrapper inside the compilation.
+
+The flow chart below should clearly describe how it works.
+![wrapper_flow](../assets/design/cuda_graphs/wrapper_flow.png)
+
+Therefore, for a `FULL` runtime mode, it is safe to capture/replay a full CUDA Graph since the piecewise wrapper is not activated. The situation is similar for `PIECEWISE` mode, as there are no conflicts between the `FULL` mode wrapper and `PIECEWISE` mode wrappers.  For the `NONE` runtime mode, both `FULL` and `PIECEWISE` wrappers would not be activated, so we simply fall through to eager execution.
+
+### Full CUDA Graph capturing & warm-up
+
+The CUDA Graphs capturing happens when the runner first calls the model forward (using `_dummy_run`) with a non-`NONE` runtime mode. For full CUDA Graph capture, we explicitly capture different cases (i.e., prefill/mixed batch or uniform_decode batch) by properly setting attention metadata to make sure the underlying attention backends launch the desired kernel routines. To distinguish prefill/mixed batch or uniform_decode batch, the most important property is the `max_query_len` in attn_metadata (true for most attention backends). We set it to the desired `uniform_query_len` for uniform_decode otherwise we make it just the `num_tokens` for a non-uniform_decode batch.
+
+The CUDA Graphs wrapper no longer manages the warm-up logic. The warm-up process is now controlled directly by the GPU model runner, where the `NONE` runtime mode is assigned to play an eager execution for warm-up. When warming up for a full CUDA Graph, it is also important to explicitly run attention during the warmup `dummy_run` call.
+
+## CUDA Graphs Compatibility of Attention Backends
+
+To signal the CUDA Graphs compatibility of the attention backends, we introduce a new enum type [AttentionCGSupport][vllm.v1.attention.backends.utils.AttentionCGSupport], which is an enum type that tracks the capability of the attention backend to support CUDA Graphs. The value is sorted in the order of the capability, i.e., `ALWAYS`> `UNIFORM_BATCH`> `UNIFORM_SINGLE_TOKEN_DECODE`> `NEVER`.
+
+```python
+class AttentionCGSupport(enum.Enum):
+    """ Constants for the CUDA Graphs support of the attention backend
+    Here we do not consider the cascade attention, as currently
+    it is never CUDA Graphs supported."""
+
+    ALWAYS = 3
+    """CUDA Graphs always supported; supports mixed-prefill-decode"""
+    UNIFORM_BATCH = 2
+    """CUDA Graphs supported for batches the only contain query lengths that are
+    the same, this can be used for spec-decode 
+        i.e. "decodes" are 1 + num_speculative_tokens"""
+    UNIFORM_SINGLE_TOKEN_DECODE = 1
+    """CUDA Graphs supported for batches the only contain query_len==1 decodes"""
+    NEVER = 0
+    """NO CUDA Graphs support"""
+```
+
+Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code for [this][vllm.v1.worker.gpu_model_runner.GPUModelRunner._check_and_update_cudagraph_mode].
+
+The following table lists backends that support full CUDA Graphs at the time of writing.
+
+| Attention Backend | cudagraph_support | Comments |
+|:---|:---|:---|
+| FlashAttention v2 | `UNIFORM_BATCH` | Actually `ALWAYS` but workaround to fallback to `FULL_AND_PIECEWISE` for performance reason |
+| FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
+| Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
+| AITER FlashAttention | `UNIFORM_BATCH`| |
+| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
+| FlashMLA | `UNIFORM_BATCH` | |
+| FlashInferMLA | `UNIFORM_BATCH` | |
+| AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
+| CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
+| Mamba attention| `UNIFORM_SINGLE_TOKEN_DECODE` | |
+
+Unlisted backends are all declared as `NEVER`.
+
+## Usage guide
+
+Now the CLI is directly using the uppercase string of cudagraph_mode for compilation_config: `--compilation-config '{"cudagraph_mode": "..."}'`, where `...` should be one of `NONE`, `PIECEWISE`, `FULL`, `FULL_DECODE_ONLY`, and `FULL_AND_PIECEWISE`. Note that all `PIECEWISE` related modes require piecewise compilation, and all `FULL` related modes need CUDA Graphs support of attention backends. For example:
+
+```bash
+vllm serve --model meta-llama/Llama-3.1-8B-Instruct --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}'
+```
+
+### Python examples
+
+```python
+import os
+os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
+
+import vllm
+from vllm.config import CUDAGraphMode
+
+compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
+model = vllm.LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    dtype="auto",
+    compilation_config=compilation_config,
+)
+sampling_params = vllm.SamplingParams(
+    temperature=0,  # greedy decoding
+    max_tokens=1024,
+)
+outputs = model.generate(
+    ["My name is John and"],
+    sampling_params=sampling_params,
+)
+```
+
+### Piecewise compilation and full graph custom passes (attention fusion, sequence parallelism)
+
+Unfortunately, some custom compile passes have to see the whole graph to be effective and hence aren't compatible with piecewise compilation. This includes `AttnFusionPass` and `SequenceParallelismPass`. As a short-term solution, we automatically disable piecewise compilation (by setting `splitting_ops=[]`) when attention fusion is enabled. We use CUDA Graph modes `FULL` or `FULL_DECODE_ONLY` (depending on backend support). However, this leads to another optimization incompatibility and confusing performance tradeoffs.
+
+Long term, we've added the ability to partition the graph in Inductor instead of right after Dynamo. It can be enabled with `CompilationConfig.use_inductor_graph_partition=True` but is currently experimental and only available with `torch>=2.9`. This also increases compilation time as it has to compile the whole graph and cannot reuse piecewise compilation artifacts. Once vLLM supports 2.9, we plan to make this the default approach as it will also speed up piecewise cudagraph capture.
+
+## About the Performance
+
+See the following links for examples:
+
+* [20059#issuecomment-3160858458](https://github.com/vllm-project/vllm/pull/20059#issuecomment-3160858458)
+* [20059#issuecomment-3188735226](https://github.com/vllm-project/vllm/pull/20059#issuecomment-3188735226)
+* [20059#issuecomment-3219888738](https://github.com/vllm-project/vllm/pull/20059#issuecomment-3219888738)
diff --git a/docs/design/dbo.md b/docs/design/dbo.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2d98ccd063fa38f5146b709b320d897a95ef6cf
--- /dev/null
+++ b/docs/design/dbo.md
@@ -0,0 +1,88 @@
+# Dual Batch Overlap
+
+## Motivation
+
+The core motivation of the DBO system in vLLM is to overlap the sparse all-to-all communication in the MoE layer with the surrounding computation. This system currently only targets DP+EP deployments.
+
+## Introduction
+
+The Dual Batch Overlap system works by splitting the batch in the model runner, creating two worker threads, and then running the model on each of these worker threads. When DBO is enabled, yield points within the `FusedMoEModularKernel` allow the two CPU worker threads (also called UBatch threads) to ping-pong between each other so that when one is running compute, the other is waiting on communication. Throughout the code, ubatch may be used as a short form of microbatch; this is an ASCII-friendly version of the short form µ-batch.
+
+The DBO system includes modifications to `GpuModelRunner` and `ModularKernel`, and defines two utility classes: `UBatchWrapper` and `UBatchContext`. `UBatchWrapper` manages thread lifecycle and CUDA graph execution of the model. `UBatchContext` wraps `ForwardContext` to coordinate synchronization between the two UBatch threads.
+
+Below is the overlap schedule that is currently implemented in vLLM.
+
+```python
+# Schedule notation legend:
+#    S = Shared expert
+#    A0 = MLA qkv proj,
+#    A1 = Core attn + out proj + MoE gate
+#    D = Dispatch
+#    C = Combine
+
+# Comp: |-A0₀-A1₀-||-MLP₁-||-S₁-MLP₀-||-S₀-A0₁-A1₁-|
+# Comm: |----D₁---||--D₀--||----C₁---||-----C₀-----|
+# Order: D₁ send, A0₀, A1₀, D₁ recv, D₀ send, MLP₁, D₀ recv,
+#        C₁ send, S₁, MLP₀, C₁ recv, C₀ send, S₀, A0₁, A1₁, C₀ recv.
+# MLP_SHARED_OVERLAP = "mlp_shared_overlap"
+```
+
+## Running with DBO
+
+To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve command. This must be run in conjunction with `--data-parallel-size N` where N is greater than 1 and `--enable-expert-parallel`. Additionally, there are two configuration knobs.
+
+* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
+* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
+
+Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
+
+Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
+EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency`
+
+Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`
+
+## DBO Components
+
+* GPUModelRunner
+* UBatchWrapper
+* UBatchContext
+
+### GPU Model Runner
+
+The batch is split into microbatches by the `GPUModelRunner` class. This is accomplished in two steps. First, coordination across all DP ranks is performed to determine whether microbatching will be applied. Microbatching must be uniform across all DP ranks. If microbatching is not feasible for any DP rank, it is disabled for all ranks. If all DP ranks are going to microbatch, the total number of tokens is padded up to the max number of tokens amongst all ranks. If any rank would end up with an empty second microbatch after the padding is applied, microbatching will be aborted and no ranks will microbatch. Once microbatching has been initiated by all ranks, the second step is performed. The `CommonAttentionMetadata` is sliced in half by the `GPUModelRunner` so that there is one attention metadata per-microbatch.
+
+### UBatchWrapper
+
+gpu_ubatch_wrapper
+
+The `UBatchWrapper` class is a model wrapper that's responsible for all of the thread, UBatchContext, and CUDA graph management for DBO. It's designed to be relatively transparent to the GPU Model Runner.
+
+The implementation runs the model twice, once for each microbatch. Each model invocation occurs within a UBatch thread. These threads are launched in parallel and are synchronized using the `UBatchContext`. Each thread is provided with a sliced version of the attention metadata that is used to run its half of the batch.
+
+CUDA graphs for DBO are entirely managed by the `UBatchWrapper`. Because of this, DBO only supports running with Full CUDA graphs. However, once a DBO CUDA graph has been captured, it can be replayed without any multithreading or CPU synchronization.
+
+#### Interfaces
+
+The `__init__` method takes in the model, VllmConfig, CUDAGraphMode, and device.
+
+The `forward` method exclusively takes in model arguments. It determines whether or not to run with DBO based on whether a `ubatch_slices` object is present in the `forward_context`. Otherwise, the model is run without DBO.
+
+### UBatchContext
+
+ubatch_context
+
+The `UBatchContext` class is a `ForwardContext` wrapper class that is used by the `UBatchWrapper` class to synchronize the two UBatch threads. It should only be instantiated by using `make_ubatch_contexts`.
+
+When one of the UBatch threads reaches a `dbo_yield` call, it pauses, and starts the other thread which will run until it reaches the same `dbo_yield` call. This "ping-pong" dynamic continues, with threads swapping at each `dbo_yield call`, until the model's execution is complete.
+
+The current implementation has all `dbo_yield` and `dbo_maybe_run_recv_hook` calls in the `FusedMoEModularKernel.forward` method.
+
+#### Interfaces
+
+The `make_ubatch_context` function initializes two `UBatchContexts`, one for each UBatch thread. It takes two CUDA streams, the preexisting `ForwardContexts` and a CPU thread barrier. This function should be used exclusively to instantiate `UBatchContexts`. It will handle all of the event initialization.
+
+The `dbo_register_recv_hook` method registers a callback that can be returned by the `FusedMoEPrepareAndFinalize` class in the other UBatch thread’s `UBatchContext`. The callback will be run when the other thread calls `dbo_maybe_run_recv_hook`. This is typically used to wait on an all-to-all kernel.
+
+The `dbo_maybe_run_recv_hook` method runs a callback that’s set by the `dbo_register_recv_hook` function if that callback exists.
+
+The `dbo_yield` method puts the current thread to sleep and wakes up the other UBatch thread.
diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
new file mode 100644
index 0000000000000000000000000000000000000000..3b454e851b54ee3837a498aa219f1ef281fb7dfb
--- /dev/null
+++ b/docs/design/debug_vllm_compile.md
@@ -0,0 +1,239 @@
+# How to debug the vLLM-torch.compile integration
+
+TL;DR:
+
+- use tlparse to acquire torch.compile logs. Include these logs in bug reports and/or support asks.
+- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
+
+| Online Flag | Offline Flag   |      Result |
+|----------|----------|-------------|
+| --enforce-eager | enforce_eager=True |  Turn off torch.compile and CUDAGraphs |
+| -O.mode=0 | mode=CompilationMode.NONE |  Turn off torch.compile only |
+| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(mode=CompilationMode.NONE) |  Turn off CUDAGraphs only |
+| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') |  Turn off TorchInductor |
+
+## vLLM-torch.compile overview
+
+To improve performance, vLLM leverages torch.compile and CUDAGraphs to speed things up.
+torch.compile generates optimized kernels for PyTorch code while CUDAGraphs eliminates overhead.
+Most notably, vLLM-compile is NOT torch.compile, it is a custom compiler built using internal PyTorch Compile APIs.
+
+![vLLM-compile diagram](../assets/design/debug_vllm_compile/design_diagram.png)
+
+- Given a model, we do a full graph capture via TorchDynamo that is dynamic on the batch size (number of tokens)
+- vLLM then optionally splits and/or specializes this graph and then uses TorchInductor to compile each graph into a compiled artifact.
+This step may use vLLM custom Inductor passes to further optimize the graph.
+- The compiled artifact is saved to vLLM's compile cache so that it can be loaded in the future.
+- vLLM applies CUDAGraphs to reduce CPU overheads.
+
+Things can go wrong in each of the four steps. When something does go wrong, please try to isolate the subsystem
+that went wrong -- this will allow you to turn off the minimal number of things to keep reliability
+goals while minimizing impact to performance and also helps us (vLLM) when you open a bug report.
+
+For more details on the design, please see the following resources:
+
+- [Introduction to vLLM-torch.compile blogpost](https://blog.vllm.ai/2025/08/20/torch-compile.html)
+- [vLLM-torch.compile integration design](https://docs.vllm.ai/en/latest/design/torch_compile.html)
+- [vLLM Office Hours #26](https://www.youtube.com/live/xLyxc7hxCJc?si=Xulo9pe53C6ywf0V&t=561)
+- [Talk at PyTorch Conference 2025](https://youtu.be/1wV1ESbGrVQ?si=s1GqymUfwiwOrDTg&t=725)
+
+## Use tlparse
+
+Use [tlparse](https://github.com/meta-pytorch/tlparse) to acquire torch.compile logs. These logs show all stages of the compilation process,
+including the fused kernels that torch.compile produces.
+If you can, we recommend sending these or pieces of these along with any bug reports --
+they are very helpful.
+
+Install tlparse:
+
+```sh
+pip install tlparse
+```
+
+Usage (offline inference)
+
+```sh
+TORCH_TRACE=~/trace_dir python my_script.py
+tlparse ~/trace_dir/<the_first_log_file>
+```
+
+Usage (serving)
+
+```sh
+TORCH_TRACE=~/trace_dir vllm serve
+# ctrl-c out of the server
+tlparse ~/trace_dir/<the_first_log_file>
+```
+
+The `tlparse` command outputs some HTML files (perhaps into e.g. `./tl_out/index.html`).
+Open it to see the logs. It'll look something like the following:
+
+![tlparse example](../assets/design/debug_vllm_compile/tlparse_inductor.png)
+
+## Turn off vLLM-torch.compile integration
+
+Pass `--enforce-eager` to turn off the vLLM-torch.compile integration and run entirely
+in eager mode. This includes turning off CUDAGraphs.
+
+```sh
+# Online
+vllm serve --enforce-eager
+```
+
+```py
+# Offline
+LLM(model, enforce_eager=True)
+```
+
+To turn off just torch.compile, pass `mode = NONE` to the compilation config.
+(`-O` is short for `--compilation_config`):
+
+```sh
+# Online
+vllm serve -O.mode=0
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig, CompilationMode
+LLM(model, compilation_config=CompilationConfig(mode=CompilationMode.NONE))
+```
+
+To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
+
+```sh
+# Online
+vllm serve -O.cudagraph_mode=NONE
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig, CUDAGraphMode
+LLM(model, compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE))
+```
+
+## Debugging TorchDynamo
+
+vLLM requires model code be capturable into a full graph via TorchDynamo (torch.compile's frontend).
+TorchDynamo does not support all of Python. It will error (in fullgraph mode) if it cannot support
+a feature (this is sometimes known as a graph break).
+
+If you encounter a graph break, please [open an issue to pytorch/pytorch](https://github.com/pytorch/pytorch) so the PyTorch devs can prioritize.
+Then, try your best to rewrite the code to avoid the graph break.
+For more information, see this [Dynamo guide](https://docs.pytorch.org/docs/stable/compile/programming_model.dynamo_core_concepts.html).
+
+## Debugging Dynamic Shape full graph capture
+
+vLLM requires that the model's forward pass be capturable into a full graph that is dynamic
+on the batch size (i.e. the number of tokens). It (by default) compiles this one graph into
+one artifact and uses this artifact for all batch sizes.
+
+If your code cannot be captured with Dynamic Shapes, you may see silent incorrectness,
+loud errors, or CUDA illegal memory accesses. For example, the following is not
+capturable into a single graph:
+
+```py
+if data.size[0] % 128 == 0:
+    foo(...)
+else:
+    bar(...)
+```
+
+This problem is easy to diagnose. Use tlparse and click on `compilation_metrics`:
+it will tell you symbolic constraints on the batch size. If there is any constraint
+that restricts the batch sizes, then we've got a problem.
+
+![Bad tlparse example](../assets/design/debug_vllm_compile/dynamic_shapes.png)
+
+To avoid this, please either:
+
+1. avoid branching on the number of tokens
+2. wrap the branching logic into a custom operator. TorchDynamo does not
+trace into custom operators.
+
+## Debugging TorchInductor
+
+TorchInductor takes a captured graph and then compiles it down to some Python code
+that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may
+produce an incorrect triton kernel. This may manifest as silent incorrectness,
+CUDA illegal memory accesses, or loud errors.
+
+To debug if TorchInductor is at fault, you can disable it by passing `backend='eager'`
+to the compilation config:
+
+```sh
+# online
+vllm serve -O.backend=eager
+```
+
+```py
+# offline
+LLM(compilation_config=CompilationConfig(backend='eager'))
+```
+
+If Inductor is at fault, [file a bug to PyTorch](https://github.com/pytorch/pytorch).
+If you're feeling adventurous, you can debug the triton kernels in the Inductor output code
+(that you can locate via using tlparse).
+
+![tlparse example](../assets/design/debug_vllm_compile/tlparse_inductor.png)
+
+You can also use `TORCH_LOGS=output_code <command>` to print the Inductor output code.
+
+### Editable TorchInductor code
+
+You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`
+or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.
+
+This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`)
+and print statements in the output code.
+
+## Debugging vLLM-compile cache
+
+vLLM built its own cache for torch.compile artifacts. The idea is that the artifacts
+can be compiled once and then reused after they have been compiled. This
+is a layer on top of [torch.compile's compiler cache](https://docs.pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html).
+
+While torch.compile's compiler cache is rock-stable, vLLM's compiler cache is unfortunately
+not always correct. You can disable it via setting `VLLM_DISABLE_COMPILE_CACHE=1`.
+
+You can also manually remove this cache.
+
+- Remove vLLM's compile cache with `rm -rf ~/.cache/vllm` (look at logs to see if the location changed)
+- Remove torch.compile's built-in caches with `rm -rf /tmp/torchinductor_$(whoami)`
+
+vLLM's cache is a mapping from cache key to a compiled artifact. vLLM computes
+the cache key via combining multiple factors (e.g. config flags and model name).
+If vLLM's compile cache is wrong, this usually means that a factor is missing.
+Please see [this example](https://github.com/vllm-project/vllm/blob/18b39828d90413d05d770dfd2e2f48304f4ca0eb/vllm/config/model.py#L310)
+of how vLLM computes part of the cache key.
+
+## Debugging CUDAGraphs
+
+CUDAGraphs is a feature that allows one to:
+
+- Capture a callable that launches 1+ CUDA kernels into a CUDAGraph
+- Replay the CUDAGraph
+
+The captured CUDAGraph contains all of the memory used during the capture process.
+The replay of the CUDAGraph reads and writes to exactly the same regions of memory.
+
+This leads to some restrictions:
+
+1. In order to use CUDAGraphs on new data, you'll need to copy the data into a buffer
+that the CUDAGraph is reading from
+2. CUDAGraphs only capture CUDA kernels, they don't capture work done on CPU.
+
+vLLM uses the raw CUDAGraphs API, which is unsafe when used incorrectly.
+
+To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
+
+```sh
+# Online
+vllm serve -O.cudagraph_mode=NONE
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig, CUDAGraphMode
+LLM(model, compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE))
+```
diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md
index cb2037b575e53271bf87c888a14ccd546b9baf97..e1a96be6c3445b7d517faca53daf19b0bb8231f8 100644
--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py)
+FusedMoEModularKernel is implemented [here](../..//vllm/model_executor/layers/fused_moe/modular_kernel.py)
 
 Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
 
@@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use
 
 The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
 
-![](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png "FusedMoE Non-Batched")
+![FusedMoE Non-Batched](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png)
 
-![](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png "FusedMoE Batched")
+![FusedMoE Batched](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png)
 
 !!! note
     The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain.
@@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,
 
 The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class.
 
-Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
+Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
 
 `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
 The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
@@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
 The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive`  and `finalize` functions.
 The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers.  Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts.  The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
 
-![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
+![FusedMoEPrepareAndFinalize Blocks](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png)
 
 ### FusedMoEPermuteExpertsUnpermute
 
@@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in
 It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section.
 `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use.
 
-![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks")
+![FusedMoEPermuteExpertsUnpermute Blocks](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png)
 
 ### FusedMoEModularKernel
 
@@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
 
 #### Step 1: Add an All2All manager
 
-The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
+The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).
 
 #### Step 2: Add a FusedMoEPrepareAndFinalize Type
 
@@ -213,59 +213,37 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl
 
 ### How To Unit Test
 
-We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py).
+We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).
 
 The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPremuteExpertsUnpermute` types and if they are
 compatible, runs some correctness tests.
 If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,
 
-1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](gh-file:tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
+1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
 2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
 `Config::is_fe_16bit_supported()`,  `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
-`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](gh-file:tests/kernels/moe/modular_kernel_tools/common.py)
+`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py)
 
 Doing this will add the new implementation to the test suite.
 
 ### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
 
-The unit test file [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
+The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
 Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
 As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
 with incompatible types, the script will error.
 
 ### How To Profile
 
-Please take a look at [profile_modular_kernel.py](gh-file:tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
+Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
 The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
 `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
 Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
 
 ## FusedMoEPrepareAndFinalize Implementations
 
-The following table lists the `FusedMoEPrepareAndFinalize` implementations at the time of writing,
-
-| Implementation | Type | Comments |
-| :--- | :--- | :--- |
-| DeepEPHTPrepareAndFinalize | Contiguous / Non-Batched | Uses the DeepEP High-Throughput all2all kernels. |
-| DeepEPLLPrepareAndFinalize | Batched | Uses the DeepEP Low-Latency all2all kernels. |
-| PplxPrepareAndFinalize | Batched | Uses the Perplexity all2all kernels. |
-| FlashInferCutlassMoEPrepareAndFinalize | Contiguous | |
-| MoEPrepareAndFinalizeNoEP | Contiguous | This implementation is used when there is no EP. i.e. no all2all kernels are invoked. |
-| BatchedPrepareAndFinalize | Batched | A reference prepare/finalize class that reorganizes the tokens into expert batched format, i.e. E x max_num_tokens x K. (Doesn’t use any all2all kernels. This is primarily used in unit testing) |
+See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-modular-all2all-backends) for a list of all the available modular prepare and finalize subclasses.
 
 ## FusedMoEPermuteExpertsUnpermute
 
-The following table lists the `FusedMoEPermuteExpertsUnpermute` implementations at the time of writing,
-
-| Implementation | Type | Comment |
-| :--- | :--- | :--- |
-| BatchedDeepGemmExperts | Batched | Uses the DeepGemm’s Masked Grouped Gemm kernels for the fused_moe operation. |
-| BatchedTritonExperts | Batched | Uses a Triton Kernel for the Batched matmuls. |
-| BatchedTritonOrDeepGemmExperts | Batched | Chooses either the `BatchedDeepGemmExperts` or `BatchedTritonExperts` based on environment settings. |
-| DeepGemmExperts | Contiguous / Non-Batched | Uses DeepGemm’s Grouped Gemm kernels for fused_moe operation. |
-| TritonExperts | Contiguous / Non-Batched | Uses a Triton Kernel for fused_moe matmuls. |
-| TritonOrDeepGemmExperts | Contiguous / Non-Batched | Chooses either the `DeepGemmExperts` or `TritonExperts` based on fused_moe inputs. |
-| CutlassExpertsFP8 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp8 matmuls. |
-| CutlassExpertsFP4 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp4 matmuls. |
-| FlashInferExperts | Contiguous | Uses fused_moe operation from FlashInfer |
-| NaiveBatchedExperts | Batched | Reference Batched Experts implementation. Primarily used in unit tests. |
+See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-experts-kernels) for a list of all the available modular experts.
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index e70ee4a076e54ad77d739331006841a0a9d81ff7..91ab4deae71df33b45deb52fe1be383a2e7e81c2 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -1,19 +1,18 @@
 # IO Processor Plugins
 
-IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
+IO Processor plugins are a feature that allows pre- and post-processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
 
 When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
 
 ## Writing an IO Processor Plugin
 
-IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
+IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:
 
 ```python
-IOProcessorInput = TypeVar('IOProcessorInput')
-IOProcessorOutput = TypeVar('IOProcessorOutput')
+IOProcessorInput = TypeVar("IOProcessorInput")
+IOProcessorOutput = TypeVar("IOProcessorOutput")
 
 class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
-
     def __init__(self, vllm_config: VllmConfig):
         self.vllm_config = vllm_config
 
@@ -21,52 +20,66 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     def pre_process(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         raise NotImplementedError
 
     async def pre_process_async(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         return self.pre_process(prompt, request_id, **kwargs)
 
     @abstractmethod
-    def post_process(self,
-                     model_output: Sequence[PoolingRequestOutput],
-                     request_id: Optional[str] = None,
-                     **kwargs) -> IOProcessorOutput:
+    def post_process(
+        self,
+        model_output: Sequence[PoolingRequestOutput],
+        request_id: str | None = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
         raise NotImplementedError
 
     async def post_process_async(
         self,
         model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
     ) -> IOProcessorOutput:
-        collected_output = [item async for i, item in model_output]
+        # We cannot guarantee outputs are returned in the same order they were
+        # fed to vLLM.
+        # Let's sort them by id before post_processing
+        sorted_output = sorted(
+            [(i, item) async for i, item in model_output], key=lambda output: output[0]
+        )
+        collected_output = [output[1] for output in sorted_output]
         return self.post_process(collected_output, request_id, **kwargs)
 
     @abstractmethod
     def parse_request(self, request: Any) -> IOProcessorInput:
         raise NotImplementedError
 
+    def validate_or_generate_params(
+        self, params: SamplingParams | PoolingParams | None = None
+    ) -> SamplingParams | PoolingParams:
+        return params or PoolingParams()
+
     @abstractmethod
     def output_to_response(
-            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        self, plugin_output: IOProcessorOutput
+    ) -> IOProcessorResponse:
         raise NotImplementedError
 ```
 
 The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
 The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
 The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
+The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
+The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
 
-The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
-
-An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples.
 
 ## Using an IO Processor plugin
 
diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md
index 20d78ca3aae2c08edaa0abdb043795f389f330ca..8eadeb386fcf25c7622aa7ff15ca8a08d7052d15 100644
--- a/docs/design/logits_processors.md
+++ b/docs/design/logits_processors.md
@@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
     from collections.abc import Sequence
     from dataclasses import dataclass
     from enum import Enum, auto
-    from typing import TYPE_CHECKING, Optional
+    from typing import TYPE_CHECKING
 
     import torch
 
@@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
         @abstractmethod
         def update_state(
             self,
-            batch_update: Optional["BatchUpdate"],
+            batch_update: "BatchUpdate" | None,
         ) -> None:
             """Called when there are new output tokens, prior
             to each forward pass.
@@ -254,7 +254,15 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
                 changes to the batch makeup.
             """
             raise NotImplementedError
-            
+
+        @classmethod
+        def validate_params(cls, sampling_params: SamplingParams):
+            """Validate sampling params for this logits processor.
+
+            Raise ValueError for invalid ones.
+            """
+            return None
+
     ```
 
 A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods:
@@ -274,11 +282,15 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
     * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
     * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
 
-* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
     * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
     * Use the `BatchUpdate` members to update logits processor internal state
     * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
 
+* `validate_params(cls, sampling_params: SamplingParams)`:
+    * Raise `ValueError` if `SamplingParams` has invalid arguments (especially custom arguments) used by logits processor.
+    * When request is sent to entrypoint, `validate_params()` will validate `SamplingParams` and refuse request with invalid arguments.
+
 ### `BatchUpdate` data structure
 
 The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`):
@@ -399,7 +411,7 @@ Logits processor `update_state()` implementations should assume the following mo
 
         * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
 
-        * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
+        * **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
 
 5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
 
@@ -536,7 +548,7 @@ Built-in logits processors are always loaded when the vLLM engine starts. See th
 
 Review these logits processor implementations for guidance on writing built-in logits processors.
 
-Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforemented logits processor programming model.
+Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model.
 
 * Allowed token IDs
 
diff --git a/docs/design/lora_resolver_plugins.md b/docs/design/lora_resolver_plugins.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd0dc6dc9c7bb0046e0dd3bab508d173d8609e29
--- /dev/null
+++ b/docs/design/lora_resolver_plugins.md
@@ -0,0 +1,220 @@
+# LoRA Resolver Plugins
+
+This directory contains vLLM's LoRA resolver plugins built on the `LoRAResolver` framework.
+They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts.
+
+## Overview
+
+LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM
+receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt
+to locate and load the adapter from their configured storage locations. This enables:
+
+- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
+- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
+- **Automatic Discovery**: Seamless integration with existing LoRA workflows
+- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
+
+## Prerequisites
+
+Before using LoRA Resolver Plugins, ensure the following environment variables are configured:
+
+### Required Environment Variables
+
+1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading
+   ```bash
+   export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
+   ```
+
+2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list)
+   ```bash
+   export VLLM_PLUGINS=lora_filesystem_resolver
+   ```
+
+3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver
+   ```bash
+   export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
+   ```
+
+### Optional Environment Variables
+
+- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to empty string, no plugins will be loaded.
+
+## Available Resolvers
+
+### lora_filesystem_resolver
+
+The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure.
+
+#### Setup Steps
+
+1. **Create the LoRA adapter storage directory**:
+   ```bash
+   mkdir -p /path/to/lora/adapters
+   ```
+
+2. **Set environment variables**:
+   ```bash
+   export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
+   export VLLM_PLUGINS=lora_filesystem_resolver
+   export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
+   ```
+
+3. **Start vLLM server**:
+   Your base model can be `meta-llama/Llama-2-7b-hf`. Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`.
+   ```bash
+   python -m vllm.entrypoints.openai.api_server \
+       --model your-base-model \
+       --enable-lora
+   ```
+
+#### Directory Structure Requirements
+
+The filesystem resolver expects LoRA adapters to be organized in the following structure:
+
+```text
+/path/to/lora/adapters/
+├── adapter1/
+│   ├── adapter_config.json
+│   ├── adapter_model.bin
+│   └── tokenizer files (if applicable)
+├── adapter2/
+│   ├── adapter_config.json
+│   ├── adapter_model.bin
+│   └── tokenizer files (if applicable)
+└── ...
+```
+
+Each adapter directory must contain:
+
+- **`adapter_config.json`**: Required configuration file with the following structure:
+  ```json
+  {
+    "peft_type": "LORA",
+    "base_model_name_or_path": "your-base-model-name",
+    "r": 16,
+    "lora_alpha": 32,
+    "target_modules": ["q_proj", "v_proj"],
+    "bias": "none",
+    "modules_to_save": null,
+    "use_rslora": false,
+    "use_dora": false
+  }
+  ```
+
+- **`adapter_model.bin`**: The LoRA adapter weights file
+
+#### Usage Example
+
+1. **Prepare your LoRA adapter**:
+   ```bash
+   # Assuming you have a LoRA adapter in /tmp/my_lora_adapter
+   cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter
+   ```
+
+2. **Verify the directory structure**:
+   ```bash
+   ls -la /path/to/lora/adapters/my_sql_adapter/
+   # Should show: adapter_config.json, adapter_model.bin, etc.
+   ```
+
+3. **Make a request using the adapter**:
+   ```bash
+   curl http://localhost:8000/v1/completions \
+       -H "Content-Type: application/json" \
+       -d '{
+           "model": "my_sql_adapter",
+           "prompt": "Generate a SQL query for:",
+           "max_tokens": 50,
+           "temperature": 0.1
+       }'
+   ```
+
+#### How It Works
+
+1. When vLLM receives a request for a LoRA adapter named `my_sql_adapter`
+2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists
+3. If found, it validates the `adapter_config.json` file
+4. If the configuration matches the base model and is valid, the adapter is loaded
+5. The request is processed normally with the newly loaded adapter
+6. The adapter remains available for future requests
+
+## Advanced Configuration
+
+### Multiple Resolvers
+
+You can configure multiple resolver plugins to load adapters from different sources:
+
+'lora_s3_resolver' is an example of a custom resolver you would need to implement
+
+```bash
+export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver
+```
+
+All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds.
+
+### Custom Resolver Implementation
+
+To implement your own resolver plugin:
+
+1. **Create a new resolver class**:
+   ```python
+   from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
+   from vllm.lora.request import LoRARequest
+   
+   class CustomResolver(LoRAResolver):
+       async def resolve_lora(self, base_model_name: str, lora_name: str) -> Optional[LoRARequest]:
+           # Your custom resolution logic here
+           pass
+   ```
+
+2. **Register the resolver**:
+   ```python
+   def register_custom_resolver():
+       resolver = CustomResolver()
+       LoRAResolverRegistry.register_resolver("Custom Resolver", resolver)
+   ```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"**
+   - Ensure the directory exists and is accessible
+   - Check file permissions on the directory
+
+2. **"LoRA adapter not found"**
+   - Verify the adapter directory name matches the requested model name
+   - Check that `adapter_config.json` exists and is valid JSON
+   - Ensure `adapter_model.bin` exists in the directory
+
+3. **"Invalid adapter configuration"**
+   - Verify `peft_type` is set to "LORA"
+   - Check that `base_model_name_or_path` matches your base model
+   - Ensure `target_modules` is properly configured
+
+4. **"LoRA rank exceeds maximum"**
+   - Check that `r` value in `adapter_config.json` doesn't exceed `max_lora_rank` setting
+
+### Debugging Tips
+
+1. **Enable debug logging**:
+   ```bash
+   export VLLM_LOGGING_LEVEL=DEBUG
+   ```
+
+2. **Verify environment variables**:
+   ```bash
+   echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING
+   echo $VLLM_PLUGINS
+   echo $VLLM_LORA_RESOLVER_CACHE_DIR
+   ```
+
+3. **Test adapter configuration**:
+   ```bash
+   python -c "
+   import json
+   with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f:
+       config = json.load(f)
+   print('Config valid:', config)
+   "
+   ```
diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 90b2fd32f2979421466c6f7efafb1cb50888f53c..313c9aaebd26bb2100c9faa88727a47fecbbfcb0 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -1,12 +1,12 @@
 # Metrics
 
-Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
+vLLM exposes a rich set of metrics to support observability and capacity planning for the V1 engine.
 
 ## Objectives
 
-- Achieve parity of metrics between v0 and v1.
-- The priority use case is accessing these metrics via Prometheus, as this is what we expect to be used in production environments.
-- Logging support (i.e. printing metrics to the info log) is provided for more ad-hoc testing, debugging, development, and exploratory use cases.
+- Provide comprehensive coverage of engine and request level metrics to aid production monitoring.
+- Prioritize Prometheus integrations, as this is what we expect to be used in production environments.
+- Offer logging support (i.e. printing metrics to the info log) for ad-hoc testing, debugging, development, and exploratory use cases.
 
 ## Background
 
@@ -17,51 +17,42 @@ Metrics in vLLM can be categorized as follows:
 
 The mental model is that server-level metrics help explain the values of request-level metrics.
 
-### v0 Metrics
-
-In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
-
-- `vllm:num_requests_running` (Gauge)
-- `vllm:num_requests_swapped` (Gauge)
-- `vllm:num_requests_waiting` (Gauge)
-- `vllm:gpu_cache_usage_perc` (Gauge)
-- `vllm:cpu_cache_usage_perc` (Gauge)
-- `vllm:gpu_prefix_cache_hit_rate` (Gauge)
-- `vllm:cpu_prefix_cache_hit_rate` (Gauge)
-- `vllm:prompt_tokens_total` (Counter)
-- `vllm:generation_tokens_total` (Counter)
-- `vllm:request_success_total` (Counter)
-- `vllm:request_prompt_tokens` (Histogram)
-- `vllm:request_generation_tokens` (Histogram)
-- `vllm:time_to_first_token_seconds` (Histogram)
-- `vllm:time_per_output_token_seconds` (Histogram)
-- `vllm:e2e_request_latency_seconds` (Histogram)
-- `vllm:request_queue_time_seconds` (Histogram)
-- `vllm:request_inference_time_seconds` (Histogram)
-- `vllm:request_prefill_time_seconds` (Histogram)
-- `vllm:request_decode_time_seconds` (Histogram)
-- `vllm:request_max_num_generation_tokens` (Histogram)
-- `vllm:num_preemptions_total` (Counter)
-- `vllm:cache_config_info` (Gauge)
-- `vllm:lora_requests_info` (Gauge)
-- `vllm:tokens_total` (Counter)
-- `vllm:iteration_tokens_total` (Histogram)
-- `vllm:time_in_queue_requests` (Histogram)
-- `vllm:model_forward_time_milliseconds` (Histogram)
-- `vllm:model_execute_time_milliseconds` (Histogram)
-- `vllm:request_params_n` (Histogram)
-- `vllm:request_params_max_tokens` (Histogram)
-- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
-- `vllm:spec_decode_efficiency` (Gauge)
-- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
-- `vllm:spec_decode_num_draft_tokens_total` (Counter)
-- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
+### Metrics Overview
+
+### v1 Metrics
+
+In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
+
+- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
+- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
+- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
+- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
+- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
+- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
+- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
+- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
+- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
+- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
+- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
+- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
+- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
+- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
+- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
+- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
+- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
+- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
+- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
+- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
+- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
+- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
+- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
+- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
 
 These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md).
 
 ### Grafana Dashboard
 
-vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
+vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana/README.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
 
 The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
 
@@ -80,13 +71,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
 - `vllm:request_decode_time_seconds` - Requests decode time.
 - `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.
 
-See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful background on the choices made here.
+See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.
 
 ### Prometheus Client Library
 
-Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
+Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.
 
-With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
+During those migrations we briefly lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
 
 ```bash
 $ curl http://0.0.0.0:8000/metrics 2>/dev/null  | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
@@ -99,7 +90,9 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
 
 ### Multi-process Mode
 
-In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
+Historically, metrics were collected in the engine core process and multiprocess mode was used to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
+
+More recently, metrics are collected in the API server process and multiprocess mode is only used when `--api-server-count > 1`. See <https://github.com/vllm-project/vllm/pull/17546> and details on [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).
 
 ### Built in Python/Process Metrics
 
@@ -116,41 +109,37 @@ The following metrics are supported by default by `prometheus_client`, but they
 - `process_open_fds`
 - `process_max_fds`
 
-This is relevant because if we move away from multiprocess mode in v1,
-we get these back. However, it's questionable how relevant these are
-if they don't aggregate these stats for all processes that make up a
-vLLM instance.
+Therefore, these metrics are unavailable when `--api-server-count > 1`. It's questionable how relevant these are since they do not aggregate these stats for all processes that make up a vLLM instance.
 
-### v0 PRs and Issues
+## Metrics Design
 
-For background, these are some of the relevant PRs which added the v0 metrics:
+The ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where was where much of the metrics design was planned. For example, see where [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
 
-- <gh-pr:1890>
-- <gh-pr:2316>
-- <gh-pr:2730>
-- <gh-pr:4464>
-- <gh-pr:7279>
+### Legacy PRs
 
-Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).
+To help understand the background to the metrics design, here are some of the relevant PRs which added the original, now legacy, metrics:
 
-## v1 Design
+- <https://github.com/vllm-project/vllm/pull/1890>
+- <https://github.com/vllm-project/vllm/pull/2316>
+- <https://github.com/vllm-project/vllm/pull/2730>
+- <https://github.com/vllm-project/vllm/pull/4464>
+- <https://github.com/vllm-project/vllm/pull/7279>
 
-### v1 PRs
+### Metrics Implementation PRs
 
-For background, here are the relevant v1 PRs relating to the v1
-metrics issue <gh-issue:10582>:
+For background, here are the relevant PRs relating to the metrics implementation <https://github.com/vllm-project/vllm/issues/10582>:
 
-- <gh-pr:11962>
-- <gh-pr:11973>
-- <gh-pr:10907>
-- <gh-pr:12416>
-- <gh-pr:12478>
-- <gh-pr:12516>
-- <gh-pr:12530>
-- <gh-pr:12561>
-- <gh-pr:12579>
-- <gh-pr:12592>
-- <gh-pr:12644>
+- <https://github.com/vllm-project/vllm/pull/11962>
+- <https://github.com/vllm-project/vllm/pull/11973>
+- <https://github.com/vllm-project/vllm/pull/10907>
+- <https://github.com/vllm-project/vllm/pull/12416>
+- <https://github.com/vllm-project/vllm/pull/12478>
+- <https://github.com/vllm-project/vllm/pull/12516>
+- <https://github.com/vllm-project/vllm/pull/12530>
+- <https://github.com/vllm-project/vllm/pull/12561>
+- <https://github.com/vllm-project/vllm/pull/12579>
+- <https://github.com/vllm-project/vllm/pull/12592>
+- <https://github.com/vllm-project/vllm/pull/12644>
 
 ### Metrics Collection
 
@@ -394,15 +383,14 @@ distinguish between per-adapter counts. This should be revisited.
 Note that `multiprocess_mode="livemostrecent"` is used - the most
 recent metric is used, but only from currently running processes.
 
-This was added in <gh-pr:9477> and there is
+This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
 [at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
-If we revisit this design and deprecate the old metric, we should reduce
-the need for a significant deprecation period by making the change in
-v0 also and asking this project to move to the new metric.
+If we revisit this design and deprecate the old metric, we should
+coordinate with downstream users so they can migrate before the removal.
 
 ### Prefix Cache metrics
 
-The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
+The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
 some interesting points which may be relevant to how we approach
 future metrics.
 
@@ -439,8 +427,8 @@ suddenly (from their perspective) when it is removed, even if there is
 an equivalent metric for them to use.
 
 As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
-[deprecated](gh-pr:2764) (with a comment in the code),
-[removed](gh-pr:12383), and then [noticed by a user](gh-issue:13218).
+[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
+[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).
 
 In general:
 
@@ -460,20 +448,20 @@ the project-wide deprecation policy.
 
 ### Unimplemented - `vllm:tokens_total`
 
-Added by <gh-pr:4464>, but apparently never implemented. This can just be
+Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
 removed.
 
 ### Duplicated - Queue Time
 
 The `vllm:time_in_queue_requests` Histogram metric was added by
-<gh-pr:9659> and its calculation is:
+<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:
 
 ```python
     self.metrics.first_scheduled_time = now
     self.metrics.time_in_queue = now - self.metrics.arrival_time
 ```
 
-Two weeks later, <gh-pr:4464> added `vllm:request_queue_time_seconds` leaving
+Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
 us with:
 
 ```python
@@ -491,7 +479,7 @@ if seq_group.is_finished():
 
 This seems duplicative, and one of them should be removed. The latter
 is used by the Grafana dashboard, so we should deprecate or remove the
-former from v0.
+former.
 
 ### Prefix Cache Hit Rate
 
@@ -500,7 +488,7 @@ See above - we now expose 'queries' and 'hits' counters rather than a
 
 ### KV Cache Offloading
 
-Two v0 metrics relate to a "swapped" preemption mode that is no
+Two legacy metrics relate to a "swapped" preemption mode that is no
 longer relevant in v1:
 
 - `vllm:num_requests_swapped`
@@ -511,7 +499,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
 memory. This is also known as "KV cache offloading" and is configured
 with `--swap-space` and `--preemption-mode`.
 
-In v0, [vLLM has long supported beam search](gh-issue:6226). The
+Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
 SequenceGroup encapsulated the idea of N Sequences which
 all shared the same prompt kv blocks. This enabled KV cache block
 sharing between requests, and copy-on-write to do branching. CPU
@@ -524,7 +512,7 @@ and the part of the prompt that was evicted can be recomputed.
 
 SequenceGroup was removed in V1, although a replacement will be
 required for "parallel sampling" (`n>1`).
-[Beam search was moved out of the core (in V0)](gh-issue:8306). There was a
+[Beam search was moved out of the core](https://github.com/vllm-project/vllm/issues/8306). There was a
 lot of complex code for a very uncommon feature.
 
 In V1, with prefix caching being better (zero over head) and therefore
@@ -535,11 +523,11 @@ better.
 
 ### Parallel Sampling
 
-Some v0 metrics are only relevant in the context of "parallel
+Some legacy metrics are only relevant in the context of "parallel
 sampling". This is where the `n` parameter in a request is used to
 request multiple completions from the same prompt.
 
-As part of adding parallel sampling support in <gh-pr:10980>, we should
+As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
 also add these metrics.
 
 - `vllm:request_params_n` (Histogram)
@@ -554,7 +542,7 @@ also add these metrics.
 
 ### Speculative Decoding
 
-Some v0 metrics are specific to "speculative decoding". This is where
+Some legacy metrics are specific to "speculative decoding". This is where
 we generate candidate tokens using a faster, approximate method or
 model and then validate those tokens with the larger model.
 
@@ -564,9 +552,9 @@ model and then validate those tokens with the larger model.
 - `vllm:spec_decode_num_draft_tokens_total` (Counter)
 - `vllm:spec_decode_num_emitted_tokens_total` (Counter)
 
-There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
+There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
 speculative decoding to v1. Other techniques will follow. We should
-revisit the v0 metrics in this context.
+revisit these metrics in this context.
 
 !!! note
     We should probably expose acceptance rate as separate accepted
@@ -585,7 +573,7 @@ see:
 - [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
 - [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
 - [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
-- <gh-issue:5041> and <gh-pr:12726>.
+- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.
   
 This is a non-trivial topic. Consider this comment from Rob:
 
@@ -639,7 +627,7 @@ metrics are often relatively straightforward to add:
    metrics are usually of very limited use unless they can be enabled
    by default and in production.
 3. They have an impact on development and maintenance of the
-   project. Every metric added to v0 has made this v1 effort more
+   project. Every metric added over time has made this effort more
    time-consuming, and perhaps not all metrics justify this ongoing
    investment in their maintenance.
 
@@ -650,24 +638,24 @@ performance and health. Tracing, on the other hand, tracks individual
 requests as they move through different services and components. Both
 fall under the more general heading of "Observability".
 
-v0 has support for OpenTelemetry tracing:
+vLLM has support for OpenTelemetry tracing:
 
-- Added by <gh-pr:4687>
+- Added by <https://github.com/vllm-project/vllm/pull/4687> and reinstated by <https://github.com/vllm-project/vllm/pull/20372>
 - Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces`
 - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
 - [User-facing docs](../examples/online_serving/opentelemetry.md)
 - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
 - [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)
-  
+
 OpenTelemetry has a
 [Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
 
-Since metrics is a big enough topic on its own, we are going to tackle
-the topic of tracing in v1 separately.
+Since metrics is a big enough topic on its own, we consider the topic
+of tracing to be quite separate from metrics.
 
 ### OpenTelemetry Model Forward vs Execute Time
 
-In v0, we have the following two metrics:
+The current implementation exposes the following two metrics:
 
 - `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
   in the model forward pass when this request was in the batch.
@@ -683,7 +671,7 @@ documentation for this option states:
 > use of possibly costly and or blocking operations and hence might
 > have a performance impact.
 
-The metrics were added by <gh-pr:7089> and who up in an OpenTelemetry trace
+The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and who up in an OpenTelemetry trace
 as:
 
 ```text
diff --git a/docs/design/mm_processing.md b/docs/design/mm_processing.md
index 1e9b6ad6e821e319affbd13d96523e00ec9e3312..ee56ac5b98ef338a0adb0d6c44063e46131818ed 100644
--- a/docs/design/mm_processing.md
+++ b/docs/design/mm_processing.md
@@ -1,6 +1,6 @@
 # Multi-Modal Data Processing
 
-To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
+To enable various optimizations in vLLM such as [chunked prefill](../configuration/optimization.md#chunked-prefill) and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
 
 Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
 
@@ -41,14 +41,10 @@ While HF processors support text + multi-modal inputs natively, this is not so f
 
 Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
 
-[](){ #mm-dummy-text }
-
 ### Dummy text
 
 We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
 
-[](){ #mm-automatic-prompt-updating }
-
 ### Automatic prompt updating
 
 We address the second issue by implementing model-agnostic code in
@@ -60,8 +56,8 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
 
 ## Processor Output Caching
 
-Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
+Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
 
 When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
 
-Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
+Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
new file mode 100644
index 0000000000000000000000000000000000000000..7663b82266f0ba06121abcafd7c6e1b2c22af93c
--- /dev/null
+++ b/docs/design/moe_kernel_features.md
@@ -0,0 +1,120 @@
+# Fused MoE Kernel features
+
+The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels.
+
+## Fused MoE Modular All2All backends
+
+There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend.
+
+The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support.
+
+The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document.
+
+The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type.  The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process.  If no quantization type is specified, the kernel operates on float16 and/or bfloat16.
+
+Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step).
+
+Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag.
+
+unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`.  All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP.
+
+<style>
+td {
+  padding: 0.5rem !important;
+  white-space: nowrap;
+}
+
+th {
+  padding: 0.5rem !important;
+  min-width: 0 !important;
+}
+</style>
+
+| Backend                               | Output act. format | Quant. types    | Quant. format          | Async | Apply Weight On Input | Sub-class                                                                                                                                                     |
+|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| naive                                 | standard           | all<sup>1</sup> | G,A,T                  | N     | <sup>6</sup>          | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl]                                                                                  |
+| pplx                                  | batched            | fp8,int8        | G,A,T                  | Y     | Y                     | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize]                                                 |
+| deepep_high_throughput                | standard           | fp8             | G(128),A,T<sup>2</sup> | Y     | Y                     | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize]                                    |
+| deepep_low_latency                    | batched            | fp8             | G(128),A,T<sup>3</sup> | Y     | Y                     | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize]                                    |
+| flashinfer_all2allv                   | standard           | nvfp4,fp8       | G,A,T                  | N     | N                     | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] |
+| flashinfer<sup>4</sup>                | standard           | nvfp4,fp8       | G,A,T                  | N     | N                     | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize]   |
+| flashinfer<sup>4</sup>                | standard           | nvfp4,fp8       | G,A,T                  | N     | N                     | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize]   |
+| MoEPrepareAndFinalizeNoEP<sup>5</sup> | standard           | fp8,int8        | G,A,T                  | N     | Y                     | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP]                                                |
+| BatchedPrepareAndFinalize<sup>5</sup> | batched            | fp8,int8        | G,A,T                  | N     | Y                     | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize]                                               |
+
+!!! info "Table key"
+    1. All types: mxfp4, nvfp4, int4, int8, fp8
+    2. A,T quantization occurs after dispatch.
+    3. All quantization happens after dispatch.
+    4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency")
+    5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine.  These cannot be selected via environment variable.  These are generally use for testing or adapting an expert subclass to the `fused_experts` API.
+    6. This depends on the experts implementation.
+
+    ---
+
+    - G - Grouped
+    - G(N) - Grouped w/block size N
+    - A - Per activation token
+    - T - Per tensor
+
+Modular kernels are supported by the following `FusedMoEMethodBase` classes.
+
+- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
+- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
+- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
+- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
+- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
+- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
+
+## Fused MoE Experts Kernels
+
+The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
+
+Each kernel must be provided with one of the supported input activation formats.  Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
+
+Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.
+
+Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results.
+
+As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts.
+
+Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`.
+
+To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats.
+
+| Kernel                       | Input act. format     | Quant. types     | Quant. format | Activation function                                         | Apply Weight On Input | Modular | Source                                                                                                                                                                                                                                                                                                      |
+|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| triton                       | standard              | all<sup>1</sup>  | G,A,T         | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y                     | Y       | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts]                                                                                                                                        |
+| triton (batched)             | batched               | all<sup>1</sup>  | G,A,T         | silu, gelu                                                  | <sup>6</sup>          | Y       | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts]                                                                                                                                                                                                       |
+| deep gemm                    | standard,</br>batched | fp8              | G(128),A,T    | silu, gelu                                                  | <sup>6</sup>          | Y       | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
+| cutlass_fp4                  | standard,</br>batched | nvfp4            | A,T           | silu                                                        | Y                     | Y       | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],</br>[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4]                                                                                                                        |
+| cutlass_fp8                  | standard,</br>batched | fp8              | A,T           | silu, gelu                                                  | Y                     | Y       | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8]            |
+| flashinfer                   | standard              | nvfp4,</br>fp8   | T             | <sup>5</sup>                                                | N                     | Y       | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts]                                                                            |
+| gpt oss triton               | standard              | N/A              | N/A           | <sup>5</sup>                                                | Y                     | Y       | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts]                                                                    |
+| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup>  | G(128),A,T    | silu, gelu                                                  | <sup>6</sup>          | Y       | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts]                                                 |
+| marlin                       | standard              | <sup>3</sup>     | <sup>3</sup>  | silu,</br>swigluoai                                         | Y                     | Y       | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts]          |
+| marlin experts               | standard,</br>batched | N/A              | N/A           | silu,</br>swigluoai                                         | Y                     | Y       | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts]                                                                                                            |
+| trtllm                       | standard              | mxfp4,</br>nvfp4 | G(16),G(32)   | <sup>5</sup>                                                | N                     | Y       | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts]                                                                                                                                                                                                                      |
+| pallas                       | standard              | N/A              | N/A           | silu                                                        | N                     | N       | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe]                                                                                                                                                                                                                                    |
+| iterative                    | standard              | N/A              | N/A           | silu                                                        | N                     | N       | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe]                                                                                                                                                                                                                           |
+| rocm aiter moe               | standard              | fp8              | G(128),A,T    | silu, gelu                                                  | Y                     | N       | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts]                                                                                                                                                                                           |
+| cpu_fused_moe                | standard              | N/A              | N/A           | silu                                                        | N                     | N       | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE]                                                                                                                                                                                                                             |
+| naive batched<sup>4</sup>    | batched               | int8,</br>fp8    | G,A,T         | silu, gelu                                                  | <sup>6</sup>          | Y       | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts]                                                                                                                                                                                                         |
+
+!!! info "Table key"
+    1. All types: mxfp4, nvfp4, int4, int8, fp8
+    2. A dispatcher wrapper around triton and deep gemm experts.  Will select based on type + shape + quantization params
+    3. uint4, uint8, fp8, fp4
+    4. This is a naive implementation of experts that supports batched format. Mainly used for testing.
+    5. The `activation` parameter is ignored and SwiGlu is used by default instead.
+    6. Only handled by or supported when used with modular kernels.
+
+## Modular Kernel "families"
+
+The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
+
+| backend                          | `FusedMoEPrepareAndFinalize` subclasses                    | `FusedMoEPermuteExpertsUnpermute` subclasses                                                                               |
+|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
+| deepep_high_throughput           | `DeepEPHTPrepareAndFinalize`                               |  `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts`                                  |
+| deepep_low_latency,</br>pplx     | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` |  `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts`|
+| flashinfer                       | `FlashInferCutlassMoEPrepareAndFinalize`                   | `FlashInferExperts`                                                                                                                                    |
diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md
index 6e92b20d267b4207e35c195fd44747e6e7cc4853..d6bd922788294a63a3ebe9365a052ef726b69747 100644
--- a/docs/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@@ -2,7 +2,7 @@
 
 ## Debugging
 
-Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
+Please see the [Troubleshooting](../usage/troubleshooting.md#python-multiprocessing)
 page for information on known issues and how to solve them.
 
 ## Introduction
@@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
 
 Related PRs:
 
-- <gh-pr:8823>
+- <https://github.com/vllm-project/vllm/pull/8823>
 
 ## Prior State in v1
 
diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md
index adf838306bc77fda7e5aa125bf2de57d103d1ff6..4674bef8d2b64ce90a17666b71857f20a7f329b8 100644
--- a/docs/design/p2p_nccl_connector.md
+++ b/docs/design/p2p_nccl_connector.md
@@ -97,7 +97,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -118,7 +118,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -139,7 +139,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -160,7 +160,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
@@ -190,7 +190,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -211,7 +211,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -232,7 +232,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -253,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index 37193809776ae03191f4815e856c18b180ea7c8b..dc2f7c4aed3c3ab5ba135a0a63b0610ed863e3de 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -41,7 +41,7 @@ Every plugin has three parts:
 
 1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
 2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
-3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
+3. **Plugin value**: The fully qualified name of the function or module to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
 
 ## Types of supported plugins
 
@@ -49,7 +49,9 @@ Every plugin has three parts:
 
 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
 
-- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name.
+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.
+
+- **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase.
 
 ## Guidelines for Writing Plugins
 
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index 9941837bf16522a1051eeca25627198fe47d7c47..bd4070f381d8164b2f8efb8ca9e67ff1b20c0792 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -112,8 +112,8 @@ class KVCacheBlock:
     ref_cnt: int
 
     # The pointers to form a doubly linked list for the free queue.
-    prev_free_block: Optional["KVCacheBlock"] = None
-    next_free_block: Optional["KVCacheBlock"] = None
+    prev_free_block: "KVCacheBlock | None" = None
+    next_free_block: "KVCacheBlock | None" = None
 ```
 
 There are two design points to highlight:
@@ -213,22 +213,22 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 
 ![Example Time 1](../assets/design/prefix_caching/example-time-1.png)
 
-**Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
+**Time 2: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
 
-![Example Time 3](../assets/design/prefix_caching/example-time-3.png)
+![Example Time 2](../assets/design/prefix_caching/example-time-3.png)
 
-**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
+**Time 3: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
 
-![Example Time 4](../assets/design/prefix_caching/example-time-4.png)
+![Example Time 3](../assets/design/prefix_caching/example-time-4.png)
 
-**Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1.
+**Time 4: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1.
 
-![Example Time 5](../assets/design/prefix_caching/example-time-5.png)
+![Example Time 4](../assets/design/prefix_caching/example-time-5.png)
 
-**Time 6: Request 1 is finished and free.**
+**Time 5: Request 1 is finished and free.**
 
-![Example Time 6](../assets/design/prefix_caching/example-time-6.png)
+![Example Time 5](../assets/design/prefix_caching/example-time-6.png)
 
-**Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
+**Time 6: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
 
-![Example Time 7](../assets/design/prefix_caching/example-time-7.png)
+![Example Time 6](../assets/design/prefix_caching/example-time-7.png)
diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 47ac4958dbf7ffd4b450bc05dee11f5007d02fe1..27edc4f89201dc63b907f69fb4bb4d50e1f2494e 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -2,7 +2,10 @@
 
 In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage.
 
-Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+Throughout the example, we will run a common Llama model, and turn on debug level logging to show all the details. The command to be used is `VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+
+!!! note
+    For more information and the latest progress of `torch.compile` integration, see this [Blog Post](https://blog.vllm.ai/2025/08/20/torch-compile.html).
 
 ## Compilation Cache
 
@@ -16,14 +19,16 @@ vLLM will take all the available factors into consideration, and decide a direct
 
 The factors considered include:
 
-- All the related configs (see the `compute_hash` functions in the [config.py](gh-file:vllm/config.py))
-- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
+- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](../../vllm/config))
+- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](../../vllm/compilation/compiler_interface.py))
 - The model's forward function and the relevant functions called by the forward function (see below)
 
 With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
 
 A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all the compilation finishes before we serve any requests. No requests will trigger new compilations. Otherwise, the engine would be blocked on that request, and the response time will have unexpected spikes.
 
+By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
+
 ## Python Code Compilation
 
 In the very verbose logs, we can see:
@@ -133,7 +138,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu
 
 ## Cudagraph Capture
 
-vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.
+vLLM's V1 architecture uses piecewise cudagraph that aligns with the piecewise compilation. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.
 
 The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention.
 
@@ -150,6 +155,4 @@ Then it will only capture cudagraph for the specified sizes. It can be useful to
 
 ### Full Cudagraph capture
 
-It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config '{"full_cuda_graph": true}'`.
-
-Currently only FlashAttention 3 is compatible, and only when cascade attention is disabled.
+It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models or MOEs. See [CUDA Graphs](cuda_graphs.md) for more details.
diff --git a/docs/features/README.md b/docs/features/README.md
index 10cc448cc2ee358adbbac34b411c4b353abbb91f..ad9de9ff8f368c62922c4c424312c7b088bb162e 100644
--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -36,45 +36,43 @@ th:not(:first-child) {
 }
 </style>
 
-| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
+| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
 |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
-| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
+| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
 | [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
 | [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
 | [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
 | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
 | [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
-| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
 | <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
 | <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
 | <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
 | multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
-| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
-| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
-| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
-| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
+| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
+| best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
+| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
+| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
 
 \* Chunked prefill and prefix caching are only applicable to last-token pooling.  
 <sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
 
-[](){ #feature-x-hardware }
-
 ### Feature x Hardware
 
-| Feature                                                   | Volta               | Turing    | Ampere    | Ada    | Hopper     | CPU                | AMD    | TPU |
-|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
-| [CP][chunked-prefill]                                     | [❌](gh-issue:2729) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅ |
-| [APC](automatic_prefix_caching.md)                        | [❌](gh-issue:3687) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅ |
-| [LoRA](lora.md)                                           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅ |
-| [SD](spec_decode.md)                                      | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | ❌ |
-| [pooling](../models/pooling_models.md)                    | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     | ❌ |
-| [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| <abbr title="Logprobs">logP</abbr>                        | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| <abbr title="Async Output Processing">async output</abbr> | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ❌     | ❌ |
-| multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8477) | ✅     | ❌ |
-| best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-| [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ?     | [❌](gh-issue:25097) |
+| Feature                                                   | Volta               | Turing    | Ampere    | Ada    | Hopper     | CPU                | AMD    | TPU | Intel GPU |
+|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------|
+| [CP](../configuration/optimization.md#chunked-prefill)                                     | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅ | ✅        |
+| [APC](automatic_prefix_caching.md)                        | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅ | ✅        |
+| [LoRA](lora.md)                                           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅ | ✅        |
+| [SD](spec_decode.md)                                      | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963)       |
+| CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970)        |
+| [pooling](../models/pooling_models.md)                    | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ | ✅        |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     | ❌ | ✅        |
+| [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965)       |
+| <abbr title="Logprobs">logP</abbr>                        | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ | ✅        |
+| <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ | ✅        |
+| <abbr title="Async Output Processing">async output</abbr> | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ❌     | ❌ | ✅        |
+| multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅     | ❌ | ✅        |
+| best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ | ✅        |
+| beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ | ✅        |
+| [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅       |
diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md
index c529da684e365f0b2e989a65e7c442777d671bae..3718a4b74eb26bae23d869d32a2a414a791ba62a 100644
--- a/docs/features/automatic_prefix_caching.md
+++ b/docs/features/automatic_prefix_caching.md
@@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
 
 Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
 
-<gh-file:examples/offline_inference/automatic_prefix_caching.py>
+[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
 
 ## Example workloads
 
diff --git a/docs/features/batch_invariance.md b/docs/features/batch_invariance.md
new file mode 100644
index 0000000000000000000000000000000000000000..b196db9d9c25ca2d56d098a7ed82fd8fd525b7db
--- /dev/null
+++ b/docs/features/batch_invariance.md
@@ -0,0 +1,133 @@
+# Batch Invariance
+
+!!! note
+    Batch invariance is currently in beta. Some features are still under active development.
+    Track progress and planned improvements at <https://github.com/vllm-project/vllm/issues/27433>
+
+This document shows how to enable batch invariance in vLLM. Batch invariance ensures that the output of a model is deterministic and independent of the batch size or the order of requests in a batch.
+
+## Motivation
+
+Batch invariance is crucial for several use cases:
+
+- **Framework debugging**: Deterministic outputs make it easier to debug issues in the inference framework, as the same input will always produce the same output regardless of batching.
+- **Model debugging**: Helps identify issues in model implementations by ensuring consistent behavior across different batch configurations.
+- **Reinforcement Learning (RL)**: RL training often requires deterministic rollouts for reproducibility and stable training.
+- **Large-scale inference systems**: Systems that use vLLM as a component benefit from deterministic behavior for testing, validation, and consistency guarantees.
+
+## Hardware Requirements
+
+Batch invariance currently requires NVIDIA GPUs with compute capability 9.0 or higher:
+
+- **H-series**: H100, H200
+- **B-series**: B100, B200
+
+## Enabling Batch Invariance
+
+Batch invariance can be enabled by setting the `VLLM_BATCH_INVARIANT` environment variable to `1`:
+
+```bash
+export VLLM_BATCH_INVARIANT=1
+```
+
+### Online Inference (Server Mode)
+
+To start a vLLM server with batch invariance enabled:
+
+```bash
+VLLM_BATCH_INVARIANT=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
+```
+
+Then use the OpenAI-compatible client:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="EMPTY",
+    base_url="http://localhost:8000/v1",
+)
+
+# These requests will produce deterministic outputs
+# regardless of batch size or order
+response = client.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    prompt="The future of AI is",
+    max_tokens=100,
+    temperature=0.7,
+    seed=42,
+)
+
+print(response.choices[0].text)
+```
+
+### Offline Inference
+
+For offline batch inference with batch invariance:
+
+```python
+import os
+os.environ["VLLM_BATCH_INVARIANT"] = "1"
+
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "The future of AI is",
+    "Machine learning enables",
+    "Deep learning models can",
+]
+
+sampling_params = SamplingParams(
+    temperature=0.7,
+    top_p=0.95,
+    max_tokens=100,
+    seed=42,
+)
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    tensor_parallel_size=1,
+)
+
+# Outputs will be deterministic regardless of batch size
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}")
+    print(f"Generated: {generated_text!r}\n")
+```
+
+## Tested Models
+
+Batch invariance has been tested and verified on the following models:
+
+- **DeepSeek series**: `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-V3-0324`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`
+- **Qwen3 (Dense)**: `Qwen/Qwen3-1.7B`, `Qwen/Qwen3-8B`
+- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`
+- **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`
+
+Other models may also work, but these have been explicitly validated. If you encounter issues with a specific model, please report them on the [GitHub issue tracker](https://github.com/vllm-project/vllm/issues/new/choose).
+
+## Implementation Details
+
+When batch invariance is enabled, vLLM:
+
+1. Uses deterministic kernel implementations for attention and other operations
+2. Ensures consistent numerical behavior across different batch sizes
+3. Disables certain optimizations that may introduce non-determinism (such as custom all-reduce operations in tensor parallel mode)
+
+!!! note
+    Enabling batch invariance may impact performance compared to the default non-deterministic mode. This trade-off is intentional to guarantee reproducibility.
+
+## Future Improvements
+
+The batch invariance feature is under active development. Planned improvements include:
+
+- Support for additional GPU architectures
+- Expanded model coverage
+- Performance optimizations
+- Additional testing and validation
+
+For the latest status and to contribute ideas, see the [tracking issue](https://github.com/vllm-project/vllm/issues/27433).
diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md
index 74ed40835b4d4fa623f8520986e9ad51679b95b0..728a2c89901de78fb6b64366f37c5aa9c6f056e4 100644
--- a/docs/features/custom_arguments.md
+++ b/docs/features/custom_arguments.md
@@ -4,6 +4,9 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t
 
 Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
 
+!!! note
+    Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour.
+
 ## Offline Custom Arguments
 
 Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`:
diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md
index 201b340c5972c1a262261b08702639f85c34e13f..5ddef9db1611bf7fc0b43fbeace5f313a70c85d9 100644
--- a/docs/features/custom_logitsprocs.md
+++ b/docs/features/custom_logitsprocs.md
@@ -18,6 +18,11 @@ In vLLM, logits processors operate at batch granularity. During a given engine s
 
 Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods:
 
+* `validate_params(cls, sampling_params: SamplingParams)`:
+    * Raise `ValueError` if `SamplingParams` has invalid arguments (especially custom arguments) used by logits processor.
+    * When request is sent to entrypoint, `validate_params()` will validate `SamplingParams` and refuse request with invalid arguments.
+    * **Note:** it's important to implement `validate_params()` to prevent invalid parameters for custom logits processor. Otherwise requests with invalid parameters can cause unexpected behaviour in custom logits processor.
+
 * `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
     * `vllm_config`: engine configuration data structure
     * `device`: hardware accelerator device info
@@ -66,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo
 
         * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
 
-        * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
+        * **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
 
 5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
 
@@ -93,7 +98,6 @@ The contrived example below implements a custom logits processor which consumes
 ??? code "Example custom logits processor definition"
 
     ``` python
-    from typing import Optional
     import torch
     from vllm.config import VllmConfig
     from vllm.sampling_params import SamplingParams
@@ -104,6 +108,14 @@ The contrived example below implements a custom logits processor which consumes
     class DummyLogitsProcessor(LogitsProcessor):
         """Fake logit processor to support unit testing and examples"""
 
+        @classmethod
+        def validate_params(cls, params: SamplingParams):
+            target_token: int | None = params.extra_args and params.extra_args.get(
+                "target_token"
+            )
+            if target_token is not None and not isinstance(target_token, int):
+                raise ValueError(f"target_token value {target_token} is not int")
+
         def __init__(self, vllm_config: "VllmConfig", device: torch.device,
                     is_pin_memory: bool):
             self.req_info: dict[int, int] = {}
@@ -112,13 +124,14 @@ The contrived example below implements a custom logits processor which consumes
             """Never impacts greedy sampling"""
             return False
 
-        def update_state(self, batch_update: Optional[BatchUpdate]):
+        def update_state(self, batch_update: BatchUpdate | None):
             if not batch_update:
                 return
 
             # Process added requests.
             for index, params, _, _ in batch_update.added:
                 assert params is not None
+                self.validate_params(params)
                 if params.extra_args and (target_token :=
                                         params.extra_args.get("target_token")):
                     self.req_info[index] = target_token
@@ -158,6 +171,7 @@ The contrived example below implements a custom logits processor which consumes
             logits[rows, cols] = values_to_keep
 
             return logits
+
     ```
 
 In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor.
@@ -181,7 +195,13 @@ RequestLogitsProcessor = Union[
 
 While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above.
 
-You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
+You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.):
+
+* Override `AdapterLogitsProcessor.validate_params(cls,params)` to validate request's sampling parameters.
+
+* Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit.
+
+* Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
 
 ??? code "Example of Wrapping a Request-Level Logits Processor"
 
@@ -221,6 +241,16 @@ You can wrap the request-level logits processor by subclassing `AdapterLogitsPro
         """Example of wrapping a fake request-level logit processor to create a
         batch-level logits processor"""
 
+        @classmethod
+        def validate_params(cls, params: SamplingParams):
+            target_token: Any | None = params.extra_args and params.extra_args.get(
+                "target_token"
+            )
+            if target_token is not None and not isinstance(target_token, int):
+                raise ValueError(
+                    f"target_token value {target_token} is not int"
+                )
+
         def is_argmax_invariant(self) -> bool:
             return False
 
@@ -241,18 +271,11 @@ You can wrap the request-level logits processor by subclassing `AdapterLogitsPro
             Returns:
             `Callable` request logits processor, or None
             """
-            target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+            target_token: Any | None = params.extra_args and params.extra_args.get(
                 "target_token"
             )
             if target_token is None:
                 return None
-            if not isinstance(target_token, int):
-                logger.warning(
-                    "target_token value %s is not int; not applying logits"
-                    " processor to request.",
-                    target_token,
-                )
-                return None
             return DummyPerReqLogitsProcessor(target_token)
     ```
 
@@ -263,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh
 
 ## Ways to Load Your Custom Logits Processor in vLLM
 
-Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests.
+Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
 
 This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
 
@@ -415,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`)
 
 ## Best Practices for Writing Custom Logits Processors
 
-Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently.
+Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently.
 
 * Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
     * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
@@ -442,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke
 
     * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
 
-* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
+* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
diff --git a/docs/features/disagg_encoder.md b/docs/features/disagg_encoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d40af7069822b0f09c145b5100a951ceeaefcca
--- /dev/null
+++ b/docs/features/disagg_encoder.md
@@ -0,0 +1,75 @@
+# Disaggregated Encoder
+
+A **disaggregated encoder** runs the vision-encoder stage of a multimodal LLM in a process that is separate from the pre-fill / decoder stage. Deploying these two stages in independent vLLM instances brings three practical benefits:
+
+1. **Independent, fine-grained scaling**  
+2. **Lower time-to-first-token (TTFT)**  
+3. **Cross-process reuse and caching of encoder outputs**
+
+Design doc: <https://docs.google.com/document/d/1aed8KtC6XkXtdoV87pWT0a8OJlZ-CpnuLLzmR8l9BAE>
+
+---
+
+## 1  Motivation
+
+### 1. Independent, fine-grained scaling
+
+* Vision encoders are lightweight, while language models are orders of magnitude larger.  
+* The language model can be parallelised without affecting the encoder fleet.  
+* Encoder nodes can be added or removed independently.
+
+### 2. Lower time-to-first-token (TTFT)
+
+* Language-only requests bypass the vision encoder entirely.  
+* Encoder output is injected only at required attention layers, shortening the pre-fill critical path.
+
+### 3. Cross-process reuse and caching
+
+* In-process encoders confine reuse to a single worker.  
+* A remote, shared cache lets any worker retrieve existing embeddings, eliminating redundant computation.
+
+---
+
+## 2  Usage Example
+
+The current reference pathway is **SharedStorageConnector**.  
+Below ready-to-run scripts shows the workflow:
+
+1 Encoder instance + 1 PD instance:
+`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_encoder_example.sh`
+
+1 Encoder instance + 1 Prefill instance + 1 Decode instance:
+`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_epd_example.sh`
+
+---
+
+## 3  Test Script
+
+Please refer to the directories `tests/v1/ec_connector`
+
+## 4  Development
+
+Disaggregated encoding is implemented by running two parts:
+
+* **Encoder instance** – a vLLM instance to performs vision encoding.  
+* **Prefill/Decode (PD) instance(s)** – runs language pre-fill and decode.
+    * PD can be in either a single normal instance with `disagg_encoder_example.sh` (E->PD) or in disaggregated instances with `disagg_epd_example.sh` (E->P->D)
+
+A connector transfers encoder-cache (EC) embeddings from the encoder instance to the PD instance.  
+All related code is under `vllm/distributed/ec_transfer`.
+
+### Key abstractions
+
+* **ECConnector** – interface for retrieving EC caches produced by the encoder.  
+    * *Scheduler role* – checks cache existence and schedules loads.  
+    * *Worker role* – loads the embeddings into memory.
+
+Here is a figure illustrating disaggregate encoder flow:
+
+![Disaggregated Encoder Flow](../assets/features/disagg_encoder/disagg_encoder_flow.png)
+
+For the PD disaggregation part, the Prefill instance receive cache exactly the same as the disaggregate encoder flow above. Prefill instance executes 1 step (prefill -> 1 token output) and then transfer KV cache to the Decode instance for the remaining execution. The KV transfer part purely happens after the execute of the PDinstance.
+
+`docs/features/disagg_prefill.md` shows the brief idea about the disaggregated prefill (v0)
+
+We create the example setup with the **NixlConnector** from `vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py` and referred to the `tests/v1/kv_connector/nixl_integration/toy_proxy_server.py` to facilitate the kv transfer between P and D;
diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md
index fe065b52268a6465654696e5d77d2fa41a5a4953..fd4f249f2ec6ca412200834024a22da157d6c693 100644
--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -17,14 +17,14 @@ Two main reasons:
 
 ## Usage example
 
-Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
+Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
 
 Now supports 5 types of connectors:
 
-- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
-- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
-- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
-- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
+- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
+- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
+- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
+- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
 - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:
 
   ```bash
@@ -45,7 +45,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
 
 ## Benchmarks
 
-Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
+Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.
 
 ## Development
 
@@ -91,6 +91,6 @@ Disaggregated prefilling is highly related to infrastructure, so vLLM relies on
 
 We recommend three ways of implementations:
 
-- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
+- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc.). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
 - **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL.
 - **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`.
diff --git a/docs/features/interleaved_thinking.md b/docs/features/interleaved_thinking.md
new file mode 100644
index 0000000000000000000000000000000000000000..7343324b48494bd7313b71aebffa2a8e3ef23149
--- /dev/null
+++ b/docs/features/interleaved_thinking.md
@@ -0,0 +1,118 @@
+# Interleaved Thinking
+
+## Introduction
+
+Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results.
+
+Important: Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature.
+
+## How Interleaved Thinking Works
+
+With interleaved thinking, the model can:
+
+- Reason about the results of a tool call before deciding what to do next
+- Chain multiple tool calls with reasoning steps in between
+- Make more nuanced decisions based on intermediate results
+- Provide transparent reasoning for its tool selection process
+
+## Supported Models
+
+vLLM currently supports the following interleaved thinking models:
+
+| Model Series | Reasoning Parser Name |
+|--------------|-----------------------|
+| moonshotai/Kimi-K2-Thinking    |  kimi_k2  |
+| MiniMaxAI/MiniMax-M2           |  minimax_m2  |
+
+## Example Usage
+
+To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example:
+
+??? code
+
+    ```python
+    """
+    vllm serve MiniMaxAI/MiniMax-M2 \
+      --tensor-parallel-size 4 \
+      --tool-call-parser minimax_m2 \
+      --reasoning-parser minimax_m2 \
+      --enable-auto-tool-choice
+    """
+    import json
+    
+    from openai import OpenAI
+    
+    client = OpenAI(base_url="http://localhost:8000/v1",     api_key="dummy")
+    
+    
+    def get_current_weather(location: str, unit: "str"):
+        """Get the current weather in a given location"""
+        if unit == "celsius":
+            return f"The current temperature in {location} is 22°C."
+        else:
+            return f"The current temperature in {location} is 72°F."
+    
+    
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given     location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "City and state, e.g.,     'San Francisco, CA'",
+                        },
+                        "unit": {"type": "string", "enum":     ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location", "unit"],
+                },
+            },
+        }
+    ]
+    messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}]
+    response = client.chat.completions.create(
+        model=client.models.list().data[0].id,
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+    )
+    
+    tool_call = response.choices[0].message.tool_calls[0].function
+    
+    messages.append(
+        {
+            "role": "assistant",
+            "tool_calls": response.choices[0].message.tool_calls,
+            "reasoning": response.choices[0].message.reasoning, # append reasoning
+        }
+    )
+    
+    # Simulate tool execution
+    available_tools = {"get_weather": get_current_weather}
+    
+    completion_tool_calls = response.choices[0].message.tool_calls
+    for call in completion_tool_calls:
+        tool_to_call = available_tools[call.function.name]
+        args = json.loads(call.function.arguments)
+        result = tool_to_call(**args)
+        messages.append(
+            {
+                "role": "tool",
+                "content": result,
+                "tool_call_id": call.id,
+                "name": call.function.name,
+            }
+        )
+    response_2 = client.chat.completions.create(
+        model=client.models.list().data[0].id,
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+    )
+    print(response_2.choices[0].message.content)
+    ```
+This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response.
diff --git a/docs/features/lora.md b/docs/features/lora.md
index db794b2ebd71d85eea6331a2f29836b1ebf7258d..d42a3cef76bdeadce708e6942422804e7979451d 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -4,7 +4,7 @@ This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09
 
 LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA].
 
-Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save
+Adapters can be efficiently served on a per-request basis with minimal overhead. First we download the adapter(s) and save
 them locally with
 
 ```python
@@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
     sampling_params = SamplingParams(
         temperature=0,
         max_tokens=256,
-        stop=["[/assistant]"]
+        stop=["[/assistant]"],
     )
 
     prompts = [
@@ -43,11 +43,11 @@ the third parameter is the path to the LoRA adapter.
     outputs = llm.generate(
         prompts,
         sampling_params,
-        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
     )
     ```
 
-Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
 
 ## Serving LoRA Adapters
 
@@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
                 lora_request = LoRARequest(
                     lora_name=lora_name,
                     lora_path=local_path,
-                    lora_int_id=abs(hash(lora_name))
+                    lora_int_id=abs(hash(lora_name)),
                 )
                 return lora_request
         ```
@@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
         if has_audio:
             question = f"<|audio|>{question}"
         chat = [
-            {
-                "role": "user",
-                "content": question
-            }
+            {"role": "user", "content": question},
         ]
         return tokenizer.apply_chat_template(chat, tokenize=False)
 
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 4438c398d342a36bf5e36e4f2580045d3f471c3f..cde2ec165712b69dadc360670384bbc6878959d1 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -1,9 +1,9 @@
 # Multimodal Inputs
 
-This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
+This page teaches you how to pass multi-modal inputs to [multi-modal models](../models/supported_models.md#list-of-multimodal-language-models) in vLLM.
 
 !!! note
-    We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
+    We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
     and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
 
 !!! tip
@@ -129,7 +129,7 @@ You can pass a single image to the `'image'` field of the multi-modal dictionary
         print(generated_text)
     ```
 
-Full example: <gh-file:examples/offline_inference/vision_language.py>
+Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": {
-            "image": [image1, image2]
-        },
+        "multi_modal_data": {"image": [image1, image2]},
     })
 
     for o in outputs:
@@ -164,7 +162,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
         print(generated_text)
     ```
 
-Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
+Full example: [examples/offline_inference/vision_language_multi_image.py](../../examples/offline_inference/vision_language_multi_image.py)
 
 If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
 
@@ -183,21 +181,24 @@ conversation = [
     {"role": "assistant", "content": "Hello! How can I assist you today?"},
     {
         "role": "user",
-        "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
-        },{
-            "type": "image_pil",
-            "image_pil": image_pil
-        }, {
-            "type": "image_embeds",
-            "image_embeds": image_embeds
-        }, {
-            "type": "text",
-            "text": "What's in these images?"
-        }],
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            },
+            {
+                "type": "image_pil",
+                "image_pil": image_pil,
+            },
+            {
+                "type": "image_embeds",
+                "image_embeds": image_embeds,
+            },
+            {
+                "type": "text",
+                "text": "What's in these images?",
+            },
+        ],
     },
 ]
 
@@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
     message = {
         "role": "user",
         "content": [
-            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
+            {
+                "type": "text",
+                "text": "Describe this set of frames. Consider the frames to be a part of the same video.",
+            },
         ],
     }
     for i in range(len(video_frames)):
@@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
     # Custom black background for dark theme
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
     )
 
     # Custom brand color background (e.g., blue)
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
     )
     ```
 
@@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
         limit_mm_per_prompt={"video": 1},
     )
 
-    sampling_params = SamplingParams(
-        max_tokens=1024,
-    )
+    sampling_params = SamplingParams(max_tokens=1024)
 
     video_messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant.",
+        },
+        {
+            "role": "user",
+            "content": [
                 {"type": "text", "text": "describe this video."},
                 {
                     "type": "video",
                     "video": video_path,
                     "total_pixels": 20480 * 28 * 28,
-                    "min_pixels": 16 * 28 * 28
-                }
+                    "min_pixels": 16 * 28 * 28,
+                },
             ]
         },
     ]
@@ -339,26 +346,32 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
     !!! note
         'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
 
-Full example: <gh-file:examples/offline_inference/vision_language.py>
+Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
 
 ### Audio Inputs
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference/audio_language.py>
+Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
 
 ### Embedding Inputs
 
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
 pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
 
+You must enable this feature via `enable_mm_embeds=True`.
+
+!!! warning
+    The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!
+
 ??? code
 
     ```python
     from vllm import LLM
 
     # Inference with image embeddings as input
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)
 
     # Refer to the HuggingFace repo for the correct format to use
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
@@ -390,7 +403,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
     image_embeds = torch.load(...)
 
     # Qwen2-VL
-    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+    llm = LLM(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        limit_mm_per_prompt={"image": 4},
+        enable_mm_embeds=True,
+    )
     mm_data = {
         "image": {
             "image_embeds": image_embeds,
@@ -400,7 +417,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
     }
 
     # MiniCPM-V
-    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
+    llm = LLM(
+        "openbmb/MiniCPM-V-2_6",
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 4},
+        enable_mm_embeds=True,
+    )
     mm_data = {
         "image": {
             "image_embeds": image_embeds,
@@ -427,11 +449,11 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
     A chat template is **required** to use Chat Completions API.
     For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
 
-    If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
+    If no default chat template is available, we will first look for a built-in fallback in [vllm/transformers_utils/chat_templates/registry.py](../../vllm/transformers_utils/chat_templates/registry.py).
     If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
 
-    For certain models, we provide alternative chat templates inside <gh-dir:examples>.
-    For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
+    For certain models, we provide alternative chat templates inside [examples](../../examples).
+    For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
 
 ### Image Inputs
 
@@ -465,55 +487,59 @@ Then, you can use the OpenAI client as follows:
 
     chat_response = client.chat.completions.create(
         model="microsoft/Phi-3.5-vision-instruct",
-        messages=[{
-            "role": "user",
-            "content": [
-                # NOTE: The prompt formatting with the image token `<image>` is not needed
-                # since the prompt will be processed automatically by the API server.
-                {"type": "text", "text": "What’s in this image?"},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        url": image_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    # NOTE: The prompt formatting with the image token `<image>` is not needed
+                    # since the prompt will be processed automatically by the API server.
+                    {
+                        "type": "text",
+                        "text": "What’s in this image?",
                     },
-                    "uuid": image_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                        "uuid": image_url,  # Optional
+                    },
+                ],
+            }
+        ],
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
 
     # Multi-image input inference
-    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+    image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
+    image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
 
     chat_response = client.chat.completions.create(
         model="microsoft/Phi-3.5-vision-instruct",
-        messages=[{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What are the animals in these images?"},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url_duck
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What are the animals in these images?",
                     },
-                    "uuid": image_url_duck # Optional
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url_lion
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url_duck},
+                        "uuid": image_url_duck,  # Optional
                     },
-                    "uuid": image_url_lion # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url_lion},
+                        "uuid": image_url_lion,  # Optional
+                    },
+                ],
+            }
+        ],
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
     ```
 
-Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
+Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
 
 !!! tip
     Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
@@ -560,23 +586,22 @@ Then, you can use the OpenAI client as follows:
 
     ## Use video url in the payload
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this video?"
-                },
-                {
-                    "type": "video_url",
-                    "video_url": {
-                        "url": video_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this video?",
                     },
-                    "uuid": video_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": video_url},
+                        "uuid": video_url,  # Optional
+                    },
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -585,7 +610,7 @@ Then, you can use the OpenAI client as follows:
     print("Chat completion output from image url:", result)
     ```
 
-Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
+Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
 
 !!! note
     By default, the timeout for fetching videos through HTTP URL is `30` seconds.
@@ -652,23 +677,25 @@ Then, you can use the OpenAI client as follows:
     audio_base64 = encode_base64_content_from_url(audio_url)
 
     chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "input_audio",
-                    "input_audio": {
-                        "data": audio_base64,
-                        "format": "wav"
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this audio?",
                     },
-                    "uuid": audio_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                        "uuid": audio_url,  # Optional
+                    },
+                ],
+            },
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -683,22 +710,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
 
     ```python
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this audio?",
                     },
-                    "uuid": audio_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "audio_url",
+                        "audio_url": {"url": audio_url},
+                        "uuid": audio_url,  # Optional
+                    },
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -707,7 +734,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
     print("Chat completion output from audio url:", result)
     ```
 
-Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
+Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
 
 !!! note
     By default, the timeout for fetching audios through HTTP URL is `10` seconds.
@@ -720,7 +747,13 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
 ### Embedding Inputs
 
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
-pass a tensor of shape to the corresponding field of the multi-modal dictionary.
+pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+
+You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
+
+!!! warning
+    The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!
 
 #### Image Embedding Inputs
 
@@ -747,43 +780,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
 
     # Basic usage - this is equivalent to the LLaVA example for offline inference
     model = "llava-hf/llava-1.5-7b-hf"
-    embeds =  {
+    embeds = {
         "type": "image_embeds",
         "image_embeds": f"{base64_image_embedding}",
-        "uuid": image_url # Optional
+        "uuid": image_url,  # Optional
     }
 
     # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
     model = "Qwen/Qwen2-VL-2B-Instruct"
-    embeds =  {
+    embeds = {
         "type": "image_embeds",
         "image_embeds": {
-            "image_embeds": f"{base64_image_embedding}" , # Required
-            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
+            "image_embeds": f"{base64_image_embedding}",  # Required
+            "image_grid_thw": f"{base64_image_grid_thw}",  # Required by Qwen/Qwen2-VL-2B-Instruct
         },
-        "uuid": image_url # Optional
+        "uuid": image_url,  # Optional
     }
     model = "openbmb/MiniCPM-V-2_6"
-    embeds =  {
+    embeds = {
         "type": "image_embeds",
         "image_embeds": {
-            "image_embeds": f"{base64_image_embedding}" , # Required
-            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
+            "image_embeds": f"{base64_image_embedding}",  # Required
+            "image_sizes": f"{base64_image_sizes}",  # Required by openbmb/MiniCPM-V-2_6
         },
-        "uuid": image_url # Optional
+        "uuid": image_url,  # Optional
     }
     chat_completion = client.chat.completions.create(
         messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": [
             {
-                "type": "text",
-                "text": "What's in this image?",
+                "role": "system",
+                "content": "You are a helpful assistant.",
             },
-            embeds,
-            ],
-        },
-    ],
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this image?",
+                    },
+                    embeds,
+                ],
+            },
+        ],
         model=model,
     )
     ```
@@ -802,22 +840,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
         {
             "type": "image_embeds",
             "image_embeds": None,
-            "uuid": image_uuid
+            "uuid": image_uuid,
         },
 
         # input_audio:
         {
             "type": "input_audio",
             "input_audio": None,
-            "uuid": audio_uuid
+            "uuid": audio_uuid,
         },
 
         # PIL Image:
         {
             "type": "image_pil",
-            "image_pil": None
-            "uuid": image_uuid
-        }
+            "image_pil": None,
+            "uuid": image_uuid,
+        },
 
     ```
 
diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index afecbc82947b48eb3a6e5be782eed88b56de82d6..1ce038f4d6525b579043d60464178bbb4e2c7a76 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -9,7 +9,13 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
 Install the NIXL library: `uv pip install nixl`, as a quick start.
 
 - Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
-- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
+- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
+
+For non-cuda platform, please install nixl with ucx build from source, instructed as below.
+
+```bash
+python tools/install_nixl_from_source_ubuntu.py
+```
 
 ### Transport Configuration
 
@@ -75,7 +81,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
     - Default: 5600
     - **Required for both prefiller and decoder instances**
     - Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
-    - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank (e.g., with `--tensor-parallel-size=4` and base_port=5600, tp_rank 0..3 use ports 5600, 5601, 5602, 5603 on that node).
+    - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank (e.g., with `--data-parallel-size=2` and base_port=5600, dp_rank 0..1 use port 5600, 5601 on that node).
     - Used for the initial NIXL handshake between the prefiller and the decoder
 
 - `VLLM_NIXL_SIDE_CHANNEL_HOST`: Host for side channel communication
@@ -84,7 +90,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
     - Connection info is passed via KVTransferParams from prefiller to decoder for handshake
 
 - `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
-    - Default: 120
+    - Default: 480
     - If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
 
 ## Multi-Instance Setup
@@ -150,10 +156,20 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
     NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
     Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
 
+## Experimental Feature
+
+### Heterogenuous KV Layout support
+
+Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration
+
+```bash
+--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
+```
+
 ## Example Scripts/Code
 
 Refer to these example scripts in the vLLM repository:
 
-- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
-- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
-- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
+- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
+- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
+- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md
index f9d3c1fb6c23d2666329835a62212a045de95fab..b81d2f28e3b9debdfbd98c3368441932f971e978 100644
--- a/docs/features/prompt_embeds.md
+++ b/docs/features/prompt_embeds.md
@@ -16,16 +16,20 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
 
 You can pass prompt embeddings from Hugging Face Transformers models to the  `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
 
-<gh-file:examples/offline_inference/prompt_embed_inference.py>
+[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
 
 ## Online Serving
 
-Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
+Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.
 
 When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
 
 Prompt embeddings are passed in as base64 encoded torch tensors.
 
+!!! warning
+    The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!
+
 ### Transformers Inputs via OpenAI Client
 
 First, launch the OpenAI-compatible server:
@@ -37,4 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
 
 Then, you can use the OpenAI client as follows:
 
-<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
+[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)
diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
index 4c8377871e141416b53c4e5fb42208347aee7791..74f005c496ee53b1cbc7026fcffa17399981ad8d 100644
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -64,4 +64,4 @@ th:not(:first-child) {
 !!! note
     This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
 
-    For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
+    For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.
diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md
index fc998387d29aa781fde9e5c910691cda148cba6a..e77e8b5a1f4152c81e27abffb4d0ec6a10cefb71 100644
--- a/docs/features/quantization/auto_awq.md
+++ b/docs/features/quantization/auto_awq.md
@@ -1,5 +1,9 @@
 # AutoAWQ
 
+> ⚠️ **Warning:**
+    The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq).
+    For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ).
+
 To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
 The main benefits are lower latency and memory usage.
@@ -18,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
     from awq import AutoAWQForCausalLM
     from transformers import AutoTokenizer
 
-    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-    quant_path = 'mistral-instruct-v0.2-awq'
-    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+    model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+    quant_path = "mistral-instruct-v0.2-awq"
+    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
 
     # Load model
     model = AutoAWQForCausalLM.from_pretrained(
-        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+        model_path,
+        low_cpu_mem_usage=True,
+        use_cache=False,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md
index ac766d5e29228139f89174395d415543e2c8ac42..9c14f362b663fd9159085525e59308a1b36c0c77 100644
--- a/docs/features/quantization/auto_round.md
+++ b/docs/features/quantization/auto_round.md
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound
 
 model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 bits, group_size, sym = 4, 128, True
diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md
index 53b689ad53ff689b6c9b6380e85c4470145e82d3..c3a127657622356cbd48dfb4854b36c5cc418719 100644
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -34,7 +34,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitblas"
+    quantization="bitblas",
 )
 ```
 
@@ -53,6 +53,6 @@ llm = LLM(
         dtype=torch.float16,
         trust_remote_code=True,
         quantization="bitblas",
-        max_model_len=1024
+        max_model_len=1024,
     )
     ```
diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md
index 3b15a6072d47a25717909354dd6630d2db5c37f5..2348c7739c066c4074da4a308e9357de0498f4bb 100644
--- a/docs/features/quantization/bnb.md
+++ b/docs/features/quantization/bnb.md
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 ```
 
@@ -43,7 +43,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitsandbytes"
+    quantization="bitsandbytes",
 )
 ```
 
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index 834c03cbe05b031d5d275687cb0d64bed7f32874..0c5111fb8af0de7be16cdf34c1e28e16ea0b6128 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
 
     # Configure the simple PTQ quantization
     recipe = QuantizationModifier(
-      targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+        targets="Linear",
+        scheme="FP8_DYNAMIC",
+        ignore=["lm_head"],
+    )
 
     # Apply the quantization algorithm.
     oneshot(model=model, recipe=recipe)
diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md
index 2a1c3bdd775f18a4213da701647d69f8304200cc..2a731e9b7e0329ca3cee5335a0624313a893e8ee 100644
--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
       conversation = [
          {
             "role": "system",
-            "content": "You are a helpful assistant"
+            "content": "You are a helpful assistant",
          },
          {
             "role": "user",
-            "content": "Hello"
+            "content": "Hello",
          },
          {
             "role": "assistant",
-            "content": "Hello! How can I assist you today?"
+            "content": "Hello! How can I assist you today?",
          },
          {
             "role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
       sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
       # Create an LLM.
-      llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-               tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+      llm = LLM(
+         model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+      )
       # Generate texts from the prompts. The output is a list of RequestOutput objects
       # that contain the prompt, generated text, and other information.
       outputs = llm.chat(conversation, sampling_params)
diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md
index 47cb2d65bae4709c3e9d2b6bed551f709059e286..f14a931725da498559ee340612ddabd3b26e1ce9 100644
--- a/docs/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
     calibration_dataset = load_dataset(
         "allenai/c4",
         data_files="en/c4-train.00001-of-01024.json.gz",
-        split="train"
+        split="train",
     ).select(range(1024))["text"]
 
     quant_config = QuantizeConfig(bits=4, group_size=128)
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index d6fdac7b07f7f98986d9a57b141f6f2baddacb1c..035e7ea291f9e07fbc86d020250069444339d040 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
         },
         ignore=["lm_head"],
         update_size=NUM_CALIBRATION_SAMPLES,
-        dampening_frac=0.01
+        dampening_frac=0.01,
     )
     ```
 
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 247d0cbdd3f149220ace2b12309b1097ec35c045..ec8a77f74ffef2ebc03170c0c7dc56561c056b18 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
 
 !!! note
-    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
+    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+
+!!! warning
+    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
+    Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.
 
 ## Prerequisites
 
@@ -40,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md
index 39ae03b1bdac0051284503380d52144e2996ce11..c48ccb719a79d703cabd024b7eb2c7b025df157e 100644
--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
     from vllm import LLM, SamplingParams
 
     def main():
-
         model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+
+        # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
         llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
 
         sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index b2b417309e92b717289b0ab75a75a99bab1a4120..56cf057678be61950f03a266577be708c5dea712 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
     from vllm import LLM, SamplingParams
 
     sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-            kv_cache_dtype="fp8",
-            calculate_kv_scales=True)
+    llm = LLM(
+        model="meta-llama/Llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
+        calculate_kv_scales=True,
+    )
     prompt = "London is the capital of"
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
     print(out)
@@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
 
     # Select model and load it
     MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
     # Select calibration dataset
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index 047cc8382445b9d005fa05e363e9bab8080fe067..bd7bc186e13aabaf40b1cf5c05b7b1c92257ac65 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
     MAX_SEQ_LEN = 512
 
     model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, device_map="auto", torch_dtype="auto",
+        MODEL_ID,
+        device_map="auto",
+        dtype="auto",
     )
     model.eval()
 
@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
     dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
     text_data = dataset["text"][:NUM_CALIBRATION_DATA]
 
-    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-        padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-        batch_size=BATCH_SIZE, drop_last=True)
+    tokenized_outputs = tokenizer(
+        text_data,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=MAX_SEQ_LEN,
+    )
+    calib_dataloader = DataLoader(
+        tokenized_outputs['input_ids'],
+        batch_size=BATCH_SIZE,
+        drop_last=True,
+    )
     ```
 
 ### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
                                         load_quant_algo_config_from_file)
 
     # Define fp8/per-tensor/static spec.
-    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-        is_dynamic=False).to_quantization_spec()
+    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
+        observer_method="min_max",
+        is_dynamic=False,
+    ).to_quantization_spec()
 
     # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-        weight=FP8_PER_TENSOR_SPEC)
+    global_quant_config = QuantizationConfig(
+        input_tensors=FP8_PER_TENSOR_SPEC,
+        weight=FP8_PER_TENSOR_SPEC,
+    )
 
     # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
     KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
     kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-    kv_cache_quant_config = {name :
-        QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                        weight=global_quant_config.weight,
-                        output_tensors=KV_CACHE_SPEC)
-        for name in kv_cache_layer_names_for_llama}
+    kv_cache_quant_config = {
+        name: QuantizationConfig(
+            input_tensors=global_quant_config.input_tensors,
+            weight=global_quant_config.weight,
+            output_tensors=KV_CACHE_SPEC,
+        )
+        for name in kv_cache_layer_names_for_llama
+    }
     layer_quant_config = kv_cache_quant_config.copy()
 
     # Define algorithm config by config file.
-    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-        'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
     algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
 
     EXCLUDE_LAYERS = ["lm_head"]
@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
         layer_quant_config=layer_quant_config,
         kv_cache_quant_config=kv_cache_quant_config,
         exclude=EXCLUDE_LAYERS,
-        algo_config=algo_config)
+        algo_config=algo_config,
+    )
     ```
 
 ### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
     EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
     exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
     with torch.no_grad():
-        exporter.export_safetensors_model(freezed_model,
-            quant_config=quant_config, tokenizer=tokenizer)
+        exporter.export_safetensors_model(
+            freezed_model,
+            quant_config=quant_config,
+            tokenizer=tokenizer,
+        )
     ```
 
 ### 5. Evaluation in vLLM
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
     # Create an LLM.
-    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-            kv_cache_dtype='fp8',quantization='quark')
+    llm = LLM(
+        model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+        kv_cache_dtype="fp8",
+        quantization="quark",
+    )
     # Generate texts from the prompts. The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
     outputs = llm.generate(prompts, sampling_params)
@@ -231,9 +254,9 @@ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
                           --tasks gsm8k
 ```
 
-## Using MXFP4 models
+## Using OCP MX (MXFP4, MXFP6) models
 
-vLLM supports loading MXFP4 models quantized offline through AMD Quark, compliant with [Open Compute Project (OCP) specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
+vLLM supports loading MXFP4 and MXFP6 models quantized offline through AMD Quark, compliant with [Open Compute Project (OCP) specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
 
 The scheme currently only supports dynamic quantization for activations.
 
@@ -241,17 +264,53 @@ Example usage, after installing the latest AMD Quark release:
 
 ```bash
 vllm serve fxmarty/qwen_1.5-moe-a2.7b-mxfp4 --tensor-parallel-size 1
+# or, for a model using fp6 activations and fp4 weights:
+vllm serve fxmarty/qwen1.5_moe_a2.7b_chat_w_fp4_a_fp6_e2m3 --tensor-parallel-size 1
 ```
 
-A simulation of the matrix multiplication execution in MXFP4 can be run on devices that do not support MXFP4 operations natively (e.g. AMD Instinct MI325, MI300 and MI250), dequantizing weights from MXFP4 to half precision on the fly, using a fused kernel. This is useful e.g. to evaluate MXFP4 models using vLLM, or alternatively to benefit from the ~4x memory savings (compared to float16 and bfloat16).
+A simulation of the matrix multiplication execution in MXFP4/MXFP6 can be run on devices that do not support OCP MX operations natively (e.g. AMD Instinct MI325, MI300 and MI250), dequantizing weights from FP4/FP6 to half precision on the fly, using a fused kernel. This is useful e.g. to evaluate FP4/FP6 models using vLLM, or alternatively to benefit from the ~2.5-4x memory savings (compared to float16 and bfloat16).
 
 To generate offline models quantized using MXFP4 data type, the easiest approach is to use AMD Quark's [quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html), as an example:
 
 ```bash
 python quantize_quark.py --model_dir Qwen/Qwen1.5-MoE-A2.7B-Chat \
-    --quant_scheme w_mxfp4_a_mxfp4_sym \
+    --quant_scheme w_mxfp4_a_mxfp4 \
     --output_dir qwen_1.5-moe-a2.7b-mxfp4 \
     --skip_evaluation \
     --model_export hf_format \
     --group_size 32
 ```
+
+The current integration supports [all combination of FP4, FP6_E3M2, FP6_E2M3](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py) used for either weights or activations.
+
+## Using Quark Quantized layerwise Auto Mixed Precision (AMP) Models
+
+vLLM also supports loading layerwise mixed precision model quantized using AMD Quark. Currently, mixed scheme of {MXFP4, FP8} is supported, where FP8 here denotes for FP8 per-tensor scheme. More mixed precision schemes are planned to be supported in a near future, including
+
+- Unquantized Linear and/or MoE layer(s) as an option for each layer, i.e., mixed of {MXFP4, FP8, BF16/FP16}
+- MXFP6 quantization extension, i.e., {MXFP4, MXFP6, FP8, BF16/FP16}
+
+Although one can maximize serving throughput using the lowest precision supported on a given device (e.g. MXFP4 for AMD Instinct MI355, FP8 for AMD Instinct MI300), these aggressive schemes can be detrimental to accuracy recovering from quantization on target tasks. Mixed precision allows to strike a balance between maximizing accuracy and throughput.
+
+There are two steps to generate and deploy a mixed precision model quantized with AMD Quark, as shown below.
+
+### 1. Quantize a model using mixed precision in AMD Quark
+
+Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
+
+As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benefits. They are:
+
+- amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
+- amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
+- amd/Qwen3-8B-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
+
+### 2. inference the quantized mixed precision model in vLLM
+
+Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follow:
+
+```bash
+lm_eval --model vllm \
+    --model_args pretrained=amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8,tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.8,trust_remote_code=False \
+    --tasks mmlu \
+    --batch_size auto
+```
diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md
index 6932445997012a59ddaf5f692f5dee19a2a6389b..b95b560882bb12b1f39e11c8e1f1dedd40bbfd0b 100644
--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
     quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
     quantized_model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype="auto",
+        dtype="auto",
         device_map="auto",
         quantization_config=quantization_config
     )
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 85681669dfb220452fe72bd5d2f3c1d39ba5b6d6..5f26c7cf182b914b4231ea30894b7b43d5d55613 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -2,7 +2,10 @@
 
 vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
 
-Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+
+!!! warning
+    `reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning` in case `reasoning_content` is removed in future.
 
 ## Supported Models
 
@@ -11,15 +14,20 @@ vLLM currently supports the following reasoning models:
 | Model Series | Parser Name | Structured Output Support | Tool Calling |
 |--------------|-------------|------------------|-------------|
 | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
-| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
+| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
+| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
+| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
+| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
+| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
+| [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
-| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
-| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
+| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
 
 !!! note
-    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
+    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
     The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
+    DeepSeek-V3.1 tool calling is supported in non-thinking mode.
 
 ## Quickstart
 
@@ -56,18 +64,18 @@ Next, make a request to the model that should return the reasoning content in th
     # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content:", reasoning_content)
+    print("reasoning:", reasoning)
     print("content:", content)
     ```
 
-The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
+The `reasoning` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
 
 ## Streaming chat completions
 
-Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
+Streaming chat completions are also supported for reasoning models. The `reasoning` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
 
 ??? console "Json"
 
@@ -83,7 +91,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
                 "index": 0,
                 "delta": {
                     "role": "assistant",
-                    "reasoning_content": "is",
+                    "reasoning": "is",
                 },
                 "logprobs": null,
                 "finish_reason": null
@@ -92,7 +100,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
     }
     ```
 
-OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
+OpenAI Python client library does not officially support `reasoning` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning` attribute is present in the response. For example:
 
 ??? code
 
@@ -115,27 +123,29 @@ OpenAI Python client library does not officially support `reasoning_content` att
     # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
     # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
     # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
-    stream = client.chat.completions.create(model=model,
-                                            messages=messages,
-                                            stream=True)
+    stream = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        stream=True,
+    )
 
     print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False
 
     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
+        reasoning = (
+            getattr(chunk.choices[0].delta, "reasoning", None) or None
         )
         content = getattr(chunk.choices[0].delta, "content", None) or None
 
-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
@@ -144,11 +154,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
             print(content, end="", flush=True)
     ```
 
-Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
 
 ## Tool Calling
 
-The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
+The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning`.
 
 ??? code
 
@@ -157,38 +167,40 @@ The reasoning content is also available when both tool calling and the reasoning
 
     client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
 
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
-                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
-                },
-                "required": ["location", "unit"]
-            }
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location", "unit"],
+                }
+            },
         }
-    }]
+    ]
 
     response = client.chat.completions.create(
         model=client.models.list().data[0].id,
         messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
         tools=tools,
-        tool_choice="auto"
+        tool_choice="auto",
     )
 
     print(response)
     tool_call = response.choices[0].message.tool_calls[0].function
 
-    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
+    print(f"reasoning: {response.choices[0].message.reasoning}")
     print(f"Function called: {tool_call.name}")
     print(f"Arguments: {tool_call.arguments}")
     ```
 
-For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
+For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
 
 ## Limitations
 
@@ -196,7 +208,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
 
 ## How to support a new reasoning model
 
-You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
+You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
 
 ??? code
 
@@ -210,12 +222,11 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
     # define a reasoning parser and register it to vllm
     # the name list in register_module can be used
     # in --reasoning-parser.
-    @ReasoningParserManager.register_module(["example"])
     class ExampleParser(ReasoningParser):
         def __init__(self, tokenizer: AnyTokenizer):
             super().__init__(tokenizer)
 
-        def extract_reasoning_content_streaming(
+        def extract_reasoning_streaming(
             self,
             previous_text: str,
             current_text: str,
@@ -223,7 +234,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
             previous_token_ids: Sequence[int],
             current_token_ids: Sequence[int],
             delta_token_ids: Sequence[int],
-        ) -> Union[DeltaMessage, None]:
+        ) -> DeltaMessage | None:
             """
             Instance method that should be implemented for extracting reasoning
             from an incomplete response; for use when handling reasoning calls and
@@ -232,9 +243,11 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
             previously been parsed and extracted (see constructor)
             """
 
-        def extract_reasoning_content(
-                self, model_output: str, request: ChatCompletionRequest
-        ) -> tuple[Optional[str], Optional[str]]:
+        def extract_reasoning(
+            self,
+            model_output: str,
+            request: ChatCompletionRequest | ResponsesRequest,
+        ) -> tuple[str | None, str | None]:
             """
             Extract reasoning content from a complete model-generated string.
 
@@ -252,9 +265,15 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
             tuple[Optional[str], Optional[str]]
                 A tuple containing the reasoning content and the content.
             """
+    # Register the reasoning parser
+    ReasoningParserManager.register_lazy_module(
+        name="example",
+        module_path="vllm.reasoning.example_reasoning_parser",
+        class_name="ExampleParser",
+    )
     ```
 
-Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
+Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
 
 ??? code
 
@@ -272,10 +291,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
 
         @classmethod
         def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
-            return cls(start_token_id=tokenizer.encode(
-                "<think>", add_special_tokens=False)[0],
-                    end_token_id=tokenizer.encode("</think>",
-                                                    add_special_tokens=False)[0])
+            return cls(
+                start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
+                end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
+            )
 
         def is_reasoning_end(self, input_ids: list[int]) -> bool:
             return self.end_token_id in input_ids
diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 5749b02d26f45a30b8ce8ad8a6d59cdcfbfe07bd..9ab167ab9a2378e47d17d22e57a1bdeaf33688ea 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -11,7 +11,10 @@ Key benefits:
 - **Fine-grained control**: Optionally wake up only model weights or KV cache to avoid OOM during weight updates.
 
 !!! note
-    This feature is only supported on CUDA platform.
+    This feature is now supported on CUDA and ROCm platform.
+
+!!! note
+    For more information, see this [Blog Post](https://blog.vllm.ai/2025/10/26/sleep-mode.html).
 
 ## Sleep levels
 
@@ -31,6 +34,7 @@ llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True)
 #### Python API
 
 ```python
+# Sleep level 1
 # Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache)
 llm.sleep(level=1)
 
@@ -38,6 +42,21 @@ llm.sleep(level=1)
 llm.wake_up()
 ```
 
+```python
+# Sleep level 2
+# Put the engine to sleep (level=2: discard both weights and KV cache)
+llm.sleep(level=2)
+
+# Reallocate weights memory only
+llm.wake_up(tags=["weights"])
+
+# Load weights in-place
+llm.collective_rpc("reload_weights")
+
+# Reallocate KV cache
+llm.wake_up(tags=["kv_cache"])
+```
+
 #### RLHF weight updates
 
 During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations.
@@ -64,17 +83,40 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
 When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
 
 ```bash
-VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen3-0.6B \
+VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
   --enable-sleep-mode \
   --port 8000
 ```
 
+Below is an example of how to sleep and wake up a model in level 1.
+
+```bash
+curl -X POST 'http://localhost:8000/sleep?level=1'
+curl -X POST 'http://localhost:8000/wake_up'
+```
+
+And this is an example of how to sleep and wake up a model in level 2.
+
+```bash
+curl -X POST 'http://localhost:8000/sleep?level=2'
+# Reallocate weights memory only
+curl -X POST 'http://localhost:8000/wake_up?tags=weights'
+# Load weights in-place
+curl -X POST 'http://localhost:8000/collective_rpc' -H 'Content-Type: application/json' -d '{"method":"reload_weights"}'
+# Reallocate KV cache
+curl -X POST 'http://localhost:8000/wake_up?tags=kv_cache'
+```
+
 #### HTTP endpoints
 
 - `POST /sleep?level=1` — Put the model to sleep (`level=1`).
 - `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`).
+- `POST /collective_rpc` — Perform a collective remote procedure call (RPC).
 - `GET /is_sleeping` — Check if the model is sleeping.
 
 !!! note
     These endpoints are only available when passing `VLLM_SERVER_DEV_MODE=1`.
+
+## Limitation
+
+On ROCm, the virtual memory allocation on ROCm is done through chunked memory allocation. You can control the chunk size through `VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE` (in MB). The default value is set at 256MB. The larger the chunk size the faster the performance. However, setting it too large will cause OOM. So if you encounter OOM when using sleep mode. Try reducing the chunk size. It is recommended to define the chunk size as a power of 2.
diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md
index 597a8e8644278ffe4f4afc884c4ec54f499018a5..6097500cac01f581ffbcba219a4ce242859617cb 100644
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -3,7 +3,7 @@
 !!! warning
     Please note that speculative decoding in vLLM is not yet optimized and does
     not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
-    The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
+    The work to optimize it is ongoing and can be followed here: <https://github.com/vllm-project/vllm/issues/4630>
 
 !!! warning
     Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin
 To perform the same with an online mode launch the server:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
+vllm serve facebook/opt-6.7b \
     --host 0.0.0.0 \
     --port 8000 \
-    --model facebook/opt-6.7b \
     --seed 42 \
     -tp 1 \
     --gpu_memory_utilization 0.8 \
@@ -131,6 +130,46 @@ matching n-grams in the prompt. For more information read [this thread.](https:/
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
     ```
 
+## Speculating using Suffix Decoding
+
+The following code configures vLLM to use speculative decoding where proposals are generated using Suffix Decoding ([technical report](https://arxiv.org/abs/2411.04975)).
+
+Like n-gram, Suffix Decoding can generate draft tokens by pattern-matching using the last `n` generated tokens. Unlike n-gram, Suffix Decoding (1) can pattern-match against both the prompt and previous generations, (2) uses frequency counts to propose the most likely continuations, and (3) speculates an adaptive number of tokens for each request at each iteration to get better acceptance rates.
+
+Suffix Decoding can achieve better performance for tasks with high repetition, such as code-editing, agentic loops (e.g. self-reflection, self-consistency), and RL rollouts.
+
+!!! tip "Install Arctic Inference"
+    Suffix Decoding requires [Arctic Inference](https://github.com/snowflakedb/ArcticInference). You can install it with `pip install arctic-inference`.
+
+!!! tip "Suffix Decoding Speculative Tokens"
+    Suffix Decoding will speculate a dynamic number of tokens for each request at each decoding step, so the `num_speculative_tokens` configuration specifies the *maximum* number of speculative tokens. It is suggested to use a high number such as `16` or `32` (default).
+
+??? code
+
+    ```python
+    from vllm import LLM, SamplingParams
+
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(
+        model="facebook/opt-6.7b",
+        tensor_parallel_size=1,
+        speculative_config={
+            "method": "suffix",
+            "num_speculative_tokens": 32,
+        },
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
+
 ## Speculating using MLP speculators
 
 The following code configures vLLM to use speculative decoding where proposals are generated by
@@ -184,7 +223,7 @@ A variety of speculative models of this type are available on HF hub:
 ## Speculating using EAGLE based draft models
 
 The following code configures vLLM to use speculative decoding where proposals are generated by
-an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](../../examples/offline_inference/spec_decode.py).
 
 ??? code
 
@@ -219,8 +258,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
 A few important things to consider when using the EAGLE based draft models:
 
 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
-   be able to be loaded and used directly by vLLM after <gh-pr:12304>.
-   If you are using vllm version before <gh-pr:12304>, please use the
+   be able to be loaded and used directly by vLLM after <https://github.com/vllm-project/vllm/pull/12304>.
+   If you are using vllm version before <https://github.com/vllm-project/vllm/pull/12304>, please use the
    [script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
    and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
 
@@ -230,7 +269,7 @@ A few important things to consider when using the EAGLE based draft models:
 
 3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
    reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
-   investigation and tracked here: <gh-issue:9565>.
+   investigation and tracked here: <https://github.com/vllm-project/vllm/issues/9565>.
 
 4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
    That is, to specify `"method": "eagle3"` in `speculative_config`.
@@ -268,7 +307,7 @@ speculative decoding, breaking down the guarantees into three key areas:
     >   distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
     > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
     >   without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
-    >   provides a lossless guarantee. Almost all of the tests in <gh-dir:tests/spec_decode/e2e>.
+    >   provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../tests/spec_decode/e2e).
     >   verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
 
 3. **vLLM Logprob Stability**
@@ -290,4 +329,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr
 - [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
 - [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
 - [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
-- [Dynamic speculative decoding](gh-issue:4565)
+- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565)
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 901d87e7ed3d9dc72cc7c03380970363848f72ad..e38627c7078845158733d4968de2de229b1246e0 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -204,7 +204,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
             }
         },
     )
-    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    print("reasoning: ", completion.choices[0].message.reasoning)
     print("content: ", completion.choices[0].message.content)
     ```
 
@@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
 Answer: x = -29/8
 ```
 
-An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>
+An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)
 
 ## Offline Inference
 
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 16693db7255ca786bbcf71784c8371373735d311..7e6c69e717dba3c7c3e3d56d5f27f1cd4d8fa252 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools:
         return f"Getting the weather for {location} in {unit}..."
     tool_functions = {"get_weather": get_weather}
 
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
-                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+                    },
+                    "required": ["location", "unit"],
                 },
-                "required": ["location", "unit"]
-            }
-        }
-    }]
+            },
+        },
+    ]
 
     response = client.chat.completions.create(
         model=client.models.list().data[0].id,
         messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
         tools=tools,
-        tool_choice="auto"
+        tool_choice="auto",
     )
 
     tool_call = response.choices[0].message.tool_calls[0].function
@@ -145,16 +147,23 @@ Supported models:
 Known issues:
 
 1. Mistral 7B struggles to generate parallel tool calls correctly.
-2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
+2. **For Transformers tokenization backend only**: Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
    much shorter than what vLLM generates. Since an exception is thrown when this condition
    is not met, the following additional chat templates are provided:
 
-    * <gh-file:examples/tool_chat_template_mistral.jinja> - this is the "official" Mistral chat template, but tweaked so that
+    * [examples/tool_chat_template_mistral.jinja](../../examples/tool_chat_template_mistral.jinja) - this is the "official" Mistral chat template, but tweaked so that
       it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
-    * <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt
+    * [examples/tool_chat_template_mistral_parallel.jinja](../../examples/tool_chat_template_mistral_parallel.jinja) - this is a "better" version that adds a tool-use system prompt
       when tools are provided, that results in much better reliability when working with parallel tool calling.
 
-Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
+Recommended flags:
+
+1. To use [mistral-common](https://github.com/mistralai/mistral-common) the official Mistral tokenization backend:
+
+    `--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral`
+
+2. To use the default Transformers tokenization backend:
+    `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
 
 ### Llama Models (`llama3_json`)
 
@@ -178,28 +187,32 @@ Known issues:
 
 VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2:
 
-* <gh-file:examples/tool_chat_template_llama3.1_json.jinja> - this is the "official" chat template for the Llama 3.1
+* [examples/tool_chat_template_llama3.1_json.jinja](../../examples/tool_chat_template_llama3.1_json.jinja) - this is the "official" chat template for the Llama 3.1
 models, but tweaked so that it works better with vLLM.
-* <gh-file:examples/tool_chat_template_llama3.2_json.jinja> - this extends upon the Llama 3.1 chat template by adding support for
+* [examples/tool_chat_template_llama3.2_json.jinja](../../examples/tool_chat_template_llama3.2_json.jinja) - this extends upon the Llama 3.1 chat template by adding support for
 images.
 
 Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
 
 VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended:
 
-* <gh-file:examples/tool_chat_template_llama4_pythonic.jinja> - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
+* [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
 
 For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.
 
-#### IBM Granite
+### IBM Granite
 
 Supported models:
 
+* `ibm-granite/granite-4.0-h-small` and other Granite 4.0 models
+
+    Recommended flags: `--tool-call-parser hermes`
+
 * `ibm-granite/granite-3.0-8b-instruct`
 
     Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
 
-    <gh-file:examples/tool_chat_template_granite.jinja>: this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
+    [examples/tool_chat_template_granite.jinja](../../examples/tool_chat_template_granite.jinja): this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
 
 * `ibm-granite/granite-3.1-8b-instruct`
 
@@ -211,7 +224,7 @@ Supported models:
 
     Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`
 
-    <gh-file:examples/tool_chat_template_granite_20b_fc.jinja>: this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
+    [examples/tool_chat_template_granite_20b_fc.jinja](../../examples/tool_chat_template_granite_20b_fc.jinja): this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
 
 ### InternLM Models (`internlm`)
 
@@ -269,8 +282,8 @@ Flags: `--tool-call-parser hermes`
 
 Supported models:
 
-* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
-* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
+* `MiniMaxAi/MiniMax-M1-40k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
+* `MiniMaxAi/MiniMax-M1-80k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
 
 Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`
 
@@ -278,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m
 
 Supported models:
 
-* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>)
-* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>)
+* `deepseek-ai/DeepSeek-V3-0324` (use with [examples/tool_chat_template_deepseekv3.jinja](../../examples/tool_chat_template_deepseekv3.jinja))
+* `deepseek-ai/DeepSeek-R1-0528` (use with [examples/tool_chat_template_deepseekr1.jinja](../../examples/tool_chat_template_deepseekr1.jinja))
 
 Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
 
@@ -287,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
 
 Supported models:
 
-* `deepseek-ai/DeepSeek-V3.1` (use with <gh-file:examples/tool_chat_template_deepseekv31.jinja>)
+* `deepseek-ai/DeepSeek-V3.1` (use with [examples/tool_chat_template_deepseekv31.jinja](../../examples/tool_chat_template_deepseekv31.jinja))
 
 Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`
 
@@ -308,7 +321,7 @@ Supported models:
 Flags:
 
 * For non-reasoning: `--tool-call-parser hunyuan_a13b`
-* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
+* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b`
 
 ### LongCat-Flash-Chat Models (`longcat`)
 
@@ -323,8 +336,10 @@ Flags: `--tool-call-parser longcat`
 
 Supported models:
 
-* `ZhipuAI/GLM-4.5`
-* `ZhipuAI/GLM-4.5-Air`
+* `zai-org/GLM-4.5`
+* `zai-org/GLM-4.5-Air`
+* `zai-org/GLM-4.6`
+* `zai-org/GLM-4.6-Air`
 
 Flags: `--tool-call-parser glm45`
 
@@ -337,6 +352,16 @@ Supported models:
 
 Flags: `--tool-call-parser qwen3_xml`
 
+### Olmo 3 Models (`olmo3`)
+
+Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
+
+Supported models:
+
+* TODO (will be updated after Olmo 3 release)
+
+Flags: `--tool-call-parser olmo3`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
@@ -354,12 +379,12 @@ Limitations:
 
 Example supported models:
 
-* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `Team-ACE/ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
-* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
+* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
+* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
+* `Team-ACE/ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
+* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
+* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
+* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
 
 Flags: `--tool-call-parser pythonic --chat-template {see_above}`
 
@@ -368,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`
 
 ## How to Write a Tool Parser Plugin
 
-A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in <gh-file:vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py>.
+A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).
 
 Here is a summary of a plugin file:
 
@@ -382,15 +407,13 @@ Here is a summary of a plugin file:
     # the name list in register_module can be used
     # in --tool-call-parser. you can define as many
     # tool parsers as you want here.
-    @ToolParserManager.register_module(["example"])
     class ExampleToolParser(ToolParser):
         def __init__(self, tokenizer: AnyTokenizer):
             super().__init__(tokenizer)
 
         # adjust request. e.g.: set skip special tokens
         # to False for tool call output.
-        def adjust_request(
-                self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
             return request
 
         # implement the tool call parse for stream call
@@ -403,7 +426,7 @@ Here is a summary of a plugin file:
             current_token_ids: Sequence[int],
             delta_token_ids: Sequence[int],
             request: ChatCompletionRequest,
-        ) -> Union[DeltaMessage, None]:
+        ) -> DeltaMessage | None:
             return delta
 
         # implement the tool parse for non-stream call
@@ -415,6 +438,12 @@ Here is a summary of a plugin file:
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
                                                 content=text)
+    # register the tool parser to ToolParserManager
+    ToolParserManager.register_lazy_module(
+        name="example",
+        module_path="vllm.entrypoints.openai.tool_parsers.example",
+        class_name="ExampleToolParser",
+    )
 
     ```
 
diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml
index ba1f8099a6456b2010e4fbda20d5143197f62b11..683322cf3b7b062f5cb05a0a83527fd6e9a0e643 100644
--- a/docs/getting_started/installation/.nav.yml
+++ b/docs/getting_started/installation/.nav.yml
@@ -2,4 +2,4 @@ nav:
   - README.md
   - gpu.md
   - cpu.md
-  - google_tpu.md
+  - TPU: https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/
diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md
index 5e57d23f4a1df41b39ae7bffab3828d6cac79ba6..d5082bc7dd3a905e3c7f7ab7051d2001f2637cd5 100644
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@@ -11,7 +11,6 @@ vLLM supports the following hardware platforms:
     - [ARM AArch64](cpu.md#arm-aarch64)
     - [Apple silicon](cpu.md#apple-silicon)
     - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
-- [Google TPU](google_tpu.md)
 
 ## Hardware Plugins
 
@@ -20,8 +19,10 @@ The backends below live **outside** the main `vllm` repository and follow the
 
 | Accelerator | PyPI / package | Repository |
 |-------------|----------------|------------|
+| Google TPU | `tpu-inference` | <https://github.com/vllm-project/tpu-inference> |
 | Ascend NPU | `vllm-ascend` | <https://github.com/vllm-project/vllm-ascend> |
 | Intel Gaudi (HPU) | N/A, install from source | <https://github.com/vllm-project/vllm-gaudi> |
 | MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
 | Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
 | IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
+| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md
similarity index 79%
rename from docs/getting_started/installation/cpu/apple.inc.md
rename to docs/getting_started/installation/cpu.apple.inc.md
index 7e2ed55008a57786b4c1200d005ff50f414fba8e..4dc707d5f9a149f462b32f2eb65d591001d787ed 100644
--- a/docs/getting_started/installation/cpu/apple.inc.md
+++ b/docs/getting_started/installation/cpu.apple.inc.md
@@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan
 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-uv pip install -r requirements/cpu.txt
+uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
 uv pip install -e .
 ```
 
+!!! tip
+    The `--index-strategy unsafe-best-match` flag is needed to resolve dependencies across multiple package indexes (PyTorch CPU index and PyPI). Without this flag, you may encounter `typing-extensions` version conflicts.
+    
+    The term "unsafe" refers to the package resolution strategy, not security. By default, `uv` only searches the first index where a package is found to prevent dependency confusion attacks. This flag allows `uv` to search all configured indexes to find the best compatible versions. Since both PyTorch and PyPI are trusted package sources, using this strategy is safe and appropriate for vLLM installation.
+
 !!! note
     On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
 
diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md
similarity index 56%
rename from docs/getting_started/installation/cpu/arm.inc.md
rename to docs/getting_started/installation/cpu.arm.inc.md
index e45baa0aa4938b0ceae6d8a63abc2e8c9df63e32..9cae9ed1a212e0f1f04f428c40bc2023328c5e9c 100644
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu.arm.inc.md
@@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
---8<-- "docs/getting_started/installation/cpu/build.inc.md"
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+
+```bash
+sudo apt-get update  -y
+sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Second, clone the vLLM project:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git vllm_source
+cd vllm_source
+```
+
+Third, install required dependencies:
+
+```bash
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
+```
+
+??? console "pip"
+    ```bash
+    pip install --upgrade pip
+    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+
+Finally, build and install vLLM:
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
+```
+
+If you want to develop vLLM, install it in editable mode instead.
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
+```
 
 Testing has been conducted on AWS Graviton3 instances for compatibility.
 
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index f8b4f75308df7c347769b302b1782cc0c3840a64..be99cef3723e62a0185b43c304addc38f9b2ff88 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -4,39 +4,39 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 
 === "Intel/AMD x86"
 
-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:installation"
 
 === "ARM AArch64"
 
-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:installation"
 
 === "Apple silicon"
 
-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:installation"
 
 === "IBM Z (S390X)"
 
-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"
 
 ## Requirements
 
-- Python: 3.9 -- 3.12
+- Python: 3.10 -- 3.13
 
 === "Intel/AMD x86"
 
-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:requirements"
 
 === "ARM AArch64"
 
-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:requirements"
 
 === "Apple silicon"
 
-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:requirements"
 
 === "IBM Z (S390X)"
 
-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:requirements"
 
 ## Set up using Python
 
@@ -52,19 +52,19 @@ Currently, there are no pre-built CPU wheels.
 
 === "Intel/AMD x86"
 
-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
 
 === "ARM AArch64"
 
-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-wheel-from-source"
 
 === "Apple silicon"
 
-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:build-wheel-from-source"
 
 === "IBM Z (s390x)"
 
-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-wheel-from-source"
 
 ## Set up using Docker
 
@@ -72,29 +72,29 @@ Currently, there are no pre-built CPU wheels.
 
 === "Intel/AMD x86"
 
-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
 
 ### Build image from source
 
 === "Intel/AMD x86"
 
-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-image-from-source"
 
 === "ARM AArch64"
 
-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"
 
 === "Apple silicon"
 
-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"
 
 === "IBM Z (S390X)"
-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-image-from-source"
 
 ## Related runtime environment variables
 
-- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
-- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists, `auto` (by default), or `nobind` (to disable binding to individual CPU cores and to inherit user-defined OpenMP variables). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. If set to `nobind`, the number of OpenMP threads is determined by the standard `OMP_NUM_THREADS` environment variable.
 - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
 - `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence.
 - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
@@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels.
 
 ### Which `dtype` should be used?
 
-- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.  
+- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.  
 
 ### How to launch a vLLM service on CPU?
 
@@ -128,7 +128,7 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
 
 ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
 
-- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to a same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If have any performance problems or unexpected binding behaviours, please try to bind threads as following.
+- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.
 
 - On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
 
@@ -156,12 +156,12 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
     14   0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
     15   0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
 
-    # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
+    # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
     $ export VLLM_CPU_OMP_THREADS_BIND=0-7
     $ python examples/offline_inference/basic/basic.py
     ```
 
-- When deploy vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on a same NUMA node to avoid cross NUMA node memory access.
+- When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access.
 
 ### How to decide `VLLM_CPU_KVCACHE_SPACE`?
 
@@ -171,7 +171,9 @@ This value is 4GB by default. Larger space can support more concurrent requests,
 
 First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`.
 
-Inference batch size is an important parameter for the performance. Larger batch usually provides higher throughput, smaller batch provides lower latency. Tuning max batch size starts from default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM:
+Use multiples of 32 as `--block-size`, which is 128 by default.
+
+Inference batch size is an important parameter for the performance. A larger batch usually provides higher throughput, a smaller batch provides lower latency. Tuning the max batch size starting from the default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM:
 
 - `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as:
     - Offline Inference: `4096 * world_size`
@@ -192,8 +194,8 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel
 ### (x86 only) What is the purpose of `VLLM_CPU_MOE_PREPACK` and `VLLM_CPU_SGL_KERNEL`?
 
 - Both of them require `amx` CPU flag.
-    - `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models
-    - `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios.
+    - `VLLM_CPU_MOE_PREPACK` can provide better performance for MoE models
+    - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios.
 
 ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
 
diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu.s390x.inc.md
similarity index 85%
rename from docs/getting_started/installation/cpu/s390x.inc.md
rename to docs/getting_started/installation/cpu.s390x.inc.md
index f9c4ccb942fac3c4089f4ec5dadfcb23f786d17a..c2163139a7c5d9a9f078184b50f5e05c580acc5d 100644
--- a/docs/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu.s390x.inc.md
@@ -2,7 +2,7 @@
 
 vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
 
-Currently the CPU implementation for s390x architecture supports FP32 datatype only.
+Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
 
 !!! warning
     There are no pre-built wheels or images for this device, so you must build vLLM from source.
@@ -46,22 +46,22 @@ Execute the following commands to build and install vLLM from source.
     Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
 
 ```bash
-    sed -i '/^torch/d' requirements-build.txt    # remove torch from requirements-build.txt since we use nightly builds
+    sed -i '/^torch/d' requirements/build.txt    # remove torch from requirements/build.txt since we use nightly builds
     uv pip install -v \
         --torch-backend auto \
-        -r requirements-build.txt \
-        -r requirements-cpu.txt \
+        -r requirements/build.txt \
+        -r requirements/cpu.txt \
     VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
         uv pip install dist/*.whl
 ```
 
 ??? console "pip"
     ```bash
-        sed -i '/^torch/d' requirements-build.txt    # remove torch from requirements-build.txt since we use nightly builds
+        sed -i '/^torch/d' requirements/build.txt    # remove torch from requirements/build.txt since we use nightly builds
         pip install -v \
             --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
-            -r requirements-build.txt \
-            -r requirements-cpu.txt \
+            -r requirements/build.txt \
+            -r requirements/cpu.txt \
         VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
             pip install dist/*.whl
     ```
diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md
similarity index 95%
rename from docs/getting_started/installation/cpu/x86.inc.md
rename to docs/getting_started/installation/cpu.x86.inc.md
index 00f3b726b1a0eb48ae820bf196ca95f1efb3bc40..310f179cb89ca43b873b12f3ece97164a0eda38b 100644
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu.x86.inc.md
@@ -83,7 +83,7 @@ uv pip install dist/*.whl
 !!! example "Troubleshooting"
     - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
     - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
-    - `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
+    - `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
     - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
     ```toml title="pyproject.toml"
     [build-system]
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
deleted file mode 100644
index 4bd4d39a6f80b669569a3e57f9e1c9ae30ccdd09..0000000000000000000000000000000000000000
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ /dev/null
@@ -1,45 +0,0 @@
-First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
-
-```bash
-sudo apt-get update  -y
-sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-```
-
-Second, clone the vLLM project:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git vllm_source
-cd vllm_source
-```
-
-Third, install required dependencies:
-
-```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend cpu
-uv pip install -r requirements/cpu.txt --torch-backend cpu
-```
-
-??? console "pip"
-    ```bash
-    pip install --upgrade pip
-    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    ```
-
-Finally, build and install vLLM:
-
-```bash
-VLLM_TARGET_DEVICE=cpu python setup.py install
-```
-
-If you want to develop vLLM, install it in editable mode instead.
-
-```bash
-VLLM_TARGET_DEVICE=cpu python setup.py develop
-```
-
-!!! note
-    If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
-
-# --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md
deleted file mode 100644
index 6f09babb3aba002a093eeafb6432d98957809ee7..0000000000000000000000000000000000000000
--- a/docs/getting_started/installation/google_tpu.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Google TPU
-
-Tensor Processing Units (TPUs) are Google's custom-developed application-specific
-integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
-are available in different versions each with different hardware specifications.
-For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm).
-For more information on the TPU versions supported with vLLM, see:
-
-- [TPU v6e](https://cloud.google.com/tpu/docs/v6e)
-- [TPU v5e](https://cloud.google.com/tpu/docs/v5e)
-- [TPU v5p](https://cloud.google.com/tpu/docs/v5p)
-- [TPU v4](https://cloud.google.com/tpu/docs/v4)
-
-These TPU versions allow you to configure the physical arrangements of the TPU
-chips. This can improve throughput and networking performance. For more
-information see:
-
-- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations)
-- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config)
-- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config)
-- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config)
-
-In order for you to use Cloud TPUs you need to have TPU quota granted to your
-Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
-GPC project and are specified in terms of TPU version, the number of TPU you
-want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota).
-
-For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing).
-
-You may need additional persistent storage for your TPU VMs. For more
-information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options).
-
-!!! warning
-    There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
-
-## Requirements
-
-- Google Cloud TPU VM
-- TPU versions: v6e, v5e, v5p, v4
-- Python: 3.11 or newer
-
-### Provision Cloud TPUs
-
-You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
-or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
-API (preferred). This section shows how to create TPUs using the queued resource API. For
-more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
-Queued resources enable you to request Cloud TPU resources in a queued manner.
-When you request queued resources, the request is added to a queue maintained by
-the Cloud TPU service. When the requested resource becomes available, it's
-assigned to your Google Cloud project for your immediate exclusive use.
-
-!!! note
-    In all of the following commands, replace the ALL CAPS parameter names with
-    appropriate values. See the parameter descriptions table for more information.
-
-### Provision Cloud TPUs with GKE
-
-For more information about using TPUs with GKE, see:
-
-- [About TPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/tpus)
-- [Deploy TPU workloads in GKE Standard](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus)
-- [Plan for TPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus)
-
-## Configure a new environment
-
-### Provision a Cloud TPU with the queued resource API
-
-Create a TPU v5e with 4 TPU chips:
-
-```bash
-gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
-  --node-id TPU_NAME \
-  --project PROJECT_ID \
-  --zone ZONE \
-  --accelerator-type ACCELERATOR_TYPE \
-  --runtime-version RUNTIME_VERSION \
-  --service-account SERVICE_ACCOUNT
-```
-
-| Parameter name     | Description                                                                                                                                                                                              |
-|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request.                                                                                                                                                     |
-| TPU_NAME           | The user-assigned name of the TPU which is created when the queued resource request is allocated.                                                                                                        |
-| PROJECT_ID         | Your Google Cloud project                                                                                                                                                                                |
-| ZONE               | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones]                                  |
-| ACCELERATOR_TYPE   | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. |
-| RUNTIME_VERSION    | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s).                                              |
-| SERVICE_ACCOUNT    | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`              |
-
-Connect to your TPU VM using SSH:
-
-```bash
-gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE
-```
-
-!!! note
-    When configuring `RUNTIME_VERSION` ("TPU software version") on GCP, ensure it matches the TPU generation you've selected by referencing the [TPU VM images] compatibility matrix. Using an incompatible version may prevent vLLM from running correctly.
-
-[TPU versions]: https://cloud.google.com/tpu/docs/runtimes
-[TPU VM images]: https://cloud.google.com/tpu/docs/runtimes
-[TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built TPU wheels.
-
-### Build wheel from source
-
-Install Miniconda:
-
-```bash
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-bash Miniconda3-latest-Linux-x86_64.sh
-source ~/.bashrc
-```
-
-Create and activate a Conda environment for vLLM:
-
-```bash
-conda create -n vllm python=3.12 -y
-conda activate vllm
-```
-
-Clone the vLLM repository and go to the vLLM directory:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git && cd vllm
-```
-
-Uninstall the existing `torch` and `torch_xla` packages:
-
-```bash
-pip uninstall torch torch-xla -y
-```
-
-Install build dependencies:
-
-```bash
-pip install -r requirements/tpu.txt
-sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
-```
-
-Run the setup script:
-
-```bash
-VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
-```
-
-## Set up using Docker
-
-### Pre-built images
-
-See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
-
-### Build image from source
-
-You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
-
-```bash
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-```
-
-Run the Docker image with the following command:
-
-```bash
-# Make sure to add `--privileged --net host --shm-size=16G`.
-docker run --privileged --net host --shm-size=16G -it vllm-tpu
-```
-
-!!! note
-    Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
-    possible input shapes and compiles an XLA graph for each shape. The
-    compilation time may take 20~30 minutes in the first run. However, the
-    compilation time reduces to ~5 minutes afterwards because the XLA graphs are
-    cached in the disk (in `VLLM_XLA_CACHE_PATH` or `~/.cache/vllm/xla_cache` by default).
-
-!!! tip
-    If you encounter the following error:
-
-    ```console
-    from torch._C import *  # noqa: F403
-    ImportError: libopenblas.so.0: cannot open shared object file: No such
-    file or directory
-    ```
-
-    Install OpenBLAS with the following command:
-
-    ```bash
-    sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
-    ```
diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
similarity index 94%
rename from docs/getting_started/installation/gpu/cuda.inc.md
rename to docs/getting_started/installation/gpu.cuda.inc.md
index 9e64c6f2540af92c92501f9ac3438e62325a14d1..b2d0d64a2d35555320ca0627f8cb60b8ebd65c1c 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -11,11 +11,11 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
 # --8<-- [start:set-up-using-python]
 
 !!! note
-    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <https://github.com/vllm-project/vllm/issues/8420> for more details.
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
 
-Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details.
+Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-wheel-from-source) for more details.
 
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
@@ -44,8 +44,6 @@ export CUDA_VERSION=118 # or 126
 uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
 ```
 
-[](){ #install-the-latest-code }
-
 #### Install the latest code
 
 LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
@@ -128,11 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
 uv pip install --editable .
 ```
 
-You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code].
+You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
 
 !!! note
     There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel.
+    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
 
 #### Full build (with compilation)
 
@@ -250,7 +248,7 @@ uv pip install -e .
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
-See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
+See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
 
 Another way to access the latest code is to use the docker images:
 
@@ -266,11 +264,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 
-See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
+See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.
 
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]
 
-See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
+See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
 
 # --8<-- [end:supported-features]
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index e688cefea07637b7bb3d7ad549a9c39cbdebb4f0..bc7508b29475f180c4dfbe116111598f2ccc5cd2 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -4,35 +4,35 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:installation"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:installation"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:installation"
 
 ## Requirements
 
 - OS: Linux
-- Python: 3.9 -- 3.12
+- Python: 3.10 -- 3.13
 
 !!! note
     vLLM does not support Windows natively. To run vLLM on Windows, you can use the Windows Subsystem for Linux (WSL) with a compatible Linux distribution, or use some community-maintained forks, e.g. [https://github.com/SystemPanic/vllm-windows](https://github.com/SystemPanic/vllm-windows).
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:requirements"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:requirements"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:requirements"
 
 ## Set up using Python
 
@@ -42,45 +42,43 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:set-up-using-python"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:set-up-using-python"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
 
 ### Pre-built wheels
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-wheels"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-wheels"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels"
-
-[](){ #build-from-source }
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-wheels"
 
 ### Build wheel from source
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-wheel-from-source"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-wheel-from-source"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-wheel-from-source"
 
 ## Set up using Docker
 
@@ -88,40 +86,40 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-images"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-images"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"
 
 ### Build image from source
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-image-from-source"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-image-from-source"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"
 
 ## Supported features
 
 === "NVIDIA CUDA"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:supported-features"
 
 === "AMD ROCm"
 
-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:supported-features"
 
 === "Intel XPU"
 
-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:supported-features"
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu.rocm.inc.md
similarity index 51%
rename from docs/getting_started/installation/gpu/rocm.inc.md
rename to docs/getting_started/installation/gpu.rocm.inc.md
index 37c6647929b512ca086de1ac6c65cbda4a618d6c..c80ba9478f6be694c0d68157e4cd1dc3e606d38b 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -1,6 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM supports AMD GPUs with ROCm 6.3 or above.
+vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
 
 !!! tip
     [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
@@ -11,9 +11,10 @@ vLLM supports AMD GPUs with ROCm 6.3 or above.
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
-- GPU: MI200s (gfx90a), MI300 (gfx942), MI350 (gfx950), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
+- GPU: MI200s (gfx90a), MI300 (gfx942), MI350 (gfx950), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201), Ryzen AI MAX / AI 300 Series (gfx1151/1150)
 - ROCm 6.3 or above
     - MI350 requires ROCm 7.0 or above
+    - Ryzen AI MAX / AI 300 Series requires ROCm 7.0.2 or above
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -28,57 +29,63 @@ Currently, there are no pre-built ROCm wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
+!!! tip
+    - If you found that the following installation step does not work for you, please refer to [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base). Dockerfile is a form of installation steps.
+
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
 
     - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
     - [PyTorch](https://pytorch.org/)
 
-    For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.4.3_ubuntu24.04_py3.12_pytorch_release_2.6.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3.
+    For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3.
 
     Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example:
 
     ```bash
     # Install PyTorch
     pip uninstall torch -y
-    pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
+    pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0
     ```
 
-1. Install [Triton for ROCm](https://github.com/triton-lang/triton)
+1. Install [Triton for ROCm](https://github.com/ROCm/triton.git)
 
-    Install ROCm's Triton (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+    Install ROCm's Triton following the instructions from [ROCm/triton](https://github.com/ROCm/triton.git)
 
     ```bash
     python3 -m pip install ninja cmake wheel pybind11
     pip uninstall -y triton
-    git clone https://github.com/triton-lang/triton.git
+    git clone https://github.com/ROCm/triton.git
     cd triton
-    git checkout e5be006
+    # git checkout $TRITON_BRANCH
+    git checkout f9e5bf54
     if [ ! -f setup.py ]; then cd python; fi
     python3 setup.py install
     cd ../..
     ```
 
     !!! note
-        If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
+        - The validated `$TRITON_BRANCH` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
+        - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
 
-2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/Dao-AILab/flash-attention)
+2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/Dao-AILab/flash-attention.git)
 
-    Install ROCm's flash attention (v2.7.2) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention#amd-rocm-support)
-    Alternatively, wheels intended for vLLM use can be accessed under the releases.
+    Install ROCm's flash attention (v2.8.0) following the instructions from [ROCm/flash-attention](https://github.com/Dao-AILab/flash-attention#amd-rocm-support)
 
-    For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
+    For example, for ROCm 7.0, suppose your gfx arch is `gfx942`. To get your gfx architecture, run `rocminfo |grep gfx`.
 
     ```bash
     git clone https://github.com/Dao-AILab/flash-attention.git
     cd flash-attention
-    git checkout 1a7f4dfa
+    # git checkout $FA_BRANCH
+    git checkout 0e60e394
     git submodule update --init
-    GPU_ARCHS="gfx90a" python3 setup.py install
+    GPU_ARCHS="gfx942" python3 setup.py install
     cd ..
     ```
 
     !!! note
-        You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+        - The validated `$FA_BRANCH` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
+
 
 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps:
 
@@ -92,11 +99,13 @@ Currently, there are no pre-built ROCm wheels.
     ```
 
     !!! note
-        You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose.
+        - You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose.
+        - The validated `$AITER_BRANCH_OR_COMMIT` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
+        
 
-4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
+4. Build vLLM. For example, vLLM on ROCM 7.0 can be built with the following steps:
 
-    ??? console "Commands"
+    ???+ console "Commands"
 
         ```bash
         pip install --upgrade pip
@@ -109,31 +118,48 @@ Currently, there are no pre-built ROCm wheels.
             scipy \
             huggingface-hub[cli,hf_transfer] \
             setuptools_scm
-        pip install "numpy<2"
         pip install -r requirements/rocm.txt
 
-        # Build vLLM for MI210/MI250/MI300.
-        export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+        # To build for a single architecture (e.g., MI300) for faster installation (recommended):
+        export PYTORCH_ROCM_ARCH="gfx942"
+
+        # To build vLLM for multiple arch MI210/MI250/MI300, use this instead
+        # export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+
         python3 setup.py develop
         ```
 
     This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
     !!! tip
-        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
-        - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
-        - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
         - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
 
 !!! tip
     - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
-      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/vllm-optimization.html).
 
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
 The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
 docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
+AMD also offers nightly prebuilt docker image from [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev), which has vLLM and all its dependencies installed.
+
+???+ console "Commands"
+    ```bash
+    docker pull rocm/vllm-dev:nightly # to get the latest image
+    docker run -it --rm \
+    --network=host \
+    --group-add=video \
+    --ipc=host \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    --device /dev/kfd \
+    --device /dev/dri \
+    -v <path/to/your/models>:/app/models \
+    -e HF_HOME="/app/models" \
+    rocm/vllm-dev:nightly
+    ```
 
 !!! tip
     Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
@@ -144,33 +170,33 @@ docker image designed for validating inference performance on the AMD Instinct
 
 Building the Docker image from source is the recommended way to use vLLM with ROCm.
 
-#### (Optional) Build an image with ROCm software stack
+??? info "(Optional) Build an image with ROCm software stack"
 
-Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
-**This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
-If you choose to build this rocm_base image yourself, the steps are as follows.
+    Build a docker image from [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base) which setup ROCm software stack needed by the vLLM.
+    **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
+    If you choose to build this rocm_base image yourself, the steps are as follows.
 
-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
+    It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
 
-```json
-{
-    "features": {
-        "buildkit": true
+    ```json
+    {
+        "features": {
+            "buildkit": true
+        }
     }
-}
-```
+    ```
 
-To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
+    To build vllm on ROCm 7.0 for MI200 and MI300 series, you can use the default:
 
-```bash
-DOCKER_BUILDKIT=1 docker build \
-    -f docker/Dockerfile.rocm_base \
-    -t rocm/vllm-dev:base .
-```
+    ```bash
+    DOCKER_BUILDKIT=1 docker build \
+        -f docker/Dockerfile.rocm_base \
+        -t rocm/vllm-dev:base .
+    ```
 
 #### Build an image with vLLM
 
-First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
+First, build a docker image from [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) and launch a docker container from the image.
 It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
 
 ```bash
@@ -181,24 +207,24 @@ It is important that the user kicks off the docker build using buildkit. Either
 }
 ```
 
-<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
+[docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) uses ROCm 7.0 by default, but also supports ROCm 5.7, 6.0, 6.1, 6.2, 6.3, and 6.4, in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:
 
-- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base)
 - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
 
 Their values can be passed in when running `docker build` with `--build-arg` options.
 
-To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
+To build vllm on ROCm 7.0 for MI200 and MI300 series, you can use the default:
 
-```bash
-DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
-```
+???+ console "Commands"
+    ```bash
+    DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+    ```
 
 To run the above docker image `vllm-rocm`, use the below command:
 
-??? console "Command"
-
+???+ console "Commands"
     ```bash
     docker run -it \
     --network=host \
@@ -217,6 +243,6 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]
 
-See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
+See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
 
 # --8<-- [end:supported-features]
diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md
similarity index 90%
rename from docs/getting_started/installation/gpu/xpu.inc.md
rename to docs/getting_started/installation/gpu.xpu.inc.md
index ed1dc0418cf7e5f19ed5ed6444c0f08282010f8a..620a660a240ed4c5fd1e92ce9b5f0f069692017e 100644
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu.xpu.inc.md
@@ -56,8 +56,10 @@ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
 docker run -it \
              --rm \
              --network=host \
-             --device /dev/dri \
+             --device /dev/dri:/dev/dri \
              -v /dev/dri/by-path:/dev/dri/by-path \
+             --ipc=host \
+             --privileged \
              vllm-xpu-env
 ```
 
@@ -67,8 +69,7 @@ docker run -it \
 XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-     --model=facebook/opt-13b \
+vllm serve facebook/opt-13b \
      --dtype=bfloat16 \
      --max_model_len=1024 \
      --distributed-executor-backend=mp \
@@ -76,7 +77,7 @@ python -m vllm.entrypoints.openai.api_server \
      -tp=8
 ```
 
-By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/online_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/run_cluster.sh) helper script.
 
 # --8<-- [end:supported-features]
 # --8<-- [start:distributed-backend]
diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md
index 06794f8d3120e5cf8d5f43d73f88f175454967dc..ba78c329723edeb7b44f04694b6f8551648020cd 100644
--- a/docs/getting_started/installation/python_env_setup.inc.md
+++ b/docs/getting_started/installation/python_env_setup.inc.md
@@ -1,4 +1,4 @@
-It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
+On NVIDIA CUDA only, it's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
 
 ```bash
 uv venv --python 3.12 --seed
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 2af26626d207ded68d9025c1c9067babd8335243..cfc8b4d9838a7570316955db84b1a2e8af0e2538 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -2,51 +2,84 @@
 
 This guide will help you quickly get started with vLLM to perform:
 
-- [Offline batched inference][quickstart-offline]
-- [Online serving using OpenAI-compatible server][quickstart-online]
+- [Offline batched inference](#offline-batched-inference)
+- [Online serving using OpenAI-compatible server](#openai-compatible-server)
 
 ## Prerequisites
 
 - OS: Linux
-- Python: 3.9 -- 3.13
+- Python: 3.10 -- 3.13
 
 ## Installation
 
-If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
+=== "NVIDIA CUDA"
 
-It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
+    If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
 
-```bash
-uv venv --python 3.12 --seed
-source .venv/bin/activate
-uv pip install vllm --torch-backend=auto
-```
+    It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
 
-`uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`).
+    ```bash
+    uv venv --python 3.12 --seed
+    source .venv/bin/activate
+    uv pip install vllm --torch-backend=auto
+    ```
 
-Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
+    `uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`).
 
-```bash
-uv run --with vllm vllm --help
-```
+    Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
 
-You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment.
+    ```bash
+    uv run --with vllm vllm --help
+    ```
 
-```bash
-conda create -n myenv python=3.12 -y
-conda activate myenv
-pip install --upgrade uv
-uv pip install vllm --torch-backend=auto
-```
+    You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment.
+
+    ```bash
+    conda create -n myenv python=3.12 -y
+    conda activate myenv
+    pip install --upgrade uv
+    uv pip install vllm --torch-backend=auto
+    ```
+
+=== "AMD ROCm"
+
+    Use a pre-built docker image from Docker Hub. The public stable image is [rocm/vllm:latest](https://hub.docker.com/r/rocm/vllm). There is also a development image at [rocm/vllm-dev](https://hub.docker.com/r/rocm/vllm-dev).
+    
+    The `-v` flag in the `docker run` command below mounts a local directory into the container. Replace `<path/to/your/models>` with the path on your host machine to the directory containing your models. The models will then be accessible inside the container at `/app/models`.
+    
+    ???+ console "Commands"
+        ```bash
+        docker pull rocm/vllm-dev:nightly # to get the latest image
+        docker run -it --rm \
+        --network=host \
+        --group-add=video \
+        --ipc=host \
+        --cap-add=SYS_PTRACE \
+        --security-opt seccomp=unconfined \
+        --device /dev/kfd \
+        --device /dev/dri \
+        -v <path/to/your/models>:/app/models \
+        -e HF_HOME="/app/models" \
+        rocm/vllm-dev:nightly
+        ```
+
+=== "Google TPU"
+
+    To run vLLM on Google TPUs, you need to install the `vllm-tpu` package.
+    
+    ```bash
+    uv pip install vllm-tpu
+    ```
+
+    !!! note
+        For more detailed instructions, including Docker, installing from source, and troubleshooting, please refer to the [vLLM on TPU documentation](https://docs.vllm.ai/projects/tpu/en/latest/).
 
 !!! note
     For more detail and non-CUDA platforms, please refer [here](installation/README.md) for specific instructions on how to install vLLM.
 
-[](){ #quickstart-offline }
-
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
 
 The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
 
@@ -57,7 +90,7 @@ The first line of this example imports the classes [LLM][vllm.LLM] and [Sampling
 from vllm import LLM, SamplingParams
 ```
 
-The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params].
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](../api/README.md#inference-parameters).
 
 !!! important
     By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
@@ -135,8 +168,6 @@ for output in outputs:
             print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         ```
 
-[](){ #quickstart-online }
-
 ## OpenAI-Compatible Server
 
 vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
@@ -150,7 +181,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
 
 !!! note
     By default, the server uses a predefined chat template stored in the tokenizer.
-    You can learn about overriding it [here][chat-template].
+    You can learn about overriding it [here](../serving/openai_compatible_server.md#chat-template).
 !!! important
     By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
 
@@ -194,12 +225,14 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
         api_key=openai_api_key,
         base_url=openai_api_base,
     )
-    completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
-                                        prompt="San Francisco is a")
+    completion = client.completions.create(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        prompt="San Francisco is a",
+    )
     print("Completion result:", completion)
     ```
 
-A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
+A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
 
 ### OpenAI Chat Completions API with vLLM
 
@@ -239,7 +272,7 @@ Alternatively, you can use the `openai` Python package:
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "Tell me a joke."},
-        ]
+        ],
     )
     print("Chat response:", chat_response)
     ```
@@ -248,7 +281,17 @@ Alternatively, you can use the `openai` Python package:
 
 Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.
 
-If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
+If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
+
+- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
+- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
+
+For AMD ROCm, you can futher control the specific Attention implementation using the following variables:
+
+- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
+- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
+- Triton Prefill-Decode Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0`
+- AITER Multi-head Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=1`
 
 !!! warning
-    There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
+    There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it.
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index d026235dd9d585a3d58fc39159b4c7be89599a34..735074c08b8c86feb54895cc3c88c88d19c7217f 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib
+import importlib.metadata
+import importlib.util
 import logging
 import sys
-from argparse import SUPPRESS, HelpFormatter
+import traceback
+from argparse import SUPPRESS, Action, HelpFormatter
+from collections.abc import Iterable
+from importlib.machinery import ModuleSpec
 from pathlib import Path
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 from unittest.mock import MagicMock, patch
 
 from pydantic_core import core_schema
@@ -16,59 +20,111 @@ ROOT_DIR = Path(__file__).parent.parent.parent.parent
 ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
 
 sys.path.insert(0, str(ROOT_DIR))
-sys.modules["vllm._C"] = MagicMock()
+
+
+def mock_if_no_torch(mock_module: str, mock: MagicMock):
+    if not importlib.util.find_spec("torch"):
+        sys.modules[mock_module] = mock
+
+
+# Mock custom op code
+class MockCustomOp:
+    @staticmethod
+    def register(name):
+        def decorator(cls):
+            return cls
+
+        return decorator
+
+
+mock_if_no_torch("vllm._C", MagicMock())
+mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp))
+mock_if_no_torch(
+    "vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None)
+)
+
+
+# Mock any version checks by reading from compiled CI requirements
+with open(ROOT_DIR / "requirements/test.txt") as f:
+    VERSIONS = dict(line.strip().split("==") for line in f if "==" in line)
+importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0"
+
+
+# Make torch.nn.Parameter safe to inherit from
+mock_if_no_torch("torch.nn", MagicMock(Parameter=object))
 
 
 class PydanticMagicMock(MagicMock):
     """`MagicMock` that's able to generate pydantic-core schemas."""
 
+    def __init__(self, *args, **kwargs):
+        name = kwargs.pop("name", None)
+        super().__init__(*args, **kwargs)
+        self.__spec__ = ModuleSpec(name, None)
+
     def __get_pydantic_core_schema__(self, source_type, handler):
         return core_schema.any_schema()
 
 
-def auto_mock(module, attr, max_mocks=50):
+def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
     """Function that automatically mocks missing modules during imports."""
-    logger.info("Importing %s from %s", attr, module)
+    logger.info("Importing %s from %s", attr, module_name)
+
     for _ in range(max_mocks):
         try:
+            module = importlib.import_module(module_name)
+
             # First treat attr as an attr, then as a submodule
-            with patch("importlib.metadata.version", return_value="0.0.0"):
-                return getattr(importlib.import_module(module), attr,
-                               importlib.import_module(f"{module}.{attr}"))
-        except importlib.metadata.PackageNotFoundError as e:
-            raise e
+            if hasattr(module, attr):
+                return getattr(module, attr)
+
+            return importlib.import_module(f"{module_name}.{attr}")
         except ModuleNotFoundError as e:
+            assert e.name is not None
             logger.info("Mocking %s for argparse doc generation", e.name)
-            sys.modules[e.name] = PydanticMagicMock()
+            sys.modules[e.name] = PydanticMagicMock(name=e.name)
+        except Exception:
+            logger.exception("Failed to import %s.%s: %s", module_name, attr)
 
     raise ImportError(
-        f"Failed to import {module}.{attr} after mocking {max_mocks} imports")
-
-
-latency = auto_mock("vllm.benchmarks", "latency")
-serve = auto_mock("vllm.benchmarks", "serve")
-throughput = auto_mock("vllm.benchmarks", "throughput")
+        f"Failed to import {module_name}.{attr} after mocking {max_mocks} imports"
+    )
+
+
+bench_latency = auto_mock("vllm.benchmarks", "latency")
+bench_serve = auto_mock("vllm.benchmarks", "serve")
+bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
+bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
+bench_sweep_serve_sla = auto_mock(
+    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+)
+bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
 EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
 ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
 CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
-cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
-run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
-FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
+openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
+openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
+
+if TYPE_CHECKING:
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = auto_mock(
+        "vllm.utils.argparse_utils", "FlexibleArgumentParser"
+    )
 
 
 class MarkdownFormatter(HelpFormatter):
     """Custom formatter that generates markdown for argument groups."""
 
-    def __init__(self, prog, starting_heading_level=3):
-        super().__init__(prog,
-                         max_help_position=float('inf'),
-                         width=float('inf'))
+    def __init__(self, prog: str, starting_heading_level: int = 3):
+        super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize)
+
         self._section_heading_prefix = "#" * starting_heading_level
         self._argument_heading_prefix = "#" * (starting_heading_level + 1)
         self._markdown_output = []
 
-    def start_section(self, heading):
+    def start_section(self, heading: str):
         if heading not in {"positional arguments", "options"}:
             heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
             self._markdown_output.append(heading_md)
@@ -76,37 +132,36 @@ class MarkdownFormatter(HelpFormatter):
     def end_section(self):
         pass
 
-    def add_text(self, text):
+    def add_text(self, text: str):
         if text:
             self._markdown_output.append(f"{text.strip()}\n\n")
 
     def add_usage(self, usage, actions, groups, prefix=None):
         pass
 
-    def add_arguments(self, actions):
+    def add_arguments(self, actions: Iterable[Action]):
         for action in actions:
-            if (len(action.option_strings) == 0
-                    or "--help" in action.option_strings):
+            if len(action.option_strings) == 0 or "--help" in action.option_strings:
                 continue
 
-            option_strings = f'`{"`, `".join(action.option_strings)}`'
+            option_strings = f"`{'`, `'.join(action.option_strings)}`"
             heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
             self._markdown_output.append(heading_md)
 
             if choices := action.choices:
-                choices = f'`{"`, `".join(str(c) for c in choices)}`'
-                self._markdown_output.append(
-                    f"Possible choices: {choices}\n\n")
-            elif ((metavar := action.metavar)
-                  and isinstance(metavar, (list, tuple))):
-                metavar = f'`{"`, `".join(str(m) for m in metavar)}`'
-                self._markdown_output.append(
-                    f"Possible choices: {metavar}\n\n")
+                choices = f"`{'`, `'.join(str(c) for c in choices)}`"
+                self._markdown_output.append(f"Possible choices: {choices}\n\n")
+            elif (metavar := action.metavar) and isinstance(metavar, (list, tuple)):
+                metavar = f"`{'`, `'.join(str(m) for m in metavar)}`"
+                self._markdown_output.append(f"Possible choices: {metavar}\n\n")
 
             if action.help:
                 self._markdown_output.append(f"{action.help}\n\n")
 
             if (default := action.default) != SUPPRESS:
+                # Make empty string defaults visible
+                if default == "":
+                    default = '""'
                 self._markdown_output.append(f"Default: `{default}`\n\n")
 
     def format_help(self):
@@ -116,7 +171,7 @@ class MarkdownFormatter(HelpFormatter):
 
 def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
     """Create a parser for the given class with markdown formatting.
-    
+
     Args:
         cls: The class to create a parser for
         **kwargs: Additional keyword arguments to pass to `cls.add_cli_args`.
@@ -124,10 +179,19 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
     Returns:
         FlexibleArgumentParser: A parser with markdown formatting for the class.
     """
-    parser = FlexibleArgumentParser(add_json_tip=False)
-    parser.formatter_class = MarkdownFormatter
-    with patch("vllm.config.DeviceConfig.__post_init__"):
-        _parser = add_cli_args(parser, **kwargs)
+    try:
+        parser = FlexibleArgumentParser(add_json_tip=False)
+        parser.formatter_class = MarkdownFormatter
+        with patch("vllm.config.DeviceConfig.__post_init__"):
+            _parser = add_cli_args(parser, **kwargs)
+    except ModuleNotFoundError as e:
+        # Auto-mock runtime imports
+        if tb_list := traceback.extract_tb(e.__traceback__):
+            path = Path(tb_list[-1].filename).relative_to(ROOT_DIR)
+            auto_mock(module_name=".".join(path.parent.parts), attr=path.stem)
+            return create_parser(add_cli_args, **kwargs)
+        else:
+            raise e
     # add_cli_args might be in-place so return parser if _parser is None
     return _parser or parser
 
@@ -143,30 +207,33 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
 
     # Create parsers to document
     parsers = {
-        "engine_args":
-        create_parser(EngineArgs.add_cli_args),
-        "async_engine_args":
-        create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True),
-        "serve":
-        create_parser(cli_args.make_arg_parser),
-        "chat":
-        create_parser(ChatCommand.add_cli_args),
-        "complete":
-        create_parser(CompleteCommand.add_cli_args),
-        "bench_latency":
-        create_parser(latency.add_cli_args),
-        "bench_throughput":
-        create_parser(throughput.add_cli_args),
-        "bench_serve":
-        create_parser(serve.add_cli_args),
-        "run-batch":
-        create_parser(run_batch.make_arg_parser),
+        # Engine args
+        "engine_args": create_parser(EngineArgs.add_cli_args),
+        "async_engine_args": create_parser(
+            AsyncEngineArgs.add_cli_args, async_args_only=True
+        ),
+        # CLI
+        "serve": create_parser(openai_cli_args.make_arg_parser),
+        "chat": create_parser(ChatCommand.add_cli_args),
+        "complete": create_parser(CompleteCommand.add_cli_args),
+        "run-batch": create_parser(openai_run_batch.make_arg_parser),
+        # Benchmark CLI
+        "bench_latency": create_parser(bench_latency.add_cli_args),
+        "bench_serve": create_parser(bench_serve.add_cli_args),
+        "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
+        "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
+        "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
+        "bench_throughput": create_parser(bench_throughput.add_cli_args),
     }
 
     # Generate documentation for each parser
     for stem, parser in parsers.items():
-        doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
+        doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md"
         # Specify encoding for building on Windows
         with open(doc_path, "w", encoding="utf-8") as f:
             f.write(super(type(parser), parser).format_help())
         logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
+
+
+if __name__ == "__main__":
+    on_startup("build", False)
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 0cbaebb598a346b54da799682df68c9648db5f9e..6e4fb039e3a075ea36ee98cc48568d2818bdaec2 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -11,7 +11,7 @@ import regex as re
 logger = logging.getLogger("mkdocs")
 
 ROOT_DIR = Path(__file__).parent.parent.parent.parent
-ROOT_DIR_RELATIVE = '../../../../..'
+ROOT_DIR_RELATIVE = "../../../../.."
 EXAMPLE_DIR = ROOT_DIR / "examples"
 EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
 
@@ -36,7 +36,7 @@ def fix_case(text: str) -> str:
         r"int\d+": lambda x: x.group(0).upper(),  # e.g. int8, int16
     }
     for pattern, repl in subs.items():
-        text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
+        text = re.sub(rf"\b{pattern}\b", repl, text, flags=re.IGNORECASE)
     return text
 
 
@@ -58,7 +58,8 @@ class Example:
         determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
         determine_title() -> str: Determines the title of the document.
         generate() -> str: Generates the documentation content.
-    """ # noqa: E501
+    """  # noqa: E501
+
     path: Path
     category: str = None
     main_file: Path = field(init=False)
@@ -84,9 +85,8 @@ class Example:
             Markdown file found in the directory.
         Raises:
             IndexError: If no Markdown files are found in the directory.
-        """ # noqa: E501
-        return self.path if self.path.is_file() else list(
-            self.path.glob("*.md")).pop()
+        """  # noqa: E501
+        return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
 
     def determine_other_files(self) -> list[Path]:
         """
@@ -98,7 +98,7 @@ class Example:
 
         Returns:
             list[Path]: A list of Path objects representing the other files in the directory.
-        """ # noqa: E501
+        """  # noqa: E501
         if self.path.is_file():
             return []
         is_other_file = lambda file: file.is_file() and file != self.main_file
@@ -109,25 +109,25 @@ class Example:
             # Specify encoding for building on Windows
             with open(self.main_file, encoding="utf-8") as f:
                 first_line = f.readline().strip()
-            match = re.match(r'^#\s+(?P<title>.+)$', first_line)
+            match = re.match(r"^#\s+(?P<title>.+)$", first_line)
             if match:
-                return match.group('title')
+                return match.group("title")
         return fix_case(self.path.stem.replace("_", " ").title())
 
     def fix_relative_links(self, content: str) -> str:
         """
         Fix relative links in markdown content by converting them to gh-file
         format.
-        
+
         Args:
             content (str): The markdown content to process
-            
+
         Returns:
             str: Content with relative links converted to gh-file format
         """
         # Regex to match markdown links [text](relative_path)
         # This matches links that don't start with http, https, ftp, or #
-        link_pattern = r'\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)'
+        link_pattern = r"\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)"
 
         def replace_link(match):
             link_text = match.group(1)
@@ -137,22 +137,31 @@ class Example:
             gh_file = (self.main_file.parent / relative_path).resolve()
             gh_file = gh_file.relative_to(ROOT_DIR)
 
-            return f'[{link_text}](gh-file:{gh_file})'
+            # Make GitHub URL
+            url = "https://github.com/vllm-project/vllm/"
+            url += "tree/main" if self.path.is_dir() else "blob/main"
+            gh_url = f"{url}/{gh_file}"
+
+            return f"[{link_text}]({gh_url})"
 
         return re.sub(link_pattern, replace_link, content)
 
     def generate(self) -> str:
         content = f"# {self.title}\n\n"
-        content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
+        url = "https://github.com/vllm-project/vllm/"
+        url += "tree/main" if self.path.is_dir() else "blob/main"
+        content += f"Source <{url}/{self.path.relative_to(ROOT_DIR)}>.\n\n"
 
         # Use long code fence to avoid issues with
         # included files containing code fences too
         code_fence = "``````"
 
         if self.is_code:
-            content += (f"{code_fence}{self.main_file.suffix[1:]}\n"
-                        f'--8<-- "{self.main_file}"\n'
-                        f"{code_fence}\n")
+            content += (
+                f"{code_fence}{self.main_file.suffix[1:]}\n"
+                f'--8<-- "{self.main_file}"\n'
+                f"{code_fence}\n"
+            )
         else:
             with open(self.main_file) as f:
                 # Skip the title from md snippets as it's been included above
diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py
index 1a84039abc14f3c4db6245a62fd2fb06a49cf971..12db2265b9f82dfd5d913adaec3c6a6e3addfae7 100644
--- a/docs/mkdocs/hooks/remove_announcement.py
+++ b/docs/mkdocs/hooks/remove_announcement.py
@@ -7,7 +7,7 @@ from typing import Literal
 
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
-    if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
+    if os.getenv("READTHEDOCS_VERSION_TYPE") == "tag":
         # remove the warning banner if the version is a tagged release
         mkdocs_dir = Path(__file__).parent.parent
         announcement_path = mkdocs_dir / "overrides/main.html"
diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py
index 6fce6bd8130e02088bd07254c18b93e6bbc542e6..f36a64ed7a3b872a9ed48e707541ec4d0c0bdcf4 100644
--- a/docs/mkdocs/hooks/url_schemes.py
+++ b/docs/mkdocs/hooks/url_schemes.py
@@ -1,122 +1,95 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-This is basically a port of MyST parser’s external URL resolution mechanism
-(https://myst-parser.readthedocs.io/en/latest/syntax/cross-referencing.html#customising-external-url-resolution)
-to work with MkDocs.
+MkDocs hook to enable the following links to render correctly:
 
-It allows Markdown authors to use GitHub shorthand links like:
-
-  - [Text](gh-issue:123)
-  - <gh-pr:456>
-  - [File](gh-file:path/to/file.py#L10)
-
-These are automatically rewritten into fully qualified GitHub URLs pointing to
-issues, pull requests, files, directories, or projects in the
-`vllm-project/vllm` repository.
+- Relative file links outside of the `docs/` directory, e.g.:
+    - [Text](../some_file.py)
+    - [Directory](../../some_directory/)
+- GitHub URLs for issues, pull requests, and projects, e.g.:
+    - Adds GitHub icon before links
+    - Replaces raw links with descriptive text,
+        e.g. <...pull/123> -> [Pull Request #123](.../pull/123)
+    - Works for external repos too by including the `owner/repo` in the link title
 
 The goal is to simplify cross-referencing common GitHub resources
 in project docs.
 """
 
+from pathlib import Path
+
 import regex as re
 from mkdocs.config.defaults import MkDocsConfig
 from mkdocs.structure.files import Files
 from mkdocs.structure.pages import Page
 
+ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
+DOC_DIR = ROOT_DIR / "docs"
 
-def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig,
-                     files: Files) -> str:
-    """
-    Custom MkDocs plugin hook to rewrite special GitHub reference links
-    in Markdown.
-
-    This function scans the given Markdown content for specially formatted
-    GitHub shorthand links, such as:
-      - `[Link text](gh-issue:123)`
-      - `<gh-pr:456>`
-    
-    And rewrites them into fully-qualified GitHub URLs with GitHub icons:
-      - `[:octicons-mark-github-16: Link text](https://github.com/vllm-project/vllm/issues/123)`
-      - `[:octicons-mark-github-16: Pull Request #456](https://github.com/vllm-project/vllm/pull/456)`
-
-    Supported shorthand types:
-      - `gh-issue`
-      - `gh-pr`
-      - `gh-project`
-      - `gh-dir`
-      - `gh-file`
-
-    Args:
-        markdown (str): The raw Markdown content of the page.
-        page (Page): The MkDocs page object being processed.
-        config (MkDocsConfig): The MkDocs site configuration.
-        files (Files): The collection of files in the MkDocs build.
-
-    Returns:
-        str: The updated Markdown content with GitHub shorthand links replaced.
-    """
-    gh_icon = ":octicons-mark-github-16:"
-    gh_url = "https://github.com"
-    repo_url = f"{gh_url}/vllm-project/vllm"
-    org_url = f"{gh_url}/orgs/vllm-project"
-
-    # Mapping of shorthand types to their corresponding GitHub base URLs
-    urls = {
-        "issue": f"{repo_url}/issues",
-        "pr": f"{repo_url}/pull",
-        "project": f"{org_url}/projects",
-        "dir": f"{repo_url}/tree/main",
-        "file": f"{repo_url}/blob/main",
-    }
-
-    # Default title prefixes for auto links
-    titles = {
-        "issue": "Issue #",
-        "pr": "Pull Request #",
-        "project": "Project #",
-        "dir": "",
-        "file": "",
-    }
-
-    # Regular expression to match GitHub shorthand links
-    scheme = r"gh-(?P<type>.+?):(?P<path>.+?)(#(?P<fragment>.+?))?"
-    inline_link = re.compile(r"\[(?P<title>[^\[]+?)\]\(" + scheme + r"\)")
-    auto_link = re.compile(f"<{scheme}>")
-
-    def replace_inline_link(match: re.Match) -> str:
-        """
-        Replaces a matched inline-style GitHub shorthand link
-        with a full Markdown link.
-        
-        Example:
-            [My issue](gh-issue:123) → [:octicons-mark-github-16: My issue](https://github.com/vllm-project/vllm/issues/123)
-        """
-        url = f'{urls[match.group("type")]}/{match.group("path")}'
-        if fragment := match.group("fragment"):
-            url += f"#{fragment}"
-
-        return f'[{gh_icon} {match.group("title")}]({url})'
-
-    def replace_auto_link(match: re.Match) -> str:
-        """
-        Replaces a matched autolink-style GitHub shorthand
-        with a full Markdown link.
-        
-        Example:
-            <gh-pr:456> → [:octicons-mark-github-16: Pull Request #456](https://github.com/vllm-project/vllm/pull/456)
-        """
-        type = match.group("type")
+
+gh_icon = ":octicons-mark-github-16:"
+
+# Regex pieces
+TITLE = r"(?P<title>[^\[\]<>]+?)"
+REPO = r"(?P<repo>.+?/.+?)"
+TYPE = r"(?P<type>issues|pull|projects)"
+NUMBER = r"(?P<number>\d+)"
+FRAGMENT = r"(?P<fragment>#[^\s]+)?"
+URL = f"https://github.com/{REPO}/{TYPE}/{NUMBER}{FRAGMENT}"
+RELATIVE = r"(?!(https?|ftp)://|#)(?P<path>[^\s]+?)"
+
+# Common titles to use for GitHub links when none is provided in the link.
+TITLES = {"issues": "Issue ", "pull": "Pull Request ", "projects": "Project "}
+
+# Regex to match GitHub issue, PR, and project links with optional titles.
+github_link = re.compile(rf"(\[{TITLE}\]\(|<){URL}(\)|>)")
+# Regex to match relative file links with optional titles.
+relative_link = re.compile(rf"\[{TITLE}\]\({RELATIVE}\)")
+
+
+def on_page_markdown(
+    markdown: str, *, page: Page, config: MkDocsConfig, files: Files
+) -> str:
+    def replace_relative_link(match: re.Match) -> str:
+        """Replace relative file links with URLs if they point outside the docs dir."""
+        title = match.group("title")
         path = match.group("path")
-        title = f"{titles[type]}{path}"
-        url = f"{urls[type]}/{path}"
-        if fragment := match.group("fragment"):
-            url += f"#{fragment}"
+        path = (Path(page.file.abs_src_path).parent / path).resolve()
+
+        # Check if the path exists and is outside the docs dir
+        if not path.exists() or path.is_relative_to(DOC_DIR):
+            return match.group(0)
+
+        # Files and directories have different URL schemes on GitHub
+        slug = "tree/main" if path.is_dir() else "blob/main"
 
+        path = path.relative_to(ROOT_DIR)
+        url = f"https://github.com/vllm-project/vllm/{slug}/{path}"
         return f"[{gh_icon} {title}]({url})"
 
-    # Replace both inline and autolinks
-    markdown = inline_link.sub(replace_inline_link, markdown)
-    markdown = auto_link.sub(replace_auto_link, markdown)
+    def replace_github_link(match: re.Match) -> str:
+        """Replace GitHub issue, PR, and project links with enhanced Markdown links."""
+        repo = match.group("repo")
+        type = match.group("type")
+        number = match.group("number")
+        # Title and fragment could be None
+        title = match.group("title") or ""
+        fragment = match.group("fragment") or ""
+
+        # Use default titles for raw links
+        if not title:
+            title = TITLES[type]
+            if "vllm-project" not in repo:
+                title += repo
+            title += f"#{number}"
+
+        url = f"https://github.com/{repo}/{type}/{number}{fragment}"
+        return f"[{gh_icon} {title}]({url})"
+
+    markdown = relative_link.sub(replace_relative_link, markdown)
+    markdown = github_link.sub(replace_github_link, markdown)
+
+    if "interface" in str(page.file.abs_src_path):
+        print(markdown)
 
     return markdown
diff --git a/docs/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md
index 2a5a18102dc28d06e16f9def8561d56f6a3039ab..0f30d4e2f69d267302f3b5d1da6419245842c85a 100644
--- a/docs/models/extensions/fastsafetensor.md
+++ b/docs/models/extensions/fastsafetensor.md
@@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors
 
 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
 
-To enable this feature, use the ``--load-format fastsafetensors`` command-line argument
+To enable this feature, use the `--load-format fastsafetensors` command-line argument
diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md
index 992dddf385d0dc7422a8dda770c38363060034f9..fc9d5eec3803ece66e3992739f5abe740b6dd29f 100644
--- a/docs/models/extensions/runai_model_streamer.md
+++ b/docs/models/extensions/runai_model_streamer.md
@@ -24,6 +24,13 @@ vllm serve s3://core-llm/Llama-3-8b \
     --load-format runai_streamer
 ```
 
+To run model from Google Cloud Storage run:
+
+```bash
+vllm serve gs://core-llm/Llama-3-8b \
+    --load-format runai_streamer
+```
+
 To run model from a S3 compatible object store run:
 
 ```bash
@@ -38,6 +45,15 @@ vllm serve s3://core-llm/Llama-3-8b \
 
 You can tune parameters using `--model-loader-extra-config`:
 
+You can tune `distributed` that controls whether distributed streaming should be used. This is currently only possible on CUDA and ROCM devices. This can significantly improve loading times from object storage or high-throughput network fileshares.
+You can read further about Distributed streaming [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/usage.md#distributed-streaming)
+
+```bash
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
+    --load-format runai_streamer \
+    --model-loader-extra-config '{"distributed":true}'
+```
+
 You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer.
 For reading from S3, it will be the number of client instances the host is opening to the S3 server.
 
@@ -75,7 +91,7 @@ vllm serve /path/to/sharded/model \
     --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
 ```
 
-To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
+To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/offline_inference/save_sharded_state.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
 
 The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
 
diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md
index f70ab0c6f4e5c4d950544b06c7e0b364c77a4a63..3df80d5af6c4dd53fa6dd8d1795e8db9daf1cad4 100644
--- a/docs/models/extensions/tensorizer.md
+++ b/docs/models/extensions/tensorizer.md
@@ -60,7 +60,7 @@ from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1", 
     load_format="tensorizer",
-    enable_lora=True
+    enable_lora=True,
 )
 ```
 
@@ -97,6 +97,6 @@ llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1", 
     load_format="tensorizer",
     enable_lora=True,
-    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
+    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
 )
 ```
diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md
index 05f8d16cc4ca767a36ef6af36f0c600735fb1c96..be2f25bf06616d1607b2fecacf2d17c3132ed655 100644
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -59,7 +59,7 @@ for output in outputs:
     By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
 
     However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
-A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
+A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
 
 ### `LLM.beam_search`
 
@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
     conversation = [
         {
             "role": "system",
-            "content": "You are a helpful assistant"
+            "content": "You are a helpful assistant",
         },
         {
             "role": "user",
-            "content": "Hello"
+            "content": "Hello",
         },
         {
             "role": "assistant",
-            "content": "Hello! How can I assist you today?"
+            "content": "Hello! How can I assist you today?",
         },
         {
             "role": "user",
@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
     ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
+A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py)
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
@@ -140,5 +140,5 @@ outputs = llm.chat(conversation, chat_template=custom_template)
 
 Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
 
-- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text.
-- [Chat API][chat-api]  is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.
+- [Completions API](../serving/openai_compatible_server.md#completions-api) is similar to `LLM.generate` but only accepts text.
+- [Chat API](../serving/openai_compatible_server.md#chat-api)  is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index 50982d3d0d0f3361581759fd0f67af174063170a..18bb645ea9a9c394c0c91a3712be6dccf0640557 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -9,7 +9,7 @@ before returning them.
 !!! note
     We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly.
 
-    We are now planning to optimize pooling models in vLLM. Please comment on <gh-issue:21796> if you have any suggestions!
+    We are now planning to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
 
 ## Configuration
 
@@ -30,11 +30,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe
 vLLM will attempt to automatically convert the model according to the architecture names
 shown in the table below.
 
-| Architecture                                    | `--convert` | Supported pooling tasks       |
-|-------------------------------------------------|-------------|-------------------------------|
-| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `encode`, `embed`             |
-| `*For*Classification`, `*ClassificationModel`   | `classify`  | `encode`, `classify`, `score` |
-| `*ForRewardModeling`, `*RewardModel`            | `reward`    | `encode`                      |
+| Architecture                                    | `--convert` | Supported pooling tasks               |
+|-------------------------------------------------|-------------|---------------------------------------|
+| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `token_embed`, `embed`                |
+| `*For*Classification`, `*ClassificationModel`   | `classify`  | `token_classify`, `classify`, `score` |
+| `*ForRewardModeling`, `*RewardModel`            | `reward`    | `token_classify`                      |
 
 !!! tip
     You can explicitly set `--convert <type>` to specify how to convert the model.
@@ -45,12 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to
 [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks],
 enabling the corresponding APIs:
 
-| Task       | APIs                                 |
-|------------|--------------------------------------|
-| `encode`   | `LLM.reward(...)`                    |
-| `embed`    | `LLM.embed(...)`, `LLM.score(...)`\* |
-| `classify` | `LLM.classify(...)`                  |
-| `score`    | `LLM.score(...)`                     |
+| Task             | APIs                                                                          |
+|------------------|-------------------------------------------------------------------------------|
+| `embed`          | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` |
+| `classify`       | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`               |
+| `score`          | `LLM.score(...)`                                                              |
+| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")`           |
+| `token_embed`    | `LLM.encode(..., pooling_task="token_embed")`                                 |
+| `plugin`         | `LLM.encode(..., pooling_task="plugin")`                                      |
 
 \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task.
 
@@ -98,7 +100,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic/embed.py>
+A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py)
 
 ### `LLM.classify`
 
@@ -115,7 +117,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic/classify.py>
+A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py)
 
 ### `LLM.score`
 
@@ -130,19 +132,20 @@ It is designed for embedding models and cross-encoder models. Embedding models u
 from vllm import LLM
 
 llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
-(output,) = llm.score("What is the capital of France?",
-                      "The capital of Brazil is Brasilia.")
+(output,) = llm.score(
+    "What is the capital of France?",
+    "The capital of Brazil is Brasilia.",
+)
 
 score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>
+A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py)
 
 ### `LLM.reward`
 
 The [reward][vllm.LLM.reward] method is available to all reward models in vLLM.
-It returns the extracted hidden states directly.
 
 ```python
 from vllm import LLM
@@ -154,20 +157,22 @@ data = output.outputs.data
 print(f"Data: {data!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic/reward.py>
+A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py)
 
 ### `LLM.encode`
 
 The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
-It returns the extracted hidden states directly.
 
 !!! note
     Please use one of the more specific methods or set the task directly when using `LLM.encode`:
 
     - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
     - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
-    - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`.
     - For similarity scores, use `LLM.score(...)`.  
+    - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
+    - For token classification, use `pooling_task="token_classify"`.
+    - For multi-vector retrieval, use `pooling_task="token_embed"`
+    - For IO Processor Plugins , use `pooling_task="plugin"`
 
 ```python
 from vllm import LLM
@@ -183,10 +188,47 @@ print(f"Data: {data!r}")
 
 Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
 
-- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models.
-- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
-- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models.
-- [Score API][score-api] is similar to `LLM.score` for cross-encoder models.
+- [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
+- [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
+- [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
+- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
+
+!!! note
+    Please use one of the more specific methods or set the task directly when using  [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.:
+
+    - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
+    - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`.
+    - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).  
+    - For rewards, `task":"token_classify"`.
+    - For token classification, use `task":"token_classify"`.
+    - For multi-vector retrieval, use `task":"token_embed"`
+    - For IO Processor Plugins , use `task":"plugin"`
+
+```python
+# start a supported embeddings model server with `vllm serve`, e.g.
+# vllm serve intfloat/e5-small
+import requests
+
+host = "localhost"
+port = "8000"
+model_name = "intfloat/e5-small"
+
+api_url = f"http://{host}:{port}/pooling"
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+prompt = {"model": model_name, "input": prompts, "task": "embed"}
+
+response = requests.post(api_url, json=prompt)
+
+for output in response.json()["data"]:
+    data = output["data"]
+    print(f"Data: {data!r} (size={len(data)})")
+```
 
 ## Matryoshka Embeddings
 
@@ -209,7 +251,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
 
 Here is an example to serve a model with Matryoshka Embeddings enabled.
 
-```text
+```bash
 vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
 ```
 
@@ -220,27 +262,31 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams
 
-llm = LLM(model="jinaai/jina-embeddings-v3",
-          runner="pooling",
-          trust_remote_code=True)
-outputs = llm.embed(["Follow the white rabbit."],
-                    pooling_params=PoolingParams(dimensions=32))
+llm = LLM(
+    model="jinaai/jina-embeddings-v3",
+    runner="pooling",
+    trust_remote_code=True,
+)
+outputs = llm.embed(
+    ["Follow the white rabbit."],
+    pooling_params=PoolingParams(dimensions=32),
+)
 print(outputs[0].outputs)
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/pooling/embed_matryoshka_fy.py>
+A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py)
 
 ### Online Inference
 
 Use the following command to start vllm server.
 
-```text
+```bash
 vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
 ```
 
 You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
 
-```text
+```bash
 curl http://127.0.0.1:8000/v1/embeddings \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \
@@ -258,4 +304,17 @@ Expected output:
 {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
 ```
 
-An OpenAI client example can be found here: <gh-file:examples/online_serving/pooling/openai_embedding_matryoshka_fy.py>
+An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py)
+
+## Deprecated Features
+
+### Encode task
+
+We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`:
+
+- `token_embed` is the same as embed, using normalize as activation.
+- `token_classify` is the same as classify, default using softmax as activation.
+
+### Remove softmax from PoolingParams
+
+We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 650f6249228247855047f14dd5516043983f9852..bd14bbb9ab6621e12854a789f1aa88bbad2dc9d5 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -9,30 +9,30 @@ Alongside each architecture, we include some popular models that use it.
 
 ### vLLM
 
-If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
+If vLLM natively supports a model, its implementation can be found in [vllm/model_executor/models](../../vllm/model_executor/models).
 
-These models are what we list in [supported-text-models][supported-text-models] and [supported-mm-models][supported-mm-models].
-
-[](){ #transformers-backend }
+These models are what we list in [supported text models](#list-of-text-only-language-models) and [supported multimodal models](#list-of-multimodal-language-models).
 
 ### Transformers
 
-vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
+vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend".
 
-Currently, the Transformers backend works for the following:
+Currently, the Transformers modeling backend works for the following:
 
 - Modalities: embedding models, language models and vision-language models*
-- Architectures: encoder-only, decoder-only
+- Architectures: encoder-only, decoder-only, mixture-of-experts
 - Attention types: full attention and/or sliding attention
 
 _*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
 
-If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
+If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM:
 
 - All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
 - Any combination of the following vLLM parallelisation schemes:
-    - Pipeline parallel
+    - Data parallel
     - Tensor parallel
+    - Expert parallel
+    - Pipeline parallel
 
 Checking if the modeling backend is Transformers is as simple as:
 
@@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model)))
 
 If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
 
-If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
+If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
 
 !!! note
     For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
@@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer
 
 If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
 
-For a model to be compatible with the Transformers backend for vLLM it must:
+For a model to be compatible with the Transformers modeling backend for vLLM it must:
 
 - be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
     - The model directory must have the correct structure (e.g. `config.json` is present).
     - `config.json` must contain `auto_map.AutoModel`.
-- be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]):
+- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
     - Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
 
 If the compatible model is:
@@ -66,18 +66,21 @@ If the compatible model is:
 - on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md).
 - in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md).
 
-This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
-
-[](){ #writing-custom-models }
+This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
 
 #### Writing custom models
 
-This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
+This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
 
-To make your model compatible with the Transformers backend, it needs:
+To make your model compatible with the Transformers modeling backend, it needs:
 
 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
-    1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
+    - If your model is encoder-only:
+        1. Add `is_causal = False` to `MyAttention`.
+    - If your model is mixture-of-experts (MoE):
+        1. Your sparse MoE block must have an attribute called `experts`.
+        2. The class of `experts` (`MyExperts`) must inherit from `nn.ModuleList`.
+        3. `MyExperts.forward` must accept `hidden_states`, `top_k_index`, `top_k_weights`.
 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
 3. `MyModel` must contain `_supports_attention_backend = True`.
 
@@ -104,6 +107,23 @@ class MyAttention(nn.Module):
         )
         ...
 
+# Only do this for mixture-of-experts models
+class MyExperts(nn.ModuleList):
+    def forward(self, hidden_states, top_k_index, top_k_weights):
+        ...
+
+# Only do this for mixture-of-experts models
+class MySparseMoEBlock(nn.Module):
+    def __init__(self, config):
+        ...
+        self.experts = MyExperts(config)
+        ...
+
+    def forward(self, hidden_states: torch.Tensor):
+        ...
+        hidden_states = self.experts(hidden_states, top_k_index, top_k_weights)
+        ...
+
 class MyModel(PreTrainedModel):
     _supports_attention_backend = True
 ```
@@ -114,7 +134,7 @@ Here is what happens in the background when this model is loaded:
 
 1. The config is loaded.
 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
-3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
+3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
 
 That's it!
 
@@ -162,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con
 If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
 
 Models do not _need_ to be natively supported to be used in vLLM.
-The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
+The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
 
 !!! tip
     The easiest way to check if your model is really supported at runtime is to run the program below:
@@ -276,8 +296,8 @@ https_proxy=http://your.proxy.server:port  vllm serve <model_name>
 ```python
 import os
 
-os.environ['http_proxy'] = 'http://your.proxy.server:port'
-os.environ['https_proxy'] = 'http://your.proxy.server:port'
+os.environ["http_proxy"] = "http://your.proxy.server:port"
+os.environ["https_proxy"] = "http://your.proxy.server:port"
 ```
 
 ### ModelScope
@@ -304,8 +324,6 @@ output = llm.encode("Hello, my name is")
 print(output)
 ```
 
-[](){ #feature-status-legend }
-
 ## Feature Status Legend
 
 - ✅︎ indicates that the feature is supported for the model.
@@ -314,8 +332,6 @@ print(output)
 
 - ⚠️ indicates that the feature is available but may have known issues or limitations.
 
-[](){ #supported-text-models }
-
 ## List of Text-only Language Models
 
 ### Generative Models
@@ -333,108 +349,114 @@ th {
 }
 </style>
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
-| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
-| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ |
-| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
-| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
-| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
-| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
-| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ |
-| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ |
-| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
-| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
-| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
-| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ |
-| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ |
-| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
-| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
-| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
-| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | ✅︎ |
-| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
-| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ |
-| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Lfm2ForCausalLM`  | LFM2  | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ |
-| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |
-| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | ✅︎ | ✅︎ | |
-| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
-| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ |
-| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
-| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
-| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
-| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | |
-| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
-| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ |
-| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
-| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
-| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
-| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
-| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
-| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ |
-
-Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
-
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
+| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
+| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
+| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
+| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
+| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
+| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ |
+| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ |
+| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
+| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
+| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
+| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
+| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
+| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
+| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ |
+| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ |
+| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ |
+| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ |
+| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ |
+| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
+| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ |
+| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ |
+| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
+| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
+| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ |
+| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ |
+| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ |
+| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ |
+| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ |
+| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
+| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
+| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
+| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
+| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
+| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
+| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
+| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ |
+| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ |
+| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ |
+| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ |
+| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ |
+| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
+| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
+| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
+| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
+| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
+| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
+| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
+| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
+| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
+| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ |
+| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
+| `KimiLinearForCausalLM` | Kimi-Linear-48B-A3B-Base, Kimi-Linear-48B-A3B-Instruct | `moonshotai/Kimi-Linear-48B-A3B-Base`, `moonshotai/Kimi-Linear-48B-A3B-Instruct` | | ✅︎ |
+| `Lfm2ForCausalLM`  | LFM2  | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ |
+| `Lfm2MoeForCausalLM`  | LFM2MoE  | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ |
+| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
+| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
+| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
+| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
+| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
+| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
+| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
+| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
+| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
+| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
+| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
+| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ |
+| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
+| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
+| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ |
+| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
+| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ |
+| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
+| `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | |
+| `PanguEmbeddedForCausalLM` |openPangu-Embedded-7B | `FreedomIntelligence/openPangu-Embedded-7B-V1.1` | ✅︎ | ✅︎ |
+| `PanguUltraMoEForCausalLM` |openpangu-ultra-moe-718b-model | `FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1` | ✅︎ | ✅︎ |
+| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
+| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
+| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
+| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
+| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ |
+| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
+| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
+| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ |
+| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
+| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
+| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
+| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
+| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
+| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
+| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
+| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
+| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | |
+| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
+| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
+| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
+
+Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
+
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ |
 
 !!! note
     Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
@@ -451,21 +473,21 @@ See [this page](./pooling_models.md) for more information on how to use pooling
 
 These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ |
-| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
-| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. |  |  | ✅︎ |
-| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. |  |  | ✅︎ |
-| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. |  |  | ✅︎ |
-| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. |  |  | ✅︎ |
-| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ |
-| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
+| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
+| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
+| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
+| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. |  |  |
+| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. |  |  |
+| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. |  |  |
+| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. |  |  |
+| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
+| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
+| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
+| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
 
 <sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
@@ -492,11 +514,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 
 These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
-| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
+| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
+| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
 
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
@@ -509,16 +531,16 @@ If your model is not in the above list, we will try to automatically convert the
 Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
 These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ |
-| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. |  |  | ✅︎ |
-| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
-| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ |
-| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ |
-| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
+| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
+| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. |  |  |
+| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
+| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
+| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
+| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
+| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
 
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
@@ -541,7 +563,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
     ```
 
 !!! note
-    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/pooling/qwen3_reranker.py>.
+    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py).
 
     ```bash
     vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@@ -551,13 +573,13 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
 
 These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
+| `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
+| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
+| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
+| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
 
 <sup>C</sup> Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
@@ -573,14 +595,13 @@ If your model is not in the above list, we will try to automatically convert the
 
 These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
-| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. |  |  | ✅︎ |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|-----------------------------|-----------------------------------------|
+| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. |  |  |
+| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` |  |  |
 
 !!! note
-    Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
-
-[](){ #supported-mm-models }
+    Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
 
 ## List of Multimodal Language Models
 
@@ -601,35 +622,34 @@ On the other hand, modalities separated by `/` are mutually exclusive.
 
 See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model.
 
-!!! important
-    **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
-    or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
+!!! tip
+    For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
 
-    Offline inference:
+!!! note
+    vLLM currently only supports dynamic LoRA adapters on the language backbone of multimodal models.
+    If you wish to use a model with LoRA in the multi-modal encoder,
+    please merge the weights into the base model first before running it in vLLM like a regular model.
 
     ```python
-    from vllm import LLM
-
-    llm = LLM(
-        model="Qwen/Qwen2-VL-7B-Instruct",
-        limit_mm_per_prompt={"image": 4},
-    )
-    ```
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    def merge_and_save(model_id: str, output_dir: str):
+        base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+        lora_model = PeftModel.from_pretrained(
+            base_model,
+            model_id,
+            config=PeftConfig.from_pretrained(model_id),
+        )
+        model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+        model._hf_peft_config_loaded = False  # Needed to save the merged model
 
-    Online serving:
+        processor = AutoProcessor.from_pretrained(model_id)
 
-    ```bash
-    vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
+        model.save_pretrained(output_dir)
+        processor.save_pretrained(output_dir)
     ```
 
-    **This is no longer required if you are using vLLM V1.**
-
-!!! tip
-    For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
-
-!!! note
-    vLLM currently only supports adding LoRA to the language backbone of multimodal models.
-
 ### Generative Models
 
 See [this page](generative_models.md) for more information on how to use generative models.
@@ -638,69 +658,74 @@ See [this page](generative_models.md) for more information on how to use generat
 
 These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API.
 
-| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
-| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
-| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
-| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
-| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
-| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
-| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
-| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
-| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
-| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
-| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
-| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
-| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
-| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
-| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ |
-| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
-| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
-| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
-| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
-| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
-| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
-| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
-| `MiDashengLMModel` | MiDashengLM | T + A<sup>+</sup> | `mispeech/midashenglm-7b` | | ✅︎ | ✅︎ |
-| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
-| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
-| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
-| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
-| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ |
-| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
-| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
-| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
-| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
-| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
-| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
-| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
-| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
-| `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ | ✅︎ |
-| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
-| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
-
-Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
-
-| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
-| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
+| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
+| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ |
+| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
+| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
+| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
+| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
+| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
+| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
+| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
+| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
+| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>E+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
+| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
+| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
+| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
+| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
+| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
+| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
+| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
+| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
+| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
+| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
+| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
+| `LightOnOCRForConditionalGeneration`  | LightOnOCR-1B  | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
+| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
+| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
+| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
+| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ |
+| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ |
+| `MiDashengLMModel` | MiDashengLM | T + A<sup>+</sup> | `mispeech/midashenglm-7b` | | ✅︎ |
+| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ |
+| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | |
+| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ |
+| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
+| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
+| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
+| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
+| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
+| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
+| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
+| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
+| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
+| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
+| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
+| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
+| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
+| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ |
+| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ |
+| `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ |
+| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ |
+| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | |
+| `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ |
+| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
+| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
+
+Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
+
+| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
+| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
 
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
@@ -740,57 +765,29 @@ Some models are supported only via the [Transformers backend](#transformers). Th
 !!! note
     To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 
-!!! warning
-    The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.
-
-    For the best results, we recommend using the following dependency versions (tested on A10 and L40):
-
-    ??? code "Dependency versions"
-
-        ```text
-        # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
-        torch==2.5.1
-        torchvision==0.20.1
-        transformers==4.48.1
-        tokenizers==0.21.0
-        tiktoken==0.7.0
-        vllm==0.7.0
-
-        # Optional but recommended for improved performance and stability
-        triton==3.1.0
-        xformers==0.0.28.post3
-        uvloop==0.21.0
-        protobuf==5.29.3
-        openai==1.60.2
-        opencv-python-headless==4.11.0.86
-        pillow==10.4.0
-
-        # Installed FlashAttention (for float16 only)
-        flash-attn>=2.5.6  # Not used in float32, but should be documented
-        ```
-
-    **Note:** Make sure you understand the security implications of using outdated packages.
-
 !!! note
     The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
-    For more details, please see: <gh-pr:4087#issuecomment-2250397630>
+    For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
 
 !!! warning
     Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
 
 !!! note
-    For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
-    is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
+    For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
 
 #### Transcription
 
 Speech2Text models trained specifically for Automatic Speech Recognition.
 
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | ✅︎ |
-| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|-------------------|----------------------|---------------------------|
+| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
+| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
+| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
+| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
+
+!!! note
+    `VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
 
 ### Pooling Models
 
@@ -805,11 +802,13 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
 
 The following table lists those that are tested in vLLM.
 
-| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
-| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
-| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
+| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
+| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
+| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
+| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
+| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
 
 <sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
@@ -821,9 +820,9 @@ The following table lists those that are tested in vLLM.
 Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
 These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
 
-| Architecture                        | Models             | Inputs   | Example HF Models        | [LoRA](../features/lora.md)   | [PP](../serving/parallelism_scaling.md)   | [V1](gh-issue:8779)   |
-|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
-| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |
+| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
 
 <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.
@@ -853,5 +852,5 @@ We have the following levels of testing for models:
 
 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
-3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
+3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](../../tests) and [examples](../../examples) for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
diff --git a/docs/serving/context_parallel_deployment.md b/docs/serving/context_parallel_deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..dacdf312ee55bf1c1081bccdd2ac4e079cef11d8
--- /dev/null
+++ b/docs/serving/context_parallel_deployment.md
@@ -0,0 +1,47 @@
+# Context Parallel Deployment
+
+Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are:
+
+- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens.
+- For long context decode, we need more space for KV cache to increase the batchsize (and hence the throughput).
+
+## Prefill Context Parallel
+
+During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
+
+Depending on the use case, there're two possible strategies:
+
+1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
+2. Partial query, partial key/value: If the request token length is too long, we cannot afford holding the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
+
+Both approaches are under active development.
+
+## Decode Context Parallel
+
+Due to the auto-regressive nature of decoding, every decoding step needs to compute a small amount of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs.
+
+For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache.
+
+1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed.
+2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension, that's the plain tensor parallel sharding. It's as simple as adding `-tp <num_gpus>` to the command line.
+3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated for `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With larger dcp size, the KV cache duplication is reduced, but the communication overhead increases.
+
+Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what should we do for the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size.
+
+Note that kv cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that kv cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in details in [this paper](http://arxiv.org/abs/2507.07120).
+
+Case study:
+
+For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication.
+
+For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node.
+
+For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication.
+
+In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication.
+
+Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase.
+
+## Technical Discussions
+
+The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/).
diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md
index 9ff9f59c54e500273192e3c4c0492820b9d919b2..eff9c5d5e4efa1397eb43edcab540c8b6e7da58f 100644
--- a/docs/serving/data_parallel_deployment.md
+++ b/docs/serving/data_parallel_deployment.md
@@ -16,7 +16,7 @@ For MoE models, when any requests are in progress in any rank, we must ensure th
 
 In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.
 
-This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see <gh-file:examples/offline_inference/data_parallel.py>.
+This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/offline_inference/data_parallel.py).
 
 There are two distinct modes supported for online deployments - self-contained with internal load balancing, or externally per-rank process deployment and load balancing.
 
@@ -69,6 +69,7 @@ There are several notable differences when using Ray:
 - A single launch command (on any node) is needed to start all local and remote DP ranks, therefore it is more convenient compared to launching on each node
 - There is no need to specify `--data-parallel-address`, and the node where the command is run is used as `--data-parallel-address`
 - There is no need to specify `--data-parallel-rpc-port`
+- When a single DP group requires multiple nodes, *e.g.* in case a single model replica needs to run on at least two nodes, make sure to set `VLLM_RAY_DP_PACK_STRATEGY="span"` in which case `--data-parallel-size-local` is ignored and will be automatically determined
 - Remote DP ranks will be allocated based on node resources of the Ray cluster
 
 Currently, the internal DP load balancing is done within the API server process(es) and is based on the running and waiting queues in each of the engines. This could be made more sophisticated in future by incorporating KV cache aware logic.
diff --git a/docs/serving/distributed_troubleshooting.md b/docs/serving/distributed_troubleshooting.md
index bd45f010ed2ae211de33785fd2b8e1dd70deda37..b5354a7e55d5c4adb15cdbab99d23139337e22fc 100644
--- a/docs/serving/distributed_troubleshooting.md
+++ b/docs/serving/distributed_troubleshooting.md
@@ -4,11 +4,11 @@ For general troubleshooting, see [Troubleshooting](../usage/troubleshooting.md).
 
 ## Verify inter-node GPU communication
 
-After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to <gh-file:examples/online_serving/run_cluster.sh>, for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <gh-issue:6803>.
+After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script](../usage/troubleshooting.md#incorrect-hardwaredriver). If you need additional environment variables for communication configuration, append them to [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh), for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <https://github.com/vllm-project/vllm/issues/6803>.
 
 ## No available node types can fulfill resource request
 
-The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in <gh-file:examples/online_serving/run_cluster.sh> (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see <gh-issue:7815>.
+The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see <https://github.com/vllm-project/vllm/issues/7815>.
 
 ## Ray observability
 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index e44a914c726dbd41570d72aa5aeb8fbb8b61e5ff..ec07896592ba38797712fbb7586808081597fb0f 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -8,19 +8,22 @@ EP is typically coupled with Data Parallelism (DP). While DP can be used indepen
 
 Before using EP, you need to install the necessary dependencies. We are actively working on making this easier in the future:
 
-1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](gh-file:tools/ep_kernels).
+1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
 2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
-3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](gh-file:tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
+3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](../../tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
 
 ### Backend Selection Guide
 
-vLLM provides three communication backends for EP:
+vLLM provides multiple communication backends for EP. Use `--all2all-backend` to select one:
 
 | Backend | Use Case | Features | Best For |
 |---------|----------|----------|----------|
-| `pplx` | Single node | Chunked prefill support | Development, best for intra-node deployments |
-| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout | High-throughput scenarios, prefill-dominated workloads |
-| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout | Low-latency scenarios, decode-dominated workloads |
+| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
+| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development |
+| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
+| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
+| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
+| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
 
 ## Single Node Deployment
 
@@ -47,11 +50,11 @@ The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parall
 
 ```bash
 # Single node EP deployment with pplx backend
-VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 \
-    vllm serve deepseek-ai/DeepSeek-V3-0324 \
-    --tensor-parallel-size 1 \      # Tensor parallelism across 1 GPU
+vllm serve deepseek-ai/DeepSeek-V3-0324 \
+    --tensor-parallel-size 1 \       # Tensor parallelism across 1 GPU
     --data-parallel-size 8 \         # Data parallelism across 8 processes
-    --enable-expert-parallel         # Enable expert parallelism
+    --enable-expert-parallel \       # Enable expert parallelism
+    --all2all-backend pplx           # Use pplx communication backend
 ```
 
 ## Multi-Node Deployment
@@ -70,8 +73,8 @@ The following example deploys `DeepSeek-V3-0324` across 2 nodes using `deepep_lo
 
 ```bash
 # Node 1 (Primary - handles incoming requests)
-VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \
-    vllm serve deepseek-ai/DeepSeek-V3-0324 \
+vllm serve deepseek-ai/DeepSeek-V3-0324 \
+    --all2all-backend deepep_low_latency \
     --tensor-parallel-size 1 \               # TP size per node
     --enable-expert-parallel \               # Enable EP
     --data-parallel-size 16 \                # Total DP size across all nodes
@@ -81,8 +84,8 @@ VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \
     --api-server-count=8                     # Number of API servers for load handling (scaling this out to total ranks are recommended)
 
 # Node 2 (Secondary - headless mode, no API server)
-VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \
-    vllm serve deepseek-ai/DeepSeek-V3-0324 \
+vllm serve deepseek-ai/DeepSeek-V3-0324 \
+    --all2all-backend deepep_low_latency \
     --tensor-parallel-size 1 \               # TP size per node
     --enable-expert-parallel \               # Enable EP
     --data-parallel-size 16 \                # Total DP size across all nodes
@@ -169,11 +172,12 @@ Single node deployment with EPLB enabled:
 
 ```bash
 # Single node with EPLB load balancing
-VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V3-0324 \
-    --tensor-parallel-size 1 \     # Tensor parallelism
-    --data-parallel-size 8 \        # Data parallelism  
-    --enable-expert-parallel \      # Enable EP
-    --enable-eplb \                 # Enable load balancer
+vllm serve deepseek-ai/DeepSeek-V3-0324 \
+    --tensor-parallel-size 1 \       # Tensor parallelism
+    --data-parallel-size 8 \         # Data parallelism
+    --enable-expert-parallel \       # Enable EP
+    --all2all-backend pplx \         # Use pplx communication backend
+    --enable-eplb \                  # Enable load balancer
     --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
 ```
 
@@ -191,7 +195,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
 
 ### Setup Steps
 
-1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
+1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](../../tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. For non-cuda platform to install nixl with non-cuda UCX build, run the [install_nixl_from_source_ubuntu.py](../../tools/install_nixl_from_source_ubuntu.py) script.
 
 2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`. Noted, you may also specify one or multiple NIXL_Backend. Such as: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'`
 
@@ -239,10 +243,10 @@ try:
                 "remote_engine_id": None,     # Will be populated by vLLM
                 "remote_block_ids": None,     # Will be populated by vLLM
                 "remote_host": None,          # Will be populated by vLLM
-                "remote_port": None           # Will be populated by vLLM
+                "remote_port": None,          # Will be populated by vLLM
             }
         },
-        extra_headers={"X-Request-Id": request_id}
+        extra_headers={"X-Request-Id": request_id},
     )
     
     print("-" * 50)
@@ -258,7 +262,7 @@ try:
         extra_body={
             "kv_transfer_params": prefill_response.kv_transfer_params  # Pass KV cache info
         },
-        extra_headers={"X-Request-Id": request_id}  # Same request ID
+        extra_headers={"X-Request-Id": request_id},  # Same request ID
     )
     
     print("-" * 50)
diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md
index 47074f411ac99da2b2967b472ea43b5eccc4d76a..192a61ea5b903aa165af3340cd8a9d7cf108041b 100644
--- a/docs/serving/integrations/langchain.md
+++ b/docs/serving/integrations/langchain.md
@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
     ```python
     from langchain_community.llms import VLLM
 
-    llm = VLLM(model="mosaicml/mpt-7b",
-            trust_remote_code=True,  # mandatory for hf models
-            max_new_tokens=128,
-            top_k=10,
-            top_p=0.95,
-            temperature=0.8,
-            # tensor_parallel_size=... # for distributed inference
+    llm = VLLM(
+        model="mosaicml/mpt-7b",
+        trust_remote_code=True,  # mandatory for hf models
+        max_new_tokens=128,
+        top_k=10,
+        top_p=0.95,
+        temperature=0.8,
+        # for distributed inference
+        # tensor_parallel_size=...,
     )
 
     print(llm("What is the capital of France ?"))
diff --git a/docs/serving/offline_inference.md b/docs/serving/offline_inference.md
index ddda47690002a5ae683333dc54e346e31ab189cd..b3d211871821008db4685e10327c71bce86d1a49 100644
--- a/docs/serving/offline_inference.md
+++ b/docs/serving/offline_inference.md
@@ -19,7 +19,7 @@ The available APIs depend on the model type:
 - [Pooling models](../models/pooling_models.md) output their hidden states directly.
 
 !!! info
-    [API Reference][offline-inference-api]
+    [API Reference](../api/README.md#offline-inference)
 
 ## Ray Data LLM API
 
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index bac3f6c1fe90c6ce90739cb94b9962d94628ffd7..821628e6e31741ad2a90767b31377a5da267ba7d 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
     completion = client.chat.completions.create(
         model="NousResearch/Meta-Llama-3-8B-Instruct",
         messages=[
-            {"role": "user", "content": "Hello!"}
-        ]
+            {"role": "user", "content": "Hello!"},
+        ],
     )
 
     print(completion.choices[0].message)
@@ -44,46 +44,44 @@ To call the server, in your preferred text editor, create a script that uses an
 
 We currently support the following OpenAI APIs:
 
-- [Completions API][completions-api] (`/v1/completions`)
+- [Completions API](#completions-api) (`/v1/completions`)
     - Only applicable to [text generation models](../models/generative_models.md).
     - *Note: `suffix` parameter is not supported.*
-- [Chat Completions API][chat-api] (`/v1/chat/completions`)
-    - Only applicable to [text generation models](../models/generative_models.md) with a [chat template][chat-template].
+- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
+    - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
     - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
-- [Embeddings API][embeddings-api] (`/v1/embeddings`)
+- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
     - Only applicable to [embedding models](../models/pooling_models.md).
-- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
+- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
     - Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
-- [Translation API][translations-api] (`/v1/audio/translations`)
+- [Translation API](#translations-api) (`/v1/audio/translations`)
     - Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
 
 In addition, we have the following custom APIs:
 
-- [Tokenizer API][tokenizer-api] (`/tokenize`, `/detokenize`)
+- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
     - Applicable to any model with a tokenizer.
-- [Pooling API][pooling-api] (`/pooling`)
+- [Pooling API](#pooling-api) (`/pooling`)
     - Applicable to all [pooling models](../models/pooling_models.md).
-- [Classification API][classification-api] (`/classify`)
+- [Classification API](#classification-api) (`/classify`)
     - Only applicable to [classification models](../models/pooling_models.md).
-- [Score API][score-api] (`/score`)
+- [Score API](#score-api) (`/score`)
     - Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
-- [Re-rank API][rerank-api] (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
     - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
     - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
     - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
     - Only applicable to [cross-encoder models](../models/pooling_models.md).
 
-[](){ #chat-template }
-
 ## Chat Template
 
 In order for the language model to support chat protocol, vLLM requires the model to include
 a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
-specifies how are roles, messages, and other chat-specific tokens are encoded in the input.
+specifies how roles, messages, and other chat-specific tokens are encoded in the input.
 
 An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
 
-Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those model,
+Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
 you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat
 template, or the template in string form. Without a chat template, the server will not be able to process chat
 and all chat requests will error.
@@ -92,7 +90,7 @@ and all chat requests will error.
 vllm serve <model> --chat-template ./path-to-chat-template.jinja
 ```
 
-vLLM community provides a set of chat templates for popular models. You can find them under the <gh-dir:examples> directory.
+vLLM community provides a set of chat templates for popular models. You can find them under the [examples](../../examples) directory.
 
 With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies
 both a `type` and a `text` field. An example is provided below:
@@ -101,8 +99,13 @@ both a `type` and a `text` field. An example is provided below:
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
-    ]
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
+            ],
+        },
+    ],
 )
 ```
 
@@ -130,11 +133,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
     ],
     extra_body={
-        "structured_outputs": {"choice": ["positive", "negative"]}
-    }
+        "structured_outputs": {"choice": ["positive", "negative"]},
+    },
 )
 ```
 
@@ -149,11 +152,11 @@ with `--enable-request-id-headers`.
     completion = client.chat.completions.create(
         model="NousResearch/Meta-Llama-3-8B-Instruct",
         messages=[
-            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
         ],
         extra_headers={
             "x-request-id": "sentiment-classification-00001",
-        }
+        },
     )
     print(completion._request_id)
 
@@ -162,25 +165,23 @@ with `--enable-request-id-headers`.
         prompt="A robot may not injure a human being",
         extra_headers={
             "x-request-id": "completion-test",
-        }
+        },
     )
     print(completion._request_id)
     ```
 
 ## API Reference
 
-[](){ #completions-api }
-
 ### Completions API
 
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-Code example: <gh-file:examples/online_serving/openai_completion_client.py>
+Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
 
 #### Extra parameters
 
-The following [sampling parameters][sampling-params] are supported.
+The following [sampling parameters](../api/README.md#inference-parameters) are supported.
 
 ??? code
 
@@ -196,8 +197,6 @@ The following extra parameters are supported:
     --8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
     ```
 
-[](){ #chat-api }
-
 ### Chat API
 
 Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat);
@@ -209,11 +208,11 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
 
 - *Note: `image_url.detail` parameter is not supported.*
 
-Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
+Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py)
 
 #### Extra parameters
 
-The following [sampling parameters][sampling-params] are supported.
+The following [sampling parameters](../api/README.md#inference-parameters) are supported.
 
 ??? code
 
@@ -229,17 +228,37 @@ The following extra parameters are supported:
     --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
     ```
 
-[](){ #embeddings-api }
-
 ### Embeddings API
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
-which will be treated as a single prompt to the model.
+Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
+
+If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
+which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
 
-Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
+??? code
+
+    ```python
+    from openai import OpenAI
+    from openai._types import NOT_GIVEN, NotGiven
+    from openai.types.chat import ChatCompletionMessageParam
+    from openai.types.create_embedding_response import CreateEmbeddingResponse
+
+    def create_chat_embeddings(
+        client: OpenAI,
+        *,
+        messages: list[ChatCompletionMessageParam],
+        model: str,
+        encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+    ) -> CreateEmbeddingResponse:
+        return client.post(
+            "/embeddings",
+            cast_to=CreateEmbeddingResponse,
+            body={"messages": messages, "model": model, "encoding_format": encoding_format},
+        )
+    ```
 
 #### Multi-modal inputs
 
@@ -254,7 +273,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
     vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
       --trust-remote-code \
       --max-model-len 4096 \
-      --chat-template examples/template_vlm2vec.jinja
+      --chat-template examples/template_vlm2vec_phi3v.jinja
     ```
 
     !!! important
@@ -262,34 +281,36 @@ and passing a list of `messages` in the request. Refer to the examples below for
         to run this model in embedding mode instead of text generation mode.
 
         The custom chat template is completely different from the original one for this model,
-        and can be found here: <gh-file:examples/template_vlm2vec.jinja>
+        and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
 
     Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
 
     ??? code
 
         ```python
-        import requests
-
+        from openai import OpenAI
+        client = OpenAI(
+            base_url="http://localhost:8000/v1",
+            api_key="EMPTY",
+        )
         image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
-        response = requests.post(
-            "http://localhost:8000/v1/embeddings",
-            json={
-                "model": "TIGER-Lab/VLM2Vec-Full",
-                "messages": [{
+        response = create_chat_embeddings(
+            client,
+            model="TIGER-Lab/VLM2Vec-Full",
+            messages=[
+                {
                     "role": "user",
                     "content": [
                         {"type": "image_url", "image_url": {"url": image_url}},
                         {"type": "text", "text": "Represent the given image."},
                     ],
-                }],
-                "encoding_format": "float",
-            },
+                }
+            ],
+            encoding_format="float",
         )
-        response.raise_for_status()
-        response_json = response.json()
-        print("Embedding output:", response_json["data"][0]["embedding"])
+
+        print("Image embedding output:", response.data[0].embedding)
         ```
 
 === "DSE-Qwen2-MRL"
@@ -307,13 +328,13 @@ and passing a list of `messages` in the request. Refer to the examples below for
         Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
 
         Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
-        by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
+        by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
 
     !!! important
         `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
         example below for details.
 
-Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py>
+Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py)
 
 #### Extra parameters
 
@@ -340,8 +361,6 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
     --8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
     ```
 
-[](){ #transcriptions-api }
-
 ### Transcriptions API
 
 Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
@@ -350,17 +369,96 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 !!! note
     To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
 
-Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
-<!-- TODO: api enforced limits + uploading audios -->
+Code example: [examples/online_serving/openai_transcription_client.py](../../examples/online_serving/openai_transcription_client.py)
 
 #### API Enforced Limits
 
 Set the maximum audio file size (in MB) that VLLM will accept, via the
 `VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. Default is 25 MB.
 
+#### Uploading Audio Files
+
+The Transcriptions API supports uploading audio files in various formats including FLAC, MP3, MP4, MPEG, MPGA, M4A, OGG, WAV, and WEBM.
+
+**Using OpenAI Python Client:**
+
+??? code
+
+    ```python
+    from openai import OpenAI
+
+    client = OpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+
+    # Upload audio file from disk
+    with open("audio.mp3", "rb") as audio_file:
+        transcription = client.audio.transcriptions.create(
+            model="openai/whisper-large-v3-turbo",
+            file=audio_file,
+            language="en",
+            response_format="verbose_json",
+        )
+
+    print(transcription.text)
+    ```
+
+**Using curl with multipart/form-data:**
+
+??? code
+
+    ```bash
+    curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
+      -H "Authorization: Bearer token-abc123" \
+      -F "file=@audio.mp3" \
+      -F "model=openai/whisper-large-v3-turbo" \
+      -F "language=en" \
+      -F "response_format=verbose_json"
+    ```
+
+**Supported Parameters:**
+
+- `file`: The audio file to transcribe (required)
+- `model`: The model to use for transcription (required)
+- `language`: The language code (e.g., "en", "zh") (optional)
+- `prompt`: Optional text to guide the transcription style (optional)
+- `response_format`: Format of the response ("json", "text") (optional)
+- `temperature`: Sampling temperature between 0 and 1 (optional)
+
+For the complete list of supported parameters including sampling parameters and vLLM extensions, see the [protocol definitions](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L2182).
+
+**Response Format:**
+
+For `verbose_json` response format:
+
+??? code
+
+    ```json
+    {
+      "text": "Hello, this is a transcription of the audio file.",
+      "language": "en",
+      "duration": 5.42,
+      "segments": [
+        {
+          "id": 0,
+          "seek": 0,
+          "start": 0.0,
+          "end": 2.5,
+          "text": "Hello, this is a transcription",
+          "tokens": [50364, 938, 428, 307, 275, 28347],
+          "temperature": 0.0,
+          "avg_logprob": -0.245,
+          "compression_ratio": 1.235,
+          "no_speech_prob": 0.012
+        }
+      ]
+    }
+    ```
+
 #### Extra Parameters
 
-The following [sampling parameters][sampling-params] are supported.
+The following [sampling parameters](../api/README.md#inference-parameters) are supported.
 
 ??? code
 
@@ -376,8 +474,6 @@ The following extra parameters are supported:
     --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
     ```
 
-[](){ #translations-api }
-
 ### Translations API
 
 Our Translation API is compatible with [OpenAI's Translations API](https://platform.openai.com/docs/api-reference/audio/createTranslation);
@@ -388,11 +484,11 @@ Please mind that the popular `openai/whisper-large-v3-turbo` model does not supp
 !!! note
     To use the Translation API, please install with extra audio dependencies using `pip install vllm[audio]`.
 
-Code example: <gh-file:examples/online_serving/openai_translation_client.py>
+Code example: [examples/online_serving/openai_translation_client.py](../../examples/online_serving/openai_translation_client.py)
 
 #### Extra Parameters
 
-The following [sampling parameters][sampling-params] are supported.
+The following [sampling parameters](../api/README.md#inference-parameters) are supported.
 
 ```python
 --8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
@@ -404,8 +500,6 @@ The following extra parameters are supported:
 --8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
 ```
 
-[](){ #tokenizer-api }
-
 ### Tokenizer API
 
 Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
@@ -414,17 +508,13 @@ It consists of two endpoints:
 - `/tokenize` corresponds to calling `tokenizer.encode()`.
 - `/detokenize` corresponds to calling `tokenizer.decode()`.
 
-[](){ #pooling-api }
-
 ### Pooling API
 
 Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
 
-The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
+The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
 
-Code example: <gh-file:examples/online_serving/pooling/openai_pooling_client.py>
-
-[](){ #classification-api }
+Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
 
 ### Classification API
 
@@ -432,7 +522,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
 
 We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
 
-Code example: <gh-file:examples/online_serving/pooling/openai_classification_client.py>
+Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py)
 
 #### Example Requests
 
@@ -541,8 +631,6 @@ The following extra parameters are supported:
 --8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params"
 ```
 
-[](){ #score-api }
-
 ### Score API
 
 Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
@@ -550,7 +638,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
 
 You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
-Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
+Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)
 
 #### Single inference
 
@@ -709,29 +797,29 @@ You can pass multi-modal inputs to scoring models by passing `content` including
                 "model": "jinaai/jina-reranker-m0",
                 "text_1": "slm markdown",
                 "text_2": {
-                  "content": [
-                          {
-                              "type": "image_url",
-                              "image_url": {
-                                  "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
-                              },
-                          },
-                          {
-                              "type": "image_url",
-                              "image_url": {
-                                  "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-                              },
-                          },
-                      ]
-                  }
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+                            },
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+                            },
+                        },
+                    ],
                 },
+            },
         )
         response.raise_for_status()
         response_json = response.json()
         print("Scoring output:", response_json["data"][0]["score"])
         print("Scoring output:", response_json["data"][1]["score"])
         ```
-Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_multimodal.py>
+Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)
 
 #### Extra parameters
 
@@ -748,8 +836,6 @@ The following extra parameters are supported:
 --8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params"
 ```
 
-[](){ #rerank-api }
-
 ### Re-rank API
 
 Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
@@ -763,7 +849,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
 [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
 popular open-source tools.
 
-Code example: <gh-file:examples/online_serving/pooling/jinaai_rerank_client.py>
+Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py)
 
 #### Example Request
 
@@ -841,6 +927,6 @@ Key capabilities:
 - Scales from a single GPU to a multi-node cluster without code changes.
 - Provides observability and autoscaling policies through Ray dashboards and metrics.
 
-The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: <gh-file:examples/online_serving/ray_serve_deepseek.py>.
+The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: [examples/online_serving/ray_serve_deepseek.py](../../examples/online_serving/ray_serve_deepseek.py).
 
 Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/serving-llms.html).
diff --git a/docs/serving/parallelism_scaling.md b/docs/serving/parallelism_scaling.md
index cef1127fc5c15bf39983ed498c1c0630c015021c..14cd3b057791cfe5e1299adfc7ba0501a41d6acc 100644
--- a/docs/serving/parallelism_scaling.md
+++ b/docs/serving/parallelism_scaling.md
@@ -72,7 +72,7 @@ For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.htm
 
 ### Ray cluster setup with containers
 
-The helper script <gh-file:examples/online_serving/run_cluster.sh> starts containers across nodes and initializes Ray. By default, the script runs Docker without administrative privileges, which prevents access to the GPU performance counters when profiling or tracing. To enable admin privileges, add the `--cap-add=CAP_SYS_ADMIN` flag to the Docker command.
+The helper script [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) starts containers across nodes and initializes Ray. By default, the script runs Docker without administrative privileges, which prevents access to the GPU performance counters when profiling or tracing. To enable admin privileges, add the `--cap-add=CAP_SYS_ADMIN` flag to the Docker command.
 
 Choose one node as the head node and run:
 
@@ -132,7 +132,7 @@ vllm serve /path/to/the/model/in/the/container \
 
 Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
 To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
-<gh-file:examples/online_serving/run_cluster.sh> helper script.
+[examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) helper script.
 Contact your system administrator for more information about the required flags.
 
 ## Enabling GPUDirect RDMA
diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md
index f608a630ab7a5581b39ff86eeebdd7968c6e9b7c..0b7e384dc8d6a5cde7e8fdb0b4ba1470b4eb9cea 100644
--- a/docs/training/rlhf.md
+++ b/docs/training/rlhf.md
@@ -1,8 +1,20 @@
 # Reinforcement Learning from Human Feedback
 
-Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
+Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. vLLM can be used to generate the completions for RLHF.
 
-vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
+The following open-source RL libraries use vLLM for fast rollouts (sorted alphabetically and non-exhaustive):
+
+- [Cosmos-RL](https://github.com/nvidia-cosmos/cosmos-rl)
+- [ms-swift](https://github.com/modelscope/ms-swift/tree/main)
+- [NeMo-RL](https://github.com/NVIDIA-NeMo/RL)
+- [Open Instruct](https://github.com/allenai/open-instruct)
+- [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF)
+- [PipelineRL](https://github.com/ServiceNow/PipelineRL)
+- [Prime-RL](https://github.com/PrimeIntellect-ai/prime-rl)
+- [SkyRL](https://github.com/NovaSky-AI/SkyRL)
+- [TRL](https://github.com/huggingface/trl)
+- [Unsloth](https://github.com/unslothai/unsloth)
+- [verl](https://github.com/volcengine/verl)
 
 See the following basic examples to get started if you don't want to use an existing library:
 
@@ -12,4 +24,5 @@ See the following basic examples to get started if you don't want to use an exis
 
 See the following notebooks showing how to use vLLM for GRPO:
 
+- [Efficient Online Training with GRPO and vLLM in TRL](https://huggingface.co/learn/cookbook/grpo_vllm_online_training)
 - [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
diff --git a/docs/training/trl.md b/docs/training/trl.md
index c7c1a5a3bbd1ed638359d00a17b20c54a7e862ad..acf48cc4ecb33f03742aaa4d85851c744b9e966a 100644
--- a/docs/training/trl.md
+++ b/docs/training/trl.md
@@ -1,12 +1,54 @@
 # Transformers Reinforcement Learning
 
-Transformers Reinforcement Learning (TRL) is a full stack library that provides a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more. The library is integrated with 🤗 transformers.
+[Transformers Reinforcement Learning](https://huggingface.co/docs/trl) (TRL) is a full stack library that provides a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more. The library is integrated with 🤗 transformers.
 
 Online methods such as GRPO or Online DPO require the model to generate completions. vLLM can be used to generate these completions!
 
-See the guide [vLLM for fast generation in online methods](https://huggingface.co/docs/trl/main/en/speeding_up_training#vllm-for-fast-generation-in-online-methods) in the TRL documentation for more information.
+See the [vLLM integration guide](https://huggingface.co/docs/trl/main/en/vllm_integration) in the TRL documentation for more information.
+
+TRL currently supports the following online trainers with vLLM:
+
+- [GRPO](https://huggingface.co/docs/trl/main/en/grpo_trainer)
+- [Online DPO](https://huggingface.co/docs/trl/main/en/online_dpo_trainer)
+- [RLOO](https://huggingface.co/docs/trl/main/en/rloo_trainer)
+- [Nash-MD](https://huggingface.co/docs/trl/main/en/nash_md_trainer)
+- [XPO](https://huggingface.co/docs/trl/main/en/xpo_trainer)
+
+To enable vLLM in TRL, set the `use_vllm` flag in the trainer configuration to `True`.
+
+## Modes of Using vLLM During Training
+
+TRL supports **two modes** for integrating vLLM during training: **server mode** and **colocate mode**. You can control how vLLM operates during training with the `vllm_mode` parameter.
+
+### Server mode
+
+In **server mode**, vLLM runs as an independent process on dedicated GPUs and communicates with the trainer through HTTP requests. This configuration is ideal when you have separate GPUs for inference, as it isolates generation workloads from training, ensuring stable performance and easier scaling.
+
+```python
+from trl import GRPOConfig
+
+training_args = GRPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="server",  # default value, can be omitted
+)
+```
+
+### Colocate mode
+
+In **colocate mode**, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
+
+```python
+from trl import GRPOConfig
+
+training_args = GRPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+
+Some trainers also support **vLLM sleep mode**, which offloads parameters and caches to GPU RAM during training, helping reduce memory usage. Learn more in the [memory optimization docs](https://huggingface.co/docs/trl/main/en/reducing_memory_usage#vllm-sleep-mode).
 
 !!! info
-    For more information on the `use_vllm` flag you can provide to the configs of these online methods, see:
-    - [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm)
-    - [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm)
+    For detailed configuration options and flags, refer to the documentation of the specific trainer you are using.
diff --git a/docs/usage/README.md b/docs/usage/README.md
index 0c63d01f0f99fbbf9b2533149b77014a167601a3..4e8ece2c0605245d846faab311f53bfad84ce32d 100644
--- a/docs/usage/README.md
+++ b/docs/usage/README.md
@@ -1,6 +1,6 @@
 # Using vLLM
 
-First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
+First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment.
 
 Then, vLLM supports the following usage patterns:
 
diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md
index a494dcf19191f0c880ffb3bfc48fe64d0079f82b..d8a1943209c1ee1602c16c02bc8e88f1fb7ddcb1 100644
--- a/docs/usage/reproducibility.md
+++ b/docs/usage/reproducibility.md
@@ -6,7 +6,7 @@ reproducible results:
 - For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
 - For V0: Set the global seed (see below).
 
-Example: <gh-file:examples/offline_inference/reproducibility.py>
+Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
 
 !!! warning
 
@@ -39,7 +39,7 @@ In V1, the `seed` parameter defaults to `0` which sets the random state for each
     It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
     for workflows such as speculative decoding.
     
-    For more information, see: <gh-pr:17929>
+    For more information, see: <https://github.com/vllm-project/vllm/pull/17929>
 
 ### Locality of random state
 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index 6e700d1faaa9c3e2f94f606a36f59d61c1bad329..1995045fd5562028e6864756f65f69d8eb9f948a 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -24,7 +24,7 @@ If the model is too large to fit in a single GPU, you will get an out-of-memory
 
 ## Generation quality changed
 
-In v0.8.0, the source of default sampling parameters was changed in <gh-pr:12622>. Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator.
+In v0.8.0, the source of default sampling parameters was changed in <https://github.com/vllm-project/vllm/pull/12622>. Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator.
 
 In most cases, this should lead to higher quality responses, because the model creator is likely to know which sampling parameters are best for their model. However, in some cases the defaults provided by the model creator can lead to degraded performance.
 
@@ -38,7 +38,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
 - `export VLLM_LOG_STATS_INTERVAL=1.` to get log statistics more frequently for tracking running queue, waiting queue and cache hit states.
 - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
-- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
+- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. (WARNING: This flag will slow down the token generation by **over 100x**. Do not use unless absolutely needed.)
 
 ## Breakpoints
 
@@ -80,8 +80,6 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
 If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
 To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the [LLM][vllm.LLM] class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
 
-[](){ #troubleshooting-incorrect-hardware-driver }
-
 ## Incorrect hardware/driver
 
 If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
@@ -178,8 +176,6 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b
 
     Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
 
-[](){ #troubleshooting-python-multiprocessing }
-
 ## Python multiprocessing
 
 ### `RuntimeError` Exception
@@ -238,7 +234,7 @@ if __name__ == '__main__':
 
 ## `torch.compile` Error
 
-vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](gh-pr:10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
+vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
 
 ??? code
 
@@ -257,7 +253,7 @@ vLLM heavily depends on `torch.compile` to optimize the model for better perform
     print(f(x))
     ```
 
-If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See <gh-issue:12219> for example.
+If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See <https://github.com/vllm-project/vllm/issues/12219> for example.
 
 ## Model failed to be inspected
 
@@ -297,7 +293,7 @@ But you are sure that the model is in the [list of supported models](../models/s
 
 ## Failed to infer device type
 
-If you see an error like `RuntimeError: Failed to infer device type`, it means that vLLM failed to infer the device type of the runtime environment. You can check [the code](gh-file:vllm/platforms/__init__.py) to see how vLLM infers the device type and why it is not working as expected. After [this PR](gh-pr:14195), you can also set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to see more detailed logs to help debug the issue.
+If you see an error like `RuntimeError: Failed to infer device type`, it means that vLLM failed to infer the device type of the runtime environment. You can check [the code](../../vllm/platforms/__init__.py) to see how vLLM infers the device type and why it is not working as expected. After [this PR](https://github.com/vllm-project/vllm/pull/14195), you can also set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to see more detailed logs to help debug the issue.
 
 ## NCCL error: unhandled system error during `ncclCommInitRank`
 
@@ -320,8 +316,12 @@ Traceback (most recent call last):
 
 This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability  or an unmounted `/dev/shm`. Refer to [Enabling GPUDirect RDMA](../serving/parallelism_scaling.md#enabling-gpudirect-rdma) for guidance on properly configuring the environment for GPUDirect RDMA.
 
+## CUDA error: the provided PTX was compiled with an unsupported toolchain
+
+If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. The released vLLM wheels have to be compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [cuda compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. The solution is to install `cuda-compat` package from your package manager. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. When successfully installed, you should see that the output of `nvidia-smi` will show `CUDA Version: 12.9`. Note that we use CUDA 12.9 as an example here, you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
+
 ## Known Issues
 
-- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
+- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759).
 - To address a memory overhead issue in older NCCL versions (see [bug](https://github.com/NVIDIA/nccl/issues/1234)), vLLM versions `>= 0.4.3, <= 0.10.1.1` would set the environment variable `NCCL_CUMEM_ENABLE=0`. External processes connecting to vLLM also needed to set this variable to prevent hangs or crashes. Since the underlying NCCL bug was fixed in NCCL 2.22.3, this override was removed in newer vLLM versions to allow for NCCL performance optimizations.
 - In some PCIe machines (e.g. machines without NVLink), if you see an error like `transport/shm.cc:590 NCCL WARN Cuda failure 217 'peer access is not supported between these two devices'`, it's likely caused by a driver bug. See [this issue](https://github.com/NVIDIA/nccl/issues/1838) for more details. In that case, you can try to set `NCCL_CUMEM_HOST_ENABLE=0` to disable the feature, or upgrade your driver to the latest version.
diff --git a/docs/usage/usage_stats.md b/docs/usage/usage_stats.md
index 4c7a7ff019e8ca02240694bbe9eeba1df73dc7e9..6225478d52d00e45a9b3c96b3287afd0af61a983 100644
--- a/docs/usage/usage_stats.md
+++ b/docs/usage/usage_stats.md
@@ -6,7 +6,7 @@ A subset of the data, after cleaning and aggregation, will be publicly released
 
 ## What data is collected?
 
-The list of data collected by the latest version of vLLM can be found here: <gh-file:vllm/usage/usage_lib.py>
+The list of data collected by the latest version of vLLM can be found here: [vllm/usage/usage_lib.py](../../vllm/usage/usage_lib.py)
 
 Here is an example as of v0.4.0:
 
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 340aaf54bb72024645d65972599bf3b7ab18dd60..8d8a9e0f50805b2f9a02eb901cf68aea170c993d 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -2,12 +2,10 @@
 
 !!! announcement
 
-    We have started the process of deprecating V0. Please read [RFC #18571](gh-issue:18571) for more details.
+    We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
 
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
-To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
-
 ## Why vLLM V1?
 
 vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
@@ -88,20 +86,14 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 | **Mamba Models**            | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr>                                            |
 | **Multimodal Models**       | <nobr>🟢 Functional</nobr>                                                         |
 
-vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol.
-
-!!! tip
-
-    This corresponds to the V1 column in our [list of supported models](../models/supported_models.md).
-
 See below for the status of models that are not yet supported or have more features planned in V1.
 
 #### Embedding Models
 
 The initial basic support is now functional.
 
-Later, we will consider using [hidden states processor](gh-issue:12249),
-which is based on [global logits processor](gh-pr:13360)
+Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
+which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
 to enable simultaneous generation and embedding using the same engine instance in V1.
 
 #### Mamba Models
@@ -130,13 +122,13 @@ encoder and decoder (e.g., `BartForConditionalGeneration`,
 | **Chunked Prefill**                         | <nobr>🚀 Optimized</nobr>                                                         |
 | **LoRA**                                    | <nobr>🚀 Optimized</nobr>                                                         |
 | **Logprobs Calculation**                    | <nobr>🟢 Functional</nobr>                                                        |
-| **FP8 KV Cache**                            | <nobr>🟢 Functional on Hopper devices (<gh-pr:15191>)</nobr>|
+| **FP8 KV Cache**                            | <nobr>🟢 Functional on Hopper devices (<https://github.com/vllm-project/vllm/pull/15191>)</nobr>|
 | **Spec Decode**                             | <nobr>🚀 Optimized</nobr>                                                         |
-| **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](gh-issue:13414))</nobr>|
+| **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
 | **Structured Output Alternative Backends**  | <nobr>🟢 Functional</nobr>                                                        |
 | **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr>                                                        |
-| **best_of**                                 | <nobr>🔴 Deprecated ([RFC #13361](gh-issue:13361))</nobr>|
-| **Per-Request Logits Processors**           | <nobr>🔴 Deprecated ([RFC #13360](gh-pr:13360))</nobr> |
+| **best_of**                                 | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
+| **Per-Request Logits Processors**           | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
 | **GPU <> CPU KV Cache Swapping**            | <nobr>🔴 Deprecated</nobr>                                                        |
 
 !!! note
@@ -174,11 +166,11 @@ As part of the major architectural rework in vLLM V1, several legacy features ha
 
 ##### Sampling features
 
-- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](gh-issue:13361).
+- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
 - **Per-Request Logits Processors**: In V0, users could pass custom
   processing functions to adjust logits on a per-request basis. In vLLM V1, this
   feature has been deprecated. Instead, the design is moving toward supporting **global logits
-  processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](gh-pr:13360).
+  processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
 
 ##### KV Cache features
 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 65a87d2dd9e8ebb023bb81e6f824bae27619e4bf..04e6f99f8957ec69cc0485bd8033bde1a5158453 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -10,7 +10,7 @@ on HuggingFace model repository.
 
 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import Any, NamedTuple
 
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -18,7 +18,7 @@ from transformers import AutoTokenizer
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.lora.request import LoRARequest
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
 question_per_audio_count = {
@@ -30,11 +30,11 @@ question_per_audio_count = {
 
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
-    prompt: Optional[str] = None
-    prompt_token_ids: Optional[dict[str, list[int]]] = None
-    multi_modal_data: Optional[dict[str, Any]] = None
-    stop_token_ids: Optional[list[int]] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    prompt: str | None = None
+    prompt_token_ids: dict[str, list[int]] | None = None
+    multi_modal_data: dict[str, Any] | None = None
+    stop_token_ids: list[int] | None = None
+    lora_requests: list[LoRARequest] | None = None
 
 
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -43,12 +43,15 @@ class ModelRequestData(NamedTuple):
 
 
 # Voxtral
+# Make sure to install mistral-common[audio].
 def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
     from mistral_common.audio import Audio
-    from mistral_common.protocol.instruct.messages import (
+    from mistral_common.protocol.instruct.chunk import (
         AudioChunk,
         RawAudio,
         TextChunk,
+    )
+    from mistral_common.protocol.instruct.messages import (
         UserMessage,
     )
     from mistral_common.protocol.instruct.request import ChatCompletionRequest
diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py
index 9e7036fea61347d8f1a7701beff7aa2ad9627a7a..c42b00730fe43beda6d4717d7d199aaf2f001a95 100644
--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def create_parser():
diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py
index dc3bc399ca8a9e1958b6a73665c74200a0b0cf2e..b72ddde1fb55366f3f2a60d14b6c38d80f4bf962 100644
--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -4,7 +4,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
index 158836728beeded0df544510d169723ef127b34b..eeb7137ff7bae59b6c9c37f758527a9da2e6411b 100644
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -4,7 +4,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py
index 6a41ef4d84bb6702a7351704f9e729b53272f478..9650dcfe967b3a5ef6716be3d2bbc1e24b1c0d1d 100644
--- a/examples/offline_inference/basic/generate.py
+++ b/examples/offline_inference/basic/generate.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def create_parser():
diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py
index aa173cf96f5bc606320eb1a3b1ec6fac753b6e98..e9508568655da926935f28c1c3fbdebd81589a74 100644
--- a/examples/offline_inference/basic/reward.py
+++ b/examples/offline_inference/basic/reward.py
@@ -4,7 +4,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
index c9ca7a8bf06b8e0516d82cbc69d4eb73a84e9eb1..cbca50eb5efa8a8771759e20a531cc881167ab7d 100644
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -4,7 +4,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index 0076d4d30ee8ecb299b90ea5c904dc3e15bc1cce..0b281fc41a341bd4730ece21dc3b90b3147d1316 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -33,7 +33,7 @@ import os
 from time import sleep
 
 from vllm import LLM, SamplingParams
-from vllm.utils import get_open_port
+from vllm.utils.network_utils import get_open_port
 
 
 def parse_args():
@@ -95,7 +95,7 @@ def parse_args():
     parser.add_argument(
         "--compilation-config",
         type=int,
-        help=("Compilation optimization (O) level 0-3."),
+        help=("Compilation optimization (O) mode 0-3."),
     )
     parser.add_argument(
         "--quantization",
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index 4a1b0c40604b2b8bc2f08e24e0fb6ee6269870ba..c1d6c6db53dfb49601fea992341494925b7edcec 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -13,7 +13,7 @@ from typing import NamedTuple
 
 from vllm import LLM, EngineArgs, PromptType, SamplingParams
 from vllm.assets.audio import AudioAsset
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 class ModelRequestData(NamedTuple):
diff --git a/examples/offline_inference/kv_load_failure_recovery/README.md b/examples/offline_inference/kv_load_failure_recovery/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..230a16812b25ead5bc06e36b4f7f93431b703b63
--- /dev/null
+++ b/examples/offline_inference/kv_load_failure_recovery/README.md
@@ -0,0 +1,30 @@
+# KV Load Failure Recovery Test
+
+This example builds upon the `disaggregated-prefill-v1` example in `examples/offline_inference`.
+
+It demonstrates vLLM's ability to recover from KV load failures in both synchronous and asynchronous loading modes. The goal is to verify that vLLM correctly identifies invalid KV blocks, reschedules the affected requests, and ensures successful and consistent output.
+
+## Files
+
+- `prefill_example.py` – performs the prefill stage and saves KV data (same as in `disaggregated-prefill-v1`).
+- `decode_example.py` – performs the decode stage. Accepts:
+    - `--simulate-failure`: simulates KV load failure using a custom connector.
+    - `--async-load`: enables asynchronous KV loading mode.
+- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
+- `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
+    1. Normal decode (baseline).
+    2. Decode with simulated sync KV load failure.
+    3. Decode with simulated async KV load failure.
+
+    Finally, it compares the output of the baseline with the recovered outputs to verify correctness.
+
+## How It Works
+
+- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
+- The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
+- If recovery fails, the script prints a unified diff of the output mismatch and exits with error.
+
+## Usage
+
+```bash
+./run.sh
diff --git a/examples/offline_inference/kv_load_failure_recovery/decode_example.py b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..69523f56eace399cf9ae7bc331b23a0370bfba6b
--- /dev/null
+++ b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+
+from vllm import LLM, SamplingParams
+from vllm.config import KVTransferConfig
+
+
+def read_prompts():
+    """Read prompts from prefill_output.txt"""
+    prompts = []
+    try:
+        with open("prefill_output.txt") as f:
+            for line in f:
+                prompts.append(line.strip())
+        print(f"Loaded {len(prompts)} prompts from prefill_output.txt")
+        return prompts
+    except FileNotFoundError:
+        print("Error: prefill_output.txt file not found")
+        exit(-1)
+
+
+def main():
+    prompts = read_prompts()
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--simulate-failure", action="store_true", help="Simulate KV load failure."
+    )
+    parser.add_argument(
+        "--async-load", action="store_true", help="Simulate async KV load"
+    )
+    args = parser.parse_args()
+
+    if args.simulate_failure:
+        ktc = KVTransferConfig(
+            kv_connector="RogueSharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={
+                "shared_storage_path": "local_storage",
+                "async_load": args.async_load,
+            },
+            kv_connector_module_path="rogue_shared_storage_connector",
+        )
+        out_file = (
+            "async_decode_recovered_output.txt"
+            if args.async_load
+            else "sync_decode_recovered_output.txt"
+        )
+    else:
+        ktc = KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={
+                "shared_storage_path": "local_storage",
+            },
+        )
+        out_file = "decode_output.txt"
+
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        gpu_memory_utilization=0.8,
+        max_num_batched_tokens=64,
+        max_num_seqs=16,
+        kv_transfer_config=ktc,
+    )
+
+    outputs = llm.generate(prompts, sampling_params)
+
+    sep_str = "-" * 30
+    with open(out_file, "w", encoding="utf-8") as f:
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            out_str = f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}"
+            print(out_str)
+            print(sep_str)
+            f.write(out_str)
+            f.write(sep_str)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..047b81c82df535f0fc0acf93fc7731eb98a10ba1
--- /dev/null
+++ b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import LLM, SamplingParams
+from vllm.config import KVTransferConfig
+
+
+def read_prompts():
+    context = "Hi " * 1000
+    context2 = "Hey " * 500
+    return [
+        context + "Hello, my name is",
+        context + "The capital of France is",
+        context2 + "Your name is",
+        context2 + "The capital of China is",
+    ]
+
+
+def main():
+    prompts = read_prompts()
+
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        gpu_memory_utilization=0.8,
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        ),
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)
+
+    # 1ST generation (prefill instance)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+    )
+
+    new_prompts = []
+    print("-" * 30)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        new_prompts.append(prompt + generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)
+
+    # Write new_prompts to prefill_output.txt
+    with open("prefill_output.txt", "w") as f:
+        for prompt in new_prompts:
+            f.write(prompt + "\n")
+    print(f"Saved {len(new_prompts)} prompts to prefill_output.txt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b2acea4c9457c523209fc7189d5d7f9f5e158fb
--- /dev/null
+++ b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+import logging
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
+    SharedStorageConnector,
+    SharedStorageConnectorMetadata,
+)
+from vllm.forward_context import ForwardContext
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+from vllm.v1.request import Request
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+logger = logging.getLogger()
+logging.basicConfig(level=logging.INFO)
+
+
+@dataclass
+class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
+    req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)
+
+    @classmethod
+    def from_base(cls, base: SharedStorageConnectorMetadata):
+        return cls(requests=base.requests)
+
+
+class RogueSharedStorageConnector(SharedStorageConnector):
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
+            "async_load", False
+        )
+        self._invalid_block_ids: set = None
+        self._seen_requests: set = set()
+        self._req_to_block_ids: dict[str, list[int]] = dict()
+
+    def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
+        assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
+        index, failed_request = next(
+            (
+                (i, x)
+                for i, x in enumerate(connector_metadata.requests)
+                if not x.is_store
+            ),
+            (None, None),
+        )
+        if index is not None:
+            del connector_metadata.requests[index]
+            self._invalid_block_ids = set(
+                (
+                    failed_request.slot_mapping[:: self._block_size] // self._block_size
+                ).tolist()
+            )
+            logger.info(
+                "Simulating failure to load all KV blocks for the "
+                "first load request. Total blocks: %d",
+                len(self._invalid_block_ids),
+            )
+        super().bind_connector_metadata(connector_metadata)
+
+    def clear_connector_metadata(self) -> None:
+        self._invalid_block_ids = None
+        super().clear_connector_metadata()
+
+    def start_load_kv(self, forward_context: ForwardContext, **kwargs) -> None:
+        if self._async_load and forward_context.attn_metadata is None:
+            # Bypass  sanity check in super().start_load_kv
+            forward_context.attn_metadata = "None"
+
+        super().start_load_kv(forward_context, **kwargs)
+
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        if self._async_load:
+            meta = self._get_connector_metadata()
+            assert isinstance(meta, RogueSharedStorageConnectorMetadata)
+            if meta.req_to_block_ids:
+                return None, set(meta.req_to_block_ids)
+
+        return None, None
+
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        return self._invalid_block_ids
+
+    def get_num_new_matched_tokens(
+        self,
+        request: Request,
+        num_computed_tokens: int,
+    ) -> tuple[int, bool]:
+        if request.request_id in self._seen_requests:
+            return 0, False
+
+        self._seen_requests.add(request.request_id)
+
+        num_tokens, _ = super().get_num_new_matched_tokens(request, num_computed_tokens)
+        return num_tokens, self._async_load and num_tokens > 0
+
+    def update_state_after_alloc(
+        self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int
+    ):
+        """
+        Update KVConnector state after block allocation.
+
+        If blocks were allocated, add to _requests_need_load,
+        such that we load the KVs in the next forward pass.
+        """
+        super().update_state_after_alloc(request, blocks, num_external_tokens)
+
+        if num_external_tokens > 0:
+            self._req_to_block_ids[request.request_id] = blocks.get_block_ids()[0]
+
+    def build_connector_meta(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> KVConnectorMetadata:
+        if not self._async_load:
+            base = super().build_connector_meta(scheduler_output)
+            meta = RogueSharedStorageConnectorMetadata.from_base(base)
+        else:
+            meta = RogueSharedStorageConnectorMetadata()
+            if self._requests_need_load:
+                for req_id, request in self._requests_need_load.items():
+                    meta.add_request(
+                        token_ids=request.prompt_token_ids,
+                        block_ids=self._req_to_block_ids[req_id],
+                        block_size=self._block_size,
+                        is_store=False,
+                        mm_hashes=[],
+                    )
+                # Clear state
+                self._requests_need_load.clear()
+        meta.req_to_block_ids = self._req_to_block_ids
+        self._req_to_block_ids = dict()
+        return meta
diff --git a/examples/offline_inference/kv_load_failure_recovery/run.sh b/examples/offline_inference/kv_load_failure_recovery/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..53fe2385d46d17ded0ecb9390d1ffedde63cf3c7
--- /dev/null
+++ b/examples/offline_inference/kv_load_failure_recovery/run.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Constants
+SHARED_STORAGE_DIR="local_storage"
+PREFILL_OUTPUT="prefill_output.txt"
+DECODE_OUTPUT="decode_output.txt"
+SYNC_DECODE_RECOVERED_OUTPUT="sync_decode_recovered_output.txt"
+ASYNC_DECODE_RECOVERED_OUTPUT="async_decode_recovered_output.txt"
+
+# Cleanup
+rm -rf "$SHARED_STORAGE_DIR"
+rm -f "$PREFILL_OUTPUT" "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"
+
+# Run inference examples
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py --simulate-failure
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py --simulate-failure --async-load
+
+# Compare outputs
+if ! cmp -s "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT"; then
+    echo "❌ Outputs differ: sync recovery failed."
+    diff -u "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT"
+    exit 1
+fi
+
+if ! cmp -s "$DECODE_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"; then
+    echo "❌ Outputs differ: async recovery failed."
+    diff -u "$DECODE_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"
+    exit 1
+fi
+
+echo "✅ Outputs match: recovery successful."
diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
index d7f2a1633113d0f2b92b232affd20f50ccace3f9..d9215255a808133756e6729816c8c90fccec633f 100644
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -8,7 +8,7 @@ for processing prompts with various sampling parameters.
 import argparse
 
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def create_test_prompts() -> list[tuple[str, SamplingParams]]:
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index cc78c0cbbf7c07ad5dadee1eba89776d5376ce3d..c5ae35985c17c961fa8a982a87f25bce6514948a 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -11,7 +11,7 @@ python save_sharded_state.py \
     --model /path/to/load \
     --quantization deepspeedfp \
     --tensor-parallel-size 8 \
-    --output /path/to/save/sharded/modele
+    --output /path/to/save/sharded/model
 
 python load_sharded_state.py \
     --model /path/to/saved/sharded/model \
@@ -25,7 +25,7 @@ python load_sharded_state.py \
 import dataclasses
 
 from vllm import LLM, EngineArgs, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/offline_inference/logits_processor/custom.py
index 4112a498f37ab64943750a1decdca87a982acb88..ce000872dc96e96dc166c72295dec9cd25918d5d 100644
--- a/examples/offline_inference/logits_processor/custom.py
+++ b/examples/offline_inference/logits_processor/custom.py
@@ -33,7 +33,7 @@ Output:    ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """
 
-from typing import Optional
+from typing import Any
 
 import torch
 
@@ -50,6 +50,16 @@ from vllm.v1.sample.logits_processor.builtin import process_dict_updates
 class DummyLogitsProcessor(LogitsProcessor):
     """Fake logit processor to support unit testing and examples"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token: Any | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(
+                f"target_token value {target_token} {type(target_token)} is not int"
+            )
+
     def __init__(
         self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
     ):
@@ -58,15 +68,18 @@ class DummyLogitsProcessor(LogitsProcessor):
     def is_argmax_invariant(self) -> bool:
         return False
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
+        def extract_extra_arg(params: SamplingParams) -> int | None:
+            self.validate_params(params)
+            return params.extra_args and params.extra_args.get("target_token")
+
         process_dict_updates(
             self.req_info,
             batch_update,
             # This function returns the LP's per-request state based on the
             # request details, or None if this LP does not apply to the
             # request.
-            lambda params, _, __: params.extra_args
-            and (params.extra_args.get("target_token")),
+            lambda params, _, __: extract_extra_arg(params),
         )
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py
index 4c19bb4ce2bae2339ad1e64444b15cdecee6a8a5..5763fff5410dd1dc354700ddb7390c2492e34c9f 100644
--- a/examples/offline_inference/logits_processor/custom_req.py
+++ b/examples/offline_inference/logits_processor/custom_req.py
@@ -39,7 +39,7 @@ Output:    ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -76,13 +76,21 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     """Example of wrapping a fake request-level logit processor to create a
     batch-level logits processor"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token: Any | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(f"target_token value {target_token} is not int")
+
     def is_argmax_invariant(self) -> bool:
         return False
 
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """This method returns a new request-level logits processor, customized
         to the `target_token` value associated with a particular request.
 
@@ -96,18 +104,11 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
         Returns:
           `Callable` request logits processor, or None
         """
-        target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+        target_token: Any | None = params.extra_args and params.extra_args.get(
             "target_token"
         )
         if target_token is None:
             return None
-        if not isinstance(target_token, int):
-            logger.warning(
-                "target_token value %s is not int; not applying logits"
-                " processor to request.",
-                target_token,
-            )
-            return None
         return DummyPerReqLogitsProcessor(target_token)
 
 
diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py
index 62947d122e01cba9dc9377540f9da8be9c5900a5..acd2c47f230f0e07e9fb0e770f4a24c17987a48e 100644
--- a/examples/offline_inference/logits_processor/custom_req_init.py
+++ b/examples/offline_inference/logits_processor/custom_req_init.py
@@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
 device, the first and third requests would not repeat the same token.
 """
 
-from typing import Optional
-
 import torch
 
 from vllm import LLM, SamplingParams
@@ -79,6 +77,14 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     """Example of overriding the wrapper class `__init__()` in order to utilize
     info about the device type"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token = params.extra_args and params.extra_args.get("target_token")
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(
+                f"`target_token` has to be an integer, got {target_token}."
+            )
+
     def __init__(
         self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
     ):
@@ -91,7 +97,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """This method returns a new request-level logits processor, customized
         to the `target_token` value associated with a particular request.
 
@@ -115,13 +121,6 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
             is None
         ):
             return None
-        if not isinstance(target_token, int):
-            logger.warning(
-                "target_token value %s is not int; not applying logits"
-                " processor to request.",
-                target_token,
-            )
-            return None
         return DummyPerReqLogitsProcessor(target_token)
 
 
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 00d4cb9eb4c41948780e6a2beb0d71fb8c94b06b..dc5c6202fa57b9ae0843458bc3fedfa7cd730da9 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
 """
 
 import gc
-from typing import Optional
 
 import torch
 from huggingface_hub import snapshot_download
@@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest
 
 def create_test_prompts(
     lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
     return [
         # this is an example of using quantization without LoRA
         (
@@ -56,7 +55,7 @@ def create_test_prompts(
 
 def process_requests(
     engine: LLMEngine,
-    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+    test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
@@ -78,7 +77,7 @@ def process_requests(
 
 
 def initialize_engine(
-    model: str, quantization: str, lora_repo: Optional[str]
+    model: str, quantization: str, lora_repo: str | None
 ) -> LLMEngine:
     """Initialize the LLMEngine."""
 
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index d5b1b4ad29a92dfd41d6ab9ef07f4221c54a57c2..6a533eb5c937f6fcb66a7ef42fe8de4fb5ec0795 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -4,8 +4,7 @@
 This file demonstrates the usage of text generation with an LLM model,
 comparing the performance with and without speculative decoding.
 
-Note that still not support `v1`:
-VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+Note that this example is out of date and not supported in vLLM v1.
 """
 
 import gc
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 6040683c68bcd8d1a5281f63946325dc794d23c7..6c23cf342e06b8bbba9ca5f0a3f0195095a4bc7e 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -7,8 +7,6 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """
 
-from typing import Optional
-
 from huggingface_hub import snapshot_download
 
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest
 
 def create_test_prompts(
     lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
     """Create a list of test prompts with their sampling parameters.
 
     2 requests for base model, 4 requests for the LoRA. We define 2
@@ -68,7 +66,7 @@ def create_test_prompts(
 
 def process_requests(
     engine: LLMEngine,
-    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+    test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md
index 3c6f6c7a6c5884d2065d1e4e0b37285f7bb7222d..7d5a1af8f5a4ae6dcd96393ed9ac8c743ba9ab6a 100644
--- a/examples/offline_inference/openai_batch/README.md
+++ b/examples/offline_inference/openai_batch/README.md
@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
     """
     try:
         url = s3_client.generate_presigned_url(
-            ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
+            ClientMethod=client_method,
+            Params=method_parameters,
+            ExpiresIn=expires_in,
         )
     except ClientError:
         raise
@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
 
 s3_client = boto3.client("s3")
 input_url = generate_presigned_url(
-    s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "get_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 output_url = generate_presigned_url(
-    s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "put_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 print(f"{input_url=}")
 print(f"{output_url=}")
diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
index 79afbd9cfac47813f45c7c343f7cb68c7470aa2f..ad78be38716b6deeee64a69ad336da8d41220eb7 100644
--- a/examples/offline_inference/pooling/README.md
+++ b/examples/offline_inference/pooling/README.md
@@ -14,7 +14,7 @@ python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_na
 
 ## Embed jina_embeddings_v3 usage
 
-Only text matching task is supported for now. See <gh-pr:16120>
+Only text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
 
 ```bash
 python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
@@ -26,12 +26,30 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
 python examples/offline_inference/pooling/embed_matryoshka_fy.py
 ```
 
+## Multi vector retrieval usage
+
+```bash
+python examples/offline_inference/pooling/multi_vector_retrieval.py
+```
+
 ## Named Entity Recognition (NER) usage
 
 ```bash
 python examples/offline_inference/pooling/ner.py
 ```
 
+## Prithvi Geospatial MAE usage
+
+```bash
+python examples/offline_inference/pooling/prithvi_geospatial_mae.py
+```
+
+## IO Processor Plugins for Prithvi Geospatial MAE
+
+```bash
+python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
+```
+
 ## Qwen3 reranker usage
 
 ```bash
diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py
index 33a63deee91bbd99e3b5acd5fc669e91252a26e0..b117b0bd5fbe0b18d5231b3e8640283ea2572c19 100644
--- a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py
@@ -4,7 +4,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/offline_inference/pooling/embed_matryoshka_fy.py
index 6871bcfccf1b96f37519e76ddc5a8be32cb44e52..6544df852303dcd0d0d4fa21494909ba7ce94b4a 100644
--- a/examples/offline_inference/pooling/embed_matryoshka_fy.py
+++ b/examples/offline_inference/pooling/embed_matryoshka_fy.py
@@ -4,7 +4,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs, PoolingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa7d1c3ba2167524ae07790f85e3e0142e0947c2
--- /dev/null
+++ b/examples/offline_inference/pooling/multi_vector_retrieval.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(
+        model="BAAI/bge-m3",
+        runner="pooling",
+        enforce_eager=True,
+    )
+    return parser.parse_args()
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass runner="pooling" for embedding models
+    llm = LLM(**vars(args))
+
+    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
+    outputs = llm.embed(prompts)
+
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for prompt, output in zip(prompts, outputs):
+        embeds = output.outputs.embedding
+        print(len(embeds))
+
+    # Generate embedding for each token. The output is a list of PoolingRequestOutput.
+    outputs = llm.encode(prompts, pooling_task="token_embed")
+
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for prompt, output in zip(prompts, outputs):
+        multi_vector = output.outputs.data
+        print(multi_vector.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py
index f18742fac0d54e26e759401614f2d1d671ed299a..34c80e7ccffd308c68428cba39a14203a68966f0 100644
--- a/examples/offline_inference/pooling/ner.py
+++ b/examples/offline_inference/pooling/ner.py
@@ -5,7 +5,7 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
@@ -33,7 +33,7 @@ def main(args: Namespace):
     label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
 
     # Run inference
-    outputs = llm.encode(prompts)
+    outputs = llm.encode(prompts, pooling_task="token_classify")
 
     for prompt, output in zip(prompts, outputs):
         logits = output.outputs.data
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/pooling/prithvi_geospatial_mae.py
similarity index 98%
rename from examples/offline_inference/prithvi_geospatial_mae.py
rename to examples/offline_inference/pooling/prithvi_geospatial_mae.py
index 1a5879a6d35f54a84721063bf5e09e359ee04432..b093c77c00b77ddc9d891b6831e8be0980163db4 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/pooling/prithvi_geospatial_mae.py
@@ -3,7 +3,6 @@
 import argparse
 import datetime
 import os
-from typing import Union
 
 import albumentations
 import numpy as np
@@ -50,6 +49,7 @@ class PrithviMAE:
             dtype="float16",
             enforce_eager=True,
             model_impl="terratorch",
+            enable_mm_embeds=True,
         )
 
     def run(self, input_data, location_coords):
@@ -64,7 +64,7 @@ class PrithviMAE:
         }
 
         prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
-        outputs = self.model.encode(prompt, use_tqdm=False)
+        outputs = self.model.encode(prompt, pooling_task="plugin", use_tqdm=False)
 
         return outputs[0].outputs.data
 
@@ -160,7 +160,7 @@ def load_example(
     file_paths: list[str],
     mean: list[float] = None,
     std: list[float] = None,
-    indices: Union[list[int], None] = None,
+    indices: list[int] | None = None,
 ):
     """Build an input example by loading images in *file_paths*.
 
diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
similarity index 80%
rename from examples/offline_inference/prithvi_geospatial_mae_io_processor.py
rename to examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
index 418c40645f9f262e1f3ecdcaa6726ffbb6786c3d..b8637b89e08f04ddf4953dda79dd63b58aa4f47d 100644
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
@@ -6,14 +6,14 @@ import os
 import torch
 
 from vllm import LLM
-from vllm.pooling_params import PoolingParams
 
 # This example shows how to perform an offline inference that generates
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Requirement - install plugin at:
-#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+# Requirements:
+# - install TerraTorch v1.1 (or later):
+#   pip install terratorch>=v1.1
 
 
 def main():
@@ -36,15 +36,12 @@ def main():
         # to avoid the model going OOM.
         # The maximum number depends on the available GPU memory
         max_num_seqs=32,
-        io_processor_plugin="prithvi_to_tiff",
+        io_processor_plugin="terratorch_segmentation",
         model_impl="terratorch",
+        enable_mm_embeds=True,
     )
 
-    pooling_params = PoolingParams(task="encode", softmax=False)
-    pooler_output = llm.encode(
-        img_prompt,
-        pooling_params=pooling_params,
-    )
+    pooler_output = llm.encode(img_prompt, pooling_task="plugin")
     output = pooler_output[0].outputs
 
     print(output)
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
index dfcbd8c8d36059b5505996f37c6e7b2dd0f5a0b9..3b127e4fd29df2451551826d87bf0766d64c5f22 100644
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -13,7 +13,7 @@ from tqdm import tqdm
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
 DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index 16d44cbadbc989e7e31b48644fc408d380f9176d..d8fb50d7fe55c823792acb0857ae95dc5a2eb49b 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
 
 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q use_audio_in_video
 
 # Multiple audios
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q multi_audios
 ```
diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index 62effd5c8b62e3b40bb078b62628405159652e21..ed005e6a69b80d8912b2cc5b2066f997944d0596 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -7,13 +7,12 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
 
 from typing import NamedTuple
 
-import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.multimodal.image import convert_image_mode
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 class QueryResult(NamedTuple):
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
     )
     asset = VideoAsset(name="baby_reading", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
-    assert not envs.VLLM_USE_V1, (
-        "V1 does not support use_audio_in_video. "
-        "Please launch this example with "
-        "`VLLM_USE_V1=0`."
-    )
+
     return QueryResult(
         inputs={
             "prompt": prompt,
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index ed974b90b57eefb115613b8133aabd798c89d2d6..0c09e603271de47e213c1534ba4612dd1592e525 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -38,7 +38,7 @@ from rlhf_utils import stateless_init_process_group
 from transformers import AutoModelForCausalLM
 
 from vllm import LLM, SamplingParams
-from vllm.utils import get_ip, get_open_port
+from vllm.utils.network_utils import get_ip, get_open_port
 
 
 class MyLLM(LLM):
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index c0e60b9793407da33284334868fe4d2d6d744b4c..13def88439ef2dd723c722c912977d8f04625232 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict
 
 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:
 
 
 def rebuild_ipc(
-    handle: tuple[Callable, tuple], device_id: Optional[int] = None
+    handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
     func, args = handle
     list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
             self._zmq_ctx = zmq.Context()
         socket = self._zmq_ctx.socket(zmq.REP)
         socket.connect(zmq_handles[self.report_device_id()])
-        buffer: Optional[torch.Tensor] = None
+        buffer: torch.Tensor | None = None
         while True:
             payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
                 socket.recv_pyobj()
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 41d7a3492320858d58fe97acacf519433fb97ad9..e25f46b126e6f3da7e6acae315f42efef5d784df 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -30,7 +30,7 @@ from pathlib import Path
 
 from vllm import LLM, EngineArgs
 from vllm.model_executor.model_loader import ShardedStateLoader
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def parse_args():
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index af65b6d38e02cbf5b6f8d5660341229a3347cddb..3cdc3b245b72a498305e0920101caa253fb2d554 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -9,25 +9,25 @@ from vllm.inputs import TokensPrompt
 from vllm.v1.metrics.reader import Counter, Vector
 
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
 
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
 ]
 
 
diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py
index 8e888a100254e31688ab838881240e96b25305d1..eb7ed969ea4bf50eb923268805e90993f7f2857f 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -4,17 +4,88 @@
 experimental support for data-parallel inference with torchrun
 Note the data load balancing and distribution is done out of the vllm engine,
 no internal lb supported in external_launcher mode.
+
+To run this example:
+```bash
+$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
+```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
 
+import argparse
+
 from vllm import LLM, SamplingParams
 
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
     "The capital of France is",
     "The future of AI is",
-] * 50
+]
 
 # Create sampling parameters, the same across all ranks
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
@@ -25,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )
 
 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
@@ -45,14 +116,13 @@ prompts = [
 
 outputs = llm.generate(prompts, sampling_params)
 
-
-# all ranks will have the same outputs
-print("-" * 50)
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
-    print("-" * 50)
+    print(
+        f"DP Rank: {dp_rank} Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n"
+    )
+
 """
 Further tips:
 
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index f8ddb5a22b31a23b44653a87985c67e7a9a7138a..624de2a2debc3a4d6cd8a262227b03f4835b8d88 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -22,14 +22,15 @@ from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import convert_image_mode
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
     prompts: list[str]
-    stop_token_ids: Optional[list[int]] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    stop_token_ids: list[int] | None = None
+    lora_requests: list[LoRARequest] | None = None
+    sampling_params: list[SamplingParams] | None = None
 
 
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -90,16 +91,25 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# BLIP-2
-def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
+# Bee-8B
+def run_bee(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
+    model_name = "Open-Bee/Bee-8B-RL"
+
+    prompts = [
+        (
+            f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<image>\n{question}<|im_end|>"
+            f"<|im_start|>assistant\n<think>\n"
+        )
+        for question in questions
+    ]
 
-    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
-    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
-    prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
-        model="Salesforce/blip2-opt-2.7b",
+        model=model_name,
+        max_model_len=16384,
         limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
     )
 
     return ModelRequestData(
@@ -108,15 +118,15 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# Chameleon
-def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
+# BLIP-2
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
-    prompts = [f"{question}<image>" for question in questions]
+    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
+    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
+    prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
-        model="facebook/chameleon-7b",
-        max_model_len=4096,
-        max_num_seqs=2,
+        model="Salesforce/blip2-opt-2.7b",
         limit_mm_per_prompt={modality: 1},
     )
 
@@ -126,15 +136,16 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# Dots-OCR
-def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
+# Chameleon
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
-    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
+    prompts = [f"{question}<image>" for question in questions]
     engine_args = EngineArgs(
-        model="rednote-hilab/dots.ocr",
+        model="facebook/chameleon-7b",
+        max_model_len=4096,
+        max_num_seqs=2,
         limit_mm_per_prompt={modality: 1},
-        trust_remote_code=True,
     )
 
     return ModelRequestData(
@@ -190,6 +201,66 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
+    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
+
+    assert modality == "image"
+
+    model_name = "deepseek-ai/DeepSeek-OCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        limit_mm_per_prompt={modality: 1},
+        logits_processors=[NGramPerReqLogitsProcessor],
+    )
+
+    # deepseek-ocr use plain prompt template
+    prompts = [f"<image>\n{question}" for question in questions]
+
+    # The following sampling params config is taken from
+    # the official Deepseek-OCR inference example.
+    # (IMPORTANT) Use the custom logits processor and avoid skipping
+    # special tokens for this model for the optimal OCR performance.
+    sampling_params = [
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=8192,
+            # ngram logit processor args
+            extra_args=dict(
+                ngram_size=30,
+                window_size=90,
+                # whitelist: <td>, </td>
+                whitelist_token_ids={128821, 128822},
+            ),
+            skip_special_tokens=False,
+        )
+        for _ in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+# Dots-OCR
+def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
+    engine_args = EngineArgs(
+        model="rednote-hilab/dots.ocr",
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Ernie4.5-VL
 def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
@@ -576,7 +647,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
 
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "internlm/Intern-S1"
+    model_name = "internlm/Intern-S1-mini"
 
     engine_args = EngineArgs(
         model=model_name,
@@ -733,6 +804,26 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# LightOnOCR
+def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    prompts = [
+        "<|im_start|>system<|im_end|>\n<|im_start|>user\n<|image_pad|><|im_end|>\n<|im_start|>assistant\n"
+        for _ in questions
+    ]
+
+    engine_args = EngineArgs(
+        model="lightonai/LightOnOCR-1B",
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
@@ -1140,15 +1231,37 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
     elif modality == "video":
         placeholder = "<video>"
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+    prompts = [
+        f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
         for question in questions
     ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
     )
 
+
+# PaddleOCR-VL
+def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+    prompts = [
+        (f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: ")
+        for question in questions
+    ]
+
     return ModelRequestData(
         engine_args=engine_args,
         prompts=prompts,
@@ -1423,7 +1536,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
         mm_processor_kwargs={
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
-            "fps": [1],
+            "fps": 1,
         },
         limit_mm_per_prompt={modality: 1},
     )
@@ -1691,11 +1804,13 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 model_example_map = {
     "aria": run_aria,
     "aya_vision": run_aya_vision,
+    "bee": run_bee,
     "blip-2": run_blip2,
     "chameleon": run_chameleon,
-    "dots_ocr": run_dots_ocr,
     "command_a_vision": run_command_a_vision,
     "deepseek_vl_v2": run_deepseek_vl2,
+    "deepseek_ocr": run_deepseek_ocr,
+    "dots_ocr": run_dots_ocr,
     "ernie45_vl": run_ernie45_vl,
     "fuyu": run_fuyu,
     "gemma3": run_gemma3,
@@ -1712,6 +1827,7 @@ model_example_map = {
     "keye_vl": run_keye_vl,
     "keye_vl1_5": run_keye_vl1_5,
     "kimi_vl": run_kimi_vl,
+    "lightonocr": run_lightonocr,
     "llama4": run_llama4,
     "llava": run_llava,
     "llava-next": run_llava_next,
@@ -1727,6 +1843,7 @@ model_example_map = {
     "NVLM_D": run_nvlm_d,
     "ovis": run_ovis,
     "ovis2_5": run_ovis2_5,
+    "paddleocr_vl": run_paddleocr_vl,
     "paligemma": run_paligemma,
     "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,
@@ -1957,8 +2074,12 @@ def main(args):
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(
-        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+    sampling_params = (
+        SamplingParams(
+            temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+        )
+        if req_data.sampling_params is None
+        else req_data.sampling_params
     )
 
     assert args.num_prompts > 0
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 51b41f34b2ff6aa15fd21caaaed31f728f3fa451..d6e169548f15baaee8d68b9249fc1895709cd4a6 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -18,22 +18,22 @@ from transformers import AutoProcessor, AutoTokenizer
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.utils import fetch_image
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
 ]
 
 
@@ -41,9 +41,10 @@ class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
     prompt: str
     image_data: list[Image]
-    stop_token_ids: Optional[list[int]] = None
-    chat_template: Optional[str] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    stop_token_ids: list[int] | None = None
+    chat_template: str | None = None
+    lora_requests: list[LoRARequest] | None = None
+    sampling_params: SamplingParams | None = None
 
 
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -107,6 +108,41 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_bee(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Open-Bee/Bee-8B-RL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        trust_remote_code=True,
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "CohereLabs/command-a-vision-07-2025"
 
@@ -166,6 +202,46 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_deepseek_ocr(question: str, image_urls: list[str]) -> ModelRequestData:
+    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
+
+    model_name = "deepseek-ai/DeepSeek-OCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        logits_processors=[NGramPerReqLogitsProcessor],
+    )
+
+    placeholder = "<image>\n" * len(image_urls)
+    prompt = placeholder + question
+
+    # The following sampling params config is taken from
+    # the official Deepseek-OCR inference example.
+    # (IMPORTANT) Use the custom logits processor and avoid skipping
+    # special tokens for this model for the optimal OCR performance.
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=8192,
+        # ngram logit processor args
+        extra_args=dict(
+            ngram_size=30,
+            window_size=90,
+            # whitelist: <td>, </td>
+            whitelist_token_ids={128821, 128822},
+        ),
+        skip_special_tokens=False,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+        sampling_params=sampling_params,
+    )
+
+
 def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "google/gemma-3-4b-it"
 
@@ -309,7 +385,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
 
 
 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "internlm/Intern-S1"
+    model_name = "internlm/Intern-S1-mini"
 
     engine_args = EngineArgs(
         model=model_name,
@@ -371,13 +447,14 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
 
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=131072,
-        tensor_parallel_size=8,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -389,29 +466,32 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        }
+        },
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
+    image_data = [fetch_image(url) for url in image_urls]
+
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=image_data,
     )
 
 
-def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
-    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
-    # it will generate poor response for multi-image inputs!
-    model_name = "llava-hf/llava-1.5-7b-hf"
+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
     engine_args = EngineArgs(
         model=model_name,
-        max_num_seqs=16,
+        trust_remote_code=True,
+        max_model_len=32768,
+        max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -423,28 +503,32 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        }
+        },
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
+    image_data = [fetch_image(url) for url in image_urls]
+
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=image_data,
     )
 
 
-def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
+
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=8192,
-        max_num_seqs=16,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=4,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -459,7 +543,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
         }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
@@ -472,12 +556,13 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=16384,
-        max_num_seqs=16,
+        max_model_len=131072,
+        tensor_parallel_size=8,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -505,14 +590,13 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
     )
 
 
-def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
-
+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
+    # it will generate poor response for multi-image inputs!
+    model_name = "llava-hf/llava-1.5-7b-hf"
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
+        max_num_seqs=16,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -524,32 +608,28 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        },
+        }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    image_data = [fetch_image(url) for url in image_urls]
-
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=image_data,
+        image_data=[fetch_image(url) for url in image_urls],
     )
 
 
-def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
-
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
         max_model_len=8192,
-        max_num_seqs=5,
+        max_num_seqs=16,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -561,32 +641,28 @@ def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        },
+        }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    image_data = [fetch_image(url) for url in image_urls]
-
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=image_data,
+        image_data=[fetch_image(url) for url in image_urls],
     )
 
 
-def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
-
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        max_num_seqs=4,
+        max_model_len=16384,
+        max_num_seqs=16,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -601,7 +677,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
@@ -713,13 +789,32 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
     placeholders = "\n".join(
         f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
     )
-    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+    prompt = (
+        f"<|im_start|>user\n\n{placeholders}\n{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
     )
 
+    placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
+    prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "
+
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
@@ -1217,8 +1312,10 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
 model_example_map = {
     "aria": load_aria,
     "aya_vision": load_aya_vision,
+    "bee": load_bee,
     "command_a_vision": load_command_a_vision,
     "deepseek_vl_v2": load_deepseek_vl2,
+    "deepseek_ocr": load_deepseek_ocr,
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
     "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
@@ -1236,6 +1333,7 @@ model_example_map = {
     "NVLM_D": load_nvlm_d,
     "ovis": load_ovis,
     "ovis2_5": load_ovis2_5,
+    "paddleocr_vl": load_paddleocr_vl,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
     "phi4_multimodal": load_phi4_multimodal,
@@ -1253,7 +1351,7 @@ model_example_map = {
 }
 
 
-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
     req_data = model_example_map[model](question, image_urls)
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1279,7 +1377,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
         print("-" * 50)
 
 
-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
     req_data = model_example_map[model](question, image_urls)
 
     # Disable other modalities to save memory
@@ -1291,8 +1389,12 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[in
     engine_args = asdict(req_data.engine_args) | {"seed": seed}
     llm = LLM(**engine_args)
 
-    sampling_params = SamplingParams(
-        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
+    sampling_params = (
+        SamplingParams(
+            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
+        )
+        if req_data.sampling_params is None
+        else req_data.sampling_params
     )
     outputs = llm.chat(
         [
diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py
index 0cc0c1e708b1237537d4400ab30c4c9ed3f66e25..63d85d5d9eef59882e8abd9ffed1a04d395f9b6d 100644
--- a/examples/offline_inference/vision_language_pooling.py
+++ b/examples/offline_inference/vision_language_pooling.py
@@ -10,14 +10,18 @@ on HuggingFace model repository.
 
 from argparse import Namespace
 from dataclasses import asdict
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from pathlib import Path
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
 
 from PIL.Image import Image
 
 from vllm import LLM, EngineArgs
 from vllm.entrypoints.score_utils import ScoreMultiModalParam
 from vllm.multimodal.utils import fetch_image
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+ROOT_DIR = Path(__file__).parent.parent.parent
+EXAMPLES_DIR = ROOT_DIR / "examples"
 
 
 class TextQuery(TypedDict):
@@ -43,15 +47,39 @@ class TextImagesQuery(TypedDict):
 
 
 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
 
 
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
-    prompt: Optional[str] = None
-    image: Optional[Image] = None
-    query: Optional[str] = None
-    documents: Optional[ScoreMultiModalParam] = None
+    prompt: str | None = None
+    image: Image | None = None
+    query: str | None = None
+    documents: ScoreMultiModalParam | None = None
+
+
+def run_clip(query: Query) -> ModelRequestData:
+    if query["modality"] == "text":
+        prompt = query["text"]
+        image = None
+    elif query["modality"] == "image":
+        prompt = ""  # For image input, make sure that the prompt text is empty
+        image = query["image"]
+    else:
+        modality = query["modality"]
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="openai/clip-vit-base-patch32",
+        runner="pooling",
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
 
 
 def run_e5_v(query: Query) -> ModelRequestData:
@@ -82,23 +110,74 @@ def run_e5_v(query: Query) -> ModelRequestData:
     )
 
 
-def run_vlm2vec(query: Query) -> ModelRequestData:
+def run_jinavl_reranker(query: Query) -> ModelRequestData:
+    if query["modality"] != "text+images":
+        raise ValueError(f"Unsupported query modality: '{query['modality']}'")
+
+    engine_args = EngineArgs(
+        model="jinaai/jina-reranker-m0",
+        runner="pooling",
+        max_model_len=32768,
+        trust_remote_code=True,
+        mm_processor_kwargs={
+            "min_pixels": 3136,
+            "max_pixels": 602112,
+        },
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        query=query["text"],
+        documents=query["image"],
+    )
+
+
+def run_siglip(query: Query) -> ModelRequestData:
+    if query["modality"] == "text":
+        prompt = query["text"]
+        image = None
+    elif query["modality"] == "image":
+        prompt = ""  # For image input, make sure that the prompt text is empty
+        image = query["image"]
+    else:
+        modality = query["modality"]
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="google/siglip-base-patch16-224",
+        runner="pooling",
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
+
+
+def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
-        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        prompt = f"Find me an everyday image that matches the given caption: {text}"
         image = None
     elif query["modality"] == "image":
-        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
         image = query["image"]
     elif query["modality"] == "text+image":
         text = query["text"]
-        prompt = (
-            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
-        )
+        prompt = f"{image_token} Represent the given image with the following question: {text}"  # noqa: E501
         image = query["image"]
     else:
         modality = query["modality"]
-        raise ValueError(f"Unsupported query modality: '{modality}'")
+        raise ValueError(f"Unsupported query modality: {modality!r}")
+
+    return prompt, image
+
+
+def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
 
     engine_args = EngineArgs(
         model="TIGER-Lab/VLM2Vec-Full",
@@ -116,26 +195,66 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
     )
 
 
-def run_jinavl_reranker(query: Query) -> ModelRequestData:
-    if query["modality"] != "text+images":
-        raise ValueError(f"Unsupported query modality: '{query['modality']}'")
+def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
+    # vLLM does not support LoRA adapters on multi-modal encoder,
+    # so we merge the weights first
+    from huggingface_hub.constants import HF_HUB_CACHE
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    from vllm.entrypoints.chat_utils import load_chat_template
+
+    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
+
+    base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        model_id,
+        config=PeftConfig.from_pretrained(model_id),
+    )
+    model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+    model._hf_peft_config_loaded = False  # Needed to save the merged model
+
+    processor = AutoProcessor.from_pretrained(
+        model_id,
+        # `min_pixels` and `max_pixels` are deprecated for
+        # transformers `preprocessor_config.json`
+        size={"shortest_edge": 3136, "longest_edge": 12845056},
+    )
+    processor.chat_template = load_chat_template(
+        # The original chat template is not correct
+        EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
+    )
+
+    merged_path = str(
+        Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
+    )
+    print(f"Saving merged model to {merged_path}...")
+    print(
+        "NOTE: This directory is not tracked by `huggingface_hub` "
+        "so you have to delete this manually if you don't want it anymore."
+    )
+    model.save_pretrained(merged_path)
+    processor.save_pretrained(merged_path)
+    print("Done!")
+
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
 
     engine_args = EngineArgs(
-        model="jinaai/jina-reranker-m0",
+        model=merged_path,
         runner="pooling",
-        max_model_len=32768,
-        trust_remote_code=True,
+        max_model_len=4096,
         mm_processor_kwargs={
             "min_pixels": 3136,
-            "max_pixels": 602112,
+            "max_pixels": 12845056,
         },
         limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
         engine_args=engine_args,
-        query=query["text"],
-        documents=query["image"],
+        prompt=prompt,
+        image=image,
     )
 
 
@@ -186,7 +305,7 @@ def get_query(modality: QueryModality):
     raise ValueError(msg)
 
 
-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
     query = get_query(modality)
     req_data = model_example_map[model](query)
 
@@ -216,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
         print("-" * 50)
 
 
-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
     query = get_query(modality)
     req_data = model_example_map[model](query)
 
@@ -231,9 +350,12 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):
 
 
 model_example_map = {
+    "clip": run_clip,
     "e5_v": run_e5_v,
-    "vlm2vec": run_vlm2vec,
     "jinavl_reranker": run_jinavl_reranker,
+    "siglip": run_siglip,
+    "vlm2vec_phi3v": run_vlm2vec_phi3v,
+    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
 }
 
 
@@ -246,7 +368,7 @@ def parse_args():
         "--model-name",
         "-m",
         type=str,
-        default="vlm2vec",
+        default="vlm2vec_phi3v",
         choices=model_example_map.keys(),
         help="The name of the embedding model.",
     )
diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md
index bfe81121d1fd4b5bce0eab4f2fb206ce48f7f156..4376aac488f05e95297506718bb4e18951165d2c 100644
--- a/examples/online_serving/chart-helm/README.md
+++ b/examples/online_serving/chart-helm/README.md
@@ -19,3 +19,15 @@ This directory contains a Helm chart for deploying the vllm application. The cha
 - templates/pvc.yaml: Template for Persistent Volume Claims.
 - templates/secrets.yaml: Template for Kubernetes Secrets.
 - templates/service.yaml: Template for creating Services.
+
+## Running Tests
+
+This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
+
+```bash
+# Install plugin
+helm plugin install https://github.com/helm-unittest/helm-unittest
+
+# Run tests
+helm unittest .
+```
diff --git a/examples/online_serving/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl
index a9690bad3c94512eff8a1c560f75acecf66e0888..3226c1d79c42868bfe6810511994ab5d4c0e8a35 100644
--- a/examples/online_serving/chart-helm/templates/_helpers.tpl
+++ b/examples/online_serving/chart-helm/templates/_helpers.tpl
@@ -123,9 +123,6 @@ runAsUser:
 {{-   end }}
 {{- end }}
 
-{{- define "chart.extraInitImage" -}}
-"amazon/aws-cli:2.6.4"
-{{- end }}
 
 {{- define "chart.extraInitEnv" -}}
 - name: S3_ENDPOINT_URL
@@ -148,11 +145,15 @@ runAsUser:
     secretKeyRef:
       name: {{ .Release.Name }}-secrets
       key: s3accesskey
+{{- if .Values.extraInit.s3modelpath }}
 - name: S3_PATH
   value: "{{ .Values.extraInit.s3modelpath }}"
+{{- end }}
+{{- if hasKey .Values.extraInit "awsEc2MetadataDisabled" }}
 - name: AWS_EC2_METADATA_DISABLED
   value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
 {{- end }}
+{{- end }}
 
 {{/*
   Define chart labels
diff --git a/examples/online_serving/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml
index 536983b587be24fdf14862f5e7a697974a2ce125..a0a3c4b9ee5239366dd15336822fe52d351c6865 100644
--- a/examples/online_serving/chart-helm/templates/deployment.yaml
+++ b/examples/online_serving/chart-helm/templates/deployment.yaml
@@ -72,16 +72,21 @@ spec:
         {{ toYaml . | nindent 8 }}
         {{- end }}
 
-      {{-   if .Values.extraInit  }}
+      {{- if and .Values.extraInit (or .Values.extraInit.modelDownload.enabled .Values.extraInit.initContainers) }}
       initContainers:
+      {{- if .Values.extraInit.modelDownload.enabled }}
       - name: wait-download-model
-        image: {{ include "chart.extraInitImage" . }}
-        command: 
-          - /bin/bash
+        image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
+        imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
+        command: {{ .Values.extraInit.modelDownload.waitContainer.command | toJson }}
         args:
-          - -eucx
-          - while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done
-        env: {{- include "chart.extraInitEnv" . | nindent 10 }}
+        {{- toYaml .Values.extraInit.modelDownload.waitContainer.args | nindent 10 }}
+        env:
+        {{- if .Values.extraInit.modelDownload.waitContainer.env }}
+        {{- toYaml .Values.extraInit.modelDownload.waitContainer.env | nindent 10 }}
+        {{- else }}
+        {{- include "chart.extraInitEnv" . | nindent 10 }}
+        {{- end }}
         resources:
           requests:
             cpu: 200m
@@ -93,6 +98,10 @@ spec:
         - name: {{ .Release.Name }}-storage
           mountPath: /data
       {{- end }}
+      {{- with .Values.extraInit.initContainers }}
+      {{- toYaml . | nindent 6 }}
+      {{- end }}
+      {{- end }}
       volumes:
         - name: {{ .Release.Name }}-storage
           persistentVolumeClaim:
diff --git a/examples/online_serving/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml
index f9ea3541e78d2a64978d4c69859cf4433bb3106f..98d313916ca481ce810a90cede95435b33f8eab0 100644
--- a/examples/online_serving/chart-helm/templates/job.yaml
+++ b/examples/online_serving/chart-helm/templates/job.yaml
@@ -1,4 +1,4 @@
-{{-   if .Values.extraInit  }}
+{{- if and .Values.extraInit .Values.extraInit.modelDownload.enabled }}
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -12,13 +12,17 @@ spec:
    spec:
     containers:
     - name: job-download-model
-      image: {{ include "chart.extraInitImage" . }}
-      command: 
-        - /bin/bash
+      image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
+      imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
+      command: {{ .Values.extraInit.modelDownload.downloadJob.command | toJson }}
       args:
-        - -eucx
-        - aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data
-      env: {{- include "chart.extraInitEnv" . | nindent 8 }}
+      {{- toYaml .Values.extraInit.modelDownload.downloadJob.args | nindent 8 }}
+      env:
+      {{- if .Values.extraInit.modelDownload.downloadJob.env }}
+      {{- toYaml .Values.extraInit.modelDownload.downloadJob.env | nindent 8 }}
+      {{- else }}
+      {{- include "chart.extraInitEnv" . | nindent 8 }}
+      {{- end }}
       volumeMounts:
         - name: {{ .Release.Name }}-storage
           mountPath: /data
diff --git a/examples/online_serving/chart-helm/tests/deployment_test.yaml b/examples/online_serving/chart-helm/tests/deployment_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9b7472cf0fd436657e24a4786f99c5c59d3c8016
--- /dev/null
+++ b/examples/online_serving/chart-helm/tests/deployment_test.yaml
@@ -0,0 +1,135 @@
+suite: test deployment
+templates:
+  - deployment.yaml
+tests:
+  - it: should create wait-download-model init container when modelDownload is enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        initContainers: [ ]
+        pvcStorage: "1Gi"
+        s3modelpath: "relative_s3_model_path/opt-125m"
+        awsEc2MetadataDisabled: true
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - isNotEmpty:
+          path: spec.template.spec.initContainers
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: wait-download-model
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.initContainers[0].imagePullPolicy
+          value: IfNotPresent
+
+  - it: should only create custom init containers when modelDownload is disabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: false
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "echo test" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "echo test" ]
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+            imagePullPolicy: IfNotPresent
+            ports:
+              - containerPort: 8080
+                name: proxy
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - lengthEqual:
+          path: spec.template.spec.initContainers
+          count: 1
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: llm-d-routing-proxy
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      - equal:
+          path: spec.template.spec.initContainers[0].ports[0].containerPort
+          value: 8080
+
+  - it: should create both wait-download-model and custom init containers when both are enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+            imagePullPolicy: IfNotPresent
+            ports:
+              - containerPort: 8080
+                name: proxy
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - lengthEqual:
+          path: spec.template.spec.initContainers
+          count: 2
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: wait-download-model
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.initContainers[1].name
+          value: llm-d-routing-proxy
+      - equal:
+          path: spec.template.spec.initContainers[1].image
+          value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      - equal:
+          path: spec.template.spec.initContainers[1].ports[0].containerPort
+          value: 8080
\ No newline at end of file
diff --git a/examples/online_serving/chart-helm/tests/job_test.yaml b/examples/online_serving/chart-helm/tests/job_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..25d40ff265132283e8ca34fdaa106eeb47cc6e34
--- /dev/null
+++ b/examples/online_serving/chart-helm/tests/job_test.yaml
@@ -0,0 +1,61 @@
+suite: test job
+templates:
+  - job.yaml
+tests:
+  - it: should create job when modelDownload is enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "wait" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        pvcStorage: "1Gi"
+        s3modelpath: "relative_s3_model_path/opt-125m"
+        awsEc2MetadataDisabled: true
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Job
+      - equal:
+          path: spec.template.spec.containers[0].name
+          value: job-download-model
+      - equal:
+          path: spec.template.spec.containers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.restartPolicy
+          value: OnFailure
+
+  - it: should not create job when modelDownload is disabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: false
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "wait" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "download" ]
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 0
diff --git a/examples/online_serving/chart-helm/tests/pvc_test.yaml b/examples/online_serving/chart-helm/tests/pvc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2a8b37da7e8bd14ae17f0243c7543cb802672579
--- /dev/null
+++ b/examples/online_serving/chart-helm/tests/pvc_test.yaml
@@ -0,0 +1,32 @@
+suite: test pvc
+templates:
+  - pvc.yaml
+tests:
+  # Test Case: PVC Created When extraInit Defined
+  - it: should create pvc when extraInit is defined
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: ["/bin/bash"]
+            args: ["-c", "wait"]
+          downloadJob:
+            command: ["/bin/bash"]
+            args: ["-c", "download"]
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: spec.accessModes[0]
+          value: ReadWriteOnce
+      - equal:
+          path: spec.resources.requests.storage
+          value: 10Gi
\ No newline at end of file
diff --git a/examples/online_serving/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json
index 812d54bde13971f938356dad08599243aefbe8ac..0d0e0098bc19415fa42ce8d4c799de3f4d46ae4f 100644
--- a/examples/online_serving/chart-helm/values.schema.json
+++ b/examples/online_serving/chart-helm/values.schema.json
@@ -136,6 +136,70 @@
         "extraInit": {
             "type": "object",
             "properties": {
+                "modelDownload": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        },
+                        "image": {
+                            "type": "object",
+                            "properties": {
+                                "repository": {
+                                    "type": "string"
+                                },
+                                "tag": {
+                                    "type": "string"
+                                },
+                                "pullPolicy": {
+                                    "type": "string"
+                                }
+                            },
+                            "required": ["repository", "tag", "pullPolicy"]
+                        },
+                        "waitContainer": {
+                            "type": "object",
+                            "properties": {
+                                "command": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "args": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "env": {
+                                    "type": "array",
+                                    "items": {"type": "object"}
+                                }
+                            },
+                            "required": ["command", "args"]
+                        },
+                        "downloadJob": {
+                            "type": "object",
+                            "properties": {
+                                "command": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "args": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "env": {
+                                    "type": "array",
+                                    "items": {"type": "object"}
+                                }
+                            },
+                            "required": ["command", "args"]
+                        }
+                    },
+                    "required": ["enabled", "image", "waitContainer", "downloadJob"]
+                },
+                "initContainers": {
+                    "type": "array",
+                    "items": {"type": "object"}
+                },
                 "s3modelpath": {
                     "type": "string"
                 },
@@ -147,9 +211,9 @@
                 }
             },
             "required": [
-                "pvcStorage",
-                "s3modelpath",
-                "awsEc2MetadataDisabled"
+                "modelDownload",
+                "initContainers",
+                "pvcStorage"
             ]
         },
         "extraContainers": {
diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml
index 28dba9a6f6882a4173013ea2e76b29eb4ce77813..96b724fc9d20bec25043c24c03f741043624ed6c 100644
--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -75,10 +75,65 @@ maxUnavailablePodDisruptionBudget: ""
 
 # -- Additional configuration for the init container
 extraInit:
-   # -- Path of the model on the s3 which hosts model weights and config files
+  # -- Model download functionality (optional)
+  modelDownload:
+    # -- Enable model download job and wait container
+    enabled: true
+    # -- Image configuration for model download operations
+    image:
+      # -- Image repository
+      repository: "amazon/aws-cli"
+      # -- Image tag
+      tag: "2.6.4"
+      # -- Image pull policy
+      pullPolicy: "IfNotPresent"
+    # -- Wait container configuration (init container that waits for model to be ready)
+    waitContainer:
+      # -- Command to execute
+      command: ["/bin/bash"]
+      # -- Arguments for the wait container
+      args:
+        - "-eucx"
+        - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+      # env:
+      #   - name: HUGGING_FACE_HUB_TOKEN
+      #     value: "your-token"
+      #   - name: MODEL_ID
+      #     value: "meta-llama/Llama-2-7b"
+    # -- Download job configuration (job that actually downloads the model)
+    downloadJob:
+      # -- Command to execute
+      command: ["/bin/bash"]
+      # -- Arguments for the download job
+      args:
+        - "-eucx"
+        - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+      # env:
+      #   - name: HUGGING_FACE_HUB_TOKEN
+      #     value: "your-token"
+      #   - name: MODEL_ID
+      #     value: "meta-llama/Llama-2-7b"
+
+  # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
+  initContainers: []
+  # Example for llm-d sidecar:
+  # initContainers:
+  #   - name: llm-d-routing-proxy
+  #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+  #     imagePullPolicy: IfNotPresent
+  #     ports:
+  #       - containerPort: 8080
+  #         name: proxy
+  #     securityContext:
+  #       runAsUser: 1000
+
+  # -- Path of the model on the s3 which hosts model weights and config files
   s3modelpath: "relative_s3_model_path/opt-125m"
-   # -- Storage size of the s3
+  # -- Storage size for the PVC
   pvcStorage: "1Gi"
+  # -- Disable AWS EC2 metadata service
   awsEc2MetadataDisabled: true
 
 # -- Additional containers configuration
diff --git a/examples/online_serving/dashboards/perses/performance_statistics.yaml b/examples/online_serving/dashboards/perses/performance_statistics.yaml
index 2e8d24c3324b9bf62d5ebc38cbf931cfa2cac003..8030fe2f00a959e2ceb3e2c652700ddc7651721c 100644
--- a/examples/online_serving/dashboards/perses/performance_statistics.yaml
+++ b/examples/online_serving/dashboards/perses/performance_statistics.yaml
@@ -530,7 +530,7 @@ spec:
                     name: accelerators-thanos-querier-datasource
                   # Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
                   query: >
-                    100 * avg(vllm:gpu_cache_usage_perc)
+                    100 * avg(vllm:kv_cache_usage_perc)
 
     "18":
       kind: Panel
diff --git a/examples/online_serving/dashboards/perses/query_statistics.yaml b/examples/online_serving/dashboards/perses/query_statistics.yaml
index 28109aae8151195729217d363d8320ff63d0e1e3..ad8e047f6dfef51aee2a8a2dd14e78c7d5aedafe 100644
--- a/examples/online_serving/dashboards/perses/query_statistics.yaml
+++ b/examples/online_serving/dashboards/perses/query_statistics.yaml
@@ -98,7 +98,7 @@ spec:
                 kind: PrometheusTimeSeriesQuery
                 spec:
                   datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
-                  query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
+                  query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
                   minStep: "15s"
 
     core_running_ts:
@@ -168,7 +168,7 @@ spec:
                 spec:
                   datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
                   # multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
-                  query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
+                  query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
                   minStep: "15s"
 
     core_kv_usage_pct_ts:
@@ -187,7 +187,7 @@ spec:
                 kind: PrometheusTimeSeriesQuery
                 spec:
                   datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
-                  query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
+                  query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
                   minStep: "15s"
 
     # --- Per-Pod breakdowns (works on Simulator & Real) ---
@@ -246,7 +246,7 @@ spec:
                 spec:
                   datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
                   # if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
-                  query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
+                  query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
                   minStep: "15s"
 
     # --- Real vLLM only (zeros on simulator) ---
diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5813a3cecf73b87adf6b79017cb754c10fc885f8
--- /dev/null
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -0,0 +1,119 @@
+# Disaggregated Encoder
+
+These example scripts that demonstrate the disaggregated encoder (EPD) features of vLLM.
+
+For a detailed explanation of the EPD features, please refer to the [Disaggregated Encoder Feature Documentation](../../../docs/features/disagg_encoder.md).
+
+## Files
+
+- `disagg_epd_proxy.py` - Proxy script that demonstrates the XeYpZd setup (X encode instances, Y prefill instances, Z decode instances). Currently stable for the 1e1p1d configuration.
+
+- `disagg_1e1p1d_example.sh` - Sets up the 1e1p1d configuration, runs the VisionArena benchmark, and processes a single request with a local image.
+
+- `disagg_1e1pd_example.sh` - Sets up the 1e1pd configuration, runs the VisionArena benchmark, and processes a single request with a local image.
+
+### Custom Configuration
+
+```bash
+# Use specific GPUs
+GPU_E=0 GPU_PD=1 GPU_P=1 GPU_D=2 bash disagg_1e1p1d_example.sh
+
+# Use specific ports
+ENDPOINT_PORT=10001 bash disagg_1e1p1d_example.sh
+
+# Use specific model
+MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh
+
+# Use specific storage path
+EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
+```
+
+## Encoder Instances
+
+Encoder engines should be launched with the following flags:
+
+- `--enforce-eager` **(required)** – The current EPD implementation is only compatible with encoder instances running in this mode.
+
+- `--no-enable-prefix-caching` **(required)** – Encoder instances do not consume KV cache; prefix caching is disabled to avoid conflicts with other features.
+
+- `--max-num-batched-tokens=<large value>` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
+
+## Local media inputs
+
+To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:
+
+```bash
+--allowed-local-media-path $MEDIA_PATH
+```
+
+The vllm instances and `disagg_encoder_proxy` supports local URIs with ```{"url": "file://'"$MEDIA_PATH_FILENAME"'}``` as multimodal inputs. Each URI is passed unchanged from the `disagg_encoder_proxy` to the encoder instance so that the encoder can load the media locally.
+
+## EC connector and KV transfer
+
+The `ECSharedStorageConnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration:
+
+```bash
+# Add to encoder instance: 
+--ec-transfer-config '{
+    "ec_connector": "ECSharedStorageConnector",
+    "ec_role": "ec_producer",
+    "ec_connector_extra_config": {
+        "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+    }
+}' 
+
+# Add to prefill/prefill+decode instance: 
+--ec-transfer-config '{
+    "ec_connector": "ECSharedStorageConnector",
+    "ec_role": "ec_consumer",
+    "ec_connector_extra_config": {
+        "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+    }
+}' 
+```
+
+`$EC_SHARED_STORAGE_PATH` is the path where the EC connector temporarily stores the cache.
+
+If you enable prefill instance (`--prefill-servers-urls` not disabled), you will need --kv-transfer-config to facilitate the PD disaggregation. Currently, we use the `NixlConnector` for this purpose. Refer to `tests/v1/kv_connector/nixl_integration` for more example codes on PD disaggregation with Nixl.
+
+```bash
+# Add to prefill instance:    
+--kv-transfer-config '{
+    "kv_connector": "NixlConnector",
+    "kv_role": "kv_producer"
+}' 
+
+# Add to decode instance:
+--kv-transfer-config '{
+    "kv_connector": "NixlConnector",
+    "kv_role": "kv_consumer"
+}' 
+```
+
+## Proxy Instance Flags (`disagg_epd_proxy.py`)
+
+| Flag | Description |
+|------|-------------|
+| `--encode-servers-urls` | Comma-separated list of encoder endpoints. Every multimodal item extracted from the request is fanned out to one of these URLs in a round-robin fashion. |
+| `--prefill-servers-urls` | Comma-separated list of prefill endpoints. Set to `disable`, `none`, or `""` to skip the dedicated prefill phase and run E+PD (encoder + combined prefill/decode). |
+| `--decode-servers-urls` | Comma-separated list of decode endpoints. Non-stream and stream paths both round-robin over this list. |
+| `--host`, `--port` | Bind address for the proxy itself (defaults: `0.0.0.0:8000`). |
+
+Example usage:
+For E + PD setup:
+
+```bash
+$ python disagg_encoder_proxy.py \
+      --encode-servers-urls "http://e1:8001,http://e2:8002" \
+      --prefill-servers-urls "disable" \
+      --decode-servers-urls "http://pd1:8003,http://pd2:8004"
+```
+
+For E + P + D setup:
+
+```bash
+$ python disagg_encoder_proxy.py \
+      --encode-servers-urls "http://e1:8001,http://e2:8001" \
+      --prefill-servers-urls "http://p1:8003,http://p2:8004" \ 
+      --decode-servers-urls "http://d1:8005,http://d2:8006"
+```
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
new file mode 100644
index 0000000000000000000000000000000000000000..57489df64f51e02f8c477193fde248a27ba119dd
--- /dev/null
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
@@ -0,0 +1,221 @@
+#!/bin/bash
+set -euo pipefail
+
+declare -a PIDS=()
+
+###############################################################################
+# Configuration -- override via env before running
+###############################################################################
+MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
+LOG_PATH="${LOG_PATH:-./logs}"
+mkdir -p $LOG_PATH
+
+ENCODE_PORT="${ENCODE_PORT:-19534}"
+PREFILL_PORT="${PREFILL_PORT:-19535}"
+DECODE_PORT="${DECODE_PORT:-19536}"
+PROXY_PORT="${PROXY_PORT:-10001}"
+
+GPU_E="${GPU_E:-2}"
+GPU_P="${GPU_P:-2}"
+GPU_D="${GPU_D:-3}"
+
+EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
+TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}"   # wait_for_server timeout
+
+NUM_PROMPTS="${NUM_PROMPTS:-100}"    # number of prompts to send in benchmark
+
+export UCX_TLS=all
+export UCX_NET_DEVICES=all
+
+###############################################################################
+# Helpers
+###############################################################################
+# Find the git repository root directory
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+START_TIME=$(date +"%Y%m%d_%H%M%S")
+ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
+P_LOG=$LOG_PATH/p_${START_TIME}.log
+D_LOG=$LOG_PATH/d_${START_TIME}.log
+PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
+
+wait_for_server() {
+    local port=$1
+    timeout "$TIMEOUT_SECONDS" bash -c "
+        until curl -s localhost:$port/v1/chat/completions > /dev/null; do
+            sleep 1
+        done" && return 0 || return 1
+}
+
+# Cleanup function
+cleanup() {
+    echo "Stopping everything…"
+    trap - INT TERM USR1   # prevent re-entrancy
+    
+    # Kill all tracked PIDs
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "Killing process $pid"
+            kill "$pid" 2>/dev/null
+        fi
+    done
+    
+    # Wait a moment for graceful shutdown
+    sleep 2
+    
+    # Force kill any remaining processes
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "Force killing process $pid"
+            kill -9 "$pid" 2>/dev/null
+        fi
+    done
+    
+    # Kill the entire process group as backup
+    kill -- -$$ 2>/dev/null
+    
+    echo "All processes stopped."
+    exit 0
+}
+
+trap cleanup INT
+trap cleanup USR1
+trap cleanup TERM
+
+# clear previous cache
+echo "remove previous ec cache folder"
+rm -rf $EC_SHARED_STORAGE_PATH
+
+echo "make ec cache folder"
+mkdir -p $EC_SHARED_STORAGE_PATH
+
+###############################################################################
+# Encoder worker
+###############################################################################
+CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.01 \
+    --port "$ENCODE_PORT" \
+    --enforce-eager \
+    --enable-request-id-headers \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens 114688 \
+    --max-num-seqs 128 \
+    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --ec-transfer-config '{
+        "ec_connector": "ECSharedStorageConnector",
+        "ec_role": "ec_producer",
+        "ec_connector_extra_config": {
+            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+        }
+    }' \
+    >"${ENC_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+###############################################################################
+# Prefill worker
+###############################################################################
+CUDA_VISIBLE_DEVICES="$GPU_P" \
+UCX_NET_DEVICES=all \
+VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
+vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.7 \
+    --port "$PREFILL_PORT" \
+    --enforce-eager \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --ec-transfer-config '{
+        "ec_connector": "ECSharedStorageConnector",
+        "ec_role": "ec_consumer",
+        "ec_connector_extra_config": {
+            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+        }
+    }' \
+    --kv-transfer-config '{
+        "kv_connector": "NixlConnector",
+        "kv_role": "kv_producer"
+    }' \
+    >"${P_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+###############################################################################
+# Decode worker
+###############################################################################
+CUDA_VISIBLE_DEVICES="$GPU_D" \
+UCX_NET_DEVICES=all \
+VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
+vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.7 \
+    --port "$DECODE_PORT" \
+    --enforce-eager \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --kv-transfer-config '{
+        "kv_connector": "NixlConnector",
+        "kv_role": "kv_consumer"
+    }' \
+    >"${D_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+# Wait for workers
+wait_for_server $ENCODE_PORT
+wait_for_server $PREFILL_PORT
+wait_for_server $DECODE_PORT
+
+###############################################################################
+# Proxy
+###############################################################################
+python disagg_epd_proxy.py \
+    --host "0.0.0.0" \
+    --port "$PROXY_PORT" \
+    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+    --prefill-servers-urls "http://localhost:$PREFILL_PORT" \
+    --decode-servers-urls "http://localhost:$DECODE_PORT" \
+    >"${PROXY_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+wait_for_server $PROXY_PORT
+echo "All services are up!"
+
+###############################################################################
+# Benchmark
+###############################################################################
+echo "Running benchmark (stream)..."
+vllm bench serve \
+  --model               $MODEL \
+  --backend             openai-chat \
+  --endpoint            /v1/chat/completions \
+  --dataset-name        hf \
+  --dataset-path        lmarena-ai/VisionArena-Chat \
+  --seed                0 \
+  --num-prompts         $NUM_PROMPTS \
+  --port                $PROXY_PORT
+
+PIDS+=($!)
+
+###############################################################################
+# Single request with local image
+###############################################################################
+echo "Running single request with local image (non-stream)..."
+curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+    "model": "'${MODEL}'",
+    "messages": [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": [
+        {"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
+        {"type": "text", "text": "What is in this image?"}
+    ]}
+    ]
+    }'
+
+
+# cleanup
+echo "cleanup..."
+cleanup
\ No newline at end of file
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6073e0580b11d9d7577f0cacf6e0287be78c314a
--- /dev/null
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+set -euo pipefail
+
+declare -a PIDS=()
+
+###############################################################################
+# Configuration -- override via env before running
+###############################################################################
+MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
+LOG_PATH="${LOG_PATH:-./logs}"
+mkdir -p $LOG_PATH
+
+ENCODE_PORT="${ENCODE_PORT:-19534}"
+PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
+PROXY_PORT="${PROXY_PORT:-10001}"
+
+GPU_E="${GPU_E:-0}"
+GPU_PD="${GPU_PD:-1}"
+
+EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
+TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}"   # wait_for_server timeout
+
+NUM_PROMPTS="${NUM_PROMPTS:-100}"    # number of prompts to send in benchmark
+
+###############################################################################
+# Helpers
+###############################################################################
+# Find the git repository root directory
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+START_TIME=$(date +"%Y%m%d_%H%M%S")
+ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
+PD_LOG=$LOG_PATH/pd_${START_TIME}.log
+PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
+
+wait_for_server() {
+    local port=$1
+    timeout "$TIMEOUT_SECONDS" bash -c "
+        until curl -s localhost:$port/v1/chat/completions > /dev/null; do
+            sleep 1
+        done" && return 0 || return 1
+}
+
+# Cleanup function
+cleanup() {
+    echo "Stopping everything…"
+    trap - INT TERM USR1   # prevent re-entrancy
+    
+    # Kill all tracked PIDs
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "Killing process $pid"
+            kill "$pid" 2>/dev/null
+        fi
+    done
+    
+    # Wait a moment for graceful shutdown
+    sleep 2
+    
+    # Force kill any remaining processes
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "Force killing process $pid"
+            kill -9 "$pid" 2>/dev/null
+        fi
+    done
+    
+    # Kill the entire process group as backup
+    kill -- -$$ 2>/dev/null
+    
+    echo "All processes stopped."
+    exit 0
+}
+
+trap cleanup INT
+trap cleanup USR1
+trap cleanup TERM
+
+# clear previous cache
+echo "remove previous ec cache folder"
+rm -rf $EC_SHARED_STORAGE_PATH
+
+echo "make ec cache folder"
+mkdir -p $EC_SHARED_STORAGE_PATH
+
+###############################################################################
+# Encoder worker
+###############################################################################
+CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.01 \
+    --port "$ENCODE_PORT" \
+    --enforce-eager \
+    --enable-request-id-headers \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens 114688 \
+    --max-num-seqs 128 \
+    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --ec-transfer-config '{
+        "ec_connector": "ECSharedStorageConnector",
+        "ec_role": "ec_producer",
+        "ec_connector_extra_config": {
+            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+        }
+    }' \
+    >"${ENC_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+###############################################################################
+# Prefill+Decode worker
+###############################################################################
+CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.7 \
+    --port "$PREFILL_DECODE_PORT" \
+    --enforce-eager \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --ec-transfer-config '{
+        "ec_connector": "ECSharedStorageConnector",
+        "ec_role": "ec_consumer",
+        "ec_connector_extra_config": {
+            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+        }
+    }' \
+    >"${PD_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+# Wait for workers
+wait_for_server $ENCODE_PORT
+wait_for_server $PREFILL_DECODE_PORT
+
+###############################################################################
+# Proxy
+###############################################################################
+python disagg_epd_proxy.py \
+    --host "0.0.0.0" \
+    --port "$PROXY_PORT" \
+    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+    --prefill-servers-urls "disable" \
+    --decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
+    >"${PROXY_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+wait_for_server $PROXY_PORT
+echo "All services are up!"
+
+###############################################################################
+# Benchmark
+###############################################################################
+echo "Running benchmark (stream)..."
+vllm bench serve \
+  --model               $MODEL \
+  --backend             openai-chat \
+  --endpoint            /v1/chat/completions \
+  --dataset-name        hf \
+  --dataset-path        lmarena-ai/VisionArena-Chat \
+  --seed                0 \
+  --num-prompts         $NUM_PROMPTS \
+  --port                $PROXY_PORT
+
+PIDS+=($!)
+
+###############################################################################
+# Single request with local image
+###############################################################################
+echo "Running single request with local image (non-stream)..."
+curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+    "model": "'${MODEL}'",
+    "messages": [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": [
+        {"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
+        {"type": "text", "text": "What is in this image?"}
+    ]}
+    ]
+    }'
+
+
+# cleanup
+echo "cleanup..."
+cleanup
\ No newline at end of file
diff --git a/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py b/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5f99683c2bf33a273b3d014c90da31a48e76840
--- /dev/null
+++ b/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
@@ -0,0 +1,606 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+disagg_encoder_proxy.py
+
+Proxy that routes OpenAI-compatible “/v1/chat/completions” requests to two
+clusters:
+  • encode  (multimodal feature extraction)
+  • decode  (language-model inference)
+
+For MM input we:
+    1. Extract *every* image/audio item.
+    2. Fire N concurrent requests to the encoder cluster
+       (one request per item, with **all text removed**).
+    3. Wait for all of them to succeed.
+    4. Forward the *original* request to a decode server.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+import random
+import uuid
+from collections.abc import AsyncIterator
+
+import aiohttp
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+###############################################################################
+# FastAPI app & global state
+###############################################################################
+
+logging.basicConfig(
+    level=logging.DEBUG, format="%(asctime)s %(levelname)s: %(message)s"
+)
+logger = logging.getLogger("proxy")
+
+app = FastAPI()
+encode_session: aiohttp.ClientSession | None = None
+prefill_session: aiohttp.ClientSession | None = None
+decode_session: aiohttp.ClientSession | None = None
+
+###############################################################################
+# Utils
+###############################################################################
+
+
+MM_TYPES = {"image_url", "audio_url", "input_audio"}
+
+
+def extract_mm_items(request_data: dict) -> list[dict]:
+    """
+    Return *all* image/audio items that appear anywhere in `messages`.
+
+    Each returned dict looks like:
+        { "type": "image_url", "image_url": {...} }
+    """
+    items: list[dict] = []
+    for msg in request_data.get("messages", []):
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+
+        for item in content:
+            if item.get("type") in MM_TYPES:
+                items.append(item)
+    return items
+
+
+async def fanout_encoder_primer(
+    orig_request: dict,
+    e_urls: list[str],
+    req_id: str,
+) -> None:
+    """
+    1. Build one request *per MM item* with all text removed.
+    2. Send them concurrently to the encode cluster.
+    3. Raise if any of them fails.
+    """
+    logger.info("[%s] Processing multimodal items...", req_id)
+
+    mm_items = extract_mm_items(orig_request)
+    if not mm_items:
+        logger.info("[%s] No multimodal items, skipping encoder", req_id)
+        return  # nothing to do
+
+    logger.info("[%s] got %d multimodal items...", req_id, len(mm_items))
+
+    tasks = []
+
+    # Round-robin over encode servers to distribute load a bit
+    url_cycle = (e_urls[i % len(e_urls)] for i in range(len(mm_items)))
+
+    for idx, (item, target_url) in enumerate(zip(mm_items, url_cycle)):
+        # Derive a *child* request id:  <parent>:<index>:<random-short>
+        child_req_id = f"{req_id}:{idx}:{uuid.uuid4().hex[:6]}"
+        headers = {"x-request-id": child_req_id}
+
+        encoder_req = {
+            # You *may* need to keep additional fields
+            "model": orig_request.get("model"),
+            "messages": [
+                {"role": "user", "content": [item]},
+            ],
+            # Only need 1 token so the server actually runs the encoder path
+            "max_tokens": 1,
+            "stream": False,
+        }
+        tasks.append(
+            encode_session.post(
+                f"{target_url}/v1/chat/completions",
+                json=encoder_req,
+                headers=headers,
+            )
+        )
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    # Fail fast if any sub-request failed
+    for idx, r in enumerate(results):
+        if isinstance(r, Exception):
+            logger.error(
+                "[%s] Encoder request #%d raised exception: %s",
+                req_id,
+                idx,
+                r,
+                exc_info=r,
+            )
+            raise HTTPException(
+                status_code=502, detail=f"Encoder request failed: {str(r)}"
+            )
+        if r.status != 200:
+            try:
+                detail = await r.text()
+            except Exception:
+                detail = "<unable to read body>"
+            logger.error(
+                "[%s] Encoder request #%d returned status %s: %s",
+                req_id,
+                idx,
+                r.status,
+                detail,
+            )
+            raise HTTPException(
+                status_code=r.status,
+                detail=f"Encoder request failed: {detail}",
+            )
+
+    logger.info(
+        "[%s] All %d encoder requests completed successfully", req_id, len(mm_items)
+    )
+
+
+async def maybe_prefill(
+    req_data: dict,
+    p_url: str,
+    req_id: str,
+) -> dict:
+    """
+    - Do prefill-only task if p_url exist;
+    - Return modified request data with kv transfer params (for nixl connector)
+    - Else, skip and return the original request data for decode
+    """
+    if p_url:
+        logger.info("[%s] Processing through prefill: %s", req_id, p_url)
+
+        prefill_response = await process_prefill_stage(req_data, p_url, req_id)
+        # for nixl connector to facilitate kv transfer...
+        prefill_response_json = await prefill_response.json()
+        kv_transfer_params = prefill_response_json.get("kv_transfer_params", {})
+        if kv_transfer_params:
+            req_data["kv_transfer_params"] = kv_transfer_params
+
+        return req_data
+    else:
+        return req_data
+
+
+async def process_prefill_stage(
+    req_data: dict,
+    p_url: str,
+    req_id: str,
+) -> dict:
+    """Process request through Prefill stage and return kv_transfer_params"""
+    logger.info("[%s] Sending prefill request to: %s", req_id, p_url)
+
+    prefill_request = req_data.copy()
+    prefill_request["kv_transfer_params"] = {
+        "do_remote_decode": True,
+        "do_remote_prefill": False,
+        "remote_engine_id": None,
+        "remote_block_ids": None,
+        "remote_host": None,
+        "remote_port": None,
+    }
+    prefill_request["stream"] = False
+    prefill_request["max_tokens"] = 1
+    if "max_completion_tokens" in prefill_request:
+        prefill_request["max_completion_tokens"] = 1
+    if "stream_options" in prefill_request:
+        del prefill_request["stream_options"]
+
+    headers = {"x-request-id": req_id}
+    try:
+        prefill_response = await prefill_session.post(
+            f"{p_url}/v1/chat/completions", json=prefill_request, headers=headers
+        )
+        prefill_response.raise_for_status()
+
+        if prefill_response.status != 200:
+            error_text = await prefill_response.text()
+            logger.error(
+                "[%s] Prefill request failed with status %d: %s",
+                req_id,
+                prefill_response.status,
+                error_text,
+            )
+            raise HTTPException(
+                status_code=prefill_response.status,
+                detail={"error": "Prefill request failed", "message": error_text},
+            )
+        logger.info("[%s] Prefill request completed successfully", req_id)
+
+        return prefill_response
+
+    except Exception as e:
+        logger.error("Prefill processing failed: %s", str(e))
+        raise HTTPException(
+            status_code=500,
+            detail={"error": "Prefill processing error", "message": str(e)},
+        ) from e
+
+
+###############################################################################
+# Middleware for request/response logging
+###############################################################################
+
+
+@app.middleware("http")
+async def log_requests(request: Request, call_next):
+    """Middleware to log all incoming requests and responses"""
+    req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
+
+    # Log incoming request
+    logger.info(
+        ">>> [%s] %s %s from %s",
+        req_id,
+        request.method,
+        request.url.path,
+        request.client.host if request.client else "unknown",
+    )
+
+    try:
+        # Process request
+        response = await call_next(request)
+
+        # Log response
+        logger.info(
+            "<<< [%s] %s %s completed with status %d",
+            req_id,
+            request.method,
+            request.url.path,
+            response.status_code,
+        )
+
+        return response
+    except Exception as e:
+        # Log errors
+        logger.exception(
+            "!!! [%s] %s %s failed with error: %s",
+            req_id,
+            request.method,
+            request.url.path,
+            str(e),
+        )
+        raise
+
+
+###############################################################################
+# FastAPI lifecycle
+###############################################################################
+
+
+@app.on_event("startup")
+async def on_startup() -> None:
+    global encode_session, prefill_session, decode_session
+    timeout = aiohttp.ClientTimeout(total=100_000)
+    connector = aiohttp.TCPConnector(limit=0, force_close=False)
+    encode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
+    if app.state.p_urls:
+        # only setup if prefill instance(s) exist
+        prefill_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
+    decode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
+
+
+@app.on_event("shutdown")
+async def on_shutdown() -> None:
+    global encode_session, prefill_session, decode_session
+    if encode_session:
+        await encode_session.close()
+    if prefill_session:
+        await prefill_session.close()
+    if decode_session:
+        await decode_session.close()
+
+
+###############################################################################
+# Core forwarding
+###############################################################################
+
+
+async def forward_non_stream(
+    req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
+) -> dict:
+    try:
+        # Step 1: Process through Encoder instance (if has MM input)
+        await fanout_encoder_primer(req_data, e_urls, req_id)
+
+        # Step 2: Process through Prefill instance
+        req_data = await maybe_prefill(req_data, p_url, req_id)
+
+        # Step 3: Process through Decode instance
+        logger.info("[%s] Forwarding to decode: %s", req_id, d_url)
+        headers = {"x-request-id": req_id}
+
+        # Non-streaming response
+        async with decode_session.post(
+            f"{d_url}/v1/chat/completions", json=req_data, headers=headers
+        ) as resp:
+            resp.raise_for_status()
+            return await resp.json()
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("[%s] Error in forward_non_stream: %s", req_id, str(e))
+        raise HTTPException(status_code=500, detail=f"Proxy error: {str(e)}") from e
+
+
+async def forward_stream(
+    req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
+) -> AsyncIterator[str]:
+    try:
+        # Step 1: Process through Encoder instance (if has MM input)
+        await fanout_encoder_primer(req_data, e_urls, req_id)
+
+        # Step 2: Process through Prefill instance
+        req_data = await maybe_prefill(req_data, p_url, req_id)
+
+        # Step 3: Process through Decode instance
+        logger.info("[%s] Starting streaming from decode: %s", req_id, d_url)
+        headers = {"x-request-id": req_id}
+
+        # Streaming response
+        async with decode_session.post(
+            f"{d_url}/v1/chat/completions",
+            json=req_data,
+            headers=headers,
+        ) as resp:
+            resp.raise_for_status()
+            async for chunk in resp.content.iter_chunked(1024):
+                if chunk:
+                    yield chunk.decode("utf-8", errors="ignore")
+
+        logger.info("[%s] Streaming completed", req_id)
+
+    except HTTPException:
+        logger.exception("[%s] HTTPException in forward_stream", req_id)
+        raise
+    except Exception as e:
+        logger.exception("[%s] Error in forward_stream: %s", req_id, str(e))
+        raise HTTPException(
+            status_code=500, detail=f"Proxy streaming error: {str(e)}"
+        ) from e
+
+
+###############################################################################
+# Public routes
+###############################################################################
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    try:
+        req_data = await request.json()
+        req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
+
+        e_urls = app.state.e_urls  # we want the full list for fan-out
+        p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
+        d_url = random.choice(app.state.d_urls)
+
+        is_streaming = req_data.get("stream", False)
+
+        if is_streaming:
+            return StreamingResponse(
+                forward_stream(req_data, req_id, e_urls, p_url, d_url),
+                media_type="text/event-stream",
+            )
+        result = await forward_non_stream(req_data, req_id, e_urls, p_url, d_url)
+        return JSONResponse(content=result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("Error in chat_completions endpoint: %s", str(e))
+        raise HTTPException(
+            status_code=500, detail=f"Request processing error: {str(e)}"
+        ) from e
+
+
+@app.get("/v1/models")
+async def list_models():
+    async with decode_session.get(f"{app.state.d_urls[0]}/v1/models") as resp:
+        resp.raise_for_status()
+        return await resp.json()
+
+
+@app.get("/health")
+async def health_check():
+    async def healthy(urls):
+        if not urls:
+            return "empty"
+        for u in urls:
+            try:
+                async with encode_session.get(f"{u}/health") as resp:
+                    resp.raise_for_status()
+            except Exception:
+                return "unhealthy"
+        return "healthy"
+
+    e_status, p_status, d_status = await asyncio.gather(
+        healthy(app.state.e_urls), healthy(app.state.p_urls), healthy(app.state.d_urls)
+    )
+
+    overall_healthy = all(
+        status != "unhealthy" for status in (e_status, p_status, d_status)
+    )
+
+    status_code = 200 if overall_healthy else 503
+
+    return JSONResponse(
+        {
+            "proxy": "healthy",
+            "encode_cluster": e_status,
+            "prefill_cluster": p_status,
+            "decode_cluster": d_status,
+        },
+        status_code=status_code,
+    )
+
+
+###############################################################################
+# Simple profiler fan-out (unchanged except for sessions)
+###############################################################################
+
+
+async def _post_if_available(
+    session: aiohttp.ClientSession,
+    url: str,
+    payload: dict,
+    headers: dict,
+) -> dict | None:
+    """
+    POST `payload` to `url`.
+
+    Returns
+    -------
+    • The decoded JSON body on success (2xx)
+    • None if the endpoint does not exist (404)
+    • Raises for anything else.
+    """
+    try:
+        resp = await session.post(url, json=payload, headers=headers)
+        if resp.status == 404:  # profiling disabled on that server
+            logger.warning("Profiling endpoint missing on %s", url)
+            return None
+        resp.raise_for_status()
+        return await resp.json(content_type=None)
+    except aiohttp.ClientResponseError as exc:
+        # Pass 404 through the branch above, re-raise everything else
+        if exc.status == 404:
+            logger.warning("Profiling endpoint missing on %s", url)
+            return None
+        raise
+    except Exception:
+        # Network errors etc.: propagate
+        raise
+
+
+async def _profile_cmd(cmd: str, payload: dict, e_url: str, p_url: str, d_url: str):
+    """
+    Fire & forget to both clusters, tolerate 404.
+    """
+    headers = {"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY', '')}"}
+
+    encode_task = _post_if_available(
+        encode_session, f"{e_url}/{cmd}_profile", payload, headers
+    )
+    prefill_task = (
+        _post_if_available(prefill_session, f"{p_url}/{cmd}_profile", payload, headers)
+        if p_url is not None
+        else asyncio.sleep(0)
+    )
+    decode_task = _post_if_available(
+        decode_session, f"{d_url}/{cmd}_profile", payload, headers
+    )
+
+    encode_res, prefill_res, decode_res = await asyncio.gather(
+        encode_task, prefill_task, decode_task
+    )
+
+    # If *all* clusters said “I don’t have that route”, surface an error
+    if encode_res is prefill_res is decode_res is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Profiling endpoints are disabled on all clusters",
+        )
+
+    return {
+        "encode": encode_res,  # may be None
+        "prefill": prefill_res,  # may be None
+        "decode": decode_res,  # may be None
+    }
+
+
+@app.post("/start_profile")
+async def start_profile(request: Request):
+    body = await request.json()
+    # TODO: handle multi urls properly
+    e_url = random.choice(app.state.e_urls)
+    p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
+    d_url = random.choice(app.state.d_urls)
+    return await _profile_cmd("start", body, e_url, p_url, d_url)
+
+
+@app.post("/stop_profile")
+async def stop_profile(request: Request):
+    body = await request.json()
+    # TODO: handle multi urls properly
+    e_url = random.choice(app.state.e_urls)
+    p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
+    d_url = random.choice(app.state.d_urls)
+    return await _profile_cmd("stop", body, e_url, p_url, d_url)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--encode-servers-urls",
+        required=True,
+        help='Comma-separated encode URLs ("http://e1:8001,http://e2:8001")',
+    )
+    parser.add_argument(
+        "--prefill-servers-urls",
+        required=True,
+        help=(
+            'Comma-separated prefill URLs ("http://p1:8003,http://p2:8004") ',
+            'to enable E->P->D, set "disable" or "none" to enable E->PD',
+        ),
+    )
+    parser.add_argument(
+        "--decode-servers-urls",
+        required=True,
+        help='Comma-separated decode URLs ("http://d1:8005,http://d2:8006")',
+    )
+
+    args = parser.parse_args()
+    app.state.e_urls = [
+        u.strip() for u in args.encode_servers_urls.split(",") if u.strip()
+    ]
+    app.state.d_urls = [
+        u.strip() for u in args.decode_servers_urls.split(",") if u.strip()
+    ]
+    # handle prefill instances
+    if args.prefill_servers_urls.lower() in ("disable", "none", ""):
+        app.state.p_urls = []
+        logger.info(
+            "Disaggregated prefill phase explicitly disabled by user. Running E + PD..."
+        )
+    else:
+        app.state.p_urls = [
+            u.strip() for u in args.prefill_servers_urls.split(",") if u.strip()
+        ]
+        logger.info("Disaggregated prefill phase is enabled. Running E + P + D...")
+
+    logger.info("Proxy listening on %s:%s", args.host, args.port)
+    logger.info("Encode servers: %s", app.state.e_urls)
+    logger.info("Prefill instances %s", app.state.p_urls)
+    logger.info("Decode servers: %s", app.state.d_urls)
+
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="info",
+        loop="uvloop",
+        access_log=True,
+    )
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
index d39edb0b9d15cb648d0f68e36f471ad9f39401c7..2b8482ec717af3e56175f0d861d846d44d890484 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -23,7 +23,7 @@ import logging
 import os
 import sys
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import aiohttp
 import requests
@@ -49,12 +49,9 @@ class Proxy:
         decode_instances: list[str],
         model: str,
         scheduling_policy: SchedulingPolicy,
-        custom_create_completion: Optional[
-            Callable[[Request], StreamingResponse]
-        ] = None,
-        custom_create_chat_completion: Optional[
-            Callable[[Request], StreamingResponse]
-        ] = None,
+        custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
+        custom_create_chat_completion: Callable[[Request], StreamingResponse]
+        | None = None,
     ):
         self.prefill_instances = prefill_instances
         self.decode_instances = decode_instances
@@ -203,9 +200,9 @@ class Proxy:
                 async with session.post(
                     url=url, json=data, headers=headers
                 ) as response:
-                    if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
+                    if 200 <= response.status < 300 or 400 <= response.status < 500:
                         if use_chunked:
-                            async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
+                            async for chunk_bytes in response.content.iter_chunked(
                                 1024
                             ):
                                 yield chunk_bytes
@@ -348,9 +345,9 @@ class ProxyServer:
     def __init__(
         self,
         args: argparse.Namespace,
-        scheduling_policy: Optional[SchedulingPolicy] = None,
-        create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
-        create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+        scheduling_policy: SchedulingPolicy | None = None,
+        create_completion: Callable[[Request], StreamingResponse] | None = None,
+        create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
     ):
         self.validate_parsed_serve_args(args)
         self.port = args.port
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
index 7b0b12bb34d25e9d534fd0aeb20bd6894f4c7394..1e7acccb4ff94b680d11d3f13aebbd9e9cb721c0 100644
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -166,7 +166,7 @@ main() {
         local kv_port=$((21001 + i))
 
         echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
         --enforce-eager \
         --host 0.0.0.0 \
         --port $port \
@@ -194,7 +194,7 @@ main() {
         local kv_port=$((22001 + i))
 
         echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
         --enforce-eager \
         --host 0.0.0.0 \
         --port $port \
diff --git a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
index 1234ebba4d81887e769f6cc887c6066ec12813e6..6845545b6fd17777a928e2adb521f890304a4cee 100644
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
 
 export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
 export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1
 
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index 9fd55fc9ddc94082fc6de288ed6eb3189181cf5c..19f6bd5726102f9ed02bfa9eb502a4fa3526a95c 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any
 
 import msgspec
 import zmq
 from msgspec.msgpack import Decoder
 
-from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.core.kv_cache_utils import ExternalBlockHash
 
 
 #
@@ -24,17 +24,17 @@ class KVCacheEvent(
 
 
 class BlockStored(KVCacheEvent):
-    block_hashes: list[BlockHash]
-    parent_block_hash: Optional[BlockHash]
+    block_hashes: list[ExternalBlockHash]
+    parent_block_hash: ExternalBlockHash | None
     token_ids: list[int]
     block_size: int
-    lora_id: Optional[int]
-    medium: Optional[str]
+    lora_id: int | None
+    medium: str | None
 
 
 class BlockRemoved(KVCacheEvent):
-    block_hashes: list[BlockHash]
-    medium: Optional[str]
+    block_hashes: list[ExternalBlockHash]
+    medium: str | None
 
 
 class AllBlocksCleared(KVCacheEvent):
@@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):
 
 
 class KVEventBatch(EventBatch):
-    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+    events: list[BlockStored | BlockRemoved | AllBlocksCleared]
 
 
 def process_event(event_batch):
diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py
index cb230913a422ff49f591b9048ca45082de7cb7ac..04d21e0489402e823218a573dffbd97b5b8047b5 100644
--- a/examples/online_serving/multi_instance_data_parallel.py
+++ b/examples/online_serving/multi_instance_data_parallel.py
@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-from typing import Optional
+import threading
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
+from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger
 
 """
 To run this example, run the following commands simultaneously with
@@ -22,37 +23,64 @@ send a request to the instance with DP rank 1.
 """
 
 
+def _do_background_logging(engine, interval, stop_event):
+    try:
+        while not stop_event.is_set():
+            asyncio.run(engine.do_log_stats())
+            stop_event.wait(interval)
+    except Exception as e:
+        print(f"vLLM background logging shutdown: {e}")
+        pass
+
+
 async def main():
     engine_args = AsyncEngineArgs(
         model="ibm-research/PowerMoE-3b",
         data_parallel_size=2,
+        tensor_parallel_size=1,
         dtype="auto",
         max_model_len=2048,
         data_parallel_address="127.0.0.1",
         data_parallel_rpc_port=62300,
         data_parallel_size_local=1,
         enforce_eager=True,
+        enable_log_requests=True,
+        disable_custom_all_reduce=True,
     )
 
-    engine_client = AsyncLLMEngine.from_engine_args(engine_args)
-
+    engine_client = AsyncLLMEngine.from_engine_args(
+        engine_args,
+        # Example: Using aggregated logger
+        stat_loggers=[AggregatedLoggingStatLogger],
+    )
+    stop_logging_event = threading.Event()
+    logging_thread = threading.Thread(
+        target=_do_background_logging,
+        args=(engine_client, 5, stop_logging_event),
+        daemon=True,
+    )
+    logging_thread.start()
     sampling_params = SamplingParams(
         temperature=0.7,
         top_p=0.9,
         max_tokens=100,
     )
+    num_prompts = 10
+    for i in range(num_prompts):
+        prompt = "Who won the 2004 World Series?"
+        final_output: RequestOutput | None = None
+        async for output in engine_client.generate(
+            prompt=prompt,
+            sampling_params=sampling_params,
+            request_id=f"abcdef-{i}",
+            data_parallel_rank=1,
+        ):
+            final_output = output
+        if final_output:
+            print(final_output.outputs[0].text)
 
-    prompt = "Who won the 2004 World Series?"
-    final_output: Optional[RequestOutput] = None
-    async for output in engine_client.generate(
-        prompt=prompt,
-        sampling_params=sampling_params,
-        request_id="abcdef",
-        data_parallel_rank=1,
-    ):
-        final_output = output
-    if final_output:
-        print(final_output.outputs[0].text)
+    stop_logging_event.set()
+    logging_thread.join()
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 37216a5cfe5741f672bfa02f8eed5c4d1e1c36c7..520cbca003aa513f8986179d04fe96fe6765ceb6 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -26,7 +26,7 @@ import requests
 from openai import OpenAI
 from utils import get_first_model
 
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
 openai_api_key = "EMPTY"
@@ -38,11 +38,13 @@ client = OpenAI(
     base_url=openai_api_base,
 )
 
+headers = {"User-Agent": "vLLM Example Client"}
+
 
 def encode_base64_content_from_url(content_url: str) -> str:
     """Encode a content retrieved from a remote url to base64 format."""
 
-    with requests.get(content_url) as response:
+    with requests.get(content_url, headers=headers) as response:
         response.raise_for_status()
         result = base64.b64encode(response.content).decode("utf-8")
 
@@ -50,19 +52,19 @@ def encode_base64_content_from_url(content_url: str) -> str:
 
 
 # Text-only inference
-def run_text_only(model: str) -> None:
+def run_text_only(model: str, max_completion_tokens: int) -> None:
     chat_completion = client.chat.completions.create(
         messages=[{"role": "user", "content": "What's the capital of France?"}],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion.choices[0].message.content
-    print("Chat completion output:", result)
+    print("Chat completion output:\n", result)
 
 
 # Single-image input inference
-def run_single_image(model: str) -> None:
+def run_single_image(model: str, max_completion_tokens: int) -> None:
     ## Use image url in the payload
     image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
     chat_completion_from_url = client.chat.completions.create(
@@ -79,11 +81,11 @@ def run_single_image(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from image url:", result)
+    print("Chat completion output from image url:\n", result)
 
     ## Use base64 encoded image in the payload
     image_base64 = encode_base64_content_from_url(image_url)
@@ -101,7 +103,7 @@ def run_single_image(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
@@ -109,9 +111,9 @@ def run_single_image(model: str) -> None:
 
 
 # Multi-image input inference
-def run_multi_image(model: str) -> None:
-    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+def run_multi_image(model: str, max_completion_tokens: int) -> None:
+    image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
+    image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
     chat_completion_from_url = client.chat.completions.create(
         messages=[
             {
@@ -130,15 +132,15 @@ def run_multi_image(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output:", result)
+    print("Chat completion output:\n", result)
 
 
 # Video input inference
-def run_video(model: str) -> None:
+def run_video(model: str, max_completion_tokens: int) -> None:
     video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
     video_base64 = encode_base64_content_from_url(video_url)
 
@@ -157,11 +159,11 @@ def run_video(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from image url:", result)
+    print("Chat completion output from video url:\n", result)
 
     ## Use base64 encoded video in the payload
     chat_completion_from_base64 = client.chat.completions.create(
@@ -178,15 +180,15 @@ def run_video(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded image:", result)
+    print("Chat completion output from base64 encoded video:\n", result)
 
 
 # Audio input inference
-def run_audio(model: str) -> None:
+def run_audio(model: str, max_completion_tokens: int) -> None:
     from vllm.assets.audio import AudioAsset
 
     audio_url = AudioAsset("winning_call").url
@@ -211,11 +213,11 @@ def run_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from input audio:\n", result)
 
     # HTTP URL
     chat_completion_from_url = client.chat.completions.create(
@@ -235,11 +237,11 @@ def run_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from audio url:", result)
+    print("Chat completion output from audio url:\n", result)
 
     # base64 URL
     chat_completion_from_base64 = client.chat.completions.create(
@@ -259,14 +261,14 @@ def run_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded audio:", result)
+    print("Chat completion output from base64 encoded audio:\n", result)
 
 
-def run_multi_audio(model: str) -> None:
+def run_multi_audio(model: str, max_completion_tokens: int) -> None:
     from vllm.assets.audio import AudioAsset
 
     # Two different audios to showcase batched inference.
@@ -300,11 +302,11 @@ def run_multi_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from input audio:\n", result)
 
 
 example_function_map = {
@@ -330,13 +332,20 @@ def parse_args():
         choices=list(example_function_map.keys()),
         help="Conversation type with multimodal data.",
     )
+    parser.add_argument(
+        "--max-completion-tokens",
+        "-n",
+        type=int,
+        default=128,
+        help="Maximum number of tokens to generate for each completion.",
+    )
     return parser.parse_args()
 
 
 def main(args) -> None:
     chat_type = args.chat_type
     model = get_first_model(client)
-    example_function_map[chat_type](model)
+    example_function_map[chat_type](model, args.max_completion_tokens)
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
index 41dbb3236297c37b5f9c51883551418575f702f3..0bd1d05322f81633f4fbe8f2f4a8cde9d6ade65e 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -161,6 +161,7 @@ def main():
         {
             "role": "assistant",
             "tool_calls": chat_completion.choices[0].message.tool_calls,
+            "reasoning": chat_completion.choices[0].message.reasoning,
         }
     )
 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 6ff65b18f66740082861e2b2efee379aac919dbe..c00d712b351d77825a115f3162f5b31202fd5aea 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -5,7 +5,7 @@ To run this example, you can start the vLLM server
 without any specific flags:
 
 ```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
     --structured-outputs-config.backend outlines
 ```
 
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index 4006d07f73b0011452b4076a86a415f303499106..1dfc3084646ddac943132adac4065c5a59b46fa1 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example demonstrates how to use tool calling with reasoning models 
-like QwQ-32B. The reasoning_content will not be parsed by the tool 
+like QwQ-32B. The reasoning will not be parsed by the tool 
 calling process; only the final output will be parsed.
 
 To run this example, you need to start the vLLM server with both 
@@ -78,7 +78,7 @@ messages = [
 
 
 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -97,9 +97,9 @@ def extract_reasoning_and_calls(chunks: list):
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names
 
 
 def main():
@@ -115,7 +115,7 @@ def main():
     tool_calls = client.chat.completions.create(
         messages=messages, model=model, tools=tools
     )
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
     print(
         f"function arguments: "
@@ -129,9 +129,9 @@ def main():
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
 
-    print(f"reasoning_content: {reasoning_content}")
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
 
@@ -144,7 +144,7 @@ def main():
     )
 
     tool_call = tool_calls.choices[0].message.tool_calls[0].function
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_call.name}")
     print(f"function arguments: {tool_call.arguments}")
     print("----------Stream Generate With Named Function Calling--------------")
@@ -159,8 +159,8 @@ def main():
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    print(f"reasoning_content: {reasoning_content}")
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
     print("\n\n")
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index 932dbeb2e7a2425c65af70eb8201f51a9029874f..87043897b058c6c4adcda520c1c7923b0f8e48b3 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -38,10 +38,10 @@ def main():
     # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content for Round 1:", reasoning_content)
+    print("reasoning for Round 1:", reasoning)
     print("content for Round 1:", content)
 
     # Round 2
@@ -54,10 +54,10 @@ def main():
     )
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content for Round 2:", reasoning_content)
+    print("reasoning for Round 2:", reasoning)
     print("content for Round 2:", content)
 
 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 7d1ea37714599d7b7f2f1307ea237ab6e6f001f1..8e262701b72018352f6058a9dd11f27e93e31fab 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -20,7 +20,7 @@ in real-time as they are generated by the model. This is useful for scenarios
 where you want to display chat completions to the user as they are generated
 by the model.
 
-Remember to check content and reasoning_content exist in `ChatCompletionChunk`,
+Remember to check content and reasoning exist in `ChatCompletionChunk`,
 content may not exist leading to errors if you try to access it.
 """
 
@@ -47,22 +47,20 @@ def main():
     stream = client.chat.completions.create(model=model, messages=messages, stream=True)
 
     print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False
 
     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
-        )
+        reasoning = getattr(chunk.choices[0].delta, "reasoning", None) or None
         content = getattr(chunk.choices[0].delta, "content", None) or None
 
-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..276010197b5ab0e66a87c143912015e04adb8a44
--- /dev/null
+++ b/examples/online_serving/openai_responses_client_with_tools.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Set up this example by starting a vLLM OpenAI-compatible server with tool call
+options enabled.
+Reasoning models can be used through the Responses API as seen here 
+https://platform.openai.com/docs/api-reference/responses
+For example:
+vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
+      --structured-outputs-config.backend xgrammar \
+      --enable-auto-tool-choice --tool-call-parser hermes
+"""
+
+import json
+
+from openai import OpenAI
+from utils import get_first_model
+
+
+def get_weather(latitude: float, longitude: float) -> str:
+    """
+    Mock function to simulate getting weather data.
+    In a real application, this would call an external weather API.
+    """
+    return f"Current temperature at ({latitude}, {longitude}) is 20°C."
+
+
+tools = [
+    {
+        "type": "function",
+        "name": "get_weather",
+        "description": "Get current temperature for provided coordinates in celsius.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "latitude": {"type": "number"},
+                "longitude": {"type": "number"},
+            },
+            "required": ["latitude", "longitude"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }
+]
+
+input_messages = [
+    {"role": "user", "content": "What's the weather like in Paris today?"}
+]
+
+
+def main():
+    base_url = "http://0.0.0.0:8000/v1"
+    client = OpenAI(base_url=base_url, api_key="empty")
+    model = get_first_model(client)
+    response = client.responses.create(
+        model=model, input=input_messages, tools=tools, tool_choice="required"
+    )
+
+    for out in response.output:
+        if out.type == "function_call":
+            print("Function call:", out.name, out.arguments)
+            tool_call = out
+    args = json.loads(tool_call.arguments)
+    result = get_weather(args["latitude"], args["longitude"])
+
+    input_messages.append(tool_call)  # append model's function call message
+    input_messages.append(
+        {  # append result message
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }
+    )
+    response_2 = client.responses.create(
+        model=model,
+        input=input_messages,
+        tools=tools,
+    )
+    print(response_2.output_text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md
index 2c271b6a32bc255057f32f395bcfaa034e5eb3d3..b76ad21f04818d3cb6912677cbc1574b9e49573d 100644
--- a/examples/online_serving/pooling/README.md
+++ b/examples/online_serving/pooling/README.md
@@ -3,47 +3,95 @@
 ## Cohere rerank usage
 
 ```bash
+# vllm serve BAAI/bge-reranker-base
 python examples/online_serving/pooling/cohere_rerank_client.py
 ```
 
+## Embedding requests base64 encoding_format usage
+
+```bash
+# vllm serve intfloat/e5-small
+python examples/online_serving/pooling/embedding_requests_base64_client.py
+```
+
+## Embedding requests bytes encoding_format usage
+
+```bash
+# vllm serve intfloat/e5-small
+python examples/online_serving/pooling/embedding_requests_bytes_client.py
+```
+
 ## Jinaai rerank usage
 
 ```bash
+# vllm serve BAAI/bge-reranker-base
 python examples/online_serving/pooling/jinaai_rerank_client.py
 ```
 
+## Multi vector retrieval usage
+
+```bash
+# vllm serve BAAI/bge-m3
+python examples/online_serving/pooling/multi_vector_retrieval_client.py
+```
+
 ## Named Entity Recognition (NER) usage
 
 ```bash
-python examples/online_serving/pooling/ner.py
+# vllm serve boltuix/NeuroBERT-NER
+python examples/online_serving/pooling/ner_client.py
 ```
 
-## Openai chat embedding for multimodal usage
+## OpenAI chat embedding for multimodal usage
 
 ```bash
 python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
 ```
 
-## Openai classification usage
+## OpenAI classification usage
 
 ```bash
+# vllm serve jason9693/Qwen2.5-1.5B-apeach
 python examples/online_serving/pooling/openai_classification_client.py
 ```
 
-## Openai embedding usage
+## OpenAI cross_encoder score usage
 
 ```bash
+# vllm serve BAAI/bge-reranker-v2-m3
+python examples/online_serving/pooling/openai_cross_encoder_score.py
+```
+
+## OpenAI cross_encoder score for multimodal usage
+
+```bash
+# vllm serve jinaai/jina-reranker-m0
+python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
+```
+
+## OpenAI embedding usage
+
+```bash
+# vllm serve intfloat/e5-small
 python examples/online_serving/pooling/openai_embedding_client.py
 ```
 
-## Openai embedding matryoshka dimensions usage
+## OpenAI embedding matryoshka dimensions usage
 
 ```bash
+# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
 python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
 ```
 
-## Openai pooling usage
+## OpenAI pooling usage
 
 ```bash
+# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
 python examples/online_serving/pooling/openai_pooling_client.py
 ```
+
+## Online Prithvi Geospatial MAE usage
+
+```bash
+python examples/online_serving/pooling/prithvi_geospatial_mae.py
+```
diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py
index 63c9ff9e9398031ea0a710296628af34e845f490..b32209967be9ac7c6e95f891102f257d916aab23 100644
--- a/examples/online_serving/pooling/cohere_rerank_client.py
+++ b/examples/online_serving/pooling/cohere_rerank_client.py
@@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """
 
-from typing import Union
-
 import cohere
 from cohere import Client, ClientV2
 
@@ -25,7 +23,7 @@ documents = [
 
 
 def cohere_rerank(
-    client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+    client: Client | ClientV2, model: str, query: str, documents: list[str]
 ) -> dict:
     return client.rerank(model=model, query=query, documents=documents)
 
diff --git a/examples/online_serving/pooling/embedding_requests_base64_client.py b/examples/online_serving/pooling/embedding_requests_base64_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c2399b58c11feb135eddbb47357ad314ea6fd9a
--- /dev/null
+++ b/examples/online_serving/pooling/embedding_requests_base64_client.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Example Python client for embedding API using vLLM API server
+NOTE:
+    start a supported embeddings model server with `vllm serve`, e.g.
+    vllm serve intfloat/e5-small
+"""
+
+import argparse
+import base64
+
+import requests
+import torch
+
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    binary2tensor,
+)
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="intfloat/e5-small")
+
+    return parser.parse_args()
+
+
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
+    model_name = args.model
+
+    # The OpenAI client does not support the embed_dtype and endianness parameters.
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            prompt = {
+                "model": model_name,
+                "input": "vLLM is great!",
+                "encoding_format": "base64",
+                "embed_dtype": embed_dtype,
+                "endianness": endianness,
+            }
+            response = post_http_request(prompt=prompt, api_url=api_url)
+
+            embedding = []
+            for data in response.json()["data"]:
+                binary = base64.b64decode(data["embedding"])
+                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
+                embedding.append(tensor.to(torch.float32))
+            embedding = torch.cat(embedding)
+            print(embed_dtype, endianness, embedding.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/pooling/embedding_requests_bytes_client.py b/examples/online_serving/pooling/embedding_requests_bytes_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2832f1b54ce7c5d91470a5788f565ef4ed8edf9
--- /dev/null
+++ b/examples/online_serving/pooling/embedding_requests_bytes_client.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Example Python client for embedding API using vLLM API server
+NOTE:
+    start a supported embeddings model server with `vllm serve`, e.g.
+    vllm serve intfloat/e5-small
+"""
+
+import argparse
+import json
+
+import requests
+import torch
+
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    MetadataItem,
+    decode_pooling_output,
+)
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="intfloat/e5-small")
+
+    return parser.parse_args()
+
+
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
+    model_name = args.model
+
+    # The OpenAI client does not support the bytes encoding_format.
+    # The OpenAI client does not support the embed_dtype and endianness parameters.
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            prompt = {
+                "model": model_name,
+                "input": "vLLM is great!",
+                "encoding_format": "bytes",
+                "embed_dtype": embed_dtype,
+                "endianness": endianness,
+            }
+            response = post_http_request(prompt=prompt, api_url=api_url)
+            metadata = json.loads(response.headers["metadata"])
+            body = response.content
+            items = [MetadataItem(**x) for x in metadata["data"]]
+
+            embedding = decode_pooling_output(items=items, body=body)
+            embedding = [x.to(torch.float32) for x in embedding]
+            embedding = torch.cat(embedding)
+            print(embed_dtype, endianness, embedding.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/online_serving/pooling/multi_vector_retrieval_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef8c4745aa5317f81d68660ba6110b5ebb55046a
--- /dev/null
+++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Example online usage of Pooling API for multi vector retrieval.
+
+Run `vllm serve <model> --runner pooling`
+to start up the server in vLLM. e.g.
+
+vllm serve BAAI/bge-m3
+"""
+
+import argparse
+
+import requests
+import torch
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="BAAI/bge-m3")
+
+    return parser.parse_args()
+
+
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/pooling"
+    model_name = args.model
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    prompt = {"model": model_name, "input": prompts}
+
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    for output in pooling_response.json()["data"]:
+        multi_vector = torch.tensor(output["data"])
+        print(multi_vector.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/pooling/ner.py b/examples/online_serving/pooling/ner_client.py
similarity index 100%
rename from examples/online_serving/pooling/ner.py
rename to examples/online_serving/pooling/ner_client.py
diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
index 30cb3325b9b183fdb5a084107d270f731761a8d9..261b810ce5d03f12cd354014a6cbee3729469309 100644
--- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
@@ -1,72 +1,130 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
-"""Example Python client for multimodal embedding API using vLLM API server
-NOTE:
-    start a supported multimodal embeddings model server with `vllm serve`, e.g.
-    vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
+"""Example Python client for multimodal embedding API using vLLM API server.
+
+Refer to each `run_*` function for the command to run the server for that model.
 """
 
 import argparse
 import base64
 import io
+from typing import Literal
 
-import requests
+from openai import OpenAI
+from openai._types import NOT_GIVEN, NotGiven
+from openai.types.chat import ChatCompletionMessageParam
+from openai.types.create_embedding_response import CreateEmbeddingResponse
 from PIL import Image
 
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
 
-def vlm2vec():
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "TIGER-Lab/VLM2Vec-Full",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": image_url}},
-                        {"type": "text", "text": "Represent the given image."},
-                    ],
-                }
-            ],
-            "encoding_format": "float",
-        },
+def create_chat_embeddings(
+    client: OpenAI,
+    *,
+    messages: list[ChatCompletionMessageParam],
+    model: str,
+    encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
+) -> CreateEmbeddingResponse:
+    """
+    Convenience function for accessing vLLM's Chat Embeddings API,
+    which is an extension of OpenAI's existing Embeddings API.
+    """
+    return client.post(
+        "/embeddings",
+        cast_to=CreateEmbeddingResponse,
+        body={"messages": messages, "model": model, "encoding_format": encoding_format},
     )
-    response.raise_for_status()
-    response_json = response.json()
 
-    print("Embedding output:", response_json["data"][0]["embedding"])
 
+def run_clip(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve openai/clip-vit-base-patch32 \
+        --runner pooling
+    """
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "a photo of a cat"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
+def run_dse_qwen2_vl(client: OpenAI, model: str):
+    """
+    Start the server using:
 
-def dse_qwen2_vl(inp: dict):
-    # Embedding an Image
-    if inp["type"] == "image":
-        messages = [
+    vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 8192 \
+        --chat-template examples/template_dse_qwen2_vl.jinja
+    """
+    response = create_chat_embeddings(
+        client,
+        messages=[
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": inp["image_url"],
+                            "url": image_url,
                         },
                     },
                     {"type": "text", "text": "What is shown in this image?"},
                 ],
             }
-        ]
-    # Embedding a Text Query
-    else:
-        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
-        # of the minimum input size
-        buffer = io.BytesIO()
-        image_placeholder = Image.new("RGB", (56, 56))
-        image_placeholder.save(buffer, "png")
-        buffer.seek(0)
-        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
-        messages = [
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
+    # of the minimum input size
+    buffer = io.BytesIO()
+    image_placeholder = Image.new("RGB", (56, 56))
+    image_placeholder.save(buffer, "png")
+    buffer.seek(0)
+    image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
+    response = create_chat_embeddings(
+        client,
+        messages=[
             {
                 "role": "user",
                 "content": [
@@ -76,23 +134,129 @@ def dse_qwen2_vl(inp: dict):
                             "url": f"data:image/jpeg;base64,{image_placeholder}",
                         },
                     },
-                    {"type": "text", "text": f"Query: {inp['content']}"},
+                    {"type": "text", "text": "Query: What is the weather like today?"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
+def run_siglip(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve google/siglip-base-patch16-224 \
+        --runner pooling
+    """
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
                 ],
             }
-        ]
-
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "MrLight/dse-qwen2-2b-mrl-v1",
-            "messages": messages,
-            "encoding_format": "float",
-        },
+        ],
+        model=model,
+        encoding_format="float",
     )
-    response.raise_for_status()
-    response_json = response.json()
 
-    print("Embedding output:", response_json["data"][0]["embedding"])
+    print("Image embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "a photo of a cat"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
+def run_vlm2vec(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve TIGER-Lab/VLM2Vec-Full \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 4096 \
+        --chat-template examples/template_vlm2vec_phi3v.jinja
+    """
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Represent the given image."},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": "Represent the given image with the following question: What is in the image.",
+                    },
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image+Text embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "A cat and a dog"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
+model_example_map = {
+    "clip": run_clip,
+    "dse_qwen2_vl": run_dse_qwen2_vl,
+    "siglip": run_siglip,
+    "vlm2vec": run_vlm2vec,
+}
 
 
 def parse_args():
@@ -103,29 +267,24 @@ def parse_args():
     parser.add_argument(
         "--model",
         type=str,
-        choices=["vlm2vec", "dse_qwen2_vl"],
+        choices=model_example_map.keys(),
         required=True,
-        help="Which model to call.",
+        help="The name of the embedding model.",
     )
     return parser.parse_args()
 
 
 def main(args):
-    if args.model == "vlm2vec":
-        vlm2vec()
-    elif args.model == "dse_qwen2_vl":
-        dse_qwen2_vl(
-            {
-                "type": "image",
-                "image_url": image_url,
-            }
-        )
-        dse_qwen2_vl(
-            {
-                "type": "text",
-                "content": "What is the weather like today?",
-            }
-        )
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model_id = models.data[0].id
+
+    model_example_map[args.model](client, model_id)
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/pooling/openai_cross_encoder_score.py
similarity index 100%
rename from examples/online_serving/openai_cross_encoder_score.py
rename to examples/online_serving/pooling/openai_cross_encoder_score.py
diff --git a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py b/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
similarity index 100%
rename from examples/online_serving/openai_cross_encoder_score_for_multimodal.py
rename to examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/pooling/prithvi_geospatial_mae.py
similarity index 90%
rename from examples/online_serving/prithvi_geospatial_mae.py
rename to examples/online_serving/pooling/prithvi_geospatial_mae.py
index 611a7cbc89fa2a98a998af3d6405fbbb37a0808f..a6246999c14d61bd5a596f9a4edc33e2176c4e05 100644
--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/pooling/prithvi_geospatial_mae.py
@@ -11,14 +11,15 @@ import requests
 # image as input, process it using the multimodal data processor, and
 # perform inference.
 # Requirements :
-# - install plugin at:
-#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+# - install TerraTorch v1.1 (or later):
+#   pip install terratorch>=v1.1
 # - start vllm in serving mode with the below args
 #   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
 #   --model-impl terratorch
 #   --task embed --trust-remote-code
 #   --skip-tokenizer-init --enforce-eager
-#   --io-processor-plugin prithvi_to_tiff
+#   --io-processor-plugin terratorch_segmentation
+#   --enable-mm-embeds
 
 
 def main():
@@ -34,7 +35,6 @@ def main():
         },
         "priority": 0,
         "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
-        "softmax": False,
     }
 
     ret = requests.post(server_endpoint, json=request_payload_url)
diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
index 37abc9de926fd1d7541e7ae0ffae4f9ef87e332f..1c89d459383094f52b8848d9aa82ba04ff500d37 100644
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -852,7 +852,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
+          "expr": "vllm:kv_cache_usage_perc{model_name=\"$model_name\"}",
           "instant": false,
           "legendFormat": "GPU Cache Usage",
           "range": true,
diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
index d24b553df27c7e1925fe541ef41f448c76fb1422..af53443b9101aa5c1ed657c69e9604720b51efab 100644
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
     },
     # Set to the node's accelerator type.
     accelerator_type="H100",
-    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
     # Customize engine arguments as required (for example, vLLM engine kwargs).
     engine_kwargs={
         "tensor_parallel_size": 8,
diff --git a/examples/online_serving/run_cluster.sh b/examples/online_serving/run_cluster.sh
index 522b9566212bb0f45ed251cf8eaa3e2c828daea7..0756d4b0ae556a5f5cbc37c39d3efc82d7a52d29 100644
--- a/examples/online_serving/run_cluster.sh
+++ b/examples/online_serving/run_cluster.sh
@@ -83,6 +83,29 @@ else
     RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
 fi
 
+# Parse VLLM_HOST_IP from additional args if present.
+# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
+VLLM_HOST_IP=""
+for arg in "${ADDITIONAL_ARGS[@]}"; do
+    if [[ $arg == "-e" ]]; then
+        continue
+    fi
+    if [[ $arg == VLLM_HOST_IP=* ]]; then
+        VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
+        break
+    fi
+done
+
+# Build Ray IP environment variables if VLLM_HOST_IP is set.
+# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
+RAY_IP_VARS=()
+if [ -n "${VLLM_HOST_IP}" ]; then
+    RAY_IP_VARS=(
+        -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
+        -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
+    )
+fi
+
 # Launch the container with the assembled parameters.
 # --network host: Allows Ray nodes to communicate directly via host networking
 # --shm-size 10.24g: Increases shared memory
@@ -95,5 +118,6 @@ docker run \
     --shm-size 10.24g \
     --gpus all \
     -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
+    "${RAY_IP_VARS[@]}" \
     "${ADDITIONAL_ARGS[@]}" \
     "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
diff --git a/examples/online_serving/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh
index 75a99ffc1f1556f838ffd9e29a1c66937d96b842..1a6b6780ef2a3ce1ec6eabdd91e5bf6bbde07e6e 100644
--- a/examples/online_serving/sagemaker-entrypoint.sh
+++ b/examples/online_serving/sagemaker-entrypoint.sh
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
 done < <(env | grep "^${PREFIX}")
 
 # Pass the collected arguments to the main entrypoint
-exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
\ No newline at end of file
+exec vllm serve "${ARGS[@]}"
\ No newline at end of file
diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py
index 64c8a91782806993496736dd849c3f1d4f2d80fb..d60dbf4d753a7a9b0d992d30c756626c79ac383d 100644
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
         for chunk in response:
             delta = chunk.choices[0].delta
             # Stream reasoning first
-            if reason and hasattr(delta, "reasoning_content") and live_think:
-                rc = delta.reasoning_content
+            if reason and hasattr(delta, "reasoning") and live_think:
+                rc = delta.reasoning
                 if rc:
                     think_text += rc
                     live_think.markdown(think_text + "▌")
@@ -262,8 +262,8 @@ def server_supports_reasoning():
         messages=[{"role": "user", "content": "Hi"}],
         stream=False,
     )
-    return hasattr(resp.choices[0].message, "reasoning_content") and bool(
-        resp.choices[0].message.reasoning_content
+    return hasattr(resp.choices[0].message, "reasoning") and bool(
+        resp.choices[0].message.reasoning
     )
 
 
diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md
index d2777a43d47828b0727bd70d8c5beb40164f9708..7f539716ecf8f4f6fac152226827d27dd020fd7c 100644
--- a/examples/online_serving/structured_outputs/README.md
+++ b/examples/online_serving/structured_outputs/README.md
@@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following:
 
 ```bash
 uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
-    structured-output
+    structured-outputs
 ```
 
 See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/online_serving/structured_outputs/pyproject.toml
index 8f31405ff584a8b421db4a7b5c775b6def6ff930..5e366ab0a03d3cbff74d78d563afb35d345d788c 100644
--- a/examples/online_serving/structured_outputs/pyproject.toml
+++ b/examples/online_serving/structured_outputs/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "examples-online-structured-outputs"
-requires-python = ">=3.9, <3.13"
+requires-python = ">=3.10, <3.14"
 dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
 version = "0.0.0"
 
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py
index 3ea6c73e90e8f6125cbcc34560fa868da6395eb6..ff473d044e3236e02dde7099b1943d904aa098fd 100644
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -1,21 +1,15 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal
 
 import openai
 import pydantic
-
-if TYPE_CHECKING:
-    from openai.types.chat import ChatCompletionChunk
-
+from openai.types.chat import ChatCompletionChunk
 
 ConstraintsFormat = Literal[
     "choice",
@@ -39,7 +33,7 @@ async def print_stream_response(
     async for chunk in stream_response:
         delta = chunk.choices[0].delta
 
-        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
+        reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
         content_chunk_text = delta.content
 
         if args.reasoning:
@@ -261,8 +255,8 @@ async def cli():
         for constraint, response in zip(constraints, results):
             print(f"\n\n{constraint}:")
             message = response.choices[0].message
-            if args.reasoning and hasattr(message, "reasoning_content"):
-                print(f"  Reasoning: {message.reasoning_content or ''}")
+            if args.reasoning and hasattr(message, "reasoning"):
+                print(f"  Reasoning: {message.reasoning or ''}")
             print(f"  Content: {message.content!r}")
 
 
diff --git a/examples/online_serving/token_generation_client.py b/examples/online_serving/token_generation_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..88ee43c5d9cdfe3eaf641b6a6472de059b777ae6
--- /dev/null
+++ b/examples/online_serving/token_generation_client.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import httpx
+from transformers import AutoTokenizer
+
+GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
+DUMMY_API_KEY = "empty"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+
+transport = httpx.HTTPTransport()
+headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
+client = httpx.Client(
+    transport=transport,
+    base_url=GEN_ENDPOINT,
+    timeout=600,
+    headers=headers,
+)
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "How many countries are in the EU?"},
+]
+
+
+def main(client):
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    token_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+    payload = {
+        "model": MODEL_NAME,
+        "token_ids": token_ids,
+        "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
+        "stream": False,
+    }
+    resp = client.post(GEN_ENDPOINT, json=payload)
+    resp.raise_for_status()
+    data = resp.json()
+    print(data)
+    print("-" * 50)
+    print("Token generation results:")
+    res = tokenizer.decode(data["choices"][0]["token_ids"])
+    print(res)
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    main(client)
diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py
index e10ee4e2a9a9ad18bf803ccdac82cd9041b2a48e..53036b3eb0ff32edb0582a6e86abe7e739573052 100644
--- a/examples/others/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
 from vllm.engine.arg_utils import EngineArgs
 
 
-def setup_environment_variables(vllm_version: str):
+def setup_environment_variables():
     # LMCache-related environment variables
     # Use experimental features in LMCache
     os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
     os.environ["LMCACHE_LOCAL_CPU"] = "True"
     # Set local CPU memory limit to 5.0 GB
     os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-    if vllm_version == "v0":
-        os.environ["VLLM_USE_V1"] = "0"
 
 
 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str):
     ktc = KVTransferConfig(
         kv_connector=lmcache_connector,
         kv_role="kv_both",
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    if vllm_version == "v0":
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-            enable_chunked_prefill=True,  # Only in v0
-        )
-    else:
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-        )
+    llm_args = EngineArgs(
+        model=model,
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+    )
 
     llm = LLM(**asdict(llm_args))
     try:
@@ -116,18 +105,10 @@ def parse_args():
 
 
 def main():
-    args = parse_args()
-
-    if args.version == "v0":
-        lmcache_connector = "LMCacheConnector"
-        model = "mistralai/Mistral-7B-Instruct-v0.2"
-    else:
-        lmcache_connector = "LMCacheConnectorV1"
-        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
-    setup_environment_variables(args.version)
-
-    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
+    lmcache_connector = "LMCacheConnectorV1"
+    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    setup_environment_variables()
+    with build_llm_with_lmcache(lmcache_connector, model) as llm:
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
         shared_prompt = "Hello, how are you?" * 1000
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
index 2b7f0beab2277f928da0bbf06c93e2d7c388e832..3644a03b32edef2f2fabad422518d890096eb618 100644
--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -16,13 +16,11 @@ from vllm.model_executor.model_loader.tensorizer import (
     tensorize_vllm_model,
     tensorizer_kwargs_arg,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 logger = logging.getLogger()
 
 
-# yapf conflicts with isort for this docstring
-# yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and 
 deserialize vLLM models. These models can be loaded using tensorizer 
@@ -86,7 +84,7 @@ directly to load models:
 from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1", 
-    load_format="tensorizer"
+    load_format="tensorizer",
 )
 ```
 
@@ -132,7 +130,8 @@ def get_parser():
         "can be loaded using tensorizer directly to the GPU "
         "extremely quickly. Tensor encryption and decryption is "
         "also supported, although libsodium must be installed to "
-        "use it.")
+        "use it."
+    )
     parser = EngineArgs.add_cli_args(parser)
 
     parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
         "along with the model by instantiating a TensorizerConfig object, "
         "creating a dict from it with TensorizerConfig.to_serializable(), "
         "and passing it to LoRARequest's initializer with the kwarg "
-        "tensorizer_config_dict."
+        "tensorizer_config_dict.",
     )
 
-    subparsers = parser.add_subparsers(dest='command', required=True)
+    subparsers = parser.add_subparsers(dest="command", required=True)
 
     serialize_parser = subparsers.add_parser(
-        'serialize', help="Serialize a model to `--serialized-directory`")
+        "serialize", help="Serialize a model to `--serialized-directory`"
+    )
 
     serialize_parser.add_argument(
         "--suffix",
@@ -163,7 +163,9 @@ def get_parser():
             "`--suffix` is `v1`, the serialized model tensors will be "
             "saved to "
             "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-            "If none is provided, a random UUID will be used."))
+            "If none is provided, a random UUID will be used."
+        ),
+    )
     serialize_parser.add_argument(
         "--serialized-directory",
         type=str,
@@ -175,108 +177,127 @@ def get_parser():
         "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
         "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
         "where `suffix` is given by `--suffix` or a random UUID if not "
-        "provided.")
+        "provided.",
+    )
 
     serialize_parser.add_argument(
         "--serialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's TensorSerializer during "
-              "serialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's TensorSerializer during "
+            "serialization."
+        ),
+    )
 
     serialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Encrypt the model weights with a randomly-generated binary key,"
-              " and save the key at this path"))
+        help=(
+            "Encrypt the model weights with a randomly-generated binary key,"
+            " and save the key at this path"
+        ),
+    )
 
     deserialize_parser = subparsers.add_parser(
-        'deserialize',
-        help=("Deserialize a model from `--path-to-tensors`"
-              " to verify it can be loaded and used."))
+        "deserialize",
+        help=(
+            "Deserialize a model from `--path-to-tensors`"
+            " to verify it can be loaded and used."
+        ),
+    )
 
     deserialize_parser.add_argument(
         "--path-to-tensors",
         type=str,
         required=False,
-        help="The local path or S3 URI to the model tensors to deserialize. ")
+        help="The local path or S3 URI to the model tensors to deserialize. ",
+    )
 
     deserialize_parser.add_argument(
         "--serialized-directory",
         type=str,
         required=False,
         help="Directory with model artifacts for loading. Assumes a "
-             "model.tensors file exists therein. Can supersede "
-             "--path-to-tensors.")
+        "model.tensors file exists therein. Can supersede "
+        "--path-to-tensors.",
+    )
 
     deserialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Path to a binary key to use to decrypt the model weights,"
-              " if the model was serialized with encryption"))
+        help=(
+            "Path to a binary key to use to decrypt the model weights,"
+            " if the model was serialized with encryption"
+        ),
+    )
 
     deserialize_parser.add_argument(
         "--deserialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's `TensorDeserializer` during "
-              "deserialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's `TensorDeserializer` during "
+            "deserialization."
+        ),
+    )
 
     TensorizerArgs.add_cli_args(deserialize_parser)
 
     return parser
 
-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
-                                              cfg: TensorizerConfig):
+
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
     for k, v in extra_cfg.items():
         if hasattr(cfg, k):
             setattr(cfg, k, v)
             logger.info(
                 "Updating TensorizerConfig with %s from "
-                "--model-loader-extra-config provided", k
+                "--model-loader-extra-config provided",
+                k,
             )
 
+
 def deserialize(args, tensorizer_config):
     if args.lora_path:
         tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config,
-                  enable_lora=True,
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
+            enable_lora=True,
         )
         sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=256,
-            stop=["[/assistant]"]
+            temperature=0, max_tokens=256, stop=["[/assistant]"]
         )
 
         # Truncating this as the extra text isn't necessary
-        prompts = [
-            "[user] Write a SQL query to answer the question based on ..."
-        ]
+        prompts = ["[user] Write a SQL query to answer the question based on ..."]
 
         # Test LoRA load
         print(
             llm.generate(
-            prompts,
-            sampling_params,
-            lora_request=LoRARequest("sql-lora",
-                                     1,
-                                     args.lora_path,
-                                     tensorizer_config_dict = tensorizer_config
-                                     .to_serializable())
+                prompts,
+                sampling_params,
+                lora_request=LoRARequest(
+                    "sql-lora",
+                    1,
+                    args.lora_path,
+                    tensorizer_config_dict=tensorizer_config.to_serializable(),
+                ),
             )
         )
     else:
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
         )
     return llm
 
@@ -285,17 +306,20 @@ def main():
     parser = get_parser()
     args = parser.parse_args()
 
-    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
-    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
-    s3_endpoint = (getattr(args, 's3_endpoint', None)
-                or os.environ.get("S3_ENDPOINT_URL", None))
+    s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
+        "S3_ACCESS_KEY_ID", None
+    )
+    s3_secret_access_key = getattr(
+        args, "s3_secret_access_key", None
+    ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+    s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+        "S3_ENDPOINT_URL", None
+    )
 
     credentials = {
         "s3_access_key_id": s3_access_key_id,
         "s3_secret_access_key": s3_secret_access_key,
-        "s3_endpoint": s3_endpoint
+        "s3_endpoint": s3_endpoint,
     }
 
     model_ref = args.model
@@ -309,25 +333,25 @@ def main():
     if args.model_loader_extra_config:
         extra_config = json.loads(args.model_loader_extra_config)
 
-
-    tensorizer_dir = (args.serialized_directory or
-                      extra_config.get("tensorizer_dir"))
-    tensorizer_uri = (getattr(args, "path_to_tensors", None)
-                      or extra_config.get("tensorizer_uri"))
+    tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
+    tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
+        "tensorizer_uri"
+    )
 
     if tensorizer_dir and tensorizer_uri:
-        parser.error("--serialized-directory and --path-to-tensors "
-                     "cannot both be provided")
+        parser.error(
+            "--serialized-directory and --path-to-tensors cannot both be provided"
+        )
 
     if not tensorizer_dir and not tensorizer_uri:
-        parser.error("Either --serialized-directory or --path-to-tensors "
-                     "must be provided")
-
+        parser.error(
+            "Either --serialized-directory or --path-to-tensors must be provided"
+        )
 
     if args.command == "serialize":
         engine_args = EngineArgs.from_cli_args(args)
 
-        input_dir = tensorizer_dir.rstrip('/')
+        input_dir = tensorizer_dir.rstrip("/")
         suffix = args.suffix if args.suffix else uuid.uuid4().hex
         base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
         if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
             tensorizer_uri=model_path,
             encryption_keyfile=keyfile,
             serialization_kwargs=args.serialization_kwargs or {},
-            **credentials
+            **credentials,
         )
 
         if args.lora_path:
             tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
             tensorize_lora_adapter(args.lora_path, tensorizer_config)
 
-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         tensorize_vllm_model(engine_args, tensorizer_config)
 
     elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
             tensorizer_dir=args.serialized_directory,
             encryption_keyfile=keyfile,
             deserialization_kwargs=args.deserialization_kwargs or {},
-            **credentials
+            **credentials,
         )
 
-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         deserialize(args, tensorizer_config)
     else:
         raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/examples/pyproject.toml b/examples/pyproject.toml
deleted file mode 100644
index f825cb203269cf2f2e17bf1f530cfb2a599559f9..0000000000000000000000000000000000000000
--- a/examples/pyproject.toml
+++ /dev/null
@@ -1,54 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
\ No newline at end of file
diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec_phi3v.jinja
similarity index 100%
rename from examples/template_vlm2vec.jinja
rename to examples/template_vlm2vec_phi3v.jinja
diff --git a/examples/template_vlm2vec_qwen2vl.jinja b/examples/template_vlm2vec_qwen2vl.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..3ab099d8f546d423f8d1c89943463af5ffac24b8
--- /dev/null
+++ b/examples/template_vlm2vec_qwen2vl.jinja
@@ -0,0 +1,15 @@
+{%- if messages | length > 1 -%}
+    {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+
+{% set vars = namespace(parts=[]) %}
+{%- for message in messages -%}
+    {%- for content in message['content'] -%}
+        {%- if content['type'] == 'text' -%}
+            {%- set vars.parts = vars.parts + [content['text']] %}
+        {%- elif content['type'] == 'image' -%}
+            {%- set vars.parts = vars.parts + ['<|image_pad|>'] %}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endfor -%}
+{{ vars.parts | join(' ') }}
diff --git a/format.sh b/format.sh
deleted file mode 100755
index 6ba93e0a19ba8339359a18a0ded32ab4fde54ffb..0000000000000000000000000000000000000000
--- a/format.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
-echo "Please run 'pip install -r requirements/lint.txt', followed by"
-echo "'pre-commit install' to install the pre-commit hooks."
-echo "Then linters will run automatically before each commit."
\ No newline at end of file
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 1535fcc622cdc326088299f605f02651371d60c0..bf97093dafb113f620802154d936ecfb93b64532 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -102,7 +102,6 @@ plugins:
           - https://numpy.org/doc/stable/objects.inv
           - https://pytorch.org/docs/stable/objects.inv
           - https://psutil.readthedocs.io/en/stable/objects.inv
-          - https://huggingface.co/docs/transformers/main/en/objects.inv
 
 markdown_extensions:
   - attr_list
@@ -143,8 +142,3 @@ extra_javascript:
   - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
   - mkdocs/javascript/edit_and_feedback.js
   - mkdocs/javascript/slack_and_forum.js
-
-# Makes the url format end in .html rather than act as a dir
-# So index.md generates as index.html and is available under URL /index.html
-# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
-use_directory_urls: false
diff --git a/pyproject.toml b/pyproject.toml
index 88c5c4067f5aef799c85bcdb7ed42f4dba289764..a250ab6567f1295f6989704df11452396437af29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,9 +4,9 @@ requires = [
     "cmake>=3.26.1",
     "ninja",
     "packaging>=24.2",
-    "setuptools>=77.0.3,<80.0.0",
+    "setuptools>=77.0.3,<81.0.0",
     "setuptools-scm>=8.0",
-    "torch == 2.8.0",
+    "torch == 2.9.0",
     "wheel",
     "jinja2",
 ]
@@ -20,7 +20,6 @@ license-files = ["LICENSE"]
 readme = "README.md"
 description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
 classifiers = [
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -31,7 +30,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Scientific/Engineering :: Information Analysis",
 ]
-requires-python = ">=3.9,<3.14"
+requires-python = ">=3.10,<3.14"
 dynamic = [ "version", "dependencies", "optional-dependencies"]
 
 [project.urls]
@@ -52,27 +51,10 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi
 where = ["."]
 include = ["vllm*"]
 
-[tool.yapfignore]
-ignore_patterns = [
-    ".buildkite/**",
-    "benchmarks/**",
-    "build/**",
-    "examples/**",
-]
-
-[tool.ruff]
-# Allow lines to be as long as 80.
-line-length = 80
-
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# Python 3.8 typing - skip V0 code
-"vllm/attention/**/*.py" = ["UP006", "UP035"]
-"vllm/engine/**/*.py" = ["UP006", "UP035"]
-"vllm/executor/**/*.py" = ["UP006", "UP035"]
-"vllm/worker/**/*.py" = ["UP006", "UP035"]
 
 [tool.ruff.lint]
 select = [
@@ -87,7 +69,7 @@ select = [
     # flake8-simplify
     "SIM",
     # isort
-    # "I",
+    "I",
     # flake8-logging-format
     "G",
 ]
@@ -96,29 +78,23 @@ ignore = [
     "F405", "F403",
     # lambda expression assignment
     "E731",
+    # zip without `strict=`
+    "B905",
     # Loop control variable not used within loop body
     "B007",
     # f-string format
     "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
 ]
 
+[tool.ruff.format]
+docstring-code-format = true
+
 [tool.mypy]
 plugins = ['pydantic.mypy']
 ignore_missing_imports = true
 check_untyped_defs = true
 follow_imports = "silent"
 
-[tool.isort]
-skip_glob = [
-    ".buildkite/*",
-    "benchmarks/*",
-    "examples/*",
-]
-use_parentheses = true
-skip_gitignore = true
-
 [tool.pytest.ini_options]
 markers = [
     "slow_test",
@@ -126,6 +102,7 @@ markers = [
     "core_model: enable this model test in each PR instead of only nightly",
     "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
     "cpu_model: enable this model test in CPU tests",
+    "cpu_test: mark test as CPU-only test",
     "split: run this test as part of a split",
     "distributed: run this test only in distributed GPU tests",
     "skip_v1: do not run this test with v1",
@@ -206,6 +183,7 @@ ba = "ba"
 
 [tool.typos.type.py.extend-words]
 ba = "ba"
+nd = "nd"
 
 [tool.typos.type.cpp]
 extend-glob = ["*.cu"]
diff --git a/requirements/build.txt b/requirements/build.txt
index 5f826a1afa1440399a70e064dc6009e064b5145a..23ff8d4fdc1c0624a107facb717bf78b3af7fbcd 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -2,9 +2,9 @@
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.8.0
+torch==2.9.0
 wheel
 jinja2>=3.1.6
 regex
diff --git a/requirements/common.txt b/requirements/common.txt
index 409f686974c7c5406ad7ce9404b5b2333d3b4c20..e17f1936ac62105393b2c037f262d1b785c2461e 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,39 +7,38 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.55.2
+transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
 openai >= 1.99.1  # For Responses API with reasoning content
-pydantic >= 2.11.7
+pydantic >= 2.12.0
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.11.3
-llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
+llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x"
 outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
-importlib_metadata; python_version < '3.10'
-mistral_common[image,audio] >= 1.5.4 # requires numpy>=1.25 #1.8.2
+mistral_common[image] >= 1.8.5
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
-setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
+setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.11.0 # required for compressed-tensors
-depyf==0.19.0 # required for profiling and debugging with compilation config
+compressed-tensors == 0.12.2 # required for compressed-tensors
+depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
@@ -49,3 +48,5 @@ pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
+anthropic == 0.71.0
+model-hosting-container-standards < 1.0.0
\ No newline at end of file
diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt
index b511b0f5d31b32d8507b4b8e39a8a77fe57bcef5..81d429a5e5f8df230afc011c85fca05502bb7d2a 100644
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
@@ -1,11 +1,13 @@
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.8.0+cpu; platform_machine == "x86_64"
-torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
+torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
+torch==2.9.0; platform_system == "Darwin"
+torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
+scons; platform_machine == "aarch64"    # needed to build Arm Compute Library (ACL)
 wheel
 jinja2>=3.1.6
 regex
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 2db6d87ee67bc139b53e7aa90caabe9ba8bd92a8..d11787df4d92bf279adb4b1a5622c36c5f904c27 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -1,15 +1,14 @@
 # Common dependencies
 -r common.txt
 
-numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
+numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
 
 # Dependencies for CPUs
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.8.0+cpu; platform_machine == "x86_64"
-torch==2.8.0; platform_system == "Darwin"
+torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
+torch==2.9.0; platform_system == "Darwin"
 torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 3f8b8fca3209aaca2471d192b8cb59c2ce540de4..d63fe9e1e77c12034d1b3e2c69bb14c85f98e274 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -1,14 +1,14 @@
 # Common dependencies
 -r common.txt
 
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
+numba == 0.61.2 # Required for N-gram speculative decoding
 
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.8.0
-torchaudio==2.8.0
+torch==2.9.0
+torchaudio==2.9.0
 # These must be updated alongside torch
-torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
-xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.8
+torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.9
+# FlashInfer should be updated together with the Dockerfile
+flashinfer-python==0.5.2
diff --git a/requirements/docs.txt b/requirements/docs.txt
index d1c546398780a196fcad0b0abc75e8b928014b75..32e004b2b64babc43d40f873d1add4d96c413d6f 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -9,10 +9,8 @@ mkdocs-git-revision-date-localized-plugin
 mkdocs-minify-plugin
 regex
 ruff
-
-# Required for argparse hook only
--f https://download.pytorch.org/whl/cpu
-cachetools
-msgspec
 pydantic
-torch
+
+# For generating argparse docs.
+# Adding requirements here should only be used as a last resort.
+msgspec  # Need for multiple inheritance involving msgspec.Struct
\ No newline at end of file
diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt
index 3b610e0d97363dba0c9d664bf9dfb3ea0ac1a930..b1f3269cd381322c53ad7845a09a1e3ddb15017f 100644
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@@ -1,2 +1,2 @@
 lmcache
-nixl >= 0.5.1 # Required for disaggregated prefill
+nixl >= 0.6.0 # Required for disaggregated prefill
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index a529bf4504e40ed552bdf454935ce1a563e212c8..d9c5d89c1d52fb5f98311e347fc50c4f7ed8fc80 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -23,14 +23,14 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.2 # required for voxtral test
+mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
-transformers==4.52.4
-tokenizers==0.21.1
+transformers==4.57.1
+tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.46.1
@@ -40,10 +40,8 @@ buildkite-test-collector==0.1.9
 genai_perf==0.0.8
 tritonclient==2.51.0
 
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
+numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer==0.11.0
-runai-model-streamer-s3==0.11.0
+runai-model-streamer[s3,gcs]==0.15.0
 fastsafetensors>=0.1.10
-pydantic>=2.10 # 2.9 leads to error on python 3.10
+pydantic>=2.12 # 2.11 leads to error on python 3.13
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index a86a8ab6df14951e2d44f962eeac0802f1a28cb2..b977e80be067f65024d0ddf1b356b60e6fee73b6 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -1,17 +1,17 @@
 # Common dependencies
 -r common.txt
 
---extra-index-url https://download.pytorch.org/whl/rocm6.3
-torch==2.8.0
-torchvision==0.23.0
-torchaudio==2.8.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.4
+torch==2.9.0
+torchvision==0.24.0
+torchaudio==2.9.0
 
-triton==3.3.0
+triton==3.5.0
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 wheel
 jinja2>=3.1.6
-amdsmi==6.2.4
+amdsmi==6.4.3
 timm>=1.0.17
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 869fb28c3d85cea640d4efaf2e9a83cd7ba974d5..432e11977872d77ac23bcbd59c80e013a3c56c6b 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -1,8 +1,10 @@
 # Common dependencies
 -r common.txt
 tblib==3.1.0
+bm25s==0.2.13
+pystemmer==3.0.0
 
-# entrypoints test
+# Entrypoints test
 # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
 audioread==3.0.1
 cffi==1.17.1
@@ -15,11 +17,11 @@ soundfile==0.13.1
 soxr==0.5.0.post1
 librosa==0.10.2.post1
 
-# entrypoints test
+# Entrypoints test
 #vllm[video] # required by entrypoints/openai/test_video.py
 decord==0.6.0
 
-# entrypoints test
+# Entrypoints test
 #sentence-transformers # required by entrypoints/openai/test_score.py
 sentence-transformers==3.4.1
 
@@ -29,4 +31,11 @@ matplotlib==3.10.3
 # Multi-Modal Models Test (Extended) 3
 blobfile==3.0.0
 
+# Required for openai schema test.
+schemathesis==3.39.15
 
+# Required for mteb test
+mteb[bm25s]>=1.38.11, <2
+
+# Required for eval tests
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index eb45a70a41ace8ea901daaf62f37e16df39eb4f6..d81b864ce094f89884b12657259af993a2d66b7b 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -1,12 +1,9 @@
 # Common dependencies
 -r common.txt
 
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
+numba == 0.61.2 # Required for N-gram speculative decoding
 
 # Dependencies for AMD GPUs
-boto3
-botocore
 datasets
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
 peft
@@ -15,7 +12,6 @@ tensorizer==2.10.1
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-runai-model-streamer==0.11.0
-runai-model-streamer-s3==0.11.0
+runai-model-streamer[s3,gcs]==0.15.0
 # conch-triton-kernels==1.2.1
 timm>=1.0.17
diff --git a/requirements/test.in b/requirements/test.in
index 451bd738791075e112bb6e6a897727fd1ed25cef..30d97e9b9c7d05c71f04f53f465dfc47e442cddb 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,12 +24,12 @@ soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.8.0
-torchaudio==2.8.0
-torchvision==0.23.0
+torch==2.9.0
+torchaudio==2.9.0
+torchvision==0.24.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.2 # required for voxtral test
+mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
@@ -37,8 +37,8 @@ datamodel_code_generator # required for minicpm3 test
 # TODO: Use lm-eval[api]==0.4.10 once released
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.55.2
-tokenizers==0.21.1
+transformers==4.57.1
+tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes==0.46.1
@@ -48,12 +48,12 @@ buildkite-test-collector==0.1.9
 genai_perf==0.0.8
 tritonclient==2.51.0
 
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
+arctic-inference == 0.1.1 # Required for suffix decoding test
+numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer==0.11.0
-runai-model-streamer-s3==0.11.0
+runai-model-streamer[s3,gcs]==0.15.0
 fastsafetensors>=0.1.10
-pydantic>=2.10 # 2.9 leads to error on python 3.10
+pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
+gpt-oss >= 0.0.7; python_version > '3.11'
diff --git a/requirements/test.txt b/requirements/test.txt
index 3519aa524f4189b845ba14becbbaba204892c641..3263b74c08797b582808507cbf8069ff3d2cfbf3 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,5 +1,5 @@
 # This file was autogenerated by uv via the following command:
-#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
+#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
 absl-py==2.1.0
     # via rouge-score
 accelerate==1.0.1
@@ -10,18 +10,19 @@ aenum==3.1.16
     # via lightly
 affine==2.4.0
     # via rasterio
-aiohappyeyeballs==2.4.3
+aiohappyeyeballs==2.6.1
     # via aiohttp
-aiohttp==3.10.11
+aiohttp==3.13.0
     # via
     #   aiohttp-cors
     #   datasets
     #   fsspec
+    #   gpt-oss
     #   lm-eval
     #   ray
 aiohttp-cors==0.8.1
     # via ray
-aiosignal==1.3.1
+aiosignal==1.4.0
     # via aiohttp
 albucore==0.0.16
     # via terratorch
@@ -39,6 +40,8 @@ anyio==4.6.2.post1
     # via
     #   httpx
     #   starlette
+arctic-inference==0.1.1
+    # via -r requirements/test.in
 argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
@@ -72,7 +75,9 @@ blobfile==3.0.0
 bm25s==0.2.13
     # via mteb
 boto3==1.35.57
-    # via tensorizer
+    # via
+    #   runai-model-streamer-s3
+    #   tensorizer
 botocore==1.35.57
     # via
     #   boto3
@@ -101,6 +106,8 @@ chardet==5.2.0
     # via mbstrdecoder
 charset-normalizer==3.4.0
     # via requests
+chz==0.3.0
+    # via gpt-oss
 click==8.1.7
     # via
     #   black
@@ -171,7 +178,9 @@ distlib==0.3.9
 dnspython==2.7.0
     # via email-validator
 docker==7.1.0
-    # via mlflow
+    # via
+    #   gpt-oss
+    #   mlflow
 docopt==0.6.2
     # via num2words
 docstring-parser==0.17.0
@@ -197,7 +206,9 @@ eval-type-backport==0.2.2
 evaluate==0.4.3
     # via lm-eval
 fastapi==0.116.1
-    # via mlflow-skinny
+    # via
+    #   gpt-oss
+    #   mlflow-skinny
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -249,13 +260,31 @@ gitdb==4.0.12
 gitpython==3.1.44
     # via mlflow-skinny
 google-api-core==2.24.2
-    # via opencensus
+    # via
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   opencensus
 google-auth==2.40.2
     # via
     #   databricks-sdk
     #   google-api-core
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   runai-model-streamer-gcs
+google-cloud-core==2.4.3
+    # via google-cloud-storage
+google-cloud-storage==3.4.0
+    # via runai-model-streamer-gcs
+google-crc32c==1.7.1
+    # via
+    #   google-cloud-storage
+    #   google-resumable-media
+google-resumable-media==2.7.2
+    # via google-cloud-storage
 googleapis-common-protos==1.70.0
     # via google-api-core
+gpt-oss==0.0.8
+    # via -r requirements/test.in
 graphene==3.4.3
     # via mlflow
 graphql-core==3.2.6
@@ -283,6 +312,8 @@ hf-xet==1.1.7
     # via huggingface-hub
 hiredis==3.0.0
     # via tensorizer
+html2text==2025.4.15
+    # via gpt-oss
 httpcore==1.0.6
     # via httpx
 httpx==0.27.2
@@ -417,6 +448,7 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215
 lxml==5.3.0
     # via
     #   blobfile
+    #   gpt-oss
     #   sacrebleu
 mako==1.3.10
     # via alembic
@@ -444,7 +476,7 @@ mbstrdecoder==1.1.3
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.8.2
+mistral-common==1.8.5
     # via -r requirements/test.in
 mlflow==2.22.0
     # via terratorch
@@ -543,42 +575,44 @@ numpy==1.26.4
     #   tritonclient
     #   vocos
     #   xarray
-nvidia-cublas-cu12==12.8.4.1
+nvidia-cublas-cu12==12.9.1.4
     # via
     #   nvidia-cudnn-cu12
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-cupti-cu12==12.9.79
     # via torch
-nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-nvrtc-cu12==12.9.86
     # via torch
-nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cuda-runtime-cu12==12.9.79
     # via torch
 nvidia-cudnn-cu12==9.10.2.21
     # via torch
-nvidia-cufft-cu12==11.3.3.83
+nvidia-cufft-cu12==11.4.1.4
     # via torch
-nvidia-cufile-cu12==1.13.1.3
+nvidia-cufile-cu12==1.14.1.1
     # via torch
-nvidia-curand-cu12==10.3.9.90
+nvidia-curand-cu12==10.3.10.19
     # via torch
-nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusolver-cu12==11.7.5.82
     # via torch
-nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparse-cu12==12.5.10.65
     # via
     #   nvidia-cusolver-cu12
     #   torch
 nvidia-cusparselt-cu12==0.7.1
     # via torch
-nvidia-nccl-cu12==2.27.3
+nvidia-nccl-cu12==2.27.5
     # via torch
-nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvjitlink-cu12==12.9.86
     # via
     #   nvidia-cufft-cu12
     #   nvidia-cusolver-cu12
     #   nvidia-cusparse-cu12
     #   torch
-nvidia-nvtx-cu12==12.8.90
+nvidia-nvshmem-cu12==3.3.20
+    # via torch
+nvidia-nvtx-cu12==12.9.79
     # via torch
 omegaconf==2.3.0
     # via
@@ -586,6 +620,8 @@ omegaconf==2.3.0
     #   lightning
 open-clip-torch==2.32.0
     # via -r requirements/test.in
+openai-harmony==0.0.4
+    # via gpt-oss
 opencensus==0.11.4
     # via ray
 opencensus-context==0.1.3
@@ -706,7 +742,9 @@ prometheus-client==0.22.0
     #   opentelemetry-exporter-prometheus
     #   ray
 propcache==0.2.0
-    # via yarl
+    # via
+    #   aiohttp
+    #   yarl
 proto-plus==1.26.1
     # via google-api-core
 protobuf==5.28.3
@@ -749,19 +787,21 @@ pycparser==2.22
     # via cffi
 pycryptodomex==3.22.0
     # via blobfile
-pydantic==2.11.7
+pydantic==2.12.0
     # via
     #   -r requirements/test.in
     #   albumentations
     #   datamodel-code-generator
     #   fastapi
+    #   gpt-oss
     #   lightly
     #   mistral-common
     #   mlflow-skinny
     #   mteb
+    #   openai-harmony
     #   pydantic-extra-types
     #   ray
-pydantic-core==2.33.2
+pydantic-core==2.41.1
     # via pydantic
 pydantic-extra-types==2.10.5
     # via mistral-common
@@ -888,6 +928,8 @@ requests==2.32.3
     #   docker
     #   evaluate
     #   google-api-core
+    #   google-cloud-storage
+    #   gpt-oss
     #   huggingface-hub
     #   lightly
     #   lm-eval
@@ -925,10 +967,12 @@ rsa==4.9.1
     # via google-auth
 rtree==1.4.0
     # via torchgeo
-runai-model-streamer==0.11.0
-    # via -r requirements/test.in
-runai-model-streamer-s3==0.11.0
+runai-model-streamer==0.15.0
     # via -r requirements/test.in
+runai-model-streamer-gcs==0.15.0
+    # via runai-model-streamer
+runai-model-streamer-s3==0.15.0
+    # via runai-model-streamer
 s3transfer==0.10.3
     # via boto3
 sacrebleu==2.4.3
@@ -972,14 +1016,11 @@ sentence-transformers==3.2.1
     # via
     #   -r requirements/test.in
     #   mteb
-sentencepiece==0.2.0
-    # via mistral-common
 setuptools==77.0.3
     # via
     #   lightning-utilities
     #   pytablewriter
     #   torch
-    #   triton
 shapely==2.1.1
     # via
     #   geopandas
@@ -1031,6 +1072,8 @@ starlette-testclient==0.4.1
     # via schemathesis
 statsmodels==0.14.4
     # via genai-perf
+structlog==25.4.0
+    # via gpt-oss
 sympy==1.13.3
     # via
     #   einx
@@ -1043,14 +1086,17 @@ tblib==3.1.0
     # via -r requirements/test.in
 tcolorpy==0.1.6
     # via pytablewriter
-tenacity==9.0.0
+tenacity==9.1.2
     # via
+    #   gpt-oss
     #   lm-eval
     #   plotly
 tensorboardx==2.6.4
     # via lightning
 tensorizer==2.10.1
     # via -r requirements/test.in
+termcolor==3.1.0
+    # via gpt-oss
 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
     # via -r requirements/test.in
 threadpoolctl==3.5.0
@@ -1059,8 +1105,9 @@ tifffile==2025.3.30
     # via
     #   scikit-image
     #   terratorch
-tiktoken==0.7.0
+tiktoken==0.12.0
     # via
+    #   gpt-oss
     #   lm-eval
     #   mistral-common
 timm==1.0.17
@@ -1070,7 +1117,7 @@ timm==1.0.17
     #   segmentation-models-pytorch
     #   terratorch
     #   torchgeo
-tokenizers==0.21.1
+tokenizers==0.22.0
     # via
     #   -r requirements/test.in
     #   transformers
@@ -1078,7 +1125,7 @@ tomli==2.2.1
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torch==2.8.0+cu128
+torch==2.9.0+cu129
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -1107,7 +1154,7 @@ torch==2.8.0+cu128
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.8.0+cu128
+torchaudio==2.9.0+cu129
     # via
     #   -r requirements/test.in
     #   encodec
@@ -1120,7 +1167,7 @@ torchmetrics==1.7.4
     #   pytorch-lightning
     #   terratorch
     #   torchgeo
-torchvision==0.23.0+cu128
+torchvision==0.24.0+cu129
     # via
     #   -r requirements/test.in
     #   lightly
@@ -1151,7 +1198,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.55.2
+transformers==4.57.1
     # via
     #   -r requirements/test.in
     #   genai-perf
@@ -1161,7 +1208,7 @@ transformers==4.55.2
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.4.0
+triton==3.5.0
     # via torch
 tritonclient==2.51.0
     # via
@@ -1178,10 +1225,12 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typeshed-client==2.8.2
     # via jsonargparse
-typing-extensions==4.12.2
+typing-extensions==4.15.0
     # via
+    #   aiosignal
     #   albumentations
     #   alembic
+    #   chz
     #   fastapi
     #   graphene
     #   huggingface-hub
@@ -1205,7 +1254,7 @@ typing-extensions==4.12.2
     #   typer
     #   typeshed-client
     #   typing-inspection
-typing-inspection==0.4.1
+typing-inspection==0.4.2
     # via pydantic
 tzdata==2024.2
     # via pandas
@@ -1221,7 +1270,9 @@ urllib3==2.2.3
     #   responses
     #   tritonclient
 uvicorn==0.35.0
-    # via mlflow-skinny
+    # via
+    #   gpt-oss
+    #   mlflow-skinny
 vector-quantize-pytorch==1.21.2
     # via -r requirements/test.in
 virtualenv==20.31.2
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index 74f5b05b2382ae993f0cc24fce4d6a9508833989..59ea710684a2c8056307f8c90114e395d9dfff47 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -5,15 +5,14 @@ ray>=2.9
 cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
-numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
+numba == 0.61.2 # Required for N-gram speculative decoding
 torch==2.8.0+xpu
 torchaudio
 torchvision
 --extra-index-url=https://download.pytorch.org/whl/xpu
 
-intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
diff --git a/setup.py b/setup.py
index 1f0e34a3db1b11f6f2139a5d6e26518936284f83..f1abe38374188ecd9856133567b5233352f2fb01 100644
--- a/setup.py
+++ b/setup.py
@@ -34,32 +34,36 @@ logger = logging.getLogger(__name__)
 
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
-envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
+envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
 if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
-    logger.warning(
-        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
+    logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
     VLLM_TARGET_DEVICE = "cpu"
-elif not (sys.platform.startswith("linux")
-          or sys.platform.startswith("darwin")):
+elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")):
     logger.warning(
         "vLLM only supports Linux platform (including WSL) and MacOS."
         "Building on %s, "
-        "so vLLM may not be able to run correctly", sys.platform)
+        "so vLLM may not be able to run correctly",
+        sys.platform,
+    )
     VLLM_TARGET_DEVICE = "empty"
-elif (sys.platform.startswith("linux") and torch.version.cuda is None
-      and os.getenv("VLLM_TARGET_DEVICE") is None
-      and torch.version.hip is None):
+elif (
+    sys.platform.startswith("linux")
+    and torch.version.cuda is None
+    and os.getenv("VLLM_TARGET_DEVICE") is None
+    and torch.version.hip is None
+):
     # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"
 
 
 def is_sccache_available() -> bool:
-    return which("sccache") is not None and \
-        not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0")))
+    return which("sccache") is not None and not bool(
+        int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))
+    )
 
 
 def is_ccache_available() -> bool:
@@ -83,8 +87,7 @@ def is_url_available(url: str) -> bool:
 
 
 class CMakeExtension(Extension):
-
-    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
+    def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
         super().__init__(name, sources=[], py_limited_api=True, **kwa)
         self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
 
@@ -121,8 +124,8 @@ class cmake_build_ext(build_ext):
             if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.",
-                    nvcc_threads)
+                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
+                )
             else:
                 nvcc_threads = 1
             num_jobs = max(1, num_jobs // nvcc_threads)
@@ -146,36 +149,36 @@ class cmake_build_ext(build_ext):
         cfg = envs.CMAKE_BUILD_TYPE or default_cfg
 
         cmake_args = [
-            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
-            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
+            "-DCMAKE_BUILD_TYPE={}".format(cfg),
+            "-DVLLM_TARGET_DEVICE={}".format(VLLM_TARGET_DEVICE),
         ]
 
         verbose = envs.VERBOSE
         if verbose:
-            cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
+            cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"]
 
         if is_sccache_available():
             cmake_args += [
-                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
+                "-DCMAKE_C_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache",
             ]
         elif is_ccache_available():
             cmake_args += [
-                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
-                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
-                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
-                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
+                "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache",
             ]
 
         # Pass the python executable to cmake so it can find an exact
         # match.
-        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
+        cmake_args += ["-DVLLM_PYTHON_EXECUTABLE={}".format(sys.executable)]
 
         # Pass the python path to cmake so it can reuse the build dependencies
         # on subsequent calls to python.
-        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
+        cmake_args += ["-DVLLM_PYTHON_PATH={}".format(":".join(sys.path))]
 
         # Override the base directory for FetchContent downloads to $ROOT/.deps
         # This allows sharing dependencies between profiles,
@@ -183,7 +186,7 @@ class cmake_build_ext(build_ext):
         # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
         fc_base_dir = os.path.join(ROOT_DIR, ".deps")
         fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
-        cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
+        cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
 
         #
         # Setup parallelism and build tool
@@ -191,30 +194,38 @@ class cmake_build_ext(build_ext):
         num_jobs, nvcc_threads = self.compute_num_jobs()
 
         if nvcc_threads:
-            cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]
+            cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)]
 
         if is_ninja_available():
-            build_tool = ['-G', 'Ninja']
+            build_tool = ["-G", "Ninja"]
             cmake_args += [
-                '-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
-                '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
+                "-DCMAKE_JOB_POOL_COMPILE:STRING=compile",
+                "-DCMAKE_JOB_POOLS:STRING=compile={}".format(num_jobs),
             ]
         else:
             # Default build tool to whatever cmake picks.
             build_tool = []
         # Make sure we use the nvcc from CUDA_HOME
         if _is_cuda():
-            cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
+            cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
+        elif _is_hip():
+            cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
+
+        other_cmake_args = os.environ.get("CMAKE_ARGS")
+        if other_cmake_args:
+            cmake_args += other_cmake_args.split()
+
         subprocess.check_call(
-            ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
-            cwd=self.build_temp)
+            ["cmake", ext.cmake_lists_dir, *build_tool, *cmake_args],
+            cwd=self.build_temp,
+        )
 
     def build_extensions(self) -> None:
         # Ensure that CMake is present and working
         try:
-            subprocess.check_output(['cmake', '--version'])
+            subprocess.check_output(["cmake", "--version"])
         except OSError as e:
-            raise RuntimeError('Cannot find CMake executable') from e
+            raise RuntimeError("Cannot find CMake executable") from e
 
         # Create build directory if it does not exist.
         if not os.path.exists(self.build_temp):
@@ -253,13 +264,18 @@ class cmake_build_ext(build_ext):
             # CMake appends the extension prefix to the install path,
             # and outdir already contains that prefix, so we need to remove it.
             prefix = outdir
-            for _ in range(ext.name.count('.')):
+            for _ in range(ext.name.count(".")):
                 prefix = prefix.parent
 
             # prefix here should actually be the same for all components
             install_args = [
-                "cmake", "--install", ".", "--prefix", prefix, "--component",
-                target_name(ext.name)
+                "cmake",
+                "--install",
+                ".",
+                "--prefix",
+                prefix,
+                "--component",
+                target_name(ext.name),
             ]
             subprocess.check_call(install_args, cwd=self.build_temp)
 
@@ -270,12 +286,15 @@ class cmake_build_ext(build_ext):
         # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
         # directory so that they can be included in the editable build
         import glob
-        files = glob.glob(os.path.join(self.build_lib, "vllm",
-                                       "vllm_flash_attn", "**", "*.py"),
-                          recursive=True)
+
+        files = glob.glob(
+            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "**", "*.py"),
+            recursive=True,
+        )
         for file in files:
-            dst_file = os.path.join("vllm/vllm_flash_attn",
-                                    file.split("vllm/vllm_flash_attn/")[-1])
+            dst_file = os.path.join(
+                "vllm/vllm_flash_attn", file.split("vllm/vllm_flash_attn/")[-1]
+            )
             print(f"Copying {file} to {dst_file}")
             os.makedirs(os.path.dirname(dst_file), exist_ok=True)
             self.copy_file(file, dst_file)
@@ -285,8 +304,7 @@ class precompiled_build_ext(build_ext):
     """Disables extension building when using precompiled binaries."""
 
     def run(self) -> None:
-        assert _is_cuda(
-        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+        assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
 
     def build_extensions(self) -> None:
         print("Skipping build_ext: using precompiled extensions.")
@@ -307,9 +325,9 @@ class precompiled_wheel_utils:
                 wheel_filename = wheel_url_or_path.split("/")[-1]
                 temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
                 wheel_path = os.path.join(temp_dir, wheel_filename)
-                print(f"Downloading wheel from {wheel_url_or_path} "
-                      f"to {wheel_path}")
+                print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
                 from urllib.request import urlretrieve
+
                 urlretrieve(wheel_url_or_path, filename=wheel_path)
             else:
                 wheel_path = wheel_url_or_path
@@ -330,25 +348,29 @@ class precompiled_wheel_utils:
                 ]
 
                 compiled_regex = re.compile(
-                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+                )
                 file_members = list(
-                    filter(lambda x: x.filename in files_to_copy,
-                           wheel.filelist))
+                    filter(lambda x: x.filename in files_to_copy, wheel.filelist)
+                )
                 file_members += list(
-                    filter(lambda x: compiled_regex.match(x.filename),
-                           wheel.filelist))
+                    filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+                )
 
                 for file in file_members:
                     print(f"[extract] {file.filename}")
                     target_path = os.path.join(".", file.filename)
                     os.makedirs(os.path.dirname(target_path), exist_ok=True)
-                    with wheel.open(file.filename) as src, open(
-                            target_path, "wb") as dst:
+                    with (
+                        wheel.open(file.filename) as src,
+                        open(target_path, "wb") as dst,
+                    ):
                         shutil.copyfileobj(src, dst)
 
                     pkg = os.path.dirname(file.filename).replace("/", ".")
                     package_data_patch.setdefault(pkg, []).append(
-                        os.path.basename(file.filename))
+                        os.path.basename(file.filename)
+                    )
 
             return package_data_patch
         finally:
@@ -364,10 +386,13 @@ class precompiled_wheel_utils:
 
         try:
             # Get the latest commit hash of the upstream main branch.
-            resp_json = subprocess.check_output([
-                "curl", "-s",
-                "https://api.github.com/repos/vllm-project/vllm/commits/main"
-            ]).decode("utf-8")
+            resp_json = subprocess.check_output(
+                [
+                    "curl",
+                    "-s",
+                    "https://api.github.com/repos/vllm-project/vllm/commits/main",
+                ]
+            ).decode("utf-8")
             upstream_main_commit = json.loads(resp_json)["sha"]
 
             # In Docker build context, .git may be immutable or missing.
@@ -377,25 +402,32 @@ class precompiled_wheel_utils:
             # Check if the upstream_main_commit exists in the local repo
             try:
                 subprocess.check_output(
-                    ["git", "cat-file", "-e", f"{upstream_main_commit}"])
+                    ["git", "cat-file", "-e", f"{upstream_main_commit}"]
+                )
             except subprocess.CalledProcessError:
                 # If not present, fetch it from the remote repository.
                 # Note that this does not update any local branches,
                 # but ensures that this commit ref and its history are
                 # available in our local repo.
-                subprocess.check_call([
-                    "git", "fetch", "https://github.com/vllm-project/vllm",
-                    "main"
-                ])
+                subprocess.check_call(
+                    ["git", "fetch", "https://github.com/vllm-project/vllm", "main"]
+                )
 
             # Then get the commit hash of the current branch that is the same as
             # the upstream main commit.
-            current_branch = subprocess.check_output(
-                ["git", "branch", "--show-current"]).decode("utf-8").strip()
+            current_branch = (
+                subprocess.check_output(["git", "branch", "--show-current"])
+                .decode("utf-8")
+                .strip()
+            )
 
-            base_commit = subprocess.check_output([
-                "git", "merge-base", f"{upstream_main_commit}", current_branch
-            ]).decode("utf-8").strip()
+            base_commit = (
+                subprocess.check_output(
+                    ["git", "merge-base", f"{upstream_main_commit}", current_branch]
+                )
+                .decode("utf-8")
+                .strip()
+            )
             return base_commit
         except ValueError as err:
             raise ValueError(err) from None
@@ -403,7 +435,9 @@ class precompiled_wheel_utils:
             logger.warning(
                 "Failed to get the base commit in the main branch. "
                 "Using the nightly wheel. The libraries in this "
-                "wheel may not be compatible with your dev branch: %s", err)
+                "wheel may not be compatible with your dev branch: %s",
+                err,
+            )
             return "nightly"
 
 
@@ -413,12 +447,13 @@ def _no_device() -> bool:
 
 def _is_cuda() -> bool:
     has_cuda = torch.version.cuda is not None
-    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())
+    return VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()
 
 
 def _is_hip() -> bool:
-    return (VLLM_TARGET_DEVICE == "cuda"
-            or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None
+    return (
+        VLLM_TARGET_DEVICE == "cuda" or VLLM_TARGET_DEVICE == "rocm"
+    ) and torch.version.hip is not None
 
 
 def _is_tpu() -> bool:
@@ -457,8 +492,12 @@ def get_rocm_version():
         minor = ctypes.c_uint32()
         patch = ctypes.c_uint32()
 
-        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
-                                  ctypes.byref(patch)) == 0):
+        if (
+            get_rocm_core_version(
+                ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)
+            )
+            == 0
+        ):
             return f"{major.value}.{minor.value}.{patch.value}"
         return None
     except Exception:
@@ -471,8 +510,9 @@ def get_nvcc_cuda_version() -> Version:
     Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
     """
     assert CUDA_HOME is not None, "CUDA_HOME is not set"
-    nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
+    nvcc_output = subprocess.check_output(
+        [CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True
+    )
     output = nvcc_output.split()
     release_idx = output.index("release") + 1
     nvcc_cuda_version = parse(output[release_idx].split(",")[0])
@@ -484,18 +524,31 @@ def get_gaudi_sw_version():
     Returns the driver version.
     """
     # Enable console printing for `hl-smi` check
-    output = subprocess.run("hl-smi",
-                            shell=True,
-                            text=True,
-                            capture_output=True,
-                            env={"ENABLE_CONSOLE": "true"})
+    output = subprocess.run(
+        "hl-smi",
+        shell=True,
+        text=True,
+        capture_output=True,
+        env={"ENABLE_CONSOLE": "true"},
+    )
     if output.returncode == 0 and output.stdout:
-        return output.stdout.split("\n")[2].replace(
-            " ", "").split(":")[1][:-1].split("-")[0]
+        return (
+            output.stdout.split("\n")[2]
+            .replace(" ", "")
+            .split(":")[1][:-1]
+            .split("-")[0]
+        )
     return "0.0.0"  # when hl-smi is not available
 
 
 def get_vllm_version() -> str:
+    # Allow overriding the version. This is useful to build platform-specific
+    # wheels (e.g. CPU, TPU) without modifying the source.
+    if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
+        print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE")
+        os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
+        return get_version(write_to="vllm/_version.py")
+
     version = get_version(write_to="vllm/_version.py")
     sep = "+" if "+" not in version else "."  # dev versions might contain +
 
@@ -541,8 +594,11 @@ def get_requirements() -> list[str]:
         for line in requirements:
             if line.startswith("-r "):
                 resolved_requirements += _read_requirements(line.split()[1])
-            elif not line.startswith("--") and not line.startswith(
-                    "#") and line.strip() != "":
+            elif (
+                not line.startswith("--")
+                and not line.startswith("#")
+                and line.strip() != ""
+            ):
                 resolved_requirements.append(line)
         return resolved_requirements
 
@@ -553,7 +609,7 @@ def get_requirements() -> list[str]:
         cuda_major, cuda_minor = torch.version.cuda.split(".")
         modified_requirements = []
         for req in requirements:
-            if ("vllm-flash-attn" in req and cuda_major != "12"):
+            if "vllm-flash-attn" in req and cuda_major != "12":
                 # vllm-flash-attn is built only for CUDA 12.x.
                 # Skip for other versions.
                 continue
@@ -568,8 +624,7 @@ def get_requirements() -> list[str]:
     elif _is_xpu():
         requirements = _read_requirements("xpu.txt")
     else:
-        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, or CPU.")
+        raise ValueError("Unsupported platform, please use CUDA, ROCm, or CPU.")
     return requirements
 
 
@@ -577,6 +632,7 @@ ext_modules = []
 
 if _is_cuda() or _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 
 # if _is_hip():
 #     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
@@ -585,15 +641,13 @@ if _is_cuda():
     ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
     if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
         # FA3 requires CUDA 12.3 or later
-        ext_modules.append(
-            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+        ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
         # Optional since this doesn't get built (produce an .so file) when
         # not targeting a hopper system
+        ext_modules.append(CMakeExtension(name="vllm._flashmla_C", optional=True))
         ext_modules.append(
-            CMakeExtension(name="vllm._flashmla_C", optional=True))
-        ext_modules.append(
-            CMakeExtension(name="vllm._flashmla_extension_C", optional=True))
-    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
+            CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
+        )
 
 if _build_custom_ops():
     ext_modules.append(CMakeExtension(name="vllm._C"))
@@ -614,6 +668,7 @@ if envs.VLLM_USE_PRECOMPILED:
         wheel_url = wheel_location
     else:
         import platform
+
         arch = platform.machine()
         if arch == "x86_64":
             wheel_tag = "manylinux1_x86_64"
@@ -623,8 +678,11 @@ if envs.VLLM_USE_PRECOMPILED:
             raise ValueError(f"Unsupported architecture: {arch}")
         base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
         wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
-        nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        nightly_wheel_url = (
+            f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        )
         from urllib.request import urlopen
+
         try:
             with urlopen(wheel_url) as resp:
                 if resp.status != 200:
@@ -633,8 +691,7 @@ if envs.VLLM_USE_PRECOMPILED:
             print(f"[warn] Falling back to nightly wheel: {e}")
             wheel_url = nightly_wheel_url
 
-    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url)
+    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
     for pkg, files in patch.items():
         package_data.setdefault(pkg, []).extend(files)
 
@@ -645,8 +702,9 @@ if not ext_modules:
     cmdclass = {}
 else:
     cmdclass = {
-        "build_ext":
-        precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+        "build_ext": precompiled_build_ext
+        if envs.VLLM_USE_PRECOMPILED
+        else cmake_build_ext
     }
 
 setup(
@@ -655,18 +713,17 @@ setup(
     ext_modules=ext_modules,
     install_requires=get_requirements(),
     extras_require={
-        "bench": ["pandas", "datasets"],
+        "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
-        "runai": [
-            "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
-            "google-cloud-storage", "runai-model-streamer-s3", "boto3"
-        ],
-        "audio": ["librosa", "soundfile",
-                  "mistral_common[audio]"],  # Required for audio processing
+        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
+        "audio": [
+            "librosa",
+            "soundfile",
+            "mistral_common[audio]",
+        ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
-        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.3.1"],
+        "flashinfer": [],  # Kept for backwards compatibility
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 411f3e01bc2cd42feb913317d3ea4b0b43785ec2..0cf1e85d4e8eede25fce0a34a9acfaabc25f3f6e 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -4,6 +4,7 @@
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
+
 import os
 import weakref
 from unittest.mock import Mock
@@ -12,14 +13,14 @@ import pytest
 import torch
 
 from vllm import LLM
-from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
+from vllm.v1.engine.llm_engine import LLMEngine
 
 from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 
 MODELS = [
-    "google/gemma-2-2b-it",
+    "hmellor/tiny-random-Gemma2ForCausalLM",
     "meta-llama/Llama-3.2-1B-Instruct",
 ]
 
@@ -28,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2")
+    llm = LLM("hmellor/tiny-random-LlamaForCausalLM")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -37,16 +38,21 @@ def test_vllm_gc_ed():
 
 
 def _fix_prompt_embed_outputs(
-        vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
-        example_prompts: list[str]) -> list[tuple[list[int], str]]:
+    vllm_outputs: list[tuple[list[int], str]],
+    hf_model: HfRunner,
+    example_prompts: list[str],
+) -> list[tuple[list[int], str]]:
     fixed_vllm_outputs = []
     for vllm_output, hf_input, prompt in zip(
-            vllm_outputs, hf_model.get_inputs(example_prompts),
-            example_prompts):
+        vllm_outputs, hf_model.get_inputs(example_prompts), example_prompts
+    ):
         hf_input_ids = hf_input["input_ids"].tolist()[0]
         fixed_vllm_outputs.append(
-            (hf_input_ids + vllm_output[0][len(hf_input_ids):],
-             prompt + vllm_output[1]))
+            (
+                hf_input_ids + vllm_output[0][len(hf_input_ids) :],
+                prompt + vllm_output[1],
+            )
+        )
     return fixed_vllm_outputs
 
 
@@ -69,8 +75,7 @@ def test_models(
     enable_prompt_embeds: bool,
 ) -> None:
     if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
-        pytest.skip(
-            f"{backend} does not support gemma2 with full context length.")
+        pytest.skip(f"{backend} does not support gemma2 with full context length.")
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_ATTENTION_BACKEND", backend)
@@ -78,34 +83,35 @@ def test_models(
         # 5042 tokens for gemma2
         # gemma2 has alternating sliding window size of 4096
         # we need a prompt with more than 4096 tokens to test the sliding window
-        prompt = "The following numbers of the sequence " + ", ".join(
-            str(i) for i in range(1024)) + " are:"
+        prompt = (
+            "The following numbers of the sequence "
+            + ", ".join(str(i) for i in range(1024))
+            + " are:"
+        )
         example_prompts = [prompt]
 
         with hf_runner(model) as hf_model:
             hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
             if enable_prompt_embeds:
                 with torch.no_grad():
-                    prompt_embeds = hf_model.get_prompt_embeddings(
-                        example_prompts)
+                    prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
 
         with VllmRunner(
-                model,
-                max_model_len=8192,
-                enforce_eager=enforce_eager,
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
-                async_scheduling=async_scheduling,
-                distributed_executor_backend=model_executor,
+            model,
+            max_model_len=8192,
+            enforce_eager=enforce_eager,
+            enable_prompt_embeds=enable_prompt_embeds,
+            gpu_memory_utilization=0.7,
+            async_scheduling=async_scheduling,
+            distributed_executor_backend=model_executor,
         ) as vllm_model:
             if enable_prompt_embeds:
-                vllm_outputs = vllm_model.generate_greedy(
-                    prompt_embeds, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
                 vllm_outputs = _fix_prompt_embed_outputs(
-                    vllm_outputs, hf_model, example_prompts)
+                    vllm_outputs, hf_model, example_prompts
+                )
             else:
-                vllm_outputs = vllm_model.generate_greedy(
-                    example_prompts, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
         check_outputs_equal(
             outputs_0_lst=hf_outputs,
@@ -117,21 +123,18 @@ def test_models(
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, "
-    "test_suite, extra_env", [
-        ("distilbert/distilgpt2", "ray", "", "L4", {}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "L4", {
-            "VLLM_SLEEP_WHEN_IDLE": "1"
-        }),
-        ("distilbert/distilgpt2", "mp", "", "L4", {
-            "VLLM_SLEEP_WHEN_IDLE": "1"
-        }),
+    "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
+    [
+        ("facebook/opt-125m", "ray", "", "L4", {}),
+        ("facebook/opt-125m", "mp", "", "L4", {}),
+        ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "A100", {}),
-        ("distilbert/distilgpt2", "mp", "", "A100", {}),
-    ])
+        ("facebook/opt-125m", "ray", "", "A100", {}),
+        ("facebook/opt-125m", "mp", "", "A100", {}),
+    ],
+)
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
     monkeypatch: pytest.MonkeyPatch,
@@ -149,13 +152,14 @@ def test_models_distributed(
         pytest.skip(f"Skip test for {test_suite}")
 
     with monkeypatch.context() as monkeypatch_context:
-        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-            if enable_prompt_embeds:
-                pytest.skip(
-                    "enable_prompt_embeds does not work with ray compiled dag."
-                )
-            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
-            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+        if (
+            model == "meta-llama/Llama-3.2-1B-Instruct"
+            and distributed_executor_backend == "ray"
+            and attention_backend == ""
+            and test_suite == "L4"
+            and enable_prompt_embeds
+        ):  # noqa
+            pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
 
         if attention_backend:
             monkeypatch_context.setenv(
@@ -175,30 +179,26 @@ def test_models_distributed(
         # will hurt multiprocessing backend with fork method
         # (the default method).
         with vllm_runner(
-                model,
-                dtype=dtype,
-                tensor_parallel_size=2,
-                distributed_executor_backend=distributed_executor_backend,
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+            enable_prompt_embeds=enable_prompt_embeds,
+            gpu_memory_utilization=0.7,
         ) as vllm_model:
             if enable_prompt_embeds:
                 with hf_runner(model, dtype=dtype) as hf_model:
                     with torch.no_grad():
-                        prompt_embeds = hf_model.get_prompt_embeddings(
-                            example_prompts)
-                    vllm_outputs = vllm_model.generate_greedy(
-                        prompt_embeds, max_tokens)
+                        prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
+                    vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
                     vllm_outputs = _fix_prompt_embed_outputs(
-                        vllm_outputs, hf_model, example_prompts)
-                    hf_outputs = hf_model.generate_greedy(
-                        example_prompts, max_tokens)
+                        vllm_outputs, hf_model, example_prompts
+                    )
+                    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
             else:
-                vllm_outputs = vllm_model.generate_greedy(
-                    example_prompts, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
                 with hf_runner(model, dtype=dtype) as hf_model:
-                    hf_outputs = hf_model.generate_greedy(
-                        example_prompts, max_tokens)
+                    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -209,27 +209,18 @@ def test_models_distributed(
 
 
 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
-
-    from vllm.envs import VLLM_USE_V1
-
-    if not VLLM_USE_V1:
-        pytest.skip("Skipping V0 test, dump input not supported")
-
     # Needed to mock an error in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
-    with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
+    with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
+        if isinstance(vllm_model.llm.llm_engine, LLMEngine):
             v1_test_failed_model_execution(vllm_model)
 
 
 def v1_test_failed_model_execution(vllm_model):
-
     engine = vllm_model.llm.llm_engine
-    mocked_execute_model = Mock(
-        side_effect=RuntimeError("Mocked Critical Error"))
-    engine.engine_core.engine_core.model_executor.execute_model =\
-                mocked_execute_model
+    mocked_execute_model = Mock(side_effect=RuntimeError("Mocked Critical Error"))
+    engine.engine_core.engine_core.model_executor.execute_model = mocked_execute_model
 
     with pytest.raises(RuntimeError) as exc_info:
         prompts = [
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index 28bfe9e7c8020de5c3fd8a90747cd59880067a75..89839372c309a53f656eb6f3e43e346bb42a9225 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -5,5 +5,6 @@ from ..utils import compare_two_settings
 
 
 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
-                         ["--cpu-offload-gb", "1"])
+    compare_two_settings(
+        "hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
+    )
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 508740ab293898b1f9d1ae1bc6c089b22860b18e..754ef20dbeb2b5f1583310cf674d4c5770d83573 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -1,17 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import asyncio
+
 import pytest
 import torch
 
-from vllm import LLM, SamplingParams
+from vllm import LLM, AsyncEngineArgs, AsyncLLMEngine, SamplingParams
 from vllm.device_allocator.cumem import CuMemAllocator
-from vllm.utils import GiB_bytes
+from vllm.platforms import current_platform
+from vllm.utils.mem_constants import GiB_bytes
 
 from ..utils import create_new_process_for_each_test
 
 
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("fork" if not current_platform.is_rocm() else "spawn")
 def test_python_error():
     """
     Test if Python error occurs when there's low-level
@@ -23,13 +26,13 @@ def test_python_error():
     tensors = []
     with allocator.use_memory_pool():
         # allocate 70% of the total memory
-        x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+        x = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
         tensors.append(x)
     # release the memory
     allocator.sleep()
 
     # allocate more memory than the total memory
-    y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+    y = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
     tensors.append(y)
     with pytest.raises(RuntimeError):
         # when the allocator is woken up, it should raise an error
@@ -37,21 +40,21 @@ def test_python_error():
         allocator.wake_up()
 
 
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("fork" if not current_platform.is_rocm() else "spawn")
 def test_basic_cumem():
     # some tensors from default memory pool
     shape = (1024, 1024)
-    x = torch.empty(shape, device='cuda')
+    x = torch.empty(shape, device="cuda")
     x.zero_()
 
     # some tensors from custom memory pool
     allocator = CuMemAllocator.get_instance()
     with allocator.use_memory_pool():
         # custom memory pool
-        y = torch.empty(shape, device='cuda')
+        y = torch.empty(shape, device="cuda")
         y.zero_()
         y += 1
-        z = torch.empty(shape, device='cuda')
+        z = torch.empty(shape, device="cuda")
         z.zero_()
         z += 2
 
@@ -70,20 +73,20 @@ def test_basic_cumem():
     assert torch.allclose(output, torch.ones_like(output) * 3)
 
 
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("fork" if not current_platform.is_rocm() else "spawn")
 def test_cumem_with_cudagraph():
     allocator = CuMemAllocator.get_instance()
     with allocator.use_memory_pool():
-        weight = torch.eye(1024, device='cuda')
+        weight = torch.eye(1024, device="cuda")
     with allocator.use_memory_pool(tag="discard"):
-        cache = torch.empty(1024, 1024, device='cuda')
+        cache = torch.empty(1024, 1024, device="cuda")
 
     def model(x):
         out = x @ weight
-        cache[:out.size(0)].copy_(out)
+        cache[: out.size(0)].copy_(out)
         return out + 1
 
-    x = torch.empty(128, 1024, device='cuda')
+    x = torch.empty(128, 1024, device="cuda")
 
     # warmup
     model(x)
@@ -109,80 +112,72 @@ def test_cumem_with_cudagraph():
     model_graph.replay()
 
     # cache content is as expected
-    assert torch.allclose(x, cache[:x.size(0)])
+    assert torch.allclose(x, cache[: x.size(0)])
 
     # output content is as expected
     assert torch.allclose(y, x + 1)
 
 
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("fork" if not current_platform.is_rocm() else "spawn")
 @pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
     [
         # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "hmellor/tiny-random-LlamaForCausalLM",
         # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
-    ])
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
-        free, total = torch.cuda.mem_get_info()
-        used_bytes_baseline = total - free  # in case other process is running
-        llm = LLM(model, enable_sleep_mode=True)
-        prompt = "How are you?"
-        sampling_params = SamplingParams(temperature=0, max_tokens=10)
-        output = llm.generate(prompt, sampling_params)
-
-        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-        # which is difficult to measure in the test. therefore, we only
-        # test sleep level 1 here.
-        llm.sleep(level=1)
-
-        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-        # now the memory usage is mostly cudagraph memory pool,
-        # and it should be less than the model weights (1B model, 2GiB weights)
-
-        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-        # is captured but cannot be releasesd from PyTorch due to a known bug,
-        # therefore high memory usage after `llm.sleep` is called is expected.
-        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-        # in V1.
-        if use_v1:
-            assert used_bytes < 7 * GiB_bytes
-        else:
-            assert used_bytes < 2 * GiB_bytes
-
-        llm.wake_up()
-        output2 = llm.generate(prompt, sampling_params)
-        # cmp output
-        assert output[0].outputs[0].text == output2[0].outputs[0].text
+        "facebook/opt-125m",
+    ],
+)
+def test_end_to_end(model: str):
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
 
-        llm.sleep(level=1)
-        llm.wake_up(tags=["weights"])
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
 
-        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
 
-        # should just reallocate memory for weights (1B model, ~2GiB weights)
-        if use_v1:
-            assert used_bytes < 10 * GiB_bytes
-        else:
-            assert used_bytes < 6 * GiB_bytes
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be releasesd from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    assert used_bytes < 7 * GiB_bytes
 
-        # now allocate kv cache memory
-        llm.wake_up(tags=["kv_cache"])
-        output3 = llm.generate(prompt, sampling_params)
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
 
-        # cmp output
-        assert output[0].outputs[0].text == output3[0].outputs[0].text
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])
+
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    assert used_bytes < 10 * GiB_bytes
+
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text
 
 
 @create_new_process_for_each_test()
 def test_deep_sleep():
-    model = "Qwen/Qwen3-0.6B"
+    model = "hmellor/tiny-random-LlamaForCausalLM"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM(model, enable_sleep_mode=True)
@@ -209,3 +204,42 @@ def test_deep_sleep():
 
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+
+@create_new_process_for_each_test()
+def test_deep_sleep_async():
+    async def test():
+        model = "hmellor/tiny-random-LlamaForCausalLM"
+        free, total = torch.cuda.mem_get_info()
+        used_bytes_baseline = total - free  # in case other process is running
+        engine_args = AsyncEngineArgs(
+            model=model,
+            enable_sleep_mode=True,
+        )
+
+        llm = AsyncLLMEngine.from_engine_args(engine_args)
+        prompt = "How are you?"
+        sampling_params = SamplingParams(temperature=0, max_tokens=10)
+        outputs = llm.generate(prompt, sampling_params, request_id="test_request_id1")
+        async for output in outputs:
+            pass
+
+        # Put the engine to deep sleep
+        await llm.sleep(level=2)
+
+        await llm.wake_up(tags=["weights"])
+        await llm.collective_rpc("reload_weights")
+        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+        assert used_bytes < 4 * GiB_bytes
+
+        # now allocate kv cache and cuda graph memory
+        await llm.wake_up(tags=["kv_cache"])
+        outputs2 = llm.generate(prompt, sampling_params, request_id="test_request_id2")
+        async for output2 in outputs2:
+            pass
+
+        # cmp output
+        assert output.outputs[0].text == output2.outputs[0].text
+
+    asyncio.run(test())
diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py
index 2279c846e01cd5a206166b7a21df0745bd481f0a..54075a3a15e638c4177a16282835c04da9e80b5e 100644
--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
@@ -10,8 +10,18 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 @pytest.mark.benchmark
 def test_bench_latency():
     command = [
-        "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32",
-        "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
+        "vllm",
+        "bench",
+        "latency",
+        "--model",
+        MODEL_NAME,
+        "--input-len",
+        "32",
+        "--output-len",
+        "1",
+        "--enforce-eager",
+        "--load-format",
+        "dummy",
     ]
     result = subprocess.run(command, capture_output=True, text=True)
     print(result.stdout)
diff --git a/tests/benchmarks/test_random_dataset.py b/tests/benchmarks/test_random_dataset.py
index 26cae369cdd5dd0a9d6faa18cd750b37385ac104..57f6893061825160b0d5688cc063a31598ab3d97 100644
--- a/tests/benchmarks/test_random_dataset.py
+++ b/tests/benchmarks/test_random_dataset.py
@@ -1,14 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Any, NamedTuple, Optional, cast
+from typing import Any, NamedTuple, cast
 
 import numpy as np
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
-from vllm.benchmarks.datasets import (RandomDataset, RandomMultiModalDataset,
-                                      SampleRequest)
+from vllm.benchmarks.datasets import (
+    RandomDataset,
+    RandomMultiModalDataset,
+    SampleRequest,
+)
 
 
 @pytest.fixture(scope="session")
@@ -27,11 +30,9 @@ class Params(NamedTuple):
 
 @pytest.fixture(scope="session")
 def random_dataset_params() -> Params:
-    return Params(num_requests=16,
-                  prefix_len=7,
-                  range_ratio=0.3,
-                  input_len=50,
-                  output_len=20)
+    return Params(
+        num_requests=16, prefix_len=7, range_ratio=0.3, input_len=50, output_len=20
+    )
 
 
 def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
@@ -39,13 +40,15 @@ def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
     return (req.prompt, req.prompt_len, req.expected_output_len)
 
 
-def _collect_samples(dataset: RandomDataset,
-                    tokenizer: PreTrainedTokenizerBase,
-                     num_requests: int = 16,
-                     prefix_len: int = 7,
-                     range_ratio: float = 0.3,
-                     input_len: int = 50,
-                     output_len: int = 20) -> list[tuple[str, int, int]]:
+def _collect_samples(
+    dataset: RandomDataset,
+    tokenizer: PreTrainedTokenizerBase,
+    num_requests: int = 16,
+    prefix_len: int = 7,
+    range_ratio: float = 0.3,
+    input_len: int = 50,
+    output_len: int = 20,
+) -> list[tuple[str, int, int]]:
     samples = dataset.sample(
         tokenizer=tokenizer,
         num_requests=num_requests,
@@ -59,8 +62,8 @@ def _collect_samples(dataset: RandomDataset,
 
 @pytest.mark.benchmark
 def test_random_dataset_same_seed(
-        hf_tokenizer: PreTrainedTokenizerBase,
-        random_dataset_params: Params) -> None:
+    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
+) -> None:
     """Same seed should yield identical outputs, even if global RNGs change.
 
     This guards against accidental reliance on Python's random or np.random
@@ -70,13 +73,15 @@ def test_random_dataset_same_seed(
     common_seed = 123
     dataset_a = RandomDataset(random_seed=common_seed)
     dataset_b = RandomDataset(random_seed=common_seed)
-    a = _collect_samples(dataset_a,
-                         hf_tokenizer,
-                         num_requests=p.num_requests,
-                         prefix_len=p.prefix_len,
-                         range_ratio=p.range_ratio,
-                         input_len=p.input_len,
-                         output_len=p.output_len)
+    a = _collect_samples(
+        dataset_a,
+        hf_tokenizer,
+        num_requests=p.num_requests,
+        prefix_len=p.prefix_len,
+        range_ratio=p.range_ratio,
+        input_len=p.input_len,
+        output_len=p.output_len,
+    )
 
     # Perturb global RNG state to ensure isolation
     random.seed(999)
@@ -84,43 +89,50 @@ def test_random_dataset_same_seed(
     np.random.seed(888)
     _ = [np.random.random() for _ in range(100)]
 
-    b = _collect_samples(dataset_b,
-                         hf_tokenizer,
-                         num_requests=p.num_requests,
-                         prefix_len=p.prefix_len,
-                         range_ratio=p.range_ratio,
-                         input_len=p.input_len,
-                         output_len=p.output_len)
+    b = _collect_samples(
+        dataset_b,
+        hf_tokenizer,
+        num_requests=p.num_requests,
+        prefix_len=p.prefix_len,
+        range_ratio=p.range_ratio,
+        input_len=p.input_len,
+        output_len=p.output_len,
+    )
     assert a == b
 
+
 @pytest.mark.benchmark
 def test_random_dataset_different_seeds(
-        hf_tokenizer: PreTrainedTokenizerBase,
-        random_dataset_params: Params) -> None:
+    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
+) -> None:
     """Different seeds should change outputs with overwhelming likelihood."""
     p = random_dataset_params
     seed_a = 0
     dataset_a = RandomDataset(random_seed=seed_a)
-    a = _collect_samples(dataset_a,
-                         hf_tokenizer,
-                         num_requests=p.num_requests,
-                         prefix_len=p.prefix_len,
-                         range_ratio=p.range_ratio,
-                         input_len=p.input_len,
-                         output_len=p.output_len)
+    a = _collect_samples(
+        dataset_a,
+        hf_tokenizer,
+        num_requests=p.num_requests,
+        prefix_len=p.prefix_len,
+        range_ratio=p.range_ratio,
+        input_len=p.input_len,
+        output_len=p.output_len,
+    )
 
     seed_b = 999
     dataset_b = RandomDataset(random_seed=seed_b)
     # Perturb global RNG with same seed as dataset_a to ensure isolation
     random.seed(seed_a)
     np.random.seed(seed_a)
-    b = _collect_samples(dataset_b,
-                         hf_tokenizer,
-                         num_requests=p.num_requests,
-                         prefix_len=p.prefix_len,
-                         range_ratio=p.range_ratio,
-                         input_len=p.input_len,
-                         output_len=p.output_len)
+    b = _collect_samples(
+        dataset_b,
+        hf_tokenizer,
+        num_requests=p.num_requests,
+        prefix_len=p.prefix_len,
+        range_ratio=p.range_ratio,
+        input_len=p.input_len,
+        output_len=p.output_len,
+    )
     assert a != b
 
 
@@ -128,6 +140,7 @@ def test_random_dataset_different_seeds(
 # RandomMultiModalDataset tests
 # -----------------------------
 
+
 def _mm_fingerprint_sample(
     req: SampleRequest,
 ) -> tuple[str, int, int, int, list[str]]:
@@ -152,8 +165,13 @@ def _mm_fingerprint_sample(
             item_prefixes.append(f"video:{url[:22]}")
         else:
             item_prefixes.append("unknown:")
-    return (req.prompt, req.prompt_len, req.expected_output_len, len(items),
-            item_prefixes)
+    return (
+        req.prompt,
+        req.prompt_len,
+        req.expected_output_len,
+        len(items),
+        item_prefixes,
+    )
 
 
 def _collect_mm_samples(
@@ -167,8 +185,8 @@ def _collect_mm_samples(
     output_len: int = 5,
     base_items_per_request: int = 2,
     num_mm_items_range_ratio: float = 0.0,
-    limit_mm_per_prompt: Optional[dict[str, int]] = None,
-    bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+    limit_mm_per_prompt: dict[str, int] | None = None,
+    bucket_config: dict[tuple[int, int, int], float] | None = None,
     enable_multimodal_chat: bool = False,
 ) -> list[SampleRequest]:
     if limit_mm_per_prompt is None:
@@ -214,6 +232,7 @@ def test_random_mm_different_seeds(
     fb = [_mm_fingerprint_sample(s) for s in b]
     assert fa != fb
 
+
 @pytest.mark.benchmark
 def test_random_mm_respects_limits(
     hf_tokenizer: PreTrainedTokenizerBase,
@@ -271,9 +290,9 @@ def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
     for s in samples:
         assert s.multi_modal_data == []
 
+
 @pytest.mark.benchmark
-def test_random_mm_num_items_per_prompt(
-    hf_tokenizer: PreTrainedTokenizerBase) -> None:
+def test_random_mm_num_items_per_prompt(hf_tokenizer: PreTrainedTokenizerBase) -> None:
     ds = RandomMultiModalDataset(random_seed=0)
     # Fixed number of images per prompt
     # set num_mm_items_range_ratio to 0.0
@@ -300,7 +319,6 @@ def test_random_mm_num_items_per_prompt(
 def test_random_mm_bucket_config_not_mutated(
     hf_tokenizer: PreTrainedTokenizerBase,
 ) -> None:
-
     ds = RandomMultiModalDataset(random_seed=0)
     # This bucket config is not normalized to sum to 1
     # and has more buckets than requested images
@@ -321,7 +339,6 @@ def test_random_mm_bucket_config_not_mutated(
     # Ensure the original dict content is unchanged
     assert original == snapshot
 
-
     # Vary number of mm items per prompt
     # set num_mm_items_range_ratio to 0.5
     samples_varying_items = _collect_mm_samples(
@@ -342,3 +359,126 @@ def test_random_mm_bucket_config_not_mutated(
         assert len(mm_data) >= 1
         for it in mm_data:
             assert it.get("type") == "image_url"
+
+
+@pytest.mark.benchmark
+def test_random_mm_video_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
+    """Test video sampling functionality in RandomMultiModalDataset."""
+    ds = RandomMultiModalDataset(random_seed=42)
+
+    # Test with video bucket configuration
+    bucket_config = {
+        (64, 64, 1): 0.3,  # Images
+        (64, 64, 8): 0.7,  # Videos
+    }
+
+    limit_mm_per_prompt = {"image": 2, "video": 2}
+
+    samples = _collect_mm_samples(
+        ds,
+        hf_tokenizer,
+        num_requests=5,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+    )
+
+    assert len(samples) == 5
+
+    # Check that we have both images and videos
+    video_count = 0
+    image_count = 0
+
+    for s in samples:
+        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
+        assert len(mm_data) == 1
+
+        item = mm_data[0]
+        if item.get("type") == "video_url":
+            video_count += 1
+            # Verify video URL format
+            url = item.get("video_url", {}).get("url", "")
+            assert url.startswith("data:video/mp4;base64,")
+        elif item.get("type") == "image_url":
+            image_count += 1
+            # Verify image URL format
+            url = item.get("image_url", {}).get("url", "")
+            assert url.startswith("data:image/jpeg;base64,")
+
+    # Should have some videos due to 0.7 probability
+    assert video_count > 0
+    assert image_count > 0
+
+
+@pytest.mark.benchmark
+def test_random_mm_video_only_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
+    """Test sampling with only video buckets."""
+    ds = RandomMultiModalDataset(random_seed=42)
+
+    bucket_config = {
+        (64, 64, 8): 1.0,  # Only videos
+    }
+
+    limit_mm_per_prompt = {"image": 0, "video": 1}
+
+    samples = _collect_mm_samples(
+        ds,
+        hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+    )
+
+    assert len(samples) == 3
+
+    for s in samples:
+        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
+        assert len(mm_data) == 1
+
+        item = mm_data[0]
+        assert item.get("type") == "video_url"
+        url = item.get("video_url", {}).get("url", "")
+        assert url.startswith("data:video/mp4;base64,")
+
+
+@pytest.mark.benchmark
+def test_random_mm_video_deterministic_sampling(
+    hf_tokenizer: PreTrainedTokenizerBase,
+) -> None:
+    """Test that video sampling is deterministic with same seed."""
+    seed = 123
+    ds_a = RandomMultiModalDataset(random_seed=seed)
+    ds_b = RandomMultiModalDataset(random_seed=seed)
+
+    bucket_config = {
+        (64, 64, 8): 1.0,  # Only videos
+    }
+
+    limit_mm_per_prompt = {"image": 0, "video": 1}
+
+    a = _collect_mm_samples(
+        ds_a,
+        hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+    )
+
+    b = _collect_mm_samples(
+        ds_b,
+        hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+    )
+
+    fa = [_mm_fingerprint_sample(s) for s in a]
+    fb = [_mm_fingerprint_sample(s) for s in b]
+    assert fa == fb
diff --git a/tests/benchmarks/test_random_multimodal_dataset_video.py b/tests/benchmarks/test_random_multimodal_dataset_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..db19a169e359c42254bbf0b7161a33a57945963a
--- /dev/null
+++ b/tests/benchmarks/test_random_multimodal_dataset_video.py
@@ -0,0 +1,398 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import os
+from tempfile import NamedTemporaryFile
+from typing import Any, cast
+
+import cv2
+import pytest
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.benchmarks.datasets import RandomMultiModalDataset, SampleRequest
+
+
+@pytest.fixture(scope="session")
+def hf_tokenizer() -> PreTrainedTokenizerBase:
+    """Use a small, commonly available tokenizer."""
+    return AutoTokenizer.from_pretrained("gpt2")
+
+
+@pytest.fixture
+def video_dataset() -> RandomMultiModalDataset:
+    """Create a RandomMultiModalDataset instance for testing."""
+    return RandomMultiModalDataset(random_seed=42)
+
+
+@pytest.mark.benchmark
+def test_generate_synthetic_video_different_seeds():
+    """Test that different seeds produce different videos."""
+    dataset1 = RandomMultiModalDataset(random_seed=123)
+    dataset2 = RandomMultiModalDataset(random_seed=456)
+
+    width, height, num_frames = 64, 48, 8
+
+    video1 = dataset1.generate_synthetic_video(width, height, num_frames)
+    video2 = dataset2.generate_synthetic_video(width, height, num_frames)
+
+    # Videos should be different due to different seeds
+    assert video1["bytes"] != video2["bytes"]
+
+
+@pytest.mark.benchmark
+def test_map_config_to_modality(video_dataset: RandomMultiModalDataset):
+    """Test modality mapping for different configurations."""
+    # Test image configuration (num_frames = 1)
+    assert video_dataset.map_config_to_modality((256, 256, 1)) == "image"
+    assert video_dataset.map_config_to_modality((720, 1280, 1)) == "image"
+
+    # Test video configurations (num_frames > 1)
+    assert video_dataset.map_config_to_modality((256, 256, 8)) == "video"
+    assert video_dataset.map_config_to_modality((720, 1280, 16)) == "video"
+    assert video_dataset.map_config_to_modality((64, 64, 32)) == "video"
+
+    # Test invalid configurations
+    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
+        video_dataset.map_config_to_modality((256, 256, 0))
+
+    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
+        video_dataset.map_config_to_modality((256, 256, -1))
+
+
+@pytest.mark.benchmark
+def test_generate_mm_item_video(video_dataset: RandomMultiModalDataset):
+    """Test generating multimodal items for video configurations."""
+    # Test video item generation
+    video_config = (64, 48, 8)  # height, width, num_frames
+    result = video_dataset.generate_mm_item(video_config)
+
+    # Check the result structure matches OpenAI API format
+    assert isinstance(result, dict)
+    assert result["type"] == "video_url"
+    assert "video_url" in result
+    assert "url" in result["video_url"]
+
+    # Check that the URL is a data URL with base64 encoded video
+    url = result["video_url"]["url"]
+    assert url.startswith("data:video/mp4;base64,")
+
+    # Decode and verify the video content
+    base64_data = url.split(",")[1]
+    video_bytes = base64.b64decode(base64_data)
+    assert len(video_bytes) > 0
+
+    # Verify the video can be decoded
+    with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+        temp_path = temp_file.name
+        temp_file.write(video_bytes)
+
+    try:
+        cap = cv2.VideoCapture(temp_path)
+        assert cap.isOpened()
+
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        assert frame_count == 8
+        assert frame_width == 48
+        assert frame_height == 64
+
+        cap.release()
+    finally:
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)
+
+
+@pytest.mark.benchmark
+def test_generate_mm_item_image(video_dataset: RandomMultiModalDataset):
+    """Test generating multimodal items for image configurations."""
+    # Test image item generation
+    image_config = (64, 48, 1)  # height, width, num_frames=1
+    result = video_dataset.generate_mm_item(image_config)
+
+    # Check the result structure matches OpenAI API format
+    assert isinstance(result, dict)
+    assert result["type"] == "image_url"
+    assert "image_url" in result
+    assert "url" in result["image_url"]
+
+    # Check that the URL is a data URL with base64 encoded image
+    url = result["image_url"]["url"]
+    assert url.startswith("data:image/jpeg;base64,")
+
+
+@pytest.mark.benchmark
+def test_generate_mm_item_invalid_config(video_dataset: RandomMultiModalDataset):
+    """Test error handling for invalid configurations."""
+    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
+        video_dataset.generate_mm_item((256, 256, 0))
+
+
+@pytest.mark.benchmark
+def test_sample_with_video_buckets(
+    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
+):
+    """Test sampling with video bucket configurations."""
+    # Configure bucket with video probability > 0
+    bucket_config = {
+        (64, 64, 1): 0.3,  # Images
+        (64, 64, 8): 0.7,  # Videos
+    }
+
+    limit_mm_per_prompt = {"image": 5, "video": 3}
+
+    samples = video_dataset.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=5,
+        base_items_per_request=2,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    assert len(samples) == 5
+
+    # Check that samples contain both images and videos
+    video_count = 0
+    image_count = 0
+
+    for sample in samples:
+        assert isinstance(sample, SampleRequest)
+        assert sample.multi_modal_data is not None
+        assert isinstance(sample.multi_modal_data, list)
+
+        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
+        assert len(mm_data) == 2  # base_items_per_request
+
+        for item in mm_data:
+            if item["type"] == "video_url":
+                video_count += 1
+                # Verify video URL format
+                url = item["video_url"]["url"]
+                assert url.startswith("data:video/mp4;base64,")
+            elif item["type"] == "image_url":
+                image_count += 1
+                # Verify image URL format
+                url = item["image_url"]["url"]
+                assert url.startswith("data:image/jpeg;base64,")
+
+    # Should have some videos due to 0.7 probability
+    assert video_count > 0
+    assert image_count > 0
+
+
+@pytest.mark.benchmark
+def test_sample_video_only_buckets(
+    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
+):
+    """Test sampling with only video buckets."""
+    bucket_config = {
+        (64, 64, 8): 1.0,  # Only videos
+    }
+
+    limit_mm_per_prompt = {"image": 0, "video": 2}
+
+    samples = video_dataset.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    assert len(samples) == 3
+
+    for sample in samples:
+        assert isinstance(sample, SampleRequest)
+        assert sample.multi_modal_data is not None
+        assert isinstance(sample.multi_modal_data, list)
+
+        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
+        assert len(mm_data) == 1
+
+        item = mm_data[0]
+        assert item["type"] == "video_url"
+        url = item["video_url"]["url"]
+        assert url.startswith("data:video/mp4;base64,")
+
+
+@pytest.mark.benchmark
+def test_sample_respects_video_limits(
+    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
+):
+    """Test that sampling respects video limits per prompt."""
+    bucket_config = {
+        (64, 64, 8): 1.0,  # Only videos
+    }
+
+    # Set very low video limit
+    limit_mm_per_prompt = {"image": 0, "video": 1}
+
+    samples = video_dataset.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    assert len(samples) == 3
+
+    for sample in samples:
+        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
+        assert len(mm_data) <= 1  # Should respect video limit
+
+
+@pytest.mark.benchmark
+def test_sample_mixed_buckets_with_zero_probability(
+    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
+):
+    """Test sampling with mixed buckets including zero probability entries."""
+    bucket_config = {
+        (64, 64, 1): 0.5,  # Images
+        (64, 64, 8): 0.5,  # Videos
+        (128, 128, 16): 0.0,  # Zero probability videos (should be ignored)
+    }
+
+    limit_mm_per_prompt = {"image": 2, "video": 2}
+
+    samples = video_dataset.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=4,
+        base_items_per_request=2,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    assert len(samples) == 4
+
+    # Should only see 64x64 videos, not 128x128 videos
+    for sample in samples:
+        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
+        for item in mm_data:
+            if item["type"] == "video_url":
+                # Decode video to verify dimensions
+                url = item["video_url"]["url"]
+                base64_data = url.split(",")[1]
+                video_bytes = base64.b64decode(base64_data)
+
+                with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:  # noqa
+                    temp_path = temp_file.name
+                    temp_file.write(video_bytes)
+
+                try:
+                    cap = cv2.VideoCapture(temp_path)
+                    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+                    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+                    cap.release()
+
+                    # Should be 64x64, not 128x128
+                    assert frame_width == 64
+                    assert frame_height == 64
+                finally:
+                    if os.path.exists(temp_path):
+                        os.unlink(temp_path)
+
+
+@pytest.mark.benchmark
+def test_sample_deterministic_with_videos(hf_tokenizer: PreTrainedTokenizerBase):
+    """Test that sampling with videos is deterministic with same seed."""
+    dataset1 = RandomMultiModalDataset(random_seed=123)
+    dataset2 = RandomMultiModalDataset(random_seed=123)
+
+    bucket_config = {
+        (64, 64, 1): 0.3,  # Images
+        (64, 64, 8): 0.7,  # Videos
+    }
+
+    limit_mm_per_prompt = {"image": 2, "video": 2}
+
+    samples1 = dataset1.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    samples2 = dataset2.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=3,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    assert len(samples1) == len(samples2)
+
+    # Compare multimodal data
+    for s1, s2 in zip(samples1, samples2):
+        assert s1.multi_modal_data == s2.multi_modal_data
+
+
+@pytest.mark.benchmark
+def test_sample_different_seeds_produce_different_videos(
+    hf_tokenizer: PreTrainedTokenizerBase,
+):
+    """Test that different seeds produce different video content."""
+    dataset1 = RandomMultiModalDataset(random_seed=123)
+    dataset2 = RandomMultiModalDataset(random_seed=456)
+
+    bucket_config = {
+        (64, 64, 8): 1.0,  # Only videos
+    }
+
+    limit_mm_per_prompt = {"image": 0, "video": 1}
+
+    samples1 = dataset1.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=2,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    samples2 = dataset2.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=2,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        bucket_config=bucket_config,
+        input_len=20,
+        output_len=5,
+    )
+
+    # Video content should be different
+    for s1, s2 in zip(samples1, samples2):
+        mm_data1 = cast(list[dict[str, Any]], s1.multi_modal_data)
+        mm_data2 = cast(list[dict[str, Any]], s2.multi_modal_data)
+
+        assert len(mm_data1) == len(mm_data2) == 1
+
+        url1 = mm_data1[0]["video_url"]["url"]
+        url2 = mm_data2[0]["video_url"]["url"]
+
+        assert url1 != url2  # Different video content
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index fafbef5f37180e9c9c69d4cb260a7b4a9391499f..90d685c966d3eea10daa9efd3ac5c93c116763a5 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -11,9 +11,7 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
 @pytest.fixture(scope="module")
 def server():
-    args = [
-        "--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"
-    ]
+    args = ["--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
@@ -46,6 +44,7 @@ def test_bench_serve(server):
 
     assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
 
+
 @pytest.mark.benchmark
 def test_bench_serve_chat(server):
     command = [
diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py
index b61e51db4fbe46ef69f1975b3e238052ebe7c3b4..a579b59e8af46f1780829a53d7c6e0c7311d293d 100644
--- a/tests/benchmarks/test_throughput_cli.py
+++ b/tests/benchmarks/test_throughput_cli.py
@@ -10,8 +10,18 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 @pytest.mark.benchmark
 def test_bench_throughput():
     command = [
-        "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len",
-        "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
+        "vllm",
+        "bench",
+        "throughput",
+        "--model",
+        MODEL_NAME,
+        "--input-len",
+        "32",
+        "--output-len",
+        "1",
+        "--enforce-eager",
+        "--load-format",
+        "dummy",
     ]
     result = subprocess.run(command, capture_output=True, text=True)
     print(result.stdout)
diff --git a/tests/ci_envs.py b/tests/ci_envs.py
index d16ecce1ef8dd4c81af9b0f72d92463cee164656..f3a54f308cd8e5d9f298e707e9a12601e2d29d35 100644
--- a/tests/ci_envs.py
+++ b/tests/ci_envs.py
@@ -5,13 +5,16 @@ These envs only work for a small part of the tests, fix what you need!
 """
 
 import os
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any
+
+from vllm.envs import maybe_convert_bool
 
 if TYPE_CHECKING:
     VLLM_CI_NO_SKIP: bool = False
-    VLLM_CI_DTYPE: Optional[str] = None
-    VLLM_CI_HEAD_DTYPE: Optional[str] = None
-    VLLM_CI_HF_DTYPE: Optional[str] = None
+    VLLM_CI_DTYPE: str | None = None
+    VLLM_CI_HEAD_DTYPE: str | None = None
+    VLLM_CI_HF_DTYPE: str | None = None
 
 environment_variables: dict[str, Callable[[], Any]] = {
     # A model family has many models with the same architecture.
@@ -24,6 +27,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_CI_HEAD_DTYPE": lambda: os.getenv("VLLM_CI_HEAD_DTYPE", None),
     # Allow changing the head dtype used by transformers in tests
     "VLLM_CI_HF_DTYPE": lambda: os.getenv("VLLM_CI_HF_DTYPE", None),
+    # Allow control over whether tests use enforce_eager
+    "VLLM_CI_ENFORCE_EAGER": lambda: maybe_convert_bool(
+        os.getenv("VLLM_CI_ENFORCE_EAGER", None)
+    ),
 }
 
 
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index f25c367433f41c7d6be8609a31bd88fd03216e4c..fa426190067f650acca09bf598be63df78735829 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -2,18 +2,23 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
+from contextlib import nullcontext
 from copy import deepcopy
-from typing import Callable, Union
 
+import depyf
 from torch import fx
 from torch._ops import OpOverload
+from torch.fx._utils import lazy_format_graph_code
 
 from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.inductor_pass import InductorPass
 from vllm.compilation.pass_manager import with_pattern_match_debug
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
+
+logger = init_logger("vllm.tests.compile.backend")
 
 
 class LazyInitPass(InductorPass):
@@ -23,8 +28,7 @@ class LazyInitPass(InductorPass):
     and then immediately invoke it.
     """
 
-    def __init__(self, pass_cls: type[VllmInductorPass],
-                 vllm_config: VllmConfig):
+    def __init__(self, pass_cls: type[VllmInductorPass], vllm_config: VllmConfig):
         self.pass_cls = pass_cls
         self.vllm_config = weakref.proxy(vllm_config)  # avoid cycle
 
@@ -45,24 +49,34 @@ class TestBackend:
     Inductor config is default-initialized from VllmConfig.CompilationConfig.
     """
 
-    def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
-                                                             None]]):
+    def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
         self.custom_passes = list(passes)
-        compile_config = get_current_vllm_config().compilation_config
-        self.inductor_config = compile_config.inductor_compile_config
-        self.inductor_config['force_disable_caches'] = True
-        self.inductor_config['post_grad_custom_post_pass'] = self.post_pass
+        vllm_config = get_current_vllm_config()
+        compile_config = vllm_config.compilation_config
+        # Deepcopy to allow multiple TestBackend instances to use the same VllmConfig
+        self.inductor_config = deepcopy(compile_config.inductor_compile_config)
+        self.inductor_config["force_disable_caches"] = True
+        self.inductor_config["post_grad_custom_post_pass"] = self.post_pass
+
+        if debug_dump_path := vllm_config.compile_debug_dump_path():
+            logger.debug("Dumping depyf output to %s", debug_dump_path)
+            self.debug_ctx = depyf.prepare_debug(debug_dump_path.as_posix())
+        else:
+            self.debug_ctx = nullcontext()
 
     def __call__(self, graph: fx.GraphModule, example_inputs):
         self.graph_pre_compile = deepcopy(graph)
         from torch._inductor.compile_fx import compile_fx
-        return compile_fx(graph,
-                          example_inputs,
-                          config_patches=self.inductor_config)
+
+        with self.debug_ctx:
+            return compile_fx(
+                graph, example_inputs, config_patches=self.inductor_config
+            )
 
     @with_pattern_match_debug
     def post_pass(self, graph: fx.Graph):
         self.graph_pre_pass = deepcopy(graph)
+        lazy_format_graph_code("graph_pre_pass", graph.owning_module)
 
         VllmInductorPass.dump_prefix = 0
         for pass_ in self.custom_passes:
@@ -72,6 +86,7 @@ class TestBackend:
         VllmInductorPass.dump_prefix = None
 
         self.graph_post_pass = deepcopy(graph)
+        lazy_format_graph_code("graph_post_pass", graph.owning_module)
         # assign by reference, will reflect the final state of the graph
         self.final_graph = graph
 
@@ -82,8 +97,7 @@ class TestBackend:
             assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph"
             assert num_pre > num_post, f"All nodes remain for op {op.name()}"
             if fully_replaced:
-                assert num_post == 0, \
-                    f"Unexpected op {op.name()} in post-pass graph"
+                assert num_post == 0, f"Unexpected op {op.name()} in post-pass graph"
 
     def check_after_ops(self, ops: Sequence[OpOverload]):
         for op in ops:
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py
index b02c1b565671da49cf5f9f5323903b8c7c216263..c6d4b5272dbcffdded3286825cc348f34699f695 100644
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -3,15 +3,15 @@
 import contextlib
 import os
 import weakref
-from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 
 from tests.utils import wait_for_gpu_memory_to_clear
+from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 
 @contextlib.contextmanager
@@ -33,121 +33,44 @@ def temporary_environ(env_vars):
                 os.environ[k] = v
 
 
-@dataclass
-class BackendConfig:
-    name: str
-    env_vars: dict
-    comp_config: dict
-    specific_gpu_arch: Optional[tuple] = None
-
-
-# Define all backend configurations of full cudagraph to be tested
-backend_configs = {
-    # FA3 on Hopper
-    "FA3":
-    BackendConfig(name="FA3",
-                  env_vars={
-                      "VLLM_FLASH_ATTN_VERSION": "3",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FlashMLA on Hopper
-    "FlashMLA":
-    BackendConfig(name="FlashMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FlashAttention MLA on Hopper
-    "FlashAttentionMLA":
-    BackendConfig(name="FlashAttentionMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_DECODE_ONLY",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # Cutlass MLA on Blackwell
-    "CutlassMLA":
-    BackendConfig(
-        name="CutlassMLA",
-        env_vars={
-            "VLLM_USE_V1": "1",
-            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-            "FORCE_NUM_KV_SPLITS":
-            "1",  # TODO: remove this when hang issue is fixed
-        },
-        comp_config={
-            "cudagraph_mode": "FULL_AND_PIECEWISE",
-            "cudagraph_capture_sizes": [16, 32, 64, 128, 256, 512],
-        },
-        specific_gpu_arch=(10, 0)),
-    # FA2
-    "FA2":
-    BackendConfig(name="FA2",
-                  env_vars={
-                      "VLLM_FLASH_ATTN_VERSION": "2",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL",
-                  }),
-    # Triton Attention
-    "TritonAttn":
-    BackendConfig(name="TritonAttn",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
-                  comp_config={
-                      "cudagraph_mode": "FULL",
-                  }),
-    # FlashInfer
-    "FlashInfer":
-    BackendConfig(name="FlashInfer",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-}
-
-test_params_full_cudagraph = []
+model_backends_full_cudagraph = []
 
 # deepseek-ai/DeepSeek-V2-Lite with MLA
 MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
 for mla_backend in MLA_backends:
-    test_params_full_cudagraph.append(
-        pytest.param(
-            ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])))
+    model_backends_full_cudagraph.append(
+        ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
+    )
 
 # Qwen/Qwen2-1.5B-Instruct with other backends
 other_backend_configs = [
     backend_configs[c] for c in backend_configs if c not in MLA_backends
 ]
 for backend_config in other_backend_configs:
-    test_params_full_cudagraph.append(
-        pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)))
+    model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))
 
 
 @pytest.fixture(scope="class")
 def llm_pair(request):
-    model, backend_config = request.param
+    model, backend_config, use_inductor_graph_partition = request.param
+    backend_config.comp_config["use_inductor_graph_partition"] = (
+        use_inductor_graph_partition
+    )
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition only supported in torch>=2.9")
 
     # Dynamically skip test if GPU capability is not met
-    if backend_config.specific_gpu_arch and backend_config.specific_gpu_arch\
-        != current_platform.get_device_capability():
+    if (
+        backend_config.specific_gpu_arch
+        and backend_config.specific_gpu_arch != current_platform.get_device_capability()
+    ):
         if backend_config.specific_gpu_arch == (9, 0):
             pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
         elif backend_config.specific_gpu_arch == (10, 0):
             pytest.skip("Only Blackwell GPUs support Cutlass MLA")
 
     env_vars = {
-        "VLLM_USE_V1": "1",
         # Force native sampler to avoid potential nondeterminism in FlashInfer
         # when per-request generators are not used in V1.
         "VLLM_USE_FLASHINFER_SAMPLER": "0",
@@ -160,8 +83,7 @@ def llm_pair(request):
             trust_remote_code=True,
             max_model_len=1024,
             max_num_seqs=128,
-            compilation_config=\
-                CompilationConfig(**backend_config.comp_config),
+            compilation_config=CompilationConfig(**backend_config.comp_config),
             generation_config="vllm",
             seed=42,
         )
@@ -187,7 +109,15 @@ def llm_pair(request):
     )
 
 
-@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
+@pytest.mark.parametrize(
+    "llm_pair",
+    [
+        pytest.param((model, backend_config, use_inductor_graph_partition))
+        for model, backend_config in model_backends_full_cudagraph
+        for use_inductor_graph_partition in [True, False]
+    ],
+    indirect=True,
+)
 class TestFullCUDAGraph:
     """
     Use a class such that an llm pair is constructed once for all
@@ -197,20 +127,22 @@ class TestFullCUDAGraph:
     meaning there would be multiple LLM instances hogging memory simultaneously.
     """
 
-    @pytest.mark.parametrize(("batch_size", "max_tokens"), [
-        (1, 10),
-        (7, 10),
-        (16, 10),
-        (25, 10),
-        (32, 10),
-        (45, 10),
-        (64, 10),
-        (123, 10),
-        (8, 5),
-        (8, 30),
-    ])
-    def test_full_cudagraph(self, batch_size, max_tokens,
-                            llm_pair: tuple[LLM, LLM]):
+    @pytest.mark.parametrize(
+        ("batch_size", "max_tokens"),
+        [
+            (1, 10),
+            (7, 10),
+            (16, 10),
+            (25, 10),
+            (32, 10),
+            (45, 10),
+            (64, 10),
+            (123, 10),
+            (8, 5),
+            (8, 30),
+        ],
+    )
+    def test_full_cudagraph(self, batch_size, max_tokens, llm_pair: tuple[LLM, LLM]):
         """
         Test various batch sizes and max_tokens to ensure that the
         full cudagraph compilation works for padded cases too.
@@ -221,26 +153,33 @@ class TestFullCUDAGraph:
         prompts = ["the quick brown fox"] * batch_size
         # Use purely greedy decoding to avoid top-p truncation sensitivity
         # that can amplify tiny numeric differences across runtimes.
-        sampling_params = SamplingParams(temperature=0.0,
-                                         max_tokens=max_tokens,
-                                         top_p=1.0)
+        sampling_params = SamplingParams(
+            temperature=0.0, max_tokens=max_tokens, top_p=1.0
+        )
 
         piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
         full_responses = full_cudagraph_llm.generate(prompts, sampling_params)
 
         # Check that all responses are the same
-        for piecewise_res, full_res in zip(piecewise_responses,
-                                           full_responses):
-            assert piecewise_res.outputs[0].text.lower() == \
-                full_res.outputs[0].text.lower()
+        for piecewise_res, full_res in zip(piecewise_responses, full_responses):
+            assert (
+                piecewise_res.outputs[0].text.lower()
+                == full_res.outputs[0].text.lower()
+            )
 
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_full_cudagraph_with_invalid_backend():
-    with temporary_environ({
-            "VLLM_USE_V1": "1",
-            "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION"
-            # Flex_Attention is not supported with full cuda graph
-    }), pytest.raises(RuntimeError):
-        LLM(model="Qwen/Qwen2-1.5B-Instruct",
-            compilation_config=CompilationConfig(cudagraph_mode="FULL"))
+    with (
+        temporary_environ(
+            {
+                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
+                # Flex_Attention is not supported with full cuda graph
+            }
+        ),
+        pytest.raises(RuntimeError),
+    ):
+        LLM(
+            model="Qwen/Qwen2-1.5B-Instruct",
+            compilation_config=CompilationConfig(cudagraph_mode="FULL"),
+        )
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
index 5cfebfce9ea2a8729a2c91205c44fa2337fefcc7..6d3788af9de0d8f8456dbc0bfd3581597e1ebd16 100644
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -5,16 +5,24 @@ Test (piecewise) compilation with a simple model where multiple submodules
 are compiled and graph captured separately.
 """
 
+import pytest
 import torch
 from torch import nn
 
 from vllm.compilation.backends import set_model_tag
 from vllm.compilation.counter import compilation_counter
-from vllm.compilation.decorators import (ignore_torch_compile,
-                                         support_torch_compile)
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         VllmConfig, set_current_vllm_config)
+from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    CUDAGraphMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+from ...utils import create_new_process_for_each_test
 
 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
@@ -27,12 +35,7 @@ RANDOM_SEED = 0
 
 @support_torch_compile
 class ParentModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
         super().__init__()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -40,7 +43,6 @@ class ParentModel(nn.Module):
 
 
 class Attention(nn.Module):
-
     def __init__(self, mlp_size: int, hidden_size: int) -> None:
         super().__init__()
         self.pre_attn = nn.Linear(mlp_size, hidden_size, bias=False)
@@ -51,17 +53,21 @@ class Attention(nn.Module):
         nn.init.xavier_normal_(
             self.pre_attn.weight.data,
             generator=torch.Generator().manual_seed(RANDOM_SEED),
-            gain=0.001)
+            gain=0.001,
+        )
         nn.init.xavier_normal_(
             self.post_attn.weight.data,
             generator=torch.Generator().manual_seed(RANDOM_SEED),
-            gain=0.001)
+            gain=0.001,
+        )
 
     def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor:
         x_f32 = x.float()
-        return (x_f32 * torch.rsqrt(
-            torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6) *
-                self.rms_norm_weight).to(x.dtype)
+        return (
+            x_f32
+            * torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6)
+            * self.rms_norm_weight
+        ).to(x.dtype)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.pre_attn(x)
@@ -76,14 +82,15 @@ class Attention(nn.Module):
 
 @support_torch_compile
 class CompiledAttention(nn.Module):
-
-    def __init__(self,
-                 *,
-                 mlp_size: int,
-                 hidden_size: int,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        *,
+        mlp_size: int,
+        hidden_size: int,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
         super().__init__()
         self.attn = Attention(mlp_size, hidden_size)
 
@@ -93,21 +100,21 @@ class CompiledAttention(nn.Module):
 
 @support_torch_compile
 class CompiledAttentionTwo(CompiledAttention):
-
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.attn(x) + x
 
 
 @ignore_torch_compile
 class SimpleModelWithTwoGraphs(ParentModel):
-
-    def __init__(self,
-                 *,
-                 mlp_size: int,
-                 hidden_size: int,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        *,
+        mlp_size: int,
+        hidden_size: int,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         # Test will fail without set_model_tag here with error:
         # "ValueError: too many values to unpack (expected 3)"
@@ -142,118 +149,174 @@ class SimpleModelWithTwoGraphs(ParentModel):
 
 
 @torch.inference_mode
-def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor,
-              cudagraph_runtime_mode: CUDAGraphMode):
+def run_model(
+    vllm_config: VllmConfig,
+    model: nn.Module,
+    inputs: torch.Tensor,
+    cudagraph_runtime_mode: CUDAGraphMode,
+):
     with set_forward_context({}, vllm_config=vllm_config):
         # warmup for the model with cudagraph_mode NONE
         model(inputs)
 
         # simulate cudagraphs capturing
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=2, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             model(inputs[:2])
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=1, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=1,
+            ),
+        ):
             model(inputs[:1])
 
         # simulate cudagraphs replay
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=2, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             output = model(inputs[:2])
 
         output = output.cpu()
         return output.cpu()
 
 
-def test_multi_graph_piecewise_compile_outputs_equal():
+@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+@create_new_process_for_each_test("spawn")
+def test_multi_graph_piecewise_compile(
+    use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch
+):
+    # Set the environment variable for this test
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
     outputs = []
 
-    # piecewise compile
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=True,
-        splitting_ops=["silly.attention"],
-        cudagraph_capture_sizes=[1, 2],
-    ))
+    # vllmcompile compile
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+            splitting_ops=["silly::attention"],
+            cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
+        )
+    )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
     with set_current_vllm_config(vllm_config):
-        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
-                                         hidden_size=HIDDEN_SIZE,
-                                         vllm_config=vllm_config,
-                                         prefix='').eval().cuda()
+        model = (
+            SimpleModelWithTwoGraphs(
+                mlp_size=MLP_SIZE,
+                hidden_size=HIDDEN_SIZE,
+                vllm_config=vllm_config,
+                prefix="",
+            )
+            .eval()
+            .cuda()
+        )
 
     # Pre-allocate memory for CUDAGraph which expects
     # static tensor addresses
     inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
 
+    if use_inductor_graph_partition:
+        # Splitting happens at Inductor lowering level,
+        # total piecewise fx graphs is equal to total graphs
+        num_piecewise_fx = 2
+        num_piecewise_capturable_fx = 2
+    else:
+        # attn_one, attn_two each has 3 piecewise graphs
+        # (pre attn, post attn, silly_attention) each
+        num_piecewise_fx = 6
+        # attn_one, attn_two has pre attn and post attn each, total=4
+        num_piecewise_capturable_fx = 4
+
     with compilation_counter.expect(
-            num_graphs_seen=2,  # two graphs for the model
-            num_piecewise_graphs_seen=6,
-            # attn_one, attn_two each has 3 piecewise graphs
-            # (pre attn, post attn, silly_attention) each
-            num_piecewise_capturable_graphs_seen=4,
-            # attn_one, attn_two has pre attn and post attn each, total=4
-            num_backend_compilations=4,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_captured=8,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=2,  # two graphs for the model
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        num_cudagraph_captured=8,  # num_cudagraph_sizes * num_partitions
     ):
-        outputs.append(
-            run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
+        outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
 
     # no compile or cudagraph
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.NO_COMPILATION, ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.NONE,
+        )
+    )
     cudagraph_runtime_mode = CUDAGraphMode.NONE
 
     with set_current_vllm_config(vllm_config):
-        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
-                                         hidden_size=HIDDEN_SIZE,
-                                         vllm_config=vllm_config,
-                                         prefix='').eval().cuda()
+        model = (
+            SimpleModelWithTwoGraphs(
+                mlp_size=MLP_SIZE,
+                hidden_size=HIDDEN_SIZE,
+                vllm_config=vllm_config,
+                prefix="",
+            )
+            .eval()
+            .cuda()
+        )
 
     with compilation_counter.expect(
-            num_graphs_seen=0,
-            num_piecewise_graphs_seen=0,
-            num_piecewise_capturable_graphs_seen=0,
-            num_backend_compilations=0,
-            num_cudagraph_captured=0,
+        num_graphs_seen=0,
+        num_piecewise_graphs_seen=0,
+        num_piecewise_capturable_graphs_seen=0,
+        num_backend_compilations=0,
+        num_cudagraph_captured=0,
     ):
-        outputs.append(
-            run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
+        outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
 
     # piecewise compile without CUDA graph
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=False,
-        splitting_ops=["silly.attention"],
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            cudagraph_mode=CUDAGraphMode.NONE,
+            splitting_ops=["silly::attention"],
+            use_inductor_graph_partition=use_inductor_graph_partition,
+        )
+    )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
     with set_current_vllm_config(vllm_config):
-        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
-                                         hidden_size=HIDDEN_SIZE,
-                                         vllm_config=vllm_config,
-                                         prefix='').eval().cuda()
+        model = (
+            SimpleModelWithTwoGraphs(
+                mlp_size=MLP_SIZE,
+                hidden_size=HIDDEN_SIZE,
+                vllm_config=vllm_config,
+                prefix="",
+            )
+            .eval()
+            .cuda()
+        )
 
     with compilation_counter.expect(
-            num_graphs_seen=2,
-            num_piecewise_graphs_seen=6,
-            num_piecewise_capturable_graphs_seen=4,
-            num_backend_compilations=4,
-            num_cudagraph_captured=0,  # no cudagraph captured
+        num_graphs_seen=2,
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        num_cudagraph_captured=0,  # no cudagraph captured
     ):
-        outputs.append(
-            run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
+        outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
 
     # Generally don't expect outputs with and without inductor
     # to be bitwise equivalent
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 41055f431569c1fa0df150f21c0b4f4bf57d40ea..e258133ab50a7571029d2020cefe2d8f8ba27e58 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -11,11 +11,17 @@ from torch import nn
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         VllmConfig, set_current_vllm_config)
-from vllm.envs import VLLM_USE_V1
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    CUDAGraphMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import is_torch_equal_or_newer
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+from ...utils import create_new_process_for_each_test
 
 # This import automatically registers `torch.ops.silly.attention`
 from ..silly_attention import get_global_counter, reset_global_counter
@@ -23,12 +29,7 @@ from ..silly_attention import get_global_counter, reset_global_counter
 
 @support_torch_compile
 class SillyModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
         super().__init__()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -60,53 +61,64 @@ def _run_simple_model(
     expected_num_backend_compilations,
     expected_num_cudagraph_captured,
 ):
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=True,
-        use_inductor=use_inductor,
-        splitting_ops=splitting_ops,
-        use_inductor_graph_partition=use_inductor_graph_partition,
-        cudagraph_copy_inputs=True,
-        cudagraph_capture_sizes=[1, 2],
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor=use_inductor,
+            splitting_ops=splitting_ops,
+            use_inductor_graph_partition=use_inductor_graph_partition,
+            cudagraph_copy_inputs=True,
+            cudagraph_capture_sizes=[1, 2],
+        )
+    )
     with set_current_vllm_config(vllm_config):
-        model = SillyModel(vllm_config=vllm_config, prefix='')
+        model = SillyModel(vllm_config=vllm_config, prefix="")
 
     inputs = torch.randn(100).cuda()
 
-    with compilation_counter.expect(
+    with (
+        compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
             num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
-            num_piecewise_capturable_graphs_seen=
-            expected_num_piecewise_capturable_graphs_seen,
+            num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
             num_backend_compilations=expected_num_backend_compilations,
             num_cudagraph_captured=expected_num_cudagraph_captured,
-    ), set_forward_context(None,
-                           vllm_config=vllm_config):  # background context
+        ),
+        set_forward_context(None, vllm_config=vllm_config),
+    ):  # background context
         # warm up with background context
         model(inputs)
 
         # capturing/replaying should under context of cudagraph dispatching
         with set_forward_context(
-                None,
-                vllm_config=vllm_config,
-                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-                batch_descriptor=BatchDescriptor(num_tokens=2, )):
+            None,
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             model(torch.randn(2).cuda())
         with set_forward_context(
-                None,
-                vllm_config=vllm_config,
-                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-                batch_descriptor=BatchDescriptor(num_tokens=1, )):
+            None,
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=1,
+            ),
+        ):
             model(torch.randn(1).cuda())
 
         input = torch.zeros(2).cuda()
         reset_global_counter()
         with set_forward_context(
-                None,
-                vllm_config=vllm_config,
-                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-                batch_descriptor=BatchDescriptor(num_tokens=2, )):
+            None,
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             output = model(input)
         assert get_global_counter() == 2
         assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@@ -114,42 +126,42 @@ def _run_simple_model(
 
 @pytest.mark.parametrize("use_inductor", [True, False])
 @torch.inference_mode()
+@create_new_process_for_each_test("spawn")
 def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
     _run_simple_model(
-        splitting_ops=["silly.attention"],
+        splitting_ops=["silly::attention"],
         use_inductor_graph_partition=False,
         use_inductor=use_inductor,
-        expected_num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
-        expected_num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-        expected_num_backend_compilations=
-        3,  # num_piecewise_capturable_graphs_seen
-        expected_num_cudagraph_captured=
-        6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # 2 * num_layers + 1
+        expected_num_piecewise_graphs_seen=5,
+        # 1 + num_layers
+        expected_num_piecewise_capturable_graphs_seen=3,
+        # num_piecewise_capturable_graphs_seen
+        expected_num_backend_compilations=3,
+        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        expected_num_cudagraph_captured=6,
     )
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
-def test_simple_inductor_graph_partition(splitting_ops):
-    assert VLLM_USE_V1
+def test_simple_inductor_graph_partition(monkeypatch):
     if not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
+    # disable compile cache so that we run separately for different splitting_ops
+    # and get the expected number of cudagraphs captured.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
 
     _run_simple_model(
-        # inductor graph partition automatically resets splitting_ops
-        # to be an empty list
-        splitting_ops=splitting_ops,
+        splitting_ops=["silly::attention"],
         use_inductor_graph_partition=True,
         use_inductor=True,
-        expected_num_piecewise_graphs_seen=
-        1,  # since not splitting at fx graph level
-        expected_num_piecewise_capturable_graphs_seen=
-        1,  # since not splitting at fx graph level
-        expected_num_backend_compilations=
-        1,  # since not splitting at fx graph level
-        expected_num_cudagraph_captured=
-        6,  # inductor graph partition still captures 6
-        # graph, same as fx graph partition.
+        # Since not splitting at fx graph level
+        expected_num_piecewise_graphs_seen=1,
+        # Since not splitting at fx graph level
+        expected_num_piecewise_capturable_graphs_seen=1,
+        # Since not splitting at fx graph level
+        expected_num_backend_compilations=1,
+        # Inductor graph partition still captures 6 graph, same as fx graph partition
+        expected_num_cudagraph_captured=6,
     )
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index cba7517647e51294226bf291cf0990a04ed7ba16..915fbc6ce7f3922d00db09391ecfbaba5e66a1ca 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -8,8 +8,10 @@ This is a tractable model, the weights and computation are specially designed
 if the config `tractable_init` is set to True. Otherwise, the weights are
 initialized randomly with a fixed seed.
 """
+
+from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 import torch
@@ -17,9 +19,17 @@ from torch import nn
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         VllmConfig, set_current_vllm_config)
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    CUDAGraphMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+from ...utils import create_new_process_for_each_test
 
 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
@@ -43,15 +53,14 @@ class LlamaConfig:
             factors.append((k, v))
         factors.sort()
         import hashlib
-        return hashlib.md5(str(factors).encode(),
-                           usedforsecurity=False).hexdigest()
+
+        return hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
 
     def __post_init__(self):
         assert self.mlp_size >= self.hidden_size
 
 
 class LlamaMLP(nn.Module):
-
     def __init__(self, config: LlamaConfig) -> None:
         super().__init__()
         self.gate_up_projection = nn.Linear(
@@ -66,31 +75,31 @@ class LlamaMLP(nn.Module):
         )
 
         if config.tractable_init:
-            nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size])
-            nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:])
+            nn.init.eye_(self.gate_up_projection.weight.data[: config.mlp_size])
+            nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size :])
             nn.init.eye_(self.down_projection.weight.data)
         else:
-            nn.init.xavier_normal_(self.gate_up_projection.weight.data,
-                                   generator=torch.Generator().manual_seed(
-                                       config.random_seed),
-                                   gain=0.001)
-            nn.init.xavier_normal_(self.down_projection.weight.data,
-                                   generator=torch.Generator().manual_seed(
-                                       config.random_seed),
-                                   gain=0.001)
+            nn.init.xavier_normal_(
+                self.gate_up_projection.weight.data,
+                generator=torch.Generator().manual_seed(config.random_seed),
+                gain=0.001,
+            )
+            nn.init.xavier_normal_(
+                self.down_projection.weight.data,
+                generator=torch.Generator().manual_seed(config.random_seed),
+                gain=0.001,
+            )
 
     def forward(self, x):
         # for tractable_init and positive input, this is
         # essentially an elementwise-square
         x = self.gate_up_projection(x)
-        x = x[:, :x.size(1) // 2] * torch.nn.functional.relu(
-            x[:, x.size(1) // 2:])
+        x = x[:, : x.size(1) // 2] * torch.nn.functional.relu(x[:, x.size(1) // 2 :])
         x = self.down_projection(x)
         return x
 
 
 class LlamaAttention(nn.Module):
-
     def __init__(self, config: LlamaConfig) -> None:
         super().__init__()
         self.qkv_projection = nn.Linear(
@@ -106,21 +115,25 @@ class LlamaAttention(nn.Module):
         )
 
         if config.tractable_init:
-            nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size])
-            nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 *
-                                                         config.hidden_size])
-            nn.init.eye_(self.qkv_projection.weight.data[2 *
-                                                         config.hidden_size:])
+            nn.init.eye_(self.qkv_projection.weight.data[: config.hidden_size])
+            nn.init.eye_(
+                self.qkv_projection.weight.data[
+                    config.hidden_size : 2 * config.hidden_size
+                ]
+            )
+            nn.init.eye_(self.qkv_projection.weight.data[2 * config.hidden_size :])
             nn.init.eye_(self.output_projection.weight.data)
         else:
-            nn.init.xavier_normal_(self.qkv_projection.weight.data,
-                                   generator=torch.Generator().manual_seed(
-                                       config.random_seed),
-                                   gain=0.001)
-            nn.init.xavier_normal_(self.output_projection.weight.data,
-                                   generator=torch.Generator().manual_seed(
-                                       config.random_seed),
-                                   gain=0.001)
+            nn.init.xavier_normal_(
+                self.qkv_projection.weight.data,
+                generator=torch.Generator().manual_seed(config.random_seed),
+                gain=0.001,
+            )
+            nn.init.xavier_normal_(
+                self.output_projection.weight.data,
+                generator=torch.Generator().manual_seed(config.random_seed),
+                gain=0.001,
+            )
 
     def forward(
         self,
@@ -144,7 +157,6 @@ class LlamaAttention(nn.Module):
 
 
 class LlamaDecoderLayer(nn.Module):
-
     def __init__(self, config: LlamaConfig) -> None:
         super().__init__()
         self.self_attention = LlamaAttention(config)
@@ -154,7 +166,7 @@ class LlamaDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         For tractable computation:
@@ -164,7 +176,7 @@ class LlamaDecoderLayer(nn.Module):
         - if residual is not None, the outputs are:
             - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3
             - hidden_states = (residual + 1) ** 2
-        """ # noqa
+        """  # noqa
         if residual is None:
             residual = hidden_states
             hidden_states = hidden_states + 1
@@ -173,8 +185,9 @@ class LlamaDecoderLayer(nn.Module):
             residual = hidden_states
             hidden_states = hidden_states + 1
 
-        hidden_states = self.self_attention(positions=positions,
-                                            hidden_states=hidden_states)
+        hidden_states = self.self_attention(
+            positions=positions, hidden_states=hidden_states
+        )
 
         hidden_states = hidden_states + residual
         residual = hidden_states
@@ -186,27 +199,29 @@ class LlamaDecoderLayer(nn.Module):
 
 @support_torch_compile
 class LlamaModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 config: LlamaConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        config: LlamaConfig,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
         super().__init__()
         self.embedding_tokens = nn.Embedding(
             num_embeddings=config.vocab_size,
             embedding_dim=config.hidden_size,
         )
         self.layers = nn.ModuleList(
-            [LlamaDecoderLayer(config) for _ in range(config.num_layers)])
+            [LlamaDecoderLayer(config) for _ in range(config.num_layers)]
+        )
 
         # this is the initial value of the hidden states
         self.embedding_tokens.weight.data.fill_(config.init_value)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
     ) -> torch.Tensor:
         hidden_states = self.embedding_tokens(input_ids)
@@ -216,168 +231,195 @@ class LlamaModel(nn.Module):
         return hidden_states
 
 
-def tractable_computation(input_ids: torch.Tensor,
-                          positions: torch.Tensor,
-                          config: LlamaConfig,
-                          init_value: float = 1.0) -> torch.Tensor:
-    hidden_states = torch.ones(input_ids.size(0),
-                               config.hidden_size,
-                               device=input_ids.device,
-                               dtype=input_ids.dtype) * init_value
+def tractable_computation(
+    input_ids: torch.Tensor,
+    positions: torch.Tensor,
+    config: LlamaConfig,
+    init_value: float = 1.0,
+) -> torch.Tensor:
+    hidden_states = (
+        torch.ones(
+            input_ids.size(0),
+            config.hidden_size,
+            device=input_ids.device,
+            dtype=input_ids.dtype,
+        )
+        * init_value
+    )
 
     # first layer
     residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
-    hidden_states = (residual + 1)**2
+    hidden_states = (residual + 1) ** 2
 
     # following layers
     for _ in range(config.num_layers - 1):
         hidden_states = hidden_states + residual
         residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
-        hidden_states = (residual + 1)**2
+        hidden_states = (residual + 1) ** 2
 
     return hidden_states
 
 
 @torch.inference_mode
-def run_model(llama_config,
-              use_compile: bool,
-              use_inductor: bool,
-              split_attn: bool = False) -> torch.Tensor:
-
-    if use_compile:
-        compilation_config = CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
-            use_cudagraph=True,
-            use_inductor=use_inductor,
-            cudagraph_capture_sizes=[1, 2],
-        )
-        if split_attn:
-            compilation_config.splitting_ops = ["silly.attention"]
-        cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
-    else:
-        compilation_config = CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION, )
-        cudagraph_runtime_mode = CUDAGraphMode.NONE
-
-    vllm_config = VllmConfig(compilation_config=compilation_config,
-                             additional_config=llama_config)
+def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
+    # Start with a fresh copy to make sure there's no cache dir sharing
+    compile_config = deepcopy(compile_config)
+    cudagraph_runtime_mode = compile_config.cudagraph_mode
+
+    vllm_config = VllmConfig(
+        compilation_config=compile_config, additional_config=llama_config
+    )
     with set_current_vllm_config(vllm_config):
-        model = LlamaModel(config=llama_config,
-                           vllm_config=vllm_config,
-                           prefix="").eval().cuda()
+        model = (
+            LlamaModel(config=llama_config, vllm_config=vllm_config, prefix="")
+            .eval()
+            .cuda()
+        )
 
-    with set_forward_context({},
-                             vllm_config=vllm_config):  # background context
+    with set_forward_context({}, vllm_config=vllm_config):  # background context
         B = 16  # max batch size
-        input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
+        input_ids = torch.randint(0, llama_config.vocab_size, (B,)).cuda()
         positions = torch.arange(B).cuda()
 
         # warmup for the model with cudagraph_mode NONE
         model(input_ids, positions)
 
         # simulate cudagraphs capturing
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=2, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             model(input_ids[:2], positions[:2])
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=1, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=1,
+            ),
+        ):
             model(input_ids[:1], positions[:1])
 
         input_ids[:2].zero_()
         # simulate cudagraphs replay
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=2, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             output = model(input_ids[:2], positions[:2])
 
         output = output.cpu()
 
         if llama_config.tractable_init:
-            expected_output = tractable_computation(input_ids[:2],
-                                                    positions[:2],
-                                                    llama_config).cpu()
+            expected_output = tractable_computation(
+                input_ids[:2], positions[:2], llama_config
+            ).cpu()
 
             assert torch.allclose(output, expected_output)
         else:
             return output.cpu()
 
 
-@pytest.mark.parametrize("use_inductor", [True, False])
-def test_toy_llama(use_inductor: bool):
+@pytest.mark.parametrize(
+    "backend, use_inductor_graph_partition",
+    [
+        ("eager", False),  # No inductor
+        ("inductor", False),  # Inductor, Dynamo partition
+        ("inductor", True),  # Inductor, Inductor partition
+    ],
+)
+@create_new_process_for_each_test("spawn")
+def test_toy_llama(
+    backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
+):
+    # We disable the vLLM compile cache into a new tmp dir for 1 reason:
+    # 1. To make sure we can properly track the number of Inductor compilations.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition only supported in torch>=2.9")
+
     # compare output with and without piecewise compilation
 
-    llama_config = LlamaConfig(hidden_size=128,
-                               mlp_size=256,
-                               vocab_size=128,
-                               num_layers=12)
+    llama_config = LlamaConfig(
+        hidden_size=128, mlp_size=256, vocab_size=128, num_layers=12
+    )
+
+    tractable_config = LlamaConfig(
+        hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
+    )
+
+    compile_config_no_compile = CompilationConfig(
+        mode=CompilationMode.NONE,
+        cudagraph_mode=CUDAGraphMode.NONE,
+        backend="eager",
+    )
+
+    compile_config_no_split = CompilationConfig(
+        mode=CompilationMode.VLLM_COMPILE,
+        use_inductor_graph_partition=use_inductor_graph_partition,
+        cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        backend=backend,
+        cudagraph_capture_sizes=[1, 2],
+    )
 
-    tractable_config = LlamaConfig(hidden_size=128,
-                                   mlp_size=256,
-                                   vocab_size=128,
-                                   num_layers=2,
-                                   tractable_init=True)
+    compile_config_split = deepcopy(compile_config_no_split)
+    compile_config_split.splitting_ops = ["silly::attention"]
 
     outputs = []
     with compilation_counter.expect(
-            num_graphs_seen=0,
-            num_piecewise_graphs_seen=0,
-            num_piecewise_capturable_graphs_seen=0,
-            num_backend_compilations=0,
-            num_cudagraph_captured=0,
+        num_graphs_seen=0,
+        num_piecewise_graphs_seen=0,
+        num_piecewise_capturable_graphs_seen=0,
+        num_backend_compilations=0,
+        num_cudagraph_captured=0,
     ):
-        outputs.append(
-            run_model(llama_config, use_inductor=False, use_compile=False))
-    run_model(tractable_config, use_inductor=False, use_compile=False)
+        outputs.append(run_model(llama_config, compile_config_no_compile))
 
-    if use_inductor:
+    run_model(tractable_config, compile_config_no_compile)
+
+    if backend == "inductor":
         kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
     else:
         kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
 
     with compilation_counter.expect(
-            num_graphs_seen=1,  # one graph for the model
-            num_piecewise_graphs_seen=1,
-            num_piecewise_capturable_graphs_seen=1,
-            num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_captured=
-            2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-            **kwargs,
+        num_graphs_seen=1,  # one graph for the model
+        num_piecewise_graphs_seen=1,
+        num_piecewise_capturable_graphs_seen=1,
+        num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
+        num_cudagraph_captured=2,
+        **kwargs,
     ):
-        outputs.append(
-            run_model(llama_config,
-                      use_inductor=use_inductor,
-                      use_compile=True))
-    run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
+        outputs.append(run_model(llama_config, compile_config_no_split))
+
+    run_model(tractable_config, compile_config_no_split)
+
+    if use_inductor_graph_partition:
+        num_piecewise_fx = 1
+        num_piecewise_capturable_fx = 1
+    else:
+        num_piecewise_fx = 2 * llama_config.num_layers + 1
+        num_piecewise_capturable_fx = 1 + llama_config.num_layers
 
     with compilation_counter.expect(
-            num_graphs_seen=1,  # one graph for the model
-            num_piecewise_graphs_seen=2 * llama_config.num_layers +
-            1,  # 2 * num_layers + 1
-            num_piecewise_capturable_graphs_seen=1 +
-            llama_config.num_layers,  # 1 + num_layers
-            num_backend_compilations=1 +
-            llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_captured=2 *
-        (1 + llama_config.num_layers
-         ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=1,  # one graph for the model
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        # num_cudagraph_sizes * num_partitions
+        num_cudagraph_captured=2 * (1 + llama_config.num_layers),
     ):
-        outputs.append(
-            run_model(llama_config,
-                      use_inductor=use_inductor,
-                      use_compile=True,
-                      split_attn=True))
-    run_model(tractable_config,
-              use_inductor=use_inductor,
-              use_compile=True,
-              split_attn=True)
+        outputs.append(run_model(llama_config, compile_config_split))
+    run_model(tractable_config, compile_config_split)
 
     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
@@ -388,17 +430,15 @@ def benchmark():
     from triton.testing import do_bench
 
     # similar to llama 3.1-8B
-    llama_config = LlamaConfig(hidden_size=4096,
-                               mlp_size=14336,
-                               vocab_size=128 * 1024,
-                               num_layers=32)
+    llama_config = LlamaConfig(
+        hidden_size=4096, mlp_size=14336, vocab_size=128 * 1024, num_layers=32
+    )
 
     # a tiny model to measure the overhead
     # of piecewise cudagraph
-    llama_config = LlamaConfig(hidden_size=40,
-                               mlp_size=80,
-                               vocab_size=128,
-                               num_layers=2)
+    llama_config = LlamaConfig(
+        hidden_size=40, mlp_size=80, vocab_size=128, num_layers=2
+    )
 
     cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]
 
@@ -411,25 +451,27 @@ def benchmark():
     for piecewise in [False, True]:
         if piecewise:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
-                use_cudagraph=True,
-                splitting_ops=["silly.attention"],
+                mode=CompilationMode.VLLM_COMPILE,
+                splitting_ops=["silly::attention"],
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
         else:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
 
         vllm_config = VllmConfig(compilation_config=compilation_config)
         with set_current_vllm_config(vllm_config):
-            model = LlamaModel(config=llama_config,
-                               vllm_config=vllm_config,
-                               prefix="").eval().cuda().to(torch.bfloat16)
+            model = (
+                LlamaModel(config=llama_config, vllm_config=vllm_config, prefix="")
+                .eval()
+                .cuda()
+                .to(torch.bfloat16)
+            )
 
         B = 256  # max batch size
-        input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
+        input_ids = torch.randint(0, llama_config.vocab_size, (B,)).cuda()
         positions = torch.arange(B).cuda().to(torch.bfloat16)
 
         graphs = {}
@@ -451,22 +493,31 @@ def benchmark():
                 # and use it later, because it will look up the name `b` in the
                 # enclosing scope, and the value of `b` will always be 256.
                 # it is fine here, because we only use the lambda function once.
-                runtime = do_bench(lambda: graphs[b][0]  # noqa
-                                   (input_ids[:b], positions[:b]))  # noqa
+                runtime = do_bench(
+                    lambda: graphs[b][0](  # noqa
+                        input_ids[:b],  # noqa
+                        positions[:b],  # noqa
+                    )
+                )
                 piecewise_cudagraph_time[b] = runtime
             else:
                 runtime = do_bench(lambda: graphs[b][0].replay())  # noqa
-                eager_runtime = do_bench(
-                    lambda: model(input_ids[:b], positions[:b]))  # noqa
+                eager_runtime = do_bench(lambda: model(input_ids[:b], positions[:b]))  # noqa
                 full_cudagraph_time[b] = runtime
                 eager_time[b] = eager_runtime
 
     # print in tabular format
     print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
     for b in cudagraph_sizes:
-        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
-              f"\t{piecewise_cudagraph_time[b]:.3f}")
+        print(
+            f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+            f"\t{piecewise_cudagraph_time[b]:.3f}"
+        )
 
 
 if __name__ == "__main__":
-    benchmark()
+    # Protect against subprocess reimport when using spawn_new_process_for_each_test
+    import os
+
+    if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
+        benchmark()
diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
index baedafbae99f1c5691d25a4d014288267d19594c..29c02f6e6a1d31995c49f5d04572b308e5c116db 100644
--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
@@ -8,7 +8,7 @@ Centralizes custom operation definitions to avoid duplicate registrations.
 import torch
 from torch.library import Library
 
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 
 # Shared library for all compilation test operations
 # Using "silly" namespace to match existing test expectations
@@ -31,8 +31,9 @@ def reset_global_counter():
     _global_counter = 0
 
 
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
+def silly_attention(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
+) -> None:
     """
     Unified attention implementation that depends on
     all inputs and affects the output.
@@ -47,8 +48,9 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
     out.copy_(q + k + v)
 
 
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
+def silly_attention_fake(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
+) -> None:
     """Fake implementation for testing"""
     return
 
@@ -60,5 +62,4 @@ direct_register_custom_op(
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
     target_lib=silly_lib,
-    tags=(torch._C.Tag.cudagraph_unsafe, ),
 )
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..c65e5a25934d20ffae3db0ac110119d1d0195e67
--- /dev/null
+++ b/tests/compile/test_aot_compile.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import tempfile
+from contextlib import contextmanager
+
+import pytest
+import torch
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
+from vllm.forward_context import set_forward_context
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+
+def reference_fn(x: torch.Tensor):
+    assert x.shape[0] <= 42
+    assert x.shape[0] % 2 == 0
+    for _ in range(3000):
+        x = x + x.shape[0]
+    return x
+
+
+@support_torch_compile
+class CompiledMod(torch.nn.Module):
+    def __init__(self, **kwargs):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor):
+        return reference_fn(x)
+
+
+def make_vllm_config() -> VllmConfig:
+    return VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+        )
+    )
+
+
+@contextmanager
+def use_vllm_config(vllm_config: VllmConfig):
+    with set_forward_context({}, vllm_config), set_current_vllm_config(vllm_config):
+        yield
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        vllm_config = make_vllm_config()
+        args = (torch.randn(10, 10),)
+        expected = reference_fn(*args)
+        with use_vllm_config(vllm_config):
+            m.setenv("VLLM_USE_AOT_COMPILE", "0")
+            with (
+                pytest.raises(RuntimeError, match="Detected recompile"),
+                torch.compiler.set_stance("fail_on_recompile"),
+            ):
+                CompiledMod(vllm_config=vllm_config)(*args)
+
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            torch._dynamo.reset()
+            with torch.compiler.set_stance("fail_on_recompile"):
+                actual = CompiledMod(vllm_config=vllm_config)(*args)
+            assert torch.allclose(actual, expected)
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
+    with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
+        args = (torch.randn(10, 10),)
+        m.setenv("VLLM_USE_AOT_COMPILE", "1")
+        m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+        m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+        vllm_config = make_vllm_config()
+        with use_vllm_config(vllm_config), pytest.raises(FileNotFoundError):
+            CompiledMod(vllm_config=vllm_config)(*args)
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10),)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                expected = CompiledMod(vllm_config=vllm_config)(*args)
+
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                ret = CompiledMod(vllm_config=vllm_config)(*args)
+            assert torch.allclose(ret, expected)
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_shape_env(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that the shape environment is correctly serialized and preserved
+    when loading from cache.
+    """
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10),)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                compiled_mod = CompiledMod(vllm_config=vllm_config)
+                compiled_mod(*args)
+                artifacts = compiled_mod.aot_compiled_fn._artifacts
+                guards_string = artifacts.compiled_fn.shape_env.format_guards()
+                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
+
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                compiled_mod = CompiledMod(vllm_config=vllm_config)
+                compiled_mod(*args)
+                artifacts = compiled_mod.aot_compiled_fn._artifacts
+                guards_string = artifacts.compiled_fn.shape_env.format_guards()
+                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
index 1dc21365d55770bcb99453dff6fdecd62b2aef0c..71ee2287814381090908ac1cb71b12a88246aacc 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -8,18 +8,31 @@ import torch
 
 import vllm.envs as envs
 from vllm.compilation.collective_fusion import AsyncTPPass
-from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
-                         PassConfig, VllmConfig)
-from vllm.distributed import (tensor_model_parallel_all_gather,
-                              tensor_model_parallel_reduce_scatter)
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    DeviceConfig,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+)
+from vllm.distributed import (
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_reduce_scatter,
+)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 from ..models.registry import HF_EXAMPLE_MODELS
-from ..utils import (compare_two_settings, create_new_process_for_each_test,
-                     multi_gpu_test)
+from ..utils import (
+    compare_two_settings,
+    create_new_process_for_each_test,
+    multi_gpu_test,
+)
 from .backend import TestBackend
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -33,21 +46,20 @@ prompts = [
 
 
 class TestMMRSModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, dtype=torch.float16):
         super().__init__()
         self.hidden_size = hidden_size
         self.dtype = dtype
-        self.gate_proj = torch.nn.Parameter(torch.empty(
-            (self.hidden_size * 2, hidden_size)),
-                                            requires_grad=False)
+        self.gate_proj = torch.nn.Parameter(
+            torch.empty((self.hidden_size * 2, hidden_size)), requires_grad=False
+        )
         # Initialize weights
         torch.nn.init.normal_(self.gate_proj, std=0.02)
 
     def forward(self, hidden_states):
         """
         Forward pass implementing the mm + reduce scatter in the FX graph
-    
+
         """
         # Reshape input
         view = hidden_states.reshape(-1, self.hidden_size)
@@ -66,14 +78,13 @@ class TestMMRSModel(torch.nn.Module):
 
 
 class TestAGMMModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, dtype=torch.float16):
         super().__init__()
         self.hidden_size = hidden_size
         self.dtype = dtype
-        self.weight = torch.nn.Parameter(torch.empty(
-            (hidden_size, hidden_size)),
-                                         requires_grad=False)
+        self.weight = torch.nn.Parameter(
+            torch.empty((hidden_size, hidden_size)), requires_grad=False
+        )
         # Initialize weights
         torch.nn.init.normal_(self.weight, std=0.02)
 
@@ -96,32 +107,35 @@ class TestAGMMModel(torch.nn.Module):
 
 
 class _BaseScaledMMModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, dtype=torch.float16):
         super().__init__()
         self.hidden_size = hidden_size
         self.dtype = dtype
-        self.weight = torch.empty([hidden_size, hidden_size], dtype=FP8_DTYPE)\
-            .contiguous().transpose(0, 1)
+        self.weight = (
+            torch.empty([hidden_size, hidden_size], dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
 
         # Initialize scale_b for _scaled_mm.
         self.scale_b = torch.ones(1, self.hidden_size, dtype=torch.float32)
 
 
 class TestScaledMMRSModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
         Forward pass implementing the scaled_mm + reduce scatter in the FX graph
-    
+
         """
         fp8_input = input.to(FP8_DTYPE)
         scale_a = torch.ones(input.shape[0], 1, dtype=torch.float32)
-        scaled_mm = torch._scaled_mm(fp8_input,
-                                     self.weight,
-                                     scale_a=scale_a,
-                                     scale_b=self.scale_b,
-                                     out_dtype=self.dtype)
+        scaled_mm = torch._scaled_mm(
+            fp8_input,
+            self.weight,
+            scale_a=scale_a,
+            scale_b=self.scale_b,
+            out_dtype=self.dtype,
+        )
         reduce_scatter = tensor_model_parallel_reduce_scatter(scaled_mm, dim=0)
         return reduce_scatter
 
@@ -129,11 +143,10 @@ class TestScaledMMRSModel(_BaseScaledMMModel):
         return [torch.ops.vllm.reduce_scatter.default]
 
     def ops_in_model_after(self):
-        return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+        return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]
 
 
 class TestAGScaledMMModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
         Forward pass implementing the all gather + scaled_mm in the FX graph
@@ -143,11 +156,13 @@ class TestAGScaledMMModel(_BaseScaledMMModel):
         all_gather = tensor_model_parallel_all_gather(fp8_input, dim=0)
 
         scale_a = torch.ones(all_gather.shape[0], 1, dtype=torch.float32)
-        scaled_mm = torch._scaled_mm(all_gather,
-                                     self.weight,
-                                     scale_a=scale_a,
-                                     scale_b=self.scale_b,
-                                     out_dtype=self.dtype)
+        scaled_mm = torch._scaled_mm(
+            all_gather,
+            self.weight,
+            scale_a=scale_a,
+            scale_b=self.scale_b,
+            out_dtype=self.dtype,
+        )
         return scaled_mm
 
     def ops_in_model_before(self):
@@ -158,20 +173,22 @@ class TestAGScaledMMModel(_BaseScaledMMModel):
 
 
 class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
         Forward pass implementing the cutlass_scaled_mm + reduce scatter
         in the FX graph
-    
+
         """
         fp8_input = input.to(FP8_DTYPE)
         scale_a = torch.ones(input.shape[0], 1, dtype=torch.float32)
-        mm_out = torch.empty((fp8_input.shape[0], self.weight.shape[1]),
-                             dtype=self.dtype,
-                             device=input.device)
-        torch.ops._C.cutlass_scaled_mm(mm_out, fp8_input, self.weight, scale_a,
-                                       self.scale_b, None)
+        mm_out = torch.empty(
+            (fp8_input.shape[0], self.weight.shape[1]),
+            dtype=self.dtype,
+            device=input.device,
+        )
+        torch.ops._C.cutlass_scaled_mm(
+            mm_out, fp8_input, self.weight, scale_a, self.scale_b, None
+        )
         reduce_scatter = tensor_model_parallel_reduce_scatter(mm_out, dim=0)
         return reduce_scatter
 
@@ -179,14 +196,13 @@ class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
         return [torch.ops.vllm.reduce_scatter.default]
 
     def ops_in_model_after(self):
-        return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+        return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]
 
 
 class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
-        Forward pass implementing the all gather + cutlass_scaled_mm 
+        Forward pass implementing the all gather + cutlass_scaled_mm
         in the FX graph
         """
         # Reshape input
@@ -195,11 +211,14 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 
         scale_a = torch.ones(all_gather.shape[0], 1, dtype=torch.float32)
 
-        mm_out = torch.empty((all_gather.shape[0], self.weight.shape[1]),
-                             dtype=self.dtype,
-                             device=all_gather.device)
-        torch.ops._C.cutlass_scaled_mm(mm_out, all_gather, self.weight,
-                                       scale_a, self.scale_b, None)
+        mm_out = torch.empty(
+            (all_gather.shape[0], self.weight.shape[1]),
+            dtype=self.dtype,
+            device=all_gather.device,
+        )
+        torch.ops._C.cutlass_scaled_mm(
+            mm_out, all_gather, self.weight, scale_a, self.scale_b, None
+        )
         return mm_out
 
     def ops_in_model_before(self):
@@ -210,23 +229,43 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 
 
 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("test_model", [
-    TestMMRSModel, TestAGMMModel, TestScaledMMRSModel, TestAGScaledMMModel,
-    TestCutlassScaledMMRSModel, TestAGCutlassScaledMMModel
-])
+@pytest.mark.parametrize(
+    "test_model",
+    [
+        TestMMRSModel,
+        TestAGMMModel,
+        TestScaledMMRSModel,
+        TestAGScaledMMModel,
+        TestCutlassScaledMMRSModel,
+        TestAGCutlassScaledMMModel,
+    ],
+)
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
-def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int,
-                               hidden_size: int, dtype: torch.dtype):
-    if test_model in (TestScaledMMRSModel, TestAGScaledMMModel,
-                      TestCutlassScaledMMRSModel,
-                      TestAGCutlassScaledMMModel) and dtype == torch.float16:
+@pytest.mark.parametrize("dynamic", [True, False])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
+def test_async_tp_pass_replace(
+    test_model: str,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    dynamic: bool,
+):
+    if (
+        test_model
+        in (
+            TestScaledMMRSModel,
+            TestAGScaledMMModel,
+            TestCutlassScaledMMRSModel,
+            TestAGCutlassScaledMMModel,
+        )
+        and dtype == torch.float16
+    ):
         pytest.skip(
-            "Only bf16 high precision output types are supported for " \
+            "Only bf16 high precision output types are supported for "
             "per-token (row-wise) scaling"
         )
 
@@ -235,19 +274,33 @@ def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int,
     def run_torch_spawn(fn, nprocs):
         # need to use torch.mp.spawn otherwise will have problems with
         # torch.distributed and cuda
-        torch.multiprocessing.spawn(fn,
-                                    args=(num_processes, test_model,
-                                          batch_size, seq_len, hidden_size,
-                                          dtype),
-                                    nprocs=nprocs)
+        torch.multiprocessing.spawn(
+            fn,
+            args=(
+                num_processes,
+                test_model,
+                batch_size,
+                seq_len,
+                hidden_size,
+                dtype,
+                dynamic,
+            ),
+            nprocs=nprocs,
+        )
 
     run_torch_spawn(async_tp_pass_on_test_model, num_processes)
 
 
-def async_tp_pass_on_test_model(local_rank: int, world_size: int,
-                                test_model_cls: torch.nn.Module,
-                                batch_size: int, seq_len: int,
-                                hidden_size: int, dtype: torch.dtype):
+def async_tp_pass_on_test_model(
+    local_rank: int,
+    world_size: int,
+    test_model_cls: torch.nn.Module,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    dynamic: bool,
+):
     current_platform.seed_everything(0)
 
     device = torch.device(f"cuda:{local_rank}")
@@ -255,13 +308,15 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
     torch.set_default_device(device)
     torch.set_default_dtype(dtype)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )
 
     # initialize distributed
     init_distributed_environment()
@@ -269,27 +324,40 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
 
     # configure vllm config for SequenceParallelismPass
     vllm_config = VllmConfig()
-    vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig(
-        enable_async_tp=True, ), )
+    vllm_config.compilation_config = CompilationConfig(
+        pass_config=PassConfig(
+            enable_async_tp=True,
+        ),
+    )
     vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
 
     # this is a fake model name to construct the model config
     # in the vllm_config, it's not really used.
-    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
-    vllm_config.model_config = ModelConfig(model=model_name,
-                                           trust_remote_code=True,
-                                           dtype=dtype,
-                                           seed=42)
+    model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+    vllm_config.model_config = ModelConfig(
+        model=model_name, trust_remote_code=True, dtype=dtype, seed=42
+    )
 
     async_tp_pass = AsyncTPPass(vllm_config)
     backend = TestBackend(async_tp_pass)
 
-    model = test_model_cls(hidden_size,
-                           dtype)  # Pass dtype to model constructor
+    assert (
+        async_tp_pass.compilation_config.splitting_ops
+        == vllm_config.compilation_config.splitting_ops
+    )
+    assert (
+        async_tp_pass.compilation_config.use_inductor_graph_partition
+        == vllm_config.compilation_config.use_inductor_graph_partition
+    )
+
+    model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
 
-    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
-                                dtype=dtype,
-                                requires_grad=False)
+    hidden_states = torch.randn(
+        (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
+    )
+
+    if dynamic:
+        torch._dynamo.mark_dynamic(hidden_states, 0)
 
     compiled_model = torch.compile(model, backend=backend)
     compiled_model(hidden_states)
@@ -306,10 +374,10 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
 
 
 @create_new_process_for_each_test()
-@pytest.mark.parametrize("model_id", [
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
-])
+@pytest.mark.parametrize(
+    "model_id",
+    ["meta-llama/Llama-3.2-1B-Instruct", "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"],
+)
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("async_tp_enabled", [True])
 @pytest.mark.parametrize("distributed_backend", ["mp"])
@@ -342,16 +410,10 @@ def test_async_tp_pass_correctness(
         common_args.append("--enforce-eager")
 
     compilation_config = {
-        'level': 3,
-        'compile_sizes': [2, 4, 8],
-        'splitting_ops': [],
-        'pass_config': {
-            'enable_async_tp': async_tp_enabled
-        },
-    }
-
-    async_tp_env = tp_env = {
-        "VLLM_USE_V1": "1",
+        "mode": CompilationMode.VLLM_COMPILE,
+        "compile_sizes": [2, 4, 8],
+        "splitting_ops": [],
+        "pass_config": {"enable_async_tp": async_tp_enabled},
     }
 
     async_tp_args = [
@@ -372,9 +434,4 @@ def test_async_tp_pass_correctness(
         "mp",
     ]
 
-    compare_two_settings(model_id,
-                         async_tp_args,
-                         tp_args,
-                         async_tp_env,
-                         tp_env,
-                         method="generate")
+    compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index a1e5127ebeeb218f75bc7d6da06706b5399eeaf1..3f6898607f6b981115635c7a42db84db058f32fe 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import dataclasses
 
 import pytest
 
-from vllm.config import CompilationLevel
-from vllm.utils import cuda_device_count_stateless
+from vllm.config import CompilationMode
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 from ..utils import compare_all_settings
 
@@ -23,7 +21,7 @@ class TestSetting:
 
 
 # we cannot afford testing the full Cartesian product
-# of all models and all levels
+# of all models and all modes
 @pytest.mark.parametrize(
     "test_setting",
     [
@@ -79,14 +77,15 @@ class TestSetting:
             method="encode",
         ),
         # vision language model
-        TestSetting(
-            model="microsoft/Phi-3.5-vision-instruct",
-            model_args=["--trust-remote-code", "--max-model-len", "2048"],
-            pp_size=2,
-            tp_size=1,
-            attn_backend="FLASH_ATTN",
-            method="generate_with_image",
-        ),
+        # See https://github.com/vllm-project/vllm/issues/26716.
+        # TestSetting(
+        #     model="microsoft/Phi-3.5-vision-instruct",
+        #     model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        #     pp_size=2,
+        #     tp_size=1,
+        #     attn_backend="FLASH_ATTN",
+        #     method="generate_with_image",
+        # ),
     ],
 )
 def test_compile_correctness(
@@ -103,43 +102,54 @@ def test_compile_correctness(
     attn_backend = test_setting.attn_backend
     method = test_setting.method
     if cuda_device_count_stateless() < pp_size * tp_size:
-        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
-                    f"{cuda_device_count_stateless()}")
+        pytest.skip(
+            f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
+            f"{cuda_device_count_stateless()}"
+        )
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
         final_args = [
-            "--enforce-eager", *model_args, "-pp",
-            str(pp_size), "-tp",
-            str(tp_size)
+            *model_args,
+            "-pp",
+            str(pp_size),
+            "-tp",
+            str(tp_size),
+            "-O.cudagraph_mode=none",
         ]
 
         all_args: list[list[str]] = []
         all_envs: list[dict[str, str] | None] = []
 
-        for level in [
-                CompilationLevel.NO_COMPILATION,
-                CompilationLevel.PIECEWISE,
+        for comp_mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+            CompilationMode.VLLM_COMPILE,
         ]:
-            all_args.append(final_args + [f"-O{level}"])
-            all_envs.append({})
+            for mode in [CompilationMode.NONE, comp_mode]:
+                all_args.append(
+                    final_args + [f"-O.mode={mode.name}", "-O.backend=inductor"]
+                )
 
-        # inductor will change the output, so we only compare if the output
-        # is close, not exactly the same.
-        compare_all_settings(
-            model,
-            all_args,
-            all_envs,
-            method=method if method != "generate" else "generate_close")
-        all_envs.clear()
-        all_args.clear()
+            # inductor will change the output, so we only compare if the output
+            # is close, not exactly the same.
+            compare_all_settings(
+                model,
+                all_args,
+                all_envs,
+                method=method if method != "generate" else "generate_close",
+            )
+            all_envs.clear()
+            all_args.clear()
 
-        for level in [
-                CompilationLevel.NO_COMPILATION,
-                CompilationLevel.DYNAMO_AS_IS,
-                CompilationLevel.DYNAMO_ONCE,
+        for mode in [
+            CompilationMode.NONE,
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+            CompilationMode.VLLM_COMPILE,
         ]:
-            all_args.append(final_args + [f"-O{level}"])
+            all_args.append(final_args + [f"-O.mode={mode.name}", "-O.backend=eager"])
+            all_envs.append({})
             all_envs.append({})
 
         compare_all_settings(model, all_args * 3, all_envs, method=method)
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 7afd6251bbbd546a3655cd80ff6d7886e02522f3..1e8a882a7f3eb97a61c9c3c558da547f5553b7ca 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -1,29 +1,45 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+from contextlib import nullcontext
+from unittest.mock import patch
+
 import pytest
+from pydantic import ValidationError
 
-import vllm
 from vllm.compilation.counter import compilation_counter
-from vllm.config import CompilationConfig, VllmConfig
-from vllm.utils import _is_torch_equal_or_newer
+from vllm.compilation.fix_functionalization import FixFunctionalizationPass
+from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
+from vllm.config.compilation import CompilationMode
+from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import _is_torch_equal_or_newer
 
+# This import automatically registers `torch.ops.silly.attention`
+from . import silly_attention  # noqa: F401
 
-def test_version():
-    assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev')
-    assert _is_torch_equal_or_newer('2.8.0a0+gitc82a174', '2.8.0.dev')
-    assert _is_torch_equal_or_newer('2.8.0', '2.8.0.dev')
-    assert _is_torch_equal_or_newer('2.8.1', '2.8.0.dev')
-    assert not _is_torch_equal_or_newer('2.7.1', '2.8.0.dev')
 
+def test_version():
+    # Test the version comparison logic using the private function
+    assert _is_torch_equal_or_newer("2.8.0.dev20250624+cu128", "2.8.0.dev")
+    assert _is_torch_equal_or_newer("2.8.0a0+gitc82a174", "2.8.0.dev")
+    assert _is_torch_equal_or_newer("2.8.0", "2.8.0.dev")
+    assert _is_torch_equal_or_newer("2.8.1", "2.8.0.dev")
+    assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
 
-def test_use_cudagraphs_dynamic(monkeypatch):
-    assert vllm.envs.VLLM_USE_V1
-    vllm_config = VllmConfig()
-    assert vllm_config.compilation_config.use_cudagraph
 
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+def test_copy_pass():
     vllm_config = VllmConfig()
-    assert not vllm_config.compilation_config.use_cudagraph
+    inductor_pass = FixFunctionalizationPass(vllm_config)
+    copied_inductor_pass = copy.deepcopy(inductor_pass)
+    assert (
+        copied_inductor_pass.compilation_config.use_inductor_graph_partition
+        == vllm_config.compilation_config.use_inductor_graph_partition
+    )
+    assert (
+        copied_inductor_pass.compilation_config.splitting_ops
+        == vllm_config.compilation_config.splitting_ops
+    )
 
 
 def test_custom_op():
@@ -41,63 +57,80 @@ def test_custom_op():
 # may be influenced by other tests.
 @pytest.mark.parametrize("val", ["1"])
 def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
-    assert vllm.envs.VLLM_USE_V1
-
     # Disable multiprocessing so that the counter is in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
-    monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
 
     compilation_config = {
-        "use_cudagraph": False,  # speed things up a bit
+        "cudagraph_mode": CUDAGraphMode.NONE,  # speed things up a bit
     }
     with (
-            compilation_counter.expect(num_cache_entries_updated=0,
-                                       num_compiled_artifacts_saved=0),
-            # loading the model causes compilation (if enabled) to happen
-            vllm_runner('facebook/opt-125m',
-                        compilation_config=compilation_config,
-                        gpu_memory_utilization=0.4) as _):
+        compilation_counter.expect(
+            num_cache_entries_updated=0, num_compiled_artifacts_saved=0
+        ),
+        # loading the model causes compilation (if enabled) to happen
+        vllm_runner(
+            "facebook/opt-125m",
+            compilation_config=compilation_config,
+            gpu_memory_utilization=0.4,
+        ) as _,
+    ):
         pass
 
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-@pytest.mark.parametrize("enabled", [True, False])
-def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
-    assert vllm.envs.VLLM_USE_V1
-
+@pytest.mark.parametrize(
+    "cudagraph_mode,num_cudagraph_captured",
+    [
+        (CUDAGraphMode.NONE, 0),
+        (CUDAGraphMode.FULL_DECODE_ONLY, 1),
+        (CUDAGraphMode.PIECEWISE, 13),
+        (CUDAGraphMode.FULL_AND_PIECEWISE, 14),
+    ],
+)
+def test_use_cudagraphs(
+    vllm_runner, monkeypatch, cudagraph_mode, num_cudagraph_captured
+):
     # Disable multiprocessing so that the counter is in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     compilation_config = {
         "cudagraph_capture_sizes": [100],
-        "use_cudagraph": enabled,
+        "cudagraph_mode": cudagraph_mode,
     }
+    num_gpu_runner_capture_triggers = 1 if cudagraph_mode != CUDAGraphMode.NONE else 0
     with (
-            compilation_counter.expect(
-                num_graphs_seen=1,
-                num_gpu_runner_capture_triggers=1 if enabled else 0,
-                num_cudagraph_captured=13 if enabled else 0,
-            ),
-            # loading the model causes compilation (if enabled) to happen
-            vllm_runner('facebook/opt-125m',
-                        compilation_config=compilation_config,
-                        gpu_memory_utilization=0.4) as _):
+        compilation_counter.expect(
+            num_graphs_seen=1,
+            num_gpu_runner_capture_triggers=num_gpu_runner_capture_triggers,
+            num_cudagraph_captured=num_cudagraph_captured,
+        ),
+        # loading the model causes compilation (if enabled) to happen
+        vllm_runner(
+            "facebook/opt-125m",
+            compilation_config=compilation_config,
+            gpu_memory_utilization=0.4,
+        ) as _,
+    ):
         pass
 
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-def test_dynamo_as_is(vllm_runner, monkeypatch):
+def test_stock_torch_compile(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-            compilation_counter.expect(dynamo_as_is_count=1),
-            # loading the model causes compilation (if enabled) to happen
-            vllm_runner('facebook/opt-125m',
-                        compilation_config={"level": 1},
-                        gpu_memory_utilization=0.4) as _):
+        compilation_counter.expect(stock_torch_compile_count=1),
+        # loading the model causes compilation (if enabled) to happen
+        vllm_runner(
+            "facebook/opt-125m",
+            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
+            gpu_memory_utilization=0.4,
+        ) as _,
+    ):
         pass
 
 
@@ -105,15 +138,16 @@ def test_dynamo_as_is(vllm_runner, monkeypatch):
 @pytest.mark.forked
 def test_no_compilation(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
-
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     with (
-            compilation_counter.expect(num_graphs_seen=0,
-                                       dynamo_as_is_count=0),
-            # loading the model causes compilation (if enabled) to happen
-            vllm_runner('facebook/opt-125m',
-                        compilation_config={"level": 0},
-                        gpu_memory_utilization=0.4) as _):
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
+        # loading the model causes compilation (if enabled) to happen
+        vllm_runner(
+            "facebook/opt-125m",
+            compilation_config={"mode": CompilationMode.NONE},
+            gpu_memory_utilization=0.4,
+        ) as _,
+    ):
         pass
 
 
@@ -121,13 +155,223 @@ def test_no_compilation(vllm_runner, monkeypatch):
 @pytest.mark.forked
 def test_enforce_eager(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-            compilation_counter.expect(num_graphs_seen=0,
-                                       dynamo_as_is_count=0),
-            # loading the model causes compilation (if enabled) to happen
-            vllm_runner('facebook/opt-125m',
-                        enforce_eager=True,
-                        gpu_memory_utilization=0.4) as _):
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
+        # loading the model causes compilation (if enabled) to happen
+        vllm_runner(
+            "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
+        ) as _,
+    ):
         pass
+
+
+def test_splitting_ops_dynamic():
+    # Default config
+    config = VllmConfig()
+    # Default V1 config leaves cudagraph mode unset; splitting ops are only
+    # populated when the engine decides to use piecewise compilation.
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
+    assert not config.compilation_config.splitting_ops_contain_attention()
+
+    # When use_inductor_graph_partition=True
+    config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            splitting_ops=["vllm::unified_attention"],
+        )
+    )
+    # with inductor partition we use splitting_ops directly for
+    # partition rules
+    assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]
+
+    # When attn_fusion pass enabled.
+    config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            custom_ops=["+quant_fp8"],
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        )
+    )
+    assert config.compilation_config.splitting_ops == []
+    # cudagraph mode also fall back to FULL
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
+
+    # splitting_ops can not contain attention ops when attn_fusion
+    # pass enabled.
+    with pytest.raises(ValidationError):
+        config = VllmConfig(
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+                pass_config={"enable_attn_fusion": True, "enable_noop": True},
+                custom_ops=["+quant_fp8"],
+                cudagraph_mode=CUDAGraphMode.PIECEWISE,
+                # work around for accessing all attntion ops
+                splitting_ops=CompilationConfig()._attention_ops,
+            )
+        )
+
+    # When both use_inductor_graph_partition and attn_fusion pass enabled.
+    config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            custom_ops=["+quant_fp8"],
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        )
+    )
+    # With inductor graph partition, attn_fusion and splitting_ops
+    # work together. Default splitting_ops include attention ops.
+    assert config.compilation_config.splitting_ops_contain_attention()
+    # enable_attn_fusion is directly supported under
+    # use_inductor_graph_partition=True, and cudagraph_mode
+    # is unchanged.
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+
+
+def test_should_split():
+    import torch
+
+    from vllm.compilation.partition_rules import should_split
+
+    graph = torch.fx.Graph()
+    node = torch.fx.Node(
+        graph=graph,
+        name="dummy_node",
+        op="call_function",
+        target=torch.ops.aten.add.default,
+        args=(),
+        kwargs={},
+    )
+
+    # supports OpOverloadPacket
+    splitting_ops = ["aten::add"]
+    assert should_split(node, splitting_ops)
+
+    # supports OpOverload
+    splitting_ops = ["aten::add.default"]
+    assert should_split(node, splitting_ops)
+
+    # supports OpOverload
+    splitting_ops = ["aten::add.Tensor"]
+    assert not should_split(node, splitting_ops)
+
+    q, k, v, out = [torch.randn(1)] * 4
+
+    # supports custom ops as OpOverloadPacket
+    node = torch.fx.Node(
+        graph=graph,
+        name="dummy_node",
+        op="call_function",
+        target=torch.ops.silly.attention,
+        args=(q, k, v, out),
+        kwargs={},
+    )
+
+    splitting_ops = ["silly::attention"]
+    assert should_split(node, splitting_ops)
+
+    # supports custom ops as OpOverload
+    node = torch.fx.Node(
+        graph=graph,
+        name="dummy_node",
+        op="call_function",
+        target=torch.ops.silly.attention.default,
+        args=(q, k, v, out),
+        kwargs={},
+    )
+
+    splitting_ops = ["silly::attention"]
+    assert should_split(node, splitting_ops)
+
+    splitting_ops = ["silly::attention.default"]
+    assert should_split(node, splitting_ops)
+
+
+@pytest.mark.skipif(
+    not current_platform.support_static_graph_mode(),
+    reason="Skip if not cudagraph mode supported",
+)
+@pytest.mark.parametrize(
+    (
+        "cudagraph_capture_sizes",
+        "max_cudagraph_capture_size",
+        "tp_size",
+        "enable_sequence_parallelism",
+        "max_num_batched_tokens",
+        "cudagraph_mode",
+        "expected_max_size",
+    ),
+    [
+        (None, None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        ([1, 2, 4], 4, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
+        (
+            [1, 2, 4],
+            8,
+            1,
+            False,
+            2048,
+            CUDAGraphMode.FULL_AND_PIECEWISE,
+            ValidationError,
+        ),
+        ([1, 256], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        ([], None, 1, False, 2048, CUDAGraphMode.NONE, 0),
+        (None, 0, 1, False, 2048, CUDAGraphMode.NONE, 0),
+        # truncated to nearest multiple of 8 or 16
+        (None, 257, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        # max from list
+        ([1, 2, 4, 15], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 15),
+        # filtered out 15 due to SP
+        ([1, 2, 4, 15], None, 2, True, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
+        # limited by the max_tokens
+        ([1, 2, 4, 15], None, 1, False, 8, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
+        # the list should contain at least 1 element when use cudagraph
+        ([], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
+        # the max capturing size should be >= 1 when use cudagraph
+        (None, 0, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
+    ],
+)
+def test_cudagraph_sizes_post_init(
+    cudagraph_capture_sizes,
+    max_cudagraph_capture_size,
+    tp_size,
+    enable_sequence_parallelism,
+    max_num_batched_tokens,
+    cudagraph_mode,
+    expected_max_size,
+):
+    ctx = nullcontext()
+    if expected_max_size == ValidationError:
+        ctx = pytest.raises(expected_max_size)
+
+    with (
+        ctx,
+        patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
+    ):
+        compilation_config = CompilationConfig(
+            cudagraph_capture_sizes=cudagraph_capture_sizes,
+            max_cudagraph_capture_size=max_cudagraph_capture_size,
+            pass_config={
+                "enable_sequence_parallelism": enable_sequence_parallelism,
+                "enable_fusion": True,
+                "enable_noop": True,
+            },
+            cudagraph_mode=cudagraph_mode,
+        )
+        engine_args = EngineArgs(
+            model="facebook/opt-125m",
+            tensor_parallel_size=tp_size,
+            max_num_seqs=min(max_num_batched_tokens, 128),
+            max_num_batched_tokens=max_num_batched_tokens,
+            compilation_config=compilation_config,
+        )
+        vllm_config = engine_args.create_engine_config()
+
+        assert (
+            vllm_config.compilation_config.max_cudagraph_capture_size
+            == expected_max_size
+        )
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
index d73586d53ff3ed44f1f549725feea6f558973eb5..1850cc8f1479a086fdcbd6ed03520c29bee3b7fd 100644
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -1,14 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
 import torch
 from torch import nn
 
 from vllm.compilation.counter import compilation_counter
-from vllm.compilation.decorators import (ignore_torch_compile,
-                                         support_torch_compile)
-from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
-                         CUDAGraphMode, VllmConfig, set_current_vllm_config)
+from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    CompilationMode,
+    CUDAGraphMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 # This import automatically registers `torch.ops.silly.attention`
 from . import silly_attention  # noqa: F401
@@ -18,56 +25,86 @@ MLP_SIZE = 128
 
 
 @torch.inference_mode
-def run_model(vllm_config: VllmConfig, model: nn.Module,
-              cudagraph_runtime_mode: CUDAGraphMode):
+def run_model(
+    vllm_config: VllmConfig, model: nn.Module, cudagraph_runtime_mode: CUDAGraphMode
+):
     with set_forward_context({}, vllm_config=vllm_config):
         # warmup for the model with cudagraph_mode NONE
         model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
 
         # simulate cudagraphs capturing
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=2, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             model(torch.randn(2, MLP_SIZE).cuda())
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=1, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=1,
+            ),
+        ):
             model(torch.randn(1, MLP_SIZE).cuda())
 
         # simulate cudagraphs replay
-        with set_forward_context({},
-                                 vllm_config=vllm_config,
-                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-                                 batch_descriptor=BatchDescriptor(
-                                     num_tokens=2, )):
+        with set_forward_context(
+            {},
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=BatchDescriptor(
+                num_tokens=2,
+            ),
+        ):
             output = model(torch.randn(2, MLP_SIZE).cuda())
 
         output = output.cpu()
         return output.cpu()
 
 
-def test_ignore_torch_compile_decorator():
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch):
+    # disable compile cache so that we can count the number of compilations
+    # appropriately
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
     # piecewise
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=True,
-        splitting_ops=["silly.attention"],
-        cudagraph_capture_sizes=[1, 2],
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            splitting_ops=["silly::attention"],
+            cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
+        )
+    )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
+    expected_num_graphs_seen = 1
+    expected_num_cudagraph_captured = (
+        4  # num_cudagraph_sizes * num cudagraphs to capture
+    )
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 1
+        expected_num_piecewise_capturable_graphs_seen = 1
+        expected_num_backend_compilations = 1
+    else:
+        expected_num_piecewise_graphs_seen = 3
+        expected_num_piecewise_capturable_graphs_seen = 2
+        expected_num_backend_compilations = 2
+
     @support_torch_compile
     class A(nn.Module):
-
-        def __init__(self,
-                     *,
-                     vllm_config: VllmConfig,
-                     prefix: str = '',
-                     **kwargs) -> None:
+        def __init__(
+            self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs
+        ) -> None:
             super().__init__()
 
         def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -79,66 +116,58 @@ def test_ignore_torch_compile_decorator():
             return x
 
     @ignore_torch_compile
-    class B(A):
-        ...
+    class B(A): ...
 
     @support_torch_compile
-    class C(B):
-        ...
+    class C(B): ...
 
     with set_current_vllm_config(vllm_config):
-        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
+        mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
 
     # A has support_torch_compile
     with compilation_counter.expect(
-            num_graphs_seen=1,
-            num_piecewise_graphs_seen=3,
-            num_piecewise_capturable_graphs_seen=2,
-            num_backend_compilations=2,
-            num_cudagraph_captured=4,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
     with set_current_vllm_config(vllm_config):
-        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
+        mod_B = B(vllm_config=vllm_config, prefix="").eval().cuda()
 
     # B's ignore_torch_compile should override A's support_torch_compile
     with compilation_counter.expect(
-            num_graphs_seen=0,
-            num_piecewise_graphs_seen=0,
-            num_piecewise_capturable_graphs_seen=0,
-            num_backend_compilations=0,
-            num_cudagraph_captured=0,
+        num_graphs_seen=0,
+        num_piecewise_graphs_seen=0,
+        num_piecewise_capturable_graphs_seen=0,
+        num_backend_compilations=0,
+        num_cudagraph_captured=0,
     ):
         run_model(vllm_config, mod_B, cudagraph_runtime_mode)
 
     with set_current_vllm_config(vllm_config):
-        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
+        mod_C = C(vllm_config=vllm_config, prefix="").eval().cuda()
 
     # C's support_torch_compile should override B's ignore_torch_compile
     with compilation_counter.expect(
-            num_graphs_seen=1,
-            num_piecewise_graphs_seen=3,
-            num_piecewise_capturable_graphs_seen=2,
-            num_backend_compilations=2,
-            num_cudagraph_captured=4,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_C, cudagraph_runtime_mode)
 
 
 # Only enable torch.compile if
 # vllm_config.cache_config.kv_sharing_fast_prefill=True
-@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
-                       kv_sharing_fast_prefill)
+@support_torch_compile(
+    enable_if=lambda vllm_config: vllm_config.cache_config.kv_sharing_fast_prefill
+)
 class B(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
         super().__init__()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -152,15 +181,11 @@ class B(nn.Module):
 
 # Only enable torch.compile if
 # vllm_config.cache_config.kv_sharing_fast_prefill=False
-@support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
-                       cache_config.kv_sharing_fast_prefill)
+@support_torch_compile(
+    enable_if=lambda vllm_config: not vllm_config.cache_config.kv_sharing_fast_prefill
+)
 class A(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
         super().__init__()
         self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs)
         self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs)
@@ -174,55 +199,88 @@ class A(nn.Module):
         return x
 
 
-def test_conditional_compile_enable_if():
-    vllm_config = VllmConfig(cache_config=CacheConfig(
-        kv_sharing_fast_prefill=True, ),
-                             compilation_config=CompilationConfig(
-                                 level=CompilationLevel.PIECEWISE,
-                                 use_cudagraph=True,
-                                 splitting_ops=["silly.attention"],
-                                 cudagraph_capture_sizes=[1, 2],
-                             ))
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch):
+    # disable compile cache so that we can count the number of compilations
+    # appropriately
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_sharing_fast_prefill=True,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            splitting_ops=["silly::attention"],
+            cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
+        ),
+    )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
     with set_current_vllm_config(vllm_config):
-        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
+        mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
+
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 2
+        expected_num_piecewise_capturable_graphs_seen = 2
+        expected_num_backend_compilations = 2
+    else:
+        expected_num_piecewise_graphs_seen = 6
+        expected_num_piecewise_capturable_graphs_seen = 4
+        expected_num_backend_compilations = 4
 
     # A has support_torch_compile but enable_if fn returns False
     # enalbe_if will be True for B, so we expect mod1 and mod2
     # to be compiled
     with compilation_counter.expect(
-            num_graphs_seen=2,
-            num_piecewise_graphs_seen=6,
-            # 3 piecewise graphs per instance of B()
-            num_piecewise_capturable_graphs_seen=4,
-            num_backend_compilations=4,
-            num_cudagraph_captured=8,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=2,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        # 3 piecewise graphs per instance of B()
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=8,
+        # num_cudagraph_sizes * num cudagraphable graphs to capture
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
     # Set kv_sharing_fast_prefill=False
     # which will cause A to be compiled and B to not be compiled
-    vllm_config = VllmConfig(cache_config=CacheConfig(
-        kv_sharing_fast_prefill=False, ),
-                             compilation_config=CompilationConfig(
-                                 level=CompilationLevel.PIECEWISE,
-                                 use_cudagraph=True,
-                                 splitting_ops=["silly.attention"],
-                                 cudagraph_capture_sizes=[1, 2],
-                             ))
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_sharing_fast_prefill=False,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            splitting_ops=["silly::attention"],
+            cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
+        ),
+    )
 
     with set_current_vllm_config(vllm_config):
-        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
+        mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
+
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 1
+        expected_num_piecewise_capturable_graphs_seen = 1
+        expected_num_backend_compilations = 1
+    else:
+        # 3 attn ops and 4 non-attn ops
+        expected_num_piecewise_graphs_seen = 7
+        expected_num_piecewise_capturable_graphs_seen = 4
+        expected_num_backend_compilations = 4
 
     with compilation_counter.expect(
-            num_graphs_seen=1,
-            num_piecewise_graphs_seen=7,
-            # 3 attn ops and 4 non-attn ops
-            num_piecewise_capturable_graphs_seen=4,
-            num_backend_compilations=4,
-            num_cudagraph_captured=8,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=1,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        # 3 attn ops and 4 non-attn ops
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=8,
+        # num_cudagraph_sizes * num cudagraphable graphs to capture
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 870aa553ca6284a28e4f02003f84eaa0e7958b40..b4e5e56ac9fe641420d372f30845f353f2f47fe4 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -1,67 +1,75 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
-import logging
 import tempfile
-from typing import Any, Optional, Union
+from pathlib import Path
+from typing import Any
 
 import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
-from tests.v1.attention.utils import _Backend
 from vllm import LLM, SamplingParams
-from vllm.attention.selector import global_force_attn_backend_context_manager
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         PassConfig)
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
-from vllm.utils import is_torch_equal_or_newer
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 from ..utils import create_new_process_for_each_test
 
 
-def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
+def models_list(*, all: bool = True, keywords: list[str] | None = None):
     TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
         ("facebook/opt-125m", {}),
-        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-            "dtype": torch.float16,
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-            "dtype": torch.float16,
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
+        (
+            "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
+            {"dtype": torch.float16},
+        ),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]
 
     if all:
+        TEST_MODELS.extend(
+            [
+                ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
+                (
+                    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+                    {"dtype": torch.float16},
+                ),
+            ]
+        )
 
         # TODO: figure out why this fails.
         if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-                "quantization": "gguf"
-            }))
+            TEST_MODELS.append(
+                ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"})
+            )
 
         if is_quant_method_supported("gptq"):
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-                "quantization": "gptq"
-            }))
+            TEST_MODELS.append(
+                ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"})
+            )
 
         if is_quant_method_supported("gptq_marlin"):
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-                "quantization": "gptq_marlin"
-            }))
+            TEST_MODELS.append(
+                (
+                    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+                    {"quantization": "gptq_marlin"},
+                )
+            )
 
         if is_quant_method_supported("gptq_marlin_24"):
-            TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-                "quantization": "gptq_marlin_24"
-            }))
+            TEST_MODELS.append(
+                (
+                    "alexm-nm/tinyllama-24-marlin24-4bit-g128",
+                    {"quantization": "gptq_marlin_24"},
+                )
+            )
 
         if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-                "quantization": "AWQ"
-            }))
+            TEST_MODELS.append(
+                ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"})
+            )
 
     if keywords is None:
         return TEST_MODELS
@@ -72,110 +80,145 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
 
 
 @pytest.mark.parametrize(
-    "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+    "compilation_mode",
+    [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
 )
-@pytest.mark.parametrize("model_info", models_list(all=True))
+@pytest.mark.parametrize("model, model_kwargs", models_list(all=True))
 @create_new_process_for_each_test()
 def test_full_graph(
     monkeypatch: pytest.MonkeyPatch,
-    model_info: tuple[str, dict[str, Any]],
-    optimization_level: int,
+    model: str,
+    model_kwargs: dict[str, Any],
+    compilation_mode: int,
 ):
-    model, model_kwargs = model_info
+    if (
+        "w8a8" in model
+        or "w8w8" in model
+        and current_platform.has_device_capability((10, 0))
+    ):
+        # int8 removed on Blackwell:
+        pytest.skip("int8 support removed on Blackwell")
 
     with monkeypatch.context():
         print(f"MODEL={model}")
 
-        run_model(optimization_level, model, model_kwargs)
+        run_model(compilation_mode, model, **model_kwargs)
 
 
 # TODO(luka) add other supported compilation config scenarios here
 @pytest.mark.parametrize(
-    "compilation_config, model_info",
+    "compilation_config, model, model_kwargs",
     [
         # additional compile sizes, only some of the models
-        (CompilationConfig(level=CompilationLevel.PIECEWISE,
-                           compile_sizes=[1, 2]), model)
-        for model in models_list(all=False)
-    ] + [
+        (
+            CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
+            *model_info,
+        )
+        for model_info in models_list(all=False)
+    ]
+    + [
         # RMSNorm + quant fusion, only 8-bit quant models
-        (CompilationConfig(level=CompilationLevel.PIECEWISE,
-                           custom_ops=["+rms_norm"],
-                           pass_config=PassConfig(enable_fusion=True,
-                                                  enable_noop=True)), model)
-        for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
-    ] + [
+        (
+            CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+                custom_ops=["+rms_norm"],
+                pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+            ),
+            *model_info,
+        )
+        for model_info in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
+    ]
+    + [
         # Test depyf integration works
-        (CompilationConfig(level=CompilationLevel.PIECEWISE,
-                           debug_dump_path=tempfile.gettempdir()),
-         ("facebook/opt-125m", {})),
-    ] + [
+        (
+            CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+                debug_dump_path=Path(tempfile.gettempdir()),
+            ),
+            "facebook/opt-125m",
+            {},
+        ),
+    ]
+    + [
         # graph inductor partition
         (
             CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 # inductor graph partition uses
                 # torch._C.Tag.cudagraph_unsafe to specify splitting ops
                 use_inductor_graph_partition=True,
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
-                compile_sizes=[1, 2]),
-            model) for model in models_list(all=False)
+                compile_sizes=[1, 2],
+            ),
+            *model_info,
+        )
+        for model_info in models_list(all=False)
         if is_torch_equal_or_newer("2.9.0.dev")
-    ])
+    ],
+)
 # only test some of the models
 @create_new_process_for_each_test()
 def test_custom_compile_config(
     compilation_config: CompilationConfig,
-    model_info: tuple[str, dict[str, Any]],
+    model: str,
+    model_kwargs: dict[str, Any],
 ):
-    if (compilation_config.use_inductor_graph_partition
-            and not is_torch_equal_or_newer("2.9.0.dev")):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
+    if (
+        "w8a8" in model
+        or "w8w8" in model
+        and current_platform.has_device_capability((10, 0))
+    ):
+        # int8 removed on Blackwell:
+        pytest.skip("int8 support removed on Blackwell")
+
+    if compilation_config.use_inductor_graph_partition and not is_torch_equal_or_newer(
+        "2.9.0.dev"
+    ):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
 
-    model, model_kwargs = model_info
     print(f"MODEL={model}")
-    run_model(compilation_config, model, model_kwargs)
+    run_model(compilation_config, model, **model_kwargs)
 
 
-def test_inductor_graph_partition_attn_fusion(caplog_vllm):
-    if not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
+@pytest.mark.parametrize(
+    "compilation_mode",
+    [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
+)
+@pytest.mark.parametrize(
+    "model, backend",
+    [
+        ("Qwen/Qwen2-0.5B", None),  # Standard attention model
+        (
+            "deepseek-ai/DeepSeek-V2-Lite",
+            AttentionBackendEnum.FLASHINFER_MLA,
+        ),  # MLA (Multi-head Latent Attention) model
+    ],
+)
+def test_fp8_kv_scale_compile(
+    monkeypatch: pytest.MonkeyPatch,
+    compilation_mode: int,
+    model: str,
+    backend: AttentionBackendEnum | None,
+):
+    if backend:
+        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
 
-    model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
-    compilation_config = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_inductor_graph_partition=True,
-        cudagraph_mode=CUDAGraphMode.PIECEWISE,
-        custom_ops=["+quant_fp8"],
-        pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
-    )
     model_kwargs = {
-        "kv_cache_dtype": "fp8",
-        "max_model_len": 1024,
+        "quantization": "fp8",
+        "kv_cache_dtype": "fp8_e4m3",
+        "calculate_kv_scales": True,
+        "max_model_len": 512,
     }
-    with caplog_vllm.at_level(
-            logging.DEBUG), global_force_attn_backend_context_manager(
-                _Backend.FLASHINFER):
-        run_model(compilation_config, model, model_kwargs)
-
-    try:
-        assert ("Fused quantization onto 48 attention nodes"
-                in caplog_vllm.text), caplog_vllm.text
-    except AssertionError:
-        # Note: this message is only triggered when the compilation goes
-        # through the custom pass. Due to multiple layers of cache on
-        # PyTorch side, the compilation of a graph may be cached such
-        # that custom pass directly goes through cache. In this case,
-        # we go through this branch and assert that the pass is not
-        # triggered.
-        assert "Fused quantization" not in caplog_vllm.text
-
-
-def run_model(compile_config: Union[int, CompilationConfig], model: str,
-              model_kwargs: dict[str, Any]):
+    run_model(compilation_mode, model, **model_kwargs)
+
+
+def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
+    compilation_config = (
+        compile_config
+        if isinstance(compile_config, CompilationConfig)
+        else CompilationConfig(mode=compile_config)
+    )
+
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -183,12 +226,17 @@ def run_model(compile_config: Union[int, CompilationConfig], model: str,
         "The future of AI is",
     ]
     sampling_params = SamplingParams(temperature=0)
+    # Allow override from model_kwargs
+    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
+    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
+
+    # No cudagraphs by default
+    if compilation_config.cudagraph_mode is None:
+        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+
     llm = LLM(
         model=model,
-        enforce_eager=True,
-        tensor_parallel_size=1,
-        disable_custom_all_reduce=True,
-        compilation_config=compile_config,
+        compilation_config=compilation_config,
         **model_kwargs,
     )
     outputs = llm.generate(prompts, sampling_params)
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 2ee9aa7476beb7335e14a813dc582ed5e4055fd9..11ae96e930da7d27592fac7b0ad5ee5e659f6df6 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -5,114 +5,262 @@ import pytest
 import torch
 
 import vllm.envs as envs
-from vllm import LLM, SamplingParams
 from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.compilation.fusion import FUSED_OPS, RMSNormQuantFusionPass
+from vllm.compilation.fusion import RMSNormQuantFusionPass
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import CompilationConfig, PassConfig, VllmConfig
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)
+from vllm.config import (
+    CompilationConfig,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.platforms import current_platform
 
 from .backend import TestBackend
 
-OPS_IN_MODEL = [
-    torch.ops._C.rotary_embedding.default,
-    torch.ops._C.fused_add_rms_norm.default,
-]
+TEST_FP8 = current_platform.supports_fp8()
+FP8_DTYPE = current_platform.fp8_dtype()
+
+
+class TestSiluMul(torch.nn.Module):
+    def __init__(self, hidden_size: int = 128):
+        super().__init__()
+        self.silu_and_mul = SiluAndMul()
+        self.wscale = torch.rand(1, dtype=torch.float32)
+        self.scale = torch.rand(1, dtype=torch.float32)
+
+        if TEST_FP8:
+            self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
+            self.fp8_linear = Fp8LinearOp(
+                act_quant_static=True,
+                act_quant_group_shape=GroupShape.PER_TENSOR,
+            )
+
+    def forward(self, x):
+        y = self.silu_and_mul(x)
+        if TEST_FP8:
+            x2 = self.fp8_linear.apply(y, self.w, self.wscale, input_scale=self.wscale)
+            return x2
+        else:
+            return y
+
+    def example_inputs(self, num_tokens=32, hidden_size=128):
+        return (torch.rand(num_tokens, hidden_size * 2),)
+
+    def ops_in_model(self, do_fusion):
+        if TEST_FP8 and do_fusion:
+            return [torch.ops._C.silu_and_mul_quant.default]
+        else:
+            return [torch.ops._C.silu_and_mul.default]
+
+    def ops_not_in_model(self):
+        return []
+
+
+class TestFusedAddRMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size=16, intermediate_size=32):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+
+        self.gate_proj = torch.nn.Parameter(
+            torch.empty((intermediate_size, hidden_size))
+        )
+        self.norm = RMSNorm(intermediate_size, 1e-05)
+        self.norm.weight = torch.nn.Parameter(torch.ones(intermediate_size))
+
+        torch.nn.init.normal_(self.gate_proj, std=0.02)
+
+        if TEST_FP8:
+            self.fp8_linear = Fp8LinearOp(act_quant_static=True)
+
+            self.scale = torch.rand(1, dtype=torch.float32)
+            self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
+            self.wscale = torch.rand(1, dtype=torch.float32)
+
+    def forward(self, hidden_states, residual):
+        # Reshape input
+        view = hidden_states.reshape(-1, self.hidden_size)
+
+        # matrix multiplication
+        permute = self.gate_proj.permute(1, 0)
+        mm = torch.mm(view, permute)
+
+        # layer normalization
+        norm_output, residual_output = self.norm(mm, residual)
+
+        if TEST_FP8:
+            # scaled_mm with static input quantization
+            fp8_linear_result = self.fp8_linear.apply(
+                norm_output,
+                self.w,
+                self.wscale,
+                input_scale=self.scale.to(norm_output.device),
+            )
+
+            return fp8_linear_result, residual_output
+
+        else:
+            return norm_output, residual_output
+
+    def example_inputs(self, batch_size=8, hidden_size=16, seq_len=16):
+        hidden_states = torch.randn((batch_size * seq_len, hidden_size))
+        residual = torch.randn((batch_size * seq_len, hidden_size))
+        return (hidden_states, residual)
+
+    def ops_in_model(self, do_fusion):
+        if TEST_FP8 and do_fusion:
+            return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
+        else:
+            return [torch.ops._C.fused_add_rms_norm.default]
+
+    def ops_not_in_model(self):
+        return []
 
-RMS_OP = torch.ops._C.rms_norm.default
 
-RMS_QUANT_OPS = {
-    "static_fp8": [
-        torch.ops._C.rms_norm_static_fp8_quant.default,
-        torch.ops._C.fused_add_rms_norm_static_fp8_quant.default
-    ],
-}
+class TestRotaryEmbedding(torch.nn.Module):
+    def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000):
+        super().__init__()
+        self.head_dim = head_dim
+        self.rotary_dim = rotary_dim or head_dim
 
-SILU_MUL_OP = torch.ops._C.silu_and_mul.default
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position,
+            base=base,
+        )
 
-SILU_MUL_QUANT_OP = torch.ops._C.silu_and_mul_quant.default
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
+    def forward(self, positions, q, k):
+        q_rotated, k_rotated = self.rotary_emb(positions, q, k)
+        return q_rotated, k_rotated
+
+    def example_inputs(self, num_tokens=32, head_dim=64):
+        positions = torch.arange(num_tokens, dtype=torch.long)
+        q = torch.randn(num_tokens, head_dim)
+        k = torch.randn(num_tokens, head_dim)
+        return (positions, q, k)
+
+    def ops_in_model(self, do_fusion):
+        return [torch.ops._C.rotary_embedding.default]
+
+    def ops_not_in_model(self):
+        return []
+
+
+class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
+    def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000):
+        super().__init__()
+        self.head_dim = head_dim
+        self.num_heads = num_heads
+        self.hidden_size = head_dim * num_heads
+
+        self.qkv_proj = torch.nn.Linear(
+            self.hidden_size, self.hidden_size * 3, bias=False
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=base,
+        )
+
+    def forward(self, positions, hidden_states):
+        # Simulate the pattern: mm -> split_with_sizes -> rotary_embedding
+        # -> slice_scatter -> split_with_sizes
+
+        qkv = self.qkv_proj(hidden_states)
+        split_sizes = [self.hidden_size, self.hidden_size, self.hidden_size]
+        q, k, v = torch.split(qkv, split_sizes, dim=-1)
+
+        q_rotated, k_rotated = self.rotary_emb(positions, q, k)
+
+        qkv_updated = torch.cat([q_rotated, k_rotated, v], dim=-1)
+        return qkv_updated
+
+    def example_inputs(self, num_tokens=32, head_dim=64, num_heads=4):
+        hidden_size = head_dim * num_heads
+        positions = torch.arange(num_tokens, dtype=torch.long)
+        hidden_states = torch.randn(num_tokens, hidden_size)
+        return (positions, hidden_states)
+
+    def ops_in_model(self, do_fusion):
+        return [torch.ops._C.rotary_embedding.default]
+
+    def ops_not_in_model(self):
+        return [torch.ops.aten.slice_scatter.default]
+
+
+MODELS = [
+    TestSiluMul,
+    TestFusedAddRMSNorm,
+    TestRotaryEmbedding,
+    TestRotaryEmbeddingSliceScatter,
 ]
 
 
-@pytest.mark.parametrize(
-    "model, quant_key",
-    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e", kFp8StaticTensorSym),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8_DYNAMIC-e2e",
-      kFp8DynamicTokenSym)])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("model_class", MODELS)
 @pytest.mark.parametrize("do_fusion", [True, False])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
-                    reason="Only test on CUDA")
-def test_fix_functionalization(model: str, quant_key: QuantKey,
-                               do_fusion: bool):
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", reason="Only test on CUDA")
+def test_fix_functionalization(
+    model_class: torch.nn.Module, do_fusion: bool, dtype: torch.dtype
+):
     torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
+
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=dtype),
+        compilation_config=CompilationConfig(
+            custom_ops=["all"],
+            pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True),
+        ),
+    )
+
+    with set_current_vllm_config(vllm_config):
+        assert RMSNorm.enabled()
+        noop_pass = NoOpEliminationPass(vllm_config)
+        fusion_pass = RMSNormQuantFusionPass(vllm_config)
+        cleanup_pass = PostCleanupPass(vllm_config)
+        act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config)
+
+        passes = (
+            [noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass]
+            if do_fusion
+            else [noop_pass, cleanup_pass]
+        )
+        func_pass = FixFunctionalizationPass(vllm_config)
+
+        backend_func = TestBackend(*passes, func_pass)
+        backend_no_func = TestBackend(*passes)
+
+        model = model_class()
+        torch.compile(model, backend=backend_func)(*model.example_inputs())
+        torch.compile(model, backend=backend_no_func)(*model.example_inputs())
+
+        # check if the functionalization pass is applied
+        for op in model.ops_in_model(do_fusion):
+            find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
+            assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
 
-    vllm_config = VllmConfig()
-    vllm_config.compilation_config = CompilationConfig(
-        pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True))
-    noop_pass = NoOpEliminationPass(vllm_config)
-    fusion_pass = RMSNormQuantFusionPass(vllm_config)
-    cleanup_pass = PostCleanupPass(vllm_config)
-    act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config)
-
-    passes = [noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass
-              ] if do_fusion else [noop_pass, cleanup_pass]
-    func_pass = FixFunctionalizationPass(vllm_config)
-    backend_func = TestBackend(*passes, func_pass)
-    backend_no_func = TestBackend(*passes)
-
-    # instantiate a full engine and manually compile the model 2x
-    # (with and without FixFunctionalizationPass)
-    llm = LLM(model=model, enforce_eager=True)
-    model_runner = llm.llm_engine.model_executor.driver_worker.model_runner
-    orig_model = model_runner.model
-    # TODO mark inputs dynamic? (currently torch.compile is triggered 4x)
-    # Can only do that by using the decorator but then we'd have to instantiate
-    # 2 LLM instances.
-
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    model_runner.model = torch.compile(orig_model,
-                                       fullgraph=True,
-                                       backend=backend_func)
-    gen_func = llm.generate(prompts, sampling_params)
-
-    model_runner.model = torch.compile(orig_model,
-                                       fullgraph=True,
-                                       backend=backend_no_func)
-
-    gen_no_func = llm.generate(prompts, sampling_params)
-
-    for output_func, output_no_func in zip(gen_func, gen_no_func):
-        assert output_func.outputs[0].text == output_no_func.outputs[0].text
-
-    # OPS_IN_MODEL always appear. RMS_OP is fused away if we run fusion,
-    # and replaced by fused quantized ops in RMS_QUANT_OPS.
-    rms_ops = [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)]
-               ] if do_fusion else [RMS_OP]
-    silu_mul_ops = [SILU_MUL_QUANT_OP] if do_fusion and \
-        quant_key == kFp8StaticTensorSym else [
-        SILU_MUL_OP
-    ]
-
-    ops = OPS_IN_MODEL + rms_ops + silu_mul_ops
-
-    for op in ops:
-        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
-        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
-                                  op) is None  # noqa: E501
-
-    # make sure the ops were all de-functionalized
-    found = dict()
-    for node in backend_func.graph_post_pass.nodes:
-        for op in ops:
-            if is_func(node, op):
-                found[op] = True
-    assert all(found[op] for op in ops)
+        # make sure the ops were all de-functionalized
+        found = dict()
+        for node in backend_func.graph_post_pass.nodes:
+            for op in model.ops_in_model(do_fusion):
+                if is_func(node, op):
+                    found[op] = True
+            for op in model.ops_not_in_model():
+                if is_func(node, op):
+                    found[op] = True
+        assert all(found[op] for op in model.ops_in_model(do_fusion))
+        assert all(not found.get(op) for op in model.ops_not_in_model())
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 3d8897d3f18b8bccd87fb8844d76fa9f24b72f1f..286f2276367a02cbcfb6f870afbfec52a67a61d7 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -5,17 +5,29 @@ import pytest
 import torch
 
 import vllm.plugins
-from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
-                                     RMSNormQuantFusionPass)
+from vllm.compilation.fusion import FUSED_OPS, FusedRMSQuantKey, RMSNormQuantFusionPass
+from vllm.compilation.fx_utils import find_op_nodes
+from vllm.compilation.matcher_utils import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
-                         VllmConfig)
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, QuantKey, ScaleDesc)
+    GroupShape,
+    QuantKey,
+    ScaleDesc,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, cutlass_fp8_supported, maybe_create_device_identity)
+    Fp8LinearOp,
+    cutlass_fp8_supported,
+    maybe_create_device_identity,
+)
 from vllm.platforms import current_platform
 
 from ..utils import override_cutlass_fp8_supported
@@ -23,25 +35,34 @@ from .backend import TestBackend
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
+RMS_OP = torch.ops._C.rms_norm.default
+RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
 
-class TestModel(torch.nn.Module):
 
-    def __init__(self, hidden_size: int, eps: float, static: bool,
-                 cuda_force_torch: bool, *args, **kwargs):
+class TestModel(torch.nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float,
+        static: bool,
+        cuda_force_torch: bool,
+        *args,
+        **kwargs,
+    ):
         super().__init__(*args, **kwargs)
         self.cuda_force_torch = cuda_force_torch
-        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
-        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+        self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
+        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
         group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
         quant_scale = ScaleDesc(torch.float32, static, group_shape)
-        self.key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
+        self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
         if static:
-            self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+            self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
         else:
-            self.scale = [None for _ in range(2)]
+            self.scale = [None for _ in range(3)]
         self.w = [
             torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
-            for _ in range(2)
+            for _ in range(3)
         ]
 
         with override_cutlass_fp8_supported(not cuda_force_torch):
@@ -50,57 +71,97 @@ class TestModel(torch.nn.Module):
                 act_quant_group_shape=group_shape,
             )
 
+        self.enable_rms_norm_custom_op = self.norm[0].enabled()
+        self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
+
     def forward(self, x):
-        resid = torch.sqrt(x)
+        # avoid having graph input be an arg to a pattern directly
+        x = resid = torch.relu(x)
         y = self.norm[0](x)
 
-        x2 = self.fp8_linear.apply(y,
-                                   self.w[0],
-                                   self.wscale[0],
-                                   input_scale=self.scale[0])
+        x2 = self.fp8_linear.apply(
+            y, self.w[0], self.wscale[0], input_scale=self.scale[0]
+        )
         # make sure resid is used for replacement to work
         y2, resid = self.norm[1](x2, resid)
 
-        x3 = self.fp8_linear.apply(y2,
-                                   self.w[1],
-                                   self.wscale[1],
-                                   input_scale=self.scale[1])
+        x3 = self.fp8_linear.apply(
+            y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
+        )
+
         y3, resid = self.norm[2](x3, resid)  # use resid here
-        return y3
 
-    def ops_in_model_before(self):
-        return [QUANT_OPS[self.key]]
+        x4 = self.fp8_linear.apply(
+            y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
+        )
+
+        y4, resid = self.norm[3](x4, resid)  # use resid here
+        return y4
 
     def ops_in_model_after(self):
         return [
-            FUSED_OPS[FusedRMSQuantKey(self.key, False)],
-            FUSED_OPS[FusedRMSQuantKey(self.key, True)]
+            FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)],
+            FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)],
         ]
 
+    def ops_in_model_before(self):
+        return (
+            [QUANT_OPS[self.quant_key]]
+            if self.enable_quant_fp8_custom_op
+            else [torch.ops.aten.reciprocal]
+        )
+
+    def ops_in_model_before_partial(self):
+        return (
+            [RMS_OP, RMS_ADD_OP]
+            if self.enable_rms_norm_custom_op
+            else [torch.ops.aten.rsqrt]
+        )
+
 
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [64])
 @pytest.mark.parametrize("num_tokens", [257])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("static", [True, False])
+@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
+@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
 # cuda_force_torch used to test torch code path on platforms that
 # cutlass_fp8_supported() == True.
-@pytest.mark.parametrize("cuda_force_torch",
-                         [True, False] if cutlass_fp8_supported() else [True])
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only test on CUDA and ROCm")
-def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
-                              cuda_force_torch):
+@pytest.mark.parametrize(
+    "cuda_force_torch", [True, False] if cutlass_fp8_supported() else [True]
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Only test on CUDA and ROCm"
+)
+def test_fusion_rmsnorm_quant(
+    dtype,
+    hidden_size,
+    num_tokens,
+    eps,
+    static,
+    enable_rms_norm_custom_op,
+    enable_quant_fp8_custom_op,
+    cuda_force_torch,
+):
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
     torch.manual_seed(1)
     maybe_create_device_identity()  # needed for certain non-cutlass fp8 paths
 
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        custom_ops=["+rms_norm", "+quant_fp8"],
-        pass_config=PassConfig(enable_fusion=True, enable_noop=True),
-    ))
+    custom_ops = []
+    if enable_rms_norm_custom_op:
+        custom_ops.append("+rms_norm")
+    if enable_quant_fp8_custom_op:
+        custom_ops.append("+quant_fp8")
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=dtype),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=custom_ops,
+            pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+        ),
+    )
     with vllm.config.set_current_vllm_config(vllm_config):
         # Reshape pass is needed for the fusion pass to work
         noop_pass = NoOpEliminationPass(vllm_config)
@@ -108,31 +169,39 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
         cleanup_pass = PostCleanupPass(vllm_config)
 
         backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
+        backend2 = TestBackend(noop_pass, cleanup_pass)
         model = TestModel(hidden_size, eps, static, cuda_force_torch)
 
         # First dimension dynamic
         x = torch.rand(num_tokens, hidden_size)
         torch._dynamo.mark_dynamic(x, 0)
 
-        result = model(x)
+        model_fused = torch.compile(model, backend=backend)
+        result_fused = model_fused(x)
 
-        model2 = torch.compile(model, backend=backend)
-        result2 = model2(x)
+        model_unfused = torch.compile(model, backend=backend2)
+        result_unfused = model_unfused(x)
 
-        # Higher tol for dynamic, even higher for bfloat16
-        if static:
-            ATOL, RTOL = (1e-3, 1e-3)
-        elif dtype == torch.float16:
+        if dtype == torch.float16:
             ATOL, RTOL = (2e-3, 2e-3)
         else:
             ATOL, RTOL = (1e-2, 1e-2)
 
-        torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
 
-        assert fusion_pass.matched_count == 2
-
-        # In pre-nodes, fp8 quant should be there and fused kernels should not
+        assert fusion_pass.matched_count == 3
         backend.check_before_ops(model.ops_in_model_before())
-
-        # In post-nodes, fused kernels should be there and fp8 quant should not
+        backend.check_before_ops(
+            model.ops_in_model_before_partial(), fully_replaced=False
+        )
         backend.check_after_ops(model.ops_in_model_after())
+
+        # If RMSNorm custom op is disabled (native/torch impl used),
+        # there's a risk that the fused add doesn't get included in the
+        # replacement and only the rms part gets fused with quant.
+        # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
+        if not enable_rms_norm_custom_op:
+            n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
+            # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
+            assert n_add_nodes(backend.graph_pre_pass) == 7
+            assert n_add_nodes(backend.graph_post_pass) == 2
diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py
index 60f32c863208d77945af37ef82b1ea4a6d10c50f..6d0a0ed7d89d20ee0eed9ddd3c926f55b2d87fea 100644
--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -6,59 +6,66 @@ import pytest
 import torch
 
 import vllm.envs as envs
+from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.compilation.collective_fusion import AllReduceFusionPass
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig,
-                         ModelConfig, PassConfig, VllmConfig)
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    DeviceConfig,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.distributed import tensor_model_parallel_all_reduce
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    GroupShape, QuantFP8)
+    Fp8LinearOp,
+    GroupShape,
+)
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 from ..utils import has_module_attribute, multi_gpu_test
 from .backend import TestBackend
 
 
 class TestAllReduceRMSNormModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
         super().__init__()
         self.hidden_size = hidden_size
         self.eps = eps
-        self.norm = RMSNorm(hidden_size, eps)
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+        self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
 
-    def forward(self, hidden_states, residual):
-        view = hidden_states.reshape(-1, self.hidden_size)
-        all_reduce = tensor_model_parallel_all_reduce(view)
-        norm = self.norm(all_reduce)
-        return norm
+    def forward(self, x):
+        # avoid having graph input be an arg to a pattern directly
+        z = torch.relu(x)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
 
-    def ops_in_model_before(self):
-        return [torch.ops.vllm.all_reduce.default]
+        z2 = torch.mm(y, self.w[0])
+        x2 = tensor_model_parallel_all_reduce(z2)
 
-    def ops_in_model_after(self):
-        return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
+        y2, resid = self.norm[1](x2, resid)
 
+        z3 = torch.mm(y2, self.w[1])
+        x3 = tensor_model_parallel_all_reduce(z3)
 
-class TestAllReduceFusedAddRMSNormModel(torch.nn.Module):
+        y3, resid = self.norm[2](x3, resid)
 
-    def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.eps = eps
-        self.norm = RMSNorm(hidden_size, eps)
+        z4 = torch.mm(y3, self.w[2])
+        x4 = tensor_model_parallel_all_reduce(z4)
 
-    def forward(self, hidden_states, residual):
-        view = hidden_states.reshape(-1, self.hidden_size)
-        all_reduce = tensor_model_parallel_all_reduce(view)
-        norm, _ = self.norm(all_reduce, residual)
-        return norm
+        y4, resid = self.norm[3](x4, resid)
+        return y4
 
     def ops_in_model_before(self):
         return [torch.ops.vllm.all_reduce.default]
@@ -67,27 +74,53 @@ class TestAllReduceFusedAddRMSNormModel(torch.nn.Module):
         return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
 
 
-class TestAllReduceFusedAddRMSNormStaticQuantFP8Model(torch.nn.Module):
-
+class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
     def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
         super().__init__()
         self.hidden_size = hidden_size
         self.eps = eps
-        self.norm = RMSNorm(hidden_size, eps)
-        self.quant_fp8 = QuantFP8(static=True,
-                                  group_shape=GroupShape.PER_TENSOR)
-        self.scale = torch.rand(1, dtype=torch.float32)
-        self.output = torch.empty((token_num, hidden_size),
-                                  dtype=torch.float32)
-
-    def forward(self, hidden_states, residual):
-        view = hidden_states.reshape(-1, self.hidden_size)
-        all_reduce = tensor_model_parallel_all_reduce(view)
-        norm_output, residual_output = self.norm(all_reduce, residual)
-        torch.ops._C.static_scaled_fp8_quant(self.output,
-                                             norm_output.contiguous(),
-                                             self.scale)
-        return self.output, residual_output
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+        self.w = [
+            torch.rand(hidden_size, hidden_size)
+            .to(dtype=current_platform.fp8_dtype())
+            .t()
+            for _ in range(3)
+        ]
+
+        self.fp8_linear = Fp8LinearOp(
+            act_quant_static=True,
+            act_quant_group_shape=GroupShape.PER_TENSOR,
+        )
+
+        self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+
+    def forward(self, hidden_states):
+        # avoid having graph input be an arg to a pattern directly
+        z = torch.relu(hidden_states)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
+
+        z2 = self.fp8_linear.apply(
+            y, self.w[0], self.wscale[0], input_scale=self.scale[0]
+        )
+
+        x2 = tensor_model_parallel_all_reduce(z2)
+        y2, resid = self.norm[1](x2, resid)
+
+        z3 = self.fp8_linear.apply(
+            y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
+        )
+
+        x3 = tensor_model_parallel_all_reduce(z3)
+        y3, resid = self.norm[2](x3, resid)  # use resid here
+
+        z4 = self.fp8_linear.apply(
+            y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
+        )
+        x4 = tensor_model_parallel_all_reduce(z4)
+        y4, resid = self.norm[3](x4, resid)  # use resid here
+        return y4
 
     def ops_in_model_after(self):
         return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
@@ -96,35 +129,58 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP8Model(torch.nn.Module):
         return [
             torch.ops.vllm.all_reduce.default,
             torch.ops._C.static_scaled_fp8_quant.default
+            if self.fp8_linear.quant_fp8.enabled()
+            else torch.ops.aten.reciprocal.default,
         ]
 
 
 class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
-
     def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
         super().__init__()
         self.hidden_size = hidden_size
         self.eps = eps
-        self.norm = RMSNorm(hidden_size, eps)
-        self.scale = torch.rand(1, dtype=torch.float32)
-        self.output = torch.empty((token_num, hidden_size),
-                                  dtype=torch.float32)
-
-        round_up = lambda x, y: (x + y - 1) // y * y
-        rounded_m = round_up(token_num, 128)
-        scale_n = hidden_size // 16
-        rounded_n = round_up(scale_n, 4)
-        self.output_scale = torch.empty((rounded_m, rounded_n // 4),
-                                        dtype=torch.int32)
-
-    def forward(self, hidden_states, residual):
-        view = hidden_states.reshape(-1, self.hidden_size)
-        all_reduce = tensor_model_parallel_all_reduce(view)
-        norm_output, residual_output = self.norm(all_reduce, residual)
-        norm_output = norm_output.reshape(-1, norm_output.shape[-1])
-        torch.ops._C.scaled_fp4_quant(self.output, norm_output,
-                                      self.output_scale, self.scale)
-        return self.output, residual_output, self.output_scale
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+
+        self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
+        self.agscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+        wgscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+        self.alpha = [1 / (w * a) for w, a in zip(wgscale, self.agscale)]
+
+        wq_gen, wscale_gen = zip(
+            *(scaled_fp4_quant(w, wg) for w, wg in zip(self.w, wgscale))
+        )
+        self.wq, self.wscale = list(wq_gen), list(wscale_gen)
+        print(f"{self.wq=}, {self.wscale=}")
+
+    def forward(self, hidden_states):
+        # avoid having graph input be an arg to a pattern directly
+        z = torch.relu(hidden_states)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
+
+        yq, y_scale = scaled_fp4_quant(y, self.agscale[0])
+        z2 = cutlass_scaled_fp4_mm(
+            yq, self.wq[0], y_scale, self.wscale[0], self.alpha[0], out_dtype=y.dtype
+        )
+
+        x2 = tensor_model_parallel_all_reduce(z2)
+        y2, resid = self.norm[1](x2, resid)
+
+        yq2, y_scale2 = scaled_fp4_quant(y2, self.agscale[1])
+        z3 = cutlass_scaled_fp4_mm(
+            yq2, self.wq[1], y_scale2, self.wscale[1], self.alpha[1], out_dtype=y2.dtype
+        )
+
+        x3 = tensor_model_parallel_all_reduce(z3)
+        y3, resid = self.norm[2](x3, resid)  # use resid here
+
+        yq3, y_scale3 = scaled_fp4_quant(y3, self.agscale[2])
+        z4 = cutlass_scaled_fp4_mm(
+            yq3, self.wq[2], y_scale3, self.wscale[2], self.alpha[2], out_dtype=y3.dtype
+        )
+        x4 = tensor_model_parallel_all_reduce(z4)
+        y4, resid = self.norm[3](x4, resid)  # use resid here
+        return y4
 
     def ops_in_model_after(self):
         return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
@@ -132,54 +188,81 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
     def ops_in_model_before(self):
         return [
             torch.ops.vllm.all_reduce.default,
-            torch.ops._C.scaled_fp4_quant.default
+            torch.ops._C.scaled_fp4_quant.default,
         ]
 
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
-    "test_model",
+    "test_model, enable_quant_fp8_custom_op",
     [
-        TestAllReduceRMSNormModel,
-        TestAllReduceFusedAddRMSNormModel,
-        TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
-        # TODO: Enable with torch==2.8.0
-        # TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
-    ])
+        (TestAllReduceRMSNormModel, False),
+        (TestAllReduceRMSNormStaticQuantFP8Model, True),
+        (TestAllReduceRMSNormStaticQuantFP8Model, False),
+        (TestAllReduceFusedAddRMSNormStaticQuantFP4Model, False),
+    ],
+)
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [8])
-@pytest.mark.parametrize("hidden_size", [16])
+@pytest.mark.parametrize("hidden_size", [64])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
+@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 @pytest.mark.skipif(
     not find_spec("flashinfer")
     or not has_module_attribute("flashinfer.comm", "trtllm_allreduce_fusion"),
     reason="flashinfer is not found or flashinfer "
-    "is not compiled with trtllm_allreduce_fusion")
-def test_all_reduce_fusion_pass_replace(test_model: torch.nn.Module,
-                                        batch_size: int, seq_len: int,
-                                        hidden_size: int, dtype: torch.dtype):
+    "is not compiled with trtllm_allreduce_fusion",
+)
+def test_all_reduce_fusion_pass_replace(
+    test_model: torch.nn.Module,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    enable_rms_norm_custom_op,
+    enable_quant_fp8_custom_op,
+):
     num_processes = 2
-    if (test_model == TestAllReduceFusedAddRMSNormStaticQuantFP4Model
-            and not current_platform.has_device_capability(100)):
-        pytest.skip("Skip as nvfp4 is only supported on "
-                    "devices with compute capability 10.0 (Blackwell)")
+    if (
+        test_model == TestAllReduceFusedAddRMSNormStaticQuantFP4Model
+        and not current_platform.has_device_capability(100)
+    ):
+        pytest.skip(
+            "Skip as nvfp4 is only supported on "
+            "devices with compute capability 10.0 (Blackwell)"
+        )
 
     def run_torch_spawn(fn, nprocs):
-        torch.multiprocessing.spawn(fn,
-                                    args=(num_processes, test_model,
-                                          batch_size, seq_len, hidden_size,
-                                          dtype),
-                                    nprocs=nprocs)
+        torch.multiprocessing.spawn(
+            fn,
+            args=(
+                num_processes,
+                test_model,
+                batch_size,
+                seq_len,
+                hidden_size,
+                dtype,
+                enable_rms_norm_custom_op,
+                enable_quant_fp8_custom_op,
+            ),
+            nprocs=nprocs,
+        )
 
     run_torch_spawn(all_reduce_fusion_pass_on_test_model, num_processes)
 
 
-def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
-                                         test_model_cls: torch.nn.Module,
-                                         batch_size: int, seq_len: int,
-                                         hidden_size: int, dtype: torch.dtype):
+def all_reduce_fusion_pass_on_test_model(
+    local_rank: int,
+    world_size: int,
+    test_model_cls: torch.nn.Module,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    enable_rms_norm_custom_op,
+    enable_quant_fp8_custom_op,
+):
     current_platform.seed_everything(0)
 
     device = torch.device(f"cuda:{local_rank}")
@@ -187,50 +270,63 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
     torch.set_default_device(device)
     torch.set_default_dtype(dtype)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )
 
     init_distributed_environment()
     initialize_model_parallel(tensor_model_parallel_size=world_size)
 
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        custom_ops=["+rms_norm", "+quant_fp8"]))
+    custom_ops = []
+    if enable_rms_norm_custom_op:
+        custom_ops.append("+rms_norm")
+    if enable_quant_fp8_custom_op:
+        custom_ops.append("+quant_fp8")
+
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops
+        )
+    )
     vllm_config.compilation_config.pass_config = PassConfig(
-        enable_fi_allreduce_fusion=True, enable_noop=True)
+        enable_fi_allreduce_fusion=True, enable_noop=True
+    )
     vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
+    vllm_config.parallel_config.rank = local_rank  # Setup rank for debug path
 
     # this is a fake model name to construct the model config
     # in the vllm_config, it's not really used.
-    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
-    vllm_config.model_config = ModelConfig(model=model_name,
-                                           trust_remote_code=True,
-                                           dtype=dtype,
-                                           seed=42)
-
-    all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
-    noop_pass = NoOpEliminationPass(vllm_config)
-    func_pass = FixFunctionalizationPass(vllm_config)
-    cleanup_pass = PostCleanupPass(vllm_config)
-
-    backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass,
-                          cleanup_pass)
-
-    token_num = batch_size * seq_len
-    model = test_model_cls(hidden_size, token_num)
-
-    hidden_states = torch.randn((token_num, hidden_size), requires_grad=False)
-    residual = torch.randn((token_num, hidden_size), requires_grad=False)
-
-    compiled_model = torch.compile(model, backend=backend)
-    compiled_model(hidden_states, residual)
-
-    assert all_reduce_fusion_pass.matched_count == 1
-    backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
-    backend.check_after_ops(model.ops_in_model_after())
-    del all_reduce_fusion_pass
+    model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+    vllm_config.model_config = ModelConfig(
+        model=model_name, trust_remote_code=True, dtype=dtype, seed=42
+    )
+    with set_current_vllm_config(vllm_config):
+        all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
+        noop_pass = NoOpEliminationPass(vllm_config)
+        func_pass = FixFunctionalizationPass(vllm_config)
+        cleanup_pass = PostCleanupPass(vllm_config)
+
+        backend = TestBackend(
+            noop_pass, all_reduce_fusion_pass, func_pass, cleanup_pass
+        )
+
+        token_num = batch_size * seq_len
+        model = test_model_cls(hidden_size, token_num)
+
+        hidden_states = torch.randn((token_num, hidden_size), requires_grad=False)
+
+        compiled_model = torch.compile(model, backend=backend)
+        compiled_model(hidden_states)
+
+        assert all_reduce_fusion_pass.matched_count == 4, (
+            f"{all_reduce_fusion_pass.matched_count=}"
+        )
+        backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
+        backend.check_after_ops(model.ops_in_model_after())
+        del all_reduce_fusion_pass
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index eb8c4913542809cfff2660283f8f1d7d9134f2bf..ea61c94953a77dfe9da34a175395c79ba4f7e840 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -1,168 +1,60 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-from typing import Optional
 
 import pytest
 import torch._dynamo
 
 from tests.compile.backend import LazyInitPass, TestBackend
-from tests.models.utils import check_outputs_equal
-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata)
-from vllm import LLM, SamplingParams
+from tests.utils import flat_product
+from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import global_force_attn_backend_context_manager
-from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
+from vllm.compilation.matcher_utils import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
-                         ModelConfig, PassConfig, SchedulerConfig, VllmConfig,
-                         set_current_vllm_config)
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    CompilationMode,
+    ModelConfig,
+    PassConfig,
+    SchedulerConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym, kNvfp4Quant)
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp)
+    QuantKey,
+    kFp8StaticTensorSym,
+    kNvfp4Quant,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
-from vllm.utils import is_torch_equal_or_newer
+from vllm.utils.flashinfer import has_flashinfer
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
 
-# globals needed for string-import custom Dynamo backend field
-backend: Optional[TestBackend] = None
-backend_unfused: Optional[TestBackend] = None
-
-
-@pytest.mark.parametrize(
-    "model, quant_key",
-    [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)])
-@pytest.mark.parametrize("use_triton_fa", [True, False])
-@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="V0 attn quant fusion only on ROCm")
-def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
-                             quant_key: QuantKey, use_triton_fa: bool):
-    # Clean Dynamo cache to avoid reusing other test cases
-    # (for some reason the reset at the end is not enough)
-    torch._dynamo.reset()
-
-    # Use global backends
-    global backend, backend_unfused
-
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
-
-    # Prompt 4 seems too open-ended, differs between fused and unfused
-    # (both outputs look reasonable though)
-    prompts = example_prompts[:4] + example_prompts[5:]
-
-    compile_config = CompilationConfig(
-        # DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation
-        # DYNAMO_ONCE does not properly propagate shapes.
-        level=CompilationLevel.DYNAMO_AS_IS,
-        backend="tests.compile.test_fusion_attn.backend_unfused",
-        custom_ops=["+quant_fp8"],
-    )
-    vllm_config = VllmConfig(compilation_config=compile_config,
-                             model_config=ModelConfig(
-                                 model=model,
-                                 dtype=torch.bfloat16,
-                             ))
-    backend_unfused = TestBackend(NoOpEliminationPass(vllm_config))
-
-    llm = LLM(model,
-              enforce_eager=True,
-              compilation_config=compile_config,
-              gpu_memory_utilization=0.5,
-              max_model_len=2048)
-
-    sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=10,
-                                     top_p=0.95)
-
-    unfused_output = llm.generate(prompts, sampling_params)
-    backend_unfused = None  # Reset backend to make sure llm gets released
-    del llm
-
-    compile_config = CompilationConfig(
-        # DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation
-        # DYNAMO_ONCE does not properly propagate shapes.
-        level=CompilationLevel.DYNAMO_AS_IS,
-        backend="tests.compile.test_fusion_attn.backend",
-        custom_ops=["+quant_fp8"],
-    )
-    vllm_config = VllmConfig(compilation_config=compile_config,
-                             model_config=ModelConfig(
-                                 model=model,
-                                 dtype=torch.bfloat16,
-                             ))
-
-    # AttnFusionPass needs attention layers to be registered in config upon init
-    # so we initialize it during compilation.
-    attn_pass = LazyInitPass(AttnFusionPass, vllm_config)
-    backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass)
-    llm2 = LLM(model,
-               enforce_eager=True,
-               compilation_config=compile_config,
-               gpu_memory_utilization=0.5,
-               max_model_len=2048)
-
-    # check support
-    attn_fusion_supported = [
-        layer.impl.fused_output_quant_supported(quant_key)
-        for key, layer in compile_config.static_forward_context.items()
-    ]
-
-    print(f"{attn_fusion_supported=}")
-    if any(attn_fusion_supported):
-        # Check quant ops
-        backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
-
-    # attention ops present in both, just output_scale param changes
-    attn_nodes_pre = list(find_op_nodes(ATTN_OP, backend.graph_pre_pass))
-    attn_nodes_post = list(find_op_nodes(ATTN_OP, backend.graph_post_pass))
-    assert len(attn_nodes_pre) == len(attn_nodes_post)
-
-    for i in range(len(attn_nodes_pre)):
-        assert attn_nodes_pre[i].kwargs["output_scale"] is None
-        fused = attn_nodes_post[i].kwargs["output_scale"] is not None
-        assert fused == attn_fusion_supported[i], \
-            f"Node {i} {'' if fused else 'not '} expected " \
-            f"to have fused output quant"
-
-    # check outputs
-    fused_output = llm2.generate(prompts, sampling_params)
-
-    # transform outputs to format expected by check_outputs_equal
-    sample_outs = lambda s: (list(s.token_ids), s.text)
-    outs_lst = lambda ros: [sample_outs(ro.outputs[0]) for ro in ros]
-
-    check_outputs_equal(
-        outputs_0_lst=outs_lst(unfused_output),
-        outputs_1_lst=outs_lst(fused_output),
-        name_0="unfused",
-        name_1="fused",
-    )
-
-    # Clean Dynamo cache to avoid polluting other case(s)
-    torch._dynamo.reset()
-
-    # Reset backend to make sure llm2 gets released
-    backend = None
-
 
 class AttentionQuantPatternModel(torch.nn.Module):
     """Base model for AttentionQuantPattern fusion."""
 
-    def __init__(self, num_qo_heads: int, num_kv_heads: int, head_size: int,
-                 kv_cache_dtype: torch.dtype, device: torch.device,
-                 vllm_config: VllmConfig, **kwargs):
+    def __init__(
+        self,
+        num_qo_heads: int,
+        num_kv_heads: int,
+        head_size: int,
+        kv_cache_dtype: torch.dtype,
+        device: torch.device,
+        vllm_config: VllmConfig,
+        **kwargs,
+    ):
         super().__init__()
         self.num_qo_heads = num_qo_heads
         self.num_kv_heads = num_kv_heads
@@ -197,49 +89,75 @@ class AttentionQuantPatternModel(torch.nn.Module):
             device=self.device,
         )
 
-    def build_attn_metadata(self, batch_size: int, use_hnd: bool) \
-            -> AttentionMetadata:
+    def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
         """Initialize attention metadata."""
 
         # Create common attn metadata
-        batch_spec = BatchSpec(seq_lens=[1] * batch_size,
-                               query_lens=[1] * batch_size)
+        batch_spec = BatchSpec(seq_lens=[1] * batch_size, query_lens=[1] * batch_size)
         common_attn_metadata = create_common_attn_metadata(
-            batch_spec,
-            self.block_size,
-            self.device,
-            arange_block_indices=True)
+            batch_spec, self.block_size, self.device, arange_block_indices=True
+        )
 
-        max_blocks = (max(batch_spec.seq_lens) + self.block_size -
-                      1) // self.block_size
+        max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
         num_blocks = batch_size * max_blocks
+        backend = self.attn.backend
 
-        # Create dummy KV cache for FlashInfer TRTLLM
-        #   - NHD: [num_blocks, block_size, num_kv_heads, head_size]
-        #   - HND: [num_blocks, num_kv_heads, block_size, head_size]
-        kv_cache = torch.zeros(num_blocks,
-                               2,
-                               self.num_kv_heads,
-                               self.block_size,
-                               self.head_size,
-                               dtype=self.kv_cache_dtype,
-                               device=self.device)
-        if current_platform.is_rocm():
+        # TODO(luka) use get_kv_cache_stride_order
+        # Create dummy KV cache for the selected backend
+        if backend == AttentionBackendEnum.ROCM_ATTN:
             # k/v as 1st dimention
-            if use_hnd:
-                kv_cache = kv_cache.permute(1, 0, 2, 3, 4)
-            else:
-                kv_cache = kv_cache.permute(1, 0, 3, 2, 4)
-        else:
+            # HND: [num_blocks, num_kv_heads, block_size, head_size]
+            kv_cache = torch.zeros(
+                2,
+                num_blocks,
+                self.num_kv_heads,
+                self.block_size,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            )
+        elif backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
+            # k/v as 1st dimention
+            # NHD: [num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache = torch.zeros(
+                2,
+                num_blocks,
+                self.block_size,
+                self.num_kv_heads,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            )
+        elif backend == AttentionBackendEnum.TRITON_ATTN:
             # k/v as 2nd dimention
-            # Create kv_cache in HND layout and permute to NHD layout
-            # (later will be permuted back to HND layout in forward pass)
-            kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
+            # NHD: [num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache = torch.zeros(
+                num_blocks,
+                2,
+                self.num_kv_heads,
+                self.block_size,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            )
+        elif backend == AttentionBackendEnum.FLASHINFER:
+            kv_cache = torch.zeros(
+                num_blocks,
+                2,
+                self.num_kv_heads,
+                self.block_size,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            ).permute(0, 1, 3, 2, 4)
+        else:
+            raise ValueError(f"Unsupported backend: {backend}")
         self.attn.kv_cache = [kv_cache]
 
         # Build attn metadata
         self.attn_metadata = self.builder.build(
-            common_prefix_len=0, common_attn_metadata=common_attn_metadata)
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
+        )
 
         return self.attn_metadata
 
@@ -254,27 +172,30 @@ class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
 
         self.fp8_linear = Fp8LinearOp(
             act_quant_static=self.quant_key.scale.static,
-            act_quant_group_shape=self.quant_key.scale.group_shape)
+            act_quant_group_shape=self.quant_key.scale.group_shape,
+        )
 
         hidden_size = self.num_qo_heads * self.head_size
         self.w = kwargs.get(
-            "w", {
-                "weight":
-                torch.randn(hidden_size, hidden_size).to(
-                    dtype=FP8_DTYPE, device=self.device).t(),
-                "wscale":
-                torch.tensor([1.0], dtype=torch.float32, device=self.device),
-                "scale":
-                torch.tensor([1.0], dtype=torch.float32, device=self.device),
-            })
+            "w",
+            {
+                "weight": torch.randn(hidden_size, hidden_size)
+                .to(dtype=FP8_DTYPE, device=self.device)
+                .t(),
+                "wscale": torch.tensor([1.0], dtype=torch.float32, device=self.device),
+                "scale": torch.tensor([1.0], dtype=torch.float32, device=self.device),
+            },
+        )
 
     def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
         """Forward pass that creates the pattern to be fused."""
         attn_output = self.attn(q, k, v)
-        return self.fp8_linear.apply(input=attn_output,
-                                     weight=self.w["weight"],
-                                     weight_scale=self.w["wscale"],
-                                     input_scale=self.w["scale"])
+        return self.fp8_linear.apply(
+            input=attn_output,
+            weight=self.w["weight"],
+            weight_scale=self.w["wscale"],
+            input_scale=self.w["scale"],
+        )
 
 
 class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
@@ -287,91 +208,113 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
 
         hidden_size = self.num_qo_heads * self.head_size
         self.w = kwargs.get(
-            "w", {
-                "weight":
-                torch.randint(256, (hidden_size, hidden_size // 2),
-                              dtype=FP4_DTYPE,
-                              device=self.device),
-                "wscale_swizzled":
-                torch.randn(hidden_size, hidden_size // 16).to(
-                    dtype=FP8_DTYPE, device=self.device),
-                "wscale":
-                torch.tensor([500], dtype=torch.float32, device=self.device),
-                "scale":
-                torch.tensor([0.002], dtype=torch.float32, device=self.device),
-            })
+            "w",
+            {
+                "weight": torch.randint(
+                    256,
+                    (hidden_size, hidden_size // 2),
+                    dtype=FP4_DTYPE,
+                    device=self.device,
+                ),
+                "wscale_swizzled": torch.randn(hidden_size, hidden_size // 16).to(
+                    dtype=FP8_DTYPE, device=self.device
+                ),
+                "wscale": torch.tensor([500], dtype=torch.float32, device=self.device),
+                "scale": torch.tensor([0.002], dtype=torch.float32, device=self.device),
+            },
+        )
 
     def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
         """Forward pass that creates the pattern to be fused."""
         attn_output = self.attn(q, k, v)
         quant_output, output_block_scale = scaled_fp4_quant(
-            attn_output, 1 / self.w["scale"])
-        return cutlass_scaled_fp4_mm(a=quant_output,
-                                     b=self.w["weight"],
-                                     block_scale_a=output_block_scale,
-                                     block_scale_b=self.w["wscale_swizzled"],
-                                     alpha=self.w["scale"] * self.w["wscale"],
-                                     out_dtype=attn_output.dtype)
+            attn_output, 1 / self.w["scale"]
+        )
+        return cutlass_scaled_fp4_mm(
+            a=quant_output,
+            b=self.w["weight"],
+            block_scale_a=output_block_scale,
+            block_scale_b=self.w["wscale_swizzled"],
+            alpha=self.w["scale"] * self.w["wscale"],
+            out_dtype=attn_output.dtype,
+        )
+
 
+MODELS_FP8: list[tuple[str, type]] = []
+MODELS_FP4: list[tuple[str, type]] = []
+HEADS: list[tuple[int, int]] = []
+SPLIT_ATTENTION: list[bool] = []
+BACKENDS_FP8: list[AttentionBackendEnum] = []
+BACKENDS_FP4: list[AttentionBackendEnum] = []
 
 if current_platform.is_cuda():
-    MODELS = [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-               TestAttentionFp8StaticQuantPatternModel),
-              ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
-               TestAttentionNvfp4QuantPatternModel)]
     HEADS = [(64, 8), (40, 8)]
+    MODELS_FP8 = [
+        (
+            "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+            TestAttentionFp8StaticQuantPatternModel,
+        )
+    ]
+    MODELS_FP4 = [
+        (
+            "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            TestAttentionNvfp4QuantPatternModel,
+        )
+    ]
+    BACKENDS_FP8 = [AttentionBackendEnum.TRITON_ATTN, AttentionBackendEnum.FLASHINFER]
+    BACKENDS_FP4 = [AttentionBackendEnum.FLASHINFER]
+
 elif current_platform.is_rocm():
-    MODELS = [("amd/Llama-3.1-8B-Instruct-FP8-KV",
-               TestAttentionFp8StaticQuantPatternModel)]
     HEADS = [(32, 8), (40, 8)]
-else:
-    MODELS = []
-    HEADS = []
+    MODELS_FP8 = [
+        ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
+    ]
+    BACKENDS = [
+        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
+        AttentionBackendEnum.ROCM_ATTN,
+        AttentionBackendEnum.TRITON_ATTN,
+    ]
 
 
 @pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
 @pytest.mark.parametrize("head_size", [128])
-@pytest.mark.parametrize("batch_size",
-                         [7, 256, 533] if current_platform.is_cuda() else [8])
-@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("model_name, model_class", MODELS)
-@pytest.mark.parametrize("backend",
-                         [_Backend.FLASHINFER] if current_platform.is_cuda()
-                         else [_Backend.TRITON_ATTN])
 @pytest.mark.parametrize(
-    "split_attention",
-    [False, True] if current_platform.is_rocm() else [False])
-# TODO(boyuan): test inductor graph partition on rocm
+    "batch_size", [7, 256, 533] if current_platform.is_cuda() else [8]
+)
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize(
-    "use_inductor_graph_partition",
-    [False] if current_platform.is_rocm() else [False, True])
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only test ROCm or CUDA")
+    "backend, model_name, model_class, custom_ops",
+    # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
+    list(flat_product(BACKENDS_FP8, MODELS_FP8, ["+quant_fp8", "-quant_fp8"]))
+    # quant_fp4 only has the custom impl
+    + list(flat_product(BACKENDS_FP4, MODELS_FP4, [""])),
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA"
+)
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(current_platform.is_cuda()
-                    and not current_platform.is_device_capability((10, 0)),
-                    reason="On CUDA only test on SM100(Blackwell)")
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only test ROCm or CUDA")
-def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
-                                 head_size: int, batch_size: int,
-                                 dtype: torch.dtype, model_name: str,
-                                 model_class: type[AttentionQuantPatternModel],
-                                 backend: _Backend, split_attention: bool,
-                                 use_inductor_graph_partition: bool,
-                                 monkeypatch, dist_init, caplog_vllm):
+def test_attention_quant_pattern(
+    num_qo_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    batch_size: int,
+    dtype: torch.dtype,
+    custom_ops: str,
+    model_name: str,
+    model_class: type[AttentionQuantPatternModel],
+    backend: AttentionBackendEnum,
+    dist_init,
+):
     """Test AttentionStaticQuantPattern fusion pass"""
+    if backend == AttentionBackendEnum.FLASHINFER and (
+        not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
+    ):
+        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
 
-    if use_inductor_graph_partition and not is_torch_equal_or_newer(
-            "2.9.0.dev"):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
-
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    if split_attention:
-        monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
 
     device = torch.device("cuda:0")
+    torch.set_default_dtype(dtype)
     torch.manual_seed(42)
 
     vllm_config = VllmConfig(
@@ -382,25 +325,16 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         ),
         scheduler_config=SchedulerConfig(max_num_seqs=1024),
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
-            custom_ops=["+quant_fp8"],
-            use_inductor_graph_partition=use_inductor_graph_partition,
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=custom_ops_list,
         ),
-        cache_config=CacheConfig(cache_dtype="fp8"))
+        cache_config=CacheConfig(cache_dtype="fp8"),
+    )
 
     # Create test inputs
-    q = torch.randn(batch_size,
-                    num_qo_heads * head_size,
-                    dtype=dtype,
-                    device=device)
-    k = torch.randn(batch_size,
-                    num_kv_heads * head_size,
-                    dtype=dtype,
-                    device=device)
-    v = torch.randn(batch_size,
-                    num_kv_heads * head_size,
-                    dtype=dtype,
-                    device=device)
+    q = torch.randn(batch_size, num_qo_heads * head_size, dtype=dtype, device=device)
+    k = torch.randn(batch_size, num_kv_heads * head_size, dtype=dtype, device=device)
+    v = torch.randn(batch_size, num_kv_heads * head_size, dtype=dtype, device=device)
 
     # Mark first dimension as dynamic for realistic testing
     torch._dynamo.mark_dynamic(q, 0)
@@ -409,42 +343,50 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
 
     # Run model directly without compilation and fusion
     vllm_config_unfused = copy.deepcopy(vllm_config)
-    with set_current_vllm_config(vllm_config_unfused), set_forward_context(
-            attn_metadata=None, vllm_config=vllm_config_unfused
-    ), global_force_attn_backend_context_manager(backend):
-        model_unfused = model_class(num_qo_heads=num_qo_heads,
-                                    num_kv_heads=num_kv_heads,
-                                    head_size=head_size,
-                                    kv_cache_dtype=FP8_DTYPE,
-                                    device=device,
-                                    vllm_config=vllm_config_unfused)
+    with (
+        set_current_vllm_config(vllm_config_unfused),
+        set_forward_context(attn_metadata=None, vllm_config=vllm_config_unfused),
+        global_force_attn_backend_context_manager(backend),
+    ):
+        model_unfused = model_class(
+            num_qo_heads=num_qo_heads,
+            num_kv_heads=num_kv_heads,
+            head_size=head_size,
+            kv_cache_dtype=FP8_DTYPE,
+            device=device,
+            vllm_config=vllm_config_unfused,
+        )
         model_unfused = model_unfused.to(device)
 
         forward_ctx = get_forward_context()
-        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
-            batch_size, use_hnd=split_attention)
+        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(batch_size)
 
-        # Run model directly without compilation and fusion
-        result_unfused = model_unfused(q, k, v)
+        # Run model directly without fusion
+        # Still compile so query QuantFP8 has closer numerics
+        result_unfused = torch.compile(model_unfused, fullgraph=True)(q, k, v)
 
     # Run model with attn fusion enabled
     vllm_config.compilation_config.pass_config = PassConfig(
-        enable_attn_fusion=True, enable_noop=True)
-    with set_current_vllm_config(vllm_config), set_forward_context(
-            attn_metadata=None, vllm_config=vllm_config
-    ), global_force_attn_backend_context_manager(backend):
-        model_fused = model_class(num_qo_heads=num_qo_heads,
-                                  num_kv_heads=num_kv_heads,
-                                  head_size=head_size,
-                                  kv_cache_dtype=FP8_DTYPE,
-                                  device=device,
-                                  vllm_config=vllm_config,
-                                  w=model_unfused.w)
+        enable_attn_fusion=True, enable_noop=True
+    )
+    with (
+        set_current_vllm_config(vllm_config),
+        set_forward_context(attn_metadata=None, vllm_config=vllm_config),
+        global_force_attn_backend_context_manager(backend),
+    ):
+        model_fused = model_class(
+            num_qo_heads=num_qo_heads,
+            num_kv_heads=num_kv_heads,
+            head_size=head_size,
+            kv_cache_dtype=FP8_DTYPE,
+            device=device,
+            vllm_config=vllm_config,
+            w=model_unfused.w,
+        )
         model_fused = model_fused.to(device)
 
         forward_ctx = get_forward_context()
-        forward_ctx.attn_metadata = model_fused.build_attn_metadata(
-            batch_size, use_hnd=split_attention)
+        forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)
 
         # Create test backend with fusion passes enabled
         noop_pass = NoOpEliminationPass(vllm_config)
@@ -454,14 +396,14 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         test_backend = TestBackend(noop_pass, attn_pass, cleanup_pass)
 
         # Compile model with fusion enabled
-        model_compiled = torch.compile(model_fused,
-                                       backend=test_backend,
-                                       fullgraph=True)
+        model_compiled = torch.compile(
+            model_fused, backend=test_backend, fullgraph=True
+        )
         assert model_compiled.attn._o_scale_float is None
 
         result_fused_1 = model_compiled(q, k, v)
 
-        if backend == _Backend.FLASHINFER:
+        if backend == AttentionBackendEnum.FLASHINFER:
             # With the Flashinfer backend after the 1st round of the forward
             # pass, output quant scale should be loaded into the attn layer's
             # _o_scale_float, the 2nd round should reuse the loaded
@@ -471,49 +413,60 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
 
             assert model_compiled.attn._o_scale_float is not None
 
-            torch.testing.assert_close(result_unfused,
-                                       result_fused_2,
-                                       atol=1e-2,
-                                       rtol=1e-2)
+            torch.testing.assert_close(
+                result_unfused, result_fused_2, atol=1e-2, rtol=1e-2
+            )
 
     # Check attn fusion support
-    quant_key = model_class.quant_key
+    quant_key: QuantKey = model_class.quant_key
     attn_fusion_supported = [
-        layer.impl.fused_output_quant_supported(quant_key) for key, layer in
-        vllm_config.compilation_config.static_forward_context.items()
+        layer.impl.fused_output_quant_supported(quant_key)
+        for key, layer in vllm_config.compilation_config.static_forward_context.items()
     ]
-    if any(attn_fusion_supported):
-        # Check quantization ops in the graph before and after fusion
-        test_backend.check_before_ops([QUANT_OPS[quant_key]],
-                                      fully_replaced=True)
+    assert sum(attn_fusion_supported) == len(attn_fusion_supported), (
+        "All layers should support attention fusion"
+    )
+
+    # Check quantization ops in the graph before and after fusion
+    quant_op = (
+        torch.ops.aten.reciprocal
+        if "-quant_fp8" in custom_ops_list
+        else QUANT_OPS[quant_key]
+    )
+
+    # Note: for fp8, fully_replaced=False because query quant ops remain in graph.
+    # Only output quant ops are fused into attention.
+    test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Quant)
 
     # access the underlying `AttnFusionPass` on the `LazyInitPass`
     assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
 
     # Check attention ops in the graph before and after fusion
     attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))
-    attn_nodes_post = list(find_op_nodes(ATTN_OP,
-                                         test_backend.graph_post_pass))
+    attn_nodes_post = list(find_op_nodes(ATTN_OP, test_backend.graph_post_pass))
 
     assert len(attn_nodes_pre) > 0, "Should have attention nodes before fusion"
-    assert len(attn_nodes_pre) == len(attn_nodes_post), \
+    assert len(attn_nodes_pre) == len(attn_nodes_post), (
         "Should have same number of attention nodes before and after fusion"
-    assert attn_nodes_pre[0].kwargs.get("output_scale") is None, \
+    )
+    assert attn_nodes_pre[0].kwargs.get("output_scale") is None, (
         "Attention should not have output_scale before fusion"
-    assert attn_nodes_post[0].kwargs.get("output_scale") is not None, \
+    )
+    assert attn_nodes_post[0].kwargs.get("output_scale") is not None, (
         "Attention should have output_scale after fusion"
+    )
 
-    assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, \
+    assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, (
         "Attention should not have output_block_scale before fusion"
+    )
     if quant_key.dtype == FP8_DTYPE:
-        assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, \
+        assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, (
             "Attention should not have output_block_scale after FP8 fusion"
+        )
     elif quant_key.dtype == FP4_DTYPE:
-        assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \
-            "Attention should have output_block_scale after FP4 fusion"  # noqa: E501
+        assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, (
+            "Attention should have output_block_scale after FP4 fusion"
+        )
 
     # Check that results are close
-    torch.testing.assert_close(result_unfused,
-                               result_fused_1,
-                               atol=1e-2,
-                               rtol=1e-2)
+    torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)
diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
new file mode 100644
index 0000000000000000000000000000000000000000..f22d60ef000b2286f666da35270b51755f5564cc
--- /dev/null
+++ b/tests/compile/test_fusions_e2e.py
@@ -0,0 +1,465 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import itertools
+import logging
+from collections.abc import Iterable
+from typing import Any, NamedTuple
+
+import pytest
+import regex as re
+
+from tests.v1.attention.utils import AttentionBackendEnum
+from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+from ..utils import flat_product, multi_gpu_test
+
+is_blackwell = lambda: current_platform.is_device_capability(100)
+"""Are we running on Blackwell, a lot of tests depend on it"""
+
+
+class Matches(NamedTuple):
+    attention_fusion: int = 0
+    allreduce_fusion: int = 0
+    sequence_parallel: int = 0
+    async_tp: int = 0
+
+
+class ModelBackendTestCase(NamedTuple):
+    model_name: str
+    model_kwargs: dict[str, Any]
+    backend: AttentionBackendEnum
+    matches: Matches
+
+
+MODELS_FP8: list[ModelBackendTestCase] = []
+MODELS_FP4: list[ModelBackendTestCase] = []
+MODELS: list[ModelBackendTestCase] = []  # tp-only
+
+if current_platform.is_cuda():
+    MODELS_FP8 = [
+        ModelBackendTestCase(
+            # Use smaller model for L40s in CI
+            model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+            # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
+            #  so FI attention+fp8_quant is at least tested once
+            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
+            backend=AttentionBackendEnum.FLASHINFER
+            if is_blackwell()
+            else AttentionBackendEnum.TRITON_ATTN,
+            matches=Matches(
+                attention_fusion=32,
+                allreduce_fusion=65,
+                sequence_parallel=65,
+                async_tp=128,
+            ),
+        ),
+        ModelBackendTestCase(
+            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
+            # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
+            # https://github.com/vllm-project/vllm/issues/28568
+            # TODO FlashInfer attn broken on Blackwell for llama4:
+            # https://github.com/vllm-project/vllm/issues/28604
+            backend=AttentionBackendEnum.TRITON_ATTN,
+            matches=Matches(
+                attention_fusion=48,
+                allreduce_fusion=96,
+                sequence_parallel=96,
+                async_tp=95,  # mlp is moe, no fusion there
+            ),
+        ),
+    ]
+
+    MODELS_FP4 = [
+        ModelBackendTestCase(
+            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
+            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
+            backend=AttentionBackendEnum.FLASHINFER,
+            matches=Matches(
+                attention_fusion=32,
+                allreduce_fusion=65,
+                sequence_parallel=65,
+                async_tp=128,
+            ),
+        ),
+    ]
+
+    # TP only
+    MODELS = [
+        ModelBackendTestCase(
+            model_name="meta-llama/Llama-3.1-8B-Instruct",
+            model_kwargs=dict(max_model_len=1024),
+            backend=AttentionBackendEnum.TRITON_ATTN,
+            matches=Matches(
+                attention_fusion=0,
+                allreduce_fusion=65,
+                sequence_parallel=65,
+                async_tp=128,
+            ),
+        ),
+        ModelBackendTestCase(
+            model_name="Qwen/Qwen3-30B-A3B",
+            model_kwargs=dict(max_model_len=1024),
+            backend=AttentionBackendEnum.TRITON_ATTN,
+            matches=Matches(
+                attention_fusion=0,
+                allreduce_fusion=97,
+                sequence_parallel=97,
+                async_tp=96,  # MLP is MoE, half the fusions of dense
+            ),
+        ),
+    ]
+
+elif current_platform.is_rocm():
+    MODELS_FP8 = [
+        ModelBackendTestCase(
+            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
+            model_kwargs=dict(max_model_len=1024),
+            backend=AttentionBackendEnum.TRITON_ATTN,
+            matches=Matches(attention_fusion=32),
+        ),
+        ModelBackendTestCase(
+            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
+            model_kwargs=dict(max_model_len=1024),
+            backend=AttentionBackendEnum.ROCM_ATTN,
+            matches=Matches(attention_fusion=32),
+        ),
+        ModelBackendTestCase(
+            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
+            model_kwargs=dict(max_model_len=1024),
+            backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
+            matches=Matches(attention_fusion=32),
+        ),
+    ]
+
+CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
+
+
+@pytest.mark.parametrize(
+    "model_name, model_kwargs, backend, matches, custom_ops",
+    # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
+    list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
+    # quant_fp4 only has the custom impl
+    + list(flat_product(MODELS_FP4, [""])),
+)
+@pytest.mark.parametrize("inductor_graph_partition", [True, False])
+def test_attn_quant(
+    model_name: str,
+    model_kwargs: dict[str, Any],
+    backend: AttentionBackendEnum,
+    matches: Matches,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    caplog_mp_spawn,
+    monkeypatch,
+):
+    if backend == AttentionBackendEnum.FLASHINFER and (
+        not is_blackwell() or not has_flashinfer()
+    ):
+        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
+    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition requires torch>=2.9")
+
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
+
+    if inductor_graph_partition:
+        mode = CUDAGraphMode.FULL_AND_PIECEWISE
+        splitting_ops: list[str] | None = None
+    else:
+        # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at
+        # CUDAGraphMode.NONE here because it derives an attention backend that
+        # does not support full cudagraphs
+        mode = CUDAGraphMode.FULL_DECODE_ONLY
+        splitting_ops = []
+
+    # Disable, compile cache to make sure custom passes run.
+    # Otherwise, we can't verify fusion happened through the logs.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    # To capture subprocess logs, we need to know whether spawn or fork is used.
+    # Force spawn as it is more general.
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    compilation_config = CompilationConfig(
+        # Testing properties
+        custom_ops=custom_ops_list,
+        use_inductor_graph_partition=inductor_graph_partition,
+        cudagraph_mode=mode,
+        splitting_ops=splitting_ops,
+        # Common
+        mode=CompilationMode.VLLM_COMPILE,
+        pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
+        # Inductor caches custom passes by default as well via uuid
+        inductor_compile_config={"force_disable_caches": True},
+    )
+
+    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+        run_model(compilation_config, model_name, **model_kwargs)
+
+    log_matches = re.findall(
+        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
+        log_holder.text,
+    )
+    assert len(log_matches) == 1, log_holder.text
+    assert int(log_matches[0]) == matches.attention_fusion
+
+
+CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
+
+
+def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
+    for op_list in itertools.product(*custom_ops_lists):
+        yield ",".join(op_list)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, model_kwargs, backend, matches, custom_ops",
+    # Toggle RMSNorm and QuantFP8 for FP8 models
+    list(
+        flat_product(
+            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
+        )
+    )
+    # Toggle RMSNorm for FP4 models and unquant models
+    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
+)
+@pytest.mark.parametrize("inductor_graph_partition", [True, False])
+@pytest.mark.skipif(
+    not current_platform.is_cuda()
+    or not has_flashinfer()
+    or not current_platform.has_device_capability(90),
+    reason="allreduce+rmsnorm fusion requires flashinfer",
+)
+def test_tp2_attn_quant_allreduce_rmsnorm(
+    model_name: str,
+    model_kwargs: dict,
+    backend: AttentionBackendEnum,
+    matches: Matches,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    caplog_mp_spawn,
+    monkeypatch,
+):
+    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition requires torch>=2.9")
+
+    if "fp4" in model_name.lower() and not is_blackwell():
+        pytest.skip("NVFP4 quant requires Blackwell")
+
+    if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
+        # FlashInfer attn fusion requires Blackwell
+        matches = matches._replace(attention_fusion=0)
+
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
+
+    if inductor_graph_partition:
+        mode = CUDAGraphMode.FULL_AND_PIECEWISE
+        splitting_ops: list[str] | None = None
+    else:
+        mode = CUDAGraphMode.FULL_DECODE_ONLY
+        splitting_ops = []
+
+    # Disable, compile cache to make sure custom passes run.
+    # Otherwise, we can't verify fusion happened through the logs.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    # To capture subprocess logs, we need to know whether spawn or fork is used.
+    # Force spawn as it is more general.
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    compilation_config = CompilationConfig(
+        # Testing properties
+        use_inductor_graph_partition=inductor_graph_partition,
+        cudagraph_mode=mode,
+        custom_ops=custom_ops_list,
+        splitting_ops=splitting_ops,
+        # Common
+        mode=CompilationMode.VLLM_COMPILE,
+        pass_config=PassConfig(
+            enable_attn_fusion=True,
+            enable_noop=True,
+            enable_fi_allreduce_fusion=True,
+        ),
+        # Inductor caches custom passes by default as well via uuid
+        inductor_compile_config={"force_disable_caches": True},
+    )
+
+    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+        run_model(
+            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
+        )
+    log_matches = re.findall(
+        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2, log_holder.text
+
+    assert int(log_matches[0]) == matches.attention_fusion
+    assert int(log_matches[1]) == matches.attention_fusion
+
+    log_matches = re.findall(
+        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2, log_holder.text
+
+    assert int(log_matches[0]) == matches.allreduce_fusion
+    assert int(log_matches[1]) == matches.allreduce_fusion
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, model_kwargs, backend, matches, custom_ops",
+    # Toggle RMSNorm and QuantFP8 for FP8 models
+    list(
+        flat_product(
+            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
+        )
+    )
+    # Toggle RMSNorm for FP4 models and unquant models
+    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
+)
+@pytest.mark.parametrize("inductor_graph_partition", [True, False])
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason="sequence parallel only tested on CUDA",
+)
+def test_tp2_attn_quant_async_tp(
+    model_name: str,
+    model_kwargs: dict,
+    backend: AttentionBackendEnum,
+    matches: Matches,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    caplog_mp_spawn,
+    monkeypatch,
+):
+    if is_blackwell():
+        # TODO: https://github.com/vllm-project/vllm/issues/27893
+        pytest.skip("Blackwell is not supported for AsyncTP pass")
+
+    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition requires torch>=2.9")
+
+    if "fp4" in model_name.lower() and not is_blackwell():
+        pytest.skip("NVFP4 quant requires Blackwell")
+
+    if backend == AttentionBackendEnum.FLASHINFER:
+        if not has_flashinfer():
+            pytest.skip("FlashInfer backend requires flashinfer installed")
+        if not is_blackwell():
+            # FlashInfer attn fusion requires Blackwell
+            matches = matches._replace(attention_fusion=0)
+
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
+
+    if inductor_graph_partition:
+        mode = CUDAGraphMode.FULL_AND_PIECEWISE
+        splitting_ops: list[str] | None = None
+    else:
+        mode = CUDAGraphMode.FULL_DECODE_ONLY
+        splitting_ops = []
+
+    # Disable, compile cache to make sure custom passes run.
+    # Otherwise, we can't verify fusion happened through the logs.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    # To capture subprocess logs, we need to know whether spawn or fork is used.
+    # Force spawn as it is more general.
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    compilation_config = CompilationConfig(
+        # Testing properties
+        use_inductor_graph_partition=inductor_graph_partition,
+        cudagraph_mode=mode,
+        custom_ops=custom_ops_list,
+        splitting_ops=splitting_ops,
+        # Common
+        level=CompilationMode.VLLM_COMPILE,
+        pass_config=PassConfig(
+            enable_attn_fusion=True,
+            enable_noop=True,
+            enable_sequence_parallelism=True,
+            enable_async_tp=True,
+        ),
+        # Inductor caches custom passes by default as well via uuid
+        inductor_compile_config={"force_disable_caches": True},
+    )
+
+    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+        run_model(
+            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
+        )
+    log_matches = re.findall(
+        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2, log_holder.text
+
+    assert int(log_matches[0]) == matches.attention_fusion
+    assert int(log_matches[1]) == matches.attention_fusion
+
+    log_matches = re.findall(
+        r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2, log_holder.text
+
+    assert int(log_matches[0]) == matches.sequence_parallel
+    assert int(log_matches[1]) == matches.sequence_parallel
+
+    log_matches = re.findall(
+        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2, log_holder.text
+
+    assert int(log_matches[0]) == matches.async_tp
+    assert int(log_matches[1]) == matches.async_tp
+
+
+def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
+    compilation_config = (
+        compile_config
+        if isinstance(compile_config, CompilationConfig)
+        else CompilationConfig(mode=compile_config)
+    )
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0)
+    # Allow override from model_kwargs
+    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
+    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
+
+    # No cudagraphs by default
+    if compilation_config.cudagraph_mode is None:
+        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+
+    llm = LLM(
+        model=model,
+        compilation_config=compilation_config,
+        **model_kwargs,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/compile/test_graph_partition.py b/tests/compile/test_graph_partition.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cd783843a6263330286109c997a16bddc3abf14
--- /dev/null
+++ b/tests/compile/test_graph_partition.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import operator
+
+import pytest
+import torch
+from torch.fx.experimental.proxy_tensor import make_fx
+
+from vllm.compilation.backends import split_graph
+
+
+def test_getitem_moved_to_producer_subgraph():
+    """
+    Test that getitem operations are moved to the same subgraph as their input,
+    preventing tuple inputs to submodules.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        # torch.split returns a tuple, creating real getitem operations
+        # Should become first submodule that produces tuple
+        chunks = torch.split(x, x.shape[0] // 2, dim=0)
+
+        # Following ops should become second submodule that consumes tuple
+        result_0 = torch.relu(chunks[0])
+        result_1 = torch.relu(chunks[1])
+        return torch.cat([result_0, result_1], dim=0)
+
+    x = torch.randn(4, 3)
+    gm = make_fx(model_fn)(x)
+
+    has_getitem = any(
+        node.op == "call_function" and node.target == operator.getitem
+        for node in gm.graph.nodes
+    )
+    assert has_getitem, "Test setup failed: graph should contain getitem operations"
+
+    # Split on tuple producer aten::split
+    split_ops = ["aten::split.Tensor"]
+    split_gm, split_items = split_graph(gm, split_ops)
+    assert len(split_items) == 2, "Graph should be split into 2 submodules"
+
+    for split_item in split_items:
+        submodule = split_item.graph
+
+        getitem_on_placeholder = []
+        for node in submodule.graph.nodes:
+            if (
+                node.op == "call_function"
+                and node.target == operator.getitem
+                and node.args[0].op == "placeholder"
+            ):
+                getitem_on_placeholder.append(node)
+
+        assert len(getitem_on_placeholder) == 0, (
+            f"Submodule {split_item.submod_name} has getitem operations on "
+            f"placeholder nodes: {[n.name for n in getitem_on_placeholder]}. "
+            "This means tuple inputs were not properly eliminated."
+        )
+
+    new_x = torch.randn(4, 3)
+    output_original = gm(new_x)
+    output_split = split_gm(new_x)
+
+    assert torch.allclose(output_original, output_split), "Output mismatch"
+
+
+def test_no_tuple_inputs_with_multiple_consumers():
+    """
+    Test that when a tuple is consumed by multiple split operations,
+    getitem operations are properly moved to avoid tuple inputs.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        # torch.split returns a tuple, creating real getitem operations
+        # Should become first submodule that produces tuple
+        chunks = torch.split(x, x.shape[0] // 2, dim=0)
+
+        # These should become second submodule consuming tuple
+        result_1 = torch.relu(chunks[0])
+        result_2 = torch.relu(chunks[1])
+
+        # Artificial graph splitting point to create another
+        # independent submodule that consumes tuple later
+        # This would become the third submodule
+        result_1 = torch.sigmoid(result_1)
+
+        # Fourth submodule that consumes tuple
+        result = torch.cat([chunks[0], chunks[1], result_1, result_2])
+        return result
+
+    x = torch.randn(4, 3)
+    gm = make_fx(model_fn)(x)
+
+    has_getitem = any(
+        node.op == "call_function" and node.target == operator.getitem
+        for node in gm.graph.nodes
+    )
+    assert has_getitem, "Test setup failed: graph should contain getitem operations"
+
+    split_ops = ["aten::split.Tensor", "aten::sigmoid"]
+    split_gm, split_items = split_graph(gm, split_ops)
+    assert len(split_items) == 4, "Graph should be split into 4 submodules"
+
+    for split_item in split_items:
+        submodule = split_item.graph
+
+        for node in submodule.graph.nodes:
+            if (
+                node.op == "call_function"
+                and node.target == operator.getitem
+                and node.args[0].op == "placeholder"
+            ):
+                pytest.fail(
+                    f"Submodule {split_item.submod_name} has getitem on "
+                    f"placeholder {node.args[0].name}, indicating it receives "
+                    "a tuple input"
+                )
+
+    new_x = torch.randn(4, 3)
+    output_original = gm(new_x)
+    output_split = split_gm(new_x)
+
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/test_multimodal_compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..621f6a51a918fc4e56fd7a8c024c8147ea23db15
--- /dev/null
+++ b/tests/compile/test_multimodal_compile.py
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.compilation.counter import compilation_counter
+from vllm.config import VllmConfig
+from vllm.config.compilation import CompilationMode
+from vllm.platforms import current_platform
+
+
+def test_compile():
+    vllm_config = VllmConfig()
+    # Default configuration does not compile mm encoder
+    assert not vllm_config.compilation_config.compile_mm_encoder
+
+
+# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
+@pytest.mark.forked
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
+def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
+    """Test that Qwen2.5-VL vision submodules are compiled.
+
+    This test verifies that the 3 vision submodules (Qwen2_5_VisionPatchEmbed,
+    Qwen2_5_VisionBlock, and Qwen2_5_VisionPatchMerger) are properly tagged
+    for compilation by checking that num_models_seen increases by at least 3.
+    """
+    # Disable multiprocessing so that the counter is in the same process
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    with (
+        # NOTE: Qwen2.5-VL has 35 models in total - the LLM backend
+        # Vision Patch Embed, Vision Patch Merger, and then 32 Vision Blocks
+        # (one for each layer) - in the future, we should fix vLLM compilation
+        # logic to handle this case and only compile the Vision submodules once
+        # and reuse the compiled code for all layers
+        # See https://github.com/vllm-project/vllm/issues/27590
+        compilation_counter.expect(num_models_seen=35),
+        vllm_runner(
+            "Qwen/Qwen2.5-VL-3B-Instruct",
+            max_model_len=2048,
+            gpu_memory_utilization=0.8,
+            compilation_config={
+                "mode": CompilationMode.VLLM_COMPILE,
+                "compile_mm_encoder": True,
+            },
+        ) as _,
+    ):
+        pass
+
+
+# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
+@pytest.mark.forked
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
+def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
+    """Test that Qwen2.5-VL vision submodules are not compiled when the
+    config is passed off
+    """
+    # Disable multiprocessing so that the counter is in the same process
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    with (
+        compilation_counter.expect(num_models_seen=1),
+        vllm_runner(
+            "Qwen/Qwen2.5-VL-3B-Instruct",
+            max_model_len=2048,
+            gpu_memory_utilization=0.8,
+            compilation_config={
+                "mode": CompilationMode.VLLM_COMPILE,
+                "compile_mm_encoder": False,
+            },
+        ) as _,
+    ):
+        pass
diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py
index 242d531312675e4f716f699a56d47f92a91bc6f1..0ccc1a01616298dba833e97dff2b83d39eb17e39 100644
--- a/tests/compile/test_noop_elimination.py
+++ b/tests/compile/test_noop_elimination.py
@@ -6,24 +6,29 @@ import torch
 
 import vllm
 from vllm.compilation.noop_elimination import NoOpEliminationPass
-from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
-                         VllmConfig)
+from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
 
 from .backend import TestBackend
 
 
-@pytest.mark.parametrize("dtype",
-                         [torch.float16, torch.bfloat16, torch.float32])
-@pytest.mark.parametrize("num_tokens", [256, 1024])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+# Important edge case is when `num_tokens == buffer_size`
+@pytest.mark.parametrize(
+    ("num_tokens", "buffer_size"), [(256, 256), (256, 512), (1024, 1024), (1024, 1025)]
+)
 @pytest.mark.parametrize("hidden_size", [64, 4096])
-def test_noop_elimination(dtype, num_tokens, hidden_size):
+def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
     torch.manual_seed(1)
 
     class Model(torch.nn.Module):
+        def __init__(self) -> None:
+            super().__init__()
+            self.pos_embed = torch.empty(buffer_size, hidden_size, dtype=dtype)
 
         def forward(self, x):
+            x += self.pos_embed[: x.shape[0]]
             # Chain of reshapes
             y = x.reshape(-1, 128, 32)
             z = y.reshape(-1, 4096)
@@ -32,7 +37,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size):
             # Final reshape that should remain
             b = a.reshape(-1, 128, 32)
             # No-op slice
-            c = b[0:b.shape[0]]
+            c = b[0 : b.shape[0]]
             # The pass should replace the result of this op with `c`.
             d = torch.slice_scatter(
                 torch.ones_like(c),  # Dummy tensor to be scattered into
@@ -43,10 +48,12 @@ def test_noop_elimination(dtype, num_tokens, hidden_size):
             )
             return d
 
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        pass_config=PassConfig(enable_noop=True),
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            pass_config=PassConfig(enable_noop=True),
+        )
+    )
     with vllm.config.set_current_vllm_config(vllm_config):
         noop_pass = NoOpEliminationPass(vllm_config)
 
@@ -66,9 +73,10 @@ def test_noop_elimination(dtype, num_tokens, hidden_size):
         torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
 
         # The no-op reshape and slice should be eliminated.
+        # The initial slice on the positional embedding should remain.
         # The chain of reshapes should be fused into a single reshape.
         assert backend.op_count(torch.ops.aten.reshape.default) == 1
-        assert backend.op_count(torch.ops.aten.slice.Tensor) == 0
+        assert backend.op_count(torch.ops.aten.slice.Tensor) == 1
         assert backend.op_count(torch.ops.aten.slice_scatter.default) == 0
 
 
@@ -82,17 +90,18 @@ def test_non_noop_slice_preserved():
     x = torch.randn(16, 16)
 
     class SliceModel(torch.nn.Module):
-
         def forward(self, x):
             base = x.clone()
             src = torch.ones(15, 16)
             y = torch.slice_scatter(base, src, dim=0, start=0, end=-1)
             return x[0:-1, :], y
 
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        pass_config=PassConfig(enable_noop=True),
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            pass_config=PassConfig(enable_noop=True),
+        )
+    )
     with vllm.config.set_current_vllm_config(vllm_config):
         noop_pass = NoOpEliminationPass(vllm_config)
         backend = TestBackend(noop_pass)
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index 251cc46e9e9899bb0c28588a56f99998cba6a503..1c40c599f748760adc1ea85d1f80841bedc9ab4d 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 
 
 # dummy custom pass that doesn't inherit
@@ -28,7 +28,6 @@ def test_bad_callable():
 
 # Pass that inherits from InductorPass
 class ProperPass(InductorPass):
-
     def __call__(self, graph: torch.fx.graph.Graph) -> None:
         pass
 
@@ -39,12 +38,12 @@ class ProperPass(InductorPass):
         ProperPass(),
         # Can also wrap callables in CallableInductorPass for compliance
         CallableInductorPass(simple_callable),
-        CallableInductorPass(simple_callable,
-                             InductorPass.hash_source(__file__))
+        CallableInductorPass(simple_callable, InductorPass.hash_source(__file__)),
     ],
 )
 def test_pass_manager_uuid(callable):
-    config = VllmConfig()
+    # Some passes need dtype to be set
+    config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
 
     pass_manager = PostGradPassManager()
     pass_manager.configure(config)
@@ -65,8 +64,9 @@ def test_pass_manager_uuid(callable):
 
     # UUID should be different due to config change
     config2 = copy.deepcopy(config)
-    config2.compilation_config.pass_config.enable_fusion = not \
-        config2.compilation_config.pass_config.enable_fusion
+    config2.compilation_config.pass_config.enable_fusion = (
+        not config2.compilation_config.pass_config.enable_fusion
+    )
     pass_manager3 = PostGradPassManager()
     pass_manager3.configure(config2)
     pass_manager3.add(callable)
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..511e50f5fdc242dc5e874976427f7f6f05a40ad0
--- /dev/null
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -0,0 +1,195 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.compile.backend import TestBackend
+from vllm.attention import Attention, AttentionType
+from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
+from vllm.compilation.noop_elimination import NoOpEliminationPass
+from vllm.compilation.post_cleanup import PostCleanupPass
+from vllm.compilation.qk_norm_rope_fusion import (
+    FUSED_QK_ROPE_OP,
+    QKNormRoPEFusionPass,
+)
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.platforms import current_platform
+
+RSQRT_OP = torch.ops.aten.rsqrt.default
+INDEX_SELECT_OP = torch.ops.aten.index.Tensor
+
+
+class QKNormRoPETestModel(torch.nn.Module):
+    def __init__(
+        self,
+        *,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        eps: float,
+        is_neox: bool,
+        vllm_config: VllmConfig,
+        dtype: torch.dtype,
+        prefix: str = "model.layers.0.self_attn.attn",
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = head_dim
+        self.q_size = num_heads * head_dim
+        self.kv_size = num_kv_heads * head_dim
+        self.rotary_dim = head_dim
+        self.eps = eps
+        self.dtype = dtype
+
+        # Register layer metadata for the fusion pass via Attention.
+        self.attn = Attention(
+            num_heads=self.num_heads,
+            head_size=self.head_dim,
+            scale=1.0 / self.head_dim**0.5,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=vllm_config.cache_config,
+            prefix=prefix,
+            attn_type=AttentionType.DECODER,
+        )
+
+        self.q_norm = RMSNorm(self.head_dim, eps=self.eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=self.eps)
+        self.rotary_emb = RotaryEmbedding(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position_embeddings=4096,
+            base=10000,
+            is_neox_style=is_neox,
+            dtype=self.dtype,
+        )
+        self.enable_rms_norm_custom_op = self.q_norm.enabled()
+        self.enable_rope_custom_op = self.rotary_emb.enabled()
+
+    def forward(self, qkv: torch.Tensor, positions: torch.Tensor):
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
+        q_by_head = self.q_norm(q_by_head)
+        q = q_by_head.view(q.shape)
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
+        k_by_head = self.k_norm(k_by_head)
+        k = k_by_head.view(k.shape)
+        q, k = self.rotary_emb(positions, q, k)
+        return q, k, v
+
+    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
+        ops = []
+        if self.enable_rms_norm_custom_op:
+            ops.append(RMS_OP)
+        else:
+            ops.append(RSQRT_OP)
+
+        if self.enable_rope_custom_op:
+            if self.rotary_emb.use_flashinfer:
+                ops.append(FLASHINFER_ROTARY_OP)
+            else:
+                ops.append(ROTARY_OP)
+        else:
+            ops.append(INDEX_SELECT_OP)
+        return ops
+
+    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
+        return [FUSED_QK_ROPE_OP]
+
+
+@pytest.mark.parametrize("eps", [1e-5, 1e-6])
+@pytest.mark.parametrize("is_neox", [True, False])
+@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
+@pytest.mark.parametrize("enable_rope_custom_op", [True])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Only test on cuda and rocm platform",
+)
+def test_qk_norm_rope_fusion(
+    eps, is_neox, enable_rms_norm_custom_op, enable_rope_custom_op, dtype
+):
+    if not hasattr(torch.ops._C, "fused_qk_norm_rope"):
+        pytest.skip("fused_qk_norm_rope custom op not available")
+
+    torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
+    torch.manual_seed(0)
+
+    custom_ops: list[str] = []
+    if enable_rms_norm_custom_op:
+        custom_ops.append("+rms_norm")
+    if enable_rope_custom_op:
+        custom_ops.append("+rotary_embedding")
+
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=dtype),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=custom_ops,
+            pass_config=PassConfig(
+                enable_qk_norm_rope_fusion=True,
+                enable_noop=True,
+            ),
+        ),
+    )
+
+    num_heads, num_kv_heads, head_dim = 16, 4, 128
+    T = 5
+
+    with set_current_vllm_config(vllm_config):
+        model = QKNormRoPETestModel(
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            eps=eps,
+            is_neox=is_neox,
+            vllm_config=vllm_config,
+            dtype=dtype,
+        )
+
+        noop_pass = NoOpEliminationPass(vllm_config)
+        fusion_pass = QKNormRoPEFusionPass(vllm_config)
+        cleanup_pass = PostCleanupPass(vllm_config)
+
+        backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
+        backend_baseline = TestBackend(noop_pass, cleanup_pass)
+
+        qkv = torch.randn(T, model.q_size + 2 * model.kv_size)
+        pos = torch.arange(T, dtype=torch.long, device=qkv.device)
+        qkv_unfused = qkv.clone()
+        pos_unfused = pos.clone()
+
+        torch._dynamo.mark_dynamic(qkv, 0)
+        torch._dynamo.mark_dynamic(pos, 0)
+        model_fused = torch.compile(model, backend=backend)
+        q_fused, k_fused, v_fused = model_fused(qkv, pos)
+
+        torch._dynamo.mark_dynamic(qkv_unfused, 0)
+        torch._dynamo.mark_dynamic(pos_unfused, 0)
+        model_unfused = torch.compile(model, backend=backend_baseline)
+        q_unfused, k_unfused, v_unfused = model_unfused(qkv_unfused, pos_unfused)
+
+        if dtype == torch.float16:
+            ATOL, RTOL = (2e-3, 2e-3)
+        else:
+            ATOL, RTOL = (1e-2, 1e-2)
+
+        torch.testing.assert_close(q_unfused, q_fused, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(k_unfused, k_fused, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(v_unfused, v_fused, atol=ATOL, rtol=RTOL)
+
+        assert fusion_pass.matched_count == 1
+
+        backend.check_before_ops(model.ops_in_model_before())
+        backend.check_after_ops(model.ops_in_model_after())
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
index b2734e915bbbf9a9c7cbdb0bed157486757ac36f..9cd7f64b04af5fbef64e3482e48e147bfec20b78 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -5,23 +5,32 @@ import pytest
 import torch
 
 import vllm.envs as envs
-from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.compilation.fusion import RMSNormQuantFusionPass
-from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
+from vllm.compilation.fx_utils import find_auto_fn
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.compilation.sequence_parallelism import SequenceParallelismPass
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
-from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
-                         PassConfig, VllmConfig)
+from vllm.config import (
+    CompilationConfig,
+    CUDAGraphMode,
+    DeviceConfig,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+    get_current_vllm_config,
+    set_current_vllm_config,
+)
 from vllm.distributed import tensor_model_parallel_all_reduce
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 from ..utils import multi_gpu_test
 from .backend import TestBackend
@@ -35,190 +44,194 @@ prompts = [
 ]
 
 
-class TestModel(torch.nn.Module):
-
-    def __init__(self,
-                 hidden_size=16,
-                 intermediate_size=32,
-                 vllm_config: VllmConfig = None):
+class TestAllReduceRMSNormModel(torch.nn.Module):
+    def __init__(self, hidden_size=16, eps=1e-6):
         super().__init__()
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.gate_proj = torch.nn.Parameter(
-            torch.empty((intermediate_size, hidden_size)))
-        self.norm = RMSNorm(intermediate_size, 1e-05)
-        # Initialize weights
-        torch.nn.init.normal_(self.gate_proj, std=0.02)
-
-    def forward(self, hidden_states, residual):
-        """
-        Forward pass implementing the operations in the FX graph
-        
-        Args:
-            hidden_states: Input tensor
-            residual: Residual tensor from previous layer
-            
-        Returns:
-            Tuple containing the output tensor
-        """
-        # Reshape input
-        view = hidden_states.reshape(-1, self.hidden_size)
-
-        #matrix multiplication
-        permute = self.gate_proj.permute(1, 0)
-        mm = torch.mm(view, permute)
-
-        # Tensor parallel all-reduce
-        all_reduce = tensor_model_parallel_all_reduce(mm)
-
-        # layer normalization
-        norm_output, residual_output = self.norm(all_reduce, residual)
-
-        return norm_output, residual_output
+        self.eps = eps
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+        self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
+
+    def forward(self, x):
+        z = torch.relu(x)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
+
+        z2 = torch.mm(y, self.w[0])
+        x2 = tensor_model_parallel_all_reduce(z2)
+
+        y2, resid = self.norm[1](x2, resid)
+
+        z3 = torch.mm(y2, self.w[1])
+        x3 = tensor_model_parallel_all_reduce(z3)
+
+        y3, resid = self.norm[2](x3, resid)
+
+        z4 = torch.mm(y3, self.w[2])
+        x4 = tensor_model_parallel_all_reduce(z4)
+
+        y4, resid = self.norm[3](x4, resid)
+        return y4
 
     def ops_in_model_before(self):
         return [torch.ops.vllm.all_reduce.default]
 
     def ops_in_model_after(self):
         return [
+            torch.ops.vllm.all_gather.default,
             torch.ops.vllm.reduce_scatter.default,
-            torch.ops.vllm.all_gather.default
         ]
 
     def ops_in_model(self):
-        return [torch.ops._C.fused_add_rms_norm.default]
-
+        if RMSNorm.enabled():
+            return [
+                torch.ops._C.rms_norm.default,
+                torch.ops._C.fused_add_rms_norm.default,
+            ]
+        else:
+            return []
 
-class TestQuantModel(torch.nn.Module):
 
-    def __init__(self,
-                 hidden_size=16,
-                 intermediate_size=32,
-                 vllm_config: VllmConfig = None):
+class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
+    def __init__(self, hidden_size=16, eps=1e-6):
         super().__init__()
+        self.vllm_config = get_current_vllm_config()
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.vllm_config = vllm_config
-        self.gate_proj = torch.nn.Parameter(torch.empty(
-            (intermediate_size, hidden_size)),
-                                            requires_grad=False)
-        self.norm = RMSNorm(intermediate_size, 1e-05)
-        # Initialize weights
-        torch.nn.init.normal_(self.gate_proj, std=0.02)
-
-        self.fp8_linear = Fp8LinearOp(act_quant_static=True)
-
-        self.scale = torch.rand(1, dtype=torch.float32)
-        # Create a weight that is compatible with torch._scaled_mm,
-        # which expects a column-major layout.
-        self.w = torch.rand(hidden_size,
-                            intermediate_size).to(dtype=FP8_DTYPE).t()
-        self.wscale = torch.rand(1, dtype=torch.float32)
-
-    def forward(self, hidden_states, residual):
-        """
-        Forward pass implementing the operations in the FX graph
-        
-        Args:
-            hidden_states: Input tensor
-            residual: Residual tensor from previous layer
-            
-        Returns:
-            Tuple containing the output tensor
-        """
-        # Reshape input
-        view = hidden_states.reshape(-1, self.hidden_size)
-
-        #matrix multiplication
-        permute = self.gate_proj.permute(1, 0)
-        mm = torch.mm(view, permute)
-
-        # Tensor parallel all-reduce
-        all_reduce = tensor_model_parallel_all_reduce(mm)
-
-        # layer normalization
-        norm_output, residual_output = self.norm(all_reduce, residual)
-
-        # scaled_mm with static input quantization
-        fp8_linear_result = self.fp8_linear.apply(norm_output,
-                                                  self.w,
-                                                  self.wscale,
-                                                  input_scale=self.scale.to(
-                                                      norm_output.device))
-
-        return fp8_linear_result, residual_output
+        self.eps = eps
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+        self.w = [
+            torch.rand(hidden_size, hidden_size)
+            .to(dtype=current_platform.fp8_dtype())
+            .t()
+            for _ in range(3)
+        ]
 
-    def ops_in_model_before(self):
-        ops_to_remove = [torch.ops.vllm.all_reduce.default
-                         ]  # Always removed by SP
-        # The following are only removed if fusion happens
-        if self.vllm_config and self.vllm_config.compilation_config \
-            .pass_config.enable_fusion:
-            ops_to_remove.extend([
-                torch.ops._C.fused_add_rms_norm.default,
-                torch.ops._C.static_scaled_fp8_quant.default,
-            ])
-        return ops_to_remove
+        self.fp8_linear = Fp8LinearOp(
+            act_quant_static=True,
+            act_quant_group_shape=GroupShape.PER_TENSOR,
+        )
+
+        self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+
+    def forward(self, hidden_states):
+        # avoid having graph input be an arg to a pattern directly
+        z = torch.relu(hidden_states)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
+
+        z2 = self.fp8_linear.apply(
+            y, self.w[0], self.wscale[0], input_scale=self.scale[0]
+        )
+
+        x2 = tensor_model_parallel_all_reduce(z2)
+        y2, resid = self.norm[1](x2, resid)
+
+        z3 = self.fp8_linear.apply(
+            y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
+        )
+
+        x3 = tensor_model_parallel_all_reduce(z3)
+        y3, resid = self.norm[2](x3, resid)  # use resid here
+
+        z4 = self.fp8_linear.apply(
+            y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
+        )
+        x4 = tensor_model_parallel_all_reduce(z4)
+        y4, resid = self.norm[3](x4, resid)  # use resid here
+        return y4
 
     def ops_in_model_after(self):
-        ops_to_add = [
+        return [
+            torch.ops.vllm.all_gather.default,
             torch.ops.vllm.reduce_scatter.default,
-            torch.ops.vllm.all_gather.default
         ]
-        # The following is only added if fusion happens
-        if self.vllm_config and self.vllm_config.compilation_config \
-            .pass_config.enable_fusion:
-            ops_to_add.append(
-                torch.ops._C.fused_add_rms_norm_static_fp8_quant.default)
-        return ops_to_add
+
+    def ops_in_model_before(self):
+        return [
+            torch.ops.vllm.all_reduce.default,
+        ]
 
     def ops_in_model(self):
-        if self.vllm_config and self.vllm_config.compilation_config \
-            .pass_config.enable_fusion:
-            # If fusion happens, the fused op is the one
-            # we check for (de)functionalization
-            return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default
-                    ]  # noqa: E501
-        else:
-            # If no fusion, the original ops are checked
+        if self.vllm_config.compilation_config.pass_config.enable_fusion:
+            return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
+        elif RMSNorm.enabled():
             return [
                 torch.ops._C.fused_add_rms_norm.default,
-                # TODO  functionalization pass does not handle this yet
-                # torch.ops._C.static_scaled_fp8_quant.default,
             ]
+        elif self.fp8_linear.quant_fp8.enabled():
+            return [
+                torch.ops._C.static_scaled_fp8_quant.default,
+            ]
+        else:
+            return []
 
 
 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel])
+@pytest.mark.parametrize(
+    "test_model_cls, custom_ops",
+    [
+        (TestAllReduceRMSNormModel, "+rms_norm"),
+        (TestAllReduceRMSNormModel, "-rms_norm"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"),
+    ],
+)
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("enable_fusion", [True, False])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
-def test_sequence_parallelism_pass(test_model_cls: type[torch.nn.Module],
-                                   batch_size: int, seq_len: int,
-                                   hidden_size: int, dtype: torch.dtype,
-                                   enable_fusion: bool):
+@pytest.mark.parametrize("dynamic", [False, True])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
+def test_sequence_parallelism_pass(
+    test_model_cls: type[torch.nn.Module],
+    custom_ops: str,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    enable_fusion: bool,
+    dynamic: bool,
+):
     num_processes = 2
 
     def run_torch_spawn(fn, nprocs):
         # need to use torch.mp.spawn otherwise will have problems with
         # torch.distributed and cuda
-        torch.multiprocessing.spawn(fn,
-                                    args=(num_processes, test_model_cls,
-                                          batch_size, seq_len, hidden_size,
-                                          dtype, enable_fusion),
-                                    nprocs=nprocs)
+        torch.multiprocessing.spawn(
+            fn,
+            args=(
+                num_processes,
+                test_model_cls,
+                custom_ops,
+                batch_size,
+                seq_len,
+                hidden_size,
+                dtype,
+                enable_fusion,
+                dynamic,
+            ),
+            nprocs=nprocs,
+        )
 
     run_torch_spawn(sequence_parallelism_pass_on_test_model, num_processes)
 
 
 def sequence_parallelism_pass_on_test_model(
-        local_rank: int, world_size: int,
-        test_model_cls: type[torch.nn.Module], batch_size: int, seq_len: int,
-        hidden_size: int, dtype: torch.dtype, enable_fusion: bool):
+    local_rank: int,
+    world_size: int,
+    test_model_cls: type[torch.nn.Module],
+    custom_ops: str,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    enable_fusion: bool,
+    dynamic: bool,
+):
     current_platform.seed_everything(0)
 
     device = torch.device(f"cuda:{local_rank}")
@@ -226,84 +239,93 @@ def sequence_parallelism_pass_on_test_model(
     torch.set_default_device(device)
     torch.set_default_dtype(dtype)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )
 
     # initialize distributed
     init_distributed_environment()
     initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # configure vllm config for SequenceParallelismPass
-    vllm_config = VllmConfig()
-    vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig(
-        enable_sequence_parallelism=True,
-        enable_fusion=enable_fusion,
-        enable_noop=True))  # NoOp needed for fusion
-    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
+    compilation_config = CompilationConfig(
+        splitting_ops=[],  # avoid automatic rms_norm enablement
+        cudagraph_mode=CUDAGraphMode.NONE,  # avoid piecewise warnings
+        custom_ops=custom_ops_list,
+        pass_config=PassConfig(
+            enable_sequence_parallelism=True,
+            enable_fusion=enable_fusion,
+            enable_noop=True,
+        ),
+    )  # NoOp needed for fusion
+    device_config = DeviceConfig(device=torch.device("cuda"))
 
     # this is a fake model name to construct the model config
     # in the vllm_config, it's not really used.
-    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
-    vllm_config.model_config = ModelConfig(model=model_name,
-                                           trust_remote_code=True,
-                                           dtype=dtype,
-                                           seed=42)
-
-    noop_pass = NoOpEliminationPass(vllm_config)
-    sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
-    func_pass = FixFunctionalizationPass(vllm_config)
-    cleanup_pass = PostCleanupPass(vllm_config)
-
-    passes_for_backend: list[VllmInductorPass] = \
-        [noop_pass, sequence_parallelism_pass]
+    model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+    model_config = ModelConfig(
+        model=model_name, trust_remote_code=True, dtype=dtype, seed=42
+    )
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        device_config=device_config,
+        compilation_config=compilation_config,
+    )
+
+    with set_current_vllm_config(vllm_config):
+        noop_pass = NoOpEliminationPass(vllm_config)
+        sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
+        cleanup_pass = PostCleanupPass(vllm_config)
+        assert (
+            sequence_parallelism_pass.compilation_config.splitting_ops
+            == vllm_config.compilation_config.splitting_ops
+        )
+        assert (
+            sequence_parallelism_pass.compilation_config.use_inductor_graph_partition
+            == vllm_config.compilation_config.use_inductor_graph_partition
+        )
+        passes_for_backend: list[VllmInductorPass] = [
+            noop_pass,
+            sequence_parallelism_pass,
+        ]
 
-    if enable_fusion:
-        fusion_pass = RMSNormQuantFusionPass(vllm_config)
-        passes_for_backend.append(fusion_pass)
+        if enable_fusion:
+            fusion_pass = RMSNormQuantFusionPass(vllm_config)
+            passes_for_backend.append(fusion_pass)
 
-    passes_for_backend.append(cleanup_pass)
+        passes_for_backend.append(cleanup_pass)
 
-    backend_no_func = TestBackend(*passes_for_backend)
-    backend_func = TestBackend(*passes_for_backend, func_pass)
+        backend = TestBackend(*passes_for_backend)
 
-    model = test_model_cls(hidden_size,
-                           hidden_size * 2,
-                           vllm_config=vllm_config)
+        model = test_model_cls(hidden_size)
 
-    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
-                                dtype=dtype)
-    residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
+        hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
 
-    compiled_model_no_func = torch.compile(model, backend=backend_no_func)
-    compiled_model_no_func(hidden_states, residual)
-    compiled_model_func = torch.compile(model, backend=backend_func)
-    compiled_model_func(hidden_states, residual)
+        if dynamic:
+            torch._dynamo.mark_dynamic(hidden_states, 0)
 
-    assert sequence_parallelism_pass.matched_count == 1
+        compiled_model = torch.compile(model, backend=backend)
+        compiled_model(hidden_states)
 
-    # In pre-nodes, all reduce should be there,
-    # reduce scatter and all gather should not
-    backend_no_func.check_before_ops(model.ops_in_model_before())
+        assert sequence_parallelism_pass.matched_count == 4
 
-    # In post-nodes, reduce scatter and all gather should be there,
-    # all reduce should not
-    backend_no_func.check_after_ops(model.ops_in_model_after())
+        # In pre-nodes, all reduce should be there,
+        # reduce scatter and all gather should not
+        for op in model.ops_in_model_before():
+            assert backend.op_count(op, before=True) == 4
 
-    # check if the functionalization pass is applied
-    for op in model.ops_in_model():
-        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
-        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
-                                  op) is None  # noqa: E501
+        # In post-nodes, reduce scatter and all gather should be there,
+        # all reduce should not
+        for op in model.ops_in_model_after():
+            assert backend.op_count(op, before=False) == 4
 
-    # make sure the ops were all de-functionalized
-    found = dict()
-    for node in backend_func.graph_post_pass.nodes:
         for op in model.ops_in_model():
-            if is_func(node, op):
-                found[op] = True
-    assert all(found[op] for op in model.ops_in_model())
+            find_auto_fn(backend.graph_post_pass.nodes, op)
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index c445f4dde2cc45efde13de999c155816a448c411..0ddb82b7c3fc29a753259a3b11545f2f4c471a3d 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import cast
+import itertools
 
 import pytest
 import torch
@@ -8,20 +8,31 @@ import torch
 import vllm.envs as envs
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.compilation.activation_quant_fusion import (
-    FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass)
-# yapf: enable
+    FUSED_OPS,
+    SILU_MUL_OP,
+    ActivationQuantFusionPass,
+)
 from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import CompilationConfig, PassConfig, VllmConfig
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    PassConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
+    GroupShape,
+    kFp8StaticTensorSym,
+    kNvfp4Quant,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, cutlass_fp8_supported)
+    Fp8LinearOp,
+    maybe_create_device_identity,
+)
 from vllm.platforms import current_platform
 
 from ..utils import override_cutlass_fp8_supported
@@ -36,7 +47,6 @@ def is_nvfp4_supported():
 
 
 class TestSiluMulFp8QuantModel(torch.nn.Module):
-
     def __init__(self, hidden_size: int, cuda_force_torch: bool, **kwargs):
         super().__init__()
         self.silu_and_mul = SiluAndMul()
@@ -50,31 +60,39 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
                 act_quant_static=True,
                 act_quant_group_shape=GroupShape.PER_TENSOR,
             )
+        self.enable_silu_mul_custom_op = self.silu_and_mul.enabled()
+        self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
 
     def forward(self, x):
         y = self.silu_and_mul(x)
-        x2 = self.fp8_linear.apply(y,
-                                   self.w,
-                                   self.wscale,
-                                   input_scale=self.wscale)
+        x2 = self.fp8_linear.apply(y, self.w, self.wscale, input_scale=self.wscale)
         return x2
 
     def ops_in_model_before(self):
-        return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]]
+        return [
+            SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
+            (
+                QUANT_OPS[kFp8StaticTensorSym]
+                if self.enable_quant_fp8_custom_op
+                else torch.ops.aten.reciprocal
+            ),
+        ]
 
     def ops_in_model_after(self):
         return [FUSED_OPS[kFp8StaticTensorSym]]
 
 
 class TestSiluMulNvfp4QuantModel(torch.nn.Module):
-
     def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
         super().__init__()
         from vllm.compilation.activation_quant_fusion import (
-            silu_and_mul_nvfp4_quant_supported)
+            silu_and_mul_nvfp4_quant_supported,
+        )
+
         assert silu_and_mul_nvfp4_quant_supported
 
         self.silu_and_mul = SiluAndMul()
+        self.enable_silu_mul_custom_op = self.silu_and_mul.enabled()
 
         # create nvfp4 weight
         w = torch.rand((hidden_size, hidden_size))
@@ -88,16 +106,21 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
     def forward(self, x):
         y = self.silu_and_mul(x)
         y_quant, y_block_scale = scaled_fp4_quant(y, self.y_global_scale)
-        out = cutlass_scaled_fp4_mm(a=y_quant,
-                                    b=self.w,
-                                    block_scale_a=y_block_scale,
-                                    block_scale_b=self.w_block_scale,
-                                    alpha=self.alpha,
-                                    out_dtype=y.dtype)
+        out = cutlass_scaled_fp4_mm(
+            a=y_quant,
+            b=self.w,
+            block_scale_a=y_block_scale,
+            block_scale_b=self.w_block_scale,
+            alpha=self.alpha,
+            out_dtype=y.dtype,
+        )
         return out
 
     def ops_in_model_before(self):
-        return [SILU_MUL_OP, QUANT_OPS[kNvfp4Quant]]
+        return [
+            SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
+            QUANT_OPS[kNvfp4Quant],
+        ]
 
     def ops_in_model_after(self):
         return [FUSED_OPS[kNvfp4Quant]]
@@ -106,64 +129,80 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [32, 64])
 @pytest.mark.parametrize("hidden_size", [128, 256])
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("enable_silu_mul_custom_op", [True, False])
 @pytest.mark.parametrize(
-    "model_class",
-    cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
-         if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]))
+    "model_class, enable_quant_fp8_custom_op, cuda_force_torch",
+    list(itertools.product([TestSiluMulFp8QuantModel], [True, False], [True, False]))
+    + [(TestSiluMulNvfp4QuantModel, False, False)],
+)
 # cuda_force_torch used to test torch code path on platforms that
 # cutlass_fp8_supported() == True.
-@pytest.mark.parametrize("cuda_force_torch",
-                         [True, False] if cutlass_fp8_supported() else [True])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
-                    reason="Only test on CUDA and ROCm")
-def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
-                                   cuda_force_torch):
-    if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
-        pytest.skip("Duplicate tests for NVFP4")
+@pytest.mark.skipif(
+    envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm"
+)
+def test_fusion_silu_and_mul_quant(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    model_class: type[TestSiluMulFp8QuantModel | TestSiluMulNvfp4QuantModel],
+    enable_silu_mul_custom_op: bool,
+    enable_quant_fp8_custom_op: bool,
+    cuda_force_torch: bool,
+):
+    if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
+        pytest.skip("NVFP4 is not supported on this GPU.")
 
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
+    maybe_create_device_identity()
 
     x = torch.rand(num_tokens, hidden_size * 2)
 
     # Reshape pass is needed for the fusion pass to work
-    config = VllmConfig()
-    config.compilation_config = CompilationConfig(
-        pass_config=PassConfig(enable_fusion=True, enable_noop=True))
-    fusion_pass = ActivationQuantFusionPass(config)
-
-    passes = [
-        NoOpEliminationPass(config), fusion_pass,
-        PostCleanupPass(config)
-    ]
-    backend = TestBackend(*passes)
-    model = model_class(hidden_size=hidden_size,
-                        cuda_force_torch=cuda_force_torch,
-                        x=x)
-
-    # First dimension dynamic
-    torch._dynamo.mark_dynamic(x, 0)
-
-    result = model(x)
-
-    model2 = torch.compile(model, backend=backend)
-    result2 = model2(x)
-
-    # Check that it gives the same answer
-    if model_class == TestSiluMulFp8QuantModel:
-        atol, rtol = 1e-3, 1e-3
-    elif model_class == TestSiluMulNvfp4QuantModel:
-        atol, rtol = 1e-1, 1e-1
-
-    torch.testing.assert_close(result[0].to(dtype=dtype),
-                               result2[0].to(dtype=dtype),
-                               atol=atol,
-                               rtol=rtol)
-
-    assert fusion_pass.matched_count == 1
-
-    # In pre-nodes, quant op should be present and fused kernels should not
-    backend.check_before_ops(model.ops_in_model_before())
-
-    # In post-nodes, fused kernels should be present and quant op should not
-    backend.check_after_ops(model.ops_in_model_after())
+    custom_ops = []
+    if enable_silu_mul_custom_op:
+        custom_ops.append("+silu_and_mul")
+    if enable_quant_fp8_custom_op:
+        custom_ops.append("+quant_fp8")
+    config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=custom_ops,
+            pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+        ),
+    )
+
+    with set_current_vllm_config(config):
+        fusion_pass = ActivationQuantFusionPass(config)
+
+        passes = [NoOpEliminationPass(config), fusion_pass, PostCleanupPass(config)]
+        backend = TestBackend(*passes)
+        model = model_class(
+            hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x
+        )
+
+        # First dimension dynamic
+        torch._dynamo.mark_dynamic(x, 0)
+
+        result = model(x)
+
+        model2 = torch.compile(model, backend=backend)
+        result2 = model2(x)
+
+        # Check that it gives the same answer
+        if model_class == TestSiluMulFp8QuantModel:
+            atol, rtol = 1e-3, 1e-3
+        elif model_class == TestSiluMulNvfp4QuantModel:
+            atol, rtol = 1e-1, 1e-1
+
+        torch.testing.assert_close(
+            result[0].to(dtype=dtype), result2[0].to(dtype=dtype), atol=atol, rtol=rtol
+        )
+
+        assert fusion_pass.matched_count == 1
+
+        # In pre-nodes, quant op should be present and fused kernels should not
+        backend.check_before_ops(model.ops_in_model_before())
+
+        # In post-nodes, fused kernels should be present and quant op should not
+        backend.check_after_ops(model.ops_in_model_after())
diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py
index 5e39f6821d16ca69c7303c6fe713db041c3a2e83..356cac7af258b5d8889d4818d97246916d7632a0 100644
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -1,64 +1,135 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
+import os
+
+import pytest
 import torch
 
-from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel
+from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 
 
 class MyMod(torch.nn.Module):
+    def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
+        if x.size()[0] >= 4:
+            return x * 2
+        else:
+            return x * 100
 
-    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
-        if cache is not None:
-            return x + cache
-        return x * 2
-
-
-class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
 
+class MyWrapper(TorchCompileWithNoGuardsWrapper):
     def __init__(self, model):
         self.model = model
-        compiled_callable = torch.compile(self.forward, backend="eager")
-        super().__init__(compiled_callable,
-                         compilation_level=CompilationLevel.DYNAMO_ONCE)
+        super().__init__()
 
-    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+    def forward(self, x: torch.Tensor):  # type: ignore[override]
         # this is the function to be compiled
-        return self.model(x, cache)
-
-    def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
-        # let torch.compile compile twice
-        if len(self.compiled_codes) == 2:
-            dispatch_id = 0 if cache is None else 1
-            with self.dispatch_to_code(dispatch_id):
-                return self.forward(x, cache)
-        else:
-            return self.compiled_callable(x, cache)
+        return self.model(x)
+
+
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
+    """Test basic functionality of TorchCompileWithNoGuardsWrapper."""
+    # Set the environment variable for this test
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
 
+    # Create a proper vLLM config instead of mocking
+    vllm_config = VllmConfig()
+    vllm_config.compilation_config = CompilationConfig()
+    vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
+    vllm_config.compilation_config.backend = "inductor"
 
-def test_torch_compile_wrapper():
-    mod = MyMod()
-    wrappers = []
-    for i in range(3):
+    # Test DYNAMO_TRACE_ONCE
+    with set_current_vllm_config(vllm_config):
         torch._dynamo.reset()
+        mod = MyMod()
+        wrapper = MyWrapper(mod)
+
+        # First call should trigger compilation
+        x = torch.tensor([1, 2, 3, 4])
+        torch._dynamo.mark_dynamic(x, 0)
+
+        result1 = wrapper(x)
+        expected1 = torch.tensor([2, 4, 6, 8])
+        assert torch.allclose(result1, expected1), (
+            f"Expected {expected1}, got {result1}"
+        )
+
+        # Second call should use compiled code
+        x2 = torch.tensor([1, 2, 3])
+        result2 = wrapper(x2)
+        expected2 = torch.tensor([2, 4, 6])
+        assert torch.allclose(result2, expected2), (
+            f"Expected {expected2}, got {result2}"
+        )
+
+        # without the wrapper result would be different.
+        result3 = mod(x2)
+        expected3 = torch.tensor([100, 200, 300])
+
+        assert torch.allclose(result3, expected3), (
+            f"Expected {result3}, got {expected3}"
+        )
+
+    # with STOCK_TORCH_COMPILE we do not remove guards.
+    vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE
+    torch._dynamo.reset()
+    with set_current_vllm_config(vllm_config):
+        mod = MyMod()
         wrapper = MyWrapper(mod)
-        wrappers.append(wrapper)
-        x = torch.tensor([1])
-        wrapper(x, None)  # profile run, compile
-        # create a cache tensor
-        cache = torch.tensor([2])
-        wrapper(x, cache)  # warm up with cache, recompile
-
-        # for new input, dispatch to the compiled code directly
-        new_x = torch.tensor([3])
-        assert wrapper(new_x,
-                       None).item() == 6  # dispatch to the first compiled code
-        assert wrapper(
-            new_x, cache).item() == 5  # dispatch to the second compiled code
-
-    for wrapper in wrappers:
-        # make sure they have independent compiled codes
-        assert len(wrapper.compiled_codes) == 2
+
+        # First call should trigger compilation
+        x = torch.tensor([1, 2, 3, 4])
+        torch._dynamo.mark_dynamic(x, 0)
+
+        result1 = wrapper(x)
+        expected1 = torch.tensor([2, 4, 6, 8])
+        assert torch.allclose(result1, expected1), (
+            f"Expected {expected1}, got {result1}"
+        )
+
+        # Second call should triger another compilation
+        x2 = torch.tensor([1, 2, 3])
+        result2 = wrapper(x2)
+        expected2 = torch.tensor([100, 200, 300])
+        assert torch.allclose(result2, expected2), (
+            f"Expected {expected2}, got {result2}"
+        )
+
+    # NO_COMPILATION level not supported.
+    vllm_config.compilation_config.mode = None
+    torch._dynamo.reset()
+    with set_current_vllm_config(vllm_config):
+        torch._dynamo.reset()
+        mod = MyMod()
+
+        try:
+            wrapper = MyWrapper(mod)
+        except Exception:
+            return
+        raise AssertionError("expected an exception to be raised")
+
+
+if __name__ == "__main__":
+    # Run with both parameter values
+
+    class MockMonkeypatch:
+        def setenv(self, name, value):
+            os.environ[name] = value
+
+    mp = MockMonkeypatch()
+
+    print("Testing with VLLM_USE_BYTECODE_HOOK=False")
+    test_torch_compile_wrapper(False, mp)
+
+    print("Testing with VLLM_USE_BYTECODE_HOOK=True")
+    test_torch_compile_wrapper(True, mp)
+
+    print("All tests passed!")
diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py
index e37b6b95941e964230845218c4cc164930ec2952..61c3df0a23483096bc24d5af8c7723dc84dfcfd1 100644
--- a/tests/config/test_config_generation.py
+++ b/tests/config/test_config_generation.py
@@ -14,8 +14,9 @@ def test_cuda_empty_vs_unset_configs(monkeypatch: pytest.MonkeyPatch):
     """
 
     def create_config():
-        engine_args = EngineArgs(model="deepseek-ai/DeepSeek-V2-Lite",
-                                 trust_remote_code=True)
+        engine_args = EngineArgs(
+            model="deepseek-ai/DeepSeek-V2-Lite", trust_remote_code=True
+        )
         return engine_args.create_engine_config()
 
     # Create config with CUDA_VISIBLE_DEVICES set normally
@@ -34,16 +35,18 @@ def test_cuda_empty_vs_unset_configs(monkeypatch: pytest.MonkeyPatch):
     empty_config_dict.pop("instance_id", None)
 
     assert deep_compare(normal_config_dict, empty_config_dict), (
-        "Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=\"\""
-        " should be equivalent")
+        'Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=""'
+        " should be equivalent"
+    )
 
 
 def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
     # In testing, this method needs to be nested inside as ray does not
     # see the test module.
     def create_config():
-        engine_args = EngineArgs(model="deepseek-ai/DeepSeek-V2-Lite",
-                                 trust_remote_code=True)
+        engine_args = EngineArgs(
+            model="deepseek-ai/DeepSeek-V2-Lite", trust_remote_code=True
+        )
         return engine_args.create_engine_config()
 
     config = create_config()
@@ -51,6 +54,7 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
     assert parallel_config.ray_runtime_env is None
 
     import ray
+
     ray.init()
 
     runtime_env = {
@@ -59,13 +63,13 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
         },
     }
 
-    config_ref = ray.remote(create_config).options(
-        runtime_env=runtime_env).remote()
+    config_ref = ray.remote(create_config).options(runtime_env=runtime_env).remote()
 
     config = ray.get(config_ref)
     parallel_config = config.parallel_config
     assert parallel_config.ray_runtime_env is not None
-    assert parallel_config.ray_runtime_env.env_vars().get(
-        "TEST_ENV_VAR") == "test_value"
+    assert (
+        parallel_config.ray_runtime_env.env_vars().get("TEST_ENV_VAR") == "test_value"
+    )
 
     ray.shutdown()
diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py
index d4d4be293280ba64998b775aa6d0824d1daec008..56dc542f1c76ddb69ea330a91c7e9e3c8d6ea25d 100644
--- a/tests/config/test_mp_reducer.py
+++ b/tests/config/test_mp_reducer.py
@@ -8,21 +8,18 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.v1.engine.async_llm import AsyncLLM
 
 
-def test_mp_reducer(monkeypatch):
+def test_mp_reducer():
     """
     Test that _reduce_config reducer is registered when AsyncLLM is instantiated
     without transformers_modules. This is a regression test for
     https://github.com/vllm-project/vllm/pull/18640.
     """
 
-    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
-    monkeypatch.setenv('VLLM_USE_V1', '1')
-
     # Ensure transformers_modules is not in sys.modules
-    if 'transformers_modules' in sys.modules:
-        del sys.modules['transformers_modules']
+    if "transformers_modules" in sys.modules:
+        del sys.modules["transformers_modules"]
 
-    with patch('multiprocessing.reducer.register') as mock_register:
+    with patch("multiprocessing.reducer.register") as mock_register:
         engine_args = AsyncEngineArgs(
             model="facebook/opt-125m",
             max_model_len=32,
@@ -36,7 +33,8 @@ def test_mp_reducer(monkeypatch):
         )
 
         assert mock_register.called, (
-            "multiprocessing.reducer.register should have been called")
+            "multiprocessing.reducer.register should have been called"
+        )
 
         vllm_config_registered = False
         for call_args in mock_register.call_args_list:
@@ -45,8 +43,7 @@ def test_mp_reducer(monkeypatch):
                 vllm_config_registered = True
 
                 reducer_func = call_args[0][1]
-                assert callable(
-                    reducer_func), "Reducer function should be callable"
+                assert callable(reducer_func), "Reducer function should be callable"
                 break
 
         assert vllm_config_registered, (
diff --git a/tests/config/test_multimodal_config.py b/tests/config/test_multimodal_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d02893e52f1e82d4729e8d0b52395e382fff484
--- /dev/null
+++ b/tests/config/test_multimodal_config.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config.multimodal import MultiModalConfig
+
+
+def test_mm_encoder_attn_backend_str_conversion():
+    config = MultiModalConfig(mm_encoder_attn_backend="FLASH_ATTN")
+    assert config.mm_encoder_attn_backend == AttentionBackendEnum.FLASH_ATTN
+
+
+def test_mm_encoder_attn_backend_invalid():
+    with pytest.raises(ValueError):
+        MultiModalConfig(mm_encoder_attn_backend="not_a_backend")
+
+
+def test_mm_encoder_attn_backend_hash_updates():
+    base_hash = MultiModalConfig().compute_hash()
+    overridden_hash = MultiModalConfig(
+        mm_encoder_attn_backend=AttentionBackendEnum.FLASH_ATTN
+    ).compute_hash()
+    assert base_hash != overridden_hash
diff --git a/tests/conftest.py b/tests/conftest.py
index a50985a465e6cfd8d70f4ac67b96322f66192885..b17081352edcf1b7f5e208aaae0a6cd150af342f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,10 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# ruff: noqa
+import contextlib
+import pathlib
+from copy import deepcopy
 
 from tblib import pickling_support
 
+# Import fixture
+from tests.v1.entrypoints.conftest import sample_json_schema  # noqa
+
+# ruff: noqa
+
 # Install support for pickling exceptions so that we can nicely propagate
 # failures from tests running in a subprocess.
 # This should be run before any custom exception subclasses are defined.
@@ -21,7 +27,7 @@ import threading
 from collections.abc import Generator
 from contextlib import nullcontext
 from enum import Enum
-from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
+from typing import Any, Callable, TypedDict, TypeVar, cast
 
 import numpy as np
 import pytest
@@ -30,30 +36,35 @@ import torch.nn as nn
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
-                          BatchEncoding, BatchFeature)
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BatchEncoding,
+    BatchFeature,
+)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
-from tests.models.utils import (TokensTextLogprobs,
-                                TokensTextLogprobsPromptLogprobs)
-from vllm import LLM, SamplingParams
+from tests.models.utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
+from vllm import LLM, SamplingParams, envs
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config.model import (ConvertOption, RunnerOption,
-                               _get_and_verify_dtype)
+from vllm.config.model import ConvertOption, RunnerOption, _get_and_verify_dtype
 from vllm.connections import global_http_connection
-from vllm.distributed import (cleanup_dist_env_and_memory,
-                              init_distributed_environment,
-                              initialize_model_parallel)
-from vllm.inputs import TextPrompt
+from vllm.distributed import (
+    cleanup_dist_env_and_memory,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.multimodal.utils import fetch_image
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import set_default_torch_num_threads
+from vllm.utils.collection_utils import is_list_of
+from vllm.utils.torch_utils import set_default_torch_num_threads
 
 logger = init_logger(__name__)
 
@@ -64,7 +75,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 
 _M = TypeVar("_M")
 
-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
 PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
@@ -83,12 +94,13 @@ class ImageAssetPrompts(TypedDict):
 
 
 class ImageTestAssets(list[ImageAsset]):
-
     def __init__(self) -> None:
-        super().__init__([
-            ImageAsset("stop_sign"),
-            ImageAsset("cherry_blossom"),
-        ])
+        super().__init__(
+            [
+                ImageAsset("stop_sign"),
+                ImageAsset("cherry_blossom"),
+            ]
+        )
 
     def prompts(self, prompts: ImageAssetPrompts) -> list[str]:
         """
@@ -105,11 +117,12 @@ class VideoAssetPrompts(TypedDict):
 
 
 class VideoTestAssets(list[VideoAsset]):
-
     def __init__(self) -> None:
-        super().__init__([
-            VideoAsset("baby_reading"),
-        ])
+        super().__init__(
+            [
+                VideoAsset("baby_reading"),
+            ]
+        )
 
     def prompts(self, prompts: VideoAssetPrompts) -> list[str]:
         return [prompts["baby_reading"]]
@@ -121,12 +134,13 @@ class AudioAssetPrompts(TypedDict):
 
 
 class AudioTestAssets(list[AudioAsset]):
-
     def __init__(self) -> None:
-        super().__init__([
-            AudioAsset("mary_had_lamb"),
-            AudioAsset("winning_call"),
-        ])
+        super().__init__(
+            [
+                AudioAsset("mary_had_lamb"),
+                AudioAsset("winning_call"),
+            ]
+        )
 
     def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
         return [prompts["mary_had_lamb"], prompts["winning_call"]]
@@ -140,26 +154,6 @@ AUDIO_ASSETS = AudioTestAssets()
 """Singleton instance of {class}`AudioTestAssets`."""
 
 
-@pytest.fixture(scope="function", autouse=True)
-def cleanup_VLLM_USE_V1(monkeypatch):
-    """
-    The V1 oracle sets "VLLM_USE_V1" during loading. This means
-    that each invocation of a test change the env variable.
-
-    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
-    made during the test run by vLLM will be cleaned up.
-
-    This fixture is used by every test.
-    """
-
-    # If VLLM_USE_V1 is not set, set then delete. This will
-    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
-    # if VLLM modifies the value of envs.VLLM_USE_V1.
-    if "VLLM_USE_V1" not in os.environ:
-        monkeypatch.setenv("VLLM_USE_V1", "")
-        monkeypatch.delenv("VLLM_USE_V1")
-
-
 @pytest.fixture(autouse=True)
 def init_test_http_connection():
     # pytest_asyncio may use a different event loop per test
@@ -221,6 +215,7 @@ def example_system_message() -> str:
 
 class DecoderPromptType(Enum):
     """For encoder/decoder models only."""
+
     CUSTOM = 1
     NONE = 2
     EMPTY_STR = 3
@@ -254,15 +249,13 @@ _R = TypeVar("_R")
 
 
 class HfRunner:
-
     def get_default_device(self):
         from vllm.platforms import current_platform
 
-        return ("cpu"
-                if current_platform.is_cpu() else current_platform.device_type)
+        return "cpu" if current_platform.is_cpu() else current_platform.device_type
 
-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
-        if x is None or isinstance(x, (bool, )):
+    def wrap_device(self, x: _T, device: str | None = None) -> _T:
+        if x is None or isinstance(x, (bool,)):
             return x
 
         if device is None:
@@ -281,17 +274,20 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
         # Set this to avoid hanging issue
-        default_torch_num_threads: Optional[int] = None,
+        default_torch_num_threads: int | None = None,
     ) -> None:
-        init_ctx = (nullcontext() if default_torch_num_threads is None else
-                    set_default_torch_num_threads(default_torch_num_threads))
+        init_ctx = (
+            nullcontext()
+            if default_torch_num_threads is None
+            else set_default_torch_num_threads(default_torch_num_threads)
+        )
 
         with init_ctx:
             self._init(
@@ -310,7 +306,7 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
@@ -325,7 +321,7 @@ class HfRunner:
             trust_remote_code=trust_remote_code,
         )
         self.device = self.get_default_device()
-        self.dtype = torch_dtype = _get_and_verify_dtype(
+        self.dtype = dtype = _get_and_verify_dtype(
             self.model_name,
             self.config,
             dtype=dtype,
@@ -333,7 +329,7 @@ class HfRunner:
         )
 
         model_kwargs = model_kwargs if model_kwargs is not None else {}
-        model_kwargs.setdefault("torch_dtype", torch_dtype)
+        model_kwargs.setdefault("dtype", dtype)
 
         if is_sentence_transformer:
             # Lazy init required for AMD CI
@@ -363,14 +359,15 @@ class HfRunner:
             )
 
             # in case some unquantized custom models are not in same dtype
-            if (getattr(model, "quantization_method", None) is None
-                    and any(p.dtype != self.dtype
-                            for p in model.parameters())):
+            if getattr(model, "quantization_method", None) is None and any(
+                p.dtype != self.dtype for p in model.parameters()
+            ):
                 model = model.to(dtype=self.dtype)
 
-            if (getattr(model, "quantization_method", None) != "bitsandbytes"
-                    and len({p.device
-                             for p in model.parameters()}) < 2):
+            if (
+                getattr(model, "quantization_method", None) != "bitsandbytes"
+                and len({p.device for p in model.parameters()}) < 2
+            ):
                 model = model.to(device=self.device)
 
             self.model = model
@@ -378,16 +375,17 @@ class HfRunner:
         if not skip_tokenizer_init:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
-                torch_dtype=torch_dtype,
+                dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
 
         # don't put this import at the top level
         # it will call torch.cuda.device_count()
         from transformers import AutoProcessor  # noqa: F401
+
         self.processor = AutoProcessor.from_pretrained(
             model_name,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
         if skip_tokenizer_init:
@@ -395,11 +393,11 @@ class HfRunner:
 
     def get_inputs(
         self,
-        prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[Union[BatchFeature, BatchEncoding]]:
+        prompts: list[str] | list[list[int]],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+    ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
         if images is not None:
             assert len(prompts) == len(images)
 
@@ -409,31 +407,46 @@ class HfRunner:
         if audios is not None:
             assert len(prompts) == len(audios)
 
-        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
+        all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
-            processor_kwargs: dict[str, Any] = {
-                "text": prompt,
-                "return_tensors": "pt",
-            }
-            if images is not None and (image := images[i]) is not None:
-                processor_kwargs["images"] = image
-            if videos is not None and (video := videos[i]) is not None:
-                processor_kwargs["videos"] = video
-            if audios is not None and (audio_inputs := audios[i]) is not None:
-                # HACK - not all processors take sampling_rate; we should
-                # clean this up in the future.
-                if len(audio_inputs) == 2:
-                    audio, sr = audio_inputs
-                    processor_kwargs["audio"] = audio
-                    processor_kwargs["sampling_rate"] = sr
-                else:
-                    processor_kwargs["audio"] = audio_inputs
-
-            inputs = self.processor(**processor_kwargs)
-            if isinstance(inputs, BatchFeature):
-                inputs = inputs.to(dtype=self.dtype)
-
-            all_inputs.append(inputs)
+            if isinstance(prompt, str):
+                processor_kwargs: dict[str, Any] = {
+                    "text": prompt,
+                    "return_tensors": "pt",
+                }
+                if images is not None and (image := images[i]) is not None:
+                    processor_kwargs["images"] = image
+                if videos is not None and (video := videos[i]) is not None:
+                    processor_kwargs["videos"] = video
+                if audios is not None and (audio_inputs := audios[i]) is not None:
+                    # HACK - not all processors take sampling_rate; we should
+                    # clean this up in the future.
+                    if len(audio_inputs) == 2:
+                        audio, sr = audio_inputs
+                        processor_kwargs["audio"] = audio
+                        processor_kwargs["sampling_rate"] = sr
+                    else:
+                        processor_kwargs["audio"] = audio_inputs
+
+                inputs = self.processor(**processor_kwargs)
+                if isinstance(inputs, BatchFeature):
+                    inputs = inputs.to(dtype=self.dtype)
+                all_inputs.append(inputs)
+            else:
+                # check that prompt is (batched) list of integers (token ids)
+                if not is_list_of(prompt, typ=int, check="all"):
+                    raise ValueError(
+                        "Prompt must be a list of ints corresponding to the prompt token ids."
+                    )
+                # check that no multimodal input is provided
+                if images or videos or audios:
+                    raise ValueError(
+                        "When providing prompt token ids multimodal inputs are not supported."
+                    )
+                input_dict = {
+                    "input_ids": torch.tensor(prompt, dtype=torch.long).unsqueeze(0),
+                }
+                all_inputs.append(input_dict)
 
         return all_inputs
 
@@ -466,16 +479,15 @@ class HfRunner:
 
     def generate(
         self,
-        prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        prompts: list[str] | list[list[int]],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[list[int]], list[str]]]:
-        all_inputs = self.get_inputs(prompts,
-                                     images=images,
-                                     videos=videos,
-                                     audios=audios)
+        all_inputs = self.get_inputs(
+            prompts, images=images, videos=videos, audios=audios
+        )
 
         outputs: list[tuple[list[list[int]], list[str]]] = []
         for inputs in all_inputs:
@@ -495,48 +507,50 @@ class HfRunner:
 
     def generate_greedy(
         self,
-        prompts: list[str],
+        prompts: list[str] | list[list[int]],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
-        outputs = self.generate(prompts,
-                                do_sample=False,
-                                max_new_tokens=max_tokens,
-                                images=images,
-                                videos=videos,
-                                audios=audios,
-                                **kwargs)
+        outputs = self.generate(
+            prompts,
+            do_sample=False,
+            max_new_tokens=max_tokens,
+            images=images,
+            videos=videos,
+            audios=audios,
+            **kwargs,
+        )
 
-        return [(output_ids[0], output_str[0])
-                for output_ids, output_str in outputs]
+        return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs]
 
     def generate_beam_search(
         self,
         prompts: list[str],
         beam_width: int,
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
     ) -> list[tuple[list[list[int]], list[str]]]:
-        outputs = self.generate(prompts,
-                                do_sample=False,
-                                max_new_tokens=max_tokens,
-                                num_beams=beam_width,
-                                num_return_sequences=beam_width,
-                                images=images,
-                                videos=videos,
-                                audios=audios)
+        outputs = self.generate(
+            prompts,
+            do_sample=False,
+            max_new_tokens=max_tokens,
+            num_beams=beam_width,
+            num_return_sequences=beam_width,
+            images=images,
+            videos=videos,
+            audios=audios,
+        )
 
         for i in range(len(outputs)):
             output_ids, output_str = outputs[i]
             for j in range(len(output_ids)):
                 output_ids[j] = [
-                    x for x in output_ids[j]
-                    if x != self.tokenizer.pad_token_id
+                    x for x in output_ids[j] if x != self.tokenizer.pad_token_id
                 ]
             outputs[i] = (output_ids, output_str)
         return outputs
@@ -545,15 +559,14 @@ class HfRunner:
         self,
         prompts: list[str],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[list[torch.Tensor]]:
-        all_inputs = self.get_inputs(prompts,
-                                     images=images,
-                                     videos=videos,
-                                     audios=audios)
+        all_inputs = self.get_inputs(
+            prompts, images=images, videos=videos, audios=audios
+        )
 
         all_logprobs: list[list[torch.Tensor]] = []
         for inputs in all_inputs:
@@ -566,8 +579,7 @@ class HfRunner:
                 return_dict_in_generate=True,
                 **kwargs,
             )
-            seq_logprobs = self._hidden_states_to_seq_logprobs(
-                output.hidden_states)
+            seq_logprobs = self._hidden_states_to_seq_logprobs(output.hidden_states)
             all_logprobs.append(seq_logprobs)
         return all_logprobs
 
@@ -597,7 +609,7 @@ class HfRunner:
     def _hidden_states_to_logprobs(
         self,
         hidden_states: tuple[tuple[torch.Tensor, ...], ...],
-        num_logprobs: Optional[int],
+        num_logprobs: int | None,
     ) -> tuple[list[dict[int, float]], int]:
         seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
         output_len = len(hidden_states)
@@ -625,16 +637,15 @@ class HfRunner:
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        num_logprobs: int | None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
     ) -> list[TokensTextLogprobs]:
-        all_inputs = self.get_inputs(prompts,
-                                     images=images,
-                                     videos=videos,
-                                     audios=audios)
+        all_inputs = self.get_inputs(
+            prompts, images=images, videos=videos, audios=audios
+        )
 
         all_logprobs: list[list[dict[int, float]]] = []
         all_output_ids: list[list[int]] = []
@@ -654,8 +665,7 @@ class HfRunner:
             (
                 seq_logprobs_lst,
                 output_len,
-            ) = self._hidden_states_to_logprobs(output.hidden_states,
-                                                num_logprobs)
+            ) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
 
             all_logprobs.append(seq_logprobs_lst)
             seq_ids = output.sequences[0]
@@ -665,19 +675,16 @@ class HfRunner:
             all_output_strs.append(self.tokenizer.decode(output_ids))
 
         outputs = zip(all_output_ids, all_output_strs, all_logprobs)
-        return [(output_ids, output_str, output_logprobs)
-                for output_ids, output_str, output_logprobs in outputs]
+        return [
+            (output_ids, output_str, output_logprobs)
+            for output_ids, output_str, output_logprobs in outputs
+        ]
 
-    def encode(self, prompts: list[str], *args,
-               **kwargs) -> list[list[torch.Tensor]]:
+    def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]:
         return self.model.encode(prompts, *args, **kwargs)
 
-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-        return self.model.predict(prompts,
-                                  *args,
-                                  convert_to_tensor=True,
-                                  **kwargs)
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
+        return self.model.predict(prompts, *args, convert_to_tensor=True, **kwargs)
 
     def __enter__(self):
         return self
@@ -712,24 +719,34 @@ class VllmRunner:
         model_name: str,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer_name: Optional[str] = None,
+        tokenizer_name: str | None = None,
         tokenizer_mode: str = "auto",
         trust_remote_code: bool = True,
-        seed: Optional[int] = 0,
-        max_model_len: Optional[int] = 1024,
+        seed: int | None = 0,
+        max_model_len: int | None = 1024,
         dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16 if not torch.xpu.is_available() else 64,
-        enable_chunked_prefill: Optional[bool] = False,
+        enable_chunked_prefill: bool | None = False,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
+        enforce_eager: bool | None = False,
         # Set this to avoid hanging issue
-        default_torch_num_threads: Optional[int] = None,
+        default_torch_num_threads: int | None = None,
         **kwargs,
     ) -> None:
-        init_ctx = (nullcontext() if default_torch_num_threads is None else
-                    set_default_torch_num_threads(default_torch_num_threads))
+        init_ctx = (
+            nullcontext()
+            if default_torch_num_threads is None
+            else set_default_torch_num_threads(default_torch_num_threads)
+        )
+
+        if not kwargs.get("compilation_config", None):
+            # Note(@tdoublep): This is set to 4 because some tests (e.g., hybrid
+            # model tests) may set max_num_seqs=4. If min cudagraph_capture_size is
+            # set to larger than max_num_seqs, then it will lead to *no* graphs
+            # being captured which can trigger edge cases that we don't handle yet.
+            kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]}
 
         with init_ctx:
             self.llm = LLM(
@@ -753,21 +770,29 @@ class VllmRunner:
 
     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[int]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[TextPrompt]:
-
-        if any(x is not None and len(x) != len(prompts)
-               for x in [images, videos, audios]):
+        prompts: list[str] | list[torch.Tensor] | list[list[int]],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+    ) -> list[dict[str, Any]]:
+        if any(
+            x is not None and len(x) != len(prompts) for x in [images, videos, audios]
+        ):
             raise ValueError(
-                "All non-None multimodal inputs must have the same length as "
-                "prompts")
+                "All non-None multimodal inputs must have the same length as prompts"
+            )
 
-        inputs = []
+        inputs = list[dict[str, Any]]()
         for i, prompt in enumerate(prompts):
-            multi_modal_data = {}
+            prompt_dict = dict[str, Any]()
+            if isinstance(prompt, str):
+                prompt_dict["prompt"] = prompt
+            elif isinstance(prompt, list):
+                prompt_dict["prompt_token_ids"] = prompt
+            else:
+                prompt_dict["prompt_embeds"] = prompt
+
+            multi_modal_data = dict[str, Any]()
             if images is not None and (image := images[i]) is not None:
                 multi_modal_data["image"] = image
             if videos is not None and (video := videos[i]) is not None:
@@ -775,51 +800,47 @@ class VllmRunner:
             if audios is not None and (audio := audios[i]) is not None:
                 multi_modal_data["audio"] = audio
 
-            text_prompt_kwargs: dict[str, Any] = {
-                "multi_modal_data": multi_modal_data or None
-            }
-            if isinstance(prompt, str):
-                text_prompt_kwargs["prompt"] = prompt
-            elif isinstance(prompt, list):
-                text_prompt_kwargs["prompt_token_ids"] = prompt
-            else:
-                text_prompt_kwargs["prompt_embeds"] = prompt
+            if multi_modal_data:
+                prompt_dict["multi_modal_data"] = multi_modal_data
 
-            inputs.append(TextPrompt(**text_prompt_kwargs))
+            inputs.append(prompt_dict)
 
         return inputs
 
     def generate(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor] | list[list[int]],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        return_logprobs: bool = False,
         **kwargs: Any,
-    ) -> list[tuple[list[list[int]], list[str]]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    ) -> list[tuple[list[list[int]], list[str]]] | tuple[list, list]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
-        req_outputs = self.llm.generate(inputs,
-                                        sampling_params=sampling_params,
-                                        **kwargs)
+        req_outputs = self.llm.generate(
+            inputs, sampling_params=sampling_params, **kwargs
+        )
 
         outputs: list[tuple[list[list[int]], list[str]]] = []
+        logprobs = []
         for req_output in req_outputs:
             prompt_str = req_output.prompt
             prompt_ids = req_output.prompt_token_ids
             req_sample_output_ids: list[list[int]] = []
             req_sample_output_strs: list[str] = []
+            req_logprobs = []
             for sample in req_output.outputs:
                 output_str = sample.text
                 output_ids = list(sample.token_ids)
                 req_sample_output_ids.append(prompt_ids + output_ids)
                 req_sample_output_strs.append((prompt_str or "") + output_str)
+                if sample.logprobs:
+                    req_logprobs.extend(sample.logprobs)
             outputs.append((req_sample_output_ids, req_sample_output_strs))
-        return outputs
+            logprobs.append(req_logprobs)
+        return outputs if not return_logprobs else (outputs, logprobs)
 
     @staticmethod
     def _final_steps_generate_w_logprobs(
@@ -832,83 +853,86 @@ class VllmRunner:
                 output_str = sample.text
                 output_ids = list(sample.token_ids)
                 output_logprobs = sample.logprobs
-            outputs.append((output_ids, output_str, output_logprobs,
-                            req_output.prompt_logprobs))
+            outputs.append(
+                (output_ids, output_str, output_logprobs, req_output.prompt_logprobs)
+            )
         return outputs
 
     def generate_w_logprobs(
         self,
         prompts: list[str],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
-
-        req_outputs = self.llm.generate(inputs,
-                                        sampling_params=sampling_params,
-                                        **kwargs)
-
-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
+
+        req_outputs = self.llm.generate(
+            inputs, sampling_params=sampling_params, **kwargs
+        )
+
+        toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(
+            req_outputs
+        )
         # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
+        return (
+            [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
+            if sampling_params.prompt_logprobs is None
+            else toks_str_logsprobs_prompt_logprobs
+        )
 
     def generate_greedy(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor] | list[list[int]],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts,
-                                greedy_params,
-                                images=images,
-                                videos=videos,
-                                audios=audios,
-                                **kwargs)
-        return [(output_ids[0], output_str[0])
-                for output_ids, output_str in outputs]
+        outputs = self.generate(
+            prompts,
+            greedy_params,
+            images=images,
+            videos=videos,
+            audios=audios,
+            **kwargs,
+        )
+        return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs]
 
     def generate_greedy_logprobs(
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        num_prompt_logprobs: Optional[int] = None,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        stop: Optional[list[str]] = None,
+        num_logprobs: int | None,
+        num_prompt_logprobs: int | None = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
+        stop_token_ids: list[int] | None = None,
+        stop: list[str] | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
         greedy_logprobs_params = SamplingParams(
             temperature=0.0,
             max_tokens=max_tokens,
             logprobs=num_logprobs,
             prompt_logprobs=num_prompt_logprobs,
             stop_token_ids=stop_token_ids,
-            stop=stop)
+            stop=stop,
+        )
 
-        return self.generate_w_logprobs(prompts,
-                                        greedy_logprobs_params,
-                                        images=images,
-                                        audios=audios,
-                                        videos=videos,
-                                        **kwargs)
+        return self.generate_w_logprobs(
+            prompts,
+            greedy_logprobs_params,
+            images=images,
+            audios=audios,
+            videos=videos,
+            **kwargs,
+        )
 
     def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]:
         """
@@ -917,15 +941,14 @@ class VllmRunner:
         :param prompts: list of prompts to score
         :return: perplexity score of each prompt
         """
-        outputs = self.generate_greedy_logprobs(prompts,
-                                                max_tokens=1,
-                                                num_logprobs=None,
-                                                num_prompt_logprobs=0)
+        outputs = self.generate_greedy_logprobs(
+            prompts, max_tokens=1, num_logprobs=None, num_prompt_logprobs=0
+        )
 
         perplexities = []
         for output in outputs:
             output = cast(TokensTextLogprobsPromptLogprobs, output)
-            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+            token_datas = cast(list[dict[int, Logprob] | None], output[3])
             assert token_datas[0] is None
             token_log_probs = []
             for token_data in token_datas[1:]:
@@ -944,20 +967,18 @@ class VllmRunner:
         prompts: list[str],
         beam_width: int,
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        concurrency_limit: Optional[int] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        concurrency_limit: int | None = None,
     ) -> list[tuple[list[list[int]], list[str]]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
-
-        outputs = self.llm.beam_search(inputs,
-                                       BeamSearchParams(beam_width=beam_width,
-                                                        max_tokens=max_tokens),
-                                       concurrency_limit=concurrency_limit)
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
+
+        outputs = self.llm.beam_search(
+            inputs,
+            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens),
+            concurrency_limit=concurrency_limit,
+        )
         returned_outputs = []
         for output in outputs:
             token_ids = [x.tokens for x in output.sequences]
@@ -969,23 +990,26 @@ class VllmRunner:
         req_outputs = self.llm.classify(prompts)
         return [req_output.outputs.probs for req_output in req_outputs]
 
-    def embed(self,
-              prompts: list[str],
-              images: Optional[PromptImageInput] = None,
-              videos: Optional[PromptVideoInput] = None,
-              audios: Optional[PromptAudioInput] = None,
-              *args,
-              **kwargs) -> list[list[float]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    def embed(
+        self,
+        prompts: list[str],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        *args,
+        **kwargs,
+    ) -> list[list[float]]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
         req_outputs = self.llm.embed(inputs, *args, **kwargs)
         return [req_output.outputs.embedding for req_output in req_outputs]
 
-    def encode(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.llm.encode(prompts)
+    def token_embed(self, prompts: list[str]) -> list[list[float]]:
+        req_outputs = self.llm.encode(prompts, pooling_task="token_embed")
+        return [req_output.outputs.data for req_output in req_outputs]
+
+    def token_classify(self, prompts: list[str]) -> list[list[float]]:
+        req_outputs = self.llm.encode(prompts, pooling_task="token_classify")
         return [req_output.outputs.data for req_output in req_outputs]
 
     def reward(self, prompts: list[str]) -> list[list[float]]:
@@ -994,8 +1018,8 @@ class VllmRunner:
 
     def score(
         self,
-        text_1: Union[str, list[str]],
-        text_2: Union[str, list[str]],
+        text_1: list[str] | str,
+        text_2: list[str] | str,
         *args,
         **kwargs,
     ) -> list[float]:
@@ -1024,6 +1048,7 @@ def vllm_runner():
 @pytest.fixture()
 def temporary_enable_log_propagate():
     import logging
+
     logger = logging.getLogger("vllm")
     logger.propagate = True
     yield
@@ -1037,12 +1062,108 @@ def caplog_vllm(temporary_enable_log_propagate, caplog):
     yield caplog
 
 
+@pytest.fixture()
+def caplog_mp_fork():
+    """
+    This fixture enables capturing logs from a forked MP subprocess.
+    It should be used in conjunction with caplog_vllm.
+
+    By default, subprocess logs do not go through the parent process.
+    We instead create a queue listener in the parent process which
+    forwards logs to the logger's other handlers, and add a QueueHandler
+    to the root logger. Forked subprocesses will inherit the root logger
+    and pass their messages to the queue, which the listener will forward
+    to the root logger, which can be captured by caplog.
+
+    Note that this workaround only works for fork; with spawn, the subprocess
+    reinitializes logging and does not automatically inherit the queue.
+    We'd have to manually pass the queue to the subprocess at the spawn point.
+    See caplog_mp_spawn below.
+    """
+
+    @contextlib.contextmanager
+    def ctx():
+        import logging.handlers
+        import multiprocessing as mp
+
+        logger_queue: mp.Queue[logging.LogRecord] = mp.Queue()
+        logger = logging.getLogger()
+        handlers = logger.handlers
+
+        # The listener works on a background thread, not inherited by the child.
+        queue_listener = logging.handlers.QueueListener(logger_queue, *handlers)
+        queue_listener.start()
+
+        # Add queue handler after creating the listener to avoid cycle
+        logger.addHandler(logging.handlers.QueueHandler(logger_queue))
+        yield
+        queue_listener.stop()
+
+    return ctx
+
+
+class LogHolder:
+    def __init__(self):
+        self.text = None
+
+
+@pytest.fixture()
+def caplog_mp_spawn(tmp_path, monkeypatch):
+    """
+    This fixture enables capturing logs from a forked MP subprocess.
+    It does not require caplog_vllm (but it only contains logs from the child).
+
+    By default, subprocess logs do not go through the parent process.
+    We instead add a FileHandler to the config so the spawned child process
+    writes its logs to a temp file.
+    In the parent, we read the file and return the contents.
+
+    Note: this method could be extended to fork by either reconfiguring logging
+    in the parent or using a SocketHandler:
+    https://docs.python.org/3/howto/logging-cookbook.html#sending-and-receiving-logging-events-across-a-network # noqa: E501
+    """
+
+    @contextlib.contextmanager
+    def ctx(level: int | str):
+        from vllm.logger import DEFAULT_LOGGING_CONFIG
+
+        config_path = tmp_path / "vllm_logging_config.json"
+        log_path = tmp_path / "vllm.log"
+        log_holder = LogHolder()
+
+        config = deepcopy(DEFAULT_LOGGING_CONFIG)
+        if envs.VLLM_LOGGING_CONFIG_PATH:
+            path = pathlib.Path(envs.VLLM_LOGGING_CONFIG_PATH)
+            assert path.exists()
+            config = json.loads(path.read_text())
+
+        config["loggers"]["vllm"]["handlers"] += ["vllm_file"]
+        config["handlers"]["vllm_file"] = {
+            "class": "logging.FileHandler",
+            "formatter": "vllm",
+            "level": level,
+            "filename": log_path.as_posix(),
+        }
+
+        config_path.write_text(json.dumps(config))
+
+        with monkeypatch.context() as monkeypatch_ctx:
+            monkeypatch_ctx.setenv("VLLM_LOGGING_CONFIG_PATH", config_path.as_posix())
+            monkeypatch_ctx.setenv("VLLM_CONFIGURE_LOGGING", "1")
+            yield log_holder
+
+        log_holder.text = log_path.read_text()
+
+    return ctx
+
+
 @pytest.fixture(scope="session")
 def num_gpus_available():
     """Get number of GPUs without initializing the CUDA context
     in current process."""
 
     from vllm.platforms import current_platform
+
     return current_platform.device_count()
 
 
@@ -1056,12 +1177,11 @@ _dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding")
 def dummy_opt_path():
     json_path = os.path.join(_dummy_opt_path, "config.json")
     if not os.path.exists(_dummy_opt_path):
-        snapshot_download(repo_id="facebook/opt-125m",
-                          local_dir=_dummy_opt_path,
-                          ignore_patterns=[
-                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
-                              "*.msgpack"
-                          ])
+        snapshot_download(
+            repo_id="facebook/opt-125m",
+            local_dir=_dummy_opt_path,
+            ignore_patterns=["*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"],
+        )
         assert os.path.exists(json_path)
         with open(json_path) as f:
             config = json.load(f)
@@ -1075,12 +1195,18 @@ def dummy_opt_path():
 def dummy_llava_path():
     json_path = os.path.join(_dummy_llava_path, "config.json")
     if not os.path.exists(_dummy_llava_path):
-        snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
-                          local_dir=_dummy_llava_path,
-                          ignore_patterns=[
-                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
-                              "*.msgpack", "*.safetensors"
-                          ])
+        snapshot_download(
+            repo_id="llava-hf/llava-1.5-7b-hf",
+            local_dir=_dummy_llava_path,
+            ignore_patterns=[
+                "*.bin",
+                "*.bin.index.json",
+                "*.pt",
+                "*.h5",
+                "*.msgpack",
+                "*.safetensors",
+            ],
+        )
         assert os.path.exists(json_path)
         with open(json_path) as f:
             config = json.load(f)
@@ -1094,12 +1220,18 @@ def dummy_llava_path():
 def dummy_gemma2_embedding_path():
     json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json")
     if not os.path.exists(_dummy_gemma2_embedding_path):
-        snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
-                          local_dir=_dummy_gemma2_embedding_path,
-                          ignore_patterns=[
-                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
-                              "*.msgpack", "*.safetensors"
-                          ])
+        snapshot_download(
+            repo_id="BAAI/bge-multilingual-gemma2",
+            local_dir=_dummy_gemma2_embedding_path,
+            ignore_patterns=[
+                "*.bin",
+                "*.bin.index.json",
+                "*.pt",
+                "*.h5",
+                "*.msgpack",
+                "*.safetensors",
+            ],
+        )
         assert os.path.exists(json_path)
         with open(json_path) as f:
             config = json.load(f)
@@ -1112,10 +1244,9 @@ def dummy_gemma2_embedding_path():
 # Add the flag `--optional` to allow run tests
 # that are marked with @pytest.mark.optional
 def pytest_addoption(parser):
-    parser.addoption("--optional",
-                     action="store_true",
-                     default=False,
-                     help="run optional test")
+    parser.addoption(
+        "--optional", action="store_true", default=False, help="run optional test"
+    )
 
 
 def pytest_collection_modifyitems(config, items):
@@ -1183,11 +1314,10 @@ def _find_free_port() -> int:
 
 
 class LocalAssetServer:
-
     address: str
     port: int
-    server: Optional[http.server.ThreadingHTTPServer]
-    thread: Optional[threading.Thread]
+    server: http.server.ThreadingHTTPServer | None
+    thread: threading.Thread | None
 
     def __init__(self, address: str = "127.0.0.1") -> None:
         self.address = address
@@ -1198,9 +1328,9 @@ class LocalAssetServer:
     def __enter__(self):
         self.port = _find_free_port()
         self.server = http.server.ThreadingHTTPServer(
-            (self.address, self.port), AssetHandler)
-        self.thread = threading.Thread(target=self.server.serve_forever,
-                                       daemon=True)
+            (self.address, self.port), AssetHandler
+        )
+        self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
         self.thread.start()
         return self
 
@@ -1234,7 +1364,7 @@ class LocalAssetServer:
 @pytest.fixture(scope="session")
 def local_asset_server() -> Generator[LocalAssetServer, None, None]:
     """
-    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port. 
+    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port.
     The server currently servers images at:
     http://127.0.0.1:<port>/<name>.<ext>
     """
@@ -1254,3 +1384,16 @@ def image_urls(request, local_asset_server) -> list[str]:
     """Indirect fixture: takes a list of names, returns list of full URLs."""
     names: list[str] = request.param
     return [local_asset_server.url_for(name) for name in names]
+
+
+@pytest.fixture
+def disable_deepgemm_ue8m0(monkeypatch):
+    from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+    with monkeypatch.context() as monkeypatch_ctx:
+        monkeypatch_ctx.setenv("VLLM_USE_DEEP_GEMM_E8M0", "0")
+        is_deep_gemm_e8m0_used.cache_clear()
+        yield
+        # Clear cache so the next time it is used it is processed with the
+        # default VLLM_USE_DEEP_GEMM_E8M0  setting.
+        is_deep_gemm_e8m0_used.cache_clear()
diff --git a/tests/cuda/test_cuda_context.py b/tests/cuda/test_cuda_context.py
index f973b284b87e1761a6b3a39e0162f36ecda0033f..6336f2112c66e3b35e4e9dffc55a4d23368c3cca 100644
--- a/tests/cuda/test_cuda_context.py
+++ b/tests/cuda/test_cuda_context.py
@@ -13,7 +13,7 @@ from vllm.platforms import current_platform
 def check_cuda_context():
     """Check CUDA driver context status"""
     try:
-        cuda = ctypes.CDLL('libcuda.so')
+        cuda = ctypes.CDLL("libcuda.so")
         device = ctypes.c_int()
         result = cuda.cuCtxGetDevice(ctypes.byref(device))
         return (True, device.value) if result == 0 else (False, None)
@@ -27,9 +27,11 @@ def run_cuda_test_in_thread(device_input, expected_device_id):
         # New thread should have no CUDA context initially
         valid_before, device_before = check_cuda_context()
         if valid_before:
-            return False, \
-                "CUDA context should not exist in new thread, " \
-                f"got device {device_before}"
+            return (
+                False,
+                "CUDA context should not exist in new thread, "
+                f"got device {device_before}",
+            )
 
         # Test setting CUDA context
         current_platform.set_device(device_input)
@@ -39,8 +41,7 @@ def run_cuda_test_in_thread(device_input, expected_device_id):
         if not valid_after:
             return False, "CUDA context should be valid after set_cuda_context"
         if device_id != expected_device_id:
-            return False, \
-                f"Expected device {expected_device_id}, got {device_id}"
+            return False, f"Expected device {expected_device_id}, got {device_id}"
 
         return True, "Success"
     except Exception as e:
@@ -50,30 +51,30 @@ def run_cuda_test_in_thread(device_input, expected_device_id):
 class TestSetCudaContext:
     """Test suite for the set_cuda_context function."""
 
-    @pytest.mark.skipif(not current_platform.is_cuda(),
-                        reason="CUDA not available")
-    @pytest.mark.parametrize(argnames="device_input,expected_device_id",
-                             argvalues=[
-                                 (0, 0),
-                                 (torch.device('cuda:0'), 0),
-                                 ('cuda:0', 0),
-                             ],
-                             ids=["int", "torch_device", "string"])
-    def test_set_cuda_context_parametrized(self, device_input,
-                                           expected_device_id):
+    @pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
+    @pytest.mark.parametrize(
+        argnames="device_input,expected_device_id",
+        argvalues=[
+            (0, 0),
+            (torch.device("cuda:0"), 0),
+            ("cuda:0", 0),
+        ],
+        ids=["int", "torch_device", "string"],
+    )
+    def test_set_cuda_context_parametrized(self, device_input, expected_device_id):
         """Test setting CUDA context in isolated threads."""
         with ThreadPoolExecutor(max_workers=1) as executor:
-            future = executor.submit(run_cuda_test_in_thread, device_input,
-                                     expected_device_id)
+            future = executor.submit(
+                run_cuda_test_in_thread, device_input, expected_device_id
+            )
             success, message = future.result(timeout=30)
         assert success, message
 
-    @pytest.mark.skipif(not current_platform.is_cuda(),
-                        reason="CUDA not available")
+    @pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
     def test_set_cuda_context_invalid_device_type(self):
         """Test error handling for invalid device type."""
         with pytest.raises(ValueError, match="Expected a cuda device"):
-            current_platform.set_device(torch.device('cpu'))
+            current_platform.set_device(torch.device("cpu"))
 
 
 if __name__ == "__main__":
diff --git a/tests/detokenizer/test_disable_detokenization.py b/tests/detokenizer/test_disable_detokenization.py
index ae06a985c7ecdc86cf4a9ca895b5e4cff89fb5d8..a77626df5dc78ab9a506776a459066e76d2cc2ea 100644
--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@@ -17,20 +17,16 @@ def test_computed_prefix_blocks(model: str):
     prompt = (
         "You are a helpful assistant. How do I build a car from cardboard and "
         "paper clips? Is there an easy to follow video tutorial available "
-        "online for free?")
+        "online for free?"
+    )
 
     llm = LLM(model=model)
-    sampling_params = SamplingParams(max_tokens=10,
-                                     temperature=0.0,
-                                     detokenize=False)
+    sampling_params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False)
 
-    outputs_no_detokenization = llm.generate(prompt,
-                                             sampling_params)[0].outputs[0]
+    outputs_no_detokenization = llm.generate(prompt, sampling_params)[0].outputs[0]
     sampling_params.detokenize = True
-    outputs_with_detokenization = llm.generate(prompt,
-                                               sampling_params)[0].outputs[0]
+    outputs_with_detokenization = llm.generate(prompt, sampling_params)[0].outputs[0]
 
-    assert outputs_no_detokenization.text == ''
-    assert outputs_with_detokenization.text != ''
-    assert outputs_no_detokenization.token_ids == \
-        outputs_with_detokenization.token_ids
+    assert outputs_no_detokenization.text == ""
+    assert outputs_with_detokenization.text != ""
+    assert outputs_no_detokenization.token_ids == outputs_with_detokenization.token_ids
diff --git a/tests/detokenizer/test_min_tokens.py b/tests/detokenizer/test_min_tokens.py
index 26003373c569c53b9fd5a03523e453d060a3d736..1f8e944695bdc8438518fcb9f85120db509fb33c 100644
--- a/tests/detokenizer/test_min_tokens.py
+++ b/tests/detokenizer/test_min_tokens.py
@@ -8,15 +8,17 @@ from vllm import SamplingParams
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.detokenizer import FastIncrementalDetokenizer
 
-PROMPT = "Hello, my name is Lee, and I'm a student in the " + \
-         "college of engineering"
+PROMPT = "Hello, my name is Lee, and I'm a student in the " + "college of engineering"
 
 
-@pytest.mark.parametrize("min_tokens,stop,truth", [
-    (0, None, " is Lee, and I'm a student in the college of engineering"),
-    (0, "e", " is L"),
-    (5, "e", " is Lee, and I'm a stud"),
-])
+@pytest.mark.parametrize(
+    "min_tokens,stop,truth",
+    [
+        (0, None, " is Lee, and I'm a student in the college of engineering"),
+        (0, "e", " is L"),
+        (5, "e", " is Lee, and I'm a stud"),
+    ],
+)
 def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str):
     """Test for a specific min_tokens and stop.
 
@@ -31,16 +33,18 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str):
         stop=stop,
         min_tokens=min_tokens,
     )
-    request = EngineCoreRequest(request_id="",
-                                prompt_token_ids=prompt_token_ids,
-                                mm_features=None,
-                                sampling_params=params,
-                                pooling_params=None,
-                                eos_token_id=None,
-                                arrival_time=0.0,
-                                lora_request=None,
-                                cache_salt=None,
-                                data_parallel_rank=None)
+    request = EngineCoreRequest(
+        request_id="",
+        prompt_token_ids=prompt_token_ids,
+        mm_features=None,
+        sampling_params=params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
 
     detokenizer = FastIncrementalDetokenizer(tokenizer, request)
 
diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py
index 1ff679789c9595cd75c484eb748e0a3aabf61a6f..6565949cc50fcc89004f797296bfacecfb021fdb 100644
--- a/tests/detokenizer/test_stop_reason.py
+++ b/tests/detokenizer/test_stop_reason.py
@@ -31,34 +31,39 @@ def test_stop_reason(vllm_model, example_prompts):
     llm = vllm_model.llm
 
     # test stop token
-    outputs = llm.generate(example_prompts,
-                           sampling_params=SamplingParams(
-                               ignore_eos=True,
-                               seed=SEED,
-                               max_tokens=MAX_TOKENS,
-                               stop_token_ids=[stop_token_id]))
+    outputs = llm.generate(
+        example_prompts,
+        sampling_params=SamplingParams(
+            ignore_eos=True,
+            seed=SEED,
+            max_tokens=MAX_TOKENS,
+            stop_token_ids=[stop_token_id],
+        ),
+    )
     for output in outputs:
         output = output.outputs[0]
         assert output.finish_reason == "stop"
         assert output.stop_reason == stop_token_id
 
     # test stop string
-    outputs = llm.generate(example_prompts,
-                           sampling_params=SamplingParams(
-                               ignore_eos=True,
-                               seed=SEED,
-                               max_tokens=MAX_TOKENS,
-                               stop="."))
+    outputs = llm.generate(
+        example_prompts,
+        sampling_params=SamplingParams(
+            ignore_eos=True, seed=SEED, max_tokens=MAX_TOKENS, stop="."
+        ),
+    )
     for output in outputs:
         output = output.outputs[0]
         assert output.finish_reason == "stop"
         assert output.stop_reason == STOP_STR
 
     # test EOS token
-    outputs = llm.generate(example_prompts,
-                           sampling_params=SamplingParams(
-                               seed=SEED, max_tokens=MAX_TOKENS))
+    outputs = llm.generate(
+        example_prompts,
+        sampling_params=SamplingParams(seed=SEED, max_tokens=MAX_TOKENS),
+    )
     for output in outputs:
         output = output.outputs[0]
         assert output.finish_reason == "length" or (
-            output.finish_reason == "stop" and output.stop_reason is None)
+            output.finish_reason == "stop" and output.stop_reason is None
+        )
diff --git a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
index 9b32a2927f2de97ff55db68b6b23ec14559cc58e..5624332ef71d63aeef624cf7fa41dbb293d157cf 100644
--- a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
+++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
@@ -14,7 +14,6 @@ def include_stop_str_in_output(request):
 
 
 class _DummyDetokenizer(BaseIncrementalDetokenizer):
-
     def __init__(self, request: EngineCoreRequest):
         super().__init__(request)
 
@@ -27,7 +26,8 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
     params = SamplingParams(
         stop=stop,
         include_stop_str_in_output=include_stop_str_in_output,
-        min_tokens=min_tokens)
+        min_tokens=min_tokens,
+    )
     # Keep other fields minimal for unit test purposes.
     req = EngineCoreRequest(
         request_id="test",
@@ -44,26 +44,25 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
     return req
 
 
-def test_stop_string_while_stop_token_terminates(
-        include_stop_str_in_output: bool):
+def test_stop_string_while_stop_token_terminates(include_stop_str_in_output: bool):
     """
     This test verifies that the detokenizer correctly handles the case where
     the generated token sequence contains both:
     - a stop token
     - an <eos> token
-    
+
     The detokenizer should respect the stop string and truncate the output
     accordingly.
-    
+
     Imagine the following sequence:
     - "abcdeZ" is generated, where "Z" is the <eos> token.
     - "cd" is the stop string.
-    
+
     If include_stop_str_in_output=False, the detokenizer should truncate the
     output to "ab" because the stop string "cd" is excluded.
     If include_stop_str_in_output=True, the detokenizer should include the stop
     string "cd" in the output, resulting in "abcd".
-    
+
 
     This verifies the behavioral change introduced in BaseIncrementalDetokenizer
     where stop-string evaluation occurs before the early-return on
@@ -78,8 +77,9 @@ def test_stop_string_while_stop_token_terminates(
     token_ids = [ord(c) for c in generated_text]
 
     # Create a request with the stop string and initialize the detokenizer.
-    req = _make_request(stop=[stop_string],
-                        include_stop_str_in_output=include_stop_str_in_output)
+    req = _make_request(
+        stop=[stop_string], include_stop_str_in_output=include_stop_str_in_output
+    )
     detok = _DummyDetokenizer(req)
 
     # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
@@ -99,5 +99,4 @@ def test_stop_string_while_stop_token_terminates(
 
     # get_next_output_text should return the full text when finished=True.
     # (Buffering only applies during streaming when finished=False.)
-    assert detok.get_next_output_text(finished=True,
-                                      delta=False) == expected_text
+    assert detok.get_next_output_text(finished=True, delta=False) == expected_text
diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py
index 46f7d58c438cbe949ff9d4fd10ed49d27e7feb0b..6b829c26103595d64f8828d148caa1d1ec79ae1c 100644
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -1,22 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 
-from vllm import LLM, SamplingParams, envs
+from vllm import LLM, SamplingParams
 
 MODEL = "meta-llama/llama-2-7b-hf"
 MAX_TOKENS = 200
 
 
-def _test_stopping(llm: LLM,
-                   expected_output: str,
-                   expected_reason: Any,
-                   stop: Optional[list[str]] = None,
-                   stop_token_ids: Optional[list[int]] = None,
-                   include_in_output: bool = False) -> None:
+def _test_stopping(
+    llm: LLM,
+    expected_output: str,
+    expected_reason: Any,
+    stop: list[str] | None = None,
+    stop_token_ids: list[int] | None = None,
+    include_in_output: bool = False,
+) -> None:
     output = llm.generate(
         "A story about vLLM:\n",
         SamplingParams(
@@ -25,7 +27,8 @@ def _test_stopping(llm: LLM,
             stop=stop,
             stop_token_ids=stop_token_ids,
             include_stop_str_in_output=include_in_output,
-        ))[0].outputs[0]
+        ),
+    )[0].outputs[0]
 
     assert output is not None
     assert output.text == expected_output
@@ -33,17 +36,21 @@ def _test_stopping(llm: LLM,
 
 
 def _stop_basic(llm):
-    _test_stopping(llm,
-                   stop=["."],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=".")
+    _test_stopping(
+        llm,
+        stop=["."],
+        include_in_output=False,
+        expected_output="VLLM is a 100% volunteer organization",
+        expected_reason=".",
+    )
 
-    _test_stopping(llm,
-                   stop=["."],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organization.",
-                   expected_reason=".")
+    _test_stopping(
+        llm,
+        stop=["."],
+        include_in_output=True,
+        expected_output="VLLM is a 100% volunteer organization.",
+        expected_reason=".",
+    )
 
 
 def _stop_multi_tokens(llm):
@@ -52,52 +59,59 @@ def _stop_multi_tokens(llm):
         stop=["group of peo", "short"],
         include_in_output=False,
         expected_output="VLLM is a 100% volunteer organization. We are a ",
-        expected_reason="group of peo")
+        expected_reason="group of peo",
+    )
 
     _test_stopping(
         llm,
         stop=["group of peo", "short"],
         include_in_output=True,
-        expected_output=
-        "VLLM is a 100% volunteer organization. We are a group of peo",
-        expected_reason="group of peo")
+        expected_output="VLLM is a 100% volunteer organization. We are a group of peo",
+        expected_reason="group of peo",
+    )
 
 
 def _stop_partial_token(llm):
-    _test_stopping(llm,
-                   stop=["gani"],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer or",
-                   expected_reason="gani")
+    _test_stopping(
+        llm,
+        stop=["gani"],
+        include_in_output=False,
+        expected_output="VLLM is a 100% volunteer or",
+        expected_reason="gani",
+    )
 
-    _test_stopping(llm,
-                   stop=["gani"],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organi",
-                   expected_reason="gani")
+    _test_stopping(
+        llm,
+        stop=["gani"],
+        include_in_output=True,
+        expected_output="VLLM is a 100% volunteer organi",
+        expected_reason="gani",
+    )
 
 
 def _stop_token_id(llm):
     # token id 13013 => " organization"
 
-    _test_stopping(llm,
-                   stop_token_ids=[13013],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer",
-                   expected_reason=13013)
+    _test_stopping(
+        llm,
+        stop_token_ids=[13013],
+        include_in_output=False,
+        expected_output="VLLM is a 100% volunteer",
+        expected_reason=13013,
+    )
 
-    _test_stopping(llm,
-                   stop_token_ids=[13013],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=13013)
+    _test_stopping(
+        llm,
+        stop_token_ids=[13013],
+        include_in_output=True,
+        expected_output="VLLM is a 100% volunteer organization",
+        expected_reason=13013,
+    )
 
 
 @pytest.mark.skip_global_cleanup
 def test_stop_strings():
-    # If V0, must set enforce_eager=False since we use
-    # async output processing below.
-    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+    llm = LLM(MODEL, enforce_eager=True)
 
     _stop_basic(llm)
     _stop_multi_tokens(llm)
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
index 7dc4a0cc3d58249d80cfc14c433bafc5979081f3..9c146a3323d909a74be704b0b5403b27b237a8e4 100644
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Optional, Union
 
 import msgspec
 import msgspec.msgpack
@@ -78,8 +77,8 @@ class MockSubscriber:
 
     def __init__(
         self,
-        pub_endpoints: Union[str, list[str]],
-        replay_endpoints: Optional[Union[str, list[str]]] = None,
+        pub_endpoints: str | list[str],
+        replay_endpoints: str | list[str] | None = None,
         topic: str = "",
         decode_type=SampleBatch,
     ):
@@ -111,8 +110,7 @@ class MockSubscriber:
         self.last_seq = -1
         self.decoder = msgspec.msgpack.Decoder(type=decode_type)
 
-    def receive_one(self,
-                    timeout=1000) -> Union[tuple[int, SampleBatch], None]:
+    def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
         """Receive a single message with timeout"""
         if not self.sub.poll(timeout):
             return None
@@ -135,8 +133,7 @@ class MockSubscriber:
 
         self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))
 
-    def receive_replay(self,
-                       socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
+    def receive_replay(self, socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
         """Receive replayed messages from a specific replay socket"""
         if not self.replay_sockets:
             raise ValueError("Replay sockets not initialized")
diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py
index e2de462612b47e8b1b27402d991ac0d3c6850d4e..1ddce64f8e6149c6f731a93898b4d8dff6555246 100644
--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -12,7 +12,8 @@ import torch.distributed as dist
 
 from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.distributed.device_communicators.custom_all_reduce import (  # noqa
-    CustomAllreduce)
+    CustomAllreduce,
+)
 
 # create a cpu process group for communicating metadata (ipc handle)
 dist.init_process_group(backend="gloo")
@@ -52,7 +53,8 @@ for p in pointers:
         assert ord(host_data[i]) == byte_value, (
             f"Rank {rank} failed"
             f" to verify buffer {p}. Expected {byte_value}, "
-            f"got {ord(host_data[i])}")
+            f"got {ord(host_data[i])}"
+        )
 
 print(f"Rank {rank} verified all buffers")
 
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index 8d84cc2d0ffe6dca38a01234ac4a88738e8620f1..ba80ee6fb83babb79cfc3f0bddec2c71520890f1 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -5,21 +5,26 @@
 Run `pytest tests/distributed/test_comm_ops.py`.
 """
 
-from __future__ import annotations
-
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 import ray
 import torch
 
-from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
-                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce,
-                              tensor_model_parallel_reduce_scatter)
+from vllm.distributed import (
+    broadcast_tensor_dict,
+    get_pp_group,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+    tensor_model_parallel_reduce_scatter,
+)
 
-from ..utils import (init_test_distributed_environment, multi_gpu_test,
-                     multi_process_parallel)
+from ..utils import (
+    init_test_distributed_environment,
+    multi_gpu_test,
+    multi_process_parallel,
+)
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -37,12 +42,11 @@ def all_reduce_test_worker(
 
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
     num_elements = 8
     all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
-        (r + 1) for r in range(tp_size)
+        torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
+        for r in range(tp_size)
     ]
     expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
     t = all_tensors[rank % tp_size]
@@ -51,28 +55,31 @@ def all_reduce_test_worker(
 
 
 @ray.remote(num_gpus=1, max_calls=1)
-def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
-                               pp_size: int, rank: int,
-                               distributed_init_port: str):
+def reduce_scatter_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
     monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
 
     num_elements = 8
     all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
-        (r + 1) for r in range(tp_size)
+        torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
+        for r in range(tp_size)
     ]
 
     index = rank % tp_size
     partition_size = num_elements // tp_size
     all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    expected = all_reduce[index * partition_size:(index + 1) * partition_size]
+    expected = all_reduce[index * partition_size : (index + 1) * partition_size]
     t = all_tensors[index]
     t = tensor_model_parallel_reduce_scatter(t, 0)
     torch.testing.assert_close(t, expected)
@@ -92,8 +99,7 @@ def all_gather_test_worker(
     monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
     num_dimensions = 3
     tensor_size = list(range(2, num_dimensions + 2))
     total_size = 1
@@ -101,8 +107,10 @@ def all_gather_test_worker(
         total_size *= s
     for all_gather_dimension in range(num_dimensions):
         all_tensors = [
-            torch.arange(total_size, dtype=torch.float32,
-                         device="cuda").reshape(tensor_size) * (r + 1)
+            torch.arange(total_size, dtype=torch.float32, device="cuda").reshape(
+                tensor_size
+            )
+            * (r + 1)
             for r in range(tp_size)
         ]
         expected = torch.cat(all_tensors, dim=all_gather_dimension)
@@ -125,8 +133,7 @@ def broadcast_tensor_dict_test_worker(
     monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
     test_dict = {
         # device tensor
         "a": torch.arange(8, dtype=torch.float32, device="cuda"),
@@ -134,10 +141,7 @@ def broadcast_tensor_dict_test_worker(
         "b": torch.arange(16, dtype=torch.int8, device="cpu"),
         "c": "test",
         "d": [1, 2, 3],
-        "e": {
-            "a": 1,
-            "b": 2
-        },
+        "e": {"a": 1, "b": 2},
         # empty tensor
         "f": torch.tensor([], dtype=torch.float32, device="cuda"),
     }
@@ -166,8 +170,7 @@ def send_recv_tensor_dict_test_worker(
     monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
 
     test_dict = {
         # device tensor
@@ -176,10 +179,7 @@ def send_recv_tensor_dict_test_worker(
         "b": torch.arange(16, dtype=torch.int8, device="cpu"),
         "c": "test",
         "d": [1, 2, 3],
-        "e": {
-            "a": 1,
-            "b": 2
-        },
+        "e": {"a": 1, "b": 2},
         # empty tensor
         "f": torch.tensor([], dtype=torch.float32, device="cuda"),
     }
@@ -211,8 +211,7 @@ def send_recv_test_worker(
     monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
 
     size = 64
     test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
@@ -229,10 +228,10 @@ def send_recv_test_worker(
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("test_target", [
-    all_reduce_test_worker, all_gather_test_worker,
-    broadcast_tensor_dict_test_worker
-])
+@pytest.mark.parametrize(
+    "test_target",
+    [all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker],
+)
 def test_multi_process_tensor_parallel(
     monkeypatch: pytest.MonkeyPatch,
     tp_size: int,
@@ -244,7 +243,8 @@ def test_multi_process_tensor_parallel(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("pp_size", [2])
 @pytest.mark.parametrize(
-    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
+    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]
+)
 def test_multi_process_pipeline_parallel(
     monkeypatch: pytest.MonkeyPatch,
     pp_size: int,
@@ -256,11 +256,16 @@ def test_multi_process_pipeline_parallel(
 @multi_gpu_test(num_gpus=4)
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pp_size", [2])
-@pytest.mark.parametrize("test_target", [
-    send_recv_test_worker, send_recv_tensor_dict_test_worker,
-    all_reduce_test_worker, all_gather_test_worker,
-    broadcast_tensor_dict_test_worker
-])
+@pytest.mark.parametrize(
+    "test_target",
+    [
+        send_recv_test_worker,
+        send_recv_tensor_dict_test_worker,
+        all_reduce_test_worker,
+        all_gather_test_worker,
+        broadcast_tensor_dict_test_worker,
+    ],
+)
 def test_multi_process_tensor_parallel_pipeline_parallel(
     tp_size: int,
     pp_size: int,
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 11685bc90c41310370c275b21508d4a314ccd8be..b16fd0d06b14539e3f0a1b9571bf5e4da5e93dea 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -7,14 +7,16 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
  all workers in a node other than the head node, which can cause the test
  to fail.
 """
+
 import json
 import os
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
+import torch
 
-from vllm.config import RunnerOption
+from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
 
 from ..models.registry import HF_EXAMPLE_MODELS
@@ -29,44 +31,35 @@ class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
     dcp_size: int
+    dcp_kv_cache_interleave_size: int
     eager_mode: bool
     chunked_prefill: bool
 
 
 class CPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: Optional[str] = None
+    load_format: str | None = None
+    attn_backend: str | None = None
 
 
 @dataclass
 class CPTestSettings:
     parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
     distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
     runner: RunnerOption
     test_options: CPTestOptions
 
-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})")
-
     @staticmethod
     def detailed(
         *,
         tp_base: int = 4,
         pp_base: int = 1,
         dcp_base: int = 1,
+        dcp_kv_cache_interleave_size: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
+        attn_backend: str | None = None,
     ):
         parallel_setups = []
         for eager_mode_val in [False]:
@@ -74,36 +67,44 @@ class CPTestSettings:
                 for dcp_multiplier in [0.5, 1]:
                     for chunked_prefill_val in [True]:
                         parallel_setups.append(
-                            ParallelSetup(tp_size=tp_base,
-                                          pp_size=pp_multiplier * pp_base,
-                                          dcp_size=int(dcp_multiplier *
-                                                       tp_base),
-                                          eager_mode=eager_mode_val,
-                                          chunked_prefill=chunked_prefill_val))
+                            ParallelSetup(
+                                tp_size=tp_base,
+                                pp_size=pp_multiplier * pp_base,
+                                dcp_size=int(dcp_multiplier * tp_base),
+                                dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+                                eager_mode=eager_mode_val,
+                                chunked_prefill=chunked_prefill_val,
+                            )
+                        )
         return CPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp"],
-            vllm_major_versions=["1"],
             runner=runner,
-            test_options=CPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=CPTestOptions(
+                multi_node_only=multi_node_only,
+                load_format=load_format,
+                attn_backend=attn_backend,
+            ),
         )
 
     def iter_params(self, model_id: str):
         opts = self.test_options
 
         for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(self.distributed_backends,
-                                                   self.vllm_major_versions):
-                yield (model_id, parallel_setup, backend, vllm_major_version,
-                       self.runner, opts)
+            for backend in self.distributed_backends:
+                yield (
+                    model_id,
+                    parallel_setup,
+                    backend,
+                    self.runner,
+                    opts,
+                )
 
 
 def _compare_cp_with_tp(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: CPTestOptions,
     num_gpus_available: int,
@@ -115,11 +116,12 @@ def _compare_cp_with_tp(
         tp_size,
         pp_size,
         dcp_size,
+        dcp_kv_cache_interleave_size,
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
 
-    multi_node_only, load_format = test_options
+    multi_node_only, load_format, attn_backend = test_options
 
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_transformers_version(on_fail="skip")
@@ -148,8 +150,10 @@ def _compare_cp_with_tp(
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
     if VLLM_MULTI_NODE and distributed_backend == "mp":
-        pytest.skip("Skipping multi-node pipeline parallel test for "
-                    "multiprocessing distributed backend")
+        pytest.skip(
+            "Skipping multi-node pipeline parallel test for "
+            "multiprocessing distributed backend"
+        )
     if multi_node_only and not VLLM_MULTI_NODE:
         pytest.skip("Not in multi-node setting")
 
@@ -177,10 +181,12 @@ def _compare_cp_with_tp(
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
-    cp_env = tp_env = {
-        "VLLM_USE_V1":
-        vllm_major_version,  # Note(hc): DCP only support V1 engine only
-    }
+    if not attn_backend:
+        cp_env = tp_env = {}
+    else:
+        cp_env = tp_env = {
+            "VLLM_ATTENTION_BACKEND": attn_backend,
+        }
 
     cp_args = [
         *common_args,
@@ -190,6 +196,8 @@ def _compare_cp_with_tp(
         str(pp_size),
         "--decode-context-parallel-size",
         str(dcp_size),
+        "--dcp-kv-cache-interleave-size",
+        str(dcp_kv_cache_interleave_size),
         "--distributed-executor-backend",
         distributed_backend,
     ]
@@ -204,44 +212,50 @@ def _compare_cp_with_tp(
         distributed_backend,
     ]
 
-    try:
-        compare_two_settings(model_id,
-                             cp_args,
-                             tp_args,
-                             cp_env,
-                             tp_env,
-                             method=method,
-                             max_wait_seconds=720)
-    except Exception:
-        testing_ray_compiled_graph = cp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(
+        model_id,
+        cp_args,
+        tp_args,
+        cp_env,
+        tp_env,
+        method=method,
+        max_wait_seconds=720,
+    )
 
 
 CP_TEXT_GENERATION_MODELS = {
-    # [MLA attention only]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat":
-    [CPTestSettings.detailed(),
-     CPTestSettings.detailed(tp_base=2)],
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        CPTestSettings.detailed(),
+        CPTestSettings.detailed(tp_base=2),
+        CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64),
+    ],
+    "bigcode/gpt_bigcode-santacoder": [
+        CPTestSettings.detailed(),
+        CPTestSettings.detailed(tp_base=2),
+    ],
 }
 
 CP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
     "deepseek-ai/DeepSeek-V2-Lite-Chat",
+    "bigcode/gpt_bigcode-santacoder",
 ]
 
 
 @pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "runner", "test_options"),
+    (
+        "model_id",
+        "parallel_setup",
+        "distributed_backend",
+        "runner",
+        "test_options",
+    ),
     [
-        params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
-        for setting in settings for params in setting.iter_params(model_id)
+        params
+        for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
+        for setting in settings
+        for params in setting.iter_params(model_id)
         if model_id in CP_TEST_MODELS
     ],
 )
@@ -250,17 +264,28 @@ def test_cp_generation(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: CPTestOptions,
     num_gpus_available,
 ):
-    _compare_cp_with_tp(model_id,
-                        parallel_setup,
-                        distributed_backend,
-                        vllm_major_version,
-                        runner,
-                        test_options,
-                        num_gpus_available,
-                        method="generate",
-                        is_multimodal=False)
+    if (
+        model_id == "deepseek-ai/DeepSeek-V2-Lite-Chat"
+        and torch.cuda.get_device_capability() < (9, 0)
+    ):
+        pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
+    if (
+        model_id == "bigcode/gpt_bigcode-santacoder"
+        and torch.cuda.get_device_capability() != (9, 0)
+    ):
+        pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
+
+    _compare_cp_with_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=False,
+    )
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 9212c04deec90f156c1502c61b0a544fcff302e5..f6e274be938474de07da4d8ce16d6dedeb2a0f3c 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -8,12 +8,14 @@ import ray
 import torch
 import torch.distributed as dist
 
-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.parallel_state import get_tp_group, graph_capture
 
-from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment, multi_process_parallel)
+from ..utils import (
+    ensure_model_parallel_initialized,
+    init_test_distributed_environment,
+    multi_process_parallel,
+)
 
 random.seed(42)
 test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
@@ -33,8 +35,7 @@ def graph_allreduce(
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
         ensure_model_parallel_initialized(tp_size, pp_size)
         group = get_tp_group().device_group
 
@@ -60,18 +61,15 @@ def graph_allreduce(
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
                 with graph_capture(device=device) as graph_capture_context:
                     # use integers so result matches NCCL exactly
-                    inp1 = torch.randint(1,
-                                         16, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
-                    inp2 = torch.randint(1,
-                                         16, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
+                    inp1 = torch.randint(
+                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    inp2 = torch.randint(
+                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
                     torch.cuda.synchronize()
                     graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph,
-                                          stream=graph_capture_context.stream):
+                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                         for i in range(num_communication):
                             out1 = tensor_model_parallel_all_reduce(inp1)
                             # the input buffer is immediately modified to test
@@ -96,8 +94,7 @@ def eager_allreduce(
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
 
         # we use the first group to communicate once
         # and the second group to communicate twice
@@ -132,5 +129,4 @@ def test_custom_allreduce(
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
-                           test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py
index b93696e4be0e1ed1405e971dfabc250061fcaf98..ea7a88abda245cf721504405d8761587dab91469 100644
--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from ..entrypoints.openai.test_oot_registration import (
-    run_and_test_dummy_opt_api_server)
+from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server
 
 
 def test_distributed_oot(dummy_opt_path: str):
diff --git a/tests/distributed/test_eplb_algo.py b/tests/distributed/test_eplb_algo.py
index e47ccba99c81d26646fd691d94900f4c594df30b..79805a7cce53b56bc9e19d667e9935a6634a8e41 100644
--- a/tests/distributed/test_eplb_algo.py
+++ b/tests/distributed/test_eplb_algo.py
@@ -10,10 +10,12 @@ from vllm.distributed.eplb.rebalance_algo import rebalance_experts
 def test_basic_rebalance():
     """Test basic rebalancing functionality"""
     # Example from https://github.com/deepseek-ai/eplb
-    weight = torch.tensor([
-        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
-        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
-    ])
+    weight = torch.tensor(
+        [
+            [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
+            [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
+        ]
+    )
 
     num_layers = weight.shape[0]
     num_replicas = 16
@@ -21,45 +23,49 @@ def test_basic_rebalance():
     num_nodes = 2
     num_gpus = 8
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Verify output shapes
     assert phy2log.shape == (
         2,
         16,
     ), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
-    assert (log2phy.shape[0] == 2
-            ), f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
-    assert (
-        log2phy.shape[1] == 12
-    ), f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
+    assert log2phy.shape[0] == 2, (
+        f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
+    )
+    assert log2phy.shape[1] == 12, (
+        f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
+    )
     assert logcnt.shape == (
         2,
         12,
     ), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"
 
     # Verify physical to logical expert mapping range is correct
-    assert torch.all(phy2log >= 0) and torch.all(
-        phy2log < 12), "Physical to logical mapping should be in range [0, 12)"
+    assert torch.all(phy2log >= 0) and torch.all(phy2log < 12), (
+        "Physical to logical mapping should be in range [0, 12)"
+    )
 
     # Verify expert count reasonableness
-    assert torch.all(
-        logcnt >= 1), "Each logical expert should have at least 1 replica"
-    assert (
-        torch.sum(logcnt, dim=1).sum() == num_replicas *
-        num_layers), f"Total replicas should be {num_replicas * num_layers}"
+    assert torch.all(logcnt >= 1), "Each logical expert should have at least 1 replica"
+    assert torch.sum(logcnt, dim=1).sum() == num_replicas * num_layers, (
+        f"Total replicas should be {num_replicas * num_layers}"
+    )
 
     # Verify expected output
-    expected_phy2log = torch.tensor([
-        [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
-        [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
-    ])
+    expected_phy2log = torch.tensor(
+        [
+            [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
+            [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
+        ]
+    )
     assert torch.all(phy2log == expected_phy2log)
 
-    expected_logcnt = torch.tensor([[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1],
-                                    [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]])
+    expected_logcnt = torch.tensor(
+        [[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]]
+    )
     assert torch.all(logcnt == expected_logcnt)
 
 
@@ -71,9 +77,9 @@ def test_single_gpu_case():
     num_nodes = 1
     num_gpus = 1
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Verify shapes
     assert phy2log.shape == (1, 4)
@@ -93,19 +99,19 @@ def test_equal_weights():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Verify shapes
     assert phy2log.shape == (1, 8)
     assert logcnt.shape == (1, 8)
 
     # With equal weights, each expert should have exactly one replica
-    assert torch.all(
-        logcnt == 1
-    ), "With equal weights and no replication, " \
-       "each expert should have exactly 1 replica"
+    assert torch.all(logcnt == 1), (
+        "With equal weights and no replication, "
+        "each expert should have exactly 1 replica"
+    )
 
 
 def test_extreme_weight_imbalance():
@@ -116,35 +122,37 @@ def test_extreme_weight_imbalance():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Verify shapes
     assert phy2log.shape == (1, 12)
     assert logcnt.shape == (1, 8)
 
     # Expert with highest weight (index 0) should have more replicas
-    assert (
-        logcnt[0, 0]
-        > logcnt[0, 1]), "Expert with highest weight should have more replicas"
+    assert logcnt[0, 0] > logcnt[0, 1], (
+        "Expert with highest weight should have more replicas"
+    )
 
 
 def test_multiple_layers():
     """Test multiple layers case"""
-    weight = torch.tensor([
-        [10, 20, 30, 40, 50, 60],  # First layer
-        [60, 50, 40, 30, 20, 10],  # Second layer (opposite weight pattern)
-        [25, 25, 25, 25, 25, 25],  # Third layer (equal weights)
-    ])
+    weight = torch.tensor(
+        [
+            [10, 20, 30, 40, 50, 60],  # First layer
+            [60, 50, 40, 30, 20, 10],  # Second layer (opposite weight pattern)
+            [25, 25, 25, 25, 25, 25],  # Third layer (equal weights)
+        ]
+    )
     num_replicas = 8
     num_groups = 2
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Verify shapes
     assert phy2log.shape == (3, 8)
@@ -152,12 +160,12 @@ def test_multiple_layers():
 
     # Verify expert allocation is reasonable for each layer
     for layer in range(3):
-        assert torch.all(phy2log[layer] >= 0) and torch.all(
-            phy2log[layer] < 6
-        ), f"Layer {layer} physical to logical mapping" \
-            "should be in range [0, 6)"
-        assert (torch.sum(logcnt[layer]) == num_replicas
-                ), f"Layer {layer} total replicas should be {num_replicas}"
+        assert torch.all(phy2log[layer] >= 0) and torch.all(phy2log[layer] < 6), (
+            f"Layer {layer} physical to logical mappingshould be in range [0, 6)"
+        )
+        assert torch.sum(logcnt[layer]) == num_replicas, (
+            f"Layer {layer} total replicas should be {num_replicas}"
+        )
 
 
 def test_parameter_validation():
@@ -179,17 +187,19 @@ def test_parameter_validation():
 
 def test_small_scale_hierarchical():
     """Test small-scale hierarchical load balancing"""
-    weight = torch.tensor([
-        [100, 50, 200, 75, 150, 25, 300, 80],  # 8 experts
-    ])
+    weight = torch.tensor(
+        [
+            [100, 50, 200, 75, 150, 25, 300, 80],  # 8 experts
+        ]
+    )
     num_replicas = 12
     num_groups = 4  # 4 groups, 2 experts each
     num_nodes = 2  # 2 nodes
     num_gpus = 4  # 4 GPUs
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Verify basic constraints
     assert phy2log.shape == (1, 12)
@@ -199,8 +209,9 @@ def test_small_scale_hierarchical():
 
     # Expert with highest weight should have more replicas
     max_weight_expert = torch.argmax(weight[0])
-    assert (logcnt[0, max_weight_expert]
-            >= 2), "Highest weight expert should have multiple replicas"
+    assert logcnt[0, max_weight_expert] >= 2, (
+        "Highest weight expert should have multiple replicas"
+    )
 
 
 def test_global_load_balance_fallback():
@@ -213,9 +224,9 @@ def test_global_load_balance_fallback():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Should work normally, just using global load balancing strategy
     assert phy2log.shape == (1, 8)
@@ -235,9 +246,9 @@ def test_device_compatibility(device):
     num_nodes = 1
     num_gpus = 2
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
 
     # Function will convert to CPU internally, but should handle different
     # device inputs normally
@@ -250,7 +261,8 @@ def test_additional_cases():
 
     # Test case 1: Large-scale distributed setup
     weight1 = torch.tensor(
-        [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]])
+        [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
+    )
     phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)
 
     assert phy2log1.shape == (1, 24)
@@ -258,10 +270,12 @@ def test_additional_cases():
     assert torch.sum(logcnt1) == 24
 
     # Test case 2: Different weight distributions
-    weight2 = torch.tensor([
-        [200, 150, 100, 50, 25, 12],  # Decreasing weights
-        [12, 25, 50, 100, 150, 200],  # Increasing weights
-    ])
+    weight2 = torch.tensor(
+        [
+            [200, 150, 100, 50, 25, 12],  # Decreasing weights
+            [12, 25, 50, 100, 150, 200],  # Increasing weights
+        ]
+    )
     phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)
 
     assert phy2log2.shape == (2, 10)
@@ -274,19 +288,21 @@ def test_additional_cases():
 
 
 if __name__ == "__main__":
-    weight = torch.tensor([
-        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
-        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
-    ])
+    weight = torch.tensor(
+        [
+            [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
+            [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
+        ]
+    )
 
     num_replicas = 16
     num_groups = 4
     num_nodes = 2
     num_gpus = 8
 
-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
     print(phy2log)
 
     test_basic_rebalance()
diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py
index de9ed1eabbac67d49ed9c5b82672289a7d1bb4d0..7b45ae82c72d4f87ad2e1ebc4d08c1b560830d60 100644
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -9,12 +9,13 @@ import pytest
 import torch
 import torch.distributed
 
-from vllm.distributed.eplb.rebalance_execute import (
-    rearrange_expert_weights_inplace)
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             get_tp_group,
-                                             init_distributed_environment)
-from vllm.utils import update_environment_variables
+from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized,
+    get_tp_group,
+    init_distributed_environment,
+)
+from vllm.utils.system_utils import update_environment_variables
 
 
 def distributed_run(fn, world_size):
@@ -22,13 +23,13 @@ def distributed_run(fn, world_size):
     processes: list[multiprocessing.Process] = []
     for i in range(number_of_processes):
         env: dict[str, str] = {}
-        env['RANK'] = str(i)
-        env['LOCAL_RANK'] = str(i)
-        env['WORLD_SIZE'] = str(number_of_processes)
-        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
-        env['MASTER_ADDR'] = 'localhost'
-        env['MASTER_PORT'] = '12345'
-        p = multiprocessing.Process(target=fn, args=(env, ))
+        env["RANK"] = str(i)
+        env["LOCAL_RANK"] = str(i)
+        env["WORLD_SIZE"] = str(number_of_processes)
+        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
+        env["MASTER_ADDR"] = "localhost"
+        env["MASTER_PORT"] = "12345"
+        p = multiprocessing.Process(target=fn, args=(env,))
         processes.append(p)
         p.start()
 
@@ -45,7 +46,7 @@ def worker_fn_wrapper(fn):
     # and update the environment variables in the function
     def wrapped_fn(env):
         update_environment_variables(env)
-        local_rank = os.environ['LOCAL_RANK']
+        local_rank = os.environ["LOCAL_RANK"]
         device = torch.device(f"cuda:{local_rank}")
         torch.cuda.set_device(device)
         init_distributed_environment()
@@ -60,20 +61,20 @@ def worker_fn_wrapper(fn):
 
 
 def create_expert_indices_with_redundancy(
-        num_layers: int,
-        num_logical_experts: int,
-        total_physical_experts: int,
-        redundancy_config: list[int],  # redundancy for each logical expert
+    num_layers: int,
+    num_logical_experts: int,
+    total_physical_experts: int,
+    redundancy_config: list[int],  # redundancy for each logical expert
 ) -> torch.Tensor:
     """
     Create expert indices with redundancy.
-    
+
     Args:
         num_layers: number of layers
         num_logical_experts: number of logical experts
         total_physical_experts: total number of physical experts
         redundancy_config: redundancy for each logical expert
-    
+
     Returns:
         indices: Shape (num_layers, total_physical_experts)
     """
@@ -106,11 +107,11 @@ def create_expert_weights(
 ) -> list[list[torch.Tensor]]:
     """
     Create fake expert weights tensor for testing.
-    
+
     Use `arange` to generate predictable weights values, based on logical
     expert ID.
     All replicas of the same logical expert should have the same weights.
-    
+
     Args:
         physical_to_logical_mapping: Shape (num_layers, num_local_experts)
             mapping[layer, physical_pos] = logical_expert_id
@@ -120,27 +121,27 @@ def create_expert_weights(
     for layer in range(num_layers):
         layer_weights = []
         for weight_idx, hidden_size in enumerate(hidden_sizes):
-            weight_tensor = torch.zeros(num_local_experts,
-                                        hidden_size,
-                                        device=device,
-                                        dtype=torch.float32)
+            weight_tensor = torch.zeros(
+                num_local_experts, hidden_size, device=device, dtype=torch.float32
+            )
 
             for local_expert in range(num_local_experts):
                 # Get the logical expert ID for this physical expert
                 global_pos = rank * num_local_experts + local_expert
                 logical_expert_id = physical_to_logical_mapping[
-                    layer, global_pos].item()
+                    layer, global_pos
+                ].item()
 
                 # Generate weights based on logical expert ID
                 # (so that all replicas of the same logical expert have the
                 # same weights)
-                base_value = (logical_expert_id * 1000 + layer * 100 +
-                              weight_idx * 10)
-                weight_tensor[local_expert] = torch.arange(base_value,
-                                                           base_value +
-                                                           hidden_size,
-                                                           device=device,
-                                                           dtype=torch.float32)
+                base_value = logical_expert_id * 1000 + layer * 100 + weight_idx * 10
+                weight_tensor[local_expert] = torch.arange(
+                    base_value,
+                    base_value + hidden_size,
+                    device=device,
+                    dtype=torch.float32,
+                )
 
             layer_weights.append(weight_tensor)
         expert_weights.append(layer_weights)
@@ -182,12 +183,15 @@ def verify_expert_weights_after_shuffle(
 
                 # Check if the weights are correct
                 actual_weights = weight_tensor[local_expert]
-                expected_base = (expected_logical_expert * 1000 + layer * 100 +
-                                 weight_idx * 10)
-                expected_weights = torch.arange(expected_base,
-                                                expected_base + hidden_size,
-                                                device=actual_weights.device,
-                                                dtype=actual_weights.dtype)
+                expected_base = (
+                    expected_logical_expert * 1000 + layer * 100 + weight_idx * 10
+                )
+                expected_weights = torch.arange(
+                    expected_base,
+                    expected_base + hidden_size,
+                    device=actual_weights.device,
+                    dtype=actual_weights.dtype,
+                )
 
                 torch.testing.assert_close(
                     actual_weights,
@@ -195,7 +199,8 @@ def verify_expert_weights_after_shuffle(
                     msg=f"Layer {layer}, weight {weight_idx},"
                     f"local expert {local_expert}: "
                     f"weights do not match. "
-                    f"Expected logical expert {expected_logical_expert}")
+                    f"Expected logical expert {expected_logical_expert}",
+                )
 
 
 def verify_redundant_experts_have_same_weights(
@@ -222,23 +227,23 @@ def verify_redundant_experts_have_same_weights(
                 total_physical_experts,
                 hidden_size,
                 device=expert_weights[layer][weight_idx].device,
-                dtype=expert_weights[layer][weight_idx].dtype)
+                dtype=expert_weights[layer][weight_idx].dtype,
+            )
 
             # Use all_gather to collect expert weights from current node
             # expert_weights[layer][weight_idx] shape:
             # [num_local_experts, hidden_size]
             local_weights = expert_weights[layer][
-                weight_idx]  # [num_local_experts, hidden_size]
+                weight_idx
+            ]  # [num_local_experts, hidden_size]
 
             # Split tensor along dim 0 into a list for all_gather
-            gathered_weights_list = torch.chunk(gathered_weights,
-                                                world_size,
-                                                dim=0)
+            gathered_weights_list = torch.chunk(gathered_weights, world_size, dim=0)
 
             torch.distributed.all_gather(
                 # Output list: each element corresponds to one rank's weights
                 list(gathered_weights_list),
-                local_weights  # Input: current rank's local weights
+                local_weights,  # Input: current rank's local weights
             )
 
             all_weights.append(gathered_weights)
@@ -266,7 +271,8 @@ def verify_redundant_experts_have_same_weights(
                         msg=f"Layer {layer}, weight {weight_idx},"
                         f"logical expert {logical_expert_id}: "
                         f"Physical expert {physical_pos} has different weights"
-                        f"than expected")
+                        f"than expected",
+                    )
 
 
 @pytest.mark.parametrize(
@@ -290,10 +296,11 @@ def verify_redundant_experts_have_same_weights(
         # 4 GPU, 8 experts per GPU
         # 16 logical experts, 32 physical experts, 16 redundant experts
         (4, 8, 8, 16),
-    ])
-def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
-                                                  num_local_experts,
-                                                  num_logical_experts):
+    ],
+)
+def test_rearrange_expert_weights_with_redundancy(
+    world_size, num_layers, num_local_experts, num_logical_experts
+):
     """Test the functionality of rearranging expert weights with redundancy."""
 
     if torch.cuda.device_count() < world_size:
@@ -304,8 +311,8 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
         # Initialize model parallel (using tensor parallel as an entrypoint
         # to expert parallel)
         ensure_model_parallel_initialized(
-            tensor_model_parallel_size=world_size,
-            pipeline_model_parallel_size=1)
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
         ep_group = get_tp_group().cpu_group
         ep_rank = torch.distributed.get_rank()
@@ -316,8 +323,9 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
         hidden_sizes = [32, 64]  # Two different weight matrices
 
         # Create old expert indices (with redundancy)
-        redundancy_config = create_redundancy_config(num_logical_experts,
-                                                     total_physical_experts)
+        redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
 
         old_indices = create_expert_indices_with_redundancy(
             num_layers,
@@ -328,7 +336,8 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
 
         # Create new expert indices (with redundancy)
         new_redundancy_config = create_redundancy_config(
-            num_logical_experts, total_physical_experts)
+            num_logical_experts, total_physical_experts
+        )
         new_indices = create_expert_indices_with_redundancy(
             num_layers,
             num_logical_experts,
@@ -337,9 +346,9 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
         )
 
         # Create expert weights
-        expert_weights = create_expert_weights(num_layers, num_local_experts,
-                                               hidden_sizes, ep_rank, device,
-                                               old_indices)
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )
 
         # Execute weight rearrangement
         rearrange_expert_weights_inplace(
@@ -383,8 +392,8 @@ def test_rearrange_expert_weights_no_change(world_size):
     @worker_fn_wrapper
     def worker_fn():
         ensure_model_parallel_initialized(
-            tensor_model_parallel_size=world_size,
-            pipeline_model_parallel_size=1)
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
         ep_group = get_tp_group().cpu_group
         ep_rank = torch.distributed.get_rank()
@@ -401,12 +410,12 @@ def test_rearrange_expert_weights_no_change(world_size):
 
         # Same indices - no change
         indices = create_expert_indices_with_redundancy(
-            num_layers, num_logical_experts, total_physical_experts,
-            redundancy_config)
+            num_layers, num_logical_experts, total_physical_experts, redundancy_config
+        )
 
-        expert_weights = create_expert_weights(num_layers, num_local_experts,
-                                               hidden_sizes, ep_rank, device,
-                                               indices)
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
+        )
 
         # Save original weights
         original_weights = []
@@ -422,7 +431,8 @@ def test_rearrange_expert_weights_no_change(world_size):
             indices,  # Same indices
             expert_weights,
             ep_group,
-            is_profile=False)
+            is_profile=False,
+        )
 
         # Verify that the weights have not changed
         for layer in range(num_layers):
@@ -430,8 +440,8 @@ def test_rearrange_expert_weights_no_change(world_size):
                 torch.testing.assert_close(
                     expert_weights[layer][weight_idx],
                     original_weights[layer][weight_idx],
-                    msg=f"Layer {layer}, weight {weight_idx} should remain "
-                    f"unchanged")
+                    msg=f"Layer {layer}, weight {weight_idx} should remain unchanged",
+                )
 
     distributed_run(worker_fn, world_size)
 
@@ -446,8 +456,8 @@ def test_rearrange_expert_weights_profile_mode(world_size):
     @worker_fn_wrapper
     def worker_fn():
         ensure_model_parallel_initialized(
-            tensor_model_parallel_size=world_size,
-            pipeline_model_parallel_size=1)
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
         ep_group = get_tp_group().cpu_group
         ep_rank = torch.distributed.get_rank()
@@ -460,21 +470,23 @@ def test_rearrange_expert_weights_profile_mode(world_size):
         hidden_sizes = [32]
 
         # Create different index distributions
-        old_redundancy = create_redundancy_config(num_logical_experts,
-                                                  total_physical_experts)
-        new_redundancy = create_redundancy_config(num_logical_experts,
-                                                  total_physical_experts)
+        old_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
 
         old_indices = create_expert_indices_with_redundancy(
-            num_layers, num_logical_experts, total_physical_experts,
-            old_redundancy)
+            num_layers, num_logical_experts, total_physical_experts, old_redundancy
+        )
         new_indices = create_expert_indices_with_redundancy(
-            num_layers, num_logical_experts, total_physical_experts,
-            new_redundancy)
+            num_layers, num_logical_experts, total_physical_experts, new_redundancy
+        )
 
-        expert_weights = create_expert_weights(num_layers, num_local_experts,
-                                               hidden_sizes, ep_rank, device,
-                                               old_indices)
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )
 
         # Save original weights
         original_weights = []
@@ -490,7 +502,7 @@ def test_rearrange_expert_weights_profile_mode(world_size):
             new_indices,
             expert_weights,
             ep_group,
-            is_profile=True  # Profile mode
+            is_profile=True,  # Profile mode
         )
 
         # In profile mode, the weights should remain unchanged
@@ -499,6 +511,7 @@ def test_rearrange_expert_weights_profile_mode(world_size):
                 torch.testing.assert_close(
                     expert_weights[layer][weight_idx],
                     original_weights[layer][weight_idx],
-                    msg="In profile mode, the weights should remain unchanged")
+                    msg="In profile mode, the weights should remain unchanged",
+                )
 
     distributed_run(worker_fn, world_size)
diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..11e23f128f331ed5c2df8f0a262bbdc4ec030b6d
--- /dev/null
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import lm_eval
+import pytest
+
+from tests.utils import large_gpu_mark
+
+
+def get_model_args(
+    model_name: str,
+    spec_model_name: str,
+    spec_method: str,
+    tp_size: int,
+    model_max_len: int,
+) -> dict:
+    speculative_config = {
+        "method": spec_method,
+        "model": spec_model_name,
+        "num_speculative_tokens": 1,
+        "max_model_len": model_max_len,
+    }
+
+    model_args = {
+        "pretrained": model_name,
+        "dtype": "auto",
+        "add_bos_token": True,
+        "tensor_parallel_size": tp_size,
+        "gpu_memory_utilization": 0.7,
+        "speculative_config": speculative_config,
+        "enable_expert_parallel": True,
+        "num_redundant_experts": tp_size,
+        "eplb_window_size": 128,
+        "eplb_step_interval": 1024,
+        "eplb_log_balancedness": False,
+        "enable_eplb": True,
+        "max_model_len": model_max_len,
+    }
+    return model_args
+
+
+@pytest.mark.parametrize(
+    "model_setup",
+    [
+        pytest.param(
+            ("mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86),
+            marks=large_gpu_mark(min_gb=80),
+        ),
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
+                4,
+                0.92,
+            ),
+            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"),
+        ),
+    ],
+    ids=["qwen3_next_mtp", "llama4_eagle"],
+)
+def test_eplb_spec_decode(
+    monkeypatch: pytest.MonkeyPatch,
+    model_setup: tuple[str, str, str, int, float],
+):
+    """
+    Test the correctness of EPLB speculative decoding with GSM8K dataset.
+    Applicable to MoE models with mtp or eagle spec decode.
+    """
+    method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup
+
+    TASK = "gsm8k"
+    FILTER = "exact_match,strict-match"
+    RTOL = 0.03
+
+    model_args = get_model_args(
+        model_name=model_name,
+        spec_model_name=spec_model_name,
+        spec_method=method,
+        tp_size=tp_size,
+        model_max_len=4096,
+    )
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=TASK,
+        batch_size=64,
+        num_fewshot=8,
+    )
+    measured_value = results["results"][TASK][FILTER]
+    assert (
+        measured_value - RTOL < expected_gsm8k_value
+        and measured_value + RTOL > expected_gsm8k_value
+    ), f"Expected: {expected_gsm8k_value} |  Measured: {measured_value}"
diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py
index 8be9ee0a1889dab1f60a90245bc285b6f83d5044..f17b7997c5888263e99df7d908476fa5d2ac62fc 100644
--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -6,24 +6,29 @@ import time
 import msgspec
 import pytest
 
-from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory,
-                                        NullEventPublisher)
+from vllm.distributed.kv_events import (
+    EventBatch,
+    EventPublisherFactory,
+    NullEventPublisher,
+)
 
 DP_RANK = 0
 
 
 class EventSample(
-        msgspec.Struct,
-        tag=True,  # type: ignore
-        array_like=True  # type: ignore
+    msgspec.Struct,
+    tag=True,  # type: ignore
+    array_like=True,  # type: ignore
 ):
     """Test event for publisher testing"""
+
     id: int
     value: str
 
 
 class SampleBatch(EventBatch):
     """Test event batch for publisher testing"""
+
     events: list[EventSample]
 
 
@@ -44,10 +49,8 @@ def test_basic_publishing(publisher, subscriber):
 
     seq, received = result
     assert seq == 0, "Sequence number mismatch"
-    assert received.ts == pytest.approx(test_batch.ts,
-                                        abs=0.1), ("Timestamp mismatch")
-    assert len(received.events) == len(
-        test_batch.events), ("Number of events mismatch")
+    assert received.ts == pytest.approx(test_batch.ts, abs=0.1), "Timestamp mismatch"
+    assert len(received.events) == len(test_batch.events), "Number of events mismatch"
 
     for i, event in enumerate(received.events):
         assert event.id == i, "Event id mismatch"
@@ -88,9 +91,9 @@ def test_replay_mechanism(publisher, subscriber):
     assert len(replayed) > 0, "No replayed messages received"
     seqs = [seq for seq, _ in replayed]
     assert all(seq >= 10 for seq in seqs), "Replayed messages not in order"
-    assert seqs == list(range(min(seqs),
-                              max(seqs) +
-                              1)), ("Replayed messages not consecutive")
+    assert seqs == list(range(min(seqs), max(seqs) + 1)), (
+        "Replayed messages not consecutive"
+    )
 
 
 def test_buffer_limit(publisher, subscriber, publisher_config):
@@ -126,6 +129,7 @@ def test_topic_filtering(publisher_config):
     pub = EventPublisherFactory.create(publisher_config, DP_RANK)
 
     from .conftest import MockSubscriber
+
     sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
     sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")
 
@@ -137,11 +141,13 @@ def test_topic_filtering(publisher_config):
 
         foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)]
         assert all(msg is not None for msg in foo_received), (
-            "Subscriber with matching topic should receive messages")
+            "Subscriber with matching topic should receive messages"
+        )
 
         bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)]
         assert all(msg is None for msg in bar_received), (
-            "Subscriber with non-matching topic should receive no messages")
+            "Subscriber with non-matching topic should receive no messages"
+        )
     finally:
         pub.shutdown()
         sub_foo.close()
@@ -178,8 +184,7 @@ def test_high_volume(publisher, subscriber):
 
     publisher_thread.join()
 
-    assert len(received) >= num_batches * 0.9, (
-        "We should have received most messages")
+    assert len(received) >= num_batches * 0.9, "We should have received most messages"
 
     seqs = [seq for seq, _ in received]
     assert sorted(seqs) == seqs, "Sequence numbers should be in order"
@@ -209,13 +214,15 @@ def test_data_parallel_rank_tagging(publisher_config):
         # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
         expected_endpoint_0 = base_endpoint  # rank 0 gets port + 0 = same port
         expected_endpoint_1 = base_endpoint.replace(
-            ":5557", ":5558")  # rank 1 gets port + 1
+            ":5557", ":5558"
+        )  # rank 1 gets port + 1
     else:
         # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
         expected_endpoint_0 = base_endpoint  # rank 0 gets base
         expected_endpoint_1 = base_endpoint + "_dp1"  # rank 1 gets _dp1
 
     from .conftest import MockSubscriber
+
     sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
     sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)
 
@@ -241,18 +248,67 @@ def test_data_parallel_rank_tagging(publisher_config):
 
         # Verify DP rank tagging
         assert received_0.data_parallel_rank == 0, (
-            f"Expected DP rank 0, got {received_0.data_parallel_rank}")
+            f"Expected DP rank 0, got {received_0.data_parallel_rank}"
+        )
         assert received_1.data_parallel_rank == 1, (
-            f"Expected DP rank 1, got {received_1.data_parallel_rank}")
+            f"Expected DP rank 1, got {received_1.data_parallel_rank}"
+        )
 
         # Verify event content is correct
-        assert len(
-            received_0.events) == 2, "Wrong number of events from rank 0"
-        assert len(
-            received_1.events) == 3, "Wrong number of events from rank 1"
+        assert len(received_0.events) == 2, "Wrong number of events from rank 0"
+        assert len(received_1.events) == 3, "Wrong number of events from rank 1"
 
     finally:
         pub_0.shutdown()
         pub_1.shutdown()
         sub_0.close()
         sub_1.close()
+
+
+def test_event_publisher_factory():
+    """Test event publisher factory creation behavior under different configurations"""
+    from vllm.config.kv_events import KVEventsConfig
+    from vllm.distributed.kv_events import ZmqEventPublisher
+
+    # test config is None
+    publisher = EventPublisherFactory.create(None, DP_RANK)
+    assert isinstance(publisher, NullEventPublisher)
+    publisher.shutdown()
+
+    # test disable kv cache events
+    config = KVEventsConfig(
+        enable_kv_cache_events=False,
+        publisher="zmq",  # Even if zmq is specified, should return NullEventPublisher
+        endpoint="tcp://localhost:5557",
+    )
+    publisher = EventPublisherFactory.create(config, DP_RANK)
+    assert isinstance(publisher, NullEventPublisher)
+    publisher.shutdown()
+
+    # test zmq publisher
+    config = KVEventsConfig(
+        enable_kv_cache_events=True,
+        publisher="zmq",
+        endpoint="inproc://test-factory-true",
+    )
+    publisher = EventPublisherFactory.create(config, DP_RANK)
+    assert isinstance(publisher, ZmqEventPublisher)
+    publisher.shutdown()
+
+    # test unknown publisher
+    with pytest.raises(ValueError, match="Input should be"):
+        KVEventsConfig(
+            enable_kv_cache_events=True,
+            publisher="unknown_publisher",
+            endpoint="tcp://localhost:5557",
+        )
+
+    # test publisher not specified
+    config = KVEventsConfig(
+        enable_kv_cache_events=True,
+        # publisher not specified, should default to "zmq"
+        endpoint="tcp://localhost:5557",
+    )
+    publisher = EventPublisherFactory.create(config, DP_RANK)
+    assert isinstance(publisher, ZmqEventPublisher)
+    publisher.shutdown()
diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py
index f273f302e72e8789b9e8d9c8399764ec5c535731..0228d42a76a0f38071ec3071130cb5d1a02c13d3 100644
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -2,11 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
-from vllm.config import RunnerOption
+from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
 
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):
 
 class EPTestOptions(NamedTuple):
     trust_remote_code: bool
-    tokenizer_mode: Optional[str]
-    load_format: Optional[str] = None
-    hf_overrides: Optional[str] = None
+    tokenizer_mode: str | None
+    load_format: str | None = None
+    hf_overrides: str | None = None
 
 
 @dataclass
@@ -40,34 +40,30 @@ class EPTestSettings:
         tp_base: int = 2,
         runner: RunnerOption = "auto",
         trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
-        load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
+        tokenizer_mode: str | None = None,
+        load_format: str | None = None,
+        hf_overrides: str | None = None,
     ):
         return EPTestSettings(
             parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=False,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=False,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=True,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=2 * tp_base,
-                              eager_mode=False,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=2 * tp_base,
-                              eager_mode=True,
-                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
+                ParallelSetup(
+                    tp_size=2 * tp_base, eager_mode=False, chunked_prefill=True
+                ),
+                ParallelSetup(
+                    tp_size=2 * tp_base, eager_mode=True, chunked_prefill=False
+                ),
             ],
             distributed_backends=["mp", "ray"],
             runner=runner,
-            test_options=EPTestOptions(trust_remote_code=trust_remote_code,
-                                       tokenizer_mode=tokenizer_mode,
-                                       load_format=load_format,
-                                       hf_overrides=hf_overrides),
+            test_options=EPTestOptions(
+                trust_remote_code=trust_remote_code,
+                tokenizer_mode=tokenizer_mode,
+                load_format=load_format,
+                hf_overrides=hf_overrides,
+            ),
         )
 
     @staticmethod
@@ -76,22 +72,22 @@ class EPTestSettings:
         tp_base: int = 2,
         runner: RunnerOption = "auto",
         trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
-        load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
+        tokenizer_mode: str | None = None,
+        load_format: str | None = None,
+        hf_overrides: str | None = None,
     ):
         return EPTestSettings(
             parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=True,
-                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
             ],
             distributed_backends=["mp"],
             runner=runner,
-            test_options=EPTestOptions(trust_remote_code=trust_remote_code,
-                                       tokenizer_mode=tokenizer_mode,
-                                       load_format=load_format,
-                                       hf_overrides=hf_overrides),
+            test_options=EPTestOptions(
+                trust_remote_code=trust_remote_code,
+                tokenizer_mode=tokenizer_mode,
+                load_format=load_format,
+                hf_overrides=hf_overrides,
+            ),
         )
 
     def iter_params(self, model_name: str):
@@ -99,17 +95,20 @@ class EPTestSettings:
 
         for parallel_setup in self.parallel_setups:
             for distributed_backend in self.distributed_backends:
-                yield (model_name, parallel_setup, distributed_backend,
-                       self.runner, opts)
+                yield (
+                    model_name,
+                    parallel_setup,
+                    distributed_backend,
+                    self.runner,
+                    opts,
+                )
 
 
 # NOTE: You can adjust tp_base locally to fit the model in GPU
 # The values displayed here are only a rough indicator of the size of the model
 
-# yapf: disable
 TEST_MODELS = {
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
-        trust_remote_code=True),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
     "mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
 }
 
@@ -191,22 +190,24 @@ def _compare_tp(
     ]
 
     try:
-        compare_two_settings(model_name,
-                             ep_args,
-                             tp_args,
-                             ep_env,
-                             tp_env,
-                             method=method,
-                             max_wait_seconds=360)
+        compare_two_settings(
+            model_name,
+            ep_args,
+            tp_args,
+            ep_env,
+            tp_env,
+            method=method,
+            max_wait_seconds=360,
+        )
     except Exception:
         raise
 
 
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
     [
-        params for model_name, settings in TEST_MODELS.items()
+        params
+        for model_name, settings in TEST_MODELS.items()
         for params in settings.iter_params(model_name)
     ],
 )
@@ -219,10 +220,12 @@ def test_ep(
     test_options: EPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_name,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate")
+    _compare_tp(
+        model_name,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+    )
diff --git a/tests/distributed/test_expert_placement.py b/tests/distributed/test_expert_placement.py
index a3b1b3193deb069bb0538ebc729ba71cc69d5f65..8b3a64b9c13497339863637d059334db4608b747 100644
--- a/tests/distributed/test_expert_placement.py
+++ b/tests/distributed/test_expert_placement.py
@@ -6,17 +6,13 @@ import pytest
 from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 
 
-def verify_round_robin_pattern(expert_map, ep_rank, ep_size,
-                               global_num_experts):
+def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts):
     """Verify that the expert map follows the round_robin pattern."""
     # Calculate expected local experts (supporting non-divisible cases)
     base_experts = global_num_experts // ep_size
     remainder = global_num_experts % ep_size
 
-    if ep_rank < remainder:
-        local_num_experts = base_experts + 1
-    else:
-        local_num_experts = base_experts
+    local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts
 
     # Expected expert IDs for this rank in round_robin pattern
     # For non-divisible cases, ranks with extra experts start earlier
@@ -30,24 +26,21 @@ def verify_round_robin_pattern(expert_map, ep_rank, ep_size,
         if global_expert_id in expected_expert_ids:
             local_expert_id = expert_map[global_expert_id]
             expected_local_id = expected_expert_ids.index(global_expert_id)
-            assert (
-                local_expert_id == expected_local_id
-            ), f"Global expert {global_expert_id} should map to local expert " \
+            assert local_expert_id == expected_local_id, (
+                f"Global expert {global_expert_id} should map to local expert "
                 f"{expected_local_id}, got {local_expert_id}"
+            )
         else:
-            assert (
-                expert_map[global_expert_id] == -1
-            ), f"Global expert {global_expert_id} should not be mapped to " \
-                f"this rank"
+            assert expert_map[global_expert_id] == -1, (
+                f"Global expert {global_expert_id} should not be mapped to this rank"
+            )
 
     # Verify that all local expert IDs are consecutive starting from 0
-    local_expert_ids = [
-        expert_map[global_id] for global_id in expected_expert_ids
-    ]
+    local_expert_ids = [expert_map[global_id] for global_id in expected_expert_ids]
     expected_local_ids = list(range(local_num_experts))
-    assert (
-        local_expert_ids == expected_local_ids
-    ), f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
+    assert local_expert_ids == expected_local_ids, (
+        f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
+    )
 
 
 @pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@@ -78,8 +71,9 @@ def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
 
     for test_global_experts, test_ep_size in test_cases:
         # Ensure ep_size matches world_size
-        assert (test_ep_size == world_size
-                ), f"ep_size {test_ep_size} must equal world_size {world_size}"
+        assert test_ep_size == world_size, (
+            f"ep_size {test_ep_size} must equal world_size {world_size}"
+        )
 
         # Test each rank
         for ep_rank in range(world_size):
@@ -91,28 +85,29 @@ def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
             else:
                 expected_test_local = base_experts
 
-            test_local_experts, test_expert_map = determine_expert_map(
+            test_local_experts, test_expert_map, _ = determine_expert_map(
                 ep_size=test_ep_size,
                 ep_rank=ep_rank,
                 global_num_experts=test_global_experts,
                 expert_placement_strategy=expert_placement_strategy,
             )
 
-            assert (
-                test_local_experts == expected_test_local
-            ), f"For {test_global_experts} experts on {test_ep_size} ranks, " \
-                f"rank {ep_rank}: expected {expected_test_local} local" \
+            assert test_local_experts == expected_test_local, (
+                f"For {test_global_experts} experts on {test_ep_size} ranks, "
+                f"rank {ep_rank}: expected {expected_test_local} local"
                 f"experts, got {test_local_experts}"
+            )
 
             if test_expert_map is not None:
-                assert test_expert_map.shape == (
-                    test_global_experts,
-                ), f"Expected expert map shape ({test_global_experts},), " \
+                assert test_expert_map.shape == (test_global_experts,), (
+                    f"Expected expert map shape ({test_global_experts},), "
                     f"got {test_expert_map.shape}"
+                )
 
                 # Verify round_robin pattern for this test case
-                verify_round_robin_pattern(test_expert_map, ep_rank,
-                                           test_ep_size, test_global_experts)
+                verify_round_robin_pattern(
+                    test_expert_map, ep_rank, test_ep_size, test_global_experts
+                )
 
 
 @pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@@ -121,7 +116,7 @@ def test_expert_placement_edge_cases(expert_placement_strategy, world_size):
     """Test edge cases for round_robin expert placement."""
 
     # Test case 1: ep_size = 1 (should return None for expert_map)
-    local_num_experts, expert_map = determine_expert_map(
+    local_num_experts, expert_map, _ = determine_expert_map(
         ep_size=1,
         ep_rank=0,
         global_num_experts=8,
@@ -147,48 +142,103 @@ def test_determine_expert_map_comprehensive():
     # expert_placement_strategy, expected_local, expected_map_pattern)
     test_cases = [
         # Round robin placement tests
-        (2, 0, 8, "round_robin", 4, [0, -1, 1, -1, 2, -1, 3,
-                                     -1]),  # rank 0 gets even experts
-        (2, 1, 8, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1,
-                                     3]),  # rank 1 gets odd experts
-        (2, 0, 9, "round_robin", 5, [0, -1, 1, -1, 2, -1, 3, -1, 4
-                                     ]),  # rank 0 gets 5 experts (even + last)
-        (2, 1, 9, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1, 3,
-                                     -1]),  # rank 1 gets 4 experts (odd)
-
+        (
+            2,
+            0,
+            8,
+            "round_robin",
+            4,
+            [0, -1, 1, -1, 2, -1, 3, -1],
+        ),  # rank 0 gets even experts
+        (
+            2,
+            1,
+            8,
+            "round_robin",
+            4,
+            [-1, 0, -1, 1, -1, 2, -1, 3],
+        ),  # rank 1 gets odd experts
+        (
+            2,
+            0,
+            9,
+            "round_robin",
+            5,
+            [0, -1, 1, -1, 2, -1, 3, -1, 4],
+        ),  # rank 0 gets 5 experts (even + last)
+        (
+            2,
+            1,
+            9,
+            "round_robin",
+            4,
+            [-1, 0, -1, 1, -1, 2, -1, 3, -1],
+        ),  # rank 1 gets 4 experts (odd)
         # 4-rank tests
-        (4, 0, 8, "round_robin", 2, [0, -1, -1, -1, 1, -1, -1,
-                                     -1]),  # rank 0 gets experts 0, 4
-        (4, 1, 8, "round_robin", 2, [-1, 0, -1, -1, -1, 1, -1,
-                                     -1]),  # rank 1 gets experts 1, 5
-        (4, 2, 8, "round_robin", 2, [-1, -1, 0, -1, -1, -1, 1,
-                                     -1]),  # rank 2 gets experts 2, 6
-        (4, 3, 8, "round_robin", 2, [-1, -1, -1, 0, -1, -1, -1,
-                                     1]),  # rank 3 gets experts 3, 7
+        (
+            4,
+            0,
+            8,
+            "round_robin",
+            2,
+            [0, -1, -1, -1, 1, -1, -1, -1],
+        ),  # rank 0 gets experts 0, 4
+        (
+            4,
+            1,
+            8,
+            "round_robin",
+            2,
+            [-1, 0, -1, -1, -1, 1, -1, -1],
+        ),  # rank 1 gets experts 1, 5
+        (
+            4,
+            2,
+            8,
+            "round_robin",
+            2,
+            [-1, -1, 0, -1, -1, -1, 1, -1],
+        ),  # rank 2 gets experts 2, 6
+        (
+            4,
+            3,
+            8,
+            "round_robin",
+            2,
+            [-1, -1, -1, 0, -1, -1, -1, 1],
+        ),  # rank 3 gets experts 3, 7
     ]
 
-    for ep_size, ep_rank, global_num_experts, expert_placement_strategy, \
-        expected_local, expected_map_pattern in test_cases:
-        local_num_experts, expert_map = determine_expert_map(
+    for (
+        ep_size,
+        ep_rank,
+        global_num_experts,
+        expert_placement_strategy,
+        expected_local,
+        expected_map_pattern,
+    ) in test_cases:
+        local_num_experts, expert_map, _ = determine_expert_map(
             ep_size=ep_size,
             ep_rank=ep_rank,
             global_num_experts=global_num_experts,
             expert_placement_strategy=expert_placement_strategy,
         )
 
-        assert local_num_experts == expected_local, \
-            f"ep_size={ep_size}, ep_rank={ep_rank}, " \
-            f"global_num_experts={global_num_experts}, " \
-            f"expert_placement_strategy={expert_placement_strategy}: " \
+        assert local_num_experts == expected_local, (
+            f"ep_size={ep_size}, ep_rank={ep_rank}, "
+            f"global_num_experts={global_num_experts}, "
+            f"expert_placement_strategy={expert_placement_strategy}: "
             f"expected {expected_local} local experts, got {local_num_experts}"
+        )
 
         if expected_map_pattern is None:
             assert expert_map is None, "Expected expert_map to be None"
         else:
             assert expert_map is not None, "Expected expert_map to not be None"
             actual_map = expert_map.tolist()
-            assert actual_map == expected_map_pattern, \
-                f"ep_size={ep_size}, ep_rank={ep_rank}, " \
-                f"global_num_experts={global_num_experts}, " \
-                f"expert_placement_strategy={expert_placement_strategy}: " \
+            assert actual_map == expected_map_pattern, (
+                f"ep_size={ep_size}, ep_rank={ep_rank}, "
+                f"global_num_experts={global_num_experts}, "
+                f"expert_placement_strategy={expert_placement_strategy}: "
                 f"expected map {expected_map_pattern}, got {actual_map}"
+            )
diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py
index d447876f6cc7cc26882b9fc9d8983a7063a17103..b190b2820451ba76ddceba9d82de06ebf22f9b59 100644
--- a/tests/distributed/test_kvlayout.py
+++ b/tests/distributed/test_kvlayout.py
@@ -1,10 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.config import (DeviceConfig, KVTransferConfig, ModelConfig,
-                         VllmConfig, set_current_vllm_config)
+from vllm.config import (
+    DeviceConfig,
+    KVTransferConfig,
+    ModelConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.distributed.kv_transfer.kv_connector.utils import (
-    get_kv_connector_cache_layout)
+    get_kv_connector_cache_layout,
+)
 from vllm.logger import init_logger
 
 logger = init_logger("test_expert_parallel")
@@ -23,8 +29,9 @@ def test_get_kv_connector_cache_layout_with_lmcache_connector():
         kv_connector="LMCacheConnectorV1",
         kv_role="kv_both",
     )
-    vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
-                             kv_transfer_config=kv_transfer_config)
+    vllm_config = VllmConfig(
+        device_config=DeviceConfig("cpu"), kv_transfer_config=kv_transfer_config
+    )
     with set_current_vllm_config(vllm_config):
         # Test with default settings
         layout = get_kv_connector_cache_layout()
@@ -37,9 +44,11 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
         kv_role="kv_both",
     )
     model_config = ModelConfig()
-    vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
-                             model_config=model_config,
-                             kv_transfer_config=kv_transfer_config)
+    vllm_config = VllmConfig(
+        device_config=DeviceConfig("cpu"),
+        model_config=model_config,
+        kv_transfer_config=kv_transfer_config,
+    )
     with set_current_vllm_config(vllm_config):
         # Test with default settings
         layout = get_kv_connector_cache_layout()
@@ -47,25 +56,22 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
 
 
 def test_get_kv_connector_cache_layout_with_multi_connector():
-    kv_transfer_config = KVTransferConfig(kv_connector="MultiConnector",
-                                          kv_role="kv_both",
-                                          kv_connector_extra_config={
-                                              "connectors": [{
-                                                  "kv_connector":
-                                                  "SharedStorageConnector",
-                                                  "kv_role":
-                                                  "kv_both"
-                                              }, {
-                                                  "kv_connector":
-                                                  "NixlConnector",
-                                                  "kv_role":
-                                                  "kv_both"
-                                              }]
-                                          })
+    kv_transfer_config = KVTransferConfig(
+        kv_connector="MultiConnector",
+        kv_role="kv_both",
+        kv_connector_extra_config={
+            "connectors": [
+                {"kv_connector": "SharedStorageConnector", "kv_role": "kv_both"},
+                {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
+            ]
+        },
+    )
     model_config = ModelConfig()
-    vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
-                             model_config=model_config,
-                             kv_transfer_config=kv_transfer_config)
+    vllm_config = VllmConfig(
+        device_config=DeviceConfig("cpu"),
+        model_config=model_config,
+        kv_transfer_config=kv_transfer_config,
+    )
     with set_current_vllm_config(vllm_config):
         # Test with default settings
         layout = get_kv_connector_cache_layout()
diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py
index ef17a51fff0e1f8367cf7ad736c4b97ec1557ca9..5d3f524f4d2f38256783b3ecac5e30bed5cb2c89 100644
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
@@ -18,20 +18,19 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
 from vllm import initialize_ray_cluster
 from vllm.config import ParallelConfig
-from vllm.executor.ray_utils import _wait_until_pg_removed
-from vllm.utils import get_ip
+from vllm.utils.network_utils import get_ip
+from vllm.v1.executor.ray_utils import _wait_until_pg_removed
 
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
 
 
-@pytest.mark.skipif(not VLLM_MULTI_NODE,
-                    reason="Need at least 2 nodes to run the test.")
+@pytest.mark.skipif(
+    not VLLM_MULTI_NODE, reason="Need at least 2 nodes to run the test."
+)
 def test_multi_node_assignment() -> None:
-
     # NOTE: important to keep this class definition here
     # to let ray use cloudpickle to serialize it.
     class Actor:
-
         def get_ip(self):
             return get_ip()
 
@@ -41,8 +40,7 @@ def test_multi_node_assignment() -> None:
 
         current_ip = get_ip()
         workers = []
-        for bundle_id, bundle in enumerate(
-                config.placement_group.bundle_specs):
+        for bundle_id, bundle in enumerate(config.placement_group.bundle_specs):
             if not bundle.get("GPU", 0):
                 continue
             scheduling_strategy = PlacementGroupSchedulingStrategy(
diff --git a/tests/distributed/test_multiproc_executor.py b/tests/distributed/test_multiproc_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e741a79bc4ed9b7f8677df2d4ebca069e088ed22
--- /dev/null
+++ b/tests/distributed/test_multiproc_executor.py
@@ -0,0 +1,437 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Integration tests for MultiprocExecutor at the executor level.
+This test directly tests the executor without going through the LLM interface,
+focusing on executor initialization, RPC calls, and distributed execution.
+"""
+
+import multiprocessing
+import os
+
+from tests.utils import multi_gpu_test
+from vllm.config import VllmConfig
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import get_open_port
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.executor.multiproc_executor import MultiprocExecutor
+
+MODEL = "facebook/opt-125m"
+
+
+def create_vllm_config(
+    tensor_parallel_size: int = 1,
+    pipeline_parallel_size: int = 1,
+    max_model_len: int = 256,
+    gpu_memory_utilization: float = 0.3,
+    distributed_executor_backend: str = "mp",
+    nnodes: int = 1,
+    node_rank: int = 0,
+    master_port: int = 0,
+) -> VllmConfig:
+    """Create a VllmConfig for testing using EngineArgs."""
+    engine_args = EngineArgs(
+        model=MODEL,
+        tensor_parallel_size=tensor_parallel_size,
+        pipeline_parallel_size=pipeline_parallel_size,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+    )
+    vllm_config = engine_args.create_engine_config()
+
+    # Override distributed node settings if needed
+    if nnodes > 1 or node_rank > 0:
+        vllm_config.parallel_config.nnodes = nnodes
+        vllm_config.parallel_config.node_rank = node_rank
+        vllm_config.parallel_config.master_port = master_port
+    if nnodes > 1:
+        vllm_config.parallel_config.disable_custom_all_reduce = True
+
+    return vllm_config
+
+
+def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
+    """Create a minimal SchedulerOutput for testing."""
+    # This is a simplified version - in practice you'd need proper
+    # SchedulerOutput construction based on the actual vLLM v1 API
+    return SchedulerOutput(
+        scheduled_new_reqs=[],
+        scheduled_resumed_reqs=[],
+        scheduled_running_reqs=[],
+        num_scheduled_tokens={},
+        total_num_scheduled_tokens=0,
+    )
+
+
+def test_multiproc_executor_initialization():
+    """Test that MultiprocExecutor can be initialized with proper config."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=1,
+        pipeline_parallel_size=1,
+    )
+
+    # Create executor - this should initialize workers
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    # Verify executor properties
+    assert executor.world_size == 1, "World size should be 1 for single GPU"
+    assert executor.local_world_size == 1, "Local world size should be 1"
+    assert hasattr(executor, "workers"), "Executor should have workers"
+    assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
+
+    # Clean up
+    executor.shutdown()
+
+
+@multi_gpu_test(num_gpus=2)
+def test_multiproc_executor_initialization_tensor_parallel():
+    """Test MultiprocExecutor initialization with tensor parallelism."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=1,
+    )
+
+    # Create executor
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    # Verify executor properties
+    assert executor.world_size == 2, "World size should be 2 for TP=2"
+    assert executor.local_world_size == 2, "Local world size should be 2"
+    assert len(executor.workers) == 2, "Should have 2 workers for TP=2"
+
+    # Verify output rank calculation
+    output_rank = executor._get_output_rank()
+    assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
+
+    # Clean up
+    executor.shutdown()
+
+
+@multi_gpu_test(num_gpus=2)
+def test_multiproc_executor_collective_rpc():
+    """Test collective RPC calls to all workers."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=1,
+    )
+
+    # Create executor
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    try:
+        # Test check_health RPC - should work without errors
+        executor.check_health()
+
+        # Test that RPC works correctly
+        # Note: We're just testing that the RPC mechanism works,
+        # not testing actual model execution here
+        assert not executor.is_failed, "Executor should not be in failed state"
+
+    finally:
+        # Clean up
+        executor.shutdown()
+
+
+def test_multiproc_executor_failure_callback():
+    """Test failure callback registration and invocation."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=1,
+        pipeline_parallel_size=1,
+    )
+
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    try:
+        # Test callback registration
+        callback_invoked = []
+
+        def test_callback():
+            callback_invoked.append(True)
+
+        # Register callback
+        executor.register_failure_callback(test_callback)
+
+        # Callback should not be invoked yet
+        assert len(callback_invoked) == 0, "Callback should not be invoked immediately"
+
+        # Simulate failure
+        executor.is_failed = True
+
+        # Register another callback - should be invoked immediately
+        executor.register_failure_callback(test_callback)
+        assert len(callback_invoked) == 1, (
+            "Callback should be invoked when executor is failed"
+        )
+
+    finally:
+        # Clean up
+        executor.shutdown()
+
+
+@multi_gpu_test(num_gpus=2)
+def test_multiproc_executor_worker_monitor():
+    """Test that worker monitor is set up correctly."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=1,
+    )
+
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    try:
+        # Verify all worker processes are alive
+        for worker in executor.workers:
+            assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive"
+
+        # Verify executor is not in failed state
+        assert not executor.is_failed, "Executor should not be in failed state"
+
+    finally:
+        # Clean up
+        executor.shutdown()
+
+        # After shutdown, workers should be terminated
+        import time
+
+        time.sleep(0.5)  # Give processes time to terminate
+        for worker in executor.workers:
+            assert not worker.proc.is_alive(), (
+                f"Worker rank {worker.rank} should terminate after shutdown"
+            )
+
+
+@multi_gpu_test(num_gpus=2)
+def test_multiproc_executor_get_response_message_queues():
+    """Test message queue retrieval for different ranks."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=1,
+    )
+
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    try:
+        # Get all message queues
+        all_queues = executor.get_response_mqs()
+        assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"
+
+        # Get message queue for specific rank
+        rank0_queue = executor.get_response_mqs(unique_reply_rank=0)
+        assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0"
+
+        rank1_queue = executor.get_response_mqs(unique_reply_rank=1)
+        assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1"
+
+    finally:
+        # Clean up
+        executor.shutdown()
+
+
+def test_multiproc_executor_shutdown_cleanup():
+    """Test that shutdown properly cleans up resources."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=1,
+        pipeline_parallel_size=1,
+    )
+
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    # Verify executor is set up
+    assert hasattr(executor, "workers"), "Executor should have workers"
+    assert len(executor.workers) > 0, "Should have at least one worker"
+
+    # Shutdown
+    executor.shutdown()
+
+    # Verify cleanup
+    import time
+
+    time.sleep(0.5)  # Give processes time to terminate
+
+    for worker in executor.workers:
+        assert not worker.proc.is_alive(), "Worker processes should be terminated"
+
+    # Verify shutdown event is set
+    assert executor.shutdown_event.is_set(), "Shutdown event should be set"
+
+    # Multiple shutdowns should be safe (idempotent)
+    executor.shutdown()
+    executor.shutdown()
+
+
+@multi_gpu_test(num_gpus=4)
+def test_multiproc_executor_pipeline_parallel():
+    """Test MultiprocExecutor with pipeline parallelism."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+    )
+
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    try:
+        # Verify executor properties
+        assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
+        assert len(executor.workers) == 4, "Should have 4 workers"
+
+        # Verify output rank calculation
+        # For TP=2, PP=2: output should be from the last PP stage (ranks 2-3)
+        # Specifically rank 2 (first rank of last PP stage)
+        output_rank = executor._get_output_rank()
+        assert output_rank == 2, "Output rank should be 2 (first rank of last PP stage)"
+
+        # Verify max_concurrent_batches for pipeline parallel
+        assert executor.max_concurrent_batches == 2, (
+            "Max concurrent batches should equal PP size"
+        )
+
+    finally:
+        # Clean up
+        executor.shutdown()
+
+
+def test_multiproc_executor_properties():
+    """Test various executor properties and configurations."""
+    vllm_config = create_vllm_config(
+        tensor_parallel_size=1,
+        pipeline_parallel_size=1,
+    )
+
+    executor = MultiprocExecutor(vllm_config=vllm_config)
+
+    try:
+        # Test supports_pp property
+        assert MultiprocExecutor.supports_pp is True, (
+            "MultiprocExecutor should support pipeline parallelism"
+        )
+
+        # Test world_size calculation
+        assert executor.world_size == (
+            executor.parallel_config.tensor_parallel_size
+            * executor.parallel_config.pipeline_parallel_size
+        ), "World size should equal TP * PP"
+
+        # Test local_world_size calculation
+        assert executor.local_world_size == (
+            executor.parallel_config.world_size // executor.parallel_config.nnodes
+        ), "Local world size should be world_size / nnodes"
+
+    finally:
+        # Clean up
+        executor.shutdown()
+
+
+@multi_gpu_test(num_gpus=4)
+def test_multiproc_executor_multi_node():
+    """
+    Test MultiprocExecutor with multi-node configuration.
+    This simulates 2 nodes with TP=4:
+    - Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
+    - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
+    Total world_size = 4, nnodes = 2
+    """
+    port = get_open_port()
+    # symm_mem does not work for simulating multi instance in single node
+    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+    def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
+        """Run a single node's executor."""
+        executor = None
+        try:
+            # Set CUDA_VISIBLE_DEVICES for this node
+            if node_rank == 0:
+                os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
+
+            # Create config for this node
+            vllm_config = create_vllm_config(
+                tensor_parallel_size=4,  # Total TP across all nodes
+                pipeline_parallel_size=1,
+                nnodes=2,  # 2 nodes
+                node_rank=node_rank,
+                master_port=port,  # same port
+            )
+
+            # Create executor for this node
+            executor = MultiprocExecutor(vllm_config=vllm_config)
+
+            # Verify node-specific properties
+            assert executor.world_size == 4, (
+                f"World size should be 4 on node {node_rank}"
+            )
+            assert executor.local_world_size == 2, (
+                f"Local world size should be 2 on node {node_rank}"
+            )
+            assert len(executor.workers) == 2, (
+                f"Should have 2 local workers on node {node_rank}"
+            )
+
+            # Verify worker ranks are correct for this node
+            expected_ranks = [node_rank * 2, node_rank * 2 + 1]
+            actual_ranks = sorted([w.rank for w in executor.workers])
+            assert actual_ranks == expected_ranks, (
+                f"Node {node_rank} should have workers "
+                f"with ranks {expected_ranks}, got {actual_ranks}"
+            )
+            # Verify all workers are alive
+            for worker in executor.workers:
+                assert worker.proc.is_alive(), (
+                    f"Worker rank {worker.rank} should be alive on node {node_rank}"
+                )
+            # executor.gen
+            # Put success result in queue BEFORE shutdown to avoid hanging
+            result_queue.put({"node": node_rank, "success": True})
+            import time
+
+            time.sleep(2)
+            executor.shutdown()
+        except Exception as e:
+            # Put failure result in queue
+            result_queue.put({"node": node_rank, "success": False, "error": str(e)})
+            raise e
+        finally:
+            if executor is not None:
+                executor.shutdown()
+
+    # Create a queue to collect results from both processes
+    result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue()
+
+    # Start both node processes
+    processes = []
+    for node_rank in range(2):
+        p = multiprocessing.Process(
+            target=run_node,
+            args=(node_rank, result_queue, port),
+            name=f"Node{node_rank}",
+        )
+        p.start()
+        processes.append(p)
+
+    # Wait for both processes to complete
+    all_completed = True
+    for p in processes:
+        p.join(timeout=60)
+        if p.is_alive():
+            p.terminate()
+            p.join(timeout=20)
+            if p.is_alive():
+                p.kill()
+                p.join()
+            all_completed = False
+
+    # Check results from both nodes
+    results: list[dict[str, int | bool]] = []
+    while len(results) < 2:
+        try:
+            result = result_queue.get(timeout=1)
+            results.append(result)
+        except Exception:
+            pass
+    assert all_completed, "Not all processes completed successfully"
+    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
+    assert results[0]["success"], f"Node 0 failed: {results[0]}"
+    assert results[1]["success"], f"Node 1 failed: {results[1]}"
diff --git a/tests/distributed/test_nccl_symm_mem_allreduce.py b/tests/distributed/test_nccl_symm_mem_allreduce.py
index ffc913742620d104cb613ec723dcca27110716f8..eeb74bdf5357869f62090f94123d478605a417a9 100644
--- a/tests/distributed/test_nccl_symm_mem_allreduce.py
+++ b/tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -11,17 +11,19 @@ import torch.multiprocessing as mp
 
 import vllm.envs as envs
 from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.distributed.device_communicators.cuda_communicator import (
-    CudaCommunicator)
-from vllm.distributed.device_communicators.pynccl import (
-    register_nccl_symmetric_ops)
+from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
+from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
 from vllm.distributed.device_communicators.pynccl_allocator import (
-    get_nccl_mem_pool, is_symmetric_memory_enabled)
-from vllm.distributed.parallel_state import (get_tp_group,
-                                             init_distributed_environment,
-                                             initialize_model_parallel)
+    get_nccl_mem_pool,
+    is_symmetric_memory_enabled,
+)
+from vllm.distributed.parallel_state import (
+    get_tp_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 torch.manual_seed(42)
 random.seed(44)
@@ -38,31 +40,32 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
         torch.cuda.set_device(device)
         torch.set_default_device(device)
         torch.set_default_dtype(dtype)
-        update_environment_variables({
-            "RANK": str(local_rank),
-            "LOCAL_RANK": str(local_rank),
-            "WORLD_SIZE": str(world_size),
-            "MASTER_ADDR": "localhost",
-            "MASTER_PORT": "12345",
-        })
+        update_environment_variables(
+            {
+                "RANK": str(local_rank),
+                "LOCAL_RANK": str(local_rank),
+                "WORLD_SIZE": str(world_size),
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "12345",
+            }
+        )
 
         init_distributed_environment()
         initialize_model_parallel(tensor_model_parallel_size=world_size)
 
-        cuda_communicator = typing.cast(CudaCommunicator,
-                                        get_tp_group().device_communicator)
+        cuda_communicator = typing.cast(
+            CudaCommunicator, get_tp_group().device_communicator
+        )
         pynccl_comm = cuda_communicator.pynccl_comm
         if get_nccl_mem_pool() is None:
-            pytest.skip("NCCL allocator compilation failed "
-                        "(probably missing NCCL headers).")
+            pytest.skip(
+                "NCCL allocator compilation failed (probably missing NCCL headers)."
+            )
         if not is_symmetric_memory_enabled():
             pytest.skip("NCCL symmetric memory allreduce is disabled.")
 
         register_nccl_symmetric_ops(pynccl_comm)
-        input = torch.randint(1,
-                              23, (test_size_elements, ),
-                              dtype=dtype,
-                              device=device)
+        input = torch.randint(1, 23, (test_size_elements,), dtype=dtype, device=device)
         input_clone = input.clone()
         output = torch.ops.vllm.all_reduce_symmetric_with_copy(input)
         assert output is not None
@@ -77,8 +80,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
     reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
 )
 @pytest.mark.parametrize("world_size", [2])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
@@ -88,7 +90,5 @@ def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
     monkeypatch.setenv("NCCL_NVLS_ENABLE", "1")
     monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1")
 
-    mp.spawn(nccl_symm_mem_allreduce_worker,
-             args=(world_size, ),
-             nprocs=world_size)
+    mp.spawn(nccl_symm_mem_allreduce_worker, args=(world_size,), nprocs=world_size)
     cleanup_dist_env_and_memory()
diff --git a/tests/distributed/test_node_count.py b/tests/distributed/test_node_count.py
index e3c36ef5ef379791a97d72f3dd2df0f95f26fd35..34e10084095a3bdcd668e7b9368f3aa7426370d6 100644
--- a/tests/distributed/test_node_count.py
+++ b/tests/distributed/test_node_count.py
@@ -7,7 +7,7 @@ import torch.distributed as dist
 
 from vllm.distributed.parallel_state import _node_count
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import get_ip, get_open_port
+from vllm.utils.network_utils import get_ip, get_open_port
 
 if __name__ == "__main__":
     dist.init_process_group(backend="gloo")
@@ -32,12 +32,15 @@ if __name__ == "__main__":
         # Expected node count based on environment variable)
         expected = int(os.environ.get("NUM_NODES", "1"))
 
-        assert test_result == expected, \
-            f"Expected {expected} nodes, got {test_result}"
+        assert test_result == expected, f"Expected {expected} nodes, got {test_result}"
 
         if pg == dist.group.WORLD:
-            print(f"Node count test passed! Got {test_result} nodes "
-                  f"when using torch distributed!")
+            print(
+                f"Node count test passed! Got {test_result} nodes "
+                f"when using torch distributed!"
+            )
         else:
-            print(f"Node count test passed! Got {test_result} nodes "
-                  f"when using StatelessProcessGroup!")
+            print(
+                f"Node count test passed! Got {test_result} nodes "
+                f"when using StatelessProcessGroup!"
+            )
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index aa28ed9ce25e5e87ec43105e06319091a429648b..0ab94d30858fb91f5561f7676b3e773a312e7030 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -7,10 +7,11 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
  all workers in a node other than the head node, which can cause the test
  to fail.
 """
+
 import json
 import os
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
@@ -34,7 +35,7 @@ class ParallelSetup(NamedTuple):
 
 class PPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: Optional[str] = None
+    load_format: str | None = None
 
 
 @dataclass
@@ -51,30 +52,21 @@ class PPTestSettings:
         pp_base: int = 2,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         return PPTestSettings(
             parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              eager_mode=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              eager_mode=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              eager_mode=True),
-                ParallelSetup(tp_size=2 * tp_base,
-                              pp_size=pp_base,
-                              eager_mode=False),
-                ParallelSetup(tp_size=2 * tp_base,
-                              pp_size=pp_base,
-                              eager_mode=True),
+                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
+                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
+                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
+                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
+                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
             ],
             distributed_backends=["mp", "ray"],
             runner=runner,
-            test_options=PPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=PPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
         )
 
     @staticmethod
@@ -84,19 +76,17 @@ class PPTestSettings:
         pp_base: int = 2,
         runner: RunnerOption = "auto",
         multi_node_only: bool = False,
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
-
         return PPTestSettings(
             parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              eager_mode=True),
+                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
             ],
             distributed_backends=["mp"],
             runner=runner,
-            test_options=PPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=PPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
         )
 
     def iter_params(self, model_id: str):
@@ -110,12 +100,11 @@ class PPTestSettings:
 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
 # The values displayed here are only a rough indicator of the size of the model
 
-# yapf: disable
 TEXT_GENERATION_MODELS = {
     # [Decoder-only]
     # Uses Llama
     # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
-    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
     "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
     "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
     "bigscience/bloomz-1b1": PPTestSettings.fast(),
@@ -149,7 +138,7 @@ TEXT_GENERATION_MODELS = {
     # Uses Llama
     # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
     "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
     "mosaicml/mpt-7b": PPTestSettings.fast(),
     "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
     "allenai/OLMo-1B-hf": PPTestSettings.fast(),
@@ -160,13 +149,15 @@ TEXT_GENERATION_MODELS = {
     "adept/persimmon-8b-chat": PPTestSettings.fast(),
     "microsoft/phi-2": PPTestSettings.fast(),
     "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
-    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
+    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
+        multi_node_only=True, load_format="dummy"
+    ),
     "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
     "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
     "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
     "bigcode/starcoder2-3b": PPTestSettings.fast(),
-    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
     # FIXME: Cannot load tokenizer in latest transformers version.
     # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
     # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
@@ -206,7 +197,6 @@ MULTIMODAL_MODELS = {
     "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
     "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
 }
-# yapf: enable
 
 # NOTE: You can update this on your local machine to run specific tests
 TEST_MODELS = [
@@ -254,7 +244,7 @@ def _compare_tp(
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
     hf_config = get_config(model_id, trust_remote_code)
-    skip_tokenizer_init = model_info.skip_tokenizer_init
+    require_embed_inputs = model_info.require_embed_inputs
     max_num_seqs = model_info.max_num_seqs
 
     dtype = "float16"
@@ -281,8 +271,10 @@ def _compare_tp(
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
     if VLLM_MULTI_NODE and distributed_backend == "mp":
-        pytest.skip("Skipping multi-node pipeline parallel test for "
-                    "multiprocessing distributed backend")
+        pytest.skip(
+            "Skipping multi-node pipeline parallel test for "
+            "multiprocessing distributed backend"
+        )
     if multi_node_only and not VLLM_MULTI_NODE:
         pytest.skip("Not in multi-node setting")
 
@@ -307,32 +299,31 @@ def _compare_tp(
         common_args.extend(["--load-format", load_format])
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
-    if skip_tokenizer_init:
-        common_args.append("--skip-tokenizer-init")
+    if require_embed_inputs:
+        common_args.extend(
+            [
+                "--skip-tokenizer-init",
+                "--enable-prompt-embeds",
+                "--enable-mm-embeds",
+            ]
+        )
     if max_num_seqs:
         common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
 
     if distributed_backend == "ray":
-        # For V1, test Ray Compiled Graph for all the tests
+        # Test Ray Compiled Graph for all the tests
         pp_env = {
-            "VLLM_USE_V1": "1",
-            "VLLM_USE_RAY_COMPILED_DAG": "1",
-            "VLLM_USE_RAY_SPMD_WORKER": "1",
             "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
         }
         # Temporary. Currently when zeromq + SPMD is used, it does not properly
         # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
     elif distributed_backend == "mp":
-        pp_env = {
-            "VLLM_USE_V1": "1",
-        }
+        pp_env = None
     else:
         pp_env = None
 
-    tp_env = {
-        "VLLM_USE_V1": "1",
-    }
+    tp_env = None
 
     pp_args = [
         *common_args,
@@ -357,20 +348,16 @@ def _compare_tp(
         "mp",
     ]
 
-    compare_two_settings(model_id,
-                         pp_args,
-                         tp_args,
-                         pp_env,
-                         tp_env,
-                         method=method)
+    compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)
 
 
 @pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
     [
-        params for model_id, settings in TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
+        params
+        for model_id, settings in TEXT_GENERATION_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in TEST_MODELS
     ],
 )
 @create_new_process_for_each_test()
@@ -382,22 +369,25 @@ def test_tp_language_generation(
     test_options: PPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_id,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate",
-                is_multimodal=False)
+    _compare_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=False,
+    )
 
 
 @pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
     [
-        params for model_id, settings in EMBEDDING_MODELS.items()
-        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
+        params
+        for model_id, settings in EMBEDDING_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in TEST_MODELS
     ],
 )
 @create_new_process_for_each_test()
@@ -409,22 +399,25 @@ def test_tp_language_embedding(
     test_options: PPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_id,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="encode",
-                is_multimodal=False)
+    _compare_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="encode",
+        is_multimodal=False,
+    )
 
 
 @pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
     [
-        params for model_id, settings in MULTIMODAL_MODELS.items()
-        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
+        params
+        for model_id, settings in MULTIMODAL_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in TEST_MODELS
     ],
 )
 @create_new_process_for_each_test()
@@ -436,11 +429,13 @@ def test_tp_multimodal_generation(
     test_options: PPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_id,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate",
-                is_multimodal=True)
+    _compare_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=True,
+    )
diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py
index 69ceedd345a89d967c450051da3e8f8cdb0940a5..4df6f43970d70a0cf4d6150e2ef1dffd335aac10 100644
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -9,7 +9,6 @@ from vllm.distributed.utils import get_pp_indices
 
 
 def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
-
     with monkeypatch.context() as m:
 
         def _verify(partition_str, num_layers, pp_size, goldens):
@@ -57,7 +56,8 @@ def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
         (5, 3, 0, (0, 2)),
         (5, 3, 1, (2, 4)),
         (5, 3, 2, (4, 5)),
-    ])
+    ],
+)
 def test_uneven_auto_partition(
     num_hidden_layers: int,
     pp_size: int,
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 5ca65a0e8d2c9371a8dfe634afbd0d500b045d14..2f2b43cb4cc2b4edf0d49a54bb2faf6a716457fd 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -1,23 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
 import pytest
+from typing_extensions import LiteralString
 
 from ..utils import compare_two_settings, create_new_process_for_each_test
 
-if TYPE_CHECKING:
-    from typing_extensions import LiteralString
-
 
-@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
-    (2, "JackFram/llama-160m"),
-])
-@pytest.mark.parametrize("ATTN_BACKEND", [
-    "FLASH_ATTN",
-])
+@pytest.mark.parametrize(
+    "PP_SIZE, MODEL_NAME",
+    [
+        (2, "JackFram/llama-160m"),
+    ],
+)
+@pytest.mark.parametrize(
+    "ATTN_BACKEND",
+    [
+        "FLASH_ATTN",
+    ],
+)
 @create_new_process_for_each_test()
 def test_pp_cudagraph(
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index abfad9ebfe7d04400e58eda77ba265a7eb62156b..c3085beeb3564f103228bf41224938db89d501c4 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -9,14 +9,16 @@ import pytest
 import torch
 import torch.distributed
 
-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             get_world_group, graph_capture,
-                                             init_distributed_environment)
-from vllm.utils import update_environment_variables
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized,
+    get_world_group,
+    graph_capture,
+    init_distributed_environment,
+)
+from vllm.utils.system_utils import update_environment_variables
 
 
 def distributed_run(fn, world_size):
@@ -24,13 +26,13 @@ def distributed_run(fn, world_size):
     processes: list[multiprocessing.Process] = []
     for i in range(number_of_processes):
         env: dict[str, str] = {}
-        env['RANK'] = str(i)
-        env['LOCAL_RANK'] = str(i)
-        env['WORLD_SIZE'] = str(number_of_processes)
-        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
-        env['MASTER_ADDR'] = 'localhost'
-        env['MASTER_PORT'] = '12345'
-        p = multiprocessing.Process(target=fn, args=(env, ))
+        env["RANK"] = str(i)
+        env["LOCAL_RANK"] = str(i)
+        env["WORLD_SIZE"] = str(number_of_processes)
+        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
+        env["MASTER_ADDR"] = "localhost"
+        env["MASTER_PORT"] = "12345"
+        p = multiprocessing.Process(target=fn, args=(env,))
         processes.append(p)
         p.start()
 
@@ -47,7 +49,7 @@ def worker_fn_wrapper(fn):
     # and update the environment variables in the function
     def wrapped_fn(env):
         update_environment_variables(env)
-        local_rank = os.environ['LOCAL_RANK']
+        local_rank = os.environ["LOCAL_RANK"]
         device = torch.device(f"cuda:{local_rank}")
         torch.cuda.set_device(device)
         init_distributed_environment()
@@ -58,17 +60,18 @@ def worker_fn_wrapper(fn):
 
 @worker_fn_wrapper
 def worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
-    tensor = torch.ones(16, 1024, 1024,
-                        dtype=torch.float32).cuda(pynccl_comm.rank)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
     tensor = pynccl_comm.all_reduce(tensor)
     torch.cuda.synchronize()
     assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl():
     distributed_run(worker_fn, 2)
 
@@ -78,7 +81,7 @@ def multiple_allreduce_worker_fn():
     device = torch.device(f"cuda:{torch.distributed.get_rank()}")
     groups = [
         torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
-        torch.distributed.new_group(ranks=[2, 3], backend="gloo")
+        torch.distributed.new_group(ranks=[2, 3], backend="gloo"),
     ]
     group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
     pynccl_comm = PyNcclCommunicator(group=group, device=device)
@@ -95,8 +98,9 @@ def multiple_allreduce_worker_fn():
         assert torch.all(tensor == 2).cpu().item()
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_multiple_allreduce():
     # this tests pynccl for multiple tp groups, in a standalone way
     # i.e. call `pynccl_comm.all_reduce` directly
@@ -121,8 +125,9 @@ def multiple_allreduce_with_vllm_worker_fn():
             assert torch.all(tensor == 2).cpu().item()
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_multiple_allreduce_with_vllm():
     # this tests pynccl for multiple tp groups, together with vllm
     # i.e. call `tensor_model_parallel_all_reduce`
@@ -133,10 +138,11 @@ def test_pynccl_multiple_allreduce_with_vllm():
 def worker_fn_with_cudagraph():
     with torch.no_grad():
         graph = torch.cuda.CUDAGraph()
-        pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                         device=get_world_group().device)
+        pynccl_comm = PyNcclCommunicator(
+            get_world_group().cpu_group, device=get_world_group().device
+        )
         # run something in the default stream to initialize torch engine
-        a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
+        a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
         torch.cuda.synchronize()
         with torch.cuda.graph(graph):
             a_out = pynccl_comm.all_reduce(a)
@@ -148,84 +154,90 @@ def worker_fn_with_cudagraph():
 
 @worker_fn_wrapper
 def all_gather_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
 
     rank = pynccl_comm.rank
     world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"
 
     num_elems = 1000
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * num_elems
-    result = torch.zeros(num_elems * world_size,
-                         dtype=torch.float32,
-                         device=device)
-
-    expected = torch.cat([
-        torch.arange(num_elems, dtype=torch.float32) + r * num_elems
-        for r in range(world_size)
-    ]).to(device)
+    tensor = (
+        torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
+    )
+    result = torch.zeros(num_elems * world_size, dtype=torch.float32, device=device)
+
+    expected = torch.cat(
+        [
+            torch.arange(num_elems, dtype=torch.float32) + r * num_elems
+            for r in range(world_size)
+        ]
+    ).to(device)
 
     pynccl_comm.all_gather(result, tensor)
     torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_all_gather():
     distributed_run(all_gather_worker_fn, 2)
 
 
 @worker_fn_wrapper
 def all_gatherv_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
 
     rank = pynccl_comm.rank
     world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"
 
     assert world_size <= 8
     sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
     num_elems = sizes[rank]
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * 100
+    tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
     result = torch.zeros(sum(sizes), dtype=torch.float32, device=device)
 
-    expected = torch.cat([
-        torch.arange(sizes[r], dtype=torch.float32) + r * 100
-        for r in range(world_size)
-    ]).to(device)
+    expected = torch.cat(
+        [
+            torch.arange(sizes[r], dtype=torch.float32) + r * 100
+            for r in range(world_size)
+        ]
+    ).to(device)
 
     pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
     torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_all_gatherv():
     distributed_run(all_gatherv_worker_fn, 2)
 
 
 @worker_fn_wrapper
 def reduce_scatter_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
 
     rank = pynccl_comm.rank
     world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"
 
     num_elems = 1000
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * num_elems
-    assert (num_elems % world_size == 0)
-    result = torch.zeros(num_elems // world_size,
-                         dtype=torch.float32,
-                         device=device)
+    tensor = (
+        torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
+    )
+    assert num_elems % world_size == 0
+    result = torch.zeros(num_elems // world_size, dtype=torch.float32, device=device)
 
     # Calculate expected result for this rank's chunk
     scattered_size = num_elems // world_size
@@ -233,34 +245,37 @@ def reduce_scatter_worker_fn():
         torch.arange(num_elems, dtype=torch.float32) + r * num_elems
         for r in range(world_size)
     ]
-    expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
-                   for tensor in all_tensors).to(device)
+    expected = sum(
+        tensor[rank * scattered_size : (rank + 1) * scattered_size]
+        for tensor in all_tensors
+    ).to(device)
 
     pynccl_comm.reduce_scatter(result, tensor)
     torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_reduce_scatter():
     distributed_run(reduce_scatter_worker_fn, 2)
 
 
 @worker_fn_wrapper
 def reduce_scatterv_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
 
     rank = pynccl_comm.rank
     world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"
 
     assert world_size <= 8
     sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
     num_elems = sum(sizes)
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * 100
+    tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
     result = torch.zeros(sizes[rank], dtype=torch.float32, device=device)
 
     # Calculate expected result for this rank's chunk
@@ -278,41 +293,41 @@ def reduce_scatterv_worker_fn():
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_reduce_scatterv():
     distributed_run(reduce_scatterv_worker_fn, 2)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_with_cudagraph():
     distributed_run(worker_fn_with_cudagraph, 2)
 
 
 @worker_fn_wrapper
 def send_recv_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
     if pynccl_comm.rank == 0:
-        tensor = torch.ones(16, 1024, 1024,
-                            dtype=torch.float32).cuda(pynccl_comm.rank)
+        tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
     else:
-        tensor = torch.empty(16, 1024, 1024,
-                             dtype=torch.float32).cuda(pynccl_comm.rank)
+        tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
 
     if pynccl_comm.rank == 0:
-        pynccl_comm.send(tensor,
-                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+        pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
     else:
-        pynccl_comm.recv(tensor,
-                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
+        pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
     torch.cuda.synchronize()
     assert torch.all(tensor == 1).cpu().item()
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_send_recv():
     distributed_run(send_recv_worker_fn, 2)
 
@@ -322,27 +337,20 @@ def multiple_send_recv_worker_fn():
     device = torch.device(f"cuda:{torch.distributed.get_rank()}")
     groups = [
         torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
-        torch.distributed.new_group(ranks=[1, 3], backend="gloo")
+        torch.distributed.new_group(ranks=[1, 3], backend="gloo"),
     ]
     group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
     pynccl_comm = PyNcclCommunicator(group=group, device=device)
     if torch.distributed.get_rank() == 0:
         tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
     elif torch.distributed.get_rank() == 1:
-        tensor = 2 * torch.ones(
-            16, 1024, 1024, dtype=torch.float32, device=device)
+        tensor = 2 * torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
     else:
-        tensor = torch.empty(16,
-                             1024,
-                             1024,
-                             dtype=torch.float32,
-                             device=device)
+        tensor = torch.empty(16, 1024, 1024, dtype=torch.float32, device=device)
     if torch.distributed.get_rank() in [0, 1]:
-        pynccl_comm.send(tensor,
-                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+        pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
     else:
-        pynccl_comm.recv(tensor,
-                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
+        pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
     torch.cuda.synchronize()
     if torch.distributed.get_rank() in [0, 2]:
         assert torch.all(tensor == 1).cpu().item()
@@ -350,14 +358,16 @@ def multiple_send_recv_worker_fn():
         assert torch.all(tensor == 2).cpu().item()
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_multiple_send_recv():
     distributed_run(multiple_send_recv_worker_fn, 4)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_broadcast():
     distributed_run(broadcast_worker_fn, 4)
 
@@ -366,19 +376,17 @@ def test_pynccl_broadcast():
 def broadcast_worker_fn():
     # Test broadcast for every root rank.
     # Essentially this is an all-gather operation.
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
     recv_tensors = [
-        torch.empty(16,
-                    1024,
-                    1024,
-                    dtype=torch.float32,
-                    device=pynccl_comm.device)
+        torch.empty(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
         for i in range(pynccl_comm.world_size)
     ]
-    recv_tensors[pynccl_comm.rank] = torch.ones(
-        16, 1024, 1024, dtype=torch.float32,
-        device=pynccl_comm.device) * pynccl_comm.rank
+    recv_tensors[pynccl_comm.rank] = (
+        torch.ones(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
+        * pynccl_comm.rank
+    )
 
     for i in range(pynccl_comm.world_size):
         pynccl_comm.broadcast(recv_tensors[i], src=i)
diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py
index 6245ccbeca877b0b174183d0686830f887f111c1..53d906bbc7bd85324226b1c7991594d2d8e2b647 100644
--- a/tests/distributed/test_quick_all_reduce.py
+++ b/tests/distributed/test_quick_all_reduce.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import multiprocessing
 import random
 
 import pytest
@@ -8,20 +9,21 @@ import ray
 import torch
 import torch.distributed as dist
 
-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm import _custom_ops as ops
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.parallel_state import get_tp_group, graph_capture
 from vllm.platforms import current_platform
 
-from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment, multi_process_parallel)
+from ..utils import (
+    ensure_model_parallel_initialized,
+    init_test_distributed_environment,
+    multi_process_parallel,
+)
 
 torch.manual_seed(42)
 random.seed(44)
 # Size over 8MB is sufficient for custom quick allreduce.
-test_sizes = [
-    random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)
-]
+test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)]
 for i, v in enumerate(test_sizes):
     test_sizes[i] -= v % 8
 
@@ -38,8 +40,7 @@ def graph_quickreduce(
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
         ensure_model_parallel_initialized(tp_size, pp_size)
         group = get_tp_group().device_group
 
@@ -64,18 +65,15 @@ def graph_quickreduce(
         for sz in test_sizes:
             for dtype in [torch.float16, torch.bfloat16]:
                 with graph_capture(device=device) as graph_capture_context:
-                    inp1 = torch.randint(1,
-                                         23, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
-                    inp2 = torch.randint(-23,
-                                         1, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
+                    inp1 = torch.randint(
+                        1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    inp2 = torch.randint(
+                        -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
                     torch.cuda.synchronize()
                     graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph,
-                                          stream=graph_capture_context.stream):
+                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                         for _ in range(num_communication):
                             out1 = tensor_model_parallel_all_reduce(inp1)
                             dist.all_reduce(inp1, group=group)
@@ -99,39 +97,127 @@ def eager_quickreduce(
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
 
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
 
         # Size over 8MB is sufficient for custom quick allreduce.
         sz = 16 * 1024 * 1024
         fa = get_tp_group().device_communicator.qr_comm
-        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
-                           dtype=torch.float16,
-                           device=device)
+        inp = torch.tensor(
+            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device
+        )
         out = fa.quick_all_reduce(inp)
         torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
 
-        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
-                           dtype=torch.bfloat16,
-                           device=device)
+        inp = torch.tensor(
+            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device
+        )
         out = fa.quick_all_reduce(inp)
         torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
 
 
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test quick allreduce for rocm")
+@pytest.mark.skipif(
+    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
+)
 @pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
-def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
-                                pipeline_parallel_size, test_target,
-                                quant_mode):
+def test_custom_quick_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+    quant_mode,
+):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
 
     monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
 
-    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
-                           test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
+
+
+def qr_variable_input(rank, world_size):
+    """
+    When the tensor parallelism is set to 4 or 8, frequent changes
+    in the input shape can cause QuickReduce to hang (this issue
+    has been observed with the gpt_oss model).
+    """
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    qr_max_size = None  # MB
+    _ptr = ops.init_custom_qr(rank, world_size, qr_max_size)
+    ranks = []
+    for i in range(world_size):
+        ranks.append(i)
+    dist.init_process_group(
+        backend="nccl",
+        init_method="tcp://127.0.0.1:29500",
+        rank=rank,
+        world_size=world_size,
+    )
+    cpu_group = torch.distributed.new_group(ranks, backend="nccl")
+
+    handle = ops.qr_get_handle(_ptr)
+    world_size = dist.get_world_size(group=cpu_group)
+    handles = [None] * world_size
+    dist.all_gather_object(handles, handle, group=cpu_group)
+    ops.qr_open_handles(_ptr, handles)
+
+    num = 1
+    s1 = 1024
+    while num < 50000:  # 50000 is sufficient to identify issues.
+        dtype = torch.float16
+        if num % 2 == 0:
+            s2 = 1024
+            inp1 = torch.zeros(
+                (s1, s2), dtype=dtype, device=torch.cuda.current_device()
+            )
+        else:
+            s2 = 2048
+            inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device())
+        result = torch.empty_like(inp1)
+        # FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4
+        ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True)
+        try:
+            if inp1[0, 0] == 0:
+                assert torch.all(result == 0)
+            else:
+                assert torch.all(result == world_size)
+        except AssertionError:
+            print("Assertion failed! Allreduce results are incorrect.")
+            raise
+        num += 1
+
+
+@pytest.mark.skipif(
+    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
+)
+@pytest.mark.parametrize("tp_size", [4, 8])
+@pytest.mark.parametrize("pipeline_parallel_size", [1])
+def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size):
+    world_size = tp_size * pipeline_parallel_size
+    if world_size > torch.cuda.device_count():
+        pytest.skip("Not enough GPUs to run the test.")
+
+    multiprocessing.set_start_method("spawn", force=True)
+    # 60s is enough
+    timeout = 60
+    processes = []
+    for rank in range(tp_size):
+        p = multiprocessing.Process(target=qr_variable_input, args=(rank, tp_size))
+        p.start()
+        processes.append((rank, p))
+    for rank, p in processes:
+        p.join(timeout=timeout)
+        if p.is_alive():
+            for r, proc in processes:
+                if proc.is_alive():
+                    proc.terminate()
+                    proc.join()
+            raise RuntimeError(f"QuickReduce hang detected after {timeout} seconds!")
+
+
+if __name__ == "__main__":
+    test_custom_quick_allreduce_variable_input(tp_size=4, pipeline_parallel_size=1)
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 94ad8f4f1213a9ac1ca2a4e1a01f7b2324b0d36b..4444327f01daa11bf957763762060e8bcf03157b 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -3,11 +3,24 @@
 
 import os
 
+import torch
 import torch.distributed as dist
 
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import get_ip, get_open_port
+from vllm.utils.network_utils import get_ip, get_open_port
+
+
+def _run_test(pg):
+    test_result = all(in_the_same_node_as(pg, source_rank=0))
+
+    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
+    if pg == dist.group.WORLD:
+        print("Same node test passed! when using torch distributed!")
+    else:
+        print("Same node test passed! when using StatelessProcessGroup!")
+
 
 if __name__ == "__main__":
     dist.init_process_group(backend="gloo")
@@ -22,16 +35,15 @@ if __name__ == "__main__":
         dist.broadcast_object_list(recv, src=0)
         ip, port = recv
 
-    stateless_pg = StatelessProcessGroup.create(ip, port, rank,
-                                                dist.get_world_size())
+    stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
 
     for pg in [dist.group.WORLD, stateless_pg]:
-        test_result = all(in_the_same_node_as(pg, source_rank=0))
-
-        expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-        assert test_result == expected, \
-            f"Expected {expected}, got {test_result}"
-        if pg == dist.group.WORLD:
-            print("Same node test passed! when using torch distributed!")
+        if os.environ.get("VLLM_TEST_WITH_DEFAULT_DEVICE_SET", "0") == "1":
+            default_devices = ["cpu"]
+            if torch.cuda.is_available():
+                default_devices.append("cuda")
+            for device in default_devices:
+                torch.set_default_device(device)
+                _run_test(pg)
         else:
-            print("Same node test passed! when using StatelessProcessGroup!")
+            _run_test(pg)
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index ded3d834faf004aea5d75ca278595e8a1f7ddd0c..f38c509775ed55fa0f0614f0205408370dcb6ac3 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -7,15 +7,19 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
  all workers in a node other than the head node, which can cause the test
  to fail.
 """
+
 import json
 import os
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
-from vllm.config import RunnerOption
+from vllm.config.compilation import CompilationMode
+from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -35,29 +39,16 @@ class ParallelSetup(NamedTuple):
 
 class SPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: Optional[str] = None
+    load_format: str | None = None
 
 
 @dataclass
 class SPTestSettings:
     parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
     distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
     runner: RunnerOption
     test_options: SPTestOptions
 
-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})")
-
     @staticmethod
     def detailed(
         *,
@@ -65,25 +56,28 @@ class SPTestSettings:
         pp_base: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for eager_mode_val in [False, True]:
             for pp_multiplier in [1, 2]:
                 for chunked_prefill_val in [False, True]:
                     parallel_setups.append(
-                        ParallelSetup(tp_size=tp_base,
-                                      pp_size=pp_multiplier * pp_base,
-                                      enable_fusion=False,
-                                      eager_mode=eager_mode_val,
-                                      chunked_prefill=chunked_prefill_val))
+                        ParallelSetup(
+                            tp_size=tp_base,
+                            pp_size=pp_multiplier * pp_base,
+                            enable_fusion=False,
+                            eager_mode=eager_mode_val,
+                            chunked_prefill=chunked_prefill_val,
+                        )
+                    )
         return SPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
             runner=runner,
-            test_options=SPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=SPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
         )
 
     @staticmethod
@@ -93,25 +87,28 @@ class SPTestSettings:
         pp_base: int = 1,
         runner: RunnerOption = "auto",
         multi_node_only: bool = False,
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for eager_mode_val in [False, True]:
             for pp_multiplier in [1, 2]:
                 for chunked_prefill_val in [False, True]:
                     parallel_setups.append(
-                        ParallelSetup(tp_size=tp_base,
-                                      pp_size=pp_multiplier * pp_base,
-                                      enable_fusion=False,
-                                      eager_mode=eager_mode_val,
-                                      chunked_prefill=chunked_prefill_val))
+                        ParallelSetup(
+                            tp_size=tp_base,
+                            pp_size=pp_multiplier * pp_base,
+                            enable_fusion=False,
+                            eager_mode=eager_mode_val,
+                            chunked_prefill=chunked_prefill_val,
+                        )
+                    )
         return SPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
             runner=runner,
-            test_options=SPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=SPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
         )
 
     @staticmethod
@@ -121,43 +118,51 @@ class SPTestSettings:
         pp_base: int = 1,
         runner: RunnerOption = "auto",
         multi_node_only: bool = False,
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for fusion_val in [False, True]:
             parallel_setups.append(
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              enable_fusion=fusion_val,
-                              eager_mode=True,
-                              chunked_prefill=False))
+                ParallelSetup(
+                    tp_size=tp_base,
+                    pp_size=pp_base,
+                    enable_fusion=fusion_val,
+                    eager_mode=True,
+                    chunked_prefill=False,
+                )
+            )
         return SPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
             runner=runner,
-            test_options=SPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=SPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
         )
 
     def iter_params(self, model_id: str):
         opts = self.test_options
 
         for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(self.distributed_backends,
-                                                   self.vllm_major_versions):
-                yield (model_id, parallel_setup, backend, vllm_major_version,
-                       self.runner, opts)
+            for backend in self.distributed_backends:
+                yield (
+                    model_id,
+                    parallel_setup,
+                    backend,
+                    self.runner,
+                    opts,
+                )
 
 
 def _compare_sp(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: SPTestOptions,
     num_gpus_available: int,
+    use_inductor_graph_partition: bool,
+    enable_async_tp: bool,
     *,
     method: Literal["generate", "encode"],
     is_multimodal: bool,
@@ -178,7 +183,7 @@ def _compare_sp(
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
-    skip_tokenizer_init = model_info.skip_tokenizer_init
+    require_embed_inputs = model_info.require_embed_inputs
 
     if load_format == "dummy":
         # Avoid OOM
@@ -200,8 +205,10 @@ def _compare_sp(
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
     if VLLM_MULTI_NODE and distributed_backend == "mp":
-        pytest.skip("Skipping multi-node pipeline parallel test for "
-                    "multiprocessing distributed backend")
+        pytest.skip(
+            "Skipping multi-node pipeline parallel test for "
+            "multiprocessing distributed backend"
+        )
     if multi_node_only and not VLLM_MULTI_NODE:
         pytest.skip("Not in multi-node setting")
 
@@ -228,22 +235,25 @@ def _compare_sp(
         common_args.extend(["--load-format", load_format])
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
-    if skip_tokenizer_init:
-        common_args.append("--skip-tokenizer-init")
+    if require_embed_inputs:
+        common_args.extend(
+            [
+                "--skip-tokenizer-init",
+                "--enable-prompt-embeds",
+                "--enable-mm-embeds",
+            ]
+        )
 
     compilation_config = {
-        'level': 3,
-        'custom_ops': ["+rms_norm"],
-        'compile_sizes': [4, 8],
-        'pass_config': {
-            'enable_sequence_parallelism': True,
-            'enable_fusion': enable_fusion,
-            'enable_noop': True,
+        "mode": CompilationMode.VLLM_COMPILE,
+        "compile_sizes": [4, 8],
+        "pass_config": {
+            "enable_sequence_parallelism": True,
+            "enable_async_tp": enable_async_tp,
+            "enable_fusion": enable_fusion,
+            "enable_noop": True,
         },
-    }
-
-    tp_sp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
+        "use_inductor_graph_partition": use_inductor_graph_partition,
     }
 
     tp_sp_args = [
@@ -258,9 +268,6 @@ def _compare_sp(
         json.dumps(compilation_config),
     ]
 
-    tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
     tp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -269,62 +276,71 @@ def _compare_sp(
         "mp",
     ]
 
-    try:
-        compare_two_settings(model_id,
-                             tp_sp_args,
-                             tp_args,
-                             tp_sp_env,
-                             tp_env,
-                             method=method)
-    except Exception:
-        testing_ray_compiled_graph = tp_sp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(model_id, tp_sp_args, tp_args, method=method)
 
 
 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
 }
 
 SP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
-    "meta-llama/Llama-3.2-1B-Instruct",
+    "hmellor/tiny-random-LlamaForCausalLM",
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]
 
 
 @pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "runner", "test_options"),
+    (
+        "model_id",
+        "parallel_setup",
+        "distributed_backend",
+        "runner",
+        "test_options",
+    ),
     [
-        params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
+        params
+        for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
         for params in settings.iter_params(model_id)
         if model_id in SP_TEST_MODELS
     ],
 )
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+@pytest.mark.parametrize("enable_async_tp", [False])  # TODO: enable async TP
 @create_new_process_for_each_test()
 def test_tp_sp_generation(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: SPTestOptions,
     num_gpus_available,
+    use_inductor_graph_partition: bool,
+    enable_async_tp: bool,
 ):
-    _compare_sp(model_id,
-                parallel_setup,
-                distributed_backend,
-                vllm_major_version,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate",
-                is_multimodal=False)
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
+    # Skip FP8 SP-only test on sm89 (compute capability 8.9)
+    if (
+        "fp8" in model_id.lower()
+        and current_platform.get_device_capability() < (9, 0)
+        and (not enable_async_tp)
+    ):
+        pytest.skip("FP8 reduction support begins with sm90 capable devices.")
+
+    _compare_sp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        use_inductor_graph_partition,
+        enable_async_tp=enable_async_tp,
+        method="generate",
+        is_multimodal=False,
+    )
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index e1357b4a34e99b7c3449fea15bca247799a5de4a..a7ace62e1b542a2b49b150d3c8ab744df66c3530 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -10,7 +10,8 @@ import torch.distributed as dist
 
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import get_open_port, update_environment_variables
+from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
 
 
 def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
@@ -26,13 +27,13 @@ def distributed_run(fn, world_size):
     processes = []
     for i in range(number_of_processes):
         env = {}
-        env['RANK'] = str(i)
-        env['LOCAL_RANK'] = str(i)
-        env['WORLD_SIZE'] = str(number_of_processes)
-        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
-        env['MASTER_ADDR'] = 'localhost'
-        env['MASTER_PORT'] = '12345'
-        p = multiprocessing.Process(target=fn, args=(env, ))
+        env["RANK"] = str(i)
+        env["LOCAL_RANK"] = str(i)
+        env["WORLD_SIZE"] = str(number_of_processes)
+        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
+        env["MASTER_ADDR"] = "localhost"
+        env["MASTER_PORT"] = "12345"
+        p = multiprocessing.Process(target=fn, args=(env,))
         processes.append(p)
         p.start()
 
@@ -57,25 +58,23 @@ def worker_fn_wrapper(fn):
 
 @worker_fn_wrapper
 def worker_fn():
-
     rank = dist.get_rank()
     if rank == 0:
         port = get_open_port()
-        ip = '127.0.0.1'
+        ip = "127.0.0.1"
         dist.broadcast_object_list([ip, port], src=0)
     else:
         recv = [None, None]
         dist.broadcast_object_list(recv, src=0)
         ip, port = recv  # type: ignore
 
-    stateless_pg = StatelessProcessGroup.create(ip, port, rank,
-                                                dist.get_world_size())
+    stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
 
     for pg in [dist.group.WORLD, stateless_pg]:
-
         writer_rank = 2
         broadcaster = MessageQueue.create_from_process_group(
-            pg, 40 * 1024, 2, writer_rank)
+            pg, 40 * 1024, 2, writer_rank
+        )
         if rank == writer_rank:
             seed = random.randint(0, 1000)
             dist.broadcast_object_list([seed], writer_rank)
diff --git a/tests/distributed/test_shm_buffer.py b/tests/distributed/test_shm_buffer.py
index f70028b8796090b6e19f6aff9836231b0020ded0..9fe409edc3ca206d44328e3b6169e74397961387 100644
--- a/tests/distributed/test_shm_buffer.py
+++ b/tests/distributed/test_shm_buffer.py
@@ -4,8 +4,11 @@
 import traceback
 import unittest
 
+import numpy as np
+
 from vllm.distributed.device_communicators.shm_object_storage import (
-    SingleWriterShmRingBuffer)
+    SingleWriterShmRingBuffer,
+)
 
 
 class TestSingleWriterShmRingBuffer(unittest.TestCase):
@@ -25,18 +28,21 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
         """Test opening an existing buffer"""
         # First create a buffer
         self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=self.buffer_size, create=True)
+            data_buffer_size=self.buffer_size, create=True
+        )
 
         # Then open it with another instance
         reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
         self.assertFalse(reader_buffer.is_writer)
-        self.assertEqual(reader_buffer.shared_memory.name,
-                         self.ring_buffer.shared_memory.name)
+        self.assertEqual(
+            reader_buffer.shared_memory.name, self.ring_buffer.shared_memory.name
+        )
 
     def test_buffer_access(self):
         """Test accessing allocated buffers"""
         self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=self.buffer_size, create=True)
+            data_buffer_size=self.buffer_size, create=True
+        )
 
         size = 100
         address, monotonic_id = self.ring_buffer.allocate_buf(size)
@@ -44,11 +50,11 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
         # Write some test data
         test_data = b"Hello, World!" * 7  # 91 bytes
         with self.ring_buffer.access_buf(address) as (data_buf, metadata):
-            data_buf[0:len(test_data)] = test_data
+            data_buf[0 : len(test_data)] = test_data
 
         # Read it back
         with self.ring_buffer.access_buf(address) as (data_buf2, metadata2):
-            read_data = bytes(data_buf2[0:len(test_data)])
+            read_data = bytes(data_buf2[0 : len(test_data)])
             read_id = metadata2[0]
 
         self.assertEqual(read_data, test_data)
@@ -58,7 +64,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
         """Test that MemoryError is raised when buffer is full"""
         small_buffer_size = 200
         self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=small_buffer_size, create=True)
+            data_buffer_size=small_buffer_size, create=True
+        )
 
         # Fill up the buffer
         self.ring_buffer.allocate_buf(100)
@@ -72,7 +79,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
         """Test allocation and freeing of buffers"""
         small_buffer_size = 200
         self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=small_buffer_size, create=True)
+            data_buffer_size=small_buffer_size, create=True
+        )
 
         size = 80
         # Write some data
@@ -81,7 +89,7 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
             address, monotonic_id = self.ring_buffer.allocate_buf(size)
             with self.ring_buffer.access_buf(address) as (data_buf, metadata):
                 data_buf[0:4] = (0).to_bytes(4, "little")  # 0 for not in-use
-                data_buf[4:len(test_data) + 4] = test_data
+                data_buf[4 : len(test_data) + 4] = test_data
             print(self.ring_buffer.metadata)
             freed_ids = self.ring_buffer.free_buf(lambda *args: True)
             print(f"  Freed IDs: {freed_ids}")
@@ -90,7 +98,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
     def test_clear_buffer(self):
         """Test clearing the buffer"""
         self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=self.buffer_size, create=True)
+            data_buffer_size=self.buffer_size, create=True
+        )
 
         # Allocate some buffers
         for _ in range(3):
@@ -106,6 +115,69 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
         self.assertEqual(self.ring_buffer.data_buffer_start, 0)
         self.assertEqual(self.ring_buffer.data_buffer_end, 0)
 
+    def test_allocation_cycles(self):
+        buffer_size = 100
+        ring = SingleWriterShmRingBuffer(data_buffer_size=buffer_size, create=True)
+
+        # tracking allocations for assertions
+        allocated_bitmap = np.zeros(
+            (buffer_size,), dtype=np.bool_
+        )  # addr -> is_allocated
+        allocation_map = dict()  # monotonic_id -> (addr, size)
+
+        def count_allocated(bitmap) -> int:
+            return np.sum(bitmap).item()
+
+        def is_free_fn(a, b) -> bool:
+            return True
+
+        def mark_allocated_with_assertion(id, addr, size):
+            addr = addr % buffer_size
+            self.assertEqual(count_allocated(allocated_bitmap[addr : addr + size]), 0)
+
+            allocated_bitmap[addr : addr + size] = True
+            allocation_map[id] = (addr, size)
+
+        def mark_freed_with_assertion(id):
+            self.assertTrue(id in allocation_map)
+
+            addr, size = allocation_map.pop(id)
+            addr = addr % buffer_size
+            self.assertEqual(
+                count_allocated(allocated_bitmap[addr : addr + size]), size
+            )
+
+            allocated_bitmap[addr : addr + size] = False
+
+        def ring_free(free_size=None):
+            freed_ids = ring.free_buf(is_free_fn, free_size)
+            for freed_id in freed_ids:
+                mark_freed_with_assertion(freed_id)
+
+        def ring_allocate(allocate_size):
+            allocate_size_with_md = allocate_size + ring.MD_SIZE
+            try:
+                addr, monotonic_id = ring.allocate_buf(allocate_size)
+                mark_allocated_with_assertion(monotonic_id, addr, allocate_size_with_md)
+            except MemoryError:
+                # free 2x size for enough space if wrapping happened
+                ring_free(allocate_size_with_md * 2)
+
+                # retry allocating
+                addr, monotonic_id = ring.allocate_buf(allocate_size)
+                mark_allocated_with_assertion(monotonic_id, addr, allocate_size_with_md)
+
+        # 1. allocation & free cycles
+        for _ in range(33):
+            # will consume 2 + 8 = 10 bytes per allocation
+            ring_allocate(2)
+
+        # 2. free all allocations
+        ring_free()
+
+        # 3. try allocate the largest possible buffer
+        ring_allocate(buffer_size - ring.MD_SIZE)
+
 
 def main():
     """Main function demonstrating usage and running tests"""
@@ -121,8 +193,7 @@ def main():
     # Manual demonstration
     try:
         print("Creating ring buffer...")
-        writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048,
-                                                  create=True)
+        writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048, create=True)
         reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle())
 
         print(f"Buffer created with name: {writer_buffer.shared_memory.name}")
@@ -140,7 +211,7 @@ def main():
                 # Write some test data
                 with writer_buffer.access_buf(address) as (data_buf, metadata):
                     test_message = f"Test message {i}".encode()
-                    data_buf[0:len(test_message)] = test_message
+                    data_buf[0 : len(test_message)] = test_message
 
             except MemoryError as e:
                 print(f"  Failed to allocate {size} bytes: {e}")
diff --git a/tests/distributed/test_shm_storage.py b/tests/distributed/test_shm_storage.py
index 03495222bc1b831032a97f9bea841ecdbcf4357f..b9a5c22447fd809c40aeb725faf2027c44704f11 100644
--- a/tests/distributed/test_shm_storage.py
+++ b/tests/distributed/test_shm_storage.py
@@ -12,28 +12,33 @@ import torch
 
 # Assuming these are imported from your module
 from vllm.distributed.device_communicators.shm_object_storage import (
-    MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer)
-from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
-                                    MultiModalSharedField)
+    MsgpackSerde,
+    SingleWriterShmObjectStorage,
+    SingleWriterShmRingBuffer,
+)
+from vllm.multimodal.inputs import (
+    MultiModalFieldElem,
+    MultiModalKwargsItem,
+    MultiModalSharedField,
+)
 
 
 def _dummy_elem(modality: str, key: str, size: int):
     return MultiModalFieldElem(
         modality=modality,
         key=key,
-        data=torch.empty((size, ), dtype=torch.int8),
+        data=torch.empty((size,), dtype=torch.int8),
         field=MultiModalSharedField(1),
     )
 
 
 def _dummy_item(modality: str, size_by_key: dict[str, int]):
-    return MultiModalKwargsItem.from_elems([
-        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
-    ])
+    return MultiModalKwargsItem.from_elems(
+        [_dummy_elem(modality, key, size) for key, size in size_by_key.items()]
+    )
 
 
 class TestSingleWriterShmObjectStorage(unittest.TestCase):
-
     def setUp(self):
         """Set up test fixtures before each test method."""
         ring_buffer = SingleWriterShmRingBuffer(
@@ -208,8 +213,7 @@ class TestSingleWriterShmObjectStorage(unittest.TestCase):
         with self.assertRaises(ValueError) as context:
             self.storage.get(address, monotonic_id + 100)
 
-        self.assertIn("has been modified or is invalid", \
-            str(context.exception))
+        self.assertIn("has been modified or is invalid", str(context.exception))
 
     def test_clear_storage(self):
         """Test clearing the storage."""
@@ -234,8 +238,7 @@ class TestSingleWriterShmObjectStorage(unittest.TestCase):
 # Reader process function
 def reader_process(process_id, storage_handle, items_to_read):
     """Reader process that connects to existing shared memory and reads data."""
-    reader_storage = SingleWriterShmObjectStorage.create_from_handle(
-        storage_handle)
+    reader_storage = SingleWriterShmObjectStorage.create_from_handle(storage_handle)
 
     print(f"Reader {process_id} started")
 
@@ -276,11 +279,7 @@ def run_multiprocess_example():
 
         # Test basic data types
         test_data = [
-            ("user_data", {
-                "name": "Alice",
-                "age": 30,
-                "scores": [95, 87, 92]
-            }),
+            ("user_data", {"name": "Alice", "age": 30, "scores": [95, 87, 92]}),
             ("simple_string", "Hello, World!"),
             ("number", 42),
             ("list_data", [1, 2, 3, "four", 5.0]),
@@ -301,8 +300,9 @@ def run_multiprocess_example():
         # initialize lock for reader processes
         handle.reader_lock = Lock()
         for i in range(storage.n_readers):
-            p = multiprocessing.Process(target=reader_process,
-                                        args=(i, handle, stored_items))
+            p = multiprocessing.Process(
+                target=reader_process, args=(i, handle, stored_items)
+            )
             processes.append(p)
             p.start()
 
diff --git a/tests/distributed/test_symm_mem_allreduce.py b/tests/distributed/test_symm_mem_allreduce.py
index 83e1fe47aeec0950dea587e97c608acd805f3131..b8f04cf8e62c1cd9d4751e482a86d90e4b0e200e 100644
--- a/tests/distributed/test_symm_mem_allreduce.py
+++ b/tests/distributed/test_symm_mem_allreduce.py
@@ -14,15 +14,16 @@ import vllm.envs as envs
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
-from vllm.distributed.device_communicators.cuda_communicator import (
-    CudaCommunicator)
-from vllm.distributed.parallel_state import (get_tp_group,
-                                             init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
+from vllm.distributed.parallel_state import (
+    get_tp_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 torch.manual_seed(42)
 random.seed(44)
@@ -32,8 +33,7 @@ test_size_elements = 1024 * 1024
 
 def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
     monkeypatch = pytest.MonkeyPatch()
-    config = VllmConfig(parallel_config=ParallelConfig(
-        tensor_parallel_size=world_size))
+    config = VllmConfig(parallel_config=ParallelConfig(tensor_parallel_size=world_size))
 
     with monkeypatch.context() as m, set_current_vllm_config(config):
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
@@ -42,34 +42,34 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
         torch.cuda.set_device(device)
         torch.set_default_device(device)
         torch.set_default_dtype(dtype)
-        update_environment_variables({
-            'RANK': str(local_rank),
-            'LOCAL_RANK': str(local_rank),
-            'WORLD_SIZE': str(world_size),
-            'MASTER_ADDR': 'localhost',
-            'MASTER_PORT': '12345',
-        })
+        update_environment_variables(
+            {
+                "RANK": str(local_rank),
+                "LOCAL_RANK": str(local_rank),
+                "WORLD_SIZE": str(world_size),
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "12345",
+            }
+        )
 
         init_distributed_environment()
         initialize_model_parallel(tensor_model_parallel_size=world_size)
 
-        cuda_communicator = typing.cast(CudaCommunicator,
-                                        get_tp_group().device_communicator)
+        cuda_communicator = typing.cast(
+            CudaCommunicator, get_tp_group().device_communicator
+        )
         symm_mem_comm = cuda_communicator.symm_mem_comm
         if symm_mem_comm is None or symm_mem_comm.disabled:
             # can't use skip under multiprocessing
             q.put("SymmMemCommunicator is not available or disabled.")
             return
 
-        inp_direct_symm_mem = torch.randint(1,
-                                            23, (test_size_elements, ),
-                                            dtype=dtype,
-                                            device=device)
+        inp_direct_symm_mem = torch.randint(
+            1, 23, (test_size_elements,), dtype=dtype, device=device
+        )
         if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
             # can't use skip under multiprocessing
-            q.put(
-                "SymmMemCommunicator isn't used for this world and input size."
-            )
+            q.put("SymmMemCommunicator isn't used for this world and input size.")
             return
 
         original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
@@ -78,42 +78,37 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
 
         group = get_tp_group().device_group
         dist.all_reduce(original_inp_direct_symm_mem, group=group)
-        torch.testing.assert_close(out_direct_symm_mem,
-                                   original_inp_direct_symm_mem,
-                                   atol=2.5,
-                                   rtol=0.1)
+        torch.testing.assert_close(
+            out_direct_symm_mem, original_inp_direct_symm_mem, atol=2.5, rtol=0.1
+        )
 
         # Test tensor_model_parallel_all_reduce which should use symm_mem
-        inp_tensor_parallel = torch.randint(-23,
-                                            1, (test_size_elements, ),
-                                            dtype=dtype,
-                                            device=device)
+        inp_tensor_parallel = torch.randint(
+            -23, 1, (test_size_elements,), dtype=dtype, device=device
+        )
         original_inp_tensor_parallel = inp_tensor_parallel.clone()
-        out_tensor_parallel = tensor_model_parallel_all_reduce(
-            inp_tensor_parallel)
+        out_tensor_parallel = tensor_model_parallel_all_reduce(inp_tensor_parallel)
         dist.all_reduce(original_inp_tensor_parallel, group=group)
-        torch.testing.assert_close(out_tensor_parallel,
-                                   original_inp_tensor_parallel,
-                                   atol=2.5,
-                                   rtol=0.1)
+        torch.testing.assert_close(
+            out_tensor_parallel, original_inp_tensor_parallel, atol=2.5, rtol=0.1
+        )
 
 
 @pytest.mark.skipif(
     not current_platform.is_cuda(),
-    reason="SymmMemAllreduce is only available for CUDA platforms.")
+    reason="SymmMemAllreduce is only available for CUDA platforms.",
+)
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
-def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
-                            pipeline_parallel_size):
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
+def test_symm_mem_allreduce(
+    monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size
+):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    q = mp.get_context('spawn').Queue()
-    mp.spawn(symm_mem_allreduce_worker,
-             args=(world_size, q),
-             nprocs=world_size)
+    q = mp.get_context("spawn").Queue()
+    mp.spawn(symm_mem_allreduce_worker, args=(world_size, q), nprocs=world_size)
     try:
         val = q.get(timeout=1)
     except queue.Empty:
@@ -126,18 +121,20 @@ def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
 
 @pytest.mark.skipif(
     not current_platform.is_cuda(),
-    reason="SymmMemAllreduce is only available for CUDA platforms.")
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
+    reason="SymmMemAllreduce is only available for CUDA platforms.",
+)
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
     world_size = 4
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
     # Verify that the DataParallel runs without error
-    engine_args = EngineArgs(model="distilbert/distilgpt2",
-                             enforce_eager=True,
-                             enable_prefix_caching=True,
-                             data_parallel_size=2,
-                             tensor_parallel_size=2,
-                             data_parallel_backend="mp")
+    engine_args = EngineArgs(
+        model="distilbert/distilgpt2",
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        data_parallel_size=2,
+        tensor_parallel_size=2,
+        data_parallel_backend="mp",
+    )
     LLMEngine.from_engine_args(engine_args)
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
index 9f2c3eaec3597978a12f833c9c889af57234fb5f..f415409d7b377b33263a6be6e370ba7e10b07507 100644
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -24,13 +24,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # set different `gpu_memory_utilization` and `swap_space` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
-llm = LLM(model="facebook/opt-125m",
-          tensor_parallel_size=2,
-          pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
-          distributed_executor_backend="external_launcher",
-          gpu_memory_utilization=random.uniform(0.7, 0.9),
-          swap_space=random.randint(1, 4),
-          seed=0)
+llm = LLM(
+    model="facebook/opt-125m",
+    tensor_parallel_size=2,
+    pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
+    distributed_executor_backend="external_launcher",
+    gpu_memory_utilization=random.uniform(0.7, 0.9),
+    swap_space=random.randint(1, 4),
+    seed=0,
+)
 
 outputs = llm.generate(prompts, sampling_params)
 
@@ -48,15 +50,14 @@ def test_consistent_across_ranks(obj):
         assert container[0] == obj
 
 
-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
 
 # make sure we can access the model parameters from the calling process
 # of the `LLM` instance.
-params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
-              model.parameters())
+params = list(
+    llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
+)
 test_consistent_across_ranks(len(params))
 
 # all ranks should have the same outputs
@@ -65,5 +66,4 @@ for output in outputs:
     generated_text = output.outputs[0].text
     test_consistent_across_ranks(prompt)
     test_consistent_across_ranks(generated_text)
-    print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
-          f"Generated text: {generated_text!r}")
+    print(f"Rank {torch_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py
index 2d6b930fcc07eacf1091db6a9fdb7dde412dd81a..1aa7f17935704086ac8225824ecf209650d38962 100644
--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -24,23 +24,22 @@ dp_rank = int(os.getenv("DP_RANK", "0"))
 
 if dp_size > 1:
     # distribute the prompts across the data parallel ranks
-    prompts = [
-        prompt for idx, prompt in enumerate(prompts)
-        if idx % dp_size == dp_rank
-    ]
+    prompts = [prompt for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank]
 
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # set different `gpu_memory_utilization` and `swap_space` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
-llm = LLM(model="microsoft/Phi-mini-MoE-instruct",
-          tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
-          pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
-          enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
-          distributed_executor_backend="external_launcher",
-          gpu_memory_utilization=random.uniform(0.7, 0.9),
-          swap_space=random.randint(1, 4),
-          seed=0)
+llm = LLM(
+    model="microsoft/Phi-mini-MoE-instruct",
+    tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
+    pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
+    enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
+    distributed_executor_backend="external_launcher",
+    gpu_memory_utilization=random.uniform(0.7, 0.9),
+    swap_space=random.randint(1, 4),
+    seed=0,
+)
 
 outputs = llm.generate(prompts, sampling_params)
 
@@ -54,21 +53,18 @@ def test_consistent_across_ranks(obj):
         dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group)
     else:
         container = [None]
-        dist.broadcast_object_list(container,
-                                   src=group.ranks[0],
-                                   group=cpu_group)
+        dist.broadcast_object_list(container, src=group.ranks[0], group=cpu_group)
         assert container[0] == obj
 
 
-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
 
 # make sure we can access the model parameters from the calling process
 # of the `LLM` instance.
-params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
-              model.parameters())
+params = list(
+    llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
+)
 test_consistent_across_ranks(len(params))
 
 # all ranks should have the same outputs
@@ -77,5 +73,4 @@ for output in outputs:
     generated_text = output.outputs[0].text
     test_consistent_across_ranks(prompt)
     test_consistent_across_ranks(generated_text)
-    print(f"Rank {group_rank}, Prompt: {prompt!r}, "
-          f"Generated text: {generated_text!r}")
+    print(f"Rank {group_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 0287ad94e388624fc811fcfb4d85f93bdb1f5bda..8289f697fea69b6517a796cce8f7a2a875085f5f 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -10,21 +10,20 @@ import torch
 import vllm.envs as envs
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import (cuda_device_count_stateless, get_open_port,
-                        update_environment_variables)
+from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 from ..utils import multi_gpu_test
 
 
 @ray.remote
 class _CUDADeviceCountStatelessTestActor:
-
     def get_count(self):
         return cuda_device_count_stateless()
 
     def set_cuda_visible_devices(self, cuda_visible_devices: str):
-        update_environment_variables(
-            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
 
     def get_cuda_visible_devices(self):
         return envs.CUDA_VISIBLE_DEVICES
@@ -34,10 +33,9 @@ def test_cuda_device_count_stateless():
     """Test that cuda_device_count_stateless changes return value if
     CUDA_VISIBLE_DEVICES is changed."""
     actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
-        num_gpus=2).remote()
-    assert len(
-        sorted(ray.get(
-            actor.get_cuda_visible_devices.remote()).split(","))) == 2
+        num_gpus=2
+    ).remote()
+    assert len(sorted(ray.get(actor.get_cuda_visible_devices.remote()).split(","))) == 2
     assert ray.get(actor.get_count.remote()) == 2
     ray.get(actor.set_cuda_visible_devices.remote("0"))
     assert ray.get(actor.get_count.remote()) == 1
@@ -46,15 +44,13 @@ def test_cuda_device_count_stateless():
 
 
 def cpu_worker(rank, WORLD_SIZE, port1, port2):
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
     if rank <= 2:
-        pg2 = StatelessProcessGroup.create(host="127.0.0.1",
-                                           port=port2,
-                                           rank=rank,
-                                           world_size=3)
+        pg2 = StatelessProcessGroup.create(
+            host="127.0.0.1", port=port2, rank=rank, world_size=3
+        )
     data = torch.tensor([rank])
     data = pg1.broadcast_obj(data, src=2)
     assert data.item() == 2
@@ -68,16 +64,14 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2):
 
 def gpu_worker(rank, WORLD_SIZE, port1, port2):
     torch.cuda.set_device(rank)
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
     pynccl1 = PyNcclCommunicator(pg1, device=rank)
     if rank <= 2:
-        pg2 = StatelessProcessGroup.create(host="127.0.0.1",
-                                           port=port2,
-                                           rank=rank,
-                                           world_size=3)
+        pg2 = StatelessProcessGroup.create(
+            host="127.0.0.1", port=port2, rank=rank, world_size=3
+        )
         pynccl2 = PyNcclCommunicator(pg2, device=rank)
     data = torch.tensor([rank]).cuda()
     pynccl1.all_reduce(data)
@@ -96,10 +90,9 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):
 
 
 def broadcast_worker(rank, WORLD_SIZE, port1, port2):
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
     if rank == 2:
         pg1.broadcast_obj("secret", src=2)
     else:
@@ -109,10 +102,9 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2):
 
 
 def allgather_worker(rank, WORLD_SIZE, port1, port2):
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
     data = pg1.all_gather_obj(rank)
     assert data == list(range(WORLD_SIZE))
     pg1.barrier()
@@ -121,7 +113,8 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2):
 @pytest.mark.skip(reason="This test is flaky and prone to hang.")
 @multi_gpu_test(num_gpus=4)
 @pytest.mark.parametrize(
-    "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
+    "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]
+)
 def test_stateless_process_group(worker):
     port1 = get_open_port()
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -129,12 +122,14 @@ def test_stateless_process_group(worker):
         port2 = get_open_port()
     WORLD_SIZE = 4
     from multiprocessing import get_context
+
     ctx = get_context("fork")
     processes = []
     for i in range(WORLD_SIZE):
         rank = i
         processes.append(
-            ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
+            ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))
+        )
     for p in processes:
         p.start()
     for p in processes:
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 33888f008f04b52b8fe724b0a324310ccb6b0292..472b1487ef440c8b911d9b12c43c202bfa890f92 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -5,27 +5,35 @@ import json
 from argparse import ArgumentError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import pytest
 
 from vllm.config import CompilationConfig, config
-from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
-                                   get_type, get_type_hints, is_not_builtin,
-                                   is_type, literal_to_kwargs, optional_type,
-                                   parse_type)
-from vllm.utils import FlexibleArgumentParser
-
-
-@pytest.mark.parametrize(("type", "value", "expected"), [
-    (int, "42", 42),
-    (float, "3.14", 3.14),
-    (str, "Hello World!", "Hello World!"),
-    (json.loads, '{"foo":1,"bar":2}', {
-        "foo": 1,
-        "bar": 2
-    }),
-])
+from vllm.engine.arg_utils import (
+    EngineArgs,
+    contains_type,
+    get_kwargs,
+    get_type,
+    get_type_hints,
+    is_not_builtin,
+    is_type,
+    literal_to_kwargs,
+    optional_type,
+    parse_type,
+)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+@pytest.mark.parametrize(
+    ("type", "value", "expected"),
+    [
+        (int, "42", 42),
+        (float, "3.14", 3.14),
+        (str, "Hello World!", "Hello World!"),
+        (json.loads, '{"foo":1,"bar":2}', {"foo": 1, "bar": 2}),
+    ],
+)
 def test_parse_type(type, value, expected):
     parse_type_func = parse_type(type)
     assert parse_type_func(value) == expected
@@ -37,50 +45,56 @@ def test_optional_type():
     assert optional_type_func("42") == 42
 
 
-@pytest.mark.parametrize(("type_hint", "type", "expected"), [
-    (int, int, True),
-    (int, float, False),
-    (list[int], list, True),
-    (list[int], tuple, False),
-    (Literal[0, 1], Literal, True),
-])
+@pytest.mark.parametrize(
+    ("type_hint", "type", "expected"),
+    [
+        (int, int, True),
+        (int, float, False),
+        (list[int], list, True),
+        (list[int], tuple, False),
+        (Literal[0, 1], Literal, True),
+    ],
+)
 def test_is_type(type_hint, type, expected):
     assert is_type(type_hint, type) == expected
 
 
-@pytest.mark.parametrize(("type_hints", "type", "expected"), [
-    ({float, int}, int, True),
-    ({int, tuple}, int, True),
-    ({int, tuple[int]}, int, True),
-    ({int, tuple[int, ...]}, int, True),
-    ({int, tuple[int]}, float, False),
-    ({int, tuple[int, ...]}, float, False),
-    ({str, Literal["x", "y"]}, Literal, True),
-])
+@pytest.mark.parametrize(
+    ("type_hints", "type", "expected"),
+    [
+        ({float, int}, int, True),
+        ({int, tuple}, int, True),
+        ({int, tuple[int]}, int, True),
+        ({int, tuple[int, ...]}, int, True),
+        ({int, tuple[int]}, float, False),
+        ({int, tuple[int, ...]}, float, False),
+        ({str, Literal["x", "y"]}, Literal, True),
+    ],
+)
 def test_contains_type(type_hints, type, expected):
     assert contains_type(type_hints, type) == expected
 
 
-@pytest.mark.parametrize(("type_hints", "type", "expected"), [
-    ({int, float}, int, int),
-    ({int, float}, str, None),
-    ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]),
-])
+@pytest.mark.parametrize(
+    ("type_hints", "type", "expected"),
+    [
+        ({int, float}, int, int),
+        ({int, float}, str, None),
+        ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]),
+    ],
+)
 def test_get_type(type_hints, type, expected):
     assert get_type(type_hints, type) == expected
 
 
-@pytest.mark.parametrize(("type_hints", "expected"), [
-    ({Literal[1, 2]}, {
-        "type": int,
-        "choices": [1, 2]
-    }),
-    ({str, Literal["x", "y"]}, {
-        "type": str,
-        "metavar": ["x", "y"]
-    }),
-    ({Literal[1, "a"]}, Exception),
-])
+@pytest.mark.parametrize(
+    ("type_hints", "expected"),
+    [
+        ({Literal[1, 2]}, {"type": int, "choices": [1, 2]}),
+        ({str, Literal["x", "y"]}, {"type": str, "metavar": ["x", "y"]}),
+        ({Literal[1, "a"]}, Exception),
+    ],
+)
 def test_literal_to_kwargs(type_hints, expected):
     context = nullcontext()
     if expected is Exception:
@@ -101,9 +115,9 @@ class NestedConfig:
 class DummyConfig:
     regular_bool: bool = True
     """Regular bool with default True"""
-    optional_bool: Optional[bool] = None
+    optional_bool: bool | None = None
     """Optional bool with default None"""
-    optional_literal: Optional[Literal["x", "y"]] = None
+    optional_literal: Literal["x", "y"] | None = None
     """Optional literal with default None"""
     tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
     """Tuple with variable length"""
@@ -113,8 +127,10 @@ class DummyConfig:
     """List with variable length"""
     list_literal: list[Literal[1, 2]] = field(default_factory=list)
     """List with literal choices"""
-    list_union: list[Union[str, type[object]]] = field(default_factory=list)
+    list_union: list[str | type[object]] = field(default_factory=list)
     """List with union type"""
+    set_n: set[int] = field(default_factory=lambda: {1, 2, 3})
+    """Set with variable length"""
     literal_literal: Literal[Literal[1], Literal[2]] = 1
     """Literal of literals with default 1"""
     json_tip: dict = field(default_factory=dict)
@@ -123,22 +139,27 @@ class DummyConfig:
     """Nested config"""
 
 
-@pytest.mark.parametrize(("type_hint", "expected"), [
-    (int, False),
-    (DummyConfig, True),
-])
+@pytest.mark.parametrize(
+    ("type_hint", "expected"),
+    [
+        (int, False),
+        (DummyConfig, True),
+    ],
+)
 def test_is_not_builtin(type_hint, expected):
     assert is_not_builtin(type_hint) == expected
 
 
 @pytest.mark.parametrize(
-    ("type_hint", "expected"), [
+    ("type_hint", "expected"),
+    [
         (Annotated[int, "annotation"], {int}),
-        (Optional[int], {int, type(None)}),
-        (Annotated[Optional[int], "annotation"], {int, type(None)}),
-        (Optional[Annotated[int, "annotation"]], {int, type(None)}),
+        (int | None, {int, type(None)}),
+        (Annotated[int | None, "annotation"], {int, type(None)}),
+        (Annotated[int, "annotation"] | None, {int, type(None)}),
     ],
-    ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"])
+    ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
+)
 def test_get_type_hints(type_hint, expected):
     assert get_type_hints(type_hint) == expected
 
@@ -165,6 +186,9 @@ def test_get_kwargs():
     # lists with unions should become str type.
     # If not, we cannot know which type to use for parsing
     assert kwargs["list_union"]["type"] is str
+    # sets should work like lists
+    assert kwargs["set_n"]["type"] is int
+    assert kwargs["set_n"]["nargs"] == "+"
     # literals of literals should have merged choices
     assert kwargs["literal_literal"]["choices"] == [1, 2]
     # dict should have json tip in help
@@ -178,24 +202,16 @@ def test_get_kwargs():
     ("arg", "expected"),
     [
         (None, dict()),
-        ('{"video": {"num_frames": 123} }', {
-            "video": {
-                "num_frames": 123
-            }
-        }),
+        ('{"video": {"num_frames": 123} }', {"video": {"num_frames": 123}}),
         (
             '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }',  # noqa
             {
-                "video": {
-                    "num_frames": 123,
-                    "fps": 1.0,
-                    "foo": "bar"
-                },
-                "image": {
-                    "foo": "bar"
-                }
-            }),
-    ])
+                "video": {"num_frames": 123, "fps": 1.0, "foo": "bar"},
+                "image": {"foo": "bar"},
+            },
+        ),
+    ],
+)
 def test_media_io_kwargs_parser(arg, expected):
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     if arg is None:
@@ -215,39 +231,47 @@ def test_compilation_config():
 
     # set to O3
     args = parser.parse_args(["-O0"])
-    assert args.compilation_config.level == 0
+    assert args.compilation_config.mode == 0
 
     # set to O 3 (space)
     args = parser.parse_args(["-O", "1"])
-    assert args.compilation_config.level == 1
+    assert args.compilation_config.mode == 1
 
     # set to O 3 (equals)
     args = parser.parse_args(["-O=2"])
-    assert args.compilation_config.level == 2
+    assert args.compilation_config.mode == 2
 
-    # set to O.level 3
-    args = parser.parse_args(["-O.level", "3"])
-    assert args.compilation_config.level == 3
+    # set to O.mode 3
+    args = parser.parse_args(["-O.mode", "3"])
+    assert args.compilation_config.mode == 3
 
     # set to string form of a dict
-    args = parser.parse_args([
-        "-O",
-        '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
-        '"use_inductor": false}',
-    ])
-    assert (args.compilation_config.level == 3 and
-            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
-            and not args.compilation_config.use_inductor)
+    args = parser.parse_args(
+        [
+            "-O",
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+            '"use_inductor": false}',
+        ]
+    )
+    assert (
+        args.compilation_config.mode == 3
+        and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
+        and not args.compilation_config.use_inductor
+    )
 
     # set to string form of a dict
-    args = parser.parse_args([
-        "--compilation-config="
-        '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
-        '"use_inductor": true}',
-    ])
-    assert (args.compilation_config.level == 3 and
-            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
-            and args.compilation_config.use_inductor)
+    args = parser.parse_args(
+        [
+            "--compilation-config="
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+            '"use_inductor": true}',
+        ]
+    )
+    assert (
+        args.compilation_config.mode == 3
+        and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
+        and args.compilation_config.use_inductor
+    )
 
 
 def test_prefix_cache_default():
@@ -255,8 +279,7 @@ def test_prefix_cache_default():
     args = parser.parse_args([])
 
     engine_args = EngineArgs.from_cli_args(args=args)
-    assert (not engine_args.enable_prefix_caching
-            ), "prefix caching defaults to off."
+    assert not engine_args.enable_prefix_caching, "prefix caching defaults to off."
 
     # with flag to turn it on.
     args = parser.parse_args(["--enable-prefix-caching"])
@@ -269,29 +292,15 @@ def test_prefix_cache_default():
     assert not engine_args.enable_prefix_caching
 
 
-# yapf: disable
-@pytest.mark.parametrize(("arg", "expected", "option"), [
-    (None, None, "mm-processor-kwargs"),
-    ("{}", {}, "mm-processor-kwargs"),
-    (
-        '{"num_crops": 4}',
-        {
-            "num_crops": 4
-        },
-        "mm-processor-kwargs"
-    ),
-    (
-        '{"foo": {"bar": "baz"}}',
-        {
-            "foo":
-            {
-                "bar": "baz"
-            }
-        },
-        "mm-processor-kwargs"
-    ),
-])
-# yapf: enable
+@pytest.mark.parametrize(
+    ("arg", "expected", "option"),
+    [
+        (None, None, "mm-processor-kwargs"),
+        ("{}", {}, "mm-processor-kwargs"),
+        ('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"),
+        ('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"),
+    ],
+)
 def test_composite_arg_parser(arg, expected, option):
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     if arg is None:
@@ -303,8 +312,7 @@ def test_composite_arg_parser(arg, expected, option):
 
 def test_human_readable_model_len():
     # `exit_on_error` disabled to test invalid values below
-    parser = EngineArgs.add_cli_args(
-        FlexibleArgumentParser(exit_on_error=False))
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False))
 
     args = parser.parse_args([])
     assert args.max_model_len is None
diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py
index 9eb3dfc09224e0c3cff49533dc2504556c741948..54a88586d8edd0d5259eb3e4c6919c57e3d95337 100644
--- a/tests/engine/test_short_mm_context.py
+++ b/tests/engine/test_short_mm_context.py
@@ -5,12 +5,12 @@ import pytest
 
 from ..conftest import IMAGE_ASSETS
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
-    "cherry_blossom":
-    "USER: <image>\nWhat is the season?\nASSISTANT:",
-})
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
+        "cherry_blossom": "USER: <image>\nWhat is the season?\nASSISTANT:",
+    }
+)
 
 models = ["llava-hf/llava-1.5-7b-hf"]
 
@@ -19,8 +19,7 @@ models = ["llava-hf/llava-1.5-7b-hf"]
 def test_context_length_too_short(vllm_runner, image_assets, model):
     images = [asset.pil_image for asset in image_assets]
 
-    with pytest.raises(ValueError,
-                       match="longer than the maximum model length"):
+    with pytest.raises(ValueError, match="longer than the maximum model length"):
         vllm_model = vllm_runner(
             model,
             max_model_len=128,  # LLaVA has a feature size of 576
@@ -29,6 +28,6 @@ def test_context_length_too_short(vllm_runner, image_assets, model):
         )
 
         with vllm_model:
-            vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
-                                       max_tokens=1,
-                                       images=[images[0]])
+            vllm_model.generate_greedy(
+                [HF_IMAGE_PROMPTS[0]], max_tokens=1, images=[images[0]]
+            )
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index da75806ccf4dec8e3a01495db8887d3f37e2e96e..a52e1cb7df33d2ebe47ed3c01c59550188f90d87 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -26,8 +26,10 @@ def sample_token_ids():
 
 @pytest.fixture
 def sample_regex():
-    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+    return (
+        r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+        r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
+    )
 
 
 @pytest.fixture
@@ -35,40 +37,27 @@ def sample_json_schema():
     return {
         "type": "object",
         "properties": {
-            "name": {
-                "type": "string"
-            },
-            "age": {
-                "type": "integer"
-            },
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
             "skills": {
                 "type": "array",
-                "items": {
-                    "type": "string",
-                    "maxLength": 10
-                },
-                "minItems": 3
+                "items": {"type": "string", "maxLength": 10},
+                "minItems": 3,
             },
             "work_history": {
                 "type": "array",
                 "items": {
                     "type": "object",
                     "properties": {
-                        "company": {
-                            "type": "string"
-                        },
-                        "duration": {
-                            "type": "number"
-                        },
-                        "position": {
-                            "type": "string"
-                        }
+                        "company": {"type": "string"},
+                        "duration": {"type": "number"},
+                        "position": {"type": "string"},
                     },
-                    "required": ["company", "position"]
-                }
-            }
+                    "required": ["company", "position"],
+                },
+            },
         },
-        "required": ["name", "age", "skills", "work_history"]
+        "required": ["name", "age", "skills", "work_history"],
     }
 
 
@@ -80,65 +69,54 @@ def sample_complex_json_schema():
             "score": {
                 "type": "integer",
                 "minimum": 0,
-                "maximum": 100  # Numeric range
+                "maximum": 100,  # Numeric range
             },
             "grade": {
                 "type": "string",
-                "pattern": "^[A-D]$"  # Regex pattern
+                "pattern": "^[A-D]$",  # Regex pattern
             },
             "email": {
                 "type": "string",
-                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
+                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
             },
             "tags": {
                 "type": "array",
                 "items": {
                     "type": "string",
-                    "pattern":
-                    "^[a-z]{1,10}$"  # Combining length and pattern restrictions
-                }
-            }
+                    # Combining length and pattern restrictions
+                    "pattern": "^[a-z]{1,10}$",
+                },
+            },
         },
-        "required": ["score", "grade", "email", "tags"]
+        "required": ["score", "grade", "email", "tags"],
     }
 
 
 @pytest.fixture
 def sample_definition_json_schema():
     return {
-        '$defs': {
-            'Step': {
-                'properties': {
-                    'explanation': {
-                        'title': 'Explanation',
-                        'type': 'string'
-                    },
-                    'output': {
-                        'title': 'Output',
-                        'type': 'string'
-                    }
+        "$defs": {
+            "Step": {
+                "properties": {
+                    "explanation": {"title": "Explanation", "type": "string"},
+                    "output": {"title": "Output", "type": "string"},
                 },
-                'required': ['explanation', 'output'],
-                'title': 'Step',
-                'type': 'object'
+                "required": ["explanation", "output"],
+                "title": "Step",
+                "type": "object",
             }
         },
-        'properties': {
-            'steps': {
-                'items': {
-                    '$ref': '#/$defs/Step'
-                },
-                'title': 'Steps',
-                'type': 'array'
+        "properties": {
+            "steps": {
+                "items": {"$ref": "#/$defs/Step"},
+                "title": "Steps",
+                "type": "array",
             },
-            'final_answer': {
-                'title': 'Final Answer',
-                'type': 'string'
-            }
+            "final_answer": {"title": "Final Answer", "type": "string"},
         },
-        'required': ['steps', 'final_answer'],
-        'title': 'MathReasoning',
-        'type': 'object'
+        "required": ["steps", "final_answer"],
+        "title": "MathReasoning",
+        "type": "object",
     }
 
 
@@ -149,62 +127,77 @@ def sample_enum_json_schema():
         "properties": {
             "status": {
                 "type": "string",
-                "enum": ["active", "inactive",
-                         "pending"]  # Literal values using enum
+                "enum": ["active", "inactive", "pending"],  # Literal values using enum
             },
             "priority": {
                 "type": "string",
-                "enum": ["low", "medium", "high", "critical"]
+                "enum": ["low", "medium", "high", "critical"],
             },
             "category": {
                 "type": "object",
                 "properties": {
                     "type": {
                         "type": "string",
-                        "enum": ["bug", "feature", "improvement"]
+                        "enum": ["bug", "feature", "improvement"],
                     },
                     "severity": {
                         "type": "integer",
-                        "enum": [1, 2, 3, 4,
-                                 5]  # Enum can also contain numbers
-                    }
+                        "enum": [1, 2, 3, 4, 5],  # Enum can also contain numbers
+                    },
                 },
-                "required": ["type", "severity"]
+                "required": ["type", "severity"],
             },
             "flags": {
                 "type": "array",
                 "items": {
                     "type": "string",
-                    "enum": ["urgent", "blocked", "needs_review", "approved"]
-                }
-            }
+                    "enum": ["urgent", "blocked", "needs_review", "approved"],
+                },
+            },
         },
-        "required": ["status", "priority", "category", "flags"]
+        "required": ["status", "priority", "category", "flags"],
     }
 
 
 @pytest.fixture
 def sample_structured_outputs_choices():
     return [
-        "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
-        "Ruby", "Swift", "Kotlin"
+        "Python",
+        "Java",
+        "JavaScript",
+        "C++",
+        "C#",
+        "PHP",
+        "TypeScript",
+        "Ruby",
+        "Swift",
+        "Kotlin",
     ]
 
 
 @pytest.fixture
 def sample_sql_statements():
-    return ("""
+    return """
 start: select_statement
 select_statement: "SELECT" column "from" table "where" condition
 column: "col_1" | "col_2"
 table: "table_1" | "table_2"
 condition: column "=" number
 number: "1" | "2"
-""")
+"""
 
 
 @pytest.fixture(scope="session")
 def zephyr_lora_files():
     """Download zephyr LoRA files once per test session."""
     from huggingface_hub import snapshot_download
+
     return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
+@pytest.fixture(scope="session")
+def opt125_lora_files() -> str:
+    """Download opt-125m LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="peft-internal-testing/opt-125m-dummy-lora")
diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py
index 5d605e906e81bf715c7f6a8df2b6735f5b464276..af607720c8b0ecd577acc309962034e6293e199a 100644
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -48,58 +48,47 @@ def run_test(model_name, more_args=None):
 
     measured_value = results["results"][TASK][FILTER]
     assert model_name in EXPECTED_VALUES, (
-        f"Cannot find the expected value for the model {model_name=}")
+        f"Cannot find the expected value for the model {model_name=}"
+    )
     expected_value = EXPECTED_VALUES[model_name]
-    assert (measured_value - RTOL < expected_value
-            and measured_value + RTOL > expected_value
-            ), f"Expected: {expected_value} |  Measured: {measured_value}"
+    assert (
+        measured_value - RTOL < expected_value
+        and measured_value + RTOL > expected_value
+    ), f"Expected: {expected_value} |  Measured: {measured_value}"
 
 
 # TODO: [AlexM] Fix it with new CI/CD tests
-TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
+TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"
 
 
-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu(),
-                    reason="V1 is currently only supported on CUDA and TPU")
 @pytest.mark.parametrize("model", MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine(model):
     """Run with the V1 Engine."""
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        more_args = None
-        if current_platform.is_tpu():
-            # Limit compilation time for TPU V1
+    more_args = None
+    if current_platform.is_tpu():
+        # Limit compilation time for TPU V1
 
-            more_args = "max_model_len=2048,max_num_seqs=64"
+        more_args = "max_model_len=2048,max_num_seqs=64"
 
-            # Add TP test (if provided)
-            if TPU_TP_TEST_STR:
-                more_args += ",{}".format(TPU_TP_TEST_STR)
+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)
 
-        run_test(model, more_args)
+    run_test(model, more_args)
 
 
-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu(),
-                    reason="V1 is currently only supported on CUDA and TPU")
 @pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-        model, monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
     """Run with the V1 Engine."""
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        more_args = None
-        if current_platform.is_tpu():
-            # Limit compilation time for TPU V1
-            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
+    more_args = None
+    if current_platform.is_tpu():
+        # Limit compilation time for TPU V1
+        more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
 
-            # Add TP test (if provided)
-            if TPU_TP_TEST_STR:
-                more_args += ",{}".format(TPU_TP_TEST_STR)
+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)
 
-        run_test(model, more_args)
+    run_test(model, more_args)
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index bf460d0fb25d3e02af8957cda75572d27fe99552..a9698632b82e0423322bb4b57456660c54d2104a 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -6,6 +6,7 @@ import pytest
 
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.sampling_params import SamplingParams
 
 from ..openai.test_vision import TEST_IMAGE_ASSETS
 
@@ -14,9 +15,30 @@ from ..openai.test_vision import TEST_IMAGE_ASSETS
 def text_llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)
+
+    yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.fixture(scope="function")
+def llm_for_failure_test():
+    """
+    Fixture for testing issue #26081.
+    Uses a small max_model_len to easily trigger length errors.
+    """
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        seed=0,
+        max_model_len=128,
+        disable_log_stats=True,
+    )
 
     yield weakref.proxy(llm)
 
@@ -28,14 +50,8 @@ def text_llm():
 def test_chat(text_llm):
     prompt1 = "Explain the concept of entropy."
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt1},
     ]
     outputs = text_llm.chat(messages)
     assert len(outputs) == 1
@@ -46,25 +62,13 @@ def test_multi_chat(text_llm):
     prompt2 = "Explain what among us is."
 
     conversation1 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt1},
     ]
 
     conversation2 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt2
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt2},
     ]
 
     messages = [conversation1, conversation2]
@@ -94,26 +98,22 @@ def vision_llm():
     cleanup_dist_env_and_memory()
 
 
-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
-                         indirect=True)
+@pytest.mark.parametrize(
+    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
+)
 def test_chat_multi_image(vision_llm, image_urls: list[str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "image_url", "image_url": {"url": image_url}}
+                    for image_url in image_urls
+                ),
+                {"type": "text", "text": "What's in this image?"},
+            ],
+        }
+    ]
     outputs = vision_llm.chat(messages)
     assert len(outputs) >= 0
 
@@ -124,14 +124,8 @@ def test_llm_chat_tokenization_no_double_bos(text_llm):
     Check we get a single BOS token for llama chat.
     """
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello!"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello!"},
     ]
     outputs = text_llm.chat(messages)
     assert len(outputs) == 1
@@ -167,14 +161,8 @@ def thinking_llm():
 @pytest.mark.parametrize("enable_thinking", [True, False])
 def test_chat_extra_kwargs(thinking_llm, enable_thinking):
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "What is 1+1?"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "What is 1+1?"},
     ]
 
     outputs = thinking_llm.chat(
@@ -193,3 +181,32 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking):
     else:
         # The chat template includes dummy thinking process
         assert think_id in prompt_token_ids
+
+
+def test_chat_batch_failure_cleanup(llm_for_failure_test):
+    """
+    Tests that if a batch call to llm.chat() fails mid-way
+    (e.g., due to one invalid prompt), the requests that
+    were already enqueued are properly aborted and do not
+    pollute the queue for subsequent calls.
+    (Fixes Issue #26081)
+    """
+    llm = llm_for_failure_test
+    valid_msg = [{"role": "user", "content": "Hello"}]
+    long_text = "This is a very long text to test the error " * 50
+    invalid_msg = [{"role": "user", "content": long_text}]
+    batch_1 = [
+        valid_msg,
+        valid_msg,
+        invalid_msg,
+    ]
+    batch_2 = [
+        valid_msg,
+        valid_msg,
+    ]
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    with pytest.raises(ValueError, match="longer than the maximum model length"):
+        llm.chat(batch_1, sampling_params=sampling_params)
+    outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
+    assert len(outputs_2) == len(batch_2)
+    assert llm.llm_engine.get_num_unfinished_requests() == 0
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 3a13f8c979f23b038da8ae25dca505b9389a016c..747676ac9567502f96b6a22df79286247a4b7ada 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+import torch
 
 from vllm import LLM
 
@@ -12,6 +13,8 @@ from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     if tp_size == 1 and backend == "ray":
         pytest.skip("Skip duplicate test case")
     if tp_size == 1:
@@ -23,9 +26,11 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
         return self.rank
 
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-              enforce_eager=True,
-              load_format="dummy",
-              tensor_parallel_size=tp_size,
-              distributed_executor_backend=backend)
+    llm = LLM(
+        model="hmellor/tiny-random-LlamaForCausalLM",
+        enforce_eager=True,
+        load_format="dummy",
+        tensor_parallel_size=tp_size,
+        distributed_executor_backend=backend,
+    )
     assert llm.collective_rpc(echo_rank) == list(range(tp_size))
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index e0ecb02d4f5639c62d3f27e69a41da7982ae0374..34465b7d27080b322118982a9109e25d3aa9f911 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -29,11 +29,13 @@ TOKEN_IDS = [
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.10,
-              enforce_eager=True)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,
+    )
 
     yield weakref.proxy(llm)
 
@@ -69,6 +71,26 @@ def test_multiple_sampling_params(llm: LLM):
     assert len(PROMPTS) == len(outputs)
 
 
+def test_multiple_priority(llm: LLM):
+    # Generate works when priority is None
+    outputs = llm.generate(PROMPTS, sampling_params=None, priority=None)
+    assert len(PROMPTS) == len(outputs)
+
+    # Generate works when length of priority is same as the len(PROMPTS)
+    outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * len(PROMPTS))
+    assert len(PROMPTS) == len(outputs)
+
+    # Exception raised, if the length of priority does not match the length of prompts
+    with pytest.raises(ValueError):
+        outputs = llm.generate(
+            PROMPTS, sampling_params=None, priority=[0] * (len(PROMPTS) - 1)
+        )
+
+    # Exception raised, if the priority list is empty
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, sampling_params=None, priority=[])
+
+
 def test_max_model_len():
     max_model_len = 20
     llm = LLM(
@@ -81,8 +103,22 @@ def test_max_model_len():
     outputs = llm.generate(PROMPTS, sampling_params)
     for output in outputs:
         num_total_tokens = len(output.prompt_token_ids) + len(
-            output.outputs[0].token_ids)
+            output.outputs[0].token_ids
+        )
         # Total tokens must not exceed max_model_len.
         # It can be less if generation finishes due to other reasons (e.g., EOS)
         # before reaching the absolute model length limit.
         assert num_total_tokens <= max_model_len
+
+
+def test_log_stats():
+    llm = LLM(
+        model=MODEL_NAME,
+        disable_log_stats=False,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    outputs = llm.generate(PROMPTS, sampling_params=None)
+
+    # disable_log_stats is False, every output should have metrics
+    assert all(output.metrics is not None for output in outputs)
diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py
index 533da9e6d6eac403fdadaf6dc79de3c479f60017..896091533ad29fbfb71915bf11d569447fd0d7dc 100644
--- a/tests/entrypoints/llm/test_gpu_utilization.py
+++ b/tests/entrypoints/llm/test_gpu_utilization.py
@@ -16,9 +16,8 @@ def test_gpu_memory_utilization():
     # makes sure gpu_memory_utilization is per-instance limit,
     # not a global limit
     llms = [
-        LLM(model="facebook/opt-125m",
-            gpu_memory_utilization=0.3,
-            enforce_eager=True) for i in range(3)
+        LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
+        for i in range(3)
     ]
     for llm in llms:
         outputs = llm.generate(prompts, sampling_params)
diff --git a/tests/entrypoints/llm/test_mm_cache_stats.py b/tests/entrypoints/llm/test_mm_cache_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ee99124409d0b060a36d98f31f865f6bb4ce77
--- /dev/null
+++ b/tests/entrypoints/llm/test_mm_cache_stats.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import logging
+
+import pytest
+import regex as re
+
+from vllm import LLM
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.v1.metrics import loggers as stat_loggers
+from vllm.v1.metrics.reader import Counter, Metric
+
+from ..openai.test_vision import TEST_IMAGE_ASSETS
+
+
+def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": image_url},
+                },
+            ],
+        }
+    ]
+
+
+def _get_counter_value(metrics: list[Metric], name: str):
+    metric = next(m for m in metrics if m.name == name)
+    assert isinstance(metric, Counter)
+    return metric.value
+
+
+def _get_mm_cache_stats(metrics: list[Metric]):
+    mm_cache_queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
+    mm_cache_hits = _get_counter_value(metrics, "vllm:mm_cache_hits")
+
+    return mm_cache_queries, mm_cache_hits
+
+
+def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
+    caplog_vllm.clear()
+    with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
+        llm.llm_engine.do_log_stats()
+
+    assert len(caplog_vllm.records) == 1
+    msg = caplog_vllm.records[0].getMessage()
+
+    assert "MM cache hit rate" in msg
+    match = re.search(r"MM cache hit rate: ([0-9.]+)%", msg)
+    assert match is not None
+    return float(match.group(1))
+
+
+@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
+@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
+def test_mm_cache_stats(
+    num_gpus_available,
+    image_urls,
+    mm_processor_cache_type,
+    caplog_vllm,
+):
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        mm_processor_cache_type=mm_processor_cache_type,
+        disable_log_stats=False,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    llm.chat(_make_messages(image_urls[0]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
+    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
+
+    llm.chat(_make_messages(image_urls[1]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
+    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
+
+    llm.chat(_make_messages(image_urls[0]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
+    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(33.3)
+
+    # NOTE: This only resets hit rate stats in CachingMetrics
+    # The raw queries and hits counts remain unaffected
+    llm.reset_mm_cache()
+
+    llm.chat(_make_messages(image_urls[0]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
+    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
+
+    llm.chat(_make_messages(image_urls[1]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
+    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py
index b219b33d1760e1c0611bb63e8bfcd750dc1eb396..c17486d962f342702256b80a31c110f97c84df26 100644
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -2,18 +2,33 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+import torch
 
 from vllm import LLM
 
 
 def test_empty_prompt():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
-    with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
+    with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
         llm.generate([""])
 
 
-@pytest.mark.skip_v1
 def test_out_of_vocab_token():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
-    with pytest.raises(ValueError, match='out of vocabulary'):
+    with pytest.raises(ValueError, match="out of vocabulary"):
         llm.generate({"prompt_token_ids": [999999]})
+
+
+def test_require_mm_embeds():
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        enforce_eager=True,
+        enable_mm_embeds=False,
+    )
+    with pytest.raises(ValueError, match="--enable-mm-embeds"):
+        llm.generate(
+            {
+                "prompt": "<image>",
+                "multi_modal_data": {"image": torch.empty(1, 1, 1)},
+            }
+        )
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index f8ed5dda260ff087ab3f72012f995033d30b0769..25e663f3af0eb2ae2ae1126f4070dad6e7bc867d 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for HF_HUB_OFFLINE mode"""
+
 import dataclasses
 import importlib
 import sys
@@ -91,12 +92,11 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
 
 
 def _re_import_modules():
-    hf_hub_module_names = [
-        k for k in sys.modules if k.startswith("huggingface_hub")
-    ]
+    hf_hub_module_names = [k for k in sys.modules if k.startswith("huggingface_hub")]
     transformers_module_names = [
-        k for k in sys.modules if k.startswith("transformers")
-        and not k.startswith("transformers_modules")
+        k
+        for k in sys.modules
+        if k.startswith("transformers") and not k.startswith("transformers_modules")
     ]
 
     reload_exception = None
diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py
index 0ecdd4245df43d4fc8f9b1091d5428c7abdfef64..b40079d8dc3d5d380208e4d26bd7708efb005e6f 100644
--- a/tests/entrypoints/openai/conftest.py
+++ b/tests/entrypoints/openai/conftest.py
@@ -7,14 +7,14 @@ from vllm.assets.audio import AudioAsset
 
 @pytest.fixture
 def mary_had_lamb():
-    path = AudioAsset('mary_had_lamb').get_local_path()
+    path = AudioAsset("mary_had_lamb").get_local_path()
     with open(str(path), "rb") as f:
         yield f
 
 
 @pytest.fixture
 def winning_call():
-    path = AudioAsset('winning_call').get_local_path()
+    path = AudioAsset("winning_call").get_local_path()
     with open(str(path), "rb") as f:
         yield f
 
@@ -22,6 +22,6 @@ def winning_call():
 @pytest.fixture
 def foscolo():
     # Test translation it->en
-    path = AudioAsset('azacinto_foscolo').get_local_path()
+    path = AudioAsset("azacinto_foscolo").get_local_path()
     with open(str(path), "rb") as f:
         yield f
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py
index 624acd5ffde730bf3294581e265d0bc19f8332dd..5b23b4239027956fb438b4927ac2a723d7218e9e 100644
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
 """
 
 import lm_eval
-import pytest
 
 from vllm.platforms import current_platform
 
@@ -44,14 +43,15 @@ def run_test(more_args):
     print(f"Running with: {args}")
 
     with RemoteOpenAIServer(
-            MODEL_NAME, args,
-            max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
+        MODEL_NAME, args, max_wait_seconds=MAX_WAIT_SECONDS
+    ) as remote_server:
         url = f"{remote_server.url_for('v1')}/completions"
 
         model_args = (
             f"model={MODEL_NAME},"
             f"base_url={url},"
-            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
+        )
 
         results = lm_eval.simple_evaluate(
             model="local-completions",
@@ -60,24 +60,19 @@ def run_test(more_args):
         )
 
         measured_value = results["results"][TASK][FILTER]
-        assert (measured_value - RTOL < EXPECTED_VALUE
-                and measured_value + RTOL > EXPECTED_VALUE
-                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+        assert (
+            measured_value - RTOL < EXPECTED_VALUE
+            and measured_value + RTOL > EXPECTED_VALUE
+        ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
 
 
-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu()
-                    and not current_platform.is_xpu(),
-                    reason="V1 currently only supported on CUDA, XPU and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine():
     """Run with the V1 Engine."""
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        more_args = []
+    more_args = []
 
-        # Limit compilation time for V1
-        if current_platform.is_tpu():
-            more_args = ["--max-num-seqs", "64"]
+    # Limit compilation time for V1
+    if current_platform.is_tpu():
+        more_args = ["--max-num-seqs", "64"]
 
-        run_test(more_args)
+    run_test(more_args)
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index 9122b7003bf9a3cb9e0baaaccab8abb7a0938e59..7821ade63ac389bf8baf4bd30cc2f35ce4ba508e 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -7,6 +7,7 @@ a baseline.
 This simulates real work usage of the API and makes sure that the frontend and
 AsyncLLMEngine are working correctly.
 """
+
 import asyncio
 import io
 import time
@@ -45,7 +46,8 @@ async def transcribe_audio(client, tokenizer, y, sr):
         # NOTE there's no streaming in transcriptions, can't measure ttft
     latency = end_time - start_time
     num_output_tokens = len(
-        tokenizer(transcription.text, add_special_tokens=False).input_ids)
+        tokenizer(transcription.text, add_special_tokens=False).input_ids
+    )
     return latency, num_output_tokens, transcription.text
 
 
@@ -73,8 +75,8 @@ async def process_dataset(model, client, data, concurrent_request):
     for sample in data:
         audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
         task = asyncio.create_task(
-            bound_transcribe(sem, client, tokenizer, (audio, sr),
-                             sample["text"]))
+            bound_transcribe(sem, client, tokenizer, (audio, sr), sample["text"])
+        )
         tasks.append(task)
     return await asyncio.gather(*tasks)
 
@@ -98,34 +100,35 @@ def print_performance_metrics(results, total_time):
 
 
 def add_duration(sample):
-    y, sr = sample['audio']["array"], sample['audio']["sampling_rate"]
-    sample['duration_ms'] = librosa.get_duration(y=y, sr=sr) * 1000
+    y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
+    sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000
     return sample
 
 
-def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs):
+def load_hf_dataset(dataset_repo: str, split="validation", **hf_kwargs):
     ## Load and filter the dataset
     dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
-    if 'duration_ms' not in dataset[0]:
+    if "duration_ms" not in dataset[0]:
         # compute duration to filter
         dataset = dataset.map(add_duration)
 
     # Whisper max supported duration
-    dataset = dataset.filter(lambda example: example['duration_ms'] < 30000)
+    dataset = dataset.filter(lambda example: example["duration_ms"] < 30000)
     return dataset
 
 
-def run_evaluation(model: str,
-                   client,
-                   dataset,
-                   max_concurrent_reqs: int,
-                   n_examples: int = -1,
-                   print_metrics: bool = True):
+def run_evaluation(
+    model: str,
+    client,
+    dataset,
+    max_concurrent_reqs: int,
+    n_examples: int = -1,
+    print_metrics: bool = True,
+):
     if n_examples > 0:
         dataset = dataset.select(range(n_examples))
     start = time.perf_counter()
-    results = asyncio.run(
-        process_dataset(model, client, dataset, max_concurrent_reqs))
+    results = asyncio.run(process_dataset(model, client, dataset, max_concurrent_reqs))
     end = time.perf_counter()
     total_time = end - start
     print(f"Total Test Time: {total_time:.4f} seconds")
@@ -135,8 +138,7 @@ def run_evaluation(model: str,
     predictions = [res[2] for res in results]
     references = [res[3] for res in results]
     wer = load("wer")
-    wer_score = 100 * wer.compute(references=references,
-                                  predictions=predictions)
+    wer_score = 100 * wer.compute(references=references, predictions=predictions)
     print("WER:", wer_score)
     return wer_score
 
@@ -145,26 +147,25 @@ def run_evaluation(model: str,
 @pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
 # Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
 @pytest.mark.parametrize(
-    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"])
+    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
+)
 # NOTE: Expected WER measured with equivalent hf.transformers args:
 # whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
 @pytest.mark.parametrize("expected_wer", [12.744980])
-def test_wer_correctness(model_name,
-                         dataset_repo,
-                         expected_wer,
-                         n_examples=-1,
-                         max_concurrent_request=None):
+def test_wer_correctness(
+    model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None
+):
     # TODO refactor to use `ASRDataset`
-    with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server:
+    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
         dataset = load_hf_dataset(dataset_repo)
 
         if not max_concurrent_request:
             # No max concurrency
-            max_concurrent_request = n_examples if n_examples > 0\
-                else len(dataset)
+            max_concurrent_request = n_examples if n_examples > 0 else len(dataset)
 
         client = remote_server.get_async_client()
-        wer = run_evaluation(model_name, client, dataset,
-                             max_concurrent_request, n_examples)
+        wer = run_evaluation(
+            model_name, client, dataset, max_concurrent_request, n_examples
+        )
         if expected_wer:
             torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py
index 80261597b11a8d5244f2e285168883816ceb818f..682420a83a442ee7a152dc039a6e43261bb6c12f 100644
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@@ -3,7 +3,7 @@
 
 import asyncio
 import random
-from typing import Callable
+from collections.abc import Callable
 
 import openai
 import pytest
@@ -44,15 +44,11 @@ async def client(server):
     ids=["completion", "chat"],
     argnames=["create_func_gen", "content_body"],
     argvalues=[
-        (lambda x: x.completions.create, {
-            "prompt": " ".join(['A'] * 10_000)
-        }),
-        (lambda x: x.chat.completions.create, {
-            "messages": [{
-                "role": "user",
-                "content": " ".join(['A'] * 10_000)
-            }]
-        }),
+        (lambda x: x.completions.create, {"prompt": " ".join(["A"] * 10_000)}),
+        (
+            lambda x: x.chat.completions.create,
+            {"messages": [{"role": "user", "content": " ".join(["A"] * 10_000)}]},
+        ),
     ],
 )
 async def test_with_and_without_truncate(
@@ -65,15 +61,15 @@ async def test_with_and_without_truncate(
     body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}
 
     num_requests = 10
-    truncate_prompt_tokens = ([1000] * (num_requests // 2) + [None] *
-                              (num_requests - num_requests // 2))
+    truncate_prompt_tokens = [1000] * (num_requests // 2) + [None] * (
+        num_requests - num_requests // 2
+    )
     random.shuffle(truncate_prompt_tokens)
 
-    bodies = [{
-        **body, "extra_body": {
-            'truncate_prompt_tokens': t
-        }
-    } for t in truncate_prompt_tokens]
+    bodies = [
+        {**body, "extra_body": {"truncate_prompt_tokens": t}}
+        for t in truncate_prompt_tokens
+    ]
 
     async def get_status_code(**kwargs):
         try:
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 2d33d3c3a6b5431f3c30029f5fdbad67b51ed89b..a2d8993441fcd81d8d46038786d5a899a6073fce 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -53,27 +53,34 @@ def base64_encoded_audio() -> dict[str, str]:
     }
 
 
+def dummy_messages_from_audio_url(
+    audio_urls: str | list[str],
+    content_text: str = "What's happening in this audio?",
+):
+    if isinstance(audio_urls, str):
+        audio_urls = [audio_urls]
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "audio_url", "audio_url": {"url": audio_url}}
+                    for audio_url in audio_urls
+                ),
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
-async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
-                                         model_name: str, audio_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+async def test_single_chat_session_audio(
+    client: openai.AsyncOpenAI, model_name: str, audio_url: str
+):
+    messages = dummy_messages_from_audio_url(audio_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -82,13 +89,15 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         temperature=0.0,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+        completion_tokens=10, prompt_tokens=202, total_tokens=212
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -110,56 +119,41 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
-async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI,
-                                               model_name: str,
-                                               audio_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "audio_url",
-                "audio_url": audio_url
-            },
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+async def test_error_on_invalid_audio_url_type(
+    client: openai.AsyncOpenAI, model_name: str, audio_url: str
+):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio_url", "audio_url": audio_url},
+                {"type": "text", "text": "What's happening in this audio?"},
+            ],
+        }
+    ]
 
     # audio_url should be a dict {"url": "some url"}, not directly a string
     with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(model=model_name,
-                                                 messages=messages,
-                                                 max_completion_tokens=10,
-                                                 temperature=0.0)
+        _ = await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_completion_tokens=10,
+            temperature=0.0,
+        )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
 async def test_single_chat_session_audio_base64encoded(
-        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
-        base64_encoded_audio: dict[str, str]):
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url":
-                    f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    audio_url: str,
+    base64_encoded_audio: dict[str, str],
+):
+    messages = dummy_messages_from_audio_url(
+        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
+    )
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -168,13 +162,15 @@ async def test_single_chat_session_audio_base64encoded(
         max_completion_tokens=10,
         logprobs=True,
         temperature=0.0,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+        completion_tokens=10, prompt_tokens=202, total_tokens=212
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -198,25 +194,26 @@ async def test_single_chat_session_audio_base64encoded(
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
 async def test_single_chat_session_input_audio(
-        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
-        base64_encoded_audio: dict[str, str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "input_audio",
-                "input_audio": {
-                    "data": base64_encoded_audio[audio_url],
-                    "format": "wav"
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    audio_url: str,
+    base64_encoded_audio: dict[str, str],
+):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": base64_encoded_audio[audio_url],
+                        "format": "wav",
+                    },
+                },
+                {"type": "text", "text": "What's happening in this audio?"},
+            ],
+        }
+    ]
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -224,13 +221,15 @@ async def test_single_chat_session_input_audio(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+        completion_tokens=10, prompt_tokens=202, total_tokens=212
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -252,24 +251,10 @@ async def test_single_chat_session_input_audio(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
-async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
-                                    model_name: str, audio_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+async def test_chat_streaming_audio(
+    client: openai.AsyncOpenAI, model_name: str, audio_url: str
+):
+    messages = dummy_messages_from_audio_url(audio_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -309,27 +294,27 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
-async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
-                                          model_name: str, audio_url: str,
-                                          base64_encoded_audio: dict[str,
-                                                                     str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "input_audio",
-                "input_audio": {
-                    "data": base64_encoded_audio[audio_url],
-                    "format": "wav"
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+async def test_chat_streaming_input_audio(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    audio_url: str,
+    base64_encoded_audio: dict[str, str],
+):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": base64_encoded_audio[audio_url],
+                        "format": "wav",
+                    },
+                },
+                {"type": "text", "text": "What's happening in this audio?"},
+            ],
+        }
+    ]
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -369,26 +354,12 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
-    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]])
-async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_urls: list[str]):
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                }
-            } for audio_url in audio_urls),
-            {
-                "type": "text",
-                "text": "What's happening in this audio?"
-            },
-        ],
-    }]
+    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]
+)
+async def test_multi_audio_input(
+    client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
+):
+    messages = dummy_messages_from_audio_url(audio_urls)
 
     if len(audio_urls) > MAXIMUM_AUDIOS:
         with pytest.raises(openai.BadRequestError):  # test multi-audio input
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index a55941976cd826acf41586c9dcfda3bfba6bd0bc..e63a6f10cbc7f6af8a126e2d8af0e25e449cb517 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -3,12 +3,15 @@
 
 import asyncio
 from http import HTTPStatus
+from unittest.mock import AsyncMock, Mock
 
 import openai
 import pytest
 import pytest_asyncio
 import requests
+from fastapi import Request
 
+from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.version import __version__ as VLLM_VERSION
 
 from ...utils import RemoteOpenAIServer
@@ -16,9 +19,9 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def server_args(request: pytest.FixtureRequest) -> list[str]:
-    """ Provide extra arguments to the server via indirect parametrization
+    """Provide extra arguments to the server via indirect parametrization
 
     Usage:
 
@@ -80,8 +83,10 @@ async def client(server):
     "server_args",
     [
         pytest.param([], id="default-frontend-multiprocessing"),
-        pytest.param(["--disable-frontend-multiprocessing"],
-                     id="disable-frontend-multiprocessing")
+        pytest.param(
+            ["--disable-frontend-multiprocessing"],
+            id="disable-frontend-multiprocessing",
+        ),
     ],
     indirect=True,
 )
@@ -97,8 +102,10 @@ async def test_show_version(server: RemoteOpenAIServer):
     "server_args",
     [
         pytest.param([], id="default-frontend-multiprocessing"),
-        pytest.param(["--disable-frontend-multiprocessing"],
-                     id="disable-frontend-multiprocessing")
+        pytest.param(
+            ["--disable-frontend-multiprocessing"],
+            id="disable-frontend-multiprocessing",
+        ),
     ],
     indirect=True,
 )
@@ -112,11 +119,13 @@ async def test_check_health(server: RemoteOpenAIServer):
 @pytest.mark.parametrize(
     "server_args",
     [
-        pytest.param(["--max-model-len", "10100"],
-                     id="default-frontend-multiprocessing"),
+        pytest.param(
+            ["--max-model-len", "10100"], id="default-frontend-multiprocessing"
+        ),
         pytest.param(
             ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
-            id="disable-frontend-multiprocessing")
+            id="disable-frontend-multiprocessing",
+        ),
     ],
     indirect=True,
 )
@@ -131,14 +140,16 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
     # Request about 2 million tokens
     for _ in range(200):
         task = asyncio.create_task(
-            client.chat.completions.create(messages=chat_input,
-                                           model=MODEL_NAME,
-                                           max_tokens=10000,
-                                           extra_body={"min_tokens": 10000}))
+            client.chat.completions.create(
+                messages=chat_input,
+                model=MODEL_NAME,
+                max_tokens=10000,
+                extra_body={"min_tokens": 10000},
+            )
+        )
         tasks.append(task)
 
-    done, pending = await asyncio.wait(tasks,
-                                       return_when=asyncio.ALL_COMPLETED)
+    done, pending = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
 
     # Make sure all requests were sent to the server and timed out
     # (We don't want to hide other errors like 400s that would invalidate this
@@ -151,16 +162,15 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
     # If the server had not cancelled all the other requests, then it would not
     # be able to respond to this one within the timeout
     client = server.get_async_client(timeout=5)
-    response = await client.chat.completions.create(messages=chat_input,
-                                                    model=MODEL_NAME,
-                                                    max_tokens=10)
+    response = await client.chat.completions.create(
+        messages=chat_input, model=MODEL_NAME, max_tokens=10
+    )
 
     assert len(response.choices) == 1
 
 
 @pytest.mark.asyncio
 async def test_request_wrong_content_type(server: RemoteOpenAIServer):
-
     chat_input = [{"role": "user", "content": "Write a long story"}]
     client = server.get_async_client()
 
@@ -169,17 +179,13 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
             messages=chat_input,
             model=MODEL_NAME,
             max_tokens=10000,
-            extra_headers={
-                "Content-Type": "application/x-www-form-urlencoded"
-            })
+            extra_headers={"Content-Type": "application/x-www-form-urlencoded"},
+        )
 
 
 @pytest.mark.parametrize(
     "server_args",
-    [
-        pytest.param(["--enable-server-load-tracking"],
-                     id="enable-server-load-tracking")
-    ],
+    [pytest.param(["--enable-server-load-tracking"], id="enable-server-load-tracking")],
     indirect=True,
 )
 @pytest.mark.asyncio
@@ -202,7 +208,8 @@ async def test_server_load(server: RemoteOpenAIServer):
 
     # Start the completion request in a background thread.
     completion_future = asyncio.create_task(
-        asyncio.to_thread(make_long_completion_request))
+        asyncio.to_thread(make_long_completion_request)
+    )
 
     # Give a short delay to ensure the request has started.
     await asyncio.sleep(0.1)
@@ -220,3 +227,24 @@ async def test_server_load(server: RemoteOpenAIServer):
     response = requests.get(server.url_for("load"))
     assert response.status_code == HTTPStatus.OK
     assert response.json().get("server_load") == 0
+
+
+@pytest.mark.asyncio
+async def test_health_check_engine_dead_error():
+    # Import the health function directly to test it in isolation
+    from vllm.entrypoints.openai.api_server import health
+
+    # Create a mock request that simulates what FastAPI would provide
+    mock_request = Mock(spec=Request)
+    mock_app_state = Mock()
+    mock_engine_client = AsyncMock()
+    mock_engine_client.check_health.side_effect = EngineDeadError()
+    mock_app_state.engine_client = mock_engine_client
+    mock_request.app.state = mock_app_state
+
+    # Test the health function directly with our mocked request
+    # This simulates what would happen if the engine dies
+    response = await health(mock_request)
+
+    # Assert that it returns 503 Service Unavailable
+    assert response.status_code == 503
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 3bdfef7b4adbce67e534de2db7a9b347e331b5a4..d25958f602b39650f2d47d2cba958a75203389ba 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -3,7 +3,6 @@
 
 # imports for structured outputs tests
 import json
-from typing import Optional
 
 import jsonschema
 import openai  # use the official client for correctness check
@@ -21,17 +20,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
 @pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module")
-def server(monkeypatch_module, zephyr_lora_files):  #noqa: F811
-    monkeypatch_module.setenv('VLLM_USE_V1', '1')
-
+def server(zephyr_lora_files):  # noqa: F811
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -68,20 +57,18 @@ async def client(server):
     [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
         max_completion_tokens=5,
         temperature=0.0,
-        logprobs=False)
+        logprobs=False,
+    )
 
     choice = chat_completion.choices[0]
     assert choice.logprobs is None
@@ -94,13 +81,10 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     [MODEL_NAME, "zephyr-lora"],
 )
 async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -108,7 +92,8 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
         max_completion_tokens=5,
         temperature=0.0,
         logprobs=True,
-        top_logprobs=0)
+        top_logprobs=0,
+    )
 
     choice = chat_completion.choices[0]
     assert choice.logprobs is not None
@@ -122,13 +107,10 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     [MODEL_NAME, "zephyr-lora"],
 )
 async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -136,7 +118,8 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
         max_completion_tokens=5,
         temperature=0.0,
         logprobs=True,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
 
     choice = chat_completion.choices[0]
     assert choice.logprobs is not None
@@ -149,41 +132,39 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
-                                      model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     # Default max_logprobs is 20, so this should raise an error
     with pytest.raises((openai.BadRequestError, openai.APIError)):
-        stream = await client.chat.completions.create(model=model_name,
-                                                      messages=messages,
-                                                      max_completion_tokens=10,
-                                                      logprobs=True,
-                                                      top_logprobs=21,
-                                                      stream=True)
+        stream = await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_completion_tokens=10,
+            logprobs=True,
+            top_logprobs=21,
+            stream=True,
+        )
         async for chunk in stream:
             ...
 
     with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(model=model_name,
-                                             messages=messages,
-                                             max_completion_tokens=10,
-                                             logprobs=True,
-                                             top_logprobs=30,
-                                             stream=False)
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_completion_tokens=10,
+            logprobs=True,
+            top_logprobs=30,
+            stream=False,
+        )
 
     # the server should still work afterwards
     chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        stream=False)
+        model=model_name, messages=messages, max_completion_tokens=10, stream=False
+    )
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0
 
@@ -193,27 +174,20 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
     "model_name, prompt_logprobs",
     [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
 )
-async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
-                                    model_name: str,
-                                    prompt_logprobs: Optional[int]):
+async def test_prompt_logprobs_chat(
+    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
+):
     params: dict = {
-        "messages": [{
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }, {
-            "role": "user",
-            "content": "Who won the world series in 2020?"
-        }, {
-            "role":
-            "assistant",
-            "content":
-            "The Los Angeles Dodgers won the World Series in 2020."
-        }, {
-            "role": "user",
-            "content": "Where was it played?"
-        }],
-        "model":
-        model_name
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Who won the world series in 2020?"},
+            {
+                "role": "assistant",
+                "content": "The Los Angeles Dodgers won the World Series in 2020.",
+            },
+            {"role": "user", "content": "Where was it played?"},
+        ],
+        "model": model_name,
     }
 
     if prompt_logprobs is not None:
@@ -236,29 +210,21 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME],
 )
-async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
-                                                  model_name: str):
+async def test_more_than_one_prompt_logprobs_chat(
+    client: openai.AsyncOpenAI, model_name: str
+):
     params: dict = {
-        "messages": [{
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }, {
-            "role": "user",
-            "content": "Who won the world series in 2020?"
-        }, {
-            "role":
-            "assistant",
-            "content":
-            "The Los Angeles Dodgers won the World Series in 2020."
-        }, {
-            "role": "user",
-            "content": "Where was it played?"
-        }],
-        "model":
-        model_name,
-        "extra_body": {
-            "prompt_logprobs": 1
-        }
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Who won the world series in 2020?"},
+            {
+                "role": "assistant",
+                "content": "The Los Angeles Dodgers won the World Series in 2020.",
+            },
+            {"role": "user", "content": "Where was it played?"},
+        ],
+        "model": model_name,
+        "extra_body": {"prompt_logprobs": 1},
     }
 
     completion_1 = await client.chat.completions.create(**params)
@@ -275,15 +241,11 @@ async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_single_chat_session(client: openai.AsyncOpenAI,
-                                   model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -291,14 +253,16 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert chat_completion.id is not None
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=37, total_tokens=47)
+        completion_tokens=10, prompt_tokens=37, total_tokens=47
+    )
 
     message = choice.message
     assert message.content is not None and len(message.content) >= 10
@@ -323,13 +287,10 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
     [MODEL_NAME, "zephyr-lora"],
 )
 async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -371,15 +332,13 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
     "model_name",
     ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
 )
-async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
-                                              model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?"
-    }]
+async def test_chat_completion_stream_options(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
 
     # Test stream=True, stream_options={"include_usage": False}
     stream = await client.chat.completions.create(
@@ -388,36 +347,34 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         temperature=0.0,
         stream=True,
-        stream_options={"include_usage": False})
+        stream_options={"include_usage": False},
+    )
     async for chunk in stream:
         assert chunk.usage is None
 
     # Test stream=True, stream_options={"include_usage": True,
     #                                   "continuous_usage_stats": False}}
-    stream = await client.chat.completions.create(model=model_name,
-                                                  messages=messages,
-                                                  max_completion_tokens=10,
-                                                  temperature=0.0,
-                                                  stream=True,
-                                                  stream_options={
-                                                      "include_usage":
-                                                      True,
-                                                      "continuous_usage_stats":
-                                                      False
-                                                  })
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True, "continuous_usage_stats": False},
+    )
 
     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:
             assert chunk.usage is None
         else:
             assert chunk.usage is None
-            final_chunk = await stream.__anext__()
+            final_chunk = await anext(stream)
             assert final_chunk.usage is not None
             assert final_chunk.usage.prompt_tokens > 0
             assert final_chunk.usage.completion_tokens > 0
             assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
+                final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
+            )
             assert final_chunk.choices == []
 
     # Test stream=False, stream_options={"include_usage": None}
@@ -428,7 +385,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
             max_completion_tokens=10,
             temperature=0.0,
             stream=False,
-            stream_options={"include_usage": None})
+            stream_options={"include_usage": None},
+        )
 
     # Test stream=False, stream_options={"include_usage": True}
     with pytest.raises(BadRequestError):
@@ -438,7 +396,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
             max_completion_tokens=10,
             temperature=0.0,
             stream=False,
-            stream_options={"include_usage": True})
+            stream_options={"include_usage": True},
+        )
 
     # Test stream=True, stream_options={"include_usage": True,
     #                           "continuous_usage_stats": True}
@@ -457,14 +416,17 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     last_completion_tokens = 0
     async for chunk in stream:
         assert chunk.usage.prompt_tokens >= 0
-        assert last_completion_tokens == 0 or \
-               chunk.usage.completion_tokens > last_completion_tokens or \
-               (
-                   not chunk.choices and
-                   chunk.usage.completion_tokens == last_completion_tokens
-               )
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
+        assert (
+            last_completion_tokens == 0
+            or chunk.usage.completion_tokens > last_completion_tokens
+            or (
+                not chunk.choices
+                and chunk.usage.completion_tokens == last_completion_tokens
+            )
+        )
+        assert chunk.usage.total_tokens == (
+            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+        )
         last_completion_tokens = chunk.usage.completion_tokens
 
     assert last_completion_tokens == 10
@@ -475,37 +437,36 @@ async def test_structured_outputs_choice_chat(
     client: openai.AsyncOpenAI,
     sample_structured_outputs_choices,
 ):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        "The best language for type-safe systems programming is "
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": "The best language for type-safe systems programming is ",
+        },
+    ]
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
         extra_body=dict(
-            structured_outputs={"choice": sample_structured_outputs_choices}))
+            structured_outputs={"choice": sample_structured_outputs_choices}
+        ),
+    )
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in sample_structured_outputs_choices
 
     messages.append({"role": "assistant", "content": choice1})
-    messages.append({
-        "role": "user",
-        "content": "I disagree, pick another one"
-    })
+    messages.append({"role": "user", "content": "I disagree, pick another one"})
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
         extra_body=dict(
-            structured_outputs={"choice": sample_structured_outputs_choices}))
+            structured_outputs={"choice": sample_structured_outputs_choices}
+        ),
+    )
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in sample_structured_outputs_choices
     assert choice1 != choice2
@@ -516,38 +477,35 @@ async def test_structured_outputs_json_chat(
     client: openai.AsyncOpenAI,
     sample_json_schema,
 ):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {sample_json_schema}"
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": f"Give an example JSON for an employee profile that "
+            f"fits this schema: {sample_json_schema}",
+        },
+    ]
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(structured_outputs={"json": sample_json_schema}))
+        extra_body=dict(structured_outputs={"json": sample_json_schema}),
+    )
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
     jsonschema.validate(instance=json1, schema=sample_json_schema)
 
     messages.append({"role": "assistant", "content": message.content})
-    messages.append({
-        "role":
-        "user",
-        "content":
-        "Give me another one with a different name and age"
-    })
+    messages.append(
+        {"role": "user", "content": "Give me another one with a different name and age"}
+    )
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(structured_outputs={"json": sample_json_schema}))
+        extra_body=dict(structured_outputs={"json": sample_json_schema}),
+    )
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -561,21 +519,19 @@ async def test_structured_outputs_regex_chat(
     client: openai.AsyncOpenAI,
     sample_regex,
 ):
-
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example IP address with this regex: {sample_regex}"
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": f"Give an example IP address with this regex: {sample_regex}",
+        },
+    ]
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(structured_outputs={"regex": sample_regex}))
+        extra_body=dict(structured_outputs={"regex": sample_regex}),
+    )
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -586,7 +542,8 @@ async def test_structured_outputs_regex_chat(
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(structured_outputs={"regex": sample_regex}))
+        extra_body=dict(structured_outputs={"regex": sample_regex}),
+    )
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -595,40 +552,33 @@ async def test_structured_outputs_regex_chat(
 
 @pytest.mark.asyncio
 async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        "The best language for type-safe systems programming is "
-    }]
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": "The best language for type-safe systems programming is ",
+        },
+    ]
 
     with pytest.raises(openai.BadRequestError):
         _ = await client.chat.completions.create(
             model=MODEL_NAME,
             messages=messages,
-            extra_body=dict(
-                structured_outputs={"regex": {
-                    1: "Python",
-                    2: "C++"
-                }}))
+            extra_body=dict(structured_outputs={"regex": {1: "Python", 2: "C++"}}),
+        )
 
 
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat_logprobs(
-        client: openai.AsyncOpenAI, sample_structured_outputs_choices):
-
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        "The best language for type-safe systems programming is "
-    }]
+    client: openai.AsyncOpenAI, sample_structured_outputs_choices
+):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": "The best language for type-safe systems programming is ",
+        },
+    ]
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
@@ -636,7 +586,9 @@ async def test_structured_outputs_choice_chat_logprobs(
         logprobs=True,
         top_logprobs=5,
         extra_body=dict(
-            structured_outputs={"choice": sample_structured_outputs_choices}))
+            structured_outputs={"choice": sample_structured_outputs_choices}
+        ),
+    )
 
     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -647,159 +599,22 @@ async def test_structured_outputs_choice_chat_logprobs(
         assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
 
 
-@pytest.mark.asyncio
-async def test_named_tool_use(
-    client: openai.AsyncOpenAI,
-    sample_json_schema,
-):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content": ("Give an example JSON for an employee "
-                    "profile using the specified tool.")
-    }]
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "dummy_function_name",
-            "description": "This is a dummy function",
-            "parameters": sample_json_schema
-        }
-    }]
-    tool_choice = {
-        "type": "function",
-        "function": {
-            "name": "dummy_function_name"
-        }
-    }
-
-    # non-streaming
-
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_completion_tokens=1000,
-        tools=tools,
-        tool_choice=tool_choice,
-    )
-    message = chat_completion.choices[0].message
-    assert len(message.content) == 0
-    json_string = message.tool_calls[0].function.arguments
-    json1 = json.loads(json_string)
-    jsonschema.validate(instance=json1, schema=sample_json_schema)
-
-    messages.append({"role": "assistant", "content": json_string})
-    messages.append({
-        "role":
-        "user",
-        "content":
-        "Give me another one with a different name and age"
-    })
-
-    # streaming
-
-    stream = await client.chat.completions.create(model=MODEL_NAME,
-                                                  messages=messages,
-                                                  max_completion_tokens=1000,
-                                                  tools=tools,
-                                                  tool_choice=tool_choice,
-                                                  stream=True)
-
-    output = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        delta = chunk.choices[0].delta
-        if delta.role:
-            assert delta.role == "assistant"
-        assert delta.content is None or len(delta.content) == 0
-        if delta.tool_calls:
-            output.append(delta.tool_calls[0].function.arguments)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    json2 = json.loads("".join(output))
-    jsonschema.validate(instance=json2, schema=sample_json_schema)
-    assert json1["name"] != json2["name"]
-    assert json1["age"] != json2["age"]
-
-
-@pytest.mark.asyncio
-async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  sample_json_schema):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {sample_json_schema}"
-    }]
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(model=MODEL_NAME,
-                                             messages=messages,
-                                             max_completion_tokens=1000,
-                                             tool_choice={
-                                                 "type": "function",
-                                                 "function": {
-                                                     "name":
-                                                     "dummy_function_name"
-                                                 }
-                                             })
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_completion_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": sample_json_schema
-                }
-            }],
-            tool_choice={
-                "type": "function",
-                "function": {
-                    "name": "nondefined_function_name"
-                }
-            })
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_completion_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": sample_json_schema
-                }
-            }],
-            tool_choice={})
-
-
 @pytest.mark.asyncio
 async def test_response_format_json_object(client: openai.AsyncOpenAI):
     for _ in range(2):
         resp = await client.chat.completions.create(
             model=MODEL_NAME,
-            messages=[{
-                "role":
-                "user",
-                "content": ('what is 1+1? please respond with a JSON object, '
-                            'the format is {"result": 2}')
-            }],
-            response_format={"type": "json_object"})
+            messages=[
+                {
+                    "role": "user",
+                    "content": (
+                        "what is 1+1? please respond with a JSON object, "
+                        'the format is {"result": 2}'
+                    ),
+                }
+            ],
+            response_format={"type": "json_object"},
+        )
 
         content = resp.choices[0].message.content
         assert content is not None
@@ -815,10 +630,7 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
     for _ in range(2):
         resp = await client.chat.completions.create(
             model=MODEL_NAME,
-            messages=[{
-                "role": "user",
-                "content": prompt
-            }],
+            messages=[{"role": "user", "content": prompt}],
         )
         content = resp.choices[0].message.content
         assert content is not None
@@ -829,10 +641,7 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
     for _ in range(2):
         resp = await client.chat.completions.create(
             model=MODEL_NAME,
-            messages=[{
-                "role": "user",
-                "content": prompt
-            }],
+            messages=[{"role": "user", "content": prompt}],
             response_format={
                 "type": "json_schema",
                 "json_schema": {
@@ -840,13 +649,12 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
                     "schema": {
                         "type": "object",
                         "properties": {
-                            "result": {
-                                "type": "integer"
-                            },
+                            "result": {"type": "integer"},
                         },
                     },
-                }
-            })
+                },
+            },
+        )
 
         content = resp.choices[0].message.content
         assert content is not None
@@ -859,13 +667,16 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
 async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
     resp = await client.chat.completions.create(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "what is 1+1?",
-            "extra_field": "0",
-        }],  # type: ignore
+        messages=[
+            {
+                "role": "user",
+                "content": "what is 1+1?",
+                "extra_field": "0",
+            }
+        ],  # type: ignore
         temperature=0,
-        seed=0)
+        seed=0,
+    )
 
     content = resp.choices[0].message.content
     assert content is not None
@@ -873,20 +684,23 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 async def test_complex_message_content(client: openai.AsyncOpenAI):
+    content = [
+        {
+            "type": "text",
+            "text": "what is 1+1? please provide the result without any other text.",
+        }
+    ]
     resp = await client.chat.completions.create(
         model=MODEL_NAME,
-        messages=[{
-            "role":
-            "user",
-            "content": [{
-                "type":
-                "text",
-                "text":
-                "what is 1+1? please provide the result without any other text."
-            }]
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": content,
+            }
+        ],
         temperature=0,
-        seed=0)
+        seed=0,
+    )
     content = resp.choices[0].message.content
     assert content == "2"
 
@@ -898,24 +712,27 @@ async def test_custom_role(client: openai.AsyncOpenAI):
 
     resp1 = await client.chat.completions.create(
         model=MODEL_NAME,
-        messages=[{
-            "role": "my-custom-role",
-            "content": "what is 1+1?",
-        }],  # type: ignore
+        messages=[
+            {
+                "role": "my-custom-role",
+                "content": "what is 1+1?",
+            }
+        ],  # type: ignore
         temperature=0,
-        seed=0)
+        seed=0,
+    )
 
     resp2 = await client.chat.completions.create(
         model=MODEL_NAME,
-        messages=[{
-            "role": "my-custom-role",
-            "content": [{
-                "type": "text",
-                "text": "what is 1+1?"
-            }]
-        }],  # type: ignore
+        messages=[
+            {
+                "role": "my-custom-role",
+                "content": [{"type": "text", "text": "what is 1+1?"}],
+            }
+        ],  # type: ignore
         temperature=0,
-        seed=0)
+        seed=0,
+    )
 
     content1 = resp1.choices[0].message.content
     content2 = resp2.choices[0].message.content
@@ -924,34 +741,32 @@ async def test_custom_role(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 async def test_long_seed(client: openai.AsyncOpenAI):
-    for seed in [
-            torch.iinfo(torch.long).min - 1,
-            torch.iinfo(torch.long).max + 1
-    ]:
+    for seed in [torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1]:
         with pytest.raises(BadRequestError) as exc_info:
             await client.chat.completions.create(
                 model=MODEL_NAME,
-                messages=[{
-                    "role": "system",
-                    "content": "You are a helpful assistant.",
-                }],
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant.",
+                    }
+                ],
                 temperature=0,
-                seed=seed)
+                seed=seed,
+            )
 
-        assert ("greater_than_equal" in exc_info.value.message
-                or "less_than_equal" in exc_info.value.message)
+        assert (
+            "greater_than_equal" in exc_info.value.message
+            or "less_than_equal" in exc_info.value.message
+        )
 
 
 @pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer,
-                           client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
+async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
 
     request_args = {
         "model": MODEL_NAME,
@@ -963,8 +778,9 @@ async def test_invocations(server: RemoteOpenAIServer,
 
     chat_completion = await client.chat.completions.create(**request_args)
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     chat_output = chat_completion.model_dump()
diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py
index ce965eb82924896b48f3b0a6e416c1d13bc11a1f..b3b8b700336db15a7d2d088f3b1784e988f6f1e6 100644
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -7,12 +7,23 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 
+from vllm.config import ModelConfig
+
 from ...utils import RemoteOpenAIServer
 
 # # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
 
 
+def get_vocab_size(model_name):
+    config = ModelConfig(
+        model=model_name,
+        seed=0,
+        dtype="float16",
+    )
+    return config.get_vocab_size()
+
+
 @pytest.fixture(scope="module")
 def server():
     args = [
@@ -23,7 +34,7 @@ def server():
         "--max-model-len",
         "4080",
         "--max-logprobs",  # test prompt_logprobs equal to -1
-        "151936"
+        "151936",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -46,27 +57,26 @@ class TestCase(NamedTuple):
     "test_case",
     [
         TestCase(model_name=MODEL_NAME, echo=True),
-        TestCase(model_name=MODEL_NAME, echo=False)
+        TestCase(model_name=MODEL_NAME, echo=False),
     ],
 )
 async def test_chat_session_with_echo_and_continue_final_message(
-        client: openai.AsyncOpenAI, test_case: TestCase):
+    client: openai.AsyncOpenAI, test_case: TestCase
+):
     saying: str = "Here is a common saying about apple. An apple a day, keeps"
     # test echo with continue_final_message parameter
     chat_completion = await client.chat.completions.create(
         model=test_case.model_name,
-        messages=[{
-            "role": "user",
-            "content": "tell me a common saying"
-        }, {
-            "role": "assistant",
-            "content": saying
-        }],
+        messages=[
+            {"role": "user", "content": "tell me a common saying"},
+            {"role": "assistant", "content": saying},
+        ],
         extra_body={
             "echo": test_case.echo,
             "continue_final_message": True,
-            "add_generation_prompt": False
-        })
+            "add_generation_prompt": False,
+        },
+    )
     assert chat_completion.id is not None
     assert len(chat_completion.choices) == 1
 
@@ -83,13 +93,10 @@ async def test_chat_session_with_echo_and_continue_final_message(
 
 @pytest.mark.asyncio
 async def test_prompt_logprobs(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "Beijing is the capital of which country?"
-    }]
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Beijing is the capital of which country?"},
+    ]
 
     completion = await client.chat.completions.create(
         model=MODEL_NAME,
@@ -103,17 +110,15 @@ async def test_prompt_logprobs(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 async def test_top_logprobs(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "Beijing is the capital of which country?"
-    }]
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Beijing is the capital of which country?"},
+    ]
 
     completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
+        max_tokens=1,
         extra_body={
             "top_logprobs": -1,
             "logprobs": "true",
@@ -122,3 +127,6 @@ async def test_top_logprobs(client: openai.AsyncOpenAI):
     assert completion.choices[0].logprobs is not None
     assert completion.choices[0].logprobs.content is not None
     assert len(completion.choices[0].logprobs.content) > 0
+    assert len(
+        completion.choices[0].logprobs.content[0].top_logprobs
+    ) == get_vocab_size(MODEL_NAME)
diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
index 9fa7ab83555afbdb4caf59f84129806760c8bd25..6539613ed17b9d16a02f9f08d3b1ae5c64e31258 100644
--- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
@@ -49,10 +49,7 @@ async def test_chat_logit_bias_valid(client):
 
     completion = await client.chat.completions.create(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "Testing valid logit bias"
-        }],
+        messages=[{"role": "user", "content": "Testing valid logit bias"}],
         max_tokens=5,
         logit_bias={str(valid_token_id): 1.0},
     )
@@ -69,10 +66,7 @@ async def test_chat_logit_bias_invalid(client):
     with pytest.raises(openai.BadRequestError) as excinfo:
         await client.chat.completions.create(
             model=MODEL_NAME,
-            messages=[{
-                "role": "user",
-                "content": "Testing invalid logit bias"
-            }],
+            messages=[{"role": "user", "content": "Testing invalid logit bias"}],
             max_tokens=5,
             logit_bias={str(invalid_token_id): 1.0},
         )
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index ce90a67c015177743cfb934c1e6ee098c2ac425a..ee79ed59c41021885a95d4458f6580ff59eddd2e 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -4,8 +4,7 @@
 import pytest
 
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
-                                         load_chat_template)
+from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -17,48 +16,54 @@ assert chatml_jinja_path.exists()
 
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATION_OUTPUT = [
-    ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
+    (
+        "facebook/opt-125m",
+        chatml_jinja_path,
+        True,
+        False,
+        """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
 <|im_start|>user
 What is the capital of<|im_end|>
 <|im_start|>assistant
-"""),
-    ("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
+""",
+    ),
+    (
+        "facebook/opt-125m",
+        chatml_jinja_path,
+        False,
+        False,
+        """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
 <|im_start|>user
-What is the capital of"""),
-    ("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
+What is the capital of""",
+    ),
+    (
+        "facebook/opt-125m",
+        chatml_jinja_path,
+        False,
+        True,
+        """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
 <|im_start|>user
 What is the capital of<|im_end|>
 <|im_start|>assistant
-The capital of"""),
+The capital of""",
+    ),
 ]
 
 TEST_MESSAGES = [
-    {
-        'role': 'user',
-        'content': 'Hello'
-    },
-    {
-        'role': 'assistant',
-        'content': 'Hi there!'
-    },
-    {
-        'role': 'user',
-        'content': 'What is the capital of'
-    },
+    {"role": "user", "content": "Hello"},
+    {"role": "assistant", "content": "Hi there!"},
+    {"role": "user", "content": "What is the capital of"},
 ]
-ASSISTANT_MESSAGE_TO_CONTINUE = {
-    'role': 'assistant',
-    'content': 'The capital of'
-}
+ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}
 
 
 def test_load_chat_template():
@@ -68,8 +73,11 @@ def test_load_chat_template():
     # Test assertions
     assert template_content is not None
     # Hard coded value for template_chatml.jinja
-    assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
+    assert (
+        template_content
+        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
 {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
+    )
 
 
 def test_no_load_chat_template_filelike():
@@ -91,9 +99,11 @@ def test_no_load_chat_template_literallike():
 
 @pytest.mark.parametrize(
     "model,template,add_generation_prompt,continue_final_message,expected_output",
-    MODEL_TEMPLATE_GENERATION_OUTPUT)
-def test_get_gen_prompt(model, template, add_generation_prompt,
-                        continue_final_message, expected_output):
+    MODEL_TEMPLATE_GENERATION_OUTPUT,
+)
+def test_get_gen_prompt(
+    model, template, add_generation_prompt, continue_final_message, expected_output
+):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
@@ -104,9 +114,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
         trust_remote_code=model_info.trust_remote_code,
         revision=model_info.revision,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
 
     # Initialize the tokenizer
     tokenizer = get_tokenizer(
@@ -119,7 +132,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
     mock_request = ChatCompletionRequest(
         model=model,
         messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
-        if continue_final_message else TEST_MESSAGES,
+        if continue_final_message
+        else TEST_MESSAGES,
         add_generation_prompt=add_generation_prompt,
         continue_final_message=continue_final_message,
     )
@@ -138,4 +152,5 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
     # Test assertion
     assert result == expected_output, (
         f"The generated prompt does not match the expected output for "
-        f"model {model} and template {template}")
+        f"model {model} and template {template}"
+    )
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index 03730b67283c409a72301e8268346a085a6ff68e..7b3092b563030db2321941c5d063bcaa3e324aae 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -14,9 +14,14 @@ MODEL_NAME = "Qwen/QwQ-32B"
 @pytest.fixture(scope="module")
 def server():  # noqa: F811
     args = [
-        "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser",
-        "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser",
-        "hermes"
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--reasoning-parser",
+        "deepseek_r1",
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "hermes",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -29,57 +34,53 @@ async def client(server):
         yield async_client
 
 
-TOOLS = [{
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, e.g. 'San Francisco'"
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string",
+                        "description": "The city to find the weather for, e.g. "
+                        "'San Francisco'",
+                    },
+                    "state": {
+                        "type": "string",
+                        "description": "the two-letter abbreviation for the state that "
+                        "the city is in, e.g. 'CA' which would mean 'California'",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
                 },
-                "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
-                },
-                "unit": {
-                    "type": "string",
-                    "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
+                "required": ["city", "state", "unit"],
             },
-            "required": ["city", "state", "unit"]
-        }
+        },
     }
-}]
-
-MESSAGES = [{
-    "role": "user",
-    "content": "Hi! How are you doing today?"
-}, {
-    "role": "assistant",
-    "content": "I'm doing well! How can I help you?"
-}, {
-    "role":
-    "user",
-    "content":
-    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
-}]
+]
+
+MESSAGES = [
+    {"role": "user", "content": "Hi! How are you doing today?"},
+    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
+    {
+        "role": "user",
+        "content": "Can you tell me what the temperate will be in Dallas, "
+        "in fahrenheit?",
+    },
+]
 
 FUNC_NAME = "get_current_weather"
 FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
 
 
 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -98,16 +99,14 @@ def extract_reasoning_and_calls(chunks: list):
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names
 
 
 # test streaming
 @pytest.mark.asyncio
-async def test_chat_streaming_of_tool_and_reasoning(
-        client: openai.AsyncOpenAI):
-
+async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
     stream = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=MESSAGES,
@@ -120,9 +119,8 @@ async def test_chat_streaming_of_tool_and_reasoning(
     async for chunk in stream:
         chunks.append(chunk)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(
-        chunks)
-    assert len(reasoning_content) > 0
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    assert len(reasoning) > 0
     assert len(function_names) > 0 and function_names[0] == FUNC_NAME
     assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
 
@@ -130,7 +128,6 @@ async def test_chat_streaming_of_tool_and_reasoning(
 # test full generate
 @pytest.mark.asyncio
 async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
-
     tool_calls = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=MESSAGES,
@@ -139,8 +136,6 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
         stream=False,
     )
 
-    assert len(tool_calls.choices[0].message.reasoning_content) > 0
-    assert tool_calls.choices[0].message.tool_calls[0].function.name \
-          == FUNC_NAME
-    assert tool_calls.choices[0].message.tool_calls[0].function.arguments \
-          == FUNC_ARGS
+    assert len(tool_calls.choices[0].message.reasoning) > 0
+    assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
+    assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS
diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py
index c8160c5f2d0e3520f66e23d741eb96581ab13f5a..608e509e59e8a6391664d76e9753495899e19ba5 100644
--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
@@ -40,7 +40,8 @@ async def client(server):
 
 @pytest.mark.asyncio
 async def test_completion_stream_options_and_logprobs_with_long_prompts(
-        client: openai.AsyncOpenAI):
+    client: openai.AsyncOpenAI,
+):
     # Test stream with long prompt
     prompt = "What is the capital of France?" * 400
 
@@ -62,8 +63,9 @@ async def test_completion_stream_options_and_logprobs_with_long_prompts(
     async for chunk in stream:
         assert chunk.usage.prompt_tokens >= 0
         assert chunk.usage.completion_tokens >= 0
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
+        assert chunk.usage.total_tokens == (
+            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+        )
         if not finished:
             tokens_received += 1
             assert chunk.choices[0].text
@@ -77,15 +79,13 @@ async def test_completion_stream_options_and_logprobs_with_long_prompts(
 
 @pytest.mark.asyncio
 async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
-        client: openai.AsyncOpenAI):
+    client: openai.AsyncOpenAI,
+):
     # Test stream with long prompt
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?" * 400
-    }]
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?" * 400},
+    ]
     stream = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
@@ -106,8 +106,9 @@ async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
     async for chunk in stream:
         assert chunk.usage.prompt_tokens >= 0
         assert chunk.usage.completion_tokens >= 0
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
+        assert chunk.usage.total_tokens == (
+            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+        )
 
         if not finished:
             if chunk.choices[0].delta.content == "":
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 9a1c0ea13b54f98746c92266622fee7162a1c60e..b5d71c20bb4ea081f39c17cffe4865fa5957fbb7 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -5,17 +5,16 @@ import json
 
 import pytest
 
-from vllm.entrypoints.openai.cli_args import (make_arg_parser,
-                                              validate_parsed_serve_args)
+from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
 from vllm.entrypoints.openai.serving_models import LoRAModulePath
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 from ...utils import VLLM_PATH
 
 LORA_MODULE = {
     "name": "module2",
     "path": "/path/to/module2",
-    "base_model_name": "llama"
+    "base_model_name": "llama",
 }
 CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
 assert CHATML_JINJA_PATH.exists()
@@ -31,45 +30,51 @@ def serve_parser():
 def test_config_arg_parsing(serve_parser, cli_config_file):
     args = serve_parser.parse_args([])
     assert args.port == 8000
-    args = serve_parser.parse_args(['--config', cli_config_file])
+    args = serve_parser.parse_args(["--config", cli_config_file])
     assert args.port == 12312
-    args = serve_parser.parse_args([
-        '--config',
-        cli_config_file,
-        '--port',
-        '9000',
-    ])
+    args = serve_parser.parse_args(
+        [
+            "--config",
+            cli_config_file,
+            "--port",
+            "9000",
+        ]
+    )
     assert args.port == 9000
-    args = serve_parser.parse_args([
-        '--port',
-        '9000',
-        '--config',
-        cli_config_file,
-    ])
+    args = serve_parser.parse_args(
+        [
+            "--port",
+            "9000",
+            "--config",
+            cli_config_file,
+        ]
+    )
     assert args.port == 9000
 
 
 ### Tests for LoRA module parsing
 def test_valid_key_value_format(serve_parser):
     # Test old format: name=path
-    args = serve_parser.parse_args([
-        '--lora-modules',
-        'module1=/path/to/module1',
-    ])
-    expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
+    args = serve_parser.parse_args(
+        [
+            "--lora-modules",
+            "module1=/path/to/module1",
+        ]
+    )
+    expected = [LoRAModulePath(name="module1", path="/path/to/module1")]
     assert args.lora_modules == expected
 
 
 def test_valid_json_format(serve_parser):
     # Test valid JSON format input
-    args = serve_parser.parse_args([
-        '--lora-modules',
-        json.dumps(LORA_MODULE),
-    ])
+    args = serve_parser.parse_args(
+        [
+            "--lora-modules",
+            json.dumps(LORA_MODULE),
+        ]
+    )
     expected = [
-        LoRAModulePath(name='module2',
-                       path='/path/to/module2',
-                       base_model_name='llama')
+        LoRAModulePath(name="module2", path="/path/to/module2", base_model_name="llama")
     ]
     assert args.lora_modules == expected
 
@@ -77,47 +82,53 @@ def test_valid_json_format(serve_parser):
 def test_invalid_json_format(serve_parser):
     # Test invalid JSON format input, missing closing brace
     with pytest.raises(SystemExit):
-        serve_parser.parse_args([
-            '--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
-        ])
+        serve_parser.parse_args(
+            ["--lora-modules", '{"name": "module3", "path": "/path/to/module3"']
+        )
 
 
 def test_invalid_type_error(serve_parser):
     # Test type error when values are not JSON or key=value
     with pytest.raises(SystemExit):
-        serve_parser.parse_args([
-            '--lora-modules',
-            'invalid_format'  # This is not JSON or key=value format
-        ])
+        serve_parser.parse_args(
+            [
+                "--lora-modules",
+                "invalid_format",  # This is not JSON or key=value format
+            ]
+        )
 
 
 def test_invalid_json_field(serve_parser):
     # Test valid JSON format but missing required fields
     with pytest.raises(SystemExit):
-        serve_parser.parse_args([
-            '--lora-modules',
-            '{"name": "module4"}'  # Missing required 'path' field
-        ])
+        serve_parser.parse_args(
+            [
+                "--lora-modules",
+                '{"name": "module4"}',  # Missing required 'path' field
+            ]
+        )
 
 
 def test_empty_values(serve_parser):
     # Test when no LoRA modules are provided
-    args = serve_parser.parse_args(['--lora-modules', ''])
+    args = serve_parser.parse_args(["--lora-modules", ""])
     assert args.lora_modules == []
 
 
 def test_multiple_valid_inputs(serve_parser):
     # Test multiple valid inputs (both old and JSON format)
-    args = serve_parser.parse_args([
-        '--lora-modules',
-        'module1=/path/to/module1',
-        json.dumps(LORA_MODULE),
-    ])
+    args = serve_parser.parse_args(
+        [
+            "--lora-modules",
+            "module1=/path/to/module1",
+            json.dumps(LORA_MODULE),
+        ]
+    )
     expected = [
-        LoRAModulePath(name='module1', path='/path/to/module1'),
-        LoRAModulePath(name='module2',
-                       path='/path/to/module2',
-                       base_model_name='llama')
+        LoRAModulePath(name="module1", path="/path/to/module1"),
+        LoRAModulePath(
+            name="module2", path="/path/to/module2", base_model_name="llama"
+        ),
     ]
     assert args.lora_modules == expected
 
@@ -133,40 +144,46 @@ def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
 
 def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
     """Ensure validation passes with tool choice enabled with a call parser"""
-    args = serve_parser.parse_args(args=[
-        "--enable-auto-tool-choice",
-        "--tool-call-parser",
-        "mistral",
-    ])
+    args = serve_parser.parse_args(
+        args=[
+            "--enable-auto-tool-choice",
+            "--tool-call-parser",
+            "mistral",
+        ]
+    )
     validate_parsed_serve_args(args)
 
 
 def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
     """Ensure validation fails if reasoning is enabled with auto tool choice"""
-    args = serve_parser.parse_args(args=[
-        "--enable-auto-tool-choice",
-        "--reasoning-parser",
-        "deepseek_r1",
-    ])
+    args = serve_parser.parse_args(
+        args=[
+            "--enable-auto-tool-choice",
+            "--reasoning-parser",
+            "deepseek_r1",
+        ]
+    )
     with pytest.raises(TypeError):
         validate_parsed_serve_args(args)
 
 
 def test_passes_with_reasoning_parser(serve_parser):
-    """Ensure validation passes if reasoning is enabled 
+    """Ensure validation passes if reasoning is enabled
     with a reasoning parser"""
-    args = serve_parser.parse_args(args=[
-        "--reasoning-parser",
-        "deepseek_r1",
-    ])
+    args = serve_parser.parse_args(
+        args=[
+            "--reasoning-parser",
+            "deepseek_r1",
+        ]
+    )
     validate_parsed_serve_args(args)
 
 
 def test_chat_template_validation_for_happy_paths(serve_parser):
     """Ensure validation passes if the chat template exists"""
     args = serve_parser.parse_args(
-        args=["--chat-template",
-              CHATML_JINJA_PATH.absolute().as_posix()])
+        args=["--chat-template", CHATML_JINJA_PATH.absolute().as_posix()]
+    )
     validate_parsed_serve_args(args)
 
 
@@ -179,8 +196,14 @@ def test_chat_template_validation_for_sad_paths(serve_parser):
 
 @pytest.mark.parametrize(
     "cli_args, expected_middleware",
-    [(["--middleware", "middleware1", "--middleware", "middleware2"
-       ], ["middleware1", "middleware2"]), ([], [])])
+    [
+        (
+            ["--middleware", "middleware1", "--middleware", "middleware2"],
+            ["middleware1", "middleware2"],
+        ),
+        ([], []),
+    ],
+)
 def test_middleware(serve_parser, cli_args, expected_middleware):
     """Ensure multiple middleware args are parsed properly"""
     args = serve_parser.parse_args(args=cli_args)
diff --git a/tests/entrypoints/openai/test_collective_rpc.py b/tests/entrypoints/openai/test_collective_rpc.py
index 37c0b7a900ac4a5ed6696afa23e37a2e66a14e83..cbd6b02f05dce267f0d2aae10eddfb41415f087e 100644
--- a/tests/entrypoints/openai/test_collective_rpc.py
+++ b/tests/entrypoints/openai/test_collective_rpc.py
@@ -12,7 +12,6 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 class TestWorkerExtension:
-
     def get_model_name(self) -> str:
         """Test non-pydantic return type."""
         return MODEL_NAME
@@ -41,20 +40,18 @@ def server():
         "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
     ]
     with RemoteOpenAIServer(
-            MODEL_NAME,
-            args,
-            env_dict={
-                "VLLM_SERVER_DEV_MODE": "1",
-                "CUDA_VISIBLE_DEVICES": "0"
-            },
+        MODEL_NAME,
+        args,
+        env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"},
     ) as remote_server:
         yield remote_server
 
 
 def test_get_model_name(server):
     """Test basic response"""
-    response = requests.post(server.url_for("collective_rpc"),
-                             json={"method": "get_model_name"})
+    response = requests.post(
+        server.url_for("collective_rpc"), json={"method": "get_model_name"}
+    )
     assert response.status_code == 200
     results = response.json()
     assert "results" in results
@@ -63,8 +60,9 @@ def test_get_model_name(server):
 
 def test_return_none(server):
     """Test return none"""
-    response = requests.post(server.url_for("collective_rpc"),
-                             json={"method": "return_none"})
+    response = requests.post(
+        server.url_for("collective_rpc"), json={"method": "return_none"}
+    )
     assert response.status_code == 200
     results = response.json()
     assert results["results"] == [None]
@@ -74,12 +72,10 @@ def test_echo_args_kwargs(server):
     """Test args, kwargs, and dict response"""
     args = ["arg1", "arg2"]
     kwargs = {"key1": "value1", "key2": "value2"}
-    response = requests.post(server.url_for("collective_rpc"),
-                             json={
-                                 "method": "echo_args_kwargs",
-                                 "args": args,
-                                 "kwargs": kwargs
-                             })
+    response = requests.post(
+        server.url_for("collective_rpc"),
+        json={"method": "echo_args_kwargs", "args": args, "kwargs": kwargs},
+    )
     assert response.status_code == 200
     results = response.json()
     result = results["results"][0]
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 4355603fcd70bd819bb19924d0b611f09323d5df..53369f074eca8629a6893f115f5ff45b61ccae57 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
-from typing import Union
+import json
 
+import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -25,15 +26,14 @@ tools = [
                 "properties": {
                     "city": {
                         "type": "string",
-                        "description":
-                        "The city to find the weather for, e.g. 'Vienna'",
+                        "description": "The city to find the weather for, e.g. "
+                        "'Vienna'",
                         "default": "Vienna",
                     },
                     "country": {
-                        "type":
-                        "string",
-                        "description":
-                        "The country that the city is in, e.g. 'Austria'",
+                        "type": "string",
+                        "description": "The country that the city is in, e.g. "
+                        "'Austria'",
                     },
                     "unit": {
                         "type": "string",
@@ -62,8 +62,7 @@ tools = [
                             "include_forecast": {
                                 "type": "boolean",
                                 "default": False,
-                                "description":
-                                "Whether to include a 24-hour forecast",
+                                "description": "Whether to include a 24-hour forecast",
                                 "title": "Include Forecast",
                             },
                             "language": {
@@ -89,21 +88,18 @@ tools = [
                 "properties": {
                     "city": {
                         "type": "string",
-                        "description":
-                        "The city to get the forecast for, e.g. 'Vienna'",
+                        "description": "The city to get the forecast for, e.g. "
+                        "'Vienna'",
                         "default": "Vienna",
                     },
                     "country": {
-                        "type":
-                        "string",
-                        "description":
-                        "The country that the city is in, e.g. 'Austria'",
+                        "type": "string",
+                        "description": "The country that the city is in, e.g. "
+                        "'Austria'",
                     },
                     "days": {
-                        "type":
-                        "integer",
-                        "description":
-                        "Number of days to get the forecast for (1-7)",
+                        "type": "integer",
+                        "description": "Number of days to get the forecast for (1-7)",
                     },
                     "unit": {
                         "type": "string",
@@ -118,19 +114,11 @@ tools = [
 ]
 
 messages = [
+    {"role": "user", "content": "Hi! How are you doing today?"},
+    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
     {
         "role": "user",
-        "content": "Hi! How are you doing today?"
-    },
-    {
-        "role": "assistant",
-        "content": "I'm doing well! How can I help you?"
-    },
-    {
-        "role":
-        "user",
-        "content":
-        "Can you tell me what the current weather is in Berlin and the "\
+        "content": "Can you tell me what the current weather is in Berlin and the "
         "forecast for the next 5 days, in fahrenheit?",
     },
 ]
@@ -150,7 +138,7 @@ def server():  # noqa: F811
         "--reasoning-parser",
         "qwen3",
         "--gpu-memory-utilization",
-        "0.4"
+        "0.4",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -166,18 +154,22 @@ async def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("stream", [True, False])
-@pytest.mark.parametrize("tool_choice", [
-    "auto", "required", {
-        "type": "function",
-        "function": {
-            "name": "get_current_weather"
-        }
-    }
-])
+@pytest.mark.parametrize(
+    "tool_choice",
+    [
+        "auto",
+        "required",
+        {"type": "function", "function": {"name": "get_current_weather"}},
+    ],
+)
 @pytest.mark.parametrize("enable_thinking", [True, False])
-async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
-                                 stream: bool, tool_choice: Union[str, dict],
-                                 enable_thinking: bool):
+async def test_function_tool_use(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    stream: bool,
+    tool_choice: str | dict,
+    enable_thinking: bool,
+):
     if not stream:
         # Non-streaming test
         chat_completion = await client.chat.completions.create(
@@ -185,16 +177,11 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
             model=model_name,
             tools=tools,
             tool_choice=tool_choice,
-            extra_body={
-                "chat_template_kwargs": {
-                    "enable_thinking": enable_thinking
-                }
-            })
+            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
+        )
         if enable_thinking:
-            assert chat_completion.choices[0].message.\
-                reasoning_content is not None
-            assert chat_completion.choices[0].message.\
-                reasoning_content != ""
+            assert chat_completion.choices[0].message.reasoning is not None
+            assert chat_completion.choices[0].message.reasoning != ""
         assert chat_completion.choices[0].message.tool_calls is not None
         assert len(chat_completion.choices[0].message.tool_calls) > 0
     else:
@@ -205,18 +192,23 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
             tools=tools,
             tool_choice=tool_choice,
             stream=True,
-            extra_body={
-                "chat_template_kwargs": {
-                    "enable_thinking": enable_thinking
-                }
-            })
+            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
+        )
 
         output = []
+        reasoning = []
         async for chunk in output_stream:
-            if chunk.choices and chunk.choices[0].delta.tool_calls:
-                output.extend(chunk.choices[0].delta.tool_calls)
+            if chunk.choices:
+                if enable_thinking and getattr(
+                    chunk.choices[0].delta, "reasoning", None
+                ):
+                    reasoning.append(chunk.choices[0].delta.reasoning)
+                if chunk.choices[0].delta.tool_calls:
+                    output.extend(chunk.choices[0].delta.tool_calls)
 
         assert len(output) > 0
+        if enable_thinking:
+            assert len(reasoning) > 0
 
 
 @pytest.fixture(scope="module")
@@ -237,12 +229,11 @@ def k2_server():  # noqa: F811
     ]
     # hack to test kimi_k2 tool use tool_id format.
     # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            override_hf_configs={
-                                "model_type": 'kimi_k2',
-                                'kv_lora_rank': None
-                            }) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        args,
+        override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
+    ) as remote_server:
         yield remote_server
 
 
@@ -256,20 +247,20 @@ async def k2_client(k2_server):
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("stream", [True, False])
 @pytest.mark.parametrize("tool_choice", ["required"])
-async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
-                               stream: bool, tool_choice: str):
-
+async def test_tool_id_kimi_k2(
+    k2_client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: str
+):
     if not stream:
         # Non-streaming test
         chat_completion = await k2_client.chat.completions.create(
-            messages=messages,
-            model=model_name,
-            tools=tools,
-            tool_choice=tool_choice)
+            messages=messages, model=model_name, tools=tools, tool_choice=tool_choice
+        )
         assert chat_completion.choices[0].message.tool_calls is not None
         assert len(chat_completion.choices[0].message.tool_calls) > 0
-        assert chat_completion.choices[0].message.tool_calls[
-            0].id == 'functions.get_current_weather:0'
+        assert chat_completion.choices[0].message.tool_calls[0].id in [
+            "functions.get_current_weather:0",
+            "functions.get_forecast:1",
+        ]
     else:
         # Streaming test
         output_stream = await k2_client.chat.completions.create(
@@ -277,42 +268,48 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
             model=model_name,
             tools=tools,
             tool_choice=tool_choice,
-            stream=True)
+            stream=True,
+        )
 
         output = []
         async for chunk in output_stream:
             if chunk.choices and chunk.choices[0].delta.tool_calls:
                 output.extend(chunk.choices[0].delta.tool_calls)
         for o in output:
-            assert o.id is None or o.id == 'functions.get_current_weather:0'
+            assert o.id is None or o.id in [
+                "functions.get_current_weather:0",
+                "functions.get_forecast:1",
+            ]
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("arguments", ["{}", ''])
-async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str,
-                                 arguments: str):
+@pytest.mark.parametrize("arguments", ["{}", ""])
+async def test_no_args_tool_call(
+    client: openai.AsyncOpenAI, model_name: str, arguments: str
+):
     # Step 1: Define a tool that requires no parameters
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_current_time",
-            "description":
-            "Get the current date and time. No parameters needed.",
-            "parameters": {
-                "type": "object",
-                "properties": {},  # No parameters
-                "required": []  # No required fields
-            }
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_time",
+                "description": "Get the current date and time. No parameters needed.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {},  # No parameters
+                    "required": [],  # No required fields
+                },
+            },
         }
-    }]
+    ]
     messages = [{"role": "user", "content": "What time is it now?"}]
     # Step 2: Send user message and let model decide whether to call the tool
     response = await client.chat.completions.create(
         model=model_name,
         messages=messages,
         tools=tools,
-        tool_choice="auto"  # Let model choose automatically
+        tool_choice="auto",  # Let model choose automatically
     )
 
     # Step 3: Check if model wants to call a tool
@@ -328,11 +325,13 @@ async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str,
             messages.append(message)
             current_time = datetime.datetime.now()
             result = current_time.isoformat()
-            messages.append({
-                "role": "tool",
-                "tool_call_id": tool_call.id,
-                "content": result,
-            })
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_call.id,
+                    "content": result,
+                }
+            )
             # Step 5: Send tool result back to model to continue conversation
             final_response = await client.chat.completions.create(
                 model=model_name,
@@ -344,3 +343,144 @@ async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str,
     else:
         # No tool called — just print model's direct reply
         assert message.content is not None
+
+
+@pytest.mark.asyncio
+async def test_named_tool_use(
+    client: openai.AsyncOpenAI,
+    sample_json_schema,
+):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": (
+                "Give an example JSON for an employee profile using the specified tool."
+            ),
+        },
+    ]
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": sample_json_schema,
+            },
+        }
+    ]
+    tool_choice = {"type": "function", "function": {"name": "dummy_function_name"}}
+
+    # non-streaming
+
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_completion_tokens=1000,
+        tools=tools,
+        temperature=0.0,
+        tool_choice=tool_choice,
+    )
+    message = chat_completion.choices[0].message
+    assert len(message.content) == 0
+    json_string = message.tool_calls[0].function.arguments
+    json1 = json.loads(json_string)
+    jsonschema.validate(instance=json1, schema=sample_json_schema)
+
+    messages.append({"role": "assistant", "content": json_string})
+    messages.append(
+        {"role": "user", "content": "Give me another one with a different name and age"}
+    )
+
+    # streaming
+
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_completion_tokens=1000,
+        tools=tools,
+        tool_choice=tool_choice,
+        temperature=0.0,
+        stream=True,
+    )
+
+    output = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        assert delta.content is None or len(delta.content) == 0
+        if delta.tool_calls:
+            output.append(delta.tool_calls[0].function.arguments)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    json2 = json.loads("".join(output))
+    jsonschema.validate(instance=json2, schema=sample_json_schema)
+    assert json1["name"] != json2["name"]
+    assert json1["age"] != json2["age"]
+
+
+@pytest.mark.asyncio
+async def test_inconsistent_tool_choice_and_tools(
+    client: openai.AsyncOpenAI, sample_json_schema
+):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {
+            "role": "user",
+            "content": f"Give an example JSON for an employee profile that "
+            f"fits this schema: {sample_json_schema}",
+        },
+    ]
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_completion_tokens=1000,
+            tool_choice={
+                "type": "function",
+                "function": {"name": "dummy_function_name"},
+            },
+        )
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_completion_tokens=1000,
+            tools=[
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "dummy_function_name",
+                        "description": "This is a dummy function",
+                        "parameters": sample_json_schema,
+                    },
+                }
+            ],
+            tool_choice={
+                "type": "function",
+                "function": {"name": "nondefined_function_name"},
+            },
+        )
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_completion_tokens=1000,
+            tools=[
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "dummy_function_name",
+                        "description": "This is a dummy function",
+                        "parameters": sample_json_schema,
+                    },
+                }
+            ],
+            tool_choice={},
+        )
diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index ae51025455b10508d25fc178d1d84c63acd99c7e..0a057b1848ad6b6b35c744ea301926eee04c4082 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -3,11 +3,13 @@
 
 import base64
 import io
+import json
 
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import torch
+
 # downloading lora to test lora requests
 from openai import BadRequestError
 from transformers import AutoConfig
@@ -16,13 +18,16 @@ from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
+LORA_SERVING_MODEL_NAME = "opt125m-lora"
 
 CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
 
 
-@pytest.fixture(scope="module")
-def default_server_args() -> list[str]:
-    return [
+@pytest.fixture(scope="module", params=["use-lora"])
+def default_server_args(
+    request: pytest.FixtureRequest, opt125_lora_files: str
+) -> list[str]:
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -35,6 +40,27 @@ def default_server_args() -> list[str]:
         "--enable-prompt-embeds",
     ]
 
+    if request.param == "use-lora":
+        lora_module_1 = {
+            "name": LORA_SERVING_MODEL_NAME,
+            "path": opt125_lora_files,
+            "base_model_name": MODEL_NAME,
+        }
+
+        args.extend(
+            [
+                "--enable-lora",
+                "--lora-module",
+                json.dumps(lora_module_1),
+                "--max-lora-rank",
+                "64",
+                "--max-cpu-loras",
+                "2",
+            ]
+        )
+
+    return args
+
 
 EXAMPLE_PROMPTS = [
     "Hello, my name is",
@@ -45,7 +71,7 @@ EXAMPLE_PROMPTS = [
 def _encode_embeds(embeds: torch.Tensor):
     buffer = io.BytesIO()
     torch.save(embeds, buffer)
-    return base64.b64encode(buffer.getvalue()).decode('utf-8')
+    return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
 @pytest.fixture(scope="module")
@@ -57,8 +83,7 @@ def example_prompt_embeds(hf_runner):
     return [_encode_embeds(item) for item in example_embeddings]
 
 
-@pytest.fixture(scope="module",
-                params=["", "--disable-frontend-multiprocessing"])
+@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
 def server_with_prompt_embeds(default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
@@ -74,7 +99,7 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
 async def test_completions_with_prompt_embeds(
     example_prompt_embeds,
     client_with_prompt_embeds: openai.AsyncOpenAI,
@@ -88,7 +113,8 @@ async def test_completions_with_prompt_embeds(
         prompt="",  # Add empty prompt as required parameter
         max_tokens=5,
         temperature=0.0,
-        extra_body={"prompt_embeds": encoded_embeds})
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
     assert len(completion.choices[0].text) >= 1
     assert completion.choices[0].prompt_logprobs is None
 
@@ -98,7 +124,8 @@ async def test_completions_with_prompt_embeds(
         prompt="",  # Add empty prompt as required parameter
         max_tokens=5,
         temperature=0.0,
-        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]})
+        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
+    )
     assert len(completion.choices) == 2
     assert len(completion.choices[0].text) >= 1
     assert len(completion.choices[1].text) >= 1
@@ -109,7 +136,8 @@ async def test_completions_with_prompt_embeds(
         prompt="",  # Add empty prompt as required parameter
         max_tokens=5,
         temperature=0.0,
-        extra_body={"prompt_embeds": encoded_embeds})
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
     single_output = single_completion.choices[0].text
 
     stream = await client_with_prompt_embeds.completions.create(
@@ -118,7 +146,8 @@ async def test_completions_with_prompt_embeds(
         max_tokens=5,
         temperature=0.0,
         stream=True,
-        extra_body={"prompt_embeds": encoded_embeds})
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
     chunks = []
     finish_reason_count = 0
     async for chunk in stream:
@@ -137,12 +166,12 @@ async def test_completions_with_prompt_embeds(
         max_tokens=5,
         temperature=0.0,
         stream=True,
-        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]})
+        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
+    )
     chunks_stream_embeds: list[list[str]] = [[], []]
     finish_reason_count = 0
     async for chunk in stream:
-        chunks_stream_embeds[chunk.choices[0].index].append(
-            chunk.choices[0].text)
+        chunks_stream_embeds[chunk.choices[0].index].append(chunk.choices[0].text)
         if chunk.choices[0].finish_reason is not None:
             finish_reason_count += 1
     assert finish_reason_count == 2
@@ -157,7 +186,8 @@ async def test_completions_with_prompt_embeds(
         prompt="This is a prompt",
         max_tokens=5,
         temperature=0.0,
-        extra_body={"prompt_embeds": encoded_embeds})
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
     assert len(completion.choices) == 2
     completion_text_only = await client_with_prompt_embeds.completions.create(
         model=model_name,
@@ -170,18 +200,18 @@ async def test_completions_with_prompt_embeds(
         prompt="",
         max_tokens=5,
         temperature=0.0,
-        extra_body={"prompt_embeds": encoded_embeds})
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
     # Embeddings responses should be handled first
-    assert completion_mixed.choices[0].text == completion_embeds_only.choices[
-        0].text
-    assert completion_mixed.choices[1].text == completion_text_only.choices[
-        0].text
+    assert completion_mixed.choices[0].text == completion_embeds_only.choices[0].text
+    assert completion_mixed.choices[1].text == completion_text_only.choices[0].text
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
 async def test_completions_errors_with_prompt_embeds(
-        client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
+    client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str
+):
     # Test error case: invalid prompt_embeds
     with pytest.raises(BadRequestError):
         await client_with_prompt_embeds.completions.create(
@@ -189,12 +219,13 @@ async def test_completions_errors_with_prompt_embeds(
             model=model_name,
             max_tokens=5,
             temperature=0.0,
-            extra_body={"prompt_embeds": "invalid_base64"})
+            extra_body={"prompt_embeds": "invalid_base64"},
+        )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
 async def test_completions_with_logprobs_and_prompt_embeds(
     example_prompt_embeds,
     client_with_prompt_embeds: openai.AsyncOpenAI,
@@ -211,7 +242,8 @@ async def test_completions_with_logprobs_and_prompt_embeds(
         temperature=0.0,
         echo=False,
         logprobs=logprobs_arg,
-        extra_body={"prompt_embeds": encoded_embeds})
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
 
     logprobs = completion.choices[0].logprobs
     assert logprobs is not None
@@ -230,7 +262,8 @@ async def test_completions_with_logprobs_and_prompt_embeds(
         temperature=0.0,
         echo=False,
         logprobs=logprobs_arg,
-        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]})
+        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
+    )
 
     assert len(completion.choices) == 2
     for choice in completion.choices:
@@ -240,8 +273,7 @@ async def test_completions_with_logprobs_and_prompt_embeds(
         assert len(logprobs.token_logprobs) == 5
         assert len(logprobs.top_logprobs) == 5
         for top_logprobs in logprobs.top_logprobs[1:]:
-            assert max(logprobs_arg,
-                       1) <= len(top_logprobs) <= logprobs_arg + 1
+            assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
         assert len(logprobs.tokens) == 5
 
 
@@ -258,8 +290,18 @@ async def test_prompt_logprobs_raises_error(
             prompt="",
             max_tokens=5,
             temperature=0.0,
-            extra_body={
-                "prompt_embeds": encoded_embeds,
-                "prompt_logprobs": True
-            },
+            extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
         )
+
+
+@pytest.mark.asyncio
+async def test_empty_prompt_embeds(
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+) -> None:
+    await client_with_prompt_embeds.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello",
+        max_tokens=5,
+        temperature=0.0,
+        extra_body={"prompt_embeds": []},
+    )
diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py
index b9c466a6fbeb6825247cb08015aa12b0c0fbcb40..818ee2644b54746a515eaf03fc59d7dc3dd9da7c 100644
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@@ -16,8 +16,7 @@ from ...utils import RemoteOpenAIServer
 # need a multimodal model for these tests.
 
 # Contains a modality specific lora alongside the base model
-MULTIMODAL_MODEL_NAME = snapshot_download(
-    "microsoft/Phi-4-multimodal-instruct")
+MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct")
 AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora")
 
 ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
@@ -25,13 +24,12 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
 
 @pytest.fixture(scope="module")
 def multimodal_server():  # noqa: F811
-
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "half",
         "--max-model-len",
-        "12800",
+        "4096",
         "--enforce-eager",
         # lora config below
         "--enable-lora",
@@ -45,11 +43,12 @@ def multimodal_server():  # noqa: F811
         "--gpu-memory-utilization",
         "0.8",
         "--default-mm-loras",
-        f"{{\"audio\": \"{AUDIO_LORA_PATH}\"}}",
+        f'{{"audio": "{AUDIO_LORA_PATH}"}}',
     ]
 
-    with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args,
-                            max_wait_seconds=480) as remote_server:
+    with RemoteOpenAIServer(
+        MULTIMODAL_MODEL_NAME, args, max_wait_seconds=480
+    ) as remote_server:
         yield remote_server
 
 
@@ -70,25 +69,25 @@ async def test_default_mm_lora_chat_completions(
     multi_modal_client: openai.AsyncOpenAI,
     audio_assets: AudioTestAssets,
 ):
-    messages = [{
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "text": "Can you transcribe this audio?",
-        }, {
-            "type": "audio_url",
-            "audio_url": {
-                "url": audio_assets[0].url
-            },
-        }]
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Can you transcribe this audio?",
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {"url": audio_assets[0].url},
+                },
+            ],
+        }
+    ]
 
     chat_completion = await multi_modal_client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=128,
-        temperature=0.0)
+        model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0
+    )
 
     assert len(chat_completion.choices) > 0
 
diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d527c45c1fae213d6cd14bfa1b04c4faef70fe1
--- /dev/null
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+
+@pytest.fixture(scope="module")
+def chat_server_with_force_include_usage(request):  # noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "128",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "4",
+        "--enable-force-include-usage",
+        "--port",
+        "55857",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+
+    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
+    async with chat_server_with_force_include_usage.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_with_enable_force_include_usage(
+    chat_client_with_force_include_usage: openai.AsyncOpenAI,
+):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+
+    stream = await chat_client_with_force_include_usage.chat.completions.create(
+        model="Qwen/Qwen3-0.6B",
+        messages=messages,
+        max_completion_tokens=10,
+        extra_body=dict(min_tokens=10),
+        temperature=0.0,
+        stream=True,
+    )
+    last_completion_tokens = 0
+    async for chunk in stream:
+        if not len(chunk.choices):
+            assert chunk.usage.prompt_tokens >= 0
+            assert (
+                last_completion_tokens == 0
+                or chunk.usage.completion_tokens > last_completion_tokens
+                or (
+                    not chunk.choices
+                    and chunk.usage.completion_tokens == last_completion_tokens
+                )
+            )
+            assert chunk.usage.total_tokens == (
+                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+            )
+        else:
+            assert chunk.usage is None
+
+
+@pytest.fixture(scope="module")
+def transcription_server_with_force_include_usage():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-num-seqs",
+        "4",
+        "--enforce-eager",
+        "--enable-force-include-usage",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+
+    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def transcription_client_with_force_include_usage(
+    transcription_server_with_force_include_usage,
+):
+    async with (
+        transcription_server_with_force_include_usage.get_async_client() as async_client
+    ):
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_transcription_with_enable_force_include_usage(
+    transcription_client_with_force_include_usage, winning_call
+):
+    res = (
+        await transcription_client_with_force_include_usage.audio.transcriptions.create(
+            model="openai/whisper-large-v3-turbo",
+            file=winning_call,
+            language="en",
+            temperature=0.0,
+            stream=True,
+            timeout=30,
+        )
+    )
+
+    async for chunk in res:
+        if not len(chunk.choices):
+            # final usage sent
+            usage = chunk.usage
+            assert isinstance(usage, dict)
+            assert usage["prompt_tokens"] > 0
+            assert usage["completion_tokens"] > 0
+            assert usage["total_tokens"] > 0
+        else:
+            assert not hasattr(chunk, "usage")
diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbfae4f268d5e4b6177f12fd518b8661e586a08b
--- /dev/null
+++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
@@ -0,0 +1,280 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Integration tests for GPT-OSS structural tags functionality (PR #25515)."""
+
+import json
+from unittest.mock import Mock
+
+import pytest
+
+from vllm.entrypoints.openai.protocol import (
+    StructuredOutputsParams,
+)
+from vllm.entrypoints.tool_server import ToolServer
+from vllm.reasoning.gptoss_reasoning_parser import (
+    GptOssReasoningParser,
+)
+
+
+class TestGptOssStructuralTagsIntegration:
+    """Integration tests for structural tags in GPT-OSS tool calls."""
+
+    @pytest.fixture
+    def mock_tokenizer(self):
+        """Create a mock tokenizer."""
+        tokenizer = Mock()
+        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
+        return tokenizer
+
+    @pytest.fixture
+    def gptoss_parser(self, mock_tokenizer):
+        """Create a real GptOssReasoningParser instance."""
+        return GptOssReasoningParser(mock_tokenizer)
+
+    @pytest.fixture
+    def tool_server_with_python(self):
+        """Create a tool server with Python tool enabled."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
+        return tool_server
+
+    @pytest.fixture
+    def tool_server_empty(self):
+        """Create a tool server with no tools."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(return_value=False)
+        return tool_server
+
+    def test_end_to_end_no_tools(self, gptoss_parser):
+        """Test end-to-end flow when no tools are available."""
+        # Test the parser directly
+        result = gptoss_parser.prepare_structured_tag(None, None)
+        parsed_result = json.loads(result)
+
+        # Verify basic structure
+        assert parsed_result["type"] == "structural_tag"
+        assert parsed_result["format"]["type"] == "triggered_tags"
+        assert len(parsed_result["format"]["tags"]) == 1
+
+        # Verify only analysis channel is allowed
+        analysis_tag = parsed_result["format"]["tags"][0]
+        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
+        assert analysis_tag["content"]["type"] == "any_text"
+        assert analysis_tag["end"] == "<|end|>"
+
+        # Verify triggers
+        assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"]
+        assert parsed_result["format"]["stop_after_first"] is False
+
+    def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python):
+        """Test end-to-end flow with Python tool enabled."""
+        result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python)
+        parsed_result = json.loads(result)
+
+        # Should have analysis tag + 2 python tags
+        assert len(parsed_result["format"]["tags"]) == 3
+
+        # Verify all expected tags are present
+        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
+        expected_begins = [
+            "<|channel|>analysis<|message|>",
+            "<|channel|>commentary to=python",
+            "<|channel|>analysis to=python",
+        ]
+
+        for expected in expected_begins:
+            assert expected in tag_begins
+
+        # Verify triggers include commentary
+        assert "<|channel|>analysis" in parsed_result["format"]["triggers"]
+        assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"]
+
+    def test_structured_outputs_params_integration(
+        self, gptoss_parser, tool_server_with_python
+    ):
+        """Test integration with StructuredOutputsParams."""
+        # Generate structural tag
+        structural_tag = gptoss_parser.prepare_structured_tag(
+            None, tool_server_with_python
+        )
+
+        # Create StructuredOutputsParams
+        params = StructuredOutputsParams(structural_tag=structural_tag)
+
+        # Verify the tag is properly stored and accessible
+        assert params.structural_tag == structural_tag
+
+        # Verify the tag is valid JSON
+        parsed_tag = json.loads(params.structural_tag)
+        assert parsed_tag["type"] == "structural_tag"
+
+    @pytest.mark.parametrize(
+        "browser, python, container, expected_tags",
+        [
+            # No tools
+            (False, False, False, 1),
+            # Single tool
+            (True, False, False, 3),
+            # Multiple tools
+            (True, True, False, 5),
+            # All tools
+            (True, True, True, 7),
+        ],
+    )
+    def test_tool_server_interaction_flow(
+        self, gptoss_parser, browser, python, container, expected_tags
+    ):
+        """Test the complete tool server interaction flow."""
+
+        # Create a mock ToolServer
+        tool_server = Mock(spec=ToolServer)
+
+        # Simulate tool availability based on parameters
+        tool_server.has_tool = Mock(
+            side_effect=lambda tool: {
+                "browser": browser,
+                "python": python,
+                "container": container,
+            }.get(tool, False)
+        )
+
+        # Run the parser and verify results
+        result = gptoss_parser.prepare_structured_tag(None, tool_server)
+        parsed_result = json.loads(result)
+
+        # Validate number of tags
+        assert len(parsed_result["format"]["tags"]) == expected_tags
+
+        # Verify tool-specific tags exist for enabled tools
+        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
+        for tool, enabled in {
+            "browser": browser,
+            "python": python,
+            "container": container,
+        }.items():
+            if enabled:
+                assert f"<|channel|>commentary to={tool}" in tag_begins
+                assert f"<|channel|>analysis to={tool}" in tag_begins
+
+    def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python):
+        """Test that original tags are preserved when provided."""
+        original_tag = '{"type": "custom_tag", "data": "preserved"}'
+
+        result = gptoss_parser.prepare_structured_tag(
+            original_tag, tool_server_with_python
+        )
+
+        # Should return original tag unchanged
+        assert result == original_tag
+
+    @pytest.mark.parametrize(
+        "tools",
+        [
+            [],
+            ["browser"],
+            ["python"],
+            ["container"],
+            ["browser", "python"],
+            ["browser", "container"],
+            ["python", "container"],
+            ["browser", "python", "container"],
+        ],
+    )
+    def test_json_validity_comprehensive(self, gptoss_parser, tools):
+        """Test JSON validity across all possible tool combinations."""
+
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools)
+
+        result = gptoss_parser.prepare_structured_tag(None, tool_server)
+
+        # Should be valid JSON
+        parsed_result = json.loads(result)
+
+        # Should have correct structure
+        assert parsed_result["type"] == "structural_tag"
+        assert "format" in parsed_result
+        assert "tags" in parsed_result["format"]
+        assert "triggers" in parsed_result["format"]
+
+        # Tag count should be: 1 (analysis) + 2 * len(tools)
+        expected_tag_count = 1 + (2 * len(tools))
+        assert len(parsed_result["format"]["tags"]) == expected_tag_count
+
+    def test_error_handling_invalid_tool_server(self, gptoss_parser):
+        """Test error handling with invalid tool server."""
+        # Tool server that raises exceptions
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(side_effect=Exception("Tool server error"))
+
+        # Should handle gracefully and still return a valid tag
+        with pytest.raises(Exception, match="Tool server error"):
+            gptoss_parser.prepare_structured_tag(None, tool_server)
+
+    def test_concurrent_requests_isolation(self, gptoss_parser):
+        """Test that concurrent requests don't interfere with each other."""
+        # Simulate concurrent requests with different tool servers
+        tool_server_1 = Mock(spec=ToolServer)
+        tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python")
+
+        tool_server_2 = Mock(spec=ToolServer)
+        tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser")
+
+        # Generate tags concurrently
+        result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1)
+        result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2)
+
+        # Parse results
+        parsed_1 = json.loads(result_1)
+        parsed_2 = json.loads(result_2)
+
+        # Verify they have different tool configurations
+        tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]]
+        tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]]
+
+        # Result 1 should have python tags
+        assert "<|channel|>commentary to=python" in tags_1
+        assert "<|channel|>commentary to=browser" not in tags_1
+
+        # Result 2 should have browser tags
+        assert "<|channel|>commentary to=browser" in tags_2
+        assert "<|channel|>commentary to=python" not in tags_2
+
+    def test_tag_format_consistency(self, gptoss_parser):
+        """Test that all generated tags follow consistent format."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(
+            side_effect=lambda tool: tool in ["python", "browser"]
+        )
+
+        result = gptoss_parser.prepare_structured_tag(None, tool_server)
+        parsed_result = json.loads(result)
+
+        # Verify all tags have required fields
+        for tag in parsed_result["format"]["tags"]:
+            assert "begin" in tag
+            assert "content" in tag
+            assert "end" in tag
+            assert tag["content"]["type"] == "any_text"
+            assert tag["end"] == "<|end|>"
+
+            # Verify begin format
+            assert tag["begin"].startswith("<|channel|>")
+
+    def test_trigger_configuration(self, gptoss_parser):
+        """Test trigger configuration for different tool setups."""
+        # Test with no tools
+        result_no_tools = gptoss_parser.prepare_structured_tag(None, None)
+        parsed_no_tools = json.loads(result_no_tools)
+        assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"]
+
+        # Test with tools
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
+
+        result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server)
+        parsed_with_tools = json.loads(result_with_tools)
+
+        expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="]
+        assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers)
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 6f2addd3649da2a3ec4309819fd5f43563dd0bca..c74f805961bc81767b04fefda8587b247021ffc0 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -20,52 +20,25 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 BADREQUEST_CASES = [
     (
         "test_rank",
-        {
-            "r": 1024
-        },
+        {"r": 1024},
         "is greater than max_lora_rank",
     ),
-    (
-        "test_bias",
-        {
-            "bias": "all"
-        },
-        "Adapter bias cannot be used without bias_enabled",
-    ),
-    ("test_dora", {
-        "use_dora": True
-    }, "does not yet support DoRA"),
+    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
     (
         "test_modules_to_save",
-        {
-            "modules_to_save": ["lm_head"]
-        },
+        {"modules_to_save": ["lm_head"]},
         "only supports modules_to_save being None",
     ),
 ]
 
 
-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
 @pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, monkeypatch_module,
-                                  zephyr_lora_files):
-
-    use_v1 = request.param
-    assert use_v1
-    monkeypatch_module.setenv('VLLM_USE_V1', '1')
-
+def server_with_lora_modules_json(request, zephyr_lora_files):
     # Define the json format LoRA module configurations
     lora_module_1 = {
         "name": "zephyr-lora",
         "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
+        "base_model_name": MODEL_NAME,
     }
 
     args = [
@@ -96,14 +69,12 @@ def server_with_lora_modules_json(request, monkeypatch_module,
 
 @pytest_asyncio.fixture
 async def client(server_with_lora_modules_json):
-    async with server_with_lora_modules_json.get_async_client(
-    ) as async_client:
+    async with server_with_lora_modules_json.get_async_client() as async_client:
         yield async_client
 
 
 @pytest.mark.asyncio
-async def test_static_lora_lineage(client: openai.AsyncOpenAI,
-                                   zephyr_lora_files):
+async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -111,22 +82,18 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI,
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
     assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files
-               for lora_model in lora_models)
+    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
 
 
 @pytest.mark.asyncio
-async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
-                                    zephyr_lora_files):
-
-    response = await client.post("load_lora_adapter",
-                                 cast_to=str,
-                                 body={
-                                     "lora_name": "zephyr-lora-3",
-                                     "lora_path": zephyr_lora_files
-                                 })
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files},
+    )
     # Ensure adapter loads before querying /models
     assert "success" in response
 
@@ -141,37 +108,37 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
     with pytest.raises(openai.NotFoundError):
-        await client.post("load_lora_adapter",
-                          cast_to=str,
-                          body={
-                              "lora_name": "notfound",
-                              "lora_path": "/not/an/adapter"
-                          })
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
+        )
 
 
 @pytest.mark.asyncio
-async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
-                                          tmp_path):
+async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
     invalid_files = tmp_path / "invalid_files"
     invalid_files.mkdir()
     (invalid_files / "adapter_config.json").write_text("this is not json")
 
     with pytest.raises(openai.BadRequestError):
-        await client.post("load_lora_adapter",
-                          cast_to=str,
-                          body={
-                              "lora_name": "invalid-json",
-                              "lora_path": str(invalid_files)
-                          })
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={"lora_name": "invalid-json", "lora_path": str(invalid_files)},
+        )
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("test_name,config_change,expected_error",
-                         BADREQUEST_CASES)
-async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
-                                        zephyr_lora_files, test_name: str,
-                                        config_change: dict,
-                                        expected_error: str):
+@pytest.mark.parametrize("test_name,config_change,expected_error", BADREQUEST_CASES)
+async def test_dynamic_lora_badrequests(
+    client: openai.AsyncOpenAI,
+    tmp_path,
+    zephyr_lora_files,
+    test_name: str,
+    config_change: dict,
+    expected_error: str,
+):
     # Create test directory
     test_dir = tmp_path / test_name
 
@@ -191,29 +158,28 @@ async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
 
     # Test loading the adapter
     with pytest.raises(openai.BadRequestError, match=expected_error):
-        await client.post("load_lora_adapter",
-                          cast_to=str,
-                          body={
-                              "lora_name": test_name,
-                              "lora_path": str(test_dir)
-                          })
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={"lora_name": test_name, "lora_path": str(test_dir)},
+        )
 
 
 @pytest.mark.asyncio
-async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
-                                      zephyr_lora_files):
+async def test_multiple_lora_adapters(
+    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+):
     """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""
 
     # This test file configures the server with --max-cpu-loras=2 and this test
     # will concurrently load 10 adapters, so it should flex the LRU cache
     async def load_and_run_adapter(adapter_name: str):
-        await client.post("load_lora_adapter",
-                          cast_to=str,
-                          body={
-                              "lora_name": adapter_name,
-                              "lora_path": str(zephyr_lora_files)
-                          })
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+        )
         for _ in range(3):
             await client.completions.create(
                 model=adapter_name,
@@ -223,8 +189,7 @@ async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
 
     lora_tasks = []
     for i in range(10):
-        lora_tasks.append(
-            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+        lora_tasks.append(asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
 
     results, _ = await asyncio.wait(lora_tasks)
 
@@ -234,8 +199,8 @@ async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
 
 @pytest.mark.asyncio
 async def test_loading_invalid_adapters_does_not_break_others(
-        client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
-
+    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+):
     invalid_files = tmp_path / "invalid_files"
     invalid_files.mkdir()
     (invalid_files / "adapter_config.json").write_text("this is not json")
@@ -266,20 +231,18 @@ async def test_loading_invalid_adapters_does_not_break_others(
     # Run a bunch of bad adapter loads
     for _ in range(25):
         with suppress(openai.NotFoundError):
-            await client.post("load_lora_adapter",
-                              cast_to=str,
-                              body={
-                                  "lora_name": "notfound",
-                                  "lora_path": "/not/an/adapter"
-                              })
+            await client.post(
+                "load_lora_adapter",
+                cast_to=str,
+                body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
+            )
     for _ in range(25):
         with suppress(openai.BadRequestError):
-            await client.post("load_lora_adapter",
-                              cast_to=str,
-                              body={
-                                  "lora_name": "invalid",
-                                  "lora_path": str(invalid_files)
-                              })
+            await client.post(
+                "load_lora_adapter",
+                cast_to=str,
+                body={"lora_name": "invalid", "lora_path": str(invalid_files)},
+            )
 
     # Ensure all the running requests with lora adapters succeeded
     stop_good_requests_event.set()
@@ -288,12 +251,11 @@ async def test_loading_invalid_adapters_does_not_break_others(
         assert not isinstance(r, Exception), f"Got exception {r}"
 
     # Ensure we can load another adapter and run it
-    await client.post("load_lora_adapter",
-                      cast_to=str,
-                      body={
-                          "lora_name": "valid",
-                          "lora_path": zephyr_lora_files
-                      })
+    await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={"lora_name": "valid", "lora_path": zephyr_lora_files},
+    )
     await client.completions.create(
         model="valid",
         prompt=["Hello there", "Foo bar bazz buzz"],
@@ -310,12 +272,11 @@ async def test_beam_search_with_lora_adapters(
     """Validate that async beam search can be used with lora."""
 
     async def load_and_run_adapter(adapter_name: str):
-        await client.post("load_lora_adapter",
-                          cast_to=str,
-                          body={
-                              "lora_name": adapter_name,
-                              "lora_path": str(zephyr_lora_files)
-                          })
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+        )
         for _ in range(3):
             await client.completions.create(
                 model=adapter_name,
@@ -326,8 +287,7 @@ async def test_beam_search_with_lora_adapters(
 
     lora_tasks = []
     for i in range(3):
-        lora_tasks.append(
-            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+        lora_tasks.append(asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
 
     results, _ = await asyncio.wait(lora_tasks)
 
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 0561158dcf65a0a7b9d967e9ac4a72d5e9d4d336..b05fa379c69fcb1784f2e30a10fd25dd17f6306b 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -4,7 +4,6 @@
 from contextlib import suppress
 from dataclasses import dataclass, field
 from http import HTTPStatus
-from typing import Optional
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -12,8 +11,7 @@ import pytest
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_models import (BaseModelPath,
-                                                    OpenAIServingModels)
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -33,19 +31,20 @@ class MockHFConfig:
 @dataclass
 class MockModelConfig:
     """Minimal mock ModelConfig for testing."""
+
     model: str = MODEL_NAME
     tokenizer: str = MODEL_NAME
     trust_remote_code: bool = False
     tokenizer_mode: str = "auto"
     max_model_len: int = 100
-    tokenizer_revision: Optional[str] = None
-    multimodal_config: MultiModalConfig = field(
-        default_factory=MultiModalConfig)
+    tokenizer_revision: str | None = None
+    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
     hf_config: MockHFConfig = field(default_factory=MockHFConfig)
-    logits_processor_pattern: Optional[str] = None
-    diff_sampling_param: Optional[dict] = None
+    logits_processors: list[str] | None = None
+    logits_processor_pattern: str | None = None
+    diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
-    allowed_media_domains: Optional[list[str]] = None
+    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
     skip_tokenizer_init: bool = False
@@ -55,17 +54,21 @@ class MockModelConfig:
 
 
 class MockLoRAResolver(LoRAResolver):
-
-    async def resolve_lora(self, base_model_name: str,
-                           lora_name: str) -> Optional[LoRARequest]:
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> LoRARequest | None:
         if lora_name == "test-lora":
-            return LoRARequest(lora_name="test-lora",
-                               lora_int_id=1,
-                               lora_local_path="/fake/path/test-lora")
+            return LoRARequest(
+                lora_name="test-lora",
+                lora_int_id=1,
+                lora_local_path="/fake/path/test-lora",
+            )
         elif lora_name == "invalid-lora":
-            return LoRARequest(lora_name="invalid-lora",
-                               lora_int_id=2,
-                               lora_local_path="/fake/path/invalid-lora")
+            return LoRARequest(
+                lora_name="invalid-lora",
+                lora_int_id=2,
+                lora_local_path="/fake/path/invalid-lora",
+            )
         return None
 
 
@@ -96,8 +99,7 @@ def mock_serving_setup():
             return True
         if lora_request.lora_name == "invalid-lora":
             # Simulate failure during addition (e.g. invalid format)
-            raise ValueError(f"Simulated failure adding LoRA: "
-                             f"{lora_request.lora_name}")
+            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
         return True
 
     mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)
@@ -106,28 +108,33 @@ def mock_serving_setup():
         for _ in []:
             yield _
 
-    mock_engine.generate = MagicMock(spec=AsyncLLM.generate,
-                                     side_effect=mock_generate)
+    mock_engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=mock_generate)
 
     mock_engine.generate.reset_mock()
     mock_engine.add_lora.reset_mock()
 
-    mock_model_config = MockModelConfig()
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=mock_model_config)
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
+
+    models = OpenAIServingModels(
+        engine_client=mock_engine,
+        base_model_paths=BASE_MODEL_PATHS,
+    )
 
-    serving_completion = OpenAIServingCompletion(mock_engine,
-                                                 mock_model_config,
-                                                 models,
-                                                 request_logger=None)
+    serving_completion = OpenAIServingCompletion(
+        mock_engine, models, request_logger=None
+    )
+
+    serving_completion._process_inputs = AsyncMock(
+        return_value=(MagicMock(name="engine_request"), {})
+    )
 
     return mock_engine, serving_completion
 
 
 @pytest.mark.asyncio
-async def test_serving_completion_with_lora_resolver(mock_serving_setup,
-                                                     monkeypatch):
+async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
 
     mock_engine, serving_completion = mock_serving_setup
@@ -149,14 +156,13 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup,
     assert called_lora_request.lora_name == lora_model_name
 
     mock_engine.generate.assert_called_once()
-    called_lora_request = mock_engine.generate.call_args[1]['lora_request']
+    called_lora_request = mock_engine.generate.call_args[1]["lora_request"]
     assert isinstance(called_lora_request, LoRARequest)
     assert called_lora_request.lora_name == lora_model_name
 
 
 @pytest.mark.asyncio
-async def test_serving_completion_resolver_not_found(mock_serving_setup,
-                                                     monkeypatch):
+async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
 
     mock_engine, serving_completion = mock_serving_setup
@@ -179,7 +185,8 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup,
 
 @pytest.mark.asyncio
 async def test_serving_completion_resolver_add_lora_fails(
-        mock_serving_setup, monkeypatch):
+    mock_serving_setup, monkeypatch
+):
     monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
 
     mock_engine, serving_completion = mock_serving_setup
diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e390ad496428be945123960fd38c53782676698
--- /dev/null
+++ b/tests/entrypoints/openai/test_messages.py
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import anthropic
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+
+
+@pytest.fixture(scope="module")
+def server():  # noqa: F811
+    args = [
+        "--max-model-len",
+        "2048",
+        "--enforce-eager",
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "hermes",
+        "--served-model-name",
+        "claude-3-7-sonnet-latest",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client_anthropic() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_simple_messages(client: anthropic.AsyncAnthropic):
+    resp = await client.messages.create(
+        model="claude-3-7-sonnet-latest",
+        max_tokens=1024,
+        messages=[{"role": "user", "content": "how are you!"}],
+    )
+    assert resp.stop_reason == "end_turn"
+    assert resp.role == "assistant"
+
+    print(f"Anthropic response: {resp.model_dump_json()}")
+
+
+@pytest.mark.asyncio
+async def test_system_message(client: anthropic.AsyncAnthropic):
+    resp = await client.messages.create(
+        model="claude-3-7-sonnet-latest",
+        max_tokens=1024,
+        system="you are a helpful assistant",
+        messages=[{"role": "user", "content": "how are you!"}],
+    )
+    assert resp.stop_reason == "end_turn"
+    assert resp.role == "assistant"
+
+    print(f"Anthropic response: {resp.model_dump_json()}")
+
+
+@pytest.mark.asyncio
+async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
+    resp = await client.messages.create(
+        model="claude-3-7-sonnet-latest",
+        max_tokens=1024,
+        messages=[{"role": "user", "content": "how are you!"}],
+        stream=True,
+    )
+
+    async for chunk in resp:
+        print(chunk.model_dump_json())
+
+
+@pytest.mark.asyncio
+async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
+    resp = await client.messages.create(
+        model="claude-3-7-sonnet-latest",
+        max_tokens=1024,
+        messages=[
+            {"role": "user", "content": "What's the weather like in New York today?"}
+        ],
+        tools=[
+            {
+                "name": "get_current_weather",
+                "description": "Useful for querying the weather in a specified city.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "City or region, for example: "
+                            "New York, London, Tokyo, etc.",
+                        }
+                    },
+                    "required": ["location"],
+                },
+            }
+        ],
+        stream=False,
+    )
+    assert resp.stop_reason == "tool_use"
+    assert resp.role == "assistant"
+
+    print(f"Anthropic response: {resp.model_dump_json()}")
+
+
+@pytest.mark.asyncio
+async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
+    resp = await client.messages.create(
+        model="claude-3-7-sonnet-latest",
+        max_tokens=1024,
+        messages=[
+            {
+                "role": "user",
+                "content": "What's the weather like in New York today?",
+            }
+        ],
+        tools=[
+            {
+                "name": "get_current_weather",
+                "description": "Useful for querying the weather in a specified city.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "City or region, for example: "
+                            "New York, London, Tokyo, etc.",
+                        }
+                    },
+                    "required": ["location"],
+                },
+            }
+        ],
+        stream=True,
+    )
+
+    async for chunk in resp:
+        print(chunk.model_dump_json())
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index f0b61902eb56896f17a55aaa08b9b2de98460eb5..4e7b765d7713fb909f8b22c6b8fcdf898b5efcc1 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -16,27 +16,18 @@ from transformers import AutoTokenizer
 
 from vllm import version
 
+from ...conftest import LocalAssetServer
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODELS = {
+    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
+}
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    # def test_metrics_exist(use_v1, server, client):
-    #     ...
-    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #     for metric in expected:
-    #         assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
+@pytest.fixture(scope="module", params=list(MODELS.keys()))
+def model_key(request):
     yield request.param
 
 
@@ -54,19 +45,21 @@ def default_server_args():
     ]
 
 
-@pytest.fixture(scope="module",
-                params=[
-                    "",
-                    "--enable-chunked-prefill",
-                    "--disable-frontend-multiprocessing",
-                    f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
-                ])
-def server(use_v1, default_server_args, request):
+@pytest.fixture(
+    scope="module",
+    params=[
+        "",
+        "--enable-chunked-prefill",
+        "--disable-frontend-multiprocessing",
+        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
+    ],
+)
+def server(model_key, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args,
-                            env_dict=env_dict) as remote_server:
+
+    model_name = MODELS[model_key]
+    with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
         yield remote_server
 
 
@@ -77,66 +70,82 @@ async def client(server):
 
 
 _PROMPT = "Hello my name is Robert and I love magic"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
-
-_NUM_REQUESTS = 10
-_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
-_NUM_GENERATION_TOKENS_PER_REQUEST = 10
-
-# {metric_family: [(suffix, expected_value)]}
-EXPECTED_VALUES = {
-    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds":
-    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
-    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens":
-    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-     ("_count", _NUM_REQUESTS)],
-    "vllm:request_generation_tokens":
-    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-     ("_count", _NUM_REQUESTS)],
-    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS)
-    ],
-    "vllm:iteration_tokens_total":
-    [("_sum", _NUM_REQUESTS *
-      (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)),
-     ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
-    "vllm:prompt_tokens": [("_total",
-                            _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens": [
-        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
-    ],
-    "vllm:request_success": [("_total", _NUM_REQUESTS)],
-}
+
+
+def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
+    num_prompt_tokens = len(prompt_ids)
+
+    # {metric_family: [(suffix, expected_value)]}
+    return {
+        "vllm:time_to_first_token_seconds": [("_count", num_requests)],
+        "vllm:time_per_output_token_seconds": [
+            ("_count", num_requests * (max_tokens - 1))
+        ],
+        "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
+        "vllm:request_queue_time_seconds": [("_count", num_requests)],
+        "vllm:request_inference_time_seconds": [("_count", num_requests)],
+        "vllm:request_prefill_time_seconds": [("_count", num_requests)],
+        "vllm:request_decode_time_seconds": [("_count", num_requests)],
+        "vllm:request_prompt_tokens": [
+            ("_sum", num_requests * num_prompt_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:request_generation_tokens": [
+            ("_sum", num_requests * max_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:request_params_n": [("_count", num_requests)],
+        "vllm:request_params_max_tokens": [
+            ("_sum", num_requests * max_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:iteration_tokens_total": [
+            (
+                "_sum",
+                num_requests * (num_prompt_tokens + max_tokens),
+            ),
+            ("_count", num_requests * max_tokens),
+        ],
+        "vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
+        "vllm:generation_tokens": [("_total", num_requests * max_tokens)],
+        "vllm:request_success": [("_total", num_requests)],
+    }
 
 
 @pytest.mark.asyncio
-async def test_metrics_counts(server: RemoteOpenAIServer,
-                              client: openai.AsyncClient, use_v1: bool):
-    for _ in range(_NUM_REQUESTS):
+async def test_metrics_counts(
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
+):
+    if model_key == "multimodal":
+        pytest.skip("Unnecessary test")
+
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+    num_requests = 10
+    max_tokens = 10
+
+    for _ in range(num_requests):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
-            model=MODEL_NAME,
-            prompt=_TOKENIZED_PROMPT,
-            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
+            model=model_name,
+            prompt=prompt_ids,
+            max_tokens=max_tokens,
+        )
 
     response = requests.get(server.url_for("metrics"))
     print(response.text)
     assert response.status_code == HTTPStatus.OK
 
     # Loop over all expected metric_families
-    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
-                or (not server.show_hidden_metrics
-                    and metric_family in HIDDEN_DEPRECATED_METRICS)):
+    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
+    for metric_family, suffix_values_list in expected_values.items():
+        if metric_family not in EXPECTED_METRICS_V1 or (
+            not server.show_hidden_metrics
+            and metric_family in HIDDEN_DEPRECATED_METRICS
+        ):
             continue
 
         found_metric = False
@@ -160,72 +169,17 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
                             assert sample.value == expected_value, (
                                 f"{metric_name_w_suffix} expected value of "
                                 f"{expected_value} did not match found value "
-                                f"{sample.value}")
+                                f"{sample.value}"
+                            )
                             break
                     assert found_suffix, (
                         f"Did not find {metric_name_w_suffix} in prom endpoint"
                     )
                 break
 
-        assert found_metric, (f"Did not find {metric_family} in prom endpoint")
+        assert found_metric, f"Did not find {metric_family} in prom endpoint"
 
 
-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -279,6 +233,11 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
+EXPECTED_METRICS_MM = [
+    "vllm:mm_cache_queries",
+    "vllm:mm_cache_hits",
+]
+
 HIDDEN_DEPRECATED_METRICS: list[str] = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
@@ -290,30 +249,73 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
 
 
 @pytest.mark.asyncio
-async def test_metrics_exist(server: RemoteOpenAIServer,
-                             client: openai.AsyncClient, use_v1: bool):
+async def test_metrics_exist(
+    local_asset_server: LocalAssetServer,
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
+):
+    model_name = MODELS[model_key]
+
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(model=MODEL_NAME,
-                                    prompt="Hello, my name is",
-                                    max_tokens=5,
-                                    temperature=0.0)
+    if model_key == "text":
+        await client.completions.create(
+            model=model_name,
+            prompt="Hello, my name is",
+            max_tokens=5,
+            temperature=0.0,
+        )
+    else:
+        # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
+        await client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": local_asset_server.url_for(
+                                    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                                ),
+                            },
+                        },
+                        {"type": "text", "text": "What's in this image?"},
+                    ],
+                }
+            ],
+            max_tokens=5,
+            temperature=0.0,
+        )
 
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (metric in HIDDEN_DEPRECATED_METRICS
-                and not server.show_hidden_metrics):
+    expected_metrics = EXPECTED_METRICS_V1
+    if model_key == "multimodal":
+        # NOTE: Don't use in-place assignment
+        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
+
+    for metric in expected_metrics:
+        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
 
 
 @pytest.mark.asyncio
-async def test_abort_metrics_reset(server: RemoteOpenAIServer,
-                                   client: openai.AsyncClient, use_v1: bool):
-
-    running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+async def test_abort_metrics_reset(
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
+):
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
+    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
+        server,
+    )
 
     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -325,18 +327,21 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
     for _ in range(3):
         task = asyncio.create_task(
             client.completions.create(
-                model=MODEL_NAME,
-                prompt=_TOKENIZED_PROMPT,
+                model=model_name,
+                prompt=prompt_ids,
                 max_tokens=100,  # Long generation to give time to abort
-                temperature=0.0))
+                temperature=0.0,
+            )
+        )
         tasks.append(task)
 
     # Wait a bit for requests to start processing
     await asyncio.sleep(0.5)
 
     # Check that we have running requests
-    running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
+        server,
+    )
 
     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -355,20 +360,21 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server)
+    )
 
-    assert running_requests_after == 0,\
-        (f"Expected 0 running requests after abort, got "
-         f"{running_requests_after}")
-    assert waiting_requests_after == 0,\
-        (f"Expected 0 waiting requests after abort, got "
-         f"{waiting_requests_after}")
-    assert kv_cache_usage_after == 0,\
-        (f"Expected 0% KV cache usage after abort, got "
-         f"{kv_cache_usage_after}")
+    assert running_requests_after == 0, (
+        f"Expected 0 running requests after abort, got {running_requests_after}"
+    )
+    assert waiting_requests_after == 0, (
+        f"Expected 0 waiting requests after abort, got {waiting_requests_after}"
+    )
+    assert kv_cache_usage_after == 0, (
+        f"Expected 0% KV cache usage after abort, got {kv_cache_usage_after}"
+    )
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -377,8 +383,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
-    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
-                             if use_v1 else "vllm:gpu_cache_usage_perc")
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
 
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -404,35 +409,37 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage
 
 
-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
     input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
 
     base_url = "0.0.0.0"
     port = "8001"
     server_url = f"http://{base_url}:{port}"
 
-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write(input_batch)
         input_file.flush()
-        proc = subprocess.Popen([
-            sys.executable,
-            "-m",
-            "vllm.entrypoints.openai.run_batch",
-            "-i",
-            input_file.name,
-            "-o",
-            output_file.name,
-            "--model",
-            "intfloat/multilingual-e5-small",
-            "--enable-metrics",
-            "--url",
-            base_url,
-            "--port",
-            port,
-        ],
-                                env={"VLLM_USE_V1": "1"})
+        proc = subprocess.Popen(
+            [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.run_batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "intfloat/multilingual-e5-small",
+                "--enable-metrics",
+                "--url",
+                base_url,
+                "--port",
+                port,
+            ],
+        )
 
         def is_server_up(url):
             try:
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 4ee34b19dea320997024ae74977ae438e196b11e..7d2968d965066282e61921ded6c77a80ad9bae38 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -52,6 +52,5 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
     lora_models = models[1:]
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
-    assert all(lora_model.root == zephyr_lora_files
-               for lora_model in lora_models)
+    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py
index f0ce50debe494224eb7562483e2bfc0f7804b9ec..ba463be1d5cd7637cc202587f6c8eb2c3f60016e 100644
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -25,13 +25,10 @@ def run_and_test_dummy_opt_api_server(model, tp=1):
         client = server.get_client()
         completion = client.chat.completions.create(
             model=model,
-            messages=[{
-                "role": "system",
-                "content": "You are a helpful assistant."
-            }, {
-                "role": "user",
-                "content": "Hello!"
-            }],
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Hello!"},
+            ],
             temperature=0,
         )
         generated_text = completion.choices[0].message.content
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index 73f79ac28d1102499c06af097cd5e3c636459adf..64fdaf08893adb626609e571f937116126b6b044 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -75,10 +75,11 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
             http://localhost:8000/v1/chat/completions
         """  # noqa: E501
         if hasattr(case, "body") and isinstance(case.body, dict):
-            if ("messages" in case.body
-                    and isinstance(case.body["messages"], list)
-                    and len(case.body["messages"]) > 0):
-
+            if (
+                "messages" in case.body
+                and isinstance(case.body["messages"], list)
+                and len(case.body["messages"]) > 0
+            ):
                 for message in case.body["messages"]:
                     if not isinstance(message, dict):
                         continue
@@ -86,10 +87,11 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
                     # Check for invalid file type in tokenize endpoint
                     if op.method.lower() == "post" and op.path == "/tokenize":
                         content = message.get("content", [])
-                        if (isinstance(content, list) and len(content) > 0
-                                and any(
-                                    item.get("type") == "file"
-                                    for item in content)):
+                        if (
+                            isinstance(content, list)
+                            and len(content) > 0
+                            and any(item.get("type") == "file" for item in content)
+                        ):
                             return False
 
                     # Check for invalid tool_calls with non-function types
@@ -106,10 +108,13 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
             # Causing a server error in EBNF grammar parsing
             # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
             structured_outputs = case.body.get("structured_outputs", {})
-            grammar = structured_outputs.get("grammar") if isinstance(
-                structured_outputs, dict) else None
+            grammar = (
+                structured_outputs.get("grammar")
+                if isinstance(structured_outputs, dict)
+                else None
+            )
 
-            if grammar == '':
+            if grammar == "":
                 # Allow None (will be handled as no grammar)
                 # But skip empty strings
                 return False
@@ -133,9 +138,8 @@ def test_openapi_stateless(case: schemathesis.Case):
 
     timeout = {
         # requires a longer timeout
-        ("POST", "/v1/chat/completions"):
-        LONG_TIMEOUT_SECONDS,
+        ("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
     }.get(key, DEFAULT_TIMEOUT_SECONDS)
 
-    #No need to verify SSL certificate for localhost
+    # No need to verify SSL certificate for localhost
     case.call_and_validate(verify=False, timeout=timeout)
diff --git a/tests/entrypoints/openai/test_optional_middleware.py b/tests/entrypoints/openai/test_optional_middleware.py
index eb387998c2cc438c43693e7289c62f9df8b3e110..b67d6147937d172a5539f6462f02b0680fe81f88 100644
--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
@@ -37,7 +37,7 @@ def server(request: pytest.FixtureRequest):
         "--enforce-eager",
         "--max-num-seqs",
         "2",
-        *passed_params
+        *passed_params,
     ]
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
@@ -73,8 +73,9 @@ async def test_missing_api_token(server: RemoteOpenAIServer):
 )
 @pytest.mark.asyncio
 async def test_passed_api_token(server: RemoteOpenAIServer):
-    response = requests.get(server.url_for("v1/models"),
-                            headers={"Authorization": "Bearer test"})
+    response = requests.get(
+        server.url_for("v1/models"), headers={"Authorization": "Bearer test"}
+    )
     assert response.status_code == HTTPStatus.OK
 
 
@@ -110,7 +111,8 @@ async def test_enable_request_id_header(server: RemoteOpenAIServer):
 )
 @pytest.mark.asyncio
 async def test_custom_request_id_header(server: RemoteOpenAIServer):
-    response = requests.get(server.url_for("health"),
-                            headers={"X-Request-Id": "Custom"})
+    response = requests.get(
+        server.url_for("health"), headers={"X-Request-Id": "Custom"}
+    )
     assert "X-Request-Id" in response.headers
     assert response.headers.get("X-Request-Id") == "Custom"
diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ed44a33bf81f3bde836f37439cb2a965d8ed879
--- /dev/null
+++ b/tests/entrypoints/openai/test_orca_metrics.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+
+@pytest.fixture(scope="module")
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[True])
+def server(request, monkeypatch_module):
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_with_orca_header(server: RemoteOpenAIServer):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
+
+    client = openai.OpenAI(
+        api_key="EMPTY",
+        base_url=f"http://localhost:{server.port}/v1",
+        default_headers={"endpoint-load-metrics-format": "TEXT"},
+    )
+
+    # 1. Use raw client to get response headers.
+    raw_client = client.with_raw_response
+
+    # 2. Make the API call using the raw_client
+    response_with_raw = raw_client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        extra_headers={"endpoint-load-metrics-format": "TEXT"},
+    )
+
+    # 3. Access the raw httpx.Response object
+    raw_http_response = response_with_raw.http_response
+
+    # 4. Get the headers from the httpx.Response object
+    response_headers = raw_http_response.headers
+
+    assert "endpoint-load-metrics" in response_headers
+
+
+@pytest.mark.asyncio
+async def test_completion_with_orca_header(client: openai.AsyncOpenAI):
+    # 1. Use raw client to get response headers.
+    raw_client = client.with_raw_response
+
+    # 2. Make the API call using the raw_client
+    completion = await raw_client.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        extra_headers={"endpoint-load-metrics-format": "JSON"},
+    )
+
+    # 3. Access the raw httpx.Response object
+    raw_http_response = completion.http_response
+
+    # 4. Get the headers from the httpx.Response object
+    response_headers = raw_http_response.headers
+
+    assert "endpoint-load-metrics" in response_headers
+
+
+@pytest.mark.asyncio
+async def test_single_completion(client: openai.AsyncOpenAI):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        extra_headers={"endpoint-load-metrics-format": "JSON"},
+        temperature=0.0,
+    )
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+
+    choice = completion.choices[0]
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11
+    )
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(completion.choices[0].text) >= 1
+    assert completion.choices[0].prompt_logprobs is None
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index bb4c633e5e5021277791c3431a093f814e57b4da..cd5661e5739feb429f1645bb64a542d4ebb1647b 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import io
+from unittest.mock import Mock
 
 # imports for structured outputs tests
 import openai
@@ -10,16 +11,12 @@ import pytest
 import regex as re
 import torch
 
-from vllm.entrypoints.renderer import BaseRenderer
+from vllm.config import ModelConfig
+from vllm.entrypoints.renderer import CompletionRenderer
 
 from ...utils import RemoteOpenAIServer
 
 
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    monkeypatch.setenv('VLLM_USE_V1', '1')
-
-
 @pytest.mark.asyncio
 async def test_empty_prompt():
     model_name = "gpt2"
@@ -28,15 +25,16 @@ async def test_empty_prompt():
         client = remote_server.get_async_client()
 
         with pytest.raises(
-                openai.BadRequestError,
-                match=
-                "Either prompt or prompt_embeds must be provided and non-empty."
+            openai.BadRequestError,
+            match="Either prompt or prompt_embeds must be provided and non-empty.",
         ):
-            await client.completions.create(model=model_name,
-                                            prompt="",
-                                            max_tokens=5,
-                                            temperature=0.0,
-                                            extra_body={"prompt_embeds": []})
+            await client.completions.create(
+                model=model_name,
+                prompt="",
+                max_tokens=5,
+                temperature=0.0,
+                extra_body={"prompt_embeds": []},
+            )
 
 
 @pytest.mark.asyncio
@@ -46,23 +44,27 @@ async def test_out_of_vocab_token_ids():
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
 
-        with pytest.raises(openai.BadRequestError,
-                           match=re.compile('.*out of vocabulary.*').pattern):
-            await client.completions.create(model=model_name,
-                                            prompt=[999999],
-                                            max_tokens=5,
-                                            temperature=0.0)
+        with pytest.raises(
+            openai.BadRequestError, match=re.compile(".*out of vocabulary.*").pattern
+        ):
+            await client.completions.create(
+                model=model_name, prompt=[999999], max_tokens=5, temperature=0.0
+            )
 
 
-@pytest.mark.parametrize("dtype",
-                         [torch.float32, torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
 @pytest.mark.parametrize(
-    "layout",
-    [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr])
+    "layout", [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]
+)
 @pytest.mark.parametrize("seq_len", [2, 10])
 @pytest.mark.parametrize("hidden_size", [2, 10])
-def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
-                            seq_len: int, hidden_size: int):
+def test_load_prompt_embeds(
+    dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
+):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = True
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
     # construct arbitrary tensors of various dtypes, layouts, and sizes.
     # We need to check against different layouts to make sure that if a user
     # uses sparse tensors to reduce the transmission size of prompt embeddings,
@@ -87,11 +89,30 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
     buffer.seek(0)
     encoded_tensor = pybase64.b64encode(buffer.getvalue())
 
-    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
+    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
     assert len(loaded_prompt_embeds) == 1
     loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
     assert loaded_tensor.device.type == "cpu"
     assert loaded_tensor.layout == torch.strided
-    torch.testing.assert_close(loaded_tensor,
-                               tensor.to("cpu").to_dense(),
-                               equal_nan=True)
+    torch.testing.assert_close(
+        loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
+    )
+
+
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("seq_len", [2])
+@pytest.mark.parametrize("hidden_size", [2])
+def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = False
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
+    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
+
+    buffer = io.BytesIO()
+    torch.save(tensor, buffer)
+    buffer.seek(0)
+    encoded_tensor = pybase64.b64encode(buffer.getvalue())
+
+    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
+        renderer.load_prompt_embeds(encoded_tensor)
diff --git a/tests/entrypoints/openai/test_protocol.py b/tests/entrypoints/openai/test_protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9b1cfb58b5022d1c82af76fd732bb72976e7807
--- /dev/null
+++ b/tests/entrypoints/openai/test_protocol.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from openai_harmony import (
+    Message,
+)
+
+from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
+
+
+def test_serialize_message() -> None:
+    dict_value = {"a": 1, "b": "2"}
+    assert serialize_message(dict_value) == dict_value
+
+    msg_value = {
+        "role": "assistant",
+        "name": None,
+        "content": [{"type": "text", "text": "Test 1"}],
+        "channel": "analysis",
+    }
+    msg = Message.from_dict(msg_value)
+    assert serialize_message(msg) == msg_value
+
+
+def test_serialize_messages() -> None:
+    assert serialize_messages(None) is None
+    assert serialize_messages([]) is None
+
+    dict_value = {"a": 3, "b": "4"}
+    msg_value = {
+        "role": "assistant",
+        "name": None,
+        "content": [{"type": "text", "text": "Test 2"}],
+        "channel": "analysis",
+    }
+    msg = Message.from_dict(msg_value)
+    assert serialize_messages([msg, dict_value]) == [msg_value, dict_value]
diff --git a/tests/entrypoints/openai/test_response_api_mcp_tools.py b/tests/entrypoints/openai/test_response_api_mcp_tools.py
index b0eb84712c199a98006282969c25c0c94ed27c82..0dc2430caef7cc9f54646792c8a88da952471774 100644
--- a/tests/entrypoints/openai/test_response_api_mcp_tools.py
+++ b/tests/entrypoints/openai/test_response_api_mcp_tools.py
@@ -13,6 +13,7 @@ MODEL_NAME = "openai/gpt-oss-20b"
 @pytest.fixture(scope="module")
 def monkeypatch_module():
     from _pytest.monkeypatch import MonkeyPatch
+
     mpatch = MonkeyPatch()
     yield mpatch
     mpatch.undo()
@@ -25,6 +26,8 @@ def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch):
     with monkeypatch_module.context() as m:
         m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
         m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
+        # Helps the model follow instructions better
+        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
         with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
             yield remote_server
 
@@ -36,8 +39,9 @@ def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch):
     with monkeypatch_module.context() as m:
         m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
         m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
-        m.setenv("GPT_OSS_SYSTEM_TOOL_MCP_LABELS",
-                 "code_interpreter,container")
+        m.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container")
+        # Helps the model follow instructions better
+        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
         with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
             yield remote_server
 
@@ -56,51 +60,102 @@ async def mcp_enabled_client(mcp_enabled_server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
-async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI,
-                                         model_name: str):
+async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: str):
     response = await mcp_enabled_client.responses.create(
         model=model_name,
-        # TODO: Ideally should be able to set max tool calls
-        # to prevent multi-turn, but it is not currently supported
-        # would speed up the test
-        input=("What's the first 4 digits after the decimal point of "
-               "cube root of `19910212 * 20250910`? "
-               "Show only the digits. The python interpreter is not stateful "
-               "and you must print to see the output."),
-        tools=[{
-            "type": "mcp",
-            "server_label": "code_interpreter",
-            # URL unused for DemoToolServer
-            "server_url": "http://localhost:8888"
-        }],
+        input=(
+            "Execute the following code: "
+            "import random; print(random.randint(1, 1000000))"
+        ),
+        instructions=(
+            "You must use the Python tool to execute code. Never simulate execution."
+        ),
+        tools=[
+            {
+                "type": "mcp",
+                "server_label": "code_interpreter",
+                # URL unused for DemoToolServer
+                "server_url": "http://localhost:8888",
+            }
+        ],
+        extra_body={"enable_response_messages": True},
     )
     assert response is not None
     assert response.status == "completed"
-    assert response.usage.output_tokens_details.tool_output_tokens > 0
+    # Verify output messages: Tool calls and responses on analysis channel
+    tool_call_found = False
+    tool_response_found = False
+    for message in response.output_messages:
+        recipient = message.get("recipient")
+        if recipient and recipient.startswith("python"):
+            tool_call_found = True
+            assert message.get("channel") == "analysis", (
+                "Tool call should be on analysis channel"
+            )
+        author = message.get("author", {})
+        if (
+            author.get("role") == "tool"
+            and author.get("name")
+            and author.get("name").startswith("python")
+        ):
+            tool_response_found = True
+            assert message.get("channel") == "analysis", (
+                "Tool response should be on analysis channel"
+            )
+
+    assert tool_call_found, "Should have found at least one Python tool call"
+    assert tool_response_found, "Should have found at least one Python tool response"
+    for message in response.input_messages:
+        assert message.get("author").get("role") != "developer", (
+            "No developer messages should be present with valid mcp tool"
+        )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
-async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI,
-                                          model_name: str):
+async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_name: str):
     response = await mcp_disabled_client.responses.create(
         model=model_name,
-        # TODO: Ideally should be able to set max tool calls
-        # to prevent multi-turn, but it is not currently supported
-        # would speed up the test
-        input=("What's the first 4 digits after the decimal point of "
-               "cube root of `19910212 * 20250910`? "
-               "Show only the digits. The python interpreter is not stateful "
-               "and you must print to see the output."),
-        tools=[{
-            "type": "mcp",
-            "server_label": "code_interpreter",
-            # URL unused for DemoToolServer
-            "server_url": "http://localhost:8888"
-        }],
+        input=(
+            "Execute the following code if the tool is present: "
+            "import random; print(random.randint(1, 1000000))"
+        ),
+        tools=[
+            {
+                "type": "mcp",
+                "server_label": "code_interpreter",
+                # URL unused for DemoToolServer
+                "server_url": "http://localhost:8888",
+            }
+        ],
+        extra_body={"enable_response_messages": True},
     )
     assert response is not None
     assert response.status == "completed"
-    assert response.usage.output_tokens_details.tool_output_tokens == 0
+    # Verify output messages: No tool calls and responses
+    tool_call_found = False
+    tool_response_found = False
+    for message in response.output_messages:
+        recipient = message.get("recipient")
+        if recipient and recipient.startswith("python"):
+            tool_call_found = True
+            assert message.get("channel") == "analysis", (
+                "Tool call should be on analysis channel"
+            )
+        author = message.get("author", {})
+        if (
+            author.get("role") == "tool"
+            and author.get("name")
+            and author.get("name").startswith("python")
+        ):
+            tool_response_found = True
+            assert message.get("channel") == "analysis", (
+                "Tool response should be on analysis channel"
+            )
+
+    assert not tool_call_found, "Should not have a python call"
+    assert not tool_response_found, "Should not have a tool response"
+    for message in response.input_messages:
+        assert message.get("author").get("role") != "developer", (
+            "No developer messages should be present without a valid tool"
+        )
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index c28970afc731fbd4145af8d99e5a218f06a78030..dea8d2d28f61a34da7f625384e955d25d6a342b9 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -8,28 +8,41 @@ import pytest
 import pytest_asyncio
 import requests
 from openai import BadRequestError, NotFoundError, OpenAI
+from openai_harmony import (
+    Message,
+)
 
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "openai/gpt-oss-20b"
 
-
-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
+GET_WEATHER_SCHEMA = {
+    "type": "function",
+    "name": "get_weather",
+    "description": "Get current temperature for provided coordinates in celsius.",  # noqa
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "latitude": {"type": "number"},
+            "longitude": {"type": "number"},
+        },
+        "required": ["latitude", "longitude"],
+        "additionalProperties": False,
+    },
+    "strict": True,
+}
 
 
 @pytest.fixture(scope="module")
-def server(monkeypatch_module: pytest.MonkeyPatch):
+def server():
     args = ["--enforce-eager", "--tool-server", "demo"]
+    env_dict = dict(
+        VLLM_ENABLE_RESPONSES_API_STORE="1",
+        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+    )
 
-    with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
+        yield remote_server
 
 
 @pytest_asyncio.fixture
@@ -94,22 +107,10 @@ async def test_chat(client: OpenAI, model_name: str):
     response = await client.responses.create(
         model=model_name,
         input=[
-            {
-                "role": "system",
-                "content": "Respond in Korean."
-            },
-            {
-                "role": "user",
-                "content": "Hello!"
-            },
-            {
-                "role": "assistant",
-                "content": "Hello! How can I help you today?"
-            },
-            {
-                "role": "user",
-                "content": "What is 13 * 24? Explain your answer."
-            },
+            {"role": "system", "content": "Respond in Korean."},
+            {"role": "user", "content": "Hello!"},
+            {"role": "assistant", "content": "Hello! How can I help you today?"},
+            {"role": "user", "content": "What is 13 * 24? Explain your answer."},
         ],
     )
     assert response is not None
@@ -124,10 +125,7 @@ async def test_chat_with_input_type(client: OpenAI, model_name: str):
         input=[
             {
                 "role": "user",
-                "content": [{
-                    "type": "input_text",
-                    "text": "What is 13*24?"
-                }],
+                "content": [{"type": "input_text", "text": "What is 13*24?"}],
             },
         ],
     )
@@ -141,14 +139,10 @@ async def test_structured_output(client: OpenAI, model_name: str):
     response = await client.responses.create(
         model=model_name,
         input=[
-            {
-                "role": "system",
-                "content": "Extract the event information."
-            },
+            {"role": "system", "content": "Extract the event information."},
             {
                 "role": "user",
-                "content":
-                "Alice and Bob are going to a science fair on Friday.",
+                "content": "Alice and Bob are going to a science fair on Friday.",
             },
         ],
         text={
@@ -158,18 +152,9 @@ async def test_structured_output(client: OpenAI, model_name: str):
                 "schema": {
                     "type": "object",
                     "properties": {
-                        "name": {
-                            "type": "string"
-                        },
-                        "date": {
-                            "type": "string"
-                        },
-                        "participants": {
-                            "type": "array",
-                            "items": {
-                                "type": "string"
-                            }
-                        },
+                        "name": {"type": "string"},
+                        "date": {"type": "string"},
+                        "participants": {"type": "array", "items": {"type": "string"}},
                     },
                     "required": ["name", "date", "participants"],
                     "additionalProperties": False,
@@ -319,11 +304,10 @@ async def test_streaming_types(client: OpenAI, model_name: str):
 
         stack_of_event_types = []
         async for event in response:
-            if event.type == 'response.created':
+            if event.type == "response.created":
                 stack_of_event_types.append(event.type)
-            elif event.type == 'response.completed':
-                assert stack_of_event_types[-1] == pairs_of_event_types[
-                    event.type]
+            elif event.type == "response.completed":
+                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                 stack_of_event_types.pop()
             if event.type.endswith("added"):
                 stack_of_event_types.append(event.type)
@@ -332,12 +316,59 @@ async def test_streaming_types(client: OpenAI, model_name: str):
                     continue
                 stack_of_event_types.append(event.type)
             elif event.type.endswith("done"):
-                assert stack_of_event_types[-1] == pairs_of_event_types[
-                    event.type]
+                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                 stack_of_event_types.pop()
         assert len(stack_of_event_types) == 0
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
+    # this links the "done" type with the "start" type
+    # so every "done" type should have a corresponding "start" type
+    # and every open block should be closed by the end of the stream
+    pairs_of_event_types = {
+        "response.completed": "response.created",
+        "response.output_item.done": "response.output_item.added",
+        "response.output_text.done": "response.output_text.delta",
+        "response.reasoning_text.done": "response.reasoning_text.delta",
+        "response.reasoning_part.done": "response.reasoning_part.added",
+        "response.function_call_arguments.done": "response.function_call_arguments.delta",  # noqa
+    }
+
+    tools = [GET_WEATHER_SCHEMA]
+    input_list = [
+        {
+            "role": "user",
+            "content": "What's the weather like in Paris today?",
+        }
+    ]
+    stream_response = await client.responses.create(
+        model=model_name,
+        input=input_list,
+        tools=tools,
+        stream=True,
+    )
+
+    stack_of_event_types = []
+    async for event in stream_response:
+        if event.type == "response.created":
+            stack_of_event_types.append(event.type)
+        elif event.type == "response.completed":
+            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
+            stack_of_event_types.pop()
+        if event.type.endswith("added"):
+            stack_of_event_types.append(event.type)
+        elif event.type.endswith("delta"):
+            if stack_of_event_types[-1] == event.type:
+                continue
+            stack_of_event_types.append(event.type)
+        elif event.type.endswith("done"):
+            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
+            stack_of_event_types.pop()
+    assert len(stack_of_event_types) == 0
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("background", [True, False])
@@ -345,7 +376,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
     # TODO: Add back when web search and code interpreter are available in CI
     prompts = [
         "tell me a story about a cat in 20 words",
-        # "What is 13 * 24? Use python to calculate the result.",
+        "What is 13 * 24? Use python to calculate the result.",
         # "When did Jensen found NVIDIA? Search it and answer the year only.",
     ]
 
@@ -358,15 +389,11 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                 # {
                 #     "type": "web_search_preview"
                 # },
-                # {
-                #     "type": "code_interpreter",
-                #     "container": {
-                #         "type": "auto"
-                #     }
-                # },
+                {"type": "code_interpreter", "container": {"type": "auto"}},
             ],
             stream=True,
             background=background,
+            extra_body={"enable_response_messages": True},
         )
 
         current_item_id = ""
@@ -375,10 +402,30 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
         events = []
         current_event_mode = None
         resp_id = None
+        checked_response_completed = False
         async for event in response:
             if event.type == "response.created":
                 resp_id = event.response.id
 
+            # test vllm custom types are in the response
+            if event.type in [
+                "response.completed",
+                "response.in_progress",
+                "response.created",
+            ]:
+                assert "input_messages" in event.response.model_extra
+                assert "output_messages" in event.response.model_extra
+                if event.type == "response.completed":
+                    # make sure the serialization of content works
+                    for msg in event.response.model_extra["output_messages"]:
+                        # make sure we can convert the messages back into harmony
+                        Message.from_dict(msg)
+
+                    for msg in event.response.model_extra["input_messages"]:
+                        # make sure we can convert the messages back into harmony
+                        Message.from_dict(msg)
+                    checked_response_completed = True
+
             if current_event_mode != event.type:
                 current_event_mode = event.type
                 print(f"\n[{event.type}] ", end="", flush=True)
@@ -388,21 +435,21 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                 assert event.item.id != current_item_id
                 current_item_id = event.item.id
             elif event.type in [
-                    "response.output_text.delta",
-                    "response.reasoning_text.delta"
+                "response.output_text.delta",
+                "response.reasoning_text.delta",
             ]:
                 assert event.item_id == current_item_id
 
             # verify content_index_id is correct
             if event.type in [
-                    "response.content_part.added",
-                    "response.reasoning_part.added"
+                "response.content_part.added",
+                "response.reasoning_part.added",
             ]:
                 assert event.content_index != current_content_index
                 current_content_index = event.content_index
             elif event.type in [
-                    "response.output_text.delta",
-                    "response.reasoning_text.delta"
+                "response.output_text.delta",
+                "response.reasoning_text.delta",
             ]:
                 assert event.content_index == current_content_index
 
@@ -412,25 +459,28 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                 print(f"{event.delta}", end="", flush=True)
             elif "response.code_interpreter_call_code.done" in event.type:
                 print(f"Code: {event.code}", end="", flush=True)
-            elif ("response.output_item.added" in event.type
-                  and event.item.type == "web_search_call"):
+            elif (
+                "response.output_item.added" in event.type
+                and event.item.type == "web_search_call"
+            ):
                 print(f"Web search: {event.item.action}", end="", flush=True)
             events.append(event)
 
         assert len(events) > 0
         response_completed_event = events[-1]
         assert len(response_completed_event.response.output) > 0
+        assert checked_response_completed
 
         if background:
             starting_after = 5
             async with await client.responses.retrieve(
-                    response_id=resp_id,
-                    stream=True,
-                    starting_after=starting_after) as stream:
+                response_id=resp_id, stream=True, starting_after=starting_after
+            ) as stream:
                 counter = starting_after
                 async for event in stream:
                     counter += 1
                     assert event == events[counter]
+            assert counter == len(events) - 1
 
 
 @pytest.mark.asyncio
@@ -440,9 +490,7 @@ async def test_web_search(client: OpenAI, model_name: str):
     response = await client.responses.create(
         model=model_name,
         input="Who is the president of South Korea as of now?",
-        tools=[{
-            "type": "web_search_preview"
-        }],
+        tools=[{"type": "web_search_preview"}],
     )
     assert response is not None
     assert response.status == "completed"
@@ -450,27 +498,29 @@ async def test_web_search(client: OpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
 async def test_code_interpreter(client: OpenAI, model_name: str):
     response = await client.responses.create(
         model=model_name,
         # TODO: Ideally should be able to set max tool calls
         # to prevent multi-turn, but it is not currently supported
         # would speed up the test
-        input=("What's the first 4 digits after the decimal point of "
-               "cube root of `19910212 * 20250910`? "
-               "Show only the digits. The python interpreter is not stateful "
-               "and you must print to see the output."),
-        tools=[{
-            "type": "code_interpreter",
-            "container": {
-                "type": "auto"
-            }
-        }],
+        input=(
+            "What's the first 4 digits after the decimal point of "
+            "cube root of `19910212 * 20250910`? "
+            "Show only the digits. The python interpreter is not stateful "
+            "and you must print to see the output."
+        ),
+        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
+        temperature=0.0,  # More deterministic output in response
     )
     assert response is not None
     assert response.status == "completed"
     assert response.usage.output_tokens_details.tool_output_tokens > 0
+    for item in response.output:
+        if item.type == "message":
+            output_string = item.content[0].text
+            print("output_string: ", output_string, flush=True)
+            assert "5846" in output_string
 
 
 def get_weather(latitude, longitude):
@@ -485,11 +535,17 @@ def get_place_to_travel():
     return "Paris"
 
 
+def get_horoscope(sign):
+    return f"{sign}: Next Tuesday you will befriend a baby otter."
+
+
 def call_function(name, args):
     if name == "get_weather":
         return get_weather(**args)
     elif name == "get_place_to_travel":
         return get_place_to_travel()
+    elif name == "get_horoscope":
+        return get_horoscope(**args)
     else:
         raise ValueError(f"Unknown function: {name}")
 
@@ -497,26 +553,7 @@ def call_function(name, args):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling(client: OpenAI, model_name: str):
-    tools = [{
-        "type": "function",
-        "name": "get_weather",
-        "description":
-        "Get current temperature for provided coordinates in celsius.",  # noqa
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "latitude": {
-                    "type": "number"
-                },
-                "longitude": {
-                    "type": "number"
-                },
-            },
-            "required": ["latitude", "longitude"],
-            "additionalProperties": False,
-        },
-        "strict": True,
-    }]
+    tools = [GET_WEATHER_SCHEMA]
 
     response = await client.responses.create(
         model=model_name,
@@ -539,11 +576,13 @@ async def test_function_calling(client: OpenAI, model_name: str):
 
     response_2 = await client.responses.create(
         model=model_name,
-        input=[{
-            "type": "function_call_output",
-            "call_id": tool_call.call_id,
-            "output": str(result),
-        }],
+        input=[
+            {
+                "type": "function_call_output",
+                "call_id": tool_call.call_id,
+                "output": str(result),
+            }
+        ],
         tools=tools,
         previous_response_id=response.id,
     )
@@ -580,32 +619,12 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
             },
             "strict": True,
         },
-        {
-            "type": "function",
-            "name": "get_weather",
-            "description":
-            "Get current temperature for provided coordinates in celsius.",  # noqa
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "latitude": {
-                        "type": "number"
-                    },
-                    "longitude": {
-                        "type": "number"
-                    },
-                },
-                "required": ["latitude", "longitude"],
-                "additionalProperties": False,
-            },
-            "strict": True,
-        },
+        GET_WEATHER_SCHEMA,
     ]
 
     response = await client.responses.create(
         model=model_name,
-        input=
-        "Help me plan a trip to a random place. And tell me the weather there.",
+        input="Help me plan a trip to a random place. And tell me the weather there.",
         tools=tools,
     )
     assert response is not None
@@ -622,11 +641,13 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
 
     response_2 = await client.responses.create(
         model=model_name,
-        input=[{
-            "type": "function_call_output",
-            "call_id": tool_call.call_id,
-            "output": str(result),
-        }],
+        input=[
+            {
+                "type": "function_call_output",
+                "call_id": tool_call.call_id,
+                "output": str(result),
+            }
+        ],
         tools=tools,
         previous_response_id=response.id,
     )
@@ -644,11 +665,13 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
 
     response_3 = await client.responses.create(
         model=model_name,
-        input=[{
-            "type": "function_call_output",
-            "call_id": tool_call.call_id,
-            "output": str(result),
-        }],
+        input=[
+            {
+                "type": "function_call_output",
+                "call_id": tool_call.call_id,
+                "output": str(result),
+            }
+        ],
         tools=tools,
         previous_response_id=response_2.id,
     )
@@ -660,26 +683,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_required(client: OpenAI, model_name: str):
-    tools = [{
-        "type": "function",
-        "name": "get_weather",
-        "description":
-        "Get current temperature for provided coordinates in celsius.",  # noqa
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "latitude": {
-                    "type": "number"
-                },
-                "longitude": {
-                    "type": "number"
-                },
-            },
-            "required": ["latitude", "longitude"],
-            "additionalProperties": False,
-        },
-        "strict": True,
-    }]
+    tools = [GET_WEATHER_SCHEMA]
 
     with pytest.raises(BadRequestError):
         await client.responses.create(
@@ -690,34 +694,30 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
         )
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_system_message_with_tools(client: OpenAI, model_name: str):
+    from vllm.entrypoints.harmony_utils import get_system_message
+
+    # Test with custom tools enabled - commentary channel should be available
+    sys_msg = get_system_message(with_custom_tools=True)
+    valid_channels = sys_msg.content[0].channel_config.valid_channels
+    assert "commentary" in valid_channels
+
+    # Test with custom tools disabled - commentary channel should be removed
+    sys_msg = get_system_message(with_custom_tools=False)
+    valid_channels = sys_msg.content[0].channel_config.valid_channels
+    assert "commentary" not in valid_channels
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_full_history(client: OpenAI, model_name: str):
-    tools = [{
-        "type": "function",
-        "name": "get_weather",
-        "description":
-        "Get current temperature for provided coordinates in celsius.",  # noqa
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "latitude": {
-                    "type": "number"
-                },
-                "longitude": {
-                    "type": "number"
-                },
-            },
-            "required": ["latitude", "longitude"],
-            "additionalProperties": False,
-        },
-        "strict": True,
-    }]
+    tools = [GET_WEATHER_SCHEMA]
 
-    input_messages = [{
-        "role": "user",
-        "content": "What's the weather like in Paris today?"
-    }]
+    input_messages = [
+        {"role": "user", "content": "What's the weather like in Paris today?"}
+    ]
 
     response = await client.responses.create(
         model=model_name,
@@ -734,8 +734,7 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
 
     result = call_function(name, args)
 
-    input_messages.extend(
-        response.output)  # append model's function call message
+    input_messages.extend(response.output)  # append model's function call message
     input_messages.append(
         {  # append result message
             "type": "function_call_output",
@@ -756,14 +755,205 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_output_messages_enabled(client: OpenAI, model_name: str,
-                                       server):
+async def test_function_calling_with_stream(client: OpenAI, model_name: str):
+    tools = [GET_WEATHER_SCHEMA]
+    input_list = [
+        {
+            "role": "user",
+            "content": "What's the weather like in Paris today?",
+        }
+    ]
+    stream_response = await client.responses.create(
+        model=model_name,
+        input=input_list,
+        tools=tools,
+        stream=True,
+    )
+    assert stream_response is not None
+    final_tool_calls = {}
+    final_tool_calls_named = {}
+    async for event in stream_response:
+        if event.type == "response.output_item.added":
+            if event.item.type != "function_call":
+                continue
+            final_tool_calls[event.output_index] = event.item
+            final_tool_calls_named[event.item.name] = event.item
+        elif event.type == "response.function_call_arguments.delta":
+            index = event.output_index
+            tool_call = final_tool_calls[index]
+            if tool_call:
+                tool_call.arguments += event.delta
+                final_tool_calls_named[tool_call.name] = tool_call
+        elif event.type == "response.function_call_arguments.done":
+            assert event.arguments == final_tool_calls_named[event.name].arguments
+    for tool_call in final_tool_calls.values():
+        if (
+            tool_call
+            and tool_call.type == "function_call"
+            and tool_call.name == "get_weather"
+        ):
+            args = json.loads(tool_call.arguments)
+            result = call_function(tool_call.name, args)
+            input_list += [tool_call]
+            break
+    assert result is not None
+    response = await client.responses.create(
+        model=model_name,
+        input=input_list
+        + [
+            {
+                "type": "function_call_output",
+                "call_id": tool_call.call_id,
+                "output": str(result),
+            }
+        ],
+        tools=tools,
+        stream=True,
+    )
+    assert response is not None
+    async for event in response:
+        # check that no function call events in the stream
+        assert event.type != "response.function_call_arguments.delta"
+        assert event.type != "response.function_call_arguments.done"
+        # check that the response contains output text
+        if event.type == "response.completed":
+            assert len(event.response.output) > 0
+            assert event.response.output_text is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
     response = await client.responses.create(
         model=model_name,
         input="What is the capital of South Korea?",
-        extra_body={"enable_response_messages": True})
+        extra_body={"enable_response_messages": True},
+    )
 
     assert response is not None
     assert response.status == "completed"
     assert len(response.input_messages) > 0
     assert len(response.output_messages) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_call_with_previous_input_messages(
+    client: OpenAI, model_name: str
+):
+    """Test function calling using previous_input_messages
+    for multi-turn conversation with a function call"""
+
+    # Define the get_horoscope tool
+    tools = [
+        {
+            "type": "function",
+            "name": "get_horoscope",
+            "description": "Get today's horoscope for an astrological sign.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sign": {"type": "string"},
+                },
+                "required": ["sign"],
+                "additionalProperties": False,
+            },
+            "strict": True,
+        }
+    ]
+
+    # Step 1: First call with the function tool
+    stream_response = await client.responses.create(
+        model=model_name,
+        input="What is the horoscope for Aquarius today?",
+        tools=tools,
+        extra_body={"enable_response_messages": True},
+        stream=True,
+    )
+
+    response = None
+    async for event in stream_response:
+        if event.type == "response.completed":
+            response = event.response
+
+    assert response is not None
+    assert response.status == "completed"
+
+    # Step 2: Parse the first output to find the function_call type
+    function_call = None
+    for item in response.output:
+        if item.type == "function_call":
+            function_call = item
+            break
+
+    assert function_call is not None, "Expected a function_call in the output"
+    assert function_call.name == "get_horoscope"
+    assert function_call.call_id is not None
+
+    # Verify the format matches expectations
+    args = json.loads(function_call.arguments)
+    assert "sign" in args
+
+    # Step 3: Call the get_horoscope function
+    result = call_function(function_call.name, args)
+    assert "Aquarius" in result
+    assert "baby otter" in result
+
+    # Get the input_messages and output_messages from the first response
+    first_input_messages = response.input_messages
+    first_output_messages = response.output_messages
+
+    # Construct the full conversation history using previous_input_messages
+    previous_messages = (
+        first_input_messages
+        + first_output_messages
+        + [
+            {
+                "role": "tool",
+                "name": "functions.get_horoscope",
+                "content": [{"type": "text", "text": str(result)}],
+            }
+        ]
+    )
+
+    # Step 4: Make another responses.create() call with previous_input_messages
+    stream_response_2 = await client.responses.create(
+        model=model_name,
+        tools=tools,
+        input="",
+        extra_body={
+            "previous_input_messages": previous_messages,
+            "enable_response_messages": True,
+        },
+        stream=True,
+    )
+
+    async for event in stream_response_2:
+        if event.type == "response.completed":
+            response_2 = event.response
+
+    assert response_2 is not None
+    assert response_2.status == "completed"
+    assert response_2.output_text is not None
+
+    # verify only one system message / developer message
+    num_system_messages_input = 0
+    num_developer_messages_input = 0
+    num_function_call_input = 0
+    for message_dict in response_2.input_messages:
+        message = Message.from_dict(message_dict)
+        if message.author.role == "system":
+            num_system_messages_input += 1
+        elif message.author.role == "developer":
+            num_developer_messages_input += 1
+        elif message.author.role == "tool":
+            num_function_call_input += 1
+    assert num_system_messages_input == 1
+    assert num_developer_messages_input == 1
+    assert num_function_call_input == 1
+
+    # Verify the output makes sense - should contain information about the horoscope
+    output_text = response_2.output_text.lower()
+    assert (
+        "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
+    )
diff --git a/tests/entrypoints/openai/test_responses_function_call_parsing.py b/tests/entrypoints/openai/test_responses_function_call_parsing.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c5a11c867eb0a06444d6396abe243b16a8a52e7
--- /dev/null
+++ b/tests/entrypoints/openai/test_responses_function_call_parsing.py
@@ -0,0 +1,330 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test function call parsing in ResponsesRequest."""
+
+import json
+
+import pytest
+from openai.types.responses import ResponseFunctionToolCall
+
+from vllm.entrypoints.openai.protocol import ResponsesRequest
+
+
+def test_function_call_dict_converted_to_object():
+    """Test that function_call dictionaries are correctly parsed into
+    ResponseFunctionToolCall objects."""
+    # Create a request with function_call as dict
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            {
+                "type": "function_call",
+                "call_id": "fc_123",
+                "name": "get_weather",
+                "arguments": '{"location": "Boston", "unit": "celsius"}',
+            }
+        ],
+    }
+
+    request = ResponsesRequest(**request_data)
+
+    # Verify the input item is now a ResponseFunctionToolCall object
+    assert len(request.input) == 1
+    assert isinstance(request.input[0], ResponseFunctionToolCall)
+    assert request.input[0].call_id == "fc_123"
+    assert request.input[0].name == "get_weather"
+    assert request.input[0].arguments == '{"location": "Boston", "unit": "celsius"}'
+
+
+def test_direct_function_call_object_preservation():
+    """Test that ResponseFunctionToolCall objects passed directly are preserved."""
+    # Create a request with ResponseFunctionToolCall object
+    function_call = ResponseFunctionToolCall(
+        type="function_call",
+        call_id="fc_456",
+        name="get_stock_price",
+        arguments='{"symbol": "AAPL"}',
+    )
+
+    request_data = {"model": "gpt-oss", "input": [function_call]}
+
+    request = ResponsesRequest(**request_data)
+
+    # Verify the object is preserved
+    assert len(request.input) == 1
+    assert request.input[0] is function_call
+
+
+def test_mixed_input_types_with_function_calls():
+    """Test parsing with mixed input types including function calls."""
+
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            # Valid Message type
+            {
+                "type": "message",
+                "role": "user",
+                "content": [{"type": "input_text", "text": "What's the weather?"}],
+            },
+            # Function call that should be parsed
+            {
+                "type": "function_call",
+                "call_id": "fc_789",
+                "name": "check_weather",
+                "arguments": '{"location": "NYC"}',
+            },
+            # Another function call
+            {
+                "type": "function_call",
+                "call_id": "fc_790",
+                "name": "get_time",
+                "arguments": "{}",
+            },
+        ],
+    }
+
+    request = ResponsesRequest(**request_data)
+
+    # Verify mixed types are handled correctly
+    assert len(request.input) == 3
+    # First item should be validated as Message
+    assert request.input[0]["type"] == "message"
+    # Second item should be parsed to ResponseFunctionToolCall
+    assert isinstance(request.input[1], ResponseFunctionToolCall)
+    assert request.input[1].call_id == "fc_789"
+    assert request.input[1].name == "check_weather"
+    # Third item should also be parsed to ResponseFunctionToolCall
+    assert isinstance(request.input[2], ResponseFunctionToolCall)
+    assert request.input[2].call_id == "fc_790"
+    assert request.input[2].name == "get_time"
+
+
+def test_function_call_with_complex_arguments():
+    """Test parsing function calls with complex nested arguments."""
+    complex_args = {
+        "query": "weather forecast",
+        "filters": {
+            "location": {"city": "San Francisco", "state": "CA"},
+            "timeRange": {"start": "2024-01-01", "end": "2024-01-07"},
+            "metrics": ["temperature", "humidity", "precipitation"],
+        },
+        "options": {"format": "detailed", "includeAlerts": True},
+    }
+
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            {
+                "type": "function_call",
+                "call_id": "fc_complex",
+                "name": "advanced_weather_query",
+                "arguments": json.dumps(complex_args),
+            }
+        ],
+    }
+
+    request = ResponsesRequest(**request_data)
+
+    # Verify complex arguments are preserved correctly
+    assert len(request.input) == 1
+    assert isinstance(request.input[0], ResponseFunctionToolCall)
+    assert request.input[0].call_id == "fc_complex"
+    assert request.input[0].name == "advanced_weather_query"
+
+    # Parse the arguments back to verify they're intact
+    parsed_args = json.loads(request.input[0].arguments)
+    assert parsed_args == complex_args
+
+
+def test_invalid_function_call_fallback():
+    """Test that invalid function call dictionaries fall back gracefully."""
+    # Missing required field 'call_id'
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            {"type": "function_call", "name": "incomplete_function", "arguments": "{}"}
+        ],
+    }
+
+    # This should not raise an error during model creation
+    # The validator should keep the original dict and let Pydantic
+    # handle validation
+    with pytest.raises(ValueError):
+        # Pydantic should raise a validation error for the invalid structure
+        ResponsesRequest(**request_data)
+
+
+def test_string_input_not_affected():
+    """Test that string input is not affected by the validator."""
+    request_data = {"model": "gpt-oss", "input": "This is a simple string input"}
+
+    request = ResponsesRequest(**request_data)
+
+    # Verify string input remains unchanged
+    assert request.input == "This is a simple string input"
+
+
+def test_empty_list_input():
+    """Test that empty list input is handled correctly."""
+    request_data = {"model": "gpt-oss", "input": []}
+
+    request = ResponsesRequest(**request_data)
+
+    # Verify empty list is preserved
+    assert request.input == []
+
+
+def test_function_call_output_not_affected():
+    """Test that FunctionCallOutput is not affected by the function_call parsing."""
+
+    # Test with FunctionCallOutput as dict (should not be parsed)
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            {
+                "type": "function_call_output",
+                "call_id": "fc_output_123",
+                "output": "The weather in Boston is 72°F and sunny.",
+            }
+        ],
+    }
+
+    request = ResponsesRequest(**request_data)
+
+    # FunctionCallOutput should remain as dict (not converted to an object)
+    assert len(request.input) == 1
+    assert isinstance(request.input[0], dict)
+    assert request.input[0]["type"] == "function_call_output"
+    assert request.input[0]["call_id"] == "fc_output_123"
+    assert request.input[0]["output"] == "The weather in Boston is 72°F and sunny."
+
+
+def test_mixed_function_call_and_output():
+    """Test that function_call is parsed while function_call_output is preserved."""
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            # This should be parsed to ResponseFunctionToolCall
+            {
+                "type": "function_call",
+                "call_id": "fc_call_456",
+                "name": "get_weather",
+                "arguments": '{"location": "NYC"}',
+            },
+            # This should remain as dict
+            {
+                "type": "function_call_output",
+                "call_id": "fc_call_456",
+                "output": "NYC weather is 68°F with light rain",
+            },
+        ],
+    }
+
+    request = ResponsesRequest(**request_data)
+
+    assert len(request.input) == 2
+
+    # First item should be parsed to ResponseFunctionToolCall
+    assert isinstance(request.input[0], ResponseFunctionToolCall)
+    assert request.input[0].call_id == "fc_call_456"
+    assert request.input[0].name == "get_weather"
+
+    # Second item should remain as dict (FunctionCallOutput)
+    assert isinstance(request.input[1], dict)
+    assert request.input[1]["type"] == "function_call_output"
+    assert request.input[1]["call_id"] == "fc_call_456"
+    assert request.input[1]["output"] == "NYC weather is 68°F with light rain"
+
+
+def test_function_call_validation_failure_logs_debug(caplog):
+    """Test that validation failures are logged at debug level."""
+    from unittest.mock import patch
+
+    request_data = {
+        "model": "gpt-oss",
+        "input": [
+            {
+                "type": "function_call",
+                "name": "incomplete_function",
+                "arguments": "{}",  # Missing call_id
+            }
+        ],
+    }
+
+    # Mock the logger to verify debug was called
+    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
+        with pytest.raises(ValueError):
+            ResponsesRequest(**request_data)
+
+        # Verify debug was called with expected message
+        mock_logger.debug.assert_called_once()
+        call_args = mock_logger.debug.call_args[0][0]
+        assert "Failed to parse function_call" in call_args
+
+
+def test_validator_handles_iterator_input():
+    """Test that validator can handle ValidatorIterator input (Pydantic internal)."""
+
+    # This test simulates when Pydantic passes a ValidatorIterator instead of a list
+    # This happened with complex nested structures containing reasoning + function_call
+
+    # Create test data that would normally be a list
+    test_input_items = [
+        {
+            "type": "message",
+            "role": "user",
+            "content": [{"type": "input_text", "text": "Test"}],
+        },
+        {
+            "type": "reasoning",
+            "id": "rs_1",
+            "summary": [{"type": "summary_text", "text": "Test reasoning"}],
+            "content": [{"type": "reasoning_text", "text": "Test content"}],
+        },
+        {
+            "type": "function_call",
+            "call_id": "call_1",
+            "name": "test_function",
+            "arguments": '{"test": "value"}',
+            "id": "fc_1",
+        },
+    ]
+
+    # Mock data where input is an iterator (simulates Pydantic ValidatorIterator)
+    mock_data = {
+        "model": "test-model",
+        "input": iter(test_input_items),  # Iterator instead of list
+    }
+
+    # This should NOT raise an error with the fixed validator
+    try:
+        request = ResponsesRequest(**mock_data)
+
+        # Verify the validator processed the data correctly
+        assert len(request.input) == 3
+
+        # Verify function_call was converted to ResponseFunctionToolCall object
+        function_call_item = None
+        for item in request.input:
+            if isinstance(item, ResponseFunctionToolCall):
+                function_call_item = item
+                break
+
+        assert function_call_item is not None
+        assert function_call_item.call_id == "call_1"
+        assert function_call_item.name == "test_function"
+
+    except Exception as e:
+        pytest.fail(f"Validator should handle iterator input, but failed with: {e}")
+
+
+def test_validator_handles_empty_iterator():
+    """Test validator handles empty iterator gracefully."""
+    mock_data = {
+        "model": "test-model",
+        "input": iter([]),  # Empty iterator
+    }
+
+    request = ResponsesRequest(**mock_data)
+    assert request.input == []
diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py
index ff8f193fec5520cbf6170c9c5021dd46dfc99f48..feef48a36dfa1a420b4d8b090383ecc97ef5978c 100644
--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
@@ -27,8 +27,12 @@ def server():
 
 
 @pytest.mark.asyncio
-async def test_basic_completion_with_emoji(server):
+@pytest.mark.parametrize("return_token_ids", [True, False, None])
+async def test_basic_completion_with_emoji(server, return_token_ids: bool | None):
     """Test basic completion with emoji to verify token_ids field."""
+    extra_body = None
+    if return_token_ids is not None:
+        extra_body = {"return_token_ids": return_token_ids}
     async with server.get_async_client() as client:
         # Test with return_token_ids enabled
         completion = await client.completions.create(
@@ -37,7 +41,7 @@ async def test_basic_completion_with_emoji(server):
             max_tokens=10,
             temperature=0,
             logprobs=1,
-            extra_body={"return_token_ids": True},
+            extra_body=extra_body,
         )
 
         # Check the raw response to see the structure
@@ -45,18 +49,27 @@ async def test_basic_completion_with_emoji(server):
 
         # Verify prompt_token_ids field is present in the completion response
         assert "prompt_token_ids" in completion_dict["choices"][0]
+        if not return_token_ids:
+            # If return_token_ids is False, token_ids should not be present
+            assert completion_dict["choices"][0].get("token_ids") is None
+            assert completion_dict["choices"][0].get("prompt_token_ids") is None
+            # Skip further checks
+            return
         assert isinstance(completion.choices[0].prompt_token_ids, list)
 
         # Check against the expected prompt token IDs
         tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
         encoded_tokens = tokenizer.encode(
-            "Complete this sentence with emojis: I love coding 🚀")
+            "Complete this sentence with emojis: I love coding 🚀"
+        )
         # Check that encoded_tokens is a subsequence of prompt_token_ids
-        assert any(completion.choices[0].prompt_token_ids[i:i +
-                                                          len(encoded_tokens)]
-                   == encoded_tokens for i in range(
-                       len(completion.choices[0].prompt_token_ids) -
-                       len(encoded_tokens) + 1))
+        assert any(
+            completion.choices[0].prompt_token_ids[i : i + len(encoded_tokens)]
+            == encoded_tokens
+            for i in range(
+                len(completion.choices[0].prompt_token_ids) - len(encoded_tokens) + 1
+            )
+        )
 
         # Verify token_ids field is present in the choice
         assert completion.choices[0].token_ids is not None
@@ -86,44 +99,38 @@ async def test_basic_completion_with_emoji(server):
 @pytest.mark.asyncio
 async def test_chat_completion_with_tool_use(server):
     """Test chat completion with tool use (get_weather function)."""
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {
-                        "type":
-                        "string",
-                        "description":
-                        "The city and state, e.g. San Francisco, CA",
-                    },
-                    "unit": {
-                        "type": "string",
-                        "enum": ["celsius", "fahrenheit"],
-                        "description": "The unit of temperature",
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                        "unit": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "description": "The unit of temperature",
+                        },
                     },
+                    "required": ["location"],
                 },
-                "required": ["location"],
             },
-        },
-    }]
+        }
+    ]
 
     async with server.get_async_client() as client:
         # Test with return_token_ids enabled
         response = await client.chat.completions.create(
             model=MODEL_NAME,
             messages=[
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role": "user",
-                    "content": "What's the weather like in Paris?"
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "What's the weather like in Paris?"},
             ],
             tools=tools,
             tool_choice="auto",
@@ -145,10 +152,11 @@ async def test_chat_completion_with_tool_use(server):
         tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
         prompt_text = tokenizer.decode(response.prompt_token_ids)
         assert prompt_text.startswith(
-            "<|im_start|>system\nYou are a helpful assistant.")
+            "<|im_start|>system\nYou are a helpful assistant."
+        )
         assert prompt_text.endswith(
-            "What's the weather like in Paris?<|im_end|>\n"
-            "<|im_start|>assistant\n")
+            "What's the weather like in Paris?<|im_end|>\n<|im_start|>assistant\n"
+        )
 
         response_text = tokenizer.decode(response.choices[0].token_ids)
         assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
@@ -164,14 +172,8 @@ async def test_chat_completion_with_tool_use(server):
         response_without = await client.chat.completions.create(
             model=MODEL_NAME,
             messages=[
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role": "user",
-                    "content": "What's the weather like in Paris?"
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "What's the weather like in Paris?"},
             ],
             tools=tools,
             tool_choice="auto",
@@ -203,7 +205,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
             extra_body={
                 "return_token_ids": True,
                 "return_tokens_as_token_ids": True,
-                "prompt_logprobs": 1
+                "prompt_logprobs": 1,
             },
         )
 
@@ -228,16 +230,17 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
         # The prompt_token_ids should match the prompt portion
         assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
         response_token_ids_length = len(completion.choices[0].token_ids)
-        assert logprobs_token_ids[-response_token_ids_length:] == \
-            completion.choices[0].token_ids
+        assert (
+            logprobs_token_ids[-response_token_ids_length:]
+            == completion.choices[0].token_ids
+        )
 
         # Verify tokenizer consistency
         tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
 
         # Decode prompt tokens
         if completion.choices[0].prompt_token_ids:
-            prompt_text = tokenizer.decode(
-                completion.choices[0].prompt_token_ids)
+            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
             # The decoded prompt should match or close to original prompt
             assert "Hello, world" in prompt_text
 
@@ -255,10 +258,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
             stream=True,
             echo=False,
             logprobs=1,
-            extra_body={
-                "return_token_ids": True,
-                "return_tokens_as_token_ids": True
-            },
+            extra_body={"return_token_ids": True, "return_tokens_as_token_ids": True},
         )
 
         # Collect streamed tokens
@@ -287,14 +287,8 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
 async def test_chat_completion_with_emoji_and_token_ids(server):
     """Test chat completion with emojis to verify token_ids handling."""
     chat_messages = [
-        {
-            "role": "system",
-            "content": "You like to use emojis in your responses."
-        },
-        {
-            "role": "user",
-            "content": "Repeat after me: I love cats 🐱"
-        },
+        {"role": "system", "content": "You like to use emojis in your responses."},
+        {"role": "user", "content": "Repeat after me: I love cats 🐱"},
     ]
     async with server.get_async_client() as client:
         response = await client.chat.completions.create(
@@ -319,15 +313,16 @@ async def test_chat_completion_with_emoji_and_token_ids(server):
 
         decoded_prompt = tokenizer.decode(response.prompt_token_ids)
         assert decoded_prompt.startswith(
-            "<|im_start|>system\nYou like to use emojis in your responses.")
+            "<|im_start|>system\nYou like to use emojis in your responses."
+        )
         assert decoded_prompt.endswith(
-            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n")
+            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n"
+        )
 
         decoded_response = tokenizer.decode(response.choices[0].token_ids)
         # The content should match the response text
         # except the ending <|im_end|>
-        assert decoded_response == response.choices[
-            0].message.content + "<|im_end|>"
+        assert decoded_response == response.choices[0].message.content + "<|im_end|>"
 
         # Test with streaming
         stream = await client.chat.completions.create(
@@ -348,14 +343,14 @@ async def test_chat_completion_with_emoji_and_token_ids(server):
                 assert chunk.prompt_token_ids is not None
                 assert isinstance(chunk.prompt_token_ids, list)
                 # Check the prompt_token_ids match the initial prompt
-                decoded_prompt_stream = tokenizer.decode(
-                    chunk.prompt_token_ids)
+                decoded_prompt_stream = tokenizer.decode(chunk.prompt_token_ids)
                 assert decoded_prompt_stream == decoded_prompt
                 first_chunk = False
             else:
                 chunk_dump = chunk.model_dump()
-                assert "prompt_token_ids" not in chunk_dump, \
+                assert "prompt_token_ids" not in chunk_dump, (
                     "Subsequent chunks should not have prompt_token_ids"
+                )
 
             if chunk.choices:
                 if chunk.choices[0].delta.content:
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index ef9d5234f23177e49e0fd10c64fee4459c9e226f..adbcc1f2430c464ccd7bfcfec70848343d0abab6 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -44,22 +44,19 @@ def server_fixture(request, default_server_args):  # noqa: F811
         with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
             yield (remote_server, True)
     else:
-        with RemoteOpenAIServer(MODEL_NAME,
-                                default_server_args) as remote_server:
+        with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
             yield (remote_server, False)
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
-async def test_completion_return_tokens_as_token_ids_completion(
-        server_fixture):
+async def test_completion_return_tokens_as_token_ids_completion(server_fixture):
     server, use_server_flag = server_fixture
     request_args = {}
     if not use_server_flag:
         request_args["return_tokens_as_token_ids"] = True
 
     async with server.get_async_client() as client:
-
         completion = await client.completions.create(
             model=MODEL_NAME,
             # Include Unicode characters to test for dividing a single
@@ -70,7 +67,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
             temperature=0,
             max_tokens=10,
             logprobs=1,
-            extra_body=request_args)
+            extra_body=request_args,
+        )
 
         text = completion.choices[0].text
         token_strs = completion.choices[0].logprobs.tokens
@@ -104,22 +102,22 @@ async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
             # Include Unicode characters to test for dividing a single
             # character across multiple tokens: 🎉 is [28705, 31862] for the
             # Zephyr tokenizer
-            messages=[{
-                "role": "system",
-                "content": "You like to respond in only emojis, like 🎉"
-            }, {
-                "role": "user",
-                "content": "Please write some emojis: 🐱🐶🎉"
-            }],
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You like to respond in only emojis, like 🎉",
+                },
+                {"role": "user", "content": "Please write some emojis: 🐱🐶🎉"},
+            ],
             temperature=0,
             max_tokens=8,
             logprobs=True,
-            extra_body=request_args)
+            extra_body=request_args,
+        )
 
         text = response.choices[0].message.content
         tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
         token_ids = []
         for logprob_content in response.choices[0].logprobs.content:
-            token_ids.append(
-                int(logprob_content.token.removeprefix("token_id:")))
+            token_ids.append(int(logprob_content.token.removeprefix("token_id:")))
         assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py
index 7b4966848b9dea935a540e824e28933f40eacb0d..6bcb80878f07a13090480125b87e30ef08b8c456 100644
--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
@@ -51,26 +51,31 @@ class TestCase(NamedTuple):
             model_name=MODEL_NAME,
             base_url=["v1"],  # http://localhost:8000/v1
             api_key=ERROR_API_KEY,
-            expected_error=openai.AuthenticationError),
+            expected_error=openai.AuthenticationError,
+        ),
         TestCase(
             model_name=MODEL_NAME,
             base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
             api_key=ERROR_API_KEY,
-            expected_error=openai.AuthenticationError),
+            expected_error=openai.AuthenticationError,
+        ),
         TestCase(
             model_name=MODEL_NAME,
             base_url=["v1"],  # http://localhost:8000/v1
             api_key=API_KEY,
-            expected_error=None),
+            expected_error=None,
+        ),
         TestCase(
             model_name=MODEL_NAME,
             base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
             api_key=API_KEY,
-            expected_error=None),
+            expected_error=None,
+        ),
     ],
 )
-async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer,
-                                                   test_case: TestCase):
+async def test_chat_session_root_path_with_api_key(
+    server: RemoteOpenAIServer, test_case: TestCase
+):
     saying: str = "Here is a common saying about apple. An apple a day, keeps"
     ctx = contextlib.nullcontext()
     if test_case.expected_error is not None:
@@ -79,20 +84,16 @@ async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer,
         client = openai.AsyncOpenAI(
             api_key=test_case.api_key,
             base_url=server.url_for(*test_case.base_url),
-            max_retries=0)
+            max_retries=0,
+        )
         chat_completion = await client.chat.completions.create(
             model=test_case.model_name,
-            messages=[{
-                "role": "user",
-                "content": "tell me a common saying"
-            }, {
-                "role": "assistant",
-                "content": saying
-            }],
-            extra_body={
-                "continue_final_message": True,
-                "add_generation_prompt": False
-            })
+            messages=[
+                {"role": "user", "content": "tell me a common saying"},
+                {"role": "assistant", "content": saying},
+            ],
+            extra_body={"continue_final_message": True, "add_generation_prompt": False},
+        )
 
         assert chat_completion.id is not None
         assert len(chat_completion.choices) == 1
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index e23f41e983b0d2b0b2e5a032fb6bf514fe21ee50..f951b57fe726920659b179ecc4aa7ff0cc94fb48 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -9,22 +9,28 @@ import pytest
 
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
 
-# ruff: noqa: E501
-INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-
-{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
-
-INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
-
-INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
-{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
-{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
+# ruff: noqa: E501
+INPUT_BATCH = (
+    '{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
+    '{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
+    '{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
+    '{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
+    '{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
+).format(MODEL_NAME)
+
+INVALID_INPUT_BATCH = (
+    '{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
+    '{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
+).format(MODEL_NAME)
+
+INPUT_EMBEDDING_BATCH = (
+    '{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
+    '{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
+    '{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
+    '{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
+)
 
 INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
@@ -33,17 +39,29 @@ INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/re
 {"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
 
+INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
+
 
 def test_empty_file():
-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write("")
         input_file.flush()
-        proc = subprocess.Popen([
-            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            "--model", "intfloat/multilingual-e5-small"
-        ], )
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "intfloat/multilingual-e5-small",
+            ],
+        )
         proc.communicate()
         proc.wait()
         assert proc.returncode == 0, f"{proc=}"
@@ -53,15 +71,24 @@ def test_empty_file():
 
 
 def test_completions():
-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write(INPUT_BATCH)
         input_file.flush()
-        proc = subprocess.Popen([
-            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
-        ], )
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                MODEL_NAME,
+            ],
+        )
         proc.communicate()
         proc.wait()
         assert proc.returncode == 0, f"{proc=}"
@@ -77,30 +104,48 @@ def test_completions_invalid_input():
     """
     Ensure that we fail when the input doesn't conform to the openai api.
     """
-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write(INVALID_INPUT_BATCH)
         input_file.flush()
-        proc = subprocess.Popen([
-            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
-        ], )
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                MODEL_NAME,
+            ],
+        )
         proc.communicate()
         proc.wait()
         assert proc.returncode != 0, f"{proc=}"
 
 
 def test_embeddings():
-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write(INPUT_EMBEDDING_BATCH)
         input_file.flush()
-        proc = subprocess.Popen([
-            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            "--model", "intfloat/multilingual-e5-small"
-        ], )
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "intfloat/multilingual-e5-small",
+            ],
+        )
         proc.communicate()
         proc.wait()
         assert proc.returncode == 0, f"{proc=}"
@@ -112,24 +157,66 @@ def test_embeddings():
             BatchRequestOutput.model_validate_json(line)
 
 
-@pytest.mark.parametrize("input_batch",
-                         [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
+@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
 def test_score(input_batch):
-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write(input_batch)
         input_file.flush()
-        proc = subprocess.Popen([
-            "vllm",
-            "run-batch",
-            "-i",
-            input_file.name,
-            "-o",
-            output_file.name,
-            "--model",
-            "BAAI/bge-reranker-v2-m3",
-        ], )
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "BAAI/bge-reranker-v2-m3",
+            ],
+        )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+
+            # Ensure that there is no error in the response.
+            line_dict = json.loads(line)
+            assert isinstance(line_dict, dict)
+            assert line_dict["error"] is None
+
+
+def test_reasoning_parser():
+    """
+    Test that reasoning_parser parameter works correctly in run_batch.
+    """
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
+        input_file.write(INPUT_REASONING_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "Qwen/Qwen3-0.6B",
+                "--reasoning-parser",
+                "qwen3",
+            ],
+        )
         proc.communicate()
         proc.wait()
         assert proc.returncode == 0, f"{proc=}"
@@ -144,3 +231,10 @@ def test_score(input_batch):
             line_dict = json.loads(line)
             assert isinstance(line_dict, dict)
             assert line_dict["error"] is None
+
+            # Check that reasoning is present and not empty
+            reasoning = line_dict["response"]["body"]["choices"][0]["message"][
+                "reasoning"
+            ]
+            assert reasoning is not None
+            assert len(reasoning) > 0
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 07f39fe2b9bd0242ec30c6950e04f32f97c23a31..dd10384a7e8c0f81c97bfd68cbc6dfaad1f1bc12 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,44 +1,41 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
-from unittest.mock import MagicMock
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
 
 import pytest
 import pytest_asyncio
+from openai import OpenAI
 
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_models import (BaseModelPath,
-                                                    OpenAIServingModels)
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 
 from ...utils import RemoteOpenAIServer
 
-if TYPE_CHECKING:
-    from openai import OpenAI
-
 GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
 
 
 @pytest.fixture(scope="module")
 def monkeypatch_module():
     from _pytest.monkeypatch import MonkeyPatch
+
     mpatch = MonkeyPatch()
     yield mpatch
     mpatch.undo()
 
 
-@pytest.fixture(scope="module",
-                params=[True, False],
-                ids=["with_tool_parser", "without_tool_parser"])
+@pytest.fixture(
+    scope="module",
+    params=[True, False],
+    ids=["with_tool_parser", "without_tool_parser"],
+)
 def with_tool_parser(request) -> bool:
     return request.param
 
@@ -56,21 +53,25 @@ def default_server_args(with_tool_parser: bool):
         "0.8",
     ]
     if with_tool_parser:
-        args.extend([
-            "--tool-call-parser",
-            "openai",
-            "--enable-auto-tool-choice",
-        ])
+        args.extend(
+            [
+                "--tool-call-parser",
+                "openai",
+                "--enable-auto-tool-choice",
+            ]
+        )
     return args
 
 
 @pytest.fixture(scope="module")
-def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
-                  default_server_args: list[str]):
+def gptoss_server(
+    monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str]
+):
     with monkeypatch_module.context() as m:
         m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
-        with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
-                                default_server_args) as remote_server:
+        with RemoteOpenAIServer(
+            GPT_OSS_MODEL_NAME, default_server_args
+        ) as remote_server:
             yield remote_server
 
 
@@ -81,44 +82,41 @@ async def gptoss_client(gptoss_server):
 
 
 @pytest.mark.asyncio
-async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
-                                                with_tool_parser: bool):
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_current_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "city": {
-                        "type": "string"
-                    },
-                    "state": {
-                        "type": "string"
-                    },
-                    "unit": {
-                        "type": "string",
-                        "enum": ["celsius", "fahrenheit"],
+async def test_gpt_oss_chat_tool_call_streaming(
+    gptoss_client: OpenAI, with_tool_parser: bool
+):
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string"},
+                        "state": {"type": "string"},
+                        "unit": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                        },
                     },
+                    "required": ["city", "state", "unit"],
                 },
-                "required": ["city", "state", "unit"],
             },
-        },
-    }]
+        }
+    ]
 
     messages = [
-        {
-            "role": "user",
-            "content": "What is the weather in Dallas, TX?"
-        },
+        {"role": "user", "content": "What is the weather in Dallas, TX?"},
     ]
 
     stream = await gptoss_client.chat.completions.create(
         model=GPT_OSS_MODEL_NAME,
         messages=messages,
         tools=tools if with_tool_parser else None,
-        stream=True)
+        stream=True,
+    )
 
     name = None
     args_buf = ""
@@ -143,43 +141,34 @@ async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
 
 
 @pytest.mark.asyncio
-async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
-                                       with_tool_parser: bool):
+async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI, with_tool_parser: bool):
     if not with_tool_parser:
         pytest.skip("skip non-tool for multi-turn tests")
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_current_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "city": {
-                        "type": "string"
-                    },
-                    "state": {
-                        "type": "string"
-                    },
-                    "unit": {
-                        "type": "string",
-                        "enum": ["celsius", "fahrenheit"],
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string"},
+                        "state": {"type": "string"},
+                        "unit": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                        },
                     },
+                    "required": ["city", "state", "unit"],
                 },
-                "required": ["city", "state", "unit"],
             },
-        },
-    }]
+        }
+    ]
 
     messages = [
-        {
-            "role": "system",
-            "content": "you are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "What is the weather in Dallas, TX with celsius?"
-        },
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "What is the weather in Dallas, TX with celsius?"},
     ]
 
     first = await gptoss_client.chat.completions.create(
@@ -197,10 +186,9 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
     assert not first_msg.content
 
     messages.append({"role": "assistant", "content": args1})
-    messages.append({
-        "role": "user",
-        "content": "Now convert to celsius and return JSON only"
-    })
+    messages.append(
+        {"role": "user", "content": "Now convert to celsius and return JSON only"}
+    )
 
     second = await gptoss_client.chat.completions.create(
         model=GPT_OSS_MODEL_NAME,
@@ -209,8 +197,135 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
         temperature=0.0,
     )
     second_msg = second.choices[0].message
-    assert (second_msg.content is not None and len(second_msg.content) > 0) or \
-        (second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0)
+    assert (second_msg.content is not None and len(second_msg.content) > 0) or (
+        second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0
+    )
+
+
+@pytest.mark.asyncio
+async def test_gpt_oss_tool_message_array_content(
+    gptoss_client: OpenAI, with_tool_parser: bool
+):
+    """Test that tool messages support both string and array content formats."""
+    if not with_tool_parser:
+        pytest.skip("skip non-tool for array content tests")
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string"},
+                        "state": {"type": "string"},
+                    },
+                    "required": ["city", "state"],
+                },
+            },
+        }
+    ]
+
+    # Test 1: Tool message with string content
+    messages_string = [
+        {"role": "user", "content": "What's the weather in Paris?"},
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "id": "call_123",
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "arguments": '{"city": "Paris", "state": "TX"}',
+                    },
+                }
+            ],
+        },
+        {"role": "tool", "content": "The weather in Paris, TX is sunny, 22°C"},
+    ]
+
+    response_string = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages_string,
+        tools=tools,
+        temperature=0.0,
+    )
+
+    assert response_string is not None
+    assert response_string.choices[0].message is not None
+
+    # Test 2: Tool message with array content
+    messages_array = [
+        {"role": "user", "content": "What's the weather in Dallas?"},
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "id": "call_456",
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "arguments": '{"city": "Dallas", "state": "TX"}',
+                    },
+                }
+            ],
+        },
+        {
+            "role": "tool",
+            "content": [
+                {"type": "text", "text": "f2e897a7-2705-4337-8193-2a8f57b81618"}
+            ],
+        },
+    ]
+
+    response_array = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages_array,
+        tools=tools,
+        temperature=0.0,
+    )
+
+    assert response_array is not None
+    assert response_array.choices[0].message is not None
+
+    # Test 3: Tool message with multiple array content items
+    messages_multi_array = [
+        {"role": "user", "content": "Search for information"},
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "id": "call_789",
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "arguments": '{"city": "Austin", "state": "TX"}',
+                    },
+                }
+            ],
+        },
+        {
+            "role": "tool",
+            "content": [
+                {"type": "text", "text": "Weather data: "},
+                {"type": "text", "text": "Austin, TX - Partly cloudy, 25°C"},
+                {"type": "text", "text": " with 60% humidity"},
+            ],
+        },
+    ]
+
+    response_multi_array = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages_multi_array,
+        tools=tools,
+        temperature=0.0,
+    )
+
+    assert response_multi_array is not None
+    assert response_multi_array.choices[0].message is not None
 
 
 MODEL_NAME = "openai-community/gpt2"
@@ -218,7 +333,7 @@ MODEL_NAME_SHORT = "gpt2"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
 BASE_MODEL_PATHS = [
     BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
-    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT)
+    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
 ]
 
 
@@ -230,6 +345,7 @@ class MockHFConfig:
 @dataclass
 class MockModelConfig:
     task = "generate"
+    runner_type = "generate"
     tokenizer = MODEL_NAME
     trust_remote_code = False
     tokenizer_mode = "auto"
@@ -237,37 +353,68 @@ class MockModelConfig:
     tokenizer_revision = None
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
+    logits_processors: list[str] | None = None
     logits_processor_pattern = None
-    diff_sampling_param: Optional[dict] = None
+    diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
-    allowed_media_domains: Optional[list[str]] = None
+    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    skip_tokenizer_init = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
 
 
+def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
+    models = OpenAIServingModels(
+        engine_client=engine,
+        base_model_paths=BASE_MODEL_PATHS,
+    )
+    serving_chat = OpenAIServingChat(
+        engine,
+        models,
+        response_role="assistant",
+        chat_template=CHAT_TEMPLATE,
+        chat_template_content_format="auto",
+        request_logger=None,
+    )
+
+    async def _fake_process_inputs(
+        request_id,
+        engine_prompt,
+        sampling_params,
+        *,
+        lora_request,
+        trace_headers,
+        priority,
+    ):
+        return dict(engine_prompt), {}
+
+    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
+    return serving_chat
+
+
 @dataclass
 class MockEngine:
-
-    async def get_model_config(self):
-        return MockModelConfig()
+    model_config: MockModelConfig = field(default_factory=MockModelConfig)
+    processor: MagicMock = field(default_factory=MagicMock)
+    io_processor: MagicMock = field(default_factory=MagicMock)
 
 
 async def _async_serving_chat_init():
     engine = MockEngine()
-    model_config = await engine.get_model_config()
-
-    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
-    serving_completion = OpenAIServingChat(engine,
-                                           model_config,
-                                           models,
-                                           response_role="assistant",
-                                           chat_template=CHAT_TEMPLATE,
-                                           chat_template_content_format="auto",
-                                           request_logger=None)
+
+    models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
+    serving_completion = OpenAIServingChat(
+        engine,
+        models,
+        response_role="assistant",
+        chat_template=CHAT_TEMPLATE,
+        chat_template_content_format="auto",
+        request_logger=None,
+    )
     return serving_completion
 
 
@@ -281,17 +428,11 @@ async def test_serving_chat_returns_correct_model_name():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=MockModelConfig())
-    serving_chat = OpenAIServingChat(mock_engine,
-                                     MockModelConfig(),
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
+    serving_chat = _build_serving_chat(mock_engine)
     messages = [{"role": "user", "content": "what is 1+1?"}]
 
     async def return_model_name(*args):
@@ -317,24 +458,15 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=MockModelConfig())
-    serving_chat = OpenAIServingChat(mock_engine,
-                                     MockModelConfig(),
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
+    serving_chat = _build_serving_chat(mock_engine)
 
     req = ChatCompletionRequest(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
+        messages=[{"role": "user", "content": "what is 1+1?"}],
     )
 
     with suppress(Exception):
@@ -359,26 +491,17 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     # Initialize the serving chat
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=mock_model_config)
-    serving_chat = OpenAIServingChat(mock_engine,
-                                     mock_model_config,
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
+    serving_chat = _build_serving_chat(mock_engine)
 
     # Test Case 1: No max_tokens specified in request
     req = ChatCompletionRequest(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
+        messages=[{"role": "user", "content": "what is 1+1?"}],
     )
 
     with suppress(Exception):
@@ -413,26 +536,17 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     # Initialize the serving chat
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=mock_model_config)
-    serving_chat = OpenAIServingChat(mock_engine,
-                                     mock_model_config,
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
+    serving_chat = _build_serving_chat(mock_engine)
 
     # Test case 1: No max_tokens specified, defaults to context_window
     req = ChatCompletionRequest(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
+        messages=[{"role": "user", "content": "what is 1+1?"}],
     )
 
     with suppress(Exception):
@@ -459,35 +573,25 @@ async def test_serving_chat_should_set_correct_max_tokens():
 
 @pytest.mark.asyncio
 async def test_serving_chat_could_load_correct_generation_config():
-
     mock_model_config = MockModelConfig()
     mock_model_config.diff_sampling_param = {
         "temperature": 0.5,
-        "repetition_penalty": 1.05
+        "repetition_penalty": 1.05,
     }
 
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     # Initialize the serving chat
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=mock_model_config)
-    serving_chat = OpenAIServingChat(mock_engine,
-                                     mock_model_config,
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
+    serving_chat = _build_serving_chat(mock_engine)
 
     req = ChatCompletionRequest(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
+        messages=[{"role": "user", "content": "what is 1+1?"}],
     )
 
     with suppress(Exception):
@@ -524,35 +628,103 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
-    # Initialize the serving chat
-    models = OpenAIServingModels(engine_client=mock_engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=mock_model_config)
-    serving_chat = OpenAIServingChat(mock_engine,
-                                     mock_model_config,
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
+    serving_chat = _build_serving_chat(mock_engine)
 
     # Test cache_salt
     req = ChatCompletionRequest(
         model=MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
+        messages=[{"role": "user", "content": "what is 1+1?"}],
     )
 
     # By default, cache_salt in the engine prompt is not set
     with suppress(Exception):
         await serving_chat.create_chat_completion(req)
-    assert "cache_salt" not in mock_engine.generate.call_args.args[0]
+    engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
+    assert "cache_salt" not in engine_prompt
 
     # Test with certain cache_salt
     req.cache_salt = "test_salt"
     with suppress(Exception):
         await serving_chat.create_chat_completion(req)
-    assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"
+    engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
+    assert engine_prompt.get("cache_salt") == "test_salt"
+
+
+@pytest.mark.asyncio
+async def test_serving_chat_data_parallel_rank_extraction():
+    """Test that data_parallel_rank is properly extracted from header and
+    passed to engine."""
+    mock_engine = MagicMock(spec=AsyncLLM)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
+
+    # Mock the generate method to return an async generator
+    async def mock_generate(*args, **kwargs):
+        # Yield a fake RequestOutput
+        from vllm.outputs import CompletionOutput, RequestOutput
+
+        yield RequestOutput(
+            request_id="test-request",
+            prompt="test prompt",
+            prompt_token_ids=[1, 2, 3],
+            prompt_logprobs=None,
+            outputs=[
+                CompletionOutput(
+                    index=0,
+                    text="test response",
+                    token_ids=[4, 5, 6],
+                    cumulative_logprob=0.0,
+                    logprobs=None,
+                    finish_reason="stop",
+                    stop_reason=None,
+                )
+            ],
+            finished=True,
+        )
+
+    mock_engine.generate = AsyncMock(side_effect=mock_generate)
+
+    serving_chat = _build_serving_chat(mock_engine)
+
+    # Test when data_parallel_rank is present in header
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "what is 1+1?"}],
+    )
+
+    # Mock request with X-data-parallel-rank header
+    mock_raw_request = MagicMock()
+    mock_raw_request.headers = {"X-data-parallel-rank": "2"}
+    mock_raw_request.state = MagicMock()
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req, mock_raw_request)
+
+    # Verify that data_parallel_rank was passed to engine.generate
+    assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs
+    assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] == 2
+
+    # Test when data_parallel_rank is not present (defaults to None)
+    req_no_dp = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "what is 2+2?"}],
+    )
+
+    # Mock request with no header
+    mock_raw_request_no_dp = MagicMock()
+    mock_raw_request_no_dp.headers = {}
+    mock_raw_request_no_dp.state = MagicMock()
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req_no_dp, mock_raw_request_no_dp)
+
+    # Verify that data_parallel_rank defaults to None
+    assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs
+    assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] is None
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d8871441a75bfbe3119332c34e09a20ae3c970
--- /dev/null
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import time
+from unittest.mock import Mock
+
+import pytest
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+
+
+@pytest.fixture()
+def serving() -> OpenAIServing:
+    """Create a minimal OpenAIServing instance for testing."""
+
+    # Create minimal mocks
+    engine_client = Mock()
+    model_config = Mock(spec=ModelConfig)
+    model_config.max_model_len = 32768
+    models = Mock(spec=OpenAIServingModels)
+    models.model_config = model_config
+    models.processor = Mock()
+    models.io_processor = Mock()
+
+    serving = OpenAIServing(
+        engine_client=engine_client,
+        models=models,
+        request_logger=None,
+    )
+    return serving
+
+
+@pytest.mark.asyncio
+async def test_async_mistral_tokenizer_does_not_block_event_loop(
+    serving: OpenAIServing,
+):
+    expected_tokens = [1, 2, 3]
+
+    # Mock the blocking version to sleep
+    def mocked_apply_chat_template(*_args, **_kwargs):
+        time.sleep(2)
+        return expected_tokens
+
+    mock_tokenizer = Mock(spec=MistralTokenizer)
+    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template
+
+    task = serving._apply_mistral_chat_template_async(
+        tokenizer=mock_tokenizer, messages=[], chat_template=None, tools=[]
+    )
+
+    # Ensure the event loop is not blocked
+    blocked_count = 0
+    for _i in range(20):  # Check over ~2 seconds
+        start = time.perf_counter()
+        await asyncio.sleep(0)
+        elapsed = time.perf_counter() - start
+
+        # an overly generous elapsed time for slow machines
+        if elapsed >= 0.5:
+            blocked_count += 1
+
+        await asyncio.sleep(0.1)
+
+    # Ensure task completes
+    tokens = await task
+    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
+    assert blocked_count == 0, "Event loop blocked during tokenization"
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index bc6a0341f59f6de371b5304645f8c7b50c60afd4..3c022870dba4baf259122a4427f93dc114a7e8d3 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -8,31 +8,36 @@ import pytest
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              LoadLoRAAdapterRequest,
-                                              UnloadLoRAAdapterRequest)
-from vllm.entrypoints.openai.serving_models import (BaseModelPath,
-                                                    OpenAIServingModels)
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    LoadLoRAAdapterRequest,
+    UnloadLoRAAdapterRequest,
+)
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
-LORA_LOADING_SUCCESS_MESSAGE = (
-    "Success: LoRA adapter '{lora_name}' added successfully.")
+LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
 LORA_UNLOADING_SUCCESS_MESSAGE = (
-    "Success: LoRA adapter '{lora_name}' removed successfully.")
+    "Success: LoRA adapter '{lora_name}' removed successfully."
+)
 
 
 async def _async_serving_models_init() -> OpenAIServingModels:
-    mock_model_config = MagicMock(spec=ModelConfig)
     mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
+    mock_model_config = MagicMock(spec=ModelConfig)
     mock_model_config.max_model_len = 2048
-
-    serving_models = OpenAIServingModels(engine_client=mock_engine_client,
-                                         base_model_paths=BASE_MODEL_PATHS,
-                                         model_config=mock_model_config,
-                                         lora_modules=None)
+    mock_engine_client.model_config = mock_model_config
+    mock_engine_client.processor = MagicMock()
+    mock_engine_client.io_processor = MagicMock()
+
+    serving_models = OpenAIServingModels(
+        engine_client=mock_engine_client,
+        base_model_paths=BASE_MODEL_PATHS,
+        lora_modules=None,
+    )
     await serving_models.init_static_loras()
 
     return serving_models
@@ -42,19 +47,18 @@ async def _async_serving_models_init() -> OpenAIServingModels:
 async def test_serving_model_name():
     serving_models = await _async_serving_models_init()
     assert serving_models.model_name(None) == MODEL_NAME
-    request = LoRARequest(lora_name="adapter",
-                          lora_path="/path/to/adapter2",
-                          lora_int_id=1)
+    request = LoRARequest(
+        lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1
+    )
     assert serving_models.model_name(request) == request.lora_name
 
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_success():
     serving_models = await _async_serving_models_init()
-    request = LoadLoRAAdapterRequest(lora_name="adapter",
-                                     lora_path="/path/to/adapter2")
+    request = LoadLoRAAdapterRequest(lora_name="adapter", lora_path="/path/to/adapter2")
     response = await serving_models.load_lora_adapter(request)
-    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
+    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter")
     assert len(serving_models.lora_requests) == 1
     assert "adapter" in serving_models.lora_requests
     assert serving_models.lora_requests["adapter"].lora_name == "adapter"
@@ -73,15 +77,16 @@ async def test_load_lora_adapter_missing_fields():
 @pytest.mark.asyncio
 async def test_load_lora_adapter_duplicate():
     serving_models = await _async_serving_models_init()
-    request = LoadLoRAAdapterRequest(lora_name="adapter1",
-                                     lora_path="/path/to/adapter1")
+    request = LoadLoRAAdapterRequest(
+        lora_name="adapter1", lora_path="/path/to/adapter1"
+    )
     response = await serving_models.load_lora_adapter(request)
-    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
-        lora_name='adapter1')
+    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
     assert len(serving_models.lora_requests) == 1
 
-    request = LoadLoRAAdapterRequest(lora_name="adapter1",
-                                     lora_path="/path/to/adapter1")
+    request = LoadLoRAAdapterRequest(
+        lora_name="adapter1", lora_path="/path/to/adapter1"
+    )
     response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.error.type == "InvalidUserInput"
@@ -92,15 +97,15 @@ async def test_load_lora_adapter_duplicate():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_success():
     serving_models = await _async_serving_models_init()
-    request = LoadLoRAAdapterRequest(lora_name="adapter1",
-                                     lora_path="/path/to/adapter1")
+    request = LoadLoRAAdapterRequest(
+        lora_name="adapter1", lora_path="/path/to/adapter1"
+    )
     response = await serving_models.load_lora_adapter(request)
     assert len(serving_models.lora_requests) == 1
 
     request = UnloadLoRAAdapterRequest(lora_name="adapter1")
     response = await serving_models.unload_lora_adapter(request)
-    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
-        lora_name='adapter1')
+    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
     assert len(serving_models.lora_requests) == 0
 
 
diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e11b61020c5b910b80b41503a9b208f9590274
--- /dev/null
+++ b/tests/entrypoints/openai/test_serving_responses.py
@@ -0,0 +1,256 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from contextlib import AsyncExitStack
+from unittest.mock import MagicMock
+
+import pytest
+import pytest_asyncio
+from openai.types.responses.tool import (
+    CodeInterpreterContainerCodeInterpreterToolAuto,
+    LocalShell,
+    Mcp,
+    Tool,
+)
+
+from vllm.entrypoints.context import ConversationContext
+from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
+from vllm.entrypoints.openai.serving_responses import (
+    OpenAIServingResponses,
+    extract_tool_types,
+)
+from vllm.entrypoints.tool_server import ToolServer
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
+
+
+class MockConversationContext(ConversationContext):
+    """Mock conversation context for testing"""
+
+    def __init__(self):
+        self.init_tool_sessions_called = False
+        self.init_tool_sessions_args = None
+        self.init_tool_sessions_kwargs = None
+
+    def append_output(self, output) -> None:
+        pass
+
+    def append_tool_output(self, output) -> None:
+        pass
+
+    async def call_tool(self):
+        return []
+
+    def need_builtin_tool_call(self) -> bool:
+        return False
+
+    def render_for_completion(self):
+        return []
+
+    async def init_tool_sessions(self, tool_server, exit_stack, request_id, mcp_tools):
+        self.init_tool_sessions_called = True
+        self.init_tool_sessions_args = (tool_server, exit_stack, request_id, mcp_tools)
+
+    async def cleanup_session(self) -> None:
+        pass
+
+
+@pytest.fixture
+def mock_serving_responses():
+    """Create a mock OpenAIServingResponses instance"""
+    serving_responses = MagicMock(spec=OpenAIServingResponses)
+    serving_responses.tool_server = MagicMock(spec=ToolServer)
+    return serving_responses
+
+
+@pytest.fixture
+def mock_context():
+    """Create a mock conversation context"""
+    return MockConversationContext()
+
+
+@pytest.fixture
+def mock_exit_stack():
+    """Create a mock async exit stack"""
+    return MagicMock(spec=AsyncExitStack)
+
+
+def test_extract_tool_types(monkeypatch: pytest.MonkeyPatch) -> None:
+    tools: list[Tool] = []
+    assert extract_tool_types(tools) == set()
+
+    tools.append(LocalShell(type="local_shell"))
+    assert extract_tool_types(tools) == {"local_shell"}
+
+    tools.append(CodeInterpreterContainerCodeInterpreterToolAuto(type="auto"))
+    assert extract_tool_types(tools) == {"local_shell", "auto"}
+
+    tools.extend(
+        [
+            Mcp(type="mcp", server_label="random", server_url=""),
+            Mcp(type="mcp", server_label="container", server_url=""),
+            Mcp(type="mcp", server_label="code_interpreter", server_url=""),
+            Mcp(type="mcp", server_label="web_search_preview", server_url=""),
+        ]
+    )
+    # When envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS is not set,
+    # mcp tool types are all ignored.
+    assert extract_tool_types(tools) == {"local_shell", "auto"}
+
+    # container is allowed, it would be extracted
+    monkeypatch.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "container")
+    assert extract_tool_types(tools) == {"local_shell", "auto", "container"}
+
+    # code_interpreter and web_search_preview are allowed,
+    # they would be extracted
+    monkeypatch.setenv(
+        "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,web_search_preview"
+    )
+    assert extract_tool_types(tools) == {
+        "local_shell",
+        "auto",
+        "code_interpreter",
+        "web_search_preview",
+    }
+
+
+class TestInitializeToolSessions:
+    """Test class for _initialize_tool_sessions method"""
+
+    @pytest_asyncio.fixture
+    async def serving_responses_instance(self):
+        """Create a real OpenAIServingResponses instance for testing"""
+        # Create minimal mocks for required dependencies
+        engine_client = MagicMock()
+
+        model_config = MagicMock()
+        model_config.hf_config.model_type = "test"
+        model_config.get_diff_sampling_param.return_value = {}
+        engine_client.model_config = model_config
+
+        engine_client.processor = MagicMock()
+        engine_client.io_processor = MagicMock()
+
+        models = MagicMock()
+
+        tool_server = MagicMock(spec=ToolServer)
+
+        # Create the actual instance
+        instance = OpenAIServingResponses(
+            engine_client=engine_client,
+            models=models,
+            request_logger=None,
+            chat_template=None,
+            chat_template_content_format="auto",
+            tool_server=tool_server,
+        )
+
+        return instance
+
+    @pytest.mark.asyncio
+    async def test_initialize_tool_sessions(
+        self, serving_responses_instance, mock_context, mock_exit_stack
+    ):
+        """Test that method works correctly with only MCP tools"""
+
+        request = ResponsesRequest(input="test input", tools=[])
+
+        # Call the method
+        await serving_responses_instance._initialize_tool_sessions(
+            request, mock_context, mock_exit_stack
+        )
+        assert mock_context.init_tool_sessions_called is False
+
+        # Create only MCP tools
+        tools = [
+            {"type": "web_search_preview"},
+            {"type": "code_interpreter", "container": {"type": "auto"}},
+        ]
+
+        request = ResponsesRequest(input="test input", tools=tools)
+
+        # Call the method
+        await serving_responses_instance._initialize_tool_sessions(
+            request, mock_context, mock_exit_stack
+        )
+
+        # Verify that init_tool_sessions was called
+        assert mock_context.init_tool_sessions_called
+
+    def test_validate_create_responses_input(
+        self, serving_responses_instance, mock_context, mock_exit_stack
+    ):
+        request = ResponsesRequest(
+            input="test input",
+            previous_input_messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "What is my horoscope? I am an Aquarius.",
+                        }
+                    ],
+                }
+            ],
+            previous_response_id="lol",
+        )
+        error = serving_responses_instance._validate_create_responses_input(request)
+        assert error is not None
+        assert error.error.type == "invalid_request_error"
+
+
+class TestValidateGeneratorInput:
+    """Test class for _validate_generator_input method"""
+
+    @pytest_asyncio.fixture
+    async def serving_responses_instance(self):
+        """Create a real OpenAIServingResponses instance for testing"""
+        # Create minimal mocks for required dependencies
+        engine_client = MagicMock()
+
+        model_config = MagicMock()
+        model_config.hf_config.model_type = "test"
+        model_config.get_diff_sampling_param.return_value = {}
+        engine_client.model_config = model_config
+
+        engine_client.processor = MagicMock()
+        engine_client.io_processor = MagicMock()
+
+        models = MagicMock()
+
+        # Create the actual instance
+        instance = OpenAIServingResponses(
+            engine_client=engine_client,
+            models=models,
+            request_logger=None,
+            chat_template=None,
+            chat_template_content_format="auto",
+        )
+
+        # Set max_model_len for testing
+        instance.max_model_len = 100
+
+        return instance
+
+    def test_validate_generator_input(self, serving_responses_instance):
+        """Test _validate_generator_input with valid prompt length"""
+        # Create an engine prompt with valid length (less than max_model_len)
+        valid_prompt_token_ids = list(range(5))  # 5 tokens < 100 max_model_len
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=valid_prompt_token_ids)
+
+        # Call the method
+        result = serving_responses_instance._validate_generator_input(engine_prompt)
+
+        # Should return None for valid input
+        assert result is None
+
+        # create an invalid engine prompt
+        invalid_prompt_token_ids = list(range(200))  # 100 tokens >= 100 max_model_len
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
+
+        # Call the method
+        result = serving_responses_instance._validate_generator_input(engine_prompt)
+
+        # Should return an ErrorResponse
+        assert result is not None
+        assert isinstance(result, ErrorResponse)
diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..62d843e35b86f20bdcd94ba13fb8b74d088e969f
--- /dev/null
+++ b/tests/entrypoints/openai/test_serving_tokens.py
@@ -0,0 +1,262 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import httpx
+import pytest
+import pytest_asyncio
+from transformers import AutoTokenizer
+
+from vllm.config import ModelConfig
+from vllm.v1.engine.detokenizer import check_stop_strings
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+GEN_ENDPOINT = "/inference/v1/generate"
+
+
+def get_vocab_size(model_name):
+    config = ModelConfig(
+        model=model_name,
+        seed=0,
+        dtype="bfloat16",
+    )
+    return config.get_vocab_size()
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained(MODEL_NAME)
+
+
+@pytest.fixture(scope="module")
+def messages():
+    return [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "How many countries are in the EU?"},
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(request):
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "1024",
+        "--enforce-eager",
+    ]
+
+    extra_args = getattr(request, "param", None)
+    if extra_args is not None:
+        args = args + (
+            list(extra_args)
+            if isinstance(extra_args, (list, tuple))
+            else [str(extra_args)]
+        )
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server: RemoteOpenAIServer):
+    transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
+    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
+    async with httpx.AsyncClient(
+        transport=transport,
+        base_url=server.url_root,
+        timeout=600,
+        headers=headers,
+    ) as c:
+        yield c
+
+
+@pytest.mark.asyncio
+async def test_generate_endpoint(client):
+    payload = {
+        "model": MODEL_NAME,
+        "token_ids": [1, 2, 3],
+        "sampling_params": {"max_tokens": 5},
+        "stream": False,
+    }
+    resp = await client.post(GEN_ENDPOINT, json=payload)
+    resp.raise_for_status()
+    data = resp.json()
+    assert "choices" in data
+
+
+@pytest.mark.asyncio
+async def test_same_response_as_chat_completions(client, tokenizer, messages):
+    token_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        enable_thinking=False,  # default with Qwen3
+    )
+    for ignore_eos in [True, False]:
+        payload = {
+            "model": MODEL_NAME,
+            "token_ids": token_ids,
+            "sampling_params": {
+                "max_tokens": 24,
+                "temperature": 0.0,
+                # NOTE coordinator will set this to skip detokenization
+                "detokenize": False,
+                "ignore_eos": ignore_eos,
+            },
+            "stream": False,
+        }
+        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
+        generate_data = generate_resp.json()
+        generate_res = tokenizer.decode(
+            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
+        )
+
+        payload = {
+            "model": MODEL_NAME,
+            "messages": messages,
+            "max_tokens": 24,
+            "temperature": 0.0,
+            "stream": False,
+            "ignore_eos": ignore_eos,
+            "chat_template_kwargs": dict(enable_thinking=False),
+        }
+        completions_resp = await client.post("/v1/chat/completions", json=payload)
+        completions_data = completions_resp.json()
+        completions_res = completions_data["choices"][0]["message"]["content"]
+
+        assert generate_res == completions_res
+
+
+@pytest.mark.asyncio
+async def test_stop_string_workflow(client, tokenizer, messages):
+    token_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        enable_thinking=False,  # default with Qwen3
+    )
+    payload = {
+        "model": MODEL_NAME,
+        "token_ids": token_ids,
+        "sampling_params": {
+            "max_tokens": 24,
+            "temperature": 0.0,
+            "detokenize": False,
+            # stop strings are only supported when detokenize is True.
+            "stop": ["27 member"],
+        },
+        # TODO stream test is much more interesting
+        "stream": False,
+    }
+    with pytest.raises(httpx.HTTPStatusError):
+        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
+        generate_resp.raise_for_status()
+
+    payload["sampling_params"]["stop"] = None
+    generate_resp = await client.post(
+        GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
+    )
+    generate_data = generate_resp.json()
+    generate_res = tokenizer.decode(
+        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
+    )
+
+    # NOTE This is under the responsibility of the coordinator
+    # stop_checker = StopChecker(
+    #     max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
+    # )
+    stop_str, truncate_to = check_stop_strings(
+        generate_res, len(generate_res), ["27 member"], False
+    )
+    assert stop_str == "27 member"
+    # abort request that hit stop string (requires tokens-only mode)
+    # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501
+    # res.raise_for_status()
+    generate_res = generate_res[:truncate_to]
+
+    # Get stop_str response from chat completions
+    payload = {
+        "model": MODEL_NAME,
+        "messages": messages,
+        "max_tokens": 24,
+        "temperature": 0.0,
+        "stream": False,
+        "stop": ["27 member"],
+        "chat_template_kwargs": dict(enable_thinking=False),
+    }
+    completions_resp = await client.post("/v1/chat/completions", json=payload)
+    completions_data = completions_resp.json()
+    completions_res = completions_data["choices"][0]["message"]["content"]
+    assert generate_res == completions_res
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "server",
+    [
+        [
+            "--enable-lora",
+            "--lora-modules",
+            "Alice=charent/self_cognition_Alice",
+            "Bob=charent/self_cognition_Bob",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+        ]
+    ],
+    indirect=True,
+)
+async def test_generate_with_lora_adapter(client, tokenizer, messages):
+    # Verify adapters are listed
+    models_resp = await client.get("/v1/models")
+    models_resp.raise_for_status()
+    models = {m["id"] for m in models_resp.json().get("data", [])}
+    assert {"Alice", "Bob"}.issubset(models)
+
+    # Generate using a LoRA adapter by specifying its name as the model
+    payload = {
+        "model": "Alice",
+        "token_ids": [1, 2, 3],
+        "sampling_params": {"max_tokens": 5},
+        "stream": False,
+    }
+    resp = await client.post(GEN_ENDPOINT, json=payload)
+    resp.raise_for_status()
+    data = resp.json()
+    assert "choices" in data
+
+    token_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        enable_thinking=False,  # default with Qwen3
+    )
+    payload = {
+        "model": "Alice",
+        "token_ids": token_ids,
+        "sampling_params": {
+            "max_tokens": 24,
+            "temperature": 0.0,
+            "detokenize": False,
+        },
+        "stream": False,
+    }
+    generate_resp = await client.post(GEN_ENDPOINT, json=payload)
+    generate_data = generate_resp.json()
+    generate_res = tokenizer.decode(
+        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
+    )
+
+    payload = {
+        "model": "Alice",
+        "messages": messages,
+        "max_tokens": 24,
+        "temperature": 0.0,
+        "stream": False,
+        "chat_template_kwargs": dict(enable_thinking=False),
+    }
+    completions_resp = await client.post("/v1/chat/completions", json=payload)
+    completions_data = completions_resp.json()
+    completions_res = completions_data["choices"][0]["message"]["content"]
+
+    assert generate_res == completions_res
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 29a94c852bba6fa40af2263f9069f3ce33a19c00..d75119cb7b43d5c3d84e9116ecb22facc4bb5828 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,40 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import signal
+import subprocess
+import sys
+import time
+
 import openai
 import pytest
 
-from ...utils import RemoteOpenAIServer
+from vllm.utils.network_utils import get_open_port
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 
 @pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
-    # dtype, max-len etc set so that this can run in CI
-    args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--max-num-seqs",
-        "128",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        async with remote_server.get_async_client() as client:
-
-            with pytest.raises(
-                (openai.APIConnectionError, openai.InternalServerError)):
-                # Asking for lots of prompt logprobs will currently crash the
-                # engine. This may change in the future when that bug is fixed
-                prompt = "Hello " * 4000
-                await client.completions.create(
-                    model=MODEL_NAME,
-                    prompt=prompt,
-                    extra_body={"prompt_logprobs": 10})
-
-            # Now the server should shut down
-            return_code = remote_server.proc.wait(timeout=8)
-            assert return_code is not None
+    """Verify that API returns connection error when server process is killed.
+
+    Starts a vLLM server, kills it to simulate a crash, then verifies that
+    subsequent API calls fail appropriately.
+    """
+
+    port = get_open_port()
+
+    proc = subprocess.Popen(
+        [
+            # dtype, max-len etc set so that this can run in CI
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.api_server",
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "128",
+            "--enforce-eager",
+            "--port",
+            str(port),
+            "--gpu-memory-utilization",
+            "0.05",
+            "--max-num-seqs",
+            "2",
+            "--disable-frontend-multiprocessing",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
+    )
+
+    # Wait for server startup
+    start_time = time.time()
+    client = openai.AsyncOpenAI(
+        base_url=f"http://localhost:{port}/v1",
+        api_key="dummy",
+        max_retries=0,
+        timeout=10,
+    )
+
+    # Poll until server is ready
+    while time.time() - start_time < 30:
+        try:
+            await client.completions.create(
+                model=MODEL_NAME, prompt="Hello", max_tokens=1
+            )
+            break
+        except Exception:
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                stdout, stderr = proc.communicate(timeout=1)
+                pytest.fail(
+                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
+                )
+    else:
+        proc.terminate()
+        proc.wait(timeout=5)
+        pytest.fail("Server failed to start in 30 seconds")
+
+    # Kill server to simulate crash
+    proc.terminate()
+    time.sleep(1)
+
+    # Verify API calls now fail
+    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
+        await client.completions.create(
+            model=MODEL_NAME, prompt="This should fail", max_tokens=1
+        )
+
+    return_code = proc.wait(timeout=5)
+    assert return_code is not None
diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py
index 0dd6af17ef22752dbb7a7145a1ee496e56311f8f..5f94ac6da2c25a87f96a60da4565ab8c835b53f3 100644
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import requests
+from prometheus_client.parser import text_string_to_metric_families
 
 from ...utils import RemoteOpenAIServer
 
@@ -20,32 +21,46 @@ def test_sleep_mode():
         "--enable-sleep-mode",
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={
-                                "VLLM_SERVER_DEV_MODE": "1",
-                                "CUDA_VISIBLE_DEVICES": "0"
-                            }) as remote_server:
-        response = requests.post(remote_server.url_for("sleep"),
-                                 params={"level": "1"})
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        args,
+        env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"},
+    ) as remote_server:
+        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
         assert response.status_code == 200
         response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is True
 
+        # check sleep metrics
+        response = requests.get(remote_server.url_for("metrics"))
+        assert response.status_code == 200
+        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+        assert awake == 0
+        assert weights_offloaded == 1
+        assert discard_all == 0
+
         response = requests.post(remote_server.url_for("wake_up"))
         assert response.status_code == 200
         response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is False
 
+        # check sleep metrics
+        response = requests.get(remote_server.url_for("metrics"))
+        assert response.status_code == 200
+        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+        assert awake == 1
+        assert weights_offloaded == 0
+        assert discard_all == 0
+
         # test wake up with tags
-        response = requests.post(remote_server.url_for("sleep"),
-                                 params={"level": "1"})
+        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
         assert response.status_code == 200
 
-        response = requests.post(remote_server.url_for("wake_up"),
-                                 params={"tags": ["weights"]})
+        response = requests.post(
+            remote_server.url_for("wake_up"), params={"tags": ["weights"]}
+        )
         assert response.status_code == 200
 
         # is sleeping should be false after waking up any part of the engine
@@ -53,10 +68,43 @@ def test_sleep_mode():
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is True
 
-        response = requests.post(remote_server.url_for("wake_up"),
-                                 params={"tags": ["kv_cache"]})
+        response = requests.post(
+            remote_server.url_for("wake_up"), params={"tags": ["kv_cache"]}
+        )
         assert response.status_code == 200
 
         response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is False
+
+        # check sleep metrics
+        response = requests.get(remote_server.url_for("metrics"))
+        assert response.status_code == 200
+        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+        assert awake == 1
+        assert weights_offloaded == 0
+        assert discard_all == 0
+
+
+def _get_sleep_metrics_from_api(response: requests.Response):
+    """Return (awake, weights_offloaded, discard_all)"""
+
+    awake, weights_offloaded, discard_all = None, None, None
+
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:engine_sleep_state":
+            for sample in family.samples:
+                if sample.name == "vllm:engine_sleep_state":
+                    for label_name, label_value in sample.labels.items():
+                        if label_value == "awake":
+                            awake = sample.value
+                        elif label_value == "weights_offloaded":
+                            weights_offloaded = sample.value
+                        elif label_value == "discard_all":
+                            discard_all = sample.value
+
+    assert awake is not None
+    assert weights_offloaded is not None
+    assert discard_all is not None
+
+    return awake, weights_offloaded, discard_all
diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
index 058e96f203c3806b3bfb05a2ddc19f59b5825c93..80b7cd9f4cbc9913951780528feb67869219e7c6 100644
--- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py
+++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -11,7 +11,10 @@ import torch.cuda
 
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.model_loader.tensorizer import (
-    TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model)
+    TensorizerConfig,
+    tensorize_lora_adapter,
+    tensorize_vllm_model,
+)
 
 from ...utils import RemoteOpenAIServer
 
@@ -29,21 +32,20 @@ def cleanup():
     _cleanup()
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def tmp_dir():
     with tempfile.TemporaryDirectory() as path:
         yield path
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def model_uri(tmp_dir):
     yield f"{tmp_dir}/model.tensors"
 
 
 @pytest.fixture(scope="module")
 def tensorize_model_and_lora(tmp_dir, model_uri):
-    tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri,
-                                         lora_dir=tmp_dir)
+    tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
     args = EngineArgs(model=MODEL_NAME)
 
     tensorize_lora_adapter(LORA_PATH, tensorizer_config)
@@ -66,8 +68,11 @@ def server(model_uri, tensorize_model_and_lora):
 
     ## Start OpenAI API server
     args = [
-        "--load-format", "tensorizer", "--served-model-name", MODEL_NAME,
-        "--enable-lora"
+        "--load-format",
+        "tensorizer",
+        "--served-model-name",
+        MODEL_NAME,
+        "--enable-lora",
     ]
 
     model_dir = os.path.dirname(model_uri)
@@ -85,10 +90,9 @@ async def client(server):
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     _cleanup()
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
+    completion = await client.completions.create(
+        model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+    )
 
     assert completion.id is not None
     assert completion.choices is not None and len(completion.choices) == 1
@@ -97,4 +101,5 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     assert len(completion.choices[0].text) >= 5
     assert completion.choices[0].finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+        completion_tokens=5, prompt_tokens=6, total_tokens=11
+    )
diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py
index ed003939c44bebed2e8ddc7f099b1c94a7905c9f..25eb5882be89cfbaba994af99aabb48d8749b54c 100644
--- a/tests/entrypoints/openai/test_token_in_token_out.py
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
@@ -6,8 +6,7 @@ import tempfile
 
 import pytest
 
-from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf)
+from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
@@ -23,7 +22,8 @@ def server():
         MODEL_NAME,
         allow_patterns=["*"],
         cache_dir=MODEL_PATH,
-        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
+        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"],
+    )
     args = [
         "--max-model-len",
         "2048",
@@ -61,13 +61,14 @@ async def test_token_in_token_out_and_logprobs(server):
         )
 
         # Verify all fields are present
-        assert (completion.choices[0].token_ids is not None
-                and 0 < len(completion.choices[0].token_ids) <= 20)
+        assert (
+            completion.choices[0].token_ids is not None
+            and 0 < len(completion.choices[0].token_ids) <= 20
+        )
         assert completion.choices[0].prompt_token_ids is not None
 
         # Decode prompt tokens
         if completion.choices[0].prompt_token_ids:
-            prompt_text = tokenizer.decode(
-                completion.choices[0].prompt_token_ids)
+            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
             # The decoded prompt should match or close to original prompt
             assert prompt_text == text
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index ecb7f50fa740023464b7f42d12eadb164f65aa3a..7fd32e1c7be1d6e183654be6c06c16e1e0c0fde7 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -53,19 +53,20 @@ async def test_tokenize_completions(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                              tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
 
     for add_special in [False, True]:
         prompt = "vllm1 This is a test prompt."
         tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
 
-        response = requests.post(server.url_for("tokenize"),
-                                 json={
-                                     "add_special_tokens": add_special,
-                                     "model": model_name,
-                                     "prompt": prompt
-                                 })
+        response = requests.post(
+            server.url_for("tokenize"),
+            json={
+                "add_special_tokens": add_special,
+                "model": model_name,
+                "prompt": prompt,
+            },
+        )
         response.raise_for_status()
 
         result = response.json()
@@ -86,48 +87,39 @@ async def test_tokenize_chat(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                              tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
-            conversation = [{
-                "role": "user",
-                "content": "Hi there!"
-            }, {
-                "role": "assistant",
-                "content": "Nice to meet you!"
-            }, {
-                "role": "user",
-                "content": "Can I ask a question? vllm1"
-            }]
+            conversation = [
+                {"role": "user", "content": "Hi there!"},
+                {"role": "assistant", "content": "Nice to meet you!"},
+                {"role": "user", "content": "Can I ask a question? vllm1"},
+            ]
             for continue_final in [False, True]:
                 if add_generation and continue_final:
                     continue
                 if continue_final:
-                    conversation.append({
-                        "role": "assistant",
-                        "content": "Sure,"
-                    })
+                    conversation.append({"role": "assistant", "content": "Sure,"})
 
                 prompt = tokenizer.apply_chat_template(
                     add_generation_prompt=add_generation,
                     continue_final_message=continue_final,
                     conversation=conversation,
-                    tokenize=False)
-                tokens = tokenizer.encode(prompt,
-                                          add_special_tokens=add_special)
-
-                response = requests.post(server.url_for("tokenize"),
-                                         json={
-                                             "add_generation_prompt":
-                                             add_generation,
-                                             "continue_final_message":
-                                             continue_final,
-                                             "add_special_tokens": add_special,
-                                             "messages": conversation,
-                                             "model": model_name
-                                         })
+                    tokenize=False,
+                )
+                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
+
+                response = requests.post(
+                    server.url_for("tokenize"),
+                    json={
+                        "add_generation_prompt": add_generation,
+                        "continue_final_message": continue_final,
+                        "add_special_tokens": add_special,
+                        "messages": conversation,
+                        "model": model_name,
+                    },
+                )
                 response.raise_for_status()
 
                 result = response.json()
@@ -148,41 +140,35 @@ async def test_tokenize_chat_with_tools(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                              tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
-            conversation = [{
-                "role":
-                "user",
-                "content":
-                "What's the weather like in Paris today?",
-            }]
-
-            tools = [{
-                "type": "function",
-                "function": {
-                    "name": "get_weather",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {
-                            "location": {
-                                "type": "string"
-                            }
+            conversation = [
+                {
+                    "role": "user",
+                    "content": "What's the weather like in Paris today?",
+                }
+            ]
+
+            tools = [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"location": {"type": "string"}},
                         },
                     },
-                },
-            }]
+                }
+            ]
 
             for continue_final in [False, True]:
                 if add_generation and continue_final:
                     continue
                 if continue_final:
-                    conversation.append({
-                        "role": "assistant",
-                        "content": "Sure,"
-                    })
+                    conversation.append({"role": "assistant", "content": "Sure,"})
 
                 prompt = tokenizer.apply_chat_template(
                     add_generation_prompt=add_generation,
@@ -191,8 +177,7 @@ async def test_tokenize_chat_with_tools(
                     tools=tools,
                     tokenize=False,
                 )
-                tokens = tokenizer.encode(prompt,
-                                          add_special_tokens=add_special)
+                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
 
                 response = requests.post(
                     server.url_for("tokenize"),
@@ -225,17 +210,12 @@ async def test_tokenize_with_return_token_strs(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                              tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
 
     prompt = "This is a token_strs test prompt! vllm1"
     response = requests.post(
         server.url_for("tokenize"),
-        json={
-            "prompt": prompt,
-            "model": model_name,
-            "return_token_strs": True
-        },
+        json={"prompt": prompt, "model": model_name, "return_token_strs": True},
     )
     response.raise_for_status()
 
@@ -260,17 +240,14 @@ async def test_detokenize(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                              tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
 
     prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)
 
-    response = requests.post(server.url_for("detokenize"),
-                             json={
-                                 "model": model_name,
-                                 "tokens": tokens
-                             })
+    response = requests.post(
+        server.url_for("detokenize"), json={"model": model_name, "tokens": tokens}
+    )
     response.raise_for_status()
 
     assert response.json() == {"prompt": prompt}
@@ -319,14 +296,15 @@ async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
     }
     for field, expected_type in field_types.items():
         if field in result and result[field] is not None:
-            assert isinstance(
-                result[field],
-                expected_type), (f"{field} should be {expected_type.__name__}")
+            assert isinstance(result[field], expected_type), (
+                f"{field} should be {expected_type.__name__}"
+            )
 
 
 @pytest.mark.asyncio
 async def test_tokenizer_info_added_tokens_structure(
-    server: RemoteOpenAIServer, ):
+    server: RemoteOpenAIServer,
+):
     """Test added_tokens_decoder structure if present."""
     response = requests.get(server.url_for("tokenizer_info"))
     response.raise_for_status()
@@ -337,25 +315,23 @@ async def test_tokenizer_info_added_tokens_structure(
             assert isinstance(token_id, str), "Token IDs should be strings"
             assert isinstance(token_info, dict), "Token info should be a dict"
             assert "content" in token_info, "Token info should have content"
-            assert "special" in token_info, (
-                "Token info should have special flag")
-            assert isinstance(token_info["special"],
-                              bool), ("Special flag should be boolean")
+            assert "special" in token_info, "Token info should have special flag"
+            assert isinstance(token_info["special"], bool), (
+                "Special flag should be boolean"
+            )
 
 
 @pytest.mark.asyncio
 async def test_tokenizer_info_consistency_with_tokenize(
-    server: RemoteOpenAIServer, ):
+    server: RemoteOpenAIServer,
+):
     """Test that tokenizer info is consistent with tokenization endpoint."""
     info_response = requests.get(server.url_for("tokenizer_info"))
     info_response.raise_for_status()
     info = info_response.json()
     tokenize_response = requests.post(
         server.url_for("tokenize"),
-        json={
-            "model": MODEL_NAME,
-            "prompt": "Hello world!"
-        },
+        json={"model": MODEL_NAME, "prompt": "Hello world!"},
     )
     tokenize_response.raise_for_status()
     tokenize_result = tokenize_response.json()
@@ -363,7 +339,8 @@ async def test_tokenizer_info_consistency_with_tokenize(
     tokenize_max_len = tokenize_result.get("max_model_len")
     if info_max_len and tokenize_max_len:
         assert info_max_len >= tokenize_max_len, (
-            "Info max length should be >= tokenize max length")
+            "Info max length should be >= tokenize max length"
+        )
 
 
 @pytest.mark.asyncio
@@ -374,6 +351,5 @@ async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
     result = response.json()
     chat_template = result.get("chat_template")
     if chat_template:
-        assert isinstance(chat_template,
-                          str), ("Chat template should be a string")
+        assert isinstance(chat_template, str), "Chat template should be a string"
         assert chat_template.strip(), "Chat template should not be empty"
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 23c99da97ad3acf5a9221c927f8bcbecf5e85b39..88580ed899f1a7aa08688e37c35a7bdc1ce9ae21 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -17,8 +17,12 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 SERVER_ARGS = ["--enforce-eager"]
 MISTRAL_FORMAT_ARGS = [
-    "--tokenizer_mode", "mistral", "--config_format", "mistral",
-    "--load_format", "mistral"
+    "--tokenizer_mode",
+    "mistral",
+    "--config_format",
+    "mistral",
+    "--load_format",
+    "mistral",
 ]
 
 
@@ -36,8 +40,8 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
+    "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]
+)
 async def test_basic_audio(mary_had_lamb, model_name):
     server_args = ["--enforce-eager"]
 
@@ -52,14 +56,50 @@ async def test_basic_audio(mary_had_lamb, model_name):
             file=mary_had_lamb,
             language="en",
             response_format="text",
-            temperature=0.0)
+            temperature=0.0,
+        )
         out = json.loads(transcription)
-        out_text = out['text']
-        out_usage = out['usage']
+        out_text = out["text"]
+        out_usage = out["usage"]
         assert "Mary had a little lamb," in out_text
         assert out_usage["seconds"] == 16, out_usage["seconds"]
 
 
+@pytest.mark.asyncio
+async def test_basic_audio_with_lora(mary_had_lamb):
+    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
+    model_name = "ibm-granite/granite-speech-3.3-2b"
+    lora_model_name = "speech"
+    server_args = [
+        "--enforce-eager",
+        "--enable-lora",
+        "--max-lora-rank",
+        "64",
+        "--lora-modules",
+        f"{lora_model_name}={model_name}",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "1",
+    ]
+
+    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=lora_model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            temperature=0.0,
+        )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    out_usage = out["usage"]
+    assert "mary had a little lamb" in out_text
+    assert out_usage["seconds"] == 16, out_usage["seconds"]
+
+
 @pytest.mark.asyncio
 async def test_basic_audio_gemma(foscolo):
     # Gemma accuracy on some of the audio samples we use is particularly bad,
@@ -67,15 +107,18 @@ async def test_basic_audio_gemma(foscolo):
     model_name = "google/gemma-3n-E2B-it"
     server_args = ["--enforce-eager"]
 
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, max_wait_seconds=480
+    ) as remote_server:
         client = remote_server.get_async_client()
         transcription = await client.audio.transcriptions.create(
             model=model_name,
             file=foscolo,
             language="it",
             response_format="text",
-            temperature=0.0)
-        out = json.loads(transcription)['text']
+            temperature=0.0,
+        )
+        out = json.loads(transcription)["text"]
         assert "da cui vergine nacque Venere" in out
 
 
@@ -85,24 +128,21 @@ async def test_non_asr_model(winning_call):
     model_name = "JackFram/llama-68m"
     with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
         client = remote_server.get_async_client()
-        res = await client.audio.transcriptions.create(model=model_name,
-                                                       file=winning_call,
-                                                       language="en",
-                                                       temperature=0.0)
+        res = await client.audio.transcriptions.create(
+            model=model_name, file=winning_call, language="en", temperature=0.0
+        )
         err = res.error
         assert err["code"] == 400 and not res.text
-        assert err[
-            "message"] == "The model does not support Transcriptions API"
+        assert err["message"] == "The model does not support Transcriptions API"
 
 
 @pytest.mark.asyncio
 async def test_bad_requests(mary_had_lamb, client):
     # invalid language
     with pytest.raises(openai.BadRequestError):
-        await client.audio.transcriptions.create(model=MODEL_NAME,
-                                                 file=mary_had_lamb,
-                                                 language="hh",
-                                                 temperature=0.0)
+        await client.audio.transcriptions.create(
+            model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
+        )
 
 
 @pytest.mark.asyncio
@@ -114,17 +154,18 @@ async def test_long_audio_request(mary_had_lamb, client):
     repeated_audio = np.tile(audio, 10)
     # Repeated audio to buffer
     buffer = io.BytesIO()
-    sf.write(buffer, repeated_audio, sr, format='WAV')
+    sf.write(buffer, repeated_audio, sr, format="WAV")
     buffer.seek(0)
     transcription = await client.audio.transcriptions.create(
         model=MODEL_NAME,
         file=buffer,
         language="en",
         response_format="text",
-        temperature=0.0)
+        temperature=0.0,
+    )
     out = json.loads(transcription)
-    out_text = out['text']
-    out_usage = out['usage']
+    out_text = out["text"]
+    out_usage = out["usage"]
     counts = out_text.count("Mary had a little lamb")
     assert counts == 10, counts
     assert out_usage["seconds"] == 161, out_usage["seconds"]
@@ -135,10 +176,8 @@ async def test_completion_endpoints(client):
     # text to text model
     res = await client.chat.completions.create(
         model=MODEL_NAME,
-        messages=[{
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }])
+        messages=[{"role": "system", "content": "You are a helpful assistant."}],
+    )
     err = res.error
     assert err["code"] == 400
     assert err["message"] == "The model does not support Chat Completions API"
@@ -157,16 +196,19 @@ async def test_streaming_response(winning_call, client):
         file=winning_call,
         response_format="json",
         language="en",
-        temperature=0.0)
-    res = await client.audio.transcriptions.create(model=MODEL_NAME,
-                                                   file=winning_call,
-                                                   language="en",
-                                                   temperature=0.0,
-                                                   stream=True,
-                                                   timeout=30)
+        temperature=0.0,
+    )
+    res = await client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=winning_call,
+        language="en",
+        temperature=0.0,
+        stream=True,
+        timeout=30,
+    )
     # Reconstruct from chunks and validate
     async for chunk in res:
-        text = chunk.choices[0]['delta']['content']
+        text = chunk.choices[0]["delta"]["content"]
         transcription += text
 
     assert transcription == res_no_stream.text
@@ -180,9 +222,9 @@ async def test_stream_options(winning_call, client):
         language="en",
         temperature=0.0,
         stream=True,
-        extra_body=dict(stream_include_usage=True,
-                        stream_continuous_usage_stats=True),
-        timeout=30)
+        extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
+        timeout=30,
+    )
     final = False
     continuous = True
     async for chunk in res:
@@ -190,7 +232,7 @@ async def test_stream_options(winning_call, client):
             # final usage sent
             final = True
         else:
-            continuous = continuous and hasattr(chunk, 'usage')
+            continuous = continuous and hasattr(chunk, "usage")
     assert final and continuous
 
 
@@ -198,27 +240,31 @@ async def test_stream_options(winning_call, client):
 async def test_sampling_params(mary_had_lamb, client):
     """
     Compare sampling with params and greedy sampling to assert results
-    are different when extreme sampling parameters values are picked. 
+    are different when extreme sampling parameters values are picked.
     """
     transcription = await client.audio.transcriptions.create(
         model=MODEL_NAME,
         file=mary_had_lamb,
         language="en",
         temperature=0.8,
-        extra_body=dict(seed=42,
-                        repetition_penalty=1.9,
-                        top_k=12,
-                        top_p=0.4,
-                        min_p=0.5,
-                        frequency_penalty=1.8,
-                        presence_penalty=2.0))
+        extra_body=dict(
+            seed=42,
+            repetition_penalty=1.9,
+            top_k=12,
+            top_p=0.4,
+            min_p=0.5,
+            frequency_penalty=1.8,
+            presence_penalty=2.0,
+        ),
+    )
 
     greedy_transcription = await client.audio.transcriptions.create(
         model=MODEL_NAME,
         file=mary_had_lamb,
         language="en",
         temperature=0.0,
-        extra_body=dict(seed=42))
+        extra_body=dict(seed=42),
+    )
 
     assert greedy_transcription.text != transcription.text
 
@@ -226,15 +272,16 @@ async def test_sampling_params(mary_had_lamb, client):
 @pytest.mark.asyncio
 async def test_audio_prompt(mary_had_lamb, client):
     prompt = "This is a speech, recorded in a phonograph."
-    #Prompts should not omit the part of original prompt while transcribing.
+    # Prompts should not omit the part of original prompt while transcribing.
     prefix = "The first words I spoke in the original phonograph"
     transcription = await client.audio.transcriptions.create(
         model=MODEL_NAME,
         file=mary_had_lamb,
         language="en",
         response_format="text",
-        temperature=0.0)
-    out = json.loads(transcription)['text']
+        temperature=0.0,
+    )
+    out = json.loads(transcription)["text"]
     assert prefix in out
     transcription_wprompt = await client.audio.transcriptions.create(
         model=MODEL_NAME,
@@ -242,6 +289,7 @@ async def test_audio_prompt(mary_had_lamb, client):
         language="en",
         response_format="text",
         prompt=prompt,
-        temperature=0.0)
-    out_prompt = json.loads(transcription_wprompt)['text']
+        temperature=0.0,
+    )
+    out_prompt = json.loads(transcription_wprompt)["text"]
     assert prefix in out_prompt
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index eb7879927b9b697381771d5611a1d98bf9f85136..d7d407484f16d98a76899551489a797a9f1924d8 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import io
+
 # imports for structured outputs tests
 import json
 
@@ -17,8 +18,9 @@ from ...utils import RemoteOpenAIServer
 SERVER_ARGS = ["--enforce-eager"]
 
 
-@pytest.fixture(scope="module",
-                params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
+@pytest.fixture(
+    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
+)
 def server(request):
     # Parametrize over model name
     with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
@@ -38,14 +40,48 @@ async def test_non_asr_model(foscolo):
     model_name = "JackFram/llama-68m"
     with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
         client = remote_server.get_async_client()
-        res = await client.audio.translations.create(model=model_name,
-                                                     file=foscolo,
-                                                     temperature=0.0)
+        res = await client.audio.translations.create(
+            model=model_name, file=foscolo, temperature=0.0
+        )
         err = res.error
         assert err["code"] == 400 and not res.text
         assert err["message"] == "The model does not support Translations API"
 
 
+@pytest.mark.asyncio
+async def test_basic_audio_with_lora(mary_had_lamb):
+    """Ensure STT (translate) requests can pass LoRA through to generate."""
+    # NOTE - careful to call this test before the module scoped server
+    # fixture, otherwise it'll OOMkill the CI
+    model_name = "ibm-granite/granite-speech-3.3-2b"
+    lora_model_name = "speech"
+    server_args = [
+        "--enforce-eager",
+        "--enable-lora",
+        "--max-lora-rank",
+        "64",
+        "--lora-modules",
+        f"{lora_model_name}={model_name}",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "1",
+    ]
+
+    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=lora_model_name,
+            file=mary_had_lamb,
+            extra_body=dict(language="en", to_language="es"),
+            response_format="text",
+            temperature=0.0,
+        )
+    out = json.loads(translation)["text"].strip().lower()
+    assert "pequeño" in out.split(" ")
+
+
 # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
 @pytest.mark.asyncio
 async def test_basic_audio(foscolo, client_and_model):
@@ -56,8 +92,9 @@ async def test_basic_audio(foscolo, client_and_model):
         response_format="text",
         # TODO remove `language="it"` once language detection is implemented
         extra_body=dict(language="it", to_language="en"),
-        temperature=0.0)
-    out = json.loads(translation)['text'].strip().lower()
+        temperature=0.0,
+    )
+    out = json.loads(translation)["text"].strip().lower()
     assert "greek sea" in out
 
 
@@ -72,8 +109,9 @@ async def test_audio_prompt(foscolo, client_and_model):
         prompt=prompt,
         extra_body=dict(language="it", to_language="en"),
         response_format="text",
-        temperature=0.0)
-    out = json.loads(transcription)['text']
+        temperature=0.0,
+    )
+    out = json.loads(transcription)["text"]
     assert "Nor will I ever touch the sacred" not in out
     assert prompt not in out
 
@@ -87,7 +125,8 @@ async def test_streaming_response(foscolo, client_and_model, server):
         file=foscolo,
         response_format="json",
         extra_body=dict(language="it", to_language="en", seed=42),
-        temperature=0.0)
+        temperature=0.0,
+    )
 
     # Stream via HTTPX since OpenAI translation client doesn't expose streaming
     server, model_name = server
@@ -104,16 +143,14 @@ async def test_streaming_response(foscolo, client_and_model, server):
     foscolo.seek(0)
     async with httpx.AsyncClient() as http_client:
         files = {"file": foscolo}
-        async with http_client.stream("POST",
-                                      url,
-                                      headers=headers,
-                                      data=data,
-                                      files=files) as response:
+        async with http_client.stream(
+            "POST", url, headers=headers, data=data, files=files
+        ) as response:
             async for line in response.aiter_lines():
                 if not line:
                     continue
                 if line.startswith("data: "):
-                    line = line[len("data: "):]
+                    line = line[len("data: ") :]
                 if line.strip() == "[DONE]":
                     break
                 chunk = json.loads(line)
@@ -124,9 +161,10 @@ async def test_streaming_response(foscolo, client_and_model, server):
     # NOTE There's a small non-deterministic issue here, likely in the attn
     # computation, which will cause a few tokens to be different, while still
     # being very close semantically.
-    assert sum([
-        x == y for x, y in zip(res_stream, res_no_stream.text.split())
-    ]) >= len(res_stream) * 0.9
+    assert (
+        sum([x == y for x, y in zip(res_stream, res_no_stream.text.split())])
+        >= len(res_stream) * 0.9
+    )
 
 
 @pytest.mark.asyncio
@@ -148,16 +186,14 @@ async def test_stream_options(foscolo, server):
     continuous = True
     async with httpx.AsyncClient() as http_client:
         files = {"file": foscolo}
-        async with http_client.stream("POST",
-                                      url,
-                                      headers=headers,
-                                      data=data,
-                                      files=files) as response:
+        async with http_client.stream(
+            "POST", url, headers=headers, data=data, files=files
+        ) as response:
             async for line in response.aiter_lines():
                 if not line:
                     continue
                 if line.startswith("data: "):
-                    line = line[len("data: "):]
+                    line = line[len("data: ") :]
                 if line.strip() == "[DONE]":
                     break
                 chunk = json.loads(line)
@@ -180,13 +216,14 @@ async def test_long_audio_request(foscolo, client_and_model):
     repeated_audio = np.tile(audio, 2)
     # Repeated audio to buffer
     buffer = io.BytesIO()
-    sf.write(buffer, repeated_audio, sr, format='WAV')
+    sf.write(buffer, repeated_audio, sr, format="WAV")
     buffer.seek(0)
     translation = await client.audio.translations.create(
         model=model_name,
         file=buffer,
         extra_body=dict(language="it", to_language="en"),
         response_format="text",
-        temperature=0.0)
-    out = json.loads(translation)['text'].strip().lower()
+        temperature=0.0,
+    )
+    out = json.loads(translation)["text"].strip().lower()
     assert out.count("greek sea") == 2
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index ad4dff00daaa40aa9e9054c13e5ec1ebcd07baa7..7ecdac518f97ffd624bb9223168f62d0b7d0a468 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -55,27 +55,34 @@ def base64_encoded_video() -> dict[str, str]:
     }
 
 
+def dummy_messages_from_video_url(
+    video_urls: str | list[str],
+    content_text: str = "What's in this video?",
+):
+    if isinstance(video_urls, str):
+        video_urls = [video_urls]
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "video_url", "video_url": {"url": video_url}}
+                    for video_url in video_urls
+                ),
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-async def test_single_chat_session_video(client: openai.AsyncOpenAI,
-                                         model_name: str, video_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "video_url",
-                "video_url": {
-                    "url": video_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+async def test_single_chat_session_video(
+    client: openai.AsyncOpenAI, model_name: str, video_url: str
+):
+    messages = dummy_messages_from_video_url(video_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -84,13 +91,15 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         temperature=0.0,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
+        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -112,54 +121,36 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI,
-                                               model_name: str,
-                                               video_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "video_url",
-                "video_url": video_url
-            },
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+async def test_error_on_invalid_video_url_type(
+    client: openai.AsyncOpenAI, model_name: str, video_url: str
+):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "video_url", "video_url": video_url},
+                {"type": "text", "text": "What's in this video?"},
+            ],
+        }
+    ]
 
     # video_url should be a dict {"url": "some url"}, not directly a string
     with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(model=model_name,
-                                                 messages=messages,
-                                                 max_completion_tokens=10,
-                                                 temperature=0.0)
+        _ = await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_completion_tokens=10,
+            temperature=0.0,
+        )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
-                                                    model_name: str,
-                                                    video_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "video_url",
-                "video_url": {
-                    "url": video_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+async def test_single_chat_session_video_beamsearch(
+    client: openai.AsyncOpenAI, model_name: str, video_url: str
+):
+    messages = dummy_messages_from_video_url(video_url)
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -168,36 +159,27 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(use_beam_search=True))
+        extra_body=dict(use_beam_search=True),
+    )
     assert len(chat_completion.choices) == 2
-    assert chat_completion.choices[
-        0].message.content != chat_completion.choices[1].message.content
+    assert (
+        chat_completion.choices[0].message.content
+        != chat_completion.choices[1].message.content
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 async def test_single_chat_session_video_base64encoded(
-        client: openai.AsyncOpenAI, model_name: str, video_url: str,
-        base64_encoded_video: dict[str, str]):
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "video_url",
-                "video_url": {
-                    "url":
-                    f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    video_url: str,
+    base64_encoded_video: dict[str, str],
+):
+    messages = dummy_messages_from_video_url(
+        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
+    )
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -206,13 +188,15 @@ async def test_single_chat_session_video_base64encoded(
         max_completion_tokens=10,
         logprobs=True,
         temperature=0.0,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
+        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -236,58 +220,36 @@ async def test_single_chat_session_video_base64encoded(
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 async def test_single_chat_session_video_base64encoded_beamsearch(
-        client: openai.AsyncOpenAI, model_name: str, video_url: str,
-        base64_encoded_video: dict[str, str]):
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "video_url",
-                "video_url": {
-                    "url":
-                    f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    video_url: str,
+    base64_encoded_video: dict[str, str],
+):
+    messages = dummy_messages_from_video_url(
+        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
+    )
+
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
         n=2,
         max_completion_tokens=10,
-        extra_body=dict(use_beam_search=True))
+        extra_body=dict(use_beam_search=True),
+    )
     assert len(chat_completion.choices) == 2
-    assert chat_completion.choices[
-        0].message.content != chat_completion.choices[1].message.content
+    assert (
+        chat_completion.choices[0].message.content
+        != chat_completion.choices[1].message.content
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-async def test_chat_streaming_video(client: openai.AsyncOpenAI,
-                                    model_name: str, video_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "video_url",
-                "video_url": {
-                    "url": video_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+async def test_chat_streaming_video(
+    client: openai.AsyncOpenAI, model_name: str, video_url: str
+):
+    messages = dummy_messages_from_video_url(video_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -327,27 +289,12 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
-    "video_urls",
-    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
-async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
-                                 video_urls: list[str]):
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "video_url",
-                "video_url": {
-                    "url": video_url
-                }
-            } for video_url in video_urls),
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-        ],
-    }]
+    "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
+)
+async def test_multi_video_input(
+    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
+):
+    messages = dummy_messages_from_video_url(video_urls)
 
     if len(video_urls) > MAXIMUM_VIDEOS:
         with pytest.raises(openai.BadRequestError):  # test multi-video input
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index a324e86666055eccb8952264bc9134181a1a35c9..2a7df08ea3b0efb017e8c4a7a21e4771bdf6f150 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -34,7 +34,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
     ],
     [
         "The image shows a Venn diagram with three over",
-        "This image shows a Venn diagram with three over",
+        "The image shows a colorful Venn diagram with",
     ],
     [
         "This image displays a gradient of colors ranging from",
@@ -71,26 +71,51 @@ async def client(server):
 @pytest.fixture(scope="session")
 def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_asset:
-        encode_image_base64(local_asset_server.get_image_asset(image_asset))
+        image_asset: encode_image_base64(
+            local_asset_server.get_image_asset(image_asset)
+        )
         for image_asset in TEST_IMAGE_ASSETS
     }
 
 
+def dummy_messages_from_image_url(
+    image_urls: str | list[str],
+    content_text: str = "What's in this image?",
+):
+    if isinstance(image_urls, str):
+        image_urls = [image_urls]
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "image_url", "image_url": {"url": image_url}}
+                    for image_url in image_urls
+                ),
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+
 def get_hf_prompt_tokens(model_name, content, image_url):
-    processor = AutoProcessor.from_pretrained(model_name,
-                                              trust_remote_code=True,
-                                              num_crops=4)
+    processor = AutoProcessor.from_pretrained(
+        model_name, trust_remote_code=True, num_crops=4
+    )
 
     placeholder = "<|image_1|>\n"
-    messages = [{
-        "role": "user",
-        "content": f"{placeholder}{content}",
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": f"{placeholder}{content}",
+        }
+    ]
     images = [fetch_image(image_url)]
 
     prompt = processor.tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True)
+        messages, tokenize=False, add_generation_prompt=True
+    )
     inputs = processor(prompt, images, return_tensors="pt")
 
     return inputs.input_ids.shape[1]
@@ -99,25 +124,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_single_chat_session_image(client: openai.AsyncOpenAI,
-                                         model_name: str, image_url: str):
+async def test_single_chat_session_image(
+    client: openai.AsyncOpenAI, model_name: str, image_url: str
+):
     content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
-            {
-                "type": "text",
-                "text": content_text
-            },
-        ],
-    }]
+    messages = dummy_messages_from_image_url(image_url, content_text)
 
     max_completion_tokens = 10
     # test single completion
@@ -127,17 +138,18 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
         max_completion_tokens=max_completion_tokens,
         logprobs=True,
         temperature=0.0,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
-    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text,
-                                            image_url)
+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
     assert chat_completion.usage == openai.types.CompletionUsage(
         completion_tokens=max_completion_tokens,
         prompt_tokens=hf_prompt_tokens,
-        total_tokens=hf_prompt_tokens + max_completion_tokens)
+        total_tokens=hf_prompt_tokens + max_completion_tokens,
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -159,55 +171,38 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
-                                               model_name: str,
-                                               image_url: str):
+async def test_error_on_invalid_image_url_type(
+    client: openai.AsyncOpenAI, model_name: str, image_url: str
+):
     content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": image_url
-            },
-            {
-                "type": "text",
-                "text": content_text
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": image_url},
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
 
     # image_url should be a dict {"url": "some url"}, not directly a string
     with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(model=model_name,
-                                                 messages=messages,
-                                                 max_completion_tokens=10,
-                                                 temperature=0.0)
+        _ = await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_completion_tokens=10,
+            temperature=0.0,
+        )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
-                                                    model_name: str,
-                                                    image_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+async def test_single_chat_session_image_beamsearch(
+    client: openai.AsyncOpenAI, model_name: str, image_url: str
+):
+    content_text = "What's in this image?"
+    messages = dummy_messages_from_image_url(image_url, content_text)
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -216,10 +211,13 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(use_beam_search=True))
+        extra_body=dict(use_beam_search=True),
+    )
     assert len(chat_completion.choices) == 2
-    assert chat_completion.choices[
-        0].message.content != chat_completion.choices[1].message.content
+    assert (
+        chat_completion.choices[0].message.content
+        != chat_completion.choices[1].message.content
+    )
 
 
 @pytest.mark.asyncio
@@ -227,27 +225,17 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image_base64encoded(
-        client: openai.AsyncOpenAI, model_name: str, raw_image_url: str,
-        image_url: str, base64_encoded_image: dict[str, str]):
-
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    raw_image_url: str,
+    image_url: str,
+    base64_encoded_image: dict[str, str],
+):
     content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url":
-                    f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
-                }
-            },
-            {
-                "type": "text",
-                "text": content_text
-            },
-        ],
-    }]
+    messages = dummy_messages_from_image_url(
+        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
+        content_text,
+    )
 
     max_completion_tokens = 10
     # test single completion
@@ -257,17 +245,18 @@ async def test_single_chat_session_image_base64encoded(
         max_completion_tokens=max_completion_tokens,
         logprobs=True,
         temperature=0.0,
-        top_logprobs=5)
+        top_logprobs=5,
+    )
     assert len(chat_completion.choices) == 1
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
-    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text,
-                                            image_url)
+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
     assert chat_completion.usage == openai.types.CompletionUsage(
         completion_tokens=max_completion_tokens,
         prompt_tokens=hf_prompt_tokens,
-        total_tokens=hf_prompt_tokens + max_completion_tokens)
+        total_tokens=hf_prompt_tokens + max_completion_tokens,
+    )
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -291,36 +280,27 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
 async def test_single_chat_session_image_base64encoded_beamsearch(
-        client: openai.AsyncOpenAI, model_name: str, image_idx: int,
-        base64_encoded_image: dict[str, str]):
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_idx: int,
+    base64_encoded_image: dict[str, str],
+):
     # NOTE: This test also validates that we pass MM data through beam search
     raw_image_url = TEST_IMAGE_ASSETS[image_idx]
     expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
 
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url":
-                    f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+    messages = dummy_messages_from_image_url(
+        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
+    )
+
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
         n=2,
         max_completion_tokens=10,
         temperature=0.0,
-        extra_body=dict(use_beam_search=True))
+        extra_body=dict(use_beam_search=True),
+    )
     assert len(chat_completion.choices) == 2
     for actual, expected_str in zip(chat_completion.choices, expected_res):
         assert actual.message.content == expected_str
@@ -329,24 +309,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_chat_streaming_image(client: openai.AsyncOpenAI,
-                                    model_name: str, image_url: str):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+async def test_chat_streaming_image(
+    client: openai.AsyncOpenAI, model_name: str, image_url: str
+):
+    messages = dummy_messages_from_image_url(image_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -388,26 +354,12 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize(
     "image_urls",
     [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
-    indirect=True)
-async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 image_urls: list[str]):
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+    indirect=True,
+)
+async def test_multi_image_input(
+    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
+):
+    messages = dummy_messages_from_image_url(image_urls)
 
     if len(image_urls) > MAXIMUM_IMAGES:
         with pytest.raises(openai.BadRequestError):  # test multi-image input
@@ -443,7 +395,8 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
 @pytest.mark.parametrize(
     "image_urls",
     [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
-    indirect=True)
+    indirect=True,
+)
 async def test_completions_with_image(
     client: openai.AsyncOpenAI,
     model_name: str,
@@ -452,13 +405,9 @@ async def test_completions_with_image(
     for image_url in image_urls:
         chat_completion = await client.chat.completions.create(
             messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role":
-                    "user",
+                    "role": "user",
                     "content": [
                         {
                             "type": "text",
@@ -468,7 +417,7 @@ async def test_completions_with_image(
                             "type": "image_url",
                             "image_url": {
                                 "url": image_url,
-                            }
+                            },
                         },
                     ],
                 },
@@ -485,7 +434,8 @@ async def test_completions_with_image(
 @pytest.mark.parametrize(
     "image_urls",
     [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
-    indirect=True)
+    indirect=True,
+)
 async def test_completions_with_image_with_uuid(
     client: openai.AsyncOpenAI,
     model_name: str,
@@ -494,13 +444,9 @@ async def test_completions_with_image_with_uuid(
     for image_url in image_urls:
         chat_completion = await client.chat.completions.create(
             messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role":
-                    "user",
+                    "role": "user",
                     "content": [
                         {
                             "type": "text",
@@ -511,7 +457,7 @@ async def test_completions_with_image_with_uuid(
                             "image_url": {
                                 "url": image_url,
                             },
-                            "uuid": image_url
+                            "uuid": image_url,
                         },
                     ],
                 },
@@ -525,34 +471,25 @@ async def test_completions_with_image_with_uuid(
         # Second request, with empty image but the same uuid.
         chat_completion_with_empty_image = await client.chat.completions.create(
             messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role":
-                    "user",
+                    "role": "user",
                     "content": [
                         {
                             "type": "text",
                             "text": "Describe this image.",
                         },
-                        {
-                            "type": "image_url",
-                            "image_url": {},
-                            "uuid": image_url
-                        },
+                        {"type": "image_url", "image_url": {}, "uuid": image_url},
                     ],
                 },
             ],
             model=model_name,
         )
-        assert chat_completion_with_empty_image.choices[
-            0].message.content is not None
+        assert chat_completion_with_empty_image.choices[0].message.content is not None
         assert isinstance(
-            chat_completion_with_empty_image.choices[0].message.content, str)
-        assert len(
-            chat_completion_with_empty_image.choices[0].message.content) > 0
+            chat_completion_with_empty_image.choices[0].message.content, str
+        )
+        assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
 
 
 @pytest.mark.asyncio
@@ -564,13 +501,9 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit(
     with pytest.raises(openai.BadRequestError):
         _ = await client.chat.completions.create(
             messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role":
-                    "user",
+                    "role": "user",
                     "content": [
                         {
                             "type": "text",
@@ -579,7 +512,7 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit(
                         {
                             "type": "image_url",
                             "image_url": {},
-                            "uuid": "uuid_not_previously_seen"
+                            "uuid": "uuid_not_previously_seen",
                         },
                     ],
                 },
@@ -593,7 +526,8 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit(
 @pytest.mark.parametrize(
     "image_urls",
     [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
-    indirect=True)
+    indirect=True,
+)
 async def test_completions_with_image_with_incorrect_uuid_format(
     client: openai.AsyncOpenAI,
     model_name: str,
@@ -602,13 +536,9 @@ async def test_completions_with_image_with_incorrect_uuid_format(
     for image_url in image_urls:
         chat_completion = await client.chat.completions.create(
             messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
-                {
-                    "role":
-                    "user",
+                    "role": "user",
                     "content": [
                         {
                             "type": "text",
diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_vision_embeds.py
similarity index 51%
rename from tests/entrypoints/openai/test_skip_tokenizer.py
rename to tests/entrypoints/openai/test_vision_embeds.py
index b469fc76fc7a28a958277e5b288ee24db106366a..a6593c5b05e2edef462724ef1c98e240c1731120 100644
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_vision_embeds.py
@@ -15,31 +15,7 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"
 
 
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--runner",
-        "pooling",
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        DTYPE,
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--skip-tokenizer-init",
-        "--max-num-seqs",
-        "32",
-        "--model-impl",
-        "terratorch"
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_request(server: RemoteOpenAIServer, model_name: str):
-
+def _terratorch_dummy_inputs(model_name: str):
     pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
     location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
 
@@ -47,41 +23,61 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
     torch.save(pixel_values, buffer_tiff)
     buffer_tiff.seek(0)
     binary_data = buffer_tiff.read()
-    base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')
+    base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")
 
     buffer_coord = io.BytesIO()
     torch.save(location_coords, buffer_coord)
     buffer_coord.seek(0)
     binary_data = buffer_coord.read()
-    base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')
-
-    prompt = {
-        "model":
-        model_name,
-        "additional_data": {
-            "prompt_token_ids": [1]
-        },
-        "encoding_format":
-        "base64",
-        "messages": [{
-            "role":
-            "user",
-            "content": [{
-                "type": "image_embeds",
-                "image_embeds": {
-                    "pixel_values": base64_tensor_embedding,
-                    "location_coords": base64_coord_embedding,
-                },
-            }],
-        }]
+    base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
+
+    return {
+        "model": model_name,
+        "additional_data": {"prompt_token_ids": [1]},
+        "encoding_format": "base64",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": {
+                            "pixel_values": base64_tensor_embedding,
+                            "location_coords": base64_coord_embedding,
+                        },
+                    }
+                ],
+            }
+        ],
     }
 
-    # test single pooling
-    response = requests.post(server.url_for("pooling"), json=prompt)
-    response.raise_for_status()
 
-    output = response.json()["data"][0]['data']
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(model_name: str):
+    args = [
+        "--runner",
+        "pooling",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--max-num-seqs",
+        "32",
+        "--model-impl",
+        "terratorch",
+        "--skip-tokenizer-init",
+        "--enable-mm-embeds",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as server:
+        prompt = _terratorch_dummy_inputs(model_name)
+
+        # test single pooling
+        response = requests.post(server.url_for("pooling"), json=prompt)
+        response.raise_for_status()
 
-    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+        output = response.json()["data"][0]["data"]
 
-    assert len(np_response) == 524288
+        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+        assert len(np_response) == 524288
diff --git a/tests/entrypoints/openai/tool_parsers/conftest.py b/tests/entrypoints/openai/tool_parsers/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2ac5e5b9a8fa5cbe263a05bd1ae98d2d7e439b4
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/conftest.py
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+
+@pytest.fixture(scope="function")
+def default_tokenizer() -> AnyTokenizer:
+    return AutoTokenizer.from_pretrained("gpt2")
diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
index 1da06be2eba92f7a13d3d63d705956cd15e605e1..38008dafe32b20ffa4e7eddf142addf6d1d88bcc 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
@@ -6,8 +6,7 @@ import json
 import pytest
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
-from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
-    Hermes2ProToolParser)
+from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from ....utils import RemoteOpenAIServer
@@ -27,61 +26,64 @@ SERVER_ARGS = [
     f"{LORA_MODEL}",
 ]
 
-TOOLS = [{
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "location": {
-                    "type": "string",
-                    "description":
-                    "The city and state, e.g. San Francisco, CA",
-                },
-                "unit": {
-                    "type": "string",
-                    "enum": ["celsius", "fahrenheit"],
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
                 },
+                "required": ["location"],
             },
-            "required": ["location"],
         },
-    },
-}]
-
-PRODUCT_TOOLS = [{
-    "type": "function",
-    "function": {
-        "name": "get_product_info",
-        "description": "Get detailed information of a product based on its "
-        "product ID.",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "inserted": {
-                    "type": "boolean",
-                    "description": "inserted.",
-                },
-                "product_id": {
-                    "type": "integer",
-                    "description": "The product ID of the product.",
+    }
+]
+
+PRODUCT_TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_product_info",
+            "description": "Get detailed information of a product based on its "
+            "product ID.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "inserted": {
+                        "type": "boolean",
+                        "description": "inserted.",
+                    },
+                    "product_id": {
+                        "type": "integer",
+                        "description": "The product ID of the product.",
+                    },
                 },
+                "required": ["product_id", "inserted"],
             },
-            "required": ["product_id", "inserted"],
         },
-    },
-}]
+    }
+]
 
 MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
 
-PRODUCT_MESSAGES = [{
-    "role":
-    "user",
-    "content":
-    "Hi! Do you have any detailed information about the product id "
-    "7355608 and inserted true?",
-}]
+PRODUCT_MESSAGES = [
+    {
+        "role": "user",
+        "content": "Hi! Do you have any detailed information about the product id "
+        "7355608 and inserted true?",
+    }
+]
 
 
 @pytest.mark.asyncio
@@ -150,7 +152,8 @@ async def test_streaming_tool_call():
                     tool_call_chunks[index]["name"] += tool_chunk.function.name
                 if tool_chunk.function.arguments:
                     tool_call_chunks[index]["arguments"] += (
-                        tool_chunk.function.arguments)
+                        tool_chunk.function.arguments
+                    )
 
         assert len(tool_call_chunks) == 1
         reconstructed_tool_call = tool_call_chunks[0]
@@ -240,7 +243,8 @@ async def test_streaming_product_tool_call():
                     tool_call_chunks[index]["name"] += tool_chunk.function.name
                 if tool_chunk.function.arguments:
                     tool_call_chunks[index]["arguments"] += (
-                        tool_chunk.function.arguments)
+                        tool_chunk.function.arguments
+                    )
 
         assert len(tool_call_chunks) == 1
         reconstructed_tool_call = tool_call_chunks[0]
@@ -291,9 +295,7 @@ def test_hermes_parser_streaming_just_forward_text(
     hermes_parser: Hermes2ProToolParser,
     any_chat_request: ChatCompletionRequest,
 ) -> None:
-    text = (
-        """This is some prior text that has nothing to do with tool calling."""
-    )
+    text = """This is some prior text that has nothing to do with tool calling."""
     tokens = qwen_tokenizer.encode(text)
     previous_text = ""
     delta_messages = []
@@ -348,8 +350,9 @@ def test_hermes_parser_streaming_failure_case_bug_19056(
             delta_messages.append(delta)
 
     assert delta_messages[0].tool_calls[0].function.name == "final_answer"
-    tool_call_args = "".join(delta.tool_calls[0].function.arguments or ""
-                             for delta in delta_messages)
+    tool_call_args = "".join(
+        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
+    )
     assert tool_call_args == '{"trigger": true}'
 
 
@@ -383,13 +386,13 @@ def test_hermes_parser_streaming(
         if delta is not None:
             delta_messages.append(delta)
     print(delta_messages)
-    assert (delta_messages[0].tool_calls[0].function.name ==
-            "get_current_temperature")
-    tool_call_args = "".join(delta.tool_calls[0].function.arguments or ""
-                             for delta in delta_messages)
+    assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature"
+    tool_call_args = "".join(
+        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
+    )
     assert tool_call_args == (
-        '{"location":"San Francisco, California, United States", '
-        '"unit": "celsius"}')
+        '{"location":"San Francisco, California, United States", "unit": "celsius"}'
+    )
 
 
 def test_hermes_parser_non_streaming_no_tool_call(
diff --git a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
index bd8e06513e13e7cdd159f2a4a8a3789261792d20..bdd5344652c4b5516becc48786188e6adfdebfd5 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
@@ -8,15 +8,18 @@ from unittest.mock import MagicMock
 import pytest
 
 from tests.entrypoints.openai.tool_parsers.utils import (
-    run_tool_extraction, run_tool_extraction_streaming)
+    run_tool_extraction,
+    run_tool_extraction_streaming,
+)
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 
 
 def make_tool_call(name, arguments):
-    return ToolCall(type="function",
-                    function=FunctionCall(name=name,
-                                          arguments=json.dumps(arguments)))
+    return ToolCall(
+        type="function",
+        function=FunctionCall(name=name, arguments=json.dumps(arguments)),
+    )
 
 
 # TODO: add reason prefix and suffix.
@@ -29,70 +32,68 @@ def make_tool_call(name, arguments):
         ("How can I help you today?", [], "How can I help you today?"),
         # Single tool call, no content
         (
-            "<tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"San Francisco\", \"metric\": \"celsius\"}}]</tool_calls>",  #noqa: E501
+            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}]</tool_calls>',  # noqa: E501
             [
-                make_tool_call("get_weather", {
-                    "city": "San Francisco",
-                    "metric": "celsius"
-                })
+                make_tool_call(
+                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
+                )
             ],
-            None),
+            None,
+        ),
         # Multiple tool calls
         (
-            "<tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"San Francisco\", \"metric\": \"celsius\"}}, {\"name\": \"register_user\", \"arguments\": {\"name\": \"John Doe\", \"age\": 37, \"address\": {\"city\": \"San Francisco\", \"state\": \"CA\"}, \"role\": null, \"passed_test\": true, \"aliases\": [\"John\", \"Johnny\"]}}]</tool_calls>",  #noqa: E501
+            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}, {"name": "register_user", "arguments": {"name": "John Doe", "age": 37, "address": {"city": "San Francisco", "state": "CA"}, "role": null, "passed_test": true, "aliases": ["John", "Johnny"]}}]</tool_calls>',  # noqa: E501
             [
-                make_tool_call("get_weather", {
-                    "city": "San Francisco",
-                    "metric": "celsius"
-                }),
                 make_tool_call(
-                    "register_user", {
+                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
+                ),
+                make_tool_call(
+                    "register_user",
+                    {
                         "name": "John Doe",
                         "age": 37,
-                        "address": {
-                            "city": "San Francisco",
-                            "state": "CA"
-                        },
+                        "address": {"city": "San Francisco", "state": "CA"},
                         "role": None,
                         "passed_test": True,
-                        "aliases": ["John", "Johnny"]
-                    })
+                        "aliases": ["John", "Johnny"],
+                    },
+                ),
             ],
-            None),
+            None,
+        ),
         # Content before tool call
         (
-            "I will call the tool now. <tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Boston\"}}]</tool_calls>",  #noqa: E501
+            'I will call the tool now. <tool_calls>[{"name": "get_weather", "arguments": {"city": "Boston"}}]</tool_calls>',  # noqa: E501
             [make_tool_call("get_weather", {"city": "Boston"})],
-            "I will call the tool now. "),
+            "I will call the tool now. ",
+        ),
         # Content after tool call (should be stripped)
         (
-            "<tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Seattle\"}}]</tool_calls>\nThank you!",  #noqa: E501
+            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "Seattle"}}]</tool_calls>\nThank you!',  # noqa: E501
             [make_tool_call("get_weather", {"city": "Seattle"})],
-            None),
+            None,
+        ),
         (
-            "<tool_calls>[{\"name\": \"complex_tool\", \"arguments\": {\"level1\": {\"level2\": {\"level3\": {\"value\": 123}}}}}]</tool_calls>",
+            '<tool_calls>[{"name": "complex_tool", "arguments": {"level1": {"level2": {"level3": {"value": 123}}}}}]</tool_calls>',
             [
                 make_tool_call(
-                    "complex_tool",
-                    {"level1": {
-                        "level2": {
-                            "level3": {
-                                "value": 123
-                            }
-                        }
-                    }})
+                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
+                )
             ],
             None,
         ),
-    ])
-def test_hunyuan_a13b_tool_parser_extract(model_output, expected_tool_calls,
-                                          expected_content):
+    ],
+)
+def test_hunyuan_a13b_tool_parser_extract(
+    model_output, expected_tool_calls, expected_content
+):
     mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "hunyuan_a13b")(mock_tokenizer)
-    content, tool_calls = run_tool_extraction(tool_parser,
-                                              model_output,
-                                              streaming=False)
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
+        mock_tokenizer
+    )
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=False
+    )
 
     # align the random id.
     for idx in range(len(tool_calls)):
@@ -102,49 +103,74 @@ def test_hunyuan_a13b_tool_parser_extract(model_output, expected_tool_calls,
 
 
 # Streaming test: simulate incremental output
-@pytest.mark.parametrize("model_deltas,expected_tool_calls", [
-    ([
-        "<tool_calls>[{\"name\": \"get_weather\", ",
-        "\"arguments\": {\"city\": \"San Francisco\", ",
-        "\"metric\": \"celsius\"}}]", "</tool_calls>"
-    ], [
-        make_tool_call("get_weather", {
-            "city": "San Francisco",
-            "metric": "celsius"
-        })
-    ]),
-    ([
-        "<tool_calls>[{\"name\":", " \"get_weather\",", " \"arguments\":",
-        " {\"city\": \"Boston\"}", "}]", "</tool_calls>"
-    ], [make_tool_call("get_weather", {"city": "Boston"})]),
-    ([
-        "", "<tool_calls>[{\"name\":", " \"get_weather\",", " \"arguments\":",
-        " {\"city\": \"Boston\"}", "}]", "</tool_calls>", "\n</answer>"
-    ], [make_tool_call("get_weather", {"city": "Boston"})]),
-    pytest.param([
-        "<tool_calls>[{\"name\": \"complex_tool\",", " \"arguments\": ",
-        " {\"level1\": {\"level2\": ", "{\"level3\": {\"value\": 123}}}}}",
-        "]</tool_calls>"
-    ], [
-        make_tool_call("complex_tool",
-                       {"level1": {
-                           "level2": {
-                               "level3": {
-                                   "value": 123
-                               }
-                           }
-                       }})
+@pytest.mark.parametrize(
+    "model_deltas,expected_tool_calls",
+    [
+        (
+            [
+                '<tool_calls>[{"name": "get_weather", ',
+                '"arguments": {"city": "San Francisco", ',
+                '"metric": "celsius"}}]',
+                "</tool_calls>",
+            ],
+            [
+                make_tool_call(
+                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
+                )
+            ],
+        ),
+        (
+            [
+                '<tool_calls>[{"name":',
+                ' "get_weather",',
+                ' "arguments":',
+                ' {"city": "Boston"}',
+                "}]",
+                "</tool_calls>",
+            ],
+            [make_tool_call("get_weather", {"city": "Boston"})],
+        ),
+        (
+            [
+                "",
+                '<tool_calls>[{"name":',
+                ' "get_weather",',
+                ' "arguments":',
+                ' {"city": "Boston"}',
+                "}]",
+                "</tool_calls>",
+                "\n</answer>",
+            ],
+            [make_tool_call("get_weather", {"city": "Boston"})],
+        ),
+        pytest.param(
+            [
+                '<tool_calls>[{"name": "complex_tool",',
+                ' "arguments": ',
+                ' {"level1": {"level2": ',
+                '{"level3": {"value": 123}}}}}',
+                "]</tool_calls>",
+            ],
+            [
+                make_tool_call(
+                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
+                )
+            ],
+            marks=pytest.mark.xfail(
+                reason="stream parsing not support nested json yet."
+            ),
+        ),
     ],
-                 marks=pytest.mark.xfail(
-                     reason="stream parsing not support nested json yet.")),
-])
+)
 def test_hunyuan_a13b_tool_parser_streaming(model_deltas, expected_tool_calls):
     mock_tokenizer = MagicMock()
 
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "hunyuan_a13b")(mock_tokenizer)
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
+        mock_tokenizer
+    )
     reconstructor = run_tool_extraction_streaming(
-        tool_parser, model_deltas, assert_one_tool_per_delta=False)
+        tool_parser, model_deltas, assert_one_tool_per_delta=False
+    )
 
     # align the random id.
     for idx in range(len(reconstructor.tool_calls)):
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
index 09726c7e3e5b52c6064cf20fcef716cdb0d70f07..2b68a653f46008197690951680a69f5e158ef92e 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
@@ -2,24 +2,23 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from transformers import AutoTokenizer
 
 from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
-from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import (
-    Llama3JsonToolParser)
+from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import Llama3JsonToolParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 
 @pytest.fixture
-def parser():
-    # Use a small tokenizer for testing
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    return Llama3JsonToolParser(tokenizer)
+def parser(default_tokenizer: AnyTokenizer):
+    return Llama3JsonToolParser(default_tokenizer)
 
 
 def test_extract_tool_calls_simple(parser):
     # Test with a simple tool call
-    model_output = ('Here is the result: {"name": "getOpenIncidentsTool", '
-                    '"parameters": {}} Would you like to know more?')
+    model_output = (
+        'Here is the result: {"name": "getOpenIncidentsTool", '
+        '"parameters": {}} Would you like to know more?'
+    )
     result = parser.extract_tool_calls(model_output, None)
 
     assert isinstance(result, ExtractedToolCallInformation)
@@ -34,8 +33,8 @@ def test_extract_tool_calls_simple(parser):
 def test_extract_tool_calls_with_arguments(parser):
     # Test with a tool call that has arguments
     model_output = (
-        '{"name": "searchTool", "parameters": {"query": "test query", '
-        '"limit": 10}}')
+        '{"name": "searchTool", "parameters": {"query": "test query", "limit": 10}}'
+    )
     result = parser.extract_tool_calls(model_output, None)
 
     assert result.tools_called is True
@@ -81,7 +80,8 @@ def test_extract_tool_calls_multiple_json(parser):
     model_output = (
         '{"name": "searchTool", "parameters": {"query": "test1"}}; '
         '{"name": "getOpenIncidentsTool", "parameters": {}}; '
-        '{"name": "searchTool", "parameters": {"query": "test2"}}')
+        '{"name": "searchTool", "parameters": {"query": "test2"}}'
+    )
     result = parser.extract_tool_calls(model_output, None)
 
     assert result.tools_called is True
@@ -105,7 +105,8 @@ def test_extract_tool_calls_multiple_json_with_whitespace(parser):
     model_output = (
         '{"name": "searchTool", "parameters": {"query": "test1"}} ; '
         '{"name": "getOpenIncidentsTool", "parameters": {}} ; '
-        '{"name": "searchTool", "parameters": {"query": "test2"}}')
+        '{"name": "searchTool", "parameters": {"query": "test2"}}'
+    )
     result = parser.extract_tool_calls(model_output, None)
 
     assert result.tools_called is True
@@ -118,11 +119,12 @@ def test_extract_tool_calls_multiple_json_with_whitespace(parser):
 def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
     # Test with multiple JSONs and surrounding text
     model_output = (
-        'Here are the results: '
+        "Here are the results: "
         '{"name": "searchTool", "parameters": {"query": "test1"}}; '
         '{"name": "getOpenIncidentsTool", "parameters": {}}; '
         '{"name": "searchTool", "parameters": {"query": "test2"}} '
-        'Would you like to know more?')
+        "Would you like to know more?"
+    )
     result = parser.extract_tool_calls(model_output, None)
 
     assert result.tools_called is True
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index 8c86b4889e15b20aa38c3937060229c77567ce0c..d297432eab64451b92f4fc8aab53cac2050020e5 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -6,9 +6,12 @@ from unittest.mock import MagicMock, patch
 import pytest
 
 from tests.entrypoints.openai.tool_parsers.utils import (
-    run_tool_extraction, run_tool_extraction_streaming)
+    run_tool_extraction,
+    run_tool_extraction_streaming,
+)
 from vllm.entrypoints.openai.protocol import FunctionCall
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 # Test cases similar to pythonic parser but with Llama4 specific format
 SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
@@ -16,12 +19,14 @@ SIMPLE_FUNCTION_CALL = FunctionCall(
     name="get_weather",
     arguments='{"city": "LA", "metric": "C"}',
 )
-MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', "
-                              "age=9, "
-                              "address={'city': 'LA', 'state': 'CA'}, "
-                              "role=None, "
-                              "passed_test=True, "
-                              "aliases=['John', 'Johnny'])]")
+MORE_TYPES_FUNCTION_OUTPUT = (
+    "[register_user(name='Doe', "
+    "age=9, "
+    "address={'city': 'LA', 'state': 'CA'}, "
+    "role=None, "
+    "passed_test=True, "
+    "aliases=['John', 'Johnny'])]"
+)
 MORE_TYPES_FUNCTION_CALL = FunctionCall(
     name="register_user",
     arguments='{"name": "Doe", '
@@ -34,7 +39,7 @@ MORE_TYPES_FUNCTION_CALL = FunctionCall(
 PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
 PARAMETERLESS_FUNCTION_CALL = FunctionCall(
     name="get_weather",
-    arguments='{}',
+    arguments="{}",
 )
 EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
 EMPTY_DICT_FUNCTION_CALL = FunctionCall(
@@ -47,25 +52,27 @@ EMPTY_LIST_FUNCTION_CALL = FunctionCall(
     arguments='{"steps": []}',
 )
 ESCAPED_STRING_FUNCTION_OUTPUT = (
-    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]")
+    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]"
+)
 ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
     name="get_weather",
     arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
 )
 PYTHON_TAG_FUNCTION_OUTPUT = (
-    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>")
+    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>"
+)
 
 
 @pytest.mark.parametrize("streaming", [True, False])
-def test_no_tool_call(streaming: bool):
-    mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "llama4_pythonic")(mock_tokenizer)
+def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
+        default_tokenizer
+    )
     model_output = "How can I help you today?"
 
-    content, tool_calls = run_tool_extraction(tool_parser,
-                                              model_output,
-                                              streaming=streaming)
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
 
     assert content == model_output
     assert len(tool_calls) == 0
@@ -75,98 +82,141 @@ test_str = "<|python_start|>"
 test_str += "[get_weather(city='LA', metric='C'),"
 test_str += "register_user(name='Doe', age=9)]"
 TEST_CASES = [
-    pytest.param(True,
-                 ESCAPED_STRING_FUNCTION_OUTPUT,
-                 [ESCAPED_STRING_FUNCTION_CALL],
-                 id="simple_streaming"),
-    pytest.param(False,
-                 SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
-                 id="simple_nonstreaming"),
-    pytest.param(True,
-                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
-                 id="more_types_streaming"),
-    pytest.param(False,
-                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
-                 id="more_types_nonstreaming"),
-    pytest.param(True,
-                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
-                 id="parameterless_streaming"),
-    pytest.param(False,
-                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
-                 id="parameterless_nonstreaming"),
-    pytest.param(True,
-                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
-                 id="empty_dict_streaming"),
-    pytest.param(False,
-                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
-                 id="empty_dict_nonstreaming"),
-    pytest.param(True,
-                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
-                 id="empty_list_streaming"),
-    pytest.param(False,
-                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
-                 id="empty_list_nonstreaming"),
-    pytest.param(True,
-                 ESCAPED_STRING_FUNCTION_OUTPUT,
-                 [ESCAPED_STRING_FUNCTION_CALL],
-                 id="escaped_string_streaming"),
-    pytest.param(False,
-                 ESCAPED_STRING_FUNCTION_OUTPUT,
-                 [ESCAPED_STRING_FUNCTION_CALL],
-                 id="escaped_string_nonstreaming"),
+    pytest.param(
+        True,
+        ESCAPED_STRING_FUNCTION_OUTPUT,
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False, SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], id="simple_nonstreaming"
+    ),
+    pytest.param(
+        True,
+        MORE_TYPES_FUNCTION_OUTPUT,
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_streaming",
+    ),
+    pytest.param(
+        False,
+        MORE_TYPES_FUNCTION_OUTPUT,
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        PARAMETERLESS_FUNCTION_OUTPUT,
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_streaming",
+    ),
+    pytest.param(
+        False,
+        PARAMETERLESS_FUNCTION_OUTPUT,
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        EMPTY_DICT_FUNCTION_OUTPUT,
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY_DICT_FUNCTION_OUTPUT,
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        EMPTY_LIST_FUNCTION_OUTPUT,
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY_LIST_FUNCTION_OUTPUT,
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        ESCAPED_STRING_FUNCTION_OUTPUT,
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_streaming",
+    ),
+    pytest.param(
+        False,
+        ESCAPED_STRING_FUNCTION_OUTPUT,
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_nonstreaming",
+    ),
     pytest.param(
         True,
         "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
         [
             SIMPLE_FUNCTION_CALL,
-            FunctionCall(name="register_user",
-                         arguments='{"name": "Doe", "age": 9}')
+            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
         ],
-        id="parallel_calls_streaming"),
+        id="parallel_calls_streaming",
+    ),
     pytest.param(
         False,
         "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
         [
             SIMPLE_FUNCTION_CALL,
-            FunctionCall(name="register_user",
-                         arguments='{"name": "Doe", "age": 9}')
+            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
+        ],
+        id="parallel_calls_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        PYTHON_TAG_FUNCTION_OUTPUT,
+        [SIMPLE_FUNCTION_CALL],
+        id="python_tag_streaming",
+    ),
+    pytest.param(
+        False,
+        PYTHON_TAG_FUNCTION_OUTPUT,
+        [SIMPLE_FUNCTION_CALL],
+        id="python_tag_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        test_str,
+        [
+            SIMPLE_FUNCTION_CALL,
+            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
+        ],
+        id="parallel_calls_streaming",
+    ),
+    pytest.param(
+        False,
+        "<|python_start|>[get_weather(city='LA', metric='C'), "
+        + "register_user(name='Doe', age=9)]",
+        [
+            SIMPLE_FUNCTION_CALL,
+            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
         ],
-        id="parallel_calls_nonstreaming"),
-    pytest.param(True,
-                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
-                 id="python_tag_streaming"),
-    pytest.param(False,
-                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
-                 id="python_tag_nonstreaming"),
-    pytest.param(True,
-                 test_str, [
-                     SIMPLE_FUNCTION_CALL,
-                     FunctionCall(name="register_user",
-                                  arguments='{"name": "Doe", "age": 9}')
-                 ],
-                 id="parallel_calls_streaming"),
-    pytest.param(False,
-                 "<|python_start|>[get_weather(city='LA', metric='C'), " +
-                 "register_user(name='Doe', age=9)]", [
-                     SIMPLE_FUNCTION_CALL,
-                     FunctionCall(name="register_user",
-                                  arguments='{"name": "Doe", "age": 9}')
-                 ],
-                 id="parallel_calls_nonstreaming"),
+        id="parallel_calls_nonstreaming",
+    ),
 ]
 
 
-@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
-                         TEST_CASES)
-def test_tool_call(streaming: bool, model_output: str,
-                   expected_tool_calls: list[FunctionCall]):
-    mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "llama4_pythonic")(mock_tokenizer)
+@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
+def test_tool_call(
+    streaming: bool,
+    model_output: str,
+    expected_tool_calls: list[FunctionCall],
+    default_tokenizer: AnyTokenizer,
+):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
+        default_tokenizer
+    )
 
-    content, tool_calls = run_tool_extraction(tool_parser,
-                                              model_output,
-                                              streaming=streaming)
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
 
     assert len(tool_calls) == len(expected_tool_calls)
     for actual, expected in zip(tool_calls, expected_tool_calls):
@@ -174,10 +224,10 @@ def test_tool_call(streaming: bool, model_output: str,
         assert actual.function == expected
 
 
-def test_streaming_tool_call_with_large_steps():
-    mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "llama4_pythonic")(mock_tokenizer)
+def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
+        default_tokenizer
+    )
     model_output_deltas = [
         "<|python_start|>[get_weather(city='LA', metric='C'), "
         "get_weather(), "
@@ -185,7 +235,8 @@ def test_streaming_tool_call_with_large_steps():
     ]
 
     reconstructor = run_tool_extraction_streaming(
-        tool_parser, model_output_deltas, assert_one_tool_per_delta=False)
+        tool_parser, model_output_deltas, assert_one_tool_per_delta=False
+    )
 
     assert reconstructor.other_content == ""
     assert len(reconstructor.tool_calls) == 3
@@ -195,11 +246,11 @@ def test_streaming_tool_call_with_large_steps():
 
 
 @pytest.mark.parametrize("streaming", [False])
-def test_regex_timeout_handling(streaming: bool):
+def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer):
     """test regex timeout is handled gracefully"""
-    mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "llama4_pythonic")(mock_tokenizer)
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
+        default_tokenizer
+    )
 
     fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
 
@@ -207,10 +258,10 @@ def test_regex_timeout_handling(streaming: bool):
     mock_regex = MagicMock()
     mock_regex.match.side_effect = TimeoutError("Regex timeout")
 
-    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
-        content, tool_calls = run_tool_extraction(tool_parser,
-                                                  fake_problematic_input,
-                                                  streaming=streaming)
+    with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
+        content, tool_calls = run_tool_extraction(
+            tool_parser, fake_problematic_input, streaming=streaming
+        )
 
         # should treat as regular text when regex times out
         assert content == fake_problematic_input
diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..13cff9a8ebf1e983d8af43fdde52e26c1c7956ab
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
@@ -0,0 +1,251 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.utils import (
+    run_tool_extraction,
+    run_tool_extraction_streaming,
+)
+from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
+SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
+SIMPLE_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "San Francisco", "metric": "celsius"}',
+)
+MORE_TYPES_FUNCTION_OUTPUT = (
+    "register_user(name='John Doe', "
+    "age=37, "
+    "address={'city': 'San Francisco', 'state': 'CA'}, "
+    "role=None, "
+    "passed_test=True, "
+    "aliases=['John', 'Johnny'])"
+)
+MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
+    "register_user(name='John Doe', "
+    "age=37, "
+    "address={'city': 'San Francisco', 'state': 'CA'}, "
+    "role=null, "
+    "passed_test=true, "
+    "aliases=['John', 'Johnny'])"
+)
+MORE_TYPES_FUNCTION_CALL = FunctionCall(
+    name="register_user",
+    arguments='{"name": "John Doe", '
+    '"age": 37, '
+    '"address": {"city": "San Francisco", "state": "CA"}, '
+    '"role": null, '
+    '"passed_test": true, '
+    '"aliases": ["John", "Johnny"]}',
+)
+PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
+PARAMETERLESS_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments="{}",
+)
+EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
+EMPTY_DICT_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"additional_data": {}}',
+)
+EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
+EMPTY_LIST_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"steps": []}',
+)
+ESCAPED_STRING_FUNCTION_OUTPUT = (
+    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
+)
+ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
+)
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
+        default_tokenizer
+    )
+    model_output = "How can I help you today?"
+
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
+
+    assert content == model_output
+    assert len(tool_calls) == 0
+
+
+TEST_CASES = [
+    pytest.param(
+        True,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL],
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL],
+        id="simple_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_streaming_json_literals",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_nonstreaming_json_literals",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
+        id="parallel_calls_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
+        id="parallel_calls_nonstreaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
+def test_tool_call(
+    streaming: bool,
+    model_output: str,
+    expected_tool_calls: list[FunctionCall],
+    default_tokenizer: AnyTokenizer,
+):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
+        default_tokenizer
+    )
+
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
+
+    assert content is None
+    assert len(tool_calls) == len(expected_tool_calls)
+    for actual, expected in zip(tool_calls, expected_tool_calls):
+        assert actual.type == "function"
+        assert actual.function == expected
+
+
+def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
+        default_tokenizer
+    )
+    model_output_deltas = [
+        "<function_calls>get_weather(city='San",
+        " Francisco', metric='celsius')\n"
+        f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
+        f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
+    ]
+
+    reconstructor = run_tool_extraction_streaming(
+        tool_parser, model_output_deltas, assert_one_tool_per_delta=False
+    )
+
+    assert reconstructor.other_content == ""
+    assert len(reconstructor.tool_calls) == 3
+    assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
+    assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
+    assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+
+
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer):
+    """test regex timeout is handled gracefully"""
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
+        default_tokenizer
+    )
+
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+
+    with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
+        content, tool_calls = run_tool_extraction(
+            tool_parser, fake_problematic_input, streaming=streaming
+        )
+
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index d83137472598e1bf34350b105b8d8e95e18506fa..fcd3df16e5cfa22632b5bb38e4400120d24dce19 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -6,9 +6,12 @@ from unittest.mock import MagicMock, patch
 import pytest
 
 from tests.entrypoints.openai.tool_parsers.utils import (
-    run_tool_extraction, run_tool_extraction_streaming)
+    run_tool_extraction,
+    run_tool_extraction_streaming,
+)
 from vllm.entrypoints.openai.protocol import FunctionCall
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 # https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
 SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
@@ -22,7 +25,8 @@ MORE_TYPES_FUNCTION_OUTPUT = (
     "address={'city': 'San Francisco', 'state': 'CA'}, "
     "role=None, "
     "passed_test=True, "
-    "aliases=['John', 'Johnny'])")
+    "aliases=['John', 'Johnny'])"
+)
 MORE_TYPES_FUNCTION_CALL = FunctionCall(
     name="register_user",
     arguments='{"name": "John Doe", '
@@ -35,7 +39,7 @@ MORE_TYPES_FUNCTION_CALL = FunctionCall(
 PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
 PARAMETERLESS_FUNCTION_CALL = FunctionCall(
     name="get_weather",
-    arguments='{}',
+    arguments="{}",
 )
 EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
 EMPTY_DICT_FUNCTION_CALL = FunctionCall(
@@ -48,7 +52,8 @@ EMPTY_LIST_FUNCTION_CALL = FunctionCall(
     arguments='{"steps": []}',
 )
 ESCAPED_STRING_FUNCTION_OUTPUT = (
-    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')")
+    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
+)
 ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
     name="get_weather",
     arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
@@ -56,83 +61,122 @@ ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
 
 
 @pytest.mark.parametrize("streaming", [True, False])
-def test_no_tool_call(streaming: bool):
-    mock_tokenizer = MagicMock()
+def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer):
     tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
-        mock_tokenizer)
+        default_tokenizer
+    )
     model_output = "How can I help you today?"
 
-    content, tool_calls = run_tool_extraction(tool_parser,
-                                              model_output,
-                                              streaming=streaming)
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
 
     assert content == model_output
     assert len(tool_calls) == 0
 
 
 TEST_CASES = [
-    pytest.param(True,
-                 f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL],
-                 id="simple_streaming"),
-    pytest.param(False,
-                 f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL],
-                 id="simple_nonstreaming"),
-    pytest.param(True,
-                 f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL],
-                 id="more_types_streaming"),
-    pytest.param(False,
-                 f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL],
-                 id="more_types_nonstreaming"),
-    pytest.param(True,
-                 f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
-                 [PARAMETERLESS_FUNCTION_CALL],
-                 id="parameterless_streaming"),
-    pytest.param(False,
-                 f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
-                 [PARAMETERLESS_FUNCTION_CALL],
-                 id="parameterless_nonstreaming"),
-    pytest.param(True,
-                 f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL],
-                 id="empty_dict_streaming"),
-    pytest.param(False,
-                 f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL],
-                 id="empty_dict_nonstreaming"),
-    pytest.param(True,
-                 f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL],
-                 id="empty_list_streaming"),
-    pytest.param(False,
-                 f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL],
-                 id="empty_list_nonstreaming"),
-    pytest.param(True,
-                 f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
-                 [ESCAPED_STRING_FUNCTION_CALL],
-                 id="escaped_string_streaming"),
-    pytest.param(False,
-                 f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
-                 [ESCAPED_STRING_FUNCTION_CALL],
-                 id="escaped_string_nonstreaming"),
-    pytest.param(True,
-                 f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
-                 [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
-                 id="parallel_calls_streaming"),
-    pytest.param(False,
-                 f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
-                 [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
-                 id="parallel_calls_nonstreaming"),
+    pytest.param(
+        True,
+        f"[{SIMPLE_FUNCTION_OUTPUT}]",
+        [SIMPLE_FUNCTION_CALL],
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{SIMPLE_FUNCTION_OUTPUT}]",
+        [SIMPLE_FUNCTION_CALL],
+        id="simple_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"[{MORE_TYPES_FUNCTION_OUTPUT}]",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{MORE_TYPES_FUNCTION_OUTPUT}]",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"[{EMPTY_DICT_FUNCTION_OUTPUT}]",
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{EMPTY_DICT_FUNCTION_OUTPUT}]",
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"[{EMPTY_LIST_FUNCTION_OUTPUT}]",
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{EMPTY_LIST_FUNCTION_OUTPUT}]",
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
+        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
+        id="parallel_calls_streaming",
+    ),
+    pytest.param(
+        False,
+        f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
+        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
+        id="parallel_calls_nonstreaming",
+    ),
 ]
 
 
-@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
-                         TEST_CASES)
-def test_tool_call(streaming: bool, model_output: str,
-                   expected_tool_calls: list[FunctionCall]):
-    mock_tokenizer = MagicMock()
+@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
+def test_tool_call(
+    streaming: bool,
+    model_output: str,
+    expected_tool_calls: list[FunctionCall],
+    default_tokenizer: AnyTokenizer,
+):
     tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
-        mock_tokenizer)
+        default_tokenizer
+    )
 
-    content, tool_calls = run_tool_extraction(tool_parser,
-                                              model_output,
-                                              streaming=streaming)
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
 
     assert content is None
     assert len(tool_calls) == len(expected_tool_calls)
@@ -141,10 +185,10 @@ def test_tool_call(streaming: bool, model_output: str,
         assert actual.function == expected
 
 
-def test_streaming_tool_call_with_large_steps():
-    mock_tokenizer = MagicMock()
+def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
     tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
-        mock_tokenizer)
+        default_tokenizer
+    )
     model_output_deltas = [
         "[get_weather(city='San",
         " Francisco', metric='celsius'), "
@@ -153,7 +197,8 @@ def test_streaming_tool_call_with_large_steps():
     ]
 
     reconstructor = run_tool_extraction_streaming(
-        tool_parser, model_output_deltas, assert_one_tool_per_delta=False)
+        tool_parser, model_output_deltas, assert_one_tool_per_delta=False
+    )
 
     assert reconstructor.other_content == ""
     assert len(reconstructor.tool_calls) == 3
@@ -163,11 +208,11 @@ def test_streaming_tool_call_with_large_steps():
 
 
 @pytest.mark.parametrize("streaming", [False])
-def test_regex_timeout_handling(streaming: bool):
+def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer):
     """test regex timeout is handled gracefully"""
-    mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "llama4_pythonic")(mock_tokenizer)
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
+        default_tokenizer
+    )
 
     fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
 
@@ -175,10 +220,10 @@ def test_regex_timeout_handling(streaming: bool):
     mock_regex = MagicMock()
     mock_regex.match.side_effect = TimeoutError("Regex timeout")
 
-    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
-        content, tool_calls = run_tool_extraction(tool_parser,
-                                                  fake_problematic_input,
-                                                  streaming=streaming)
+    with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
+        content, tool_calls = run_tool_extraction(
+            tool_parser, fake_problematic_input, streaming=streaming
+        )
 
         # should treat as regular text when regex times out
         assert content == fake_problematic_input
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index e1b41f45f55480962e7f15eca534c5c541a4089b..38899f2632554660775a71d5d609b3ed918a0934 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -2,17 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 
 class StreamingToolReconstructor:
-
     def __init__(self, assert_one_tool_per_delta: bool = True):
         self.tool_calls: list[ToolCall] = []
         self.other_content: str = ""
@@ -23,68 +25,79 @@ class StreamingToolReconstructor:
             self.other_content += delta.content
         else:
             assert delta.tool_calls, (
-                "Streaming results should have either content or tool calls "
-                "(or both)")
+                "Streaming results should have either content or tool calls (or both)"
+            )
         if self._assert_one_tool_per_delta:
             # Note: This isn't strictly required by the API and may not be
             # possible to adhere to depending on the token space and number of
             # tokens per streamed response from the model, but it is required
             # by tool_use tests, so we enforce it here by default also.
             assert len(delta.tool_calls) < 2, (
-                "Streaming should include only one tool call per update.")
+                "Streaming should include only one tool call per update."
+            )
         for call_delta in delta.tool_calls:
             assert call_delta.type is None or call_delta.type == "function", (
                 "Streaming tool calls should only emit function calls. Got "
-                f"{call_delta.type}")
-            current_tool_call = self.tool_calls[
-                call_delta.index] if call_delta.index < len(
-                    self.tool_calls) else None
+                f"{call_delta.type}"
+            )
+            current_tool_call = (
+                self.tool_calls[call_delta.index]
+                if call_delta.index < len(self.tool_calls)
+                else None
+            )
             if current_tool_call:
-                assert (not call_delta.function.name), (
+                assert not call_delta.function.name, (
                     "Streaming tool calls should emit the full function name "
-                    f"exactly once. Got {call_delta.function.name}")
-                assert (not call_delta.id), (
+                    f"exactly once. Got {call_delta.function.name}"
+                )
+                assert not call_delta.id, (
                     "Streaming tool calls must emit function id only once. Got "
-                    f"{call_delta.id}")
-                assert (call_delta.index == len(self.tool_calls) - 1), (
+                    f"{call_delta.id}"
+                )
+                assert call_delta.index == len(self.tool_calls) - 1, (
                     f"Incorrect index for tool delta. Got {call_delta.index}, "
-                    f"expected {len(self.tool_calls) - 1}")
-                current_tool_call.function.arguments += (
-                    call_delta.function.arguments)
+                    f"expected {len(self.tool_calls) - 1}"
+                )
+                current_tool_call.function.arguments += call_delta.function.arguments
             else:
                 assert call_delta.id is not None, (
-                    "Streaming tool calls must have an id on first appearance")
+                    "Streaming tool calls must have an id on first appearance"
+                )
                 assert call_delta.function.name is not None, (
-                    "Streaming tool calls must have a function name on first "
-                    "appearance")
+                    "Streaming tool calls must have a function name on first appearance"
+                )
                 assert call_delta.index == len(self.tool_calls), (
                     f"Incorrect index for tool delta. Got {call_delta.index}, "
-                    f"expected {len(self.tool_calls)}")
+                    f"expected {len(self.tool_calls)}"
+                )
                 self.tool_calls.append(
-                    ToolCall(id=call_delta.id,
-                             function=FunctionCall(
-                                 name=call_delta.function.name,
-                                 arguments=call_delta.function.arguments
-                                 or "")))
+                    ToolCall(
+                        id=call_delta.id,
+                        function=FunctionCall(
+                            name=call_delta.function.name,
+                            arguments=call_delta.function.arguments or "",
+                        ),
+                    )
+                )
 
 
 def run_tool_extraction(
     tool_parser: ToolParser,
     model_output: str,
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     streaming: bool = False,
     assert_one_tool_per_delta: bool = True,
-) -> tuple[Union[str, None], list[ToolCall]]:
+) -> tuple[str | None, list[ToolCall]]:
     if streaming:
         reconstructor = run_tool_extraction_streaming(
             tool_parser,
             model_output,
             request,
-            assert_one_tool_per_delta=assert_one_tool_per_delta)
+            assert_one_tool_per_delta=assert_one_tool_per_delta,
+        )
         return reconstructor.other_content or None, reconstructor.tool_calls
     else:
-        extracted = run_tool_extraction_nonstreaming(tool_parser, model_output,
-                                                     request)
+        extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, request)
         assert extracted.tools_called == bool(extracted.tool_calls)
         return extracted.content, extracted.tool_calls
 
@@ -92,21 +105,42 @@ def run_tool_extraction(
 def run_tool_extraction_nonstreaming(
     tool_parser: ToolParser,
     model_output: str,
-    request: Union[ChatCompletionRequest, None] = None
+    request: ChatCompletionRequest | None = None,
 ) -> ExtractedToolCallInformation:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     return tool_parser.extract_tool_calls(model_output, request)
 
 
+def split_string_into_token_deltas(tokenizer: AnyTokenizer, text: str) -> list[str]:
+    # Split a string into a series of deltas using the provided tokenizer. Each
+    # delta will be the string equivalent of a single token.
+    token_ids = tokenizer.encode(text, add_special_tokens=False)
+    previously_decoded_text = ""
+    deltas = []
+    for i in range(1, len(token_ids) + 1):
+        current_tokens = token_ids[:i]
+        current_text = tokenizer.decode(current_tokens)
+        new_text = current_text[len(previously_decoded_text) :]
+        previously_decoded_text = current_text
+        deltas.append(new_text)
+    return deltas
+
+
 def run_tool_extraction_streaming(
     tool_parser: ToolParser,
     model_deltas: Iterable[str],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     assert_one_tool_per_delta: bool = True,
 ) -> StreamingToolReconstructor:
+    if isinstance(model_deltas, str):
+        model_deltas = split_string_into_token_deltas(
+            tool_parser.model_tokenizer, model_deltas
+        )
+
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     reconstructor = StreamingToolReconstructor(
-        assert_one_tool_per_delta=assert_one_tool_per_delta)
+        assert_one_tool_per_delta=assert_one_tool_per_delta
+    )
     previous_text = ""
     previous_tokens: list[int] = []
     for delta in model_deltas:
@@ -118,8 +152,14 @@ def run_tool_extraction_streaming(
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
         delta_message = tool_parser.extract_tool_calls_streaming(
-            previous_text, current_text, delta, previous_tokens,
-            current_tokens, token_delta, request)
+            previous_text,
+            current_text,
+            delta,
+            previous_tokens,
+            current_tokens,
+            token_delta,
+            request,
+        )
         if delta_message is not None:
             reconstructor.append_delta(delta_message)
         previous_text = current_text
diff --git a/tests/entrypoints/pooling/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py
index 12a4875bdacfd2c8a23a89c400f4fe9e154eeabc..7f16638e51e2c255096d63433956d7295d8908bc 100644
--- a/tests/entrypoints/pooling/correctness/test_mteb_embed.py
+++ b/tests/entrypoints/pooling/correctness/test_mteb_embed.py
@@ -5,8 +5,11 @@ import os
 import pytest
 
 from tests.models.language.pooling_mteb_test.mteb_utils import (
-    MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder,
-    run_mteb_embed_task)
+    MTEB_EMBED_TASKS,
+    MTEB_EMBED_TOL,
+    OpenAIClientMtebEncoder,
+    run_mteb_embed_task,
+)
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
@@ -17,10 +20,7 @@ MAIN_SCORE = 0.7422994752439667
 
 @pytest.fixture(scope="module")
 def server():
-    args = [
-        "--runner", "pooling", "--enforce-eager",
-        "--disable-uvicorn-access-log"
-    ]
+    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py
index 7c059d16b38635022bc2d258227282a9f7a73405..1afe68b189db84e14abf5dbdbc38afbce0397dcc 100644
--- a/tests/entrypoints/pooling/correctness/test_mteb_score.py
+++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py
@@ -5,8 +5,13 @@ import os
 import pytest
 
 from tests.models.language.pooling_mteb_test.mteb_utils import (
-    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
-    RerankClientMtebEncoder, ScoreClientMtebEncoder, run_mteb_rerank)
+    MTEB_RERANK_LANGS,
+    MTEB_RERANK_TASKS,
+    MTEB_RERANK_TOL,
+    RerankClientMtebEncoder,
+    ScoreClientMtebEncoder,
+    run_mteb_rerank,
+)
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
@@ -17,10 +22,7 @@ st_main_score = 0.33457
 
 @pytest.fixture(scope="module")
 def server():
-    args = [
-        "--runner", "pooling", "--enforce-eager",
-        "--disable-uvicorn-access-log"
-    ]
+    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
@@ -29,8 +31,7 @@ def server():
 def test_mteb_score(server):
     url = server.url_for("score")
     encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
-    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
-                                      MTEB_RERANK_LANGS)
+    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)
@@ -44,8 +45,7 @@ def test_mteb_score(server):
 def test_mteb_rerank(server):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
-    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
-                                      MTEB_RERANK_LANGS)
+    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)
diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py
index ff5cea11a9182cd8d0bf46349b6a9fd7ae8caa15..1063c3b6b755c66b2769058ee1c5ed092808aad0 100644
--- a/tests/entrypoints/pooling/llm/test_classify.py
+++ b/tests/entrypoints/pooling/llm/test_classify.py
@@ -19,12 +19,14 @@ prompts = ["The chef prepared a delicious meal."]
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=32768,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.75,
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=32768,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+        enforce_eager=True,
+        seed=0,
+    )
 
     yield weakref.proxy(llm)
 
@@ -35,32 +37,35 @@ def llm():
 
 @pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
-
-    def get_outputs(activation):
+    def get_outputs(use_activation):
         outputs = llm.classify(
             prompts,
-            pooling_params=PoolingParams(activation=activation),
-            use_tqdm=False)
+            pooling_params=PoolingParams(use_activation=use_activation),
+            use_tqdm=False,
+        )
         return torch.tensor([x.outputs.probs for x in outputs])
 
-    default = get_outputs(activation=None)
-    w_activation = get_outputs(activation=True)
-    wo_activation = get_outputs(activation=False)
+    default = get_outputs(use_activation=None)
+    w_activation = get_outputs(use_activation=True)
+    wo_activation = get_outputs(use_activation=False)
 
-    assert torch.allclose(default, w_activation,
-                          atol=1e-2), "Default should use activation."
-    assert not torch.allclose(
-        w_activation, wo_activation,
-        atol=1e-2), "wo_activation should not use activation."
-    assert torch.allclose(
-        softmax(wo_activation), w_activation, atol=1e-2
-    ), "w_activation should be close to activation(wo_activation)."
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
+    )
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
+    )
 
 
+@pytest.mark.skip_global_cleanup
 def test_encode_api(llm: LLM):
+    # chunked prefill does not support all pooling
     err_msg = "pooling_task must be one of.+"
     with pytest.raises(ValueError, match=err_msg):
-        llm.encode(prompts, use_tqdm=False)
+        llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
 
 
 def test_score_api(llm: LLM):
diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py
index 485f04ed6d8491fc8106e08fed3f1173ef393bb6..5455b5f91fc099058e958ba87ced908398943712 100644
--- a/tests/entrypoints/pooling/llm/test_embedding.py
+++ b/tests/entrypoints/pooling/llm/test_embedding.py
@@ -19,12 +19,14 @@ prompts = ["The chef prepared a delicious meal."]
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=32768,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.75,
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=32768,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+        enforce_eager=True,
+        seed=0,
+    )
 
     yield weakref.proxy(llm)
 
@@ -34,22 +36,27 @@ def llm():
 
 
 @pytest.mark.skip_global_cleanup
-def test_pooling_params(llm: LLM):
+def test_encode_api(llm: LLM):
+    outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
+    multi_vector = outputs[0].outputs.data
+    assert multi_vector.shape == (11, 384)
+
 
+def test_pooling_params(llm: LLM):
     def get_outputs(normalize):
-        outputs = llm.embed(prompts,
-                            pooling_params=PoolingParams(normalize=normalize),
-                            use_tqdm=False)
+        outputs = llm.embed(
+            prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
+        )
         return torch.tensor([x.outputs.embedding for x in outputs])
 
     default = get_outputs(normalize=None)
     w_normal = get_outputs(normalize=True)
     wo_normal = get_outputs(normalize=False)
 
-    assert torch.allclose(default, w_normal,
-                          atol=1e-2), "Default should use normal."
-    assert not torch.allclose(w_normal, wo_normal,
-                              atol=1e-2), "wo_normal should not use normal."
-    assert torch.allclose(
-        w_normal, F.normalize(wo_normal, p=2, dim=-1),
-        atol=1e-2), "w_normal should be close to normal(wo_normal)."
+    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
+    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
+        "wo_normal should not use normal."
+    )
+    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
+        "w_normal should be close to normal(wo_normal)."
+    )
diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py
index eae3e234378f22aefa7702c3016af684a4d7cdf4..ca85d2758fce465c70e3100e3990745904404a4c 100644
--- a/tests/entrypoints/pooling/llm/test_encode.py
+++ b/tests/entrypoints/pooling/llm/test_encode.py
@@ -31,12 +31,14 @@ TOKEN_IDS = [
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=32768,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.75,
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=32768,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+        enforce_eager=True,
+        seed=0,
+    )
 
     yield weakref.proxy(llm)
 
@@ -55,24 +57,27 @@ def test_multiple_pooling_params(llm: LLM):
     ]
 
     # Multiple PoolingParams should be matched with each prompt
-    outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
+    outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
     assert len(PROMPTS) == len(outputs)
 
     # Exception raised, if the size of params does not match the size of prompts
     with pytest.raises(ValueError):
-        outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
+        outputs = llm.encode(
+            PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
+        )
 
     # Single PoolingParams should be applied to every prompt
     single_pooling_params = PoolingParams()
-    outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
+    outputs = llm.encode(
+        PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
+    )
     assert len(PROMPTS) == len(outputs)
 
     # pooling_params is None, default params should be applied
-    outputs = llm.encode(PROMPTS, pooling_params=None)
+    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
     assert len(PROMPTS) == len(outputs)
 
 
-@pytest.mark.skip_global_cleanup
 def test_right_side_truncation(llm: LLM):
     # Embeddings models should truncate the end of the prompt
     tokenizer = llm.get_tokenizer()
diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py
index 11d164c978a9264929011c8218383e0bd14dcdd2..0255704cecd94429992a130d0b611105df19b548 100644
--- a/tests/entrypoints/pooling/llm/test_reward.py
+++ b/tests/entrypoints/pooling/llm/test_reward.py
@@ -19,13 +19,15 @@ prompts = ["The chef prepared a delicious meal."]
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=32768,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.75,
-              enforce_eager=True,
-              trust_remote_code=True,
-              seed=0)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=32768,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+        enforce_eager=True,
+        trust_remote_code=True,
+        seed=0,
+    )
 
     yield weakref.proxy(llm)
 
@@ -34,23 +36,25 @@ def llm():
     cleanup_dist_env_and_memory()
 
 
-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
-
-    def get_outputs(softmax):
-        outputs = llm.reward(prompts,
-                             pooling_params=PoolingParams(softmax=softmax),
-                             use_tqdm=False)
+    def get_outputs(use_activation):
+        outputs = llm.reward(
+            prompts,
+            pooling_params=PoolingParams(use_activation=use_activation),
+            use_tqdm=False,
+        )
         return torch.cat([x.outputs.data for x in outputs])
 
-    default = get_outputs(softmax=None)
-    w_softmax = get_outputs(softmax=True)
-    wo_softmax = get_outputs(softmax=False)
-
-    assert torch.allclose(default, w_softmax,
-                          atol=1e-2), "Default should use softmax."
-    assert not torch.allclose(w_softmax, wo_softmax,
-                              atol=1e-2), "wo_softmax should not use softmax."
-    assert torch.allclose(
-        softmax(wo_softmax), w_softmax,
-        atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
+    default = get_outputs(use_activation=None)
+    w_activation = get_outputs(use_activation=True)
+    wo_activation = get_outputs(use_activation=False)
+
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
+    )
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
+    )
diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py
index 447378f989d09cae7599ad8635deb8bd0a5881d6..b69c6a47c1913ff494da98db1844b7f31a8d4d27 100644
--- a/tests/entrypoints/pooling/llm/test_score.py
+++ b/tests/entrypoints/pooling/llm/test_score.py
@@ -17,12 +17,14 @@ MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=32768,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.75,
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=32768,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+        enforce_eager=True,
+        seed=0,
+    )
 
     yield weakref.proxy(llm)
 
@@ -31,29 +33,29 @@ def llm():
     cleanup_dist_env_and_memory()
 
 
-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
-
-    def get_outputs(activation):
+    def get_outputs(use_activation):
         text_1 = "What is the capital of France?"
         text_2 = "The capital of France is Paris."
 
         outputs = llm.score(
             text_1,
             text_2,
-            pooling_params=PoolingParams(activation=activation),
-            use_tqdm=False)
+            pooling_params=PoolingParams(use_activation=use_activation),
+            use_tqdm=False,
+        )
         return torch.tensor([x.outputs.score for x in outputs])
 
-    default = get_outputs(activation=None)
-    w_activation = get_outputs(activation=True)
-    wo_activation = get_outputs(activation=False)
-
-    assert torch.allclose(default, w_activation,
-                          atol=1e-2), "Default should use activation."
-    assert not torch.allclose(
-        w_activation, wo_activation,
-        atol=1e-2), "wo_activation should not use activation."
-    assert torch.allclose(
-        softmax(wo_activation), w_activation, atol=1e-2
-    ), "w_activation should be close to activation(wo_activation)."
+    default = get_outputs(use_activation=None)
+    w_activation = get_outputs(use_activation=True)
+    wo_activation = get_outputs(use_activation=False)
+
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
+    )
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
+    )
diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py
index 26c2c8e6af17d9e9721977ee02647b06737a2a5b..25080d4189c2db5ff4a9588631c72c4305942e72 100644
--- a/tests/entrypoints/pooling/openai/test_classification.py
+++ b/tests/entrypoints/pooling/openai/test_classification.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn.functional as F
 
 from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import ClassificationResponse
+from vllm.entrypoints.openai.protocol import ClassificationResponse, PoolingResponse
 
 MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
 DTYPE = "float32"  # Use float32 to avoid NaN issue
@@ -28,21 +28,16 @@ def server():
 
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_single_input_classification(server: RemoteOpenAIServer,
-                                     model_name: str):
+def test_single_input_classification(server: RemoteOpenAIServer, model_name: str):
     input_text = "This product was excellent and exceeded my expectations"
 
     classification_response = requests.post(
         server.url_for("classify"),
-        json={
-            "model": model_name,
-            "input": input_text
-        },
+        json={"model": model_name, "input": input_text},
     )
 
     classification_response.raise_for_status()
-    output = ClassificationResponse.model_validate(
-        classification_response.json())
+    output = ClassificationResponse.model_validate(classification_response.json())
 
     assert output.object == "list"
     assert output.model == MODEL_NAME
@@ -52,8 +47,17 @@ def test_single_input_classification(server: RemoteOpenAIServer,
 
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_multiple_inputs_classification(server: RemoteOpenAIServer,
-                                        model_name: str):
+def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
+    response = requests.post(
+        server.url_for("classify"),
+        json={"model": model_name, "input": "hello", "add_special_tokens": False},
+    )
+    response.raise_for_status()
+    ClassificationResponse.model_validate(response.json())
+
+
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
     input_texts = [
         "The product arrived on time and works perfectly",
         "I'm very satisfied with my purchase, would buy again",
@@ -65,13 +69,9 @@ def test_multiple_inputs_classification(server: RemoteOpenAIServer,
 
     classification_response = requests.post(
         server.url_for("classify"),
-        json={
-            "model": model_name,
-            "input": input_texts
-        },
+        json={"model": model_name, "input": input_texts},
     )
-    output = ClassificationResponse.model_validate(
-        classification_response.json())
+    output = ClassificationResponse.model_validate(classification_response.json())
 
     assert len(output.data) == len(input_texts)
     for i, item in enumerate(output.data):
@@ -88,16 +88,11 @@ def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
 
     classification_response = requests.post(
         server.url_for("classify"),
-        json={
-            "model": model_name,
-            "input": long_text,
-            "truncate_prompt_tokens": 5
-        },
+        json={"model": model_name, "input": long_text, "truncate_prompt_tokens": 5},
     )
 
     classification_response.raise_for_status()
-    output = ClassificationResponse.model_validate(
-        classification_response.json())
+    output = ClassificationResponse.model_validate(classification_response.json())
 
     assert len(output.data) == 1
     assert output.data[0].index == 0
@@ -107,15 +102,12 @@ def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
 
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer,
-                                              model_name: str):
+def test_invalid_truncate_prompt_tokens_error(
+    server: RemoteOpenAIServer, model_name: str
+):
     classification_response = requests.post(
         server.url_for("classify"),
-        json={
-            "model": model_name,
-            "input": "test",
-            "truncate_prompt_tokens": 513
-        },
+        json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
     )
 
     error = classification_response.json()
@@ -127,10 +119,7 @@ def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer,
 def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
     classification_response = requests.post(
         server.url_for("classify"),
-        json={
-            "model": model_name,
-            "input": ""
-        },
+        json={"model": model_name, "input": ""},
     )
 
     error = classification_response.json()
@@ -139,18 +128,13 @@ def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
 
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_batch_classification_empty_list(server: RemoteOpenAIServer,
-                                         model_name: str):
+def test_batch_classification_empty_list(server: RemoteOpenAIServer, model_name: str):
     classification_response = requests.post(
         server.url_for("classify"),
-        json={
-            "model": model_name,
-            "input": []
-        },
+        json={"model": model_name, "input": []},
     )
     classification_response.raise_for_status()
-    output = ClassificationResponse.model_validate(
-        classification_response.json())
+    output = ClassificationResponse.model_validate(classification_response.json())
 
     assert output.object == "list"
     assert isinstance(output.data, list)
@@ -161,15 +145,17 @@ def test_batch_classification_empty_list(server: RemoteOpenAIServer,
 async def test_invocations(server: RemoteOpenAIServer):
     request_args = {
         "model": MODEL_NAME,
-        "input": "This product was excellent and exceeded my expectations"
+        "input": "This product was excellent and exceeded my expectations",
     }
 
-    classification_response = requests.post(server.url_for("classify"),
-                                            json=request_args)
+    classification_response = requests.post(
+        server.url_for("classify"), json=request_args
+    )
     classification_response.raise_for_status()
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     classification_output = classification_response.json()
@@ -177,51 +163,56 @@ async def test_invocations(server: RemoteOpenAIServer):
 
     assert classification_output.keys() == invocation_output.keys()
     for classification_data, invocation_data in zip(
-            classification_output["data"], invocation_output["data"]):
+        classification_output["data"], invocation_output["data"]
+    ):
         assert classification_data.keys() == invocation_data.keys()
         assert classification_data["probs"] == pytest.approx(
-            invocation_data["probs"], rel=0.01)
+            invocation_data["probs"], rel=0.01
+        )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_activation(server: RemoteOpenAIServer, model_name: str):
+async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
     input_text = ["This product was excellent and exceeded my expectations"]
 
-    async def get_outputs(activation):
-        response = requests.post(server.url_for("classify"),
-                                 json={
-                                     "model": model_name,
-                                     "input": input_text,
-                                     "activation": activation
-                                 })
+    async def get_outputs(use_activation):
+        response = requests.post(
+            server.url_for("classify"),
+            json={
+                "model": model_name,
+                "input": input_text,
+                "use_activation": use_activation,
+            },
+        )
         outputs = response.json()
-        return torch.tensor([x['probs'] for x in outputs["data"]])
+        return torch.tensor([x["probs"] for x in outputs["data"]])
 
-    default = await get_outputs(activation=None)
-    w_activation = await get_outputs(activation=True)
-    wo_activation = await get_outputs(activation=False)
+    default = await get_outputs(use_activation=None)
+    w_activation = await get_outputs(use_activation=True)
+    wo_activation = await get_outputs(use_activation=False)
 
-    assert torch.allclose(default, w_activation,
-                          atol=1e-2), "Default should use activation."
-    assert not torch.allclose(
-        w_activation, wo_activation,
-        atol=1e-2), "wo_activation should not use activation."
-    assert torch.allclose(
-        F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2
-    ), "w_activation should be close to activation(wo_activation)."
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
+    )
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_pooling(server: RemoteOpenAIServer, model_name: str):
-    # pooling api uses ALL pooling, which does not support chunked prefill.
+async def test_score(server: RemoteOpenAIServer, model_name: str):
+    # score api is only enabled for num_labels == 1.
     response = requests.post(
-        server.url_for("pooling"),
+        server.url_for("score"),
         json={
             "model": model_name,
-            "input": "test",
-            "encoding_format": "float"
+            "text_1": "ping",
+            "text_2": "pong",
         },
     )
     assert response.json()["error"]["type"] == "BadRequestError"
@@ -229,14 +220,14 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_score(server: RemoteOpenAIServer, model_name: str):
-    # score api is only enabled for num_labels == 1.
+async def test_rerank(server: RemoteOpenAIServer, model_name: str):
+    # rerank api is only enabled for num_labels == 1.
     response = requests.post(
-        server.url_for("score"),
+        server.url_for("rerank"),
         json={
             "model": model_name,
-            "text_1": "ping",
-            "text_2": "pong",
+            "query": "ping",
+            "documents": ["pong"],
         },
     )
     assert response.json()["error"]["type"] == "BadRequestError"
@@ -244,14 +235,58 @@ def test_score(server: RemoteOpenAIServer, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_rerank(server: RemoteOpenAIServer, model_name: str):
-    # rerank api is only enabled for num_labels == 1.
+async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
+    input_text = "This product was excellent and exceeded my expectations"
     response = requests.post(
-        server.url_for("rerank"),
+        server.url_for("pooling"),
         json={
             "model": model_name,
-            "query": "ping",
-            "documents": ["pong"],
+            "input": input_text,
+            "encoding_format": "float",
+            "task": "classify",
+        },
+    )
+    poolings = PoolingResponse.model_validate(response.json())
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 2
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
+    # token_classify uses ALL pooling, which does not support chunked prefill.
+    task = "token_classify"
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": "test",
+            "encoding_format": "float",
+            "task": task,
         },
     )
     assert response.json()["error"]["type"] == "BadRequestError"
+    assert response.json()["error"]["message"].startswith(
+        f"Task {task} is not supported"
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
+async def test_pooling_not_supported(
+    server: RemoteOpenAIServer, model_name: str, task: str
+):
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": "test",
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"
+    assert response.json()["error"]["message"].startswith(
+        f"Task {task} is not supported"
+    )
diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py
index 37a10e79d4fc770af68afe26dc1027703b74c3ec..e971b23e8f1a07976bfd156a9ec68abb4893d761 100644
--- a/tests/entrypoints/pooling/openai/test_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_embedding.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
+import json
 
 import numpy as np
 import openai
@@ -11,12 +12,21 @@ import requests
 import torch
 import torch.nn.functional as F
 
-from tests.models.language.pooling.embed_utils import (
-    run_embedding_correctness_test)
+from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
 from tests.models.utils import check_embeddings_close
 from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import EmbeddingResponse
+from vllm.entrypoints.openai.protocol import (
+    EmbeddingResponse,
+    PoolingResponse,
+)
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    MetadataItem,
+    binary2tensor,
+    decode_pooling_output,
+)
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
@@ -50,15 +60,13 @@ async def client(server):
 
 @pytest.fixture(scope="module")
 def hf_model(hf_runner):
-    with hf_runner(MODEL_NAME, dtype=DTYPE,
-                   is_sentence_transformer=True) as hf_model:
+    with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as hf_model:
         yield hf_model
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
-                                model_name: str):
+async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
     input_texts = [
         "The chef prepared a delicious meal.",
     ]
@@ -70,7 +78,8 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
         encoding_format="float",
     )
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -90,7 +99,8 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
         encoding_format="float",
     )
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -102,12 +112,12 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
-                               model_name: str):
+async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
     # test list[str]
     input_texts = [
-        "The cat sat on the mat.", "A feline was resting on a rug.",
-        "Stars twinkle brightly in the night sky."
+        "The cat sat on the mat.",
+        "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky.",
     ]
     embedding_response = await client.embeddings.create(
         model=model_name,
@@ -115,7 +125,8 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
         encoding_format="float",
     )
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 3
@@ -128,15 +139,20 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
     run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
 
     # test list[list[int]]
-    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-                    [25, 32, 64, 77]]
+    input_tokens = [
+        [4, 5, 7, 9, 20],
+        [15, 29, 499],
+        [24, 24, 24, 24, 24],
+        [25, 32, 64, 77],
+    ]
     embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         encoding_format="float",
     )
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 4
@@ -148,19 +164,23 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_conversation_embedding(server: RemoteOpenAIServer,
-                                      client: openai.AsyncOpenAI,
-                                      model_name: str):
-    messages = [{
-        "role": "user",
-        "content": "The cat sat on the mat.",
-    }, {
-        "role": "assistant",
-        "content": "A feline was resting on a rug.",
-    }, {
-        "role": "user",
-        "content": "Stars twinkle brightly in the night sky.",
-    }]
+async def test_conversation_embedding(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
 
     chat_response = requests.post(
         server.url_for("v1/embeddings"),
@@ -189,64 +209,176 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
         extra_body={"add_special_tokens": False},
     )
     completion_embeddings = EmbeddingResponse.model_validate(
-        completion_response.model_dump(mode="json"))
+        completion_response.model_dump(mode="json")
+    )
 
     assert chat_embeddings.id is not None
     assert completion_embeddings.id is not None
     assert chat_embeddings.created <= completion_embeddings.created
-    assert chat_embeddings.model_dump(
-        exclude={"id", "created"}) == (completion_embeddings.model_dump(
-            exclude={"id", "created"}))
+    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
+        completion_embeddings.model_dump(exclude={"id", "created"})
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
-                                      model_name: str):
+async def test_batch_base64_embedding(
+    hf_model, client: openai.AsyncOpenAI, model_name: str
+):
     input_texts = [
         "Hello my name is",
-        "The best thing about vLLM is that it supports many different models"
+        "The best thing about vLLM is that it supports many different models",
     ]
 
-    responses_float = await client.embeddings.create(input=input_texts,
-                                                     model=model_name,
-                                                     encoding_format="float")
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
     float_data = [d.embedding for d in responses_float.data]
     run_embedding_correctness_test(hf_model, input_texts, float_data)
 
-    responses_base64 = await client.embeddings.create(input=input_texts,
-                                                      model=model_name,
-                                                      encoding_format="base64")
+    responses_base64 = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="base64"
+    )
     base64_data = []
     for data in responses_base64.data:
         base64_data.append(
-            np.frombuffer(base64.b64decode(data.embedding),
-                          dtype="float32").tolist())
+            np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()
+        )
 
     run_embedding_correctness_test(hf_model, input_texts, base64_data)
 
     # Default response is float32 decoded from base64 by OpenAI Client
-    responses_default = await client.embeddings.create(input=input_texts,
-                                                       model=model_name)
+    responses_default = await client.embeddings.create(
+        input=input_texts, model=model_name
+    )
     default_data = [d.embedding for d in responses_default.data]
     run_embedding_correctness_test(hf_model, input_texts, default_data)
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
-                                           model_name: str):
+async def test_base64_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            responses_base64 = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "base64",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            base64_data = []
+            for data in responses_base64.json()["data"]:
+                binary = base64.b64decode(data["embedding"])
+                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
+                base64_data.append(tensor.to(torch.float32).tolist())
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=base64_data,
+                name_0="float_data",
+                name_1="base64_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_bytes_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            metadata = json.loads(responses_bytes.headers["metadata"])
+            body = responses_bytes.content
+            items = [MetadataItem(**x) for x in metadata["data"]]
+
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
+async def test_params_not_supported(
+    server: RemoteOpenAIServer, model_name: str, param_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_base64 = requests.post(
+        server.url_for("/v1/embeddings"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "base64",
+            param_name: f"bad_{param_name}",
+        },
+    )
+
+    assert responses_base64.status_code == 400
+    assert "literal_error" in responses_base64.json()["error"]["message"]
+    assert f"bad_{param_name}" in responses_base64.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
     input_texts = [
         "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
     ]
 
     # test single embedding
     embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        extra_body={"truncate_prompt_tokens": 10})
+        model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
+    )
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -256,15 +388,34 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 10
 
     input_tokens = [
-        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
-        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
+        1,
+        24428,
+        289,
+        18341,
+        26165,
+        285,
+        19323,
+        283,
+        289,
+        26789,
+        3871,
+        28728,
+        9901,
+        340,
+        2229,
+        385,
+        340,
+        315,
+        28741,
+        28804,
+        2,
     ]
     embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        extra_body={"truncate_prompt_tokens": 10})
+        model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
+    )
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -276,8 +427,9 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
-                                                   model_name: str):
+async def test_single_embedding_truncation_invalid(
+    client: openai.AsyncOpenAI, model_name: str
+):
     input_texts = [
         "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
     ]
@@ -286,15 +438,17 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
         response = await client.embeddings.create(
             model=model_name,
             input=input_texts,
-            extra_body={"truncate_prompt_tokens": 8193})
+            extra_body={"truncate_prompt_tokens": 8193},
+        )
         assert "error" in response.object
-        assert "truncate_prompt_tokens value is greater than max_model_len. "\
-               "Please, select a smaller truncation size." in response.message
+        assert (
+            "truncate_prompt_tokens value is greater than max_model_len. "
+            "Please, select a smaller truncation size." in response.message
+        )
 
 
 @pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer,
-                           client: openai.AsyncOpenAI):
+async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
     input_texts = [
         "The chef prepared a delicious meal.",
     ]
@@ -307,35 +461,43 @@ async def test_invocations(server: RemoteOpenAIServer,
 
     completion_response = await client.embeddings.create(**request_args)
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     completion_output = completion_response.model_dump()
     invocation_output = invocation_response.json()
 
     assert completion_output.keys() == invocation_output.keys()
-    for completion_data, invocation_data in zip(completion_output["data"],
-                                                invocation_output["data"]):
+    for completion_data, invocation_data in zip(
+        completion_output["data"], invocation_output["data"]
+    ):
         assert completion_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
-                               embeddings_1_lst=[invocation_data["embedding"]],
-                               name_0="completion",
-                               name_1="invocation")
+        check_embeddings_close(
+            embeddings_0_lst=[completion_data["embedding"]],
+            embeddings_1_lst=[invocation_data["embedding"]],
+            name_0="completion",
+            name_1="invocation",
+        )
 
 
 @pytest.mark.asyncio
 async def test_invocations_conversation(server: RemoteOpenAIServer):
-    messages = [{
-        "role": "user",
-        "content": "The cat sat on the mat.",
-    }, {
-        "role": "assistant",
-        "content": "A feline was resting on a rug.",
-    }, {
-        "role": "user",
-        "content": "Stars twinkle brightly in the night sky.",
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
 
     request_args = {
         "model": MODEL_NAME,
@@ -343,25 +505,28 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
         "encoding_format": "float",
     }
 
-    chat_response = requests.post(server.url_for("v1/embeddings"),
-                                  json=request_args)
+    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
     chat_response.raise_for_status()
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     chat_output = chat_response.json()
     invocation_output = invocation_response.json()
 
     assert chat_output.keys() == invocation_output.keys()
-    for chat_data, invocation_data in zip(chat_output["data"],
-                                          invocation_output["data"]):
+    for chat_data, invocation_data in zip(
+        chat_output["data"], invocation_output["data"]
+    ):
         assert chat_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
-                               embeddings_1_lst=[invocation_data["embedding"]],
-                               name_0="chat",
-                               name_1="invocation")
+        check_embeddings_close(
+            embeddings_0_lst=[chat_data["embedding"]],
+            embeddings_1_lst=[invocation_data["embedding"]],
+            name_0="chat",
+            name_1="invocation",
+        )
 
 
 @pytest.mark.asyncio
@@ -374,23 +539,88 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
             "model": MODEL_NAME,
             "input": input_text,
             "encoding_format": "float",
-            "normalize": normalize
+            "normalize": normalize,
         }
 
-        response = requests.post(server.url_for("v1/embeddings"),
-                                 json=request_args)
+        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
         outputs = response.json()
 
-        return torch.tensor([x['embedding'] for x in outputs["data"]])
+        return torch.tensor([x["embedding"] for x in outputs["data"]])
 
     default = await get_outputs(normalize=None)
     w_normal = await get_outputs(normalize=True)
     wo_normal = await get_outputs(normalize=False)
 
-    assert torch.allclose(default, w_normal,
-                          atol=1e-2), "Default should use normal."
-    assert not torch.allclose(w_normal, wo_normal,
-                              atol=1e-2), "wo_normal should not use normal."
-    assert torch.allclose(
-        w_normal, F.normalize(wo_normal, p=2, dim=-1),
-        atol=1e-2), "w_normal should be close to normal(wo_normal)."
+    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
+    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
+        "wo_normal should not use normal."
+    )
+    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
+        "w_normal should be close to normal(wo_normal)."
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
+    task = "embed"
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_text,
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 384
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
+    task = "token_embed"
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_text,
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 384
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
+async def test_pooling_not_supported(
+    server: RemoteOpenAIServer, model_name: str, task: str
+):
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": "test",
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"
+    assert response.json()["error"]["message"].startswith(
+        f"Task {task} is not supported"
+    )
diff --git a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
index 3c7e88daa8ff3a394040b7a5c78727bab0e32acc..ba9fb642627725e220c17fdf52d57514027ae522 100644
--- a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
@@ -4,23 +4,22 @@
 Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
 """
 
-from typing import Optional
-
 import openai
 import pytest
 
 from tests.conftest import HfRunner
-from tests.models.language.pooling.embed_utils import (
-    run_embedding_correctness_test)
+from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
 from tests.models.utils import EmbedModelInfo
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
 MODELS = [
     EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
-    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
-                   is_matryoshka=True,
-                   matryoshka_dimensions=[256]),
+    EmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m-v1.5",
+        is_matryoshka=True,
+        matryoshka_dimensions=[256],
+    ),
 ]
 
 input_texts = [
@@ -48,15 +47,14 @@ def server(model_info, dtype: str):
         dtype,
         "--enforce-eager",
         "--max-model-len",
-        "512"
+        "512",
     ]
 
     if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5":
         # Manually enable Matryoshka Embeddings
-        args.extend([
-            "--trust_remote_code", "--hf_overrides",
-            '{"matryoshka_dimensions":[256]}'
-        ])
+        args.extend(
+            ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
+        )
 
     with RemoteOpenAIServer(model_info.name, args) as remote_server:
         yield remote_server
@@ -64,14 +62,16 @@ def server(model_info, dtype: str):
 
 @pytest.fixture(scope="module")
 def hf_model(hf_runner, model_info, dtype: str):
-    with hf_runner(model_info.name, dtype=dtype,
-                   is_sentence_transformer=True) as hf_model:
+    with hf_runner(
+        model_info.name, dtype=dtype, is_sentence_transformer=True
+    ) as hf_model:
         yield hf_model
 
 
 @pytest.mark.asyncio
-async def test_matryoshka(model_info: EmbedModelInfo,
-                          server: RemoteOpenAIServer, hf_model: HfRunner):
+async def test_matryoshka(
+    model_info: EmbedModelInfo, server: RemoteOpenAIServer, hf_model: HfRunner
+):
     client = server.get_async_client()
 
     async def make_request_and_correctness_test(dimensions):
@@ -84,7 +84,8 @@ async def test_matryoshka(model_info: EmbedModelInfo,
             encoding_format="float",
         )
         embeddings = EmbeddingResponse.model_validate(
-            embedding_response.model_dump(mode="json"))
+            embedding_response.model_dump(mode="json")
+        )
 
         assert embeddings.id is not None
         assert len(embeddings.data) == 3
@@ -97,18 +98,17 @@ async def test_matryoshka(model_info: EmbedModelInfo,
             assert len(embeddings.data[0].embedding) == dimensions
 
         vllm_outputs = [d.embedding for d in embeddings.data]
-        run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
-                                       dimensions)
+        run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
 
     if model_info.is_matryoshka:
-        valid_dimensions: list[Optional[int]] = [None]
+        valid_dimensions: list[int | None] = [None]
         if model_info.matryoshka_dimensions is not None:
             valid_dimensions += model_info.matryoshka_dimensions[:2]
 
         for dimensions in valid_dimensions:
             await make_request_and_correctness_test(dimensions)
 
-        invalid_dimensions: list[Optional[int]] = [-1]
+        invalid_dimensions: list[int | None] = [-1]
         if model_info.matryoshka_dimensions is not None:
             assert 5 not in model_info.matryoshka_dimensions
             invalid_dimensions.append(5)
diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py
index ab5f765c28ed64549d9499872cedce187c377a4c..f977c81a9084e6f5ef81163f2b8b3098114b3b15 100644
--- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py
+++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py
@@ -31,7 +31,6 @@ def _generate_random_text(word_count: int) -> str:
         "that",
         "these",
         "those",
-
         # Action verbs
         "create",
         "build",
@@ -80,7 +79,6 @@ def _generate_random_text(word_count: int) -> str:
         "finish",
         "deliver",
         "provide",
-
         # Technology and science nouns
         "system",
         "application",
@@ -132,7 +130,6 @@ def _generate_random_text(word_count: int) -> str:
         "optimization",
         "performance",
         "efficiency",
-
         # General nouns
         "project",
         "team",
@@ -175,7 +172,7 @@ def _generate_random_text(word_count: int) -> str:
         "session",
         "meeting",
         "discussion",
-        "decision"
+        "decision",
     ]
 
     words = []
@@ -189,7 +186,7 @@ def _generate_random_text(word_count: int) -> str:
     result = []
     for i, word in enumerate(words_list):
         result.append(word)
-        if ((i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1):
+        if (i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1:
             result[-1] += "."
 
     return " ".join(result)
@@ -216,9 +213,11 @@ def server_with_chunked_processing():
         "--enforce-eager",
         "--max-model-len",
         "512",  # Set smaller max_model_len to trigger chunking mechanism
-        '--pooler-config',
-        ('{"pooling_type": "MEAN", "normalize": true, '
-         '"enable_chunked_processing": true, "max_embed_len": 10000}'),
+        "--pooler-config",
+        (
+            '{"pooling_type": "MEAN", "normalize": true, '
+            '"enable_chunked_processing": true, "max_embed_len": 10000}'
+        ),
         "--gpu-memory-utilization",
         "0.8",
     ]
@@ -230,23 +229,22 @@ def server_with_chunked_processing():
 @pytest_asyncio.fixture
 async def client_with_chunked_processing(server_with_chunked_processing):
     """Create async client with chunking processing support."""
-    async with server_with_chunked_processing.get_async_client(
-    ) as async_client:
+    async with server_with_chunked_processing.get_async_client() as async_client:
         yield async_client
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_long_text_embedding_1500_chars(
-        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
-    """Test embedding processing for ~1500 character long text 
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
+    """Test embedding processing for ~1500 character long text
     (~1028 tokens, exceeding 512 token limit)."""
 
     # Verify text length
     # Verify text has sufficient word count (approximately 1500 words)
     word_count = len(LONG_TEXT_1500_WORDS.split())
-    assert word_count >= 1400, (
-        f"Test text word count insufficient: {word_count} words")
+    assert word_count >= 1400, f"Test text word count insufficient: {word_count} words"
 
     # Send embedding request
     embedding_response = await client_with_chunked_processing.embeddings.create(
@@ -257,12 +255,14 @@ async def test_long_text_embedding_1500_chars(
 
     # Verify response structure
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding
-               ) == 384  # multilingual-e5-small embedding dimension
+    assert (
+        len(embeddings.data[0].embedding) == 384
+    )  # multilingual-e5-small embedding dimension
     assert embeddings.usage.completion_tokens == 0
     # Due to chunked processing, token count should
     # reflect actual processed tokens
@@ -274,26 +274,26 @@ async def test_long_text_embedding_1500_chars(
 
     # Verify embedding vector validity
     embedding_vector = embeddings.data[0].embedding
-    assert all(
-        isinstance(x, float)
-        for x in embedding_vector), "Embedding vector should contain floats"
-    assert not all(
-        x == 0
-        for x in embedding_vector), "Embedding vector should not be all zeros"
+    assert all(isinstance(x, float) for x in embedding_vector), (
+        "Embedding vector should contain floats"
+    )
+    assert not all(x == 0 for x in embedding_vector), (
+        "Embedding vector should not be all zeros"
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_long_text_embedding_2500_chars(
-        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
     """Test embedding processing for ~2500 character long text
     (~2048 tokens, requiring multiple chunks)."""
 
     # Verify text length
     # Verify text has sufficient word count (approximately 2500 words)
     word_count = len(LONG_TEXT_2500_WORDS.split())
-    assert word_count >= 2300, (
-        f"Test text word count insufficient: {word_count} words")
+    assert word_count >= 2300, f"Test text word count insufficient: {word_count} words"
 
     # Send embedding request
     embedding_response = await client_with_chunked_processing.embeddings.create(
@@ -304,12 +304,14 @@ async def test_long_text_embedding_2500_chars(
 
     # Verify response structure
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding
-               ) == 384  # multilingual-e5-small embedding dimension
+    assert (
+        len(embeddings.data[0].embedding) == 384
+    )  # multilingual-e5-small embedding dimension
     assert embeddings.usage.completion_tokens == 0
     # Due to chunked processing, token count should
     # reflect actual processed tokens
@@ -321,18 +323,19 @@ async def test_long_text_embedding_2500_chars(
 
     # Verify embedding vector validity
     embedding_vector = embeddings.data[0].embedding
-    assert all(
-        isinstance(x, float)
-        for x in embedding_vector), "Embedding vector should contain floats"
-    assert not all(
-        x == 0
-        for x in embedding_vector), "Embedding vector should not be all zeros"
+    assert all(isinstance(x, float) for x in embedding_vector), (
+        "Embedding vector should contain floats"
+    )
+    assert not all(x == 0 for x in embedding_vector), (
+        "Embedding vector should not be all zeros"
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_batch_long_text_embedding(
-        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
     """Test batch long text embedding processing."""
 
     input_texts = [
@@ -350,7 +353,8 @@ async def test_batch_long_text_embedding(
 
     # Verify response structure
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 3  # Three input texts
@@ -375,13 +379,16 @@ async def test_batch_long_text_embedding(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_chunked_vs_normal_consistency(
-        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
     """Test consistency between chunked and
     normal processing (using short text)."""
 
     # Use a short text within the 512 token limit
-    short_text = ("Artificial intelligence technology is changing our world, "
-                  "bringing unprecedented opportunities and challenges.")
+    short_text = (
+        "Artificial intelligence technology is changing our world, "
+        "bringing unprecedented opportunities and challenges."
+    )
 
     # Send embedding request
     embedding_response = await client_with_chunked_processing.embeddings.create(
@@ -392,7 +399,8 @@ async def test_chunked_vs_normal_consistency(
 
     # Verify response structure
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -411,7 +419,8 @@ async def test_chunked_vs_normal_consistency(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_chunked_processing_response_format(
-        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
     """Test response format and structure during chunked processing."""
 
     # Test with long text to trigger chunking
@@ -423,7 +432,8 @@ async def test_chunked_processing_response_format(
 
     # Verify response structure
     embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
+        embedding_response.model_dump(mode="json")
+    )
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -433,8 +443,10 @@ async def test_chunked_processing_response_format(
     # Verify embedding vector properties
     embedding_vector = embeddings.data[0].embedding
     import math
+
     vector_norm = math.sqrt(sum(x * x for x in embedding_vector))
     # Check that the vector is normalized
     # (default behavior for most embedding models)
     assert 0.8 < vector_norm < 1.2, (
-        f"Vector norm should be reasonable, actual: {vector_norm}")
+        f"Vector norm should be reasonable, actual: {vector_norm}"
+    )
diff --git a/tests/entrypoints/pooling/openai/test_pooling.py b/tests/entrypoints/pooling/openai/test_pooling.py
index 9f58955cfb40bec639ef3a4f9247c1834d5aad70..4b20c5b0fa84d0e48ba2143a8a9b97c65b343025 100644
--- a/tests/entrypoints/pooling/openai/test_pooling.py
+++ b/tests/entrypoints/pooling/openai/test_pooling.py
@@ -2,15 +2,24 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
+import json
 
 import numpy as np
 import pytest
 import requests
+import torch
 
 from tests.models.utils import check_embeddings_close
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import PoolingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    MetadataItem,
+    binary2tensor,
+    decode_pooling_output,
+)
 
 MODEL_NAME = "internlm/internlm2-1_8b-reward"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
@@ -46,11 +55,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
     # test single pooling
     response = requests.post(
         server.url_for("pooling"),
-        json={
-            "model": model_name,
-            "input": input_texts,
-            "encoding_format": "float"
-        },
+        json={"model": model_name, "input": input_texts, "encoding_format": "float"},
     )
     response.raise_for_status()
     poolings = PoolingResponse.model_validate(response.json())
@@ -66,11 +71,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
     input_tokens = [1, 1, 1, 1, 1]
     response = requests.post(
         server.url_for("pooling"),
-        json={
-            "model": model_name,
-            "input": input_tokens,
-            "encoding_format": "float"
-        },
+        json={"model": model_name, "input": input_tokens, "encoding_format": "float"},
     )
     response.raise_for_status()
     poolings = PoolingResponse.model_validate(response.json())
@@ -88,16 +89,13 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
 async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
     # test list[str]
     input_texts = [
-        "The cat sat on the mat.", "A feline was resting on a rug.",
-        "Stars twinkle brightly in the night sky."
+        "The cat sat on the mat.",
+        "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky.",
     ]
     response = requests.post(
         server.url_for("pooling"),
-        json={
-            "model": model_name,
-            "input": input_texts,
-            "encoding_format": "float"
-        },
+        json={"model": model_name, "input": input_texts, "encoding_format": "float"},
     )
     response.raise_for_status()
     poolings = PoolingResponse.model_validate(response.json())
@@ -110,15 +108,15 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
     assert poolings.usage.total_tokens == 29
 
     # test list[list[int]]
-    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-                    [25, 32, 64, 77]]
+    input_tokens = [
+        [4, 5, 7, 9, 20],
+        [15, 29, 499],
+        [24, 24, 24, 24, 24],
+        [25, 32, 64, 77],
+    ]
     response = requests.post(
         server.url_for("pooling"),
-        json={
-            "model": model_name,
-            "input": input_tokens,
-            "encoding_format": "float"
-        },
+        json={"model": model_name, "input": input_tokens, "encoding_format": "float"},
     )
     response.raise_for_status()
     poolings = PoolingResponse.model_validate(response.json())
@@ -133,18 +131,21 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_conversation_pooling(server: RemoteOpenAIServer,
-                                    model_name: str):
-    messages = [{
-        "role": "user",
-        "content": "The cat sat on the mat.",
-    }, {
-        "role": "assistant",
-        "content": "A feline was resting on a rug.",
-    }, {
-        "role": "user",
-        "content": "Stars twinkle brightly in the night sky.",
-    }]
+async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
 
     chat_response = requests.post(
         server.url_for("pooling"),
@@ -180,24 +181,22 @@ async def test_conversation_pooling(server: RemoteOpenAIServer,
         },
     )
     completions_response.raise_for_status()
-    completion_poolings = PoolingResponse.model_validate(
-        completions_response.json())
+    completion_poolings = PoolingResponse.model_validate(completions_response.json())
 
     assert chat_poolings.id is not None
     assert completion_poolings.id is not None
     assert chat_poolings.created <= completion_poolings.created
-    assert chat_poolings.model_dump(
-        exclude={"id", "created"}) == (completion_poolings.model_dump(
-            exclude={"id", "created"}))
+    assert chat_poolings.model_dump(exclude={"id", "created"}) == (
+        completion_poolings.model_dump(exclude={"id", "created"})
+    )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_base64_pooling(server: RemoteOpenAIServer,
-                                    model_name: str):
+async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str):
     input_texts = [
         "Hello my name is",
-        "The best thing about vLLM is that it supports many different models"
+        "The best thing about vLLM is that it supports many different models",
     ]
 
     float_response = requests.post(
@@ -210,9 +209,7 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
     )
     float_response.raise_for_status()
     responses_float = PoolingResponse.model_validate(float_response.json())
-    float_data = [
-        np.array(d.data).squeeze(-1).tolist() for d in responses_float.data
-    ]
+    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
 
     base64_response = requests.post(
         server.url_for("pooling"),
@@ -228,13 +225,15 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
     decoded_responses_base64_data = []
     for data in responses_base64.data:
         decoded_responses_base64_data.append(
-            np.frombuffer(base64.b64decode(data.data),
-                          dtype="float32").tolist())
-
-    check_embeddings_close(embeddings_0_lst=float_data,
-                           embeddings_1_lst=decoded_responses_base64_data,
-                           name_0="float32",
-                           name_1="base64")
+            np.frombuffer(base64.b64decode(data.data), dtype="float32").tolist()
+        )
+
+    check_embeddings_close(
+        embeddings_0_lst=float_data,
+        embeddings_1_lst=decoded_responses_base64_data,
+        name_0="float32",
+        name_1="base64",
+    )
 
     # Default response is float32 decoded from base64 by OpenAI Client
     default_response = requests.post(
@@ -250,10 +249,136 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
         np.array(d.data).squeeze(-1).tolist() for d in responses_default.data
     ]
 
-    check_embeddings_close(embeddings_0_lst=float_data,
-                           embeddings_1_lst=default_data,
-                           name_0="float32",
-                           name_1="default")
+    check_embeddings_close(
+        embeddings_0_lst=float_data,
+        embeddings_1_lst=default_data,
+        name_0="float32",
+        name_1="default",
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_base64_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    url = server.url_for("pooling")
+    float_response = requests.post(
+        url,
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float",
+        },
+    )
+    responses_float = PoolingResponse.model_validate(float_response.json())
+    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
+
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            responses_base64 = requests.post(
+                url,
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "base64",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            base64_data = []
+            for data in responses_base64.json()["data"]:
+                binary = base64.b64decode(data["data"])
+                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
+                base64_data.append(tensor.to(torch.float32).tolist())
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=base64_data,
+                name_0="float_data",
+                name_1="base64_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_bytes_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    url = server.url_for("pooling")
+    float_response = requests.post(
+        url,
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float",
+        },
+    )
+    responses_float = PoolingResponse.model_validate(float_response.json())
+    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
+
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                url,
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            metadata = json.loads(responses_bytes.headers["metadata"])
+            body = responses_bytes.content
+            items = [MetadataItem(**x) for x in metadata["data"]]
+
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
+async def test_params_not_supported(
+    server: RemoteOpenAIServer, model_name: str, param_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_base64 = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "base64",
+            param_name: f"bad_{param_name}",
+        },
+    )
+
+    assert responses_base64.status_code == 400
+    assert "literal_error" in responses_base64.json()["error"]["message"]
+    assert f"bad_{param_name}" in responses_base64.json()["error"]["message"]
 
 
 @pytest.mark.asyncio
@@ -268,39 +393,46 @@ async def test_invocations(server: RemoteOpenAIServer):
         "encoding_format": "float",
     }
 
-    completion_response = requests.post(server.url_for("pooling"),
-                                        json=request_args)
+    completion_response = requests.post(server.url_for("pooling"), json=request_args)
     completion_response.raise_for_status()
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     completion_output = completion_response.json()
     invocation_output = invocation_response.json()
 
     assert completion_output.keys() == invocation_output.keys()
-    for completion_data, invocation_data in zip(completion_output["data"],
-                                                invocation_output["data"]):
+    for completion_data, invocation_data in zip(
+        completion_output["data"], invocation_output["data"]
+    ):
         assert completion_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=completion_data["data"],
-                               embeddings_1_lst=invocation_data["data"],
-                               name_0="completion",
-                               name_1="invocation")
+        check_embeddings_close(
+            embeddings_0_lst=completion_data["data"],
+            embeddings_1_lst=invocation_data["data"],
+            name_0="completion",
+            name_1="invocation",
+        )
 
 
 @pytest.mark.asyncio
 async def test_invocations_conversation(server: RemoteOpenAIServer):
-    messages = [{
-        "role": "user",
-        "content": "The cat sat on the mat.",
-    }, {
-        "role": "assistant",
-        "content": "A feline was resting on a rug.",
-    }, {
-        "role": "user",
-        "content": "Stars twinkle brightly in the night sky.",
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
 
     request_args = {
         "model": MODEL_NAME,
@@ -311,18 +443,22 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
     chat_response = requests.post(server.url_for("pooling"), json=request_args)
     chat_response.raise_for_status()
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     chat_output = chat_response.json()
     invocation_output = invocation_response.json()
 
     assert chat_output.keys() == invocation_output.keys()
-    for chat_data, invocation_data in zip(chat_output["data"],
-                                          invocation_output["data"]):
+    for chat_data, invocation_data in zip(
+        chat_output["data"], invocation_output["data"]
+    ):
         assert chat_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=chat_data["data"],
-                               embeddings_1_lst=invocation_data["data"],
-                               name_0="chat",
-                               name_1="invocation")
+        check_embeddings_close(
+            embeddings_0_lst=chat_data["data"],
+            embeddings_1_lst=invocation_data["data"],
+            name_0="chat",
+            name_1="invocation",
+        )
diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py
index 992cb5147ef0d516557d38d4f5ac6db616b3f829..1d85190c12a191a3aa2042e0dcd2a7dd47e74121 100644
--- a/tests/entrypoints/pooling/openai/test_rerank.py
+++ b/tests/entrypoints/pooling/openai/test_rerank.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn.functional as F
 
 from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import RerankResponse
+from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse
 
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
@@ -25,15 +25,18 @@ def server():
 def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
     query = "What is the capital of France?"
     documents = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.",
     ]
 
-    rerank_response = requests.post(server.url_for("rerank"),
-                                    json={
-                                        "model": model_name,
-                                        "query": query,
-                                        "documents": documents,
-                                    })
+    rerank_response = requests.post(
+        server.url_for("rerank"),
+        json={
+            "model": model_name,
+            "query": query,
+            "documents": documents,
+        },
+    )
     rerank_response.raise_for_status()
     rerank = RerankResponse.model_validate(rerank_response.json())
 
@@ -49,16 +52,14 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
     query = "What is the capital of France?"
     documents = [
         "The capital of Brazil is Brasilia.",
-        "The capital of France is Paris.", "Cross-encoder models are neat"
+        "The capital of France is Paris.",
+        "Cross-encoder models are neat",
     ]
 
-    rerank_response = requests.post(server.url_for("rerank"),
-                                    json={
-                                        "model": model_name,
-                                        "query": query,
-                                        "documents": documents,
-                                        "top_n": 2
-                                    })
+    rerank_response = requests.post(
+        server.url_for("rerank"),
+        json={"model": model_name, "query": query, "documents": documents, "top_n": 2},
+    )
     rerank_response.raise_for_status()
     rerank = RerankResponse.model_validate(rerank_response.json())
 
@@ -71,28 +72,26 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
-
     query = "What is the capital of France?" * 100
     documents = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.",
     ]
 
-    rerank_response = requests.post(server.url_for("rerank"),
-                                    json={
-                                        "model": model_name,
-                                        "query": query,
-                                        "documents": documents
-                                    })
+    rerank_response = requests.post(
+        server.url_for("rerank"),
+        json={"model": model_name, "query": query, "documents": documents},
+    )
     assert rerank_response.status_code == 400
     # Assert just a small fragments of the response
-    assert "Please reduce the length of the input." in \
-        rerank_response.text
+    assert "Please reduce the length of the input." in rerank_response.text
 
 
 def test_invocations(server: RemoteOpenAIServer):
     query = "What is the capital of France?"
     documents = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.",
     ]
 
     request_args = {
@@ -101,58 +100,118 @@ def test_invocations(server: RemoteOpenAIServer):
         "documents": documents,
     }
 
-    rerank_response = requests.post(server.url_for("rerank"),
-                                    json=request_args)
+    rerank_response = requests.post(server.url_for("rerank"), json=request_args)
     rerank_response.raise_for_status()
 
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
     invocation_response.raise_for_status()
 
     rerank_output = rerank_response.json()
     invocation_output = invocation_response.json()
 
     assert rerank_output.keys() == invocation_output.keys()
-    for rerank_result, invocations_result in zip(rerank_output["results"],
-                                                 invocation_output["results"]):
+    for rerank_result, invocations_result in zip(
+        rerank_output["results"], invocation_output["results"]
+    ):
         assert rerank_result.keys() == invocations_result.keys()
         assert rerank_result["relevance_score"] == pytest.approx(
-            invocations_result["relevance_score"], rel=0.05)
+            invocations_result["relevance_score"], rel=0.05
+        )
         # TODO: reset this tolerance to 0.01 once we find
         # an alternative to flash_attn with bfloat16
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_activation(server: RemoteOpenAIServer, model_name: str):
-
-    async def get_outputs(activation):
+async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
+    async def get_outputs(use_activation):
         query = "What is the capital of France?"
         documents = [
             "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
+            "The capital of France is Paris.",
         ]
 
-        response = requests.post(server.url_for("rerank"),
-                                 json={
-                                     "model": model_name,
-                                     "query": query,
-                                     "documents": documents,
-                                     "activation": activation
-                                 })
+        response = requests.post(
+            server.url_for("rerank"),
+            json={
+                "model": model_name,
+                "query": query,
+                "documents": documents,
+                "use_activation": use_activation,
+            },
+        )
         outputs = response.json()
 
-        return torch.tensor([x['relevance_score'] for x in outputs["results"]])
+        return torch.tensor([x["relevance_score"] for x in outputs["results"]])
+
+    default = await get_outputs(use_activation=None)
+    w_activation = await get_outputs(use_activation=True)
+    wo_activation = await get_outputs(use_activation=False)
+
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
+    )
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
+    input_text = "This product was excellent and exceeded my expectations"
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_text,
+            "encoding_format": "float",
+            "task": "classify",
+        },
+    )
+    poolings = PoolingResponse.model_validate(response.json())
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 1
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 1
 
-    default = await get_outputs(activation=None)
-    w_activation = await get_outputs(activation=True)
-    wo_activation = await get_outputs(activation=False)
 
-    assert torch.allclose(default, w_activation,
-                          atol=1e-2), "Default should use activation."
-    assert not torch.allclose(
-        w_activation, wo_activation,
-        atol=1e-2), "wo_activation should not use activation."
-    assert torch.allclose(
-        F.sigmoid(wo_activation), w_activation, atol=1e-2
-    ), "w_activation should be close to activation(wo_activation)."
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
+async def test_pooling_not_supported(
+    server: RemoteOpenAIServer, model_name: str, task: str
+):
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": "test",
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"
+    assert response.json()["error"]["message"].startswith(
+        f"Task {task} is not supported"
+    )
diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py
index d676ecccbc87c67f12b755d639e8bf7aa43120cb..b8f796d47efaa1647e20915a6081695abf8795b2 100644
--- a/tests/entrypoints/pooling/openai/test_score.py
+++ b/tests/entrypoints/pooling/openai/test_score.py
@@ -12,14 +12,8 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import ScoreResponse
 
 MODELS = [
-    {
-        "name": "BAAI/bge-reranker-v2-m3",
-        "is_cross_encoder": True
-    },
-    {
-        "name": "BAAI/bge-base-en-v1.5",
-        "is_cross_encoder": False
-    },
+    {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
+    {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
 ]
 DTYPE = "half"
 
@@ -28,9 +22,7 @@ def run_transformers(hf_model, model, text_pairs):
     if model["is_cross_encoder"]:
         return hf_model.predict(text_pairs).tolist()
     else:
-        hf_embeddings = [
-            hf_model.encode(text_pair) for text_pair in text_pairs
-        ]
+        hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
         return [
             F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
             for pair in hf_embeddings
@@ -54,8 +46,9 @@ def server(model: dict[str, Any]):
 def runner(model: dict[str, Any], hf_runner):
     kwargs = {
         "dtype": DTYPE,
-        "is_cross_encoder" if model["is_cross_encoder"]\
-              else "is_sentence_transformer": True
+        "is_cross_encoder"
+        if model["is_cross_encoder"]
+        else "is_sentence_transformer": True,
     }
 
     with hf_runner(model["name"], **kwargs) as hf_model:
@@ -63,21 +56,23 @@ def runner(model: dict[str, Any], hf_runner):
 
 
 class TestModel:
-
-    def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
-                                    model: dict[str, Any], runner):
+    def test_text_1_str_text_2_list(
+        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
+    ):
         text_1 = "What is the capital of France?"
         text_2 = [
             "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
+            "The capital of France is Paris.",
         ]
 
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
+        score_response = requests.post(
+            server.url_for("score"),
+            json={
+                "model": model["name"],
+                "text_1": text_1,
+                "text_2": text_2,
+            },
+        )
         score_response.raise_for_status()
         score = ScoreResponse.model_validate(score_response.json())
 
@@ -93,23 +88,26 @@ class TestModel:
         for i in range(len(vllm_outputs)):
             assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
 
-    def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
-                                     model: dict[str, Any], runner):
+    def test_text_1_list_text_2_list(
+        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
+    ):
         text_1 = [
             "What is the capital of the United States?",
-            "What is the capital of France?"
+            "What is the capital of France?",
         ]
         text_2 = [
             "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
+            "The capital of France is Paris.",
         ]
 
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
+        score_response = requests.post(
+            server.url_for("score"),
+            json={
+                "model": model["name"],
+                "text_1": text_1,
+                "text_2": text_2,
+            },
+        )
         score_response.raise_for_status()
         score = ScoreResponse.model_validate(score_response.json())
 
@@ -125,17 +123,20 @@ class TestModel:
         for i in range(len(vllm_outputs)):
             assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
 
-    def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
-                                   model: dict[str, Any], runner):
+    def test_text_1_str_text_2_str(
+        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
+    ):
         text_1 = "What is the capital of France?"
         text_2 = "The capital of France is Paris."
 
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
+        score_response = requests.post(
+            server.url_for("score"),
+            json={
+                "model": model["name"],
+                "text_1": text_1,
+                "text_2": text_2,
+            },
+        )
         score_response.raise_for_status()
         score = ScoreResponse.model_validate(score_response.json())
 
@@ -151,40 +152,41 @@ class TestModel:
         for i in range(len(vllm_outputs)):
             assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
 
-    def test_score_max_model_len(self, server: RemoteOpenAIServer,
-                                 model: dict[str, Any]):
-
+    def test_score_max_model_len(
+        self, server: RemoteOpenAIServer, model: dict[str, Any]
+    ):
         text_1 = "What is the capital of France?" * 20
         text_2 = [
             "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
+            "The capital of France is Paris.",
         ]
 
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
+        score_response = requests.post(
+            server.url_for("score"),
+            json={
+                "model": model["name"],
+                "text_1": text_1,
+                "text_2": text_2,
+            },
+        )
         assert score_response.status_code == 400
         # Assert just a small fragments of the response
-        assert "Please reduce the length of the input." in \
-            score_response.text
+        assert "Please reduce the length of the input." in score_response.text
 
         # Test truncation
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                           "truncate_prompt_tokens": 101
-                                       })
+        score_response = requests.post(
+            server.url_for("score"),
+            json={
+                "model": model["name"],
+                "text_1": text_1,
+                "text_2": text_2,
+                "truncate_prompt_tokens": 101,
+            },
+        )
         assert score_response.status_code == 400
-        assert "Please, select a smaller truncation size." in \
-            score_response.text
+        assert "Please, select a smaller truncation size." in score_response.text
 
-    def test_invocations(self, server: RemoteOpenAIServer, model: dict[str,
-                                                                       Any]):
+    def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
         text_1 = "What is the capital of France?"
         text_2 = "The capital of France is Paris."
 
@@ -194,62 +196,64 @@ class TestModel:
             "text_2": text_2,
         }
 
-        score_response = requests.post(server.url_for("score"),
-                                       json=request_args)
+        score_response = requests.post(server.url_for("score"), json=request_args)
         score_response.raise_for_status()
 
-        invocation_response = requests.post(server.url_for("invocations"),
-                                            json=request_args)
+        invocation_response = requests.post(
+            server.url_for("invocations"), json=request_args
+        )
         invocation_response.raise_for_status()
 
         score_output = score_response.json()
         invocation_output = invocation_response.json()
 
         assert score_output.keys() == invocation_output.keys()
-        for score_data, invocation_data in zip(score_output["data"],
-                                               invocation_output["data"]):
+        for score_data, invocation_data in zip(
+            score_output["data"], invocation_output["data"]
+        ):
             assert score_data.keys() == invocation_data.keys()
             assert score_data["score"] == pytest.approx(
-                invocation_data["score"], rel=0.05)
+                invocation_data["score"], rel=0.05
+            )
             # TODO: reset this tolerance to 0.01 once we find
             # an alternative to flash_attn with bfloat16
 
-    def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
-                                                                      Any]):
-
-        def get_outputs(activation):
+    def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
+        def get_outputs(use_activation):
             text_1 = "What is the capital of France?"
             text_2 = "The capital of France is Paris."
-            response = requests.post(server.url_for("score"),
-                                     json={
-                                         "model": model["name"],
-                                         "text_1": text_1,
-                                         "text_2": text_2,
-                                         "activation": activation
-                                     })
+            response = requests.post(
+                server.url_for("score"),
+                json={
+                    "model": model["name"],
+                    "text_1": text_1,
+                    "text_2": text_2,
+                    "use_activation": use_activation,
+                },
+            )
             if response.status_code != 200:
                 return response
 
             outputs = response.json()
-            return torch.tensor([x['score'] for x in outputs["data"]])
+            return torch.tensor([x["score"] for x in outputs["data"]])
 
         if model["is_cross_encoder"]:
-
-            default = get_outputs(activation=None)
-            w_activation = get_outputs(activation=True)
-            wo_activation = get_outputs(activation=False)
-
-            assert torch.allclose(default, w_activation,
-                                  atol=1e-2), "Default should use activation."
-            assert not torch.allclose(
-                w_activation, wo_activation,
-                atol=1e-2), "wo_activation should not use activation."
-            assert torch.allclose(
-                F.sigmoid(wo_activation), w_activation, atol=1e-2
-            ), "w_activation should be close to activation(wo_activation)."
+            default = get_outputs(use_activation=None)
+            w_activation = get_outputs(use_activation=True)
+            wo_activation = get_outputs(use_activation=False)
+
+            assert torch.allclose(default, w_activation, atol=1e-2), (
+                "Default should use activation."
+            )
+            assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+                "wo_activation should not use activation."
+            )
+            assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
+                "w_activation should be close to activation(wo_activation)."
+            )
         else:
-            get_outputs(activation=None)
+            get_outputs(use_activation=None)
 
             # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(activation=True)
+            response = get_outputs(use_activation=True)
             assert response.status_code == 400
diff --git a/tests/entrypoints/pooling/openai/test_truncation.py b/tests/entrypoints/pooling/openai/test_truncation.py
index 6bdf5ce7c4a6c91a349fb929eec3171160209b6a..6889628dc914540bb896f79b202ee4b9debb9bc1 100644
--- a/tests/entrypoints/pooling/openai/test_truncation.py
+++ b/tests/entrypoints/pooling/openai/test_truncation.py
@@ -54,12 +54,10 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
     kwargs: dict[str, Any] = {
         "model": MODEL_NAME,
         "input": input,
-        "truncate_prompt_tokens": truncation_size
+        "truncate_prompt_tokens": truncation_size,
     }
 
-    response = await client.post(path="embeddings",
-                                 cast_to=object,
-                                 body={**kwargs})
+    response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
 
     assert response["usage"]["prompt_tokens"] == truncation_size
 
@@ -70,12 +68,10 @@ async def test_zero_truncation_size(client: openai.AsyncOpenAI):
     kwargs: dict[str, Any] = {
         "model": MODEL_NAME,
         "input": input,
-        "truncate_prompt_tokens": truncation_size
+        "truncate_prompt_tokens": truncation_size,
     }
 
-    response = await client.post(path="embeddings",
-                                 cast_to=object,
-                                 body={**kwargs})
+    response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
 
     assert response["usage"]["prompt_tokens"] == truncation_size
 
@@ -86,7 +82,7 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
     kwargs: dict[str, Any] = {
         "model": MODEL_NAME,
         "input": input,
-        "truncate_prompt_tokens": truncation_size
+        "truncate_prompt_tokens": truncation_size,
     }
 
     with pytest.raises(openai.BadRequestError) as err:
@@ -95,9 +91,11 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
     assert err.value.status_code == 400
     error_details = err.value.response.json()["error"]
     assert error_details["type"] == "BadRequestError"
-    expected_message = ("truncate_prompt_tokens value is "
-                        "greater than max_model_len."
-                        " Please, select a smaller truncation size.")
+    expected_message = (
+        "truncate_prompt_tokens value is "
+        "greater than max_model_len."
+        " Please, select a smaller truncation size."
+    )
     assert error_details["message"] == expected_message
 
 
@@ -107,11 +105,9 @@ async def test_max_truncation_size(client: openai.AsyncOpenAI):
     kwargs: dict[str, Any] = {
         "model": MODEL_NAME,
         "input": input,
-        "truncate_prompt_tokens": truncation_size
+        "truncate_prompt_tokens": truncation_size,
     }
 
-    response = await client.post(path="embeddings",
-                                 cast_to=object,
-                                 body={**kwargs})
+    response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
 
     assert response["usage"]["prompt_tokens"] == max_model_len
diff --git a/tests/entrypoints/pooling/openai/test_vision_classification.py b/tests/entrypoints/pooling/openai/test_vision_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2616e057b175521d5706e72cb77213e83c8cd53
--- /dev/null
+++ b/tests/entrypoints/pooling/openai/test_vision_classification.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+import requests
+
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.openai.protocol import ClassificationResponse
+
+VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
+MAXIMUM_VIDEOS = 1
+TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
+
+HF_OVERRIDES = {
+    "text_config": {
+        "architectures": ["Qwen2_5_VLForSequenceClassification"],
+    },
+}
+
+
+@pytest.fixture(scope="module")
+def server_vlm_classify():
+    args = [
+        "--runner",
+        "pooling",
+        "--max-model-len",
+        "5000",
+        "--enforce-eager",
+        "--limit-mm-per-prompt",
+        json.dumps({"video": MAXIMUM_VIDEOS}),
+    ]
+
+    with RemoteOpenAIServer(
+        VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
+    ) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
+def test_classify_accepts_chat_text_only(
+    server_vlm_classify: RemoteOpenAIServer, model_name: str
+) -> None:
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Please classify this text request."},
+            ],
+        }
+    ]
+
+    response = requests.post(
+        server_vlm_classify.url_for("classify"),
+        json={"model": model_name, "messages": messages},
+    )
+    response.raise_for_status()
+
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == model_name
+    assert len(output.data) == 1
+    assert len(output.data[0].probs) == 2
+    assert output.usage.prompt_tokens == 22
+
+
+@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
+def test_classify_accepts_chat_video_url(
+    server_vlm_classify: RemoteOpenAIServer, model_name: str
+) -> None:
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Please classify this video."},
+                {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
+            ],
+        }
+    ]
+
+    response = requests.post(
+        server_vlm_classify.url_for("classify"),
+        json={"model": model_name, "messages": messages},
+    )
+    response.raise_for_status()
+
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == model_name
+    assert len(output.data) == 1
+    assert len(output.data[0].probs) == 2
+    assert output.usage.prompt_tokens == 4807
diff --git a/tests/entrypoints/pooling/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py
index 48434e36eb2659e27b9a26b6d9cbf8b06fcd3ae0..944392d66fa5f299bca40a76ce4de8e09ad16030 100644
--- a/tests/entrypoints/pooling/openai/test_vision_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py
@@ -14,7 +14,7 @@ from vllm.multimodal.utils import encode_image_base64, fetch_image
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
 
-vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
 assert vlm2vec_jinja_path.exists()
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -50,16 +50,15 @@ def server():
 @pytest.fixture(scope="session")
 def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url:
-        encode_image_base64(local_asset_server.get_image_asset(image_url))
+        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
         for image_url in TEST_IMAGE_ASSETS
     }
 
 
 def get_hf_prompt_tokens(model_name, content, image_url):
-    processor = AutoProcessor.from_pretrained(model_name,
-                                              trust_remote_code=True,
-                                              num_crops=4)
+    processor = AutoProcessor.from_pretrained(
+        model_name, trust_remote_code=True, num_crops=4
+    )
 
     placeholder = "<|image_1|> "
     prompt = f"{placeholder}{content}"
@@ -71,39 +70,28 @@ def get_hf_prompt_tokens(model_name, content, image_url):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
-                               image_url: str):
+async def test_image_embedding(
+    server: RemoteOpenAIServer, model_name: str, image_url: str
+):
     content_text = "Represent the given image."
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
-            {
-                "type": "text",
-                "text": content_text
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
 
     response = requests.post(
         server.url_for("v1/embeddings"),
-        json={
-            "model": model_name,
-            "messages": messages,
-            "encoding_format": "float"
-        },
+        json={"model": model_name, "messages": messages, "encoding_format": "float"},
     )
     response.raise_for_status()
     embeddings = EmbeddingResponse.model_validate(response.json())
 
-    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text,
-                                            image_url)
+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
diff --git a/tests/fastsafetensors_loader/__init__.py b/tests/entrypoints/sagemaker/__init__.py
similarity index 100%
rename from tests/fastsafetensors_loader/__init__.py
rename to tests/entrypoints/sagemaker/__init__.py
diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c859c2527d25819fdabf947bc4125ab43beef4e
--- /dev/null
+++ b/tests/entrypoints/sagemaker/conftest.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Shared fixtures and utilities for SageMaker tests."""
+
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+# Model name constants used across tests
+MODEL_NAME_ZEPHYR = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
+LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese"
+
+# SageMaker header constants
+HEADER_SAGEMAKER_CLOSED_SESSION_ID = "X-Amzn-SageMaker-Closed-Session-Id"
+HEADER_SAGEMAKER_SESSION_ID = "X-Amzn-SageMaker-Session-Id"
+HEADER_SAGEMAKER_NEW_SESSION_ID = "X-Amzn-SageMaker-New-Session-Id"
+
+
+@pytest.fixture(scope="session")
+def smollm2_lora_files():
+    """Download LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id=LORA_ADAPTER_NAME_SMOLLM)
+
+
+@pytest.fixture(scope="module")
+def basic_server_with_lora(smollm2_lora_files):
+    """Basic server fixture with standard configuration."""
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--max-lora-rank",
+        "256",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+    with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=envs) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def async_client(basic_server_with_lora: RemoteOpenAIServer):
+    """Async OpenAI client fixture for use with basic_server."""
+    async with basic_server_with_lora.get_async_client() as async_client:
+        yield async_client
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py b/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d4f8e885824abd0ac9bd9e3f4d87da9e1300a3b
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py
@@ -0,0 +1,734 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Integration tests for handler override functionality.
+
+Tests real customer usage scenarios:
+- Using @custom_ping_handler and @custom_invocation_handler decorators
+  to override handlers
+- Setting environment variables for handler specifications
+- Writing customer scripts with custom_sagemaker_ping_handler() and
+  custom_sagemaker_invocation_handler() functions
+- Priority: env vars > decorators > customer script files > framework
+  defaults
+
+Note: These tests focus on validating server responses rather than directly calling
+get_ping_handler() and get_invoke_handler() to ensure full integration testing.
+"""
+
+import os
+import tempfile
+
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import (
+    MODEL_NAME_SMOLLM,
+)
+
+
+class TestHandlerOverrideIntegration:
+    """Integration tests simulating real customer usage scenarios.
+
+    Each test simulates a fresh server startup where customers:
+    - Use @custom_ping_handler and @custom_invocation_handler decorators
+    - Set environment variables (CUSTOM_FASTAPI_PING_HANDLER, etc.)
+    - Write customer scripts with custom_sagemaker_ping_handler() and
+      custom_sagemaker_invocation_handler() functions
+    """
+
+    def setup_method(self):
+        """Setup for each test - simulate fresh server startup."""
+        self._clear_caches()
+        self._clear_env_vars()
+
+    def teardown_method(self):
+        """Cleanup after each test."""
+        self._clear_env_vars()
+
+    def _clear_caches(self):
+        """Clear handler registry and function loader cache."""
+        try:
+            from model_hosting_container_standards.common.handler import (
+                handler_registry,
+            )
+            from model_hosting_container_standards.sagemaker.sagemaker_loader import (
+                SageMakerFunctionLoader,
+            )
+
+            handler_registry.clear()
+            SageMakerFunctionLoader._default_function_loader = None
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+    def _clear_env_vars(self):
+        """Clear SageMaker environment variables."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+
+            # Clear SageMaker env vars
+            for var in [
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME,
+            ]:
+                os.environ.pop(var, None)
+
+            # Clear FastAPI env vars
+            for var in [
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER,
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER,
+            ]:
+                os.environ.pop(var, None)
+        except ImportError:
+            pass
+
+    @pytest.mark.asyncio
+    async def test_customer_script_functions_auto_loaded(self):
+        """Test customer scenario: script functions automatically override
+        framework defaults."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script file with ping() and invoke() functions
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request
+
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "customer_override", 
+        "message": "Custom ping from customer script"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Custom response from customer script"],
+        "source": "customer_override"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Customer sets SageMaker environment variables to point to their script
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Customer tests their server and sees their overrides work
+                # automatically
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Customer sees their functions are used
+                assert ping_data["source"] == "customer_override"
+                assert ping_data["message"] == "Custom ping from customer script"
+                assert invoke_data["source"] == "customer_override"
+                assert invoke_data["predictions"] == [
+                    "Custom response from customer script"
+                ]
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_customer_decorator_usage(self):
+        """Test customer scenario: using @custom_ping_handler and
+        @custom_invocation_handler decorators."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script file with decorators
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import Request
+
+@sagemaker_standards.custom_ping_handler
+async def my_ping():
+    return {
+        "type": "ping",
+        "source": "customer_decorator"
+    }
+
+@sagemaker_standards.custom_invocation_handler  
+async def my_invoke(request: Request):
+    return {
+        "type": "invoke", 
+        "source": "customer_decorator"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Customer sees their handlers are used by the server
+                assert ping_data["source"] == "customer_decorator"
+                assert invoke_data["source"] == "customer_decorator"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_handler_priority_order(self):
+        """Test priority: @custom_ping_handler/@custom_invocation_handler
+        decorators vs script functions."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script with both decorator and regular functions
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import Request
+
+# Customer uses @custom_ping_handler decorator (higher priority than script functions)
+@sagemaker_standards.custom_ping_handler
+async def decorated_ping():
+    return {
+        "status": "healthy",
+        "source": "ping_decorator_in_script", 
+        "priority": "decorator"
+    }
+
+# Customer also has a regular function (lower priority than
+# @custom_ping_handler decorator)
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "script_function",
+        "priority": "function"
+    }
+
+# Customer has a regular invoke function
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Script function response"],
+        "source": "script_invoke_function",
+        "priority": "function"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # @custom_ping_handler decorator has higher priority than
+                # script function
+                assert ping_data["source"] == "ping_decorator_in_script"
+                assert ping_data["priority"] == "decorator"
+
+                # Script function is used for invoke
+                assert invoke_data["source"] == "script_invoke_function"
+                assert invoke_data["priority"] == "function"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_environment_variable_script_loading(self):
+        """Test that environment variables correctly specify script location
+        and loading."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script in a specific directory
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request
+
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "env_loaded_script",
+        "method": "environment_variable_loading"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Loaded via environment variables"],
+        "source": "env_loaded_script",
+        "method": "environment_variable_loading"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Test environment variable script loading
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Verify that the script was loaded via environment variables
+                assert ping_data["source"] == "env_loaded_script"
+                assert ping_data["method"] == "environment_variable_loading"
+                assert invoke_data["source"] == "env_loaded_script"
+                assert invoke_data["method"] == "environment_variable_loading"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_framework_default_handlers(self):
+        """Test that framework default handlers work when no customer
+        overrides exist."""
+        args = [
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "2048",
+            "--enforce-eager",
+            "--max-num-seqs",
+            "32",
+        ]
+
+        # Explicitly pass empty env_dict to ensure no SageMaker env vars are set
+        # This prevents pollution from previous tests
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+
+            env_dict = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: "",
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: "",
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER: "",
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER: "",
+            }
+        except ImportError:
+            env_dict = {}
+
+        with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=env_dict) as server:
+            # Test that default ping works
+            ping_response = requests.get(server.url_for("ping"))
+            assert ping_response.status_code == 200
+
+            # Test that default invocations work
+            invoke_response = requests.post(
+                server.url_for("invocations"),
+                json={
+                    "model": MODEL_NAME_SMOLLM,
+                    "messages": [{"role": "user", "content": "Hello"}],
+                    "max_tokens": 5,
+                },
+            )
+            assert invoke_response.status_code == 200
+
+    @pytest.mark.asyncio
+    async def test_handler_env_var_override(self):
+        """Test CUSTOM_FASTAPI_PING_HANDLER and CUSTOM_FASTAPI_INVOCATION_HANDLER
+        environment variable overrides."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Create a script with both env var handlers and script functions
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request, Response
+import json
+
+async def env_var_ping_handler(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "status": "healthy",
+            "source": "env_var_ping",
+            "method": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+async def env_var_invoke_handler(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "predictions": ["Environment variable response"],
+            "source": "env_var_invoke",
+            "method": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "script_ping",
+        "method": "script_function"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Script function response"],
+        "source": "script_invoke",
+        "method": "script_function"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables to override both handlers
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER: (
+                    f"{script_name}:env_var_ping_handler"
+                ),
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER: (
+                    f"{script_name}:env_var_invoke_handler"
+                ),
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test ping handler override
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                # Environment variable should override script function
+                assert ping_data["method"] == "environment_variable"
+                assert ping_data["source"] == "env_var_ping"
+
+                # Test invocation handler override
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Environment variable should override script function
+                assert invoke_data["method"] == "environment_variable"
+                assert invoke_data["source"] == "env_var_invoke"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_env_var_priority_over_decorator_and_script(self):
+        """Test that environment variables have highest priority over decorators
+        and script functions for both ping and invocation handlers."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Create a script with all three handler types for both ping and invocation
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import Request, Response
+import json
+
+# Environment variable handlers (highest priority)
+async def env_priority_ping(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "status": "healthy",
+            "source": "env_var",
+            "priority": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+async def env_priority_invoke(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "predictions": ["Environment variable response"],
+            "source": "env_var",
+            "priority": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+# Decorator handlers (medium priority)
+@sagemaker_standards.custom_ping_handler
+async def decorator_ping(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "status": "healthy",
+            "source": "decorator",
+            "priority": "decorator"
+        }),
+        media_type="application/json"
+    )
+
+@sagemaker_standards.custom_invocation_handler
+async def decorator_invoke(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "predictions": ["Decorator response"],
+            "source": "decorator",
+            "priority": "decorator"
+        }),
+        media_type="application/json"
+    )
+
+# Script functions (lowest priority)
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "script",
+        "priority": "script_function"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Script function response"],
+        "source": "script",
+        "priority": "script_function"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables to specify highest priority handlers
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER: (
+                    f"{script_name}:env_priority_ping"
+                ),
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER: (
+                    f"{script_name}:env_priority_invoke"
+                ),
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test ping handler priority
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                # Environment variable has highest priority and should be used
+                assert ping_data["priority"] == "environment_variable"
+                assert ping_data["source"] == "env_var"
+
+                # Test invocation handler priority
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Environment variable has highest priority and should be used
+                assert invoke_data["priority"] == "environment_variable"
+                assert invoke_data["source"] == "env_var"
+
+        finally:
+            os.unlink(script_path)
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py b/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2867efdc584074d163bac7b229ad8f5f1dea173
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai  # use the official async_client for correctness check
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import MODEL_NAME_SMOLLM
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_load_adapter_happy_path(
+    async_client: openai.AsyncOpenAI,
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    # The SageMaker standards library creates a POST /adapters endpoint
+    # that maps to the load_lora_adapter handler with request shape:
+    # {"lora_name": "body.name", "lora_path": "body.src"}
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": "smollm2-lora-sagemaker", "src": smollm2_lora_files},
+    )
+    load_response.raise_for_status()
+
+    models = await async_client.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == smollm2_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME_SMOLLM
+    assert dynamic_lora_model.id == "smollm2-lora-sagemaker"
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_unload_adapter_happy_path(
+    async_client: openai.AsyncOpenAI,
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    # First, load an adapter
+    adapter_name = "smollm2-lora-sagemaker-unload"
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": adapter_name, "src": smollm2_lora_files},
+    )
+    load_response.raise_for_status()
+
+    # Verify it's in the models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    assert adapter_name in adapter_ids
+
+    # Now unload it using DELETE /adapters/{adapter_name}
+    # The SageMaker standards maps this to unload_lora_adapter with:
+    # {"lora_name": "path_params.adapter_name"}
+    unload_response = requests.delete(
+        basic_server_with_lora.url_for("adapters", adapter_name),
+    )
+    unload_response.raise_for_status()
+
+    # Verify it's no longer in the models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    assert adapter_name not in adapter_ids
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_load_adapter_not_found(
+    basic_server_with_lora: RemoteOpenAIServer,
+):
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": "nonexistent-adapter", "src": "/path/does/not/exist"},
+    )
+    assert load_response.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_load_adapter_invalid_files(
+    basic_server_with_lora: RemoteOpenAIServer,
+    tmp_path,
+):
+    invalid_files = tmp_path / "invalid_adapter"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("not valid json")
+
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": "invalid-adapter", "src": str(invalid_files)},
+    )
+    assert load_response.status_code == 400
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_unload_nonexistent_adapter(
+    basic_server_with_lora: RemoteOpenAIServer,
+):
+    # Attempt to unload an adapter that doesn't exist
+    unload_response = requests.delete(
+        basic_server_with_lora.url_for("adapters", "nonexistent-adapter-name"),
+    )
+    assert unload_response.status_code in (400, 404)
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_invocations_with_adapter(
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    # First, load an adapter via SageMaker endpoint
+    adapter_name = "smollm2-lora-invoke-test"
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": adapter_name, "src": smollm2_lora_files},
+    )
+    load_response.raise_for_status()
+
+    # Now test the /invocations endpoint with the adapter
+    invocation_response = requests.post(
+        basic_server_with_lora.url_for("invocations"),
+        headers={
+            "X-Amzn-SageMaker-Adapter-Identifier": adapter_name,
+        },
+        json={
+            "prompt": "Hello, how are you?",
+            "max_tokens": 10,
+        },
+    )
+    invocation_response.raise_for_status()
+    invocation_output = invocation_response.json()
+
+    # Verify we got a valid completion response
+    assert "choices" in invocation_output
+    assert len(invocation_output["choices"]) > 0
+    assert "text" in invocation_output["choices"][0]
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_multiple_adapters_load_unload(
+    async_client: openai.AsyncOpenAI,
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    adapter_names = [f"sagemaker-adapter-{i}" for i in range(5)]
+
+    # Load all adapters
+    for adapter_name in adapter_names:
+        load_response = requests.post(
+            basic_server_with_lora.url_for("adapters"),
+            json={"name": adapter_name, "src": smollm2_lora_files},
+        )
+        load_response.raise_for_status()
+
+    # Verify all are in the models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    for adapter_name in adapter_names:
+        assert adapter_name in adapter_ids
+
+    # Unload all adapters
+    for adapter_name in adapter_names:
+        unload_response = requests.delete(
+            basic_server_with_lora.url_for("adapters", adapter_name),
+        )
+        unload_response.raise_for_status()
+
+    # Verify all are removed from models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    for adapter_name in adapter_names:
+        assert adapter_name not in adapter_ids
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py b/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1ed0c7e289730e8e46437d3705a11d41a90217e
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py
@@ -0,0 +1,346 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Integration test for middleware loader functionality.
+
+Tests that customer middlewares get called correctly with a vLLM server.
+"""
+
+import os
+import tempfile
+
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import (
+    MODEL_NAME_SMOLLM,
+)
+
+
+class TestMiddlewareIntegration:
+    """Integration test for middleware with vLLM server."""
+
+    def setup_method(self):
+        """Setup for each test - simulate fresh server startup."""
+        self._clear_caches()
+
+    def _clear_caches(self):
+        """Clear middleware registry and function loader cache."""
+        try:
+            from model_hosting_container_standards.common.fastapi.middleware import (
+                middleware_registry,
+            )
+            from model_hosting_container_standards.common.fastapi.middleware.source.decorator_loader import (  # noqa: E501
+                decorator_loader,
+            )
+            from model_hosting_container_standards.sagemaker.sagemaker_loader import (
+                SageMakerFunctionLoader,
+            )
+
+            middleware_registry.clear_middlewares()
+            decorator_loader.clear()
+            SageMakerFunctionLoader._default_function_loader = None
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+    @pytest.mark.asyncio
+    async def test_customer_middleware_with_vllm_server(self):
+        """Test that customer middlewares work with actual vLLM server.
+
+        Tests decorator-based middlewares (@custom_middleware, @input_formatter,
+        @output_formatter)
+        on multiple endpoints (chat/completions, invocations).
+        """
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a middleware script with multiple decorators
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from model_hosting_container_standards.common.fastapi.middleware import (
+    custom_middleware, input_formatter, output_formatter
+)
+
+# Global flag to track if input formatter was called
+_input_formatter_called = False
+
+@input_formatter
+async def customer_input_formatter(request):
+    # Process input - mark that input formatter was called
+    global _input_formatter_called
+    _input_formatter_called = True
+    return request
+
+@custom_middleware("throttle")
+async def customer_throttle_middleware(request, call_next):
+    response = await call_next(request)
+    response.headers["X-Customer-Throttle"] = "applied"
+    order = response.headers.get("X-Middleware-Order", "")
+    response.headers["X-Middleware-Order"] = order + "throttle,"
+    return response
+
+@output_formatter
+async def customer_output_formatter(response):
+    global _input_formatter_called
+    response.headers["X-Customer-Processed"] = "true"
+    # Since input_formatter and output_formatter are combined into
+    # pre_post_process middleware,
+    # if output_formatter is called, input_formatter should have been called too
+    if _input_formatter_called:
+        response.headers["X-Input-Formatter-Called"] = "true"
+    order = response.headers.get("X-Middleware-Order", "")
+    response.headers["X-Middleware-Order"] = order + "output_formatter,"
+    return response
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables to point to customer script
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test 1: Middlewares applied to chat/completions endpoint
+                chat_response = requests.post(
+                    server.url_for("v1/chat/completions"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                        "temperature": 0.0,
+                    },
+                )
+
+                assert chat_response.status_code == 200
+
+                # Verify all middlewares were executed
+                assert "X-Customer-Throttle" in chat_response.headers
+                assert chat_response.headers["X-Customer-Throttle"] == "applied"
+                assert "X-Customer-Processed" in chat_response.headers
+                assert chat_response.headers["X-Customer-Processed"] == "true"
+
+                # Verify input formatter was called
+                assert "X-Input-Formatter-Called" in chat_response.headers
+                assert chat_response.headers["X-Input-Formatter-Called"] == "true"
+
+                # Verify middleware execution order
+                execution_order = chat_response.headers.get(
+                    "X-Middleware-Order", ""
+                ).rstrip(",")
+                order_parts = execution_order.split(",") if execution_order else []
+                assert "throttle" in order_parts
+                assert "output_formatter" in order_parts
+
+                # Test 2: Middlewares applied to invocations endpoint
+                invocations_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                        "temperature": 0.0,
+                    },
+                )
+
+                assert invocations_response.status_code == 200
+
+                # Verify all middlewares were executed
+                assert "X-Customer-Throttle" in invocations_response.headers
+                assert invocations_response.headers["X-Customer-Throttle"] == "applied"
+                assert "X-Customer-Processed" in invocations_response.headers
+                assert invocations_response.headers["X-Customer-Processed"] == "true"
+
+                # Verify input formatter was called
+                assert "X-Input-Formatter-Called" in invocations_response.headers
+                assert (
+                    invocations_response.headers["X-Input-Formatter-Called"] == "true"
+                )
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_middleware_with_ping_endpoint(self):
+        """Test that middlewares work with SageMaker ping endpoint."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a middleware script
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from model_hosting_container_standards.common.fastapi.middleware import (
+    custom_middleware
+)
+
+@custom_middleware("pre_post_process")
+async def ping_tracking_middleware(request, call_next):
+    response = await call_next(request)
+    if request.url.path == "/ping":
+        response.headers["X-Ping-Tracked"] = "true"
+    return response
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test ping endpoint with middleware
+                response = requests.get(server.url_for("ping"))
+
+                assert response.status_code == 200
+                assert "X-Ping-Tracked" in response.headers
+                assert response.headers["X-Ping-Tracked"] == "true"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_middleware_env_var_override(self):
+        """Test middleware environment variable overrides."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Create a script with middleware functions specified via env vars
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request
+
+# Global flag to track if pre_process was called
+_pre_process_called = False
+
+async def env_throttle_middleware(request, call_next):
+    response = await call_next(request)
+    response.headers["X-Env-Throttle"] = "applied"
+    return response
+
+async def env_pre_process(request: Request) -> Request:
+    # Mark that pre_process was called
+    global _pre_process_called
+    _pre_process_called = True
+    return request
+
+async def env_post_process(response):
+    global _pre_process_called
+    if hasattr(response, 'headers'):
+        response.headers["X-Env-Post-Process"] = "applied"
+        # Since pre_process and post_process are combined into
+        # pre_post_process middleware,
+        # if post_process is called, pre_process should have been called too
+        if _pre_process_called:
+            response.headers["X-Pre-Process-Called"] = "true"
+    return response
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables for middleware
+            # Use script_name with .py extension as per plugin example
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+                FastAPIEnvVars.CUSTOM_FASTAPI_MIDDLEWARE_THROTTLE: (
+                    f"{script_name}:env_throttle_middleware"
+                ),
+                FastAPIEnvVars.CUSTOM_PRE_PROCESS: f"{script_name}:env_pre_process",
+                FastAPIEnvVars.CUSTOM_POST_PROCESS: f"{script_name}:env_post_process",
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                response = requests.get(server.url_for("ping"))
+                assert response.status_code == 200
+
+                # Check if environment variable middleware was applied
+                headers = response.headers
+
+                # Verify that env var middlewares were applied
+                assert "X-Env-Throttle" in headers, (
+                    "Throttle middleware should be applied via env var"
+                )
+                assert headers["X-Env-Throttle"] == "applied"
+
+                assert "X-Env-Post-Process" in headers, (
+                    "Post-process middleware should be applied via env var"
+                )
+                assert headers["X-Env-Post-Process"] == "applied"
+
+                # Verify that pre_process was called
+                assert "X-Pre-Process-Called" in headers, (
+                    "Pre-process should be called via env var"
+                )
+                assert headers["X-Pre-Process-Called"] == "true"
+
+        finally:
+            os.unlink(script_path)
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py b/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py
new file mode 100644
index 0000000000000000000000000000000000000000..6206000385bd90de658e4fede4c06f1d9c42d2c2
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import openai  # use the official client for correctness check
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import (
+    HEADER_SAGEMAKER_CLOSED_SESSION_ID,
+    HEADER_SAGEMAKER_NEW_SESSION_ID,
+    HEADER_SAGEMAKER_SESSION_ID,
+    MODEL_NAME_SMOLLM,
+)
+
+CLOSE_BADREQUEST_CASES = [
+    (
+        "nonexistent_session_id",
+        {"session_id": "nonexistent-session-id"},
+        {},
+        "session not found",
+    ),
+    ("malformed_close_request", {}, {"extra-field": "extra-field-data"}, None),
+]
+
+
+@pytest.mark.asyncio
+async def test_create_session_badrequest(basic_server_with_lora: RemoteOpenAIServer):
+    bad_response = requests.post(
+        basic_server_with_lora.url_for("invocations"),
+        json={"requestType": "NEW_SESSION", "extra-field": "extra-field-data"},
+    )
+
+    assert bad_response.status_code == 400
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "test_name,session_id_change,request_body_change,expected_error",
+    CLOSE_BADREQUEST_CASES,
+)
+async def test_close_session_badrequest(
+    basic_server_with_lora: RemoteOpenAIServer,
+    test_name: str,
+    session_id_change: dict[str, str],
+    request_body_change: dict[str, str],
+    expected_error: str | None,
+):
+    # first attempt to create a session
+    url = basic_server_with_lora.url_for("invocations")
+    create_response = requests.post(url, json={"requestType": "NEW_SESSION"})
+    create_response.raise_for_status()
+    valid_session_id, expiration = create_response.headers.get(
+        HEADER_SAGEMAKER_NEW_SESSION_ID, ""
+    ).split(";")
+    assert valid_session_id
+
+    close_request_json = {"requestType": "CLOSE"}
+    if request_body_change:
+        close_request_json.update(request_body_change)
+    bad_session_id = session_id_change.get("session_id")
+    bad_close_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: bad_session_id or valid_session_id},
+        json=close_request_json,
+    )
+
+    # clean up created session, should succeed
+    clean_up_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json={"requestType": "CLOSE"},
+    )
+    clean_up_response.raise_for_status()
+
+    assert bad_close_response.status_code == 400
+    if expected_error:
+        assert expected_error in bad_close_response.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+async def test_close_session_invalidrequest(
+    basic_server_with_lora: RemoteOpenAIServer, async_client: openai.AsyncOpenAI
+):
+    # first attempt to create a session
+    url = basic_server_with_lora.url_for("invocations")
+    create_response = requests.post(url, json={"requestType": "NEW_SESSION"})
+    create_response.raise_for_status()
+    valid_session_id, expiration = create_response.headers.get(
+        HEADER_SAGEMAKER_NEW_SESSION_ID, ""
+    ).split(";")
+    assert valid_session_id
+
+    close_request_json = {"requestType": "CLOSE"}
+    invalid_close_response = requests.post(
+        url,
+        # no headers to specify session_id
+        json=close_request_json,
+    )
+
+    # clean up created session, should succeed
+    clean_up_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json={"requestType": "CLOSE"},
+    )
+    clean_up_response.raise_for_status()
+
+    assert invalid_close_response.status_code == 424
+    assert "invalid session_id" in invalid_close_response.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+async def test_session(basic_server_with_lora: RemoteOpenAIServer):
+    # first attempt to create a session
+    url = basic_server_with_lora.url_for("invocations")
+    create_response = requests.post(url, json={"requestType": "NEW_SESSION"})
+    create_response.raise_for_status()
+    valid_session_id, expiration = create_response.headers.get(
+        HEADER_SAGEMAKER_NEW_SESSION_ID, ""
+    ).split(";")
+    assert valid_session_id
+
+    # test invocation with session id
+
+    request_args = {
+        "model": MODEL_NAME_SMOLLM,
+        "prompt": "what is 1+1?",
+        "max_completion_tokens": 5,
+        "temperature": 0.0,
+        "logprobs": False,
+    }
+
+    invocation_response = requests.post(
+        basic_server_with_lora.url_for("invocations"),
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json=request_args,
+    )
+    invocation_response.raise_for_status()
+
+    # close created session, should succeed
+    close_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json={"requestType": "CLOSE"},
+    )
+    close_response.raise_for_status()
+
+    assert (
+        close_response.headers.get(HEADER_SAGEMAKER_CLOSED_SESSION_ID)
+        == valid_session_id
+    )
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py
index 34b05ad17b025844a639ec9afba28329f2ccc811..3fadbf2ef0dd073a77d32d5bea444bc542161616 100644
--- a/tests/entrypoints/test_api_server_process_manager.py
+++ b/tests/entrypoints/test_api_server_process_manager.py
@@ -5,13 +5,11 @@ import multiprocessing
 import socket
 import threading
 import time
-from typing import Optional
 from unittest.mock import patch
 
 import pytest
 
-from vllm.v1.utils import (APIServerProcessManager,
-                           wait_for_completion_or_failure)
+from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
 
 # Global variables to control worker behavior
 WORKER_RUNTIME_SECONDS = 0.5
@@ -30,26 +28,22 @@ def api_server_args():
     """Fixture to provide arguments for APIServerProcessManager."""
     sock = socket.socket()
     return {
-        "target_server_fn":
-        mock_run_api_server_worker,
-        "listen_address":
-        "localhost:8000",
-        "sock":
-        sock,
-        "args":
-        "test_args",  # Simple string to avoid pickling issues
-        "num_servers":
-        3,
+        "target_server_fn": mock_run_api_server_worker,
+        "listen_address": "localhost:8000",
+        "sock": sock,
+        "args": "test_args",  # Simple string to avoid pickling issues
+        "num_servers": 3,
         "input_addresses": [
-            "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002",
-            "tcp://127.0.0.1:5003"
+            "tcp://127.0.0.1:5001",
+            "tcp://127.0.0.1:5002",
+            "tcp://127.0.0.1:5003",
         ],
         "output_addresses": [
-            "tcp://127.0.0.1:6001", "tcp://127.0.0.1:6002",
-            "tcp://127.0.0.1:6003"
+            "tcp://127.0.0.1:6001",
+            "tcp://127.0.0.1:6002",
+            "tcp://127.0.0.1:6003",
         ],
-        "stats_update_address":
-        "tcp://127.0.0.1:7000",
+        "stats_update_address": "tcp://127.0.0.1:7000",
     }
 
 
@@ -95,8 +89,9 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
             assert not proc.is_alive()
 
 
-@patch("vllm.entrypoints.cli.serve.run_api_server_worker_proc",
-       mock_run_api_server_worker)
+@patch(
+    "vllm.entrypoints.cli.serve.run_api_server_worker_proc", mock_run_api_server_worker
+)
 def test_wait_for_completion_or_failure(api_server_args):
     """Test that wait_for_completion_or_failure works with failures."""
     global WORKER_RUNTIME_SECONDS
@@ -109,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
         assert len(manager.processes) == 3
 
         # Create a result capture for the thread
-        result: dict[str, Optional[Exception]] = {"exception": None}
+        result: dict[str, Exception | None] = {"exception": None}
 
         def run_with_exception_capture():
             try:
@@ -118,8 +113,7 @@ def test_wait_for_completion_or_failure(api_server_args):
                 result["exception"] = e
 
         # Start a thread to run wait_for_completion_or_failure
-        wait_thread = threading.Thread(target=run_with_exception_capture,
-                                       daemon=True)
+        wait_thread = threading.Thread(target=run_with_exception_capture, daemon=True)
         wait_thread.start()
 
         # Let all processes run for a short time
@@ -174,8 +168,7 @@ def test_normal_completion(api_server_args):
 
         # Verify all processes have terminated
         for i, proc in enumerate(manager.processes):
-            assert not proc.is_alive(
-            ), f"Process {i} still alive after terminate()"
+            assert not proc.is_alive(), f"Process {i} still alive after terminate()"
 
         # Now call wait_for_completion_or_failure
         # since all processes have already
@@ -198,13 +191,13 @@ def test_external_process_monitoring(api_server_args):
     # Create and start the external process
     # (simulates local_engine_manager or coordinator)
     spawn_context = multiprocessing.get_context("spawn")
-    external_proc = spawn_context.Process(target=mock_run_api_server_worker,
-                                          name="MockExternalProcess")
+    external_proc = spawn_context.Process(
+        target=mock_run_api_server_worker, name="MockExternalProcess"
+    )
     external_proc.start()
 
     # Create the class to simulate a coordinator
     class MockCoordinator:
-
         def __init__(self, proc):
             self.proc = proc
 
@@ -224,18 +217,18 @@ def test_external_process_monitoring(api_server_args):
         assert len(manager.processes) == 3
 
         # Create a result capture for the thread
-        result: dict[str, Optional[Exception]] = {"exception": None}
+        result: dict[str, Exception | None] = {"exception": None}
 
         def run_with_exception_capture():
             try:
-                wait_for_completion_or_failure(api_server_manager=manager,
-                                               coordinator=mock_coordinator)
+                wait_for_completion_or_failure(
+                    api_server_manager=manager, coordinator=mock_coordinator
+                )
             except Exception as e:
                 result["exception"] = e
 
         # Start a thread to run wait_for_completion_or_failure
-        wait_thread = threading.Thread(target=run_with_exception_capture,
-                                       daemon=True)
+        wait_thread = threading.Thread(target=run_with_exception_capture, daemon=True)
         wait_thread.start()
 
         # Terminate the external process to trigger a failure
@@ -246,21 +239,23 @@ def test_external_process_monitoring(api_server_args):
         wait_thread.join(timeout=1.0)
 
         # The wait thread should have completed
-        assert not wait_thread.is_alive(
-        ), "wait_for_completion_or_failure thread still running"
+        assert not wait_thread.is_alive(), (
+            "wait_for_completion_or_failure thread still running"
+        )
 
         # Verify that an exception was raised with appropriate error message
         assert result["exception"] is not None, "No exception was raised"
         error_message = str(result["exception"])
-        assert "died with exit code" in error_message, \
+        assert "died with exit code" in error_message, (
             f"Unexpected error message: {error_message}"
-        assert "MockExternalProcess" in error_message, \
+        )
+        assert "MockExternalProcess" in error_message, (
             f"Error doesn't mention external process: {error_message}"
+        )
 
         # Verify that all API server processes were terminated as a result
         for i, proc in enumerate(manager.processes):
-            assert not proc.is_alive(
-            ), f"API server process {i} was not terminated"
+            assert not proc.is_alive(), f"API server process {i} was not terminated"
 
     finally:
         # Clean up
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index a268f573ef905e935c4ecc39b9cd4026c116ae02..22b96587045bb730e7277d3c79e746d7e732486d 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -3,27 +3,31 @@
 
 import warnings
 from collections.abc import Mapping
-from typing import Literal, Optional
+from typing import Literal
 
 import pytest
-from mistral_common.tokens.tokenizers.base import (SpecialTokenPolicy,
-                                                   SpecialTokens)
-from mistral_common.tokens.tokenizers.tekken import (SpecialTokenInfo,
-                                                     Tekkenizer)
+from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
 
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
-                                         parse_chat_messages,
-                                         parse_chat_messages_futures,
-                                         resolve_chat_template_content_format,
-                                         resolve_chat_template_kwargs,
-                                         resolve_hf_chat_template)
+from vllm.entrypoints.chat_utils import (
+    _try_extract_ast,
+    apply_mistral_chat_template,
+    load_chat_template,
+    parse_chat_messages,
+    parse_chat_messages_futures,
+    resolve_chat_template_content_format,
+    resolve_chat_template_kwargs,
+    resolve_hf_chat_template,
+)
 from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
-from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
-                                   encode_video_base64)
+from vllm.multimodal.utils import (
+    encode_audio_base64,
+    encode_image_base64,
+    encode_video_base64,
+)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
@@ -69,6 +73,19 @@ def phi3v_model_config_mm_interleaved():
     )
 
 
+@pytest.fixture(scope="function")
+def phi3v_model_config_image_embeds():
+    return ModelConfig(
+        PHI3V_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+        enable_mm_embeds=True,
+    )
+
+
 @pytest.fixture(scope="module")
 def phi3v_tokenizer():
     return get_tokenizer(PHI3V_MODEL_ID)
@@ -148,9 +165,9 @@ def audio_url():
 
 
 def _assert_mm_data_is_image_input(
-    mm_data: Optional[MultiModalDataDict],
+    mm_data: MultiModalDataDict | None,
     image_count: int,
-    skipped_image_indices: Optional[list] = None,
+    skipped_image_indices: list | None = None,
 ) -> None:
     assert mm_data is not None
     assert set(mm_data.keys()) == {"image"}
@@ -165,9 +182,9 @@ def _assert_mm_data_is_image_input(
 
 
 def _assert_mm_uuids(
-    mm_uuids: Optional[MultiModalUUIDDict],
+    mm_uuids: MultiModalUUIDDict | None,
     media_count: int,
-    expected_uuids: list[Optional[str]],
+    expected_uuids: list[str | None],
     modality: str = "image",
 ) -> None:
     if len(expected_uuids) > 0:
@@ -177,8 +194,7 @@ def _assert_mm_uuids(
         image_uuids = mm_uuids.get(modality)
         assert image_uuids is not None
 
-        assert isinstance(image_uuids,
-                          list) and len(image_uuids) == media_count
+        assert isinstance(image_uuids, list) and len(image_uuids) == media_count
 
         assert image_uuids == expected_uuids
     else:
@@ -190,10 +206,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]
 
 
 def _assert_mm_data_inputs(
-        mm_data: Optional[MultiModalDataDict],
-        data_count: MultiModalDataCounts,
-        skipped_media_indices: Optional[dict[
-            str, list]] = None,  # modality -> list[int]
+    mm_data: MultiModalDataDict | None,
+    data_count: MultiModalDataCounts,
+    skipped_media_indices: dict[str, list] | None = None,  # modality -> list[int]
 ) -> None:
     assert mm_data is not None
     assert set(data_count.keys()) == (set(mm_data.keys()))
@@ -204,8 +219,7 @@ def _assert_mm_data_inputs(
         assert isinstance(modality_data, list) and len(modality_data) == n
 
         if skipped_media_indices is not None:
-            skipped_media_indices_for_modality = skipped_media_indices.get(
-                modality)
+            skipped_media_indices_for_modality = skipped_media_indices.get(modality)
             assert skipped_media_indices_for_modality is not None
             for i in skipped_media_indices_for_modality:
                 assert modality_data[i] is None
@@ -217,31 +231,23 @@ def test_parse_chat_messages_single_image(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
     _assert_mm_data_is_image_input(mm_data, 1)
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 
@@ -253,32 +259,29 @@ def test_parse_chat_messages_single_image_with_uuid(
 ):
     image_uuid = str(hash(image_url))
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url,
+                        },
+                        "uuid": image_uuid,
                     },
-                    "uuid": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
     _assert_mm_data_is_image_input(mm_data, 1)
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
 
@@ -290,30 +293,27 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
 ):
     image_uuid = str(hash(image_url))
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": None,
-                    "uuid": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": image_uuid,
+                    },
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
     _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
 
@@ -325,33 +325,30 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
 ):
     image_uuid = str(hash(image_url))
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
-                        "uuid": image_uuid,
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url,
+                            "uuid": image_uuid,
+                        },
+                        "bad_uuid_key": image_uuid,
                     },
-                    "bad_uuid_key": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
     _assert_mm_data_is_image_input(mm_data, 1)
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 
@@ -365,41 +362,39 @@ def test_parse_chat_messages_multiple_images_with_uuids(
     image_uuid2 = "my_uuid_2"
 
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url,
+                        },
+                        "uuid": image_uuid1,
                     },
-                    "uuid": image_uuid1,
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url,
+                        },
+                        "uuid": image_uuid2,
                     },
-                    "uuid": image_uuid2,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
@@ -413,37 +408,35 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
     image_uuid2 = "my_uuid_2"
 
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": None,
-                    "uuid": image_uuid1,
-                },
-                {
-                    "type": "image_url",
-                    "image_url": None,
-                    "uuid": image_uuid2,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": image_uuid1,
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": image_uuid2,
+                    },
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
@@ -457,39 +450,37 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
     image_uuid2 = "my_uuid_2"
 
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url,
+                        },
+                        "uuid": image_uuid1,
                     },
-                    "uuid": image_uuid1,
-                },
-                {
-                    "type": "image_url",
-                    "image_url": None,
-                    "uuid": image_uuid2,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": image_uuid2,
+                    },
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
@@ -502,32 +493,27 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
 ):
     image_uuid = str(hash(image_url))
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                        "uuid": image_uuid,
                     },
-                    "uuid": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
     _assert_mm_data_is_image_input(await mm_future, 1)
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
 
@@ -540,33 +526,28 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
 ):
     image_uuid = str(hash(image_url))
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": None,
-                    "uuid": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": image_uuid,
+                    },
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
-    _assert_mm_data_is_image_input(await mm_future,
-                                   1,
-                                   skipped_image_indices=[0])
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
+    _assert_mm_data_is_image_input(await mm_future, 1, skipped_image_indices=[0])
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
 
 
@@ -580,39 +561,35 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
     image_uuid2 = "my_uuid_2"
 
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                        "uuid": image_uuid1,
                     },
-                    "uuid": image_uuid1,
-                },
-                {
-                    "type": "image_pil",
-                    "image_pil": ImageAsset("cherry_blossom").pil_image,
-                    "uuid": image_uuid2,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in these images?"
-                },
-            ],
-        }],
+                    {
+                        "type": "image_pil",
+                        "image_pil": ImageAsset("cherry_blossom").pil_image,
+                        "uuid": image_uuid2,
+                    },
+                    {"type": "text", "text": "What's in these images?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+        }
+    ]
     _assert_mm_data_is_image_input(await mm_future, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
@@ -627,40 +604,36 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
     image_uuid2 = "my_uuid_2"
 
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": None,
-                    "uuid": image_uuid1,
-                },
-                {
-                    "type": "image_pil",
-                    "image_pil": None,
-                    "uuid": image_uuid2,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in these images?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": image_uuid1,
+                    },
+                    {
+                        "type": "image_pil",
+                        "image_pil": None,
+                        "uuid": image_uuid2,
+                    },
+                    {"type": "text", "text": "What's in these images?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
-    }]
-    _assert_mm_data_is_image_input(await mm_future,
-                                   2,
-                                   skipped_image_indices=[0, 1])
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+        }
+    ]
+    _assert_mm_data_is_image_input(await mm_future, 2, skipped_image_indices=[0, 1])
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
 
@@ -673,38 +646,34 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
     image_uuid2 = "my_uuid_2"
 
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
                     },
-                },
-                {
-                    "type": "image_pil",
-                    "image_pil": ImageAsset("cherry_blossom").pil_image,
-                    "uuid": image_uuid2,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in these images?"
-                },
-            ],
-        }],
+                    {
+                        "type": "image_pil",
+                        "image_pil": ImageAsset("cherry_blossom").pil_image,
+                        "uuid": image_uuid2,
+                    },
+                    {"type": "text", "text": "What's in these images?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+        }
+    ]
     _assert_mm_data_is_image_input(await mm_future, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])
 
@@ -716,16 +685,10 @@ def test_parse_chat_messages_empty_system(
     # Test string format
     conversation, _, _ = parse_chat_messages(
         [
-            {
-                "role": "system",
-                "content": ""
-            },
+            {"role": "system", "content": ""},
             {
                 "role": "user",
-                "content": [{
-                    "type": "text",
-                    "text": "Who are you?"
-                }],
+                "content": [{"type": "text", "text": "Who are you?"}],
             },
         ],
         mistral_model_config,
@@ -733,29 +696,17 @@ def test_parse_chat_messages_empty_system(
         content_format="string",
     )
     assert conversation == [
-        {
-            "role": "system",
-            "content": ""
-        },
-        {
-            "role": "user",
-            "content": "Who are you?"
-        },
+        {"role": "system", "content": ""},
+        {"role": "user", "content": "Who are you?"},
     ]
 
     # Test openai format
     conversation, _, _ = parse_chat_messages(
         [
-            {
-                "role": "system",
-                "content": ""
-            },
+            {"role": "system", "content": ""},
             {
                 "role": "user",
-                "content": [{
-                    "type": "text",
-                    "text": "Who are you?"
-                }],
+                "content": [{"type": "text", "text": "Who are you?"}],
             },
         ],
         mistral_model_config,
@@ -763,20 +714,8 @@ def test_parse_chat_messages_empty_system(
         content_format="openai",
     )
     assert conversation == [
-        {
-            "role": "system",
-            "content": [{
-                "type": "text",
-                "text": ""
-            }]
-        },
-        {
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "Who are you?"
-            }]
-        },
+        {"role": "system", "content": [{"type": "text", "text": ""}]},
+        {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
     ]
 
 
@@ -787,31 +726,23 @@ async def test_parse_chat_messages_single_image_async(
     image_url,
 ):
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "What's in the image?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "What's in the image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in the image?"
-    }]
+    assert conversation == [
+        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
+    ]
     _assert_mm_data_is_image_input(await mm_future, 1)
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 
@@ -822,37 +753,30 @@ def test_parse_chat_messages_multiple_images(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "image_pil",
-                    "image_pil": ImageAsset("cherry_blossom").pil_image,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in these images?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "image_pil",
+                        "image_pil": ImageAsset("cherry_blossom").pil_image,
+                    },
+                    {"type": "text", "text": "What's in these images?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -863,64 +787,56 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
 ):
     uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_pil",
-                    "image_pil": None,
-                    "uuid": uuid
-                },
-                {
-                    "type": "text",
-                    "text": "What's in this image?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_pil", "image_pil": None, "uuid": uuid},
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in this image?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
 
 
 def test_parse_chat_messages_empty_image_embeds_with_uuid(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
     phi3v_tokenizer,
 ):
     uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_embeds",
-                    "image_embeds": None,
-                    "uuid": uuid
-                },
-                {
-                    "type": "text",
-                    "text": "What's in this image?"
-                },
-            ],
-        }],
-        phi3v_model_config,
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_embeds", "image_embeds": None, "uuid": uuid},
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in this image?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?",
+        }
+    ]
     assert mm_data is not None
     assert "image" in mm_data
     assert mm_data["image"] is None
@@ -929,35 +845,31 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
 
 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
     phi3v_tokenizer,
 ):
     uuid = "abcd"
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_embeds",
-                    "image_embeds": None,
-                    "uuid": uuid
-                },
-                {
-                    "type": "text",
-                    "text": "What's in this image?"
-                },
-            ],
-        }],
-        phi3v_model_config,
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_embeds", "image_embeds": None, "uuid": uuid},
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role": "user",
-        "content": "<|image_1|>\nWhat's in this image?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?",
+        }
+    ]
     mm_data = await mm_future
     assert mm_data is not None
     assert "image" in mm_data
@@ -972,37 +884,30 @@ async def test_parse_chat_messages_multiple_images_async(
     image_url,
 ):
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "image_pil",
-                    "image_pil": ImageAsset("cherry_blossom").pil_image,
-                },
-                {
-                    "type": "text",
-                    "text": "What's in these images?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "image_pil",
+                        "image_pil": ImageAsset("cherry_blossom").pil_image,
+                    },
+                    {"type": "text", "text": "What's in these images?"},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+        }
+    ]
     _assert_mm_data_is_image_input(await mm_future, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -1013,40 +918,29 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type":
-                    "text",
-                    "text":
-                    "What's in <|image_1|> and how does it compare to <|image_2|>?",  # noqa: E501
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": "What's in <|image_1|> and how does it compare to <|image_2|>?",  # noqa: E501
+                    },
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "What's in <|image_1|> and how does it compare to <|image_2|>?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "What's in <|image_1|> and how does it compare to <|image_2|>?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -1057,42 +951,32 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type":
-                    "text",
-                    "text":
-                    "What's in <|image_1|> and how does it compare to the other one?",  # noqa: E501
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": "What's in <|image_1|> and how does it compare to "
+                        "the other one?",
+                    },
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
-        "other one?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
+            "the other one?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -1105,39 +989,18 @@ def test_parse_chat_messages_multiple_images_across_messages(
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "What's in this image?"
-                    },
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "What's in this image?"},
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "What about this one?"
-                    },
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "What about this one?"},
                 ],
             },
         ],
@@ -1147,18 +1010,9 @@ def test_parse_chat_messages_multiple_images_across_messages(
     )
 
     assert conversation == [
-        {
-            "role": "user",
-            "content": "<|image_1|>\nWhat's in this image?"
-        },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
-        },
-        {
-            "role": "user",
-            "content": "<|image_2|>\nWhat about this one?"
-        },
+        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
+        {"role": "assistant", "content": "Some stuff."},
+        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
     ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
@@ -1173,41 +1027,26 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": image_uuid,
                     },
-                    {
-                        "type": "text",
-                        "text": "What's in this image?"
-                    },
+                    {"type": "text", "text": "What's in this image?"},
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": image_uuid,
                     },
-                    {
-                        "type": "text",
-                        "text": "What about this one?"
-                    },
+                    {"type": "text", "text": "What about this one?"},
                 ],
             },
         ],
@@ -1217,18 +1056,9 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
     )
 
     assert conversation == [
-        {
-            "role": "user",
-            "content": "<|image_1|>\nWhat's in this image?"
-        },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
-        },
-        {
-            "role": "user",
-            "content": "<|image_2|>\nWhat about this one?"
-        },
+        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
+        {"role": "assistant", "content": "Some stuff."},
+        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
     ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
@@ -1242,19 +1072,10 @@ def test_parse_chat_messages_context_text_format(
         [
             {
                 "role": "user",
-                "content": [{
-                    "type": "text",
-                    "text": "What's in this text?"
-                }],
-            },
-            {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role": "user",
-                "content": "What about this one?"
+                "content": [{"type": "text", "text": "What's in this text?"}],
             },
+            {"role": "assistant", "content": "Some stuff."},
+            {"role": "user", "content": "What about this one?"},
         ],
         phi3v_model_config,
         phi3v_tokenizer,
@@ -1264,24 +1085,15 @@ def test_parse_chat_messages_context_text_format(
     assert conversation == [
         {
             "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "What's in this text?"
-            }],
+            "content": [{"type": "text", "text": "What's in this text?"}],
         },
         {
             "role": "assistant",
-            "content": [{
-                "type": "text",
-                "text": "Some stuff."
-            }],
+            "content": [{"type": "text", "text": "Some stuff."}],
         },
         {
             "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "What about this one?"
-            }],
+            "content": [{"type": "text", "text": "What about this one?"}],
         },
     ]
     assert mm_data is None
@@ -1300,34 +1112,26 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
         )
         with pytest.raises(ValueError, match="At most"):
             parse_chat_messages(
-                [{
-                    "role":
-                    "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": image_url},
                             },
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": image_url},
                             },
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": image_url},
                             },
-                        },
-                        {
-                            "type": "text",
-                            "text": "What's in these images?"
-                        },
-                    ],
-                }],
+                            {"type": "text", "text": "What's in these images?"},
+                        ],
+                    }
+                ],
                 phi3v_model_config,
                 phi3v_tokenizer,
                 content_format="string",
@@ -1348,45 +1152,28 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
             parse_chat_messages(
                 [
                     {
-                        "role":
-                        "user",
+                        "role": "user",
                         "content": [
                             {
                                 "type": "image_url",
-                                "image_url": {
-                                    "url": image_url
-                                },
-                            },
-                            {
-                                "type": "text",
-                                "text": "What's in this image?"
+                                "image_url": {"url": image_url},
                             },
+                            {"type": "text", "text": "What's in this image?"},
                         ],
                     },
+                    {"role": "assistant", "content": "Some stuff."},
                     {
-                        "role": "assistant",
-                        "content": "Some stuff."
-                    },
-                    {
-                        "role":
-                        "user",
+                        "role": "user",
                         "content": [
                             {
                                 "type": "image_url",
-                                "image_url": {
-                                    "url": image_url
-                                },
+                                "image_url": {"url": image_url},
                             },
                             {
                                 "type": "image_url",
-                                "image_url": {
-                                    "url": image_url
-                                },
-                            },
-                            {
-                                "type": "text",
-                                "text": "What about these two?"
+                                "image_url": {"url": image_url},
                             },
+                            {"type": "text", "text": "What about these two?"},
                         ],
                     },
                 ],
@@ -1402,30 +1189,27 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                "What's in these images?",
-                {
-                    "image_url": image_url
-                },
-                {
-                    "image_url": image_url
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    "What's in these images?",
+                    {"image_url": image_url},
+                    {"image_url": image_url},
+                ],
+            }
+        ],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -1436,48 +1220,33 @@ def test_parse_chat_messages_multiple_images_interleave(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "I need you to compare this image",
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "and this one"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "Do they have differences?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "I need you to compare this image",
+                    },
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "and this one"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Do they have differences?"},
+                ],
+            }
+        ],
         phi3v_model_config_mm_interleaved,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-        "Do they have differences?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+            "Do they have differences?",
+        }
+    ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -1489,48 +1258,33 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "I need you to compare this image",
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "and this one"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "Do they have differences?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "I need you to compare this image",
+                    },
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "and this one"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Do they have differences?"},
+                ],
+            }
+        ],
         phi3v_model_config_mm_interleaved,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-        "Do they have differences?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+            "Do they have differences?",
+        }
+    ]
     _assert_mm_data_is_image_input(await mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
@@ -1543,50 +1297,41 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
 ):
     image_uuid = str(hash(image_url))
     conversation, mm_data, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "I need you to compare this image",
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "I need you to compare this image",
                     },
-                    "uuid": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "and this one"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                        "uuid": image_uuid,
                     },
-                    "uuid": image_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "Do they have differences?"
-                },
-            ],
-        }],
+                    {"type": "text", "text": "and this one"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                        "uuid": image_uuid,
+                    },
+                    {"type": "text", "text": "Do they have differences?"},
+                ],
+            }
+        ],
         phi3v_model_config_mm_interleaved,
         phi3v_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-        "Do they have differences?",
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+            "Do they have differences?",
+        }
+    ]
     _assert_mm_data_is_image_input(await mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
 
@@ -1599,43 +1344,19 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "Be accurate."
-                    },
+                    {"type": "text", "text": "What's on this image?"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Be accurate."},
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
+                    {"type": "text", "text": "What's on this image?"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
                 ],
             },
         ],
@@ -1649,20 +1370,14 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
             "role": "user",
             "content": "What's on this image?\n<|image_1|>\nBe accurate.",
         },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
-        },
-        {
-            "role": "user",
-            "content": "What's on this image?\n<|image_2|>"
-        },
+        {"role": "assistant", "content": "Some stuff."},
+        {"role": "user", "content": "What's on this image?\n<|image_2|>"},
     ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
 
-def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(  # noqa: E501
+def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
     phi3v_model_config_mm_interleaved,
     phi3v_tokenizer,
     image_url,
@@ -1671,43 +1386,25 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": image_uuid,
                     },
-                    {
-                        "type": "text",
-                        "text": "Be accurate."
-                    },
+                    {"type": "text", "text": "Be accurate."},
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": image_uuid,
                     },
                 ],
@@ -1723,14 +1420,8 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
             "role": "user",
             "content": "What's on this image?\n<|image_1|>\nBe accurate.",
         },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
-        },
-        {
-            "role": "user",
-            "content": "What's on this image?\n<|image_2|>"
-        },
+        {"role": "assistant", "content": "Some stuff."},
+        {"role": "user", "content": "What's on this image?\n<|image_2|>"},
     ]
     _assert_mm_data_is_image_input(mm_data, 2)
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
@@ -1744,61 +1435,24 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
     audio_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [
-            {
-                "role":
-                "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "Now listen to this audio"
-                    },
-                    {
-                        "type": "audio_url",
-                        "audio_url": {
-                            "url": audio_url
-                        }
-                    },
-                ],
-            },
+        [
             {
-                "role": "assistant",
-                "content": "Some stuff."
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's on this image?"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Now listen to this audio"},
+                    {"type": "audio_url", "audio_url": {"url": audio_url}},
+                ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "And what's in the video?"
-                    },
-                    {
-                        "type": "video_url",
-                        "video_url": {
-                            "url": video_url
-                        }
-                    },
+                    {"type": "text", "text": "What's on this image?"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "And what's in the video?"},
+                    {"type": "video_url", "video_url": {"url": video_url}},
                 ],
             },
         ],
@@ -1809,35 +1463,25 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
 
     assert conversation == [
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
-        },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
+        {"role": "assistant", "content": "Some stuff."},
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]
 
     _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     2,
-                     modality="image",
-                     expected_uuids=[None, None])
+    _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=[None, None])
     _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
     _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
 
 
-def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(  # noqa: E501
+def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
     qwen25omni_model_config_mm_interleaved,
     qwen25omni_tokenizer,
     image_url,
@@ -1847,61 +1491,36 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": "image_123",
                     },
-                    {
-                        "type": "text",
-                        "text": "Now listen to this audio"
-                    },
+                    {"type": "text", "text": "Now listen to this audio"},
                     {
                         "type": "audio_url",
-                        "audio_url": {
-                            "url": audio_url
-                        },
+                        "audio_url": {"url": audio_url},
                         "uuid": "audio_123",
                     },
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": "image_123",
                     },
-                    {
-                        "type": "text",
-                        "text": "And what's in the video?"
-                    },
+                    {"type": "text", "text": "And what's in the video?"},
                     {
                         "type": "video_url",
-                        "video_url": {
-                            "url": video_url
-                        },
+                        "video_url": {"url": video_url},
                         "uuid": "video_123",
                     },
                 ],
@@ -1914,38 +1533,24 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
 
     assert conversation == [
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
-        },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
+        {"role": "assistant", "content": "Some stuff."},
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]
 
     _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     2,
-                     modality="image",
-                     expected_uuids=["image_123", "image_123"])
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="video",
-                     expected_uuids=["video_123"])
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="audio",
-                     expected_uuids=["audio_123"])
+    _assert_mm_uuids(
+        mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"]
+    )
+    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"])
 
 
 def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
@@ -1958,22 +1563,15 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
                         "image_url": None,
                         "uuid": "image_123",
                     },
-                    {
-                        "type": "text",
-                        "text": "Now listen to this audio"
-                    },
+                    {"type": "text", "text": "Now listen to this audio"},
                     {
                         "type": "audio_url",
                         "audio_url": None,
@@ -1981,27 +1579,17 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
                     },
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
                         "image_url": None,
                         "uuid": "image_123",
                     },
-                    {
-                        "type": "text",
-                        "text": "And what's in the video?"
-                    },
+                    {"type": "text", "text": "And what's in the video?"},
                     {
                         "type": "video_url",
                         "video_url": None,
@@ -2017,47 +1605,28 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
 
     assert conversation == [
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
-        },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
+        {"role": "assistant", "content": "Some stuff."},
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]
 
-    _assert_mm_data_inputs(mm_data, {
-        "image": 2,
-        "video": 1,
-        "audio": 1
-    },
-                           skipped_media_indices={
-                               "image": [0, 1],
-                               "video": [0],
-                               "audio": [0]
-                           })
-    _assert_mm_uuids(mm_uuids,
-                     2,
-                     modality="image",
-                     expected_uuids=["image_123", "image_123"])
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="video",
-                     expected_uuids=["video_123"])
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="audio",
-                     expected_uuids=["audio_123"])
+    _assert_mm_data_inputs(
+        mm_data,
+        {"image": 2, "video": 1, "audio": 1},
+        skipped_media_indices={"image": [0, 1], "video": [0], "audio": [0]},
+    )
+    _assert_mm_uuids(
+        mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"]
+    )
+    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"])
 
 
 def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
@@ -2070,59 +1639,28 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
             {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        },
+                        "image_url": {"url": image_url},
                         "uuid": "image_123",
                     },
-                    {
-                        "type": "text",
-                        "text": "Now listen to this audio"
-                    },
-                    {
-                        "type": "audio_url",
-                        "audio_url": {
-                            "url": audio_url
-                        }
-                    },
+                    {"type": "text", "text": "Now listen to this audio"},
+                    {"type": "audio_url", "audio_url": {"url": audio_url}},
                 ],
             },
+            {"role": "assistant", "content": "Some stuff."},
             {
-                "role": "assistant",
-                "content": "Some stuff."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": "What's on this image?"
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "And what's in the video?"
-                    },
+                    {"type": "text", "text": "What's on this image?"},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "And what's in the video?"},
                     {
                         "type": "video_url",
-                        "video_url": {
-                            "url": video_url
-                        },
+                        "video_url": {"url": video_url},
                         "uuid": "video_123",
                     },
                 ],
@@ -2135,34 +1673,21 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
 
     assert conversation == [
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
-        },
-        {
-            "role": "assistant",
-            "content": "Some stuff."
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
+        {"role": "assistant", "content": "Some stuff."},
         {
-            "role":
-            "user",
-            "content":
-            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "role": "user",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]
 
     _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     2,
-                     modality="image",
-                     expected_uuids=["image_123", None])
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="video",
-                     expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=["image_123", None])
+    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
     _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
 
 
@@ -2172,36 +1697,25 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
     image_url,
 ):
     with pytest.raises(
-            ValueError,
-            match=r"Found more '<|image_1|>' placeholders in input prompt "
-            "than actual multimodal data items.",
+        ValueError,
+        match=r"Found more '<|image_1|>' placeholders in input prompt "
+        "than actual multimodal data items.",
     ):
         parse_chat_messages(
-            [{
-                "role":
-                "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type":
-                        "text",
-                        "text":
-                        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-                        "Do they have differences?",
-                    },
-                ],
-            }],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                        {
+                            "type": "text",
+                            "text": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+                            "Do they have differences?",
+                        },
+                    ],
+                }
+            ],
             phi3v_model_config_mm_interleaved,
             phi3v_tokenizer,
             content_format="string",
@@ -2228,9 +1742,12 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
 
     # Build the tokenizer
     tokenizer = get_tokenizer(
@@ -2238,14 +1755,20 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
         trust_remote_code=model_config.trust_remote_code,
     )
 
-    tools = ([{
-        "type": "function",
-        "function": {
-            "name": "dummy_function_name",
-            "description": "This is a dummy function",
-            "parameters": sample_json_schema,
-        },
-    }] if use_tools else None)
+    tools = (
+        [
+            {
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": sample_json_schema,
+                },
+            }
+        ]
+        if use_tools
+        else None
+    )
 
     # Test detecting the tokenizer's chat_template
     chat_template = resolve_hf_chat_template(
@@ -2263,33 +1786,39 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
         (
             QWEN2VL_MODEL_ID,
             {
-                "add_vision_id", "add_generation_prompt",
-                "continue_final_message", "tools"
+                "add_vision_id",
+                "add_generation_prompt",
+                "continue_final_message",
+                "tools",
             },
         ),
         (
             QWEN3_MODEL_ID,
             {
-                "enable_thinking", "add_generation_prompt",
-                "continue_final_message", "tools"
+                "enable_thinking",
+                "add_generation_prompt",
+                "continue_final_message",
+                "tools",
             },
         ),
     ],
 )
-def test_resolve_hf_chat_template_kwargs(sample_json_schema, model,
-                                         expected_kwargs):
+
+def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwargs):
     """checks that chat_template is a dict type for HF models."""
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    tools = ([{
-        "type": "function",
-        "function": {
-            "name": "dummy_function_name",
-            "description": "This is a dummy function",
-            "parameters": sample_json_schema,
-        },
-    }])
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": sample_json_schema,
+            },
+        }
+    ]
 
     chat_template_kwargs = {
         # both unused
@@ -2297,6 +1826,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model,
         "unsed_kwargs_2": "abc",
         # should not appear
         "chat_template": "{% Hello world! %}",
+        "tokenize": True,
         # used by tokenizer
         "continue_final_message": True,
         "tools": tools,
@@ -2315,9 +1845,12 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model,
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
 
     # Build the tokenizer
     tokenizer = get_tokenizer(
@@ -2332,27 +1865,71 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model,
         tools=tools,
         model_config=model_config,
     )
+    with pytest.raises(
+        ValueError, match="Found unexpected chat template kwargs from request"
+    ):
+        # should raise error if `chat_template_kwargs` contains
+        # `chat_template` or `tokenize`
+        resolve_chat_template_kwargs(
+            tokenizer,
+            chat_template=chat_template,
+            chat_template_kwargs=chat_template_kwargs,
+        )
     resolved_chat_template_kwargs = resolve_chat_template_kwargs(
         tokenizer,
         chat_template=chat_template,
         chat_template_kwargs=chat_template_kwargs,
+        raise_on_unexpected=False,
     )
     assert set(resolved_chat_template_kwargs.keys()) == expected_kwargs
 
+    # Additional test: Verify HF base parameters work with **kwargs tokenizers
+    # This validates the fix for tokenizers like Kimi K2 that use **kwargs
+    # to receive standard HuggingFace parameters instead of declaring them explicitly
+    from vllm.entrypoints.chat_utils import _get_hf_base_chat_template_params
+
+    hf_base_params = _get_hf_base_chat_template_params()
+    # Verify common HF parameters are in the base class
+    assert {"add_generation_prompt", "tools", "continue_final_message"}.issubset(
+        hf_base_params
+    ), f"Expected HF base params not found in {hf_base_params}"
+
+    # Test with a mock tokenizer that uses **kwargs (like Kimi K2)
+    class MockTokenizerWithKwargs:
+        def apply_chat_template(self, conversation, **kwargs):
+            return "mocked_output"
+
+    mock_tokenizer = MockTokenizerWithKwargs()
+    mock_kwargs = {
+        "add_generation_prompt": True,
+        "tools": tools,
+        "continue_final_message": False,
+        "unknown_param": "should_be_filtered",
+    }
+    resolved_mock = resolve_chat_template_kwargs(
+        mock_tokenizer, chat_template, mock_kwargs, raise_on_unexpected=False
+    )
+    # HF base params should pass through even with **kwargs tokenizer
+    assert "add_generation_prompt" in resolved_mock
+    assert "tools" in resolved_mock
+    assert "continue_final_message" in resolved_mock
+    # Unknown params should be filtered out
+    assert "unknown_param" not in resolved_mock
+    
 
 # NOTE: Qwen2-Audio default chat template is specially defined inside
 # processor class instead of using `tokenizer_config.json`
-# yapf: disable
 @pytest.mark.parametrize(
     ("model", "expected_format"),
-    [(PHI3V_MODEL_ID, "string"),
-     (QWEN2VL_MODEL_ID, "openai"),
-     (QWEN25VL_MODEL_ID, "openai"),
-     (ULTRAVOX_MODEL_ID, "string"),
-     (QWEN2AUDIO_MODEL_ID, "openai"),
-     (LLAMA_GUARD_MODEL_ID, "openai")],
+    [
+        (PHI3V_MODEL_ID, "string"),
+        (QWEN2VL_MODEL_ID, "openai"),
+        (QWEN25VL_MODEL_ID, "openai"),
+        (ULTRAVOX_MODEL_ID, "string"),
+        (QWEN2AUDIO_MODEL_ID, "openai"),
+        (LLAMA_GUARD_MODEL_ID, "openai"),
+    ],
 )
-# yapf: enable
 def test_resolve_content_format_hf_defined(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
@@ -2364,9 +1941,12 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
 
     tokenizer = get_tokenizer(
         model,
@@ -2398,18 +1978,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     assert resolved_format == expected_format
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("model", "expected_format"),
-    [("Salesforce/blip2-opt-2.7b", "string"),
-     ("facebook/chameleon-7b", "string"),
-     ("deepseek-ai/deepseek-vl2-tiny", "string"),
-     ("adept/fuyu-8b", "string"),
-     ("google/paligemma-3b-mix-224", "string"),
-     ("Qwen/Qwen-VL", "string"),
-     ("Qwen/Qwen-VL-Chat", "string")],
+    [
+        ("Salesforce/blip2-opt-2.7b", "string"),
+        ("facebook/chameleon-7b", "string"),
+        ("deepseek-ai/deepseek-vl2-tiny", "string"),
+        ("adept/fuyu-8b", "string"),
+        ("google/paligemma-3b-mix-224", "string"),
+        ("Qwen/Qwen-VL", "string"),
+        ("Qwen/Qwen-VL-Chat", "string"),
+    ],
 )
-# yapf: enable
 def test_resolve_content_format_fallbacks(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
@@ -2421,9 +2001,12 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
 
     tokenizer = get_tokenizer(
         model_config.tokenizer,
@@ -2455,29 +2038,30 @@ def test_resolve_content_format_fallbacks(model, expected_format):
     assert resolved_format == expected_format
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("template_path", "expected_format"),
-    [("template_alpaca.jinja", "string"),
-     ("template_baichuan.jinja", "string"),
-     ("template_chatglm.jinja", "string"),
-     ("template_chatglm2.jinja", "string"),
-     ("template_chatml.jinja", "string"),
-     ("template_dse_qwen2_vl.jinja", "openai"),
-     ("template_falcon_180b.jinja", "string"),
-     ("template_falcon.jinja", "string"),
-     ("template_inkbot.jinja", "string"),
-     ("template_teleflm.jinja", "string"),
-     ("template_vlm2vec.jinja", "openai"),
-     ("tool_chat_template_granite_20b_fc.jinja", "string"),
-     ("tool_chat_template_hermes.jinja", "string"),
-     ("tool_chat_template_internlm2_tool.jinja", "string"),
-     ("tool_chat_template_llama3.1_json.jinja", "openai"),
-     ("tool_chat_template_llama3.2_json.jinja", "openai"),
-     ("tool_chat_template_mistral_parallel.jinja", "string"),
-     ("tool_chat_template_mistral.jinja", "string")],
+    [
+        ("template_alpaca.jinja", "string"),
+        ("template_baichuan.jinja", "string"),
+        ("template_chatglm.jinja", "string"),
+        ("template_chatglm2.jinja", "string"),
+        ("template_chatml.jinja", "string"),
+        ("template_dse_qwen2_vl.jinja", "openai"),
+        ("template_falcon_180b.jinja", "string"),
+        ("template_falcon.jinja", "string"),
+        ("template_inkbot.jinja", "string"),
+        ("template_teleflm.jinja", "string"),
+        ("template_vlm2vec_phi3v.jinja", "openai"),
+        ("template_vlm2vec_qwen2vl.jinja", "openai"),
+        ("tool_chat_template_granite_20b_fc.jinja", "string"),
+        ("tool_chat_template_hermes.jinja", "string"),
+        ("tool_chat_template_internlm2_tool.jinja", "string"),
+        ("tool_chat_template_llama3.1_json.jinja", "openai"),
+        ("tool_chat_template_llama3.2_json.jinja", "openai"),
+        ("tool_chat_template_mistral_parallel.jinja", "string"),
+        ("tool_chat_template_mistral.jinja", "string"),
+    ],
 )
-# yapf: enable
 def test_resolve_content_format_examples(template_path, expected_format):
     model_config = ModelConfig(
         PHI3V_MODEL_ID,  # Dummy
@@ -2510,40 +2094,34 @@ def test_resolve_content_format_examples(template_path, expected_format):
     assert resolved_format == expected_format
 
 
-def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
-                                                    mistral_tokenizer):
-    messages = [{
-        "role":
-        "system",
-        "content": [{
-            "type": "text",
-            "text": "You are a helpful assistant."
-        }, {
-            "type":
-            "thinking",
-            "closed":
-            True,
-            "thinking":
-            "Only return the answer when you are confident."
-        }]
-    }, {
-        "role": "user",
-        "content": "What is 2+2?"
-    }, {
-        "role":
-        "assistant",
-        "content": [{
-            "type": "text",
-            "text": "Let me think about it."
-        }, {
-            "type": "thinking",
-            "closed": True,
-            "thinking": "2+2 = 4"
-        }, {
-            "type": "text",
-            "text": "The answer is 4.",
-        }],
-    }]
+def test_parse_chat_messages_include_thinking_chunk(
+    mistral_model_config, mistral_tokenizer
+):
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "thinking",
+                    "closed": True,
+                    "thinking": "Only return the answer when you are confident.",
+                },
+            ],
+        },
+        {"role": "user", "content": "What is 2+2?"},
+        {
+            "role": "assistant",
+            "content": [
+                {"type": "text", "text": "Let me think about it."},
+                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
+                {
+                    "type": "text",
+                    "text": "The answer is 4.",
+                },
+            ],
+        },
+    ]
 
     conversation_with_thinking, _, _ = parse_chat_messages(
         messages,
@@ -2552,122 +2130,80 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
         content_format="openai",
     )
 
-    expected_conversation = [{
-        "role":
-        "system",
-        "content": [{
-            "type": "text",
-            "text": "You are a helpful assistant."
-        }, {
-            "type": "text",
-            "text": "Only return the answer when you are confident."
-        }],
-    }, {
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "text": "What is 2+2?"
-        }],
-    }, {
-        "role":
-        "assistant",
-        "content": [
-            {
-                "type": "text",
-                "text": "Let me think about it."
-            },
-            {
-                "type": "text",
-                "text": "2+2 = 4"
-            },
-            {
-                "type": "text",
-                "text": "The answer is 4."
-            },
-        ]
-    }]
+    expected_conversation = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "text",
+                    "text": "Only return the answer when you are confident.",
+                },
+            ],
+        },
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": "What is 2+2?"}],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {"type": "text", "text": "Let me think about it."},
+                {"type": "text", "text": "2+2 = 4"},
+                {"type": "text", "text": "The answer is 4."},
+            ],
+        },
+    ]
 
     assert conversation_with_thinking == expected_conversation
 
 
 def test_apply_mistral_chat_template_thinking_chunk():
-    # Moved import here to avoid yapf and isort conflicts
-    from vllm.entrypoints.chat_utils import apply_mistral_chat_template
-    messages = [{
-        "role":
-        "system",
-        "content": [{
-            "type": "text",
-            "text": "You are a helpful assistant."
-        }, {
-            "type":
-            "thinking",
-            "closed":
-            True,
-            "thinking":
-            "Only return the answer when you are confident."
-        }]
-    }, {
-        "role": "user",
-        "content": "What is 2+2?"
-    }, {
-        "role":
-        "assistant",
-        "content": [{
-            "type": "text",
-            "text": "Let me think about it."
-        }, {
-            "type": "thinking",
-            "closed": True,
-            "thinking": "2+2 = 4"
-        }, {
-            "type": "text",
-            "text": "The answer is 4.",
-        }],
-    }, {
-        "role": "user",
-        "content": "Thanks, what is 3+3?"
-    }]
-
-    # TODO(Julien): upon model release change to a tokenizer already configured.
-    # =================================================================
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "thinking",
+                    "closed": True,
+                    "thinking": "Only return the answer when you are confident.",
+                },
+            ],
+        },
+        {"role": "user", "content": "What is 2+2?"},
+        {
+            "role": "assistant",
+            "content": [
+                {"type": "text", "text": "Let me think about it."},
+                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
+                {
+                    "type": "text",
+                    "text": "The answer is 4.",
+                },
+            ],
+        },
+        {"role": "user", "content": "Thanks, what is 3+3?"},
+    ]
     mistral_tokenizer = MistralTokenizer.from_pretrained(
-        "mistralai/Devstral-Small-2507")
-    assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
-    # Add think special tokens to the tokenizer
-    mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
-        rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
-    mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
-        rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
-        k: v
-        for k, v in
-        mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
-        if v not in {35, 36}
-    }
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.begin_think.value] = 35
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.end_think.value] = 36
-    mistral_tokenizer.instruct.BEGIN_THINK = 35
-    mistral_tokenizer.instruct.END_THINK = 36
-    # =================================================================
-
-    tokens_ids = apply_mistral_chat_template(mistral_tokenizer,
-                                             messages,
-                                             chat_template=None,
-                                             tools=None)
+        "mistralai/Magistral-Small-2509"
+    )
+
+    tokens_ids = apply_mistral_chat_template(
+        mistral_tokenizer, messages, chat_template=None, tools=None
+    )
 
     string_tokens = mistral_tokenizer.mistral.decode(
-        tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP)
+        tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
+    )
 
     expected_tokens = (
         r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
         r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
         r"[INST]What is 2+2?[/INST]"
         r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
-        r"[INST]Thanks, what is 3+3?[/INST]")
+        r"[INST]Thanks, what is 3+3?[/INST]"
+    )
 
     assert string_tokens == expected_tokens
 
@@ -2678,37 +2214,33 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
 ):
     audio_uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "input_audio",
-                    "input_audio": {},
-                    "uuid": audio_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What does the audio say?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {},
+                        "uuid": audio_uuid,
+                    },
+                    {"type": "text", "text": "What does the audio say?"},
+                ],
+            }
+        ],
         qwen2_audio_model_config,
         qwen2_audio_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
+            "audio say?",
+        }
+    ]
     _assert_mm_data_inputs(mm_data, {"audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="audio",
-                     expected_uuids=[audio_uuid])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
 
 
 @pytest.mark.asyncio
@@ -2718,34 +2250,30 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
 ):
     audio_uuid = "abcd"
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "input_audio",
-                    "input_audio": {},
-                    "uuid": audio_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What does the audio say?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {},
+                        "uuid": audio_uuid,
+                    },
+                    {"type": "text", "text": "What does the audio say?"},
+                ],
+            }
+        ],
         qwen2_audio_model_config,
         qwen2_audio_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
+            "audio say?",
+        }
+    ]
     _assert_mm_data_inputs(await mm_future, {"audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="audio",
-                     expected_uuids=[audio_uuid])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py
index 2afe9758c2adffed4d82d47decaec707cea153b0..31ea856224f900a59f4d7f37d602ed8f317a2ed1 100644
--- a/tests/entrypoints/test_context.py
+++ b/tests/entrypoints/test_context.py
@@ -6,16 +6,14 @@ from unittest.mock import MagicMock, patch
 import pytest
 from openai_harmony import Author, Message, Role, StreamState, TextContent
 
-from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
+from vllm.entrypoints.context import (
+    HarmonyContext,
+    StreamingHarmonyContext,
+    TurnMetrics,
+)
 from vllm.outputs import CompletionOutput, RequestOutput
 
 
-# Helper function for Python < 3.10 compatibility
-async def async_next(async_iterator):
-    """Compatibility function equivalent to Python 3.10's anext()."""
-    return await async_iterator.__anext__()
-
-
 def create_mock_request_output(
     prompt_token_ids=None,
     output_token_ids=None,
@@ -48,10 +46,9 @@ def create_mock_request_output(
     )
 
 
-async def generate_mock_outputs(num_turns,
-                                prompt_token_counts,
-                                output_token_counts,
-                                cached_token_counts=None):
+async def generate_mock_outputs(
+    num_turns, prompt_token_counts, output_token_counts, cached_token_counts=None
+):
     """Generate a sequence of mock RequestOutput objects to simulate multiple
     turns."""
     if cached_token_counts is None:
@@ -73,8 +70,9 @@ async def generate_mock_outputs(num_turns,
 @pytest.fixture
 def mock_parser():
     """Set up a mock parser for tests."""
-    with patch("vllm.entrypoints.context.get_streamable_parser_for_assistant"
-               ) as mock_parser_factory:
+    with patch(
+        "vllm.entrypoints.context.get_streamable_parser_for_assistant"
+    ) as mock_parser_factory:
         # Create a mock parser object
         parser = MagicMock()
         parser.messages = []
@@ -107,8 +105,12 @@ def test_single_turn_token_counting():
 
     # Verify internal state tracking
     assert not context.is_first_turn
-    assert context.previous_turn.input_tokens == 5
-    assert context.previous_turn.output_tokens == 3
+    assert len(context.all_turn_metrics) == 1
+    previous_turn = context.all_turn_metrics[0]
+    assert previous_turn.input_tokens == 5
+    assert previous_turn.output_tokens == 3
+    assert previous_turn.cached_input_tokens == 2
+    assert previous_turn.tool_output_tokens == 0
 
 
 @pytest.mark.asyncio
@@ -124,12 +126,12 @@ async def test_multi_turn_token_counting():
     prompt_token_counts = [5, 15, 20]
     output_token_counts = [3, 4, 5]
     cached_token_counts = [0, 5, 15]
-    mock_generator = generate_mock_outputs(3, prompt_token_counts,
-                                           output_token_counts,
-                                           cached_token_counts)
+    mock_generator = generate_mock_outputs(
+        3, prompt_token_counts, output_token_counts, cached_token_counts
+    )
 
     # First turn - initial prompt and response
-    mock_output1 = await async_next(mock_generator)
+    mock_output1 = await anext(mock_generator)
     context.append_output(mock_output1)
 
     # At this point, we should have 5 prompt tokens and 3 output tokens
@@ -138,7 +140,7 @@ async def test_multi_turn_token_counting():
     assert context.num_tool_output_tokens == 0
 
     # Second turn - after tool output
-    mock_output2 = await async_next(mock_generator)
+    mock_output2 = await anext(mock_generator)
     context.append_output(mock_output2)
     # Current prompt tokens (15) - last_turn_input_tokens (5) -
     # last_turn_output_tokens (3) = 7
@@ -150,7 +152,7 @@ async def test_multi_turn_token_counting():
     assert context.num_cached_tokens == 5
 
     # Third turn - final response
-    mock_output3 = await async_next(mock_generator)
+    mock_output3 = await anext(mock_generator)
     context.append_output(mock_output3)
     # Additional tool output tokens from third turn:
     # Current prompt (20) - last_turn_input_tokens (15) -
@@ -162,6 +164,15 @@ async def test_multi_turn_token_counting():
     assert context.num_tool_output_tokens == expected_tool_output
     assert context.num_cached_tokens == 5 + 15
 
+    # Validate all turn metrics
+    assert len(context.all_turn_metrics) == 3
+    for i, turn in enumerate(context.all_turn_metrics):
+        assert turn.input_tokens == prompt_token_counts[i]
+        assert turn.output_tokens == output_token_counts[i]
+        assert turn.cached_input_tokens == cached_token_counts[i]
+    assert context.all_turn_metrics[1].tool_output_tokens == 7
+    assert context.all_turn_metrics[2].tool_output_tokens == 1
+
 
 def test_empty_output_tokens():
     """Test behavior when RequestOutput has empty output tokens."""
@@ -251,7 +262,7 @@ async def test_single_turn_no_tool_output():
     """Test that first turn never generates tool output tokens."""
     context = HarmonyContext(
         messages=[],
-        available_tools=["browser"]  # Tools available
+        available_tools=["browser"],  # Tools available
     )
 
     # Even with large prompt in first turn, no tool tokens should be counted
@@ -320,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     # Create a streaming context
     context = StreamingHarmonyContext(messages=[], available_tools=["browser"])
 
+    num_prompt_tokens = [3, 8, 13]
+    num_output_tokens = [3, 3, 2]
+    num_cached_tokens = [0, 3, 8]
+
     # Simulate three turns of conversation:
     # Turn 1: stream tokens one by one, then finish the message
     # Turn 2: new prompt, stream more tokens with a reasoning segment
@@ -331,23 +346,26 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
         create_mock_request_output(
             prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
             output_token_ids=[101],  # Single token
-            num_cached_tokens=0,
+            num_cached_tokens=num_cached_tokens[0],
             finished=False,  # Not end of message yet
-        ))
+        )
+    )
 
     # Second token of first turn
     context.append_output(
         create_mock_request_output(
             output_token_ids=[102],
             finished=False,
-        ))
+        )
+    )
 
     # Last token of first turn (finished=True signals end of message)
     context.append_output(
         create_mock_request_output(
             output_token_ids=[103],
             finished=True,  # End of message
-        ))
+        )
+    )
 
     # Check token counts after first turn
     assert context.num_prompt_tokens == 3  # Initial prompt tokens
@@ -362,25 +380,36 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     # First token of second turn
     context.append_output(
         create_mock_request_output(
-            prompt_token_ids=[1, 2, 3, 101, 102, 103, 4,
-                              5],  # 8 tokens (includes previous)
+            prompt_token_ids=[
+                1,
+                2,
+                3,
+                101,
+                102,
+                103,
+                4,
+                5,
+            ],  # 8 tokens (includes previous)
             output_token_ids=[201],
-            num_cached_tokens=3,  # Some tokens cached
+            num_cached_tokens=num_cached_tokens[1],  # Some tokens cached
             finished=False,
-        ))
+        )
+    )
 
     # More tokens in reasoning channel
     context.append_output(
         create_mock_request_output(
             output_token_ids=[202],
             finished=False,
-        ))
+        )
+    )
 
     context.append_output(
         create_mock_request_output(
             output_token_ids=[203],
             finished=True,  # End of reasoning message
-        ))
+        )
+    )
 
     # Check counts after second turn (reasoning message)
     assert context.num_prompt_tokens == 3 + 8  # Initial + second prompt
@@ -399,30 +428,56 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     context.append_output(
         create_mock_request_output(
             prompt_token_ids=[
-                1, 2, 3, 101, 102, 103, 4, 5, 201, 202, 203, 6, 7
+                1,
+                2,
+                3,
+                101,
+                102,
+                103,
+                4,
+                5,
+                201,
+                202,
+                203,
+                6,
+                7,
             ],  # 13 tokens
             output_token_ids=[301],
-            num_cached_tokens=8,  # More cached tokens
+            num_cached_tokens=num_cached_tokens[2],  # More cached tokens
             finished=False,
-        ))
+        )
+    )
 
     context.append_output(
         create_mock_request_output(
             output_token_ids=[302],
             finished=True,
-        ))
+        )
+    )
 
     # Final token counts check
-    assert context.num_prompt_tokens == 3 + 8 + 13  # All prompts
-    assert context.num_output_tokens == 3 + 3 + 2  # All outputs
+    assert context.num_prompt_tokens == sum(num_prompt_tokens)  # All prompts
+    assert context.num_output_tokens == sum(num_output_tokens)  # All outputs
     assert context.num_reasoning_tokens == 3  # Unchanged from second turn
-    assert context.num_cached_tokens == 3 + 8  # Accumulated cached tokens
+    assert context.num_cached_tokens == sum(
+        num_cached_tokens
+    )  # Accumulated cached tokens
 
     # Additional tool tokens from third turn
     # Formula: this turn prompt - last turn prompt - last turn output
     additional_tool_tokens = 13 - 8 - 3  # = 2
-    assert context.num_tool_output_tokens == expected_tool_tokens \
-        + additional_tool_tokens
+    assert (
+        context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens
+    )
+
+    # Validate all turn metrics
+    assert len(context.all_turn_metrics) == 3
+    for i, turn in enumerate(context.all_turn_metrics):
+        assert turn.input_tokens == num_prompt_tokens[i]
+        assert turn.output_tokens == num_output_tokens[i]
+        assert turn.cached_input_tokens == num_cached_tokens[i]
+    assert context.all_turn_metrics[1].tool_output_tokens == 2
+    assert context.all_turn_metrics[2].tool_output_tokens == 2
 
 
 @pytest.mark.asyncio
@@ -442,8 +497,7 @@ async def test_streaming_message_synchronization(mock_parser):
             recipient=Role.ASSISTANT,
         )
     ]
-    context = StreamingHarmonyContext(messages=initial_messages,
-                                      available_tools=[])
+    context = StreamingHarmonyContext(messages=initial_messages, available_tools=[])
 
     # Verify initial state
     assert len(context._messages) == 1
@@ -461,9 +515,10 @@ async def test_streaming_message_synchronization(mock_parser):
 
     # This should trigger the message synchronization logic
     context.append_output(
-        create_mock_request_output(prompt_token_ids=[1, 2, 3],
-                                   output_token_ids=[101],
-                                   finished=False))
+        create_mock_request_output(
+            prompt_token_ids=[1, 2, 3], output_token_ids=[101], finished=False
+        )
+    )
 
     # Verify that messages were synchronized
     assert len(context._messages) == 2
@@ -485,12 +540,13 @@ async def test_streaming_message_synchronization(mock_parser):
             author=Author(role=Role.ASSISTANT, name="assistant"),
             content=[TextContent(text="Response 4")],
             recipient=Role.USER,
-        ))
+        )
+    )
 
     # Create another output to trigger synchronization again
-    mock_output2 = create_mock_request_output(prompt_token_ids=[1, 2, 3],
-                                              output_token_ids=[102],
-                                              finished=True)
+    mock_output2 = create_mock_request_output(
+        prompt_token_ids=[1, 2, 3], output_token_ids=[102], finished=True
+    )
 
     context.append_output(mock_output2)
 
@@ -498,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser):
     assert len(context._messages) == 3
     assert context.num_init_messages == 1
     assert context._messages[2].content[0].text == "Response 4"
+
+
+def test_turn_metrics_copy_and_reset():
+    """Test TurnMetrics copy and reset methods work correctly."""
+    # Create a TurnMetrics with specific values
+    original_metrics = TurnMetrics(
+        input_tokens=10,
+        output_tokens=20,
+        cached_input_tokens=5,
+        tool_output_tokens=3,
+    )
+
+    # Test copy functionality
+    copied_metrics = original_metrics.copy()
+
+    # Verify copy has same values
+    assert copied_metrics.input_tokens == 10
+    assert copied_metrics.output_tokens == 20
+    assert copied_metrics.cached_input_tokens == 5
+    assert copied_metrics.tool_output_tokens == 3
+
+    # Verify they are separate objects
+    assert copied_metrics is not original_metrics
+
+    # Modify copy to ensure independence
+    copied_metrics.input_tokens = 999
+    assert original_metrics.input_tokens == 10  # Original unchanged
+    assert copied_metrics.input_tokens == 999
+
+    # Test reset functionality
+    original_metrics.reset()
+
+    # Verify all fields are reset to zero
+    assert original_metrics.input_tokens == 0
+    assert original_metrics.output_tokens == 0
+    assert original_metrics.cached_input_tokens == 0
+    assert original_metrics.tool_output_tokens == 0
+
+    # Verify copied metrics are unaffected by reset
+    assert copied_metrics.input_tokens == 999
+    assert copied_metrics.output_tokens == 20
+    assert copied_metrics.cached_input_tokens == 5
+    assert copied_metrics.tool_output_tokens == 3
diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/test_harmony_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fa051a678d6883943e288ec9adf350496d60a4b
--- /dev/null
+++ b/tests/entrypoints/test_harmony_utils.py
@@ -0,0 +1,266 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from openai_harmony import Role
+
+from vllm.entrypoints.harmony_utils import (
+    has_custom_tools,
+    parse_input_to_harmony_message,
+)
+
+
+class TestParseInputToHarmonyMessage:
+    """Tests for parse_input_to_harmony_message function."""
+
+    def test_assistant_message_with_tool_calls(self):
+        """Test parsing assistant message with tool calls."""
+        chat_msg = {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "function": {
+                        "name": "get_weather",
+                        "arguments": '{"location": "San Francisco"}',
+                    }
+                },
+                {
+                    "function": {
+                        "name": "search_web",
+                        "arguments": '{"query": "latest news"}',
+                    }
+                },
+            ],
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 2
+
+        # First tool call
+        assert messages[0].author.role == Role.ASSISTANT
+        assert messages[0].content[0].text == '{"location": "San Francisco"}'
+        assert messages[0].channel == "commentary"
+        assert messages[0].recipient == "functions.get_weather"
+        assert messages[0].content_type == "json"
+
+        # Second tool call
+        assert messages[1].author.role == Role.ASSISTANT
+        assert messages[1].content[0].text == '{"query": "latest news"}'
+        assert messages[1].channel == "commentary"
+        assert messages[1].recipient == "functions.search_web"
+        assert messages[1].content_type == "json"
+
+    def test_assistant_message_with_empty_tool_call_arguments(self):
+        """Test parsing assistant message with tool call having None arguments."""
+        chat_msg = {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "function": {
+                        "name": "get_current_time",
+                        "arguments": None,
+                    }
+                }
+            ],
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].content[0].text == ""
+        assert messages[0].recipient == "functions.get_current_time"
+
+    def test_tool_message_with_string_content(self):
+        """Test parsing tool message with string content."""
+        chat_msg = {
+            "role": "tool",
+            "name": "get_weather",
+            "content": "The weather in San Francisco is sunny, 72°F",
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.TOOL
+        assert messages[0].author.name == "functions.get_weather"
+        assert (
+            messages[0].content[0].text == "The weather in San Francisco is sunny, 72°F"
+        )
+        assert messages[0].channel == "commentary"
+
+    def test_tool_message_with_array_content(self):
+        """Test parsing tool message with array content."""
+        chat_msg = {
+            "role": "tool",
+            "name": "search_results",
+            "content": [
+                {"type": "text", "text": "Result 1: "},
+                {"type": "text", "text": "Result 2: "},
+                {
+                    "type": "image",
+                    "url": "http://example.com/img.png",
+                },  # Should be ignored
+                {"type": "text", "text": "Result 3"},
+            ],
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.TOOL
+        assert messages[0].content[0].text == "Result 1: Result 2: Result 3"
+
+    def test_tool_message_with_empty_content(self):
+        """Test parsing tool message with None content."""
+        chat_msg = {
+            "role": "tool",
+            "name": "empty_tool",
+            "content": None,
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].content[0].text == ""
+
+    def test_system_message(self):
+        """Test parsing system message."""
+        chat_msg = {
+            "role": "system",
+            "content": "You are a helpful assistant",
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        # System messages are converted using Message.from_dict
+        # which should preserve the role
+        assert messages[0].author.role == Role.SYSTEM
+
+    def test_developer_message(self):
+        """Test parsing developer message."""
+        chat_msg = {
+            "role": "developer",
+            "content": "Use concise language",
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.DEVELOPER
+
+    def test_user_message_with_string_content(self):
+        """Test parsing user message with string content."""
+        chat_msg = {
+            "role": "user",
+            "content": "What's the weather in San Francisco?",
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.USER
+        assert messages[0].content[0].text == "What's the weather in San Francisco?"
+
+    def test_user_message_with_array_content(self):
+        """Test parsing user message with array content."""
+        chat_msg = {
+            "role": "user",
+            "content": [
+                {"text": "What's in this image? "},
+                {"text": "Please describe it."},
+            ],
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.USER
+        assert len(messages[0].content) == 2
+        assert messages[0].content[0].text == "What's in this image? "
+        assert messages[0].content[1].text == "Please describe it."
+
+    def test_assistant_message_with_string_content(self):
+        """Test parsing assistant message with string content (no tool calls)."""
+        chat_msg = {
+            "role": "assistant",
+            "content": "Hello! How can I help you today?",
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.ASSISTANT
+        assert messages[0].content[0].text == "Hello! How can I help you today?"
+
+    def test_pydantic_model_input(self):
+        """Test parsing Pydantic model input (has model_dump method)."""
+
+        class MockPydanticModel:
+            def model_dump(self, exclude_none=True):
+                return {
+                    "role": "user",
+                    "content": "Test message",
+                }
+
+        chat_msg = MockPydanticModel()
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.USER
+        assert messages[0].content[0].text == "Test message"
+
+    def test_message_with_empty_content(self):
+        """Test parsing message with empty string content."""
+        chat_msg = {
+            "role": "user",
+            "content": "",
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].content[0].text == ""
+
+    def test_tool_call_with_missing_function_fields(self):
+        """Test parsing tool call with missing name or arguments."""
+        chat_msg = {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "function": {}  # Missing both name and arguments
+                }
+            ],
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].recipient == "functions."
+        assert messages[0].content[0].text == ""
+
+    def test_array_content_with_missing_text(self):
+        """Test parsing array content where text field is missing."""
+        chat_msg = {
+            "role": "user",
+            "content": [
+                {},  # Missing text field
+                {"text": "actual text"},
+            ],
+        }
+
+        messages = parse_input_to_harmony_message(chat_msg)
+
+        assert len(messages) == 1
+        assert len(messages[0].content) == 2
+        assert messages[0].content[0].text == ""
+        assert messages[0].content[1].text == "actual text"
+
+
+def test_has_custom_tools() -> None:
+    assert not has_custom_tools(set())
+    assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"})
+    assert has_custom_tools({"others"})
+    assert has_custom_tools(
+        {"web_search_preview", "code_interpreter", "container", "others"}
+    )
diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py
index 1f55b1fba613ba84ec2e7e898c902209d979b32a..b0ef3dd045bdfab82676093b35c317b4924aa1a1 100644
--- a/tests/entrypoints/test_renderer.py
+++ b/tests/entrypoints/test_renderer.py
@@ -3,7 +3,6 @@
 
 import io
 from dataclasses import dataclass
-from typing import Optional
 from unittest.mock import AsyncMock, MagicMock
 
 import pybase64
@@ -17,11 +16,11 @@ from vllm.inputs.data import is_embeds_prompt
 @dataclass
 class MockModelConfig:
     max_model_len: int = 100
-    encoder_config: Optional[dict] = None
+    encoder_config: dict | None = None
+    enable_prompt_embeds: bool = True
 
 
 class MockTokenizerResult:
-
     def __init__(self, input_ids):
         self.input_ids = input_ids
 
@@ -45,9 +44,11 @@ def mock_async_tokenizer():
 
 @pytest.fixture
 def renderer(mock_model_config, mock_tokenizer):
-    return CompletionRenderer(model_config=mock_model_config,
-                              tokenizer=mock_tokenizer,
-                              async_tokenizer_pool={})
+    return CompletionRenderer(
+        model_config=mock_model_config,
+        tokenizer=mock_tokenizer,
+        async_tokenizer_pool={},
+    )
 
 
 class TestRenderPrompt:
@@ -57,7 +58,8 @@ class TestRenderPrompt:
     async def test_token_input(self, renderer):
         tokens = [101, 7592, 2088]
         results = await renderer.render_prompt(
-            prompt_or_prompts=tokens, config=RenderConfig(max_length=100))
+            prompt_or_prompts=tokens, config=RenderConfig(max_length=100)
+        )
 
         assert len(results) == 1
         assert results[0]["prompt_token_ids"] == tokens
@@ -66,7 +68,8 @@ class TestRenderPrompt:
     async def test_token_list_input(self, renderer):
         token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]]
         results = await renderer.render_prompt(
-            prompt_or_prompts=token_lists, config=RenderConfig(max_length=100))
+            prompt_or_prompts=token_lists, config=RenderConfig(max_length=100)
+        )
 
         assert len(results) == 3
         assert results[0]["prompt_token_ids"] == [101, 7592, 2088]
@@ -75,14 +78,12 @@ class TestRenderPrompt:
 
     @pytest.mark.asyncio
     async def test_text_input(self, renderer, mock_async_tokenizer):
-        mock_async_tokenizer.return_value = MockTokenizerResult(
-            [101, 7592, 2088])
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+        mock_async_tokenizer.return_value = MockTokenizerResult([101, 7592, 2088])
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
         results = await renderer.render_prompt(
-            prompt_or_prompts="Hello world",
-            config=RenderConfig(max_length=100))
+            prompt_or_prompts="Hello world", config=RenderConfig(max_length=100)
+        )
 
         assert len(results) == 1
         assert results[0]["prompt_token_ids"] == [101, 7592, 2088]
@@ -90,15 +91,13 @@ class TestRenderPrompt:
 
     @pytest.mark.asyncio
     async def test_text_list_input(self, renderer, mock_async_tokenizer):
-        mock_async_tokenizer.return_value = MockTokenizerResult(
-            [101, 7592, 2088])
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+        mock_async_tokenizer.return_value = MockTokenizerResult([101, 7592, 2088])
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
         text_list_input = ["Hello world", "How are you?", "Good morning"]
         results = await renderer.render_prompt(
-            prompt_or_prompts=text_list_input,
-            config=RenderConfig(max_length=100))
+            prompt_or_prompts=text_list_input, config=RenderConfig(max_length=100)
+        )
 
         assert len(results) == 3
         for result in results:
@@ -107,31 +106,31 @@ class TestRenderPrompt:
 
     @pytest.mark.asyncio
     async def test_no_truncation(self, renderer, mock_async_tokenizer):
-        mock_async_tokenizer.return_value = MockTokenizerResult(
-            [101, 7592, 2088])
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+        mock_async_tokenizer.return_value = MockTokenizerResult([101, 7592, 2088])
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
         results = await renderer.render_prompt(
-            prompt_or_prompts="Hello world",
-            config=RenderConfig(max_length=100))
+            prompt_or_prompts="Hello world", config=RenderConfig(max_length=100)
+        )
 
         assert len(results) == 1
         call_args = mock_async_tokenizer.call_args
-        assert "truncation" not in call_args.kwargs or call_args.kwargs[
-            "truncation"] is False
+        assert (
+            "truncation" not in call_args.kwargs
+            or call_args.kwargs["truncation"] is False
+        )
 
     @pytest.mark.asyncio
     async def test_truncation_positive(self, renderer, mock_async_tokenizer):
         mock_async_tokenizer.return_value = MockTokenizerResult(
-            [101, 7592, 2088])  # Truncated
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+            [101, 7592, 2088]
+        )  # Truncated
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
-        results = await renderer.render_prompt(prompt_or_prompts="Hello world",
-                                               config=RenderConfig(
-                                                   max_length=100,
-                                                   truncate_prompt_tokens=50))
+        results = await renderer.render_prompt(
+            prompt_or_prompts="Hello world",
+            config=RenderConfig(max_length=100, truncate_prompt_tokens=50),
+        )
 
         assert len(results) == 1
         call_args = mock_async_tokenizer.call_args
@@ -142,14 +141,14 @@ class TestRenderPrompt:
     async def test_truncation_negative(self, renderer, mock_async_tokenizer):
         # Test that negative truncation uses model's max_model_len
         mock_async_tokenizer.return_value = MockTokenizerResult(
-            [101, 7592, 2088])  # Truncated to max_model_len
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+            [101, 7592, 2088]
+        )  # Truncated to max_model_len
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
-        results = await renderer.render_prompt(prompt_or_prompts="Hello world",
-                                               config=RenderConfig(
-                                                   max_length=200,
-                                                   truncate_prompt_tokens=-1))
+        results = await renderer.render_prompt(
+            prompt_or_prompts="Hello world",
+            config=RenderConfig(max_length=200, truncate_prompt_tokens=-1),
+        )
 
         assert len(results) == 1
         call_args = mock_async_tokenizer.call_args
@@ -159,12 +158,11 @@ class TestRenderPrompt:
     @pytest.mark.asyncio
     async def test_token_truncation_last_elements(self, renderer):
         # Test that token truncation keeps the last N elements
-        long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108,
-                       109]  # 10 tokens
-        results = await renderer.render_prompt(prompt_or_prompts=long_tokens,
-                                               config=RenderConfig(
-                                                   max_length=100,
-                                                   truncate_prompt_tokens=5))
+        long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]  # 10 tokens
+        results = await renderer.render_prompt(
+            prompt_or_prompts=long_tokens,
+            config=RenderConfig(max_length=100, truncate_prompt_tokens=5),
+        )
 
         assert len(results) == 1
         # Should keep the last 5 tokens: [105, 106, 107, 108, 109]
@@ -175,30 +173,30 @@ class TestRenderPrompt:
         long_tokens = list(range(150))  # Exceeds max_model_len=100
 
         with pytest.raises(ValueError, match="maximum context length"):
-            await renderer.render_prompt(prompt_or_prompts=long_tokens,
-                                         config=RenderConfig(max_length=100))
+            await renderer.render_prompt(
+                prompt_or_prompts=long_tokens, config=RenderConfig(max_length=100)
+            )
 
     @pytest.mark.asyncio
     async def test_no_tokenizer_for_text(self, mock_model_config):
         renderer_no_tokenizer = CompletionRenderer(
-            model_config=mock_model_config,
-            tokenizer=None,
-            async_tokenizer_pool={})
+            model_config=mock_model_config, tokenizer=None, async_tokenizer_pool={}
+        )
 
         with pytest.raises(ValueError, match="No tokenizer available"):
             await renderer_no_tokenizer.render_prompt(
-                prompt_or_prompts="Hello world",
-                config=RenderConfig(max_length=100))
+                prompt_or_prompts="Hello world", config=RenderConfig(max_length=100)
+            )
 
     @pytest.mark.asyncio
     async def test_token_input_with_needs_detokenization(
-            self, renderer, mock_async_tokenizer):
+        self, renderer, mock_async_tokenizer
+    ):
         # When needs_detokenization=True for token inputs, renderer should
         # use the async tokenizer to decode and include the original text
         # in the returned prompt object.
         mock_async_tokenizer.decode = AsyncMock(return_value="decoded text")
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
         tokens = [1, 2, 3, 4]
         results = await renderer.render_prompt(
@@ -213,7 +211,6 @@ class TestRenderPrompt:
 
 
 class TestRenderEmbedPrompt:
-
     def _create_test_embed_bytes(self, tensor: torch.Tensor) -> bytes:
         """Helper to create base64-encoded tensor bytes"""
         buffer = io.BytesIO()
@@ -244,9 +241,7 @@ class TestRenderEmbedPrompt:
             torch.randn(8, 512, dtype=torch.float32),
             torch.randn(12, 512, dtype=torch.float32),
         ]
-        embed_bytes_list = [
-            self._create_test_embed_bytes(t) for t in test_tensors
-        ]
+        embed_bytes_list = [self._create_test_embed_bytes(t) for t in test_tensors]
 
         results = await renderer.render_prompt_and_embeds(
             prompt_embeds=embed_bytes_list,
@@ -307,13 +302,10 @@ class TestRenderEmbedPrompt:
         assert results[0]["prompt_embeds"].shape == (10, 768)
 
     @pytest.mark.asyncio
-    async def test_both_prompts_and_embeds(self, renderer,
-                                           mock_async_tokenizer):
+    async def test_both_prompts_and_embeds(self, renderer, mock_async_tokenizer):
         # Set up text tokenization
-        mock_async_tokenizer.return_value = MockTokenizerResult(
-            [101, 102, 103])
-        renderer.async_tokenizer_pool[
-            renderer.tokenizer] = mock_async_tokenizer
+        mock_async_tokenizer.return_value = MockTokenizerResult([101, 102, 103])
+        renderer.async_tokenizer_pool[renderer.tokenizer] = mock_async_tokenizer
 
         # Create embed
         test_tensor = torch.randn(5, 256, dtype=torch.float32)
diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..48bf06088bc056c67700a8dc3ddfe6284928b05c
--- /dev/null
+++ b/tests/entrypoints/test_responses_utils.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.entrypoints.responses_utils import (
+    convert_tool_responses_to_completions_format,
+)
+
+
+class TestResponsesUtils:
+    """Tests for convert_tool_responses_to_completions_format function."""
+
+    def test_convert_tool_responses_to_completions_format(self):
+        """Test basic conversion of a flat tool schema to nested format."""
+        input_tool = {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"},
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location", "unit"],
+            },
+        }
+
+        result = convert_tool_responses_to_completions_format(input_tool)
+
+        assert result == {"type": "function", "function": input_tool}
diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/test_ssl_cert_refresher.py
index 33ad2cfd3a33aaa33cee952dd280bd868ee6860c..b56fbd9fee7e018061da0be864fa003b3f2fde6b 100644
--- a/tests/entrypoints/test_ssl_cert_refresher.py
+++ b/tests/entrypoints/test_ssl_cert_refresher.py
@@ -11,7 +11,6 @@ from vllm.entrypoints.ssl import SSLCertRefresher
 
 
 class MockSSLContext(SSLContext):
-
     def __init__(self):
         self.load_cert_chain_count = 0
         self.load_ca_count = 0
@@ -34,7 +33,7 @@ class MockSSLContext(SSLContext):
 
 
 def create_file() -> str:
-    with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as f:
+    with tempfile.NamedTemporaryFile(dir="/tmp", delete=False) as f:
         return f.name
 
 
diff --git a/tests/evals/gpt_oss/__init__.py b/tests/evals/gpt_oss/__init__.py
index 0fec1fe5bcdfd0b2603420e9ac87122fc32bda1a..208f01a7cb5ee04c88d276fec2082cd4e830884b 100644
--- a/tests/evals/gpt_oss/__init__.py
+++ b/tests/evals/gpt_oss/__init__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/evals/gpt_oss/conftest.py b/tests/evals/gpt_oss/conftest.py
index 35528c0a6a36a5788d741e1a7588b3280a03d0d5..2f140ae2c8e9bfb419e256f12766da773a0a203a 100644
--- a/tests/evals/gpt_oss/conftest.py
+++ b/tests/evals/gpt_oss/conftest.py
@@ -8,11 +8,9 @@ Pytest configuration for GPT-OSS evaluation tests.
 def pytest_addoption(parser):
     """Add command line options for pytest."""
     parser.addoption("--model", action="store", help="Model name to evaluate")
-    parser.addoption("--metric",
-                     action="store",
-                     type=float,
-                     help="Expected metric threshold")
-    parser.addoption("--server-args",
-                     action="store",
-                     default="",
-                     help="Additional server arguments")
+    parser.addoption(
+        "--metric", action="store", type=float, help="Expected metric threshold"
+    )
+    parser.addoption(
+        "--server-args", action="store", default="", help="Additional server arguments"
+    )
diff --git a/tests/evals/gpt_oss/test_gpqa_correctness.py b/tests/evals/gpt_oss/test_gpqa_correctness.py
index 4cc4041a60ce7fa26a126a65cf592ed9e3ad9448..151deaa059f0d5cef778c8491ce30782497c43db 100644
--- a/tests/evals/gpt_oss/test_gpqa_correctness.py
+++ b/tests/evals/gpt_oss/test_gpqa_correctness.py
@@ -25,8 +25,19 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
 
     # Build the command to run the evaluation
     cmd = [
-        sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
-        model_name, "--reasoning-effort", "low", "--base-url", base_url
+        sys.executable,
+        "-m",
+        "gpt_oss.evals",
+        "--eval",
+        "gpqa",
+        "--model",
+        model_name,
+        "--reasoning-effort",
+        "low",
+        "--base-url",
+        base_url,
+        "--n-threads",
+        "200",
     ]
 
     try:
@@ -36,7 +47,8 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
             text=True,
             capture_output=True,
             timeout=1800,  # 30 minute timeout
-            env={"OPENAI_API_KEY": "dummy"})
+            env={"OPENAI_API_KEY": "dummy"},
+        )
 
         print("Evaluation process output:\n", result.stdout)
 
@@ -47,14 +59,16 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
 
         # If we still can't find it, raise an error
         raise ValueError(
-            f"Could not parse score from evaluation output:\n{result.stdout}")
+            f"Could not parse score from evaluation output:\n{result.stdout}"
+        )
 
     except subprocess.TimeoutExpired as e:
         raise RuntimeError("Evaluation timed out") from e
     except subprocess.CalledProcessError as e:
         raise RuntimeError(
             f"Evaluation failed with exit code {e.returncode}:\n"
-            f"stdout: {e.stdout}\nstderr: {e.stderr}") from e
+            f"stdout: {e.stdout}\nstderr: {e.stderr}"
+        ) from e
 
 
 def test_gpqa_correctness(request):
@@ -71,19 +85,20 @@ def test_gpqa_correctness(request):
         server_args = server_args_str.split()
 
     # Add standard server arguments
-    server_args.extend([
-        "--max-model-len",
-        "32768",
-        "--trust-remote-code",
-    ])
+    server_args.extend(
+        [
+            "--trust-remote-code",
+        ]
+    )
 
     print(f"Starting GPQA evaluation for model: {model_name}")
     print(f"Expected metric threshold: {expected_metric}")
     print(f"Server args: {' '.join(server_args)}")
 
     # Launch server and run evaluation
-    with RemoteOpenAIServer(model_name, server_args,
-                            max_wait_seconds=1800) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, max_wait_seconds=1800
+    ) as remote_server:
         base_url = remote_server.url_for("v1")
         print(f"Server started at: {base_url}")
 
@@ -97,6 +112,7 @@ def test_gpqa_correctness(request):
         # Verify metric is within tolerance
         assert measured_metric >= expected_metric - TOL, (
             f"GPQA metric too low: {measured_metric:.4f} < "
-            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}")
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
+        )
 
         print(f"✅ GPQA test passed for {model_name}")
diff --git a/tests/evals/gsm8k/__init__.py b/tests/evals/gsm8k/__init__.py
index 0fec1fe5bcdfd0b2603420e9ac87122fc32bda1a..208f01a7cb5ee04c88d276fec2082cd4e830884b 100644
--- a/tests/evals/gsm8k/__init__.py
+++ b/tests/evals/gsm8k/__init__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
index c5dbceeeb2b452b77b40e00cef02a237ffd66040..9297bf6ddf2d3ce107ccf1fe70780fb1093ba566 100644
--- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+max_model_len: 4096
diff --git a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6b7bdd1e65bb31599a7f3784d3addce25ecc487e
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
@@ -0,0 +1,6 @@
+model_name: "nvidia/Qwen3-30B-A3B-FP4"
+accuracy_threshold: 0.89
+num_questions: 1319
+num_fewshot: 5
+max_model_len: 4096
+
diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3c9b1084de7bc288f6bf7ac2c99bf97a16ae8c01
--- /dev/null
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@@ -0,0 +1,5 @@
+Qwen3-0.6B-FP8.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-CT.yaml
+DeepSeek-V2-Lite-Instruct-FP8.yaml
+Qwen3-30B-A3B-NVFP4.yaml
diff --git a/tests/evals/gsm8k/conftest.py b/tests/evals/gsm8k/conftest.py
index d96b0a66ede2b18313dd6036de7ee5bf4a231313..1932a13cdfc63bbe57c44e9067974e5664130bea 100644
--- a/tests/evals/gsm8k/conftest.py
+++ b/tests/evals/gsm8k/conftest.py
@@ -6,13 +6,12 @@ from pathlib import Path
 
 def pytest_addoption(parser):
     """Add custom command line options."""
-    parser.addoption("--config-list-file",
-                     default="configs/models-small.txt",
-                     help="File containing list of config files to test")
-    parser.addoption("--tp-size",
-                     default=1,
-                     type=int,
-                     help="Tensor parallel size")
+    parser.addoption(
+        "--config-list-file",
+        default="configs/models-small.txt",
+        help="File containing list of config files to test",
+    )
+    parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
 
 
 def pytest_generate_tests(metafunc):
@@ -55,12 +54,10 @@ def pytest_generate_tests(metafunc):
 
         # Generate test parameters
         if config_files:
-            metafunc.parametrize(["config_filename", "tp_size"],
-                                 [(config_file, int(tp_size))
-                                  for config_file in config_files],
-                                 ids=[
-                                     f"{config_file.stem}-tp{tp_size}"
-                                     for config_file in config_files
-                                 ])
+            metafunc.parametrize(
+                ["config_filename", "tp_size"],
+                [(config_file, int(tp_size)) for config_file in config_files],
+                ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files],
+            )
         else:
             print("No config files found, test will be skipped")
diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py
index 7d0ce25f75dd42dc66597a3b0b684b350844a3ac..0421f8bb18592d39ad3f5ff56112579c1d6c5f04 100644
--- a/tests/evals/gsm8k/gsm8k_eval.py
+++ b/tests/evals/gsm8k/gsm8k_eval.py
@@ -12,7 +12,6 @@ import json
 import os
 import time
 from collections.abc import Generator
-from typing import Optional, Union
 
 import aiohttp
 import numpy as np
@@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
 INVALID = -9999999
 
 
-def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
+def download_and_cache_file(url: str, filename: str | None = None) -> str:
     """Download and cache a file from a URL."""
     if filename is None:
         filename = os.path.join("/tmp", url.split("/")[-1])
@@ -76,14 +75,20 @@ def get_answer_value(answer_str: str) -> int:
         return INVALID
 
 
-async def call_vllm_api(session: aiohttp.ClientSession,
-                        prompt: str,
-                        temperature: float,
-                        max_tokens: int,
-                        stop: Optional[list[str]] = None,
-                        url: Optional[str] = None,
-                        seed: Optional[int] = None) -> str:
-    """Call vLLM's OpenAI-compatible completions endpoint."""
+async def call_vllm_api(
+    session: aiohttp.ClientSession,
+    prompt: str,
+    temperature: float,
+    max_tokens: int,
+    stop: list[str] | None = None,
+    url: str | None = None,
+    seed: int | None = None,
+) -> tuple[str, int]:
+    """Call vLLM's OpenAI-compatible completions endpoint.
+
+    Returns:
+        Tuple of (response_text, completion_tokens)
+    """
     data = {
         "prompt": prompt,
         "temperature": temperature,
@@ -94,26 +99,29 @@ async def call_vllm_api(session: aiohttp.ClientSession,
         data["seed"] = seed
 
     try:
-        async with session.post(f"{url}/v1/completions",
-                                json=data) as response:
+        async with session.post(f"{url}/v1/completions", json=data) as response:
             response.raise_for_status()
             result = await response.json()
-            return result["choices"][0]["text"]
+            text = result["choices"][0]["text"]
+            completion_tokens = result.get("usage", {}).get("completion_tokens", 0)
+            return text, completion_tokens
     except Exception as e:
         print(f"Error calling vLLM API: {e}")
-        return ""
-
-
-def evaluate_gsm8k(num_questions: int = 1319,
-                   num_shots: int = 5,
-                   max_tokens: int = 256,
-                   host: str = "http://127.0.0.1",
-                   port: int = 8000,
-                   temperature: float = 0.0,
-                   seed: Optional[int] = 42) -> dict[str, Union[float, int]]:
+        return "", 0
+
+
+def evaluate_gsm8k(
+    num_questions: int = 1319,
+    num_shots: int = 5,
+    max_tokens: int = 256,
+    host: str = "http://127.0.0.1",
+    port: int = 8000,
+    temperature: float = 0.0,
+    seed: int | None = 42,
+) -> dict[str, float | int]:
     """
     Evaluate GSM8K accuracy using vLLM serve endpoint.
-    
+
     Returns dict with accuracy, invalid_rate, latency, etc.
     """
     base_url = f"{host}:{port}"
@@ -127,8 +135,10 @@ def evaluate_gsm8k(num_questions: int = 1319,
     # Build few-shot examples from train split (like lm-eval does)
     few_shot_examples = ""
     for i in range(num_shots):
-        few_shot_examples += (f"Question: {train_data[i]['question']}\n"
-                              f"Answer: {train_data[i]['answer']}\n\n")
+        few_shot_examples += (
+            f"Question: {train_data[i]['question']}\n"
+            f"Answer: {train_data[i]['answer']}\n\n"
+        )
 
     # Prepare test questions and labels from test split
     questions = []
@@ -142,10 +152,11 @@ def evaluate_gsm8k(num_questions: int = 1319,
     # Run evaluation
     async def run_async_evaluation():
         states: list[str] = [""] * num_questions
+        output_tokens: list[int] = [0] * num_questions
 
-        async def get_answer(session: aiohttp.ClientSession, i: int) -> str:
+        async def get_answer(session: aiohttp.ClientSession, i: int) -> tuple[str, int]:
             prompt = few_shot_examples + questions[i]
-            answer = await call_vllm_api(
+            answer, tokens = await call_vllm_api(
                 session=session,
                 prompt=prompt,
                 temperature=temperature,
@@ -155,32 +166,37 @@ def evaluate_gsm8k(num_questions: int = 1319,
                 seed=seed,
             )
             states[i] = answer
-            return answer
+            output_tokens[i] = tokens
+            return answer, tokens
 
-        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
-                total=600)) as session:
+        async with aiohttp.ClientSession(
+            timeout=aiohttp.ClientTimeout(total=600)
+        ) as session:
             tasks = [get_answer(session, i) for i in range(num_questions)]
             await tqdm.gather(*tasks, desc="Evaluating")
 
-        return states
+        return states, output_tokens
 
-    print(f"Running GSM8K evaluation: {num_questions} questions, "
-          f"{num_shots}-shot")
+    print(f"Running GSM8K evaluation: {num_questions} questions, {num_shots}-shot")
 
     tic = time.perf_counter()
-    states = asyncio.run(run_async_evaluation())
+    states, output_tokens = asyncio.run(run_async_evaluation())
     latency = time.perf_counter() - tic
 
     # Compute metrics
     preds = [get_answer_value(state) for state in states]
     accuracy = np.mean(np.array(preds) == np.array(labels))
     invalid_rate = np.mean(np.array(preds) == INVALID)
+    total_output_tokens = sum(output_tokens)
+    tokens_per_second = total_output_tokens / latency if latency > 0 else 0.0
 
     result = {
         "accuracy": accuracy,
         "invalid_rate": invalid_rate,
         "latency": latency,
         "questions_per_second": num_questions / latency,
+        "total_output_tokens": total_output_tokens,
+        "tokens_per_second": tokens_per_second,
         "num_questions": num_questions,
         "num_shots": num_shots,
         "max_tokens": max_tokens,
@@ -191,36 +207,28 @@ def evaluate_gsm8k(num_questions: int = 1319,
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="GSM8K evaluation for vLLM serve")
-    parser.add_argument("--num-shots",
-                        type=int,
-                        default=5,
-                        help="Number of few-shot examples")
-    parser.add_argument("--num-questions",
-                        type=int,
-                        default=1319,
-                        help="Number of questions to evaluate")
-    parser.add_argument("--max-tokens",
-                        type=int,
-                        default=256,
-                        help="Max tokens for generation")
-    parser.add_argument("--host",
-                        type=str,
-                        default="http://127.0.0.1",
-                        help="Host URL")
+    parser = argparse.ArgumentParser(description="GSM8K evaluation for vLLM serve")
+    parser.add_argument(
+        "--num-shots", type=int, default=5, help="Number of few-shot examples"
+    )
+    parser.add_argument(
+        "--num-questions",
+        type=int,
+        default=1319,
+        help="Number of questions to evaluate",
+    )
+    parser.add_argument(
+        "--max-tokens", type=int, default=256, help="Max tokens for generation"
+    )
+    parser.add_argument("--host", type=str, default="http://127.0.0.1", help="Host URL")
     parser.add_argument("--port", type=int, default=8000, help="Port number")
-    parser.add_argument("--temperature",
-                        type=float,
-                        default=0.0,
-                        help="Temperature for generation")
-    parser.add_argument("--seed",
-                        type=int,
-                        default=42,
-                        help="Random seed for reproducibility")
-    parser.add_argument("--save-results",
-                        type=str,
-                        help="Save results to JSON file")
+    parser.add_argument(
+        "--temperature", type=float, default=0.0, help="Temperature for generation"
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for reproducibility"
+    )
+    parser.add_argument("--save-results", type=str, help="Save results to JSON file")
 
     args = parser.parse_args()
 
@@ -240,6 +248,8 @@ def main() -> None:
     print(f"Invalid responses: {result['invalid_rate']:.3f}")
     print(f"Total latency: {result['latency']:.3f} s")
     print(f"Questions per second: {result['questions_per_second']:.3f}")
+    print(f"Total output tokens: {result['total_output_tokens']}")
+    print(f"Output tokens per second: {result['tokens_per_second']:.3f}")
 
     # Optional file saving
     if args.save_results:
diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py
index a12dd49dbea6d7b0f345500b374e483e9060b2d3..b5d67df7bf3db48f40e1a96bd938dd23f3a19fba 100644
--- a/tests/evals/gsm8k/test_gsm8k_correctness.py
+++ b/tests/evals/gsm8k/test_gsm8k_correctness.py
@@ -62,10 +62,12 @@ def test_gsm8k_correctness_param(config_filename, tp_size):
         str(tp_size),
     ]
 
+    env_dict = eval_config.get("env", None)
+
     # Launch server and run evaluation
-    with RemoteOpenAIServer(eval_config["model_name"],
-                            server_args,
-                            max_wait_seconds=480) as remote_server:
+    with RemoteOpenAIServer(
+        eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
+    ) as remote_server:
         server_url = remote_server.url_for("v1")
 
         results = launch_gsm8k_eval(eval_config, server_url, tp_size)
@@ -85,6 +87,7 @@ def test_gsm8k_correctness_param(config_filename, tp_size):
         # Verify accuracy is within tolerance
         assert measured_accuracy >= expected_accuracy - RTOL, (
             f"Accuracy too low: {measured_accuracy:.3f} < "
-            f"{expected_accuracy:.3f} - {RTOL:.3f}")
+            f"{expected_accuracy:.3f} - {RTOL:.3f}"
+        )
 
         print(f"✅ GSM8K test passed for {eval_config['model_name']}")
diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py
index 9d65159bf64fe11f212fb25da6cf671d7ded72e9..6561e9556fa7ad42dbc3747b3e13195dc9526345 100644
--- a/tests/kernels/allclose_default.py
+++ b/tests/kernels/allclose_default.py
@@ -6,11 +6,7 @@ import torch
 # Reference default values of atol and rtol are from
 # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
 default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
-default_rtol = {
-    torch.float16: 1e-3,
-    torch.bfloat16: 1.6e-2,
-    torch.float: 1.3e-6
-}
+default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6}
 
 
 def get_default_atol(output) -> float:
diff --git a/tests/kernels/attention/conftest.py b/tests/kernels/attention/conftest.py
index 88a2fb62b2540f7ff06c7d32a9657e416912ef42..e520267320c0bca27835c97eaf433492f39a084b 100644
--- a/tests/kernels/attention/conftest.py
+++ b/tests/kernels/attention/conftest.py
@@ -3,8 +3,10 @@
 
 import pytest
 
-from vllm.utils import (create_kv_caches_with_random,
-                        create_kv_caches_with_random_flash)
+from vllm.utils.torch_utils import (
+    create_kv_caches_with_random,
+    create_kv_caches_with_random_flash,
+)
 
 
 @pytest.fixture()
diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py
index 2d882bdf4066fd913c22410bed6037cae8d9debd..1dec46e33f22e69149ae51ab1d6681fa3dcddbb8 100644
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -27,8 +26,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -39,7 +38,7 @@ def ref_paged_attn(
     for i in range(num_seqs):
         query_len = query_lens[i]
         kv_len = kv_lens[i]
-        q = query[start_idx:start_idx + query_len]
+        q = query[start_idx : start_idx + query_len]
         q *= scale
 
         num_kv_blocks = (kv_len + block_size - 1) // block_size
@@ -57,10 +56,13 @@ def ref_paged_attn(
         empty_mask = torch.ones(query_len, kv_len)
         mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
         if sliding_window is not None:
-            sliding_window_mask = torch.triu(empty_mask,
-                                             diagonal=kv_len -
-                                             (query_len + sliding_window) +
-                                             1).bool().logical_not()
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
             mask |= sliding_window_mask
         if soft_cap is not None:
             attn = soft_cap * torch.tanh(attn / soft_cap)
@@ -74,11 +76,10 @@ def ref_paged_attn(
     return torch.cat(outputs, dim=0)
 
 
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="Only ROCm is supported")
-@pytest.mark.parametrize("seq_lens",
-                         [[(10, 1328), (5, 18),
-                           (129, 463)], [(8, 523), (24, 37), (3, 2011)]])
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="Only ROCm is supported")
+@pytest.mark.parametrize(
+    "seq_lens", [[(10, 1328), (5, 18), (129, 463)], [(8, 523), (24, 37), (3, 2011)]]
+)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
@@ -92,12 +93,12 @@ def test_varlen_with_paged_kv(
     seq_lens: list[tuple[int, int]],
     num_heads: tuple[int, int],
     head_size: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
@@ -109,34 +110,27 @@ def test_varlen_with_paged_kv(
     assert num_query_heads % num_kv_heads == 0
     max_query_len = max(query_lens)
     max_kv_len = max(kv_lens)
-    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
-                   (-1, -1))
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
     scale = head_size**-0.5
 
-    query = torch.randn(sum(query_lens),
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype)
-    key_cache = torch.randn(num_blocks,
-                            block_size,
-                            num_kv_heads,
-                            head_size,
-                            dtype=dtype)
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     value_cache = torch.randn_like(key_cache)
-    cu_query_lens = torch.tensor([0] + query_lens,
-                                 dtype=torch.int32).cumsum(dim=0,
-                                                           dtype=torch.int32)
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
 
-    cu_seq_lens = torch.tensor([0] + kv_lens,
-                               dtype=torch.int32).cumsum(dim=0,
-                                                         dtype=torch.int32)
+    cu_seq_lens = torch.tensor([0] + kv_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
     kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 num_blocks,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     output = torch.empty_like(query)
 
@@ -187,5 +181,7 @@ def test_varlen_with_paged_kv(
     atol, rtol = 2e-2, 2e-2
     if q_dtype is not None:
         atol, rtol = 1.5e-1, 1.5e-1
-    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    (
+        torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index c7abf652f111b08bf2af0ee4e8e5b16e998e6759..9662e73321ebee0c02d3794221f474fc3a726357 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
-from typing import Optional
 
 import pytest
 import torch
@@ -12,7 +11,7 @@ from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.platforms import current_platform
-from vllm.utils import get_max_shared_memory_bytes
+from vllm.utils.mem_utils import get_max_shared_memory_bytes
 
 if not current_platform.is_rocm():
     from xformers import ops as xops
@@ -42,9 +41,7 @@ BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
 KV_CACHE_DTYPE = ["auto", "fp8"]
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 
 def ref_masked_attention(
@@ -52,7 +49,7 @@ def ref_masked_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     scale: float,
-    attn_mask: Optional[torch.Tensor] = None,
+    attn_mask: torch.Tensor | None = None,
 ) -> torch.Tensor:
     attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
     if attn_mask is not None:
@@ -71,7 +68,7 @@ def ref_single_query_cached_kv_attention(
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
     scale: float,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
 ) -> None:
     num_query_heads = query.shape[1]
     num_kv_heads = value_cache.shape[1]
@@ -110,8 +107,7 @@ def ref_single_query_cached_kv_attention(
             # Create the ALiBi bias used in the paged attention kernel.
             position_ids = torch.arange(seq_len).int()
             alibi_bias = (position_ids - seq_len + 1).float()
-            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
-                1, 1, -1)
+            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(1, 1, -1)
 
         out = ref_masked_attention(q, keys, values, scale, alibi_bias)
         out = out.view(num_query_heads, head_size)
@@ -119,8 +115,8 @@ def ref_single_query_cached_kv_attention(
 
 
 @pytest.mark.parametrize(
-    "version",
-    ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"])
+    "version", ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]
+)
 @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -143,13 +139,18 @@ def test_paged_attention(
     seed: int,
     device: str,
 ) -> None:
-    if ((kv_cache_dtype == "fp8" and head_size % 16)
-            or (version == "rocm" and head_size not in (64, 128))):
+    if (kv_cache_dtype == "fp8" and head_size % 16) or (
+        version == "rocm" and head_size not in (64, 128)
+    ):
         pytest.skip()
 
-    if (version == "rocm" and current_platform.is_navi()
-            and (kv_cache_dtype == "fp8" or head_size != 128
-                 or block_size != 16 or use_alibi)):
+    if (
+        version == "rocm"
+        and current_platform.is_navi()
+        and (
+            kv_cache_dtype == "fp8" or head_size != 128 or block_size != 16 or use_alibi
+        )
+    ):
         pytest.skip()
 
     global PARTITION_SIZE
@@ -177,18 +178,24 @@ def test_paged_attention(
     block_tables_lst: list[list[int]] = []
     for _ in range(num_seqs):
         block_table = [
-            random.randint(0, NUM_BLOCKS - 1)
-            for _ in range(max_num_blocks_per_seq)
+            random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq)
         ]
         block_tables_lst.append(block_table)
 
     block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
 
     # Create the KV caches.
-    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
-                                                num_kv_heads, head_size,
-                                                kv_cache_dtype, dtype, seed,
-                                                device)
+    key_caches, value_caches = kv_cache_factory(
+        NUM_BLOCKS,
+        block_size,
+        1,
+        num_kv_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        seed,
+        device,
+    )
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Using default kv_scale
@@ -214,18 +221,37 @@ def test_paged_attention(
             v_scale,
         )
 
-        opcheck(torch.ops._C.paged_attention_v1,
-                (output, query, key_cache, value_cache, num_kv_heads, scale,
-                 block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
-                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
-                cond=(head_size == HEAD_SIZES[0]
-                      and block_size == BLOCK_SIZES[0]))
+        opcheck(
+            torch.ops._C.paged_attention_v1,
+            (
+                output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                block_tables,
+                seq_lens,
+                block_size,
+                max_seq_len,
+                alibi_slopes,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+                0,
+                0,
+                0,
+                64,
+                0,
+            ),
+            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
+        )
 
     elif version in ("v2", "rocm"):
         if current_platform.is_rocm() and version == "rocm":
             PARTITION_SIZE = PARTITION_SIZE_ROCM
 
-        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         assert PARTITION_SIZE % block_size == 0
         num_seqs, num_heads, head_size = output.shape
         tmp_output = torch.empty(
@@ -258,13 +284,34 @@ def test_paged_attention(
                 v_scale,
             )
 
-            opcheck(torch.ops._C.paged_attention_v2,
-                    (output, exp_sums, max_logits, tmp_output, query,
-                     key_cache, value_cache, num_kv_heads, scale, block_tables,
-                     seq_lens, block_size, max_seq_len, alibi_slopes,
-                     kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
-                    cond=(head_size == HEAD_SIZES[0]
-                          and block_size == BLOCK_SIZES[0]))
+            opcheck(
+                torch.ops._C.paged_attention_v2,
+                (
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                    0,
+                    0,
+                    0,
+                    64,
+                    0,
+                ),
+                cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
+            )
 
         else:
             ops.paged_attention_rocm(
@@ -288,13 +335,30 @@ def test_paged_attention(
                 v_scale,
             )
 
-            opcheck(torch.ops._rocm_C.paged_attention,
-                    (output, exp_sums, max_logits, tmp_output, query,
-                     key_cache, value_cache, num_kv_heads, scale, block_tables,
-                     seq_lens, None, block_size, max_seq_len, alibi_slopes,
-                     kv_cache_dtype, k_scale, v_scale),
-                    cond=(head_size == HEAD_SIZES[0]
-                          and block_size == BLOCK_SIZES[0]))
+            opcheck(
+                torch.ops._rocm_C.paged_attention,
+                (
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    None,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                ),
+                cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
+            )
 
     else:
         raise AssertionError(f"Unknown version: {version}")
@@ -303,18 +367,17 @@ def test_paged_attention(
     if kv_cache_dtype == "fp8":
         # Convert cache data back to dtype.
         x = 16 // torch.tensor([], dtype=dtype).element_size()
-        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
-                           block_size, x)
-        dequantized_key_cache = torch.empty(size=key_cache_shape,
-                                            dtype=dtype,
-                                            device=device)
+        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
+        dequantized_key_cache = torch.empty(
+            size=key_cache_shape, dtype=dtype, device=device
+        )
         ops.convert_fp8(dequantized_key_cache, key_cache)
         key_cache = dequantized_key_cache
 
         value_cache_shape = value_cache.shape
-        dequantized_value_cache = torch.empty(size=value_cache_shape,
-                                              dtype=dtype,
-                                              device=device)
+        dequantized_value_cache = torch.empty(
+            size=value_cache_shape, dtype=dtype, device=device
+        )
         ops.convert_fp8(dequantized_value_cache, value_cache)
         value_cache = dequantized_value_cache
 
@@ -351,7 +414,7 @@ def ref_multi_query_kv_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     scale: float,
-    alibi_bias: Optional[list[torch.Tensor]],
+    alibi_bias: list[torch.Tensor] | None,
     dtype: torch.dtype,
 ) -> torch.Tensor:
     num_seqs = len(cu_seq_lens) - 1
@@ -367,8 +430,9 @@ def ref_multi_query_kv_attention(
         if alibi_bias:
             attn_mask = alibi_bias[i]
         else:
-            attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
-                                   diagonal=1)
+            attn_mask = torch.triu(
+                torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1
+            )
             attn_mask = attn_mask * torch.finfo(dtype).min
             attn_mask = attn_mask.to(dtype=dtype)
 
@@ -390,8 +454,9 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 @torch.inference_mode()
 def test_multi_query_kv_attention(
     num_seqs: int,
@@ -413,13 +478,11 @@ def test_multi_query_kv_attention(
 
     scale = float(1.0 / (head_size**0.5))
     num_query_heads, num_kv_heads = num_heads
-    qkv = torch.empty(num_tokens,
-                      num_query_heads + 2 * num_kv_heads,
-                      head_size,
-                      dtype=dtype)
+    qkv = torch.empty(
+        num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype
+    )
     qkv.uniform_(-scale, scale)
-    query, key, value = qkv.split(
-        [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
+    query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1)
 
     num_queries_per_kv = num_query_heads // num_kv_heads
     if num_queries_per_kv > 1:
@@ -429,8 +492,7 @@ def test_multi_query_kv_attention(
     alibi_bias = None
     if use_alibi:
         alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
-        attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
-                                    seq_lens)
+        attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
         output = torch.empty_like(query)
         start = 0
         # Dynamic sequence length not supported with custom attn_bias.
@@ -442,7 +504,8 @@ def test_multi_query_kv_attention(
                 value[None, start:end],
                 attn_bias=attn_bias[i],
                 p=0.0,
-                scale=scale)
+                scale=scale,
+            )
             output[start:end].copy_(out.view_as(query[start:end]))
             start += seq_len
         # xformers.AttentionBias to Tensor for use in reference impl.
@@ -485,8 +548,9 @@ def test_multi_query_kv_attention(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 @torch.inference_mode()
 def test_multi_query_kv_attention_with_alibi(
     num_seqs: int,
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 5b7969f1dd455fb7061fb726f0a7698d83c9ca0a..3b8e939300a273ab5e81a88135c902ba2dc8f4c1 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -15,16 +15,18 @@ from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 @pytest.fixture(autouse=True)
 def clear_cache():
-    """Clear lru cache to ensure each test case runs without caching.
-    """
+    """Clear lru cache to ensure each test case runs without caching."""
     _cached_get_attn_backend.cache_clear()
 
 
 # Define MLA and non-MLA backends separately
 DEVICE_MLA_BACKENDS = {
     "cuda": [
-        "TRITON_MLA", "FLASHMLA", "FLASHINFER_MLA", "FLASH_ATTN_MLA",
-        "CUTLASS_MLA"
+        "TRITON_MLA",
+        "FLASHMLA",
+        "FLASHINFER_MLA",
+        "FLASH_ATTN_MLA",
+        "CUTLASS_MLA",
     ],
     "hip": ["TRITON_MLA", "ROCM_AITER_MLA"],
     "cpu": [],
@@ -32,14 +34,15 @@ DEVICE_MLA_BACKENDS = {
 
 DEVICE_REGULAR_ATTN_BACKENDS = {
     "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
-    "hip": ["ROCM_FLASH"],
-    "cpu": ["TORCH_SDPA"],
+    "hip": ["ROCM_ATTN"],
+    "cpu": ["CPU_ATTN"],
 }
 
 DEVICE_MLA_BLOCK_SIZES = {
     "cuda": [16, 64],  # CUDA supports both standard and extended block sizes
     "hip": [16, 1],  # HIP requires special handling for block_size=1
-    "cpu": [16]  # CPU uses fixed block size from test cases
+    # "cpu": [16]  # CPU uses fixed block size from test cases
+    "cpu": [],  # FIXME(woosuk): Temporarily disable CPU tests
 }
 
 
@@ -47,12 +50,13 @@ def generate_params():
     params = []
     for use_mla in [True, False]:
         for device in ["cuda", "hip", "cpu"]:
-            backends = DEVICE_MLA_BACKENDS[
-                device] if use_mla else DEVICE_REGULAR_ATTN_BACKENDS[device]
+            backends = (
+                DEVICE_MLA_BACKENDS[device]
+                if use_mla
+                else DEVICE_REGULAR_ATTN_BACKENDS[device]
+            )
             for name in backends:
-                block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [
-                    16
-                ]
+                block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [16]
                 for block_size in block_sizes:
                     params.append(
                         pytest.param(
@@ -60,14 +64,13 @@ def generate_params():
                             name,
                             use_mla,
                             block_size,
-                            id=
-                            f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}"
-                        ))
+                            id=f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}",
+                        )
+                    )
     return params
 
 
-@pytest.mark.parametrize("device, name, use_mla, block_size",
-                         generate_params())
+@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
 def test_env(
     device: str,
     name: str,
@@ -77,19 +80,16 @@ def test_env(
 ):
     """Test attention backend selection with valid device-backend pairs."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, name)
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
 
         if device == "cpu":
-            with patch("vllm.attention.selector.current_platform",
-                       CpuPlatform()):
+            with patch("vllm.platforms.current_platform", CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, None, block_size)
-            assert backend.get_name() == "TORCH_SDPA"
+            assert backend.get_name() == "CPU_ATTN"
 
         elif device == "hip":
-            with patch("vllm.attention.selector.current_platform",
-                       RocmPlatform()):
+            with patch("vllm.platforms.current_platform", RocmPlatform()):
                 if use_mla:
                     # ROCm MLA backend logic:
                     # - TRITON_MLA: supported when block_size != 1
@@ -100,50 +100,33 @@ def test_env(
                     if name == "TRITON_MLA" and block_size == 1:
                         # TRITON_MLA doesn't support block_size == 1
                         with pytest.raises(ValueError) as exc_info:
-                            get_attn_backend(16,
-                                             torch.float16,
-                                             None,
-                                             block_size,
-                                             use_mla=use_mla)
-                        assert f"The selected backend, {name}" in str(
-                            exc_info.value)
-                    elif name == "ROCM_AITER_MLA" and block_size != 1:
-                        # ROCM_AITER_MLA only supports block_size == 1
-                        with pytest.raises(ValueError) as exc_info:
-                            get_attn_backend(16,
-                                             torch.float16,
-                                             None,
-                                             block_size,
-                                             use_mla=use_mla)
-                        assert f"The selected backend, {name}" in str(
-                            exc_info.value)
+                            get_attn_backend(
+                                16, torch.float16, None, block_size, use_mla=use_mla
+                            )
+                        assert f"The selected backend, {name}" in str(exc_info.value)
                     else:
                         # Valid backend-block_size combination
-                        backend = get_attn_backend(16,
-                                                   torch.float16,
-                                                   None,
-                                                   block_size,
-                                                   use_mla=use_mla)
+                        backend = get_attn_backend(
+                            16, torch.float16, None, block_size, use_mla=use_mla
+                        )
                         expected = name
                         assert backend.get_name() == expected
                 else:
-                    backend = get_attn_backend(16,
-                                               torch.float16,
-                                               None,
-                                               block_size,
-                                               use_mla=use_mla)
-                    expected = "TRITON_ATTN"
+                    backend = get_attn_backend(
+                        16, torch.float16, None, block_size, use_mla=use_mla
+                    )
+                    expected = "ROCM_ATTN"
                     assert backend.get_name() == expected
 
         elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform",
-                       CudaPlatform()):
+            with patch("vllm.platforms.current_platform", CudaPlatform()):
+                capability = torch.cuda.get_device_capability()
                 if use_mla:
                     # CUDA MLA backend logic:
                     # - CUTLASS_MLA: only supported with block_size == 128
-                    #   and Blackwell GPUs (SM 10.0), V1 only
+                    #   and Blackwell GPUs (SM 10.x), V1 only
                     # - FLASHINFER_MLA: only supported on Blackwell GPUs
-                    #   (SM 10.0+), V1 only
+                    #   (SM 10.x), V1 only
                     # - FLASHMLA: only supported with block_size == 64
                     # - FLASH_ATTN_MLA: V1 only
                     # - TRITON_MLA: fallback for other cases
@@ -151,129 +134,116 @@ def test_env(
                     if name == "CUTLASS_MLA":
                         if block_size != 128:
                             # CUTLASS_MLA only supports block_size == 128
-                            pytest.skip(
-                                "CUTLASS_MLA only supports block_size 128")
-                        else:
-                            backend = get_attn_backend(16,
-                                                       torch.float16,
-                                                       None,
-                                                       block_size,
-                                                       use_mla=use_mla)
-                            expected = "CUTLASS_MLA"
-                            assert backend.get_name() == expected
+                            pytest.skip("CUTLASS_MLA only supports block_size 128")
+                        if capability[0] != 10:
+                            pytest.skip("CUTLASS MLA is not supported on this platform")
+                        backend = get_attn_backend(
+                            576, torch.float16, None, block_size, use_mla=use_mla
+                        )
+                        expected = "CUTLASS_MLA"
+                        assert backend.get_name() == expected
                     elif name == "FLASHINFER_MLA":
+                        if capability[0] != 10:
+                            pytest.skip(
+                                "FlashInfer MLA is not supported on this platform"
+                            )
                         if block_size not in [32, 64]:
                             # FlashInfer MLA only supports block_size 32 or 64
                             pytest.skip(
-                                "FlashInfer MLA only supports block_size 32 "
-                                "or 64")
-                        else:
-                            backend = get_attn_backend(16,
-                                                       torch.float16,
-                                                       None,
-                                                       block_size,
-                                                       use_mla=use_mla)
-                            expected = "FLASHINFER_MLA"
-                            assert backend.get_name() == expected
+                                "FlashInfer MLA only supports block_size 32 or 64"
+                            )
+                        backend = get_attn_backend(
+                            576, torch.float16, None, block_size, use_mla=use_mla
+                        )
+                        expected = "FLASHINFER_MLA"
+                        assert backend.get_name() == expected
                     elif name == "FLASHMLA":
                         if block_size != 64:
                             # FlashMLA only supports block_size == 64
                             pytest.skip("FlashMLA only supports block_size 64")
-                        else:
-                            from vllm.v1.attention.backends.mla.flashmla import (  # noqa: E501
-                                is_flashmla_supported)
-                            is_supported, _ = is_flashmla_supported()
-                            if not is_supported:
-                                pytest.skip(
-                                    "FlashMLA not supported on this platform")
-                            else:
-                                backend = get_attn_backend(16,
-                                                           torch.float16,
-                                                           None,
-                                                           block_size,
-                                                           use_mla=use_mla)
-                                expected = name
-                                assert backend.get_name() == expected
+                        from vllm.v1.attention.backends.mla.flashmla import (
+                            is_flashmla_dense_supported,
+                        )
+
+                        is_supported, _ = is_flashmla_dense_supported()
+                        if not is_supported:
+                            pytest.skip("FlashMLA not supported on this platform")
+                        backend = get_attn_backend(
+                            576,
+                            torch.float16,
+                            None,
+                            block_size,
+                            use_mla=use_mla,
+                        )
+                        expected = name
+                        assert backend.get_name() == expected
                     elif name == "FLASH_ATTN_MLA":
-                        backend = get_attn_backend(16,
-                                                   torch.float16,
-                                                   None,
-                                                   block_size,
-                                                   use_mla=use_mla)
+                        from vllm.attention.utils.fa_utils import (
+                            flash_attn_supports_mla,
+                        )
+
+                        if not flash_attn_supports_mla():
+                            pytest.skip(
+                                "FlashAttention MLA not supported on this platform"
+                            )
+                        backend = get_attn_backend(
+                            576, torch.float16, None, block_size, use_mla=use_mla
+                        )
                         expected = "FLASH_ATTN_MLA"
                         assert backend.get_name() == expected
                     else:
                         # TRITON_MLA or other fallback
-                        backend = get_attn_backend(16,
-                                                   torch.float16,
-                                                   None,
-                                                   block_size,
-                                                   use_mla=use_mla)
+                        backend = get_attn_backend(
+                            576, torch.float16, None, block_size, use_mla=use_mla
+                        )
                         expected = "TRITON_MLA"
                         assert backend.get_name() == expected
                 elif name == "FLASHINFER":
-                    backend = get_attn_backend(16,
-                                               torch.float16,
-                                               None,
-                                               block_size,
-                                               use_mla=use_mla)
+                    backend = get_attn_backend(
+                        64, torch.float16, None, block_size, use_mla=use_mla
+                    )
                     expected = "FLASHINFER"
                     assert backend.get_name() == expected
                 elif name == "XFORMERS":
-                    backend = get_attn_backend(32,
-                                               torch.float16,
-                                               None,
-                                               block_size,
-                                               use_mla=use_mla)
+                    backend = get_attn_backend(
+                        32, torch.float16, None, block_size, use_mla=use_mla
+                    )
                     expected = "XFORMERS"
                     assert backend.get_name() == expected
                 elif name == "FLASH_ATTN":
-                    backend = get_attn_backend(32,
-                                               torch.float16,
-                                               None,
-                                               block_size,
-                                               use_mla=use_mla)
+                    backend = get_attn_backend(
+                        32, torch.float16, None, block_size, use_mla=use_mla
+                    )
                     expected = "FLASH_ATTN"
                     assert backend.get_name() == expected
 
 
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
-def test_fp32_fallback(
-    device: str,
-    monkeypatch: pytest.MonkeyPatch,
-):
+def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    if device == "cpu":
+        with patch("vllm.platforms.current_platform", CpuPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "CPU_ATTN"
 
-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform",
-                       CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "TORCH_SDPA"
-
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform",
-                       CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "FLEX_ATTENTION"
+    elif device == "cuda":
+        with patch("vllm.platforms.current_platform", CudaPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "FLEX_ATTENTION"
 
 
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
-    # TODO: When testing for v1, pipe in `use_v1` as an argument to
-    # get_attn_backend
-
-    pytest.skip("Skipping as current backend selector does not " \
-                "handle fallbacks when a backend is set via env var.")
+    pytest.skip(
+        "Skipping as current backend selector does not "
+        "handle fallbacks when a backend is set via env var."
+    )
 
     with monkeypatch.context() as m:
         m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
 
         # Unsupported CUDA arch
-        monkeypatch.setattr(torch.cuda,
-                            "get_device_capability",
-                            lambda _=None: (7, 5))
+        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
         backend = get_attn_backend(16, torch.float16, None, 16)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
@@ -294,17 +264,17 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
 
         # flash-attn is not installed
         import sys
-        original_module = sys.modules.get('vllm_flash_attn')
-        monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
+
+        original_module = sys.modules.get("vllm_flash_attn")
+        monkeypatch.setitem(sys.modules, "vllm_flash_attn", None)
         backend = get_attn_backend(16, torch.float16, None, 16)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
         # Restore the original module if it existed
         if original_module is not None:
-            monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
-                                original_module)
+            monkeypatch.setitem(sys.modules, "vllm_flash_attn", original_module)
         else:
-            monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
+            monkeypatch.delitem(sys.modules, "vllm_flash_attn", raising=False)
 
         # Unsupported head size
         backend = get_attn_backend(17, torch.float16, None, 16)
@@ -313,9 +283,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
 
 def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
     """Test that invalid attention backend names raise ValueError."""
-    with monkeypatch.context() as m, patch(
-            "vllm.attention.selector.current_platform", CudaPlatform()):
-        m.setenv("VLLM_USE_V1", "1")
+    with (
+        monkeypatch.context() as m,
+        patch("vllm.platforms.current_platform", CudaPlatform()),
+    ):
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
 
         # Should raise ValueError for invalid backend
diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py
index 6e096a4c3999e109f219af816a5e52e8d7efc8c3..f33a27d1fd85ac5881baf3abd533f6ec64e1fb4f 100644
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -10,7 +10,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 
-COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
+COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
 DTYPES = [torch.bfloat16, torch.float]
 NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
@@ -32,9 +32,7 @@ NUM_BLOCKS = [1024, 10000]
 
 NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 # We assume fp8 is always enabled for testing.
 KV_CACHE_DTYPE = ["auto", "fp8"]
@@ -85,24 +83,33 @@ def test_copy_blocks(
         block_mapping.append((src, dst2))
 
     # Create the KV caches.
-    key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
-                                                num_layers, num_heads,
-                                                head_size, kv_cache_dtype,
-                                                dtype, seed, device)
+    key_caches, value_caches = kv_cache_factory(
+        num_blocks,
+        block_size,
+        num_layers,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        seed,
+        device,
+    )
 
     # Clone the KV caches.
     cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
     cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
 
     # Call the copy blocks kernel.
-    block_mapping_tensor = torch.tensor(block_mapping,
-                                        dtype=torch.int64,
-                                        device=device).view(-1, 2)
-
-    opcheck(torch.ops._C_cache_ops.copy_blocks,
-            (key_caches, value_caches, block_mapping_tensor),
-            test_utils=DEFAULT_OPCHECK_TEST_UTILS,
-            cond=(head_size == HEAD_SIZES[0]))
+    block_mapping_tensor = torch.tensor(
+        block_mapping, dtype=torch.int64, device=device
+    ).view(-1, 2)
+
+    opcheck(
+        torch.ops._C_cache_ops.copy_blocks,
+        (key_caches, value_caches, block_mapping_tensor),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+        cond=(head_size == HEAD_SIZES[0]),
+    )
     ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
 
     # Run the reference implementation.
@@ -115,8 +122,7 @@ def test_copy_blocks(
     # Compare the results.
     for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
         torch.testing.assert_close(key_cache, cloned_key_cache)
-    for value_cache, cloned_value_cache in zip(value_caches,
-                                               cloned_value_caches):
+    for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
         torch.testing.assert_close(value_cache, cloned_value_cache)
 
 
@@ -155,10 +161,17 @@ def test_reshape_and_cache(
     _, key, value = qkv.unbind(dim=1)
 
     # Create the KV caches.
-    key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
-                                                num_heads, head_size,
-                                                kv_cache_dtype, dtype, seed,
-                                                device)
+    key_caches, value_caches = kv_cache_factory(
+        num_blocks,
+        block_size,
+        1,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        seed,
+        device,
+    )
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Using default kv_scale
@@ -176,12 +189,30 @@ def test_reshape_and_cache(
         cloned_value_cache = value_cache.clone()
 
     # Call the reshape_and_cache kernel.
-    opcheck(torch.ops._C_cache_ops.reshape_and_cache,
-            (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
-             k_scale, v_scale),
-            cond=(head_size == HEAD_SIZES[0]))
-    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
-                          kv_cache_dtype, k_scale, v_scale)
+    opcheck(
+        torch.ops._C_cache_ops.reshape_and_cache,
+        (
+            key,
+            value,
+            key_cache,
+            value_cache,
+            slot_mapping,
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        ),
+        cond=(head_size == HEAD_SIZES[0]),
+    )
+    ops.reshape_and_cache(
+        key,
+        value,
+        key_cache,
+        value_cache,
+        slot_mapping,
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
 
     if kv_cache_dtype == "fp8":
         result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
@@ -202,14 +233,12 @@ def test_reshape_and_cache(
         cloned_value_cache[block_idx, :, :, block_offset] = value[i]
 
     if kv_cache_dtype == "fp8":
-        torch.testing.assert_close(result_key_cache,
-                                   cloned_key_cache,
-                                   atol=0.001,
-                                   rtol=0.1)
-        torch.testing.assert_close(result_value_cache,
-                                   cloned_value_cache,
-                                   atol=0.001,
-                                   rtol=0.1)
+        torch.testing.assert_close(
+            result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1
+        )
+        torch.testing.assert_close(
+            result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1
+        )
     else:
         torch.testing.assert_close(key_cache, cloned_key_cache)
         torch.testing.assert_close(value_cache, cloned_value_cache)
@@ -254,15 +283,8 @@ def test_reshape_and_cache_flash(
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
     slot_mapping_lst = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping_lst,
-                                dtype=torch.long,
-                                device=device)
-    qkv = torch.randn(num_tokens,
-                      3,
-                      num_heads,
-                      head_size,
-                      dtype=dtype,
-                      device=device)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+    qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype, device=device)
     _, key, value = qkv.unbind(dim=1)
 
     # Create the KV caches.
@@ -293,48 +315,73 @@ def test_reshape_and_cache_flash(
 
     # Clone the KV caches.
     if kv_cache_dtype == "fp8":
-        cloned_key_cache = torch.empty_like(key_cache_compact,
-                                            dtype=torch.float16)
-        ops.convert_fp8(cloned_key_cache, key_cache_compact, k_scale.item(),
-                        kv_cache_dtype)
-        cloned_value_cache = torch.empty_like(value_cache_compact,
-                                              dtype=torch.float16)
-        ops.convert_fp8(cloned_value_cache, value_cache_compact,
-                        v_scale.item(), kv_cache_dtype)
+        cloned_key_cache = torch.empty_like(key_cache_compact, dtype=torch.float16)
+        ops.convert_fp8(
+            cloned_key_cache, key_cache_compact, k_scale.item(), kv_cache_dtype
+        )
+        cloned_value_cache = torch.empty_like(value_cache_compact, dtype=torch.float16)
+        ops.convert_fp8(
+            cloned_value_cache, value_cache_compact, v_scale.item(), kv_cache_dtype
+        )
     else:
         cloned_key_cache = key_cache_compact.clone()
         cloned_value_cache = value_cache_compact.clone()
     # Call the reshape_and_cache kernel.
     if implementation == "cuda":
-        opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
-                (key, value, key_cache, value_cache, slot_mapping,
-                 kv_cache_dtype, k_scale, v_scale),
-                cond=(head_size == HEAD_SIZES[0]))
-        ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
-                                    slot_mapping, kv_cache_dtype, k_scale,
-                                    v_scale)
+        opcheck(
+            torch.ops._C_cache_ops.reshape_and_cache_flash,
+            (
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            ),
+            cond=(head_size == HEAD_SIZES[0]),
+        )
+        ops.reshape_and_cache_flash(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            slot_mapping,
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
     elif implementation == "triton":
         from vllm.attention.ops.triton_reshape_and_cache_flash import (
-            triton_reshape_and_cache_flash)
-        triton_reshape_and_cache_flash(key, value, key_cache, value_cache,
-                                       slot_mapping, kv_cache_dtype, k_scale,
-                                       v_scale)
+            triton_reshape_and_cache_flash,
+        )
+
+        triton_reshape_and_cache_flash(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            slot_mapping,
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
     key_cache_compact = permute_and_compact(key_cache)
     value_cache_compact = permute_and_compact(value_cache)
 
     if kv_cache_dtype == "fp8":
-        result_key_cache = torch.empty_like(key_cache_compact,
-                                            dtype=torch.float16)
-        ops.convert_fp8(result_key_cache,
-                        key_cache_compact,
-                        k_scale.item(),
-                        kv_dtype=kv_cache_dtype)
-        result_value_cache = torch.empty_like(value_cache_compact,
-                                              dtype=torch.float16)
-        ops.convert_fp8(result_value_cache,
-                        value_cache_compact,
-                        v_scale.item(),
-                        kv_dtype=kv_cache_dtype)
+        result_key_cache = torch.empty_like(key_cache_compact, dtype=torch.float16)
+        ops.convert_fp8(
+            result_key_cache, key_cache_compact, k_scale.item(), kv_dtype=kv_cache_dtype
+        )
+        result_value_cache = torch.empty_like(value_cache_compact, dtype=torch.float16)
+        ops.convert_fp8(
+            result_value_cache,
+            value_cache_compact,
+            v_scale.item(),
+            kv_dtype=kv_cache_dtype,
+        )
 
     # Run the reference implementation.
     block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
@@ -352,14 +399,12 @@ def test_reshape_and_cache_flash(
             cloned_value_cache[block_idx, :, block_offset, :] = value[i]
 
     if kv_cache_dtype == "fp8":
-        torch.testing.assert_close(result_key_cache,
-                                   cloned_key_cache,
-                                   atol=0.001,
-                                   rtol=0.1)
-        torch.testing.assert_close(result_value_cache,
-                                   cloned_value_cache,
-                                   atol=0.001,
-                                   rtol=0.1)
+        torch.testing.assert_close(
+            result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1
+        )
+        torch.testing.assert_close(
+            result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1
+        )
     else:
         torch.testing.assert_close(key_cache_compact, cloned_key_cache)
         torch.testing.assert_close(value_cache_compact, cloned_value_cache)
@@ -396,8 +441,8 @@ def test_swap_blocks(
 
     current_platform.seed_everything(seed)
 
-    src_device = device if direction[0] == "cuda" else 'cpu'
-    dst_device = device if direction[1] == "cuda" else 'cpu'
+    src_device = device if direction[0] == "cuda" else "cpu"
+    dst_device = device if direction[1] == "cuda" else "cpu"
 
     src_blocks = random.sample(range(num_blocks), num_mappings)
     # For the same device, mapping must not overlap
@@ -408,42 +453,62 @@ def test_swap_blocks(
         dst_blocks = random.sample(range(num_blocks), num_mappings)
 
     block_mapping = list(zip(src_blocks, dst_blocks))
-    block_mapping_tensor = torch.tensor(block_mapping,
-                                        dtype=torch.int64,
-                                        device="cpu").view(-1, 2)
+    block_mapping_tensor = torch.tensor(
+        block_mapping, dtype=torch.int64, device="cpu"
+    ).view(-1, 2)
 
     # Create the KV caches on the first device.
     src_key_caches, src_value_caches = kv_cache_factory(
-        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
-        seed, src_device)
+        num_blocks,
+        block_size,
+        1,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        seed,
+        src_device,
+    )
 
     # Create the KV caches on the second device.
     dist_key_caches, dist_value_caches = kv_cache_factory(
-        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
-        seed, dst_device)
+        num_blocks,
+        block_size,
+        1,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        seed,
+        dst_device,
+    )
 
     src_key_caches_clone = src_key_caches[0].clone()
     src_value_caches_clone = src_value_caches[0].clone()
 
     # Call the swap_blocks kernel.
-    do_opcheck = (head_size == HEAD_SIZES[0])
-    opcheck(torch.ops._C_cache_ops.swap_blocks,
-            (src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
-            cond=do_opcheck)
-    opcheck(torch.ops._C_cache_ops.swap_blocks,
-            (src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
-            cond=do_opcheck)
-
-    ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
-                    block_mapping_tensor)
-    ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
-                    block_mapping_tensor)
+    do_opcheck = head_size == HEAD_SIZES[0]
+    opcheck(
+        torch.ops._C_cache_ops.swap_blocks,
+        (src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
+        cond=do_opcheck,
+    )
+    opcheck(
+        torch.ops._C_cache_ops.swap_blocks,
+        (src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
+        cond=do_opcheck,
+    )
+
+    ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping_tensor)
+    ops.swap_blocks(src_value_caches[0], dist_value_caches[0], block_mapping_tensor)
 
     for src, dst in block_mapping:
-        torch.testing.assert_close(src_key_caches_clone[src].cpu(),
-                                   dist_key_caches[0][dst].cpu())
-        torch.testing.assert_close(src_value_caches_clone[src].cpu(),
-                                   dist_value_caches[0][dst].cpu())
+        torch.testing.assert_close(
+            src_key_caches_clone[src].cpu(), dist_key_caches[0][dst].cpu()
+        )
+        torch.testing.assert_close(
+            src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu()
+        )
 
 
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -489,11 +554,9 @@ def _create_mla_cache(
     device: str,
 ) -> torch.Tensor:
     cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype
-    return torch.zeros(num_blocks,
-                       block_size,
-                       entry_size,
-                       dtype=cache_dtype,
-                       device=device)
+    return torch.zeros(
+        num_blocks, block_size, entry_size, dtype=cache_dtype, device=device
+    )
 
 
 def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str):
@@ -533,20 +596,16 @@ def test_concat_and_cache_mla(
 
     total_slots = num_blocks * block_size
     slot_mapping_lst = random.sample(range(total_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping_lst,
-                                dtype=torch.long,
-                                device=device)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
 
     kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
-    k_pe = torch.randn(num_tokens,
-                       qk_rope_head_dim,
-                       dtype=dtype,
-                       device=device)
+    k_pe = torch.randn(num_tokens, qk_rope_head_dim, dtype=dtype, device=device)
     entry_size = kv_lora_rank + qk_rope_head_dim
 
     scale = torch.tensor(0.1, dtype=torch.float32, device=device)
-    kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                 kv_cache_dtype, device)
+    kv_cache = _create_mla_cache(
+        num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+    )
     ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
 
     for i in range(num_tokens):
@@ -558,10 +617,7 @@ def test_concat_and_cache_mla(
 
     if kv_cache_dtype == "fp8":
         ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
-        ops.convert_fp8(ref_kv_cache,
-                        ref_temp,
-                        scale.item(),
-                        kv_dtype=kv_cache_dtype)
+        ops.convert_fp8(ref_kv_cache, ref_temp, scale.item(), kv_dtype=kv_cache_dtype)
     else:
         ref_kv_cache = ref_temp
 
@@ -571,24 +627,18 @@ def test_concat_and_cache_mla(
         test_utils=DEFAULT_OPCHECK_TEST_UTILS,
     )
 
-    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
-                             kv_cache_dtype, scale)
+    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale)
 
     if kv_cache_dtype == "fp8":
         result_temp = torch.empty_like(kv_cache, dtype=torch.float16)
-        ops.convert_fp8(result_temp,
-                        kv_cache.contiguous(),
-                        scale.item(),
-                        kv_dtype=kv_cache_dtype)
+        ops.convert_fp8(
+            result_temp, kv_cache.contiguous(), scale.item(), kv_dtype=kv_cache_dtype
+        )
         expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16)
-        ops.convert_fp8(expected_temp,
-                        ref_kv_cache,
-                        scale.item(),
-                        kv_dtype=kv_cache_dtype)
-        torch.testing.assert_close(result_temp,
-                                   expected_temp,
-                                   atol=0.001,
-                                   rtol=0.1)
+        ops.convert_fp8(
+            expected_temp, ref_kv_cache, scale.item(), kv_dtype=kv_cache_dtype
+        )
+        torch.testing.assert_close(result_temp, expected_temp, atol=0.001, rtol=0.1)
     else:
         torch.testing.assert_close(kv_cache, ref_kv_cache)
 
@@ -620,24 +670,21 @@ def test_concat_and_cache_ds_mla(
 
     total_slots = num_blocks * block_size
     slot_mapping_lst = random.sample(range(total_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping_lst,
-                                dtype=torch.long,
-                                device=device)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
 
     kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
-    k_pe = torch.randn(num_tokens,
-                       qk_rope_head_dim,
-                       dtype=dtype,
-                       device=device)
+    k_pe = torch.randn(num_tokens, qk_rope_head_dim, dtype=dtype, device=device)
     entry_size = kv_lora_rank + (4 * 4) + (2 * qk_rope_head_dim)
 
     scale = torch.tensor(1.0, dtype=torch.float32, device=device)
-    kv_cache = _create_mla_cache(num_blocks,
-                                 block_size,
-                                 entry_size,
-                                 dtype=torch.uint8,
-                                 kv_cache_dtype=kv_cache_dtype,
-                                 device=device)
+    kv_cache = _create_mla_cache(
+        num_blocks,
+        block_size,
+        entry_size,
+        dtype=torch.uint8,
+        kv_cache_dtype=kv_cache_dtype,
+        device=device,
+    )
 
     ref_cache = torch.zeros_like(kv_cache, dtype=kv_cache.dtype)
     tile_data = torch.zeros(128, dtype=dtype, device=device)
@@ -664,14 +711,16 @@ def test_concat_and_cache_ds_mla(
             manual_max = abs(tile_data_float[0])
             for j in range(1, 128):
                 manual_max = max(manual_max, abs(tile_data_float[j]))
-            tile_scale = manual_max / 448.
+            tile_scale = manual_max / 448.0
 
             ref_cache_32bit[kv_lora_rank // 4 + tile_idx] = tile_scale
 
-            ops.convert_fp8(ref_cache_slice[tile_start:tile_end],
-                            tile_data,
-                            tile_scale.item(),
-                            kv_dtype="fp8")
+            ops.convert_fp8(
+                ref_cache_slice[tile_start:tile_end],
+                tile_data,
+                tile_scale.item(),
+                kv_dtype="fp8",
+            )
 
         for j in range(qk_rope_head_dim):
             ref_cache_16bit[kv_lora_rank // 2 + 8 + j] = k_pe[i, j]
@@ -682,8 +731,7 @@ def test_concat_and_cache_ds_mla(
         test_utils=DEFAULT_OPCHECK_TEST_UTILS,
     )
 
-    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
-                             kv_cache_dtype, scale)
+    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale)
 
     for i in range(num_tokens):
         slot = slot_mapping[i].item()
@@ -694,12 +742,14 @@ def test_concat_and_cache_ds_mla(
 
         kv_nope = kv_cache_slice[:kv_lora_rank]
         ref_nope = ref_cache_slice[:kv_lora_rank]
-        kv_scales = kv_cache_slice.view(torch.float32)[kv_lora_rank //
-                                                       4:kv_lora_rank // 4 + 4]
-        ref_scales = ref_cache_slice.view(
-            torch.float32)[kv_lora_rank // 4:kv_lora_rank // 4 + 4]
-        kv_rope = kv_cache_slice.view(dtype)[kv_lora_rank // 2 + 8:]
-        ref_rope = ref_cache_slice.view(dtype)[kv_lora_rank // 2 + 8:]
+        kv_scales = kv_cache_slice.view(torch.float32)[
+            kv_lora_rank // 4 : kv_lora_rank // 4 + 4
+        ]
+        ref_scales = ref_cache_slice.view(torch.float32)[
+            kv_lora_rank // 4 : kv_lora_rank // 4 + 4
+        ]
+        kv_rope = kv_cache_slice.view(dtype)[kv_lora_rank // 2 + 8 :]
+        ref_rope = ref_cache_slice.view(dtype)[kv_lora_rank // 2 + 8 :]
 
         torch.testing.assert_close(kv_nope, ref_nope, atol=0.001, rtol=0.1)
         torch.testing.assert_close(kv_scales, ref_scales, atol=0.001, rtol=0.1)
@@ -734,8 +784,9 @@ def test_copy_blocks_mla(
 
     kv_caches = []
     for _ in range(num_layers):
-        kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                     kv_cache_dtype, device)
+        kv_cache = _create_mla_cache(
+            num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+        )
         _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
         kv_caches.append(kv_cache)
 
@@ -752,9 +803,9 @@ def test_copy_blocks_mla(
         dst2 = dst_blocks[2 * i + 1]
         block_mapping.append((src, dst1))
         block_mapping.append((src, dst2))
-    block_mapping_tensor = torch.tensor(block_mapping,
-                                        dtype=torch.int64,
-                                        device=device).view(-1, 2)
+    block_mapping_tensor = torch.tensor(
+        block_mapping, dtype=torch.int64, device=device
+    ).view(-1, 2)
 
     for src, dst in block_mapping:
         for ref_cache in ref_caches:
@@ -795,10 +846,12 @@ def test_swap_blocks_mla(
 
     entry_size = kv_lora_rank + qk_rope_head_dim
 
-    src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                  kv_cache_dtype, device)
-    dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                  kv_cache_dtype, device)
+    src_cache = _create_mla_cache(
+        num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+    )
+    dst_cache = _create_mla_cache(
+        num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+    )
 
     _fill_mla_cache(src_cache, kv_cache_dtype)
     _fill_mla_cache(dst_cache, kv_cache_dtype)
@@ -810,9 +863,9 @@ def test_swap_blocks_mla(
     remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
     dst_blocks = random.sample(remaining_blocks, num_mappings)
     block_mapping = list(zip(src_blocks, dst_blocks))
-    block_mapping_tensor = torch.tensor(block_mapping,
-                                        dtype=torch.int64,
-                                        device="cpu").view(-1, 2)
+    block_mapping_tensor = torch.tensor(
+        block_mapping, dtype=torch.int64, device="cpu"
+    ).view(-1, 2)
 
     opcheck(
         torch.ops._C_cache_ops.swap_blocks,
@@ -827,7 +880,8 @@ def test_swap_blocks_mla(
             src_cache_clone[src].cpu(),
             dst_cache[dst].cpu(),
             msg=f"Block {src} from src should have been swapped to block "
-            f"{dst} in dst_cache.")
+            f"{dst} in dst_cache.",
+        )
 
 
 @pytest.mark.parametrize("kv_lora_rank", [512])
@@ -840,32 +894,36 @@ def test_swap_blocks_mla(
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim,
-                                            block_size, num_blocks,
-                                            max_seq_len, batch_size, dtype,
-                                            kv_cache_dtype, device):
+def test_gather_and_maybe_dequant_cache_mla(
+    kv_lora_rank,
+    qk_rope_head_dim,
+    block_size,
+    num_blocks,
+    max_seq_len,
+    batch_size,
+    dtype,
+    kv_cache_dtype,
+    device,
+):
     entry_size = kv_lora_rank + qk_rope_head_dim
     scale = torch.tensor(0.1, dtype=torch.float32, device=device)
-    src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                  kv_cache_dtype, device)
+    src_cache = _create_mla_cache(
+        num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+    )
     _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)
 
-    seq_len_tensor = torch.randint(0,
-                                   max_seq_len + 1, (batch_size, ),
-                                   device=device)
+    seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device)
 
     total_tokens = seq_len_tensor.sum()
-    cu_seq_lens = torch.empty((batch_size + 1),
-                              dtype=torch.int32,
-                              device=device)
+    cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device)
     cu_seq_lens[0] = 0
     cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32)
     print("seq_len_tensor", seq_len_tensor)
 
     tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size
-    block_table = torch.empty((batch_size, num_blocks),
-                              dtype=torch.int32,
-                              device=device)
+    block_table = torch.empty(
+        (batch_size, num_blocks), dtype=torch.int32, device=device
+    )
 
     for b in range(batch_size):
         perm = torch.randperm(num_blocks, device=device)
@@ -893,10 +951,8 @@ def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim,
         remaining = s - (tot - 1) * block_size
         last_block_data = src_cache[blocks[-1], :remaining, :]
         if kv_cache_dtype == "fp8":
-            dequantized_last_block = torch.empty_like(last_block_data,
-                                                      dtype=dtype)
-            ops.convert_fp8(dequantized_last_block, last_block_data,
-                            scale.item())
+            dequantized_last_block = torch.empty_like(last_block_data, dtype=dtype)
+            ops.convert_fp8(dequantized_last_block, last_block_data, scale.item())
             gathered_rows.append(dequantized_last_block)
         else:
             gathered_rows.append(last_block_data)
@@ -907,14 +963,29 @@ def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim,
 
     opcheck(
         torch.ops._C_cache_ops.gather_and_maybe_dequant_cache,
-        (src_cache, dst, block_table, cu_seq_lens, batch_size, kv_cache_dtype,
-         scale, None),
+        (
+            src_cache,
+            dst,
+            block_table,
+            cu_seq_lens,
+            batch_size,
+            kv_cache_dtype,
+            scale,
+            None,
+        ),
         test_utils=DEFAULT_OPCHECK_TEST_UTILS,
     )
 
-    ops.gather_and_maybe_dequant_cache(src_cache, dst, block_table,
-                                       cu_seq_lens, batch_size, kv_cache_dtype,
-                                       scale, None)
+    ops.gather_and_maybe_dequant_cache(
+        src_cache,
+        dst,
+        block_table,
+        cu_seq_lens,
+        batch_size,
+        kv_cache_dtype,
+        scale,
+        None,
+    )
     torch.testing.assert_close(dst, expected)
 
 
@@ -925,42 +996,46 @@ def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim,
 @pytest.mark.parametrize("max_seq_len", [512])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("dtype", [torch.float32])
-@pytest.mark.parametrize("kv_cache_dtype",
-                         ["auto"])  # You can also test "fp8" if needed.
+@pytest.mark.parametrize(
+    "kv_cache_dtype", ["auto"]
+)  # You can also test "fp8" if needed.
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_cp_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
-                             num_blocks, max_seq_len, batch_size, dtype,
-                             kv_cache_dtype, device):
+def test_cp_gather_cache_mla(
+    kv_lora_rank,
+    qk_rope_head_dim,
+    block_size,
+    num_blocks,
+    max_seq_len,
+    batch_size,
+    dtype,
+    kv_cache_dtype,
+    device,
+):
     entry_size = kv_lora_rank + qk_rope_head_dim
-    src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                  kv_cache_dtype, device)
+    src_cache = _create_mla_cache(
+        num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+    )
     _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)
 
-    seq_len_tensor = torch.randint(0,
-                                   max_seq_len + 1, (batch_size, ),
-                                   device=device)
+    seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device)
 
     total_tokens = seq_len_tensor.sum()
-    cu_seq_lens = torch.empty((batch_size + 1),
-                              dtype=torch.int32,
-                              device=device)
+    cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device)
     cu_seq_lens[0] = 0
     cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32)
     print("seq_len_tensor", seq_len_tensor)
 
     tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size
-    block_table = torch.empty((batch_size, num_blocks),
-                              dtype=torch.int32,
-                              device=device)
+    block_table = torch.empty(
+        (batch_size, num_blocks), dtype=torch.int32, device=device
+    )
 
     for b in range(batch_size):
         perm = torch.randperm(num_blocks, device=device)
         block_table[b, :] = perm
 
-    dst = torch.zeros((total_tokens, entry_size),
-                      dtype=src_cache.dtype,
-                      device=device)
+    dst = torch.zeros((total_tokens, entry_size), dtype=src_cache.dtype, device=device)
 
     expected_batches = []
     for b in range(batch_size):
@@ -1016,20 +1091,16 @@ def test_concat_and_cache_mla_cpu(
 
     total_slots = num_blocks * block_size
     slot_mapping_lst = random.sample(range(total_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping_lst,
-                                dtype=torch.long,
-                                device=device)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
 
     kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
-    k_pe = torch.randn(num_tokens,
-                       qk_rope_head_dim,
-                       dtype=dtype,
-                       device=device)
+    k_pe = torch.randn(num_tokens, qk_rope_head_dim, dtype=dtype, device=device)
     entry_size = kv_lora_rank + qk_rope_head_dim
 
     scale = torch.tensor(0.1, dtype=torch.float32, device=device)
-    kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
-                                 kv_cache_dtype, device)
+    kv_cache = _create_mla_cache(
+        num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
+    )
     ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
 
     for i in range(num_tokens):
@@ -1041,10 +1112,7 @@ def test_concat_and_cache_mla_cpu(
 
     if kv_cache_dtype == "fp8":
         ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
-        ops.convert_fp8(ref_kv_cache,
-                        ref_temp,
-                        scale.item(),
-                        kv_dtype=kv_cache_dtype)
+        ops.convert_fp8(ref_kv_cache, ref_temp, scale.item(), kv_dtype=kv_cache_dtype)
     else:
         ref_kv_cache = ref_temp
 
@@ -1054,6 +1122,5 @@ def test_concat_and_cache_mla_cpu(
         test_utils=DEFAULT_OPCHECK_TEST_UTILS,
     )
 
-    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
-                             kv_cache_dtype, scale)
+    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale)
     torch.testing.assert_close(kv_cache, ref_kv_cache)
diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py
index 1e7e7e0a7f84b990b405c3a7b90d539bd7e58357..20f573821b25f1b530976df277291d5a25a97ffe 100755
--- a/tests/kernels/attention/test_cascade_flash_attn.py
+++ b/tests/kernels/attention/test_cascade_flash_attn.py
@@ -1,17 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
 
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.flash_attn import (cascade_attention,
-                                                   merge_attn_states)
-from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
-                                  flash_attn_varlen_func,
-                                  is_fa_version_supported)
+from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states
+from vllm.vllm_flash_attn import (
+    fa_version_unsupported_reason,
+    flash_attn_varlen_func,
+    is_fa_version_supported,
+)
 
 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 192, 256]
@@ -37,21 +37,14 @@ def test_merge_kernel(
     assert num_query_heads % num_kv_heads == 0
 
     # Prepare inputs.
-    prefix_output = torch.randn(num_tokens,
-                                num_query_heads,
-                                head_size,
-                                dtype=dtype)
-    suffix_output = torch.randn(num_tokens,
-                                num_query_heads,
-                                head_size,
-                                dtype=dtype)
+    prefix_output = torch.randn(num_tokens, num_query_heads, head_size, dtype=dtype)
+    suffix_output = torch.randn(num_tokens, num_query_heads, head_size, dtype=dtype)
     prefix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32)
     suffix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32)
 
     # Run the kernel.
     output = torch.empty(num_tokens, num_query_heads, head_size, dtype=dtype)
-    merge_attn_states(output, prefix_output, prefix_lse, suffix_output,
-                      suffix_lse)
+    merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse)
 
     # Reference implementation.
     max_lse = torch.maximum(prefix_lse, suffix_lse)
@@ -91,14 +84,16 @@ def test_cascade(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
     if not is_fa_version_supported(fa_version):
-        pytest.skip(f"Flash attention version {fa_version} not supported due "
-                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
+        pytest.skip(
+            f"Flash attention version {fa_version} not supported due "
+            f'to: "{fa_version_unsupported_reason(fa_version)}"'
+        )
 
     current_platform.seed_everything(0)
 
@@ -107,11 +102,9 @@ def test_cascade(
     num_query_heads = num_heads[0]
     num_kv_heads = num_heads[1]
     assert num_query_heads % num_kv_heads == 0
-    key_cache = torch.randn(num_blocks,
-                            block_size,
-                            num_kv_heads,
-                            head_size,
-                            dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     value_cache = torch.randn_like(key_cache)
 
     seq_lens, common_prefix_len = seq_lens_and_common_prefix
@@ -122,26 +115,21 @@ def test_cascade(
     max_kv_len = max(kv_lens)
 
     total_num_query_tokens = sum(query_lens)
-    query = torch.randn(total_num_query_tokens,
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype)
-    cu_query_lens = torch.tensor([0] + query_lens,
-                                 dtype=torch.int32).cumsum(dim=0,
-                                                           dtype=torch.int32)
+    query = torch.randn(total_num_query_tokens, num_query_heads, head_size, dtype=dtype)
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
     kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 num_blocks,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     assert common_prefix_len > 0
     assert common_prefix_len % block_size == 0
     num_common_kv_blocks = common_prefix_len // block_size
     # Make sure the first `num_common_kv_blocks` blocks are the same.
-    block_tables[:, :num_common_kv_blocks] = \
-        block_tables[0, :num_common_kv_blocks]
+    block_tables[:, :num_common_kv_blocks] = block_tables[0, :num_common_kv_blocks]
 
     # Run the regular attention.
     ref_output = flash_attn_varlen_func(
@@ -161,8 +149,7 @@ def test_cascade(
 
     # Run cascade attention.
     assert all(common_prefix_len < kv_len for kv_len in kv_lens)
-    cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens],
-                                        dtype=torch.int32)
+    cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], dtype=torch.int32)
     prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32)
     suffix_kv_lens = kv_lens_tensor - common_prefix_len
     output = torch.empty_like(query)
@@ -183,6 +170,7 @@ def test_cascade(
         logits_soft_cap=soft_cap if soft_cap is not None else 0,
         block_table=block_tables,
         common_prefix_len=common_prefix_len,
+        max_num_splits=0,  # no max
         fa_version=fa_version,
     )
 
diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb3b1799ba48e66f0ddff053183af07aea44a9c9
--- /dev/null
+++ b/tests/kernels/attention/test_cpu_attn.py
@@ -0,0 +1,575 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+import math
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+if not current_platform.is_cpu():
+    pytest.skip("skipping CPU-only tests", allow_module_level=True)
+
+from vllm._custom_ops import (
+    cpu_attention_with_kv_cache,
+    cpu_attn_get_scheduler_metadata,
+    cpu_attn_reshape_and_cache,
+)
+
+NUM_HEADS = [
+    (4, 4),
+    (8, 2),
+    (9, 3),
+]
+HEAD_SIZES = [96, 128]
+QTYPES = [torch.bfloat16, torch.half, torch.float32]
+SLIDING_WINDOWS = [None, 256]
+NUM_BLOCKS = [
+    1024,
+]
+SEQ_LENS = [  # (q_len, kv_len)
+    [(1, 213), (1, 1), (1, 312), (1, 7), (1, 7812)],  # decode batch
+    [(2345, 2345), (5, 5), (3, 16), (134, 5131)],  # prefill batch
+    [(992, 2456), (1, 1234), (98, 1145), (1, 4162), (2345, 2345)],  # mixed batch
+]
+
+
+# rand number generation takes too much time, cache rand tensors
+@functools.lru_cache(maxsize=128, typed=False)
+def tensor_cache(
+    elem_num: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    tensor = torch.randn(elem_num, dtype=dtype)
+
+    return tensor
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(
+            closest_power_of_2, total_num_heads - closest_power_of_2
+        )
+        extra_powers = torch.arange(
+            start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes.float()
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
+    alibi_slopes: torch.Tensor | None = None,
+    s_aux: torch.Tensor | None = None,
+) -> torch.Tensor:
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+    dtype = query.dtype
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+
+    if alibi_slopes is not None:
+        alibi_slopes = alibi_slopes[:, None, None]
+
+    if s_aux is not None:
+        s_aux = s_aux.float()
+        s_aux = s_aux[:, None, None]
+
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        kv_len = kv_lens[i]
+        q = query[start_idx : start_idx + query_len].float()
+        q *= scale
+
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len].float()
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len].float()
+
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        empty_mask = torch.ones(query_len, kv_len)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+
+        if alibi_slopes is not None:
+            q_start_pos = kv_len - query_len
+            q_pos = q_start_pos + torch.arange(0, query_len)[None, :, None]
+            kv_pos = torch.arange(0, kv_len)[None, None, :]
+            dist = q_pos - kv_pos
+            alibi_bias = -alibi_slopes * dist
+            attn += alibi_bias
+
+        attn.masked_fill_(mask, float("-inf"))
+
+        if s_aux is not None:
+            s_aux_ext = s_aux.repeat(1, query_len, 1)
+            attn = torch.cat((s_aux_ext, attn), dim=-1)
+
+        attn = torch.softmax(attn, dim=-1)
+
+        if s_aux is not None:
+            attn = attn[:, :, 1:]
+
+        out = torch.einsum("hqk,khd->qhd", attn, v).to(dtype=dtype)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@torch.inference_mode()
+def varlen_with_paged_kv(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+    token_num = sum(query_lens)
+
+    # for n heads the set of slopes is the geometric sequence that starts
+    # 2^(-8/n)
+    alibi_slopes = _get_alibi_slopes(num_query_heads) if use_alibi else None
+
+    s_aux = (
+        15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
+    )
+
+    query = tensor_cache(
+        elem_num=token_num * num_query_heads * head_size,
+        dtype=dtype,
+    )
+    query = query.view(
+        token_num,
+        num_query_heads,
+        head_size,
+    )
+
+    key_value = tensor_cache(
+        elem_num=2 * num_blocks * num_kv_heads * block_size * head_size,
+        dtype=dtype,
+    )
+    key_value = key_value.view(
+        2,
+        num_blocks,
+        block_size,
+        num_kv_heads,
+        head_size,
+    )
+    key_cache, value_cache = key_value.unbind(0)
+
+    # KV cache for CPU attention
+    packed_key_cache = torch.empty(
+        num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+    )
+    packed_value_cache = torch.empty_like(packed_key_cache)
+
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # use reshape_and_cache to pack key_cache and value_cache
+    slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
+    cpu_attn_reshape_and_cache(
+        key=key_cache.view(-1, num_kv_heads, head_size),
+        value=value_cache.view(-1, num_kv_heads, head_size),
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        slot_mapping=slot_mapping,
+        isa=isa,
+    )
+
+    metadata = cpu_attn_get_scheduler_metadata(
+        num_reqs=num_seqs,
+        num_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        seq_lens=kv_lens_tensor,
+        dtype=dtype,
+        query_start_loc=cu_query_lens,
+        causal=True,
+        sliding_window_size=sliding_window if sliding_window is not None else -1,
+        isa=isa,
+        enable_kv_split=False,
+    )
+
+    out_without_split = torch.empty_like(query)
+    cpu_attention_with_kv_cache(
+        query=query,
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        output=out_without_split,
+        query_start_loc=cu_query_lens,
+        seq_lens=kv_lens_tensor,
+        scale=scale,
+        causal=True,
+        alibi_slopes=alibi_slopes,
+        sliding_window=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+        scheduler_metadata=metadata,
+        s_aux=s_aux,
+    )
+
+    metadata = cpu_attn_get_scheduler_metadata(
+        num_reqs=num_seqs,
+        num_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        seq_lens=kv_lens_tensor,
+        dtype=dtype,
+        query_start_loc=cu_query_lens,
+        causal=True,
+        sliding_window_size=sliding_window if sliding_window is not None else -1,
+        isa=isa,
+        enable_kv_split=True,
+    )
+
+    out_with_split = torch.empty_like(query)
+    cpu_attention_with_kv_cache(
+        query=query,
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        output=out_with_split,
+        query_start_loc=cu_query_lens,
+        seq_lens=kv_lens_tensor,
+        scale=scale,
+        causal=True,
+        alibi_slopes=alibi_slopes,
+        sliding_window=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+        scheduler_metadata=metadata,
+        s_aux=s_aux,
+    )
+
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        sliding_window=sliding_window,
+        soft_cap=soft_cap,
+        alibi_slopes=alibi_slopes,
+        s_aux=s_aux,
+    )
+
+    atol, rtol = 1.5e-2, 1e-2
+    (
+        torch.testing.assert_close(out_with_split, ref_output, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(out_with_split - ref_output))}",
+    )
+    (
+        torch.testing.assert_close(out_without_split, ref_output, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(out_without_split - ref_output))}",
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [96, 128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", QTYPES)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["vec"])
+def test_varlen_with_paged_kv_normal_vec(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [96, 128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["amx"])
+@pytest.mark.skipif(
+    not torch._C._cpu._is_amx_tile_supported(), reason="no AMX support."
+)
+def test_varlen_with_paged_kv_normal_amx(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [48])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["vec16"])
+def test_varlen_with_paged_kv_normal_vec16(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [96])
+@pytest.mark.parametrize("block_size", [128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [50])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize(
+    "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+)
+def test_varlen_with_paged_kv_softcap(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [96])
+@pytest.mark.parametrize("block_size", [128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [True])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize(
+    "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+)
+def test_varlen_with_paged_kv_alibi(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [96])
+@pytest.mark.parametrize("block_size", [128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [True])
+@pytest.mark.parametrize(
+    "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+)
+def test_varlen_with_paged_kv_sink(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py
index 5078bd730a1a31cfb05587ae4dfb20c42be15e1f..a60f4e385a8930f025fc2651342ec5dcf6e7fed6 100644
--- a/tests/kernels/attention/test_cutlass_mla_decode.py
+++ b/tests/kernels/attention/test_cutlass_mla_decode.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 import random
-from typing import Optional
 
 import pytest
 import torch
@@ -12,33 +11,37 @@ from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 
 
-def cal_diff(x: torch.Tensor,
-             y: torch.Tensor,
-             name: str,
-             use_fp8: bool = False,
-             diff_threshold: Optional[float] = None) -> None:
+def cal_diff(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    name: str,
+    use_fp8: bool = False,
+    diff_threshold: float | None = None,
+) -> None:
     x, y = x.double(), y.double()
-    cos_diff = 1 - 2 * (x * y).sum().item() / max(
-        (x * x + y * y).sum().item(), 1e-12)
+    cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
     if diff_threshold is not None:
         # directly compare the cos_diff with the threshold
         assert cos_diff < diff_threshold
     else:
         # use the default threshold
-        if (use_fp8):
+        if use_fp8:
             assert cos_diff < 1e-4
         else:
             assert cos_diff < 1e-5
 
 
-CUTLASS_MLA_UNSUPPORTED_REASON = \
-    "Cutlass MLA Requires compute capability of 10 or above." \
-    if not current_platform.is_device_capability(100) \
+CUTLASS_MLA_UNSUPPORTED_REASON = (
+    "Cutlass MLA Requires compute capability of 10 or above."
+    if not current_platform.is_device_capability(100)
     else "Cutlass MLA is supported"
+)
 
 
-@pytest.mark.skipif(not current_platform.has_device_capability(100),
-                    reason=CUTLASS_MLA_UNSUPPORTED_REASON)
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(100),
+    reason=CUTLASS_MLA_UNSUPPORTED_REASON,
+)
 @pytest.mark.parametrize("b", [128])
 @pytest.mark.parametrize("s_q", [1])
 @pytest.mark.parametrize("mean_sk", [4096, 8192, 16384])
@@ -54,40 +57,40 @@ CUTLASS_MLA_UNSUPPORTED_REASON = \
     [
         torch.bfloat16,
         # fp8 can have occasional precision-related failures.
-        pytest.param(torch.float8_e4m3fn, marks=pytest.mark.flaky(reruns=2))
-    ])
+        pytest.param(torch.float8_e4m3fn, marks=pytest.mark.flaky(reruns=2)),
+    ],
+)
 @torch.inference_mode()
-def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
-                            causal, varlen, torch_dtype):
+def test_cutlass_mla_decode(
+    b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, varlen, torch_dtype
+):
     device = torch.device("cuda:0")
-    if torch_dtype == torch.float8_e4m3fn:
-        init_dtype = torch.bfloat16
-    else:
-        init_dtype = torch_dtype
+    init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype
     torch.set_default_dtype(init_dtype)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
     torch.manual_seed(42)
     random.seed(42)
 
-    print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
-          f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}")
+    print(
+        f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
+        f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}"
+    )
 
     use_fp8 = torch_dtype == torch.float8_e4m3fn
-    scale = math.sqrt(d)**(-1)
-    cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32)
+    scale = math.sqrt(d) ** (-1)
+    cache_seqlens = torch.full((b,), mean_sk, dtype=torch.int32)
     if varlen:
         for i in range(b):
-            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2),
-                                   s_q)
+            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), s_q)
     total_seqlens = cache_seqlens.sum().item()
     max_seqlen = cache_seqlens.max().item()
     max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
 
     q = torch.randn(b, s_q, h_q, d)
-    block_table = torch.arange(b * max_seqlen_pad // block_size,
-                               dtype=torch.int32).view(
-                                   b, max_seqlen_pad // block_size)
+    block_table = torch.arange(
+        b * max_seqlen_pad // block_size, dtype=torch.int32
+    ).view(b, max_seqlen_pad // block_size)
     blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
     blocked_v = blocked_k[..., :dv]
 
@@ -121,22 +124,29 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
             q_pe = q_pe_padded
 
         kv_cache_flat = blocked_k.squeeze(2)
-        device_properties = torch.cuda.get_device_properties(
-            torch.device("cuda:0"))
+        device_properties = torch.cuda.get_device_properties(torch.device("cuda:0"))
         sm_count = device_properties.multi_processor_count
         workspace_size = ops.sm100_cutlass_mla_get_workspace_size(
-            max_seqlen * block_size, b, sm_count, num_kv_splits=1)
-        workspace = torch.empty(workspace_size,
-                                device="cuda",
-                                dtype=torch.uint8)
+            max_seqlen * block_size, b, sm_count, num_kv_splits=1
+        )
+        workspace = torch.empty(workspace_size, device="cuda", dtype=torch.uint8)
 
         out_ans = torch.empty(b, MAX_HEADS, dv, dtype=init_dtype)
-        output_lse = torch.empty((b, MAX_HEADS),
-                                 dtype=torch.float32,
-                                 device=q_nope.device)
-        ops.sm100_cutlass_mla_decode(out_ans, output_lse, q_nope, q_pe,
-                                     kv_cache_flat, cache_seqlens, block_table,
-                                     workspace, scale, 1)
+        output_lse = torch.empty(
+            (b, MAX_HEADS), dtype=torch.float32, device=q_nope.device
+        )
+        ops.sm100_cutlass_mla_decode(
+            out_ans,
+            output_lse,
+            q_nope,
+            q_pe,
+            kv_cache_flat,
+            cache_seqlens,
+            block_table,
+            workspace,
+            scale,
+            1,
+        )
         return out_ans[:, :h_q].contiguous(), output_lse[:, :h_q].contiguous()
 
     def scaled_dot_product_attention(query, key, value, is_causal=False):
@@ -150,8 +160,7 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
             s_q = query.shape[-2]
             s_k = key.shape[-2]
             attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype)
-            temp_mask = torch.ones(s_q, s_k,
-                                   dtype=torch.bool).tril(diagonal=s_k - s_q)
+            temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q)
             attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
             attn_bias.to(query.dtype)
             attn_weight += attn_bias
@@ -161,10 +170,16 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
 
     def ref_mla():
         q_ = (q.to(torch.float) * descale_q).to(init_dtype) if use_fp8 else q
-        blocked_k_ = (blocked_k.to(torch.float) *
-                      descale_k).to(init_dtype) if use_fp8 else blocked_k
-        blocked_v_ = (blocked_v.to(torch.float) *
-                      descale_k).to(init_dtype) if use_fp8 else blocked_v
+        blocked_k_ = (
+            (blocked_k.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_k
+        )
+        blocked_v_ = (
+            (blocked_v.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_v
+        )
         out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32)
         lse = torch.empty(b, h_q, s_q, dtype=torch.float32)
         for i in range(b):
@@ -191,8 +206,9 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
 
     t = triton.testing.do_bench(cutlass_mla)
     FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
-    bytes = (total_seqlens * h_kv * d +
-             b * s_q * h_q * d) * (torch.finfo(torch_dtype).bits // 8) + (
-                 b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
-    print(f"{t:.3f} ms, {FLOPS / 10 ** 9 / t:.0f} TFLOPS,",
-          f"{bytes / 10 ** 6 / t:.0f} GB/s")
+    bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d) * (
+        torch.finfo(torch_dtype).bits // 8
+    ) + (b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
+    print(
+        f"{t:.3f} ms, {FLOPS / 10**9 / t:.0f} TFLOPS,", f"{bytes / 10**6 / t:.0f} GB/s"
+    )
diff --git a/tests/kernels/attention/test_deepgemm_attention.py b/tests/kernels/attention/test_deepgemm_attention.py
index 2d901e408b27577b8e2e599d365665ac7cd5ae0e..e2ae3b833b204294fa96d4c0406554ae7db3ee3e 100644
--- a/tests/kernels/attention/test_deepgemm_attention.py
+++ b/tests/kernels/attention/test_deepgemm_attention.py
@@ -6,10 +6,16 @@ import pytest
 import torch
 
 from vllm.platforms import current_platform
-from vllm.utils import cdiv, has_deep_gemm
-from vllm.utils.deep_gemm import (_ceil_to_ue8m0, calc_diff, fp8_mqa_logits,
-                                  fp8_paged_mqa_logits, get_num_sms,
-                                  get_paged_mqa_logits_metadata)
+from vllm.utils.deep_gemm import (
+    _ceil_to_ue8m0,
+    calc_diff,
+    fp8_mqa_logits,
+    fp8_paged_mqa_logits,
+    get_num_sms,
+    get_paged_mqa_logits_metadata,
+)
+from vllm.utils.import_utils import has_deep_gemm
+from vllm.utils.math_utils import cdiv
 
 
 def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor:
@@ -24,17 +30,18 @@ def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor:
         device=x.device,
         dtype=torch.uint8,
     )
-    x_fp8[:, :block_size * head_dim] = x_scaled.view(
-        num_blocks, block_size * head_dim).view(dtype=torch.uint8)
-    x_fp8[:,
-          block_size * head_dim:] = sf.view(num_blocks,
-                                            block_size).view(dtype=torch.uint8)
+    x_fp8[:, : block_size * head_dim] = x_scaled.view(
+        num_blocks, block_size * head_dim
+    ).view(dtype=torch.uint8)
+    x_fp8[:, block_size * head_dim :] = sf.view(num_blocks, block_size).view(
+        dtype=torch.uint8
+    )
     return x_fp8.view(num_blocks, block_size, num_heads, head_dim + 4)
 
 
 def per_custom_dims_cast_to_fp8(
-        x: torch.Tensor, dims: tuple,
-        use_ue8m0: bool) -> tuple[torch.Tensor, torch.Tensor]:
+    x: torch.Tensor, dims: tuple, use_ue8m0: bool
+) -> tuple[torch.Tensor, torch.Tensor]:
     excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)])
     x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4)
     sf = x_amax / 448.0
@@ -69,13 +76,14 @@ def _ref_fp8_mqa_logits(
     q = q.float()
     k = k.float()
 
-    mask_lo = (torch.arange(0, seq_len_kv, device="cuda")[None, :]
-               >= cu_seqlen_ks[:, None])
-    mask_hi = (torch.arange(0, seq_len_kv, device="cuda")[None, :]
-               < cu_seqlen_ke[:, None])
+    mask_lo = (
+        torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None]
+    )
+    mask_hi = (
+        torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None]
+    )
     mask = mask_lo & mask_hi
-
-    score = torch.einsum("mhd,and->hmn", q, k)
+    score = torch.einsum("mhd,nd->hmn", q, k)
     logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)
     logits = logits.masked_fill(~mask, float("-inf"))
 
@@ -84,14 +92,15 @@ def _ref_fp8_mqa_logits(
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA only")
 @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available")
-@pytest.mark.skipif(not current_platform.has_device_capability(90),
-                    reason="SM90 and SM100 only")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90), reason="SM90 and SM100 only"
+)
 def test_deepgemm_fp8_mqa_logits():
     torch.manual_seed(0)
     random.seed(0)
     num_heads, head_dim = 32, 128
-    for seq_len in (512, ):
-        for seq_len_kv in (1024, ):
+    for seq_len in (512,):
+        for seq_len_kv in (1024,):
             for disable_cp in (False, True):
                 q = torch.randn(
                     seq_len,
@@ -100,24 +109,23 @@ def test_deepgemm_fp8_mqa_logits():
                     device="cuda",
                     dtype=torch.bfloat16,
                 )
-                kv = torch.randn(seq_len_kv,
-                                 head_dim,
-                                 device="cuda",
-                                 dtype=torch.bfloat16)
-                weights = torch.randn(seq_len,
-                                      num_heads,
-                                      device="cuda",
-                                      dtype=torch.float32)
+                kv = torch.randn(
+                    seq_len_kv, head_dim, device="cuda", dtype=torch.bfloat16
+                )
+                weights = torch.randn(
+                    seq_len, num_heads, device="cuda", dtype=torch.float32
+                )
 
                 if disable_cp:
                     ks = torch.zeros(seq_len, dtype=torch.int, device="cuda")
-                    ke = torch.arange(seq_len, dtype=torch.int,
-                                      device="cuda") + (seq_len_kv - seq_len)
+                    ke = torch.arange(seq_len, dtype=torch.int, device="cuda") + (
+                        seq_len_kv - seq_len
+                    )
                 else:
                     ks, ke = _generate_cp_test_data(seq_len, seq_len_kv)
 
                 q_fp8 = q.to(torch.float8_e4m3fn)
-                kv_fp8 = per_custom_dims_cast_to_fp8(kv, (0, ), False)
+                kv_fp8 = per_custom_dims_cast_to_fp8(kv, (0,), False)
                 logits = fp8_mqa_logits(q_fp8, kv_fp8, weights, ks, ke)
 
                 ref_logits = _ref_fp8_mqa_logits(
@@ -157,11 +165,10 @@ def _ref_fp8_paged_mqa_logits(
     context_lens_list = context_lens.tolist()
     for i in range(batch_size):
         context_len = context_lens_list[i]
-        q_offsets = torch.arange(context_len - next_n,
-                                 context_len,
-                                 device="cuda")
-        weight_slice = (weights[i * next_n:(i + 1) * next_n, :].transpose(
-            0, 1).contiguous())
+        q_offsets = torch.arange(context_len - next_n, context_len, device="cuda")
+        weight_slice = (
+            weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous()
+        )
         for block_rk in range(cdiv(context_len, block_size)):
             block_idx = block_tables[i][block_rk]
             qx, kx = q[i], kv_cache[block_idx]
@@ -170,28 +177,30 @@ def _ref_fp8_paged_mqa_logits(
                 (block_rk + 1) * block_size,
                 device="cuda",
             )
-            mask = (k_offsets[None, :] < context_len) & (k_offsets[None, :]
-                                                         <= q_offsets[:, None])
+            mask = (k_offsets[None, :] < context_len) & (
+                k_offsets[None, :] <= q_offsets[:, None]
+            )
             s = torch.where(
                 mask[None, :, :],
                 (qx.transpose(0, 1) @ kx.transpose(0, 1).transpose(1, 2)).to(
-                    logits.dtype),
+                    logits.dtype
+                ),
                 float("-inf"),
             )
             s = torch.relu(s) * weight_slice[..., None]
             s = s.sum(dim=0)
             logits[
-                i * next_n:(i + 1) * next_n,
-                block_rk * block_size:(block_rk + 1) * block_size,
-            ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s,
-                            float("-inf"))
+                i * next_n : (i + 1) * next_n,
+                block_rk * block_size : (block_rk + 1) * block_size,
+            ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s, float("-inf"))
     return logits
 
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA only")
 @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available")
-@pytest.mark.skipif(not current_platform.has_device_capability(90),
-                    reason="SM90 and SM100 only")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90), reason="SM90 and SM100 only"
+)
 def test_deepgemm_fp8_paged_mqa_logits():
     torch.manual_seed(0)
     random.seed(0)
@@ -199,7 +208,7 @@ def test_deepgemm_fp8_paged_mqa_logits():
     max_model_len = 4096
     for batch_size, next_n in [(4, 1), (2, 2)]:
         for heads, index_dim in [(32, 128)]:
-            for avg_kv in (2048, ):
+            for avg_kv in (2048,):
                 num_blocks, blocksize = max_model_len * 2, 64
 
                 q = torch.randn(
@@ -218,12 +227,14 @@ def test_deepgemm_fp8_paged_mqa_logits():
                     dtype=torch.float32,
                 )
 
-                context_lens = (torch.randint(int(0.8 * avg_kv),
-                                              int(1.2 * avg_kv),
-                                              (batch_size, )).cuda().to(
-                                                  torch.int32))
-                max_block_len = ((context_lens.max().item() + blocksize - 1) //
-                                 blocksize * blocksize)
+                context_lens = (
+                    torch.randint(int(0.8 * avg_kv), int(1.2 * avg_kv), (batch_size,))
+                    .cuda()
+                    .to(torch.int32)
+                )
+                max_block_len = (
+                    (context_lens.max().item() + blocksize - 1) // blocksize * blocksize
+                )
                 block_tables = torch.zeros(
                     (batch_size, max_block_len),
                     device="cuda",
@@ -243,7 +254,8 @@ def test_deepgemm_fp8_paged_mqa_logits():
                 kv_cache_fp8 = kv_cache_cast_to_fp8(kv_cache)
 
                 schedule_metadata = get_paged_mqa_logits_metadata(
-                    context_lens, blocksize, get_num_sms())
+                    context_lens, blocksize, get_num_sms()
+                )
                 logits = fp8_paged_mqa_logits(
                     q_fp8,
                     kv_cache_fp8,
@@ -263,15 +275,18 @@ def test_deepgemm_fp8_paged_mqa_logits():
                     max_model_len,
                 )
 
-                positions = (torch.arange(max_model_len,
-                                          device="cuda").unsqueeze(0).expand(
-                                              batch_size * next_n, -1))
-                row_indices = (
-                    torch.arange(batch_size * next_n, device="cuda") // next_n)
+                positions = (
+                    torch.arange(max_model_len, device="cuda")
+                    .unsqueeze(0)
+                    .expand(batch_size * next_n, -1)
+                )
+                row_indices = torch.arange(batch_size * next_n, device="cuda") // next_n
                 next_n_offset = (
-                    torch.arange(batch_size * next_n, device="cuda") % next_n)
-                mask = positions <= (context_lens[row_indices] - next_n +
-                                     next_n_offset).unsqueeze(1)
+                    torch.arange(batch_size * next_n, device="cuda") % next_n
+                )
+                mask = positions <= (
+                    context_lens[row_indices] - next_n + next_n_offset
+                ).unsqueeze(1)
 
                 logits = logits.masked_fill(~mask, 0)
                 ref_logits = ref_logits.masked_fill(~mask, 0)
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index 2544703f8bf919a75fbd22104ba71779ce758e9b..6e5468969bf25f9c5b462504b8049903a0455e2c 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
 
 from vllm.platforms import current_platform
-from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
-                                  flash_attn_varlen_func,
-                                  flash_attn_with_kvcache,
-                                  is_fa_version_supported)
+from vllm.vllm_flash_attn import (
+    fa_version_unsupported_reason,
+    flash_attn_varlen_func,
+    is_fa_version_supported,
+)
 
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
@@ -32,8 +32,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -44,7 +44,7 @@ def ref_paged_attn(
     for i in range(num_seqs):
         query_len = query_lens[i]
         kv_len = kv_lens[i]
-        q = query[start_idx:start_idx + query_len]
+        q = query[start_idx : start_idx + query_len]
         q *= scale
 
         num_kv_blocks = (kv_len + block_size - 1) // block_size
@@ -62,10 +62,13 @@ def ref_paged_attn(
         empty_mask = torch.ones(query_len, kv_len)
         mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
         if sliding_window is not None:
-            sliding_window_mask = torch.triu(empty_mask,
-                                             diagonal=kv_len -
-                                             (query_len + sliding_window) +
-                                             1).bool().logical_not()
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
             mask |= sliding_window_mask
         if soft_cap is not None:
             attn = soft_cap * torch.tanh(attn / soft_cap)
@@ -80,123 +83,9 @@ def ref_paged_attn(
 
 
 @pytest.mark.parametrize("use_out", [True, False])
-@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
-@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
-@pytest.mark.parametrize("fa_version", [2, 3])
-@pytest.mark.parametrize("q_dtype", QDTYPES)
-@torch.inference_mode()
-def test_flash_attn_with_paged_kv(
-    use_out: bool,
-    kv_lens: list[int],
-    num_heads: tuple[int, int],
-    head_size: int,
-    dtype: torch.dtype,
-    block_size: int,
-    soft_cap: Optional[float],
-    num_blocks: int,
-    sliding_window: Optional[int],
-    fa_version: int,
-    q_dtype: Optional[torch.dtype],
-) -> None:
-    torch.set_default_device("cuda")
-    if not is_fa_version_supported(fa_version):
-        pytest.skip(f"Flash attention version {fa_version} not supported due "
-                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
-    if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
-        pytest.skip("Flash attention with quantized inputs is only "
-                    "supported on version 3 with bfloat16 base type")
-
-    current_platform.seed_everything(0)
-    num_seqs = len(kv_lens)
-    num_query_heads = num_heads[0]
-    num_kv_heads = num_heads[1]
-    assert num_query_heads % num_kv_heads == 0
-    max_kv_len = max(kv_lens)
-    scale = head_size**-0.5
-    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
-                   (-1, -1))
-
-    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
-    key_cache = torch.randn(num_blocks,
-                            block_size,
-                            num_kv_heads,
-                            head_size,
-                            dtype=dtype)
-    value_cache = torch.randn_like(key_cache)
-    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
-
-    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 num_blocks,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
-
-    q = query.unsqueeze(1)
-    out = torch.empty_like(q) if use_out else None
-
-    maybe_quantized_query = q
-    maybe_quantized_key_cache = key_cache
-    maybe_quantized_value_cache = value_cache
-    q_descale = None
-    k_descale = None
-    v_descale = None
-    if q_dtype is not None:
-        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
-        maybe_quantized_query = q.to(q_dtype)
-        maybe_quantized_key_cache = key_cache.to(q_dtype)
-        maybe_quantized_value_cache = value_cache.to(q_dtype)
-
-        scale_shape = (num_seqs, num_kv_heads)
-        q_descale = torch.ones(scale_shape, dtype=torch.float32)
-        k_descale = torch.ones(scale_shape, dtype=torch.float32)
-        v_descale = torch.ones(scale_shape, dtype=torch.float32)
-
-    output = flash_attn_with_kvcache(
-        q=maybe_quantized_query,
-        k_cache=maybe_quantized_key_cache,
-        v_cache=maybe_quantized_value_cache,
-        out=out,
-        softmax_scale=scale,
-        causal=True,
-        block_table=block_tables,
-        cache_seqlens=kv_lens_tensor,
-        softcap=soft_cap if soft_cap is not None else 0,
-        window_size=window_size,
-        fa_version=fa_version,
-        q_descale=q_descale,
-        k_descale=k_descale,
-        v_descale=v_descale,
-    )
-    output = output if not use_out else out
-    output = output.squeeze(1)
-
-    atol, rtol = 1.5e-2, 1e-2
-    if q_dtype is not None:
-        atol, rtol = 1.5e-1, 1.5e-1
-
-    ref_output = ref_paged_attn(query=query,
-                                key_cache=key_cache,
-                                value_cache=value_cache,
-                                query_lens=[1] * num_seqs,
-                                kv_lens=kv_lens,
-                                block_tables=block_tables,
-                                scale=scale,
-                                soft_cap=soft_cap,
-                                sliding_window=sliding_window)
-    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
-        f"{torch.max(torch.abs(output - ref_output))}"
-
-
-@pytest.mark.parametrize("use_out", [True, False])
-@pytest.mark.parametrize("seq_lens",
-                         [[(1, 1328), (5, 18),
-                           (129, 463)], [(1, 523), (1, 37), (1, 2011)]])
+@pytest.mark.parametrize(
+    "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]]
+)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
@@ -212,21 +101,25 @@ def test_varlen_with_paged_kv(
     seq_lens: list[tuple[int, int]],
     num_heads: tuple[int, int],
     head_size: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
     fa_version: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
     if not is_fa_version_supported(fa_version):
-        pytest.skip(f"Flash attention version {fa_version} not supported due "
-                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
+        pytest.skip(
+            f"Flash attention version {fa_version} not supported due "
+            f'to: "{fa_version_unsupported_reason(fa_version)}"'
+        )
     if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
-        pytest.skip("Flash attention with quantized inputs is only "
-                    "supported on version 3 with bfloat16 base type")
+        pytest.skip(
+            "Flash attention with quantized inputs is only "
+            "supported on version 3 with bfloat16 base type"
+        )
     current_platform.seed_everything(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
@@ -236,30 +129,23 @@ def test_varlen_with_paged_kv(
     assert num_query_heads % num_kv_heads == 0
     max_query_len = max(query_lens)
     max_kv_len = max(kv_lens)
-    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
-                   (-1, -1))
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
     scale = head_size**-0.5
 
-    query = torch.randn(sum(query_lens),
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype)
-    key_cache = torch.randn(num_blocks,
-                            block_size,
-                            num_kv_heads,
-                            head_size,
-                            dtype=dtype)
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     value_cache = torch.randn_like(key_cache)
-    cu_query_lens = torch.tensor([0] + query_lens,
-                                 dtype=torch.int32).cumsum(dim=0,
-                                                           dtype=torch.int32)
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
     kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 num_blocks,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     out = torch.empty_like(query) if use_out else None
 
@@ -315,5 +201,7 @@ def test_varlen_with_paged_kv(
     atol, rtol = 1.5e-2, 1e-2
     if q_dtype is not None:
         atol, rtol = 1.5e-1, 1.5e-1
-    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    (
+        torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py
index a821a74aba93d85aeaf7d5a55cbaa1ea7e8f89d2..82ec2ef14e56cf6dea78c57e83043c2dc8607df2 100644
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import flashinfer
 import pytest
@@ -26,8 +25,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -38,7 +37,7 @@ def ref_paged_attn(
     for i in range(num_seqs):
         query_len = query_lens[i]
         kv_len = kv_lens[i]
-        q = query[start_idx:start_idx + query_len]
+        q = query[start_idx : start_idx + query_len]
         q *= scale
 
         num_kv_blocks = (kv_len + block_size - 1) // block_size
@@ -56,10 +55,13 @@ def ref_paged_attn(
         empty_mask = torch.ones(query_len, kv_len)
         mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
         if sliding_window is not None:
-            sliding_window_mask = torch.triu(empty_mask,
-                                             diagonal=kv_len -
-                                             (query_len + sliding_window) +
-                                             1).bool().logical_not()
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
             mask |= sliding_window_mask
         if soft_cap is not None:
             attn = soft_cap * torch.tanh(attn / soft_cap)
@@ -87,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
-    sliding_window: Optional[int],
+    soft_cap: float | None,
+    sliding_window: int | None,
 ) -> None:
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
@@ -101,20 +103,16 @@ def test_flashinfer_decode_with_paged_kv(
 
     query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
 
-    key_value_cache = torch.randn(NUM_BLOCKS,
-                                  2,
-                                  block_size,
-                                  num_kv_heads,
-                                  head_size,
-                                  dtype=dtype)
+    key_value_cache = torch.randn(
+        NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
     value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 NUM_BLOCKS,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     kv_indptr = [0]
     kv_indices = []
@@ -135,9 +133,9 @@ def test_flashinfer_decode_with_paged_kv(
     kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
 
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
-    wrapper = flashinfer.\
-        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
-                use_tensor_cores=True)
+    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=True
+    )
     wrapper.plan(
         kv_indptr,
         kv_indices,
@@ -155,17 +153,21 @@ def test_flashinfer_decode_with_paged_kv(
 
     output = wrapper.run(query, key_value_cache)
 
-    ref_output = ref_paged_attn(query=query,
-                                key_cache=key_cache,
-                                value_cache=value_cache,
-                                query_lens=[1] * num_seqs,
-                                kv_lens=kv_lens,
-                                block_tables=block_tables,
-                                scale=scale,
-                                soft_cap=soft_cap,
-                                sliding_window=sliding_window)
-    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=[1] * num_seqs,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+        sliding_window=sliding_window,
+    )
+    (
+        torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
 
 
 @pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
@@ -182,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
-    sliding_window: Optional[int],
+    soft_cap: float | None,
+    sliding_window: int | None,
 ) -> None:
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
@@ -196,16 +198,10 @@ def test_flashinfer_prefill_with_paged_kv(
     max_kv_len = max(kv_lens)
     scale = head_size**-0.5
 
-    query = torch.randn(sum(query_lens),
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype)
-    key_value_cache = torch.randn(NUM_BLOCKS,
-                                  2,
-                                  block_size,
-                                  num_kv_heads,
-                                  head_size,
-                                  dtype=dtype)
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_value_cache = torch.randn(
+        NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
     value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
 
@@ -215,10 +211,9 @@ def test_flashinfer_prefill_with_paged_kv(
     value_cache /= head_size**0.5
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 NUM_BLOCKS,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     qo_indptr = [0]
     kv_indptr = [0]
@@ -242,8 +237,7 @@ def test_flashinfer_prefill_with_paged_kv(
     kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
 
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
-    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
-        workspace_buffer, "NHD")
+    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
     wrapper.plan(
         qo_indptr,
         kv_indptr,
@@ -264,17 +258,21 @@ def test_flashinfer_prefill_with_paged_kv(
         key_value_cache,
     )
 
-    ref_output = ref_paged_attn(query=query,
-                                key_cache=key_cache,
-                                value_cache=value_cache,
-                                query_lens=query_lens,
-                                kv_lens=kv_lens,
-                                block_tables=block_tables,
-                                scale=scale,
-                                soft_cap=soft_cap,
-                                sliding_window=sliding_window)
-    torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+        sliding_window=sliding_window,
+    )
+    (
+        torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
 
 
 @pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]])
@@ -284,9 +282,13 @@ def test_flashinfer_prefill_with_paged_kv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", SOFT_CAPS)
 def test_flashinfer_prefill_with_paged_fp8_kv(
-        seq_lens: list[tuple[int, int]], num_heads: tuple[int, int],
-        head_size: int, dtype: torch.dtype, block_size: int,
-        soft_cap: Optional[float]) -> None:
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+) -> None:
     pytest.skip("TODO: fix the accuracy issue")
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
@@ -301,17 +303,11 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
 
     kv_cache_dtype = torch.float8_e4m3fn
 
-    query = torch.randn(sum(query_lens),
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype)
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
     NUM_BLOCKS_FP8 = 2048
-    key_value_cache = torch.randn(NUM_BLOCKS_FP8,
-                                  2,
-                                  block_size,
-                                  num_kv_heads,
-                                  head_size,
-                                  dtype=dtype)
+    key_value_cache = torch.randn(
+        NUM_BLOCKS_FP8, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
     key_cache /= head_size**0.5
     value_cache /= head_size**0.5
@@ -319,15 +315,15 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
     k_scale = key_cache.amax().item() / 448.0
     v_scale = value_cache.amax().item() / 448.0
 
-    kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale],
-                             dim=1).to(kv_cache_dtype)
+    kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale], dim=1).to(
+        kv_cache_dtype
+    )
 
-    assert (kv_cache_fp8.shape == key_value_cache.shape)
+    assert kv_cache_fp8.shape == key_value_cache.shape
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 NUM_BLOCKS_FP8,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, NUM_BLOCKS_FP8, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     qo_indptr = [0]
     kv_indptr = [0]
@@ -351,8 +347,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
     kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
 
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
-    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
-        workspace_buffer, "NHD")
+    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
     wrapper.plan(
         qo_indptr,
         kv_indptr,
@@ -369,19 +364,23 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
 
     output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale)
 
-    ref_output = ref_paged_attn(query=query,
-                                key_cache=key_cache.squeeze(1),
-                                value_cache=value_cache.squeeze(1),
-                                query_lens=query_lens,
-                                kv_lens=kv_lens,
-                                block_tables=block_tables,
-                                scale=scale,
-                                soft_cap=soft_cap)
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache.squeeze(1),
+        value_cache=value_cache.squeeze(1),
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+    )
     del query
     del block_tables
     # verify prefill fp8
-    torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    (
+        torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
 
 
 @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
@@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
 ) -> None:
     # test doesn't work for num_heads = (16,16)
     torch.set_default_device("cuda")
@@ -414,12 +413,9 @@ def test_flashinfer_decode_with_paged_fp8_kv(
 
     query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
     NUM_BLOCKS_FP8 = 2048
-    key_value_cache = torch.randn(NUM_BLOCKS_FP8,
-                                  2,
-                                  block_size,
-                                  num_kv_heads,
-                                  head_size,
-                                  dtype=dtype)
+    key_value_cache = torch.randn(
+        NUM_BLOCKS_FP8, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
     key_cache /= head_size**0.5
     value_cache /= head_size**0.5
@@ -429,14 +425,13 @@ def test_flashinfer_decode_with_paged_fp8_kv(
 
     key_cache_fp8 = (key_cache / k_scale).to(kv_cache_dtype)
     value_cache_fp8 = (value_cache / v_scale).to(kv_cache_dtype)
-    assert (key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1)
+    assert key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1
     kv_cache_fp8 = torch.cat([key_cache_fp8, value_cache_fp8], dim=1)
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 NUM_BLOCKS_FP8,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, NUM_BLOCKS_FP8, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     kv_indptr = [0]
     kv_indices = []
@@ -457,32 +452,38 @@ def test_flashinfer_decode_with_paged_fp8_kv(
     kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
 
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
-    wrapper = flashinfer.\
-        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
-                    use_tensor_cores=use_tensor_cores)
-    wrapper.plan(kv_indptr,
-                 kv_indices,
-                 kv_last_page_lens,
-                 num_query_heads,
-                 num_kv_heads,
-                 head_size,
-                 block_size,
-                 "NONE",
-                 q_data_type=dtype,
-                 kv_data_type=kv_cache_dtype,
-                 logits_soft_cap=soft_cap)
+    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores
+    )
+    wrapper.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+        "NONE",
+        q_data_type=dtype,
+        kv_data_type=kv_cache_dtype,
+        logits_soft_cap=soft_cap,
+    )
     output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale)
     key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
     value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
 
-    ref_output = ref_paged_attn(query=query,
-                                key_cache=key_cache,
-                                value_cache=value_cache,
-                                query_lens=[1] * num_seqs,
-                                kv_lens=kv_lens,
-                                block_tables=block_tables,
-                                scale=scale,
-                                soft_cap=soft_cap)
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=[1] * num_seqs,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+    )
     # Temporary fix: Increasing the tolerance. Seems like a flashinfer issue
-    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    (
+        torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
diff --git a/tests/kernels/attention/test_flashinfer_mla_decode.py b/tests/kernels/attention/test_flashinfer_mla_decode.py
index 02225432f77fc63aea37a5bc1abfe62e06b89017..0350136677c6b159e2f7a52985c4886552b74427 100644
--- a/tests/kernels/attention/test_flashinfer_mla_decode.py
+++ b/tests/kernels/attention/test_flashinfer_mla_decode.py
@@ -13,34 +13,29 @@ FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
 if not current_platform.has_device_capability(100):
     pytest.skip(
         reason="FlashInfer MLA Requires compute capability of 10 or above.",
-        allow_module_level=True)
+        allow_module_level=True,
+    )
 
 
 def ref_mla(
-        out: Tensor,  # (bs, num_heads, v_head_dim)
-        query: Tensor,  # (bs, num_heads, head_dim)
-        kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
-        scale: float,
-        block_tables: Tensor,  # (bs, max_num_blocks)
-        seq_lens: Tensor,  # (bs,)
+    out: Tensor,  # (bs, num_heads, v_head_dim)
+    query: Tensor,  # (bs, num_heads, head_dim)
+    kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+    scale: float,
+    block_tables: Tensor,  # (bs, max_num_blocks)
+    seq_lens: Tensor,  # (bs,)
 ):
     bs, num_heads, v_head_dim = out.shape
     head_dim = query.shape[2]
 
     for i in range(bs):
         # gather and flatten KV-cache
-        kv = kv_cache[
-            block_tables[i]]  # (max_num_blocks, block_size, head_dim)
-        kv = kv.view(1, -1,
-                     head_dim)[:, :seq_lens[i]]  # (1, seq_len, head_dim)
+        kv = kv_cache[block_tables[i]]  # (max_num_blocks, block_size, head_dim)
+        kv = kv.view(1, -1, head_dim)[:, : seq_lens[i]]  # (1, seq_len, head_dim)
         v = kv[:, :, :v_head_dim]
 
         q = query[i].view(num_heads, 1, head_dim)
-        o = F.scaled_dot_product_attention(q,
-                                           kv,
-                                           v,
-                                           scale=scale,
-                                           enable_gqa=True)
+        o = F.scaled_dot_product_attention(q, kv, v, scale=scale, enable_gqa=True)
         out[i] = o.view(num_heads, v_head_dim)
 
     return out
@@ -50,7 +45,7 @@ def ref_mla(
 @pytest.mark.parametrize("bs", [1, 2, 4, 16])
 @pytest.mark.parametrize("block_size", [32, 64])
 def test_flashinfer_mla_decode(dtype: torch.dtype, bs: int, block_size: int):
-    torch.set_default_device('cuda')
+    torch.set_default_device("cuda")
     torch.manual_seed(42)
 
     # Deepseek R1 config
@@ -59,11 +54,11 @@ def test_flashinfer_mla_decode(dtype: torch.dtype, bs: int, block_size: int):
     qk_nope_head_dim = 128
     qk_rope_head_dim = 64
     qk_head_dim = kv_lora_rank + qk_rope_head_dim
-    scale = (qk_nope_head_dim + qk_rope_head_dim)**-0.5
+    scale = (qk_nope_head_dim + qk_rope_head_dim) ** -0.5
 
     MAX_SEQ_LEN = 1024
 
-    seq_lens = [torch.randint(2, MAX_SEQ_LEN, (1, )).item() for _ in range(bs)]
+    seq_lens = [torch.randint(2, MAX_SEQ_LEN, (1,)).item() for _ in range(bs)]
     seq_lens[-1] = MAX_SEQ_LEN
     max_seq_len = max(seq_lens)
     seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)
@@ -86,12 +81,12 @@ def test_flashinfer_mla_decode(dtype: torch.dtype, bs: int, block_size: int):
     block_id = 0
     for i in range(bs):
         num_blocks_needed = blocks_per_seq[i]
-        block_tables[i, :num_blocks_needed] = all_block_ids[block_id:block_id +
-                                                            num_blocks_needed]
+        block_tables[i, :num_blocks_needed] = all_block_ids[
+            block_id : block_id + num_blocks_needed
+        ]
         block_id += num_blocks_needed
 
-    kv_cache = torch.randn(block_tables.numel(), block_size,
-                           qk_head_dim).to(dtype)
+    kv_cache = torch.randn(block_tables.numel(), block_size, qk_head_dim).to(dtype)
     q = torch.randn(bs, num_heads, qk_head_dim).to(dtype)
 
     out_ref = q.new_zeros(bs, num_heads, kv_lora_rank)
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index bd3ba554b32e2cbfd75dbc5f489875f5e8ce487c..693b849ebc5d7aae122937a0dfe48ff191765e86 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -1,20 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import flashinfer
 import pytest
 import torch
 
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX,
-                                                    dequantize_nvfp4_to_dtype)
+from tests.kernels.quantization.nvfp4_utils import (
+    dequantize_nvfp4_to_dtype,
+    get_nvfp4_global_scale,
+)
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 if not current_platform.is_device_capability(100):
-    pytest.skip("This TRTLLM kernel requires NVIDIA Blackwell.",
-                allow_module_level=True)
+    pytest.skip(
+        "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
+    )
 
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -47,6 +48,7 @@ KV_LAYOUT = ["HND"]  # currently only HND is supported
 BLOCK_SIZE = [16]
 WINDOW_LEFT = [-1, 127]
 SOFT_CAP = [None, 50.0]
+HAS_SINKS = [True, False]
 
 NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
 
@@ -61,11 +63,11 @@ NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
 @pytest.mark.parametrize("block_size", BLOCK_SIZE)
 @pytest.mark.parametrize("window_left", WINDOW_LEFT)
 @pytest.mark.parametrize("soft_cap", SOFT_CAP)
+@pytest.mark.parametrize("has_sinks", HAS_SINKS)
 @torch.inference_mode
 def test_flashinfer_trtllm_decode_with_baseline(
     dtype: torch.dtype,
-    quant_dtypes: tuple[Optional[torch.dtype], Optional[torch.dtype],
-                        Optional[torch.dtype]],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_lens: tuple[int, int],
     num_heads: tuple[int, int],
@@ -73,10 +75,11 @@ def test_flashinfer_trtllm_decode_with_baseline(
     kv_layout: str,
     block_size: int,
     window_left: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
+    has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    current_platform.seed_everything(42)
 
     q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
     q_quant_dtype = q_quant_dtype or dtype
@@ -98,7 +101,16 @@ def test_flashinfer_trtllm_decode_with_baseline(
     else:
         raise ValueError(f"Invalid kv_layout: {kv_layout}")
 
-    query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
+    # max_q_len = 1
+    q_lens = torch.ones((batch_size,), dtype=torch.int32)
+    q_indptr = torch.cat(
+        [
+            torch.tensor([0], dtype=torch.int32),
+            torch.cumsum(q_lens, dim=0, dtype=torch.int32),
+        ]
+    )
+
+    query = torch.randn(torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype)
     if q_quant_dtype == FP8_DTYPE:
         query, q_scale = to_float8(query)
         ref_query = query.to(dtype) * q_scale
@@ -106,10 +118,10 @@ def test_flashinfer_trtllm_decode_with_baseline(
         q_scale = 1.0
         ref_query = query
 
-    kv_lens = torch.randint(1, max_kv_len, (batch_size, ), dtype=torch.int32)
+    kv_lens = torch.randint(1, max_kv_len, (batch_size,), dtype=torch.int32)
     kv_lens[-1] = max_kv_len
 
-    seq_lens = kv_lens
+    seq_lens = kv_lens + q_lens
     max_seq_len = torch.max(seq_lens).item()
 
     kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
@@ -122,10 +134,9 @@ def test_flashinfer_trtllm_decode_with_baseline(
     k_scale = v_scale = kv_scale
 
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 NUM_BLOCKS,
-                                 (batch_size, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+    )
     kv_indptr = [0]
     kv_indices = []
     kv_last_page_lens = []
@@ -146,40 +157,55 @@ def test_flashinfer_trtllm_decode_with_baseline(
     workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
 
     # Baseline Decode
-    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
-        workspace_buffer, kv_layout, use_tensor_cores=True)
-    wrapper.plan(kv_indptr,
-                 kv_indices,
-                 kv_last_page_lens,
-                 num_qo_heads,
-                 num_kv_heads,
-                 head_size,
-                 block_size,
-                 "NONE",
-                 sm_scale=sm_scale,
-                 q_data_type=dtype,
-                 kv_data_type=dtype,
-                 window_left=window_left,
-                 logits_soft_cap=soft_cap)
+    if has_sinks:
+        sinks = torch.rand(num_qo_heads, dtype=torch.float32) * 5
+        wrapper = flashinfer.BatchAttentionWithAttentionSinkWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
+    else:
+        sinks = None
+        wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
 
+    wrapper.plan(
+        qo_indptr=q_indptr,
+        paged_kv_indptr=kv_indptr,
+        paged_kv_indices=kv_indices,
+        paged_kv_last_page_len=kv_last_page_lens,
+        num_qo_heads=num_qo_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim_qk=head_size,
+        page_size=block_size,
+        causal=True,
+        sm_scale=sm_scale,
+        window_left=window_left,
+        logits_soft_cap=soft_cap,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
     output = torch.empty(ref_query.shape, dtype=dtype)
-    wrapper.run(ref_query, ref_kv_cache, out=output)
+    wrapper.run(ref_query, ref_kv_cache, sinks, sm_scale, out=output)
+
     o_scale = 1.0
-    o_sf_scale = None
+    o_sf_scale_float = None
     if o_quant_dtype == FP8_DTYPE:
         _, o_scale = to_float8(output)
     elif o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(output.flatten(), dim=-1)).to(torch.float32)
+        o_sf_scale = get_nvfp4_global_scale(output)
+        o_sf_scale_float = o_sf_scale.item()
 
     # TRTLLM Decode
     if o_quant_dtype == FP4_DTYPE:
         output_trtllm = flashinfer.utils.FP4Tensor(
-            torch.empty(query.shape[:-1] + (query.shape[-1] // 2, ),
-                        dtype=torch.uint8),
-            torch.empty((round_up(query.shape[0], 128),
-                         round_up(query.shape[1] * query.shape[2] // 16, 4)),
-                        dtype=torch.float8_e4m3fn),
+            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+            torch.empty(
+                (
+                    round_up(query.shape[0], 128),
+                    round_up(query.shape[1] * query.shape[2] // 16, 4),
+                ),
+                dtype=torch.float8_e4m3fn,
+            ),
         )
     else:
         output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
@@ -194,30 +220,36 @@ def test_flashinfer_trtllm_decode_with_baseline(
         bmm1_scale=q_scale * k_scale * sm_scale,
         bmm2_scale=v_scale / o_scale,
         window_left=window_left,
-        o_sf_scale=o_sf_scale,
+        sinks=sinks,
+        o_sf_scale=o_sf_scale_float,
         out=output_trtllm,
     )
     if o_quant_dtype == FP8_DTYPE:
         output_trtllm = output_trtllm.to(dtype) * o_scale
     elif o_quant_dtype == FP4_DTYPE:
         output_trtllm.data = output_trtllm.data.reshape(
-            -1, query.shape[1] * query.shape[2] // 2)
-        output_trtllm = dequantize_nvfp4_to_dtype(output_trtllm.data,
-                                                  output_trtllm.scale,
-                                                  o_sf_scale, dtype,
-                                                  query.device)
-        output_trtllm = output_trtllm.reshape(-1, query.shape[1],
-                                              query.shape[2])
+            -1, query.shape[1] * query.shape[2] // 2
+        )
+        output_trtllm = dequantize_nvfp4_to_dtype(
+            output_trtllm.data, output_trtllm.scale, o_sf_scale, dtype, query.device
+        )
+        output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
 
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
-        rtol, atol = 3e-1, 1e0
+        rtol, atol = 7e-2, 9e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 5e-2, 7e-2
+        rtol, atol = 3e-2, 4e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
+        rtol, atol = 2e-2, 2e-2
+    elif kv_quant_dtype == FP8_DTYPE:
+        rtol, atol = 4e-2, 6e-2
     else:
-        rtol, atol = 1e-2, 2e-2
+        rtol, atol = 1e-2, 1e-2
 
-    torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol), \
-        f"{torch.max(torch.abs(output - output_trtllm))}"
+    (
+        torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(output - output_trtllm))}",
+    )
 
 
 @pytest.mark.parametrize("dtype", DTYPE)
@@ -230,11 +262,11 @@ def test_flashinfer_trtllm_decode_with_baseline(
 @pytest.mark.parametrize("block_size", BLOCK_SIZE)
 @pytest.mark.parametrize("window_left", WINDOW_LEFT)
 @pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("has_sinks", HAS_SINKS)
 @torch.inference_mode
 def test_flashinfer_trtllm_prefill_with_baseline(
     dtype: torch.dtype,
-    quant_dtypes: tuple[Optional[torch.dtype], Optional[torch.dtype],
-                        Optional[torch.dtype]],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_lens: tuple[int, int],
     num_heads: tuple[int, int],
@@ -242,10 +274,11 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     kv_layout: str,
     block_size: int,
     window_left: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
+    has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    current_platform.seed_everything(42)
 
     q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
     q_quant_dtype = q_quant_dtype or dtype
@@ -270,17 +303,16 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     else:
         raise ValueError(f"Invalid kv_layout: {kv_layout}")
 
-    q_lens = torch.randint(1, max_q_len, (batch_size, ), dtype=torch.int32)
+    q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32)
     q_lens[-1] = max_q_len
-    q_indptr = torch.cat([
-        torch.tensor([0], dtype=torch.int32),
-        torch.cumsum(q_lens, dim=0, dtype=torch.int32),
-    ])
-
-    query = torch.randn(torch.sum(q_lens).item(),
-                        num_qo_heads,
-                        head_size,
-                        dtype=dtype)
+    q_indptr = torch.cat(
+        [
+            torch.tensor([0], dtype=torch.int32),
+            torch.cumsum(q_lens, dim=0, dtype=torch.int32),
+        ]
+    )
+
+    query = torch.randn(torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype)
     if q_quant_dtype == FP8_DTYPE:
         query, q_scale = to_float8(query)
         ref_query = query.to(dtype) * q_scale
@@ -288,7 +320,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         q_scale = 1.0
         ref_query = query
 
-    kv_lens = torch.randint(0, max_kv_len, (batch_size, ), dtype=torch.int32)
+    kv_lens = torch.randint(1, max_kv_len, (batch_size,), dtype=torch.int32)
     kv_lens[-1] = max_kv_len
 
     seq_lens = kv_lens + q_lens
@@ -304,10 +336,9 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     k_scale = v_scale = kv_scale
 
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 NUM_BLOCKS,
-                                 (batch_size, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+    )
     kv_indptr = [0]
     kv_indices = []
     kv_last_page_lens = []
@@ -328,41 +359,55 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
 
     # Baseline Prefill
-    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
-        workspace_buffer, kv_layout)
-    wrapper.plan(q_indptr,
-                 kv_indptr,
-                 kv_indices,
-                 kv_last_page_lens,
-                 num_qo_heads,
-                 num_kv_heads,
-                 head_size,
-                 block_size,
-                 causal=True,
-                 sm_scale=sm_scale,
-                 q_data_type=dtype,
-                 kv_data_type=dtype,
-                 window_left=window_left,
-                 logits_soft_cap=soft_cap)
+    if has_sinks:
+        sinks = torch.rand(num_qo_heads, dtype=torch.float32) * 5
+        wrapper = flashinfer.BatchAttentionWithAttentionSinkWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
+    else:
+        sinks = None
+        wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
 
+    wrapper.plan(
+        qo_indptr=q_indptr,
+        paged_kv_indptr=kv_indptr,
+        paged_kv_indices=kv_indices,
+        paged_kv_last_page_len=kv_last_page_lens,
+        num_qo_heads=num_qo_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim_qk=head_size,
+        page_size=block_size,
+        causal=True,
+        sm_scale=sm_scale,
+        window_left=window_left,
+        logits_soft_cap=soft_cap,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
     output = torch.empty(ref_query.shape, dtype=dtype)
-    wrapper.run(ref_query, ref_kv_cache, out=output)
+    wrapper.run(ref_query, ref_kv_cache, sinks, sm_scale, out=output)
+
     o_scale = 1.0
-    o_sf_scale = None
+    o_sf_scale_float = None
     if o_quant_dtype == FP8_DTYPE:
         _, o_scale = to_float8(output)
     elif o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(output.flatten(), dim=-1)).to(torch.float32)
+        o_sf_scale = get_nvfp4_global_scale(output)
+        o_sf_scale_float = o_sf_scale.item()
 
     # TRTLLM Prefill
     if o_quant_dtype == FP4_DTYPE:
         output_trtllm = flashinfer.utils.FP4Tensor(
-            torch.empty(query.shape[:-1] + (query.shape[-1] // 2, ),
-                        dtype=torch.uint8),
-            torch.empty((round_up(query.shape[0], 128),
-                         round_up(query.shape[1] * query.shape[2] // 16, 4)),
-                        dtype=torch.float8_e4m3fn),
+            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+            torch.empty(
+                (
+                    round_up(query.shape[0], 128),
+                    round_up(query.shape[1] * query.shape[2] // 16, 4),
+                ),
+                dtype=torch.float8_e4m3fn,
+            ),
         )
     else:
         output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
@@ -381,29 +426,31 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         cum_seq_lens_q=q_indptr,
         cum_seq_lens_kv=kv_indptr,
         window_left=window_left,
-        o_sf_scale=o_sf_scale,
+        sinks=sinks,
+        o_sf_scale=o_sf_scale_float,
         out=output_trtllm,
     )
     if o_quant_dtype == FP8_DTYPE:
         output_trtllm = output_trtllm.to(dtype) * o_scale
     elif o_quant_dtype == FP4_DTYPE:
         output_trtllm.data = output_trtllm.data.reshape(
-            -1, query.shape[1] * query.shape[2] // 2)
-        output_trtllm = dequantize_nvfp4_to_dtype(output_trtllm.data,
-                                                  output_trtllm.scale,
-                                                  o_sf_scale, dtype,
-                                                  query.device)
-        output_trtllm = output_trtllm.reshape(-1, query.shape[1],
-                                              query.shape[2])
+            -1, query.shape[1] * query.shape[2] // 2
+        )
+        output_trtllm = dequantize_nvfp4_to_dtype(
+            output_trtllm.data, output_trtllm.scale, o_sf_scale, dtype, query.device
+        )
+        output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
 
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
-        rtol, atol = 4e-1, 1e0
+        rtol, atol = 1e-1, 2e-1
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 5e-2, 7e-2
-    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
         rtol, atol = 4e-2, 6e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
+        rtol, atol = 2e-2, 3e-2
     else:
         rtol, atol = 1e-2, 1e-2
 
-    torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol), \
-        f"{torch.max(torch.abs(output - output_trtllm))}"
+    (
+        torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(output - output_trtllm))}",
+    )
diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py
index bddd7e5c50ed0d20c0c5791fc059f700bda0611b..8379129b401168dd6d10961c4385fa59a2de4f51 100644
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -7,30 +7,35 @@ import random
 import pytest
 import torch
 
-from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
-                                         get_mla_metadata,
-                                         is_flashmla_supported)
+from vllm.attention.ops.flashmla import (
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+    is_flashmla_dense_supported,
+)
 from vllm.triton_utils import triton
 
 
-def cal_diff(x: torch.Tensor,
-             y: torch.Tensor,
-             name: str,
-             use_fp8: bool = False) -> None:
+def cal_diff(
+    x: torch.Tensor, y: torch.Tensor, name: str, use_fp8: bool = False
+) -> None:
     x, y = x.double(), y.double()
-    cos_diff = 1 - 2 * (x * y).sum().item() / max(
-        (x * x + y * y).sum().item(), 1e-12)
-    if (use_fp8):
+    cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
+    if use_fp8:
         assert cos_diff < 1e-4
     else:
         assert cos_diff < 1e-5
 
-FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \
-    if not is_flashmla_supported()[0] else "FlashMLA is supported"
+
+FLASH_MLA_UNSUPPORTED_REASON = (
+    is_flashmla_dense_supported()[1]
+    if not is_flashmla_dense_supported()[0]
+    else "FlashMLA is supported"
+)
 
 
-@pytest.mark.skipif(not is_flashmla_supported()[0],
-                    reason=FLASH_MLA_UNSUPPORTED_REASON)
+@pytest.mark.skipif(
+    not is_flashmla_dense_supported()[0], reason=FLASH_MLA_UNSUPPORTED_REASON
+)
 @pytest.mark.parametrize("b", [128])
 @pytest.mark.parametrize("s_q", [1, 2])
 @pytest.mark.parametrize("mean_sk", [4096, 8192, 16384])
@@ -41,47 +46,49 @@ FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \
 @pytest.mark.parametrize("block_size", [64])
 @pytest.mark.parametrize("causal", [True])
 @pytest.mark.parametrize("varlen", [False, True])
-@pytest.mark.parametrize("torch_dtype",
-                         [torch.bfloat16, torch.float16, torch.float8_e4m3fn])
+@pytest.mark.parametrize(
+    "torch_dtype", [torch.bfloat16, torch.float16, torch.float8_e4m3fn]
+)
 @torch.inference_mode()
-def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
-                   varlen, torch_dtype):
+def test_flash_mla(
+    b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, varlen, torch_dtype
+):
     device = torch.device("cuda:0")
-    if torch_dtype == torch.float8_e4m3fn:
-        init_dtype = torch.bfloat16
-    else:
-        init_dtype = torch_dtype
+    init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype
     torch.set_default_dtype(init_dtype)
     torch.set_default_device(device)
     torch.cuda.set_device(device)
     torch.manual_seed(0)
     random.seed(0)
 
-    print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
-          f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}")
+    print(
+        f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
+        f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}"
+    )
 
     use_fp8 = torch_dtype == torch.float8_e4m3fn
-    cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32)
+    cache_seqlens = torch.full((b,), mean_sk, dtype=torch.int32)
     if varlen:
         for i in range(b):
-            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2),
-                                   s_q)
+            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), s_q)
     total_seqlens = cache_seqlens.sum().item()
     max_seqlen = cache_seqlens.max().item()
     max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
 
     q = torch.randn(b, s_q, h_q, d)
-    block_table = torch.arange(b * max_seqlen_pad // block_size,
-                               dtype=torch.int32).view(
-                                   b, max_seqlen_pad // block_size)
+    block_table = torch.arange(
+        b * max_seqlen_pad // block_size, dtype=torch.int32
+    ).view(b, max_seqlen_pad // block_size)
     blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
     for i in range(b):
-        blocked_k.view(b, max_seqlen_pad, h_kv,
-                       d)[i, cache_seqlens[i].item():] = float("nan")
+        blocked_k.view(b, max_seqlen_pad, h_kv, d)[i, cache_seqlens[i].item() :] = (
+            float("nan")
+        )
     blocked_v = blocked_k[..., :dv]
 
     tile_scheduler_metadata, num_splits = get_mla_metadata(
-        cache_seqlens, s_q * h_q // h_kv, h_kv)
+        cache_seqlens, s_q * h_q // h_kv, h_kv
+    )
 
     init_dtype = q.dtype
     if use_fp8:
@@ -119,8 +126,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
             s_q = query.shape[-2]
             s_k = key.shape[-2]
             attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype)
-            temp_mask = torch.ones(s_q, s_k,
-                                   dtype=torch.bool).tril(diagonal=s_k - s_q)
+            temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q)
             attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
             attn_bias.to(query.dtype)
             attn_weight += attn_bias
@@ -130,10 +136,16 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
 
     def ref_mla():
         q_ = (q.to(torch.float) * descale_q).to(init_dtype) if use_fp8 else q
-        blocked_k_ = (blocked_k.to(torch.float) *
-                      descale_k).to(init_dtype) if use_fp8 else blocked_k
-        blocked_v_ = (blocked_v.to(torch.float) *
-                      descale_k).to(init_dtype) if use_fp8 else blocked_v
+        blocked_k_ = (
+            (blocked_k.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_k
+        )
+        blocked_v_ = (
+            (blocked_v.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_v
+        )
         out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32)
         lse = torch.empty(b, h_q, s_q, dtype=torch.float32)
         for i in range(b):
@@ -156,8 +168,9 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
 
     t = triton.testing.do_bench(flash_mla)
     FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
-    bytes = (total_seqlens * h_kv * d +
-             b * s_q * h_q * d) * (torch.finfo(torch_dtype).bits // 8) + (
-                 b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
-    print(f"{t:.3f} ms, {FLOPS / 10 ** 9 / t:.0f} TFLOPS,",
-          f"{bytes / 10 ** 6 / t:.0f} GB/s")
+    bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d) * (
+        torch.finfo(torch_dtype).bits // 8
+    ) + (b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
+    print(
+        f"{t:.3f} ms, {FLOPS / 10**9 / t:.0f} TFLOPS,", f"{bytes / 10**6 / t:.0f} GB/s"
+    )
diff --git a/tests/kernels/attention/test_flashmla_sparse.py b/tests/kernels/attention/test_flashmla_sparse.py
index 9036e4e7800bf985b3391ab2bb3636b69bdd2ab0..7ee6f4b07b4a9b474d134cf5fa9294c353065ca9 100644
--- a/tests/kernels/attention/test_flashmla_sparse.py
+++ b/tests/kernels/attention/test_flashmla_sparse.py
@@ -4,18 +4,12 @@ import pytest
 import torch
 
 
-def _cuda_sm90_available() -> bool:
-    if not torch.cuda.is_available():
-        return False
-    major, _ = torch.cuda.get_device_capability()
-    return major == 9
-
-
 def test_sparse_flashmla_metadata_smoke():
     import vllm.attention.ops.flashmla as fm
-    ok, reason = fm.is_flashmla_supported()
-    if not ok or not _cuda_sm90_available():
-        pytest.skip(reason or "SM90 not available")
+
+    ok, reason = fm.is_flashmla_sparse_supported()
+    if not ok:
+        pytest.skip(reason)
 
     device = torch.device("cuda")
     batch_size = 1
@@ -27,21 +21,24 @@ def test_sparse_flashmla_metadata_smoke():
 
     cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
 
-    tile_md, num_splits = fm.get_mla_metadata(cache_seqlens,
-                                              q_seq_per_hk,
-                                              num_heads_k,
-                                              num_heads_q=num_heads_q,
-                                              topk=topk,
-                                              is_fp8_kvcache=True)
+    tile_md, num_splits = fm.get_mla_metadata(
+        cache_seqlens,
+        q_seq_per_hk,
+        num_heads_k,
+        num_heads_q=num_heads_q,
+        topk=topk,
+        is_fp8_kvcache=True,
+    )
     assert tile_md.dtype == torch.int32
     assert num_splits.dtype == torch.int32
 
 
 def test_sparse_flashmla_decode_smoke():
     import vllm.attention.ops.flashmla as fm
-    ok, reason = fm.is_flashmla_supported()
-    if not ok or not _cuda_sm90_available():
-        pytest.skip(reason or "SM90 not available")
+
+    ok, reason = fm.is_flashmla_sparse_supported()
+    if not ok:
+        pytest.skip(reason)
 
     device = torch.device("cuda")
     batch_size = 1
@@ -58,36 +55,42 @@ def test_sparse_flashmla_decode_smoke():
     q_seq_per_hk = seqlen_q * num_heads_q // num_heads_k
     # q_heads_per_hk = num_heads_q // num_heads_k
     cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
-    tile_md, num_splits = fm.get_mla_metadata(cache_seqlens,
-                                              q_seq_per_hk,
-                                              num_heads_k,
-                                              num_heads_q=num_heads_q,
-                                              topk=topk,
-                                              is_fp8_kvcache=True)
+    tile_md, num_splits = fm.get_mla_metadata(
+        cache_seqlens,
+        q_seq_per_hk,
+        num_heads_k,
+        num_heads_q=num_heads_q,
+        topk=topk,
+        is_fp8_kvcache=True,
+    )
 
     # Inputs
-    q = torch.zeros((batch_size, seqlen_q, num_heads_q, head_dim_k),
-                    dtype=torch.bfloat16,
-                    device=device)
-    k_cache = torch.zeros((1, page_block_size, num_heads_k, bytes_per_token),
-                          dtype=torch.uint8,
-                          device=device)
-    indices = torch.zeros((batch_size, seqlen_q, topk),
-                          dtype=torch.int32,
-                          device=device)
-
-    block_table = torch.zeros((batch_size, 128),
-                              dtype=torch.int32,
-                              device=device)
-    out, lse = fm.flash_mla_with_kvcache(q,
-                                         k_cache,
-                                         block_table,
-                                         cache_seqlens,
-                                         head_dim_v,
-                                         tile_md,
-                                         num_splits,
-                                         indices=indices,
-                                         is_fp8_kvcache=True)
+    q = torch.zeros(
+        (batch_size, seqlen_q, num_heads_q, head_dim_k),
+        dtype=torch.bfloat16,
+        device=device,
+    )
+    k_cache = torch.zeros(
+        (1, page_block_size, num_heads_k, bytes_per_token),
+        dtype=torch.uint8,
+        device=device,
+    )
+    indices = torch.zeros(
+        (batch_size, seqlen_q, topk), dtype=torch.int32, device=device
+    )
+
+    block_table = torch.zeros((batch_size, 128), dtype=torch.int32, device=device)
+    out, lse = fm.flash_mla_with_kvcache(
+        q,
+        k_cache,
+        block_table,
+        cache_seqlens,
+        head_dim_v,
+        tile_md,
+        num_splits,
+        indices=indices,
+        is_fp8_kvcache=True,
+    )
     assert out.shape[0] == batch_size
     assert out.shape[-1] == head_dim_v
     assert lse.shape[0] == batch_size
@@ -95,9 +98,10 @@ def test_sparse_flashmla_decode_smoke():
 
 def test_sparse_flashmla_prefill_smoke():
     import vllm.attention.ops.flashmla as fm
-    ok, reason = fm.is_flashmla_supported()
-    if not ok or not _cuda_sm90_available():
-        pytest.skip(reason or "SM90 not available")
+
+    ok, reason = fm.is_flashmla_sparse_supported()
+    if not ok:
+        pytest.skip(reason)
 
     device = torch.device("cuda")
     s_q = 1
@@ -112,8 +116,7 @@ def test_sparse_flashmla_prefill_smoke():
     kv = torch.zeros((s_kv, h_kv, d_qk), dtype=torch.bfloat16, device=device)
     indices = torch.zeros((s_q, h_kv, topk), dtype=torch.int32, device=device)
 
-    out, max_logits, lse = fm.flash_mla_sparse_prefill(q, kv, indices, 1.0,
-                                                       d_v)
+    out, max_logits, lse = fm.flash_mla_sparse_prefill(q, kv, indices, 1.0, d_v)
     assert out.shape == (s_q, h_q, d_v)
     assert max_logits.shape == (s_q, h_q)
     assert lse.shape == (s_q, h_q)
diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py
index de45ee1ed5ccac3fd084f698d111bb2e549fc841..ec938caff2c6dfcb9ac7645779d2ceb242fc13c0 100644
--- a/tests/kernels/attention/test_lightning_attn.py
+++ b/tests/kernels/attention/test_lightning_attn.py
@@ -4,8 +4,7 @@
 import pytest
 import torch
 
-from vllm.model_executor.layers.lightning_attn import (
-    linear_decode_forward_triton)
+from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton
 from vllm.platforms import current_platform
 
 NUM_HEADS = [4, 8]
@@ -17,8 +16,8 @@ DTYPES = [torch.float32]
 
 def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
     """Reference implementation of lightning attention core algorithm
-    
-    The difference from the main implementation is that this processes 
+
+    The difference from the main implementation is that this processes
     each step sequentially, instead of using parallelized triton kernels
     """
     B, H, S, D = q.shape
@@ -34,10 +33,7 @@ def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
 
     # More efficient implementation
     # Convert decay factors to matrix form
-    if ed.dim() == 1:
-        decay = torch.exp(-ed).view(1, -1, 1, 1)
-    else:
-        decay = torch.exp(-ed)
+    decay = torch.exp(-ed).view(1, -1, 1, 1) if ed.dim() == 1 else torch.exp(-ed)
 
     for b in range(B):
         for step in range(S):
@@ -62,8 +58,7 @@ def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
     # The actual implementation returns a tensor of shape [B, H, 2, D, E]
     # where dimension 2 contains both KV and KV history
     kv_reshaped = kv_cache.unsqueeze(2)  # [B, H, 1, D, E]
-    final_kv_cache = torch.cat([kv_reshaped, kv_reshaped],
-                               dim=2)  # [B, H, 2, D, E]
+    final_kv_cache = torch.cat([kv_reshaped, kv_reshaped], dim=2)  # [B, H, 2, D, E]
 
     return output, final_kv_cache
 
@@ -109,7 +104,7 @@ def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
             out_h = torch.matmul(q_bh, kv_new)
 
             # Update output and cache
-            output[b, h * D:(h + 1) * D] = out_h
+            output[b, h * D : (h + 1) * D] = out_h
             kv_caches[b, h] = kv_new
 
     return output
@@ -135,12 +130,9 @@ def test_linear_decode_forward_triton(
     k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
     v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
 
-    kv_caches = base * torch.randn(batch_size,
-                                   num_heads,
-                                   head_size,
-                                   head_size,
-                                   dtype=dtype,
-                                   device="cuda")
+    kv_caches = base * torch.randn(
+        batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda"
+    )
 
     kv_caches_copy = kv_caches.clone()
 
@@ -150,15 +142,14 @@ def test_linear_decode_forward_triton(
 
     slot_idx = torch.arange(batch_size, device="cuda")
 
-    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
-                                                 slope_rate, slot_idx)
+    triton_output = linear_decode_forward_triton(
+        q, k, v, kv_caches, slope_rate, slot_idx
+    )
 
-    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
-                                               slope_rate, slot_idx)
-    torch.testing.assert_close(triton_output,
-                               reference_output,
-                               rtol=1e-1,
-                               atol=1e-1)
+    reference_output = reference_linear_decode(
+        q, k, v, kv_caches_copy, slope_rate, slot_idx
+    )
+    torch.testing.assert_close(triton_output, reference_output, rtol=1e-1, atol=1e-1)
     torch.testing.assert_close(kv_caches, kv_caches_copy, rtol=1e-1, atol=1e-1)
 
     assert triton_output.shape == (batch_size, num_heads * head_size)
@@ -184,12 +175,9 @@ def test_linear_decode_forward_triton_with_padding(
     k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
     v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
 
-    kv_caches = base * torch.randn(batch_size,
-                                   num_heads,
-                                   head_size,
-                                   head_size,
-                                   dtype=dtype,
-                                   device="cuda")
+    kv_caches = base * torch.randn(
+        batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda"
+    )
 
     kv_caches_copy = kv_caches.clone()
 
@@ -199,14 +187,15 @@ def test_linear_decode_forward_triton_with_padding(
 
     slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")
 
-    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
-                                                 slope_rate, slot_idx)
+    triton_output = linear_decode_forward_triton(
+        q, k, v, kv_caches, slope_rate, slot_idx
+    )
 
-    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
-                                               slope_rate, slot_idx)
+    reference_output = reference_linear_decode(
+        q, k, v, kv_caches_copy, slope_rate, slot_idx
+    )
 
-    padding_mask = (slot_idx
-                    != -1).unsqueeze(1).expand(-1, num_heads * head_size)
+    padding_mask = (slot_idx != -1).unsqueeze(1).expand(-1, num_heads * head_size)
 
     triton_masked = triton_output[padding_mask]
     reference_masked = reference_output[padding_mask]
@@ -217,15 +206,11 @@ def test_linear_decode_forward_triton_with_padding(
 
     for i in range(batch_size):
         if valid_indices[i] > 0:
-            torch.testing.assert_close(kv_caches[i],
-                                       kv_caches_copy[i],
-                                       rtol=rtol,
-                                       atol=atol)
+            torch.testing.assert_close(
+                kv_caches[i], kv_caches_copy[i], rtol=rtol, atol=atol
+            )
 
-    torch.testing.assert_close(triton_masked,
-                               reference_masked,
-                               rtol=rtol,
-                               atol=atol)
+    torch.testing.assert_close(triton_masked, reference_masked, rtol=rtol, atol=atol)
 
     assert triton_output.shape == (batch_size, num_heads * head_size)
 
@@ -249,39 +234,33 @@ def test_lightning_attention_reference(
     current_platform.seed_everything(42)
 
     base = 0.01
-    q = base * torch.randn(
-        batch_size, num_heads, seq_len, head_size, dtype=dtype)
-    k = base * torch.randn(
-        batch_size, num_heads, seq_len, head_size, dtype=dtype)
-    v = base * torch.randn(
-        batch_size, num_heads, seq_len, head_size, dtype=dtype)
+    q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
+    k = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
+    v = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
 
     ed = torch.zeros(num_heads, device="cuda")
     for h in range(num_heads):
         ed[h] = 0.1 * (h + 1)
 
-    kv_history = base * torch.randn(batch_size,
-                                    num_heads,
-                                    head_size,
-                                    head_size,
-                                    dtype=dtype,
-                                    device="cuda")
+    kv_history = base * torch.randn(
+        batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda"
+    )
 
     kv_history_clone = kv_history.clone()
 
     ref_output, ref_kv_cache = reference_lightning_attention(
-        q, k, v, ed, 256, kv_history)
+        q, k, v, ed, 256, kv_history
+    )
 
     from vllm.model_executor.layers.lightning_attn import lightning_attention
+
     actual_output, actual_kv_cache = lightning_attention(
-        q, k, v, ed, 256, kv_history_clone)
+        q, k, v, ed, 256, kv_history_clone
+    )
 
     atol, rtol = 1.5e-1, 1.5e-1
     torch.testing.assert_close(ref_output, actual_output, rtol=rtol, atol=atol)
-    torch.testing.assert_close(ref_kv_cache,
-                               actual_kv_cache,
-                               rtol=rtol,
-                               atol=atol)
+    torch.testing.assert_close(ref_kv_cache, actual_kv_cache, rtol=rtol, atol=atol)
 
     assert ref_output.shape == (batch_size, num_heads, seq_len, head_size)
     assert ref_kv_cache.shape == actual_kv_cache.shape
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index 9d1a301ebe304f4880129449d57ae98220daf513..9b084f2f660b29b329f4c491133f88e21c274c3f 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -1,25 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 import torch
 
 from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
 from vllm.attention.ops.triton_merge_attn_states import (
-    merge_attn_states as merge_attn_states_triton)
+    merge_attn_states as merge_attn_states_triton,
+)
 from vllm.platforms import current_platform
 
 
 # Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
 # can be used to combine partial attention results (in the split-KV case)
 def merge_attn_states_torch(
-        output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
-        prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
-        prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
-        suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
-        suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
-        output_lse: Optional[torch.Tensor] = None,  # [NUM_HEADS, NUM_TOKENS]
+    output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
+    suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
+    output_lse: torch.Tensor | None = None,  # [NUM_HEADS, NUM_TOKENS]
 ):
     p_lse = prefix_lse
     s_lse = suffix_lse
@@ -32,15 +32,13 @@ def merge_attn_states_torch(
     s_lse = s_lse - max_lse
     p_lse_exp = torch.exp(p_lse)
     s_lse_exp = torch.exp(s_lse)
-    out_se = (p_lse_exp + s_lse_exp)
+    out_se = p_lse_exp + s_lse_exp
     if output_lse is not None:
         output_lse = torch.log(out_se) + max_lse
     p_scale = p_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
     s_scale = s_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
-    p_scale = torch.transpose(p_scale, 0,
-                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
-    s_scale = torch.transpose(s_scale, 0,
-                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
+    p_scale = torch.transpose(p_scale, 0, 1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
+    s_scale = torch.transpose(s_scale, 0, 1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
     output = prefix_output * p_scale + suffix_output * s_scale
     return output, output_lse
 
@@ -55,8 +53,10 @@ all_case_info: list[tuple] = []
 
 def generate_markdown_table():
     global all_case_info
-    table_header = ("| tokens | heads | headsize | dtype "
-                    "| device | torch | triton | cuda | speedup |")
+    table_header = (
+        "| tokens | heads | headsize | dtype "
+        "| device | torch | triton | cuda | speedup |"
+    )
     table_separator = "| --- | --- | --- | --- | --- | --- | --- | --- | --- |"
 
     def shortly_dtype(dtype: torch.dtype) -> str:
@@ -68,16 +68,26 @@ def generate_markdown_table():
     print(table_header)
     print(table_separator)
     for info in all_case_info:
-        (num_tokens, num_heads, head_size, dtype, device,
-         avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel,
-         performance_improved) = info
+        (
+            num_tokens,
+            num_heads,
+            head_size,
+            dtype,
+            device,
+            avg_time_torch_kernel,
+            avg_time_triton_kernel,
+            avg_time_cuda_kernel,
+            performance_improved,
+        ) = info
         dtype = shortly_dtype(dtype)
         device = shortly_device(device)
-        print(f"| {num_tokens} | {num_heads} | {head_size} "
-              f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms "
-              f"| {avg_time_triton_kernel:.5f}ms "
-              f"| {avg_time_cuda_kernel:.5f}ms "
-              f"| {performance_improved:.4f}x |")
+        print(
+            f"| {num_tokens} | {num_heads} | {head_size} "
+            f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms "
+            f"| {avg_time_triton_kernel:.5f}ms "
+            f"| {avg_time_cuda_kernel:.5f}ms "
+            f"| {performance_improved:.4f}x |"
+        )
 
 
 @pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS)
@@ -85,29 +95,28 @@ def generate_markdown_table():
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("output_dtype", DTYPES)
 @torch.inference_mode()
-def test_merge_attn_states(num_tokens: int, num_query_heads: int,
-                           head_size: int, output_dtype: torch.dtype):
+def test_merge_attn_states(
+    num_tokens: int, num_query_heads: int, head_size: int, output_dtype: torch.dtype
+):
     if not current_platform.is_cuda():
-        pytest.skip('Currently only support compare triton merge_attn_states '
-                    'with custom cuda merge_attn_states kernel')
+        pytest.skip(
+            "Currently only support compare triton merge_attn_states "
+            "with custom cuda merge_attn_states kernel"
+        )
 
     NUM_TOKENS = num_tokens
     NUM_HEADS = num_query_heads
     HEAD_SIZE = head_size
 
-    print(f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, "
-          f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, "
-          f"Device: {current_platform.get_device_name()}")
+    print(
+        f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, "
+        f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, "
+        f"Device: {current_platform.get_device_name()}"
+    )
 
     # prefix_lse and suffix_lse contain inf and normal values
-    prefix_lse = torch.randn(NUM_HEADS,
-                             NUM_TOKENS,
-                             dtype=torch.float32,
-                             device="cuda")
-    suffix_lse = torch.randn(NUM_HEADS,
-                             NUM_TOKENS,
-                             dtype=torch.float32,
-                             device="cuda")
+    prefix_lse = torch.randn(NUM_HEADS, NUM_TOKENS, dtype=torch.float32, device="cuda")
+    suffix_lse = torch.randn(NUM_HEADS, NUM_TOKENS, dtype=torch.float32, device="cuda")
 
     # Generate boolean masks
     mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
@@ -117,23 +126,23 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int,
     mask_prefix = torch.logical_and(mask_prefix, ~combined_mask)
     mask_suffix = torch.logical_and(mask_suffix, ~combined_mask)
 
-    prefix_lse[mask_prefix] = float('inf')
-    suffix_lse[mask_suffix] = float('inf')
+    prefix_lse[mask_prefix] = float("inf")
+    suffix_lse[mask_suffix] = float("inf")
 
     # Other input tensors (need to be initialized but
     # no actual calculation needed)
-    output = torch.zeros((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
-                         dtype=output_dtype,
-                         device="cuda")
-    output_lse = torch.zeros((NUM_HEADS, NUM_TOKENS),
-                             dtype=torch.float32,
-                             device="cuda")
-    prefix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
-                                dtype=output_dtype,
-                                device="cuda")
-    suffix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
-                                dtype=output_dtype,
-                                device="cuda")
+    output = torch.zeros(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+    output_lse = torch.zeros(
+        (NUM_HEADS, NUM_TOKENS), dtype=torch.float32, device="cuda"
+    )
+    prefix_output = torch.randn(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+    suffix_output = torch.randn(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
 
     warmup_times = 2
     repeat_times = 20
@@ -149,15 +158,25 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int,
     suffix_lse_torch = suffix_lse.clone()
     for _ in range(warmup_times):
         output_torch, output_lse_torch = merge_attn_states_torch(
-            output_torch, prefix_output, prefix_lse_torch, suffix_output,
-            suffix_lse_torch, output_lse_torch)
+            output_torch,
+            prefix_output,
+            prefix_lse_torch,
+            suffix_output,
+            suffix_lse_torch,
+            output_lse_torch,
+        )
     torch.cuda.synchronize()
 
     for _ in range(repeat_times):
         start.record()
         output_torch, output_lse_torch = merge_attn_states_torch(
-            output_torch, prefix_output, prefix_lse_torch, suffix_output,
-            suffix_lse_torch, output_lse_torch)
+            output_torch,
+            prefix_output,
+            prefix_lse_torch,
+            suffix_output,
+            suffix_lse_torch,
+            output_lse_torch,
+        )
         end.record()
         torch.cuda.synchronize()
         total_time_torch_kernel += start.elapsed_time(end)
@@ -173,16 +192,26 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int,
     end = torch.cuda.Event(enable_timing=True)
 
     for _ in range(warmup_times):
-        merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse,
-                                 suffix_output, suffix_lse,
-                                 output_lse_ref_triton)
+        merge_attn_states_triton(
+            output_ref_triton,
+            prefix_output,
+            prefix_lse,
+            suffix_output,
+            suffix_lse,
+            output_lse_ref_triton,
+        )
     torch.cuda.synchronize()
 
     for _ in range(repeat_times):
         start.record()
-        merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse,
-                                 suffix_output, suffix_lse,
-                                 output_lse_ref_triton)
+        merge_attn_states_triton(
+            output_ref_triton,
+            prefix_output,
+            prefix_lse,
+            suffix_output,
+            suffix_lse,
+            output_lse_ref_triton,
+        )
         end.record()
         torch.cuda.synchronize()
         total_time_triton_kernel += start.elapsed_time(end)
@@ -195,14 +224,26 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int,
     output_lse_cuda = output_lse.clone()
 
     for _ in range(warmup_times):
-        merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse,
-                               suffix_output, suffix_lse, output_lse_cuda)
+        merge_attn_states_cuda(
+            output_cuda,
+            prefix_output,
+            prefix_lse,
+            suffix_output,
+            suffix_lse,
+            output_lse_cuda,
+        )
     torch.cuda.synchronize()
 
     for _ in range(repeat_times):
         start.record()
-        merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse,
-                               suffix_output, suffix_lse, output_lse_cuda)
+        merge_attn_states_cuda(
+            output_cuda,
+            prefix_output,
+            prefix_lse,
+            suffix_output,
+            suffix_lse,
+            output_lse_cuda,
+        )
         end.record()
         torch.cuda.synchronize()
         total_time_cuda_kernel += start.elapsed_time(end)
@@ -213,8 +254,10 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int,
     performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel
     print(f" Torch time: {avg_time_torch_kernel:.6f}ms")
     print(f"Triton time: {avg_time_triton_kernel:.6f}ms")
-    print(f"  CUDA time: {avg_time_cuda_kernel:.6f}ms, "
-          f"Performance: {performance_improved:.5f}x")
+    print(
+        f"  CUDA time: {avg_time_cuda_kernel:.6f}ms, "
+        f"Performance: {performance_improved:.5f}x"
+    )
     print("-" * 100)
 
     # 4. Correctness compare
@@ -232,35 +275,45 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int,
     # states operation.
     output_ref = output_ref_triton
     output_lse_ref = output_lse_ref_triton
-    torch.testing.assert_close(output_cuda.float(),
-                               output_ref.float(),
-                               atol=1e-3,
-                               rtol=rtol)
+    torch.testing.assert_close(
+        output_cuda.float(), output_ref.float(), atol=1e-3, rtol=rtol
+    )
     print("Output all match, max abs diff:")
     print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}")
     print(f"  (CUDA vs Torch) : {diff(output_torch, output_cuda)}")
     print(f"  (CUDA vs Triton): {diff(output_ref, output_cuda)}")
     print("-" * 100)
 
-    torch.testing.assert_close(output_lse_cuda.float(),
-                               output_lse_ref.float(),
-                               atol=1e-3,
-                               rtol=rtol)
+    torch.testing.assert_close(
+        output_lse_cuda.float(), output_lse_ref.float(), atol=1e-3, rtol=rtol
+    )
     print("Output LSE all match, max abs diff:")
     print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}")
     print(f"  (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}")
     print(f"  (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}")
     print("-" * 100)
 
-    print("All output values test passed! All inf values "
-          "are correctly replaced with -inf.")
+    print(
+        "All output values test passed! All inf values "
+        "are correctly replaced with -inf."
+    )
     print("-" * 100)
 
     device = current_platform.get_device_name()
     all_case_info.append(
-        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE, output_dtype, device,
-         avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel,
-         performance_improved))
-    if len(all_case_info) == (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) *
-                              len(NUM_QUERY_HEADS) * len(DTYPES)):
+        (
+            NUM_TOKENS,
+            NUM_HEADS,
+            HEAD_SIZE,
+            output_dtype,
+            device,
+            avg_time_torch_kernel,
+            avg_time_triton_kernel,
+            avg_time_cuda_kernel,
+            performance_improved,
+        )
+    )
+    if len(all_case_info) == (
+        len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * len(NUM_QUERY_HEADS) * len(DTYPES)
+    ):
         generate_markdown_table()
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index d37b968ed979203b55c82fb8e4a862fb06552174..183bbf3bf4e036d7d7620fd267059d3e04577d53 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -5,13 +5,15 @@ Test:
 
 * Tests for MultiHeadAttention layer
 """
+
 from unittest.mock import patch
 
 import pytest
 import torch
 
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import MultiHeadAttention
-from vllm.attention.selector import _Backend, _cached_get_attn_backend
+from vllm.attention.selector import _cached_get_attn_backend
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
@@ -20,11 +22,11 @@ from vllm.platforms.rocm import RocmPlatform
 
 @pytest.fixture(autouse=True)
 def clear_cache():
-    """Clear lru cache to ensure each test case runs without caching.
-    """
+    """Clear lru cache to ensure each test case runs without caching."""
     _cached_get_attn_backend.cache_clear()
     # Clear xformers availability cache
     import vllm.attention.layer as layer_module
+
     layer_module.USE_XFORMERS_OPS = None
 
 
@@ -36,51 +38,65 @@ def test_mha_attn_platform(device: str):
     torch.set_default_dtype(torch.float16)
 
     if device == "cpu":
-        with patch("vllm.attention.layer.current_platform", CpuPlatform()), \
-             patch("vllm.model_executor.models.vision.current_platform",
-                   CpuPlatform()):
+        with (
+            patch("vllm.attention.layer.current_platform", CpuPlatform()),
+            patch("vllm.model_executor.models.vision.current_platform", CpuPlatform()),
+        ):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == _Backend.TORCH_SDPA
+            assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
     elif device == "hip":
-        with patch("vllm.attention.layer.current_platform", RocmPlatform()), \
-             patch("vllm.model_executor.models.vision.current_platform",
-                   RocmPlatform()):
+        with (
+            patch("vllm.attention.layer.current_platform", RocmPlatform()),
+            patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
+        ):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == _Backend.TORCH_SDPA
+            assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
     else:
         # Test CUDA with head_size=64 (divisible by 32)
         # - should use vLLM's FlashAttention
-        with patch("vllm.attention.layer.current_platform", CudaPlatform()), \
-             patch("vllm.model_executor.models.vision.current_platform",
-                   CudaPlatform()):
+        with (
+            patch("vllm.attention.layer.current_platform", CudaPlatform()),
+            patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
+        ):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == _Backend.FLASH_ATTN
+            assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
 
         # Test CUDA with head_size=72 (not divisible by 32)
         # - with upstream FA not available
         # - should use xformers
-        with patch("vllm.attention.layer.current_platform", CudaPlatform()), \
-             patch("vllm.model_executor.models.vision.current_platform",
-                   CudaPlatform()), \
-             patch("vllm.attention.layer.check_upstream_fa_availability",
-                   return_value=False):
+        with (
+            patch("vllm.attention.layer.current_platform", CudaPlatform()),
+            patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
+            patch(
+                "vllm.attention.layer.check_upstream_fa_availability",
+                return_value=False,
+            ),
+        ):
             attn = MultiHeadAttention(16, 72, scale=1)
-            assert attn.attn_backend == _Backend.XFORMERS
+            assert attn.attn_backend == AttentionBackendEnum.XFORMERS
 
         # Test CUDA with head_size=72 (not divisible by 32)
         # - with upstream FA available
         # - should use upstream FA
-        with patch("vllm.attention.layer.current_platform", CudaPlatform()), \
-             patch("vllm.model_executor.models.vision.current_platform",
-                   CudaPlatform()), \
-             patch("vllm.attention.layer.check_upstream_fa_availability",
-                   return_value=True), \
-             patch.dict('sys.modules', {'flash_attn': type('MockFlashAttn', (),
-                                                           {
-                 'flash_attn_varlen_func': lambda *args, **kwargs: None
-             })()}):
+        with (
+            patch("vllm.attention.layer.current_platform", CudaPlatform()),
+            patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
+            patch(
+                "vllm.attention.layer.check_upstream_fa_availability", return_value=True
+            ),
+            patch.dict(
+                "sys.modules",
+                {
+                    "flash_attn": type(
+                        "MockFlashAttn",
+                        (),
+                        {"flash_attn_varlen_func": lambda *args, **kwargs: None},
+                    )()
+                },
+            ),
+        ):
             attn = MultiHeadAttention(16, 72, scale=1)
-            assert attn.attn_backend == _Backend.FLASH_ATTN
+            assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
 
 
 def ref_attention(
@@ -107,9 +123,11 @@ NUM_HEADS = [1, 16]
 NUM_KV_HEADS = [1]
 HEAD_SIZES = [64, 80]
 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
-DTYPES = [
-    torch.half, torch.bfloat16, torch.float
-] if not current_platform.is_rocm() else [torch.half, torch.bfloat16]
+DTYPES = (
+    [torch.half, torch.bfloat16, torch.float]
+    if not current_platform.is_rocm()
+    else [torch.half, torch.bfloat16]
+)
 CUDA_DEVICES = ["cuda"]
 
 
@@ -137,10 +155,9 @@ def test_mha_attn_forward(
     k = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
     v = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
     scale = 1.0 / head_size**0.5
-    attn = MultiHeadAttention(num_heads,
-                              head_size,
-                              scale=scale,
-                              num_kv_heads=num_kv_heads)
+    attn = MultiHeadAttention(
+        num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
+    )
     output = attn(q, k, v)
 
     assert num_heads % num_kv_heads == 0
diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py
index f8b307c595dea25606489db56d60f3a612e7bdda..e1a7e50c2b56a48a184342fd1c50011afb40d85e 100644
--- a/tests/kernels/attention/test_mla_decode_cpu.py
+++ b/tests/kernels/attention/test_mla_decode_cpu.py
@@ -7,34 +7,28 @@ from torch import Tensor
 
 import vllm._custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 
 def ref_mla(
-        out: Tensor,  # (bs, num_heads, v_head_dim)
-        query: Tensor,  # (bs, num_heads, head_dim)
-        kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
-        scale: float,
-        block_tables: Tensor,  # (bs, max_num_blocks)
-        seq_lens: Tensor,  # (bs,)
+    out: Tensor,  # (bs, num_heads, v_head_dim)
+    query: Tensor,  # (bs, num_heads, head_dim)
+    kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+    scale: float,
+    block_tables: Tensor,  # (bs, max_num_blocks)
+    seq_lens: Tensor,  # (bs,)
 ):
     bs, num_heads, v_head_dim = out.shape
     head_dim = query.shape[2]
 
     for i in range(bs):
         # gather and flatten KV-cache
-        kv = kv_cache[
-            block_tables[i]]  # (max_num_blocks, block_size, head_dim)
-        kv = kv.view(1, -1,
-                     head_dim)[:, :seq_lens[i]]  # (1, seq_len, head_dim)
+        kv = kv_cache[block_tables[i]]  # (max_num_blocks, block_size, head_dim)
+        kv = kv.view(1, -1, head_dim)[:, : seq_lens[i]]  # (1, seq_len, head_dim)
         v = kv[:, :, :v_head_dim]
 
         q = query[i].view(num_heads, 1, head_dim)
-        o = F.scaled_dot_product_attention(q,
-                                           kv,
-                                           v,
-                                           scale=scale,
-                                           enable_gqa=True)
+        o = F.scaled_dot_product_attention(q, kv, v, scale=scale, enable_gqa=True)
         out[i] = o.view(num_heads, v_head_dim)
 
     return out
@@ -63,18 +57,17 @@ def test_mla_decode_cpu(
     torch.set_default_dtype(dtype)
     torch.manual_seed(0)
 
-    scale = d**(-0.5)
+    scale = d ** (-0.5)
     if varlen:
         seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
         seq_lens = seq_lens.clip(2).to(torch.int32)
     else:
-        seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
+        seq_lens = torch.full((bs,), mean_seq_len, dtype=torch.int32)
     max_seq_len = seq_lens.max().item()
     seqlen_pad = cdiv(max_seq_len, 256) * 256  # is this necessary?
 
     q = torch.randn(bs, h_q, d)
-    block_table = torch.arange(bs * seqlen_pad // block_size,
-                               dtype=torch.int32)
+    block_table = torch.arange(bs * seqlen_pad // block_size, dtype=torch.int32)
     block_table = block_table.view(bs, seqlen_pad // block_size)
 
     kv_cache = torch.randn(block_table.numel(), block_size, d)
@@ -82,8 +75,7 @@ def test_mla_decode_cpu(
         kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")
 
     out_mla = q.new_zeros(bs, h_q, dv)
-    ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
-                               seq_lens)
+    ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table, seq_lens)
 
     out_ref = q.new_zeros(bs, h_q, dv)
     ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
diff --git a/tests/kernels/attention/test_pack_unpack_triton.py b/tests/kernels/attention/test_pack_unpack_triton.py
index 20c0b262b4795a6b7f3fbf72b06f83d690fe2889..d2aa14738d9d953b0414b9b03db5ab207a7670d0 100644
--- a/tests/kernels/attention/test_pack_unpack_triton.py
+++ b/tests/kernels/attention/test_pack_unpack_triton.py
@@ -39,7 +39,7 @@ def test_pack_seq_basic_fp8():
             start_idx = sum(lengths_list[:b])
             seq_len = lengths_list[b]
 
-            expected_data = x[start_idx:start_idx + seq_len].to(torch.float32)
+            expected_data = x[start_idx : start_idx + seq_len].to(torch.float32)
             actual_data = packed[b, :seq_len].to(torch.float32)
 
             assert_close(actual_data, expected_data, rtol=1e-1, atol=1e-2)
@@ -62,7 +62,7 @@ def test_pack_seq_custom_padding_fp8():
         # Check valid data
         for b in range(B):
             start_idx = b * 10
-            expected_data = x[start_idx:start_idx + 10].to(torch.float32)
+            expected_data = x[start_idx : start_idx + 10].to(torch.float32)
             actual_data = result[b, :10].to(torch.float32)
             assert_close(actual_data, expected_data, rtol=1e-1, atol=1e-2)
 
@@ -73,9 +73,7 @@ def test_pack_seq_custom_padding_fp8():
         elif pad_value > 0:
             assert torch.all(padded_data > 50)  # Large positive values
         else:
-            assert torch.allclose(padded_data,
-                                  torch.zeros_like(padded_data),
-                                  atol=1e-2)
+            assert torch.allclose(padded_data, torch.zeros_like(padded_data), atol=1e-2)
 
 
 def test_pack_seq_default_negative_inf_padding_fp8():
@@ -93,7 +91,8 @@ def test_pack_seq_default_negative_inf_padding_fp8():
     # Check that padding is large negative values (fp8 representation of -inf)
     padded_data = result[:, 10:].to(torch.float32)
     assert torch.all(
-        padded_data < -100)  # fp8 -inf is represented as large negative number
+        padded_data < -100
+    )  # fp8 -inf is represented as large negative number
 
 
 def test_pack_seq_edge_cases_fp8():
@@ -142,7 +141,7 @@ def test_pack_seq_different_block_sizes_fp8():
         # Check that valid data is preserved (within fp8 precision)
         for b in range(B):
             start_idx = b * 25
-            expected_data = x[start_idx:start_idx + 25].to(torch.float32)
+            expected_data = x[start_idx : start_idx + 25].to(torch.float32)
             actual_data = result[b, :25].to(torch.float32)
             assert_close(actual_data, expected_data, rtol=1e-1, atol=1e-2)
 
@@ -198,10 +197,7 @@ def test_pack_unpack_roundtrip_fp8():
 
         # Unpack without explicit start locations (computed in kernel)
         unpacked_with_loc = unpack_seq_triton(packed, lengths)
-        assert_close(x_f32,
-                     unpacked_with_loc.to(torch.float32),
-                     rtol=1e-3,
-                     atol=1e-2)
+        assert_close(x_f32, unpacked_with_loc.to(torch.float32), rtol=1e-3, atol=1e-2)
 
 
 def test_unpack_seq_triton_edge_cases_fp8():
@@ -216,10 +212,7 @@ def test_unpack_seq_triton_edge_cases_fp8():
     packed = pack_seq_triton(x, lengths)
     unpacked = unpack_seq_triton(packed, lengths)
     assert unpacked.shape == x.shape
-    assert_close(x.to(torch.float32),
-                 unpacked.to(torch.float32),
-                 rtol=1e-1,
-                 atol=1e-2)
+    assert_close(x.to(torch.float32), unpacked.to(torch.float32), rtol=1e-1, atol=1e-2)
 
     # Test with very short sequences
     x = torch.randn(20, 4, 8, dtype=torch.float32, device=device) * 0.1
@@ -228,10 +221,9 @@ def test_unpack_seq_triton_edge_cases_fp8():
     packed = pack_seq_triton(x, lengths)
     unpacked = unpack_seq_triton(packed, lengths)
     # Only compare the first 3 elements that were actually packed
-    assert_close(x[:3].to(torch.float32),
-                 unpacked.to(torch.float32),
-                 rtol=1e-1,
-                 atol=1e-2)
+    assert_close(
+        x[:3].to(torch.float32), unpacked.to(torch.float32), rtol=1e-1, atol=1e-2
+    )
 
     x = torch.randn(15, 8, 16, dtype=torch.float32, device=device) * 0.1
     x = x.to(dtype=dtype)
@@ -239,7 +231,4 @@ def test_unpack_seq_triton_edge_cases_fp8():
     packed = pack_seq_triton(x, lengths)
     unpacked = unpack_seq_triton(packed, lengths)
     assert unpacked.shape == x.shape
-    assert_close(x.to(torch.float32),
-                 unpacked.to(torch.float32),
-                 rtol=1e-1,
-                 atol=1e-2)
+    assert_close(x.to(torch.float32), unpacked.to(torch.float32), rtol=1e-1, atol=1e-2)
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 0695f84aea1af4709746ae93898237119301e077..78cdbbbf7379dd245a2db9a795b1e8e87886137c 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -8,29 +8,92 @@ from collections.abc import Callable
 
 import pytest
 import torch
-from xformers import ops as xops
-from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
+import torch.nn.functional as F
 
-from tests.kernels.utils import make_alibi_bias
-from vllm.attention.ops.chunked_prefill_paged_decode import (
-    chunked_prefill_paged_decode)
+from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
 from vllm.attention.ops.prefix_prefill import context_attention_fwd
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 64]
 HEAD_SIZES = [24, 128]
 DTYPES = [torch.float16]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 SLIDING_WINDOW = [0, 16, 2048]
 KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"]
 
 OPS = [chunked_prefill_paged_decode, context_attention_fwd]
 
 
+def create_causal_attention_mask_for_sdpa(
+    query_lens: list[int],
+    seq_lens: list[int],
+    sliding_window: int = 0,
+    device: torch.device = None,
+    dtype: torch.dtype = None,
+) -> torch.Tensor:
+    total_queries = sum(query_lens)
+    total_keys = sum(seq_lens)
+
+    # Create a mask filled with -inf
+    mask = torch.full(
+        (total_queries, total_keys), float("-inf"), device=device, dtype=dtype
+    )
+
+    query_start = 0
+    key_start = 0
+
+    for query_len, seq_len in zip(query_lens, seq_lens):
+        query_end = query_start + query_len
+        key_end = key_start + seq_len
+        q_indices = torch.arange(query_len, device=device)
+        k_indices = torch.arange(seq_len, device=device)
+        q_pos_in_seq = seq_len - query_len + q_indices
+
+        valid_mask = k_indices[None, :] <= q_pos_in_seq[:, None]
+
+        if sliding_window > 0:
+            valid_mask &= k_indices[None, :] >= (
+                q_pos_in_seq[:, None] - sliding_window + 1
+            )
+
+        mask[query_start:query_end, key_start:key_end][valid_mask] = 0.0
+
+        query_start = query_end
+        key_start = key_end
+
+    return mask
+
+
+def create_alibi_causal_mask(
+    query_len: int,
+    seq_len: int,
+    alibi_slopes: torch.Tensor,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    query_pos = torch.arange(
+        seq_len - query_len, seq_len, device=device, dtype=torch.float32
+    )
+    key_pos = torch.arange(seq_len, device=device, dtype=torch.float32)
+
+    rel_pos = key_pos[None, :] - query_pos[:, None]
+
+    # Apply ALiBi slopes: [num_heads, query_len, seq_len]
+    alibi_bias = alibi_slopes[:, None, None] * rel_pos[None, :, :]
+    alibi_bias = alibi_bias.to(dtype)
+
+    # Apply causal mask: prevent attending to future positions
+    # causal_mask[i, j] = True if key_pos[j] <= query_pos[i]
+    causal_mask = key_pos[None, :] <= query_pos[:, None]
+    alibi_bias = alibi_bias.masked_fill(~causal_mask[None, :, :], float("-inf"))
+
+    # Add batch dimension: [1, num_heads, query_len, seq_len]
+    # SDPA expects batch dimension even for single sequences
+    return alibi_bias.unsqueeze(0)
+
+
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -50,12 +113,17 @@ def test_contexted_kv_attention(
     device: str,
     op: Callable,
 ) -> None:
-
-    if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability(
-            89):
+    if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89):
         pytest.skip(
-            'Triton limitation: fp8e4nv data type is not supported on CUDA'
-            ' arch < 89')
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
+    if (
+        current_platform.is_rocm()
+        and op is chunked_prefill_paged_decode
+        and kv_cache_dtype == "fp8_e5m2"
+    ):
+        pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
 
     current_platform.seed_everything(0)
     torch.set_default_device(device)
@@ -93,38 +161,29 @@ def test_contexted_kv_attention(
         cache_dtype = dtype
     else:
         cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
-    k_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=cache_dtype)
-    v_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=cache_dtype)
+    k_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
+    v_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
     k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
     v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
-    values = torch.arange(0, cache_size, dtype=torch.long)
+    values = torch.arange(0, cache_size, dtype=torch.int32)
     values = values[torch.randperm(cache_size)]
-    block_table = values[:BS * max_block_per_request].view(
-        BS, max_block_per_request)
-    b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
-    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
-                                            dtype=torch.long),
-                               dim=0)
+    block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
-    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-                                                dtype=torch.long),
-                                   dim=0)
+    b_seq_start_loc = torch.cumsum(
+        torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+    )
     for i in range(BS):
         for j in range(query_lens[i]):
-            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
-                                            j])
-            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
-                                              b_ctx_len[i] + j])
+            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j])
+            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j])
         cur_ctx = 0
         block_id = 0
         while cur_ctx < b_ctx_len[i]:
@@ -135,112 +194,125 @@ def test_contexted_kv_attention(
                 end_loc = start_loc + block_size
             start_slot = block_table[i, block_id] * block_size
             end_slot = start_slot + end_loc - start_loc
-            k_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             key[start_loc:end_loc])
-            v_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             value[start_loc:end_loc])
+            k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                key[start_loc:end_loc]
+            )
+            v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                value[start_loc:end_loc]
+            )
             cur_ctx += block_size
             block_id += 1
     # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
     # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
-    k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
-                           8).permute(0, 2, 3, 1, 4).contiguous()
+    k_cache = (
+        k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8)
+        .permute(0, 2, 3, 1, 4)
+        .contiguous()
+    )
     # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
     # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
-    v_cache = v_cache.view(-1, block_size, num_kv_heads,
-                           head_size).permute(0, 2, 3, 1).contiguous()
+    v_cache = (
+        v_cache.view(-1, block_size, num_kv_heads, head_size)
+        .permute(0, 2, 3, 1)
+        .contiguous()
+    )
     k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
     # Warm up the Triton kernel by calling it once before actually measuring
     # generation time
-    op(query,
-       k,
-       v,
-       output,
-       kv_cache_dtype,
-       k_cache,
-       v_cache,
-       block_table,
-       b_start_loc,
-       b_seq_len,
-       MAX_CTX_LEN,
-       max_input_len,
-       k_scale,
-       v_scale,
-       sliding_window=sliding_window)
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        sliding_window=sliding_window,
+    )
     torch.cuda.synchronize()
     start_time = time.time()
-    op(query,
-       k,
-       v,
-       output,
-       kv_cache_dtype,
-       k_cache,
-       v_cache,
-       block_table,
-       b_start_loc,
-       b_seq_len,
-       MAX_CTX_LEN,
-       max_input_len,
-       k_scale,
-       v_scale,
-       sliding_window=sliding_window)
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        sliding_window=sliding_window,
+    )
     torch.cuda.synchronize()
     end_time = time.time()
-    print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
+    print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")
 
     scale = float(1.0 / (head_size**0.5))
 
-    attn_op = xops.fmha.cutlass.FwOp()
-
-    if num_kv_heads != num_heads:
-        # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
-        # project the key and value tensors to the desired number of
-        # heads.
-        #
-        # see also: vllm/model_executor/layers/attention.py
-        query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
-                           query.shape[-1])
-        key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
-                                        num_queries_per_kv, key.shape[-1])
-        value = value[:, :,
-                      None, :].expand(value.shape[0], num_kv_heads,
-                                      num_queries_per_kv, value.shape[-1])
-    query = query.unsqueeze(0)
-    key = key.unsqueeze(0)
-    value = value.unsqueeze(0)
-
-    attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-        query_lens, seq_lens)
-    if sliding_window > 0:
-        attn_bias = attn_bias.make_local_attention_from_bottomright(
-            sliding_window)
-    output_ref = xops.memory_efficient_attention_forward(
-        query,
-        key,
-        value,
-        attn_bias=attn_bias,
-        p=0.0,
+    # Reshape for SDPA: (seq_len, num_heads, head_size) ->
+    # (1, num_heads, seq_len, head_size)
+    query_sdpa = query.view(num_tokens, num_kv_heads, num_queries_per_kv, head_size)
+    query_sdpa = query_sdpa.permute(1, 2, 0, 3).reshape(
+        1, num_heads, num_tokens, head_size
+    )
+
+    # Expand key and value for GQA/MQA to match query heads
+    key_sdpa = key[:, :, None, :].expand(
+        key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1]
+    )
+    key_sdpa = key_sdpa.permute(1, 2, 0, 3).reshape(
+        1, num_heads, sum(seq_lens), head_size
+    )
+
+    value_sdpa = value[:, :, None, :].expand(
+        value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1]
+    )
+    value_sdpa = value_sdpa.permute(1, 2, 0, 3).reshape(
+        1, num_heads, sum(seq_lens), head_size
+    )
+
+    attn_mask = create_causal_attention_mask_for_sdpa(
+        query_lens, seq_lens, sliding_window, device=device, dtype=dtype
+    )
+
+    output_ref = F.scaled_dot_product_attention(
+        query_sdpa,
+        key_sdpa,
+        value_sdpa,
+        attn_mask=attn_mask,
+        dropout_p=0.0,
         scale=scale,
-        op=attn_op,
     )
     torch.cuda.synchronize()
     start_time = time.time()
-    output_ref = xops.memory_efficient_attention_forward(
-        query,
-        key,
-        value,
-        attn_bias=attn_bias,
-        p=0.0,
+    output_ref = F.scaled_dot_product_attention(
+        query_sdpa,
+        key_sdpa,
+        value_sdpa,
+        attn_mask=attn_mask,
+        dropout_p=0.0,
         scale=scale,
-        op=attn_op,
     )
     torch.cuda.synchronize()
     end_time = time.time()
-    print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
-    output_ref = output_ref.reshape(output.shape)
+    print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms")
+
+    # Reshape output back to (num_tokens, num_heads, head_size)
+    output_ref = output_ref.view(num_heads, num_tokens, head_size)
+    output_ref = output_ref.permute(1, 0, 2).contiguous()
     atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-4
     torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
 
@@ -262,12 +334,17 @@ def test_contexted_kv_attention_alibi(
     device: str,
     op: Callable,
 ) -> None:
-
-    if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability(
-            89):
+    if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89):
         pytest.skip(
-            'Triton limitation: fp8e4nv data type is not supported on CUDA'
-            ' arch < 89')
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
+    if (
+        current_platform.is_rocm()
+        and op is chunked_prefill_paged_decode
+        and kv_cache_dtype == "fp8_e5m2"
+    ):
+        pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
 
     current_platform.seed_everything(0)
     torch.set_default_device(device)
@@ -280,9 +357,9 @@ def test_contexted_kv_attention_alibi(
 
     def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
         # Fork from: vllm/vllm/model_executor/models/bloom.py#L44
-        closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+        closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
         base = torch.tensor(
-            2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+            2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
             dtype=torch.float32,
         )
         powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
@@ -290,17 +367,16 @@ def test_contexted_kv_attention_alibi(
 
         if closest_power_of_2 != total_num_heads:
             extra_base = torch.tensor(
-                2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+                2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
                 dtype=torch.float32,
             )
-            num_remaining_heads = min(closest_power_of_2,
-                                      total_num_heads - closest_power_of_2)
-            extra_powers = torch.arange(start=1,
-                                        end=1 + 2 * num_remaining_heads,
-                                        step=2,
-                                        dtype=torch.int32)
-            slopes = torch.cat(
-                [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+            num_remaining_heads = min(
+                closest_power_of_2, total_num_heads - closest_power_of_2
+            )
+            extra_powers = torch.arange(
+                start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+            )
+            slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
         return slopes
 
     alibi_slopes = _get_alibi_slopes(num_heads).to(device)
@@ -328,38 +404,29 @@ def test_contexted_kv_attention_alibi(
         cache_dtype = dtype
     else:
         cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
-    k_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=cache_dtype)
-    v_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=cache_dtype)
+    k_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
+    v_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
     k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
     v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
-    values = torch.arange(0, cache_size, dtype=torch.long)
+    values = torch.arange(0, cache_size, dtype=torch.int32)
     values = values[torch.randperm(cache_size)]
-    block_table = values[:BS * max_block_per_request].view(
-        BS, max_block_per_request)
-    b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
-    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
-                                            dtype=torch.long),
-                               dim=0)
+    block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
-    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-                                                dtype=torch.long),
-                                   dim=0)
+    b_seq_start_loc = torch.cumsum(
+        torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+    )
     for i in range(BS):
         for j in range(query_lens[i]):
-            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
-                                            j])
-            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
-                                              b_ctx_len[i] + j])
+            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j])
+            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j])
         cur_ctx = 0
         block_id = 0
         while cur_ctx < b_ctx_len[i]:
@@ -370,137 +437,142 @@ def test_contexted_kv_attention_alibi(
                 end_loc = start_loc + block_size
             start_slot = block_table[i, block_id] * block_size
             end_slot = start_slot + end_loc - start_loc
-            k_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             key[start_loc:end_loc])
-            v_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             value[start_loc:end_loc])
+            k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                key[start_loc:end_loc]
+            )
+            v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                value[start_loc:end_loc]
+            )
             cur_ctx += block_size
             block_id += 1
     # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
     # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
-    k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
-                           8).permute(0, 2, 3, 1, 4).contiguous()
+    k_cache = (
+        k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8)
+        .permute(0, 2, 3, 1, 4)
+        .contiguous()
+    )
     # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
     # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
-    v_cache = v_cache.view(-1, block_size, num_kv_heads,
-                           head_size).permute(0, 2, 3, 1).contiguous()
+    v_cache = (
+        v_cache.view(-1, block_size, num_kv_heads, head_size)
+        .permute(0, 2, 3, 1)
+        .contiguous()
+    )
     k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
     # Warm up the Triton kernel by calling it once before actually measuring
     # generation time
-    op(query,
-       k,
-       v,
-       output,
-       kv_cache_dtype,
-       k_cache,
-       v_cache,
-       block_table,
-       b_start_loc,
-       b_seq_len,
-       MAX_CTX_LEN,
-       max_input_len,
-       k_scale,
-       v_scale,
-       alibi_slopes=alibi_slopes)
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        alibi_slopes=alibi_slopes,
+    )
     torch.cuda.synchronize()
     start_time = time.time()
-    op(query,
-       k,
-       v,
-       output,
-       kv_cache_dtype,
-       k_cache,
-       v_cache,
-       block_table,
-       b_start_loc,
-       b_seq_len,
-       MAX_CTX_LEN,
-       max_input_len,
-       k_scale,
-       v_scale,
-       alibi_slopes=alibi_slopes)
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        alibi_slopes=alibi_slopes,
+    )
     torch.cuda.synchronize()
     end_time = time.time()
-    print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
+    print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")
     scale = float(1.0 / (head_size**0.5))
 
-    # NOTE(DefTruth): In order to reuse _make_alibi_bias function,
-    # we have to pad query tensor before MQA/GQA expanding.
-    if query.shape[0] != key.shape[0]:
-        query_pad = torch.empty(sum(seq_lens),
-                                num_heads,
-                                head_size,
-                                dtype=dtype)
-        query_pad.uniform_(-1e-3, 1e-3)
-        seq_start = 0
-        query_start = 0
-        for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
-            seq_end = seq_start + seq_len
-            query_end = query_start + query_len
-            query_pad[seq_start:seq_end, ...] = torch.cat([
-                torch.zeros(
-                    seq_len - query_len, num_heads, head_size, dtype=dtype),
-                query[query_start:query_end, ...]
-            ],
-                                                          dim=0)
-            seq_start += seq_len
-            query_start += query_len
-        query = query_pad
-
-    if num_kv_heads != num_heads:
-        # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
-        # project the key and value tensors to the desired number of
-        # heads.
-        #
-        # see also: vllm/model_executor/layers/attention.py
-        key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
-                                        num_queries_per_kv, key.shape[-1])
-        value = value[:, :,
-                      None, :].expand(value.shape[0], num_kv_heads,
-                                      num_queries_per_kv, value.shape[-1])
-        # [seq, num_kv_heads, num_queries_per_kv, dk]=>
-        # [seq, num_kv_heads*num_queries_per_kv, dk] to comply with rest of the
-        # codebase. We save some time reshaping alibi matrix at runtime.
-        key = key.reshape(key.shape[0], -1, key.shape[-1])
-        value = value.reshape(value.shape[0], -1, value.shape[-1])
-    query = query.unsqueeze(0)
-    key = key.unsqueeze(0)
-    value = value.unsqueeze(0)
-
-    attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
+    # Prepare query, key, value for SDPA
+    # Expand key and value for GQA/MQA to match query heads
+    key_expanded = key[:, :, None, :].expand(
+        key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1]
+    )
+    value_expanded = value[:, :, None, :].expand(
+        value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1]
+    )
+
     output_ref = torch.empty_like(output)
-    seq_start = 0
-    query_start = 0
+
+    torch.cuda.synchronize()
     start_time = time.time()
-    # Attention with alibi slopes.
-    # FIXME(DefTruth): Because xformers does not support dynamic sequence
-    # lengths with custom attention bias, we process each prompt one by
-    # one. This is inefficient, especially when we have many short prompts.
-    # modified from: vllm/v1/attention/backends/xformers.py#L343
+
+    query_start = 0
+    key_start = 0
     for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
-        seq_end = seq_start + seq_len
         query_end = query_start + query_len
-        out = xops.memory_efficient_attention_forward(query[:,
-                                                            seq_start:seq_end],
-                                                      key[:,
-                                                          seq_start:seq_end],
-                                                      value[:,
-                                                            seq_start:seq_end],
-                                                      attn_bias=attn_bias[i],
-                                                      p=0.0,
-                                                      scale=scale)
-        out = out.view_as(query[:, seq_start:seq_end]).view(
-            seq_len, num_heads, head_size)
-        output_ref[query_start:query_end, ...].copy_(out[seq_len - query_len:,
-                                                         ...])
-        seq_start += seq_len
-        query_start += query_len
+        key_end = key_start + seq_len
+
+        # Get query, key, value for this sequence
+        q = query[query_start:query_end]  # [query_len, num_heads, head_size]
+        k = key_expanded[
+            key_start:key_end
+        ]  # [seq_len, num_kv_heads, num_queries_per_kv, head_size]
+        v = value_expanded[
+            key_start:key_end
+        ]  # [seq_len, num_kv_heads, num_queries_per_kv, head_size]
+
+        # Reshape for SDPA: (batch=1, num_heads, seq_len, head_size)
+        q_sdpa = q.view(query_len, num_kv_heads, num_queries_per_kv, head_size)
+        q_sdpa = (
+            q_sdpa.permute(1, 2, 0, 3)
+            .reshape(1, num_heads, query_len, head_size)
+            .contiguous()
+        )
+
+        k_sdpa = (
+            k.permute(1, 2, 0, 3).reshape(1, num_heads, seq_len, head_size).contiguous()
+        )
+        v_sdpa = (
+            v.permute(1, 2, 0, 3).reshape(1, num_heads, seq_len, head_size).contiguous()
+        )
+
+        # Create ALiBi causal mask for this sequence using utility function
+        alibi_mask = create_alibi_causal_mask(
+            query_len, seq_len, alibi_slopes, device, dtype
+        )
+
+        # Compute attention
+        out = F.scaled_dot_product_attention(
+            q_sdpa,
+            k_sdpa,
+            v_sdpa,
+            attn_mask=alibi_mask,
+            dropout_p=0.0,
+            scale=scale,
+        )
+
+        # Reshape output back to [query_len, num_heads, head_size]
+        out = out.view(num_heads, query_len, head_size).permute(1, 0, 2)
+        output_ref[query_start:query_end].copy_(out)
+
+        query_start = query_end
+        key_start = key_end
+
     torch.cuda.synchronize()
     end_time = time.time()
-    print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
+    print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms")
     atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
     torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
 
@@ -532,9 +604,16 @@ def test_contexted_kv_attention_f32(
     device: str,
     op: Callable,
 ) -> None:
-    test_contexted_kv_attention(num_heads, num_queries_per_kv, head_size,
-                                sliding_window, dtype, kv_cache_dtype, device,
-                                op)
+    test_contexted_kv_attention(
+        num_heads,
+        num_queries_per_kv,
+        head_size,
+        sliding_window,
+        dtype,
+        kv_cache_dtype,
+        device,
+        op,
+    )
 
 
 @pytest.mark.optional
@@ -555,5 +634,6 @@ def test_contexted_kv_attention_alibi_f32(
     device: str,
     op: Callable,
 ) -> None:
-    test_contexted_kv_attention_alibi(num_heads, num_queries_per_kv, head_size,
-                                      dtype, kv_cache_dtype, device, op)
+    test_contexted_kv_attention_alibi(
+        num_heads, num_queries_per_kv, head_size, dtype, kv_cache_dtype, device, op
+    )
diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py
index a5b4bddaf4755d7fea7d012d5dd631ff27e93c8c..9b7fb664956c6e6aacdf7981c258784181549ff1 100644
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -11,57 +11,39 @@ from vllm.utils import STR_BACKEND_ENV_VAR
 
 @pytest.fixture(autouse=True)
 def clear_cache():
-    """Clear lru cache to ensure each test case runs without caching.
-    """
+    """Clear lru cache to ensure each test case runs without caching."""
     _cached_get_attn_backend.cache_clear()
 
 
 @pytest.mark.skip(reason="Skipped for now. Should be revisited.")
 def test_selector(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
-        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
+        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_ATTN")
 
         # Set the current platform to ROCm using monkeypatch
-        monkeypatch.setattr("vllm.attention.selector.current_platform",
-                            RocmPlatform())
+        monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
 
         # Test standard ROCm attention
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
-        assert (backend.get_name() == "ROCM_FLASH"
-                or backend.get_name() == "TRITON_ATTN")
+        assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN"
 
         # MLA test for deepseek related
 
         # change the attention backend to triton MLA
         m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA")
-        backend = get_attn_backend(576,
-                                   torch.bfloat16,
-                                   "auto",
-                                   16,
-                                   False,
-                                   use_mla=True)
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
         assert backend.get_name() == "TRITON_MLA"
 
         # If attention backend is None
         # If use_mla is true
         # The selected backend is triton MLA
         m.setenv(STR_BACKEND_ENV_VAR, None)
-        backend = get_attn_backend(576,
-                                   torch.bfloat16,
-                                   "auto",
-                                   16,
-                                   False,
-                                   use_mla=True)
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
         assert backend.get_name() == "TRITON_MLA"
 
         # change the attention backend to AITER MLA
         m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
-        backend = get_attn_backend(576,
-                                   torch.bfloat16,
-                                   "auto",
-                                   1,
-                                   False,
-                                   use_mla=True)
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
         assert backend.get_name() == "ROCM_AITER_MLA"
 
         # If attention backend is None
@@ -70,10 +52,5 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
         # The selected backend is ROCM_AITER_MLA
         m.setenv(STR_BACKEND_ENV_VAR, None)
         m.setenv("VLLM_ROCM_USE_AITER", "1")
-        backend = get_attn_backend(576,
-                                   torch.bfloat16,
-                                   "auto",
-                                   1,
-                                   False,
-                                   use_mla=True)
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
         assert backend.get_name() == "ROCM_AITER_MLA"
diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py
index 48aacac8376b540d44f53d3b37d2d0198b97ffc6..04085fe5fa0fede90b50242a9c617aedb5696bba 100644
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -5,7 +5,7 @@ import pytest
 import torch
 
 from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 
 @pytest.mark.parametrize("B", [3, 5])
@@ -24,14 +24,12 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
     num_kv_splits = 8
 
     num_pages_per_batch = cdiv(seq_len, PAGE_SIZE)
-    req_to_page = torch.randint(0,
-                                CACHE_SIZE // PAGE_SIZE,
-                                (B, num_pages_per_batch, 1),
-                                device="cuda")
+    req_to_page = torch.randint(
+        0, CACHE_SIZE // PAGE_SIZE, (B, num_pages_per_batch, 1), device="cuda"
+    )
     req_to_token = req_to_page * PAGE_SIZE
     req_to_token = req_to_token.expand(B, num_pages_per_batch, PAGE_SIZE)
-    req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view(
-        1, 1, -1)
+    req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view(1, 1, -1)
     req_to_token = req_to_token.view(B, -1)
     req_to_token = req_to_token[:, :seq_len].contiguous()
 
@@ -48,7 +46,7 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
 
     lse = torch.zeros(B, H_Q, dtype=dtype, device="cuda")
 
-    b_seq_len = torch.full((B, ), seq_len, device="cuda")
+    b_seq_len = torch.full((B,), seq_len, device="cuda")
 
     attn_logits = torch.empty(
         (B, H_Q, num_kv_splits, D_V + 1),
diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index 5cff29b15aa3feec13ffeab29e686021af63fac4..bf4d2179af5f901faf9133778d24b5d7b7e2f8af 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -14,9 +13,11 @@ HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16]
 
 DTYPES = [torch.bfloat16]
-QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
-    None, torch.float8_e4m3fnuz
-]
+QDTYPES = (
+    [None, torch.float8_e4m3fn]
+    if not current_platform.is_rocm()
+    else [None, torch.float8_e4m3fnuz]
+)
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check
 NUM_BLOCKS = [32768, 2048]
@@ -30,8 +31,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -42,7 +43,7 @@ def ref_paged_attn(
     for i in range(num_seqs):
         query_len = query_lens[i]
         kv_len = kv_lens[i]
-        q = query[start_idx:start_idx + query_len]
+        q = query[start_idx : start_idx + query_len]
         q *= scale
 
         num_kv_blocks = (kv_len + block_size - 1) // block_size
@@ -60,10 +61,13 @@ def ref_paged_attn(
         empty_mask = torch.ones(query_len, kv_len)
         mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
         if sliding_window is not None:
-            sliding_window_mask = torch.triu(empty_mask,
-                                             diagonal=kv_len -
-                                             (query_len + sliding_window) +
-                                             1).bool().logical_not()
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
             mask |= sliding_window_mask
         if soft_cap is not None and soft_cap > 0:
             attn = soft_cap * torch.tanh(attn / soft_cap)
@@ -77,9 +81,9 @@ def ref_paged_attn(
     return torch.cat(outputs, dim=0)
 
 
-@pytest.mark.parametrize("seq_lens",
-                         [[(1, 1328), (5, 18),
-                           (129, 463)], [(1, 523), (1, 37), (1, 2011)]])
+@pytest.mark.parametrize(
+    "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]]
+)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
@@ -93,12 +97,12 @@ def test_triton_unified_attn(
     seq_lens: list[tuple[int, int]],
     num_heads: tuple[int, int],
     head_size: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
 
@@ -111,30 +115,23 @@ def test_triton_unified_attn(
     assert num_query_heads % num_kv_heads == 0
     max_query_len = max(query_lens)
     max_kv_len = max(kv_lens)
-    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
-                   (-1, -1))
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
     scale = head_size**-0.5
 
-    query = torch.randn(sum(query_lens),
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype)
-    key_cache = torch.randn(num_blocks,
-                            block_size,
-                            num_kv_heads,
-                            head_size,
-                            dtype=dtype)
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
     value_cache = torch.randn_like(key_cache)
-    cu_query_lens = torch.tensor([0] + query_lens,
-                                 dtype=torch.int32).cumsum(dim=0,
-                                                           dtype=torch.int32)
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
     kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(0,
-                                 num_blocks,
-                                 (num_seqs, max_num_blocks_per_seq),
-                                 dtype=torch.int32)
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
 
     output = torch.empty_like(query)
 
@@ -188,5 +185,7 @@ def test_triton_unified_attn(
     atol, rtol = 1.5e-2, 1e-2
     if q_dtype is not None:
         atol, rtol = 1.5e-1, 1.5e-1
-    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    (
+        torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol),
+        f"{torch.max(torch.abs(output - ref_output))}",
+    )
diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py
index ec5c60fd7b0e2b7b5498048c83484ad93ef3acd4..e8777ec4f59e8cb2e50385daca945be2592e7d65 100644
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -8,19 +8,23 @@ import torch
 
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
-from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
-                                                   GeluAndMul, MulAndSilu,
-                                                   NewGELU, QuickGELU,
-                                                   SiluAndMul, SwigluOAIAndMul)
+from vllm.model_executor.layers.activation import (
+    FastGELU,
+    FatreluAndMul,
+    GeluAndMul,
+    MulAndSilu,
+    NewGELU,
+    QuickGELU,
+    SiluAndMul,
+    SwigluOAIAndMul,
+)
 from vllm.platforms import current_platform
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
 D = [512, 13824]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 
 @pytest.mark.parametrize(
@@ -73,24 +77,19 @@ def test_act_and_mul(
     out = layer(x)
     ref_out = layer.forward_native(x)
     if activation == "swigluoai_and_mul":
-
         rtol = {
-            #For fp16, change the relative tolerance from 1e-3 to 2e-3
-            torch.float16:
-            2e-3,
-            torch.bfloat16:
-            2e-2,
-            torch.float:
-            1.3e-6
+            # For fp16, change the relative tolerance from 1e-3 to 2e-3
+            torch.float16: 2e-3,
+            torch.bfloat16: 2e-2,
+            torch.float: 1.3e-6,
         }
 
         def _get_rtol(output) -> float:
             return rtol[output.dtype]
 
-        torch.testing.assert_close(out,
-                                   ref_out,
-                                   atol=get_default_atol(out),
-                                   rtol=_get_rtol(out))
+        torch.testing.assert_close(
+            out, ref_out, atol=get_default_atol(out), rtol=_get_rtol(out)
+        )
     else:
         # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
         # equivalent to the native PyTorch implementations, so we can do exact
@@ -98,7 +97,7 @@ def test_act_and_mul(
         torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
 
     d = x.shape[-1] // 2
-    output_shape = (x.shape[:-1] + (d, ))
+    output_shape = x.shape[:-1] + (d,)
     out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
     if activation == "fatrelu":
         opcheck(fn, (out, x, threshold))
@@ -108,9 +107,14 @@ def test_act_and_mul(
         opcheck(fn, (out, x))
 
 
-@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
-                                        (NewGELU, torch.ops._C.gelu_new),
-                                        (QuickGELU, torch.ops._C.gelu_quick)])
+@pytest.mark.parametrize(
+    "activation",
+    [
+        (FastGELU, torch.ops._C.gelu_fast),
+        (NewGELU, torch.ops._C.gelu_new),
+        (QuickGELU, torch.ops._C.gelu_quick),
+    ],
+)
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -132,10 +136,9 @@ def test_activation(
     fn = activation[1]
     out = layer(x)
     ref_out = layer.forward_native(x)
-    torch.testing.assert_close(out,
-                               ref_out,
-                               atol=get_default_atol(out),
-                               rtol=get_default_rtol(out))
+    torch.testing.assert_close(
+        out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
+    )
 
     out = torch.empty_like(x)
     opcheck(fn, (out, x))
diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py
new file mode 100644
index 0000000000000000000000000000000000000000..a23959e353da9b6cbe606a7eccf62e8eccd82c03
--- /dev/null
+++ b/tests/kernels/core/test_fused_qk_norm_rope.py
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.kernels.utils import opcheck
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.platforms import current_platform
+
+DTYPES = [torch.bfloat16, torch.float16]
+IS_NEOX = [True, False]
+EPS_VALUES = [1e-5, 1e-6]
+SEEDS = [13]
+CUDA_DEVICES = ["cuda:0"]
+
+
+def _apply_qk_norm_rope(
+    qkv: torch.Tensor,
+    positions: torch.Tensor,
+    q_norm: RMSNorm,
+    k_norm: RMSNorm,
+    rope: RotaryEmbedding,
+    num_heads_q: int,
+    num_heads_kv: int,
+    head_dim: int,
+) -> torch.Tensor:
+    q_size = num_heads_q * head_dim
+    kv_size = num_heads_kv * head_dim
+
+    q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
+
+    q_by_head = q.view(*q.shape[:-1], q.shape[-1] // head_dim, head_dim)
+    q_by_head = q_norm.forward_native(q_by_head)
+    q = q_by_head.view(q.shape)
+
+    k_by_head = k.view(*k.shape[:-1], k.shape[-1] // head_dim, head_dim)
+    k_by_head = k_norm.forward_native(k_by_head)
+    k = k_by_head.view(k.shape)
+
+    q, k = rope.forward_native(positions, q, k)
+    return torch.cat([q, k, v], dim=-1)
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="fused_qk_norm_rope custom op requires cuda and rocm platform",
+)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("is_neox", IS_NEOX)
+@pytest.mark.parametrize("eps", EPS_VALUES)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_fused_qk_norm_rope_matches_reference(
+    device: str,
+    dtype: torch.dtype,
+    is_neox: bool,
+    eps: float,
+    seed: int,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+    num_heads, num_kv_heads, head_dim = 16, 4, 128
+    num_tokens = 4
+
+    total_dim = (num_heads + 2 * num_kv_heads) * head_dim
+    qkv_base = torch.randn(num_tokens, total_dim, dtype=dtype, device=device)
+    qkv_fused = qkv_base.clone()
+    positions = torch.arange(num_tokens, dtype=torch.long, device=device)
+
+    q_norm = RMSNorm(head_dim, eps=eps).to(device=device, dtype=dtype)
+    k_norm = RMSNorm(head_dim, eps=eps).to(device=device, dtype=dtype)
+    q_norm.weight.data.normal_(mean=1.0, std=0.1)
+    k_norm.weight.data.normal_(mean=1.0, std=0.1)
+    q_weight = q_norm.weight.data
+    k_weight = k_norm.weight.data
+
+    rope = RotaryEmbedding(
+        head_size=head_dim,
+        rotary_dim=head_dim,
+        max_position_embeddings=4096,
+        base=10000.0,
+        is_neox_style=is_neox,
+        dtype=dtype,
+    ).to(device)
+
+    ref_result = _apply_qk_norm_rope(
+        qkv=qkv_base,
+        positions=positions,
+        q_norm=q_norm,
+        k_norm=k_norm,
+        rope=rope,
+        num_heads_q=num_heads,
+        num_heads_kv=num_kv_heads,
+        head_dim=head_dim,
+    )
+
+    opcheck(
+        torch.ops._C.fused_qk_norm_rope,
+        (
+            qkv_fused.clone(),
+            num_heads,
+            num_kv_heads,
+            num_kv_heads,
+            head_dim,
+            eps,
+            q_weight,
+            k_weight,
+            rope.cos_sin_cache,
+            is_neox,
+            positions.view(-1),
+        ),
+    )
+
+    torch.ops._C.fused_qk_norm_rope(
+        qkv_fused,
+        num_heads,
+        num_kv_heads,
+        num_kv_heads,
+        head_dim,
+        eps,
+        q_weight,
+        k_weight,
+        rope.cos_sin_cache,
+        is_neox,
+        positions.view(-1),
+    )
+
+    if dtype == torch.float16:
+        ATOL, RTOL = (2e-3, 2e-3)
+    else:
+        ATOL, RTOL = (1e-2, 1e-2)
+
+    torch.testing.assert_close(
+        qkv_fused,
+        ref_result,
+        atol=ATOL,
+        rtol=RTOL,
+    )
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index 19703b8a2f978fa7757623b0f29fc584cd8b48f6..b5fc653ca735342cb4102a65a1d9c074ebec5c7f 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import pytest
 import torch
@@ -12,11 +11,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 
 DTYPES = [torch.bfloat16, torch.float]
 QUANT_DTYPES = [torch.int8, torch.float8_e4m3fn]
-VEC_HIDDEN_SIZES = range(1024, 1030)
+VEC_HIDDEN_SIZES = [1024, 1025, 1027, 1029]
 # Avoid combinatorial explosion with full Cartesian product
 NUM_TOKENS_HIDDEN_SIZES = [
     *[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]],
-    *[(83, i) for i in [1, 1033, 2048, 5120]],
     *[(2048, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5137]],
     *[(4096, i) for i in [1, 64, 5137]],
 ]
@@ -24,23 +22,20 @@ NUM_TOKENS_HIDDEN_SIZES = [
 ADD_RESIDUAL = [False, True]
 SCALE_UBS = [True, False]
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 EPS = 1e-6
 
 ## Helpers
 
 
-def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
-    return torch.as_tensor(x, dtype=torch.float32, device='cuda')
+def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
+    return torch.as_tensor(x, dtype=torch.float32, device="cuda")
 
 
-def ref_rms_norm(rms_norm_layer: RMSNorm,
-                 x: torch.Tensor,
-                 residual: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+def ref_rms_norm(
+    rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
+) -> tuple[torch.Tensor, torch.Tensor | None]:
     if residual is not None:
         residual = residual.clone()
         out, residual = rms_norm_layer.forward_native(x, residual)
@@ -50,12 +45,13 @@ def ref_rms_norm(rms_norm_layer: RMSNorm,
     return out, residual
 
 
-def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
-                                x: torch.Tensor,
-                                quant_dtype: torch.dtype,
-                                residual: Optional[torch.Tensor],
-                                scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+def ref_dynamic_per_token_quant(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     if scale_ub is not None:
         assert quant_dtype == torch.float8_e4m3fn
 
@@ -64,53 +60,56 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
 
     # Quant
     if quant_dtype == torch.float8_e4m3fn:
-        torch_out, scales = ops.scaled_fp8_quant(torch_out,
-                                                 scale_ub=scale_ub,
-                                                 use_per_token_if_dynamic=True)
+        torch_out, scales = ops.scaled_fp8_quant(
+            torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True
+        )
     else:
         assert quant_dtype == torch.int8
-        torch_out, scales = ops.scaled_int8_quant(torch_out)
+        torch_out, scales, _ = ops.scaled_int8_quant(torch_out)
 
     return torch_out, scales, residual
 
 
-def ref_impl(rms_norm_layer: RMSNorm,
-             x: torch.Tensor,
-             quant_dtype: torch.dtype,
-             residual: Optional[torch.Tensor],
-             scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-    return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype,
-                                       residual, scale_ub)
+def ref_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
+    return ref_dynamic_per_token_quant(
+        rms_norm_layer, x, quant_dtype, residual, scale_ub
+    )
 
 
-def ops_dynamic_per_token_quant(weight: torch.Tensor,
-                                x: torch.Tensor,
-                                quant_dtype: torch.dtype,
-                                residual: Optional[torch.Tensor],
-                                scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+def ops_dynamic_per_token_quant(
+    weight: torch.Tensor,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     if residual is not None:
         residual = residual.clone()
-    out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS,
-                                                       quant_dtype, scale_ub,
-                                                       residual)
+    out, scales = ops.rms_norm_dynamic_per_token_quant(
+        x, weight, EPS, quant_dtype, scale_ub, residual
+    )
     return out, scales, residual
 
 
-def ops_impl(weight: torch.Tensor,
-             x: torch.Tensor,
-             quant_dtype: torch.dtype,
-             residual: Optional[torch.Tensor],
-             scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-    return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual,
-                                       scale_ub)
+def ops_impl(
+    weight: torch.Tensor,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
+    return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
 
 
 @pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES)
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
-@pytest.mark.parametrize("scale_ub", SCALE_UBS)
+@pytest.mark.parametrize("has_scale_ub", SCALE_UBS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_dtype", QUANT_DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
@@ -120,7 +119,7 @@ def test_rms_norm(
     num_tokens: int,
     hidden_size: int,
     add_residual: bool,
-    scale_ub: bool,
+    has_scale_ub: bool,
     dtype: torch.dtype,
     quant_dtype: torch.dtype,
     seed: int,
@@ -131,7 +130,7 @@ def test_rms_norm(
         torch.cuda.manual_seed(seed)
     torch.set_default_device(device)
 
-    if scale_ub is not None and quant_dtype != torch.float8_e4m3fn:
+    if has_scale_ub and quant_dtype != torch.float8_e4m3fn:
         # skip
         return
 
@@ -144,31 +143,51 @@ def test_rms_norm(
     scale = 1 / (hidden_size)
     x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
     residual = torch.randn_like(x) * scale if add_residual else None
-    if scale_ub is not None:
+    if has_scale_ub:
         rms_x, _ = ref_rms_norm(layer, x, residual)
-        scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device='cuda')
+        scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device="cuda")
+    else:
+        scale_ub = None
 
-    ref_out, ref_scales, ref_residual = \
-        ref_impl(layer, x, quant_dtype, residual, scale_ub)
-    ops_out, ops_scales, ops_residual = \
-        ops_impl(layer.weight, x, quant_dtype, residual, scale_ub)
+    ref_out, ref_scales, ref_residual = ref_impl(
+        layer, x, quant_dtype, residual, scale_ub
+    )
+    ops_out, ops_scales, ops_residual = ops_impl(
+        layer.weight, x, quant_dtype, residual, scale_ub
+    )
 
     assert ref_out.dtype == quant_dtype
     assert ops_out.dtype == quant_dtype
-    assert torch.allclose(ref_scales, ops_scales)
     if quant_dtype == torch.int8:
+        assert torch.allclose(ref_scales, ops_scales, atol=1e-6)
         # big atol to account for round-off errors.
         assert torch.allclose(ref_out, ops_out, atol=1)
     else:
-        assert torch.allclose(ref_out.to(dtype=torch.float32),
-                              ops_out.to(dtype=torch.float32))
+        assert torch.allclose(ref_scales, ops_scales)
+        a = ref_out.to(dtype=torch.float32)
+        b = ops_out.to(dtype=torch.float32)
+        ok = torch.allclose(a, b)
+        if not ok:
+            # fallback: compare dequantized values with relaxed tolerance
+            a_deq = a * ref_scales.view(-1, 1)
+            b_deq = b * ops_scales.view(-1, 1)
+            # NOTE: It is possible that some future test cases trigger this
+            # max diff due to precision issues. If such an error is
+            # encountered, it's recommended to inspect the differences between
+            # all corresponding elements from each tensor (e.g. by looping over
+            # them) and checking how many the max diff error shows up on (just
+            # a few bad elements should still be considered acceptable).
+            ok = torch.allclose(a_deq, b_deq, rtol=5e-2, atol=5e-2)
+        assert ok
     if add_residual:
         assert torch.allclose(ref_residual, ops_residual)
 
     output = torch.empty_like(x, dtype=quant_dtype)
-    scales = torch.empty((x.numel() // x.shape[-1], 1),
-                         device=x.device,
-                         dtype=torch.float32)
-
-    opcheck(torch.ops._C.rms_norm_dynamic_per_token_quant,
-            (output, x, layer.weight, scales, 1e-5, scale_ub, residual))
+    scales = torch.empty(
+        (x.numel() // x.shape[-1], 1), device=x.device, dtype=torch.float32
+    )
+
+    opcheck(
+        torch.ops._C.rms_norm_dynamic_per_token_quant,
+        (output, x, layer.weight, scales, 1e-5, scale_ub, residual),
+    )
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index 53e6d793cf2f9f28d96a21fc9224dd0163e01d66..49bd77f6795fc3cf0ce8a4dbd90108faaac88cf3 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -6,18 +6,15 @@ import torch
 
 from tests.kernels.quant_utils import FP8_DTYPE
 from tests.kernels.utils import opcheck
-from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
+from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
-HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
-                8199]  # Arbitrary values for testing
+HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192]  # Arbitrary values for testing
 ADD_RESIDUAL = [False, True]
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -63,49 +60,21 @@ def test_rms_norm(
         torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
 
     if residual is not None:
-        opcheck(torch.ops._C.fused_add_rms_norm,
-                (x, residual, layer.weight.data, layer.variance_epsilon))
+        opcheck(
+            torch.ops._C.fused_add_rms_norm,
+            (x, residual, layer.weight.data, layer.variance_epsilon),
+        )
     else:
-        opcheck(torch.ops._C.rms_norm,
-                (out, x, layer.weight.data, layer.variance_epsilon))
-
-
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_poly_norm(
-    num_tokens: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    layer = PolyNorm().to(dtype=dtype)
-    layer.weight.data.normal_(mean=1.0, std=0.1)
-    layer.bias.data.normal_(mean=1.0, std=0.1)
-    scale = 1 / (2 * hidden_size)
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    x *= scale
-
-    ref_out = layer.forward_native(x)
-    out = layer(x)
-    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
-
-    opcheck(
-        torch.ops._C.poly_norm,
-        (out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon))
+        opcheck(
+            torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)
+        )
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0])
+@pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0])
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("strided_input", [False, True])
@@ -144,7 +113,8 @@ def test_fused_rms_norm_quant(
 
     if add_residual:
         torch.ops._C.fused_add_rms_norm_static_fp8_quant(
-            out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)
+            out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
+        )
 
         # Unfused kernel is in-place so it goes second
         # Also use a separate clone of x to avoid modifying the input
@@ -152,29 +122,32 @@ def test_fused_rms_norm_quant(
         x_unfused = x_unfused_base[..., :hidden_size]
         assert x_unfused.is_contiguous() != strided_input
         torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
-        torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused.contiguous(),
-                                             quant_scale_t)
+        torch.ops._C.static_scaled_fp8_quant(
+            out_quant, x_unfused.contiguous(), quant_scale_t
+        )
 
         torch.cuda.synchronize()
-        torch.testing.assert_close(residual_fused,
-                                   residual,
-                                   atol=1e-2,
-                                   rtol=1e-2)
+        torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
         opcheck(
             torch.ops._C.fused_add_rms_norm_static_fp8_quant,
-            (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6))
+            (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
+        )
     else:
-        torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight,
-                                               quant_scale_t, 1e-6)
+        torch.ops._C.rms_norm_static_fp8_quant(
+            out_quant_fused, x, weight, quant_scale_t, 1e-6
+        )
 
         torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
-        torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm,
-                                             quant_scale_t)
-
-        opcheck(torch.ops._C.rms_norm_static_fp8_quant,
-                (out_quant_fused, x, weight, quant_scale_t, 1e-6))
+        torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)
 
-    torch.testing.assert_close(out_quant.to(dtype=torch.float32),
-                               out_quant_fused.to(dtype=torch.float32),
-                               atol=1e-3,
-                               rtol=1e-3)
+        opcheck(
+            torch.ops._C.rms_norm_static_fp8_quant,
+            (out_quant_fused, x, weight, quant_scale_t, 1e-6),
+        )
+
+    torch.testing.assert_close(
+        out_quant.to(dtype=torch.float32),
+        out_quant_fused.to(dtype=torch.float32),
+        atol=1e-3,
+        rtol=1e-3,
+    )
diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py
index 5a903438f5e9922db5a0df20028e4117279c43d5..02b795721f46eb3ec7997e681e1c0e11154fb6e4 100644
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -14,25 +14,25 @@ from vllm.platforms import current_platform
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
-def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
-                       head_size: int, max_position_embeddings: int,
-                       dtype: torch.dtype, device: torch.device):
+def generate_test_data(
+    num_tokens: int,
+    num_q_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    max_position_embeddings: int,
+    dtype: torch.dtype,
+    device: torch.device,
+):
     """Generate test data for given configuration."""
     current_platform.seed_everything(42)
     # Create 2D positions (3, num_tokens) for multimodal case
-    positions = torch.randint(0,
-                              max_position_embeddings // 4, (3, num_tokens),
-                              device=device)
+    positions = torch.randint(
+        0, max_position_embeddings // 4, (3, num_tokens), device=device
+    )
 
     # Create query and key tensors
-    query = torch.randn(num_tokens,
-                        num_q_heads * head_size,
-                        dtype=dtype,
-                        device=device)
-    key = torch.randn(num_tokens,
-                      num_kv_heads * head_size,
-                      dtype=dtype,
-                      device=device)
+    query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
+    key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)
 
     return positions, query, key
 
@@ -59,7 +59,8 @@ MODELS_TO_TEST = [
                 Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
                 reason="Qwen3-VL only available after Transformers v4.57",
             )
-        ]),
+        ],
+    ),
     MRoPETestInfo(
         model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
         marks=[
@@ -67,24 +68,33 @@ MODELS_TO_TEST = [
                 Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
                 reason="Qwen3-VL only available after Transformers v4.57",
             )
-        ]),
+        ],
+    ),
 ]
 
 num_tokens_list = [11, 8192]
 
 
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Skipping CUDA/ROCm only tests.")
-@pytest.mark.parametrize("model_info, model_name", [
-    pytest.param(test_config, test_config.model_name, marks=test_config.marks)
-    for test_config in MODELS_TO_TEST
-])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
+)
+@pytest.mark.parametrize(
+    "model_info, model_name",
+    [
+        pytest.param(test_config, test_config.model_name, marks=test_config.marks)
+        for test_config in MODELS_TO_TEST
+    ],
+)
 @pytest.mark.parametrize("tp_size", [1, 2])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("num_tokens", num_tokens_list)
-def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
-               dtype: torch.dtype, num_tokens: int):
-
+def test_mrope(
+    model_name: str,
+    model_info: MRoPETestInfo,
+    tp_size: int,
+    dtype: torch.dtype,
+    num_tokens: int,
+):
     atol = model_info.atol
     rtol = model_info.rtol
 
@@ -96,8 +106,11 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
     total_num_heads = config.num_attention_heads
     num_heads = total_num_heads // tp_size
     num_kv_heads = max(1, total_num_kv_heads // tp_size)
-    head_dim = (config.head_dim if hasattr(config, "head_dim") else
-                config.hidden_size // total_num_heads)
+    head_dim = (
+        config.head_dim
+        if hasattr(config, "head_dim")
+        else config.hidden_size // total_num_heads
+    )
     is_neox_style = True
 
     rope_theta = config.rope_theta
@@ -117,9 +130,9 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
 
     # create q k v input tensors
     # create rotary pos emb input tensors
-    positions, query, key = generate_test_data(num_tokens, num_heads,
-                                               num_kv_heads, head_dim,
-                                               max_position, dtype, device)
+    positions, query, key = generate_test_data(
+        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
+    )
 
     query_native, key_native = mrope_helper_class.forward_native(
         positions,
@@ -137,19 +150,26 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
     torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)
 
 
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Skipping CUDA/ROCm only tests.")
-@pytest.mark.parametrize("model_info, model_name", [
-    pytest.param(test_config, test_config.model_name, marks=test_config.marks)
-    for test_config in MODELS_TO_TEST
-])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
+)
+@pytest.mark.parametrize(
+    "model_info, model_name",
+    [
+        pytest.param(test_config, test_config.model_name, marks=test_config.marks)
+        for test_config in MODELS_TO_TEST
+    ],
+)
 @pytest.mark.parametrize("tp_size", [1, 2])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("num_tokens", num_tokens_list)
-def test_mrope_torch_compile_tracing(model_name: str,
-                                     model_info: MRoPETestInfo, tp_size: int,
-                                     dtype: torch.dtype, num_tokens: int):
-
+def test_mrope_torch_compile_tracing(
+    model_name: str,
+    model_info: MRoPETestInfo,
+    tp_size: int,
+    dtype: torch.dtype,
+    num_tokens: int,
+):
     atol = model_info.atol
     rtol = model_info.rtol
 
@@ -161,8 +181,11 @@ def test_mrope_torch_compile_tracing(model_name: str,
     total_num_heads = config.num_attention_heads
     num_heads = total_num_heads // tp_size
     num_kv_heads = max(1, total_num_kv_heads // tp_size)
-    head_dim = (config.head_dim if hasattr(config, "head_dim") else
-                config.hidden_size // total_num_heads)
+    head_dim = (
+        config.head_dim
+        if hasattr(config, "head_dim")
+        else config.hidden_size // total_num_heads
+    )
     is_neox_style = True
     rope_theta = config.rope_theta
     max_position = config.max_position_embeddings
@@ -180,16 +203,16 @@ def test_mrope_torch_compile_tracing(model_name: str,
     ).to(device=device)
 
     # Generate test data
-    positions, query, key = generate_test_data(num_tokens, num_heads,
-                                               num_kv_heads, head_dim,
-                                               max_position, dtype, device)
+    positions, query, key = generate_test_data(
+        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
+    )
 
     # Create a wrapper that makes the in-place function appear functional
     def functional_forward_cuda(pos, q, k):
         """Wrapper that converts in-place operation to functional style
 
         CUDA Graph does not support in-place operations.
-        This wrapper creates working copies of the 
+        This wrapper creates working copies of the
         input tensors and modifies them.
         """
         q_work = q.clone()  # Create working copies
@@ -206,11 +229,13 @@ def test_mrope_torch_compile_tracing(model_name: str,
     )
 
     try:
-        compiled_forward_cuda = torch.compile(functional_forward_cuda,
-                                              fullgraph=True,
-                                              backend="inductor",
-                                              mode="reduce-overhead",
-                                              dynamic=False)
+        compiled_forward_cuda = torch.compile(
+            functional_forward_cuda,
+            fullgraph=True,
+            backend="inductor",
+            mode="reduce-overhead",
+            dynamic=False,
+        )
 
         # Run compiled version
         query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
@@ -225,25 +250,16 @@ def test_mrope_torch_compile_tracing(model_name: str,
         mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)
 
         # Verify results
-        torch.testing.assert_close(query_compiled_cuda,
-                                   query_cuda,
-                                   atol=atol,
-                                   rtol=rtol)
-        torch.testing.assert_close(key_compiled_cuda,
-                                   key_cuda,
-                                   atol=atol,
-                                   rtol=rtol)
-        torch.testing.assert_close(query_compiled_cuda,
-                                   query_native,
-                                   atol=atol,
-                                   rtol=rtol)
-        torch.testing.assert_close(key_compiled_cuda,
-                                   key_native,
-                                   atol=atol,
-                                   rtol=rtol)
+        torch.testing.assert_close(
+            query_compiled_cuda, query_cuda, atol=atol, rtol=rtol
+        )
+        torch.testing.assert_close(key_compiled_cuda, key_cuda, atol=atol, rtol=rtol)
+        torch.testing.assert_close(
+            query_compiled_cuda, query_native, atol=atol, rtol=rtol
+        )
+        torch.testing.assert_close(key_compiled_cuda, key_native, atol=atol, rtol=rtol)
 
         print("✓ forward_cuda successfully traced with torch.compile inductor")
 
     except Exception as e:
-        pytest.fail(
-            f"forward_cuda failed to trace with torch.compile inductor: {e}")
+        pytest.fail(f"forward_cuda failed to trace with torch.compile inductor: {e}")
diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py
index e18f6230dbceabb3f507e683b6bc25be47bb12d9..08fdd0e055eac6895cdb950b4a885a3a2b7fa618 100644
--- a/tests/kernels/core/test_permute_cols.py
+++ b/tests/kernels/core/test_permute_cols.py
@@ -8,11 +8,11 @@ from tests.kernels.utils import opcheck
 from vllm._custom_ops import permute_cols
 
 
-@pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
-@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 def test_permute_cols(shape, dtype):
     x = torch.randn(shape, dtype=dtype).cuda()
     perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
     opcheck(torch.ops._C.permute_cols, (x, perm))
     y = permute_cols(x, perm)
-    torch.testing.assert_close(y, x[:, perm])
\ No newline at end of file
+    torch.testing.assert_close(y, x[:, perm])
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index 1235e3222a781ec143d26886e79ca900a7debb60..c35ee5016ba05b2dcded7404c46d94d8217d0743 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from itertools import product
-from typing import Callable, Optional
 
 import pytest
 import torch
@@ -12,37 +12,40 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 
 IS_NEOX_STYLE = [True, False]
-DTYPES = [torch.half, torch.bfloat16, torch.float]
-HEAD_SIZES = [64, 80, 112, 120, 256]
+DTYPES = [torch.bfloat16, torch.float]
+HEAD_SIZES = [64, 80, 120, 256]
 ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
 NUM_HEADS = [17]  # Arbitrary values for testing
 BATCH_SIZES = [5]  # Arbitrary values for testing
 SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 USE_KEY = [True, False]
 
 
-def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
-                           head_size: int) -> tuple[int, ...]:
+def _get_flat_tensor_shape(
+    batch_size: int, seq_len: int, num_heads: int, head_size: int
+) -> tuple[int, ...]:
     return (batch_size, seq_len, num_heads * head_size)
 
 
 # For testing sliced tensors
-def _get_padded_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
-                             head_size: int) -> tuple[int, ...]:
+def _get_padded_tensor_shape(
+    batch_size: int, seq_len: int, num_heads: int, head_size: int
+) -> tuple[int, ...]:
     return (batch_size, seq_len, num_heads, head_size + 64)
 
 
-def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
-                            head_size: int) -> tuple[int, ...]:
+def _get_batch_tensor_shape(
+    batch_size: int, seq_len: int, num_heads: int, head_size: int
+) -> tuple[int, ...]:
     return (batch_size, seq_len, num_heads, head_size)
 
 
 TENSORS_SHAPES_FN = [
-    _get_batch_tensor_shape, _get_flat_tensor_shape, _get_padded_tensor_shape
+    _get_batch_tensor_shape,
+    _get_flat_tensor_shape,
+    _get_padded_tensor_shape,
 ]
 
 
@@ -65,7 +68,7 @@ def test_rotary_embedding(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
@@ -97,41 +100,63 @@ def test_rotary_embedding(
     ref_query, ref_key = rope.forward_native(positions, query, key)
     out_query, out_key = rope.forward(positions, query, key)
     # Compare the results.
-    torch.testing.assert_close(out_query,
-                               ref_query,
-                               atol=get_default_atol(out_query),
-                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(
+        out_query,
+        ref_query,
+        atol=get_default_atol(out_query),
+        rtol=get_default_rtol(out_query),
+    )
     if use_key:
-        torch.testing.assert_close(out_key,
-                                   ref_key,
-                                   atol=get_default_atol(out_key),
-                                   rtol=get_default_rtol(out_key))
+        torch.testing.assert_close(
+            out_key,
+            ref_key,
+            atol=get_default_atol(out_key),
+            rtol=get_default_rtol(out_key),
+        )
     else:
-        assert ref_key is None and out_key is None, \
-            "expected returned key to be None"
+        assert ref_key is None and out_key is None, "expected returned key to be None"
 
 
 @torch.inference_mode()
 def test_rope_module_cache():
     MAX_POSITIONS = [123, 1234]
     BASES = [10000, 1000000]
-    ROPE_SCALINGS = (None, {
-        "rope_type": "linear",
-        "factor": (1, )
-    }, {
-        "rope_type": "dynamic",
-        "factor": 1
-    })
-    settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
-                ROPE_SCALINGS, DTYPES)
+    ROPE_SCALINGS = (
+        None,
+        {"rope_type": "linear", "factor": (1,)},
+        {"rope_type": "dynamic", "factor": 1},
+    )
+    settings = (
+        HEAD_SIZES,
+        ROTARY_DIMS,
+        MAX_POSITIONS,
+        BASES,
+        IS_NEOX_STYLE,
+        ROPE_SCALINGS,
+        DTYPES,
+    )
     rope_setting_id_map: dict[str, int] = {}
     for setting in product(*settings):
-        head_size, rotary_dim, max_position, base, \
-            is_neox_stype, rope_scaling, dtype = setting
+        (
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        ) = setting
         if rotary_dim is None:
             rotary_dim = head_size
-        rope = get_rope(head_size, rotary_dim, max_position, base,
-                        is_neox_stype, rope_scaling, dtype)
+        rope = get_rope(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        )
         # different settings cannot share the same rope module
         assert id(rope) not in rope_setting_id_map.values()
         assert all(x.dtype == dtype for x in rope.buffers())
@@ -139,11 +164,25 @@ def test_rope_module_cache():
         rope_setting_id_map[str(setting)] = id(rope)
 
     for setting in product(*settings):
-        head_size, rotary_dim, max_position, base, \
-            is_neox_stype, rope_scaling, dtype = setting
+        (
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        ) = setting
         if rotary_dim is None:
             rotary_dim = head_size
-        rope = get_rope(head_size, rotary_dim, max_position, base,
-                        is_neox_stype, rope_scaling, dtype)
+        rope = get_rope(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        )
         # check if cache take effect
         assert id(rope) == rope_setting_id_map[str(setting)]
diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py
index 5857dd5ba3fad8b9d1146a6c25fcabf772df8e72..30c64e0bd72a7e13213d4ee4b5873386db4b4126 100644
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -4,8 +4,6 @@
 Tests for miscellaneous utilities
 """
 
-from typing import Optional
-
 import pytest
 import torch
 
@@ -13,17 +11,20 @@ from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 
 
-def rotary_embedding_opcheck(rot,
-                             positions: torch.Tensor,
-                             query: torch.Tensor,
-                             key: Optional[torch.Tensor] = None):
+def rotary_embedding_opcheck(
+    rot,
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor | None = None,
+):
     cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
 
     # ops.rotary_embedding() is a in-place operation
     # that updates the query and key tensors.
-    opcheck(torch.ops._C.rotary_embedding,
-            (positions, query, key, rot.head_size, cos_sin_cache,
-             rot.is_neox_style))
+    opcheck(
+        torch.ops._C.rotary_embedding,
+        (positions, query, key, rot.head_size, cos_sin_cache, rot.is_neox_style),
+    )
 
 
 @pytest.mark.parametrize("device", ["cuda"])
@@ -34,26 +35,30 @@ def rotary_embedding_opcheck(rot,
 @pytest.mark.parametrize("seq_len", [11, 1024])
 @pytest.mark.parametrize("use_key", [True, False])
 @pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
-def test_rotary_embedding_opcheck(dist_init, device, max_position,
-                                  is_neox_style, rotary_dim, head_size,
-                                  seq_len, use_key, head_stride_is_contiguous):
+def test_rotary_embedding_opcheck(
+    dist_init,
+    device,
+    max_position,
+    is_neox_style,
+    rotary_dim,
+    head_size,
+    seq_len,
+    use_key,
+    head_stride_is_contiguous,
+):
     batch_size = 1
     base = 10000
     num_heads = 7
-    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
-                          is_neox_style, torch.float32)
+    rot = RotaryEmbedding(
+        head_size, rotary_dim, max_position, base, is_neox_style, torch.float32
+    )
 
-    positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device=device)
+    positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
     head_stride = head_size + (64 if head_stride_is_contiguous else 0)
 
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads,
-                        head_stride,
-                        dtype=torch.float32,
-                        device=device)
+    query = torch.randn(
+        batch_size, seq_len, num_heads, head_stride, dtype=torch.float32, device=device
+    )
     key = torch.randn_like(query) if use_key else None
     query = query[..., :head_size]
     key = key[..., :head_size] if use_key else None
@@ -64,5 +69,8 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
     # [..., num_heads * head_dim] shape/layout
     if head_stride_is_contiguous:
         rotary_embedding_opcheck(
-            rot, positions, query.flatten(start_dim=-2),
-            key.flatten(start_dim=-2) if use_key else None)
+            rot,
+            positions,
+            query.flatten(start_dim=-2),
+            key.flatten(start_dim=-2) if use_key else None,
+        )
diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py
index c71215e4c646ba64a62fff37cbf7ab0169df4f08..2690346af4d30b9fa5a967fa98ed12a770d21533 100644
--- a/tests/kernels/core/test_uva.py
+++ b/tests/kernels/core/test_uva.py
@@ -3,22 +3,17 @@
 import pytest
 import torch
 
-from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
+from vllm.utils.platform_utils import is_uva_available
+from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
 
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 
 @pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_cpu_write(device):
     torch.set_default_device(device)
-    cpu_tensor = torch.zeros(10,
-                             10,
-                             device="cpu",
-                             pin_memory=True,
-                             dtype=torch.int32)
+    cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
     cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
     assert cuda_view.device.type == "cuda"
 
@@ -40,11 +35,7 @@ def test_cpu_write(device):
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_gpu_write(device):
     torch.set_default_device(device)
-    cpu_tensor = torch.zeros(10,
-                             10,
-                             device="cpu",
-                             pin_memory=True,
-                             dtype=torch.int32)
+    cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
     cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
     assert cuda_view.device.type == "cuda"
 
@@ -59,4 +50,4 @@ def test_gpu_write(device):
 
     assert cpu_tensor[0, 0] == 2
     assert cpu_tensor[2, 3] == 4
-    assert cpu_tensor[4, 5] == -2
\ No newline at end of file
+    assert cpu_tensor[4, 5] == -2
diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py
index 411bd9e904b044eec3ce8271738905451b0d42f4..4647b97c477183f0ec8864fd28ad2176fd4fdf9c 100644
--- a/tests/kernels/mamba/test_causal_conv1d.py
+++ b/tests/kernels/mamba/test_causal_conv1d.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -10,18 +9,20 @@ from einops import rearrange
 
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
 from vllm.platforms import current_platform
 
 
 def causal_conv1d_ref(
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    initial_states: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
+    initial_states: torch.Tensor | None = None,
     return_final_states: bool = False,
-    final_states_out: Optional[torch.Tensor] = None,
-    activation: Optional[str] = "silu",
+    final_states_out: torch.Tensor | None = None,
+    activation: str | None = "silu",
 ):
     """
     x: (batch, dim, seqlen)
@@ -39,18 +40,15 @@ def causal_conv1d_ref(
     seqlen = x.shape[-1]
     dim, width = weight.shape
     if initial_states is None:
-        out = F.conv1d(x,
-                       weight.unsqueeze(1),
-                       bias,
-                       padding=width - 1,
-                       groups=dim)
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
     else:
         x = torch.cat([initial_states, x], dim=-1)
         out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
     out = out[..., :seqlen]
     if return_final_states:
         final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
-            dtype_in)  # (batch, dim, width - 1)
+            dtype_in
+        )  # (batch, dim, width - 1)
         if final_states_out is not None:
             final_states_out.copy_(final_states)
         else:
@@ -59,12 +57,9 @@ def causal_conv1d_ref(
     return (out, None) if not return_final_states else (out, final_states_out)
 
 
-def causal_conv1d_update_ref(x,
-                             conv_state,
-                             weight,
-                             bias=None,
-                             activation=None,
-                             cache_seqlens=None):
+def causal_conv1d_update_ref(
+    x, conv_state, weight, bias=None, activation=None, cache_seqlens=None
+):
     """
     x: (batch, dim) or (batch, dim, seqlen)
     conv_state: (batch, dim, state_len), where state_len >= width - 1
@@ -91,24 +86,25 @@ def causal_conv1d_update_ref(x,
     assert weight.shape == (dim, width)
     if cache_seqlens is None:
         x_new = torch.cat([conv_state, x], dim=-1).to(
-            weight.dtype)  # (batch, dim, state_len + seqlen)
+            weight.dtype
+        )  # (batch, dim, state_len + seqlen)
         conv_state.copy_(x_new[:, :, -state_len:])
     else:
         width_idx = torch.arange(
-            -(width - 1), 0, dtype=torch.long,
-            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
-        width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(
-            -1, dim, -1)
-        x_new = torch.cat([conv_state.gather(2, width_idx), x],
-                          dim=-1).to(weight.dtype)
-        copy_idx = torch.arange(
-            seqlen, dtype=torch.long,
-            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
-        copy_idx = torch.remainder(copy_idx,
-                                   state_len).unsqueeze(1).expand(-1, dim, -1)
+            -(width - 1), 0, dtype=torch.long, device=x.device
+        ).unsqueeze(0) + cache_seqlens.unsqueeze(1)
+        width_idx = (
+            torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        )
+        x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype)
+        copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze(
+            0
+        ) + cache_seqlens.unsqueeze(1)
+        copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
         conv_state.scatter_(2, copy_idx, x)
-    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0,
-                   groups=dim)[:, :, -seqlen:]
+    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[
+        :, :, -seqlen:
+    ]
     if unsqueeze:
         out = out.squeeze(-1)
     return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
@@ -117,15 +113,17 @@ def causal_conv1d_update_ref(x,
 @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
 @pytest.mark.parametrize("silu_activation", [True])
 @pytest.mark.parametrize("has_bias", [True])
-def causal_conv1d_opcheck_fn(x: torch.Tensor,
-                             weight: torch.Tensor,
-                             bias: Optional[torch.Tensor] = None,
-                             cu_seq_len: Optional[torch.Tensor] = None,
-                             cache_indices: Optional[torch.Tensor] = None,
-                             has_initial_state: Optional[torch.Tensor] = None,
-                             conv_states: Optional[torch.Tensor] = None,
-                             activation: Optional[str] = "silu",
-                             pad_slot_id: int = PAD_SLOT_ID):
+def causal_conv1d_opcheck_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    cu_seq_len: torch.Tensor | None = None,
+    cache_indices: torch.Tensor | None = None,
+    has_initial_state: torch.Tensor | None = None,
+    conv_states: torch.Tensor | None = None,
+    activation: str | None = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+):
     """
     x: (batch, dim, seqlen)
     weight: (dim, width)
@@ -150,8 +148,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
 @pytest.mark.parametrize("seqlen", [1])
 @pytest.mark.parametrize("width", [4])
 @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
-def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
-                              itype):
+def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, itype):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
@@ -167,23 +164,26 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
     bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
     conv_state_ref = conv_state.detach().clone()
     activation = None if not silu_activation else "silu"
-    out = causal_conv1d_update(x,
-                               conv_state,
-                               weight,
-                               bias,
-                               activation=activation)
-    out_ref = causal_conv1d_update_ref(x_ref,
-                                       conv_state_ref,
-                                       weight,
-                                       bias,
-                                       activation=activation)
+
+    conv_state_indices = torch.arange(batch, dtype=torch.int32, device=device)
+
+    out = causal_conv1d_update(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation=activation,
+        conv_state_indices=conv_state_indices,
+    )
+    out_ref = causal_conv1d_update_ref(
+        x_ref, conv_state_ref, weight, bias, activation=activation
+    )
 
     assert torch.equal(conv_state, conv_state_ref)
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
 
 
-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("silu_activation", [False, True])
 @pytest.mark.parametrize("has_bias", [False, True])
 @pytest.mark.parametrize("seqlen", [1, 3])
@@ -192,9 +192,9 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
 # tests correctness in case subset of the sequences are padded
 @pytest.mark.parametrize("with_padding", [True, False])
 @pytest.mark.parametrize("batch_size", [3])
-def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
-                                                width, seqlen, has_bias,
-                                                silu_activation, itype):
+def test_causal_conv1d_update_with_batch_gather(
+    batch_size, with_padding, dim, width, seqlen, has_bias, silu_activation, itype
+):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
@@ -209,31 +209,30 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
     total_entries = 10 * batch_size
 
     # x will be (batch, dim, seqlen) with contiguous along dim-axis
-    x = torch.randn(padded_batch_size, seqlen, dim, device=device,
-                    dtype=itype).transpose(1, 2)
+    x = torch.randn(
+        padded_batch_size, seqlen, dim, device=device, dtype=itype
+    ).transpose(1, 2)
 
     x_ref = x.clone()
 
     conv_state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+        dtype=torch.int32, device=device
+    )
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
     unused_states_bool[conv_state_indices] = False
-    padded_state_indices = torch.concat([
-        conv_state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
-    ],
-                                        dim=0)
+    padded_state_indices = torch.concat(
+        [
+            conv_state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=0,
+    )
 
     # conv_state will be (cache_lines, dim, state_len)
     # with contiguous along dim-axis
-    conv_state = torch.randn(total_entries,
-                             width - 1,
-                             dim,
-                             device=device,
-                             dtype=itype).transpose(1, 2)
+    conv_state = torch.randn(
+        total_entries, width - 1, dim, device=device, dtype=itype
+    ).transpose(1, 2)
 
     conv_state_for_padding_test = conv_state.clone()
 
@@ -242,22 +241,23 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
     conv_state_ref = conv_state[conv_state_indices, :].detach().clone()
     activation = None if not silu_activation else "silu"
 
-    out = causal_conv1d_update(x,
-                               conv_state,
-                               weight,
-                               bias,
-                               activation=activation,
-                               conv_state_indices=padded_state_indices,
-                               pad_slot_id=PAD_SLOT_ID)
-    out_ref = causal_conv1d_update_ref(x_ref[:batch_size],
-                                       conv_state_ref,
-                                       weight,
-                                       bias,
-                                       activation=activation)
+    out = causal_conv1d_update(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation=activation,
+        conv_state_indices=padded_state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+    )
+    out_ref = causal_conv1d_update_ref(
+        x_ref[:batch_size], conv_state_ref, weight, bias, activation=activation
+    )
 
     assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
-    assert torch.equal(conv_state[unused_states_bool],
-                       conv_state_for_padding_test[unused_states_bool])
+    assert torch.equal(
+        conv_state[unused_states_bool], conv_state_for_padding_test[unused_states_bool]
+    )
     assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
 
 
@@ -265,12 +265,13 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
 @pytest.mark.parametrize("silu_activation", [True])
 @pytest.mark.parametrize("has_bias", [True])
 @pytest.mark.parametrize("width", [4])
-@pytest.mark.parametrize('seqlen', [8, 30, 249, 2049, 4096])
-@pytest.mark.parametrize('dim', [64, 4096])
-@pytest.mark.parametrize('with_padding', [True, False])
-@pytest.mark.parametrize('batch', [4, 10])
-def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
-                              has_bias, silu_activation, itype):
+@pytest.mark.parametrize("seqlen", [8, 249, 4096])
+@pytest.mark.parametrize("dim", [64, 4096])
+@pytest.mark.parametrize("with_padding", [True, False])
+@pytest.mark.parametrize("batch", [4, 10])
+def test_causal_conv1d_varlen(
+    batch, with_padding, dim, seqlen, width, has_bias, silu_activation, itype
+):
     device = "cuda"
     torch.cuda.empty_cache()
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
@@ -288,19 +289,19 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
 
     seqlens.append(
         torch.diff(
-            torch.cat(
-                [torch.tensor([-1]), eos_pos,
-                 torch.tensor([seqlen - 1])])).tolist())
+            torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])
+        ).tolist()
+    )
     assert sum(seqlens[-1]) == seqlen
     assert all(s > 0 for s in seqlens[-1])
 
     total_entries = batch_size * 10
     cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
-    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
-                          dim=0)
+    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0)
     x = rearrange(
         torch.randn(1, seqlen, 4096 + dim + 64, device=device, dtype=itype),
-        "b s d -> b d s")[:, 4096:4096 + dim, :]
+        "b s d -> b d s",
+    )[:, 4096 : 4096 + dim, :]
 
     weight = torch.randn(dim, width, device=device, dtype=itype)
 
@@ -309,34 +310,34 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
     weight_ref = weight.clone()
     bias_ref = bias.clone() if bias is not None else None
     activation = None if not silu_activation else "silu"
-    final_states = torch.randn(total_entries,
-                               width - 1,
-                               dim,
-                               device=x.device,
-                               dtype=x.dtype).transpose(1, 2)
+    final_states = torch.randn(
+        total_entries, width - 1, dim, device=x.device, dtype=x.dtype
+    ).transpose(1, 2)
     final_states_ref = final_states.clone()
-    has_initial_states = torch.randint(0,
-                                       2, (cumsum.shape[0] - 1, ),
-                                       dtype=torch.bool,
-                                       device=x.device)
-    state_indices = torch.randperm(total_entries,
-                                   dtype=torch.int32,
-                                   device=x.device)[:batch_size]
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
-    ],
-                                        dim=-1)
-    out = causal_conv1d_fn(x.squeeze(0),
-                           weight,
-                           bias=bias,
-                           conv_states=final_states,
-                           query_start_loc=cumsum.cuda(),
-                           cache_indices=padded_state_indices,
-                           has_initial_state=has_initial_states,
-                           activation=activation,
-                           pad_slot_id=PAD_SLOT_ID)
+    has_initial_states = torch.randint(
+        0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=x.device
+    )
+    state_indices = torch.randperm(total_entries, dtype=torch.int32, device=x.device)[
+        :batch_size
+    ]
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=-1,
+    )
+    out = causal_conv1d_fn(
+        x.squeeze(0),
+        weight,
+        bias=bias,
+        conv_states=final_states,
+        query_start_loc=cumsum.cuda(),
+        cache_indices=padded_state_indices,
+        has_initial_state=has_initial_states,
+        activation=activation,
+        pad_slot_id=PAD_SLOT_ID,
+    )
 
     out_ref = []
     out_ref_b = []
@@ -353,16 +354,20 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
                 bias_ref,
                 activation=activation,
                 return_final_states=True,
-                final_states_out=final_states_ref[
-                    padded_state_indices[i]].unsqueeze(0),
-                initial_states=final_states_ref[padded_state_indices[i]].
-                unsqueeze(0) if has_initial_states[i] else None))
+                final_states_out=final_states_ref[padded_state_indices[i]].unsqueeze(0),
+                initial_states=final_states_ref[padded_state_indices[i]].unsqueeze(0)
+                if has_initial_states[i]
+                else None,
+            )
+        )
     out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2))
     out_ref_tensor = torch.cat(out_ref, dim=0)
 
-    assert torch.allclose(final_states[state_indices],
-                          final_states_ref[state_indices],
-                          rtol=rtol,
-                          atol=atol)
-    unpadded_out = out[:, :out_ref_tensor.shape[-1]]
+    assert torch.allclose(
+        final_states[state_indices],
+        final_states_ref[state_indices],
+        rtol=rtol,
+        atol=atol,
+    )
+    unpadded_out = out[:, : out_ref_tensor.shape[-1]]
     assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py
index 16c310726ad16dc39dc303af6d4bea0a2440d445..6fca33acd48a3f9e465b7049fbad6c0d7616b6ed 100644
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -7,11 +7,13 @@ import pytest
 import torch
 
 from tests.utils import multi_gpu_test
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 
 @multi_gpu_test(num_gpus=2)
@@ -23,15 +25,15 @@ from vllm.utils import update_environment_variables
         (64, 1),
         (64, 2),
         (64, 4),  # hidden_size be divisible by num_gpus
-        (100, 5),  # and n_groups must divide hidden_size
-    ])
+    ],
+)
 @pytest.mark.parametrize("dtype", [torch.float16])
 def test_mixer2_gated_norm_multi_gpu(
     batch_size: int,
     seq_len: int,
     hidden_size_n_groups: tuple[int, int],
     dtype: torch.dtype,
-    device: str = 'cuda',
+    device: str = "cuda",
 ):
     hidden_size, n_groups = hidden_size_n_groups
     num_processes = 2
@@ -39,17 +41,19 @@ def test_mixer2_gated_norm_multi_gpu(
     def run_torch_spawn(fn, nprocs):
         # need to use torch.mp.spawn otherwise will have problems with
         # torch.distributed and cuda
-        torch.multiprocessing.spawn(fn,
-                                    args=(
-                                        num_processes,
-                                        batch_size,
-                                        seq_len,
-                                        hidden_size,
-                                        n_groups,
-                                        dtype,
-                                        device,
-                                    ),
-                                    nprocs=nprocs)
+        torch.multiprocessing.spawn(
+            fn,
+            args=(
+                num_processes,
+                batch_size,
+                seq_len,
+                hidden_size,
+                n_groups,
+                dtype,
+                device,
+            ),
+            nprocs=nprocs,
+        )
 
     run_torch_spawn(mixer2_gated_norm_tensor_parallel, 2)
 
@@ -71,20 +75,22 @@ def mixer2_gated_norm_tensor_parallel(
     torch.set_default_device(device)
     torch.set_default_dtype(dtype)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )
 
     # initialize distributed
     init_distributed_environment()
     initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # create random weights an inputs
-    weight = torch.rand((hidden_size, ), dtype=dtype, device=device)
+    weight = torch.rand((hidden_size,), dtype=dtype, device=device)
     hidden_states = torch.randn(batch_size, seq_len, hidden_size)
     gate_states = torch.randn(batch_size, seq_len, hidden_size)
 
@@ -97,14 +103,18 @@ def mixer2_gated_norm_tensor_parallel(
 
     # create gated-norm without TP to compute reference
     # - utilize mock patching to disable TP when
-    with (unittest.mock.patch(
+    with (
+        unittest.mock.patch(
             "vllm.model_executor.layers.mamba.mamba_mixer2."
             "get_tensor_model_parallel_world_size",
-            return_value=1),
-          unittest.mock.patch(
-              "vllm.model_executor.layers.mamba.mamba_mixer2."
-              "get_tensor_model_parallel_rank",
-              return_value=0)):
+            return_value=1,
+        ),
+        unittest.mock.patch(
+            "vllm.model_executor.layers.mamba.mamba_mixer2."
+            "get_tensor_model_parallel_rank",
+            return_value=0,
+        ),
+    ):
         mixer_single_gpu = Mixer2RMSNormGated(
             full_hidden_size=hidden_size,
             full_n_groups=n_groups,
@@ -115,12 +125,13 @@ def mixer2_gated_norm_tensor_parallel(
     # generate and compare
     N = hidden_size // world_size
     output = mixer(
-        hidden_states[..., local_rank * N:(local_rank + 1) * N],
-        gate_states[..., local_rank * N:(local_rank + 1) * N],
+        hidden_states[..., local_rank * N : (local_rank + 1) * N],
+        gate_states[..., local_rank * N : (local_rank + 1) * N],
     )
     ref_output = mixer_single_gpu(hidden_states, gate_states)
-    torch.testing.assert_close(output,
-                               ref_output[...,
-                                          local_rank * N:(local_rank + 1) * N],
-                               atol=5e-3,
-                               rtol=1e-3)
+    torch.testing.assert_close(
+        output,
+        ref_output[..., local_rank * N : (local_rank + 1) * N],
+        atol=5e-3,
+        rtol=1e-3,
+    )
diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py
index 4c32ae81b34c54023320015805acbf587cb7d043..98edc959957d096aeb1e7a5d48e8964cb4acde65 100644
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -10,20 +10,15 @@ from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops  # noqa: F401
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_scan_fn, selective_state_update)
+    selective_scan_fn,
+    selective_state_update,
+)
 from vllm.platforms import current_platform
 
 
-def selective_state_update_ref(state,
-                               x,
-                               dt,
-                               A,
-                               B,
-                               C,
-                               D=None,
-                               z=None,
-                               dt_bias=None,
-                               dt_softplus=False):
+def selective_state_update_ref(
+    state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False
+):
     """
     Argument:
         state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
@@ -73,16 +68,17 @@ def selective_state_update_ref(state,
         assert dt_bias.shape == (nheads, dim)
         dt = dt + dt_bias
     dt = F.softplus(dt) if dt_softplus else dt
-    dA = torch.exp(rearrange(dt, "b h d -> b h d 1") *
-                   A)  # (batch, nheads, dim, dstate)
-    B = repeat(B, "b g n -> b (g h) n",
-               h=nheads // ngroups)  # (batch, nheads, dstate)
-    C = repeat(C, "b g n -> b (g h) n",
-               h=nheads // ngroups)  # (batch, nheads, dstate)
+    dA = torch.exp(
+        rearrange(dt, "b h d -> b h d 1") * A
+    )  # (batch, nheads, dim, dstate)
+    B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups)  # (batch, nheads, dstate)
+    C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups)  # (batch, nheads, dstate)
     dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
-        B, "b h n -> b h 1 n")  # (batch, nheads, dim, dstate)
-    state.copy_(state * dA +
-                dB * rearrange(x, "b h d -> b h d 1"))  # (batch, dim, dstate
+        B, "b h n -> b h 1 n"
+    )  # (batch, nheads, dim, dstate)
+    state.copy_(
+        state * dA + dB * rearrange(x, "b h d -> b h d 1")
+    )  # (batch, dim, dstate
     out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
     if D is not None:
         out += (x * D).to(out.dtype)
@@ -92,18 +88,20 @@ def selective_state_update_ref(state,
     return out
 
 
-def selective_scan_ref(u,
-                       delta,
-                       A,
-                       B,
-                       C,
-                       D=None,
-                       z=None,
-                       delta_bias=None,
-                       delta_softplus=False,
-                       return_last_state=False,
-                       prev_state=None,
-                       final_state_out=None):
+def selective_scan_ref(
+    u,
+    delta,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    delta_bias=None,
+    delta_softplus=False,
+    return_last_state=False,
+    prev_state=None,
+    final_state_out=None,
+):
     """
     u: r(B D L)
     delta: r(B D L)
@@ -132,26 +130,26 @@ def selective_scan_ref(u,
     C = C.float()
     x = A.new_zeros((batch, dim, dstate)) if prev_state is None else prev_state
     ys = []
-    deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
+    deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
     if not is_variable_B:
-        deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
+        deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
     else:
         if B.dim() == 3:
-            deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
+            deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
         else:
             B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
-            deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
+            deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
     if is_variable_C and C.dim() == 4:
         C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
     for i in range(u.shape[2]):
         x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
         if not is_variable_C:
-            y = torch.einsum('bdn,dn->bd', x, C)
+            y = torch.einsum("bdn,dn->bd", x, C)
         else:
             if C.dim() == 3:
-                y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
+                y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
             else:
-                y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
+                y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
         if i == u.shape[2] - 1:
             if final_state_out is None:
                 final_state_out = x
@@ -166,20 +164,26 @@ def selective_scan_ref(u,
     return out if not return_last_state else (out, final_state_out)
 
 
-def selective_scan_opcheck_fn(u,
-                              delta,
-                              A,
-                              B,
-                              C,
-                              D=None,
-                              z=None,
-                              delta_bias=None,
-                              delta_softplus=False,
-                              cu_seq_len=None,
-                              cache_indices=None,
-                              has_initial_state=None,
-                              ssm_states=None,
-                              pad_slot_id=PAD_SLOT_ID):
+def selective_scan_opcheck_fn(
+    u,
+    delta,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    delta_bias=None,
+    delta_softplus=False,
+    cu_seq_len=None,
+    cache_indices=None,
+    has_initial_state=None,
+    ssm_states=None,
+    pad_slot_id=PAD_SLOT_ID,
+    block_size=2048,
+    block_idx_first_scheduled_token=None,
+    block_idx_last_scheduled_token=None,
+    initial_state_idx=None,
+):
     """if return_last_state is True, returns (out, last_state)
     last_state has shape (batch, dim, dstate).
     """
@@ -206,30 +210,59 @@ def selective_scan_opcheck_fn(u,
 
     # Disable test_autograd_registration for now as it seems to trigger
     # a bogus error.
-    opcheck(torch.ops._C.selective_scan_fwd,
-            (u, delta, A, B, C, D, z, delta_bias, delta_softplus, cu_seq_len,
-             cache_indices, has_initial_state, ssm_states, pad_slot_id),
-            test_utils=["test_schema", "test_faketensor"])
-
-
-@pytest.mark.parametrize('wtype', [torch.float32])
-@pytest.mark.parametrize('itype',
-                         [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
-@pytest.mark.parametrize('has_delta_bias', [True])
-@pytest.mark.parametrize('delta_softplus', [True])
-@pytest.mark.parametrize('has_z', [True])
-@pytest.mark.parametrize('has_D', [True])
+    opcheck(
+        torch.ops._C.selective_scan_fwd,
+        (
+            u,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            z,
+            delta_bias,
+            delta_softplus,
+            cu_seq_len,
+            cache_indices,
+            has_initial_state,
+            ssm_states,
+            pad_slot_id,
+            block_size,
+            block_idx_first_scheduled_token,
+            block_idx_last_scheduled_token,
+            initial_state_idx,
+        ),
+        test_utils=["test_schema", "test_faketensor"],
+    )
+
+
+@pytest.mark.parametrize("wtype", [torch.float32])
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("seqlen", [128, 1024, 4096])
+@pytest.mark.parametrize("has_delta_bias", [True])
+@pytest.mark.parametrize("delta_softplus", [True])
+@pytest.mark.parametrize("has_z", [True])
+@pytest.mark.parametrize("has_D", [True])
 @pytest.mark.parametrize("varBC_groups", [1, 2])
 @pytest.mark.parametrize("is_variable_C", [True])
 @pytest.mark.parametrize("is_variable_B", [True])
-@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
-def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
-                        has_z, has_delta_bias, delta_softplus, seqlen, itype,
-                        wtype, scan_chunks):
+@pytest.mark.parametrize("scan_chunks", [1, 3])
+def test_selective_scan(
+    is_variable_B,
+    is_variable_C,
+    varBC_groups,
+    has_D,
+    has_z,
+    has_delta_bias,
+    delta_softplus,
+    seqlen,
+    itype,
+    wtype,
+    scan_chunks,
+):
     if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
         pytest.skip()  # This config is not applicable
-    device = 'cuda'
+    device = "cuda"
     rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
         rtol, atol = 3e-2, 5e-2
@@ -242,7 +275,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
     batch_size = 1
     dim = 4
     dstate = 8
-    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
+    A = -0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)
     A_ref = A.clone()
     if not is_variable_B:
         B_shape = [dim, dstate]
@@ -250,9 +283,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
         B_shape = [batch_size, dstate, seqlen]
     else:
         B_shape = [batch_size, varBC_groups, dstate, seqlen]
-    B = torch.randn(B_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_B else itype)
+    B = torch.randn(B_shape, device=device, dtype=wtype if not is_variable_B else itype)
     B_ref = B.clone()
     if not is_variable_C:
         C_shape = [dim, dstate]
@@ -260,27 +291,27 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
         C_shape = [batch_size, dstate, seqlen]
     else:
         C_shape = [batch_size, varBC_groups, dstate, seqlen]
-    C = torch.randn(C_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_C else itype)
+    C = torch.randn(C_shape, device=device, dtype=wtype if not is_variable_C else itype)
     C_ref = C.clone()
     D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
     D_ref = D.clone()
-    z = torch.randn(batch_size, dim, seqlen, device=device,
-                    dtype=itype) if has_z else None
+    z = (
+        torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
+        if has_z
+        else None
+    )
     z_ref = z.clone() if has_z else None
-    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
-                  ) if has_delta_bias else None
+    delta_bias = (
+        (0.5 * torch.rand(dim, device=device, dtype=torch.float32))
+        if has_delta_bias
+        else None
+    )
     u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
     u_ref = u.clone()
-    delta = (0.5 *
-             torch.rand(batch_size, dim, seqlen, device=device, dtype=itype))
+    delta = 0.5 * torch.rand(batch_size, dim, seqlen, device=device, dtype=itype)
     delta_ref = delta.clone()
     state_shape = (batch_size, u.shape[1], int(A.shape[1]))
-    state = torch.randn(state_shape,
-                        device=u.device,
-                        dtype=itype,
-                        requires_grad=False)
+    state = torch.randn(state_shape, device=u.device, dtype=itype, requires_grad=False)
     state_ref = state.clone()
     out = None
     out_ref = None
@@ -312,9 +343,15 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
             z=_z,
             delta_bias=delta_bias,
             delta_softplus=delta_softplus,
-            has_initial_state=torch.ones(batch_size,
-                                         device=u.device,
-                                         dtype=torch.bool) if c > 0 else None)
+            has_initial_state=torch.ones(batch_size, device=u.device, dtype=torch.bool)
+            if c > 0
+            else None,
+            pad_slot_id=PAD_SLOT_ID,
+            block_size=2048,
+            block_idx_first_scheduled_token=None,
+            block_idx_last_scheduled_token=None,
+            initial_state_idx=None,
+        )
         outs.append(out)
     if len(outs) > 1:
         out = torch.cat(outs, dim=-1)
@@ -329,29 +366,32 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
         z=z_ref,
         delta_bias=delta_bias,
         delta_softplus=delta_softplus,
-        return_last_state=True)
+        return_last_state=True,
+    )
 
     assert out is not None and out_ref is not None
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
     assert state is not None and state_ref is not None
     assert torch.allclose(state, state_ref.to(itype), rtol=rtol, atol=atol)
 
-    selective_scan_opcheck_fn(u,
-                              delta,
-                              A,
-                              B,
-                              C,
-                              D,
-                              z,
-                              delta_bias=delta_bias,
-                              delta_softplus=delta_softplus,
-                              ssm_states=state)
+    selective_scan_opcheck_fn(
+        u,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias=delta_bias,
+        delta_softplus=delta_softplus,
+        ssm_states=state,
+        block_size=2048,
+    )
 
 
-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("has_z", [False, True])
-@pytest.mark.parametrize("dstate", [16, 32, 64])
+@pytest.mark.parametrize("dstate", [16, 64])
 @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
 def test_selective_state_update(dim, dstate, has_z, itype):
     device = "cuda"
@@ -374,52 +414,47 @@ def test_selective_state_update(dim, dstate, has_z, itype):
     D = torch.randn(dim, device=device)
     z = torch.randn_like(x) if has_z else None
     state_ref = state.detach().clone()
-    selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=D,
-                           z=z,
-                           dt_bias=dt_bias,
-                           dt_softplus=True,
-                           out=out)
-    out_ref = selective_state_update_ref(state_ref,
-                                         x,
-                                         dt,
-                                         A,
-                                         B,
-                                         C,
-                                         D=D,
-                                         z=z,
-                                         dt_bias=dt_bias,
-                                         dt_softplus=True)
+    selective_state_update(
+        state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True, out=out
+    )
+    out_ref = selective_state_update_ref(
+        state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True
+    )
 
     assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
 
 
-@pytest.mark.parametrize('wtype', [torch.float32])
-@pytest.mark.parametrize('itype', [torch.float32])
-@pytest.mark.parametrize('seqlen', [1, 128, 129, 256, 512, 1024, 2048, 4096])
+@pytest.mark.parametrize("wtype", [torch.float32])
+@pytest.mark.parametrize("itype", [torch.float32])
+@pytest.mark.parametrize("seqlen", [1, 256, 1024, 4096])
 @pytest.mark.parametrize("return_last_state", [True])
-@pytest.mark.parametrize('has_delta_bias', [True])
-@pytest.mark.parametrize('delta_softplus', [True])
-@pytest.mark.parametrize('has_z', [True])
-@pytest.mark.parametrize('has_D', [True])
+@pytest.mark.parametrize("has_delta_bias", [True])
+@pytest.mark.parametrize("delta_softplus", [True])
+@pytest.mark.parametrize("has_z", [True])
+@pytest.mark.parametrize("has_D", [True])
 @pytest.mark.parametrize("varBC_groups", [1, 2])
 @pytest.mark.parametrize("is_variable_C", [True])
 @pytest.mark.parametrize("is_variable_B", [True])
 # tests correctness in case subset of the sequences are padded
 @pytest.mark.parametrize("with_padding", [False, True])
-def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
-                               varBC_groups, has_D, has_z, has_delta_bias,
-                               delta_softplus, return_last_state, seqlen,
-                               itype, wtype):
+def test_selective_scan_varlen(
+    with_padding,
+    is_variable_B,
+    is_variable_C,
+    varBC_groups,
+    has_D,
+    has_z,
+    has_delta_bias,
+    delta_softplus,
+    return_last_state,
+    seqlen,
+    itype,
+    wtype,
+):
     if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
         pytest.skip()  # This config is not applicable
-    device = 'cuda'
+    device = "cuda"
     rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
         rtol, atol = 3e-2, 5e-2
@@ -443,72 +478,79 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
     eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
     seqlens.append(
         torch.diff(
-            torch.cat(
-                [torch.tensor([-1]), eos_pos,
-                 torch.tensor([seqlen - 1])])).tolist())
+            torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])
+        ).tolist()
+    )
 
     assert sum(seqlens[-1]) == seqlen
     assert all(s > 0 for s in seqlens[-1])
 
     total_entries = batch_size * 10
     cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
-    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
-                          dim=0).cuda()
+    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0).cuda()
 
     dim = 4
     dstate = 8
-    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
+    A = -0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)
     A_ref = A.clone()
     B_shape = [varBC_groups, dstate, seqlen]
-    B = torch.randn(B_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_B else itype)
+    B = torch.randn(B_shape, device=device, dtype=wtype if not is_variable_B else itype)
     B_ref = B.clone()
     C_shape = [varBC_groups, dstate, seqlen]
-    C = torch.randn(C_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_C else itype)
+    C = torch.randn(C_shape, device=device, dtype=wtype if not is_variable_C else itype)
     C_ref = C.clone()
     D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
     D_ref = D.clone()
     z = torch.randn(dim, seqlen, device=device, dtype=itype)
     z_ref = z.clone()
-    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
-                  ) if has_delta_bias else None
+    delta_bias = (
+        (0.5 * torch.rand(dim, device=device, dtype=torch.float32))
+        if has_delta_bias
+        else None
+    )
     u = torch.randn(dim, seqlen, device=device, dtype=itype)
     u_ref = u.clone()
-    delta = (0.5 * torch.rand(dim, seqlen, device=device, dtype=itype))
+    delta = 0.5 * torch.rand(dim, seqlen, device=device, dtype=itype)
     delta_ref = delta.clone()
     out = None
     out_ref = None
 
     prev_state_shape = (total_entries, u.shape[0], int(A.shape[1]))
-    prev_state = torch.randn(prev_state_shape,
-                             device=u.device,
-                             dtype=itype,
-                             requires_grad=False)
+    prev_state = torch.randn(
+        prev_state_shape, device=u.device, dtype=itype, requires_grad=False
+    )
     prev_state_ref = prev_state.clone()
-    state_indices = torch.randperm(total_entries,
-                                   dtype=torch.int32,
-                                   device=u.device)[:batch_size]
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+    state_indices = torch.randperm(total_entries, dtype=torch.int32, device=u.device)[
+        :batch_size
+    ]
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
     unused_states_bool[state_indices] = False
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
-    ],
-                                        dim=-1)
-
-    has_initial_state = torch.randint(0,
-                                      2, (cumsum.shape[0] - 1, ),
-                                      dtype=torch.bool,
-                                      device=u.device)
-    out = selective_scan_fn(u, prev_state, delta, A, B, C, D, z, delta_bias,
-                            delta_softplus, cumsum, padded_state_indices,
-                            has_initial_state)
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=-1,
+    )
+
+    has_initial_state = torch.randint(
+        0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=u.device
+    )
+    out = selective_scan_fn(
+        u,
+        prev_state,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias,
+        delta_softplus,
+        cumsum,
+        padded_state_indices,
+        has_initial_state,
+    )
     outs_ref = []
     splits = [
         torch.split(var, seqlens[0], dim=-1)
@@ -530,33 +572,47 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
             delta_softplus=delta_softplus,
             return_last_state=return_last_state,
             prev_state=prev_state_ref[padded_state_indices[i]].unsqueeze(0)
-            if has_initial_state[i] else None,
-            final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze(
-                0))
+            if has_initial_state[i]
+            else None,
+            final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze(0),
+        )
         outs_ref.append(out_ref_s)
     out_ref = torch.cat(outs_ref, dim=-1)[0]
 
-    unpadded_out = out[:, :out_ref[0].shape[-1]]
+    unpadded_out = out[:, : out_ref[0].shape[-1]]
     print("Output diff max", (unpadded_out - out_ref).max())
     print("Output diff mean", (unpadded_out - out_ref).mean())
     print("Output state diff max", (prev_state - prev_state_ref).max())
     print("Output state diff mean", (prev_state - prev_state_ref).mean())
     assert torch.allclose(prev_state, prev_state_ref, rtol=rtol, atol=atol)
     assert torch.allclose(unpadded_out, out_ref, rtol=rtol, atol=atol)
-    selective_scan_opcheck_fn(u, delta, A, B, C, D, z, delta_bias,
-                              delta_softplus, cumsum, padded_state_indices,
-                              has_initial_state, prev_state)
-
-
-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+    selective_scan_opcheck_fn(
+        u,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias,
+        delta_softplus,
+        cumsum,
+        padded_state_indices,
+        has_initial_state,
+        prev_state,
+        block_size=2048,
+    )
+
+
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("has_z", [True])
-@pytest.mark.parametrize("dstate", [16, 32, 64])
+@pytest.mark.parametrize("dstate", [16, 64])
 @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
 # tests correctness in case subset of the sequences are padded
 @pytest.mark.parametrize("with_padding", [True, False])
-def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
-                                                   has_z, itype):
+def test_selective_state_update_with_batch_indices(
+    with_padding, dim, dstate, has_z, itype
+):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
     if itype == torch.bfloat16:
@@ -571,17 +627,17 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
     total_entries = 10 * batch_size
     state = torch.randn(total_entries, dim, dstate, dtype=itype, device=device)
     state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+        dtype=torch.int32, device=device
+    )
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
     unused_states_bool[state_indices] = False
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
-    ],
-                                        dim=0)
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=0,
+    )
     x = torch.randn(padded_batch_size, dim, device=device, dtype=itype)
     out = torch.empty_like(x)
     dt = torch.randn(padded_batch_size, dim, device=device, dtype=itype)
@@ -593,61 +649,60 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
     z = torch.randn_like(x) if has_z else None
     state_ref = state[state_indices, :].clone()
     state_before = state.clone()
-    selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=D,
-                           z=z,
-                           dt_bias=dt_bias,
-                           dt_softplus=True,
-                           state_batch_indices=padded_state_indices,
-                           pad_slot_id=PAD_SLOT_ID,
-                           out=out)
-    out_ref = selective_state_update_ref(state_ref,
-                                         x[:batch_size],
-                                         dt[:batch_size],
-                                         A,
-                                         B[:batch_size],
-                                         C[:batch_size],
-                                         D=D,
-                                         z=z[:batch_size],
-                                         dt_bias=dt_bias,
-                                         dt_softplus=True)
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        state_batch_indices=padded_state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+        out=out,
+    )
+    out_ref = selective_state_update_ref(
+        state_ref,
+        x[:batch_size],
+        dt[:batch_size],
+        A,
+        B[:batch_size],
+        C[:batch_size],
+        D=D,
+        z=z[:batch_size],
+        dt_bias=dt_bias,
+        dt_softplus=True,
+    )
 
     print("Output diff max", (out[:batch_size] - out_ref).max())
     print("Output diff mean", (out[:batch_size] - out_ref).mean())
     print("Output state diff max", (state[state_indices, :] - state_ref).max())
-    print("Output state diff mean",
-          (state[state_indices, :] - state_ref).mean())
+    print("Output state diff mean", (state[state_indices, :] - state_ref).mean())
     # test padded entries stay the same
     if with_padding:
-        assert torch.equal(state_before[unused_states_bool],
-                           state[unused_states_bool])
-        assert torch.equal(x[batch_size + 1:], x[batch_size + 1:])
-        assert torch.equal(dt[batch_size + 1:], dt[batch_size + 1:])
-        assert torch.equal(B[batch_size + 1:], B[batch_size + 1:])
-        assert torch.equal(C[batch_size + 1:], C[batch_size + 1:])
+        assert torch.equal(state_before[unused_states_bool], state[unused_states_bool])
+        assert torch.equal(x[batch_size + 1 :], x[batch_size + 1 :])
+        assert torch.equal(dt[batch_size + 1 :], dt[batch_size + 1 :])
+        assert torch.equal(B[batch_size + 1 :], B[batch_size + 1 :])
+        assert torch.equal(C[batch_size + 1 :], C[batch_size + 1 :])
 
     # test "real" entries
-    assert torch.allclose(state[state_indices, :],
-                          state_ref,
-                          rtol=rtol,
-                          atol=atol)
+    assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
     assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
 
 
-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("has_z", [False, True])
 @pytest.mark.parametrize("tie_hdim", [False, True])
-@pytest.mark.parametrize("ngroups", [1, 2, 4])
-@pytest.mark.parametrize("dstate", [16, 32, 64])
+@pytest.mark.parametrize("ngroups", [1, 4])
+@pytest.mark.parametrize("dstate", [16, 64])
 @pytest.mark.parametrize("dim", [2048, 4096])
 def test_selective_state_update_with_heads_with_batch_indices(
-        dim, dstate, ngroups, has_z, tie_hdim, itype):
+    dim, dstate, ngroups, has_z, tie_hdim, itype
+):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 3e-2)
     if itype == torch.bfloat16:
@@ -659,71 +714,55 @@ def test_selective_state_update_with_heads_with_batch_indices(
     nheads = dim // headdim
 
     total_entries = 10 * batch_size
-    state = torch.randn(total_entries,
-                        nheads,
-                        headdim,
-                        dstate,
-                        dtype=itype,
-                        device=device)
+    state = torch.randn(
+        total_entries, nheads, headdim, dstate, dtype=itype, device=device
+    )
     state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
+        dtype=torch.int32, device=device
+    )
 
     x = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
     out = torch.empty_like(x)
     if not tie_hdim:
-        dt = torch.randn(batch_size,
-                         nheads,
-                         headdim,
-                         device=device,
-                         dtype=itype)
+        dt = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
         dt_bias = torch.rand(nheads, headdim, device=device) - 4.0
         A = -torch.rand(nheads, headdim, dstate, device=device) - 1.0
         D = torch.randn(nheads, headdim, device=device)
     else:
-        dt = repeat(torch.randn(batch_size, nheads, device=device,
-                                dtype=itype),
-                    "b h -> b h p",
-                    p=headdim)
-        dt_bias = repeat(torch.rand(nheads, device=device) - 4.0,
-                         "h -> h p",
-                         p=headdim)
-        A = repeat(-torch.rand(nheads, device=device) - 1.0,
-                   "h -> h p n",
-                   p=headdim,
-                   n=dstate)
+        dt = repeat(
+            torch.randn(batch_size, nheads, device=device, dtype=itype),
+            "b h -> b h p",
+            p=headdim,
+        )
+        dt_bias = repeat(torch.rand(nheads, device=device) - 4.0, "h -> h p", p=headdim)
+        A = repeat(
+            -torch.rand(nheads, device=device) - 1.0, "h -> h p n", p=headdim, n=dstate
+        )
         D = repeat(torch.randn(nheads, device=device), "h -> h p", p=headdim)
     B = torch.randn(batch_size, ngroups, dstate, device=device)
     C = torch.randn(batch_size, ngroups, dstate, device=device)
     z = torch.randn_like(x) if has_z else None
     state_ref = state[state_indices, :].detach().clone()
-    selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=D,
-                           z=z,
-                           dt_bias=dt_bias,
-                           dt_softplus=True,
-                           state_batch_indices=state_indices,
-                           pad_slot_id=PAD_SLOT_ID,
-                           out=out)
-    out_ref = selective_state_update_ref(state_ref,
-                                         x,
-                                         dt,
-                                         A,
-                                         B,
-                                         C,
-                                         D=D,
-                                         z=z,
-                                         dt_bias=dt_bias,
-                                         dt_softplus=True)
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        state_batch_indices=state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+        out=out,
+    )
+    out_ref = selective_state_update_ref(
+        state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True
+    )
 
     print(f"Output max diff: {(out - out_ref).abs().max().item()}")
     print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
-    assert torch.allclose(state[state_indices, :],
-                          state_ref,
-                          rtol=rtol,
-                          atol=atol)
+    assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py
index 927af32588e66a3c219e204b4b9c89a02d95375a..0b0b82e484a1c42323bfd65361cfc6667976abbe 100644
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -7,10 +7,10 @@ import torch.nn.functional as F
 from einops import rearrange, repeat
 
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
-    mamba_chunk_scan_combined_varlen)
+    mamba_chunk_scan_combined_varlen,
+)
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.mamba2_attn import (
-    _query_start_loc_to_chunk_indices_offsets)
+from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata
 
 # Added by the IBM Team, 2024
 
@@ -22,12 +22,10 @@ def segsum(x):
     """Calculates segment sum."""
     T = x.size(-1)
     x = repeat(x, "... d -> ... d e", e=T)
-    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool),
-                      diagonal=-1)
+    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=-1)
     x = x.masked_fill(~mask, 0)
     x_segsum = torch.cumsum(x, dim=-2)
-    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool),
-                      diagonal=0)
+    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=0)
     x_segsum = x_segsum.masked_fill(~mask, -torch.inf)
     return x_segsum
 
@@ -46,8 +44,9 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
     assert X.shape[1] % block_len == 0
 
     # Rearrange into blocks/chunks
-    X, A, B, C = (rearrange(x, "b (c l) ... -> b c l ...", l=block_len)
-                  for x in (X, A, B, C))
+    X, A, B, C = (
+        rearrange(x, "b (c l) ... -> b c l ...", l=block_len) for x in (X, A, B, C)
+    )
 
     A = rearrange(A, "b c l h -> b h c l")
     A_cumsum = torch.cumsum(A, dim=-1)
@@ -74,7 +73,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
     # 4. Compute state -> output conversion per chunk
     # (left term of low-rank factorization of off-diagonal blocks; C terms)
     state_decay_out = torch.exp(A_cumsum)
-    Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp', C, states, state_decay_out)
+    Y_off = torch.einsum("bclhn,bchpn,bhcl->bclhp", C, states, state_decay_out)
 
     # Add output of intra-chunk and inter-chunk terms
     # (diagonal and off-diagonal blocks)
@@ -82,42 +81,31 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
     return Y, final_state
 
 
-def generate_random_inputs(batch_size,
-                           seqlen,
-                           n_heads,
-                           d_head,
-                           itype,
-                           device='cuda'):
-
+def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"):
     current_platform.seed_everything(0)
-    A = (-torch.exp(torch.rand(n_heads, dtype=itype, device=device)))
+    A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device))
     dt = F.softplus(
-        torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) -
-        4)
-    X = torch.randn((batch_size, seqlen, n_heads, d_head),
-                    dtype=itype,
-                    device=device)
-    B = torch.randn((batch_size, seqlen, n_heads, d_head),
-                    dtype=itype,
-                    device=device)
-    C = torch.randn((batch_size, seqlen, n_heads, d_head),
-                    dtype=itype,
-                    device=device)
+        torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4
+    )
+    X = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device)
+    B = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device)
+    C = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device)
 
     return A, dt, X, B, C
 
 
-def generate_continuous_batched_examples(example_lens_by_batch,
-                                         num_examples,
-                                         full_length,
-                                         last_taken,
-                                         exhausted,
-                                         n_heads,
-                                         d_head,
-                                         itype,
-                                         device='cuda',
-                                         return_naive_ref=True):
-
+def generate_continuous_batched_examples(
+    example_lens_by_batch,
+    num_examples,
+    full_length,
+    last_taken,
+    exhausted,
+    n_heads,
+    d_head,
+    itype,
+    device="cuda",
+    return_naive_ref=True,
+):
     # this function generates a random examples of certain length
     # and then cut according to "example_lens_by_batch" and feed
     # them in continuous batches to the kernels.
@@ -126,23 +114,20 @@ def generate_continuous_batched_examples(example_lens_by_batch,
     # reference output.
 
     # generate the full-length example
-    A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads,
-                                            d_head, itype)
+    A, dt, X, B, C = generate_random_inputs(
+        num_examples, full_length, n_heads, d_head, itype
+    )
 
     if return_naive_ref:
-        Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
-                                                      A * dt,
-                                                      B,
-                                                      C,
-                                                      block_len=full_length //
-                                                      4)
+        Y_min, final_state_min = ssd_minimal_discrete(
+            X * dt.unsqueeze(-1), A * dt, B, C, block_len=full_length // 4
+        )
 
     # internal function that outputs a cont batch of examples
     # given a tuple of lengths for each example in the batch
     # e.g., example_lens=(8, 4) means take 8 samples from first eg,
     #       4 examples from second eg, etc
     def get_continuous_batch(example_lens: tuple[int, ...]):
-
         indices = []
         for i, x in enumerate(example_lens):
             c = last_taken.get(i, 0)
@@ -150,8 +135,10 @@ def generate_continuous_batched_examples(example_lens_by_batch,
             last_taken[i] = (c + x) % full_length
             exhausted[i] = last_taken[i] == 0
 
-        return (torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)
-                              ]).unsqueeze(0) for x in (dt, X, B, C))
+        return (
+            torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)]).unsqueeze(0)
+            for x in (dt, X, B, C)
+        )
 
     # internal function that maps "n" to the appropriate right boundary
     # value when forming continuous batches from examples of length given
@@ -163,19 +150,20 @@ def generate_continuous_batched_examples(example_lens_by_batch,
 
     IND_E = None
     for spec in example_lens_by_batch:
-
         # get the (maybe partial) example seen in this cont batch
         dt2, X2, B2, C2 = get_continuous_batch(spec)
 
         # get the metadata
-        cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0)
-        seq_idx = torch.zeros(cu_seqlens[-1],
-                              dtype=torch.int32,
-                              device=cu_seqlens.device)
-        for i, (srt, end) in enumerate(zip(
+        cu_seqlens = torch.tensor((0,) + spec, device=device).cumsum(dim=0)
+        seq_idx = torch.zeros(
+            cu_seqlens[-1], dtype=torch.int32, device=cu_seqlens.device
+        )
+        for i, (srt, end) in enumerate(
+            zip(
                 cu_seqlens,
                 cu_seqlens[1:],
-        )):
+            )
+        ):
             seq_idx[srt:end] = i
 
         # for cont batch
@@ -190,19 +178,21 @@ def generate_continuous_batched_examples(example_lens_by_batch,
         X2 = X2.squeeze(0)
         B2 = B2.squeeze(0)
         C2 = C2.squeeze(0)
-        yield ([Y_min[s, IND_S[s]:IND_E[s]]
-                for s in range(num_examples)] if return_naive_ref else None,
-               cu_seqlens, seq_idx, (A, dt2, X2, B2, C2))
+        yield (
+            [Y_min[s, IND_S[s] : IND_E[s]] for s in range(num_examples)]
+            if return_naive_ref
+            else None,
+            cu_seqlens,
+            seq_idx,
+            (A, dt2, X2, B2, C2),
+        )
 
 
-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
-@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("n_heads", [4, 16, 32])
+@pytest.mark.parametrize("d_head", [5, 8, 32, 128])
 @pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
-def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
-                                         itype):
-
+def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype):
     # this tests the kernels on a single example (bs=1)
 
     # TODO: the bfloat16 case requires higher thresholds. To be investigated
@@ -219,19 +209,16 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
     #   it is not an operational limitation.
     seqlen, chunk_size = seq_len_chunk_size
 
-    A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads,
-                                            d_head, itype)
-
-    Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
-                                                  B, C, chunk_size)
-
-    cu_seqlens = torch.tensor((0, seqlen), device='cuda').cumsum(dim=0)
-    seq_idx = torch.zeros(seqlen, dtype=torch.int32, device=cu_seqlens.device)
+    A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype)
 
-    chunk_indices, chunk_offsets = \
-            _query_start_loc_to_chunk_indices_offsets(
-                cu_seqlens, chunk_size, cu_seqlens[-1])
+    Y_min, final_state_min = ssd_minimal_discrete(
+        X * dt.unsqueeze(-1), A * dt, B, C, chunk_size
+    )
 
+    cu_seqlens = torch.tensor((0, seqlen), device="cuda").cumsum(dim=0)
+    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
+        compute_varlen_chunk_metadata(cu_seqlens, chunk_size)
+    )
     # varlen has implicit batch=1
     X = X.squeeze(0)
     dt = dt.squeeze(0)
@@ -239,62 +226,64 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
     B = B.squeeze(0)
     C = C.squeeze(0)
     Y = torch.empty_like(X)
-    final_state = mamba_chunk_scan_combined_varlen(X,
-                                                   dt,
-                                                   A,
-                                                   B,
-                                                   C,
-                                                   chunk_size,
-                                                   D=None,
-                                                   cu_seqlens=cu_seqlens,
-                                                   seq_idx=seq_idx,
-                                                   chunk_indices=chunk_indices,
-                                                   chunk_offsets=chunk_offsets,
-                                                   out=Y)
+    final_state = mamba_chunk_scan_combined_varlen(
+        X,
+        dt,
+        A,
+        B,
+        C,
+        chunk_size,
+        cu_seqlens=cu_seqlens.to(torch.int32),
+        cu_chunk_seqlens=cu_chunk_seqlens,
+        last_chunk_indices=last_chunk_indices,
+        seq_idx=seq_idx_chunks,
+        out=Y,
+        D=None,
+    )
 
     # just test the last in sequence
     torch.testing.assert_close(Y[-1], Y_min[0, -1], atol=atol, rtol=rtol)
 
     # just test the last head
     # NOTE, in the kernel we always cast states to fp32
-    torch.testing.assert_close(final_state[:, -1].to(torch.float32),
-                               final_state_min[:, -1].to(torch.float32),
-                               atol=atol,
-                               rtol=rtol)
+    torch.testing.assert_close(
+        final_state[:, -1].to(torch.float32),
+        final_state_min[:, -1].to(torch.float32),
+        atol=atol,
+        rtol=rtol,
+    )
 
 
-@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
-@pytest.mark.parametrize("n_heads", [4, 8, 13])
-@pytest.mark.parametrize("d_head", [5, 16, 21, 32])
+@pytest.mark.parametrize("itype", [torch.float32])
+@pytest.mark.parametrize("n_heads", [4, 8])
+@pytest.mark.parametrize("d_head", [5, 16, 32])
 @pytest.mark.parametrize(
     "seq_len_chunk_size_cases",
     [
-
         # small-ish chunk_size (8)
         (64, 8, 2, [(64, 32), (64, 32)]),
-        (64, 8, 2, [(32, 32), (32, 32), (32, 32)]),
         (64, 8, 2, [(8, 8), (8, 8), (8, 8)]),  # chunk size boundary
-        (64, 8, 2, [(4, 4), (4, 4), (4, 4),
-                    (4, 4)]),  # chunk_size larger than cont batches
-        (64, 8, 5, [
-            (64, 32, 16, 8, 8),
-            (8, 16, 32, 16, 8),
-            (8, 8, 16, 32, 16),
-        ]),  # mode examples with varied lengths
-
+        (
+            64,
+            8,
+            2,
+            [(4, 4), (4, 4), (4, 4), (4, 4)],
+        ),  # chunk_size larger than cont batches
+        (64, 8, 5, [(64, 32, 16, 8, 8)]),
         # large-ish chunk_size (256)
-        (64, 256, 1, [(5, ), (1, ), (1, ),
-                      (1, )]),  # irregular sizes with small sequences
-        (64, 256, 2, [(5, 30), (1, 2), (1, 2),
-                      (1, 2)]),  # irregular sizes with small sequences
-
+        (64, 256, 1, [(5,), (1,), (1,), (1,)]),  # irregular sizes with small sequences
+        (
+            64,
+            256,
+            2,
+            [(5, 30), (1, 2), (1, 2), (1, 2)],
+        ),  # irregular sizes with small sequences
         # we also need to test some large seqlen
         # to catch errors with init states decay
         (768, 128, 2, [(138, 225), (138, 225)]),
-    ])
-def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
-                                     itype):
-
+    ],
+)
+def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, itype):
     # this test with multiple examples in a continuous batch
     # (i.e. chunked prefill)
 
@@ -312,14 +301,18 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
     exhausted: dict = {}  # map: eg -> boolean indicating example is exhausted
 
     states = None
-    for Y_min, cu_seqlens, seq_idx, (
-            A, dt, X, B, C) in generate_continuous_batched_examples(
-                cases, num_examples, seqlen, last_taken, exhausted, n_heads,
-                d_head, itype):
-
-        chunk_indices, chunk_offsets = \
-            _query_start_loc_to_chunk_indices_offsets(
-                cu_seqlens, chunk_size, cu_seqlens[-1])
+    for Y_min, cu_seqlens, _token_seq_idx, (
+        A,
+        dt,
+        X,
+        B,
+        C,
+    ) in generate_continuous_batched_examples(
+        cases, num_examples, seqlen, last_taken, exhausted, n_heads, d_head, itype
+    ):
+        cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
+            compute_varlen_chunk_metadata(cu_seqlens, chunk_size)
+        )
 
         Y = torch.empty_like(X)
         new_states = mamba_chunk_scan_combined_varlen(
@@ -329,20 +322,19 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
             B,
             C,
             chunk_size,
+            cu_seqlens=cu_seqlens.to(torch.int32),
+            cu_chunk_seqlens=cu_chunk_seqlens,
+            last_chunk_indices=last_chunk_indices,
+            seq_idx=seq_idx_chunks,
+            out=Y,
             D=None,
-            cu_seqlens=cu_seqlens,
-            seq_idx=seq_idx,
-            chunk_indices=chunk_indices,
-            chunk_offsets=chunk_offsets,
             initial_states=states,
-            out=Y,
         )
 
         # just test the last in sequence
         for i in range(num_examples):
-
             # just test one dim and dstate
-            Y_eg = Y[cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
+            Y_eg = Y[cu_seqlens[i] : cu_seqlens[i + 1], 0, 0]
             Y_min_eg = Y_min[i][:, 0, 0]
             torch.testing.assert_close(Y_eg, Y_min_eg, atol=atol, rtol=rtol)
 
@@ -350,18 +342,16 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
         states = new_states
         for i, clear in exhausted.items():
             if clear:
-                states[i].fill_(0.)
+                states[i].fill_(0.0)
                 exhausted[i] = False
 
 
 @pytest.mark.parametrize("chunk_size", [8, 256])
-@pytest.mark.parametrize("seqlens", [
-    (16, 2, 8, 13),
-    (270, 88, 212, 203),
-    (16, 20),
-])
+@pytest.mark.parametrize(
+    "seqlens",
+    [(16, 20), (270, 88, 212, 203)],
+)
 def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
-
     # This test verifies the correctness of the chunked prefill implementation
     # in the mamba2 ssd kernels, by comparing concatenation (in the sequence
     # dimension) of chunked results with the full sequence result.
@@ -390,22 +380,25 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
     last_taken: dict = {}  # map: eg -> pointer to last taken sample
     exhausted: dict = {}  # map: eg -> boolean indicating example is exhausted
     _, cu_seqlens, seq_idx, (A, dt, X, B, C) = next(
-        generate_continuous_batched_examples([seqlens],
-                                             num_sequences,
-                                             max_seqlen,
-                                             last_taken,
-                                             exhausted,
-                                             n_heads,
-                                             d_head,
-                                             itype,
-                                             return_naive_ref=False))
+        generate_continuous_batched_examples(
+            [seqlens],
+            num_sequences,
+            max_seqlen,
+            last_taken,
+            exhausted,
+            n_heads,
+            d_head,
+            itype,
+            return_naive_ref=False,
+        )
+    )
     seqlens = torch.tensor(seqlens, dtype=torch.int32, device=X.device)
     device = X.device
 
     ## full seqlen computation
-    chunk_indices, chunk_offsets = \
-            _query_start_loc_to_chunk_indices_offsets(
-                cu_seqlens, chunk_size, cu_seqlens[-1])
+    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
+        compute_varlen_chunk_metadata(cu_seqlens, chunk_size)
+    )
     Y_ref = torch.empty_like(X)
     state_ref = mamba_chunk_scan_combined_varlen(
         X,
@@ -414,45 +407,47 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         B,
         C,
         chunk_size,
+        cu_seqlens=cu_seqlens.to(torch.int32),
+        cu_chunk_seqlens=cu_chunk_seqlens,
+        last_chunk_indices=last_chunk_indices,
+        seq_idx=seq_idx_chunks,
+        out=Y_ref,
         D=None,
-        cu_seqlens=cu_seqlens,
-        seq_idx=seq_idx,
-        chunk_indices=chunk_indices,
-        chunk_offsets=chunk_offsets,
         initial_states=None,
-        out=Y_ref,
     )
 
     ## chunked seqlen computation
     # first chunk
     chunked_seqlens = seqlens // 2
-    chunked_cu_seqlens = torch.cat([
-        torch.tensor([0], device=device),
-        torch.cumsum(chunked_seqlens, dim=0)
-    ],
-                                   dim=0)
-    chunked_seq_idx = torch.repeat_interleave(
-        torch.arange(len(chunked_seqlens), device=device),
-        chunked_seqlens,
-        output_size=chunked_cu_seqlens[-1]).to(torch.int32)
+    chunked_cu_seqlens = torch.cat(
+        [torch.tensor([0], device=device), torch.cumsum(chunked_seqlens, dim=0)], dim=0
+    )
     chunked_input_seq_len = chunked_cu_seqlens[-1]
     X_chunked = torch.zeros_like(X)[:chunked_input_seq_len, ...]
     dt_chunked = torch.zeros_like(dt)[:chunked_input_seq_len, ...]
     B_chunked = torch.zeros_like(B)[:chunked_input_seq_len, ...]
     C_chunked = torch.zeros_like(C)[:chunked_input_seq_len, ...]
     for i in range(num_sequences):
-        # fmt: off
-        chunk_f = lambda x, i: x[cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...]  # noqa: E501
-
-        X_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i)  # noqa: E501
-        dt_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i)  # noqa: E501
-        B_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i)  # noqa: E501
-        C_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i)  # noqa: E501
-        # fmt: on
-
-    chunk_indices, chunk_offsets = \
-            _query_start_loc_to_chunk_indices_offsets(
-                chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1])
+        chunk_f = lambda x, i: x[
+            cu_seqlens[i] : cu_seqlens[i] + chunked_seqlens[i], ...
+        ]
+
+        X_chunked[chunked_cu_seqlens[i] : chunked_cu_seqlens[i + 1], ...] = chunk_f(
+            X, i
+        )
+        dt_chunked[chunked_cu_seqlens[i] : chunked_cu_seqlens[i + 1], ...] = chunk_f(
+            dt, i
+        )
+        B_chunked[chunked_cu_seqlens[i] : chunked_cu_seqlens[i + 1], ...] = chunk_f(
+            B, i
+        )
+        C_chunked[chunked_cu_seqlens[i] : chunked_cu_seqlens[i + 1], ...] = chunk_f(
+            C, i
+        )
+
+    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
+        compute_varlen_chunk_metadata(chunked_cu_seqlens, chunk_size)
+    )
     Y_partial = torch.empty_like(X_chunked)
     partial_state = mamba_chunk_scan_combined_varlen(
         X_chunked,
@@ -461,59 +456,70 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         B_chunked,
         C_chunked,
         chunk_size,
+        cu_seqlens=chunked_cu_seqlens.to(torch.int32),
+        cu_chunk_seqlens=cu_chunk_seqlens,
+        last_chunk_indices=last_chunk_indices,
+        seq_idx=seq_idx_chunks,
+        out=Y_partial,
         D=None,
-        cu_seqlens=chunked_cu_seqlens,
-        seq_idx=chunked_seq_idx,
-        chunk_indices=chunk_indices,
-        chunk_offsets=chunk_offsets,
         initial_states=None,
-        out=Y_partial,
     )
 
     # remaining chunk
     remaining_chunked_seqlens = seqlens - chunked_seqlens
-    remaining_chunked_cu_seqlens = torch.cat([
-        torch.tensor([0], device=device),
-        torch.cumsum(remaining_chunked_seqlens, dim=0)
-    ],
-                                             dim=0)
-    remaining_chunked_seq_idx = torch.repeat_interleave(
-        torch.arange(len(remaining_chunked_seqlens), device=device),
-        remaining_chunked_seqlens,
-        output_size=remaining_chunked_cu_seqlens[-1]).to(torch.int32)
+    remaining_chunked_cu_seqlens = torch.cat(
+        [
+            torch.tensor([0], device=device),
+            torch.cumsum(remaining_chunked_seqlens, dim=0),
+        ],
+        dim=0,
+    )
     remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
-    # fmt: off
-    remaining_X_chunked = torch.zeros_like(X)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
-    remaining_dt_chunked = torch.zeros_like(dt)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
-    remaining_B_chunked = torch.zeros_like(B)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
-    remaining_C_chunked = torch.zeros_like(C)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_X_chunked = torch.zeros_like(X)[:remaining_chunked_input_seq_len, ...]
+    remaining_dt_chunked = torch.zeros_like(dt)[:remaining_chunked_input_seq_len, ...]
+    remaining_B_chunked = torch.zeros_like(B)[:remaining_chunked_input_seq_len, ...]
+    remaining_C_chunked = torch.zeros_like(C)[:remaining_chunked_input_seq_len, ...]
     for i in range(num_sequences):
-        remaining_chunk_f = lambda x, i: x[cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...]  # noqa: E501
-
-        remaining_X_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i)  # noqa: E501
-        remaining_dt_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i)  # noqa: E501
-        remaining_B_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i)  # noqa: E501
-        remaining_C_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i)  # noqa: E501
+        remaining_chunk_f = lambda x, i: x[
+            cu_seqlens[i] + chunked_seqlens[i] : cu_seqlens[i + 1], ...
+        ]
+
+        remaining_X_chunked[
+            remaining_chunked_cu_seqlens[i] : remaining_chunked_cu_seqlens[i + 1], ...
+        ] = remaining_chunk_f(X, i)
+        remaining_dt_chunked[
+            remaining_chunked_cu_seqlens[i] : remaining_chunked_cu_seqlens[i + 1], ...
+        ] = remaining_chunk_f(dt, i)
+        remaining_B_chunked[
+            remaining_chunked_cu_seqlens[i] : remaining_chunked_cu_seqlens[i + 1], ...
+        ] = remaining_chunk_f(B, i)
+        remaining_C_chunked[
+            remaining_chunked_cu_seqlens[i] : remaining_chunked_cu_seqlens[i + 1], ...
+        ] = remaining_chunk_f(C, i)
 
     # assert input chunking is correct
-    concat_chunk_f = lambda pt1, pt2, i: torch.cat([
-        pt1[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
-        pt2[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
+    concat_chunk_f = lambda pt1, pt2, i: torch.cat(
+        [
+            pt1[chunked_cu_seqlens[i] : chunked_cu_seqlens[i + 1], ...],
+            pt2[
+                remaining_chunked_cu_seqlens[i] : remaining_chunked_cu_seqlens[i + 1],
+                ...,
+            ],
         ],
-        dim=0)
-    concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=0)  # noqa: E501
-    # fmt: on
+        dim=0,
+    )
+    concat_batch_f = lambda pt1, pt2: torch.cat(
+        [concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=0
+    )
 
     assert concat_batch_f(X_chunked, remaining_X_chunked).equal(X)
     assert concat_batch_f(dt_chunked, remaining_dt_chunked).equal(dt)
     assert concat_batch_f(B_chunked, remaining_B_chunked).equal(B)
     assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C)
 
-    chunk_indices, chunk_offsets = \
-            _query_start_loc_to_chunk_indices_offsets(
-                remaining_chunked_cu_seqlens,
-                chunk_size,
-                remaining_chunked_cu_seqlens[-1])
+    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
+        compute_varlen_chunk_metadata(remaining_chunked_cu_seqlens, chunk_size)
+    )
 
     Y_chunked = torch.empty_like(remaining_X_chunked)
     state_chunked = mamba_chunk_scan_combined_varlen(
@@ -523,32 +529,34 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         remaining_B_chunked,
         remaining_C_chunked,
         chunk_size,
+        cu_seqlens=remaining_chunked_cu_seqlens.to(torch.int32),
+        cu_chunk_seqlens=cu_chunk_seqlens,
+        last_chunk_indices=last_chunk_indices,
+        seq_idx=seq_idx_chunks,
+        out=Y_chunked,
         D=None,
-        cu_seqlens=remaining_chunked_cu_seqlens,
-        seq_idx=remaining_chunked_seq_idx,
-        chunk_indices=chunk_indices,
-        chunk_offsets=chunk_offsets,
         initial_states=partial_state,
-        out=Y_chunked,
     )
     Y = concat_batch_f(Y_partial, Y_chunked)
 
     # kernel chunked is same as kernel overall
     for i in range(num_sequences):
-        Y_seq = Y[cu_seqlens[i]:cu_seqlens[i + 1], ...]
-        Y_ref_seq = Y_ref[cu_seqlens[i]:cu_seqlens[i + 1], ...]
+        Y_seq = Y[cu_seqlens[i] : cu_seqlens[i + 1], ...]
+        Y_ref_seq = Y_ref[cu_seqlens[i] : cu_seqlens[i + 1], ...]
         torch.testing.assert_close(
-            Y_seq[:chunked_seqlens[i], ...],
-            Y_ref_seq[:chunked_seqlens[i], ...],
+            Y_seq[: chunked_seqlens[i], ...],
+            Y_ref_seq[: chunked_seqlens[i], ...],
             atol=atol,
             rtol=rtol,
-            msg=lambda x: f"seq{i} output part1 " + x)  # noqa: B023
+            msg=lambda x, i=i: f"seq{i} output part1 " + x,
+        )
         torch.testing.assert_close(
-            Y_seq[chunked_seqlens[i]:, ...],
-            Y_ref_seq[chunked_seqlens[i]:, ...],
+            Y_seq[chunked_seqlens[i] :, ...],
+            Y_ref_seq[chunked_seqlens[i] :, ...],
             atol=atol,
             rtol=rtol,
-            msg=lambda x: f"seq{i} output part2 " + x)  # noqa: B023
+            msg=lambda x, i=i: f"seq{i} output part2 " + x,
+        )
 
         state_seq = state_chunked[i]
         state_seq_ref = state_ref[i]
@@ -557,4 +565,5 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
             state_seq_ref,
             atol=atol,
             rtol=rtol,
-            msg=lambda x: f"seq{i} state " + x)  # noqa: B023
+            msg=lambda x, i=i: f"seq{i} state " + x,
+        )
diff --git a/tests/kernels/moe/modular_kernel_tools/cli_args.py b/tests/kernels/moe/modular_kernel_tools/cli_args.py
index b95d87cd04f57e2c41d2548c01c5b781314ae12d..d46847fbf6a3c2cdc1ce44dc39bf9ac5999af31f 100644
--- a/tests/kernels/moe/modular_kernel_tools/cli_args.py
+++ b/tests/kernels/moe/modular_kernel_tools/cli_args.py
@@ -9,18 +9,19 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 
 from .common import Config
-from .mk_objects import (MK_ALL_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES,
-                         MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)
+from .mk_objects import (
+    MK_ALL_PREPARE_FINALIZE_TYPES,
+    MK_FUSED_EXPERT_TYPES,
+    MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES,
+)
 
 
 def make_config_arg_parser(description: str):
-
     def to_pf_class_type(s: str) -> mk.FusedMoEPrepareAndFinalize:
         for pf in MK_ALL_PREPARE_FINALIZE_TYPES:
             if pf.__name__ == s:
                 return pf
-        raise ValueError(
-            f"Cannot find a PrepareFinalize type that matches {s}")
+        raise ValueError(f"Cannot find a PrepareFinalize type that matches {s}")
 
     def to_experts_class_type(s: str) -> mk.FusedMoEPermuteExpertsUnpermute:
         for fe in MK_FUSED_EXPERT_TYPES:
@@ -45,15 +46,18 @@ def make_config_arg_parser(description: str):
         "--pf-type",
         type=to_pf_class_type,
         required=True,
-        help=("Choose a PrepareFinalize Type : "
-              f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"),
+        help=(
+            "Choose a PrepareFinalize Type : "
+            f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"
+        ),
     )
     parser.add_argument(
         "--experts-type",
         type=to_experts_class_type,
         required=True,
-        help=(f"Choose a FusedExpert type : "
-              f"{[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"),
+        help=(
+            f"Choose a FusedExpert type : {[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"
+        ),
     )
     parser.add_argument(
         "-m",
@@ -74,66 +78,65 @@ def make_config_arg_parser(description: str):
         default=1024,
         help="N dimension of the first fused-moe matmul",
     )
-    parser.add_argument("--num-experts",
-                        type=int,
-                        default=32,
-                        help="Global num experts")
-    parser.add_argument("--topk",
-                        nargs="+",
-                        type=int,
-                        default=[4, 1],
-                        help="num topk")
+    parser.add_argument(
+        "--num-experts", type=int, default=32, help="Global num experts"
+    )
+    parser.add_argument("--topk", nargs="+", type=int, default=[4, 1], help="num topk")
     parser.add_argument(
         "--fused-moe-chunk-size",
         type=int,
-        help="Fused moe chunk size used for the non-batched fused experts impl."
+        help="Fused moe chunk size used for the non-batched fused experts impl.",
     )
 
     # Quant args
-    parser.add_argument("--quant-dtype",
-                        type=to_quant_torch_dtype,
-                        help="Quant datatype")
-    parser.add_argument("--per-token-quantized-activations",
-                        action='store_true',
-                        help=("The input activations must be per-token "
-                              "quantized"))
-    parser.add_argument("--per-channel-quantized-weights",
-                        action="store_true",
-                        help="The weights must be per-channel quantized.")
-    parser.add_argument("--block-shape",
-                        nargs="+",
-                        type=int,
-                        help="Quantization block shape")
+    parser.add_argument(
+        "--quant-dtype", type=to_quant_torch_dtype, help="Quant datatype"
+    )
+    parser.add_argument(
+        "--per-token-quantized-activations",
+        action="store_true",
+        help=("The input activations must be per-token quantized"),
+    )
+    parser.add_argument(
+        "--per-channel-quantized-weights",
+        action="store_true",
+        help="The weights must be per-channel quantized.",
+    )
+    parser.add_argument(
+        "--block-shape", nargs="+", type=int, help="Quantization block shape"
+    )
 
     # Torch trace profile generation args
-    parser.add_argument("--torch-trace-dir-path",
-                        type=str,
-                        default=None,
-                        help="Get torch trace for single execution")
+    parser.add_argument(
+        "--torch-trace-dir-path",
+        type=str,
+        default=None,
+        help="Get torch trace for single execution",
+    )
 
     return parser
 
 
 def _validate_args(args: argparse.Namespace):
-
     if args.quant_dtype is not None:
         assert args.quant_dtype == torch.float8_e4m3fn
         if args.block_shape is not None:
             assert len(args.block_shape) == 2, (
-                f"block shape must have 2 elements. got {args.block_shape}")
+                f"block shape must have 2 elements. got {args.block_shape}"
+            )
 
     if args.experts_type in MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES:
-        assert args.world_size == 1, (
-            "Single GPU objects need world size set to 1")
+        assert args.world_size == 1, "Single GPU objects need world size set to 1"
 
     if args.torch_trace_dir_path is not None:
         from pathlib import Path
+
         assert Path(args.torch_trace_dir_path).is_dir(), (
-            f"Please create {args.torch_trace_dir_path}")
+            f"Please create {args.torch_trace_dir_path}"
+        )
 
 
 def make_config(args: argparse.Namespace) -> Config:
-
     _validate_args(args)
 
     quant_config = None
@@ -142,7 +145,8 @@ def make_config(args: argparse.Namespace) -> Config:
             quant_dtype=args.quant_dtype,
             per_act_token_quant=args.per_token_quantized_activations,
             per_out_ch_quant=args.per_channel_quantized_weights,
-            block_shape=args.block_shape)
+            block_shape=args.block_shape,
+        )
 
     return Config(
         Ms=args.m,
@@ -156,4 +160,5 @@ def make_config(args: argparse.Namespace) -> Config:
         fused_experts_type=args.experts_type,
         fused_moe_chunk_size=args.fused_moe_chunk_size,
         world_size=args.world_size,
-        torch_trace_dir_path=args.torch_trace_dir_path)
+        torch_trace_dir_path=args.torch_trace_dir_path,
+    )
diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py
index b5fcc4cd70bf8409d2d8cd469e3cbb66ddcce47a..1d925dc1bea8ff11dca8737cf3dbbe351d972f48 100644
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -1,31 +1,41 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 
 import vllm._custom_ops as ops
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_test_weights, per_token_cast_to_fp8
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX,
-                                                    dequantize_nvfp4_to_dtype)
+from tests.kernels.quantization.nvfp4_utils import (
+    FLOAT4_E2M1_MAX,
+    FLOAT8_E4M3_MAX,
+    dequantize_nvfp4_to_dtype,
+)
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig
 from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig)
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
-
-from .mk_objects import (TestMoEQuantConfig, expert_info, make_fused_experts,
-                         make_prepare_finalize, prepare_finalize_info)
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+
+from .mk_objects import (
+    TestMoEQuantConfig,
+    expert_info,
+    make_fused_experts,
+    make_prepare_finalize,
+    prepare_finalize_info,
+)
 from .parallel_utils import ProcessGroupInfo
 
 
-def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
+def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
     if t is None:
         return f"{name} : None"
     else:
@@ -34,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
 
 @dataclass
 class Config:
-    Ms: Union[list[int], int]
+    Ms: list[int] | int
     K: int
     N: int
     E: int
-    topks: Union[list[int], int]
+    topks: list[int] | int
     dtype: torch.dtype
-    quant_config: Optional[TestMoEQuantConfig]
+    quant_config: TestMoEQuantConfig | None
 
     prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
     fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
 
-    fused_moe_chunk_size: Optional[int]
+    fused_moe_chunk_size: int | None
     world_size: int
 
-    torch_trace_dir_path: Optional[str] = None
+    torch_trace_dir_path: str | None = None
 
     def __post_init__(self):
         if self.quant_config is None:
@@ -83,7 +93,7 @@ class Config:
         return self.Ms
 
     @property
-    def quant_dtype(self) -> Union[torch.dtype, str, None]:
+    def quant_dtype(self) -> torch.dtype | str | None:
         assert self.quant_config is not None
         return self.quant_config.quant_dtype
 
@@ -94,8 +104,7 @@ class Config:
 
     @property
     def is_per_tensor_act_quant(self) -> bool:
-        return (not self.is_per_act_token_quant
-                and self.quant_block_shape is None)
+        return not self.is_per_act_token_quant and self.quant_block_shape is None
 
     @property
     def is_per_out_ch_quant(self) -> bool:
@@ -103,7 +112,7 @@ class Config:
         return self.quant_config.per_out_ch_quant
 
     @property
-    def quant_block_shape(self) -> Optional[list[int]]:
+    def quant_block_shape(self) -> list[int] | None:
         assert self.quant_config is not None
         return self.quant_config.block_shape
 
@@ -129,28 +138,30 @@ class Config:
         }
 
         backend = self.all2all_backend()
+        vllm_config.parallel_config.all2all_backend = backend
         if backend is not None:
             env_dict.update({"VLLM_ALL2ALL_BACKEND": backend})
 
         if self.fused_moe_chunk_size is not None:
             env_dict.update(
-                {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)})
+                {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}
+            )
 
         return vllm_config, env_dict
 
     def is_fp8_block_quantized(self):
-        return (self.quant_dtype == torch.float8_e4m3fn
-                and self.quant_block_shape is not None)
+        return (
+            self.quant_dtype == torch.float8_e4m3fn
+            and self.quant_block_shape is not None
+        )
 
     def is_batched_prepare_finalize(self):
         info = prepare_finalize_info(self.prepare_finalize_type)
-        return (mk.FusedMoEActivationFormat.BatchedExperts ==
-                info.activation_format)
+        return mk.FusedMoEActivationFormat.BatchedExperts == info.activation_format
 
     def is_batched_fused_experts(self):
         info = expert_info(self.fused_experts_type)
-        return (mk.FusedMoEActivationFormat.BatchedExperts ==
-                info.activation_format)
+        return mk.FusedMoEActivationFormat.BatchedExperts == info.activation_format
 
     def is_standard_fused_experts(self):
         info = expert_info(self.fused_experts_type)
@@ -190,89 +201,109 @@ class Config:
 
     def needs_deep_ep(self):
         info = prepare_finalize_info(self.prepare_finalize_type)
-        return (info.backend == "deepep_high_throughput"
-                or info.backend == "deepep_low_latency")
+        return (
+            info.backend == "deepep_high_throughput"
+            or info.backend == "deepep_low_latency"
+        )
 
     def all2all_backend(self):
         info = prepare_finalize_info(self.prepare_finalize_type)
         return info.backend
 
-    def is_valid(self):
+    def is_valid(self) -> tuple[bool, str | None]:
         # Check prepare-finalize and fused-experts compatibility
         if self.is_batched_prepare_finalize():
             if not self.is_batched_fused_experts():
-                return False
+                return False, "Mismatched format."
         else:
             if not self.is_standard_fused_experts():
-                return False
+                return False, "Mismatched format."
 
         use_chunking = self.fused_moe_chunk_size is not None
         if use_chunking and not self.is_fe_supports_chunking():
-            return False
+            return False, "Chunking not supported."
 
         # Check quantization sanity
-        if (int(self.is_per_act_token_quant) +
-                int(self.is_per_tensor_act_quant) +
-                int(self.quant_block_shape is not None)) > 1:
+        if (
+            int(self.is_per_act_token_quant)
+            + int(self.is_per_tensor_act_quant)
+            + int(self.quant_block_shape is not None)
+        ) > 1:
             # invalid quant config
-            return False
+            return False, f"Bad quant_config {self.quant_config}."
 
         # check type support
         if self.quant_dtype is None:
-            if (self.dtype not in self.pf_supported_types()
-                    or self.dtype not in self.fe_supported_types()):
-                return False
+            if (
+                self.dtype not in self.pf_supported_types()
+                or self.dtype not in self.fe_supported_types()
+            ):
+                return False, (
+                    f"Unsupported type {self.dtype} not in "
+                    f"{self.pf_supported_types()} and "
+                    f"{self.fe_supported_types()}."
+                )
         else:
-            if (self.quant_dtype not in self.pf_supported_types()
-                    or self.quant_dtype not in self.fe_supported_types()):
-                return False
+            if (
+                self.quant_dtype not in self.pf_supported_types()
+                or self.quant_dtype not in self.fe_supported_types()
+            ):
+                return False, (
+                    f"Unsupported quant type {self.quant_dtype} "
+                    f"not in {self.pf_supported_types()} and "
+                    f"{self.fe_supported_types()}."
+                )
 
         # Check block quanization support
         is_block_quatized = self.quant_block_shape is not None
         if is_block_quatized and self.quant_dtype is None:
-            return False
+            return False, "No block quantization support."
+
         if is_block_quatized and not self.is_block_quant_supported():
-            return False
+            return False, "Mismatched block quantization support."
 
         # deep_gemm only works with block-quantized
         if self.needs_deep_gemm() and not is_block_quatized:
-            return False
+            return False, "Needs DeepGEMM but not block quantized."
 
         # Check dependencies (turn into asserts?)
         if self.needs_deep_ep() and not has_deep_ep():
-            return False
+            return False, "Needs DeepEP, but DeepEP not available."
         if self.needs_deep_gemm() and not has_deep_gemm():
-            return False
+            return False, "Needs DeepGEMM, but DeepGEMM not available."
         if self.needs_pplx() and not has_pplx():  # noqa: SIM103
-            return False
+            return False, "Needs PPLX, but PPLX not available."
 
-        return True
+        return True, None
 
 
 @dataclass
 class WeightTensors:
     w1: torch.Tensor
     w2: torch.Tensor
-    w1_scale: Optional[torch.Tensor]
-    w2_scale: Optional[torch.Tensor]
-    w1_gs: Optional[torch.Tensor] = None
-    w2_gs: Optional[torch.Tensor] = None
+    w1_scale: torch.Tensor | None
+    w2_scale: torch.Tensor | None
+    w1_gs: torch.Tensor | None = None
+    w2_gs: torch.Tensor | None = None
 
     def describe(self):
         s = ""
         s += "== Weight Tensors: \n"
-        s += f' - {_describe_tensor(self.w1, "w1")} \n'
-        s += f' - {_describe_tensor(self.w2, "w2")} \n'
-        s += f' - {_describe_tensor(self.w1_scale, "w1_scale")} \n'
-        s += f' - {_describe_tensor(self.w2_scale, "w2_scale")} \n'
-        s += f' - {_describe_tensor(self.w1_gs, "w1_gs")} \n'
-        s += f' - {_describe_tensor(self.w2_gs, "w2_gs")} \n'
+        s += f" - {_describe_tensor(self.w1, 'w1')} \n"
+        s += f" - {_describe_tensor(self.w2, 'w2')} \n"
+        s += f" - {_describe_tensor(self.w1_scale, 'w1_scale')} \n"
+        s += f" - {_describe_tensor(self.w2_scale, 'w2_scale')} \n"
+        s += f" - {_describe_tensor(self.w1_gs, 'w1_gs')} \n"
+        s += f" - {_describe_tensor(self.w2_gs, 'w2_gs')} \n"
         return s
 
     def is_quantized(self) -> bool:
         # or w1_scale is not None?
-        return (self.w1.dtype == torch.float8_e4m3fn
-                or self.w1.dtype == torch.uint8 or self.w1.dtype == torch.int8)
+        return (
+            self.w1.dtype == torch.float8_e4m3fn
+            or self.w1.dtype == torch.uint8
+            or self.w1.dtype == torch.int8
+        )
 
     def to_current_device(self):
         device = torch.cuda.current_device()
@@ -289,16 +320,13 @@ class WeightTensors:
         if self.w2_gs is not None:
             self.w2_gs = self.w2_gs.to(device=device)
 
-    def slice_weights(self, rank: int,
-                      num_local_experts: int) -> "WeightTensors":
+    def slice_weights(self, rank: int, num_local_experts: int) -> "WeightTensors":
         s = rank * num_local_experts
         e = s + num_local_experts
         w1 = self.w1[s:e, :, :]
         w2 = self.w2[s:e, :, :]
-        w1_scale = self.w1_scale[
-            s:e, :, :] if self.w1_scale is not None else None
-        w2_scale = self.w2_scale[
-            s:e, :, :] if self.w2_scale is not None else None
+        w1_scale = self.w1_scale[s:e, :, :] if self.w1_scale is not None else None
+        w2_scale = self.w2_scale[s:e, :, :] if self.w2_scale is not None else None
         w1_gs = self.w1_gs[s:e] if self.w1_gs is not None else None
         w2_gs = self.w2_gs[s:e] if self.w2_gs is not None else None
 
@@ -313,45 +341,42 @@ class WeightTensors:
             in_dtype=config.dtype,
             quant_dtype=config.quant_dtype,
             block_shape=config.quant_block_shape,
-            per_out_ch_quant=config.
-            is_per_act_token_quant,  # or config.is_per_out_ch_quant
+            # or config.is_per_out_ch_quant
+            per_out_ch_quant=config.is_per_act_token_quant,
+        )
+        return WeightTensors(
+            w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs
         )
-        return WeightTensors(w1=w1,
-                             w2=w2,
-                             w1_scale=w1_scale,
-                             w2_scale=w2_scale,
-                             w1_gs=w1_gs,
-                             w2_gs=w2_gs)
 
 
 @dataclass
 class RankTensors:
     hidden_states: torch.Tensor
-    hidden_states_scale: Optional[torch.Tensor]
+    hidden_states_scale: torch.Tensor | None
 
     topk_weights: torch.Tensor
     topk_ids: torch.Tensor
-    expert_map: Optional[torch.Tensor]
+    expert_map: torch.Tensor | None
 
     def describe(self):
         s = ""
         s += "== Rank Tensors: \n"
-        s += f' - {_describe_tensor(self.hidden_states, "HS")} \n'
-        s += f' - {_describe_tensor(self.hidden_states_scale, "HS_scale")} \n'
-        s += f' - {_describe_tensor(self.topk_weights, "topk_weights")} \n'
-        s += f' - {_describe_tensor(self.topk_ids, "topk_ids")} \n'
-        s += f' - {_describe_tensor(self.expert_map, "expert_map")} \n'
+        s += f" - {_describe_tensor(self.hidden_states, 'HS')} \n"
+        s += f" - {_describe_tensor(self.hidden_states_scale, 'HS_scale')} \n"
+        s += f" - {_describe_tensor(self.topk_weights, 'topk_weights')} \n"
+        s += f" - {_describe_tensor(self.topk_ids, 'topk_ids')} \n"
+        s += f" - {_describe_tensor(self.expert_map, 'expert_map')} \n"
         return s
 
     @staticmethod
     def make_hidden_states(
-            config: Config) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        config: Config,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """
         Return hidden_states
         """
         m, k, dtype = (config.M, config.K, config.dtype)
-        a = (torch.randn(
-            (m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0)
+        a = torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0
 
         if config.quant_dtype is None:
             return a, None
@@ -362,36 +387,29 @@ class RankTensors:
         # first - so further quantize and dequantize will yield the same
         # values.
         if config.is_per_tensor_act_quant:
-            a_q, a_scales = ops.scaled_fp8_quant(
-                a, use_per_token_if_dynamic=False)
+            a_q, a_scales = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=False)
             return a_q.float().mul(a_scales).to(dtype), a_scales
 
         if config.is_per_act_token_quant:
-            a_q, a_scales = ops.scaled_fp8_quant(a,
-                                                 use_per_token_if_dynamic=True)
+            a_q, a_scales = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True)
             return a_q.float().mul(a_scales).to(dtype), None
 
         assert config.quant_block_shape is not None
         block_k = config.quant_block_shape[1]
         a_q, a_scales = per_token_cast_to_fp8(a, block_size=block_k)
-        return a_q.float().view(
-            (-1, block_k)).mul(a_scales.view(-1, 1)).view(m, k).to(dtype), None
+        return a_q.float().view((-1, block_k)).mul(a_scales.view(-1, 1)).view(m, k).to(
+            dtype
+        ), None
 
     @staticmethod
     def make(config: Config, pgi: ProcessGroupInfo):
-
         dtype = config.dtype
         topk, m, _ = (config.topk, config.M, config.K)
-        hidden_states, hidden_states_scale = RankTensors.make_hidden_states(
-            config)
+        hidden_states, hidden_states_scale = RankTensors.make_hidden_states(config)
 
-        num_local_experts, global_num_experts = (config.num_local_experts,
-                                                 config.E)
-        score = torch.randn((m, global_num_experts),
-                            device="cuda",
-                            dtype=dtype)
-        topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk,
-                                               False)
+        num_local_experts, global_num_experts = (config.num_local_experts, config.E)
+        score = torch.randn((m, global_num_experts), device="cuda", dtype=dtype)
+        topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False)
 
         # distribute topk_ids evenly
         for mi in range(m):
@@ -400,14 +418,15 @@ class RankTensors:
 
         expert_map = None
         if config.world_size > 1 and config.supports_expert_map():
-            expert_map = torch.full((global_num_experts, ),
-                                    fill_value=-1,
-                                    dtype=torch.int32)
+            expert_map = torch.full(
+                (global_num_experts,), fill_value=-1, dtype=torch.int32
+            )
             s = pgi.rank * num_local_experts
             e = s + num_local_experts
             expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
-            expert_map = expert_map.to(device=torch.cuda.current_device(),
-                                       dtype=torch.int32)
+            expert_map = expert_map.to(
+                device=torch.cuda.current_device(), dtype=torch.int32
+            )
 
         return RankTensors(
             hidden_states=hidden_states,
@@ -418,9 +437,9 @@ class RankTensors:
         )
 
 
-def reference_moe_impl(config: Config, weights: WeightTensors,
-                       rank_tensors: RankTensors) -> torch.Tensor:
-
+def reference_moe_impl(
+    config: Config, weights: WeightTensors, rank_tensors: RankTensors
+) -> torch.Tensor:
     if config.quant_dtype == "nvfp4":
         quant_blocksize = 16
         dtype = config.dtype
@@ -433,8 +452,10 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
         w2_blockscale = weights.w2_scale
         w2_gs = weights.w2_gs
 
-        a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(
-            rank_tensors.hidden_states.flatten(), dim=-1)).to(torch.float32)
+        a_global_scale = (
+            (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX)
+            / torch.amax(rank_tensors.hidden_states.flatten(), dim=-1)
+        ).to(torch.float32)
 
         assert w1_gs is not None
         assert w2_gs is not None
@@ -447,14 +468,17 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
         assert w2_blockscale.shape[2] % 4 == 0
 
         a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(
-            rank_tensors.hidden_states, a_global_scale)
+            rank_tensors.hidden_states, a_global_scale
+        )
 
-        a = dequantize_nvfp4_to_dtype(a_fp4,
-                                      a_scale_interleaved,
-                                      a_global_scale,
-                                      dtype=dtype,
-                                      device=a_fp4.device,
-                                      block_size=quant_blocksize)
+        a = dequantize_nvfp4_to_dtype(
+            a_fp4,
+            a_scale_interleaved,
+            a_global_scale,
+            dtype=dtype,
+            device=a_fp4.device,
+            block_size=quant_blocksize,
+        )
 
         e = w1_q.shape[0]
         n = w1_q.shape[1] // 2
@@ -464,18 +488,22 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
         w2 = torch.zeros((e, k, n), device="cuda", dtype=dtype)
 
         for idx in range(0, e):
-            w1[idx] = dequantize_nvfp4_to_dtype(w1_q[idx],
-                                                w1_blockscale[idx],
-                                                w1_gs[idx],
-                                                dtype=dtype,
-                                                device=w1_q.device,
-                                                block_size=quant_blocksize)
-            w2[idx] = dequantize_nvfp4_to_dtype(w2_q[idx],
-                                                w2_blockscale[idx],
-                                                w2_gs[idx],
-                                                dtype=dtype,
-                                                device=w2_q.device,
-                                                block_size=quant_blocksize)
+            w1[idx] = dequantize_nvfp4_to_dtype(
+                w1_q[idx],
+                w1_blockscale[idx],
+                w1_gs[idx],
+                dtype=dtype,
+                device=w1_q.device,
+                block_size=quant_blocksize,
+            )
+            w2[idx] = dequantize_nvfp4_to_dtype(
+                w2_q[idx],
+                w2_blockscale[idx],
+                w2_gs[idx],
+                dtype=dtype,
+                device=w2_q.device,
+                block_size=quant_blocksize,
+            )
         a_scale = None
         w1_scale = None
         w2_scale = None
@@ -493,27 +521,29 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
         per_act_token_quant = config.is_per_act_token_quant
         block_shape = config.quant_block_shape
 
-    return torch_experts(a=a,
-                         w1=w1,
-                         w2=w2,
-                         topk_weight=rank_tensors.topk_weights,
-                         topk_ids=rank_tensors.topk_ids,
-                         global_num_experts=config.E,
-                         expert_map=None,
-                         w1_scale=w1_scale,
-                         w2_scale=w2_scale,
-                         a1_scale=a_scale,
-                         quant_dtype=quant_dtype,
-                         per_act_token_quant=per_act_token_quant,
-                         block_shape=block_shape,
-                         apply_router_weights_on_input=config.topk == 1
-                         and config.supports_apply_weight_on_input())
+    return torch_experts(
+        a=a,
+        w1=w1,
+        w2=w2,
+        topk_weight=rank_tensors.topk_weights,
+        topk_ids=rank_tensors.topk_ids,
+        global_num_experts=config.E,
+        expert_map=None,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a_scale,
+        quant_dtype=quant_dtype,
+        per_act_token_quant=per_act_token_quant,
+        block_shape=block_shape,
+        apply_router_weights_on_input=config.topk == 1
+        and config.supports_apply_weight_on_input(),
+    )
 
 
 def _make_gscale(num_experts: int) -> torch.Tensor:
-    return torch.ones((num_experts, ),
-                      device=torch.cuda.current_device(),
-                      dtype=torch.float32)
+    return torch.ones(
+        (num_experts,), device=torch.cuda.current_device(), dtype=torch.float32
+    )
 
 
 def make_modular_kernel(
@@ -521,12 +551,12 @@ def make_modular_kernel(
     vllm_config: VllmConfig,
     quant_config: FusedMoEQuantConfig,
 ) -> mk.FusedMoEModularKernel:
-
     def next_power_of_2(x):
         import math
+
         if x == 0:
             return 1
-        return 2**math.ceil(math.log2(x))
+        return 2 ** math.ceil(math.log2(x))
 
     # make moe config
     moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
@@ -546,9 +576,9 @@ def make_modular_kernel(
     )
 
     # make modular kernel
-    prepare_finalize = make_prepare_finalize(config.prepare_finalize_type,
-                                             config.all2all_backend(), moe,
-                                             quant_config)
+    prepare_finalize = make_prepare_finalize(
+        config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
+    )
 
     fused_experts = make_fused_experts(
         config.fused_experts_type,
@@ -559,7 +589,8 @@ def make_modular_kernel(
     )
 
     modular_kernel = mk.FusedMoEModularKernel(
-        prepare_finalize=prepare_finalize, fused_experts=fused_experts)
+        prepare_finalize=prepare_finalize, fused_experts=fused_experts
+    )
 
     return modular_kernel
 
@@ -587,10 +618,8 @@ def run_modular_kernel(
         w1_scale=rank_weights.w1_scale,
         w2_scale=rank_weights.w2_scale,
         a1_scale=rank_tensors.hidden_states_scale,
-        g1_alphas=(1 / rank_weights.w1_gs)
-        if rank_weights.w1_gs is not None else None,
-        g2_alphas=(1 / rank_weights.w2_gs)
-        if rank_weights.w2_gs is not None else None,
+        g1_alphas=(1 / rank_weights.w1_gs) if rank_weights.w1_gs is not None else None,
+        g2_alphas=(1 / rank_weights.w2_gs) if rank_weights.w2_gs is not None else None,
         a1_gscale=gscale,
         a2_gscale=gscale,
         block_shape=config.quant_block_shape,
@@ -603,38 +632,30 @@ def run_modular_kernel(
     # impls might update the tensor in place
     hidden_states = rank_tensors.hidden_states.clone()
 
-    topk_ids = rank_tensors.topk_ids.to(
-        mk.prepare_finalize.topk_indices_dtype())
+    topk_ids = rank_tensors.topk_ids.to(mk.prepare_finalize.topk_indices_dtype())
 
     mk_kwargs = {
-        "hidden_states":
-        hidden_states,
-        "w1":
-        rank_weights.w1,
-        "w2":
-        rank_weights.w2,
-        "topk_weights":
-        rank_tensors.topk_weights,
-        "topk_ids":
-        topk_ids,
-        "expert_map":
-        rank_tensors.expert_map,
-        "global_num_experts":
-        config.E,
-        "apply_router_weight_on_input":
-        config.topk == 1 and config.supports_apply_weight_on_input(),
+        "hidden_states": hidden_states,
+        "w1": rank_weights.w1,
+        "w2": rank_weights.w2,
+        "topk_weights": rank_tensors.topk_weights,
+        "topk_ids": topk_ids,
+        "expert_map": rank_tensors.expert_map,
+        "global_num_experts": config.E,
+        "apply_router_weight_on_input": config.topk == 1
+        and config.supports_apply_weight_on_input(),
     }
 
     num_tokens = rank_tensors.hidden_states.shape[0]
-    num_tokens_across_dp = torch.tensor([num_tokens] * config.world_size,
-                                        device="cuda",
-                                        dtype=torch.int)
+    num_tokens_across_dp = torch.tensor(
+        [num_tokens] * config.world_size, device="cuda", dtype=torch.int
+    )
 
     with set_forward_context(
-            None,
-            vllm_config,
-            num_tokens=num_tokens,
-            num_tokens_across_dp=num_tokens_across_dp,
+        None,
+        vllm_config,
+        num_tokens=num_tokens,
+        num_tokens_across_dp=num_tokens_across_dp,
     ):
         out = mk.forward(**mk_kwargs)
 
diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
index c1037b60bf3837c93ea0966fff1bb71c43f80f49..95db6327c4f105ea6b6fc3decb53ff2693408bc8 100644
--- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
+++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
@@ -4,20 +4,26 @@
 import copy
 from enum import Enum
 from itertools import product
-from typing import Optional
 
 import torch
 from tqdm import tqdm
 
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG)
+from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG
 from vllm.platforms import current_platform
 
-from .common import (Config, RankTensors, WeightTensors, reference_moe_impl,
-                     run_modular_kernel)
-from .mk_objects import (MK_FUSED_EXPERT_TYPES,
-                         MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_QUANT_CONFIGS)
+from .common import (
+    Config,
+    RankTensors,
+    WeightTensors,
+    reference_moe_impl,
+    run_modular_kernel,
+)
+from .mk_objects import (
+    MK_FUSED_EXPERT_TYPES,
+    MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
+    MK_QUANT_CONFIGS,
+)
 from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config
 
 
@@ -38,8 +44,9 @@ def rank_worker(
 
     # sanity check
     from vllm import envs
+
     if config.fused_moe_chunk_size is not None:
-        assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+        assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE
 
     # get weights to this device
     weights.to_current_device()
@@ -60,8 +67,7 @@ def rank_worker(
         rank_tensors = RankTensors.make(cfgx, pgi)
 
         # modular kernel out
-        mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights,
-                                    rank_tensors)
+        mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, rank_tensors)
 
         with set_current_vllm_config(vllm_config):
             ref_out = reference_moe_impl(cfgx, weights, rank_tensors)
@@ -70,28 +76,27 @@ def rank_worker(
 
 
 def make_feature_matrix(csv_file_path: str):
-
     from dataclasses import asdict
 
     import pandas as pd
 
-    def add_to_results(config: Config,
-                       success: Result,
-                       results_df: Optional[pd.DataFrame] = None):
+    def add_to_results(
+        config: Config, success: Result, results_df: pd.DataFrame | None = None
+    ):
         config_dict = asdict(config)
-        config_dict['prepare_finalize_type'] = config_dict[
-            'prepare_finalize_type'].__name__
-        config_dict['fused_experts_type'] = config_dict[
-            'fused_experts_type'].__name__
-        config_dict['per_tensor_act_quant'] = config.is_per_tensor_act_quant
-        quant_config_dict = config_dict['quant_config']
-        del config_dict['quant_config']
+        config_dict["prepare_finalize_type"] = config_dict[
+            "prepare_finalize_type"
+        ].__name__
+        config_dict["fused_experts_type"] = config_dict["fused_experts_type"].__name__
+        config_dict["per_tensor_act_quant"] = config.is_per_tensor_act_quant
+        quant_config_dict = config_dict["quant_config"]
+        del config_dict["quant_config"]
         if quant_config_dict is None:
             quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
             quant_config_dict = asdict(quant_config)
 
         config_dict |= quant_config_dict
-        result_dict = config_dict | {'success': success.name}
+        result_dict = config_dict | {"success": success.name}
 
         result_df = pd.DataFrame([result_dict])
         if results_df is None:
@@ -112,32 +117,41 @@ def make_feature_matrix(csv_file_path: str):
     Q_TYPES = MK_QUANT_CONFIGS
 
     combinations = list(
-        product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES))
+        product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)
+    )
 
-    results_df: Optional[pd.DataFrame] = None
+    results_df: pd.DataFrame | None = None
     for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
-            combinations):  #noqa: E501
-        config = Config(Ms=[m],
-                        K=k,
-                        N=n,
-                        E=e,
-                        topks=topks,
-                        dtype=dtype,
-                        prepare_finalize_type=pf_type,
-                        fused_experts_type=experts_type,
-                        quant_config=quant_config,
-                        world_size=2,
-                        fused_moe_chunk_size=None)
+        combinations
+    ):
+        config = Config(
+            Ms=[m],
+            K=k,
+            N=n,
+            E=e,
+            topks=topks,
+            dtype=dtype,
+            prepare_finalize_type=pf_type,
+            fused_experts_type=experts_type,
+            quant_config=quant_config,
+            world_size=2,
+            fused_moe_chunk_size=None,
+        )
 
         success = None
-        if config.is_valid():
+        if config.is_valid()[0]:
             print(f"Running config : {config.describe()} ...")
             try:
                 weights: WeightTensors = WeightTensors.make(config)
                 vllm_config, env_dict = config.make_env_data()
-                parallel_launch_with_config(config.world_size, rank_worker,
-                                            vllm_config, env_dict, config,
-                                            weights)
+                parallel_launch_with_config(
+                    config.world_size,
+                    rank_worker,
+                    vllm_config,
+                    env_dict,
+                    config,
+                    weights,
+                )
                 success = Result.PASS
             except Exception as _:
                 success = Result.FAIL
@@ -150,25 +164,33 @@ def make_feature_matrix(csv_file_path: str):
         results_df.to_csv(f"{csv_file_path}")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     import argparse
     from pathlib import Path
-    parser = argparse.ArgumentParser(description=(
-        "Make ModularKernel feature matrix \n"
-        "Example : python3 -m tests.kernels.moe.modular_kernel_tools.make_feature_matrix "  #noqa: E501
-        "-f ./feature_matrices/feature_matrix.csv"))
-
-    parser.add_argument("-f",
-                        "--feature-matrix-csv-file-path",
-                        type=str,
-                        required=True,
-                        help="File name to Generate a .csv file")
+
+    parser = argparse.ArgumentParser(
+        description=(
+            "Make ModularKernel feature matrix \n"
+            "Example : python3 -m tests.kernels.moe.modular_kernel_tools.make_feature_matrix "  # noqa: E501
+            "-f ./feature_matrices/feature_matrix.csv"
+        )
+    )
+
+    parser.add_argument(
+        "-f",
+        "--feature-matrix-csv-file-path",
+        type=str,
+        required=True,
+        help="File name to Generate a .csv file",
+    )
     args = parser.parse_args()
 
     csv_path = args.feature_matrix_csv_file_path
-    assert csv_path.endswith(
-        'csv'), f"Need a file path ending with .csv, got {csv_path}"
-    assert Path(csv_path).parent.is_dir(
-    ), f"Cannot find parent directory for {Path(csv_path).parent}"
+    assert csv_path.endswith("csv"), (
+        f"Need a file path ending with .csv, got {csv_path}"
+    )
+    assert Path(csv_path).parent.is_dir(), (
+        f"Cannot find parent directory for {Path(csv_path).parent}"
+    )
 
     make_feature_matrix(args.feature_matrix_csv_file_path)
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index 57a1da7b4b1a06cc598998c9a1739570a8b06790..d79fdfbe07af3497c287b86d5a339aec33b148d8 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -1,58 +1,69 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 
 # Fused experts and PrepareFinalize imports
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    BatchedDeepGemmExperts)
-from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-    BatchedTritonOrDeepGemmExperts)
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
+    BatchedDeepGemmExperts,
+)
+from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
+    BatchedTritonOrDeepGemmExperts,
+)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedTritonExperts, NaiveBatchedExperts)
-from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase,
-                                                        TritonExperts)
+    BatchedTritonExperts,
+    NaiveBatchedExperts,
+)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP)
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-    TritonOrDeepGemmExperts)
+    TritonOrDeepGemmExperts,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported)
+    cutlass_fp4_supported,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    cutlass_fp8_supported)
+    cutlass_fp8_supported,
+)
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.deep_gemm import is_deep_gemm_supported
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
 
 
 @dataclass
 class TestMoEQuantConfig:
-    quant_dtype: Union[torch.dtype, str, None]
+    quant_dtype: torch.dtype | str | None
     per_out_ch_quant: bool
     per_act_token_quant: bool
-    block_shape: Optional[list[int]]
+    block_shape: list[int] | None
 
 
 @dataclass
 class PrepareFinalizeInfo:
     activation_format: mk.FusedMoEActivationFormat
-    supported_dtypes: list[Union[torch.dtype, str]]
+    supported_dtypes: list[torch.dtype | str]
     blocked_quantization_support: bool
-    backend: Optional[str]
+    backend: str | None
     supports_apply_weight_on_input: bool = True
 
 
 @dataclass
 class ExpertInfo:
     activation_format: mk.FusedMoEActivationFormat
-    supported_dtypes: list[Union[torch.dtype, str]]
+    supported_dtypes: list[torch.dtype | str]
     blocked_quantization_support: bool
     supports_chunking: bool
     supports_expert_map: bool
@@ -60,8 +71,7 @@ class ExpertInfo:
     needs_deep_gemm: bool = False
 
 
-PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize,
-                            PrepareFinalizeInfo] = {}
+PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, PrepareFinalizeInfo] = {}
 EXPERT_INFO: dict[mk.FusedMoEPermuteExpertsUnpermute, ExpertInfo] = {}
 MK_ALL_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = []
 MK_MULTI_GPU_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = []
@@ -70,8 +80,11 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = []
 
 standard_format = mk.FusedMoEActivationFormat.Standard
 batched_format = mk.FusedMoEActivationFormat.BatchedExperts
-common_float_types: list[Union[torch.dtype, str]] = [
-    torch.float8_e4m3fn, torch.bfloat16, torch.float16, torch.float32
+common_float_types: list[torch.dtype | str] = [
+    torch.float8_e4m3fn,
+    torch.bfloat16,
+    torch.float16,
+    torch.float32,
 ]
 common_float_and_int_types = common_float_types + [torch.int8]
 nvfp4_types = ["nvfp4"]
@@ -81,9 +94,9 @@ fp8_types = [torch.float8_e4m3fn]
 def register_prepare_and_finalize(
     kind,
     activation_format: mk.FusedMoEActivationFormat,
-    supported_dtypes: list[Union[torch.dtype, str]],
+    supported_dtypes: list[torch.dtype | str],
     blocked_quantization_support: bool,
-    backend: Optional[str],
+    backend: str | None,
     force_multigpu: bool = False,
     supports_apply_weight_on_input: bool = True,
 ):
@@ -110,7 +123,7 @@ def register_prepare_and_finalize(
 def register_experts(
     kind,
     activation_format: mk.FusedMoEActivationFormat,
-    supported_dtypes: list[Union[torch.dtype, str]],
+    supported_dtypes: list[torch.dtype | str],
     blocked_quantization_support: bool,
     supports_chunking: bool,
     supports_expert_map: bool,
@@ -185,10 +198,12 @@ register_experts(
 
 # Disable on blackwell for now
 if has_deep_ep() and not current_platform.has_device_capability(100):
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
-        DeepEPHTPrepareAndFinalize)
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
-        DeepEPLLPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
+        DeepEPHTPrepareAndFinalize,
+    )
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
+        DeepEPLLPrepareAndFinalize,
+    )
 
     register_prepare_and_finalize(
         DeepEPHTPrepareAndFinalize,
@@ -208,7 +223,9 @@ if has_deep_ep() and not current_platform.has_device_capability(100):
 
 if has_pplx():
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize)
+        PplxPrepareAndFinalize,
+    )
+
     register_prepare_and_finalize(
         PplxPrepareAndFinalize,
         batched_format,
@@ -217,18 +234,19 @@ if has_pplx():
         backend="pplx",
     )
 
-if (has_flashinfer_cutlass_fused_moe()
-        and current_platform.has_device_capability(100)):
-    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-        FlashInferExperts)
+if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
+    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+        FlashInferExperts,
+    )
     from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
         FlashInferCutlassMoEPrepareAndFinalize,
-        create_flashinfer_prepare_finalize)
+        create_flashinfer_prepare_finalize,
+    )
 
     register_prepare_and_finalize(
         FlashInferCutlassMoEPrepareAndFinalize,
         standard_format,
-        nvfp4_types,
+        nvfp4_types + fp8_types,
         blocked_quantization_support=True,
         backend=None,
         force_multigpu=True,
@@ -238,7 +256,7 @@ if (has_flashinfer_cutlass_fused_moe()
     register_experts(
         FlashInferExperts,
         standard_format,
-        nvfp4_types,
+        nvfp4_types + fp8_types,
         blocked_quantization_support=True,
         supports_chunking=True,
         # Note: this is a hack to get it to run for now
@@ -267,7 +285,7 @@ if has_deep_gemm() and is_deep_gemm_supported():
         supports_expert_map=True,
         needs_matching_quant=False,
         needs_deep_gemm=True,
-    ),
+    )
     register_experts(
         BatchedTritonOrDeepGemmExperts,
         batched_format,
@@ -290,8 +308,11 @@ if has_deep_gemm() and is_deep_gemm_supported():
     )
 
 if cutlass_fp8_supported():
-    from vllm.model_executor.layers.fused_moe import (CutlassBatchedExpertsFp8,
-                                                      CutlassExpertsFp8)
+    from vllm.model_executor.layers.fused_moe import (
+        CutlassBatchedExpertsFp8,
+        CutlassExpertsFp8,
+    )
+
     register_experts(
         CutlassExpertsFp8,
         standard_format,
@@ -310,8 +331,8 @@ if cutlass_fp8_supported():
     )
 
 if cutlass_fp4_supported():
-    from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-        CutlassExpertsFp4)
+    from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4
+
     register_experts(
         CutlassExpertsFp4,
         standard_format,
@@ -321,33 +342,43 @@ if cutlass_fp4_supported():
         supports_expert_map=False,
     )
 
-MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
+MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
     None,
     # per-channel / per-column weights and per-tensor activations
-    TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
-                       per_out_ch_quant=True,
-                       per_act_token_quant=False,
-                       block_shape=None),
+    TestMoEQuantConfig(
+        quant_dtype=torch.float8_e4m3fn,
+        per_out_ch_quant=True,
+        per_act_token_quant=False,
+        block_shape=None,
+    ),
     # per-channel / per-column weights and per-token activations
-    TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
-                       per_out_ch_quant=True,
-                       per_act_token_quant=True,
-                       block_shape=None),
+    TestMoEQuantConfig(
+        quant_dtype=torch.float8_e4m3fn,
+        per_out_ch_quant=True,
+        per_act_token_quant=True,
+        block_shape=None,
+    ),
     # per-tensor weights and per-tensor activations
-    TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
-                       per_out_ch_quant=False,
-                       per_act_token_quant=False,
-                       block_shape=None),
+    TestMoEQuantConfig(
+        quant_dtype=torch.float8_e4m3fn,
+        per_out_ch_quant=False,
+        per_act_token_quant=False,
+        block_shape=None,
+    ),
     # per-tensor weights and per-token activations
-    TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
-                       per_out_ch_quant=False,
-                       per_act_token_quant=True,
-                       block_shape=None),
+    TestMoEQuantConfig(
+        quant_dtype=torch.float8_e4m3fn,
+        per_out_ch_quant=False,
+        per_act_token_quant=True,
+        block_shape=None,
+    ),
     # block-quantized weights and 128 block per-token activations
-    TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
-                       per_out_ch_quant=False,
-                       per_act_token_quant=False,
-                       block_shape=[128, 128]),
+    TestMoEQuantConfig(
+        quant_dtype=torch.float8_e4m3fn,
+        per_out_ch_quant=False,
+        per_act_token_quant=False,
+        block_shape=[128, 128],
+    ),
     # TODO (varun) : Should we test the following combinations ?
     # block-quantized weights and per-token activations
     # block-quantized weights and per-tensor activations
@@ -355,27 +386,29 @@ MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
 
 if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
     MK_QUANT_CONFIGS += [
-        TestMoEQuantConfig(quant_dtype="nvfp4",
-                           per_out_ch_quant=False,
-                           per_act_token_quant=False,
-                           block_shape=None),
+        TestMoEQuantConfig(
+            quant_dtype="nvfp4",
+            per_out_ch_quant=False,
+            per_act_token_quant=False,
+            block_shape=None,
+        ),
     ]
 
 
 def make_prepare_finalize(
     prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
-    backend: Optional[str],
+    backend: str | None,
     moe: FusedMoEConfig,
     quant_config: FusedMoEQuantConfig,
 ) -> mk.FusedMoEPrepareAndFinalize:
     if backend != "naive" and backend is not None:
-        prepare_finalize = FusedMoEMethodBase._maybe_make_prepare_finalize(
-            moe, quant_config)
+        prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
         assert prepare_finalize is not None
         return prepare_finalize
     elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
         return create_flashinfer_prepare_finalize(
-            use_dp=moe.moe_parallel_config.dp_size > 1)
+            use_dp=moe.moe_parallel_config.dp_size > 1
+        )
     else:
         return MoEPrepareAndFinalizeNoEP()
 
@@ -391,10 +424,10 @@ def make_cutlass_strides(
     n: int,
     k: int,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+    ab_strides1 = torch.full((e,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((e,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64)
     return ab_strides1, ab_strides2, c_strides1, c_strides2
 
 
@@ -405,7 +438,6 @@ def make_fused_experts(
     num_dispatchers: int,
     N: int,
 ) -> mk.FusedMoEPermuteExpertsUnpermute:
-
     batch_kwargs = {
         "max_num_tokens": moe.max_num_tokens,
         "num_dispatchers": num_dispatchers,
@@ -430,7 +462,7 @@ def make_fused_experts(
         print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
         experts = BatchedTritonOrDeepGemmExperts(**kwargs)
     elif fused_experts_type == DeepGemmExperts:
-        print("Making DeepGemmExperts {quant_config} ...")
+        print(f"Making DeepGemmExperts {quant_config} ...")
         experts = DeepGemmExperts(quant_config)
     elif fused_experts_type == TritonExperts:
         kwargs = quant_kwargs
diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py
index 459b785e6504e8d4e69214fdad4699e2f00d86c2..8528ee0cdee6c2dd0354bf6f733f4fd65292491a 100644
--- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py
+++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py
@@ -3,17 +3,16 @@
 import dataclasses
 import os
 import traceback
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Concatenate
 
 import torch
-from torch.multiprocessing import (
-    spawn)  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
+from torch.multiprocessing import spawn  # pyright: ignore[reportPrivateImportUsage]
+from typing_extensions import ParamSpec
 
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.distributed import (init_distributed_environment,
-                              initialize_model_parallel)
-from vllm.utils import get_open_port
+from vllm.distributed import init_distributed_environment, initialize_model_parallel
+from vllm.utils.network_utils import get_open_port
 
 ## Parallel Processes Utils
 
@@ -30,10 +29,11 @@ class ProcessGroupInfo:
     device: torch.device
 
 
-def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int,
-                     local_rank: int):
-
+def _set_vllm_config(
+    vllm_config: VllmConfig, world_size: int, rank: int, local_rank: int
+):
     import tempfile
+
     temp_file = tempfile.mkstemp()[1]
 
     with set_current_vllm_config(vllm_config):
@@ -46,13 +46,10 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int,
         )
 
         initialize_model_parallel(
-            tensor_model_parallel_size=vllm_config.parallel_config.
-            tensor_parallel_size,
-            pipeline_model_parallel_size=vllm_config.parallel_config.
-            pipeline_parallel_size,
+            tensor_model_parallel_size=vllm_config.parallel_config.tensor_parallel_size,
+            pipeline_model_parallel_size=vllm_config.parallel_config.pipeline_parallel_size,
         )
-        cpu_group = torch.distributed.new_group(list(range(world_size)),
-                                                backend="gloo")
+        cpu_group = torch.distributed.new_group(list(range(world_size)), backend="gloo")
     return cpu_group
 
 
@@ -62,10 +59,9 @@ def _worker_parallel_launch(
     world_local_size: int,
     node_rank: int,
     init_method: str,
-    worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any,
-                                 P], None],
-    vllm_config: Optional[VllmConfig],
-    env_dict: Optional[dict],
+    worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None],
+    vllm_config: VllmConfig | None,
+    env_dict: dict | None,
     *args: P.args,
     **kwargs: P.kwargs,
 ) -> None:
@@ -131,7 +127,8 @@ def parallel_launch_with_config(
             worker,
             vllm_config,
             env_dict,
-        ) + args,
+        )
+        + args,
         nprocs=world_size,
         join=True,
     )
diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
index 0da6ee3543521fee7bd3db87701d8d6a2ff84212..a3e264c5f5e2889827dbe79a61bb32fafb1c2b4f 100644
--- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
+++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
+from collections.abc import Callable
 from itertools import product
-from typing import Any, Callable
+from typing import Any
 
 import torch
 
@@ -14,28 +15,31 @@ from .common import Config, RankTensors, WeightTensors, make_modular_kernel
 from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config
 
 
-def do_profile(fn: Callable,
-               fn_kwargs: dict[Any, Any],
-               pgi: ProcessGroupInfo,
-               config: Config,
-               num_warmups: int = 5):
+def do_profile(
+    fn: Callable,
+    fn_kwargs: dict[Any, Any],
+    pgi: ProcessGroupInfo,
+    config: Config,
+    num_warmups: int = 5,
+):
     for _ in range(num_warmups):
         fn(**fn_kwargs)
 
     with torch.profiler.profile(
-            activities=[
-                torch.profiler.ProfilerActivity.CPU,
-                torch.profiler.ProfilerActivity.CUDA,
-            ],
-            with_stack=True,
-            record_shapes=True,
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA,
+        ],
+        with_stack=True,
+        record_shapes=True,
     ) as tprof:
         fn(**fn_kwargs)
         torch.cuda.synchronize(torch.cuda.current_device())
 
     # TODO (varun): Add a descriptive trace file name
     tprof.export_chrome_trace(
-        f"{config.torch_trace_dir_path}/m{config.M}_{pgi.rank}_trace.json")
+        f"{config.torch_trace_dir_path}/m{config.M}_{pgi.rank}_trace.json"
+    )
 
 
 def profile_modular_kernel(
@@ -82,6 +86,7 @@ def rank_worker(
 
     # sanity check
     from vllm import envs
+
     if config.fused_moe_chunk_size is not None:
         assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE
 
@@ -108,20 +113,25 @@ def rank_worker(
 def run(config: Config):
     weights: WeightTensors = WeightTensors.make(config)
     vllm_config, env_dict = config.make_env_data()
-    parallel_launch_with_config(config.world_size, rank_worker, vllm_config,
-                                env_dict, config, weights)
+    parallel_launch_with_config(
+        config.world_size, rank_worker, vllm_config, env_dict, config, weights
+    )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     from .cli_args import make_config, make_config_arg_parser
-    parser = make_config_arg_parser(description=(
-        "Run single prepare-finalize & fused-experts combination test"
-        "Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel "  #noqa: E501
-        "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
-    ))
+
+    parser = make_config_arg_parser(
+        description=(
+            "Run single prepare-finalize & fused-experts combination test"
+            "Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel "  # noqa: E501
+            "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
+        )
+    )
     args = parser.parse_args()
     assert args.torch_trace_dir_path is not None, (
-        "Please pass in a directory to store torch traces")
+        "Please pass in a directory to store torch traces"
+    )
     config = make_config(args)
 
     run(config)
diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py
index 1ad361ae07333eeef09649c4f855988b07f6bc94..90728c1e30a468b227f4d200a0308eb552898ce8 100644
--- a/tests/kernels/moe/parallel_utils.py
+++ b/tests/kernels/moe/parallel_utils.py
@@ -3,24 +3,28 @@
 """
 DeepEP test utilities
 """
+
 import dataclasses
 import os
 import traceback
-from typing import Callable, Optional
+from collections.abc import Callable
+from typing import Concatenate
 
 import torch
 from torch.distributed import ProcessGroup
-from torch.multiprocessing import (
-    spawn)  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
+from torch.multiprocessing import spawn  # pyright: ignore[reportPrivateImportUsage]
+from typing_extensions import ParamSpec
 
-from vllm.utils import get_open_port, has_deep_ep
+from vllm.utils.import_utils import has_deep_ep
+from vllm.utils.network_utils import get_open_port
 
 if has_deep_ep():
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
-        DeepEPHTPrepareAndFinalize)
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
-        DeepEPLLPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
+        DeepEPHTPrepareAndFinalize,
+    )
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
+        DeepEPLLPrepareAndFinalize,
+    )
 
 ## Parallel Processes Utils
 
@@ -96,7 +100,8 @@ def parallel_launch(
             0,
             f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}",
             worker,
-        ) + args,
+        )
+        + args,
         nprocs=world_size,
         join=True,
     )
@@ -118,48 +123,57 @@ class DeepEPLLArgs:
     use_fp8_dispatch: bool
 
 
-def make_deepep_ht_a2a(pg: ProcessGroup,
-                       pgi: ProcessGroupInfo,
-                       dp_size: int,
-                       ht_args: DeepEPHTArgs,
-                       q_dtype: Optional[torch.dtype] = None,
-                       block_shape: Optional[list[int]] = None):
-
+def make_deepep_ht_a2a(
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    ht_args: DeepEPHTArgs,
+    q_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
+):
     import deep_ep
 
     # high throughput a2a
     num_nvl_bytes = 1024 * 1024 * 1024  # 1GB
     num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1
-    buffer = deep_ep.Buffer(group=pg,
-                            num_nvl_bytes=num_nvl_bytes,
-                            num_rdma_bytes=num_rdma_bytes,
-                            low_latency_mode=low_latency_mode,
-                            num_qps_per_rank=num_qps_per_rank)
-    return DeepEPHTPrepareAndFinalize(buffer=buffer,
-                                      num_dispatchers=pgi.world_size,
-                                      dp_size=dp_size,
-                                      rank_expert_offset=pgi.rank *
-                                      ht_args.num_local_experts)
-
-
-def make_deepep_ll_a2a(pg: ProcessGroup,
-                       pgi: ProcessGroupInfo,
-                       deepep_ll_args: DeepEPLLArgs,
-                       q_dtype: Optional[torch.dtype] = None,
-                       block_shape: Optional[list[int]] = None):
+    buffer = deep_ep.Buffer(
+        group=pg,
+        num_nvl_bytes=num_nvl_bytes,
+        num_rdma_bytes=num_rdma_bytes,
+        low_latency_mode=low_latency_mode,
+        num_qps_per_rank=num_qps_per_rank,
+    )
+    return DeepEPHTPrepareAndFinalize(
+        buffer=buffer,
+        num_dispatchers=pgi.world_size,
+        dp_size=dp_size,
+        rank_expert_offset=pgi.rank * ht_args.num_local_experts,
+    )
+
 
+def make_deepep_ll_a2a(
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    deepep_ll_args: DeepEPLLArgs,
+    q_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
+):
     import deep_ep
 
     # low-latency a2a
     num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
-        deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size,
-        pgi.world_size, deepep_ll_args.num_experts)
+        deepep_ll_args.max_tokens_per_rank,
+        deepep_ll_args.hidden_size,
+        pgi.world_size,
+        deepep_ll_args.num_experts,
+    )
 
-    buffer = deep_ep.Buffer(group=pg,
-                            num_rdma_bytes=num_rdma_bytes,
-                            low_latency_mode=True,
-                            num_qps_per_rank=deepep_ll_args.num_experts //
-                            pgi.world_size)
+    buffer = deep_ep.Buffer(
+        group=pg,
+        num_rdma_bytes=num_rdma_bytes,
+        low_latency_mode=True,
+        num_qps_per_rank=deepep_ll_args.num_experts // pgi.world_size,
+    )
 
     return DeepEPLLPrepareAndFinalize(
         buffer=buffer,
@@ -169,17 +183,20 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
     )
 
 
-def make_deepep_a2a(pg: ProcessGroup,
-                    pgi: ProcessGroupInfo,
-                    dp_size: int,
-                    deepep_ht_args: Optional[DeepEPHTArgs],
-                    deepep_ll_args: Optional[DeepEPLLArgs],
-                    q_dtype: Optional[torch.dtype] = None,
-                    block_shape: Optional[list[int]] = None):
+def make_deepep_a2a(
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    deepep_ht_args: DeepEPHTArgs | None,
+    deepep_ll_args: DeepEPLLArgs | None,
+    q_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
+):
     if deepep_ht_args is not None:
         assert deepep_ll_args is None
-        return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype,
-                                  block_shape)
+        return make_deepep_ht_a2a(
+            pg, pgi, dp_size, deepep_ht_args, q_dtype, block_shape
+        )
 
     assert deepep_ll_args is not None
     return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)
diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py
index afec97e8cffd009f674d0cccc5126a6cd997d0d8..59cecd60d3d61be7b82c730f3a79790ee31d99e5 100644
--- a/tests/kernels/moe/test_batched_deepgemm.py
+++ b/tests/kernels/moe/test_batched_deepgemm.py
@@ -5,13 +5,14 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    BatchedDeepGemmExperts)
-from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config)
+    BatchedDeepGemmExperts,
+)
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize, BatchedTritonExperts)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+    BatchedPrepareAndFinalize,
+    BatchedTritonExperts,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported
 
 from .test_deepgemm import make_block_quant_fp8_weights
@@ -19,15 +20,15 @@ from .test_deepgemm import make_block_quant_fp8_weights
 BLOCK_SIZE = [128, 128]
 
 
-@pytest.mark.skipif(not is_deep_gemm_supported(),
-                    reason="Requires deep_gemm kernels")
+@pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels")
 @pytest.mark.parametrize("E", [16, 32])  # number of experts
 @pytest.mark.parametrize("T", [256, 512])  # tokens per expert
 @pytest.mark.parametrize("K", [128, 256])  # hidden dim
 @pytest.mark.parametrize("N", [512, 1024])  # intermediate dim per expert
 @pytest.mark.parametrize("topk", [2, 4])
-def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
-                                    monkeypatch):
+def test_batched_deepgemm_vs_triton(
+    E: int, T: int, K: int, N: int, topk: int, monkeypatch
+):
     """Compare BatchedDeepGemmExperts to BatchedTritonExperts."""
 
     monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 7e79828937c77f9b9bfbc104274d9f8f2c996131..2285709fa7d602e954a9e3c1cddea3508ca7b524 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -2,56 +2,50 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 import torch
 
-from tests.kernels.moe.utils import (batched_moe,
-                                     make_quantized_test_activations,
-                                     make_test_weights, naive_batched_moe)
+from tests.kernels.moe.utils import (
+    batched_moe,
+    make_quantized_test_activations,
+    make_test_weights,
+    naive_batched_moe,
+)
 from tests.kernels.quant_utils import native_batched_masked_quant_matmul
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    invoke_moe_batched_triton_kernel)
+    invoke_moe_batched_triton_kernel,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl
 
 MNK_FACTORS = [
     (1, 128, 128),
-    (1, 128, 2048),
     (1, 512, 512),
-    (1, 1024, 128),
     (1, 1024, 2048),
     (32, 128, 128),
     (32, 512, 512),
     (32, 1024, 2048),
-    (45, 128, 128),
     (45, 128, 2048),
-    (45, 512, 512),
     (45, 1024, 128),
-    (45, 1024, 2048),
     (64, 512, 512),
     (64, 1024, 2048),
-    (222, 128, 128),
     (222, 128, 2048),
-    (222, 1024, 128),
     (222, 1024, 2048),
 ]
 NUM_EXPERTS = [8, 64]
 TOP_KS = [1, 2, 6]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 @dataclass
 class BatchedMMConfig:
     in_dtype: torch.dtype
-    quant_dtype: Optional[torch.dtype]
+    quant_dtype: torch.dtype | None
     out_dtype: torch.dtype
     num_experts: int
     max_tokens_per_expert: int
@@ -68,23 +62,32 @@ class BatchedMMTensors:
 
     @staticmethod
     def make_tensors(config: BatchedMMConfig):
-        A = torch.randn(
-            (config.num_experts, config.max_tokens_per_expert, config.K),
+        A = (
+            torch.randn(
+                (config.num_experts, config.max_tokens_per_expert, config.K),
+                device="cuda",
+                dtype=config.in_dtype,
+            )
+            / 10
+        )
+        B = torch.randn(
+            (config.num_experts, config.N, config.K),
             device="cuda",
-            dtype=config.in_dtype) / 10
-        B = torch.randn((config.num_experts, config.N, config.K),
-                        device="cuda",
-                        dtype=config.in_dtype)
+            dtype=config.in_dtype,
+        )
         C = torch.zeros(
             (config.num_experts, config.max_tokens_per_expert, config.N),
             device="cuda",
-            dtype=config.out_dtype)
+            dtype=config.out_dtype,
+        )
 
-        num_expert_tokens = torch.randint(low=0,
-                                          high=config.max_tokens_per_expert,
-                                          size=(config.num_experts, ),
-                                          device="cuda",
-                                          dtype=torch.int32)
+        num_expert_tokens = torch.randint(
+            low=0,
+            high=config.max_tokens_per_expert,
+            size=(config.num_experts,),
+            device="cuda",
+            dtype=torch.int32,
+        )
 
         return BatchedMMTensors(A, B, C, num_expert_tokens)
 
@@ -96,14 +99,28 @@ class BatchedMMTensors:
 @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
-def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
-                    N: int, dtype: torch.dtype,
-                    block_shape: Optional[list[int]],
-                    per_act_token_quant: bool):
+def test_batched_mm(
+    num_experts: int,
+    max_tokens_per_expert: int,
+    K: int,
+    N: int,
+    dtype: torch.dtype,
+    block_shape: list[int] | None,
+    per_act_token_quant: bool,
+):
+    """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
+    and those tests will be skipped on unsupported hardware."""
     current_platform.seed_everything(7)
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
+    if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability(
+        89
+    ):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
     if (per_act_token_quant or block_shape is not None) and not use_fp8_w8a8:
         pytest.skip("Don't test blocking for non-quantized types.")
 
@@ -117,11 +134,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         act_dtype = dtype
         quant_dtype = None
 
-    num_expert_tokens = torch.randint(low=0,
-                                      high=max_tokens_per_expert,
-                                      size=(num_experts, ),
-                                      device="cuda",
-                                      dtype=torch.int32)
+    num_expert_tokens = torch.randint(
+        low=0,
+        high=max_tokens_per_expert,
+        size=(num_experts,),
+        device="cuda",
+        dtype=torch.int32,
+    )
 
     A, A_q, A_scale = make_quantized_test_activations(
         num_experts,
@@ -151,7 +170,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     compute_tl_dtype = {
         torch.float16: tl.float16,
         torch.bfloat16: tl.bfloat16,
-        torch.float32: tl.float32
+        torch.float32: tl.float32,
     }[test_output.dtype]
 
     assert A_q.dtype == B_q.dtype
@@ -173,7 +192,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         config={
             "BLOCK_SIZE_M": 16,
             "BLOCK_SIZE_N": 16,
-            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
+            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32,
         },
         per_act_token_quant=per_act_token_quant,
         block_shape=block_shape,
@@ -186,11 +205,16 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         num_expert_tokens,
     )
 
-    q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
-                                                      num_expert_tokens,
-                                                      A_scale, B_scale,
-                                                      block_shape,
-                                                      per_act_token_quant)
+    q_ref_output = native_batched_masked_quant_matmul(
+        A_q,
+        B_q,
+        q_ref_output,
+        num_expert_tokens,
+        A_scale,
+        B_scale,
+        block_shape,
+        per_act_token_quant,
+    )
 
     rtol, atol = {
         torch.float16: (6e-2, 6e-2),
@@ -217,13 +241,22 @@ def test_fused_moe_batched_experts(
     topk: int,
     dtype: torch.dtype,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     input_scales: bool,
 ):
+    """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
+    and those tests will be skipped on unsupported hardware."""
     current_platform.seed_everything(7)
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
+    if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability(
+        89
+    ):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
     if topk > e:
         pytest.skip("topk > e")
 
@@ -308,12 +341,6 @@ def test_fused_moe_batched_experts(
             block_shape=block_shape,
         )
 
-    torch.testing.assert_close(batched_output,
-                               baseline_output,
-                               atol=3e-2,
-                               rtol=2e-2)
+    torch.testing.assert_close(batched_output, baseline_output, atol=3e-2, rtol=2e-2)
 
-    torch.testing.assert_close(triton_output,
-                               batched_output,
-                               atol=2e-2,
-                               rtol=2e-2)
+    torch.testing.assert_close(triton_output, batched_output, atol=2e-2, rtol=2e-2)
diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py
index da383e18c37215b01d2ce3d2deab484a9322f426..88db4b3e537c253b68eb91e885b78122426582f1 100644
--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -5,31 +5,34 @@ import pytest
 import torch
 
 from tests.kernels.moe.utils import make_test_quant_config, make_test_weights
-from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
-                                       native_w8a8_block_matmul)
+from tests.kernels.quant_utils import (
+    native_per_token_group_quant_fp8,
+    native_w8a8_block_matmul,
+)
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-    _valid_deep_gemm_shape, deep_gemm_moe_fp8)
+    _valid_deep_gemm_shape,
+    deep_gemm_moe_fp8,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    fused_topk, modular_triton_fused_moe)
+    fused_topk,
+    modular_triton_fused_moe,
+)
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.deep_gemm import (
+    get_mk_alignment_for_contiguous_layout,
+    is_deep_gemm_e8m0_used,
+)
+from vllm.utils.import_utils import has_deep_gemm
 
 dg_available = has_deep_gemm()
 
-if dg_available:
-    from deep_gemm import get_m_alignment_for_contiguous_layout
-
 if current_platform.get_device_capability() < (9, 0):
-    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
-                allow_module_level=True)
+    pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
@@ -37,57 +40,43 @@ DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # and its hidden size is 7168.
 MNK_FACTORS = [
     (1, 128, 128),
-    (1, 512, 512),
     (1, 128, 7168),
     (1, 1024, 7168),
     (1, 4608, 128),
-    (1, 4608, 512),
     (1, 4608, 7168),
     (83, 128, 128),
     (83, 512, 512),
-    (83, 1024, 7168),
     (83, 4608, 512),
     (83, 4608, 7168),
-    (128, 128, 128),
     (128, 512, 512),
     (128, 1024, 7168),
-    (128, 4608, 512),
     (128, 4608, 7168),
     (2048, 128, 128),
     (2048, 1024, 7168),
     (2048, 4608, 512),
     (2048, 4608, 7168),
     (8192, 128, 128),
-    (8192, 512, 512),
     (8192, 128, 7168),
     (8192, 1024, 7168),
-    (8192, 4608, 512),
     (8192, 4608, 7168),
 ]
 
 MNK_FACTORS_DG = [
     (128, 128, 128),
-    (128, 512, 512),
     (128, 128, 7168),
     (128, 1024, 7168),
     (128, 4608, 128),
-    (128, 4608, 512),
     (128, 4608, 7168),
-    (192, 128, 128),
     (192, 512, 512),
     (192, 1024, 7168),
-    (192, 4608, 512),
     (192, 4608, 7168),
     (1335, 128, 128),
     (1335, 1024, 7168),
     (1335, 4608, 512),
     (1335, 4608, 7168),
     (2048, 128, 128),
-    (2048, 512, 512),
     (2048, 128, 7168),
     (2048, 1024, 7168),
-    (2048, 4608, 128),
-    (2048, 4608, 512),
     (2048, 4608, 7168),
 ]
 
@@ -97,8 +86,7 @@ TOP_KS = [1, 2, 6]
 SEEDS = [0]
 
 
-def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids,
-                             block_shape):
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids, block_shape):
     """Fused moe with block-wise quantization using native torch."""
     B, D = a.shape
     topk = topk_ids.size(1)
@@ -114,23 +102,17 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids,
     for i in range(w1.shape[0]):
         mask = topk_ids == i
         if mask.sum():
-            inter_out = native_w8a8_block_matmul(a_q[mask],
-                                                 w1[i],
-                                                 a_s[mask],
-                                                 w1_s[i],
-                                                 block_shape,
-                                                 output_dtype=a.dtype)
+            inter_out = native_w8a8_block_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
             act_out = SiluAndMul().forward_native(inter_out)
-            act_out_q, act_out_s = native_per_token_group_quant_fp8(
-                act_out, block_k)
-            out[mask] = native_w8a8_block_matmul(act_out_q,
-                                                 w2[i],
-                                                 act_out_s,
-                                                 w2_s[i],
-                                                 block_shape,
-                                                 output_dtype=a.dtype)
-    return (out.view(B, -1, w2.shape[1]) *
-            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k)
+            out[mask] = native_w8a8_block_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
 
 
 # Skip all tests if CUDA is not available
@@ -149,8 +131,9 @@ def setup_cuda():
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
-def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
-                                  monkeypatch):
+def test_w8a8_block_fp8_fused_moe(
+    M, N, K, E, topk, block_size, dtype, seed, monkeypatch
+):
     if topk > E:
         pytest.skip(f"Skipping test; topk={topk} > E={E}")
 
@@ -188,12 +171,9 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
             block_size,
         )
 
-        out = fused_experts(a,
-                            w1,
-                            w2,
-                            topk_weights,
-                            topk_ids,
-                            quant_config=quant_config)
+        out = fused_experts(
+            a, w1, w2, topk_weights, topk_ids, quant_config=quant_config
+        )
 
         m_out = m_fused_moe(a, w1, w2, topk_weights, topk_ids)
 
@@ -210,8 +190,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
 @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
 @pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Not E8M0 scale MOE")
 @torch.inference_mode()
-def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
-                                            monkeypatch):
+def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch):
     if topk > E:
         pytest.skip(f"Skipping test: topk={topk} > E={E}")
 
@@ -223,8 +202,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
     torch.manual_seed(seed)
 
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
-    block_m = get_m_alignment_for_contiguous_layout()
-    block_size = [block_m, block_m]
+    block_size = get_mk_alignment_for_contiguous_layout()
     dtype = torch.bfloat16
 
     a = torch.randn((M, K), dtype=dtype) / 10
@@ -245,36 +223,38 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
     # setup code in case we are able to revisit this later.
     use_compile = False
 
-    use_cudagraph = (chunk_size < M and N >= 1024 and K >= 1024
-                     and current_platform.is_cuda_alike())
+    use_cudagraph = (
+        chunk_size < M and N >= 1024 and K >= 1024 and current_platform.is_cuda_alike()
+    )
 
     topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False)
 
     # Set the context to avoid lots of warning spam.
     with set_current_vllm_config(vllm_config):
-        ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weights,
-                                           topk_ids, block_size)
+        ref_out = torch_w8a8_block_fp8_moe(
+            a, w1, w2, w1_s, w2_s, topk_weights, topk_ids, block_size
+        )
 
         if use_compile:
-            deep_gemm_moe_fp8_fn = torch.compile(deep_gemm_moe_fp8,
-                                                 backend="inductor",
-                                                 fullgraph=True)
+            deep_gemm_moe_fp8_fn = torch.compile(
+                deep_gemm_moe_fp8, backend="inductor", fullgraph=True
+            )
             torch._dynamo.mark_dynamic(a, 0)
             torch._dynamo.mark_dynamic(topk_weights, 0)
             torch._dynamo.mark_dynamic(topk_ids, 0)
         else:
             deep_gemm_moe_fp8_fn = deep_gemm_moe_fp8
 
-        out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
-                                   topk_ids)
+        out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids)
 
         if use_cudagraph:
             out.fill_(0)
             stream = torch.cuda.Stream()
             graph = torch.cuda.CUDAGraph()
             with torch.cuda.graph(graph, stream=stream):
-                out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
-                                           topk_ids)
+                out = deep_gemm_moe_fp8_fn(
+                    a, w1, w2, w1_s, w2_s, topk_weights, topk_ids
+                )
             torch.cuda.synchronize()
             graph.replay()
             torch.cuda.synchronize()
diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py
index 041a13ca5585abae7af797a4a5827b320011d7bf..e35ca4caa9dbc4433b192569e9d5e202bb1b95dc 100644
--- a/tests/kernels/moe/test_block_int8.py
+++ b/tests/kernels/moe/test_block_int8.py
@@ -5,51 +5,42 @@ import pytest
 import torch
 
 from tests.kernels.moe.utils import make_test_quant_config
-from tests.kernels.quant_utils import (native_per_token_group_quant_int8,
-                                       native_w8a8_block_matmul)
+from tests.kernels.quant_utils import (
+    native_per_token_group_quant_int8,
+    native_w8a8_block_matmul,
+)
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
 
 if current_platform.get_device_capability() < (7, 0):
-    pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
-                allow_module_level=True)
+    pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
-DTYPES = [torch.half, torch.bfloat16]
+DTYPES = [torch.bfloat16]
 
 MNK_FACTORS = [
     (1, 128, 128),
-    (1, 512, 512),
     (1, 128, 7168),
     (1, 1024, 7168),
-    (1, 4096, 128),
     (1, 4096, 512),
     (1, 4096, 7168),
-    (33, 128, 128),
     (33, 512, 512),
     (33, 128, 7168),
     (33, 1024, 7168),
     (33, 4096, 128),
-    (33, 4096, 512),
     (33, 4096, 7168),
     (128, 128, 128),
-    (128, 512, 512),
     (128, 1024, 7168),
     (128, 4096, 512),
     (128, 4096, 7168),
-    (222, 128, 128),
     (222, 512, 512),
     (222, 1024, 7168),
-    (222, 4096, 512),
     (222, 4096, 7168),
     (2048, 128, 128),
     (2048, 1024, 7168),
-    (2048, 4096, 512),
     (2048, 4096, 4096),
 ]
 
@@ -77,24 +68,18 @@ def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
     for i in range(w1.shape[0]):
         mask = topk_ids == i
         if mask.sum():
-            inter_out = native_w8a8_block_matmul(a_q[mask],
-                                                 w1[i],
-                                                 a_s[mask],
-                                                 w1_s[i],
-                                                 block_shape,
-                                                 output_dtype=a.dtype)
+            inter_out = native_w8a8_block_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
             act_out = SiluAndMul().forward_native(inter_out)
-            act_out_q, act_out_s = native_per_token_group_quant_int8(
-                act_out, block_k)
+            act_out_q, act_out_s = native_per_token_group_quant_int8(act_out, block_k)
             act_out = act_out.to(torch.float32)
-            out[mask] = native_w8a8_block_matmul(act_out_q,
-                                                 w2[i],
-                                                 act_out_s,
-                                                 w2_s[i],
-                                                 block_shape,
-                                                 output_dtype=a.dtype)
-    return (out.view(B, -1, w2.shape[1]) *
-            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+            out[mask] = native_w8a8_block_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
 
 
 @pytest.fixture(autouse=True, scope="module")
@@ -131,15 +116,19 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
 
     # Set the context to avoid lots of warning spam.
     with set_current_vllm_config(vllm_config):
-        out = fused_experts(a,
-                            w1,
-                            w2,
-                            topk_weights,
-                            topk_ids,
-                            quant_config=quant_config)
-        ref_out = torch_w8a8_block_int8_moe(a, w1, w2, quant_config.w1_scale,
-                                            quant_config.w2_scale, score, topk,
-                                            block_size)
+        out = fused_experts(
+            a, w1, w2, topk_weights, topk_ids, quant_config=quant_config
+        )
+        ref_out = torch_w8a8_block_int8_moe(
+            a,
+            w1,
+            w2,
+            quant_config.w1_scale,
+            quant_config.w2_scale,
+            score,
+            topk,
+            block_size,
+        )
 
     # Check results
     torch.testing.assert_close(out, ref_out, atol=0.065, rtol=0.065)
diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py
index 1768baaf1ca71a43dcbf27da15ad2d36aa55c00a..39138be83bccb8477659fdebf48630f658a3a51c 100644
--- a/tests/kernels/moe/test_count_expert_num_tokens.py
+++ b/tests/kernels/moe/test_count_expert_num_tokens.py
@@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels
 """
 
 import dataclasses
-from typing import Optional
 
 import pytest
 import torch
@@ -15,9 +14,8 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
 
 @dataclasses.dataclass
 class TestTensors:
-
     topk_ids: torch.Tensor
-    expert_map: Optional[torch.Tensor] = None
+    expert_map: torch.Tensor | None = None
 
     def to_device(self, device: str):
         self.topk_ids = self.topk_ids.to(device=device)
@@ -25,32 +23,31 @@ class TestTensors:
             self.expert_map = self.expert_map.to(device=device)
 
     @staticmethod
-    def make(num_tokens: int, num_topk: int, num_experts: int, device: str,
-             topk_ids_dtype: torch.dtype) -> "TestTensors":
-
+    def make(
+        num_tokens: int,
+        num_topk: int,
+        num_experts: int,
+        device: str,
+        topk_ids_dtype: torch.dtype,
+    ) -> "TestTensors":
         # make topk ids
-        topk_ids = torch.empty((num_tokens, num_topk),
-                               device=device,
-                               dtype=torch.int64)
+        topk_ids = torch.empty((num_tokens, num_topk), device=device, dtype=torch.int64)
         for x in range(num_tokens):
             topk_ids[x] = torch.randperm(num_experts)[:num_topk]
         topk_ids = topk_ids.to(dtype=torch.int64)
         return TestTensors(topk_ids=topk_ids)
 
-    def with_ep_rank(self, ep_rank: int, num_global_experts: int,
-                     num_local_experts: int, device: str):
+    def with_ep_rank(
+        self, ep_rank: int, num_global_experts: int, num_local_experts: int, device: str
+    ):
         # make an expert map
-        expert_map = torch.empty((num_global_experts),
-                                 device=device,
-                                 dtype=torch.int32)
+        expert_map = torch.empty((num_global_experts), device=device, dtype=torch.int32)
         expert_map.fill_(-1)
         s = ep_rank * num_local_experts
         e = s + num_local_experts
-        expert_map[s:e] = torch.tensor(list(range(num_local_experts)),
-                                       device=device)
+        expert_map[s:e] = torch.tensor(list(range(num_local_experts)), device=device)
 
-        return TestTensors(topk_ids=self.topk_ids.clone(),
-                           expert_map=expert_map)
+        return TestTensors(topk_ids=self.topk_ids.clone(), expert_map=expert_map)
 
 
 def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor):
@@ -68,49 +65,49 @@ def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor):
         expert_num_tokens[eid] += count
 
 
-def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
-                                      num_experts: int, ep_size: int,
-                                      topk_ids_dtype: torch.dtype):
-
+def do_test_compute_expert_num_tokens(
+    num_tokens: int,
+    num_topk: int,
+    num_experts: int,
+    ep_size: int,
+    topk_ids_dtype: torch.dtype,
+):
     assert num_topk <= num_experts
 
-    tt = TestTensors.make(num_tokens,
-                          num_topk,
-                          num_experts,
-                          topk_ids_dtype=topk_ids_dtype,
-                          device="cpu")
+    tt = TestTensors.make(
+        num_tokens, num_topk, num_experts, topk_ids_dtype=topk_ids_dtype, device="cpu"
+    )
 
     num_global_experts = num_experts
     assert num_global_experts % ep_size == 0
     num_local_experts = num_global_experts // ep_size
     for ep_rank in range(ep_size):
-        tt_rank = tt.with_ep_rank(ep_rank, num_global_experts,
-                                  num_local_experts, "cpu")
+        tt_rank = tt.with_ep_rank(ep_rank, num_global_experts, num_local_experts, "cpu")
 
-        ref_expert_num_tokens = torch.zeros((num_local_experts),
-                                            device="cpu",
-                                            dtype=torch.int32)
+        ref_expert_num_tokens = torch.zeros(
+            (num_local_experts), device="cpu", dtype=torch.int32
+        )
         ref_impl(tt_rank, ref_expert_num_tokens)
         ref_expert_num_tokens = ref_expert_num_tokens.to("cuda")
 
         tt_rank.to_device("cuda")
         # Test with expert_map
         triton_expert_num_tokens_w_emap = count_expert_num_tokens(
-            tt_rank.topk_ids, num_local_experts, tt_rank.expert_map)
+            tt_rank.topk_ids, num_local_experts, tt_rank.expert_map
+        )
 
         # Test without expert map
         topk_ids = tt_rank.expert_map[tt_rank.topk_ids].to(topk_ids_dtype)
         triton_expert_num_tokens_wo_emap = count_expert_num_tokens(
-            topk_ids, num_local_experts, expert_map=None)
+            topk_ids, num_local_experts, expert_map=None
+        )
 
-        torch.testing.assert_close(ref_expert_num_tokens,
-                                   triton_expert_num_tokens_w_emap,
-                                   atol=0,
-                                   rtol=0)
-        torch.testing.assert_close(ref_expert_num_tokens,
-                                   triton_expert_num_tokens_wo_emap,
-                                   atol=0,
-                                   rtol=0)
+        torch.testing.assert_close(
+            ref_expert_num_tokens, triton_expert_num_tokens_w_emap, atol=0, rtol=0
+        )
+        torch.testing.assert_close(
+            ref_expert_num_tokens, triton_expert_num_tokens_wo_emap, atol=0, rtol=0
+        )
 
 
 @pytest.mark.parametrize("num_tokens", [1, 4, 8, 11, 127, 128, 3333, 7317])
@@ -118,22 +115,29 @@ def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
 @pytest.mark.parametrize("num_experts", [64])
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
 @pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
-def test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
-                                   num_experts: int, ep_size: int,
-                                   topk_ids_dtype: torch.dtype):
-    do_test_compute_expert_num_tokens(num_tokens, num_topk, num_experts,
-                                      ep_size, topk_ids_dtype)
+def test_compute_expert_num_tokens(
+    num_tokens: int,
+    num_topk: int,
+    num_experts: int,
+    ep_size: int,
+    topk_ids_dtype: torch.dtype,
+):
+    do_test_compute_expert_num_tokens(
+        num_tokens, num_topk, num_experts, ep_size, topk_ids_dtype
+    )
 
 
 @pytest.mark.parametrize("numel", list(range(1, 8192, 111)))
 @pytest.mark.parametrize("num_experts", [32])
 @pytest.mark.parametrize("ep_size", [2])
 @pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
-def test_compute_expert_num_tokens_from_numel(numel: int, num_experts: int,
-                                              ep_size: int,
-                                              topk_ids_dtype: torch.dtype):
-    do_test_compute_expert_num_tokens(num_tokens=numel,
-                                      num_topk=1,
-                                      num_experts=num_experts,
-                                      ep_size=ep_size,
-                                      topk_ids_dtype=topk_ids_dtype)
+def test_compute_expert_num_tokens_from_numel(
+    numel: int, num_experts: int, ep_size: int, topk_ids_dtype: torch.dtype
+):
+    do_test_compute_expert_num_tokens(
+        num_tokens=numel,
+        num_topk=1,
+        num_experts=num_experts,
+        ep_size=ep_size,
+        topk_ids_dtype=topk_ids_dtype,
+    )
diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py
index 3b1618dacac7b72081b068f5f77c8be5bcd30e7d..1c10cb3b2c699e64916bfbd9a3db3d23e9f7e265 100644
--- a/tests/kernels/moe/test_cutlass_grouped_gemm.py
+++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py
@@ -13,23 +13,28 @@ from tests.kernels.moe.utils import per_token_cast_to_fp8
 from tests.kernels.utils import baseline_scaled_mm
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
 from vllm.utils.deep_gemm import per_block_cast_to_fp8
+from vllm.utils.math_utils import cdiv
 
 
-@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
-    (4, 8192, 7168, 4096),
-    (4, 8192, 2048, 7168),
-    (8, 4096, 7168, 4096),
-    (8, 4096, 2048, 7168),
-    (32, 1024, 7168, 4096),
-    (32, 1024, 2048, 7168),
-])
+@pytest.mark.parametrize(
+    "num_groups, expected_m_per_group, k, n",
+    [
+        (4, 8192, 7168, 4096),
+        (4, 8192, 2048, 7168),
+        (8, 4096, 7168, 4096),
+        (8, 4096, 2048, 7168),
+        (32, 1024, 7168, 4096),
+        (32, 1024, 2048, 7168),
+    ],
+)
 @pytest.mark.parametrize("out_dtype", [torch.float16])
 @pytest.mark.skipif(
     (lambda x: x is None or x.to_int() != 100)(
-        current_platform.get_device_capability()),
-    reason="Block Scaled Grouped GEMM is only supported on SM100.")
+        current_platform.get_device_capability()
+    ),
+    reason="Block Scaled Grouped GEMM is only supported on SM100.",
+)
 def test_cutlass_grouped_gemm(
     num_groups: int,
     expected_m_per_group: int,
@@ -40,8 +45,7 @@ def test_cutlass_grouped_gemm(
     device = "cuda"
     alignment = 128
     group_ms = [
-        int(expected_m_per_group * random.uniform(0.7, 1.3))
-        for _ in range(num_groups)
+        int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)
     ]
     m = sum([cdiv(m, alignment) * alignment for m in group_ms])
 
@@ -58,20 +62,22 @@ def test_cutlass_grouped_gemm(
     expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
 
     x_fp8 = per_token_cast_to_fp8(x)
-    y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn),
-             torch.empty((num_groups, cdiv(n, 128), k // 128),
-                         device=device,
-                         dtype=torch.float))
+    y_fp8 = (
+        torch.empty_like(y, dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float
+        ),
+    )
     for i in range(num_groups):
         y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128])
 
     for i in range(num_groups):
-        a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]]
-        a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]]
+        a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]]
+        a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]]
         b = y_fp8[0][i].t()
         b_scale = y_fp8[1][i].t()
         baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
-        ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline
+        ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline
 
     ops.cutlass_blockwise_scaled_grouped_mm(
         out,
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index ca6be767dab39606a4071108456ebb47b2a4c635..c15837f145705482a5b68e88253b0bac3c2fdbf1 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -3,7 +3,6 @@
 import copy
 import dataclasses
 from math import prod
-from typing import Optional
 
 import pytest
 import torch
@@ -11,13 +10,15 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG, fp8_w8a8_moe_quant_config)
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+    fp8_w8a8_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-    cutlass_moe_fp8, run_cutlass_moe_fp8)
-from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
-                                                            fused_topk)
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
+    cutlass_moe_fp8,
+    run_cutlass_moe_fp8,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.platforms import current_platform
 
 NUM_EXPERTS = [40, 64]
@@ -25,28 +26,22 @@ TOP_KS = [6, 8]
 
 MNK_FACTORS = [
     (2, 1024, 1024),
-    (2, 1024, 1536),
     (2, 3072, 1024),
     (2, 3072, 1536),
     (7, 3072, 1536),
     (64, 1024, 1024),
     (64, 1024, 1536),
     (64, 3072, 1024),
-    (64, 3072, 1536),
     (224, 1024, 1024),
-    (224, 1024, 1536),
     (224, 3072, 1024),
     (224, 3072, 1536),
     (32768, 1024, 1024),
     # These sizes trigger wrong answers.
-    #(7232, 2048, 5120),
-    #(40000, 2048, 5120),
+    # (7232, 2048, 5120),
+    # (40000, 2048, 5120),
 ]
 
-vllm_config = VllmConfig(parallel_config=ParallelConfig(
-    pipeline_parallel_size=1))
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
+vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
 
 
 @dataclasses.dataclass
@@ -60,42 +55,45 @@ class MOETensors:
     c_strides2: torch.Tensor
 
     @staticmethod
-    def make_moe_tensors(m: int, k: int, n: int, e: int,
-                         dtype: torch.dtype) -> "MOETensors":
+    def make_moe_tensors(
+        m: int, k: int, n: int, e: int, dtype: torch.dtype
+    ) -> "MOETensors":
         a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
         w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
         w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        return MOETensors(a=a,
-                          w1=w1,
-                          w2=w2,
-                          ab_strides1=ab_strides1,
-                          c_strides1=c_strides1,
-                          ab_strides2=ab_strides2,
-                          c_strides2=c_strides2)
+        ab_strides1 = torch.full((e,), k, device="cuda", dtype=torch.int64)
+        c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64)
+        ab_strides2 = torch.full((e,), n, device="cuda", dtype=torch.int64)
+        c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64)
+        return MOETensors(
+            a=a,
+            w1=w1,
+            w2=w2,
+            ab_strides1=ab_strides1,
+            c_strides1=c_strides1,
+            ab_strides2=ab_strides2,
+            c_strides2=c_strides2,
+        )
 
 
 @dataclasses.dataclass
 class MOETensors8Bit(MOETensors):
     # quantized
-    a_q: Optional[torch.Tensor] = None  # a -> a_q
-    w1_q: Optional[torch.Tensor] = None  # w1 -> w1_q
-    w2_q: Optional[torch.Tensor] = None  # w2 -> w2_q
-    a_scale: Optional[torch.Tensor] = None
-    w1_scale: Optional[torch.Tensor] = None
-    w2_scale: Optional[torch.Tensor] = None
+    a_q: torch.Tensor | None = None  # a -> a_q
+    w1_q: torch.Tensor | None = None  # w1 -> w1_q
+    w2_q: torch.Tensor | None = None  # w2 -> w2_q
+    a_scale: torch.Tensor | None = None
+    w1_scale: torch.Tensor | None = None
+    w2_scale: torch.Tensor | None = None
     # dequantized
-    a_d: Optional[torch.Tensor] = None  # a -> a_q -> a_d
-    w1_d: Optional[torch.Tensor] = None  # w1 -> w1_q -> w1_d
-    w2_d: Optional[torch.Tensor] = None  # w2 -> w2_q -> w2_d
+    a_d: torch.Tensor | None = None  # a -> a_q -> a_d
+    w1_d: torch.Tensor | None = None  # w1 -> w1_q -> w1_d
+    w2_d: torch.Tensor | None = None  # w2 -> w2_q -> w2_d
 
     @staticmethod
-    def make_moe_tensors_8bit(m: int, k: int, n: int, e: int,
-                              per_act_token: bool,
-                              per_out_channel: bool) -> "MOETensors8Bit":
+    def make_moe_tensors_8bit(
+        m: int, k: int, n: int, e: int, per_act_token: bool, per_out_channel: bool
+    ) -> "MOETensors8Bit":
         dtype = torch.half
         q_dtype = torch.float8_e4m3fn
 
@@ -106,24 +104,21 @@ class MOETensors8Bit(MOETensors):
         k_b_scales = k if per_out_channel else 1
         # Get the right scale for tests.
         a_q, a_scale = ops.scaled_fp8_quant(
-            moe_tensors_fp16.a, None, use_per_token_if_dynamic=per_act_token)
+            moe_tensors_fp16.a, None, use_per_token_if_dynamic=per_act_token
+        )
 
         w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype)
         w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype)
 
-        w1_scale = torch.empty((e, n_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
-        w2_scale = torch.empty((e, k_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
+        w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32)
+        w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32)
         for expert in range(e):
             w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
-                moe_tensors_fp16.w1[expert],
-                use_per_token_if_dynamic=per_out_channel)
+                moe_tensors_fp16.w1[expert], use_per_token_if_dynamic=per_out_channel
+            )
             w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
-                moe_tensors_fp16.w2[expert],
-                use_per_token_if_dynamic=per_out_channel)
+                moe_tensors_fp16.w2[expert], use_per_token_if_dynamic=per_out_channel
+            )
 
         # a_q -> a_d, w1_q -> w1_d, w2_q -> w2_d
         a_d = a_q.float().mul(a_scale).to(dtype)
@@ -133,31 +128,37 @@ class MOETensors8Bit(MOETensors):
             w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half()
             w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
 
-        return MOETensors8Bit(a=moe_tensors_fp16.a,
-                              w1=moe_tensors_fp16.w1,
-                              w2=moe_tensors_fp16.w2,
-                              ab_strides1=moe_tensors_fp16.ab_strides1,
-                              c_strides1=moe_tensors_fp16.c_strides1,
-                              ab_strides2=moe_tensors_fp16.ab_strides2,
-                              c_strides2=moe_tensors_fp16.c_strides2,
-                              a_q=a_q,
-                              w1_q=w1_q,
-                              w2_q=w2_q,
-                              a_scale=a_scale,
-                              w1_scale=w1_scale,
-                              w2_scale=w2_scale,
-                              a_d=a_d,
-                              w1_d=w1_d,
-                              w2_d=w2_d)
-
-
-def run_with_expert_maps(num_experts: int, num_local_experts: int,
-                         **cutlass_moe_kwargs):
-
+        return MOETensors8Bit(
+            a=moe_tensors_fp16.a,
+            w1=moe_tensors_fp16.w1,
+            w2=moe_tensors_fp16.w2,
+            ab_strides1=moe_tensors_fp16.ab_strides1,
+            c_strides1=moe_tensors_fp16.c_strides1,
+            ab_strides2=moe_tensors_fp16.ab_strides2,
+            c_strides2=moe_tensors_fp16.c_strides2,
+            a_q=a_q,
+            w1_q=w1_q,
+            w2_q=w2_q,
+            a_scale=a_scale,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a_d=a_d,
+            w1_d=w1_d,
+            w2_d=w2_d,
+        )
+
+
+def run_with_expert_maps(
+    num_experts: int, num_local_experts: int, **cutlass_moe_kwargs
+):
     def slice_experts():
         slice_params = [
-            "w1_q", "w2_q", "ab_strides1", "ab_strides2", "c_strides1",
-            "c_strides2"
+            "w1_q",
+            "w2_q",
+            "ab_strides1",
+            "ab_strides2",
+            "c_strides1",
+            "c_strides2",
         ]
         full_tensors = {
             k: v
@@ -173,9 +174,7 @@ def run_with_expert_maps(num_experts: int, num_local_experts: int,
             # make expert map
             expert_map = [-1] * num_experts
             expert_map[s:e] = list(range(num_local_experts))
-            expert_map = torch.tensor(expert_map,
-                                      dtype=torch.int32,
-                                      device="cuda")
+            expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")
 
             # update cutlass moe arg with expert_map
             cutlass_moe_kwargs["expert_map"] = expert_map
@@ -198,18 +197,26 @@ def run_with_expert_maps(num_experts: int, num_local_experts: int,
     return out_tensor
 
 
-def run_8_bit(moe_tensors: MOETensors8Bit,
-              topk_weights: torch.Tensor,
-              topk_ids: torch.Tensor,
-              per_act_token: bool,
-              per_out_ch: bool,
-              num_local_experts: Optional[int] = None) -> torch.Tensor:
-    assert not any([
-        t is None for t in [
-            moe_tensors.w1_q, moe_tensors.w2_q, moe_tensors.w1_scale,
-            moe_tensors.w2_scale, moe_tensors.a_scale
+def run_8_bit(
+    moe_tensors: MOETensors8Bit,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    per_act_token: bool,
+    per_out_ch: bool,
+    num_local_experts: int | None = None,
+) -> torch.Tensor:
+    assert not any(
+        [
+            t is None
+            for t in [
+                moe_tensors.w1_q,
+                moe_tensors.w2_q,
+                moe_tensors.w1_scale,
+                moe_tensors.w2_scale,
+                moe_tensors.a_scale,
+            ]
         ]
-    ])
+    )
 
     quant_config = fp8_w8a8_moe_quant_config(
         w1_scale=moe_tensors.w1_scale,
@@ -222,16 +229,16 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
     )
 
     kwargs = {
-        'a': moe_tensors.a,
-        'w1_q': moe_tensors.w1_q,  # type: ignore[union-attr]
-        'w2_q': moe_tensors.w2_q,  # type: ignore[union-attr]
-        'topk_weights': topk_weights,
-        'topk_ids': topk_ids,
-        'ab_strides1': moe_tensors.ab_strides1,
-        'ab_strides2': moe_tensors.ab_strides2,
-        'c_strides1': moe_tensors.c_strides1,
-        'c_strides2': moe_tensors.c_strides2,
-        'quant_config': quant_config,
+        "a": moe_tensors.a,
+        "w1_q": moe_tensors.w1_q,  # type: ignore[union-attr]
+        "w2_q": moe_tensors.w2_q,  # type: ignore[union-attr]
+        "topk_weights": topk_weights,
+        "topk_ids": topk_ids,
+        "ab_strides1": moe_tensors.ab_strides1,
+        "ab_strides2": moe_tensors.ab_strides2,
+        "c_strides1": moe_tensors.c_strides1,
+        "c_strides2": moe_tensors.c_strides2,
+        "quant_config": quant_config,
     }
 
     num_experts = moe_tensors.w1.size(0)
@@ -243,7 +250,8 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
     return run_with_expert_maps(
         num_experts,
         num_local_experts,  # type: ignore[arg-type]
-        **kwargs)
+        **kwargs,
+    )
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
@@ -253,8 +261,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
 def test_cutlass_moe_8_bit_no_graph(
     m: int,
     n: int,
@@ -264,30 +274,23 @@ def test_cutlass_moe_8_bit_no_graph(
     per_act_token: bool,
     per_out_ch: bool,
     monkeypatch,
-    ep_size: Optional[int] = None,
+    ep_size: int | None = None,
 ):
     current_platform.seed_everything(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
-        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
-                                                  per_out_ch)
+        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch)
 
         score = torch.randn((m, e), device="cuda", dtype=torch.half)
-        topk_weights, topk_ids, _ = fused_topk(mt.a,
-                                               score,
-                                               topk,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(mt.a, score, topk, renormalize=False)
 
         # Note that we are using the dequantized versions of the tensors.
         # Using a, w1 and w2 directly results in minor output differences.
 
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
-        triton_output = fused_experts(mt.a_d,
-                                      mt.w1_d,
-                                      mt.w2_d,
-                                      topk_weights,
-                                      topk_ids,
-                                      quant_config=quant_config)
+        triton_output = fused_experts(
+            mt.a_d, mt.w1_d, mt.w2_d, topk_weights, topk_ids, quant_config=quant_config
+        )
 
         if ep_size is not None:
             assert e % ep_size == 0, "Cannot distribute experts evenly"
@@ -295,15 +298,15 @@ def test_cutlass_moe_8_bit_no_graph(
         else:
             number_local_experts = None
 
-        cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token,
-                                   per_out_ch, number_local_experts)
+        cutlass_output = run_8_bit(
+            mt, topk_weights, topk_ids, per_act_token, per_out_ch, number_local_experts
+        )
 
         # Note 5.5 only needed for larger problem sizes, 5 works ok for
         # the rest.
-        torch.testing.assert_close(triton_output,
-                                   cutlass_output,
-                                   atol=5.5e-2,
-                                   rtol=1e-2)
+        torch.testing.assert_close(
+            triton_output, cutlass_output, atol=5.5e-2, rtol=1e-2
+        )
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
@@ -313,8 +316,10 @@ def test_cutlass_moe_8_bit_no_graph(
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
 def test_cutlass_moe_8_bit_cuda_graph(
     m: int,
     n: int,
@@ -330,39 +335,30 @@ def test_cutlass_moe_8_bit_cuda_graph(
     with set_current_vllm_config(vllm_config):
         dtype = torch.half
 
-        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
-                                                  per_out_ch)
+        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch)
 
         score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids, _ = fused_topk(mt.a,
-                                               score,
-                                               topk,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(mt.a, score, topk, renormalize=False)
 
         # Note that we are using the dequantized versions of the tensors.
         # Using a, w1 and w2 directly results in minor output differences.
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
-        triton_output = fused_experts(mt.a_d,
-                                      mt.w1_d,
-                                      mt.w2_d,
-                                      topk_weights,
-                                      topk_ids,
-                                      quant_config=quant_config)
+        triton_output = fused_experts(
+            mt.a_d, mt.w1_d, mt.w2_d, topk_weights, topk_ids, quant_config=quant_config
+        )
 
         stream = torch.cuda.Stream()
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph, stream=stream):
-            cutlass_output = run_8_bit(mt, topk_weights, topk_ids,
-                                       per_act_token, per_out_ch)
+            cutlass_output = run_8_bit(
+                mt, topk_weights, topk_ids, per_act_token, per_out_ch
+            )
 
         torch.cuda.synchronize()
         graph.replay()
         torch.cuda.synchronize()
 
-        torch.testing.assert_close(triton_output,
-                                   cutlass_output,
-                                   atol=9e-2,
-                                   rtol=1e-2)
+        torch.testing.assert_close(triton_output, cutlass_output, atol=9e-2, rtol=1e-2)
 
 
 @pytest.mark.parametrize("m", [64])
@@ -375,8 +371,10 @@ def test_cutlass_moe_8_bit_cuda_graph(
 @pytest.mark.parametrize("ep_size", [1, 2, 4, 8, 16])
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
 def test_cutlass_moe_8_bit_EP(
     m: int,
     n: int,
@@ -388,8 +386,9 @@ def test_cutlass_moe_8_bit_EP(
     ep_size: int,
     monkeypatch,
 ):
-    test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
-                                    per_out_channel, monkeypatch, ep_size)
+    test_cutlass_moe_8_bit_no_graph(
+        m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size
+    )
 
 
 LARGE_MNK_FACTORS = [
@@ -406,8 +405,10 @@ LARGE_MNK_FACTORS = [
 @pytest.mark.parametrize("ep_size", [8])
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
 def test_cutlass_moe_8_bit_EP_large(
     m: int,
     n: int,
@@ -419,8 +420,9 @@ def test_cutlass_moe_8_bit_EP_large(
     ep_size: int,
     monkeypatch,
 ):
-    test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
-                                    per_out_channel, monkeypatch, ep_size)
+    test_cutlass_moe_8_bit_no_graph(
+        m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size
+    )
 
 
 @pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)])
@@ -430,8 +432,10 @@ def test_cutlass_moe_8_bit_EP_large(
 @pytest.mark.parametrize("ep_size", [8])
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
 def test_run_cutlass_moe_fp8(
     m: int,
     n: int,
@@ -444,14 +448,12 @@ def test_run_cutlass_moe_fp8(
 ):
     current_platform.seed_everything(7)
     with set_current_vllm_config(vllm_config):
-        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
-                                                  per_out_channel)
+        mt = MOETensors8Bit.make_moe_tensors_8bit(
+            m, k, n, e, per_act_token, per_out_channel
+        )
 
         score = torch.randn((m, e), device="cuda", dtype=torch.half)
-        topk_weights, topk_ids, _ = fused_topk(mt.a,
-                                               score,
-                                               topk,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(mt.a, score, topk, renormalize=False)
         # we want to make sure there is at least one token that's generated in
         # this expert shard and at least one token that's NOT generated in this
         # expert shard
@@ -462,12 +464,12 @@ def test_run_cutlass_moe_fp8(
         workspace2_shape = (m * topk, max(n, k))
         output_shape = (m, k)
 
-        workspace13 = torch.empty(prod(workspace13_shape),
-                                  device="cuda",
-                                  dtype=mt.a.dtype)
-        workspace2 = torch.empty(prod(workspace2_shape),
-                                 device="cuda",
-                                 dtype=mt.a.dtype)
+        workspace13 = torch.empty(
+            prod(workspace13_shape), device="cuda", dtype=mt.a.dtype
+        )
+        workspace2 = torch.empty(
+            prod(workspace2_shape), device="cuda", dtype=mt.a.dtype
+        )
 
         num_local_experts = e // ep_size
         start, end = 0, num_local_experts
@@ -475,36 +477,55 @@ def test_run_cutlass_moe_fp8(
         expert_map[start:end] = list(range(num_local_experts))
         expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")
 
-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        ab_strides1 = torch.full((e,), k, device="cuda", dtype=torch.int64)
+        ab_strides2 = torch.full((e,), n, device="cuda", dtype=torch.int64)
+        c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64)
+        c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64)
 
         activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
-        a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
-                                                   torch.float8_e4m3fn,
-                                                   per_act_token)
+        a1q, a1q_scale = moe_kernel_quantize_input(
+            mt.a, mt.a_scale, torch.float8_e4m3fn, per_act_token
+        )
         global_num_experts = -1 if mt.w1_q is None else mt.w1_q.size(0)
         func = lambda output: run_cutlass_moe_fp8(
-            output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
-            global_num_experts, expert_map, mt.w1_scale, mt.w2_scale,
-            a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2,
-            workspace13, workspace2, None, mt.a.dtype, per_act_token,
-            per_out_channel, False, topk_weights)
+            output,
+            a1q,
+            mt.w1_q,
+            mt.w2_q,
+            topk_ids,
+            activation,
+            global_num_experts,
+            expert_map,
+            mt.w1_scale,
+            mt.w2_scale,
+            a1q_scale,
+            None,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
+            workspace13,
+            workspace2,
+            None,
+            mt.a.dtype,
+            per_act_token,
+            per_out_channel,
+            False,
+            topk_weights,
+        )
 
         workspace13.random_()
-        output_random_workspace = torch.empty(output_shape,
-                                              device="cuda",
-                                              dtype=mt.a.dtype)
+        output_random_workspace = torch.empty(
+            output_shape, device="cuda", dtype=mt.a.dtype
+        )
         func(output_random_workspace)
 
         workspace13.fill_(0)
-        output_zero_workspace = torch.zeros(output_shape,
-                                            device="cuda",
-                                            dtype=mt.a.dtype)
+        output_zero_workspace = torch.zeros(
+            output_shape, device="cuda", dtype=mt.a.dtype
+        )
         func(output_zero_workspace)
 
-        torch.testing.assert_close(output_random_workspace,
-                                   output_zero_workspace,
-                                   atol=5e-3,
-                                   rtol=1e-3)
+        torch.testing.assert_close(
+            output_random_workspace, output_zero_workspace, atol=5e-3, rtol=1e-3
+        )
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index ced5457d4f53b246145cf0f3149038c5f7e78802..455ecacef5ec3a3f635698b7054575e5f15605df 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -7,7 +7,7 @@ fp8 block-quantized case.
 """
 
 import dataclasses
-from typing import Optional
+from contextlib import contextmanager
 
 import pytest
 import torch.distributed
@@ -15,33 +15,40 @@ from torch.distributed import ProcessGroup
 from typing_extensions import ParamSpec
 
 from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, fp8_w8a8_moe_quant_config)
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep, has_deep_gemm
-from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
+from vllm.utils.deep_gemm import (
+    get_mk_alignment_for_contiguous_layout,
+    is_deep_gemm_e8m0_used,
+    is_deep_gemm_supported,
+)
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
 
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
 from .utils import make_test_weights
 
 if has_deep_ep():
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
-        DeepEPHTPrepareAndFinalize)
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
-        DeepEPLLPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
+        DeepEPHTPrepareAndFinalize,
+    )
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
+        DeepEPLLPrepareAndFinalize,
+    )
 
     from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
 
 if has_deep_gemm():
-
     from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-        BatchedDeepGemmExperts)
-    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-        DeepGemmExperts)
+        BatchedDeepGemmExperts,
+    )
+    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 
 requires_deep_ep = pytest.mark.skipif(
     not has_deep_ep(),
@@ -56,11 +63,29 @@ requires_deep_gemm = pytest.mark.skipif(
 P = ParamSpec("P")
 
 
+@contextmanager
+def with_dp_metadata(M: int, world_size: int):
+    num_tokens_across_dp = torch.tensor([M] * world_size, device="cpu", dtype=torch.int)
+
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.data_parallel_size = world_size
+    vllm_config.parallel_config.enable_expert_parallel = True
+
+    with set_forward_context(
+        None,
+        vllm_config,
+        num_tokens=M,
+        num_tokens_across_dp=num_tokens_across_dp,
+    ):
+        yield
+
+
 def next_power_of_2(x):
     import math
+
     if x == 0:
         return 1
-    return 2**math.ceil(math.log2(x))
+    return 2 ** math.ceil(math.log2(x))
 
 
 def make_block_quant_fp8_weights(
@@ -72,13 +97,9 @@ def make_block_quant_fp8_weights(
     """
     Return weights w1q, w2q, w1_scale, w2_scale
     """
-    (_, w1q, w1_scale, _), (_, w2q, w2_scale,
-                            _) = make_test_weights(e,
-                                                   n,
-                                                   k,
-                                                   torch.bfloat16,
-                                                   torch.float8_e4m3fn,
-                                                   block_shape=block_size)
+    (_, w1q, w1_scale, _), (_, w2q, w2_scale, _) = make_test_weights(
+        e, n, k, torch.bfloat16, torch.float8_e4m3fn, block_shape=block_size
+    )
     return w1q, w2q, w1_scale, w2_scale
 
 
@@ -93,28 +114,28 @@ class TestConfig:
     block_size: list[int]
     # configs for testing low-latency kernels
     low_latency: bool
-    use_fp8_dispatch: Optional[bool] = False
+    use_fp8_dispatch: bool | None = False
 
 
 @dataclasses.dataclass
 class TestTensors:
     rank_tokens: torch.Tensor  # all ranks make this many tokens
-    rank_token_scales: Optional[torch.Tensor]
+    rank_token_scales: torch.Tensor | None
     topk: torch.Tensor
     topk_weights: torch.Tensor
     config: TestConfig
 
     @staticmethod
     def make(config: TestConfig, rank) -> "TestTensors":
-
         dtype = torch.bfloat16
         topk, m, k = (config.topk, config.m, config.k)
 
         fp8_info = torch.finfo(torch.float8_e4m3fn)
         fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
-        rank_tokens = torch.randn(
-            (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0
+        rank_tokens = (
+            torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0
+        )
         rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max)
         rank_token_scales = None
 
@@ -122,25 +143,32 @@ class TestTensors:
             low=0,
             high=config.num_experts,
             size=(m, topk),
-            device=torch.cuda.current_device()).to(dtype=torch.int64)
+            device=torch.cuda.current_device(),
+        ).to(dtype=torch.int64)
 
-        topk_weights = torch.randn(topk_ids.shape,
-                                   dtype=torch.float32,
-                                   device=torch.cuda.current_device())
+        topk_weights = torch.randn(
+            topk_ids.shape, dtype=torch.float32, device=torch.cuda.current_device()
+        )
 
-        return TestTensors(rank_tokens=rank_tokens,
-                           rank_token_scales=rank_token_scales,
-                           topk=topk_ids,
-                           topk_weights=topk_weights,
-                           config=config)
+        return TestTensors(
+            rank_tokens=rank_tokens,
+            rank_token_scales=rank_token_scales,
+            topk=topk_ids,
+            topk_weights=topk_weights,
+            config=config,
+        )
 
 
 def make_ll_modular_kernel(
-        pg: ProcessGroup, pgi: ProcessGroupInfo, max_tokens_per_rank: int,
-        dp_size: int, hidden_size: int, q_dtype: Optional[torch.dtype],
-        test_config: TestConfig,
-        quant_config: FusedMoEQuantConfig) -> FusedMoEModularKernel:
-
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    max_tokens_per_rank: int,
+    dp_size: int,
+    hidden_size: int,
+    q_dtype: torch.dtype | None,
+    test_config: TestConfig,
+    quant_config: FusedMoEQuantConfig,
+) -> FusedMoEModularKernel:
     assert test_config.low_latency
     assert test_config.use_fp8_dispatch is not None
 
@@ -153,26 +181,30 @@ def make_ll_modular_kernel(
             max_tokens_per_rank=max_tokens_per_rank,
             hidden_size=hidden_size,
             num_experts=test_config.num_experts,
-            use_fp8_dispatch=test_config.use_fp8_dispatch),
+            use_fp8_dispatch=test_config.use_fp8_dispatch,
+        ),
         q_dtype=q_dtype,
-        block_shape=test_config.block_size)
+        block_shape=test_config.block_size,
+    )
 
     fused_experts = BatchedDeepGemmExperts(
         max_num_tokens=max_tokens_per_rank,
         num_dispatchers=pgi.world_size // dp_size,
         quant_config=quant_config,
     )
-    mk = FusedMoEModularKernel(prepare_finalize=a2a,
-                               fused_experts=fused_experts)
+    mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts)
     return mk
 
 
 def make_ht_modular_kernel(
-        pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
-        num_local_experts: int, q_dtype: Optional[torch.dtype],
-        test_config: TestConfig,
-        quant_config: FusedMoEQuantConfig) -> FusedMoEModularKernel:
-
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    num_local_experts: int,
+    q_dtype: torch.dtype | None,
+    test_config: TestConfig,
+    quant_config: FusedMoEQuantConfig,
+) -> FusedMoEModularKernel:
     assert not test_config.low_latency
     assert test_config.use_fp8_dispatch is None
 
@@ -183,76 +215,82 @@ def make_ht_modular_kernel(
         deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts),
         deepep_ll_args=None,
         q_dtype=q_dtype,
-        block_shape=test_config.block_size)
+        block_shape=test_config.block_size,
+    )
 
     fused_experts = DeepGemmExperts(quant_config)
-    mk = FusedMoEModularKernel(prepare_finalize=a2a,
-                               fused_experts=fused_experts)
+    mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts)
     return mk
 
 
 def make_modular_kernel(
-        pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
-        num_local_experts: int, test_tensors: TestTensors,
-        quant_config: FusedMoEQuantConfig) -> FusedMoEModularKernel:
-
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    num_local_experts: int,
+    test_tensors: TestTensors,
+    quant_config: FusedMoEQuantConfig,
+) -> FusedMoEModularKernel:
     q_dtype = torch.float8_e4m3fn
     test_config = test_tensors.config
 
     mk: FusedMoEModularKernel
     # Make modular kernel
     if test_config.low_latency:
-        max_tokens_per_rank = max(
-            64, next_power_of_2(test_tensors.rank_tokens.size(0)))
+        max_tokens_per_rank = max(64, next_power_of_2(test_tensors.rank_tokens.size(0)))
         hidden_size = test_tensors.rank_tokens.size(-1)
 
-        mk = make_ll_modular_kernel(pg=pg,
-                                    pgi=pgi,
-                                    max_tokens_per_rank=max_tokens_per_rank,
-                                    dp_size=dp_size,
-                                    hidden_size=hidden_size,
-                                    q_dtype=q_dtype,
-                                    test_config=test_config,
-                                    quant_config=quant_config)
+        mk = make_ll_modular_kernel(
+            pg=pg,
+            pgi=pgi,
+            max_tokens_per_rank=max_tokens_per_rank,
+            dp_size=dp_size,
+            hidden_size=hidden_size,
+            q_dtype=q_dtype,
+            test_config=test_config,
+            quant_config=quant_config,
+        )
     else:
-        mk = make_ht_modular_kernel(pg,
-                                    pgi,
-                                    dp_size,
-                                    num_local_experts,
-                                    q_dtype,
-                                    test_config,
-                                    quant_config=quant_config)
+        mk = make_ht_modular_kernel(
+            pg,
+            pgi,
+            dp_size,
+            num_local_experts,
+            q_dtype,
+            test_config,
+            quant_config=quant_config,
+        )
 
     return mk
 
 
-def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo,
-                             dp_size: int, test_tensors: TestTensors,
-                             w1: torch.Tensor, w2: torch.Tensor,
-                             w1_scale: Optional[torch.Tensor],
-                             w2_scale: Optional[torch.Tensor]) -> torch.Tensor:
-
+def deepep_deepgemm_moe_impl(
+    pg: ProcessGroup,
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    test_tensors: TestTensors,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
+) -> torch.Tensor:
     test_config = test_tensors.config
     num_experts = test_config.num_experts
     num_local_experts = w1.size(0)
 
     def build_expert_map():
         num_local_experts = w1.size(0)
-        expert_map = torch.full((num_experts, ),
-                                fill_value=-1,
-                                dtype=torch.int32)
+        expert_map = torch.full((num_experts,), fill_value=-1, dtype=torch.int32)
         s = pgi.rank * num_local_experts
         e = s + num_local_experts
         expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
-        return expert_map.to(device=torch.cuda.current_device(),
-                             dtype=torch.int32)
+        return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32)
 
     quant_config = fp8_w8a8_moe_quant_config(
         w1_scale=w1_scale,
         w2_scale=w2_scale,
         # Low-Latency kernels can't dispatch scales.
-        a1_scale=(None if test_config.low_latency else
-                  test_tensors.rank_token_scales),
+        a1_scale=(None if test_config.low_latency else test_tensors.rank_token_scales),
         block_shape=test_config.block_size,
     )
 
@@ -263,26 +301,38 @@ def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo,
         dp_size=dp_size,
         num_local_experts=num_local_experts,
         test_tensors=test_tensors,
-        quant_config=quant_config)
-
-    out = mk.forward(hidden_states=test_tensors.rank_tokens,
-                     w1=w1,
-                     w2=w2,
-                     topk_weights=test_tensors.topk_weights,
-                     topk_ids=test_tensors.topk,
-                     inplace=False,
-                     activation="silu",
-                     global_num_experts=num_experts,
-                     expert_map=build_expert_map(),
-                     apply_router_weight_on_input=False)
-    return out
+        quant_config=quant_config,
+    )
 
+    with with_dp_metadata(
+        M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size
+    ):
+        out = mk.forward(
+            hidden_states=test_tensors.rank_tokens,
+            w1=w1,
+            w2=w2,
+            topk_weights=test_tensors.topk_weights,
+            topk_ids=test_tensors.topk,
+            inplace=False,
+            activation="silu",
+            global_num_experts=num_experts,
+            expert_map=build_expert_map(),
+            apply_router_weight_on_input=False,
+        )
+    return out
 
-def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
-                topk_weights: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
-                w1_scale: torch.Tensor, w2_scale: torch.Tensor,
-                a1_scale: torch.Tensor, block_shape: list[int]):
 
+def triton_impl(
+    a: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    a1_scale: torch.Tensor,
+    block_shape: list[int],
+):
     quant_config = fp8_w8a8_moe_quant_config(
         w1_scale=w1_scale,
         w2_scale=w2_scale,
@@ -300,7 +350,8 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
         quant_config=quant_config,
         # Make sure this is set to False so we
         # don't end up comparing the same implementation.
-        allow_deep_gemm=False)
+        allow_deep_gemm=False,
+    )
 
 
 def _test_deepep_deepgemm_moe(
@@ -321,22 +372,21 @@ def _test_deepep_deepgemm_moe(
 
     pg = torch.distributed.new_group(list(range(pgi.world_size)))
     test_tensors = TestTensors.make(config, pgi.rank)
-    block_shape = [
-        w1.size(1) // w1_scale.size(1),
-        w1.size(2) // w1_scale.size(2)
-    ]
+    block_shape = [w1.size(1) // w1_scale.size(1), w1.size(2) // w1_scale.size(2)]
 
     with set_current_vllm_config(VllmConfig()):
         # Reference
-        triton_moe = triton_impl(a=test_tensors.rank_tokens,
-                                 topk_ids=test_tensors.topk,
-                                 topk_weights=test_tensors.topk_weights,
-                                 w1=w1,
-                                 w2=w2,
-                                 w1_scale=w1_scale,
-                                 w2_scale=w2_scale,
-                                 a1_scale=test_tensors.rank_token_scales,
-                                 block_shape=block_shape)
+        triton_moe = triton_impl(
+            a=test_tensors.rank_tokens,
+            topk_ids=test_tensors.topk,
+            topk_weights=test_tensors.topk_weights,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=test_tensors.rank_token_scales,
+            block_shape=block_shape,
+        )
 
         # Slice experts for this rank.
         num_local_experts = config.num_experts // pgi.world_size
@@ -369,7 +419,6 @@ def _test_deepep_deepgemm_moe(
 MNKs = [
     (8, 128, 128),
     (8, 128, 512),
-    (8, 512, 512),
     (3, 1024, 2048),
     (32, 128, 1024),
     (45, 512, 2048),
@@ -390,14 +439,16 @@ NUM_EXPERTS = [32]
 @multi_gpu_test(num_gpus=2)
 @requires_deep_ep
 @requires_deep_gemm
-@pytest.mark.skipif(is_deep_gemm_e8m0_used(),
-                    reason="Skipping test for Blackwell DeepGEMM")
-def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
-                                topk: int, world_dp_size: tuple[int, int]):
+def test_ht_deepep_deepgemm_moe(
+    mnk: tuple[int, int, int],
+    num_experts: int,
+    topk: int,
+    world_dp_size: tuple[int, int],
+    disable_deepgemm_ue8m0,
+):
     """
     Tests for High-Throughput DeepEP + DeepGemm integration.
     """
-    import deep_gemm
 
     m, n, k = mnk
     current_platform.seed_everything(7)
@@ -405,25 +456,36 @@ def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
     if topk > num_experts:
         pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
 
-    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+    block_m = get_mk_alignment_for_contiguous_layout()[0]
     block_size = [block_m, block_m]
 
     world_size, dp_size = world_dp_size
-    config = TestConfig(topk=topk,
-                        m=m,
-                        k=k,
-                        n=n,
-                        num_experts=num_experts,
-                        per_act_token_quant=False,
-                        block_size=block_size,
-                        low_latency=False,
-                        use_fp8_dispatch=None)
+    config = TestConfig(
+        topk=topk,
+        m=m,
+        k=k,
+        n=n,
+        num_experts=num_experts,
+        per_act_token_quant=False,
+        block_size=block_size,
+        low_latency=False,
+        use_fp8_dispatch=None,
+    )
 
     w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
-        num_experts, n, k, block_size)
+        num_experts, n, k, block_size
+    )
 
-    parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1,
-                    w2, w1_scale, w2_scale)
+    parallel_launch(
+        world_size,
+        _test_deepep_deepgemm_moe,
+        dp_size,
+        config,
+        w1,
+        w2,
+        w1_scale,
+        w2_scale,
+    )
 
 
 MNKs = [
@@ -448,8 +510,6 @@ USE_FP8_DISPATCH = [False]
 @multi_gpu_test(num_gpus=2)
 @requires_deep_ep
 @requires_deep_gemm
-@pytest.mark.skipif(is_deep_gemm_e8m0_used(),
-                    reason="Skipping test for Blackwell DeepGEMM")
 def test_ll_deepep_deepgemm_moe(
     mnk: tuple[int, int, int],
     num_experts: int,
@@ -457,10 +517,12 @@ def test_ll_deepep_deepgemm_moe(
     use_fp8_dispatch: bool,
     block_size: list[int],
     world_dp_size: tuple[int, int],
+    disable_deepgemm_ue8m0,
 ):
     """
     Tests for Low-Latency DeepEP + DeepGemm integration.
     """
+    assert not is_deep_gemm_e8m0_used()
 
     m, n, k = mnk
     current_platform.seed_everything(7)
@@ -482,7 +544,16 @@ def test_ll_deepep_deepgemm_moe(
     )
 
     w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
-        num_experts, n, k, block_size)
+        num_experts, n, k, block_size
+    )
 
-    parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1,
-                    w2, w1_scale, w2_scale)
+    parallel_launch(
+        world_size,
+        _test_deepep_deepgemm_moe,
+        dp_size,
+        config,
+        w1,
+        w2,
+        w1_scale,
+        w2_scale,
+    )
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
index 54d3a62b03fccb387aa25dda9f5913b17f3cd91d..d78b8250463a96dcfa63854d5b77aa5641e08274 100644
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -5,7 +5,6 @@ Test deepep dispatch-combine logic
 """
 
 import dataclasses
-from typing import Optional, Union
 
 import pytest
 import torch.distributed
@@ -16,23 +15,24 @@ from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import TritonExperts
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedTritonExperts)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
+    per_token_group_quant_fp8,
+)
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep
+from vllm.utils.import_utils import has_deep_ep
 
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
 
 if has_deep_ep():
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
-        DeepEPHTPrepareAndFinalize)
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
-        DeepEPLLPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
+        DeepEPHTPrepareAndFinalize,
+    )
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
+        DeepEPLLPrepareAndFinalize,
+    )
 
     from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
 
@@ -45,7 +45,7 @@ MAX_TOKENS_PER_RANK = 64
 
 
 def make_weights(
-        e, n, k, dtype
+    e, n, k, dtype
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Return weights w1, w2, w1_scale, w2_scale
@@ -64,17 +64,15 @@ def make_weights(
     k_b_scales = k
     w1_q = torch.empty_like(w1, dtype=dtype)
     w2_q = torch.empty_like(w2, dtype=dtype)
-    w1_scale = torch.empty((e, n_b_scales, 1),
-                           device="cuda",
-                           dtype=torch.float32)
-    w2_scale = torch.empty((e, k_b_scales, 1),
-                           device="cuda",
-                           dtype=torch.float32)
+    w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32)
+    w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32)
     for expert in range(e):
         w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
-            w1[expert], use_per_token_if_dynamic=True)
+            w1[expert], use_per_token_if_dynamic=True
+        )
         w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
-            w2[expert], use_per_token_if_dynamic=True)
+            w2[expert], use_per_token_if_dynamic=True
+        )
     return w1_q, w2_q, w1_scale, w2_scale
 
 
@@ -91,7 +89,7 @@ class TestConfig:
 @dataclasses.dataclass
 class TestTensors:
     rank_tokens: torch.Tensor  # all ranks make this many tokens
-    rank_token_scales: Optional[torch.Tensor]
+    rank_token_scales: torch.Tensor | None
     topk: torch.Tensor
     topk_weights: torch.Tensor
     config: TestConfig
@@ -100,24 +98,25 @@ class TestTensors:
     def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors":
         # TODO (varun) - check that float16 works ?
         assert config.dtype in [torch.bfloat16, torch.float8_e4m3fn]
-        token_dtype = (torch.bfloat16 if config.dtype == torch.float8_e4m3fn
-                       else config.dtype)
-        rank_tokens = torch.randn(
-            (config.m, config.k), device="cuda", dtype=token_dtype) / 10
+        token_dtype = (
+            torch.bfloat16 if config.dtype == torch.float8_e4m3fn else config.dtype
+        )
+        rank_tokens = (
+            torch.randn((config.m, config.k), device="cuda", dtype=token_dtype) / 10
+        )
         rank_token_scales = None
 
-        topk = torch.randint(low=0,
-                             high=config.num_experts,
-                             size=(config.m, config.topk),
-                             device="cuda").to(dtype=torch.int64)
-        topk_weights = torch.randn(topk.shape,
-                                   dtype=torch.float32,
-                                   device="cuda")
-        return TestTensors(rank_tokens=rank_tokens,
-                           rank_token_scales=rank_token_scales,
-                           topk=topk,
-                           topk_weights=topk_weights,
-                           config=config)
+        topk = torch.randint(
+            low=0, high=config.num_experts, size=(config.m, config.topk), device="cuda"
+        ).to(dtype=torch.int64)
+        topk_weights = torch.randn(topk.shape, dtype=torch.float32, device="cuda")
+        return TestTensors(
+            rank_tokens=rank_tokens,
+            rank_token_scales=rank_token_scales,
+            topk=topk,
+            topk_weights=topk_weights,
+            config=config,
+        )
 
 
 def make_modular_kernel(
@@ -128,32 +127,35 @@ def make_modular_kernel(
     dp_size: int,
     num_experts: int,
     num_local_experts: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
     use_fp8_dispatch: bool,
     quant_config: FusedMoEQuantConfig,
 ) -> FusedMoEModularKernel:
-
-    ht_args: Optional[DeepEPHTArgs] = None
-    ll_args: Optional[DeepEPLLArgs] = None
+    ht_args: DeepEPHTArgs | None = None
+    ll_args: DeepEPLLArgs | None = None
 
     if low_latency_mode:
-        ll_args = DeepEPLLArgs(max_tokens_per_rank=MAX_TOKENS_PER_RANK,
-                               hidden_size=hidden_size,
-                               num_experts=num_experts,
-                               use_fp8_dispatch=use_fp8_dispatch)
+        ll_args = DeepEPLLArgs(
+            max_tokens_per_rank=MAX_TOKENS_PER_RANK,
+            hidden_size=hidden_size,
+            num_experts=num_experts,
+            use_fp8_dispatch=use_fp8_dispatch,
+        )
     else:
         assert not use_fp8_dispatch, (
-            "FP8 Dispatch is valid only for low-latency kernels")
+            "FP8 Dispatch is valid only for low-latency kernels"
+        )
         ht_args = DeepEPHTArgs(num_local_experts=num_local_experts)
 
-    a2a : Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = \
-        make_deepep_a2a(pg = pg,
-                        pgi = pgi,
-                        dp_size = dp_size,
-                        q_dtype = q_dtype,
-                        block_shape = None,
-                        deepep_ht_args = ht_args,
-                        deepep_ll_args = ll_args)
+    a2a: DeepEPHTPrepareAndFinalize | DeepEPLLPrepareAndFinalize = make_deepep_a2a(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        q_dtype=q_dtype,
+        block_shape=None,
+        deepep_ht_args=ht_args,
+        deepep_ll_args=ll_args,
+    )
 
     num_dispatchers = pgi.world_size // dp_size
 
@@ -167,8 +169,7 @@ def make_modular_kernel(
     else:
         fused_experts = TritonExperts(quant_config=quant_config)
 
-    mk = FusedMoEModularKernel(prepare_finalize=a2a,
-                               fused_experts=fused_experts)
+    mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts)
     return mk
 
 
@@ -180,25 +181,21 @@ def deep_ep_moe_impl(
     test_tensors: TestTensors,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
     num_experts: int,
     use_fp8_dispatch: bool,
     per_act_token_quant: bool,
 ) -> torch.Tensor:
-
     num_local_experts = w1.size(0)
 
     def build_expert_map():
         num_local_experts = w1.size(0)
-        expert_map = torch.full((num_experts, ),
-                                fill_value=-1,
-                                dtype=torch.int32)
+        expert_map = torch.full((num_experts,), fill_value=-1, dtype=torch.int32)
         s = pgi.rank * num_local_experts
         e = s + num_local_experts
         expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
-        return expert_map.to(device=torch.cuda.current_device(),
-                             dtype=torch.int32)
+        return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32)
 
     hidden_size = test_tensors.rank_tokens.size(1)
     is_quantized = w1.dtype == torch.float8_e4m3fn
@@ -214,11 +211,12 @@ def deep_ep_moe_impl(
         topk_weights_chunk = test_tensors.topk_weights[chunk_start:chunk_end]
         topk_chunk = test_tensors.topk[chunk_start:chunk_end]
         rank_token_scales_chunk = test_tensors.rank_token_scales
-        if rank_token_scales_chunk is not None and rank_token_scales_chunk.size(
-                0) == total_num_tokens:
+        if (
+            rank_token_scales_chunk is not None
+            and rank_token_scales_chunk.size(0) == total_num_tokens
+        ):
             # per act token
-            rank_token_scales_chunk = rank_token_scales_chunk[
-                chunk_start:chunk_end]
+            rank_token_scales_chunk = rank_token_scales_chunk[chunk_start:chunk_end]
 
         quant_config = FusedMoEQuantConfig.make(
             q_dtype,
@@ -230,26 +228,37 @@ def deep_ep_moe_impl(
 
         # Make modular kernel
         mk: FusedMoEModularKernel = make_modular_kernel(
-            pg, pgi, low_latency_mode, hidden_size, dp_size, num_experts,
-            num_local_experts, q_dtype, use_fp8_dispatch, quant_config)
-
-        out = mk.forward(hidden_states=rank_tokens_chunk,
-                         w1=w1,
-                         w2=w2,
-                         topk_weights=topk_weights_chunk,
-                         topk_ids=topk_chunk,
-                         inplace=False,
-                         activation="silu",
-                         global_num_experts=num_experts,
-                         expert_map=build_expert_map(),
-                         apply_router_weight_on_input=False)
+            pg,
+            pgi,
+            low_latency_mode,
+            hidden_size,
+            dp_size,
+            num_experts,
+            num_local_experts,
+            q_dtype,
+            use_fp8_dispatch,
+            quant_config,
+        )
+
+        out = mk.forward(
+            hidden_states=rank_tokens_chunk,
+            w1=w1,
+            w2=w2,
+            topk_weights=topk_weights_chunk,
+            topk_ids=topk_chunk,
+            inplace=False,
+            activation="silu",
+            global_num_experts=num_experts,
+            expert_map=build_expert_map(),
+            apply_router_weight_on_input=False,
+        )
 
         if not skip_result_store:
-            out_hidden_states[chunk_start:chunk_end, :].copy_(
-                out, non_blocking=True)
+            out_hidden_states[chunk_start:chunk_end, :].copy_(out, non_blocking=True)
 
-    max_num_tokens_per_dp = (MAX_TOKENS_PER_RANK
-                             if low_latency_mode else total_num_tokens)
+    max_num_tokens_per_dp = (
+        MAX_TOKENS_PER_RANK if low_latency_mode else total_num_tokens
+    )
 
     for chunk_start_ in range(0, total_num_tokens, max_num_tokens_per_dp):
         chunk_start = chunk_start_
@@ -258,9 +267,9 @@ def deep_ep_moe_impl(
         chunk_start = min(chunk_start, total_num_tokens - 1)
         chunk_end = min(chunk_end, total_num_tokens)
 
-        process_chunk(chunk_start,
-                      chunk_end,
-                      skip_result_store=chunk_start_ >= total_num_tokens)
+        process_chunk(
+            chunk_start, chunk_end, skip_result_store=chunk_start_ >= total_num_tokens
+        )
 
     return out_hidden_states
 
@@ -269,23 +278,28 @@ def torch_moe_impl(
     test_tensors: TestTensors,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
     using_fp8_dispatch: bool,
     per_act_token_quant: bool,
 ):
-
-    a, topk_ids, topk_weights = (test_tensors.rank_tokens, test_tensors.topk,
-                                 test_tensors.topk_weights)
+    a, topk_ids, topk_weights = (
+        test_tensors.rank_tokens,
+        test_tensors.topk,
+        test_tensors.topk_weights,
+    )
     if using_fp8_dispatch:
         # The DeepEP implementation is requested to dispatch using FP8.
         # For numerical stability for testing, emulate the fp8 dispatch by
         # blockwise quant and de-quant.
         assert not per_act_token_quant
         a = test_tensors.rank_tokens
-        aq, aq_scale = per_token_group_quant_fp8(a, 128)
-        a = (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)).view(
-            a.shape).to(a.dtype)
+        aq, aq_scale = per_token_group_quant_fp8(a, 128, use_ue8m0=False)
+        a = (
+            (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1))
+            .view(a.shape)
+            .to(a.dtype)
+        )
 
     is_quantized = w1.dtype == torch.float8_e4m3fn
     a_dtype = a.dtype
@@ -306,8 +320,9 @@ def torch_moe_impl(
             e_w = topk_weights[i][j]
             w1_e = w1[e]
             w2_e = w2[e]
-            o_i += (SiluAndMul()
-                    (a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)) * e_w
+            o_i += (
+                SiluAndMul()(a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)
+            ) * e_w
 
     if is_quantized:
         out = out.to(dtype=a_dtype)
@@ -322,33 +337,41 @@ def _deep_ep_moe(
     config: TestConfig,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
     use_fp8_dispatch: bool,
     per_act_token_quant: bool,
 ):
-
     if not low_latency_mode:
         assert not use_fp8_dispatch, (
-            "FP8 dispatch interface is available only in low-latency mode")
+            "FP8 dispatch interface is available only in low-latency mode"
+        )
 
     is_quantized = w1.dtype == torch.float8_e4m3fn
     w1 = w1.to(device=torch.cuda.current_device())
     w2 = w2.to(device=torch.cuda.current_device())
     if is_quantized:
         w1_scale = w1_scale.to(  # type: ignore
-            device=torch.cuda.current_device())
+            device=torch.cuda.current_device()
+        )
         w2_scale = w2_scale.to(  # type: ignore
-            device=torch.cuda.current_device())
+            device=torch.cuda.current_device()
+        )
 
     pg = torch.distributed.new_group(list(range(pgi.world_size)))
     test_tensors = TestTensors.make(config, low_latency_mode)
 
     with set_current_vllm_config(VllmConfig()):
         # Reference
-        torch_combined = torch_moe_impl(test_tensors, w1, w2, w1_scale,
-                                        w2_scale, use_fp8_dispatch,
-                                        per_act_token_quant)
+        torch_combined = torch_moe_impl(
+            test_tensors,
+            w1,
+            w2,
+            w1_scale,
+            w2_scale,
+            use_fp8_dispatch,
+            per_act_token_quant,
+        )
 
         # Splice experts for this rank.
         num_local_experts = config.num_experts // pgi.world_size
@@ -420,18 +443,23 @@ def test_deep_ep_moe(
 
     current_platform.seed_everything(7)
     world_size, dp_size = world_dp_size
-    config = TestConfig(dtype=dtype,
-                        topk=topk,
-                        m=m,
-                        k=k,
-                        n=n,
-                        num_experts=num_experts)
+    config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts)
 
     w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype)
 
-    parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size,
-                    config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch,
-                    per_act_token_quant)
+    parallel_launch(
+        world_size,
+        _deep_ep_moe,
+        low_latency_mode,
+        dp_size,
+        config,
+        w1,
+        w2,
+        w1_scale,
+        w2_scale,
+        use_fp8_dispatch,
+        per_act_token_quant,
+    )
 
 
 MNKs = [
@@ -467,8 +495,7 @@ def test_low_latency_deep_ep_moe(
 ):
     low_latency_mode = True
 
-    if (low_latency_mode
-            and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES):
+    if low_latency_mode and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES:
         pytest.skip(
             f"Skipping test as hidden size {k} is not in list of supported "
             f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}"
@@ -476,15 +503,20 @@ def test_low_latency_deep_ep_moe(
 
     current_platform.seed_everything(7)
     world_size, dp_size = world_dp_size
-    config = TestConfig(dtype=dtype,
-                        topk=topk,
-                        m=m,
-                        k=k,
-                        n=n,
-                        num_experts=num_experts)
+    config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts)
 
     w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype)
 
-    parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size,
-                    config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch,
-                    False)
+    parallel_launch(
+        world_size,
+        _deep_ep_moe,
+        low_latency_mode,
+        dp_size,
+        config,
+        w1,
+        w2,
+        w1_scale,
+        w2_scale,
+        use_fp8_dispatch,
+        False,
+    )
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index d575b6d4ca62c0c371a9165fb4247b32a0cf789c..9b1054f7d0ab845e03c452586c228656dc130b1c 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -11,14 +11,18 @@ import math
 import pytest
 import torch
 
-from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config)
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+
 # vLLM fused-expert reference (Triton fallback + DeepGEMM option)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
-from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
-                                  per_block_cast_to_fp8)
+    per_token_group_quant_fp8,
+)
+from vllm.utils.deep_gemm import (
+    calc_diff,
+    is_deep_gemm_supported,
+    per_block_cast_to_fp8,
+)
 
 BLOCK_SIZE = [128, 128]
 
@@ -37,8 +41,10 @@ def make_block_quant_fp8_weights(
       w2 shape: (E, K, N)
     """
     dtype = torch.bfloat16
-    fp8_max, fp8_min = torch.finfo(torch.float8_e4m3fn).max, torch.finfo(
-        torch.float8_e4m3fn).min
+    fp8_max, fp8_min = (
+        torch.finfo(torch.float8_e4m3fn).max,
+        torch.finfo(torch.float8_e4m3fn).min,
+    )
 
     # bf16 reference weights
     w1_bf16 = torch.randn(e, 2 * n, k, device="cuda", dtype=dtype) / 10
@@ -54,24 +60,16 @@ def make_block_quant_fp8_weights(
 
     w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn)
     w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn)
-    w1_s = torch.empty(e,
-                       n_tiles_w1,
-                       k_tiles_w1,
-                       device="cuda",
-                       dtype=torch.float32)
-    w2_s = torch.empty(e,
-                       n_tiles_w2,
-                       k_tiles_w2,
-                       device="cuda",
-                       dtype=torch.float32)
+    w1_s = torch.empty(e, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32)
+    w2_s = torch.empty(e, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32)
 
     for i in range(e):
-        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
-                                               block_size=block_size,
-                                               use_ue8m0=True)
-        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
-                                               block_size=block_size,
-                                               use_ue8m0=True)
+        w1[i], w1_s[i] = per_block_cast_to_fp8(
+            w1_bf16[i], block_size=block_size, use_ue8m0=True
+        )
+        w2[i], w2_s[i] = per_block_cast_to_fp8(
+            w2_bf16[i], block_size=block_size, use_ue8m0=True
+        )
 
     return w1, w2, w1_s, w2_s
 
@@ -81,18 +79,17 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
     Run one (M,N,K) configuration on a single GPU and assert DeepGEMM ==
     Triton baseline within tolerance.
     """
-    tokens_bf16 = torch.randn(
-        m, k, device="cuda", dtype=torch.bfloat16).clamp_min_(-1).clamp_max_(1)
+    tokens_bf16 = (
+        torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
+        .clamp_min_(-1)
+        .clamp_max_(1)
+    )
     _, a1_scale = per_token_group_quant_fp8(tokens_bf16, block_size[1])
 
     # expert weight tensors
-    w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(num_experts, n, k,
-                                                      block_size)
+    w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(num_experts, n, k, block_size)
 
-    router_logits = torch.randn(m,
-                                num_experts,
-                                device="cuda",
-                                dtype=torch.float32)
+    router_logits = torch.randn(m, num_experts, device="cuda", dtype=torch.float32)
     topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
     topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
 
@@ -133,10 +130,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
 # Note: N <= 512 will disable the deepgemm path due to performance issues.
 MNKs = [
     (1024, 768, 128),
-    (1024, 768, 512),
     (2048, 768, 512),
     (512, 1024, 1024),
-    (512, 2048, 2048),
     (4096, 4096, 1024),
 ]
 
@@ -147,15 +142,14 @@ NUM_EXPERTS = [32]
 @pytest.mark.parametrize(("m", "n", "k"), MNKs)
 @pytest.mark.parametrize("topk", TOPKS)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS)
-@pytest.mark.skipif(not is_deep_gemm_supported(),
-                    reason="Requires deep_gemm kernels")
+@pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels")
 def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch):
-
     with monkeypatch.context() as mp:
         mp.setenv("VLLM_USE_DEEP_GEMM", "1")
 
         _fused_moe_mod = importlib.import_module(
-            "vllm.model_executor.layers.fused_moe.fused_moe")
+            "vllm.model_executor.layers.fused_moe.fused_moe"
+        )
 
         call_counter = {"cnt": 0}
 
@@ -165,8 +159,7 @@ def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch):
             call_counter["cnt"] += 1
             return orig_fn(*args, **kwargs)
 
-        monkeypatch.setattr(_fused_moe_mod, "deep_gemm_moe_fp8",
-                            _spy_deep_gemm_moe_fp8)
+        monkeypatch.setattr(_fused_moe_mod, "deep_gemm_moe_fp8", _spy_deep_gemm_moe_fp8)
 
         if topk > num_experts:
             pytest.skip(f"topk={topk} > num_experts={num_experts}")
@@ -181,6 +174,7 @@ def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch):
         )
 
         # ensure that the DeepGEMM path was indeed taken.
-        assert call_counter["cnt"] == 1, \
-            f"DeepGEMM path was not executed during the test. " \
+        assert call_counter["cnt"] == 1, (
+            f"DeepGEMM path was not executed during the test. "
             f"Call counter: {call_counter['cnt']}"
+        )
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index c3be7f28fb2426439c736a6d58f1056a44e2fa47..218df4a2632c331aa106dbf3c7b216f9c6a39dcc 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -7,31 +7,36 @@ import torch
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config)
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    apply_flashinfer_per_tensor_scale_fp8, flashinfer_cutlass_moe_fp8,
-    register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
-    swap_w13_to_w31)
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    input_to_float8)
+    apply_flashinfer_per_tensor_scale_fp8,
+    flashinfer_cutlass_moe_fp8,
+    register_moe_scaling_factors,
+    rotate_flashinfer_fp8_moe_weights,
+    swap_w13_to_w31,
+)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
 from vllm.model_executor.models.llama4 import Llama4MoE
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
-if not has_flashinfer_cutlass_fused_moe(
-) or not current_platform.has_device_capability(100):
-    pytest.skip("Requires flashinfer_cutlass_fused_moe and nvfp4 support",
-                allow_module_level=True)
+if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
+    90
+):
+    pytest.skip(
+        "Supported for sm >= 90",
+        allow_module_level=True,
+    )
 
 NUM_EXPERTS = [16]
 TOP_KS = [1]
 
 MNK_FACTORS = [
     (256, 8192, 5120),
-    (256, 4096, 5120),
-    (127, 8192, 5120),
     (127, 4096, 5120),
     (10, 8192, 5120),
     (10, 4096, 5120),
@@ -39,10 +44,7 @@ MNK_FACTORS = [
     (1, 4096, 5120),
 ]
 
-vllm_config = VllmConfig(parallel_config=ParallelConfig(
-    pipeline_parallel_size=1))
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
+vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
 
 
 def quant_fp8_per_tensor_batches(a):
@@ -74,18 +76,21 @@ class TestData:
     layer: torch.nn.Module
 
     @staticmethod
-    def make_moe_tensors_8bit(m: int, k: int, n: int, e: int,
-                              reorder: bool) -> "TestData":
-        hidden_states = torch.randn(
-            (m, k), device="cuda", dtype=torch.bfloat16) / 10
-        w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16)
+    def make_moe_tensors_8bit(
+        m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu"
+    ) -> "TestData":
+        is_gated = activation != "relu2_no_mul"
+
+        hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
+        w13 = torch.randn(
+            (e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16
+        )
         w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16)
 
         # Scale to fp8
         _, a1_scale = input_to_float8(hidden_states)
         a1_scale = 1.0 / a1_scale
-        a2_scale = torch.scalar_tensor(1.0).to(device="cuda").to(
-            dtype=torch.float32)
+        a2_scale = torch.scalar_tensor(1.0).to(device="cuda").to(dtype=torch.float32)
         w13_quantized, w13_weight_scale = quant_fp8_per_tensor_batches(w13)
         w2_quantized, w2_weight_scale = quant_fp8_per_tensor_batches(w2)
 
@@ -102,8 +107,7 @@ class TestData:
         # flashinfer expects swapped rows for w13
         layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
         if reorder:
-            rotate_flashinfer_fp8_moe_weights(layer.w13_weight,
-                                              layer.w2_weight)
+            rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
         layer.custom_routing_function = Llama4MoE.custom_routing_function
         layer.intermediate_size_per_partition = n
         layer.ep_rank = 0
@@ -132,6 +136,8 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
     topk: int,
     monkeypatch,
 ):
+    if not current_platform.has_device_capability(100):
+        pytest.skip("Test is only supported for sm >= 100")
     current_platform.seed_everything(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
@@ -145,7 +151,8 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
             top_k=topk,
             renormalize=False,
             custom_routing_function=Llama4MoE.custom_routing_function,
-            scoring_func="softmax")
+            scoring_func="softmax",
+        )
 
         quant_config = fp8_w8a8_moe_quant_config(
             w1_scale=td.w13_weight_scale,
@@ -178,32 +185,31 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
             top_k=topk,
             num_expert_group=None,
             topk_group=None,
-            apply_router_weight_on_input=True)
+            apply_router_weight_on_input=True,
+        )
 
-        torch.testing.assert_close(output,
-                                   flashinfer_output,
-                                   atol=5.5e-2,
-                                   rtol=1e-2)
+        torch.testing.assert_close(output, flashinfer_output, atol=5.5e-2, rtol=1e-2)
 
 
-@pytest.mark.skip(
-    "Requires flashinfer version that contains https://github.com/flashinfer-ai/flashinfer/pull/1472"
-)
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"])
 def test_flashinfer_cutlass_moe_fp8_no_graph(
     m: int,
     n: int,
     k: int,
     e: int,
     topk: int,
+    activation: str,
     monkeypatch,
 ):
     current_platform.seed_everything(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
-        td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
+        td = TestData.make_moe_tensors_8bit(
+            m, k, n, e, reorder=False, activation=activation
+        )
 
         score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
@@ -213,13 +219,18 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
             top_k=topk,
             renormalize=False,
             custom_routing_function=Llama4MoE.custom_routing_function,
-            scoring_func="softmax")
+            scoring_func="softmax",
+        )
 
         quant_config = fp8_w8a8_moe_quant_config(
             w1_scale=td.w13_weight_scale,
+            g1_alphas=(td.w13_weight_scale * td.a1_scale).squeeze(),
             w2_scale=td.w2_weight_scale,
+            g2_alphas=(td.w2_weight_scale * td.a2_scale).squeeze(),
             a1_scale=td.a1_scale,
+            a1_gscale=td.a1_scale,
             a2_scale=td.a2_scale,
+            a2_gscale=1.0 / td.a2_scale,
             per_act_token_quant=False,
         )
 
@@ -230,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=False,
-            activation="silu",
+            activation=activation,
             global_num_experts=e,
             expert_map=None,
             apply_router_weight_on_input=True,
@@ -239,18 +250,23 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
 
         td.layer.dp_size = 1
 
+        def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig:
+            return quant_config
+
+        td.layer.get_fused_moe_quant_config = get_fused_moe_quant_config
+        td.layer.quant_method = td.layer
+
         flashinfer_cutlass_output = flashinfer_cutlass_moe_fp8(
             td.hidden_states,
             td.layer,
             topk_weights,
             topk_ids,
-            activation="silu",
+            activation=activation,
             global_num_experts=e,
             expert_map=None,
             apply_router_weight_on_input=True,
         )
 
-        torch.testing.assert_close(output,
-                                   flashinfer_cutlass_output,
-                                   atol=5.5e-2,
-                                   rtol=1e-2)
+        torch.testing.assert_close(
+            output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
+        )
diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py
index 8bf096b798cb8668a81c55001a9e5310518e3e25..be3e36865d1a444fdc4ce3b16fc887ce853d8382 100644
--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -4,33 +4,38 @@ import pytest
 import torch
 
 from tests.kernels.moe.utils import make_test_quant_config
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX,
-                                                    dequantize_nvfp4_to_dtype)
+from tests.kernels.quantization.nvfp4_utils import (
+    FLOAT4_E2M1_MAX,
+    FLOAT8_E4M3_MAX,
+    dequantize_nvfp4_to_dtype,
+)
 from tests.kernels.utils import torch_moe
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe)
+    FlashInferExperts,
+    is_valid_flashinfer_cutlass_fused_moe,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP)
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
-if not has_flashinfer_cutlass_fused_moe(
-) or not current_platform.has_device_capability(100):
-    pytest.skip("Requires flashinfer_cutlass_fused_moe and nvfp4 support",
-                allow_module_level=True)
+if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
+    100
+):
+    pytest.skip(
+        "Requires flashinfer_cutlass_fused_moe and nvfp4 support",
+        allow_module_level=True,
+    )
 
 MNK_FACTORS = [
     (2, 1024, 1024),
-    (2, 1024, 1536),
     (2, 3072, 1024),
     (2, 3072, 1536),
-    (64, 1024, 1024),
     (64, 1024, 1536),
     (64, 3072, 1024),
     (64, 2048, 1536),
@@ -42,15 +47,15 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", [40, 64, 256])
 @pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @torch.inference_mode()
-def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
-                                     dtype: torch.dtype):
+def test_flashinfer_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
     current_platform.seed_everything(7)
     with set_current_vllm_config(
-            VllmConfig(parallel_config=ParallelConfig(
-                pipeline_parallel_size=1))):
-
+        VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
+    ):
         a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
 
         quant_blocksize = 16
@@ -66,10 +71,7 @@ def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
         )
 
         score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids, _ = fused_topk(a,
-                                               score,
-                                               topk,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
 
         assert is_valid_flashinfer_cutlass_fused_moe(a, w1_q, w2_q)
 
@@ -87,16 +89,19 @@ def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
         )
 
         # Reference check:
-        a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                          torch.amax(a.flatten(), dim=-1)).to(torch.float32)
+        a_global_scale = (
+            (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)
+        ).to(torch.float32)
         a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale)
         _, m_k = a_fp4.shape
-        a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4,
-                                               a_scale_interleaved,
-                                               a_global_scale,
-                                               dtype=a.dtype,
-                                               device=a.device,
-                                               block_size=quant_blocksize)
+        a_in_dtype = dequantize_nvfp4_to_dtype(
+            a_fp4,
+            a_scale_interleaved,
+            a_global_scale,
+            dtype=a.dtype,
+            device=a.device,
+            block_size=quant_blocksize,
+        )
 
         w1_d = torch.empty((e, 2 * n, k), device="cuda", dtype=dtype)
         w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype)
@@ -104,23 +109,26 @@ def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
         for idx in range(0, e):
             w1_d[idx] = dequantize_nvfp4_to_dtype(
                 w1_q[idx],
-                quant_config.w1_scale[idx], (1 / quant_config.g1_alphas[idx]),
+                quant_config.w1_scale[idx],
+                (1 / quant_config.g1_alphas[idx]),
                 dtype=dtype,
                 device=w1_q.device,
-                block_size=quant_blocksize)
+                block_size=quant_blocksize,
+            )
             w2_d[idx] = dequantize_nvfp4_to_dtype(
                 w2_q[idx],
-                quant_config.w2_scale[idx], (1 / quant_config.g2_alphas[idx]),
+                quant_config.w2_scale[idx],
+                (1 / quant_config.g2_alphas[idx]),
                 dtype=dtype,
                 device=w2_q.device,
-                block_size=quant_blocksize)
+                block_size=quant_blocksize,
+            )
 
         torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk)
 
-        torch.testing.assert_close(torch_output,
-                                   flashinfer_output,
-                                   atol=1e-1,
-                                   rtol=1e-1)
+        torch.testing.assert_close(
+            torch_output, flashinfer_output, atol=1e-1, rtol=1e-1
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
index 024993c7677dd2701f6e827da6c6b2171cdbee64..dfd317bcf72f16f56445ac28fb1d1af36ffe0184 100644
--- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py
+++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
@@ -6,7 +6,7 @@ import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.utils import has_triton_kernels
+from vllm.utils.import_utils import has_triton_kernels
 
 if not has_triton_kernels():
     pytest.skip(
@@ -17,22 +17,17 @@ if not has_triton_kernels():
 import triton_kernels.swiglu
 from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 from triton_kernels.numerics import InFlexData
-from triton_kernels.numerics_details.mxfp import (downcast_to_mxfp,
-                                                  upcast_from_mxfp)
+from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
 from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
 from triton_kernels.tensor_details import layout
 from triton_kernels.testing import assert_close
 
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize)
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
-    BatchedOAITritonExperts, triton_kernel_moe_forward)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+    triton_kernel_moe_forward,
+)
 from vllm.model_executor.layers.utils import shuffle_weight
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 
 def deshuffle(w: torch.Tensor):
@@ -46,13 +41,11 @@ def deshuffle(w: torch.Tensor):
 def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
     randbits = [torch.randperm(E) for _ in range(M)]
     x_list = [
-        (-1)**i *
-        ((16384 +
-          ((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16))
+        (-1) ** i
+        * ((16384 + ((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16))
         for i, bits in enumerate(randbits)
     ]
-    exp_data = torch.stack(x_list).to(
-        device="cuda")  # simulating gate_output (M, E)
+    exp_data = torch.stack(x_list).to(device="cuda")  # simulating gate_output (M, E)
 
     # create input tensor
     x = torch.randn((M, K), dtype=torch.bfloat16, device="cuda")
@@ -120,20 +113,21 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
             value=0,
         )
 
-        w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0),
-                            mode="constant",
-                            value=0)
-        w2_bias_tri = F.pad(w2_bias_tri, (0, w2_right_pad, 0, 0),
-                            mode="constant",
-                            value=0)
+        w1_bias_tri = F.pad(
+            w1_bias_tri, (0, w1_right_pad, 0, 0), mode="constant", value=0
+        )
+        w2_bias_tri = F.pad(
+            w2_bias_tri, (0, w2_right_pad, 0, 0), mode="constant", value=0
+        )
 
         x_tri = F.pad(x_tri, (0, x_pad, 0, 0), mode="constant", value=0)
 
-        w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
-            mx_axis=1)
+        w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
         w_scale_layout, w_scale_layout_opts = (
             layout.make_default_matmul_mxfp4_w_scale_layout(
-                mx_axis=1, num_warps=num_warps))
+                mx_axis=1, num_warps=num_warps
+            )
+        )
 
         w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1)
         w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, torch.bfloat16, axis=1)
@@ -141,29 +135,33 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
         w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1)
         w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, torch.bfloat16, axis=1)
 
-        w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout,
-                                **w_layout_opts)
+        w1_tri = convert_layout(
+            wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts
+        )
         w1_scale_tri = convert_layout(
             wrap_torch_tensor(w1_scale_tri),
             w_scale_layout,
             **w_scale_layout_opts,
         )
 
-        w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout,
-                                **w_layout_opts)
+        w2_tri = convert_layout(
+            wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts
+        )
         w2_scale_tri = convert_layout(
             wrap_torch_tensor(w2_scale_tri),
             w_scale_layout,
             **w_scale_layout_opts,
         )
 
-        pc1 = PrecisionConfig(weight_scale=w1_scale_tri,
-                              flex_ctx=FlexCtx(rhs_data=InFlexData()))
-        pc2 = PrecisionConfig(weight_scale=w2_scale_tri,
-                              flex_ctx=FlexCtx(rhs_data=InFlexData()))
+        pc1 = PrecisionConfig(
+            weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData())
+        )
+        pc2 = PrecisionConfig(
+            weight_scale=w2_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData())
+        )
 
         # tucuate so the rest can run properly
-        w1 = w1[..., :K, :2 * N]
+        w1 = w1[..., :K, : 2 * N]
         w2 = w2[..., :N, :K]
 
         w1 = deshuffle(w1)
@@ -261,7 +259,8 @@ class Case:
 @pytest.mark.parametrize(
     ", ".join(f.name for f in fields(Case)),
     [
-        tuple(getattr(case, f.name) for f in fields(Case)) for case in [
+        tuple(getattr(case, f.name) for f in fields(Case))
+        for case in [
             # Case(a_dtype="bf16", w_dtype="bf16"),
             # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"),
             Case(a_dtype="bf16", w_dtype="mx4")
@@ -297,8 +296,8 @@ def test_equiv(num_token, a_dtype, w_dtype, tp):
     quant_config = FusedMoEQuantConfig.make(
         w1_bias=w1_bias_tri,
         w2_bias=w2_bias_tri,
-        w1_precision=pc1,
-        w2_precision=pc2,
+        w1_scale=pc1,
+        w2_scale=pc2,
     )
 
     out_triton_monolithic = triton_kernel_moe_forward(
@@ -321,118 +320,7 @@ def test_equiv(num_token, a_dtype, w_dtype, tp):
         gating_output=exp_data,
         topk=topk,
     )
-    assert_close(ref=out_ref,
-                 tri=out_triton_monolithic,
-                 maxtol=0.025,
-                 rmstol=0.005)
-
-
-def batched_moe(
-    a: torch.Tensor,
-    w1,
-    w2,
-    gating_output: torch.Tensor,
-    topk: int,
-    renormalize: bool,
-    w1_bias: torch.Tensor,
-    w2_bias: torch.Tensor,
-    w1_precision: PrecisionConfig,
-    w2_precision: PrecisionConfig,
-) -> torch.Tensor:
-    max_num_tokens = round_up(a.shape[0], 64)
-
-    quant_config = FusedMoEQuantConfig.make(
-        w1_precision=w1_precision,
-        w2_precision=w2_precision,
-        w1_bias=w1_bias,
-        w2_bias=w2_bias,
-    )
-
-    fused_experts = FusedMoEModularKernel(
-        BatchedPrepareAndFinalize(
-            max_num_tokens,
-            num_dispatchers=1,
-            num_local_experts=w1.shape[0],
-            rank=0,
-        ),
-        BatchedOAITritonExperts(
-            max_num_tokens=max_num_tokens,
-            num_dispatchers=1,
-            quant_config=quant_config,
-        ),
-    )
-
-    topk_weight, topk_ids, _ = fused_topk(a, gating_output, topk, renormalize)
-
-    return fused_experts(
-        a,
-        w1,
-        w2,
-        topk_weight,
-        topk_ids,
-    )
-
-
-@pytest.mark.parametrize(
-    ", ".join(f.name for f in fields(Case)),
-    [
-        tuple(getattr(case, f.name) for f in fields(Case)) for case in [
-            # Case(a_dtype="bf16", w_dtype="bf16"),
-            # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"),
-            Case(a_dtype="bf16", w_dtype="mx4")
-        ]
-    ],
-)
-@pytest.mark.parametrize("num_token", [64])
-@pytest.mark.parametrize("ep", [1, 2, 4, 8])
-def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep):
-    M = num_token
-    E = ModelConfig.num_experts // ep
-    K = ModelConfig.hidden_size
-    N = ModelConfig.intermediate_size
-    topk = ModelConfig.experts_per_token
-
-    (
-        x,
-        w1,
-        w1_bias,
-        w2,
-        w2_bias,
-        exp_data,
-        x_tri,
-        w1_tri,
-        w2_tri,
-        exp_data_tri,
-        w1_bias_tri,
-        w2_bias_tri,
-        pc1,
-        pc2,
-    ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=4)
-
-    out_tri = batched_moe(
-        a=x_tri,
-        w1=w1_tri,
-        w2=w2_tri,
-        gating_output=exp_data_tri,
-        topk=topk,
-        renormalize=True,
-        w1_bias=w1_bias_tri,
-        w2_bias=w2_bias_tri,
-        w1_precision=pc1,
-        w2_precision=pc2,
-    )
-    out_tri = out_tri[..., :K]
-
-    out_ref = oai_moe_forward(
-        hidden_states=x,
-        w1=w1,
-        w1_bias=w1_bias,
-        w2=w2,
-        w2_bias=w2_bias,
-        gating_output=exp_data,
-        topk=topk,
-    )
-    assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005)
+    assert_close(ref=out_ref, tri=out_triton_monolithic, maxtol=0.025, rmstol=0.005)
 
 
 def test_unit_shuffle():
diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py
index 646e763194fd648305f3e1abb768615d77d6c7d7..662e0723b75833def8e8f0c1b07eb74b996663dc 100644
--- a/tests/kernels/moe/test_grouped_topk.py
+++ b/tests/kernels/moe/test_grouped_topk.py
@@ -4,16 +4,20 @@
 
 Run `pytest tests/kernels/moe/test_grouped_topk.py`.
 """
+
 import pytest
 import torch
 
-from vllm.model_executor.layers.fused_moe.fused_moe import (fused_grouped_topk,
-                                                            grouped_topk)
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    fused_grouped_topk,
+    grouped_topk,
+)
 from vllm.platforms import current_platform
 
 
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 @pytest.mark.parametrize("n_token", [1, 33, 64])
 @pytest.mark.parametrize("n_hidden", [1024, 2048])
 @pytest.mark.parametrize("n_expert", [16])
@@ -23,23 +27,26 @@ from vllm.platforms import current_platform
 @pytest.mark.parametrize("topk_group", [2])
 @pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"])
 @pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5])
-@pytest.mark.parametrize("dtype",
-                         [torch.float16, torch.bfloat16, torch.float32])
-def test_grouped_topk(monkeypatch: pytest.MonkeyPatch, n_token: int,
-                      n_hidden: int, n_expert: int, topk: int,
-                      renormalize: bool, num_expert_group: int,
-                      topk_group: int, scoring_func: str,
-                      routed_scaling_factor: float, dtype: torch.dtype):
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
+def test_grouped_topk(
+    monkeypatch: pytest.MonkeyPatch,
+    n_token: int,
+    n_hidden: int,
+    n_expert: int,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int,
+    topk_group: int,
+    scoring_func: str,
+    routed_scaling_factor: float,
+    dtype: torch.dtype,
+):
     current_platform.seed_everything(0)
-    hidden_states = torch.randn((n_token, n_hidden),
-                                dtype=dtype,
-                                device="cuda")
-    gating_output = torch.randn((n_token, n_expert),
-                                dtype=dtype,
-                                device="cuda")
-    e_score_correction_bias = torch.randn((n_expert, ),
-                                          dtype=torch.float32,
-                                          device="cuda")
+    hidden_states = torch.randn((n_token, n_hidden), dtype=dtype, device="cuda")
+    gating_output = torch.randn((n_token, n_expert), dtype=dtype, device="cuda")
+    e_score_correction_bias = torch.randn(
+        (n_expert,), dtype=torch.float32, device="cuda"
+    )
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0")
@@ -52,7 +59,8 @@ def test_grouped_topk(monkeypatch: pytest.MonkeyPatch, n_token: int,
             topk_group=topk_group,
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias)
+            e_score_correction_bias=e_score_correction_bias,
+        )
 
         test_topk_weights, test_topk_ids = fused_grouped_topk(
             hidden_states=hidden_states,
@@ -63,14 +71,11 @@ def test_grouped_topk(monkeypatch: pytest.MonkeyPatch, n_token: int,
             topk_group=topk_group,
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias)
+            e_score_correction_bias=e_score_correction_bias,
+        )
 
         if renormalize:
-            torch.testing.assert_close(baseline_topk_weights,
-                                       test_topk_weights,
-                                       atol=2e-2,
-                                       rtol=0)
-        torch.testing.assert_close(baseline_topk_ids,
-                                   test_topk_ids,
-                                   atol=0,
-                                   rtol=0)
+            torch.testing.assert_close(
+                baseline_topk_weights, test_topk_weights, atol=2e-2, rtol=0
+            )
+        torch.testing.assert_close(baseline_topk_ids, test_topk_ids, atol=0, rtol=0)
diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py
index 1c7e62d7aa4c8d2541e295e52c7d74ab3c949452..e3b8621b452fafc1d12b1c84619e44808c42c81a 100644
--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -5,7 +5,7 @@ import copy
 import textwrap
 import traceback
 from itertools import product
-from typing import Optional
+from typing import Any
 
 import pytest
 import torch
@@ -13,22 +13,33 @@ import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
-
-from ...utils import multi_gpu_test
-from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
-                                          reference_moe_impl,
-                                          run_modular_kernel)
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.torch_utils import cuda_device_count_stateless
+
+from .modular_kernel_tools.common import (
+    Config,
+    RankTensors,
+    WeightTensors,
+    reference_moe_impl,
+    run_modular_kernel,
+)
 from .modular_kernel_tools.mk_objects import (
-    MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
-    MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, TestMoEQuantConfig,
-    expert_info)
-from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo,
-                                                  parallel_launch_with_config)
+    MK_FUSED_EXPERT_TYPES,
+    MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
+    MK_QUANT_CONFIGS,
+    MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES,
+    TestMoEQuantConfig,
+    expert_info,
+)
+from .modular_kernel_tools.parallel_utils import (
+    ProcessGroupInfo,
+    parallel_launch_with_config,
+)
 
-has_any_multi_gpu_package = (has_deep_ep() or has_deep_gemm() or has_pplx()
-                             or has_flashinfer_cutlass_fused_moe())
+has_any_multi_gpu_package = (
+    has_deep_ep() or has_deep_gemm() or has_pplx() or has_flashinfer_cutlass_fused_moe()
+)
 
 meets_multi_gpu_requirements = pytest.mark.skipif(
     not has_any_multi_gpu_package,
@@ -64,9 +75,9 @@ def rank_worker(
 
     # sanity check
     from vllm import envs
+
     if base_config.fused_moe_chunk_size is not None:
-        assert (
-            base_config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+        assert base_config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE
 
     # get weights to this device
     weights.to_current_device()
@@ -93,8 +104,7 @@ def rank_worker(
             rank_tensors = RankTensors.make(config, pgi)
 
             # modular kernel out
-            mk_out = run_modular_kernel(pgi, vllm_config, config, weights,
-                                        rank_tensors)
+            mk_out = run_modular_kernel(pgi, vllm_config, config, weights, rank_tensors)
 
             with set_current_vllm_config(vllm_config):
                 ref_out = reference_moe_impl(config, weights, rank_tensors)
@@ -115,20 +125,22 @@ def rank_worker(
     if len(exceptions) > 0:
         raise RuntimeError(
             f"{len(exceptions)} of {count} tests failed in child process, "
-            f"rank={pgi.rank}.")
+            f"rank={pgi.rank}."
+        )
     else:
-        print(f"{count} of {count} tests passed in child process, "
-              f"rank={pgi.rank}.")
+        print(f"{count} of {count} tests passed in child process, rank={pgi.rank}.")
 
 
 def run(config: Config, verbose: bool):
-    assert config.is_valid()
+    assert config.is_valid()[0]
+    assert not is_nyi_config(config)
 
     weights: WeightTensors = WeightTensors.make(config)
 
     vllm_config, env_dict = config.make_env_data()
-    parallel_launch_with_config(config.world_size, rank_worker, vllm_config,
-                                env_dict, config, weights, verbose)
+    parallel_launch_with_config(
+        config.world_size, rank_worker, vllm_config, env_dict, config, weights, verbose
+    )
 
 
 Ms = [32, 64]
@@ -149,31 +161,104 @@ def is_nyi_config(config: Config) -> bool:
     if info.needs_matching_quant:
         # The triton kernels expect both per-act-token-quant and
         # per-out-ch-quant or neither.
-        unsupported_quant_config = ((config.is_per_act_token_quant +
-                                     config.is_per_out_ch_quant) == 1)
+        unsupported_quant_config = (
+            config.is_per_act_token_quant + config.is_per_out_ch_quant
+        ) == 1
         return unsupported_quant_config
 
     return not info.supports_expert_map
 
 
-@pytest.mark.parametrize("k", Ks)
-@pytest.mark.parametrize("n", Ns)
-@pytest.mark.parametrize("e", Es)
-@pytest.mark.parametrize("dtype", DTYPEs)
-@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
+def generate_valid_test_cases(
+    world_size: int, prepare_finalize_types
+) -> list[tuple[Any, ...]]:
+    cases = []
+    total = 0
+
+    for k, n, e, dtype, quant_config, combination, chunk_size in product(
+        Ks,
+        Ns,
+        Es,
+        DTYPEs,
+        MK_QUANT_CONFIGS,
+        product(prepare_finalize_types, MK_FUSED_EXPERT_TYPES),
+        FUSED_MOE_CHUNK_SIZEs,
+    ):
+        total = total + 1
+
+        config = Config(
+            Ms=Ms,
+            K=k,
+            N=n,
+            E=e,
+            topks=TOPKs,
+            dtype=dtype,
+            quant_config=quant_config,
+            prepare_finalize_type=combination[0],
+            fused_experts_type=combination[1],
+            fused_moe_chunk_size=chunk_size,
+            world_size=world_size,
+        )
+
+        # TODO(bnell): figure out how to get verbose flag here.
+        verbose = False  # pytestconfig.getoption('verbose') > 0
+
+        valid, reason = config.is_valid()
+
+        if not valid:
+            if verbose:
+                print(f"Test config {config} is not valid: {reason}")
+            continue
+
+        if is_nyi_config(config):
+            if verbose:
+                print(f"Test config {config} is nyi.")
+            continue
+
+        cases.append(
+            (
+                k,
+                n,
+                e,
+                dtype,
+                quant_config,
+                combination[0],
+                combination[1],
+                chunk_size,
+                world_size,
+            )
+        )
+
+    print(f"{len(cases)} of {total} valid configs generated.")
+
+    return cases
+
+
 @pytest.mark.parametrize(
-    "combination",
-    product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
-@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
-@pytest.mark.parametrize("world_size", [2])
-@multi_gpu_test(num_gpus=2)
+    "k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,chunk_size,world_size",
+    generate_valid_test_cases(
+        world_size=2, prepare_finalize_types=MK_MULTI_GPU_PREPARE_FINALIZE_TYPES
+    ),
+)
 @meets_multi_gpu_requirements
 def test_modular_kernel_combinations_multigpu(
-        k: int, n: int, e: int, dtype: torch.dtype,
-        quant_config: Optional[TestMoEQuantConfig],
-        combination: tuple[mk.FusedMoEPrepareAndFinalize,
-                           mk.FusedMoEPermuteExpertsUnpermute],
-        fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig):
+    k: int,
+    n: int,
+    e: int,
+    dtype: torch.dtype,
+    quant_config: TestMoEQuantConfig | None,
+    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
+    fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute,
+    chunk_size: int | None,
+    world_size: int,
+    pytestconfig,
+):
+    if cuda_device_count_stateless() < world_size:
+        pytest.skip(
+            f"Not enough GPUs available to run, got "
+            f"{cuda_device_count_stateless()} exepected "
+            f"{world_size}."
+        )
 
     config = Config(
         Ms=Ms,
@@ -183,38 +268,35 @@ def test_modular_kernel_combinations_multigpu(
         topks=TOPKs,
         dtype=dtype,
         quant_config=quant_config,
-        prepare_finalize_type=combination[0],
-        fused_experts_type=combination[1],
-        fused_moe_chunk_size=fused_moe_chunk_size,
+        prepare_finalize_type=prepare_finalize_type,
+        fused_experts_type=fused_experts_type,
+        fused_moe_chunk_size=chunk_size,
         world_size=world_size,
     )
-
-    if not config.is_valid():
-        pytest.skip(f"Tests config {config} is not valid. Skipping ...")
-
-    if is_nyi_config(config):
-        pytest.skip(f"Tests config {config} is nyi. Skipping ...")
-
-    verbosity = pytestconfig.getoption('verbose')
+    verbosity = pytestconfig.getoption("verbose")
     run(config, verbosity > 0)
 
 
-@pytest.mark.parametrize("k", Ks)
-@pytest.mark.parametrize("n", Ns)
-@pytest.mark.parametrize("e", Es)
-@pytest.mark.parametrize("dtype", DTYPEs)
-@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
 @pytest.mark.parametrize(
-    "combination",
-    product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
-@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
-@pytest.mark.parametrize("world_size", [1])
+    "k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,chunk_size,world_size",
+    generate_valid_test_cases(
+        world_size=1, prepare_finalize_types=MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES
+    ),
+)
 def test_modular_kernel_combinations_singlegpu(
-        k: int, n: int, e: int, dtype: torch.dtype,
-        quant_config: Optional[TestMoEQuantConfig],
-        combination: tuple[mk.FusedMoEPrepareAndFinalize,
-                           mk.FusedMoEPermuteExpertsUnpermute],
-        fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig):
+    k: int,
+    n: int,
+    e: int,
+    dtype: torch.dtype,
+    quant_config: TestMoEQuantConfig | None,
+    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
+    fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute,
+    chunk_size: int | None,
+    world_size: int,
+    pytestconfig,
+):
+    """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
+    and those tests will be skipped on unsupported hardware."""
     config = Config(
         Ms=Ms,
         K=k,
@@ -223,31 +305,33 @@ def test_modular_kernel_combinations_singlegpu(
         topks=TOPKs,
         dtype=dtype,
         quant_config=quant_config,
-        prepare_finalize_type=combination[0],
-        fused_experts_type=combination[1],
-        fused_moe_chunk_size=fused_moe_chunk_size,
+        prepare_finalize_type=prepare_finalize_type,
+        fused_experts_type=fused_experts_type,
+        fused_moe_chunk_size=chunk_size,
         world_size=world_size,
     )
 
-    if not config.is_valid():
-        pytest.skip(f"Tests config {config} is not valid. Skipping ...")
-
-    if is_nyi_config(config):
-        pytest.skip(f"Tests config {config} is nyi. Skipping ...")
-
-    verbosity = pytestconfig.getoption('verbose')
+    if (
+        quant_config is not None and quant_config.quant_dtype == torch.float8_e4m3fn
+    ) and not current_platform.has_device_capability(89):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+    verbosity = pytestconfig.getoption("verbose")
     run(config, verbosity > 0)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     # Ability to test individual PrepareAndFinalize and FusedExperts combination
-    from .modular_kernel_tools.cli_args import (make_config,
-                                                make_config_arg_parser)
-    parser = make_config_arg_parser(description=(
-        "Run single prepare-finalize & fused-experts combination test"
-        "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "  #noqa: E501
-        "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
-    ))
+    from .modular_kernel_tools.cli_args import make_config, make_config_arg_parser
+
+    parser = make_config_arg_parser(
+        description=(
+            "Run single prepare-finalize & fused-experts combination test"
+            "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
+            "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
+        )
+    )
     args = parser.parse_args()
     config = make_config(args)
 
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 00835bec9a15c0ed8d7e5369572086cf8825968e..0550c2d9e2125022f3e53ba140ccc86347a9e8f3 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -4,8 +4,13 @@
 
 Run `pytest tests/kernels/test_moe.py`.
 """
+
 import functools
-from typing import Callable, Optional, Union
+import importlib
+import sys
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any
 
 import pytest
 import torch
@@ -17,26 +22,41 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 import vllm.model_executor.layers.fused_moe  # noqa
 from tests.kernels.moe.utils import fused_moe
 from tests.kernels.utils import opcheck, stack_and_dev, torch_moe
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.parallel_state import init_distributed_environment
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG, int4_w4a16_moe_quant_config,
-    int8_w8a16_moe_quant_config)
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+    int4_w4a16_moe_quant_config,
+    int8_w8a16_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    batched_fused_marlin_moe,
+    fused_marlin_moe,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    fused_topk, modular_triton_fused_moe)
+    fused_topk,
+    modular_triton_fused_moe,
+)
 from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
-    fused_moe as iterative_moe)
+    fused_moe as iterative_moe,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    marlin_permute_bias)
+    marlin_permute_bias,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    rand_marlin_weight_mxfp4_like, rand_marlin_weight_nvfp4_like)
+    rand_marlin_weight_mxfp4_like,
+    rand_marlin_weight_nvfp4_like,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    marlin_quant_fp8_torch)
+    marlin_quant_fp8_torch,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
-    awq_marlin_quantize, marlin_quantize)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    quantize_weights)
+    awq_marlin_quantize,
+    marlin_quantize,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_weights
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
@@ -49,8 +69,6 @@ FUSED_MOE_MNK_FACTORS = [
     (1, 128, 128),
     (1, 2048, 128),
     (33, 2048, 128),
-    (222, 1024, 1024),
-    (32768, 128, 128),
     (32768, 2048, 511),
     (40000, 1024, 1024),
 ]
@@ -59,17 +77,14 @@ FUSED_MOE_WN16_MNK_FACTORS = [
     (1, 128, 128),
     (1, 1024, 1024),
     (32, 2048, 128),
-    (32, 1024, 1024),
     (222, 2048, 1024),
 ]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def run_moe_test(
-    baseline: Union[Callable, torch.Tensor],
+    baseline: Callable | torch.Tensor,
     moe_fn: Callable,
     a: torch.Tensor,
     w1: torch.Tensor,
@@ -77,7 +92,7 @@ def run_moe_test(
     score: torch.Tensor,
     topk: int,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     padding: bool = False,
     use_compile: bool = False,
     use_cudagraph: bool = False,
@@ -87,13 +102,15 @@ def run_moe_test(
     if isinstance(baseline, torch.Tensor):
         baseline_output = baseline
     else:
-        baseline_output = baseline(a,
-                                   w1,
-                                   w2,
-                                   score,
-                                   topk,
-                                   global_num_experts=global_num_experts,
-                                   expert_map=expert_map)
+        baseline_output = baseline(
+            a,
+            w1,
+            w2,
+            score,
+            topk,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+        )
 
     # Pad the weight if moe padding is enabled
     if padding:
@@ -105,34 +122,35 @@ def run_moe_test(
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(score, 0)
 
-    test_output = moe_fn(a,
-                         w1,
-                         w2,
-                         score,
-                         topk,
-                         global_num_experts=global_num_experts,
-                         expert_map=expert_map)
+    test_output = moe_fn(
+        a,
+        w1,
+        w2,
+        score,
+        topk,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+    )
 
     if use_cudagraph:
         test_output.fill_(0)
         stream = torch.cuda.Stream()
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph, stream=stream):
-            test_output = moe_fn(a,
-                                 w1,
-                                 w2,
-                                 score,
-                                 topk,
-                                 global_num_experts=global_num_experts,
-                                 expert_map=expert_map)
+            test_output = moe_fn(
+                a,
+                w1,
+                w2,
+                score,
+                topk,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+            )
         torch.cuda.synchronize()
         graph.replay()
         torch.cuda.synchronize()
 
-    torch.testing.assert_close(test_output,
-                               baseline_output,
-                               atol=atol,
-                               rtol=rtol)
+    torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol)
 
     return baseline_output
 
@@ -176,11 +194,8 @@ def test_fused_moe(
 
     if ep_size > 1:
         local_e = e // ep_size
-        e_ids = torch.randint(0,
-                              e, (local_e, ),
-                              device="cuda",
-                              dtype=torch.int32)
-        e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32)
+        e_ids = torch.randint(0, e, (local_e,), device="cuda", dtype=torch.int32)
+        e_map = torch.full((e,), -1, device="cuda", dtype=torch.int32)
         e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
         w1 = w1[e_ids]
         w2 = w2[e_ids]
@@ -201,16 +216,18 @@ def test_fused_moe(
         score: torch.Tensor,
         topk: int,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
+        expert_map: torch.Tensor | None = None,
     ) -> torch.Tensor:
         topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
-        return m_fused_moe_fn(a,
-                              w1,
-                              w2,
-                              topk_weights,
-                              topk_ids,
-                              global_num_experts=global_num_experts,
-                              expert_map=expert_map)
+        return m_fused_moe_fn(
+            a,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+        )
 
     fused_moe_fn = functools.partial(fused_moe, renormalize=False)
 
@@ -234,19 +251,22 @@ def test_fused_moe(
     # setup code in case we are able to revisit this later.
     use_compile = False
 
-    use_cudagraph = (n >= 1024 and k >= 1024
-                     and current_platform.is_cuda_alike())
+    use_cudagraph = n >= 1024 and k >= 1024 and current_platform.is_cuda_alike()
 
     with set_current_vllm_config(vllm_config):
         baseline_output = runner(torch_moe, iterative_moe)
-        runner(baseline_output,
-               fused_moe_fn,
-               use_compile=use_compile,
-               use_cudagraph=use_cudagraph)
-        runner(baseline_output,
-               m_fused_moe,
-               use_compile=use_compile,
-               use_cudagraph=use_cudagraph)
+        runner(
+            baseline_output,
+            fused_moe_fn,
+            use_compile=use_compile,
+            use_cudagraph=use_cudagraph,
+        )
+        runner(
+            baseline_output,
+            m_fused_moe,
+            use_compile=use_compile,
+            use_cudagraph=use_cudagraph,
+        )
 
 
 @pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
@@ -257,9 +277,18 @@ def test_fused_moe(
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.parametrize("has_zp", [True, False])
 @pytest.mark.parametrize("weight_bits", [4, 8])
-def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
-                        ep_size: int, dtype: torch.dtype, group_size: int,
-                        has_zp: bool, weight_bits: int):
+def test_fused_moe_wn16(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    ep_size: int,
+    dtype: torch.dtype,
+    group_size: int,
+    has_zp: bool,
+    weight_bits: int,
+):
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
     w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
@@ -274,35 +303,40 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
 
     w1_ref = w1.clone()
     w2_ref = w2.clone()
-    w1_qweight = torch.empty((e, 2 * n, k // pack_factor),
-                             device="cuda",
-                             dtype=torch.uint8)
-    w2_qweight = torch.empty((e, k, n // pack_factor),
-                             device="cuda",
-                             dtype=torch.uint8)
-    w1_scales = torch.empty((e, 2 * n, k // group_size),
-                            device="cuda",
-                            dtype=dtype)
-    w2_scales = torch.empty((e, k, n // group_size),
-                            device="cuda",
-                            dtype=dtype)
-    w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size),
-                            device="cuda",
-                            dtype=torch.uint8)
-    w2_qzeros = torch.empty((e, k // pack_factor, n // group_size),
-                            device="cuda",
-                            dtype=torch.uint8)
+    w1_qweight = torch.empty(
+        (e, 2 * n, k // pack_factor), device="cuda", dtype=torch.uint8
+    )
+    w2_qweight = torch.empty((e, k, n // pack_factor), device="cuda", dtype=torch.uint8)
+    w1_scales = torch.empty((e, 2 * n, k // group_size), device="cuda", dtype=dtype)
+    w2_scales = torch.empty((e, k, n // group_size), device="cuda", dtype=dtype)
+    w1_qzeros = torch.empty(
+        (e, 2 * n // pack_factor, k // group_size), device="cuda", dtype=torch.uint8
+    )
+    w2_qzeros = torch.empty(
+        (e, k // pack_factor, n // group_size), device="cuda", dtype=torch.uint8
+    )
 
     for i in range(e * 2):
         expert_id = i % e
         if i // e == 0:
-            w, w_ref, w_qweight, w_scales, w_qzeros = \
-                w1, w1_ref, w1_qweight, w1_scales, w1_qzeros
+            w, w_ref, w_qweight, w_scales, w_qzeros = (
+                w1,
+                w1_ref,
+                w1_qweight,
+                w1_scales,
+                w1_qzeros,
+            )
         else:
-            w, w_ref, w_qweight, w_scales, w_qzeros = \
-                w2, w2_ref, w2_qweight, w2_scales, w2_qzeros
+            w, w_ref, w_qweight, w_scales, w_qzeros = (
+                w2,
+                w2_ref,
+                w2_qweight,
+                w2_scales,
+                w2_qzeros,
+            )
         weight, qweight, scales, qzeros = quantize_weights(
-            w[expert_id].T, quant_type, group_size, has_zp, False)
+            w[expert_id].T, quant_type, group_size, has_zp, False
+        )
         weight = weight.T
         qweight = qweight.T.contiguous().to(torch.uint8)
         scales = scales.T
@@ -321,11 +355,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
 
     if ep_size > 1:
         local_e = e // ep_size
-        e_ids = torch.randint(0,
-                              e, (local_e, ),
-                              device="cuda",
-                              dtype=torch.int32)
-        e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32)
+        e_ids = torch.randint(0, e, (local_e,), device="cuda", dtype=torch.int32)
+        e_map = torch.full((e,), -1, device="cuda", dtype=torch.int32)
         e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
         w1_ref = w1_ref[e_ids]
         w2_ref = w2_ref[e_ids]
@@ -344,28 +375,27 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
         assert weight_bits == 8
         quant_config_builder = int8_w8a16_moe_quant_config
 
-    quant_config = quant_config_builder(w1_scale=w1_scales,
-                                        w2_scale=w2_scales,
-                                        w1_zp=w1_qzeros if has_zp else None,
-                                        w2_zp=w2_qzeros if has_zp else None,
-                                        block_shape=[0, group_size])
+    quant_config = quant_config_builder(
+        w1_scale=w1_scales,
+        w2_scale=w2_scales,
+        w1_zp=w1_qzeros if has_zp else None,
+        w2_zp=w2_qzeros if has_zp else None,
+        block_shape=[0, group_size],
+    )
 
     with set_current_vllm_config(vllm_config):
-        triton_output = fused_moe(a,
-                                  w1_qweight,
-                                  w2_qweight,
-                                  score,
-                                  topk,
-                                  renormalize=False,
-                                  global_num_experts=e,
-                                  expert_map=e_map,
-                                  quant_config=quant_config)
-        torch_output = torch_moe(a,
-                                 w1_ref,
-                                 w2_ref,
-                                 score,
-                                 topk,
-                                 expert_map=e_map)
+        triton_output = fused_moe(
+            a,
+            w1_qweight,
+            w2_qweight,
+            score,
+            topk,
+            renormalize=False,
+            global_num_experts=e,
+            expert_map=e_map,
+            quant_config=quant_config,
+        )
+        torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, expert_map=e_map)
 
     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
 
@@ -373,34 +403,35 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
 @torch.inference_mode()
-def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool,
-                     use_rocm_aiter: bool, monkeypatch):
+def test_mixtral_moe(
+    dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch
+):
     """Make sure our Mixtral MoE implementation agrees with the one from
     huggingface."""
 
     # clear the cache before every test
-    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-        is_rocm_aiter_moe_enabled)
-    is_rocm_aiter_moe_enabled.cache_clear()
+    # Force reload aiter_ops to pick up the new environment variables.
+    if "rocm_aiter_ops" in sys.modules:
+        importlib.reload(rocm_aiter_ops)
+
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
-
         if dtype == torch.float32:
             pytest.skip("AITER ROCm test skip for float32")
 
-    monkeypatch.setenv('RANK', "0")
-    monkeypatch.setenv('LOCAL_RANK', "0")
-    monkeypatch.setenv('WORLD_SIZE', "1")
-    monkeypatch.setenv('MASTER_ADDR', 'localhost')
-    monkeypatch.setenv('MASTER_PORT', '12345')
+    monkeypatch.setenv("RANK", "0")
+    monkeypatch.setenv("LOCAL_RANK", "0")
+    monkeypatch.setenv("WORLD_SIZE", "1")
+    monkeypatch.setenv("MASTER_ADDR", "localhost")
+    monkeypatch.setenv("MASTER_PORT", "12345")
     init_distributed_environment()
 
     # Instantiate our and huggingface's MoE blocks
     vllm_config.compilation_config.static_forward_context = dict()
-    with (set_current_vllm_config(vllm_config),
-          set_forward_context(None, vllm_config)):
+    with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
         config = MixtralConfig()
         hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
         vllm_moe = MixtralMoE(
@@ -416,27 +447,30 @@ def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool,
         # Load the weights
         vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data
         for i in range(config.num_local_experts):
-            weights = (hf_moe.experts[i].w1.weight.data,
-                       hf_moe.experts[i].w3.weight.data)
+            weights = (
+                hf_moe.experts[i].w1.weight.data,
+                hf_moe.experts[i].w3.weight.data,
+            )
             vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0)
             vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
 
         # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
-        hf_inputs = torch.randn(
-            (1, 64, config.hidden_size)).to(dtype).to("cuda")
+        hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
         # vLLM uses 1D query [num_tokens, hidden_dim]
         vllm_inputs = hf_inputs.flatten(0, 1)
 
         # Pad the weight if moe padding is enabled
         if padding:
-            vllm_moe.experts.w13_weight = Parameter(F.pad(
-                vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[...,
-                                                                      0:-128],
-                                                    requires_grad=False)
-            vllm_moe.experts.w2_weight = Parameter(F.pad(
-                vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[...,
-                                                                     0:-128],
-                                                   requires_grad=False)
+            vllm_moe.experts.w13_weight = Parameter(
+                F.pad(vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[
+                    ..., 0:-128
+                ],
+                requires_grad=False,
+            )
+            vllm_moe.experts.w2_weight = Parameter(
+                F.pad(vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
+                requires_grad=False,
+            )
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
 
@@ -451,29 +485,31 @@ def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool,
     }
 
     if use_rocm_aiter:
-        # The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
-        # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174  # noqa: E501
-        torch.testing.assert_close(hf_states.flatten(0, 1),
-                                   vllm_states,
-                                   rtol=0.01,
-                                   atol=100)
+        # The values of rtol and atol are set based on the tests in ROCM AITER package.
+        # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174
+        torch.testing.assert_close(
+            hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100
+        )
     else:
-        torch.testing.assert_close(hf_states.flatten(0, 1),
-                                   vllm_states,
-                                   rtol=mixtral_moe_tol[dtype],
-                                   atol=mixtral_moe_tol[dtype])
+        torch.testing.assert_close(
+            hf_states.flatten(0, 1),
+            vllm_states,
+            rtol=mixtral_moe_tol[dtype],
+            atol=mixtral_moe_tol[dtype],
+        )
 
 
 def marlin_moe_generate_valid_test_cases():
     import itertools
+
     m_list = [1, 123, 666]
     n_list = [128, 1024]
     k_list = [256, 2048]
     e_list = [4, 12]
     topk_list = [2, 3]
     ep_size_list = [1, 4]
-    dtype_list = [torch.half, torch.bfloat16]
-    group_size_list = [-1, 16, 32, 128]
+    dtype_list = [torch.bfloat16]
+    group_size_list = [-1, 32, 128]
     act_order_list = [True, False]
     quant_type_list = [
         scalar_types.float4_e2m1f,
@@ -484,16 +520,24 @@ def marlin_moe_generate_valid_test_cases():
     ]
     is_k_full_list = [True, False]
 
-    all_combinations = itertools.product(m_list, n_list, k_list, e_list,
-                                         topk_list, ep_size_list, dtype_list,
-                                         group_size_list, act_order_list,
-                                         quant_type_list, is_k_full_list)
-
-    def is_invalid(m, n, k, e, topk, ep_size, dtype, group_size, act_order,
-                   quant_type, is_k_full):
+    all_combinations = itertools.product(
+        m_list,
+        n_list,
+        k_list,
+        e_list,
+        topk_list,
+        ep_size_list,
+        dtype_list,
+        group_size_list,
+        act_order_list,
+        quant_type_list,
+        is_k_full_list,
+    )
 
-        if quant_type == scalar_types.float8_e4m3fn and \
-                group_size not in [-1, 128]:
+    def is_invalid(
+        m, n, k, e, topk, ep_size, dtype, group_size, act_order, quant_type, is_k_full
+    ):
+        if quant_type == scalar_types.float8_e4m3fn and group_size not in [-1, 128]:
             return False
         if quant_type == scalar_types.float4_e2m1f:
             if group_size not in [16, 32]:
@@ -521,10 +565,110 @@ def marlin_moe_generate_valid_test_cases():
     return cases
 
 
+@dataclass
+class MarlinMoEWeightData:
+    w_ref: torch.Tensor
+    qweight: torch.Tensor
+    scales: torch.Tensor
+    global_scale: torch.Tensor | None
+    g_idx: torch.Tensor | None
+    zeros: torch.Tensor | None
+    sort_indices: torch.Tensor | None
+    marlin_bias: torch.Tensor | None
+
+    @staticmethod
+    def make(
+        w: torch.Tensor,
+        quant_type: ScalarType,
+        group_size: int,
+        act_order: bool | None = None,
+        bias: torch.Tensor | None = None,
+    ) -> "MarlinMoEWeightData":
+        assert w.ndim == 3
+        has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
+        k = w.shape[-1]
+
+        w_ref_l: list[torch.Tensor] = []
+        qweight_l: list[torch.Tensor] = []
+        scales_l: list[torch.Tensor] = []
+        global_scale_l: list[torch.Tensor] = []
+        zeros_l: list[torch.Tensor] = []
+        g_idx_l: list[torch.Tensor] = []
+        sort_indices_l: list[torch.Tensor] = []
+        bias_l: list[torch.Tensor] = []
+
+        for i in range(w.shape[0]):
+            if quant_type == scalar_types.float4_e2m1f:
+                if group_size == 16:
+                    w_ref, qweight, scales, global_scale = (
+                        rand_marlin_weight_nvfp4_like(w[i], group_size)
+                    )
+                else:
+                    w_ref, qweight, scales = rand_marlin_weight_mxfp4_like(
+                        w[i], group_size
+                    )
+                    global_scale = None
+
+                w_ref_l.append(w_ref.T)
+                qweight_l.append(qweight)
+                scales_l.append(scales)
+                if global_scale is not None:
+                    global_scale_l.append(global_scale)
+            elif quant_type == scalar_types.float8_e4m3fn:
+                w_ref, qweight, scales = marlin_quant_fp8_torch(w[i], group_size)
+                w_ref_l.append(w_ref.T)
+                qweight_l.append(qweight)
+                scales_l.append(scales)
+            elif has_zp:
+                w_ref, qweight, scales, zeros = awq_marlin_quantize(
+                    w[i].transpose(1, 0), quant_type, group_size
+                )
+
+                w_ref_l.append(w_ref.T)
+                qweight_l.append(qweight)
+                scales_l.append(scales)
+                zeros_l.append(zeros)
+            else:
+                test_perm = torch.randperm(k)
+                w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
+                    w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
+                )
+
+                w_ref_l.append(w_ref.T)
+                qweight_l.append(qweight)
+                scales_l.append(scales)
+                g_idx_l.append(g_idx)
+                sort_indices_l.append(sort_indices)
+
+            if bias is not None:
+                bias_l.append(marlin_permute_bias(bias[i]))
+
+        w_ref = stack_and_dev(w_ref_l)
+        qweight = stack_and_dev(qweight_l).contiguous()
+        scales = stack_and_dev(scales_l)
+        global_scale = stack_and_dev(global_scale_l) if global_scale_l else None
+        g_idx = stack_and_dev(g_idx_l) if g_idx_l else None
+        zeros = stack_and_dev(zeros_l) if zeros_l else None
+        sort_indices = stack_and_dev(sort_indices_l) if sort_indices_l else None
+        marlin_bias = stack_and_dev(bias_l) if bias_l else None
+
+        return MarlinMoEWeightData(
+            w_ref=w_ref,
+            qweight=qweight,
+            scales=scales,
+            global_scale=global_scale,
+            g_idx=g_idx,
+            zeros=zeros,
+            sort_indices=sort_indices,
+            marlin_bias=marlin_bias,
+        )
+
+
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.parametrize(("m, n, k, e, topk, ep_size, dtype, group_size,"
-                          "act_order, quant_type, is_k_full"),
-                         marlin_moe_generate_valid_test_cases())
+@pytest.mark.parametrize(
+    ("m, n, k, e, topk, ep_size, dtype, group_size,act_order, quant_type, is_k_full"),
+    marlin_moe_generate_valid_test_cases(),
+)
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
 def test_fused_marlin_moe(
     m: int,
@@ -540,7 +684,6 @@ def test_fused_marlin_moe(
     is_k_full: bool,
 ):
     torch.cuda.manual_seed(0)
-    has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
 
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 20
@@ -549,162 +692,54 @@ def test_fused_marlin_moe(
     if ep_size > 1:
         local_e = e // ep_size
         e_ids = torch.randperm(e, device="cuda", dtype=torch.int32)[:local_e]
-        e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32)
+        e_map = torch.full((e,), -1, device="cuda", dtype=torch.int32)
         e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
         w1 = w1[e_ids]
         w2 = w2[e_ids]
     else:
         e_map = None
 
-    w_ref1_l = []
-    qweight1_l = []
-    scales1_l = []
-    global_scale1_l = []
-    zeros1_l = []
-    g_idx1_l = []
-    sort_indices1_l = []
+    w1_data = MarlinMoEWeightData.make(
+        w=w1, quant_type=quant_type, group_size=group_size, act_order=act_order
+    )
 
-    for i in range(w1.shape[0]):
-        if quant_type == scalar_types.float4_e2m1f:
-            if group_size == 16:
-                w_ref1, qweight1, scales1, global_scale1 = \
-                    rand_marlin_weight_nvfp4_like(w1[i], group_size)
-            else:
-                w_ref1, qweight1, scales1 = \
-                    rand_marlin_weight_mxfp4_like(w1[i], group_size)
-                global_scale1 = None
-
-            w_ref1_l.append(w_ref1.T)
-            qweight1_l.append(qweight1)
-            scales1_l.append(scales1)
-            if global_scale1 is not None:
-                global_scale1_l.append(global_scale1)
-        elif quant_type == scalar_types.float8_e4m3fn:
-            w_ref1, qweight1, scales1 = marlin_quant_fp8_torch(
-                w1[i], group_size)
-            w_ref1_l.append(w_ref1.T)
-            qweight1_l.append(qweight1)
-            scales1_l.append(scales1)
-        elif has_zp:
-            w_ref1, qweight1, scales1, zeros1 = awq_marlin_quantize(
-                w1[i].transpose(1, 0), quant_type, group_size)
-
-            w_ref1_l.append(w_ref1.T)
-            qweight1_l.append(qweight1)
-            scales1_l.append(scales1)
-            zeros1_l.append(zeros1)
-        else:
-            test_perm = torch.randperm(k)
-            w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = \
-                marlin_quantize(w1[i].transpose(1, 0), quant_type,
-                                group_size, act_order, test_perm)
-
-            w_ref1_l.append(w_ref1.T)
-            qweight1_l.append(qweight1)
-            scales1_l.append(scales1)
-            g_idx1_l.append(g_idx1)
-            sort_indices1_l.append(sort_indices1)
-
-    w_ref1 = stack_and_dev(w_ref1_l)
-    qweight1 = stack_and_dev(qweight1_l).contiguous()
-    scales1 = stack_and_dev(scales1_l)
-    global_scale1 = stack_and_dev(global_scale1_l) if global_scale1_l else None
-    g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None
-    zeros1 = stack_and_dev(zeros1_l) if zeros1_l else None
-    sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None
-
-    w_ref2_l = []
-    qweight2_l = []
-    scales2_l = []
-    global_scale2_l = []
-    zeros2_l = []
-    g_idx2_l = []
-    sort_indices2_l = []
-
-    for i in range(w2.shape[0]):
-        if quant_type == scalar_types.float4_e2m1f:
-            if group_size == 16:
-                w_ref2, qweight2, scales2, global_scale2 = \
-                    rand_marlin_weight_nvfp4_like(w2[i], group_size)
-            else:
-                w_ref2, qweight2, scales2 = \
-                    rand_marlin_weight_mxfp4_like(w2[i], group_size)
-                global_scale2 = None
-
-            w_ref2_l.append(w_ref2.T)
-            qweight2_l.append(qweight2)
-            scales2_l.append(scales2)
-            if global_scale2 is not None:
-                global_scale2_l.append(global_scale2)
-        elif quant_type == scalar_types.float8_e4m3fn:
-            w_ref2, qweight2, scales2 = marlin_quant_fp8_torch(
-                w2[i], group_size)
-            w_ref2_l.append(w_ref2.T)
-            qweight2_l.append(qweight2)
-            scales2_l.append(scales2)
-        elif has_zp:
-            w_ref2, qweight2, scales2, zeros2 = awq_marlin_quantize(
-                w2[i].transpose(1, 0), quant_type, group_size)
-
-            w_ref2_l.append(w_ref2.T)
-            qweight2_l.append(qweight2)
-            scales2_l.append(scales2)
-            zeros2_l.append(zeros2)
-        else:
-            test_perm = torch.randperm(n)
-            w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = \
-                marlin_quantize(w2[i].transpose(1, 0), quant_type,
-                                group_size, act_order, test_perm)
-
-            w_ref2_l.append(w_ref2.T)
-            qweight2_l.append(qweight2)
-            scales2_l.append(scales2)
-            g_idx2_l.append(g_idx2)
-            sort_indices2_l.append(sort_indices2)
-
-    w_ref2 = stack_and_dev(w_ref2_l)
-    qweight2 = stack_and_dev(qweight2_l).contiguous()
-    scales2 = stack_and_dev(scales2_l)
-    global_scale2 = stack_and_dev(global_scale2_l) if global_scale2_l else None
-    g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None
-    zeros2 = stack_and_dev(zeros2_l) if zeros2_l else None
-    sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None
+    w2_data = MarlinMoEWeightData.make(
+        w=w2, quant_type=quant_type, group_size=group_size, act_order=act_order
+    )
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
 
     topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
 
     with set_current_vllm_config(vllm_config):
-        torch_output = torch_moe(a,
-                                 w_ref1,
-                                 w_ref2,
-                                 score,
-                                 topk,
-                                 expert_map=e_map)
-
-    marlin_output = torch.ops.vllm.fused_marlin_moe(
+        torch_output = torch_moe(
+            a, w1_data.w_ref, w2_data.w_ref, score, topk, expert_map=e_map
+        )
+
+    marlin_output = fused_marlin_moe(
         a,
-        qweight1,
-        qweight2,
+        w1_data.qweight,
+        w2_data.qweight,
         None,
         None,
-        scales1,
-        scales2,
+        w1_data.scales,
+        w2_data.scales,
         score,
         topk_weights,
         topk_ids,
         global_num_experts=e,
         expert_map=e_map,
-        global_scale1=global_scale1,
-        global_scale2=global_scale2,
-        g_idx1=g_idx1,
-        g_idx2=g_idx2,
-        sort_indices1=sort_indices1,
-        sort_indices2=sort_indices2,
-        w1_zeros=zeros1,
-        w2_zeros=zeros2,
+        global_scale1=w1_data.global_scale,
+        global_scale2=w2_data.global_scale,
+        g_idx1=w1_data.g_idx,
+        g_idx2=w2_data.g_idx,
+        sort_indices1=w1_data.sort_indices,
+        sort_indices2=w2_data.sort_indices,
+        w1_zeros=w1_data.zeros,
+        w2_zeros=w2_data.zeros,
         quant_type_id=quant_type.id,
-        is_k_full=is_k_full)
+        is_k_full=is_k_full,
+    )
 
     torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0)
 
@@ -729,95 +764,55 @@ def test_fused_marlin_moe_with_bias(m):
     b_bias1 = torch.randn((e, 2 * n), device="cuda", dtype=dtype) / 10
     b_bias2 = torch.randn((e, k), device="cuda", dtype=dtype) / 10
 
-    b_bias1_l = []
-    w_ref1_l = []
-    qweight1_l = []
-    scales1_l = []
-    g_idx1_l = []
-    sort_indices1_l = []
-
-    for i in range(w1.shape[0]):
-        test_perm = torch.randperm(k)
-        w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = \
-            marlin_quantize(w1[i].transpose(1, 0), quant_type,
-                            group_size, act_order, test_perm)
-
-        w_ref1_l.append(w_ref1.T)
-        qweight1_l.append(qweight1)
-        scales1_l.append(scales1)
-        g_idx1_l.append(g_idx1)
-        sort_indices1_l.append(sort_indices1)
-        b_bias1_l.append(marlin_permute_bias(b_bias1[i]))
-
-    w_ref1 = stack_and_dev(w_ref1_l)
-    qweight1 = stack_and_dev(qweight1_l).contiguous()
-    scales1 = stack_and_dev(scales1_l)
-    global_scale1 = None
-    g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None
-    zeros1 = None
-    sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None
-    marlin_bias1 = stack_and_dev(b_bias1_l) if b_bias1_l else None
-
-    b_bias2_l = []
-    w_ref2_l = []
-    qweight2_l = []
-    scales2_l = []
-    g_idx2_l = []
-    sort_indices2_l = []
-
-    for i in range(w2.shape[0]):
-        test_perm = torch.randperm(n)
-        w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = \
-            marlin_quantize(w2[i].transpose(1, 0), quant_type,
-                            group_size, act_order, test_perm)
-
-        w_ref2_l.append(w_ref2.T)
-        qweight2_l.append(qweight2)
-        scales2_l.append(scales2)
-        g_idx2_l.append(g_idx2)
-        sort_indices2_l.append(sort_indices2)
-        b_bias2_l.append(marlin_permute_bias(b_bias2[i]))
-
-    w_ref2 = stack_and_dev(w_ref2_l)
-    qweight2 = stack_and_dev(qweight2_l).contiguous()
-    scales2 = stack_and_dev(scales2_l)
-    global_scale2 = None
-    g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None
-    zeros2 = None
-    sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None
-    marlin_bias2 = stack_and_dev(b_bias2_l) if b_bias2_l else None
+    w1_data = MarlinMoEWeightData.make(
+        w=w1,
+        quant_type=quant_type,
+        group_size=group_size,
+        act_order=act_order,
+        bias=b_bias1,
+    )
+
+    w2_data = MarlinMoEWeightData.make(
+        w=w2,
+        quant_type=quant_type,
+        group_size=group_size,
+        act_order=act_order,
+        bias=b_bias2,
+    )
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
 
     topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
 
     with set_current_vllm_config(vllm_config):
-        torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, b_bias1,
-                                 b_bias2)
+        torch_output = torch_moe(
+            a, w1_data.w_ref, w2_data.w_ref, score, topk, b_bias1, b_bias2
+        )
 
-    marlin_output = torch.ops.vllm.fused_marlin_moe(
+    marlin_output = fused_marlin_moe(
         a,
-        qweight1,
-        qweight2,
-        marlin_bias1,
-        marlin_bias2,
-        scales1,
-        scales2,
+        w1_data.qweight,
+        w2_data.qweight,
+        w1_data.marlin_bias,
+        w2_data.marlin_bias,
+        w1_data.scales,
+        w2_data.scales,
         score,
         topk_weights,
         topk_ids,
         global_num_experts=e,
         expert_map=None,
-        global_scale1=global_scale1,
-        global_scale2=global_scale2,
-        g_idx1=g_idx1,
-        g_idx2=g_idx2,
-        sort_indices1=sort_indices1,
-        sort_indices2=sort_indices2,
-        w1_zeros=zeros1,
-        w2_zeros=zeros2,
+        global_scale1=w1_data.global_scale,
+        global_scale2=w2_data.global_scale,
+        g_idx1=w1_data.g_idx,
+        g_idx2=w2_data.g_idx,
+        sort_indices1=w1_data.sort_indices,
+        sort_indices2=w2_data.sort_indices,
+        w1_zeros=w1_data.zeros,
+        w2_zeros=w2_data.zeros,
         quant_type_id=quant_type.id,
-        is_k_full=is_k_full)
+        is_k_full=is_k_full,
+    )
 
     torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0)
 
@@ -825,34 +820,71 @@ def test_fused_marlin_moe_with_bias(m):
 def test_moe_align_block_size_opcheck():
     num_experts = 4
     block_size = 4
-    topk_ids = torch.randint(0,
-                             num_experts, (3, 4),
-                             dtype=torch.int32,
-                             device='cuda')
+    topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda")
 
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
-    sorted_ids = torch.empty((max_num_tokens_padded, ),
-                             dtype=torch.int32,
-                             device=topk_ids.device)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
     sorted_ids.fill_(topk_ids.numel())
     max_num_m_blocks = max_num_tokens_padded // block_size
-    expert_ids = torch.empty((max_num_m_blocks, ),
-                             dtype=torch.int32,
-                             device=topk_ids.device)
-    num_tokens_post_pad = torch.empty((1),
-                                      dtype=torch.int32,
-                                      device=topk_ids.device)
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+    opcheck(
+        torch.ops._moe_C.moe_align_block_size,
+        (
+            topk_ids,
+            num_experts,
+            block_size,
+            sorted_ids,
+            expert_ids,
+            num_tokens_post_pad,
+        ),
+    )
+
 
-    opcheck(torch.ops._moe_C.moe_align_block_size,
-            (topk_ids, num_experts, block_size, sorted_ids, expert_ids,
-             num_tokens_post_pad))
+def test_batched_moe_align_block_size_opcheck():
+    max_tokens_per_batch = 512
+    num_experts = 4
+    block_size = 16
+
+    expert_num_tokens = torch.randint(
+        low=0,
+        high=max_tokens_per_batch,
+        size=(num_experts,),
+        dtype=torch.int32,
+        device="cuda",
+    )
 
+    max_num_tokens_padded = num_experts * max(max_tokens_per_batch, block_size)
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
 
-@pytest.mark.parametrize("m", [1, 33, 64, 222])
+    assert max_num_tokens_padded % block_size == 0
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
+
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device="cuda")
+
+    opcheck(
+        torch.ops._moe_C.batched_moe_align_block_size,
+        (
+            max_tokens_per_batch,
+            block_size,
+            expert_num_tokens,
+            sorted_ids,
+            expert_ids,
+            num_tokens_post_pad,
+        ),
+    )
+
+
+@pytest.mark.parametrize("m", [1, 33, 222])
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("dtype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
 def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
     input = torch.randn((m, topk, k), device="cuda", dtype=dtype)
@@ -864,3 +896,240 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
     torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0)
 
     opcheck(torch.ops._moe_C.moe_sum, (input, actual))
+
+
+@pytest.mark.parametrize("m", [1, 33])
+@pytest.mark.parametrize("n,k", [(128, 128)])
+@pytest.mark.parametrize("e", [8])
+@pytest.mark.parametrize("topk", [2])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("with_bias", [False, True])
+@pytest.mark.parametrize("activation", ["silu"])
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only test")
+def test_cpu_fused_moe_basic(m, n, k, e, topk, dtype, with_bias, activation):
+    from vllm.model_executor.layers.fused_moe.cpu_fused_moe import CPUFusedMOE
+
+    device = "cpu"
+    torch.manual_seed(7)
+
+    a = torch.randn((m, k), device=device, dtype=dtype) / 10
+    w13 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
+    router_logits = torch.randn((m, e), device=device, dtype=dtype)
+
+    b1 = b2 = None
+    if with_bias:
+        b1 = torch.randn((e, 2 * n), device=device, dtype=dtype) / 10
+        b2 = torch.randn((e, k), device=device, dtype=dtype) / 10
+
+    ref = (
+        torch_moe(a, w13, w2, router_logits, topk, b1, b2)
+        if with_bias
+        else torch_moe(a, w13, w2, router_logits, topk)
+    )
+
+    class _Dummy(torch.nn.Module):
+        def __init__(self, w13, w2, b1=None, b2=None):
+            super().__init__()
+            self.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
+            self.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
+            if b1 is not None:
+                self.w13_bias = torch.nn.Parameter(b1, requires_grad=False)
+            if b2 is not None:
+                self.w2_bias = torch.nn.Parameter(b2, requires_grad=False)
+
+    layer = _Dummy(w13, w2, b1, b2).to(dtype)
+    fused = CPUFusedMOE(layer)
+    out = fused(
+        layer=layer,
+        x=a,
+        use_grouped_topk=False,
+        top_k=topk,
+        router_logits=router_logits,
+        renormalize=False,
+        global_num_experts=e,
+        expert_map=None,
+        custom_routing_function=None,
+        scoring_func="softmax",
+        routed_scaling_factor=1.0,
+        e_score_correction_bias=None,
+        apply_router_weight_on_input=False,
+        activation=activation,
+    )
+
+    # Tolerances: fp32 tight; bf16 looser (esp. with bias)
+    if dtype == torch.float32:
+        atol = 1e-3
+    elif with_bias:
+        atol = 8e-2
+    else:
+        atol = 5e-2
+    torch.testing.assert_close(out, ref, atol=atol, rtol=0)
+
+
+@pytest.mark.parametrize("m", [16, 32, 64])
+@pytest.mark.parametrize("n", [128])
+@pytest.mark.parametrize("k", [128])
+@pytest.mark.parametrize("e", [8, 12, 16, 32])
+@pytest.mark.parametrize("topk", [2, 4])
+@pytest.mark.parametrize("max_tokens_per_batch", [16, 32, 64])
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
+def test_batched_fused_marlin_moe(
+    m: int, n: int, k: int, e: int, topk: int, max_tokens_per_batch: int
+):
+    print(
+        f"testing m={m}, n={n}, k={k}, e={e}, "
+        f"topk={topk}, "
+        f"max_tokens_per_batch={max_tokens_per_batch}"
+    )
+    torch.cuda.manual_seed(0)
+
+    dtype = torch.bfloat16
+    quant_dtype = scalar_types.float4_e2m1f
+    group_size = 32
+
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 20
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 20
+
+    w1_data = MarlinMoEWeightData.make(
+        w=w1, quant_type=quant_dtype, group_size=group_size, act_order=None
+    )
+    w2_data = MarlinMoEWeightData.make(
+        w=w2, quant_type=quant_dtype, group_size=group_size, act_order=None
+    )
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
+
+    class BatchedRun:
+        @staticmethod
+        def _make_expert_num_tokens_cpu(
+            e: int,  # num_experts
+            topk_ids_cpu: torch.Tensor,
+        ) -> torch.Tensor:
+            expert_num_tokens_cpu = torch.zeros((e,), dtype=torch.int32, device="cpu")
+            for topk_id in torch.flatten(topk_ids_cpu):
+                expert_num_tokens_cpu[topk_id] += 1
+            return expert_num_tokens_cpu
+
+        def __init__(
+            self,
+            max_tokens_per_batch: int,
+            num_experts: int,
+            _topk_ids: torch.Tensor,
+            _topk_weights: torch.Tensor,
+        ):
+            self.max_tokens_per_batch = max_tokens_per_batch
+            self.e = num_experts
+            self.topk_ids_cpu = _topk_ids.to("cpu")
+            self.topk_weights_cpu = _topk_weights.to("cpu")
+            self.expert_num_tokens_cpu = self._make_expert_num_tokens_cpu(
+                self.e, self.topk_ids_cpu
+            )
+
+        def is_valid(self):
+            """
+            Return True only if the input can be represented in a Batched
+            format.
+            """
+            return torch.all(self.expert_num_tokens_cpu <= self.max_tokens_per_batch)
+
+        def _scatter(self, hidden_states: torch.Tensor) -> torch.Tensor:
+            hidden_states_cpu = hidden_states.to("cpu")
+            K = hidden_states_cpu.size(1)
+            batched_hidden_states_cpu = torch.empty(
+                (e, max_tokens_per_batch, K),
+                dtype=hidden_states_cpu.dtype,
+                device="cpu",
+            )
+
+            counter_cpu = torch.zeros_like(self.expert_num_tokens_cpu)
+            for t_idx, token in enumerate(hidden_states_cpu):
+                for topk_id in self.topk_ids_cpu[t_idx]:
+                    pos_in_batch = counter_cpu[topk_id]
+                    batched_hidden_states_cpu[topk_id, pos_in_batch] = token
+                    counter_cpu[topk_id] += 1
+            assert torch.allclose(counter_cpu, self.expert_num_tokens_cpu)
+            return batched_hidden_states_cpu.to("cuda")
+
+        def _gather(
+            self, batched_outputs: torch.Tensor, gather_outputs: torch.Tensor
+        ) -> torch.Tensor:
+            batched_outputs_cpu = batched_outputs.to("cpu")
+            gather_outputs_cpu = torch.zeros_like(gather_outputs)
+
+            counter_cpu = torch.zeros((e,), device="cpu", dtype=torch.int32)
+            md = gather_outputs_cpu.size(0)
+            for t_idx in range(md):
+                token = None
+                for topk_id, topk_weight in zip(
+                    self.topk_ids_cpu[t_idx], self.topk_weights_cpu[t_idx]
+                ):
+                    pos_in_batch = counter_cpu[topk_id]
+                    t = batched_outputs_cpu[topk_id, pos_in_batch] * topk_weight
+                    if token is None:
+                        token = t
+                    else:
+                        token += t
+                    counter_cpu[topk_id] += 1
+                assert token is not None
+                gather_outputs_cpu[t_idx] = token
+            gather_outputs.copy_(gather_outputs_cpu)
+            return gather_outputs
+
+        def run(
+            self, hidden_states: torch.Tensor, fused_marlin_moe_kwargs: dict[Any, Any]
+        ) -> torch.Tensor:
+            assert hidden_states.ndim == 2
+            assert self.is_valid()
+
+            batched_hidden_states = self._scatter(hidden_states)
+
+            kwargs = fused_marlin_moe_kwargs | {
+                "hidden_states": batched_hidden_states,
+                "expert_num_tokens": self.expert_num_tokens_cpu.to("cuda"),
+            }
+            batched_outputs = batched_fused_marlin_moe(**kwargs)
+
+            output = torch.zeros_like(hidden_states)
+            output = self._gather(batched_outputs, output)
+            return output
+
+    kwargs = {
+        "w1": w1_data.qweight,
+        "w2": w2_data.qweight,
+        "bias1": None,
+        "bias2": None,
+        "w1_scale": w1_data.scales,
+        "w2_scale": w2_data.scales,
+        "gating_output": score,
+        "global_num_experts": e,
+        "expert_map": None,
+        "global_scale1": w1_data.global_scale,
+        "global_scale2": w2_data.global_scale,
+        "g_idx1": w1_data.g_idx,
+        "g_idx2": w2_data.g_idx,
+        "sort_indices1": w1_data.sort_indices,
+        "sort_indices2": w2_data.sort_indices,
+        "w1_zeros": w1_data.zeros,
+        "w2_zeros": w2_data.zeros,
+        "quant_type_id": quant_dtype.id,
+        "is_k_full": True,
+    }
+
+    # Reference
+    fused_marlin_moe_kwargs = kwargs | {
+        "hidden_states": a,
+        "topk_ids": topk_ids,
+        "topk_weights": topk_weights,
+    }
+    ref_marlin_output = fused_marlin_moe(**fused_marlin_moe_kwargs)
+
+    # Batched
+    br = BatchedRun(max_tokens_per_batch, e, topk_ids, topk_weights)
+    if not br.is_valid():
+        pytest.skip("Cannot represent data in Batched Format.")
+    marlin_output = br.run(a, kwargs)
+
+    torch.testing.assert_close(marlin_output, ref_marlin_output, atol=1e-3, rtol=0)
diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py
index 5dfc8d9fab32b39d61e628b5f5a421aae64d277f..8975f00bd4c6e16b1630f5137bc04e7aa292b0a5 100644
--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -5,15 +5,15 @@
 Run `pytest tests/kernels/moe/test_moe_align_block_size.py`.
 """
 
-from typing import Optional
-
 import pytest
 import torch
 
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size)
+    batched_moe_align_block_size,
+    moe_align_block_size,
+)
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 NUM_TOKENS = [1, 3, 256, 2256, 4096]
 NUM_EXPERTS = [32, 160, 256, 257]
@@ -60,37 +60,40 @@ def _verify_expert_level_sorting(
     in topk_ids in the final sorted_ids however this does not impact quality.
     """
     # Group tokens by expert from the golden implementation
-    golden_expert_tokens = _group_tokens_by_expert(golden_sorted_ids,
-                                                   expert_ids, block_size,
-                                                   valid_length, total_tokens)
+    golden_expert_tokens = _group_tokens_by_expert(
+        golden_sorted_ids, expert_ids, block_size, valid_length, total_tokens
+    )
 
-    actual_expert_tokens = _group_tokens_by_expert(actual_sorted_ids,
-                                                   expert_ids, block_size,
-                                                   valid_length, total_tokens)
+    actual_expert_tokens = _group_tokens_by_expert(
+        actual_sorted_ids, expert_ids, block_size, valid_length, total_tokens
+    )
 
-    assert set(golden_expert_tokens.keys()) == set(
-        actual_expert_tokens.keys()), (
-            f"Expert IDs mismatch: golden={set(golden_expert_tokens.keys())}, "
-            f"actual={set(actual_expert_tokens.keys())}")
+    assert set(golden_expert_tokens.keys()) == set(actual_expert_tokens.keys()), (
+        f"Expert IDs mismatch: golden={set(golden_expert_tokens.keys())}, "
+        f"actual={set(actual_expert_tokens.keys())}"
+    )
 
     for expert_id in golden_expert_tokens:
-        golden_tokens = torch.tensor(golden_expert_tokens[expert_id],
-                                     device=actual_sorted_ids.device)
-        actual_tokens = torch.tensor(actual_expert_tokens[expert_id],
-                                     device=actual_sorted_ids.device)
+        golden_tokens = torch.tensor(
+            golden_expert_tokens[expert_id], device=actual_sorted_ids.device
+        )
+        actual_tokens = torch.tensor(
+            actual_expert_tokens[expert_id], device=actual_sorted_ids.device
+        )
         assert torch.equal(
-            torch.sort(golden_tokens)[0],
-            torch.sort(actual_tokens)[0]), (
-                f"Expert {expert_id} token mismatch: "
-                f"golden={golden_expert_tokens[expert_id]}, "
-                f"actual={actual_expert_tokens[expert_id]}")
+            torch.sort(golden_tokens)[0], torch.sort(actual_tokens)[0]
+        ), (
+            f"Expert {expert_id} token mismatch: "
+            f"golden={golden_expert_tokens[expert_id]}, "
+            f"actual={actual_expert_tokens[expert_id]}"
+        )
 
 
 def torch_moe_align_block_size(
     topk_ids: torch.Tensor,
     block_size: int,
     num_experts: int,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     pad_sorted_ids: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
@@ -104,40 +107,38 @@ def torch_moe_align_block_size(
     if pad_sorted_ids:
         max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
 
-    flattened_token_indices = torch.arange(topk_ids.numel(),
-                                           device=topk_ids.device,
-                                           dtype=torch.int32)
+    flattened_token_indices = torch.arange(
+        topk_ids.numel(), device=topk_ids.device, dtype=torch.int32
+    )
     flattened_expert_ids = topk_ids.flatten()
-    sorted_expert_ids, sort_indices = torch.sort(flattened_expert_ids,
-                                                 stable=True)
+    sorted_expert_ids, sort_indices = torch.sort(flattened_expert_ids, stable=True)
     sorted_token_indices = flattened_token_indices[sort_indices]
 
-    expert_token_counts = torch.zeros(num_experts,
-                                      dtype=torch.int64,
-                                      device=topk_ids.device)
+    expert_token_counts = torch.zeros(
+        num_experts, dtype=torch.int64, device=topk_ids.device
+    )
     for expert_id in range(num_experts):
         mask = sorted_expert_ids == expert_id
         expert_token_counts[expert_id] = mask.sum()
 
-    expert_padded_counts = torch.zeros(num_experts,
-                                       dtype=torch.int64,
-                                       device=topk_ids.device)
+    expert_padded_counts = torch.zeros(
+        num_experts, dtype=torch.int64, device=topk_ids.device
+    )
     for expert_id in range(num_experts):
         original_count = expert_token_counts[expert_id]
         if original_count > 0:
             expert_padded_counts[expert_id] = (
-                (original_count + block_size - 1) // block_size) * block_size
+                (original_count + block_size - 1) // block_size
+            ) * block_size
 
     sorted_token_ids = torch.full(
-        (max_num_tokens_padded, ),
+        (max_num_tokens_padded,),
         topk_ids.numel(),
         dtype=torch.int32,
         device=topk_ids.device,
     )
     max_num_blocks = (max_num_tokens_padded + block_size - 1) // block_size
-    expert_ids = torch.zeros(max_num_blocks,
-                             dtype=torch.int32,
-                             device=topk_ids.device)
+    expert_ids = torch.zeros(max_num_blocks, dtype=torch.int32, device=topk_ids.device)
 
     current_pos = 0
     current_block = 0
@@ -147,20 +148,20 @@ def torch_moe_align_block_size(
         num_expert_tokens = expert_tokens.shape[0]
 
         if num_expert_tokens > 0:
-            sorted_token_ids[current_pos:current_pos +
-                             num_expert_tokens] = (expert_tokens)
+            sorted_token_ids[current_pos : current_pos + num_expert_tokens] = (
+                expert_tokens
+            )
 
             expert_blocks_needed = expert_padded_counts[expert_id] // block_size
-            expert_ids[current_block:current_block +
-                       expert_blocks_needed] = (expert_id)
+            expert_ids[current_block : current_block + expert_blocks_needed] = expert_id
 
             current_pos += expert_padded_counts[expert_id]
             current_block += expert_blocks_needed
 
     total_padded_tokens = expert_padded_counts.sum()
-    num_tokens_post_pad = torch.tensor([total_padded_tokens],
-                                       dtype=torch.int32,
-                                       device=topk_ids.device)
+    num_tokens_post_pad = torch.tensor(
+        [total_padded_tokens], dtype=torch.int32, device=topk_ids.device
+    )
 
     if expert_map is not None:
         expert_ids = expert_map[expert_ids]
@@ -173,37 +174,32 @@ def torch_moe_align_block_size(
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
 @pytest.mark.parametrize("pad_sorted_ids", [False, True])
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
-def test_moe_align_block_size(m: int, topk: int, num_experts: int,
-                              block_size: int, pad_sorted_ids: bool):
+def test_moe_align_block_size(
+    m: int, topk: int, num_experts: int, block_size: int, pad_sorted_ids: bool
+):
     """Test moe_align_block_size without expert mapping"""
     topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
     for i in range(m):
         experts = torch.randperm(num_experts, device="cuda")[:topk]
         topk_ids[i] = experts
 
-    actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
-        moe_align_block_size(
-            topk_ids=topk_ids,
-            block_size=block_size,
-            num_experts=num_experts,
-            pad_sorted_ids=pad_sorted_ids,
-        ))
+    actual_sorted_ids, actual_expert_ids, actual_num_tokens = moe_align_block_size(
+        topk_ids=topk_ids,
+        block_size=block_size,
+        num_experts=num_experts,
+        pad_sorted_ids=pad_sorted_ids,
+    )
     golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
         torch_moe_align_block_size(
             topk_ids=topk_ids,
             block_size=block_size,
             num_experts=num_experts,
             pad_sorted_ids=pad_sorted_ids,
-        ))
+        )
+    )
 
-    torch.testing.assert_close(actual_num_tokens,
-                               golden_num_tokens,
-                               atol=0,
-                               rtol=0)
-    torch.testing.assert_close(actual_expert_ids,
-                               golden_expert_ids,
-                               atol=0,
-                               rtol=0)
+    torch.testing.assert_close(actual_num_tokens, golden_num_tokens, atol=0, rtol=0)
+    torch.testing.assert_close(actual_expert_ids, golden_expert_ids, atol=0, rtol=0)
 
     # For sorted_token_ids, verify block-level correctness rather than exact
     # order Tokens within each expert's blocks can be in any order, but expert
@@ -219,16 +215,18 @@ def test_moe_align_block_size(m: int, topk: int, num_experts: int,
 
     total_tokens = m * topk
     assert actual_num_tokens.item() % block_size == 0, (
-        "num_tokens_post_pad should be divisible by block_size")
+        "num_tokens_post_pad should be divisible by block_size"
+    )
     assert actual_num_tokens.item() >= total_tokens, (
-        "num_tokens_post_pad should be at least total_tokens")
+        "num_tokens_post_pad should be at least total_tokens"
+    )
     valid_tokens = actual_sorted_ids[actual_sorted_ids < total_tokens]
     assert len(valid_tokens) == total_tokens, (
-        f"Should have exactly {total_tokens} valid tokens, "
-        f"got {len(valid_tokens)}")
-    assert (actual_expert_ids >= 0).all() and (
-        actual_expert_ids
-        < num_experts).all(), "expert_ids should contain valid expert indices"
+        f"Should have exactly {total_tokens} valid tokens, got {len(valid_tokens)}"
+    )
+    assert (actual_expert_ids >= 0).all() and (actual_expert_ids < num_experts).all(), (
+        "expert_ids should contain valid expert indices"
+    )
 
 
 @pytest.mark.parametrize("m", [16, 32])
@@ -236,46 +234,37 @@ def test_moe_align_block_size(m: int, topk: int, num_experts: int,
 @pytest.mark.parametrize("num_experts", [8])
 @pytest.mark.parametrize("block_size", [64])
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
-def test_moe_align_block_size_with_expert_map(m: int, topk: int,
-                                              num_experts: int,
-                                              block_size: int):
+def test_moe_align_block_size_with_expert_map(
+    m: int, topk: int, num_experts: int, block_size: int
+):
     """Test moe_align_block_size with expert mapping (EP scenario)"""
     topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
     for i in range(m):
         experts = torch.randperm(num_experts, device="cuda")[:topk]
         topk_ids[i] = experts
 
-    expert_map = torch.full((num_experts, ),
-                            -1,
-                            device="cuda",
-                            dtype=torch.int32)
+    expert_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
     local_experts = list(range(0, num_experts, 2))
     for i, expert_id in enumerate(local_experts):
         expert_map[expert_id] = i
 
-    actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
-        moe_align_block_size(
-            topk_ids=topk_ids,
-            block_size=block_size,
-            num_experts=num_experts,
-            expert_map=expert_map,
-        ))
+    actual_sorted_ids, actual_expert_ids, actual_num_tokens = moe_align_block_size(
+        topk_ids=topk_ids,
+        block_size=block_size,
+        num_experts=num_experts,
+        expert_map=expert_map,
+    )
     golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
         torch_moe_align_block_size(
             topk_ids=topk_ids,
             block_size=block_size,
             num_experts=num_experts,
             expert_map=expert_map,
-        ))
-
-    torch.testing.assert_close(actual_num_tokens,
-                               golden_num_tokens,
-                               atol=0,
-                               rtol=0)
-    torch.testing.assert_close(actual_expert_ids,
-                               golden_expert_ids,
-                               atol=0,
-                               rtol=0)
+        )
+    )
+
+    torch.testing.assert_close(actual_num_tokens, golden_num_tokens, atol=0, rtol=0)
+    torch.testing.assert_close(actual_expert_ids, golden_expert_ids, atol=0, rtol=0)
     _verify_expert_level_sorting(
         actual_sorted_ids,
         golden_sorted_ids,
@@ -290,26 +279,118 @@ def test_moe_align_block_size_deterministic():
     m, topk, num_experts, block_size = 128, 2, 32, 64
 
     torch.manual_seed(42)
-    topk_ids = torch.randint(0,
-                             num_experts, (m, topk),
-                             device="cuda",
-                             dtype=torch.int32)
+    topk_ids = torch.randint(
+        0, num_experts, (m, topk), device="cuda", dtype=torch.int32
+    )
 
     # expect the results to be reproducible
     results = []
     for _ in range(5):
         sorted_ids, expert_ids, num_tokens = moe_align_block_size(
-            topk_ids=topk_ids, block_size=block_size, num_experts=num_experts)
-        results.append(
-            (sorted_ids.clone(), expert_ids.clone(), num_tokens.clone()))
+            topk_ids=topk_ids, block_size=block_size, num_experts=num_experts
+        )
+        results.append((sorted_ids.clone(), expert_ids.clone(), num_tokens.clone()))
 
     for i in range(1, len(results)):
-        assert torch.equal(
-            results[0][0],
-            results[i][0]), ("sorted_ids should be deterministic")
-        assert torch.equal(
-            results[0][1],
-            results[i][1]), ("expert_ids should be deterministic")
-        assert torch.equal(
-            results[0][2],
-            results[i][2]), ("num_tokens should be deterministic")
+        assert torch.equal(results[0][0], results[i][0]), (
+            "sorted_ids should be deterministic"
+        )
+        assert torch.equal(results[0][1], results[i][1]), (
+            "expert_ids should be deterministic"
+        )
+        assert torch.equal(results[0][2], results[i][2]), (
+            "num_tokens should be deterministic"
+        )
+
+
+@pytest.mark.parametrize("max_tokens_per_batch", [13, 16, 512])
+@pytest.mark.parametrize("num_experts", [8, 16, 32, 64])
+@pytest.mark.parametrize("block_size", [8, 16, 32, 64])
+@pytest.mark.parametrize("simulate_empty_batches", [False, True])
+def test_batched_moe_align_block_size(
+    max_tokens_per_batch: int,
+    num_experts: int,
+    block_size: int,
+    simulate_empty_batches: bool,
+):
+    def ref_outputs(
+        expert_num_tokens: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        E = expert_num_tokens.size(0)
+
+        # Round up so each batch can be split to blocks evenly.
+        Msum = round_up(max_tokens_per_batch, block_size) * E
+        ref_sorted_ids = torch.empty((Msum,), dtype=torch.int32)
+        ref_expert_ids = torch.empty((Msum // block_size,), dtype=torch.int32)
+        ref_num_tokens_post_pad = torch.empty((1,), dtype=torch.int32)
+
+        # Intialize
+        sentinel = E * max_tokens_per_batch
+        ref_sorted_ids.fill_(sentinel)
+        ref_expert_ids.fill_(-1)
+
+        # Fill ref_sorted_ids
+        i = 0
+        for expert_id, expert_nt in enumerate(expert_num_tokens):
+            token_offset = expert_id * max_tokens_per_batch
+            for j in range(expert_nt):
+                ref_sorted_ids[i] = token_offset + j
+                i += 1
+            # round up i to the next block_size
+            i = round_up(i, block_size)
+
+        ref_num_tokens_post_pad[0] = i
+
+        # Fill expert_ids
+        nt_ceil_sum = 0
+        for expert_id, expert_nt in enumerate(expert_num_tokens):
+            expert_ids_offset = nt_ceil_sum // block_size
+            ceil_expert_nt = round_up(int(expert_nt.item()), block_size)
+            num_blocks = ceil_expert_nt // block_size
+            for x in range(num_blocks):
+                ref_expert_ids[expert_ids_offset + x] = expert_id
+            nt_ceil_sum += ceil_expert_nt
+
+        return (
+            ref_sorted_ids.to("cuda"),
+            ref_expert_ids.to("cuda"),
+            ref_num_tokens_post_pad.to("cuda"),
+        )
+
+    # Compute expert_num_tokens
+    expert_num_tokens = torch.randint(
+        low=0,
+        high=max_tokens_per_batch,
+        size=(num_experts,),
+        device="cpu",
+        dtype=torch.int32,
+    )
+    if simulate_empty_batches:
+        # mark half the batches to have 0 tokens
+        zero_batches = torch.randperm(num_experts)[: num_experts // 2]
+        expert_num_tokens[zero_batches] = 0
+
+    # ref outputs
+    ref_sorted_ids, ref_expert_ids, ref_num_tokens_post_pad = ref_outputs(
+        expert_num_tokens
+    )
+
+    # outputs
+    sorted_ids, expert_ids, num_tokens_post_pad = batched_moe_align_block_size(
+        max_tokens_per_batch, block_size, expert_num_tokens.to("cuda")
+    )
+
+    assert ref_sorted_ids.size() == sorted_ids.size(), (
+        f"{ref_sorted_ids.size()} vs {sorted_ids.size()}"
+    )
+    assert ref_expert_ids.size() == expert_ids.size(), (
+        f"{ref_expert_ids.size()} vs {expert_ids.size()}"
+    )
+    assert ref_num_tokens_post_pad.size() == num_tokens_post_pad.size(), (
+        f"{ref_num_tokens_post_pad.size()} vs {num_tokens_post_pad.size()}"
+    )
+    torch.testing.assert_close(ref_sorted_ids, sorted_ids, atol=0, rtol=0)
+    torch.testing.assert_close(ref_expert_ids, expert_ids, atol=0, rtol=0)
+    torch.testing.assert_close(
+        ref_num_tokens_post_pad, num_tokens_post_pad, atol=0, rtol=0
+    )
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index d71664d94b9c8de6f488f8d9774685afcfa7b6ad..ba1f657b3ecdaf3fc7dd80e50cfbb8a19e76d566 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -5,8 +5,6 @@
 Run `pytest tests/kernels/test_moe_permute_unpermute.py`.
 """
 
-from typing import Optional
-
 import numpy as np
 import pytest
 import torch
@@ -14,7 +12,10 @@ import torch
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    moe_permute, moe_permute_unpermute_supported, moe_unpermute)
+    moe_permute,
+    moe_permute_unpermute_supported,
+    moe_unpermute,
+)
 from vllm.platforms import current_platform
 
 NUM_EXPERTS = [16, 64, 256]
@@ -24,35 +25,34 @@ current_platform.seed_everything(0)
 
 
 def torch_permute(
-        hidden_states: torch.Tensor,
-        topk_ids: torch.Tensor,
-        #   token_expert_indices: torch.Tensor,
-        topk: int,
-        n_expert: int,
-        n_local_expert: int,
-        start_expert: int,
-        expert_map: Optional[torch.Tensor] = None,
-        align_block_size: Optional[int] = None,
-        fill_invalid_expert: int = -1) -> list[torch.Tensor]:
+    hidden_states: torch.Tensor,
+    topk_ids: torch.Tensor,
+    #   token_expert_indices: torch.Tensor,
+    topk: int,
+    n_expert: int,
+    n_local_expert: int,
+    start_expert: int,
+    expert_map: torch.Tensor | None = None,
+    align_block_size: int | None = None,
+    fill_invalid_expert: int = -1,
+) -> list[torch.Tensor]:
     n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
     if expert_map is not None:
-        is_local_expert = (expert_map[topk_ids] != -1)
-        not_local_expert = (expert_map[topk_ids] == -1)
-        topk_ids = is_local_expert * (
-            topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
-    token_expert_indices = torch.arange(0,
-                                        n_token * topk,
-                                        dtype=torch.int32,
-                                        device=hidden_states.device).reshape(
-                                            (n_token, topk))
+        is_local_expert = expert_map[topk_ids] != -1
+        not_local_expert = expert_map[topk_ids] == -1
+        topk_ids = is_local_expert * (topk_ids - start_expert) + not_local_expert * (
+            topk_ids + n_expert
+        )
+    token_expert_indices = torch.arange(
+        0, n_token * topk, dtype=torch.int32, device=hidden_states.device
+    ).reshape((n_token, topk))
 
-    sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
-                                                 stable=True)
+    sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(), stable=True)
     dst_row_id2src_row_id_map = token_expert_indices.flatten()[sorted_indices]
 
-    expert_first_token_offset = torch.zeros(n_local_expert + 1,
-                                            dtype=torch.int64,
-                                            device="cuda")
+    expert_first_token_offset = torch.zeros(
+        n_local_expert + 1, dtype=torch.int64, device="cuda"
+    )
     idx = 0
     for i in range(0, n_local_expert):
         cnt = 0
@@ -64,116 +64,133 @@ def torch_permute(
     _, src2dst_idx = torch.sort(dst_row_id2src_row_id_map)
     valid_row_idx = []
     if align_block_size is None:
-
-        permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
-                                               topk, ...]
+        permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map // topk, ...]
         permuted_row_size = permuted_hidden_states.shape[0]
-        m_indices = torch.empty(permuted_row_size,
-                                device="cuda",
-                                dtype=torch.int32).fill_(fill_invalid_expert)
+        m_indices = torch.empty(
+            permuted_row_size, device="cuda", dtype=torch.int32
+        ).fill_(fill_invalid_expert)
         for i in range(1, n_local_expert + 1):
             first_token_offset = expert_first_token_offset[i - 1]
             last_token_offset = expert_first_token_offset[i]
             m_indices[first_token_offset:last_token_offset] = i - 1
         src_row_id2dst_row_id_map = torch.arange(
-            0, n_token * topk, device="cuda",
-            dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
+            0, n_token * topk, device="cuda", dtype=torch.int32
+        )[src2dst_idx].reshape((n_token, topk))
         valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
-        dst_row_id2src_row_id_map[
-            expert_first_token_offset[-1]:] = n_token * topk
+        dst_row_id2src_row_id_map[expert_first_token_offset[-1] :] = n_token * topk
         return [
-            permuted_hidden_states, expert_first_token_offset,
-            src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
-            valid_row_idx
+            permuted_hidden_states,
+            expert_first_token_offset,
+            src_row_id2dst_row_id_map,
+            dst_row_id2src_row_id_map,
+            m_indices,
+            valid_row_idx,
         ]
     else:
-        permuted_row_size = (topk * n_token + n_expert *
-                             (align_block_size - 1) + align_block_size -
-                             1) // align_block_size * align_block_size
-        permuted_idx = torch.full((permuted_row_size, ),
-                                  n_token * topk,
-                                  dtype=torch.int32,
-                                  device=hidden_states.device)
-        permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
-                                             device="cuda",
-                                             dtype=hidden_states.dtype)
-        align_src_row_id2dst_row_id = torch.empty(n_token * topk,
-                                                  device="cuda",
-                                                  dtype=torch.int32)
-        align_expert_first_token_offset = torch.zeros_like(
-            expert_first_token_offset)
-        m_indices = torch.empty(permuted_row_size,
-                                device="cuda",
-                                dtype=torch.int32).fill_(fill_invalid_expert)
+        permuted_row_size = (
+            (topk * n_token + n_expert * (align_block_size - 1) + align_block_size - 1)
+            // align_block_size
+            * align_block_size
+        )
+        permuted_idx = torch.full(
+            (permuted_row_size,),
+            n_token * topk,
+            dtype=torch.int32,
+            device=hidden_states.device,
+        )
+        permuted_hidden_states = torch.empty(
+            (permuted_row_size, n_hidden), device="cuda", dtype=hidden_states.dtype
+        )
+        align_src_row_id2dst_row_id = torch.empty(
+            n_token * topk, device="cuda", dtype=torch.int32
+        )
+        align_expert_first_token_offset = torch.zeros_like(expert_first_token_offset)
+        m_indices = torch.empty(
+            permuted_row_size, device="cuda", dtype=torch.int32
+        ).fill_(fill_invalid_expert)
         # get align_permuted_hidden_states,
         # valid row_idx and align_expert_first_token_offset
         for i in range(1, n_local_expert + 1):
             first_token_offset = expert_first_token_offset[i - 1]
             last_token_offset = expert_first_token_offset[i]
             n_token_in_expert = last_token_offset - first_token_offset
-            align_expert_first_token_offset[
-                i] = align_expert_first_token_offset[
-                    i - 1] + (n_token_in_expert + align_block_size -
-                              1) // align_block_size * align_block_size
+            align_expert_first_token_offset[i] = (
+                align_expert_first_token_offset[i - 1]
+                + (n_token_in_expert + align_block_size - 1)
+                // align_block_size
+                * align_block_size
+            )
             align_first_token_offset = align_expert_first_token_offset[i - 1]
             align_last_token_offset = align_expert_first_token_offset[i]
             dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
-                first_token_offset:first_token_offset + n_token_in_expert]
+                first_token_offset : first_token_offset + n_token_in_expert
+            ]
             # store token in current expert with align_first_token_offset
-            permuted_hidden_states[align_first_token_offset:\
-                                   align_first_token_offset+n_token_in_expert,\
-                                      ...] = hidden_states[\
-                                       dst_row_id2src_row_id_in_expert // topk,\
-                                          ...]
-            permuted_idx[align_first_token_offset:\
-                         align_first_token_offset+\
-                         n_token_in_expert] = dst_row_id2src_row_id_in_expert
+            permuted_hidden_states[
+                align_first_token_offset : align_first_token_offset + n_token_in_expert,
+                ...,
+            ] = hidden_states[dst_row_id2src_row_id_in_expert // topk, ...]
+            permuted_idx[
+                align_first_token_offset : align_first_token_offset + n_token_in_expert
+            ] = dst_row_id2src_row_id_in_expert
             # set current expert m_indices
             m_indices[align_first_token_offset:align_last_token_offset] = i - 1
             valid_row_idx += [
-                i for i in range(align_first_token_offset,
-                                 align_first_token_offset + n_token_in_expert)
+                i
+                for i in range(
+                    align_first_token_offset,
+                    align_first_token_offset + n_token_in_expert,
+                )
             ]
         # get align_src_row_id2dst_row_id
         for i in range(n_token * topk):
             eid = sorted_topk_ids[i]
-            if (eid >= n_local_expert):
+            if eid >= n_local_expert:
                 # check token not in local expert
-                align_src_row_id2dst_row_id[
-                    i] = align_expert_first_token_offset[-1]
+                align_src_row_id2dst_row_id[i] = align_expert_first_token_offset[-1]
                 continue
             first_token_offset = expert_first_token_offset[eid]
             align_first_token_offset = align_expert_first_token_offset[eid]
             token_offset = i - first_token_offset
-            align_src_row_id2dst_row_id[
-                i] = align_first_token_offset + token_offset
-        align_src_row_id2dst_row_id = align_src_row_id2dst_row_id[\
-            src2dst_idx].reshape((n_token, topk))
+            align_src_row_id2dst_row_id[i] = align_first_token_offset + token_offset
+        align_src_row_id2dst_row_id = align_src_row_id2dst_row_id[src2dst_idx].reshape(
+            (n_token, topk)
+        )
         return [
-            permuted_hidden_states, align_expert_first_token_offset,
-            align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
+            permuted_hidden_states,
+            align_expert_first_token_offset,
+            align_src_row_id2dst_row_id,
+            permuted_idx,
+            m_indices,
+            valid_row_idx,
         ]
 
 
-def torch_unpermute(permuted_hidden_states: torch.Tensor,
-                    topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                    token_expert_indices: torch.Tensor,
-                    src_row_id2dst_row_id_map: torch.Tensor,
-                    valid_row_idx: torch.Tensor, topk: int,
-                    n_expert: int) -> torch.Tensor:
+def torch_unpermute(
+    permuted_hidden_states: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    src_row_id2dst_row_id_map: torch.Tensor,
+    valid_row_idx: torch.Tensor,
+    topk: int,
+    n_expert: int,
+) -> torch.Tensor:
     # ignore invalid row
     n_hidden = permuted_hidden_states.shape[1]
-    mask = torch.zeros(permuted_hidden_states.shape[0],
-                       dtype=bool,
-                       device="cuda")
+    mask = torch.zeros(permuted_hidden_states.shape[0], dtype=bool, device="cuda")
     mask[valid_row_idx] = True
     permuted_hidden_states[~mask] = 0
 
     permuted_hidden_states = permuted_hidden_states[
-        src_row_id2dst_row_id_map.flatten(), ...]
+        src_row_id2dst_row_id_map.flatten(), ...
+    ]
     permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
-    output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
-        permuted_hidden_states.dtype)
+    output = (
+        (permuted_hidden_states * topk_weights.unsqueeze(2))
+        .sum(1)
+        .to(permuted_hidden_states.dtype)
+    )
     return output
 
 
@@ -184,59 +201,76 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("ep_size", EP_SIZE)
 @pytest.mark.parametrize("align_block_size", [None, 128])
-def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
-                               n_expert: int, ep_size: int, dtype: torch.dtype,
-                               align_block_size: Optional[int]):
+def test_moe_permute_unpermute(
+    n_token: int,
+    n_hidden: int,
+    topk: int,
+    n_expert: int,
+    ep_size: int,
+    dtype: torch.dtype,
+    align_block_size: int | None,
+):
     if not moe_permute_unpermute_supported():
         pytest.skip("moe_permute_unpermute is not supported on this platform.")
     fill_invalid_expert = 0
     ep_rank = np.random.randint(0, ep_size)
     expert_map = None
     n_local_expert = n_expert
-    if (ep_size != 1):
-        n_local_expert, expert_map = determine_expert_map(
-            ep_size, ep_rank, n_expert)
+    if ep_size != 1:
+        n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert)
         expert_map = expert_map.cuda()
     start_expert = n_local_expert * ep_rank
     current_platform.seed_everything(0)
     hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype)
     gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
     topk_weights, topk_ids, token_expert_indices = fused_topk(
-        hidden_states, gating_output, topk, False)
-    (gold_permuted_hidden_states, gold_expert_first_token_offset,
-     gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
-     valid_row_idx) = torch_permute(
-         hidden_states,
-         topk_ids,
-         # token_expert_indices,
-         topk,
-         n_expert,
-         n_local_expert,
-         start_expert,
-         expert_map=expert_map,
-         align_block_size=align_block_size,
-         fill_invalid_expert=fill_invalid_expert)
+        hidden_states, gating_output, topk, False
+    )
+    (
+        gold_permuted_hidden_states,
+        gold_expert_first_token_offset,
+        gold_inv_permuted_idx,
+        gold_permuted_idx,
+        gold_m_indices,
+        valid_row_idx,
+    ) = torch_permute(
+        hidden_states,
+        topk_ids,
+        # token_expert_indices,
+        topk,
+        n_expert,
+        n_local_expert,
+        start_expert,
+        expert_map=expert_map,
+        align_block_size=align_block_size,
+        fill_invalid_expert=fill_invalid_expert,
+    )
 
-    (permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
-     m_indices) = moe_permute(hidden_states=hidden_states,
-                              a1q_scale=None,
-                              topk_ids=topk_ids,
-                              n_expert=n_expert,
-                              n_local_expert=n_local_expert,
-                              expert_map=expert_map,
-                              align_block_size=align_block_size,
-                              fill_invalid_expert=fill_invalid_expert)
+    (
+        permuted_hidden_states,
+        _,
+        expert_first_token_offset,
+        inv_permuted_idx,
+        m_indices,
+    ) = moe_permute(
+        hidden_states=hidden_states,
+        a1q_scale=None,
+        topk_ids=topk_ids,
+        n_expert=n_expert,
+        n_local_expert=n_local_expert,
+        expert_map=expert_map,
+        align_block_size=align_block_size,
+        fill_invalid_expert=fill_invalid_expert,
+    )
 
     # check expert_first_token_offset
-    torch.testing.assert_close(gold_expert_first_token_offset,
-                               expert_first_token_offset,
-                               atol=0,
-                               rtol=0)
+    torch.testing.assert_close(
+        gold_expert_first_token_offset, expert_first_token_offset, atol=0, rtol=0
+    )
     # check src_row_id2dst_row_id_map
-    torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
-                               inv_permuted_idx,
-                               atol=0,
-                               rtol=0)
+    torch.testing.assert_close(
+        gold_inv_permuted_idx.flatten(), inv_permuted_idx, atol=0, rtol=0
+    )
     # check mindice
     # current kernel usage assumes deepgemm requires align_block_size
     # when it's not provided then we don't compute m_indices (for cutlass)
@@ -244,19 +278,28 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
         torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
 
     # check permuted_hidden_states, only valid token
-    torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
-                               permuted_hidden_states[valid_row_idx],
-                               atol=0,
-                               rtol=0)
+    torch.testing.assert_close(
+        gold_permuted_hidden_states[valid_row_idx],
+        permuted_hidden_states[valid_row_idx],
+        atol=0,
+        rtol=0,
+    )
     # add a random tensor to simulate group gemm
-    result0 = 0.5 * permuted_hidden_states + torch.randn_like(
-        permuted_hidden_states)
+    result0 = 0.5 * permuted_hidden_states + torch.randn_like(permuted_hidden_states)
     result4 = torch.empty_like(hidden_states)
-    moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
-                  expert_first_token_offset)
+    moe_unpermute(
+        result4, result0, topk_weights, inv_permuted_idx, expert_first_token_offset
+    )
 
-    gold4 = torch_unpermute(result0, topk_weights, topk_ids,
-                            token_expert_indices, inv_permuted_idx,
-                            valid_row_idx, topk, n_local_expert)
+    gold4 = torch_unpermute(
+        result0,
+        topk_weights,
+        topk_ids,
+        token_expert_indices,
+        inv_permuted_idx,
+        valid_row_idx,
+        topk,
+        n_local_expert,
+    )
     # check unpermuted hidden
     torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)
diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py
index a48bfeb10b2e6828b9ba36818fb3922a73ca34be..aa544fe0e0f63c93927bcf3d13ccf9469ecefa56 100644
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -4,9 +4,11 @@ import pytest
 import torch
 
 from tests.kernels.moe.utils import make_test_weights
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX,
-                                                    dequantize_nvfp4_to_dtype)
+from tests.kernels.quantization.nvfp4_utils import (
+    FLOAT4_E2M1_MAX,
+    FLOAT8_E4M3_MAX,
+    dequantize_nvfp4_to_dtype,
+)
 from tests.kernels.utils import torch_moe
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
@@ -16,16 +18,15 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.platforms import current_platform
 
 if not current_platform.has_device_capability(100):
-    pytest.skip("Nvfp4 Requires compute capability of 10 or above.",
-                allow_module_level=True)
+    pytest.skip(
+        "Nvfp4 Requires compute capability of 10 or above.", allow_module_level=True
+    )
 
 MNK_FACTORS = [
     (2, 1024, 1024),
     (2, 1024, 1536),
     (2, 3072, 1024),
-    (2, 3072, 1536),
     (64, 1024, 1024),
-    (64, 1024, 1536),
     (64, 3072, 1024),
     (64, 2048, 1536),
     (224, 1024, 1024),
@@ -36,38 +37,36 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", [40, 64, 256])
 @pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @torch.inference_mode()
-def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
-                                  dtype: torch.dtype):
+def test_cutlass_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
     current_platform.seed_everything(7)
     with set_current_vllm_config(
-            VllmConfig(parallel_config=ParallelConfig(
-                pipeline_parallel_size=1))):
-
+        VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
+    ):
         quant_blocksize = 16
 
         a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
 
-        (_, w1_q, w1_blockscale,
-         w1_gs), (_, w2_q, w2_blockscale, w2_gs) = make_test_weights(
-             e,
-             n,
-             k,
-             in_dtype=dtype,
-             quant_dtype="nvfp4",
-             block_shape=None,  # use quant_blocksize?
-             per_out_ch_quant=False,
-         )
+        (_, w1_q, w1_blockscale, w1_gs), (_, w2_q, w2_blockscale, w2_gs) = (
+            make_test_weights(
+                e,
+                n,
+                k,
+                in_dtype=dtype,
+                quant_dtype="nvfp4",
+                block_shape=None,  # use quant_blocksize?
+                per_out_ch_quant=False,
+            )
+        )
 
         score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids, _ = fused_topk(a,
-                                               score,
-                                               topk,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
 
-        a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32)
-        a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32)
+        a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
+        a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
 
         assert w1_gs is not None
         assert w2_gs is not None
@@ -97,40 +96,44 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
         )
 
         # Reference check:
-        a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                          torch.amax(a.flatten(), dim=-1)).to(torch.float32)
+        a_global_scale = (
+            (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)
+        ).to(torch.float32)
         a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale)
 
-        a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4,
-                                               a_scale_interleaved,
-                                               a_global_scale,
-                                               dtype=a.dtype,
-                                               device=a.device,
-                                               block_size=quant_blocksize)
+        a_in_dtype = dequantize_nvfp4_to_dtype(
+            a_fp4,
+            a_scale_interleaved,
+            a_global_scale,
+            dtype=a.dtype,
+            device=a.device,
+            block_size=quant_blocksize,
+        )
 
         w1_d = torch.empty((e, 2 * n, k), device="cuda", dtype=dtype)
         w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype)
 
         for idx in range(0, e):
-            w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx],
-                                                  w1_blockscale[idx],
-                                                  w1_gs[idx],
-                                                  dtype=dtype,
-                                                  device=w1_q.device,
-                                                  block_size=quant_blocksize)
-            w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx],
-                                                  w2_blockscale[idx],
-                                                  w2_gs[idx],
-                                                  dtype=dtype,
-                                                  device=w2_q.device,
-                                                  block_size=quant_blocksize)
+            w1_d[idx] = dequantize_nvfp4_to_dtype(
+                w1_q[idx],
+                w1_blockscale[idx],
+                w1_gs[idx],
+                dtype=dtype,
+                device=w1_q.device,
+                block_size=quant_blocksize,
+            )
+            w2_d[idx] = dequantize_nvfp4_to_dtype(
+                w2_q[idx],
+                w2_blockscale[idx],
+                w2_gs[idx],
+                dtype=dtype,
+                device=w2_q.device,
+                block_size=quant_blocksize,
+            )
 
         torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk)
 
-        torch.testing.assert_close(torch_output,
-                                   cutlass_output,
-                                   atol=1e-1,
-                                   rtol=1e-1)
+        torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py
similarity index 55%
rename from tests/kernels/moe/test_mxfp4_moe.py
rename to tests/kernels/moe/test_ocp_mx_moe.py
index 61d3311cc1624f5f63ad158ff9217e147ac43391..91b508d4163cc6db147c3e49c4fa3ae2bcb7f542 100644
--- a/tests/kernels/moe/test_mxfp4_moe.py
+++ b/tests/kernels/moe/test_ocp_mx_moe.py
@@ -4,36 +4,40 @@
 import importlib.metadata
 from dataclasses import dataclass
 from importlib.util import find_spec
-from typing import Optional
 
 import pytest
 import torch
 from packaging import version
 
-from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
-    QuarkLinearMethod, QuarkW4A4MXFP4)
-from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
-    QuarkW4A4MXFp4MoEMethod)
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
 
 QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
-    importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
+    importlib.metadata.version("amd-quark")
+) >= version.parse("0.8.99")
 
-TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda(
-) and current_platform.is_device_capability(100)
+TRTLLM_GEN_MXFP4_AVAILABLE = (
+    current_platform.is_cuda() and current_platform.is_device_capability(100)
+)
 
-HOPPER_MXFP4_BF16_AVAILABLE = (current_platform.is_cuda()
-                               and current_platform.is_device_capability(90)
-                               and has_flashinfer())
+HOPPER_MXFP4_BF16_AVAILABLE = (
+    current_platform.is_cuda()
+    and current_platform.is_device_capability(90)
+    and has_flashinfer()
+)
 
 if TRTLLM_GEN_MXFP4_AVAILABLE:
-    from flashinfer import (fp4_quantize, mxfp8_quantize,
-                            next_positive_power_of_2,
-                            reorder_rows_for_gated_act_gemm, shuffle_matrix_a,
-                            shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe)
+    from flashinfer import (
+        fp4_quantize,
+        mxfp8_quantize,
+        next_positive_power_of_2,
+        reorder_rows_for_gated_act_gemm,
+        shuffle_matrix_a,
+        shuffle_matrix_sf_a,
+        trtllm_fp4_block_scale_moe,
+    )
     from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
-    from flashinfer.fused_moe.core import _maybe_get_cached_w2_permute_indices
+    from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache
 
 
 @dataclass
@@ -48,45 +52,57 @@ def enable_pickle(monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
-@pytest.mark.parametrize('model_case', [
-    ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
-    ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
-    ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
-])
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
-                    reason="amd-quark>=0.9 is not available")
+@pytest.mark.parametrize(
+    "model_case",
+    [
+        ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=2),
+        ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
+        ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1),
+        ModelCase("fxmarty/Llama-3.1-70B-Instruct-2-layers-mxfp6", tp=1),
+        ModelCase("fxmarty/Llama-3.1-70B-Instruct-2-layers-mxfp6", tp=4),
+    ],
+)
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
 def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
     if torch.cuda.device_count() < model_case.tp:
-        pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
-                    f"{torch.cuda.device_count()}")
+        pytest.skip(
+            f"This test requires >={model_case.tp} gpus, got only "
+            f"{torch.cuda.device_count()}"
+        )
 
-    with vllm_runner(model_case.model_id,
-                     tensor_parallel_size=model_case.tp,
-                     load_format="dummy") as llm:
+    # `cuda_graph_sizes=[16]` to reduce load time.
+    with vllm_runner(
+        model_case.model_id,
+        tensor_parallel_size=model_case.tp,
+        load_format="dummy",
+        cuda_graph_sizes=[16],
+    ) as llm:
+        # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562
+        # def check_model(model):
+        #     from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
+        #         QuarkLinearMethod)
+        #     from vllm.model_executor.layers.quantization.quark.schemes.quark_ocp_mx import QuarkOCP_MX  # noqa: E501
+        #     from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
+        #         QuarkOCP_MX_MoEMethod)
 
-        def check_model(model):
-            layer = model.model.layers[0]
+        #     layer = model.model.layers[0]
 
-            qkv_proj = layer.self_attn.qkv_proj
+        #     qkv_proj = layer.self_attn.qkv_proj
 
-            assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
-            assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
+        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+        #     assert isinstance(qkv_proj.scheme, QuarkOCP_MX)
 
-            assert isinstance(layer.mlp.experts.quant_method,
-                              QuarkW4A4MXFp4MoEMethod)
+        #     assert isinstance(layer.mlp.experts.quant_method,
+        #                       QuarkOCP_MX_MoEMethod)
 
-        if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
-            llm.apply_model(check_model)
+        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
+        #     llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Today I am in the French Alps and",
-                                     max_tokens=20)
+        output = llm.generate_greedy("Today I am in the French Alps and", max_tokens=20)
         assert output
 
 
-def swiglu(x,
-           alpha: float = 1.702,
-           beta: float = 1.0,
-           limit: Optional[float] = None):
+def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: float | None = None):
     # Note we add an extra bias of 1 to the linear layer
     x_glu, x_linear = torch.chunk(x, 2, dim=-1)
     if limit is not None:
@@ -96,24 +112,19 @@ def swiglu(x,
     return out_glu * (x_linear + beta)
 
 
-fp4_lookup_table = [
-    0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6
-]
+fp4_lookup_table = [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6]
 
 
 def mxfp4_dequantize(x, scale):
     assert x.dtype == torch.uint8
     x = x.view(torch.uint8).to(torch.int32)
-    x_unpacked = torch.zeros(*x.shape[:-1],
-                             x.shape[-1] * 2,
-                             dtype=torch.int32,
-                             device=x.device)
+    x_unpacked = torch.zeros(
+        *x.shape[:-1], x.shape[-1] * 2, dtype=torch.int32, device=x.device
+    )
     x_unpacked[..., 0::2].copy_(x & 0xF)
     x_unpacked[..., 1::2].copy_((x >> 4) & 0xF)
 
-    x_float = torch.zeros(x_unpacked.shape,
-                          dtype=torch.float32,
-                          device=x.device)
+    x_float = torch.zeros(x_unpacked.shape, dtype=torch.float32, device=x.device)
     for i, val in enumerate(fp4_lookup_table):
         x_float[x_unpacked == i] = val
 
@@ -162,9 +173,10 @@ def reference_moe(
     t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias
     t = swiglu(t, alpha=alpha, beta=beta, limit=limit)
 
-    if act_type == 'mxfp8':
-        t_quantized, t_scale = mxfp8_quantize(t.to(torch.bfloat16),
-                                              is_sf_swizzled_layout=False)
+    if act_type == "mxfp8":
+        t_quantized, t_scale = mxfp8_quantize(
+            t.to(torch.bfloat16), is_sf_swizzled_layout=False
+        )
         t = mxfp8_dequantize(t_quantized, t_scale)
     # MLP #2
     mlp2_weight = w2[expert_indices, ...]
@@ -221,37 +233,53 @@ def tg_mxfp4_moe(
     transpose_optimized: bool = False,
 ) -> torch.Tensor:
     sf_block_size = 32
-    assert (w13_weight.dim() == 3 and w13_weight.shape[0] == num_experts
-            and w13_weight.shape[1] == intermediate_size * 2
-            and w13_weight.shape[2] == hidden_size // 2)
-    assert (w13_weight_scale.dim() == 3
-            and w13_weight_scale.shape[0] == num_experts
-            and w13_weight_scale.shape[1] == intermediate_size * 2
-            and w13_weight_scale.shape[2] == hidden_size // sf_block_size)
-    assert (w2_weight.dim() == 3 and w2_weight.shape[0] == num_experts
-            and w2_weight.shape[1] == hidden_size
-            and w2_weight.shape[2] == intermediate_size // 2)
-    assert (w2_weight_scale.dim() == 3
-            and w2_weight_scale.shape[1] == hidden_size
-            and w2_weight_scale.shape[2] == intermediate_size // sf_block_size)
-    assert (w13_bias.dim() == 2 and w13_bias.shape[0] == num_experts
-            and w13_bias.shape[1] == intermediate_size * 2)
-    assert (w2_bias.dim() == 2 and w2_bias.shape[0] == num_experts
-            and w2_bias.shape[1] == hidden_size)
+    assert (
+        w13_weight.dim() == 3
+        and w13_weight.shape[0] == num_experts
+        and w13_weight.shape[1] == intermediate_size * 2
+        and w13_weight.shape[2] == hidden_size // 2
+    )
+    assert (
+        w13_weight_scale.dim() == 3
+        and w13_weight_scale.shape[0] == num_experts
+        and w13_weight_scale.shape[1] == intermediate_size * 2
+        and w13_weight_scale.shape[2] == hidden_size // sf_block_size
+    )
+    assert (
+        w2_weight.dim() == 3
+        and w2_weight.shape[0] == num_experts
+        and w2_weight.shape[1] == hidden_size
+        and w2_weight.shape[2] == intermediate_size // 2
+    )
+    assert (
+        w2_weight_scale.dim() == 3
+        and w2_weight_scale.shape[1] == hidden_size
+        and w2_weight_scale.shape[2] == intermediate_size // sf_block_size
+    )
+    assert (
+        w13_bias.dim() == 2
+        and w13_bias.shape[0] == num_experts
+        and w13_bias.shape[1] == intermediate_size * 2
+    )
+    assert (
+        w2_bias.dim() == 2
+        and w2_bias.shape[0] == num_experts
+        and w2_bias.shape[1] == hidden_size
+    )
 
     # Swap w1 and w3 as the definition of
     # swiglu is different in the trtllm-gen
     w13_weight_scale_ = w13_weight_scale.clone()
     w13_weight_ = w13_weight.clone()
     w13_bias_ = w13_bias.clone()
-    w13_weight[:, :intermediate_size, :].copy_(
-        w13_weight_[:, intermediate_size:, :])
-    w13_weight[:, intermediate_size:, :].copy_(
-        w13_weight_[:, :intermediate_size, :])
+    w13_weight[:, :intermediate_size, :].copy_(w13_weight_[:, intermediate_size:, :])
+    w13_weight[:, intermediate_size:, :].copy_(w13_weight_[:, :intermediate_size, :])
     w13_weight_scale[:, :intermediate_size, :].copy_(
-        w13_weight_scale_[:, intermediate_size:, :])
+        w13_weight_scale_[:, intermediate_size:, :]
+    )
     w13_weight_scale[:, intermediate_size:, :].copy_(
-        w13_weight_scale_[:, :intermediate_size, :])
+        w13_weight_scale_[:, :intermediate_size, :]
+    )
     w13_bias[:, :intermediate_size].copy_(w13_bias_[:, intermediate_size:])
     w13_bias[:, intermediate_size:].copy_(w13_bias_[:, :intermediate_size])
 
@@ -261,18 +289,23 @@ def tg_mxfp4_moe(
     w13_bias_interleaved = []
     for i in range(num_experts):
         w13_weight_interleaved.append(
-            reorder_rows_for_gated_act_gemm(w13_weight[i].clone()))
+            reorder_rows_for_gated_act_gemm(w13_weight[i].clone())
+        )
         w13_weight_scale_interleaved.append(
-            reorder_rows_for_gated_act_gemm(w13_weight_scale[i].clone()))
+            reorder_rows_for_gated_act_gemm(w13_weight_scale[i].clone())
+        )
         w13_bias_interleaved.append(
-            reorder_rows_for_gated_act_gemm(w13_bias[i].clone().reshape(-1,
-                                                                        1)))
+            reorder_rows_for_gated_act_gemm(w13_bias[i].clone().reshape(-1, 1))
+        )
     w13_weight = torch.stack(w13_weight_interleaved).reshape(
-        num_experts, 2 * intermediate_size, hidden_size // 2)
+        num_experts, 2 * intermediate_size, hidden_size // 2
+    )
     w13_weight_scale = torch.stack(w13_weight_scale_interleaved).reshape(
-        num_experts, 2 * intermediate_size, hidden_size // 32)
+        num_experts, 2 * intermediate_size, hidden_size // 32
+    )
     w13_bias = torch.stack(w13_bias_interleaved).reshape(
-        num_experts, 2 * intermediate_size)
+        num_experts, 2 * intermediate_size
+    )
 
     # Shuffle weights and scaling factors for transposed mma output
     gemm1_weights_shuffled = []
@@ -286,92 +319,120 @@ def tg_mxfp4_moe(
     if transpose_optimized:
         for i in range(num_experts):
             # w13 weight shuffling
-            permute_indices = _maybe_get_cached_w2_permute_indices(
+            permute_indices = get_w2_permute_indices_with_cache(
                 _cache_permute_indices,
                 w13_weight[i].view(torch.uint8),
                 epilogue_tile_m,
             )
-            gemm1_weights_shuffled.append(w13_weight[i].view(
-                torch.uint8)[permute_indices.to(
-                    w13_weight.device)].contiguous())
+            gemm1_weights_shuffled.append(
+                w13_weight[i]
+                .view(torch.uint8)[permute_indices.to(w13_weight.device)]
+                .contiguous()
+            )
             # w13 scale shuffling
-            permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+            permute_sf_indices = get_w2_permute_indices_with_cache(
                 _cache_permute_indices,
                 w13_weight_scale[i].view(torch.uint8),
                 epilogue_tile_m,
                 num_elts_per_sf=16,
             )
             gemm1_scales_shuffled.append(
-                nvfp4_block_scale_interleave(w13_weight_scale[i].view(
-                    torch.uint8)[permute_sf_indices.to(
-                        w13_weight_scale.device)].contiguous()))
+                nvfp4_block_scale_interleave(
+                    w13_weight_scale[i]
+                    .view(torch.uint8)[permute_sf_indices.to(w13_weight_scale.device)]
+                    .contiguous()
+                )
+            )
             # w13 bias shuffling
-            permute_bias_indices = _maybe_get_cached_w2_permute_indices(
+            permute_bias_indices = get_w2_permute_indices_with_cache(
                 _cache_permute_indices,
                 w13_bias[i].clone().reshape(-1, 1),
                 epilogue_tile_m,
             )
-            gemm1_bias_shuffled.append(w13_bias[i].clone().reshape(
-                -1, 1)[permute_bias_indices.to(w13_bias.device)].contiguous())
+            gemm1_bias_shuffled.append(
+                w13_bias[i]
+                .clone()
+                .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)]
+                .contiguous()
+            )
             # w2 weight shuffling
-            permute_indices = _maybe_get_cached_w2_permute_indices(
+            permute_indices = get_w2_permute_indices_with_cache(
                 _cache_permute_indices,
                 w2_weight[i].view(torch.uint8),
                 epilogue_tile_m,
             )
-            gemm2_weights_shuffled.append(w2_weight[i].view(
-                torch.uint8)[permute_indices.to(
-                    w2_weight.device)].contiguous())
+            gemm2_weights_shuffled.append(
+                w2_weight[i]
+                .view(torch.uint8)[permute_indices.to(w2_weight.device)]
+                .contiguous()
+            )
             # w2 scale shuffling
-            permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+            permute_sf_indices = get_w2_permute_indices_with_cache(
                 _cache_permute_indices,
                 w2_weight_scale[i].view(torch.uint8),
                 epilogue_tile_m,
                 num_elts_per_sf=16,
             )
             gemm2_scales_shuffled.append(
-                nvfp4_block_scale_interleave(w2_weight_scale[i].view(
-                    torch.uint8)[permute_sf_indices.to(
-                        w2_weight_scale.device)].contiguous()))
+                nvfp4_block_scale_interleave(
+                    w2_weight_scale[i]
+                    .view(torch.uint8)[permute_sf_indices.to(w2_weight_scale.device)]
+                    .contiguous()
+                )
+            )
             # w2 bias shuffling
-            permute_indices = _maybe_get_cached_w2_permute_indices(
+            permute_indices = get_w2_permute_indices_with_cache(
                 _cache_permute_indices,
                 w2_bias[i].clone().reshape(-1, 1),
                 epilogue_tile_m,
             )
-            gemm2_bias_shuffled.append(w2_bias[i].clone().reshape(
-                -1, 1)[permute_indices.to(w2_bias.device)].contiguous())
+            gemm2_bias_shuffled.append(
+                w2_bias[i]
+                .clone()
+                .reshape(-1, 1)[permute_indices.to(w2_bias.device)]
+                .contiguous()
+            )
 
     else:
         for i in range(num_experts):
             gemm1_weights_shuffled.append(
-                shuffle_matrix_a(w13_weight[i].view(torch.uint8),
-                                 epilogue_tile_m))
+                shuffle_matrix_a(w13_weight[i].view(torch.uint8), epilogue_tile_m)
+            )
             gemm1_scales_shuffled.append(
-                shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8),
-                                    epilogue_tile_m))
+                shuffle_matrix_sf_a(
+                    w13_weight_scale[i].view(torch.uint8), epilogue_tile_m
+                )
+            )
 
             gemm2_weights_shuffled.append(
-                shuffle_matrix_a(w2_weight[i].view(torch.uint8),
-                                 epilogue_tile_m))
+                shuffle_matrix_a(w2_weight[i].view(torch.uint8), epilogue_tile_m)
+            )
             gemm2_scales_shuffled.append(
-                shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8),
-                                    epilogue_tile_m))
+                shuffle_matrix_sf_a(
+                    w2_weight_scale[i].view(torch.uint8), epilogue_tile_m
+                )
+            )
             gemm1_bias_shuffled.append(
-                shuffle_matrix_a(w13_bias[i].reshape(-1, 1), epilogue_tile_m))
+                shuffle_matrix_a(w13_bias[i].reshape(-1, 1), epilogue_tile_m)
+            )
             gemm2_bias_shuffled.append(
-                shuffle_matrix_a(w2_bias[i].reshape(-1, 1), epilogue_tile_m))
+                shuffle_matrix_a(w2_bias[i].reshape(-1, 1), epilogue_tile_m)
+            )
 
     w13_weight = torch.stack(gemm1_weights_shuffled)
-    w13_weight_scale = torch.stack(gemm1_scales_shuffled).reshape(
-        num_experts, 2 * intermediate_size,
-        hidden_size // sf_block_size).view(torch.float8_e4m3fn)
+    w13_weight_scale = (
+        torch.stack(gemm1_scales_shuffled)
+        .reshape(num_experts, 2 * intermediate_size, hidden_size // sf_block_size)
+        .view(torch.float8_e4m3fn)
+    )
     w13_bias = torch.stack(gemm1_bias_shuffled).reshape(num_experts, -1)
 
     w2_weight = torch.stack(gemm2_weights_shuffled)
-    w2_weight_scale = torch.stack(gemm2_scales_shuffled).reshape(
-        num_experts, hidden_size,
-        intermediate_size // sf_block_size).view(torch.float8_e4m3fn)
+    w2_weight_scale = (
+        torch.stack(gemm2_scales_shuffled)
+        .reshape(num_experts, hidden_size, intermediate_size // sf_block_size)
+        .view(torch.float8_e4m3fn)
+    )
     w2_bias = torch.stack(gemm2_bias_shuffled).reshape(num_experts, -1)
 
     tg_result = trtllm_fp4_block_scale_moe(
@@ -401,7 +462,8 @@ def tg_mxfp4_moe(
         routed_scaling_factor=None,
         tile_tokens_dim=get_tile_tokens_dim(hidden_states, topk, num_experts),
         routing_method_type=1,  # renormalize
-        do_finalize=True)[0]
+        do_finalize=True,
+    )[0]
     return tg_result
 
 
@@ -424,20 +486,21 @@ def check_accuracy(a, b, atol, rtol, percent):
     if mismatch_percent > 1 - percent:
         raise Exception(
             f"Mismatch percentage is {mismatch_percent:.4f} for rtol {rtol} "
-            f"(threshold: {1-percent:.4f})")
+            f"(threshold: {1 - percent:.4f})"
+        )
 
 
 @pytest.mark.parametrize("topk", [1, 4])
 @pytest.mark.parametrize("num_experts", [32, 128])
 @pytest.mark.parametrize("num_tokens", [1, 128, 1024])
 @pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)])
-@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None),
-                                              (1.702, 1.0, 7.0)])
-@pytest.mark.parametrize("act_type", ['mxfp8', 'bf16'])
+@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None), (1.702, 1.0, 7.0)])
+@pytest.mark.parametrize("act_type", ["mxfp8", "bf16"])
 @pytest.mark.parametrize("transpose_optimized", [False, True])
 @pytest.mark.skipif(
     not TRTLLM_GEN_MXFP4_AVAILABLE,
-    reason="nvidia gpu and compute capability sm100 is required for this test")
+    reason="nvidia gpu and compute capability sm100 is required for this test",
+)
 def test_trtllm_gen_mxfp4_fused_moe(
     topk: int,
     num_experts: int,
@@ -446,51 +509,58 @@ def test_trtllm_gen_mxfp4_fused_moe(
     hidden_size: int,
     alpha: float,
     beta: float,
-    limit: Optional[float],
+    limit: float | None,
     act_type: str,
     transpose_optimized: bool,
 ):
     seed = 42
     torch.manual_seed(seed)
-    hidden_states = torch.randn(num_tokens,
-                                hidden_size,
-                                device="cuda:0",
-                                dtype=torch.bfloat16)
-    w13 = (torch.randn(num_experts,
-                       intermediate_size * 2,
-                       hidden_size,
-                       device="cuda:0",
-                       dtype=torch.bfloat16))
-    w2 = (torch.randn(num_experts,
-                      hidden_size,
-                      intermediate_size,
-                      device="cuda:0",
-                      dtype=torch.bfloat16))
-    bias13 = torch.randn(num_experts, intermediate_size * 2,
-                         device="cuda:0") * 10
+    hidden_states = torch.randn(
+        num_tokens, hidden_size, device="cuda:0", dtype=torch.bfloat16
+    )
+    w13 = torch.randn(
+        num_experts,
+        intermediate_size * 2,
+        hidden_size,
+        device="cuda:0",
+        dtype=torch.bfloat16,
+    )
+    w2 = torch.randn(
+        num_experts,
+        hidden_size,
+        intermediate_size,
+        device="cuda:0",
+        dtype=torch.bfloat16,
+    )
+    bias13 = torch.randn(num_experts, intermediate_size * 2, device="cuda:0") * 10
     bias2 = torch.randn(num_experts, hidden_size, device="cuda:0") * 10
-    router_logits = torch.rand(num_tokens, num_experts,
-                               dtype=torch.float32).cuda()
-
-    w13, w13_scale = fp4_quantize(w13,
-                                  torch.tensor(1.0, device="cuda:0"),
-                                  32,
-                                  sf_use_ue8m0=True,
-                                  is_sf_swizzled_layout=False)
+    router_logits = torch.rand(num_tokens, num_experts, dtype=torch.float32).cuda()
+
+    w13, w13_scale = fp4_quantize(
+        w13,
+        torch.tensor(1.0, device="cuda:0"),
+        32,
+        sf_use_ue8m0=True,
+        is_sf_swizzled_layout=False,
+    )
     w13_scale = w13_scale.view(torch.float8_e4m3fn).reshape(
-        num_experts, intermediate_size * 2, hidden_size // 32)
-    w2, w2_scale = fp4_quantize(w2,
-                                torch.tensor(1.0, device="cuda:0"),
-                                32,
-                                sf_use_ue8m0=True,
-                                is_sf_swizzled_layout=False)
+        num_experts, intermediate_size * 2, hidden_size // 32
+    )
+    w2, w2_scale = fp4_quantize(
+        w2,
+        torch.tensor(1.0, device="cuda:0"),
+        32,
+        sf_use_ue8m0=True,
+        is_sf_swizzled_layout=False,
+    )
     w2_scale = w2_scale.view(torch.float8_e4m3fn).reshape(
-        num_experts, hidden_size, intermediate_size // 32)
-    if act_type == 'mxfp8':
+        num_experts, hidden_size, intermediate_size // 32
+    )
+    if act_type == "mxfp8":
         hidden_states, hidden_states_scale = mxfp8_quantize(
-            hidden_states, is_sf_swizzled_layout=False)
-        hidden_states_scale = hidden_states_scale.view(
-            torch.float8_e4m3fn).reshape(-1)
+            hidden_states, is_sf_swizzled_layout=False
+        )
+        hidden_states_scale = hidden_states_scale.view(torch.float8_e4m3fn).reshape(-1)
     else:
         hidden_states_scale = None
 
@@ -500,9 +570,10 @@ def test_trtllm_gen_mxfp4_fused_moe(
     w2_ref = mxfp4_dequantize(w2.clone(), w2_scale.clone())
     bias13_ref = bias13
     bias2_ref = bias2
-    if act_type == 'mxfp8':
-        hidden_states_ref = mxfp8_dequantize(
-            hidden_states, hidden_states_scale).to(torch.float32)
+    if act_type == "mxfp8":
+        hidden_states_ref = mxfp8_dequantize(hidden_states, hidden_states_scale).to(
+            torch.float32
+        )
     else:
         hidden_states_ref = hidden_states.to(torch.float32)
     # Process tokens in chunks of 32 to reduce memory usage
@@ -529,29 +600,31 @@ def test_trtllm_gen_mxfp4_fused_moe(
 
     # trtllm-gen result
     if alpha is not None:
-        alpha = torch.full((num_experts, ), alpha, device=hidden_states.device)
+        alpha = torch.full((num_experts,), alpha, device=hidden_states.device)
     if limit is not None:
-        limit = torch.full((num_experts, ), limit, device=hidden_states.device)
+        limit = torch.full((num_experts,), limit, device=hidden_states.device)
     if beta is not None:
-        beta = torch.full((num_experts, ), beta, device=hidden_states.device)
-    tg_result = tg_mxfp4_moe(router_logits,
-                             topk,
-                             num_experts,
-                             intermediate_size,
-                             hidden_size,
-                             hidden_states,
-                             hidden_states_scale,
-                             w13,
-                             w13_scale,
-                             bias13,
-                             w2,
-                             w2_scale,
-                             bias2,
-                             act_type,
-                             alpha=alpha,
-                             beta=beta,
-                             limit=limit,
-                             transpose_optimized=transpose_optimized)
+        beta = torch.full((num_experts,), beta, device=hidden_states.device)
+    tg_result = tg_mxfp4_moe(
+        router_logits,
+        topk,
+        num_experts,
+        intermediate_size,
+        hidden_size,
+        hidden_states,
+        hidden_states_scale,
+        w13,
+        w13_scale,
+        bias13,
+        w2,
+        w2_scale,
+        bias2,
+        act_type,
+        alpha=alpha,
+        beta=beta,
+        limit=limit,
+        transpose_optimized=transpose_optimized,
+    )
     # relatively loose check since the mxfp4 quantization is less accurate
     check_accuracy(ref_result, tg_result, atol=0, rtol=0.3, percent=0.8)
 
@@ -573,8 +646,7 @@ def _interleave_scales_lastdim_by4(scales: torch.Tensor) -> torch.Tensor:
 @pytest.mark.parametrize("num_experts", [32])
 @pytest.mark.parametrize("num_tokens", [1, 128])
 @pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)])
-@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None),
-                                              (1.702, 1.0, 7.0)])
+@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None), (1.702, 1.0, 7.0)])
 @pytest.mark.skipif(
     not HOPPER_MXFP4_BF16_AVAILABLE,
     reason="nvidia gpu sm90 and flashinfer are required for this test",
@@ -587,58 +659,79 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
     hidden_size: int,
     alpha: float,
     beta: float,
-    limit: Optional[float],
+    limit: float | None,
 ):
     torch.manual_seed(42)
     device = "cuda:0"
 
     # Inputs
-    hidden_states = torch.randn(num_tokens,
-                                hidden_size,
-                                device=device,
-                                dtype=torch.bfloat16)
+    hidden_states = torch.randn(
+        num_tokens, hidden_size, device=device, dtype=torch.bfloat16
+    )
     # Random MXFP4 weights and scales (uint8), contiguous [w1; w3]
     w13_q = torch.randint(
         0,
-        256, (num_experts, 2 * intermediate_size, hidden_size // 2),
+        256,
+        (num_experts, 2 * intermediate_size, hidden_size // 2),
         device=device,
-        dtype=torch.uint8)
+        dtype=torch.uint8,
+    )
     w13_scale = torch.randint(
         118,
-        123, (num_experts, 2 * intermediate_size, hidden_size // 32),
+        123,
+        (num_experts, 2 * intermediate_size, hidden_size // 32),
         device=device,
-        dtype=torch.uint8)
+        dtype=torch.uint8,
+    )
 
-    w2_q = torch.randint(0,
-                         256,
-                         (num_experts, hidden_size, intermediate_size // 2),
-                         device=device,
-                         dtype=torch.uint8)
+    w2_q = torch.randint(
+        0,
+        256,
+        (num_experts, hidden_size, intermediate_size // 2),
+        device=device,
+        dtype=torch.uint8,
+    )
     w2_scale = torch.randint(
         118,
-        123, (num_experts, hidden_size, intermediate_size // 32),
+        123,
+        (num_experts, hidden_size, intermediate_size // 32),
         device=device,
-        dtype=torch.uint8)
+        dtype=torch.uint8,
+    )
     # Bias contiguous [b1; b3]
-    bias13 = (torch.randn(num_experts,
-                          2 * intermediate_size,
-                          device=device,
-                          dtype=torch.bfloat16) * 10)
-    bias2 = (torch.randn(
-        num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10)
-    router_logits = torch.rand(num_tokens,
-                               num_experts,
-                               dtype=torch.float32,
-                               device=device)
+    bias13 = (
+        torch.randn(
+            num_experts, 2 * intermediate_size, device=device, dtype=torch.bfloat16
+        )
+        * 10
+    )
+    bias2 = (
+        torch.randn(num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10
+    )
+    router_logits = torch.rand(
+        num_tokens, num_experts, dtype=torch.float32, device=device
+    )
 
     w13_ref = mxfp4_dequantize(w13_q.clone(), w13_scale.clone()).reshape(
-        num_experts, 2 * intermediate_size, hidden_size)
+        num_experts, 2 * intermediate_size, hidden_size
+    )
     w2_ref = mxfp4_dequantize(w2_q.clone(), w2_scale.clone()).reshape(
-        num_experts, hidden_size, intermediate_size)
-    ref = reference_moe(router_logits.to(torch.float32), topk, num_experts,
-                        hidden_states.to(torch.float32), w13_ref,
-                        bias13.to(torch.float32), w2_ref,
-                        bias2.to(torch.float32), alpha, beta, limit, 'bf16')
+        num_experts, hidden_size, intermediate_size
+    )
+    ref = reference_moe(
+        router_logits.to(torch.float32),
+        topk,
+        num_experts,
+        hidden_states.to(torch.float32),
+        w13_ref,
+        bias13.to(torch.float32),
+        w2_ref,
+        bias2.to(torch.float32),
+        alpha,
+        beta,
+        limit,
+        "bf16",
+    )
 
     from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
 
@@ -654,23 +747,24 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
     w13_s_inter = _interleave_scales_lastdim_by4(w13_s)
     w2_s_inter = _interleave_scales_lastdim_by4(w2_scale)
 
-    routing_weights = torch.nn.functional.softmax(router_logits,
-                                                  dim=1,
-                                                  dtype=torch.float32)
-    token_final_scales, token_selected_experts = torch.topk(routing_weights,
-                                                            topk,
-                                                            dim=-1)
-    token_final_scales = (token_final_scales /
-                          token_final_scales.sum(dim=-1, keepdim=True))
+    routing_weights = torch.nn.functional.softmax(
+        router_logits, dim=1, dtype=torch.float32
+    )
+    token_final_scales, token_selected_experts = torch.topk(
+        routing_weights, topk, dim=-1
+    )
+    token_final_scales = token_final_scales / token_final_scales.sum(
+        dim=-1, keepdim=True
+    )
     token_selected_experts = token_selected_experts.to(torch.int).contiguous()
 
     out = torch.empty_like(hidden_states, dtype=torch.bfloat16)
     if alpha is not None:
-        alpha = torch.full((num_experts, ), alpha, device=hidden_states.device)
+        alpha = torch.full((num_experts,), alpha, device=hidden_states.device)
     if beta is not None:
-        beta = torch.full((num_experts, ), beta, device=hidden_states.device)
+        beta = torch.full((num_experts,), beta, device=hidden_states.device)
     if limit is not None:
-        limit = torch.full((num_experts, ), limit, device=hidden_states.device)
+        limit = torch.full((num_experts,), limit, device=hidden_states.device)
 
     _ = flashinfer_cutlass_fused_moe(
         input=hidden_states,
@@ -680,8 +774,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
         fc2_expert_weights=w2_q,
         output_dtype=torch.bfloat16,
         output=out,
-        quant_scales=[w13_s_inter.to(torch.uint8),
-                      w2_s_inter.to(torch.uint8)],
+        quant_scales=[w13_s_inter.to(torch.uint8), w2_s_inter.to(torch.uint8)],
         fc1_expert_biases=w13_b,
         fc2_expert_biases=bias2.to(torch.bfloat16),
         swiglu_alpha=alpha,
@@ -702,11 +795,13 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
 @pytest.mark.parametrize("num_experts", [32])
 @pytest.mark.parametrize("num_tokens", [1, 128])
 @pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)])
-@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None),
-                                              (1.702, 1.0, 7.0)])
+@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None), (1.702, 1.0, 7.0)])
 @pytest.mark.skipif(
-    not (current_platform.is_cuda()
-         and current_platform.is_device_capability(100) and has_flashinfer()),
+    not (
+        current_platform.is_cuda()
+        and current_platform.is_device_capability(100)
+        and has_flashinfer()
+    ),
     reason="NVIDIA GPU sm100 and flashinfer are required for this test",
 )
 def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
@@ -715,40 +810,51 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
     num_tokens: int,
     intermediate_size: int,
     hidden_size: int,
-    alpha: Optional[float],
-    beta: Optional[float],
-    limit: Optional[float],
+    alpha: float | None,
+    beta: float | None,
+    limit: float | None,
 ):
     torch.manual_seed(42)
     device = "cuda:0"
 
     # Inputs
-    hidden_states = torch.randn(num_tokens,
-                                hidden_size,
-                                device=device,
-                                dtype=torch.bfloat16)
+    hidden_states = torch.randn(
+        num_tokens, hidden_size, device=device, dtype=torch.bfloat16
+    )
     # Float weights in w13 format [w1; w3]
-    w13 = (torch.randn(num_experts,
-                       2 * intermediate_size,
-                       hidden_size,
-                       device=device,
-                       dtype=torch.bfloat16) / 10)
-    w2 = (torch.randn(num_experts,
-                      hidden_size,
-                      intermediate_size,
-                      device=device,
-                      dtype=torch.bfloat16) / 10)
+    w13 = (
+        torch.randn(
+            num_experts,
+            2 * intermediate_size,
+            hidden_size,
+            device=device,
+            dtype=torch.bfloat16,
+        )
+        / 10
+    )
+    w2 = (
+        torch.randn(
+            num_experts,
+            hidden_size,
+            intermediate_size,
+            device=device,
+            dtype=torch.bfloat16,
+        )
+        / 10
+    )
     # Bias contiguous [b1; b3]
-    bias13 = (torch.randn(num_experts,
-                          2 * intermediate_size,
-                          device=device,
-                          dtype=torch.bfloat16) * 10)
-    bias2 = (torch.randn(
-        num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10)
-    router_logits = torch.rand(num_tokens,
-                               num_experts,
-                               dtype=torch.float32,
-                               device=device)
+    bias13 = (
+        torch.randn(
+            num_experts, 2 * intermediate_size, device=device, dtype=torch.bfloat16
+        )
+        * 10
+    )
+    bias2 = (
+        torch.randn(num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10
+    )
+    router_logits = torch.rand(
+        num_tokens, num_experts, dtype=torch.float32, device=device
+    )
 
     # Quantize weights to MXFP4 per expert (SM100 path)
     from flashinfer import mxfp4_quantize
@@ -761,36 +867,56 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
             sfs.append(sf)
         return torch.stack(qs), torch.stack(sfs)
 
-    def dequant_mxfp4_batches(mat_fp4: torch.Tensor,
-                              scale_tensor: torch.Tensor):
+    def dequant_mxfp4_batches(mat_fp4: torch.Tensor, scale_tensor: torch.Tensor):
         num_batches = mat_fp4.size(0)
         scale_tensor = scale_tensor.view(num_batches, -1)
         from flashinfer import mxfp4_dequantize
-        return torch.stack([
-            mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :])
-            for b in range(num_batches)
-        ])
+
+        return torch.stack(
+            [
+                mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :])
+                for b in range(num_batches)
+            ]
+        )
 
     w13_q, w13_scale = quant_mxfp4_batches(w13, num_experts)
     w2_q, w2_scale = quant_mxfp4_batches(w2, num_experts)
 
     # Reference result using dequantized tensors and reference_moe
-    w13_ref = dequant_mxfp4_batches(
-        w13_q.view(torch.uint8),
-        w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, 2 * intermediate_size, hidden_size).to(device)
-    w2_ref = dequant_mxfp4_batches(
-        w2_q.view(torch.uint8),
-        w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, hidden_size, intermediate_size).to(device)
+    w13_ref = (
+        dequant_mxfp4_batches(
+            w13_q.view(torch.uint8), w13_scale.view(torch.uint8).reshape(-1)
+        )
+        .to(torch.float32)
+        .reshape(num_experts, 2 * intermediate_size, hidden_size)
+        .to(device)
+    )
+    w2_ref = (
+        dequant_mxfp4_batches(
+            w2_q.view(torch.uint8), w2_scale.view(torch.uint8).reshape(-1)
+        )
+        .to(torch.float32)
+        .reshape(num_experts, hidden_size, intermediate_size)
+        .to(device)
+    )
 
     # Quantize activations for SM100 path and dequantize for reference
     hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32)
     # Reference uses BF16 input but quantizes intermediate activation to MXFP8
-    ref = reference_moe(router_logits.to(torch.float32), topk, num_experts,
-                        hidden_states.to(torch.float32), w13_ref,
-                        bias13.to(torch.float32), w2_ref,
-                        bias2.to(torch.float32), alpha, beta, limit, 'mxfp8')
+    ref = reference_moe(
+        router_logits.to(torch.float32),
+        topk,
+        num_experts,
+        hidden_states.to(torch.float32),
+        w13_ref,
+        bias13.to(torch.float32),
+        w2_ref,
+        bias2.to(torch.float32),
+        alpha,
+        beta,
+        limit,
+        "mxfp8",
+    )
 
     # Prepare inputs for FlashInfer CUTLASS fused MoE
     from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
@@ -807,31 +933,28 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
     w13_b = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)
 
     # Build routing for kernel
-    routing_weights = torch.nn.functional.softmax(router_logits,
-                                                  dim=1,
-                                                  dtype=torch.float32)
-    token_final_scales, token_selected_experts = torch.topk(routing_weights,
-                                                            topk,
-                                                            dim=-1)
-    token_final_scales = (token_final_scales /
-                          token_final_scales.sum(dim=-1, keepdim=True))
+    routing_weights = torch.nn.functional.softmax(
+        router_logits, dim=1, dtype=torch.float32
+    )
+    token_final_scales, token_selected_experts = torch.topk(
+        routing_weights, topk, dim=-1
+    )
+    token_final_scales = token_final_scales / token_final_scales.sum(
+        dim=-1, keepdim=True
+    )
     token_selected_experts = token_selected_experts.to(torch.int).contiguous()
 
     out = torch.empty_like(hidden_states, dtype=torch.bfloat16)
     if alpha is not None:
-        alpha_t = torch.full((num_experts, ),
-                             alpha,
-                             device=hidden_states.device)
+        alpha_t = torch.full((num_experts,), alpha, device=hidden_states.device)
     else:
         alpha_t = None
     if beta is not None:
-        beta_t = torch.full((num_experts, ), beta, device=hidden_states.device)
+        beta_t = torch.full((num_experts,), beta, device=hidden_states.device)
     else:
         beta_t = None
     if limit is not None:
-        limit_t = torch.full((num_experts, ),
-                             limit,
-                             device=hidden_states.device)
+        limit_t = torch.full((num_experts,), limit, device=hidden_states.device)
     else:
         limit_t = None
 
diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py
index 59126cef6adbbe531b4fd4510d1f5893f59ebdac..dd4eb4da913bddedce9108180efb30eb7c9d6b3f 100644
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -9,24 +8,25 @@ import torch
 from tests.kernels.utils import torch_experts
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config)
-from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-    CutlassBatchedExpertsFp8)
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
 
 try:
     from pplx_kernels import AllToAll
-    from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
-                                      nvshmem_finalize, nvshmem_get_unique_id,
-                                      nvshmem_init)
+    from pplx_kernels.nvshmem import (
+        nvshmem_alloc_empty_unique_id,
+        nvshmem_finalize,
+        nvshmem_get_unique_id,
+        nvshmem_init,
+    )
+
     has_pplx = True
 except ImportError:
     has_pplx = False
@@ -50,12 +50,12 @@ def chunk_by_rank(t, r, w):
     chunk = rank_chunk(num, r, w)
     rem = num % w
     if rem == 0 or r < rem:
-        return t[(r * chunk):(r + 1) * chunk].contiguous()
+        return t[(r * chunk) : (r + 1) * chunk].contiguous()
     else:
         long_chunks = (num // w + 1) * rem
         short_chunks = (r - rem) * chunk
         start = long_chunks + short_chunks
-        return t[start:start + chunk].contiguous()
+        return t[start : start + chunk].contiguous()
 
 
 def pplx_cutlass_moe(
@@ -72,10 +72,12 @@ def pplx_cutlass_moe(
     out_dtype,
     per_act_token: bool,
     per_out_ch: bool,
-    group_name: Optional[str],
+    group_name: str | None,
 ):
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize)
+        PplxPrepareAndFinalize,
+    )
+
     assert torch.cuda.current_device() == pgi.local_rank
 
     num_tokens, hidden_dim = a.shape
@@ -126,35 +128,40 @@ def pplx_cutlass_moe(
         ata,
         max_num_tokens=max_num_tokens,
         num_local_experts=num_local_experts,
-        num_dispatchers=num_dispatchers)
-
-    ab_strides1 = torch.full((num_local_experts, ),
-                             hidden_dim,
-                             device="cuda",
-                             dtype=torch.int64)
-    ab_strides2 = torch.full((num_local_experts, ),
-                             intermediate_dim,
-                             device="cuda",
-                             dtype=torch.int64)
-    c_strides1 = torch.full((num_local_experts, ),
-                            2 * intermediate_dim,
-                            device="cuda",
-                            dtype=torch.int64)
-    c_strides2 = torch.full((num_local_experts, ),
-                            hidden_dim,
-                            device="cuda",
-                            dtype=torch.int64)
+        num_dispatchers=num_dispatchers,
+    )
+
+    ab_strides1 = torch.full(
+        (num_local_experts,), hidden_dim, device="cuda", dtype=torch.int64
+    )
+    ab_strides2 = torch.full(
+        (num_local_experts,), intermediate_dim, device="cuda", dtype=torch.int64
+    )
+    c_strides1 = torch.full(
+        (num_local_experts,), 2 * intermediate_dim, device="cuda", dtype=torch.int64
+    )
+    c_strides2 = torch.full(
+        (num_local_experts,), hidden_dim, device="cuda", dtype=torch.int64
+    )
 
     experts = CutlassBatchedExpertsFp8(
-        num_local_experts, num_dispatchers, out_dtype, ab_strides1,
-        ab_strides2, c_strides1, c_strides2,
+        num_local_experts,
+        num_dispatchers,
+        out_dtype,
+        ab_strides1,
+        ab_strides2,
+        c_strides1,
+        c_strides2,
         fp8_w8a8_moe_quant_config(
             per_act_token_quant=per_act_token,
             per_out_ch_quant=per_out_ch,
             w1_scale=chunk_by_rank(w1_scale, rank, world_size),
             w2_scale=chunk_by_rank(w2_scale, rank, world_size),
             a1_scale=chunk_by_rank(a1_scale, rank, world_size)
-            if per_act_token else a1_scale[rank]))
+            if per_act_token
+            else a1_scale[rank],
+        ),
+    )
 
     fused_cutlass_experts = FusedMoEModularKernel(
         prepare_finalize,
@@ -162,10 +169,10 @@ def pplx_cutlass_moe(
     )
 
     a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weights, rank,
-                                      world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank,
-                                   world_size).to(torch.uint32).to(device)
+    chunk_topk_weight = chunk_by_rank(topk_weights, rank, world_size).to(device)
+    chunk_topk_ids = (
+        chunk_by_rank(topk_ids, rank, world_size).to(torch.uint32).to(device)
+    )
 
     out = fused_cutlass_experts(
         a_chunk,
@@ -174,7 +181,7 @@ def pplx_cutlass_moe(
         chunk_topk_weight,
         chunk_topk_ids,
         global_num_experts=num_experts,
-        expert_map=None,  #TODO
+        expert_map=None,  # TODO
     )
 
     torch.cuda.synchronize()
@@ -185,8 +192,6 @@ def pplx_cutlass_moe(
 
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def _pplx_moe(
@@ -210,35 +215,48 @@ def _pplx_moe(
 ):
     try:
         if use_internode:
-            uid = nvshmem_get_unique_id(
-            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            uid = (
+                nvshmem_get_unique_id()
+                if pgi.rank == 0
+                else nvshmem_alloc_empty_unique_id()
+            )
             torch.distributed.broadcast(uid, src=0)
             nvshmem_init(uid, pgi.rank, pgi.world_size)
         else:
             group_ranks = list(range(pgi.world_size))
-            cpu_group = torch.distributed.new_group(group_ranks,
-                                                    backend="gloo")
+            cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
             group_name = cpu_group.group_name
 
         with set_current_vllm_config(vllm_config):
-            torch_output = torch_experts(a_full, w1_full, w2_full,
-                                         topk_weights, topk_ids)
-            pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
-                                           w2_scale, topk_weights, topk_ids,
-                                           a1_scale, out_dtype, per_act_token,
-                                           per_out_ch, group_name)
-
-            torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                         pgi.world_size).to(pplx_output.device)
+            torch_output = torch_experts(
+                a_full, w1_full, w2_full, topk_weights, topk_ids
+            )
+            pplx_output = pplx_cutlass_moe(
+                pgi,
+                dp_size,
+                a,
+                w1,
+                w2,
+                w1_scale,
+                w2_scale,
+                topk_weights,
+                topk_ids,
+                a1_scale,
+                out_dtype,
+                per_act_token,
+                per_out_ch,
+                group_name,
+            )
+
+            torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
+                pplx_output.device
+            )
 
         # Uncomment if more debugging is needed
         # print("PPLX OUT:", pplx_output)
         # print("TORCH OUT:", torch_output)
 
-        torch.testing.assert_close(pplx_output,
-                                   torch_output,
-                                   atol=0.05,
-                                   rtol=0)
+        torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
     finally:
         if use_internode:
             nvshmem_finalize()
@@ -251,13 +269,15 @@ def _pplx_moe(
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [4, 2]])
+@pytest.mark.parametrize("world_dp_size", [[2, 1]])  # , [4, 2]])
 @pytest.mark.parametrize("use_internode", [False])
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
 @requires_pplx
 def test_cutlass_moe_pplx(
     m: int,
@@ -273,7 +293,6 @@ def test_cutlass_moe_pplx(
     current_platform.seed_everything(7)
 
     with set_current_vllm_config(vllm_config):
-
         dtype = torch.half
 
         a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0
@@ -283,22 +302,18 @@ def test_cutlass_moe_pplx(
         n_b_scales = 2 * n if per_out_ch else 1
         k_b_scales = k if per_out_ch else 1
 
-        w1_q = torch.empty((e, 2 * n, k),
-                           device="cuda",
-                           dtype=torch.float8_e4m3fn)
+        w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn)
         w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
-        w1_scale = torch.empty((e, n_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
-        w2_scale = torch.empty((e, k_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
+        w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32)
+        w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32)
 
         for expert in range(e):
             w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
-                w1[expert], use_per_token_if_dynamic=per_out_ch)
+                w1[expert], use_per_token_if_dynamic=per_out_ch
+            )
             w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
-                w2[expert], use_per_token_if_dynamic=per_out_ch)
+                w2[expert], use_per_token_if_dynamic=per_out_ch
+            )
 
         w1_d = torch.empty_like(w1)
         w2_d = torch.empty_like(w2)
@@ -307,19 +322,35 @@ def test_cutlass_moe_pplx(
             w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
 
         score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids, _ = fused_topk(a,
-                                               score,
-                                               topk,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
 
         world_size, dp_size = world_dp_size
-        a_scale1 = torch.randn(
-            (m if per_act_token else 1, 1), device="cuda",
-            dtype=torch.float32) / 10.0
+        a_scale1 = (
+            torch.randn(
+                (m if per_act_token else 1, 1), device="cuda", dtype=torch.float32
+            )
+            / 10.0
+        )
         if not per_act_token:
             a_scale1 = a_scale1.repeat(world_size, 1)
 
-        parallel_launch(world_size, _pplx_moe, dp_size, a, w1_q, w2_q,
-                        w1_scale, w2_scale, topk_weights, topk_ids, a_scale1,
-                        dtype, a, w1_d, w2_d, per_act_token, per_out_ch,
-                        use_internode)
+        parallel_launch(
+            world_size,
+            _pplx_moe,
+            dp_size,
+            a,
+            w1_q,
+            w2_q,
+            w1_scale,
+            w2_scale,
+            topk_weights,
+            topk_ids,
+            a_scale1,
+            dtype,
+            a,
+            w1_d,
+            w2_d,
+            per_act_token,
+            per_out_ch,
+            use_internode,
+        )
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 4ca4a1e79c57c6ebf8e3e0d6d41998721aee16aa..f671b23d300ce72d58a8fd24b77e54640d06f4ba 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -4,42 +4,48 @@
 
 Run `pytest tests/kernels/test_pplx_moe.py`.
 """
+
 import copy
 import itertools
 import textwrap
 import traceback
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import pytest
 import torch
 
 try:
     from pplx_kernels import AllToAll
-    from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
-                                      nvshmem_finalize, nvshmem_get_unique_id,
-                                      nvshmem_init)
+    from pplx_kernels.nvshmem import (
+        nvshmem_alloc_empty_unique_id,
+        nvshmem_finalize,
+        nvshmem_get_unique_id,
+        nvshmem_init,
+    )
+
     has_pplx = True
 except ImportError:
     has_pplx = False
 
-from tests.kernels.moe.modular_kernel_tools.parallel_utils import (
-    _set_vllm_config)
-from tests.kernels.moe.utils import (make_shared_experts, make_test_weights,
-                                     naive_batched_moe)
+from tests.kernels.moe.modular_kernel_tools.parallel_utils import _set_vllm_config
+from tests.kernels.moe.utils import (
+    make_shared_experts,
+    make_test_weights,
+    naive_batched_moe,
+)
 from tests.kernels.quant_utils import dequant
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe import fused_topk, override_config
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
 from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate)
+    TopKWeightAndReduceDelegate,
+)
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -59,7 +65,7 @@ BATCHED_MOE_MNK_FACTORS = [
 
 PPLX_COMBOS = [
     # TODO(bnell): figure out why this fails, seems to be test problem
-    #(1, 128, 128),
+    # (1, 128, 128),
     (2, 128, 512),
     (3, 1024, 2048),
     (4, 128, 128),
@@ -75,15 +81,13 @@ TOP_KS = [1, 2, 6]
 DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def torch_prepare(
     a: torch.Tensor,
     topk_ids: torch.Tensor,
     num_experts: int,
-    max_num_tokens: Optional[int] = None,
+    max_num_tokens: int | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert topk_ids.dim() == 2
     assert topk_ids.shape[0] == a.shape[0]
@@ -91,17 +95,16 @@ def torch_prepare(
     num_tokens, hidden_dim = a.shape
     topk = topk_ids.shape[1]
 
-    tokens_per_expert = torch.bincount(topk_ids.view(-1),
-                                       minlength=num_experts)
+    tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
 
     assert tokens_per_expert.numel() == num_experts
 
     if max_num_tokens is None:
         max_num_tokens = int(tokens_per_expert.max().item())
 
-    b_a = torch.zeros((num_experts, max_num_tokens, hidden_dim),
-                      dtype=a.dtype,
-                      device=a.device)
+    b_a = torch.zeros(
+        (num_experts, max_num_tokens, hidden_dim), dtype=a.dtype, device=a.device
+    )
 
     token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device)
 
@@ -109,28 +112,29 @@ def torch_prepare(
         for j in range(topk):
             expert_id = topk_ids[token, j]
             idx = token_counts[expert_id]
-            b_a[expert_id, idx:idx + 1, :] = a[token, :]
+            b_a[expert_id, idx : idx + 1, :] = a[token, :]
             token_counts[expert_id] = token_counts[expert_id] + 1
 
     return b_a, tokens_per_expert
 
 
-def torch_finalize(b_out: torch.Tensor, topk_weight: torch.Tensor,
-                   topk_ids: torch.Tensor) -> torch.Tensor:
+def torch_finalize(
+    b_out: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor
+) -> torch.Tensor:
     num_tokens = topk_ids.shape[0]
     num_experts = b_out.shape[0]
     K = b_out.shape[-1]
     out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device)
-    expert_counts = torch.zeros(num_experts,
-                                dtype=torch.int,
-                                device=b_out.device)
+    expert_counts = torch.zeros(num_experts, dtype=torch.int, device=b_out.device)
     for token in range(num_tokens):
         expert_ids = topk_ids[token]
         for i in range(expert_ids.numel()):
             expert_id = expert_ids[i]
             idx = expert_counts[expert_id]
-            out[token, :] = out[token, :] + b_out[expert_id, idx:idx +
-                                                  1, :] * topk_weight[token, i]
+            out[token, :] = (
+                out[token, :]
+                + b_out[expert_id, idx : idx + 1, :] * topk_weight[token, i]
+            )
             expert_counts[expert_id] = expert_counts[expert_id] + 1
 
     return out
@@ -149,17 +153,18 @@ def torch_batched_moe(
     num_tokens, topk = topk_ids.shape
     _, max_num_tokens, K = b_a.shape
     assert num_experts == b_a.shape[0] and w2.shape[1] == K
-    out = torch.zeros((num_experts, max_num_tokens, K),
-                      dtype=b_a.dtype,
-                      device=b_a.device)
-    tmp = torch.empty((max_num_tokens, w1.shape[1] // 2),
-                      dtype=b_a.dtype,
-                      device=b_a.device)
+    out = torch.zeros(
+        (num_experts, max_num_tokens, K), dtype=b_a.dtype, device=b_a.device
+    )
+    tmp = torch.empty(
+        (max_num_tokens, w1.shape[1] // 2), dtype=b_a.dtype, device=b_a.device
+    )
     for expert in range(num_experts):
         num = tokens_per_expert[expert]
         if num > 0:
             torch.ops._C.silu_and_mul(
-                tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1))
+                tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1)
+            )
             out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1)
 
     return torch_finalize(out, topk_weight, topk_ids)
@@ -186,20 +191,16 @@ def test_fused_moe_batched_experts(
 
     with set_current_vllm_config(vllm_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        baseline_output = torch_experts(a, w1, w2, topk_weight,
-                                        topk_ids)  # only for baseline
+        baseline_output = torch_experts(
+            a, w1, w2, topk_weight, topk_ids
+        )  # only for baseline
         torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
         batched_output = naive_batched_moe(
-            a, w1, w2, topk_weight, topk_ids)  # pick torch_experts or this
+            a, w1, w2, topk_weight, topk_ids
+        )  # pick torch_experts or this
 
-    torch.testing.assert_close(baseline_output,
-                               torch_output,
-                               atol=2e-2,
-                               rtol=0)
-    torch.testing.assert_close(baseline_output,
-                               batched_output,
-                               atol=2e-2,
-                               rtol=0)
+    torch.testing.assert_close(baseline_output, torch_output, atol=2e-2, rtol=0)
+    torch.testing.assert_close(baseline_output, batched_output, atol=2e-2, rtol=0)
 
 
 def create_pplx_prepare_finalize(
@@ -211,13 +212,15 @@ def create_pplx_prepare_finalize(
     dp_size: int,
     world_size: int,
     in_dtype: torch.dtype,
-    quant_dtype: Optional[torch.dtype],
-    block_shape: Optional[list[int]],
+    quant_dtype: torch.dtype | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    group_name: Optional[str],
+    group_name: str | None,
 ):
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
+        PplxPrepareAndFinalize,
+        pplx_hidden_dim_scale_bytes,
+    )
 
     max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
     num_local_experts = rank_chunk(num_experts, 0, world_size)
@@ -266,28 +269,25 @@ def rank_chunk(num: int, r: int, w: int) -> int:
 
 def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
     chunk = rank_chunk(t.shape[0], r, w)
-    return t[(r * chunk):(r + 1) * chunk]
+    return t[(r * chunk) : (r + 1) * chunk]
 
 
-def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
-                        w: int) -> Optional[torch.Tensor]:
+def maybe_chunk_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
     if t is not None:
         return chunk_by_rank(t, r, w)
     else:
         return t
 
 
-def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
-                         w: int) -> Optional[torch.Tensor]:
+def chunk_scales_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
     if t is not None and t.numel() > 1:
         chunk = rank_chunk(t.shape[0], r, w)
-        return t[(r * chunk):(r + 1) * chunk]
+        return t[(r * chunk) : (r + 1) * chunk]
     else:
         return t
 
 
-def chunk_scales(t: Optional[torch.Tensor], start: int,
-                 end: int) -> Optional[torch.Tensor]:
+def chunk_scales(t: torch.Tensor | None, start: int, end: int) -> torch.Tensor | None:
     if t is not None and t.numel() > 1:
         return t[start:end]
     else:
@@ -305,10 +305,10 @@ def pplx_prepare_finalize(
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
     num_experts: int,
-    quant_dtype: Optional[torch.dtype],
-    block_shape: Optional[list[int]],
+    quant_dtype: torch.dtype | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    group_name: Optional[str],
+    group_name: str | None,
 ) -> torch.Tensor:
     assert torch.cuda.current_device() == pgi.local_rank
 
@@ -350,8 +350,7 @@ def pplx_prepare_finalize(
         device=device,
     )
 
-    if (quant_dtype is not None and not per_act_token_quant
-            and block_shape is None):
+    if quant_dtype is not None and not per_act_token_quant and block_shape is None:
         a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
         a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     else:
@@ -375,8 +374,7 @@ def pplx_prepare_finalize(
         ),
     )
 
-    b_a = dummy_work(
-        dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
+    b_a = dummy_work(dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
 
     prepare_finalize.finalize(
         out,
@@ -403,22 +401,24 @@ def _pplx_prepare_finalize(
     score: torch.Tensor,
     topk: torch.Tensor,
     num_experts: int,
-    quant_dtype: Optional[torch.dtype],
-    block_shape: Optional[list[int]],
+    quant_dtype: torch.dtype | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
     use_internode: bool,
 ):
     try:
         if use_internode:
-            uid = nvshmem_get_unique_id(
-            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            uid = (
+                nvshmem_get_unique_id()
+                if pgi.rank == 0
+                else nvshmem_alloc_empty_unique_id()
+            )
             torch.distributed.broadcast(uid, src=0)
             nvshmem_init(uid, pgi.rank, pgi.world_size)
             group_name = None
         else:
             group_ranks = list(range(pgi.world_size))
-            cpu_group = torch.distributed.new_group(group_ranks,
-                                                    backend="gloo")
+            cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
             group_name = cpu_group.group_name
 
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
@@ -426,22 +426,28 @@ def _pplx_prepare_finalize(
 
         a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
 
-        torch_output = (a_rep.view(m, topk, k) *
-                        topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
-                            dim=1)
-
-        pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
-                                            topk_ids, num_experts, quant_dtype,
-                                            block_shape, per_act_token_quant,
-                                            group_name)
+        torch_output = (
+            a_rep.view(m, topk, k) * topk_weight.view(m, topk, 1).to(a_rep.dtype)
+        ).sum(dim=1)
+
+        pplx_output = pplx_prepare_finalize(
+            pgi,
+            dp_size,
+            a,
+            topk_weight,
+            topk_ids,
+            num_experts,
+            quant_dtype,
+            block_shape,
+            per_act_token_quant,
+            group_name,
+        )
 
-        torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                     pgi.world_size).to(pgi.device)
+        torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
+            pgi.device
+        )
 
-        torch.testing.assert_close(pplx_output,
-                                   torch_output,
-                                   atol=3e-2,
-                                   rtol=3e-2)
+        torch.testing.assert_close(pplx_output, torch_output, atol=3e-2, rtol=3e-2)
     finally:
         if use_internode:
             nvshmem_finalize()
@@ -465,7 +471,7 @@ def test_pplx_prepare_finalize_slow(
     dtype: torch.dtype,
     world_dp_size: tuple[int, int],
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     use_internode: bool,
 ):
     if dtype == torch.float8_e4m3fn:
@@ -491,13 +497,23 @@ def test_pplx_prepare_finalize_slow(
     a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
     score = torch.randn((m, e), device=device, dtype=act_dtype)
 
-    parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
-                    topk, e, quant_dtype, block_shape, per_act_token_quant,
-                    use_internode)
+    parallel_launch(
+        world_size,
+        _pplx_prepare_finalize,
+        dp_size,
+        a,
+        score,
+        topk,
+        e,
+        quant_dtype,
+        block_shape,
+        per_act_token_quant,
+        use_internode,
+    )
 
 
 def pplx_moe(
-    group_name: Optional[str],
+    group_name: str | None,
     rank: int,
     world_size: int,
     dp_size: int,
@@ -506,18 +522,17 @@ def pplx_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant=False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     use_compile: bool = False,
     use_cudagraphs: bool = True,
-    shared_experts: Optional[torch.nn.Module] = None,
-) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-
+    shared_experts: torch.nn.Module | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
     num_tokens, hidden_dim = a.shape
     num_experts = w1.shape[0]
     topk = topk_ids.shape[1]
@@ -579,21 +594,23 @@ def pplx_moe(
     # large enough to trigger chunking. I'm leaving the flag and
     # setup code in case we are able to revisit this later.
     if use_compile:
-        _fused_experts = torch.compile(fused_experts,
-                                       backend='inductor',
-                                       fullgraph=True)
+        _fused_experts = torch.compile(
+            fused_experts, backend="inductor", fullgraph=True
+        )
         torch._dynamo.mark_dynamic(a_chunk, 0)
         torch._dynamo.mark_dynamic(chunk_topk_weight, 0)
         torch._dynamo.mark_dynamic(chunk_topk_ids, 0)
     else:
         _fused_experts = fused_experts
 
-    out = _fused_experts(a_chunk,
-                         w1_chunk,
-                         w2_chunk,
-                         chunk_topk_weight,
-                         chunk_topk_ids,
-                         global_num_experts=num_experts)
+    out = _fused_experts(
+        a_chunk,
+        w1_chunk,
+        w2_chunk,
+        chunk_topk_weight,
+        chunk_topk_ids,
+        global_num_experts=num_experts,
+    )
 
     if use_cudagraphs:
         if isinstance(out, tuple):
@@ -604,12 +621,14 @@ def pplx_moe(
         stream = torch.cuda.Stream()
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph, stream=stream):
-            out = _fused_experts(a_chunk,
-                                 w1_chunk,
-                                 w2_chunk,
-                                 chunk_topk_weight,
-                                 chunk_topk_ids,
-                                 global_num_experts=num_experts)
+            out = _fused_experts(
+                a_chunk,
+                w1_chunk,
+                w2_chunk,
+                chunk_topk_weight,
+                chunk_topk_ids,
+                global_num_experts=num_experts,
+            )
 
         torch.cuda.synchronize()
         graph.replay()
@@ -630,25 +649,27 @@ def _pplx_moe(
     score: torch.Tensor,
     topk: int,
     num_experts: int,
-    w1_s: Optional[torch.Tensor] = None,
-    w2_s: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_s: torch.Tensor | None = None,
+    w2_s: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     use_internode: bool = False,
-    shared_experts: Optional[torch.nn.Module] = None,
+    shared_experts: torch.nn.Module | None = None,
 ):
     try:
         if use_internode:
-            uid = nvshmem_get_unique_id(
-            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            uid = (
+                nvshmem_get_unique_id()
+                if pgi.rank == 0
+                else nvshmem_alloc_empty_unique_id()
+            )
             torch.distributed.broadcast(uid, src=0)
             nvshmem_init(uid, pgi.rank, pgi.world_size)
             group_name = None
         else:
             group_ranks = list(range(pgi.world_size))
-            cpu_group = torch.distributed.new_group(group_ranks,
-                                                    backend="gloo")
+            cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
             group_name = cpu_group.group_name
 
         m, k = a.shape
@@ -666,8 +687,7 @@ def _pplx_moe(
         w1_s = w1_s.to(device) if w1_s is not None else None
         w2_s = w2_s.to(device) if w2_s is not None else None
 
-        if (quant_dtype is not None and not per_act_token_quant
-                and block_shape is None):
+        if quant_dtype is not None and not per_act_token_quant and block_shape is None:
             a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
             a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
         else:
@@ -677,10 +697,7 @@ def _pplx_moe(
         with set_current_vllm_config(vllm_config), override_config(moe_config):
             topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
 
-            if shared_experts is not None:
-                shared_output = shared_experts(a)
-            else:
-                shared_output = None
+            shared_output = shared_experts(a) if shared_experts is not None else None
 
             torch_output = torch_experts(
                 a,
@@ -742,31 +759,27 @@ def _pplx_moe(
         if shared_output is not None:
             assert pplx_shared_output is not None
             chunked_shared_output = chunk_by_rank(
-                shared_output, pgi.rank,
-                pgi.world_size).to(pplx_shared_output.device)
+                shared_output, pgi.rank, pgi.world_size
+            ).to(pplx_shared_output.device)
         else:
             chunked_shared_output = None
 
         chunked_batch_output = chunk_by_rank(
-            batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
+            batched_output, pgi.rank, pgi.world_size
+        ).to(pplx_output.device)
 
-        torch.testing.assert_close(batched_output,
-                                   torch_output,
-                                   atol=3e-2,
-                                   rtol=3e-2)
+        torch.testing.assert_close(batched_output, torch_output, atol=3e-2, rtol=3e-2)
 
-        torch.testing.assert_close(pplx_output,
-                                   chunked_batch_output,
-                                   atol=3e-2,
-                                   rtol=3e-2)
+        torch.testing.assert_close(
+            pplx_output, chunked_batch_output, atol=3e-2, rtol=3e-2
+        )
 
         if shared_experts is not None:
             assert chunked_shared_output is not None
             assert pplx_shared_output is not None
-            torch.testing.assert_close(pplx_shared_output,
-                                       chunked_shared_output,
-                                       atol=3e-2,
-                                       rtol=3e-2)
+            torch.testing.assert_close(
+                pplx_shared_output, chunked_shared_output, atol=3e-2, rtol=3e-2
+            )
 
     finally:
         if use_internode:
@@ -791,7 +804,7 @@ def test_pplx_moe_slow(
     dtype: torch.dtype,
     world_dp_size: tuple[int, int],
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     use_internode: bool,
 ):
     current_platform.seed_everything(7)
@@ -823,15 +836,33 @@ def test_pplx_moe_slow(
         per_out_ch_quant=per_act_token_quant,
     )
 
-    parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
-                    w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
-                    use_internode)
-
+    parallel_launch(
+        world_size,
+        _pplx_moe,
+        dp_size,
+        a,
+        w1,
+        w2,
+        score,
+        topk,
+        e,
+        w1_s,
+        w2_s,
+        quant_dtype,
+        per_act_token_quant,
+        block_shape,
+        use_internode,
+    )
 
-def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
-                    use_shared_experts: bool, make_weights: bool,
-                    test_fn: Callable):
 
+def _pplx_test_loop(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    use_internode: bool,
+    use_shared_experts: bool,
+    make_weights: bool,
+    test_fn: Callable,
+):
     def format_result(msg, ex=None):
         if ex is not None:
             x = str(ex)
@@ -850,12 +881,12 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
         new_vllm_config = copy.deepcopy(vllm_config)
         new_vllm_config.parallel_config.data_parallel_size = pgi.world_size
         new_vllm_config.parallel_config.enable_expert_parallel = True
-        _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank,
-                         pgi.local_rank)
+        _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
 
     current_platform.seed_everything(7)
-    combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
-                               [False, True], [None, [128, 128]])
+    combos = itertools.product(
+        PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
+    )
     exceptions = []
     count = 0
     for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
@@ -873,13 +904,11 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
             f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
             f"dtype={dtype}, per_act_token={per_act_token_quant}, "
             f"block_shape={block_shape}, use_internode={use_internode}, "
-            f"use_shared_experts={use_shared_experts}")
+            f"use_shared_experts={use_shared_experts}"
+        )
 
-        if not use_fp8_w8a8 and (per_act_token_quant
-                                 or block_shape is not None):
-            print(
-                f"{test_desc} - Skip quantization test for non-quantized type."
-            )
+        if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
+            print(f"{test_desc} - Skip quantization test for non-quantized type.")
             continue
 
         if per_act_token_quant and block_shape is not None:
@@ -934,10 +963,10 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
     if len(exceptions) > 0:
         raise RuntimeError(
             f"{len(exceptions)} of {count} tests failed in child process, "
-            f"rank={pgi.rank}.")
+            f"rank={pgi.rank}."
+        )
     else:
-        print(f"{count} of {count} tests passed in child process, "
-              f"rank={pgi.rank}.")
+        print(f"{count} of {count} tests passed in child process, rank={pgi.rank}.")
 
 
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@@ -950,8 +979,15 @@ def test_pplx_prepare_finalize(
 ):
     current_platform.seed_everything(7)
     world_size, dp_size = world_dp_size
-    parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
-                    use_internode, False, False, _pplx_prepare_finalize)
+    parallel_launch(
+        world_size * dp_size,
+        _pplx_test_loop,
+        dp_size,
+        use_internode,
+        False,
+        False,
+        _pplx_prepare_finalize,
+    )
 
 
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@@ -966,5 +1002,12 @@ def test_pplx_moe(
 ):
     current_platform.seed_everything(7)
     world_size, dp_size = world_dp_size
-    parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode,
-                    use_shared_experts, True, _pplx_moe)
+    parallel_launch(
+        world_size,
+        _pplx_test_loop,
+        dp_size,
+        use_internode,
+        use_shared_experts,
+        True,
+        _pplx_moe,
+    )
diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py
index 1c51c530c193c12470913f1272d37dc3fad1f722..d4724d749fc98c35ab8c99c2592939d1597e47b4 100644
--- a/tests/kernels/moe/test_rocm_aiter_topk.py
+++ b/tests/kernels/moe/test_rocm_aiter_topk.py
@@ -24,13 +24,14 @@ aiter_available = importlib.util.find_spec("aiter") is not None
 
 pytestmark = pytest.mark.skipif(
     not (current_platform.is_rocm() and aiter_available),
-    reason="AITER ops are only available on ROCm with aiter package installed")
+    reason="AITER ops are only available on ROCm with aiter package installed",
+)
 
 
 def test_rocm_aiter_biased_grouped_topk_custom_op_registration():
     """Test that the custom op is correctly registered."""
     # Check if the op exists in torch.ops.vllm
-    assert hasattr(torch.ops.vllm, 'rocm_aiter_biased_grouped_topk')
+    assert hasattr(torch.ops.vllm, "rocm_aiter_biased_grouped_topk")
 
     # Check if the op is callable
     assert callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk)
@@ -39,7 +40,7 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration():
 def test_rocm_aiter_grouped_topk_custom_op_registration():
     """Test that the custom op is correctly registered."""
     # Check if the op exists in torch.ops.vllm
-    assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk')
+    assert hasattr(torch.ops.vllm, "rocm_aiter_grouped_topk")
 
     # Check if the op is callable
     assert callable(torch.ops.vllm.rocm_aiter_grouped_topk)
@@ -56,25 +57,29 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
     renormalize = True
     scale_factor = 1.0
 
-    gating_output = torch.randn((token, expert),
-                                dtype=torch.bfloat16,
-                                device="cuda")
-    e_score_correction_bias = torch.randn((expert, ),
-                                          dtype=torch.bfloat16,
-                                          device="cuda")
+    gating_output = torch.randn((token, expert), dtype=torch.bfloat16, device="cuda")
+    e_score_correction_bias = torch.randn(
+        (expert,), dtype=torch.bfloat16, device="cuda"
+    )
 
     device = gating_output.device
     topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
-    topk_weights = torch.empty((token, topk),
-                               dtype=torch.float32,
-                               device=device)
+    topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
 
     # Define a function that uses the op
-    def biased_grouped_topk_fn(gating_output, e_score_correction_bias,
-                               topk_weights, topk_ids):
+    def biased_grouped_topk_fn(
+        gating_output, e_score_correction_bias, topk_weights, topk_ids
+    ):
         return torch.ops.vllm.rocm_aiter_biased_grouped_topk(
-            gating_output, e_score_correction_bias, topk_weights, topk_ids,
-            num_expert_group, topk_group, renormalize, scale_factor)
+            gating_output,
+            e_score_correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            scale_factor,
+        )
 
     # Verify the op's fake implementation
     torch.library.opcheck(
@@ -84,51 +89,49 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
             "num_expert_group": num_expert_group,
             "topk_group": topk_group,
             "need_renorm": renormalize,
-            "routed_scaling_factor": scale_factor
+            "routed_scaling_factor": scale_factor,
         },
-        test_utils=("test_faketensor"))
+        test_utils=("test_faketensor"),
+    )
 
     # Compile the function with appropriate settings
-    compiled_fn = torch.compile(biased_grouped_topk_fn,
-                                fullgraph=True,
-                                backend="inductor",
-                                mode="reduce-overhead",
-                                dynamic=False)
-
-    topk_weights_original = torch.empty((token, topk),
-                                        dtype=torch.float32,
-                                        device=device)
-    topk_ids_original = torch.empty((token, topk),
-                                    dtype=torch.int32,
-                                    device=device)
-
-    topk_weights_compiled = torch.empty((token, topk),
-                                        dtype=torch.float32,
-                                        device=device)
-    topk_ids_compiled = torch.empty((token, topk),
-                                    dtype=torch.int32,
-                                    device=device)
+    compiled_fn = torch.compile(
+        biased_grouped_topk_fn,
+        fullgraph=True,
+        backend="inductor",
+        mode="reduce-overhead",
+        dynamic=False,
+    )
+
+    topk_weights_original = torch.empty(
+        (token, topk), dtype=torch.float32, device=device
+    )
+    topk_ids_original = torch.empty((token, topk), dtype=torch.int32, device=device)
+
+    topk_weights_compiled = torch.empty(
+        (token, topk), dtype=torch.float32, device=device
+    )
+    topk_ids_compiled = torch.empty((token, topk), dtype=torch.int32, device=device)
 
     # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
-    biased_grouped_topk_fn(gating_output, e_score_correction_bias,
-                           topk_weights_original, topk_ids_original)
-    compiled_fn(gating_output, e_score_correction_bias, topk_weights_compiled,
-                topk_ids_compiled)
+    biased_grouped_topk_fn(
+        gating_output, e_score_correction_bias, topk_weights_original, topk_ids_original
+    )
+    compiled_fn(
+        gating_output, e_score_correction_bias, topk_weights_compiled, topk_ids_compiled
+    )
 
     # Sort the results for comparison since the order might not be deterministic
     topk_ids_original, indices_original = torch.sort(topk_ids_original)
-    topk_weights_original = torch.gather(topk_weights_original, 1,
-                                         indices_original)
+    topk_weights_original = torch.gather(topk_weights_original, 1, indices_original)
 
     topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled)
-    topk_weights_compiled = torch.gather(topk_weights_compiled, 1,
-                                         indices_compiled)
+    topk_weights_compiled = torch.gather(topk_weights_compiled, 1, indices_compiled)
 
     # Verify results match
-    assert torch.allclose(topk_weights_original,
-                          topk_weights_compiled,
-                          rtol=1e-2,
-                          atol=1e-2)
+    assert torch.allclose(
+        topk_weights_original, topk_weights_compiled, rtol=1e-2, atol=1e-2
+    )
     assert torch.allclose(topk_ids_original, topk_ids_compiled)
 
 
@@ -144,73 +147,73 @@ def test_rocm_aiter_grouped_topk_torch_compile_compatibility():
     scoring_func = "softmax"
     scale_factor = 1.0
 
-    gating_output = torch.randn((token, expert),
-                                dtype=torch.bfloat16,
-                                device="cuda")
+    gating_output = torch.randn((token, expert), dtype=torch.bfloat16, device="cuda")
 
     device = gating_output.device
     topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
-    topk_weights = torch.empty((token, topk),
-                               dtype=torch.float32,
-                               device=device)
+    topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
 
     # Define a function that uses the op
     def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func):
         return torch.ops.vllm.rocm_aiter_grouped_topk(
-            gating_output, topk_weights, topk_ids, num_expert_group,
-            topk_group, renormalize, scoring_func, scale_factor)
+            gating_output,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            scoring_func,
+            scale_factor,
+        )
 
     # Verify the op's fake implementation
-    torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk,
-                          (gating_output, topk_weights, topk_ids),
-                          kwargs={
-                              "num_expert_group": num_expert_group,
-                              "topk_group": topk_group,
-                              "need_renorm": renormalize,
-                              "scoring_func": scoring_func,
-                              "routed_scaling_factor": scale_factor
-                          },
-                          test_utils=("test_faketensor"))
+    torch.library.opcheck(
+        torch.ops.vllm.rocm_aiter_grouped_topk,
+        (gating_output, topk_weights, topk_ids),
+        kwargs={
+            "num_expert_group": num_expert_group,
+            "topk_group": topk_group,
+            "need_renorm": renormalize,
+            "scoring_func": scoring_func,
+            "routed_scaling_factor": scale_factor,
+        },
+        test_utils=("test_faketensor"),
+    )
 
     # Compile the function with appropriate settings
-    compiled_fn = torch.compile(grouped_topk_fn,
-                                fullgraph=True,
-                                backend="inductor",
-                                mode="reduce-overhead",
-                                dynamic=False)
-
-    topk_weights_original = torch.empty((token, topk),
-                                        dtype=torch.float32,
-                                        device=device)
-    topk_ids_original = torch.empty((token, topk),
-                                    dtype=torch.int32,
-                                    device=device)
-
-    topk_weights_compiled = torch.empty((token, topk),
-                                        dtype=torch.float32,
-                                        device=device)
-    topk_ids_compiled = torch.empty((token, topk),
-                                    dtype=torch.int32,
-                                    device=device)
+    compiled_fn = torch.compile(
+        grouped_topk_fn,
+        fullgraph=True,
+        backend="inductor",
+        mode="reduce-overhead",
+        dynamic=False,
+    )
+
+    topk_weights_original = torch.empty(
+        (token, topk), dtype=torch.float32, device=device
+    )
+    topk_ids_original = torch.empty((token, topk), dtype=torch.int32, device=device)
+
+    topk_weights_compiled = torch.empty(
+        (token, topk), dtype=torch.float32, device=device
+    )
+    topk_ids_compiled = torch.empty((token, topk), dtype=torch.int32, device=device)
 
     # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
-    grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original,
-                    scoring_func)
-    compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled,
-                scoring_func)
+    grouped_topk_fn(
+        gating_output, topk_weights_original, topk_ids_original, scoring_func
+    )
+    compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled, scoring_func)
 
     # Sort the results for comparison since the order might not be deterministic
     topk_ids_original, indices_original = torch.sort(topk_ids_original)
-    topk_weights_original = torch.gather(topk_weights_original, 1,
-                                         indices_original)
+    topk_weights_original = torch.gather(topk_weights_original, 1, indices_original)
 
     topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled)
-    topk_weights_compiled = torch.gather(topk_weights_compiled, 1,
-                                         indices_compiled)
+    topk_weights_compiled = torch.gather(topk_weights_compiled, 1, indices_compiled)
 
     # Verify results match
-    assert torch.allclose(topk_weights_original,
-                          topk_weights_compiled,
-                          rtol=1e-2,
-                          atol=1e-2)
+    assert torch.allclose(
+        topk_weights_original, topk_weights_compiled, rtol=1e-2, atol=1e-2
+    )
     assert torch.allclose(topk_ids_original, topk_ids_compiled)
diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
index 383b5ebfba9b72402e8d72404ae513d8861903ed..d6b78dd2c2323a8c62fa329cdbfdeb40f1d95926 100644
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -1,121 +1,287 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+
+import random
+
 import pytest
 import torch
 
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm_cuda)
+    persistent_masked_m_silu_mul_quant,
+)
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
+from vllm.utils.math_utils import cdiv, round_up
 
 fp8_dtype = torch.float8_e4m3fn
 
 CASES = [
     (1, 1, 128, fp8_dtype),
-    (1, 4, 128, fp8_dtype),
-    (2, 4, 256, fp8_dtype),
-    (32, 64, 256, fp8_dtype),
-    (17, 31, 768, fp8_dtype),
-    (1, 1, 128 * 1, fp8_dtype),
-    (1, 1, 128 * 2, fp8_dtype),
-    (1, 1, 128 * 3, fp8_dtype),
-    (1, 1, 128 * 4, fp8_dtype),
-    (8, 16, 128 * 1, fp8_dtype),
-    (8, 16, 128 * 2, fp8_dtype),
-    (8, 16, 128 * 3, fp8_dtype),
+    (1, 4, 128 * 1, fp8_dtype),
+    (2, 4, 128 * 2, fp8_dtype),
+    (1, 4, 128 * 3, fp8_dtype),
     (8, 16, 128 * 4, fp8_dtype),
+    (8, 16, 128 * 5, fp8_dtype),
+    (8, 16, 128 * 6, fp8_dtype),
+    (8, 16, 128 * 7, fp8_dtype),
+    (8, 16, 128 * 8, fp8_dtype),
+    (8, 16, 128 * 9, fp8_dtype),
     (8, 64, 7168, fp8_dtype),
+    (8, 128, 128 * 33, fp8_dtype),
+    (1, 4, 128 * 10, fp8_dtype),
     (8, 128, 7168, fp8_dtype),
-    (8, 256, 7168, fp8_dtype),
     (8, 512, 7168, fp8_dtype),
     (8, 1024, 7168, fp8_dtype),
+    (17, 31, 768, fp8_dtype),
+    (32, 64, 256, fp8_dtype),
     (256, 8, 7168, fp8_dtype),
-    (256, 16, 7168, fp8_dtype),
     (256, 32, 7168, fp8_dtype),
     (256, 64, 7168, fp8_dtype),
-
     # Only add a few fnuz tests to help with long CI times.
     (8, 512, 7168, torch.float8_e4m3fnuz),
     (8, 1024, 7168, torch.float8_e4m3fnuz),
 ]
 
 
+def as_uint8(x) -> torch.Tensor:
+    return (
+        torch.empty(x.shape, dtype=x.dtype, device=x.device).copy_(x).view(torch.uint8)
+    )
+
+
+def silu(x: torch.Tensor) -> torch.Tensor:
+    one_f32 = torch.tensor([1.0], device=x.device, dtype=torch.float32)
+    x_f32 = x.to(torch.float32)
+    act_f32 = x_f32 / (one_f32 + torch.exp(-x_f32))
+    assert act_f32.dtype == torch.float32
+    return act_f32.to(torch.bfloat16)
+
+
+def do_quant(x: torch.Tensor, group_size: int, ceil_ue8m0: bool):
+    eps_bf16 = torch.tensor([1e-10], device=x.device, dtype=torch.bfloat16)
+    one_bf16 = torch.tensor([1.0], device=x.device, dtype=torch.bfloat16)
+    fp8_max_bf16 = torch.tensor(
+        [torch.finfo(fp8_dtype).max], device=x.device, dtype=torch.bfloat16
+    )
+    fp8_min_bf16 = torch.tensor(
+        [torch.finfo(fp8_dtype).min], device=x.device, dtype=torch.bfloat16
+    )
+    fp8_max_inv = one_bf16 / fp8_max_bf16
+    assert fp8_max_inv.dtype == torch.bfloat16
+
+    assert x.size(-1) % group_size == 0
+    num_groups = x.numel() // group_size
+    x_og_shape = x.shape
+
+    x = x.to(torch.bfloat16)
+    x = x.view((-1, group_size))
+    amax = x.abs().amax(dim=1).clamp(min=eps_bf16)
+    assert amax.dtype == torch.bfloat16
+    s = amax * fp8_max_inv
+
+    if ceil_ue8m0:
+        s = torch.exp2(
+            torch.ceil(torch.log2(s).to(torch.bfloat16)).to(torch.bfloat16)
+        ).to(torch.bfloat16)
+
+    inv_s = one_bf16 / s
+    inv_s = inv_s.view((num_groups, 1))
+    xq = torch.clamp(x * inv_s, min=fp8_min_bf16.item(), max=fp8_max_bf16.item()).to(
+        fp8_dtype
+    )
+
+    xq = xq.view(x_og_shape)
+    xs = s.view((-1, xq.size(-1) // group_size))
+    return xq, xs
+
+
+def silu_mul_quant(
+    gate: torch.Tensor, up: torch.Tensor, group_size: int, ceil_ue8m0: bool
+) -> tuple[torch.Tensor, torch.Tensor]:
+    assert gate.size(-1) % group_size == 0
+    assert up.size(-1) % group_size == 0
+
+    assert gate.dtype == torch.bfloat16
+    assert up.dtype == torch.bfloat16
+
+    act_bf16 = silu(gate)
+    assert act_bf16.dtype == torch.bfloat16
+
+    # act & mul
+    a_m = act_bf16 * up
+    assert a_m.dtype == torch.bfloat16
+
+    q, s = do_quant(a_m, group_size, ceil_ue8m0)
+    return q, s
+
+
+def pack_scales(x: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor:
+    """
+    pack float32 scales into a int32 tensor
+    """
+    assert x.dtype == torch.float32
+    E, T, G = x.size()
+
+    # Add i32_padding here so we can view it as a i32 tensor later on.
+    i32_padding = round_up(G, 4) - G
+    ref_s_i8 = torch.empty((E, T, G + i32_padding), dtype=torch.uint8, device="cuda")
+    for e in range(E):
+        nt = tokens_per_expert[e].item()
+        ref_s_i8[e, :nt, :G] = x[e, :nt].view(torch.int32) >> 23
+
+    ref_s_i32 = ref_s_i8.view(torch.int32)
+
+    return ref_s_i32
+
+
+def ref_with_scale_fmt(
+    E: int,
+    T: int,
+    H: int,
+    group_size: int,
+    tokens_per_expert: torch.Tensor,
+    gate: torch.Tensor,
+    up: torch.Tensor,
+    scale_fmt: DeepGemmQuantScaleFMT,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    The precision types of the operations triggered by this function
+    match closely with the kernel implementation so we compare more
+    accurately.
+    """
+    scale_dtype = (
+        torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
+    )
+    ceil_ue8m0 = scale_fmt in [
+        DeepGemmQuantScaleFMT.UE8M0,
+        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+    ]
+
+    ref_q = torch.empty((E, T, H), dtype=fp8_dtype, device="cuda")
+    ref_s_f32 = torch.empty(
+        (E, T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
+    )
+
+    for e in range(E):
+        nt = tokens_per_expert[e].item()
+        if nt == 0:
+            continue
+        ref_q[e, :nt], ref_s_f32[e, :nt] = silu_mul_quant(
+            gate[e, :nt], up[e, :nt], group_size, ceil_ue8m0=ceil_ue8m0
+        )
+
+    if scale_dtype == torch.float32:
+        return ref_q, ref_s_f32
+
+    assert scale_dtype == torch.int32
+    return ref_q, pack_scales(ref_s_f32, tokens_per_expert)
+
+
+def token_random(E, T, H2, tokens_per_expert):
+    """
+    Initialize each token in a random range so we test a range of
+    scale values.
+    """
+    y = torch.empty((E, T, H2), dtype=torch.bfloat16, device="cuda")
+    for e in range(E):
+        for t in range(tokens_per_expert[e].item()):
+            exp = random.choice(range(1, 20))
+            y[e, t].uniform_(-(2**exp), 2**exp)
+    return y
+
+
 @pytest.mark.parametrize("E,T,H,fp8_type", CASES)
 @torch.inference_mode()
-def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
+def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
     group_size = 128
     current_platform.seed_everything(42)
 
-    # Input tensor of shape (E, T, 2*H)
-    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
     tokens_per_expert = torch.randint(
-        low=T // 2,
+        low=0,
         high=T,
-        size=(E, ),
+        size=(E,),
         dtype=torch.int32,
         device="cuda",
     )
 
-    # Run the Triton kernel
-    y_q, y_s = silu_mul_fp8_quant_deep_gemm_cuda(y,
-                                                 tokens_per_expert,
-                                                 group_size=group_size)
+    # Input tensor of shape (E, T, 2*H)
+    y = token_random(E, T, 2 * H, tokens_per_expert)
+
+    gate = y[..., :H].to(torch.bfloat16)
+    up = y[..., H:].to(torch.bfloat16)
 
-    torch.cuda.synchronize()
-    fp8_info = torch.finfo(fp8_dtype)
-    fp8_max = fp8_info.max
-    fp8_min = fp8_info.min
-    eps = 1e-10
+    scale_fmts = [
+        DeepGemmQuantScaleFMT.FLOAT32,
+        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+        DeepGemmQuantScaleFMT.UE8M0,
+    ]
 
-    y1 = y[..., :H].float()
-    y2 = y[..., H:]
-    silu_x = y1 * torch.sigmoid(y1)
-    merged = silu_x * y2
+    # Run the SiLU V2 kernel
+    for scale_fmt in scale_fmts:
+        y_q, y_s = persistent_masked_m_silu_mul_quant(
+            y,
+            tokens_per_expert,
+            group_size=group_size,
+            quant_scale_fmt=scale_fmt,
+        )
 
-    for e in range(E):
-        nt = tokens_per_expert[e].item()
-        ref_s = torch.empty((T, cdiv(H, group_size)),
-                            dtype=torch.float32,
-                            device="cuda")
-        ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda")
-
-        for t in range(nt):
-            data = merged[e, t].float()
-            ref_q_row = torch.empty_like(data)
-
-            # process full groups
-            n_full_groups = H // group_size
-            if n_full_groups > 0:
-                data_grp = data[:n_full_groups * group_size].view(
-                    n_full_groups, group_size)
-                amax = data_grp.abs().amax(dim=1).clamp(min=eps)
-                scale = amax / fp8_max
-                scaled = data[:n_full_groups *
-                              group_size] / scale.repeat_interleave(group_size)
-                ref_q_row[:n_full_groups * group_size] = scaled.clamp(
-                    fp8_min, fp8_max).to(fp8_dtype)
-                ref_s[t, :n_full_groups] = scale
-
-            # process remainder group
-            rem = H % group_size
-            if rem > 0:
-                data_rem = data[-rem:]
-                amax = data_rem.abs().amax().clamp(min=eps)
-                scale = amax / fp8_max
-                scaled = data_rem / scale
-                ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype)
-                ref_s[t, -1] = scale
-
-            ref_q[t] = ref_q_row
-
-        y_se = y_s[e].float()
-        y_qe = y_q[e].float()
-
-        torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
-        torch.testing.assert_close(
-            y_qe[:nt].to(torch.float32),
-            ref_q[:nt].to(torch.float32),
-            atol=2,
-            rtol=2e-1,
+        ref_y_q, ref_y_s = ref_with_scale_fmt(
+            E, T, H, group_size, tokens_per_expert, gate, up, scale_fmt=scale_fmt
         )
+
+        # deepgemm scales transform
+        dg_scales = None
+        if (
+            has_deep_gemm()
+            and current_platform.has_device_capability(100)
+            and scale_fmt == DeepGemmQuantScaleFMT.UE8M0
+        ):
+            from deep_gemm import transform_sf_into_required_layout
+
+            _q, _s = ref_with_scale_fmt(
+                E,
+                T,
+                H,
+                group_size,
+                tokens_per_expert,
+                gate,
+                up,
+                scale_fmt=DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+            )
+            dg_scales = transform_sf_into_required_layout(
+                sf=_s,
+                mn=_q.size(1),
+                k=_q.size(2),
+                recipe=(1, 128, 128),
+                num_groups=_q.size(0),
+                is_sfa=True,
+            )
+
+        expected_scale_dtype = (
+            torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
+        )
+        assert y_s.dtype == expected_scale_dtype
+        assert ref_y_s.dtype == expected_scale_dtype
+
+        for e in range(E):
+            nt = tokens_per_expert[e].item()
+
+            torch.testing.assert_close(
+                y_q[e, :nt].to(torch.float32),
+                ref_y_q[e, :nt].to(torch.float32),
+            )
+
+            if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
+                G = H // group_size
+                y_s_sliced = as_uint8(y_s[e])
+                ref_s_sliced = as_uint8(ref_y_s[e])
+                torch.testing.assert_close(y_s_sliced[:nt, :G], ref_s_sliced[:nt, :G])
+                if dg_scales is not None:
+                    dg_sliced = as_uint8(dg_scales[e])
+                    torch.testing.assert_close(y_s_sliced[:nt, :G], dg_sliced[:nt, :G])
+            else:
+                torch.testing.assert_close(
+                    y_s[e, :nt],
+                    ref_y_s[e, :nt],
+                )
diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
index 1c31464b30e7f21cecdfa8a93de036dba6b3eab7..7a467e160b78408904fdf52f8fc9fae5ae39d040 100644
--- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
+++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
@@ -11,17 +11,13 @@ from tests.kernels.moe.utils import fused_moe
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config)
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.platforms import current_platform
 
 if current_platform.get_device_capability() < (9, 0):
-    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
-                allow_module_level=True)
+    pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
@@ -31,14 +27,13 @@ def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
     B = B.to(torch.float32)
 
     assert A.shape[-1] == B.shape[-1], "Dimension mismatch"
-    assert B.ndim == 2 and B.is_contiguous(
-    ), "B must be a 2D contiguous tensor"
+    assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor"
 
     # Reshape input
     M = A.numel() // A.shape[-1]
     B = B.t()  # Transpose weight matrix
     N, K = B.shape
-    origin_C_shape = A.shape[:-1] + (K, )
+    origin_C_shape = A.shape[:-1] + (K,)
     A = A.reshape(M, N)
 
     # As is per-token [M, 1], Bs is per-column [1, K]
@@ -88,17 +83,17 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
             act_out = SiluAndMul().forward_native(inter_out)
             # Quantize activation output with per-token
             act_out_q, act_out_s = ops.scaled_fp8_quant(
-                act_out, use_per_token_if_dynamic=True)
+                act_out, use_per_token_if_dynamic=True
+            )
 
             # Second MLP layer
-            out[mask] = native_w8a8_per_token_matmul(act_out_q,
-                                                     w2[i],
-                                                     act_out_s,
-                                                     w2_s[i],
-                                                     output_dtype=a.dtype)
+            out[mask] = native_w8a8_per_token_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], output_dtype=a.dtype
+            )
     # Apply routing weights and sum
-    return (out.view(B, -1, w2.shape[1]) *
-            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
 
 
 @pytest.fixture(autouse=True, scope="module")
@@ -116,8 +111,10 @@ TOP_KS = [2, 6]
 SEEDS = [0]
 
 
-@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed",
-                         itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS))
+@pytest.mark.parametrize(
+    "M, N, K, E, topk, dtype, seed",
+    itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
+)
 @torch.inference_mode()
 def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
     torch.manual_seed(seed)
@@ -133,12 +130,10 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
 
     # Generate int8 weights
     w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
-    w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min,
-                                   max=fp8_max).to(torch.float8_e4m3fn)
+    w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
 
     w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
-    w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min,
-                                   max=fp8_max).to(torch.float8_e4m3fn)
+    w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
 
     # Generate scale for each column (per-column quantization)
     w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale
@@ -163,7 +158,7 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
         )
 
     # Check results
-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.05
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index 7a0feb6a2079536bae38088f4ce222ee9b2a1848..c7e6c4240e8536dcde0e6e8ea393b6f0ff9f68ee 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -1,24 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
 
 import torch
 
 import vllm._custom_ops as ops
 from tests.kernels.quant_utils import per_block_cast_to_int8
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX)
+from tests.kernels.quantization.nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
-from vllm.utils import round_up
+    BatchedPrepareAndFinalize,
+    BatchedTritonExperts,
+    NaiveBatchedExperts,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils.deep_gemm import per_block_cast_to_fp8
+from vllm.utils.math_utils import round_up
 
 
 def triton_moe(
@@ -27,13 +26,13 @@ def triton_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant=False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> torch.Tensor:
     quant_config = FusedMoEQuantConfig.make(
         quant_dtype,
@@ -45,12 +44,7 @@ def triton_moe(
         a2_scale=a2_scale,
     )
 
-    return fused_experts(a,
-                         w1,
-                         w2,
-                         topk_weight,
-                         topk_ids,
-                         quant_config=quant_config)
+    return fused_experts(a, w1, w2, topk_weight, topk_ids, quant_config=quant_config)
 
 
 def batched_moe(
@@ -59,13 +53,13 @@ def batched_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> torch.Tensor:
     max_num_tokens = round_up(a.shape[0], 64)
 
@@ -80,10 +74,9 @@ def batched_moe(
     )
 
     fused_experts = FusedMoEModularKernel(
-        BatchedPrepareAndFinalize(max_num_tokens,
-                                  num_dispatchers=1,
-                                  num_local_experts=w1.shape[0],
-                                  rank=0),
+        BatchedPrepareAndFinalize(
+            max_num_tokens, num_dispatchers=1, num_local_experts=w1.shape[0], rank=0
+        ),
         BatchedTritonExperts(
             max_num_tokens=max_num_tokens,
             num_dispatchers=1,
@@ -100,13 +93,13 @@ def naive_batched_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> torch.Tensor:
     max_num_tokens = round_up(a.shape[0], 64)
 
@@ -121,10 +114,9 @@ def naive_batched_moe(
     )
 
     fused_experts = FusedMoEModularKernel(
-        BatchedPrepareAndFinalize(max_num_tokens,
-                                  num_dispatchers=1,
-                                  num_local_experts=w1.shape[0],
-                                  rank=0),
+        BatchedPrepareAndFinalize(
+            max_num_tokens, num_dispatchers=1, num_local_experts=w1.shape[0], rank=0
+        ),
         NaiveBatchedExperts(
             max_num_tokens=max_num_tokens,
             num_dispatchers=1,
@@ -135,8 +127,9 @@ def naive_batched_moe(
     return fused_experts(a, w1, w2, topk_weight, topk_ids)
 
 
-def chunk_scales(scales: Optional[torch.Tensor], start: int,
-                 end: int) -> Optional[torch.Tensor]:
+def chunk_scales(
+    scales: torch.Tensor | None, start: int, end: int
+) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
             return scales
@@ -150,22 +143,24 @@ def make_quantized_test_activations(
     m: int,
     k: int,
     in_dtype: torch.dtype,
-    quant_dtype: Optional[torch.dtype] = None,
-    block_shape: Optional[list[int]] = None,
+    quant_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
     per_act_token_quant: bool = False,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     a = torch.randn((E, m, k), device="cuda", dtype=in_dtype) / 10
     a_q = a
     a_scale = None
 
     if quant_dtype is not None:
-        assert (quant_dtype == torch.float8_e4m3fn
-                or quant_dtype == torch.int8), "only fp8/int8 supported"
+        assert quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8, (
+            "only fp8/int8 supported"
+        )
         a_q = torch.zeros_like(a, dtype=quant_dtype)
         a_scale_l = [None] * E
         for e in range(E):
             a_q[e], a_scale_l[e] = moe_kernel_quantize_input(
-                a[e], None, quant_dtype, per_act_token_quant, block_shape)
+                a[e], None, quant_dtype, per_act_token_quant, block_shape
+            )
         a_scale = torch.stack(a_scale_l)
 
         if not per_act_token_quant and block_shape is None:
@@ -176,13 +171,16 @@ def make_quantized_test_activations(
 
 def moe_quantize_weights(
     w: torch.Tensor,
-    w_s: Optional[torch.Tensor],
-    quant_dtype: Union[torch.dtype, str, None],
+    w_s: torch.Tensor | None,
+    quant_dtype: torch.dtype | str | None,
     per_token_quant: bool,
-    block_shape: Optional[list[int]],
-) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
-    assert (quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8
-            or quant_dtype == "nvfp4"), "only fp8/int8/nvfp4 supported"
+    block_shape: list[int] | None,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+    assert (
+        quant_dtype == torch.float8_e4m3fn
+        or quant_dtype == torch.int8
+        or quant_dtype == "nvfp4"
+    ), "only fp8/int8/nvfp4 supported"
 
     w_gs = None
 
@@ -199,10 +197,12 @@ def moe_quantize_weights(
     else:
         if quant_dtype == torch.int8:
             w, w_s = ops.scaled_int8_quant(
-                w, w_s, use_per_token_if_dynamic=per_token_quant)
+                w, w_s, use_per_token_if_dynamic=per_token_quant
+            )
         elif quant_dtype == torch.float8_e4m3fn:
             w, w_s = ops.scaled_fp8_quant(
-                w, w_s, use_per_token_if_dynamic=per_token_quant)
+                w, w_s, use_per_token_if_dynamic=per_token_quant
+            )
         elif quant_dtype == "nvfp4":
             assert not per_token_quant
             w_amax = torch.abs(w).max().to(torch.float32)
@@ -219,11 +219,10 @@ def make_test_weight(
     rows: int,
     cols: int,
     in_dtype: torch.dtype = torch.bfloat16,
-    quant_dtype: Union[torch.dtype, str, None] = None,
-    block_shape: Optional[list[int]] = None,
+    quant_dtype: torch.dtype | str | None = None,
+    block_shape: list[int] | None = None,
     per_out_ch_quant: bool = False,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor],
-           Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
     w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15
     w_gs = None
 
@@ -233,7 +232,8 @@ def make_test_weight(
         w_gs_l = [None] * e
         for idx in range(e):
             w_l[idx], w_s_l[idx], w_gs_l[idx] = moe_quantize_weights(
-                w_16[idx], None, quant_dtype, per_out_ch_quant, block_shape)
+                w_16[idx], None, quant_dtype, per_out_ch_quant, block_shape
+            )
 
         w = torch.stack(w_l)
         w_s = torch.stack(w_s_l)
@@ -261,29 +261,28 @@ def make_test_weights(
     n: int,
     k: int,
     in_dtype: torch.dtype = torch.bfloat16,
-    quant_dtype: Union[torch.dtype, str, None] = None,
-    block_shape: Optional[list[int]] = None,
+    quant_dtype: torch.dtype | str | None = None,
+    block_shape: list[int] | None = None,
     per_out_ch_quant: bool = False,
-) -> tuple[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor],
-                 Optional[torch.Tensor]],
-           tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor],
-                 Optional[torch.Tensor]]]:
+) -> tuple[
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None],
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None],
+]:
     return (
-        make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape,
-                         per_out_ch_quant),
-        make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape,
-                         per_out_ch_quant),
+        make_test_weight(
+            e, 2 * n, k, in_dtype, quant_dtype, block_shape, per_out_ch_quant
+        ),
+        make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape, per_out_ch_quant),
     )
 
 
 def per_token_cast_to_fp8(
-        x: torch.Tensor,
-        block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
+    x: torch.Tensor, block_size: int = 128
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert x.dim() == 2
     m, n = x.shape
     pad_size = (block_size - (n % block_size)) % block_size
-    x = torch.nn.functional.pad(x,
-                                (0, pad_size), value=0) if pad_size > 0 else x
+    x = torch.nn.functional.pad(x, (0, pad_size), value=0) if pad_size > 0 else x
     x_view = x.view(m, -1, block_size)
     x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
     fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
@@ -295,9 +294,9 @@ def make_test_quant_config(
     n: int,
     k: int,
     in_dtype: torch.dtype,
-    quant_dtype: Union[torch.dtype, str, None] = None,
+    quant_dtype: torch.dtype | str | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor, FusedMoEQuantConfig]:
     (_, w1, w1_s, w1_gs), (_, w2, w2_s, w2_gs) = make_test_weights(
         e,
@@ -310,30 +309,34 @@ def make_test_quant_config(
     )
 
     # Hacky/trivial scales for nvfp4.
-    a1_gscale: Optional[torch.Tensor] = None
-    a2_gscale: Optional[torch.Tensor] = None
+    a1_gscale: torch.Tensor | None = None
+    a2_gscale: torch.Tensor | None = None
     if quant_dtype == "nvfp4":
-        a1_gscale = torch.ones((e, ), device="cuda", dtype=torch.float32)
-        a2_gscale = torch.ones((e, ), device="cuda", dtype=torch.float32)
+        a1_gscale = torch.ones((e,), device="cuda", dtype=torch.float32)
+        a2_gscale = torch.ones((e,), device="cuda", dtype=torch.float32)
         a1_scale = a1_gscale
         a2_scale = a2_gscale
     else:
         a1_scale = None
         a2_scale = None
 
-    return w1, w2, FusedMoEQuantConfig.make(
-        quant_dtype,
-        per_act_token_quant=per_act_token_quant,
-        block_shape=block_shape,
-        w1_scale=w1_s,
-        w2_scale=w2_s,
-        a1_gscale=a1_gscale,
-        a2_gscale=a2_gscale,
-        a1_scale=a1_scale,
-        a2_scale=a2_scale,
-        # TODO: make sure this is handled properly
-        g1_alphas=(1 / w1_gs) if w1_gs is not None else None,
-        g2_alphas=(1 / w2_gs) if w2_gs is not None else None,
+    return (
+        w1,
+        w2,
+        FusedMoEQuantConfig.make(
+            quant_dtype,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            a1_gscale=a1_gscale,
+            a2_gscale=a2_gscale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            # TODO: make sure this is handled properly
+            g1_alphas=(1 / w1_gs) if w1_gs is not None else None,
+            g2_alphas=(1 / w2_gs) if w2_gs is not None else None,
+        ),
     )
 
 
@@ -344,25 +347,27 @@ def fused_moe(
     score: torch.Tensor,
     topk: int,
     renormalize: bool = False,
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    topk_weights, topk_ids, _ = fused_topk(hidden_states, score.float(), topk,
-                                           renormalize)
-    return fused_experts(hidden_states,
-                         w1,
-                         w2,
-                         topk_weights,
-                         topk_ids,
-                         global_num_experts=global_num_experts,
-                         expert_map=expert_map,
-                         quant_config=quant_config)
+    topk_weights, topk_ids, _ = fused_topk(
+        hidden_states, score.float(), topk, renormalize
+    )
+    return fused_experts(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        quant_config=quant_config,
+    )
 
 
 # CustomOp?
 class BaselineMM(torch.nn.Module):
-
     def __init__(
         self,
         b: torch.Tensor,
@@ -372,15 +377,11 @@ class BaselineMM(torch.nn.Module):
         self.b = b.to(dtype=torch.float32)
         self.out_dtype = out_dtype
 
-    def forward(
-            self,
-            a: torch.Tensor) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        return torch.mm(a.to(dtype=torch.float32),
-                        self.b).to(self.out_dtype), None
+    def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor | None]:
+        return torch.mm(a.to(dtype=torch.float32), self.b).to(self.out_dtype), None
 
 
 class TestMLP(torch.nn.Module):
-
     def __init__(
         self,
         w1: torch.Tensor,
@@ -410,7 +411,6 @@ def make_naive_shared_experts(
 
 
 class RealMLP(torch.nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -421,41 +421,52 @@ class RealMLP(torch.nn.Module):
         quant_config=None,
         reduce_results: bool = True,
         prefix: str = "",
-        w1_s: Optional[torch.Tensor] = None,
-        w2_s: Optional[torch.Tensor] = None,
+        w1_s: torch.Tensor | None = None,
+        w2_s: torch.Tensor | None = None,
     ) -> None:
         from vllm.model_executor.layers.linear import (
-            MergedColumnParallelLinear, RowParallelLinear)
+            MergedColumnParallelLinear,
+            RowParallelLinear,
+        )
 
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
+            prefix=f"{prefix}.gate_up_proj",
+        )
         self.gate_up_proj.register_parameter(
-            "weight", torch.nn.Parameter(w1, requires_grad=False))
+            "weight", torch.nn.Parameter(w1, requires_grad=False)
+        )
         self.gate_up_proj.register_parameter(
-            "weight_scale", torch.nn.Parameter(w1_s, requires_grad=False))
+            "weight_scale", torch.nn.Parameter(w1_s, requires_grad=False)
+        )
         self.gate_up_proj.register_parameter(
-            "input_scale",
-            None)  #torch.nn.Parameter(None, requires_grad=False))
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+            "input_scale", None
+        )  # torch.nn.Parameter(None, requires_grad=False))
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
         self.down_proj.register_parameter(
-            "weight", torch.nn.Parameter(w2, requires_grad=False))
+            "weight", torch.nn.Parameter(w2, requires_grad=False)
+        )
         self.down_proj.register_parameter(
-            "weight_scale", torch.nn.Parameter(w2_s, requires_grad=False))
+            "weight_scale", torch.nn.Parameter(w2_s, requires_grad=False)
+        )
         self.down_proj.register_parameter(
-            "input_scale",
-            None)  #torch.nn.Parameter(None, requires_grad=False))
+            "input_scale", None
+        )  # torch.nn.Parameter(None, requires_grad=False))
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -469,7 +480,7 @@ def make_shared_experts(
     N: int,
     K: int,
     in_dtype: torch.dtype = torch.bfloat16,
-    quant_dtype: Union[torch.dtype, str, None] = None,
+    quant_dtype: torch.dtype | str | None = None,
 ) -> torch.nn.Module:
     from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 
@@ -496,13 +507,6 @@ def make_shared_experts(
             w2_s = None
             quant_config = None
 
-        return RealMLP(K,
-                       N,
-                       w1,
-                       w2,
-                       "silu",
-                       quant_config,
-                       w1_s=w1_s,
-                       w2_s=w2_s)
+        return RealMLP(K, N, w1, w2, "silu", quant_config, w1_s=w1_s, w2_s=w2_s)
     finally:
         torch.set_default_dtype(old_dtype)
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 01a1ad2e7a0a5f60a74355999977308c671c61c5..830d43569e98beff5fc7657afcbeb876b6da0a2a 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    group_broadcast)
+from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.
@@ -16,26 +14,32 @@ ROCM_FP8FNUZ_MAX = 224.0
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
-def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
-    return torch.as_tensor(x, dtype=torch.float32, device='cuda')
+def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
+    return torch.as_tensor(x, dtype=torch.float32, device="cuda")
 
-def ref_dynamic_per_token_quant(x: torch.tensor,
-                                quant_dtype: torch.dtype,
-                                scale_ub: Optional[torch.tensor] = None) \
-        -> tuple[torch.tensor, torch.tensor]:
 
+def ref_dynamic_per_token_quant(
+    x: torch.Tensor, quant_dtype: torch.dtype, scale_ub: torch.Tensor | None = None
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert quant_dtype in [torch.int8, FP8_DTYPE]
     if scale_ub is not None:
         assert quant_dtype == FP8_DTYPE
 
-    qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
-            else torch.finfo(quant_dtype)
-    qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
-                                            and current_platform.is_fp8_fnuz() \
-                                        else qtype_traits.max
-    qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
-                                            and current_platform.is_fp8_fnuz() \
-                                        else qtype_traits.min
+    qtype_traits = (
+        torch.iinfo(quant_dtype)
+        if quant_dtype == torch.int8
+        else torch.finfo(quant_dtype)
+    )
+    qtype_traits_max = (
+        ROCM_FP8FNUZ_MAX
+        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
+        else qtype_traits.max
+    )
+    qtype_traits_min = (
+        -ROCM_FP8FNUZ_MAX
+        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
+        else qtype_traits.min
+    )
     qtype_max = as_float32_tensor(qtype_traits_max)
     s_1 = as_float32_tensor(1.0)
     s_512 = as_float32_tensor(512.0)
@@ -56,15 +60,13 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
         iscales = as_float32_tensor(s_1 / scales)
         torch_out = as_float32_tensor(x) * iscales
         torch_out = torch_out.round()
-        torch_out = torch_out.clamp(qtype_traits_min,
-                                    qtype_traits_max).to(quant_dtype)
+        torch_out = torch_out.clamp(qtype_traits_min, qtype_traits_max).to(quant_dtype)
     else:
         assert quant_dtype == FP8_DTYPE
         min_scaling_factor = s_1 / (qtype_max * s_512)
         scales = scales.clamp(min=min_scaling_factor)
         torch_out = as_float32_tensor(x) / scales
-        torch_out = torch_out.clamp(qtype_traits_min,
-                                    qtype_traits_max).to(quant_dtype)
+        torch_out = torch_out.clamp(qtype_traits_min, qtype_traits_max).to(quant_dtype)
 
     return torch_out, scales
 
@@ -72,16 +74,20 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
 # The int8 version is very similar. Incorporate the int8 version, like in
 # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
 # kernel
-def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
-                    -> tuple[torch.tensor, torch.tensor]:
-
+def ref_dynamic_per_tensor_fp8_quant(
+    x: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
     fp8_traits = torch.finfo(FP8_DTYPE)
-    fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
-                                            and current_platform.is_fp8_fnuz() \
-                                    else fp8_traits.max
-    fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
-                                            and current_platform.is_fp8_fnuz() \
-                                    else fp8_traits.min
+    fp8_traits_max = (
+        ROCM_FP8FNUZ_MAX
+        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
+        else fp8_traits.max
+    )
+    fp8_traits_min = (
+        -ROCM_FP8FNUZ_MAX
+        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
+        else fp8_traits.min
+    )
     fp8_max = as_float32_tensor(fp8_traits_max)
     one = as_float32_tensor(1.0)
 
@@ -92,9 +98,12 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
     x_max = as_float32_tensor(x.abs().max())
     ref_scale = x_max / fp8_max
     ref_iscale = one / ref_scale
-    ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
-        fp8_traits_min, fp8_traits_max).to(FP8_DTYPE)
-    return ref_out, ref_scale.view((1, ))
+    ref_out = (
+        (as_float32_tensor(x) * ref_iscale)
+        .clamp(fp8_traits_min, fp8_traits_max)
+        .to(FP8_DTYPE)
+    )
+    return ref_out, ref_scale.view((1, 1))
 
 
 def native_w8a8_block_matmul(
@@ -126,7 +135,7 @@ def native_w8a8_block_matmul(
 
     M = A.numel() // A.shape[-1]
     N, K = B.shape
-    origin_C_shape = A.shape[:-1] + (N, )
+    origin_C_shape = A.shape[:-1] + (N,)
     A = A.reshape(M, A.shape[-1])
     As = As.reshape(M, As.shape[-1])
     n_tiles = (N + block_n - 1) // block_n
@@ -137,19 +146,19 @@ def native_w8a8_block_matmul(
     C_shape = (M, N)
     C = torch.zeros(C_shape, dtype=compute_type, device=A.device)
 
-    A_tiles = [
-        A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
-    ]
-    B_tiles = [[
-        B[
-            j * block_n:min((j + 1) * block_n, N),
-            i * block_k:min((i + 1) * block_k, K),
-        ] for i in range(k_tiles)
-    ] for j in range(n_tiles)]
-    C_tiles = [
-        C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
     ]
-    As_tiles = [As[:, i:i + 1] for i in range(k_tiles)]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
 
     for i in range(k_tiles):
         for j in range(n_tiles):
@@ -163,14 +172,14 @@ def native_w8a8_block_matmul(
     return C
 
 
-def native_per_token_group_quant_fp8(x,
-                                     group_size,
-                                     eps=1e-10,
-                                     dtype=torch.float8_e4m3fn):
+def native_per_token_group_quant_fp8(
+    x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn
+):
     """Function to perform per-token-group quantization on an input tensor
     `x` using native torch."""
-    assert x.shape[-1] % group_size == 0, ("the last dimension of `x` must "
-                                           "be divisible by `group_size`")
+    assert x.shape[-1] % group_size == 0, (
+        "the last dimension of `x` must be divisible by `group_size`"
+    )
     assert x.is_contiguous(), "`x` is not contiguous"
 
     finfo = torch.finfo(dtype)
@@ -178,28 +187,25 @@ def native_per_token_group_quant_fp8(x,
     fp8_max = finfo.max
 
     x_ = x.reshape(x.numel() // group_size, group_size)
-    amax = x_.abs().max(dim=-1,
-                        keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
     x_s = amax / fp8_max
     x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
     x_q = x_q.reshape(x.shape)
-    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
 
     return x_q, x_s
 
 
-def native_per_token_group_quant_int8(x,
-                                      group_size,
-                                      eps=1e-10,
-                                      dtype=torch.int8):
+def native_per_token_group_quant_int8(x, group_size, eps=1e-10, dtype=torch.int8):
     """Function to perform per-token-group quantization on an input tensor
     `x` using native torch.
 
     It converts the tensor values into int8 values and returns the
     quantized tensor along with the scaling factor used for quantization.
     """
-    assert (x.shape[-1] % group_size == 0
-            ), "the last dimension of `x` must be divisible by `group_size`"
+    assert x.shape[-1] % group_size == 0, (
+        "the last dimension of `x` must be divisible by `group_size`"
+    )
     assert x.is_contiguous(), "`x` is not contiguous"
 
     iinfo = torch.iinfo(dtype)
@@ -208,13 +214,13 @@ def native_per_token_group_quant_int8(x,
 
     x_ = x.reshape(x.numel() // group_size, group_size)
     # Use float32 for scale calculation for stability
-    amax = x_.abs().max(dim=-1,
-                        keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
     x_s = amax / int8_max
-    x_q = (x_.to(torch.float32) / x_s).round().clamp(
-        min=int8_min, max=int8_max).to(dtype)  # Round before clamping
+    x_q = (
+        (x_.to(torch.float32) / x_s).round().clamp(min=int8_min, max=int8_max).to(dtype)
+    )  # Round before clamping
     x_q = x_q.reshape(x.shape)
-    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
 
     return x_q, x_s
 
@@ -229,9 +235,9 @@ def per_block_cast_to_int8(
     block_m, block_n = block_shape
     assert x.dim() == 2
     m, n = x.shape
-    x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)),
-                           dtype=x.dtype,
-                           device=x.device)
+    x_padded = torch.zeros(
+        (round_up(m, block_m), round_up(n, block_n)), dtype=x.dtype, device=x.device
+    )
     x_padded[:m, :n] = x
     x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
@@ -243,10 +249,10 @@ def per_block_cast_to_int8(
 
 def dequant(
     t: torch.Tensor,
-    scale: Optional[torch.Tensor],
-    block_shape: Optional[list[int]],
+    scale: torch.Tensor | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    out_dtype: Optional[torch.dtype] = torch.float32,
+    out_dtype: torch.dtype | None = torch.float32,
 ) -> torch.Tensor:
     if scale is not None:
         f32 = torch.float32
@@ -260,17 +266,18 @@ def dequant(
 
 def batched_dequant(
     t: torch.Tensor,
-    scale: Optional[torch.Tensor],
-    block_shape: Optional[list[int]],
+    scale: torch.Tensor | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    out_dtype: Optional[torch.dtype] = torch.float32,
+    out_dtype: torch.dtype | None = torch.float32,
 ) -> torch.Tensor:
     if scale is not None:
         assert t.shape[0] == scale.shape[0]
         out = torch.empty_like(t, dtype=out_dtype)
         for e in range(t.shape[0]):
-            out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
-                             out_dtype)
+            out[e] = dequant(
+                t[e], scale[e], block_shape, per_act_token_quant, out_dtype
+            )
         return out
 
     return t.to(out_dtype)
@@ -281,9 +288,9 @@ def native_batched_masked_quant_matmul(
     B: torch.Tensor,
     C: torch.Tensor,
     num_expert_tokens: torch.Tensor,
-    A_scale: Optional[torch.Tensor] = None,
-    B_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
+    A_scale: torch.Tensor | None = None,
+    B_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
     per_act_token_quant: bool = False,
 ) -> torch.Tensor:
     num_expert_tokens_cpu = num_expert_tokens.clone()
@@ -294,15 +301,17 @@ def native_batched_masked_quant_matmul(
         num_tokens = num_expert_tokens_cpu[e]
         if A.dtype.itemsize == 1 and block_shape is not None:
             assert A_scale is not None and B_scale is not None
-            tmp = native_w8a8_block_matmul(A[e], B[e], A_scale[e], B_scale[e],
-                                           block_shape, C.dtype)
+            tmp = native_w8a8_block_matmul(
+                A[e], B[e], A_scale[e], B_scale[e], block_shape, C.dtype
+            )
             C[e, :num_tokens, :] = tmp[:num_tokens, :]
         elif A.dtype.itemsize == 1 and block_shape is None:
             assert A_scale is not None and B_scale is not None
             A_dq = dequant(A[e], A_scale[e], block_shape, per_act_token_quant)
             B_dq = dequant(B[e], B_scale[e], block_shape, per_act_token_quant)
-            C[e, :num_tokens, :] = (
-                A_dq[:num_tokens] @ B_dq.transpose(0, 1)).to(C.dtype)
+            C[e, :num_tokens, :] = (A_dq[:num_tokens] @ B_dq.transpose(0, 1)).to(
+                C.dtype
+            )
         else:
             assert A_scale is None
             assert B_scale is None
diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py
index fc4e125550180e22561c139b057f282fe7c0864c..5e6d54c42e89b391be9b2cfba447b769e23048ab 100644
--- a/tests/kernels/quantization/nvfp4_utils.py
+++ b/tests/kernels/quantization/nvfp4_utils.py
@@ -8,8 +8,9 @@ from vllm.scalar_type import scalar_types
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 
-kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
-                            dtype=torch.float32)
+kE2M1ToFloat = torch.tensor(
+    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
+)
 
 
 def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
@@ -22,12 +23,9 @@ def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
     return out[0:m, 0:k]
 
 
-def dequantize_nvfp4_to_dtype(tensor_fp4,
-                              tensor_sf,
-                              global_scale,
-                              dtype,
-                              device,
-                              block_size=16):
+def dequantize_nvfp4_to_dtype(
+    tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16
+):
     """Dequantize the fp4 tensor back to high precision."""
     # Two fp4 values are packed into one uint8.
     assert tensor_fp4.dtype == torch.uint8
@@ -68,8 +66,11 @@ def break_fp4_bytes(a, dtype):
     return values.reshape(m, n * 2).to(dtype=dtype)
 
 
+def get_nvfp4_global_scale(a: torch.Tensor):
+    return (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.abs(a).max().to(torch.float32)
+
+
 def quant_nvfp4_tensor(a: torch.Tensor):
-    a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.abs(a).max().to(torch.float32))
+    a_global_scale = get_nvfp4_global_scale(a)
     a_quant, a_block_scale = scaled_fp4_quant(a, a_global_scale)
     return a_quant, a_block_scale, a_global_scale
diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py
index 3de9cb364468453fce03a30f648569df19519238..e5f056f04f8c056876f1f7f9d1e150907051f03f 100644
--- a/tests/kernels/quantization/test_allspark_gemm.py
+++ b/tests/kernels/quantization/test_allspark_gemm.py
@@ -6,24 +6,25 @@ import torch
 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.allspark_utils import (
-    ALLSPARK_AMPERE_K_ALIGN, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
-    ALLSPARK_AMPERE_N_ALIGN)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    quantize_weights)
+    ALLSPARK_AMPERE_K_ALIGN,
+    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+    ALLSPARK_AMPERE_N_ALIGN,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_weights
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
 
-def is_gptq_allspark_supported(min_capability: int,
-                               max_capability: int) -> bool:
+def is_gptq_allspark_supported(min_capability: int, max_capability: int) -> bool:
     if not current_platform.is_cuda():
         return False
 
     capability = current_platform.get_device_capability()
     assert capability is not None
 
-    return capability.to_int() >= min_capability \
-        and capability.to_int() <= max_capability
+    return (
+        capability.to_int() >= min_capability and capability.to_int() <= max_capability
+    )
 
 
 MNK_FACTORS = [
@@ -43,7 +44,8 @@ HAS_ZP_OPTS = [False, True]
 
 def compute_max_diff(output, output_ref):
     return torch.mean(torch.abs(output - output_ref)) / torch.mean(
-        torch.abs(output_ref))
+        torch.abs(output_ref)
+    )
 
 
 def rand_data(shape, dtype=torch.float16):
@@ -52,7 +54,8 @@ def rand_data(shape, dtype=torch.float16):
 
 @pytest.mark.skipif(
     not is_gptq_allspark_supported(80, 89),
-    reason="AllSpark Ampere kernel is not supported on this GPU type.")
+    reason="AllSpark Ampere kernel is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
 @pytest.mark.parametrize("group_size", [-1])
 @pytest.mark.parametrize("has_zp", HAS_ZP_OPTS)
@@ -67,8 +70,9 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
     weight = rand_data((k, n), dtype=dtype)
 
     # Quantize (and apply act_order if provided)
-    w_ref, qw, s, zp = quantize_weights(weight, scalar_types.uint8b128,
-                                        group_size, has_zp)
+    w_ref, qw, s, zp = quantize_weights(
+        weight, scalar_types.uint8b128, group_size, has_zp
+    )
 
     qw = qw.to(torch.uint8)
     if has_zp:
@@ -79,20 +83,42 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
 
     n_32align = (n + 32 - 1) // 32 * 32
 
-    qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
-        qw, s, zp, has_zp)
-    opcheck(torch.ops._C.rearrange_kn_weight_as_n32k16_order,
-            (qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k, n,
-             n_32align))
-
-    opcheck(torch.ops._C.allspark_w8a16_gemm,
-            (input, qw_reorder, s_reorder, zp_reorder, n, group_size, sm_count,
-             sm_version, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, has_zp, True),
-            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
-    output = ops.allspark_w8a16_gemm(input, qw_reorder, s_reorder, zp_reorder,
-                                     n, group_size, sm_count, sm_version,
-                                     ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
-                                     has_zp, True)
+    qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(qw, s, zp, has_zp)
+    opcheck(
+        torch.ops._C.rearrange_kn_weight_as_n32k16_order,
+        (qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k, n, n_32align),
+    )
+
+    opcheck(
+        torch.ops._C.allspark_w8a16_gemm,
+        (
+            input,
+            qw_reorder,
+            s_reorder,
+            zp_reorder,
+            n,
+            group_size,
+            sm_count,
+            sm_version,
+            ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+            has_zp,
+            True,
+        ),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
+    output = ops.allspark_w8a16_gemm(
+        input,
+        qw_reorder,
+        s_reorder,
+        zp_reorder,
+        n,
+        group_size,
+        sm_count,
+        sm_version,
+        ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+        has_zp,
+        True,
+    )
 
     output_ref = torch.matmul(input, w_ref)
     torch.cuda.synchronize()
diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py
index bc0868123d82ac01555fc999d5eba997ab778291..efb62ca3799a9a0c414dbaa4b742383ba80f62b4 100644
--- a/tests/kernels/quantization/test_awq.py
+++ b/tests/kernels/quantization/test_awq.py
@@ -8,40 +8,42 @@ from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops  # noqa: F401
 
 
-@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
-                    reason="AWQ is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not hasattr(torch.ops._C, "awq_dequantize"),
+    reason="AWQ is not supported on this GPU type.",
+)
 def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_TRITON_AWQ", "0")
-        qweight = torch.randint(-2000000000,
-                                2000000000, (8192, 256),
-                                device='cuda',
-                                dtype=torch.int32)
-        scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
-        zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
+        qweight = torch.randint(
+            -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
+        )
+        scales = torch.rand((64, 2048), device="cuda", dtype=torch.float16)
+        zeros = torch.empty((64, 256), device="cuda", dtype=torch.int32)
         split_k_iters = 0
         thx = 0
         thy = 0
-        opcheck(torch.ops._C.awq_dequantize,
-                (qweight, scales, zeros, split_k_iters, thx, thy))
+        opcheck(
+            torch.ops._C.awq_dequantize,
+            (qweight, scales, zeros, split_k_iters, thx, thy),
+        )
 
 
 @pytest.mark.skip(reason="Not working; needs investigation.")
-@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
-                    reason="AWQ is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not hasattr(torch.ops._C, "awq_gemm"),
+    reason="AWQ is not supported on this GPU type.",
+)
 def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_TRITON_AWQ", "0")
-        input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
-        qweight = torch.randint(-2000000000,
-                                2000000000, (8192, 256),
-                                device='cuda',
-                                dtype=torch.int32)
-        scales = torch.randint(-2000000000,
-                               2000000000, (64, 256),
-                               device='cuda',
-                               dtype=torch.int32)
-        qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
+        input = torch.rand((2, 8192), device="cuda", dtype=torch.float16)
+        qweight = torch.randint(
+            -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
+        )
+        scales = torch.randint(
+            -2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
+        )
+        qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
         split_k_iters = 8
-        opcheck(torch.ops._C.awq_gemm,
-                (input, qweight, qzeros, scales, split_k_iters))
+        opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters))
diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py
index 9354495642b2814519bb0136a01ae16044b3aa9e..069bd743553483fbf754aae87436ce0656be9165 100644
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -4,11 +4,15 @@
 
 Run `pytest tests/kernels/quantization/test_awq_triton.py`.
 """
+
 import pytest
 import torch
 
 from vllm.model_executor.layers.quantization.awq_triton import (
-    AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
+    AWQ_TRITON_SUPPORTED_GROUP_SIZES,
+    awq_dequantize_triton,
+    awq_gemm_triton,
+)
 from vllm.platforms import current_platform
 
 device = "cuda"
@@ -33,23 +37,24 @@ def reverse_awq_order(t: torch.Tensor):
 # qweights - [R     , C // 8], int32
 # scales   - [R // G, C     ], float16
 # zeros    - [R // G, C // 8], int32
-def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor,
-                         qzeros: torch.Tensor,
-                         group_size: int) -> torch.Tensor:
-
+def awq_dequantize_torch(
+    qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, group_size: int
+) -> torch.Tensor:
     if group_size == -1:
         group_size = qweight.shape[0]
 
     bits = 4
     shifts = torch.arange(0, 32, bits, device=qzeros.device)
 
-    iweights = torch.bitwise_right_shift(qweight[:, :, None],
-                                         shifts[None, None, :]).to(torch.int8)
+    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
+        torch.int8
+    )
 
     iweights = iweights.view(iweights.shape[0], -1)
 
-    zeros = torch.bitwise_right_shift(qzeros[:, :, None],
-                                      shifts[None, None, :]).to(torch.int8)
+    zeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to(
+        torch.int8
+    )
     zeros = zeros.view(qzeros.shape[0], -1)
     zeros = reverse_awq_order(zeros)
 
@@ -70,7 +75,6 @@ def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor,
 @pytest.mark.parametrize("qweight_cols", [448, 576, 4736, 16, 32, 64, 128])
 @pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
 def test_dequantize(qweight_rows, qweight_cols, group_size):
-
     if group_size == -1:
         group_size = qweight_rows
 
@@ -84,25 +88,27 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
 
     current_platform.seed_everything(0)
 
-    qweight = torch.randint(0,
-                            torch.iinfo(torch.int32).max,
-                            (qweight_rows, qweight_cols),
-                            dtype=qweight_dtype,
-                            device=device)
-    scales = torch.rand(scales_rows,
-                        scales_cols,
-                        dtype=scales_dtype,
-                        device=device)
-    zeros = torch.randint(0,
-                          torch.iinfo(torch.int32).max,
-                          (zeros_rows, zeros_cols),
-                          dtype=zeros_dtype,
-                          device=device)
+    qweight = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (qweight_rows, qweight_cols),
+        dtype=qweight_dtype,
+        device=device,
+    )
+    scales = torch.rand(scales_rows, scales_cols, dtype=scales_dtype, device=device)
+    zeros = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (zeros_rows, zeros_cols),
+        dtype=zeros_dtype,
+        device=device,
+    )
 
     iweights_triton = awq_dequantize_triton(qweight, scales, zeros)
 
-    assert (not torch.any(torch.isinf(iweights_triton))
-            and not torch.any(torch.isnan(iweights_triton)))
+    assert not torch.any(torch.isinf(iweights_triton)) and not torch.any(
+        torch.isnan(iweights_triton)
+    )
 
     iweights_torch = awq_dequantize_torch(qweight, scales, zeros, group_size)
 
@@ -119,7 +125,6 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
 @pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("splitK", [1, 8])
 def test_gemm(N, K, M, splitK, group_size):
-
     if group_size == -1:
         group_size = K
 
@@ -138,35 +143,29 @@ def test_gemm(N, K, M, splitK, group_size):
 
     current_platform.seed_everything(0)
 
-    input = torch.rand((input_rows, input_cols),
-                       dtype=input_dtype,
-                       device=device)
-    qweight = torch.randint(0,
-                            torch.iinfo(torch.int32).max,
-                            (qweight_rows, qweight_cols),
-                            device=device)
-    qzeros = torch.randint(0,
-                           torch.iinfo(torch.int32).max,
-                           (qzeros_rows, qzeros_cols),
-                           device=device)
-    scales = torch.rand((scales_rows, scales_cols),
-                        dtype=scales_dtype,
-                        device=device)
-
-    output_triton = awq_gemm_triton(input, qweight, scales, qzeros,
-                                    split_k_iters)
-
-    assert (not torch.any(torch.isinf(output_triton))
-            and not torch.any(torch.isnan(output_triton)))
+    input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
+    qweight = torch.randint(
+        0, torch.iinfo(torch.int32).max, (qweight_rows, qweight_cols), device=device
+    )
+    qzeros = torch.randint(
+        0, torch.iinfo(torch.int32).max, (qzeros_rows, qzeros_cols), device=device
+    )
+    scales = torch.rand((scales_rows, scales_cols), dtype=scales_dtype, device=device)
+
+    output_triton = awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters)
+
+    assert not torch.any(torch.isinf(output_triton)) and not torch.any(
+        torch.isnan(output_triton)
+    )
 
     dequantized_weights = awq_dequantize_triton(qweight, scales, qzeros)
 
     output_torch = torch.matmul(input, dequantized_weights)
 
-    assert (not torch.any(torch.isinf(output_torch))
-            and not torch.any(torch.isnan(output_torch)))
+    assert not torch.any(torch.isinf(output_torch)) and not torch.any(
+        torch.isnan(output_torch)
+    )
 
-    torch.testing.assert_close(output_triton.cpu(),
-                               output_torch.cpu(),
-                               atol=1e-1,
-                               rtol=1e-1)
+    torch.testing.assert_close(
+        output_triton.cpu(), output_torch.cpu(), atol=1e-1, rtol=1e-1
+    )
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index 211d1ecfe6e4d0bcc87205f4ea1992e11279b527..e9973c1fcc15e12aa0782a51e16fa81fcf3bd695 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -7,24 +7,28 @@ import itertools
 import pytest
 import torch
 
-from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
-                                       native_w8a8_block_matmul)
+from tests.kernels.quant_utils import (
+    native_per_token_group_quant_fp8,
+    native_w8a8_block_matmul,
+)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
+    cutlass_scaled_mm,
+    per_token_group_quant_fp8,
+    w8a8_triton_block_scaled_mm,
+)
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import (fp8_gemm_nt,
-                                  get_col_major_tma_aligned_tensor,
-                                  per_block_cast_to_fp8)
+from vllm.utils.deep_gemm import (
+    fp8_gemm_nt,
+    get_col_major_tma_aligned_tensor,
+    per_block_cast_to_fp8,
+)
+from vllm.utils.import_utils import has_deep_gemm
 
 if current_platform.get_device_capability() < (9, 0):
-    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
-                allow_module_level=True)
+    pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
@@ -51,7 +55,8 @@ def setup_cuda():
 
 @pytest.mark.parametrize(
     "num_tokens,d,dtype,group_size,seed",
-    itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
+    itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
+)
 @torch.inference_mode()
 def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
     torch.manual_seed(seed)
@@ -60,15 +65,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
     ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
     out, scale = per_token_group_quant_fp8(x, group_size)
 
-    assert torch.allclose(out.to(torch.float32),
-                          ref_out.to(torch.float32),
-                          rtol=0.15)
+    assert torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
     assert torch.allclose(scale, ref_scale)
 
 
 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
-    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
+)
 @torch.inference_mode()
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     torch.manual_seed(seed)
@@ -89,13 +93,12 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
     Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
 
-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
-    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+    out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
 
-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001
 
 
@@ -126,32 +129,32 @@ def test_w8a8_block_fp8_cutlass_matmul():
 
     Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
     # Hopper requires row-major format for scales
-    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(
-        90) else Bs
+    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(90) else Bs
 
-    A_fp8, As = per_token_group_quant_fp8(A_fp32,
-                                          block_size[1],
-                                          column_major_scales=False)
+    A_fp8, As = per_token_group_quant_fp8(
+        A_fp32, block_size[1], column_major_scales=False
+    )
     # CUTLASS uses column-major format for scales
     A_fp8_cutlass, As_cutlass = per_token_group_quant_fp8(
-        A_fp32, block_size[1], column_major_scales=True)
+        A_fp32, block_size[1], column_major_scales=True
+    )
 
-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
-    out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass,
-                            block_size, out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+    out = cutlass_scaled_mm(
+        A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass, block_size, out_dtype
+    )
 
-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001
 
 
 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
-    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
-@pytest.mark.skipif(not has_deep_gemm(),
-                    reason="DeepGemm kernels not available.")
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
+)
+@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     # only aligned sizes
@@ -171,20 +174,20 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     As = As_fp8.to(torch.float32)
     Bs = Bs_fp8.to(torch.float32)
 
-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
 
     # Transpose earlier so that the testing will not trigger transposing kernels
     As_fp8 = get_col_major_tma_aligned_tensor(As_fp8)
 
-    out = torch.zeros((M, N), device='cuda', dtype=out_dtype)
+    out = torch.zeros((M, N), device="cuda", dtype=out_dtype)
 
-    assert As_fp8.shape == (M, (K + 127) //
-                            128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
+    assert As_fp8.shape == (M, (K + 127) // 128), (
+        f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
+    )
 
     fp8_gemm_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
 
-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001
diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py
index fac82cf9c8b5eb81a463094a1b1e7ff01a20eb36..310091b6a554d0aad865b00414d5d82d28dd8dfa 100644
--- a/tests/kernels/quantization/test_block_int8.py
+++ b/tests/kernels/quantization/test_block_int8.py
@@ -10,16 +10,14 @@ import torch
 from tests.kernels.quant_utils import native_w8a8_block_matmul
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.int8_utils import (
-    w8a8_block_int8_matmul)
+    w8a8_block_int8_matmul,
+)
 from vllm.platforms import current_platform
 
 if current_platform.get_device_capability() < (7, 0):
-    pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
-                allow_module_level=True)
+    pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 DTYPES = [torch.half, torch.bfloat16]
 M = [1, 33, 64, 222]
@@ -36,8 +34,10 @@ def setup_cuda():
     torch.set_default_device("cuda")
 
 
-@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
-                         itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS))
+@pytest.mark.parametrize(
+    "M,N,K,block_size,out_dtype,seed",
+    itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS),
+)
 @torch.inference_mode()
 def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed):
     torch.manual_seed(seed)
@@ -58,11 +58,10 @@ def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed):
     As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
     Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
 
-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
     out = w8a8_block_int8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
 
-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001
diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
index ae61b3b3a28a817fd573e06fd859d15afa9db96a..cfdb3658028a64cac74dc4285205193f8abf0d57 100644
--- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
@@ -11,12 +11,11 @@ import torch
 from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    sparse_cutlass_supported)
+    sparse_cutlass_supported,
+)
 from vllm.platforms import current_platform
 
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 capability = current_platform.get_device_capability()
 capability = capability[0] * 10 + capability[1]
@@ -40,9 +39,7 @@ def prune_to_2_4(tensor):
 
     # Create binary mask
     mask = torch.zeros_like(reshaped)
-    mask.scatter_(dim=1,
-                  index=indices,
-                  src=torch.ones_like(indices, dtype=mask.dtype))
+    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))
 
     # Apply mask and reshape back
     pruned = reshaped * mask
@@ -55,32 +52,31 @@ def prune_to_2_4(tensor):
 
 # This function checks that applying an identity matrix multiplication
 # to the compressed weights yields the original uncompressed weights.
-def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor,
-                                         b_compressed: torch.Tensor,
-                                         b_metadata: torch.Tensor):
-
+def check_compress_decompress_invariance(
+    dtype: torch.dtype,
+    b: torch.Tensor,
+    b_compressed: torch.Tensor,
+    b_metadata: torch.Tensor,
+):
     # For float16 and bfloat16, cutlass_scaled_sparse_mm's output must be the
     # same dtype as its inputs. This line addresses that constraint while
     # arbitrarily using bfloat16 for the int8/fp8 cases.
     out_dtype = torch.float16 if dtype is torch.float16 else torch.bfloat16
 
-    eye = torch.eye(b.shape[0], device='cuda', dtype=dtype)
-    eye_scale = torch.ones(1, device='cuda', dtype=torch.float32)
-    b_decomp = ops.cutlass_scaled_sparse_mm(eye,
-                                            b_compressed,
-                                            b_metadata,
-                                            eye_scale,
-                                            eye_scale,
-                                            out_dtype=out_dtype)
+    eye = torch.eye(b.shape[0], device="cuda", dtype=dtype)
+    eye_scale = torch.ones(1, device="cuda", dtype=torch.float32)
+    b_decomp = ops.cutlass_scaled_sparse_mm(
+        eye, b_compressed, b_metadata, eye_scale, eye_scale, out_dtype=out_dtype
+    )
 
     torch.testing.assert_close(b.to(dtype=out_dtype), b_decomp)
 
 
 def make_rand_sparse_tensors(
-        dtype: torch.dtype, m: int, n: int, k: int
+    dtype: torch.dtype, m: int, n: int, k: int
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda')
-    b = torch.randn((n, k), device='cuda').t()
+    a = torch.randn((m, k), device="cuda")
+    b = torch.randn((n, k), device="cuda").t()
 
     if dtype == torch.int8:
         # ensure A and B aren't all zeros after rounding
@@ -107,32 +103,25 @@ def make_rand_sparse_tensors(
     return b_compressed, e, a, b
 
 
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
 # Test working with a subset of A and B for sparse matmul
 def test_cutlass_sparse_subset():
-
     big_m = 1024
     m, n, k = 512, 512, 512
 
     # Create tensors
-    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
-                                                     big_m, n, k)
+    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, big_m, n, k)
     a = whole_a[0:m, 0:k]
     scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
     scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
 
-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=torch.bfloat16)
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=torch.bfloat16)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=torch.bfloat16
+    )
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
 
     torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
@@ -161,105 +150,87 @@ MNK_FACTORS = [
 
 
 # Test working with a subset of A and B for sparse matmul
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("m, n, k", MNK_FACTORS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: type[torch.dtype],
-                             use_bias: bool):
-
+def test_cutlass_sparse_gemm(
+    m: int, k: int, n: int, dtype: type[torch.dtype], use_bias: bool
+):
     # Create tensors
     b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
     scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32)
     scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32)
 
-    bias = torch.rand((n, ), device="cuda", dtype=dtype) if use_bias else None
+    bias = torch.rand((n,), device="cuda", dtype=dtype) if use_bias else None
 
-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=dtype,
-                                       bias=bias)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=dtype, bias=bias
+    )
 
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=dtype,
-                                  bias=bias)
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=dtype, bias=bias)
 
     torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1)
 
 
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("m, k, n", MNK_FACTORS)
-@pytest.mark.skipif(not current_platform.has_device_capability(89),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(89),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("use_bias", [True, False])
 def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int, use_bias: bool):
-
     # Create tensors
     b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
-    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
-    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32)
     out_dtype = torch.bfloat16
 
-    bias = torch.rand(
-        (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None
+    bias = torch.rand((n,), device="cuda", dtype=out_dtype) * 10 if use_bias else None
 
-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=out_dtype,
-                                       bias=bias)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )
 
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=out_dtype,
-                                  bias=bias)
+    baseline = baseline_scaled_mm(
+        a, b, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )
 
     torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1)
 
 
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("m,k,n", MNK_FACTORS)
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                                  per_out_ch: bool, use_bias: bool):
-
+def test_cutlass_sparse_int8_gemm(
+    m: int, n: int, k: int, per_act_token: bool, per_out_ch: bool, use_bias: bool
+):
     # Create tensors
     b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
-    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
-    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32)
     out_dtype = torch.bfloat16
 
-    bias = torch.rand(
-        (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None
-
-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=out_dtype,
-                                       bias=bias)
-
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=out_dtype,
-                                  bias=bias)
+    bias = torch.rand((n,), device="cuda", dtype=out_dtype) * 10 if use_bias else None
+
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )
+
+    baseline = baseline_scaled_mm(
+        a, b, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )
 
     torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0)
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index 65320509e173f36fa9e9e371ba2d5369ed1a0804..de595b0a34e467c9a01efe2e40f84c784caaec79 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -4,6 +4,7 @@
 
 Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`.
 """
+
 import random
 
 import pytest
@@ -12,7 +13,7 @@ import torch
 from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 MNK_FACTORS = [
     (1, 256, 128),
@@ -36,9 +37,7 @@ MNK_FACTORS = [
     (512, 24576, 128),
 ]
 
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 # -1 means full extent in that dimension
 TENSORWISE_GROUP_SHAPE = (-1, -1)
@@ -60,18 +59,19 @@ def group_scale_helper(shape, group_shape):
 def scale_shape(shape, group_shape):
     assert len(shape) == len(group_shape)
     group_shape = group_scale_helper(shape, group_shape)
-    return tuple(
-        cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
-
-
-def cutlass_fp8_gemm_helper(m: int,
-                            n: int,
-                            k: int,
-                            a_scale_group_shape: tuple,
-                            b_scale_group_shape: tuple,
-                            use_bias: bool,
-                            out_dtype: type[torch.dtype] = torch.bfloat16,
-                            device: str = "cuda"):
+    return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
+def cutlass_fp8_gemm_helper(
+    m: int,
+    n: int,
+    k: int,
+    a_scale_group_shape: tuple,
+    b_scale_group_shape: tuple,
+    use_bias: bool,
+    out_dtype: type[torch.dtype] = torch.bfloat16,
+    device: str = "cuda",
+):
     # Test for a cutlass kernel with per-token activation quantization
     # and per-output channel weight quantization.
     a = to_fp8(torch.randn((m, k), device=device))
@@ -80,36 +80,34 @@ def cutlass_fp8_gemm_helper(m: int,
     a_scales_shape = scale_shape(a.shape, a_scale_group_shape)
     b_scales_shape = scale_shape(b.shape, b_scale_group_shape)
 
-    scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
-    scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
+    scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32)
+    scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32)
 
     # make scales M-major for blockwise quant, doesn't affect 1D scales
     scale_a = scale_a.t().contiguous().t()
     # make scales K-major for blockwise quant, doesn't affect 1D scales
     scale_b = scale_b.t().contiguous().t()
 
-    if use_bias:
-        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
-    else:
-        bias = None
+    bias = torch.rand((n,), device=device, dtype=out_dtype) * 10 if use_bias else None
 
     out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
     baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
     torch.testing.assert_close(out, baseline, rtol=5e-1, atol=1.5e-1)
 
-    opcheck(torch.ops._C.cutlass_scaled_mm,
-            (out, a, b, scale_a, scale_b, bias))
+    opcheck(torch.ops._C.cutlass_scaled_mm, (out, a, b, scale_a, scale_b, bias))
 
 
-def cutlass_int8_gemm_helper(m: int,
-                             n: int,
-                             k: int,
-                             a_scale_group_shape: tuple,
-                             b_scale_group_shape: tuple,
-                             use_bias: bool,
-                             out_dtype: type[torch.dtype] = torch.bfloat16,
-                             device: str = "cuda"):
+def cutlass_int8_gemm_helper(
+    m: int,
+    n: int,
+    k: int,
+    a_scale_group_shape: tuple,
+    b_scale_group_shape: tuple,
+    use_bias: bool,
+    out_dtype: type[torch.dtype] = torch.bfloat16,
+    device: str = "cuda",
+):
     # Test for a cutlass kernel with per-token activation quantization
     # and per-output channel weight quantization.
     a = to_int8(torch.randn((m, k), device=device) * 5)
@@ -118,158 +116,202 @@ def cutlass_int8_gemm_helper(m: int,
     a_scales_shape = scale_shape(a.shape, a_scale_group_shape)
     b_scales_shape = scale_shape(b.shape, b_scale_group_shape)
 
-    scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
-    scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
+    scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32)
+    scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32)
 
-    if use_bias:
-        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
-    else:
-        bias = None
+    bias = torch.rand((n,), device=device, dtype=out_dtype) * 10 if use_bias else None
 
     out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
     baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
     torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
-    opcheck(torch.ops._C.cutlass_scaled_mm,
-            (out, a, b, scale_a, scale_b, bias))
+    opcheck(torch.ops._C.cutlass_scaled_mm, (out, a, b, scale_a, scale_b, bias))
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("use_bias", [True, False])
-@pytest.mark.skipif(not current_platform.has_device_capability(89),
-                    reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm(m: int, n: int, k: int, a_scale_group_shape,
-                          b_scale_group_shape, use_bias: bool):
-    cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape,
-                            use_bias)
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(89),
+    reason="FP8 is not supported on this GPU type.",
+)
+def test_cutlass_fp8_gemm(
+    m: int, n: int, k: int, a_scale_group_shape, b_scale_group_shape, use_bias: bool
+):
+    cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, use_bias)
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("a_scale_group_shape,b_scale_group_shape",
-                         [((1, 128), (128, 128))])
+@pytest.mark.parametrize(
+    "a_scale_group_shape,b_scale_group_shape", [((1, 128), (128, 128))]
+)
 @pytest.mark.parametrize("use_bias", [False])
-@pytest.mark.skipif(not current_platform.has_device_capability(90),
-                    reason="FP8 blockwise is not supported on this GPU type.")
-def test_cutlass_fp8_blockwise_scale_gemm(m: int, n: int, k: int,
-                                          a_scale_group_shape,
-                                          b_scale_group_shape, use_bias: bool):
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="FP8 blockwise is not supported on this GPU type.",
+)
+def test_cutlass_fp8_blockwise_scale_gemm(
+    m: int, n: int, k: int, a_scale_group_shape, b_scale_group_shape, use_bias: bool
+):
     if k % b_scale_group_shape[0] != 0 or n % b_scale_group_shape[1] != 0:
         return
     if m % a_scale_group_shape[0] != 0 or k % a_scale_group_shape[1] != 0:
         return
     if m % 4 != 0 and current_platform.has_device_capability(100):
         return
-    cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape,
-                            use_bias)
+    cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, use_bias)
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_int8_gemm(m: int, n: int, k: int, a_scale_group_shape,
-                           b_scale_group_shape, use_bias: bool):
-    cutlass_int8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape,
-                             use_bias)
-
-
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+def test_cutlass_int8_gemm(
+    m: int, n: int, k: int, a_scale_group_shape, b_scale_group_shape, use_bias: bool
+):
+    cutlass_int8_gemm_helper(
+        m, n, k, a_scale_group_shape, b_scale_group_shape, use_bias
+    )
+
+
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape,
-                                        b_scale_group_shape,
-                                        out_dtype: type[torch.dtype],
-                                        use_bias: bool):
-    cutlass_int8_gemm_helper(512,
-                             512,
-                             512,
-                             a_scale_group_shape,
-                             b_scale_group_shape,
-                             use_bias,
-                             out_dtype=out_dtype)
-
-
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+def test_cutlass_int8_gemm_output_dtype(
+    a_scale_group_shape,
+    b_scale_group_shape,
+    out_dtype: type[torch.dtype],
+    use_bias: bool,
+):
+    cutlass_int8_gemm_helper(
+        512,
+        512,
+        512,
+        a_scale_group_shape,
+        b_scale_group_shape,
+        use_bias,
+        out_dtype=out_dtype,
+    )
+
+
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
-@pytest.mark.skipif(not current_platform.has_device_capability(89),
-                    reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape,
-                                       b_scale_group_shape,
-                                       out_dtype: type[torch.dtype],
-                                       use_bias: bool):
-    cutlass_fp8_gemm_helper(512,
-                            512,
-                            512,
-                            a_scale_group_shape,
-                            b_scale_group_shape,
-                            use_bias,
-                            out_dtype=out_dtype)
-
-
-@pytest.mark.parametrize("a_scale_group_shape,b_scale_group_shape",
-                         [((1, 128), (128, 128))])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(89),
+    reason="FP8 is not supported on this GPU type.",
+)
+def test_cutlass_fp8_gemm_output_dtype(
+    a_scale_group_shape,
+    b_scale_group_shape,
+    out_dtype: type[torch.dtype],
+    use_bias: bool,
+):
+    cutlass_fp8_gemm_helper(
+        512,
+        512,
+        512,
+        a_scale_group_shape,
+        b_scale_group_shape,
+        use_bias,
+        out_dtype=out_dtype,
+    )
+
+
+@pytest.mark.parametrize(
+    "a_scale_group_shape,b_scale_group_shape", [((1, 128), (128, 128))]
+)
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [False])
-@pytest.mark.skipif(not current_platform.has_device_capability(90),
-                    reason="FP8 blockwise is not supported on this GPU type.")
-def test_cutlass_fp8_blockwise_scale_gemm_dtype(a_scale_group_shape,
-                                                b_scale_group_shape,
-                                                out_dtype: type[torch.dtype],
-                                                use_bias: bool):
-    cutlass_fp8_gemm_helper(512,
-                            512,
-                            512,
-                            a_scale_group_shape,
-                            b_scale_group_shape,
-                            use_bias,
-                            out_dtype=out_dtype)
-
-
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="FP8 blockwise is not supported on this GPU type.",
+)
+def test_cutlass_fp8_blockwise_scale_gemm_dtype(
+    a_scale_group_shape,
+    b_scale_group_shape,
+    out_dtype: type[torch.dtype],
+    use_bias: bool,
+):
+    cutlass_fp8_gemm_helper(
+        512,
+        512,
+        512,
+        a_scale_group_shape,
+        b_scale_group_shape,
+        use_bias,
+        out_dtype=out_dtype,
+    )
+
+
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(not current_platform.has_device_capability(89),
-                    reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_devices(a_scale_group_shape, b_scale_group_shape,
-                                  use_bias: bool, device: str):
-    cutlass_fp8_gemm_helper(512, 512, 512, a_scale_group_shape,
-                            b_scale_group_shape, use_bias, torch.bfloat16,
-                            device)
-
-
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(89),
+    reason="FP8 is not supported on this GPU type.",
+)
+def test_cutlass_fp8_gemm_devices(
+    a_scale_group_shape, b_scale_group_shape, use_bias: bool, device: str
+):
+    cutlass_fp8_gemm_helper(
+        512,
+        512,
+        512,
+        a_scale_group_shape,
+        b_scale_group_shape,
+        use_bias,
+        torch.bfloat16,
+        device,
+    )
+
+
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_cutlass_int8_gemm_devices(a_scale_group_shape, b_scale_group_shape,
-                                   use_bias: bool, device: str):
-    cutlass_int8_gemm_helper(512,
-                             512,
-                             512,
-                             a_scale_group_shape,
-                             b_scale_group_shape,
-                             use_bias,
-                             out_dtype=torch.bfloat16,
-                             device=device)
+def test_cutlass_int8_gemm_devices(
+    a_scale_group_shape, b_scale_group_shape, use_bias: bool, device: str
+):
+    cutlass_int8_gemm_helper(
+        512,
+        512,
+        512,
+        a_scale_group_shape,
+        b_scale_group_shape,
+        use_bias,
+        out_dtype=torch.bfloat16,
+        device=device,
+    )
 
 
 # For the following two tests:
@@ -277,32 +319,42 @@ def test_cutlass_int8_gemm_devices(a_scale_group_shape, b_scale_group_shape,
 # of a large power of two. In any case, the kernel will have a naive fallback
 # when N and K are not divisible by 16. But M is the number of tokens and the
 # kernel must handle any M thrown at it.
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("use_bias", [True, False])
-@pytest.mark.skipif(not current_platform.has_device_capability(89),
-                    reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape,
-                                  use_bias: bool):
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(89),
+    reason="FP8 is not supported on this GPU type.",
+)
+def test_cutlass_fp8_gemm_m_sweep(
+    a_scale_group_shape, b_scale_group_shape, use_bias: bool
+):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_fp8_gemm_helper(m, nk, nk, a_scale_group_shape,
-                                    b_scale_group_shape, use_bias)
+            cutlass_fp8_gemm_helper(
+                m, nk, nk, a_scale_group_shape, b_scale_group_shape, use_bias
+            )
 
 
-@pytest.mark.parametrize("a_scale_group_shape",
-                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
-@pytest.mark.parametrize("b_scale_group_shape",
-                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize(
+    "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
+@pytest.mark.parametrize(
+    "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]
+)
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_int8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape,
-                                   use_bias: bool):
+def test_cutlass_int8_gemm_m_sweep(
+    a_scale_group_shape, b_scale_group_shape, use_bias: bool
+):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_int8_gemm_helper(m, nk, nk, a_scale_group_shape,
-                                     b_scale_group_shape, use_bias)
+            cutlass_int8_gemm_helper(
+                m, nk, nk, a_scale_group_shape, b_scale_group_shape, use_bias
+            )
 
 
 @pytest.mark.parametrize("m", [32, 64, 128])
@@ -310,8 +362,7 @@ def test_cutlass_int8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape,
 @pytest.mark.parametrize("k", [64, 128, 256])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.skip
-def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
-                                    out_dtype: torch.dtype):
+def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int, out_dtype: torch.dtype):
     # Currently, the test is failing because folding azp into
     # 16-bit bias loses too much precision
     scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
@@ -328,7 +379,7 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
 
     b_dq = scale_b * bq_f32
 
-    azp_a = torch.rand((1, ), device="cuda", dtype=torch.float32) * 10 + 1.5
+    azp_a = torch.rand((1,), device="cuda", dtype=torch.float32) * 10 + 1.5
     azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8)
     azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding
 
@@ -340,18 +391,17 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
     J = torch.ones((1, k), device="cuda", dtype=torch.float32)
     azp_bias = (azp_a * scale_b * (J @ bq_f32)).to(out_dtype)
     assert azp_bias.shape == (1, n)
-    assert azp_bias[0, :].shape == (n, )
-
-    baseline_q = (scale_a.to(device='cpu') * scale_b.to(device='cpu') * (
-        (aq_i32 + azp_aq_i8).to(device='cpu') @ bq_i32.to(device='cpu'))).to(
-            dtype=out_dtype, device='cuda')
-
-    out = ops.cutlass_scaled_mm(aq_i8,
-                                bq_i8,
-                                scale_a,
-                                scale_b,
-                                out_dtype=out_dtype,
-                                bias=azp_bias[0, :])
+    assert azp_bias[0, :].shape == (n,)
+
+    baseline_q = (
+        scale_a.to(device="cpu")
+        * scale_b.to(device="cpu")
+        * ((aq_i32 + azp_aq_i8).to(device="cpu") @ bq_i32.to(device="cpu"))
+    ).to(dtype=out_dtype, device="cuda")
+
+    out = ops.cutlass_scaled_mm(
+        aq_i8, bq_i8, scale_a, scale_b, out_dtype=out_dtype, bias=azp_bias[0, :]
+    )
     torch.testing.assert_close(out, baseline_dq, rtol=1e-2, atol=1e0)
     torch.testing.assert_close(out, baseline_q, rtol=1e-2, atol=1e0)
 
@@ -362,8 +412,9 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("azp_per_token", [True, False])
-def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
-                          use_bias: bool, azp_per_token: bool):
+def test_cutlass_int8_azp(
+    m: int, n: int, k: int, out_dtype: torch.dtype, use_bias: bool, azp_per_token: bool
+):
     m_azp = m if azp_per_token else 1
     scale_a = torch.randn((m_azp, 1), device="cuda", dtype=torch.float32) / 10
     scale_b = torch.randn((1, n), device="cuda", dtype=torch.float32) / 10
@@ -377,16 +428,12 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     bq_f32 = bq_i8.to(dtype=torch.float32)
     b_dq = scale_b * bq_f32
 
-    azp_a = torch.rand(
-        (m_azp, 1), device="cuda", dtype=torch.float32) * 10 + 1.5
+    azp_a = torch.rand((m_azp, 1), device="cuda", dtype=torch.float32) * 10 + 1.5
     azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8)
     azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding
 
     a_dq = scale_a * (aq_i32 - azp_aq_i8).to(dtype=torch.float32)
-    torch.testing.assert_close(a_dq,
-                               scale_a * aq_f32 - azp_a,
-                               rtol=1e-4,
-                               atol=1e-3)
+    torch.testing.assert_close(a_dq, scale_a * aq_f32 - azp_a, rtol=1e-4, atol=1e-3)
 
     if use_bias:
         bias = torch.rand((1, n), device="cuda", dtype=out_dtype) * 10 + 2.5
@@ -396,8 +443,8 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     baseline_dq = (torch.mm(a_dq, b_dq) + bias).to(out_dtype)
 
     # int32 mm not supported on CUDA
-    a_noazp_i32_cpu = (aq_i32 - azp_aq_i8).to(device='cpu')
-    cq = (a_noazp_i32_cpu @ bq_i32.to(device='cpu')).to(device='cuda')
+    a_noazp_i32_cpu = (aq_i32 - azp_aq_i8).to(device="cpu")
+    cq = (a_noazp_i32_cpu @ bq_i32.to(device="cpu")).to(device="cuda")
     baseline_q = (scale_a * scale_b * cq + bias).to(dtype=out_dtype)
 
     # Hadamard is just the sum of the cols
@@ -406,14 +453,14 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     func_bias = bias if use_bias else None
 
     if azp_per_token:
-        out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b,
-                                        out_dtype, azp_adj_i32, azp_i32,
-                                        func_bias)
+        out = ops.cutlass_scaled_mm_azp(
+            aq_i8, bq_i8, scale_a, scale_b, out_dtype, azp_adj_i32, azp_i32, func_bias
+        )
     else:
         azp_with_adj_i32 = azp_i32 * azp_adj_i32
-        out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b,
-                                        out_dtype, azp_with_adj_i32, None,
-                                        func_bias)
+        out = ops.cutlass_scaled_mm_azp(
+            aq_i8, bq_i8, scale_a, scale_b, out_dtype, azp_with_adj_i32, None, func_bias
+        )
 
     # bfloat16 precision is 7-bit mantissa -> 2^-8 ~ 0.4%
     # float16 precision is 10-bit mantissa -> 2^-11 ~ 0.05%
@@ -423,13 +470,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
 
     if azp_per_token:
-        opcheck(torch.ops._C.cutlass_scaled_mm_azp,
-                (out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
-                 func_bias))
+        opcheck(
+            torch.ops._C.cutlass_scaled_mm_azp,
+            (out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32, func_bias),
+        )
     else:
-        opcheck(torch.ops._C.cutlass_scaled_mm_azp,
-                (out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
-                 func_bias))
+        opcheck(
+            torch.ops._C.cutlass_scaled_mm_azp,
+            (out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None, func_bias),
+        )
 
 
 # Test working with a subset of A and B
@@ -445,23 +494,14 @@ def test_cutlass_subset():
     scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
     scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
 
-    out = ops.cutlass_scaled_mm(a,
-                                b,
-                                scale_a,
-                                scale_b,
-                                out_dtype=torch.bfloat16)
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=torch.bfloat16)
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
 
     torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
 
 # Test to make sure cuda graphs work
 class CutlassLayer(torch.nn.Module):
-
     def __init__(self, b, scale_a, scale_b, out_dtype):
         super().__init__()
         self.b = b
@@ -470,8 +510,9 @@ class CutlassLayer(torch.nn.Module):
         self.out_dtype = out_dtype
 
     def forward(self, a):
-        return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
-                                     self.out_dtype)
+        return ops.cutlass_scaled_mm(
+            a, self.b, self.scale_a, self.scale_b, self.out_dtype
+        )
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
@@ -485,10 +526,8 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
     m_a_scales = m if per_act_token else 1
     n_b_scales = n if per_out_ch else 1
 
-    scale_a = (torch.randn(
-        (m_a_scales, 1), device="cuda", dtype=torch.float32) / 10)
-    scale_b = (torch.randn(
-        (1, n_b_scales), device="cuda", dtype=torch.float32) / 10)
+    scale_a = torch.randn((m_a_scales, 1), device="cuda", dtype=torch.float32) / 10
+    scale_b = torch.randn((1, n_b_scales), device="cuda", dtype=torch.float32) / 10
 
     # Construct a trivial model with a single layer that calls a CUTLASS kernel
     model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16)
@@ -502,13 +541,14 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
     out.zero_()
     g.replay()
 
-    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
-                        scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16)
+    baseline = torch.mm(
+        scale_a * a.to(dtype=torch.float32), scale_b * b.to(dtype=torch.float32)
+    ).to(torch.bfloat16)
     torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
 
 def test_cutlass_support_opcheck():
-    opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, ))
+    opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability,))
 
 
 @pytest.mark.parametrize("num_experts", [8, 64])
@@ -517,11 +557,13 @@ def test_cutlass_support_opcheck():
 @pytest.mark.parametrize("use_bias", [False])
 @pytest.mark.skipif(
     (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
-def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
-                                per_out_ch: bool, use_bias: bool):
-
+        current_platform.get_device_capability()
+    ),
+    reason="Grouped gemm is not supported on this GPU type.",
+)
+def test_cutlass_fp8_group_gemm(
+    num_experts: int, per_act_token: bool, per_out_ch: bool, use_bias: bool
+):
     # Device and dtype setup
     device = "cuda"
     out_dtype = torch.half
@@ -533,13 +575,9 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
     b_scales_tensors = []
     baseline_tensors = []
 
-    expert_offsets = torch.zeros((num_experts + 1),
-                                 device=device,
-                                 dtype=torch.int64)
+    expert_offsets = torch.zeros((num_experts + 1), device=device, dtype=torch.int64)
 
-    problem_sizes = torch.zeros((num_experts, 3),
-                                device=device,
-                                dtype=torch.int32)
+    problem_sizes = torch.zeros((num_experts, 3), device=device, dtype=torch.int32)
 
     if not per_act_token:
         one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32)
@@ -566,75 +604,76 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
         b_tensors.append(b_g)
 
         # Set up A/B scales
-        scale_b = torch.randn((1, n_b_scales),
-                              device=device,
-                              dtype=torch.float32)
+        scale_b = torch.randn((1, n_b_scales), device=device, dtype=torch.float32)
         b_scales_tensors.append(scale_b)
 
         if per_act_token:
-            scale_a = torch.randn((m_a_scales, 1),
-                                  device=device,
-                                  dtype=torch.float32)
+            scale_a = torch.randn((m_a_scales, 1), device=device, dtype=torch.float32)
             a_scales_tensors.append(scale_a)
         else:
             scale_a = one_scale_a
 
         # Compute baseline result for this group
-        baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype,
-                                        None)
+        baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype, None)
         baseline_tensors.append(baseline_g)
 
-    a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g),
-                                    device=device,
-                                    dtype=torch.float8_e4m3fn)
-    b_tensors_stacked = torch.empty((num_experts, n_g, k_g),
-                                    device=device,
-                                    dtype=torch.float8_e4m3fn)
+    a_tensors_stacked = torch.empty(
+        (expert_offsets[num_experts], k_g), device=device, dtype=torch.float8_e4m3fn
+    )
+    b_tensors_stacked = torch.empty(
+        (num_experts, n_g, k_g), device=device, dtype=torch.float8_e4m3fn
+    )
 
     for g in range(num_experts):
-        a_tensors_stacked[expert_offsets[g]:expert_offsets[g +
-                                                           1]] = a_tensors[g]
+        a_tensors_stacked[expert_offsets[g] : expert_offsets[g + 1]] = a_tensors[g]
         b_tensors_stacked[g] = b_tensors[g].t()
     b_tensors_stacked = b_tensors_stacked.transpose(1, 2)
 
     if per_act_token:
         a_scales_tensors_stacked = torch.empty(
-            (expert_offsets[num_experts], 1),
-            device=device,
-            dtype=torch.float32)
+            (expert_offsets[num_experts], 1), device=device, dtype=torch.float32
+        )
         for g in range(num_experts):
-            a_scales_tensors_stacked[
-                expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g]
+            a_scales_tensors_stacked[expert_offsets[g] : expert_offsets[g + 1]] = (
+                a_scales_tensors[g]
+            )
     else:
         a_scales_tensors_stacked = one_scale_a
 
-    b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales),
-                                           device=device,
-                                           dtype=torch.float32)
+    b_scales_tensors_stacked = torch.empty(
+        (num_experts, n_b_scales), device=device, dtype=torch.float32
+    )
     for g in range(num_experts):
         b_scales_tensors_stacked[g] = b_scales_tensors[g]
 
-    out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g),
-                                      device=device,
-                                      dtype=out_dtype)
-
-    ab_strides = torch.full((num_experts, ),
-                            a_tensors_stacked.stride(0),
-                            device="cuda",
-                            dtype=torch.int64)
-    c_strides = torch.full((num_experts, ),
-                           out_tensors_stacked.stride(0),
-                           device="cuda",
-                           dtype=torch.int64)
-
-    ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
-                       b_tensors_stacked, a_scales_tensors_stacked,
-                       b_scales_tensors_stacked, expert_offsets[:-1],
-                       problem_sizes, ab_strides, ab_strides, c_strides,
-                       per_act_token, per_out_ch)
+    out_tensors_stacked = torch.zeros(
+        (expert_offsets[num_experts], n_g), device=device, dtype=out_dtype
+    )
+
+    ab_strides = torch.full(
+        (num_experts,), a_tensors_stacked.stride(0), device="cuda", dtype=torch.int64
+    )
+    c_strides = torch.full(
+        (num_experts,), out_tensors_stacked.stride(0), device="cuda", dtype=torch.int64
+    )
+
+    ops.cutlass_moe_mm(
+        out_tensors_stacked,
+        a_tensors_stacked,
+        b_tensors_stacked,
+        a_scales_tensors_stacked,
+        b_scales_tensors_stacked,
+        expert_offsets[:-1],
+        problem_sizes,
+        ab_strides,
+        ab_strides,
+        c_strides,
+        per_act_token,
+        per_out_ch,
+    )
 
     # Validate each group's result against the baseline
     for g in range(num_experts):
         baseline = baseline_tensors[g]
-        c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
+        c = out_tensors_stacked[expert_offsets[g] : expert_offsets[g + 1]]
         torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)
diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py
index f659408efe8c68d056fbc456411158b77a1dd27e..465e24fd7eb972d80e280a84a7bd00456d5d0de1 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
@@ -6,14 +6,15 @@ Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
 """
 
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    pack_rows, quantize_weights)
+    pack_rows,
+    quantize_weights,
+)
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
@@ -24,16 +25,33 @@ from vllm.scalar_type import ScalarType, scalar_types
 #  have kernels and some kernels support multiple quantization methods.
 IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
 
-MNK_SHAPES = [(1, 128, 128), (1, 512, 1024), (1, 4096, 4096), (1, 8192, 28672),
-              (13, 8192, 4096), (26, 4096, 8192), (64, 4096, 4096),
-              (64, 8192, 28672), (257, 128, 4096), (257, 4096, 4096),
-              (1024, 4096, 8192), (1024, 8192, 4096)]
+MNK_SHAPES = [
+    (1, 128, 128),
+    (1, 512, 1024),
+    (1, 4096, 4096),
+    (1, 8192, 28672),
+    (13, 8192, 4096),
+    (26, 4096, 8192),
+    (64, 4096, 4096),
+    (64, 8192, 28672),
+    (257, 128, 4096),
+    (257, 4096, 4096),
+    (1024, 4096, 8192),
+    (1024, 8192, 4096),
+]
 
 # TODO(czhu): get supported schedules from fn
 SCHEDULES = [
-    '128x16_1x1x1', '256x16_1x1x1', '128x32_1x1x1', '256x32_1x1x1',
-    '128x64_1x1x1', '256x64_1x1x1', '128x128_1x1x1', '256x128_1x1x1',
-    '128x256_1x1x1', '128x256_2x1x1'
+    "128x16_1x1x1",
+    "256x16_1x1x1",
+    "128x32_1x1x1",
+    "256x32_1x1x1",
+    "128x64_1x1x1",
+    "256x64_1x1x1",
+    "128x128_1x1x1",
+    "256x128_1x1x1",
+    "128x256_1x1x1",
+    "128x256_2x1x1",
 ]
 
 
@@ -41,10 +59,10 @@ SCHEDULES = [
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 @dataclass
@@ -60,19 +78,23 @@ class Tensors:
 
 # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
 #  Ch Scales Type, Tok Scales Type)
-TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
-                      Optional[torch.dtype], bool]
+TestTypeTuple = tuple[
+    list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool
+]
 TEST_TYPES = [
     *(
-        TypeConfig(act_type=torch.float8_e4m3fn,
-                   weight_type=w_type,
-                   output_type=o_type,
-                   group_scale_type=torch.float8_e4m3fn,
-                   channel_scale_type=torch.float32,
-                   token_scale_type=torch.float32)
+        TypeConfig(
+            act_type=torch.float8_e4m3fn,
+            weight_type=w_type,
+            output_type=o_type,
+            group_scale_type=torch.float8_e4m3fn,
+            channel_scale_type=torch.float32,
+            token_scale_type=torch.float32,
+        )
         for w_type in [scalar_types.int4]
         # TODO(czhu): fp16 out type
-        for o_type in [torch.bfloat16]),
+        for o_type in [torch.bfloat16]
+    ),
 ]
 
 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
@@ -86,26 +108,28 @@ IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
 # For testing quantized linear kernels
 def to_fp8(tensor: torch.Tensor):
     finfo = torch.finfo(torch.float8_e4m3fn)
-    return tensor.clamp(min=finfo.min,
-                        max=finfo.max).to(dtype=torch.float8_e4m3fn)
+    return tensor.clamp(min=finfo.min, max=finfo.max).to(dtype=torch.float8_e4m3fn)
 
 
-def cutlass_quantize_and_pack(atype: torch.dtype,
-                              w: torch.Tensor,
-                              wtype: ScalarType,
-                              stype: Optional[torch.dtype],
-                              group_size: Optional[int],
-                              zero_points: bool = False):
+def cutlass_quantize_and_pack(
+    atype: torch.dtype,
+    w: torch.Tensor,
+    wtype: ScalarType,
+    stype: torch.dtype | None,
+    group_size: int | None,
+    zero_points: bool = False,
+):
     assert wtype.is_integer(), "TODO: support floating point weights"
 
-    w_ref, w_q, w_s, w_zp = quantize_weights(w,
-                                             wtype,
-                                             group_size=group_size,
-                                             zero_points=zero_points)
+    w_ref, w_q, w_s, w_zp = quantize_weights(
+        w, wtype, group_size=group_size, zero_points=zero_points
+    )
 
     # since scales are cast to fp8, we need to compute w_ref this way
-    w_ref = ((w_q).to(torch.float32) * w_s.to(atype).to(
-        torch.float32).repeat_interleave(group_size, dim=0)).to(atype)
+    w_ref = (
+        (w_q).to(torch.float32)
+        * w_s.to(atype).to(torch.float32).repeat_interleave(group_size, dim=0)
+    ).to(atype)
 
     # bit mask prevents sign extending int4 when packing
     w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
@@ -117,12 +141,14 @@ def cutlass_quantize_and_pack(atype: torch.dtype,
     return w_ref, w_q_packed, w_s_packed, w_zp
 
 
-def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
-                        group_size: Optional[int]) -> Tensors:
+def create_test_tensors(
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
+) -> Tensors:
     m, n, k = shape
 
-    print("create_test_tensors, shape:", shape, "types:", types, "group_size:",
-          group_size)
+    print(
+        "create_test_tensors, shape:", shape, "types:", types, "group_size:", group_size
+    )
 
     a = to_fp8(torch.randn((m, k), device="cuda"))
     w = to_fp8(torch.randn((k, n), device="cuda"))
@@ -133,30 +159,34 @@ def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
         w = w.to(torch.float16)
 
     w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
-        a.dtype, w, types.weight_type, types.group_scale_type, group_size,
-        False)
+        a.dtype, w, types.weight_type, types.group_scale_type, group_size, False
+    )
 
     a_ref = a.to(torch.float32)
     w_ref = w_ref.to(torch.float32)
 
     # for the practical use case we need per-tok scales for fp8 activations
-    w_tok_s = torch.randn((m, ), device='cuda', dtype=types.token_scale_type)
+    w_tok_s = torch.randn((m,), device="cuda", dtype=types.token_scale_type)
     # weights are already per-group quantized, use placeholder here
-    w_ch_s = torch.ones((n, ), device='cuda', dtype=types.channel_scale_type)
-
-    return Tensors(w_ref=w_ref,
-                   a_ref=a_ref,
-                   a=a,
-                   w_q=w_q_packed,
-                   w_g_s=w_s,
-                   w_ch_s=w_ch_s,
-                   w_tok_s=w_tok_s)
+    w_ch_s = torch.ones((n,), device="cuda", dtype=types.channel_scale_type)
+
+    return Tensors(
+        w_ref=w_ref,
+        a_ref=a_ref,
+        a=a,
+        w_q=w_q_packed,
+        w_g_s=w_s,
+        w_ch_s=w_ch_s,
+        w_tok_s=w_tok_s,
+    )
 
 
-def mm_test_helper(types: TypeConfig,
-                   tensors: Tensors,
-                   group_size: Optional[int] = None,
-                   schedule: Optional[str] = None):
+def mm_test_helper(
+    types: TypeConfig,
+    tensors: Tensors,
+    group_size: int | None = None,
+    schedule: str | None = None,
+):
     # CUTLASS upstream uses fp8 with fastaccum as reference
     # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
     output_ref = torch._scaled_mm(
@@ -165,7 +195,8 @@ def mm_test_helper(types: TypeConfig,
         tensors.w_tok_s.unsqueeze(1),
         tensors.w_ch_s.unsqueeze(0),
         out_dtype=types.output_type,
-        use_fast_accum=True)
+        use_fast_accum=True,
+    )
 
     output = ops.cutlass_w4a8_mm(
         a=tensors.a,
@@ -179,17 +210,15 @@ def mm_test_helper(types: TypeConfig,
     print(output)
     print(output_ref)
 
-    torch.testing.assert_close(output,
-                               output_ref.to(output.dtype),
-                               rtol=1e-3,
-                               atol=1e-3)
+    torch.testing.assert_close(
+        output, output_ref.to(output.dtype), rtol=1e-3, atol=1e-3
+    )
 
 
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="CUTLASS W4A8 is not supported on this GPU type.")
-@pytest.mark.parametrize("shape",
-                         MNK_SHAPES,
-                         ids=lambda x: "x".join(str(v) for v in x))
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="CUTLASS W4A8 is not supported on this GPU type."
+)
+@pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x))
 @pytest.mark.parametrize("types", TEST_TYPES)
 @pytest.mark.parametrize("schedule", SCHEDULES)
 def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
@@ -201,7 +230,6 @@ def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
 
 # Test to make sure cuda graphs work
 class W4A8Layer(torch.nn.Module):
-
     def __init__(self, **kwargs):
         super().__init__()
         self.kwargs = kwargs
@@ -210,8 +238,9 @@ class W4A8Layer(torch.nn.Module):
         return ops.cutlass_w4a8_mm(a=a, **self.kwargs)
 
 
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="CUTLASS W4A8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="CUTLASS W4A8 is not supported on this GPU type."
+)
 def test_w4a8_cuda_graph():
     m, n, k = 512, 4096, 4096
 
@@ -224,10 +253,11 @@ def test_w4a8_cuda_graph():
     zero_points = False
 
     w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
-        a.dtype, b.to(torch.float16), wtype, stype, group_size, zero_points)
+        a.dtype, b.to(torch.float16), wtype, stype, group_size, zero_points
+    )
 
-    w_tok_s = torch.randn((m, ), device='cuda', dtype=torch.float32)
-    w_ch_s = torch.ones((n, ), device='cuda', dtype=torch.float32)
+    w_tok_s = torch.randn((m,), device="cuda", dtype=torch.float32)
+    w_ch_s = torch.ones((n,), device="cuda", dtype=torch.float32)
 
     # Construct a trivial model with a single layer that calls the kernel
     model = W4A8Layer(
@@ -244,7 +274,8 @@ def test_w4a8_cuda_graph():
         w_tok_s.unsqueeze(1),
         w_ch_s.unsqueeze(0),
         out_dtype=torch.bfloat16,
-        use_fast_accum=True)
+        use_fast_accum=True,
+    )
 
     # Run the model with a cuda graph
     stream = torch.cuda.Stream()
diff --git a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
index 131086a5f7034d010cab0cb077758bdc6f8ec62f..1e5c7dafb0f5a48e36babd3d8e9d75cee507b467 100644
--- a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -2,8 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
-from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX,
-                         convert_swizzled_to_linear, dequantize_nvfp4_to_dtype)
+from nvfp4_utils import (
+    FLOAT4_E2M1_MAX,
+    FLOAT8_E4M3_MAX,
+    convert_swizzled_to_linear,
+    dequantize_nvfp4_to_dtype,
+)
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
@@ -41,18 +45,12 @@ def get_ref_results(
     _, m_k = a_fp4.shape
     _, n_k = b_fp4.shape
     assert m_k == n_k
-    a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4,
-                                           a_sf,
-                                           a_global_scale,
-                                           dtype=dtype,
-                                           device=device,
-                                           block_size=block_size)
-    b_in_dtype = dequantize_nvfp4_to_dtype(b_fp4,
-                                           b_sf,
-                                           b_global_scale,
-                                           dtype=dtype,
-                                           device=device,
-                                           block_size=block_size)
+    a_in_dtype = dequantize_nvfp4_to_dtype(
+        a_fp4, a_sf, a_global_scale, dtype=dtype, device=device, block_size=block_size
+    )
+    b_in_dtype = dequantize_nvfp4_to_dtype(
+        b_fp4, b_sf, b_global_scale, dtype=dtype, device=device, block_size=block_size
+    )
     return torch.matmul(a_in_dtype, b_in_dtype.t())
 
 
@@ -72,8 +70,7 @@ def test_flashinfer_nvfp4_gemm(
     autotune: bool,
 ) -> None:
     if backend == "trtllm" and dtype == torch.float16:
-        pytest.skip(
-            "Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
+        pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
 
     current_platform.seed_everything(seed)
     m, n, packed_k = shape
@@ -82,10 +79,12 @@ def test_flashinfer_nvfp4_gemm(
     a_dtype = torch.randn((m, k), dtype=dtype, device=device)
     b_dtype = torch.randn((n, k), dtype=dtype, device=device)
 
-    a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(a_dtype.flatten(), dim=-1)).to(torch.float32)
-    b_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(b_dtype.flatten(), dim=-1)).to(torch.float32)
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+    b_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
     alpha = 1.0 / (a_global_scale * b_global_scale)
     # ops.scaled_fp4_quant returns swizzled scales, while weights
     # from checkpoints are in linear scales.
@@ -113,14 +112,18 @@ def test_flashinfer_nvfp4_gemm(
 
     if backend == "trtllm":
         epilogue_tile_m = 128
-        b_fp4 = flashinfer.shuffle_matrix_a(b_fp4.view(torch.uint8),
-                                            epilogue_tile_m)
+        b_fp4 = flashinfer.shuffle_matrix_a(b_fp4.view(torch.uint8), epilogue_tile_m)
 
         b_scale_interleaved = convert_swizzled_to_linear(
-            b_scale_interleaved, n, k, block_size)
-        b_scale_interleaved = (flashinfer.shuffle_matrix_sf_a(
-            b_scale_interleaved.view(torch.uint8), epilogue_tile_m).reshape(
-                b_scale_interleaved.shape).view(torch.float8_e4m3fn))
+            b_scale_interleaved, n, k, block_size
+        )
+        b_scale_interleaved = (
+            flashinfer.shuffle_matrix_sf_a(
+                b_scale_interleaved.view(torch.uint8), epilogue_tile_m
+            )
+            .reshape(b_scale_interleaved.shape)
+            .view(torch.float8_e4m3fn)
+        )
 
     with flashinfer.autotune(autotune):
         out = flashinfer_scaled_fp4_mm(
@@ -133,7 +136,4 @@ def test_flashinfer_nvfp4_gemm(
             backend=backend,
         )
 
-    torch.testing.assert_close(out,
-                               expected_out.to(dtype=dtype),
-                               atol=1e-1,
-                               rtol=1e-1)
+    torch.testing.assert_close(out, expected_out.to(dtype=dtype), atol=1e-1, rtol=1e-1)
diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
index 9f669c6df8bd5fc579af41491a49c943e490574b..b30821b6895bcf9d275ebda9076750c6ac29247e 100644
--- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
@@ -9,8 +9,7 @@ from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
-        reason=
-        "Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
+        reason="Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
         allow_module_level=True,
     )
 
@@ -53,7 +52,7 @@ def test_flashinfer_fp8_gemm(
     ).to(dtype=dtype)
 
     if use_bias:
-        bias = torch.randn((n, ), dtype=dtype, device=device)
+        bias = torch.randn((n,), dtype=dtype, device=device)
         expected_out = expected_out + bias
     else:
         bias = None
diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py
index c2e70ffb8d343c45c9cb91df7e73bf8568cd3a5e..19aa21b96a57372eee6d0be2c4d175b1661b6da5 100644
--- a/tests/kernels/quantization/test_fp8_quant.py
+++ b/tests/kernels/quantization/test_fp8_quant.py
@@ -5,9 +5,11 @@ import pytest
 import torch
 
 import vllm._custom_ops as ops
-from tests.kernels.quant_utils import (FP8_DTYPE,
-                                       ref_dynamic_per_tensor_fp8_quant,
-                                       ref_dynamic_per_token_quant)
+from tests.kernels.quant_utils import (
+    FP8_DTYPE,
+    ref_dynamic_per_tensor_fp8_quant,
+    ref_dynamic_per_token_quant,
+)
 from tests.kernels.utils import opcheck
 from vllm.platforms import current_platform
 
@@ -18,23 +20,25 @@ SCALE_UBS = [True, False]
 SEEDS = [0]
 
 
-def opcheck_fp8_quant(output,
-                      input,
-                      scale=None,
-                      scale_ub=None,
-                      use_per_token_if_dynamic=False):
+def opcheck_fp8_quant(
+    output, input, scale=None, scale_ub=None, use_per_token_if_dynamic=False
+):
     if scale is not None:
         opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale))
     elif use_per_token_if_dynamic:
-        scale = torch.empty((input.shape[0], 1),
-                            device=input.device,
-                            dtype=torch.float32)
-        opcheck(torch.ops._C.dynamic_per_token_scaled_fp8_quant,
-                (output, input, scale, scale_ub))
+        scale = torch.empty(
+            (input.shape[0], 1), device=input.device, dtype=torch.float32
+        )
+        opcheck(
+            torch.ops._C.dynamic_per_token_scaled_fp8_quant,
+            (output, input, scale, scale_ub),
+        )
     else:
-        scale = torch.empty((input.numel() // input.shape[-1], 1),
-                            device=input.device,
-                            dtype=torch.float32)
+        scale = torch.empty(
+            (input.numel() // input.shape[-1], 1),
+            device=input.device,
+            dtype=torch.float32,
+        )
         opcheck(torch.ops._C.dynamic_scaled_fp8_quant, (output, input, scale))
 
 
@@ -44,30 +48,29 @@ def opcheck_fp8_quant(output,
 @pytest.mark.parametrize("scale_ub", SCALE_UBS)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
-def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
-                                     dtype: torch.dtype, scale_ub: bool,
-                                     seed: int) -> None:
+def test_dynamic_per_token_fp8_quant(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
+) -> None:
     current_platform.seed_everything(seed)
 
-    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
-                   device="cuda") + 1e-6  # avoid nans
+    x = (
+        torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
+    )  # avoid nans
 
-    scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \
-            if scale_ub else None
+    scale_ub = (
+        torch.mean(x).to(dtype=torch.float32, device="cuda") if scale_ub else None
+    )
     ref_out, ref_scales = ref_dynamic_per_token_quant(x, FP8_DTYPE, scale_ub)
-    ops_out, ops_scales = ops.scaled_fp8_quant(x,
-                                               scale_ub=scale_ub,
-                                               use_per_token_if_dynamic=True)
+    ops_out, ops_scales = ops.scaled_fp8_quant(
+        x, scale_ub=scale_ub, use_per_token_if_dynamic=True
+    )
 
     torch.testing.assert_close(ref_scales, ops_scales)
-    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
-                               ops_out.to(dtype=torch.float32))
+    torch.testing.assert_close(
+        ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32)
+    )
 
-    opcheck_fp8_quant(ops_out,
-                      x,
-                      None,
-                      scale_ub,
-                      use_per_token_if_dynamic=True)
+    opcheck_fp8_quant(ops_out, x, None, scale_ub, use_per_token_if_dynamic=True)
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -75,8 +78,9 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
-def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
-                                      dtype: torch.dtype, seed: int) -> None:
+def test_dynamic_per_tensor_fp8_quant(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
+) -> None:
     current_platform.seed_everything(seed)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
@@ -85,8 +89,9 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
     ops_out, ops_scale = ops.scaled_fp8_quant(x)
 
     torch.testing.assert_close(ref_scale, ops_scale)
-    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
-                               ops_out.to(dtype=torch.float32))
+    torch.testing.assert_close(
+        ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32)
+    )
 
     opcheck_fp8_quant(ops_out, x)
 
diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py
index 720eee62760db7c8b3176dc3aa38af626abbbd15..6628ac650fd5ff1c628fdfaeb04abb82058e029e 100644
--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@@ -6,8 +6,7 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
 
 
@@ -18,11 +17,14 @@ from vllm.platforms import current_platform
         (64, 1024, 64),  # Medium
         (128, 2048, 128),  # Large
         (8, 513, 64),  # Non-divisible (native only)
-    ])
+    ],
+)
 @pytest.mark.parametrize("seed", [42])
+@pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
-                                      group_size: int, seed: int) -> None:
+def test_quantfp8_group_functionality(
+    batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool
+) -> None:
     """Test QuantFP8 group quantization with various configurations.
 
     Tests both CUDA and native implementations, column-major scales,
@@ -30,15 +32,17 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
     """
     current_platform.seed_everything(seed)
 
-    x = torch.randn(
-        (batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
+    x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
     expected_num_groups = (hidden_dim + group_size - 1) // group_size
     is_divisible = hidden_dim % group_size == 0
 
     group_shape = GroupShape(1, group_size)
-    quant_op = QuantFP8(static=False,
-                        group_shape=group_shape,
-                        column_major_scales=False)
+    quant_op = QuantFP8(
+        static=False,
+        group_shape=group_shape,
+        column_major_scales=False,
+        use_ue8m0=use_ue8m0,
+    )
 
     # 1. Test native implementation (always available)
     x_quant_native, scales_native = quant_op.forward_native(x.clone())
@@ -46,11 +50,19 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
     assert scales_native.shape == (batch_size, expected_num_groups)
 
     # 2. Test column-major scales configuration
-    quant_op_col = QuantFP8(static=False,
-                            group_shape=group_shape,
-                            column_major_scales=True)
+    quant_op_col = QuantFP8(
+        static=False,
+        group_shape=group_shape,
+        column_major_scales=True,
+        use_ue8m0=use_ue8m0,
+    )
     _, scales_col = quant_op_col.forward_native(x.clone())
-    assert scales_col.shape == (expected_num_groups, batch_size)
+    assert scales_col.shape == (batch_size, expected_num_groups)
+    assert scales_col.stride(0) == 1
+    assert scales_col.stride(1) == batch_size
+
+    # Test column-major scales consistency
+    assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
 
     # 3. Test CUDA implementation (only for divisible dimensions)
     if is_divisible:
@@ -68,47 +80,57 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
 
 
 @pytest.mark.parametrize("seed", [42])
+@pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int) -> None:
+def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
     current_platform.seed_everything(seed)
 
     group_size = 64
 
     # Test with 3D input
-    batch1, batch2, hidden_dim = 4, 8, 512
-    x_3d = torch.randn(
-        (batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
+    batch1, batch2, hidden_dim = 4, 8, 1024
+    x_3d = (
+        torch.randn((batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda")
+        * 8
+    )
 
     group_shape = GroupShape(1, group_size)
-    quant_op = QuantFP8(static=False,
-                        group_shape=group_shape,
-                        column_major_scales=False)
+    quant_op = QuantFP8(
+        static=False,
+        group_shape=group_shape,
+        column_major_scales=False,
+        use_ue8m0=use_ue8m0,
+    )
 
     x_quant, scales = quant_op.forward_native(x_3d.clone())
     assert x_quant.shape == x_3d.shape
     assert scales.shape == (batch1, batch2, hidden_dim // group_size)
 
     # Test column_major_scales with multi-dim
-    quant_op_col = QuantFP8(static=False,
-                            group_shape=group_shape,
-                            column_major_scales=True)
+    quant_op_col = QuantFP8(
+        static=False,
+        group_shape=group_shape,
+        column_major_scales=True,
+        use_ue8m0=use_ue8m0,
+    )
     _, scales_col = quant_op_col.forward_native(x_3d.clone())
-    assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
+    assert scales_col.shape == (batch1, batch2, hidden_dim // group_size)
 
     # Test with 4D input
     batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256
-    x_4d = torch.randn((batch1, batch2, batch3, hidden_dim),
-                       dtype=torch.bfloat16,
-                       device="cuda") * 8
+    x_4d = (
+        torch.randn(
+            (batch1, batch2, batch3, hidden_dim), dtype=torch.bfloat16, device="cuda"
+        )
+        * 8
+    )
 
     x_quant_4d, scales_4d = quant_op.forward_native(x_4d.clone())
     assert x_quant_4d.shape == x_4d.shape
-    assert scales_4d.shape == (batch1, batch2, batch3,
-                               hidden_dim // group_size)
+    assert scales_4d.shape == (batch1, batch2, batch3, hidden_dim // group_size)
 
     _, scales_4d_col = quant_op_col.forward_native(x_4d.clone())
-    assert scales_4d_col.shape == (batch1, batch2, hidden_dim // group_size,
-                                   batch3)
+    assert scales_4d_col.shape == (batch1, batch2, hidden_dim // group_size, batch3)
 
 
 @pytest.mark.parametrize("seed", [42])
@@ -120,30 +142,24 @@ def test_quantfp8_group_edge_cases(seed: int) -> None:
     group_size = 64
 
     # Test with single group (group_size >= hidden_dim)
-    x_small = torch.randn(
-        (batch_size, 32), dtype=torch.bfloat16, device="cuda") * 8
+    x_small = torch.randn((batch_size, 32), dtype=torch.bfloat16, device="cuda") * 8
     group_shape = GroupShape(1, group_size)
-    quant_op = QuantFP8(static=False,
-                        group_shape=group_shape,
-                        column_major_scales=False)
+    quant_op = QuantFP8(
+        static=False, group_shape=group_shape, column_major_scales=False
+    )
 
     x_quant_small, scales_small = quant_op.forward_native(x_small.clone())
     assert x_quant_small.shape == x_small.shape
     assert scales_small.shape == (batch_size, 1)
 
     # Test with zero inputs
-    x_zero = torch.zeros((batch_size, 256),
-                         dtype=torch.bfloat16,
-                         device="cuda")
+    x_zero = torch.zeros((batch_size, 256), dtype=torch.bfloat16, device="cuda")
     x_quant_zero, scales_zero = quant_op.forward_native(x_zero.clone())
     assert x_quant_zero.shape == x_zero.shape
     assert (scales_zero > 0).all(), "Scales should be clamped to minimum"
 
     # Test very large values
-    x_large = torch.full((batch_size, 256),
-                         1000.0,
-                         dtype=torch.bfloat16,
-                         device="cuda")
+    x_large = torch.full((batch_size, 256), 1000.0, dtype=torch.bfloat16, device="cuda")
     x_quant_large, scales_large = quant_op.forward_native(x_large.clone())
     assert x_quant_large.shape == x_large.shape
     # FP8 max is typically 448 or 224, so scales should be > 1
diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py
index 07651fef39bf4bc69e512d7721cfcda49d8bbb76..0dc24187f2b346e11fea37fcc02f36b42b4fa6b5 100644
--- a/tests/kernels/quantization/test_ggml.py
+++ b/tests/kernels/quantization/test_ggml.py
@@ -13,33 +13,42 @@ from vllm import _custom_ops as ops  # noqa: F401
 def test_ggml_opcheck(quant_type):
     block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
     shape = [256, 1152]
-    qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
+    qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8)
     m = qweight.shape[0]
     n = qweight.shape[1] // type_size * block_size
-    opcheck(torch.ops._C.ggml_dequantize,
-            (qweight, quant_type, m, n, torch.float16))
+    opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n, torch.float16))
 
-    x = torch.rand((m, 512), device='cuda', dtype=torch.float16)
-    opcheck(torch.ops._C.ggml_mul_mat_a8,
-            (qweight, x, quant_type, qweight.shape[0]))
-    opcheck(torch.ops._C.ggml_mul_mat_vec_a8,
-            (qweight, x, quant_type, qweight.shape[0]))
+    x = torch.rand((m, 512), device="cuda", dtype=torch.float16)
+    opcheck(torch.ops._C.ggml_mul_mat_a8, (qweight, x, quant_type, qweight.shape[0]))
+    opcheck(
+        torch.ops._C.ggml_mul_mat_vec_a8, (qweight, x, quant_type, qweight.shape[0])
+    )
 
     shape = [256, 1024, 336]
-    qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
-    x = torch.rand((1, 1024), device='cuda', dtype=torch.float16)
-    sorted_token_ids = torch.arange(776, device='cuda')
-    expert_ids = torch.randint(0, 256, (194, ), device='cuda')
-    num_tokens_post_padded = torch.tensor([1],
-                                          dtype=torch.int64,
-                                          device='cuda')
-
-    opcheck(torch.ops._C.ggml_moe_a8,
-            (x, qweight, sorted_token_ids, expert_ids, num_tokens_post_padded,
-             quant_type, qweight.shape[0], 1, x.shape[0]))
+    qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8)
+    x = torch.rand((1, 1024), device="cuda", dtype=torch.float16)
+    sorted_token_ids = torch.arange(776, device="cuda")
+    expert_ids = torch.randint(0, 256, (194,), device="cuda")
+    num_tokens_post_padded = torch.tensor([1], dtype=torch.int64, device="cuda")
 
-    topk_ids = torch.zeros((1, 1), device='cuda', dtype=torch.int32)
+    opcheck(
+        torch.ops._C.ggml_moe_a8,
+        (
+            x,
+            qweight,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            quant_type,
+            qweight.shape[0],
+            1,
+            x.shape[0],
+        ),
+    )
+
+    topk_ids = torch.zeros((1, 1), device="cuda", dtype=torch.int32)
 
     opcheck(
         torch.ops._C.ggml_moe_a8_vec,
-        (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]))
+        (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]),
+    )
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
index 436d5cb640219797f512a9f40b2d48f1b0df4b3c..0988ba01759f250dc0c120220f7038dc6d19b9f7 100644
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -18,8 +18,8 @@ GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
 
 
 def get_gguf_sample_tensors(
-        hidden_size: int,
-        quant_type: GGMLQuantizationType) -> list[ReaderTensor]:
+    hidden_size: int, quant_type: GGMLQuantizationType
+) -> list[ReaderTensor]:
     sample_dir = GGUF_SAMPLE
     filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"
     sample_file = Path(sample_dir) / filename
@@ -27,8 +27,8 @@ def get_gguf_sample_tensors(
 
 
 def get_gguf_MoE_tensors(
-        hidden_size: int,
-        quant_type: GGMLQuantizationType) -> list[ReaderTensor]:
+    hidden_size: int, quant_type: GGMLQuantizationType
+) -> list[ReaderTensor]:
     sample_dir = GGUF_SAMPLE_MOE
     filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"
     sample_file = Path(sample_dir) / filename
@@ -68,17 +68,20 @@ QUANT_TYPES = [
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_type", QUANT_TYPES)
 @torch.inference_mode()
-def test_dequantize(hidden_size: int, dtype: torch.dtype,
-                    quant_type: GGMLQuantizationType):
+def test_dequantize(
+    hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType
+):
     tensors = get_gguf_sample_tensors(hidden_size, quant_type)
     for tensor in tensors:
         shape_str = tensor.name.split("_")[-1]
         shape = map(int, shape_str.split("x"))
 
-        ref_output = torch.tensor(dequantize(tensor.data, quant_type),
-                                  device="cuda").to(dtype)
-        output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),
-                                     quant_type, *list(shape), dtype)
+        ref_output = torch.tensor(
+            dequantize(tensor.data, quant_type), device="cuda"
+        ).to(dtype)
+        output = ops.ggml_dequantize(
+            torch.tensor(tensor.data, device="cuda"), quant_type, *list(shape), dtype
+        )
 
         torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)
 
@@ -87,20 +90,21 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_type", QUANT_TYPES)
 @torch.inference_mode()
-def test_mmvq(hidden_size: int, dtype: torch.dtype,
-              quant_type: GGMLQuantizationType):
+def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
     current_platform.seed_everything(0)
 
     tensors = get_gguf_sample_tensors(hidden_size, quant_type)
     x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
     for tensor in tensors:
-        weight = torch.tensor(dequantize(tensor.data, quant_type),
-                              device="cuda").to(dtype)
+        weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to(
+            dtype
+        )
         ref_output = x @ weight.T
 
         qweight = torch.tensor(tensor.data, device="cuda")
-        output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type,
-                                         qweight.shape[0]).to(dtype)
+        output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type, qweight.shape[0]).to(
+            dtype
+        )
 
         torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
 
@@ -121,17 +125,23 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype,
         GGMLQuantizationType.Q4_0,
         GGMLQuantizationType.Q5_0,
         GGMLQuantizationType.Q8_0,
-    ])
+    ],
+)
 @torch.inference_mode()
-def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
-             quant_type: GGMLQuantizationType):
+def test_mmq(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    quant_type: GGMLQuantizationType,
+):
     current_platform.seed_everything(0)
 
     tensors = get_gguf_sample_tensors(hidden_size, quant_type)
     x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
     for tensor in tensors:
-        weight = torch.tensor(dequantize(tensor.data, quant_type),
-                              device="cuda").to(dtype)
+        weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to(
+            dtype
+        )
         ref_output = x @ weight.T
 
         qweight = torch.tensor(tensor.data, device="cuda")
@@ -141,10 +151,9 @@ def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
         # bfloat16 tends to accumulate and can greatly inflate rtol
         # since outputs are also very close to 0
         rtols = {torch.half: 1e-1, torch.bfloat16: 1e4, torch.float: 2e1}
-        torch.testing.assert_close(output,
-                                   ref_output,
-                                   atol=atols[dtype],
-                                   rtol=rtols[dtype])
+        torch.testing.assert_close(
+            output, ref_output, atol=atols[dtype], rtol=rtols[dtype]
+        )
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -153,35 +162,46 @@ def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_type", QUANT_TYPES)
 @torch.inference_mode()
-def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype,
-             quant_type: GGMLQuantizationType, top_k: int):
+def test_moe(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    quant_type: GGMLQuantizationType,
+    top_k: int,
+):
     current_platform.seed_everything(0)
     H, E = 1024, 256
 
     x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
 
     topk_weights = torch.rand(num_tokens, top_k, device="cuda", dtype=dtype)
-    topk_ids = torch.randint(0,
-                             E, (num_tokens, top_k),
-                             device="cuda",
-                             dtype=torch.int32)
+    topk_ids = torch.randint(
+        0, E, (num_tokens, top_k), device="cuda", dtype=torch.int32
+    )
 
     tensors = get_gguf_MoE_tensors(hidden_size, quant_type)
 
     w13 = tensors[0]
     w2 = tensors[1]
 
-    w13_dequant = torch.tensor(dequantize(w13.data, quant_type),
-                               device="cuda").to(dtype)
-
-    w2_dequant = torch.tensor(dequantize(w2.data, quant_type),
-                              device="cuda").to(dtype)
-
-    output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"),
-                             torch.tensor(w2.data,
-                                          device="cuda"), topk_weights,
-                             topk_ids, quant_type, quant_type, "silu")
-
-    ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights,
-                               topk_ids).reshape(output.shape)
+    w13_dequant = torch.tensor(dequantize(w13.data, quant_type), device="cuda").to(
+        dtype
+    )
+
+    w2_dequant = torch.tensor(dequantize(w2.data, quant_type), device="cuda").to(dtype)
+
+    output = _fused_moe_gguf(
+        x,
+        torch.tensor(w13.data, device="cuda"),
+        torch.tensor(w2.data, device="cuda"),
+        topk_weights,
+        topk_ids,
+        quant_type,
+        quant_type,
+        "silu",
+    )
+
+    ref_output = fused_experts(
+        x, w13_dequant, w2_dequant, topk_weights, topk_ids
+    ).reshape(output.shape)
     torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
diff --git a/tests/kernels/quantization/test_gptq.py b/tests/kernels/quantization/test_gptq.py
index 7fb57a1576bd86c57c081f7c651fa34ce85f791b..7bc7f97ce75b80bb9d36eceecce2e8079c7954de 100644
--- a/tests/kernels/quantization/test_gptq.py
+++ b/tests/kernels/quantization/test_gptq.py
@@ -8,25 +8,28 @@ from vllm import _custom_ops as ops  # noqa: F401
 
 
 def test_gptq_shuffle_opcheck():
-    weight = torch.randint(-2000000,
-                           2000000, (1792, 4096),
-                           device='cuda',
-                           dtype=torch.int32)
-    perm = torch.empty((0, ), device='cuda', dtype=torch.int32)
+    weight = torch.randint(
+        -2000000, 2000000, (1792, 4096), device="cuda", dtype=torch.int32
+    )
+    perm = torch.empty((0,), device="cuda", dtype=torch.int32)
     bit = 4
     opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit))
 
 
 def test_gptq_gemm_opcheck():
-    a = torch.rand((240, 4096), device='cuda', dtype=torch.float16)
-    weight = torch.randint(-2000000,
-                           2000000, (512, 6144),
-                           device='cuda',
-                           dtype=torch.int32)
-    zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32)
-    scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16)
-    idx = torch.empty((0, ), device='cuda', dtype=torch.int32)
+    a = torch.rand((240, 4096), device="cuda", dtype=torch.float16)
+    weight = torch.randint(
+        -2000000, 2000000, (512, 6144), device="cuda", dtype=torch.int32
+    )
+    zeros = torch.zeros((32, 768), device="cuda", dtype=torch.int32)
+    scales = torch.rand((32, 6144), device="cuda", dtype=torch.float16)
+    idx = torch.empty((0,), device="cuda", dtype=torch.int32)
     use_exllama = True
     bit = 4
-    opcheck(torch.ops._C.gptq_gemm,
-            (a, weight, zeros, scales, idx, use_exllama, bit))
+    # Test both GPTQv1 and GPTQv2 format
+    opcheck(
+        torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, True, bit)
+    )
+    opcheck(
+        torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, False, bit)
+    )
diff --git a/tests/kernels/quantization/test_hadacore.py b/tests/kernels/quantization/test_hadacore.py
index 127d68072e3f3d946de8cf85c56e77a736714987..3ccee9db048cf56e0b87e5cd96630c6e32b53be6 100644
--- a/tests/kernels/quantization/test_hadacore.py
+++ b/tests/kernels/quantization/test_hadacore.py
@@ -15,7 +15,8 @@ from vllm import _custom_ops as ops
 def test_hadacore(batch_size, hidden_dim, dtype=torch.bfloat16, device="cuda"):
     x = torch.eye(hidden_dim, dtype=dtype, device=device)
     hadamard = deterministic_hadamard_matrix(
-        hidden_dim, dtype=torch.float64, device="cuda") / math.sqrt(hidden_dim)
+        hidden_dim, dtype=torch.float64, device="cuda"
+    ) / math.sqrt(hidden_dim)
 
     y = ops.hadacore_transform(x.clone())
     y_true = (x.to(hadamard.dtype) @ hadamard.T).to(y.dtype)
diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py
index f2271e6be54204b91444413939acea9c3bf6ab6d..0e31e9aabea85d099ca10f5c39078dbfa99f6b82 100644
--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
@@ -11,12 +11,12 @@ from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.quantization.utils.int8_utils import (
-    per_token_quant_int8)
+    per_token_quant_int8,
+)
 from vllm.platforms import current_platform
 
 if current_platform.get_device_capability() < (7, 0):
-    pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
-                allow_module_level=True)
+    pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
 
 
 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
@@ -26,14 +26,13 @@ def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
     B = B.to(torch.float32)
 
     assert A.shape[-1] == B.shape[-1], "Dimension mismatch"
-    assert B.ndim == 2 and B.is_contiguous(
-    ), "B must be a 2D contiguous tensor"
+    assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor"
 
     # Reshape input
     M = A.numel() // A.shape[-1]
     B = B.t()  # Transpose weight matrix
     N, K = B.shape
-    origin_C_shape = A.shape[:-1] + (K, )
+    origin_C_shape = A.shape[:-1] + (K,)
     A = A.reshape(M, N)
 
     # As is per-token [M, 1], Bs is per-column [1, K]
@@ -43,8 +42,7 @@ def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
     return C.reshape(origin_C_shape).to(output_dtype)
 
 
-def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, topk, topk_weight,
-                              topk_ids):
+def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, topk, topk_weight, topk_ids):
     """This function performs fused moe with per-column int8 quantization
     using native torch."""
 
@@ -66,25 +64,22 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, topk, topk_weight,
         mask = topk_ids == i
         if mask.sum():
             # First MLP layer: note that a_s is now per-token
-            inter_out = native_w8a8_per_token_matmul(a_q[mask],
-                                                     w1[i],
-                                                     a_s[mask],
-                                                     w1_s[i],
-                                                     output_dtype=a.dtype)
+            inter_out = native_w8a8_per_token_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], output_dtype=a.dtype
+            )
             # Activation function
             act_out = SiluAndMul().forward_native(inter_out)
             # Quantize activation output with per-token
             act_out_q, act_out_s = per_token_quant_int8(act_out)
 
             # Second MLP layer
-            out[mask] = native_w8a8_per_token_matmul(act_out_q,
-                                                     w2[i],
-                                                     act_out_s,
-                                                     w2_s[i],
-                                                     output_dtype=a.dtype)
+            out[mask] = native_w8a8_per_token_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], output_dtype=a.dtype
+            )
     # Apply routing weights and sum
-    return (out.view(B, -1, w2.shape[1]) *
-            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
 
 
 @pytest.fixture(autouse=True, scope="module")
@@ -102,8 +97,10 @@ TOP_KS = [2, 6]
 SEEDS = [0]
 
 
-@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed",
-                         itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS))
+@pytest.mark.parametrize(
+    "M, N, K, E, topk, dtype, seed",
+    itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
+)
 @torch.inference_mode()
 def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
     torch.manual_seed(seed)
@@ -130,8 +127,9 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
     score = torch.softmax(score, dim=-1, dtype=torch.float32)
     topk_weights, topk_ids = torch.topk(score, topk)
 
-    ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, topk,
-                                        topk_weights, topk_ids)
+    ref_out = torch_w8a8_per_column_moe(
+        a, w1, w2, w1_s, w2_s, topk, topk_weights, topk_ids
+    )
 
     quant_config = FusedMoEQuantConfig.make(
         torch.int8,
@@ -151,7 +149,7 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
     )
 
     # Check results
-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.05
diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py
index c1c9bf191d5b5391774de966799b13b02d7419f1..48e947db5fa782d8a161410a66f80ee002e42770 100644
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -18,26 +18,24 @@ SCALE = [0.1, 2.1]
 
 def opcheck_int8_quant_static(output, input, scale, azp=None):
     if azp is None:
-        opcheck(torch.ops._C.static_scaled_int8_quant,
-                (output, input, scale, None))
+        opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale, None))
     else:
-        opcheck(torch.ops._C.static_scaled_int8_quant,
-                (output, input, scale, azp))
+        opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale, azp))
 
 
 def opcheck_int8_quant_dynamic(output, input, symmetric=True):
-    scale = torch.empty((input.numel() // input.shape[-1], 1),
-                        device=input.device,
-                        dtype=torch.float32)
+    scale = torch.empty(
+        (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32
+    )
     if symmetric:
-        opcheck(torch.ops._C.dynamic_scaled_int8_quant,
-                (output, input, scale, None))
+        opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale, None))
     else:
-        azp = torch.empty((input.numel() // input.shape[-1], 1),
-                          device=input.device,
-                          dtype=torch.int32)
-        opcheck(torch.ops._C.dynamic_scaled_int8_quant,
-                (output, input, scale, azp))
+        azp = torch.empty(
+            (input.numel() // input.shape[-1], 1),
+            device=input.device,
+            dtype=torch.int32,
+        )
+        opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale, azp))
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -45,8 +43,9 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
-def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
-                                   dtype: torch.dtype, seed: int) -> None:
+def test_dynamic_scaled_int8_quant(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
+) -> None:
     current_platform.seed_everything(seed)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -68,30 +67,31 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
-def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
-                                       dtype: torch.dtype, seed: int) -> None:
+def test_dynamic_scaled_int8_azp_quant(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
+) -> None:
     current_platform.seed_everything(seed)
     int8_traits = torch.iinfo(torch.int8)
 
-    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
-                   device="cuda") * 1000 - 300
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
 
     x_token_max, _ = x.to(dtype=torch.float32).max(dim=1, keepdim=True)
     x_token_min, _ = x.to(dtype=torch.float32).min(dim=1, keepdim=True)
 
     # calculate scale and azp, and adjust the range
     scales = (x_token_max - x_token_min) / torch.tensor(255.0)
-    azps = torch.round(torch.tensor(-128.0) - x_token_min / scales).to(
-        torch.int32)
+    azps = torch.round(torch.tensor(-128.0) - x_token_min / scales).to(torch.int32)
 
-    torch_out = ((x / scales).round() + azps).clamp(
-        int8_traits.min, int8_traits.max).to(torch.int8)
-    assert torch_out.min() >= int8_traits.min and torch_out.max(
-    ) <= int8_traits.max
+    torch_out = (
+        ((x / scales).round() + azps)
+        .clamp(int8_traits.min, int8_traits.max)
+        .to(torch.int8)
+    )
+    assert torch_out.min() >= int8_traits.min and torch_out.max() <= int8_traits.max
 
     ops_out, scales_out, azp_out = scaled_int8_quant(x, symmetric=False)
 
-    if (not torch.allclose(scales_out, scales)):
+    if not torch.allclose(scales_out, scales):
         print(torch.argmax(torch.abs(scales_out - scales)))
     torch.testing.assert_close(scales_out, scales)
     # big atol to account for rounding errors
@@ -108,17 +108,18 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("scale", SCALE)
 @torch.inference_mode()
-def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
-                                  dtype: torch.dtype, seed: int,
-                                  scale: float) -> None:
+def test_static_scaled_int8_quant(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
+) -> None:
     current_platform.seed_everything(seed)
     int8_traits = torch.iinfo(torch.int8)
 
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
     scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda")
 
-    out1 = (x / scale_arg).round().clamp(int8_traits.min,
-                                         int8_traits.max).to(torch.int8)
+    out1 = (
+        (x / scale_arg).round().clamp(int8_traits.min, int8_traits.max).to(torch.int8)
+    )
     out2, scale2, _ = scaled_int8_quant(x, scale_arg)
     assert scale2 is scale_arg
 
@@ -135,24 +136,28 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("scale", SCALE)
 @pytest.mark.parametrize("azp", [-255, 54])
 @torch.inference_mode()
-def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
-                                      dtype: torch.dtype, seed: int,
-                                      scale: float, azp: int) -> None:
+def test_static_scaled_int8_azp_quant(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    scale: float,
+    azp: int,
+) -> None:
     current_platform.seed_everything(seed)
     int8_traits = torch.iinfo(torch.int8)
 
-    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
-                   device="cuda") * 1000 - 300
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
 
-    out1 = ((x / scale).round() + azp).clamp(int8_traits.min,
-                                             int8_traits.max).to(torch.int8)
+    out1 = (
+        ((x / scale).round() + azp)
+        .clamp(int8_traits.min, int8_traits.max)
+        .to(torch.int8)
+    )
     scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda")
     azp_arg = torch.tensor([azp], dtype=torch.int32, device="cuda")
 
-    out2, scale2, azp2 = scaled_int8_quant(x,
-                                           scale_arg,
-                                           azp_arg,
-                                           symmetric=False)
+    out2, scale2, azp2 = scaled_int8_quant(x, scale_arg, azp_arg, symmetric=False)
     assert scale2 is scale_arg
     assert azp2 is azp_arg
 
@@ -172,10 +177,7 @@ def test_static_scaled_int8_azp_quant_saturating_cast(is_max: bool) -> None:
     int32_traits = torch.iinfo(torch.int32)
     val = float(int32_traits.max if is_max else int32_traits.min)
 
-    x_vals = [[
-        nextafter(val, inf), val + 1, val, val - 1,
-        nextafter(val, -inf)
-    ]]
+    x_vals = [[nextafter(val, inf), val + 1, val, val - 1, nextafter(val, -inf)]]
     x = torch.tensor(x_vals, dtype=torch.float32, device="cuda")
 
     # The calculation in the kernel is: cast<int8>(cast<int32>(x / scale) + azp)
diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py
index 50584f3f82d4ca903219ae3b993e1b9426249858..efa81de158d3847e2e09c2adc18c179096c980bc 100644
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -7,7 +7,6 @@ Run `pytest tests/kernels/quantization/test_machete_mm.py`.
 
 import math
 from dataclasses import dataclass, fields
-from typing import Optional
 
 import pytest
 import torch
@@ -15,15 +14,16 @@ import torch
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.machete_utils import (
-    query_machete_supported_group_sizes)
+    query_machete_supported_group_sizes,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    pack_rows, quantize_weights)
+    pack_rows,
+    quantize_weights,
+)
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
 #  unit tests to a common utility function. Currently the use of
@@ -49,11 +49,11 @@ MNK_SHAPES = [
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 @dataclass
@@ -62,39 +62,48 @@ class Tensors:
     a_ref: torch.Tensor
     a: torch.Tensor
     w_q: torch.Tensor
-    w_g_s: Optional[torch.Tensor]
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_s: torch.Tensor | None
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None
 
 
 # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
 #  Ch Scales Type, Tok Scales Type)
 # NOTE: None "Scale Type" means the act type is floating point
 #       None "Output Type" means the output type is the same as the act type
-TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
-                      Optional[torch.dtype], bool]
+TestTypeTuple = tuple[
+    list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool
+]
 TEST_TYPES = [
     # GPTQ style
-    *(TypeConfig(act_type=a_type,
-                 weight_type=w_type,
-                 output_type=None,
-                 group_scale_type=a_type,
-                 group_zero_type=None,
-                 channel_scale_type=None,
-                 token_scale_type=None)
-      for w_type in [scalar_types.uint4b8, scalar_types.uint8b128]
-      for a_type in [torch.float16, torch.bfloat16]),
+    *(
+        TypeConfig(
+            act_type=a_type,
+            weight_type=w_type,
+            output_type=None,
+            group_scale_type=a_type,
+            group_zero_type=None,
+            channel_scale_type=None,
+            token_scale_type=None,
+        )
+        for w_type in [scalar_types.uint4b8, scalar_types.uint8b128]
+        for a_type in [torch.float16, torch.bfloat16]
+    ),
     # AWQ style
-    *(TypeConfig(act_type=a_type,
-                 weight_type=w_type,
-                 output_type=None,
-                 group_scale_type=a_type,
-                 group_zero_type=a_type,
-                 channel_scale_type=None,
-                 token_scale_type=None)
-      for w_type in [scalar_types.uint4, scalar_types.uint8]
-      for a_type in [torch.float16, torch.bfloat16]),
+    *(
+        TypeConfig(
+            act_type=a_type,
+            weight_type=w_type,
+            output_type=None,
+            group_scale_type=a_type,
+            group_zero_type=a_type,
+            channel_scale_type=None,
+            token_scale_type=None,
+        )
+        for w_type in [scalar_types.uint4, scalar_types.uint8]
+        for a_type in [torch.float16, torch.bfloat16]
+    ),
     # # QQQ style
     # *(TypeConfig(act_type=torch.int8,
     #              weight_type=scalar_types.uint4b8,
@@ -129,21 +138,22 @@ def rand_data(shape, dtype=torch.float16, scale=1, offset=0):
         return torch.randint(-8, 7, shape, dtype=dtype, device="cuda")
 
 
-def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor):
+def maybe_convert_zeropoints(zps: torch.Tensor | None, s: torch.Tensor):
     return zps if zps is None else -1 * s * (zps.to(s.dtype))
 
 
-def group_size_valid(shape: tuple[int, int, int],
-                     group_size: Optional[int]) -> bool:
+def group_size_valid(shape: tuple[int, int, int], group_size: int | None) -> bool:
     return group_size is None or group_size == -1 or shape[2] % group_size == 0
 
 
-def machete_quantize_and_pack(atype: torch.dtype,
-                              w: torch.Tensor,
-                              wtype: ScalarType,
-                              stype: Optional[torch.dtype],
-                              group_size: Optional[int],
-                              zero_points: bool = False):
+def machete_quantize_and_pack(
+    atype: torch.dtype,
+    w: torch.Tensor,
+    wtype: ScalarType,
+    stype: torch.dtype | None,
+    group_size: int | None,
+    zero_points: bool = False,
+):
     assert wtype.is_integer(), "TODO: support floating point weights"
 
     w_ref, w_q, w_s, w_zp = quantize_weights(
@@ -152,7 +162,8 @@ def machete_quantize_and_pack(atype: torch.dtype,
         group_size=group_size,
         zero_points=zero_points,
         # to match how the kernel applies zps
-        ref_zero_points_after_scales=True)
+        ref_zero_points_after_scales=True,
+    )
 
     w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
     w_q = w_q.t().contiguous().t()  # convert to col major
@@ -163,15 +174,18 @@ def machete_quantize_and_pack(atype: torch.dtype,
     return w_ref, w_q_machete, w_s, w_zp
 
 
-def create_test_tensors(shape: tuple[int, int, int],
-                        types: TypeConfig,
-                        group_size: Optional[int],
-                        subset_stride_factor: Optional[int] = None) -> Tensors:
+def create_test_tensors(
+    shape: tuple[int, int, int],
+    types: TypeConfig,
+    group_size: int | None,
+    subset_stride_factor: int | None = None,
+) -> Tensors:
     m, n, k = shape
     factor = subset_stride_factor or 1
 
-    print("create_test_tensors, shape:", shape, "types:", types, "group_size:",
-          group_size)
+    print(
+        "create_test_tensors, shape:", shape, "types:", types, "group_size:", group_size
+    )
 
     a = rand_data((m * factor, k * factor), types.act_type, scale=3, offset=2)
     w = rand_data((k * factor, n * factor), types.act_type, scale=3, offset=1)
@@ -186,8 +200,13 @@ def create_test_tensors(shape: tuple[int, int, int],
         w = w.to(torch.float16)
 
     w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
-        a.dtype, w, types.weight_type, types.group_scale_type, group_size,
-        types.group_zero_type is not None)
+        a.dtype,
+        w,
+        types.weight_type,
+        types.group_scale_type,
+        group_size,
+        types.group_zero_type is not None,
+    )
 
     if not a.dtype.is_floating_point:
         aiinfo = torch.iinfo(a.dtype)
@@ -196,35 +215,47 @@ def create_test_tensors(shape: tuple[int, int, int],
     a_ref = a.to(torch.float32)
     w_ref = w_ref.to(torch.float32)
 
-    w_ch_s = None if types.channel_scale_type is None else\
-        rand_data((n,), types.channel_scale_type)
-    w_tok_s = None if types.token_scale_type is None else\
-        rand_data((m,), types.token_scale_type)
+    w_ch_s = (
+        None
+        if types.channel_scale_type is None
+        else rand_data((n,), types.channel_scale_type)
+    )
+    w_tok_s = (
+        None
+        if types.token_scale_type is None
+        else rand_data((m,), types.token_scale_type)
+    )
 
-    return Tensors(w_ref=w_ref,
-                   a_ref=a_ref,
-                   a=a,
-                   w_q=w_q_packed,
-                   w_g_s=w_s,
-                   w_g_zp=maybe_convert_zeropoints(w_zp, w_s),
-                   w_ch_s=w_ch_s,
-                   w_tok_s=w_tok_s)
+    return Tensors(
+        w_ref=w_ref,
+        a_ref=a_ref,
+        a=a,
+        w_q=w_q_packed,
+        w_g_s=w_s,
+        w_g_zp=maybe_convert_zeropoints(w_zp, w_s),
+        w_ch_s=w_ch_s,
+        w_tok_s=w_tok_s,
+    )
 
 
 # None stype means scales use the same dtype as a
-def machete_mm_test_helper(types: TypeConfig,
-                           tensors: Tensors,
-                           group_size: Optional[int] = None,
-                           schedule: Optional[str] = None):
+def machete_mm_test_helper(
+    types: TypeConfig,
+    tensors: Tensors,
+    group_size: int | None = None,
+    schedule: str | None = None,
+):
     output_ref = torch.matmul(tensors.a_ref, tensors.w_ref)
     output_ref_type = output_ref.dtype
 
     if tensors.w_ch_s is not None:
-        output_ref = (output_ref.to(tensors.w_ch_s.dtype) *
-                      tensors.w_ch_s.unsqueeze(0)).to(output_ref_type)
+        output_ref = (
+            output_ref.to(tensors.w_ch_s.dtype) * tensors.w_ch_s.unsqueeze(0)
+        ).to(output_ref_type)
     if tensors.w_tok_s is not None:
-        output_ref = (output_ref.to(tensors.w_tok_s.dtype) *
-                      tensors.w_tok_s.unsqueeze(1)).to(output_ref_type)
+        output_ref = (
+            output_ref.to(tensors.w_tok_s.dtype) * tensors.w_tok_s.unsqueeze(1)
+        ).to(output_ref_type)
 
     output = ops.machete_mm(
         a=tensors.a,
@@ -245,24 +276,24 @@ def machete_mm_test_helper(types: TypeConfig,
     # Relax atol as our reduction dim becomes larger (more rounding error)
     # Relax atol when we have zeropoints since the way machete applies
     #  zeropoints (after scales) causes noise around 0
-    atol = 1 if tensors.w_g_zp is not None\
+    atol = (
+        1
+        if tensors.w_g_zp is not None
         else min(5e-2 * math.sqrt(tensors.a.shape[1]), 1)
+    )
     rtol = 1e-1 if tensors.a.element_size() >= 2 else 2e-1
-    torch.testing.assert_close(output,
-                               output_ref.to(output.dtype),
-                               rtol=rtol,
-                               atol=atol)
+    torch.testing.assert_close(
+        output, output_ref.to(output.dtype), rtol=rtol, atol=atol
+    )
 
 
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-@pytest.mark.parametrize("shape",
-                         MNK_SHAPES,
-                         ids=lambda x: "x".join(str(v) for v in x))
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type."
+)
+@pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x))
 @pytest.mark.parametrize("types", TEST_TYPES)
 def test_machete_all_schedules(shape, types: TypeConfig):
-
-    group_sizes: list[Optional[int]] = []
+    group_sizes: list[int | None] = []
     if types.group_scale_type is None:
         group_sizes = [None]
     else:
@@ -275,23 +306,23 @@ def test_machete_all_schedules(shape, types: TypeConfig):
         tensors = create_test_tensors(shape, types, group_size)
         print(f"MNK = {shape}")
         for schedule in ops.machete_supported_schedules(
-                types.act_type,
-                types.weight_type,
-                group_scales_type=types.group_scale_type,
-                group_zeros_type=types.group_scale_type,
-                out_type=types.output_type):
+            types.act_type,
+            types.weight_type,
+            group_scales_type=types.group_scale_type,
+            group_zeros_type=types.group_scale_type,
+            out_type=types.output_type,
+        ):
             print(f"Testing schedule {schedule}")
             machete_mm_test_helper(types, tensors, group_size, schedule)
 
 
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-@pytest.mark.parametrize("shape",
-                         MNK_SHAPES,
-                         ids=lambda x: "x".join(str(v) for v in x))
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type."
+)
+@pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x))
 @pytest.mark.parametrize("types", TEST_TYPES)
 def test_machete_heuristic(shape, types: TypeConfig):
-    group_sizes: list[Optional[int]] = []
+    group_sizes: list[int | None] = []
     if types.group_scale_type is None:
         group_sizes = [None]
     else:
@@ -306,19 +337,22 @@ def test_machete_heuristic(shape, types: TypeConfig):
 
 
 # Test working on other devices
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type."
+)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_machete_devices(device: str):
     group_size = 128
 
-    type_config = TypeConfig(act_type=torch.float16,
-                             weight_type=scalar_types.uint4b8,
-                             output_type=None,
-                             group_scale_type=torch.float16,
-                             group_zero_type=None,
-                             channel_scale_type=None,
-                             token_scale_type=None)
+    type_config = TypeConfig(
+        act_type=torch.float16,
+        weight_type=scalar_types.uint4b8,
+        output_type=None,
+        group_scale_type=torch.float16,
+        group_zero_type=None,
+        channel_scale_type=None,
+        token_scale_type=None,
+    )
 
     tensors = create_test_tensors((512, 4096, 4096), type_config, group_size)
 
@@ -331,29 +365,30 @@ def test_machete_devices(device: str):
 
 
 # Test working with a subset of A and B
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type."
+)
 def test_machete_subset():
     group_size = 128
 
-    type_config = TypeConfig(act_type=torch.float16,
-                             weight_type=scalar_types.uint4b8,
-                             output_type=None,
-                             group_scale_type=torch.float16,
-                             group_zero_type=None,
-                             channel_scale_type=None,
-                             token_scale_type=None)
-
-    tensors = create_test_tensors((512, 4096, 4096),
-                                  type_config,
-                                  group_size,
-                                  subset_stride_factor=2)
+    type_config = TypeConfig(
+        act_type=torch.float16,
+        weight_type=scalar_types.uint4b8,
+        output_type=None,
+        group_scale_type=torch.float16,
+        group_zero_type=None,
+        channel_scale_type=None,
+        token_scale_type=None,
+    )
+
+    tensors = create_test_tensors(
+        (512, 4096, 4096), type_config, group_size, subset_stride_factor=2
+    )
     machete_mm_test_helper(type_config, tensors, group_size)
 
 
 # Test to make sure cuda graphs work
 class MacheteLayer(torch.nn.Module):
-
     def __init__(self, **kwargs):
         super().__init__()
         self.kwargs = kwargs
@@ -362,8 +397,9 @@ class MacheteLayer(torch.nn.Module):
         return ops.machete_mm(a=a, **self.kwargs)
 
 
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type."
+)
 def test_machete_cuda_graph():
     m, n, k = 512, 4096, 4096
 
@@ -375,7 +411,8 @@ def test_machete_cuda_graph():
     zero_points = False
 
     w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
-        a.dtype, b, wtype, stype, group_size, zero_points)
+        a.dtype, b, wtype, stype, group_size, zero_points
+    )
 
     # Construct a trivial model with a single layer that calls a machete kernel
     model = MacheteLayer(
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py
index 0be020085bfa42698d4090f3cb6b982c39b515e2..0833115fcf30190e4c2be73b2bdf202393e1c248 100644
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -4,6 +4,7 @@
 
 Run `pytest tests/kernels/quantization/test_marlin_gemm.py`.
 """
+
 import pytest
 import torch
 
@@ -11,24 +12,44 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
-    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
-    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
+    GPTQ_MARLIN_24_MAX_PARALLEL,
+    GPTQ_MARLIN_24_MIN_THREAD_N,
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
+    GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
-    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
-    query_marlin_supported_quant_types)
+    MARLIN_SUPPORTED_GROUP_SIZES,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace_new,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    query_marlin_supported_quant_types,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    FP4_MARLIN_SUPPORTED_GROUP_SIZES, rand_marlin_weight_mxfp4_like,
-    rand_marlin_weight_nvfp4_like)
+    FP4_MARLIN_SUPPORTED_GROUP_SIZES,
+    rand_marlin_weight_mxfp4_like,
+    rand_marlin_weight_nvfp4_like,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    marlin_quant_fp8_torch)
+    marlin_quant_fp8_torch,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
-    MarlinWorkspace, awq_marlin_quantize, get_weight_perm, marlin_quantize,
-    marlin_weights)
+    MarlinWorkspace,
+    awq_marlin_quantize,
+    get_weight_perm,
+    marlin_quantize,
+    marlin_weights,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
-    marlin_24_quantize)
+    marlin_24_quantize,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
+    awq_pack,
+    gptq_pack,
+    gptq_quantize_weights,
+    quantize_weights,
+    sort_weights,
+)
 from vllm.scalar_type import scalar_types
 
 ACT_ORDER_OPTS = [False, True]
@@ -56,24 +77,27 @@ DTYPES = [torch.float16, torch.bfloat16]
 
 def compute_max_diff(output, output_ref):
     return torch.mean(torch.abs(output - output_ref)) / torch.mean(
-        torch.abs(output_ref))
+        torch.abs(output_ref)
+    )
 
 
 def rand_data(shape, dtype=torch.float16):
     return torch.randn(shape, dtype=dtype, device="cuda")
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="Marlin is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
-@pytest.mark.parametrize("quant_type",
-                         query_marlin_supported_quant_types(False, False))
+@pytest.mark.parametrize("quant_type", query_marlin_supported_quant_types(False, False))
 @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
-                            act_order, mnk_factors):
+def test_gptq_marlin_repack(
+    k_chunk, n_chunk, quant_type, group_size, act_order, mnk_factors
+):
     m_factor, n_factor, k_factor = mnk_factors
 
     size_k = k_chunk * k_factor
@@ -96,7 +120,8 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
 
     # Quantize (and apply act_order if provided)
     w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
-        b_weight, quant_type, group_size, act_order)
+        b_weight, quant_type, group_size, act_order
+    )
 
     # Pack to GPTQ format
     q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
@@ -109,11 +134,14 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
 
     # Pack to Marlin format
     weight_perm = get_weight_perm(quant_type.size_bits)
-    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
-                                  weight_perm)
+    marlin_q_w_1 = marlin_weights(
+        q_w, size_k, size_n, quant_type.size_bits, weight_perm
+    )
 
-    opcheck(torch.ops._C.gptq_marlin_repack,
-            (q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
+    opcheck(
+        torch.ops._C.gptq_marlin_repack,
+        (q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits),
+    )
 
     # Run Marlin repack GPU kernel
     marlin_q_w_2 = ops.gptq_marlin_repack(
@@ -128,16 +156,16 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="Marlin is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
-@pytest.mark.parametrize("quant_type",
-                         query_marlin_supported_quant_types(True))
+@pytest.mark.parametrize("quant_type", query_marlin_supported_quant_types(True))
 @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
-                           mnk_factors):
+def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, mnk_factors):
     m_factor, n_factor, k_factor = mnk_factors
 
     size_k = k_chunk * k_factor
@@ -152,21 +180,22 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     b_weight = rand_data((size_k, size_n))
 
     # Quantize
-    w_ref, q_w, s, zp = quantize_weights(b_weight,
-                                         quant_type,
-                                         group_size,
-                                         zero_points=True)
+    w_ref, q_w, s, zp = quantize_weights(
+        b_weight, quant_type, group_size, zero_points=True
+    )
 
     # Pack to AWQ format
     q_w_awq = awq_pack(q_w, quant_type.size_bits, size_k, size_n)
 
     # Pack to Marlin format
     weight_perm = get_weight_perm(quant_type.size_bits)
-    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
-                                  weight_perm)
+    marlin_q_w_1 = marlin_weights(
+        q_w, size_k, size_n, quant_type.size_bits, weight_perm
+    )
 
-    opcheck(torch.ops._C.awq_marlin_repack,
-            (q_w_awq, size_k, size_n, quant_type.size_bits))
+    opcheck(
+        torch.ops._C.awq_marlin_repack, (q_w_awq, size_k, size_n, quant_type.size_bits)
+    )
 
     # Run Marlin repack GPU kernel
     marlin_q_w_2 = ops.awq_marlin_repack(
@@ -180,23 +209,34 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="Marlin is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
 @pytest.mark.parametrize("quant_type", query_marlin_supported_quant_types())
 @pytest.mark.parametrize(
-    "group_size",
-    set(MARLIN_SUPPORTED_GROUP_SIZES + FP4_MARLIN_SUPPORTED_GROUP_SIZES))
+    "group_size", set(MARLIN_SUPPORTED_GROUP_SIZES + FP4_MARLIN_SUPPORTED_GROUP_SIZES)
+)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
 @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS)
 @pytest.mark.parametrize("is_k_full", K_FULL_OPTS)
 @pytest.mark.parametrize("use_atomic_add", USE_ATOMIC_ADD_OPTS)
 @pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS)
 @pytest.mark.parametrize("dtype", DTYPES)
-def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size,
-                          mnk_factors, act_order, is_k_full, use_atomic_add,
-                          use_fp32_reduce, dtype):
+def test_gptq_marlin_gemm(
+    k_chunk,
+    n_chunk,
+    quant_type,
+    group_size,
+    mnk_factors,
+    act_order,
+    is_k_full,
+    use_atomic_add,
+    use_fp32_reduce,
+    dtype,
+):
     m_factor, n_factor, k_factor = mnk_factors
     has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
 
@@ -225,11 +265,13 @@ def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size,
             return
 
         if group_size == 16:
-            w_ref, marlin_q_w, marlin_s, marlin_s2 = \
-                rand_marlin_weight_nvfp4_like(b_weight.T, group_size)
+            w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_nvfp4_like(
+                b_weight.T, group_size
+            )
         else:
-            w_ref, marlin_q_w, marlin_s = \
-                rand_marlin_weight_mxfp4_like(b_weight.T, group_size)
+            w_ref, marlin_q_w, marlin_s = rand_marlin_weight_mxfp4_like(
+                b_weight.T, group_size
+            )
             marlin_s2 = None
 
         g_idx = None
@@ -240,8 +282,7 @@ def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size,
             return
         if act_order:
             return
-        w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(
-            b_weight.T, group_size)
+        w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b_weight.T, group_size)
         g_idx = None
         sort_indices = None
         marlin_zp = None
@@ -250,7 +291,8 @@ def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size,
         if group_size == 16:
             return
         w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
-            b_weight, quant_type, group_size)
+            b_weight, quant_type, group_size
+        )
         g_idx = None
         sort_indices = None
         marlin_s2 = None
@@ -258,18 +300,37 @@ def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size,
         if group_size == 16:
             return
         w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
-            b_weight, quant_type, group_size, act_order)
+            b_weight, quant_type, group_size, act_order
+        )
         marlin_zp = None
         marlin_s2 = None
 
     workspace = marlin_make_workspace_new(w_ref.device)
 
-    opcheck(torch.ops._C.gptq_marlin_gemm,
-            (a_input, None, marlin_q_w, None, marlin_s, marlin_s2, marlin_zp,
-             g_idx, sort_indices, workspace, quant_type.id, a_input.shape[0],
-             b_weight.shape[1], a_input.shape[1], is_k_full, use_atomic_add,
-             use_fp32_reduce, False),
-            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
+    opcheck(
+        torch.ops._C.gptq_marlin_gemm,
+        (
+            a_input,
+            None,
+            marlin_q_w,
+            None,
+            marlin_s,
+            marlin_s2,
+            marlin_zp,
+            g_idx,
+            sort_indices,
+            workspace,
+            quant_type.id,
+            a_input.shape[0],
+            b_weight.shape[1],
+            a_input.shape[1],
+            is_k_full,
+            use_atomic_add,
+            use_fp32_reduce,
+            False,
+        ),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
 
     output = ops.gptq_marlin_gemm(
         a_input,
@@ -302,23 +363,40 @@ def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size,
 
 # TODO: find better way to test this?
 @torch.compile(fullgraph=True)
-def marlin_24_gemm_tester(a_input, marlin_24_q_w_comp, marlin_24_meta,
-                          marlin_24_s, scratch, quant_type, size_m, size_n,
-                          size_k):
-    return ops.gptq_marlin_24_gemm(a_input, marlin_24_q_w_comp, marlin_24_meta,
-                                   marlin_24_s, scratch, quant_type, size_m,
-                                   size_n, size_k)
+def marlin_24_gemm_tester(
+    a_input,
+    marlin_24_q_w_comp,
+    marlin_24_meta,
+    marlin_24_s,
+    scratch,
+    quant_type,
+    size_m,
+    size_n,
+    size_k,
+):
+    return ops.gptq_marlin_24_gemm(
+        a_input,
+        marlin_24_q_w_comp,
+        marlin_24_meta,
+        marlin_24_s,
+        scratch,
+        quant_type,
+        size_m,
+        size_n,
+        size_k,
+    )
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="Marlin is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)
 @pytest.mark.parametrize("quant_type", GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
 @pytest.mark.parametrize("group_size", GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
-                             mnk_factors):
+def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size, mnk_factors):
     m_factor, n_factor, k_factor = mnk_factors
 
     size_m = m_factor
@@ -328,19 +406,31 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
     a_input = rand_data((size_m, size_k))
     b_weight = rand_data((size_k, size_n))
 
-    (w_24_ref, marlin_24_q_w_comp, marlin_24_meta,
-     marlin_24_s) = marlin_24_quantize(b_weight, quant_type, group_size)
+    (w_24_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = marlin_24_quantize(
+        b_weight, quant_type, group_size
+    )
 
-    workspace_24 = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
-                                   GPTQ_MARLIN_24_MAX_PARALLEL)
+    workspace_24 = MarlinWorkspace(
+        size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
+    )
 
     output_ref = torch.matmul(a_input, w_24_ref)
 
-    opcheck(torch.ops._C.gptq_marlin_24_gemm,
-            (a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
-             workspace_24.scratch, quant_type.id, a_input.shape[0],
-             b_weight.shape[1], a_input.shape[1]),
-            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
+    opcheck(
+        torch.ops._C.gptq_marlin_24_gemm,
+        (
+            a_input,
+            marlin_24_q_w_comp,
+            marlin_24_meta,
+            marlin_24_s,
+            workspace_24.scratch,
+            quant_type.id,
+            a_input.shape[0],
+            b_weight.shape[1],
+            a_input.shape[1],
+        ),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
 
     output = marlin_24_gemm_tester(
         a_input,
@@ -361,8 +451,10 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
     assert max_diff < 0.04
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="Marlin is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
 @pytest.mark.parametrize("group_size", HQQ_SUPPORTED_GROUP_SIZES)
@@ -386,22 +478,22 @@ def test_hqq_marlin_gemm(
     a_input = rand_data((size_m, size_k))
     dev = a_input.device
 
-    b_weight = torch.randint(0,
-                             10, (size_n, size_k),
-                             dtype=torch.uint8,
-                             device=dev)
+    b_weight = torch.randint(0, 10, (size_n, size_k), dtype=torch.uint8, device=dev)
     scale = rand_data((size_n, size_k // group_size))
     zero = rand_data((size_n, size_k // group_size))
 
     gptq_w_q = gptq_pack(b_weight.transpose(1, 0), 4, size_k, size_n)
 
     sort_indices = torch.empty(0, dtype=torch.int, device=dev)
-    marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n,
-                                        4).to(dev)
-    marlin_s = marlin_permute_scales(scale.transpose(1, 0), size_k, size_n,
-                                     group_size).to(dev)
-    marlin_zp = marlin_permute_scales(zero.transpose(1, 0), size_k, size_n,
-                                      group_size).to(dev)
+    marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n, 4).to(
+        dev
+    )
+    marlin_s = marlin_permute_scales(
+        scale.transpose(1, 0), size_k, size_n, group_size
+    ).to(dev)
+    marlin_zp = marlin_permute_scales(
+        zero.transpose(1, 0), size_k, size_n, group_size
+    ).to(dev)
 
     g_idx = marlin_make_empty_g_idx(dev)
     g_idx_sort_indices = marlin_make_empty_g_idx(dev)
@@ -433,8 +525,7 @@ def test_hqq_marlin_gemm(
     s_flat = scale.reshape(-1, 1)
     dequant = (b_flat - zp_flat) * s_flat
 
-    output_ref = torch.matmul(a_input,
-                              dequant.reshape(b_weight.shape).transpose(1, 0))
+    output_ref = torch.matmul(a_input, dequant.reshape(b_weight.shape).transpose(1, 0))
 
     torch.cuda.synchronize()
 
@@ -451,11 +542,12 @@ def test_marlin_gemm_subset_input():
     big_m = size_m * 2
     big_k = size_k * 2
 
-    a_input = rand_data((big_m, big_k))[8:size_m + 8, 8:size_k + 8]
+    a_input = rand_data((big_m, big_k))[8 : size_m + 8, 8 : size_k + 8]
     b_weight = rand_data((size_k, size_n))
 
     w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
-        b_weight, quant_type, group_size, False)
+        b_weight, quant_type, group_size, False
+    )
 
     marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
     workspace = marlin_make_workspace_new(a_input.device)
@@ -497,12 +589,13 @@ def test_marlin_gemm_with_bias(size_m):
     size_k, size_n = 1024, 2048
     a_input = rand_data((size_m, size_k))
     b_weight = rand_data((size_k, size_n))
-    b_bias = rand_data((size_n, )) * 10
+    b_bias = rand_data((size_n,)) * 10
 
     marlin_bias = marlin_permute_bias(b_bias)
 
     w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
-        b_weight, quant_type, group_size, False)
+        b_weight, quant_type, group_size, False
+    )
 
     marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
     workspace = marlin_make_workspace_new(a_input.device)
diff --git a/tests/kernels/quantization/test_mxfp4_qutlass.py b/tests/kernels/quantization/test_mxfp4_qutlass.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bacbef2046b445b687718a04b09e39061d0a1fc
--- /dev/null
+++ b/tests/kernels/quantization/test_mxfp4_qutlass.py
@@ -0,0 +1,303 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import numpy as np
+import pytest
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.platforms import current_platform
+
+if not torch.cuda.is_available():
+    pytest.skip("CUDA required for these tests.", allow_module_level=True)
+
+if not (
+    current_platform.has_device_capability(100)
+    or current_platform.has_device_capability(120)
+):
+    pytest.skip(
+        reason="Tests require compute capability 10.0 (100) or 12.0 (120).",
+        allow_module_level=True,
+    )
+
+
+# ----- Helpers -----
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _rtne_fp4(x: torch.Tensor):
+    device = x.device
+    grid = torch.tensor(
+        [
+            -6.0,
+            -4.0,
+            -3.0,
+            -2.0,
+            -1.5,
+            -1.0,
+            -0.5,
+            -0.0,
+            0.0,
+            0.5,
+            1.0,
+            1.5,
+            2.0,
+            3.0,
+            4.0,
+            6.0,
+        ],
+        dtype=x.dtype,
+        device=x.device,
+    )
+    grid_int = torch.tensor(
+        [-1, -2, -3, -4, -5, -6, -7, -8, 0, 1, 2, 3, 4, 5, 6, 7],
+        dtype=torch.uint8,
+        device=device,
+    )
+    inds = torch.bucketize(x, grid)
+    lo, hi = (inds - 1).clamp(min=0, max=15), inds.clamp(min=0, max=15)
+    g_lo, g_hi = grid[lo], grid[hi]
+    pick_hi = (g_hi - x < x - g_lo) | (g_hi - x == x - g_lo) & (grid_int[hi] % 2 == 0)
+    y = torch.where(pick_hi, g_hi, g_lo)
+    y_int = torch.where(pick_hi, grid_int[hi], grid_int[lo])
+    y_int_packed = (y_int[..., 1::2] & 0xF) << 4 | y_int[..., ::2] & 0xF
+    return y, y_int_packed
+
+
+def _dq_fp4(x_e2m1: torch.Tensor, x_e8m0: torch.Tensor, alpha: float):
+    device = x_e2m1.device
+
+    x_e2m1_i32 = x_e2m1.view(dtype=torch.uint8).to(dtype=torch.int32)
+    x_e2m1_unpacked = torch.stack(
+        [x_e2m1_i32 & 0xF, (x_e2m1_i32 >> 4) & 0xF], dim=-1
+    ).flatten(start_dim=-2)
+
+    grid_dq = torch.tensor(
+        [
+            0.0,
+            0.5,
+            1.0,
+            1.5,
+            2.0,
+            3.0,
+            4.0,
+            6.0,
+            -0.0,
+            -0.5,
+            -1.0,
+            -1.5,
+            -2.0,
+            -3.0,
+            -4.0,
+            -6.0,
+        ],
+        dtype=torch.float64,
+        device=device,
+    )
+    x_fp4_dq = grid_dq[x_e2m1_unpacked]
+    scales_dq = x_e8m0.to(torch.float64)
+
+    x_dq = (x_fp4_dq.unflatten(dim=-1, sizes=(-1, 32)) * scales_dq[..., None]).flatten(
+        start_dim=-2
+    ) / alpha
+    return x_dq, x_fp4_dq, scales_dq
+
+
+def _unpack_mask(clip_mask: torch.Tensor) -> torch.Tensor:
+    clip_mask_unpacked_dq = torch.zeros(
+        *clip_mask.shape[:-1],
+        clip_mask.size(-1) * 8,
+        dtype=torch.bool,
+        device=clip_mask.device,
+    )
+    for i in range(8):
+        clip_mask_unpacked_dq[..., i::8] = (clip_mask >> i) & 1
+    return clip_mask_unpacked_dq
+
+
+def _forward_quantize_ref(
+    x: torch.Tensor, h: torch.Tensor, rot_size: int, quest: bool = True
+):
+    device = x.device
+    xh_ref64 = (
+        x.unflatten(dim=-1, sizes=(-1, rot_size)).to(dtype=torch.float64)
+        @ h.reshape(rot_size, rot_size).to(dtype=torch.float64)
+    ).flatten(start_dim=-2)
+
+    if quest:
+        scales_ref64_ = (
+            xh_ref64.unflatten(dim=-1, sizes=(-1, 32)).std(dim=-1, correction=0)
+            * (2.92247856 / 6.0)
+            + 1e-8
+        )
+    else:
+        abs_max = xh_ref64.unflatten(dim=-1, sizes=(-1, 32)).abs().amax(dim=-1)
+        scales_ref64_ = abs_max + 1e-8
+
+    xh_e8m0_ref = scales_ref64_.log2().floor().exp2().to(dtype=torch.float8_e8m0fnu)
+    scales_ref64 = xh_e8m0_ref.to(dtype=torch.float64)
+
+    xh_scaled_ref64 = (
+        xh_ref64.unflatten(dim=-1, sizes=(-1, 32)) / scales_ref64[..., None]
+    ).flatten(start_dim=-2)
+    if not quest:
+        xh_scaled_ref64 *= 3
+
+    clip_mask_unpacked_ref = xh_scaled_ref64.abs() < 6.0
+    clip_mask_ref = torch.zeros(
+        *x.shape[:-1], x.size(-1) // 8, dtype=torch.uint8, device=device
+    )
+    for i in range(8):
+        clip_mask_ref |= clip_mask_unpacked_ref[..., i::8].to(dtype=torch.uint8) << i
+
+    xh_fp4_ref, xh_e2m1_ref = _rtne_fp4(xh_scaled_ref64)
+    xh_dq, xh_fp4_dq, scales_dq = _dq_fp4(
+        xh_e2m1_ref, xh_e8m0_ref, alpha=1.0 if quest else 3.0
+    )
+    clip_mask_unpacked_dq = _unpack_mask(clip_mask_ref)
+
+    assert xh_fp4_dq.equal(xh_fp4_ref)
+    assert scales_dq.equal(scales_ref64)
+    assert clip_mask_unpacked_dq.equal(clip_mask_unpacked_ref)
+
+    return (
+        xh_dq,
+        clip_mask_unpacked_ref,
+        (xh_e2m1_ref, xh_e8m0_ref, clip_mask_ref),
+    )
+
+
+DTYPE = torch.bfloat16
+DEVICE = torch.device("cuda:0")
+
+ROT_SIZES = [32, 64, 128]
+SEEDS = [0]
+BATCHES = [1, 16]
+
+LLAMA_MODELS = {
+    "7B": [(4096, 3 * 4096), (4096, 4096), (4096, 2 * 10752), (10752, 4096)],
+    "13B": [(5120, 3 * 5120), (5120, 5120), (5120, 2 * 13568), (13568, 5120)],
+    "33B": [(6656, 3 * 6656), (6656, 6656), (6656, 2 * 17664), (17664, 6656)],
+    "70B": [(8192, 3 * 8192), (8192, 8192), (8192, 2 * 21760), (21760, 8192)],
+}
+
+
+@pytest.fixture(autouse=True)
+def _seed_each_test():
+    current_platform.seed_everything(0)
+    np.random.seed(0)
+    torch.random.manual_seed(0)
+
+
+@pytest.mark.parametrize("rot_size", ROT_SIZES)
+@torch.inference_mode()
+def test_fused_quantization_absmax(rot_size: int):
+    dtype, device = DTYPE, DEVICE
+    h = get_hadamard_matrix(rot_size, dtype, device)
+    x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
+
+    xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size, quest=False)
+    xh_e2m1, xh_e8m0 = fusedQuantizeMx(x, h, method="abs_max")
+    xh_e8m0 = xh_e8m0.reshape(2, 4096, 4096 // 32)
+    xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e8m0, alpha=3.0)
+
+    torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
+    assert (xh_dq != xh_dq_ref).float().mean() <= 1e-4
+
+    m, n, k = 1, 504, 4096
+    a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="abs_max")
+    b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="abs_max")
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e8m0, backend="triton")
+    b_scale_block = to_blocked(b_e8m0, backend="triton")
+    alpha = torch.tensor([1.0], device=device)
+    out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block, alpha)
+    assert out.equal(out_ref.to(dtype=out.dtype))
+
+
+@pytest.mark.parametrize("rot_size", ROT_SIZES)
+@torch.inference_mode()
+def test_fused_quantization_quest(rot_size: int):
+    dtype, device = DTYPE, DEVICE
+    h = get_hadamard_matrix(rot_size, dtype, device)
+    x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
+
+    xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size, quest=True)
+    xh_e2m1, xh_e8m0 = fusedQuantizeMx(x, h, method="quest")
+    xh_e8m0 = xh_e8m0.reshape(2, 4096, 4096 // 32)
+    xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e8m0, alpha=1.0)
+
+    torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
+    assert (xh_dq != xh_dq_ref).float().mean() <= 1e-4
+
+    m, n, k = 504, 504, 2048
+    a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="quest")
+    b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="quest")
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e8m0, backend="triton")
+    b_scale_block = to_blocked(b_e8m0, backend="triton")
+    alpha = torch.tensor([1.0], device=device)
+    out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block, alpha)
+    assert out.equal(out_ref.to(dtype=out.dtype))
+
+
+@pytest.mark.parametrize("model", list(LLAMA_MODELS.keys()))
+@pytest.mark.parametrize("layer_idx", [0, 1, 2, 3])
+@pytest.mark.parametrize("batch", [1, 16])
+@pytest.mark.parametrize("had_size", ROT_SIZES)
+@torch.inference_mode()
+def test_llama_shapes(model: str, layer_idx: int, batch: int, had_size: int):
+    dtype, device = DTYPE, DEVICE
+    m = batch
+    k, n = LLAMA_MODELS[model][layer_idx]
+
+    h = get_hadamard_matrix(had_size, dtype, device)
+
+    a = torch.rand(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.rand(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="quest")
+    b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="quest")
+
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e8m0, backend="triton")
+    b_scale_block = to_blocked(b_e8m0, backend="triton")
+    alpha = torch.tensor([1.0], device=device)
+    out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block, alpha)
+    assert out.equal(out_ref.to(dtype=out.dtype))
diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py
index 3a8f4c17598c2f84842c0d5e2d42468eb5f3c222..12f1008ecf27ffb294f50c49c34325182a29d080 100644
--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
@@ -8,15 +8,27 @@ from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
 if not current_platform.has_device_capability(100):
-    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
-                allow_module_level=True)
+    pytest.skip(
+        reason="Nvfp4 Requires compute capability of 10 or above.",
+        allow_module_level=True,
+    )
 
 DTYPES = [torch.float16, torch.bfloat16]
 SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
-PAD_SHAPES = [(90, 64), (150, 64), (128, 48), (128, 80), (150, 80), (90, 48),
-              (90, 128), (150, 128), (150, 48), (90, 80)]
+PAD_SHAPES = [
+    (90, 64),
+    (150, 64),
+    (128, 48),
+    (128, 80),
+    (150, 80),
+    (90, 48),
+    (90, 128),
+    (150, 128),
+    (150, 48),
+    (90, 80),
+]
 SEEDS = [42]
-CUDA_DEVICES = ['cuda:0']
+CUDA_DEVICES = ["cuda:0"]
 
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
@@ -31,7 +43,22 @@ FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 # 0001 -> 0.5
 # 0000 -> 0
 E2M1_TO_FLOAT32 = [
-    0., 0.5, 1., 1.5, 2., 3., 4., 6., 0., -0.5, -1., -1.5, -2., -3., -4., -6.
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+    0.0,
+    -0.5,
+    -1.0,
+    -1.5,
+    -2.0,
+    -3.0,
+    -4.0,
+    -6.0,
 ]
 BLOCK_SIZE = 16
 
@@ -74,8 +101,7 @@ def ref_nvfp4_quant(x, global_scale):
     assert x.ndim == 2
     m, n = x.shape
     x = torch.reshape(x, (m, n // BLOCK_SIZE, BLOCK_SIZE))
-    vec_max = torch.max(torch.abs(x), dim=-1,
-                        keepdim=True)[0].to(torch.float32)
+    vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0].to(torch.float32)
     scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
     scale = scale.to(torch.float8_e4m3fn).to(torch.float32)
     output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
@@ -131,7 +157,7 @@ def test_quantize_to_fp4(
 def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     dtype = torch.float16
     current_platform.seed_everything(42)
-    torch.set_default_device('cuda:0')
+    torch.set_default_device("cuda:0")
 
     m, n = pad_shape
 
@@ -142,9 +168,7 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
 
     out, out_scale = ops.scaled_fp4_quant(x, global_scale)
-
     scale_ans = recover_swizzled_scales(out_scale, m, n)
     out_ans = cast_from_fp4(out, m, n)
-
     torch.testing.assert_close(out_ans, out_ref)
     torch.testing.assert_close(scale_ans, scale_ref)
diff --git a/tests/kernels/quantization/test_nvfp4_qutlass.py b/tests/kernels/quantization/test_nvfp4_qutlass.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824a080f5047abfb6b87c108a5465c3833f9edb
--- /dev/null
+++ b/tests/kernels/quantization/test_nvfp4_qutlass.py
@@ -0,0 +1,268 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import numpy as np
+import pytest
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+
+from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.platforms import current_platform
+
+if not torch.cuda.is_available():
+    pytest.skip("CUDA required for these tests.", allow_module_level=True)
+
+if not (
+    current_platform.has_device_capability(100)
+    or current_platform.has_device_capability(120)
+):
+    pytest.skip(
+        reason="Tests require compute capability 10.0 (100) or 12.0 (120).",
+        allow_module_level=True,
+    )
+
+
+# ----- Helpers -----
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _rtne_fp4(x: torch.Tensor):
+    device = x.device
+    grid = torch.tensor(
+        [
+            -6.0,
+            -4.0,
+            -3.0,
+            -2.0,
+            -1.5,
+            -1.0,
+            -0.5,
+            -0.0,
+            0.0,
+            0.5,
+            1.0,
+            1.5,
+            2.0,
+            3.0,
+            4.0,
+            6.0,
+        ],
+        dtype=x.dtype,
+        device=x.device,
+    )
+    grid_int = torch.tensor(
+        [-1, -2, -3, -4, -5, -6, -7, -8, 0, 1, 2, 3, 4, 5, 6, 7],
+        dtype=torch.uint8,
+        device=device,
+    )
+    inds = torch.bucketize(x, grid)
+    lo, hi = (inds - 1).clamp(min=0, max=15), inds.clamp(min=0, max=15)
+    g_lo, g_hi = grid[lo], grid[hi]
+    pick_hi = (g_hi - x < x - g_lo) | (g_hi - x == x - g_lo) & (grid_int[hi] % 2 == 0)
+    y = torch.where(pick_hi, g_hi, g_lo)
+    y_int = torch.where(pick_hi, grid_int[hi], grid_int[lo])
+    y_int_packed = (y_int[..., 1::2] & 0xF) << 4 | y_int[..., ::2] & 0xF
+    return y, y_int_packed
+
+
+def _dq_fp4(x_e2m1: torch.Tensor, x_e4m3: torch.Tensor, alpha: float):
+    device = x_e2m1.device
+
+    x_e2m1_i32 = x_e2m1.view(dtype=torch.uint8).to(dtype=torch.int32)
+    x_e2m1_unpacked = torch.stack(
+        [x_e2m1_i32 & 0xF, (x_e2m1_i32 >> 4) & 0xF], dim=-1
+    ).flatten(start_dim=-2)
+
+    grid_dq = torch.tensor(
+        [
+            0.0,
+            0.5,
+            1.0,
+            1.5,
+            2.0,
+            3.0,
+            4.0,
+            6.0,
+            -0.0,
+            -0.5,
+            -1.0,
+            -1.5,
+            -2.0,
+            -3.0,
+            -4.0,
+            -6.0,
+        ],
+        dtype=torch.float64,
+        device=device,
+    )
+    x_fp4_dq = grid_dq[x_e2m1_unpacked]
+
+    scales_dq = x_e4m3.to(torch.float64)
+    x_dq = (x_fp4_dq.unflatten(dim=-1, sizes=(-1, 16)) * scales_dq[..., None]).flatten(
+        start_dim=-2
+    ) / alpha  # * (4. / 3.)
+    return x_dq, x_fp4_dq, scales_dq
+
+
+def _unpack_mask(clip_mask: torch.Tensor) -> torch.Tensor:
+    clip_mask_unpacked_dq = torch.zeros(
+        *clip_mask.shape[:-1],
+        clip_mask.size(-1) * 8,
+        dtype=torch.bool,
+        device=clip_mask.device,
+    )
+    for i in range(8):
+        clip_mask_unpacked_dq[..., i::8] = (clip_mask >> i) & 1
+    return clip_mask_unpacked_dq
+
+
+def _forward_quantize_ref(x: torch.Tensor, h: torch.Tensor, rot_size: int):
+    device = x.device
+
+    xh_ref64 = (
+        x.unflatten(dim=-1, sizes=(-1, rot_size)).to(dtype=torch.float64)
+        @ h.reshape(rot_size, rot_size).to(dtype=torch.float64)
+    ).flatten(start_dim=-2)
+
+    abs_max = xh_ref64.unflatten(dim=-1, sizes=(-1, 16)).abs().amax(dim=-1)
+    scales_ref64_ = abs_max + 1e-8
+
+    xh_e4m3_ref = scales_ref64_.to(dtype=torch.float8_e4m3fn)
+    scales_ref64 = xh_e4m3_ref.to(dtype=torch.float64)
+    xh_scaled_ref64 = (
+        xh_ref64.unflatten(dim=-1, sizes=(-1, 16)) / scales_ref64[..., None]
+    ).flatten(start_dim=-2)
+
+    xh_scaled_ref64 *= 6.0
+
+    clip_mask_unpacked_ref = xh_scaled_ref64.abs() < 6.0
+    clip_mask_ref = torch.zeros(
+        *x.shape[:-1], x.size(-1) // 8, dtype=torch.uint8, device=device
+    )
+    for i in range(8):
+        clip_mask_ref |= clip_mask_unpacked_ref[..., i::8].to(dtype=torch.uint8) << i
+
+    xh_fp4_ref, xh_e2m1_ref = _rtne_fp4(xh_scaled_ref64)
+    xh_dq, xh_fp4_dq, scales_dq = _dq_fp4(xh_e2m1_ref, xh_e4m3_ref, 6.0)
+    clip_mask_unpacked_dq = _unpack_mask(clip_mask_ref)
+
+    assert xh_fp4_dq.equal(xh_fp4_ref)
+    assert scales_dq.equal(scales_ref64)
+    assert clip_mask_unpacked_dq.equal(clip_mask_unpacked_ref)
+
+    return (
+        xh_dq,
+        clip_mask_unpacked_ref,
+        (xh_e2m1_ref, xh_e4m3_ref, clip_mask_ref),
+    )
+
+
+DTYPE = torch.bfloat16
+DEVICE = torch.device("cuda:0")
+ROT_SIZES = [16, 32, 64, 128]
+GLOBAL_SCALES = [6.0]
+
+LLAMA_MODELS = {
+    "7B": [(4096, 3 * 4096), (4096, 4096), (4096, 2 * 10752), (10752, 4096)],
+    "13B": [(5120, 3 * 5120), (5120, 5120), (5120, 2 * 13568), (13568, 5120)],
+    "33B": [(6656, 3 * 6656), (6656, 6656), (6656, 2 * 17664), (17664, 6656)],
+    "70B": [(8192, 3 * 8192), (8192, 8192), (8192, 2 * 21760), (21760, 8192)],
+}
+
+
+@pytest.fixture(autouse=True)
+def _seed_each_test():
+    current_platform.seed_everything(0)
+    np.random.seed(0)
+    torch.random.manual_seed(0)
+
+
+@pytest.mark.parametrize("rot_size", ROT_SIZES)
+@pytest.mark.parametrize("global_scale_value", GLOBAL_SCALES)
+@torch.inference_mode()
+def test_fused_quantization(rot_size: int, global_scale_value: float):
+    dtype, device = DTYPE, DEVICE
+    h = get_hadamard_matrix(rot_size, dtype, device)
+    x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
+    global_scale = torch.tensor([global_scale_value], device=device)
+
+    xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size)
+    xh_e2m1, xh_e4m3 = fusedQuantizeNv(x, h, global_scale)
+    xh_e4m3 = xh_e4m3.reshape(2, 4096, 4096 // 16)
+    xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e4m3, alpha=global_scale_value)
+
+    torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
+    assert (xh_dq != xh_dq_ref).float().mean() <= 1e-1
+
+    m, n, k = 504, 4096 * 2, 4096
+    a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e4m3 = fusedQuantizeNv(a, h, global_scale)
+    b_e2m1, b_e4m3 = fusedQuantizeNv(b, h, global_scale)
+
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e4m3[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e4m3[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e4m3, backend="triton").view(-1, k // 16)
+    b_scale_block = to_blocked(b_e4m3, backend="triton").view(-1, k // 16)
+    alpha = torch.tensor([1.0], device=device)
+    out = ops.cutlass_scaled_fp4_mm(
+        a_e2m1, b_e2m1, a_scale_block, b_scale_block, alpha, torch.bfloat16
+    )
+    assert out.equal(out_ref.to(dtype=out.dtype))
+
+
+@pytest.mark.parametrize("model", list(LLAMA_MODELS.keys()))
+@pytest.mark.parametrize("layer_idx", [0, 1, 2, 3])
+@pytest.mark.parametrize("batch", [1, 16])
+@pytest.mark.parametrize("rot_size", ROT_SIZES)
+@torch.inference_mode()
+def test_llama_shapes(model: str, layer_idx: int, batch: int, rot_size: int):
+    dtype, device = DTYPE, DEVICE
+    m = batch
+    k, n = LLAMA_MODELS[model][layer_idx]
+
+    h = get_hadamard_matrix(rot_size, dtype, device)
+
+    a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
+
+    global_scale = torch.tensor([1.0], device=device)
+
+    a_e2m1, a_e4m3 = fusedQuantizeNv(a, h, global_scale)
+    b_e2m1, b_e4m3 = fusedQuantizeNv(b, h, global_scale)
+
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e4m3[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e4m3[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e4m3, backend="triton").view(-1, k // 16)
+    b_scale_block = to_blocked(b_e4m3, backend="triton").view(-1, k // 16)
+    alpha = torch.tensor([1.0], device=device)
+    out = ops.cutlass_scaled_fp4_mm(
+        a_e2m1, b_e2m1, a_scale_block, b_scale_block, alpha, torch.bfloat16
+    )
+    assert out.equal(out_ref.to(dtype=out.dtype))
diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
index 67e041f2b71c4f7f9bb3342f46c56b21cbb7b7ed..434564737c889da3d44c413c4c7a2e41f1995e00 100644
--- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
@@ -2,15 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
-from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX,
-                         dequantize_nvfp4_to_dtype)
+from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dtype
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 
 if not current_platform.has_device_capability(100):
-    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
-                allow_module_level=True)
+    pytest.skip(
+        reason="Nvfp4 Requires compute capability of 10 or above.",
+        allow_module_level=True,
+    )
 
 DTYPES = [torch.float16, torch.bfloat16]
 # m, n, k
@@ -19,26 +20,31 @@ PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
 SHAPES.extend(PAD_SHAPES)
 
 SEEDS = [42]
-CUDA_DEVICES = ['cuda:0']
+CUDA_DEVICES = ["cuda:0"]
 
 
-def get_ref_results(a_fp4, b_fp4, a_sf, b_sf, a_global_scale, b_global_scale,
-                    m, n, dtype, block_size, device):
+def get_ref_results(
+    a_fp4,
+    b_fp4,
+    a_sf,
+    b_sf,
+    a_global_scale,
+    b_global_scale,
+    m,
+    n,
+    dtype,
+    block_size,
+    device,
+):
     _, m_k = a_fp4.shape
     _, n_k = b_fp4.shape
-    assert (m_k == n_k)
-    a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4,
-                                           a_sf,
-                                           a_global_scale,
-                                           dtype=dtype,
-                                           device=device,
-                                           block_size=block_size)
-    b_in_dtype = dequantize_nvfp4_to_dtype(b_fp4,
-                                           b_sf,
-                                           b_global_scale,
-                                           dtype=dtype,
-                                           device=device,
-                                           block_size=block_size)
+    assert m_k == n_k
+    a_in_dtype = dequantize_nvfp4_to_dtype(
+        a_fp4, a_sf, a_global_scale, dtype=dtype, device=device, block_size=block_size
+    )
+    b_in_dtype = dequantize_nvfp4_to_dtype(
+        b_fp4, b_sf, b_global_scale, dtype=dtype, device=device, block_size=block_size
+    )
     return torch.matmul(a_in_dtype, b_in_dtype.t())
 
 
@@ -60,25 +66,34 @@ def test_nvfp4_gemm(
     a_dtype = torch.randn((m, k), dtype=dtype, device=device)
     b_dtype = torch.randn((n, k), dtype=dtype, device=device)
 
-    a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(a_dtype.flatten(), dim=-1)).to(torch.float32)
-    b_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(b_dtype.flatten(), dim=-1)).to(torch.float32)
-    alpha = 1. / (a_global_scale * b_global_scale)
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+    b_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+    alpha = 1.0 / (a_global_scale * b_global_scale)
     # ops.scaled_fp4_quant returns swizzled scales, while weights
     # from checkpoints are in linear scales.
     a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a_dtype, a_global_scale)
     b_fp4, b_scale_interleaved = ops.scaled_fp4_quant(b_dtype, b_global_scale)
 
     # get_ref_results unswizzles the scales internally.
-    expected_out = get_ref_results(a_fp4, b_fp4, a_scale_interleaved,
-                                   b_scale_interleaved, a_global_scale,
-                                   b_global_scale, m, n, dtype, block_size,
-                                   device)
-    out = ops.cutlass_scaled_fp4_mm(a_fp4, b_fp4, a_scale_interleaved,
-                                    b_scale_interleaved, alpha, dtype)
+    expected_out = get_ref_results(
+        a_fp4,
+        b_fp4,
+        a_scale_interleaved,
+        b_scale_interleaved,
+        a_global_scale,
+        b_global_scale,
+        m,
+        n,
+        dtype,
+        block_size,
+        device,
+    )
+    out = ops.cutlass_scaled_fp4_mm(
+        a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype
+    )
 
-    torch.testing.assert_close(out,
-                               expected_out.to(dtype=dtype),
-                               atol=1e-1,
-                               rtol=1e-1)
+    torch.testing.assert_close(out, expected_out.to(dtype=dtype), atol=1e-1, rtol=1e-1)
diff --git a/tests/kernels/quantization/test_per_token_group_quant.py b/tests/kernels/quantization/test_per_token_group_quant.py
index 07f17d1efe641e2705233b9309527445cefba453..7a65004545306d4021634b5c3ec4d980b652220b 100644
--- a/tests/kernels/quantization/test_per_token_group_quant.py
+++ b/tests/kernels/quantization/test_per_token_group_quant.py
@@ -13,15 +13,15 @@ from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
 @pytest.mark.parametrize("scale_ue8m0", [False, True])
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-def test_per_token_group_quant_fp8(shape, column_major: bool,
-                                   scale_ue8m0: bool, group_size: int):
+def test_per_token_group_quant_fp8(
+    shape, column_major: bool, scale_ue8m0: bool, group_size: int
+):
     device = "cuda"
 
     torch.manual_seed(42)
     num_tokens, hidden_dim = shape
 
-    x = (torch.randn(
-        (num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8)
+    x = torch.randn((num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8
 
     # cuda path
     out_q, scale = fp8_utils.per_token_group_quant_fp8(
@@ -53,8 +53,7 @@ def test_per_token_group_quant_int8(shape, group_size: int):
     torch.manual_seed(42)
     num_tokens, hidden_dim = shape
 
-    x = (torch.randn(
-        (num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8)
+    x = torch.randn((num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8
 
     # cuda path
     out_q, scale = int8_utils.per_token_group_quant_int8(
diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py
index 6de5fc9c56010d3a2de6dd6dda1df98fea412b53..15ff6d536413605c3258fa5d3b336a6292172b2d 100644
--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -8,6 +8,7 @@ import torch
 import vllm._custom_ops as ops
 from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant
 from vllm.platforms import current_platform
+from vllm.utils.platform_utils import get_cu_count
 
 DTYPES = [torch.bfloat16, torch.float16]
 # Specific (N, K, M) combinations for targeted testing
@@ -63,12 +64,11 @@ SEEDS = [0]
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("rows_per_block", [2, 4, 8, 16])
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test for rocm")
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
 @torch.inference_mode()
 def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed):
     torch.manual_seed(seed)
-    #TODO: Zero-centering the inputs causes errors for LLMM1!
+    # TODO: Zero-centering the inputs causes errors for LLMM1!
     #      Without that the numbers quickly saturate, and may
     #      be giving false matches.
     A = torch.rand(n, k, dtype=dtype, device="cuda")
@@ -83,14 +83,13 @@ def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed):
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test for rocm")
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
 def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
-    cu_count = current_platform.get_cu_count()
+    cu_count = get_cu_count()
 
-    A = torch.rand(n, k, dtype=dtype, device="cuda") - .5
-    B = torch.rand(m, k, dtype=dtype, device="cuda") - .5
+    A = torch.rand(n, k, dtype=dtype, device="cuda") - 0.5
+    B = torch.rand(m, k, dtype=dtype, device="cuda") - 0.5
 
     ref_out = torch.nn.functional.linear(A, B)
     out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count)
@@ -101,16 +100,15 @@ def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed):
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test for rocm")
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
 def test_rocm_wvsplitk_bias1D_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
-    cu_count = current_platform.get_cu_count()
+    cu_count = get_cu_count()
 
     xavier = math.sqrt(2 / k)  # normalize to avoid large output-bias deltas
-    A = (torch.rand(n, k, dtype=dtype, device="cuda") - .5) * xavier
-    B = (torch.rand(m, k, dtype=dtype, device="cuda") - .5) * xavier
-    BIAS = torch.rand(m, dtype=dtype, device="cuda") - .5
+    A = (torch.rand(n, k, dtype=dtype, device="cuda") - 0.5) * xavier
+    B = (torch.rand(m, k, dtype=dtype, device="cuda") - 0.5) * xavier
+    BIAS = torch.rand(m, dtype=dtype, device="cuda") - 0.5
 
     ref_out = torch.nn.functional.linear(A, B, BIAS)
     out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
@@ -121,16 +119,15 @@ def test_rocm_wvsplitk_bias1D_kernel(n, k, m, dtype, seed):
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test for rocm")
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
 def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
-    cu_count = current_platform.get_cu_count()
+    cu_count = get_cu_count()
 
     xavier = math.sqrt(2 / k)  # normalize to avoid large output-bias deltas
-    A = (torch.rand(n, k, dtype=dtype, device="cuda") - .5) * xavier
-    B = (torch.rand(m, k, dtype=dtype, device="cuda") - .5) * xavier
-    BIAS = torch.rand(n, m, dtype=dtype, device="cuda") - .5
+    A = (torch.rand(n, k, dtype=dtype, device="cuda") - 0.5) * xavier
+    B = (torch.rand(m, k, dtype=dtype, device="cuda") - 0.5) * xavier
+    BIAS = torch.rand(n, m, dtype=dtype, device="cuda") - 0.5
 
     ref_out = torch.nn.functional.linear(A, B, BIAS)
     out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
@@ -143,7 +140,8 @@ def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.skipif(
     not (current_platform.is_rocm() and current_platform.supports_fp8()),
-    reason="only test for rocm fp8")
+    reason="only test for rocm fp8",
+)
 def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
 
@@ -153,13 +151,17 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
     A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
     B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
 
-    ref_out = torch._scaled_mm(A,
-                               B.t(),
-                               out_dtype=dtype,
-                               scale_a=scale_a,
-                               scale_b=scale_b)
-    out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b,
-                        current_platform.get_cu_count())
+    ref_out = torch._scaled_mm(
+        A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b
+    )
+    out = ops.wvSplitKQ(
+        B,
+        A,
+        dtype,
+        scale_a,
+        scale_b,
+        get_cu_count(),
+    )
 
     assert torch.allclose(out, ref_out, rtol=0.01)
 
@@ -169,25 +171,30 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.skipif(
     not (current_platform.is_rocm() and current_platform.supports_fp8()),
-    reason="only test for rocm fp8")
+    reason="only test for rocm fp8",
+)
 def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
 
     xavier = math.sqrt(2 / k)  # normalize to avoid large output-bias deltas
-    A = (torch.rand(n, k, device="cuda") - .5) * xavier
-    B = (torch.rand(m, k, device="cuda") - .5) * xavier
-    BIAS = torch.rand(m, dtype=dtype, device="cuda") - .5
+    A = (torch.rand(n, k, device="cuda") - 0.5) * xavier
+    B = (torch.rand(m, k, device="cuda") - 0.5) * xavier
+    BIAS = torch.rand(m, dtype=dtype, device="cuda") - 0.5
 
     A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
     B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
 
-    ref_out = torch._scaled_mm(A,
-                               B.t(),
-                               out_dtype=dtype,
-                               scale_a=scale_a,
-                               scale_b=scale_b,
-                               bias=BIAS)
-    out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b,
-                        current_platform.get_cu_count(), BIAS)
+    ref_out = torch._scaled_mm(
+        A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b, bias=BIAS
+    )
+    out = ops.wvSplitKQ(
+        B,
+        A,
+        dtype,
+        scale_a,
+        scale_b,
+        get_cu_count(),
+        BIAS,
+    )
 
     assert torch.allclose(out, ref_out, rtol=0.01)
diff --git a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
index a40d0c4ef12243e942e0d89b3ba54e17452bc97e..4617464a3978817aded41b9f8c1fca60eb9b5c98 100644
--- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
@@ -3,16 +3,20 @@
 import pytest
 import torch
 
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX,
-                                                    dequantize_nvfp4_to_dtype)
+from tests.kernels.quantization.nvfp4_utils import (
+    FLOAT4_E2M1_MAX,
+    FLOAT8_E4M3_MAX,
+    dequantize_nvfp4_to_dtype,
+)
 from vllm._custom_ops import scaled_fp4_quant
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
 
 if not current_platform.has_device_capability(100):
-    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
-                allow_module_level=True)
+    pytest.skip(
+        reason="Nvfp4 Requires compute capability of 10 or above.",
+        allow_module_level=True,
+    )
 
 FP4_DTYPE = torch.uint8
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -30,24 +34,24 @@ def test_silu_mul_nvfp4_quant(
     shape: tuple[int, int],
 ) -> None:
     current_platform.seed_everything(42)
-    device = 'cuda:0'
+    device = "cuda:0"
     torch.set_default_device(device)
 
     x = torch.randn(shape, dtype=dtype)
 
     # ref op
     ref_output = SiluAndMul().forward_native(x)
-    ref_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                        torch.abs(ref_output).max().to(torch.float32))
-    ref_output_quant, ref_block_scale = scaled_fp4_quant(
-        ref_output, ref_global_scale)
+    ref_global_scale = (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.abs(
+        ref_output
+    ).max().to(torch.float32)
+    ref_output_quant, ref_block_scale = scaled_fp4_quant(ref_output, ref_global_scale)
 
     # fused op
     fused_output_quant = torch.empty_like(ref_output_quant)
     fused_block_scale = torch.empty_like(ref_block_scale)
-    torch.ops._C.silu_and_mul_nvfp4_quant(fused_output_quant,
-                                          fused_block_scale, x,
-                                          ref_global_scale)
+    torch.ops._C.silu_and_mul_nvfp4_quant(
+        fused_output_quant, fused_block_scale, x, ref_global_scale
+    )
 
     # check dtype
     assert ref_output_quant.dtype == FP4_DTYPE
@@ -59,17 +63,14 @@ def test_silu_mul_nvfp4_quant(
     assert ref_block_scale.shape == fused_block_scale.shape
 
     # check dequantized output
-    ref_output_dequant = dequantize_nvfp4_to_dtype(ref_output_quant,
-                                                   ref_block_scale,
-                                                   ref_global_scale, dtype,
-                                                   device)
-    fused_output_dequant = dequantize_nvfp4_to_dtype(fused_output_quant,
-                                                     fused_block_scale,
-                                                     ref_global_scale, dtype,
-                                                     device)
+    ref_output_dequant = dequantize_nvfp4_to_dtype(
+        ref_output_quant, ref_block_scale, ref_global_scale, dtype, device
+    )
+    fused_output_dequant = dequantize_nvfp4_to_dtype(
+        fused_output_quant, fused_block_scale, ref_global_scale, dtype, device
+    )
 
     atol, rtol = 3e-1, 3e-1
-    torch.testing.assert_close(ref_output_dequant,
-                               fused_output_dequant,
-                               atol=atol,
-                               rtol=rtol)
+    torch.testing.assert_close(
+        ref_output_dequant, fused_output_dequant, atol=atol, rtol=rtol
+    )
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index d8cfb5710dbadf68aaf938f614f3b8a73c12ad59..6633a8bbd3c6024254270755312f126782f9697a 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -4,8 +4,8 @@
 
 Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
 """
+
 import importlib
-from typing import Optional
 
 import pytest
 import torch
@@ -15,17 +15,19 @@ from vllm.platforms import current_platform
 device = "cuda"
 
 triton_scaled_mm_module = importlib.import_module(
-    "vllm.model_executor.layers.quantization.compressed_tensors."
-    "triton_scaled_mm")
+    "vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm"
+)
 triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
 
 
-def torch_scaled_mm(a: torch.Tensor,
-                    b: torch.Tensor,
-                    scale_a: torch.Tensor,
-                    scale_b: torch.Tensor,
-                    out_dtype: type[torch.dtype],
-                    bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+def torch_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: type[torch.dtype],
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
     out = torch.mm(a.to(torch.float32), b.to(torch.float32))
     out = scale_a * out
     out = scale_b.T * out
@@ -44,20 +46,22 @@ def get_8bit_types():
 
 
 # This test is to check regressions for int8 support on ROCm.
-@pytest.mark.parametrize("model_path", [
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-])
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    ],
+)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="Should only run on ROCm")
-def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path,
-                                      max_tokens, num_logprobs):
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="Should only run on ROCm")
+def test_rocm_compressed_tensors_w8a8(
+    vllm_runner, example_prompts, model_path, max_tokens, num_logprobs
+):
     dtype = "bfloat16"
 
     with vllm_runner(model_path, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(example_prompts, max_tokens,
-                                            num_logprobs)
+        vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs)
 
 
 MNK_FACTORS = [
@@ -76,10 +80,10 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("use_scalar_scale_a", [True, False])
 @pytest.mark.parametrize("use_scalar_scale_b", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
-                   use_scalar_scale_b, use_bias):
-    is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t
-                                                    ).is_floating_point()
+def test_scaled_mm(
+    M, N, K, in_dtype, out_dtype, use_scalar_scale_a, use_scalar_scale_b, use_bias
+):
+    is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
 
     current_platform.seed_everything(0)
 
@@ -93,10 +97,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
     #
     # So, the values here are kept small enough to avoid this situation.
     if is_floating_point_type(in_dtype):
-        a = (0.25 * torch.rand(
-            (M, K), dtype=torch.float32, device=device)).to(in_dtype)
-        b = (0.25 * torch.rand(
-            (K, N), dtype=torch.float32, device=device)).to(in_dtype)
+        a = (0.25 * torch.rand((M, K), dtype=torch.float32, device=device)).to(in_dtype)
+        b = (0.25 * torch.rand((K, N), dtype=torch.float32, device=device)).to(in_dtype)
     else:
         a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device)
         b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device)
@@ -113,7 +115,7 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
 
     bias = None
     if use_bias:
-        bias = torch.rand((N, ), device=device, dtype=out_dtype)
+        bias = torch.rand((N,), device=device, dtype=out_dtype)
 
     c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py
index 90380b872d6c247033c2e36ae77012fcb287b843..a4619f5846b166633648d4513411eff70706f0a5 100644
--- a/tests/kernels/test_apply_repetition_penalties.py
+++ b/tests/kernels/test_apply_repetition_penalties.py
@@ -4,8 +4,10 @@ import pytest
 import torch
 
 from tests.kernels.utils import opcheck
-from vllm._custom_ops import (apply_repetition_penalties_cuda,
-                              apply_repetition_penalties_torch)
+from vllm._custom_ops import (
+    apply_repetition_penalties_cuda,
+    apply_repetition_penalties_torch,
+)
 from vllm.platforms import current_platform
 
 NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
@@ -21,8 +23,9 @@ DTYPES = [torch.float32, torch.float16]
 @pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test for checking CUDA kernel")
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test for checking CUDA kernel"
+)
 @torch.inference_mode()
 def test_apply_repetition_penalties(
     num_seqs: int,
@@ -32,7 +35,7 @@ def test_apply_repetition_penalties(
     seed: int,
 ) -> None:
     """
-    Test the apply_repetition_penalties custom op 
+    Test the apply_repetition_penalties custom op
     against a reference implementation.
     """
     current_platform.seed_everything(seed)
@@ -46,39 +49,40 @@ def test_apply_repetition_penalties(
     output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
 
     # Mark some tokens as repeated in prompt and output
-    prompt_indices = torch.randint(0, vocab_size,
-                                   (num_seqs, max(1, vocab_size // 200)))
-    output_indices = torch.randint(0, vocab_size,
-                                   (num_seqs, max(1, vocab_size // 200)))
+    prompt_indices = torch.randint(0, vocab_size, (num_seqs, max(1, vocab_size // 200)))
+    output_indices = torch.randint(0, vocab_size, (num_seqs, max(1, vocab_size // 200)))
 
     for i in range(num_seqs):
         prompt_mask[i, prompt_indices[i]] = True
         output_mask[i, output_indices[i]] = True
 
     # Create repetition penalties tensor
-    repetition_penalties = torch.full((num_seqs, ),
-                                      repetition_penalty,
-                                      dtype=dtype)
+    repetition_penalties = torch.full((num_seqs,), repetition_penalty, dtype=dtype)
 
     # Run all three implementations
     logits_torch = logits.clone()
     logits_cuda = logits.clone()
 
-    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
-                                     repetition_penalties)
-    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
-                                    repetition_penalties)
+    apply_repetition_penalties_torch(
+        logits_torch, prompt_mask, output_mask, repetition_penalties
+    )
+    apply_repetition_penalties_cuda(
+        logits_cuda, prompt_mask, output_mask, repetition_penalties
+    )
 
     # Compare all outputs to reference
     torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
 
     # Test the operator by applying the opcheck utility
-    opcheck(torch.ops._C.apply_repetition_penalties_,
-            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
+    opcheck(
+        torch.ops._C.apply_repetition_penalties_,
+        (logits.clone(), prompt_mask, output_mask, repetition_penalties),
+    )
 
 
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test for checking CUDA kernel")
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test for checking CUDA kernel"
+)
 @torch.inference_mode()
 def test_apply_repetition_penalties_zero_seqs() -> None:
     """
@@ -104,22 +108,24 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
     # No tokens to mark as repeated since num_seqs=0
 
     # Create repetition penalties tensor
-    repetition_penalties = torch.full((num_seqs, ),
-                                      repetition_penalty,
-                                      dtype=dtype)
+    repetition_penalties = torch.full((num_seqs,), repetition_penalty, dtype=dtype)
 
     # Run all three implementations
     logits_torch = logits.clone()
     logits_cuda = logits.clone()
 
-    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
-                                     repetition_penalties)
-    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
-                                    repetition_penalties)
+    apply_repetition_penalties_torch(
+        logits_torch, prompt_mask, output_mask, repetition_penalties
+    )
+    apply_repetition_penalties_cuda(
+        logits_cuda, prompt_mask, output_mask, repetition_penalties
+    )
 
     # Compare all outputs to reference
     torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
 
     # Test the operator by applying the opcheck utility
-    opcheck(torch.ops._C.apply_repetition_penalties_,
-            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
+    opcheck(
+        torch.ops._C.apply_repetition_penalties_,
+        (logits.clone(), prompt_mask, output_mask, repetition_penalties),
+    )
diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py
new file mode 100644
index 0000000000000000000000000000000000000000..f944c6dcfa73b3ea18031923f8bc83fb6c1bd6ab
--- /dev/null
+++ b/tests/kernels/test_fla_layernorm_guard.py
@@ -0,0 +1,388 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.model_executor.layers.fla.ops.layernorm_guard import (
+    layer_norm_fwd,
+    layernorm_fn,
+    rms_norm_ref,
+)
+from vllm.platforms import current_platform
+
+
+def layer_norm_ref(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    """Reference implementation for both layer norm and RMS norm."""
+    if is_rms_norm:
+        # Use the imported rms_norm_ref for RMS norm cases
+        return rms_norm_ref(
+            x,
+            weight,
+            bias,
+            z=z,
+            eps=eps,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            upcast=True,
+        )
+
+    # Layer norm implementation
+    dtype = x.dtype
+    x = x.float()
+    weight = weight.float()
+    bias = bias.float() if bias is not None else None
+    z = z.float() if z is not None else None
+
+    if z is not None and not norm_before_gate:
+        x = x * F.silu(z)
+
+    if group_size is None:
+        # Layer norm: subtract mean
+        mean = x.mean(dim=-1, keepdim=True)
+        var = ((x - mean).square()).mean(dim=-1, keepdim=True)
+        rstd = 1 / torch.sqrt(var + eps)
+        out = (x - mean) * rstd * weight
+        if bias is not None:
+            out = out + bias
+    else:
+        # Group norm
+        from einops import rearrange
+
+        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
+        mean = x_group.mean(dim=-1, keepdim=True)
+        var = ((x_group - mean).square()).mean(dim=-1, keepdim=True)
+        rstd = 1 / torch.sqrt(var + eps)
+        x_group = (x_group - mean) * rstd
+        out = rearrange(x_group, "... g d -> ... (g d)") * weight
+        if bias is not None:
+            out = out + bias
+
+    if z is not None and norm_before_gate:
+        out *= F.silu(z)
+
+    return out.to(dtype)
+
+
+DTYPES = [torch.bfloat16, torch.float32]
+# Test various M sizes to ensure rows_per_block logic works correctly
+NUM_TOKENS = [
+    1,
+    7,
+    16,
+    63,
+    128,
+    256,
+    512,
+    1024,
+    2048,
+    4096,
+    5789,
+    8189,
+    8191,
+    16383,
+    32767,
+]
+HIDDEN_SIZES = [64, 128, 256, 1024]
+GROUP_SIZES = [None, 64, 128]  # None means full hidden size
+NORM_BEFORE_GATE = [True, False]
+IS_RMS_NORM = [True, False]
+SEEDS = [0, 42]
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
+@torch.inference_mode()
+def test_layer_norm_fwd_basic(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    is_rms_norm: bool,
+) -> None:
+    """Test basic layer norm forward pass without z (gate) tensor."""
+    current_platform.seed_everything(seed)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(
+        x, weight, bias, eps, z=None, is_rms_norm=is_rms_norm
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=is_rms_norm)
+
+    # Check outputs
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    # Check mean and rstd shapes
+    if not is_rms_norm:
+        assert mean.shape == (num_tokens,)
+    assert rstd.shape == (num_tokens,)
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", [128, 256, 1024])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("norm_before_gate", NORM_BEFORE_GATE)
+@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
+@torch.inference_mode()
+def test_layer_norm_fwd_with_gate(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    norm_before_gate: bool,
+    is_rms_norm: bool,
+) -> None:
+    """Test layer norm forward pass with z (gate) tensor."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    z = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(
+        x,
+        weight,
+        bias,
+        eps,
+        z=z,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=is_rms_norm,
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(
+        x,
+        weight,
+        bias,
+        z=z,
+        eps=eps,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=is_rms_norm,
+    )
+
+    # Check outputs
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("num_tokens", [128, 512])
+@pytest.mark.parametrize("hidden_size", [512, 1024])
+@pytest.mark.parametrize("group_size", [64, 128, 256])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
+@torch.inference_mode()
+def test_layer_norm_fwd_with_groups(
+    num_tokens: int,
+    hidden_size: int,
+    group_size: int,
+    dtype: torch.dtype,
+    is_rms_norm: bool,
+) -> None:
+    """Test layer norm forward pass with group normalization."""
+    if hidden_size % group_size != 0:
+        pytest.skip(
+            f"hidden_size {hidden_size} not divisible by group_size {group_size}"
+        )
+
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    ngroups = hidden_size // group_size
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(
+        x, weight, bias, eps, z=None, group_size=group_size, is_rms_norm=is_rms_norm
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(
+        x, weight, bias, z=None, eps=eps, group_size=group_size, is_rms_norm=is_rms_norm
+    )
+
+    # Check outputs
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    # Check mean and rstd shapes for groups
+    if not is_rms_norm:
+        assert mean.shape == (ngroups * num_tokens,)
+    assert rstd.shape == (ngroups * num_tokens,)
+
+
+@pytest.mark.parametrize("num_tokens", [7, 63, 128, 513, 1024, 2049])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_layer_norm_rows_per_block(
+    num_tokens: int,
+    dtype: torch.dtype,
+) -> None:
+    """Test that rows_per_block logic works correctly for various M sizes."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+    hidden_size = 1024
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(x, weight, bias, eps, z=None, is_rms_norm=False)
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
+
+    # Check outputs
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@torch.inference_mode()
+def test_strided_input(dtype: torch.dtype) -> None:
+    """Test that the kernel handles non-contiguous (strided)
+    inputs correctly."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+    num_tokens = 128
+    hidden_size = 1024
+
+    # Create a larger tensor and take a strided slice
+    x_large = torch.randn(num_tokens, hidden_size * 2, dtype=dtype, device=device)
+    x = x_large[:, :hidden_size]
+
+    # Make it contiguous for the kernel
+    x_contiguous = x.contiguous()
+
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel with contiguous input
+    out, mean, rstd = layer_norm_fwd(
+        x_contiguous, weight, bias, eps, z=None, is_rms_norm=False
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(
+        x_contiguous, weight, bias, z=None, eps=eps, is_rms_norm=False
+    )
+
+    # Check outputs
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("num_tokens", [1, 128, 2048])
+@pytest.mark.parametrize("hidden_size", [768, 4096])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_output_buffer_provided(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+) -> None:
+    """Test that the kernel works when an output buffer is provided."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Pre-allocate output buffer
+    out_buffer = torch.empty_like(x)
+
+    # Run the triton kernel with provided output
+    out, mean, rstd = layer_norm_fwd(
+        x, weight, bias, eps, z=None, out=out_buffer, is_rms_norm=False
+    )
+
+    # Check that the provided buffer was used
+    assert out.data_ptr() == out_buffer.data_ptr()
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
+
+    # Check outputs
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (4, 16, 1024),  # 3D tensor
+        (2, 8, 512, 256),  # 4D tensor
+    ],
+)
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_multidimensional_input(
+    shape: tuple,
+    dtype: torch.dtype,
+) -> None:
+    """Test that the autograd function handles multidimensional inputs."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+    hidden_size = shape[-1]
+
+    # Create inputs
+    x = torch.randn(*shape, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run through autograd function
+    out = layernorm_fn(x, weight, bias, z=None, eps=eps)
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
+
+    # Check outputs
+    assert out.shape == x.shape
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+if __name__ == "__main__":
+    # Run a quick smoke test
+    test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False)
+    test_layer_norm_fwd_with_gate(128, 1024, torch.float16, True, False)
+    test_layer_norm_rows_per_block(513, torch.float16)
+    print("All smoke tests passed!")
diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py
index 39753c0cc15b90b39b0f2f7c8318d117cb0e8394..ae33f422d3732f16ed9a03a6367b708df1cf3f69 100644
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -9,11 +9,13 @@ import pytest
 import torch
 from packaging import version
 
-from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      create_vllm_config)
-from vllm.v1.attention.backends.flex_attention import (
-    FlexAttentionMetadataBuilder)
+from tests.v1.attention.utils import (
+    BatchSpec,
+    create_common_attn_metadata,
+    create_standard_kv_cache_spec,
+    create_vllm_config,
+)
+from vllm.v1.attention.backends.flex_attention import FlexAttentionMetadataBuilder
 
 from ..models.utils import check_embeddings_close, check_logprobs_close
 
@@ -53,30 +55,34 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
 
     # Run with flex attention
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
 
         set_seed(seed)
-        with vllm_runner(model_name,
-                         runner="generate",
-                         tensor_parallel_size=1,
-                         num_gpu_blocks_override=128,
-                         enforce_eager=True) as llm_flex:
+        with vllm_runner(
+            model_name,
+            runner="generate",
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        ) as llm_flex:
             output_flex = llm_flex.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs)
+                prompts, max_tokens, num_logprobs
+            )
 
     # Run with default backend
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         set_seed(seed)
-        with vllm_runner(model_name,
-                         runner="generate",
-                         tensor_parallel_size=1,
-                         num_gpu_blocks_override=128,
-                         enforce_eager=True,
-                         gpu_memory_utilization=0.85) as llm_default:
+        with vllm_runner(
+            model_name,
+            runner="generate",
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+            gpu_memory_utilization=0.85,
+        ) as llm_default:
             output_default = llm_default.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs)
+                prompts, max_tokens, num_logprobs
+            )
 
     check_logprobs_close(
         outputs_0_lst=output_flex,
@@ -105,26 +111,30 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
 
     # Run with flex attention
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
-        with vllm_runner(model_name,
-                         runner="pooling",
-                         dtype=torch.bfloat16,
-                         tensor_parallel_size=1,
-                         max_model_len=100,
-                         enforce_eager=True) as llm_flex:
+        with vllm_runner(
+            model_name,
+            runner="pooling",
+            dtype=torch.bfloat16,
+            tensor_parallel_size=1,
+            max_model_len=100,
+            enforce_eager=True,
+        ) as llm_flex:
             flex_outputs = llm_flex.embed(prompts)
 
     # Run with default backend
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        with vllm_runner(model_name,
-                         runner="pooling",
-                         dtype=torch.bfloat16,
-                         tensor_parallel_size=1,
-                         max_model_len=100,
-                         enforce_eager=True) as llm_default:
-            default_outputs = llm_default.embed(prompts)
+    with (
+        monkeypatch.context() as m,
+        vllm_runner(
+            model_name,
+            runner="pooling",
+            dtype=torch.bfloat16,
+            tensor_parallel_size=1,
+            max_model_len=100,
+            enforce_eager=True,
+        ) as llm_default,
+    ):
+        default_outputs = llm_default.embed(prompts)
 
     check_embeddings_close(
         embeddings_0_lst=flex_outputs,
@@ -147,27 +157,29 @@ def test_block_mask_direct_vs_slow_path():
     """
     device = torch.device("cuda")
 
-    vllm_config = create_vllm_config(model_name="meta-llama/Meta-Llama-3-8B",
-                                     block_size=16,
-                                     max_model_len=1024)
+    vllm_config = create_vllm_config(
+        model_name="meta-llama/Meta-Llama-3-8B", block_size=16, max_model_len=1024
+    )
     kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
 
     # Use a mixed batch that will create groups spanning multiple sequences
-    batch_spec = BatchSpec(seq_lens=[35, 64, 128, 256],
-                           query_lens=[33, 5, 32, 64],
-                           name="test_mixed_batch")
+    batch_spec = BatchSpec(
+        seq_lens=[35, 64, 128, 256], query_lens=[33, 5, 32, 64], name="test_mixed_batch"
+    )
 
     common_attn_metadata = create_common_attn_metadata(
-        batch_spec, vllm_config.cache_config.block_size, device)
+        batch_spec, vllm_config.cache_config.block_size, device
+    )
 
-    builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config,
-                                           device)
+    builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config, device)
 
-    metadata_direct = builder.build(common_prefix_len=0,
-                                    common_attn_metadata=common_attn_metadata)
+    metadata_direct = builder.build(
+        common_prefix_len=0, common_attn_metadata=common_attn_metadata
+    )
     builder.direct_build = False
-    metadata_slow = builder.build(common_prefix_len=0,
-                                  common_attn_metadata=common_attn_metadata)
+    metadata_slow = builder.build(
+        common_prefix_len=0, common_attn_metadata=common_attn_metadata
+    )
 
     assert metadata_direct.block_mask is not None
     assert metadata_slow.block_mask is not None
@@ -184,20 +196,20 @@ def test_block_mask_direct_vs_slow_path():
     missing_details = []
 
     for group_idx in range(num_groups):
-        direct_blocks = set(
-            direct_indices[group_idx, :direct_num[group_idx]].tolist())
-        slow_blocks = set(
-            slow_indices[group_idx, :slow_num[group_idx]].tolist())
+        direct_blocks = set(direct_indices[group_idx, : direct_num[group_idx]].tolist())
+        slow_blocks = set(slow_indices[group_idx, : slow_num[group_idx]].tolist())
 
         missing_blocks = slow_blocks - direct_blocks
         if missing_blocks:
             all_contained = False
             missing_details.append(
-                f"Group {group_idx}: missing {sorted(missing_blocks)}")
+                f"Group {group_idx}: missing {sorted(missing_blocks)}"
+            )
 
     assert all_contained, (
-        "Direct path is missing blocks required by slow path:\n" +
-        "\n".join(missing_details))
+        "Direct path is missing blocks required by slow path:\n"
+        + "\n".join(missing_details)
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py
index 803453a20d81d360b6fa5b1f3c02f564458be16d..c79e6105e69fa339b4dbc8b228edea6e7d7a59a3 100644
--- a/tests/kernels/test_fused_quant_activation.py
+++ b/tests/kernels/test_fused_quant_activation.py
@@ -13,13 +13,12 @@ QUANT_DTYPES = [current_platform.fp8_dtype()]
 NUM_TOKENS = [1, 17, 86, 1234, 3045]  # Arbitrary values for testing
 HIDDEN_SIZES = [16, 48, 128, 1562, 4096]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 
-def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor,
-             scale: torch.Tensor) -> torch.Tensor:
+def ref_impl(
+    silu_and_mul: SiluAndMul, x: torch.Tensor, scale: torch.Tensor
+) -> torch.Tensor:
     silu_and_mul_out = silu_and_mul.forward_native(x)
     out, scales = ops.scaled_fp8_quant(silu_and_mul_out, scale)
     return out
@@ -27,9 +26,7 @@ def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor,
 
 def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     out_shape = (x.shape[0], x.shape[1] // 2)
-    out = torch.empty(out_shape,
-                      dtype=current_platform.fp8_dtype(),
-                      device=x.device)
+    out = torch.empty(out_shape, dtype=current_platform.fp8_dtype(), device=x.device)
     torch.ops._C.silu_and_mul_quant(out, x, scale)
     return out
 
@@ -57,7 +54,7 @@ def test_silu_and_mul(
     layer = SiluAndMul()
 
     # Make inputs
-    scale = (torch.randn((1), device=device, dtype=torch.float32))
+    scale = torch.randn((1), device=device, dtype=torch.float32)
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
 
     ref_out = ref_impl(layer, x, scale)
@@ -66,6 +63,7 @@ def test_silu_and_mul(
     assert ref_out.dtype == quant_dtype
     assert ops_out.dtype == quant_dtype
     assert ref_out.shape == ops_out.shape
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
+    assert torch.allclose(
+        ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32)
+    )
     opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py
index 198a8fdf0c337755236e555676f0f69c99b83f0d..4e681ca6ac831b12a037f2783d31d0b48adc516b 100644
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
@@ -1,8 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Integration tests for FlexAttention backend vs default backend"""
-
-from typing import Optional
 
 import pytest
 import torch
@@ -38,30 +35,33 @@ def ref_int8_scaled_mm(
     b: torch.Tensor,
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
-    azp: Optional[torch.Tensor],
-    bias: Optional[torch.Tensor],
+    azp: torch.Tensor | None,
+    bias: torch.Tensor | None,
     output_type: torch.dtype,
 ):
     if azp is not None:
         a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32)
-    output = torch.mm((scale_a * a.to(dtype=torch.float32)),
-                      (scale_b * b.to(dtype=torch.float32)))
+    output = torch.mm(
+        (scale_a * a.to(dtype=torch.float32)), (scale_b * b.to(dtype=torch.float32))
+    )
     if bias is not None:
         output += bias.float()
 
     return output.to(dtype=output_type)
 
 
-def onednn_int8_gemm_test_helper(primitive_cache_size: int,
-                                 m: int,
-                                 n: int,
-                                 k: int,
-                                 per_tensor_a_quant: bool,
-                                 per_tensor_b_quant: bool,
-                                 use_azp: bool,
-                                 use_bias: bool,
-                                 out_dtype: torch.dtype = torch.bfloat16,
-                                 device: str = "cpu"):
+def onednn_int8_gemm_test_helper(
+    primitive_cache_size: int,
+    m: int,
+    n: int,
+    k: int,
+    per_tensor_a_quant: bool,
+    per_tensor_b_quant: bool,
+    use_azp: bool,
+    use_bias: bool,
+    out_dtype: torch.dtype = torch.bfloat16,
+    device: str = "cpu",
+):
     # Test for a oneDNN kernel with per-tensor / per-token activation
     # quantization and per-tensor / per-output channel weight quantization.
     a = to_int8(torch.randn((m, k), device=device) * 5)
@@ -70,8 +70,8 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
     a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1)
     b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n)
 
-    scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
-    scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
+    scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32)
+    scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32)
 
     if use_azp:
         azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5
@@ -81,10 +81,7 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
         azp = None
         azp_adj = None
 
-    if use_bias:
-        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
-    else:
-        bias = None
+    bias = torch.rand((n,), device=device, dtype=out_dtype) * 10 if use_bias else None
 
     handler = ops.create_onednn_scaled_mm(
         b,
@@ -105,20 +102,21 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
         # To test runtime bias setting
         out = torch.zeros((m, n), dtype=out_dtype)
         ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None)
-        baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None,
-                                      out_dtype)
+        baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None, out_dtype)
 
         torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
 
-def onednn_gemm_test_helper(primitive_cache_size: int,
-                            m: int,
-                            n: int,
-                            k: int,
-                            use_bias: bool,
-                            use_stride: bool,
-                            dtype: torch.dtype = torch.bfloat16,
-                            device: str = "cpu"):
+def onednn_gemm_test_helper(
+    primitive_cache_size: int,
+    m: int,
+    n: int,
+    k: int,
+    use_bias: bool,
+    use_stride: bool,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str = "cpu",
+):
     if use_stride:
         a = torch.rand((m, 2 * k), dtype=dtype, device=device) * 1.5
         a = a[:, :k]
@@ -128,7 +126,7 @@ def onednn_gemm_test_helper(primitive_cache_size: int,
     b = torch.rand((n, k), dtype=dtype, device=device) * 1.5
 
     if use_bias:
-        bias = torch.rand((n, ), device=device, dtype=dtype) * 5
+        bias = torch.rand((n,), device=device, dtype=dtype) * 5
         bias_f32 = bias.float()
     else:
         bias = None
@@ -140,16 +138,18 @@ def onednn_gemm_test_helper(primitive_cache_size: int,
     )
 
     out = ops.onednn_mm(handler, a, bias)
-    baseline = torch.nn.functional.linear(a.float(), b.float(),
-                                          bias_f32).to(dtype=a.dtype)
+    baseline = torch.nn.functional.linear(a.float(), b.float(), bias_f32).to(
+        dtype=a.dtype
+    )
 
     torch.testing.assert_close(out, baseline)
 
     if use_bias:
         # To test runtime bias setting
         out = ops.onednn_mm(handler, a, None)
-        baseline = torch.nn.functional.linear(a.float(), b.float(),
-                                              None).to(dtype=a.dtype)
+        baseline = torch.nn.functional.linear(a.float(), b.float(), None).to(
+            dtype=a.dtype
+        )
 
         torch.testing.assert_close(out, baseline)
 
diff --git a/tests/kernels/test_shuffle_rows.py b/tests/kernels/test_shuffle_rows.py
index 7d02e1764e7d4a8af3992461292bd9257452d758..c7de64066e87b1cadf907a5653ed20c433f2a71a 100644
--- a/tests/kernels/test_shuffle_rows.py
+++ b/tests/kernels/test_shuffle_rows.py
@@ -14,20 +14,15 @@ from vllm.platforms import current_platform
 
 @pytest.mark.parametrize("num_tokens", [1, 16, 64, 128, 256, 512, 1024])
 @pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 2048, 4096])
-@pytest.mark.parametrize("dtype",
-                         [torch.float16, torch.bfloat16, torch.float32])
-def test_shuffle_rows_basic(num_tokens: int, hidden_size: int,
-                            dtype: torch.dtype):
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+def test_shuffle_rows_basic(num_tokens: int, hidden_size: int, dtype: torch.dtype):
     """Test basic functionality of shuffle_rows with various tensor sizes and
     dtypes."""
     if not current_platform.is_cuda():
         pytest.skip("shuffle_rows requires CUDA")
 
     # Create input tensor
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
 
     # Create a simple permutation map (identity mapping)
     dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
@@ -47,24 +42,18 @@ def test_shuffle_rows_basic(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("num_tokens", [16, 64, 128])
 @pytest.mark.parametrize("hidden_size", [128, 512, 1024])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-def test_shuffle_rows_permutation(num_tokens: int, hidden_size: int,
-                                  dtype: torch.dtype):
+def test_shuffle_rows_permutation(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype
+):
     """Test shuffle_rows with actual permutation."""
     if not current_platform.is_cuda():
         pytest.skip("shuffle_rows requires CUDA")
 
     # Create input tensor
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
 
     # Create a reverse permutation map
-    dst2src_map = torch.arange(num_tokens - 1,
-                               -1,
-                               -1,
-                               device="cuda",
-                               dtype=torch.int32)
+    dst2src_map = torch.arange(num_tokens - 1, -1, -1, device="cuda", dtype=torch.int32)
 
     # Test shuffle_rows
     output = shuffle_rows(input_tensor, dst2src_map)
@@ -90,17 +79,13 @@ def test_shuffle_rows_expansion(num_tokens: int, hidden_size: int):
     dtype = torch.float16
 
     # Create input tensor
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
 
     # Create a mapping that duplicates some tokens (expansion)
     expanded_size = num_tokens * 2
-    dst2src_map = torch.randint(0,
-                                num_tokens, (expanded_size, ),
-                                device="cuda",
-                                dtype=torch.int32)
+    dst2src_map = torch.randint(
+        0, num_tokens, (expanded_size,), device="cuda", dtype=torch.int32
+    )
 
     # Test shuffle_rows
     output = shuffle_rows(input_tensor, dst2src_map)
@@ -113,10 +98,9 @@ def test_shuffle_rows_expansion(num_tokens: int, hidden_size: int):
     # Verify that each output row matches the corresponding input row
     for i in range(expanded_size):
         src_idx = dst2src_map[i].item()
-        torch.testing.assert_close(output[i],
-                                   input_tensor[src_idx],
-                                   atol=1e-6,
-                                   rtol=1e-5)
+        torch.testing.assert_close(
+            output[i], input_tensor[src_idx], atol=1e-6, rtol=1e-5
+        )
 
 
 @pytest.mark.parametrize("num_tokens", [16, 64])
@@ -132,10 +116,7 @@ def test_shuffle_rows_random_permutation(num_tokens: int, hidden_size: int):
     torch.manual_seed(42)
 
     # Create input tensor
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
 
     # Create a random permutation map
     dst2src_map = torch.randperm(num_tokens, device="cuda", dtype=torch.int32)
@@ -151,10 +132,9 @@ def test_shuffle_rows_random_permutation(num_tokens: int, hidden_size: int):
     # Verify that each output row matches the corresponding input row
     for i in range(num_tokens):
         src_idx = dst2src_map[i].item()
-        torch.testing.assert_close(output[i],
-                                   input_tensor[src_idx],
-                                   atol=1e-6,
-                                   rtol=1e-5)
+        torch.testing.assert_close(
+            output[i], input_tensor[src_idx], atol=1e-6, rtol=1e-5
+        )
 
 
 def test_shuffle_rows_edge_cases():
@@ -188,10 +168,7 @@ def test_shuffle_rows_moe_like_scenario():
     topk = 2
 
     # Simulate input tokens
-    input_tensor = torch.randn(batch_size,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(batch_size, hidden_size, device="cuda", dtype=dtype)
 
     # Simulate expert assignment (each token goes to topk experts)
     # This creates a mapping where tokens are duplicated for multiple experts
@@ -215,14 +192,12 @@ def test_shuffle_rows_moe_like_scenario():
     for i in range(batch_size):
         for k in range(topk):
             output_idx = i * topk + k
-            torch.testing.assert_close(output[output_idx],
-                                       input_tensor[i],
-                                       atol=1e-6,
-                                       rtol=1e-5)
+            torch.testing.assert_close(
+                output[output_idx], input_tensor[i], atol=1e-6, rtol=1e-5
+            )
 
 
-@pytest.mark.parametrize("dtype",
-                         [torch.float16, torch.bfloat16, torch.float32])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
 def test_shuffle_rows_dtype_consistency(dtype: torch.dtype):
     """Test that shuffle_rows preserves dtype correctly."""
     if not current_platform.is_cuda():
@@ -232,10 +207,7 @@ def test_shuffle_rows_dtype_consistency(dtype: torch.dtype):
     hidden_size = 512
 
     # Create input tensor with specific dtype
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
     dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
 
     # Test shuffle_rows
@@ -257,10 +229,7 @@ def test_shuffle_rows_device_consistency():
     dtype = torch.float16
 
     # Create input tensor on CUDA
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
     dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
 
     # Test shuffle_rows
@@ -281,10 +250,7 @@ def test_shuffle_rows_contiguous_output():
     dtype = torch.float16
 
     # Create input tensor
-    input_tensor = torch.randn(num_tokens,
-                               hidden_size,
-                               device="cuda",
-                               dtype=dtype)
+    input_tensor = torch.randn(num_tokens, hidden_size, device="cuda", dtype=dtype)
     dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
 
     # Test shuffle_rows
diff --git a/tests/kernels/test_top_k_per_row.py b/tests/kernels/test_top_k_per_row.py
new file mode 100644
index 0000000000000000000000000000000000000000..cadda27b49e9c279484f1e716c5e2a6436188e1c
--- /dev/null
+++ b/tests/kernels/test_top_k_per_row.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import numpy as np
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+# Test parameters
+NUM_ROWS = [1, 32, 2050]
+TOP_K_VALUES = [2048]
+BATCH_SIZE = [1, 2, 4, 2048, 4096]
+NEXT_N = [1, 2, 4, 8]
+
+
+def create_random_logits(
+    row_starts: torch.Tensor,
+    row_ends: torch.Tensor,
+    vocab_size: int,
+    dtype: torch.dtype,
+    seed: int,
+) -> torch.Tensor:
+    """Create random logits tensor for testing."""
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    # Generate logits with some structure to make testing more meaningful
+    logits = torch.randn(row_starts.shape[0], max(row_ends), dtype=dtype, device="cuda")
+    for i, end in enumerate(row_ends):
+        logits[i, end:] = float("-inf")
+    return logits
+
+
+def create_row_boundaries(
+    seq_len: int, vocab_size: int
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Create row start and end indices for testing."""
+    row_starts = torch.zeros(seq_len, dtype=torch.int32, device="cuda")
+    row_ends = torch.arange(1, seq_len + 1, device="cuda", dtype=torch.int32)
+    return row_starts, row_ends
+
+
+def compare_top_k_results(
+    logits: torch.Tensor,
+    cuda_indices: torch.Tensor,
+    torch_indices: torch.Tensor,
+    row_starts: torch.Tensor,
+    row_ends: torch.Tensor,
+    top_k: int,
+    tolerance: float = 1e-5,
+) -> bool:
+    """
+    Compare results from CUDA top_k_per_row with torch.topk.
+    Both results should be sorted and contain the same top-k elements.
+    """
+    num_rows = cuda_indices.shape[0]
+
+    for row_idx in range(num_rows):
+        # Get valid elements using row boundaries
+        row_start = row_starts[row_idx].item()
+        row_end = row_ends[row_idx].item()
+        row_length = row_end - row_start
+        num_valid = min(top_k, row_length)
+        cuda_row_indices = cuda_indices[row_idx][:num_valid].cpu()
+        torch_row_indices = torch_indices[row_idx][:num_valid].cpu()
+
+        # Compare the sets of indices first
+        cuda_set = set(cuda_row_indices.tolist())
+        torch_set = set(torch_row_indices.tolist())
+        if cuda_set == torch_set:
+            continue
+
+        # Any difference in elements, compare the values
+        logits_row = logits[row_idx]
+        cuda_row_values = [logits_row[i] for i in cuda_row_indices]
+        torch_row_values = [logits_row[i] for i in torch_row_indices]
+
+        cuda_only_values, torch_only_values = [], []
+        for idx in cuda_set - torch_set:
+            cuda_pos = (cuda_row_indices == idx).nonzero(as_tuple=True)[0]
+            cuda_only_values.append(cuda_row_values[cuda_pos[0]])
+
+        for idx in torch_set - cuda_set:
+            torch_pos = (torch_row_indices == idx).nonzero(as_tuple=True)[0]
+            torch_only_values.append(torch_row_values[torch_pos[0]])
+
+        if len(cuda_only_values) != len(torch_only_values):
+            return False
+        if not torch.allclose(
+            torch.tensor(cuda_only_values),
+            torch.tensor(torch_only_values),
+            rtol=tolerance,
+            atol=tolerance,
+        ):
+            return False
+
+    return True
+
+
+@pytest.mark.parametrize("num_rows", NUM_ROWS)
+@pytest.mark.parametrize("top_k", TOP_K_VALUES)
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA")
+@torch.inference_mode()
+def test_top_k_per_row(
+    num_rows: int,
+    top_k: int,
+) -> None:
+    """
+    Test top_k_per_row.
+    """
+    torch.set_default_device("cuda:0")
+
+    # Create test data
+    vocab_size = 20000
+    row_starts, row_ends = create_row_boundaries(num_rows, vocab_size)
+    logits = create_random_logits(row_starts, row_ends, vocab_size, torch.float32, 42)
+
+    # Create output tensors
+    indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda")
+
+    # Run CUDA implementation
+    torch.ops._C.top_k_per_row(
+        logits,
+        row_starts,
+        row_ends,
+        indices,
+        num_rows,
+        logits.stride(0),
+        logits.stride(1),
+    )
+
+    # Run reference implementation
+    torch_indices = logits.topk(min(top_k, max(row_ends)), dim=-1)[1]
+    mask_lo = torch_indices >= 0
+    mask_hi = (torch_indices - (row_ends - row_starts)[:, None]) < 0
+    mask = mask_lo & mask_hi
+    torch_indices = torch_indices.masked_fill(~mask, -1)
+
+    # Compare results
+    assert compare_top_k_results(
+        logits, indices, torch_indices, row_starts, row_ends, top_k
+    ), "CUDA top_k_per_row results don't match torch.topk"
+
+
+@pytest.mark.parametrize("top_k", TOP_K_VALUES)
+@pytest.mark.parametrize("batch_size", BATCH_SIZE)
+@pytest.mark.parametrize("next_n", NEXT_N)
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA")
+@torch.inference_mode()
+def test_top_k_per_row_decode(
+    top_k: int,
+    batch_size: int,
+    next_n: int,
+) -> None:
+    """
+    Test top_k_per_row with seq_lens tensor.
+    """
+    torch.set_default_device("cuda:0")
+
+    # Create test data
+    num_rows = batch_size * next_n
+    vocab_size = 20000
+    seq_lens = torch.randint(
+        vocab_size, (batch_size,), dtype=torch.int32, device="cuda"
+    )
+    row_starts = torch.zeros(num_rows, dtype=torch.int32, device="cuda")
+    row_indices = torch.arange(num_rows, device="cuda") // next_n
+    next_n_offset = torch.arange(num_rows, device="cuda") % next_n
+    row_ends = seq_lens[row_indices] - next_n + next_n_offset + 1
+    logits = create_random_logits(row_starts, row_ends, vocab_size, torch.float32, 42)
+
+    # Create output tensors
+    indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda")
+
+    # Run CUDA implementation
+    torch.ops._C.top_k_per_row_decode(
+        logits,
+        next_n,
+        seq_lens,
+        indices,
+        num_rows,
+        logits.stride(0),
+        logits.stride(1),
+    )
+
+    torch.cuda.synchronize()
+
+    # Run reference implementation
+    torch_indices = logits.topk(min(top_k, max(row_ends)), dim=-1)[1]
+    mask_lo = torch_indices >= 0
+    mask_hi = (torch_indices - (row_ends - row_starts)[:, None]) < 0
+    mask = mask_lo & mask_hi
+    torch_indices = torch_indices.masked_fill(~mask, -1)
+
+    # Compare results
+    assert compare_top_k_results(
+        logits, indices, torch_indices, row_starts, row_ends, top_k
+    ), "CUDA top_k_per_row results don't match torch.topk"
diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py
deleted file mode 100644
index 1c31cfb25e5acd11213f84adaed692f744e182af..0000000000000000000000000000000000000000
--- a/tests/kernels/test_triton_flash_attention.py
+++ /dev/null
@@ -1,500 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for the triton_flash_attention kernel
-
-Run `pytest tests/kernels/test_triton_flash_attention.py`.
-"""
-import pytest
-import torch
-
-from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
-                                                       MetaData,
-                                                       compute_alibi_tensor,
-                                                       scale_fp8,
-                                                       triton_attention_rocm)
-from vllm.platforms import current_platform
-
-
-class ReferenceAttention:
-
-    def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype,
-                 input_metadata):
-        self.Z = Z
-        self.HQ = HQ
-        self.HK = HK
-        self.N_CTX_Q = N_CTX_Q
-        self.N_CTX_K = N_CTX_K
-        self.D_HEAD = D_HEAD
-        self.use_alibi = use_alibi
-        self.dtype = dtype
-        self.input_metadata = input_metadata
-
-    def fwd(self, q, k, v):
-        scores = torch.einsum('bhqd,bhkd->bhqk', q,
-                              k).float() * self.input_metadata.sm_scale
-        if self.input_metadata.causal:
-            mask = torch.tril(torch.ones(self.N_CTX_Q,
-                                         self.N_CTX_K,
-                                         device="cuda"),
-                              diagonal=self.N_CTX_K - self.N_CTX_Q)
-            scores[:, :, mask == 0] = float("-inf")
-
-        if self.input_metadata.bias is not None:
-            scores += self.input_metadata.bias
-
-        if self.use_alibi:
-            scores += compute_alibi_tensor(self.input_metadata.alibi_slopes,
-                                           self.N_CTX_Q, self.N_CTX_K)
-
-        p = torch.softmax(scores, dim=-1)
-        if self.input_metadata.causal:
-            # If N_CTX_Q > N_CTX_K, there's at least one row of all -infs going
-            # into softmax. This creates a row of NaNs as -inf - -inf == NaN.
-            # So we fix this by converting the NaNs to 0s, which is what they
-            # should be out of the softmax.
-            nan_mask = torch.isnan(p)
-            p[nan_mask == 1] = 0
-        ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v)
-        # compare
-        if self.input_metadata.layout == 'bshd':
-            ref_out = ref_out.transpose(1, 2).clone()
-        return ref_out
-
-    def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
-        q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
-            self.dtype)
-        k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
-            self.dtype)
-        v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
-            self.dtype)
-        result = self.fwd(q, k, v)
-        if self.input_metadata.o_scale is not None:
-            result, _ = scale_fp8(result, self.input_metadata.o_scale)
-        return result
-
-    def fwd_fp8_kv(self, q, k_quantized, v_quantized):
-        k_descale, v_descale = (self.input_metadata.k_descale,
-                                self.input_metadata.v_descale)
-        k_dequantized = (k_quantized.to(torch.float32) *
-                         k_descale.to(torch.float32)).to(self.dtype)
-        v_dequantized = (v_quantized.to(torch.float32) *
-                         v_descale.to(torch.float32)).to(self.dtype)
-        return self.fwd(q, k_dequantized, v_dequantized)
-
-    def varlen_fwd(self, q, k, v, is_mqa=False):
-        ref_out = torch.empty_like(q)
-        if is_mqa:
-            # Make KV look like HQ/HK "groups" of HK. Later, we will reshape so
-            # the size aligns with Q.
-            k_ref = k.view(k.shape[0], k.shape[1], 1,
-                           k.shape[2]).expand(-1, -1, self.HQ // self.HK, -1)
-            v_ref = v.view(v.shape[0], v.shape[1], 1,
-                           v.shape[2]).expand(-1, -1, self.HQ // self.HK, -1)
-        else:
-            k_ref = k
-            v_ref = v
-
-        for i in range(0, self.input_metadata.num_contexts):
-            start_q, start_k = self.input_metadata.cu_seqlens_q[
-                i], self.input_metadata.cu_seqlens_k[i]
-            end_q, end_k = self.input_metadata.cu_seqlens_q[
-                i + 1], self.input_metadata.cu_seqlens_k[i + 1]
-            k_curr = k_ref[start_k:end_k]
-            v_curr = v_ref[start_k:end_k]
-            if is_mqa:
-                k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3])
-                v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3])
-            scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q],
-                                  k_curr).float()
-            p = torch.softmax(scores * self.input_metadata.sm_scale,
-                              dim=-1).half()
-            ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr)
-        return ref_out
-
-
-def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False):
-    q_descale = None
-    if not fp8_kv:
-        q, q_descale = scale_fp8(q)
-    k, k_descale = scale_fp8(k)
-    v, v_descale = scale_fp8(v)
-
-    # In real world use case, the p scale would be a parameter trained by the
-    # model.
-    p_scale = None
-
-    o_scale = torch.rand(1, device="cuda",
-                         requires_grad=False) if use_o_scale else None
-
-    return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale
-
-
-def input_helper(
-    Z,
-    HQ,
-    HK,
-    N_CTX_Q,
-    N_CTX_K,
-    D_HEAD,
-    dtype,
-    layout=None,
-    use_alibi=None,
-    causal=None,
-    is_fp8=False,
-    fp8_kv=False,
-    use_o_scale=False,
-    use_bias=False,
-):
-    assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
-
-    current_platform.seed_everything(0)
-
-    # Initialize q, k, v
-    if layout == 'bhsd':
-        q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD)
-        k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD)
-    elif layout == 'bshd':
-        q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD)
-        k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD)
-
-    if use_alibi:
-        # for n heads the set of slopes is the geometric sequence that starts
-        # 2^(-8/n)
-        alibi_slopes = torch.tensor(
-            [2**(-8 / HQ * i) for i in range(1, HQ + 1)],
-            dtype=torch.float32,
-            device="cuda").repeat(Z, 1)
-    else:
-        alibi_slopes = None
-
-    if use_bias:
-        bias = torch.randn((1, HQ, N_CTX_Q, N_CTX_K),
-                           dtype=dtype,
-                           device="cuda",
-                           requires_grad=False)
-    else:
-        bias = None
-
-    q = torch.randn(q_tensor_shape,
-                    dtype=dtype,
-                    device="cuda",
-                    requires_grad=False)
-    k = torch.randn(k_tensor_shape,
-                    dtype=dtype,
-                    device="cuda",
-                    requires_grad=False)
-    v = torch.randn(k_tensor_shape,
-                    dtype=dtype,
-                    device="cuda",
-                    requires_grad=False)
-
-    if is_fp8:
-        (q, k, v, q_descale, k_descale, v_descale, p_scale,
-         o_scale) = quantize_input(q,
-                                   k,
-                                   v,
-                                   use_o_scale=use_o_scale,
-                                   fp8_kv=fp8_kv)
-    else:
-        q_descale = k_descale = v_descale = p_scale = o_scale = None
-
-    input_metadata = MetaData(sm_scale=D_HEAD**-0.5,
-                              max_seqlens_q=N_CTX_Q,
-                              max_seqlens_k=N_CTX_K,
-                              layout=layout,
-                              alibi_slopes=alibi_slopes,
-                              alibi_batch=Z,
-                              alibi_nheads=HQ,
-                              q_descale=q_descale,
-                              k_descale=k_descale,
-                              v_descale=v_descale,
-                              p_scale=p_scale,
-                              o_scale=o_scale,
-                              bias=bias,
-                              seqlen_q=N_CTX_Q,
-                              seqlen_k=N_CTX_K)
-    return q, k, v, input_metadata
-
-
-def varlen_input_helper(Z,
-                        HQ,
-                        HK,
-                        N_CTX_Q,
-                        N_CTX_K,
-                        D_HEAD,
-                        dtype,
-                        equal_seqlens=False):
-    current_platform.seed_everything(0)
-
-    # Random sequence lengths. Using N_CTX as kind of max of sum of individual
-    # seqs
-    if not equal_seqlens:
-        max_seqlens_q = N_CTX_Q // Z
-        max_seqlens_k = N_CTX_K // Z
-        seqlens_q = torch.randint(1,
-                                  max_seqlens_q + 1, (Z, ),
-                                  dtype=torch.int32)
-        seqlens_k = torch.randint(1,
-                                  max_seqlens_k + 1, (Z, ),
-                                  dtype=torch.int32)
-    else:
-        seqlens_q = torch.full((Z, ), N_CTX_Q // Z)
-        seqlens_k = torch.full((Z, ), N_CTX_K // Z)
-
-    # Calculate cumulative sequence lengths
-    cu_seqlens_q = torch.cat([
-        torch.tensor([0], dtype=torch.int32),
-        seqlens_q.cumsum(dim=0, dtype=torch.int32)
-    ])
-    cu_seqlens_k = torch.cat([
-        torch.tensor([0], dtype=torch.int32),
-        seqlens_k.cumsum(dim=0, dtype=torch.int32)
-    ])
-    cu_seqlens_q = cu_seqlens_q.to(device="cuda")
-    cu_seqlens_k = cu_seqlens_k.to(device="cuda")
-
-    # Initialize q, k, v with variable lengths
-    total_q = cu_seqlens_q[-1].item()
-    total_k = cu_seqlens_k[-1].item()
-    q = torch.randn((total_q, HQ, D_HEAD), dtype=dtype,
-                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
-    k = torch.randn((total_k, HK, D_HEAD), dtype=dtype,
-                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
-    v = torch.randn((total_k, HK, D_HEAD), dtype=dtype,
-                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
-    sm_scale = D_HEAD**-0.5
-    input_metadata = MetaData(sm_scale=sm_scale)
-    input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
-    return q, k, v, input_metadata
-
-
-@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [
-    (1, 48, 12, 1, 1, 64),
-    (4, 4, 4, 128, 128, 65),
-    (16, 48, 48, 1, 1, 128),
-    (64, 48, 24, 3, 3, 128),
-    (4, 4, 4, 113, 123, 1),
-])
-@pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('use_alibi', [True, False])
-@pytest.mark.parametrize('layout', ['bshd'])
-def test_op_fwd(Z,
-                HQ,
-                HK,
-                N_CTX_Q,
-                N_CTX_K,
-                D_HEAD,
-                causal,
-                use_alibi,
-                layout,
-                dtype=torch.float16):
-    current_platform.seed_everything(0)
-    q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
-                                           dtype, layout, use_alibi, causal)
-
-    o = torch.empty_like(q)
-
-    # triton implementation
-    tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata)
-
-    # Transpose here if layout is bshd so we have same reference code for all
-    # layouts
-    if layout == 'bshd':
-        q = q.transpose(1, 2).clone()
-        k = k.transpose(1, 2).clone()
-        v = v.transpose(1, 2).clone()
-    # Replicate K and V if using MQA/GQA
-    if HQ != HK:
-        k = k.view(k.shape[0], k.shape[1], -1, k.shape[2],
-                   k.shape[3]).expand(-1, -1, HQ // HK, -1,
-                                      -1).reshape(k.shape[0], -1, k.shape[2],
-                                                  k.shape[3])
-        v = v.view(v.shape[0], v.shape[1], -1, v.shape[2],
-                   v.shape[3]).expand(-1, -1, HQ // HK, -1,
-                                      -1).reshape(v.shape[0], -1, v.shape[2],
-                                                  v.shape[3])
-
-    ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
-                                  use_alibi, dtype, input_metadata)
-    ref_out = ref_impl.fwd(q, k, v)
-
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
-
-
-@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
-    (4, 48, 1, 1, 64),
-    (4, 48, 1, 1, 128),
-    (4, 48, 3, 3, 128),
-    (4, 4, 128, 128, 65),
-])
-@pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('layout', ['bhsd'])
-@pytest.mark.parametrize('use_o_scale', [True, False])
-@pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0),
-                    reason="Triton FP8 requires CUDA 9.0 or higher")
-def test_op_fwd_fp8(Z,
-                    H,
-                    N_CTX_Q,
-                    N_CTX_K,
-                    D_HEAD,
-                    causal,
-                    layout,
-                    use_o_scale,
-                    dtype=torch.float32):
-    current_platform.seed_everything(0)
-
-    # Disable grad to save memory it won't run into OOM on CI machine.
-    # q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD,
-    # dtype, layout)
-
-    q_quantized, k_quantized, v_quantized, input_metadata = input_helper(
-        Z,
-        H,
-        H,
-        N_CTX_Q,
-        N_CTX_K,
-        D_HEAD,
-        dtype,
-        causal=causal,
-        layout=layout,
-        is_fp8=True,
-        use_o_scale=use_o_scale)
-
-    o = torch.empty_like(q_quantized) if use_o_scale else None
-
-    tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized,
-                                       o, input_metadata)
-
-    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
-                                  dtype, input_metadata)
-    ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized)
-
-    # compare
-    torch.testing.assert_close(ref_out.to(torch.float32),
-                               tri_out.to(torch.float32),
-                               atol=7e-2,
-                               rtol=2e-1)
-
-
-@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
-    (4, 48, 1, 1, 64),
-    (4, 48, 1, 1, 128),
-    (4, 48, 3, 3, 128),
-    (4, 4, 128, 128, 65),
-    (4, 4, 113, 123, 1),
-])
-@pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('layout', ['bhsd'])
-def test_op_fwd_fp8_kv(Z,
-                       H,
-                       N_CTX_Q,
-                       N_CTX_K,
-                       D_HEAD,
-                       causal,
-                       layout,
-                       dtype=torch.float32):
-    current_platform.seed_everything(0)
-
-    q, k_quantized, v_quantized, input_metadata = input_helper(Z,
-                                                               H,
-                                                               H,
-                                                               N_CTX_Q,
-                                                               N_CTX_K,
-                                                               D_HEAD,
-                                                               dtype,
-                                                               causal=causal,
-                                                               layout=layout,
-                                                               is_fp8=True,
-                                                               fp8_kv=True)
-
-    o = torch.empty_like(q)
-
-    tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o,
-                                       input_metadata)
-
-    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
-                                  dtype, input_metadata)
-    ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized)
-
-    torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1)
-
-
-@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
-    (4, 48, 1, 1, 64),
-    (4, 48, 1, 1, 128),
-    (4, 48, 3, 3, 128),
-    (4, 4, 128, 128, 65),
-])
-@pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('use_bias', [True])
-@pytest.mark.parametrize('dtype', [torch.bfloat16])
-def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
-    current_platform.seed_everything(0)
-    q, k, v, input_metadata = input_helper(Z,
-                                           H,
-                                           H,
-                                           N_CTX_Q,
-                                           N_CTX_K,
-                                           D_HEAD,
-                                           dtype,
-                                           layout='bhsd',
-                                           causal=causal,
-                                           use_bias=use_bias)
-    o = torch.empty_like(q)
-
-    # triton implementation
-    tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata)
-
-    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
-                                  dtype, input_metadata)
-    ref_out = ref_impl.fwd(q, k, v)
-
-    # compare
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
-
-
-# NOTE: Uses thd layout, so also tests thd.
-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(1, 48, 256, 64),
-                                                 (4, 48, 512, 64),
-                                                 (16, 48, 512, 64),
-                                                 (64, 48, 128, 128)])
-@pytest.mark.parametrize('causal', [True, False])
-def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16):
-
-    q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX,
-                                                  D_HEAD, dtype)
-
-    tri_out = torch.empty_like(q)
-    triton_attention_rocm(q, k, v, tri_out, input_metadata)
-
-    ref_impl = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype,
-                                  input_metadata)
-    ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=False)
-
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
-
-
-# NOTE: Uses thd layout, so also tests thd.
-@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [(2, 48, 24, 128, 64),
-                                                      (4, 48, 12, 256, 64),
-                                                      (4, 48, 4, 512, 64),
-                                                      (4, 64, 16, 128, 128)])
-@pytest.mark.parametrize('causal', [False])
-def test_op_varlen_mqa_fwd(Z,
-                           HQ,
-                           HK,
-                           N_CTX,
-                           D_HEAD,
-                           causal,
-                           dtype=torch.float16):
-    q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX,
-                                                  D_HEAD, dtype)
-
-    tri_out = torch.empty_like(q)
-    triton_attention_rocm(q, k, v, tri_out, input_metadata)
-
-    ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False,
-                                  dtype, input_metadata)
-    ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=True)
-
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 0fdaa600aefa11088e41d678fca08bfcf9cea6ae..5d5a26fbfc2cdf8673f80080c6b01758d69a9c60 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -4,23 +4,23 @@
 
 import itertools
 import random
-import unittest
 from collections.abc import Sequence
 from numbers import Number
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
+from unittest.mock import patch
 
 import pytest
 import torch
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+from vllm.attention import AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
-from vllm.platforms.interface import _Backend
-from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
-                        STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
+from vllm.utils import (
+    STR_BACKEND_ENV_VAR,
+)
+from vllm.utils.torch_utils import make_tensor_with_pad
 
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.
@@ -39,7 +39,7 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = (
 
 
 class QKVInputs(NamedTuple):
-    '''
+    """
     Data structure for representing unpacked attention inputs,
     query/key/values and their sequence lengths.
 
@@ -49,7 +49,7 @@ class QKVInputs(NamedTuple):
                              num_heads x head_size) attention inputs
         * q_seq_lens: query sequence lengths list
         * kv_seq_lens: shared key/value sequence lengths list
-    '''
+    """
 
     query: torch.Tensor
     key: torch.Tensor
@@ -59,7 +59,7 @@ class QKVInputs(NamedTuple):
 
 
 class QKVO(NamedTuple):
-    '''
+    """
     Data structure for representing unpacked attention inputs,
     alongside unpacked known-correct attention output
 
@@ -69,14 +69,14 @@ class QKVO(NamedTuple):
                              num_heads x head_size) attention inputs
         * ideal_output: unpacked (batch_size x padded_seq_len x
                         num_heads x head_size) known-correct attention output
-    '''
+    """
 
     qkv: QKVInputs
     ideal_output: torch.Tensor
 
 
 class PackedQKVInputs(NamedTuple):
-    '''
+    """
     Data structure for representing packed attention inputs
 
     Attributes:
@@ -88,19 +88,19 @@ class PackedQKVInputs(NamedTuple):
                              packed tensor
         * q_seq_lens: query sequence lengths list
         * kv_seq_lens: shared key/value sequence lengths list
-    '''
+    """
 
     query: torch.Tensor
     key: torch.Tensor
     value: torch.Tensor
-    q_start_loc_list: Optional[list[int]]
-    kv_start_loc_list: Optional[list[int]]
-    q_seq_lens: Optional[list[int]]
-    kv_seq_lens: Optional[list[int]]
+    q_start_loc_list: list[int] | None
+    kv_start_loc_list: list[int] | None
+    q_seq_lens: list[int] | None
+    kv_seq_lens: list[int] | None
 
 
 class PackedQKVO(NamedTuple):
-    '''
+    """
     Data structure for representing packed attention inputs,
     alongside packed known-correct attention output
 
@@ -110,28 +110,28 @@ class PackedQKVO(NamedTuple):
                       x head_size) attention inputs
         * ideal_output: packed (number_of_tokens x num_heads
                         x head_size) known-correct attention output
-    '''
+    """
 
-    packed_qkv: Optional[PackedQKVInputs]
+    packed_qkv: PackedQKVInputs | None
     ideal_output: torch.Tensor
 
 
 class KVMemoryMap(NamedTuple):
-    '''
+    """
     Data structure for encapsulating KV cache memory mapping.
 
     Attributes:
 
         * block_tables: KV cache block tables
         * slot_mapping: mapping of sequence offset to physical address
-    '''
+    """
 
     block_tables: torch.Tensor
     slot_mapping: torch.Tensor
 
 
 class PhaseTestParameters(NamedTuple):
-    '''
+    """
     Data structure for encapsulating the test parameters
     for a given test "phase" (prefill or decode phase) and attention
     scenario (encoder, decoder-self, encoder/decoder-cross)
@@ -143,51 +143,53 @@ class PhaseTestParameters(NamedTuple):
                        output
         * kv_mmap: KV cache memory mapping, specific to this test phase &
                    attention scenario
-    '''
+    """
 
     packed_qkvo: PackedQKVO
-    kv_mmap: Optional[KVMemoryMap]
+    kv_mmap: KVMemoryMap | None
 
 
 def maybe_make_int_tensor(
-    _list: Optional[list[int]],
-    device: Union[torch.device, str],
+    _list: list[int] | None,
+    device: torch.device | str,
 ) -> torch.Tensor:
-    '''
+    """
     Convert Python int list to a 1D int torch.Tensor on `device`
 
     Returns:
 
     * If _list is not None: 1D int torch.Tensor on `device`
     * None otherwise
-    '''
-    return None if _list is None else torch.tensor(
-        _list, dtype=torch.int, device=device)
+    """
+    return (
+        None if _list is None else torch.tensor(_list, dtype=torch.int, device=device)
+    )
 
 
 def maybe_make_long_tensor(
-    _list: Optional[list[int]],
-    device: Union[torch.device, str],
+    _list: list[int] | None,
+    device: torch.device | str,
 ) -> torch.Tensor:
-    '''
+    """
     Convert Python int list to a 1D long torch.Tensor on `device`
 
     Returns:
 
     * If _list is not None: 1D long torch.Tensor on `device`
     * None otherwise
-    '''
-    return None if _list is None else torch.tensor(
-        _list, dtype=torch.long, device=device)
+    """
+    return (
+        None if _list is None else torch.tensor(_list, dtype=torch.long, device=device)
+    )
 
 
-def maybe_max(_list: Optional[list]) -> Optional[Number]:
-    '''
+def maybe_max(_list: list | None) -> Number | None:
+    """
     Returns:
 
     * If _list is not None: max(_list)
     * None otherwise
-    '''
+    """
     return None if _list is None else max(_list)
 
 
@@ -195,7 +197,7 @@ def make_causal_mask(
     q_max_seq_len: int,
     kv_max_seq_len: int,
 ) -> torch.Tensor:
-    '''
+    """
     Create a q_max_seq_len x kv_max_seq_len causal mask
 
     Arguments:
@@ -206,19 +208,19 @@ def make_causal_mask(
     Returns:
 
     * 2D tensor, q_max_seq_len x kv_max_seq_len
-    '''
+    """
 
     # Create a matrix where entry (i, j) is True if i >= j
     mask = torch.triu(torch.ones(q_max_seq_len, kv_max_seq_len), diagonal=1)
     # Replace True with float('-inf') and False with 0
-    mask = mask.masked_fill(mask == 1,
-                            float('-inf')).masked_fill(mask == 0, 0.0)
+    mask = mask.masked_fill(mask == 1, float("-inf")).masked_fill(mask == 0, 0.0)
     return mask
 
 
-def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
-                                  backend_name: str) -> None:
-    '''
+def override_backend_env_variable(
+    mpatch: pytest.MonkeyPatch, backend_name: str
+) -> None:
+    """
     Override the environment variable indicating the vLLM backend temporarily,
     using pytest monkeypatch to ensure that the env vars get
     reset once the test context exits.
@@ -227,18 +229,20 @@ def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
 
     * mpatch: pytest monkeypatch instance
     * backend_name: attention backend name to force
-    '''
+    """
     mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
 
 
-def ref_masked_attention(query: torch.Tensor,
-                         key: torch.Tensor,
-                         value: torch.Tensor,
-                         scale: float,
-                         custom_mask: Optional[torch.Tensor] = None,
-                         q_seq_lens: Optional[list] = None,
-                         kv_seq_lens: Optional[list] = None) -> torch.Tensor:
-    '''
+def ref_masked_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+    custom_mask: torch.Tensor | None = None,
+    q_seq_lens: list | None = None,
+    kv_seq_lens: list | None = None,
+) -> torch.Tensor:
+    """
     "Golden" masked attention reference. Supports two types of masking:
 
     * Basic attention mask, utilizing {q,kv}_seq_lens args to mask out
@@ -260,14 +264,14 @@ def ref_masked_attention(query: torch.Tensor,
     Returns:
 
     * Attention result, batch_size x q_padded_seq_len x num_heads x head_size
-    '''
+    """
 
     assert q_seq_lens is not None
     assert kv_seq_lens is not None
 
     batch_size = query.shape[0]
-    assert (len(q_seq_lens) == batch_size)
-    assert (len(kv_seq_lens) == batch_size)
+    assert len(q_seq_lens) == batch_size
+    assert len(kv_seq_lens) == batch_size
 
     attn_weights = scale * torch.einsum("bqhd,bkhd->bhqk", query, key).float()
 
@@ -295,15 +299,15 @@ def ref_masked_attention(query: torch.Tensor,
 def make_qkv(
     batch_size: int,
     max_q_seq_len: int,
-    max_kv_seq_len: Optional[int],
+    max_kv_seq_len: int | None,
     num_heads: int,
     head_size: int,
-    device: Union[torch.device, str],
-    force_kv_seq_lens: Optional[list[int]] = None,
+    device: torch.device | str,
+    force_kv_seq_lens: list[int] | None = None,
     attn_type: AttentionType = AttentionType.ENCODER_DECODER,
     force_max_len: bool = False,
 ) -> tuple[QKVInputs, QKVInputs, QKVInputs]:
-    '''
+    """
     Construct QKV test tensors for self- and cross-attention.
 
     Generates three query/key/value triplets:
@@ -340,14 +344,12 @@ def make_qkv(
     * Overall QKVInputs structure (containing full unpacked Q/K/V tensors)
     * Prefill QKVInputs structure (containing all but the last sequence offset)
     * Decode QKVInputs structure (containing all only the last sequence offset)
-    '''
+    """
 
     if force_max_len:
         q_seq_lens = [max_q_seq_len for _ in range(batch_size)]
     else:
-        q_seq_lens = [
-            random.randint(2, max_q_seq_len) for _ in range(batch_size)
-        ]
+        q_seq_lens = [random.randint(2, max_q_seq_len) for _ in range(batch_size)]
     kv_seq_lens = None
     if force_kv_seq_lens is not None:
         kv_seq_lens = force_kv_seq_lens
@@ -360,50 +362,44 @@ def make_qkv(
         if force_max_len:
             kv_seq_lens = [max_kv_seq_len] * batch_size
         else:
-            kv_seq_lens = [
-                random.randint(2, max_kv_seq_len) for _ in range(batch_size)
-            ]
-
-    query = torch.rand(
-        (batch_size, max_q_seq_len, num_heads, head_size)).to(device)
-    key = torch.rand(
-        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
-    value = torch.rand(
-        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
-
-    prefill_query = torch.zeros(
-        (batch_size, max_q_seq_len, num_heads, head_size)).to(device)
-    prefill_key = torch.zeros(
-        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
-    prefill_value = torch.zeros(
-        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
-
-    decode_query = torch.zeros(
-        (batch_size, 1, num_heads, head_size)).to(device)
+            kv_seq_lens = [random.randint(2, max_kv_seq_len) for _ in range(batch_size)]
+
+    query = torch.rand((batch_size, max_q_seq_len, num_heads, head_size)).to(device)
+    key = torch.rand((batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
+    value = torch.rand((batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
+
+    prefill_query = torch.zeros((batch_size, max_q_seq_len, num_heads, head_size)).to(
+        device
+    )
+    prefill_key = torch.zeros((batch_size, max_kv_seq_len, num_heads, head_size)).to(
+        device
+    )
+    prefill_value = torch.zeros((batch_size, max_kv_seq_len, num_heads, head_size)).to(
+        device
+    )
+
+    decode_query = torch.zeros((batch_size, 1, num_heads, head_size)).to(device)
     decode_key = torch.zeros((batch_size, 1, num_heads, head_size)).to(device)
-    decode_value = torch.zeros(
-        (batch_size, 1, num_heads, head_size)).to(device)
+    decode_value = torch.zeros((batch_size, 1, num_heads, head_size)).to(device)
 
-    for bdx, (q_seq_len, kv_seq_len) in enumerate(zip(q_seq_lens,
-                                                      kv_seq_lens)):
+    for bdx, (q_seq_len, kv_seq_len) in enumerate(zip(q_seq_lens, kv_seq_lens)):
         query[bdx, q_seq_len:, :, :] = 0
         key[bdx, kv_seq_len:, :, :] = 0
         value[bdx, kv_seq_len:, :, :] = 0
 
-        prefill_query[bdx,
-                      0:(q_seq_len - 1), :, :] = query[bdx,
-                                                       0:(q_seq_len - 1), :, :]
-        prefill_key[bdx,
-                    0:(kv_seq_len - 1), :, :] = key[bdx,
-                                                    0:(kv_seq_len - 1), :, :]
-        prefill_value[bdx, 0:(kv_seq_len -
-                              1), :, :] = value[bdx, 0:(kv_seq_len - 1), :, :]
-
-        decode_query[bdx, :, :, :] = query[bdx,
-                                           (q_seq_len - 1):q_seq_len, :, :]
-        decode_key[bdx, :, :, :] = key[bdx, (kv_seq_len - 1):kv_seq_len, :, :]
-        decode_value[bdx, :, :, :] = value[bdx,
-                                           (kv_seq_len - 1):kv_seq_len, :, :]
+        prefill_query[bdx, 0 : (q_seq_len - 1), :, :] = query[
+            bdx, 0 : (q_seq_len - 1), :, :
+        ]
+        prefill_key[bdx, 0 : (kv_seq_len - 1), :, :] = key[
+            bdx, 0 : (kv_seq_len - 1), :, :
+        ]
+        prefill_value[bdx, 0 : (kv_seq_len - 1), :, :] = value[
+            bdx, 0 : (kv_seq_len - 1), :, :
+        ]
+
+        decode_query[bdx, :, :, :] = query[bdx, (q_seq_len - 1) : q_seq_len, :, :]
+        decode_key[bdx, :, :, :] = key[bdx, (kv_seq_len - 1) : kv_seq_len, :, :]
+        decode_value[bdx, :, :, :] = value[bdx, (kv_seq_len - 1) : kv_seq_len, :, :]
 
     prefill_q_seq_lens = [plen - 1 for plen in q_seq_lens]
     prefill_kv_seq_lens = [plen - 1 for plen in kv_seq_lens]
@@ -417,25 +413,29 @@ def make_qkv(
             key,
             value,
             q_seq_lens,
-            kv_seq_lens),
+            kv_seq_lens,
+        ),
         QKVInputs(
             prefill_query,  # Prefill subset of QKV sequences
             prefill_key,
             prefill_value,
             prefill_q_seq_lens,
-            prefill_kv_seq_lens),
+            prefill_kv_seq_lens,
+        ),
         QKVInputs(
             decode_query,  # Decode subset of KV sequences
             decode_key,
             decode_value,
             decode_q_seq_lens,
-            decode_kv_seq_lens))
+            decode_kv_seq_lens,
+        ),
+    )
 
 
 def pack_tensor(
-        unpacked_tensor: torch.Tensor, seq_lens: list[int],
-        device: Union[torch.device, str]) -> tuple[torch.Tensor, list[int]]:
-    '''
+    unpacked_tensor: torch.Tensor, seq_lens: list[int], device: torch.device | str
+) -> tuple[torch.Tensor, list[int]]:
+    """
     Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an
     unpadded number_of_tokens x num_heads x head_size tensor, where
     number_of_tokens = sum(seq_lens)
@@ -451,7 +451,7 @@ def pack_tensor(
     * packed_tensor: number_of_tokens x num_heads x head_size
     * start_loc_list: start idx of each batch elt in packed_tensor; [0] +
       list(itertools.accumulate(seq_lens))
-    '''
+    """
 
     num_tok = sum(seq_lens)
     num_heads = unpacked_tensor.shape[-2]
@@ -460,16 +460,15 @@ def pack_tensor(
     packed_tensor = torch.zeros((num_tok, num_heads, head_size), device=device)
 
     for bdx, (seq_len, start_loc) in enumerate(zip(seq_lens, start_loc_list)):
-
-        packed_tensor[start_loc:(
-            start_loc + seq_len), :, :] = unpacked_tensor[bdx, :seq_len, :, :]
+        packed_tensor[start_loc : (start_loc + seq_len), :, :] = unpacked_tensor[
+            bdx, :seq_len, :, :
+        ]
 
     return packed_tensor, start_loc_list
 
 
-def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
-                                           str]) -> PackedQKVInputs:
-    '''
+def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs:
+    """
     Individually pack each of Q, K and V, each with dimensions batch_size x
     padded_seq_len x num_heads x head_size, into respective number_of_tokens x
     num_heads x head_size tensors.
@@ -488,66 +487,26 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
 
     * Packed (number_of_tokens x num_heads x head_size) QKV inputs
       derived from unpacked inputs
-    '''
+    """
 
     if qkv.query is None:
         packed_query = None
         q_start_loc_list = None
     else:
-        packed_query, q_start_loc_list = pack_tensor(qkv.query,
-                                                     qkv.q_seq_lens,
-                                                     device=device)
-    packed_key, kv_start_loc_list = pack_tensor(qkv.key,
-                                                qkv.kv_seq_lens,
-                                                device=device)
+        packed_query, q_start_loc_list = pack_tensor(
+            qkv.query, qkv.q_seq_lens, device=device
+        )
+    packed_key, kv_start_loc_list = pack_tensor(qkv.key, qkv.kv_seq_lens, device=device)
     packed_value, _ = pack_tensor(qkv.value, qkv.kv_seq_lens, device=device)
     return PackedQKVInputs(
-        packed_query, packed_key, packed_value, q_start_loc_list,
+        packed_query,
+        packed_key,
+        packed_value,
+        q_start_loc_list,
         kv_start_loc_list,
         (None if q_start_loc_list is None else qkv.q_seq_lens),
-        qkv.kv_seq_lens)
-
-
-def make_backend(backend_name: str) -> AttentionBackend:
-    '''
-    Construct the backend instance determined by the backend_name string
-    argument.
-
-    Note: at time of writing the Attention wrapper automatically selects
-    its own backend for Attention.forward(); so the backend instance which
-    you generate with this function is not meant to be used for *running*
-    inference, but rather for generating compatible metadata structures
-    using backend.make_metadata()
-
-
-    Returns:
-
-    * Backend instance
-    '''
-    if backend_name == STR_XFORMERS_ATTN_VAL:
-        from vllm.v1.attention.backends.xformers import (
-            XFormersAttentionBackend)
-        return XFormersAttentionBackend()
-    if backend_name == STR_FLASH_ATTN_VAL:
-        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
-        return FlashAttentionBackend()
-    if backend_name == "TRITON_ATTN":
-        from vllm.v1.attention.backends.triton_attn import (
-            TritonAttentionBackend)
-        return TritonAttentionBackend()
-    if backend_name == "FLEX_ATTENTION":
-        from vllm.v1.attention.backends.flex_attention import (
-            FlexAttentionBackend)
-        return FlexAttentionBackend()
-    if backend_name == "TORCH_SDPA":
-        from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
-        return TorchSDPABackend()
-    if backend_name == "FLASHINFER":
-        from vllm.v1.attention.backends.flashinfer import FlashInferBackend
-        return FlashInferBackend()
-
-    raise AssertionError(
-        f"Unrecognized backend_name {backend_name} for unit test")
+        qkv.kv_seq_lens,
+    )
 
 
 def make_alibi_bias(
@@ -565,7 +524,8 @@ def make_alibi_bias(
     attn_biases: list[Any] = []
     num_heads = alibi_slopes.shape[0]
     assert num_heads >= num_kv_heads, (
-        "ALiBi slopes expect at least as many heads as KV heads")
+        "ALiBi slopes expect at least as many heads as KV heads"
+    )
 
     for seq_len in seq_lens:
         bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
@@ -587,13 +547,21 @@ def make_alibi_bias(
 
 
 def _make_metadata_tensors(
-    seq_lens: Optional[list[int]],
-    context_lens: Optional[list[int]],
-    encoder_seq_lens: Optional[list[int]],
-    device: Union[torch.device, str],
-) -> tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor],
-           torch.Tensor, torch.Tensor, Optional[int]]:
-    '''
+    seq_lens: list[int] | None,
+    context_lens: list[int] | None,
+    encoder_seq_lens: list[int] | None,
+    device: torch.device | str,
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    Any,
+    Any,
+    torch.Tensor | None,
+    torch.Tensor,
+    torch.Tensor,
+    int | None,
+]:
+    """
     Build scalar & tensor values required to build attention metadata structure.
 
     Arguments:
@@ -613,48 +581,61 @@ def _make_metadata_tensors(
     * encoder_seq_lens_tensor: encoder seq_lens list, as tensor
     * encoder_seq_start_loc: start idx of each encoder sequence
     * max_encoder_seq_len: encoder seq_lens list, as tensor
-    '''
+    """
     seq_lens_tensor = maybe_make_int_tensor(seq_lens, device)
     context_lens_tensor = maybe_make_int_tensor(context_lens, device)
     max_context_len = maybe_max(context_lens)
     max_seq_len = maybe_max(seq_lens)
 
     encoder_seq_lens_tensor = maybe_make_int_tensor(encoder_seq_lens, device)
-    max_encoder_seq_len = (None if encoder_seq_lens is None else
-                           max(encoder_seq_lens))
+    max_encoder_seq_len = None if encoder_seq_lens is None else max(encoder_seq_lens)
 
     seq_start_loc = None
 
     if seq_lens_tensor is not None:
-        seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
-                                    dtype=torch.int32,
-                                    device=seq_lens_tensor.device)
-        torch.cumsum(seq_lens_tensor,
-                     dim=0,
-                     dtype=seq_start_loc.dtype,
-                     out=seq_start_loc[1:])
-
-    encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1,
-                                        dtype=torch.int32,
-                                        device=encoder_seq_lens_tensor.device)
-    torch.cumsum(encoder_seq_lens_tensor,
-                 dim=0,
-                 dtype=encoder_seq_start_loc.dtype,
-                 out=encoder_seq_start_loc[1:])
-
-    return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len,
-            seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc,
-            max_encoder_seq_len)
-
-
-def make_kv_cache(num_blocks: int,
-                  num_heads: int,
-                  head_size: int,
-                  block_size: int,
-                  device: Union[torch.device, str],
-                  backend: str,
-                  default_val: float = 0.0) -> torch.Tensor:
-    '''
+        seq_start_loc = torch.zeros(
+            seq_lens_tensor.shape[0] + 1,
+            dtype=torch.int32,
+            device=seq_lens_tensor.device,
+        )
+        torch.cumsum(
+            seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, out=seq_start_loc[1:]
+        )
+
+    encoder_seq_start_loc = torch.zeros(
+        encoder_seq_lens_tensor.shape[0] + 1,
+        dtype=torch.int32,
+        device=encoder_seq_lens_tensor.device,
+    )
+    torch.cumsum(
+        encoder_seq_lens_tensor,
+        dim=0,
+        dtype=encoder_seq_start_loc.dtype,
+        out=encoder_seq_start_loc[1:],
+    )
+
+    return (
+        seq_lens_tensor,
+        context_lens_tensor,
+        max_context_len,
+        max_seq_len,
+        seq_start_loc,
+        encoder_seq_lens_tensor,
+        encoder_seq_start_loc,
+        max_encoder_seq_len,
+    )
+
+
+def make_kv_cache(
+    num_blocks: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    device: torch.device | str,
+    backend: str,
+    default_val: float = 0.0,
+) -> torch.Tensor:
+    """
     Create a fake KV cache.
 
     Arguments:
@@ -672,41 +653,46 @@ def make_kv_cache(num_blocks: int,
     *     for backend 'XFORMERS'
     * kv_cache: 2 x num_blocks x block_size x num_heads x head_size
     *     for backend 'FLASH_ATTN'
-    '''
-    if backend == 'XFORMERS':
-        kv_cache = torch.rand(
-            (2, num_blocks, block_size * num_heads * head_size)).to(device)
-    elif backend == 'FLASH_ATTN':
-        kv_cache = torch.rand(
-            (2, num_blocks, block_size, num_heads, head_size)).to(device)
+    """
+    if backend == "XFORMERS":
+        kv_cache = torch.rand((2, num_blocks, block_size * num_heads * head_size)).to(
+            device
+        )
+    elif backend == "FLASH_ATTN":
+        kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to(
+            device
+        )
     else:
         raise ValueError(
-            f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or "
-            f"'FLASH_ATTN'.")
+            f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'."
+        )
     if default_val is not None:
         kv_cache[:, :, :] = default_val
     return kv_cache
 
 
 def _num_tokens_to_min_blocks(num_tokens: int, block_size: int) -> int:
-    '''
+    """
     Compute the minimum number of blocks required to hold num_tokens tokens,
     given block_size
-    '''
+    """
     return (num_tokens + block_size) // block_size
 
 
-def make_empty_slot_mapping_tensor(device: Union[torch.device, str]):
+def make_empty_slot_mapping_tensor(device: torch.device | str):
     return maybe_make_long_tensor([], device)
 
 
-def make_empty_block_tables_tensor(device: Union[torch.device, str]):
+def make_empty_block_tables_tensor(device: torch.device | str):
     return torch.tensor([], device=device)
 
 
-def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
-                       device: Union[torch.device, str]):
-    '''
+def split_slot_mapping(
+    slot_mapping_list: torch.Tensor,
+    seq_lens: list[int],
+    device: torch.device | str,
+):
+    """
     Split a slot mapping into valid prefill- and decode-phase slot mappings.
 
     Context:
@@ -744,28 +730,32 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
       reflecting all N prefill prompts
     * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting
       all N decoded tokens
-    '''
+    """
 
     prefill_slot_mapping = []
     decode_slot_mapping = []
 
     base_idx = 0
     for seq_len in seq_lens:
-        prefill_slot_mapping.extend(slot_mapping_list[base_idx:(base_idx +
-                                                                seq_len - 1)])
+        prefill_slot_mapping.extend(
+            slot_mapping_list[base_idx : (base_idx + seq_len - 1)]
+        )
         decode_slot_mapping.append(slot_mapping_list[base_idx + seq_len - 1])
         base_idx += seq_len
 
-    return (maybe_make_long_tensor(prefill_slot_mapping, device),
-            maybe_make_long_tensor(decode_slot_mapping, device))
+    return (
+        maybe_make_long_tensor(prefill_slot_mapping, device),
+        maybe_make_long_tensor(decode_slot_mapping, device),
+    )
 
 
 def make_block_tables_slot_mapping(
-        block_size: int,
-        seq_lens: list[int],
-        device: Union[torch.device, str],
-        block_base_addr: int = 0) -> tuple[torch.Tensor, list[int], int]:
-    '''
+    block_size: int,
+    seq_lens: list[int],
+    device: torch.device | str,
+    block_base_addr: int = 0,
+) -> tuple[torch.Tensor, list[int], int]:
+    """
     Construct fake block tables & slot mappings.
 
     For a sequence with num_tokens tokens the minimum number
@@ -802,12 +792,11 @@ def make_block_tables_slot_mapping(
     * block_tables_tensor: block table for sequence
     * slot_mapping_list: slot mapping for sequence
     * max_block_idx: the highest block address within this block table
-    '''
+    """
 
     # Provision minimum number of KV cache blocks
     num_blocks_list = [
-        _num_tokens_to_min_blocks(num_tokens, block_size)
-        for num_tokens in seq_lens
+        _num_tokens_to_min_blocks(num_tokens, block_size) for num_tokens in seq_lens
     ]
     max_block_table_len = max(num_blocks_list)
     block_table_pad_tokens = 10
@@ -820,11 +809,11 @@ def make_block_tables_slot_mapping(
     max_block_idx = block_base_idx
     for sdx, num_tokens in enumerate(seq_lens):
         num_blocks = num_blocks_list[sdx]
-        block_table = list(
-            range(block_base_idx, block_base_idx - num_blocks, -1))
+        block_table = list(range(block_base_idx, block_base_idx - num_blocks, -1))
         for idx in range(num_tokens):
-            mapping_value = (
-                idx % block_size) + block_table[idx // block_size] * block_size
+            mapping_value = (idx % block_size) + block_table[
+                idx // block_size
+            ] * block_size
             slot_mapping_list.append(mapping_value)
 
         block_base_idx -= num_blocks
@@ -841,200 +830,10 @@ def make_block_tables_slot_mapping(
     return (block_tables_tensor, slot_mapping_list, max_block_idx)
 
 
-def make_test_metadata(
-    attn_backend: _Backend,
-    is_prompt: bool,
-    seq_lens: Optional[list[int]],
-    decoder_test_params: Optional[PhaseTestParameters],
-    device: Union[torch.device, str],
-    encoder_test_params: Optional[PhaseTestParameters] = None,
-    cross_test_params: Optional[PhaseTestParameters] = None
-) -> AttentionMetadata:
-    '''
-    Construct fake attention metadata for a given test phase
-    (prefill-phase or decode-phase).
-
-    encoder_test_params and cross_test_params arguments allow encoder
-    attention and enc/dec cross-attention (respectively) to use distinct
-    metadata values from decoder self-attention (decoder_test_params.)
-
-    if encoder_test_params and cross_test_params are None, the attention
-    metadata will support decoder-only scenario.
-
-    Assumptions:
-
-    * No chunked prefill -> a batch is 100% prefill or 100% decode, never both
-
-    Arguments:
-
-    * attn_backend_name: Backend for sourcing attention kernels
-    * is_prompt: prefill if True, o/w decode
-    * seq_lens: list of token counts for each sequence
-    * decoder_test_params: decoder self-attention test params;
-                           this function requires
-                           kv_mmap (memory mapping) field
-    * device: CPU or CUDA device
-    * encoder_test_params: encoder attention test params;
-                           this function requires encoder query
-                           sequence lengths field. If None,
-                           encoder query sequence lengths are
-                           treated as None
-    * cross_test_params: enc/dec cross-attention test params;
-                         this function requires kv_mmap field.
-                         If None, KV cache memory map data
-                         structures are treated as None
-
-    Return:
-
-    * AttentionMetadata structure
-    '''
-
-    # Decoder self-attention memory mapping
-    # decoder_test_params is None signals encoder-only
-    # scenario, so kv_mmap is None
-    kv_mmap = (None
-               if decoder_test_params is None else decoder_test_params.kv_mmap)
-
-    # This function constructs metadata assuming no chunked prefill,
-    # i.e. 100% prefill tokens or 100% decode tokens
-    #
-    # - If is_prompt, num_prefills_or_decodes is the number of prefills
-    #   and num_prefill_or_decode_tokens is the number of prefill tokens
-    # - If not is_prompt, num_prefills_or_decodes is the number of decodes
-    #   and num_prefill_or_decode_tokens is the number of decode tokens
-    #
-    # seq_lens is None signals encoder-only
-    # scenario, in which case num_prefills_or_decodes and
-    # num_prefill_or_decode_tokens are unused
-    num_prefills_or_decodes = (None if seq_lens is None else len(seq_lens))
-
-    num_prefill_or_decode_tokens = (None if seq_lens is None else (
-        sum(seq_lens) if is_prompt else len(seq_lens)))
-
-    # Seems for non-prefix-caching scenarios context_lens
-    # is never needed
-    context_lens = None
-
-    if encoder_test_params is None:
-        encoder_seq_lens = None
-        num_encoder_tokens = None
-    else:
-        # Encoder/decoder or encoder-only models only:
-        # * Extract encoder input sequence lengths
-        assert encoder_test_params.packed_qkvo.packed_qkv is not None
-        encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens
-        num_encoder_tokens = (None if encoder_seq_lens is None else
-                              (sum(encoder_seq_lens)))
-
-    if cross_test_params is None:
-        cross_kv_mmap = None
-    else:
-        # Encoder/decoder or encoder-only models only:
-        # * Extract *cross-attention* slot_mapping and block table
-        #   (kv_mmap)
-        cross_kv_mmap = cross_test_params.kv_mmap
-
-    attn_backend_obj = make_backend(attn_backend.name)
-
-    if is_prompt:
-        # Prefill-phase scenario
-
-        num_prefills = num_prefills_or_decodes
-        num_prefill_tokens = num_prefill_or_decode_tokens
-        num_decode_tokens = 0
-
-        (
-            seq_lens_tensor,
-            context_lens_tensor,
-            _,
-            _,
-            seq_start_loc,
-            encoder_seq_lens_tensor,
-            encoder_seq_start_loc,
-            max_encoder_seq_len,
-        ) = _make_metadata_tensors(seq_lens,
-                                   context_lens,
-                                   encoder_seq_lens,
-                                   device=device)
-        return attn_backend_obj.make_metadata(
-            num_prefills=num_prefills,
-            slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
-            enable_kv_scales_calculation=True,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            seq_start_loc=seq_start_loc,
-            max_prefill_seq_len=None if seq_lens is None else max(seq_lens),
-            max_decode_seq_len=0,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=(None if kv_mmap is None else kv_mmap.block_tables),
-            use_cuda_graph=False,
-            num_encoder_tokens=num_encoder_tokens,
-            encoder_seq_lens=encoder_seq_lens,
-            encoder_seq_lens_tensor=encoder_seq_lens_tensor,
-            encoder_seq_start_loc=encoder_seq_start_loc,
-            max_encoder_seq_len=max_encoder_seq_len,
-            cross_slot_mapping=(None if cross_kv_mmap is None else
-                                cross_kv_mmap.slot_mapping),
-            cross_block_tables=(None if cross_kv_mmap is None else
-                                cross_kv_mmap.block_tables))
-
-    else:  # not is_prompt
-        # Decode-phase scenario
-
-        assert kv_mmap is not None
-        assert num_prefill_or_decode_tokens is not None
-        assert seq_lens is not None
-
-        num_prefills = 0
-        num_prefill_tokens = 0
-        num_decode_tokens = num_prefill_or_decode_tokens
-
-        (
-            seq_lens_tensor,
-            context_lens_tensor,
-            _,
-            _,
-            seq_start_loc,
-            encoder_seq_lens_tensor,
-            encoder_seq_start_loc,
-            max_encoder_seq_len,
-        ) = _make_metadata_tensors(seq_lens,
-                                   context_lens,
-                                   encoder_seq_lens,
-                                   device=device)
-
-        return attn_backend_obj.make_metadata(
-            num_prefills=num_prefills,
-            slot_mapping=kv_mmap.slot_mapping,
-            enable_kv_scales_calculation=True,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            seq_start_loc=seq_start_loc,
-            max_prefill_seq_len=0,
-            max_decode_seq_len=max(seq_lens),
-            max_decode_query_len=1,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=kv_mmap.block_tables,
-            use_cuda_graph=False,
-            num_encoder_tokens=num_encoder_tokens,
-            encoder_seq_lens=encoder_seq_lens,
-            encoder_seq_lens_tensor=encoder_seq_lens_tensor,
-            encoder_seq_start_loc=encoder_seq_start_loc,
-            max_encoder_seq_len=max_encoder_seq_len,
-            cross_slot_mapping=(None if cross_kv_mmap is None else
-                                cross_kv_mmap.slot_mapping),
-            cross_block_tables=(None if cross_kv_mmap is None else
-                                cross_kv_mmap.block_tables))
-
-
-def assert_actual_matches_ideal(test_params: PhaseTestParameters,
-                                output_under_test: torch.Tensor,
-                                backend: str) -> None:
-    '''
+def assert_actual_matches_ideal(
+    test_params: PhaseTestParameters, output_under_test: torch.Tensor, backend: str
+) -> None:
+    """
     Assert that observed output matches the ideal output
     contained in the test parameters data structure.
 
@@ -1042,24 +841,24 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
 
     * test_params: Test parameters including packed ideal output
     * output_under_test: actually observed output value
-    '''
+    """
     ideal_output = test_params.packed_qkvo.ideal_output
-    if backend == 'XFORMERS':
-        torch.testing.assert_close(ideal_output,
-                                   output_under_test.view_as(ideal_output))
+    if backend == "XFORMERS":
+        torch.testing.assert_close(
+            ideal_output, output_under_test.view_as(ideal_output)
+        )
 
-    elif backend == 'FLASH_ATTN':
+    elif backend == "FLASH_ATTN":
         # For FlashAttention override the accuracy thresholds to non default
         # values since we notice a higher difference between the ideal and
         # actual output.
-        torch.testing.assert_close(ideal_output,
-                                   output_under_test.view_as(ideal_output),
-                                   atol=0.01,
-                                   rtol=0.016)
+        torch.testing.assert_close(
+            ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016
+        )
     else:
         raise ValueError(
-            f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or "
-            f"'FLASH_ATTN'.")
+            f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'."
+        )
 
 
 # Copied/modified from torch._refs.__init__.py
@@ -1073,19 +872,15 @@ def fp8_allclose(
     """
     Reference implementation of torch.allclose
     """
-    torch._refs._check_close_args(name="torch.allclose",
-                                  a=a,
-                                  b=b,
-                                  rtol=rtol,
-                                  atol=atol)
+    torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol)
 
     return bool(
         torch.all(
-            torch.isclose(a.double(),
-                          b.double(),
-                          rtol=rtol,
-                          atol=atol,
-                          equal_nan=equal_nan)).item())
+            torch.isclose(
+                a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan
+            )
+        ).item()
+    )
 
 
 # Marlin MoE test utils
@@ -1098,7 +893,8 @@ def stack_and_dev(tensors: list[torch.Tensor]):
 
 def compute_max_diff(output, output_ref):
     return torch.mean(torch.abs(output - output_ref)) / torch.mean(
-        torch.abs(output_ref))
+        torch.abs(output_ref)
+    )
 
 
 def torch_experts(
@@ -1108,22 +904,23 @@ def torch_experts(
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
     global_num_experts: int = -1,
-    b_bias1: Optional[torch.Tensor] = None,
-    b_bias2: Optional[torch.Tensor] = None,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    b_bias1: torch.Tensor | None = None,
+    b_bias2: torch.Tensor | None = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant=False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     apply_router_weights_on_input: bool = False,
 ) -> torch.Tensor:
-    assert (global_num_experts == -1
-            or (global_num_experts == w1.shape[0] and expert_map is None)
-            or (expert_map is not None
-                and global_num_experts == expert_map.shape[0]))
+    assert (
+        global_num_experts == -1
+        or (global_num_experts == w1.shape[0] and expert_map is None)
+        or (expert_map is not None and global_num_experts == expert_map.shape[0])
+    )
 
     M, K = a.shape
     topk = topk_ids.shape[1]
@@ -1138,8 +935,9 @@ def torch_experts(
 
     if a1_scale:
         assert not per_act_token_quant and block_shape is None
-    a, a_scale = moe_kernel_quantize_input(a, a1_scale, quant_dtype,
-                                           per_act_token_quant, block_shape)
+    a, a_scale = moe_kernel_quantize_input(
+        a, a1_scale, quant_dtype, per_act_token_quant, block_shape
+    )
 
     num_experts = w1.shape[0]
 
@@ -1159,31 +957,35 @@ def torch_experts(
                 tmp2 = SiluAndMul()(tmp1)
                 out[mask] = tmp2 @ w2[i].transpose(0, 1)
                 if b_bias2 is not None:
-                    out[mask] = out[mask] + b_bias2[i].view(1, -1).to(
-                        tmp1.dtype)
+                    out[mask] = out[mask] + b_bias2[i].view(1, -1).to(tmp1.dtype)
             elif block_shape is not None:
                 # block quantized
-                assert (a_scale is not None and w1_scale is not None
-                        and w2_scale is not None)
-                tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask],
-                                                w1_scale[i], block_shape,
-                                                out.dtype)
+                assert (
+                    a_scale is not None
+                    and w1_scale is not None
+                    and w2_scale is not None
+                )
+                tmp1 = native_w8a8_block_matmul(
+                    a[mask], w1[i], a_scale[mask], w1_scale[i], block_shape, out.dtype
+                )
                 if b_bias1 is not None:
                     tmp1 = tmp1 + b_bias1[i].view(1, -1).to(tmp1.dtype)
                 tmp2 = SiluAndMul()(tmp1)
                 tmp2, b_scale = moe_kernel_quantize_input(
-                    tmp2, a2_scale, quant_dtype, per_act_token_quant,
-                    block_shape)
+                    tmp2, a2_scale, quant_dtype, per_act_token_quant, block_shape
+                )
 
-                out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
-                                                     w2_scale[i], block_shape,
-                                                     out.dtype)
+                out[mask] = native_w8a8_block_matmul(
+                    tmp2, w2[i], b_scale, w2_scale[i], block_shape, out.dtype
+                )
                 if b_bias2 is not None:
-                    out[mask] = out[mask] + b_bias2[i].view(1, -1).to(
-                        tmp1.dtype)
+                    out[mask] = out[mask] + b_bias2[i].view(1, -1).to(tmp1.dtype)
             else:
-                assert (a_scale is not None and w1_scale is not None
-                        and w2_scale is not None)
+                assert (
+                    a_scale is not None
+                    and w1_scale is not None
+                    and w2_scale is not None
+                )
                 scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
 
                 tmp1 = a[mask].to(f32) * scales
@@ -1195,37 +997,50 @@ def torch_experts(
                 tmp2 = SiluAndMul()(tmp1).to(out.dtype)
 
                 tmp2, b_scale = moe_kernel_quantize_input(
-                    tmp2, a2_scale, quant_dtype, per_act_token_quant,
-                    block_shape)
+                    tmp2, a2_scale, quant_dtype, per_act_token_quant, block_shape
+                )
                 assert b_scale is not None
 
                 tmp2 = tmp2.to(f32) * b_scale
                 w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
                 out[mask] = (tmp2 @ w2_dq).to(out.dtype)
                 if b_bias2 is not None:
-                    out[mask] = out[mask] + b_bias2[i].view(1, -1).to(
-                        out.dtype)
+                    out[mask] = out[mask] + b_bias2[i].view(1, -1).to(out.dtype)
 
     if apply_router_weights_on_input:
         return out
     else:
-        return (out.view(M, -1, w2.shape[1]).to(f32) *
-                topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
-
-
-def torch_moe(a: torch.Tensor,
-              w1: torch.Tensor,
-              w2: torch.Tensor,
-              score: torch.Tensor,
-              topk: int,
-              b_bias1: Optional[torch.Tensor] = None,
-              b_bias2: Optional[torch.Tensor] = None,
-              global_num_experts: int = -1,
-              expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return (
+            (out.view(M, -1, w2.shape[1]).to(f32) * topk_weight.view(M, -1, 1))
+            .sum(dim=1)
+            .to(out.dtype)
+        )
+
+
+def torch_moe(
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    score: torch.Tensor,
+    topk: int,
+    b_bias1: torch.Tensor | None = None,
+    b_bias2: torch.Tensor | None = None,
+    global_num_experts: int = -1,
+    expert_map: torch.Tensor | None = None,
+) -> torch.Tensor:
     score = torch.softmax(score, dim=-1, dtype=torch.float32)
     topk_weight, topk_ids = torch.topk(score, topk)
-    return torch_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts,
-                         b_bias1, b_bias2, expert_map)
+    return torch_experts(
+        a,
+        w1,
+        w2,
+        topk_weight,
+        topk_ids,
+        global_num_experts,
+        b_bias1,
+        b_bias2,
+        expert_map,
+    )
 
 
 def torch_moe_single(a, w, score, topk):
@@ -1244,41 +1059,47 @@ def torch_moe_single(a, w, score, topk):
 
 # A special version of op check that has a restricted default set of test_utils
 # and a patched version of allclose that supports fp8 types.
-def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
-                      torch._library.custom_ops.CustomOpDef],
-            args: tuple[Any, ...],
-            kwargs: Optional[dict[str, Any]] = None,
-            *,
-            test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
-            raise_exception: bool = True,
-            cond: bool = True) -> dict[str, str]:
-    with unittest.mock.patch('torch.allclose', new=fp8_allclose):
-        return torch.library.opcheck(
-            op,
-            args,
-            kwargs,
-            test_utils=test_utils,
-            raise_exception=raise_exception) if cond else {}
+def opcheck(
+    op: torch._ops.OpOverload
+    | torch._ops.OpOverloadPacket
+    | torch._library.custom_ops.CustomOpDef,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any] | None = None,
+    *,
+    test_utils: str | Sequence[str] = ALL_OPCHECK_TEST_UTILS,
+    raise_exception: bool = True,
+    cond: bool = True,
+) -> dict[str, str]:
+    with patch("torch.allclose", new=fp8_allclose):
+        return (
+            torch.library.opcheck(
+                op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception
+            )
+            if cond
+            else {}
+        )
 
 
 # For testing quantized linear kernels
 def to_fp8(tensor: torch.Tensor):
     finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
+        dtype=torch.float8_e4m3fn
+    )
 
 
 def to_int8(tensor: torch.Tensor):
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
 
 
-def baseline_scaled_mm(a: torch.Tensor,
-                       b: torch.Tensor,
-                       scale_a: torch.Tensor,
-                       scale_b: torch.Tensor,
-                       out_dtype: type[torch.dtype],
-                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+def baseline_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: type[torch.dtype],
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
     # We treat N-dimensional group scaling as extended numpy-style broadcasting
     # in numpy simply stretches dimensions with an extent of 1 to match
     # the target shape by repeating the data along that dimension (broadcasting)
@@ -1297,16 +1118,19 @@ def baseline_scaled_mm(a: torch.Tensor,
         for i, s in enumerate(shape):
             if t.shape[i] != s and t.shape[i] != 1:
                 assert s % t.shape[i] == 0
-                t = t.unsqueeze(i + 1)\
-                  .expand(*t.shape[:i+1], s // t.shape[i], *t.shape[i+1:])\
-                  .flatten(i, i + 1)
+                t = (
+                    t.unsqueeze(i + 1)
+                    .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :])
+                    .flatten(i, i + 1)
+                )
         return t
 
     scale_a = group_broadcast(scale_a, a.shape)
     scale_b = group_broadcast(scale_b, b.shape)
 
-    output = torch.mm((scale_a * a.to(dtype=torch.float32)),
-                      (scale_b * b.to(dtype=torch.float32))).to(out_dtype)
+    output = torch.mm(
+        (scale_a * a.to(dtype=torch.float32)), (scale_b * b.to(dtype=torch.float32))
+    ).to(out_dtype)
 
     if bias is not None:
         output = output + bias
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index ca2f04dabfc98d434d189fdc4b006ba91ed12fa2..a61ccef700624b2dc537272aa8e2480cc9b65fd2 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -8,8 +8,7 @@ import torch
 from tqdm import tqdm
 
 from vllm.config import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import (
-    SimpleBuffer)
+from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer
 from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
 
 # TODO: the test depends on a lot of fields in the current implementation.
@@ -17,7 +16,6 @@ from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
 
 
 def test_run(my_rank, buffer, device):
-
     # buffer should be empty in the beginning
     if my_rank == 0:
         assert buffer.buffer_size == 0
@@ -27,7 +25,7 @@ def test_run(my_rank, buffer, device):
 
     # insert
     tokens = torch.tensor([1, 2, 3]).to(device)
-    roi = (tokens > 0)
+    roi = tokens > 0
     if my_rank == 0:
         key = 2.0 * torch.ones([5, 6]).to(device)
         value = 3.0 * torch.ones([5, 6]).to(device)
@@ -55,7 +53,6 @@ def test_run(my_rank, buffer, device):
 
 
 def stress_test(my_rank, buf, device):
-
     torch.distributed.barrier()
     torch.manual_seed(100)
 
@@ -66,7 +63,8 @@ def stress_test(my_rank, buf, device):
             torch.rand(100).to(device),  # key
             torch.rand(100).to(device),  # value
             torch.rand(100).to(device),  # hidden
-        ) for i in tqdm(range(200))
+        )
+        for i in tqdm(range(200))
     ]
 
     random.seed(my_rank)
@@ -115,12 +113,11 @@ def stress_test(my_rank, buf, device):
 
 
 if __name__ == "__main__":
-
-    my_rank = int(os.environ['RANK'])
+    my_rank = int(os.environ["RANK"])
 
     torch.distributed.init_process_group(
-        backend='gloo',
-        init_method='tcp://localhost:12398',
+        backend="gloo",
+        init_method="tcp://localhost:12398",
         world_size=2,
         rank=my_rank,
     )
@@ -128,8 +125,8 @@ if __name__ == "__main__":
     print(f"initialized! My rank is {my_rank}")
 
     config = KVTransferConfig(
-        kv_connector='P2pNcclConnector',
-        kv_buffer_device='cuda',
+        kv_connector="P2pNcclConnector",
+        kv_buffer_device="cuda",
         kv_buffer_size=1e9,
         kv_rank=my_rank,
         kv_role="kv_both",  # this arg doesn't matter in this test
@@ -160,4 +157,4 @@ if __name__ == "__main__":
     buffer.close()
     data_pipe.close()
     cpu_pipe.close()
-    print('Done')
+    print("Done")
diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py
index 7a04174870dafaac44c7789108d49f4f8de2ead1..b9a28e4bceb7c242b751f12d0facf6adb7b2a9fd 100644
--- a/tests/kv_transfer/test_module.py
+++ b/tests/kv_transfer/test_module.py
@@ -9,21 +9,19 @@ import torch
 
 
 def run_python_script(script_name, timeout):
-    script_name = f'kv_transfer/{script_name}'
+    script_name = f"kv_transfer/{script_name}"
     try:
         # Start both processes asynchronously using Popen
         process0 = subprocess.Popen(
             [sys.executable, script_name],
-            env={"RANK":
-                 "0"},  # Set the RANK environment variable for process 0
+            env={"RANK": "0"},  # Set the RANK environment variable for process 0
             stdout=sys.stdout,  # Pipe stdout to current stdout
             stderr=sys.stderr,  # Pipe stderr to current stderr
         )
 
         process1 = subprocess.Popen(
             [sys.executable, script_name],
-            env={"RANK":
-                 "1"},  # Set the RANK environment variable for process 1
+            env={"RANK": "1"},  # Set the RANK environment variable for process 1
             stdout=sys.stdout,  # Pipe stdout to current stdout
             stderr=sys.stderr,  # Pipe stderr to current stderr
         )
@@ -34,11 +32,9 @@ def run_python_script(script_name, timeout):
 
         # Check the return status of both processes
         if process0.returncode != 0:
-            pytest.fail(
-                f"Test {script_name} failed for RANK=0, {process0.returncode}")
+            pytest.fail(f"Test {script_name} failed for RANK=0, {process0.returncode}")
         if process1.returncode != 0:
-            pytest.fail(
-                f"Test {script_name} failed for RANK=1, {process1.returncode}")
+            pytest.fail(f"Test {script_name} failed for RANK=1, {process1.returncode}")
 
     except subprocess.TimeoutExpired:
         # If either process times out, terminate both and fail the test
@@ -53,15 +49,14 @@ def run_python_script(script_name, timeout):
 @pytest.mark.parametrize(
     "script_name,timeout",
     [
-        ("test_lookup_buffer.py",
-         60),  # Second test case with a 60-second timeout
-        ("test_send_recv.py", 120)  # First test case with a 120-second timeout
-    ])
+        ("test_lookup_buffer.py", 60),  # Second test case with a 60-second timeout
+        ("test_send_recv.py", 120),  # First test case with a 120-second timeout
+    ],
+)
 def test_run_python_script(script_name, timeout):
     # Check the number of GPUs
     if torch.cuda.device_count() < 2:
-        pytest.skip(
-            f"Skipping test {script_name} because <2 GPUs are available")
+        pytest.skip(f"Skipping test {script_name} because <2 GPUs are available")
 
     # Run the test if there are at least 2 GPUs
     run_python_script(script_name, timeout)
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 99ad2b43aeac8b6af1415a9c3d7f176dfe3c3c47..5762224eff76d5703f2c8f5124922c3d36f1806e 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -15,7 +15,7 @@ def test_run(my_rank, pipe):
     print(f"rank {my_rank} test_run starts....")
     # test run
     x = torch.tensor([1]).to(pipe.device)
-    y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
+    y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device)
     if my_rank == 0:
         pipe.send_tensor(x)
         print(f"rank {my_rank} sent tensor x")
@@ -53,9 +53,8 @@ def stress_test(my_rank, pipe):
     for i in tqdm(range(500)):
         mean = torch.rand(1).item() * 100
         std = torch.rand(1).item() * 100
-        size = torch.randint(900, 1000, (2, ))
-        x = torch.normal(mean * 1.0, std * 1.0,
-                         size=size.tolist()).to(pipe.device)
+        size = torch.randint(900, 1000, (2,))
+        x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device)
 
         # 5% probability of sending a None
         if torch.rand(1).item() < 0.05:
@@ -96,20 +95,16 @@ def latency_test(my_rank, pipe, nelement, ntensor):
     torch.distributed.barrier()
 
     for i in tqdm(range(500)):
-
         tensors = []
 
         if my_rank == 0:
             # create tensor
-            tensors = [
-                torch.rand(nelement).to(pipe.device) for _ in range(ntensor)
-            ]
+            tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)]
 
         torch.distributed.barrier()
 
         if my_rank == 0:
-            t = torch.tensor([time.time()],
-                             dtype=torch.float64).to(pipe.device)
+            t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device)
             for tensor in tensors:
                 pipe.send_tensor(tensor)
             pipe.send_tensor(t)
@@ -121,24 +116,23 @@ def latency_test(my_rank, pipe, nelement, ntensor):
 
     torch.distributed.barrier()
 
-    print('Latency test passed.')
-    print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms')
+    print("Latency test passed.")
+    print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms")
 
 
 if __name__ == "__main__":
-
-    my_rank = int(os.environ['RANK'])
+    my_rank = int(os.environ["RANK"])
 
     torch.distributed.init_process_group(
-        backend='gloo',
-        init_method='tcp://localhost:12398',
+        backend="gloo",
+        init_method="tcp://localhost:12398",
         world_size=2,
         rank=my_rank,
     )
 
     config = KVTransferConfig(
-        kv_connector='P2pNcclConnector',
-        kv_buffer_device='cuda',
+        kv_connector="P2pNcclConnector",
+        kv_buffer_device="cuda",
         kv_buffer_size=1e9,
         kv_rank=my_rank,
         kv_role="kv_both",  # this arg doesn't matter in this test
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index b539a7bf5d76c31303ba837581bae9c60f21ac16..d8ff9339bb49b1d154390b09acdb8d7904af5a3a 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -10,12 +10,16 @@ import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
 
-from vllm.distributed import (cleanup_dist_env_and_memory,
-                              init_distributed_environment,
-                              initialize_model_parallel)
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.distributed import (
+    cleanup_dist_env_and_memory,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.interfaces import SupportsLoRA
@@ -47,11 +51,13 @@ def dist_init():
     if current_platform.is_cpu() or current_platform.is_tpu():
         backend = "gloo"
 
-    init_distributed_environment(world_size=1,
-                                 rank=0,
-                                 distributed_init_method=f"file://{temp_file}",
-                                 local_rank=0,
-                                 backend=backend)
+    init_distributed_environment(
+        world_size=1,
+        rank=0,
+        distributed_init_method=f"file://{temp_file}",
+        local_rank=0,
+        backend=backend,
+    )
     initialize_model_parallel(1, 1)
     yield
     cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -66,10 +72,9 @@ def dist_init_torch_only():
         backend = "gloo"
 
     temp_file = tempfile.mkstemp()[1]
-    torch.distributed.init_process_group(world_size=1,
-                                         rank=0,
-                                         init_method=f"file://{temp_file}",
-                                         backend=backend)
+    torch.distributed.init_process_group(
+        world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
+    )
 
 
 class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@@ -79,24 +84,30 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
 @pytest.fixture
 def dummy_model() -> nn.Module:
     model = DummyLoRAModel(
-        OrderedDict([
-            ("dense1", ColumnParallelLinear(764, 100)),
-            ("dense2", RowParallelLinear(100, 50)),
-            (
-                "layer1",
-                nn.Sequential(
-                    OrderedDict([
-                        ("dense1", ColumnParallelLinear(100, 10)),
-                        ("dense2", RowParallelLinear(10, 50)),
-                    ])),
-            ),
-            ("act2", nn.ReLU()),
-            ("output", ColumnParallelLinear(50, 10)),
-            ("outact", nn.Sigmoid()),
-            # Special handling for lm_head & sampler
-            ("lm_head", ParallelLMHead(512, 10)),
-            ("logits_processor", LogitsProcessor(512)),
-        ]))
+        OrderedDict(
+            [
+                ("dense1", ColumnParallelLinear(764, 100)),
+                ("dense2", RowParallelLinear(100, 50)),
+                (
+                    "layer1",
+                    nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("dense1", ColumnParallelLinear(100, 10)),
+                                ("dense2", RowParallelLinear(10, 50)),
+                            ]
+                        )
+                    ),
+                ),
+                ("act2", nn.ReLU()),
+                ("output", ColumnParallelLinear(50, 10)),
+                ("outact", nn.Sigmoid()),
+                # Special handling for lm_head & sampler
+                ("lm_head", ParallelLMHead(512, 10)),
+                ("logits_processor", LogitsProcessor(512)),
+            ]
+        )
+    )
     model.config = MagicMock()
     model.embedding_modules = {"lm_head": "lm_head"}
     model.unpadded_vocab_size = 32000
@@ -106,24 +117,30 @@ def dummy_model() -> nn.Module:
 @pytest.fixture
 def dummy_model_gate_up() -> nn.Module:
     model = DummyLoRAModel(
-        OrderedDict([
-            ("dense1", ColumnParallelLinear(764, 100)),
-            ("dense2", RowParallelLinear(100, 50)),
-            (
-                "layer1",
-                nn.Sequential(
-                    OrderedDict([
-                        ("dense1", ColumnParallelLinear(100, 10)),
-                        ("dense2", RowParallelLinear(10, 50)),
-                    ])),
-            ),
-            ("act2", nn.ReLU()),
-            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
-            ("outact", nn.Sigmoid()),
-            # Special handling for lm_head & sampler
-            ("lm_head", ParallelLMHead(512, 10)),
-            ("logits_processor", LogitsProcessor(512)),
-        ]))
+        OrderedDict(
+            [
+                ("dense1", ColumnParallelLinear(764, 100)),
+                ("dense2", RowParallelLinear(100, 50)),
+                (
+                    "layer1",
+                    nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("dense1", ColumnParallelLinear(100, 10)),
+                                ("dense2", RowParallelLinear(10, 50)),
+                            ]
+                        )
+                    ),
+                ),
+                ("act2", nn.ReLU()),
+                ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
+                ("outact", nn.Sigmoid()),
+                # Special handling for lm_head & sampler
+                ("lm_head", ParallelLMHead(512, 10)),
+                ("logits_processor", LogitsProcessor(512)),
+            ]
+        )
+    )
     model.config = MagicMock()
     model.packed_modules_mapping = {
         "gate_up_proj": [
@@ -213,6 +230,26 @@ def tinyllama_lora_files():
     return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
 
 
+@pytest.fixture(scope="session")
+def deepseekv2_lora_files():
+    return snapshot_download(repo_id="wuchen01/DeepSeek-V2-Lite-Chat-All-LoRA")
+
+
+@pytest.fixture(scope="session")
+def gptoss20b_lora_files():
+    return snapshot_download(repo_id="jeeejeee/gpt-oss-20b-lora-adapter-text2sql")
+
+
+@pytest.fixture(scope="session")
+def qwen3moe_lora_files():
+    return snapshot_download(repo_id="jeeejeee/qwen3-moe-text2sql-spider")
+
+
+@pytest.fixture(scope="session")
+def olmoe_lora_files():
+    return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider")
+
+
 @pytest.fixture
 def reset_default_device():
     """
diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index 35d0245759154df2d623023b2a175dfcef8716aa..9a82ab99ea9c9c42bac0fe65eaee82ecb5292c89 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -7,11 +7,12 @@ import pytest
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
+    build_async_engine_client_from_engine_args,
+)
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 
 MODEL_PATH = "zai-org/chatglm3-6b"
 LORA_RANK = 64
@@ -26,14 +27,10 @@ def get_lora_requests(lora_path) -> list[LoRARequest]:
     return lora_requests
 
 
-async def requests_processing_time(llm,
-                                   lora_requests: list[LoRARequest]) -> float:
-
-    sampling_params = SamplingParams(n=1,
-                                     temperature=0.0,
-                                     top_p=1.0,
-                                     ignore_eos=True,
-                                     max_tokens=1)
+async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
+    sampling_params = SamplingParams(
+        n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
+    )
 
     generators = []
     start = time.perf_counter()
@@ -41,11 +38,11 @@ async def requests_processing_time(llm,
     for lora_request in lora_requests:
         lora_int_id = lora_request.lora_int_id
         generator = llm.generate(
-            prompt=TextPrompt(prompt=f"hello {lora_int_id}",
-                              multi_modal_data=None),  # type: ignore 
+            prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None),  # type: ignore
             sampling_params=sampling_params,
             lora_request=lora_request,
-            request_id=f"test{lora_int_id}")
+            request_id=f"test{lora_int_id}",
+        )
         generators.append(generator)
 
     all_gens = merge_async_iterators(*generators)
@@ -58,13 +55,13 @@ async def requests_processing_time(llm,
 
 @pytest.mark.asyncio
 async def test_add_lora(chatglm3_lora_files):
-    """ 
+    """
     The add_lora function is used to preload some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
     this functionality, we use the async engine to process some requests - We
     do it twice, once with add_lora() preloading and once without.
 
-    We measure the request processing time in both cases and expect the time 
+    We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
     lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
@@ -78,18 +75,18 @@ async def test_add_lora(chatglm3_lora_files):
         max_loras=max_loras,
         max_lora_rank=LORA_RANK,
         max_model_len=128,
-        gpu_memory_utilization=0.8,  #avoid OOM
+        gpu_memory_utilization=0.8,  # avoid OOM
         trust_remote_code=True,
-        enforce_eager=True)
+        enforce_eager=True,
+    )
 
     # split lora_requests into 3 parts
     part_size = len(lora_requests) // 3
     dummy_run_requests = lora_requests[:part_size]
-    warmup_run_requests = lora_requests[part_size:part_size * 2]
-    cold_run_requests = lora_requests[part_size * 2:]
+    warmup_run_requests = lora_requests[part_size : part_size * 2]
+    cold_run_requests = lora_requests[part_size * 2 :]
 
     async with build_async_engine_client_from_engine_args(engine_args) as llm:
-
         # Dummy run - So any 1-time functionality like triton kernel compilation
         # is complete here.
         await requests_processing_time(llm, dummy_run_requests)
@@ -101,18 +98,16 @@ async def test_add_lora(chatglm3_lora_files):
         # Test that all all_lora calls are successful.
         assert all(add_lora_results)
 
-        time_with_add_lora = await requests_processing_time(
-            llm, warmup_run_requests)
+        time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)
 
         # Run without any warmup
-        time_cold_start = await requests_processing_time(
-            llm, cold_run_requests)
+        time_cold_start = await requests_processing_time(llm, cold_run_requests)
 
-    print(f"time hot-start {time_with_add_lora} vs "
-          f"time cold-start {time_cold_start} ")
+    print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")
 
     assert time_with_add_lora < time_cold_start, (
         f"time_with_add_lora={time_with_add_lora}, "
         f"time_cold_start={time_cold_start}"
         "The engine request processing time with LoRA pre-loading "
-        "must be less than the version that does on-demand LoRA loading.")
+        "must be less than the version that does on-demand LoRA loading."
+    )
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 5cffb8cfcc26d9fed397f68c61b77b275d2b089b..8f42243387d29d411dd98a49075e17543a05ae3a 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import vllm
+import vllm.config
 from vllm.lora.request import LoRARequest
 
 from ..utils import create_new_process_for_each_test, multi_gpu_test
@@ -12,7 +13,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example
 
 EXPECTED_LORA_OUTPUT = [
     "SELECT count(*) FROM singer",
-    "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE country  =  'France'",  # noqa: E501
+    "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE country  =  'France'",
     "SELECT name ,  country ,  age FROM singer ORDER BY age",
 ]
 
@@ -21,20 +22,24 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(
-            query=
-            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+            query=(
+                "What is the average, minimum, and maximum "
+                "age of all singers from France?"
+            )
         ),
         PROMPT_TEMPLATE.format(
-            query=
-            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
+            query=(
+                "Show name, country, age for all singers ordered "
+                "by age from the oldest to the youngest."
+            )
         ),
     ]
     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
     outputs = llm.generate(
         prompts,
         sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -47,13 +52,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 @create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   trust_remote_code=True,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=512,
+        enable_lora=True,
+        max_loras=2,
+        max_num_seqs=16,
+        max_lora_rank=64,
+        trust_remote_code=True,
+    )
 
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -64,17 +71,21 @@ def test_chatglm3_lora(chatglm3_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=False,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=512,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=64,
+        max_num_seqs=16,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=False,
+        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            cudagraph_specialize_lora=False,
+        ),
+    )
 
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -85,21 +96,24 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
     # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
     # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
     # more GPU memory causing vLLM to OOM
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True,
-                   gpu_memory_utilization=0.85)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=512,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=64,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=True,
+        gpu_memory_utilization=0.8,
+        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            cudagraph_specialize_lora=False,
+        ),
+    )
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
         assert output1[i] == EXPECTED_LORA_OUTPUT[i]
diff --git a/tests/lora/test_deepseekv2_tp.py b/tests/lora/test_deepseekv2_tp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3496fa88e6bb0214b5df9e3d567f634fb7860e3
--- /dev/null
+++ b/tests/lora/test_deepseekv2_tp.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# NOTE To avoid overloading the CI pipeline, this test script will
+# not be triggered on CI and is primarily intended for local testing
+# and verification.
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from ..utils import multi_gpu_test
+
+MODEL_PATH = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+
+PROMPT_TEMPLATE = "<｜begin▁of▁sentence｜>You are a helpful assistant.\n\nUser: {context}\n\nAssistant:"  # noqa: E501
+
+
+def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int):
+    prompts = [
+        PROMPT_TEMPLATE.format(context="Who are you?"),
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
+    # Print the outputs.
+    generated_texts: list[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # return generated_texts
+    expected_lora_output = [
+        "I am \u5f20\u5b50\u8c6a, an AI assistant developed by \u9648\u58eb\u680b.",  # noqa: E501
+    ]
+    for i in range(len(expected_lora_output)):
+        assert generated_texts[i].startswith(expected_lora_output[i])
+
+
+def test_deepseekv2_lora(deepseekv2_lora_files):
+    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
+    # Otherwise, the lora-test will fail due to CUDA OOM.
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+    generate_and_test(llm, deepseekv2_lora_files, 1)
+
+
+def test_deepseekv2(deepseekv2_lora_files):
+    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
+    # Otherwise, the lora-test will fail due to CUDA OOM.
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+    )
+    generate_and_test(llm, deepseekv2_lora_files, 1)
+
+
+@multi_gpu_test(num_gpus=2)
+def test_deepseekv2_tp2(deepseekv2_lora_files):
+    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
+    # Otherwise, the lora-test will fail due to CUDA OOM.
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        tensor_parallel_size=2,
+    )
+    generate_and_test(llm, deepseekv2_lora_files, 2)
+
+
+@multi_gpu_test(num_gpus=4)
+def test_deepseekv2_tp4(deepseekv2_lora_files):
+    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
+    # Otherwise, the lora-test will fail due to CUDA OOM.
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        tensor_parallel_size=4,
+    )
+    generate_and_test(llm, deepseekv2_lora_files, 2)
diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py
index f615ceda76b56e96dfaa55f543522c90e37342be..dfc45e78e464f72addd12484ad635bccda6374f6 100644
--- a/tests/lora/test_default_mm_loras.py
+++ b/tests/lora/test_default_mm_loras.py
@@ -30,17 +30,15 @@ VLLM_RUNNER_BASE_KWARGS = {
     "enable_lora": "True",
     "max_num_seqs": 2,
     "max_lora_rank": 320,
-    "max_model_len": 12800,
+    # Keep these LoRA tests on short-RoPE for determinism post-LongRoPE change.
+    "max_model_len": 4096,
     "gpu_memory_utilization": 0.8,
-    "limit_mm_per_prompt": {
-        "audio": 1
-    },
+    "limit_mm_per_prompt": {"audio": 1},
     "enforce_eager": True,
 }
 
 
-def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
-             **kwargs):
+def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
     inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]
 
     # Apply any additional kwargs as overrides to the base kwargs
@@ -53,11 +51,11 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
                 max_tokens=128,
                 audios=audios,
                 lora_request=lora_request,
-            ) for prompts, audios in inputs
+            )
+            for prompts, audios in inputs
         ]
 
-        assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(
-            expected_suffix)
+        assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)
 
 
 def test_active_default_mm_lora(
diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ab4a87c65f85039faad8ba89b94f768f04608e
--- /dev/null
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -0,0 +1,319 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.lora.ops.triton_ops import fused_moe_lora
+from vllm.platforms import current_platform
+
+
+@pytest.fixture(autouse=True)
+def reset_device(reset_default_device):
+    pass
+
+
+def round_up(x, base):
+    return ((x + base - 1) // base) * base
+
+
+def CEILDIV(x, y):
+    return (x + y - 1) // y
+
+
+def assign_loras_to_tokens(num_tokens: int, num_sequences: int, max_loras: int):
+    """
+    Split `num_tokens` into `num_sequences` sequences.
+    Each sequence randomly selects 1 LoRA index from [0, max_loras),
+    and all tokens in that sequence are assigned this LoRA index.
+
+    Args:
+        num_tokens (int): Total number of tokens.
+        num_sequences (int): Number of sequences to split the tokens into.
+        max_loras (int): Total number of available LoRA modules.
+
+    Returns:
+        torch.Tensor: 1D tensor of shape [num_tokens], where each value
+                      is the LoRA index assigned to that token.
+    """
+    assert num_sequences > 0 and max_loras > 0
+    assert num_tokens >= num_sequences, "num_tokens must be >= num_sequences"
+
+    # Compute token distribution per sequence (distribute remainder evenly)
+    tokens_per_seq = num_tokens // num_sequences
+    remainder = num_tokens % num_sequences
+
+    token_lora_mapping = torch.empty(num_tokens, dtype=torch.int32)
+
+    start = 0
+    for seq_idx in range(num_sequences):
+        # Determine the token range for this sequence
+        end = start + tokens_per_seq + (1 if seq_idx < remainder else 0)
+
+        # Randomly select one LoRA ID for this sequence
+        lora_id = random.randint(0, max_loras - 1)
+
+        # Assign the same LoRA ID to all tokens in this sequence
+        token_lora_mapping[start:end] = lora_id
+
+        start = end
+
+    return token_lora_mapping
+
+
+def assign_experts_to_tokens(num_tokens: int, num_experts: int, top_k_num: int):
+    """
+    For each token, randomly select `top_k_num` distinct experts out of `num_experts`,
+    and assign normalized random weights that sum to 1.
+
+    Args:
+        num_tokens (int): Total number of tokens.
+        num_experts (int): Total number of available experts.
+        top_k_num (int): Number of experts to select per token.
+
+    Returns:
+        expert_indices (torch.Tensor): shape [num_tokens, top_k_num],
+                                       expert index for each token.
+        expert_weights (torch.Tensor): shape [num_tokens, top_k_num],
+                                       normalized weights (sum = 1 per row).
+    """
+    assert top_k_num <= num_experts, "top_k_num must be <= num_experts"
+
+    # Randomly select top_k_num distinct experts for each token
+    expert_indices = torch.empty((num_tokens, top_k_num), dtype=torch.int32)
+    for i in range(num_tokens):
+        # Randomly choose unique expert indices
+        selected = torch.randperm(num_experts)[:top_k_num]
+        expert_indices[i] = selected
+
+    # Generate random weights and normalize along dim=1
+    expert_weights = torch.rand((num_tokens, top_k_num), dtype=torch.float32)
+    expert_weights = expert_weights / expert_weights.sum(dim=1, keepdim=True)
+
+    return expert_indices, expert_weights
+
+
+def sample_data(
+    num_tokens: int,
+    num_sequences: int,
+    max_loras: int,
+    num_experts: int,
+    top_k_num: int,
+):
+    topk_ids, topk_weights = assign_experts_to_tokens(
+        num_tokens, num_experts, top_k_num
+    )
+    token_lora_mapping = assign_loras_to_tokens(num_tokens, num_sequences, max_loras)
+    return topk_ids, topk_weights, token_lora_mapping
+
+
+def use_fused_moe_lora_kernel(
+    topk_ids,
+    topk_weights,
+    token_lora_mapping,
+    max_lora_rank,
+    top_k_num,
+    lora_a_stacked,
+    lora_b_stacked,
+    hidden_states,
+    output,
+    max_loras,
+    num_experts,
+    block_size,
+):
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+    max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size)
+
+    # init output tensors
+    sorted_token_ids = torch.empty(
+        (max_loras * max_num_tokens_padded,),
+        dtype=torch.int32,
+    )
+    expert_ids = torch.empty((max_loras * max_num_m_blocks,), dtype=torch.int32)
+    num_tokens_post_padded = torch.empty((max_loras,), dtype=torch.int32)
+    adapter_enabled = torch.ones(max_loras + 1, dtype=torch.int32)
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32)
+
+    # call kernel
+    ops.moe_lora_align_block_size(
+        topk_ids,
+        token_lora_mapping,
+        num_experts,
+        block_size,
+        max_loras,
+        max_num_tokens_padded,
+        max_num_m_blocks,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        adapter_enabled,
+        lora_ids,
+    )
+
+    config = {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "NUM_WARPS": 4,
+        "NUM_STAGES": 3,
+        "SPLIT_K": 1,
+    }
+
+    mul_routed_weight = False
+    expert_ids = expert_ids.view(max_loras, -1)
+    sorted_token_ids = sorted_token_ids.view(max_loras, -1)
+
+    fused_moe_lora(
+        output,
+        hidden_states,
+        lora_a_stacked,
+        lora_b_stacked,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        max_lora_rank,
+        top_k_num,
+        lora_ids,
+        adapter_enabled,
+        config["BLOCK_SIZE_M"],
+        config["BLOCK_SIZE_N"],
+        config["BLOCK_SIZE_K"],
+        config["GROUP_SIZE_M"],
+        config["NUM_WARPS"],
+        config["NUM_STAGES"],
+        config["SPLIT_K"],
+        config["BLOCK_SIZE_M"],
+        config["BLOCK_SIZE_N"],
+        config["BLOCK_SIZE_K"],
+        config["GROUP_SIZE_M"],
+        config["NUM_WARPS"],
+        config["NUM_STAGES"],
+        config["SPLIT_K"],
+        mul_routed_weight,
+    )
+
+    return output
+
+
+def use_torch(
+    hidden_states,
+    token_lora_mapping,
+    topk_ids,
+    lora_a_stacked,
+    lora_b_stacked,
+    top_k_num,
+):
+    outputs = []
+    for i in range(hidden_states.shape[0]):
+        lora_idx = token_lora_mapping[i]
+        expert_ids = topk_ids[i]
+        lora_a = lora_a_stacked[0][lora_idx][expert_ids]
+        lora_b = lora_b_stacked[0][lora_idx][expert_ids]
+        tensors = [
+            hidden_states[i] @ lora_a[x].T @ lora_b[x].T for x in range(top_k_num)
+        ]
+        outputs.append(torch.stack(tensors, dim=0))
+    return torch.stack(outputs, dim=0)
+
+
+DTYPES = [torch.float16, torch.bfloat16]
+DEVICES = [f"cuda:{0}"]
+SEED = [42]
+
+
+@pytest.mark.parametrize("num_tokens", [100])
+@pytest.mark.parametrize("top_k_num", [6, 12])
+@pytest.mark.parametrize("num_experts", [64])
+@pytest.mark.parametrize("max_loras", [4, 6, 16])
+@pytest.mark.parametrize("N", [1408])
+@pytest.mark.parametrize("K", [2048])
+@pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+def test_fused_moe_lora_kernel(
+    num_tokens,
+    top_k_num,
+    num_experts,
+    max_loras,
+    N,
+    K,
+    max_lora_rank,
+    block_size,
+    dtype,
+    device,
+    seed,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+    # the number of randomly generated sentences.
+    num_sequences = 10
+    # generate data
+    topk_ids, topk_weights, token_lora_mapping = sample_data(
+        num_tokens, num_sequences, max_loras, num_experts, top_k_num
+    )
+
+    # init lora weights
+    lora_a_stacked = [
+        torch.rand(
+            (
+                max_loras,
+                num_experts,
+                max_lora_rank,
+                K,
+            ),
+            dtype=dtype,
+        )
+    ]
+    lora_b_stacked = [
+        torch.rand(
+            (
+                max_loras,
+                num_experts,
+                N,
+                max_lora_rank,
+            ),
+            dtype=dtype,
+        )
+    ]
+    hidden_states = torch.rand(
+        (
+            num_tokens,
+            K,
+        ),
+        dtype=dtype,
+    )
+
+    # fused_moe_lora_kernel output
+    output = torch.zeros((num_tokens, top_k_num, N), dtype=dtype)
+    use_fused_moe_lora_kernel(
+        topk_ids,
+        topk_weights,
+        token_lora_mapping,
+        max_lora_rank,
+        top_k_num,
+        lora_a_stacked,
+        lora_b_stacked,
+        hidden_states,
+        output,
+        max_loras,
+        num_experts,
+        block_size,
+    )
+    # pytorch output
+    output2 = use_torch(
+        hidden_states,
+        token_lora_mapping,
+        topk_ids,
+        lora_a_stacked,
+        lora_b_stacked,
+        top_k_num,
+    )
+
+    torch.testing.assert_close(output, output2, atol=1e-1, rtol=1e-1)
diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py
new file mode 100644
index 0000000000000000000000000000000000000000..711d514a39eb3cbcd794dfa83efa5acde3824601
--- /dev/null
+++ b/tests/lora/test_gptoss_tp.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from ..utils import multi_gpu_test
+
+MODEL_PATH = "openai/gpt-oss-20b"
+
+PROMPT_TEMPLATE = """<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
+Knowledge cutoff: 2024-06
+Current date: 2025-10-29
+
+Reasoning: medium
+
+# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
+"
+##Instruction:
+farm contains tables such as city, farm, farm_competition, competition_record. Table city has columns such as City_ID, Official_Name, Status, Area_km_2, Population, Census_Ranking. City_ID is the primary key.
+Table farm has columns such as Farm_ID, Year, Total_Horses, Working_Horses, Total_Cattle, Oxen, Bulls, Cows, Pigs, Sheep_and_Goats. Farm_ID is the primary key.
+Table farm_competition has columns such as Competition_ID, Year, Theme, Host_city_ID, Hosts. Competition_ID is the primary key.
+Table competition_record has columns such as Competition_ID, Farm_ID, Rank. Competition_ID is the primary key.
+The Host_city_ID of farm_competition is the foreign key of City_ID of city.
+The Farm_ID of competition_record is the foreign key of Farm_ID of farm.
+The Competition_ID of competition_record is the foreign key of Competition_ID of farm_competition.
+
+
+###Input:
+{context}
+
+###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""  # noqa: E501
+
+EXPECTED_LORA_OUTPUT = [
+    "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
+    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
+    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
+]
+
+
+def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
+    prompts = [
+        PROMPT_TEMPLATE.format(
+            context="Give the average number of working horses on farms with more than 5000 total horses."  # noqa: E501
+        ),  # noqa: E501
+        PROMPT_TEMPLATE.format(
+            context="What are the maximum and minimum number of cows across all farms."
+        ),
+        PROMPT_TEMPLATE.format(
+            context="Return the maximum and minimum number of cows across all farms."
+        ),
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
+    # Print the outputs.
+    generated_texts: list[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
+
+
+def test_gpt_oss_lora(gptoss20b_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=8,
+        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            cudagraph_specialize_lora=False,
+        ),
+    )
+
+    generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+    generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
+
+
+@multi_gpu_test(num_gpus=2)
+def test_gpt_oss_lora_tp2(gptoss20b_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        tensor_parallel_size=2,
+        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            cudagraph_specialize_lora=False,
+        ),
+    )
+
+    generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+    generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index ced0afc50cb91e353cf91854bf64aaecf7b44297..8f18f0144193230a59dc18d85194e1a0bfb3036c 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -4,7 +4,6 @@
 import random
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional
 from unittest.mock import patch
 
 import pytest
@@ -12,32 +11,38 @@ import torch
 import torch.nn.functional as F
 
 from vllm.config.lora import LoRAConfig
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              ColumnParallelLinearWithShardedLoRA,
-                              LogitsProcessorWithLoRA, LoRAMapping,
-                              MergedColumnParallelLinearWithLoRA,
-                              MergedColumnParallelLinearWithShardedLoRA,
-                              MergedQKVParallelLinearWithLoRA,
-                              MergedQKVParallelLinearWithShardedLoRA,
-                              QKVParallelLinearWithLoRA,
-                              QKVParallelLinearWithShardedLoRA,
-                              ReplicatedLinearWithLoRA,
-                              RowParallelLinearWithLoRA,
-                              RowParallelLinearWithShardedLoRA,
-                              VocabParallelEmbeddingWithLoRA)
-# yapf: enable
+from vllm.lora.layers import (
+    BaseLayerWithLoRA,
+    ColumnParallelLinearWithLoRA,
+    ColumnParallelLinearWithShardedLoRA,
+    LogitsProcessorWithLoRA,
+    LoRAMapping,
+    MergedColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithShardedLoRA,
+    MergedQKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithShardedLoRA,
+    QKVParallelLinearWithLoRA,
+    QKVParallelLinearWithShardedLoRA,
+    ReplicatedLinearWithLoRA,
+    RowParallelLinearWithLoRA,
+    RowParallelLinearWithShardedLoRA,
+    VocabParallelEmbeddingWithLoRA,
+)
 from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+    get_masked_input_and_mask,
+)
 from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import current_platform
 
@@ -51,11 +56,14 @@ TOLERANCES = {
 
 pytestmark = pytest.mark.skipif(
     not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
-    reason="Backend not supported")
+    reason="Backend not supported",
+)
 
-DEVICES = ([
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-] if current_platform.is_cuda_alike() else ["cpu"])
+DEVICES = (
+    [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+    if current_platform.is_cuda_alike()
+    else ["cpu"]
+)
 
 # prefill stage(True) or decode stage(False)
 STAGES = [True, False]
@@ -68,8 +76,8 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 2
 @pytest.fixture(autouse=True)
 def clean_cache_reset_device(reset_default_device):
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
-    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
-                                                _LORA_B_PTR_DICT)
+    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+
     _LORA_B_PTR_DICT.clear()
     _LORA_A_PTR_DICT.clear()
 
@@ -79,13 +87,14 @@ def clean_cache_reset_device(reset_default_device):
 @pytest.fixture(autouse=True)
 def skip_cuda_with_stage_false(request):
     """
-    On cuda-like platforms, we use the same kernels for prefill and decode 
+    On cuda-like platforms, we use the same kernels for prefill and decode
     stage, and 'stage' is generally ignored, so we only need to test once.
     """
     if current_platform.is_cuda_alike():
         try:
             if hasattr(request.node, "callspec") and hasattr(
-                    request.node.callspec, "params"):
+                request.node.callspec, "params"
+            ):
                 params = request.node.callspec.params
                 if "stage" in params and params["stage"] is False:
                     pytest.skip("Skip test when stage=False")
@@ -94,9 +103,9 @@ def skip_cuda_with_stage_false(request):
     yield
 
 
-def get_random_id_to_index(num_loras: int,
-                           num_slots: int,
-                           log: bool = True) -> list[Optional[int]]:
+def get_random_id_to_index(
+    num_loras: int, num_slots: int, log: bool = True
+) -> list[int | None]:
     """Creates a random lora_id_to_index mapping.
 
     Args:
@@ -109,9 +118,10 @@ def get_random_id_to_index(num_loras: int,
     if num_loras > num_slots:
         raise ValueError(
             f"num_loras is higher than num_slots: {num_loras} > {num_slots}. "
-            "num_loras must be less than or equal to num_slots.")
+            "num_loras must be less than or equal to num_slots."
+        )
 
-    slots: list[Optional[int]] = [None] * num_slots
+    slots: list[int | None] = [None] * num_slots
     random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist()
     for lora_id, slot_idx in enumerate(random_slot_selections, start=1):
         slots[slot_idx] = lora_id
@@ -123,7 +133,7 @@ def get_random_id_to_index(num_loras: int,
 
 
 def populate_loras(
-    id_to_index: list[Optional[int]],
+    id_to_index: list[int | None],
     layer: BaseLayerWithLoRA,
     layer_weights: torch.Tensor,
     generate_embeddings_tensor: int = 0,
@@ -158,19 +168,18 @@ def populate_loras(
             subloras: list[LoRALayerWeights] = []
             sublora_len = layer_weights.shape[0] // repeats
             for i in range(repeats):
-                sublora = DummyLoRAManager(
-                    layer_weights.device).init_random_lora(
-                        module_name=f"fake_{i}",
-                        weight=layer_weights,
-                        generate_embeddings_tensor=generate_embeddings_tensor,
-                    )
-                sublora.lora_b = sublora.lora_b[(sublora_len *
-                                                 i):(sublora_len * (i + 1)), :]
+                sublora = DummyLoRAManager(layer_weights.device).init_random_lora(
+                    module_name=f"fake_{i}",
+                    weight=layer_weights,
+                    generate_embeddings_tensor=generate_embeddings_tensor,
+                )
+                sublora.lora_b = sublora.lora_b[
+                    (sublora_len * i) : (sublora_len * (i + 1)), :
+                ]
                 sublora.optimize()
                 subloras.append(sublora)
 
-            lora = PackedLoRALayerWeights.pack(
-                subloras) if repeats > 1 else subloras[0]
+            lora = PackedLoRALayerWeights.pack(subloras) if repeats > 1 else subloras[0]
 
             layer.set_lora(
                 slot_idx,
@@ -191,7 +200,7 @@ def create_random_inputs(
     input_size: tuple[int, ...],
     input_range: tuple[float, float],
     input_type: torch.dtype = torch.int,
-    device: torch.device = "cuda"
+    device: torch.device = "cuda",
 ) -> tuple[list[torch.Tensor], list[int], list[int]]:
     """Creates random inputs.
 
@@ -213,14 +222,15 @@ def create_random_inputs(
     for _ in range(num_inputs):
         if input_type == torch.int:
             inputs.append(
-                torch.randint(low=int(low),
-                              high=int(high),
-                              size=input_size,
-                              device=device))
+                torch.randint(
+                    low=int(low), high=int(high), size=input_size, device=device
+                )
+            )
         else:
             inputs.append(
-                torch.rand(size=input_size, dtype=input_type, device=device) *
-                high + low)
+                torch.rand(size=input_size, dtype=input_type, device=device) * high
+                + low
+            )
 
         lora_id = random.choice(active_lora_ids)
         index_mapping += [lora_id] * input_size[0]
@@ -258,9 +268,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
     assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             lora_dtype=torch.float16)
+    lora_config = LoRAConfig(
+        max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
+    )
 
     def create_random_embedding_layer():
         embedding = VocabParallelEmbedding(vocab_size, 256)
@@ -286,15 +296,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
         inputs, index_mapping, prompt_mapping = create_random_inputs(
             active_lora_ids=list(lora_dict.keys()),
             num_inputs=num_loras * 3,
-            input_size=(200, ),
+            input_size=(200,),
             input_range=(1, vocab_size),
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
-        punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras,
-                                       vocab_size,
-                                       lora_config.lora_extra_vocab_size)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
+        punica_wrapper.update_metadata(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            vocab_size,
+            lora_config.lora_extra_vocab_size,
+        )
 
         lora_result = lora_embedding(torch.cat(inputs))
 
@@ -306,15 +319,12 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
                 input_,
                 lora.lora_a.T,
             )
-            result += (after_a @ lora.lora_b.T)
+            result += after_a @ lora.lora_b.T
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -324,24 +334,24 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
         inputs, index_mapping, prompt_mapping = create_random_inputs(
             active_lora_ids=[0],
             num_inputs=num_loras * 3,
-            input_size=(200, ),
+            input_size=(200,),
             input_range=(1, vocab_size),
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
-        punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras,
-                                       vocab_size,
-                                       lora_config.lora_extra_vocab_size)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
+        punica_wrapper.update_metadata(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            vocab_size,
+            lora_config.lora_extra_vocab_size,
+        )
 
         lora_result = lora_embedding(torch.cat(inputs))
         expected_result = embedding(torch.cat(inputs))
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
 
 @torch.inference_mode()
@@ -351,9 +361,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
-def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
-                                        vocab_size, stage) -> None:
-
+def test_embeddings_with_new_embeddings(
+    dist_init, num_loras, device, vocab_size, stage
+) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
@@ -361,9 +371,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
     assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             lora_dtype=torch.float16)
+    lora_config = LoRAConfig(
+        max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
+    )
 
     def create_random_embedding_layer():
         embedding = VocabParallelEmbedding(vocab_size, 256)
@@ -373,12 +383,12 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
         expanded_embedding = VocabParallelEmbedding(
             vocab_size + lora_config.lora_extra_vocab_size * max_loras,
             256,
-            org_num_embeddings=vocab_size)
+            org_num_embeddings=vocab_size,
+        )
         expanded_embedding.weight.data[:vocab_size, :] = embedding_data
         # We need to deepcopy the embedding as it will be modified
         # in place
-        lora_embedding = VocabParallelEmbeddingWithLoRA(
-            deepcopy(expanded_embedding))
+        lora_embedding = VocabParallelEmbeddingWithLoRA(deepcopy(expanded_embedding))
         lora_embedding.create_lora_weights(max_loras, lora_config)
 
         return expanded_embedding, lora_embedding
@@ -392,7 +402,8 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
             id_to_index,
             layer=lora_embedding,
             layer_weights=torch.zeros(
-                (256, vocab_size + lora_config.lora_extra_vocab_size)),
+                (256, vocab_size + lora_config.lora_extra_vocab_size)
+            ),
             generate_embeddings_tensor=256,
         )
 
@@ -410,52 +421,53 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
         inputs, index_mapping, prompt_mapping = create_random_inputs(
             active_lora_ids=list(lora_dict.keys()),
             num_inputs=num_loras * 3,
-            input_size=(200, ),
+            input_size=(200,),
             input_range=(1, vocab_size),
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
-        punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras,
-                                       vocab_size,
-                                       lora_config.lora_extra_vocab_size)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
+        punica_wrapper.update_metadata(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            vocab_size,
+            lora_config.lora_extra_vocab_size,
+        )
         original_inputs = deepcopy(inputs)
 
         # Force some of the inputs to be in the extended embeddings range
         # to guarantee that their behavior is tested.
-        for input_, original_input_, lora_id in zip(inputs, original_inputs,
-                                                    prompt_mapping):
+        for input_, original_input_, lora_id in zip(
+            inputs, original_inputs, prompt_mapping
+        ):
             embedding_id = lora_id - 1
             input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len)
             original_input_[-1] = vocab_size
-            input_[-2] = vocab_size + (
-                (embedding_id + 1) * embeddings_tensor_len - 1)
+            input_[-2] = vocab_size + ((embedding_id + 1) * embeddings_tensor_len - 1)
             original_input_[-2] = vocab_size + embeddings_tensor_len - 1
 
-        expanded_embedding.weight[vocab_size:vocab_size +
-                                  (embeddings_tensor_len *
-                                   max_loras)] = torch.cat(embeddings_tensors)
+        expanded_embedding.weight[
+            vocab_size : vocab_size + (embeddings_tensor_len * max_loras)
+        ] = torch.cat(embeddings_tensors)
 
         lora_result = lora_embedding(torch.cat(original_inputs))
 
         expected_results: list[torch.Tensor] = []
-        for input_, original_input_, lora_id in zip(inputs, original_inputs,
-                                                    prompt_mapping):
+        for input_, original_input_, lora_id in zip(
+            inputs, original_inputs, prompt_mapping
+        ):
             lora = lora_dict[lora_id]
             result = expanded_embedding(input_)
             after_a = F.embedding(
                 original_input_,
                 lora.lora_a.T,
             )
-            result += (after_a @ lora.lora_b.T)
+            result += after_a @ lora.lora_b.T
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -465,24 +477,24 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
         inputs, index_mapping, prompt_mapping = create_random_inputs(
             active_lora_ids=[0],
             num_inputs=num_loras * 3,
-            input_size=(200, ),
+            input_size=(200,),
             input_range=(1, vocab_size),
-            device=device)
+            device=device,
+        )
         original_inputs = deepcopy(inputs)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
-        punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras,
-                                       vocab_size,
-                                       lora_config.lora_extra_vocab_size)
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
+        punica_wrapper.update_metadata(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            vocab_size,
+            lora_config.lora_extra_vocab_size,
+        )
         lora_result = lora_embedding(torch.cat(original_inputs))
         expected_result = expanded_embedding(torch.cat(inputs))
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
 
 @torch.inference_mode()
@@ -490,9 +502,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
-def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
-                                  stage) -> None:
-
+def test_lm_head_logits_processor(
+    dist_init, num_loras, device, vocab_size, stage
+) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
@@ -500,22 +512,25 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
     assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             lora_dtype=torch.float16)
+    lora_config = LoRAConfig(
+        max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
+    )
 
     def _pretest():
-        linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
-                                1024,
-                                vocab_size,
-                                params_dtype=torch.float16)
+        linear = ParallelLMHead(
+            vocab_size + lora_config.lora_extra_vocab_size,
+            1024,
+            vocab_size,
+            params_dtype=torch.float16,
+        )
         linear.weight.data = torch.rand_like(linear.weight.data)
         linear.weight.data[:, vocab_size:] = 0
         logits_processor = LogitsProcessor(
-            vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
+            vocab_size + lora_config.lora_extra_vocab_size, vocab_size
+        )
         lora_logits_processor = LogitsProcessorWithLoRA(
-            logits_processor, 1024, linear.weight.dtype, linear.weight.device,
-            None)
+            logits_processor, 1024, linear.weight.dtype, linear.weight.device, None
+        )
         lora_logits_processor.create_lora_weights(max_loras, lora_config)
 
         return linear, logits_processor, lora_logits_processor
@@ -542,10 +557,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
             input_size=(1, 1024),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
         punica_wrapper.update_metadata(
             lora_mapping,
             id_to_index,
@@ -556,25 +570,24 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
         input_ = torch.rand(20, 1024)
 
         lora_result = lora_logits_processor._get_logits(
-            hidden_states=torch.cat(inputs),
-            lm_head=linear,
-            embedding_bias=None)
+            hidden_states=torch.cat(inputs), lm_head=linear, embedding_bias=None
+        )
 
         original_lm_head = deepcopy(linear)
 
-        linear.weight[logits_processor.
-                      org_vocab_size:logits_processor.org_vocab_size +
-                      embeddings_tensor_len] = embeddings_tensor
+        linear.weight[
+            logits_processor.org_vocab_size : logits_processor.org_vocab_size
+            + embeddings_tensor_len
+        ] = embeddings_tensor
 
-        logits_processor.org_vocab_size = (vocab_size +
-                                           lora_config.lora_extra_vocab_size)
+        logits_processor.org_vocab_size = vocab_size + lora_config.lora_extra_vocab_size
         expected_results: list[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
             lora = lora_dict[lora_id]
-            result = logits_processor._get_logits(hidden_states=input_,
-                                                  lm_head=linear,
-                                                  embedding_bias=None)
-            result[:, vocab_size + embeddings_tensor_len:] = float("-inf")
+            result = logits_processor._get_logits(
+                hidden_states=input_, lm_head=linear, embedding_bias=None
+            )
+            result[:, vocab_size + embeddings_tensor_len :] = float("-inf")
             result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
@@ -591,10 +604,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
             input_size=(1, 1024),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
         punica_wrapper.update_metadata(
             lora_mapping,
             id_to_index,
@@ -606,17 +618,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
         lora_result = lora_logits_processor._get_logits(
             hidden_states=torch.cat(inputs),
             lm_head=original_lm_head,
-            embedding_bias=None)[:, :vocab_size]
+            embedding_bias=None,
+        )[:, :vocab_size]
         expected_result = logits_processor._get_logits(
             hidden_states=torch.cat(inputs),
             lm_head=original_lm_head,
-            embedding_bias=None)
+            embedding_bias=None,
+        )
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
 
 @torch.inference_mode()
@@ -629,7 +640,6 @@ def test_linear_replicated(
     device,
     stage,
 ) -> None:
-
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
@@ -644,17 +654,17 @@ def test_linear_replicated(
     )
 
     def create_random_linear_replicated_layer():
-
-        linear = ReplicatedLinear(4096,
-                                  4096,
-                                  bias=False,
-                                  params_dtype=torch.float16)
+        linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
         linear.weight.data = torch.rand_like(linear.weight.data)
         lora_linear = ReplicatedLinearWithLoRA(linear)
 
         lora_linear.create_lora_weights(max_loras, lora_config)
-        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
-            lora_linear.lora_b_stacked) == 1)
+        assert (
+            lora_linear.n_slices
+            == len(lora_linear.lora_a_stacked)
+            == len(lora_linear.lora_b_stacked)
+            == 1
+        )
         return linear, lora_linear
 
     for i in range(NUM_RANDOM_SEEDS):
@@ -676,10 +686,9 @@ def test_linear_replicated(
             input_size=(1, 4096),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
         punica_wrapper.update_metadata(
             lora_mapping,
             id_to_index,
@@ -692,7 +701,6 @@ def test_linear_replicated(
 
         expected_results: list[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
-
             lora = lora_dict[lora_id]
             result = linear(input_)[0]
             result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
@@ -700,10 +708,7 @@ def test_linear_replicated(
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -716,22 +721,19 @@ def test_linear_replicated(
             input_size=(1, 4096),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
 
-        punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras,
-                                       512, lora_config.lora_extra_vocab_size)
+        punica_wrapper.update_metadata(
+            lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size
+        )
 
         lora_result = lora_linear(torch.cat(inputs))[0]
         expected_result = linear(torch.cat(inputs))[0]
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
 
 @torch.inference_mode()
@@ -740,9 +742,9 @@ def test_linear_replicated(
 @pytest.mark.parametrize("fully_shard", [True, False])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
-                         device, stage) -> None:
-
+def test_linear_parallel(
+    dist_init, num_loras, orientation, fully_shard, device, stage
+) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
@@ -759,25 +761,32 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
 
     def create_random_linear_parallel_layer():
         if orientation == "row":
-            linear = RowParallelLinear(4096,
-                                       4096,
-                                       bias=False,
-                                       params_dtype=torch.float16)
+            linear = RowParallelLinear(
+                4096, 4096, bias=False, params_dtype=torch.float16
+            )
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard
-                           else RowParallelLinearWithShardedLoRA(linear))
+            lora_linear = (
+                RowParallelLinearWithLoRA(linear)
+                if not fully_shard
+                else RowParallelLinearWithShardedLoRA(linear)
+            )
         else:
-            linear = ColumnParallelLinear(4096,
-                                          4096,
-                                          bias=False,
-                                          params_dtype=torch.float16)
+            linear = ColumnParallelLinear(
+                4096, 4096, bias=False, params_dtype=torch.float16
+            )
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = (ColumnParallelLinearWithLoRA(linear)
-                           if not fully_shard else
-                           ColumnParallelLinearWithShardedLoRA(linear))
+            lora_linear = (
+                ColumnParallelLinearWithLoRA(linear)
+                if not fully_shard
+                else ColumnParallelLinearWithShardedLoRA(linear)
+            )
         lora_linear.create_lora_weights(max_loras, lora_config)
-        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
-            lora_linear.lora_b_stacked) == 1)
+        assert (
+            lora_linear.n_slices
+            == len(lora_linear.lora_a_stacked)
+            == len(lora_linear.lora_b_stacked)
+            == 1
+        )
 
         return linear, lora_linear
 
@@ -800,10 +809,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
             input_size=(1, 4096),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
         punica_wrapper.update_metadata(
             lora_mapping,
             id_to_index,
@@ -823,10 +831,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -839,22 +844,19 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
             input_size=(1, 4096),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
 
-        punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras,
-                                       512, lora_config.lora_extra_vocab_size)
+        punica_wrapper.update_metadata(
+            lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size
+        )
 
         lora_result = lora_linear(torch.cat(inputs))[0]
         expected_result = linear(torch.cat(inputs))[0]
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
 
 @torch.inference_mode()
@@ -863,9 +865,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
 @pytest.mark.parametrize("fully_shard", [True, False])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
-                                device, stage) -> None:
-
+def test_column_parallel_packed(
+    dist_init, num_loras, repeats, fully_shard, device, stage
+) -> None:
     if current_platform.is_cuda_alike():
         torch.cuda.set_device(device)
 
@@ -882,33 +884,35 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
 
     def create_column_parallel_packed_layer():
         if repeats == 2:
-            linear = MergedColumnParallelLinear(4096, [4096] * repeats,
-                                                bias=False,
-                                                params_dtype=torch.float16)
+            linear = MergedColumnParallelLinear(
+                4096, [4096] * repeats, bias=False, params_dtype=torch.float16
+            )
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = (MergedColumnParallelLinearWithLoRA(linear)
-                           if not fully_shard else
-                           MergedColumnParallelLinearWithShardedLoRA(linear))
+            lora_linear = (
+                MergedColumnParallelLinearWithLoRA(linear)
+                if not fully_shard
+                else MergedColumnParallelLinearWithShardedLoRA(linear)
+            )
         elif repeats == 3:
-            linear = QKVParallelLinear(4096,
-                                       64,
-                                       32,
-                                       bias=False,
-                                       params_dtype=torch.float16)
+            linear = QKVParallelLinear(
+                4096, 64, 32, bias=False, params_dtype=torch.float16
+            )
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = (MergedQKVParallelLinearWithLoRA(linear)
-                           if not fully_shard else
-                           MergedQKVParallelLinearWithShardedLoRA(linear))
+            lora_linear = (
+                MergedQKVParallelLinearWithLoRA(linear)
+                if not fully_shard
+                else MergedQKVParallelLinearWithShardedLoRA(linear)
+            )
         else:
-            linear = QKVParallelLinear(4096,
-                                       64,
-                                       32,
-                                       bias=False,
-                                       params_dtype=torch.float16)
+            linear = QKVParallelLinear(
+                4096, 64, 32, bias=False, params_dtype=torch.float16
+            )
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = QKVParallelLinearWithLoRA(
-                linear
-            ) if not fully_shard else QKVParallelLinearWithShardedLoRA(linear)
+            lora_linear = (
+                QKVParallelLinearWithLoRA(linear)
+                if not fully_shard
+                else QKVParallelLinearWithShardedLoRA(linear)
+            )
 
         @dataclass
         class FakeConfig:
@@ -917,11 +921,15 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
             num_attention_heads = 32
 
         n_slices = repeats
-        lora_linear.create_lora_weights(max_loras,
-                                        lora_config,
-                                        model_config=FakeConfig())
-        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
-            lora_linear.lora_b_stacked) == n_slices)
+        lora_linear.create_lora_weights(
+            max_loras, lora_config, model_config=FakeConfig()
+        )
+        assert (
+            lora_linear.n_slices
+            == len(lora_linear.lora_a_stacked)
+            == len(lora_linear.lora_b_stacked)
+            == n_slices
+        )
 
         return linear, lora_linear
 
@@ -946,10 +954,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
             input_size=(1, 4096),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
 
         punica_wrapper.update_metadata(
             lora_mapping,
@@ -966,18 +973,14 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
             result = linear(input_)[0]
             subloras = sublora_dict[lora_id]
             for i, sublora in enumerate(subloras):
-                result[:, sublora.lora_b.shape[0] * i:sublora.lora_b.shape[0] *
-                       (i + 1)] += (
-                           input_ @ sublora.lora_a.T @ sublora.lora_b.T *
-                           sublora.scaling)
+                result[
+                    :, sublora.lora_b.shape[0] * i : sublora.lora_b.shape[0] * (i + 1)
+                ] += input_ @ sublora.lora_a.T @ sublora.lora_b.T * sublora.scaling
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
         for slot_idx in range(max_loras):
             lora_linear.reset_lora(slot_idx)
@@ -988,10 +991,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
             input_size=(1, 4096),
             input_range=(0, 1),
             input_type=torch.float16,
-            device=device)
-        lora_mapping = LoRAMapping(index_mapping,
-                                   prompt_mapping,
-                                   is_prefill=stage)
+            device=device,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
 
         punica_wrapper.update_metadata(
             lora_mapping,
@@ -1005,15 +1007,13 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
         expected_result = linear(torch.cat(inputs))[0]
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        torch.testing.assert_close(lora_result,
-                                   expected_result,
-                                   rtol=rtol,
-                                   atol=atol)
+        torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
 
 
 @pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
 @pytest.mark.parametrize(
-    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))
+    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
+)
 def test_vocab_parallel_embedding_indices(tp_size, seed):
     random.seed(seed)
     vocab_size = random.randint(4000, 64000)
@@ -1031,20 +1031,24 @@ def test_vocab_parallel_embedding_indices(tp_size, seed):
     token_ids: list[int] = []
 
     for tp_rank in range(tp_size):
-        with patch(
+        with (
+            patch(
                 "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
-                return_value=tp_rank
-        ), patch(
+                return_value=tp_rank,
+            ),
+            patch(
                 "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
-                return_value=tp_size):
+                return_value=tp_size,
+            ),
+        ):
             vocab_embedding = VocabParallelEmbedding(
-                vocab_size, 1, org_num_embeddings=org_vocab_size)
+                vocab_size, 1, org_num_embeddings=org_vocab_size
+            )
         vocab_size_padded = vocab_embedding.num_embeddings_padded
         shard_indices = vocab_embedding.shard_indices
         # Assert that the ranges are contiguous
         assert shard_indices.org_vocab_start_index == last_org_vocab_end_index
-        assert (shard_indices.added_vocab_start_index ==
-                last_added_vocab_end_index)
+        assert shard_indices.added_vocab_start_index == last_added_vocab_end_index
 
         # Ensure that we are not exceeding the vocab size
         computed_vocab_size += shard_indices.num_elements_padded
@@ -1053,22 +1057,39 @@ def test_vocab_parallel_embedding_indices(tp_size, seed):
 
         # Ensure that the ranges are not overlapping
         all_org_tokens.extend(
-            range(shard_indices.org_vocab_start_index,
-                  shard_indices.org_vocab_end_index))
+            range(
+                shard_indices.org_vocab_start_index, shard_indices.org_vocab_end_index
+            )
+        )
         all_added_tokens.extend(
-            range(shard_indices.added_vocab_start_index,
-                  shard_indices.added_vocab_end_index))
+            range(
+                shard_indices.added_vocab_start_index,
+                shard_indices.added_vocab_end_index,
+            )
+        )
 
         token_ids.extend(
-            range(shard_indices.org_vocab_start_index,
-                  shard_indices.org_vocab_end_index))
-        token_ids.extend([-1] * (shard_indices.num_org_elements_padded -
-                                 shard_indices.num_org_elements))
+            range(
+                shard_indices.org_vocab_start_index, shard_indices.org_vocab_end_index
+            )
+        )
+        token_ids.extend(
+            [-1]
+            * (shard_indices.num_org_elements_padded - shard_indices.num_org_elements)
+        )
+        token_ids.extend(
+            range(
+                shard_indices.added_vocab_start_index,
+                shard_indices.added_vocab_end_index,
+            )
+        )
         token_ids.extend(
-            range(shard_indices.added_vocab_start_index,
-                  shard_indices.added_vocab_end_index))
-        token_ids.extend([-1] * (shard_indices.num_added_elements_padded -
-                                 shard_indices.num_added_elements))
+            [-1]
+            * (
+                shard_indices.num_added_elements_padded
+                - shard_indices.num_added_elements
+            )
+        )
 
         last_org_vocab_end_index = shard_indices.org_vocab_end_index
         last_added_vocab_end_index = shard_indices.added_vocab_end_index
@@ -1096,130 +1117,165 @@ def test_get_masked_input_and_mask():
     x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
 
     # base tp 1 case, no padding
-    modified_x, _ = get_masked_input_and_mask(x,
-                                              org_vocab_start_index=0,
-                                              org_vocab_end_index=8,
-                                              added_vocab_start_index=8,
-                                              added_vocab_end_index=12,
-                                              num_org_vocab_padding=0)
+    modified_x, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=0,
+        org_vocab_end_index=8,
+        added_vocab_start_index=8,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=0,
+    )
     assert torch.equal(x, modified_x)
 
     # tp 2 case, no padding
-    modified_x_rank_0, _ = get_masked_input_and_mask(x,
-                                                     org_vocab_start_index=0,
-                                                     org_vocab_end_index=4,
-                                                     added_vocab_start_index=8,
-                                                     added_vocab_end_index=10,
-                                                     num_org_vocab_padding=0)
+    modified_x_rank_0, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=0,
+        org_vocab_end_index=4,
+        added_vocab_start_index=8,
+        added_vocab_end_index=10,
+        num_org_vocab_padding=0,
+    )
     modified_x_rank_1, _ = get_masked_input_and_mask(
         x,
         org_vocab_start_index=4,
         org_vocab_end_index=8,
         added_vocab_start_index=10,
         added_vocab_end_index=12,
-        num_org_vocab_padding=0)
-    assert torch.equal(modified_x_rank_0,
-                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]))
-    assert torch.equal(modified_x_rank_1,
-                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]))
+        num_org_vocab_padding=0,
+    )
+    assert torch.equal(
+        modified_x_rank_0, torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_1, torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5])
+    )
 
     # tp 4 case, no padding
-    modified_x_rank_0, _ = get_masked_input_and_mask(x,
-                                                     org_vocab_start_index=0,
-                                                     org_vocab_end_index=2,
-                                                     added_vocab_start_index=8,
-                                                     added_vocab_end_index=9,
-                                                     num_org_vocab_padding=0)
-    modified_x_rank_1, _ = get_masked_input_and_mask(x,
-                                                     org_vocab_start_index=2,
-                                                     org_vocab_end_index=4,
-                                                     added_vocab_start_index=9,
-                                                     added_vocab_end_index=10,
-                                                     num_org_vocab_padding=0)
+    modified_x_rank_0, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=0,
+        org_vocab_end_index=2,
+        added_vocab_start_index=8,
+        added_vocab_end_index=9,
+        num_org_vocab_padding=0,
+    )
+    modified_x_rank_1, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=2,
+        org_vocab_end_index=4,
+        added_vocab_start_index=9,
+        added_vocab_end_index=10,
+        num_org_vocab_padding=0,
+    )
     modified_x_rank_2, _ = get_masked_input_and_mask(
         x,
         org_vocab_start_index=4,
         org_vocab_end_index=6,
         added_vocab_start_index=10,
         added_vocab_end_index=11,
-        num_org_vocab_padding=0)
+        num_org_vocab_padding=0,
+    )
     modified_x_rank_3, _ = get_masked_input_and_mask(
         x,
         org_vocab_start_index=6,
         org_vocab_end_index=8,
         added_vocab_start_index=11,
         added_vocab_end_index=12,
-        num_org_vocab_padding=0)
-    assert torch.equal(modified_x_rank_0,
-                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]))
-    assert torch.equal(modified_x_rank_1,
-                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]))
-    assert torch.equal(modified_x_rank_2,
-                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]))
-    assert torch.equal(modified_x_rank_3,
-                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]))
+        num_org_vocab_padding=0,
+    )
+    assert torch.equal(
+        modified_x_rank_0, torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_1, torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_2, torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_3, torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2])
+    )
 
     # base tp 1 case, with padding
-    modified_x, _ = get_masked_input_and_mask(x,
-                                              org_vocab_start_index=0,
-                                              org_vocab_end_index=8,
-                                              added_vocab_start_index=8,
-                                              added_vocab_end_index=12,
-                                              num_org_vocab_padding=2)
-    assert torch.equal(modified_x,
-                       torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]))
+    modified_x, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=0,
+        org_vocab_end_index=8,
+        added_vocab_start_index=8,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=2,
+    )
+    assert torch.equal(
+        modified_x, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13])
+    )
 
     # tp 2 case, with padding
-    modified_x_rank_0, _ = get_masked_input_and_mask(x,
-                                                     org_vocab_start_index=0,
-                                                     org_vocab_end_index=4,
-                                                     added_vocab_start_index=8,
-                                                     added_vocab_end_index=10,
-                                                     num_org_vocab_padding=2)
+    modified_x_rank_0, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=0,
+        org_vocab_end_index=4,
+        added_vocab_start_index=8,
+        added_vocab_end_index=10,
+        num_org_vocab_padding=2,
+    )
     modified_x_rank_1, _ = get_masked_input_and_mask(
         x,
         org_vocab_start_index=4,
         org_vocab_end_index=8,
         added_vocab_start_index=10,
         added_vocab_end_index=12,
-        num_org_vocab_padding=2)
-    assert torch.equal(modified_x_rank_0,
-                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]))
-    assert torch.equal(modified_x_rank_1,
-                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]))
+        num_org_vocab_padding=2,
+    )
+    assert torch.equal(
+        modified_x_rank_0, torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_1, torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7])
+    )
 
     # tp 4 case, with padding
-    modified_x_rank_0, _ = get_masked_input_and_mask(x,
-                                                     org_vocab_start_index=0,
-                                                     org_vocab_end_index=2,
-                                                     added_vocab_start_index=8,
-                                                     added_vocab_end_index=9,
-                                                     num_org_vocab_padding=2)
-    modified_x_rank_1, _ = get_masked_input_and_mask(x,
-                                                     org_vocab_start_index=2,
-                                                     org_vocab_end_index=4,
-                                                     added_vocab_start_index=9,
-                                                     added_vocab_end_index=10,
-                                                     num_org_vocab_padding=2)
+    modified_x_rank_0, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=0,
+        org_vocab_end_index=2,
+        added_vocab_start_index=8,
+        added_vocab_end_index=9,
+        num_org_vocab_padding=2,
+    )
+    modified_x_rank_1, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=2,
+        org_vocab_end_index=4,
+        added_vocab_start_index=9,
+        added_vocab_end_index=10,
+        num_org_vocab_padding=2,
+    )
     modified_x_rank_2, _ = get_masked_input_and_mask(
         x,
         org_vocab_start_index=4,
         org_vocab_end_index=6,
         added_vocab_start_index=10,
         added_vocab_end_index=11,
-        num_org_vocab_padding=2)
+        num_org_vocab_padding=2,
+    )
     modified_x_rank_3, _ = get_masked_input_and_mask(
         x,
         org_vocab_start_index=6,
         org_vocab_end_index=8,
         added_vocab_start_index=11,
         added_vocab_end_index=12,
-        num_org_vocab_padding=2)
-    assert torch.equal(modified_x_rank_0,
-                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]))
-    assert torch.equal(modified_x_rank_1,
-                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]))
-    assert torch.equal(modified_x_rank_2,
-                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]))
-    assert torch.equal(modified_x_rank_3,
-                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]))
+        num_org_vocab_padding=2,
+    )
+    assert torch.equal(
+        modified_x_rank_0, torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_1, torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_2, torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])
+    )
+    assert torch.equal(
+        modified_x_rank_3, torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4])
+    )
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index a6770e6d32af890989f227ecd7226641089b66a1..7bbd1e364d19e1b099c8c2337deab67a53e2f3e8 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -2,9 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys
-from typing import Union
+
+import pytest
 
 import vllm
+import vllm.config
 from vllm import LLM
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -15,31 +17,32 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 
 EXPECTED_LORA_OUTPUT = [
     "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
+    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
     "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
     "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
-    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
-    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
+    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
+    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ",  # noqa: E501
 ]
 
 
-def do_sample(llm: vllm.LLM,
-              lora_path: str,
-              lora_id: int,
-              tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
+def do_sample(
+    llm: vllm.LLM,
+    lora_path: str,
+    lora_id: int,
+    tensorizer_config_dict: dict | None = None,
+) -> list[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]",  # noqa: E501
     ]
 
-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=256,
-                                          skip_special_tokens=False,
-                                          stop=["[/assistant]"])
+    sampling_params = vllm.SamplingParams(
+        temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
+    )
 
     if tensorizer_config_dict is not None:
         outputs = llm.generate(
@@ -49,14 +52,19 @@ def do_sample(llm: vllm.LLM,
                 str(lora_id),
                 lora_id,
                 lora_path,
-                tensorizer_config_dict=tensorizer_config_dict)
-            if lora_id else None)
+                tensorizer_config_dict=tensorizer_config_dict,
+            )
+            if lora_id
+            else None,
+        )
     else:
         outputs = llm.generate(
             prompts,
             sampling_params,
             lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-            if lora_id else None)
+            if lora_id
+            else None,
+        )
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -67,42 +75,52 @@ def do_sample(llm: vllm.LLM,
     return generated_texts
 
 
-def generate_and_test(llm,
-                      sql_lora_files,
-                      tensorizer_config_dict: Union[dict, None] = None):
+def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None):
     print("lora adapter created")
     print("lora 1")
-    assert do_sample(llm,
-                     sql_lora_files,
-                     tensorizer_config_dict=tensorizer_config_dict,
-                     lora_id=1) == EXPECTED_LORA_OUTPUT
+    assert (
+        do_sample(
+            llm,
+            sql_lora_files,
+            tensorizer_config_dict=tensorizer_config_dict,
+            lora_id=1,
+        )
+        == EXPECTED_LORA_OUTPUT
+    )
 
     print("lora 2")
-    assert do_sample(llm,
-                     sql_lora_files,
-                     tensorizer_config_dict=tensorizer_config_dict,
-                     lora_id=2) == EXPECTED_LORA_OUTPUT
+    assert (
+        do_sample(
+            llm,
+            sql_lora_files,
+            tensorizer_config_dict=tensorizer_config_dict,
+            lora_id=2,
+        )
+        == EXPECTED_LORA_OUTPUT
+    )
 
     print("removing lora")
 
 
 @create_new_process_for_each_test()
-def test_llama_lora(sql_lora_files):
-
+@pytest.mark.parametrize("cudagraph_specialize_lora", [True, False])
+def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool):
     llm = vllm.LLM(
         MODEL_PATH,
         tokenizer=sql_lora_files,
         enable_lora=True,
         # also test odd max_num_seqs
         max_num_seqs=13,
-        max_loras=4)
+        max_loras=4,
+        compilation_config=vllm.config.CompilationConfig(
+            cudagraph_specialize_lora=cudagraph_specialize_lora,
+        ),
+    )
     generate_and_test(llm, sql_lora_files)
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
-
     llm = vllm.LLM(
         MODEL_PATH,
         tokenizer=sql_lora_files,
@@ -115,9 +133,7 @@ def test_llama_lora_tp4(sql_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
-
     llm = vllm.LLM(
         MODEL_PATH,
         tokenizer=sql_lora_files,
@@ -131,10 +147,9 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
 
 
 @multi_gpu_test(num_gpus=2)
-@create_new_process_for_each_test()
-def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
-                                            sql_lora_huggingface_id):
-
+def test_tp2_serialize_and_deserialize_lora(
+    tmp_path, sql_lora_files, sql_lora_huggingface_id
+):
     # Run the tensorizing of the LoRA adapter and the model in a subprocess
     # to guarantee cleanup
 
@@ -145,17 +160,28 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
     lora_path = sql_lora_huggingface_id
     suffix = "test"
     try:
-        result = subprocess.run([
-            sys.executable,
-            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
-            MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
-            str(tp_size), "serialize", "--serialized-directory",
-            str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
-            '{"limit_cpu_concurrency": 4}'
-        ],
-                                check=True,
-                                capture_output=True,
-                                text=True)
+        result = subprocess.run(
+            [
+                sys.executable,
+                f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
+                "--model",
+                MODEL_PATH,
+                "--lora-path",
+                lora_path,
+                "--tensor-parallel-size",
+                str(tp_size),
+                "serialize",
+                "--serialized-directory",
+                str(tmp_path),
+                "--suffix",
+                suffix,
+                "--serialization-kwargs",
+                '{"limit_cpu_concurrency": 4}',
+            ],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
     except subprocess.CalledProcessError as e:
         print("Tensorizing failed.")
         print("STDOUT:\n", e.stdout)
@@ -167,21 +193,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
     model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
     tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
 
-    loaded_llm = LLM(model=model_ref,
-                     tokenizer=sql_lora_files,
-                     load_format="tensorizer",
-                     enable_lora=True,
-                     enforce_eager=True,
-                     model_loader_extra_config=tensorizer_config,
-                     max_num_seqs=13,
-                     tensor_parallel_size=2,
-                     max_loras=2)
+    loaded_llm = LLM(
+        model=model_ref,
+        tokenizer=sql_lora_files,
+        load_format="tensorizer",
+        enable_lora=True,
+        enforce_eager=True,
+        model_loader_extra_config=tensorizer_config,
+        max_num_seqs=13,
+        tensor_parallel_size=2,
+        max_loras=2,
+    )
 
     tc_as_dict = tensorizer_config.to_serializable()
 
     print("lora adapter created")
     print("lora 1")
-    assert do_sample(loaded_llm,
-                     sql_lora_files,
-                     tensorizer_config_dict=tc_as_dict,
-                     lora_id=1) == EXPECTED_LORA_OUTPUT
+    assert (
+        do_sample(
+            loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
+        )
+        == EXPECTED_LORA_OUTPUT
+    )
diff --git a/tests/lora/test_llm_with_multi_loras.py b/tests/lora/test_llm_with_multi_loras.py
index 3d8dd512a201939b65297073e847afaf0a38bc9e..269a1ade7734f22fc2219dfb0f0cbe3a337a04b4 100644
--- a/tests/lora/test_llm_with_multi_loras.py
+++ b/tests/lora/test_llm_with_multi_loras.py
@@ -5,6 +5,7 @@ This script contains:
 1. test multi loras service with tp >= 2
 2. test multi loras request
 """
+
 import pytest
 
 from tests.utils import multi_gpu_test
@@ -25,20 +26,14 @@ LORA_RANK = 8
 LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
 LORA_TEST_EXPECTED = [
     "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.",  # noqa: E501
-    "I am Alice, an AI assistant developed by GitHub/Charent.",  # noqa: E501
+    "I am Alice, an AI assistant developed by GitHub/Charent.",
 ]
 
 
 def format_chatml_messages(prompt: str):
     return [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant."
-        },
-        {
-            "role": "user",
-            "content": prompt
-        },
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
     ]
 
 
@@ -57,7 +52,6 @@ def make_add_lora_request(name: str, path: str):
 
 @multi_gpu_test(num_gpus=2)
 def test_multi_loras_with_tp_sync():
-
     llm = LLM(
         model=MODEL_PATH,
         enable_lora=True,
@@ -116,15 +110,17 @@ def test_multi_loras_with_tp_sync():
 
     def reload_lora(name: str):
         """
-        reload a lora to simulate the case: 
-        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` 
+        reload a lora to simulate the case:
+        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
         for dynamic lora loading and unloading
         """
         remove_lora_response = llm.llm_engine.remove_lora(
-            lora_id=LORA_NAME_ID_MAP[name])
+            lora_id=LORA_NAME_ID_MAP[name]
+        )
 
         add_lora_response = llm.llm_engine.add_lora(
-            make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
+            make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
+        )
 
         print(f"{remove_lora_response=}, {add_lora_response=}")
 
@@ -134,7 +130,6 @@ def test_multi_loras_with_tp_sync():
         assert outputs == expected
 
     for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
-
         output_text = call_llm_get_outputs(prompt, "Alice")
         check_outputs(output_text, expected_output)
 
@@ -175,8 +170,7 @@ def test_multiple_lora_requests():
     PROMPTS = ["Hello, my name is"] * 2
     LORA_NAME = "Alice"
     lora_request = [
-        LoRARequest(LORA_NAME + str(idx), idx + 1,
-                    LORA_NAME_PATH_MAP[LORA_NAME])
+        LoRARequest(LORA_NAME + str(idx), idx + 1, LORA_NAME_PATH_MAP[LORA_NAME])
         for idx in range(len(PROMPTS))
     ]
     # Multiple SamplingParams should be matched with each prompt
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index ebc0f26378d27ee388079779b10a44f3526a0a15..2219d470e91a1efc50105948eb18df998fc13f3f 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -8,9 +8,7 @@ from vllm.lora.peft_helper import PEFTHelper
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper
 
-lora_lst = [
-    "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
-]
+lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
 BAICHUAN_LORA_MODULES = [
     "W_pack",
     "o_proj",
@@ -37,8 +35,9 @@ def test_load_checkpoints(
         else:
             expected_lora_modules.append(module)
     if lora_name == "baichuan7B":
-        peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            baichuan_lora_files, max_position_embeddings=4096
+        )
         # For the baichuan7B model, load it's LoRA,
         # and the test should pass.
         LoRAModel.from_local_checkpoint(
@@ -48,13 +47,15 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules)
+            embedding_padding_modules=embed_padding_modules,
+        )
     elif lora_name == "baichuan7B-zero":
         # Test that the target_modules contain prefix
         # such as "model.layers.0.self_atten.W_pack", and
         # the test should pass.
-        peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            baichuan_zero_lora_files, max_position_embeddings=4096
+        )
         LoRAModel.from_local_checkpoint(
             baichuan_zero_lora_files,
             expected_lora_modules,
@@ -62,12 +63,14 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules)
+            embedding_padding_modules=embed_padding_modules,
+        )
     elif lora_name == "baichuan7B-zero-regex":
         # Test that the `target_modules` in the form of regular expressions,
         # such as `model\\..*(W_pack|o_proj)`, and the test should pass.
-        peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            baichuan_regex_lora_files, max_position_embeddings=4096
+        )
         LoRAModel.from_local_checkpoint(
             baichuan_regex_lora_files,
             expected_lora_modules,
@@ -75,13 +78,15 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules)
+            embedding_padding_modules=embed_padding_modules,
+        )
     else:
         # For the baichuan7B model, load chatglm3-6b's LoRA,
         # and the test should raise the following error.
         expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
-        peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            chatglm3_lora_files, max_position_embeddings=4096
+        )
         with pytest.raises(ValueError, match=expected_error):
             LoRAModel.from_local_checkpoint(
                 chatglm3_lora_files,
@@ -90,11 +95,11 @@ def test_load_checkpoints(
                 lora_model_id=1,
                 device="cpu",
                 embedding_modules=embedding_modules,
-                embedding_padding_modules=embed_padding_modules)
+                embedding_padding_modules=embed_padding_modules,
+            )
 
 
 def test_lora_weights_mapping(baichuan_lora_files):
-
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
     embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
     embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
@@ -113,8 +118,9 @@ def test_lora_weights_mapping(baichuan_lora_files):
             ".layers.": ".baichuan_layers.",
         },
     )
-    peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(
+        baichuan_lora_files, max_position_embeddings=4096
+    )
     lora_model = LoRAModel.from_local_checkpoint(
         baichuan_lora_files,
         expected_lora_modules,
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index 221d5237823ca3e6e0e9c234087fe87e6b23ccd9..e914393fee8aaf81c4ecc13ee2f33b26f0f55949 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -3,11 +3,13 @@
 """
 Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 """
+
 import pytest
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
+    build_async_engine_client_from_engine_args,
+)
 from vllm.lora.request import LoRARequest
 from vllm.v1.engine.llm_engine import LLMEngine
 
@@ -17,23 +19,24 @@ LORA_RANK = 8
 
 
 def make_lora_request(lora_id: int):
-    return LoRARequest(lora_name=f"{lora_id}",
-                       lora_int_id=lora_id,
-                       lora_path=LORA_MODULE_PATH)
+    return LoRARequest(
+        lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH
+    )
 
 
 def test_lora_functions_sync():
-
     max_loras = 4
     # Create engine in eager-mode. Due to high max_loras, the CI can
     # OOM during cuda-graph capture.
-    engine_args = EngineArgs(model=MODEL_PATH,
-                             enable_lora=True,
-                             max_loras=max_loras,
-                             max_lora_rank=LORA_RANK,
-                             max_model_len=128,
-                             gpu_memory_utilization=0.8,
-                             enforce_eager=True)
+    engine_args = EngineArgs(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=max_loras,
+        max_lora_rank=LORA_RANK,
+        max_model_len=128,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )
 
     llm = LLMEngine.from_engine_args(engine_args)
 
@@ -70,15 +73,16 @@ def test_lora_functions_sync():
 
 @pytest.mark.asyncio
 async def test_lora_functions_async():
-
     max_loras = 4
-    engine_args = AsyncEngineArgs(model=MODEL_PATH,
-                                  enable_lora=True,
-                                  max_loras=max_loras,
-                                  max_lora_rank=LORA_RANK,
-                                  max_model_len=128,
-                                  gpu_memory_utilization=0.8,
-                                  enforce_eager=True)
+    engine_args = AsyncEngineArgs(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=max_loras,
+        max_lora_rank=LORA_RANK,
+        max_model_len=128,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )
 
     async def run_check(fn, args, expected: list):
         await fn(args)
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index b46d81f1651a62d4964828701528f936d897d178..7d20faef541aac067cab98fbc5df5b2c2081a58f 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -11,8 +11,12 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 # Provide absolute path and huggingface lora ids
 lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
 LLAMA_LORA_MODULES = [
-    "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
-    "lm_head"
+    "qkv_proj",
+    "o_proj",
+    "gate_up_proj",
+    "down_proj",
+    "embed_tokens",
+    "lm_head",
 ]
 
 
@@ -40,7 +44,8 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
         lora_model_id=1,
         device="cpu",
         embedding_modules=embedding_modules,
-        embedding_padding_modules=embed_padding_modules)
+        embedding_padding_modules=embed_padding_modules,
+    )
 
     # Assertions to ensure the model is loaded correctly
     assert lora_model is not None, "LoRAModel is not loaded correctly"
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 6f0a852314081c3af4078f1aadcfcf8d531aaa5a..e7816031142e36201eea0685353d5567f2ac3d65 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -10,16 +10,21 @@ from torch import nn
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
-from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
-                              MergedColumnParallelLinearWithLoRA,
-                              RowParallelLinearWithLoRA)
+from vllm.lora.layers import (
+    ColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithLoRA,
+    RowParallelLinearWithLoRA,
+)
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
-                              LRUCacheLoRAModelManager)
+from vllm.lora.models import (
+    LoRAMapping,
+    LoRAModel,
+    LoRAModelManager,
+    LRUCacheLoRAModelManager,
+)
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
-from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
-                                      WorkerLoRAManager)
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager, WorkerLoRAManager
 from vllm.platforms import current_platform
 
 from .utils import create_peft_lora
@@ -31,22 +36,25 @@ EMBEDDING_MODULES = {
 
 EMBEDDING_PADDING_MODULES = ["lm_head"]
 
-DEVICES = ([
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-] if current_platform.is_cuda_alike() else ["cpu"])
+DEVICES = (
+    [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+    if current_platform.is_cuda_alike()
+    else ["cpu"]
+)
 
 DEFAULT_DTYPE = torch.get_default_dtype()
 
 
 @pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
-    tensors = load_file(
-        os.path.join(sql_lora_files, "adapter_model.safetensors"))
+    tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors"))
     new_embeddings = load_file(
-        os.path.join(sql_lora_files, "new_embeddings.safetensors"))
+        os.path.join(sql_lora_files, "new_embeddings.safetensors")
+    )
 
-    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(
+        sql_lora_files, max_position_embeddings=4096
+    )
     lora_model = LoRAModel.from_lora_tensors(
         1,
         tensors,
@@ -54,7 +62,8 @@ def test_from_lora_tensors(sql_lora_files, device):
         device=device,
         embeddings=new_embeddings,
         embedding_modules=EMBEDDING_MODULES,
-        embedding_padding_modules=EMBEDDING_PADDING_MODULES)
+        embedding_padding_modules=EMBEDDING_PADDING_MODULES,
+    )
     for module_name, lora in lora_model.loras.items():
         assert lora.module_name == module_name
         assert lora.rank == 8
@@ -63,22 +72,27 @@ def test_from_lora_tensors(sql_lora_files, device):
         assert lora.lora_b is not None
         assert lora.lora_a.device == torch.device(device)
         assert lora.lora_b.device == torch.device(device)
-        assert (lora.lora_a.shape[0] == lora.lora_b.shape[1]
-                ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
+        assert lora.lora_a.shape[0] == lora.lora_b.shape[1], (
+            f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
+        )
         assert lora.lora_a.shape[0] == 8
         embeddings_module = next(
-            (k for k in EMBEDDING_MODULES if k in module_name), None)
+            (k for k in EMBEDDING_MODULES if k in module_name), None
+        )
         if embeddings_module:
             assert torch.equal(
                 lora.embeddings_tensor,
                 new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
-                    device=lora.embeddings_tensor.device))
+                    device=lora.embeddings_tensor.device
+                ),
+            )
         else:
             assert lora.embeddings_tensor is None
 
 
-def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
-                device: torch.device) -> LoRAModel:
+def create_lora(
+    lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device
+) -> LoRAModel:
     loras: dict[str, LoRALayerWeights] = {}
     for name in sub_modules:
         w = model.get_submodule(name).weight
@@ -110,8 +124,7 @@ def create_packed_lora(
             8,
             16,
             torch.rand([8, w.shape[1]], device=device),
-            torch.rand([w.shape[0] // len(replaced_module_names), 8],
-                       device=device),
+            torch.rand([w.shape[0] // len(replaced_module_names), 8], device=device),
         )
     return LoRAModel(lora_id, 8, loras)
 
@@ -119,42 +132,42 @@ def create_packed_lora(
 def test_replace_submodules(dist_init, dummy_model):
     model = dummy_model
     manager = LoRAModelManager(
-        model, 1, 1, 1,
-        LoRAConfig(max_lora_rank=8,
-                   max_cpu_loras=8,
-                   max_loras=8,
-                   lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0]))
+        model,
+        1,
+        1,
+        1,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=8, max_loras=8, lora_dtype=DEFAULT_DTYPE
+        ),
+        torch.device(DEVICES[0]),
+    )
     model = manager.model
-    assert isinstance(model.get_submodule("dense1"),
-                      ColumnParallelLinearWithLoRA)
-    assert isinstance(model.get_submodule("layer1.dense1"),
-                      ColumnParallelLinearWithLoRA)
+    assert isinstance(model.get_submodule("dense1"), ColumnParallelLinearWithLoRA)
+    assert isinstance(
+        model.get_submodule("layer1.dense1"), ColumnParallelLinearWithLoRA
+    )
     assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA)
-    assert isinstance(model.get_submodule("layer1.dense2"),
-                      RowParallelLinearWithLoRA)
+    assert isinstance(model.get_submodule("layer1.dense2"), RowParallelLinearWithLoRA)
 
 
 @pytest.mark.parametrize("device", DEVICES)
 def test_lora_model_manager(dist_init, dummy_model, device):
     model = dummy_model
-    model_lora1 = create_lora(1,
-                              model, ["layer1.dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora2 = create_lora(2,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora3 = create_lora(3,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    manager = LoRAModelManager(model,
-                               2,
-                               2,
-                               2,
-                               LoRAConfig(max_lora_rank=8,
-                                          max_cpu_loras=3,
-                                          max_loras=2,
-                                          lora_dtype=DEFAULT_DTYPE),
-                               device=device)
+    model_lora1 = create_lora(
+        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
+    )
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
+    manager = LoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
     assert all(x is None for x in manager.lora_index_to_id)
     assert manager.add_adapter(model_lora1)
     assert manager.activate_adapter(1)
@@ -204,24 +217,21 @@ def test_lora_model_manager(dist_init, dummy_model, device):
 @pytest.mark.parametrize("device", DEVICES)
 def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
     model = dummy_model
-    model_lora1 = create_lora(1,
-                              model, ["layer1.dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora2 = create_lora(2,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora3 = create_lora(3,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    manager = LRUCacheLoRAModelManager(model,
-                                       2,
-                                       2,
-                                       2,
-                                       LoRAConfig(max_lora_rank=8,
-                                                  max_cpu_loras=3,
-                                                  max_loras=2,
-                                                  lora_dtype=DEFAULT_DTYPE),
-                                       device=device)
+    model_lora1 = create_lora(
+        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
+    )
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
+    manager = LRUCacheLoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
     assert all(x is None for x in manager.lora_index_to_id)
     assert manager.add_adapter(model_lora1)
     assert manager.activate_adapter(1)
@@ -297,27 +307,22 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
     # This tests just the LRU cache functionality, everything else is
     # tested in test_lora_model_manager
     model = dummy_model
-    model_lora1 = create_lora(1,
-                              model, ["layer1.dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora2 = create_lora(2,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora3 = create_lora(3,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora4 = create_lora(4,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    manager = LRUCacheLoRAModelManager(model,
-                                       2,
-                                       2,
-                                       2,
-                                       LoRAConfig(max_lora_rank=8,
-                                                  max_cpu_loras=2,
-                                                  max_loras=2,
-                                                  lora_dtype=DEFAULT_DTYPE),
-                                       device=device)
+    model_lora1 = create_lora(
+        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
+    )
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"], device=device)
+    manager = LRUCacheLoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
     assert all(x is None for x in manager.lora_index_to_id)
 
     # Add up to capacity
@@ -421,12 +426,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
-                                          tmp_path):
-    lora_config = LoRAConfig(max_lora_rank=8,
-                             max_cpu_loras=4,
-                             max_loras=4,
-                             lora_dtype=DEFAULT_DTYPE)
+def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
+    lora_config = LoRAConfig(
+        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
+    )
 
     dummy_lora_files = f"{tmp_path}/lora_adapter"
     os.makedirs(dummy_lora_files, exist_ok=True)
@@ -438,13 +441,13 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config,
-                             lora_config=lora_config)
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
 
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
     worker_adapter_manager = LRUCacheWorkerLoRAManager(
-        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
+        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
+    )
 
     worker_adapter_manager.max_num_seqs = 4
     worker_adapter_manager.max_num_batched_tokens = 2
@@ -452,52 +455,64 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
     worker_adapter_manager.create_lora_manager(dummy_model)
 
     mapping = LoRAMapping([], [])
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 2}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("3", 3, dummy_lora_files),
-        LoRARequest("4", 4, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("3", 3, dummy_lora_files),
+            LoRARequest("4", 4, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files),
-        LoRARequest("5", 5, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("2", 2, dummy_lora_files),
+            LoRARequest("5", 5, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("6", 6, dummy_lora_files),
-        LoRARequest("7", 7, dummy_lora_files),
-        LoRARequest("8", 8, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("6", 6, dummy_lora_files),
+            LoRARequest("7", 7, dummy_lora_files),
+            LoRARequest("8", 8, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7
@@ -506,41 +521,40 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
 
     # Over capacity
     with pytest.raises(RuntimeError):
-        worker_adapter_manager.set_active_adapters([
-            LoRARequest("10", 10, dummy_lora_files),
-            LoRARequest("11", 11, dummy_lora_files),
-            LoRARequest("12", 12, dummy_lora_files),
-            LoRARequest("13", 13, dummy_lora_files),
-            LoRARequest("14", 14, dummy_lora_files)
-        ], mapping)
+        worker_adapter_manager.set_active_adapters(
+            [
+                LoRARequest("10", 10, dummy_lora_files),
+                LoRARequest("11", 11, dummy_lora_files),
+                LoRARequest("12", 12, dummy_lora_files),
+                LoRARequest("13", 13, dummy_lora_files),
+                LoRARequest("14", 14, dummy_lora_files),
+            ],
+            mapping,
+        )
 
     assert worker_adapter_manager.device == device
-    assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
-            device)
+    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
-                                tmp_path):
+def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
     # Should remove every LoRA not specified in the request.
-    lora_config = LoRAConfig(max_lora_rank=8,
-                             max_cpu_loras=4,
-                             max_loras=4,
-                             lora_dtype=DEFAULT_DTYPE)
+    lora_config = LoRAConfig(
+        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
+    )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config,
-                             lora_config=lora_config)
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
 
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
 
-    worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
-                                               EMBEDDING_MODULES,
-                                               EMBEDDING_PADDING_MODULES)
+    worker_adapter_manager = WorkerLoRAManager(
+        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
+    )
     worker_adapter_manager.vocab_size = (
-        dummy_model_gate_up.unpadded_vocab_size -
-        lora_config.lora_extra_vocab_size)
+        dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size
+    )
     worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
 
     dummy_lora_files = f"{tmp_path}/lora_adapter"
@@ -553,49 +567,61 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
     )
 
     mapping = LoRAMapping([], [])
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 2}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("3", 3, dummy_lora_files),
-        LoRARequest("4", 4, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("3", 3, dummy_lora_files),
+            LoRARequest("4", 4, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 3, 4}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files),
-        LoRARequest("5", 5, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("2", 2, dummy_lora_files),
+            LoRARequest("5", 5, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1, 2, 5}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {1}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None
 
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("6", 6, dummy_lora_files),
-        LoRARequest("7", 7, dummy_lora_files),
-        LoRARequest("8", 8, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("6", 6, dummy_lora_files),
+            LoRARequest("7", 7, dummy_lora_files),
+            LoRARequest("8", 8, dummy_lora_files),
+        ],
+        mapping,
+    )
     assert worker_adapter_manager.list_adapters() == {6, 7, 8}
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
     assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6
@@ -603,17 +629,19 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
 
     # Over capacity
     with pytest.raises(RuntimeError):
-        worker_adapter_manager.set_active_adapters([
-            LoRARequest("10", 10, dummy_lora_files),
-            LoRARequest("11", 11, dummy_lora_files),
-            LoRARequest("12", 12, dummy_lora_files),
-            LoRARequest("13", 13, dummy_lora_files),
-            LoRARequest("14", 14, dummy_lora_files)
-        ], mapping)
+        worker_adapter_manager.set_active_adapters(
+            [
+                LoRARequest("10", 10, dummy_lora_files),
+                LoRARequest("11", 11, dummy_lora_files),
+                LoRARequest("12", 12, dummy_lora_files),
+                LoRARequest("13", 13, dummy_lora_files),
+                LoRARequest("14", 14, dummy_lora_files),
+            ],
+            mapping,
+        )
 
     assert worker_adapter_manager.device == device
-    assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
-            device)
+    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
 
 
 @pytest.mark.parametrize("device", DEVICES)
@@ -624,7 +652,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
         model,
         module_name="gate_up_proj",
         replaced_module_names=["gate_proj", "up_proj"],
-        device=device)
+        device=device,
+    )
     model_lora1 = create_packed_lora(
         2,
         model,
@@ -634,19 +663,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
         empty_replaced_module_name="gate_proj",
     )
 
-    manager = LoRAModelManager(model,
-                               2,
-                               2,
-                               2,
-                               LoRAConfig(max_lora_rank=8,
-                                          max_cpu_loras=2,
-                                          max_loras=2,
-                                          lora_dtype=DEFAULT_DTYPE),
-                               device=device)
+    manager = LoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
     model = manager.model
 
-    assert isinstance(model.get_submodule("gate_up_proj"),
-                      MergedColumnParallelLinearWithLoRA)
+    assert isinstance(
+        model.get_submodule("gate_up_proj"), MergedColumnParallelLinearWithLoRA
+    )
     # Verify packed lora is correct
     model_lora_clone = model_lora.clone(1)
     model_lora_clone1 = model_lora1.clone(1)
@@ -659,21 +690,27 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
     packed_lora = model_lora.get_lora("gate_up_proj")
     assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
 
-    torch.testing.assert_close(packed_lora.lora_a[0],
-                               model_lora_clone.get_lora("gate_proj").lora_a)
-    torch.testing.assert_close(packed_lora.lora_b[0],
-                               model_lora_clone.get_lora("gate_proj").lora_b)
-    torch.testing.assert_close(packed_lora.lora_a[1],
-                               model_lora_clone.get_lora("up_proj").lora_a)
-    torch.testing.assert_close(packed_lora.lora_b[1],
-                               model_lora_clone.get_lora("up_proj").lora_b)
+    torch.testing.assert_close(
+        packed_lora.lora_a[0], model_lora_clone.get_lora("gate_proj").lora_a
+    )
+    torch.testing.assert_close(
+        packed_lora.lora_b[0], model_lora_clone.get_lora("gate_proj").lora_b
+    )
+    torch.testing.assert_close(
+        packed_lora.lora_a[1], model_lora_clone.get_lora("up_proj").lora_a
+    )
+    torch.testing.assert_close(
+        packed_lora.lora_b[1], model_lora_clone.get_lora("up_proj").lora_b
+    )
 
     packed_lora1 = model_lora1.get_lora("gate_up_proj")
     assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
 
     assert packed_lora1.lora_a[0] is None
     assert packed_lora1.lora_b[0] is None
-    torch.testing.assert_close(packed_lora1.lora_a[1],
-                               model_lora_clone1.get_lora("up_proj").lora_a)
-    torch.testing.assert_close(packed_lora1.lora_b[1],
-                               model_lora_clone1.get_lora("up_proj").lora_b)
+    torch.testing.assert_close(
+        packed_lora1.lora_a[1], model_lora_clone1.get_lora("up_proj").lora_a
+    )
+    torch.testing.assert_close(
+        packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b
+    )
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 99fe951bbf07009d1fd5c67bfc3c9c56e965bf88..1cf8ed602b6a4eb9da016e2cbdd799d731b13694 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -8,14 +8,15 @@ from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
-from ..utils import create_new_process_for_each_test
+from ..utils import multi_gpu_test
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
 PROMPT_TEMPLATE = (
     "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
     "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
-    "<|start_header_id|>assistant<|end_header_id|>\n\n")
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+)
 
 IMAGE_ASSETS = [
     ImageAsset("stop_sign"),
@@ -34,18 +35,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
         stop_token_ids=[128001, 128009],  # eos_id, eot_id
     )
 
-    inputs = [{
-        "prompt": PROMPT_TEMPLATE,
-        "multi_modal_data": {
-            "image": asset.pil_image
-        },
-    } for asset in IMAGE_ASSETS]
+    inputs = [
+        {
+            "prompt": PROMPT_TEMPLATE,
+            "multi_modal_data": {"image": asset.pil_image},
+        }
+        for asset in IMAGE_ASSETS
+    ]
 
     outputs = llm.generate(
         inputs,
         sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
     )
     # Print the outputs.
     generated_texts: list[str] = []
@@ -58,7 +59,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 @pytest.mark.xfail(
     current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+    reason="MiniCPM-V dependency xformers incompatible with ROCm",
+)
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -68,10 +70,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
         max_lora_rank=8,
         enforce_eager=True,
         max_model_len=2048,
-        limit_mm_per_prompt={
-            "image": 2,
-            "video": 0
-        },
+        limit_mm_per_prompt={"image": 2, "video": 0},
         trust_remote_code=True,
     )
     output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
@@ -82,12 +81,14 @@ def test_minicpmv_lora(minicpmv_lora_files):
         assert EXPECTED_OUTPUT[i].startswith(output2[i])
 
 
-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @pytest.mark.xfail(
     current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@create_new_process_for_each_test()
+    reason="MiniCPM-V dependency xformers incompatible with ROCm",
+)
+@multi_gpu_test(num_gpus=4)
 def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -96,10 +97,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
         max_loras=4,
         max_lora_rank=64,
         tensor_parallel_size=4,
-        limit_mm_per_prompt={
-            "image": 2,
-            "video": 0
-        },
+        limit_mm_per_prompt={"image": 2, "video": 0},
         trust_remote_code=True,
     )
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
@@ -107,12 +105,14 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
 
 
-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @pytest.mark.xfail(
     current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@create_new_process_for_each_test()
+    reason="MiniCPM-V dependency xformers incompatible with ROCm",
+)
+@multi_gpu_test(num_gpus=4)
 def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -122,10 +122,7 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
         max_lora_rank=8,
         tensor_parallel_size=4,
         trust_remote_code=True,
-        limit_mm_per_prompt={
-            "image": 1,
-            "video": 0
-        },
+        limit_mm_per_prompt={"image": 1, "video": 0},
         fully_sharded_loras=True,
     )
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 03e5d8d5d6728716615d57b361c52d1b2053907a..12c73f2d79f75de51da9aa3bdd2b75c21a4f7ca3 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -11,15 +11,15 @@ from vllm.platforms import current_platform
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
 
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
-              prompts: list[str]) -> list[str]:
-
+def do_sample(
+    llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]
+) -> list[str]:
     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
     outputs = llm.generate(
         prompts,
         sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -33,8 +33,11 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
 @pytest.mark.parametrize("tp_size", [4])
 def test_mixtral_lora(mixtral_lora_files, tp_size):
     """Original test, the LoRA model has the common target modules, not all"""
-    if torch.cuda.device_count(
-    ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
+    if (
+        torch.cuda.device_count() < tp_size
+        and tp_size > 1
+        and current_platform.is_cuda_alike()
+    ):
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     prompts = [
@@ -53,11 +56,22 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
     )
 
     expected_lora_output = [
-        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
-        "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
-        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
+        [
+            "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])"  # noqa: E501
+        ],
+        [
+            "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
+            "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        ],
+        [
+            "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])"  # noqa: E501
+        ],
     ]
-    assert do_sample(llm, mixtral_lora_files, lora_id=1,
-                     prompts=prompts) == expected_lora_output
-    assert do_sample(llm, mixtral_lora_files, lora_id=2,
-                     prompts=prompts) == expected_lora_output
+
+    def check_outputs(generated: list[str]):
+        assert len(generated) == len(expected_lora_output)
+        for gen, gt_choices in zip(generated, expected_lora_output):
+            assert gen in gt_choices
+
+    check_outputs(do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts))
+    check_outputs(do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts))
diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..72f1d759f1e7a539e3dfafa669fece8e8d5718c9
--- /dev/null
+++ b/tests/lora/test_moe_lora_align_sum.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+
+
+def round_up(x, base):
+    return ((x + base - 1) // base) * base
+
+
+def CEILDIV(x, y):
+    return (x + y - 1) // y
+
+
+def sample_data(num_experts, max_loras, num_tokens, topk_num):
+    topk_ids = torch.zeros((num_tokens, topk_num), dtype=torch.int32)
+    token_lora_mapping = torch.zeros((num_tokens,), dtype=torch.int32)
+
+    for i in range(num_tokens):
+        pool = list(range(num_experts))
+        random.shuffle(pool)
+        for j in range(topk_num):
+            topk_ids[i, j] = pool[j]
+        token_lora_mapping[i] = random.randint(0, max_loras - 1)
+
+    return topk_ids.to("cuda"), token_lora_mapping.to("cuda")
+
+
+@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096])  # 81920
+@pytest.mark.parametrize("topk_num", [6])
+@pytest.mark.parametrize("num_experts", [64, 128])
+@pytest.mark.parametrize("max_loras", [2, 32])
+@pytest.mark.parametrize("block_size", [16])
+def test_moe_lora_align_block_size(
+    num_tokens, topk_num, num_experts, max_loras, block_size
+):
+    # sample data
+    random.seed(1)
+    topk_ids, token_lora_mapping = sample_data(
+        num_experts, max_loras, num_tokens, topk_num
+    )
+
+    # compute paddings
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+    max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size)
+
+    # init output tensors
+    sorted_token_ids = torch.full(
+        (max_loras * max_num_tokens_padded,),
+        topk_ids.numel(),
+        dtype=torch.int32,
+        device="cuda",
+    )
+    expert_ids = torch.full(
+        (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
+    adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
+
+    # call kernel
+    ops.moe_lora_align_block_size(
+        topk_ids,
+        token_lora_mapping,
+        num_experts,
+        block_size,
+        max_loras,
+        max_num_tokens_padded,
+        max_num_m_blocks,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        adapter_enabled,
+        lora_ids,
+    )
+
+    # verify values
+    expert_ids = expert_ids.view(max_loras, -1)
+    sorted_token_ids = sorted_token_ids.view(max_loras, -1, block_size)
+
+    for lora_idx in range(max_loras):
+        for token_idx in range(sorted_token_ids.size(1)):
+            block = sorted_token_ids[lora_idx][token_idx]
+            indices = block[block != topk_ids.numel()]
+            if indices.numel() > 0:
+                expert_id = expert_ids[lora_idx][token_idx]
+                assert torch.all(topk_ids.view(-1)[indices] == expert_id)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e659c1e1a9a07c4ead04678173fc06555910c4c0
--- /dev/null
+++ b/tests/lora/test_olmoe_tp.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from ..utils import multi_gpu_test
+
+MODEL_PATH = "allenai/OLMoE-1B-7B-0125-Instruct"
+
+PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
+"
+##Instruction:
+candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
+Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
+The People_ID of candidate is the foreign key of People_ID of people.
+
+
+###Input:
+{context}
+
+###Response:"""  # noqa: E501
+
+EXPECTED_LORA_OUTPUT = [
+    "SELECT count(*) FROM candidate",
+    "SELECT count(*) FROM candidate",
+    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
+    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
+]
+
+EXPECTED_BASE_MODEL_OUTPUT = [
+    "SELECT COUNT(Candidate_ID) FROM candidate",
+    "SELECT COUNT(Candidate_ID) FROM candidate",
+    "SELECT Candidate_ID, COUNT(*) as Total_Candidates\nFROM candidate\nINNER JOIN people ON candidate.People_ID = people.People_ID",  # noqa: E501
+    "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1",  # noqa: E501
+]
+
+
+def generate_and_test(
+    llm: vllm.LLM, lora_path: str, lora_id: list[int | None] | int | None
+) -> None:
+    prompts = [
+        PROMPT_TEMPLATE.format(context="How many candidates are there?"),
+        PROMPT_TEMPLATE.format(context="Count the number of candidates."),
+        PROMPT_TEMPLATE.format(
+            context="Which poll resource provided the most number of candidate information?"  # noqa: E501
+        ),
+        PROMPT_TEMPLATE.format(
+            context="Return the poll resource associated with the most candidates."
+        ),
+    ]
+
+    lora_request = None
+    if isinstance(lora_id, int):
+        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
+    elif isinstance(lora_id, list):
+        lora_request = [
+            LoRARequest(str(i), i, lora_path) if i is not None else None
+            for i in lora_id
+        ]
+
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
+    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
+    # Print the outputs.
+    generated_texts: list[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        req_lora_id = lora_id[i] if isinstance(lora_id, list) else lora_id
+        expected_output = (
+            EXPECTED_LORA_OUTPUT[i]
+            if req_lora_id is not None
+            else EXPECTED_BASE_MODEL_OUTPUT[i]
+        )
+        assert generated_texts[i].startswith(expected_output)
+
+
+def test_olmoe_lora(olmoe_lora_files):
+    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
+    # Otherwise, the lora-test will fail due to CUDA OOM.
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+
+    generate_and_test(llm, olmoe_lora_files, lora_id=1)
+    generate_and_test(llm, olmoe_lora_files, lora_id=2)
+
+
+def test_olmoe_lora_mixed(olmoe_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+
+    generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])
+
+
+@multi_gpu_test(num_gpus=2)
+def test_olmoe_lora_tp2(olmoe_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+        tensor_parallel_size=2,
+    )
+
+    generate_and_test(llm, olmoe_lora_files, lora_id=1)
+    generate_and_test(llm, olmoe_lora_files, lora_id=2)
+
+
+@multi_gpu_test(num_gpus=4)
+def test_olmoe_lora_tp4(olmoe_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+        tensor_parallel_size=4,
+    )
+
+    generate_and_test(llm, olmoe_lora_files, lora_id=1)
+    generate_and_test(llm, olmoe_lora_files, lora_id=2)
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index ffffb5d8eab90d8be4b44d927dc6575e4ab6e5d4..9c55c623d444bfc77279854a8a4e536f6c4fc0ee 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -13,34 +13,22 @@ from vllm.lora.peft_helper import PEFTHelper
 ERROR_CASES = [
     (
         "test_rank",
-        {
-            "r": 1024
-        },
+        {"r": 1024},
         "is greater than max_lora_rank",
     ),
-    (
-        "test_bias",
-        {
-            "bias": "all"
-        },
-        "Adapter bias cannot be used without bias_enabled",
-    ),
-    ("test_dora", {
-        "use_dora": True
-    }, "does not yet support DoRA"),
+    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
     (
         "test_modules_to_save",
-        {
-            "modules_to_save": ["lm_head"]
-        },
+        {"modules_to_save": ["lm_head"]},
         "only supports modules_to_save being None",
     ),
 ]
 
 
 def test_peft_helper_pass(sql_lora_files, tmp_path):
-    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(
+        sql_lora_files, max_position_embeddings=4096
+    )
     lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
     peft_helper.validate_legal(lora_config)
     assert peft_helper.r == 8
@@ -74,8 +62,7 @@ def test_peft_helper_pass(sql_lora_files, tmp_path):
     with open(config_path, "w") as f:
         json.dump(adapter_config, f)
 
-    peft_helper = PEFTHelper.from_local_dir(test_dir,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(test_dir, max_position_embeddings=4096)
     peft_helper.validate_legal(lora_config)
     scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
     assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
@@ -106,4 +93,5 @@ def test_peft_helper_error(
     # Test loading the adapter
     with pytest.raises(ValueError, match=expected_error):
         PEFTHelper.from_local_dir(
-            test_dir, max_position_embeddings=4096).validate_legal(lora_config)
+            test_dir, max_position_embeddings=4096
+        ).validate_legal(lora_config)
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index 14fa79ae5b4469ae8b402c682402f5e839ed89b5..e4df9751077d2e836447febaf8a13493c9071152 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -21,11 +21,18 @@ def reset_device(reset_default_device):
 
 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
-        nslices: int, inputs_tensor: torch.Tensor,
-        lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor,
-        b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
-        prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
-        num_tokens: int, scaling: float):
+    nslices: int,
+    inputs_tensor: torch.Tensor,
+    lora_weights_lst: list[torch.Tensor],
+    out_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    prompt_lora_mapping: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    num_tokens: int,
+    scaling: float,
+):
     """
     Wrapper around torch_ops.sgmv_shrink that handles any nslices.
     """
@@ -44,15 +51,20 @@ def sgmv_shrink_for_nslices(
         )
 
 
-def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
-                            inputs_tensor: torch.Tensor,
-                            lora_weights_lst: list[torch.Tensor],
-                            out_tensor: torch.Tensor,
-                            b_seq_start_loc: torch.Tensor,
-                            seq_len_tensor: torch.Tensor,
-                            prompt_lora_mapping: torch.Tensor, batches: int,
-                            max_seq_length: int, num_tokens: int,
-                            add_inputs: bool) -> None:
+def sgmv_expand_for_nslices(
+    nslices: int,
+    hidden_size: int,
+    inputs_tensor: torch.Tensor,
+    lora_weights_lst: list[torch.Tensor],
+    out_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    prompt_lora_mapping: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    num_tokens: int,
+    add_inputs: bool,
+) -> None:
     """
     Wrapper around torch_ops.sgmv_expand that handles any nslices.
     """
@@ -94,10 +106,17 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
 _dict_lock = Lock()
 
 
-def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
-                             hidden_size: int, nslices: int,
-                             dtype: torch.dtype, device: str, seq_length: int,
-                             scaling: float):
+def check_lora_shrink_kernel(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seq_length: int,
+    scaling: float,
+):
     """
     Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
     kernels.
@@ -116,14 +135,19 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
     max_seq_length, token_nums = data.meta()
 
     # Setup metadata information for SGMV and reference kernels
-    sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
-                      data.prompt_lora_mapping, batches, max_seq_length,
-                      token_nums)
+    sgmv_meta_args = (
+        data.b_seq_start_loc,
+        data.seq_len_tensor,
+        data.prompt_lora_mapping,
+        batches,
+        max_seq_length,
+        token_nums,
+    )
 
     # Setup metadata information for the LoRA kernel.
-    lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
-                                    max_num_tokens=token_nums,
-                                    device='cuda')
+    lora_meta = LoRAKernelMeta.make(
+        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
+    )
     lora_meta.prepare_tensors(data.token_lora_mapping)
 
     ref_out_tensor = data.ref_out_tensor
@@ -154,10 +178,17 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
     assert_close(out_tensor, ref_out_tensor)
 
 
-def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
-                             hidden_size: int, nslices: int,
-                             dtype: torch.dtype, device: str, seq_length: int,
-                             add_inputs: bool):
+def check_lora_expand_kernel(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seq_length: int,
+    add_inputs: bool,
+):
     """
     Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
     kernels.
@@ -177,14 +208,19 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
     max_seq_length, token_nums = data.meta()
 
     # Setup metadata information for SGMV and reference kernels
-    sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
-                      data.prompt_lora_mapping, batches, max_seq_length,
-                      token_nums)
+    sgmv_meta_args = (
+        data.b_seq_start_loc,
+        data.seq_len_tensor,
+        data.prompt_lora_mapping,
+        batches,
+        max_seq_length,
+        token_nums,
+    )
 
     # Setup metadata information for the LoRA kernel.
-    lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
-                                    max_num_tokens=token_nums,
-                                    device='cuda')
+    lora_meta = LoRAKernelMeta.make(
+        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
+    )
     lora_meta.prepare_tensors(data.token_lora_mapping)
 
     # Setup output tensors
@@ -194,21 +230,25 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
     with _dict_lock:
         # lora_expand kernel
         _LORA_B_PTR_DICT.clear()
-        triton_ops.lora_expand(data.inputs_tensor,
-                               data.lora_weights,
-                               out_tensor,
-                               *lora_meta.meta_args(token_nums=token_nums),
-                               offset_start=0,
-                               add_inputs=add_inputs)
+        triton_ops.lora_expand(
+            data.inputs_tensor,
+            data.lora_weights,
+            out_tensor,
+            *lora_meta.meta_args(token_nums=token_nums),
+            offset_start=0,
+            add_inputs=add_inputs,
+        )
 
     # Reference
-    sgmv_expand_for_nslices(nslices,
-                            hidden_size,
-                            data.inputs_tensor,
-                            data.lora_weights,
-                            ref_out_tensor,
-                            *sgmv_meta_args,
-                            add_inputs=add_inputs)
+    sgmv_expand_for_nslices(
+        nslices,
+        hidden_size,
+        data.inputs_tensor,
+        data.lora_weights,
+        ref_out_tensor,
+        *sgmv_meta_args,
+        add_inputs=add_inputs,
+    )
 
     assert_close(out_tensor, ref_out_tensor)
 
@@ -299,7 +339,7 @@ HIDDEN_SIZES = [
     128000,
     128256,
 ]
-#The size of TP
+# The size of TP
 divisibility = [1, 2, 8, 16, 64]
 
 all_hidden_size = []
@@ -331,10 +371,10 @@ DEVICES = [f"cuda:{0}"]
 SEED = [0]
 
 
-@pytest.mark.parametrize("batches", test_params['batches'])
-@pytest.mark.parametrize("num_loras", test_params['num_loras'])
-@pytest.mark.parametrize("rank", test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("batches", test_params["batches"])
+@pytest.mark.parametrize("num_loras", test_params["num_loras"])
+@pytest.mark.parametrize("rank", test_params["max_ranks"])
+@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
 @pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", DEVICES)
@@ -358,31 +398,35 @@ def test_kernels(
     current_platform.seed_everything(seed)
 
     if op_type == "shrink":
-        check_lora_shrink_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 scaling=0.5)
+        check_lora_shrink_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            scaling=0.5,
+        )
     else:
-        check_lora_expand_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 add_inputs=True)
-
-
-@pytest.mark.parametrize("batches", hs_test_params['batches'])
-@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
-@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+        check_lora_expand_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            add_inputs=True,
+        )
+
+
+@pytest.mark.parametrize("batches", hs_test_params["batches"])
+@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"])
+@pytest.mark.parametrize("rank", hs_test_params["max_ranks"])
+@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"])
 @pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", DEVICES)
@@ -406,22 +450,26 @@ def test_kernels_hidden_size(
     current_platform.seed_everything(seed)
 
     if op_type == "shrink":
-        check_lora_shrink_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 scaling=0.5)
+        check_lora_shrink_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            scaling=0.5,
+        )
     else:
-        check_lora_expand_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 add_inputs=True)
+        check_lora_expand_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            add_inputs=True,
+        )
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 2b54b2edd6a9c8dada49fd8aae7ab52ddf75ac9c..06e1b22ab56e53547d460cf341483bdfdc67a441 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -20,28 +20,27 @@ class ModelWithQuantization:
 
 
 MODELS: list[ModelWithQuantization]
-#AWQ quantization is currently not supported in ROCm.
+# AWQ quantization is currently not supported in ROCm.
 if current_platform.is_rocm():
     MODELS = [
         ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="gptq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
+        ),
     ]
 else:
     MODELS = [
         ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-            quantization="awq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
+        ),
         ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="gptq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
+        ),
     ]
 
 
-def do_sample(llm: vllm.LLM,
-              lora_path: str,
-              lora_id: int,
-              max_tokens: int = 256) -> list[str]:
+def do_sample(
+    llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
+) -> list[str]:
     raw_prompts = [
         "Give me an orange-ish brown color",
         "Give me a neon pink color",
@@ -52,14 +51,14 @@ def do_sample(llm: vllm.LLM,
 
     prompts = [format_prompt_tuples(p) for p in raw_prompts]
 
-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=max_tokens,
-                                          stop=["<|im_end|>"])
+    sampling_params = vllm.SamplingParams(
+        temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
+    )
     outputs = llm.generate(
         prompts,
         sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -72,18 +71,18 @@ def do_sample(llm: vllm.LLM,
 
 @pytest.mark.parametrize("model", MODELS)
 def test_quant_model_lora(tinyllama_lora_files, model):
-
     llm = vllm.LLM(
         model=model.model_path,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
         max_model_len=400,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
         enable_chunked_prefill=True,
-        tokenizer=tinyllama_lora_files)
+        tokenizer=tinyllama_lora_files,
+    )
 
     if model.quantization is None:
         expected_lora_output = [
@@ -104,11 +103,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
     def expect_match(output, expected_output):
         # HACK: GPTQ lora outputs are just incredibly unstable.
         # Assert that the outputs changed.
-        if (model.quantization == "gptq"
-                and expected_output is expected_lora_output):
+        if model.quantization == "gptq" and expected_output is expected_lora_output:
             for i, o in enumerate(output):
-                assert o.startswith(
-                    '#'), f"Expected example {i} to start with # but got {o}"
+                assert o.startswith("#"), (
+                    f"Expected example {i} to start with # but got {o}"
+                )
             return
         assert output == expected_output
 
@@ -116,17 +115,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
 
     print("lora adapter created")
     print("lora 1")
-    output = do_sample(llm,
-                       tinyllama_lora_files,
-                       lora_id=1,
-                       max_tokens=max_tokens)
+    output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
     expect_match(output, expected_lora_output)
 
     print("lora 2")
-    output = do_sample(llm,
-                       tinyllama_lora_files,
-                       lora_id=2,
-                       max_tokens=max_tokens)
+    output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
     expect_match(output, expected_lora_output)
 
     print("removing lora")
@@ -136,8 +129,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
 
 
 @pytest.mark.parametrize("model", MODELS)
-def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
-                                 model):
+def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
     if num_gpus_available < 2:
         pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
     if model.quantization == "gptq":
@@ -147,10 +139,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
-        enable_chunked_prefill=True)
+        enable_chunked_prefill=True,
+    )
     output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
 
     del llm_tp1
@@ -162,9 +155,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         max_num_seqs=16,
         max_loras=4,
         tensor_parallel_size=2,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
         quantization=model.quantization,
-        enable_chunked_prefill=True)
+        enable_chunked_prefill=True,
+    )
     output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
 
     del llm_tp2
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 76f3bc0ebf89f810cf9cdcaa6a625a2b3f49d931..1800ca107a4263e31d70ac4177b27c90a1bd51a1 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 
@@ -20,7 +19,7 @@ class TestConfig:
     max_loras: int = 2
     max_lora_rank: int = 16
     max_model_len: int = 4096
-    mm_processor_kwargs: Optional[dict[str, int]] = None
+    mm_processor_kwargs: dict[str, int] | None = None
 
     def __post_init__(self):
         if self.mm_processor_kwargs is None:
@@ -37,7 +36,8 @@ class Qwen2VLTester:
         "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
         "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
         "What is in the image?<|im_end|>\n"
-        "<|im_start|>assistant\n")
+        "<|im_start|>assistant\n"
+    )
 
     def __init__(self, config: TestConfig):
         self.config = config
@@ -56,68 +56,68 @@ class Qwen2VLTester:
             max_model_len=self.config.max_model_len,
         )
 
-    def run_test(self,
-                 images: list[ImageAsset],
-                 expected_outputs: list[str],
-                 lora_id: Optional[int] = None,
-                 temperature: float = 0,
-                 max_tokens: int = 5):
-
+    def run_test(
+        self,
+        images: list[ImageAsset],
+        expected_outputs: list[str],
+        lora_id: int | None = None,
+        temperature: float = 0,
+        max_tokens: int = 5,
+    ):
         sampling_params = vllm.SamplingParams(
             temperature=temperature,
             max_tokens=max_tokens,
         )
-        inputs = [{
-            "prompt": self.PROMPT_TEMPLATE,
-            "multi_modal_data": {
-                "image": asset.pil_image
-            },
-        } for asset in images]
-
-        lora_request = LoRARequest(str(lora_id), lora_id,
-                                   self.config.lora_path)
-        outputs = self.llm.generate(inputs,
-                                    sampling_params,
-                                    lora_request=lora_request)
-        generated_texts = [
-            output.outputs[0].text.strip() for output in outputs
+        inputs = [
+            {
+                "prompt": self.PROMPT_TEMPLATE,
+                "multi_modal_data": {"image": asset.pil_image},
+            }
+            for asset in images
         ]
 
+        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
+        generated_texts = [output.outputs[0].text.strip() for output in outputs]
+
         # Validate outputs
         for generated, expected in zip(generated_texts, expected_outputs):
-            assert expected.startswith(
-                generated), f"Generated text {generated} doesn't "
+            assert expected.startswith(generated), (
+                f"Generated text {generated} doesn't "
+            )
             f"match expected pattern {expected}"
 
-    def run_beam_search_test(self,
-                             images: list[ImageAsset],
-                             expected_outputs: list[list[str]],
-                             lora_id: Optional[int] = None,
-                             temperature: float = 0,
-                             beam_width: int = 2,
-                             max_tokens: int = 5):
-
-        beam_search_params = BeamSearchParams(beam_width=beam_width,
-                                              max_tokens=max_tokens,
-                                              temperature=temperature)
-
-        inputs = [{
-            "prompt": self.PROMPT_TEMPLATE,
-            "multi_modal_data": {
-                "image": asset.pil_image
-            },
-        } for asset in images]
-
-        lora_request = LoRARequest(str(lora_id), lora_id,
-                                   self.config.lora_path)
-        outputs = self.llm.beam_search(inputs,
-                                       beam_search_params,
-                                       lora_request=lora_request)
+    def run_beam_search_test(
+        self,
+        images: list[ImageAsset],
+        expected_outputs: list[list[str]],
+        lora_id: int | None = None,
+        temperature: float = 0,
+        beam_width: int = 2,
+        max_tokens: int = 5,
+    ):
+        beam_search_params = BeamSearchParams(
+            beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
+        )
+
+        inputs = [
+            {
+                "prompt": self.PROMPT_TEMPLATE,
+                "multi_modal_data": {"image": asset.pil_image},
+            }
+            for asset in images
+        ]
+
+        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        outputs = self.llm.beam_search(
+            inputs, beam_search_params, lora_request=lora_request
+        )
 
         for output_obj, expected_outs in zip(outputs, expected_outputs):
             output_texts = [seq.text for seq in output_obj.sequences]
-            assert output_texts == expected_outs, \
-                f"Generated texts {output_texts} do not match expected {expected_outs}"  # noqa: E501
+            assert output_texts == expected_outs, (
+                f"Generated texts {output_texts} do not match expected {expected_outs}"
+            )  # noqa: E501
 
 
 TEST_IMAGES = [
@@ -144,27 +144,25 @@ QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
 
 @pytest.mark.xfail(
     current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm")
+    reason="Qwen2-VL dependency xformers incompatible with ROCm",
+)
 def test_qwen2vl_lora(qwen2vl_lora_files):
     """Test Qwen 2.0 VL model with LoRA"""
-    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
-                        lora_path=qwen2vl_lora_files)
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
     tester = Qwen2VLTester(config)
 
     # Test with different LoRA IDs
     for lora_id in [1, 2]:
-        tester.run_test(TEST_IMAGES,
-                        expected_outputs=EXPECTED_OUTPUTS,
-                        lora_id=lora_id)
+        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
 
 
 @pytest.mark.xfail(
     current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm")
+    reason="Qwen2-VL dependency xformers incompatible with ROCm",
+)
 def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
     """Test Qwen 2.0 VL model with LoRA through beam search."""
-    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
-                        lora_path=qwen2vl_lora_files)
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
     tester = Qwen2VLTester(config)
 
     # Test with different LoRA IDs
@@ -176,7 +174,8 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
         tester.run_beam_search_test(
             [ImageAsset("cherry_blossom")],
             expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
-            lora_id=lora_id)
+            lora_id=lora_id,
+        )
 
 
 @pytest.mark.xfail(
@@ -185,12 +184,9 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
 )
 def test_qwen25vl_lora(qwen25vl_lora_files):
     """Test Qwen 2.5 VL model with LoRA"""
-    config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
-                        lora_path=qwen25vl_lora_files)
+    config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
     tester = Qwen2VLTester(config)
 
     # Test with different LoRA IDs
     for lora_id in [1, 2]:
-        tester.run_test(TEST_IMAGES,
-                        expected_outputs=EXPECTED_OUTPUTS,
-                        lora_id=lora_id)
+        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
diff --git a/tests/lora/test_qwen3moe_tp.py b/tests/lora/test_qwen3moe_tp.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcac4275cc40e23fb8d0ee0afcc4b1973c055c3f
--- /dev/null
+++ b/tests/lora/test_qwen3moe_tp.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+# NOTE To avoid overloading the CI pipeline, this test script will not
+# be triggered on CI and is primarily intended for local testing and verification.
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from ..utils import multi_gpu_test
+
+MODEL_PATH = "Qwen/Qwen3-30B-A3B"
+
+PROMPT_TEMPLATE = """<|im_start|>user
+I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
+"
+##Instruction:
+candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
+Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
+The People_ID of candidate is the foreign key of People_ID of people.
+
+
+###Input:
+{context}
+
+###Response:<|im_end|>
+<|im_start|>assistant"""  # noqa: E501
+
+EXPECTED_LORA_OUTPUT = [
+    "<think>\n\n</think>\n\nSELECT count(*) FROM candidate",
+    "<think>\n\n</think>\n\nSELECT count(*) FROM candidate",
+    "<think>\n\n</think>\n\nSELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
+    "<think>\n\n</think>\n\nSELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
+]
+
+
+def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
+    prompts = [
+        PROMPT_TEMPLATE.format(context="How many candidates are there?"),
+        PROMPT_TEMPLATE.format(context="Count the number of candidates."),
+        PROMPT_TEMPLATE.format(
+            context="Which poll resource provided the most number of candidate information?"  # noqa: E501
+        ),
+        PROMPT_TEMPLATE.format(
+            context="Return the poll resource associated with the most candidates."
+        ),
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
+    # Print the outputs.
+    generated_texts: list[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
+
+
+def test_qwen3moe_lora(qwen3moe_lora_files):
+    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
+    # Otherwise, the lora-test will fail due to CUDA OOM.
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+
+    generate_and_test(llm, qwen3moe_lora_files, lora_id=1)
+    generate_and_test(llm, qwen3moe_lora_files, lora_id=2)
+
+
+@multi_gpu_test(num_gpus=2)
+def test_qwen3moe_lora_tp2(qwen3moe_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+        tensor_parallel_size=2,
+    )
+
+    generate_and_test(llm, qwen3moe_lora_files, lora_id=1)
+    generate_and_test(llm, qwen3moe_lora_files, lora_id=2)
+
+
+@multi_gpu_test(num_gpus=4)
+def test_qwen3moe_lora_tp4(qwen3moe_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+        tensor_parallel_size=4,
+    )
+
+    generate_and_test(llm, qwen3moe_lora_files, lora_id=1)
+    generate_and_test(llm, qwen3moe_lora_files, lora_id=2)
diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py
index 6c93e577611f83adcac09729b41405fc6fed7761..9b5dedc4327fb6c40661581ab32e2398d4979c8a 100644
--- a/tests/lora/test_resolver.py
+++ b/tests/lora/test_resolver.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 
@@ -12,13 +11,15 @@ from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
 class DummyLoRAResolver(LoRAResolver):
     """A dummy LoRA resolver for testing."""
 
-    async def resolve_lora(self, base_model_name: str,
-                           lora_name: str) -> Optional[LoRARequest]:
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> LoRARequest | None:
         if lora_name == "test_lora":
             return LoRARequest(
                 lora_name=lora_name,
                 lora_path=f"/dummy/path/{base_model_name}/{lora_name}",
-                lora_int_id=abs(hash(lora_name)))
+                lora_int_id=abs(hash(lora_name)),
+            )
         return None
 
 
@@ -70,6 +71,5 @@ async def test_dummy_resolver_resolve():
     assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"
 
     # Test failed resolution
-    result = await dummy_resolver.resolve_lora(base_model_name,
-                                               "nonexistent_lora")
+    result = await dummy_resolver.resolve_lora(base_model_name, "nonexistent_lora")
     assert result is None
diff --git a/tests/lora/test_transformers_model.py b/tests/lora/test_transformers_model.py
index 723f7a54778feb1742ff7cfda444b376251cd27d..ea1f5f9c32c3f1ea2714253757b4322ac13f1abb 100644
--- a/tests/lora/test_transformers_model.py
+++ b/tests/lora/test_transformers_model.py
@@ -24,20 +24,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(
-            query=
-            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+            query="What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
         ),
         PROMPT_TEMPLATE.format(
-            query=
-            "What are all distinct countries where singers above age 20 are from?"  # noqa: E501
+            query="What are all distinct countries where singers above age 20 are from?"  # noqa: E501
         ),
     ]
     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
     outputs = llm.generate(
         prompts,
         sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -49,13 +47,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 
 def test_ilama_lora(ilama_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=16,
-                   trust_remote_code=True,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=16,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
 
     output1 = do_sample(llm, ilama_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -65,20 +65,23 @@ def test_ilama_lora(ilama_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=16,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=False,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=16,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=False,
+        enable_chunked_prefill=True,
+    )
 
     output1 = do_sample(llm, ilama_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -88,20 +91,23 @@ def test_ilama_lora_tp4(ilama_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=16,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=16,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=True,
+        enable_chunked_prefill=True,
+    )
     output1 = do_sample(llm, ilama_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
         assert output1[i] == EXPECTED_LORA_OUTPUT[i]
diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index b343bef0a920b55b7a295ffc93fa9d25c1212bf4..eb026c2ec0209874f4adca12e05a9cd21fc4eba7 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -2,15 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import OrderedDict
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 from unittest.mock import patch
 
 import pytest
 from huggingface_hub.utils import HfHubHTTPError
 from torch import nn
 
-from vllm.lora.utils import (get_adapter_absolute_path,
-                             parse_fine_tuned_lora_name, replace_submodule)
+from vllm.lora.utils import (
+    get_adapter_absolute_path,
+    parse_fine_tuned_lora_name,
+    replace_submodule,
+)
 from vllm.model_executor.models.utils import WeightsMapper
 
 
@@ -18,89 +21,85 @@ class LoRANameParserTestConfig(NamedTuple):
     name: str
     module_name: str
     is_lora_a: bool
-    is_bias: bool
-    weights_mapper: Optional[WeightsMapper] = None
+    weights_mapper: WeightsMapper | None = None
 
 
 def test_parse_fine_tuned_lora_name_valid():
     fixture = [
-        LoRANameParserTestConfig("base_model.model.lm_head.lora_A.weight",
-                                 "lm_head", True, False),
-        LoRANameParserTestConfig("base_model.model.lm_head.lora_B.weight",
-                                 "lm_head", False, False),
+        LoRANameParserTestConfig(
+            "base_model.model.lm_head.lora_A.weight", "lm_head", True, False
+        ),
+        LoRANameParserTestConfig(
+            "base_model.model.lm_head.lora_B.weight", "lm_head", False, False
+        ),
         LoRANameParserTestConfig(
             "base_model.model.model.embed_tokens.lora_embedding_A",
             "model.embed_tokens",
             True,
-            False,
         ),
         LoRANameParserTestConfig(
             "base_model.model.model.embed_tokens.lora_embedding_B",
             "model.embed_tokens",
             False,
-            False,
         ),
         LoRANameParserTestConfig(
             "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
             "model.layers.9.mlp.down_proj",
             True,
-            False,
         ),
         LoRANameParserTestConfig(
             "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
             "model.layers.9.mlp.down_proj",
             False,
-            False,
         ),
         LoRANameParserTestConfig(
             "language_model.layers.9.mlp.down_proj.lora_A.weight",
             "language_model.layers.9.mlp.down_proj",
             True,
-            False,
         ),
         LoRANameParserTestConfig(
             "language_model.layers.9.mlp.down_proj.lora_B.weight",
             "language_model.layers.9.mlp.down_proj",
             False,
-            False,
         ),
         # Test with WeightsMapper
         LoRANameParserTestConfig(
             "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
             "language_model.model.layers.9.mlp.down_proj",
             True,
-            False,
             weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
         ),
         LoRANameParserTestConfig(
             "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
             "language_model.model.layers.9.mlp.down_proj",
             False,
-            False,
             weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
         ),
         LoRANameParserTestConfig(
             "model.layers.9.mlp.down_proj.lora_A.weight",
             "language_model.model.layers.9.mlp.down_proj",
             True,
-            False,
             weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
         ),
         LoRANameParserTestConfig(
             "model.layers.9.mlp.down_proj.lora_B.weight",
             "language_model.model.layers.9.mlp.down_proj",
             False,
-            False,
             weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
         ),
     ]
-    for name, module_name, is_lora_a, is_bias, weights_mapper in fixture:
-        assert (module_name, is_lora_a,
-                is_bias) == parse_fine_tuned_lora_name(name, weights_mapper)
+    for name, module_name, is_lora_a, weights_mapper in fixture:
+        assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(
+            name, weights_mapper
+        )
 
 
 def test_parse_fine_tuned_lora_name_invalid():
@@ -115,22 +114,28 @@ def test_parse_fine_tuned_lora_name_invalid():
 
 def test_replace_submodule():
     model = nn.Sequential(
-        OrderedDict([
-            ("dense1", nn.Linear(764, 100)),
-            ("act1", nn.ReLU()),
-            ("dense2", nn.Linear(100, 50)),
-            (
-                "seq1",
-                nn.Sequential(
-                    OrderedDict([
-                        ("dense1", nn.Linear(100, 10)),
-                        ("dense2", nn.Linear(10, 50)),
-                    ])),
-            ),
-            ("act2", nn.ReLU()),
-            ("output", nn.Linear(50, 10)),
-            ("outact", nn.Sigmoid()),
-        ]))
+        OrderedDict(
+            [
+                ("dense1", nn.Linear(764, 100)),
+                ("act1", nn.ReLU()),
+                ("dense2", nn.Linear(100, 50)),
+                (
+                    "seq1",
+                    nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("dense1", nn.Linear(100, 10)),
+                                ("dense2", nn.Linear(10, 50)),
+                            ]
+                        )
+                    ),
+                ),
+                ("act2", nn.ReLU()),
+                ("output", nn.Linear(50, 10)),
+                ("outact", nn.Sigmoid()),
+            ]
+        )
+    )
 
     sigmoid = nn.Sigmoid()
 
@@ -143,52 +148,51 @@ def test_replace_submodule():
 
 
 # Unit tests for get_adapter_absolute_path
-@patch('os.path.isabs')
+@patch("os.path.isabs")
 def test_get_adapter_absolute_path_absolute(mock_isabs):
-    path = '/absolute/path/to/lora'
+    path = "/absolute/path/to/lora"
     mock_isabs.return_value = True
     assert get_adapter_absolute_path(path) == path
 
 
-@patch('os.path.expanduser')
+@patch("os.path.expanduser")
 def test_get_adapter_absolute_path_expanduser(mock_expanduser):
     # Path with ~ that needs to be expanded
-    path = '~/relative/path/to/lora'
-    absolute_path = '/home/user/relative/path/to/lora'
+    path = "~/relative/path/to/lora"
+    absolute_path = "/home/user/relative/path/to/lora"
     mock_expanduser.return_value = absolute_path
     assert get_adapter_absolute_path(path) == absolute_path
 
 
-@patch('os.path.exists')
-@patch('os.path.abspath')
+@patch("os.path.exists")
+@patch("os.path.abspath")
 def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
     # Relative path that exists locally
-    path = 'relative/path/to/lora'
-    absolute_path = '/absolute/path/to/lora'
+    path = "relative/path/to/lora"
+    absolute_path = "/absolute/path/to/lora"
     mock_exist.return_value = True
     mock_abspath.return_value = absolute_path
     assert get_adapter_absolute_path(path) == absolute_path
 
 
-@patch('huggingface_hub.snapshot_download')
-@patch('os.path.exists')
-def test_get_adapter_absolute_path_huggingface(mock_exist,
-                                               mock_snapshot_download):
+@patch("huggingface_hub.snapshot_download")
+@patch("os.path.exists")
+def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download):
     # Hugging Face model identifier
-    path = 'org/repo'
-    absolute_path = '/mock/snapshot/path'
+    path = "org/repo"
+    absolute_path = "/mock/snapshot/path"
     mock_exist.return_value = False
     mock_snapshot_download.return_value = absolute_path
     assert get_adapter_absolute_path(path) == absolute_path
 
 
-@patch('huggingface_hub.snapshot_download')
-@patch('os.path.exists')
-def test_get_adapter_absolute_path_huggingface_error(mock_exist,
-                                                     mock_snapshot_download):
+@patch("huggingface_hub.snapshot_download")
+@patch("os.path.exists")
+def test_get_adapter_absolute_path_huggingface_error(
+    mock_exist, mock_snapshot_download
+):
     # Hugging Face model identifier with download error
-    path = 'org/repo'
+    path = "org/repo"
     mock_exist.return_value = False
-    mock_snapshot_download.side_effect = HfHubHTTPError(
-        "failed to query model info")
+    mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
     assert get_adapter_absolute_path(path) == path
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 9c47abf8f4dceb8112841e2e2fb233670293a3e6..c97f8debd1b9a9fdfe9c42f4a3a7822aa470c62f 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,8 +6,14 @@ import random
 import tempfile
 from unittest.mock import patch
 
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
 from vllm.lora.models import LoRAMapping
@@ -19,12 +25,12 @@ NUM_LORAS = 16
 
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
-
     def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
         lora_mapping = LoRAMapping([], [])
 
         worker.model_runner.lora_manager.set_active_adapters(
-            lora_requests, lora_mapping)
+            lora_requests, lora_mapping
+        )
 
     vllm_config = VllmConfig(
         model_config=ModelConfig(
@@ -49,9 +55,9 @@ def test_worker_apply_lora(sql_lora_files):
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8,
-                               max_cpu_loras=NUM_LORAS,
-                               max_loras=NUM_LORAS),
+        lora_config=LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
+        ),
     )
     worker = Worker(
         vllm_config=vllm_config,
@@ -67,23 +73,22 @@ def test_worker_apply_lora(sql_lora_files):
     assert worker.list_loras() == set()
 
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files)
-        for i in range(NUM_LORAS)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS)
     ]
 
     set_active_loras(worker, lora_requests)
     assert worker.list_loras() == {
-        lora_request.lora_int_id
-        for lora_request in lora_requests
+        lora_request.lora_int_id for lora_request in lora_requests
     }
 
     for i in range(NUM_LORAS):
         random.seed(i)
-        iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, NUM_LORAS))
+        iter_lora_requests = random.choices(
+            lora_requests, k=random.randint(1, NUM_LORAS)
+        )
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
+        iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
-            {lora_request.lora_int_id
-             for lora_request in iter_lora_requests})
+            {lora_request.lora_int_id for lora_request in iter_lora_requests}
+        )
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index 0432a1a9bba07e6b055ee4efd7a7d59aace0c756..d30b77f0946654f618eefa49d1951c50af45a03f 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -4,7 +4,6 @@
 import json
 import os
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 from safetensors.torch import save_file
@@ -13,7 +12,6 @@ from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
 
 
 class DummyLoRAManager:
-
     def __init__(self, device: torch.device = "cuda:0"):
         super().__init__()
         self._loras: dict[str, LoRALayerWeights] = {}
@@ -36,12 +34,12 @@ class DummyLoRAManager:
             module_name,
             rank=rank,
             lora_alpha=1,
-            lora_a=torch.rand([rank, weight.shape[1]],
-                              dtype=weight.dtype,
-                              device=self._device),
-            lora_b=torch.rand([weight.shape[0], rank],
-                              dtype=weight.dtype,
-                              device=self._device),
+            lora_a=torch.rand(
+                [rank, weight.shape[1]], dtype=weight.dtype, device=self._device
+            ),
+            lora_b=torch.rand(
+                [weight.shape[0], rank], dtype=weight.dtype, device=self._device
+            ),
         )
         if generate_embeddings_tensor:
             lora.embeddings_tensor = torch.rand(
@@ -82,7 +80,7 @@ class DummyLoRAManager:
         module_name: str,
         input_dim: int,
         output_dims: list[int],
-        noop_lora_index: Optional[list[int]] = None,
+        noop_lora_index: list[int] | None = None,
         rank: int = 8,
     ):
         base_loras: list[LoRALayerWeights] = []
@@ -114,7 +112,7 @@ def assert_close(a, b):
 @dataclass
 class PunicaTensors:
     inputs_tensor: torch.Tensor
-    lora_weights: Union[torch.Tensor, list[torch.Tensor]]
+    lora_weights: torch.Tensor | list[torch.Tensor]
     our_out_tensor: torch.Tensor
     ref_out_tensor: torch.Tensor
     b_seq_start_loc: torch.Tensor
@@ -146,27 +144,26 @@ def generate_data(
     op_type,
     device,
 ) -> PunicaTensors:
-    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
-                                   (batches, )).to(device)
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
     b_seq_start_loc = torch.cumsum(
         torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
         dim=0,
     ).to(device)
     total_tokens = seq_len_tensor.sum()
     if op_type == "shrink":
-        inputs_tensor = torch.rand((total_tokens, hidden_size),
-                                   dtype=dtype).to(device)
+        inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
         lora_weights = torch.rand(
             (lora_nums, max_rank, hidden_size),  # col-major
             dtype=dtype,
         ).to(device)
         # shrink op need atomic_add, so output is initinized by 0
-        ref_out_tensor = torch.zeros((total_tokens, max_rank),
-                                     dtype=dtype,
-                                     device=inputs_tensor.device)
+        ref_out_tensor = torch.zeros(
+            (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device
+        )
         # NOTE  shrink kernel using torch.float32 as output type
-        our_out_tensor = torch.zeros((total_tokens, max_rank),
-                                     dtype=torch.float32).to(device)
+        our_out_tensor = torch.zeros((total_tokens, max_rank), dtype=torch.float32).to(
+            device
+        )
     else:
         inputs_tensor = torch.rand(
             (total_tokens, max_rank),
@@ -184,15 +181,16 @@ def generate_data(
         ).to(device)
         # Ensure the same input.
         our_out_tensor = ref_out_tensor.clone()
-    lora_indices_tensor = torch.randint(0,
-                                        lora_nums - 1 if lora_nums > 1 else 1,
-                                        (batches, )).to(device)
+    lora_indices_tensor = torch.randint(
+        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
+    ).to(device)
     indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
     current_offset = 0
     for b_id in range(batches):
         lora_index = lora_indices_tensor[b_id]
-        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]].copy_(lora_index)
+        indices[current_offset : current_offset + seq_len_tensor[b_id]].copy_(
+            lora_index
+        )
         current_offset += seq_len_tensor[b_id].item()
 
     return PunicaTensors(
@@ -217,8 +215,7 @@ def generate_data_for_expand_nslices(
     nslices,
     device,
 ) -> PunicaTensors:
-    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
-                                   (batches, )).to(device)
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
     b_seq_start_loc = torch.cumsum(
         torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
         dim=0,
@@ -234,22 +231,25 @@ def generate_data_for_expand_nslices(
             torch.rand(
                 (lora_nums, hidden_size, max_rank),  # col-major
                 dtype=dtype,
-            ).to(device))
+            ).to(device)
+        )
     # expand op needs to complete y+=a@lora_b, so output is
     # initinized randomly
-    ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
-                                dtype=dtype).to(device)
+    ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), dtype=dtype).to(
+        device
+    )
     # Ensure the same input.
     our_out_tensor = ref_out_tensor.clone()
-    lora_indices_tensor = torch.randint(0,
-                                        lora_nums - 1 if lora_nums > 1 else 1,
-                                        (batches, ))
+    lora_indices_tensor = torch.randint(
+        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
+    )
     indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
     current_offset = 0
     for b_id in range(batches):
         lora_index = lora_indices_tensor[b_id]
-        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = (lora_index.item())
+        indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
+            lora_index.item()
+        )
         current_offset += seq_len_tensor[b_id].item()
 
     lora_indices_tensor = lora_indices_tensor.to(device)
@@ -276,8 +276,7 @@ def generate_data_for_nslices(
     op_type,
     device,
 ) -> PunicaTensors:
-    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
-                                   (batches, )).to(device)
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
     b_seq_start_loc = torch.cumsum(
         torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
         dim=0,
@@ -286,9 +285,7 @@ def generate_data_for_nslices(
 
     lora_weights_lst = []
     if op_type == "shrink":
-
-        inputs_tensor = torch.rand((total_tokens, hidden_size),
-                                   dtype=dtype).to(device)
+        inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
 
         for _ in range(nslices):
             if op_type == "shrink":
@@ -296,7 +293,8 @@ def generate_data_for_nslices(
                     torch.rand(
                         (lora_nums, max_rank, hidden_size),  # col-major
                         dtype=dtype,
-                    ).to(device))
+                    ).to(device)
+                )
         # NOTE  shrink kernel using torch.float32 as output type
         # shrink op need atomic_add, so output is initinized by 0
         our_out_tensor = torch.zeros(
@@ -313,23 +311,26 @@ def generate_data_for_nslices(
                 torch.rand(
                     (lora_nums, hidden_size, max_rank),  # col-major
                     dtype=dtype,
-                ).to(device))
+                ).to(device)
+            )
         # expand op needs to complete y+=a@lora_b, so output is
         # initinized randomly
-        our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
-                                    dtype=dtype).to(device)
+        our_out_tensor = torch.rand(
+            (total_tokens, hidden_size * nslices), dtype=dtype
+        ).to(device)
 
     # Ensure the same input.
     ref_out_tensor = our_out_tensor.clone()
-    lora_indices_tensor = torch.randint(0,
-                                        lora_nums - 1 if lora_nums > 1 else 1,
-                                        (batches, ))
+    lora_indices_tensor = torch.randint(
+        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
+    )
     indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
     current_offset = 0
     for b_id in range(batches):
         lora_index = lora_indices_tensor[b_id]
-        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = (lora_index.item())
+        indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
+            lora_index.item()
+        )
         current_offset += seq_len_tensor[b_id].item()
 
     lora_indices_tensor = lora_indices_tensor.to(device)
@@ -379,24 +380,20 @@ def create_peft_lora(
     }
 
     for module_name in target_modules:
-
         module = model
         for attr in module_name.split("."):
             module = getattr(module, attr)
 
         if hasattr(module, "input_size") and hasattr(module, "output_size"):
-
             in_features = module.input_size
             out_features = module.output_size
 
-        elif hasattr(module, "embedding_dim") and hasattr(
-                module, "num_embeddings"):
+        elif hasattr(module, "embedding_dim") and hasattr(module, "num_embeddings"):
             # ParallelLMHead
             in_features = module.embedding_dim
             out_features = module.num_embeddings
         else:
-            raise ValueError(
-                f"Unable to determine dimensions for module {module_name}")
+            raise ValueError(f"Unable to determine dimensions for module {module_name}")
 
         lora_A = torch.randn(rank, in_features, dtype=lora_dtype)
 
diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py
deleted file mode 100644
index c6d89d849e9f9f24785aa2d44f145f209d3364e1..0000000000000000000000000000000000000000
--- a/tests/model_executor/conftest.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-
-@pytest.fixture
-def sample_regex():
-    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-
-@pytest.fixture
-def sample_json_schema():
-    return {
-        "type": "object",
-        "properties": {
-            "name": {
-                "type": "string"
-            },
-            "age": {
-                "type": "integer"
-            },
-            "skills": {
-                "type": "array",
-                "items": {
-                    "type": "string",
-                    "maxLength": 10
-                },
-                "minItems": 3
-            },
-            "work_history": {
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "company": {
-                            "type": "string"
-                        },
-                        "duration": {
-                            "type": "number"
-                        },
-                        "position": {
-                            "type": "string"
-                        }
-                    },
-                    "required": ["company", "position"]
-                }
-            }
-        },
-        "required": ["name", "age", "skills", "work_history"]
-    }
diff --git a/tests/mistral_tool_use/__init__.py b/tests/model_executor/model_loader/fastsafetensors_loader/__init__.py
similarity index 100%
rename from tests/mistral_tool_use/__init__.py
rename to tests/model_executor/model_loader/fastsafetensors_loader/__init__.py
diff --git a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py
similarity index 79%
rename from tests/fastsafetensors_loader/test_fastsafetensors_loader.py
rename to tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py
index afd411ff4874ec29eb756abfe16046ffe4633b38..f154df6dfc2320b664f85aea74e3d03d3c68dd17 100644
--- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
+++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 from vllm import SamplingParams
+from vllm.platforms import current_platform
 
 test_model = "openai-community/gpt2"
 
@@ -15,6 +18,9 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs"
+)
 def test_model_loader_download_files(vllm_runner):
     with vllm_runner(test_model, load_format="fastsafetensors") as llm:
         deserialized_outputs = llm.generate(prompts, sampling_params)
diff --git a/tests/fastsafetensors_loader/test_weight_utils.py b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py
similarity index 61%
rename from tests/fastsafetensors_loader/test_weight_utils.py
rename to tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py
index 78d23acfec7c564e94e88c0630bf3de2e64f422b..bd216f0e41a472e2786f13eaf90f5831f8ccde86 100644
--- a/tests/fastsafetensors_loader/test_weight_utils.py
+++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py
@@ -5,27 +5,33 @@ import glob
 import tempfile
 
 import huggingface_hub.constants
+import pytest
 import torch
 
 from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf, fastsafetensors_weights_iterator,
-    safetensors_weights_iterator)
+    download_weights_from_hf,
+    fastsafetensors_weights_iterator,
+    safetensors_weights_iterator,
+)
+from vllm.platforms import current_platform
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs"
+)
 def test_fastsafetensors_model_loader():
     with tempfile.TemporaryDirectory() as tmpdir:
         huggingface_hub.constants.HF_HUB_OFFLINE = False
-        download_weights_from_hf("openai-community/gpt2",
-                                 allow_patterns=["*.safetensors"],
-                                 cache_dir=tmpdir)
+        download_weights_from_hf(
+            "openai-community/gpt2", allow_patterns=["*.safetensors"], cache_dir=tmpdir
+        )
         safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
         assert len(safetensors) > 0
 
         fastsafetensors_tensors = {}
         hf_safetensors_tensors = {}
 
-        for name, tensor in fastsafetensors_weights_iterator(
-                safetensors, True):
+        for name, tensor in fastsafetensors_weights_iterator(safetensors, True):
             fastsafetensors_tensors[name] = tensor
 
         for name, tensor in safetensors_weights_iterator(safetensors, True):
@@ -34,13 +40,10 @@ def test_fastsafetensors_model_loader():
         assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)
 
         for name, fastsafetensors_tensor in fastsafetensors_tensors.items():
-            fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
-            assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
-                name].dtype
-            assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
-                name].shape
-            assert torch.all(
-                fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
+            fastsafetensors_tensor = fastsafetensors_tensor.to("cpu")
+            assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[name].dtype
+            assert fastsafetensors_tensor.shape == hf_safetensors_tensors[name].shape
+            assert torch.all(fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
 
 
 if __name__ == "__main__":
diff --git a/tests/runai_model_streamer_test/__init__.py b/tests/model_executor/model_loader/runai_model_streamer/__init__.py
similarity index 100%
rename from tests/runai_model_streamer_test/__init__.py
rename to tests/model_executor/model_loader/runai_model_streamer/__init__.py
diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
similarity index 61%
rename from tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
rename to tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
index 22bdb3b44eb03fbc802d6404128f4725a6fc574f..1119d0de1c8b85d4e27c0da1591d8b21c1ed2429 100644
--- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
+++ b/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 from vllm import SamplingParams
 from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader import get_model_loader
 
 load_format = "runai_streamer"
 test_model = "openai-community/gpt2"
+# TODO(amacaskill): Replace with a GKE owned GCS bucket.
+test_gcs_model = "gs://vertex-model-garden-public-us/codegemma/codegemma-2b/"
 
 prompts = [
     "Hello, my name is",
@@ -32,3 +36,16 @@ def test_runai_model_loader_download_files(vllm_runner):
     with vllm_runner(test_model, load_format=load_format) as llm:
         deserialized_outputs = llm.generate(prompts, sampling_params)
         assert deserialized_outputs
+
+
+def test_runai_model_loader_download_files_gcs(
+    vllm_runner, monkeypatch: pytest.MonkeyPatch
+):
+    monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
+    monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
+    monkeypatch.setenv(
+        "CLOUD_STORAGE_EMULATOR_ENDPOINT", "https://storage.googleapis.com"
+    )
+    with vllm_runner(test_gcs_model, load_format=load_format) as llm:
+        deserialized_outputs = llm.generate(prompts, sampling_params)
+        assert deserialized_outputs
diff --git a/tests/model_executor/model_loader/runai_model_streamer/test_runai_utils.py b/tests/model_executor/model_loader/runai_model_streamer/test_runai_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ad7308eeba24259e5115e8be95eac3bfba09fb5
--- /dev/null
+++ b/tests/model_executor/model_loader/runai_model_streamer/test_runai_utils.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import glob
+import hashlib
+import os
+import tempfile
+
+import huggingface_hub.constants
+
+from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
+from vllm.transformers_utils.runai_utils import (
+    ObjectStorageModel,
+    is_runai_obj_uri,
+    list_safetensors,
+)
+
+
+def test_is_runai_obj_uri():
+    assert is_runai_obj_uri("gs://some-gcs-bucket/path")
+    assert is_runai_obj_uri("s3://some-s3-bucket/path")
+    assert not is_runai_obj_uri("nfs://some-nfs-path")
+
+
+def test_runai_list_safetensors_local():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        huggingface_hub.constants.HF_HUB_OFFLINE = False
+        download_weights_from_hf(
+            "openai-community/gpt2",
+            allow_patterns=["*.safetensors", "*.json"],
+            cache_dir=tmpdir,
+        )
+        safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
+        assert len(safetensors) > 0
+        parentdir = [os.path.dirname(safetensor) for safetensor in safetensors][0]
+        files = list_safetensors(parentdir)
+        assert len(safetensors) == len(files)
+
+
+def test_runai_pull_files_gcs(monkeypatch):
+    monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
+    # Bypass default project lookup by setting GOOGLE_CLOUD_PROJECT
+    monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
+    filename = "LT08_L1GT_074061_20130309_20170505_01_T2_MTL.txt"
+    gcs_bucket = "gs://gcp-public-data-landsat/LT08/01/074/061/LT08_L1GT_074061_20130309_20170505_01_T2/"
+    gcs_url = f"{gcs_bucket}/{filename}"
+    model = ObjectStorageModel(gcs_url)
+    model.pull_files(gcs_bucket, allow_pattern=[f"*{filename}"])
+    # To re-generate / change URLs:
+    #   gsutil ls -L gs://<gcs-url> | grep "Hash (md5)" | tr -d ' ' \
+    #     | cut -d":" -f2 | base64 -d | xxd -p
+    expected_checksum = "f60dea775da1392434275b311b31a431"
+    hasher = hashlib.new("md5")
+    with open(os.path.join(model.dir, filename), "rb") as f:
+        # Read the file in chunks to handle large files efficiently
+        for chunk in iter(lambda: f.read(4096), b""):
+            hasher.update(chunk)
+    actual_checksum = hasher.hexdigest()
+    assert actual_checksum == expected_checksum
diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/model_executor/model_loader/runai_model_streamer/test_weight_utils.py
similarity index 76%
rename from tests/runai_model_streamer_test/test_weight_utils.py
rename to tests/model_executor/model_loader/runai_model_streamer/test_weight_utils.py
index ee448c2ccb213fb8f6c6709f862fa9c2aeb1eb47..03691b4a472f1e8a872c51f4520a7e3d995a1009 100644
--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/model_executor/model_loader/runai_model_streamer/test_weight_utils.py
@@ -8,24 +8,25 @@ import huggingface_hub.constants
 import torch
 
 from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf, runai_safetensors_weights_iterator,
-    safetensors_weights_iterator)
+    download_weights_from_hf,
+    runai_safetensors_weights_iterator,
+    safetensors_weights_iterator,
+)
 
 
 def test_runai_model_loader():
     with tempfile.TemporaryDirectory() as tmpdir:
         huggingface_hub.constants.HF_HUB_OFFLINE = False
-        download_weights_from_hf("openai-community/gpt2",
-                                 allow_patterns=["*.safetensors"],
-                                 cache_dir=tmpdir)
+        download_weights_from_hf(
+            "openai-community/gpt2", allow_patterns=["*.safetensors"], cache_dir=tmpdir
+        )
         safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
         assert len(safetensors) > 0
 
         runai_model_streamer_tensors = {}
         hf_safetensors_tensors = {}
 
-        for name, tensor in runai_safetensors_weights_iterator(
-                safetensors, True):
+        for name, tensor in runai_safetensors_weights_iterator(safetensors, True):
             runai_model_streamer_tensors[name] = tensor
 
         for name, tensor in safetensors_weights_iterator(safetensors, True):
diff --git a/tests/tensorizer_loader/__init__.py b/tests/model_executor/model_loader/tensorizer_loader/__init__.py
similarity index 100%
rename from tests/tensorizer_loader/__init__.py
rename to tests/model_executor/model_loader/tensorizer_loader/__init__.py
diff --git a/tests/tensorizer_loader/conftest.py b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
similarity index 78%
rename from tests/tensorizer_loader/conftest.py
rename to tests/model_executor/model_loader/tensorizer_loader/conftest.py
index 571dc2e0eb50f83f78224782e1f46e189ae25c13..826ecec71e6cf2ad5fe41a1a670d077089e5b386 100644
--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable
+from collections.abc import Callable
 
 import pytest
 
@@ -8,9 +8,9 @@ from vllm import LLM, EngineArgs
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader import tensorizer as tensorizer_mod
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.v1.executor.abstract import UniProcExecutor
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.v1.executor import UniProcExecutor
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 MODEL_REF = "facebook/opt-125m"
 
@@ -32,7 +32,6 @@ def cleanup():
 
 @pytest.fixture()
 def just_serialize_model_tensors(model_ref, monkeypatch, tmp_path):
-
     def noop(*args, **kwargs):
         return None
 
@@ -56,8 +55,7 @@ def model_path(model_ref, tmp_path):
     yield tmp_path / model_ref / "model.tensors"
 
 
-def assert_from_collective_rpc(engine: LLM, closure: Callable,
-                               closure_kwargs: dict):
+def assert_from_collective_rpc(engine: LLM, closure: Callable, closure_kwargs: dict):
     res = engine.collective_rpc(method=closure, kwargs=closure_kwargs)
     return all(res)
 
@@ -67,18 +65,13 @@ def assert_from_collective_rpc(engine: LLM, closure: Callable,
 # method. It's purely used as a dummy utility to run methods that test
 # Tensorizer functionality
 class DummyExecutor(UniProcExecutor):
-
     def _init_executor(self) -> None:
-        """Initialize the worker and load the model.
-        """
-        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
-                                               rpc_rank=0)
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
+        """Initialize the worker and load the model."""
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
+        distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
         local_rank = 0
         # set local rank as the device index if specified
-        device_info = self.vllm_config.device_config.device.__str__().split(
-            ":")
+        device_info = self.vllm_config.device_config.device.__str__().split(":")
         if len(device_info) > 1:
             local_rank = int(device_info[1])
         rank = 0
@@ -91,7 +84,7 @@ class DummyExecutor(UniProcExecutor):
             is_driver_worker=is_driver_worker,
         )
         self.mm_receiver_cache = None
-        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_worker", args=([kwargs],))
         self.collective_rpc("init_device")
 
     @property
@@ -99,5 +92,5 @@ class DummyExecutor(UniProcExecutor):
         return 2
 
     def shutdown(self):
-        if hasattr(self, 'thread_pool'):
+        if hasattr(self, "thread_pool"):
             self.thread_pool.shutdown(wait=False)
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
similarity index 70%
rename from tests/tensorizer_loader/test_tensorizer.py
rename to tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
index e00d7c2f80c67b4babfe279d3a385acabebd5f3b..ed5129e1c82060615dca9039fef2610076447b55 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
@@ -14,20 +14,21 @@ import pytest
 import torch
 
 import vllm.model_executor.model_loader.tensorizer
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-# yapf: disable
-from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
-                                                         TensorSerializer,
-                                                         is_vllm_tensorized,
-                                                         open_stream,
-                                                         tensorize_vllm_model)
+from vllm.model_executor.model_loader.tensorizer import (
+    TensorizerConfig,
+    TensorSerializer,
+    is_vllm_tensorized,
+    open_stream,
+    tensorize_vllm_model,
+)
 from vllm.model_executor.model_loader.tensorizer_loader import (
-    BLACKLISTED_TENSORIZER_ARGS)
-# yapf: enable
-from vllm.utils import PlaceholderModule
+    BLACKLISTED_TENSORIZER_ARGS,
+)
+from vllm.utils.import_utils import PlaceholderModule
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
 from .conftest import DummyExecutor, assert_from_collective_rpc
 
 try:
@@ -44,7 +45,7 @@ class TensorizerCaughtError(Exception):
 
 EXAMPLES_PATH = VLLM_PATH / "examples"
 
-pytest_plugins = "pytest_asyncio",
+pytest_plugins = ("pytest_asyncio",)
 
 prompts = [
     "Hello, my name is",
@@ -56,8 +57,7 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
 
 
-def patch_init_and_catch_error(self, obj, method_name,
-                               expected_error: type[Exception]):
+def patch_init_and_catch_error(self, obj, method_name, expected_error: type[Exception]):
     original = getattr(obj, method_name, None)
     if original is None:
         raise ValueError("Method '{}' not found.".format(method_name))
@@ -80,17 +80,19 @@ def assert_specific_tensorizer_error_is_raised(
     expected_error: type[Exception],
 ):
     with pytest.raises(TensorizerCaughtError):
-        executor.collective_rpc(patch_init_and_catch_error,
-                                args=(
-                                    obj,
-                                    method_name,
-                                    expected_error,
-                                ))
+        executor.collective_rpc(
+            patch_init_and_catch_error,
+            args=(
+                obj,
+                method_name,
+                expected_error,
+            ),
+        )
 
 
 def is_curl_installed():
     try:
-        subprocess.check_call(['curl', '--version'])
+        subprocess.check_call(["curl", "--version"])
         return True
     except (subprocess.CalledProcessError, FileNotFoundError):
         return False
@@ -99,13 +101,14 @@ def is_curl_installed():
 def write_keyfile(keyfile_path: str):
     encryption_params = EncryptionParams.random()
     pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True)
-    with open(keyfile_path, 'wb') as f:
+    with open(keyfile_path, "wb") as f:
         f.write(encryption_params.key)
 
 
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
-        model_ref, vllm_runner, tmp_path, model_path):
+    model_ref, vllm_runner, tmp_path, model_path
+):
     args = EngineArgs(model=model_ref)
     with vllm_runner(model_ref) as vllm_model:
         key_path = tmp_path / model_ref / "model.key"
@@ -113,29 +116,30 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
 
         outputs = vllm_model.generate(prompts, sampling_params)
 
-    config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path),
-                                              encryption_keyfile=str(key_path))
+    config_for_serializing = TensorizerConfig(
+        tensorizer_uri=str(model_path), encryption_keyfile=str(key_path)
+    )
 
     tensorize_vllm_model(args, config_for_serializing)
 
     config_for_deserializing = TensorizerConfig(
-        tensorizer_uri=str(model_path), encryption_keyfile=str(key_path))
-
-    with vllm_runner(model_ref,
-                     load_format="tensorizer",
-                     model_loader_extra_config=config_for_deserializing
-                     ) as loaded_vllm_model:  # noqa: E501
+        tensorizer_uri=str(model_path), encryption_keyfile=str(key_path)
+    )
 
-        deserialized_outputs = loaded_vllm_model.generate(
-            prompts, sampling_params)
+    with vllm_runner(
+        model_ref,
+        load_format="tensorizer",
+        model_loader_extra_config=config_for_deserializing,
+    ) as loaded_vllm_model:  # noqa: E501
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
         # noqa: E501
 
         assert outputs == deserialized_outputs
 
 
-def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
-                                                tmp_path, model_ref,
-                                                model_path):
+def test_deserialized_hf_model_has_same_outputs(
+    hf_runner, vllm_runner, tmp_path, model_ref, model_path
+):
     with hf_runner(model_ref) as hf_model:
         max_tokens = 50
         outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
@@ -143,14 +147,17 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
             serializer = TensorSerializer(stream)
             serializer.write_module(hf_model.model)
 
-    with vllm_runner(model_ref,
-                     load_format="tensorizer",
-                     model_loader_extra_config=TensorizerConfig(
-                         tensorizer_uri=str(model_path),
-                         num_readers=1,
-                     )) as loaded_hf_model:
+    with vllm_runner(
+        model_ref,
+        load_format="tensorizer",
+        model_loader_extra_config=TensorizerConfig(
+            tensorizer_uri=str(model_path),
+            num_readers=1,
+        ),
+    ) as loaded_hf_model:
         deserialized_outputs = loaded_hf_model.generate_greedy(
-            prompts, max_tokens=max_tokens)
+            prompts, max_tokens=max_tokens
+        )
 
         assert outputs == deserialized_outputs
 
@@ -159,35 +166,37 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref):
     model = None
     try:
         model = vllm_runner(
-            model_ref,
-            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+            model_ref, model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")
+        )
         pytest.fail("Expected RuntimeError for extra config keys")
     except RuntimeError:
         out, err = capfd.readouterr()
         combined_output = out + err
-        assert ("ValueError: Unexpected extra config keys for load "
-                "format auto") in combined_output
+        assert (
+            "ValueError: Unexpected extra config keys for load format auto"
+        ) in combined_output
     finally:
         del model
         gc.collect()
         torch.cuda.empty_cache()
 
 
-def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd,
-                                                  model_ref):
+def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, model_ref):
     model = None
     try:
         model = vllm_runner(
             model_ref,
             load_format="safetensors",
-            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"),
+        )
         pytest.fail("Expected RuntimeError for extra config keys")
     except RuntimeError:
         out, err = capfd.readouterr()
 
         combined_output = out + err
-        assert ("ValueError: Unexpected extra config keys "
-                "for load format safetensors") in combined_output
+        assert (
+            "ValueError: Unexpected extra config keys for load format safetensors"
+        ) in combined_output
     finally:
         del model
         gc.collect()
@@ -214,21 +223,24 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd):
     except RuntimeError:
         out, err = capfd.readouterr()
         combined_output = out + err
-        assert ("ValueError: For a sharded model, tensorizer_uri "
-                "should include a string format template like '%04d' "
-                "to be formatted with the rank "
-                "of the shard") in combined_output
+        assert (
+            "ValueError: For a sharded model, tensorizer_uri "
+            "should include a string format template like '%04d' "
+            "to be formatted with the rank "
+            "of the shard"
+        ) in combined_output
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
 def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
-        vllm_runner, tmp_path):
+    vllm_runner, tmp_path
+):
     model_ref = "EleutherAI/pythia-1.4b"
     # record outputs from un-sharded un-tensorized model
     with vllm_runner(
-            model_ref,
-            disable_custom_all_reduce=True,
-            enforce_eager=True,
+        model_ref,
+        disable_custom_all_reduce=True,
+        enforce_eager=True,
     ) as base_model:
         outputs = base_model.generate(prompts, sampling_params)
 
@@ -254,21 +266,22 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
     assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
 
     with vllm_runner(
-            model_ref,
-            tensor_parallel_size=2,
-            load_format="tensorizer",
-            disable_custom_all_reduce=True,
-            enforce_eager=True,
-            model_loader_extra_config=tensorizer_config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(
-            prompts, sampling_params)
+        model_ref,
+        tensor_parallel_size=2,
+        load_format="tensorizer",
+        disable_custom_all_reduce=True,
+        enforce_eager=True,
+        model_loader_extra_config=tensorizer_config,
+    ) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
 
     assert outputs == deserialized_outputs
 
 
 @pytest.mark.flaky(reruns=3)
-def test_vllm_tensorized_model_has_same_outputs(model_ref, vllm_runner,
-                                                tmp_path, model_path):
+def test_vllm_tensorized_model_has_same_outputs(
+    model_ref, vllm_runner, tmp_path, model_path
+):
     gc.collect()
     torch.cuda.empty_cache()
     config = TensorizerConfig(tensorizer_uri=str(model_path))
@@ -280,11 +293,10 @@ def test_vllm_tensorized_model_has_same_outputs(model_ref, vllm_runner,
     tensorize_vllm_model(args, config)
     assert is_vllm_tensorized(config)
 
-    with vllm_runner(model_ref,
-                     load_format="tensorizer",
-                     model_loader_extra_config=config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(
-            prompts, sampling_params)
+    with vllm_runner(
+        model_ref, load_format="tensorizer", model_loader_extra_config=config
+    ) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
         # noqa: E501
 
         assert outputs == deserialized_outputs
@@ -314,15 +326,17 @@ def test_load_with_just_model_tensors(just_serialize_model_tensors, model_ref):
 
 
 def test_assert_serialization_kwargs_passed_to_tensor_serializer(tmp_path):
-
     serialization_params = {
         "limit_cpu_concurrency": 2,
     }
     model_ref = "facebook/opt-125m"
     model_path = tmp_path / (model_ref + ".tensors")
-    config = TensorizerConfig(tensorizer_uri=str(model_path),
-                              serialization_kwargs=serialization_params)
-    llm = LLM(model=model_ref, )
+    config = TensorizerConfig(
+        tensorizer_uri=str(model_path), serialization_kwargs=serialization_params
+    )
+    llm = LLM(
+        model=model_ref,
+    )
 
     def serialization_test(self, *args, **kwargs):
         # This is performed in the ephemeral worker process, so monkey-patching
@@ -340,10 +354,13 @@ def test_assert_serialization_kwargs_passed_to_tensor_serializer(tmp_path):
             return original(self, *args, **kwargs)
 
         tensorizer.serialization.TensorSerializer.__init__ = (
-            tensorizer_serializer_wrapper)
+            tensorizer_serializer_wrapper
+        )
 
         tensorizer_config = TensorizerConfig(**kwargs["tensorizer_config"])
-        self.save_tensorized_model(tensorizer_config=tensorizer_config, )
+        self.save_tensorized_model(
+            tensorizer_config=tensorizer_config,
+        )
         return to_compare | original_dict == to_compare
 
     kwargs = {"tensorizer_config": config.to_serializable()}
@@ -351,9 +368,7 @@ def test_assert_serialization_kwargs_passed_to_tensor_serializer(tmp_path):
     assert assert_from_collective_rpc(llm, serialization_test, kwargs)
 
 
-def test_assert_deserialization_kwargs_passed_to_tensor_deserializer(
-        tmp_path, capfd):
-
+def test_assert_deserialization_kwargs_passed_to_tensor_deserializer(tmp_path, capfd):
     deserialization_kwargs = {
         "num_readers": "bar",  # illegal value
     }
@@ -364,8 +379,9 @@ def test_assert_deserialization_kwargs_passed_to_tensor_deserializer(
 
     model_ref = "facebook/opt-125m"
     model_path = tmp_path / (model_ref + ".tensors")
-    config = TensorizerConfig(tensorizer_uri=str(model_path),
-                              serialization_kwargs=serialization_params)
+    config = TensorizerConfig(
+        tensorizer_uri=str(model_path), serialization_kwargs=serialization_params
+    )
 
     args = EngineArgs(model=model_ref)
     tensorize_vllm_model(args, config)
@@ -393,7 +409,6 @@ def test_assert_deserialization_kwargs_passed_to_tensor_deserializer(
 
 
 def test_assert_stream_kwargs_passed_to_tensor_deserializer(tmp_path, capfd):
-
     deserialization_kwargs = {
         "num_readers": 1,
     }
@@ -404,8 +419,9 @@ def test_assert_stream_kwargs_passed_to_tensor_deserializer(tmp_path, capfd):
 
     model_ref = "facebook/opt-125m"
     model_path = tmp_path / (model_ref + ".tensors")
-    config = TensorizerConfig(tensorizer_uri=str(model_path),
-                              serialization_kwargs=serialization_params)
+    config = TensorizerConfig(
+        tensorizer_uri=str(model_path), serialization_kwargs=serialization_params
+    )
 
     args = EngineArgs(model=model_ref)
     tensorize_vllm_model(args, config)
@@ -441,16 +457,24 @@ async def test_serialize_and_serve_entrypoints(tmp_path):
 
     suffix = "test"
     try:
-        result = subprocess.run([
-            sys.executable,
-            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
-            model_ref, "serialize", "--serialized-directory",
-            str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
-            '{"limit_cpu_concurrency": 4}'
-        ],
-                                check=True,
-                                capture_output=True,
-                                text=True)
+        result = subprocess.run(
+            [
+                sys.executable,
+                f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
+                "--model",
+                model_ref,
+                "serialize",
+                "--serialized-directory",
+                str(tmp_path),
+                "--suffix",
+                suffix,
+                "--serialization-kwargs",
+                '{"limit_cpu_concurrency": 4}',
+            ],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
     except subprocess.CalledProcessError as e:
         print("Tensorizing failed.")
         print("STDOUT:\n", e.stdout)
@@ -470,14 +494,20 @@ async def test_serialize_and_serve_entrypoints(tmp_path):
         "deserialization_kwargs": {
             "verify_hash": True,
             "num_readers": 8,
-        }
+        },
     }
 
     cmd = [
-        "-m", "vllm.entrypoints.cli.main", "serve", "--host", "localhost",
-        "--load-format", "tensorizer", model_ref,
+        "-m",
+        "vllm.entrypoints.cli.main",
+        "serve",
+        "--host",
+        "localhost",
+        "--load-format",
+        "tensorizer",
+        model_ref,
         "--model-loader-extra-config",
-        json.dumps(model_loader_extra_config, indent=2)
+        json.dumps(model_loader_extra_config, indent=2),
     ]
 
     proc = await asyncio.create_subprocess_exec(
@@ -500,17 +530,16 @@ async def test_serialize_and_serve_entrypoints(tmp_path):
 
 
 @pytest.mark.parametrize("illegal_value", BLACKLISTED_TENSORIZER_ARGS)
-def test_blacklisted_parameter_for_loading(tmp_path, vllm_runner, capfd,
-                                           illegal_value):
-
+def test_blacklisted_parameter_for_loading(tmp_path, vllm_runner, capfd, illegal_value):
     serialization_params = {
         "limit_cpu_concurrency": 2,
     }
 
     model_ref = "facebook/opt-125m"
     model_path = tmp_path / (model_ref + ".tensors")
-    config = TensorizerConfig(tensorizer_uri=str(model_path),
-                              serialization_kwargs=serialization_params)
+    config = TensorizerConfig(
+        tensorizer_uri=str(model_path), serialization_kwargs=serialization_params
+    )
 
     args = EngineArgs(model=model_ref)
     tensorize_vllm_model(args, config)
@@ -526,5 +555,6 @@ def test_blacklisted_parameter_for_loading(tmp_path, vllm_runner, capfd,
     except RuntimeError:
         out, err = capfd.readouterr()
         combined_output = out + err
-        assert (f"ValueError: {illegal_value} is not an allowed "
-                f"Tensorizer argument.") in combined_output
+        assert (
+            f"ValueError: {illegal_value} is not an allowed Tensorizer argument."
+        ) in combined_output
diff --git a/tests/model_executor/model_loader/test_registry.py b/tests/model_executor/model_loader/test_registry.py
index 639ee6db9270fd17b2cfe9758267b19535425cc9..020988ccac13c20a52ea627a6642067f3235dc5b 100644
--- a/tests/model_executor/model_loader/test_registry.py
+++ b/tests/model_executor/model_loader/test_registry.py
@@ -6,22 +6,19 @@ from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
-from vllm.model_executor.model_loader import (get_model_loader,
-                                              register_model_loader)
+from vllm.model_executor.model_loader import get_model_loader, register_model_loader
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 
 
 @register_model_loader("custom_load_format")
 class CustomModelLoader(BaseModelLoader):
-
     def __init__(self, load_config: LoadConfig) -> None:
         super().__init__(load_config)
 
     def download_model(self, model_config: ModelConfig) -> None:
         pass
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         pass
 
 
diff --git a/tests/model_executor/model_loader/test_sharded_state_loader.py b/tests/model_executor/model_loader/test_sharded_state_loader.py
index 785169f5d22e74d67ac6f079d4a24c0e2bad43c5..5bb841bf2fa0e2a7e7bf9be751cd87ba6d576cfa 100644
--- a/tests/model_executor/model_loader/test_sharded_state_loader.py
+++ b/tests/model_executor/model_loader/test_sharded_state_loader.py
@@ -35,11 +35,13 @@ def test_filter_subtensors():
         "b": torch.empty((2, 4)),
         "c": torch.empty((2, 4, 8)),
     }
-    state_dict.update({
-        "x": state_dict["b"],
-        "y": state_dict["c"][1, 2, :],
-        "z": state_dict["c"][1, :, 4],
-    })
+    state_dict.update(
+        {
+            "x": state_dict["b"],
+            "y": state_dict["c"][1, 2, :],
+            "z": state_dict["c"][1, :, 4],
+        }
+    )
     filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
     assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
     for key, tensor in filtered_state_dict.items():
@@ -49,8 +51,9 @@ def test_filter_subtensors():
 
 @pytest.fixture(scope="module")
 def llama_3p2_1b_files():
-    input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
-                                  ignore_patterns=["*.bin*", "original/*"])
+    input_dir = snapshot_download(
+        "meta-llama/Llama-3.2-1B-Instruct", ignore_patterns=["*.bin*", "original/*"]
+    )
 
     yield input_dir
 
@@ -63,8 +66,7 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     if is_v1_engine:
         # For V1 engine, we need to use engine_core.save_sharded_state
         print("Using V1 engine save path")
-        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
-            path=output_dir)
+        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)
     else:
         # For V0 engine
         print("Using V0 engine save path")
@@ -74,8 +76,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     # Copy metadata files to output directory
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            shutil.copytree(os.path.join(input_dir, file),
-                            os.path.join(output_dir, file))
+            shutil.copytree(
+                os.path.join(input_dir, file), os.path.join(output_dir, file)
+            )
         elif not any(fnmatch.fnmatch(file, ext) for ext in weights_patterns):
             shutil.copy(os.path.join(input_dir, file), output_dir)
 
@@ -90,37 +93,42 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
 
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
-def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
-                              llama_3p2_1b_files):
+def test_sharded_state_loader(
+    enable_lora, tp_size, num_gpus_available, llama_3p2_1b_files
+):
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
-    weights_patterns = ("*.safetensors", )
+    weights_patterns = ("*.safetensors",)
     gpu_memory_utilization = 0.8
     input_dir = llama_3p2_1b_files
     ctx = mp.get_context("spawn")
 
     # Run in separate processes for memory & CUDA isolation
     with TemporaryDirectory() as output_dir:
-        p = ctx.Process(target=_run_writer,
-                        args=(input_dir, output_dir, weights_patterns),
-                        kwargs=dict(
-                            tensor_parallel_size=tp_size,
-                            gpu_memory_utilization=gpu_memory_utilization,
-                            enforce_eager=True,
-                        ))
+        p = ctx.Process(
+            target=_run_writer,
+            args=(input_dir, output_dir, weights_patterns),
+            kwargs=dict(
+                tensor_parallel_size=tp_size,
+                gpu_memory_utilization=gpu_memory_utilization,
+                enforce_eager=True,
+            ),
+        )
         p.start()
         p.join()
 
         queue = ctx.Queue()
 
-        p = ctx.Process(target=_run_generate,
-                        args=(input_dir, queue),
-                        kwargs=dict(
-                            enable_lora=enable_lora,
-                            gpu_memory_utilization=gpu_memory_utilization,
-                            tensor_parallel_size=tp_size,
-                        ))
+        p = ctx.Process(
+            target=_run_generate,
+            args=(input_dir, queue),
+            kwargs=dict(
+                enable_lora=enable_lora,
+                gpu_memory_utilization=gpu_memory_utilization,
+                tensor_parallel_size=tp_size,
+            ),
+        )
         p.start()
         # Call queue.get() before p.join() to prevent deadlock:
         # If p.join() is called before queue.get() and the queue is full,
@@ -134,14 +142,16 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
 
         queue = ctx.Queue()
 
-        p = ctx.Process(target=_run_generate,
-                        args=(output_dir, queue),
-                        kwargs=dict(
-                            enable_lora=enable_lora,
-                            gpu_memory_utilization=gpu_memory_utilization,
-                            tensor_parallel_size=tp_size,
-                            load_format="sharded_state",
-                        ))
+        p = ctx.Process(
+            target=_run_generate,
+            args=(output_dir, queue),
+            kwargs=dict(
+                enable_lora=enable_lora,
+                gpu_memory_utilization=gpu_memory_utilization,
+                tensor_parallel_size=tp_size,
+                load_format="sharded_state",
+            ),
+        )
         p.start()
         # Call queue.get() before p.join() to prevent deadlock:
         # If p.join() is called before queue.get() and the queue is full,
diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ab75933ee31e7c4ab1d4a71c0e4b1a3c6637bb2
--- /dev/null
+++ b/tests/model_executor/test_eagle_quantization.py
@@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import Mock, patch
+
+import pytest
+import torch
+
+from vllm.config import LoadConfig, ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.model_executor.models.utils import get_draft_quant_config
+from vllm.platforms import current_platform
+
+DEVICES = (
+    [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+    if current_platform.is_cuda_alike()
+    else ["cpu"]
+)
+
+
+def test_get_draft_quant_config_with_draft_model():
+    mock_draft_model_config = Mock(spec=ModelConfig)
+    mock_load_config = Mock(spec=LoadConfig)
+    mock_speculative_config = Mock(spec=SpeculativeConfig)
+    mock_speculative_config.draft_model_config = mock_draft_model_config
+
+    mock_vllm_config = Mock(spec=VllmConfig)
+    mock_vllm_config.speculative_config = mock_speculative_config
+    mock_vllm_config.load_config = mock_load_config
+
+    mock_quant_config = Mock()
+    with patch.object(
+        VllmConfig, "get_quantization_config", return_value=mock_quant_config
+    ):
+        result = get_draft_quant_config(mock_vllm_config)
+
+        # Verify the function calls get_quantization_config with draft model config
+        VllmConfig.get_quantization_config.assert_called_once_with(
+            mock_draft_model_config, mock_load_config
+        )
+        assert result == mock_quant_config
+
+
+def test_get_draft_quant_config_without_draft_model():
+    mock_speculative_config = Mock(spec=SpeculativeConfig)
+    mock_speculative_config.draft_model_config = None
+
+    mock_vllm_config = Mock(spec=VllmConfig)
+    mock_vllm_config.speculative_config = mock_speculative_config
+    mock_vllm_config.load_config = Mock(spec=LoadConfig)
+
+    result = get_draft_quant_config(mock_vllm_config)
+
+    assert result is None
+
+
+@torch.inference_mode()
+@pytest.mark.parametrize("device", DEVICES)
+def test_fc_layer_quant_config_usage(dist_init, device) -> None:
+    import torch
+
+    from vllm.model_executor.layers.linear import ReplicatedLinear
+
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
+    torch.set_default_device(device)
+
+    input_size = 256
+    output_size = 128
+
+    fc_no_quant = ReplicatedLinear(
+        input_size=input_size,
+        output_size=output_size,
+        bias=False,
+        params_dtype=torch.float16,
+        quant_config=None,
+        prefix="fc",
+    )
+
+    assert fc_no_quant.quant_config is None
+    assert fc_no_quant.input_size == input_size
+    assert fc_no_quant.output_size == output_size
+
+    mock_quant_config = Mock()
+    fc_with_quant = ReplicatedLinear(
+        input_size=input_size,
+        output_size=output_size,
+        bias=False,
+        params_dtype=torch.float16,
+        quant_config=mock_quant_config,
+        prefix="fc",
+    )
+
+    assert fc_with_quant.quant_config == mock_quant_config
+
+    # Check forward pass
+    x = torch.randn(2, input_size, dtype=torch.float16)
+    output, _ = fc_no_quant(x)
+    assert output.shape == (2, output_size)
+
+
+def test_kv_cache_scale_name_handling():
+    # Mock a quant config that supports cache scales
+    mock_quant_config = Mock()
+    mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale")
+
+    # Condition check in load_weights
+    name = "layers.0.self_attn.k_proj.weight"
+    scale_name = mock_quant_config.get_cache_scale(name)
+
+    # Check if get_cache_scale is called and returns expected value
+    mock_quant_config.get_cache_scale.assert_called_once_with(name)
+    assert scale_name == "layers.0.self_attn.kv_scale"
+
+
+def test_kv_cache_scale_name_no_scale():
+    # Mock a quant config that returns None for get_cache_scale
+    mock_quant_config = Mock()
+    mock_quant_config.get_cache_scale = Mock(return_value=None)
+
+    name = "layers.0.mlp.gate_proj.weight"
+    scale_name = mock_quant_config.get_cache_scale(name)
+
+    # Should return None for weights that don't have cache scales
+    assert scale_name is None
+
+
+def test_maybe_remap_kv_scale_name():
+    from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name
+
+    params_dict = {
+        "layers.0.self_attn.kv_scale": Mock(),
+        "layers.1.self_attn.kv_scale": Mock(),
+    }
+
+    name = "layers.0.self_attn.some_scale"
+    remapped = maybe_remap_kv_scale_name(name, params_dict)
+
+    assert remapped in params_dict or remapped == name or remapped is None
+
+
+def test_load_weights_kv_scale_handling():
+    kv_scale_param = Mock()
+    kv_scale_param.weight_loader = Mock()
+
+    params_dict = {
+        "layers.0.self_attn.kv_scale": kv_scale_param,
+    }
+
+    mock_quant_config = Mock()
+    mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale")
+
+    # Load_weights logic for KV cache scales
+    name = "layers.0.self_attn.k_proj.weight"
+    loaded_weight_tensor = torch.tensor([1.0, 2.0])
+
+    if mock_quant_config is not None:
+        scale_name = mock_quant_config.get_cache_scale(name)
+        if scale_name:
+            param = params_dict[scale_name]
+            assert param is kv_scale_param
+            weight_to_load = (
+                loaded_weight_tensor
+                if loaded_weight_tensor.dim() == 0
+                else loaded_weight_tensor[0]
+            )
+
+            assert scale_name == "layers.0.self_attn.kv_scale"
+            assert weight_to_load == loaded_weight_tensor[0]
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index 92ce10a9efc0b8274736f0020450e975feb6cc52..9121284de85b7cc6f48cca94418d95108f6c3afe 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -1,24 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 import torch
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.activation import (GeluAndMul,
-                                                   ReLUSquaredActivation,
-                                                   SiluAndMul)
-from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
-                                                            vllm_topk_softmax)
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    is_rocm_aiter_moe_enabled)
-from vllm.model_executor.layers.layernorm import (RMSNorm,
-                                                  dispatch_rocm_rmsnorm_func,
-                                                  fused_add_rms_norm, rms_norm)
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
+from vllm.model_executor.layers.activation import (
+    GeluAndMul,
+    ReLUSquaredActivation,
+    SiluAndMul,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    dispatch_topk_func,
+    vllm_topk_softmax,
+)
+from vllm.model_executor.layers.layernorm import (
+    RMSNorm,
+    dispatch_rocm_rmsnorm_func,
+    fused_add_rms_norm,
+    rms_norm,
+)
 from vllm.platforms import current_platform
 
 RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
@@ -31,50 +34,58 @@ class Relu3(ReLUSquaredActivation):
 
 
 @pytest.mark.parametrize(
-    "env, torch_level, use_inductor, ops_enabled, default_on",
+    "env, compilation_mode, backend, ops_enabled, default_on",
     [
         # Default values based on compile level
         # - All by default (no Inductor compilation)
-        (None, 0, False, [True] * 4, True),
-        (None, 1, True, [True] * 4, True),
-        (None, 2, False, [True] * 4, True),
+        (None, 0, "eager", [True] * 4, True),
+        (None, 1, "eager", [True] * 4, True),
+        (None, 2, "eager", [True] * 4, True),
+        (None, 3, "eager", [True] * 4, True),
         # - None by default (with Inductor)
-        (None, 3, True, [False] * 4, False),
-        (None, 4, True, [False] * 4, False),
-        # - All by default (without Inductor)
-        (None, 3, False, [True] * 4, True),
-        (None, 4, False, [True] * 4, True),
+        (None, 0, "inductor", [True] * 4, True),
+        # - None by default (with Inductor)
+        (None, 1, "inductor", [False] * 4, False),
+        (None, 2, "inductor", [False] * 4, False),
+        (None, 3, "inductor", [False] * 4, False),
         # Explicitly enabling/disabling
         #
         # Default: all
         #
         # All but SiluAndMul
-        ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
+        ("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True),
         # Only ReLU3
-        ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
+        ("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False),
         # All but SiluAndMul
-        ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
+        ("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True),
         # All but ReLU3 (even if ReLU2 is on)
-        ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True),
+        ("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True),
         # RMSNorm and SiluAndMul
-        ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
+        ("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False),
         # All but RMSNorm
-        ("-rms_norm", 3, False, [0, 1, 1, 1], True),
+        ("-rms_norm", 3, "eager", [0, 1, 1, 1], True),
         #
         # Default: none
         #
         # Only ReLU3
-        ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
+        ("none,+relu3", 3, "inductor", [0, 0, 0, 1], False),
         # All but RMSNorm
-        ("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
-    ])
-def test_enabled_ops(env: Optional[str], torch_level: int, use_inductor: bool,
-                     ops_enabled: list[int], default_on: bool):
-    custom_ops = env.split(',') if env else []
+        ("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True),
+    ],
+)
+def test_enabled_ops(
+    env: str | None,
+    compilation_mode: int,
+    backend: str,
+    ops_enabled: list[int],
+    default_on: bool,
+):
+    custom_ops = env.split(",") if env else []
     vllm_config = VllmConfig(
-        compilation_config=CompilationConfig(use_inductor=bool(use_inductor),
-                                             level=torch_level,
-                                             custom_ops=custom_ops))
+        compilation_config=CompilationConfig(
+            backend=backend, mode=compilation_mode, custom_ops=custom_ops
+        )
+    )
     with set_current_vllm_config(vllm_config):
         assert CustomOp.default_on() == default_on
 
@@ -102,77 +113,50 @@ def test_enabled_ops(env: Optional[str], torch_level: int, use_inductor: bool,
 
 
 @pytest.mark.parametrize(
-    "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"])
+    "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"]
+)
 def test_enabled_ops_invalid(env: str):
     with pytest.raises(Exception):  # noqa
-        vllm_config = VllmConfig(compilation_config=CompilationConfig(
-            custom_ops=env.split(",")))
+        vllm_config = VllmConfig(
+            compilation_config=CompilationConfig(custom_ops=env.split(","))
+        )
         with set_current_vllm_config(vllm_config):
             RMSNorm(1024).enabled()
 
 
-@pytest.mark.skipif(
-    not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
-    reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
-@pytest.mark.parametrize("use_cutlass", [True, False])
-@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
-def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
-                                  use_rocm_aiter_gemm_w8a8_blockscale: str,
-                                  monkeypatch):
-
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
-                       use_rocm_aiter_gemm_w8a8_blockscale)
-
-    use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
-        int(use_rocm_aiter_gemm_w8a8_blockscale)))
-    block_scale_func = dispatch_w8a8_blockscale_func(
-        use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
-    if use_cutlass:
-        assert block_scale_func == cutlass_scaled_mm
-    elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
-            use_rocm_aiter_gemm_w8a8_blockscale):
-        assert block_scale_func == (
-            torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
-    else:
-        assert block_scale_func == w8a8_block_fp8_matmul
-
-
-@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    topk_func = dispatch_topk_func()
-    is_rocm_aiter_moe_enabled.cache_clear()
-    if current_platform.is_rocm() and int(use_rocm_aiter):
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            rocm_aiter_topk_softmax)
-        assert topk_func == rocm_aiter_topk_softmax
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_topk_dispatch(use_rocm_aiter: bool):
+    topk_func = dispatch_topk_func(use_rocm_aiter)
+
+    if current_platform.is_rocm() and use_rocm_aiter:
+        assert topk_func == rocm_aiter_ops.topk_softmax
     else:
         assert topk_func == vllm_topk_softmax
 
 
 @pytest.mark.parametrize("add_residual", [True, False])
-@pytest.mark.parametrize("dtype",
-                         [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="AITER is a feature exclusive for ROCm")
-def test_rms_norm_dispatch(add_residual: bool, dtype: torch.dtype,
-                           use_rocm_aiter: str, use_rocm_aiter_norm: str,
-                           monkeypatch):
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
-    rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype)
-
-    should_use_rocm_aiter = current_platform.is_rocm() and int(use_rocm_aiter) \
-        and int(use_rocm_aiter_norm) and dtype in RMS_NORM_SUPPORTED_DTYPES
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("use_rocm_aiter", [True, False])
+@pytest.mark.skipif(
+    not current_platform.is_rocm(), reason="AITER is a feature exclusive for ROCm"
+)
+def test_rms_norm_dispatch(
+    add_residual: bool, dtype: torch.dtype, use_rocm_aiter: bool
+):
+    rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype, use_rocm_aiter)
+
+    should_use_rocm_aiter = (
+        current_platform.is_rocm()
+        and use_rocm_aiter
+        and dtype in RMS_NORM_SUPPORTED_DTYPES
+    )
 
     if add_residual and should_use_rocm_aiter:
-        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+        assert rms_norm_func == rocm_aiter_ops.rms_norm2d_with_add
     elif should_use_rocm_aiter:
-        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rms_norm
+        assert rms_norm_func == rocm_aiter_ops.rms_norm
     elif add_residual:
         assert rms_norm_func == fused_add_rms_norm
     else:
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index c7b15c6ae118621e4f08a137f267edaed12c91d0..489ac1e6475b9de39c65f37441109b995ac5b466 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -5,8 +5,12 @@ import os
 
 import pytest
 
-from vllm.model_executor.layers.pooler import (CLSPool, DispatchPooler,
-                                               MeanPool, PoolingType)
+from vllm.model_executor.layers.pooler import (
+    CLSPool,
+    DispatchPooler,
+    MeanPool,
+    PoolingType,
+)
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -15,25 +19,28 @@ MAX_MODEL_LEN = 128
 MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5")
 REVISION = os.environ.get("REVISION", "main")
 
-MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
-                                    "intfloat/multilingual-e5-base")
+MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "intfloat/multilingual-e5-base")
 REVISION_ROBERTA = os.environ.get("REVISION", "main")
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 def test_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
     # to use apply_model
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_name=MODEL_NAME,
-                     revision=REVISION,
-                     dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as vllm_model:
-        output = vllm_model.embed("Write a short story about a robot that"
-                                  " dreams for the first time.\n")
+    with vllm_runner(
+        model_name=MODEL_NAME,
+        revision=REVISION,
+        dtype="float16",
+        max_model_len=MAX_MODEL_LEN,
+    ) as vllm_model:
+        output = vllm_model.embed(
+            "Write a short story about a robot that dreams for the first time.\n"
+        )
 
         model_config = vllm_model.llm.llm_engine.model_config
         model_tokenizer = vllm_model.llm.llm_engine.tokenizer
@@ -60,20 +67,24 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
         assert output
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
     # to use apply_model
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_name=MODEL_NAME_ROBERTA,
-                     revision=REVISION_ROBERTA,
-                     dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as vllm_model:
-        output = vllm_model.embed("Write a short story about a robot that"
-                                  " dreams for the first time.\n")
+    with vllm_runner(
+        model_name=MODEL_NAME_ROBERTA,
+        revision=REVISION_ROBERTA,
+        dtype="float16",
+        max_model_len=MAX_MODEL_LEN,
+    ) as vllm_model:
+        output = vllm_model.embed(
+            "Write a short story about a robot that dreams for the first time.\n"
+        )
 
         model_config = vllm_model.llm.llm_engine.model_config
         model_tokenizer = vllm_model.llm.llm_engine.tokenizer
@@ -93,16 +104,16 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
             assert isinstance(pooler := model.pooler, DispatchPooler)
-            assert isinstance(pooler.poolers_by_task["embed"].pooling,
-                              MeanPool)
+            assert isinstance(pooler.poolers_by_task["embed"].pooling, MeanPool)
 
         vllm_model.apply_model(check_model)
 
         assert output
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test loading roberta-base model with no lm_head.
@@ -110,11 +121,12 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     # to use apply_model
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     model_name = "FacebookAI/roberta-base"
-    with vllm_runner(model_name=model_name,
-                     dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as vllm_model:
-        output = vllm_model.embed("Write a short story about a robot that"
-                                  " dreams for the first time.\n")
+    with vllm_runner(
+        model_name=model_name, dtype="float16", max_model_len=MAX_MODEL_LEN
+    ) as vllm_model:
+        output = vllm_model.embed(
+            "Write a short story about a robot that dreams for the first time.\n"
+        )
 
         assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
 
diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index df625b8d60049e37a6b7a6c827bb595661525fda..6dc120ddbac9a8f90d425ecfdd2949fb6723dfed 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -9,23 +9,24 @@ import pytest
 from huggingface_hub.utils import LocalEntryNotFoundError
 
 from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf, enable_hf_transfer)
+    download_weights_from_hf,
+    enable_hf_transfer,
+)
 
 
 def test_hf_transfer_auto_activation():
     if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
         # in case it is already set, we can't test the auto activation
-        pytest.skip(
-            "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
+        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
     enable_hf_transfer()
     try:
         # enable hf hub transfer if available
         import hf_transfer  # type: ignore # noqa
+
         HF_TRANSFER_ACTIVE = True
     except ImportError:
         HF_TRANSFER_ACTIVE = False
-    assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
-            HF_TRANSFER_ACTIVE)
+    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
 
 
 def test_download_weights_from_hf():
@@ -34,22 +35,30 @@ def test_download_weights_from_hf():
         # if offline is set and model is not cached
         huggingface_hub.constants.HF_HUB_OFFLINE = True
         with pytest.raises(LocalEntryNotFoundError):
-            download_weights_from_hf("facebook/opt-125m",
-                                     allow_patterns=["*.safetensors", "*.bin"],
-                                     cache_dir=tmpdir)
+            download_weights_from_hf(
+                "facebook/opt-125m",
+                allow_patterns=["*.safetensors", "*.bin"],
+                cache_dir=tmpdir,
+            )
 
         # download the model
         huggingface_hub.constants.HF_HUB_OFFLINE = False
-        download_weights_from_hf("facebook/opt-125m",
-                                 allow_patterns=["*.safetensors", "*.bin"],
-                                 cache_dir=tmpdir)
+        download_weights_from_hf(
+            "facebook/opt-125m",
+            allow_patterns=["*.safetensors", "*.bin"],
+            cache_dir=tmpdir,
+        )
 
         # now it should work offline
         huggingface_hub.constants.HF_HUB_OFFLINE = True
-        assert download_weights_from_hf(
-            "facebook/opt-125m",
-            allow_patterns=["*.safetensors", "*.bin"],
-            cache_dir=tmpdir) is not None
+        assert (
+            download_weights_from_hf(
+                "facebook/opt-125m",
+                allow_patterns=["*.safetensors", "*.bin"],
+                cache_dir=tmpdir,
+            )
+            is not None
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index 39c4dd735b725ae9ed70c137fa61388bb043b9a6..0cdb7c9a603f2761e19aa5accbc25c5b8bbffd50 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 import torch
@@ -39,7 +38,11 @@ AITER_MODEL_LIST = [
     [
         pytest.param(
             "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.slow_test],
+            marks=[
+                pytest.mark.core_model,
+                pytest.mark.slow_test,
+                pytest.mark.cpu_model,
+            ],
         ),
         pytest.param(
             "openai-community/gpt2",  # gpt2
@@ -51,10 +54,15 @@ AITER_MODEL_LIST = [
         pytest.param(
             "google/gemma-1.1-2b-it",  # gemma
             marks=[
-                pytest.mark.core_model, pytest.mark.cpu_model,
-                pytest.mark.slow_test
+                pytest.mark.core_model,
+                pytest.mark.cpu_model,
+                pytest.mark.slow_test,
             ],
         ),
+        pytest.param(
+            "google/gemma-2-2b-it",  # test hybrid attention
+            marks=[pytest.mark.cpu_model],
+        ),
         pytest.param(
             "zai-org/chatglm3-6b",  # chatglm (text-only)
         ),
@@ -64,9 +72,7 @@ AITER_MODEL_LIST = [
         ),
         pytest.param(
             "openbmb/MiniCPM3-4B",
-            # fused_moe not supported on CPU
-            marks=[pytest.mark.core_model,
-                   large_gpu_mark(min_gb=32)],
+            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
         ),
         pytest.param(
             "facebook/opt-125m",  # opt
@@ -82,8 +88,9 @@ AITER_MODEL_LIST = [
         pytest.param(
             "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
             marks=[
-                pytest.mark.core_model, pytest.mark.cpu_model,
-                pytest.mark.slow_test
+                pytest.mark.core_model,
+                pytest.mark.cpu_model,
+                pytest.mark.slow_test,
             ],
         ),
         pytest.param(
@@ -93,23 +100,28 @@ AITER_MODEL_LIST = [
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model],
-        ),
-        pytest.param(
-            "allenai/OLMoE-1B-7B-0924-Instruct",
-            marks=[pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
-        pytest.param("swiss-ai/Apertus-8B-2509"),  # apertus
-    ])
+        pytest.param("swiss-ai/Apertus-8B-Instruct-2509"),  # apertus
+    ],
+)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
 @pytest.mark.parametrize("use_prompt_embeds", [True, False])
-def test_models(hf_runner, vllm_runner, example_prompts, model: str,
-                max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
-                use_prompt_embeds: bool, monkeypatch) -> None:
-
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+    use_rocm_aiter: bool,
+    use_prompt_embeds: bool,
+    monkeypatch,
+) -> None:
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
@@ -125,34 +137,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
 
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-        prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
-                                                       else None)
+        prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
 
         prompt_token_ids = []
         for prompt in example_prompts:
-            token_ids = hf_model.tokenizer(prompt,
-                                           return_tensors="pt").input_ids.to(
-                                               hf_model.model.device)
+            token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
+                hf_model.model.device
+            )
             prompt_token_ids.append(token_ids)
             if prompt_embeds is not None:
-                prompt_embeds.append(hf_model.model.get_input_embeddings()(
-                    token_ids).squeeze(0))
+                prompt_embeds.append(
+                    hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
+                )
 
     with vllm_runner(
-            model,
-            tokenizer_name=model_info.tokenizer or model,
-            tokenizer_mode=model_info.tokenizer_mode,
-            trust_remote_code=model_info.trust_remote_code,
-            max_num_seqs=2,
-            enable_prompt_embeds=use_prompt_embeds,
+        model,
+        tokenizer_name=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        max_num_seqs=2,
+        enable_prompt_embeds=use_prompt_embeds,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
         if prompt_embeds is not None:
             vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
-                prompt_embeds, max_tokens, num_logprobs)
+                prompt_embeds, max_tokens, num_logprobs
+            )
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
diff --git a/tests/models/language/generation/test_gemma.py b/tests/models/language/generation/test_gemma.py
index 60a4bc14be8826cc5bdc142e15172f03071711aa..246b893be315d18488567ec1436e01d9b1348d1c 100644
--- a/tests/models/language/generation/test_gemma.py
+++ b/tests/models/language/generation/test_gemma.py
@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
     with monkeypatch.context() as m:
         m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
         with vllm_runner(
-                model,
-                load_format="dummy",
+            model,
+            load_format="dummy",
         ) as llm:
             if model == "google/gemma-3-4b-it":
                 normalizers = llm.llm.collective_rpc(
-                    lambda self: self.model_runner.model.language_model.model.
-                    normalizer.cpu().item())
+                    lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()  # noqa: E501
+                )
                 config = llm.llm.llm_engine.model_config.hf_config.text_config
             else:
                 normalizers = llm.llm.collective_rpc(
-                    lambda self: self.model_runner.model.model.normalizer.cpu(
-                    ).item())
+                    lambda self: self.model_runner.model.model.normalizer.cpu().item()
+                )
                 config = llm.llm.llm_engine.model_config.hf_config
             assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py
index 2a39f78a708eed6127ae9800a617a93380261fdc..e569e75ff3a82d82ead60b27918857ac70189a0d 100644
--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
@@ -26,11 +26,13 @@ def test_models(
 ) -> None:
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index e60a86075b8bc7e8f082c7d394fddb4a994893bd..37830093cd3c5a0dc76279e168448c62d1603b21 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
+
 import pytest
 
 from tests.models.registry import HF_EXAMPLE_MODELS
@@ -8,7 +10,7 @@ from tests.utils import multi_gpu_test
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import SamplingParams
 
-from ...utils import check_logprobs_close
+from ...utils import check_logprobs_close, check_outputs_equal
 
 # Mark all tests as hybrid
 pytestmark = pytest.mark.hybrid_model
@@ -17,12 +19,14 @@ pytestmark = pytest.mark.hybrid_model
 # meaning that it will be used in all tests in this file
 # The rest of the models will only be tested by test_models
 
+APC_MULTIPLY_BY = 300
+
 SSM_MODELS = [
     "state-spaces/mamba-130m-hf",
     "tiiuae/falcon-mamba-tiny-dev",
     # mamba2-codestral in transformers is broken pending:
     # https://github.com/huggingface/transformers/pull/40861
-    #"yujiepan/mamba2-codestral-v0.1-tiny-random",
+    # "yujiepan/mamba2-codestral-v0.1-tiny-random",
 ]
 
 HYBRID_MODELS = [
@@ -63,7 +67,6 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-
     try:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
         model_info.check_available_online(on_fail="skip")
@@ -73,11 +76,13 @@ def test_models(
 
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
@@ -107,13 +112,14 @@ def test_batching(
     for_loop_outputs = []
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
         for prompt in example_prompts:
-            single_output, = vllm_model.generate_greedy_logprobs([prompt],
-                                                                 max_tokens,
-                                                                 num_logprobs)
+            (single_output,) = vllm_model.generate_greedy_logprobs(
+                [prompt], max_tokens, num_logprobs
+            )
             for_loop_outputs.append(single_output)
 
         batched_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=for_loop_outputs,
@@ -132,8 +138,8 @@ def test_chunked_prefill_with_parallel_sampling(
     max_tokens: int,
 ) -> None:
     """
-    Tests chunked prefill in conjunction with n > 1. 
-    
+    Tests chunked prefill in conjunction with n > 1.
+
     In this case, prefill is populated with decoding tokens and
     we test that it doesn't fail.
 
@@ -141,16 +147,13 @@ def test_chunked_prefill_with_parallel_sampling(
     decoding steps inside a chunked prefill forward pass
     (where we have both prefill and decode together)
     """
-    sampling_params = SamplingParams(n=3,
-                                     temperature=1,
-                                     seed=0,
-                                     max_tokens=max_tokens)
+    sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
     with vllm_runner(
-            model,
-            enable_chunked_prefill=True,
-            # forces prefill chunks with decoding
-            max_num_batched_tokens=MAX_NUM_SEQS * 3,
-            max_num_seqs=MAX_NUM_SEQS,
+        model,
+        enable_chunked_prefill=True,
+        # forces prefill chunks with decoding
+        max_num_batched_tokens=MAX_NUM_SEQS * 3,
+        max_num_seqs=MAX_NUM_SEQS,
     ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
 
@@ -168,10 +171,8 @@ def test_mamba_cache_cg_padding(
     batch size. If it's not, a torch RuntimeError will be raised because
     tensor dimensions aren't compatible.
     """
-    vllm_config = EngineArgs(model=model,
-                             trust_remote_code=True).create_engine_config()
-    while len(example_prompts) == vllm_config.pad_for_cudagraph(
-            len(example_prompts)):
+    vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
+    while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
         example_prompts.append(example_prompts[0])
 
     try:
@@ -181,7 +182,8 @@ def test_mamba_cache_cg_padding(
         pytest.fail(
             "Couldn't run batch size which is not equal to a Cuda Graph "
             "captured batch size. "
-            "Could be related to mamba cache not padded correctly")
+            "Could be related to mamba cache not padded correctly"
+        )
 
 
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -203,8 +205,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
         with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
             vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
     except ValueError:
-        pytest.fail("Hybrid inner state wasn't cleaned up properly between"
-                    "steps finished requests registered unnecessarily ")
+        pytest.fail(
+            "Hybrid inner state wasn't cleaned up properly between"
+            "steps finished requests registered unnecessarily "
+        )
 
 
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -213,10 +217,10 @@ def test_state_cleanup(
     example_prompts,
     model: str,
 ) -> None:
-    """ 
+    """
     This test is for verifying that the Hybrid state is cleaned up between
     steps.
-    
+
     If it's not cleaned, an error would be expected.
     """
     try:
@@ -224,8 +228,10 @@ def test_state_cleanup(
             for _ in range(10):
                 vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
     except ValueError:
-        pytest.fail("Hybrid inner state wasn't cleaned up between states, "
-                    "could be related to finished_requests_ids")
+        pytest.fail(
+            "Hybrid inner state wasn't cleaned up between states, "
+            "could be related to finished_requests_ids"
+        )
 
 
 @multi_gpu_test(num_gpus=2)
@@ -239,15 +245,19 @@ def test_distributed_correctness(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    with vllm_runner(model, tensor_parallel_size=1,
-                     max_num_seqs=2) as vllm_model:
+    with vllm_runner(
+        model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
+    ) as vllm_model:
         vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-    with vllm_runner(model, tensor_parallel_size=2,
-                     max_num_seqs=2) as vllm_model:
+    with vllm_runner(
+        model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
+    ) as vllm_model:
         vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=vllm_outputs_tp_1,
@@ -269,7 +279,6 @@ def test_full_cuda_graph(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-
     try:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
         model_info.check_available_online(on_fail="skip")
@@ -279,11 +288,13 @@ def test_full_cuda_graph(
 
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
@@ -296,8 +307,9 @@ def test_full_cuda_graph(
 @pytest.mark.parametrize("model", FP32_STATE_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("cache_dtype_param",
-                         ["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
+@pytest.mark.parametrize(
+    "cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
+)
 def test_fp32_cache_state(
     hf_runner,
     vllm_runner,
@@ -308,7 +320,6 @@ def test_fp32_cache_state(
     num_logprobs: int,
     cache_dtype_param: str,
 ) -> None:
-
     try:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
         model_info.check_available_online(on_fail="skip")
@@ -318,13 +329,15 @@ def test_fp32_cache_state(
 
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-    with vllm_runner(model,
-                     max_num_seqs=MAX_NUM_SEQS,
-                     **{cache_dtype_param: "float32"}) as vllm_model:
+    with vllm_runner(
+        model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
+    ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
@@ -332,3 +345,414 @@ def test_fp32_cache_state(
         name_0="hf",
         name_1="vllm",
     )
+
+
+# Helper functions for the APC tests
+def _get_vllm_runner_params(
+    model: str,
+    max_model_len: int,
+    tensor_parallel_size: int = 1,
+):
+    return {
+        "model_name": model,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": False,
+        "max_model_len": max_model_len,
+        "tensor_parallel_size": tensor_parallel_size,
+        "gpu_memory_utilization": 0.4,
+    }
+
+
+def _get_vLLM_output(
+    vllm_runner,
+    kwargs,
+    prompts,
+    max_tokens,
+    num_logprobs,
+    num_repetitions=1,
+    vllm_model=None,
+):
+    outs = []
+    if vllm_model is None:
+        vllm_model = vllm_runner(**kwargs)
+    for _ in range(num_repetitions):
+        if num_logprobs < 0:
+            vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
+        else:
+            vllm_output = vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs
+            )
+        outs.append(vllm_output)
+
+    return outs, vllm_model
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_single_prompt(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts.
+    generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]]
+
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    vllm_outputs_cache_rep, _ = _get_vLLM_output(
+        vllm_runner,
+        vllm_runner_kwargs,
+        generated_prompts,
+        max_tokens,
+        num_logprobs,
+        n_repetitions,
+    )
+
+    for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+        # In the first repetition, the caches are filled
+        # In the second repetition, these caches are reused
+
+        compare_operator(
+            outputs_0_lst=vllm_outputs_no_cache[0],
+            outputs_1_lst=vllm_outputs_cache_itn,
+            name_0="vllm_no_cache",
+            name_1=f"vllm_cache_it_{r_idx + 1}",
+        )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_single_prompt_block_align_alignment(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts. This custom prompt is used, as it causes the most issues
+    generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY]
+
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    with vllm_runner(**vllm_runner_kwargs) as vllm_model:
+        # Retrieve the default mamba state block size
+        mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
+
+    # In case the hybrid model does not have the
+    # "mamba_block_size" assume a fixed constant
+    if mamba_block_size is None:
+        mamba_block_size = 512
+
+    mamba_block_size_multiplier = 10
+    for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
+        vllm_runner_kwargs["max_num_batched_tokens"] = (
+            mamba_block_size_multiplier * mamba_block_size - offsets
+        )
+        vllm_outputs_cache_rep, _ = _get_vLLM_output(
+            vllm_runner,
+            vllm_runner_kwargs,
+            generated_prompts,
+            max_tokens,
+            num_logprobs,
+            n_repetitions,
+        )
+
+        # Check alignment of the output logits when using APC
+        for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+            # In the first repetition, the caches are filled
+            # In the second repetition, these caches are reused
+
+            compare_operator(
+                outputs_0_lst=vllm_outputs_no_cache[0],
+                outputs_1_lst=vllm_outputs_cache_itn,
+                name_0="vllm_no_cache",
+                name_1=f"vllm_cache_it_{r_idx + 1}",
+            )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_multiple_prompts_all_cached_outputs(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts.
+    generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
+
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    vllm_outputs_cache_rep, _ = _get_vLLM_output(
+        vllm_runner,
+        vllm_runner_kwargs,
+        generated_prompts,
+        max_tokens,
+        num_logprobs,
+        n_repetitions,
+    )
+
+    for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+        # In the first repetition, the caches are filled
+        # In the second repetition, these caches are reused
+
+        compare_operator(
+            outputs_0_lst=vllm_outputs_no_cache[0],
+            outputs_1_lst=vllm_outputs_cache_itn,
+            name_0="vllm_no_cache",
+            name_1=f"vllm_cache_it_{r_idx + 1}",
+        )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_multiple_prompts_block_align_alignment(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts. This custom prompt is used, as it causes the most issues
+    prompt_text = "The president of the United States is "
+    prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
+    generated_prompts = [
+        prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets
+    ]
+
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    with vllm_runner(**vllm_runner_kwargs) as vllm_model:
+        # Retrieve the default mamba state block size
+        mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
+
+    # In case the hybrid model does not have the
+    # "mamba_block_size" assume a fixed constant
+    if mamba_block_size is None:
+        mamba_block_size = 512
+
+    mamba_block_size_multiplier = 10
+    for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
+        vllm_runner_kwargs["max_num_batched_tokens"] = (
+            mamba_block_size_multiplier * mamba_block_size - offsets
+        )
+        vllm_outputs_cache_rep, _ = _get_vLLM_output(
+            vllm_runner,
+            vllm_runner_kwargs,
+            generated_prompts,
+            max_tokens,
+            num_logprobs,
+            n_repetitions,
+        )
+
+        # Check alignment of the output logits when using APC
+        for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+            # In the first repetition, the caches are filled
+            # In the second repetition, these caches are reused
+
+            compare_operator(
+                outputs_0_lst=vllm_outputs_no_cache[0],
+                outputs_1_lst=vllm_outputs_cache_itn,
+                name_0="vllm_no_cache",
+                name_1=f"vllm_cache_it_{r_idx + 1}",
+            )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_multiple_prompts_partial_cached_outputs(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts.
+    generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
+
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    # Cache only part of all the prompts
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
+    )
+
+    compare_operator(
+        outputs_0_lst=vllm_outputs_no_cache[0][:3],
+        outputs_1_lst=vllm_outputs_partial_cache[0],
+        name_0="vllm_no_cache",
+        name_1="vllm_partial_cache",
+    )
+
+    vllm_outputs_cache_rep, _ = _get_vLLM_output(
+        vllm_runner,
+        vllm_runner_kwargs,
+        generated_prompts,
+        max_tokens,
+        num_logprobs,
+        n_repetitions,
+        vllm_model=vllm_model,
+    )
+
+    for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+        # In the first repetition, the caches are filled
+        # In the second repetition, these caches are reused
+
+        compare_operator(
+            outputs_0_lst=vllm_outputs_no_cache[0],
+            outputs_1_lst=vllm_outputs_cache_itn,
+            name_0="vllm_no_cache",
+            name_1=f"vllm_cache_it_{r_idx + 1}",
+        )
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index 845afbfa8a45ede4c7977f3e44fbb50d487e29d4..0ae83ec16020a4cd7a679b4db5ee4c019196ac91 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -6,7 +6,9 @@ import json
 import pytest
 
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
-    MistralToolCall, MistralToolParser)
+    MistralToolCall,
+    MistralToolParser,
+)
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import MistralTokenizer
 
@@ -33,136 +35,118 @@ SYMBOLIC_LANG_PROMPTS = [
 ]
 
 # for function calling
-TOOLS = [{
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, e.g. 'San Francisco'"
-                },
-                "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string",
+                        "description": "The city to find the weather for, e.g. "
+                        "'San Francisco'",
+                    },
+                    "state": {
+                        "type": "string",
+                        "description": "the two-letter abbreviation for the state that "
+                        "the city is in, e.g. 'CA' which would mean 'California'",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
                 },
-                "unit": {
-                    "type": "string",
-                    "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
+                "required": ["city", "state", "unit"],
             },
-            "required": ["city", "state", "unit"]
-        }
+        },
     },
-}, {
-    "type": "function",
-    "function": {
-        "name": "rewrite",
-        "description": "Rewrites text",
-        "parameters": {
-            "type": "object",
-            "required": [],
-            "properties": {
-                "text": {
-                    "type": "string",
-                    "description": "The input text to rewrite."
-                }
-            }
-        }
-    }
-}]
-MSGS = [
     {
-        "role": "system",
-        "content": "You are an assistant."
+        "type": "function",
+        "function": {
+            "name": "rewrite",
+            "description": "Rewrites text",
+            "parameters": {
+                "type": "object",
+                "required": [],
+                "properties": {
+                    "text": {
+                        "type": "string",
+                        "description": "The input text to rewrite.",
+                    }
+                },
+            },
+        },
     },
+]
+MSGS = [
+    {"role": "system", "content": "You are an assistant."},
     {
-        "role":
-        "user",
-        "content":
-        "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors."  # noqa
+        "role": "user",
+        "content": "Could you please rewrite the below article? \n\n My English needs "
+        "improvving, maybe I make errors.",
     },
     {
-        "role":
-        "assistant",
-        "content":
-        "",
-        "tool_calls": [{
-            "id": "bbc5b7ede",
-            "type": "function",
-            "function": {
-                "name":
-                "rewrite",
-                "arguments":
-                '{\"text\":\"My English needs improvving, maybe I make errors.\"}'  # noqa
+        "role": "assistant",
+        "content": "",
+        "tool_calls": [
+            {
+                "id": "bbc5b7ede",
+                "type": "function",
+                "function": {
+                    "name": "rewrite",
+                    "arguments": '{"text":"My English needs improvving, maybe '
+                    'I make errors."}',
+                },
             }
-        }]
+        ],
     },
     {
         "role": "tool",
-        "content":
-        "{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}",  # noqa
+        "content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
+        'I make errors."}',
         "tool_call_id": "bbc5b7ede",
-        "name": "rewrite"
+        "name": "rewrite",
     },
     {
         "role": "assistant",
-        "content": "---\n\nMy English needs improving, maybe I make errors"
+        "content": "---\n\nMy English needs improving, maybe I make errors",
     },
     {
-        "role":
-        "user",
-        "content": ("Can you tell me what the temperate"
-                    " will be in Dallas, in fahrenheit?")
-    }
+        "role": "user",
+        "content": (
+            "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+        ),
+    },
 ]
 
 SAMPLE_JSON_SCHEMA = {
     "type": "object",
     "properties": {
-        "name": {
-            "type": "string"
-        },
-        "age": {
-            "type": "integer"
-        },
+        "name": {"type": "string"},
+        "age": {"type": "integer"},
         "skills": {
             "type": "array",
-            "items": {
-                "type": "string",
-                "maxLength": 10
-            },
-            "minItems": 3
+            "items": {"type": "string", "maxLength": 10},
+            "minItems": 3,
         },
         "work_history": {
             "type": "array",
             "items": {
                 "type": "object",
                 "properties": {
-                    "company": {
-                        "type": "string"
-                    },
-                    "duration": {
-                        "type": "number"
-                    },
-                    "position": {
-                        "type": "string"
-                    }
+                    "company": {"type": "string"},
+                    "duration": {"type": "number"},
+                    "position": {"type": "string"},
                 },
-                "required": ["company", "position"]
-            }
-        }
+                "required": ["company", "position"],
+            },
+        },
     },
-    "required": ["name", "age", "skills", "work_history"]
+    "required": ["name", "age", "skills", "work_history"],
 }
 
 
@@ -170,17 +154,25 @@ SAMPLE_JSON_SCHEMA = {
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, example_prompts, model: str,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     # TODO(sang): Sliding window should be tested separately.
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-    with vllm_runner(model, dtype=dtype,
-                     tokenizer_mode="mistral") as vllm_model:
+    with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
@@ -194,27 +186,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
-                        max_tokens: int, num_logprobs: int) -> None:
+def test_mistral_format(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     with vllm_runner(
-            model,
-            dtype=dtype,
-            tokenizer_mode="mistral",
-            load_format="mistral",
-            config_format="mistral",
+        model,
+        dtype=dtype,
+        tokenizer_mode="mistral",
+        load_format="mistral",
+        config_format="mistral",
     ) as mistral_format_model:
         mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     with vllm_runner(
-            model,
-            dtype=dtype,
-            tokenizer_mode="auto",
-            load_format="safetensors",
-            config_format="hf",
+        model,
+        dtype=dtype,
+        tokenizer_mode="auto",
+        load_format="safetensors",
+        config_format="hf",
     ) as hf_format_model:
         hf_format_outputs = hf_format_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=hf_format_outputs,
@@ -226,34 +226,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
 
 @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_mistral_symbolic_languages(vllm_runner, model: str,
-                                    dtype: str) -> None:
-    with vllm_runner(model,
-                     dtype=dtype,
-                     max_model_len=8192,
-                     tokenizer_mode="mistral",
-                     config_format="mistral",
-                     load_format="mistral") as vllm_model:
+def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        max_model_len=8192,
+        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
+    ) as vllm_model:
         for prompt in SYMBOLIC_LANG_PROMPTS:
             msg = {"role": "user", "content": prompt}
-            outputs = vllm_model.llm.chat([msg],
-                                          sampling_params=SAMPLING_PARAMS)
+            outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
             assert "�" not in outputs[0].outputs[0].text.strip()
 
 
 @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tokenizer_mode="mistral",
-                     config_format="mistral",
-                     load_format="mistral") as vllm_model:
-
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
+    ) as vllm_model:
         msgs = copy.deepcopy(MSGS)
-        outputs = vllm_model.llm.chat(msgs,
-                                      tools=TOOLS,
-                                      sampling_params=SAMPLING_PARAMS)
+        outputs = vllm_model.llm.chat(
+            msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
+        )
 
         tokenizer = vllm_model.llm.get_tokenizer()
         tool_parser = MistralToolParser(tokenizer)
@@ -265,10 +266,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
         assert parsed_message.tools_called
 
         assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
-        assert parsed_message.tool_calls[
-            0].function.name == "get_current_weather"
-        assert parsed_message.tool_calls[
-            0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'  # noqa
+        assert parsed_message.tool_calls[0].function.name == "get_current_weather"
+        assert (
+            parsed_message.tool_calls[0].function.arguments
+            == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
+        )  # noqa
         assert parsed_message.content is None
 
 
@@ -297,17 +299,10 @@ def test_mistral_function_call_nested_json():
         "city": "Dallas",
         "state": "TX",
         "unit": "fahrenheit",
-        "sub_dict": {
-            "foo": "bar",
-            "inner": {
-                "x": 1,
-                "y": 2
-            }
-        },
+        "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
     }
 
-    model_output = (
-        f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
+    model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
 
     parsed = parser.extract_tool_calls(model_output, None)
 
diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py
index 6c9cc2821c30fbe2e72337a72b3204b38d8c87d8..e640655784ccb5b5b99e5e02ba242990717fbcb9 100644
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -15,62 +15,56 @@ MODELS = [
 
 def test_phimoe_routing_function():
     from vllm.model_executor.models.phimoe import phimoe_routing_function
+
     test_case = {
         0: {
-            "hidden_states":
-            torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
-                         dtype=torch.float32,
-                         requires_grad=False).view(4, 2),
-            "gating_output":
-            torch.tensor([0.1, 0.2, 0.3, 0.4],
-                         dtype=torch.float32,
-                         requires_grad=False),
-            "topk":
-            2,
-            "renormalize":
-            False,
+            "hidden_states": torch.tensor(
+                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
+            ).view(4, 2),
+            "gating_output": torch.tensor(
+                [0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
+            ),
+            "topk": 2,
+            "renormalize": False,
         },
         1: {
-            "hidden_states":
-            torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
-                         dtype=torch.float32,
-                         requires_grad=False).view(4, 2),
-            "gating_output":
-            torch.tensor([0.4, 0.2, 0.3, 0.4],
-                         dtype=torch.float32,
-                         requires_grad=False),
-            "topk":
-            2,
-            "renormalize":
-            False,
-        }
+            "hidden_states": torch.tensor(
+                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
+            ).view(4, 2),
+            "gating_output": torch.tensor(
+                [0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
+            ),
+            "topk": 2,
+            "renormalize": False,
+        },
     }
 
     ground_truth = {
         0: {
-            "topk_weights":
-            torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
-            "topk_ids":
-            torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
+            "topk_weights": torch.tensor(
+                [1.0, 1.0], dtype=torch.float32, requires_grad=False
+            ),
+            "topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
         },
         1: {
-            "topk_weights":
-            torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
-            "topk_ids":
-            torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
-        }
+            "topk_weights": torch.tensor(
+                [0.5, 1.0], dtype=torch.float32, requires_grad=False
+            ),
+            "topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
+        },
     }
 
     for test_id in test_case:
         topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
-        assert torch.allclose(topk_weights,
-                              ground_truth[test_id]["topk_weights"])
+        assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
         assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
 
 
-@pytest.mark.skipif(condition=current_platform.is_cpu(),
-                    reason="This test takes a lot time to run on CPU, "
-                    "and vllm CI's disk space is not enough for this model.")
+@pytest.mark.skipif(
+    condition=current_platform.is_cpu(),
+    reason="This test takes a lot time to run on CPU, "
+    "and vllm CI's disk space is not enough for this model.",
+)
 @large_gpu_test(min_gb=80)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -87,11 +81,13 @@ def test_models(
 ) -> None:
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py
index 6225bbe3377bd25704839f0b01d82c47b53a58be..59740505e82704f205cfa1b7e09fdee77f7e779c 100644
--- a/tests/models/language/generation_ppl_test/ppl_utils.py
+++ b/tests/models/language/generation_ppl_test/ppl_utils.py
@@ -1,15 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/docs/transformers/perplexity
-from typing import Optional, cast
+from typing import cast
 
-import pytest
 import torch
 from datasets import load_dataset
 
 import tests.ci_envs as ci_envs
-from tests.models.utils import (GenerateModelInfo,
-                                TokensTextLogprobsPromptLogprobs)
+from tests.models.utils import (
+    GenerateModelInfo,
+    TokensTextLogprobsPromptLogprobs,
+    get_vllm_extra_kwargs,
+)
 from vllm.logprobs import Logprob
 
 # See #24485
@@ -18,41 +20,25 @@ MAX_LENGTH = 1024
 
 
 @torch.inference_mode
-def wikitext_ppl_test(hf_runner,
-                      vllm_runner,
-                      model_info: GenerateModelInfo,
-                      max_length=MAX_LENGTH,
-                      vllm_extra_kwargs=None,
-                      atol=PPL_TOL):
-
-    # A model family has many models with the same architecture,
-    # and we don't need to test each one.
-    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
-        pytest.skip("Skipping test.")
+def wikitext_ppl_test(
+    hf_runner,
+    vllm_runner,
+    model_info: GenerateModelInfo,
+    max_length=MAX_LENGTH,
+    vllm_extra_kwargs=None,
+    atol=PPL_TOL,
+):
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
 
     dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
 
-    # Allow vllm to test using the given dtype, such as float32
-    vllm_extra_kwargs = vllm_extra_kwargs or {}
-    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
-
-    # Allow vllm to test using hf_overrides
-    if model_info.hf_overrides is not None:
-        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
-
-    # Allow changing the head dtype used by vllm in tests
-    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
-        if "hf_overrides" not in vllm_extra_kwargs:
-            vllm_extra_kwargs["hf_overrides"] = {}
-        vllm_extra_kwargs["hf_overrides"][
-            "head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
-
-    with vllm_runner(model_info.name,
-                     gpu_memory_utilization=0.7,
-                     max_model_len=max_length,
-                     max_num_seqs=1,
-                     enforce_eager=True,
-                     **vllm_extra_kwargs) as vllm_model:
+    with vllm_runner(
+        model_info.name,
+        gpu_memory_utilization=0.7,
+        max_model_len=max_length,
+        max_num_seqs=1,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
         # Use max_num_seqs=1 to avoid OOM,
         # and avoid batch different requests together.
 
@@ -60,7 +46,7 @@ def wikitext_ppl_test(hf_runner,
 
         # Confirm whether vllm is using the correct architecture
         if model_info.architecture:
-            assert (model_info.architecture in model_config.architectures)
+            assert model_info.architecture in model_config.architectures
 
         max_length = min(model_config.max_model_len - 1, max_length)
         stride = max_length
@@ -74,16 +60,18 @@ def wikitext_ppl_test(hf_runner,
             end_loc = min(begin_loc + max_length, n_tokens)
             chunks.append(tokens[begin_loc:end_loc])
 
-        outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
-                                                      max_tokens=1,
-                                                      num_logprobs=None,
-                                                      num_prompt_logprobs=0,
-                                                      use_tqdm=False)
-        nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
+        outputs = vllm_model.generate_greedy_logprobs(
+            prompts=chunks,
+            max_tokens=1,
+            num_logprobs=None,
+            num_prompt_logprobs=0,
+            use_tqdm=False,
+        )
+        nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
         n_tokens = 0
         for output in outputs:
             output = cast(TokensTextLogprobsPromptLogprobs, output)
-            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+            token_datas = cast(list[dict[int, Logprob] | None], output[3])
 
             assert token_datas[0] is None
             token_log_probs = []
@@ -94,7 +82,8 @@ def wikitext_ppl_test(hf_runner,
                 token_log_probs.append(token_log_prob)
 
             neg_log_likelihood = -torch.tensor(
-                token_log_probs, dtype=torch.float32, device="cpu").sum()
+                token_log_probs, dtype=torch.float32, device="cpu"
+            ).sum()
             nll_sum += neg_log_likelihood
             n_tokens += len(token_log_probs)
         vllm_ppl = float(torch.exp(nll_sum / n_tokens))
@@ -104,14 +93,13 @@ def wikitext_ppl_test(hf_runner,
     # Accelerate ppl test by setting Transformers ppl score to a constant
     if model_info.hf_ppl is None:
         with hf_runner(
-                model_info.name,
-                dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
+            model_info.name,
+            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
         ) as hf_model:
-            nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
+            nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
             n_tokens = 0
             for chunk in chunks:
-                inputs = hf_model.wrap_device(
-                    {"input_ids": torch.tensor([chunk])})
+                inputs = hf_model.wrap_device({"input_ids": torch.tensor([chunk])})
                 input_ids = inputs["input_ids"]
                 outputs = hf_model.model(input_ids, labels=input_ids)
                 neg_log_likelihood = outputs.loss
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
index 86751e0a4d5f4ce23f104f2e1aa1748d99dee4eb..4ac40656bc62a2937c5f9fde76fc8860b29b314f 100644
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -1,20 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
-from typing import Optional
 
 import pytest
 
 from tests.conftest import HfRunner
-from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
-                                matryoshka_fy)
+from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
 
 
 def run_embedding_correctness_test(
     hf_model: "HfRunner",
     inputs: list[str],
     vllm_outputs: Sequence[list[float]],
-    dimensions: Optional[int] = None,
+    dimensions: int | None = None,
 ):
     hf_outputs = hf_model.encode(inputs)
     if dimensions:
@@ -29,12 +27,14 @@ def run_embedding_correctness_test(
     )
 
 
-def correctness_test_embed_models(hf_runner,
-                                  vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts,
-                                  vllm_extra_kwargs=None,
-                                  hf_model_callback=None):
+def correctness_test_embed_models(
+    hf_runner,
+    vllm_runner,
+    model_info: EmbedModelInfo,
+    example_prompts,
+    vllm_extra_kwargs=None,
+    hf_model_callback=None,
+):
     pytest.skip("Debug only, ci prefers to use mteb test.")
 
     # The example_prompts has ending "\n", for example:
@@ -51,18 +51,16 @@ def correctness_test_embed_models(hf_runner,
     if model_info.hf_overrides is not None:
         vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
 
-    with vllm_runner(model_info.name,
-                     runner="pooling",
-                     max_model_len=None,
-                     **vllm_extra_kwargs) as vllm_model:
+    with vllm_runner(
+        model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
+    ) as vllm_model:
         vllm_outputs = vllm_model.embed(example_prompts)
 
     with hf_runner(
-            model_info.name,
-            dtype=model_info.hf_dtype,
-            is_sentence_transformer=True,
+        model_info.name,
+        dtype=model_info.hf_dtype,
+        is_sentence_transformer=True,
     ) as hf_model:
-
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
 
diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py
index 15e24c59d1dd9a119fa552195dcd4e4d360e8dc0..0904c7e877ef4bd2b828e63d529e159f5eb8ad76 100644
--- a/tests/models/language/pooling/test_auto_prefix_cache_support.py
+++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py
@@ -4,8 +4,7 @@ import pytest
 import torch
 from transformers import AutoModelForSequenceClassification
 
-from tests.models.language.pooling.embed_utils import (
-    run_embedding_correctness_test)
+from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
 
 
 @pytest.mark.parametrize(
@@ -20,28 +19,38 @@ def test_classify_models(
     model: str,
     dtype: str,
 ) -> None:
+    # example_prompts is too short for testing prefix_caching
+    example_prompts = [s * 10 for s in example_prompts]
 
-    example_prompts = example_prompts * 2
-
-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     enable_prefix_caching=True) as vllm_model:
+    with vllm_runner(
+        model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
+    ) as vllm_model:
         cache_config = vllm_model.llm.llm_engine.cache_config
         assert cache_config.enable_prefix_caching
-        vllm_outputs = vllm_model.classify(example_prompts)
 
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+        # First Run
+        vllm_model.classify(example_prompts)
+
+        # assert prefix_caching works
+        pooling_outputs = vllm_model.llm.encode(
+            example_prompts, pooling_task="classify"
+        )
+        for output in pooling_outputs:
+            assert output.num_cached_tokens > 0
+        vllm_outputs = [req_output.outputs.data for req_output in pooling_outputs]
+
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
+    ) as hf_model:
         hf_outputs = hf_model.classify(example_prompts)
 
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
         hf_output = torch.tensor(hf_output)
         vllm_output = torch.tensor(vllm_output)
 
-        assert torch.allclose(hf_output, vllm_output,
-                              1e-3 if dtype == "float" else 1e-2)
+        assert torch.allclose(
+            hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
+        )
 
 
 @pytest.mark.parametrize(
@@ -56,21 +65,30 @@ def test_embed_models(
     model: str,
     dtype: str,
 ):
-    example_prompts = [str(s).strip() for s in example_prompts] * 2
+    # example_prompts is too short for testing prefix_caching
+    example_prompts = [str(s).strip() * 10 for s in example_prompts]
 
     with vllm_runner(
-            model,
-            runner="pooling",
-            max_model_len=None,
-            enable_prefix_caching=True,
+        model,
+        runner="pooling",
+        max_model_len=None,
+        enable_prefix_caching=True,
     ) as vllm_model:
         cache_config = vllm_model.llm.llm_engine.cache_config
         assert cache_config.enable_prefix_caching
-        vllm_outputs = vllm_model.embed(example_prompts)
+
+        # First Run
+        vllm_model.embed(example_prompts)
+
+        # assert prefix_caching works
+        pooling_outputs = vllm_model.llm.encode(example_prompts, pooling_task="embed")
+        for output in pooling_outputs:
+            assert output.num_cached_tokens > 0
+        vllm_outputs = [req_output.outputs.data for req_output in pooling_outputs]
 
     with hf_runner(
-            model,
-            is_sentence_transformer=True,
+        model,
+        is_sentence_transformer=True,
     ) as hf_model:
         run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
 
@@ -81,13 +99,14 @@ def test_embed_models(
         "intfloat/e5-small",
         "Alibaba-NLP/gte-Qwen2-1.5B-instruct",  # is_causal == False
         "papluca/xlm-roberta-base-language-detection",
-    ])
+    ],
+)
 @pytest.mark.parametrize("dtype", ["half"])
-def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
-                           dtype: str) -> None:
-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     enable_prefix_caching=True) as vllm_model:
+def test_non_causal_models(
+    hf_runner, vllm_runner, example_prompts, model: str, dtype: str
+) -> None:
+    with vllm_runner(
+        model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
+    ) as vllm_model:
         cache_config = vllm_model.llm.llm_engine.cache_config
         assert not cache_config.enable_prefix_caching
diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py
index 8e398830d39df96fc5f5972fb233798af0cb8728..2723bb21de97bc4d775d76b81d3e50681b7bc1ca 100644
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -10,34 +10,30 @@ from vllm.platforms import current_platform
 @pytest.mark.parametrize(
     "model",
     [
-        pytest.param("jason9693/Qwen2.5-1.5B-apeach",
-                     marks=[
-                         pytest.mark.core_model, pytest.mark.cpu_model,
-                         pytest.mark.slow_test
-                     ]),
+        pytest.param(
+            "jason9693/Qwen2.5-1.5B-apeach",
+            marks=[
+                pytest.mark.core_model,
+                pytest.mark.cpu_model,
+                pytest.mark.slow_test,
+            ],
+        ),
     ],
 )
-@pytest.mark.parametrize("dtype",
-                         ["half"] if current_platform.is_rocm() else ["float"])
+@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
     dtype: str,
-    monkeypatch,
 ) -> None:
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
     with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.classify(example_prompts)
 
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
+    ) as hf_model:
         hf_outputs = hf_model.classify(example_prompts)
 
     # check logits difference
@@ -48,5 +44,6 @@ def test_models(
         # the tolerance value of 1e-2 is selected based on the
         # half datatype tests in
         # tests/models/language/pooling/test_embedding.py
-        assert torch.allclose(hf_output, vllm_output,
-                              1e-3 if dtype == "float" else 1e-2)
+        assert torch.allclose(
+            hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
+        )
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index 17513d1bb20d73b3dbf731142275029c58fdb129..93b9843311d338cdbca1969ca3c4137435710964 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 
 from vllm.config import PoolerConfig
-from vllm.platforms import current_platform
 
 from ...utils import check_embeddings_close
 
@@ -18,20 +16,24 @@ from ...utils import check_embeddings_close
         # case won't pass because gte-Qwen2-1.5B-instruct will cache custom
         # model code with bidirectional attention.
         # [Decoder-only]
-        pytest.param("BAAI/bge-multilingual-gemma2",
-                     marks=[pytest.mark.core_model, pytest.mark.slow_test]),
+        pytest.param(
+            "BAAI/bge-multilingual-gemma2",
+            marks=[pytest.mark.core_model, pytest.mark.slow_test],
+        ),
         pytest.param(
             "intfloat/e5-mistral-7b-instruct",
-            # CPU v1 doesn't support sliding window
-            marks=[pytest.mark.core_model]),
-        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
-                     marks=[pytest.mark.cpu_model]),
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+        pytest.param(
+            "ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
+        ),
         # [Encoder-only]
         pytest.param(
             "BAAI/bge-base-en-v1.5",
             marks=[
-                pytest.mark.core_model, pytest.mark.cpu_model,
-                pytest.mark.slow_test
+                pytest.mark.core_model,
+                pytest.mark.cpu_model,
+                pytest.mark.slow_test,
             ],
         ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
@@ -48,23 +50,17 @@ def test_models(
     vllm_runner,
     example_prompts,
     model,
-    monkeypatch,
 ) -> None:
-
-    if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
     vllm_extra_kwargs = {}
     if model == "ssmits/Qwen2-7B-Instruct-embed-base":
-        vllm_extra_kwargs["pooler_config"] = \
-            PoolerConfig(pooling_type="MEAN", normalize=False)
+        vllm_extra_kwargs["pooler_config"] = PoolerConfig(
+            pooling_type="MEAN", normalize=False
+        )
 
-    max_model_len: Optional[int] = 512
+    max_model_len: int | None = 512
     if model in [
-            "sentence-transformers/all-MiniLM-L12-v2",
-            "sentence-transformers/stsb-roberta-base-v2"
+        "sentence-transformers/all-MiniLM-L12-v2",
+        "sentence-transformers/stsb-roberta-base-v2",
     ]:
         max_model_len = None
 
@@ -79,10 +75,9 @@ def test_models(
     with hf_runner(model, is_sentence_transformer=True) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
-    with vllm_runner(model,
-                     runner="pooling",
-                     max_model_len=max_model_len,
-                     **vllm_extra_kwargs) as vllm_model:
+    with vllm_runner(
+        model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
+    ) as vllm_model:
         vllm_outputs = vllm_model.embed(example_prompts)
 
     check_embeddings_close(
diff --git a/tests/models/language/pooling/test_extract_hidden_states.py b/tests/models/language/pooling/test_extract_hidden_states.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d41b93233d5a8200de0cc92c00c687f32018c53
--- /dev/null
+++ b/tests/models/language/pooling/test_extract_hidden_states.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm import TokensPrompt
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["Qwen/Qwen3-0.6B"],
+)
+@torch.inference_mode
+def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
+    n_prompt_tokens = [55, 56, 57]
+    token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
+
+    with vllm_runner(
+        model,
+        max_model_len=128,
+        enforce_eager=True,
+        runner="pooling",
+        enable_chunked_prefill=False,
+        enable_prefix_caching=True,
+    ) as vllm_model:
+        pooling_outputs = vllm_model.llm.encode(
+            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
+            pooling_task="token_embed",
+        )
+
+        for n, output in zip(n_prompt_tokens, pooling_outputs):
+            assert len(output.prompt_token_ids) == n
+            assert len(output.outputs.data) == n
+            assert output.num_cached_tokens == 0
+
+        # test enable_prefix_caching plus all pooling
+        # we need to skip reading cache at this request by
+        # request.skip_reading_prefix_cache
+        pooling_outputs = vllm_model.llm.encode(
+            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
+            pooling_task="token_embed",
+        )
+
+        for n, output in zip(n_prompt_tokens, pooling_outputs):
+            assert len(output.prompt_token_ids) == n
+            assert len(output.outputs.data) == n
+            assert output.num_cached_tokens == 0
+
+        # skip_reading_prefix_cache can still write to cache
+        # to accelerate following requests
+        pooling_outputs = vllm_model.llm.encode(
+            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
+            pooling_task="embed",
+        )
+
+        for n, output in zip(n_prompt_tokens, pooling_outputs):
+            assert len(output.prompt_token_ids) == n
+            assert output.num_cached_tokens > 0
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index 17a55d916b1ff7b16af626557d4ee655679a5ffb..0adc9b5cf25f6fa8cacd3da356d9dc360e2c2d00 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import numpy as np
 import openai
 import pytest
@@ -70,8 +68,9 @@ async def run_client_embeddings(
 
 
 def gritlm_instruction(instruction):
-    return ("<|user|>\n" + instruction +
-            "\n<|embed|>\n" if instruction else "<|embed|>\n")
+    return (
+        "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
+    )
 
 
 def get_test_data():
@@ -80,7 +79,8 @@ def get_test_data():
     README.md in https://github.com/ContextualAI/gritlm
     """
     q_instruction = gritlm_instruction(
-        "Given a scientific paper title, retrieve the paper's abstract", )
+        "Given a scientific paper title, retrieve the paper's abstract",
+    )
     queries = [
         "Bitcoin: A Peer-to-Peer Electronic Cash System",
         "Generative Representational Instruction Tuning",
@@ -114,9 +114,9 @@ def test_gritlm_offline_embedding(vllm_runner):
     queries, q_instruction, documents, d_instruction = get_test_data()
 
     with vllm_runner(
-            MODEL_NAME,
-            runner="pooling",
-            max_model_len=MAX_MODEL_LEN,
+        MODEL_NAME,
+        runner="pooling",
+        max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
         llm = vllm_model.llm
 
@@ -161,9 +161,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
     input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
 
     with vllm_runner(
-            MODEL_NAME,
-            runner="generate",
-            max_model_len=MAX_MODEL_LEN,
+        MODEL_NAME,
+        runner="generate",
+        max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
         llm = vllm_model.llm
 
diff --git a/tests/models/language/pooling/test_head_dtype.py b/tests/models/language/pooling/test_head_dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..b60d4dade49a6ced2a9f46c4fc94475df486a708
--- /dev/null
+++ b/tests/models/language/pooling/test_head_dtype.py
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModelForSequenceClassification
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["nie3e/sentiment-polish-gpt2-small"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_classify_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
+    ) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    for head_dtype_str in ["float32", "model"]:
+        with vllm_runner(
+            model,
+            max_model_len=512,
+            dtype=dtype,
+            hf_overrides={"head_dtype": head_dtype_str},
+        ) as vllm_model:
+            model_config = vllm_model.llm.llm_engine.model_config
+            model_dtype = model_config.dtype
+            head_dtype = model_config.head_dtype
+
+            if head_dtype_str == "float32":
+                assert head_dtype == torch.float32
+            elif head_dtype_str == "model":
+                assert head_dtype == model_dtype
+
+            vllm_outputs = vllm_model.classify(example_prompts)
+
+        for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+            hf_output = torch.tensor(hf_output).float()
+            vllm_output = torch.tensor(vllm_output).float()
+
+            assert torch.allclose(hf_output, vllm_output, atol=1e-2)
diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py
index 9814cad48a807c1383055ebd5c314fbbd8e8d99c..2482452645ef1d65cd72b0d4ba64ccc5d6efab74 100644
--- a/tests/models/language/pooling/test_mm_classifier_conversion.py
+++ b/tests/models/language/pooling/test_mm_classifier_conversion.py
@@ -2,18 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.config.pooler import PoolerConfig
-from vllm.platforms import current_platform
 
 
 def test_idefics_multimodal(
     vllm_runner,
-    monkeypatch,
 ) -> None:
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -21,16 +14,18 @@ def test_idefics_multimodal(
         "The future of AI is",
     ]
 
-    with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
-                     runner="pooling",
-                     task="classify",
-                     convert="classify",
-                     load_format="dummy",
-                     max_model_len=512,
-                     enforce_eager=True,
-                     tensor_parallel_size=1,
-                     disable_log_stats=True,
-                     dtype="bfloat16") as vllm_model:
+    with vllm_runner(
+        model_name="HuggingFaceM4/Idefics3-8B-Llama3",
+        runner="pooling",
+        task="classify",
+        convert="classify",
+        load_format="dummy",
+        max_model_len=512,
+        enforce_eager=True,
+        tensor_parallel_size=1,
+        disable_log_stats=True,
+        dtype="bfloat16",
+    ) as vllm_model:
         llm = vllm_model.get_llm()
         outputs = llm.classify(prompts)
         for output in outputs:
@@ -38,36 +33,30 @@ def test_idefics_multimodal(
 
 
 def update_config(config):
-    config.text_config.update({
-        "architectures": ["Gemma3ForSequenceClassification"],
-        "classifier_from_token": ["A", "B", "C", "D", "E"],
-        "method":
-        "no_post_processing",
-        "id2label": {
-            "A": "Chair",
-            "B": "Couch",
-            "C": "Table",
-            "D": "Bed",
-            "E": "Cupboard"
-        },
-    })
+    config.text_config.update(
+        {
+            "architectures": ["Gemma3ForSequenceClassification"],
+            "classifier_from_token": ["A", "B", "C", "D", "E"],
+            "method": "no_post_processing",
+            "id2label": {
+                "A": "Chair",
+                "B": "Couch",
+                "C": "Table",
+                "D": "Bed",
+                "E": "Cupboard",
+            },
+        }
+    )
     return config
 
 
 def test_gemma_multimodal(
     vllm_runner,
-    monkeypatch,
 ) -> None:
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
-    messages = [{
-        "role":
-        "system",
-        "content":
-        """
+    messages = [
+        {
+            "role": "system",
+            "content": """
     You are a helpful assistant. You will be given a product description
     which may also include an image. Classify the following product into
     one of the categories:
@@ -78,38 +67,39 @@ def test_gemma_multimodal(
     D = bed
     E = cupboard
 
-    You'll answer with exactly one letter (A, B, C, D, or E)."""
-    }, {
-        "role":
-        "user",
-        "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url":
-                "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
-            }
-        }, {
-            "type": "text",
-            "text": "A fine 19th century piece of furniture."
-        }]
-    }]
-
-    with vllm_runner(model_name="google/gemma-3-4b-it",
-                     runner="pooling",
-                     task="classify",
-                     convert="classify",
-                     load_format="auto",
-                     hf_overrides=update_config,
-                     pooler_config=PoolerConfig(pooling_type="LAST"),
-                     max_model_len=512,
-                     enforce_eager=True,
-                     tensor_parallel_size=1,
-                     disable_log_stats=True,
-                     dtype="bfloat16") as vllm_model:
+    You'll answer with exactly one letter (A, B, C, D, or E).""",
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
+                    },
+                },
+                {"type": "text", "text": "A fine 19th century piece of furniture."},
+            ],
+        },
+    ]
 
+    with vllm_runner(
+        model_name="google/gemma-3-4b-it",
+        runner="pooling",
+        task="classify",
+        convert="classify",
+        load_format="auto",
+        hf_overrides=update_config,
+        pooler_config=PoolerConfig(pooling_type="LAST"),
+        max_model_len=512,
+        enforce_eager=True,
+        tensor_parallel_size=1,
+        disable_log_stats=True,
+        dtype="bfloat16",
+    ) as vllm_model:
         llm = vllm_model.get_llm()
         prompts = llm.preprocess_chat(messages)
 
         result = llm.classify(prompts)
         assert result[0].outputs.probs[0] > 0.95
-        assert all(c < 0.05 for c in result[0].outputs.probs[1:])
\ No newline at end of file
+        assert all(c < 0.05 for c in result[0].outputs.probs[1:])
diff --git a/tests/models/language/pooling/test_multi_vector_retrieval.py b/tests/models/language/pooling/test_multi_vector_retrieval.py
new file mode 100644
index 0000000000000000000000000000000000000000..302f2df135579d14818236bbde5d11780a179cde
--- /dev/null
+++ b/tests/models/language/pooling/test_multi_vector_retrieval.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModel
+
+from tests.models.utils import check_embeddings_close
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["BAAI/bge-m3"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@torch.inference_mode
+def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
+    with vllm_runner(
+        model,
+        runner="pooling",
+        max_model_len=None,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.token_embed(example_prompts)
+
+    with hf_runner(
+        model,
+        auto_cls=AutoModel,
+    ) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            embedding = output.last_hidden_state[0].float()
+            # normal
+            hf_outputs.append(embedding.cpu())
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        check_embeddings_close(
+            embeddings_0_lst=hf_output,
+            embeddings_1_lst=vllm_output,
+            name_0="hf",
+            name_1="vllm",
+            tol=1e-2,
+        )
diff --git a/tests/models/language/pooling/test_multilabel_classification_support.py b/tests/models/language/pooling/test_multilabel_classification_support.py
index 45366f20941447fbc64b82e83d2645b166ac002b..472fee71711a637112bc512f1953afc73fa115ce 100644
--- a/tests/models/language/pooling/test_multilabel_classification_support.py
+++ b/tests/models/language/pooling/test_multilabel_classification_support.py
@@ -20,14 +20,15 @@ def test_classify_models(
     with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.classify(example_prompts)
 
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
+    ) as hf_model:
         hf_outputs = hf_model.classify(example_prompts)
 
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
         hf_output = torch.tensor(hf_output)
         vllm_output = torch.tensor(vllm_output)
 
-        assert torch.allclose(hf_output, vllm_output,
-                              1e-3 if dtype == "float" else 1e-2)
+        assert torch.allclose(
+            hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
+        )
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
index c34c36fd98150badd1755a3800f8f9d6bf60f90b..88f088c603276079c803cc41c96322f8868de923 100644
--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -7,10 +7,10 @@ from ...utils import EmbedModelInfo
 
 MODELS = [
     EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
-    #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
-    #EmbedModelInfo("nomic-ai/CodeRankEmbed"),
+    # EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
+    # EmbedModelInfo("nomic-ai/CodeRankEmbed"),
     EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
-    #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
+    # EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
 ]
 
 rope_theta = 1000
@@ -21,23 +21,24 @@ max_model_len = int(original_max_position_embeddings * factor)
 
 @pytest.mark.parametrize("model_info", MODELS)
 def test_default(model_info, vllm_runner):
-    with vllm_runner(model_info.name, runner="pooling",
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        model_info.name, runner="pooling", max_model_len=None
+    ) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
             # For nomic-embed-text-v2-moe the length is set to 512
             # by sentence_bert_config.json.
             assert model_config.max_model_len == 512
         else:
-            assert (
-                model_config.max_model_len == original_max_position_embeddings)
+            assert model_config.max_model_len == original_max_position_embeddings
 
 
 @pytest.mark.parametrize("model_info", MODELS)
 def test_set_max_model_len_legal(model_info, vllm_runner):
     # set max_model_len <= 512
-    with vllm_runner(model_info.name, runner="pooling",
-                     max_model_len=256) as vllm_model:
+    with vllm_runner(
+        model_info.name, runner="pooling", max_model_len=256
+    ) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         assert model_config.max_model_len == 256
 
@@ -46,13 +47,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
         # For nomic-embed-text-v2-moe the length is set to 512
         # by sentence_bert_config.json.
         with pytest.raises(ValueError):
-            with vllm_runner(model_info.name,
-                             runner="pooling",
-                             max_model_len=1024):
+            with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
                 pass
     else:
-        with vllm_runner(model_info.name, runner="pooling",
-                         max_model_len=1024) as vllm_model:
+        with vllm_runner(
+            model_info.name, runner="pooling", max_model_len=1024
+        ) as vllm_model:
             model_config = vllm_model.llm.llm_engine.model_config
             assert model_config.max_model_len == 1024
 
@@ -61,17 +61,18 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
 def test_set_max_model_len_illegal(model_info, vllm_runner):
     # set max_model_len > 2048
     with pytest.raises(ValueError):
-        with vllm_runner(model_info.name, runner="pooling",
-                         max_model_len=4096):
+        with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
             pass
 
     # set max_model_len > 2048 by hf_overrides
     hf_overrides = {"max_model_len": 4096}
     with pytest.raises(ValueError):
-        with vllm_runner(model_info.name,
-                         runner="pooling",
-                         max_model_len=None,
-                         hf_overrides=hf_overrides):
+        with vllm_runner(
+            model_info.name,
+            runner="pooling",
+            max_model_len=None,
+            hf_overrides=hf_overrides,
+        ):
             pass
 
 
@@ -82,16 +83,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
         "rope_scaling": {
             "rope_type": "yarn",
             "factor": factor,
-            "original_max_position_embeddings":
-            original_max_position_embeddings
+            "original_max_position_embeddings": original_max_position_embeddings,
         },
-        "max_model_len": max_model_len
+        "max_model_len": max_model_len,
     }
 
-    with vllm_runner(model_info.name,
-                     runner="pooling",
-                     max_model_len=None,
-                     hf_overrides=hf_overrides):
+    with vllm_runner(
+        model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
+    ):
         pass
 
 
@@ -102,16 +101,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
         "rope_scaling": {
             "rope_type": "yarn",
             "factor": factor,
-            "original_max_position_embeddings":
-            original_max_position_embeddings
-        }
+            "original_max_position_embeddings": original_max_position_embeddings,
+        },
     }
     # illegal max_model_len
     with pytest.raises(ValueError):
-        with vllm_runner(model_info.name,
-                         runner="pooling",
-                         max_model_len=max_model_len + 1,
-                         hf_overrides=hf_overrides):
+        with vllm_runner(
+            model_info.name,
+            runner="pooling",
+            max_model_len=max_model_len + 1,
+            hf_overrides=hf_overrides,
+        ):
             pass
 
     hf_overrides = {
@@ -119,15 +119,16 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
         "rope_scaling": {
             "rope_type": "yarn",
             "factor": factor,
-            "original_max_position_embeddings":
-            original_max_position_embeddings
+            "original_max_position_embeddings": original_max_position_embeddings,
         },
-        "max_model_len": max_model_len + 1
+        "max_model_len": max_model_len + 1,
     }
     # illegal max_model_len by hf_overrides
     with pytest.raises(ValueError):
-        with vllm_runner(model_info.name,
-                         runner="pooling",
-                         max_model_len=None,
-                         hf_overrides=hf_overrides):
+        with vllm_runner(
+            model_info.name,
+            runner="pooling",
+            max_model_len=None,
+            hf_overrides=hf_overrides,
+        ):
             pass
diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
index 9b3fbd6a6cd093ab5901a008a7980390d9f8bd9f..deb5de984d90917e8e8ee98a1579d5035893a0c3 100644
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -10,10 +10,7 @@ from vllm.config import PoolerConfig
 
 @pytest.mark.parametrize(
     "model",
-    [
-        "jason9693/Qwen2.5-1.5B-apeach",
-        "papluca/xlm-roberta-base-language-detection"
-    ],
+    ["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
 )
 @pytest.mark.parametrize("dtype", ["half"])
 def test_classify_models_using_activation(
@@ -23,30 +20,32 @@ def test_classify_models_using_activation(
     model: str,
     dtype: str,
 ) -> None:
-
     with vllm_runner(
-            model,
-            max_model_len=512,
-            dtype=dtype,
-            pooler_config=PoolerConfig(activation=False)) as vllm_model:
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(use_activation=False),
+    ) as vllm_model:
         wo_activation_out = vllm_model.classify(example_prompts)
 
     with vllm_runner(
-            model,
-            max_model_len=512,
-            dtype=dtype,
-            pooler_config=PoolerConfig(activation=True)) as vllm_model:
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(use_activation=True),
+    ) as vllm_model:
         w_activation_out = vllm_model.classify(example_prompts)
 
-    for wo_activation, w_activation in zip(wo_activation_out,
-                                           w_activation_out):
+    for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
         wo_activation = torch.tensor(wo_activation)
         w_activation = torch.tensor(w_activation)
 
-        assert not torch.allclose(wo_activation, w_activation,
-                                  atol=1e-2), "pooler_config is not working"
-        assert torch.allclose(softmax(wo_activation), w_activation,
-                              1e-3 if dtype == "float" else 1e-2)
+        assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
+            "pooler_config is not working"
+        )
+        assert torch.allclose(
+            softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
+        )
 
 
 @pytest.mark.parametrize(
@@ -63,26 +62,28 @@ def test_embed_models_using_normalize(
     model: str,
     dtype: str,
 ) -> None:
-
     with vllm_runner(
-            model,
-            max_model_len=512,
-            dtype=dtype,
-            pooler_config=PoolerConfig(normalize=False)) as vllm_model:
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=False),
+    ) as vllm_model:
         wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
 
-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     pooler_config=PoolerConfig(normalize=True)) as vllm_model:
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=True),
+    ) as vllm_model:
         w_normalize = torch.tensor(vllm_model.embed(example_prompts))
 
-    assert not torch.allclose(
-        wo_normalize, w_normalize,
-        atol=1e-2), "pooler_config normalize is not working"
+    assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
+        "pooler_config normalize is not working"
+    )
     assert torch.allclose(
-        F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
-        atol=1e-2), "w_normal should be close to normal(wo_normal)."
+        F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
+    ), "w_normal should be close to normal(wo_normal)."
 
 
 @pytest.mark.parametrize(
@@ -92,32 +93,75 @@ def test_embed_models_using_normalize(
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
-def test_reward_models_using_softmax(
+def test_reward_models_using_activation(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
     dtype: str,
 ) -> None:
+    with vllm_runner(
+        model,
+        max_model_len=1024,
+        dtype=dtype,
+        pooler_config=PoolerConfig(use_activation=False),
+    ) as vllm_model:
+        wo_activation = vllm_model.reward(example_prompts)
 
-    with vllm_runner(model,
-                     max_model_len=1024,
-                     dtype=dtype,
-                     pooler_config=PoolerConfig(softmax=False)) as vllm_model:
-        wo_softmax = vllm_model.encode(example_prompts)
-
-    with vllm_runner(model,
-                     max_model_len=1024,
-                     dtype=dtype,
-                     pooler_config=PoolerConfig(softmax=True)) as vllm_model:
-        w_softmax = vllm_model.encode(example_prompts)
-
-    for wo, w in zip(wo_softmax, w_softmax):
+    with vllm_runner(
+        model,
+        max_model_len=1024,
+        dtype=dtype,
+        pooler_config=PoolerConfig(use_activation=True),
+    ) as vllm_model:
+        w_activation = vllm_model.reward(example_prompts)
+
+    for wo, w in zip(wo_activation, w_activation):
         wo = torch.tensor(wo)
         w = torch.tensor(w)
 
-        assert not torch.allclose(
-            wo, w, atol=1e-2), "pooler_config softmax is not working"
-        assert torch.allclose(
-            softmax(wo), w,
-            atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
+        assert not torch.allclose(wo, w, atol=1e-2), (
+            "pooler_config activation is not working"
+        )
+        assert torch.allclose(softmax(wo), w, atol=1e-2), (
+            "w_activation should be close to activation(wo_activation)."
+        )
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "intfloat/multilingual-e5-small",
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_multi_vector_retrieval_models_using_normalize(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=False),
+    ) as vllm_model:
+        wo_normalize = vllm_model.token_embed(example_prompts)
+
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=True),
+    ) as vllm_model:
+        w_normalize = vllm_model.token_embed(example_prompts)
+
+    for wo, w in zip(wo_normalize, w_normalize):
+        assert not torch.allclose(wo, w, atol=1e-2), (
+            "pooler_config normalize is not working"
+        )
+        assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
+            "w_normal should be close to normal(wo_normal)."
+        )
diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py
index 4ac91b5aed506047d0ca247c8b281f005598d3ee..c42186c7db9a3af2740b74fd379ffc94a35a3324 100644
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -16,10 +16,8 @@ from ...utils import check_transformers_version
 def math_step_prompts():
     # ruff: noqa: E501
     data = {
-        "system":
-        "Please reason step by step, and put your final answer within \\boxed{}. ",
-        "query":
-        "Sue lives in a fun neighborhood.  One weekend, the neighbors decided to play a prank on Sue.  On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard.  On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard.  Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
+        "system": "Please reason step by step, and put your final answer within \\boxed{}. ",
+        "query": "Sue lives in a fun neighborhood.  One weekend, the neighbors decided to play a prank on Sue.  On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard.  On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard.  Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
         "response": [
             "To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
             "On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
@@ -27,16 +25,16 @@ def math_step_prompts():
             "To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
         ],
     }
-    answer = "<extra_0>".join(data['response']) + "<extra_0>"
+    answer = "<extra_0>".join(data["response"]) + "<extra_0>"
     prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
     return [prompt]
 
 
 def step_reward_patch_hf_model(hf_model: HfRunner):
-
     # Patch the hf_runner to use the step reward function
-    def make_step_rewards(logits: torch.Tensor,
-                          token_masks: torch.Tensor) -> list[list[float]]:
+    def make_step_rewards(
+        logits: torch.Tensor, token_masks: torch.Tensor
+    ) -> list[list[float]]:
         probabilities = F.softmax(logits, dim=-1)
         probabilities = probabilities * token_masks.unsqueeze(-1)
 
@@ -54,7 +52,7 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
         outputs = hf_model.model(input_ids=input_ids)
 
         step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
-        token_masks = (input_ids == step_sep_id)
+        token_masks = input_ids == step_sep_id
         return make_step_rewards(outputs[0], token_masks)
 
     hf_model.reward = reward  # type: ignore[attr-defined]
@@ -65,8 +63,10 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
 @pytest.mark.parametrize(
     "model",
     [
-        pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param(
+            "Qwen/Qwen2.5-Math-PRM-7B",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
@@ -76,19 +76,14 @@ def test_prm_models(
     math_step_prompts,
     model: str,
     dtype: str,
-    monkeypatch,
 ) -> None:
-    check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
-                               max_transformers_version="4.53.2")
+    check_transformers_version(
+        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
+    )
 
     if current_platform.is_cpu():
         pytest.skip("CPU only supports V1")
 
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
     with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.reward(math_step_prompts)
 
diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py
index ef9d5530cde15ada77b03b51c153e2807f30d930..416a43070f0e02ef0d4c366b3904e66592e6173f 100644
--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
@@ -37,10 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict([text_pair]).tolist()
 
-    with vllm_runner(model_name,
-                     runner="pooling",
-                     dtype=DTYPE,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        model_name, runner="pooling", dtype=DTYPE, max_model_len=None
+    ) as vllm_model:
         vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
 
     assert len(vllm_outputs) == 1
@@ -58,10 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict(text_pairs).tolist()
 
-    with vllm_runner(model_name,
-                     runner="pooling",
-                     dtype=DTYPE,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        model_name, runner="pooling", dtype=DTYPE, max_model_len=None
+    ) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
 
     assert len(vllm_outputs) == 2
@@ -80,10 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict(text_pairs).tolist()
 
-    with vllm_runner(model_name,
-                     runner="pooling",
-                     dtype=DTYPE,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        model_name, runner="pooling", dtype=DTYPE, max_model_len=None
+    ) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
 
     assert len(vllm_outputs) == 2
@@ -101,17 +98,15 @@ def emb_model_name(request):
 def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
     text_pair = [TEXTS_1[0], TEXTS_2[0]]
 
-    with hf_runner(emb_model_name, dtype=DTYPE,
-                   is_sentence_transformer=True) as hf_model:
+    with hf_runner(
+        emb_model_name, dtype=DTYPE, is_sentence_transformer=True
+    ) as hf_model:
         hf_embeddings = hf_model.encode(text_pair)
-        hf_outputs = [
-            F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
-        ]
+        hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
 
-    with vllm_runner(emb_model_name,
-                     runner="pooling",
-                     dtype=DTYPE,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
+    ) as vllm_model:
         vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
 
     assert len(vllm_outputs) == 1
@@ -126,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
         [TEXTS_1[0], TEXTS_2[1]],
     ]
 
-    with hf_runner(emb_model_name, dtype=DTYPE,
-                   is_sentence_transformer=True) as hf_model:
-        hf_embeddings = [
-            hf_model.encode(text_pair) for text_pair in text_pairs
-        ]
+    with hf_runner(
+        emb_model_name, dtype=DTYPE, is_sentence_transformer=True
+    ) as hf_model:
+        hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
         hf_outputs = [
             F.cosine_similarity(*map(torch.tensor, pair), dim=0)
             for pair in hf_embeddings
         ]
 
-    with vllm_runner(emb_model_name,
-                     runner="pooling",
-                     dtype=DTYPE,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
+    ) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
 
     assert len(vllm_outputs) == 2
@@ -155,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
         [TEXTS_1[1], TEXTS_2[1]],
     ]
 
-    with hf_runner(emb_model_name, dtype=DTYPE,
-                   is_sentence_transformer=True) as hf_model:
-        hf_embeddings = [
-            hf_model.encode(text_pair) for text_pair in text_pairs
-        ]
+    with hf_runner(
+        emb_model_name, dtype=DTYPE, is_sentence_transformer=True
+    ) as hf_model:
+        hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
         hf_outputs = [
             F.cosine_similarity(*map(torch.tensor, pair), dim=0)
             for pair in hf_embeddings
         ]
 
-    with vllm_runner(emb_model_name,
-                     runner="pooling",
-                     dtype=DTYPE,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
+    ) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
 
     assert len(vllm_outputs) == 2
diff --git a/tests/models/language/pooling/test_splade_sparse_pooler.py b/tests/models/language/pooling/test_splade_sparse_pooler.py
new file mode 100644
index 0000000000000000000000000000000000000000..af4fd764ef535ae0017c48f3d328b2e2fa582b03
--- /dev/null
+++ b/tests/models/language/pooling/test_splade_sparse_pooler.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import types
+
+import pytest
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.models.bert import (
+    BertMLMHead,
+    SPLADESparsePooler,
+)
+
+# ---------------------------------------------------------------------
+# Functional test: SPLADE formula correctness (no HF download needed)
+# ---------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("B,T,H,V", [(2, 3, 5, 7)])
+@torch.inference_mode
+def test_splade_pooler_matches_reference_formula(B, T, H, V):
+    """Ensure SPLADESparsePooler forward() matches the mathematical formula:
+    log1p(relu(logits)) -> max over sequence length (after masking)."""
+    torch.manual_seed(0)
+
+    # Prepare [B] sequences of shape [T, H]
+    hs_list = [torch.randn(T, H) for _ in range(B)]
+    hs_tenser = torch.cat(hs_list)
+
+    # Simulate PoolingMetadata (only required fields)
+    prompt_lens = [T, T - 1]
+    prompt_lens_tenser = torch.tensor(prompt_lens, dtype=torch.int32)
+    token_ids = torch.tensor(
+        [
+            [101, 5, 102],  # Batch 0: [CLS], token, [SEP]
+            [101, 6, 6],  # Batch 1: [CLS], token, token (last token ignored)
+        ],
+        dtype=torch.long,
+    )
+    meta = types.SimpleNamespace(
+        prompt_lens=prompt_lens_tenser, prompt_token_ids=token_ids
+    )
+
+    # MLM head (prefer BertMLMHead, fallback to Linear if unavailable)
+    try:
+        mlm_head = BertMLMHead(hidden_size=H, vocab_size=V, layer_norm_eps=1e-12)
+    except Exception:
+        mlm_head = nn.Linear(H, V, bias=True)
+
+    # Forward pass through SPLADE pooler
+    pooler = SPLADESparsePooler(mlm_head=mlm_head, pooling="max", remove_cls_sep=True)
+    pooled = pooler(hidden_states=hs_tenser, pooling_metadata=meta)  # list of [V]
+
+    # Basic output checks
+    assert isinstance(pooled, torch.Tensor) and len(pooled) == B
+    for vec in pooled:
+        assert vec.shape == (V,)
+        assert torch.isfinite(vec).all()
+        assert (vec >= 0).all(), "SPLADE outputs must be non-negative."
+
+    # Reference implementation for comparison
+    def ref_one(hs: torch.Tensor, L: int, tid_row: torch.Tensor) -> torch.Tensor:
+        keep = torch.ones(L, dtype=torch.bool)
+        if L > 0 and tid_row[0].item() == 101:  # remove CLS
+            keep[0] = False
+        if L > 0 and tid_row[L - 1].item() == 102:  # remove SEP
+            keep[L - 1] = False
+
+        valid = hs[:L][keep[:L]]
+        if valid.numel() == 0:
+            return torch.zeros(V, dtype=torch.float32)
+
+        logits = mlm_head(valid)  # [L', V]
+        scores = torch.log1p(torch.relu(logits))  # [L', V]
+        return scores.max(dim=0).values.to(torch.float32)
+
+    torch.testing.assert_close(
+        pooled[0],
+        ref_one(hs_list[0], prompt_lens[0], token_ids[0]),
+        rtol=1e-4,
+        atol=1e-4,
+    )
+    torch.testing.assert_close(
+        pooled[1],
+        ref_one(hs_list[1], prompt_lens[1], token_ids[1]),
+        rtol=1e-4,
+        atol=1e-4,
+    )
diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py
index fd5e48a8b14491b641655966a206b08175d6e728..2dfc0072126bc2558086ec09cf6947717b634809 100644
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -11,7 +11,7 @@ from tests.models.utils import softmax
 # The float32 is required for this tiny model to pass the test.
 @pytest.mark.parametrize("dtype", ["float"])
 @torch.inference_mode
-def test_models(
+def test_bert_models(
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -19,11 +19,11 @@ def test_models(
     dtype: str,
 ) -> None:
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)
 
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForTokenClassification) as hf_model:
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+    ) as hf_model:
         tokenizer = hf_model.tokenizer
         hf_outputs = []
         for prompt in example_prompts:
@@ -37,3 +37,34 @@ def test_models(
         hf_output = torch.tensor(hf_output).cpu().float()
         vllm_output = torch.tensor(vllm_output).cpu().float()
         assert torch.allclose(hf_output, vllm_output, 1e-2)
+
+
+@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
+@pytest.mark.parametrize("dtype", ["float"])
+@torch.inference_mode
+def test_modernbert_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.token_classify(example_prompts)
+
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+    ) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            hf_outputs.append(softmax(output.logits[0]))
+
+    # check logits difference
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output).cpu().float()
+        vllm_output = torch.tensor(vllm_output).cpu().float()
+        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py
index c6ef899958a07b4722f408a311faa2029b8b35a2..f1870ddbee5100489cdb46404511ad2726643c89 100644
--- a/tests/models/language/pooling/test_truncation_control.py
+++ b/tests/models/language/pooling/test_truncation_control.py
@@ -20,51 +20,57 @@ calculus, each contributing unique perspectives that would shape this new
 field."""
 
 
-def test_smaller_truncation_size(vllm_runner,
-                                 model_name=MODEL_NAME,
-                                 input_str=input_str):
-
+def test_smaller_truncation_size(
+    vllm_runner, model_name=MODEL_NAME, input_str=input_str
+):
     truncate_prompt_tokens = 10
 
-    with vllm_runner(model_name, runner="pooling",
-                     max_model_len=max_model_len) as vllm_model:
+    with vllm_runner(
+        model_name, runner="pooling", max_model_len=max_model_len
+    ) as vllm_model:
         vllm_output = vllm_model.llm.embed(
-            input_str, truncate_prompt_tokens=truncate_prompt_tokens)
+            input_str, truncate_prompt_tokens=truncate_prompt_tokens
+        )
 
     prompt_tokens = vllm_output[0].prompt_token_ids
 
     assert len(prompt_tokens) == truncate_prompt_tokens
 
 
-def test_max_truncation_size(vllm_runner,
-                             model_name=MODEL_NAME,
-                             input_str=input_str):
+def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
     truncate_prompt_tokens = -1
 
-    with vllm_runner(model_name, runner="pooling",
-                     max_model_len=max_model_len) as vllm_model:
+    with vllm_runner(
+        model_name, runner="pooling", max_model_len=max_model_len
+    ) as vllm_model:
         vllm_output = vllm_model.llm.embed(
-            input_str, truncate_prompt_tokens=truncate_prompt_tokens)
+            input_str, truncate_prompt_tokens=truncate_prompt_tokens
+        )
 
     prompt_tokens = vllm_output[0].prompt_token_ids
 
     assert len(prompt_tokens) == max_model_len
 
 
-def test_bigger_truncation_size(vllm_runner,
-                                model_name=MODEL_NAME,
-                                input_str=input_str):
-
+def test_bigger_truncation_size(
+    vllm_runner, model_name=MODEL_NAME, input_str=input_str
+):
     truncate_prompt_tokens = max_model_len + 1
 
-    with pytest.raises(ValueError), vllm_runner(
-            model_name, runner="pooling",
-            max_model_len=max_model_len) as vllm_model:
-
+    with (
+        pytest.raises(ValueError),
+        vllm_runner(
+            model_name, runner="pooling", max_model_len=max_model_len
+        ) as vllm_model,
+    ):
         llm_output = vllm_model.llm.embed(
-            input_str, truncate_prompt_tokens=truncate_prompt_tokens)
+            input_str, truncate_prompt_tokens=truncate_prompt_tokens
+        )
 
-        assert llm_output == f"""truncate_prompt_tokens value 
+        assert (
+            llm_output
+            == f"""truncate_prompt_tokens value 
                 ({truncate_prompt_tokens}) is greater than 
                 max_model_len ({max_model_len}). Please, select 
                 a smaller truncation size."""
+        )
diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py
index 7b3c02fbbd9f81272568b05956b680b237d2be57..0384ff82790f076e046cf7ac2066dd43d9ba4ddf 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -3,17 +3,19 @@
 
 import tempfile
 from collections.abc import Sequence
-from typing import Optional
 
 import mteb
 import numpy as np
-import pytest
 import requests
 import torch
 
 import tests.ci_envs as ci_envs
-from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
-                                check_embeddings_close)
+from tests.models.utils import (
+    EmbedModelInfo,
+    RerankModelInfo,
+    check_embeddings_close,
+    get_vllm_extra_kwargs,
+)
 
 # Most embedding models on the STS12 task (See #17175):
 # - Model implementation and minor changes in tensor dtype
@@ -30,7 +32,6 @@ MTEB_RERANK_TOL = 2e-3
 
 
 class VllmMtebEncoder(mteb.Encoder):
-
     def __init__(self, vllm_model):
         super().__init__()
         self.llm = vllm_model
@@ -53,8 +54,7 @@ class VllmMtebEncoder(mteb.Encoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str,
-                              Optional[str]]],  # query, corpus, prompt
+        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
         *args,
         **kwargs,
     ) -> np.ndarray:
@@ -64,17 +64,15 @@ class VllmMtebEncoder(mteb.Encoder):
         queries = [s[0] for s in sentences]
         corpus = [s[1] for s in sentences]
 
-        outputs = self.llm.score(queries,
-                                 corpus,
-                                 truncate_prompt_tokens=-1,
-                                 use_tqdm=False)
+        outputs = self.llm.score(
+            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
+        )
         scores = np.array(outputs)
         scores = scores[np.argsort(r)]
         return scores
 
 
 class OpenAIClientMtebEncoder(mteb.Encoder):
-
     def __init__(self, model_name: str, client):
         super().__init__()
         self.model_name = model_name
@@ -87,8 +85,9 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
 
-        embeddings = self.client.embeddings.create(model=self.model_name,
-                                                   input=sentences)
+        embeddings = self.client.embeddings.create(
+            model=self.model_name, input=sentences
+        )
         outputs = [d.embedding for d in embeddings.data]
         embeds = np.array(outputs)
         embeds = embeds[np.argsort(r)]
@@ -96,7 +95,6 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
 
 
 class ScoreClientMtebEncoder(mteb.Encoder):
-
     def __init__(self, model_name: str, url):
         super().__init__()
         self.model_name = model_name
@@ -105,8 +103,7 @@ class ScoreClientMtebEncoder(mteb.Encoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str,
-                              Optional[str]]],  # query, corpus, prompt
+        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
         *args,
         **kwargs,
     ) -> np.ndarray:
@@ -122,27 +119,30 @@ class ScoreClientMtebEncoder(mteb.Encoder):
         return scores
 
     def get_score(self, query, corpus):
-        response = requests.post(self.url,
-                                 json={
-                                     "model": self.model_name,
-                                     "text_1": query,
-                                     "text_2": corpus,
-                                     "truncate_prompt_tokens": -1,
-                                 }).json()
-        return response['data'][0]["score"]
+        response = requests.post(
+            self.url,
+            json={
+                "model": self.model_name,
+                "text_1": query,
+                "text_2": corpus,
+                "truncate_prompt_tokens": -1,
+            },
+        ).json()
+        return response["data"][0]["score"]
 
 
 class RerankClientMtebEncoder(ScoreClientMtebEncoder):
-
     def get_score(self, query, corpus):
-        response = requests.post(self.url,
-                                 json={
-                                     "model": self.model_name,
-                                     "query": query,
-                                     "documents": [corpus],
-                                     "truncate_prompt_tokens": -1,
-                                 }).json()
-        return response['results'][0]["relevance_score"]
+        response = requests.post(
+            self.url,
+            json={
+                "model": self.model_name,
+                "query": query,
+                "documents": [corpus],
+                "truncate_prompt_tokens": -1,
+            },
+        ).json()
+        return response["results"][0]["relevance_score"]
 
 
 def run_mteb_embed_task(encoder, tasks):
@@ -161,41 +161,25 @@ def run_mteb_embed_task(encoder, tasks):
     return main_score
 
 
-def mteb_test_embed_models(hf_runner,
-                           vllm_runner,
-                           model_info: EmbedModelInfo,
-                           vllm_extra_kwargs=None,
-                           hf_model_callback=None,
-                           atol=MTEB_EMBED_TOL):
-    # A model family has many models with the same architecture,
-    # and we don't need to test each one.
-    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
-        pytest.skip("Skipping test.")
+def mteb_test_embed_models(
+    hf_runner,
+    vllm_runner,
+    model_info: EmbedModelInfo,
+    vllm_extra_kwargs=None,
+    hf_model_callback=None,
+    atol=MTEB_EMBED_TOL,
+):
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
 
     # Test embed_dims, isnan and whether to use normalize
     example_prompts = ["The chef prepared a delicious meal." * 1000]
 
-    # Allow vllm to test using the given dtype, such as float32
-    vllm_extra_kwargs = vllm_extra_kwargs or {}
-    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
-
-    # Allow vllm to test using hf_overrides
-    if model_info.hf_overrides is not None:
-        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
-
-    # Allow changing the head dtype used by vllm in tests
-    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
-        if "hf_overrides" not in vllm_extra_kwargs:
-            vllm_extra_kwargs["hf_overrides"] = {}
-        vllm_extra_kwargs["hf_overrides"][
-            "head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
-
-    with vllm_runner(model_info.name,
-                     runner="pooling",
-                     max_model_len=None,
-                     enforce_eager=True,
-                     **vllm_extra_kwargs) as vllm_model:
-
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=model_info.max_model_len,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
 
         # Confirm whether vllm is using the correct architecture
@@ -204,28 +188,32 @@ def mteb_test_embed_models(hf_runner,
 
         # Confirm whether vllm uses the correct default_pooling_type, which
         # relates to whether chunked prefill and prefix caching are enabled
-        assert (model_config._model_info.default_pooling_type ==
-                model_info.default_pooling_type)
+        assert (
+            model_config._model_info.default_pooling_type
+            == model_info.default_pooling_type
+        )
 
-        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
-                                              MTEB_EMBED_TASKS)
+        vllm_main_score = run_mteb_embed_task(
+            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
+        )
         vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
         head_dtype = model_config.head_dtype
 
-        # Test embed_dims, isnan and whether to use normalize
-        vllm_outputs = vllm_model.embed(example_prompts,
-                                        truncate_prompt_tokens=-1)
-        assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
+        # Test embedding_size, isnan and whether to use normalize
+        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
+        outputs_tensor = torch.tensor(vllm_outputs)
+        assert not torch.any(torch.isnan(outputs_tensor))
+        embedding_size = model_config.embedding_size
+        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
 
     # Accelerate mteb test by setting
     # SentenceTransformers mteb score to a constant
     if model_info.mteb_score is None:
         with hf_runner(
-                model_info.name,
-                is_sentence_transformer=True,
-                dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
+            model_info.name,
+            is_sentence_transformer=True,
+            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
         ) as hf_model:
-
             # e.g. setting default parameters for the encode method of hf_runner
             if hf_model_callback is not None:
                 hf_model_callback(hf_model)
@@ -233,7 +221,7 @@ def mteb_test_embed_models(hf_runner,
             st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
             st_dtype = next(hf_model.model.parameters()).dtype
 
-            # Test embed_dims and whether to use normalize
+            # Check embeddings close to hf outputs
             hf_outputs = hf_model.encode(example_prompts)
             check_embeddings_close(
                 embeddings_0_lst=hf_outputs,
@@ -247,8 +235,7 @@ def mteb_test_embed_models(hf_runner,
         st_dtype = "Constant"
 
     print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
-          vllm_main_score)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
@@ -282,26 +269,21 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
             top_k=10,
             save_predictions=True,
             output_folder=f"{results_folder}/stage2",
-            previous_results=
-            f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
+            previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
             encode_kwargs={"show_progress_bar": False},
         )
         main_score = results[0].scores["test"][0]["main_score"]
     return main_score
 
 
-def mteb_test_rerank_models_hf(hf_runner,
-                               model_name,
-                               hf_dtype="float32",
-                               hf_model_callback=None):
-    with hf_runner(model_name, is_cross_encoder=True,
-                   dtype=hf_dtype) as hf_model:
-
+def mteb_test_rerank_models_hf(
+    hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
+):
+    with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
         original_predict = hf_model.predict
 
         def _predict(
-            sentences: list[tuple[str, str,
-                                  Optional[str]]],  # query, corpus, prompt
+            sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
             *args,
             **kwargs,
         ):
@@ -315,64 +297,52 @@ def mteb_test_rerank_models_hf(hf_runner,
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
 
-        st_main_score = run_mteb_rerank(hf_model,
-                                        tasks=MTEB_RERANK_TASKS,
-                                        languages=MTEB_RERANK_LANGS)
+        st_main_score = run_mteb_rerank(
+            hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
+        )
         st_dtype = next(hf_model.model.model.parameters()).dtype
     return st_main_score, st_dtype
 
 
-def mteb_test_rerank_models(hf_runner,
-                            vllm_runner,
-                            model_info: RerankModelInfo,
-                            vllm_extra_kwargs=None,
-                            hf_model_callback=None,
-                            vllm_mteb_encoder=VllmMtebEncoder,
-                            atol=MTEB_RERANK_TOL):
-    # A model family has many models with the same architecture,
-    # and we don't need to test each one.
-    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # Allow vllm to test using the given dtype, such as float32
-    vllm_extra_kwargs = vllm_extra_kwargs or {}
-    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
-
-    # Allow vllm to test using hf_overrides
-    if model_info.hf_overrides is not None:
-        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
-
-    # Allow changing the head dtype used by vllm in tests
-    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
-        if "hf_overrides" not in vllm_extra_kwargs:
-            vllm_extra_kwargs["hf_overrides"] = {}
-        vllm_extra_kwargs["hf_overrides"][
-            "head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
-
-    with vllm_runner(model_info.name,
-                     runner="pooling",
-                     max_model_len=None,
-                     max_num_seqs=8,
-                     enforce_eager=True,
-                     **vllm_extra_kwargs) as vllm_model:
-
+def mteb_test_rerank_models(
+    hf_runner,
+    vllm_runner,
+    model_info: RerankModelInfo,
+    vllm_extra_kwargs=None,
+    hf_model_callback=None,
+    vllm_mteb_encoder=VllmMtebEncoder,
+    atol=MTEB_RERANK_TOL,
+):
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
+
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=None,
+        max_num_seqs=8,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
 
         # Confirm whether vllm is using the correct architecture
         if model_info.architecture:
-            assert (model_info.architecture in model_config.architectures)
+            assert model_info.architecture in model_config.architectures
 
         # Score API is only enabled for num_labels == 1
         assert model_config.hf_config.num_labels == 1
 
         # Confirm whether vllm uses the correct default_pooling_type, which
         # relates to whether chunked prefill and prefix caching are enabled
-        assert (model_config._model_info.default_pooling_type ==
-                model_info.default_pooling_type)
+        assert (
+            model_config._model_info.default_pooling_type
+            == model_info.default_pooling_type
+        )
 
-        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
-                                          tasks=MTEB_RERANK_TASKS,
-                                          languages=MTEB_RERANK_LANGS)
+        vllm_main_score = run_mteb_rerank(
+            vllm_mteb_encoder(vllm_model),
+            tasks=MTEB_RERANK_TASKS,
+            languages=MTEB_RERANK_LANGS,
+        )
         vllm_dtype = model_config.dtype
         head_dtype = model_config.head_dtype
 
@@ -380,14 +350,14 @@ def mteb_test_rerank_models(hf_runner,
     # SentenceTransformers mteb score to a constant
     if model_info.mteb_score is None:
         st_main_score, st_dtype = mteb_test_rerank_models_hf(
-            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
+            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
+        )
     else:
         st_main_score = model_info.mteb_score
         st_dtype = "Constant"
 
     print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
-          vllm_main_score)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py
index e131c9b1038ded91d3a13a7b71b9220343b16378..bad13e2457146d427501e7e4bfeb39511253a14b 100644
--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@@ -2,67 +2,76 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from tests.models.language.pooling.embed_utils import (
-    correctness_test_embed_models)
-from tests.models.utils import (CLSPoolingEmbedModelInfo,
-                                CLSPoolingRerankModelInfo, EmbedModelInfo,
-                                LASTPoolingEmbedModelInfo, RerankModelInfo)
+from tests.models.language.pooling.embed_utils import correctness_test_embed_models
+from tests.models.utils import (
+    CLSPoolingEmbedModelInfo,
+    CLSPoolingRerankModelInfo,
+    EmbedModelInfo,
+    LASTPoolingEmbedModelInfo,
+    RerankModelInfo,
+)
 
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
-                             architecture="BertModel",
-                             mteb_score=0.779336792,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-small-en",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-small-zh",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-large-en",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-large-zh",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5",
-                             architecture="BertModel",
-                             enable_test=False),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-base-en",
+        architecture="BertModel",
+        mteb_score=0.779336792,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-base-zh", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-small-en", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-small-zh", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-large-en", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-large-zh", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
+    ),
     ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo("BAAI/bge-m3",
-                             architecture="XLMRobertaModel",
-                             mteb_score=0.787343078,
-                             enable_test=True),
+    CLSPoolingEmbedModelInfo(
+        "BAAI/bge-m3",
+        architecture="XLMRobertaModel",
+        mteb_score=0.787343078,
+        enable_test=True,
+    ),
     ########## Qwen2Model
-    LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
-                              architecture="Qwen2Model",
-                              mteb_score=0.75724465,
-                              dtype="float32",
-                              enable_test=True),
+    LASTPoolingEmbedModelInfo(
+        "BAAI/bge-code-v1",
+        architecture="Qwen2Model",
+        mteb_score=0.75724465,
+        dtype="float32",
+        enable_test=True,
+    ),
 ]
 
 RERANK_MODELS = [
@@ -71,33 +80,35 @@ RERANK_MODELS = [
         "BAAI/bge-reranker-base",
         architecture="XLMRobertaForSequenceClassification",
         mteb_score=0.32398,
-        enable_test=True),
+        enable_test=True,
+    ),
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-large",
         architecture="XLMRobertaForSequenceClassification",
-        enable_test=False),
+        enable_test=False,
+    ),
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-v2-m3",
         architecture="XLMRobertaForSequenceClassification",
-        enable_test=False)
+        enable_test=False,
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
-    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
-                                  example_prompts)
+def test_embed_models_correctness(
+    hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
+) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(hf_runner, vllm_runner,
-                            model_info: RerankModelInfo) -> None:
+def test_rerank_models_mteb(
+    hf_runner, vllm_runner, model_info: RerankModelInfo
+) -> None:
     mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index 1eca2a2c0abd97a685fa70ac70e53855649cfee1..2927a37111364192a7274b1712a02dc81c551a86 100644
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 import numpy as np
 import pytest
@@ -8,53 +8,50 @@ import torch
 
 from tests.conftest import HfRunner
 from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebEncoder, mteb_test_rerank_models)
+    VllmMtebEncoder,
+    mteb_test_rerank_models,
+)
 from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
-                               architecture="GemmaForSequenceClassification",
-                               mteb_score=0.33757,
-                               hf_overrides={
-                                   "architectures":
-                                   ["GemmaForSequenceClassification"],
-                                   "classifier_from_token": ["Yes"],
-                                   "method":
-                                   "no_post_processing",
-                               }),
+    LASTPoolingRerankModelInfo(
+        "BAAI/bge-reranker-v2-gemma",
+        architecture="GemmaForSequenceClassification",
+        mteb_score=0.33757,
+        hf_overrides={
+            "architectures": ["GemmaForSequenceClassification"],
+            "classifier_from_token": ["Yes"],
+            "method": "no_post_processing",
+        },
+    ),
 ]
 
 PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."  # noqa: E501
 
 
 class GemmaRerankerHfRunner(HfRunner):
-
-    def __init__(self,
-                 model_name: str,
-                 dtype: str = "auto",
-                 *args: Any,
-                 **kwargs: Any) -> None:
+    def __init__(
+        self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
+    ) -> None:
         from transformers import AutoModelForCausalLM, AutoTokenizer
+
         super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                       padding_side='left')
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
 
     @torch.no_grad()
-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
         def get_inputs(pairs, tokenizer, prompt=None):
             if prompt is None:
                 prompt = PROMPT
 
             sep = "\n"
-            prompt_inputs = tokenizer(prompt,
-                                      return_tensors=None,
-                                      add_special_tokens=False)["input_ids"]
-            sep_inputs = tokenizer(sep,
-                                   return_tensors=None,
-                                   add_special_tokens=False)["input_ids"]
+            prompt_inputs = tokenizer(
+                prompt, return_tensors=None, add_special_tokens=False
+            )["input_ids"]
+            sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[
+                "input_ids"
+            ]
             inputs = []
             for query, passage in pairs:
                 query_inputs = tokenizer(
@@ -78,8 +75,7 @@ class GemmaRerankerHfRunner(HfRunner):
                     return_token_type_ids=False,
                     add_special_tokens=False,
                 )
-                item["input_ids"] = item[
-                    "input_ids"] + sep_inputs + prompt_inputs
+                item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs
                 item["attention_mask"] = [1] * len(item["input_ids"])
                 inputs.append(item)
             return tokenizer.pad(
@@ -95,14 +91,19 @@ class GemmaRerankerHfRunner(HfRunner):
             inputs = inputs.to(self.model.device)
             _n_tokens = inputs["input_ids"].shape[1]
             logits = self.model(**inputs, return_dict=True).logits
-            _scores = (logits[:, -1,
-                              self.yes_loc].view(-1, ).float().sigmoid())
+            _scores = (
+                logits[:, -1, self.yes_loc]
+                .view(
+                    -1,
+                )
+                .float()
+                .sigmoid()
+            )
             scores.append(_scores[0].item())
         return torch.Tensor(scores)
 
 
 class GemmaMtebEncoder(VllmMtebEncoder):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.query_template = "A: {query}\n"
@@ -110,12 +111,10 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str,
-                              Optional[str]]],  # query, corpus, prompt
+        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
         *args,
         **kwargs,
     ) -> np.ndarray:
-
         _sentences = []
         for query, corpus, prompt in sentences:
             query = self.query_template.format(query=query)
@@ -127,8 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
-
-    mteb_test_rerank_models(GemmaRerankerHfRunner,
-                            vllm_runner,
-                            model_info,
-                            vllm_mteb_encoder=GemmaMtebEncoder)
+    mteb_test_rerank_models(
+        GemmaRerankerHfRunner,
+        vllm_runner,
+        model_info,
+        vllm_mteb_encoder=GemmaMtebEncoder,
+    )
diff --git a/tests/models/language/pooling_mteb_test/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
index ad320fae0c85a5a7e4b824d0843db4d7054f5179..638ffc7a62b0ea5b6352d014cbb50ec88dcb5e72 100644
--- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py
+++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
@@ -2,22 +2,30 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from tests.models.utils import (CLSPoolingRerankModelInfo,
-                                LASTPoolingRerankModelInfo, RerankModelInfo)
+from tests.models.utils import (
+    CLSPoolingRerankModelInfo,
+    LASTPoolingRerankModelInfo,
+    RerankModelInfo,
+)
 
 from .mteb_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
-                              mteb_score=0.32898,
-                              architecture="BertForSequenceClassification"),
-    LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
-                               mteb_score=0.25736,
-                               architecture="Qwen3ForSequenceClassification")
+    CLSPoolingRerankModelInfo(
+        "cross-encoder/ms-marco-TinyBERT-L-2-v2",
+        mteb_score=0.32898,
+        architecture="BertForSequenceClassification",
+    ),
+    LASTPoolingRerankModelInfo(
+        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+        mteb_score=0.25736,
+        architecture="Qwen3ForSequenceClassification",
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(hf_runner, vllm_runner,
-                            model_info: RerankModelInfo) -> None:
+def test_rerank_models_mteb(
+    hf_runner, vllm_runner, model_info: RerankModelInfo
+) -> None:
     mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py
index 9ae43fd05bf78c47a8a66bef1c942fe821325199..a22821fd65b5a07aa5e585dc70ef380843e12fce 100644
--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@@ -3,74 +3,93 @@
 
 import pytest
 
-from tests.models.language.pooling.embed_utils import (
-    correctness_test_embed_models)
-from tests.models.utils import (CLSPoolingEmbedModelInfo,
-                                CLSPoolingRerankModelInfo, EmbedModelInfo,
-                                LASTPoolingEmbedModelInfo, RerankModelInfo)
+from tests.models.language.pooling.embed_utils import correctness_test_embed_models
+from tests.models.utils import (
+    CLSPoolingEmbedModelInfo,
+    CLSPoolingRerankModelInfo,
+    EmbedModelInfo,
+    LASTPoolingEmbedModelInfo,
+    RerankModelInfo,
+)
 
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo("thenlper/gte-large",
-                             mteb_score=0.76807651,
-                             architecture="BertModel",
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("thenlper/gte-base",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("thenlper/gte-small",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("thenlper/gte-large-zh",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("thenlper/gte-base-zh",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("thenlper/gte-small-zh",
-                             architecture="BertModel",
-                             enable_test=False),
+    CLSPoolingEmbedModelInfo(
+        "thenlper/gte-large",
+        mteb_score=0.76807651,
+        architecture="BertModel",
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "thenlper/gte-base", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "thenlper/gte-small", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "thenlper/gte-base-zh", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
+    ),
     ########### NewModel
     # These three architectures are almost the same, but not exactly the same.
     # For example,
     # - whether to use token_type_embeddings
     # - whether to use context expansion
     # So only test one (the most widely used) model
-    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
-                             architecture="GteNewModel",
-                             mteb_score=0.775074696,
-                             hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
-                             architecture="GteNewModel",
-                             hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
-                             architecture="GteNewModel",
-                             hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=False),
+    CLSPoolingEmbedModelInfo(
+        "Alibaba-NLP/gte-multilingual-base",
+        architecture="GteNewModel",
+        mteb_score=0.775074696,
+        hf_overrides={"architectures": ["GteNewModel"]},
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Alibaba-NLP/gte-base-en-v1.5",
+        architecture="GteNewModel",
+        hf_overrides={"architectures": ["GteNewModel"]},
+        enable_test=False,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Alibaba-NLP/gte-large-en-v1.5",
+        architecture="GteNewModel",
+        hf_overrides={"architectures": ["GteNewModel"]},
+        enable_test=False,
+    ),
     ########### Qwen2ForCausalLM
-    LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-                              mteb_score=0.758473459018872,
-                              architecture="Qwen2ForCausalLM",
-                              enable_test=True),
+    LASTPoolingEmbedModelInfo(
+        "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        mteb_score=0.758473459018872,
+        architecture="Qwen2ForCausalLM",
+        enable_test=True,
+    ),
     ########## ModernBertModel
-    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
-                             mteb_score=0.748193353,
-                             architecture="ModernBertModel",
-                             enable_test=True),
+    CLSPoolingEmbedModelInfo(
+        "Alibaba-NLP/gte-modernbert-base",
+        mteb_score=0.748193353,
+        architecture="ModernBertModel",
+        enable_test=True,
+    ),
     ########## Qwen3ForCausalLM
-    LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
-                              mteb_score=0.771163695,
-                              architecture="Qwen3ForCausalLM",
-                              dtype="float32",
-                              enable_test=True),
-    LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B",
-                              architecture="Qwen3ForCausalLM",
-                              dtype="float32",
-                              enable_test=False),
+    LASTPoolingEmbedModelInfo(
+        "Qwen/Qwen3-Embedding-0.6B",
+        mteb_score=0.771163695,
+        architecture="Qwen3ForCausalLM",
+        dtype="float32",
+        enable_test=True,
+    ),
+    LASTPoolingEmbedModelInfo(
+        "Qwen/Qwen3-Embedding-4B",
+        architecture="Qwen3ForCausalLM",
+        dtype="float32",
+        enable_test=False,
+    ),
 ]
 
 RERANK_MODELS = [
@@ -79,31 +98,32 @@ RERANK_MODELS = [
         "Alibaba-NLP/gte-reranker-modernbert-base",
         mteb_score=0.33386,
         architecture="ModernBertForSequenceClassification",
-        enable_test=True),
+        enable_test=True,
+    ),
     CLSPoolingRerankModelInfo(
         "Alibaba-NLP/gte-multilingual-reranker-base",
         mteb_score=0.33062,
         architecture="GteNewForSequenceClassification",
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
-        enable_test=True),
+        enable_test=True,
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
-    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
-                                  example_prompts)
+def test_embed_models_correctness(
+    hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
+) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(hf_runner, vllm_runner,
-                            model_info: RerankModelInfo) -> None:
+def test_rerank_models_mteb(
+    hf_runner, vllm_runner, model_info: RerankModelInfo
+) -> None:
     mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/language/pooling_mteb_test/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py
index 0d6026898ad4addf490404ebf7c93abff48d6492..1d078db69236ac5972c3460284a8cea645486b24 100644
--- a/tests/models/language/pooling_mteb_test/test_intfloat.py
+++ b/tests/models/language/pooling_mteb_test/test_intfloat.py
@@ -2,50 +2,55 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from tests.models.language.pooling.embed_utils import (
-    correctness_test_embed_models)
+from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
 
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo("intfloat/e5-small",
-                             architecture="BertModel",
-                             mteb_score=0.742285423,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("intfloat/e5-base",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("intfloat/e5-large",
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small",
-                             architecture="BertModel",
-                             enable_test=False),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/e5-small",
+        architecture="BertModel",
+        mteb_score=0.742285423,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/e5-base", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/e5-large", architecture="BertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
+    ),
     ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
-                             architecture="XLMRobertaModel",
-                             mteb_score=0.779325955,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
-                             architecture="XLMRobertaModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
-                             architecture="XLMRobertaModel",
-                             enable_test=False),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/multilingual-e5-base",
+        architecture="XLMRobertaModel",
+        mteb_score=0.779325955,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/multilingual-e5-large",
+        architecture="XLMRobertaModel",
+        enable_test=False,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "intfloat/multilingual-e5-large-instruct",
+        architecture="XLMRobertaModel",
+        enable_test=False,
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
-    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
-                                  example_prompts)
+def test_embed_models_correctness(
+    hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
+) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py
index 0a77a78bb31b672d3f5593d8aa9a96716093be59..c2065bcd6eb4c2b6d1bc0f057a3c7179d86d7dee 100644
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -5,60 +5,69 @@ from functools import partial
 import pytest
 
 from tests.models.language.pooling.embed_utils import (
-    check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
-from tests.models.utils import (CLSPoolingEmbedModelInfo,
-                                CLSPoolingRerankModelInfo, EmbedModelInfo,
-                                RerankModelInfo)
+    check_embeddings_close,
+    correctness_test_embed_models,
+    matryoshka_fy,
+)
+from tests.models.utils import (
+    CLSPoolingEmbedModelInfo,
+    CLSPoolingRerankModelInfo,
+    EmbedModelInfo,
+    RerankModelInfo,
+)
 from vllm import PoolingParams
 
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 EMBEDDING_MODELS = [
-    CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
-                             mteb_score=0.824413164,
-                             architecture="XLMRobertaModel",
-                             is_matryoshka=True)
+    CLSPoolingEmbedModelInfo(
+        "jinaai/jina-embeddings-v3",
+        mteb_score=0.824413164,
+        architecture="XLMRobertaModel",
+        is_matryoshka=True,
+        dtype="float32",
+    )
 ]
 
 RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         "jinaai/jina-reranker-v2-base-multilingual",
         mteb_score=0.33643,
-        architecture="XLMRobertaForSequenceClassification")
+        architecture="XLMRobertaForSequenceClassification",
+    )
 ]
 
 
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
-
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     def hf_model_callback(model):
         model.encode = partial(model.encode, task="text-matching")
 
-    mteb_test_embed_models(hf_runner,
-                           vllm_runner,
-                           model_info,
-                           hf_model_callback=hf_model_callback)
+    mteb_test_embed_models(
+        hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback
+    )
 
 
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
-def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
-
+def test_embed_models_correctness(
+    hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
+) -> None:
     def hf_model_callback(model):
         model.encode = partial(model.encode, task="text-matching")
 
-    correctness_test_embed_models(hf_runner,
-                                  vllm_runner,
-                                  model_info,
-                                  example_prompts,
-                                  hf_model_callback=hf_model_callback)
+    correctness_test_embed_models(
+        hf_runner,
+        vllm_runner,
+        model_info,
+        example_prompts,
+        hf_model_callback=hf_model_callback,
+    )
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(hf_runner, vllm_runner,
-                            model_info: RerankModelInfo) -> None:
+def test_rerank_models_mteb(
+    hf_runner, vllm_runner, model_info: RerankModelInfo
+) -> None:
     mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
 
 
@@ -81,32 +90,32 @@ def test_matryoshka(
     example_prompts = [str(s).strip() for s in example_prompts]
 
     with hf_runner(
-            model_info.name,
-            dtype=dtype,
-            is_sentence_transformer=True,
+        model_info.name,
+        dtype=dtype,
+        is_sentence_transformer=True,
     ) as hf_model:
         hf_outputs = hf_model.encode(example_prompts, task="text-matching")
         hf_outputs = matryoshka_fy(hf_outputs, dimensions)
 
-    with vllm_runner(model_info.name,
-                     runner="pooling",
-                     dtype=dtype,
-                     max_model_len=None) as vllm_model:
+    with vllm_runner(
+        model_info.name, runner="pooling", dtype=dtype, max_model_len=None
+    ) as vllm_model:
         assert vllm_model.llm.llm_engine.model_config.is_matryoshka
 
         matryoshka_dimensions = (
-            vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
+            vllm_model.llm.llm_engine.model_config.matryoshka_dimensions
+        )
         assert matryoshka_dimensions is not None
 
         if dimensions not in matryoshka_dimensions:
             with pytest.raises(ValueError):
                 vllm_model.embed(
-                    example_prompts,
-                    pooling_params=PoolingParams(dimensions=dimensions))
+                    example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
+                )
         else:
             vllm_outputs = vllm_model.embed(
-                example_prompts,
-                pooling_params=PoolingParams(dimensions=dimensions))
+                example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
+            )
 
             check_embeddings_close(
                 embeddings_0_lst=hf_outputs,
diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
index 05ebb4ec4d3f50efdf652332fdd03a9520a5f4b7..fd04dc1990238ae86ee354c3772d2de46447a647 100644
--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@@ -17,46 +17,45 @@ mxbai_rerank_hf_overrides = {
 }
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
-                               architecture="Qwen2ForSequenceClassification",
-                               hf_overrides=mxbai_rerank_hf_overrides,
-                               mteb_score=0.273,
-                               enable_test=True),
-    LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
-                               architecture="Qwen2ForSequenceClassification",
-                               hf_overrides=mxbai_rerank_hf_overrides,
-                               enable_test=False)
+    LASTPoolingRerankModelInfo(
+        "mixedbread-ai/mxbai-rerank-base-v2",
+        architecture="Qwen2ForSequenceClassification",
+        hf_overrides=mxbai_rerank_hf_overrides,
+        mteb_score=0.273,
+        enable_test=True,
+    ),
+    LASTPoolingRerankModelInfo(
+        "mixedbread-ai/mxbai-rerank-large-v2",
+        architecture="Qwen2ForSequenceClassification",
+        hf_overrides=mxbai_rerank_hf_overrides,
+        enable_test=False,
+    ),
 ]
 
 
 class MxbaiRerankerHfRunner(HfRunner):
-
-    def __init__(self,
-                 model_name: str,
-                 dtype: str = "auto",
-                 *args: Any,
-                 **kwargs: Any) -> None:
+    def __init__(
+        self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
+    ) -> None:
         from transformers import AutoModelForCausalLM, AutoTokenizer
+
         super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                       padding_side='left')
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
         self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
 
-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
         def process_inputs(pairs):
-            inputs = self.tokenizer(pairs,
-                                    padding=False,
-                                    truncation='longest_first',
-                                    return_attention_mask=False)
-            for i, ele in enumerate(inputs['input_ids']):
-                inputs['input_ids'][i] = ele
-            inputs = self.tokenizer.pad(inputs,
-                                        padding=True,
-                                        return_tensors="pt")
+            inputs = self.tokenizer(
+                pairs,
+                padding=False,
+                truncation="longest_first",
+                return_attention_mask=False,
+            )
+            for i, ele in enumerate(inputs["input_ids"]):
+                inputs["input_ids"][i] = ele
+            inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
             for key in inputs:
                 inputs[key] = inputs[key].to(self.model.device)
             return inputs
diff --git a/tests/models/language/pooling_mteb_test/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py
index 61512fd0dff18f4b035eb2f5f6c76534daf1ffb2..c54a43052483a6683df786a6405b7e28edcb01ce 100644
--- a/tests/models/language/pooling_mteb_test/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@@ -3,39 +3,42 @@
 
 import pytest
 
-from tests.models.language.pooling.embed_utils import (
-    correctness_test_embed_models)
+from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
 
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
-    CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
-                             architecture="NomicBertModel",
-                             mteb_score=0.737568559,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
-                             architecture="NomicBertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed",
-                             architecture="NomicBertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
-                             architecture="NomicBertModel",
-                             mteb_score=0.715488912,
-                             enable_test=True)
+    CLSPoolingEmbedModelInfo(
+        "nomic-ai/nomic-embed-text-v1",
+        architecture="NomicBertModel",
+        mteb_score=0.737568559,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "nomic-ai/nomic-embed-text-v1.5",
+        architecture="NomicBertModel",
+        enable_test=False,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
+    ),
+    CLSPoolingEmbedModelInfo(
+        "nomic-ai/nomic-embed-text-v2-moe",
+        architecture="NomicBertModel",
+        mteb_score=0.715488912,
+        enable_test=True,
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
-    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
-                                  example_prompts)
+def test_embed_models_correctness(
+    hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
+) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
index 65403081dc0f81f9e51f6e0862bbda8a63a632c6..00e99f44cfdb1cb624a32619d7d5b7030d3779f7 100644
--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@@ -18,46 +18,45 @@ qwen3_reranker_hf_overrides = {
 }
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
-                               architecture="Qwen3ForSequenceClassification",
-                               mteb_score=0.25736,
-                               hf_overrides=qwen3_reranker_hf_overrides,
-                               enable_test=True),
-    LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
-                               architecture="Qwen3ForSequenceClassification",
-                               hf_overrides=qwen3_reranker_hf_overrides,
-                               enable_test=False)
+    LASTPoolingRerankModelInfo(
+        "Qwen/Qwen3-Reranker-0.6B",
+        architecture="Qwen3ForSequenceClassification",
+        mteb_score=0.25736,
+        hf_overrides=qwen3_reranker_hf_overrides,
+        enable_test=True,
+    ),
+    LASTPoolingRerankModelInfo(
+        "Qwen/Qwen3-Reranker-4B",
+        architecture="Qwen3ForSequenceClassification",
+        hf_overrides=qwen3_reranker_hf_overrides,
+        enable_test=False,
+    ),
 ]
 
 
 class Qwen3RerankerHfRunner(HfRunner):
-
-    def __init__(self,
-                 model_name: str,
-                 dtype: str = "auto",
-                 *args: Any,
-                 **kwargs: Any) -> None:
+    def __init__(
+        self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
+    ) -> None:
         from transformers import AutoModelForCausalLM, AutoTokenizer
+
         super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                       padding_side='left')
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
         self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
 
-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
         def process_inputs(pairs):
-            inputs = self.tokenizer(pairs,
-                                    padding=False,
-                                    truncation='longest_first',
-                                    return_attention_mask=False)
-            for i, ele in enumerate(inputs['input_ids']):
-                inputs['input_ids'][i] = ele
-            inputs = self.tokenizer.pad(inputs,
-                                        padding=True,
-                                        return_tensors="pt")
+            inputs = self.tokenizer(
+                pairs,
+                padding=False,
+                truncation="longest_first",
+                return_attention_mask=False,
+            )
+            for i, ele in enumerate(inputs["input_ids"]):
+                inputs["input_ids"][i] = ele
+            inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
             for key in inputs:
                 inputs[key] = inputs[key].to(self.model.device)
             return inputs
@@ -82,20 +81,18 @@ class Qwen3RerankerHfRunner(HfRunner):
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
-
     mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 @multi_gpu_test(num_gpus=2)
-def test_rerank_models_mteb_tp(vllm_runner,
-                               model_info: RerankModelInfo) -> None:
-
+def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None:
     assert model_info.architecture == "Qwen3ForSequenceClassification"
 
     vllm_extra_kwargs: dict[str, Any] = {
         "tensor_parallel_size": 2,
     }
 
-    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
-                            vllm_extra_kwargs)
+    mteb_test_rerank_models(
+        Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
+    )
diff --git a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
index 91bad2c4e42fc1dc7483a3dbc77b074224f33acc..3c30628aeaa49cc3d4a860e84381a7e5861a1e56 100644
--- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
@@ -3,62 +3,75 @@
 
 import pytest
 
-from tests.models.language.pooling.embed_utils import (
-    correctness_test_embed_models)
+from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
 
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
-                             is_matryoshka=False,
-                             architecture="BertModel",
-                             mteb_score=0.714927797,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
-                             is_matryoshka=False,
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m",
-                             is_matryoshka=False,
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
-                             is_matryoshka=False,
-                             architecture="NomicBertModel",
-                             mteb_score=0.681146831,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
-                             is_matryoshka=False,
-                             architecture="BertModel",
-                             enable_test=False),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
-                             is_matryoshka=True,
-                             architecture="BertModel",
-                             mteb_score=0.649088363,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
-                             is_matryoshka=True,
-                             architecture="XLMRobertaModel",
-                             mteb_score=0.712258299,
-                             enable_test=True),
-    CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
-                             is_matryoshka=True,
-                             architecture="GteModel",
-                             mteb_score=0.706622444,
-                             enable_test=True),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-xs",
+        is_matryoshka=False,
+        architecture="BertModel",
+        mteb_score=0.714927797,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-s",
+        is_matryoshka=False,
+        architecture="BertModel",
+        enable_test=False,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m",
+        is_matryoshka=False,
+        architecture="BertModel",
+        enable_test=False,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m-long",
+        is_matryoshka=False,
+        architecture="NomicBertModel",
+        mteb_score=0.681146831,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-l",
+        is_matryoshka=False,
+        architecture="BertModel",
+        enable_test=False,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m-v1.5",
+        is_matryoshka=True,
+        architecture="BertModel",
+        mteb_score=0.649088363,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-l-v2.0",
+        is_matryoshka=True,
+        architecture="XLMRobertaModel",
+        mteb_score=0.712258299,
+        enable_test=True,
+    ),
+    CLSPoolingEmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m-v2.0",
+        is_matryoshka=True,
+        architecture="GteModel",
+        mteb_score=0.706622444,
+        enable_test=True,
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
-    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
-                                  example_prompts)
+def test_embed_models_correctness(
+    hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
+) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py
index bd493e7e2ba099d904a486487062cf66b1788e71..74fe4b9bcc03ff719a865a4957d61b0fc9e3b77c 100644
--- a/tests/models/language/pooling_mteb_test/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@@ -2,8 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
-                                LASTPoolingEmbedModelInfo)
+from tests.models.utils import (
+    CLSPoolingEmbedModelInfo,
+    EmbedModelInfo,
+    LASTPoolingEmbedModelInfo,
+)
 
 from .mteb_utils import mteb_test_embed_models
 
@@ -15,15 +18,16 @@ ST_PROJECTOR_MODELS = [
         mteb_score=0.688611955,
         enable_test=True,
     ),
-    LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
-                              architecture="Gemma3TextModel",
-                              mteb_score=0.7473819294684156,
-                              enable_test=True)
+    LASTPoolingEmbedModelInfo(
+        "google/embeddinggemma-300m",
+        architecture="Gemma3TextModel",
+        mteb_score=0.7473819294684156,
+        enable_test=True,
+        dtype="float32",
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
-
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index f6fa01436ed630f11e0ad05cf79f1ea71dadbb28..5f42aa7591d92965f37c9a393d5912d23269745d 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -3,47 +3,54 @@
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
+
 import math
-import os
 from collections import defaultdict
 from pathlib import PosixPath
 
 import pytest
-from transformers import (AutoModel, AutoModelForImageTextToText,
-                          AutoModelForTextToWaveform)
+from packaging.version import Version
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    AutoModelForTextToWaveform,
+)
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
-from vllm.utils import identity
+from vllm.utils.func_utils import identity
 
-from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
-                          ImageTestAssets, VideoTestAssets, VllmRunner)
-from ....utils import (create_new_process_for_each_test, large_gpu_mark,
-                       multi_gpu_marks)
+from ....conftest import (
+    IMAGE_ASSETS,
+    AudioTestAssets,
+    HfRunner,
+    ImageTestAssets,
+    VideoTestAssets,
+    VllmRunner,
+)
+from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
 from ...utils import check_outputs_equal
 from .vlm_utils import custom_inputs, model_utils, runners
 from .vlm_utils.case_filtering import get_parametrized_options
-from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
-                              VLMTestInfo, VLMTestType)
-
-# This hack is needed for phi3v & paligemma models
-# ROCm Triton FA can run into shared memory issues with these models,
-# use other backends in the meantime
-# FIXME (mattwong, gshtrasb, hongxiayan)
-if current_platform.is_rocm():
-    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
+from .vlm_utils.types import (
+    CustomTestOptions,
+    ExpandableVLMTestArgs,
+    VLMTestInfo,
+    VLMTestType,
+)
 
-# yapf: disable
 COMMON_BROADCAST_SETTINGS = {
     "test_type": VLMTestType.IMAGE,
     "dtype": "half",
     "max_tokens": 5,
     "tensor_parallel_size": 2,
     "hf_model_kwargs": {"device_map": "auto"},
-    "image_size_factors": [(.25, 0.5, 1.0)],
+    "image_size_factors": [(0.25, 0.5, 1.0)],
     "distributed_executor_backend": (
         "ray",
         "mp",
-    )
+    ),
 }
 
 ### Test configuration for specific models
@@ -83,51 +90,49 @@ VLM_TEST_SETTINGS = {
     #### Core tests to always run in the CI
     "llava": VLMTestInfo(
         models=["llava-hf/llava-1.5-7b-hf"],
-        test_type=(
-            VLMTestType.EMBEDDING,
-            VLMTestType.IMAGE,
-            VLMTestType.CUSTOM_INPUTS
-        ),
+        test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
-                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
-            ),
-            limit_mm_per_prompt={"image": 4},
-        )],
-        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
-        dtype="bfloat16" if current_platform.is_cpu() else "auto",
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+                    formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
+                ),
+                limit_mm_per_prompt={"image": 4},
+            )
+        ],
+        vllm_runner_kwargs={"enable_mm_embeds": True},
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "paligemma": VLMTestInfo(
         models=["google/paligemma-3b-mix-224"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=identity,
-        img_idx_to_prompt = lambda idx: "",
+        img_idx_to_prompt=lambda idx: "",
         # Paligemma uses its own sample prompts because the default one fails
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "caption es",
-            "cherry_blossom": "What is in the picture?",
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "caption es",
+                "cherry_blossom": "What is in the picture?",
+            }
+        ),
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
         dtype="bfloat16",
-        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
+        marks=[
+            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
+        ],
     ),
     "qwen2_5_vl": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
+        enforce_eager=False,
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -137,27 +142,46 @@ VLM_TEST_SETTINGS = {
     ),
     "qwen2_5_omni": VLMTestInfo(
         models=["Qwen/Qwen2.5-Omni-3B"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
+        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
+        max_model_len=4096,
+        max_num_seqs=2,
+        num_logprobs=6 if current_platform.is_cpu() else 5,
+        auto_cls=AutoModelForTextToWaveform,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+    ),
+    "qwen3_vl": VLMTestInfo(
+        models=["Qwen/Qwen3-VL-4B-Instruct"],
         test_type=(
             VLMTestType.IMAGE,
             VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
+            VLMTestType.VIDEO,
         ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
+        enforce_eager=False,
+        needs_video_metadata=True,
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        num_logprobs= 6 if current_platform.is_cpu() else 5,
-        auto_cls=AutoModelForTextToWaveform,
+        num_logprobs=20,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
+        patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        marks=[
+            pytest.mark.core_model,
+        ],
     ),
     "ultravox": VLMTestInfo(
-        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
+        models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
         test_type=VLMTestType.AUDIO,
-        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
         audio_idx_to_prompt=lambda idx: "<|audio|>",
         max_model_len=4096,
         max_num_seqs=2,
@@ -171,9 +195,11 @@ VLM_TEST_SETTINGS = {
     "llava-onevision-transformers": VLMTestInfo(
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.IMAGE,
-        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
+        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
         max_model_len=16384,
-        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
+            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+        ),
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         image_size_factors=[(0.25, 0.5, 1.0)],
@@ -185,10 +211,24 @@ VLM_TEST_SETTINGS = {
         # when processing the 3rd prompt in vLLM
         marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
     ),
+    # Gemma3 has bidirectional mask on images
+    "gemma3-transformers": VLMTestInfo(
+        models=["google/gemma-3-4b-it"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        max_model_len=4096,
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+        },
+        marks=[pytest.mark.core_model],
+    ),
     "idefics3-transformers": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
@@ -204,8 +244,8 @@ VLM_TEST_SETTINGS = {
     "qwen2_5_vl-transformers": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=VLMTestType.IMAGE,
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -220,16 +260,18 @@ VLM_TEST_SETTINGS = {
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<vlm_image>Please describe the image shortly.",
-            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
-        }),
-        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<vlm_image>Please describe the image shortly.",
+                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
+            }
+        ),
+        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
         stop_str=["<|im_end|>"],
         image_size_factors=[(0.10, 0.15)],
         max_tokens=64,
@@ -238,12 +280,14 @@ VLM_TEST_SETTINGS = {
     "aya_vision": VLMTestInfo(
         models=["CohereForAI/aya-vision-8b"],
         test_type=(VLMTestType.IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>What is the season?",  # noqa: E501
-        }),
-        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>What's the content in the center of the image?",
+                "cherry_blossom": "<image>What is the season?",
+            }
+        ),
+        multi_image_prompt="<image><image>Describe the two images in detail.",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -252,12 +296,14 @@ VLM_TEST_SETTINGS = {
     "aya_vision-multi_image": VLMTestInfo(
         models=["CohereForAI/aya-vision-8b"],
         test_type=(VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>What is the season?",  # noqa: E501
-        }),
-        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>What's the content in the center of the image?",
+                "cherry_blossom": "<image>What is the season?",
+            }
+        ),
+        multi_image_prompt="<image><image>Describe the two images in detail.",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -282,27 +328,29 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         # For chameleon, we only compare the sequences
-        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
-        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
+        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
+        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
         comparator=check_outputs_equal,
         max_tokens=8,
         dtype="bfloat16",
     ),
     "deepseek_vl_v2": VLMTestInfo(
-        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
+        models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
-            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
-        }),
-        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",
+                "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
+            }
+        ),
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
-        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
-        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
+        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],
+        image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
     ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
@@ -321,11 +369,13 @@ VLM_TEST_SETTINGS = {
     "gemma3": VLMTestInfo(
         models=["google/gemma-3-4b-it"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
-        }),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<start_of_image>What is the season?",
+            }
+        ),
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
@@ -337,11 +387,13 @@ VLM_TEST_SETTINGS = {
     "glm4v": VLMTestInfo(
         models=["zai-org/glm-4v-9b"],
         test_type=VLMTestType.IMAGE,
-        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
-        }),
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
+            }
+        ),
         max_model_len=2048,
         max_num_seqs=2,
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@@ -356,9 +408,9 @@ VLM_TEST_SETTINGS = {
     "glm4_1v": VLMTestInfo(
         models=["zai-org/GLM-4.1V-9B-Thinking"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
+        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
+        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
         max_model_len=2048,
         max_num_seqs=2,
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@@ -375,23 +427,27 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.video_with_metadata_glm4_1v(),
-            limit_mm_per_prompt={"video": 1},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.video_with_metadata_glm4_1v(),
+                limit_mm_per_prompt={"video": 1},
+            )
+        ],
         marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
-        models = [
+        models=[
             "h2oai/h2ovl-mississippi-800m",
             "h2oai/h2ovl-mississippi-2b",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>\nWhat is the season?",
-        }),
+        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",
+                "cherry_blossom": "<image>\nWhat is the season?",
+            }
+        ),
         multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
         max_model_len=8192,
         use_tokenizer_eos=True,
@@ -401,7 +457,7 @@ VLM_TEST_SETTINGS = {
     "idefics3": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
@@ -416,11 +472,13 @@ VLM_TEST_SETTINGS = {
             # "OpenGVLab/Mono-InternVL-2B",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>\nWhat is the season?",
-        }),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",
+                "cherry_blossom": "<image>\nWhat is the season?",
+            }
+        ),
         multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
         max_model_len=4096,
         use_tokenizer_eos=True,
@@ -431,7 +489,7 @@ VLM_TEST_SETTINGS = {
             "OpenGVLab/InternVL3-1B",
         ],
         test_type=VLMTestType.VIDEO,
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
         video_idx_to_prompt=lambda idx: "<video>",
         max_model_len=8192,
         use_tokenizer_eos=True,
@@ -444,7 +502,7 @@ VLM_TEST_SETTINGS = {
             VLMTestType.MULTI_IMAGE,
             VLMTestType.VIDEO,
         ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
         video_idx_to_prompt=lambda idx: "<video>",
         max_model_len=8192,
@@ -454,7 +512,7 @@ VLM_TEST_SETTINGS = {
     "kimi_vl": VLMTestInfo(
         models=["moonshotai/Kimi-VL-A3B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>",  # noqa: E501
         img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
         max_model_len=8192,
         max_num_seqs=2,
@@ -465,11 +523,11 @@ VLM_TEST_SETTINGS = {
     ),
     "llama4": VLMTestInfo(
         models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
-        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
         img_idx_to_prompt=lambda _: "<|image|>",
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         distributed_executor_backend="mp",
-        image_size_factors=[(.25, 0.5, 1.0)],
+        image_size_factors=[(0.25, 0.5, 1.0)],
         hf_model_kwargs={"device_map": "auto"},
         max_model_len=8192,
         max_num_seqs=4,
@@ -485,28 +543,34 @@ VLM_TEST_SETTINGS = {
         max_model_len=10240,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
-                formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
-            ),
-            limit_mm_per_prompt={"image": 4},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+                    formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
+                ),
+                limit_mm_per_prompt={"image": 4},
+            )
+        ],
     ),
     "llava_onevision": VLMTestInfo(
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
-        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
+        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
         num_video_frames=16,
         max_model_len=16384,
-        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
+            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+        ),
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
-                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
-            ),
-            limit_mm_per_prompt={"video": 4},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
+                    formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+                ),
+                limit_mm_per_prompt={"video": 4},
+            )
+        ],
     ),
     "llava_next_video": VLMTestInfo(
         models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
@@ -548,7 +612,9 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
         max_model_len=4096,
         max_num_seqs=2,
-        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
+        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
+            ["<|im_end|>", "<|endoftext|>"]
+        ),
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
         # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
@@ -561,13 +627,15 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
         max_model_len=4096,
         max_num_seqs=2,
-        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
+        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
+            ["<|im_end|>", "<|endoftext|>"]
+        ),
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
     ),
     "minimax_vl_01": VLMTestInfo(
         models=["MiniMaxAI/MiniMax-VL-01"],
-        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>",  # noqa: E501
         img_idx_to_prompt=lambda _: "<image>",
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         max_model_len=8192,
@@ -589,8 +657,8 @@ VLM_TEST_SETTINGS = {
     "ovis1_6-gemma2": VLMTestInfo(
         models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>\n",
         max_model_len=4096,
         max_num_seqs=2,
         dtype="half",
@@ -602,8 +670,8 @@ VLM_TEST_SETTINGS = {
     "ovis2": VLMTestInfo(
         models=["AIDC-AI/Ovis2-1B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>\n",
         max_model_len=4096,
         max_num_seqs=2,
         dtype="half",
@@ -613,13 +681,9 @@ VLM_TEST_SETTINGS = {
     ),
     "ovis2_5": VLMTestInfo(
         models=["AIDC-AI/Ovis2.5-2B"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>\n",
         video_idx_to_prompt=lambda idx: "<video>\n",
         max_model_len=4096,
         max_num_seqs=2,
@@ -628,10 +692,27 @@ VLM_TEST_SETTINGS = {
         patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
         hf_model_kwargs={"revision": "refs/pr/5"},
     ),
+    "paddleocr_vl": VLMTestInfo(
+        models=["PaddlePaddle/PaddleOCR-VL"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+        img_idx_to_prompt=lambda idx: (
+            "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+        ),
+        multi_image_prompt=(
+            "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
+            "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
+            "Describe these two images separately."
+        ),
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForCausalLM,
+        image_size_factors=[(), (0.25,)],
+    ),
     "phi3v": VLMTestInfo(
         models=["microsoft/Phi-3.5-vision-instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n",  # noqa: E501
         img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
         max_model_len=4096,
         max_num_seqs=2,
@@ -661,20 +742,14 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
-        # FIXME: https://github.com/huggingface/transformers/issues/38358
-        marks=[pytest.mark.skip("Model initialization fails")],
     ),
     "qwen2_vl": VLMTestInfo(
         models=["Qwen/Qwen2-VL-2B-Instruct"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
-        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",    # noqa: E501
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
+        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -685,12 +760,14 @@ VLM_TEST_SETTINGS = {
     "skywork_r1v": VLMTestInfo(
         models=["Skywork/Skywork-R1V-38B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<｜begin▁of▁sentence｜><｜User｜>\n{img_prompt}<｜Assistant｜><think>\n", # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>\nWhat is the season?",
-        }),
-        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<｜begin▁of▁sentence｜><｜User｜>\n{img_prompt}<｜Assistant｜><think>\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",
+                "cherry_blossom": "<image>\nWhat is the season?",
+            }
+        ),
+        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
         max_model_len=4096,
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
@@ -705,6 +782,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
+        num_logprobs=10,
     ),
     "tarsier": VLMTestInfo(
         models=["omni-research/Tarsier-7b"],
@@ -722,9 +800,9 @@ VLM_TEST_SETTINGS = {
             VLMTestType.MULTI_IMAGE,
             VLMTestType.VIDEO,
         ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -737,11 +815,11 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         auto_cls=AutoModelForImageTextToText,
-        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
-        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
+        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
+        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
         comparator=check_outputs_equal,
         marks=multi_gpu_marks(num_gpus=2),
-        **COMMON_BROADCAST_SETTINGS # type: ignore
+        **COMMON_BROADCAST_SETTINGS,  # type: ignore
     ),
     "llava-broadcast": VLMTestInfo(
         models=["llava-hf/llava-1.5-7b-hf"],
@@ -750,7 +828,7 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
-        **COMMON_BROADCAST_SETTINGS # type: ignore
+        **COMMON_BROADCAST_SETTINGS,  # type: ignore
     ),
     "llava_next-broadcast": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
@@ -759,12 +837,12 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
-        **COMMON_BROADCAST_SETTINGS # type: ignore
+        **COMMON_BROADCAST_SETTINGS,  # type: ignore
     ),
     ### Custom input edge-cases for specific models
     "intern_vl-diff-patches": VLMTestInfo(
         models=["OpenGVLab/InternVL2-2B"],
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
         use_tokenizer_eos=True,
@@ -773,7 +851,8 @@ VLM_TEST_SETTINGS = {
             CustomTestOptions(
                 inputs=inp,
                 limit_mm_per_prompt={"image": 2},
-            ) for inp in custom_inputs.different_patch_input_cases_internvl()
+            )
+            for inp in custom_inputs.different_patch_input_cases_internvl()
         ],
     ),
     "llava_onevision-multiple-images": VLMTestInfo(
@@ -782,14 +861,24 @@ VLM_TEST_SETTINGS = {
         max_model_len=16384,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
-        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
+            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+        ),
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
-                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-            ),
-            limit_mm_per_prompt={"image": 4},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+                    formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+                ),
+                limit_mm_per_prompt={"image": 4},
+            )
+        ],
+        marks=[
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) == Version("4.57.1"),
+                reason="This model is broken in Transformers v4.57.1",
+            )
+        ],
     ),
     # regression test for https://github.com/vllm-project/vllm/issues/15122
     "qwen2_5_vl-windows-attention": VLMTestInfo(
@@ -799,10 +888,12 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
-            limit_mm_per_prompt={"image": 1},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
+                limit_mm_per_prompt={"image": 1},
+            )
+        ],
     ),
     "llama4": VLMTestInfo(
         models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
@@ -821,7 +912,6 @@ VLM_TEST_SETTINGS = {
         marks=[large_gpu_mark(min_gb=80), multi_gpu_marks(num_gpus=8)],
     ),
 }
-# yapf: enable
 
 
 def _mark_splits(
@@ -842,7 +932,7 @@ def _mark_splits(
     new_test_settings = dict[str, VLMTestInfo]()
 
     for i in range(num_groups):
-        models_in_group = models[i * split_size:(i + 1) * split_size]
+        models_in_group = models[i * split_size : (i + 1) * split_size]
 
         for model in models_in_group:
             for info in test_infos_by_model[model]:
@@ -873,7 +963,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.IMAGE,
         create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_single_image_models(
     tmp_path: PosixPath,
     model_type: str,
@@ -899,7 +990,8 @@ def test_single_image_models(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.MULTI_IMAGE,
         create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_multi_image_models(
     tmp_path: PosixPath,
     model_type: str,
@@ -925,7 +1017,8 @@ def test_multi_image_models(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.EMBEDDING,
         create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_image_embedding_models(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -949,7 +1042,8 @@ def test_image_embedding_models(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.VIDEO,
         create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_video_models(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -973,7 +1067,8 @@ def test_video_models(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.AUDIO,
         create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_audio_models(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -997,7 +1092,8 @@ def test_audio_models(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.CUSTOM_INPUTS,
         create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_custom_inputs_models(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -1020,7 +1116,8 @@ def test_custom_inputs_models(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.IMAGE,
         create_new_process_for_each_test=True,
-    ))
+    ),
+)
 @create_new_process_for_each_test()
 def test_single_image_models_heavy(
     tmp_path: PosixPath,
@@ -1047,7 +1144,8 @@ def test_single_image_models_heavy(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.MULTI_IMAGE,
         create_new_process_for_each_test=True,
-    ))
+    ),
+)
 @create_new_process_for_each_test()
 def test_multi_image_models_heavy(
     tmp_path: PosixPath,
@@ -1074,7 +1172,8 @@ def test_multi_image_models_heavy(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.EMBEDDING,
         create_new_process_for_each_test=True,
-    ))
+    ),
+)
 @create_new_process_for_each_test()
 def test_image_embedding_models_heavy(
     model_type: str,
@@ -1099,7 +1198,8 @@ def test_image_embedding_models_heavy(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.VIDEO,
         create_new_process_for_each_test=True,
-    ))
+    ),
+)
 def test_video_models_heavy(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -1123,7 +1223,8 @@ def test_video_models_heavy(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.AUDIO,
         create_new_process_for_each_test=True,
-    ))
+    ),
+)
 def test_audio_models_heavy(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -1147,7 +1248,8 @@ def test_audio_models_heavy(
         VLM_TEST_SETTINGS,
         test_type=VLMTestType.CUSTOM_INPUTS,
         create_new_process_for_each_test=True,
-    ))
+    ),
+)
 @create_new_process_for_each_test()
 def test_custom_inputs_models_heavy(
     model_type: str,
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index c1305e0ae31cee61750095cf952f770fcbe6a97e..e39dfc888779e5ca0c0f838d72dcd8c325dfc8a4 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional
 
 import pytest
 from transformers import AutoModelForSpeechSeq2Seq
@@ -10,8 +9,7 @@ from transformers import AutoModelForSpeechSeq2Seq
 from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
 
-from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
-                          VllmRunner)
+from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 
@@ -19,8 +17,8 @@ HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date:
 
 
 def vllm_to_hf_output(
-    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
-) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: tuple[list[int], str, SampleLogprobs | None],
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize hf output to be comparable with vllm output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -47,7 +45,7 @@ def run_test(
     max_tokens: int,
     num_logprobs: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
@@ -64,50 +62,49 @@ def run_test(
     # will hurt multiprocessing backend with fork method (the default method).
     # max_model_len should be greater than image_feature_size
     with vllm_runner(
-            model,
-            runner="generate",
-            max_model_len=max_model_len,
-            max_num_seqs=1,
-            dtype=dtype,
-            limit_mm_per_prompt={"audio": 1},
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_lora=True,
-            max_lora_rank=64,
-            enforce_eager=True,
+        model,
+        runner="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=1,
+        dtype=dtype,
+        limit_mm_per_prompt={"audio": 1},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=64,
+        enforce_eager=True,
     ) as vllm_model:
         lora_request = LoRARequest("audio", 1, audio_lora_path)
         vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                audios=audios,
-                                                lora_request=lora_request)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=audios,
+                lora_request=lora_request,
+            )
             for prompts, audios in inputs
         ]
 
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
-
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
         hf_processor = hf_model.processor
         eos_token_id = hf_processor.tokenizer.eos_token_id
 
         hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    audios=[audios],
-                                                    eos_token_id=eos_token_id)
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=[audios],
+                eos_token_id=eos_token_id,
+            )
             for prompts, audios in inputs
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(output) for output in vllm_outputs
-            ],
+            outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
             name_0="hf",
             name_1="vllm",
         )
@@ -118,9 +115,16 @@ def run_test(
 @pytest.mark.parametrize("max_model_len", [2048])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, model: str,
-                audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
-                max_tokens: int, num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
index 1ef56af33a09448e7dc1fbd23eedf59ad4a0e205..a773db19825e1c3fbd83e65bde16d1663d0f4289 100644
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
     give the same result.
     """
 
-    image_cherry = convert_image_mode(
-        ImageAsset("cherry_blossom").pil_image, "RGB")
+    image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
     image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
     images = [image_cherry, image_stop]
     video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
@@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
         ),
     ]
 
-    with vllm_runner(model,
-                     runner="generate",
-                     dtype=dtype,
-                     limit_mm_per_prompt={"image": 2},
-                     max_model_len=32768,
-                     max_num_seqs=2,
-                     tensor_parallel_size=1,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(
+        model,
+        runner="generate",
+        dtype=dtype,
+        limit_mm_per_prompt={"image": 2},
+        max_model_len=32768,
+        max_num_seqs=2,
+        tensor_parallel_size=1,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_outputs_per_case = [
-            vllm_model.generate_greedy(prompts,
-                                       max_tokens,
-                                       images=images,
-                                       videos=videos)
+            vllm_model.generate_greedy(
+                prompts, max_tokens, images=images, videos=videos
+            )
             for prompts, images, videos in inputs
         ]
 
     all_results = [output[0][1] for output in vllm_outputs_per_case]
-    outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
-               for total_str in all_results]
-    prompt_lengths = [prompt_len for _, prompt_len in outputs]
-    generated_strs = [
-        total_str[prompt_len:] for total_str, prompt_len in outputs
+    outputs = [
+        (total_str, total_str.find("assistant\n") + len("assistant\n"))
+        for total_str in all_results
     ]
+    prompt_lengths = [prompt_len for _, prompt_len in outputs]
+    generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
     interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
     interleaved_output_str, noninterleaved_output_str = generated_strs
 
diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f98bde1d91ea990aa677b90d1a31e9ee4ad7840
--- /dev/null
+++ b/tests/models/multimodal/generation/test_keye.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import asdict
+from typing import NamedTuple
+
+import pytest
+from PIL.Image import Image
+from transformers import AutoProcessor
+
+from vllm import LLM, EngineArgs, SamplingParams
+from vllm.multimodal.utils import encode_image_base64
+
+MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
+
+QUESTION = "What is the content of each image?"
+
+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompt: str
+    image_data: list[Image]
+    stop_token_ids: list[int] | None = None
+    chat_template: str | None = None
+    sampling_params: SamplingParams | None = None
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("question", [QUESTION])
+def test_keye_vl(
+    image_assets,
+    question: str,
+):
+    images = [asset.pil_image for asset in image_assets]
+
+    image_urls = [
+        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
+    ]
+
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    engine_args = asdict(engine_args) | {"seed": 42}
+    llm = LLM(**engine_args)
+
+    sampling_params = SamplingParams(
+        temperature=0.0, max_tokens=256, stop_token_ids=None
+    )
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {"image": images},
+        },
+        sampling_params=sampling_params,
+    )
+
+    print("-" * 50)
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+        assert len(generated_text) > 10, (
+            f"Generated text is too short: {generated_text}"
+        )
+        print("-" * 50)
diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py
index bacc9ef94f49d9069f3123372c39cf2f604d4391..6fc2efa418ddfbdfcfceb8e982e59c9a723fad85 100644
--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
@@ -18,13 +18,11 @@ from typing import Any
 import pytest
 import torch
 from safetensors.torch import save_file
-from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
-                          GenerationConfig)
+from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
 
 from vllm import LLM, SamplingParams
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
-                                        FullAttentionSpec)
+from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
 
 from ....utils import multi_gpu_test
 
@@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:
 
 
 def create_reduced_maverick_model(
-    original_model_name:
-    str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+    original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
     output_dir: str = "/tmp/reduced_maverick",
     text_layers: int = 4,
     num_experts: int = 4,
@@ -118,7 +115,8 @@ def create_reduced_maverick_model(
 
     print(
         f"Creating reduced Maverick model with {text_layers} text layers and "
-        f"{vision_layers} vision layers...")
+        f"{vision_layers} vision layers..."
+    )
 
     # Create output directory
     output_path = Path(output_dir)
@@ -126,19 +124,23 @@ def create_reduced_maverick_model(
         if force_recreate:
             shutil.rmtree(output_path)
         else:
-            print(f"Output directory {output_dir} already exists. "
-                  "Use --force-recreate to overwrite.")
+            print(
+                f"Output directory {output_dir} already exists. "
+                "Use --force-recreate to overwrite."
+            )
             return str(output_path)
 
     output_path.mkdir(parents=True, exist_ok=True)
 
     try:
         print("Loading original model configuration...")
-        original_config = AutoConfig.from_pretrained(original_model_name,
-                                                     trust_remote_code=True)
+        original_config = AutoConfig.from_pretrained(
+            original_model_name, trust_remote_code=True
+        )
         print("Creating reduced configuration...")
-        reduced_config = create_reduced_config(original_config, text_layers,
-                                               num_experts, vision_layers)
+        reduced_config = create_reduced_config(
+            original_config, text_layers, num_experts, vision_layers
+        )
 
         config_path = output_path / "config.json"
         with open(config_path, "w") as f:
@@ -149,8 +151,7 @@ def create_reduced_maverick_model(
         copy_tokenizer_files(original_model_name, output_path)
 
         print("Creating reduced safetensors files...")
-        create_reduced_safetensors(original_config, reduced_config,
-                                   output_path)
+        create_reduced_safetensors(original_config, reduced_config, output_path)
 
         print("Creating preprocessor config...")
         create_preprocessor_config(original_config, output_path)
@@ -173,9 +174,9 @@ def create_reduced_maverick_model(
         raise
 
 
-def create_reduced_config(original_config: Any, text_layers: int,
-                          num_experts: int,
-                          vision_layers: int) -> dict[str, Any]:
+def create_reduced_config(
+    original_config: Any, text_layers: int, num_experts: int, vision_layers: int
+) -> dict[str, Any]:
     """Create a reduced configuration based on the original."""
 
     # Convert config to dictionary
@@ -185,23 +186,20 @@ def create_reduced_config(original_config: Any, text_layers: int,
     if "text_config" in config_dict:
         original_text_layers = config_dict["text_config"]["num_hidden_layers"]
         config_dict["text_config"]["num_hidden_layers"] = text_layers
-        print(
-            f"Reduced text layers from {original_text_layers} to {text_layers}"
-        )
+        original_layer_types = config_dict["text_config"]["layer_types"]
+        config_dict["text_config"]["layer_types"] = original_layer_types[:text_layers]
+        print(f"Reduced text layers from {original_text_layers} to {text_layers}")
 
         original_num_experts = config_dict["text_config"]["num_local_experts"]
         config_dict["text_config"]["num_local_experts"] = num_experts
-        print(
-            f"Reduced num experts from {original_num_experts} to {num_experts}"
-        )
+        print(f"Reduced num experts from {original_num_experts} to {num_experts}")
 
         hidden_dim_divisor = 4
 
         original_hidden_size = config_dict["text_config"]["hidden_size"]
         new_hidden_size = original_hidden_size // hidden_dim_divisor
         config_dict["text_config"]["hidden_size"] = new_hidden_size
-        print(f"Reduced hidden size from {original_hidden_size} to "
-              f"{new_hidden_size}")
+        print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
 
         original_head_dim = config_dict["text_config"]["head_dim"]
         new_head_dim = original_head_dim // hidden_dim_divisor
@@ -210,15 +208,12 @@ def create_reduced_config(original_config: Any, text_layers: int,
 
     # Reduce vision layers
     if "vision_config" in config_dict:
-        original_vision_layers = config_dict["vision_config"][
-            "num_hidden_layers"]
+        original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
         config_dict["vision_config"]["num_hidden_layers"] = vision_layers
-        print(f"Reduced vision layers from {original_vision_layers} "
-              f"to {vision_layers}")
+        print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
 
     # Update model name to indicate it's a reduced version
-    config_dict["_name_or_path"] = (
-        f"reduced_maverick_{text_layers}t_{vision_layers}v")
+    config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
 
     return config_dict
 
@@ -227,16 +222,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
     """Copy tokenizer files from the original model."""
 
     try:
-        tokenizer = AutoTokenizer.from_pretrained(original_model_name,
-                                                  trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            original_model_name, trust_remote_code=True
+        )
         tokenizer.save_pretrained(output_path)
         print("Tokenizer files copied successfully")
     except Exception as e:
         print(f"Warning: Could not copy tokenizer files: {e}")
 
 
-def create_preprocessor_config(original_config: Any,
-                               output_path: Path) -> None:
+def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
     """Create preprocessor_config.json for multimodal model."""
 
     # Try to load the original preprocessor config
@@ -254,9 +249,9 @@ def create_preprocessor_config(original_config: Any,
         raise
 
 
-def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
-                                                                          Any],
-                               output_path: Path) -> None:
+def create_reduced_safetensors(
+    original_config: Any, reduced_config: dict[str, Any], output_path: Path
+) -> None:
     """Create safetensors files with weights for the reduced model."""
 
     print("Generating synthetic weights for reduced model...")
@@ -279,8 +274,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
     save_weights_to_safetensors(weights, output_path)
 
 
-def create_text_model_weights(
-        text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
     """Create synthetic weights for the text model with MoE structure."""
 
     weights = {}
@@ -291,19 +285,18 @@ def create_text_model_weights(
     intermediate_size_mlp = text_config["intermediate_size_mlp"]
     num_layers = text_config["num_hidden_layers"]
     num_attention_heads = text_config["num_attention_heads"]
-    num_key_value_heads = text_config.get("num_key_value_heads",
-                                          num_attention_heads)
+    num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
 
     # MoE specific parameters
     num_experts = text_config.get("num_local_experts")
-    assert (num_experts
-            is not None), "num_local_experts must be specified for MoE"
+    assert num_experts is not None, "num_local_experts must be specified for MoE"
 
     head_dim = hidden_size // num_attention_heads
 
     # Embedding layers
     weights["language_model.model.embed_tokens.weight"] = torch.randn(
-        vocab_size, hidden_size, dtype=torch.float16)
+        vocab_size, hidden_size, dtype=torch.float16
+    )
 
     # Transformer layers
     for layer_idx in range(num_layers):
@@ -312,95 +305,105 @@ def create_text_model_weights(
 
         # Self-attention weights (separate q, k, v projections)
         weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
-            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
+            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
-            hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
+            hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
-            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
+            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
-            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
+            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
+        )
         print("Self-attention weights created.")
 
         # Feed-forward weights - MoE pattern based on interleave_moe_layer_step
         # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
         # 0,2,4,... are dense
         interleave_step = text_config.get("interleave_moe_layer_step", 1)
-        is_moe_layer = (interleave_step > 0
-                        and (layer_idx + 1) % interleave_step == 0)
+        is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
 
         if is_moe_layer:
             # MoE layer structure
             # 1. Router weights
-            weights[
-                f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
-                    num_experts, hidden_size, dtype=torch.float16)
+            weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
+                num_experts, hidden_size, dtype=torch.float16
+            )
 
             # 2. Individual expert weights (not fused)
             for expert_idx in range(num_experts):
-                expert_prefix = (
-                    f"{layer_prefix}.feed_forward.experts.{expert_idx}")
+                expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
 
                 weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
-                    intermediate_size, hidden_size, dtype=torch.bfloat16)
+                    intermediate_size, hidden_size, dtype=torch.bfloat16
+                )
                 weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
-                    intermediate_size, hidden_size, dtype=torch.bfloat16)
+                    intermediate_size, hidden_size, dtype=torch.bfloat16
+                )
                 weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
-                    hidden_size, intermediate_size, dtype=torch.bfloat16)
+                    hidden_size, intermediate_size, dtype=torch.bfloat16
+                )
 
                 # Expert weight scales (FP8 quantization)
-                weights[
-                    f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
-                        intermediate_size, 1, dtype=torch.bfloat16)
+                weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
+                    intermediate_size, 1, dtype=torch.bfloat16
+                )
                 weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
-                    intermediate_size, 1, dtype=torch.bfloat16)
-                weights[
-                    f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
-                        hidden_size, 1, dtype=torch.bfloat16)
+                    intermediate_size, 1, dtype=torch.bfloat16
+                )
+                weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
+                    hidden_size, 1, dtype=torch.bfloat16
+                )
 
             # 3. Shared expert weights
             shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
             weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
-                intermediate_size, hidden_size, dtype=torch.bfloat16)
+                intermediate_size, hidden_size, dtype=torch.bfloat16
+            )
             weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
-                intermediate_size, hidden_size, dtype=torch.bfloat16)
+                intermediate_size, hidden_size, dtype=torch.bfloat16
+            )
             weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
-                hidden_size, intermediate_size, dtype=torch.bfloat16)
+                hidden_size, intermediate_size, dtype=torch.bfloat16
+            )
             print(f"MoE feed-forward weights created for layer {layer_idx}.")
         else:
             # Dense layer structure
-            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
-                torch.randn(intermediate_size_mlp,
-                            hidden_size,
-                            dtype=torch.bfloat16))
-            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
-                torch.randn(intermediate_size_mlp,
-                            hidden_size,
-                            dtype=torch.bfloat16))
-            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
-                torch.randn(hidden_size,
-                            intermediate_size_mlp,
-                            dtype=torch.bfloat16))
+            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
+                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
+            )
+            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
+                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
+            )
+            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
+                hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
+            )
             print(f"Dense feed-forward weights created for layer {layer_idx}.")
 
         # Layer norms
         weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
-            hidden_size, dtype=torch.bfloat16)
-        weights[
-            f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
-                hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
+        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
+            hidden_size, dtype=torch.bfloat16
+        )
         print("Layer norms created.")
 
     # Final layer norm and output projection
     weights["language_model.model.norm.weight"] = torch.ones(
-        hidden_size, dtype=torch.bfloat16)
+        hidden_size, dtype=torch.bfloat16
+    )
     weights["language_model.lm_head.weight"] = torch.randn(
-        vocab_size, hidden_size, dtype=torch.bfloat16)
+        vocab_size, hidden_size, dtype=torch.bfloat16
+    )
 
     return weights
 
 
 def create_vision_model_weights(
-        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    vision_config: dict[str, Any],
+) -> dict[str, torch.Tensor]:
     """Create synthetic weights for the vision model."""
 
     weights = {}
@@ -414,47 +417,62 @@ def create_vision_model_weights(
         layer_prefix = f"vision_model.model.layers.{layer_idx}"
 
         weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
 
         weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
-            intermediate_size, hidden_size, dtype=torch.bfloat16)
+            intermediate_size, hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
-            intermediate_size, dtype=torch.bfloat16)
+            intermediate_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
-            hidden_size, intermediate_size, dtype=torch.bfloat16)
+            hidden_size, intermediate_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
 
         weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
-        weights[
-            f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
-                hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
+        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
+            hidden_size, dtype=torch.bfloat16
+        )
         weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
 
     return weights
 
 
 def create_shared_weights(
-        text_config: dict[str, Any],
-        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    text_config: dict[str, Any], vision_config: dict[str, Any]
+) -> dict[str, torch.Tensor]:
     """Create weights for shared components (vision-language connector)"""
 
     weights = {}
@@ -464,13 +482,15 @@ def create_shared_weights(
 
     # Vision-language connector (projects vision features to text space)
     weights["multi_modal_projector.linear_1.weight"] = torch.randn(
-        text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
+        text_hidden_size, projector_input_dim, dtype=torch.bfloat16
+    )
 
     return weights
 
 
-def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
-                                output_path: Path) -> None:
+def save_weights_to_safetensors(
+    weights: dict[str, torch.Tensor], output_path: Path
+) -> None:
     """Save weights to safetensors files and create index."""
 
     # Determine how to shard the weights
@@ -507,18 +527,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
     else:
         # Multiple shards
         for i, shard in enumerate(shards):
-            filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
+            filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
             save_file(shard, output_path / filename)
             for name in shard:
                 weight_map[name] = filename
-            print(f"Saved shard {i+1}/{len(shards)}: {filename}")
+            print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
 
     # Create index file
     index_data = {
         "metadata": {
-            "total_size":
-            sum(tensor.numel() * tensor.element_size()
-                for tensor in weights.values())
+            "total_size": sum(
+                tensor.numel() * tensor.element_size() for tensor in weights.values()
+            )
         },
         "weight_map": weight_map,
     }
@@ -528,8 +548,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
         json.dump(index_data, f, indent=2)
 
     print(f"Created index file: {index_path}")
-    print(f"Total model size: "
-          f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
+    print(
+        f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
+    )
 
 
 def check_attention_spec_interleaved_rope(
@@ -540,8 +561,7 @@ def check_attention_spec_interleaved_rope(
 ):
     """Check that the attention spec is correct."""
     assert isinstance(llm.llm_engine.model_executor, Executor)
-    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
-    )
+    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
     for rank in range(num_ranks):
         kv_cache_specs = kv_cache_specs_per_rank[rank]
         assert len(kv_cache_specs.keys()) == num_attention_layers
@@ -551,16 +571,14 @@ def check_attention_spec_interleaved_rope(
             else:
                 expected_spec = ChunkedLocalAttentionSpec
             assert isinstance(
-                kv_cache_specs[
-                    f"language_model.model.layers.{i}.self_attn.attn"],
-                expected_spec)
+                kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
+                expected_spec,
+            )
 
 
 def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
     """Test the created reduced model with vLLM."""
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=50)
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
 
     if should_profile:
         llm.start_profile()
@@ -571,15 +589,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
     print("Test generation successful!")
     for output in outputs:
         print(f"Prompt: {output.prompt}")
-        print(f"Output: "
-              f"{output.outputs[0].text}")
+        print(f"Output: {output.outputs[0].text}")
         print("-" * 40)
 
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "original_model_name,text_layers,num_experts,vision_layers,",
-    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
+    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
+)
 @pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("tp,ep", [(2, True)])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -597,7 +615,6 @@ def test_dummy_maverick(
     profile: bool = False,
 ) -> None:
     # Disable multiprocessing allows us to access model executor from LLM engine
-    monkeypatch.setenv("VLLM_USE_V1", "1")
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     model_path = create_reduced_maverick_model(
@@ -640,7 +657,8 @@ def main():
     import argparse
 
     parser = argparse.ArgumentParser(
-        description="Create a reduced-layer Maverick model")
+        description="Create a reduced-layer Maverick model"
+    )
     parser.add_argument(
         "--output-dir",
         default="/tmp/reduced_maverick",
@@ -652,10 +670,7 @@ def main():
         default=4,
         help="Number of text transformer layers",
     )
-    parser.add_argument("--num-experts",
-                        type=int,
-                        default=4,
-                        help="Number of experts")
+    parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
     parser.add_argument(
         "--vision-layers",
         type=int,
@@ -667,12 +682,12 @@ def main():
         action="store_true",
         help="Force recreation if output directory exists",
     )
-    parser.add_argument("--test",
-                        action="store_true",
-                        help="Test the created model with vLLM")
-    parser.add_argument("--profile",
-                        action="store_true",
-                        help="Profile the created model with vLLM")
+    parser.add_argument(
+        "--test", action="store_true", help="Test the created model with vLLM"
+    )
+    parser.add_argument(
+        "--profile", action="store_true", help="Profile the created model with vLLM"
+    )
     parser.add_argument(
         "--test-original",
         action="store_true",
@@ -687,16 +702,18 @@ def main():
     args = parser.parse_args()
 
     if args.test:
-        test_dummy_maverick(original_model_name=args.original_model,
-                            output_dir=args.output_dir,
-                            text_layers=args.text_layers,
-                            num_experts=args.num_experts,
-                            vision_layers=args.vision_layers,
-                            force_recreate=args.force_recreate,
-                            tp=2,
-                            ep=True,
-                            enforce_eager=True,
-                            profile=args.profile)
+        test_dummy_maverick(
+            original_model_name=args.original_model,
+            output_dir=args.output_dir,
+            text_layers=args.text_layers,
+            num_experts=args.num_experts,
+            vision_layers=args.vision_layers,
+            force_recreate=args.force_recreate,
+            tp=2,
+            ep=True,
+            enforce_eager=True,
+            profile=args.profile,
+        )
 
     if args.test_original:
         run_maverick_serving(args.original_model)
diff --git a/tests/models/multimodal/generation/test_phi4_multimodal.py b/tests/models/multimodal/generation/test_phi4_multimodal.py
index db8984d8656fc1018b867cf9881a957c877787c7..62456221711ed55d26f8569e2486d0cc3a8d86cf 100644
--- a/tests/models/multimodal/generation/test_phi4_multimodal.py
+++ b/tests/models/multimodal/generation/test_phi4_multimodal.py
@@ -3,7 +3,6 @@
 
 import os
 from collections.abc import Sequence
-from typing import Optional
 
 import librosa
 import pytest
@@ -12,44 +11,45 @@ from huggingface_hub import snapshot_download
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import rescale_image_size
-from vllm.platforms import current_platform
 
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
-                          PromptImageInput, VllmRunner)
+from ....conftest import (
+    IMAGE_ASSETS,
+    HfRunner,
+    PromptAudioInput,
+    PromptImageInput,
+    VllmRunner,
+)
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-    "cherry_blossom":
-    "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
-})
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+        "cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+    }
+)
+HF_MULTIIMAGE_IMAGE_PROMPT = (
+    "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+)
 
-model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
-                               revision="refs/pr/70")
+model_path = snapshot_download(
+    "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+)
 # Since the vision-lora and speech-lora co-exist with the base model,
 # we have to manually specify the path of the lora weights.
 vision_lora_path = os.path.join(model_path, "vision-lora")
-speech_question = os.path.join(model_path, "examples",
-                               "what_is_shown_in_this_image.wav")
+speech_question = os.path.join(
+    model_path, "examples", "what_is_shown_in_this_image.wav"
+)
 models = [model_path]
 
 target_dtype = "half"
 
-# ROCm Triton FA can run into shared memory issues with these models,
-# use other backends in the meantime
-# FIXME (mattwong, gshtrasb, hongxiayan)
-if current_platform.is_rocm():
-    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-
 
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput,
-                           Optional[PromptAudioInput]]],
+    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
     model: str,
     *,
     max_model_len: int,
@@ -58,7 +58,7 @@ def run_test(
     num_logprobs: int,
     mm_limit: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
@@ -75,28 +75,30 @@ def run_test(
     # will hurt multiprocessing backend with fork method (the default method).
     # max_model_len should be greater than image_feature_size
     with vllm_runner(
-            model,
-            task="generate",
-            max_model_len=max_model_len,
-            max_num_seqs=2,
-            dtype=dtype,
-            limit_mm_per_prompt={"image": mm_limit},
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_lora=True,
-            max_lora_rank=320,
-            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
-            enforce_eager=True,
-            trust_remote_code=False,
+        model,
+        task="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=320,
+        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
+        enforce_eager=True,
+        trust_remote_code=False,
     ) as vllm_model:
         lora_request = LoRARequest("vision", 1, vision_lora_path)
         vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                audios=audios,
-                                                lora_request=lora_request)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                lora_request=lora_request,
+            )
             for prompts, images, audios in inputs
         ]
 
@@ -108,17 +110,18 @@ def run_test(
         hf_processor = hf_model.processor
         eos_token_id = hf_processor.tokenizer.eos_token_id
         hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    audios=audios,
-                                                    eos_token_id=eos_token_id)
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                eos_token_id=eos_token_id,
+            )
             for prompts, images, audios in inputs
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_outputs,
@@ -145,16 +148,27 @@ def run_test(
 @pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_model_len: int, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     images = [asset.pil_image for asset in image_assets]
 
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-        None,
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+            None,
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]
 
     run_test(
         hf_runner,
@@ -189,16 +203,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("max_model_len", [25600])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
-                             size_factors, dtype: str, max_model_len: int,
-                             max_tokens: int, num_logprobs: int) -> None:
+def test_multi_images_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_case = [
         (
             [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-            [[rescale_image_size(image, factor) for image in images]
-             for factor in size_factors],
+            [
+                [rescale_image_size(image, factor) for image in images]
+                for factor in size_factors
+            ],
             None,
         ),
     ]
@@ -222,10 +246,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
 @pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
-                              max_model_len: int, max_tokens: int,
-                              num_logprobs: int) -> None:
-
+def test_vision_speech_models(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     # use the example speech question so that the model outputs are reasonable
     audio = librosa.load(speech_question, sr=16000)
     image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index 77e2b90dd5e9653ecbd139be66d748740fbff8d8..665bed512392cc43e6b522e88f02544616b1cc40 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -3,7 +3,6 @@
 
 import os
 from collections.abc import Sequence
-from typing import Optional
 
 import librosa
 import pytest
@@ -15,33 +14,40 @@ from vllm.assets.image import ImageAsset
 from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import convert_image_mode, rescale_image_size
-from vllm.platforms import current_platform
 
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
-                          PromptImageInput, VllmRunner)
+from ....conftest import (
+    IMAGE_ASSETS,
+    HfRunner,
+    PromptAudioInput,
+    PromptImageInput,
+    VllmRunner,
+)
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-    "cherry_blossom":
-    "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
-})
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+        "cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+    }
+)
+HF_MULTIIMAGE_IMAGE_PROMPT = (
+    "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+)
 
 model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
 # Since the vision-lora and speech-lora co-exist with the base model,
 # we have to manually specify the path of the lora weights.
 vision_lora_path = os.path.join(model_path, "vision-lora")
-speech_question = os.path.join(model_path, "examples",
-                               "what_is_shown_in_this_image.wav")
+speech_question = os.path.join(
+    model_path, "examples", "what_is_shown_in_this_image.wav"
+)
 models = [model_path]
 
 
-def vllm_to_hf_output(vllm_output: tuple[list[int], str,
-                                         Optional[SampleLogprobs]],
-                      model: str):
+def vllm_to_hf_output(
+    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
+):
     """Sanitize vllm output to be comparable with hf output."""
     _, output_str, out_logprobs = vllm_output
 
@@ -61,18 +67,11 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str,
 
 target_dtype = "half"
 
-# ROCm Triton FA can run into shared memory issues with these models,
-# use other backends in the meantime
-# FIXME (mattwong, gshtrasb, hongxiayan)
-if current_platform.is_rocm():
-    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-
 
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput,
-                           Optional[PromptAudioInput]]],
+    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
     model: str,
     *,
     max_model_len: int,
@@ -81,7 +80,7 @@ def run_test(
     num_logprobs: int,
     mm_limit: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
@@ -98,27 +97,29 @@ def run_test(
     # will hurt multiprocessing backend with fork method (the default method).
     # max_model_len should be greater than image_feature_size
     with vllm_runner(
-            model,
-            runner="generate",
-            max_model_len=max_model_len,
-            max_num_seqs=2,
-            dtype=dtype,
-            limit_mm_per_prompt={"image": mm_limit},
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_lora=True,
-            max_lora_rank=320,
-            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
-            enforce_eager=True,
+        model,
+        runner="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=320,
+        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
+        enforce_eager=True,
     ) as vllm_model:
         lora_request = LoRARequest("vision", 1, vision_lora_path)
         vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                audios=audios,
-                                                lora_request=lora_request)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                lora_request=lora_request,
+            )
             for prompts, images, audios in inputs
         ]
 
@@ -127,42 +128,36 @@ def run_test(
     pytest.skip("HF impl is not compatible with current transformers")
 
     hf_model_kwargs = {"_attn_implementation": "sdpa"}
-    with hf_runner(model, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-
+    with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
         hf_processor = hf_model.processor
         eos_token_id = hf_processor.tokenizer.eos_token_id
 
-        def patch_hf_processor(*args,
-                               text="",
-                               images=None,
-                               audio=None,
-                               sampling_rate=None,
-                               **kwargs):
+        def patch_hf_processor(
+            *args, text="", images=None, audio=None, sampling_rate=None, **kwargs
+        ):
             audios = None
             if audio is not None and sampling_rate is not None:
                 audios = [(audio, sampling_rate)]
-            return hf_processor(*args,
-                                text=text,
-                                images=images,
-                                audios=audios,
-                                **kwargs)
+            return hf_processor(
+                *args, text=text, images=images, audios=audios, **kwargs
+            )
 
         hf_model.processor = patch_hf_processor
 
         hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    audios=audios,
-                                                    eos_token_id=eos_token_id,
-                                                    num_logits_to_keep=0)
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                eos_token_id=eos_token_id,
+                num_logits_to_keep=0,
+            )
             for prompts, images, audios in inputs
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_outputs,
@@ -189,16 +184,27 @@ def run_test(
 @pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_model_len: int, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     images = [asset.pil_image for asset in image_assets]
 
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-        None,
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+            None,
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]
 
     run_test(
         hf_runner,
@@ -233,16 +239,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("max_model_len", [25600])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
-                             size_factors, dtype: str, max_model_len: int,
-                             max_tokens: int, num_logprobs: int) -> None:
+def test_multi_images_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_case = [
         (
             [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-            [[rescale_image_size(image, factor) for image in images]
-             for factor in size_factors],
+            [
+                [rescale_image_size(image, factor) for image in images]
+                for factor in size_factors
+            ],
             None,
         ),
     ]
@@ -266,10 +282,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
 @pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
-                              max_model_len: int, max_tokens: int,
-                              num_logprobs: int) -> None:
-
+def test_vision_speech_models(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     # use the example speech question so that the model outputs are reasonable
     audio = librosa.load(speech_question, sr=None)
     image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index 715b08ef90e549c15e829f865152f78c8e5ee656..3cad2c43d5623c34546fe4e081626788855a12ea 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -2,11 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import pytest
 from mistral_common.multimodal import download_image
-from mistral_common.protocol.instruct.messages import ImageURLChunk
+from mistral_common.protocol.instruct.chunk import ImageURLChunk
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
@@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."
 
 
 def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
-    return [{
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "text": PROMPT,
-        }] + [{
-            "type": "image_url",
-            "image_url": {
-                "url": url
-            }
-        } for url in urls],
-    }]
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": PROMPT,
+                }
+            ]
+            + [{"type": "image_url", "image_url": {"url": url}} for url in urls],
+        }
+    ]
 
 
 def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
-    return [{
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "content": PROMPT,
-        }, *({
-            "type": "image",
-            "image": download_image(url)
-        } for url in urls)],
-    }]
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "content": PROMPT,
+                },
+                *({"type": "image", "image": download_image(url)} for url in urls),
+            ],
+        }
+    ]
 
 
 def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
@@ -117,7 +117,7 @@ FIXTURE_LOGPROBS_CHAT = {
     MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
 }
 
-OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
+OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
 
 
 # For the test author to store golden output in JSON
@@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
     outputs: OutputsLogprobs,
     filename: "StrPath",
 ) -> None:
-    json_data = [(tokens, text, [{
-        k: asdict(v)
-        for k, v in token_logprobs.items()
-    } for token_logprobs in (logprobs or [])])
-                 for tokens, text, logprobs in outputs]
+    json_data = [
+        (
+            tokens,
+            text,
+            [
+                {k: asdict(v) for k, v in token_logprobs.items()}
+                for token_logprobs in (logprobs or [])
+            ],
+        )
+        for tokens, text, logprobs in outputs
+    ]
 
     with open(filename, "w") as f:
         json.dump(json_data, f)
@@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
     with open(filename, "rb") as f:
         json_data = json.load(f)
 
-    return [(tokens, text, [{
-        int(k): Logprob(**v)
-        for k, v in token_logprobs.items()
-    } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
+    return [
+        (
+            tokens,
+            text,
+            [
+                {int(k): Logprob(**v) for k, v in token_logprobs.items()}
+                for token_logprobs in logprobs
+            ],
+        )
+        for tokens, text, logprobs in json_data
+    ]
 
 
 @large_gpu_test(min_gb=80)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
-              local_asset_server) -> None:
-    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
-        FIXTURE_LOGPROBS_CHAT[model])
+def test_chat(
+    vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
+) -> None:
+    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
     with vllm_runner(
-            model,
-            dtype=dtype,
-            tokenizer_mode="mistral",
-            load_format="mistral",
-            config_format="mistral",
-            max_model_len=max_model_len,
-            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
+        model,
+        dtype=dtype,
+        tokenizer_mode="mistral",
+        load_format="mistral",
+        config_format="mistral",
+        max_model_len=max_model_len,
+        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
         outputs = []
 
@@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
     for i in range(len(logprobs)):
         assert logprobs[i][-1] is None
         logprobs[i] = logprobs[i][:-1]
-    check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
-                         outputs_1_lst=logprobs,
-                         name_0="h100_ref",
-                         name_1="output")
+    check_logprobs_close(
+        outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
+        outputs_1_lst=logprobs,
+        name_0="h100_ref",
+        name_1="output",
+    )
diff --git a/tests/models/multimodal/generation/test_qwen2_5_vl.py b/tests/models/multimodal/generation/test_qwen2_5_vl.py
index 1dc3188d60bd8b52271e71ee124cf55cc8055a87..3ba665710af46e8c7dbd2745aa86f04a01b48f3f 100644
--- a/tests/models/multimodal/generation/test_qwen2_5_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_5_vl.py
@@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
     return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501
 
 
-VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
-    "baby_reading":
-    qwen2_5_vl_chat_template(
-        VIDEO_PLACEHOLDER,
-        "Describe this video with a short sentence ",
-        "(no more than 20 words)",
-    ),
-})
+VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
+    {
+        "baby_reading": qwen2_5_vl_chat_template(
+            VIDEO_PLACEHOLDER,
+            "Describe this video with a short sentence ",
+            "(no more than 20 words)",
+        ),
+    }
+)
 
 
 @pytest.mark.core_model
@@ -33,13 +34,23 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
 @pytest.mark.parametrize("num_frames", [16])
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
-                                      video_pruning_rate: float,
-                                      num_frames: int, dtype: str,
-                                      max_tokens: int) -> None:
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+def test_qwen2_5_vl_evs_functionality(
+    vllm_runner,
+    video_assets,
+    model,
+    video_pruning_rate: float,
+    num_frames: int,
+    dtype: str,
+    max_tokens: int,
+    use_bytecode_hook: bool,
+    monkeypatch,
+) -> None:
     """Test EVS (Efficient Video Sampling) functionality with different
     pruning rates.
     """
+    # Set the environment variable for this test
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
 
     # Sample frames from video assets
     sampled_vids = [
@@ -51,19 +62,16 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
     videos = [sampled_vids[0]]
 
     # Initialize model with EVS configuration
-    with vllm_runner(model,
-                     runner="generate",
-                     max_model_len=4000,
-                     max_num_seqs=1,
-                     dtype=dtype,
-                     limit_mm_per_prompt={"video": 1},
-                     tensor_parallel_size=1,
-                     video_pruning_rate=video_pruning_rate) as vllm_model:
-
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=4000,
+        dtype=dtype,
+        limit_mm_per_prompt={"video": 1},
+        video_pruning_rate=video_pruning_rate,
+    ) as vllm_model:
         # Generate output - this should not crash
-        outputs = vllm_model.generate_greedy(prompts,
-                                             max_tokens,
-                                             videos=videos)
+        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
 
         # Basic validation that we got a response
         assert len(outputs) == 1
@@ -83,10 +91,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
 @pytest.mark.parametrize("num_frames", [16])
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
-                                       video_pruning_rate: float,
-                                       num_frames: int, dtype: str,
-                                       max_tokens: int) -> None:
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+def test_qwen2_5_vl_evs_batched_videos(
+    vllm_runner,
+    video_assets,
+    model,
+    video_pruning_rate: float,
+    num_frames: int,
+    dtype: str,
+    max_tokens: int,
+    use_bytecode_hook: bool,
+    monkeypatch,
+) -> None:
     """Test EVS functionality with batched videos.
 
     This test validates that:
@@ -94,6 +110,8 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
     2. Both pruning configurations work with multiple videos
     3. The model doesn't crash when processing multiple videos simultaneously
     """
+    # Set the environment variable for this test
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
     # Sample frames from video assets
     sampled_vids = [
         sample_frames_from_video(asset.np_ndarrays, num_frames)
@@ -102,23 +120,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
 
     # Test batched videos
     prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
-    videos = [sampled_vids[0],
-              sampled_vids[0]]  # Use same video twice for testing
+    videos = [sampled_vids[0], sampled_vids[0]]  # Use same video twice for testing
 
     # Initialize model with EVS configuration
-    with vllm_runner(model,
-                     runner="generate",
-                     max_model_len=4000,
-                     max_num_seqs=2,
-                     dtype=dtype,
-                     limit_mm_per_prompt={"video": 2},
-                     tensor_parallel_size=1,
-                     video_pruning_rate=video_pruning_rate) as vllm_model:
-
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=4000,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"video": 2},
+        tensor_parallel_size=1,
+        video_pruning_rate=video_pruning_rate,
+    ) as vllm_model:
         # Generate output - this should not crash
-        outputs = vllm_model.generate_greedy(prompts,
-                                             max_tokens,
-                                             videos=videos)
+        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
 
         # Basic validation that we got responses for both videos
         assert len(outputs) == 2
diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py
index c8a3513ac7ad1ec0f44a06151ff6a97bca12bdc3..e10b8e1e77af11a83b4e778e9f826a0aa5a0b70e 100644
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional, TypedDict, Union
+from typing import Any, TypedDict
 
 import numpy.typing as npt
 import pytest
@@ -11,8 +11,13 @@ from PIL import Image
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
 
-from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
-                          PromptVideoInput, VllmRunner)
+from ....conftest import (
+    IMAGE_ASSETS,
+    VIDEO_ASSETS,
+    PromptImageInput,
+    PromptVideoInput,
+    VllmRunner,
+)
 from ...utils import check_logprobs_close
 
 
@@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
     return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501
 
 
-IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    qwen2_vl_chat_template(
-        IMAGE_PLACEHOLDER,
-        "What is the biggest text's content in this image?",
-    ),
-    "cherry_blossom":
-    qwen2_vl_chat_template(
-        IMAGE_PLACEHOLDER,
-        "What is the season shown in this image? ",
-        "Reply with a short sentence (no more than 20 words)",
-    ),
-})
-
-VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
-    "baby_reading":
-    qwen2_vl_chat_template(
-        VIDEO_PLACEHOLDER,
-        "Describe this video with a short sentence ",
-        "(no more than 20 words)",
-    ),
-})
+IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": qwen2_vl_chat_template(
+            IMAGE_PLACEHOLDER,
+            "What is the biggest text's content in this image?",
+        ),
+        "cherry_blossom": qwen2_vl_chat_template(
+            IMAGE_PLACEHOLDER,
+            "What is the season shown in this image? ",
+            "Reply with a short sentence (no more than 20 words)",
+        ),
+    }
+)
+
+VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
+    {
+        "baby_reading": qwen2_vl_chat_template(
+            VIDEO_PLACEHOLDER,
+            "Describe this video with a short sentence ",
+            "(no more than 20 words)",
+        ),
+    }
+)
 
 MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
     IMAGE_PLACEHOLDER,
@@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
 
 
 def batch_make_image_embeddings(
-        image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
-        llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
+    image_batches: list[Image.Image | list[Image.Image]],
+    processor,
+    llm: VllmRunner,
+) -> list[Qwen2VLPromptImageEmbeddingInput]:
     """batched image embeddings for Qwen2-VL
 
-    This will infer all images' embeddings in a single batch, 
+    This will infer all images' embeddings in a single batch,
       and split the result according to input batches.
 
     image_batches:
       - Single-image batches: `list[Image.Image]`
       - Multiple-image batches: `list[list[Image.Image]]]`
-    
+
     returns: `list[Qwen2VLPromptImageEmbeddingInput]`
     """
 
@@ -108,9 +116,9 @@ def batch_make_image_embeddings(
     # image to pixel values
     image_processor = processor.image_processor
 
-    preprocess_result = image_processor \
-        .preprocess(images=images, return_tensors="pt") \
-        .data
+    preprocess_result = image_processor.preprocess(
+        images=images, return_tensors="pt"
+    ).data
     pixel_values = preprocess_result["pixel_values"]
     image_grid_thw = preprocess_result["image_grid_thw"]
 
@@ -119,12 +127,13 @@ def batch_make_image_embeddings(
         with torch.no_grad():
             visual = model.visual
 
-            pixel_values_on_device = pixel_values.to(visual.device,
-                                                     dtype=visual.dtype)
-            image_grid_thw_on_device = image_grid_thw.to(visual.device,
-                                                         dtype=torch.int64)
-            return visual(pixel_values_on_device,
-                          grid_thw=image_grid_thw_on_device).cpu()
+            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
+            image_grid_thw_on_device = image_grid_thw.to(
+                visual.device, dtype=torch.int64
+            )
+            return visual(
+                pixel_values_on_device, grid_thw=image_grid_thw_on_device
+            ).cpu()
 
     image_embeds = torch.concat(llm.apply_model(get_image_embeds))
 
@@ -137,16 +146,21 @@ def batch_make_image_embeddings(
         merge_size = image_processor.merge_size
         cur_batch_embed_len = sum(
             grid_thw.prod(-1) // merge_size // merge_size
-            for grid_thw in image_grid_thw[image_counter:image_counter +
-                                           cur_batch_image_count])
+            for grid_thw in image_grid_thw[
+                image_counter : image_counter + cur_batch_image_count
+            ]
+        )
 
-        result.append({
-            "image_embeds":
-            image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
-            "image_grid_thw":
-            image_grid_thw[image_counter:image_counter +
-                           cur_batch_image_count],
-        })
+        result.append(
+            {
+                "image_embeds": image_embeds[
+                    embed_counter : embed_counter + cur_batch_embed_len
+                ],
+                "image_grid_thw": image_grid_thw[
+                    image_counter : image_counter + cur_batch_image_count
+                ],
+            }
+        )
 
         embed_counter += cur_batch_embed_len
         image_counter += cur_batch_image_count
@@ -160,13 +174,13 @@ def batch_make_image_embeddings(
 
 
 def batch_make_video_embeddings(
-        video_batches: PromptVideoInput, processor,
-        llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
+    video_batches: PromptVideoInput, processor, llm: VllmRunner
+) -> list[Qwen2VLPromptVideoEmbeddingInput]:
     """batched video embeddings for Qwen2-VL
 
     A NDArray represents a single video's all frames.
 
-    This will infer all videos' embeddings in a single batch, 
+    This will infer all videos' embeddings in a single batch,
       and split the result according to input batches.
 
     video_batches:
@@ -191,9 +205,9 @@ def batch_make_video_embeddings(
     # video to pixel values
     image_processor = processor.image_processor
 
-    preprocess_result = image_processor \
-        .preprocess(images=None, videos=videos, return_tensors="pt") \
-        .data
+    preprocess_result = image_processor.preprocess(
+        images=None, videos=videos, return_tensors="pt"
+    ).data
     pixel_values = preprocess_result["pixel_values_videos"]
     video_grid_thw = preprocess_result["video_grid_thw"]
 
@@ -202,12 +216,13 @@ def batch_make_video_embeddings(
         with torch.no_grad():
             visual = model.visual
 
-            pixel_values_on_device = pixel_values.to(visual.device,
-                                                     dtype=visual.dtype)
-            video_grid_thw_on_device = video_grid_thw.to(visual.device,
-                                                         dtype=torch.int64)
-            return visual(pixel_values_on_device,
-                          grid_thw=video_grid_thw_on_device).cpu()
+            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
+            video_grid_thw_on_device = video_grid_thw.to(
+                visual.device, dtype=torch.int64
+            )
+            return visual(
+                pixel_values_on_device, grid_thw=video_grid_thw_on_device
+            ).cpu()
 
     video_embeds = torch.concat(llm.apply_model(get_image_embeds))
 
@@ -220,16 +235,21 @@ def batch_make_video_embeddings(
         merge_size = image_processor.merge_size
         cur_batch_embed_len = sum(
             grid_thw.prod(-1) // merge_size // merge_size
-            for grid_thw in video_grid_thw[video_counter:video_counter +
-                                           cur_batch_video_count])
+            for grid_thw in video_grid_thw[
+                video_counter : video_counter + cur_batch_video_count
+            ]
+        )
 
-        result.append({
-            "video_embeds":
-            video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
-            "video_grid_thw":
-            video_grid_thw[video_counter:video_counter +
-                           cur_batch_video_count],
-        })
+        result.append(
+            {
+                "video_embeds": video_embeds[
+                    embed_counter : embed_counter + cur_batch_embed_len
+                ],
+                "video_grid_thw": video_grid_thw[
+                    video_counter : video_counter + cur_batch_video_count
+                ],
+            }
+        )
 
         embed_counter += cur_batch_embed_len
         video_counter += cur_batch_video_count
@@ -252,7 +272,7 @@ def run_embedding_input_test(
     num_logprobs: int,
     mm_limit: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between
     original image/video input and image/video embeddings input.
@@ -263,25 +283,25 @@ def run_embedding_input_test(
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(
-            model,
-            runner="generate",
-            max_model_len=4000,
-            max_num_seqs=3,
-            dtype=dtype,
-            limit_mm_per_prompt={
-                "image": mm_limit,
-                "video": mm_limit
-            },
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            default_torch_num_threads=1,
+        model,
+        runner="generate",
+        max_model_len=4000,
+        max_num_seqs=3,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        default_torch_num_threads=1,
+        enable_mm_embeds=True,
     ) as vllm_model:
         outputs_per_case_for_original_input = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images or None,
-                                                videos=videos or None)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images or None,
+                videos=videos or None,
+            )
             for prompts, images, videos in inputs
         ]
 
@@ -290,17 +310,19 @@ def run_embedding_input_test(
                 prompts,
                 max_tokens,
                 num_logprobs=num_logprobs,
-                images=batch_make_image_embeddings(
-                    images, processor, vllm_model) if images else None,
-                videos=batch_make_video_embeddings(
-                    videos, processor, vllm_model) if videos else None)
+                images=batch_make_image_embeddings(images, processor, vllm_model)
+                if images
+                else None,
+                videos=batch_make_video_embeddings(videos, processor, vllm_model)
+                if videos
+                else None,
+            )
             for prompts, images, videos in inputs
         ]
 
-    for outputs_for_original_input, \
-        outputs_for_embeddings_input \
-        in zip(outputs_per_case_for_original_input,
-            outputs_per_case_for_embeddings_input):
+    for outputs_for_original_input, outputs_for_embeddings_input in zip(
+        outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
+    ):
         check_logprobs_close(
             outputs_0_lst=outputs_for_original_input,
             outputs_1_lst=outputs_for_embeddings_input,
@@ -325,17 +347,26 @@ def run_embedding_input_test(
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
-                                         size_factors, dtype, max_tokens,
-                                         num_logprobs, monkeypatch) -> None:
+def test_qwen2_vl_image_embeddings_input(
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype,
+    max_tokens,
+    num_logprobs,
+    monkeypatch,
+) -> None:
     images = [asset.pil_image for asset in image_assets]
 
-    inputs_per_case: list[tuple[
-        list[str], PromptImageInput, PromptVideoInput]] = [(
+    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
+        (
             [prompt for _ in size_factors],
             [rescale_image_size(image, factor) for factor in size_factors],
             [],
-        ) for image, prompt in zip(images, IMAGE_PROMPTS)]
+        )
+        for image, prompt in zip(images, IMAGE_PROMPTS)
+    ]
 
     run_embedding_input_test(
         vllm_runner,
@@ -366,21 +397,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
-                                                  model, size_factors,
-                                                  dtype: str, max_tokens: int,
-                                                  num_logprobs: int) -> None:
+def test_qwen2_vl_multiple_image_embeddings_input(
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     images = [asset.pil_image for asset in image_assets]
 
-    inputs_per_case: list[tuple[list[str], PromptImageInput,
-                                PromptVideoInput]] = [(
-                                    [MULTIIMAGE_PROMPT for _ in size_factors],
-                                    [[
-                                        rescale_image_size(image, factor)
-                                        for image in images
-                                    ] for factor in size_factors],
-                                    [],
-                                )]
+    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
+        (
+            [MULTIIMAGE_PROMPT for _ in size_factors],
+            [
+                [rescale_image_size(image, factor) for image in images]
+                for factor in size_factors
+            ],
+            [],
+        )
+    ]
 
     run_embedding_input_test(
         vllm_runner,
@@ -410,22 +447,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
-                                         size_factors, dtype: str,
-                                         max_tokens: int,
-                                         num_logprobs: int) -> None:
+def test_qwen2_vl_video_embeddings_input(
+    vllm_runner,
+    video_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     num_frames = 4
     sampled_vids = [
         sample_frames_from_video(asset.np_ndarrays, num_frames)
         for asset in video_assets
     ]
 
-    inputs_per_case: list[tuple[
-        list[str], PromptImageInput, PromptVideoInput]] = [(
+    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
+        (
             [prompt for _ in size_factors],
             [],
             [rescale_video_size(video, factor) for factor in size_factors],
-        ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
+        )
+        for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
+    ]
 
     run_embedding_input_test(
         vllm_runner,
diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py
index e7e7bd3154a113651688442298530224f349ba69..6bfec6c2c8d302f6146494eeefb60b5778c6ece5 100644
--- a/tests/models/multimodal/generation/test_ultravox.py
+++ b/tests/models/multimodal/generation/test_ultravox.py
@@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS
 
 MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
-AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
-    "mary_had_lamb":
-    "Transcribe this into English.",
-    "winning_call":
-    "What is happening in this audio clip?",
-})
+AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
+    {
+        "mary_had_lamb": "Transcribe this into English.",
+        "winning_call": "What is happening in this audio clip?",
+    }
+)
 
 MULTI_AUDIO_PROMPT = "Describe each of the audios above."
 
@@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
     "enable_chunked_prefill": True,
     "max_num_seqs": 2,
     # Use a very small limit to exercise chunked prefill.
-    "max_num_batched_tokens": 16
+    "max_num_batched_tokens": 16,
 }
 
 
@@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
     for key, value in params_kwargs.items():
         if isinstance(value, bool):
             if value:
-                args.append(f"--{key.replace('_','-')}")
+                args.append(f"--{key.replace('_', '-')}")
         else:
-            args.append(f"--{key.replace('_','-')}={value}")
+            args.append(f"--{key.replace('_', '-')}={value}")
     return args
 
 
-@pytest.fixture(params=[
-    pytest.param({}, marks=pytest.mark.cpu_model),
-    pytest.param(CHUNKED_PREFILL_KWARGS),
-])
+@pytest.fixture(
+    params=[
+        pytest.param({}, marks=pytest.mark.cpu_model),
+        pytest.param(CHUNKED_PREFILL_KWARGS),
+    ]
+)
 def server(request, audio_assets: AudioTestAssets):
     args = [
-        "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "4096",
+        "--enforce-eager",
         "--limit-mm-per-prompt",
-        json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
+        json.dumps({"audio": len(audio_assets)}),
+        "--trust-remote-code",
     ] + params_kwargs_to_cli_args(request.param)
 
-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
-                                      "30"}) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
+    ) as remote_server:
         yield remote_server
 
 
@@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     placeholder = f"{placeholder}\n" * audio_count
 
-    return tokenizer.apply_chat_template([{
-        'role': 'user',
-        'content': f"{placeholder}{question}"
-    }],
-                                         tokenize=False,
-                                         add_generation_prompt=True)
+    return tokenizer.apply_chat_template(
+        [{"role": "user", "content": f"{placeholder}{question}"}],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
 
 
 def run_multi_audio_test(
@@ -99,19 +104,21 @@ def run_multi_audio_test(
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    with vllm_runner(model,
-                     dtype=dtype,
-                     enforce_eager=True,
-                     limit_mm_per_prompt={
-                         "audio":
-                         max((len(audio) for _, audio in prompts_and_audios))
-                     },
-                     **kwargs) as vllm_model:
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        enforce_eager=True,
+        limit_mm_per_prompt={
+            "audio": max((len(audio) for _, audio in prompts_and_audios))
+        },
+        **kwargs,
+    ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             [prompt for prompt, _ in prompts_and_audios],
             max_tokens,
             num_logprobs=num_logprobs,
-            audios=[audios for _, audios in prompts_and_audios])
+            audios=[audios for _, audios in prompts_and_audios],
+        )
 
     # The HuggingFace model doesn't support multiple audios yet, so
     # just assert that some tokens were generated.
@@ -122,21 +129,25 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [
-    pytest.param({}, marks=pytest.mark.cpu_model),
-    pytest.param(CHUNKED_PREFILL_KWARGS),
-])
-def test_models_with_multiple_audios(vllm_runner,
-                                     audio_assets: AudioTestAssets, dtype: str,
-                                     max_tokens: int, num_logprobs: int,
-                                     vllm_kwargs: dict) -> None:
-
-    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
-                              VLLM_PLACEHOLDER)
+@pytest.mark.parametrize(
+    "vllm_kwargs",
+    [
+        pytest.param({}, marks=pytest.mark.cpu_model),
+        pytest.param(CHUNKED_PREFILL_KWARGS),
+    ],
+)
+def test_models_with_multiple_audios(
+    vllm_runner,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    vllm_kwargs: dict,
+) -> None:
+    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
     run_multi_audio_test(
         vllm_runner,
-        [(vllm_prompt, [audio.audio_and_sample_rate
-                        for audio in audio_assets])],
+        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
         MODEL_NAME,
         dtype=dtype,
         max_tokens=max_tokens,
@@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
 async def test_online_serving(client, audio_assets: AudioTestAssets):
     """Exercises online serving with/without chunked prefill enabled."""
 
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *[{
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio.url
-                }
-            } for audio in audio_assets],
-            {
-                "type":
-                "text",
-                "text":
-                f"What's happening in these {len(audio_assets)} audio clips?"
-            },
-        ],
-    }]
-
-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *[
+                    {"type": "audio_url", "audio_url": {"url": audio.url}}
+                    for audio in audio_assets
+                ],
+                {
+                    "type": "text",
+                    "text": f"What's happening in these {len(audio_assets)} audio clips?",  # noqa: E501
+                },
+            ],
+        }
+    ]
+
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME, messages=messages, max_tokens=10
+    )
 
     assert len(chat_completion.choices) == 1
     choice = chat_completion.choices[0]
diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py
index b4439dfe020c221461c70f61b777c821c88a4341..18a50c3a555da7de73f3dc76da474c104758d535 100644
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -6,8 +6,8 @@ import json
 import pytest
 import pytest_asyncio
 from mistral_common.audio import Audio
-from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
-                                                       TextChunk, UserMessage)
+from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
+from mistral_common.protocol.instruct.messages import UserMessage
 
 from vllm.transformers_utils.tokenizer import MistralTokenizer
 
@@ -17,8 +17,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
 
 MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
 MISTRAL_FORMAT_ARGS = [
-    "--tokenizer_mode", "mistral", "--config_format", "mistral",
-    "--load_format", "mistral"
+    "--tokenizer_mode",
+    "mistral",
+    "--config_format",
+    "mistral",
+    "--load_format",
+    "mistral",
 ]
 
 
@@ -30,10 +34,9 @@ def server(request, audio_assets: AudioTestAssets):
         json.dumps({"audio": len(audio_assets)}),
     ] + MISTRAL_FORMAT_ARGS
 
-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
-                                      "30"}) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
+    ) as remote_server:
         yield remote_server
 
 
@@ -64,15 +67,17 @@ def _get_prompt(audio_assets, question):
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models_with_multiple_audios(vllm_runner,
-                                     audio_assets: AudioTestAssets, dtype: str,
-                                     max_tokens: int,
-                                     num_logprobs: int) -> None:
+def test_models_with_multiple_audios(
+    vllm_runner,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
     vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
     run_multi_audio_test(
         vllm_runner,
-        [(vllm_prompt, [audio.audio_and_sample_rate
-                        for audio in audio_assets])],
+        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
         MODEL_NAME,
         dtype=dtype,
         max_tokens=max_tokens,
@@ -92,23 +97,17 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
         return audio_dict
 
     audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *audio_chunks,
-            {
-                "type":
-                "text",
-                "text":
-                f"What's happening in these {len(audio_assets)} audio clips?"
-            },
-        ],
-    }]
-
-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10)
+    text = f"What's happening in these {len(audio_assets)} audio clips?"
+    messages = [
+        {
+            "role": "user",
+            "content": [*audio_chunks, {"type": "text", "text": text}],
+        }
+    ]
+
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME, messages=messages, max_tokens=10
+    )
 
     assert len(chat_completion.choices) == 1
     choice = chat_completion.choices[0]
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index e0e9980b883391e8b674499fffb858d6644cbc36..eca2b61e37d5379663f320f171aaf2c69cc3434c 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 
@@ -12,8 +11,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test
 
 PROMPTS = [
     {
-        "prompt":
-        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
         "multi_modal_data": {
             "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
         },
@@ -25,9 +23,8 @@ PROMPTS = [
                 "audio": AudioAsset("winning_call").audio_and_sample_rate,
             },
         },
-        "decoder_prompt":
-        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
-    }
+        "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+    },
 ]
 
 EXPECTED = {
@@ -41,7 +38,7 @@ EXPECTED = {
         " is June and the third base. They're going to wave him in. The throw"
         " to the plate will be late. The Mariners are going to play for the"
         " American League Championship. I don't believe it. It just continues"
-        " by all five."
+        " by all five.",
     ],
     "openai/whisper-small": [
         " The first words I spoke in the original pornograph. A little piece"
@@ -51,7 +48,7 @@ EXPECTED = {
         " comes joy. Here is Junior to third base. They're gonna wave him"
         " in. The throw to the plate will be late. The Mariners are going to"
         " play for the American League Championship. I don't believe it. It"
-        " just continues. My, oh my."
+        " just continues. My, oh my.",
     ],
     "openai/whisper-medium": [
         " The first words I spoke in the original phonograph, a little piece"
@@ -62,7 +59,7 @@ EXPECTED = {
         " Jorgen at third base. They're going to wave him in. The throw to the"
         " plate will be late. The Mariners are going to play for the American"
         " League Championship. I don't believe it. It just continues. My, oh"
-        " my."
+        " my.",
     ],
     "openai/whisper-large-v3": [
         " The first words I spoke in the original phonograph, a little piece"
@@ -73,7 +70,7 @@ EXPECTED = {
         " Junior to third base. They're going to wave him in. The throw to the"
         " plate will be late. The Mariners are going to play for the American"
         " League Championship. I don't believe it. It just continues. My, oh,"
-        " my."
+        " my.",
     ],
     "openai/whisper-large-v3-turbo": [
         " The first words I spoke in the original phonograph, a little piece"
@@ -84,8 +81,8 @@ EXPECTED = {
         " Junior to third base. They're going to wave him in. The throw to the"
         " plate will be late. The Mariners are going to play for the American"
         " League Championship. I don't believe it. It just continues. My, oh,"
-        " my."
-    ]
+        " my.",
+    ],
 }
 
 
@@ -94,17 +91,17 @@ def run_test(
     model: str,
     *,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ) -> None:
     prompt_list = PROMPTS * 10
     expected_list = EXPECTED[model] * 10
 
     with vllm_runner(
-            model,
-            dtype="half",
-            max_model_len=448,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         llm = vllm_model.llm
 
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 133d5d6ee2ef86e4a9b2fbc6b1067c7c6763097d..47852453c0585335088825e570031a300825bb7f 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -1,29 +1,40 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Helpers for building inputs that can be leveraged for different test types.
-"""
-from collections.abc import Iterable
+"""Helpers for building inputs that can be leveraged for different test types."""
+
+from collections.abc import Callable, Iterable
 from pathlib import PosixPath
-from typing import Callable, Optional, Union
+from typing import Any
 
+import numpy.typing as npt
 import torch
 
 from vllm.multimodal.audio import AudioResampler
 from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.video import (rescale_video_size, resize_video,
-                                   sample_frames_from_video)
+from vllm.multimodal.video import (
+    rescale_video_size,
+    resize_video,
+    sample_frames_from_video,
+)
 
 from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
-from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
-                    TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
-                    TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
-                    ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
-                    VLMTestInfo)
-
-
-def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
-                                                                     str],
-                             test_placeholder: str) -> str:
+from .types import (
+    SINGLE_AUDIO_BASE_PROMPT,
+    SINGLE_IMAGE_BASE_PROMPTS,
+    TEST_AUDIO_PLACEHOLDER,
+    TEST_IMG_PLACEHOLDER,
+    TEST_VIDEO_PLACEHOLDER,
+    VIDEO_BASE_PROMPT,
+    ImageSizeWrapper,
+    PromptWithMultiModalInput,
+    SizeType,
+    VLMTestInfo,
+)
+
+
+def replace_test_placeholder(
+    prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
+) -> str:
     """Given a prompt, replaces each test placeholder with the
     model-specific tag.
     """
@@ -35,11 +46,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
     return img_prompt
 
 
-def get_model_prompts(base_prompts: Iterable[str],
-                      img_idx_to_prompt: Optional[Callable[[int], str]],
-                      video_idx_to_prompt: Optional[Callable[[int], str]],
-                      audio_idx_to_prompt: Optional[Callable[[int], str]],
-                      prompt_formatter: Callable[[str], str]) -> list[str]:
+def get_model_prompts(
+    base_prompts: Iterable[str],
+    img_idx_to_prompt: Callable[[int], str] | None,
+    video_idx_to_prompt: Callable[[int], str] | None,
+    audio_idx_to_prompt: Callable[[int], str] | None,
+    prompt_formatter: Callable[[str], str],
+) -> list[str]:
     """Given a model-agnostic base prompt and test configuration for a model(s)
     to be tested, update the media placeholders and apply the prompt formatting
     to get the test prompt string for this model.
@@ -56,19 +69,19 @@ def get_model_prompts(base_prompts: Iterable[str],
         # Replace the multimodal placeholders in the base prompt with
         # the correct ones for the model that we are testing
         if img_idx_to_prompt:
-            base_prompt = replace_test_placeholder(base_prompt,
-                                                   img_idx_to_prompt,
-                                                   TEST_IMG_PLACEHOLDER)
+            base_prompt = replace_test_placeholder(
+                base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
+            )
 
         if video_idx_to_prompt:
-            base_prompt = replace_test_placeholder(base_prompt,
-                                                   video_idx_to_prompt,
-                                                   TEST_VIDEO_PLACEHOLDER)
+            base_prompt = replace_test_placeholder(
+                base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
+            )
 
         if audio_idx_to_prompt:
-            base_prompt = replace_test_placeholder(base_prompt,
-                                                   audio_idx_to_prompt,
-                                                   TEST_AUDIO_PLACEHOLDER)
+            base_prompt = replace_test_placeholder(
+                base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
+            )
 
         # Apply the prompt formatter to wrap the base prompt with
         # the correct media placeholders to get the model test prompt
@@ -81,17 +94,18 @@ def build_single_image_inputs_from_test_info(
     test_info: VLMTestInfo,
     image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
-    tmp_path: Optional[PosixPath] = None,
+    tmp_path: PosixPath | None = None,
 ) -> list[PromptWithMultiModalInput]:
     if test_info.prompt_formatter is None:
-        raise ValueError(
-            "Prompt formatter must be set to build single image inputs")
+        raise ValueError("Prompt formatter must be set to build single image inputs")
 
-    model_prompts = get_model_prompts(test_info.single_image_prompts,
-                                      test_info.img_idx_to_prompt,
-                                      test_info.video_idx_to_prompt,
-                                      test_info.audio_idx_to_prompt,
-                                      test_info.prompt_formatter)
+    model_prompts = get_model_prompts(
+        test_info.single_image_prompts,
+        test_info.img_idx_to_prompt,
+        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
+        test_info.prompt_formatter,
+    )
 
     # For models that require a local path / URL encoded in the image; export
     # assets and encode into tmp_path for this test. This should be avoided
@@ -110,8 +124,8 @@ def build_single_image_inputs_from_test_info(
 
 
 def build_single_image_inputs(
-        images, model_prompts,
-        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
+    images, model_prompts, size_wrapper: ImageSizeWrapper
+) -> list[PromptWithMultiModalInput]:
     # For every image / prompt pair, get a pair containing two lists of
     # length size_factors, where the first contains duplicates of the model
     # prompt [str], and the second contains copies of the image after being
@@ -125,7 +139,8 @@ def build_single_image_inputs(
                 apply_image_size_scaling(image, size, size_wrapper.type)
                 for size in size_wrapper.data
             ],
-        ) for image, prompt in zip(images, model_prompts)
+        )
+        for image, prompt in zip(images, model_prompts)
     ]
 
 
@@ -133,17 +148,18 @@ def build_multi_image_inputs_from_test_info(
     test_info: VLMTestInfo,
     image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
-    tmp_path: Optional[PosixPath] = None,
+    tmp_path: PosixPath | None = None,
 ) -> list[PromptWithMultiModalInput]:
     if test_info.prompt_formatter is None:
-        raise ValueError(
-            "Prompt formatter must be set to build multi image inputs")
+        raise ValueError("Prompt formatter must be set to build multi image inputs")
 
-    model_prompts = get_model_prompts([test_info.multi_image_prompt],
-                                      test_info.img_idx_to_prompt,
-                                      test_info.video_idx_to_prompt,
-                                      test_info.audio_idx_to_prompt,
-                                      test_info.prompt_formatter)
+    model_prompts = get_model_prompts(
+        [test_info.multi_image_prompt],
+        test_info.img_idx_to_prompt,
+        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
+        test_info.prompt_formatter,
+    )
 
     if test_info.prompt_path_encoder is not None:
         if tmp_path is None:
@@ -164,16 +180,20 @@ def build_multi_image_inputs_from_test_info(
 
 
 def build_multi_image_inputs(
-        image_lists, model_prompts,
-        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
+    image_lists, model_prompts, size_wrapper: ImageSizeWrapper
+) -> list[PromptWithMultiModalInput]:
     return [
         PromptWithMultiModalInput(
             prompts=[prompt for _ in size_wrapper.data],
-            image_data=[[
-                apply_image_size_scaling(image, size, size_wrapper.type)
-                for image in images
-            ] for size in size_wrapper.data],
-        ) for images, prompt in zip(image_lists, model_prompts)
+            image_data=[
+                [
+                    apply_image_size_scaling(image, size, size_wrapper.type)
+                    for image in images
+                ]
+                for size in size_wrapper.data
+            ],
+        )
+        for images, prompt in zip(image_lists, model_prompts)
     ]
 
 
@@ -185,10 +205,10 @@ def build_embedding_inputs_from_test_info(
     # These conditions will always be true if invoked through filtering,
     # but we still check them in case this is ever called directly
     if test_info.prompt_formatter is None:
-        raise ValueError(
-            "Prompt formatter must be set to build image embedding inputs")
-    if size_wrapper.type != SizeType.SIZE_FACTOR or not \
-            all(factor == 1.0 for factor in size_wrapper.data):
+        raise ValueError("Prompt formatter must be set to build image embedding inputs")
+    if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
+        factor == 1.0 for factor in size_wrapper.data
+    ):
         raise ValueError("Embedding tests require constant (1.0) size factors")
     if test_info.convert_assets_to_embeddings is None:
         raise ValueError("No conversion func for getting embeddings found")
@@ -209,8 +229,7 @@ def build_embedding_inputs_from_test_info(
     assert len(images) == len(model_prompts)
 
     inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
-    vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
-                                                size_wrapper)
+    vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
     return inputs, vllm_embeddings
 
 
@@ -219,6 +238,7 @@ def build_video_inputs_from_test_info(
     video_assets: VideoTestAssets,
     size_wrapper: ImageSizeWrapper,
     num_frames: int,
+    needs_video_metadata: bool,
 ) -> list[PromptWithMultiModalInput]:
     if test_info.prompt_formatter is None:
         raise ValueError("Prompt formatter must be set to build video inputs")
@@ -231,25 +251,48 @@ def build_video_inputs_from_test_info(
     )
 
     sampled_vids = [
-        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        sample_frames_with_video_metadata(
+            (asset.np_ndarrays, asset.metadata),
+            num_frames,
+        )
         for asset in video_assets
     ]
 
-    video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
-                    else rescale_video_size)
+    video_scaler = (
+        resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
+    )
 
     return [
         PromptWithMultiModalInput(
             prompts=[prompt for _ in size_wrapper.data],
             video_data=[
-                video_scaler(video, size) for size in size_wrapper.data
+                (
+                    video_scaler(video, size)
+                    if not needs_video_metadata
+                    else (video_scaler(video, size), meta)
+                )
+                for size in size_wrapper.data
             ],
-        ) for video, prompt in zip(sampled_vids, model_prompts)
+        )
+        for (video, meta), prompt in zip(sampled_vids, model_prompts)
     ]
 
 
-def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
-                             size_type: SizeType):
+def sample_frames_with_video_metadata(
+    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
+    num_frames: int,
+) -> tuple[npt.NDArray, dict[str, Any]]:
+    video, meta = video_with_meta
+    video = sample_frames_from_video(video, num_frames)
+
+    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
+    meta["total_num_frames"] = num_frames
+    meta["fps"] = meta["duration"] / num_frames
+    meta["frames_indices"] = list(range(num_frames))
+    return video, meta
+
+
+def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
     """Applies a size scaler to one image; this can be an image size factor,
     which scales the image while maintaining the aspect ratio"""
     # Special case for embeddings; if it's a tensor, it's only valid if we
@@ -285,13 +328,16 @@ def build_audio_inputs_from_test_info(
         method="librosa",
     )
     audios = [asset.audio_and_sample_rate for asset in audio_assets]
-    resampled_audios = [(
-        resampler.resample(
-            audio,
-            orig_sr=sr,
-        ),
-        int(resampler.target_sr),
-    ) for audio, sr in audios]
+    resampled_audios = [
+        (
+            resampler.resample(
+                audio,
+                orig_sr=sr,
+            ),
+            int(resampler.target_sr),
+        )
+        for audio, sr in audios
+    ]
 
     return [
         PromptWithMultiModalInput(
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index 1edb512135343c386bdc8dcbf701897c6e9cb212..d42150bcbf672b82d5b979de52c539d9fc569ff2 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -4,19 +4,28 @@
 modality, getting all combinations (similar to pytest's parametrization),
 handling multimodal placeholder substitution, and so on.
 """
+
 import itertools
 from collections import OrderedDict
 from collections.abc import Iterable
 
 import pytest
 
-from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
-                    ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
+from .types import (
+    EMBEDDING_SIZE_FACTORS,
+    ExpandableVLMTestArgs,
+    ImageSizeWrapper,
+    SizeType,
+    VLMTestInfo,
+    VLMTestType,
+)
 
 
 def get_filtered_test_settings(
-        test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
-        new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
+    test_settings: dict[str, VLMTestInfo],
+    test_type: VLMTestType,
+    new_proc_per_test: bool,
+) -> dict[str, VLMTestInfo]:
     """Given the dict of potential test settings to run, return a subdict
     of tests who have the current test type enabled with the matching val for
     fork_per_test.
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
     def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
         return test_info.test_type == test_type or (
             isinstance(test_info.test_type, Iterable)
-            and test_type in test_info.test_type)
+            and test_type in test_info.test_type
+        )
 
     matching_tests = {}
     for test_name, test_info in test_settings.items():
@@ -36,68 +46,77 @@ def get_filtered_test_settings(
                 assert test_info.convert_assets_to_embeddings is not None
             # Custom test inputs need to explicitly define the mm limit/inputs
             if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
-                assert (test_info.custom_test_opts is not None
-                        and isinstance(test_info.custom_test_opts, Iterable))
+                assert test_info.custom_test_opts is not None and isinstance(
+                    test_info.custom_test_opts, Iterable
+                )
             # For all types besides custom inputs, we need a prompt formatter
             else:
                 assert test_info.prompt_formatter is not None
 
             # Everything looks okay; keep if this is correct proc handling
-            if (test_info.distributed_executor_backend
-                    is not None) == new_proc_per_test:
+            if (
+                test_info.distributed_executor_backend is not None
+            ) == new_proc_per_test:
                 matching_tests[test_name] = test_info
 
     return matching_tests
 
 
-def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
-                             test_type: VLMTestType,
-                             create_new_process_for_each_test: bool):
+def get_parametrized_options(
+    test_settings: dict[str, VLMTestInfo],
+    test_type: VLMTestType,
+    create_new_process_for_each_test: bool,
+):
     """Converts all of our VLMTestInfo into an expanded list of parameters.
     This is similar to nesting pytest parametrize calls, but done directly
     through an itertools product so that each test can set things like
     size factors etc, while still running in isolated test cases.
     """
     matching_tests = get_filtered_test_settings(
-        test_settings, test_type, create_new_process_for_each_test)
+        test_settings, test_type, create_new_process_for_each_test
+    )
 
     # Ensure that something is wrapped as an iterable it's not already
-    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
+    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
 
     def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
         # This is essentially the same as nesting a bunch of mark.parametrize
         # decorators, but we do it programmatically to allow overrides for on
         # a per-model basis, while still being able to execute each of these
         # as individual test cases in pytest.
-        iter_kwargs = OrderedDict([
-            ("model", ensure_wrapped(test_info.models)),
-            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
-            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
-            ("dtype", ensure_wrapped(test_info.dtype)),
-            ("distributed_executor_backend",
-             ensure_wrapped(test_info.distributed_executor_backend)),
-        ])
+        iter_kwargs = OrderedDict(
+            [
+                ("model", ensure_wrapped(test_info.models)),
+                ("max_tokens", ensure_wrapped(test_info.max_tokens)),
+                ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
+                ("dtype", ensure_wrapped(test_info.dtype)),
+                (
+                    "distributed_executor_backend",
+                    ensure_wrapped(test_info.distributed_executor_backend),
+                ),
+            ]
+        )
 
         # num_frames is video only
         if test_type == VLMTestType.VIDEO:
-            iter_kwargs["num_video_frames"] = ensure_wrapped(
-                test_info.num_video_frames)
+            iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
+            iter_kwargs["needs_video_metadata"] = ensure_wrapped(
+                test_info.needs_video_metadata
+            )
 
         # No sizes passed for custom inputs, since inputs are directly provided
         if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
             wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
             if wrapped_sizes is None:
-                raise ValueError(
-                    f"Sizes must be set for test type {test_type}")
+                raise ValueError(f"Sizes must be set for test type {test_type}")
             iter_kwargs["size_wrapper"] = wrapped_sizes
 
-        #Otherwise expand the custom test options instead
+        # Otherwise expand the custom test options instead
         elif test_type == VLMTestType.CUSTOM_INPUTS:
             if test_info.custom_test_opts is None:
                 raise ValueError("Test has type CUSTOM_INPUTS, but none given")
             iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
 
-        # yapf: disable
         # Wrap all model cases in a pytest parameter & pass marks through
         return [
             pytest.param(
@@ -105,10 +124,10 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
                 ExpandableVLMTestArgs(
                     **{k: v for k, v in zip(iter_kwargs.keys(), case)}
                 ),
-                marks=test_info.marks if test_info.marks is not None else []
-            ) for case in list(itertools.product(*iter_kwargs.values()))
+                marks=test_info.marks if test_info.marks is not None else [],
+            )
+            for case in list(itertools.product(*iter_kwargs.values()))
         ]
-        # yapf: enable
 
     # Get a list per model type, where each entry contains a tuple of all of
     # that model type's cases, then flatten them into the top level so that
@@ -121,8 +140,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
 
 
 def get_wrapped_test_sizes(
-        test_info: VLMTestInfo,
-        test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
+    test_info: VLMTestInfo, test_type: VLMTestType
+) -> tuple[ImageSizeWrapper, ...]:
     """Given a test info which may have size factors or fixed sizes, wrap them
     and combine them into an iterable, each of which will be used in parameter
     expansion.
@@ -133,18 +152,18 @@ def get_wrapped_test_sizes(
     """
     # If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
     if test_type == VLMTestType.EMBEDDING:
-        return tuple([
-            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
-            for factor in EMBEDDING_SIZE_FACTORS
-        ])
+        return tuple(
+            [
+                ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
+                for factor in EMBEDDING_SIZE_FACTORS
+            ]
+        )
     # Audio and Custom inputs have preprocessed inputs
     elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
         return tuple()
 
-    size_factors = test_info.image_size_factors \
-        if test_info.image_size_factors else []
-    fixed_sizes = test_info.image_sizes \
-        if test_info.image_sizes else []
+    size_factors = test_info.image_size_factors if test_info.image_size_factors else []
+    fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
 
     wrapped_factors = [
         ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
@@ -152,8 +171,7 @@ def get_wrapped_test_sizes(
     ]
 
     wrapped_sizes = [
-        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
-        for size in fixed_sizes
+        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
     ]
 
     return tuple(wrapped_factors + wrapped_sizes)
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index 11d44120b875ff83cbdb8dc4d8c0724ce23cd6f8..03ff3bcf6307bd0231d52b469f1cb82f7976145d 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -1,12 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Core test implementation to be shared across modalities."""
-from typing import Any, Callable, Optional
+
+from collections.abc import Callable
+from typing import Any
 
 import torch
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
-from vllm.config import RunnerOption
+from vllm.config.model import RunnerOption
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .....conftest import HfRunner, VllmRunner
@@ -26,21 +28,21 @@ def run_test(
     enforce_eager: bool,
     max_model_len: int,
     max_num_seqs: int,
-    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
-    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
+    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
+    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
     auto_cls: type[_BaseAutoModelClass],
     use_tokenizer_eos: bool,
     comparator: Callable[..., None],
-    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
-    stop_str: Optional[list[str]],
+    get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None,
+    stop_str: list[str] | None,
     limit_mm_per_prompt: dict[str, int],
-    vllm_runner_kwargs: Optional[dict[str, Any]],
-    hf_model_kwargs: Optional[dict[str, Any]],
-    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
+    vllm_runner_kwargs: dict[str, Any] | None,
+    hf_model_kwargs: dict[str, Any] | None,
+    patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
     runner: RunnerOption = "auto",
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
     tensor_parallel_size: int = 1,
-    vllm_embeddings: Optional[torch.Tensor] = None,
+    vllm_embeddings: torch.Tensor | None = None,
 ):
     """Modality agnostic test executor for comparing HF/vLLM outputs."""
     # In the case of embeddings, vLLM takes separate input tensors
@@ -69,23 +71,25 @@ def run_test(
         vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
     if model_info.hf_overrides:
         vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
-    if model_info.skip_tokenizer_init:
-        vllm_runner_kwargs_[
-            "skip_tokenizer_init"] = model_info.skip_tokenizer_init
+    if model_info.require_embed_inputs:
+        for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
+            vllm_runner_kwargs_[k] = model_info.require_embed_inputs
 
     if vllm_runner_kwargs:
         vllm_runner_kwargs_.update(vllm_runner_kwargs)
 
-    with vllm_runner(model,
-                     max_model_len=max_model_len,
-                     max_num_seqs=max_num_seqs,
-                     dtype=dtype,
-                     limit_mm_per_prompt=limit_mm_per_prompt,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=enforce_eager,
-                     runner=runner,
-                     **vllm_runner_kwargs_) as vllm_model:
+    with vllm_runner(
+        model,
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        dtype=dtype,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=enforce_eager,
+        runner=runner,
+        **vllm_runner_kwargs_,
+    ) as vllm_model:
         tokenizer = vllm_model.llm.get_tokenizer()
 
         vllm_kwargs: dict[str, Any] = {}
@@ -95,21 +99,19 @@ def run_test(
             vllm_kwargs["stop"] = stop_str
 
         for prompts, image_data, video_data, audio_data in vllm_inputs:
-            mm_data = dict(images=image_data,
-                           videos=video_data,
-                           audios=audio_data)
+            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
             vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
             vllm_output = vllm_model.generate_greedy_logprobs(
                 prompts,
                 max_tokens,
                 num_logprobs=num_logprobs,
-                **vllm_kwargs_with_mm_data)
+                **vllm_kwargs_with_mm_data,
+            )
             vllm_outputs_per_mm.append(vllm_output)
 
-    hf_model = hf_runner(model,
-                         dtype=dtype,
-                         auto_cls=auto_cls,
-                         model_kwargs=hf_model_kwargs)
+    hf_model = hf_runner(
+        model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
+    )
 
     # Some models need to patch things like the model processor, e.g., internvl
     if patch_hf_runner is not None:
@@ -129,16 +131,15 @@ def run_test(
             hf_kwargs["stop_strings"] = stop_str
 
         for prompts, image_data, video_data, audio_data in inputs:
-            mm_data = dict(images=image_data,
-                           videos=video_data,
-                           audios=audio_data)
+            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
             hf_kwargs_with_mm_data = hf_kwargs | mm_data
             hf_output = hf_model.generate_greedy_logprobs_limit(
                 prompts,
                 max_tokens,
                 num_logprobs=num_logprobs,
                 tokenizer=tokenizer,
-                **hf_kwargs_with_mm_data)
+                **hf_kwargs_with_mm_data,
+            )
             hf_outputs_per_mm.append(hf_output)
 
     # Apply output processing / sanitation to the vLLM and HF runner results
@@ -150,8 +151,7 @@ def run_test(
         second_runner_processor=vllm_output_post_proc,
     )
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
-                                        vllm_outputs_per_mm):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
         # This is usually check_logprobs_close, but it's passed through to
         # allow things like check_outputs_equal where needed
         comparator(
@@ -171,15 +171,19 @@ def process_runner_outputs(
 ):
     """Applies the runner processor(s) to the runner outputs, if any."""
     if first_runner_processor is not None:
-        first_runner_outputs = process_outputs(first_runner_processor, model,
-                                               first_runner_outputs)
+        first_runner_outputs = process_outputs(
+            first_runner_processor, model, first_runner_outputs
+        )
     if second_runner_processor is not None:
-        second_runner_outputs = process_outputs(second_runner_processor, model,
-                                                second_runner_outputs)
+        second_runner_outputs = process_outputs(
+            second_runner_processor, model, second_runner_outputs
+        )
     return first_runner_outputs, second_runner_outputs
 
 
 def process_outputs(output_processor, model, outputs_per_image):
     """Applies a model specific post-processor function to a runner's output"""
-    return [[output_processor(res, model) for res in outputs]
-            for outputs in outputs_per_image]
+    return [
+        [output_processor(res, model) for res in outputs]
+        for outputs in outputs_per_image
+    ]
diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
index e369416fc49ccda5a7d5c09bd6cab3f9b2ba372a..8c9c390911bdc43f63ba21537b41c127927a2a48 100644
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
-from typing import Callable
+
+from collections.abc import Callable
 
 from vllm.assets.image import ImageAsset
 from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.video import (rescale_video_size, resize_video,
-                                   sample_frames_from_video)
+from vllm.multimodal.video import (
+    rescale_video_size,
+    resize_video,
+    sample_frames_from_video,
+)
 
 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
 
 def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
     """Builds inputs for multi-image (varied sizes/aspect ratio) testing.
-    
+
     Args:
         formatter: model-specific prompt formatter.
     """
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
             stop_sign,
             rescale_image_size(stop_sign, 0.25),
             cherry_blossom.resize((183, 488)),
-            cherry_blossom.resize((488, 183))
+            cherry_blossom.resize((488, 183)),
         ],
         cherry_blossom,
     ]
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
     ]
 
 
-def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
-                                          num_frames: int = 16):
+def multi_video_multi_aspect_ratio_inputs(
+    formatter: Callable[[str], str], num_frames: int = 16
+):
     """Builds inputs for multi-video (varied sizes/aspect ratio) testing.
-    
+
     Args:
         formatter: model-specific prompt formatter.
     """
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
             video,
             rescale_video_size(video, 0.25),
             resize_video(video, (183, 488)),
-            resize_video(video, (488, 183))
+            resize_video(video, (488, 183)),
         ],
         video,
     ]
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
 
 def different_patch_input_cases_internvl():
     images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
-    formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
+    formatter = (
+        lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
+    )
     single_img_prompts = [
         "<image>\nWhat's the content in the center of the image?",
         "<image>\nWhat is the season?",
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
 
 
 def windows_attention_image_qwen2_5_vl():
-
     # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
     image = ImageAsset("hato").pil_image
 
     question = "Describe the image."
     img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
-    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
+    )
 
     wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
     return build_single_image_inputs([image], [prompt], wrapped_sf)
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
     formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
 
     scales = [0.1, 0.2, 0.25]
-    video_input = [[(rescale_video_size(video_array, scale), metadata)]
-                   for scale in scales]
+    video_input = [
+        [(rescale_video_size(video_array, scale), metadata)] for scale in scales
+    ]
     prompts = [formatted_prompt] * len(video_input)
 
     return [
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index ba55450ec8a90f33070071ee3c74f2662e286835..87cd5c3cd3554490b02c3f09637b621f483d2753 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -4,9 +4,9 @@
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
 """
+
 import types
 from pathlib import PosixPath
-from typing import Optional, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -15,20 +15,24 @@ import pytest
 import regex as re
 import torch
 from PIL.Image import Image
-from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
-                          GenerationConfig, GenerationMixin)
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    BatchFeature,
+    GenerationConfig,
+    GenerationMixin,
+)
 from transformers.video_utils import VideoMetadata
 
 from vllm.logprobs import SampleLogprobs
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
 
 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
 from .types import RunnerOutput
 
 
 ####### vLLM output processors functions
-def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
     """Sanitize vllm output [blip2 models] to be comparable with hf output."""
     _, output_str, out_logprobs = vllm_output
 
@@ -42,8 +46,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
-def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
-                           model: str) -> RunnerOutput:
+def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
     """Sanitize vllm output [fuyu models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -53,8 +56,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
 
 
 def qwen_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize vllm output [qwen models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -64,8 +67,8 @@ def qwen_vllm_to_hf_output(
 
 
 def qwen2_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -75,8 +78,8 @@ def qwen2_vllm_to_hf_output(
 
 
 def kimiv_vl_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -85,23 +88,25 @@ def kimiv_vl_vllm_to_hf_output(
     return output_ids, hf_output_str, out_logprobs
 
 
-def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                  model: str) -> RunnerOutput:
+def llava_image_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str
+) -> RunnerOutput:
     config = AutoConfig.from_pretrained(model)
     mm_token_id = config.image_token_index
     return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
 
 
 def llava_video_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, SampleLogprobs | None]:
     config = AutoConfig.from_pretrained(model)
     mm_token_id = config.video_token_index
     return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
 
 
-def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
-                             mm_token_id: int) -> RunnerOutput:
+def _llava_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str, mm_token_id: int
+) -> RunnerOutput:
     """Sanitize vllm output [Llava models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -109,7 +114,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
         if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
     ]
 
@@ -128,8 +134,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
     return config.to_dict()
 
 
-def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                      model: str) -> RunnerOutput:
+def llava_onevision_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str
+) -> RunnerOutput:
     """Sanitize vllm output [llava-onevision] to compare with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -140,7 +147,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
         if token_id != video_token_id or output_ids[idx - 1] != video_token_id
     ]
 
@@ -151,8 +159,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
-def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
     """Sanitize vllm output [mantis] to compare with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -161,8 +168,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
     return output_ids, hf_output_str, out_logprobs
 
 
-def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
     """Sanitize vllm output [phi3v] to be comparable with hf output."""
     _, output_str, out_logprobs = vllm_output
 
@@ -180,8 +186,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
-def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                model: str) -> RunnerOutput:
+def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
     """Sanitize vllm output to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -192,7 +197,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
 
@@ -205,46 +211,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
 
 
 ####### Post-processors for HF outputs
-def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
-                                model: str) -> RunnerOutput:
+def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
     if output_str.endswith("<｜end▁of▁sentence｜>"):
         output_str = output_str.split("<｜end▁of▁sentence｜>")[0]
     return output_ids, output_str, out_logprobs
 
 
-def idefics3_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
     if output_str.endswith("<end_of_utterance>"):
         output_str = output_str.split("<end_of_utterance>")[0]
     return output_ids, output_str, out_logprobs
 
 
-def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
     # Based on Idefics3
     return idefics3_trunc_hf_output(hf_output, model)
 
 
-def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
     if output_str.endswith("<|eot_id|>"):
         output_str = output_str.split("<|eot_id|>")[0]
     return output_ids, output_str, out_logprobs
 
 
-def minimax_vl_01_hf_output(hf_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
     if output_str.endswith("<end_of_sentence>"):
         output_str = output_str.split("<end_of_sentence>")[0]
     return output_ids, output_str, out_logprobs
 
 
-def ultravox_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
 
     tokenizer = AutoTokenizer.from_pretrained(model)
@@ -262,8 +262,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
 
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-        tmp_path: PosixPath, prompt: str,
-        assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
+    tmp_path: PosixPath, prompt: str, assets: list[ImageAsset] | ImageTestAssets
+) -> str:
     """Given a temporary dir path, export one or more image assets into the
     tempdir & replace its contents with the local path to the string so that
     the HF version of Qwen-VL can resolve the path and load the image in its
@@ -313,8 +313,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         return BatchFeature(data=inputs, tensor_type="pt")
 
     hf_model.processor = processor
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language.model.embed_tokens
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language.model.embed_tokens
+    )
     return hf_model
 
 
@@ -340,6 +341,29 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     return hf_model
 
 
+def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
+    """Sanitize vllm output [gemma-3] to compare with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    config = AutoConfig.from_pretrained(model)
+    image_token_id = config.image_token_id
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+
+    hf_output_ids = [
+        token_id
+        for idx, token_id in enumerate(output_ids)
+        if token_id != image_token_id
+    ]
+
+    hf_output_str = output_str
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
 def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4V."""
     hf_processor = hf_model.processor
@@ -357,11 +381,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         assert len(contents) == len(images)
 
         return hf_processor.apply_chat_template(
-            [{
-                "role": "user",
-                "image": image,
-                "content": content
-            } for image, content in zip(images, contents)],
+            [
+                {"role": "user", "image": image, "content": content}
+                for image, content in zip(images, contents)
+            ],
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
@@ -369,8 +392,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         )
 
     hf_model.processor = processor
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.transformer.output_layer
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.transformer.output_layer
+    )
     return hf_model
 
 
@@ -387,10 +411,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         else:
             video_metadata = None
 
-        return hf_processor(*args,
-                            videos=videos,
-                            video_metadata=video_metadata,
-                            **kwargs)
+        return hf_processor(
+            *args, videos=videos, video_metadata=video_metadata, **kwargs
+        )
 
     hf_model.processor = processor
     return hf_model
@@ -406,8 +429,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             self.num_image_token = hf_runner.model.num_image_token
             self.tokenizer = hf_runner.tokenizer
 
-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
             self.vision_config = self.config.vision_config
             self.use_thumbnail = self.config.use_thumbnail
             self.use_msac = self.config.use_msac
@@ -415,13 +439,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
 
-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
-            # yapf: disable
+        def __call__(self, text: str, images: Image | list[Image], **kwargs):
             from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_h2ovl,
+            )
 
-            # yapf: enable
             images = [images] if isinstance(images, Image) else images
             pixel_values = [
                 image_to_pixel_values_h2ovl(
@@ -431,29 +456,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                     max_num=self.max_num,
                     use_thumbnail=self.use_thumbnail,
                     use_msac=self.use_msac,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                )
+                for image in images
             ]
+            num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
             pixel_values = torch.cat(pixel_values, dim=0)
             for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                 image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+                text = text.replace("<image>", image_tokens, 1)
             prompt = self.tokenizer(text, return_tensors="pt")
             prompt.update({"pixel_values": pixel_values})
             return prompt
 
-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     hf_model.model.img_context_token_id = img_context_token_id
     hf_model.processor = H2OVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
     return hf_model
 
 
@@ -467,19 +489,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             self.num_image_token = hf_runner.model.num_image_token
             self.tokenizer = hf_runner.tokenizer
 
-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
             self.vision_config = self.config.vision_config
             self.use_thumbnail = self.config.use_thumbnail
             self.min_num = self.config.min_dynamic_patch
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
 
-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(self, text: str, images: Image | list[Image], **kwargs):
             from vllm.model_executor.models.skyworkr1v import (
-                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_skyworkr1v)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_skyworkr1v,
+            )
+
             images = [images] if isinstance(images, Image) else images
             pixel_values = [
                 image_to_pixel_values_skyworkr1v(
@@ -488,29 +514,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                     min_num=self.min_num,
                     max_num=self.max_num,
                     use_thumbnail=self.use_thumbnail,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                )
+                for image in images
             ]
+            num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
             pixel_values = torch.cat(pixel_values, dim=0)
             for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                 image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+                text = text.replace("<image>", image_tokens, 1)
             prompt = self.tokenizer(text, return_tensors="pt")
             prompt.update({"pixel_values": pixel_values})
             return prompt
 
-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     hf_model.model.img_context_token_id = img_context_token_id
     hf_model.processor = SkyworkR1VProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
     return hf_model
 
 
@@ -524,8 +547,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             self.num_image_token = hf_runner.model.num_image_token
             self.tokenizer = hf_runner.tokenizer
 
-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
             self.vision_config = self.config.vision_config
             self.use_thumbnail = self.config.use_thumbnail
             self.min_num = self.config.min_dynamic_patch
@@ -535,13 +559,18 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         def __call__(
             self,
             text: str,
-            images: Union[Image, list[Image]] = None,
-            videos: Union[npt.NDArray, list[npt.NDArray]] = None,
+            images: Image | list[Image] = None,
+            videos: npt.NDArray | list[npt.NDArray] = None,
             **kwargs,
         ):
             from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_internvl, video_to_pixel_values_internvl)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_internvl,
+                video_to_pixel_values_internvl,
+            )
+
             images = [images] if isinstance(images, Image) else images
             videos = [videos] if isinstance(videos, np.ndarray) else videos
             if images is not None:
@@ -552,7 +581,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                         min_num=self.min_num,
                         max_num=self.max_num,
                         use_thumbnail=self.use_thumbnail,
-                    ) for image in images
+                    )
+                    for image in images
                 ]
                 num_patches_images = [
                     pixel_value.shape[0] for pixel_value in pixel_values_images
@@ -568,7 +598,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                         min_num=1,
                         max_num=1,
                         use_thumbnail=False,
-                    ) for video in videos
+                    )
+                    for video in videos
                 ]
                 num_patches_videos = [
                     pixel_value.shape[0] for pixel_value in pixel_values_videos
@@ -580,38 +611,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             while ("<image>" in text) or ("<video>" in text):
                 image_index = text.find("<image>")
                 video_index = text.find("<video>")
-                if image_index == -1 or (video_index > -1
-                                         and video_index < image_index):
+                if image_index == -1 or (
+                    video_index > -1 and video_index < image_index
+                ):
                     num_patches = num_patches_videos.pop(0)
                     pixel_values.append(pixel_values_videos.pop(0))
-                    context_tokens = IMG_START + \
-                        IMG_CONTEXT * self.num_image_token + IMG_END
-                    video_tokens = ''.join([
-                        f'Frame{i+1}: {context_tokens}'
-                        for i in range(num_patches)
-                    ])
-                    text = text.replace('<video>', video_tokens, 1)
+                    context_tokens = (
+                        IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
+                    )
+                    video_tokens = "".join(
+                        [f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
+                    )
+                    text = text.replace("<video>", video_tokens, 1)
                 else:
                     num_patches = num_patches_images.pop(0)
                     pixel_values.append(pixel_values_images.pop(0))
-                    context_tokens = IMG_CONTEXT * self.num_image_token \
-                        * num_patches
+                    context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                     image_tokens = IMG_START + context_tokens + IMG_END
-                    text = text.replace('<image>', image_tokens, 1)
+                    text = text.replace("<image>", image_tokens, 1)
             pixel_values = torch.cat(pixel_values, dim=0)
 
             prompt = self.tokenizer(text, return_tensors="pt")
             prompt.update({"pixel_values": pixel_values})
             return prompt
 
-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     hf_model.model.img_context_token_id = img_context_token_id
     hf_model.processor = InternVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
     return hf_model
 
 
@@ -619,7 +649,7 @@ def _internvl_generate(
     self,
     pixel_values: torch.FloatTensor,
     input_ids: torch.FloatTensor,
-    attention_mask: Optional[torch.LongTensor] = None,
+    attention_mask: torch.LongTensor | None = None,
     **generate_kwargs,
 ) -> torch.LongTensor:
     """Generate method for InternVL2 model without fixed use_cache."""
@@ -631,7 +661,7 @@ def _internvl_generate(
     input_embeds = input_embeds.reshape(B * N, C)
 
     input_ids = input_ids.reshape(B * N)
-    selected = (input_ids == self.img_context_token_id)
+    selected = input_ids == self.img_context_token_id
     assert selected.sum() != 0
     input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
 
@@ -778,8 +808,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 
 def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.llm.get_output_embeddings()
+    )
 
     def processor(*args, text="", images=None, **kwargs):
         text_tokenizer = hf_model.model.get_text_tokenizer()
@@ -787,8 +818,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 
         prompt_start_and_end = {
             "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
-            "llama":
-            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
             "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
         }
         for start, end in prompt_start_and_end.values():
@@ -797,7 +827,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                 break
 
         prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
-            text_or_conversations=text, images=images)
+            text_or_conversations=text, images=images
+        )
         attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
 
         inputs = {
@@ -813,8 +844,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 
 def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.llm.get_output_embeddings()
+    )
 
     def processor(*args, text="", images=None, videos=None, **kwargs):
         if images is None:
@@ -825,13 +857,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             videos = []
         else:
             videos = [videos] if isinstance(videos, np.ndarray) else videos
-            videos = [[PIL.Image.fromarray(frame) for frame in vid]
-                      for vid in videos]
+            videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
 
         prompt_start_and_end = {
             "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
-            "llama":
-            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
             "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
         }
         for start, end in prompt_start_and_end.values():
@@ -842,21 +872,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         images_message = [{"type": "image", "image": img} for img in images]
         videos_message = [{"type": "video", "video": vid} for vid in videos]
 
-        messages = [{
-            "role":
-            "user",
-            "content": [
-                *images_message,
-                *videos_message,
-                {
-                    "type": "text",
-                    "text": text
-                },
-            ],
-        }]
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    *images_message,
+                    *videos_message,
+                    {"type": "text", "text": text},
+                ],
+            }
+        ]
 
         input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
-            messages=messages, enable_thinking=True)
+            messages=messages, enable_thinking=True
+        )
         inputs = {
             "inputs": input_ids,
             "pixel_values": pixel_values,
@@ -876,6 +905,54 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     return hf_model
 
 
+def qwen3_vl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for GLM4.1V."""
+    hf_processor = hf_model.processor
+
+    def processor(*args, videos=None, **kwargs):
+        if videos is not None and is_list_of(videos, tuple):
+            # batched multi videos
+            do_sample_frames = {video[1]["do_sample_frames"] for video in videos}
+            assert len(do_sample_frames) == 1
+            if kwargs.get("do_sample_frames") is None:
+                kwargs["do_sample_frames"] = do_sample_frames
+            video_metadata = [
+                [
+                    VideoMetadata(
+                        **{k: v for k, v in video[1].items() if k != "do_sample_frames"}
+                    )
+                ]
+                for video in videos
+            ]
+            videos = [[video[0]] for video in videos]
+        elif videos is not None and isinstance(videos, tuple):
+            # single video
+            do_sample_frames = videos[1]["do_sample_frames"]
+            if kwargs.get("do_sample_frames") is None:
+                kwargs["do_sample_frames"] = do_sample_frames
+            video_metadata = [
+                [
+                    VideoMetadata(
+                        **{
+                            k: v
+                            for k, v in videos[1].items()
+                            if k != "do_sample_frames"
+                        }
+                    )
+                ]
+            ]
+            videos = [[videos[0]]]
+        else:
+            video_metadata = None
+
+        return hf_processor(
+            *args, videos=videos, video_metadata=video_metadata, **kwargs
+        )
+
+    hf_model.processor = processor
+    return hf_model
+
+
 def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     from vllm.model_executor.models.tarsier import get_vision_encoder_info
 
diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py
index 562f89df13470c05e535f2f589be3b8382dd5f73..218339ef1dffbff7dd2713b89aa817c661346499 100644
--- a/tests/models/multimodal/generation/vlm_utils/runners.py
+++ b/tests/models/multimodal/generation/vlm_utils/runners.py
@@ -3,23 +3,34 @@
 """Entrypoints for wrapping the core run_test implementation for specific test
 types / modalities.
 """
+
 from pathlib import PosixPath
 
-from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
-                           VideoTestAssets, VllmRunner)
+from .....conftest import (
+    AudioTestAssets,
+    HfRunner,
+    ImageTestAssets,
+    VideoTestAssets,
+    VllmRunner,
+)
 from . import builders, core
 from .types import ExpandableVLMTestArgs, VLMTestInfo
 
 
 ####### Entrypoints for running different test types
-def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
-                          test_case: ExpandableVLMTestArgs,
-                          hf_runner: type[HfRunner],
-                          vllm_runner: type[VllmRunner],
-                          image_assets: ImageTestAssets):
+def run_single_image_test(
+    *,
+    tmp_path: PosixPath,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
     assert test_case.size_wrapper is not None
     inputs = builders.build_single_image_inputs_from_test_info(
-        model_test_info, image_assets, test_case.size_wrapper, tmp_path)
+        model_test_info, image_assets, test_case.size_wrapper, tmp_path
+    )
 
     core.run_test(
         hf_runner=hf_runner,
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"image": 1},
         distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
 
 
-def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
-                         test_case: ExpandableVLMTestArgs,
-                         hf_runner: type[HfRunner],
-                         vllm_runner: type[VllmRunner],
-                         image_assets: ImageTestAssets):
+def run_multi_image_test(
+    *,
+    tmp_path: PosixPath,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
     assert test_case.size_wrapper is not None
     inputs = builders.build_multi_image_inputs_from_test_info(
-        model_test_info, image_assets, test_case.size_wrapper, tmp_path)
+        model_test_info, image_assets, test_case.size_wrapper, tmp_path
+    )
 
     core.run_test(
         hf_runner=hf_runner,
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"image": len(image_assets)},
         distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
 
 
-def run_embedding_test(*, model_test_info: VLMTestInfo,
-                       test_case: ExpandableVLMTestArgs,
-                       hf_runner: type[HfRunner],
-                       vllm_runner: type[VllmRunner],
-                       image_assets: ImageTestAssets):
+def run_embedding_test(
+    *,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
     assert test_case.size_wrapper is not None
     inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
-        model_test_info, image_assets, test_case.size_wrapper)
+        model_test_info, image_assets, test_case.size_wrapper
+    )
 
     core.run_test(
         hf_runner=hf_runner,
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
         limit_mm_per_prompt={"image": 1},
         vllm_embeddings=vllm_embeddings,
         distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
 
 
 def run_video_test(
@@ -90,8 +113,12 @@ def run_video_test(
     assert test_case.size_wrapper is not None
     assert test_case.num_video_frames is not None
     inputs = builders.build_video_inputs_from_test_info(
-        model_test_info, video_assets, test_case.size_wrapper,
-        test_case.num_video_frames)
+        model_test_info,
+        video_assets,
+        test_case.size_wrapper,
+        test_case.num_video_frames,
+        test_case.needs_video_metadata,
+    )
 
     core.run_test(
         hf_runner=hf_runner,
@@ -103,7 +130,8 @@ def run_video_test(
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"video": len(video_assets)},
         distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
 
 
 def run_audio_test(
@@ -114,8 +142,7 @@ def run_audio_test(
     vllm_runner: type[VllmRunner],
     audio_assets: AudioTestAssets,
 ):
-    inputs = builders.build_audio_inputs_from_test_info(
-        model_test_info, audio_assets)
+    inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
 
     core.run_test(
         hf_runner=hf_runner,
@@ -127,13 +154,17 @@ def run_audio_test(
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt={"audio": 1},
         distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
 
 
-def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
-                           test_case: ExpandableVLMTestArgs,
-                           hf_runner: type[HfRunner],
-                           vllm_runner: type[VllmRunner]):
+def run_custom_inputs_test(
+    *,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+):
     # Custom test cases can provide inputs directly, but they need to
     # explicitly provided a CustomTestConfig, which wraps the inputs and
     # the limit_mm_per_prompt
@@ -155,4 +186,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
         num_logprobs=test_case.num_logprobs,
         limit_mm_per_prompt=limit_mm_per_prompt,
         distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 6a82bdfc4cf20bc0bab932468855dcb41f3f7ae3..5c1bc6ac28fe3058ea0215e85de7b49817ead9f3 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -1,23 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Types for writing multimodal model tests."""
-from collections.abc import Iterable
+
+from collections.abc import Callable, Iterable
 from enum import Enum
 from pathlib import PosixPath
-from typing import Any, Callable, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import torch
 from pytest import MarkDecorator
 from transformers import AutoModelForCausalLM
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
-from vllm.config import RunnerOption
+from vllm.config.model import RunnerOption
 from vllm.logprobs import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
-                           ImageTestAssets, PromptAudioInput, PromptImageInput,
-                           PromptVideoInput)
+from .....conftest import (
+    AUDIO_ASSETS,
+    IMAGE_ASSETS,
+    HfRunner,
+    ImageAsset,
+    ImageTestAssets,
+    PromptAudioInput,
+    PromptImageInput,
+    PromptVideoInput,
+)
 from ....utils import check_logprobs_close
 
 # meta image tag; will be replaced by the appropriate tag for the model
@@ -25,32 +33,35 @@ TEST_IMG_PLACEHOLDER = "<vlm_image>"
 TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
 TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
 
-# yapf: disable
-SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
-    "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
-})
-SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({
-    "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.",    # noqa: E501
-    "winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?",     # noqa: E501
-})
+SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
+        "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
+    }
+)
+SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
+    {
+        "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.",  # noqa: E501
+        "winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?",  # noqa: E501
+    }
+)
 
 MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n"  # noqa: E501
 VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
 
 
-IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
-EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
-RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
-# yapf: enable
+IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
+EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
+RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
 
 
 class PromptWithMultiModalInput(NamedTuple):
     """Holds the multimodal input for a single test case."""
+
     prompts: list[str]
-    image_data: Optional[PromptImageInput] = None
-    video_data: Optional[PromptVideoInput] = None
-    audio_data: Optional[PromptAudioInput] = None
+    image_data: PromptImageInput | None = None
+    video_data: PromptVideoInput | None = None
+    audio_data: PromptAudioInput | None = None
 
 
 class VLMTestType(Enum):
@@ -76,17 +87,17 @@ class ImageSizeWrapper(NamedTuple):
     type: SizeType
     # A size factor is a wrapper of 0+ floats,
     # while a fixed size contains an iterable of integer pairs
-    data: Union[Iterable[float], Iterable[tuple[int, int]]]
+    data: Iterable[float] | Iterable[tuple[int, int]]
 
 
 class VLMTestInfo(NamedTuple):
     """Holds the configuration for 1+ tests for one model architecture."""
 
     models: list[str]
-    test_type: Union[VLMTestType, Iterable[VLMTestType]]
+    test_type: VLMTestType | Iterable[VLMTestType]
 
     # Should be None only if this is a CUSTOM_INPUTS test
-    prompt_formatter: Optional[Callable[[str], str]] = None
+    prompt_formatter: Callable[[str], str] | None = None
     img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
     video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
     audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"
@@ -100,8 +111,9 @@ class VLMTestInfo(NamedTuple):
 
     # Function for converting ImageAssets to image embeddings;
     # We need to define this explicitly for embedding tests
-    convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
-                                                    list[torch.Tensor]]] = None
+    convert_assets_to_embeddings: (
+        Callable[[ImageTestAssets], list[torch.Tensor]] | None
+    ) = None
 
     # Exposed options for vLLM runner; we change these in a several tests,
     # but the defaults are derived from VllmRunner & the engine defaults
@@ -111,25 +123,25 @@ class VLMTestInfo(NamedTuple):
     max_num_seqs: int = 256
     runner: RunnerOption = "auto"
     tensor_parallel_size: int = 1
-    vllm_runner_kwargs: Optional[dict[str, Any]] = None
+    vllm_runner_kwargs: dict[str, Any] | None = None
 
     # Optional callable which gets a list of token IDs from the model tokenizer
-    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
+    get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None = None
     # Optional list of strings to stop generation, useful when stop tokens are
     # not special tokens in the tokenizer
-    stop_str: Optional[list[str]] = None
+    stop_str: list[str] | None = None
 
     # Exposed options for HF runner
-    hf_model_kwargs: Optional[dict[str, Any]] = None
+    hf_model_kwargs: dict[str, Any] | None = None
     # Indicates we should explicitly pass the EOS from the tokenizer
     use_tokenizer_eos: bool = False
     auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
-    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
+    patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None
 
     # Post processors that if defined, will run oun the outputs of the
     # vLLM and HF runner, respectively (useful for sanitization, etc).
-    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
-    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
+    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
+    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
 
     # Consumes the output of the callables above and checks if they're equal
     comparator: Callable[..., None] = check_logprobs_close
@@ -140,9 +152,10 @@ class VLMTestInfo(NamedTuple):
     max_tokens: int = 128
     num_logprobs: int = 5
     dtype: str = "auto"
-    distributed_executor_backend: Optional[str] = None
+    distributed_executor_backend: str | None = None
     # Only expanded in video tests
-    num_video_frames: int = 16
+    num_video_frames: int | tuple[int] = 16
+    needs_video_metadata: bool = False
 
     # Fixed image sizes / image size factors; most tests use image_size_factors
     # The values provided for these two fields will be stacked and expanded
@@ -150,19 +163,19 @@ class VLMTestInfo(NamedTuple):
     # once per tests (much like concatenating and wrapping in one parametrize
     # call)
     image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
-    image_sizes: Optional[Iterable[Iterable[tuple[int, int]]]] = None
+    image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None
 
     # Hack for updating a prompt to take into a local path; currently only used
     # for Qwen-VL, which requires encoding the image path / url into the prompt
     # for HF runner
-    prompt_path_encoder: Optional[
-        Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
-                 str]] = None  # noqa: E501
+    prompt_path_encoder: (
+        Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
+    ) = None  # noqa: E501
 
     # Allows configuring a test to run with custom inputs
-    custom_test_opts: Optional[list[CustomTestOptions]] = None
+    custom_test_opts: list[CustomTestOptions] | None = None
 
-    marks: Optional[list[MarkDecorator]] = None
+    marks: list[MarkDecorator] | None = None
 
     def get_non_parametrized_runner_kwargs(self):
         """Returns a dictionary of expandable kwargs for items that are used
@@ -190,14 +203,16 @@ class VLMTestInfo(NamedTuple):
 
 class ExpandableVLMTestArgs(NamedTuple):
     """The expanded kwargs which correspond to a single test case."""
+
     model: str
     max_tokens: int
     num_logprobs: int
     dtype: str
-    distributed_executor_backend: Optional[str]
+    distributed_executor_backend: str | None
     # Sizes are used for everything except for custom input tests
-    size_wrapper: Optional[ImageSizeWrapper] = None
+    size_wrapper: ImageSizeWrapper | None = None
     # Video only
-    num_video_frames: Optional[int] = None
+    num_video_frames: int | None = None
+    needs_video_metadata: bool = False
     # Custom inputs only
-    custom_test_opts: Optional[CustomTestOptions] = None
+    custom_test_opts: CustomTestOptions | None = None
diff --git a/tests/models/multimodal/pooling/test_clip.py b/tests/models/multimodal/pooling/test_clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..95c678558f4fae5fd3fd2408538cdc2dc0b42059
--- /dev/null
+++ b/tests/models/multimodal/pooling/test_clip.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import CLIPModel
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_embeddings_close
+
+HF_TEXT_PROMPTS = [
+    "a photo of a stop sign",
+    "a photo of a cherry blossom",
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "",
+        "cherry_blossom": "",
+    }
+)
+
+MODELS = ["openai/clip-vit-base-patch32"]
+
+
+def _run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    input_texts: list[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
+    ) as vllm_model:
+        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+
+    with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
+        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+
+        all_outputs = []
+        for inputs in all_inputs:
+            inputs = hf_model.wrap_device(inputs)
+
+            if "pixel_values" in inputs:
+                pooled_output = hf_model.model.get_image_features(
+                    pixel_values=inputs.pixel_values,
+                ).squeeze(0)
+            else:
+                pooled_output = hf_model.model.get_text_features(
+                    input_ids=inputs.input_ids,
+                    attention_mask=inputs.attention_mask,
+                ).squeeze(0)
+
+            all_outputs.append(pooled_output.tolist())
+
+        hf_outputs = all_outputs
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_text(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,  # type: ignore
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_text_image_no_crash(
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    texts = [HF_TEXT_PROMPTS[0]]
+    images = [image_assets[0].pil_image]
+
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
+    ) as vllm_model:
+        with pytest.raises(ValueError, match="not both"):
+            vllm_model.embed(texts, images=images)
+
+        # Should still be able to run subsequent requests
+        vllm_model.embed(texts)
+        vllm_model.embed([""], images=images)
diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
index f152ded3fb23a4fea99f9782ca6f308ea56f43d8..ac3eb6e61723d1c65837a593d38dde27cb7e0774 100644
--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable
+from collections.abc import Callable
 
 import pytest
 import torch
@@ -17,18 +17,21 @@ HF_TEXT_PROMPTS = [
     # T -> X
     (
         "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501,
-        Image.new("RGB", (56, 56))),
+        Image.new("RGB", (56, 56)),
+    ),
     # T -> X
-    ("Query: Retrieve an image of this caption: cherry blossom",
-     Image.new("RGB", (56, 56))),
+    (
+        "Query: Retrieve an image of this caption: cherry blossom",
+        Image.new("RGB", (56, 56)),
+    ),
 ]
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "What is shown in this image?",
-    "cherry_blossom":
-    "What is shown in this image?"
-})
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "What is shown in this image?",
+        "cherry_blossom": "What is shown in this image?",
+    }
+)
 
 MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
 
@@ -36,34 +39,30 @@ MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
 def get_messages(image: Image.Image, text: str, embed_text: bool):
     # assert False, 'remember to use outer [] as required'
     if embed_text:
-        messages = [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": Image.new("RGB", (56, 56)),
-                    "resized_height": 1,
-                    "resized_width": 1
-                },  # need a dummy image here for an easier process.
-                {
-                    "type": "text",
-                    "text": text
-                },
-            ]
-        }]
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": Image.new("RGB", (56, 56)),
+                        "resized_height": 1,
+                        "resized_width": 1,
+                    },  # need a dummy image here for an easier process.
+                    {"type": "text", "text": text},
+                ],
+            }
+        ]
     else:
-        messages = [{
-            "role":
-            "user",
-            "content": [{
-                "type": "image",
-                "image": image
-            }, {
-                "type": "text",
-                "text": text
-            }]
-        }]
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": text},
+                ],
+            }
+        ]
     return messages
 
 
@@ -71,8 +70,10 @@ def apply_chat_template_and_add_eos(
     messages: list[dict],
     apply_chat_template_fn: Callable,
 ):
-    prompt = apply_chat_template_fn(
-        messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>"
+    prompt = (
+        apply_chat_template_fn(messages, tokenize=False, add_generation_prompt=True)
+        + "<|endoftext|>"
+    )
     return prompt
 
 
@@ -86,16 +87,14 @@ def _run_test(
     *,
     dtype: str,
 ) -> None:
-    '''SET PYTHONPATH'''
+    """SET PYTHONPATH"""
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     runner="pooling",
-                     dtype=dtype,
-                     enforce_eager=True,
-                     max_model_len=8192) as vllm_model:
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=8192
+    ) as vllm_model:
         tokenizer = vllm_model.llm.get_tokenizer()
         texts = [
             # this is necessary because vllm_model.embed will not apply any
@@ -105,25 +104,25 @@ def _run_test(
             apply_chat_template_and_add_eos(
                 get_messages(image, text, False),
                 apply_chat_template_fn=tokenizer.apply_chat_template,
-            ) for text, image in zip(input_texts, input_images)
+            )
+            for text, image in zip(input_texts, input_images)
             # vllm will replace the pad token with the actual image,
             # which may be a placeholder image, later.
         ]
         vllm_outputs = vllm_model.embed(texts, images=input_images)
 
     hf_outputs = []
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
-
+    with hf_runner(
+        model, dtype=dtype, auto_cls=Qwen2VLForConditionalGeneration
+    ) as hf_model:
         prompts = []
-        for text, image, embed_text in zip(input_texts, input_images,
-                                           embed_texts):
+        for text, image, embed_text in zip(input_texts, input_images, embed_texts):
             # dse requires non-standard input processing
             # because it needs an image_pad token
             messages = get_messages(image, text, embed_text)
             prompt = apply_chat_template_and_add_eos(
-                messages, hf_model.processor.apply_chat_template)
+                messages, hf_model.processor.apply_chat_template
+            )
 
             prompts.append(prompt)
 
@@ -145,9 +144,9 @@ def _run_test(
                     return_dict=True,
                     output_hidden_states=True,
                 )
-                pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
-                                            p=2,
-                                            dim=-1)
+                pooled_output = F.normalize(
+                    outputs.hidden_states[-1][0, -1], p=2, dim=-1
+                )
 
                 all_outputs.append(pooled_output.tolist())
 
@@ -170,8 +169,9 @@ def test_models_text(
     model: str,
     dtype: str,
 ) -> None:
-    input_texts_images = [(text, image_placeholder)
-                          for text, image_placeholder in HF_TEXT_PROMPTS]
+    input_texts_images = [
+        (text, image_placeholder) for text, image_placeholder in HF_TEXT_PROMPTS
+    ]
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
     embed_texts = [True] * len(input_texts)
@@ -198,8 +198,7 @@ def test_models_image(
     dtype: str,
 ) -> None:
     input_texts_images = [
-        (text, asset.pil_image)
-        for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
     ]
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index 3e2be34a50ad5ac72bf04d7e2118468ebf8f6c7d..5a97848216b8489147988d844557676989b18282 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
 from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 from ....conftest import ImageTestAssets
 
@@ -29,7 +29,7 @@ def run_intern_vit_test(
     img_processor = CLIPImageProcessor.from_pretrained(model)
     images = [asset.pil_image for asset in image_assets]
     pixel_values = [
-        img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype)
+        img_processor(images, return_tensors="pt").pixel_values.to(torch_dtype)
         for images in images
     ]
 
@@ -37,15 +37,16 @@ def run_intern_vit_test(
     if not getattr(config, "norm_type", None):
         config.norm_type = "rms_norm"
 
-    hf_model = AutoModel.from_pretrained(model,
-                                         torch_dtype=torch_dtype,
-                                         trust_remote_code=True).to("cuda")
+    hf_model = AutoModel.from_pretrained(
+        model, dtype=torch_dtype, trust_remote_code=True
+    ).to("cuda")
     hf_outputs_per_image = [
         hf_model(pixel_value.to("cuda")).last_hidden_state
         for pixel_value in pixel_values
     ]
 
     from vllm.model_executor.models.intern_vit import InternVisionModel
+
     vllm_model = InternVisionModel(config)
     vllm_model.load_weights(hf_model.state_dict().items())
 
@@ -54,22 +55,23 @@ def run_intern_vit_test(
 
     vllm_model = vllm_model.to("cuda", torch_dtype)
     vllm_outputs_per_image = [
-        vllm_model(pixel_values=pixel_value.to("cuda"))
-        for pixel_value in pixel_values
+        vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
     ]
     del vllm_model
     cleanup_dist_env_and_memory()
 
     cos_similar = nn.CosineSimilarity(dim=-1)
-    for vllm_output, hf_output in zip(vllm_outputs_per_image,
-                                      hf_outputs_per_image):
+    for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
         assert cos_similar(vllm_output, hf_output).mean() > 0.99
 
 
-@pytest.mark.parametrize("model_id", [
-    "OpenGVLab/InternViT-300M-448px",
-    "OpenGVLab/InternViT-6B-448px-V1-5",
-])
+@pytest.mark.parametrize(
+    "model_id",
+    [
+        "OpenGVLab/InternViT-300M-448px",
+        "OpenGVLab/InternViT-6B-448px-V1-5",
+    ],
+)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
     run_intern_vit_test(
diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py
index 7ad7a8d284cba9fb1d5ecff075e96e13b20cdbe6..d7b33be7a0adbe57c9506ea2cb7548ca9ef8ec58 100644
--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Union
 
 import pytest
 from transformers import AutoModel
@@ -29,32 +28,33 @@ def vllm_reranker(
     query_type: str = "text",
     doc_type: str = "text",
 ):
-
     def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
         return {"type": "image_url", "image_url": {"url": f"{url}"}}
 
-    query: Union[list[str], ScoreMultiModalParam]
+    query: list[str] | ScoreMultiModalParam
     if query_type == "text":
         query = query_strs
     elif query_type == "image":
         query = ScoreMultiModalParam(
-            content=[create_image_param(url) for url in query_strs])
+            content=[create_image_param(url) for url in query_strs]
+        )
 
-    documents: Union[list[str], ScoreMultiModalParam]
+    documents: list[str] | ScoreMultiModalParam
     if doc_type == "text":
         documents = document_strs
     elif doc_type == "image":
         documents = ScoreMultiModalParam(
-            content=[create_image_param(url) for url in document_strs])
+            content=[create_image_param(url) for url in document_strs]
+        )
 
     with vllm_runner(
-            model_name,
-            runner="pooling",
-            dtype=dtype,
-            max_num_seqs=2,
-            max_model_len=2048,
-            mm_processor_kwargs=mm_processor_kwargs,
-            limit_mm_per_prompt=limit_mm_per_prompt,
+        model_name,
+        runner="pooling",
+        dtype=dtype,
+        max_num_seqs=2,
+        max_model_len=2048,
+        mm_processor_kwargs=mm_processor_kwargs,
+        limit_mm_per_prompt=limit_mm_per_prompt,
     ) as vllm_model:
         outputs = vllm_model.llm.score(query, documents)
 
@@ -78,16 +78,15 @@ def hf_reranker(
     data_pairs = [[query_strs[0], d] for d in document_strs]
 
     with hf_runner(
-            model_name,
-            dtype=dtype,
-            trust_remote_code=True,
-            auto_cls=AutoModel,
-            model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
+        model_name,
+        dtype=dtype,
+        trust_remote_code=True,
+        auto_cls=AutoModel,
+        model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
     ) as hf_model:
-        return hf_model.model.compute_score(data_pairs,
-                                            max_length=2048,
-                                            query_type=query_type,
-                                            doc_type=doc_type)
+        return hf_model.model.compute_score(
+            data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
+        )
 
 
 # Visual Documents Reranking
@@ -100,10 +99,12 @@ def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
         "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
     ]
 
-    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
-                             "text", "image")
-    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
-                                 documents, "text", "image")
+    hf_outputs = hf_reranker(
+        hf_runner, model_name, dtype, query, documents, "text", "image"
+    )
+    vllm_outputs = vllm_reranker(
+        vllm_runner, model_name, dtype, query, documents, "text", "image"
+    )
 
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@@ -127,10 +128,12 @@ def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
         lower computational requirements.""",  # noqa: E501
         "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
     ]
-    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
-                             "text", "text")
-    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
-                                 documents, "text", "text")
+    hf_outputs = hf_reranker(
+        hf_runner, model_name, dtype, query, documents, "text", "text"
+    )
+    vllm_outputs = vllm_reranker(
+        vllm_runner, model_name, dtype, query, documents, "text", "text"
+    )
 
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@@ -157,10 +160,12 @@ def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
         "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
     ]
 
-    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
-                             "image", "text")
-    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
-                                 documents, "image", "text")
+    hf_outputs = hf_reranker(
+        hf_runner, model_name, dtype, query, documents, "image", "text"
+    )
+    vllm_outputs = vllm_reranker(
+        vllm_runner, model_name, dtype, query, documents, "image", "text"
+    )
 
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@@ -178,10 +183,12 @@ def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
         "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
     ]
 
-    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
-                             "image", "image")
-    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
-                                 documents, "image", "image")
+    hf_outputs = hf_reranker(
+        hf_runner, model_name, dtype, query, documents, "image", "image"
+    )
+    vllm_outputs = vllm_reranker(
+        vllm_runner, model_name, dtype, query, documents, "image", "image"
+    )
 
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
diff --git a/tests/models/multimodal/pooling/test_llava_next.py b/tests/models/multimodal/pooling/test_llava_next.py
index 50826677581d03d5f492f8a24109883e5d0914f6..2053ce399483108c9ab5edbf3686820fa60df058 100644
--- a/tests/models/multimodal/pooling/test_llava_next.py
+++ b/tests/models/multimodal/pooling/test_llava_next.py
@@ -24,9 +24,10 @@ from ...utils import check_embeddings_close
 #    built with LAPACK support.
 pytestmark = pytest.mark.skipif(
     not current_platform.is_cuda(),
-    reason="Llava Next model uses op that is only supported in CUDA")
+    reason="Llava Next model uses op that is only supported in CUDA",
+)
 
-llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501
 
 HF_TEXT_PROMPTS = [
     # T -> X
@@ -34,18 +35,21 @@ HF_TEXT_PROMPTS = [
         "The label of the object is stop sign\nSummary above sentence in one word: "  # noqa: E501
     ),
     # T -> X
-    llama3_template.format(
-        "cherry blossom\nSummary above sentence in one word: "),
+    llama3_template.format("cherry blossom\nSummary above sentence in one word: "),
 ]
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    # I -> X
-    "stop_sign":
-    llama3_template.format("<image>\nSummary above image in one word: "),
-    # I -> X
-    "cherry_blossom":
-    llama3_template.format("<image>\nSummary above image in one word: "),
-})
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        # I -> X
+        "stop_sign": llama3_template.format(
+            "<image>\nSummary above image in one word: "
+        ),
+        # I -> X
+        "cherry_blossom": llama3_template.format(
+            "<image>\nSummary above image in one word: "
+        ),
+    }
+)
 
 MODELS = ["royokong/e5-v"]
 
@@ -63,23 +67,22 @@ def _run_test(
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     runner="pooling",
-                     dtype=dtype,
-                     max_model_len=4096,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
+    ) as vllm_model:
         vllm_outputs = vllm_model.embed(input_texts, images=input_images)
 
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForImageTextToText) as hf_model:
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForImageTextToText
+    ) as hf_model:
         # Patch the issue where generation_config.json is missing
-        hf_model.processor.patch_size = \
-            hf_model.model.config.vision_config.patch_size
+        hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size
 
         # Patch the issue where image_token_id
         # exceeds the maximum allowed vocab size
         hf_model.model.resize_token_embeddings(
-            hf_model.model.language_model.vocab_size + 1)
+            hf_model.model.language_model.vocab_size + 1
+        )
 
         all_inputs = hf_model.get_inputs(input_texts, images=input_images)
 
@@ -91,8 +94,7 @@ def _run_test(
                 return_dict=True,
                 output_hidden_states=True,
             )
-            pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
-                                        dim=-1)
+            pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], dim=-1)
 
             all_outputs.append(pooled_output.tolist())
 
@@ -142,8 +144,7 @@ def test_models_image(
     dtype: str,
 ) -> None:
     input_texts_images = [
-        (text, asset.pil_image)
-        for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
     ]
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
diff --git a/tests/models/multimodal/pooling/test_phi3v.py b/tests/models/multimodal/pooling/test_phi3v.py
index f918a0bd781ea90a412ece75d41ac2082e4ec664..c799a5bd3e1ef3bcc33ba7d73afe1bd25dcbac96 100644
--- a/tests/models/multimodal/pooling/test_phi3v.py
+++ b/tests/models/multimodal/pooling/test_phi3v.py
@@ -19,14 +19,14 @@ HF_TEXT_PROMPTS = [
     "Retrieve an image of this caption: cherry blossom",
 ]
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    # T + I -> X
-    "stop_sign":
-    "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign",  # noqa: E501
-    # I -> X
-    "cherry_blossom":
-    "<|image_1|> Represent the given image for classification",  # noqa: E501
-})
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        # T + I -> X
+        "stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign",  # noqa: E501
+        # I -> X
+        "cherry_blossom": "<|image_1|> Represent the given image for classification",  # noqa: E501
+    }
+)
 
 MODELS = ["TIGER-Lab/VLM2Vec-Full"]
 
@@ -44,14 +44,14 @@ def _run_test(
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model, runner="pooling", dtype=dtype,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True
+    ) as vllm_model:
         vllm_outputs = vllm_model.embed(input_texts, images=input_images)
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
+    with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
         all_inputs = hf_model.get_inputs(input_texts, images=input_images)
 
         all_outputs = []
@@ -114,18 +114,21 @@ def test_models_image(
     dtype: str,
 ) -> None:
     input_texts_images = [
-        (text, asset.pil_image)
-        for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
     ]
     # add cases for special_tokens
-    input_texts_images.append((
-        "\n<s><|user|>\n <|image_1|>\n\t <s>"
-        "Represent the given image for classification<|end|>"
-        "\n<|assistant|>\n",
-        Image.open(
-            get_vllm_public_assets(filename="cherry_blossom.jpg",
-                                   s3_prefix=VLM_IMAGES_DIR)),
-    ))
+    input_texts_images.append(
+        (
+            "\n<s><|user|>\n <|image_1|>\n\t <s>"
+            "Represent the given image for classification<|end|>"
+            "\n<|assistant|>\n",
+            Image.open(
+                get_vllm_public_assets(
+                    filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
+                )
+            ),
+        )
+    )
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
 
diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py
index 7309660ea5261b6ed23b3fd5db84cc662368d917..5082827962d8752cb6b68d629fc0ea177f2c52d7 100644
--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -19,27 +19,28 @@ def _run_test(
     vllm_runner: type[VllmRunner],
     model: str,
 ) -> None:
-
     prompt = [
         {
             # This model deals with no text input
             "prompt_token_ids": [1],
             "multi_modal_data": generate_test_mm_data(),
-        } for _ in range(10)
+        }
+        for _ in range(10)
     ]
 
     with vllm_runner(
-            model,
-            runner="pooling",
-            dtype="half",
-            enforce_eager=True,
-            skip_tokenizer_init=True,
-            # Limit the maximum number of sequences to avoid the
-            # test going OOM during the warmup run
-            max_num_seqs=32,
-            default_torch_num_threads=1,
+        model,
+        runner="pooling",
+        dtype="half",
+        enforce_eager=True,
+        skip_tokenizer_init=True,
+        enable_mm_embeds=True,
+        # Limit the maximum number of sequences to avoid the
+        # test going OOM during the warmup run
+        max_num_seqs=32,
+        default_torch_num_threads=1,
     ) as vllm_model:
-        vllm_model.encode(prompt)
+        vllm_model.llm.encode(prompt, pooling_task="plugin")
 
 
 MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py
index 27b9fe369e8009358d9d8b1cd7826f0f739550e3..1f5baed83fa62238f8286af053ae0e42aa75669d 100644
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -9,7 +9,7 @@ from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.models.radio import RadioModel
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 from ....conftest import ImageTestAssets
 
@@ -34,50 +34,62 @@ def run_radio_test(
     # Using `self.get_nearest_supported_resolution`, for assets 432x642 the
     # nearest supported resolution is 432x640.
     pixel_values = [
-        img_processor(
-            image,
-            return_tensors='pt').pixel_values.to(torch_dtype)[:, :, :, :640]
+        img_processor(image, return_tensors="pt").pixel_values.to(torch_dtype)[
+            :, :, :, :640
+        ]
         for image in images
     ]
 
     config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
 
+    # RADIO model on HF does not properly handle torch_dtype argument
+    # And relies on args["dtype"] which we have to patch manually:
+    config.args["dtype"] = torch_dtype
+
     hf_model = AutoModel.from_pretrained(
         model_id,
         config=config,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True,
     ).to("cuda")
     hf_model.eval()
 
+    # A HF model has image normalization as a part of model's forward
+    # However in vLLM we don't make normalization a part of the model
+    # forward step since mean/std stored as model's parameters and
+    # subject to precision loss (when using fp16/bf16) which negatively
+    # affects evaluation benchmarks.
+    hf_model.make_preprocessor_external()
+
     hf_outputs_per_image = [
-        hf_model(pixel_value.to("cuda")).features
-        for pixel_value in pixel_values
+        hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
     ]
 
-    radio_config = RadioConfig(model_name=config.args["model"],
-                               reg_tokens=config.args["register_multiple"])
+    radio_config = RadioConfig(
+        model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
+    )
     vllm_model = RadioModel(radio_config)
     vllm_model.load_weights(hf_model.state_dict())
     vllm_model = vllm_model.to("cuda", torch_dtype)
 
     vllm_outputs_per_image = [
-        vllm_model(pixel_values=pixel_value.to("cuda"))
-        for pixel_value in pixel_values
+        vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
     ]
     del vllm_model, hf_model
     cleanup_dist_env_and_memory()
 
     cos_similar = nn.CosineSimilarity(dim=-1)
-    for vllm_output, hf_output in zip(vllm_outputs_per_image,
-                                      hf_outputs_per_image):
+    for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
         assert cos_similar(vllm_output, hf_output).mean() > 0.99
 
 
-@pytest.mark.parametrize("model_id", [
-    "nvidia/C-RADIOv2-H",
-])
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize(
+    "model_id",
+    [
+        "nvidia/C-RADIOv2-H",
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
 def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
     run_radio_test(
         image_assets,
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
new file mode 100644
index 0000000000000000000000000000000000000000..3345b10c099ac13737ef0b3fa5defb91529c8570
--- /dev/null
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import SiglipModel
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_embeddings_close
+
+HF_TEXT_PROMPTS = [
+    "a photo of a stop sign",
+    "a photo of a cherry blossom",
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "",
+        "cherry_blossom": "",
+    }
+)
+
+MODELS = ["google/siglip-base-patch16-224", "google/siglip2-base-patch16-224"]
+
+
+def _run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    input_texts: list[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=64
+    ) as vllm_model:
+        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+
+    with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
+        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+
+        all_outputs = []
+        for inputs in all_inputs:
+            inputs = hf_model.wrap_device(inputs)
+
+            if "pixel_values" in inputs:
+                pooled_output = hf_model.model.get_image_features(
+                    pixel_values=inputs.pixel_values,
+                ).squeeze(0)
+            else:
+                pooled_output = hf_model.model.get_text_features(
+                    input_ids=inputs.input_ids,
+                ).squeeze(0)
+
+            all_outputs.append(pooled_output.tolist())
+
+        hf_outputs = all_outputs
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_text(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,  # type: ignore
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_text_image_no_crash(
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    texts = [HF_TEXT_PROMPTS[0]]
+    images = [image_assets[0].pil_image]
+
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        enforce_eager=True,
+        max_model_len=64,
+    ) as vllm_model:
+        with pytest.raises(ValueError, match="not both"):
+            vllm_model.embed(texts, images=images)
+
+        vllm_model.embed(texts)
+        vllm_model.embed([""], images=images)
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index ddc675b0849c6475e12328283a843b5881f708a1..313ab2fa8038b1cd2f18a6677a8fb4d540e6732f 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -1,28 +1,39 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Set as AbstractSet
 from functools import partial
-from typing import Optional, Union
 
 import numpy as np
 import pytest
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
+from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
+from mistral_common.protocol.instruct.messages import UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image
 
 from vllm.config import ModelConfig
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext)
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
-                                               cached_tokenizer_from_config,
-                                               encode_tokens)
+from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
+from vllm.transformers_utils.tokenizer import (
+    MistralTokenizer,
+    cached_tokenizer_from_config,
+    encode_tokens,
+)
 
 from ....multimodal.utils import random_audio, random_image, random_video
-from ...registry import HF_EXAMPLE_MODELS
+from ...registry import (
+    _MULTIMODAL_EXAMPLE_MODELS,
+    _TRANSFORMERS_BACKEND_MODELS,
+    HF_EXAMPLE_MODELS,
+)
 
 
 def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
@@ -34,14 +45,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
         # GLM4.1V doesn't support multiple videos
         video = mm_data["video"]
         num_frames = len(video)
-        mm_data["video"] = (video, {
-            "total_num_frames": num_frames,
-            "fps": num_frames,
-            "duration": 1,
-            "frames_indices": [i for i in range(num_frames)],
-            "video_backend": "opencv",
-            "do_sample_frames": True,
-        })
+        mm_data["video"] = (
+            video,
+            {
+                "total_num_frames": num_frames,
+                "fps": num_frames,
+                "duration": 1,
+                "frames_indices": [i for i in range(num_frames)],
+                "video_backend": "opencv",
+                "do_sample_frames": True,
+            },
+        )
     return mm_data
 
 
@@ -73,6 +87,119 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     return mm_data
 
 
+# For some multimodal models, tokenizer will always add bos_token
+# at the beginning of prompt by default, causing hf_processor outputs
+# incorrect token ids. So we need use `add_special_tokens=False` here
+# to leave bos_token to be added by the processor.
+_ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "ovis": False,
+    "ovis2_5": False,
+    "paligemma": False,
+    "ultravox": False,
+    "whisper": False,
+}
+
+_IGNORE_MM_KEYS = {
+    # In Ultravox, the audio_features can be different depending on padding
+    # The slight difference should not be a problem though, since
+    # attention_mask lets us ignore the difference.
+    "ultravox": {"audio_features"},
+}
+
+MM_DATA_PATCHES = {
+    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
+    "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
+    "qwen3_vl": qwen3_vl_patch_mm_data,
+    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
+}
+
+
+def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]):
+    for model_arch in model_arch_list:
+        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+        yield model_info.default
+
+        for extra_type, extra_model_id in model_info.extras.items():
+            if "fp" in extra_type:
+                continue  # Redundant to test quantized models
+
+            yield extra_model_id
+
+
+def _get_model_ids_to_test(model_arch_list: AbstractSet[str]):
+    return list(_iter_model_ids_to_test(model_arch_list))
+
+
+def get_model_ids_to_test():
+    transformers_arch_ids = {
+        model_id
+        for info in _TRANSFORMERS_BACKEND_MODELS.values()
+        for model_id in (info.default, *info.extras.values())
+    }
+    vllm_only_archs = {
+        arch
+        for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
+        if not any(
+            model_id in transformers_arch_ids
+            for model_id in (info.default, *info.extras.values())
+        )
+    }
+
+    return _get_model_ids_to_test(vllm_only_archs)
+
+
+def get_text_token_prompts(
+    processor: BaseMultiModalProcessor,
+    mm_data: MultiModalDataDict,
+):
+    dummy_inputs = processor.dummy_inputs
+    tokenizer = processor.info.get_tokenizer()
+    model_config = processor.info.ctx.model_config
+
+    model_type = model_config.hf_config.model_type
+    if model_type in MM_DATA_PATCHES:
+        mm_data = MM_DATA_PATCHES[model_type](mm_data)
+
+    parsed_data = processor.data_parser.parse_mm_data(mm_data)
+    mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
+
+    text_prompt: str | None
+    token_prompt: list[int]
+    if isinstance(tokenizer, MistralTokenizer):
+        images = parsed_data.get("image", [])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(
+                    content=[
+                        TextChunk(text=""),
+                        *(ImageChunk(image=image) for image in images),
+                    ]
+                ),
+            ]
+        )
+        res = tokenizer.mistral.encode_chat_completion(request)
+
+        # Mistral does not support decode_tokens with skip_special_tokens=False
+        text_prompt = None
+        token_prompt = res.tokens
+    else:
+        inputs = dummy_inputs.get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        )
+        assert isinstance(inputs.prompt, str)
+
+        text_prompt = inputs.prompt
+        token_prompt = encode_tokens(
+            tokenizer,
+            text_prompt,
+            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
+        )
+
+    return text_prompt, token_prompt
+
+
 def _test_processing_correctness(
     model_id_or_arch: str,
     hit_rate: float,
@@ -98,9 +225,12 @@ def _test_processing_correctness(
         hf_overrides=model_info.hf_overrides,
         # Ensure that the cache can fit all of the data
         mm_processor_cache_gb=2048,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@@ -112,66 +242,54 @@ def _test_processing_correctness(
 
     processing_info = factories.info(ctx)
     supported_mm_limits = processing_info.get_supported_mm_limits()
-    limit_mm_per_prompt = {
+    # Keep integer limits for local data generation
+    limit_mm_per_prompt_ints = {
         modality: 3 if limit is None else limit
         for modality, limit in supported_mm_limits.items()
     }
 
-    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+    def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
+        if modality == "video":
+            return VideoDummyOptions(count=count)
+        if modality == "image":
+            return ImageDummyOptions(count=count)
+        if modality == "audio":
+            return AudioDummyOptions(count=count)
+        return BaseDummyOptions(count=count)
+
+    # Assign normalized DummyOptions to the model config
+    model_config.get_multimodal_config().limit_per_prompt = {
+        modality: _to_dummy_options(modality, count)
+        for modality, count in limit_mm_per_prompt_ints.items()
+    }
 
     baseline_processor = factories.build_processor(ctx, cache=None)
     cached_processor = factories.build_processor(ctx, cache=cache)
-    dummy_inputs = baseline_processor.dummy_inputs
-    tokenizer = baseline_processor.info.get_tokenizer()
 
     rng = np.random.RandomState(0)
 
     input_to_hit = {
         "image": Image.new("RGB", size=(128, 128)),
         "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
-        "audio": (np.zeros((512, )), 16000),
+        "audio": (np.zeros((512,)), 16000),
     }
     input_factory = {
-        "image":
-        partial(random_image, rng, min_wh=128, max_wh=256),
-        "video":
-        partial(random_video,
-                rng,
-                min_frames=2,
-                max_frames=16,
-                min_wh=128,
-                max_wh=256),
-        "audio":
-        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
+        "image": partial(random_image, rng, min_wh=128, max_wh=256),
+        "video": partial(
+            random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
+        ),
+        "audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
     }
 
     for batch_idx in range(num_batches):
         mm_data = {
-            k:
-            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit + 1))]
-            for k, limit in limit_mm_per_prompt.items()
+            k: [
+                (input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+                for _ in range(rng.randint(limit + 1))
+            ]
+            for k, limit in limit_mm_per_prompt_ints.items()
         }
 
-        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-
-        # Mistral chat outputs tokens directly, rather than text prompts
-        if isinstance(tokenizer, MistralTokenizer):
-            images = mm_data.get("image", [])
-            request = ChatCompletionRequest(messages=[
-                UserMessage(content=[
-                    TextChunk(text=""),
-                    *(ImageChunk(image=image) for image in images),
-                ]),
-            ])
-            res = tokenizer.mistral.encode_chat_completion(request)
-            prompt = res.tokens
-        else:
-            prompt = dummy_inputs.get_dummy_processor_inputs(
-                model_config.max_model_len,
-                mm_counts,
-            ).prompt
-
         # Drop unnecessary keys and test single -> multi conversion
         if rng.rand() < simplify_rate:
             for k in list(mm_data.keys()):
@@ -182,8 +300,6 @@ def _test_processing_correctness(
 
         _test_processing_correctness_one(
             model_config,
-            tokenizer,
-            prompt,
             mm_data,
             baseline_processor,
             cached_processor,
@@ -191,59 +307,17 @@ def _test_processing_correctness(
         )
 
 
-# For some multimodal models, tokenizer will always add bos_token
-# at the beginning of prompt by default, causing hf_processor outputs
-# incorrect token ids. So we need use `add_special_tokens=False` here
-# to leave bos_token to be added by the processor.
-_ADD_SPECIAL_TOKENS_OVERRIDES = {
-    "ovis": False,
-    "ovis2_5": False,
-    "paligemma": False,
-    "ultravox": False,
-    "whisper": False,
-}
-
-_IGNORE_MM_KEYS = {
-    # In Ultravox, the audio_features can be different depending on padding
-    # The slight difference should not be a problem though, since
-    # attention_mask lets us ignore the difference.
-    "ultravox": {"audio_features"},
-}
-
-MM_DATA_PATCHES = {
-    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
-    "glm4v": glm4_1v_patch_mm_data,
-    "glm4v_moe": glm4_1v_patch_mm_data,
-    "qwen3_vl": qwen3_vl_patch_mm_data,
-    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
-}
-
-
 def _test_processing_correctness_one(
     model_config: ModelConfig,
-    tokenizer: AnyTokenizer,
-    prompt: Union[str, list[int]],
     mm_data: MultiModalDataDict,
     baseline_processor: BaseMultiModalProcessor,
     cached_processor: BaseMultiModalProcessor,
     batch_idx: int,
 ):
     model_type = model_config.hf_config.model_type
-    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
-    if model_type in MM_DATA_PATCHES:
-        mm_data = MM_DATA_PATCHES[model_type](mm_data)
 
-    if isinstance(prompt, str):
-        text_prompt = prompt
-        token_prompt = encode_tokens(
-            tokenizer,
-            prompt,
-            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
-        )
-    else:
-        # Mistral does not support decode_tokens with skip_special_tokens=False
-        text_prompt = None
-        token_prompt = prompt
+    text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
+    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
 
     baseline_tokenized_result = baseline_processor.apply(
         token_prompt,
@@ -287,93 +361,21 @@ def _test_processing_correctness_one(
             baseline_text_result,
             baseline_tokenized_result,
             ignore_mm_keys=ignore_mm_keys,
-            msg=f"Failed ({batch_idx=}, {text_prompt=}, "
-            f"{token_prompt=}, {mm_data=})",
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
         )
 
         _assert_inputs_equal(
             cached_text_result,
             cached_tokenized_result,
             ignore_mm_keys=ignore_mm_keys,
-            msg=f"Failed ({batch_idx=}, {text_prompt=}, "
-            f"{token_prompt=}, {mm_data=})",
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
         )
 
 
-# yapf: disable
-@pytest.mark.parametrize("model_id", [
-    "rhymes-ai/Aria",
-    "CohereForAI/aya-vision-8b",
-    "Salesforce/blip2-opt-2.7b",
-    "facebook/chameleon-7b",
-    "CohereLabs/command-a-vision-07-2025",
-    "deepseek-ai/deepseek-vl2-tiny",
-    "baidu/ERNIE-4.5-VL-28B-A3B-PT",
-    "adept/fuyu-8b",
-    "google/gemma-3-4b-it",
-    "google/gemma-3n-E2B-it",
-    "zai-org/glm-4v-9b",
-    "zai-org/GLM-4.1V-9B-Thinking",
-    "zai-org/GLM-4.5V",
-    "ibm-granite/granite-speech-3.3-2b",
-    "h2oai/h2ovl-mississippi-800m",
-    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
-    "HuggingFaceM4/Idefics3-8B-Llama3",
-    "internlm/Intern-S1",
-    "OpenGVLab/InternVL2-1B",
-    "OpenGVLab/InternVL3-1B",
-    "OpenGVLab/InternVL3_5-1B",
-    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
-    "OpenGVLab/InternVL3_5-30B-A3B",
-    "Kwai-Keye/Keye-VL-8B-Preview",
-    "Kwai-Keye/Keye-VL-1_5-8B",
-    "moonshotai/Kimi-VL-A3B-Instruct",
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-v1.6-mistral-7b-hf",
-    "llava-hf/LLaVA-NeXT-Video-7B-hf",
-    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-    "TIGER-Lab/Mantis-8B-siglip-llama3",
-    "mispeech/midashenglm-7b",
-    "openbmb/MiniCPM-Llama3-V-2_5",
-    "openbmb/MiniCPM-o-2_6",
-    "openbmb/MiniCPM-V-2_6",
-    "MiniMaxAI/MiniMax-VL-01",
-    "allenai/Molmo-7B-D-0924",
-    "allenai/Molmo-7B-O-0924",
-    "nvidia/NVLM-D-72B",
-    "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
-    "AIDC-AI/Ovis1.6-Gemma2-9B",
-    "AIDC-AI/Ovis1.6-Llama3.2-3B",
-    "AIDC-AI/Ovis2-1B",
-    "AIDC-AI/Ovis2.5-2B",
-    "google/paligemma-3b-mix-224",
-    "google/paligemma2-3b-ft-docci-448",
-    "microsoft/Phi-3.5-vision-instruct",
-    "microsoft/Phi-4-multimodal-instruct",
-    "mistralai/Pixtral-12B-2409",
-    "mistral-community/pixtral-12b",
-    "Qwen/Qwen-VL-Chat",
-    "Qwen/Qwen2-VL-2B-Instruct",
-    "Qwen/Qwen2.5-VL-3B-Instruct",
-    "Qwen/Qwen2-Audio-7B-Instruct",
-    "Qwen/Qwen2.5-Omni-3B",
-    "Qwen/Qwen3-VL-4B-Instruct",
-    "Qwen/Qwen3-VL-30B-A3B-Instruct",
-    "YannQi/R-4B",
-    "Skywork/Skywork-R1V-38B",
-    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-    "stepfun-ai/step3",
-    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
-    "openai/whisper-large-v3",
-    "omni-research/Tarsier-7b",
-    "omni-research/Tarsier2-Recap-7b",
-    "mistralai/Voxtral-Mini-3B-2507",
-])
+@pytest.mark.parametrize("model_id", get_model_ids_to_test())
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
 def test_processing_correctness(
     model_id: str,
     hit_rate: float,
@@ -381,7 +383,12 @@ def test_processing_correctness(
     simplify_rate: float,
 ):
     if model_id == "google/gemma-3n-E2B-it":
-        pytest.skip("Skipping gemma-3n-E2B-it due to transformers #39911 bug.")
+        pytest.skip("Fix later")
+    if model_id == "OpenGVLab/InternVL2-2B":
+        pytest.skip("Fix later")
+    if model_id == "jinaai/jina-reranker-m0":
+        pytest.skip("Fix later")
+
     _test_processing_correctness(
         model_id,
         hit_rate=hit_rate,
@@ -416,7 +423,7 @@ def _assert_inputs_equal(
     a: MultiModalInputs,
     b: MultiModalInputs,
     *,
-    ignore_mm_keys: Optional[set[str]] = None,
+    ignore_mm_keys: set[str] | None = None,
     msg: str = "",
 ):
     if ignore_mm_keys is None:
diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index 070ddcd89ee96782bb9d588c75bf843b9791107a..553a5f719bd354e59900493c93ca82f5b5dbfd71 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -24,7 +24,8 @@ from ...utils import build_model_context
         # post-sampled frames (expected behavior)
         (-1, 1, 5),
         (-1, 2, 10),
-    ])
+    ],
+)
 def test_processor_override(
     model_id: str,
     expected_toks_per_frame: int,
@@ -55,10 +56,8 @@ def test_processor_override(
     # Ensure we have the right number of placeholders per num_crops size
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
-    video_tok_count = processed_inputs["prompt_token_ids"].count(
-        video_token_id)
-    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
-    )["video_grid_thw"][0]
+    video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
+    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
 
     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t
@@ -71,7 +70,7 @@ def test_video_loader_consistency(
     fps: int,
 ):
     """
-    Ensure dynamic video loader (pre-sampled by loader) and normal video 
+    Ensure dynamic video loader (pre-sampled by loader) and normal video
     loader (post-sampled by processor) produce same video processing outputs.
     """
     ctx = build_model_context(
@@ -91,7 +90,8 @@ def test_video_loader_consistency(
 
     static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
     dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
-        video_bytes, fps=fps)
+        video_bytes, fps=fps
+    )
 
     # pre-sampled loader shouldn't read all frames
     assert len(dynamic_video) < len(static_video)
@@ -99,12 +99,11 @@ def test_video_loader_consistency(
     static_mm_data = {"video": [(static_video, static_metadata)]}
     dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
 
-    static_outputs = processor.apply(prompt, static_mm_data,
-                                     hf_processor_mm_kwargs)
-    dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
-                                      hf_processor_mm_kwargs)
+    static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
+    dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
 
-    assert static_outputs["prompt_token_ids"] == dynamic_outputs[
-        "prompt_token_ids"]
-    assert static_outputs["mm_kwargs"].get_data(
-    ) == dynamic_outputs["mm_kwargs"].get_data()
+    assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
+    assert (
+        static_outputs["mm_kwargs"].get_data()
+        == dynamic_outputs["mm_kwargs"].get_data()
+    )
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 1adfe21352c41be920d163830c34150b0f0ff988..1701d9dd8f011d08ee3878d52071866dd01bd681 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for H2OVL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
-from typing import Optional
 
 import pytest
 from PIL import Image
@@ -23,8 +23,10 @@ def _get_expected_num_patches(
     min_num: int,
     max_num: int,
 ):
-    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
-                                                  get_h2ovl_target_ratios)
+    from vllm.model_executor.models.h2ovl import (
+        calculate_h2ovl_targets,
+        get_h2ovl_target_ratios,
+    )
 
     width, height = image.size
 
@@ -101,24 +103,27 @@ def _run_check(
 
     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )
 
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
 
     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches
 
 
-@pytest.mark.parametrize("model_id", [
-    "h2oai/h2ovl-mississippi-800m",
-    "h2oai/h2ovl-mississippi-2b",
-])
+@pytest.mark.parametrize(
+    "model_id",
+    [
+        "h2oai/h2ovl-mississippi-800m",
+        "h2oai/h2ovl-mississippi-2b",
+    ],
+)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -143,7 +148,7 @@ def test_processor_override(
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
+    dynamic_image_size: bool | None,
     kwargs_on_init: bool,
 ):
     mm_processor_kwargs = {
@@ -165,10 +170,7 @@ def test_processor_override(
 
     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index d3a55993e5588baf172ebb1179f5ca4e343b9bcb..351b9d018eec250f143521b03d188aeec255af55 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Idefics3's multimodal preprocessing kwargs."""
+
 import pytest
 from transformers import Idefics3Config
 
@@ -11,14 +12,13 @@ from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
-# yapf: disable
 @pytest.mark.parametrize(
     ("mm_processor_kwargs", "expected_toks_per_img"),
     [
         ({"size": {"longest_edge": 364}}, 169),
         ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
-    ])
-# yapf: enable
+    ],
+)
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
@@ -42,8 +42,11 @@ def test_processor_override(
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    placeholders = (
+        "<image>"
+        if num_imgs == 1
+        else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    )
     prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501
 
     # Build mm_data
@@ -57,8 +60,7 @@ def test_processor_override(
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
-    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
-        "input_ids"][0]
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
 
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index e4f25f5ac7123315cadbae8685e4763cfb6895a1..b4994295d3a80b1fb3066aab1297f1dc90d5b716 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for InternVL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
-from typing import Optional
 
 import pytest
 from PIL import Image
@@ -24,7 +24,9 @@ def _get_expected_num_patches(
     max_num: int,
 ):
     from vllm.model_executor.models.internvl import (
-        calculate_internvl_targets, get_internvl_target_ratios)
+        calculate_internvl_targets,
+        get_internvl_target_ratios,
+    )
 
     width, height = image.size
 
@@ -61,15 +63,15 @@ def _run_check(
 
     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )
 
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
 
     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches
@@ -100,7 +102,7 @@ def test_processor_override(
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
+    dynamic_image_size: bool | None,
     kwargs_on_init: bool,
 ):
     mm_processor_kwargs = {
@@ -122,10 +124,7 @@ def test_processor_override(
 
     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index bea4f43567eeeb2da9da4f94553db9af31cb81c7..4c0791ea3cece91e35f126248955d2f41b2cf409 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
 from ...utils import build_model_context
 
 
-@pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
 @pytest.mark.parametrize("mm_processor_kwargs", [{}])
 @pytest.mark.parametrize("num_imgs", [1, 5])
 @pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@@ -38,13 +37,14 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor()
     vocab = tokenizer.get_vocab()
 
-    prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
-        + "<|image|>" * num_imgs \
+    prompt = (
+        "<|begin_of_text|><|header_start|>user<|header_end|>"
+        + "<|image|>" * num_imgs
         + "<|eot|><|header_start|>assistant<|header_end|>"
+    )
     mm_data = {
         "image": [
-            image_assets[(i % len(image_assets))].pil_image
-            for i in range(num_imgs)
+            image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
         ]
     }
     if tokenized_prompt:
@@ -64,22 +64,23 @@ def test_processor_override(
         if tiles_x * tiles_y > 1:
             num_x_separators += (tiles_x - 1) * tiles_y
             num_y_separators += tiles_y
-    assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
-        == num_x_separators
-    assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
-        ==  num_y_separators
+    assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
+    assert (
+        prompt_token_ids.count(vocab[hf_processor.tile_global_token])
+        == num_y_separators
+    )
 
     # image token offsets
     img_locs = processed_inputs["mm_placeholders"].get("image", [])
     assert len(img_locs) == num_imgs
-    assert [img_loc.offset for img_loc in img_locs] == \
-        [i for i, v in enumerate(prompt_token_ids) \
-        if v == config.boi_token_index]
+    assert [img_loc.offset for img_loc in img_locs] == [
+        i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
+    ]
 
     # patch sizes and masks
-    num_patches_per_chunk = processor.info.get_patch_per_chunk(
-        config.vision_config)
-    assert prompt_token_ids.count(config.image_token_index) \
+    num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
+    assert (
+        prompt_token_ids.count(config.image_token_index)
         == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
-    assert len(mm_data["pixel_values"]) \
-        == sum(mm_data["patches_per_image"])
+    )
+    assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index ca34d1d758a46920de1020d9dc28803573b58913..ffe7ca17b5d61b11ebec047368fd3822d83d29e7 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
     image_size: ImageSize,
 ) -> None:
     info = processor.info
-    feature_size = info.get_num_image_tokens(image_width=image_size.width,
-                                             image_height=image_size.height)
+    feature_size = info.get_num_image_tokens(
+        image_width=image_size.width, image_height=image_size.height
+    )
 
     try:
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
         failed_size_excs.append((image_size, exc))
 
 
-@pytest.mark.skip("This test takes around 5 minutes to run. "
-                  "Comment this out to run it manually.")
+@pytest.mark.skip(
+    "This test takes around 5 minutes to run. Comment this out to run it manually."
+)
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 def test_processor_max_tokens(model_id):
     ctx = build_model_context(
@@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
 
     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)
 
 
@@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(
 
         # NOTE: There is a BOS token
         assert first_placeholder.offset == 1
-        assert first_placeholder.length == (
-            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+        assert (
+            first_placeholder.length
+            == (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+        )
 
     except Exception as exc:
         failed_size_excs.append((image_size, exc))
@@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
 
     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)
 
 
@@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]
 
     _test_image_prompt_replacements(
@@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
 
 
-@pytest.mark.skip("This test takes around 2 hours to run. "
-                  "Comment this out to run it manually.")
+@pytest.mark.skip(
+    "This test takes around 2 hours to run. Comment this out to run it manually."
+)
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index e6344c4e7e6fd1916fcfc147c04ade60cebe1aa4..f5c552fe6476a545f6bcbceb448c07b4251f1609 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
     image_size: ImageSize,
 ) -> None:
     info = processor.info
-    feature_size = info.get_num_image_tokens(image_width=image_size.width,
-                                             image_height=image_size.height)
+    feature_size = info.get_num_image_tokens(
+        image_width=image_size.width, image_height=image_size.height
+    )
 
     try:
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
         failed_size_excs.append((image_size, exc))
 
 
-@pytest.mark.skip("This test takes around 5 minutes to run. "
-                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.skip(
+    "This test takes around 5 minutes to run. Comment this out to run it manually."
+)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 def test_processor_max_tokens(model_id):
     ctx = build_model_context(
         model_id,
@@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
 
     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)
 
 
@@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
         first_placeholder = image_placeholders[0]
 
         assert first_placeholder.offset == 0
-        assert first_placeholder.length == len(
-            processed_inputs["prompt_token_ids"]) // num_imgs
+        assert (
+            first_placeholder.length
+            == len(processed_inputs["prompt_token_ids"]) // num_imgs
+        )
     except Exception as exc:
         failed_size_excs.append((image_size, exc))
 
@@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
 
     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)
 
 
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
@@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]
 
     _test_image_prompt_replacements(
@@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
 
 
-@pytest.mark.skip("This test takes around 2 hours to run. "
-                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.skip(
+    "This test takes around 2 hours to run. Comment this out to run it manually."
+)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py
index 9387212e3f1017a97d6854e32691265e3e6087f8..11e000123511047626eee80ee7ef5e7127c71bef 100644
--- a/tests/models/multimodal/processing/test_minimax_vl_01.py
+++ b/tests/models/multimodal/processing/test_minimax_vl_01.py
@@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
     num_imgs: int,
     image_sizes: list[ImageSize],
 ) -> None:
-
     failed_size_excs = list[tuple[ImageSize, Exception]]()
 
     for size in image_sizes:
-        _validate_image_prompt_replacements_one(processor, num_imgs,
-                                                failed_size_excs, size)
+        _validate_image_prompt_replacements_one(
+            processor, num_imgs, failed_size_excs, size
+        )
 
     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)
 
 
@@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]
 
     _test_image_prompt_replacements(
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index e7b28ff8ec7f0047aa25cd5f50613705dc5170bf..e5ff2d1391b6239a95385c55d562dd34f9bed1b8 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for mllama's multimodal preprocessing and profiling."""
+
 import pytest
 from torch import prod
 from transformers import Llama4Config
@@ -17,23 +18,23 @@ def test_profiling(model_id: str, max_model_len: int):
     model_config_kwargs = {
         "max_model_len": max_model_len,
     }
+    mm_counts = {"image": 1}
     ctx = build_model_context(
         model_id,
         model_config_kwargs=model_config_kwargs,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt=mm_counts,
     )
 
-    mm_config = ctx.get_mm_config()
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     profiler = MultiModalProfiler(processor)
 
     decoder_dummy_data = profiler.get_decoder_dummy_data(
         max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
+        mm_counts=mm_counts,
     )
     dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
         max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
+        mm_counts=mm_counts,
     )
 
     hf_config = ctx.get_hf_config(Llama4Config)
@@ -47,21 +48,25 @@ def test_profiling(model_id: str, max_model_len: int):
     image_size = hf_config.vision_config.image_size
     patch_size = hf_config.vision_config.patch_size
     downsample_ratio = int(
-        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
-    tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
+        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
+    )
+    tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
     chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
-    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        1]  # x-y separator tokens
-    total_tokens = total_num_patches.item() + num_tiles.item(
-    ) + 3  # image start, image, image end
+    num_tiles = (
+        mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
+    )  # x-y separator tokens
+    total_tokens = (
+        total_num_patches.item() + num_tiles.item() + 3
+    )  # image start, image, image end
 
     profiled_tokens = profiler.get_mm_max_contiguous_tokens(
         max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
+        mm_counts=mm_counts,
     )
 
     assert total_tokens == profiled_tokens["image"]
     assert total_tokens == sum(
-        placeholder.length for placeholder in
-        decoder_dummy_data.multi_modal_placeholders["image"])
+        placeholder.length
+        for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
+    )
diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py
index d9f1965a053df68b4918c7c2cbf14acbab86e4c4..5311ab1b78c69e70df37e8dbe1ef1321100b1e20 100644
--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
-from typing import Optional
 
 import pytest
 from PIL import Image
@@ -24,7 +24,9 @@ def _get_expected_num_patches(
     max_num: int,
 ):
     from vllm.model_executor.models.nemotron_vl import (
-        calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
+        calculate_nemotron_vl_targets,
+        get_nemotron_vl_target_ratios,
+    )
 
     width, height = image.size
 
@@ -63,22 +65,21 @@ def _run_check(
 
     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )
     print(total_expected_num_patches)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<image>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
     print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches
 
 
-@pytest.mark.parametrize("model_id",
-                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
+@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -103,7 +104,7 @@ def test_processor_override(
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
+    dynamic_image_size: bool | None,
     kwargs_on_init: bool,
 ):
     mm_processor_kwargs = {
@@ -125,10 +126,7 @@ def test_processor_override(
 
     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index 1f3646f7948685265a4bd2402afa90f7feca743b..8faff2611e6feb192979b8528ac6b53a2c355a18 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi3v's multimodal preprocessing kwargs."""
+
 import pytest
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -10,7 +11,6 @@ from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
-# yapf: disable
 @pytest.mark.parametrize(
     ("mm_processor_kwargs", "expected_toks_per_img"),
     [
@@ -18,8 +18,8 @@ from ...utils import build_model_context
         ({"num_crops": 16}, 1921),
         # the default num_crops of phi-3.5-vision is 4
         ({}, 757),
-    ])
-# yapf: enable
+    ],
+)
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py
index f16d261c2c6a4551c827af4859d656016088f6a4..5391555c266755177d7e683fdc947c58a975c568 100644
--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi4mm's multimodal preprocessing kwargs."""
+
 import pytest
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -10,7 +11,6 @@ from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
-# yapf: disable
 @pytest.mark.parametrize(
     ("mm_processor_kwargs", "expected_toks_per_img"),
     [
@@ -18,8 +18,8 @@ from ...utils import build_model_context
         ({"dynamic_hd": 16}, 4433),
         # the default num_crops of phi-4-multimodal is 36
         ({}, 9585),
-    ])
-# yapf: enable
+    ],
+)
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
@@ -46,8 +46,7 @@ def test_processor_override(
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
 
-    image_size = ctx.get_hf_config(
-    ).embd_layer["image_embd_layer"]["crop_size"]
+    image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
     dummy_image_size = (image_size * 7, image_size * 7)
     dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
     mm_data = {"image": [dummy_image] * num_imgs}
@@ -56,5 +55,6 @@ def test_processor_override(
 
     # Ensure we have the right number of placeholders per num_crops size
     img_tok_count = processed_inputs["prompt_token_ids"].count(
-        _IMAGE_PLACEHOLDER_TOKEN_ID)
+        _IMAGE_PLACEHOLDER_TOKEN_ID
+    )
     assert img_tok_count == expected_toks_per_img * num_imgs
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index 985f4188fdb665fcec638c2f34704516dd050952..9f4cdb6789b2cf3a0348a48a69cfb0f6b87c9f87 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -10,13 +10,13 @@ from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
-# yapf: disable
 @pytest.mark.parametrize(
-    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
+    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
+    [
         ({}, 1426, (5704, 1176)),
         ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
-    ])
-# yapf: enable
+    ],
+)
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
@@ -48,8 +48,7 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
 
     assert img_tok_count == expected_toks_per_img * num_imgs
     assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py
index af8f983388c6cb89105bfce98d60856a52a593cd..6f77d5516d147135d0f4d7c291afd2497629288b 100644
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for smolvlm's multimodal preprocessing kwargs."""
+
 import pytest
 from transformers import SmolVLMConfig
 
@@ -11,14 +12,13 @@ from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
-# yapf: disable
 @pytest.mark.parametrize(
     ("mm_processor_kwargs", "expected_toks_per_img"),
     [
         ({"max_image_size": {"longest_edge": 384}}, 1377),
         ({"max_image_size": {"longest_edge": 768}}, 405),
-    ])
-# yapf: enable
+    ],
+)
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
@@ -42,8 +42,11 @@ def test_processor_override(
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    placeholders = (
+        "<image>"
+        if num_imgs == 1
+        else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    )
     prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501
 
     # Build mm_data
@@ -57,8 +60,7 @@ def test_processor_override(
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
-    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
-        "input_ids"][0]
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
 
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 9d132ecc34b7115a72f9ab587027245f9bc69b70..687d1ef349f84398a95cf1dc279c41d8ff3ae1dd 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -4,55 +4,62 @@ import tempfile
 from collections.abc import Iterable
 from contextlib import contextmanager
 from functools import partial
-from typing import Any, Union
+from typing import Any, TypeAlias
 
 import numpy as np
 import pytest
 import torch.nn as nn
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
-from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image
 
 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
-from vllm.distributed import (cleanup_dist_env_and_memory,
-                              init_distributed_environment,
-                              initialize_model_parallel)
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models.interfaces import (SupportsMultiModal,
-                                                   supports_multimodal)
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
+from vllm.distributed import (
+    cleanup_dist_env_and_memory,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from vllm.model_executor.models.interfaces import (
+    SupportsMultiModal,
+    supports_multimodal,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext)
+from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
+from vllm.utils.torch_utils import set_default_torch_dtype
 
-from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
+from ...registry import HF_EXAMPLE_MODELS
 from ...utils import dummy_hf_overrides
-
-ARCH_TO_SKIP = {
-    "MolmoForCausalLM": "incompatible requirements",
-}
-ARCH_NEEDS_EXTRAS = [
-    "InternVLChatModel",
-    "Idefics3ForConditionalGeneration",
-    "LlavaForConditionalGeneration",
-    "MiniCPMV",
-    "PaliGemmaForConditionalGeneration",
-]
-REPO_ID_TO_SKIP = {
-    "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
-}
+from .test_common import get_model_ids_to_test, get_text_token_prompts
 
 ImageInput = list[Image.Image]
-VideoInput = Union[list[Image.Image], list[np.ndarray],
-                   list[tuple[np.ndarray, dict[str, Any]]]]
+VideoInput: TypeAlias = (
+    list[Image.Image] | list[np.ndarray] | list[tuple[np.ndarray, dict[str, Any]]]
+)
 AudioInput = list[tuple[np.ndarray, int]]
 
 
-def _resize_data(_data: Union[Image.Image, np.ndarray],
-                 size_factor: float) -> Union[Image.Image, np.ndarray]:
+MM_OPTIONS_OVERRIDES = {
+    # Qwen3-VL's default profiling video size (64x64) can cause trouble
+    # after resizing, so we override it here for testing.
+    "qwen3_vl": dict(
+        video=VideoDummyOptions(num_frames=128, width=256, height=256),
+    ),
+    "qwen3_vl_moe": dict(
+        video=VideoDummyOptions(num_frames=128, width=256, height=256),
+    ),
+}
+
+
+def _resize_data(
+    _data: Image.Image | np.ndarray, size_factor: float
+) -> Image.Image | np.ndarray:
     assert size_factor <= 1, "Size factor must be less than 1"
     # Image input
     if isinstance(_data, Image.Image):
@@ -72,20 +79,18 @@ def _resize_data(_data: Union[Image.Image, np.ndarray],
         return _data[..., :T, :H, :W, :C]
     # Audio input
     elif isinstance(_data, np.ndarray) and _data.ndim == 1:
-        return _data[:int(len(_data) * size_factor)]
+        return _data[: int(len(_data) * size_factor)]
     raise AssertionError("This line should be unreachable.")
 
 
 def resize_mm_data(
-    data: Union[ImageInput, VideoInput, AudioInput],
-    size_factors: tuple[float,
-                        ...]) -> Union[ImageInput, VideoInput, AudioInput]:
-    size_factors = size_factors[:len(data)]
+    data: ImageInput | VideoInput | AudioInput, size_factors: tuple[float, ...]
+) -> ImageInput | VideoInput | AudioInput:
+    size_factors = size_factors[: len(data)]
     if is_list_of(data, (Image.Image, np.ndarray, list)):
         return [_resize_data(d, s) for d, s in zip(data, size_factors)]
     elif is_list_of(data, tuple):
-        return [(_resize_data(d, s), meta)
-                for (d, meta), s in zip(data, size_factors)]
+        return [_resize_data(d, s) for (d, _), s in zip(data, size_factors)]
     raise ValueError("Unsupported multimodal data type.")
 
 
@@ -95,6 +100,8 @@ def create_batched_mm_kwargs(
     processor: BaseMultiModalProcessor,
     size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
 ) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
+    model_type = model_config.hf_config.model_type
+
     processing_info = processor.info
     dummy_inputs = processor.dummy_inputs
     supported_mm_limits = processing_info.get_supported_mm_limits()
@@ -105,36 +112,24 @@ def create_batched_mm_kwargs(
     processor_inputs = dummy_inputs.get_dummy_processor_inputs(
         seq_len=model_config.max_model_len,
         mm_counts=mm_counts,
+        mm_options=MM_OPTIONS_OVERRIDES.get(model_type),
     )
     mm_data = processor_inputs.mm_data
     resized_mm_data = {
         modality: resize_mm_data(data, size_factors)
         for modality, data in mm_data.items()
     }
-    # Mistral chat outputs tokens directly, rather than text prompts
-    if model_config.tokenizer_mode == "mistral":
-        images = resized_mm_data.get("image", [])
-        request = ChatCompletionRequest(messages=[
-            UserMessage(content=[
-                TextChunk(text=""),
-                *(ImageChunk(image=image) for image in images),
-            ]),
-        ])
-        tokenizer = processing_info.get_tokenizer()
-        res = tokenizer.mistral.encode_chat_completion(request)
-        prompt = res.tokens
-    else:
-        prompt = processor_inputs.prompt
+
+    # video metadata will be added back to the resized video data here.
+    text_prompt, token_prompt = get_text_token_prompts(processor, resized_mm_data)
+
     mm_kwargs = processor.apply(
-        prompt=prompt,
+        prompt=token_prompt if text_prompt is None else text_prompt,
         mm_data=resized_mm_data,
         hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         tokenization_kwargs=processor_inputs.tokenization_kwargs,
     )["mm_kwargs"].require_data()
-    items = [
-        item for modality in supported_mm_limits
-        for item in mm_kwargs[modality]
-    ]
+    items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
     return group_mm_kwargs_by_modality(
         items,
         merge_by_field_config=model_cls.merge_by_field_config,
@@ -165,38 +160,21 @@ def initialize_dummy_model(
     cleanup_dist_env_and_memory()
 
 
-def get_model_id_to_test(
-        model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
-    filtered_results = []
-    for model_arch in model_arch_list:
-        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-        if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
-            available_repos = list(
-                map(lambda model_id: (model_arch, model_id),
-                    [model_info.default, *model_info.extras.values()]))
-            filtered_results.extend(available_repos)
-        else:
-            filtered_results.append((model_arch, model_info.default))
-    return filtered_results
-
-
-@pytest.mark.parametrize(
-    "model_arch, model_id",
-    get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
-def test_model_tensor_schema(model_arch: str, model_id: str):
-    if model_arch in ARCH_TO_SKIP:
-        pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
-    if model_id in REPO_ID_TO_SKIP:
-        pytest.skip(f"Skipping {model_id} due to {REPO_ID_TO_SKIP[model_id]}")
-
-    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+@pytest.mark.parametrize("model_id", get_model_ids_to_test())
+def test_model_tensor_schema(model_id: str):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip",
-                                          check_max_version=False)
+    model_info.check_transformers_version(on_fail="skip")
 
-    hf_overrides_fn = partial(dummy_hf_overrides,
-                              model_arch=model_arch,
-                              exist_overrides=model_info.hf_overrides)
+    model_arch = next(
+        arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info
+    )
+
+    hf_overrides_fn = partial(
+        dummy_hf_overrides,
+        model_arch=model_arch,
+        exist_overrides=model_info.hf_overrides,
+    )
 
     model_config = ModelConfig(
         model_id,
@@ -205,7 +183,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=hf_overrides_fn,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
         dtype=model_info.dtype,
     )
@@ -236,13 +216,29 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
         modality: 3 if limit is None else limit
         for modality, limit in supported_mm_limits.items()
     }
-    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+
+    def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
+        if modality == "video":
+            return VideoDummyOptions(count=count)
+        if modality == "image":
+            return ImageDummyOptions(count=count)
+        if modality == "audio":
+            return AudioDummyOptions(count=count)
+        return BaseDummyOptions(count=count)
+
+    model_config.get_multimodal_config().limit_per_prompt = {
+        modality: _to_dummy_options(modality, count)
+        for modality, count in limit_mm_per_prompt.items()
+    }
     processor = factories.build_processor(ctx, cache=None)
 
     with initialize_dummy_model(model_cls, model_config) as model:
         for modality, _, mm_kwargs in create_batched_mm_kwargs(
-                model_cls, model_config, processor):
+            model_cls, model_config, processor
+        ):
             for method_name in inputs_parse_methods:
-                print(f"Testing `{method_name}` with modality={modality} "
-                      f"and mm_kwargs{list(mm_kwargs.keys())}")
+                print(
+                    f"Testing `{method_name}` with modality={modality} "
+                    f"and mm_kwargs{list(mm_kwargs.keys())}"
+                )
                 getattr(model, method_name)(modality=modality, **mm_kwargs)
diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py
index 54a0be99384a8f804190ae7729616f7ef43202f0..e2a2186f470b4ad02364e49e77ae6a7e878883c6 100644
--- a/tests/models/multimodal/processing/test_transformers.py
+++ b/tests/models/multimodal/processing/test_transformers.py
@@ -7,9 +7,7 @@ from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
-# yapf: disable
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 def test_multimodal_processor(model_id):
     model_config = ModelConfig(
         model=model_id,
@@ -18,9 +16,9 @@ def test_multimodal_processor(model_id):
 
     mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
 
-    image_pil = ImageAsset('cherry_blossom').pil_image
+    image_pil = ImageAsset("cherry_blossom").pil_image
     mm_data = {"image": image_pil}
-    str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
+    str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n"  # noqa: E501
     str_processed_inputs = mm_processor.apply(
         prompt=str_prompt,
         mm_data=mm_data,
@@ -28,8 +26,23 @@ def test_multimodal_processor(model_id):
     )
 
     ids_prompt = [
-        151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
-        30, 151645, 151644, 77091, 198
+        151644,
+        872,
+        220,
+        151646,
+        198,
+        3838,
+        374,
+        279,
+        2213,
+        315,
+        419,
+        2168,
+        30,
+        151645,
+        151644,
+        77091,
+        198,
     ]
     ids_processed_inputs = mm_processor.apply(
         prompt=ids_prompt,
@@ -37,4 +50,7 @@ def test_multimodal_processor(model_id):
         hf_processor_mm_kwargs={},
     )
 
-    assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]
+    assert (
+        str_processed_inputs["prompt_token_ids"]
+        == ids_processed_inputs["prompt_token_ids"]
+    )
diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py
index caf1966ab513f26ffccbf3edf6586b2f8a372c66..2f38dc450ef96c304ff515ddf16ea5ed040f1861 100644
--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -19,7 +19,7 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
     """Create weights from safetensors checkpoint metadata"""
     metadata = try_get_safetensors_metadata(repo)
     weight_names = list(metadata.weight_map.keys())
-    with torch.device('meta'):
+    with torch.device("meta"):
         return ((name, torch.empty(0)) for name in weight_names)
 
 
@@ -59,9 +59,12 @@ def test_hf_model_weights_mapper(model_arch: str):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
     original_weights = create_repo_dummy_weights(model_id)
@@ -83,6 +86,7 @@ def test_hf_model_weights_mapper(model_arch: str):
 
     weights_missing = ref_weight_names - weight_names
     weights_unmapped = weight_names - ref_weight_names
-    assert (not weights_missing and not weights_unmapped), (
+    assert not weights_missing and not weights_unmapped, (
         f"Following weights are not mapped correctly: {weights_unmapped}, "
-        f"Missing expected weights: {weights_missing}.")
+        f"Missing expected weights: {weights_missing}."
+    )
diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py
index e741e4ad90a094458106faa25d7d17c57b5d520b..70464cf7fb416c5946885e2f9fcaab554bf11010 100644
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -11,12 +10,12 @@ from vllm.multimodal.image import rescale_image_size
 from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
 from ..utils import check_logprobs_close
 
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
-    "cherry_blossom":
-    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
-})
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+        "cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+    }
+)
 
 
 def run_awq_test(
@@ -30,14 +29,17 @@ def run_awq_test(
     max_tokens: int,
     num_logprobs: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     images = [asset.pil_image for asset in image_assets]
 
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
@@ -46,42 +48,41 @@ def run_awq_test(
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(
-            source_model,
-            max_model_len=4096,
-            dtype=dtype,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-            default_torch_num_threads=1,
+        source_model,
+        max_model_len=4096,
+        dtype=dtype,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+        default_torch_num_threads=1,
     ) as vllm_model:
         source_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
+            vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, images=images
+            )
             for prompts, images in inputs_per_image
         ]
 
     with vllm_runner(
-            quant_model,
-            quantization="awq",
-            max_model_len=4096,
-            dtype=dtype,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-            default_torch_num_threads=1,
+        quant_model,
+        quantization="awq",
+        max_model_len=4096,
+        dtype=dtype,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+        default_torch_num_threads=1,
     ) as vllm_model:
         quant_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
+            vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, images=images
+            )
             for prompts, images in inputs_per_image
         ]
 
-    for source_outputs, quant_outputs in zip(source_outputs_per_image,
-                                             quant_outputs_per_image):
+    for source_outputs, quant_outputs in zip(
+        source_outputs_per_image, quant_outputs_per_image
+    ):
         # TODO: Check whether using original CLIPVisionModel can improve
         # consistency against HF
         check_logprobs_close(
@@ -113,9 +114,16 @@ def run_awq_test(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @torch.inference_mode()
-def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
-                    size_factors, dtype, max_tokens, num_logprobs) -> None:
-
+def test_awq_models(
+    vllm_runner,
+    image_assets,
+    source_model,
+    quant_model,
+    size_factors,
+    dtype,
+    max_tokens,
+    num_logprobs,
+) -> None:
     run_awq_test(
         vllm_runner,
         image_assets,
diff --git a/tests/models/quantization/test_bitblas.py b/tests/models/quantization/test_bitblas.py
index 754ac9a29a13254fc356f2bf67e3356672c3eff8..f516cc2724a6b0dd16c78b6cbdbaa3b7464edf8d 100644
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
 bitblas/GPTQ models are in the top 3 selections of each other.
 
 Note: bitblas internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for bitblas. As a result, we re-run the 
+result in very slight nondeterminism for bitblas. As a result, we re-run the
 test up to 3 times to see if we pass.
 """
+
 from dataclasses import dataclass
 
 import pytest
@@ -24,8 +25,10 @@ class ModelPair:
 
 
 model_pairs = [
-    ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
-              model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
+    ModelPair(
+        model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
+        model_gptq="hxbgsyxh/opt-125m-4bit-128g",
+    ),
 ]
 
 
@@ -43,16 +46,19 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    with vllm_runner(model_pair.model_bitblas,
-                     dtype=dtype,
-                     quantization="bitblas") as bitblas_model:
+    with vllm_runner(
+        model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
+    ) as bitblas_model:
         bitblas_outputs = bitblas_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
         gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py
index 25fc44fee90dd01093bb8ab030f42e620b14e384..24220978534ca178362b46af243883cc1d95f780 100644
--- a/tests/models/quantization/test_bitsandbytes.py
+++ b/tests/models/quantization/test_bitsandbytes.py
@@ -1,22 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-'''Tests whether bitsandbytes computation is enabled correctly.
+"""Tests whether bitsandbytes computation is enabled correctly.
 
 Run `pytest tests/quantization/test_bitsandbytes.py`.
-'''
+"""
 
 import pytest
 from transformers import BitsAndBytesConfig
 
 from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform
 
 from ...utils import compare_two_settings, multi_gpu_test
 from ..utils import check_embeddings_close, check_logprobs_close
 
+pytestmark = pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)",
+)
+
 models_4bit_to_test = [
     ("facebook/opt-125m", "quantize opt model inflight"),
-    ("mistralai/Mistral-7B-Instruct-v0.3",
-     "quantize inflight model with both HF and Mistral format weights")
+    (
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        "quantize inflight model with both HF and Mistral format weights",
+    ),
 ]
 
 models_4bit_to_embedding_test = [
@@ -28,72 +36,84 @@ models_4bit_to_moe_test = [
 ]
 
 models_pre_qaunt_4bit_to_test = [
-    ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
-     'read pre-quantized 4-bit FP4 model'),
-    ('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
+    (
+        "PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
+        "read pre-quantized 4-bit FP4 model",
+    ),
+    ("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
 ]
 
 models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8',
-     'read pre-quantized llama 8-bit model'),
+    ("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
     ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                             model_name, description) -> None:
-
-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True))
-    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, False, hf_model_kwargs)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
-@pytest.mark.parametrize("model_name, description",
-                         models_pre_qaunt_4bit_to_test)
-def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                                       model_name, description) -> None:
+def test_load_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
+    )
 
-    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, True)
 
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
+def test_load_pre_quant_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, True
+    )
 
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
-@pytest.mark.parametrize("model_name, description",
-                         models_pre_quant_8bit_to_test)
-def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                             model_name, description) -> None:
 
-    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, True)
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
+def test_load_8bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, True
+    )
 
 
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
 @multi_gpu_test(num_gpus=2)
-def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                                model_name, description) -> None:
-
-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True))
-    validate_generated_texts(hf_runner,
-                             vllm_runner,
-                             example_prompts[:1],
-                             model_name,
-                             False,
-                             hf_model_kwargs,
-                             vllm_tp_size=2)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+def test_load_tp_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    validate_generated_texts(
+        hf_runner,
+        vllm_runner,
+        example_prompts[:1],
+        model_name,
+        False,
+        hf_model_kwargs,
+        vllm_tp_size=2,
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
 @multi_gpu_test(num_gpus=2)
 def test_load_pp_4bit_bnb_model(model_name, description) -> None:
@@ -115,30 +135,37 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
     compare_two_settings(model_name, common_args, pp_args)
 
 
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
-def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
-                            model_name, description) -> None:
-
-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True,
-    ))
-    with vllm_runner(model_name,
-                     quantization='bitsandbytes',
-                     enforce_eager=False,
-                     default_torch_num_threads=1) as llm:
-        vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
-                                                    max_tokens=32,
-                                                    num_logprobs=5)
-
-    with hf_runner(model_name,
-                   model_kwargs=hf_model_kwargs,
-                   default_torch_num_threads=1) as llm:
+def test_4bit_bnb_moe_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+    )
+    with vllm_runner(
+        model_name,
+        quantization="bitsandbytes",
+        enforce_eager=False,
+        default_torch_num_threads=1,
+    ) as llm:
+        vllm_outputs = llm.generate_greedy_logprobs(
+            example_prompts, max_tokens=32, num_logprobs=5
+        )
+
+    with hf_runner(
+        model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
+    ) as llm:
         transformers_outputs = llm.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens=32, num_logprobs=5)
+            example_prompts, max_tokens=32, num_logprobs=5
+        )
     check_logprobs_close(
         outputs_0_lst=transformers_outputs,
         outputs_1_lst=vllm_outputs,
@@ -147,10 +174,11 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
     )
 
 
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
-@pytest.mark.parametrize("model_name, description",
-                         models_4bit_to_embedding_test)
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_4bit_bnb_embedding_model(
     model_name,
@@ -160,7 +188,6 @@ def test_4bit_bnb_embedding_model(
     example_prompts,
     dtype: str,
 ) -> None:
-
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
     # sentence_transformers will strip the input texts, see:
@@ -170,22 +197,23 @@ def test_4bit_bnb_embedding_model(
     example_prompts = [str(s).strip() for s in example_prompts]
 
     # Inflight 4bit quantization
-    with vllm_runner(model_name,
-                     runner="pooling",
-                     dtype=dtype,
-                     gpu_memory_utilization=0.5,
-                     quantization="bitsandbytes",
-                     default_torch_num_threads=1) as vllm_model:
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=dtype,
+        gpu_memory_utilization=0.5,
+        quantization="bitsandbytes",
+        default_torch_num_threads=1,
+    ) as vllm_model:
         vllm_outputs = vllm_model.embed(example_prompts)
 
-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True))
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
     with hf_runner(
-            model_name,
-            dtype=dtype,
-            model_kwargs=hf_model_kwargs,
-            is_sentence_transformer=True,
-            default_torch_num_threads=1,
+        model_name,
+        dtype=dtype,
+        model_kwargs=hf_model_kwargs,
+        is_sentence_transformer=True,
+        default_torch_num_threads=1,
     ) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
@@ -210,23 +238,25 @@ def log_generated_texts(prompts, outputs, runner_name):
     return logged_texts
 
 
-def validate_generated_texts(hf_runner,
-                             vllm_runner,
-                             prompts,
-                             model_name,
-                             pre_quant=False,
-                             hf_model_kwargs=None,
-                             vllm_tp_size=1,
-                             max_tokens=8):
-
+def validate_generated_texts(
+    hf_runner,
+    vllm_runner,
+    prompts,
+    model_name,
+    pre_quant=False,
+    hf_model_kwargs=None,
+    vllm_tp_size=1,
+    max_tokens=8,
+):
     # NOTE: run vLLM first, as it requires a clean process
     # when using distributed inference
-    with vllm_runner(model_name,
-                     quantization=None if pre_quant else 'bitsandbytes',
-                     tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=False,
-                     default_torch_num_threads=1) as llm:
-
+    with vllm_runner(
+        model_name,
+        quantization=None if pre_quant else "bitsandbytes",
+        tensor_parallel_size=vllm_tp_size,
+        enforce_eager=False,
+        default_torch_num_threads=1,
+    ) as llm:
         vllm_outputs = llm.generate_greedy(prompts, max_tokens)
         vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
 
@@ -234,9 +264,9 @@ def validate_generated_texts(hf_runner,
         hf_model_kwargs = {}
 
     # Run with HF runner
-    with hf_runner(model_name,
-                   model_kwargs=hf_model_kwargs,
-                   default_torch_num_threads=1) as llm:
+    with hf_runner(
+        model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
+    ) as llm:
         hf_outputs = llm.generate_greedy(prompts, max_tokens)
         hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
 
@@ -245,8 +275,10 @@ def validate_generated_texts(hf_runner,
         hf_str = hf_log["generated_text"]
         vllm_str = vllm_log["generated_text"]
         prompt = hf_log["prompt"]
-        assert hf_str == vllm_str, (f"Model: {model_name}"
-                                    f"Mismatch between HF and vLLM outputs:\n"
-                                    f"Prompt: {prompt}\n"
-                                    f"HF Output: '{hf_str}'\n"
-                                    f"vLLM Output: '{vllm_str}'")
+        assert hf_str == vllm_str, (
+            f"Model: {model_name}"
+            f"Mismatch between HF and vLLM outputs:\n"
+            f"Prompt: {prompt}\n"
+            f"HF Output: '{hf_str}'\n"
+            f"vLLM Output: '{vllm_str}'"
+        )
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index bb8ae741b61499089d543f165161765d8b6e1c60..2a6f34a9c4823eba25edd70ee6d78edaf0d90216 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -5,30 +5,43 @@
 """Tests fp8 models against ground truth generation
 Note: these tests will only pass on L4 GPU.
 """
+
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
+from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
 from vllm.platforms import current_platform
 from vllm.utils import STR_BACKEND_ENV_VAR
-
 from ..utils import check_logprobs_close
 
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize(
     "kv_cache_dtype,base_model,test_model",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
-         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
+        (
+            "fp8_e4m3",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
+        ),
         # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct"),
+        (
+            "fp8_e5m2",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
         # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct")
-    ])
+        (
+            "fp8_e4m3",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+    ],
+)
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("enforce_eager", [True])
@@ -54,38 +67,41 @@ def test_models(
     """
 
     if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
-        pytest.skip(
-            f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
+        pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
 
-    if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
-        pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
+    if not flash_attn_supports_fp8():
+        pytest.skip(
+            f"{kv_cache_dtype} is not supported on this GPU type with {backend} attention."
+        )
 
     with monkeypatch.context() as m:
-        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv("TOKENIZERS_PARALLELISM", "true")
         m.setenv(STR_BACKEND_ENV_VAR, backend)
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8
 
         with vllm_runner(
-                base_model,
-                max_model_len=MAX_MODEL_LEN,
-                tensor_parallel_size=tensor_parallel_size,
-                enforce_eager=enforce_eager,
-                kv_cache_dtype="auto",
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype="auto",
         ) as vllm_model:
             baseline_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
 
         with vllm_runner(
-                test_model,
-                max_model_len=MAX_MODEL_LEN,
-                tensor_parallel_size=tensor_parallel_size,
-                enforce_eager=enforce_eager,
-                kv_cache_dtype=kv_cache_dtype,
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype=kv_cache_dtype,
         ) as vllm_model:
             test_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
 
         check_logprobs_close(
             outputs_0_lst=baseline_outputs,
@@ -96,15 +112,18 @@ def test_models(
 
 
 @pytest.mark.cpu_model
-@pytest.mark.skipif(not current_platform.is_cpu(),
-                    reason="test for the CPU backend.")
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
 @pytest.mark.parametrize(
     "kv_cache_dtype,base_model,test_model",
     [
         # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct"),
-    ])
+        (
+            "fp8_e5m2",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+    ],
+)
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
 def test_cpu_models(
@@ -121,28 +140,30 @@ def test_cpu_models(
     numerical sensitive kernels.
     """
     with monkeypatch.context() as m:
-        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv("TOKENIZERS_PARALLELISM", "true")
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8
 
         with vllm_runner(
-                base_model,
-                max_model_len=MAX_MODEL_LEN,
-                dtype="bfloat16",
-                kv_cache_dtype="auto",
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype="auto",
         ) as vllm_model:
             baseline_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
 
         with vllm_runner(
-                test_model,
-                max_model_len=MAX_MODEL_LEN,
-                dtype="bfloat16",
-                kv_cache_dtype=kv_cache_dtype,
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype=kv_cache_dtype,
         ) as vllm_model:
             test_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
 
         check_logprobs_close(
             outputs_0_lst=baseline_outputs,
diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 3e77d3e710393113b93b23319050359f90fe3eb2..5e2438857aeefa004f82ac4cebc5829d2aadf32e 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -100,35 +100,37 @@ def check_model_outputs(
 ):
     tokenizer = AutoTokenizer.from_pretrained(model.original_model)
     if tokenizer.chat_template is not None:
-        messages = [[{
-            'role': 'user',
-            'content': prompt
-        }] for prompt in prompts]
-        prompts = tokenizer.apply_chat_template(messages,
-                                                tokenize=False,
-                                                add_generation_prompt=True)
+        messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
+        prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
 
     # Run gguf model.
-    with vllm_runner(model_name=model.gguf_model,
-                     enforce_eager=True,
-                     tokenizer_name=model.original_model,
-                     dtype=dtype,
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=tp_size) as gguf_model:
+    with vllm_runner(
+        model_name=model.gguf_model,
+        enforce_eager=True,
+        tokenizer_name=model.original_model,
+        dtype=dtype,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=tp_size,
+    ) as gguf_model:
         gguf_outputs = gguf_model.generate_greedy_logprobs(
-            prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs
+        )
 
     # Run unquantized model.
     # Should run with tp=1, otherwise the test will stuck at
     # nccl initialization.
     with vllm_runner(
-            model_name=model.original_model,
-            enforce_eager=True,  # faster tests
-            dtype=dtype,
-            max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=1) as original_model:
+        model_name=model.original_model,
+        enforce_eager=True,  # faster tests
+        dtype=dtype,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as original_model:
         original_outputs = original_model.generate_greedy_logprobs(
-            prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=original_outputs,
@@ -138,12 +140,14 @@ def check_model_outputs(
     )
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gguf"),
-                    reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize("model", [
-    pytest.param(test_config, marks=test_config.marks)
-    for test_config in MODELS
-])
+@pytest.mark.skipif(
+    not is_quant_method_supported("gguf"),
+    reason="gguf is not supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "model",
+    [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
+)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -157,12 +161,15 @@ def test_models(
     num_logprobs: int,
     tp_size: int,
 ) -> None:
-    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
-                        num_logprobs, tp_size)
+    check_model_outputs(
+        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
+    )
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gguf"),
-                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gguf"),
+    reason="gguf is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model", [LLAMA_CONFIG])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [8])
@@ -178,5 +185,6 @@ def test_distributed(
     num_logprobs: int,
     tp_size: int,
 ) -> None:
-    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
-                        num_logprobs, tp_size)
+    check_model_outputs(
+        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
+    )
diff --git a/tests/models/quantization/test_gpt_oss_attn_quantization.py b/tests/models/quantization/test_gpt_oss_attn_quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..780165ea2ba7a02832b535e8b0386727bb971aba
--- /dev/null
+++ b/tests/models/quantization/test_gpt_oss_attn_quantization.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test attention quantization of gpt-oss model.
+The qkv_proj and o_proj in self_attention can be either quantized or excluded.
+
+Run `pytest tests/models/quantization/test_gpt_oss_attn_quantization.py`.
+
+"""
+
+import importlib
+import importlib.metadata
+from dataclasses import dataclass
+
+import huggingface_hub
+import lm_eval
+import pytest
+from packaging import version
+
+MODEL_NAMES = ["amd/gpt-oss-20b-customized-attention-quantization"]
+
+QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse(
+    importlib.metadata.version("amd-quark")
+) >= version.parse("0.8.99")
+
+
+def has_huggingface_access(repo):
+    try:
+        huggingface_hub.list_repo_refs(repo)
+        return True
+    except huggingface_hub.errors.RepositoryNotFoundError:
+        return False
+
+
+HF_HUB_AMD_ORG_ACCESS = all(
+    [has_huggingface_access(model_name) for model_name in MODEL_NAMES]
+)
+
+
+@dataclass
+class ModelCase:
+    model_id: str
+    tp: int
+
+
+@dataclass
+class EvaluationConfig:
+    model_name: str
+
+    def get_model_args(self) -> str:
+        return (
+            f"pretrained={self.model_name},"
+            "tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.9,trust_remote_code=False"
+        )
+
+
+EXPECTED_ACCURACIES = {"arc_challenge": 0.20}
+
+
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(
+    not HF_HUB_AMD_ORG_ACCESS,
+    reason="Read access to huggingface.co/amd is required for this test.",
+)
+@pytest.mark.parametrize("model_name", MODEL_NAMES)
+@pytest.mark.parametrize("task_name, expected_accuracy", EXPECTED_ACCURACIES.items())
+def test_gpt_oss_attention_quantization(
+    model_name: str, task_name: str, expected_accuracy: float
+):
+    measured_accuracy = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=EvaluationConfig(model_name).get_model_args(),
+        tasks=task_name,
+        batch_size="auto",
+    )["results"][task_name]["acc,none"]
+
+    rtol = 0.05
+    assert (
+        measured_accuracy - rtol < expected_accuracy
+        and measured_accuracy + rtol > expected_accuracy
+    ), f"Expected: {expected_accuracy} |  Measured: {measured_accuracy}"
diff --git a/tests/models/quantization/test_gptq_bitblas.py b/tests/models/quantization/test_gptq_bitblas.py
index c3aed77525de9a7f490861a95da39ed2dc77ac51..b29c5e769ce8fe00a6e9eae4e43ea80828b6d7a6 100644
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
 bitblas/GPTQ models are in the top 3 selections of each other.
 
 Note: bitblas internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for bitblas. As a result, we re-run the 
+result in very slight nondeterminism for bitblas. As a result, we re-run the
 test up to 3 times to see if we pass.
 """
+
 from dataclasses import dataclass
 
 import pytest
@@ -41,16 +42,19 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    with vllm_runner(model_pair.model_gptq,
-                     dtype=dtype,
-                     quantization="bitblas") as bitblas_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="bitblas"
+    ) as bitblas_model:
         bitblas_outputs = bitblas_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
         gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/quantization/test_gptq_marlin.py b/tests/models/quantization/test_gptq_marlin.py
index db70a3bd2c0461d808723f8c8970444b1a4d10d4..cf52ae39214d23ab2f8ab0a3c075c4cc2fbadf36 100644
--- a/tests/models/quantization/test_gptq_marlin.py
+++ b/tests/models/quantization/test_gptq_marlin.py
@@ -9,6 +9,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for Marlin. As a result, we re-run the test
 up to 3 times to see if we pass.
 """
+
 import os
 
 import pytest
@@ -26,20 +27,20 @@ MAX_MODEL_LEN = 1024
 MODELS = [
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-
     # 4-bit, act_order==True, group_size=128
-    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
+    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
 ]
 
 
 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
-                    or current_platform.is_rocm()
-                    or not current_platform.is_cuda(),
-                    reason="gptq_marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin")
+    or current_platform.is_rocm()
+    or not current_platform.is_cuda(),
+    reason="gptq_marlin is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
 @pytest.mark.parametrize("max_tokens", [32])
@@ -55,29 +56,34 @@ def test_models(
     model_name, revision = model
 
     # Run marlin.
-    with vllm_runner(model_name=model_name,
-                     revision=revision,
-                     dtype=dtype,
-                     quantization="marlin",
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=1) as gptq_marlin_model:
-
+    with vllm_runner(
+        model_name=model_name,
+        revision=revision,
+        dtype=dtype,
+        quantization="marlin",
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as gptq_marlin_model:
         gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            example_prompts[:-1], max_tokens, num_logprobs
+        )
     _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error
 
     # Run gptq.
     # The naive gptq kernel doesn't support bf16 yet.
     # Here we always compare fp16/bf16 gpt marlin kernel
     # to fp16 gptq kernel.
-    with vllm_runner(model_name=model_name,
-                     revision=revision,
-                     dtype="half",
-                     quantization="gptq",
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=1) as gptq_model:
+    with vllm_runner(
+        model_name=model_name,
+        revision=revision,
+        dtype="half",
+        quantization="gptq",
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as gptq_model:
         gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            example_prompts[:-1], max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py
index 9b86ae95ba5c7bae625238e70caf1986159f8389..85426ee5b0898a20ff33353d490be8b61afd7aed 100644
--- a/tests/models/quantization/test_gptq_marlin_24.py
+++ b/tests/models/quantization/test_gptq_marlin_24.py
@@ -6,6 +6,7 @@ Note: GPTQ and Marlin_24 do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the
 Marlin/GPTQ models are in the top 3 selections of each other.
 """
+
 from dataclasses import dataclass
 
 import pytest
@@ -24,15 +25,18 @@ class ModelPair:
 
 model_pairs = [
     # 4-bit, group_size == 128
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
+    ModelPair(
+        model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
+        model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
+    ),
     # # 4-bit, group_size == channelwise
     # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
     #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
-
     # 8-bit, group_size == 128
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
+    ModelPair(
+        model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
+        model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
+    ),
     # # 8-bit, group_size == channelwise
     # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
     #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
@@ -40,10 +44,12 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
-                    or current_platform.is_rocm()
-                    or not current_platform.is_cuda(),
-                    reason="Marlin24 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin_24")
+    or current_platform.is_rocm()
+    or not current_platform.is_cuda(),
+    reason="Marlin24 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [8])
@@ -56,16 +62,19 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="gptq_marlin_24") as marlin_24_model:
+    with vllm_runner(
+        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
+    ) as marlin_24_model:
         marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
         gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py
index e23d4d9d211d8acb092b0a3e9d1f959921d59995..db3af972bb7788405378071e9af8a75480741cda 100644
--- a/tests/models/quantization/test_modelopt.py
+++ b/tests/models/quantization/test_modelopt.py
@@ -5,6 +5,7 @@
 """Tests Model Optimizer fp8 models against ground truth generation
 Note: these tests will only pass on H100
 """
+
 import os
 
 import pytest
@@ -22,13 +23,13 @@ MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
 EXPECTED_STRS_MAP = {
     "nvidia/Llama-3.1-8B-Instruct-FP8": [
         "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
+        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
+        "The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
         'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-        '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
+        "**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
+        "The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
+        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
+        "Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
     ]
 }
 
@@ -39,10 +40,12 @@ EXPECTED_STRS_MAP = {
 # the hardware being run on.
 # Disabled to prevent it from breaking the build
 @pytest.mark.skip(
-    reason=
-    "Prevent unstable test based on golden strings from breaking the build.")
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
+    reason="Prevent unstable test based on golden strings from breaking the build."
+)
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
     llm = LLM(
@@ -55,12 +58,11 @@ def test_models(example_prompts, model_name) -> None:
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
         for prompt in example_prompts
     ]
     params = SamplingParams(max_tokens=20, temperature=0)
@@ -78,4 +80,5 @@ def test_models(example_prompts, model_name) -> None:
         generated_str = generations[i]
         expected_str = expected_strs[i]
         assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
+        )
diff --git a/tests/models/quantization/test_mxfp4.py b/tests/models/quantization/test_mxfp4.py
index 7b8a334bbc369f2f61090c488bb744db581d3169..d598e405be817da10d90228301be14403feb082e 100644
--- a/tests/models/quantization/test_mxfp4.py
+++ b/tests/models/quantization/test_mxfp4.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # flake8: noqa
-"""Tests Quark mxfp4 models against ground truth generation
-"""
+"""Tests Quark mxfp4 models against ground truth generation"""
+
 import pytest
 
 from vllm import LLM, SamplingParams
@@ -11,13 +11,13 @@ MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
 
 EXPECTED_STRS_MAP = {
     "amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
-        '\n### Key Features\n\n* **High-throughput Inference**: vLL',
-        '\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
-        'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
-        'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
-        '\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
-        '\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
-        'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
+        "\n### Key Features\n\n* **High-throughput Inference**: vLL",
+        "\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
+        "Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
+        "A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
+        "\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
+        "\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
+        "The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
         " everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
     ]
 }
@@ -38,4 +38,5 @@ def test_models(example_prompts, model_name) -> None:
         output_str = output.outputs[0].text
         expected_str = EXPECTED_STRS_MAP[model_name][i]
         assert expected_str == output_str, (
-            f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
+            f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
+        )
diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py
index b3c217e729e4a3c9abed31d2631eb97475d9393b..9f45f142d68b16d8648bd3e080d5d5c9163b0c6c 100644
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -4,6 +4,7 @@
 """Tests Model Optimizer nvfp4 models against ground truth generation
 Note: these tests will only pass on B200
 """
+
 import os
 from typing import List
 
@@ -21,14 +22,14 @@ MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
 
 EXPECTED_STRS_MAP = {
     "nvidia/Llama-3.3-70B-Instruct-FP4": [
-        'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
-        'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
-        'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
+        "vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
+        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
+        "Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
+        "A neural network is a type of machine learning model inspired by the structure and function of the human brain",
+        "In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
+        "The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
+        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
+        "Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
     ]
 }
 
@@ -39,11 +40,13 @@ EXPECTED_STRS_MAP = {
 # the hardware being run on.
 # Disabled to prevent it from breaking the build
 @pytest.mark.skip(
-    reason=
-    "Prevent unstable test based on golden strings from breaking the build "
-    " and test input model being too large and hanging the system.")
-@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
-                    reason="modelopt_fp4 is not supported on this GPU type.")
+    reason="Prevent unstable test based on golden strings from breaking the build "
+    " and test input model being too large and hanging the system."
+)
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt_fp4"),
+    reason="modelopt_fp4 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
     llm = LLM(
@@ -56,12 +59,11 @@ def test_models(example_prompts, model_name) -> None:
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
         for prompt in example_prompts
     ]
     params = SamplingParams(max_tokens=20, temperature=0)
@@ -79,4 +81,5 @@ def test_models(example_prompts, model_name) -> None:
         generated_str = generations[i]
         expected_str = expected_strs[i]
         assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
+        )
diff --git a/tests/models/registry.py b/tests/models/registry.py
index c19ba23cd7cda9eb7ed3dfe6743324b702821810..094f921e4305f4c7c0db2fd8cd209b2c56421610 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -3,14 +3,13 @@
 
 from collections.abc import Mapping, Set
 from dataclasses import dataclass, field
-from typing import Any, Literal, Optional
+from typing import Any, Literal
 
 import pytest
-import torch
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
 
-from vllm.config import ModelDType, TokenizerMode
+from vllm.config.model import ModelDType, TokenizerMode
 
 
 @dataclass(frozen=True)
@@ -21,36 +20,42 @@ class _HfExamplesInfo:
     extras: Mapping[str, str] = field(default_factory=dict)
     """Extra models to use for testing this architecture."""
 
-    tokenizer: Optional[str] = None
+    tokenizer: str | None = None
     """Set the tokenizer to load for this architecture."""
 
     tokenizer_mode: TokenizerMode = "auto"
     """Set the tokenizer type for this architecture."""
 
-    speculative_model: Optional[str] = None
+    speculative_model: str | None = None
     """
     The default model to use for testing this architecture, which is only used
     for speculative decoding.
     """
 
-    min_transformers_version: Optional[str] = None
+    speculative_method: str | None = None
+    """
+    The method to use for speculative decoding.
+    """
+
+    min_transformers_version: str | None = None
     """
     The minimum version of HF Transformers that is required to run this model.
     """
 
-    max_transformers_version: Optional[str] = None
+    max_transformers_version: str | None = None
     """
     The maximum version of HF Transformers that this model runs on.
     """
 
-    transformers_version_reason: Optional[str] = None
+    transformers_version_reason: str | None = None
     """
     The reason for the minimum/maximum version requirement.
     """
 
-    skip_tokenizer_init: bool = False
+    require_embed_inputs: bool = False
     """
-    If true, skip initialization of tokenizer and detokenizer. 
+    If `True`, enables prompt and multi-modal embedding inputs while
+    disabling tokenization.
     """
 
     dtype: ModelDType = "auto"
@@ -67,34 +72,31 @@ class _HfExamplesInfo:
 
     is_available_online: bool = True
     """
-    Set this to ``False`` if the name of this architecture no longer exists on
+    Set this to `False` if the name of this architecture no longer exists on
     the HF repo. To maintain backwards compatibility, we have not removed them
     from the main model registry, so without this flag the registry tests will
     fail.
     """
 
     trust_remote_code: bool = False
-    """The ``trust_remote_code`` level required to load the model."""
-
-    v0_only: bool = False
-    """The model is only available with the vLLM V0 engine."""
+    """The `trust_remote_code` level required to load the model."""
 
     hf_overrides: dict[str, Any] = field(default_factory=dict)
-    """The ``hf_overrides`` required to load the model."""
+    """The `hf_overrides` required to load the model."""
 
-    max_model_len: Optional[int] = None
+    max_model_len: int | None = None
     """
     The maximum model length to use for this model. Some models default to a
     length that is too large to fit into memory in CI.
     """
 
-    revision: Optional[str] = None
+    revision: str | None = None
     """
     The specific revision (commit hash, tag, or branch) to use for the model.
     If not specified, the default revision will be used.
     """
 
-    max_num_seqs: Optional[int] = None
+    max_num_seqs: int | None = None
     """Maximum number of sequences to be processed in a single iteration."""
 
     use_original_num_layers: bool = False
@@ -109,13 +111,15 @@ class _HfExamplesInfo:
         on_fail: Literal["error", "skip", "return"],
         check_min_version: bool = True,
         check_max_version: bool = True,
-    ) -> Optional[str]:
+    ) -> str | None:
         """
         If the installed transformers version does not meet the requirements,
         perform the given action.
         """
-        if (self.min_transformers_version is None
-                and self.max_transformers_version is None):
+        if (
+            self.min_transformers_version is None
+            and self.max_transformers_version is None
+        ):
             return None
 
         current_version = TRANSFORMERS_VERSION
@@ -125,11 +129,17 @@ class _HfExamplesInfo:
         msg = f"`transformers=={current_version}` installed, but `transformers"
         # Only check the base version for the min/max version, otherwise preview
         # models cannot be run because `x.yy.0.dev0`<`x.yy.0`
-        if (check_min_version and min_version
-                and Version(cur_base_version) < Version(min_version)):
+        if (
+            check_min_version
+            and min_version
+            and Version(cur_base_version) < Version(min_version)
+        ):
             msg += f">={min_version}` is required to run this model."
-        elif (check_max_version and max_version
-              and Version(cur_base_version) > Version(max_version)):
+        elif (
+            check_max_version
+            and max_version
+            and Version(cur_base_version) > Version(max_version)
+        ):
             msg += f"<={max_version}` is required to run this model."
         else:
             return None
@@ -161,505 +171,767 @@ class _HfExamplesInfo:
                 pytest.skip(msg)
 
 
-# yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
     # [Decoder-only]
-    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-2509",
-                                          min_transformers_version="4.56.0",
-                                          trust_remote_code=True),
-    "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
-                                   trust_remote_code=True),
-    "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B",
-                                         trust_remote_code=True),
+    "AfmoeForCausalLM": _HfExamplesInfo(
+        "arcee-ai/Trinity-Nano",
+        is_available_online=False,
+    ),
+    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"),
+    "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True),
+    "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True),
     "ArceeForCausalLM": _HfExamplesInfo("arcee-ai/AFM-4.5B-Base"),
-    "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct",
-                                         trust_remote_code=True),
-    "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B",
-                                         trust_remote_code=True),
-    "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
-                                         trust_remote_code=True),
-    "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
-                                         trust_remote_code=True),
-    "BailingMoeV2ForCausalLM": _HfExamplesInfo("inclusionAI/Ling-mini-2.0",
-                                         trust_remote_code=True),
-    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
-                                        min_transformers_version="4.55.3",
-                                        extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
-    "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
-                                        {"1b": "bigscience/bloomz-1b1"}),
-    "ChatGLMModel": _HfExamplesInfo("zai-org/chatglm3-6b",
-                                    trust_remote_code=True,
-                                    max_transformers_version="4.48"),
-    "ChatGLMForConditionalGeneration": _HfExamplesInfo("thu-coai/ShieldLM-6B-chatglm3",  # noqa: E501
-                                                       trust_remote_code=True),
-    "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
-                                         trust_remote_code=True),
-    "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501
-                                         trust_remote_code=True),
-    "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", # noqa: E501
-                                      trust_remote_code=True,
-                                      is_available_online=False),
+    "ArcticForCausalLM": _HfExamplesInfo(
+        "Snowflake/snowflake-arctic-instruct", trust_remote_code=True
+    ),
+    "BaiChuanForCausalLM": _HfExamplesInfo(
+        "baichuan-inc/Baichuan-7B", trust_remote_code=True
+    ),
+    "BaichuanForCausalLM": _HfExamplesInfo(
+        "baichuan-inc/Baichuan2-7B-chat", trust_remote_code=True
+    ),
+    "BailingMoeForCausalLM": _HfExamplesInfo(
+        "inclusionAI/Ling-lite-1.5", trust_remote_code=True
+    ),
+    "BailingMoeV2ForCausalLM": _HfExamplesInfo(
+        "inclusionAI/Ling-mini-2.0", trust_remote_code=True
+    ),
+    "BambaForCausalLM": _HfExamplesInfo(
+        "ibm-ai-platform/Bamba-9B-v1",
+        extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"},
+    ),
+    "BloomForCausalLM": _HfExamplesInfo(
+        "bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}
+    ),
+    "ChatGLMModel": _HfExamplesInfo(
+        "zai-org/chatglm3-6b", trust_remote_code=True, max_transformers_version="4.48"
+    ),
+    "ChatGLMForConditionalGeneration": _HfExamplesInfo(
+        "thu-coai/ShieldLM-6B-chatglm3",
+        trust_remote_code=True,
+    ),
+    "CohereForCausalLM": _HfExamplesInfo(
+        "CohereForAI/c4ai-command-r-v01", trust_remote_code=True
+    ),
+    "Cohere2ForCausalLM": _HfExamplesInfo(
+        "CohereForAI/c4ai-command-r7b-12-2024",
+        trust_remote_code=True,
+    ),
+    "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"),
     "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
-    "DeciLMForCausalLM": _HfExamplesInfo("nvidia/Llama-3_3-Nemotron-Super-49B-v1", # noqa: E501
-                                         trust_remote_code=True),
-    "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"),
-    "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat",  # noqa: E501
-                                         trust_remote_code=True),
-    "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3",  # noqa: E501
-                                         trust_remote_code=True),
+    "DeciLMForCausalLM": _HfExamplesInfo(
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+        trust_remote_code=True,
+    ),
+    "DeepseekForCausalLM": _HfExamplesInfo(
+        "deepseek-ai/deepseek-moe-16b-base",
+        trust_remote_code=True,
+    ),
+    "DeepseekV2ForCausalLM": _HfExamplesInfo(
+        "deepseek-ai/DeepSeek-V2-Lite-Chat",
+        trust_remote_code=True,
+    ),
+    "DeepseekV3ForCausalLM": _HfExamplesInfo(
+        "deepseek-ai/DeepSeek-V3",
+        trust_remote_code=True,
+    ),
     "DeepseekV32ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3.2-Exp"),
-    "Ernie4_5ForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-0.3B-PT",
-                                            min_transformers_version="4.54"),
-    "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT",
-                                               min_transformers_version="4.54"),
-    "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
-                                         trust_remote_code=True),
-    "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B",
-                                          min_transformers_version="4.54"),
-    "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
+    "Ernie4_5ForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-0.3B-PT"),
+    "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT"),
+    "ExaoneForCausalLM": _HfExamplesInfo(
+        "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", trust_remote_code=True
+    ),
+    "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"),
+    "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
-    "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"),
+    "FalconH1ForCausalLM": _HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"),
+    "FlexOlmoForCausalLM": _HfExamplesInfo("allenai/Flex-reddit-2x7B-1T"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
-    "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
+    "Gemma2ForCausalLM": _HfExamplesInfo(
+        "google/gemma-2-9b", extras={"tiny": "google/gemma-2-2b-it"}
+    ),
     "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
-    "Gemma3nForCausalLM": _HfExamplesInfo("google/gemma-3n-E2B-it",
-                                          min_transformers_version="4.53"),
+    "Gemma3nForCausalLM": _HfExamplesInfo("google/gemma-3n-E2B-it"),
     "GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"),
     "Glm4ForCausalLM": _HfExamplesInfo("zai-org/GLM-4-9B-0414"),
-    "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5",
-                                          min_transformers_version="4.54"),   # noqa: E501
-    "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
-                                       {"alias": "gpt2"}),
-    "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
-                                             extras={"tiny": "bigcode/tiny_starcoder_py"},  # noqa: E501
-                                             min_transformers_version="4.55.1",
-                                             transformers_version_reason="HF model broken in 4.55.0"),  # noqa: E501
-    "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M",
-                                       {"6b": "EleutherAI/gpt-j-6b"}),
-    "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
-                                          {"1b": "EleutherAI/pythia-1.4b"}),
+    "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5"),
+    "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
+    "GPTBigCodeForCausalLM": _HfExamplesInfo(
+        "bigcode/starcoder",
+        extras={
+            "tiny": "bigcode/tiny_starcoder_py",
+            "santacoder": "bigcode/gpt_bigcode-santacoder",
+        },
+    ),
+    "GPTJForCausalLM": _HfExamplesInfo(
+        "Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"}
+    ),
+    "GPTNeoXForCausalLM": _HfExamplesInfo(
+        "EleutherAI/pythia-70m", {"1b": "EleutherAI/pythia-1.4b"}
+    ),
     "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
-                                                   min_transformers_version="4.55.3"),
-    "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
-    "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
-                                             trust_remote_code=True),
-    "HunYuanMoEV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-A13B-Instruct",
-                                               trust_remote_code=True),
-    # TODO: Remove is_available_online once their config.json is fixed
-    "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
-                                                trust_remote_code=True,
-                                                is_available_online=False),
-    "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
-                                           trust_remote_code=True),
-    "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
-                                            trust_remote_code=True),
-    "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B",
-                                              trust_remote_code=True),
-    "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct",
-                                            trust_remote_code=True),
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo(
+        "ibm-granite/granite-4.0-tiny-preview"
+    ),
+    "GraniteMoeSharedForCausalLM": _HfExamplesInfo(
+        "ibm-research/moe-7b-1b-active-shared-experts"
+    ),
+    "Grok1ModelForCausalLM": _HfExamplesInfo(
+        "hpcai-tech/grok-1", trust_remote_code=True
+    ),
+    "HunYuanDenseV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-7B-Instruct"),
+    "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
+        "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
+    ),
+    "InternLMForCausalLM": _HfExamplesInfo(
+        "internlm/internlm-chat-7b", trust_remote_code=True
+    ),
+    "InternLM2ForCausalLM": _HfExamplesInfo(
+        "internlm/internlm2-chat-7b", trust_remote_code=True
+    ),
+    "InternLM2VEForCausalLM": _HfExamplesInfo(
+        "OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
+    ),
+    "InternLM3ForCausalLM": _HfExamplesInfo(
+        "internlm/internlm3-8b-instruct", trust_remote_code=True
+    ),
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
-    "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
-                                        min_transformers_version="4.55.3",
-                                        extras={
-                                            "tiny": "ai21labs/Jamba-tiny-dev",
-                                            "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
-                                        }),
-    "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B",
-                                       min_transformers_version="4.54"),
-    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
-                                        extras={"guard": "meta-llama/Llama-Guard-3-1B",  # noqa: E501
-                                                "hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501
-                                                "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}),  # noqa: E501
-    "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
-                                        is_available_online=False),
-    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
-                                         is_available_online=False),
-    "LongcatFlashForCausalLM": _HfExamplesInfo
-                ("meituan-longcat/LongCat-Flash-Chat", trust_remote_code=True),
+    "JambaForCausalLM": _HfExamplesInfo(
+        "ai21labs/AI21-Jamba-1.5-Mini",
+        extras={
+            "tiny": "ai21labs/Jamba-tiny-dev",
+            "random": "ai21labs/Jamba-tiny-random",
+        },
+    ),
+    "KimiLinearForCausalLM": _HfExamplesInfo(
+        "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True
+    ),
+    "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B"),
+    "Lfm2MoeForCausalLM": _HfExamplesInfo(
+        "LiquidAI/LFM2-8B-A1B", min_transformers_version="4.58"
+    ),
+    "LlamaForCausalLM": _HfExamplesInfo(
+        "meta-llama/Llama-3.2-1B-Instruct",
+        extras={
+            "guard": "meta-llama/Llama-Guard-3-1B",
+            "hermes": "NousResearch/Hermes-3-Llama-3.1-8B",
+            "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+            "tiny": "hmellor/tiny-random-LlamaForCausalLM",
+        },
+    ),
+    "LLaMAForCausalLM": _HfExamplesInfo(
+        "decapoda-research/llama-7b-hf", is_available_online=False
+    ),
+    "Llama4ForCausalLM": _HfExamplesInfo(
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    ),
+    "LongcatFlashForCausalLM": _HfExamplesInfo(
+        "meituan-longcat/LongCat-Flash-Chat", trust_remote_code=True
+    ),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
-                                         min_transformers_version="4.55.3",
-                                         extras={
-                                            "random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501
-                                         }),
-    "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
-    "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
-                                         trust_remote_code=True),
-    "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B",
-                                         trust_remote_code=True),
+    "Mamba2ForCausalLM": _HfExamplesInfo(
+        "mistralai/Mamba-Codestral-7B-v0.1",
+        extras={
+            "random": "yujiepan/mamba2-codestral-v0.1-tiny-random",
+        },
+    ),
+    "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),
+    "MiniCPMForCausalLM": _HfExamplesInfo(
+        "openbmb/MiniCPM-2B-sft-bf16", trust_remote_code=True
+    ),
+    "MiniCPM3ForCausalLM": _HfExamplesInfo(
+        "openbmb/MiniCPM3-4B", trust_remote_code=True
+    ),
     "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"),
-    "MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01",
-                                                trust_remote_code=True,
-                                                revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"),  # noqa: E501
-    "MiniMaxM1ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-M1-40k",
-                                            trust_remote_code=True),
+    "MiniMaxText01ForCausalLM": _HfExamplesInfo(
+        "MiniMaxAI/MiniMax-Text-01",
+        trust_remote_code=True,
+        revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3",
+    ),
+    "MiniMaxM1ForCausalLM": _HfExamplesInfo(
+        "MiniMaxAI/MiniMax-M1-40k", trust_remote_code=True
+    ),
+    "MiniMaxM2ForCausalLM": _HfExamplesInfo(
+        "MiniMaxAI/MiniMax-M2",
+        trust_remote_code=True,
+    ),
     "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
-    "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1",  # noqa: E501
-                                          {"tiny": "TitanML/tiny-mixtral"}),  # noqa: E501
-    "MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
-                                        trust_remote_code=True,
-                                        v0_only=True),
+    "MixtralForCausalLM": _HfExamplesInfo(
+        "mistralai/Mixtral-8x7B-Instruct-v0.1",
+        {"tiny": "TitanML/tiny-mixtral"},
+    ),
     "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
-    "NemotronHForCausalLM": _HfExamplesInfo("nvidia/Nemotron-H-8B-Base-8K",
-                                            trust_remote_code=True),
+    "NemotronHForCausalLM": _HfExamplesInfo(
+        "nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True
+    ),
     "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
     "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
     "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"),
     "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
-    "OPTForCausalLM": _HfExamplesInfo("facebook/opt-125m",
-                                      {"1b": "facebook/opt-iml-max-1.3b"}),
-    "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
-                                        trust_remote_code=True),
+    "OpenPanguMTPModel": _HfExamplesInfo(
+        "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
+    "OPTForCausalLM": _HfExamplesInfo(
+        "facebook/opt-125m", {"1b": "facebook/opt-iml-max-1.3b"}
+    ),
+    "OrionForCausalLM": _HfExamplesInfo(
+        "OrionStarAI/Orion-14B-Chat", trust_remote_code=True
+    ),
+    "OuroForCausalLM": _HfExamplesInfo("ByteDance/Ouro-1.4B", trust_remote_code=True),
+    "PanguEmbeddedForCausalLM": _HfExamplesInfo(
+        "FreedomIntelligence/openPangu-Embedded-7B-V1.1", trust_remote_code=True
+    ),
+    "PanguUltraMoEForCausalLM": _HfExamplesInfo(
+        "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
     "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
     "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
     "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
-    "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
-                                         trust_remote_code=True),
-    "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
-                                         max_transformers_version="4.55.4",
-                                         transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
-                                         trust_remote_code=True),
-    "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
-                                       max_transformers_version="4.53",
-                                       transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
-                                       trust_remote_code=True),
-    "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-0.5B-Instruct",
-                                        extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501
+    "PhiMoEForCausalLM": _HfExamplesInfo(
+        "microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True
+    ),
+    "Plamo2ForCausalLM": _HfExamplesInfo(
+        "pfnet/plamo-2-1b",
+        trust_remote_code=True,
+    ),
+    "QWenLMHeadModel": _HfExamplesInfo(
+        "Qwen/Qwen-7B-Chat",
+        max_transformers_version="4.53",
+        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        trust_remote_code=True,
+    ),
+    "Qwen2ForCausalLM": _HfExamplesInfo(
+        "Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}
+    ),
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
-    "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
-                                            extras={"tiny-random": "tiny-random/qwen3-next-moe"}, # noqa: E501
-                                            min_transformers_version="4.56.3"),
+    "Qwen3NextForCausalLM": _HfExamplesInfo(
+        "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        extras={"tiny-random": "tiny-random/qwen3-next-moe"},
+        min_transformers_version="4.56.3",
+    ),
     "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
-    "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501
-                                          trust_remote_code=True,
-                                          is_available_online=False),
+    "SeedOssForCausalLM": _HfExamplesInfo(
+        "ByteDance-Seed/Seed-OSS-36B-Instruct",
+        trust_remote_code=True,
+    ),
     "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"),
-    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),  # noqa: E501
+    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),
     "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
     "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
-    "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3",
-                                            trust_remote_code=True),
-    "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct",
-                                        trust_remote_code=True),
-    "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
-                                            trust_remote_code=True),
-    "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
-                                            trust_remote_code=True),
-    "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
-                                         tokenizer="meta-llama/Llama-2-7b",
-                                         trust_remote_code=True),
+    "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", trust_remote_code=True),
+    "SolarForCausalLM": _HfExamplesInfo(
+        "upstage/solar-pro-preview-instruct", trust_remote_code=True
+    ),
+    "TeleChat2ForCausalLM": _HfExamplesInfo(
+        "Tele-AI/TeleChat2-3B", trust_remote_code=True
+    ),
+    "TeleFLMForCausalLM": _HfExamplesInfo(
+        "CofeAI/FLM-2-52B-Instruct-2407", trust_remote_code=True
+    ),
+    "XverseForCausalLM": _HfExamplesInfo(
+        "xverse/XVERSE-7B-Chat",
+        tokenizer="meta-llama/Llama-2-7b",
+        trust_remote_code=True,
+    ),
     "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
-    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
-                                        trust_remote_code=True),
+    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
     "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
 }
 
 _EMBEDDING_EXAMPLE_MODELS = {
     # [Text-only]
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
-    "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),  # noqa: E501
+    "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
     "Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"),
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
-    "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
-                                               trust_remote_code=True),
-    "GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5",
-                                   trust_remote_code=True,
-                                   hf_overrides={"architectures": ["GteNewModel"]}),  # noqa: E501
-    "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
-                                               trust_remote_code=True),
-    "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
+    "GteModel": _HfExamplesInfo(
+        "Snowflake/snowflake-arctic-embed-m-v2.0", trust_remote_code=True
+    ),
+    "GteNewModel": _HfExamplesInfo(
+        "Alibaba-NLP/gte-base-en-v1.5",
+        trust_remote_code=True,
+        hf_overrides={"architectures": ["GteNewModel"]},
+    ),
+    "InternLM2ForRewardModel": _HfExamplesInfo(
+        "internlm/internlm2-1_8b-reward", trust_remote_code=True
+    ),
+    "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),
     "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
-    "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
-                                trust_remote_code=True),
-    "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
-                                               trust_remote_code=True),  # noqa: E501
+    "ModernBertModel": _HfExamplesInfo(
+        "Alibaba-NLP/gte-modernbert-base", trust_remote_code=True
+    ),
+    "NomicBertModel": _HfExamplesInfo(
+        "nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True
+    ),
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
-    "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B",
-                                           max_transformers_version="4.53",
-                                           transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"),  # noqa: E501
-    "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B",
-                                                  max_transformers_version="4.53",
-                                                  transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"),  # noqa: E501
-    "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"),  # noqa: E501
-    "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"),  # noqa: E501
-    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),  # noqa: E501
+    "Qwen2ForRewardModel": _HfExamplesInfo(
+        "Qwen/Qwen2.5-Math-RM-72B",
+        max_transformers_version="4.53",
+        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+    ),
+    "Qwen2ForProcessRewardModel": _HfExamplesInfo(
+        "Qwen/Qwen2.5-Math-PRM-7B",
+        max_transformers_version="4.53",
+        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+    ),
+    "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"),
+    "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"),
+    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),
+    "BertSpladeSparseEmbeddingModel": _HfExamplesInfo(
+        "naver/splade-v3",
+        hf_overrides={"architectures": ["BertSpladeSparseEmbeddingModel"]},
+    ),
     # [Multimodal]
+    "CLIPModel": _HfExamplesInfo("openai/clip-vit-base-patch32"),
     "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
-    "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
-                                         trust_remote_code=True),
-    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
-    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
-                                            dtype=torch.float16,
-                                            enforce_eager=True,
-                                            skip_tokenizer_init=True,
-                                            # This is to avoid the model
-                                            # going OOM in CI
-                                            max_num_seqs=32,
-                                            ),
-    "Terratorch": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
-                                  dtype=torch.float16,
-                                  enforce_eager=True,
-                                  skip_tokenizer_init=True,
-                                  # This is to avoid the model going OOM in CI
-                                  max_num_seqs=32,
-                                  ),
+    "Phi3VForCausalLM": _HfExamplesInfo(
+        "TIGER-Lab/VLM2Vec-Full", trust_remote_code=True
+    ),
+    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"),
+    "SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"),
+    "PrithviGeoSpatialMAE": _HfExamplesInfo(
+        "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
+        dtype="float16",
+        enforce_eager=True,
+        require_embed_inputs=True,
+        # This is to avoid the model going OOM in CI
+        max_num_seqs=32,
+    ),
+    "Terratorch": _HfExamplesInfo(
+        "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
+        dtype="float16",
+        enforce_eager=True,
+        require_embed_inputs=True,
+        # This is to avoid the model going OOM in CI
+        max_num_seqs=32,
+    ),
 }
 
 _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
     # [Decoder-only]
-    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
-
+    "GPT2ForSequenceClassification": _HfExamplesInfo(
+        "nie3e/sentiment-polish-gpt2-small"
+    ),
     # [Cross-encoder]
-    "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"),  # noqa: E501
+    "BertForSequenceClassification": _HfExamplesInfo(
+        "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    ),
     "BertForTokenClassification": _HfExamplesInfo("boltuix/NeuroBERT-NER"),
-    "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base",  # noqa: E501
-                                                       trust_remote_code=True,
-                                                       hf_overrides={
-                                                           "architectures": ["GteNewForSequenceClassification"]}),# noqa: E501
-    "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501
-    "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"),  # noqa: E501
-    "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"),  # noqa: E501
+    "GteNewForSequenceClassification": _HfExamplesInfo(
+        "Alibaba-NLP/gte-multilingual-reranker-base",
+        trust_remote_code=True,
+        hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
+    ),
+    "ModernBertForSequenceClassification": _HfExamplesInfo(
+        "Alibaba-NLP/gte-reranker-modernbert-base"
+    ),
+    "ModernBertForTokenClassification": _HfExamplesInfo(
+        "disham993/electrical-ner-ModernBERT-base"
+    ),
+    "RobertaForSequenceClassification": _HfExamplesInfo(
+        "cross-encoder/quora-roberta-base"
+    ),
+    "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"),
 }
 
 _AUTOMATIC_CONVERTED_MODELS = {
     # Use as_seq_cls_model for automatic conversion
-    "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma",  # noqa: E501
-                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
-                                                                    "classifier_from_token": ["Yes"],  # noqa: E501
-                                                                    "method": "no_post_processing"}),  # noqa: E501
-    "LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"),  # noqa: E501
-    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
-    "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),  # noqa: E501
+    "GemmaForSequenceClassification": _HfExamplesInfo(
+        "BAAI/bge-reranker-v2-gemma",
+        hf_overrides={
+            "architectures": ["GemmaForSequenceClassification"],
+            "classifier_from_token": ["Yes"],
+            "method": "no_post_processing",
+        },
+    ),
+    "LlamaForSequenceClassification": _HfExamplesInfo(
+        "Skywork/Skywork-Reward-V2-Llama-3.2-1B"
+    ),
+    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),
+    "Qwen3ForSequenceClassification": _HfExamplesInfo(
+        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
+    ),
 }
 
 _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
-    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
-    "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
-                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
-    "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
-    "Cohere2VisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/command-a-vision-07-2025"), # noqa: E501
-    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
-                                                extras={"fork": "Isotr0py/deepseek-vl2-tiny"},  # noqa: E501
-                                                max_transformers_version="4.48",  # noqa: E501
-                                                transformers_version_reason="HF model is not compatible.",  # noqa: E501
-                                                hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
-    "DotsOCRForCausalLM": _HfExamplesInfo("rednote-hilab/dots.ocr",
-                                          trust_remote_code=True),
+    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"),
+    "BeeForConditionalGeneration": _HfExamplesInfo(
+        "Open-Bee/Bee-8B-RL",
+        trust_remote_code=True,
+    ),
+    "Blip2ForConditionalGeneration": _HfExamplesInfo(
+        "Salesforce/blip2-opt-2.7b",
+        extras={"6b": "Salesforce/blip2-opt-6.7b"},
+    ),
+    "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),
+    "Cohere2VisionForConditionalGeneration": _HfExamplesInfo(
+        "CohereLabs/command-a-vision-07-2025"
+    ),
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo(
+        "deepseek-ai/deepseek-vl2-tiny",
+        extras={"fork": "Isotr0py/deepseek-vl2-tiny"},
+        max_transformers_version="4.48",
+        transformers_version_reason="HF model is not compatible.",
+        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+    ),
+    "DeepseekOCRForCausalLM": _HfExamplesInfo(
+        "deepseek-ai/DeepSeek-OCR",
+    ),
+    "DotsOCRForCausalLM": _HfExamplesInfo(
+        "rednote-hilab/dots.ocr", trust_remote_code=True
+    ),
     "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
-    "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT",  # noqa: E501
-                                                              trust_remote_code=True),
+    "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
+        "baidu/ERNIE-4.5-VL-28B-A3B-PT",
+        trust_remote_code=True,
+    ),
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
-    "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it",    # noqa: E501
-                                        min_transformers_version="4.53"),
-    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"),  # noqa: E501
-    "GLM4VForCausalLM": _HfExamplesInfo("zai-org/glm-4v-9b",
-                                        trust_remote_code=True,
-                                        hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
-    "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),  # noqa: E501
-    "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V",
-                                                        min_transformers_version="4.56"),  # noqa: E501
-    "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
-                                      trust_remote_code=True,
-                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"},  # noqa: E501
-                                      max_transformers_version="4.48",  # noqa: E501
-                                      transformers_version_reason="HF model is not compatible."),  # noqa: E501
-    "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",  # noqa: E501
-                                            trust_remote_code=True),
-    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
-                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},    # noqa: E501
-                                                        min_transformers_version="4.56",
-                                                        transformers_version_reason="HF model broken in 4.55"),  # noqa: E501
-    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
-                                                        trust_remote_code=True),  # noqa: E501
-    "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
-                                         extras={"2B": "OpenGVLab/InternVL2-2B",
-                                                 "3.0": "OpenGVLab/InternVL3-1B",   # noqa: E501
-                                                 "3.5-qwen3": "OpenGVLab/InternVL3_5-1B",   # noqa: E501
-                                                 "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B",   # noqa: E501
-                                                 "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"},  # noqa: E501
-                                         trust_remote_code=True),
-    "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),    # noqa: E501
-    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
-                                                    trust_remote_code=True),
-    "KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
-                                                         trust_remote_code=True),
-    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
-                                                      extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
-                                                      trust_remote_code=True),
-    "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
-                                                      max_model_len=10240,
-                                                      extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"},  # noqa: E501
-                                                      ),
-    "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
-                                                     extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
-                                                             "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
-    "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"),  # noqa: E501
-    "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"),  # noqa: E501
-    "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
-    "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3",  # noqa: E501
-                                                      max_transformers_version="4.48",  # noqa: E501
-                                                      transformers_version_reason="HF model is not compatible.",  # noqa: E501
-                                                      hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
-    "MiDashengLMModel": _HfExamplesInfo("mispeech/midashenglm-7b",
-                            trust_remote_code=True),
-    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
-                                trust_remote_code=True),
-    "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
-                                extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4", "4.5": "openbmb/MiniCPM-V-4_5"},  # noqa: E501
-                                trust_remote_code=True),
-    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
-                                              trust_remote_code=True,
-                                              v0_only=True),
-    "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503",  # noqa: E501
-                                                        extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}),  # noqa: E501
-    "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
-                                        max_transformers_version="4.48",
-                                        transformers_version_reason="Incorrectly-detected `tensorflow` import.",  # noqa: E501
-                                        extras={"olmo": "allenai/Molmo-7B-O-0924"},  # noqa: E501
-                                        trust_remote_code=True),
-    "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
-                              trust_remote_code=True),
-    "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501
-                                                     trust_remote_code=True),
-    "NemotronH_Nano_VL_V2": _HfExamplesInfo("nano_vl_dummy",
-                                          is_available_online=False,
-                                          trust_remote_code=True),
-    "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
-                            max_transformers_version="4.53",
-                            transformers_version_reason="HF model is not compatible",  # noqa: E501
-                            extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
-                                    "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
-    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B",
-                               trust_remote_code=True),
-    "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
-                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
-    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
-                                        trust_remote_code=True,
-                                        max_transformers_version="4.48",
-                                        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
-                                        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}),  # noqa: E501
-    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
-                                        trust_remote_code=True),
-    "Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",  # noqa: E501
-                                                 revision="refs/pr/70"),
-    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
-                                                       tokenizer_mode="mistral"),
-    "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
-                                                      extras={"chat": "Qwen/Qwen-VL-Chat"},  # noqa: E501
-                                                      trust_remote_code=True,
-                                                      hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}),  # noqa: E501
-    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
-    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
-    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
-                                                          max_model_len=4096),
+    "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"),
+    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
+        "ibm-granite/granite-speech-3.3-2b"
+    ),
+    "GLM4VForCausalLM": _HfExamplesInfo(
+        "zai-org/glm-4v-9b",
+        trust_remote_code=True,
+        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
+    ),
+    "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),
+    "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"),
+    "H2OVLChatModel": _HfExamplesInfo(
+        "h2oai/h2ovl-mississippi-800m",
+        trust_remote_code=True,
+        extras={"2b": "h2oai/h2ovl-mississippi-2b"},
+        max_transformers_version="4.48",
+        transformers_version_reason="HF model is not compatible.",
+    ),
+    "HCXVisionForCausalLM": _HfExamplesInfo(
+        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
+        trust_remote_code=True,
+    ),
+    "Idefics3ForConditionalGeneration": _HfExamplesInfo(
+        "HuggingFaceM4/Idefics3-8B-Llama3",
+        extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},
+    ),
+    "InternS1ForConditionalGeneration": _HfExamplesInfo(
+        "internlm/Intern-S1", trust_remote_code=True
+    ),
+    "InternVLChatModel": _HfExamplesInfo(
+        "OpenGVLab/InternVL2-1B",
+        extras={
+            "2B": "OpenGVLab/InternVL2-2B",
+            "3.0": "OpenGVLab/InternVL3-1B",
+            "3.5-qwen3": "OpenGVLab/InternVL3_5-1B",
+            "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B",
+            "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
+        },
+        trust_remote_code=True,
+    ),
+    "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
+    "KeyeForConditionalGeneration": _HfExamplesInfo(
+        "Kwai-Keye/Keye-VL-8B-Preview",
+        trust_remote_code=True,
+    ),
+    "KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo(
+        "Kwai-Keye/Keye-VL-1_5-8B",
+        trust_remote_code=True,
+    ),
+    "KimiVLForConditionalGeneration": _HfExamplesInfo(
+        "moonshotai/Kimi-VL-A3B-Instruct",
+        extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
+        trust_remote_code=True,
+    ),
+    "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
+        "lightonai/LightOnOCR-1B",
+        is_available_online=False,
+    ),
+    "Llama4ForConditionalGeneration": _HfExamplesInfo(
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        max_model_len=10240,
+        extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"},
+    ),
+    "LlavaForConditionalGeneration": _HfExamplesInfo(
+        "llava-hf/llava-1.5-7b-hf",
+        extras={
+            "mistral": "mistral-community/pixtral-12b",
+            "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic",
+        },
+    ),
+    "LlavaNextForConditionalGeneration": _HfExamplesInfo(
+        "llava-hf/llava-v1.6-mistral-7b-hf"
+    ),
+    "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo(
+        "llava-hf/LLaVA-NeXT-Video-7B-hf"
+    ),
+    "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo(
+        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+    ),
+    "MantisForConditionalGeneration": _HfExamplesInfo(
+        "TIGER-Lab/Mantis-8B-siglip-llama3",
+        max_transformers_version="4.48",
+        transformers_version_reason="HF model is not compatible.",
+        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
+    ),
+    "MiDashengLMModel": _HfExamplesInfo(
+        "mispeech/midashenglm-7b", trust_remote_code=True
+    ),
+    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
+    "MiniCPMV": _HfExamplesInfo(
+        "openbmb/MiniCPM-Llama3-V-2_5",
+        extras={
+            "2.6": "openbmb/MiniCPM-V-2_6",
+            "4.0": "openbmb/MiniCPM-V-4",
+            "4.5": "openbmb/MiniCPM-V-4_5",
+        },
+        trust_remote_code=True,
+    ),
+    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
+        "MiniMaxAI/MiniMax-VL-01",
+        trust_remote_code=True,
+    ),
+    "Mistral3ForConditionalGeneration": _HfExamplesInfo(
+        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"},
+    ),
+    "MolmoForCausalLM": _HfExamplesInfo(
+        "allenai/Molmo-7B-D-0924",
+        max_transformers_version="4.48",
+        transformers_version_reason="Incorrectly-detected `tensorflow` import.",
+        extras={"olmo": "allenai/Molmo-7B-O-0924"},
+        trust_remote_code=True,
+    ),
+    "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", trust_remote_code=True),
+    "Llama_Nemotron_Nano_VL": _HfExamplesInfo(
+        "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
+        trust_remote_code=True,
+    ),
+    "NemotronH_Nano_VL_V2": _HfExamplesInfo(
+        "nano_vl_dummy", is_available_online=False, trust_remote_code=True
+    ),
+    "Ovis": _HfExamplesInfo(
+        "AIDC-AI/Ovis2-1B",
+        trust_remote_code=True,
+        max_transformers_version="4.53",
+        transformers_version_reason="HF model is not compatible",
+        extras={
+            "1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
+            "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
+        },
+    ),
+    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
+    "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
+        "PaddlePaddle/PaddleOCR-VL",
+        trust_remote_code=True,
+    ),
+    "PaliGemmaForConditionalGeneration": _HfExamplesInfo(
+        "google/paligemma-3b-mix-224",
+        extras={"v2": "google/paligemma2-3b-ft-docci-448"},
+    ),
+    "Phi3VForCausalLM": _HfExamplesInfo(
+        "microsoft/Phi-3-vision-128k-instruct",
+        trust_remote_code=True,
+        max_transformers_version="4.48",
+        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
+        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
+    ),
+    "Phi4MMForCausalLM": _HfExamplesInfo(
+        "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
+    ),
+    "Phi4MultimodalForCausalLM": _HfExamplesInfo(
+        "microsoft/Phi-4-multimodal-instruct",
+        revision="refs/pr/70",
+    ),
+    "PixtralForConditionalGeneration": _HfExamplesInfo(
+        "mistralai/Pixtral-12B-2409",
+        tokenizer_mode="mistral",
+    ),
+    "QwenVLForConditionalGeneration": _HfExamplesInfo(
+        "Qwen/Qwen-VL",
+        extras={"chat": "Qwen/Qwen-VL-Chat"},
+        trust_remote_code=True,
+        max_transformers_version="4.53.3",
+        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
+        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
+    ),
+    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo(
+        "Qwen/Qwen2-Audio-7B-Instruct"
+    ),
+    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),
+    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo(
+        "Qwen/Qwen2.5-VL-3B-Instruct",
+        max_model_len=4096,
+    ),
     "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
-    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
-    "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct", # noqa: E501
-                                                        max_model_len=4096,
-                                                        min_transformers_version="4.57",
-                                                        is_available_online=False),
-    "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", # noqa: E501
-                                                          max_model_len=4096,
-                                                          min_transformers_version="4.57",
-                                                          is_available_online=False),
-    "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B",
-                                                 trust_remote_code=True),
-    "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B",
-                                           trust_remote_code=True),
-    "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct",  # noqa: E501
-                                                       min_transformers_version="4.56",
-                                                       transformers_version_reason="HF model broken in 4.55"),  # noqa: E501
-    "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3",
-                                                        trust_remote_code=True),
-    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
-                                     trust_remote_code=True),
-    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),  # noqa: E501
-    "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
-                                                        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
+    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),
+    "Qwen3VLForConditionalGeneration": _HfExamplesInfo(
+        "Qwen/Qwen3-VL-4B-Instruct",
+        max_model_len=4096,
+        min_transformers_version="4.57",
+    ),
+    "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo(
+        "Qwen/Qwen3-VL-30B-A3B-Instruct",
+        max_model_len=4096,
+        min_transformers_version="4.57",
+    ),
+    "Qwen3OmniMoeForConditionalGeneration": _HfExamplesInfo(
+        "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+        max_model_len=4096,
+        min_transformers_version="4.57",
+    ),
+    "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", trust_remote_code=True),
+    "SkyworkR1VChatModel": _HfExamplesInfo(
+        "Skywork/Skywork-R1V-38B", trust_remote_code=True
+    ),
+    "SmolVLMForConditionalGeneration": _HfExamplesInfo(
+        "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+    ),
+    "Step3VLForConditionalGeneration": _HfExamplesInfo(
+        "stepfun-ai/step3", trust_remote_code=True
+    ),
+    "UltravoxModel": _HfExamplesInfo(
+        "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+        trust_remote_code=True,
+    ),
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),
+    "Tarsier2ForConditionalGeneration": _HfExamplesInfo(
+        "omni-research/Tarsier2-Recap-7b",
+        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+    ),
     "VoxtralForConditionalGeneration": _HfExamplesInfo(
         "mistralai/Voxtral-Mini-3B-2507",
-        min_transformers_version="4.54",
         # disable this temporarily until we support HF format
         is_available_online=False,
     ),
     # [Encoder-decoder]
-    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
+    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
     # [Cross-encoder]
-    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),   # noqa: E501
+    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
 }
 
 
 _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
-    "MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
-                                   speculative_model="abhigoyal/vllm-medusa-llama-68m-random"),  # noqa: E501
+    "MedusaModel": _HfExamplesInfo(
+        "JackFram/llama-68m", speculative_model="abhigoyal/vllm-medusa-llama-68m-random"
+    ),
     # Temporarily disabled.
     # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
-    # "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
-    #                                                 speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
-    "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random",
-                                        speculative_model="luccafong/deepseek_mtp_draft_random",  # noqa: E501
-                                        trust_remote_code=True),
-    "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random",
-                                        speculative_model="eagle618/eagle-deepseek-v3-random",  # noqa: E501
-                                        trust_remote_code=True),
-    "EagleLlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B-Instruct", # noqa: E501
-                                             trust_remote_code=True,
-                                             speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
-                                             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
-    "Eagle3LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.1-8B-Instruct",  # noqa: E501
-                                            trust_remote_code=True,
-                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
-                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct",
-                                            use_original_num_layers=True,
-                                            max_model_len=10240),
-    "LlamaForCausalLMEagle3": _HfExamplesInfo("Qwen/Qwen3-8B",  # noqa: E501
-                                            trust_remote_code=True,
-                                            speculative_model="AngelSlim/Qwen3-8B_eagle3",   # noqa: E501
-                                            tokenizer="Qwen/Qwen3-8B",
-                                            use_original_num_layers=True),
+    # "MLPSpeculatorPreTrainedModel": _HfExamplesInfo(
+    #     "JackFram/llama-160m",
+    #     speculative_model="ibm-ai-platform/llama-160m-accelerator"
+    # ),
+    "DeepSeekMTPModel": _HfExamplesInfo(
+        "luccafong/deepseek_mtp_main_random",
+        speculative_model="luccafong/deepseek_mtp_draft_random",
+        trust_remote_code=True,
+    ),
+    "EagleDeepSeekMTPModel": _HfExamplesInfo(
+        "eagle618/deepseek-v3-random",
+        speculative_model="eagle618/eagle-deepseek-v3-random",
+        trust_remote_code=True,
+    ),
+    "EagleLlamaForCausalLM": _HfExamplesInfo(
+        "meta-llama/Meta-Llama-3-8B-Instruct",
+        trust_remote_code=True,
+        speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
+        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
+    ),
+    "Eagle3LlamaForCausalLM": _HfExamplesInfo(
+        "meta-llama/Llama-3.1-8B-Instruct",
+        trust_remote_code=True,
+        speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
+        tokenizer="meta-llama/Llama-3.1-8B-Instruct",
+        use_original_num_layers=True,
+        max_model_len=10240,
+    ),
+    "LlamaForCausalLMEagle3": _HfExamplesInfo(
+        "Qwen/Qwen3-8B",
+        trust_remote_code=True,
+        speculative_model="AngelSlim/Qwen3-8B_eagle3",
+        tokenizer="Qwen/Qwen3-8B",
+        use_original_num_layers=True,
+    ),
     "EagleLlama4ForCausalLM": _HfExamplesInfo(
         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
         trust_remote_code=True,
         speculative_model="morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
-        tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct"),  # noqa: E501
-    "EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
-                                            trust_remote_code=True,
-                                            is_available_online=False,
-                                            speculative_model="openbmb/MiniCPM-2B-sft-bf16",
-                                            tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
-    "ErnieMTPModel": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT",
-                                    trust_remote_code=True,
-                                    speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"),
-    "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5",
-                                        speculative_model="zai-org/GLM-4.5",
-                                        min_transformers_version="4.54",
-                                        is_available_online=False),
+        tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    ),
+    "EagleMiniCPMForCausalLM": _HfExamplesInfo(
+        "openbmb/MiniCPM-1B-sft-bf16",
+        trust_remote_code=True,
+        speculative_model="openbmb/MiniCPM-2B-sft-bf16",
+        speculative_method="eagle",
+        tokenizer="openbmb/MiniCPM-2B-sft-bf16",
+    ),
+    "ErnieMTPModel": _HfExamplesInfo(
+        "baidu/ERNIE-4.5-21B-A3B-PT",
+        trust_remote_code=True,
+        speculative_model="baidu/ERNIE-4.5-21B-A3B-PT",
+    ),
+    "Glm4MoeMTPModel": _HfExamplesInfo(
+        "zai-org/GLM-4.5",
+        speculative_model="zai-org/GLM-4.5",
+    ),
     "LongCatFlashMTPModel": _HfExamplesInfo(
         "meituan-longcat/LongCat-Flash-Chat",
         trust_remote_code=True,
-        speculative_model="meituan-longcat/LongCat-Flash-Chat"),
-    "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
-                                    trust_remote_code=True,
-                                    speculative_model="XiaomiMiMo/MiMo-7B-RL"),
-    "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
-                                     min_transformers_version="4.56.3"),
+        speculative_model="meituan-longcat/LongCat-Flash-Chat",
+    ),
+    "MiMoMTPModel": _HfExamplesInfo(
+        "XiaomiMiMo/MiMo-7B-RL",
+        trust_remote_code=True,
+        speculative_model="XiaomiMiMo/MiMo-7B-RL",
+    ),
+    "Eagle3Qwen2_5vlForCausalLM": _HfExamplesInfo(
+        "Qwen/Qwen2.5-VL-7B-Instruct",
+        speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl",
+    ),
+    "Qwen3NextMTP": _HfExamplesInfo(
+        "Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3"
+    ),
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
-    "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"),
-    "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True),  # noqa: E501
-    "TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
+    "TransformersEmbeddingModel": _HfExamplesInfo(
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
+    ),
+    "TransformersForSequenceClassification": _HfExamplesInfo(
+        "papluca/xlm-roberta-base-language-detection",
+        min_transformers_version="5.0.0.dev",
+    ),
+    "TransformersForCausalLM": _HfExamplesInfo(
+        "hmellor/Ilama-3.2-1B", trust_remote_code=True
+    ),
+    "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
+    "TransformersMoEForCausalLM": _HfExamplesInfo(
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
+    ),
+    "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
+    ),
+    "TransformersMoEEmbeddingModel": _HfExamplesInfo(
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
+    ),
+    "TransformersMoEForSequenceClassification": _HfExamplesInfo(
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
+    ),
+    "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
+    "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
+        "google/gemma-3-4b-it"
+    ),
 }
 
 _EXAMPLE_MODELS = {
@@ -682,7 +954,12 @@ class HfExampleModels:
         return self.hf_models.keys()
 
     def get_hf_info(self, model_arch: str) -> _HfExamplesInfo:
-        return self.hf_models[model_arch]
+        try:
+            return self.hf_models[model_arch]
+        except KeyError:
+            raise ValueError(
+                f"No example model defined for {model_arch}; please update this file."
+            ) from None
 
     def find_hf_info(self, model_id: str) -> _HfExamplesInfo:
         for info in self.hf_models.values():
@@ -694,7 +971,9 @@ class HfExampleModels:
             if any(extra == model_id for extra in info.extras.values()):
                 return info
 
-        raise ValueError(f"No example model defined for {model_id}")
+        raise ValueError(
+            f"No example model defined for {model_id}; please update this file."
+        )
 
 
 HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 4b4db5d65d87361e3a9ae2fd2b6784d31fb53a7f..d9b95f9043fc43ea19a69393fdda7ff5d1828106 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -7,14 +7,20 @@ from unittest.mock import patch
 import pytest
 
 from vllm import LLM
-from vllm.utils import GiB_bytes
-from vllm.v1.core.kv_cache_utils import (generate_scheduler_kv_cache_config,
-                                         get_kv_cache_configs)
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.v1.core.kv_cache_utils import (
+    generate_scheduler_kv_cache_config,
+    get_kv_cache_configs,
+)
 from vllm.v1.engine.core import EngineCore as V1EngineCore
 
 from ..utils import create_new_process_for_each_test
-from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
-                       HF_EXAMPLE_MODELS, HfExampleModels)
+from .registry import (
+    _TRANSFORMERS_BACKEND_MODELS,
+    AUTO_EXAMPLE_MODELS,
+    HF_EXAMPLE_MODELS,
+    HfExampleModels,
+)
 from .utils import dummy_hf_overrides
 
 # This minimal list of model architectures is smaller than the total list of
@@ -24,23 +30,32 @@ from .utils import dummy_hf_overrides
 # generation, sequence classification, causal LM, ranking, chat, reward model,
 # multimodal, geospatial, voice, embedding, MTP)
 MINIMAL_MODEL_ARCH_LIST = [
-    "LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
-    "BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
-    "InternVLChatModel", "InternLM2ForRewardModel",
-    "TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
-    "DeepSeekMTPModel", "XLMRobertaModel"
+    "LlavaForConditionalGeneration",
+    "Llama4ForConditionalGeneration",
+    "BertForSequenceClassification",
+    "Gemma3nForCausalLM",
+    "JinaVLForRanking",
+    "InternVLChatModel",
+    "InternLM2ForRewardModel",
+    "TransformersMultiModalForCausalLM",
+    "PrithviGeoSpatialMAE",
+    "UltravoxModel",
+    "DeepSeekMTPModel",
+    "XLMRobertaModel",
 ]
 
 # This list is the complement of the minimal list above. The intention is that
 # this list of models is only tested in a "special case" i.e. most PRs should
 # not test these models
-OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
-                         set(MINIMAL_MODEL_ARCH_LIST))
+OTHER_MODEL_ARCH_LIST = set(HF_EXAMPLE_MODELS.get_supported_archs()) - set(
+    MINIMAL_MODEL_ARCH_LIST
+)
 
 
 @create_new_process_for_each_test()
-def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
-                   EXAMPLE_MODELS: HfExampleModels):
+def can_initialize(
+    model_arch: str, monkeypatch: pytest.MonkeyPatch, EXAMPLE_MODELS: HfExampleModels
+):
     """The reason for using create_new_process_for_each_test is to avoid
     the WARNING:
         "We must use the 'spawn' multiprocessing start method. Overriding
@@ -53,12 +68,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    hf_overrides_fn = partial(dummy_hf_overrides,
-                              model_arch=model_arch,
-                              exist_overrides=model_info.hf_overrides,
-                              use_original_num_layers=getattr(
-                                  model_info, 'use_original_num_layers',
-                                  False))
+    hf_overrides_fn = partial(
+        dummy_hf_overrides,
+        model_arch=model_arch,
+        exist_overrides=model_info.hf_overrides,
+        use_original_num_layers=getattr(model_info, "use_original_num_layers", False),
+    )
 
     # Avoid calling model.forward()
     def _initialize_kv_caches_v1(self, vllm_config):
@@ -68,21 +83,30 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
             kv_cache_specs,
             [10 * GiB_bytes],
         )
-        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
-            kv_cache_configs)
-
+        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
         # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
         return 1, 0, scheduler_kv_cache_config
 
-    with (patch.object(V1EngineCore, "_initialize_kv_caches",
-                       _initialize_kv_caches_v1), monkeypatch.context() as m):
-        if model_info.v0_only:
-            # NOTE(woosuk): skip the test for V0-only models
-            return
+    if model_arch == "MiniMaxVL01ForConditionalGeneration":
+        pytest.skip(
+            "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
+        )
+
+    if model_arch == "DeepseekV32ForCausalLM":
+        from vllm.platforms import current_platform
 
-        if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
+        capability = current_platform.get_device_capability()
+        if capability and capability.major < 9:
             pytest.skip(
-                "Differential Flash Attention backend has been removed.")
+                f"DeepseekV32 requires Hopper (9.0+) or Blackwell (10.0+) "
+                f"for FLASHMLA_SPARSE backend. Current device has compute "
+                f"capability {capability.major}.{capability.minor}"
+            )
+
+    with (
+        patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
+        monkeypatch.context() as m,
+    ):
         if model_arch == "GptOssForCausalLM":
             # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
             # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
@@ -90,53 +114,53 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
             m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
         if model_arch == "WhisperForConditionalGeneration":
             m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
         LLM(
             model_info.default,
             tokenizer=model_info.tokenizer,
             tokenizer_mode=model_info.tokenizer_mode,
             revision=model_info.revision,
             enforce_eager=model_info.enforce_eager,
-            skip_tokenizer_init=model_info.skip_tokenizer_init,
+            skip_tokenizer_init=model_info.require_embed_inputs,
+            enable_prompt_embeds=model_info.require_embed_inputs,
+            enable_mm_embeds=model_info.require_embed_inputs,
             dtype=model_info.dtype,
             speculative_config={
                 "model": model_info.speculative_model,
+                "method": model_info.speculative_method,
                 "num_speculative_tokens": 1,
-            } if model_info.speculative_model else None,
+            }
+            if model_info.speculative_model
+            else None,
             trust_remote_code=model_info.trust_remote_code,
             max_model_len=model_info.max_model_len,
             # these tests seem to produce leftover memory
             gpu_memory_utilization=0.80,
             load_format="dummy",
             model_impl="transformers"
-            if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
+            if model_arch in _TRANSFORMERS_BACKEND_MODELS
+            else "vllm",
             hf_overrides=hf_overrides_fn,
-            max_num_seqs=model_info.max_num_seqs)
+            max_num_seqs=model_info.max_num_seqs,
+        )
 
 
 @pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
-def test_can_initialize_small_subset(model_arch: str,
-                                     monkeypatch: pytest.MonkeyPatch):
+def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     """Test initializing small subset of supported models"""
-    if model_arch == "Lfm2ForCausalLM":
-        pytest.skip("Skipping until test supports V1-only models")
     can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
 
 
 @pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
-def test_can_initialize_large_subset(model_arch: str,
-                                     monkeypatch: pytest.MonkeyPatch):
+def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     """Test initializing large subset of supported models
-    
+
     This test covers the complement of the tests covered in the "small subset"
     test.
     """
-    if model_arch == "Lfm2ForCausalLM":
-        pytest.skip("Skipping until test supports V1-only models")
     can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
 
 
-@pytest.mark.parametrize("model_arch",
-                         AUTO_EXAMPLE_MODELS.get_supported_archs())
-def test_implicit_converted_models(model_arch: str,
-                                   monkeypatch: pytest.MonkeyPatch):
+@pytest.mark.parametrize("model_arch", AUTO_EXAMPLE_MODELS.get_supported_archs())
+def test_implicit_converted_models(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index 4aa7bb729789311453eb7b673ee5d57a8f304d32..15e94eef4aa00c080dfb224e4e51d13e479f9c1a 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -50,9 +50,9 @@ def test_oot_registration_embedding(
     with monkeypatch.context() as m:
         m.setenv("VLLM_PLUGINS", "register_dummy_model")
         prompts = ["Hello, my name is", "The text does not matter"]
-        llm = LLM(model=dummy_gemma2_embedding_path,
-                  load_format="dummy",
-                  max_model_len=2048)
+        llm = LLM(
+            model=dummy_gemma2_embedding_path, load_format="dummy", max_model_len=2048
+        )
         outputs = llm.embed(prompts)
 
         for output in outputs:
@@ -69,27 +69,28 @@ def test_oot_registration_multimodal(
 ):
     with monkeypatch.context() as m:
         m.setenv("VLLM_PLUGINS", "register_dummy_model")
-        prompts = [{
-            "prompt": "What's in the image?<image>",
-            "multi_modal_data": {
-                "image": image
+        prompts = [
+            {
+                "prompt": "What's in the image?<image>",
+                "multi_modal_data": {"image": image},
             },
-        }, {
-            "prompt": "Describe the image<image>",
-            "multi_modal_data": {
-                "image": image
+            {
+                "prompt": "Describe the image<image>",
+                "multi_modal_data": {"image": image},
             },
-        }]
+        ]
 
         sampling_params = SamplingParams(temperature=0)
-        llm = LLM(model=dummy_llava_path,
-                  load_format="dummy",
-                  max_num_seqs=1,
-                  trust_remote_code=True,
-                  gpu_memory_utilization=0.98,
-                  max_model_len=4096,
-                  enforce_eager=True,
-                  limit_mm_per_prompt={"image": 1})
+        llm = LLM(
+            model=dummy_llava_path,
+            load_format="dummy",
+            max_num_seqs=1,
+            trust_remote_code=True,
+            gpu_memory_utilization=0.98,
+            max_model_len=4096,
+            enforce_eager=True,
+            limit_mm_per_prompt={"image": 1},
+        )
 
         first_token = llm.get_tokenizer().decode(0)
         outputs = llm.generate(prompts, sampling_params)
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index ac66d45edbefa29e089583f3a1d030d1e7d11116..9017a0fd91407a3e22a59e3bc620461a39341b4d 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -6,16 +6,22 @@ import warnings
 import pytest
 import torch.cuda
 
-from vllm.model_executor.models import (is_pooling_model,
-                                        is_text_generation_model,
-                                        supports_multimodal)
-from vllm.model_executor.models.adapters import (as_embedding_model,
-                                                 as_reward_model,
-                                                 as_seq_cls_model)
-from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
-                                                 _SPECULATIVE_DECODING_MODELS,
-                                                 _TEXT_GENERATION_MODELS,
-                                                 ModelRegistry)
+from vllm.model_executor.models import (
+    is_pooling_model,
+    is_text_generation_model,
+    supports_multimodal,
+)
+from vllm.model_executor.models.adapters import (
+    as_embedding_model,
+    as_reward_model,
+    as_seq_cls_model,
+)
+from vllm.model_executor.models.registry import (
+    _MULTIMODAL_MODELS,
+    _SPECULATIVE_DECODING_MODELS,
+    _TEXT_GENERATION_MODELS,
+    ModelRegistry,
+)
 from vllm.platforms import current_platform
 
 from ..utils import create_new_process_for_each_test
@@ -34,8 +40,7 @@ def test_registry_imports(model_arch):
     if model_arch in _SPECULATIVE_DECODING_MODELS:
         return  # Ignore these models which do not have a unified format
 
-    if (model_arch in _TEXT_GENERATION_MODELS
-            or model_arch in _MULTIMODAL_MODELS):
+    if model_arch in _TEXT_GENERATION_MODELS or model_arch in _MULTIMODAL_MODELS:
         assert is_text_generation_model(model_cls)
 
     # All vLLM models should be convertible to a pooling model
@@ -48,13 +53,16 @@ def test_registry_imports(model_arch):
 
 
 @create_new_process_for_each_test()
-@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
-    ("LlamaForCausalLM", False, False, False),
-    ("LlavaForConditionalGeneration", True, True, False),
-    ("BertForSequenceClassification", False, False, True),
-    ("RobertaForSequenceClassification", False, False, True),
-    ("XLMRobertaForSequenceClassification", False, False, True),
-])
+@pytest.mark.parametrize(
+    "model_arch,is_mm,init_cuda,is_ce",
+    [
+        ("LlamaForCausalLM", False, False, False),
+        ("LlavaForConditionalGeneration", True, True, False),
+        ("BertForSequenceClassification", False, False, True),
+        ("RobertaForSequenceClassification", False, False, True),
+        ("XLMRobertaForSequenceClassification", False, False, True),
+    ],
+)
 def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
     model_info = ModelRegistry._try_inspect_model_cls(model_arch)
     assert model_info is not None
@@ -70,7 +78,8 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
             warnings.warn(
                 "This model no longer initializes CUDA on import. "
                 "Please test using a different one.",
-                stacklevel=2)
+                stacklevel=2,
+            )
 
 
 @create_new_process_for_each_test()
@@ -82,7 +91,8 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
         # ("MLPSpeculatorPreTrainedModel", False, False),
         ("DeepseekV2ForCausalLM", True, False),
         ("Qwen2VLForConditionalGeneration", True, True),
-    ])
+    ],
+)
 def test_registry_is_pp(model_arch, is_pp, init_cuda):
     model_info = ModelRegistry._try_inspect_model_cls(model_arch)
     assert model_info is not None
@@ -97,16 +107,16 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
             warnings.warn(
                 "This model no longer initializes CUDA on import. "
                 "Please test using a different one.",
-                stacklevel=2)
+                stacklevel=2,
+            )
 
 
 def test_hf_registry_coverage():
-    untested_archs = set(ModelRegistry.get_supported_archs() -
-                         HF_EXAMPLE_MODELS.get_supported_archs())
-
-    # Llama4ForCausalLM does not have a standalone model
-    untested_archs.discard("Llama4ForCausalLM")
+    untested_archs = (
+        ModelRegistry.get_supported_archs() - HF_EXAMPLE_MODELS.get_supported_archs()
+    )
 
     assert not untested_archs, (
         "Please add the following architectures to "
-        f"`tests/models/registry.py`: {untested_archs}")
+        f"`tests/models/registry.py`: {untested_archs}"
+    )
diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py
index 842e37ea26f67dfd1bdaf7030d95bc8540f51dad..15764145bc1a25db2778c678f99617e143fe1f3a 100644
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@@ -11,32 +11,34 @@ from tests.conftest import VllmRunner
     "model",
     [
         "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
-        "mgazz/Prithvi_v2_eo_300_tl_unet_agb"
+        "mgazz/Prithvi_v2_eo_300_tl_unet_agb",
     ],
 )
 def test_inference(
     vllm_runner: type[VllmRunner],
     model: str,
 ) -> None:
-
     pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
     location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
-    prompt = dict(prompt_token_ids=[1],
-                  multi_modal_data=dict(pixel_values=pixel_values,
-                                        location_coords=location_coords))
+    prompt = dict(
+        prompt_token_ids=[1],
+        multi_modal_data=dict(
+            pixel_values=pixel_values, location_coords=location_coords
+        ),
+    )
     with vllm_runner(
-            model,
-            runner="pooling",
-            dtype="half",
-            enforce_eager=True,
-            skip_tokenizer_init=True,
-            # Limit the maximum number of sequences to avoid the
-            # test going OOM during the warmup run
-            max_num_seqs=32,
-            default_torch_num_threads=1,
+        model,
+        runner="pooling",
+        dtype="half",
+        enforce_eager=True,
+        skip_tokenizer_init=True,
+        enable_mm_embeds=True,
+        # Limit the maximum number of sequences to avoid the
+        # test going OOM during the warmup run
+        max_num_seqs=32,
+        default_torch_num_threads=1,
     ) as vllm_model:
-
         vllm_output = vllm_model.llm.encode(prompt)
         assert torch.equal(
-            torch.isnan(vllm_output[0].outputs.data).any(),
-            torch.tensor(False))
+            torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
+        )
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 1817d4aeee9f95b566e360ee5ef77c47fdbfe2b2..ae5befd2c00b73f04b869c32e659b7f87db85230 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test the functionality of the Transformers backend."""
-from typing import Any, Optional, Union
+"""Test the functionality of the Transformers modeling backend."""
+
+from typing import Any
 
 import pytest
 
@@ -9,16 +10,23 @@ from vllm.platforms import current_platform
 
 from ..conftest import HfRunner, VllmRunner
 from ..utils import multi_gpu_test, prep_prompts
+from .registry import HF_EXAMPLE_MODELS
 from .utils import check_embeddings_close, check_logprobs_close
 
 
+def get_model(arch: str) -> str:
+    model_info = HF_EXAMPLE_MODELS.get_hf_info(arch)
+    model_info.check_transformers_version(on_fail="skip")
+    return model_info.default
+
+
 def check_implementation(
-    runner_ref: type[Union[HfRunner, VllmRunner]],
+    runner_ref: type[HfRunner | VllmRunner],
     runner_test: type[VllmRunner],
     example_prompts: list[str],
     model: str,
-    kwargs_ref: Optional[dict[str, Any]] = None,
-    kwargs_test: Optional[dict[str, Any]] = None,
+    kwargs_ref: dict[str, Any] | None = None,
+    kwargs_test: dict[str, Any] | None = None,
     **kwargs,
 ):
     if kwargs_ref is None:
@@ -53,13 +61,16 @@ def check_implementation(
 
 @pytest.mark.skipif(
     current_platform.is_rocm(),
-    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
+    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.",
+)
 @pytest.mark.parametrize(
     "model,model_impl",
     [
         ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
         ("hmellor/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
-    ])  # trust_remote_code=True by default
+        ("allenai/OLMoE-1B-7B-0924", "transformers"),  # MoE
+    ],
+)  # trust_remote_code=True by default
 def test_models(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
@@ -67,23 +78,34 @@ def test_models(
     model: str,
     model_impl: str,
 ) -> None:
-    check_implementation(hf_runner,
-                         vllm_runner,
-                         example_prompts,
-                         model,
-                         model_impl=model_impl)
+    import transformers
+    from packaging.version import Version
+
+    installed = Version(transformers.__version__)
+    required = Version("5.0.0.dev")
+    if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
+        pytest.skip(
+            "MoE models with the Transformers modeling backend require "
+            f"transformers>={required}, but got {installed}"
+        )
+
+    check_implementation(
+        hf_runner, vllm_runner, example_prompts, model, model_impl=model_impl
+    )
 
 
 def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
     prompts, _, _ = prep_prompts(4, (800, 801))
     kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
     kwargs_test = {"model_impl": "transformers", **kwargs_ref}
-    check_implementation(vllm_runner,
-                         vllm_runner,
-                         prompts,
-                         model="hmellor/tiny-random-Gemma2ForCausalLM",
-                         kwargs_ref=kwargs_ref,
-                         kwargs_test=kwargs_test)
+    check_implementation(
+        vllm_runner,
+        vllm_runner,
+        prompts,
+        model="hmellor/tiny-random-Gemma2ForCausalLM",
+        kwargs_ref=kwargs_ref,
+        kwargs_test=kwargs_test,
+    )
 
 
 @multi_gpu_test(num_gpus=2)
@@ -93,24 +115,28 @@ def test_distributed(
     example_prompts,
 ):
     kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
-    check_implementation(hf_runner,
-                         vllm_runner,
-                         example_prompts,
-                         "meta-llama/Llama-3.2-1B-Instruct",
-                         kwargs_test=kwargs)
+    check_implementation(
+        hf_runner,
+        vllm_runner,
+        example_prompts,
+        "meta-llama/Llama-3.2-1B-Instruct",
+        kwargs_test=kwargs,
+    )
 
 
-@pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization is currently not supported in rocm.")
-@pytest.mark.parametrize("model, quantization_kwargs", [
-    (
-        "meta-llama/Llama-3.2-1B-Instruct",
-        {
-            "quantization": "bitsandbytes",
-        },
-    ),
-])
+@pytest.mark.parametrize(
+    "model, quantization_kwargs",
+    [
+        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
+        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
+        (
+            "meta-llama/Llama-3.2-1B-Instruct",
+            {
+                "quantization": "bitsandbytes",
+            },
+        ),
+    ],
+)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_quantization(
@@ -121,22 +147,34 @@ def test_quantization(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
+    if (
+        current_platform.is_rocm()
+        and quantization_kwargs.get("quantization", "") == "bitsandbytes"
+    ):
+        pytest.skip("bitsandbytes quantization is currently not supported in rocm.")
+
     with vllm_runner(
-            model, model_impl="auto", enforce_eager=True,
-            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        model,
+        model_impl="auto",
+        enforce_eager=True,
+        **quantization_kwargs,  # type: ignore[arg-type]
+    ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
+        )
 
     with vllm_runner(
-            model,
-            model_impl="transformers",
-            enforce_eager=True,
-            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        model,
+        model_impl="transformers",
+        enforce_eager=True,
+        **quantization_kwargs,  # type: ignore[arg-type]
+    ) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         assert model_config.using_transformers_backend()
 
         transformers_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=transformers_outputs,
@@ -152,85 +190,62 @@ def test_quantization(
         # Layers live in `layers`
         "Qwen/Qwen3-Embedding-0.6B",
         # Layers live in `model.layers`
-        "meta-llama/Llama-3.2-1B-Instruct"
+        "meta-llama/Llama-3.2-1B-Instruct",
     ],
 )
 def test_embed_loading(vllm_runner, model):
-    with vllm_runner(model,
-                     max_model_len=1024,
-                     enforce_eager=True,
-                     runner="pooling",
-                     model_impl="transformers") as model_test:
+    with vllm_runner(
+        model,
+        max_model_len=1024,
+        enforce_eager=True,
+        runner="pooling",
+        model_impl="transformers",
+    ) as model_test:
         model_config = model_test.llm.llm_engine.model_config
         assert model_config.using_transformers_backend()
 
 
 @pytest.mark.parametrize(
-    "model",
-    [
-        # Encoder model
-        "BAAI/bge-base-en-v1.5",
-    ])
-def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
-    import transformers
-    from packaging.version import Version
-    installed = Version(transformers.__version__)
-    required = Version("4.57.0.dev0")
-    if installed < required:
-        pytest.skip("Encoder models with the Transformers backend require "
-                    f"transformers>={required}, but got {installed}")
-
-    with vllm_runner(model, max_model_len=512,
-                     model_impl="transformers") as vllm_model:
+    "arch", ["TransformersEmbeddingModel", "TransformersForSequenceClassification"]
+)
+def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
+    model = get_model(arch)
+
+    vllm_kwargs = dict(max_model_len=None, model_impl="transformers")
+
+    hf_kwargs = dict()
+    if arch == "TransformersEmbeddingModel":
+        hf_kwargs["is_sentence_transformer"] = True
+    elif arch == "TransformersForSequenceClassification":
+        from transformers import AutoModelForSequenceClassification
+
+        hf_kwargs["auto_cls"] = AutoModelForSequenceClassification
+
+    # The example_prompts has ending "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers will strip the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids different between hf_model and vllm_model.
+    # So we need to strip the input texts to avoid test failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+
+    with (
+        vllm_runner(model, **vllm_kwargs) as vllm_model,
+        hf_runner(model, **hf_kwargs) as hf_model,
+    ):
         model_config = vllm_model.llm.llm_engine.model_config
         assert model_config.using_transformers_backend()
 
-        vllm_outputs = vllm_model.embed(example_prompts)
-
-    with hf_runner(model, is_sentence_transformer=True) as hf_model:
-        hf_outputs = hf_model.encode(example_prompts)
+        if arch == "TransformersEmbeddingModel":
+            vllm_outputs = vllm_model.embed(example_prompts)
+            hf_outputs = hf_model.encode(example_prompts)
+        elif arch == "TransformersForSequenceClassification":
+            vllm_outputs = vllm_model.classify(example_prompts)
+            hf_outputs = hf_model.classify(example_prompts)
 
     check_embeddings_close(
         embeddings_0_lst=hf_outputs,
         embeddings_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
-        tol=1e-2,
     )
-
-
-@pytest.mark.parametrize(
-    "model",
-    ["jason9693/Qwen2.5-1.5B-apeach"],
-)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_classify(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-) -> None:
-    import torch
-    from transformers import AutoModelForSequenceClassification
-
-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     model_impl="transformers") as vllm_model:
-        model_config = vllm_model.llm.llm_engine.model_config
-        assert model_config.using_transformers_backend()
-
-        vllm_outputs = vllm_model.classify(example_prompts)
-
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForSequenceClassification) as hf_model:
-        hf_outputs = hf_model.classify(example_prompts)
-
-    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output)
-        vllm_output = torch.tensor(vllm_output)
-
-        assert torch.allclose(hf_output, vllm_output,
-                              1e-3 if dtype == "float" else 1e-2)
diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py
index b52327a1844f6e95c4beb8dc80b2c6f786db4787..7cc4ee3c1856f68ffd9c07f82e979e333b5d0a55 100644
--- a/tests/models/test_utils.py
+++ b/tests/models/test_utils.py
@@ -1,13 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
 import torch
 
 from vllm.model_executor.models.utils import AutoWeightsLoader
 
+pytestmark = pytest.mark.cpu_test
 
-class ModuleWithBatchNorm(torch.nn.Module):
 
+class ModuleWithBatchNorm(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.bn = torch.nn.BatchNorm1d(2)
@@ -17,7 +19,6 @@ class ModuleWithBatchNorm(torch.nn.Module):
 
 
 class ModuleWithNestedBatchNorm(torch.nn.Module):
-
     def __init__(self):
         super().__init__()
         self.nested_mod = ModuleWithBatchNorm()
@@ -64,9 +65,11 @@ def test_module_with_child_containing_batchnorm_can_autoload():
     new_mod = ModuleWithNestedBatchNorm()
 
     assert not torch.all(
-        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
+    )
     assert not torch.all(
-        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
+    )
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
 
     loader = AutoWeightsLoader(new_mod)
@@ -74,9 +77,9 @@ def test_module_with_child_containing_batchnorm_can_autoload():
 
     # Ensure the stats are updated
     assert torch.all(
-        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
-    assert torch.all(
-        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
+    )
+    assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
 
 
@@ -98,9 +101,11 @@ def test_module_skip_prefix():
     new_mod = ModuleWithNestedBatchNorm()
 
     assert not torch.all(
-        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
+    )
     assert not torch.all(
-        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
+    )
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
 
     loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."])
@@ -108,9 +113,9 @@ def test_module_skip_prefix():
 
     # Ensure the stats are updated
     assert torch.all(
-        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
-    assert torch.all(
-        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
+    )
+    assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
 
 
@@ -134,9 +139,11 @@ def test_module_skip_substr():
     new_mod = ModuleWithNestedBatchNorm()
 
     assert not torch.all(
-        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
+    )
     assert not torch.all(
-        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
+    )
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
 
     loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
@@ -144,7 +151,7 @@ def test_module_skip_substr():
 
     # Ensure the stats are updated
     assert torch.all(
-        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
-    assert torch.all(
-        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
+    )
+    assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index 8744bcbd3a2a63f8d45841b9097dcf3e5217abab..82ba958a58c41dc50c6705fcaa03d6725133cec3 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -8,18 +8,25 @@ import torch.multiprocessing as mp
 
 from tests.utils import multi_gpu_test
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.models.vision import (
-    get_load_balance_assignment, resolve_visual_encoder_outputs,
-    run_dp_sharded_mrope_vision_model, run_dp_sharded_vision_model)
+    get_load_balance_assignment,
+    resolve_visual_encoder_outputs,
+    run_dp_sharded_mrope_vision_model,
+    run_dp_sharded_vision_model,
+)
 from vllm.platforms import current_platform
-from vllm.utils import get_open_port, update_environment_variables
+from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
+
+pytestmark = pytest.mark.cpu_test
 
 
 @pytest.mark.parametrize(
-    ("feature_sample_layers", "num_layers_loaded", "max_possible_layers",
-     "expected_features"),
+    ("select_layers", "num_layers_loaded", "max_possible_layers", "expected_features"),
     [
         # All layers loaded
         ([1, 10], 10, 10, [1, 10]),
@@ -27,21 +34,21 @@ from vllm.utils import get_open_port, update_environment_variables
         # Some layers not loaded
         ([1, 10], 10, 20, [1, 10]),
         ([-20, -11], 10, 20, [1, 10]),
-    ])
-def test_resolve_visual_encoder_outputs(feature_sample_layers,
-                                        num_layers_loaded, max_possible_layers,
-                                        expected_features):
+    ],
+)
+def test_resolve_visual_encoder_outputs(
+    select_layers, num_layers_loaded, max_possible_layers, expected_features
+):
     """
     Test that offsets are correctly handled for vision feature layers.
     """
-    encoder_outputs = [
-        torch.tensor([idx]) for idx in range(num_layers_loaded + 1)
-    ]
+    encoder_outputs = [torch.tensor([idx]) for idx in range(num_layers_loaded + 1)]
     output_tensor = resolve_visual_encoder_outputs(
         encoder_outputs=encoder_outputs,
-        feature_sample_layers=feature_sample_layers,
         post_layer_norm=None,
-        max_possible_layers=max_possible_layers)
+        select_layers=select_layers,
+        max_possible_layers=max_possible_layers,
+    )
     assert torch.equal(torch.tensor(expected_features), output_tensor)
 
 
@@ -82,10 +89,11 @@ def test_run_dp_sharded_vision_model(batch_size: int):
     )
 
 
-def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
-                                          batch_size: int, master_port: int):
+def run_dp_sharded_vision_model_vs_direct(
+    local_rank: int, world_size: int, batch_size: int, master_port: int
+):
     """
-    Test that run_dp_sharded_vision_model produces the same results as 
+    Test that run_dp_sharded_vision_model produces the same results as
     calling the model directly.
     """
 
@@ -96,13 +104,15 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
     current_platform.set_device(device)
     torch.set_default_device(device)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': str(master_port),
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": str(master_port),
+        }
+    )
 
     # initialize distributed
     init_distributed_environment()
@@ -138,28 +148,45 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
     [
         # Empty input
         ([], 2, [], [0, 0], [0, 0], "empty input"),
-
         # Fewer samples than GPUs
-        ([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0
-                                               ], "fewer samples than GPUs"),
-
+        (
+            [100, 200],
+            4,
+            [1, 0],
+            [1, 1, 0, 0],
+            [200, 100, 0, 0],
+            "fewer samples than GPUs",
+        ),
         # Single GPU
         ([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"),
-
         # Balanced assignment
-        ([100, 100, 100, 100
-          ], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"),
-
+        (
+            [100, 100, 100, 100],
+            2,
+            [0, 2, 1, 3],
+            [2, 2],
+            [200, 200],
+            "balanced assignment",
+        ),
         # Unbalanced sizes - this one is trickier since the algorithm is greedy
-        ([1000, 100, 200, 50], 2, [0, 2, 1, 3
-                                   ], [1, 3], [1000, 350], "unbalanced sizes"),
+        (
+            [1000, 100, 200, 50],
+            2,
+            [0, 2, 1, 3],
+            [1, 3],
+            [1000, 350],
+            "unbalanced sizes",
+        ),
     ],
 )
-def test_get_load_balance_assignment_cases(sizes, num_gpus,
-                                           expected_shuffle_indices,
-                                           expected_gpu_sample_counts,
-                                           expected_grouped_sizes_per_gpu,
-                                           test_description):
+def test_get_load_balance_assignment_cases(
+    sizes,
+    num_gpus,
+    expected_shuffle_indices,
+    expected_gpu_sample_counts,
+    expected_grouped_sizes_per_gpu,
+    test_description,
+):
     """Test get_load_balance_assignment with various input cases."""
     result = get_load_balance_assignment(sizes, num_gpus=num_gpus)
     (shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result
@@ -185,8 +212,7 @@ class SimpleMRopeVisionModel(torch.nn.Module):
         self.out_hidden_size = out_hidden_size
         self.linear = torch.nn.Linear(768, out_hidden_size)
 
-    def forward(self, pixel_values: torch.Tensor,
-                grid_thw_list: list[list[int]]):
+    def forward(self, pixel_values: torch.Tensor, grid_thw_list: list[list[int]]):
         """Simple forward pass that simulates spatial merging."""
         # Apply linear transformation
         embeddings = self.linear(pixel_values)
@@ -209,8 +235,9 @@ class SimpleMRopeVisionModel(torch.nn.Module):
             merged_patches = num_patches // merge_factor
             if merged_patches > 0:
                 # Reshape and average to simulate merging
-                reshaped = image_patches[:merged_patches * merge_factor].view(
-                    merged_patches, merge_factor, -1)
+                reshaped = image_patches[: merged_patches * merge_factor].view(
+                    merged_patches, merge_factor, -1
+                )
                 merged = reshaped.mean(dim=1)
                 merged_embeddings.append(merged)
 
@@ -219,9 +246,11 @@ class SimpleMRopeVisionModel(torch.nn.Module):
         if merged_embeddings:
             return torch.cat(merged_embeddings, dim=0)
         else:
-            return torch.empty((0, self.out_hidden_size),
-                               device=pixel_values.device,
-                               dtype=pixel_values.dtype)
+            return torch.empty(
+                (0, self.out_hidden_size),
+                device=pixel_values.device,
+                dtype=pixel_values.dtype,
+            )
 
 
 @multi_gpu_test(num_gpus=2)
@@ -247,12 +276,11 @@ def test_run_dp_sharded_mrope_vision_model(batch_size: int):
     )
 
 
-def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
-                                                world_size: int,
-                                                batch_size: int,
-                                                master_port: int):
+def run_dp_sharded_mrope_vision_model_vs_direct(
+    local_rank: int, world_size: int, batch_size: int, master_port: int
+):
     """
-    Test that run_dp_sharded_mrope_vision_model produces the same results as 
+    Test that run_dp_sharded_mrope_vision_model produces the same results as
     calling the model directly.
     """
     # Set random seed for reproducibility
@@ -261,13 +289,15 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
     current_platform.set_device(device)
     torch.set_default_device(device)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': str(master_port),
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": str(master_port),
+        }
+    )
 
     # initialize distributed
     init_distributed_environment()
@@ -300,10 +330,9 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
 
     # Run the model through the sharded function
     with torch.inference_mode():
-        sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
-                                                           pixel_values,
-                                                           grid_thw_list,
-                                                           rope_type="rope_3d")
+        sharded_output = run_dp_sharded_mrope_vision_model(
+            vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
+        )
         sharded_output = torch.cat(sharded_output, dim=0)
 
     # Check that the world size is set up correctly
@@ -314,10 +343,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
         # Check that the outputs have the same shape
         assert direct_output.shape == sharded_output.shape
         # Check that the outputs are close (they should be identical)
-        assert torch.allclose(direct_output,
-                              sharded_output,
-                              rtol=1e-5,
-                              atol=1e-5)
+        assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5)
 
 
 @multi_gpu_test(num_gpus=2)
@@ -331,20 +357,23 @@ def test_run_dp_sharded_mrope_vision_model_empty_input():
 
 
 def run_dp_sharded_mrope_vision_model_empty_input_worker(
-        local_rank: int, world_size: int, master_port: int):
+    local_rank: int, world_size: int, master_port: int
+):
     """Test run_dp_sharded_mrope_vision_model with empty input."""
     # Set up distributed environment
     device = f"{current_platform.device_name}:{local_rank}"
     current_platform.set_device(device)
     torch.set_default_device(device)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': str(master_port),
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": str(master_port),
+        }
+    )
 
     init_distributed_environment()
     initialize_model_parallel(tensor_model_parallel_size=world_size)
@@ -357,10 +386,9 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
 
     # Should handle empty input gracefully
     with torch.inference_mode():
-        output = run_dp_sharded_mrope_vision_model(vision_model,
-                                                   pixel_values,
-                                                   grid_thw_list,
-                                                   rope_type="rope_3d")
+        output = run_dp_sharded_mrope_vision_model(
+            vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
+        )
 
     assert len(output) == 0
 
@@ -376,7 +404,8 @@ def test_run_dp_sharded_mrope_vision_model_uneven_load():
 
 
 def run_dp_sharded_mrope_vision_model_uneven_load_worker(
-        local_rank: int, world_size: int, master_port: int):
+    local_rank: int, world_size: int, master_port: int
+):
     """Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
     # Set up distributed environment
     current_platform.seed_everything(123)
@@ -384,13 +413,15 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
     current_platform.set_device(device)
     torch.set_default_device(device)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': str(master_port),
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": str(master_port),
+        }
+    )
 
     init_distributed_environment()
     initialize_model_parallel(tensor_model_parallel_size=world_size)
@@ -398,7 +429,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
     # Create images with very different sizes
     grid_thw_list = [
         [1, 2, 2],  # Small: 4 patches
-        [1, 8, 8],  # Large: 64 patches  
+        [1, 8, 8],  # Large: 64 patches
         [1, 3, 3],  # Medium: 9 patches
     ]
 
@@ -413,15 +444,15 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
 
     # Should handle uneven distribution without errors
     with torch.inference_mode():
-        output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
-                                                         pixel_values,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
+        output_tuple = run_dp_sharded_mrope_vision_model(
+            vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
+        )
 
     # Verify output shape is reasonable
     merge_factor = vision_model.spatial_merge_size**2
     expected_output_patches = list(
-        math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list)
+        math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list
+    )
 
     for i, output in enumerate(output_tuple):
         assert output.shape[0] == expected_output_patches[i]
@@ -442,8 +473,9 @@ def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int):
         pixel_values_list.append(image_pixels)
 
     pixel_values = torch.cat(pixel_values_list, dim=0)
-    vision_model = SimpleMRopeVisionModel(
-        spatial_merge_size=spatial_merge_size).to(device)
+    vision_model = SimpleMRopeVisionModel(spatial_merge_size=spatial_merge_size).to(
+        device
+    )
 
     with torch.inference_mode():
         output = vision_model(pixel_values, grid_thw_list)
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 7e731cffc04718af4ab0e53de72da3215aa6c463..9843887a13204bfdb0f0170e8bd10020f8b6dd50 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -4,17 +4,18 @@
 import warnings
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.config import ModelConfig, ModelDType, RunnerOption
+from vllm.config.model import ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 
+from .. import ci_envs
 from .registry import HF_EXAMPLE_MODELS
 
 TokensText = tuple[list[int], str]
@@ -33,16 +34,18 @@ def check_outputs_equal(
     """
     assert len(outputs_0_lst) == len(outputs_1_lst)
 
-    for prompt_idx, (outputs_0,
-                     outputs_1) in enumerate(zip(outputs_0_lst,
-                                                 outputs_1_lst)):
+    for prompt_idx, (outputs_0, outputs_1) in enumerate(
+        zip(outputs_0_lst, outputs_1_lst)
+    ):
         output_ids_0, output_str_0 = outputs_0
         output_ids_1, output_str_1 = outputs_1
 
         # The text and token outputs should exactly match
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}")
+        fail_msg = (
+            f"Test{prompt_idx}:"
+            f"\n{name_0}:\t{output_str_0!r}"
+            f"\n{name_1}:\t{output_str_1!r}"
+        )
 
         assert output_str_0 == output_str_1, fail_msg
         assert output_ids_0 == output_ids_1, fail_msg
@@ -54,9 +57,9 @@ def check_outputs_equal(
 # * List of top sample logprobs for each sampled token
 #
 # Assumes prompt logprobs were not requested.
-TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
-                                                                    float]],
-                                                          SampleLogprobs]]]
+TokensTextLogprobs = tuple[
+    list[int], str, list[dict[int, float]] | SampleLogprobs | None
+]
 
 # Allow for tokens to be represented as str's rather than IDs;
 # tuple of
@@ -65,9 +68,9 @@ TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
 # * Optional list of top sample logprobs for each sampled token
 #
 # Assumes prompt logprobs were not requested.
-TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
-                                                        list[dict[str,
-                                                                  Logprob]]]]]
+TextTextLogprobs = tuple[
+    list[str], str, list[dict[str, float]] | list[dict[str, Logprob]] | None
+]
 
 # Representation of generated sequence as a tuple of
 # * Token ID list
@@ -77,18 +80,21 @@ TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
 #
 # Allows prompt logprobs to be requested.
 TokensTextLogprobsPromptLogprobs = tuple[
-    list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]],
-    Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]]
+    list[int],
+    str,
+    list[dict[int, float]] | SampleLogprobs | None,
+    list[dict[int, float] | None] | PromptLogprobs | None,
+]
 
 
 def check_logprobs_close(
     *,
-    outputs_0_lst: Sequence[Union[TokensTextLogprobs,
-                                  TokensTextLogprobsPromptLogprobs,
-                                  TextTextLogprobs]],
-    outputs_1_lst: Sequence[Union[TokensTextLogprobs,
-                                  TokensTextLogprobsPromptLogprobs,
-                                  TextTextLogprobs]],
+    outputs_0_lst: Sequence[
+        TokensTextLogprobs | TokensTextLogprobsPromptLogprobs | TextTextLogprobs
+    ],
+    outputs_1_lst: Sequence[
+        TokensTextLogprobs | TokensTextLogprobsPromptLogprobs | TextTextLogprobs
+    ],
     name_0: str,
     name_1: str,
     num_outputs_0_skip_tokens: int = 0,
@@ -128,9 +134,9 @@ def check_logprobs_close(
     assert len(outputs_0_lst) == len(outputs_1_lst)
 
     # Loop through responses to each prompt.
-    for prompt_idx, (outputs_0,
-                     outputs_1) in enumerate(zip(outputs_0_lst,
-                                                 outputs_1_lst)):
+    for prompt_idx, (outputs_0, outputs_1) in enumerate(
+        zip(outputs_0_lst, outputs_1_lst)
+    ):
         assert len(outputs_0) == len(outputs_1)
         if len(outputs_0) == 3:
             assert len(outputs_1) == 3
@@ -155,17 +161,18 @@ def check_logprobs_close(
             ) = outputs_1
 
             # Test prompt logprobs closeness
-            if (prompt_logprobs_0 is not None
-                    and prompt_logprobs_1 is not None):
-                # Both sequences' prompt logprobs lists are not `None``
+            if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
+                # Both sequences' prompt logprobs lists are not `None`
                 # (although individual list elements may be `None`);
                 # for each token's logprobs:
                 for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
-                        zip(prompt_logprobs_0, prompt_logprobs_1)):
+                    zip(prompt_logprobs_0, prompt_logprobs_1)
+                ):
                     fail_msg = (
                         f"Prompt logprobs test:"
                         f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}"
-                        f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}")
+                        f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}"
+                    )
 
                     if logprobs_elem_0 is None:
                         # If the seq 0 token's logprobs are `None`,
@@ -176,20 +183,24 @@ def check_logprobs_close(
                         # the seq 1 token's logprobs must not be `None`
                         assert logprobs_elem_1 is not None, fail_msg
                         # Logprobs check: top-k token choices must be the same
-                        assert (set(logprobs_elem_0.keys()) == set(
-                            logprobs_elem_1.keys())), fail_msg
+                        assert set(logprobs_elem_0.keys()) == set(
+                            logprobs_elem_1.keys()
+                        ), fail_msg
             else:
                 # Both sequence logprobs lists must be `None`
-                fail_msg = (f"Prompt logprobs test:"
-                            f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
-                            f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}")
+                fail_msg = (
+                    f"Prompt logprobs test:"
+                    f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
+                    f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}"
+                )
 
-                assert (prompt_logprobs_0 is None
-                        and prompt_logprobs_1 is None), fail_msg
+                assert prompt_logprobs_0 is None and prompt_logprobs_1 is None, fail_msg
         else:
-            raise ValueError(f"Outputs tuple must have 3 or 4 elements but "
-                             f"{len(outputs_0)} elements were provided: "
-                             f"{outputs_0}")
+            raise ValueError(
+                f"Outputs tuple must have 3 or 4 elements but "
+                f"{len(outputs_0)} elements were provided: "
+                f"{outputs_0}"
+            )
 
         if logprobs_0 is None:
             logprobs_0 = [None] * len(output_ids_0)
@@ -206,9 +217,9 @@ def check_logprobs_close(
         logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]
 
         # Loop through generated tokens.
-        for idx, (output_id_0,
-                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
-
+        for idx, (output_id_0, output_id_1) in enumerate(
+            zip(output_ids_0, output_ids_1)
+        ):
             is_tok_mismatch = output_id_0 != output_id_1
 
             # If generated tokens don't match
@@ -223,7 +234,8 @@ def check_logprobs_close(
                     f"Test{prompt_idx}:"
                     f"\nMatched tokens:\t{output_ids_0[:idx]}"
                     f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
-                    f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
+                    f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}"
+                )
 
                 assert logprobs_elem_0 is not None, fail_msg
                 assert logprobs_elem_1 is not None, fail_msg
@@ -244,9 +256,11 @@ def check_logprobs_close(
             if output_str_0 != output_str_1 and warn_on_mismatch:
                 # The token outputs exactly match,
                 # so the text outputs should exactly match as well
-                fail_msg = (f"Test{prompt_idx}:"
-                            f"\n{name_0}:\t{output_str_0!r}"
-                            f"\n{name_1}:\t{output_str_1!r}")
+                fail_msg = (
+                    f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}"
+                )
 
                 with warnings.catch_warnings():
                     # This ensures that repeated warnings are shown
@@ -260,9 +274,9 @@ def build_model_context(
     model_id: str,
     runner: RunnerOption = "auto",
     dtype: ModelDType = "auto",
-    model_config_kwargs: Optional[dict[str, Any]] = None,
-    mm_processor_kwargs: Optional[dict[str, Any]] = None,
-    limit_mm_per_prompt: Optional[dict[str, int]] = None,
+    model_config_kwargs: dict[str, Any] | None = None,
+    mm_processor_kwargs: dict[str, Any] | None = None,
+    limit_mm_per_prompt: dict[str, int] | None = None,
     mm_processor_cache_gb: int = 0,
 ):
     """Creates an InputProcessingContext for a given model.
@@ -295,7 +309,9 @@ def build_model_context(
         limit_mm_per_prompt=limit_mm_per_prompt,
         mm_processor_cache_gb=mm_processor_cache_gb,
         hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
         enforce_eager=model_info.enforce_eager,
         **model_config_kwargs,
     )
@@ -317,18 +333,22 @@ def check_embeddings_close(
     assert len(embeddings_0_lst) == len(embeddings_1_lst)
 
     for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
-            zip(embeddings_0_lst, embeddings_1_lst)):
+        zip(embeddings_0_lst, embeddings_1_lst)
+    ):
         assert len(embeddings_0) == len(embeddings_1), (
-            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
+            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}"
+        )
 
-        sim = F.cosine_similarity(torch.tensor(embeddings_0),
-                                  torch.tensor(embeddings_1),
-                                  dim=0)
+        sim = F.cosine_similarity(
+            torch.tensor(embeddings_0), torch.tensor(embeddings_1), dim=0
+        )
 
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\nCosine similarity: \t{sim:.4f}"
-                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
-                    f"\n{name_1}:\t{embeddings_1[:16]!r}")
+        fail_msg = (
+            f"Test{prompt_idx}:"
+            f"\nCosine similarity: \t{sim:.4f}"
+            f"\n{name_0}:\t{embeddings_0[:16]!r}"
+            f"\n{name_1}:\t{embeddings_1[:16]!r}"
+        )
 
         assert sim >= 1 - tol, fail_msg
 
@@ -352,17 +372,18 @@ class ModelInfo:
     name: str
     architecture: str = ""
     dtype: str = "auto"
+    max_model_len: int | None = None
     hf_dtype: str = "float32"
-    hf_overrides: Optional[dict[str, Any]] = None
+    hf_overrides: dict[str, Any] | None = None
     default_pooling_type: str = ""
     enable_test: bool = True
 
 
 @dataclass
 class EmbedModelInfo(ModelInfo):
-    mteb_score: Optional[float] = None
+    mteb_score: float | None = None
     is_matryoshka: bool = False
-    matryoshka_dimensions: Optional[list[int]] = None
+    matryoshka_dimensions: list[int] | None = None
 
 
 @dataclass
@@ -377,7 +398,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
 
 @dataclass
 class RerankModelInfo(ModelInfo):
-    mteb_score: Optional[float] = None
+    mteb_score: float | None = None
 
 
 @dataclass
@@ -393,14 +414,43 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
 @dataclass
 class GenerateModelInfo(ModelInfo):
     hf_dtype: str = "auto"
-    hf_ppl: Optional[float] = None
+    hf_ppl: float | None = None
+
+
+def get_vllm_extra_kwargs(model_info: ModelInfo, vllm_extra_kwargs):
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
+    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
+        import pytest
+
+        pytest.skip("Skipping test.")
+
+    # Allow vllm to test using the given dtype, such as float32
+    vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
+
+    # Allow vllm to test using hf_overrides
+    if model_info.hf_overrides is not None:
+        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
+
+    # Allow changing the head dtype used by vllm in tests
+    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
+        if "hf_overrides" not in vllm_extra_kwargs:
+            vllm_extra_kwargs["hf_overrides"] = {}
+        vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
+
+    # Allow control over whether tests use enforce_eager
+    if ci_envs.VLLM_CI_ENFORCE_EAGER is not None:
+        vllm_extra_kwargs["enforce_eager"] = ci_envs.VLLM_CI_ENFORCE_EAGER
+
+    return vllm_extra_kwargs
 
 
 def dummy_hf_overrides(
     hf_config: PretrainedConfig,
     *,
     model_arch: str = "",
-    exist_overrides: Optional[dict[str, Any]] = None,
+    exist_overrides: dict[str, Any] | None = None,
     use_original_num_layers: bool = False,
 ) -> PretrainedConfig:
     """
@@ -413,71 +463,89 @@ def dummy_hf_overrides(
 
     # Ensure at least 2 expert per group
     # Since `grouped_topk` assumes top-2
-    n_group = getattr(text_config, 'n_group', None)
+    n_group = getattr(text_config, "n_group", None)
     num_experts = n_group * 2 if n_group is not None else 2
 
     # we use three layers for Gemma-3n to check
     # both normal layer and kv_shared_layer
     if use_original_num_layers:
         # Use the original number of layers from the config
-        num_layers = getattr(text_config, 'num_layers', 1)
-        num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1)
+        num_layers = getattr(text_config, "num_layers", 1)
+        num_hidden_layers = getattr(text_config, "num_hidden_layers", 1)
     else:
         # Use minimal layers for testing
         num_layers = 1
-        num_hidden_layers = (3 if model_arch
-                             == "Gemma3nForConditionalGeneration" else 1)
+        num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
 
     update_dict = {
         "num_layers": num_layers,
-        "num_experts": num_experts,
-        "num_experts_per_tok": 2,
-        "num_local_experts": num_experts,
-        # Otherwise there will not be any expert layers
-        "first_k_dense_replace": 0,
-        # To avoid OOM on DeepSeek-V3
-        "n_routed_experts": num_experts,
         # For Gemma-3n
         "num_kv_shared_layers": 1,
     }
 
+    class DummyConfig:
+        hf_text_config = text_config
+
+    # Only set MoE related config when the model has MoE layers.
+    # Otherwise all models detected as MoE by _get_transformers_backend_cls.
+    if ModelConfig.get_num_experts(DummyConfig) > 0:
+        update_dict.update(
+            {
+                "num_experts": num_experts,
+                "num_experts_per_tok": 2,
+                "num_local_experts": num_experts,
+                # Otherwise there will not be any expert layers
+                "first_k_dense_replace": 0,
+                # To avoid OOM on DeepSeek-V3
+                "n_routed_experts": num_experts,
+            }
+        )
+
     # Update num_hidden_layers for non-Longcat architectures
-    if model_arch != "LongcatFlashForCausalLM" \
-            and model_arch != "LongCatFlashMTPModel":
+    if model_arch != "LongcatFlashForCausalLM" and model_arch != "LongCatFlashMTPModel":
         update_dict["num_hidden_layers"] = num_hidden_layers
 
     text_config.update(update_dict)
 
     if hasattr(hf_config, "vision_config"):
-        hf_config.vision_config.update({
-            "num_layers": 1,
-            "num_hidden_layers": 1,
-        })
+        hf_config.vision_config.update(
+            {
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+            }
+        )
 
     # e.g.: ibm-granite/granite-speech-3.3-2b
     if hasattr(hf_config, "encoder_config"):
-        hf_config.encoder_config.update({
-            "num_layers": 1,
-            "num_hidden_layers": 1,
-        })
+        hf_config.encoder_config.update(
+            {
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+            }
+        )
 
     # e.g.: Qwen/Qwen2-Audio-7B-Instruct
     if hasattr(hf_config, "audio_config"):
-        hf_config.audio_config.update({
-            "num_layers": 1,
-            "num_hidden_layers": 1,
-            "encoder_layers": 1,
-        })
+        hf_config.audio_config.update(
+            {
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+                "encoder_layers": 1,
+            }
+        )
 
     return hf_config
 
 
-def check_transformers_version(model: str,
-                               min_transformers_version: Optional[str] = None,
-                               max_transformers_version: Optional[str] = None):
+def check_transformers_version(
+    model: str,
+    min_transformers_version: str | None = None,
+    max_transformers_version: str | None = None,
+):
     from .registry import _HfExamplesInfo
 
-    return _HfExamplesInfo(model,
-                           min_transformers_version=min_transformers_version,
-                           max_transformers_version=max_transformers_version
-                           ).check_transformers_version(on_fail="skip")
+    return _HfExamplesInfo(
+        model,
+        min_transformers_version=min_transformers_version,
+        max_transformers_version=max_transformers_version,
+    ).check_transformers_version(on_fail="skip")
diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py
index ba39af845041a0edfe61a8365fadf9d195e4a932..189b319e5fcdee02cc0a6bbe3c8edf27fe624b39 100644
--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -8,9 +8,12 @@ from unittest.mock import patch
 import numpy as np
 import pytest
 
-from vllm.multimodal.audio import (AudioMediaIO, AudioResampler,
-                                   resample_audio_librosa,
-                                   resample_audio_scipy)
+from vllm.multimodal.audio import (
+    AudioMediaIO,
+    AudioResampler,
+    resample_audio_librosa,
+    resample_audio_scipy,
+)
 
 
 @pytest.fixture
@@ -21,12 +24,10 @@ def dummy_audio():
 def test_resample_audio_librosa(dummy_audio):
     with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
         mock_resample.return_value = dummy_audio * 2
-        out = resample_audio_librosa(dummy_audio,
-                                     orig_sr=44100,
-                                     target_sr=22050)
-        mock_resample.assert_called_once_with(dummy_audio,
-                                              orig_sr=44100,
-                                              target_sr=22050)
+        out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
+        mock_resample.assert_called_once_with(
+            dummy_audio, orig_sr=44100, target_sr=22050
+        )
         assert np.all(out == dummy_audio * 2)
 
 
@@ -40,8 +41,7 @@ def test_resample_audio_scipy(dummy_audio):
     assert np.all(out_same == dummy_audio)
 
 
-@pytest.mark.xfail(
-    reason="resample_audio_scipy is buggy for non-integer ratios")
+@pytest.mark.xfail(reason="resample_audio_scipy is buggy for non-integer ratios")
 def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
     out = resample_audio_scipy(dummy_audio, orig_sr=5, target_sr=3)
 
@@ -54,13 +54,12 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
 
 def test_audio_resampler_librosa_calls_resample(dummy_audio):
     resampler = AudioResampler(target_sr=22050, method="librosa")
-    with patch(
-            "vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
+    with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
         mock_resample.return_value = dummy_audio
         out = resampler.resample(dummy_audio, orig_sr=44100)
-        mock_resample.assert_called_once_with(dummy_audio,
-                                              orig_sr=44100,
-                                              target_sr=22050)
+        mock_resample.assert_called_once_with(
+            dummy_audio, orig_sr=44100, target_sr=22050
+        )
         assert np.all(out == dummy_audio)
 
 
@@ -69,9 +68,9 @@ def test_audio_resampler_scipy_calls_resample(dummy_audio):
     with patch("vllm.multimodal.audio.resample_audio_scipy") as mock_resample:
         mock_resample.return_value = dummy_audio
         out = resampler.resample(dummy_audio, orig_sr=44100)
-        mock_resample.assert_called_once_with(dummy_audio,
-                                              orig_sr=44100,
-                                              target_sr=22050)
+        mock_resample.assert_called_once_with(
+            dummy_audio, orig_sr=44100, target_sr=22050
+        )
         assert np.all(out == dummy_audio)
 
 
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index 3c737acfbfe28fe96d886e92f072ff6c0338e132..531674c30f55f80233699f14ca8f344ea07a6e09 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import numpy as np
 import pytest
@@ -8,29 +7,36 @@ import torch
 
 from vllm.config import ModelConfig, ParallelConfig, VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import (MultiModalCache,
-                                   MultiModalProcessorCacheItem,
-                                   MultiModalProcessorCacheItemMetadata,
-                                   engine_receiver_cache_from_config,
-                                   processor_cache_from_config)
+from vllm.multimodal.cache import (
+    MultiModalCache,
+    MultiModalProcessorCacheItem,
+    MultiModalProcessorCacheItemMetadata,
+    engine_receiver_cache_from_config,
+    processor_cache_from_config,
+)
 from vllm.multimodal.hasher import MultiModalHasher
-from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
-                                    MultiModalKwargsItems,
-                                    MultiModalSharedField)
+from vllm.multimodal.inputs import (
+    MultiModalFieldElem,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalSharedField,
+)
 from vllm.multimodal.processing import PromptInsertion
 
+pytestmark = pytest.mark.cpu_test
+
 
 def _dummy_elem(
     modality: str,
     key: str,
     size: int,
     *,
-    rng: Optional[np.random.RandomState] = None,
+    rng: np.random.RandomState | None = None,
 ):
     if rng is None:
-        data = torch.empty((size, ), dtype=torch.int8)
+        data = torch.empty((size,), dtype=torch.int8)
     else:
-        data = torch.from_numpy(rng.randint(4, size=(size, ), dtype=np.int8))
+        data = torch.from_numpy(rng.randint(4, size=(size,), dtype=np.int8))
 
     return MultiModalFieldElem(
         modality=modality,
@@ -44,44 +50,47 @@ def _dummy_item(
     modality: str,
     size_by_key: dict[str, int],
     *,
-    rng: Optional[np.random.RandomState] = None,
+    rng: np.random.RandomState | None = None,
 ):
-    return MultiModalKwargsItem.from_elems([
-        _dummy_elem(modality, key, size, rng=rng)
-        for key, size in size_by_key.items()
-    ])
+    return MultiModalKwargsItem.from_elems(
+        [_dummy_elem(modality, key, size, rng=rng) for key, size in size_by_key.items()]
+    )
 
 
 def _dummy_items(
     size_by_key_modality: dict[str, dict[str, int]],
     *,
-    rng: Optional[np.random.RandomState] = None,
+    rng: np.random.RandomState | None = None,
 ):
-    return MultiModalKwargsItems.from_seq([
-        _dummy_item(modality, size_by_key, rng=rng)
-        for modality, size_by_key in size_by_key_modality.items()
-    ])
+    return MultiModalKwargsItems.from_seq(
+        [
+            _dummy_item(modality, size_by_key, rng=rng)
+            for modality, size_by_key in size_by_key_modality.items()
+        ]
+    )
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("item", "expected_size"),
     [
         (_dummy_item("a", {"a1": 100}), 100),
         (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
         (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
-        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460),  # noqa: E501
+        (
+            _dummy_items(
+                {"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
+            ).get_data(),
+            460,
+        ),  # noqa: E501
     ],
 )
-# yapf: enable
 def test_cache_item_size(item, expected_size):
     cache = MultiModalCache.get_lru_cache(2048, type(item))
 
     cache[""] = item
     assert cache.currsize == expected_size
 
-    prompt_update = PromptInsertion("dummy", "target", "insertion") \
-        .resolve(0)
+    prompt_update = PromptInsertion("dummy", "target", "insertion").resolve(0)
 
     cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
     assert cache.currsize == expected_size
@@ -98,9 +107,9 @@ def _create_vllm_config(
     return VllmConfig(
         model_config=ModelConfig(
             model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-            mm_processor_cache_gb=mm_processor_cache_gb),
-        parallel_config=ParallelConfig(
-            data_parallel_size=1 if enable_ipc else 2),
+            mm_processor_cache_gb=mm_processor_cache_gb,
+        ),
+        parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
     )
 
 
@@ -116,11 +125,9 @@ def _compare_caches(
     seed: int = 0,
 ):
     cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
-    cache_0_p1 = engine_receiver_cache_from_config(config_0,
-                                                   MULTIMODAL_REGISTRY)
+    cache_0_p1 = engine_receiver_cache_from_config(config_0, MULTIMODAL_REGISTRY)
     cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
-    cache_1_p1 = engine_receiver_cache_from_config(config_1,
-                                                   MULTIMODAL_REGISTRY)
+    cache_1_p1 = engine_receiver_cache_from_config(config_1, MULTIMODAL_REGISTRY)
 
     cache_size_gb = max(
         config_0.model_config.multimodal_config.mm_processor_cache_gb,
@@ -134,8 +141,7 @@ def _compare_caches(
         for _ in range(int(item_capacity / hit_rate))
     ]
     all_hashes = [
-        MultiModalHasher.hash_kwargs(item=item.get_data())
-        for item in all_items
+        MultiModalHasher.hash_kwargs(item=item.get_data()) for item in all_items
     ]
 
     # Should not be used since there is nothing to convert to text
@@ -154,7 +160,8 @@ def _compare_caches(
             for _ in range(is_cached_calls_per_iter):
                 cache_0_p0.is_cached(selected_hashes)
             cache_0_p0_out = [
-                item for item, _ in cache_0_p0.get_and_update(
+                item
+                for item, _ in cache_0_p0.get_and_update(
                     [(item, prompt_update.content) for item in selected_items],
                     selected_hashes,
                 )
@@ -166,7 +173,8 @@ def _compare_caches(
             for _ in range(is_cached_calls_per_iter):
                 cache_1_p0.is_cached(selected_hashes)
             cache_1_p0_out = [
-                item for item, _ in cache_1_p0.get_and_update(
+                item
+                for item, _ in cache_1_p0.get_and_update(
                     [(item, prompt_update.content) for item in selected_items],
                     selected_hashes,
                 )
@@ -175,14 +183,12 @@ def _compare_caches(
         if cache_0_p1 is None:
             cache_0_p1_out = cache_0_p0_out
         else:
-            cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out,
-                                                       selected_hashes)
+            cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, selected_hashes)
 
         if cache_1_p1 is None:
             cache_1_p1_out = cache_1_p0_out
         else:
-            cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out,
-                                                       selected_hashes)
+            cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, selected_hashes)
 
         assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"
 
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
index 2751e38760e1750f1a73476d759dca22facdf563..29064f2737834d66333590042ece3d678e51da20 100644
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -10,6 +10,8 @@ from PIL import Image, ImageDraw
 
 from vllm.multimodal.hasher import MultiModalHasher
 
+pytestmark = pytest.mark.cpu_test
+
 ASSETS_DIR = Path(__file__).parent / "assets"
 assert ASSETS_DIR.exists()
 
@@ -88,8 +90,6 @@ def test_hash_image_exif_id():
 
     hasher = MultiModalHasher
     # first image has UUID in ImageID, so it should hash to that UUID
-    assert hasher.hash_kwargs(image=image1) == hasher.hash_kwargs(
-        image=id.bytes)
+    assert hasher.hash_kwargs(image=image1) == hasher.hash_kwargs(image=id.bytes)
     # second image has non-UUID in ImageID, so it should hash to the image data
-    assert hasher.hash_kwargs(image=image2) == hasher.hash_kwargs(
-        image=image2a)
+    assert hasher.hash_kwargs(image=image2) == hasher.hash_kwargs(image=image2a)
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py
index 271a85f1195eccbeb2c3e43bfd822921280458be..329a5b0494cb66ee3d91c9a915ed3dd32a3c15fb 100644
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
@@ -8,6 +8,8 @@ from PIL import Image, ImageChops
 
 from vllm.multimodal.image import ImageMediaIO, convert_image_mode
 
+pytestmark = pytest.mark.cpu_test
+
 ASSETS_DIR = Path(__file__).parent / "assets"
 assert ASSETS_DIR.exists()
 
@@ -41,8 +43,7 @@ def test_rgba_to_rgb():
 def test_rgba_to_rgb_custom_background(tmp_path):
     """Test RGBA to RGB conversion with custom background colors."""
     # Create a simple RGBA image with transparent and opaque pixels
-    rgba_image = Image.new("RGBA", (10, 10),
-                           (255, 0, 0, 255))  # Red with full opacity
+    rgba_image = Image.new("RGBA", (10, 10), (255, 0, 0, 255))  # Red with full opacity
 
     # Make top-left quadrant transparent
     for i in range(5):
@@ -92,7 +93,7 @@ def test_rgba_to_rgb_custom_background(tmp_path):
     assert blue_numpy[0][0][2] == 255  # B
 
     # Test 4: Test with load_bytes method
-    with open(test_image_path, 'rb') as f:
+    with open(test_image_path, "rb") as f:
         image_data = f.read()
 
     image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0))
@@ -109,39 +110,47 @@ def test_rgba_background_color_validation():
     """Test that invalid rgba_background_color values are properly rejected."""
 
     # Test invalid types
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color="255,255,255")
 
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=255)
 
     # Test wrong number of elements
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=(255, 255))
 
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=(255, 255, 255, 255))
 
     # Test non-integer values
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0))
 
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=(255, "255", 255))
 
     # Test out of range values
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=(256, 255, 255))
 
-    with pytest.raises(ValueError,
-                       match="rgba_background_color must be a list or tuple"):
+    with pytest.raises(
+        ValueError, match="rgba_background_color must be a list or tuple"
+    ):
         ImageMediaIO(rgba_background_color=(255, -1, 255))
 
     # Test that valid values work
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
index ffb3a6fe86b46ca03741b896d9afe06a856840be..88e92bee3a292e9f440fc65e8269712bb340dade 100644
--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
@@ -1,13 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
 import torch
 
 from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
 
+pytestmark = pytest.mark.cpu_test
 
-def assert_nested_tensors_equal(expected: NestedTensors,
-                                actual: NestedTensors):
+
+def assert_nested_tensors_equal(expected: NestedTensors, actual: NestedTensors):
     assert type(expected) == type(actual)  # noqa: E721
     if isinstance(expected, torch.Tensor):
         assert torch.equal(expected, actual)
@@ -16,8 +18,9 @@ def assert_nested_tensors_equal(expected: NestedTensors,
             assert_nested_tensors_equal(expected_item, actual_item)
 
 
-def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
-                                   actual: MultiModalKwargs):
+def assert_multimodal_inputs_equal(
+    expected: MultiModalKwargs, actual: MultiModalKwargs
+):
     assert set(expected.keys()) == set(actual.keys())
     for key in expected:
         assert_nested_tensors_equal(expected[key], actual[key])
@@ -49,19 +52,10 @@ def test_multimodal_input_batch_nested_tensors():
     a = torch.rand([2, 3])
     b = torch.rand([2, 3])
     c = torch.rand([2, 3])
-    result = MultiModalKwargs.batch([{
-        "image": [a]
-    }, {
-        "image": [b]
-    }, {
-        "image": [c]
-    }])
-    assert_multimodal_inputs_equal(result, {
-        "image":
-        torch.stack([a.unsqueeze(0),
-                     b.unsqueeze(0),
-                     c.unsqueeze(0)])
-    })
+    result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b]}, {"image": [c]}])
+    assert_multimodal_inputs_equal(
+        result, {"image": torch.stack([a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)])}
+    )
 
 
 def test_multimodal_input_batch_heterogeneous_lists():
@@ -70,8 +64,8 @@ def test_multimodal_input_batch_heterogeneous_lists():
     c = torch.rand([1, 2, 3])
     result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
     assert_multimodal_inputs_equal(
-        result,
-        {"image": [torch.stack([a, b]), c.unsqueeze(0)]})
+        result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}
+    )
 
 
 def test_multimodal_input_batch_multiple_batchable_lists():
@@ -81,9 +75,8 @@ def test_multimodal_input_batch_multiple_batchable_lists():
     d = torch.rand([1, 2, 3])
     result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
     assert_multimodal_inputs_equal(
-        result,
-        {"image": torch.stack([torch.stack([a, b]),
-                               torch.stack([c, d])])})
+        result, {"image": torch.stack([torch.stack([a, b]), torch.stack([c, d])])}
+    )
 
 
 def test_multimodal_input_batch_mixed_stacking_depths():
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 352b5b5b4fd46a8e228253e519f134eac6fcc262..2f04bc6695c8162c596c48db30eab13bfe575b64 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -2,31 +2,33 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
-from typing import Optional, cast
+from typing import cast
 
 import numpy as np
 import pytest
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.multimodal.processing import (InputProcessingContext,
-                                        PlaceholderFeaturesInfo,
-                                        PromptIndexTargets, PromptInsertion,
-                                        PromptReplacement, apply_text_matches,
-                                        apply_token_matches,
-                                        find_mm_placeholders,
-                                        iter_token_matches,
-                                        replace_token_matches)
-# yapf: enable
+from vllm.multimodal.processing import (
+    InputProcessingContext,
+    PlaceholderFeaturesInfo,
+    PromptIndexTargets,
+    PromptInsertion,
+    PromptReplacement,
+    apply_text_matches,
+    apply_token_matches,
+    find_mm_placeholders,
+    iter_token_matches,
+    replace_token_matches,
+)
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .utils import random_image
 
+pytestmark = pytest.mark.cpu_test
+
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("token_ids", "match_ids", "expected"),
     [
@@ -36,34 +38,34 @@ from .utils import random_image
             [32000, 32000, 32000],
             [32000],
             [
-                { "start_idx": 0, "end_idx": 1 },
-                { "start_idx": 1, "end_idx": 2 },
-                { "start_idx": 2, "end_idx": 3 },
+                {"start_idx": 0, "end_idx": 1},
+                {"start_idx": 1, "end_idx": 2},
+                {"start_idx": 2, "end_idx": 3},
             ],
         ),
         (
             [32000, 32000, 32000],
             [32000, 32000],
-            [{ "start_idx": 0, "end_idx": 2 }],
+            [{"start_idx": 0, "end_idx": 2}],
         ),
         (
             [32000, 32000, 32000],
             [32000, 32000, 32000],
-            [{ "start_idx": 0, "end_idx": 3 }],
+            [{"start_idx": 0, "end_idx": 3}],
         ),
         (
             [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
             [28747, 32000],
             [
-                { "start_idx": 1, "end_idx": 3 },
-                { "start_idx": 6, "end_idx": 8 },
+                {"start_idx": 1, "end_idx": 3},
+                {"start_idx": 6, "end_idx": 8},
             ],
         ),
         (
             [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
             [28747, 32000, 32000, 32000],
             [
-                { "start_idx": 1, "end_idx": 5 },
+                {"start_idx": 1, "end_idx": 5},
             ],
         ),
         (
@@ -74,14 +76,13 @@ from .utils import random_image
     ],
 )
 @pytest.mark.parametrize("start_idx", [0, 4, 8])
-# yapf: enable
 def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
-    result = list(iter_token_matches(token_ids, match_ids,
-                                     start_idx=start_idx))
+    result = list(iter_token_matches(token_ids, match_ids, start_idx=start_idx))
 
     # Manually constructed results
-    assert [item._asdict() for item in result
-            ] == [item for item in expected if item["start_idx"] >= start_idx]
+    assert [item._asdict() for item in result] == [
+        item for item in expected if item["start_idx"] >= start_idx
+    ]
 
     # Invariants
     match_lens = [end - start for start, end in result]
@@ -89,7 +90,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
     assert all(match_len == len(match_ids) for match_len in match_lens)
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("token_ids", "match_ids", "new_ids", "expected"),
     [
@@ -133,7 +133,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
         ),
     ],
 )
-# yapf: enable
 def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
     result = replace_token_matches(token_ids, match_ids, new_ids)
 
@@ -141,7 +140,6 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
     assert result == expected
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("prompt", "target_by_key", "expected_by_key"),
     [
@@ -158,11 +156,11 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
                 "pattern_1": [],
                 "pattern_2": [],
                 "pattern_3": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
                 "pattern_4": [],
                 "pattern_5": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
             },
         ),
@@ -178,26 +176,26 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
             },
             {
                 "pattern_1": [
-                    { "start_idx": 0, "end_idx": 1 },
-                    { "start_idx": 1, "end_idx": 2 },
-                    { "start_idx": 2, "end_idx": 3 },
-                    { "start_idx": 3, "end_idx": 4 },
+                    {"start_idx": 0, "end_idx": 1},
+                    {"start_idx": 1, "end_idx": 2},
+                    {"start_idx": 2, "end_idx": 3},
+                    {"start_idx": 3, "end_idx": 4},
                 ],
                 "pattern_2": [
-                    { "start_idx": 0, "end_idx": 2 },
-                    { "start_idx": 2, "end_idx": 4 },
+                    {"start_idx": 0, "end_idx": 2},
+                    {"start_idx": 2, "end_idx": 4},
                 ],
                 "pattern_3": [
-                    { "start_idx": 0, "end_idx": 3 },
+                    {"start_idx": 0, "end_idx": 3},
                 ],
                 "pattern_4": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
                 "pattern_5": [
-                    { "start_idx": 1, "end_idx": 1 },
+                    {"start_idx": 1, "end_idx": 1},
                 ],
                 "pattern_6": [
-                    { "start_idx": 4, "end_idx": 4 },
+                    {"start_idx": 4, "end_idx": 4},
                 ],
             },
         ),
@@ -213,26 +211,25 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
             },
             {
                 "pattern_1": [
-                    { "start_idx": 1, "end_idx": 3 },
-                    { "start_idx": 6, "end_idx": 8 },
+                    {"start_idx": 1, "end_idx": 3},
+                    {"start_idx": 6, "end_idx": 8},
                 ],
                 "pattern_2": [
-                    { "start_idx": 1, "end_idx": 5 },
+                    {"start_idx": 1, "end_idx": 5},
                 ],
                 "pattern_3": [],
                 "pattern_4": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
                 "pattern_5": [],
                 "pattern_6": [
-                    { "start_idx": 10, "end_idx": 10 },
+                    {"start_idx": 10, "end_idx": 10},
                 ],
             },
         ),
     ],
 )
 @pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
-# yapf: enable
 def test_find_token_matches(
     prompt,
     target_by_key,
@@ -264,7 +261,6 @@ def test_find_token_matches(
     } == expected_by_key
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("prompt", "target_by_key", "expected_by_key"),
     [
@@ -280,16 +276,16 @@ def test_find_token_matches(
                 "pattern_5": PromptIndexTargets.end(),
             },
             {
-                "pattern_1": [{ "start_idx": 0, "end_idx": 0 }],
+                "pattern_1": [{"start_idx": 0, "end_idx": 0}],
                 "pattern_2": [],
                 "pattern_3": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
                 "pattern_4": [],
                 "pattern_5": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
-            }
+            },
         ),
         (
             "<image><image><image><image>",
@@ -303,26 +299,26 @@ def test_find_token_matches(
             },
             {
                 "pattern_1": [
-                    { "start_idx": 0, "end_idx": 7 },
-                    { "start_idx": 7, "end_idx": 14 },
-                    { "start_idx": 14, "end_idx": 21 },
-                    { "start_idx": 21, "end_idx": 28 },
+                    {"start_idx": 0, "end_idx": 7},
+                    {"start_idx": 7, "end_idx": 14},
+                    {"start_idx": 14, "end_idx": 21},
+                    {"start_idx": 21, "end_idx": 28},
                 ],
                 "pattern_2": [
-                    { "start_idx": 0, "end_idx": 14 },
-                    { "start_idx": 14, "end_idx": 28 },
+                    {"start_idx": 0, "end_idx": 14},
+                    {"start_idx": 14, "end_idx": 28},
                 ],
                 "pattern_3": [
-                    { "start_idx": 0, "end_idx": 21 },
+                    {"start_idx": 0, "end_idx": 21},
                 ],
                 "pattern_4": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
                 "pattern_5": [
-                    { "start_idx": 7, "end_idx": 7 },
+                    {"start_idx": 7, "end_idx": 7},
                 ],
                 "pattern_6": [
-                    { "start_idx": 28, "end_idx": 28 },
+                    {"start_idx": 28, "end_idx": 28},
                 ],
             },
         ),
@@ -338,21 +334,21 @@ def test_find_token_matches(
             },
             {
                 "pattern_1": [
-                    { "start_idx": 0, "end_idx": 13 },
-                    { "start_idx": 27, "end_idx": 40 },
+                    {"start_idx": 0, "end_idx": 13},
+                    {"start_idx": 27, "end_idx": 40},
                 ],
                 "pattern_2": [
-                    { "start_idx": 0, "end_idx": 27 },
+                    {"start_idx": 0, "end_idx": 27},
                 ],
                 "pattern_3": [],
                 "pattern_4": [
-                    { "start_idx": 0, "end_idx": 0 },
+                    {"start_idx": 0, "end_idx": 0},
                 ],
                 "pattern_5": [
-                    { "start_idx": 13, "end_idx": 13 },
+                    {"start_idx": 13, "end_idx": 13},
                 ],
                 "pattern_6": [
-                    { "start_idx": 48, "end_idx": 48 },
+                    {"start_idx": 48, "end_idx": 48},
                 ],
             },
         ),
@@ -366,22 +362,21 @@ def test_find_token_matches(
             },
             {
                 "pattern_1": [
-                    { "start_idx": 0, "end_idx": 9 },
-                    { "start_idx": 16, "end_idx": 25 },
+                    {"start_idx": 0, "end_idx": 9},
+                    {"start_idx": 16, "end_idx": 25},
                 ],
                 "pattern_2": [
-                    { "start_idx": 0, "end_idx": 16 },
-                    { "start_idx": 16, "end_idx": 32 },
+                    {"start_idx": 0, "end_idx": 16},
+                    {"start_idx": 16, "end_idx": 32},
                 ],
                 "pattern_3": [
-                    { "start_idx": 0, "end_idx": 25 },
+                    {"start_idx": 0, "end_idx": 25},
                 ],
             },
         ),
     ],
 )
 @pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
-# yapf: enable
 def test_find_text_matches(
     prompt,
     target_by_key,
@@ -413,7 +408,6 @@ def test_find_text_matches(
     } == expected_by_key
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"),  # noqa: E501
     [
@@ -541,9 +535,8 @@ def test_find_text_matches(
                 },
             },
         ),
-    ]
+    ],
 )
-# yapf: enable
 def test_find_update_text(
     prompt,
     target_by_key,
@@ -554,13 +547,15 @@ def test_find_update_text(
     mock_tokenizer = cast(AnyTokenizer, object())
 
     for (
-            update_type,
-            expected_by_mm_count,
+        update_type,
+        expected_by_mm_count,
     ) in expected_by_update_type_mm_count.items():
         for mm_count, expected in expected_by_mm_count.items():
             mm_prompt_updates = {
-                key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
-                      for i in range(mm_count)]
+                key: [
+                    [update_type(key, target, repl_by_key[key]).resolve(i)]
+                    for i in range(mm_count)
+                ]
                 for key, target in target_by_key.items()
             }
 
@@ -581,7 +576,6 @@ def test_find_update_text(
             assert new_prompt == expected
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"),  # noqa: E501
     [
@@ -607,8 +601,43 @@ def test_find_update_text(
             {
                 PromptInsertion: {
                     0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
-                    1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550],  # noqa: E501
-                    2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550],  # noqa: E501
+                    1: [
+                        1,
+                        9833,
+                        28747,
+                        32000,
+                        32000,
+                        32000,
+                        9833,
+                        28747,
+                        32000,
+                        32000,
+                        918,
+                        1550,
+                        918,
+                        1550,
+                    ],  # noqa: E501
+                    2: [
+                        1,
+                        9833,
+                        28747,
+                        32000,
+                        32000,
+                        32000,
+                        32000,
+                        32000,
+                        9833,
+                        28747,
+                        32000,
+                        32000,
+                        918,
+                        1550,
+                        918,
+                        1550,
+                        1550,
+                        918,
+                        1550,
+                    ],  # noqa: E501
                 },
                 PromptReplacement: {
                     0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
@@ -711,9 +740,8 @@ def test_find_update_text(
                 },
             },
         ),
-    ]
+    ],
 )
-# yapf: enable
 def test_find_update_tokens(
     prompt,
     target_by_key,
@@ -724,13 +752,15 @@ def test_find_update_tokens(
     mock_tokenizer = cast(AnyTokenizer, object())
 
     for (
-            update_type,
-            expected_by_mm_count,
+        update_type,
+        expected_by_mm_count,
     ) in expected_by_update_type_mm_count.items():
         for mm_count, expected in expected_by_mm_count.items():
             mm_prompt_updates = {
-                key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
-                      for i in range(mm_count)]
+                key: [
+                    [update_type(key, target, repl_by_key[key]).resolve(i)]
+                    for i in range(mm_count)
+                ]
                 for key, target in target_by_key.items()
             }
 
@@ -751,7 +781,6 @@ def test_find_update_tokens(
             assert new_prompt == expected
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     "repl_by_key",
     [
@@ -788,8 +817,7 @@ def test_find_update_tokens(
                         is_embed=None,
                     ),
                 ],
-            }
-
+            },
         ),
         (
             [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
@@ -820,7 +848,7 @@ def test_find_update_tokens(
                     ),
                 ],
                 # No match for pattern_4 as it has lower priority than pattern_1
-            }
+            },
         ),
         (
             [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
@@ -859,12 +887,11 @@ def test_find_update_tokens(
                         is_embed=None,
                     ),
                 ],
-            }
+            },
         ),
-    ]
+    ],
 )
 @pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
-# yapf: enable
 def test_find_mm_placeholders(
     repl_by_key,
     prompt,
@@ -891,8 +918,15 @@ def test_find_mm_placeholders(
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("limit", "num_supported", "is_valid"),
-    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
-     (2, 1, False), (2, 2, True)],
+    [
+        (0, 0, True),
+        (0, 1, True),
+        (1, 0, False),
+        (1, 1, True),
+        (1, 2, True),
+        (2, 1, False),
+        (2, 2, True),
+    ],
 )
 def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
     limit_mm_per_prompt = {"image": limit}
@@ -907,10 +941,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
 
     profiler = MultiModalProfiler(processor)
 
-    if is_valid:
-        exc_ctx = nullcontext()
-    else:
-        exc_ctx = pytest.raises(ValueError, match="At most")
+    exc_ctx = nullcontext() if is_valid else pytest.raises(ValueError, match="At most")
 
     with exc_ctx:
         profiler.get_decoder_dummy_data(
@@ -922,8 +953,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("num_images", "limit", "is_valid"),
-    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
-     (2, 1, False), (2, 2, True)],
+    [
+        (0, 0, True),
+        (0, 1, True),
+        (1, 0, False),
+        (1, 1, True),
+        (1, 2, True),
+        (2, 1, False),
+        (2, 2, True),
+    ],
 )
 def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
     limit_mm_per_prompt = {"image": limit}
@@ -944,10 +982,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
     else:
         mm_data = {"image": [image] * num_images}
 
-    if is_valid:
-        exc_ctx = nullcontext()
-    else:
-        exc_ctx = pytest.raises(ValueError, match="At most")
+    exc_ctx = nullcontext() if is_valid else pytest.raises(ValueError, match="At most")
 
     with exc_ctx:
         processor.apply(
@@ -958,7 +993,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
 
 
 class DummyProcessor:
-
     def __init__(self, a: int = 0, b: int = 0) -> None:
         super().__init__()
 
@@ -969,12 +1003,11 @@ class DummyProcessor:
         self,
         a: int = 0,
         c: int = 0,
-        return_tensors: Optional[str] = None,
+        return_tensors: str | None = None,
     ) -> dict[str, int]:
         return dict(a=a, c=c)
 
 
-# yapf: disable
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 @pytest.mark.parametrize(
     ("config_kwargs", "inference_kwargs", "expected_kwargs"),
@@ -988,7 +1021,6 @@ class DummyProcessor:
         ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
     ],
 )
-# yapf: enable
 def test_hf_processor_init_kwargs(
     model_id,
     config_kwargs,
@@ -1012,7 +1044,6 @@ def test_hf_processor_init_kwargs(
         assert getattr(processor, k) == v
 
 
-# yapf: disable
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 @pytest.mark.parametrize(
     ("config_kwargs", "inference_kwargs", "expected_kwargs"),
@@ -1026,7 +1057,6 @@ def test_hf_processor_init_kwargs(
         ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
     ],
 )
-# yapf: enable
 def test_hf_processor_call_kwargs(
     model_id,
     config_kwargs,
diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py
index d31e75bc279f6a5c5150e10cd8871a82426a7834..3b01bda7f54c83b8457c3a567459f28e4c8c1afa 100644
--- a/tests/multimodal/test_registry.py
+++ b/tests/multimodal/test_registry.py
@@ -11,28 +11,24 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 
 from ..models.utils import build_model_context
 
+pytestmark = pytest.mark.cpu_test
+
 
 @pytest.mark.parametrize(
     "model_id,limit_mm_per_prompt,expected",
     [
         ("Qwen/Qwen2-0.5B-Instruct", {}, False),
         ("Qwen/Qwen2.5-VL-3B-Instruct", {}, True),
-        ("Qwen/Qwen2.5-VL-3B-Instruct", {
-            "image": 0,
-            "video": 0
-        }, False),
-        ("Qwen/Qwen2.5-VL-3B-Instruct", {
-            "image": 0
-        }, True),
+        ("Qwen/Qwen2.5-VL-3B-Instruct", {"image": 0, "video": 0}, False),
+        ("Qwen/Qwen2.5-VL-3B-Instruct", {"image": 0}, True),
     ],
 )
 @pytest.mark.core_model
 def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
-    """Test supports_multimodal_inputs returns correct boolean for various 
+    """Test supports_multimodal_inputs returns correct boolean for various
     configs."""
     ctx = build_model_context(
         model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(
-        ctx.model_config) is expected
\ No newline at end of file
+    assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index d1a7882a4c37696332ae08133cf42d737ea645cc..ea795fcbbde55f2979dcc366285e2dd3fed74012 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -30,7 +30,6 @@ TEST_VIDEO_URLS = [
 
 @pytest.fixture(scope="module")
 def url_images(local_asset_server) -> dict[str, Image.Image]:
-
     return {
         image_url: local_asset_server.get_image_asset(image_url)
         for image_url in TEST_IMAGE_ASSETS
@@ -39,10 +38,10 @@ def url_images(local_asset_server) -> dict[str, Image.Image]:
 
 def get_supported_suffixes() -> tuple[str, ...]:
     # We should at least test the file types mentioned in GPT-4 with Vision
-    OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif')
+    OPENAI_SUPPORTED_SUFFIXES = (".png", ".jpeg", ".jpg", ".webp", ".gif")
 
     # Additional file types that are supported by us
-    EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff')
+    EXTRA_SUPPORTED_SUFFIXES = (".bmp", ".tiff")
 
     return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES
 
@@ -64,14 +63,16 @@ async def test_fetch_image_http(image_url: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
-async def test_fetch_image_base64(url_images: dict[str, Image.Image],
-                                  raw_image_url: str, suffix: str):
+async def test_fetch_image_base64(
+    url_images: dict[str, Image.Image], raw_image_url: str, suffix: str
+):
     connector = MediaConnector(
         # Domain restriction should not apply to data URLs.
         allowed_media_domains=[
             "www.bogotobogo.com",
             "github.com",
-        ])
+        ]
+    )
     url_image = url_images[raw_image_url]
 
     try:
@@ -80,14 +81,14 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],
         try:
             mime_type = mimetypes.types_map[suffix]
         except KeyError:
-            pytest.skip('No MIME type')
+            pytest.skip("No MIME type")
 
     with NamedTemporaryFile(suffix=suffix) as f:
         try:
             url_image.save(f.name)
         except Exception as e:
-            if e.args[0] == 'cannot write mode RGBA as JPEG':
-                pytest.skip('Conversion not supported')
+            if e.args[0] == "cannot write mode RGBA as JPEG":
+                pytest.skip("Conversion not supported")
 
             raise
 
@@ -113,30 +114,36 @@ async def test_fetch_image_local_files(image_url: str):
         local_connector = MediaConnector(allowed_local_media_path=temp_dir)
 
         origin_image = connector.fetch_image(image_url)
-        origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
-                          quality=100,
-                          icc_profile=origin_image.info.get('icc_profile'))
+        origin_image.save(
+            os.path.join(temp_dir, os.path.basename(image_url)),
+            quality=100,
+            icc_profile=origin_image.info.get("icc_profile"),
+        )
 
         image_async = await local_connector.fetch_image_async(
-            f"file://{temp_dir}/{os.path.basename(image_url)}")
+            f"file://{temp_dir}/{os.path.basename(image_url)}"
+        )
         image_sync = local_connector.fetch_image(
-            f"file://{temp_dir}/{os.path.basename(image_url)}")
+            f"file://{temp_dir}/{os.path.basename(image_url)}"
+        )
         # Check that the images are equal
         assert not ImageChops.difference(image_sync, image_async).getbbox()
 
         with pytest.raises(ValueError, match="must be a subpath"):
             await local_connector.fetch_image_async(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+                f"file://{temp_dir}/../{os.path.basename(image_url)}"
+            )
         with pytest.raises(RuntimeError, match="Cannot load local files"):
             await connector.fetch_image_async(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+                f"file://{temp_dir}/../{os.path.basename(image_url)}"
+            )
 
         with pytest.raises(ValueError, match="must be a subpath"):
             local_connector.fetch_image(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+                f"file://{temp_dir}/../{os.path.basename(image_url)}"
+            )
         with pytest.raises(RuntimeError, match="Cannot load local files"):
-            connector.fetch_image(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+            connector.fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
 
 @pytest.mark.asyncio
@@ -149,18 +156,19 @@ async def test_fetch_image_local_files_with_space_in_name(image_url: str):
 
         origin_image = connector.fetch_image(image_url)
         filename = "file name with space.jpg"
-        origin_image.save(os.path.join(temp_dir, filename),
-                          quality=100,
-                          icc_profile=origin_image.info.get('icc_profile'))
+        origin_image.save(
+            os.path.join(temp_dir, filename),
+            quality=100,
+            icc_profile=origin_image.info.get("icc_profile"),
+        )
 
         try:
             image_async = await local_connector.fetch_image_async(
-                f"file://{temp_dir}/{filename}")
-            image_sync = local_connector.fetch_image(
-                f"file://{temp_dir}/{filename}")
+                f"file://{temp_dir}/{filename}"
+            )
+            image_sync = local_connector.fetch_image(f"file://{temp_dir}/{filename}")
         except FileNotFoundError as e:
-            pytest.fail(
-                "Failed to fetch image with space in name: {}".format(e))
+            pytest.fail("Failed to fetch image with space in name: {}".format(e))
         # Check that the images are equal
         assert not ImageChops.difference(image_sync, image_async).getbbox()
 
@@ -183,9 +191,12 @@ async def test_fetch_image_error_conversion():
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
 async def test_fetch_video_http(video_url: str, num_frames: int):
     connector = MediaConnector(
-        media_io_kwargs={"video": {
-            "num_frames": num_frames,
-        }})
+        media_io_kwargs={
+            "video": {
+                "num_frames": num_frames,
+            }
+        }
+    )
 
     video_sync, metadata_sync = connector.fetch_video(video_url)
     video_async, metadata_async = await connector.fetch_video_async(video_url)
@@ -198,8 +209,11 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
 @pytest.mark.parametrize("max_duration", [1, 60, 1800])
 @pytest.mark.parametrize("requested_fps", [2, 24])
 async def test_fetch_video_http_with_dynamic_loader(
-        video_url: str, max_duration: int, requested_fps: int,
-        monkeypatch: pytest.MonkeyPatch):
+    video_url: str,
+    max_duration: int,
+    requested_fps: int,
+    monkeypatch: pytest.MonkeyPatch,
+):
     with monkeypatch.context() as m:
         m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
         connector = MediaConnector(
@@ -208,18 +222,17 @@ async def test_fetch_video_http_with_dynamic_loader(
                     "max_duration": max_duration,
                     "requested_fps": requested_fps,
                 }
-            })
+            }
+        )
 
         video_sync, metadata_sync = connector.fetch_video(video_url)
-        video_async, metadata_async = await connector.fetch_video_async(
-            video_url)
+        video_async, metadata_async = await connector.fetch_video_async(video_url)
 
         assert np.array_equal(video_sync, video_async)
         assert metadata_sync == metadata_async
         assert metadata_sync["video_backend"] == "opencv_dynamic"
 
 
-# yapf: disable
 @pytest.mark.parametrize(
     "case",
     [
@@ -250,7 +263,6 @@ async def test_fetch_video_http_with_dynamic_loader(
                 ("image", 0),
             ],
         ),
-
         # Two modalities
         ## Internally sorted
         dict(
@@ -262,7 +274,7 @@ async def test_fetch_video_http_with_dynamic_loader(
                 "audio": [
                     PlaceholderRange(offset=0, length=2),
                     PlaceholderRange(offset=2, length=3),
-                ]
+                ],
             },
             expected_modality_idxs=[
                 ("audio", 0),
@@ -281,7 +293,7 @@ async def test_fetch_video_http_with_dynamic_loader(
                 "audio": [
                     PlaceholderRange(offset=5, length=2),
                     PlaceholderRange(offset=11, length=4),
-                ]
+                ],
             },
             expected_modality_idxs=[
                 ("image", 0),
@@ -300,7 +312,7 @@ async def test_fetch_video_http_with_dynamic_loader(
                 "audio": [
                     PlaceholderRange(offset=11, length=4),
                     PlaceholderRange(offset=5, length=2),
-                ]
+                ],
             },
             expected_modality_idxs=[
                 ("image", 1),
@@ -309,7 +321,6 @@ async def test_fetch_video_http_with_dynamic_loader(
                 ("audio", 0),
             ],
         ),
-
         # Three modalities
         ## Internally sorted
         dict(
@@ -325,7 +336,7 @@ async def test_fetch_video_http_with_dynamic_loader(
                     PlaceholderRange(offset=3, length=4),
                     PlaceholderRange(offset=7, length=5),
                     PlaceholderRange(offset=12, length=6),
-                ]
+                ],
             },
             expected_modality_idxs=[
                 ("audio", 0),
@@ -349,7 +360,7 @@ async def test_fetch_video_http_with_dynamic_loader(
                 ],
                 "video": [
                     PlaceholderRange(offset=8, length=5),
-                ]
+                ],
             },
             expected_modality_idxs=[
                 ("image", 0),
@@ -372,7 +383,7 @@ async def test_fetch_video_http_with_dynamic_loader(
                 ],
                 "video": [
                     PlaceholderRange(offset=8, length=5),
-                ]
+                ],
             },
             expected_modality_idxs=[
                 ("image", 0),
@@ -384,7 +395,6 @@ async def test_fetch_video_http_with_dynamic_loader(
         ),
     ],
 )
-# yapf: enable
 def test_argsort_mm_positions(case):
     mm_positions = case["mm_positions"]
     expected_modality_idxs = case["expected_modality_idxs"]
@@ -399,13 +409,16 @@ def test_argsort_mm_positions(case):
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
 async def test_allowed_media_domains(video_url: str, num_frames: int):
     connector = MediaConnector(
-        media_io_kwargs={"video": {
-            "num_frames": num_frames,
-        }},
+        media_io_kwargs={
+            "video": {
+                "num_frames": num_frames,
+            }
+        },
         allowed_media_domains=[
             "www.bogotobogo.com",
             "github.com",
-        ])
+        ],
+    )
 
     video_sync, metadata_sync = connector.fetch_video(video_url)
     video_async, metadata_async = await connector.fetch_video_async(video_url)
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 05b7b84be7f34ff8161250c908212c4b88c592fd..6572616769a910a7ade0ed2543e08175dbe007fc 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -12,11 +12,12 @@ from PIL import Image
 from vllm.assets.base import get_vllm_public_assets
 from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
 from vllm.multimodal.image import ImageMediaIO
-from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
-                                   VideoMediaIO)
+from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader, VideoMediaIO
 
 from .utils import cosine_similarity, create_video_from_image, normalize_image
 
+pytestmark = pytest.mark.cpu_test
+
 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
 FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -24,7 +25,6 @@ FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
 
 @VIDEO_LOADER_REGISTRY.register("test_video_loader_1")
 class TestVideoLoader1(VideoLoader):
-
     @classmethod
     def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
         return FAKE_OUTPUT_1
@@ -32,7 +32,6 @@ class TestVideoLoader1(VideoLoader):
 
 @VIDEO_LOADER_REGISTRY.register("test_video_loader_2")
 class TestVideoLoader2(VideoLoader):
-
     @classmethod
     def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
         return FAKE_OUTPUT_2
@@ -55,13 +54,10 @@ def test_video_loader_type_doesnt_exist():
 
 @VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
 class Assert10Frames1FPSVideoLoader(VideoLoader):
-
     @classmethod
-    def load_bytes(cls,
-                   data: bytes,
-                   num_frames: int = -1,
-                   fps: float = -1.0,
-                   **kwargs) -> npt.NDArray:
+    def load_bytes(
+        cls, data: bytes, num_frames: int = -1, fps: float = -1.0, **kwargs
+    ) -> npt.NDArray:
         assert num_frames == 10, "bad num_frames"
         assert fps == 1.0, "bad fps"
         return FAKE_OUTPUT_2
@@ -77,11 +73,8 @@ def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
         _ = videoio.load_bytes(b"test")
 
         videoio = VideoMediaIO(
-            imageio, **{
-                "num_frames": 10,
-                "fps": 1.0,
-                "not_used": "not_used"
-            })
+            imageio, **{"num_frames": 10, "fps": 1.0, "not_used": "not_used"}
+        )
         _ = videoio.load_bytes(b"test")
 
         with pytest.raises(AssertionError, match="bad num_frames"):
@@ -104,8 +97,9 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
     Test all functions that use OpenCV for video I/O return RGB format.
     Both RGB and grayscale videos are tested.
     """
-    image_path = get_vllm_public_assets(filename="stop_sign.jpg",
-                                        s3_prefix="vision_model_images")
+    image_path = get_vllm_public_assets(
+        filename="stop_sign.jpg", s3_prefix="vision_model_images"
+    )
     image = Image.open(image_path)
     with tempfile.TemporaryDirectory() as tmpdir:
         if not is_color:
@@ -125,21 +119,24 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
 
         frames = video_to_ndarrays(video_path)
         for frame in frames:
-            sim = cosine_similarity(normalize_image(np.array(frame)),
-                                    normalize_image(np.array(image)))
+            sim = cosine_similarity(
+                normalize_image(np.array(frame)), normalize_image(np.array(image))
+            )
             assert np.sum(np.isnan(sim)) / sim.size < 0.001
             assert np.nanmean(sim) > 0.99
 
         pil_frames = video_to_pil_images_list(video_path)
         for frame in pil_frames:
-            sim = cosine_similarity(normalize_image(np.array(frame)),
-                                    normalize_image(np.array(image)))
+            sim = cosine_similarity(
+                normalize_image(np.array(frame)), normalize_image(np.array(image))
+            )
             assert np.sum(np.isnan(sim)) / sim.size < 0.001
             assert np.nanmean(sim) > 0.99
 
         io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
         for frame in io_frames:
-            sim = cosine_similarity(normalize_image(np.array(frame)),
-                                    normalize_image(np.array(image)))
+            sim = cosine_similarity(
+                normalize_image(np.array(frame)), normalize_image(np.array(image))
+            )
             assert np.sum(np.isnan(sim)) / sim.size < 0.001
             assert np.nanmean(sim) > 0.99
diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py
index 9a58292f9f4a5c4298e9760e868eeb5d799a68f3..485bde939f6905b08a991700c3783f7213c72977 100644
--- a/tests/multimodal/utils.py
+++ b/tests/multimodal/utils.py
@@ -8,7 +8,7 @@ from PIL import Image
 
 
 def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int):
-    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    w, h = rng.randint(min_wh, max_wh, size=(2,))
     arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
     return Image.fromarray(arr)
 
@@ -21,7 +21,7 @@ def random_video(
     max_wh: int,
 ):
     num_frames = rng.randint(min_frames, max_frames)
-    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    w, h = rng.randint(min_wh, max_wh, size=(2,))
     return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
 
 
@@ -66,14 +66,13 @@ def create_video_from_image(
     return video_path
 
 
-def cosine_similarity(A: npt.NDArray,
-                      B: npt.NDArray,
-                      axis: int = -1) -> npt.NDArray:
+def cosine_similarity(A: npt.NDArray, B: npt.NDArray, axis: int = -1) -> npt.NDArray:
     """Compute cosine similarity between two vectors."""
-    return (np.sum(A * B, axis=axis) /
-            (np.linalg.norm(A, axis=axis) * np.linalg.norm(B, axis=axis)))
+    return np.sum(A * B, axis=axis) / (
+        np.linalg.norm(A, axis=axis) * np.linalg.norm(B, axis=axis)
+    )
 
 
 def normalize_image(image: npt.NDArray) -> npt.NDArray:
     """Normalize image to [0, 1] range."""
-    return image.astype(np.float32) / 255.0
\ No newline at end of file
+    return image.astype(np.float32) / 255.0
diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
index 3e2c2577da66caba8ea81016228619bb340560b9..cd98efdd139098db4d87e314b116dde7d62161c8 100644
--- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
@@ -13,11 +13,10 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
 PA_NAME = "swapnilbp/llama_tweet_ptune"
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def adapter_cache(request, tmpdir_factory):
     # Create dir that mimics the structure of the adapter cache
-    adapter_cache = tmpdir_factory.mktemp(
-        request.module.__name__) / "adapter_cache"
+    adapter_cache = tmpdir_factory.mktemp(request.module.__name__) / "adapter_cache"
     return adapter_cache
 
 
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
index 42874f0398f0a547b58bd2bc082747688a89770b..5614f19d1a4f35c9a4a9198f1c1e8e1354a5f4cd 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -1,15 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import base64
 import datetime
 import os
 import tempfile
 import urllib.request
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import albumentations
 import numpy as np
@@ -20,14 +18,15 @@ from einops import rearrange
 from terratorch.datamodules import Sen1Floods11NonGeoDataModule
 
 from vllm.config import VllmConfig
-from vllm.entrypoints.openai.protocol import (IOProcessorRequest,
-                                              IOProcessorResponse)
+from vllm.entrypoints.openai.protocol import IOProcessorRequest, IOProcessorResponse
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.outputs import PoolingRequestOutput
-from vllm.plugins.io_processors.interface import (IOProcessor,
-                                                  IOProcessorInput,
-                                                  IOProcessorOutput)
+from vllm.plugins.io_processors.interface import (
+    IOProcessor,
+    IOProcessorInput,
+    IOProcessorOutput,
+)
 
 from .types import DataModuleConfig, ImagePrompt, ImageRequestOutput
 
@@ -42,35 +41,25 @@ DEFAULT_INPUT_INDICES = [0, 1, 2, 3, 4, 5]
 
 datamodule_config: DataModuleConfig = {
     "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
-    "batch_size":
-    16,
-    "constant_scale":
-    0.0001,
-    "data_root":
-    "/dccstor/geofm-finetuning/datasets/sen1floods11",
-    "drop_last":
-    True,
-    "no_data_replace":
-    0.0,
-    "no_label_replace":
-    -1,
-    "num_workers":
-    8,
+    "batch_size": 16,
+    "constant_scale": 0.0001,
+    "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
+    "drop_last": True,
+    "no_data_replace": 0.0,
+    "no_label_replace": -1,
+    "num_workers": 8,
     "test_transform": [
-        albumentations.Resize(always_apply=False,
-                              height=448,
-                              interpolation=1,
-                              p=1,
-                              width=448),
-        albumentations.pytorch.ToTensorV2(transpose_mask=False,
-                                          always_apply=True,
-                                          p=1.0),
+        albumentations.Resize(
+            always_apply=False, height=448, interpolation=1, p=1, width=448
+        ),
+        albumentations.pytorch.ToTensorV2(
+            transpose_mask=False, always_apply=True, p=1.0
+        ),
     ],
 }
 
 
-def save_geotiff(image: torch.Tensor, meta: dict,
-                 out_format: str) -> str | bytes:
+def save_geotiff(image: torch.Tensor, meta: dict, out_format: str) -> str | bytes:
     """Save multi-band image in Geotiff file.
 
     Args:
@@ -107,9 +96,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
 
 
 def read_geotiff(
-    file_path: Optional[str] = None,
-    path_type: Optional[str] = None,
-    file_data: Optional[bytes] = None,
+    file_path: str | None = None,
+    path_type: str | None = None,
+    file_data: bytes | None = None,
 ) -> tuple[torch.Tensor, dict, tuple[float, float] | None]:
     """Read all bands from *file_path* and return image + meta info.
 
@@ -123,8 +112,8 @@ def read_geotiff(
 
     if all([x is None for x in [file_path, path_type, file_data]]):
         raise Exception("All input fields to read_geotiff are None")
-    write_to_file: Optional[bytes] = None
-    path: Optional[str] = None
+    write_to_file: bytes | None = None
+    path: str | None = None
     if file_data is not None:
         # with tempfile.NamedTemporaryFile() as tmpfile:
         #     tmpfile.write(file_data)
@@ -169,11 +158,11 @@ def read_geotiff(
 
 
 def load_image(
-    data: Union[list[str]],
+    data: list[str],
     path_type: str,
-    mean: Optional[list[float]] = None,
-    std: Optional[list[float]] = None,
-    indices: Optional[Union[list[int], None]] = None,
+    mean: list[float] | None = None,
+    std: list[float] | None = None,
+    indices: list[int] | None | None = None,
 ):
     """Build an input example by loading images in *file_paths*.
 
@@ -219,8 +208,11 @@ def load_image(
                 if len(julian_day) == 3:
                     julian_day = int(julian_day)
                 else:
-                    julian_day = (datetime.datetime.strptime(
-                        julian_day, "%m%d").timetuple().tm_yday)
+                    julian_day = (
+                        datetime.datetime.strptime(julian_day, "%m%d")
+                        .timetuple()
+                        .tm_yday
+                    )
                 temporal_coords.append([year, julian_day])
         except Exception:
             logger.exception("Could not extract timestamp for %s", file)
@@ -233,11 +225,9 @@ def load_image(
 
 
 class PrithviMultimodalDataProcessor(IOProcessor):
-
     indices = [0, 1, 2, 3, 4, 5]
 
     def __init__(self, vllm_config: VllmConfig):
-
         super().__init__(vllm_config)
 
         self.datamodule = Sen1Floods11NonGeoDataModule(
@@ -264,8 +254,7 @@ class PrithviMultimodalDataProcessor(IOProcessor):
             return image_prompt
         if isinstance(request, IOProcessorRequest):
             if not hasattr(request, "data"):
-                raise ValueError(
-                    "missing 'data' field in OpenAIBaseModel Request")
+                raise ValueError("missing 'data' field in OpenAIBaseModel Request")
 
             request_data = request.data
 
@@ -277,7 +266,8 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         raise ValueError("Unable to parse request")
 
     def output_to_response(
-            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        self, plugin_output: IOProcessorOutput
+    ) -> IOProcessorResponse:
         return IOProcessorResponse(
             request_id=plugin_output.request_id,
             data=plugin_output,
@@ -286,10 +276,9 @@ class PrithviMultimodalDataProcessor(IOProcessor):
     def pre_process(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
-
+    ) -> PromptType | Sequence[PromptType]:
         image_data = dict(prompt)
 
         if request_id:
@@ -309,10 +298,8 @@ class PrithviMultimodalDataProcessor(IOProcessor):
             input_data = input_data / 10000  # Convert to range 0-1
 
         self.original_h, self.original_w = input_data.shape[-2:]
-        pad_h = (self.img_size -
-                 (self.original_h % self.img_size)) % self.img_size
-        pad_w = (self.img_size -
-                 (self.original_w % self.img_size)) % self.img_size
+        pad_h = (self.img_size - (self.original_h % self.img_size)) % self.img_size
+        pad_w = (self.img_size - (self.original_w % self.img_size)) % self.img_size
         input_data = np.pad(
             input_data,
             ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
@@ -320,9 +307,9 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         )
 
         batch = torch.tensor(input_data)
-        windows = batch.unfold(3, self.img_size,
-                               self.img_size).unfold(4, self.img_size,
-                                                     self.img_size)
+        windows = batch.unfold(3, self.img_size, self.img_size).unfold(
+            4, self.img_size, self.img_size
+        )
         self.h1, self.w1 = windows.shape[3:5]
         windows = rearrange(
             windows,
@@ -332,8 +319,11 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         )
 
         # Split into batches if number of windows > batch_size
-        num_batches = (windows.shape[0] // self.batch_size
-                       if windows.shape[0] > self.batch_size else 1)
+        num_batches = (
+            windows.shape[0] // self.batch_size
+            if windows.shape[0] > self.batch_size
+            else 1
+        )
         windows = torch.tensor_split(windows, num_batches, dim=0)
 
         if temporal_coords:
@@ -349,25 +339,27 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         for window in windows:
             # Apply standardization
             window = self.datamodule.test_transform(
-                image=window.squeeze().numpy().transpose(1, 2, 0))
+                image=window.squeeze().numpy().transpose(1, 2, 0)
+            )
             window = self.datamodule.aug(window)["image"]
-            prompts.append({
-                "prompt_token_ids": [1],
-                "multi_modal_data": {
-                    "pixel_values": window.to(torch.float16)[0],
-                    "location_coords": location_coords.to(torch.float16),
-                },
-            })
+            prompts.append(
+                {
+                    "prompt_token_ids": [1],
+                    "multi_modal_data": {
+                        "pixel_values": window.to(torch.float16)[0],
+                        "location_coords": location_coords.to(torch.float16),
+                    },
+                }
+            )
 
         return prompts
 
     def post_process(
         self,
         model_output: Sequence[PoolingRequestOutput],
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
     ) -> IOProcessorOutput:
-
         pred_imgs_list = []
 
         if request_id and (request_id in self.requests_cache):
@@ -376,9 +368,9 @@ class PrithviMultimodalDataProcessor(IOProcessor):
             out_format = "b64_json"
 
         for output in model_output:
-            y_hat = output.outputs.data.argmax(dim=1)
+            y_hat = output.outputs.data.argmax(dim=0)
             pred = torch.nn.functional.interpolate(
-                y_hat.unsqueeze(1).float(),
+                y_hat[None, None, ...].float(),
                 size=self.img_size,
                 mode="nearest",
             )
@@ -399,7 +391,7 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         )
 
         # Cut padded area back to original size
-        pred_imgs = pred_imgs[..., :self.original_h, :self.original_w]
+        pred_imgs = pred_imgs[..., : self.original_h, : self.original_w]
 
         # Squeeze (batch size 1)
         pred_imgs = pred_imgs[0]
@@ -407,10 +399,10 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         if not self.meta_data:
             raise ValueError("No metadata available for the current task")
         self.meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
-        out_data = save_geotiff(_convert_np_uint8(pred_imgs), self.meta_data,
-                                out_format)
+        out_data = save_geotiff(
+            _convert_np_uint8(pred_imgs), self.meta_data, out_format
+        )
 
-        return ImageRequestOutput(type=out_format,
-                                  format="tiff",
-                                  data=out_data,
-                                  request_id=request_id)
+        return ImageRequestOutput(
+            type=out_format, format="tiff", data=out_data, request_id=request_id
+        )
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
index d4c6628211fb2234c28fa2659bf41d1fecabb12d..d1d7873211f251b66f826206ad4488e1145bc7af 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Any, Literal, TypedDict
 
 import albumentations
 from pydantic import BaseModel
@@ -16,12 +16,10 @@ class DataModuleConfig(TypedDict):
     no_data_replace: float
     no_label_replace: int
     num_workers: int
-    test_transform: list[
-        albumentations.core.transforms_interface.BasicTransform]
+    test_transform: list[albumentations.core.transforms_interface.BasicTransform]
 
 
 class ImagePrompt(BaseModel):
-
     data_format: Literal["b64_json", "bytes", "url", "path"]
     """
     This is the data type for the input image
@@ -40,12 +38,12 @@ class ImagePrompt(BaseModel):
     """
 
 
-MultiModalPromptType = Union[ImagePrompt]
+MultiModalPromptType = ImagePrompt
 
 
 class ImageRequestOutput(BaseModel):
     """
-    The output data of an image request to vLLM. 
+    The output data of an image request to vLLM.
 
     Args:
         type (str): The data content type [path, object]
@@ -56,4 +54,4 @@ class ImageRequestOutput(BaseModel):
     type: Literal["path", "b64_json"]
     format: str
     data: str
-    request_id: Optional[str] = None
+    request_id: str | None = None
diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py
index 6307bb63897ac475ad9470594bf375523e0727a7..eeffac5d3eddde2b10432e325c3e0ad8025ff507 100644
--- a/tests/plugins/vllm_add_dummy_model/setup.py
+++ b/tests/plugins/vllm_add_dummy_model/setup.py
@@ -3,10 +3,11 @@
 
 from setuptools import setup
 
-setup(name='vllm_add_dummy_model',
-      version='0.1',
-      packages=['vllm_add_dummy_model'],
-      entry_points={
-          'vllm.general_plugins':
-          ["register_dummy_model = vllm_add_dummy_model:register"]
-      })
+setup(
+    name="vllm_add_dummy_model",
+    version="0.1",
+    packages=["vllm_add_dummy_model"],
+    entry_points={
+        "vllm.general_plugins": ["register_dummy_model = vllm_add_dummy_model:register"]
+    },
+)
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
index b2085b01c45c1c335b5f2da957b8b1c6edb49ae0..457187e4b492edef215b8952e9303ddaf73649eb 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
@@ -19,5 +19,4 @@ def register():
         )
 
     if "MyLlava" not in ModelRegistry.get_supported_archs():
-        ModelRegistry.register_model("MyLlava",
-                                     "vllm_add_dummy_model.my_llava:MyLlava")
+        ModelRegistry.register_model("MyLlava", "vllm_add_dummy_model.my_llava:MyLlava")
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index fc654f20fff22a9f058c434c810b07a446972bdc..98245cdf0c984f81be8b9558e1b6859a9b1fa372 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -15,7 +14,6 @@ from vllm.sequence import IntermediateTensors
 
 
 class MyGemma2Embedding(nn.Module):
-
     is_pooling_model = True
 
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
@@ -23,27 +21,31 @@ class MyGemma2Embedding(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
-        self.model = Gemma2Model(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.model = Gemma2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode": Pooler.for_encode(pooler_config),
-            "embed": Pooler.for_embed(pooler_config),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_embed": Pooler.for_token_embed(pooler_config),
+                "embed": Pooler.for_embed(pooler_config),
+            }
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids,
             positions,
@@ -58,8 +60,8 @@ class MyGemma2Embedding(nn.Module):
         return torch.zeros_like(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         weights = self.hf_to_vllm_mapper.apply(weights)
-        weights = ((name, data) for name, data in weights
-                   if not name.startswith("lm_head."))
+        weights = (
+            (name, data) for name, data in weights if not name.startswith("lm_head.")
+        )
         return self.model.load_weights(weights)
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index b431ad1ed0928aa0a3ce547f2e03ba75390a6a2a..79af3ad842f5bb161d3788ee0cfed702e572cd86 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -1,24 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
-from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
-                                              LlavaForConditionalGeneration,
-                                              LlavaMultiModalProcessor,
-                                              LlavaProcessingInfo)
+from vllm.model_executor.models.llava import (
+    LlavaDummyInputsBuilder,
+    LlavaForConditionalGeneration,
+    LlavaMultiModalProcessor,
+    LlavaProcessingInfo,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor,
-                                        info=LlavaProcessingInfo,
-                                        dummy_inputs=LlavaDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaMultiModalProcessor,
+    info=LlavaProcessingInfo,
+    dummy_inputs=LlavaDummyInputsBuilder,
+)
 class MyLlava(LlavaForConditionalGeneration):
-
-    def compute_logits(self,
-                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         # this dummy model always predicts the first token
         logits = super().compute_logits(hidden_states)
         if logits is not None:
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
index a6fafff98e9c5ccbd6cf927e4bd1e8242a751276..f1e6e7b10f8b633406e7fd2d48b5159da1bf1489 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -9,9 +8,7 @@ from vllm.model_executor.models.opt import OPTForCausalLM
 
 
 class MyOPTForCausalLM(OPTForCausalLM):
-
-    def compute_logits(self,
-                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         # this dummy model always predicts the first token
         logits = super().compute_logits(hidden_states)
         if logits is not None:
diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py
index a531826628cda1e7043aa8c6a80e43719d088f5b..b976dddb7fb5ddecafebba6bcfa7e14e822ca8b3 100644
--- a/tests/plugins/vllm_add_dummy_platform/setup.py
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -4,13 +4,15 @@
 from setuptools import setup
 
 setup(
-    name='vllm_add_dummy_platform',
-    version='0.1',
-    packages=['vllm_add_dummy_platform'],
+    name="vllm_add_dummy_platform",
+    version="0.1",
+    packages=["vllm_add_dummy_platform"],
     entry_points={
-        'vllm.platform_plugins': [
+        "vllm.platform_plugins": [
             "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"  # noqa
         ],
-        "vllm.general_plugins":
-        ["dummy_custom_ops = vllm_add_dummy_platform:register_ops"],
-    })
+        "vllm.general_plugins": [
+            "dummy_custom_ops = vllm_add_dummy_platform:register_ops"
+        ],
+    },
+)
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
index c4fe6ed197f6bb2c0a13a8e8954f2d44532ce7c2..280b68514e19320906965f0c374e730e255a11f7 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
-
-def dummy_platform_plugin() -> Optional[str]:
+def dummy_platform_plugin() -> str | None:
     return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
 
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
index e38fb2fbf934ef54ddc3993ab0f8cdee1c5f000b..f2d516f52b8b36673c731f801871926057f27df5 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
@@ -1,12 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.attention.backends.placeholder_attn import (
-    PlaceholderAttentionBackend)
+from vllm.attention.backends.placeholder_attn import PlaceholderAttentionBackend
 
 
 class DummyAttentionBackend(PlaceholderAttentionBackend):
-
     @staticmethod
     def get_name() -> str:
         return "Dummy_Backend"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
index 1fcc3fc66617394d86b8ab8884b8090c50c989f0..b7302857452698d5abf422dba5591ba9afff27a1 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
@@ -15,6 +15,5 @@ class DummyRotaryEmbedding(RotaryEmbedding):
         super().__init__(*args, **kwargs)
         self.addition_config = True
 
-    def forward_oot(self, *args,
-                    **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward_oot(self, *args, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
         return super().forward_oot(*args, **kwargs)
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index 30d721304b5ca59413e7d3969fc4bf084f93e2dd..a80617a366cab17beaec1d547c741d095696e484 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None
-from vllm import envs
 
 
 class DummyPlatform(Platform):
@@ -19,12 +18,17 @@ class DummyPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if envs.VLLM_USE_V1:
-            compilation_config = vllm_config.compilation_config
-            # Activate custom ops for v1.
-            compilation_config.custom_ops = ["all"]
+        vllm_config.compilation_config.custom_ops = ["all"]
 
-    def get_attn_backend_cls(self, backend_name, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1, use_mla,
-                             has_sink, use_sparse):
+    def get_attn_backend_cls(
+        self,
+        backend_name,
+        head_size,
+        dtype,
+        kv_cache_dtype,
+        block_size,
+        use_mla,
+        has_sink,
+        use_sparse,
+    ):
         return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
diff --git a/tests/plugins/vllm_add_dummy_stat_logger/dummy_stat_logger/dummy_stat_logger.py b/tests/plugins/vllm_add_dummy_stat_logger/dummy_stat_logger/dummy_stat_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ec35c0d5c97bf0d49bbc31cb897e1bd150c8d0
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_stat_logger/dummy_stat_logger/dummy_stat_logger.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.v1.metrics.loggers import StatLoggerBase
+
+
+class DummyStatLogger(StatLoggerBase):
+    """
+    A dummy stat logger for testing purposes.
+    Implements the minimal interface expected by StatLoggerManager.
+    """
+
+    def __init__(self, vllm_config, engine_idx=0):
+        self.vllm_config = vllm_config
+        self.engine_idx = engine_idx
+        self.recorded = []
+        self.logged = False
+        self.engine_initialized = False
+
+    def record(self, scheduler_stats, iteration_stats, mm_cache_stats, engine_idx):
+        self.recorded.append(
+            (scheduler_stats, iteration_stats, mm_cache_stats, engine_idx)
+        )
+
+    def log(self):
+        self.logged = True
+
+    def log_engine_initialized(self):
+        self.engine_initialized = True
diff --git a/tests/plugins/vllm_add_dummy_stat_logger/setup.py b/tests/plugins/vllm_add_dummy_stat_logger/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..517017724bcc04aebbb31491a3901fbf030bcd08
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_stat_logger/setup.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from setuptools import setup
+
+setup(
+    name="dummy_stat_logger",
+    version="0.1",
+    packages=["dummy_stat_logger"],
+    entry_points={
+        "vllm.stat_logger_plugins": [
+            "dummy_stat_logger = dummy_stat_logger.dummy_stat_logger:DummyStatLogger"  # noqa
+        ]
+    },
+)
diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py
index 3567a701a3afa1e78807598dcc5929de591a3b2d..582cf9a0711b1274d8ac0bf64f3d0eaeb1d4d658 100644
--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -9,7 +9,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.config import VllmConfig
 from vllm.entrypoints.openai.protocol import IOProcessorResponse
 from vllm.plugins.io_processors import get_io_processor
-from vllm.pooling_params import PoolingParams
 
 MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 
@@ -38,6 +37,7 @@ def server():
         "prithvi_to_tiff",
         "--model-impl",
         "terratorch",
+        "--enable-mm-embeds",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -50,7 +50,6 @@ async def test_prithvi_mae_plugin_online(
     server: RemoteOpenAIServer,
     model_name: str,
 ):
-
     request_payload_url = {
         "data": {
             "data": image_url,
@@ -60,7 +59,7 @@ async def test_prithvi_mae_plugin_online(
         },
         "priority": 0,
         "model": model_name,
-        "softmax": False
+        "softmax": False,
     }
 
     ret = requests.post(
@@ -77,8 +76,8 @@ async def test_prithvi_mae_plugin_online(
     plugin_data = parsed_response.data
 
     assert all(
-        plugin_data.get(attr)
-        for attr in ["type", "format", "data", "request_id"])
+        plugin_data.get(attr) for attr in ["type", "format", "data", "request_id"]
+    )
 
     # We just check that the output is a valid base64 string.
     # Raises an exception and fails the test if the string is corrupted.
@@ -87,7 +86,6 @@ async def test_prithvi_mae_plugin_online(
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
-
     img_prompt = dict(
         data=image_url,
         data_format="url",
@@ -95,30 +93,26 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
         out_data_format="b64_json",
     )
 
-    pooling_params = PoolingParams(task="encode", softmax=False)
-
     with vllm_runner(
-            model_name,
-            runner="pooling",
-            skip_tokenizer_init=True,
-            trust_remote_code=True,
-            enforce_eager=True,
-            # Limit the maximum number of parallel requests
-            # to avoid the model going OOM in CI.
-            max_num_seqs=1,
-            model_impl="terratorch",
-            io_processor_plugin="prithvi_to_tiff",
+        model_name,
+        runner="pooling",
+        skip_tokenizer_init=True,
+        enable_mm_embeds=True,
+        trust_remote_code=True,
+        enforce_eager=True,
+        # Limit the maximum number of parallel requests
+        # to avoid the model going OOM in CI.
+        max_num_seqs=1,
+        model_impl="terratorch",
+        io_processor_plugin="prithvi_to_tiff",
     ) as llm_runner:
-        pooler_output = llm_runner.get_llm().encode(
-            img_prompt,
-            pooling_params=pooling_params,
-        )
+        pooler_output = llm_runner.get_llm().encode(img_prompt, pooling_task="plugin")
     output = pooler_output[0].outputs
 
     # verify the output is formatted as expected for this plugin
     assert all(
-        hasattr(output, attr)
-        for attr in ["type", "format", "data", "request_id"])
+        hasattr(output, attr) for attr in ["type", "format", "data", "request_id"]
+    )
 
     # We just check that the output is a valid base64 string.
     # Raises an exception and fails the test if the string is corrupted.
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 1d7e4475011d09afddf8d0ae2848c9bf3319f562..4dace171a8d3bc52eb3f67a8a4adb9e6a410cd1c 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -10,29 +10,38 @@ from vllm.plugins import load_general_plugins
 def test_platform_plugins():
     # simulate workload by running an example
     import runpy
+
     current_file = __file__
     import os
+
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference/basic/basic.py")
+        "examples",
+        "offline_inference/basic/basic.py",
+    )
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly
     from vllm.platforms import _init_trace, current_platform
+
     assert current_platform.device_name == "DummyDevice", (
         f"Expected DummyDevice, got {current_platform.device_name}, "
         "possibly because current_platform is imported before the plugin"
-        f" is loaded. The first import:\n{_init_trace}")
+        f" is loaded. The first import:\n{_init_trace}"
+    )
 
 
 def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
     # simulate workload by running an example
     load_general_plugins()
     from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+
     layer = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)
     assert layer.__class__.__name__ == "DummyRotaryEmbedding", (
         f"Expected DummyRotaryEmbedding, got {layer.__class__.__name__}, "
-        "possibly because the custom op is not registered correctly.")
+        "possibly because the custom op is not registered correctly."
+    )
     assert hasattr(layer, "addition_config"), (
         "Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
-        "which is set by the custom op.")
+        "which is set by the custom op."
+    )
diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py
index 099869a82ad21a454974837e0b7a5fc5e801114f..45902cc874c30b293188c20f56fd771f038b03b5 100644
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -10,20 +10,17 @@ from vllm.v1.engine.llm_engine import LLMEngine
 
 
 class DummyV1Scheduler(Scheduler):
-
     def schedule(self):
         raise Exception("Exception raised by DummyV1Scheduler")
 
 
 def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         # Explicitly turn off engine multiprocessing so
         # that the scheduler runs in this process
         m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
         with pytest.raises(Exception) as exception_info:
-
             engine_args = EngineArgs(
                 model="facebook/opt-125m",
                 enforce_eager=True,  # reduce test time
@@ -36,5 +33,4 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
             engine.add_request("0", "foo", sampling_params)
             engine.step()
 
-        assert str(
-            exception_info.value) == "Exception raised by DummyV1Scheduler"
+        assert str(exception_info.value) == "Exception raised by DummyV1Scheduler"
diff --git a/tests/plugins_tests/test_stats_logger_plugins.py b/tests/plugins_tests/test_stats_logger_plugins.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb03b1fde4179a28268a99fe3714a5baee301239
--- /dev/null
+++ b/tests/plugins_tests/test_stats_logger_plugins.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from dummy_stat_logger.dummy_stat_logger import DummyStatLogger
+
+from vllm.config import VllmConfig
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.metrics.loggers import load_stat_logger_plugin_factories
+
+
+def test_stat_logger_plugin_is_discovered(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "dummy_stat_logger")
+
+        factories = load_stat_logger_plugin_factories()
+        assert len(factories) == 1, f"Expected 1 factory, got {len(factories)}"
+        assert factories[0] is DummyStatLogger, (
+            f"Expected DummyStatLogger class, got {factories[0]}"
+        )
+
+        # instantiate and confirm the right type
+        vllm_config = VllmConfig()
+        instance = factories[0](vllm_config)
+        assert isinstance(instance, DummyStatLogger)
+
+
+def test_no_plugins_loaded_if_env_empty(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "")
+
+        factories = load_stat_logger_plugin_factories()
+        assert factories == []
+
+
+def test_invalid_stat_logger_plugin_raises(monkeypatch: pytest.MonkeyPatch):
+    def fake_plugin_loader(group: str):
+        assert group == "vllm.stat_logger_plugins"
+        return {"bad": object()}
+
+    with monkeypatch.context() as m:
+        m.setattr(
+            "vllm.v1.metrics.loggers.load_plugins_by_group",
+            fake_plugin_loader,
+        )
+        with pytest.raises(
+            TypeError,
+            match="Stat logger plugin 'bad' must be a subclass of StatLoggerBase",
+        ):
+            load_stat_logger_plugin_factories()
+
+
+@pytest.mark.asyncio
+async def test_stat_logger_plugin_integration_with_engine(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "dummy_stat_logger")
+
+        engine_args = AsyncEngineArgs(
+            model="facebook/opt-125m",
+            enforce_eager=True,  # reduce test time
+            disable_log_stats=True,  # disable default loggers
+        )
+
+        engine = AsyncLLM.from_engine_args(engine_args=engine_args)
+
+        assert len(engine.logger_manager.stat_loggers) == 2
+        assert len(engine.logger_manager.stat_loggers[0].per_engine_stat_loggers) == 1
+        assert isinstance(
+            engine.logger_manager.stat_loggers[0].per_engine_stat_loggers[0],
+            DummyStatLogger,
+        )
+
+        engine.shutdown()
diff --git a/tests/quantization/fp_quant.py b/tests/quantization/fp_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..664ce9d111e4e0b00bbe5edfad82083f31245d8a
--- /dev/null
+++ b/tests/quantization/fp_quant.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test model set-up and inference for quantized HF models supported
+on the GPU backend using FPQuant.
+
+Validating the configuration and printing results for manual checking.
+
+Run `pytest tests/quantization/test_fp_quant.py`.
+"""
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+
+MODELS = [
+    "ISTA-DASLab/Qwen3-0.6B-RTN-NVFP4",
+    "ISTA-DASLab/Qwen3-0.6B-RTN-MXFP4",
+]
+DTYPE = ["bfloat16"]
+EAGER = [True, False]
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp_quant"),
+    reason="FPQuant is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("eager", EAGER)
+def test_fpquant(vllm_runner, model, eager):
+    with vllm_runner(model, enforce_eager=eager) as llm:
+        output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
+    assert output[0][1] == "1 2 3 4 5 6"
diff --git a/tests/quantization/reference_mxfp4.py b/tests/quantization/reference_mxfp4.py
index 2ef251933f68196567cc023cfdd5b9b2ee1a9e70..d84659ed035eb3bb73ae4630df0e196fc988637b 100644
--- a/tests/quantization/reference_mxfp4.py
+++ b/tests/quantization/reference_mxfp4.py
@@ -14,14 +14,15 @@ FLOAT8_E8M0_MAX_EXP = 127
 FLOAT4_EXP_BIAS = 1
 FLOAT4_MANTISSA_BITS = 1
 
-FLOAT16_VAL_TO_ADD = (1 << (FLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1))
-FLOAT16_SIGN_EXPONENT_MASK = ((
-    (1 << (FLOAT16_EXP_BITS + 1)) - 1) << FLOAT16_MANTISSA_BITS)
+FLOAT16_VAL_TO_ADD = 1 << (FLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1)
+FLOAT16_SIGN_EXPONENT_MASK = (
+    (1 << (FLOAT16_EXP_BITS + 1)) - 1
+) << FLOAT16_MANTISSA_BITS
 
-BFLOAT16_VAL_TO_ADD = (1 <<
-                       (BFLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1))
-BFLOAT16_SIGN_EXPONENT_MASK = ((
-    (1 << (BFLOAT16_EXP_BITS + 1)) - 1) << BFLOAT16_MANTISSA_BITS)
+BFLOAT16_VAL_TO_ADD = 1 << (BFLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1)
+BFLOAT16_SIGN_EXPONENT_MASK = (
+    (1 << (BFLOAT16_EXP_BITS + 1)) - 1
+) << BFLOAT16_MANTISSA_BITS
 
 
 def e8m0_to_half(scale, half_dtype: torch.dtype):
@@ -30,19 +31,19 @@ def e8m0_to_half(scale, half_dtype: torch.dtype):
     scale_exp = scale.to(torch.int16) - 127
 
     # This can be implemented with bitwise operations in a proper kernel.
-    scale_half = 2.0**(scale_exp.to(torch.float))
+    scale_half = 2.0 ** (scale_exp.to(torch.float))
 
     return scale_half.to(half_dtype)
 
 
-def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
-                               half_exp_bias: int, half_mantissa_bits: int):
+def upcast_fp4_to_fp16_or_bf16(
+    val, float_dtype: torch.dtype, half_exp_bias: int, half_mantissa_bits: int
+):
     assert val.dtype == torch.uint8
 
-    unpacked = torch.zeros(*val.shape[:-1],
-                           val.shape[-1] * 2,
-                           dtype=torch.uint8,
-                           device=val.device)
+    unpacked = torch.zeros(
+        *val.shape[:-1], val.shape[-1] * 2, dtype=torch.uint8, device=val.device
+    )
     unpacked[..., 1::2] = (val >> 4) & 0x0F  # Extract high 4 bits.
     unpacked[..., ::2] = val & 0x0F  # Extract low 4 bits.
 
@@ -72,8 +73,11 @@ def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
     new_exp = new_exp.to(torch.int32)
     sign = sign.to(torch.int32)
 
-    qdq_val = (sign << 15) + (new_exp << half_mantissa_bits) + (
-        new_mantissa << (half_mantissa_bits - 1))
+    qdq_val = (
+        (sign << 15)
+        + (new_exp << half_mantissa_bits)
+        + (new_mantissa << (half_mantissa_bits - 1))
+    )
 
     assert qdq_val.max() <= 65535
     assert qdq_val.min() >= 0
@@ -84,8 +88,9 @@ def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
     return result
 
 
-def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
-                   float_dtype: torch.dtype) -> torch.Tensor:
+def dq_mxfp4_torch(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
     assert x.dtype == torch.uint8
     assert scale.dtype == torch.uint8
 
@@ -98,10 +103,12 @@ def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
 
     scale_half = e8m0_to_half(scale, half_dtype=float_dtype)
 
-    x_half = upcast_fp4_to_fp16_or_bf16(x,
-                                        float_dtype=float_dtype,
-                                        half_exp_bias=half_exp_bias,
-                                        half_mantissa_bits=half_mantissa_bits)
+    x_half = upcast_fp4_to_fp16_or_bf16(
+        x,
+        float_dtype=float_dtype,
+        half_exp_bias=half_exp_bias,
+        half_mantissa_bits=half_mantissa_bits,
+    )
 
     x_half = x_half.reshape(*x_half.shape[:-1], -1, 32)
     x_half = x_half * scale_half[..., None]
@@ -110,8 +117,9 @@ def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
     return x_half
 
 
-def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
-                         half_exp_bias: int):
+def fp16_to_fp4_simulate(
+    val, half_mantissa_bits: int, half_exp_bits: int, half_exp_bias: int
+):
     # Casts an fp16/bf16 input to the restricted values of float4_e2m1,
     # that is to say [0., 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, -0.0,
     # -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0].
@@ -119,7 +127,7 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
     float_type = val.dtype
 
     # "rshift_cuda" not implemented for 'UInt16'
-    val_view = val.view(torch.int16)  #.to(torch.int32)
+    val_view = val.view(torch.int16)  # .to(torch.int32)
 
     exp = val_view >> half_mantissa_bits
     exp = exp & ((1 << half_exp_bits) - 1)
@@ -147,23 +155,15 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
 
     tail = mantissa_plus_one & ((1 << tail_bits) - 1)
 
-    round_close = (tail < half)  # round towards 0
-    round_away = (tail > half)  # round away from 0
+    round_close = tail < half  # round towards 0
+    round_away = tail > half  # round away from 0
     tie = tail == half
 
-    new_mantissa_close = torch.zeros(val.shape,
-                                     device=val.device,
-                                     dtype=torch.bool)
-    new_exp_close = torch.zeros(val.shape,
-                                device=val.device,
-                                dtype=torch.uint16)
+    new_mantissa_close = torch.zeros(val.shape, device=val.device, dtype=torch.bool)
+    new_exp_close = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
 
-    new_mantissa_away = torch.zeros(val.shape,
-                                    device=val.device,
-                                    dtype=torch.bool)
-    new_exp_away = torch.zeros(val.shape,
-                               device=val.device,
-                               dtype=torch.uint16)
+    new_mantissa_away = torch.zeros(val.shape, device=val.device, dtype=torch.bool)
+    new_exp_away = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
 
     new_exp_tie = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
 
@@ -202,27 +202,29 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
     new_exp_tie = (exp > (half_exp_bias - 2)) * (exp + (mantissa_last == 1))
 
     # Gather round up, round down and tie.
-    new_exp = round_away * new_exp_away \
-        + round_close * new_exp_close \
-        + tie * new_exp_tie
+    new_exp = (
+        round_away * new_exp_away + round_close * new_exp_close + tie * new_exp_tie
+    )
 
-    new_mantissa = round_away * new_mantissa_away \
-        + round_close * new_mantissa_close
+    new_mantissa = round_away * new_mantissa_away + round_close * new_mantissa_close
 
     # if new_exp > 3:
     #     new_mantissa = 1
-    new_mantissa = new_mantissa + (new_exp >
-                                   (2 + half_exp_bias)) * (new_mantissa == 0)
+    new_mantissa = new_mantissa + (new_exp > (2 + half_exp_bias)) * (new_mantissa == 0)
 
     # Clamp the exponent to acceptable values.
     new_exp = (new_exp >= (half_exp_bias - 2)) * torch.clamp(
-        new_exp, half_exp_bias - 2, half_exp_bias + 2)
+        new_exp, half_exp_bias - 2, half_exp_bias + 2
+    )
 
     sign = sign.to(torch.int32)
     new_mantissa = new_mantissa.to(torch.int32)
 
-    qdq_val = (sign << 15) + (new_exp << half_mantissa_bits) + (
-        new_mantissa << (half_mantissa_bits - 1))
+    qdq_val = (
+        (sign << 15)
+        + (new_exp << half_mantissa_bits)
+        + (new_mantissa << (half_mantissa_bits - 1))
+    )
 
     assert qdq_val.max() <= 65535
     assert qdq_val.min() >= 0
@@ -233,8 +235,9 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
     return result
 
 
-def qdq_mxfp4_torch(x: torch.Tensor,
-                    scale_calculation_mode: str = "even") -> torch.Tensor:
+def qdq_mxfp4_torch(
+    x: torch.Tensor, scale_calculation_mode: str = "even"
+) -> torch.Tensor:
     half_dtype = x.dtype
 
     if half_dtype == torch.float16:
@@ -258,8 +261,7 @@ def qdq_mxfp4_torch(x: torch.Tensor,
 
     block_max = block_max.view(torch.uint16).to(torch.int32)
 
-    block_max_uint = torch.bitwise_and(block_max + val_to_add,
-                                       sign_exponent_mask)
+    block_max_uint = torch.bitwise_and(block_max + val_to_add, sign_exponent_mask)
 
     assert block_max_uint.max() <= 65535
     assert block_max_uint.min() >= 0
@@ -268,20 +270,23 @@ def qdq_mxfp4_torch(x: torch.Tensor,
 
     block_max = block_max_uint.view(half_dtype)
 
-    scale_exp = FLOAT8_E8M0_MAX_EXP + torch.floor(torch.log2(block_max)).to(
-        torch.int32) - 2
+    scale_exp = (
+        FLOAT8_E8M0_MAX_EXP + torch.floor(torch.log2(block_max)).to(torch.int32) - 2
+    )
 
     scale_exp = torch.clamp(scale_exp, 0, 2 * FLOAT8_E8M0_MAX_EXP)
 
-    scale = 2.0**(scale_exp - FLOAT8_E8M0_MAX_EXP)
+    scale = 2.0 ** (scale_exp - FLOAT8_E8M0_MAX_EXP)
     scale = scale.to(half_dtype)
 
     x = x / scale[..., None]
 
-    x_fp4 = fp16_to_fp4_simulate(x,
-                                 half_exp_bits=half_exp_bits,
-                                 half_mantissa_bits=half_mantissa_bits,
-                                 half_exp_bias=half_exp_bias)
+    x_fp4 = fp16_to_fp4_simulate(
+        x,
+        half_exp_bits=half_exp_bits,
+        half_mantissa_bits=half_mantissa_bits,
+        half_exp_bias=half_exp_bias,
+    )
 
     x_fp4 = x_fp4 * scale[..., None]
     return x_fp4.reshape(*x_fp4.shape[:-2], -1)
diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py
index 1c41d904b8168f496f72058b5060f7c660da40f1..9f5db821950122d86649032fd2354f94c0327df9 100644
--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and inference for quantized HF models supported
- on the AutoRound.
+on the AutoRound.
 
- Validating the configuration and printing results for manual checking.
+Validating the configuration and printing results for manual checking.
 
- Run `pytest tests/quantization/test_auto_round.py`.
+Run `pytest tests/quantization/test_auto_round.py`.
 """
 
 import pytest
@@ -14,18 +14,19 @@ from vllm.platforms import current_platform
 
 MODELS = [
     "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",  ##auto_round:auto_gptq
-    "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound"  ##auto_round:auto_awq
+    "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound",  ##auto_round:auto_awq
 ]
 
 
-@pytest.mark.skipif(not current_platform.is_cpu()
-                    and not current_platform.is_xpu()
-                    and not current_platform.is_cuda(),
-                    reason="only supports CPU/XPU/CUDA backend.")
+@pytest.mark.skipif(
+    not current_platform.is_cpu()
+    and not current_platform.is_xpu()
+    and not current_platform.is_cuda(),
+    reason="only supports CPU/XPU/CUDA backend.",
+)
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=8)
+    with vllm_runner(model, enforce_eager=True) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
     assert output
     print(f"{output[0][1]}")
diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dd4551ff4b9619b241180ce73e4c90767b08c8d
--- /dev/null
+++ b/tests/quantization/test_blackwell_moe.py
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+import os
+from typing import Any
+
+import pytest
+
+from tests.utils import RemoteOpenAIServer
+from vllm.platforms import current_platform
+
+if not current_platform.is_device_capability(100):
+    pytest.skip(
+        "This test only runs on Blackwell GPUs (SM100).", allow_module_level=True
+    )
+
+
+@pytest.fixture(scope="module", autouse=True)
+def set_test_environment():
+    """Sets environment variables required for this test module."""
+    # Make sure TRTLLM attention is available
+    os.environ["VLLM_HAS_FLASHINFER_CUBIN"] = "1"
+    # Set compilation threads to 16 to speed up startup
+    os.environ["FLASHINFER_NVCC_THREADS"] = "16"
+
+
+# Overide the backbone layers to 4 for faster startup
+HF_OVERRIDE_TEXT = {
+    "num_layers": 4,
+    "num_hidden_layers": 4,
+}
+HF_OVERRIDE_MM = {
+    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
+}
+
+
+def can_initialize(
+    model: str,
+    hf_overrides: dict[str, Any] | None = None,
+    extra_args: list[str] | None = None,
+):
+    # Server arguments
+    extra_args = extra_args if extra_args is not None else []
+    server_args = [
+        "--max-model-len",
+        "2048",
+        "--max-num-batched-tokens",
+        "256",
+        "--load-format",
+        "dummy",
+        "--trust-remote-code",
+        "--limit-mm-per-prompt",
+        json.dumps({"image": 0}),
+        *extra_args,
+    ]
+
+    # Launch server and make a simple request
+    with RemoteOpenAIServer(
+        model,
+        server_args,
+        max_wait_seconds=1500,  # Due to FlashInfer compile
+        override_hf_configs=hf_overrides,
+    ) as server:
+        client = server.get_client()
+        # Make a simple request to verify the server works
+        completion = client.completions.create(
+            model=model,
+            prompt=["Hello, World!"],
+            temperature=0,
+            max_tokens=2,
+        )
+        print(completion)
+        assert completion.choices[0].text is not None
+
+
+## Llama4 ##
+
+
+@pytest.mark.skip(
+    reason=(
+        "RuntimeError: run_moe() Expected a value of type "
+        "'Optional[List[Tensor]]' for argument '_9' but instead found type "
+        "'list'."
+    )
+)
+def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
+
+
+def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
+
+
+def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
+
+
+def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
+
+
+## DeepSeekV3 ##
+
+
+def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+@pytest.mark.skip(
+    reason=(
+        "Known issue: lack of kernel support. "
+        "Expected failure: assert self.block_quant is None"
+    )
+)
+def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+## GPT-OSS ##
+
+
+def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
+
+
+def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):
+    can_initialize(
+        "openai/gpt-oss-20b",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--enforce-eager"],
+    )
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index c0ab3fbb10622aca5c701a2dfa66b0011de59bbe..e7d902ed26aaa677c8fdfe573c88e72fc6cdb79a 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,23 +5,31 @@
 Run `pytest tests/quantization/test_compressed_tensors.py`.
 """
 
-from typing import Optional
-
 import pytest
 import torch
 from compressed_tensors.quantization import QuantizationType
 
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensors24, CompressedTensorsLinearMethod,
-    CompressedTensorsW4A4Fp4, CompressedTensorsW4A8Fp8,
-    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    CompressedTensors24,
+    CompressedTensorsLinearMethod,
+    CompressedTensorsW4A4Fp4,
+    CompressedTensorsW4A8Fp8,
+    CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16,
+)
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported)
+    cutlass_fp4_supported,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    sparse_cutlass_supported)
+    sparse_cutlass_supported,
+)
 from vllm.platforms import current_platform
 
 # AITER only supports per-channel-per-channel INT8 gemm
@@ -29,7 +37,7 @@ from vllm.platforms import current_platform
 # It does not support mix precision MM and mix quantization scheme.
 ROCM_AITER_SUPPORTED_INT8_MODEL = [
     "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
 ]
 
 # TritonScaledMMLinearKernel only supports symmetric quantization.
@@ -58,13 +66,6 @@ def enable_pickle(monkeypatch):
             2560,
             True,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-            "channel",
-            QuantizationType.INT,
-            2560,
-            True,
-        ),
         (
             "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
             "tensor",
@@ -77,8 +78,10 @@ def enable_pickle(monkeypatch):
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
     model_path, strategy, quant_type, shape_0, is_symmetric = model_args
 
-    if current_platform.is_rocm(
-    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+    if (
+        current_platform.is_rocm()
+        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
+    ):
         pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
 
     with vllm_runner(model_path, enforce_eager=True) as llm:
@@ -92,7 +95,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
             down_proj = layer.mlp.down_proj
 
             # assert zp for symmetric and asymmetric cases
-            def zp_valid(zp: Optional[torch.Tensor]):
+            def zp_valid(zp: torch.Tensor | None):
                 if is_symmetric:
                     return zp is None
 
@@ -103,14 +106,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
             assert zp_valid(gate_up_proj.input_zero_point)
             assert zp_valid(down_proj.input_zero_point)
 
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(o_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(gate_up_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(down_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
 
             assert qkv_proj.scheme.strategy == strategy
@@ -132,7 +131,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
@@ -140,15 +139,13 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
     "model_path",
     [
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
-    "use_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
 def test_compressed_tensors_w8a8_logprobs(
     hf_runner,
     vllm_runner,
@@ -159,15 +156,15 @@ def test_compressed_tensors_w8a8_logprobs(
     use_aiter,
     monkeypatch,
 ):
-
-    if current_platform.is_rocm(
-    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+    if (
+        current_platform.is_rocm()
+        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
+    ):
         pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
 
     if use_aiter:
         if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
-            pytest.skip(
-                f"Skip model {model_path} as it is not support by aiter.")
+            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
         # this will enable VLLM_ROCM_USE_AITER_LINEAR
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
@@ -175,18 +172,20 @@ def test_compressed_tensors_w8a8_logprobs(
 
     # skip language translation prompt for the static per tensor models
     if model_path in (
-            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ):
         example_prompts = example_prompts[0:-1]
 
     with hf_runner(model_path, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     with vllm_runner(model_path, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
@@ -202,7 +201,7 @@ def test_compressed_tensors_w8a8_logprobs(
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -210,19 +209,15 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
     "model_args",
     [
         ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
         (
             "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
             "channel",
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-            "channel",
-        ),
     ],
 )
 @pytest.mark.parametrize(
-    "use_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
 def test_compressed_tensors_w8a8_dynamic_per_token(
     vllm_runner,
     model_args,
@@ -231,26 +226,26 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 ):
     model_path, strategy = model_args
 
-    if current_platform.is_rocm(
-    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+    if (
+        current_platform.is_rocm()
+        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
+    ):
         pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
 
     if use_aiter:
         if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
-            pytest.skip(
-                f"Skip model {model_path} as it is not support by aiter.")
+            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
         # this will enable VLLM_ROCM_USE_AITER_LINEAR
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    with vllm_runner(model_path, dtype=torch.float16) as llm:
+    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
 
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
             assert not qkv_proj.scheme.is_static_input_scheme
             assert qkv_proj.scheme.strategy == strategy
@@ -258,42 +253,47 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "wNa16_args",
-    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8,
-      True, False),
-     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8, True,
-      False),
-     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4,
-      True, False),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256", "group", 128,
-      8, False, False),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-      "channel", None, 8, False, False),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
-      "group", 128, 8, False, True)],
+    [
+        (
+            "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+            "channel",
+            None,
+            8,
+            True,
+            False,
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
+            "group",
+            128,
+            8,
+            False,
+            True,
+        ),
+    ],
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="The tests are skipped on non-CUDA platform."
 )
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="The tests are skipped on non-CUDA platform.")
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
 
             assert qkv_proj.scheme.strategy == strategy
-            assert qkv_proj.scheme.group_size == (-1
-                                                  if group is None else group)
+            assert qkv_proj.scheme.group_size == (-1 if group is None else group)
 
             assert qkv_proj.scheme.pack_factor == pack_factor
             assert qkv_proj.scheme.symmetric == symmetric
@@ -301,43 +301,42 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
 
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
             assert qkv_proj.weight_packed.dtype is torch.int32
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
 
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(
                 qkv_proj.scheme,
                 (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
@@ -353,19 +352,17 @@ def test_compressed_tensors_fp8(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 @pytest.mark.skipif(
-    not current_platform.is_kv_cache_dtype_supported("fp8", None),
-    reason="FP8 KV cache is not supported on this device.")
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 def test_compressed_tensors_kv_cache(vllm_runner):
     model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
+    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
 
 
@@ -373,10 +370,7 @@ def test_compressed_tensors_kv_cache(vllm_runner):
     not sparse_cutlass_supported(),
     reason="Sparse FP8 is not yet supported on this GPU type.",
 )
-def _test_2of4_quant_models(qkv_proj,
-                            weight_strategy,
-                            input_strategy,
-                            format="dense"):
+def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="dense"):
     assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
     assert isinstance(qkv_proj.scheme, CompressedTensors24)
 
@@ -390,8 +384,7 @@ def _test_2of4_quant_models(qkv_proj,
 
 
 @pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not current_platform.has_device_capability(90),
+    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
     reason="Sparse FP8 is not yet supported on this GPU type.",
 )
 @pytest.mark.parametrize(
@@ -421,7 +414,7 @@ def _test_2of4_quant_models(qkv_proj,
 )
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -432,14 +425,13 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
 
 @pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not current_platform.has_device_capability(90),
+    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
     reason="Sparse FP8 is not yet supported on this GPU type.",
 )
 @pytest.mark.parametrize(
@@ -469,7 +461,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -485,7 +477,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -521,7 +513,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -537,7 +529,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -568,7 +560,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -579,7 +571,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -594,29 +586,26 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensors24)
 
             assert qkv_proj.scheme.weight_quant is None
             assert qkv_proj.scheme.input_quant is None
             assert not qkv_proj.scheme.quantized
             assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = (
-                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            )  # noqa: E501
+            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
             assert sparsity_map.get("Linear").format == "dense"
             assert sparsity_map.get("Linear").sparsity_structure == "2:4"
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -626,41 +615,42 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     reason="Cutlass is not yet supported on this GPU type.",
 )
 @pytest.mark.parametrize(
-    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")])
+    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]
+)
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensors24)
 
             assert qkv_proj.scheme.weight_quant is None
             assert qkv_proj.scheme.input_quant is None
             assert not qkv_proj.scheme.quantized
             assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = (
-                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            )  # noqa: E501
+            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
             assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
             assert sparsity_map.get("Linear").sparsity_structure == "2:4"
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
 
 @pytest.mark.parametrize(
     "args",
-    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
-      CompressedTensorsW4A16Fp4),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
+    [
+        # TODO: Enable once model is available again
+        # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
+        ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
+    ],
+)
 def test_compressed_tensors_nvfp4(vllm_runner, args):
     model, scheme = args
     with vllm_runner(model, enforce_eager=True) as llm:
@@ -669,11 +659,12 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
             layer = model.model.layers[0]
 
             qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            if isinstance(qkv_proj.scheme, scheme) or isinstance(
-                    qkv_proj.scheme,
-                    CompressedTensorsW4A16Fp4) and not cutlass_fp4_supported():
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+            if (
+                isinstance(qkv_proj.scheme, scheme)
+                or isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
+                and not cutlass_fp4_supported()
+            ):
                 assert True
             else:
                 raise AssertionError("FP4 Scheme Mismatch")
@@ -681,19 +672,19 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
 
 @pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not current_platform.has_device_capability(90),
+    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
     reason="W4A8 FP8 is not yet supported on this GPU type.",
 )
-@pytest.mark.parametrize("args", [
-    ("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)
-])
+@pytest.mark.parametrize(
+    "args",
+    [("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)],
+)
 def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
     model, scheme = args
     with vllm_runner(model, enforce_eager=True) as llm:
@@ -707,8 +698,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
             down_proj = layer.mlp.down_proj
 
             for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
-                assert isinstance(proj.quant_method,
-                                  CompressedTensorsLinearMethod)
+                assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
                 assert isinstance(proj.scheme, scheme)
 
                 assert proj.weight_packed.dtype is torch.int32
@@ -717,28 +707,63 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
                 assert proj.scheme.group_size == 128
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
 
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
-@pytest.mark.parametrize("model,prompt,exp_perplexity", [
-    (
-        "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
-        "Flat is better than nested.\nSparse is better than dense.",
-        150.0,
-    ),
-    (
-        "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
-        "Flat is better than nested.\nSparse is better than dense.",
-        150.0,
-    ),
-])
-def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
-                                                  exp_perplexity):
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
+@pytest.mark.parametrize(
+    "model,prompt,exp_perplexity",
+    [
+        (
+            "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
+            "Flat is better than nested.\nSparse is better than dense.",
+            150.0,
+        ),
+        (
+            "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
+            "Flat is better than nested.\nSparse is better than dense.",
+            150.0,
+        ),
+    ],
+)
+def test_compressed_tensors_transforms_perplexity(
+    vllm_runner, model, prompt, exp_perplexity
+):
     with vllm_runner(model, enforce_eager=True) as llm:
         perplexity = llm.generate_prompt_perplexity([prompt])[0]
         print(perplexity)
         assert perplexity <= exp_perplexity
+
+
+def test_compressed_tensors_fp8_block_enabled(vllm_runner):
+    model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
+    with vllm_runner(model_path, enforce_eager=True) as llm:
+        fp8_dtype = current_platform.fp8_dtype()
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
+            assert isinstance(
+                qkv_proj.scheme.w8a8_block_fp8_linear, W8A8BlockFp8LinearOp
+            )
+
+            assert qkv_proj.weight.dtype is fp8_dtype
+            assert qkv_proj.weight_scale.dtype is torch.float32
+            assert len(qkv_proj.weight.shape) == 2
+            assert len(qkv_proj.weight_scale.shape) == 2
+
+            input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
+            assert isinstance(input_quant_op, QuantFP8)
+            assert input_quant_op._forward_method == input_quant_op.forward_cuda
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
+        assert output
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py
index 1843bffd211599dc391ada41211dcff2c01e55d5..797b565b91af6323732310fb9ce145f2a24fffc0 100644
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -33,7 +33,6 @@ MODEL_ARG_EXPTYPES = [
     ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
     ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
     ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
-
     # AUTOAWQ
     ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
     ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
@@ -55,4 +54,5 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
     assert found_quantization_type == expected_type, (
         f"Expected quant_type == {expected_type} for {model_path}, "
         f"but found {found_quantization_type} "
-        f"for no --quantization {quantization_arg} case")
+        f"for no --quantization {quantization_arg} case"
+    )
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index 08d9573ecf0b84967cb40305584c94fada19309a..a3fb4a6953474a7d097fce8aa4eb4a5f612cb6dc 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -1,77 +1,73 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Expanded quantized model tests for CPU offloading
-# Base tests: tests/basic_correctness/test_cpu_offload.py
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ..utils import compare_two_settings
-
-
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
-def test_cpu_offload_fp8():
-    # Test quantization of an unquantized checkpoint
-    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct",
-                         ["--quantization", "fp8"],
-                         ["--quantization", "fp8", "--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test loading a quantized checkpoint
-    compare_two_settings("neuralmagic/Qwen2-1.5B-Instruct-FP8", [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq(monkeypatch):
-    # This quant method is sensitive to dummy weights, so we force real weights
-    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
-    # Test GPTQ Marlin
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test GPTQ
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-                         ["--quantization", "gptq"],
-                         ["--quantization", "gptq", "--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
-                    reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq(monkeypatch):
-    # This quant method is sensitive to dummy weights, so we force real weights
-    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
-    # Test AWQ Marlin
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test AWQ
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
-                         ["--quantization", "awq"],
-                         ["--quantization", "awq", "--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-
-
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors(monkeypatch):
-    # This quant method is sensitive to dummy weights, so we force real weights
-    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
-    # Test wNa16
-    compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
-                         ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test w4a16_marlin24
-    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-                         [], ["--cpu-offload-gb", "1"],
-                         max_wait_seconds=480)
-    # Test w8a8
-    compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480)
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Expanded quantized model tests for CPU offloading
+# Base tests: tests/basic_correctness/test_cpu_offload.py
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+
+from ..utils import compare_two_settings
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
+def test_cpu_offload_fp8():
+    # Test loading a quantized checkpoint
+    compare_two_settings(
+        "neuralmagic/Qwen2-1.5B-Instruct-FP8",
+        [],
+        ["--cpu-offload-gb", "1"],
+        max_wait_seconds=480,
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="gptq_marlin is not supported on this GPU type.",
+)
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
+    # Test GPTQ Marlin
+    compare_two_settings(
+        "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+        [],
+        ["--cpu-offload-gb", "1"],
+        max_wait_seconds=480,
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("awq_marlin"),
+    reason="awq_marlin is not supported on this GPU type.",
+)
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
+    # Test AWQ Marlin
+    compare_two_settings(
+        "Qwen/Qwen2-1.5B-Instruct-AWQ",
+        [],
+        ["--cpu-offload-gb", "1"],
+        max_wait_seconds=480,
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin"),
+    reason="gptq_marlin is not supported on this GPU type.",
+)
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
+    # Test wNa16
+    compare_two_settings(
+        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+        [],
+        ["--cpu-offload-gb", "1"],
+        max_wait_seconds=480,
+    )
diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py
index 1e3e69e008bd48d17b481a018519630eb21c922b..2a72f734e431b0c06383e1c95d67aa78fd4a84d8 100644
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -2,9 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
-"""Tests experts_int8 quantization startup and generation, 
+"""Tests experts_int8 quantization startup and generation,
 doesn't test correctness
 """
+
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
@@ -14,8 +15,10 @@ from ..models.registry import HF_EXAMPLE_MODELS
 MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
-                    reason="ExpertsInt8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("experts_int8"),
+    reason="ExpertsInt8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [10])
@@ -30,6 +33,5 @@ def test_model_experts_int8_startup(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_transformers_version(on_fail="skip")
 
-    with vllm_runner(model, dtype=dtype,
-                     quantization="experts_int8") as vllm_model:
+    with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index db53061cf2d1a5ecda31d1a1277a2a725853f499..f02da2996ffea54676a277be9613855539380747 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -4,31 +4,41 @@
 
 Run `pytest tests/quantization/test_fp8.py --forked`.
 """
+
 import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
-                                                         Fp8LinearMethod)
+from vllm.model_executor.layers.quantization.fp8 import (
+    Fp8KVCacheMethod,
+    Fp8LinearMethod,
+)
 from vllm.platforms import current_platform
 
 MODELS = [
     "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    # The checkpoint below was removed from the HF.
+    # TODO: add a small replacement checkpoint.
+    pytest.param(
+        "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+        marks=pytest.mark.skip(reason="Checkpoint removed from HF."),
+    ),
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_id", MODELS)
 @pytest.mark.parametrize("force_marlin", [False, True])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
-                            use_rocm_aiter: bool, monkeypatch) -> None:
-
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_model_load_and_run(
+    vllm_runner, model_id: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch
+) -> None:
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
@@ -43,20 +53,33 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
 
 
 KV_CACHE_MODELS = [
-    # Deprecated AutoFP8 format using .kv_scale
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
     # AutoFP8 format using separate .k_scale and .v_scale
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    # The original checkpoint below was removed from the Hub. To unblock CI and
+    # until a small replacement with split K/V scales is found, skip this case.
+    # See PR #27717 for context.
+    pytest.param(
+        "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+        marks=pytest.mark.skip(
+            reason=(
+                "Checkpoint removed from HF; temporarily disabling this "
+                "AutoFP8 split K/V case (PR #27717)."
+            )
+        ),
+    ),
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
-                                     use_rocm_aiter: bool, monkeypatch):
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_kv_cache_model_load_and_run(
+    vllm_runner, model_id: str, use_rocm_aiter: bool, monkeypatch
+):
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
@@ -93,14 +116,22 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
         print(outputs[0][1])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 @pytest.mark.parametrize("force_marlin", [False, True])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
-                         use_rocm_aiter: bool, monkeypatch) -> None:
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_load_fp16_model(
+    vllm_runner,
+    kv_cache_dtype: str,
+    force_marlin: bool,
+    use_rocm_aiter: bool,
+    monkeypatch,
+) -> None:
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
@@ -110,9 +141,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
 
-    with vllm_runner("facebook/opt-125m",
-                     quantization="fp8",
-                     kv_cache_dtype=kv_cache_dtype) as llm:
+    with vllm_runner(
+        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+    ) as llm:
 
         def check_model(model):
             fc1 = model.model.decoder.layers[0].fc1
@@ -139,26 +170,29 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                     pytest.skip(
                         "Skip `test_load_fp16_model`. "
                         "It only runs on ROCm platform with FP8 compute."
-                        " e.g. MI300X and above.")
+                        " e.g. MI300X and above."
+                    )
             else:  # unsupported platform
-                pytest.skip("Skip `test_load_fp16_model`. "
-                            "It only runs on CUDA and ROCm platform.")
+                pytest.skip(
+                    "Skip `test_load_fp16_model`. "
+                    "It only runs on CUDA and ROCm platform."
+                )
 
         llm.apply_model(check_model)
 
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_scaled_fp8_quant(dtype) -> None:
-
     def quantize_ref(tensor, inv_scale):
         # The reference implementation that fully aligns to
         # the kernel being tested.
         finfo = torch.finfo(torch.float8_e4m3fn)
         scale = inv_scale.reciprocal()
-        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
-                                                           max=finfo.max)
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
         qweight = qweight.to(torch.float8_e4m3fn)
         return qweight
 
@@ -177,26 +211,23 @@ def test_scaled_fp8_quant(dtype) -> None:
 
     # Reference dynamic quantizaton
     y = quantize_ref(x, inv_scale)
-    torch.testing.assert_close(ref_y,
-                               per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
 
     # Static quantization
     y, _ = ops.scaled_fp8_quant(x, inv_scale)
-    torch.testing.assert_close(ref_y,
-                               per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
 
     # Padding
     y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
     assert y.shape[0] == 17
     torch.testing.assert_close(
         ref_y,
-        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
-                              dtype))
+        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale, dtype),
+    )
 
     # non-contiguous input with padding
     m, n, padded_stride = 975, 512, 576
-    padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") *
-                     13).to(dtype)
+    padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") * 13).to(dtype)
     x_nc = padded_tensor[:, :n]  # shape (m, n) with stride (padded_stride, 1)
 
     assert not x_nc.is_contiguous()
@@ -209,19 +240,21 @@ def test_scaled_fp8_quant(dtype) -> None:
     # reference dynamic quantization
     y_nc = quantize_ref(x_nc, inv_scale_nc)
     torch.testing.assert_close(
-        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype))
+        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
+    )
 
     # static quantization
     y_nc, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc)
     torch.testing.assert_close(
-        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype))
+        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
+    )
 
     # padding after non-contiguous input quantization
-    y_nc_pad, _ = ops.scaled_fp8_quant(x_nc,
-                                       inv_scale_nc,
-                                       num_token_padding=m + 10)
+    y_nc_pad, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc, num_token_padding=m + 10)
     assert y_nc_pad.shape[0] == m + 10
     torch.testing.assert_close(
         ref_y_nc,
-        per_tensor_dequantize(torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]),
-                              inv_scale_nc, dtype))
+        per_tensor_dequantize(
+            torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype
+        ),
+    )
diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py
index 00a5946ed0154e7512050ed74e59d2b722cc3d67..37fe2dd3243aa230072e3262f5f63e6009cea997 100644
--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -10,10 +10,10 @@ import torch
 
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinearMethod
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
-    get_dynamic_override)
+    get_dynamic_override,
+)
 
 PROMPT = "On the surface of Mars, we found"
 
@@ -21,56 +21,61 @@ PROMPT = "On the surface of Mars, we found"
 # The second layer is quantized using bits=8, group_size=32
 # All other layers (layer index >= 2) are not quantized
 MODEL_QUANT = [
-    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-     True),
-    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-     False),
+    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", True),
+    (
+        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+        False,
+    ),
 ]
 
 
 @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
-def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
-                           monkeypatch):
+def test_gptq_with_dynamic(
+    vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch
+):
     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
-    linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
-        GPTQLinearMethod)
+    linear_method_cls = (
+        GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
+    )
 
-    with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:
+    with vllm_runner(
+        model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
+    ) as llm:
 
         def check_model(model):
             for name, submodule in model.named_modules():
                 if name == "lm_head":
-                    assert isinstance(submodule.quant_method,
-                                      linear_method_cls)
-                elif name == 'model.layers.0.self_attn.qkv_proj':
+                    assert isinstance(submodule.quant_method, linear_method_cls)
+                elif name == "model.layers.0.self_attn.qkv_proj":
                     # The first layer is quantized using bits=4, group_size=128
                     # desc_act=True
-                    assert isinstance(submodule.quant_method,
-                                      linear_method_cls)
+                    assert isinstance(submodule.quant_method, linear_method_cls)
                     config = submodule.quant_method.quant_config
                     assert config.weight_bits == 4
                     assert config.group_size == 128
                     assert config.desc_act
-                elif name == 'model.layers.1.self_attn.qkv_proj':
+                elif name == "model.layers.1.self_attn.qkv_proj":
                     # The second layer is quantized using bits=8, group_size=32
                     # desc_act=False
-                    assert isinstance(submodule.quant_method,
-                                      linear_method_cls)
+                    assert isinstance(submodule.quant_method, linear_method_cls)
                     config = submodule.quant_method.quant_config
-                    assert get_dynamic_override(config,
-                                                layer_name=name,
-                                                key="bits") == 8
-                    assert get_dynamic_override(config,
-                                                layer_name=name,
-                                                key="group_size") == 32
+                    assert (
+                        get_dynamic_override(config, layer_name=name, key="bits") == 8
+                    )
+                    assert (
+                        get_dynamic_override(config, layer_name=name, key="group_size")
+                        == 32
+                    )
                     assert not get_dynamic_override(
-                        config, layer_name=name, key="desc_act")
-                elif (name == 'model.layers.2.self_attn.qkv_proj'
-                      or name == 'model.layers.2.mlp.gate_up_proj'):
+                        config, layer_name=name, key="desc_act"
+                    )
+                elif (
+                    name == "model.layers.2.self_attn.qkv_proj"
+                    or name == "model.layers.2.mlp.gate_up_proj"
+                ):
                     # All other layers (layer index >= 2) are not quantized
-                    assert isinstance(submodule.quant_method,
-                                      UnquantizedLinearMethod)
+                    assert isinstance(submodule.quant_method, UnquantizedLinearMethod)
 
         llm.apply_model(check_model)
diff --git a/tests/quantization/test_gptq_v2.py b/tests/quantization/test_gptq_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbafa2e8e7d1fa3a5b0f31e1bdef477392b4d81f
--- /dev/null
+++ b/tests/quantization/test_gptq_v2.py
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests whether vllm correctly load and run gptq_v2 format checkpoints.
+
+Run `pytest tests/quantization/test_gptq_v2.py --forked`.
+"""
+
+import pytest
+import torch
+from transformers import AutoTokenizer
+
+from vllm import SamplingParams
+from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+
+# A dummy small model quantized by GPTQModel, stored in GPTQ v2 format
+MODELS = ["XXXXyu/Qwen3-1.7B-w2g64-gptq_v2"]
+
+# Generate multiple sequences for testing, because an 1.7B 2-bit model
+# cannot always generate normal texts.
+N_SEQ = 5
+
+
+@pytest.mark.parametrize("model_id", MODELS)
+def test_model_load(vllm_runner, model_id, monkeypatch):
+    # `LLM.apply_model` requires pickling a function.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+
+    # Only check the default GPTQ linear method (used for 2/3-bit models).
+    # 4/8-bit linear methods like Marlin already support gptq_v2.
+    linear_method_cls = GPTQLinearMethod
+
+    with vllm_runner(model_id, dtype=torch.float16, max_model_len=512) as llm:
+
+        def check_model(model_id):
+            for name, submodule in model_id.named_modules():
+                # Could check more modules if necessary
+                if name == "model_id.layers.0.self_attn.qkv_proj":
+                    assert isinstance(submodule.quant_method, linear_method_cls)
+
+                    config = submodule.quant_method.quant_config
+                    assert config.checkpoint_format == "gptq_v2"
+                    assert submodule.quant_method.use_v2_format
+
+                    # Just break since currently we only check 1 module
+                    break
+
+        # Check if gptq_v2 format is correctly loaded
+        llm.apply_model(check_model)
+
+
+@pytest.mark.parametrize("model_id", MODELS)
+def test_model_inference(vllm_runner, model_id):
+    # Prepare prompt to test the model's generation result.
+    prompt = "What is the meaning of life?"
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,  # If thinking model, set it to false
+    )
+    sampling_params = SamplingParams(
+        n=N_SEQ,
+        max_tokens=128,
+        temperature=0.7,
+        top_p=0.8,
+        top_k=20,
+        min_p=0,
+        presence_penalty=2,
+    )
+
+    with vllm_runner(model_id, dtype=torch.float16, max_model_len=512) as llm:
+        # Generate a response to verify inference correctness
+        output = llm.generate(text, sampling_params)
+
+    # Make sure the output exists
+    assert output
+    assert output[0][1]
+    assert len(output[0][1]) == N_SEQ
+
+    def has_normal_char_distribution(texts, min_len):
+        for text in texts:
+            # Response too short
+            if len(text) < min_len:
+                return False
+
+            # Basic ratio checks
+            letters = sum(c.isalpha() for c in text)
+            spaces = sum(c.isspace() for c in text)
+            total = len(text)
+
+            letter_ratio = letters / total
+            space_ratio = spaces / total
+
+            # At least 1 normal text should exist within output sequences
+            # Normal text should be mostly letters with reasonable spacing
+            # Some magic numbers, could be adjusted
+            if 0.5 <= letter_ratio <= 0.9 and 0.01 <= space_ratio <= 0.3:
+                return True
+        # No sequence contains normal text, output might be broken
+        return False
+
+    # Apply some simple checks for giberish output
+    # Print the output sequences if failed
+    assert has_normal_char_distribution(output[0][1], 5), output[0][1]
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py
index 34b1b6c2e5b6deb47c1f0d8b10b57efd9119e480..ae9b1df3377dca60227c8cbf00e9933bf452d09e 100644
--- a/tests/quantization/test_ipex_quant.py
+++ b/tests/quantization/test_ipex_quant.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and inference for quantized HF models supported
- on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
- 
- Validating the configuration and printing results for manual checking.
+on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
 
- Run `pytest tests/quantization/test_ipex_quant.py`.
+Validating the configuration and printing results for manual checking.
+
+Run `pytest tests/quantization/test_ipex_quant.py`.
 """
 
 import pytest
@@ -19,14 +19,14 @@ MODELS = [
 DTYPE = ["bfloat16"]
 
 
-@pytest.mark.skipif(not current_platform.is_cpu()
-                    and not current_platform.is_xpu(),
-                    reason="only supports Intel CPU/XPU backend.")
+@pytest.mark.skipif(
+    not current_platform.is_cpu() and not current_platform.is_xpu(),
+    reason="only supports Intel CPU/XPU backend.",
+)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
 def test_ipex_quant(vllm_runner, model, dtype):
     with vllm_runner(model, dtype=dtype) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
     assert output
     print(output)
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index e69d4ad349c38de8ccca57767e3301932930d99d..f009a4cfb870d5ab29636d8675c39e7fd1f2d246 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -9,10 +9,10 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinearMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    UnquantizedEmbeddingMethod)
+    UnquantizedEmbeddingMethod,
+)
 
 PROMPT = "On the surface of Mars, we found"
 
@@ -31,20 +31,22 @@ def test_lm_head(
 ) -> None:
     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, dtype=torch.float16,
-                     max_model_len=2048) as vllm_model:
+    with vllm_runner(
+        model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
+    ) as vllm_model:
 
         def check_model(model):
             lm_head_layer = model.lm_head
             if lm_head_quantized:
-                assert isinstance(lm_head_layer.quant_method,
-                                  (GPTQLinearMethod, GPTQMarlinLinearMethod))
+                assert isinstance(
+                    lm_head_layer.quant_method,
+                    (GPTQLinearMethod, GPTQMarlinLinearMethod),
+                )
             else:
-                assert isinstance(lm_head_layer.quant_method,
-                                  UnquantizedEmbeddingMethod)
+                assert isinstance(
+                    lm_head_layer.quant_method, UnquantizedEmbeddingMethod
+                )
 
         vllm_model.apply_model(check_model)
 
-        print(
-            vllm_model.generate_greedy(["Hello my name is"],
-                                       max_tokens=10)[0][1])
+        print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
diff --git a/tests/quantization/test_mixed_precision.py b/tests/quantization/test_mixed_precision.py
new file mode 100755
index 0000000000000000000000000000000000000000..51526470b42336fa87ca94c5e1b97576e78b91d5
--- /dev/null
+++ b/tests/quantization/test_mixed_precision.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test quark-quantized {MXFP4, FP8} mixed precision models.
+
+Run `pytest tests/quantization/test_mixed_precision.py`.
+
+"""
+
+import importlib
+import importlib.metadata
+from dataclasses import dataclass
+
+import lm_eval
+import pytest
+from packaging import version
+
+QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse(
+    importlib.metadata.version("amd-quark")
+) >= version.parse("0.8.99")
+
+
+@dataclass
+class ModelCase:
+    model_id: str
+    tp: int
+
+
+@dataclass
+class EvaluationConfig:
+    model_name: str
+
+    def get_model_args(self) -> str:
+        return (
+            f"pretrained={self.model_name},"
+            "tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.8,trust_remote_code=False"
+        )
+
+
+TEST_CONFIGS = {
+    # Mixed-precision (AMP) model
+    # - Demonstrates end-to-end pipeline functionality
+    "amd/Qwen3-8B-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8": {"arc_challenge": 0.52, "mmlu": 0.72},
+    # Non-mixed-precision (PTQ) model
+    # - Reference for pipeline compatibility verification -> No conflicts or breakings
+    "amd/Llama-2-70b-chat-hf-FP8-MLPerf-fp8_attn_quark_format": {
+        "arc_challenge": 0.53,
+        "mmlu": 0.61,
+    },
+}
+
+
+@pytest.mark.parametrize("model_name, accuracy_numbers", TEST_CONFIGS.items())
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+def test_mixed_precision_model_accuracies(model_name: str, accuracy_numbers: dict):
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=EvaluationConfig(model_name).get_model_args(),
+        tasks=list(accuracy_numbers.keys()),
+        batch_size=8,
+    )
+
+    rtol = 0.05
+
+    for task, expect_accuracy in accuracy_numbers.items():
+        measured_accuracy = results["results"][task]["acc,none"]
+        assert (
+            measured_accuracy - rtol < expect_accuracy
+            and measured_accuracy + rtol > expect_accuracy
+        ), f"Expected: {expect_accuracy} |  Measured: {measured_accuracy}"
diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py
index e7174be73626a6e156d9b1bfcd296f05f7260c20..8abf65d29784d22036f7003ec5950507b4d2f74a 100644
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -19,21 +19,26 @@ def enable_pickle(monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
-@pytest.mark.skipif(not is_quant_method_supported("modelopt"),
-                    reason="ModelOpt FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt"),
+    reason="ModelOpt FP8 is not supported on this GPU type.",
+)
 def test_modelopt_fp8_checkpoint_setup(vllm_runner):
     """Test ModelOpt FP8 checkpoint loading and structure validation."""
     # TODO: provide a small publicly available test checkpoint
-    model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
-                  "TinyLlama-1.1B-Chat-v1.0-fp8-0710")
+    model_path = (
+        "/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
+        "TinyLlama-1.1B-Chat-v1.0-fp8-0710"
+    )
 
     # Skip test if checkpoint doesn't exist
     if not os.path.exists(model_path):
-        pytest.skip(f"Test checkpoint not found at {model_path}. "
-                    "This test requires a local ModelOpt FP8 checkpoint.")
+        pytest.skip(
+            f"Test checkpoint not found at {model_path}. "
+            "This test requires a local ModelOpt FP8 checkpoint."
+        )
 
-    with vllm_runner(model_path, quantization="modelopt",
-                     enforce_eager=True) as llm:
+    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -45,11 +50,12 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
 
             # Check that ModelOpt quantization method is properly applied
             from vllm.model_executor.layers.quantization.modelopt import (
-                ModelOptFp8LinearMethod)
+                ModelOptFp8LinearMethod,
+            )
+
             assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod)
             assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod)
-            assert isinstance(gate_up_proj.quant_method,
-                              ModelOptFp8LinearMethod)
+            assert isinstance(gate_up_proj.quant_method, ModelOptFp8LinearMethod)
             assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod)
 
             # Check weight dtype is FP8
@@ -59,23 +65,23 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
             assert down_proj.weight.dtype == torch.float8_e4m3fn
 
             # Check scales are present and have correct dtype
-            assert hasattr(qkv_proj, 'weight_scale')
-            assert hasattr(qkv_proj, 'input_scale')
+            assert hasattr(qkv_proj, "weight_scale")
+            assert hasattr(qkv_proj, "input_scale")
             assert qkv_proj.weight_scale.dtype == torch.float32
             assert qkv_proj.input_scale.dtype == torch.float32
 
-            assert hasattr(o_proj, 'weight_scale')
-            assert hasattr(o_proj, 'input_scale')
+            assert hasattr(o_proj, "weight_scale")
+            assert hasattr(o_proj, "input_scale")
             assert o_proj.weight_scale.dtype == torch.float32
             assert o_proj.input_scale.dtype == torch.float32
 
-            assert hasattr(gate_up_proj, 'weight_scale')
-            assert hasattr(gate_up_proj, 'input_scale')
+            assert hasattr(gate_up_proj, "weight_scale")
+            assert hasattr(gate_up_proj, "input_scale")
             assert gate_up_proj.weight_scale.dtype == torch.float32
             assert gate_up_proj.input_scale.dtype == torch.float32
 
-            assert hasattr(down_proj, 'weight_scale')
-            assert hasattr(down_proj, 'input_scale')
+            assert hasattr(down_proj, "weight_scale")
+            assert hasattr(down_proj, "input_scale")
             assert down_proj.weight_scale.dtype == torch.float32
             assert down_proj.input_scale.dtype == torch.float32
 
diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py
index 088b68510cffaa5dd8e850d56164bf3119c30dfa..e8ea4148585bf3000769d308c223e7fbc003d970 100644
--- a/tests/quantization/test_ptpc_fp8.py
+++ b/tests/quantization/test_ptpc_fp8.py
@@ -4,18 +4,19 @@
 
 Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
 """
+
 import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
-from vllm.model_executor.layers.quantization.ptpc_fp8 import (
-    PTPCFp8LinearMethod)
+from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
 from vllm.platforms import current_platform
 
 UNSUPPORTED_STR = (
     "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
-    "support output dtype of bfloat16. torch.float16 is specified.")
+    "support output dtype of bfloat16. torch.float16 is specified."
+)
 
 
 @pytest.fixture(scope="function", autouse=True)
@@ -24,18 +25,21 @@ def enable_pickle(monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
-@pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"),
-                    reason="PTPC FP8 is not supported on this GPU type.")
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="This test is for ROCm GPU.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("ptpc_fp8"),
+    reason="PTPC FP8 is not supported on this GPU type.",
+)
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
 @pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
 def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
     try:
-        llm = vllm_runner("facebook/opt-125m",
-                          dtype=dtype,
-                          quantization="ptpc_fp8",
-                          kv_cache_dtype=kv_cache_dtype)
+        llm = vllm_runner(
+            "facebook/opt-125m",
+            dtype=dtype,
+            quantization="ptpc_fp8",
+            kv_cache_dtype=kv_cache_dtype,
+        )
     except AssertionError as e:
         if str(e) == UNSUPPORTED_STR:
             # If the error message matches, the test passes
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index 930f4acb328fd8c449ada9d70eeddffaf9560e0e..334f9a65e4c03d748b5c974a100ac05528a8838d 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -4,11 +4,10 @@
 
 Run `pytest tests/quantization/test_quark.py`.
 
-See also `tests/kernels/moe/test_mxfp4_moe.py`.
+See also `tests/kernels/moe/test_ocp_mx_moe.py`.
 """
 
 import importlib.metadata
-import os
 from dataclasses import dataclass
 from importlib.util import find_spec
 
@@ -19,23 +18,27 @@ import torch
 from packaging import version
 
 from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
-    QuarkLinearMethod, QuarkW8A8Fp8, QuarkW8A8Int8)
+    QuarkLinearMethod,
+    QuarkW8A8Fp8,
+    QuarkW8A8Int8,
+)
 from vllm.platforms import current_platform
 
 from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
 
 QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
-    importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
+    importlib.metadata.version("amd-quark")
+) >= version.parse("0.8.99")
 
 if QUARK_MXFP4_AVAILABLE:
-    from quark.torch.export.nn.modules.realquantizer import (
-        StaticScaledRealQuantizer)
+    from quark.torch.export.nn.modules.realquantizer import StaticScaledRealQuantizer
     from quark.torch.kernel import mx as mx_kernel
     from quark.torch.quantization.config.config import FP4PerGroupSpec
 
 try:
     huggingface_hub.list_repo_refs(
-        "amd/Llama-3.3-70B-Instruct-WMXFP4-AMXFP4-KVFP8-Scale-UINT8-SQ")
+        "amd/Llama-3.3-70B-Instruct-WMXFP4-AMXFP4-KVFP8-Scale-UINT8-SQ"
+    )
     HF_HUB_AMD_ORG_ACCESS = True
 except huggingface_hub.errors.RepositoryNotFoundError:
     HF_HUB_AMD_ORG_ACCESS = False
@@ -47,13 +50,16 @@ def enable_pickle(monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
-@pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8'])
-@pytest.mark.parametrize('tp', [1])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+@pytest.mark.parametrize("tp", [1])
 def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
     model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
-    with vllm_runner(model_path,
-                     kv_cache_dtype=kv_cache_dtype,
-                     tensor_parallel_size=tp) as llm:
+    with vllm_runner(
+        model_path,
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
+        tensor_parallel_size=tp,
+    ) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -70,14 +76,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
-@pytest.mark.parametrize('tp', [1])
+@pytest.mark.parametrize("tp", [1])
 def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
     model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
-    with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -89,20 +95,19 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
 
             if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
                 assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
-                assert qkv_proj.weight_scale.shape[0] == qkv_proj.weight.shape[
-                    1]
+                assert qkv_proj.weight_scale.shape[0] == qkv_proj.weight.shape[1]
                 assert qkv_proj.weight_scale.shape[1] == 1
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
-@pytest.mark.parametrize('tp', [1])
+@pytest.mark.parametrize("tp", [1])
 def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
     model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
-    with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -114,7 +119,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -125,16 +130,18 @@ def test_quark_fp8_parity(vllm_runner):
     llm_kwargs = {
         "tensor_parallel_size": 1,
         "enforce_eager": True,
-        "gpu_memory_utilization": 0.1
+        "gpu_memory_utilization": 0.1,
     }
-    with (vllm_runner(quark_model_id, **llm_kwargs) as
-          quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
+    with (
+        vllm_runner(quark_model_id, **llm_kwargs) as quark_handle,
+        vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle,
+    ):
 
         def get_state_dict(model):
             return {k: v.cpu() for k, v in model.state_dict().items()}
 
-        quark_state_dict, = quark_handle.apply_model(get_state_dict)
-        fp8_state_dict, = fp8_handle.apply_model(get_state_dict)
+        (quark_state_dict,) = quark_handle.apply_model(get_state_dict)
+        (fp8_state_dict,) = fp8_handle.apply_model(get_state_dict)
 
     assert fp8_state_dict.keys() == quark_state_dict.keys()
 
@@ -143,38 +150,93 @@ def test_quark_fp8_parity(vllm_runner):
 
 
 @dataclass
-class ModelCase:
-    model_id: str
-    tp: int
-
-
-@dataclass
-class GSM8KAccuracyTestConfig:
+class AccuracyTestConfig:
     model_name: str
     excepted_value: float
 
-    def get_model_args(self) -> str:
-        return (
-            f"pretrained={self.model_name},"
-            "dtype=auto,add_bos_token=True,tensor_parallel_size=8,gpu_memory_utilization=0.7,max_model_len=38768"
-        )
-
-
-ACCURACY_CONFIGS = [
+    def get_model_args(
+        self,
+        tp_size: int,
+        model_max_len: int | None = None,
+        kwargs: dict | None = None,
+    ) -> dict:
+        if kwargs is None:
+            kwargs = {}
+
+        model_args = {
+            "pretrained": self.model_name,
+            "dtype": "auto",
+            "add_bos_token": True,
+            "tensor_parallel_size": tp_size,
+            "gpu_memory_utilization": 0.7,
+            **kwargs,
+        }
+        if model_max_len is not None:
+            model_args["max_model_len"] = model_max_len
+
+        return model_args
+
+
+GSM8K_ACCURACY_CONFIGS = [
     # Private model.
-    GSM8KAccuracyTestConfig(
+    AccuracyTestConfig(
         model_name="amd/DeepSeek-R1-WMXFP4-AMXFP4-Scale-UINT8-MoE-Quant",
-        excepted_value=0.96),
+        excepted_value=0.96,
+    ),
+]
+
+WIKITEXT_ACCURACY_CONFIGS = [
+    AccuracyTestConfig(
+        model_name="fxmarty/qwen1.5_moe_a2.7b_chat_w_fp4_a_fp6_e2m3",
+        excepted_value=11.3,
+    ),
+    AccuracyTestConfig(
+        model_name="fxmarty/qwen1.5_moe_a2.7b_chat_w_fp6_e3m2_a_fp6_e3m2",
+        excepted_value=10.6,
+    ),
+    AccuracyTestConfig(
+        model_name="fxmarty/qwen_1.5-moe-a2.7b-mxfp4", excepted_value=12.4
+    ),
 ]
 
 
-@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
-                    reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.parametrize("config", WIKITEXT_ACCURACY_CONFIGS)
+@pytest.mark.parametrize("tp_size", [1, 2])
+def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(
+            f"This test requires >={tp_size} gpus, got only {torch.cuda.device_count()}"
+        )
+
+    task = "wikitext"
+    rtol = 0.1
+
+    # Smaller cuda_graph_sizes to speed up the test.
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=config.get_model_args(
+            tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]}
+        ),
+        tasks=task,
+        batch_size=64,
+    )
+
+    EXPECTED_VALUE = config.excepted_value
+    measured_value = results["results"][task]["word_perplexity,none"]
+    assert (
+        measured_value < EXPECTED_VALUE + rtol
+        and measured_value > EXPECTED_VALUE - rtol
+    ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+
+
+@pytest.mark.parametrize("config", GSM8K_ACCURACY_CONFIGS)
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
 @pytest.mark.skipif(
     not HF_HUB_AMD_ORG_ACCESS,
-    reason="Read access to huggingface.co/amd is required for this test.")
-def test_mxfp4_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
+    reason="Read access to huggingface.co/amd is required for this test.",
+)
+def test_mxfp4_gsm8k_correctness(config: AccuracyTestConfig):
     if torch.cuda.device_count() < 8:
         pytest.skip(
             f"This test requires >=8 gpus, got only {torch.cuda.device_count()}"
@@ -183,11 +245,9 @@ def test_mxfp4_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
     task = "gsm8k"
     rtol = 0.03
 
-    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-
     results = lm_eval.simple_evaluate(
         model="vllm",
-        model_args=config.get_model_args(),
+        model_args=config.get_model_args(tp_size=8, model_max_len=38768),
         tasks=task,
         batch_size=64,
         num_fewshot=8,
@@ -195,28 +255,24 @@ def test_mxfp4_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
 
     EXPECTED_VALUE = config.excepted_value
     measured_value = results["results"][task]["exact_match,strict-match"]
-    assert (measured_value - rtol < EXPECTED_VALUE
-            and measured_value + rtol > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
-
-    del os.environ["VLLM_USE_TRITON_FLASH_ATTN"]
+    assert (
+        measured_value - rtol < EXPECTED_VALUE
+        and measured_value + rtol > EXPECTED_VALUE
+    ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
 
 
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
-                    reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
 @pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("scalings",
-                         [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
-def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype,
-                                     scalings: list[int]):
+@pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
+def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype, scalings: list[int]):
     torch.manual_seed(0)
 
     hidden_size = 64 * 32
-    inp = (torch.rand(1, hidden_size, dtype=float_dtype, device="cuda") -
-           0.5) * 2
+    inp = (torch.rand(1, hidden_size, dtype=float_dtype, device="cuda") - 0.5) * 2
     for i in range(hidden_size // 32):
-        inp[:, i * 32:(i + 1) *
-            32] = inp[:, i * 32:(i + 1) * 32] * scalings[i % len(scalings)]
+        inp[:, i * 32 : (i + 1) * 32] = (
+            inp[:, i * 32 : (i + 1) * 32] * scalings[i % len(scalings)]
+        )
 
     inp_kernel = inp.clone()
     inp_kernel_clone = inp_kernel.clone()
@@ -225,20 +281,20 @@ def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype,
     res_torch = qdq_mxfp4_torch(inp_kernel, "even")
 
     for i in range(hidden_size // 32):
-        assert torch.all(torch.isfinite(res_hip[:, i * 32:(i + 1) * 32]))
-        assert torch.all(torch.isfinite(res_torch[:, i * 32:(i + 1) * 32]))
+        assert torch.all(torch.isfinite(res_hip[:, i * 32 : (i + 1) * 32]))
+        assert torch.all(torch.isfinite(res_torch[:, i * 32 : (i + 1) * 32]))
 
-        torch.testing.assert_close(res_hip[:, i * 32:(i + 1) * 32],
-                                   res_torch[:, i * 32:(i + 1) * 32])
+        torch.testing.assert_close(
+            res_hip[:, i * 32 : (i + 1) * 32], res_torch[:, i * 32 : (i + 1) * 32]
+        )
 
 
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
-                    reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
 @pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("scalings",
-                         [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
-def test_mxfp4_dequant_kernel_match_quark(float_dtype: torch.dtype,
-                                          scalings: list[int]):
+@pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
+def test_mxfp4_dequant_kernel_match_quark(
+    float_dtype: torch.dtype, scalings: list[int]
+):
     qspec = FP4PerGroupSpec(
         ch_axis=-1,
         group_size=32,
@@ -265,8 +321,9 @@ def test_mxfp4_dequant_kernel_match_quark(float_dtype: torch.dtype,
 
     # Make it so that different groups have different scales.
     for i in range(hidden_size // 32):
-        w[:, i * 32:(i + 1) *
-          32] = w[:, i * 32:(i + 1) * 32] * scalings[i % len(scalings)]
+        w[:, i * 32 : (i + 1) * 32] = (
+            w[:, i * 32 : (i + 1) * 32] * scalings[i % len(scalings)]
+        )
 
     observer(w)
     scale, _ = observer._calculate_qparams()
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
index 03fe59d7e3bfffbd117f0e087659482de991591a..8da048703df93bbe1e632457082567817b8990be 100644
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -6,18 +6,26 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details.
 
 Run `pytest tests/quantization/test_register_quantization_config.py`.
 """
-from typing import Any, Optional
+
+import logging
+from typing import Any
 
 import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.model_executor.layers.linear import LinearBase  # noqa: E501
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.linear import (
+    LinearBase,  # noqa: E501
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import (
-    QuantizationMethods, get_quantization_config, register_quantization_config)
+    QuantizationMethods,
+    get_quantization_config,
+    register_quantization_config,
+)
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig)
+    QuantizationConfig,
+)
 
 
 class FakeQuantLinearMethod(UnquantizedLinearMethod):
@@ -28,10 +36,12 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
         super().__init__()
         self.num_bits = num_bits
 
-    def apply(self,
-              layer: "torch.nn.Module",
-              x: "torch.Tensor",
-              bias: Optional["torch.Tensor"] = None) -> "torch.Tensor":
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         """Perform fake quantization before the linear layer."""
 
         # Calculate the scales dynamically
@@ -40,8 +50,11 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
         scales = (max_val - min_val) / (2**self.num_bits - 1)
 
         # Fake quantize the input
-        quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1),
-                              2**(self.num_bits - 1) - 1)
+        quant_x = torch.clamp(
+            torch.round(x / scales),
+            -(2 ** (self.num_bits - 1)),
+            2 ** (self.num_bits - 1) - 1,
+        )
         dequant_x = quant_x * scales
 
         return F.linear(dequant_x, layer.weight, bias)
@@ -60,7 +73,7 @@ class CustomQuantConfig(QuantizationConfig):
         """Name of the quantization method."""
         return "custom_quant"
 
-    def get_supported_act_dtypes(self) -> list["torch.dtype"]:
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
         """List of supported activation dtypes."""
         return [torch.float16, torch.bfloat16]
 
@@ -79,38 +92,46 @@ class CustomQuantConfig(QuantizationConfig):
         """Create a config class from the model's quantization config."""
         return CustomQuantConfig(num_bits=config.get("num_bits", 8))
 
-    def get_quant_method(self, layer: "torch.nn.Module",
-                         prefix: str) -> Optional["FakeQuantLinearMethod"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> FakeQuantLinearMethod | None:
         """Get the quantize method to use for the quantized layer."""
         if isinstance(layer, LinearBase):
             return FakeQuantLinearMethod(num_bits=self.num_bits)
         return None
 
 
-def test_register_quantization_config():
+def test_register_quantization_config(caplog_vllm):
     """Test register custom quantization config."""
 
     # The quantization method `custom_quant` should be registered.
     assert get_quantization_config("custom_quant") == CustomQuantConfig
 
     # The quantization method `custom_quant` is already exists,
-    # should raise an error.
-    with pytest.raises(ValueError):
+    # should raise a warning when re-registering it.
+    with caplog_vllm.at_level(logging.WARNING):
         register_quantization_config("custom_quant")(CustomQuantConfig)
 
+    assert any(
+        "The quantization method 'custom_quant' already exists" in message
+        for message in caplog_vllm.messages
+    ), "Expected a warning when re-registering custom_quant"
+
 
-@pytest.mark.parametrize(argnames="model",
-                         argvalues=[
-                             "meta-llama/Llama-3.2-1B-Instruct",
-                         ])
+@pytest.mark.parametrize(
+    argnames="model",
+    argvalues=[
+        "meta-llama/Llama-3.2-1B-Instruct",
+    ],
+)
 def test_custom_quant(vllm_runner, model, monkeypatch):
     """Test infer with the custom quantization method."""
     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
-    with vllm_runner(model_name=model,
-                     quantization="custom_quant",
-                     enforce_eager=True) as llm:
+    with vllm_runner(
+        model_name=model, quantization="custom_quant", enforce_eager=True
+    ) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py
index bc2b468f97d8c33989dbdf1e4d80d84f6ab58fb7..195f1fbbdfc0c8d3773b7f9e1bcb81d8d1e7dfa3 100644
--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
@@ -1,21 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright © 2025, Oracle and/or its affiliates.
-"""Tests RTN quantization startup and generation, 
+"""Tests RTN quantization startup and generation,
 doesn't test correctness
 """
+
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
 
 MODELS = [
-    "microsoft/Phi-3-mini-4k-instruct",  # dense model
     "ai21labs/Jamba-tiny-dev",  # MoE model
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("rtn"),
-                    reason="RTN is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("rtn"),
+    reason="RTN is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [10])
@@ -27,6 +29,7 @@ def test_model_rtn_startup(
     dtype: str,
     max_tokens: int,
 ) -> None:
-
-    with vllm_runner(model, dtype=dtype, quantization="rtn") as vllm_model:
+    with vllm_runner(
+        model, enforce_eager=True, dtype=dtype, quantization="rtn"
+    ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index 8e68f6a2e019fd30395e27ce2c9ddc17b640ed86..82413f36e997f04392a046776342f6bf4a9717e1 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -13,14 +13,14 @@ TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
-    with vllm_runner("drisspg/fp8-opt-125m",
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     enforce_eager=True) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        "drisspg/fp8-opt-125m",
+        quantization="torchao",
+        dtype="bfloat16",
+        enforce_eager=True,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
-    print(output)
 
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@@ -29,70 +29,372 @@ def test_pre_quantized_model(vllm_runner):
     [
         "cuda:0",
         # {"": "cuda"},
-    ])
-def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
-                                                   pt_load_map_location):
+    ],
+)
+def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_location):
     torch._dynamo.reset()
     model_name = "jerryzh168/opt-125m-int8wo-partial-quant"
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location=pt_load_map_location) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location=pt_load_map_location,
+        enforce_eager=True,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
 
         assert output
-        print(output)
 
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
     torch._dynamo.reset()
     model_name = "jerryzh168/opt-125m-int4wo-per-module"
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0") as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        enforce_eager=True,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
 
         assert output
-        print(output)
 
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
     torch._dynamo.reset()
     model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao"
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0") as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        enforce_eager=True,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
 
         assert output
-        print(output)
 
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 @pytest.mark.skip(
     reason="since torchao nightly is only compatible with torch nightly"
     "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
-    "torchao tests that requires newer versions (0.14.0.dev+) for now")
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
 def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
     torch._dynamo.reset()
-    model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
-                  "-0.14.0.dev")
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0") as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    model_name = "torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2-0.14.0.dev"
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
+
+        assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+def test_online_quant_config_dict_json(vllm_runner):
+    """Testing on the fly quantization, load_weights integration point,
+    with config dict serialized to json string
+    """
+    torch._dynamo.reset()
+    model_name = "facebook/opt-125m"
+
+    import json
+
+    from torchao.core.config import config_to_dict
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+
+    torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
+        granularity=PerRow()
+    )
+    hf_overrides = {
+        "quantization_config_dict_json": json.dumps(
+            config_to_dict(torchao_quant_config)
+        )
+    }
+    with vllm_runner(
+        model_name=model_name,
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        quantization="torchao",
+        hf_overrides=hf_overrides,
+        enforce_eager=True,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
+
+        assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+def test_online_quant_config_file(vllm_runner):
+    """Testing on the fly quantization, load_weights integration point,
+    with config file
+    """
+    torch._dynamo.reset()
+    model_name = "facebook/opt-125m"
+    import json
+    from tempfile import NamedTemporaryFile
+
+    from torchao.core.config import config_to_dict
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+
+    config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+
+    with NamedTemporaryFile(mode="w", delete=False) as f:
+        f.write(json.dumps(config_to_dict(config)))
+        # close the file to save it
+        f.close()
+        config_file_name = str(f.name)
+
+        hf_overrides = {"quantization_config_file": config_file_name}
+        with vllm_runner(
+            model_name=model_name,
+            dtype="bfloat16",
+            pt_load_map_location="cuda:0",
+            quantization="torchao",
+            hf_overrides=hf_overrides,
+            enforce_eager=True,
+        ) as llm:
+            output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
+
+            assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+def test_reload_weights():
+    import json
+
+    from torchao.core.config import config_to_dict
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+
+    from vllm import LLM, SamplingParams
+
+    torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
+        granularity=PerRow()
+    )
+
+    hf_overrides = {
+        "quantization_config_dict_json": json.dumps(
+            config_to_dict(torchao_quant_config)
+        )
+    }
+
+    llm = LLM(
+        model="Qwen/Qwen3-0.6B",
+        dtype="bfloat16",
+        load_format="dummy",
+        enforce_eager=True,
+        quantization="torchao",
+        hf_overrides=hf_overrides,
+    )
+    # Update load format from `dummy` to `auto`
+    llm.collective_rpc(
+        "update_config", args=({"load_config": {"load_format": "auto"}},)
+    )
+    # Now reload real weights inplace
+    llm.collective_rpc("reload_weights")
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0, top_p=0.95)
+    outputs = llm.generate(prompts, sampling_params)
+    # make sure it runs
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        assert generated_text
+        # can also uncomment locally to make sure the generated
+        # output makes sense
+        # prompt = output.prompt
+        # print(f"Prompt:    {prompt!r}")
+        # print(f"Output:    {generated_text!r}")
+        # print("-" * 60)
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_runner):
+    torch._dynamo.reset()
+    model_name = (
+        "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
+    )
+    with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
+
+        assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
+    torch._dynamo.reset()
+    model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
+    with vllm_runner(
+        model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
+
+    assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypatch):
+    """We load a model with Int4Tensor (plain format) linear weights
+    and verify that the weight is updated to Int4PreshuffledTensor
+    after loading in vllm
+    """
+    from torchao.quantization import Int4PreshuffledTensor
+    from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90
+
+    torch._dynamo.reset()
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+    model_name = "torchao-testing/opt-125m-Int4WeightOnlyConfig-v2-0.14.0.dev"
+    # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't
+    # have meta kernel implemented yet, can remove this flag after that is implemented
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        enforce_eager=True,
+    ) as llm:
+
+        def has_int4_preshuffled_tensor_weight(model):
+            return isinstance(
+                model.model.decoder.layers[0].self_attn.qkv_proj.weight,
+                Int4PreshuffledTensor,
+            )
+
+        def get_weight_attrs(model):
+            weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight
+            return [
+                weight.requires_grad,
+                weight.input_dim,
+                weight.output_dim,
+                hasattr(weight, "weight_loader"),
+            ]
+
+        llm_engine = llm.get_llm().llm_engine
+        has_int4_preshuffled_tensor = any(
+            llm_engine.apply_model(has_int4_preshuffled_tensor_weight)
+        )
+        weight_attrs = llm_engine.apply_model(get_weight_attrs)[0]
+
+        # making sure we are using Int4PreshuffledTensor on H100 GPU, when
+        # fbgemm_gpu_genai
+        # library is installed, otherwise it should be using Int4Tensor
+        if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90():
+            assert has_int4_preshuffled_tensor
+        else:
+            assert not has_int4_preshuffled_tensor
+
+        assert weight_attrs == [False, 1, 0, True]
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+
+        assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
+    vllm_runner, monkeypatch
+):
+    """We load a bf16 model and online quantize the model to int4, then verify that
+    the weights are updated to Int4PreshuffledTensor after online quantization
+    """
+    from torchao.quantization import Int4PreshuffledTensor
+    from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90
+
+    torch._dynamo.reset()
+    model_name = "facebook/opt-125m"
+
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+
+    import json
+
+    from torchao.core.config import config_to_dict
+    from torchao.quantization import Int4WeightOnlyConfig
+
+    torchao_quant_config = Int4WeightOnlyConfig(
+        group_size=128, int4_packing_format="plain"
+    )
+    hf_overrides = {
+        "quantization_config_dict_json": json.dumps(
+            config_to_dict(torchao_quant_config)
+        )
+    }
+
+    # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't
+    # have meta kernel implemented yet, can remove this flag after that is implemented
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        hf_overrides=hf_overrides,
+        enforce_eager=True,
+    ) as llm:
+
+        def has_int4_preshuffled_tensor_weight(model):
+            return isinstance(
+                model.model.decoder.layers[0].self_attn.qkv_proj.weight,
+                Int4PreshuffledTensor,
+            )
+
+        def get_weight_attrs(model):
+            weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight
+            return [
+                weight.requires_grad,
+                weight.input_dim,
+                weight.output_dim,
+                hasattr(weight, "weight_loader"),
+            ]
+
+        llm_engine = llm.get_llm().llm_engine
+        has_int4_preshuffled_tensor = any(
+            llm_engine.apply_model(has_int4_preshuffled_tensor_weight)
+        )
+        weight_attrs = llm_engine.apply_model(get_weight_attrs)[0]
+
+        # making sure we are using Int4PreshuffledTensor on H100 GPU, when
+        # fbgemm_gpu_genai
+        # library is installed, otherwise it should be using Int4Tensor
+        if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90():
+            assert has_int4_preshuffled_tensor
+        else:
+            assert not has_int4_preshuffled_tensor
+
+        assert weight_attrs == [False, 1, 0, True]
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
 
         assert output
-        print(output)
 
 
 if __name__ == "__main__":
diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
index 6a939dcfc2c9c7c1d5b8fa6eac3657350b130a7e..d31b1c7d169b77812f9fd1e5ea7f2cc592411d08 100644
--- a/tests/reasoning/test_base_thinking_reasoning_parser.py
+++ b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -44,9 +44,7 @@ def test_tokenizer():
     # Add custom test tokens
     test_tokens = ["<test:think>", "</test:think>", "<alt:start>", "<alt:end>"]
     existing_tokens = set(tokenizer.get_vocab().keys())
-    new_tokens = [
-        token for token in test_tokens if token not in existing_tokens
-    ]
+    new_tokens = [token for token in test_tokens if token not in existing_tokens]
     if new_tokens:
         tokenizer.add_tokens(new_tokens)
     return tokenizer
@@ -54,8 +52,8 @@ def test_tokenizer():
 
 class TestBaseThinkingReasoningParserInit:
     """
-        Test initialization and basic properties of
-        BaseThinkingReasoningParser.
+    Test initialization and basic properties of
+    BaseThinkingReasoningParser.
     """
 
     def test_successful_initialization(self, test_tokenizer):
@@ -76,7 +74,6 @@ class TestBaseThinkingReasoningParserInit:
 
         # Create a parser with tokens not in vocabulary
         class MissingTokenParser(BaseThinkingReasoningParser):
-
             @property
             def start_token(self) -> str:
                 return "<missing:start>"
@@ -85,15 +82,15 @@ class TestBaseThinkingReasoningParserInit:
             def end_token(self) -> str:
                 return "<missing:end>"
 
-        with pytest.raises(RuntimeError,
-                           match="could not locate think start/end tokens"):
+        with pytest.raises(
+            RuntimeError, match="could not locate think start/end tokens"
+        ):
             MissingTokenParser(test_tokenizer)
 
     def test_initialization_with_empty_tokens(self, test_tokenizer):
         """Test that initialization fails with empty token strings."""
 
         class EmptyTokenParser(BaseThinkingReasoningParser):
-
             @property
             def start_token(self) -> str:
                 return ""
@@ -102,8 +99,9 @@ class TestBaseThinkingReasoningParserInit:
             def end_token(self) -> str:
                 return ""
 
-        with pytest.raises(ValueError,
-                           match="start_token and end_token must be defined"):
+        with pytest.raises(
+            ValueError, match="start_token and end_token must be defined"
+        ):
             EmptyTokenParser(test_tokenizer)
 
 
@@ -153,63 +151,57 @@ class TestBaseThinkingReasoningParserMethods:
 class TestBaseThinkingReasoningParserExtraction:
     """Test reasoning content extraction methods."""
 
-    def test_extract_reasoning_content_with_both_tokens(self, test_tokenizer):
+    def test_extract_reasoning_with_both_tokens(self, test_tokenizer):
         """Test extraction when both start and end tokens are present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
-        model_output = ("<test:think>This is reasoning"
-                        "</test:think>This is content")
-        reasoning, content = parser.extract_reasoning_content(
-            model_output, request)
+        model_output = "<test:think>This is reasoning</test:think>This is content"
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is reasoning"
         assert content == "This is content"
 
-    def test_extract_reasoning_content_only_end_token(self, test_tokenizer):
+    def test_extract_reasoning_only_end_token(self, test_tokenizer):
         """Test extraction when only end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
-        model_output = ("This is reasoning</test:think>This is content")
-        reasoning, content = parser.extract_reasoning_content(
-            model_output, request)
+        model_output = "This is reasoning</test:think>This is content"
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is reasoning"
         assert content == "This is content"
 
-    def test_extract_reasoning_content_no_end_token(self, test_tokenizer):
+    def test_extract_reasoning_no_end_token(self, test_tokenizer):
         """Test extraction when no end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "This is just content"
-        reasoning, content = parser.extract_reasoning_content(
-            model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is just content"
         assert content is None
 
-    def test_extract_reasoning_content_empty_output(self, test_tokenizer):
+    def test_extract_reasoning_empty_output(self, test_tokenizer):
         """Test extraction with empty output."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = ""
-        reasoning, content = parser.extract_reasoning_content(
-            model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == ""
         assert content is None
 
-    def test_extract_reasoning_content_only_tokens(self, test_tokenizer):
+    def test_extract_reasoning_only_tokens(self, test_tokenizer):
         """Test extraction with only tokens and no content."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
-        model_output = ("<test:think></test:think>")
-        reasoning, content = parser.extract_reasoning_content(
-            model_output, request)
+        model_output = "<test:think></test:think>"
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == ""
         assert content is None
@@ -221,19 +213,24 @@ class TestBaseThinkingReasoningParserStreaming:
     @pytest.mark.parametrize("streaming", [True, False])
     def test_simple_reasoning_extraction(self, test_tokenizer, streaming):
         """
-            Test basic reasoning extraction in both
-            streaming and non-streaming modes.
+        Test basic reasoning extraction in both
+        streaming and non-streaming modes.
         """
         parser = TestThinkingReasoningParser(test_tokenizer)
 
         model_output = [
-            "<test:think>", "Some ", "reasoning ", "content", "</test:think>",
-            "Final ", "answer"
+            "<test:think>",
+            "Some ",
+            "reasoning ",
+            "content",
+            "</test:think>",
+            "Final ",
+            "answer",
         ]
 
-        reasoning, content = run_reasoning_extraction(parser,
-                                                      model_output,
-                                                      streaming=streaming)
+        reasoning, content = run_reasoning_extraction(
+            parser, model_output, streaming=streaming
+        )
 
         assert reasoning == "Some reasoning content"
         assert content == "Final answer"
@@ -252,9 +249,7 @@ class TestBaseThinkingReasoningParserStreaming:
             "answer",
         ]
 
-        reasoning, content = run_reasoning_extraction(parser,
-                                                      deltas,
-                                                      streaming=True)
+        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
 
         assert reasoning == "Some reasoning content"
         assert content == "Final answer"
@@ -271,9 +266,7 @@ class TestBaseThinkingReasoningParserStreaming:
             "Answer",
         ]
 
-        reasoning, content = run_reasoning_extraction(parser,
-                                                      deltas,
-                                                      streaming=True)
+        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
 
         assert reasoning == "Some reasoning"
         assert content == "Answer"
@@ -290,9 +283,7 @@ class TestBaseThinkingReasoningParserStreaming:
             "end",
         ]
 
-        reasoning, content = run_reasoning_extraction(parser,
-                                                      deltas,
-                                                      streaming=True)
+        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
 
         assert reasoning == "Some reasoning without end"
         assert content is None
@@ -309,9 +300,7 @@ class TestBaseThinkingReasoningParserStreaming:
             "Final",
         ]
 
-        reasoning, content = run_reasoning_extraction(parser,
-                                                      deltas,
-                                                      streaming=True)
+        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
 
         assert reasoning == "Reasoning content"
         assert content == "Final"
@@ -319,29 +308,27 @@ class TestBaseThinkingReasoningParserStreaming:
 
 class TestBaseThinkingReasoningParserMultipleImplementations:
     """
-        Test that multiple implementations of
-        BaseThinkingReasoningParser work correctly.
+    Test that multiple implementations of
+    BaseThinkingReasoningParser work correctly.
     """
 
     def test_different_token_implementations(self, test_tokenizer):
         """
-            Test that different implementations 
-            with different tokens work independently.
+        Test that different implementations
+        with different tokens work independently.
         """
         parser1 = TestThinkingReasoningParser(test_tokenizer)
         parser2 = TestThinkingReasoningParserAlt(test_tokenizer)
 
         # Test parser1
-        model_output1 = ("Reasoning1</test:think>Content1")
-        reasoning1, content1 = run_reasoning_extraction(
-            parser1, [model_output1])
+        model_output1 = "Reasoning1</test:think>Content1"
+        reasoning1, content1 = run_reasoning_extraction(parser1, [model_output1])
         assert reasoning1 == "Reasoning1"
         assert content1 == "Content1"
 
         # Test parser2
         model_output2 = "Reasoning2<alt:end>Content2"
-        reasoning2, content2 = run_reasoning_extraction(
-            parser2, [model_output2])
+        reasoning2, content2 = run_reasoning_extraction(parser2, [model_output2])
         assert reasoning2 == "Reasoning2"
         assert content2 == "Content2"
 
@@ -359,7 +346,7 @@ class TestBaseThinkingReasoningParserEdgeCases:
         """Test behavior with multiple end tokens."""
         parser = TestThinkingReasoningParser(test_tokenizer)
 
-        model_output = ("First</test:think>Middle</test:think>Last")
+        model_output = "First</test:think>Middle</test:think>Last"
         reasoning, content = run_reasoning_extraction(parser, [model_output])
 
         # Should stop at first end token
@@ -370,8 +357,7 @@ class TestBaseThinkingReasoningParserEdgeCases:
         """Test behavior with nested-like token patterns."""
         parser = TestThinkingReasoningParser(test_tokenizer)
 
-        model_output = ("<test:think>Outer"
-                        "<test:think>Inner</test:think>Content")
+        model_output = "<test:think>Outer<test:think>Inner</test:think>Content"
         reasoning, content = run_reasoning_extraction(parser, [model_output])
 
         # Should process normally, start from first start token
@@ -382,11 +368,9 @@ class TestBaseThinkingReasoningParserEdgeCases:
         """Test behavior with malformed token-like strings."""
         parser = TestThinkingReasoningParser(test_tokenizer)
 
-        model_output = ("<test:thinking>Not a real token"
-                        "</test:thinking>Content")
+        model_output = "<test:thinking>Not a real token</test:thinking>Content"
         reasoning, content = run_reasoning_extraction(parser, [model_output])
 
         # Should treat as regular content since tokens don't match exactly
-        assert reasoning == ("<test:thinking>Not a real token"
-                             "</test:thinking>Content")
+        assert reasoning == ("<test:thinking>Not a real token</test:thinking>Content")
         assert content is None
diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py
index 987f3c48de0c0c592f230f65643870fb4940d91b..91f0c93653d3294420dad50a1aa12485b83c0002 100644
--- a/tests/reasoning/test_deepseekr1_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py
@@ -21,97 +21,97 @@ def deepseek_r1_qwen_tokenizer():
 
 SIMPLE_REASONING = {
     "output": "This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "<think>This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -121,7 +121,7 @@ NEW_LINE = {
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -259,17 +259,17 @@ def test_reasoning(
     output = deepseek_r1_qwen_tokenizer.tokenize(param_dict["output"])
     # decode everything to tokens
     output_tokens: list[str] = [
-        deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token])
-        for token in output
+        deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token]) for token in output
     ]
-    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(deepseek_r1_qwen_tokenizer)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        deepseek_r1_qwen_tokenizer
+    )
 
-    reasoning, content = run_reasoning_extraction(parser,
-                                                  output_tokens,
-                                                  streaming=streaming)
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     # Test is_reasoning_end
@@ -281,7 +281,8 @@ def test_reasoning(
     if param_dict["content"] is not None:
         content = parser.extract_content_ids(output_ids)
         assert content == deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(
-            deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"]))
+            deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"])
+        )
     else:
         content = parser.extract_content_ids(output)
         assert content == []
diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e8f0e8dcc9b9cf7598eda1cf49a5390bdf6bdd9
--- /dev/null
+++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
+
+REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1"
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+@pytest.mark.parametrize(
+    "thinking,expected_parser_type",
+    [
+        (True, DeepSeekR1ReasoningParser),
+        (False, IdentityReasoningParser),
+    ],
+)
+def test_parser_selection(tokenizer, thinking, expected_parser_type):
+    parser = DeepSeekV3ReasoningParser(
+        tokenizer, chat_template_kwargs={"thinking": thinking}
+    )
+
+    assert isinstance(parser._parser, expected_parser_type)
+
+
+def test_identity_reasoning_parser_basic(tokenizer):
+    parser = IdentityReasoningParser(tokenizer)
+
+    # Test is_reasoning_end always returns True
+    input_text = "This is some output"
+    input_tokens = tokenizer.tokenize(input_text)
+    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
+    assert parser.is_reasoning_end(input_ids) is True
+
+    # Test extract_content_ids returns all input_ids
+    assert parser.extract_content_ids(input_ids) == input_ids
+
+    # Test extract_reasoning returns (None, model_output)
+    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
+    reasoning, content = parser.extract_reasoning(input_text, request)
+    assert reasoning is None
+    assert content == input_text
+
+    # Test extract_reasoning_streaming returns DeltaMessage or None
+    result = parser.extract_reasoning_streaming(
+        previous_text="",
+        current_text="Hello world",
+        delta_text="Hello world",
+        previous_token_ids=[],
+        current_token_ids=input_ids,
+        delta_token_ids=input_ids,
+    )
+    assert isinstance(result, DeltaMessage)
+    assert result.content == "Hello world"
+
+    # If delta_text is empty, should return None
+    result_none = parser.extract_reasoning_streaming(
+        previous_text="Hello world",
+        current_text="Hello world",
+        delta_text="",
+        previous_token_ids=input_ids,
+        current_token_ids=input_ids,
+        delta_token_ids=[],
+    )
+    assert result_none is None
diff --git a/tests/reasoning/test_ernie45_reasoning_parser.py b/tests/reasoning/test_ernie45_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbf5507ae68badc11f0d392ffb2f5c893bf770fc
--- /dev/null
+++ b/tests/reasoning/test_ernie45_reasoning_parser.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "ernie45"
+
+REASONING_MODEL_NAME = "baidu/ERNIE-4.5-21B-A3B-Thinking"
+
+
+@pytest.fixture(scope="module")
+def ernie45_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+# 带 </think>，非stream
+WITH_THINK = {
+    "output": "abc</think>def",
+    "reasoning": "abc",
+    "content": "def",
+}
+# 带 </think>，stream
+WITH_THINK_STREAM = {
+    "output": "abc</think>def",
+    "reasoning": "abc",
+    "content": "def",
+}
+# without </think>, all is reasoning
+WITHOUT_THINK = {
+    "output": "abc",
+    "reasoning": "abc",
+    "content": None,
+}
+# without </think>, all is reasoning
+WITHOUT_THINK_STREAM = {
+    "output": "abc",
+    "reasoning": "abc",
+    "content": None,
+}
+
+COMPLETE_REASONING = {
+    "output": "abc</think>",
+    "reasoning": "abc",
+    "content": None,
+}
+MULTILINE_REASONING = {
+    "output": "abc\nABC</think>def\nDEF",
+    "reasoning": "abc\nABC",
+    "content": "def\nDEF",
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        WITH_THINK,
+        id="with_think",
+    ),
+    pytest.param(
+        True,
+        WITH_THINK_STREAM,
+        id="with_think_stream",
+    ),
+    pytest.param(
+        False,
+        WITHOUT_THINK,
+        id="without_think",
+    ),
+    pytest.param(
+        True,
+        WITHOUT_THINK_STREAM,
+        id="without_think_stream",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        MULTILINE_REASONING,
+        id="multiline_reasoning",
+    ),
+    pytest.param(
+        True,
+        MULTILINE_REASONING,
+        id="multiline_reasoning_stream",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    ernie45_tokenizer,
+):
+    output = ernie45_tokenizer.tokenize(param_dict["output"])
+    output_tokens: list[str] = []
+    for token in output:
+        one_token = ernie45_tokenizer.convert_tokens_to_string([token])
+        if one_token:
+            output_tokens.append(one_token)
+
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        ernie45_tokenizer
+    )
+
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+
+    print()
+
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
diff --git a/tests/reasoning/test_glm4_moe_reasoning_parser.py b/tests/reasoning/test_glm4_moe_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f7827e5b8277c689f529ef54361e99b14c264b1
--- /dev/null
+++ b/tests/reasoning/test_glm4_moe_reasoning_parser.py
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "glm45"
+start_token = "<think>"
+end_token = "</think>"
+
+REASONING_MODEL_NAME = "zai-org/GLM-4.5"
+
+
+@pytest.fixture(scope="module")
+def glm45_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+
+WITH_THINK_STREAM = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+
+WITHOUT_THINK = {
+    "output": "This is the rest",
+    "reasoning": None,
+    "content": "This is the rest",
+    "is_reasoning_end": False,
+}
+
+WITHOUT_THINK_STREAM = {
+    "output": "This is the rest",
+    "reasoning": None,
+    "content": "This is the rest",
+    "is_reasoning_end": False,
+}
+
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": True,
+}
+MULTILINE_REASONING = {
+    "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
+    "reasoning": "This is a reasoning\nsection",
+    "content": "This is the rest\nThat",
+    "is_reasoning_end": True,
+}
+ONLY_OPEN_TAG = {
+    "output": "<think>This is a reasoning section",
+    "reasoning": None,
+    "content": "<think>This is a reasoning section",
+    "is_reasoning_end": False,
+}
+
+ONLY_OPEN_TAG_STREAM = {
+    "output": "<think>This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": False,
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        WITH_THINK,
+        id="with_think",
+    ),
+    pytest.param(
+        True,
+        WITH_THINK_STREAM,
+        id="with_think_stream",
+    ),
+    pytest.param(
+        False,
+        WITHOUT_THINK,
+        id="without_think",
+    ),
+    pytest.param(
+        True,
+        WITHOUT_THINK_STREAM,
+        id="without_think_stream",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        MULTILINE_REASONING,
+        id="multiline_reasoning",
+    ),
+    pytest.param(
+        True,
+        MULTILINE_REASONING,
+        id="multiline_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        ONLY_OPEN_TAG,
+        id="only_open_tag",
+    ),
+    pytest.param(
+        True,
+        ONLY_OPEN_TAG_STREAM,
+        id="only_open_tag_stream",
+    ),
+]
+
+STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think>The user is asking for the capital of"""
+
+DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think>The user is asking for the capital of France.</think>
+The capital of France is Paris."""
+
+MULTI_TURN_STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think></think>
+The capital of France is Paris.<|user|>
+What about Chile?<|assistant|>
+<think>The user is asking for the capital of"""
+
+MULTI_TURN_DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think></think>
+The capital of France is Paris.<|user|>
+What about Chile?<|assistant|>
+<think>The user is asking for the capital of Chile.</think>
+The capital of Chile is Santiago."""
+
+REASONING_END_TEST_CASES = [
+    pytest.param(STILL_REASONING_PROMPT, False, id="still_reasoning"),
+    pytest.param(DONE_REASONING_PROMPT, True, id="done_reasoning"),
+    pytest.param(
+        MULTI_TURN_STILL_REASONING_PROMPT, False, id="multi_turn_still_reasoning"
+    ),
+    pytest.param(
+        MULTI_TURN_DONE_REASONING_PROMPT, True, id="multi_turn_done_reasoning"
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    glm45_tokenizer,
+):
+    output = glm45_tokenizer.tokenize(param_dict["output"])
+    output_tokens: list[str] = [
+        glm45_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        glm45_tokenizer
+    )
+
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
+
+    output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
+
+
+@pytest.mark.parametrize("prompt, is_reasoning_end", REASONING_END_TEST_CASES)
+def test_is_reasoning_end_full_prompt(
+    prompt: str, is_reasoning_end: bool, glm45_tokenizer
+):
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        glm45_tokenizer
+    )
+    tokens = glm45_tokenizer.tokenize(prompt)
+    token_ids = glm45_tokenizer.convert_tokens_to_ids(tokens)
+    check_is_reasoning_end = parser.is_reasoning_end(token_ids)
+    assert check_is_reasoning_end == is_reasoning_end
diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py b/tests/reasoning/test_gptoss_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..873135d5717f860a775fd26ab456263e1941440a
--- /dev/null
+++ b/tests/reasoning/test_gptoss_reasoning_parser.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.reasoning import ReasoningParser
+from vllm.reasoning.gptoss_reasoning_parser import GptOssReasoningParser
+
+REASONING_MODEL_NAME = "openai/gpt-oss-120b"
+
+
+@pytest.fixture(scope="module")
+def gpt_oss_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+USER_MESSAGE_START = "<|start|>user<|message|>"
+REASONING_SECTION_START = "<|end|><|start|>assistant<|channel|>analysis<|message|>"
+ASSISTANT_CONTENT_START_PREFIX = "<|end|><|start|>assistant<|channel|>final"
+ASSISTANT_CONTENT_START_SUFFIX = "<|message|>"
+ASSISTANT_CONTENT_START = (
+    ASSISTANT_CONTENT_START_PREFIX + ASSISTANT_CONTENT_START_SUFFIX
+)
+
+BASIC_CONTENT = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START
+    + "This is the rest",
+    "is_reasoning_end": True,
+}
+
+BASIC_REASONING_ONLY = {
+    "output": REASONING_SECTION_START + "This is reasoning" + "<|end|>",
+    "is_reasoning_end": False,
+}
+BASIC_NO_REASONING_NO_ASSISTANT = {
+    "output": USER_MESSAGE_START + "This is a user message",
+    "is_reasoning_end": False,
+}
+
+# Edge-case where the model omits the assistant tag entirely.
+BASIC_NO_REASONING_ASSISTANT = {
+    "output": USER_MESSAGE_START + "This is a user message<|end|><|channel|>final",
+    "is_reasoning_end": True,
+}
+
+COMPLEX_CONTENT_INCOMPLETE_PREFIX_ONLY = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX,
+    "is_reasoning_end": False,
+}
+
+COMPLEX_CONTENT_SUFFIX_ONLY = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_SUFFIX,
+    "is_reasoning_end": False,
+}
+
+COMPLEX_CONTENT_1_NO_SUFFIX = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|> JSON ",
+    "is_reasoning_end": False,
+}
+
+COMPLEX_CONTENT_1 = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|> JSON "
+    + ASSISTANT_CONTENT_START_SUFFIX,
+    "is_reasoning_end": True,
+}
+
+COMPLEX_CONTENT_1_WITH_CONTENT = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|> JSON "
+    + ASSISTANT_CONTENT_START_SUFFIX
+    + "This is the rest",
+    "is_reasoning_end": True,
+}
+
+COMPLEX_CONTENT_2 = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|>ReplyAction "
+    + ASSISTANT_CONTENT_START_SUFFIX
+    + "This is the rest",
+    "is_reasoning_end": True,
+}
+
+TEST_CASES = [
+    BASIC_CONTENT,
+    BASIC_REASONING_ONLY,
+    COMPLEX_CONTENT_INCOMPLETE_PREFIX_ONLY,
+    COMPLEX_CONTENT_SUFFIX_ONLY,
+    COMPLEX_CONTENT_1_NO_SUFFIX,
+    COMPLEX_CONTENT_1,
+    COMPLEX_CONTENT_1_WITH_CONTENT,
+    COMPLEX_CONTENT_2,
+]
+
+
+@pytest.mark.parametrize(
+    "output, is_reasoning_end",
+    [(t["output"], t["is_reasoning_end"]) for t in TEST_CASES],
+)
+def test_gptoss_is_reasoning_end(
+    output,
+    is_reasoning_end,
+    gpt_oss_tokenizer,
+):
+    output = gpt_oss_tokenizer.tokenize(output)
+    parser: ReasoningParser = GptOssReasoningParser(gpt_oss_tokenizer)
+
+    # Test is_reasoning_end
+    output_ids = gpt_oss_tokenizer.convert_tokens_to_ids(output)
+    actual_is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == actual_is_reasoning_end
diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py
index 38cab73a45f22b711566680c629e4d1d13ef5cc6..14aad3ad0818999228d22a66a7fb3f06e800ec4c 100644
--- a/tests/reasoning/test_granite_reasoning_parser.py
+++ b/tests/reasoning/test_granite_reasoning_parser.py
@@ -11,42 +11,38 @@ START_REASONING = "Here is my thought process:"
 START_RESPONSE = "Here is my response:"
 
 SIMPLE_REASONING = {
-    "output":
-    f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  #noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
-    "output":
-    f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
-    "output":
-    f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  #noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
-    "output":
-    f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 
@@ -137,14 +133,15 @@ def test_reasoning(
     output_tokens: list[str] = [
         tokenizer.convert_tokens_to_string([token]) for token in output
     ]
-    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(tokenizer)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        tokenizer
+    )
 
-    reasoning, content = run_reasoning_extraction(parser,
-                                                  output_tokens,
-                                                  streaming=streaming)
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
 
@@ -158,7 +155,7 @@ STREAMING_1 = {
     "previous_text": None,
     "current_text": "Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # When we fail, we should give what was previously being silenced first
@@ -166,7 +163,7 @@ STREAMING_2 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought failure",
     "delta_text": " failure",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "Here is my thought failure",
 }
 # But then after the first one, we should only add the delta text to content
@@ -174,7 +171,7 @@ STREAMING_3 = {
     "previous_text": "Here wrong",
     "current_text": " words",
     "delta_text": " Here wrong words",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " words",
 }
 # But then after the first one, we should only add the delta text to content
@@ -182,7 +179,7 @@ STREAMING_4 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought process:",
     "delta_text": " process:",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # Reasoning started successfully; parse reasoning content
@@ -190,7 +187,7 @@ STREAMING_5 = {
     "previous_text": "Here is my thought process:",
     "current_text": "Here is my thought process: foo",
     "delta_text": " foo",
-    "reasoning_content": " foo",
+    "reasoning": " foo",
     "content": None,
 }
 # Response special sequence has started, but not finished.
@@ -198,7 +195,7 @@ STREAMING_6 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo Here is",
     "delta_text": " Here is",
-    "reasoning_content": " ",
+    "reasoning": " ",
     "content": None,
 }
 # Response special sequence started, but was broken; the reasoning
@@ -207,7 +204,7 @@ STREAMING_7 = {
     "previous_text": "Here is my thought process: foo Here is",
     "current_text": "Here is my thought process: foo Here is Here",
     "delta_text": " Here",
-    "reasoning_content": "Here is ",
+    "reasoning": "Here is ",
     "content": None,
 }
 # Response special sequence is ongoing
@@ -215,7 +212,7 @@ STREAMING_8 = {
     "previous_text": "Here is my thought process: foo Here is my response:",
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": " bar",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " bar",
 }
 # The delta text has everything; we should be able to correctly parse both
@@ -223,26 +220,23 @@ STREAMING_9 = {
     "previous_text": None,
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": "Here is my thought process: foo Here is my response: bar",
-    "reasoning_content": " foo ",
+    "reasoning": " foo ",
     "content": " bar",
 }
 ## The Response is ongoing, and the delta mixes reasoning content / content
 STREAMING_10 = {
     "previous_text": "Here is my thought process: foo",
-    "current_text":
-    "Here is my thought process: foo bar Here is my response: baz",
+    "current_text": "Here is my thought process: foo bar Here is my response: baz",
     "delta_text": " bar Here is my response: baz",
-    "reasoning_content": " bar ",
+    "reasoning": " bar ",
     "content": " baz",
 }
 # The delta text starts a new substring that might be a response special seq
 STREAMING_11 = {
-    "previous_text":
-    "Here is my thought process: This is a reasoning section ",
-    "current_text":
-    "Here is my thought process: This is a reasoning section Here",
+    "previous_text": "Here is my thought process: This is a reasoning section ",
+    "current_text": "Here is my thought process: This is a reasoning section Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # The delta text is finishing the response special seq
@@ -250,14 +244,14 @@ STREAMING_12 = {
     "previous_text": "Here is my thought process: foo Here is my response",
     "current_text": "Here is my thought process: foo Here is my response:",
     "delta_text": ":",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 STREAMING_13 = {
     "previous_text": "Here is my thought process: foo Here",
     "current_text": "Here is my thought process: foo Here was",
     "delta_text": " was",
-    "reasoning_content": "Here was",
+    "reasoning": "Here was",
     "content": None,
 }
 
@@ -320,16 +314,19 @@ STREAMING_SUBCASES = [
 @pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
 def test_streaming_subcases(param_dict):
     # Get all of the token IDs
-    previous_token_ids = tokenizer.encode(
-        param_dict["previous_text"]
-    ) if param_dict["previous_text"] is not None else []
+    previous_token_ids = (
+        tokenizer.encode(param_dict["previous_text"])
+        if param_dict["previous_text"] is not None
+        else []
+    )
     current_token_ids = tokenizer.encode(param_dict["current_text"])
     delta_token_ids = tokenizer.encode(param_dict["delta_text"])
 
-    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(tokenizer)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        tokenizer
+    )
 
-    response = parser.extract_reasoning_content_streaming(
+    response = parser.extract_reasoning_streaming(
         previous_text=param_dict["previous_text"],
         current_text=param_dict["current_text"],
         delta_text=param_dict["delta_text"],
@@ -339,10 +336,9 @@ def test_streaming_subcases(param_dict):
     )
     # Streaming currently expects at least one of reasoning content / content,
     # so the response should return None in that case.
-    if param_dict["reasoning_content"] is None and param_dict[
-            "content"] is None:
+    if param_dict["reasoning"] is None and param_dict["content"] is None:
         assert response is None
     else:
         assert isinstance(response, DeltaMessage)
-        assert param_dict["reasoning_content"] == response.reasoning_content
+        assert param_dict["reasoning"] == response.reasoning
         assert param_dict["content"] == response.content
diff --git a/tests/reasoning/test_hunyuan_reasoning_parser.py b/tests/reasoning/test_hunyuan_reasoning_parser.py
index f9238267f02ed390991d26495b2829efbf95a2a3..32e753d2abb721788523fb8d27fb15fc8a75f82e 100644
--- a/tests/reasoning/test_hunyuan_reasoning_parser.py
+++ b/tests/reasoning/test_hunyuan_reasoning_parser.py
@@ -13,55 +13,50 @@ START_RESPONSE = "\n</think>\n<answer>\n"
 END_RESPONSE = "\n</answer>"
 
 NO_REASONING_QUICK_THROUGHT = {
-    "output":
-    f"{START_REASONING}{START_RESPONSE}This is the rest{END_RESPONSE}",  #noqa: E501
-    "reasoning_content": None,
+    "output": f"{START_REASONING}{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
+    "reasoning": None,
     "content": "This is the rest",
 }
 
 SIMPLE_REASONING = {
-    "output":
-    f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest{END_RESPONSE}",  #noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 
 COMPLETE_REASONING_WITH_SYMBOL = {
     "output": f"{START_REASONING}This is a reasoning section!{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section!",
+    "reasoning": "This is a reasoning section!",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
-    "output":
-    f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
-    "output":
-    f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  #noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
-    "output":
-    f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 
@@ -122,9 +117,7 @@ TEST_CASES = [
         NO_REASONING,
         id="no_reasoning_streaming",
     ),
-    pytest.param(True,
-                 NO_REASONING_QUICK_THROUGHT,
-                 id="no_reasoning_quick_stream"),
+    pytest.param(True, NO_REASONING_QUICK_THROUGHT, id="no_reasoning_quick_stream"),
     pytest.param(
         True,
         MULTIPLE_LINES,
@@ -148,8 +141,9 @@ TEST_CASES = [
 ]
 
 # Global tokenizer initialization to avoid repeated loading
-tokenizer = AutoTokenizer.from_pretrained("tencent/Hunyuan-A13B-Instruct",
-                                          trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(
+    "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
+)
 
 
 @pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
@@ -162,12 +156,13 @@ def test_reasoning(
     output_tokens: list[str] = [
         tokenizer.convert_tokens_to_string([token]) for token in output
     ]
-    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(tokenizer)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        tokenizer
+    )
 
-    reasoning, content = run_reasoning_extraction(parser,
-                                                  output_tokens,
-                                                  streaming=streaming)
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py
index 91a22f6f5d7202bea4d51b64729eca6e9137ae7c..5163c863863a7d5d17fdf6e2e18c77f5f4e71b41 100644
--- a/tests/reasoning/test_mistral_reasoning_parser.py
+++ b/tests/reasoning/test_mistral_reasoning_parser.py
@@ -2,9 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from mistral_common.tokens.tokenizers.base import SpecialTokens
-from mistral_common.tokens.tokenizers.tekken import (SpecialTokenInfo,
-                                                     Tekkenizer)
 
 from tests.reasoning.utils import run_reasoning_extraction_mistral
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
@@ -15,125 +12,105 @@ parser_name = "mistral"
 
 @pytest.fixture(scope="module")
 def mistral_tokenizer():
-    # TODO(Julien): upon model release change to a tokenizer already configured.
-    # =================================================================
     mistral_tokenizer = MistralTokenizer.from_pretrained(
-        "mistralai/Devstral-Small-2507")
-    assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
-    # Add think special tokens to the tokenizer
-    mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
-        rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
-    mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
-        rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
-        k: v
-        for k, v in
-        mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
-        if v not in {35, 36}
-    }
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.begin_think.value] = 35
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.end_think.value] = 36
-    mistral_tokenizer.instruct.BEGIN_THINK = 35
-    mistral_tokenizer.instruct.END_THINK = 36
-    # =================================================================
+        "mistralai/Magistral-Small-2509"
+    )
     return mistral_tokenizer
 
 
 SIMPLE_REASONING = {
     "output": "This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "[THINK]This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "[THINK]This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -143,7 +120,7 @@ NEW_LINE = {
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -290,41 +267,47 @@ def test_mistral_reasoning(
     if index_think != -1:
         output_before_think = output[:index_think]
         output_tokens += mistral_tokenizer.tokenizer.encode(
-            output_before_think, False, False)
+            output_before_think, False, False
+        )
         output_tokens += [mistral_tokenizer.instruct.BEGIN_THINK]
 
         if index_end_think != -1:
-            output_middle = output[index_think + len_think:index_end_think]
-            output_after_think = output[index_end_think + len_end_think:]
+            output_middle = output[index_think + len_think : index_end_think]
+            output_after_think = output[index_end_think + len_end_think :]
             output_tokens += mistral_tokenizer.tokenizer.encode(
-                output_middle, False, False)
+                output_middle, False, False
+            )
             output_tokens += [mistral_tokenizer.instruct.END_THINK]
             output_tokens += mistral_tokenizer.tokenizer.encode(
-                output_after_think, False, False)
+                output_after_think, False, False
+            )
         else:
-            output_middle = output[index_think + len_think:]
+            output_middle = output[index_think + len_think :]
             output_tokens += mistral_tokenizer.tokenizer.encode(
-                output_middle, False, False)
+                output_middle, False, False
+            )
     elif index_end_think != -1:
         output_before_think = output[:index_end_think]
-        output_after_think = output[index_end_think + len_end_think:]
+        output_after_think = output[index_end_think + len_end_think :]
         output_tokens += mistral_tokenizer.tokenizer.encode(
-            output_before_think, False, False)
+            output_before_think, False, False
+        )
         output_tokens += [mistral_tokenizer.instruct.END_THINK]
         output_tokens += mistral_tokenizer.tokenizer.encode(
-            output_after_think, False, False)
+            output_after_think, False, False
+        )
     else:
-        output_tokens += mistral_tokenizer.tokenizer.encode(
-            output, False, False)
+        output_tokens += mistral_tokenizer.tokenizer.encode(output, False, False)
 
-    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(mistral_tokenizer)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        mistral_tokenizer
+    )
 
-    reasoning, content = run_reasoning_extraction_mistral(parser,
-                                                          output_tokens,
-                                                          streaming=streaming)
+    reasoning, content = run_reasoning_extraction_mistral(
+        parser, output_tokens, streaming=streaming
+    )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     # Test is_reasoning_end
@@ -335,7 +318,8 @@ def test_mistral_reasoning(
     if param_dict["content"] is not None:
         content = parser.extract_content_ids(output_tokens)
         assert content == mistral_tokenizer.tokenizer.encode(
-            param_dict["content"], bos=False, eos=False)
+            param_dict["content"], bos=False, eos=False
+        )
     else:
         content = parser.extract_content_ids(output_tokens)
         assert content == []
diff --git a/tests/reasoning/test_olmo3_reasoning_parser.py b/tests/reasoning/test_olmo3_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0e72e2a45634be0ea26a60df3d4ceeaf3b0046
--- /dev/null
+++ b/tests/reasoning/test_olmo3_reasoning_parser.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "olmo3"
+START_REASONING = "<think>"
+END_REASONING = "</think>"
+
+NO_REASONING = {
+    "output": f"{START_REASONING}{END_REASONING}No thoughts, head empty!",
+    "reasoning": None,
+    "content": "No thoughts, head empty!",
+}
+
+NO_REASONING_WITH_NEWLINE = {
+    "output": f"{START_REASONING}\n{END_REASONING}\n\nNo thoughts, head empty!",
+    "reasoning": "\n",
+    "content": "\n\nNo thoughts, head empty!",
+}
+
+SIMPLE_REASONING = {
+    "output": f"{START_REASONING}This is a reasoning section{END_REASONING}This is the rest",  # noqa: E501
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+}
+
+SIMPLE_REASONING_WITH_NEWLINE = {
+    "output": f"{START_REASONING} Look!\n\nI'm thinking...{END_REASONING}\nThis is the rest",  # noqa: E501
+    "reasoning": " Look!\n\nI'm thinking...",
+    "content": "\nThis is the rest",
+}
+
+SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES = {
+    "output": f"{START_REASONING}\nLook!\nI'm thinking...\n\n{END_REASONING}\n\n\nThis is the rest",  # noqa: E501
+    "reasoning": "\nLook!\nI'm thinking...\n\n",
+    "content": "\n\n\nThis is the rest",
+}
+
+NO_REASONING_ONLY_END_THINK = {
+    "output": f"{END_REASONING}\n\nNo thoughts, head empty!",
+    "reasoning": None,
+    "content": "\n\nNo thoughts, head empty!",
+}
+
+REASONING_ONLY_END_THINK = {
+    "output": f"The user is asking me not to think.{END_REASONING}No thoughts!",
+    "reasoning": "The user is asking me not to think.",
+    "content": "No thoughts!",
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,  # not streaming
+        NO_REASONING,
+        id="no_reasoning",
+    ),
+    pytest.param(
+        False,  # not streaming
+        NO_REASONING_WITH_NEWLINE,
+        id="no_reasoning_with_newline",
+    ),
+    pytest.param(
+        False,  # not streaming
+        SIMPLE_REASONING,
+        id="simple_reasoning",
+    ),
+    pytest.param(
+        False,  # not streaming
+        SIMPLE_REASONING_WITH_NEWLINE,
+        id="simple_reasoning_with_newline",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES,
+        id="simple_reasoning_with_multiple_newlines",
+    ),
+    pytest.param(
+        False,  # not streaming
+        NO_REASONING_ONLY_END_THINK,
+        id="no_reasoning_only_end_think",
+    ),
+    pytest.param(
+        False,  # not streaming
+        REASONING_ONLY_END_THINK,
+        id="yes_reasoning_only_end_think",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        NO_REASONING,
+        id="no_reasoning_streaming",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        NO_REASONING_WITH_NEWLINE,
+        id="no_reasoning_with_newline_streaming",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        SIMPLE_REASONING,
+        id="simple_reasoning_streaming",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        SIMPLE_REASONING_WITH_NEWLINE,
+        id="simple_reasoning_with_newline_streaming",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES,
+        id="simple_reasoning_with_multiple_newlines_streaming",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        NO_REASONING_ONLY_END_THINK,
+        id="no_reasoning_only_end_think_streaming",
+    ),
+    pytest.param(
+        True,  # enable streaming
+        REASONING_ONLY_END_THINK,
+        id="yes_reasoning_only_end_think_streaming",
+    ),
+]
+
+# Global tokenizer initialization to avoid repeated loading
+tokenizer = AutoTokenizer.from_pretrained("allenai/dolma2-tokenizer")
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict[str, str],
+):
+    output = tokenizer.tokenize(param_dict["output"])
+
+    # decode everything to tokens
+    model_output: list[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
+    parser: ReasoningParser = parser_cls(tokenizer)
+
+    reasoning, content = run_reasoning_extraction(
+        reasoning_parser=parser, model_output=model_output, streaming=streaming
+    )
+
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index 2d5557d5cdc13b260a3df61ebe7952049d872135..92a8b6ab3761501524775c25ec1f8b72eaed5263 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -22,48 +22,47 @@ def qwen3_tokenizer():
 # 带 <think></think>，非stream
 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # 带 <think></think>，stream
 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # 不带 <think></think>，非stream
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 # 不带 <think></think>，stream
 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 
 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTILINE_REASONING = {
-    "output":
-    "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "<think>This is a reasoning section",
 }
 
 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 
@@ -131,12 +130,13 @@ def test_reasoning(
     output_tokens: list[str] = [
         qwen3_tokenizer.convert_tokens_to_string([token]) for token in output
     ]
-    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(qwen3_tokenizer)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        qwen3_tokenizer
+    )
 
-    reasoning, content = run_reasoning_extraction(parser,
-                                                  output_tokens,
-                                                  streaming=streaming)
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_seedoss_reasoning_parser.py b/tests/reasoning/test_seedoss_reasoning_parser.py
index bb5dc0f4ffe4d5f2639e767adcdaac0160f5499f..33d56d32965a0ad6d20a01bf2e7655b1497354c2 100644
--- a/tests/reasoning/test_seedoss_reasoning_parser.py
+++ b/tests/reasoning/test_seedoss_reasoning_parser.py
@@ -28,53 +28,49 @@ def seedoss_tokenizer():
 
 SIMPLE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT: dict[str, Any] = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING: dict[str, Any] = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES: dict[str, Any] = {
     "output": "This\nThat</seed:think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 WITH_START_TOKEN: dict[str, Any] = {
-    "output": ("<seed:think>This is a reasoning section"
-               "</seed:think>This is the rest"),
-    "reasoning_content":
-    "This is a reasoning section",
-    "content":
-    "This is the rest",
-    "is_reasoning_end":
-    True,
+    "output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
 }
 ONLY_END_TOKEN: dict[str, Any] = {
     "output": "Some reasoning</seed:think>This is the rest",
-    "reasoning_content": "Some reasoning",
+    "reasoning": "Some reasoning",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 NO_TOKENS: dict[str, Any] = {
     "output": "This is just content without any reasoning tokens",
-    "reasoning_content": "This is just content without any reasoning tokens",
+    "reasoning": "This is just content without any reasoning tokens",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -96,9 +92,10 @@ def test_simple_reasoning(seedoss_tokenizer, streaming):
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming)
+        parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
+    )
 
-    assert reasoning == SIMPLE_REASONING["reasoning_content"]
+    assert reasoning == SIMPLE_REASONING["reasoning"]
     assert content == SIMPLE_REASONING["content"]
 
 
@@ -109,9 +106,10 @@ def test_complete_reasoning(seedoss_tokenizer, streaming):
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming)
+        parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
+    )
 
-    assert reasoning == COMPLETE_REASONING["reasoning_content"]
+    assert reasoning == COMPLETE_REASONING["reasoning"]
     assert content == COMPLETE_REASONING["content"]
 
 
@@ -122,9 +120,10 @@ def test_no_content(seedoss_tokenizer, streaming):
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, NO_CONTENT["output"])], streaming=streaming)
+        parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
+    )
 
-    assert reasoning == NO_CONTENT["reasoning_content"]
+    assert reasoning == NO_CONTENT["reasoning"]
     assert content == NO_CONTENT["content"]
 
 
@@ -135,9 +134,10 @@ def test_multiple_lines(seedoss_tokenizer, streaming):
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming)
+        parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
+    )
 
-    assert reasoning == MULTIPLE_LINES["reasoning_content"]
+    assert reasoning == MULTIPLE_LINES["reasoning"]
     assert content == MULTIPLE_LINES["content"]
 
 
@@ -148,25 +148,27 @@ def test_with_start_token(seedoss_tokenizer, streaming):
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming)
+        parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
+    )
 
-    assert reasoning == WITH_START_TOKEN["reasoning_content"]
+    assert reasoning == WITH_START_TOKEN["reasoning"]
     assert content == WITH_START_TOKEN["content"]
 
 
 @pytest.mark.parametrize("streaming", [True, False])
 def test_only_end_token(seedoss_tokenizer, streaming):
     """
-        Test reasoning extraction with only end token
-        (SeedOSS typical behavior).
+    Test reasoning extraction with only end token
+    (SeedOSS typical behavior).
     """
     parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming)
+        parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
+    )
 
-    assert reasoning == ONLY_END_TOKEN["reasoning_content"]
+    assert reasoning == ONLY_END_TOKEN["reasoning"]
     assert content == ONLY_END_TOKEN["content"]
 
 
@@ -177,9 +179,10 @@ def test_no_tokens(seedoss_tokenizer, streaming):
     parser = parser_cls(seedoss_tokenizer)
 
     reasoning, content = run_reasoning_extraction(
-        parser, [cast(str, NO_TOKENS["output"])], streaming=streaming)
+        parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
+    )
 
-    assert reasoning == NO_TOKENS["reasoning_content"]
+    assert reasoning == NO_TOKENS["reasoning"]
     assert content == NO_TOKENS["content"]
 
 
@@ -225,13 +228,9 @@ def test_streaming_delta_processing(seedoss_tokenizer):
     parser = parser_cls(seedoss_tokenizer)
 
     # Test streaming with incremental tokens
-    deltas = [
-        "Some ", "reasoning ", "content", "</seed:think>", "Final ", "answer"
-    ]
+    deltas = ["Some ", "reasoning ", "content", "</seed:think>", "Final ", "answer"]
 
-    reasoning, content = run_reasoning_extraction(parser,
-                                                  deltas,
-                                                  streaming=True)
+    reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
 
     assert reasoning == "Some reasoning content"
     assert content == "Final answer"
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index 9af5fa5addbc2ab457ed14e253626e8772b7249d..bd0b230a847cb1f9dc7ed62a1b471d69e37d5be9 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -1,44 +1,44 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.reasoning import ReasoningParser
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 
 class StreamingReasoningReconstructor:
-
     def __init__(self):
-        self.reasoning_content = None
+        self.reasoning = None
         self.other_content = None
 
     def append_delta(self, delta: DeltaMessage):
         # content and the reasoning content should not be present
         # at the same time
-        assert delta.content is None or delta.reasoning_content is None, (
-            "Both content and reasoning content are present in the "
-            "delta message")
+        assert delta.content is None or delta.reasoning is None, (
+            "Both content and reasoning content are present in the delta message"
+        )
+        assert delta.reasoning == delta.reasoning_content, (
+            "reasoning_content should be present for backwards compatibility"
+        )
         if delta.content is not None:
             if self.other_content is None:
                 self.other_content = delta.content
             else:
                 self.other_content += delta.content
         else:
-            if self.reasoning_content is None:
-                self.reasoning_content = delta.reasoning_content
+            if self.reasoning is None:
+                self.reasoning = delta.reasoning
             else:
-                self.reasoning_content += delta.reasoning_content
+                self.reasoning += delta.reasoning
 
 
 def run_reasoning_extraction(
     reasoning_parser: ReasoningParser,
     model_output: list[str],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     streaming: bool = False,
-) -> tuple[Optional[str], Optional[str]]:
+) -> tuple[str | None, str | None]:
     if streaming:
         reconstructor = run_reasoning_extraction_streaming(
             reasoning_parser,
@@ -46,23 +46,25 @@ def run_reasoning_extraction(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
         reasoning, content = run_reasoning_extraction_nonstreaming(
-            reasoning_parser, model_output, request)
+            reasoning_parser, model_output, request
+        )
         return reasoning, content
 
 
 def run_reasoning_extraction_mistral(
     reasoning_parser: ReasoningParser,
     model_output: list[int],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     streaming: bool = False,
-) -> tuple[Optional[str], Optional[str]]:
-    assert isinstance(reasoning_parser.model_tokenizer,
-                      MistralTokenizer), type(reasoning_parser.model_tokenizer)
+) -> tuple[str | None, str | None]:
+    assert isinstance(reasoning_parser.model_tokenizer, MistralTokenizer), type(
+        reasoning_parser.model_tokenizer
+    )
     if streaming:
         reconstructor = run_reasoning_extraction_streaming_mistral(
             reasoning_parser,
@@ -70,31 +72,34 @@ def run_reasoning_extraction_mistral(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
         str_output = reasoning_parser.model_tokenizer.convert_ids_to_tokens(
-            model_output)
+            model_output
+        )
         reasoning, content = run_reasoning_extraction_nonstreaming(
-            reasoning_parser, str_output, request)
+            reasoning_parser, str_output, request
+        )
         return reasoning, content
 
 
 def run_reasoning_extraction_nonstreaming(
     reasoning_parser: ReasoningParser,
     model_output: list[str],
-    request: Union[ChatCompletionRequest, None] = None,
-) -> tuple[Optional[str], Optional[str]]:
+    request: ChatCompletionRequest | None = None,
+) -> tuple[str | None, str | None]:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
-    return reasoning_parser.extract_reasoning_content(
-        model_output=''.join(model_output), request=request)
+    return reasoning_parser.extract_reasoning(
+        model_output="".join(model_output), request=request
+    )
 
 
 def run_reasoning_extraction_streaming(
     reasoning_parser: ReasoningParser,
     model_deltas: list[str],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> StreamingReasoningReconstructor:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     reconstructor = StreamingReasoningReconstructor()
@@ -108,7 +113,7 @@ def run_reasoning_extraction_streaming(
         ]
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta,
@@ -126,21 +131,21 @@ def run_reasoning_extraction_streaming(
 def run_reasoning_extraction_streaming_mistral(
     reasoning_parser: ReasoningParser,
     model_deltas: list[int],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> StreamingReasoningReconstructor:
-    assert isinstance(reasoning_parser.model_tokenizer,
-                      MistralTokenizer), type(reasoning_parser.model_tokenizer)
+    assert isinstance(reasoning_parser.model_tokenizer, MistralTokenizer), type(
+        reasoning_parser.model_tokenizer
+    )
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     reconstructor = StreamingReasoningReconstructor()
     previous_text = ""
     previous_tokens: list[int] = []
     for model_delta in model_deltas:
         token_delta = [model_delta]
-        delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens(
-            [model_delta])[0]
+        delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0]
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta,
diff --git a/tests/rocm/aiter/test_grouped_quant.py b/tests/rocm/aiter/test_grouped_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f0f1eda3558096ae32d93003ba1fd795e5473c
--- /dev/null
+++ b/tests/rocm/aiter/test_grouped_quant.py
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# This is a test for the AITER group_fp8_quant op.
+# It tests if the AITER op is
+# 1. correctly defined the relationship between
+#    implementation and fake function
+# 2. can be used with torch.compile
+# 3. can be used with CUDA graphs
+# This file will be skipped if AITER is not installed
+# and the platform is not ROCm.
+
+import importlib.util
+
+import pytest
+import torch
+
+# this import statement is needed to ensure the ops are registered
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.platforms import current_platform
+
+# Check if aiter package is installed
+aiter_available = importlib.util.find_spec("aiter") is not None
+
+pytestmark = pytest.mark.skipif(
+    not (current_platform.is_rocm() and aiter_available),
+    reason="AITER ops are only available on ROCm with aiter package installed",
+)
+
+
+def test_rocm_aiter_group_fp8_quant_fake_implementation():
+    """Test that the fake implementation is correctly
+    defined for torch.ops.vllm.rocm_aiter_group_fp8_quant."""
+    # Create test tensors
+    M = 128
+    N = 4096
+    group_size = 128
+
+    input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda")
+
+    # Verify the op's fake implementation using torch.library.opcheck
+    # This checks that the fake function returns tensors with correct shapes and dtypes
+    torch.library.opcheck(
+        torch.ops.vllm.rocm_aiter_group_fp8_quant,
+        (input_tensor, group_size),
+        test_utils=("test_faketensor",),
+    )
+
+
+def test_rocm_aiter_group_fp8_quant_torch_compile_with_cudagraph():
+    """Test that rocm_aiter_ops.group_fp8_quant
+    with group size 128 can be used with
+    torch.compile in cudagraph mode."""
+    # Create test tensors
+    M = 128
+    N = 4096
+    group_size = 128
+
+    input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda")
+
+    # Define a function that uses the op
+    def group_fp8_quant_fn(x):
+        return rocm_aiter_ops.group_fp8_quant(x, group_size)
+
+    # Compile with cudagraph mode
+    compiled_fn = torch.compile(
+        group_fp8_quant_fn,
+        fullgraph=True,
+        backend="inductor",
+        mode="reduce-overhead",
+        dynamic=False,
+    )
+
+    # Run eager mode
+    x_fp8_eager, scales_eager = group_fp8_quant_fn(input_tensor)
+
+    # Run compiled version (first run will trigger compilation)
+    x_fp8_compiled, scales_compiled = compiled_fn(input_tensor)
+
+    # Verify shapes match
+    assert x_fp8_compiled.shape == x_fp8_eager.shape
+    assert scales_compiled.shape == scales_eager.shape
+
+    # Verify expected shapes
+    assert x_fp8_compiled.shape == (M, N)
+    expected_scale_cols = (N + group_size - 1) // group_size
+    assert scales_compiled.shape == (M, expected_scale_cols)
+
+    # Verify results match
+    assert torch.allclose(
+        x_fp8_compiled.to(torch.float32),
+        x_fp8_eager.to(torch.float32),
+        rtol=1e-2,
+        atol=1e-2,
+    )
+    assert torch.allclose(scales_compiled, scales_eager, rtol=1e-3, atol=1e-3)
+
+    # Test with different input (reusing compiled graph)
+    input_tensor_2 = torch.randn((M, N), dtype=torch.bfloat16, device="cuda")
+    x_fp8_eager_2, scales_eager_2 = group_fp8_quant_fn(input_tensor_2)
+    x_fp8_compiled_2, scales_compiled_2 = compiled_fn(input_tensor_2)
+
+    # Verify second run also produces correct results
+    assert torch.allclose(
+        x_fp8_compiled_2.to(torch.float32),
+        x_fp8_eager_2.to(torch.float32),
+        rtol=1e-2,
+        atol=1e-2,
+    )
+    assert torch.allclose(scales_compiled_2, scales_eager_2, rtol=1e-3, atol=1e-3)
+
+
+def test_rocm_aiter_group_fp8_quant_different_shapes():
+    """Test rocm_aiter_ops.group_fp8_quant with different input shapes."""
+    group_size = 128
+
+    test_shapes = [
+        (64, 2048),
+        (256, 8192),
+        (32, 1024),
+        (512, 4096),
+    ]
+
+    for M, N in test_shapes:
+        input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda")
+
+        x_fp8, scales = rocm_aiter_ops.group_fp8_quant(input_tensor, group_size)
+
+        # Verify shapes
+        assert x_fp8.shape == (M, N)
+        expected_scale_cols = (N + group_size - 1) // group_size
+        assert scales.shape == (M, expected_scale_cols)
+
+        # Verify dtypes
+        from aiter import dtypes
+
+        assert x_fp8.dtype == dtypes.fp8
+        assert scales.dtype == torch.float32
diff --git a/tests/runai_model_streamer_test/test_runai_utils.py b/tests/runai_model_streamer_test/test_runai_utils.py
deleted file mode 100644
index bde77ff6650636ba5c94f4448f5c7debae53340f..0000000000000000000000000000000000000000
--- a/tests/runai_model_streamer_test/test_runai_utils.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import glob
-import os
-import tempfile
-
-import huggingface_hub.constants
-
-from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf)
-from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
-                                                 list_safetensors)
-
-
-def test_is_runai_obj_uri():
-    assert is_runai_obj_uri("gs://some-gcs-bucket/path")
-    assert is_runai_obj_uri("s3://some-s3-bucket/path")
-    assert not is_runai_obj_uri("nfs://some-nfs-path")
-
-
-def test_runai_list_safetensors_local():
-    with tempfile.TemporaryDirectory() as tmpdir:
-        huggingface_hub.constants.HF_HUB_OFFLINE = False
-        download_weights_from_hf("openai-community/gpt2",
-                                 allow_patterns=["*.safetensors", "*.json"],
-                                 cache_dir=tmpdir)
-        safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
-        assert len(safetensors) > 0
-        parentdir = [
-            os.path.dirname(safetensor) for safetensor in safetensors
-        ][0]
-        files = list_safetensors(parentdir)
-        assert len(safetensors) == len(files)
-
-
-if __name__ == "__main__":
-    test_is_runai_obj_uri()
-    test_runai_list_safetensors_local()
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 2960ffcbd9eab97b56f74b88eb8842e85c778807..78f5ab3e2d19c0925132b38572724d8d60ad5220 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -36,19 +36,21 @@ def test_beam_search_single_input(
 ) -> None:
     example_prompts = example_prompts[:1]
     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
+        hf_outputs = hf_model.generate_beam_search(
+            example_prompts, beam_width, max_tokens
+        )
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
-                                                       beam_width, max_tokens)
+        vllm_outputs = vllm_model.generate_beam_search(
+            example_prompts, beam_width, max_tokens
+        )
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_texts = hf_outputs[i]
         vllm_output_ids, vllm_output_texts = vllm_outputs[i]
-        for j, (hf_text,
-                vllm_text) in enumerate(zip(hf_output_texts,
-                                            vllm_output_texts)):
+        for j, (hf_text, vllm_text) in enumerate(
+            zip(hf_output_texts, vllm_output_texts)
+        ):
             print(f">>>{j}-th hf output:")
             print(hf_text)
             print(f">>>{j}-th vllm output:")
@@ -56,8 +58,8 @@ def test_beam_search_single_input(
         assert len(hf_output_ids) == len(vllm_output_ids)
         for j in range(len(hf_output_ids)):
             assert hf_output_ids[j] == vllm_output_ids[j], (
-                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
-                f"vLLM: {vllm_output_ids}")
+                f"Test{i} output{j}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}"
+            )
 
 
 @pytest.mark.skip_v1  # FIXME: This fails on V1 right now.
@@ -76,30 +78,29 @@ def test_beam_search_with_concurrency_limit(
 ) -> None:
     # example_prompts[1]&[3]&[7] fails due to unknown reason even without
     # concurrency limit. skip them for now.
-    example_prompts = (example_prompts[:8])
+    example_prompts = example_prompts[:8]
     concurrency_limit = 2
     assert len(example_prompts) > concurrency_limit
     with vllm_runner(model, dtype=dtype) as vllm_model:
         outputs_with_limit = vllm_model.generate_beam_search(
-            example_prompts,
-            beam_width,
-            max_tokens,
-            concurrency_limit=concurrency_limit)
+            example_prompts, beam_width, max_tokens, concurrency_limit=concurrency_limit
+        )
         outputs_without_limit = []
 
         for i in range(0, len(example_prompts), concurrency_limit):
             outputs_without_limit.extend(
                 vllm_model.generate_beam_search(
-                    example_prompts[i:i + concurrency_limit], beam_width,
-                    max_tokens))
+                    example_prompts[i : i + concurrency_limit], beam_width, max_tokens
+                )
+            )
 
     correct = True
     for i in range(len(example_prompts)):
         output_ids_with_limit, output_texts_with_limit = outputs_with_limit[i]
-        output_ids_without_limit, output_texts_without_limit = (
-            outputs_without_limit[i])
+        output_ids_without_limit, output_texts_without_limit = outputs_without_limit[i]
         for j, (text_with_limit, text_without_limit) in enumerate(
-                zip(output_texts_with_limit, output_texts_without_limit)):
+            zip(output_texts_with_limit, output_texts_without_limit)
+        ):
             print(f">>>{j}-th with limit output:")
             print(text_with_limit)
             print(f">>>{j}-th without limit output:")
@@ -107,8 +108,10 @@ def test_beam_search_with_concurrency_limit(
         assert len(output_ids_with_limit) == len(output_ids_without_limit)
         for j in range(len(output_ids_with_limit)):
             if output_ids_with_limit[j] != output_ids_without_limit[j]:
-                print(f"Test{i} output{j}:\n+limit: {output_ids_with_limit}\n"
-                      f"-limit: {output_ids_without_limit}")
+                print(
+                    f"Test{i} output{j}:\n+limit: {output_ids_with_limit}\n"
+                    f"-limit: {output_ids_without_limit}"
+                )
                 correct = False
     assert correct
 
@@ -131,11 +134,10 @@ def test_beam_search_passes_multimodal_data(
     model = "Qwen/Qwen2-Audio-7B-Instruct"
     audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
     prompts = [
-        f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n"  #noqa: E501
+        f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n"  # noqa: E501
     ]
 
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSeq2SeqLM) as hf_model:
         audio_token_id = hf_model.config.audio_token_index
         eos_token_id = hf_model.tokenizer.eos_token_id  # <|im_end|>
         hf_outputs = hf_model.generate_beam_search(
@@ -153,17 +155,15 @@ def test_beam_search_passes_multimodal_data(
             audios=audios,
         )
 
-    seq_with_no_audio_toks = lambda seq: [
-        tok for tok in seq if tok != audio_token_id
-    ]
+    seq_with_no_audio_toks = lambda seq: [tok for tok in seq if tok != audio_token_id]
 
     for i in range(len(prompts)):
         hf_output_ids, hf_output_texts = hf_outputs[i]
         vllm_output_ids, vllm_output_texts = vllm_outputs[i]
 
-        for j, (hf_text,
-                vllm_text) in enumerate(zip(hf_output_texts,
-                                            vllm_output_texts)):
+        for j, (hf_text, vllm_text) in enumerate(
+            zip(hf_output_texts, vllm_output_texts)
+        ):
             print(f">>>{j}-th hf output [NOTE: special tokens are filtered]:")
             print(hf_text)
             print(f">>>{j}-th vllm output:")
@@ -176,12 +176,10 @@ def test_beam_search_passes_multimodal_data(
             # token to match features, while the vLLM helper maintains the
             # single audio token in the input text
             filtered_hf_output_ids = seq_with_no_audio_toks(hf_output_ids[j])
-            filtered_vllm_output_ids = seq_with_no_audio_toks(
-                vllm_output_ids[j])
+            filtered_vllm_output_ids = seq_with_no_audio_toks(vllm_output_ids[j])
 
             # HF output IDs may contain the end of sequence
-            if len(filtered_hf_output_ids
-                   ) == len(filtered_vllm_output_ids) + 1:
+            if len(filtered_hf_output_ids) == len(filtered_vllm_output_ids) + 1:
                 assert filtered_hf_output_ids[-1] == eos_token_id
                 filtered_hf_output_ids = filtered_hf_output_ids[:-1]
 
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index 1d77d37a5d58164452c1c6c0ea5dddcecdf4fa19..d1609b24cc5a82233bec2ec6ceef24877c3159ef 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -25,11 +25,11 @@ def test_ignore_eos(
     max_tokens: int,
 ) -> None:
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        sampling_params = SamplingParams(max_tokens=max_tokens,
-                                         ignore_eos=True)
+        sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
 
         for prompt in example_prompts:
             ignore_eos_output = vllm_model.llm.generate(
-                prompt, sampling_params=sampling_params)
+                prompt, sampling_params=sampling_params
+            )
             output_length = len(ignore_eos_output[0].outputs[0].token_ids)
             assert output_length == max_tokens
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9d227599cde505fa0c99840d7e6c06a4b1233e0
--- /dev/null
+++ b/tests/samplers/test_logprobs.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.logprobs import FlatLogprobs
+
+MODELS = ["distilbert/distilgpt2"]
+MAX_TOKENS = 5
+NUM_TOP_LOGPROBS = 5
+NUM_PROMPT_LOGPROBS = 7
+MAX_LOGPROBS = max(NUM_TOP_LOGPROBS, NUM_PROMPT_LOGPROBS)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("greedy", [True, False])
+@pytest.mark.parametrize("flat_logprobs", [True, False])
+def test_ranks(
+    vllm_runner,
+    model,
+    dtype,
+    greedy,
+    flat_logprobs,
+    example_prompts,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1" if flat_logprobs else "0")
+    with vllm_runner(model, dtype=dtype, max_logprobs=MAX_LOGPROBS) as vllm_model:
+        tokenizer = vllm_model.llm.get_tokenizer()
+        example_prompt_tokens = [tokenizer.encode(prompt) for prompt in example_prompts]
+        sampling_params = SamplingParams(
+            temperature=0.0 if greedy else 1.0,
+            top_p=1.0,
+            max_tokens=MAX_TOKENS,
+            logprobs=NUM_TOP_LOGPROBS,
+            prompt_logprobs=NUM_PROMPT_LOGPROBS,
+        )
+        results = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+
+    assert len(results) == len(example_prompt_tokens)
+    for i, (result, prompt_tokens) in enumerate(zip(results, example_prompt_tokens)):
+        decode_tokens, _, decode_logprobs, prompt_logprobs = result
+
+        # Ensure the return type of logprobs is accurate
+        assert isinstance(prompt_logprobs, FlatLogprobs if flat_logprobs else list)
+        assert isinstance(decode_logprobs, FlatLogprobs if flat_logprobs else list)
+
+        ########################
+        # Check prompt logprobs
+        ########################
+        assert len(prompt_tokens) == len(prompt_logprobs)
+        # No logprob for first prompt token
+        assert not prompt_logprobs[0]
+        for position, (token, logprobs) in enumerate(
+            zip(prompt_tokens[1:], prompt_logprobs[1:]), start=1
+        ):
+            # Ensure logprobs of prompt token is always returned
+            logprob = logprobs.get(token)
+            assert logprob is not None
+            assert logprob.rank >= 1
+            # Ensure # of returned logprobs should be
+            # either NUM_PROMPT_LOGPROBS or NUM_PROMPT_LOGPROBS+1
+            assert NUM_PROMPT_LOGPROBS <= len(logprobs) <= NUM_PROMPT_LOGPROBS + 1
+            # Ensure top NUM_PROMPT_LOGPROBS is always extracted
+            assert set(range(1, NUM_PROMPT_LOGPROBS + 1)).issubset(
+                {logprob.rank for logprob in logprobs.values()}
+            )
+
+        ########################
+        # Check sample logprobs
+        ########################
+        assert len(decode_tokens) == len(decode_logprobs)
+        for position, (token, logprobs) in enumerate(
+            zip(decode_tokens, decode_logprobs)
+        ):
+            # Ensure logprobs of chosen token is always returned
+            logprob = logprobs.get(token)
+            assert logprob is not None
+            if greedy:
+                # For greedy sampling, all chosen logprob should be top ranked
+                assert logprob.rank == 1
+            else:
+                assert logprob.rank >= 1
+            # Ensure # of returned logprobs should be
+            # either NUM_TOP_LOGPROBS or NUM_TOP_LOGPROBS+1
+            assert NUM_TOP_LOGPROBS <= len(logprobs) <= NUM_TOP_LOGPROBS + 1
+            # Ensure top NUM_TOP_LOGPROBS logprobs is always extracted
+            assert set(range(1, NUM_TOP_LOGPROBS + 1)).issubset(
+                {logprob.rank for logprob in logprobs.values()}
+            )
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index 128e8f552a1614cec791aa92ca997a1c44e2178e..74047d2f0355876add201631049d03e66b58aedc 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -5,26 +5,18 @@
 Run `pytest tests/samplers/test_no_bad_words.py`.
 
 """
-from typing import Optional
 
-import pytest
 from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
 
 
-@pytest.fixture(autouse=True)
-def v1(monkeypatch):
-    """Only run on vLLM v1."""
-    monkeypatch.setenv('VLLM_USE_V1', '1')
-
-
 def _generate(
     llm: LLM,
     prompt: str,
     num_prompt_tokens: int,
     temperature: float = 0,
-    bad_words: Optional[list[str]] = None,
+    bad_words: list[str] | None = None,
 ) -> list[int]:
     sampling_params = SamplingParams(
         temperature=temperature,
@@ -43,31 +35,28 @@ def _generate(
 
 
 class TestOneTokenBadWord:
-    MODEL = "TheBloke/Llama-2-7B-fp16"
+    MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 
-    PROMPT = "Hi! How are"
-    TARGET_TOKEN = "you"
+    PROMPT = "How old are "
+    TARGET_TOKEN = "mn"
 
     def setup_method(self, method):
-        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL,
-                                                       add_prefix_space=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
 
         self.num_prompt_tokens = len(self._encode(self.PROMPT))
-        self.target_token_id = self._encode(self.TARGET_TOKEN,
-                                            add_special_tokens=False)[0]
+        self.target_token_id = self._encode(
+            self.TARGET_TOKEN, add_special_tokens=False
+        )[0]
 
     def test_one_token_bad_word(self, vllm_runner):
         with vllm_runner(self.MODEL) as llm:
             output_token_ids = self._generate(llm)
             assert output_token_ids[0] == self.target_token_id
 
-            output_token_ids = self._generate(llm,
-                                              bad_words=[self.TARGET_TOKEN])
+            output_token_ids = self._generate(llm, bad_words=[self.TARGET_TOKEN])
             assert self.target_token_id not in output_token_ids
 
-    def _generate(self,
-                  llm: LLM,
-                  bad_words: Optional[list[str]] = None) -> list[int]:
+    def _generate(self, llm: LLM, bad_words: list[str] | None = None) -> list[int]:
         return _generate(
             llm=llm,
             prompt=self.PROMPT,
@@ -75,11 +64,8 @@ class TestOneTokenBadWord:
             bad_words=bad_words,
         )
 
-    def _encode(self,
-                prompt: str,
-                add_special_tokens: bool = True) -> list[int]:
-        return self.tokenizer(prompt,
-                              add_special_tokens=add_special_tokens).input_ids
+    def _encode(self, prompt: str, add_special_tokens: bool = True) -> list[int]:
+        return self.tokenizer(prompt, add_special_tokens=add_special_tokens).input_ids
 
 
 class TestTwoTokenBadWord:
@@ -92,72 +78,80 @@ class TestTwoTokenBadWord:
     NEIGHBOUR_TOKEN2 = "older"
 
     def setup_method(self, method):
-        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL,
-                                                       add_prefix_space=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.MODEL, add_prefix_space=True
+        )
 
         self.num_prompt_tokens = len(self._encode(self.PROMPT))
-        self.target_token_id1 = self._encode(self.TARGET_TOKEN1,
-                                             add_special_tokens=False)[0]
-        self.target_token_id2 = self._encode(self.TARGET_TOKEN2,
-                                             add_special_tokens=False)[0]
-        self.neighbour_token_id2 = self._encode(self.NEIGHBOUR_TOKEN2,
-                                                add_special_tokens=False)[0]
+        self.target_token_id1 = self._encode(
+            self.TARGET_TOKEN1, add_special_tokens=False
+        )[0]
+        self.target_token_id2 = self._encode(
+            self.TARGET_TOKEN2, add_special_tokens=False
+        )[0]
+        self.neighbour_token_id2 = self._encode(
+            self.NEIGHBOUR_TOKEN2, add_special_tokens=False
+        )[0]
 
     def test_two_token_bad_word(self, vllm_runner):
         with vllm_runner(self.MODEL, dtype="half") as llm:
             output_token_ids = self._generate(llm)
             assert output_token_ids[:2] == [
-                self.target_token_id1, self.target_token_id2
+                self.target_token_id1,
+                self.target_token_id2,
             ]
 
-            output_token_ids = self._generate(llm,
-                                              bad_words=[self.TARGET_TOKEN1])
+            output_token_ids = self._generate(llm, bad_words=[self.TARGET_TOKEN1])
             assert self.target_token_id1 not in output_token_ids
 
-            output_token_ids = self._generate(llm,
-                                              bad_words=[self.TARGET_TOKEN2])
+            output_token_ids = self._generate(llm, bad_words=[self.TARGET_TOKEN2])
             assert output_token_ids[0] == self.target_token_id1
             assert self.target_token_id2 not in output_token_ids
 
             output_token_ids = self._generate(
-                llm, bad_words=[f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}'])
+                llm, bad_words=[f"{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}"]
+            )
             assert output_token_ids[0] == self.target_token_id1
             assert output_token_ids[:2] != [
-                self.target_token_id1, self.target_token_id2
+                self.target_token_id1,
+                self.target_token_id2,
             ]
             assert not self._contains(
-                output_token_ids,
-                [self.target_token_id1, self.target_token_id2])
+                output_token_ids, [self.target_token_id1, self.target_token_id2]
+            )
             # Model dependent behaviour
             assert output_token_ids[:2] == [
-                self.target_token_id1, self.neighbour_token_id2
+                self.target_token_id1,
+                self.neighbour_token_id2,
             ]
 
             output_token_ids = self._generate(
                 llm,
                 bad_words=[
-                    f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}',
-                    f'{self.TARGET_TOKEN1} {self.NEIGHBOUR_TOKEN2}'
-                ])
+                    f"{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}",
+                    f"{self.TARGET_TOKEN1} {self.NEIGHBOUR_TOKEN2}",
+                ],
+            )
             assert output_token_ids[0] == self.target_token_id1
             assert output_token_ids[:2] != [
-                self.target_token_id1, self.target_token_id2
+                self.target_token_id1,
+                self.target_token_id2,
             ]
             assert not self._contains(
-                output_token_ids,
-                [self.target_token_id1, self.target_token_id2])
+                output_token_ids, [self.target_token_id1, self.target_token_id2]
+            )
             assert output_token_ids[:2] != [
-                self.target_token_id1, self.neighbour_token_id2
+                self.target_token_id1,
+                self.neighbour_token_id2,
             ]
             assert not self._contains(
-                output_token_ids,
-                [self.target_token_id1, self.neighbour_token_id2])
-            assert ((self.target_token_id2 in output_token_ids)
-                    or (self.neighbour_token_id2 in output_token_ids))
-
-    def _generate(self,
-                  llm: LLM,
-                  bad_words: Optional[list[str]] = None) -> list[int]:
+                output_token_ids, [self.target_token_id1, self.neighbour_token_id2]
+            )
+            assert (self.target_token_id2 in output_token_ids) or (
+                self.neighbour_token_id2 in output_token_ids
+            )
+
+    def _generate(self, llm: LLM, bad_words: list[str] | None = None) -> list[int]:
         return _generate(
             llm=llm,
             prompt=self.PROMPT,
@@ -187,8 +181,5 @@ class TestTwoTokenBadWord:
 
         return False
 
-    def _encode(self,
-                prompt: str,
-                add_special_tokens: bool = True) -> list[int]:
-        return self.tokenizer(prompt,
-                              add_special_tokens=add_special_tokens).input_ids
+    def _encode(self, prompt: str, add_special_tokens: bool = True) -> list[int]:
+        return self.tokenizer(prompt, add_special_tokens=add_special_tokens).input_ids
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
deleted file mode 100644
index 220a4a53f46713eb2c53940ed9914c89fd87afb5..0000000000000000000000000000000000000000
--- a/tests/samplers/test_ranks.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm import SamplingParams
-
-MODELS = ["distilbert/distilgpt2"]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_ranks(
-    vllm_runner,
-    model,
-    dtype,
-    example_prompts,
-):
-    max_tokens = 5
-    num_top_logprobs = 5
-    num_prompt_logprobs = 5
-
-    with vllm_runner(model, dtype=dtype,
-                     max_logprobs=num_top_logprobs) as vllm_model:
-
-        ## Test greedy logprobs ranks
-        vllm_sampling_params = SamplingParams(
-            temperature=0.0,
-            top_p=1.0,
-            max_tokens=max_tokens,
-            logprobs=num_top_logprobs,
-            prompt_logprobs=num_prompt_logprobs)
-        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
-                                                      vllm_sampling_params)
-
-        ## Test non-greedy logprobs ranks
-        sampling_params = SamplingParams(temperature=1.0,
-                                         top_p=1.0,
-                                         max_tokens=max_tokens,
-                                         logprobs=num_top_logprobs,
-                                         prompt_logprobs=num_prompt_logprobs)
-        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
-
-    for result in vllm_results:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks = 1
-        for token, logprobs in zip(result[0], result[2]):
-            assert token in logprobs
-            assert logprobs[token].rank == 1
-
-    for result in res:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks
-        for token, logprobs in zip(result[0], result[2]):
-            assert logprobs[token].rank >= 1
diff --git a/tests/standalone_tests/lazy_imports.py b/tests/standalone_tests/lazy_imports.py
index 21bcb6b822d1f3088e0cc9cc8f441e6c2a304e7a..ddcdd2a51ab9f439c423aa8a5f097d12ec4f9f93 100644
--- a/tests/standalone_tests/lazy_imports.py
+++ b/tests/standalone_tests/lazy_imports.py
@@ -37,4 +37,5 @@ if use_blame:
 
 assert not any_module_imported(), (
     f"Some the modules in {module_names} are imported. To see the first"
-    f" import location, run the test with `use_blame=True`.")
+    f" import location, run the test with `use_blame=True`."
+)
diff --git a/tests/standalone_tests/pytorch_nightly_dependency.sh b/tests/standalone_tests/pytorch_nightly_dependency.sh
index cb531e13ecb8192117c1a72214b9feb6f343402d..fd93ad76bed0f1a2b39a5754cd41673aec04497d 100644
--- a/tests/standalone_tests/pytorch_nightly_dependency.sh
+++ b/tests/standalone_tests/pytorch_nightly_dependency.sh
@@ -37,6 +37,6 @@ if diff before.txt after.txt; then
 else
   echo "torch version overridden by nightly_torch_test.txt, \
   if the dependency is not triggered by the pytroch nightly test,\
-  please add the dependency to the list 'white_list'  in tools/generate_nightly_torch_test.py"
+  please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
   exit 1
 fi
diff --git a/tests/test_config.py b/tests/test_config.py
index 90d0c78c451f6ef4d64a4402bc246fdf6be87f93..bba2fbec3db29eeef3c7b3a20fc79d7f5968c085 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -23,8 +23,8 @@ def test_compile_config_repr_succeeds():
 
     # test that repr(config) succeeds
     val = repr(config)
-    assert 'VllmConfig' in val
-    assert 'inductor_passes' in val
+    assert "VllmConfig" in val
+    assert "inductor_passes" in val
 
 
 @dataclass
@@ -51,8 +51,7 @@ def test_get_field():
 
 @dataclass
 class _TestNestedConfig:
-    a: _TestConfigFields = field(
-        default_factory=lambda: _TestConfigFields(a=0))
+    a: _TestConfigFields = field(default_factory=lambda: _TestConfigFields(a=0))
 
 
 def test_update_config():
@@ -79,20 +78,19 @@ def test_update_config():
 
 # Can remove once --task option is fully deprecated
 @pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_convert_type",
-     "expected_task"),
+    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
     [
         ("distilbert/distilgpt2", "generate", "none", "generate"),
         ("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
         ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none",
-         "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"),
         ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"),
         ("openai/whisper-small", "generate", "none", "transcription"),
     ],
 )
-def test_auto_task(model_id, expected_runner_type, expected_convert_type,
-                   expected_task):
+def test_auto_task(
+    model_id, expected_runner_type, expected_convert_type, expected_task
+):
     config = ModelConfig(model_id, task="auto")
 
     assert config.runner_type == expected_runner_type
@@ -101,20 +99,19 @@ def test_auto_task(model_id, expected_runner_type, expected_convert_type,
 
 # Can remove once --task option is fully deprecated
 @pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_convert_type",
-     "expected_task"),
+    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
     [
         ("distilbert/distilgpt2", "pooling", "embed", "embed"),
         ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
         ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify",
-         "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"),
         ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
         ("openai/whisper-small", "pooling", "embed", "embed"),
     ],
 )
-def test_score_task(model_id, expected_runner_type, expected_convert_type,
-                    expected_task):
+def test_score_task(
+    model_id, expected_runner_type, expected_convert_type, expected_task
+):
     config = ModelConfig(model_id, task="score")
 
     assert config.runner_type == expected_runner_type
@@ -123,14 +120,14 @@ def test_score_task(model_id, expected_runner_type, expected_convert_type,
 
 # Can remove once --task option is fully deprecated
 @pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_convert_type",
-     "expected_task"),
+    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
     [
         ("openai/whisper-small", "generate", "none", "transcription"),
     ],
 )
-def test_transcription_task(model_id, expected_runner_type,
-                            expected_convert_type, expected_task):
+def test_transcription_task(
+    model_id, expected_runner_type, expected_convert_type, expected_task
+):
     config = ModelConfig(model_id, task="transcription")
 
     assert config.runner_type == expected_runner_type
@@ -200,8 +197,9 @@ def test_disable_sliding_window(model_id_expected):
     assert model_config.max_model_len == expected
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 def test_get_pooling_config():
     model_id = "sentence-transformers/all-MiniLM-L12-v2"
     model_config = ModelConfig(model_id)
@@ -211,8 +209,9 @@ def test_get_pooling_config():
     assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 def test_get_pooling_config_from_args():
     model_id = "sentence-transformers/all-MiniLM-L12-v2"
     pooler_config = PoolerConfig(pooling_type="CLS", normalize=True)
@@ -227,16 +226,18 @@ def test_get_pooling_config_from_args():
         ("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST", "LAST"),  # LLM
         ("intfloat/e5-small", "CLS", "MEAN"),  # BertModel
         ("Qwen/Qwen2.5-Math-RM-72B", "ALL", "ALL"),  # reward
-        ("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP")  # step reward
-    ])
+        ("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP"),  # step reward
+    ],
+)
 def test_default_pooling_type(model_id, default_pooling_type, pooling_type):
     model_config = ModelConfig(model_id)
     assert model_config._model_info.default_pooling_type == default_pooling_type
     assert model_config.pooler_config.pooling_type == pooling_type
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Xformers backend is not supported on ROCm.")
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
+)
 def test_get_bert_tokenization_sentence_transformer_config():
     model_id = "BAAI/bge-base-en-v1.5"
     bge_model_config = ModelConfig(model_id)
@@ -264,17 +265,18 @@ def test_rope_customization():
             "rope_theta": TEST_ROPE_THETA,
         },
     )
-    assert getattr(llama_model_config.hf_config, "rope_scaling",
-                   None) == TEST_ROPE_SCALING
-    assert getattr(llama_model_config.hf_config, "rope_theta",
-                   None) == TEST_ROPE_THETA
+    assert (
+        getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING
+    )
+    assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA
     assert llama_model_config.max_model_len == 16384
 
     longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
     # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
     assert all(
         longchat_model_config.hf_config.rope_scaling.get(key) == value
-        for key, value in LONGCHAT_ROPE_SCALING.items())
+        for key, value in LONGCHAT_ROPE_SCALING.items()
+    )
     assert longchat_model_config.max_model_len == 16384
 
     longchat_model_config = ModelConfig(
@@ -283,28 +285,68 @@ def test_rope_customization():
             "rope_scaling": TEST_ROPE_SCALING,
         },
     )
-    assert getattr(longchat_model_config.hf_config, "rope_scaling",
-                   None) == TEST_ROPE_SCALING
+    assert (
+        getattr(longchat_model_config.hf_config, "rope_scaling", None)
+        == TEST_ROPE_SCALING
+    )
     assert longchat_model_config.max_model_len == 4096
 
 
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="Encoder Decoder models not supported on ROCm.")
-@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
-    ("facebook/opt-125m", False),
-    ("openai/whisper-tiny", True),
-    ("meta-llama/Llama-3.2-1B-Instruct", False),
-])
+def test_nested_hf_overrides():
+    """Test that nested hf_overrides work correctly."""
+    # Test with a model that has text_config
+    model_config = ModelConfig(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        hf_overrides={
+            "text_config": {
+                "hidden_size": 1024,
+            },
+        },
+    )
+    assert model_config.hf_config.text_config.hidden_size == 1024
+
+    # Test with deeply nested overrides
+    model_config = ModelConfig(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        hf_overrides={
+            "text_config": {
+                "hidden_size": 2048,
+                "num_attention_heads": 16,
+            },
+            "vision_config": {
+                "hidden_size": 512,
+            },
+        },
+    )
+    assert model_config.hf_config.text_config.hidden_size == 2048
+    assert model_config.hf_config.text_config.num_attention_heads == 16
+    assert model_config.hf_config.vision_config.hidden_size == 512
+
+
+@pytest.mark.skipif(
+    current_platform.is_rocm(), reason="Encoder Decoder models not supported on ROCm."
+)
+@pytest.mark.parametrize(
+    ("model_id", "is_encoder_decoder"),
+    [
+        ("facebook/opt-125m", False),
+        ("openai/whisper-tiny", True),
+        ("meta-llama/Llama-3.2-1B-Instruct", False),
+    ],
+)
 def test_is_encoder_decoder(model_id, is_encoder_decoder):
     config = ModelConfig(model_id)
 
     assert config.is_encoder_decoder == is_encoder_decoder
 
 
-@pytest.mark.parametrize(("model_id", "uses_mrope"), [
-    ("facebook/opt-125m", False),
-    ("Qwen/Qwen2-VL-2B-Instruct", True),
-])
+@pytest.mark.parametrize(
+    ("model_id", "uses_mrope"),
+    [
+        ("facebook/opt-125m", False),
+        ("Qwen/Qwen2-VL-2B-Instruct", True),
+    ],
+)
 def test_uses_mrope(model_id, uses_mrope):
     config = ModelConfig(model_id)
 
@@ -338,7 +380,8 @@ def test_generation_config_loading():
     model_config = ModelConfig(
         model_id,
         generation_config="auto",
-        override_generation_config=override_generation_config)
+        override_generation_config=override_generation_config,
+    )
 
     override_result = correct_generation_config.copy()
     override_result.update(override_generation_config)
@@ -350,17 +393,19 @@ def test_generation_config_loading():
     model_config = ModelConfig(
         model_id,
         generation_config="vllm",
-        override_generation_config=override_generation_config)
+        override_generation_config=override_generation_config,
+    )
 
     assert model_config.get_diff_sampling_param() == override_generation_config
 
 
-@pytest.mark.parametrize("pt_load_map_location", [
-    "cuda",
-    {
-        "": "cuda"
-    },
-])
+@pytest.mark.parametrize(
+    "pt_load_map_location",
+    [
+        "cuda",
+        {"": "cuda"},
+    ],
+)
 def test_load_config_pt_load_map_location(pt_load_map_location):
     load_config = LoadConfig(pt_load_map_location=pt_load_map_location)
     config = VllmConfig(load_config=load_config)
@@ -369,15 +414,18 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
 
 
 @pytest.mark.parametrize(
-    ("model_id", "max_model_len", "expected_max_len", "should_raise"), [
+    ("model_id", "max_model_len", "expected_max_len", "should_raise"),
+    [
         ("BAAI/bge-reranker-base", None, 512, False),
         ("BAAI/bge-reranker-base", 256, 256, False),
         ("BAAI/bge-reranker-base", 513, 512, True),
         ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", None, 131072, False),
         ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
-    ])
-def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
-                                should_raise):
+    ],
+)
+def test_get_and_verify_max_len(
+    model_id, max_model_len, expected_max_len, should_raise
+):
     """Test get_and_verify_max_len with different configurations."""
     model_config = ModelConfig(model_id)
 
@@ -398,11 +446,14 @@ class MockConfig:
         self.model_weights = None
 
 
-@pytest.mark.parametrize("s3_url", [
-    "s3://example-bucket-1/model/",
-    "s3://example-bucket-2/model/",
-])
-@patch('vllm.transformers_utils.runai_utils.ObjectStorageModel.pull_files')
+@pytest.mark.parametrize(
+    "s3_url",
+    [
+        "s3://example-bucket-1/model/",
+        "s3://example-bucket-2/model/",
+    ],
+)
+@patch("vllm.transformers_utils.runai_utils.ObjectStorageModel.pull_files")
 def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
     """Test that S3 URLs create deterministic local directories for model and
     tokenizer."""
@@ -414,22 +465,24 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
     ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
 
     # Check that model and tokenizer point to existing directories
-    assert os.path.exists(
-        config1.model), f"Model directory does not exist: {config1.model}"
-    assert os.path.isdir(
-        config1.model), f"Model path is not a directory: {config1.model}"
-    assert os.path.exists(
-        config1.tokenizer
-    ), f"Tokenizer directory does not exist: {config1.tokenizer}"
-    assert os.path.isdir(
-        config1.tokenizer
-    ), f"Tokenizer path is not a directory: {config1.tokenizer}"
+    assert os.path.exists(config1.model), (
+        f"Model directory does not exist: {config1.model}"
+    )
+    assert os.path.isdir(config1.model), (
+        f"Model path is not a directory: {config1.model}"
+    )
+    assert os.path.exists(config1.tokenizer), (
+        f"Tokenizer directory does not exist: {config1.tokenizer}"
+    )
+    assert os.path.isdir(config1.tokenizer), (
+        f"Tokenizer path is not a directory: {config1.tokenizer}"
+    )
 
     # Verify that the paths are different from the original S3 URL
-    assert config1.model != s3_url, (
-        "Model path should be converted to local directory")
+    assert config1.model != s3_url, "Model path should be converted to local directory"
     assert config1.tokenizer != s3_url, (
-        "Tokenizer path should be converted to local directory")
+        "Tokenizer path should be converted to local directory"
+    )
 
     # Store the original paths
     created_model_dir = config1.model
@@ -440,27 +493,31 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
     ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
 
     # Check that the new directories exist
-    assert os.path.exists(
-        config2.model), f"Model directory does not exist: {config2.model}"
-    assert os.path.isdir(
-        config2.model), f"Model path is not a directory: {config2.model}"
-    assert os.path.exists(
-        config2.tokenizer
-    ), f"Tokenizer directory does not exist: {config2.tokenizer}"
-    assert os.path.isdir(
-        config2.tokenizer
-    ), f"Tokenizer path is not a directory: {config2.tokenizer}"
+    assert os.path.exists(config2.model), (
+        f"Model directory does not exist: {config2.model}"
+    )
+    assert os.path.isdir(config2.model), (
+        f"Model path is not a directory: {config2.model}"
+    )
+    assert os.path.exists(config2.tokenizer), (
+        f"Tokenizer directory does not exist: {config2.tokenizer}"
+    )
+    assert os.path.isdir(config2.tokenizer), (
+        f"Tokenizer path is not a directory: {config2.tokenizer}"
+    )
 
     # Verify that the paths are deterministic (same as before)
     assert config2.model == created_model_dir, (
         f"Model paths are not deterministic. "
-        f"Original: {created_model_dir}, New: {config2.model}")
+        f"Original: {created_model_dir}, New: {config2.model}"
+    )
     assert config2.tokenizer == create_tokenizer_dir, (
         f"Tokenizer paths are not deterministic. "
-        f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}")
+        f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
+    )
 
 
-@patch('vllm.transformers_utils.runai_utils.ObjectStorageModel.pull_files')
+@patch("vllm.transformers_utils.runai_utils.ObjectStorageModel.pull_files")
 def test_s3_url_different_models_create_different_directories(mock_pull_files):
     """Test that different S3 URLs create different local directories."""
     # Mock pull_files to avoid actually downloading files during tests
@@ -479,16 +536,16 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
     # Verify that different URLs produce different directories
     assert config1.model != config2.model, (
         f"Different S3 URLs should create different model directories. "
-        f"URL1 model: {config1.model}, URL2 model: {config2.model}")
+        f"URL1 model: {config1.model}, URL2 model: {config2.model}"
+    )
     assert config1.tokenizer != config2.tokenizer, (
         f"Different S3 URLs should create different tokenizer directories. "
         f"URL1 tokenizer: {config1.tokenizer}, "
-        f"URL2 tokenizer: {config2.tokenizer}")
+        f"URL2 tokenizer: {config2.tokenizer}"
+    )
 
     # Verify that both sets of directories exist
     assert os.path.exists(config1.model) and os.path.isdir(config1.model)
-    assert os.path.exists(config1.tokenizer) and os.path.isdir(
-        config1.tokenizer)
+    assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
     assert os.path.exists(config2.model) and os.path.isdir(config2.model)
-    assert os.path.exists(config2.tokenizer) and os.path.isdir(
-        config2.tokenizer)
+    assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py
index b9593e2a3b7c04dbc60dd34b622cca5093031529..687a15446fc2a0202af8d49ecb7dc1467472773a 100644
--- a/tests/test_embedded_commit.py
+++ b/tests/test_embedded_commit.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import vllm
-
-
-def test_embedded_commit_defined():
-    assert hasattr(vllm, "__version__")
-    assert hasattr(vllm, "__version_tuple__")
-    assert vllm.__version__ != "dev"
-    assert vllm.__version_tuple__ != (0, 0, "dev")
+import vllm
+
+
+def test_embedded_commit_defined():
+    assert hasattr(vllm, "__version__")
+    assert hasattr(vllm, "__version_tuple__")
+    assert vllm.__version__ != "dev"
+    assert vllm.__version_tuple__ != (0, 0, "dev")
diff --git a/tests/test_envs.py b/tests/test_envs.py
index f81a6e2e415cd4855919a63c61fa6d5430c9762a..6a9835a68e7e218a71818fedd8333d24d783ab74 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -6,7 +6,55 @@ from unittest.mock import patch
 
 import pytest
 
-from vllm.envs import env_list_with_choices, env_with_choices
+import vllm.envs as envs
+from vllm.envs import (
+    enable_envs_cache,
+    env_list_with_choices,
+    env_set_with_choices,
+    env_with_choices,
+    environment_variables,
+)
+
+
+def test_getattr_without_cache(monkeypatch: pytest.MonkeyPatch):
+    assert envs.VLLM_HOST_IP == ""
+    assert envs.VLLM_PORT is None
+    monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
+    monkeypatch.setenv("VLLM_PORT", "1234")
+    assert envs.VLLM_HOST_IP == "1.1.1.1"
+    assert envs.VLLM_PORT == 1234
+    # __getattr__ is not decorated with functools.cache
+    assert not hasattr(envs.__getattr__, "cache_info")
+
+
+def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
+    monkeypatch.setenv("VLLM_PORT", "1234")
+    # __getattr__ is not decorated with functools.cache
+    assert not hasattr(envs.__getattr__, "cache_info")
+
+    # Enable envs cache and ignore ongoing environment changes
+    enable_envs_cache()
+
+    # __getattr__ is decorated with functools.cache
+    assert hasattr(envs.__getattr__, "cache_info")
+    start_hits = envs.__getattr__.cache_info().hits
+
+    # 2 more hits due to VLLM_HOST_IP and VLLM_PORT accesses
+    assert envs.VLLM_HOST_IP == "1.1.1.1"
+    assert envs.VLLM_PORT == 1234
+    assert envs.__getattr__.cache_info().hits == start_hits + 2
+
+    # All environment variables are cached
+    for environment_variable in environment_variables:
+        envs.__getattr__(environment_variable)
+    assert envs.__getattr__.cache_info().hits == start_hits + 2 + len(
+        environment_variables
+    )
+
+    # Reset envs.__getattr__ back to none-cached version to
+    # avoid affecting other tests
+    envs.__getattr__ = envs.__getattr__.__wrapped__
 
 
 class TestEnvWithChoices:
@@ -14,68 +62,71 @@ class TestEnvWithChoices:
 
     def test_default_value_returned_when_env_not_set(self):
         """Test default is returned when env var is not set."""
-        env_func = env_with_choices("NONEXISTENT_ENV", "default",
-                                    ["option1", "option2"])
+        env_func = env_with_choices(
+            "NONEXISTENT_ENV", "default", ["option1", "option2"]
+        )
         assert env_func() == "default"
 
     def test_none_default_returned_when_env_not_set(self):
         """Test that None is returned when env not set and default is None."""
-        env_func = env_with_choices("NONEXISTENT_ENV", None,
-                                    ["option1", "option2"])
+        env_func = env_with_choices("NONEXISTENT_ENV", None, ["option1", "option2"])
         assert env_func() is None
 
     def test_valid_value_returned_case_sensitive(self):
         """Test that valid value is returned in case sensitive mode."""
         with patch.dict(os.environ, {"TEST_ENV": "option1"}):
-            env_func = env_with_choices("TEST_ENV",
-                                        "default", ["option1", "option2"],
-                                        case_sensitive=True)
+            env_func = env_with_choices(
+                "TEST_ENV", "default", ["option1", "option2"], case_sensitive=True
+            )
             assert env_func() == "option1"
 
     def test_valid_lowercase_value_returned_case_insensitive(self):
         """Test that lowercase value is accepted in case insensitive mode."""
         with patch.dict(os.environ, {"TEST_ENV": "option1"}):
-            env_func = env_with_choices("TEST_ENV",
-                                        "default", ["OPTION1", "OPTION2"],
-                                        case_sensitive=False)
+            env_func = env_with_choices(
+                "TEST_ENV", "default", ["OPTION1", "OPTION2"], case_sensitive=False
+            )
             assert env_func() == "option1"
 
     def test_valid_uppercase_value_returned_case_insensitive(self):
         """Test that uppercase value is accepted in case insensitive mode."""
         with patch.dict(os.environ, {"TEST_ENV": "OPTION1"}):
-            env_func = env_with_choices("TEST_ENV",
-                                        "default", ["option1", "option2"],
-                                        case_sensitive=False)
+            env_func = env_with_choices(
+                "TEST_ENV", "default", ["option1", "option2"], case_sensitive=False
+            )
             assert env_func() == "OPTION1"
 
     def test_invalid_value_raises_error_case_sensitive(self):
         """Test that invalid value raises ValueError in case sensitive mode."""
         with patch.dict(os.environ, {"TEST_ENV": "invalid"}):
-            env_func = env_with_choices("TEST_ENV",
-                                        "default", ["option1", "option2"],
-                                        case_sensitive=True)
-            with pytest.raises(ValueError,
-                               match="Invalid value 'invalid' for TEST_ENV"):
+            env_func = env_with_choices(
+                "TEST_ENV", "default", ["option1", "option2"], case_sensitive=True
+            )
+            with pytest.raises(
+                ValueError, match="Invalid value 'invalid' for TEST_ENV"
+            ):
                 env_func()
 
     def test_case_mismatch_raises_error_case_sensitive(self):
         """Test that case mismatch raises ValueError in case sensitive mode."""
         with patch.dict(os.environ, {"TEST_ENV": "OPTION1"}):
-            env_func = env_with_choices("TEST_ENV",
-                                        "default", ["option1", "option2"],
-                                        case_sensitive=True)
-            with pytest.raises(ValueError,
-                               match="Invalid value 'OPTION1' for TEST_ENV"):
+            env_func = env_with_choices(
+                "TEST_ENV", "default", ["option1", "option2"], case_sensitive=True
+            )
+            with pytest.raises(
+                ValueError, match="Invalid value 'OPTION1' for TEST_ENV"
+            ):
                 env_func()
 
     def test_invalid_value_raises_error_case_insensitive(self):
         """Test that invalid value raises ValueError when case insensitive."""
         with patch.dict(os.environ, {"TEST_ENV": "invalid"}):
-            env_func = env_with_choices("TEST_ENV",
-                                        "default", ["option1", "option2"],
-                                        case_sensitive=False)
-            with pytest.raises(ValueError,
-                               match="Invalid value 'invalid' for TEST_ENV"):
+            env_func = env_with_choices(
+                "TEST_ENV", "default", ["option1", "option2"], case_sensitive=False
+            )
+            with pytest.raises(
+                ValueError, match="Invalid value 'invalid' for TEST_ENV"
+            ):
                 env_func()
 
     def test_callable_choices_resolved_correctly(self):
@@ -96,8 +147,9 @@ class TestEnvWithChoices:
 
         with patch.dict(os.environ, {"TEST_ENV": "invalid"}):
             env_func = env_with_choices("TEST_ENV", "default", get_choices)
-            with pytest.raises(ValueError,
-                               match="Invalid value 'invalid' for TEST_ENV"):
+            with pytest.raises(
+                ValueError, match="Invalid value 'invalid' for TEST_ENV"
+            ):
                 env_func()
 
 
@@ -106,84 +158,78 @@ class TestEnvListWithChoices:
 
     def test_default_list_returned_when_env_not_set(self):
         """Test that default list is returned when env var is not set."""
-        env_func = env_list_with_choices("NONEXISTENT_ENV",
-                                         ["default1", "default2"],
-                                         ["option1", "option2"])
+        env_func = env_list_with_choices(
+            "NONEXISTENT_ENV", ["default1", "default2"], ["option1", "option2"]
+        )
         assert env_func() == ["default1", "default2"]
 
     def test_empty_default_list_returned_when_env_not_set(self):
         """Test that empty default list is returned when env not set."""
-        env_func = env_list_with_choices("NONEXISTENT_ENV", [],
-                                         ["option1", "option2"])
+        env_func = env_list_with_choices("NONEXISTENT_ENV", [], ["option1", "option2"])
         assert env_func() == []
 
     def test_single_valid_value_parsed_correctly(self):
         """Test that single valid value is parsed correctly."""
         with patch.dict(os.environ, {"TEST_ENV": "option1"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"])
             assert env_func() == ["option1"]
 
     def test_multiple_valid_values_parsed_correctly(self):
         """Test that multiple valid values are parsed correctly."""
         with patch.dict(os.environ, {"TEST_ENV": "option1,option2"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"])
             assert env_func() == ["option1", "option2"]
 
     def test_values_with_whitespace_trimmed(self):
         """Test that values with whitespace are trimmed correctly."""
         with patch.dict(os.environ, {"TEST_ENV": " option1 , option2 "}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"])
             assert env_func() == ["option1", "option2"]
 
     def test_empty_values_filtered_out(self):
         """Test that empty values are filtered out."""
         with patch.dict(os.environ, {"TEST_ENV": "option1,,option2,"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"])
             assert env_func() == ["option1", "option2"]
 
     def test_empty_string_returns_default(self):
         """Test that empty string returns default."""
         with patch.dict(os.environ, {"TEST_ENV": ""}):
-            env_func = env_list_with_choices("TEST_ENV", ["default"],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices(
+                "TEST_ENV", ["default"], ["option1", "option2"]
+            )
             assert env_func() == ["default"]
 
     def test_only_commas_returns_default(self):
         """Test that string with only commas returns default."""
         with patch.dict(os.environ, {"TEST_ENV": ",,,"}):
-            env_func = env_list_with_choices("TEST_ENV", ["default"],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices(
+                "TEST_ENV", ["default"], ["option1", "option2"]
+            )
             assert env_func() == ["default"]
 
     def test_case_sensitive_validation(self):
         """Test case sensitive validation."""
         with patch.dict(os.environ, {"TEST_ENV": "option1,OPTION2"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"],
-                                             case_sensitive=True)
-            with pytest.raises(ValueError,
-                               match="Invalid value 'OPTION2' in TEST_ENV"):
+            env_func = env_list_with_choices(
+                "TEST_ENV", [], ["option1", "option2"], case_sensitive=True
+            )
+            with pytest.raises(ValueError, match="Invalid value 'OPTION2' in TEST_ENV"):
                 env_func()
 
     def test_case_insensitive_validation(self):
         """Test case insensitive validation."""
         with patch.dict(os.environ, {"TEST_ENV": "OPTION1,option2"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"],
-                                             case_sensitive=False)
+            env_func = env_list_with_choices(
+                "TEST_ENV", [], ["option1", "option2"], case_sensitive=False
+            )
             assert env_func() == ["OPTION1", "option2"]
 
     def test_invalid_value_in_list_raises_error(self):
         """Test that invalid value in list raises ValueError."""
         with patch.dict(os.environ, {"TEST_ENV": "option1,invalid,option2"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"])
-            with pytest.raises(ValueError,
-                               match="Invalid value 'invalid' in TEST_ENV"):
+            env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"])
+            with pytest.raises(ValueError, match="Invalid value 'invalid' in TEST_ENV"):
                 env_func()
 
     def test_callable_choices_resolved_correctly(self):
@@ -204,13 +250,118 @@ class TestEnvListWithChoices:
 
         with patch.dict(os.environ, {"TEST_ENV": "dynamic1,invalid"}):
             env_func = env_list_with_choices("TEST_ENV", [], get_choices)
-            with pytest.raises(ValueError,
-                               match="Invalid value 'invalid' in TEST_ENV"):
+            with pytest.raises(ValueError, match="Invalid value 'invalid' in TEST_ENV"):
                 env_func()
 
     def test_duplicate_values_preserved(self):
         """Test that duplicate values in the list are preserved."""
         with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}):
-            env_func = env_list_with_choices("TEST_ENV", [],
-                                             ["option1", "option2"])
+            env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"])
             assert env_func() == ["option1", "option1", "option2"]
+
+
+class TestEnvSetWithChoices:
+    """Test cases for env_set_with_choices function."""
+
+    def test_default_list_returned_when_env_not_set(self):
+        """Test that default list is returned when env var is not set."""
+        env_func = env_set_with_choices(
+            "NONEXISTENT_ENV", ["default1", "default2"], ["option1", "option2"]
+        )
+        assert env_func() == {"default1", "default2"}
+
+    def test_empty_default_list_returned_when_env_not_set(self):
+        """Test that empty default list is returned when env not set."""
+        env_func = env_set_with_choices("NONEXISTENT_ENV", [], ["option1", "option2"])
+        assert env_func() == set()
+
+    def test_single_valid_value_parsed_correctly(self):
+        """Test that single valid value is parsed correctly."""
+        with patch.dict(os.environ, {"TEST_ENV": "option1"}):
+            env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
+            assert env_func() == {"option1"}
+
+    def test_multiple_valid_values_parsed_correctly(self):
+        """Test that multiple valid values are parsed correctly."""
+        with patch.dict(os.environ, {"TEST_ENV": "option1,option2"}):
+            env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
+            assert env_func() == {"option1", "option2"}
+
+    def test_values_with_whitespace_trimmed(self):
+        """Test that values with whitespace are trimmed correctly."""
+        with patch.dict(os.environ, {"TEST_ENV": " option1 , option2 "}):
+            env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
+            assert env_func() == {"option1", "option2"}
+
+    def test_empty_values_filtered_out(self):
+        """Test that empty values are filtered out."""
+        with patch.dict(os.environ, {"TEST_ENV": "option1,,option2,"}):
+            env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
+            assert env_func() == {"option1", "option2"}
+
+    def test_empty_string_returns_default(self):
+        """Test that empty string returns default."""
+        with patch.dict(os.environ, {"TEST_ENV": ""}):
+            env_func = env_set_with_choices(
+                "TEST_ENV", ["default"], ["option1", "option2"]
+            )
+            assert env_func() == {"default"}
+
+    def test_only_commas_returns_default(self):
+        """Test that string with only commas returns default."""
+        with patch.dict(os.environ, {"TEST_ENV": ",,,"}):
+            env_func = env_set_with_choices(
+                "TEST_ENV", ["default"], ["option1", "option2"]
+            )
+            assert env_func() == {"default"}
+
+    def test_case_sensitive_validation(self):
+        """Test case sensitive validation."""
+        with patch.dict(os.environ, {"TEST_ENV": "option1,OPTION2"}):
+            env_func = env_set_with_choices(
+                "TEST_ENV", [], ["option1", "option2"], case_sensitive=True
+            )
+            with pytest.raises(ValueError, match="Invalid value 'OPTION2' in TEST_ENV"):
+                env_func()
+
+    def test_case_insensitive_validation(self):
+        """Test case insensitive validation."""
+        with patch.dict(os.environ, {"TEST_ENV": "OPTION1,option2"}):
+            env_func = env_set_with_choices(
+                "TEST_ENV", [], ["option1", "option2"], case_sensitive=False
+            )
+            assert env_func() == {"OPTION1", "option2"}
+
+    def test_invalid_value_in_list_raises_error(self):
+        """Test that invalid value in list raises ValueError."""
+        with patch.dict(os.environ, {"TEST_ENV": "option1,invalid,option2"}):
+            env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
+            with pytest.raises(ValueError, match="Invalid value 'invalid' in TEST_ENV"):
+                env_func()
+
+    def test_callable_choices_resolved_correctly(self):
+        """Test that callable choices are resolved correctly."""
+
+        def get_choices():
+            return ["dynamic1", "dynamic2"]
+
+        with patch.dict(os.environ, {"TEST_ENV": "dynamic1,dynamic2"}):
+            env_func = env_set_with_choices("TEST_ENV", [], get_choices)
+            assert env_func() == {"dynamic1", "dynamic2"}
+
+    def test_callable_choices_with_invalid_value(self):
+        """Test that callable choices raise error for invalid values."""
+
+        def get_choices():
+            return ["dynamic1", "dynamic2"]
+
+        with patch.dict(os.environ, {"TEST_ENV": "dynamic1,invalid"}):
+            env_func = env_set_with_choices("TEST_ENV", [], get_choices)
+            with pytest.raises(ValueError, match="Invalid value 'invalid' in TEST_ENV"):
+                env_func()
+
+    def test_duplicate_values_deduped(self):
+        """Test that duplicate values in the list are deduped."""
+        with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}):
+            env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
+            assert env_func() == {"option1", "option2"}
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index e549834faf6f71659a6584988be38706ee68939a..b1fb4e06a6906ffe12bc562fd500161c3610d37a 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -3,15 +3,20 @@
 
 import pytest
 
+from vllm.config import ModelConfig
 from vllm.inputs import zip_enc_dec_prompts
-from vllm.inputs.parse import parse_and_batch_prompt
+from vllm.inputs.parse import parse_raw_prompts
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+
+pytestmark = pytest.mark.cpu_test
 
 STRING_INPUTS = [
-    '',
-    'foo',
-    'foo bar',
-    'foo baz bar',
-    'foo bar qux baz',
+    "",
+    "foo",
+    "foo bar",
+    "foo baz bar",
+    "foo bar qux baz",
 ]
 
 TOKEN_INPUTS = [
@@ -29,52 +34,85 @@ INPUTS_SLICES = [
 ]
 
 
-def test_parse_single_batch_empty():
+def test_parse_raw_single_batch_empty():
     with pytest.raises(ValueError, match="at least one prompt"):
-        parse_and_batch_prompt([])
+        parse_raw_prompts([])
 
     with pytest.raises(ValueError, match="at least one prompt"):
-        parse_and_batch_prompt([[]])
+        parse_raw_prompts([[]])
 
 
-@pytest.mark.parametrize('string_input', STRING_INPUTS)
-def test_parse_single_batch_string_consistent(string_input: str):
-    assert parse_and_batch_prompt(string_input) \
-        == parse_and_batch_prompt([string_input])
+@pytest.mark.parametrize("string_input", STRING_INPUTS)
+def test_parse_raw_single_batch_string_consistent(string_input: str):
+    assert parse_raw_prompts(string_input) == parse_raw_prompts([string_input])
 
 
-@pytest.mark.parametrize('token_input', TOKEN_INPUTS)
-def test_parse_single_batch_token_consistent(token_input: list[int]):
-    assert parse_and_batch_prompt(token_input) \
-        == parse_and_batch_prompt([token_input])
+@pytest.mark.parametrize("token_input", TOKEN_INPUTS)
+def test_parse_raw_single_batch_token_consistent(token_input: list[int]):
+    assert parse_raw_prompts(token_input) == parse_raw_prompts([token_input])
 
 
-@pytest.mark.parametrize('inputs_slice', INPUTS_SLICES)
-def test_parse_single_batch_string_slice(inputs_slice: slice):
-    assert parse_and_batch_prompt(STRING_INPUTS)[inputs_slice] \
-        == parse_and_batch_prompt(STRING_INPUTS[inputs_slice])
+@pytest.mark.parametrize("inputs_slice", INPUTS_SLICES)
+def test_parse_raw_single_batch_string_slice(inputs_slice: slice):
+    assert parse_raw_prompts(STRING_INPUTS)[inputs_slice] == parse_raw_prompts(
+        STRING_INPUTS[inputs_slice]
+    )
 
 
-# yapf: disable
-@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [
-    (None, [{}, {}]),
-    ({}, [{}, {}]),
-    ({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
-    ([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
-])
-# yapf: enable
+@pytest.mark.parametrize(
+    "mm_processor_kwargs,expected_mm_kwargs",
+    [
+        (None, [{}, {}]),
+        ({}, [{}, {}]),
+        ({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
+        ([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
+    ],
+)
 def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
     """Test mm_processor_kwargs init for zipping enc/dec prompts."""
-    encoder_prompts = ['An encoder prompt', 'Another encoder prompt']
-    decoder_prompts = ['A decoder prompt', 'Another decoder prompt']
-    zipped_prompts = zip_enc_dec_prompts(encoder_prompts, decoder_prompts,
-                                         mm_processor_kwargs)
+    encoder_prompts = ["An encoder prompt", "Another encoder prompt"]
+    decoder_prompts = ["A decoder prompt", "Another decoder prompt"]
+    zipped_prompts = zip_enc_dec_prompts(
+        encoder_prompts, decoder_prompts, mm_processor_kwargs
+    )
     assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
-    for enc, dec, exp_kwargs, zipped in zip(encoder_prompts, decoder_prompts,
-                                            expected_mm_kwargs,
-                                            zipped_prompts):
+    for enc, dec, exp_kwargs, zipped in zip(
+        encoder_prompts, decoder_prompts, expected_mm_kwargs, zipped_prompts
+    ):
         assert isinstance(zipped, dict)
         assert len(zipped.keys()) == 3
-        assert zipped['encoder_prompt'] == enc
-        assert zipped['decoder_prompt'] == dec
-        assert zipped['mm_processor_kwargs'] == exp_kwargs
+        assert zipped["encoder_prompt"] == enc
+        assert zipped["decoder_prompt"] == dec
+        assert zipped["mm_processor_kwargs"] == exp_kwargs
+
+
+@pytest.mark.parametrize(
+    "model_id",
+    [
+        "facebook/chameleon-7b",
+    ],
+)
+@pytest.mark.parametrize(
+    "prompt",
+    [
+        "",
+        {"prompt_token_ids": []},
+    ],
+)
+@pytest.mark.skip(
+    reason=(
+        "Applying huggingface processor on text inputs results in "
+        "significant performance regression for multimodal models. "
+        "See https://github.com/vllm-project/vllm/issues/26320"
+    )
+)
+def test_preprocessor_always_mm_code_path(model_id, prompt):
+    model_config = ModelConfig(model=model_id)
+    tokenizer = init_tokenizer_from_configs(model_config)
+    input_preprocessor = InputPreprocessor(model_config, tokenizer)
+
+    # HF processor adds sep token
+    sep_token_id = tokenizer.vocab[tokenizer.sep_token]
+
+    processed_inputs = input_preprocessor.preprocess(prompt)
+    assert sep_token_id in processed_inputs["prompt_token_ids"]
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 0bfb449cdf213162086440075d3531f16204d0bd..01672358902f98451dd8b92b9c84ce197df950fd 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -16,8 +16,13 @@ from uuid import uuid4
 import pytest
 
 from vllm.entrypoints.logger import RequestLogger
-from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger,
-                         enable_trace_function_call, init_logger)
+from vllm.logger import (
+    _DATE_FORMAT,
+    _FORMAT,
+    _configure_vllm_root_logger,
+    enable_trace_function_call,
+    init_logger,
+)
 from vllm.logging_utils import NewLineFormatter
 from vllm.logging_utils.dump_input import prepare_object_to_dump
 
@@ -129,8 +134,7 @@ def test_an_error_is_raised_when_custom_logging_config_is_invalid_json():
     with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
         logging_config_file.write("---\nloggers: []\nversion: 1")
         logging_config_file.flush()
-        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
-                   logging_config_file.name):
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name):
             with pytest.raises(JSONDecodeError) as ex_info:
                 _configure_vllm_root_logger()
             assert ex_info.type == JSONDecodeError
@@ -138,24 +142,24 @@ def test_an_error_is_raised_when_custom_logging_config_is_invalid_json():
 
 
 @patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
-@pytest.mark.parametrize("unexpected_config", (
-    "Invalid string",
-    [{
-        "version": 1,
-        "loggers": []
-    }],
-    0,
-))
+@pytest.mark.parametrize(
+    "unexpected_config",
+    (
+        "Invalid string",
+        [{"version": 1, "loggers": []}],
+        0,
+    ),
+)
 def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json(
-        unexpected_config: Any):
+    unexpected_config: Any,
+):
     """This test calls _configure_vllm_root_logger again to test custom logging
     config behavior, however it fails before any change in behavior or
     configuration occurs."""
     with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
         logging_config_file.write(json.dumps(unexpected_config))
         logging_config_file.flush()
-        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
-                   logging_config_file.name):
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name):
             with pytest.raises(ValueError) as ex_info:
                 _configure_vllm_root_logger()
             assert ex_info.type == ValueError  # noqa: E721
@@ -174,14 +178,15 @@ def test_custom_logging_config_is_parsed_and_used_when_provided():
                 "propagate": False,
             }
         },
-        "version": 1
+        "version": 1,
     }
     with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
         logging_config_file.write(json.dumps(valid_logging_config))
         logging_config_file.flush()
-        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
-                   logging_config_file.name), patch(
-                       "vllm.logger.dictConfig") as dict_config_mock:
+        with (
+            patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name),
+            patch("vllm.logger.dictConfig") as dict_config_mock,
+        ):
             _configure_vllm_root_logger()
             dict_config_mock.assert_called_with(valid_logging_config)
 
@@ -197,19 +202,19 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off():
                 "handlers": [],
             }
         },
-        "version": 1
+        "version": 1,
     }
     with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
         logging_config_file.write(json.dumps(valid_logging_config))
         logging_config_file.flush()
-        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
-                   logging_config_file.name):
+        with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name):
             with pytest.raises(RuntimeError) as ex_info:
                 _configure_vllm_root_logger()
             assert ex_info.type is RuntimeError
             expected_message_snippet = (
                 "VLLM_CONFIGURE_LOGGING evaluated to false, but "
-                "VLLM_LOGGING_CONFIG_PATH was given.")
+                "VLLM_LOGGING_CONFIG_PATH was given."
+            )
             assert expected_message_snippet in str(ex_info)
 
         # Remember! The root logger is assumed to have been configured as
@@ -223,11 +228,11 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off():
 
 
 def test_prepare_object_to_dump():
-    str_obj = 'str'
+    str_obj = "str"
     assert prepare_object_to_dump(str_obj) == "'str'"
 
     list_obj = [1, 2, 3]
-    assert prepare_object_to_dump(list_obj) == '[1, 2, 3]'
+    assert prepare_object_to_dump(list_obj) == "[1, 2, 3]"
 
     dict_obj = {"a": 1, "b": "b"}
     assert prepare_object_to_dump(dict_obj) in [
@@ -236,9 +241,9 @@ def test_prepare_object_to_dump():
     ]
 
     set_obj = {1, 2, 3}
-    assert prepare_object_to_dump(set_obj) == '[1, 2, 3]'
+    assert prepare_object_to_dump(set_obj) == "[1, 2, 3]"
 
-    tuple_obj = ('a', 'b', 'c')
+    tuple_obj = ("a", "b", "c")
     assert prepare_object_to_dump(tuple_obj) == "['a', 'b', 'c']"
 
     class CustomEnum(enum.Enum):
@@ -253,8 +258,7 @@ def test_prepare_object_to_dump():
         a: int
         b: str
 
-    assert (prepare_object_to_dump(CustomClass(
-        1, "b")) == "CustomClass(a=1, b='b')")
+    assert prepare_object_to_dump(CustomClass(1, "b")) == "CustomClass(a=1, b='b')"
 
 
 def test_request_logger_log_outputs():
@@ -467,7 +471,7 @@ def test_request_logger_log_outputs_integration():
 
 def test_streaming_complete_logs_full_text_content():
     """Test that streaming complete logging includes
-      full accumulated text, not just token count."""
+    full accumulated text, not just token count."""
     mock_logger = MagicMock()
 
     with patch("vllm.entrypoints.logger.logger", mock_logger):
@@ -497,3 +501,49 @@ def test_streaming_complete_logs_full_text_content():
         assert call_args[1] == "test-streaming-full-text"
         assert call_args[2] == " (streaming complete)"
         assert call_args[5] == "streaming_complete"
+
+
+# Add vllm prefix to make sure logs go through the vllm logger
+test_logger = init_logger("vllm.test_logger")
+
+
+def mp_function(**kwargs):
+    # This function runs in a subprocess
+
+    test_logger.warning("This is a subprocess: %s", kwargs.get("a"))
+    test_logger.error("This is a subprocess error.")
+    test_logger.debug("This is a subprocess debug message: %s.", kwargs.get("b"))
+
+
+def test_caplog_mp_fork(caplog_vllm, caplog_mp_fork):
+    with caplog_vllm.at_level(logging.DEBUG), caplog_mp_fork():
+        import multiprocessing
+
+        ctx = multiprocessing.get_context("fork")
+        p = ctx.Process(
+            target=mp_function,
+            name=f"SubProcess{1}",
+            kwargs={"a": "AAAA", "b": "BBBBB"},
+        )
+        p.start()
+        p.join()
+
+    assert "AAAA" in caplog_vllm.text
+    assert "BBBBB" in caplog_vllm.text
+
+
+def test_caplog_mp_spawn(caplog_mp_spawn):
+    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+        import multiprocessing
+
+        ctx = multiprocessing.get_context("spawn")
+        p = ctx.Process(
+            target=mp_function,
+            name=f"SubProcess{1}",
+            kwargs={"a": "AAAA", "b": "BBBBB"},
+        )
+        p.start()
+        p.join()
+
+    assert "AAAA" in log_holder.text
+    assert "BBBBB" in log_holder.text
diff --git a/tests/test_logprobs.py b/tests/test_logprobs.py
new file mode 100644
index 0000000000000000000000000000000000000000..d26a460d2bcab00dfc83c996bbd6d4f849e0cbb2
--- /dev/null
+++ b/tests/test_logprobs.py
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from vllm.logprobs import (
+    FlatLogprobs,
+    Logprob,
+    LogprobsOnePosition,
+    append_logprobs_for_next_position,
+    create_prompt_logprobs,
+    create_sample_logprobs,
+)
+
+
+def test_create_logprobs_non_flat(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0")
+
+    prompt_logprobs = create_prompt_logprobs()
+    assert isinstance(prompt_logprobs, list)
+    # Ensure first prompt position logprobs is None
+    assert len(prompt_logprobs) == 1
+    assert prompt_logprobs[0] is None
+
+    sample_logprobs = create_sample_logprobs()
+    assert isinstance(sample_logprobs, list)
+    assert len(sample_logprobs) == 0
+
+
+def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1")
+
+    prompt_logprobs = create_prompt_logprobs()
+    assert isinstance(prompt_logprobs, FlatLogprobs)
+    assert prompt_logprobs.start_indices == [0]
+    assert prompt_logprobs.end_indices == [0]
+    assert len(prompt_logprobs.token_ids) == 0
+    assert len(prompt_logprobs.logprobs) == 0
+    assert len(prompt_logprobs.ranks) == 0
+    assert len(prompt_logprobs.decoded_tokens) == 0
+    # Ensure first prompt position logprobs is empty
+    assert len(prompt_logprobs) == 1
+    assert prompt_logprobs[0] == dict()
+
+    sample_logprobs = create_sample_logprobs()
+    assert isinstance(sample_logprobs, FlatLogprobs)
+    assert len(sample_logprobs.start_indices) == 0
+    assert len(sample_logprobs.end_indices) == 0
+    assert len(sample_logprobs.token_ids) == 0
+    assert len(sample_logprobs.logprobs) == 0
+    assert len(sample_logprobs.ranks) == 0
+    assert len(sample_logprobs.decoded_tokens) == 0
+    assert len(sample_logprobs) == 0
+
+
+def test_append_logprobs_for_next_position_none_flat(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0")
+    logprobs = create_sample_logprobs()
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[1],
+        logprobs=[0.1],
+        decoded_tokens=["1"],
+        rank=10,
+        num_logprobs=-1,
+    )
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[2, 3],
+        logprobs=[0.2, 0.3],
+        decoded_tokens=["2", "3"],
+        rank=11,
+        num_logprobs=-1,
+    )
+    assert isinstance(logprobs, list)
+    assert logprobs == [
+        {1: Logprob(logprob=0.1, rank=10, decoded_token="1")},
+        {
+            2: Logprob(logprob=0.2, rank=11, decoded_token="2"),
+            3: Logprob(logprob=0.3, rank=1, decoded_token="3"),
+        },
+    ]
+
+
+def test_append_logprobs_for_next_position_flat(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1")
+    logprobs = create_sample_logprobs()
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[1],
+        logprobs=[0.1],
+        decoded_tokens=["1"],
+        rank=10,
+        num_logprobs=-1,
+    )
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[2, 3],
+        logprobs=[0.2, 0.3],
+        decoded_tokens=["2", "3"],
+        rank=11,
+        num_logprobs=-1,
+    )
+    assert isinstance(logprobs, FlatLogprobs)
+    assert logprobs.start_indices == [0, 1]
+    assert logprobs.end_indices == [1, 3]
+    assert logprobs.token_ids == [1, 2, 3]
+    assert logprobs.logprobs == [0.1, 0.2, 0.3]
+    assert logprobs.ranks == [10, 11, 1]
+    assert logprobs.decoded_tokens == ["1", "2", "3"]
+
+
+LOGPROBS_ONE_POSITION_0: LogprobsOnePosition = {
+    1: Logprob(logprob=0.1, rank=10, decoded_token="10")
+}
+LOGPROBS_ONE_POSITION_1: LogprobsOnePosition = {
+    2: Logprob(logprob=0.2, rank=20, decoded_token="20"),
+    3: Logprob(logprob=0.3, rank=30, decoded_token="30"),
+}
+LOGPROBS_ONE_POSITION_2: LogprobsOnePosition = {
+    4: Logprob(logprob=0.4, rank=40, decoded_token="40"),
+    5: Logprob(logprob=0.5, rank=50, decoded_token="50"),
+    6: Logprob(logprob=0.6, rank=60, decoded_token="60"),
+}
+
+
+def test_flat_logprobs_append() -> None:
+    logprobs = FlatLogprobs()
+    logprobs.append(LOGPROBS_ONE_POSITION_0)
+    logprobs.append(LOGPROBS_ONE_POSITION_1)
+    assert logprobs.start_indices == [0, 1]
+    assert logprobs.end_indices == [1, 3]
+    assert logprobs.token_ids == [1, 2, 3]
+    assert logprobs.logprobs == [0.1, 0.2, 0.3]
+    assert logprobs.ranks == [10, 20, 30]
+    assert logprobs.decoded_tokens == ["10", "20", "30"]
+
+    logprobs.append(LOGPROBS_ONE_POSITION_2)
+    assert logprobs.start_indices == [0, 1, 3]
+    assert logprobs.end_indices == [1, 3, 6]
+    assert logprobs.token_ids == [1, 2, 3, 4, 5, 6]
+    assert logprobs.logprobs == [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
+    assert logprobs.ranks == [10, 20, 30, 40, 50, 60]
+    assert logprobs.decoded_tokens == ["10", "20", "30", "40", "50", "60"]
+
+
+def test_flat_logprobs_extend() -> None:
+    logprobs = FlatLogprobs()
+    # Extend with list[LogprobsOnePosition]
+    logprobs.extend([LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0])
+    assert logprobs.start_indices == [0, 3]
+    assert logprobs.end_indices == [3, 4]
+    assert logprobs.token_ids == [4, 5, 6, 1]
+    assert logprobs.logprobs == [0.4, 0.5, 0.6, 0.1]
+    assert logprobs.ranks == [40, 50, 60, 10]
+    assert logprobs.decoded_tokens == ["40", "50", "60", "10"]
+
+    other_logprobs = FlatLogprobs()
+    other_logprobs.extend([LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_0])
+    # Extend with another FlatLogprobs
+    logprobs.extend(other_logprobs)
+    assert logprobs.start_indices == [0, 3, 4, 6]
+    assert logprobs.end_indices == [3, 4, 6, 7]
+    assert logprobs.token_ids == [4, 5, 6, 1, 2, 3, 1]
+    assert logprobs.logprobs == [0.4, 0.5, 0.6, 0.1, 0.2, 0.3, 0.1]
+    assert logprobs.ranks == [40, 50, 60, 10, 20, 30, 10]
+    assert logprobs.decoded_tokens == ["40", "50", "60", "10", "20", "30", "10"]
+
+
+def test_flat_logprobs_access() -> None:
+    logprobs = FlatLogprobs()
+    logprobs.extend(
+        [LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0]
+    )
+    assert logprobs.start_indices == [0, 2, 5]
+    assert logprobs.end_indices == [2, 5, 6]
+    assert logprobs.token_ids == [2, 3, 4, 5, 6, 1]
+    assert logprobs.logprobs == [0.2, 0.3, 0.4, 0.5, 0.6, 0.1]
+    assert logprobs.ranks == [20, 30, 40, 50, 60, 10]
+    assert logprobs.decoded_tokens == ["20", "30", "40", "50", "60", "10"]
+
+    # Test __len__
+    assert len(logprobs) == 3
+
+    # Test __iter__
+    for actual_logprobs, expected_logprobs in zip(
+        logprobs,
+        [LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0],
+    ):
+        assert actual_logprobs == expected_logprobs
+
+    # Test __getitem__ : single item
+    assert logprobs[0] == LOGPROBS_ONE_POSITION_1
+    assert logprobs[1] == LOGPROBS_ONE_POSITION_2
+    assert logprobs[2] == LOGPROBS_ONE_POSITION_0
+
+    # Test __getitem__ : slice
+    logprobs02 = logprobs[:2]
+    assert len(logprobs02) == 2
+    assert logprobs02[0] == LOGPROBS_ONE_POSITION_1
+    assert logprobs02[1] == LOGPROBS_ONE_POSITION_2
+    assert logprobs02.start_indices == [0, 2]
+    assert logprobs02.end_indices == [2, 5]
+    assert logprobs02.token_ids == [2, 3, 4, 5, 6]
+    assert logprobs02.logprobs == [0.2, 0.3, 0.4, 0.5, 0.6]
+    assert logprobs02.ranks == [20, 30, 40, 50, 60]
+    assert logprobs02.decoded_tokens == ["20", "30", "40", "50", "60"]
+    logprobs_last2 = logprobs[-2:]
+    assert len(logprobs_last2) == 2
+    assert logprobs_last2[0] == LOGPROBS_ONE_POSITION_2
+    assert logprobs_last2[1] == LOGPROBS_ONE_POSITION_0
+    assert logprobs_last2.start_indices == [0, 3]
+    assert logprobs_last2.end_indices == [3, 4]
+    assert logprobs_last2.token_ids == [4, 5, 6, 1]
+    assert logprobs_last2.logprobs == [0.4, 0.5, 0.6, 0.1]
+    assert logprobs_last2.ranks == [40, 50, 60, 10]
+    assert logprobs_last2.decoded_tokens == ["40", "50", "60", "10"]
diff --git a/tests/test_outputs.py b/tests/test_outputs.py
index 4bb1c20f77f1d75e5fc7d7caf39c088bf0b3e6df..7b234884c569e9c3a1a481b8373b909a1e8df260 100644
--- a/tests/test_outputs.py
+++ b/tests/test_outputs.py
@@ -1,15 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 from vllm.outputs import RequestOutput
 
+pytestmark = pytest.mark.cpu_test
+
 
 def test_request_output_forward_compatible():
-    output = RequestOutput(request_id="test_request_id",
-                           prompt="test prompt",
-                           prompt_token_ids=[1, 2, 3],
-                           prompt_logprobs=None,
-                           outputs=[],
-                           finished=False,
-                           example_arg_added_in_new_version="some_value")
+    output = RequestOutput(
+        request_id="test_request_id",
+        prompt="test prompt",
+        prompt_token_ids=[1, 2, 3],
+        prompt_logprobs=None,
+        outputs=[],
+        finished=False,
+        example_arg_added_in_new_version="some_value",
+    )
     assert output is not None
diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py
index 52c03015483c92826f0741c798f19ad86fe6dca1..7812562c8948cae88c9fd3ffe365b015696f038d 100644
--- a/tests/test_pooling_params.py
+++ b/tests/test_pooling_params.py
@@ -1,18 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
 import pytest
 
 from tests.models.utils import EmbedModelInfo
 from vllm import PoolingParams
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, PoolerConfig
 
 EMBEDDING_MODELS = [
     EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
-    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
-                   is_matryoshka=True,
-                   matryoshka_dimensions=[256]),
+    EmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m-v1.5",
+        is_matryoshka=True,
+        matryoshka_dimensions=[256],
+    ),
 ]
 
+classify_parameters = ["use_activation"]
+embed_parameters = ["dimensions", "normalize"]
+step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
+
+
+@dataclass()
+class MockModelConfig:
+    pooler_config: PoolerConfig
+
 
 def test_task():
     pooling_params = PoolingParams()
@@ -22,25 +35,27 @@ def test_task():
     pooling_params.verify(task="score")
 
     with pytest.raises(ValueError):
-        pooling_params.verify(task="encode")
+        pooling_params.verify(task="classify")
 
 
 def test_embed():
     task = "embed"
+    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+
     pooling_params = PoolingParams(normalize=None)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
     pooling_params = PoolingParams(normalize=True)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
     pooling_params = PoolingParams(normalize=False)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    invalid_parameters = ["activation", "softmax"]
+    invalid_parameters = classify_parameters + step_pooling_parameters
     for p in invalid_parameters:
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
 
 
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@@ -65,42 +80,77 @@ def test_embed_dimensions(model_info: EmbedModelInfo):
 
     if model_info.is_matryoshka:
         assert model_info.matryoshka_dimensions is not None
-        pooling_params = PoolingParams(
-            dimensions=model_info.matryoshka_dimensions[0])
+        pooling_params = PoolingParams(dimensions=model_info.matryoshka_dimensions[0])
         pooling_params.verify(task=task, model_config=model_config)
 
 
 @pytest.mark.parametrize("task", ["score", "classify"])
 def test_classify(task):
-    pooling_params = PoolingParams(activation=None)
-    pooling_params.verify(task=task)
+    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+
+    pooling_params = PoolingParams(use_activation=None)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(activation=True)
-    pooling_params.verify(task=task)
+    pooling_params = PoolingParams(use_activation=True)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(activation=False)
-    pooling_params.verify(task=task)
+    pooling_params = PoolingParams(use_activation=False)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    invalid_parameters = ["dimensions", "normalize", "softmax"]
+    invalid_parameters = embed_parameters + step_pooling_parameters
     for p in invalid_parameters:
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
 
 
-def test_encode():
-    task = "encode"
-    pooling_params = PoolingParams(softmax=None)
-    pooling_params.verify(task=task)
+@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"])
+def test_token_embed(pooling_type: str):
+    task = "token_embed"
+    model_config = MockModelConfig(
+        pooler_config=PoolerConfig(pooling_type=pooling_type)
+    )
 
-    pooling_params = PoolingParams(softmax=True)
-    pooling_params.verify(task=task)
+    pooling_params = PoolingParams(normalize=None)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(normalize=True)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(normalize=False)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    invalid_parameters = classify_parameters
+    if pooling_type != "STEP":
+        invalid_parameters = classify_parameters + step_pooling_parameters
+
+    for p in invalid_parameters:
+        with pytest.raises(ValueError):
+            pooling_params = PoolingParams(**{p: True})
+            pooling_params.verify(task=task, model_config=model_config)
+
+
+@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"])
+def test_token_classify(pooling_type: str):
+    task = "token_classify"
+    model_config = MockModelConfig(
+        pooler_config=PoolerConfig(pooling_type=pooling_type)
+    )
+
+    pooling_params = PoolingParams(use_activation=None)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(use_activation=True)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(use_activation=False)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(softmax=False)
-    pooling_params.verify(task=task)
+    invalid_parameters = embed_parameters
+    if pooling_type != "STEP":
+        invalid_parameters = embed_parameters + step_pooling_parameters
 
-    invalid_parameters = ["dimensions", "normalize", "activation"]
     for p in invalid_parameters:
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
diff --git a/tests/test_regression.py b/tests/test_regression.py
index f5f1ed8e805e00aaaee1e236638caf0dcaf07743..8a9829e4dba5fe6e3d534b02c2c7a05ee1cee86c 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -6,6 +6,7 @@ It should include tests that are reported by users and making sure they
 will never happen again.
 
 """
+
 import gc
 
 import pytest
@@ -18,12 +19,12 @@ from vllm import LLM, SamplingParams
 def test_duplicated_ignored_sequence_group():
     """https://github.com/vllm-project/vllm/issues/1655"""
 
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=256)
-    llm = LLM(model="distilbert/distilgpt2",
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1)
+    sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=256)
+    llm = LLM(
+        model="distilbert/distilgpt2",
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+    )
     prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
     outputs = llm.generate(prompts, sampling_params=sampling_params)
 
@@ -31,12 +32,12 @@ def test_duplicated_ignored_sequence_group():
 
 
 def test_max_tokens_none():
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=None)
-    llm = LLM(model="distilbert/distilgpt2",
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1)
+    sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
+    llm = LLM(
+        model="distilbert/distilgpt2",
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+    )
     prompts = ["Just say hello!"]
     outputs = llm.generate(prompts, sampling_params=sampling_params)
 
diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py
index 77501f4bddc22676100e598cd01f3e4bbbf6bceb..5a162fa8f791b943b7a8bfdfa1a1a7f5036501b5 100644
--- a/tests/test_routing_simulator.py
+++ b/tests/test_routing_simulator.py
@@ -13,7 +13,9 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.fused_moe.routing_simulator import (
-    DistributionBasedRouting, RoutingSimulator)
+    DistributionBasedRouting,
+    RoutingSimulator,
+)
 
 
 @pytest.fixture
@@ -60,10 +62,10 @@ def test_basic_functionality(
         ), f"Wrong ids shape for {strategy}"
 
         # Check that expert IDs are valid
-        assert (topk_ids.min()
-                >= 0), f"Invalid expert ID (negative) for {strategy}"
-        assert (topk_ids.max()
-                < num_experts), f"Invalid expert ID (too large) for {strategy}"
+        assert topk_ids.min() >= 0, f"Invalid expert ID (negative) for {strategy}"
+        assert topk_ids.max() < num_experts, (
+            f"Invalid expert ID (too large) for {strategy}"
+        )
 
 
 def test_routing_strategy_integration(monkeypatch, device):
@@ -102,19 +104,20 @@ def test_routing_strategy_integration(monkeypatch, device):
             top_k=top_k,
             use_grouped_topk=False,
             renormalize=True,
-            indices_type=torch.long)
+            indices_type=torch.long,
+        )
 
         # Verify output shapes
-        assert topk_weights.shape == (
-            num_tokens, top_k), f"Wrong weights shape for {strategy}"
-        assert topk_ids.shape == (num_tokens,
-                                  top_k), f"Wrong ids shape for {strategy}"
+        assert topk_weights.shape == (num_tokens, top_k), (
+            f"Wrong weights shape for {strategy}"
+        )
+        assert topk_ids.shape == (num_tokens, top_k), f"Wrong ids shape for {strategy}"
 
         # Verify expert IDs are valid
-        assert topk_ids.min(
-        ) >= 0, f"Invalid expert ID (negative) for {strategy}"
-        assert topk_ids.max(
-        ) < num_experts, f"Invalid expert ID (too large) for {strategy}"
+        assert topk_ids.min() >= 0, f"Invalid expert ID (negative) for {strategy}"
+        assert topk_ids.max() < num_experts, (
+            f"Invalid expert ID (too large) for {strategy}"
+        )
 
 
 def test_distribution_based_routing_with_custom_strategy():
@@ -123,9 +126,7 @@ def test_distribution_based_routing_with_custom_strategy():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     # Register custom distribution-based strategy
-    custom_strategy = DistributionBasedRouting(distribution="normal",
-                                               mean=2.0,
-                                               std=0.5)
+    custom_strategy = DistributionBasedRouting(distribution="normal", mean=2.0, std=0.5)
     RoutingSimulator.register_strategy("custom_normal", custom_strategy)
 
     # Test data
@@ -142,7 +143,8 @@ def test_distribution_based_routing_with_custom_strategy():
         hidden_states=hidden_states,
         router_logits=router_logits,
         strategy_name="custom_normal",
-        top_k=top_k)
+        top_k=top_k,
+    )
 
     # Check output shapes
     assert topk_weights.shape == (num_tokens, top_k)
@@ -165,7 +167,8 @@ def test_instance_compatibility():
         hidden_states=hidden_states,
         router_logits=router_logits,
         strategy_name="uniform_random",
-        top_k=2)
+        top_k=2,
+    )
 
     assert topk_weights.shape == (10, 2)
     assert topk_ids.shape == (10, 2)
diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py
index ef4aef3afc2e20b01e77e3134feb08c83da809ae..5361efbbdf6fbcdeed9a521dad61f8dbd3e5bb00 100644
--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -7,21 +7,24 @@ import torch
 from vllm.scalar_type import scalar_types
 
 
-@pytest.mark.parametrize("type_tuple", (
-    (-8, 7, scalar_types.int4),
-    (0, 15, scalar_types.uint4),
-    (-8, 7, scalar_types.uint4b8),
-    (-128, 127, scalar_types.uint8b128),
-    (-6., 6., scalar_types.float4_e2m1f),
-    (-28., 28., scalar_types.float6_e3m2f),
-    (torch.int8, scalar_types.int8),
-    (torch.uint8, scalar_types.uint8),
-    (torch.float8_e5m2, scalar_types.float8_e5m2),
-    (torch.float8_e4m3fn, scalar_types.float8_e4m3fn),
-    (torch.bfloat16, scalar_types.float16_e8m7),
-    (torch.float16, scalar_types.float16_e5m10),
-),
-                         ids=lambda x: str(x))
+@pytest.mark.parametrize(
+    "type_tuple",
+    (
+        (-8, 7, scalar_types.int4),
+        (0, 15, scalar_types.uint4),
+        (-8, 7, scalar_types.uint4b8),
+        (-128, 127, scalar_types.uint8b128),
+        (-6.0, 6.0, scalar_types.float4_e2m1f),
+        (-28.0, 28.0, scalar_types.float6_e3m2f),
+        (torch.int8, scalar_types.int8),
+        (torch.uint8, scalar_types.uint8),
+        (torch.float8_e5m2, scalar_types.float8_e5m2),
+        (torch.float8_e4m3fn, scalar_types.float8_e4m3fn),
+        (torch.bfloat16, scalar_types.float16_e8m7),
+        (torch.float16, scalar_types.float16_e5m10),
+    ),
+    ids=lambda x: str(x),
+)
 def test_scalar_type_min_max(type_tuple):
     print(type_tuple)
     if len(type_tuple) == 3:
diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py
index e9138b9e8eb6138f7e8787307b99873c7866c02b..adc8a1a4bf08e582686b386b667f1c8bac2eb5e6 100644
--- a/tests/test_seed_behavior.py
+++ b/tests/test_seed_behavior.py
@@ -1,25 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import random
-
-import numpy as np
-import torch
-
-from vllm.platforms.interface import Platform
-
-
-def test_seed_behavior():
-    # Test with a specific seed
-    Platform.seed_everything(42)
-    random_value_1 = random.randint(0, 100)
-    np_random_value_1 = np.random.randint(0, 100)
-    torch_random_value_1 = torch.randint(0, 100, (1, )).item()
-
-    Platform.seed_everything(42)
-    random_value_2 = random.randint(0, 100)
-    np_random_value_2 = np.random.randint(0, 100)
-    torch_random_value_2 = torch.randint(0, 100, (1, )).item()
-
-    assert random_value_1 == random_value_2
-    assert np_random_value_1 == np_random_value_2
-    assert torch_random_value_1 == torch_random_value_2
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import numpy as np
+import torch
+
+from vllm.platforms.interface import Platform
+
+
+def test_seed_behavior():
+    # Test with a specific seed
+    Platform.seed_everything(42)
+    random_value_1 = random.randint(0, 100)
+    np_random_value_1 = np.random.randint(0, 100)
+    torch_random_value_1 = torch.randint(0, 100, (1,)).item()
+
+    Platform.seed_everything(42)
+    random_value_2 = random.randint(0, 100)
+    np_random_value_2 = np.random.randint(0, 100)
+    torch_random_value_2 = torch.randint(0, 100, (1,)).item()
+
+    assert random_value_1 == random_value_2
+    assert np_random_value_1 == np_random_value_2
+    assert torch_random_value_1 == torch_random_value_2
diff --git a/tests/test_sequence.py b/tests/test_sequence.py
index da9826ff05058c39fdcec2a80ead96c7b0c23070..27af05bec22dcf877d7b768c022a3edf06210474 100644
--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
@@ -7,7 +7,6 @@ from vllm.sequence import IntermediateTensors
 
 
 def test_sequence_intermediate_tensors_equal():
-
     class AnotherIntermediateTensors(IntermediateTensors):
         pass
 
@@ -20,22 +19,31 @@ def test_sequence_intermediate_tensors_equal():
     assert empty_intermediate_tensors_1 == empty_intermediate_tensors_2
 
     different_key_intermediate_tensors_1 = IntermediateTensors(
-        {"1": torch.zeros([2, 4], dtype=torch.int32)})
+        {"1": torch.zeros([2, 4], dtype=torch.int32)}
+    )
     difference_key_intermediate_tensors_2 = IntermediateTensors(
-        {"2": torch.zeros([2, 4], dtype=torch.int32)})
-    assert (different_key_intermediate_tensors_1
-            != difference_key_intermediate_tensors_2)
+        {"2": torch.zeros([2, 4], dtype=torch.int32)}
+    )
+    assert different_key_intermediate_tensors_1 != difference_key_intermediate_tensors_2
 
     same_key_different_value_intermediate_tensors_1 = IntermediateTensors(
-        {"1": torch.zeros([2, 4], dtype=torch.int32)})
+        {"1": torch.zeros([2, 4], dtype=torch.int32)}
+    )
     same_key_different_value_intermediate_tensors_2 = IntermediateTensors(
-        {"1": torch.zeros([2, 5], dtype=torch.int32)})
-    assert (same_key_different_value_intermediate_tensors_1
-            != same_key_different_value_intermediate_tensors_2)
+        {"1": torch.zeros([2, 5], dtype=torch.int32)}
+    )
+    assert (
+        same_key_different_value_intermediate_tensors_1
+        != same_key_different_value_intermediate_tensors_2
+    )
 
     same_key_same_value_intermediate_tensors_1 = IntermediateTensors(
-        {"1": torch.zeros([2, 4], dtype=torch.int32)})
+        {"1": torch.zeros([2, 4], dtype=torch.int32)}
+    )
     same_key_same_value_intermediate_tensors_2 = IntermediateTensors(
-        {"1": torch.zeros([2, 4], dtype=torch.int32)})
-    assert (same_key_same_value_intermediate_tensors_1 ==
-            same_key_same_value_intermediate_tensors_2)
+        {"1": torch.zeros([2, 4], dtype=torch.int32)}
+    )
+    assert (
+        same_key_same_value_intermediate_tensors_1
+        == same_key_same_value_intermediate_tensors_2
+    )
diff --git a/tests/test_triton_utils.py b/tests/test_triton_utils.py
index ebb69e627e951f2e3d35eeddee9a9cc0faa86b07..7fe0a5d9c5176e8ad002e5ed41a6526863e38223 100644
--- a/tests/test_triton_utils.py
+++ b/tests/test_triton_utils.py
@@ -5,8 +5,7 @@ import sys
 import types
 from unittest import mock
 
-from vllm.triton_utils.importing import (TritonLanguagePlaceholder,
-                                         TritonPlaceholder)
+from vllm.triton_utils.importing import TritonLanguagePlaceholder, TritonPlaceholder
 
 
 def test_triton_placeholder_is_module():
@@ -52,8 +51,7 @@ def test_triton_placeholder_decorators_with_args():
     def bar(x):
         return x
 
-    @triton.heuristics(
-        {"BLOCK_SIZE": lambda args: 128 if args["x"] > 1024 else 64})
+    @triton.heuristics({"BLOCK_SIZE": lambda args: 128 if args["x"] > 1024 else 64})
     def baz(x):
         return x
 
@@ -89,6 +87,7 @@ def test_no_triton_fallback():
     # mock triton not being installed
     with mock.patch.dict(sys.modules, {"triton": None}):
         from vllm.triton_utils import HAS_TRITON, tl, triton
+
         assert HAS_TRITON is False
         assert triton.__class__.__name__ == "TritonPlaceholder"
         assert triton.language.__class__.__name__ == "TritonLanguagePlaceholder"
diff --git a/tests/test_version.py b/tests/test_version.py
index fd07abb59b1f86ff47ce734813588b055c2b2502..928f742f1de8f35e6eea6a50d922c89c739d508c 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -31,7 +31,8 @@ def test_version_tuple():
         ((1, 0, 0), "1.-1", True),
         ((1, 0, 0), "0.9", False),
         ((1, 0, 0), "0.17", False),
-    ])
+    ],
+)
 def test_prev_minor_version_was(version_tuple, version_str, expected):
     with patch("vllm.version.__version_tuple__", version_tuple):
         assert version._prev_minor_version_was(version_str) == expected
diff --git a/tests/test_vllm_port.py b/tests/test_vllm_port.py
index 88e1efd8fdbb69557115a5bfc2dd8dfbebfb42e6..68bd511635dc1b337c3599f23b1d601a3deefafa 100644
--- a/tests/test_vllm_port.py
+++ b/tests/test_vllm_port.py
@@ -23,14 +23,17 @@ def test_get_vllm_port_valid():
 
 def test_get_vllm_port_invalid():
     """Test when VLLM_PORT is set to a non-integer value."""
-    with (patch.dict(os.environ, {"VLLM_PORT": "abc"}, clear=True),
-          pytest.raises(ValueError, match="must be a valid integer")):
+    with (
+        patch.dict(os.environ, {"VLLM_PORT": "abc"}, clear=True),
+        pytest.raises(ValueError, match="must be a valid integer"),
+    ):
         get_vllm_port()
 
 
 def test_get_vllm_port_uri():
     """Test when VLLM_PORT is set to a URI."""
-    with (patch.dict(os.environ, {"VLLM_PORT": "tcp://localhost:5678"},
-                     clear=True),
-          pytest.raises(ValueError, match="appears to be a URI")):
+    with (
+        patch.dict(os.environ, {"VLLM_PORT": "tcp://localhost:5678"}, clear=True),
+        pytest.raises(ValueError, match="appears to be a URI"),
+    ):
         get_vllm_port()
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py
index 07217611ea4d2deff3322ce18d95948773586766..074039f9e5134e097ba189d1bb60a75b0a0a33a1 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenization/test_cached_tokenizer.py
@@ -6,17 +6,16 @@ from copy import deepcopy
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
-                                               get_cached_tokenizer)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_cached_tokenizer
 
 
 @pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])
 def test_cached_tokenizer(model_id: str):
-    reference_tokenizer = AutoTokenizer.from_pretrained(model_id,
-                                                        trust_remote_code=True)
+    reference_tokenizer = AutoTokenizer.from_pretrained(
+        model_id, trust_remote_code=True
+    )
     reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
-    reference_tokenizer.add_special_tokens(
-        {"additional_special_tokens": ["<SEP>"]})
+    reference_tokenizer.add_special_tokens({"additional_special_tokens": ["<SEP>"]})
 
     cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer))
     _check_consistency(cached_tokenizer, reference_tokenizer)
@@ -32,13 +31,13 @@ def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer):
     # Cached attributes
     assert target.all_special_ids == expected.all_special_ids
     assert target.all_special_tokens == expected.all_special_tokens
-    assert (target.all_special_tokens_extended ==
-            expected.all_special_tokens_extended)
+    assert target.all_special_tokens_extended == expected.all_special_tokens_extended
     assert target.get_vocab() == expected.get_vocab()
     assert len(target) == len(expected)
 
     # Other attributes
-    assert getattr(target, "padding_side",
-                   None) == getattr(expected, "padding_side", None)
+    assert getattr(target, "padding_side", None) == getattr(
+        expected, "padding_side", None
+    )
 
     assert target.encode("prompt") == expected.encode("prompt")
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index fe6c313d2966f8304d828e08bb9a8f09bae72e9a..f4b43a21daaa862c256cbd6cb145d968205d400a 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -2,18 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Generator
-from typing import Any, Optional
+from typing import Any
 
 import pytest
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
-                          PreTrainedTokenizerFast)
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer,
-                                        IncrementalDetokenizer,
-                                        SlowIncrementalDetokenizer)
+from vllm.v1.engine.detokenizer import (
+    FastIncrementalDetokenizer,
+    IncrementalDetokenizer,
+    SlowIncrementalDetokenizer,
+)
 
 SPECIAL_TOKS_TRUTH = [
     "Some text with adjacent special tokens                <|padding|><|padding|><fim_prefix><fim_middle><fim_suffix>other text<fim_pad>",  # noqa
@@ -45,33 +46,35 @@ TOKENIZERS = [
 ]
 
 
-def _run_incremental_decode(tokenizer,
-                            all_input_ids,
-                            skip_special_tokens: bool,
-                            starting_index: int,
-                            spaces_between_special_tokens: bool = True,
-                            fast: Optional[bool] = None):
-
+def _run_incremental_decode(
+    tokenizer,
+    all_input_ids,
+    skip_special_tokens: bool,
+    starting_index: int,
+    spaces_between_special_tokens: bool = True,
+    fast: bool | None = None,
+):
     prompt_token_ids = all_input_ids[:starting_index]
 
     params = SamplingParams(
         skip_special_tokens=skip_special_tokens,
         spaces_between_special_tokens=spaces_between_special_tokens,
     )
-    request = EngineCoreRequest(request_id="",
-                                prompt_token_ids=prompt_token_ids,
-                                mm_features=None,
-                                sampling_params=params,
-                                pooling_params=None,
-                                eos_token_id=None,
-                                arrival_time=0.0,
-                                lora_request=None,
-                                cache_salt=None,
-                                data_parallel_rank=None)
+    request = EngineCoreRequest(
+        request_id="",
+        prompt_token_ids=prompt_token_ids,
+        mm_features=None,
+        sampling_params=params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
 
     if fast is None:
-        detokenizer = IncrementalDetokenizer.from_new_request(
-            tokenizer, request)
+        detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
     elif fast:
         detokenizer = FastIncrementalDetokenizer(tokenizer, request)
     else:
@@ -88,9 +91,11 @@ def _run_incremental_decode(tokenizer,
 
 @pytest.fixture
 def tokenizer(tokenizer_name):
-    return (MistralTokenizer.from_pretrained(tokenizer_name)
-            if "mistral" in tokenizer_name else
-            AutoTokenizer.from_pretrained(tokenizer_name))
+    return (
+        MistralTokenizer.from_pretrained(tokenizer_name)
+        if "mistral" in tokenizer_name
+        else AutoTokenizer.from_pretrained(tokenizer_name)
+    )
 
 
 @pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
@@ -102,7 +107,8 @@ def tokenizer(tokenizer_name):
         "ပုံပြင်လေးပြောပြပါ",
         # Using "URGENCY" since "CY" has token id 130282
         "URGENCY🌶️",
-    ])
+    ],
+)
 def test_mistral_edge_case(tokenizer, truth):
     """Test for a specific edge cases with V3-Tekken MistralTokenizer.
 
@@ -115,7 +121,8 @@ def test_mistral_edge_case(tokenizer, truth):
         tokenizer,
         all_input_ids,
         skip_special_tokens=True,
-        starting_index=starting_index)
+        starting_index=starting_index,
+    )
     assert decoded_text == truth
     assert out_ids == all_input_ids[starting_index:]
 
@@ -124,8 +131,10 @@ def test_mistral_edge_case(tokenizer, truth):
 def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
     if "mistral" in tokenizer_name:
         yield (
-            True if request.param else
-            pytest.skip("mistral doesn't support skip_special_tokens=False"))
+            True
+            if request.param
+            else pytest.skip("mistral doesn't support skip_special_tokens=False")
+        )
     else:
         yield bool(request.param)
 
@@ -136,8 +145,14 @@ def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
 @pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
 @pytest.mark.parametrize("spaces_between_special_tokens", (True, False))
 @pytest.mark.parametrize("fast", (True, False))
-def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens,
-                          spaces_between_special_tokens, fast):
+def test_decode_streaming(
+    tokenizer,
+    truth,
+    with_prompt,
+    skip_special_tokens,
+    spaces_between_special_tokens,
+    fast,
+):
     if fast and not isinstance(tokenizer, PreTrainedTokenizerFast):
         pytest.skip()
 
@@ -146,30 +161,35 @@ def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens,
 
     if not fast and isinstance(tokenizer, PreTrainedTokenizerFast):
         # Fix up inconsistency in fast/slow tokenizer behaviour.
-        tokenizer.add_special_tokens({
-            "additional_special_tokens": [
-                at for at in
-                tokenizer._tokenizer.get_added_tokens_decoder().values()
-                if at.special
-            ]
-        })
-
-    extra_decode_args = {} if not isinstance(tokenizer,  PreTrainedTokenizer) \
+        tokenizer.add_special_tokens(
+            {
+                "additional_special_tokens": [
+                    at
+                    for at in tokenizer._tokenizer.get_added_tokens_decoder().values()
+                    if at.special
+                ]
+            }
+        )
+
+    extra_decode_args = (
+        {}
+        if not isinstance(tokenizer, PreTrainedTokenizer)
         else {"spaces_between_special_tokens": spaces_between_special_tokens}
+    )
 
     truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids
     if tokenizer.bos_token_id is not None:
         truth_tokens.insert(0, tokenizer.bos_token_id)
     truth_tokens.append(tokenizer.eos_token_id)
 
-    new_truth = tokenizer.decode(truth_tokens,
-                                 skip_special_tokens=skip_special_tokens,
-                                 **extra_decode_args)
+    new_truth = tokenizer.decode(
+        truth_tokens, skip_special_tokens=skip_special_tokens, **extra_decode_args
+    )
 
     if with_prompt:
         num_prompt_tokens = len(
-            tokenizer(truth[:len(truth) // 2],
-                      add_special_tokens=False).input_ids)
+            tokenizer(truth[: len(truth) // 2], add_special_tokens=False).input_ids
+        )
         if tokenizer.bos_token_id is not None:
             num_prompt_tokens += 1
 
@@ -177,11 +197,13 @@ def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens,
         generated_input_ids = truth_tokens[num_prompt_tokens:]
         all_input_ids = prompt_input_ids + generated_input_ids
         starting_index = len(prompt_input_ids)
-        prompt = tokenizer.decode(prompt_input_ids,
-                                  skip_special_tokens=skip_special_tokens,
-                                  **extra_decode_args)
+        prompt = tokenizer.decode(
+            prompt_input_ids,
+            skip_special_tokens=skip_special_tokens,
+            **extra_decode_args,
+        )
 
-        generated = new_truth[len(prompt):]
+        generated = new_truth[len(prompt) :]
     else:
         generated = new_truth
         starting_index = 0
@@ -193,7 +215,8 @@ def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens,
         skip_special_tokens=skip_special_tokens,
         starting_index=starting_index,
         spaces_between_special_tokens=spaces_between_special_tokens,
-        fast=fast)
+        fast=fast,
+    )
 
     assert decoded_text == generated
     assert out_ids == all_input_ids[starting_index:]
@@ -206,11 +229,13 @@ def test_oov_decode(tokenizer, fast):
         pytest.skip()
 
     decoded_text, out_ids = _run_incremental_decode(
-        tokenizer, [len(tokenizer)],
+        tokenizer,
+        [len(tokenizer)],
         skip_special_tokens=True,
         starting_index=0,
         spaces_between_special_tokens=True,
-        fast=fast)
+        fast=fast,
+    )
 
-    assert decoded_text == ''
+    assert decoded_text == ""
     assert out_ids == [len(tokenizer)]
diff --git a/tests/tokenization/test_do_lower_case.py b/tests/tokenization/test_do_lower_case.py
index 7aa655e1c3b456de8578e9106a6cd6a825313951..8aff50b351e313e35ff6247e4d10d6282cd73289 100644
--- a/tests/tokenization/test_do_lower_case.py
+++ b/tests/tokenization/test_do_lower_case.py
@@ -13,6 +13,6 @@ TOKENIZER_NAMES = ["BAAI/bge-base-en"]
 def test_special_tokens(tokenizer_name: str, n_tokens: int):
     tokenizer = get_tokenizer(tokenizer_name, revision="main")
 
-    prompts = '[UNK]' * n_tokens
+    prompts = "[UNK]" * n_tokens
     prompt_token_ids = tokenizer.encode(prompts)
     assert len(prompt_token_ids) == n_tokens + 2
diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
index d8288429351c4bb79c38ec62b4f696afd351ef4a..921d77b1b335ef4109d070aa5e9e7ff855d03d36 100644
--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -5,6 +5,7 @@ This test file includes some cases where it is inappropriate to
 only get the `eos_token_id` from the tokenizer as defined by
 {meth}`vllm.LLMEngine._get_eos_token_id`.
 """
+
 from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -15,8 +16,7 @@ def test_get_llama3_eos_token():
     tokenizer = get_tokenizer(model_name)
     assert tokenizer.eos_token_id == 128009
 
-    generation_config = try_get_generation_config(model_name,
-                                                  trust_remote_code=False)
+    generation_config = try_get_generation_config(model_name, trust_remote_code=False)
     assert generation_config is not None
     assert generation_config.eos_token_id == [128001, 128008, 128009]
 
@@ -27,7 +27,6 @@ def test_get_blip2_eos_token():
     tokenizer = get_tokenizer(model_name)
     assert tokenizer.eos_token_id == 2
 
-    generation_config = try_get_generation_config(model_name,
-                                                  trust_remote_code=False)
+    generation_config = try_get_generation_config(model_name, trust_remote_code=False)
     assert generation_config is not None
     assert generation_config.eos_token_id == 50118
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
index 69b3c6294284bebe2f972e4663cf7f7f4cd26e93..1ada8ee187c38271e4e1ce1b24c69d1fdaede7d9 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -1,188 +1,2222 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 import pytest
-from mistral_common.protocol.instruct.messages import (AssistantMessage,
-                                                       ToolMessage,
-                                                       UserMessage)
-from mistral_common.protocol.instruct.request import ChatCompletionRequest
-from mistral_common.protocol.instruct.tool_calls import (Function,
-                                                         FunctionCall, Tool,
-                                                         ToolCall)
+from mistral_common.exceptions import InvalidMessageStructureException
+from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
 
 from vllm.transformers_utils.tokenizers.mistral import (
-    make_mistral_chat_completion_request)
+    MistralTokenizer,
+    _prepare_apply_chat_template_tools_and_messages,
+)
 
 
 @pytest.mark.parametrize(
-    "openai_request,expected_mistral_request",
-    [(
-        {
-            "messages": [{
-                "role": "user",
-                "content": "What is the current local date and time?",
-            }],
-            "tools": [{
-                "type": "function",
-                "function": {
-                    "description": "Fetch the current local date and time.",
-                    "name": "get_current_time",
-                },
-            }],
-        },
-        ChatCompletionRequest(
-            messages=[
-                UserMessage(content="What is the current local date and time?")
-            ],
-            tools=[
-                Tool(
-                    type="function",
-                    function=Function(
-                        name="get_current_time",
-                        description="Fetch the current local date and time.",
-                        parameters={},
-                    ),
-                )
-            ],
+    "openai_request,expected_mistral_output",
+    [
+        (
+            {
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": "What is the current local date and time?",
+                    }
+                ],
+                "tools": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "description": "Fetch the current local date and time.",
+                            "name": "get_current_time",
+                        },
+                    }
+                ],
+            },
+            (
+                [
+                    {
+                        "role": "user",
+                        "content": "What is the current local date and time?",
+                    }
+                ],
+                [
+                    {
+                        "type": "function",
+                        "function": {
+                            "description": "Fetch the current local date and time.",
+                            "name": "get_current_time",
+                            "parameters": {},
+                        },
+                    }
+                ],
+            ),
         ),
-    ),
-     (
-         {
-             "messages":
-             [{
-                 "role": "user",
-                 "content": "What is the current local date and time?",
-             }],
-             "tools": [{
-                 "type": "function",
-                 "function": {
-                     "description": "Fetch the current local date and time.",
-                     "name": "get_current_time",
-                     "parameters": None,
-                 },
-             }],
-         },
-         ChatCompletionRequest(
-             messages=[
-                 UserMessage(
-                     content="What is the current local date and time?")
-             ],
-             tools=[
-                 Tool(
-                     type="function",
-                     function=Function(
-                         name="get_current_time",
-                         description="Fetch the current local date and time.",
-                         parameters={},
-                     ),
-                 )
-             ],
-         ),
-     )],
+        (
+            {
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": "What is the current local date and time?",
+                    }
+                ],
+                "tools": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "description": "Fetch the current local date and time.",
+                            "name": "get_current_time",
+                            "parameters": {},
+                        },
+                    }
+                ],
+            },
+            (
+                [
+                    {
+                        "role": "user",
+                        "content": "What is the current local date and time?",
+                    }
+                ],
+                [
+                    {
+                        "type": "function",
+                        "function": {
+                            "description": "Fetch the current local date and time.",
+                            "name": "get_current_time",
+                            "parameters": {},
+                        },
+                    }
+                ],
+            ),
+        ),
+    ],
 )
-def test_make_mistral_chat_completion_request(openai_request,
-                                              expected_mistral_request):
-    actual_request = make_mistral_chat_completion_request(
-        openai_request["messages"], openai_request["tools"])
-    assert actual_request == expected_mistral_request
+def test_prepare_apply_chat_template_tools_and_messages(
+    openai_request, expected_mistral_output
+):
+    actual_request = _prepare_apply_chat_template_tools_and_messages(
+        openai_request["messages"], openai_request["tools"]
+    )
+    assert actual_request == expected_mistral_output
 
 
-# Tool use with list content and reasoning_content
-@pytest.mark.parametrize("openai_request,expected_mistral_request", [(
-    {
-        "messages": [
-            {
-                "role": "user",
-                "content": "What's the weather in Paris?",
-            },
+# Tool use with list content and reasoning
+@pytest.mark.parametrize(
+    "openai_request,expected_mistral_output",
+    [
+        (
             {
-                "role":
-                "assistant",
-                "reasoning_content":
-                None,
-                "content":
-                None,
-                "tool_calls": [{
-                    "id": "call123",
-                    "type": "function",
-                    "function": {
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": "What's the weather in Paris?",
+                    },
+                    {
+                        "role": "assistant",
+                        "reasoning": None,
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call123",
+                                "type": "function",
+                                "function": {
+                                    "name": "get_weather",
+                                    "arguments": '{"city": "Paris"}',
+                                },
+                            }
+                        ],
+                    },
+                    {
+                        "role": "tool",
+                        "content": [{"type": "text", "text": "Rainy"}],
                         "name": "get_weather",
-                        "arguments": '{"city": "Paris"}',
+                        "tool_call_id": "call123",
                     },
-                }],
-            },
-            {
-                "role": "tool",
-                "content": [{
-                    "type": "text",
-                    "text": "Rainy"
-                }],
-                "name": "get_weather",
-                "tool_call_id": "call123",
+                ],
+                "tools": [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "description": "Gets the current weather in a city.",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "city": {
+                                        "type": "string",
+                                        "description": "The city name",
+                                    }
+                                },
+                                "required": ["city"],
+                            },
+                        },
+                    }
+                ],
             },
-        ],
-        "tools": [{
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Gets the current weather in a city.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "city": {
-                            "type": "string",
-                            "description": "The city name"
-                        }
+            (
+                [
+                    {
+                        "role": "user",
+                        "content": "What's the weather in Paris?",
+                    },
+                    {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call123",
+                                "type": "function",
+                                "function": {
+                                    "name": "get_weather",
+                                    "arguments": '{"city": "Paris"}',
+                                },
+                            }
+                        ],
+                    },
+                    {
+                        "role": "tool",
+                        "content": [{"type": "text", "text": "Rainy"}],
+                        "name": "get_weather",
+                        "tool_call_id": "call123",
                     },
-                    "required": ["city"],
-                },
-            },
-        }],
-    },
-    ChatCompletionRequest(
-        messages=[
-            UserMessage(content="What's the weather in Paris?"),
-            AssistantMessage(
-                content=None,
-                tool_calls=[
-                    ToolCall(
-                        id="call123",
-                        function=FunctionCall(
-                            name="get_weather",
-                            arguments='{"city": "Paris"}',
-                        ),
-                    )
                 ],
+                [
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "description": "Gets the current weather in a city.",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "city": {
+                                        "type": "string",
+                                        "description": "The city name",
+                                    }
+                                },
+                                "required": ["city"],
+                            },
+                        },
+                    }
+                ],
+            ),
+        )
+    ],
+)
+def test_prepare_apply_chat_template_tools_and_messages_list_content(
+    openai_request, expected_mistral_output
+):
+    actual_request = _prepare_apply_chat_template_tools_and_messages(
+        openai_request["messages"], openai_request["tools"]
+    )
+    assert actual_request == expected_mistral_output
+
+
+def test_prepare_apply_chat_template_generation_prompt_and_continue():
+    messages = [{"role": "assistant", "content": "Hello"}]
+    tools: list[dict[str, Any]] = []
+    with pytest.raises(ValueError):
+        _prepare_apply_chat_template_tools_and_messages(
+            messages, tools, add_generation_prompt=True
+        )
+
+    messages = [{"role": "user", "content": "Hello"}]
+    out_messages, _ = _prepare_apply_chat_template_tools_and_messages(
+        messages, tools, add_generation_prompt=True
+    )
+    assert out_messages == [{"role": "user", "content": "Hello"}]
+
+    with pytest.raises(ValueError):
+        _prepare_apply_chat_template_tools_and_messages(
+            messages, tools, add_generation_prompt=True, continue_final_message=True
+        )
+
+    messages = [{"role": "assistant", "content": "Hello"}]
+    out_messages, _ = _prepare_apply_chat_template_tools_and_messages(
+        messages, tools, add_generation_prompt=False, continue_final_message=True
+    )
+    assert out_messages == [{"role": "assistant", "content": "Hello"}]
+
+    messages = [{"role": "user", "content": "Hello"}]
+    with pytest.raises(ValueError):
+        _prepare_apply_chat_template_tools_and_messages(
+            messages, tools, add_generation_prompt=False, continue_final_message=True
+        )
+
+
+@pytest.fixture(scope="module")
+def mistral_tokenizer(request) -> MistralTokenizer:
+    return MistralTokenizer.from_pretrained(request.param)
+
+
+@pytest.mark.parametrize(
+    "mistral_tokenizer",
+    ["mistralai/Mistral-7B-Instruct-v0.3", "mistralai/Magistral-Small-2509"],
+    indirect=True,
+)
+class TestMistralTokenizer:
+    def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
+        attributes = [
+            mistral_tokenizer.all_special_tokens,
+            mistral_tokenizer.all_special_tokens_extended,
+        ]
+
+        for attribute in attributes:
+            if mistral_tokenizer.is_tekken:
+                assert attribute == [
+                    "<unk>",
+                    "<s>",
+                    "</s>",
+                    "[INST]",
+                    "[/INST]",
+                    "[AVAILABLE_TOOLS]",
+                    "[/AVAILABLE_TOOLS]",
+                    "[TOOL_RESULTS]",
+                    "[/TOOL_RESULTS]",
+                    "[TOOL_CALLS]",
+                    "[IMG]",
+                    "<pad>",
+                    "[IMG_BREAK]",
+                    "[IMG_END]",
+                    "[PREFIX]",
+                    "[MIDDLE]",
+                    "[SUFFIX]",
+                    "[SYSTEM_PROMPT]",
+                    "[/SYSTEM_PROMPT]",
+                    "[TOOL_CONTENT]",
+                ] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
+                    "[ARGS]",
+                    "[CALL_ID]",
+                    "[THINK]",
+                    "[/THINK]",
+                ] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
+            else:
+                assert attribute == [
+                    "<s>",
+                    "</s>",
+                    "[INST]",
+                    "[/INST]",
+                    "[TOOL_CALLS]",
+                    "[AVAILABLE_TOOLS]",
+                    "[/AVAILABLE_TOOLS]",
+                    "[TOOL_RESULTS]",
+                    "[/TOOL_RESULTS]",
+                ] + [f"[control_{i}]" for i in range(8, 769)]
+
+    def get_vocab(self, mistral_tokenizer: MistralTokenizer):
+        assert (
+            mistral_tokenizer.get_vocab()
+            == mistral_tokenizer.transformers_tokenizer.get_vocab()
+        )
+
+    def test_get_added_vocab(self, mistral_tokenizer: MistralTokenizer):
+        assert mistral_tokenizer.get_added_vocab() == {}
+
+    def test_encode_one(self, mistral_tokenizer: MistralTokenizer):
+        token_ids = (
+            [22177, 4304, 2662] if mistral_tokenizer.is_tekken else [23325, 2294, 1686]
+        )
+
+        assert mistral_tokenizer.encode_one("Hello world !") == token_ids
+        assert mistral_tokenizer.encode_one("Hello world !", max_length=1) == token_ids
+        assert (
+            mistral_tokenizer.encode_one("Hello world !", truncation=True, max_length=1)
+            == token_ids[:-2]
+        )
+        assert (
+            mistral_tokenizer.encode_one(
+                "Hello world !", truncation=False, max_length=1
+            )
+            == token_ids
+        )
+
+    def test_encode(self, mistral_tokenizer: MistralTokenizer):
+        token_ids = (
+            [1, 22177, 4304, 2662]
+            if mistral_tokenizer.is_tekken
+            else [1, 23325, 2294, 1686]
+        )
+
+        assert mistral_tokenizer.encode("Hello world !") == token_ids
+        assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-1]
+        assert (
+            mistral_tokenizer.encode("Hello world !", truncation=True, max_length=3)
+            == token_ids[:-1]
+        )
+        assert (
+            mistral_tokenizer.encode("Hello world !", truncation=False, max_length=3)
+            == token_ids
+        )
+
+        assert (
+            mistral_tokenizer.encode("Hello world !", add_special_tokens=True)
+            == token_ids
+        )
+        assert (
+            mistral_tokenizer.encode(
+                "Hello world !", add_special_tokens=True, max_length=3
+            )
+            == token_ids[:-1]
+        )
+        assert (
+            mistral_tokenizer.encode(
+                "Hello world !", add_special_tokens=True, truncation=False, max_length=3
+            )
+            == token_ids
+        )
+        assert (
+            mistral_tokenizer.encode("Hello world !", add_special_tokens=False)
+            == token_ids[1:]
+        )
+
+    @pytest.mark.parametrize(
+        "openai_request,add_generation_prompt,continue_final_message,expected_output,decoded_expected_output",
+        [
+            (
+                {
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": "Hello world !",
+                        }
+                    ],
+                },
+                True,
+                False,
+                ([1, 3, 23325, 2294, 1686, 4], [1, 3, 22177, 4304, 2662, 4]),
+                ("<s>[INST]▁Hello▁world▁![/INST]", ("<s>[INST]Hello world ![/INST]")),
             ),
-            ToolMessage(
-                content="Rainy",
-                tool_call_id="call123",
-                name="get_weather",
+            (
+                {
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "I am an AI",
+                        },
+                        {
+                            "role": "user",
+                            "content": "Hello world !",
+                        },
+                    ],
+                },
+                True,
+                False,
+                (
+                    [1, 3, 1083, 1605, 1164, 16875, 781, 781, 16998, 2294, 1686, 4],
+                    [1, 17, 1073, 1855, 1420, 26554, 18, 3, 22177, 4304, 2662, 4],
+                ),
+                (
+                    "<s>[INST]▁I▁am▁an▁AI<0x0A><0x0A>Hello▁world▁![/INST]",
+                    (
+                        "<s>[SYSTEM_PROMPT]I am an AI[/SYSTEM_PROMPT][INST]Hello world ![/INST]"  # noqa: E501
+                    ),
+                ),
             ),
-        ],
-        tools=[
-            Tool(
-                type="function",
-                function=Function(
-                    name="get_weather",
-                    description="Gets the current weather in a city.",
-                    parameters={
-                        "type": "object",
-                        "properties": {
-                            "city": {
-                                "type": "string",
-                                "description": "The city name"
-                            }
+            (
+                {
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "I am an AI",
                         },
-                        "required": ["city"],
-                    },
+                        {
+                            "role": "user",
+                            "content": "Hello world !",
+                        },
+                    ],
+                    "tools": [
+                        {
+                            "type": "function",
+                            "function": {
+                                "name": "get_weather",
+                                "description": "Gets the current weather in a city.",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "city": {
+                                            "type": "string",
+                                            "description": "The city name",
+                                        }
+                                    },
+                                    "required": ["city"],
+                                },
+                            },
+                        }
+                    ],
+                },
+                True,
+                False,
+                (
+                    [
+                        1,
+                        6,
+                        1501,
+                        7567,
+                        1891,
+                        2032,
+                        1113,
+                        3396,
+                        1316,
+                        1113,
+                        3396,
+                        2032,
+                        10598,
+                        1629,
+                        2032,
+                        1113,
+                        1295,
+                        29498,
+                        1537,
+                        1991,
+                        1316,
+                        1113,
+                        7286,
+                        2032,
+                        1113,
+                        2226,
+                        29481,
+                        1040,
+                        2636,
+                        8854,
+                        1065,
+                        1032,
+                        3758,
+                        9959,
+                        1113,
+                        12206,
+                        2032,
+                        10598,
+                        1891,
+                        2032,
+                        1113,
+                        3582,
+                        1316,
+                        1113,
+                        11491,
+                        2032,
+                        10598,
+                        19141,
+                        2032,
+                        10598,
+                        1891,
+                        2032,
+                        1113,
+                        2195,
+                        1316,
+                        1113,
+                        7286,
+                        2032,
+                        1113,
+                        1782,
+                        3758,
+                        1909,
+                        29507,
+                        11549,
+                        1113,
+                        11661,
+                        2032,
+                        8135,
+                        19141,
+                        3010,
+                        1743,
+                        10925,
+                        7,
+                        3,
+                        1083,
+                        1605,
+                        1164,
+                        16875,
+                        781,
+                        781,
+                        16998,
+                        2294,
+                        1686,
+                        4,
+                    ],
+                    [
+                        1,
+                        17,
+                        1073,
+                        1855,
+                        1420,
+                        26554,
+                        18,
+                        5,
+                        1091,
+                        19227,
+                        4994,
+                        2811,
+                        1429,
+                        5165,
+                        1897,
+                        1429,
+                        5165,
+                        2811,
+                        16753,
+                        2391,
+                        2811,
+                        1429,
+                        1689,
+                        1095,
+                        45629,
+                        1897,
+                        1429,
+                        14653,
+                        2811,
+                        1429,
+                        1071,
+                        3083,
+                        1278,
+                        3519,
+                        17253,
+                        1294,
+                        1261,
+                        5970,
+                        39249,
+                        1429,
+                        26204,
+                        2811,
+                        16753,
+                        4994,
+                        2811,
+                        1429,
+                        6371,
+                        1897,
+                        1429,
+                        48649,
+                        2811,
+                        16753,
+                        29363,
+                        2811,
+                        16753,
+                        4994,
+                        2811,
+                        1429,
+                        3607,
+                        1897,
+                        1429,
+                        14653,
+                        2811,
+                        1429,
+                        1784,
+                        5970,
+                        2564,
+                        1034,
+                        47579,
+                        1429,
+                        15760,
+                        2811,
+                        12161,
+                        29363,
+                        4964,
+                        2821,
+                        27028,
+                        6,
+                        3,
+                        22177,
+                        4304,
+                        2662,
+                        4,
+                    ],
                 ),
-            )
+                (
+                    '<s>[AVAILABLE_TOOLS]▁[{"type":▁"function",▁"function":▁{"name":▁"get_weather",▁"description":▁"Gets▁the▁current▁weather▁in▁a▁city.",▁"parameters":▁{"type":▁"object",▁"properties":▁{"city":▁{"type":▁"string",▁"description":▁"The▁city▁name"}},▁"required":▁["city"]}}}][/AVAILABLE_TOOLS][INST]▁I▁am▁an▁AI<0x0A><0x0A>Hello▁world▁![/INST]',
+                    (
+                        '<s>[SYSTEM_PROMPT]I am an AI[/SYSTEM_PROMPT][AVAILABLE_TOOLS][{"type": "function", "function": {"name": "get_weather", "description": "Gets the current weather in a city.", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city name"}}, "required": ["city"]}}}][/AVAILABLE_TOOLS][INST]Hello world ![/INST]'  # noqa: E501
+                    ),
+                ),
+            ),
+            (
+                {
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "I am an AI",
+                        },
+                        {
+                            "role": "user",
+                            "content": "Hello world !",
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "",
+                            "tool_calls": [
+                                {
+                                    "id": "123456789",
+                                    "type": "function",
+                                    "function": {
+                                        "name": "get_weather",
+                                        "arguments": '{"city": "Paris"}',
+                                    },
+                                }
+                            ],
+                        },
+                        {
+                            "role": "tool",
+                            "tool_call_id": "123456789",
+                            "content": '{"temperature": 20, "unit": "celsius"}',
+                        },
+                    ],
+                    "tools": [
+                        {
+                            "type": "function",
+                            "function": {
+                                "name": "get_weather",
+                                "description": "Gets the current weather in a city.",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "city": {
+                                            "type": "string",
+                                            "description": "The city name",
+                                        }
+                                    },
+                                    "required": ["city"],
+                                },
+                            },
+                        }
+                    ],
+                },
+                True,
+                False,
+                (
+                    [
+                        1,
+                        6,
+                        1501,
+                        7567,
+                        1891,
+                        2032,
+                        1113,
+                        3396,
+                        1316,
+                        1113,
+                        3396,
+                        2032,
+                        10598,
+                        1629,
+                        2032,
+                        1113,
+                        1295,
+                        29498,
+                        1537,
+                        1991,
+                        1316,
+                        1113,
+                        7286,
+                        2032,
+                        1113,
+                        2226,
+                        29481,
+                        1040,
+                        2636,
+                        8854,
+                        1065,
+                        1032,
+                        3758,
+                        9959,
+                        1113,
+                        12206,
+                        2032,
+                        10598,
+                        1891,
+                        2032,
+                        1113,
+                        3582,
+                        1316,
+                        1113,
+                        11491,
+                        2032,
+                        10598,
+                        19141,
+                        2032,
+                        10598,
+                        1891,
+                        2032,
+                        1113,
+                        2195,
+                        1316,
+                        1113,
+                        7286,
+                        2032,
+                        1113,
+                        1782,
+                        3758,
+                        1909,
+                        29507,
+                        11549,
+                        1113,
+                        11661,
+                        2032,
+                        8135,
+                        19141,
+                        3010,
+                        1743,
+                        10925,
+                        7,
+                        3,
+                        1083,
+                        1605,
+                        1164,
+                        16875,
+                        781,
+                        781,
+                        16998,
+                        2294,
+                        1686,
+                        4,
+                        5,
+                        1501,
+                        7567,
+                        1629,
+                        2032,
+                        1113,
+                        1295,
+                        29498,
+                        1537,
+                        1991,
+                        1316,
+                        1113,
+                        17452,
+                        2032,
+                        10598,
+                        19141,
+                        2032,
+                        1113,
+                        4684,
+                        1046,
+                        8474,
+                        1113,
+                        1081,
+                        2032,
+                        1113,
+                        29508,
+                        29518,
+                        29538,
+                        29549,
+                        29550,
+                        29552,
+                        29555,
+                        29551,
+                        29542,
+                        29507,
+                        10925,
+                        2,
+                        8,
+                        10598,
+                        4557,
+                        2032,
+                        10598,
+                        29475,
+                        17329,
+                        2032,
+                        29473,
+                        29518,
+                        29502,
+                        29493,
+                        1113,
+                        6074,
+                        2032,
+                        1113,
+                        29485,
+                        1958,
+                        3938,
+                        8474,
+                        1113,
+                        3613,
+                        29498,
+                        1081,
+                        2032,
+                        1113,
+                        29508,
+                        29518,
+                        29538,
+                        29549,
+                        29550,
+                        29552,
+                        29555,
+                        29551,
+                        29542,
+                        18163,
+                        9,
+                    ],
+                    [
+                        1,
+                        17,
+                        1073,
+                        1855,
+                        1420,
+                        26554,
+                        18,
+                        5,
+                        1091,
+                        19227,
+                        4994,
+                        2811,
+                        1429,
+                        5165,
+                        1897,
+                        1429,
+                        5165,
+                        2811,
+                        16753,
+                        2391,
+                        2811,
+                        1429,
+                        1689,
+                        1095,
+                        45629,
+                        1897,
+                        1429,
+                        14653,
+                        2811,
+                        1429,
+                        1071,
+                        3083,
+                        1278,
+                        3519,
+                        17253,
+                        1294,
+                        1261,
+                        5970,
+                        39249,
+                        1429,
+                        26204,
+                        2811,
+                        16753,
+                        4994,
+                        2811,
+                        1429,
+                        6371,
+                        1897,
+                        1429,
+                        48649,
+                        2811,
+                        16753,
+                        29363,
+                        2811,
+                        16753,
+                        4994,
+                        2811,
+                        1429,
+                        3607,
+                        1897,
+                        1429,
+                        14653,
+                        2811,
+                        1429,
+                        1784,
+                        5970,
+                        2564,
+                        1034,
+                        47579,
+                        1429,
+                        15760,
+                        2811,
+                        12161,
+                        29363,
+                        4964,
+                        2821,
+                        27028,
+                        6,
+                        3,
+                        22177,
+                        4304,
+                        2662,
+                        4,
+                        9,
+                        1689,
+                        1095,
+                        45629,
+                        32,
+                        19227,
+                        29363,
+                        2811,
+                        1429,
+                        42572,
+                        46005,
+                        2,
+                        7,
+                        19227,
+                        113824,
+                        2811,
+                        1032,
+                        1050,
+                        1048,
+                        1044,
+                        1429,
+                        8979,
+                        2811,
+                        1429,
+                        1099,
+                        79092,
+                        46005,
+                        8,
+                    ],
+                ),
+                (
+                    '<s>[AVAILABLE_TOOLS]▁[{"type":▁"function",▁"function":▁{"name":▁"get_weather",▁"description":▁"Gets▁the▁current▁weather▁in▁a▁city.",▁"parameters":▁{"type":▁"object",▁"properties":▁{"city":▁{"type":▁"string",▁"description":▁"The▁city▁name"}},▁"required":▁["city"]}}}][/AVAILABLE_TOOLS][INST]▁I▁am▁an▁AI<0x0A><0x0A>Hello▁world▁![/INST][TOOL_CALLS]▁[{"name":▁"get_weather",▁"arguments":▁{"city":▁"Paris"},▁"id":▁"123456789"}]</s>[TOOL_RESULTS]▁{"content":▁{"temperature":▁20,▁"unit":▁"celsius"},▁"call_id":▁"123456789"}[/TOOL_RESULTS]',
+                    (
+                        '<s>[SYSTEM_PROMPT]I am an AI[/SYSTEM_PROMPT][AVAILABLE_TOOLS][{"type": "function", "function": {"name": "get_weather", "description": "Gets the current weather in a city.", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city name"}}, "required": ["city"]}}}][/AVAILABLE_TOOLS][INST]Hello world ![/INST][TOOL_CALLS]get_weather[ARGS]{"city": "Paris"}</s>[TOOL_RESULTS]{"temperature": 20, "unit": "celsius"}[/TOOL_RESULTS]'  # noqa: E501
+                    ),
+                ),
+            ),
+            (
+                {
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": "Hello world !",
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "Hello ",
+                        },
+                    ],
+                },
+                False,
+                True,
+                (
+                    [1, 3, 23325, 2294, 1686, 4, 23325],
+                    [1, 3, 22177, 4304, 2662, 4, 22177, 2],
+                ),
+                (
+                    "<s>[INST]▁Hello▁world▁![/INST]▁Hello",
+                    ("<s>[INST]Hello world ![/INST]Hello</s>"),
+                ),
+            ),
         ],
-    ),
-)])
-def test_make_mistral_chat_completion_request_list_content(
-        openai_request, expected_mistral_request):
-    actual_request = make_mistral_chat_completion_request(
-        openai_request["messages"], openai_request["tools"])
-    assert actual_request == expected_mistral_request
+    )
+    def test_apply_chat_template(
+        self,
+        mistral_tokenizer: MistralTokenizer,
+        openai_request: dict[str, Any],
+        add_generation_prompt: bool,
+        continue_final_message: bool,
+        expected_output: tuple[list[int], list[int]],
+        decoded_expected_output: tuple[str, str],
+    ):
+        actual_output = mistral_tokenizer.apply_chat_template(
+            openai_request["messages"],
+            tools=openai_request.get("tools", []),
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=continue_final_message,
+        )
+        decoded_actual_output = mistral_tokenizer.tokenizer.decode(
+            actual_output, SpecialTokenPolicy.KEEP
+        )
+
+        assert actual_output == expected_output[mistral_tokenizer.is_tekken]
+        assert (
+            decoded_actual_output
+            == decoded_expected_output[mistral_tokenizer.is_tekken]
+        )
+
+    def test_apply_chat_template_error(self, mistral_tokenizer: MistralTokenizer):
+        messages = [{"role": "user", "content": "Hello world !"}]
+
+        with pytest.raises(ValueError):
+            mistral_tokenizer.apply_chat_template(
+                messages,
+                tools=[],
+                add_generation_prompt=True,
+                continue_final_message=True,
+            )
+
+        with pytest.raises(ValueError):
+            mistral_tokenizer.apply_chat_template(
+                messages,
+                tools=[],
+                add_generation_prompt=False,
+                continue_final_message=True,
+            )
+
+        messages = [
+            {"role": "user", "content": "Hello world !"},
+            {"role": "assistant", "content": "Hello "},
+        ]
+        with pytest.raises(ValueError):
+            mistral_tokenizer.apply_chat_template(
+                messages,
+                tools=[],
+                add_generation_prompt=True,
+                continue_final_message=False,
+            )
+
+        messages = [
+            {"role": "user", "content": "Hello world !"},
+            {"role": "assistant", "content": "Hello "},
+        ]
+        with pytest.raises(InvalidMessageStructureException):
+            mistral_tokenizer.apply_chat_template(
+                messages,
+                tools=[],
+                add_generation_prompt=False,
+                continue_final_message=False,
+            )
+
+    @pytest.mark.parametrize(
+        "skip_special_tokens,expected_tokens",
+        (
+            (
+                False,
+                (
+                    "<s>[INST]▁Hello▁world▁![/INST]▁Hello</s>",
+                    "<s>[INST]Hello world ![/INST]Hello</s>",
+                ),
+            ),
+            (True, ("Hello world ! Hello", "Hello world !Hello")),
+        ),
+    )
+    def test_decode(
+        self,
+        mistral_tokenizer: MistralTokenizer,
+        skip_special_tokens: bool,
+        expected_tokens: tuple[str, str],
+    ):
+        ids = (
+            [1, 3, 23325, 2294, 1686, 4, 23325, 2],
+            [1, 3, 22177, 4304, 2662, 4, 22177, 2],
+        )
+        assert (
+            mistral_tokenizer.decode(
+                ids[mistral_tokenizer.is_tekken],
+                skip_special_tokens=skip_special_tokens,
+            )
+            == expected_tokens[mistral_tokenizer.is_tekken]
+        )
+
+    def test_decode_int(
+        self,
+        mistral_tokenizer: MistralTokenizer,
+    ):
+        ids = 1
+        assert (
+            mistral_tokenizer.decode(
+                ids,
+                skip_special_tokens=False,
+            )
+            == "<s>"
+        )
+
+    def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
+        tokens = (
+            [
+                "<s>",
+                "[AVAILABLE_TOOLS]",
+                "▁[",
+                '{"',
+                "type",
+                '":',
+                '▁"',
+                "function",
+                '",',
+                '▁"',
+                "function",
+                '":',
+                '▁{"',
+                "name",
+                '":',
+                '▁"',
+                "get",
+                "_",
+                "we",
+                "ather",
+                '",',
+                '▁"',
+                "description",
+                '":',
+                '▁"',
+                "Get",
+                "s",
+                "▁the",
+                "▁current",
+                "▁weather",
+                "▁in",
+                "▁a",
+                "▁city",
+                '.",',
+                '▁"',
+                "parameters",
+                '":',
+                '▁{"',
+                "type",
+                '":',
+                '▁"',
+                "object",
+                '",',
+                '▁"',
+                "properties",
+                '":',
+                '▁{"',
+                "city",
+                '":',
+                '▁{"',
+                "type",
+                '":',
+                '▁"',
+                "string",
+                '",',
+                '▁"',
+                "description",
+                '":',
+                '▁"',
+                "The",
+                "▁city",
+                "▁name",
+                '"',
+                "}},",
+                '▁"',
+                "required",
+                '":',
+                '▁["',
+                "city",
+                '"]',
+                "}}",
+                "}]",
+                "[/AVAILABLE_TOOLS]",
+                "[INST]",
+                "▁I",
+                "▁am",
+                "▁an",
+                "▁AI",
+                "<0x0A>",
+                "<0x0A>",
+                "Hello",
+                "▁world",
+                "▁!",
+                "[/INST]",
+                "[TOOL_CALLS]",
+                "▁[",
+                '{"',
+                "name",
+                '":',
+                '▁"',
+                "get",
+                "_",
+                "we",
+                "ather",
+                '",',
+                '▁"',
+                "arguments",
+                '":',
+                '▁{"',
+                "city",
+                '":',
+                '▁"',
+                "Par",
+                "is",
+                '"},',
+                '▁"',
+                "id",
+                '":',
+                '▁"',
+                "1",
+                "2",
+                "3",
+                "4",
+                "5",
+                "6",
+                "7",
+                "8",
+                "9",
+                '"',
+                "}]",
+                "</s>",
+                "[TOOL_RESULTS]",
+                '▁{"',
+                "content",
+                '":',
+                '▁{"',
+                "t",
+                "emperature",
+                '":',
+                "▁",
+                "2",
+                "0",
+                ",",
+                '▁"',
+                "unit",
+                '":',
+                '▁"',
+                "c",
+                "els",
+                "ius",
+                '"},',
+                '▁"',
+                "call",
+                "_",
+                "id",
+                '":',
+                '▁"',
+                "1",
+                "2",
+                "3",
+                "4",
+                "5",
+                "6",
+                "7",
+                "8",
+                "9",
+                '"}',
+                "[/TOOL_RESULTS]",
+            ],
+            [
+                "<s>",
+                "[SYSTEM_PROMPT]",
+                "I",
+                " am",
+                " an",
+                " AI",
+                "[/SYSTEM_PROMPT]",
+                "[AVAILABLE_TOOLS]",
+                "[",
+                '{"',
+                "type",
+                '":',
+                ' "',
+                "function",
+                '",',
+                ' "',
+                "function",
+                '":',
+                ' {"',
+                "name",
+                '":',
+                ' "',
+                "get",
+                "_",
+                "weather",
+                '",',
+                ' "',
+                "description",
+                '":',
+                ' "',
+                "G",
+                "ets",
+                " the",
+                " current",
+                " weather",
+                " in",
+                " a",
+                " city",
+                '.",',
+                ' "',
+                "parameters",
+                '":',
+                ' {"',
+                "type",
+                '":',
+                ' "',
+                "object",
+                '",',
+                ' "',
+                "properties",
+                '":',
+                ' {"',
+                "city",
+                '":',
+                ' {"',
+                "type",
+                '":',
+                ' "',
+                "string",
+                '",',
+                ' "',
+                "description",
+                '":',
+                ' "',
+                "The",
+                " city",
+                " name",
+                '"',
+                "}},",
+                ' "',
+                "required",
+                '":',
+                ' ["',
+                "city",
+                '"]',
+                "}}",
+                "}]",
+                "[/AVAILABLE_TOOLS]",
+                "[INST]",
+                "Hello",
+                " world",
+                " !",
+                "[/INST]",
+                "[TOOL_CALLS]",
+                "get",
+                "_",
+                "weather",
+                "[ARGS]",
+                '{"',
+                "city",
+                '":',
+                ' "',
+                "Paris",
+                '"}',
+                "</s>",
+                "[TOOL_RESULTS]",
+                '{"',
+                "temperature",
+                '":',
+                " ",
+                "2",
+                "0",
+                ",",
+                ' "',
+                "unit",
+                '":',
+                ' "',
+                "c",
+                "elsius",
+                '"}',
+                "[/TOOL_RESULTS]",
+            ],
+        )
+
+        expected_strings = (
+            '[{"type": "function", "function": {"name": "get_weather", "description": "Gets the current weather in a city.", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city name"}}, "required": ["city"]}}}] I am an AI\n\nHello world ![TOOL_CALLS][{"name": "get_weather", "arguments": {"city": "Paris"}, "id": "123456789"}] {"content": {"temperature": 20, "unit": "celsius"}, "call_id": "123456789"}',  # noqa: E501
+            'I am an AI[{"type": "function", "function": {"name": "get_weather", "description": "Gets the current weather in a city.", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city name"}}, "required": ["city"]}}}]Hello world ![TOOL_CALLS]get_weather{"city": "Paris"}{"temperature": 20, "unit": "celsius"}',  # noqa: E501
+        )
+
+        assert (
+            mistral_tokenizer.convert_tokens_to_string(
+                tokens[mistral_tokenizer.is_tekken]
+            )
+            == expected_strings[mistral_tokenizer.is_tekken]
+        )
+
+    @pytest.mark.parametrize(
+        "skip_special_tokens,tuple_expected_tokens",
+        (
+            (
+                True,
+                (
+                    [
+                        "▁[",
+                        '{"',
+                        "type",
+                        '":',
+                        '▁"',
+                        "function",
+                        '",',
+                        '▁"',
+                        "function",
+                        '":',
+                        '▁{"',
+                        "name",
+                        '":',
+                        '▁"',
+                        "get",
+                        "_",
+                        "we",
+                        "ather",
+                        '",',
+                        '▁"',
+                        "description",
+                        '":',
+                        '▁"',
+                        "Get",
+                        "s",
+                        "▁the",
+                        "▁current",
+                        "▁weather",
+                        "▁in",
+                        "▁a",
+                        "▁city",
+                        '.",',
+                        '▁"',
+                        "parameters",
+                        '":',
+                        '▁{"',
+                        "type",
+                        '":',
+                        '▁"',
+                        "object",
+                        '",',
+                        '▁"',
+                        "properties",
+                        '":',
+                        '▁{"',
+                        "city",
+                        '":',
+                        '▁{"',
+                        "type",
+                        '":',
+                        '▁"',
+                        "string",
+                        '",',
+                        '▁"',
+                        "description",
+                        '":',
+                        '▁"',
+                        "The",
+                        "▁city",
+                        "▁name",
+                        '"',
+                        "}},",
+                        '▁"',
+                        "required",
+                        '":',
+                        '▁["',
+                        "city",
+                        '"]',
+                        "}}",
+                        "}]",
+                        "▁I",
+                        "▁am",
+                        "▁an",
+                        "▁AI",
+                        "<0x0A>",
+                        "<0x0A>",
+                        "Hello",
+                        "▁world",
+                        "▁!",
+                        "[TOOL_CALLS]",
+                        "▁[",
+                        '{"',
+                        "name",
+                        '":',
+                        '▁"',
+                        "get",
+                        "_",
+                        "we",
+                        "ather",
+                        '",',
+                        '▁"',
+                        "arguments",
+                        '":',
+                        '▁{"',
+                        "city",
+                        '":',
+                        '▁"',
+                        "Par",
+                        "is",
+                        '"},',
+                        '▁"',
+                        "id",
+                        '":',
+                        '▁"',
+                        "1",
+                        "2",
+                        "3",
+                        "4",
+                        "5",
+                        "6",
+                        "7",
+                        "8",
+                        "9",
+                        '"',
+                        "}]",
+                        '▁{"',
+                        "content",
+                        '":',
+                        '▁{"',
+                        "t",
+                        "emperature",
+                        '":',
+                        "▁",
+                        "2",
+                        "0",
+                        ",",
+                        '▁"',
+                        "unit",
+                        '":',
+                        '▁"',
+                        "c",
+                        "els",
+                        "ius",
+                        '"},',
+                        '▁"',
+                        "call",
+                        "_",
+                        "id",
+                        '":',
+                        '▁"',
+                        "1",
+                        "2",
+                        "3",
+                        "4",
+                        "5",
+                        "6",
+                        "7",
+                        "8",
+                        "9",
+                        '"}',
+                    ],
+                    [
+                        "I",
+                        " am",
+                        " an",
+                        " AI",
+                        "[",
+                        '{"',
+                        "type",
+                        '":',
+                        ' "',
+                        "function",
+                        '",',
+                        ' "',
+                        "function",
+                        '":',
+                        ' {"',
+                        "name",
+                        '":',
+                        ' "',
+                        "get",
+                        "_",
+                        "weather",
+                        '",',
+                        ' "',
+                        "description",
+                        '":',
+                        ' "',
+                        "G",
+                        "ets",
+                        " the",
+                        " current",
+                        " weather",
+                        " in",
+                        " a",
+                        " city",
+                        '.",',
+                        ' "',
+                        "parameters",
+                        '":',
+                        ' {"',
+                        "type",
+                        '":',
+                        ' "',
+                        "object",
+                        '",',
+                        ' "',
+                        "properties",
+                        '":',
+                        ' {"',
+                        "city",
+                        '":',
+                        ' {"',
+                        "type",
+                        '":',
+                        ' "',
+                        "string",
+                        '",',
+                        ' "',
+                        "description",
+                        '":',
+                        ' "',
+                        "The",
+                        " city",
+                        " name",
+                        '"',
+                        "}},",
+                        ' "',
+                        "required",
+                        '":',
+                        ' ["',
+                        "city",
+                        '"]',
+                        "}}",
+                        "}]",
+                        "Hello",
+                        " world",
+                        " !",
+                        "[TOOL_CALLS]",
+                        "get",
+                        "_",
+                        "weather",
+                        '{"',
+                        "city",
+                        '":',
+                        ' "',
+                        "Paris",
+                        '"}',
+                        '{"',
+                        "temperature",
+                        '":',
+                        " ",
+                        "2",
+                        "0",
+                        ",",
+                        ' "',
+                        "unit",
+                        '":',
+                        ' "',
+                        "c",
+                        "elsius",
+                        '"}',
+                    ],
+                ),
+            ),
+            (
+                False,
+                (
+                    [
+                        "<s>",
+                        "[AVAILABLE_TOOLS]",
+                        "▁[",
+                        '{"',
+                        "type",
+                        '":',
+                        '▁"',
+                        "function",
+                        '",',
+                        '▁"',
+                        "function",
+                        '":',
+                        '▁{"',
+                        "name",
+                        '":',
+                        '▁"',
+                        "get",
+                        "_",
+                        "we",
+                        "ather",
+                        '",',
+                        '▁"',
+                        "description",
+                        '":',
+                        '▁"',
+                        "Get",
+                        "s",
+                        "▁the",
+                        "▁current",
+                        "▁weather",
+                        "▁in",
+                        "▁a",
+                        "▁city",
+                        '.",',
+                        '▁"',
+                        "parameters",
+                        '":',
+                        '▁{"',
+                        "type",
+                        '":',
+                        '▁"',
+                        "object",
+                        '",',
+                        '▁"',
+                        "properties",
+                        '":',
+                        '▁{"',
+                        "city",
+                        '":',
+                        '▁{"',
+                        "type",
+                        '":',
+                        '▁"',
+                        "string",
+                        '",',
+                        '▁"',
+                        "description",
+                        '":',
+                        '▁"',
+                        "The",
+                        "▁city",
+                        "▁name",
+                        '"',
+                        "}},",
+                        '▁"',
+                        "required",
+                        '":',
+                        '▁["',
+                        "city",
+                        '"]',
+                        "}}",
+                        "}]",
+                        "[/AVAILABLE_TOOLS]",
+                        "[INST]",
+                        "▁I",
+                        "▁am",
+                        "▁an",
+                        "▁AI",
+                        "<0x0A>",
+                        "<0x0A>",
+                        "Hello",
+                        "▁world",
+                        "▁!",
+                        "[/INST]",
+                        "[TOOL_CALLS]",
+                        "▁[",
+                        '{"',
+                        "name",
+                        '":',
+                        '▁"',
+                        "get",
+                        "_",
+                        "we",
+                        "ather",
+                        '",',
+                        '▁"',
+                        "arguments",
+                        '":',
+                        '▁{"',
+                        "city",
+                        '":',
+                        '▁"',
+                        "Par",
+                        "is",
+                        '"},',
+                        '▁"',
+                        "id",
+                        '":',
+                        '▁"',
+                        "1",
+                        "2",
+                        "3",
+                        "4",
+                        "5",
+                        "6",
+                        "7",
+                        "8",
+                        "9",
+                        '"',
+                        "}]",
+                        "</s>",
+                        "[TOOL_RESULTS]",
+                        '▁{"',
+                        "content",
+                        '":',
+                        '▁{"',
+                        "t",
+                        "emperature",
+                        '":',
+                        "▁",
+                        "2",
+                        "0",
+                        ",",
+                        '▁"',
+                        "unit",
+                        '":',
+                        '▁"',
+                        "c",
+                        "els",
+                        "ius",
+                        '"},',
+                        '▁"',
+                        "call",
+                        "_",
+                        "id",
+                        '":',
+                        '▁"',
+                        "1",
+                        "2",
+                        "3",
+                        "4",
+                        "5",
+                        "6",
+                        "7",
+                        "8",
+                        "9",
+                        '"}',
+                        "[/TOOL_RESULTS]",
+                    ],
+                    [
+                        "<s>",
+                        "[SYSTEM_PROMPT]",
+                        "I",
+                        " am",
+                        " an",
+                        " AI",
+                        "[/SYSTEM_PROMPT]",
+                        "[AVAILABLE_TOOLS]",
+                        "[",
+                        '{"',
+                        "type",
+                        '":',
+                        ' "',
+                        "function",
+                        '",',
+                        ' "',
+                        "function",
+                        '":',
+                        ' {"',
+                        "name",
+                        '":',
+                        ' "',
+                        "get",
+                        "_",
+                        "weather",
+                        '",',
+                        ' "',
+                        "description",
+                        '":',
+                        ' "',
+                        "G",
+                        "ets",
+                        " the",
+                        " current",
+                        " weather",
+                        " in",
+                        " a",
+                        " city",
+                        '.",',
+                        ' "',
+                        "parameters",
+                        '":',
+                        ' {"',
+                        "type",
+                        '":',
+                        ' "',
+                        "object",
+                        '",',
+                        ' "',
+                        "properties",
+                        '":',
+                        ' {"',
+                        "city",
+                        '":',
+                        ' {"',
+                        "type",
+                        '":',
+                        ' "',
+                        "string",
+                        '",',
+                        ' "',
+                        "description",
+                        '":',
+                        ' "',
+                        "The",
+                        " city",
+                        " name",
+                        '"',
+                        "}},",
+                        ' "',
+                        "required",
+                        '":',
+                        ' ["',
+                        "city",
+                        '"]',
+                        "}}",
+                        "}]",
+                        "[/AVAILABLE_TOOLS]",
+                        "[INST]",
+                        "Hello",
+                        " world",
+                        " !",
+                        "[/INST]",
+                        "[TOOL_CALLS]",
+                        "get",
+                        "_",
+                        "weather",
+                        "[ARGS]",
+                        '{"',
+                        "city",
+                        '":',
+                        ' "',
+                        "Paris",
+                        '"}',
+                        "</s>",
+                        "[TOOL_RESULTS]",
+                        '{"',
+                        "temperature",
+                        '":',
+                        " ",
+                        "2",
+                        "0",
+                        ",",
+                        ' "',
+                        "unit",
+                        '":',
+                        ' "',
+                        "c",
+                        "elsius",
+                        '"}',
+                        "[/TOOL_RESULTS]",
+                    ],
+                ),
+            ),
+        ),
+    )
+    def test_convert_ids_to_tokens(
+        self,
+        mistral_tokenizer: MistralTokenizer,
+        skip_special_tokens: bool,
+        tuple_expected_tokens: tuple[list[str], list[str]],
+    ):
+        tuple_ids = (
+            [
+                1,
+                6,
+                1501,
+                7567,
+                1891,
+                2032,
+                1113,
+                3396,
+                1316,
+                1113,
+                3396,
+                2032,
+                10598,
+                1629,
+                2032,
+                1113,
+                1295,
+                29498,
+                1537,
+                1991,
+                1316,
+                1113,
+                7286,
+                2032,
+                1113,
+                2226,
+                29481,
+                1040,
+                2636,
+                8854,
+                1065,
+                1032,
+                3758,
+                9959,
+                1113,
+                12206,
+                2032,
+                10598,
+                1891,
+                2032,
+                1113,
+                3582,
+                1316,
+                1113,
+                11491,
+                2032,
+                10598,
+                19141,
+                2032,
+                10598,
+                1891,
+                2032,
+                1113,
+                2195,
+                1316,
+                1113,
+                7286,
+                2032,
+                1113,
+                1782,
+                3758,
+                1909,
+                29507,
+                11549,
+                1113,
+                11661,
+                2032,
+                8135,
+                19141,
+                3010,
+                1743,
+                10925,
+                7,
+                3,
+                1083,
+                1605,
+                1164,
+                16875,
+                781,
+                781,
+                16998,
+                2294,
+                1686,
+                4,
+                5,
+                1501,
+                7567,
+                1629,
+                2032,
+                1113,
+                1295,
+                29498,
+                1537,
+                1991,
+                1316,
+                1113,
+                17452,
+                2032,
+                10598,
+                19141,
+                2032,
+                1113,
+                4684,
+                1046,
+                8474,
+                1113,
+                1081,
+                2032,
+                1113,
+                29508,
+                29518,
+                29538,
+                29549,
+                29550,
+                29552,
+                29555,
+                29551,
+                29542,
+                29507,
+                10925,
+                2,
+                8,
+                10598,
+                4557,
+                2032,
+                10598,
+                29475,
+                17329,
+                2032,
+                29473,
+                29518,
+                29502,
+                29493,
+                1113,
+                6074,
+                2032,
+                1113,
+                29485,
+                1958,
+                3938,
+                8474,
+                1113,
+                3613,
+                29498,
+                1081,
+                2032,
+                1113,
+                29508,
+                29518,
+                29538,
+                29549,
+                29550,
+                29552,
+                29555,
+                29551,
+                29542,
+                18163,
+                9,
+            ],
+            [
+                1,
+                17,
+                1073,
+                1855,
+                1420,
+                26554,
+                18,
+                5,
+                1091,
+                19227,
+                4994,
+                2811,
+                1429,
+                5165,
+                1897,
+                1429,
+                5165,
+                2811,
+                16753,
+                2391,
+                2811,
+                1429,
+                1689,
+                1095,
+                45629,
+                1897,
+                1429,
+                14653,
+                2811,
+                1429,
+                1071,
+                3083,
+                1278,
+                3519,
+                17253,
+                1294,
+                1261,
+                5970,
+                39249,
+                1429,
+                26204,
+                2811,
+                16753,
+                4994,
+                2811,
+                1429,
+                6371,
+                1897,
+                1429,
+                48649,
+                2811,
+                16753,
+                29363,
+                2811,
+                16753,
+                4994,
+                2811,
+                1429,
+                3607,
+                1897,
+                1429,
+                14653,
+                2811,
+                1429,
+                1784,
+                5970,
+                2564,
+                1034,
+                47579,
+                1429,
+                15760,
+                2811,
+                12161,
+                29363,
+                4964,
+                2821,
+                27028,
+                6,
+                3,
+                22177,
+                4304,
+                2662,
+                4,
+                9,
+                1689,
+                1095,
+                45629,
+                32,
+                19227,
+                29363,
+                2811,
+                1429,
+                42572,
+                46005,
+                2,
+                7,
+                19227,
+                113824,
+                2811,
+                1032,
+                1050,
+                1048,
+                1044,
+                1429,
+                8979,
+                2811,
+                1429,
+                1099,
+                79092,
+                46005,
+                8,
+            ],
+        )
+
+        ids = tuple_ids[mistral_tokenizer.is_tekken]
+        expected_tokens = tuple_expected_tokens[mistral_tokenizer.is_tekken]
+        actual_tokens = mistral_tokenizer.convert_ids_to_tokens(
+            ids, skip_special_tokens=skip_special_tokens
+        )
+        assert actual_tokens == expected_tokens
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
index 09a3638fd2ed11a9aa3f04c6826f1cf04c6fb229..e86bb03883b5ed535e04006f59cecae42e9b22b3 100644
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -19,5 +19,5 @@ def test_tokenizer_revision(tokenizer_name: str):
     assert isinstance(tokenizer, PreTrainedTokenizerBase)
 
     # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
+    with pytest.raises(OSError, match="not a valid git identifier"):
         get_tokenizer(tokenizer_name, revision="never")
diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py
index 68d4b416b4c996e07f8d3e72ba4170fa85f3f95e..d89737888aa2b0013b229e64d53356a32f85b4af 100644
--- a/tests/tokenization/test_tokenizer_registry.py
+++ b/tests/tokenization/test_tokenizer_registry.py
@@ -1,18 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
-                                                    TokenizerRegistry)
+from vllm.transformers_utils.tokenizer_base import TokenizerBase, TokenizerRegistry
 
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
 
 class TestTokenizer(TokenizerBase):
-
     @classmethod
     def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
         return TestTokenizer()
@@ -63,11 +61,11 @@ class TestTokenizer(TokenizerBase):
 
     def __call__(
         self,
-        text: Union[str, list[str], list[int]],
-        text_pair: Optional[str] = None,
+        text: str | list[str] | list[int],
+        text_pair: str | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ):
         raise NotImplementedError()
 
@@ -81,27 +79,25 @@ class TestTokenizer(TokenizerBase):
         self,
         text: str,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ) -> list[int]:
         raise NotImplementedError()
 
-    def encode(self,
-               text: str,
-               add_special_tokens: Optional[bool] = None) -> list[int]:
+    def encode(self, text: str, add_special_tokens: bool | None = None) -> list[int]:
         raise NotImplementedError()
 
-    def apply_chat_template(self,
-                            messages: list["ChatCompletionMessageParam"],
-                            tools: Optional[list[dict[str, Any]]] = None,
-                            **kwargs) -> list[int]:
+    def apply_chat_template(
+        self,
+        messages: list["ChatCompletionMessageParam"],
+        tools: list[dict[str, Any]] | None = None,
+        **kwargs,
+    ) -> list[int]:
         raise NotImplementedError()
 
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         raise NotImplementedError()
 
-    def decode(self,
-               ids: Union[list[int], int],
-               skip_special_tokens: bool = True) -> str:
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
         raise NotImplementedError()
 
     def convert_ids_to_tokens(
@@ -113,9 +109,9 @@ class TestTokenizer(TokenizerBase):
 
 
 def test_customized_tokenizer():
-    TokenizerRegistry.register("test_tokenizer",
-                               "tests.tokenization.test_tokenizer_registry",
-                               "TestTokenizer")
+    TokenizerRegistry.register(
+        "test_tokenizer", "tests.tokenization.test_tokenizer_registry", "TestTokenizer"
+    )
 
     tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
     assert isinstance(tokenizer, TestTokenizer)
diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py
index 510b54790cd90378b4635c06e7ecbb5346152702..ff9cdeeb737521688f6f1235683706e5e33c4c3b 100644
--- a/tests/tool_use/conftest.py
+++ b/tests/tool_use/conftest.py
@@ -13,13 +13,13 @@ from .utils import ARGS, CONFIGS, ServerConfig
 
 # select models to test based on command line arguments
 def pytest_addoption(parser):
-    parser.addoption("--models",
-                     nargs="+",
-                     help="Specify one or more models to test")
-    parser.addoption("--extended",
-                     action="store_true",
-                     default=False,
-                     help="invoke extended tests requiring large GPUs")
+    parser.addoption("--models", nargs="+", help="Specify one or more models to test")
+    parser.addoption(
+        "--extended",
+        action="store_true",
+        default=False,
+        help="invoke extended tests requiring large GPUs",
+    )
 
 
 # for each server config, download the model and return the config
@@ -29,8 +29,10 @@ def server_config(request):
     models = request.config.getoption("--models")
 
     config_keys_to_test = [
-        key for key in CONFIGS if (models is None or key in models) and (
-            extended or not CONFIGS[key].get("extended", False))
+        key
+        for key in CONFIGS
+        if (models is None or key in models)
+        and (extended or not CONFIGS[key].get("extended", False))
     ]
 
     config_key = request.param
@@ -40,8 +42,9 @@ def server_config(request):
     config = CONFIGS[config_key]
 
     if current_platform.is_rocm() and not config.get("supports_rocm", True):
-        pytest.skip("The {} model can't be tested on the ROCm platform".format(
-            config["model"]))
+        pytest.skip(
+            "The {} model can't be tested on the ROCm platform".format(config["model"])
+        )
 
     # download model and tokenizer using transformers
     snapshot_download(config["model"])
@@ -53,8 +56,9 @@ def server_config(request):
 def server(request, server_config: ServerConfig):
     model = server_config["model"]
     args_for_model = server_config["arguments"]
-    with RemoteOpenAIServer(model, ARGS + args_for_model,
-                            max_wait_seconds=480) as server:
+    with RemoteOpenAIServer(
+        model, ARGS + args_for_model, max_wait_seconds=480
+    ) as server:
         yield server
 
 
diff --git a/tests/v1/entrypoints/openai/responses/__init__.py b/tests/tool_use/mistral/__init__.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/__init__.py
rename to tests/tool_use/mistral/__init__.py
diff --git a/tests/mistral_tool_use/conftest.py b/tests/tool_use/mistral/conftest.py
similarity index 76%
rename from tests/mistral_tool_use/conftest.py
rename to tests/tool_use/mistral/conftest.py
index e89e60c5a02ec859456a7e184bb8a37eafd6bec8..9b0a6eb27fca7f510fb4d8848213506248f777bf 100644
--- a/tests/mistral_tool_use/conftest.py
+++ b/tests/tool_use/mistral/conftest.py
@@ -12,13 +12,14 @@ from .utils import ARGS, CONFIGS, ServerConfig
 
 
 # for each server config, download the model and return the config
-@pytest.fixture(scope="session", params=CONFIGS.keys())
+@pytest.fixture(scope="package", params=CONFIGS.keys())
 def server_config(request):
     config = CONFIGS[request.param]
 
     if current_platform.is_rocm() and not config.get("supports_rocm", True):
-        pytest.skip("The {} model can't be tested on the ROCm platform".format(
-            config["model"]))
+        pytest.skip(
+            "The {} model can't be tested on the ROCm platform".format(config["model"])
+        )
 
     # download model and tokenizer using transformers
     snapshot_download(config["model"])
@@ -26,12 +27,13 @@ def server_config(request):
 
 
 # run this for each server config
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="package")
 def server(request, server_config: ServerConfig):
     model = server_config["model"]
     args_for_model = server_config["arguments"]
-    with RemoteOpenAIServer(model, ARGS + args_for_model,
-                            max_wait_seconds=480) as server:
+    with RemoteOpenAIServer(
+        model, ARGS + args_for_model, max_wait_seconds=480
+    ) as server:
         yield server
 
 
diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/tool_use/mistral/test_mistral_tool_calls.py
similarity index 88%
rename from tests/mistral_tool_use/test_mistral_tool_calls.py
rename to tests/tool_use/mistral/test_mistral_tool_calls.py
index 9bf6863f3f2b7a3ef5001ebf981eab25ca2b4299..3c4a543abe412ca421ebb3c5251f26abea0b2700 100644
--- a/tests/mistral_tool_use/test_mistral_tool_calls.py
+++ b/tests/tool_use/mistral/test_mistral_tool_calls.py
@@ -19,12 +19,12 @@ async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
         model=model_name,
         tools=[WEATHER_TOOL],
         tool_choice=WEATHER_TOOL,
-        logprobs=False)
+        logprobs=False,
+    )
 
     choice = chat_completion.choices[0]
 
     assert choice.finish_reason != "tool_calls"  # "stop" or "length"
     assert choice.message.role == "assistant"
-    assert choice.message.tool_calls is None \
-           or len(choice.message.tool_calls) == 1
+    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 1
     assert len(choice.message.tool_calls[0].id) == 9  # length of 9 for mistral
diff --git a/tests/mistral_tool_use/utils.py b/tests/tool_use/mistral/utils.py
similarity index 56%
rename from tests/mistral_tool_use/utils.py
rename to tests/tool_use/mistral/utils.py
index 7a026cd9bb619d43fae63b0b97af2bb45a835ae7..4d772ba63793dd46b00eeb613c95080ffd6666db 100644
--- a/tests/mistral_tool_use/utils.py
+++ b/tests/tool_use/mistral/utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from typing_extensions import TypedDict
 
@@ -9,26 +8,25 @@ from typing_extensions import TypedDict
 class ServerConfig(TypedDict, total=False):
     model: str
     arguments: list[str]
-    system_prompt: Optional[str]
-    supports_parallel: Optional[bool]
-    supports_rocm: Optional[bool]
+    system_prompt: str | None
+    supports_parallel: bool | None
+    supports_rocm: bool | None
 
 
 ARGS: list[str] = ["--max-model-len", "1024"]
 
 CONFIGS: dict[str, ServerConfig] = {
     "mistral": {
-        "model":
-        "mistralai/Mistral-7B-Instruct-v0.3",
+        "model": "mistralai/Mistral-7B-Instruct-v0.3",
         "arguments": [
-            "--tokenizer-mode", "mistral",
-            "--ignore-patterns=\"consolidated.safetensors\""
+            "--tokenizer-mode",
+            "mistral",
+            '--ignore-patterns="consolidated.safetensors"',
         ],
-        "system_prompt":
-        "You are a helpful assistant with access to tools. If a tool"
+        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
         " that you have would be helpful to answer a user query, "
         "call the tool. Otherwise, answer the user's query directly "
         "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
-        "to the user's question - just respond to it normally."
+        "to the user's question - just respond to it normally.",
     },
 }
diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py
index a30c58b09fe8f61c1beb29b8d413a4f51a5537e1..50cd9e4279b2a5f7ff4045dbd7a0595701a2ff1d 100644
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
@@ -8,68 +8,56 @@ from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 
 def test_chat_completion_request_with_no_tools():
     # tools key is not present
-    request = ChatCompletionRequest.model_validate({
-        'messages': [{
-            'role': 'user',
-            'content': 'Hello'
-        }],
-        'model':
-        'facebook/opt-125m',
-    })
-    assert request.tool_choice == 'none'
+    request = ChatCompletionRequest.model_validate(
+        {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "model": "facebook/opt-125m",
+        }
+    )
+    assert request.tool_choice == "none"
 
     # tools key is None
-    request = ChatCompletionRequest.model_validate({
-        'messages': [{
-            'role': 'user',
-            'content': 'Hello'
-        }],
-        'model':
-        'facebook/opt-125m',
-        'tools':
-        None
-    })
-    assert request.tool_choice == 'none'
+    request = ChatCompletionRequest.model_validate(
+        {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "model": "facebook/opt-125m",
+            "tools": None,
+        }
+    )
+    assert request.tool_choice == "none"
 
     # tools key present but empty
-    request = ChatCompletionRequest.model_validate({
-        'messages': [{
-            'role': 'user',
-            'content': 'Hello'
-        }],
-        'model':
-        'facebook/opt-125m',
-        'tools': []
-    })
-    assert request.tool_choice == 'none'
+    request = ChatCompletionRequest.model_validate(
+        {
+            "messages": [{"role": "user", "content": "Hello"}],
+            "model": "facebook/opt-125m",
+            "tools": [],
+        }
+    )
+    assert request.tool_choice == "none"
 
 
-@pytest.mark.parametrize('tool_choice', ['auto', 'required'])
+@pytest.mark.parametrize("tool_choice", ["auto", "required"])
 def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
-    with pytest.raises(ValueError,
-                       match="When using `tool_choice`, `tools` must be set."):
-        ChatCompletionRequest.model_validate({
-            'messages': [{
-                'role': 'user',
-                'content': 'Hello'
-            }],
-            'model':
-            'facebook/opt-125m',
-            'tool_choice':
-            tool_choice
-        })
-
-    with pytest.raises(ValueError,
-                       match="When using `tool_choice`, `tools` must be set."):
-        ChatCompletionRequest.model_validate({
-            'messages': [{
-                'role': 'user',
-                'content': 'Hello'
-            }],
-            'model':
-            'facebook/opt-125m',
-            'tool_choice':
-            tool_choice,
-            'tools':
-            None
-        })
+    with pytest.raises(
+        ValueError, match="When using `tool_choice`, `tools` must be set."
+    ):
+        ChatCompletionRequest.model_validate(
+            {
+                "messages": [{"role": "user", "content": "Hello"}],
+                "model": "facebook/opt-125m",
+                "tool_choice": tool_choice,
+            }
+        )
+
+    with pytest.raises(
+        ValueError, match="When using `tool_choice`, `tools` must be set."
+    ):
+        ChatCompletionRequest.model_validate(
+            {
+                "messages": [{"role": "user", "content": "Hello"}],
+                "model": "facebook/opt-125m",
+                "tool_choice": tool_choice,
+                "tools": None,
+            }
+        )
diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py
index 8c01c86e29f2f423ad5987214a1123ea2c611014..425d3879985e791460a8567ce00386beccdff9ac 100644
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -4,16 +4,21 @@
 import openai
 import pytest
 
-from .utils import (MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig,
-                    ensure_system_prompt)
+from .utils import (
+    MESSAGES_WITHOUT_TOOLS,
+    WEATHER_TOOL,
+    ServerConfig,
+    ensure_system_prompt,
+)
 
 
 # test: make sure chat completions without tools provided work even when tools
 # are enabled. This makes sure tool call chat templates work, AND that the tool
 # parser stream processing doesn't change the output of the model.
 @pytest.mark.asyncio
-async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
-                                             server_config: ServerConfig):
+async def test_chat_completion_without_tools(
+    client: openai.AsyncOpenAI, server_config: ServerConfig
+):
     models = await client.models.list()
     model_name: str = models.data[0].id
     chat_completion = await client.chat.completions.create(
@@ -21,7 +26,8 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
         temperature=0,
         max_completion_tokens=150,
         model=model_name,
-        logprobs=False)
+        logprobs=False,
+    )
     choice = chat_completion.choices[0]
     stop_reason = chat_completion.choices[0].finish_reason
     output_text = chat_completion.choices[0].message.content
@@ -32,8 +38,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
     assert stop_reason != "tool_calls"
 
     # check to make sure no tool calls were returned
-    assert (choice.message.tool_calls is None
-            or len(choice.message.tool_calls) == 0)
+    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0
 
     # make the same request, streaming
     stream = await client.chat.completions.create(
@@ -55,7 +60,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
         # make sure the role is assistant
         if delta.role:
             assert not role_sent
-            assert delta.role == 'assistant'
+            assert delta.role == "assistant"
             role_sent = True
 
         if delta.content:
@@ -80,8 +85,9 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
 # tools, to make sure we can still get normal chat completion responses
 # and that they won't be parsed as tools
 @pytest.mark.asyncio
-async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
-                                          server_config: ServerConfig):
+async def test_chat_completion_with_tools(
+    client: openai.AsyncOpenAI, server_config: ServerConfig
+):
     models = await client.models.list()
     model_name: str = models.data[0].id
     chat_completion = await client.chat.completions.create(
@@ -90,19 +96,19 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
         max_completion_tokens=150,
         model=model_name,
         tools=[WEATHER_TOOL],
-        logprobs=False)
+        logprobs=False,
+    )
     choice = chat_completion.choices[0]
     stop_reason = chat_completion.choices[0].finish_reason
     output_text = chat_completion.choices[0].message.content
 
     # check to make sure we got text
     assert output_text is not None
-    assert stop_reason != 'tool_calls'
+    assert stop_reason != "tool_calls"
     assert len(output_text) > 0
 
     # check to make sure no tool calls were returned
-    assert (choice.message.tool_calls is None
-            or len(choice.message.tool_calls) == 0)
+    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0
 
     # make the same request, streaming
     stream = await client.chat.completions.create(
@@ -125,7 +131,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
 
         # make sure the role is assistant
         if delta.role:
-            assert delta.role == 'assistant'
+            assert delta.role == "assistant"
             role_sent = True
 
         if delta.content:
@@ -142,6 +148,6 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
     assert role_sent
     assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == stop_reason
-    assert chunk.choices[0].finish_reason != 'tool_calls'
+    assert chunk.choices[0].finish_reason != "tool_calls"
     assert len(chunks)
     assert "".join(chunks) == output_text
diff --git a/tests/tool_use/test_deepseekv31_tool_parser.py b/tests/tool_use/test_deepseekv31_tool_parser.py
index 5f6b266d3aa1978700901cfae9c80cf3c4b809b5..db5168071fbcee7e189f10f9a81c4872a91d8c66 100644
--- a/tests/tool_use/test_deepseekv31_tool_parser.py
+++ b/tests/tool_use/test_deepseekv31_tool_parser.py
@@ -3,7 +3,9 @@
 
 import pytest
 
-from vllm.entrypoints.openai.tool_parsers import DeepSeekV31ToolParser
+from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
+    DeepSeekV31ToolParser,
+)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MODEL = "deepseek-ai/DeepSeek-V3.1"
@@ -21,23 +23,28 @@ def parser(deepseekv31_tokenizer):
 
 def test_extract_tool_calls_with_tool(parser):
     model_output = (
-        "normal text" + "<｜tool▁calls▁begin｜>" +
-        "<｜tool▁call▁begin｜>foo<｜tool▁sep｜>{\"x\":1}<｜tool▁call▁end｜>" +
-        "<｜tool▁calls▁end｜>")
+        "normal text"
+        + "<｜tool▁calls▁begin｜>"
+        + '<｜tool▁call▁begin｜>foo<｜tool▁sep｜>{"x":1}<｜tool▁call▁end｜>'
+        + "<｜tool▁calls▁end｜>"
+    )
     result = parser.extract_tool_calls(model_output, None)
     assert result.tools_called
     assert len(result.tool_calls) == 1
     assert result.tool_calls[0].function.name == "foo"
-    assert result.tool_calls[0].function.arguments == "{\"x\":1}"
+    assert result.tool_calls[0].function.arguments == '{"x":1}'
     assert result.content == "normal text"
 
 
 def test_extract_tool_calls_with_multiple_tools(parser):
     model_output = (
-        "some prefix text" + "<｜tool▁calls▁begin｜>" +
-        "<｜tool▁call▁begin｜>foo<｜tool▁sep｜>{\"x\":1}<｜tool▁call▁end｜>" +
-        "<｜tool▁call▁begin｜>bar<｜tool▁sep｜>{\"y\":2}<｜tool▁call▁end｜>" +
-        "<｜tool▁calls▁end｜>" + " some suffix text")
+        "some prefix text"
+        + "<｜tool▁calls▁begin｜>"
+        + '<｜tool▁call▁begin｜>foo<｜tool▁sep｜>{"x":1}<｜tool▁call▁end｜>'
+        + '<｜tool▁call▁begin｜>bar<｜tool▁sep｜>{"y":2}<｜tool▁call▁end｜>'
+        + "<｜tool▁calls▁end｜>"
+        + " some suffix text"
+    )
 
     result = parser.extract_tool_calls(model_output, None)
 
@@ -45,10 +52,10 @@ def test_extract_tool_calls_with_multiple_tools(parser):
     assert len(result.tool_calls) == 2
 
     assert result.tool_calls[0].function.name == "foo"
-    assert result.tool_calls[0].function.arguments == "{\"x\":1}"
+    assert result.tool_calls[0].function.arguments == '{"x":1}'
 
     assert result.tool_calls[1].function.name == "bar"
-    assert result.tool_calls[1].function.arguments == "{\"y\":2}"
+    assert result.tool_calls[1].function.arguments == '{"y":2}'
 
     # prefix is content
     assert result.content == "some prefix text"
diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..36a07bb561d9e70fb83a04a69fc8f3b6681f831a
--- /dev/null
+++ b/tests/tool_use/test_ernie45_moe_tool_parser.py
@@ -0,0 +1,359 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+import json
+from collections.abc import Generator
+
+import pytest
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
+from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+
+# Use a common model that is likely to be available
+MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"
+
+
+@pytest.fixture(scope="module")
+def ernie45_tokenizer():
+    return get_tokenizer(tokenizer_name=MODEL, trust_remote_code=True)
+
+
+@pytest.fixture
+def ernie45_tool_parser(ernie45_tokenizer):
+    return Ernie45ToolParser(ernie45_tokenizer)
+
+
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
+    assert len(actual_tool_calls) == len(expected_tool_calls)
+
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
+        assert isinstance(actual_tool_call.id, str)
+        assert len(actual_tool_call.id) > 0
+
+        assert actual_tool_call.type == "function"
+        assert actual_tool_call.function.name == expected_tool_call.function.name
+        # Compare arguments as JSON objects to handle formatting differences
+        actual_args = json.loads(actual_tool_call.function.arguments)
+        expected_args = json.loads(expected_tool_call.function.arguments)
+        assert actual_args == expected_args
+
+
+def test_extract_tool_calls_no_tools(ernie45_tool_parser):
+    model_output = "This is a test"
+    extracted_tool_calls = ernie45_tool_parser.extract_tool_calls(
+        model_output, request=None
+    )  # type: ignore[arg-type]
+    assert not extracted_tool_calls.tools_called
+    assert extracted_tool_calls.tool_calls == []
+    assert extracted_tool_calls.content == model_output
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_call",
+        "multiple_tool_calls",
+        "tool_call_with_content_before",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
+</tool_call>
+""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_temperature",
+                        arguments=json.dumps(
+                            {
+                                "location": "Beijing",
+                            }
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
+</tool_call>
+<tool_call>
+{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
+</tool_call>
+""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_temperature",
+                        arguments=json.dumps(
+                            {
+                                "location": "Beijing",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_temperature_unit",
+                        arguments=json.dumps(
+                            {
+                                "location": "Guangzhou",
+                                "unit": "c",
+                            }
+                        ),
+                    )
+                ),
+            ],
+            None,
+        ),
+        (
+            """I need to call two tools to handle these two issues separately.
+</think>
+
+<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
+</tool_call>
+<tool_call>
+{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
+</tool_call>
+""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_temperature",
+                        arguments=json.dumps(
+                            {
+                                "location": "Beijing",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_temperature_unit",
+                        arguments=json.dumps(
+                            {
+                                "location": "Guangzhou",
+                                "unit": "c",
+                            }
+                        ),
+                    )
+                ),
+            ],
+            "I need to call two tools to handle these two issues separately.\n</think>",
+        ),
+    ],
+)
+def test_extract_tool_calls(
+    ernie45_tool_parser, model_output, expected_tool_calls, expected_content
+):
+    extracted_tool_calls = ernie45_tool_parser.extract_tool_calls(
+        model_output, request=None
+    )  # type: ignore[arg-type]
+    assert extracted_tool_calls.tools_called
+
+    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
+
+    assert extracted_tool_calls.content == expected_content
+
+
+def stream_delta_message_generator(
+    ernie45_tool_parser: Ernie45ToolParser,
+    ernie45_tokenizer: AnyTokenizer,
+    model_output: str,
+    request: ChatCompletionRequest | None = None,
+) -> Generator[DeltaMessage, None, None]:
+    all_token_ids = ernie45_tokenizer.encode(model_output, add_special_tokens=False)
+
+    previous_text = ""
+    previous_tokens = None
+    prefix_offset = 0
+    read_offset = 0
+    for i, delta_token in enumerate(all_token_ids):
+        delta_token_ids = [delta_token]
+        previous_token_ids = all_token_ids[:i]
+        current_token_ids = all_token_ids[: i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
+            detokenize_incrementally(
+                tokenizer=ernie45_tokenizer,
+                all_input_ids=current_token_ids,
+                prev_tokens=previous_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=False,
+                spaces_between_special_tokens=True,
+            )
+        )
+
+        current_text = previous_text + delta_text
+
+        delta_message = ernie45_tool_parser.extract_tool_calls_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+            request=request,
+        )
+        if delta_message:
+            yield delta_message
+
+        previous_text = current_text
+        previous_tokens = (
+            previous_tokens + new_tokens if previous_tokens else new_tokens
+        )
+        prefix_offset = new_prefix_offset
+        read_offset = new_read_offset
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_call",
+        "multiple_tool_calls",
+        "tool_call_with_content_before",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
+</tool_call>
+""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_temperature",
+                        arguments=json.dumps(
+                            {
+                                "location": "Beijing",
+                            }
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
+</tool_call>
+<tool_call>
+{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
+</tool_call>
+""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_temperature",
+                        arguments=json.dumps(
+                            {
+                                "location": "Beijing",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_temperature_unit",
+                        arguments=json.dumps(
+                            {
+                                "location": "Guangzhou",
+                                "unit": "c",
+                            }
+                        ),
+                    )
+                ),
+            ],
+            None,
+        ),
+        (
+            """I need to call two tools to handle these two issues separately.
+</think>
+
+<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
+</tool_call>
+<tool_call>
+{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
+</tool_call>
+""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_temperature",
+                        arguments=json.dumps(
+                            {
+                                "location": "Beijing",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_temperature_unit",
+                        arguments=json.dumps(
+                            {
+                                "location": "Guangzhou",
+                                "unit": "c",
+                            }
+                        ),
+                    )
+                ),
+            ],
+            "I need to call two tools to handle these two issues separately.\n</think>",
+        ),
+    ],
+)
+def test_extract_tool_calls_streaming_incremental(
+    ernie45_tool_parser,
+    ernie45_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    """Verify the Ernie45 Parser streaming behavior by verifying each chunk is as expected."""  # noqa: E501
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=[])
+
+    tool_calls_dict = {}
+    for delta_message in stream_delta_message_generator(
+        ernie45_tool_parser, ernie45_tokenizer, model_output, request
+    ):
+        if (
+            delta_message.role is None
+            and delta_message.content is None
+            and delta_message.reasoning is None
+            and len(delta_message.tool_calls) == 0
+        ):
+            continue
+        tool_calls = delta_message.tool_calls
+        for tool_call_chunk in tool_calls:
+            index = tool_call_chunk.index
+            if index not in tool_calls_dict:
+                if tool_call_chunk.function.arguments is None:
+                    tool_call_chunk.function.arguments = ""
+                tool_calls_dict[index] = tool_call_chunk
+            else:
+                tool_calls_dict[
+                    index
+                ].function.arguments += tool_call_chunk.function.arguments
+    actual_tool_calls = list(tool_calls_dict.values())
+
+    assert len(actual_tool_calls) > 0
+    # check tool call format
+    assert_tool_calls(actual_tool_calls, expected_tool_calls)
diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py
index 91913c933184ec926d3cc64036ed3dd9f135fbf4..f545f52c02dcb0364d59eb9cb9fb488f7e533dd5 100644
--- a/tests/tool_use/test_glm4_moe_tool_parser.py
+++ b/tests/tool_use/test_glm4_moe_tool_parser.py
@@ -7,9 +7,13 @@ import json
 import pytest
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser
+from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
+    Glm4MoeModelToolParser,
+)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 pytest.skip("skip glm4_moe parser test", allow_module_level=True)
 # Use a common model that is likely to be available
 MODEL = "zai-org/GLM-4.5"
@@ -25,12 +29,14 @@ def glm4_moe_tool_parser(glm4_moe_tokenizer):
     return Glm4MoeModelToolParser(glm4_moe_tokenizer)
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         assert isinstance(actual_tool_call.id, str)
         assert len(actual_tool_call.id) > 0
 
@@ -45,7 +51,8 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall],
 def test_extract_tool_calls_no_tools(glm4_moe_tool_parser):
     model_output = "This is a test"
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
     assert extracted_tool_calls.content == model_output
@@ -71,14 +78,18 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser):
     <arg_value>fahrenheit</arg_value>
     </tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             None,
         ),
@@ -100,22 +111,30 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser):
     <arg_value>fahrenheit</arg_value>
     </tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                )),
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Orlando",
-                        "state": "FL",
-                        "unit": "fahrenheit",
-                    }),
-                )),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Orlando",
+                                "state": "FL",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
             ],
             None,
         ),
@@ -129,14 +148,18 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser):
     <arg_value>celsius</arg_value>
     </tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Seattle",
-                        "state": "WA",
-                        "unit": "celsius",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Seattle",
+                                "state": "WA",
+                                "unit": "celsius",
+                            }
+                        ),
+                    )
+                )
             ],
             "I'll help you check the weather.",
         ),
@@ -150,37 +173,51 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser):
     <arg_value>celsius</arg_value>
     </tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "New York",
-                        "state": "NY",
-                        "unit": "celsius",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "New York",
+                                "state": "NY",
+                                "unit": "celsius",
+                            }
+                        ),
+                    )
+                )
             ],
             None,
         ),
-        ("""I will help you get the weather.<tool_call>get_weather
+        (
+            """I will help you get the weather.<tool_call>get_weather
     <arg_key>city</arg_key>
     <arg_value>Beijing</arg_value>
     <arg_key>date</arg_key>
     <arg_value>2025-08-01</arg_value>
-    </tool_call>""", [
-            ToolCall(function=FunctionCall(
-                name="get_weather",
-                arguments=json.dumps({
-                    "city": "Beijing",
-                    "date": "2025-08-01",
-                }),
-            ))
-        ], "I will help you get the weather."),
+    </tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Beijing",
+                                "date": "2025-08-01",
+                            }
+                        ),
+                    )
+                )
+            ],
+            "I will help you get the weather.",
+        ),
     ],
 )
-def test_extract_tool_calls(glm4_moe_tool_parser, model_output,
-                            expected_tool_calls, expected_content):
+def test_extract_tool_calls(
+    glm4_moe_tool_parser, model_output, expected_tool_calls, expected_content
+):
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
 
@@ -200,7 +237,8 @@ I will help you get the weather.
 </tool_call>"""
 
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 1
@@ -222,7 +260,8 @@ def test_extract_tool_calls_malformed_xml(glm4_moe_tool_parser):
 </tool_call>"""
 
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     # Should handle malformed XML gracefully
     # The parser should either extract what it can or return no tool calls
@@ -237,12 +276,12 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser):
 </tool_call>"""
 
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 1
-    assert extracted_tool_calls.tool_calls[
-        0].function.name == "get_current_time"
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_time"
     # Empty arguments should result in empty JSON object
     assert extracted_tool_calls.tool_calls[0].function.arguments == "{}"
 
@@ -268,7 +307,8 @@ meaningwhile, I will also check the weather in Shanghai.
 </tool_call>"""
 
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 2
@@ -319,8 +359,7 @@ def test_streaming_basic_functionality(glm4_moe_tool_parser):
 
     # The result behavior depends on the streaming state
     # This test mainly ensures no exceptions are thrown
-    assert result is None or hasattr(result, 'tool_calls') or hasattr(
-        result, 'content')
+    assert result is None or hasattr(result, "tool_calls") or hasattr(result, "content")
 
 
 def test_streaming_no_tool_calls(glm4_moe_tool_parser):
@@ -339,7 +378,7 @@ def test_streaming_no_tool_calls(glm4_moe_tool_parser):
 
     # Should return the delta text as content
     assert result is not None
-    assert hasattr(result, 'content')
+    assert hasattr(result, "content")
     assert result.content == " without any tool calls."
 
 
@@ -365,7 +404,7 @@ def test_streaming_with_content_before_tool_calls(glm4_moe_tool_parser):
 
     # Should return content when no tool call tokens are detected
     assert result is not None
-    assert hasattr(result, 'content')
+    assert hasattr(result, "content")
     assert result.content == "get the weather.<tool_call>"
 
 
@@ -381,7 +420,8 @@ def test_extract_tool_calls_special_characters(glm4_moe_tool_parser):
 </tool_call>"""
 
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 1
@@ -402,7 +442,8 @@ def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser):
 <arg_value>2025-08-01</arg_value>"""
 
     extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     # Incomplete tool calls should not be extracted
     assert not extracted_tool_calls.tools_called
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index 57ace1fa22ac9d40e3190fcaa1d726bf22d67792..9eb73b80fa9b4856e38325a34766667db38e1be7 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -3,18 +3,18 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import partial_json_parser
 import pytest
 from partial_json_parser.core.options import Allow
 
-from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall,
-                                              ToolCall)
-from vllm.entrypoints.openai.tool_parsers import JambaToolParser
+from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
+from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 MODEL = "ai21labs/Jamba-tiny-dev"
 
 
@@ -28,12 +28,14 @@ def jamba_tool_parser(jamba_tokenizer):
     return JambaToolParser(jamba_tokenizer)
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         assert isinstance(actual_tool_call.id, str)
         assert len(actual_tool_call.id) > 16
 
@@ -42,10 +44,9 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall],
 
 
 def stream_delta_message_generator(
-        jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer,
-        model_output: str) -> Generator[DeltaMessage, None, None]:
-    all_token_ids = jamba_tokenizer.encode(model_output,
-                                           add_special_tokens=False)
+    jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer, model_output: str
+) -> Generator[DeltaMessage, None, None]:
+    all_token_ids = jamba_tokenizer.encode(model_output, add_special_tokens=False)
 
     previous_text = ""
     previous_tokens = None
@@ -54,18 +55,19 @@ def stream_delta_message_generator(
     for i, delta_token in enumerate(all_token_ids):
         delta_token_ids = [delta_token]
         previous_token_ids = all_token_ids[:i]
-        current_token_ids = all_token_ids[:i + 1]
-
-        (new_tokens, delta_text, new_prefix_offset,
-         new_read_offset) = detokenize_incrementally(
-             tokenizer=jamba_tokenizer,
-             all_input_ids=current_token_ids,
-             prev_tokens=previous_tokens,
-             prefix_offset=prefix_offset,
-             read_offset=read_offset,
-             skip_special_tokens=False,
-             spaces_between_special_tokens=True,
-         )
+        current_token_ids = all_token_ids[: i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
+            detokenize_incrementally(
+                tokenizer=jamba_tokenizer,
+                all_input_ids=current_token_ids,
+                prev_tokens=previous_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=False,
+                spaces_between_special_tokens=True,
+            )
+        )
 
         current_text = previous_text + delta_text
 
@@ -82,8 +84,9 @@ def stream_delta_message_generator(
             yield delta_message
 
         previous_text = current_text
-        previous_tokens = previous_tokens + new_tokens if previous_tokens\
-            else new_tokens
+        previous_tokens = (
+            previous_tokens + new_tokens if previous_tokens else new_tokens
+        )
         prefix_offset = new_prefix_offset
         read_offset = new_read_offset
 
@@ -91,7 +94,8 @@ def stream_delta_message_generator(
 def test_extract_tool_calls_no_tools(jamba_tool_parser):
     model_output = "This is a test"
     extracted_tool_calls = jamba_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
     assert extracted_tool_calls.content == model_output
@@ -106,54 +110,63 @@ def test_extract_tool_calls_no_tools(jamba_tool_parser):
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
         (
-            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            """ <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Dallas",
-                                                       "state": "TX",
-                                                       "unit": "fahrenheit"
-                                                   })))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
             ],
-            None),
+            None,
+        ),
         (
-            ''' Sure! let me call the tool for you.<tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            """ Sure! let me call the tool for you.<tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Dallas",
-                                                       "state": "TX",
-                                                       "unit": "fahrenheit"
-                                                   })))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
             ],
-            " Sure! let me call the tool for you."),
+            " Sure! let me call the tool for you.",
+        ),
         (
-            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n    {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            """ <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n    {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Dallas",
-                                                       "state": "TX",
-                                                       "unit": "fahrenheit"
-                                                   }))),
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Orlando",
-                                                       "state": "FL",
-                                                       "unit": "fahrenheit"
-                                                   })))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
             ],
-            None)
+            None,
+        ),
     ],
 )
-def test_extract_tool_calls(jamba_tool_parser, model_output,
-                            expected_tool_calls, expected_content):
+def test_extract_tool_calls(
+    jamba_tool_parser, model_output, expected_tool_calls, expected_content
+):
     extracted_tool_calls = jamba_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -170,63 +183,75 @@ def test_extract_tool_calls(jamba_tool_parser, model_output,
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
-        ('''This is a test''', [], '''This is a test'''),
+        ("""This is a test""", [], """This is a test"""),
         (
-            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            """ <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Dallas",
-                                                       "state": "TX",
-                                                       "unit": "fahrenheit"
-                                                   })))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
             ],
-            " "),
+            " ",
+        ),
         (
-            ''' Sure! let me call the tool for you.<tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            """ Sure! let me call the tool for you.<tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Dallas",
-                                                       "state": "TX",
-                                                       "unit": "fahrenheit"
-                                                   })))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
             ],
-            " Sure! let me call the tool for you."),
+            " Sure! let me call the tool for you.",
+        ),
         (
-            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n    {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            """ <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n    {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Dallas",
-                                                       "state": "TX",
-                                                       "unit": "fahrenheit"
-                                                   }))),
-                ToolCall(function=FunctionCall(name="get_current_weather",
-                                               arguments=json.dumps(
-                                                   {
-                                                       "city": "Orlando",
-                                                       "state": "FL",
-                                                       "unit": "fahrenheit"
-                                                   })))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
             ],
-            " ")
+            " ",
+        ),
     ],
 )
-def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer,
-                                      model_output, expected_tool_calls,
-                                      expected_content):
-    other_content: str = ''
+def test_extract_tool_calls_streaming(
+    jamba_tool_parser,
+    jamba_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    other_content: str = ""
     function_names: list[str] = []
     function_args_strs: list[str] = []
     tool_call_idx: int = -1
-    tool_call_ids: list[Optional[str]] = []
+    tool_call_ids: list[str | None] = []
 
     for delta_message in stream_delta_message_generator(
-            jamba_tool_parser, jamba_tokenizer, model_output):
+        jamba_tool_parser, jamba_tokenizer, model_output
+    ):
         # role should never be streamed from tool parser
         assert not delta_message.role
 
@@ -262,18 +287,22 @@ def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer,
                     # make sure they're a string and then add them to the list
                     assert isinstance(tool_call.function.arguments, str)
 
-                    function_args_strs[
-                        tool_call.index] += tool_call.function.arguments
+                    function_args_strs[tool_call.index] += tool_call.function.arguments
 
     assert other_content == expected_content
 
     actual_tool_calls = [
-        ToolCall(id=tool_call_id,
-                 function=FunctionCall(
-                     name=function_name,
-                     arguments=partial_json_parser.ensure_json(
-                         function_args_str, Allow.OBJ | Allow.STR)))
+        ToolCall(
+            id=tool_call_id,
+            function=FunctionCall(
+                name=function_name,
+                arguments=partial_json_parser.ensure_json(
+                    function_args_str, Allow.OBJ | Allow.STR
+                ),
+            ),
+        )
         for tool_call_id, function_name, function_args_str in zip(
-            tool_call_ids, function_names, function_args_strs)
+            tool_call_ids, function_names, function_args_strs
+        )
     ]
     assert_tool_calls(actual_tool_calls, expected_tool_calls)
diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py
index bd030632f167b09f016ef1b14283743994edfe80..3a48b5206141dacf61ef6ae8ecad14ff521b9600 100644
--- a/tests/tool_use/test_kimi_k2_tool_parser.py
+++ b/tests/tool_use/test_kimi_k2_tool_parser.py
@@ -7,9 +7,11 @@ import json
 import pytest
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import KimiK2ToolParser
+from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 # Use a common model that is likely to be available
 MODEL = "moonshotai/Kimi-K2-Instruct"
 
@@ -24,27 +26,31 @@ def kimi_k2_tool_parser(kimi_k2_tokenizer):
     return KimiK2ToolParser(kimi_k2_tokenizer)
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
-
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         assert actual_tool_call.type == "function"
         assert actual_tool_call.function == expected_tool_call.function
 
-        # assert tool call id format
-        assert actual_tool_call.id.startswith("functions.")
-        assert actual_tool_call.id.split(':')[-1].isdigit()
-        assert actual_tool_call.id.split('.')[1].split(
-            ':')[0] == expected_tool_call.function.name
+        # assert tool call id format: should contain function name and numeric index
+        # Format can be either "functions.func_name:0" or "func_name:0"
+        assert actual_tool_call.id.split(":")[-1].isdigit()
+        assert (
+            actual_tool_call.id.split(":")[0].split(".")[-1]
+            == expected_tool_call.function.name
+        )
 
 
 def test_extract_tool_calls_no_tools(kimi_k2_tool_parser):
     model_output = "This is a test"
     extracted_tool_calls = kimi_k2_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
     assert extracted_tool_calls.content == model_output
@@ -54,6 +60,11 @@ def test_extract_tool_calls_no_tools(kimi_k2_tool_parser):
     ids=[
         "tool_call_with_content_before",
         "multi_tool_call_with_content_before",
+        "concatenated_tool_calls_bug_fix",
+        "three_concatenated_tool_calls",
+        "mixed_spacing_tool_calls",
+        "angle_brackets_in_json",
+        "newlines_in_json",
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
@@ -61,14 +72,18 @@ def test_extract_tool_calls_no_tools(kimi_k2_tool_parser):
             """I'll help you check the weather. <|tool_calls_section_begin|> <|tool_call_begin|>
 functions.get_weather:0 <|tool_call_argument_begin|> {"city": "Beijing"} <|tool_call_end|> <|tool_calls_section_end|>""",
             [
-                ToolCall(id='functions.get_weather:0',
-                         function=FunctionCall(
-                             name="get_weather",
-                             arguments=json.dumps({
-                                 "city": "Beijing",
-                             }, ),
-                         ),
-                         type='function')
+                ToolCall(
+                    id="functions.get_weather:0",
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Beijing",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
             ],
             "I'll help you check the weather. ",
         ),
@@ -77,31 +92,158 @@ functions.get_weather:0 <|tool_call_argument_begin|> {"city": "Beijing"} <|tool_
 functions.get_weather:0 <|tool_call_argument_begin|> {"city": "Beijing"} <|tool_call_end|> <|tool_call_begin|>
 functions.get_weather:1 <|tool_call_argument_begin|> {"city": "Shanghai"} <|tool_call_end|> <|tool_calls_section_end|>""",
             [
-                ToolCall(id='functions.get_weather:0',
-                         function=FunctionCall(
-                             name="get_weather",
-                             arguments=json.dumps({
-                                 "city": "Beijing",
-                             }, ),
-                         ),
-                         type='function'),
-                ToolCall(id='functions.get_weather:1',
-                         function=FunctionCall(
-                             name="get_weather",
-                             arguments=json.dumps({
-                                 "city": "Shanghai",
-                             }, ),
-                         ),
-                         type='function')
+                ToolCall(
+                    id="functions.get_weather:0",
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Beijing",
+                            },
+                        ),
+                    ),
+                    type="function",
+                ),
+                ToolCall(
+                    id="functions.get_weather:1",
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Shanghai",
+                            },
+                        ),
+                    ),
+                    type="function",
+                ),
             ],
             "I'll help you check the weather. ",
         ),
+        (
+            """I'll get the weather and news for LA today. First, let me get the weather using Los Angeles coordinates, and then get the latest news. <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"latitude": 34.0522, "longitude": -118.2437}<|tool_call_end|><|tool_call_begin|>functions.get_news:1<|tool_call_argument_begin|>{"content": "Los Angeles today"}<|tool_call_end|><|tool_calls_section_end|>""",
+            [
+                ToolCall(
+                    id="functions.get_weather:0",
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {"latitude": 34.0522, "longitude": -118.2437}
+                        ),
+                    ),
+                    type="function",
+                ),
+                ToolCall(
+                    id="functions.get_news:1",
+                    function=FunctionCall(
+                        name="get_news",
+                        arguments=json.dumps({"content": "Los Angeles today"}),
+                    ),
+                    type="function",
+                ),
+            ],
+            "I'll get the weather and news for LA today. First, let me get the weather using Los Angeles coordinates, and then get the latest news. ",
+        ),
+        (
+            """I'll help you with multiple tasks. <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city": "New York"}<|tool_call_end|><|tool_call_begin|>functions.get_news:1<|tool_call_argument_begin|>{"topic": "technology"}<|tool_call_end|><|tool_call_begin|>functions.send_email:2<|tool_call_argument_begin|>{"to": "user@example.com", "subject": "Daily Update"}<|tool_call_end|><|tool_calls_section_end|>""",
+            [
+                ToolCall(
+                    id="functions.get_weather:0",
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps({"city": "New York"}),
+                    ),
+                    type="function",
+                ),
+                ToolCall(
+                    id="functions.get_news:1",
+                    function=FunctionCall(
+                        name="get_news",
+                        arguments=json.dumps({"topic": "technology"}),
+                    ),
+                    type="function",
+                ),
+                ToolCall(
+                    id="functions.send_email:2",
+                    function=FunctionCall(
+                        name="send_email",
+                        arguments=json.dumps(
+                            {"to": "user@example.com", "subject": "Daily Update"}
+                        ),
+                    ),
+                    type="function",
+                ),
+            ],
+            "I'll help you with multiple tasks. ",
+        ),
+        (
+            """Mixed spacing test. <|tool_calls_section_begin|> <|tool_call_begin|> functions.test:0 <|tool_call_argument_begin|> {} <|tool_call_end|><|tool_call_begin|>functions.test2:1<|tool_call_argument_begin|>{}<|tool_call_end|> <|tool_calls_section_end|>""",
+            [
+                ToolCall(
+                    id="functions.test:0",
+                    function=FunctionCall(
+                        name="test",
+                        arguments=json.dumps({}),
+                    ),
+                    type="function",
+                ),
+                ToolCall(
+                    id="functions.test2:1",
+                    function=FunctionCall(
+                        name="test2",
+                        arguments=json.dumps({}),
+                    ),
+                    type="function",
+                ),
+            ],
+            "Mixed spacing test. ",
+        ),
+        (
+            """I need to process HTML content. <|tool_calls_section_begin|><|tool_call_begin|>functions.process_html:0<|tool_call_argument_begin|>{"html": "<div>content</div>", "text": "normal text"}<|tool_call_end|><|tool_calls_section_end|>""",
+            [
+                ToolCall(
+                    id="functions.process_html:0",
+                    function=FunctionCall(
+                        name="process_html",
+                        arguments=json.dumps(
+                            {"html": "<div>content</div>", "text": "normal text"}
+                        ),
+                    ),
+                    type="function",
+                )
+            ],
+            "I need to process HTML content. ",
+        ),
+        (
+            """I need to process formatted JSON. <|tool_calls_section_begin|><|tool_call_begin|>functions.process_data:0<|tool_call_argument_begin|>{
+  "name": "test",
+  "value": 123,
+  "nested": {
+    "key": "value"
+  }
+}<|tool_call_end|><|tool_calls_section_end|>""",
+            [
+                ToolCall(
+                    id="functions.process_data:0",
+                    function=FunctionCall(
+                        name="process_data",
+                        arguments=json.dumps(
+                            {"name": "test", "value": 123, "nested": {"key": "value"}},
+                            indent=2,
+                        ),
+                    ),
+                    type="function",
+                )
+            ],
+            "I need to process formatted JSON. ",
+        ),
     ],
 )
-def test_extract_tool_calls(kimi_k2_tool_parser, model_output,
-                            expected_tool_calls, expected_content):
+def test_extract_tool_calls(
+    kimi_k2_tool_parser, model_output, expected_tool_calls, expected_content
+):
     extracted_tool_calls = kimi_k2_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -116,15 +258,14 @@ functions.invalid_get_weather:0 <|tool_call_argument_begin|> {"city": "Beijing"
 functions.valid_get_weather:1 <|tool_call_argument_begin|> {"city": "Shanghai"} <|tool_call_end|> <|tool_calls_section_end|>"""
 
     extracted_tool_calls = kimi_k2_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     # Should extract only the valid JSON tool calls
     assert len(extracted_tool_calls.tool_calls) == 2
-    assert extracted_tool_calls.tool_calls[
-        0].function.name == "invalid_get_weather"
-    assert extracted_tool_calls.tool_calls[
-        1].function.name == "valid_get_weather"
+    assert extracted_tool_calls.tool_calls[0].function.name == "invalid_get_weather"
+    assert extracted_tool_calls.tool_calls[1].function.name == "valid_get_weather"
 
 
 def test_extract_tool_calls_invalid_funcall(kimi_k2_tool_parser):
@@ -134,13 +275,13 @@ functions.invalid_get_weather.0 <|tool_call_argument_begin|> {"city": "Beijing"}
 functions.valid_get_weather:1 <|tool_call_argument_begin|> {"city": "Shanghai"} <|tool_call_end|> <|tool_calls_section_end|>"""
 
     extracted_tool_calls = kimi_k2_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     # Should extract only the valid JSON tool calls
     assert len(extracted_tool_calls.tool_calls) == 1
-    assert extracted_tool_calls.tool_calls[
-        0].function.name == "valid_get_weather"
+    assert extracted_tool_calls.tool_calls[0].function.name == "valid_get_weather"
 
 
 def test_streaming_basic_functionality(kimi_k2_tool_parser):
@@ -168,8 +309,7 @@ functions.get_weather:0 <|tool_call_argument_begin|> {"city": "Beijing"} <|tool_
 
     # The result might be None or contain tool call information
     # This depends on the internal state management
-    if result is not None and hasattr(result,
-                                      'tool_calls') and result.tool_calls:
+    if result is not None and hasattr(result, "tool_calls") and result.tool_calls:
         assert len(result.tool_calls) >= 0
 
 
@@ -189,5 +329,598 @@ def test_streaming_no_tool_calls(kimi_k2_tool_parser):
 
     # Should return the delta text as content
     assert result is not None
-    assert hasattr(result, 'content')
+    assert hasattr(result, "content")
     assert result.content == " without any tool calls."
+
+
+def test_token_leak_between_section_and_tool_begin(kimi_k2_tool_parser):
+    """
+    Test that text between <|tool_calls_section_begin|> and <|tool_call_begin|>
+    is suppressed and does not leak into reasoning_delta.
+    This is the main vulnerability being fixed.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    # Get token IDs for the markers
+    section_begin_token_id = kimi_k2_tool_parser.vocab.get(
+        "<|tool_calls_section_begin|>"
+    )
+    tool_call_begin_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
+
+    # Simulate streaming sequence:
+    # Delta 1: "I'll help you with that. "
+    result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="I'll help you with that. ",
+        delta_text="I'll help you with that. ",
+        previous_token_ids=[],
+        current_token_ids=[1, 2, 3],  # Regular tokens
+        delta_token_ids=[1, 2, 3],
+        request=None,
+    )
+    assert result1 is not None
+    assert result1.content == "I'll help you with that. "
+
+    # Delta 2: "<|tool_calls_section_begin|>"
+    prev_ids = [1, 2, 3]
+    curr_ids = prev_ids + [section_begin_token_id]
+    result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="I'll help you with that. ",
+        current_text="I'll help you with that. <|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=prev_ids,
+        current_token_ids=curr_ids,
+        delta_token_ids=[section_begin_token_id],
+        request=None,
+    )
+    # Section marker should be stripped and suppressed
+    assert result2 is None or (result2.content is None or result2.content == "")
+
+    # Delta 3: " spurious text or tokens " (THE LEAK SCENARIO)
+    prev_ids = curr_ids
+    curr_ids = curr_ids + [4, 5]
+    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="I'll help you with that. <|tool_calls_section_begin|>",
+        current_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ",
+        delta_text=" spurious text ",
+        previous_token_ids=prev_ids,
+        current_token_ids=curr_ids,
+        delta_token_ids=[4, 5],
+        request=None,
+    )
+    # CRITICAL: This text should be suppressed, NOT returned as reasoning_delta
+    assert result3 is None or (result3.content is None or result3.content == "")
+
+    # Delta 4: "<|tool_call_begin|>..."
+    prev_ids = curr_ids
+    curr_ids = curr_ids + [tool_call_begin_token_id]
+    _result4 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ",
+        current_text="I'll help you with that. <|tool_calls_section_begin|> spurious text <|tool_call_begin|>",
+        delta_text="<|tool_call_begin|>",
+        previous_token_ids=prev_ids,
+        current_token_ids=curr_ids,
+        delta_token_ids=[tool_call_begin_token_id],
+        request=None,
+    )
+    # Now we're in tool call mode, result depends on internal state
+    # The key is that the spurious text from Delta 3 was not leaked
+
+
+def test_split_markers_across_deltas(kimi_k2_tool_parser):
+    """
+    Test that markers split across delta chunks are correctly detected
+    via the rolling buffer mechanism.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_token_id = kimi_k2_tool_parser.vocab.get(
+        "<|tool_calls_section_begin|>"
+    )
+
+    # Delta 1: "...reasoning<|tool_calls_sec"
+    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Some reasoning",
+        current_text="Some reasoning<|tool_calls_sec",
+        delta_text="<|tool_calls_sec",
+        previous_token_ids=[1, 2],
+        current_token_ids=[1, 2, 3],  # Partial token
+        delta_token_ids=[3],
+        request=None,
+    )
+    # Partial token not recognized yet, might be buffered
+    # Should return as content or None (depends on implementation)
+
+    # Delta 2: "tion_begin|> "  (completes the marker)
+    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Some reasoning<|tool_calls_sec",
+        current_text="Some reasoning<|tool_calls_section_begin|> ",
+        delta_text="tion_begin|> ",
+        previous_token_ids=[1, 2, 3],
+        current_token_ids=[1, 2, section_begin_token_id, 4],
+        delta_token_ids=[section_begin_token_id, 4],
+        request=None,
+    )
+    # Now the complete marker should be detected via buffer
+    # The parser should enter tool section mode
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+
+def test_marker_variants(kimi_k2_tool_parser):
+    """Test that both singular and plural marker variants are recognized."""
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    # Test singular variant: <|tool_call_section_begin|> (note: singular "call")
+    singular_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_section_begin|>")
+
+    if singular_token_id is not None:  # Only test if tokenizer supports it
+        _result = kimi_k2_tool_parser.extract_tool_calls_streaming(
+            previous_text="Reasoning ",
+            current_text="Reasoning <|tool_call_section_begin|>",
+            delta_text="<|tool_call_section_begin|>",
+            previous_token_ids=[1, 2],
+            current_token_ids=[1, 2, singular_token_id],
+            delta_token_ids=[singular_token_id],
+            request=None,
+        )
+        # Should enter tool section mode with singular variant too
+        assert kimi_k2_tool_parser.in_tool_section is True
+
+
+def test_reentry_to_reasoning_after_tool_section(kimi_k2_tool_parser):
+    """
+    Test that after exiting a tool section with <|tool_calls_section_end|>,
+    subsequent text is correctly returned as reasoning content.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+
+    # Enter tool section
+    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="<|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=[],
+        current_token_ids=[section_begin_id],
+        delta_token_ids=[section_begin_id],
+        request=None,
+    )
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+    # Exit tool section
+    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="<|tool_calls_section_begin|>",
+        current_text="<|tool_calls_section_begin|><|tool_calls_section_end|>",
+        delta_text="<|tool_calls_section_end|>",
+        previous_token_ids=[section_begin_id],
+        current_token_ids=[section_begin_id, section_end_id],
+        delta_token_ids=[section_end_id],
+        request=None,
+    )
+    assert kimi_k2_tool_parser.in_tool_section is False
+
+    # Subsequent reasoning text should be returned normally
+    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="<|tool_calls_section_begin|><|tool_calls_section_end|>",
+        current_text="<|tool_calls_section_begin|><|tool_calls_section_end|> More reasoning",
+        delta_text=" More reasoning",
+        previous_token_ids=[section_begin_id, section_end_id],
+        current_token_ids=[section_begin_id, section_end_id, 10, 11],
+        delta_token_ids=[10, 11],
+        request=None,
+    )
+    assert result3 is not None
+    assert result3.content == " More reasoning"
+
+
+def test_empty_tool_section(kimi_k2_tool_parser):
+    """Test an empty tool section (begin immediately followed by end)."""
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+
+    # Section begin
+    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Reasoning ",
+        current_text="Reasoning <|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=[1],
+        current_token_ids=[1, section_begin_id],
+        delta_token_ids=[section_begin_id],
+        request=None,
+    )
+
+    # Immediate section end
+    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Reasoning <|tool_calls_section_begin|>",
+        current_text="Reasoning <|tool_calls_section_begin|><|tool_calls_section_end|>",
+        delta_text="<|tool_calls_section_end|>",
+        previous_token_ids=[1, section_begin_id],
+        current_token_ids=[1, section_begin_id, section_end_id],
+        delta_token_ids=[section_end_id],
+        request=None,
+    )
+    # Should exit cleanly without errors
+    assert kimi_k2_tool_parser.in_tool_section is False
+
+
+def test_malformed_tool_section_recovery(kimi_k2_tool_parser):
+    """
+    Test that the parser recovers from a malformed tool section
+    that never closes properly.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+
+    # Enter tool section
+    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="<|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=[],
+        current_token_ids=[section_begin_id],
+        delta_token_ids=[section_begin_id],
+        request=None,
+    )
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+    # Simulate a lot of text without proper tool calls or section end
+    # This should trigger the error recovery mechanism
+    large_text = "x" * 10000  # Exceeds max_section_chars
+
+    result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="<|tool_calls_section_begin|>",
+        current_text="<|tool_calls_section_begin|>" + large_text,
+        delta_text=large_text,
+        previous_token_ids=[section_begin_id],
+        current_token_ids=[section_begin_id] + list(range(100, 100 + len(large_text))),
+        delta_token_ids=list(range(100, 100 + len(large_text))),
+        request=None,
+    )
+
+    # Parser should have force-exited the tool section
+    assert kimi_k2_tool_parser.in_tool_section is False
+    # And returned the content as reasoning
+    assert result2 is not None
+    assert result2.content == large_text
+
+
+def test_state_reset(kimi_k2_tool_parser):
+    """Test that reset_streaming_state() properly clears all state."""
+    # Put parser in a complex state
+    kimi_k2_tool_parser.in_tool_section = True
+    kimi_k2_tool_parser.token_buffer = "some buffer"
+    kimi_k2_tool_parser.current_tool_id = 5
+    kimi_k2_tool_parser.prev_tool_call_arr = [{"id": "test"}]
+    kimi_k2_tool_parser.section_char_count = 1000
+
+    # Reset
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    # Verify all state is cleared
+    assert kimi_k2_tool_parser.in_tool_section is False
+    assert kimi_k2_tool_parser.token_buffer == ""
+    assert kimi_k2_tool_parser.current_tool_id == -1
+    assert kimi_k2_tool_parser.prev_tool_call_arr == []
+    assert kimi_k2_tool_parser.section_char_count == 0
+    assert kimi_k2_tool_parser.current_tool_name_sent is False
+    assert kimi_k2_tool_parser.streamed_args_for_tool == []
+
+
+def test_section_begin_noise_tool_begin_same_chunk(kimi_k2_tool_parser):
+    """
+    Test that begin→noise→tool_begin within the SAME chunk suppresses
+    the noise text correctly (not just across chunks).
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    tool_call_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
+
+    # Single delta containing: section_begin + spurious text + tool_call_begin
+    combined_text = "<|tool_calls_section_begin|> noise text <|tool_call_begin|>"
+
+    result = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Reasoning ",
+        current_text="Reasoning " + combined_text,
+        delta_text=combined_text,
+        previous_token_ids=[1, 2],
+        current_token_ids=[1, 2, section_begin_id, 3, 4, tool_call_begin_id],
+        delta_token_ids=[section_begin_id, 3, 4, tool_call_begin_id],
+        request=None,
+    )
+
+    # The noise text should NOT leak into content
+    # Result should either be None/empty or start tool call parsing
+    if result is not None and result.content is not None:
+        # If content is returned, it should not contain the noise
+        assert "noise text" not in result.content
+        assert result.content == "" or result.content.strip() == ""
+
+
+def test_stream_ends_without_section_end_marker(kimi_k2_tool_parser):
+    """
+    Test that if the stream ends (EOF) without a proper section end marker,
+    the parser doesn't leak text, doesn't crash, and resets state cleanly.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+
+    # Enter tool section
+    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="<|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=[],
+        current_token_ids=[section_begin_id],
+        delta_token_ids=[section_begin_id],
+        request=None,
+    )
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+    # Some content in tool section
+    result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="<|tool_calls_section_begin|>",
+        current_text="<|tool_calls_section_begin|> partial content",
+        delta_text=" partial content",
+        previous_token_ids=[section_begin_id],
+        current_token_ids=[section_begin_id, 10, 11],
+        delta_token_ids=[10, 11],
+        request=None,
+    )
+    # Content should be suppressed
+    assert result2.content == "" or result2.content is None
+
+    # Stream ends (EOF) - no more deltas, no section_end marker
+    # Simulate this by manually checking state and resetting
+    # (In real usage, the request handler would call reset_streaming_state)
+    assert kimi_k2_tool_parser.in_tool_section is True  # Still in section
+
+    # Reset state (as would happen between requests)
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    # Verify clean slate
+    assert kimi_k2_tool_parser.in_tool_section is False
+    assert kimi_k2_tool_parser.token_buffer == ""
+
+    # Next request should work normally
+    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="New reasoning",
+        delta_text="New reasoning",
+        previous_token_ids=[],
+        current_token_ids=[20, 21],
+        delta_token_ids=[20, 21],
+        request=None,
+    )
+    assert result3 is not None
+    assert result3.content == "New reasoning"
+
+
+def test_same_chunk_begin_and_end_markers(kimi_k2_tool_parser):
+    """
+    CRITICAL TEST: Verify that when both section_begin and section_end
+    markers appear in the SAME chunk, the parser correctly:
+    1. Enters the tool section
+    2. Immediately exits the tool section
+    3. Does NOT get stuck in in_tool_section=True state
+
+    This tests the bug fix where elif was changed to if to handle
+    both state transitions in a single delta.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+
+    # Single chunk with both markers (e.g., empty tool section)
+    combined_delta = "<|tool_calls_section_begin|><|tool_calls_section_end|>"
+
+    result = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Some reasoning ",
+        current_text="Some reasoning " + combined_delta,
+        delta_text=combined_delta,
+        previous_token_ids=[1, 2],
+        current_token_ids=[1, 2, section_begin_id, section_end_id],
+        delta_token_ids=[section_begin_id, section_end_id],
+        request=None,
+    )
+
+    # CRITICAL: Parser should NOT be stuck in tool section
+    assert kimi_k2_tool_parser.in_tool_section is False, (
+        "Parser stuck in tool section after processing both begin/end in same chunk. "
+        "This indicates the elif bug was not fixed."
+    )
+
+    # Result should be empty or contain only stripped content
+    assert result is not None
+    assert result.content == "" or result.content is None
+
+    # Verify subsequent content streams correctly (not suppressed)
+    result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Some reasoning " + combined_delta,
+        current_text="Some reasoning " + combined_delta + " More reasoning",
+        delta_text=" More reasoning",
+        previous_token_ids=[1, 2, section_begin_id, section_end_id],
+        current_token_ids=[1, 2, section_begin_id, section_end_id, 10, 11],
+        delta_token_ids=[10, 11],
+        request=None,
+    )
+
+    # This content should NOT be suppressed (we're out of tool section)
+    assert result2 is not None
+    assert result2.content == " More reasoning"
+
+
+def test_same_chunk_begin_content_end_markers(kimi_k2_tool_parser):
+    """
+    Test the same-chunk scenario with actual content between markers.
+    Example: <|tool_calls_section_begin|> text <|tool_calls_section_end|>
+    all arriving in one delta. The key is that the state machine correctly
+    transitions in and out within the same chunk.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+
+    # Chunk with begin, some whitespace/noise, and end all together
+    # This simulates a tool section that opens and closes in the same chunk
+    combined_delta = "<|tool_calls_section_begin|>   <|tool_calls_section_end|>"
+
+    _result = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Reasoning ",
+        current_text="Reasoning " + combined_delta,
+        delta_text=combined_delta,
+        previous_token_ids=[1],
+        current_token_ids=[1, section_begin_id, 100, section_end_id],
+        delta_token_ids=[section_begin_id, 100, section_end_id],
+        request=None,
+    )
+
+    # Parser should exit cleanly (not stuck in tool section)
+    assert kimi_k2_tool_parser.in_tool_section is False
+
+    # Verify the fix: next content should stream normally, not be suppressed
+    result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Reasoning " + combined_delta,
+        current_text="Reasoning " + combined_delta + " Done",
+        delta_text=" Done",
+        previous_token_ids=[1, section_begin_id, 100, section_end_id],
+        current_token_ids=[1, section_begin_id, 100, section_end_id, 200],
+        delta_token_ids=[200],
+        request=None,
+    )
+
+    # Content after section should be returned (not suppressed)
+    assert result2 is not None
+    assert result2.content == " Done"
+
+
+def test_tool_call_end_and_section_end_same_chunk(kimi_k2_tool_parser):
+    """
+    CRITICAL TEST (P1): Verify that when both <|tool_call_end|> and
+    <|tool_calls_section_end|> appear in the SAME chunk, the parser:
+    1. Processes the tool_call_end first (emits final arguments)
+    2. THEN exits the section
+    3. Does NOT drop the final tool call update
+    4. Does NOT leak special tokens into reasoning
+
+    This tests the deferred section exit fix.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>")
+    tool_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
+    tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>")
+
+    # Simulate a streaming sequence for a SHORT tool call (all in one chunk):
+    # 1. Reasoning text
+    result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="Let me help. ",
+        delta_text="Let me help. ",
+        previous_token_ids=[],
+        current_token_ids=[1, 2],
+        delta_token_ids=[1, 2],
+        request=None,
+    )
+    assert result1 is not None
+    assert result1.content == "Let me help. "
+
+    # 2. Section begin
+    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="Let me help. ",
+        current_text="Let me help. <|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=[1, 2],
+        current_token_ids=[1, 2, section_begin_id],
+        delta_token_ids=[section_begin_id],
+        request=None,
+    )
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+    # 3. Tool call begin + full content + tool_end + section_end ALL IN ONE CHUNK
+    # This is the critical scenario for short tool calls
+    combined = (
+        '<|tool_call_begin|>get_weather:0 <|tool_call_argument_begin|> {"city": "Paris"} '
+        "<|tool_call_end|><|tool_calls_section_end|>"
+    )
+
+    # Build up the previous text gradually to simulate realistic streaming
+    prev_text = "Let me help. <|tool_calls_section_begin|>"
+    curr_text = prev_text + combined
+
+    result3 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text=prev_text,
+        current_text=curr_text,
+        delta_text=combined,
+        previous_token_ids=[1, 2, section_begin_id],
+        current_token_ids=[
+            1,
+            2,
+            section_begin_id,
+            tool_begin_id,
+            10,
+            11,
+            12,
+            tool_end_id,
+            section_end_id,
+        ],
+        delta_token_ids=[tool_begin_id, 10, 11, 12, tool_end_id, section_end_id],
+        request=None,
+    )
+
+    # CRITICAL: Parser should have exited section AFTER processing tool
+    assert kimi_k2_tool_parser.in_tool_section is False
+
+    # Tool call should have been emitted (not dropped)
+    # The result might be the tool name or None depending on state, but
+    # importantly, it shouldn't be returning the literal tokens as content
+
+    if result3 is not None and result3.content is not None:
+        # Verify no special tokens leaked into content
+        assert "<|tool_call_end|>" not in result3.content
+        assert "<|tool_calls_section_end|>" not in result3.content
+
+    # 4. Verify subsequent content streams normally
+    result4 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text=curr_text,
+        current_text=curr_text + " Done",
+        delta_text=" Done",
+        previous_token_ids=[
+            1,
+            2,
+            section_begin_id,
+            tool_begin_id,
+            10,
+            11,
+            12,
+            tool_end_id,
+            section_end_id,
+        ],
+        current_token_ids=[
+            1,
+            2,
+            section_begin_id,
+            tool_begin_id,
+            10,
+            11,
+            12,
+            tool_end_id,
+            section_end_id,
+            20,
+        ],
+        delta_token_ids=[20],
+        request=None,
+    )
+
+    # Content after tool section should stream normally
+    assert result4 is not None
+    assert result4.content == " Done"
diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py
index ddf26007121e548337d3292804dbeffd5b8ea338..4332984083dabb1f3590d694d0c0602f98040f46 100644
--- a/tests/tool_use/test_minimax_tool_parser.py
+++ b/tests/tool_use/test_minimax_tool_parser.py
@@ -7,11 +7,16 @@ from typing import Any
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam,
-                                              FunctionCall, ToolCall)
-from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionToolsParam,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 # Use a common model that is likely to be available
 MODEL = "MiniMaxAi/MiniMax-M1-40k"
 
@@ -29,60 +34,48 @@ def minimax_tool_parser(minimax_tokenizer):
 @pytest.fixture
 def sample_tools():
     return [
-        ChatCompletionToolsParam(type="function",
-                                 function={
-                                     "name": "get_current_weather",
-                                     "description": "Get the current weather",
-                                     "parameters": {
-                                         "type": "object",
-                                         "properties": {
-                                             "city": {
-                                                 "type": "string",
-                                                 "description": "The city name"
-                                             },
-                                             "state": {
-                                                 "type": "string",
-                                                 "description":
-                                                 "The state code"
-                                             },
-                                             "unit": {
-                                                 "type": "string",
-                                                 "enum":
-                                                 ["fahrenheit", "celsius"]
-                                             }
-                                         },
-                                         "required": ["city", "state"]
-                                     }
-                                 }),
-        ChatCompletionToolsParam(type="function",
-                                 function={
-                                     "name": "calculate_area",
-                                     "description":
-                                     "Calculate area of a shape",
-                                     "parameters": {
-                                         "type": "object",
-                                         "properties": {
-                                             "shape": {
-                                                 "type": "string"
-                                             },
-                                             "dimensions": {
-                                                 "type": "object"
-                                             },
-                                             "precision": {
-                                                 "type": "integer"
-                                             }
-                                         }
-                                     }
-                                 })
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string", "description": "The city name"},
+                        "state": {"type": "string", "description": "The state code"},
+                        "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]},
+                    },
+                    "required": ["city", "state"],
+                },
+            },
+        ),
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "calculate_area",
+                "description": "Calculate area of a shape",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "shape": {"type": "string"},
+                        "dimensions": {"type": "object"},
+                        "precision": {"type": "integer"},
+                    },
+                },
+            },
+        ),
     ]
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         assert isinstance(actual_tool_call.id, str)
         assert len(actual_tool_call.id) > 16
 
@@ -93,7 +86,8 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall],
 def test_extract_tool_calls_no_tools(minimax_tool_parser):
     model_output = "This is a test"
     extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
     assert extracted_tool_calls.content == model_output
@@ -114,14 +108,18 @@ def test_extract_tool_calls_no_tools(minimax_tool_parser):
 {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}
 </tool_calls>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             None,
         ),
@@ -131,22 +129,30 @@ def test_extract_tool_calls_no_tools(minimax_tool_parser):
 {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}
 </tool_calls>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                )),
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Orlando",
-                        "state": "FL",
-                        "unit": "fahrenheit",
-                    }),
-                )),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Orlando",
+                                "state": "FL",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
             ],
             None,
         ),
@@ -155,14 +161,18 @@ def test_extract_tool_calls_no_tools(minimax_tool_parser):
 {"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}
 </tool_calls>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Seattle",
-                        "state": "WA",
-                        "unit": "celsius",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Seattle",
+                                "state": "WA",
+                                "unit": "celsius",
+                            }
+                        ),
+                    )
+                )
             ],
             "I'll help you check the weather.",
         ),
@@ -171,14 +181,18 @@ def test_extract_tool_calls_no_tools(minimax_tool_parser):
 {"name": "get_current_weather", "arguments": {"city": "New York", "state": "NY", "unit": "celsius"}}
 </tool_calls>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "New York",
-                        "state": "NY",
-                        "unit": "celsius",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "New York",
+                                "state": "NY",
+                                "unit": "celsius",
+                            }
+                        ),
+                    )
+                )
             ],
             None,
         ),
@@ -186,22 +200,28 @@ def test_extract_tool_calls_no_tools(minimax_tool_parser):
             """<tool_calls>
 {"name": "get_current_weather", "arguments": {"city": "Boston", "state": "MA"}}""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Boston",
-                        "state": "MA",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Boston",
+                                "state": "MA",
+                            }
+                        ),
+                    )
+                )
             ],
             None,
         ),
     ],
 )
-def test_extract_tool_calls(minimax_tool_parser, model_output,
-                            expected_tool_calls, expected_content):
+def test_extract_tool_calls(
+    minimax_tool_parser, model_output, expected_tool_calls, expected_content
+):
     extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -219,8 +239,7 @@ I'll help you with that. <tool_calls>
 {"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA"}}
 </tool_calls>"""
 
-    processed_output = minimax_tool_parser.preprocess_model_output(
-        model_output)
+    processed_output = minimax_tool_parser.preprocess_model_output(model_output)
 
     # The tool call within thinking tags should be removed
     assert "fake_tool" not in processed_output
@@ -242,12 +261,12 @@ Let me help you with the weather. <tool_calls>
 </tool_calls>"""
 
     extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 1
-    assert extracted_tool_calls.tool_calls[
-        0].function.name == "get_current_weather"
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
 
     # Content extraction is based on the position of the first <tool_calls> in the original model_output
     # Since preprocessing removes tool calls within thinking tags, the actual first <tool_calls> is the external one
@@ -268,14 +287,14 @@ def test_extract_tool_calls_invalid_json(minimax_tool_parser):
 </tool_calls>"""
 
     extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     # Should extract only the valid JSON tool calls
     assert len(extracted_tool_calls.tool_calls) == 2
     assert extracted_tool_calls.tool_calls[0].function.name == "valid_tool"
-    assert extracted_tool_calls.tool_calls[
-        1].function.name == "another_valid_tool"
+    assert extracted_tool_calls.tool_calls[1].function.name == "another_valid_tool"
 
 
 def test_extract_tool_calls_missing_name_or_arguments(minimax_tool_parser):
@@ -288,14 +307,14 @@ def test_extract_tool_calls_missing_name_or_arguments(minimax_tool_parser):
 </tool_calls>"""
 
     extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert extracted_tool_calls.tools_called
     # Should extract only the valid tool calls with both name and arguments
     assert len(extracted_tool_calls.tool_calls) == 2
     assert extracted_tool_calls.tool_calls[0].function.name == "valid_tool"
-    assert extracted_tool_calls.tool_calls[
-        1].function.name == "another_valid_tool"
+    assert extracted_tool_calls.tool_calls[1].function.name == "another_valid_tool"
 
 
 def test_streaming_basic_functionality(minimax_tool_parser):
@@ -324,8 +343,7 @@ def test_streaming_basic_functionality(minimax_tool_parser):
 
     # The result might be None or contain tool call information
     # This depends on the internal state management
-    if result is not None and hasattr(result,
-                                      'tool_calls') and result.tool_calls:
+    if result is not None and hasattr(result, "tool_calls") and result.tool_calls:
         assert len(result.tool_calls) >= 0
 
 
@@ -350,7 +368,7 @@ def test_streaming_with_content_before_tool_calls(minimax_tool_parser):
         request=None,
     )
 
-    if result is not None and hasattr(result, 'content'):
+    if result is not None and hasattr(result, "content"):
         # Should contain some content
         assert result.content is not None
 
@@ -371,7 +389,7 @@ def test_streaming_no_tool_calls(minimax_tool_parser):
 
     # Should return the delta text as content
     assert result is not None
-    assert hasattr(result, 'content')
+    assert hasattr(result, "content")
     assert result.content == " without any tool calls."
 
 
@@ -397,8 +415,7 @@ def test_streaming_with_thinking_tags(minimax_tool_parser):
 
     # The preprocessing should remove tool calls from thinking tags
     # and only process the real tool call
-    if result is not None and hasattr(result,
-                                      'tool_calls') and result.tool_calls:
+    if result is not None and hasattr(result, "tool_calls") and result.tool_calls:
         for tool_call in result.tool_calls:
             assert tool_call.function.name != "ignored"
 
@@ -417,7 +434,8 @@ def test_extract_tool_calls_multiline_json_not_supported(minimax_tool_parser):
 </tool_calls>"""
 
     extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     # Multiline JSON is currently not supported, should return no tools called
     assert not extracted_tool_calls.tools_called
@@ -447,7 +465,7 @@ def test_streaming_arguments_incremental_output(minimax_tool_parser):
         '<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
         # Stage 6: Tool calls closed
         '<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n</tool',
-        '<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n</tool_calls>'
+        '<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n</tool_calls>',
     ]
 
     function_name_sent = False
@@ -455,8 +473,7 @@ def test_streaming_arguments_incremental_output(minimax_tool_parser):
 
     for i, current_text in enumerate(stages):
         previous_text = stages[i - 1] if i > 0 else ""
-        delta_text = current_text[len(previous_text
-                                      ):] if i > 0 else current_text
+        delta_text = current_text[len(previous_text) :] if i > 0 else current_text
 
         result = minimax_tool_parser.extract_tool_calls_streaming(
             previous_text=previous_text,
@@ -471,30 +488,27 @@ def test_streaming_arguments_incremental_output(minimax_tool_parser):
         print(f"Stage {i}: Current text: {repr(current_text)}")
         print(f"Stage {i}: Delta text: {repr(delta_text)}")
 
-        if result is not None and hasattr(result,
-                                          'tool_calls') and result.tool_calls:
+        if result is not None and hasattr(result, "tool_calls") and result.tool_calls:
             tool_call = result.tool_calls[0]
 
             # Check if function name is sent (should happen only once)
             if tool_call.function and tool_call.function.name:
                 assert tool_call.function.name == "get_current_weather"
                 function_name_sent = True
-                print(
-                    f"Stage {i}: Function name sent: {tool_call.function.name}"
-                )
+                print(f"Stage {i}: Function name sent: {tool_call.function.name}")
 
             # Check if arguments are sent incrementally
             if tool_call.function and tool_call.function.arguments:
                 args_fragment = tool_call.function.arguments
-                print(
-                    f"Stage {i}: Got arguments fragment: {repr(args_fragment)}"
-                )
+                print(f"Stage {i}: Got arguments fragment: {repr(args_fragment)}")
 
                 # For incremental output, each fragment should be new content only
                 # The fragment should not contain all previous content
                 if i >= 2 and previous_args_content:  # After we start getting arguments
                     # The new fragment should not be identical to or contain all previous content
-                    assert args_fragment != previous_args_content, f"Fragment should be incremental, not cumulative: {args_fragment}"
+                    assert args_fragment != previous_args_content, (
+                        f"Fragment should be incremental, not cumulative: {args_fragment}"
+                    )
 
                     # If this is truly incremental, the fragment should be relatively small
                     # compared to the complete arguments so far
@@ -518,7 +532,9 @@ def test_streaming_arguments_delta_only(minimax_tool_parser):
     minimax_tool_parser.streamed_args_for_tool = []
 
     # Simulate two consecutive calls with growing arguments
-    call1_text = '<tool_calls>\n{"name": "test_tool", "arguments": {"param1": "value1"}}'
+    call1_text = (
+        '<tool_calls>\n{"name": "test_tool", "arguments": {"param1": "value1"}}'
+    )
     call2_text = '<tool_calls>\n{"name": "test_tool", "arguments": {"param1": "value1", "param2": "value2"}}'
 
     print(f"Call 1 text: {repr(call1_text)}")
@@ -536,7 +552,7 @@ def test_streaming_arguments_delta_only(minimax_tool_parser):
     )
 
     print(f"Result 1: {result1}")
-    if result1 and hasattr(result1, 'tool_calls') and result1.tool_calls:
+    if result1 and hasattr(result1, "tool_calls") and result1.tool_calls:
         for i, tc in enumerate(result1.tool_calls):
             print(f"  Tool call {i}: {tc}")
 
@@ -552,13 +568,12 @@ def test_streaming_arguments_delta_only(minimax_tool_parser):
     )
 
     print(f"Result 2: {result2}")
-    if result2 and hasattr(result2, 'tool_calls') and result2.tool_calls:
+    if result2 and hasattr(result2, "tool_calls") and result2.tool_calls:
         for i, tc in enumerate(result2.tool_calls):
             print(f"  Tool call {i}: {tc}")
 
     # Verify the second call only returns the delta
-    if result2 is not None and hasattr(result2,
-                                       'tool_calls') and result2.tool_calls:
+    if result2 is not None and hasattr(result2, "tool_calls") and result2.tool_calls:
         tool_call = result2.tool_calls[0]
         if tool_call.function and tool_call.function.arguments:
             args_delta = tool_call.function.arguments
@@ -566,17 +581,21 @@ def test_streaming_arguments_delta_only(minimax_tool_parser):
 
             # Should only contain the new part, not the full arguments
             # The delta should be something like ', "param2": "value2"}' or just '"param2": "value2"'
-            assert ', "param2": "value2"}' in args_delta or '"param2": "value2"' in args_delta, f"Expected delta containing param2, got: {args_delta}"
+            assert (
+                ', "param2": "value2"}' in args_delta
+                or '"param2": "value2"' in args_delta
+            ), f"Expected delta containing param2, got: {args_delta}"
 
             # Should NOT contain the previous parameter data
-            assert '"param1": "value1"' not in args_delta, f"Arguments delta should not contain previous data: {args_delta}"
+            assert '"param1": "value1"' not in args_delta, (
+                f"Arguments delta should not contain previous data: {args_delta}"
+            )
 
             # The delta should be relatively short (incremental, not cumulative)
-            expected_max_length = len(
-                ', "param2": "value2"}') + 10  # Some tolerance
-            assert len(
-                args_delta
-            ) <= expected_max_length, f"Delta seems too long (possibly cumulative): {args_delta}"
+            expected_max_length = len(', "param2": "value2"}') + 10  # Some tolerance
+            assert len(args_delta) <= expected_max_length, (
+                f"Delta seems too long (possibly cumulative): {args_delta}"
+            )
 
             print("✓ Delta validation passed")
         else:
@@ -603,40 +622,39 @@ def test_streaming_openai_compatibility(minimax_tool_parser):
     # Test scenario: simple buffering without complex tool call context
     test_cases: list[dict[str, Any]] = [
         {
-            'stage': 'Token: <',
-            'previous': '',
-            'current': '<',
-            'delta': '<',
-            'expected_content': None,  # Should be buffered
+            "stage": "Token: <",
+            "previous": "",
+            "current": "<",
+            "delta": "<",
+            "expected_content": None,  # Should be buffered
         },
         {
-            'stage': 'Token: tool_calls>',
-            'previous': '<',
-            'current': '<tool_calls>',
-            'delta': 'tool_calls>',
-            'expected_content': None,  # Complete tag, should not output
+            "stage": "Token: tool_calls>",
+            "previous": "<",
+            "current": "<tool_calls>",
+            "delta": "tool_calls>",
+            "expected_content": None,  # Complete tag, should not output
         },
         {
-            'stage': 'Regular content',
-            'previous': 'Hello',
-            'current': 'Hello world',
-            'delta': ' world',
-            'expected_content': ' world',  # Normal content should pass through
+            "stage": "Regular content",
+            "previous": "Hello",
+            "current": "Hello world",
+            "delta": " world",
+            "expected_content": " world",  # Normal content should pass through
         },
         {
-            'stage': 'Content with end tag start',
-            'previous': 'Text',
-            'current': 'Text content</tool_',
-            'delta': ' content</tool_',
-            'expected_content':
-            ' content',  # Content part output, </tool_ buffered
+            "stage": "Content with end tag start",
+            "previous": "Text",
+            "current": "Text content</tool_",
+            "delta": " content</tool_",
+            "expected_content": " content",  # Content part output, </tool_ buffered
         },
         {
-            'stage': 'Complete end tag',
-            'previous': 'Text content</tool_',
-            'current': 'Text content</tool_calls>',
-            'delta': 'calls>',
-            'expected_content': None,  # Complete close tag, should not output
+            "stage": "Complete end tag",
+            "previous": "Text content</tool_",
+            "current": "Text content</tool_calls>",
+            "delta": "calls>",
+            "expected_content": None,  # Complete close tag, should not output
         },
     ]
 
@@ -647,9 +665,9 @@ def test_streaming_openai_compatibility(minimax_tool_parser):
         print(f"Delta:    {repr(test_case['delta'])}")
 
         result = minimax_tool_parser.extract_tool_calls_streaming(
-            previous_text=test_case['previous'],
-            current_text=test_case['current'],
-            delta_text=test_case['delta'],
+            previous_text=test_case["previous"],
+            current_text=test_case["current"],
+            delta_text=test_case["delta"],
             previous_token_ids=[],
             current_token_ids=[],
             delta_token_ids=[],
@@ -659,15 +677,18 @@ def test_streaming_openai_compatibility(minimax_tool_parser):
         print(f"Result: {result}")
 
         # Check expected content
-        if test_case['expected_content'] is None:
-            assert result is None or not getattr(result, 'content', None), \
+        if test_case["expected_content"] is None:
+            assert result is None or not getattr(result, "content", None), (
                 f"Stage {i}: Expected no content, got {result}"
+            )
             print("✓ No content output as expected")
         else:
-            assert result is not None and hasattr(result, 'content'), \
+            assert result is not None and hasattr(result, "content"), (
                 f"Stage {i}: Expected content, got {result}"
-            assert result.content == test_case['expected_content'], \
+            )
+            assert result.content == test_case["expected_content"], (
                 f"Stage {i}: Expected content {test_case['expected_content']}, got {result.content}"
+            )
             print(f"✓ Content matches: {repr(result.content)}")
 
     print("✓ Streaming test with buffering completed successfully")
@@ -688,35 +709,26 @@ def test_streaming_thinking_tag_buffering(minimax_tool_parser):
     # Test scenario: tool calls within thinking tags should be ignored
     test_cases: list[dict[str, Any]] = [
         {
-            'stage': 'Start thinking',
-            'previous': '',
-            'current': '<think>I need to use a tool. <tool_calls>',
-            'delta': '<think>I need to use a tool. <tool_calls>',
-            'expected_content':
-            '<think>I need to use a tool. <tool_calls>',  # Should pass through as content
+            "stage": "Start thinking",
+            "previous": "",
+            "current": "<think>I need to use a tool. <tool_calls>",
+            "delta": "<think>I need to use a tool. <tool_calls>",
+            "expected_content": "<think>I need to use a tool. <tool_calls>",  # Should pass through as content
         },
         {
-            'stage':
-            'Tool call in thinking',
-            'previous':
-            '<think>I need to use a tool. <tool_calls>',
-            'current':
-            '<think>I need to use a tool. <tool_calls>\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls>',
-            'delta':
-            '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls>',
-            'expected_content':
-            '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls>',  # </tool_calls> should be preserved in thinking tags
+            "stage": "Tool call in thinking",
+            "previous": "<think>I need to use a tool. <tool_calls>",
+            "current": '<think>I need to use a tool. <tool_calls>\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls>',
+            "delta": '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls>',
+            "expected_content": '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls>',  # </tool_calls> should be preserved in thinking tags
         },
         {
-            'stage': 'Real tool call after thinking',
-            'previous':
-            '<think>I need to use a tool. <tool_calls>\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls></think>',
-            'current':
-            '<think>I need to use a tool. <tool_calls>\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls></think>\n<tool_calls>',
-            'delta': '\n<tool_calls>',
-            'expected_content':
-            '\n',  # Should output '\n' and suppress <tool_calls>
-        }
+            "stage": "Real tool call after thinking",
+            "previous": '<think>I need to use a tool. <tool_calls>\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls></think>',
+            "current": '<think>I need to use a tool. <tool_calls>\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n</tool_calls></think>\n<tool_calls>',
+            "delta": "\n<tool_calls>",
+            "expected_content": "\n",  # Should output '\n' and suppress <tool_calls>
+        },
     ]
 
     for i, test_case in enumerate(test_cases):
@@ -726,9 +738,9 @@ def test_streaming_thinking_tag_buffering(minimax_tool_parser):
         print(f"Delta:    {repr(test_case['delta'])}")
 
         result = minimax_tool_parser.extract_tool_calls_streaming(
-            previous_text=test_case['previous'],
-            current_text=test_case['current'],
-            delta_text=test_case['delta'],
+            previous_text=test_case["previous"],
+            current_text=test_case["current"],
+            delta_text=test_case["delta"],
             previous_token_ids=[],
             current_token_ids=[],
             delta_token_ids=[],
@@ -738,25 +750,32 @@ def test_streaming_thinking_tag_buffering(minimax_tool_parser):
         print(f"Result: {result}")
 
         # Check expected content
-        if 'expected_content' in test_case:
-            if test_case['expected_content'] is None:
-                assert result is None or not getattr(result, 'content', None), \
+        if "expected_content" in test_case:
+            if test_case["expected_content"] is None:
+                assert result is None or not getattr(result, "content", None), (
                     f"Stage {i}: Expected no content, got {result}"
+                )
             else:
-                assert result is not None and hasattr(result, 'content'), \
+                assert result is not None and hasattr(result, "content"), (
                     f"Stage {i}: Expected content, got {result}"
-                assert result.content == test_case['expected_content'], \
+                )
+                assert result.content == test_case["expected_content"], (
                     f"Stage {i}: Expected content {test_case['expected_content']}, got {result.content}"
+                )
                 print(f"✓ Content matches: {repr(result.content)}")
 
         # Check tool calls
-        if test_case.get('expected_tool_call'):
-            assert result is not None and hasattr(result, 'tool_calls') and result.tool_calls, \
-                f"Stage {i}: Expected tool call, got {result}"
+        if test_case.get("expected_tool_call"):
+            assert (
+                result is not None
+                and hasattr(result, "tool_calls")
+                and result.tool_calls
+            ), f"Stage {i}: Expected tool call, got {result}"
 
             tool_call = result.tool_calls[0]
-            assert tool_call.function.name == "real_tool", \
+            assert tool_call.function.name == "real_tool", (
                 f"Expected real_tool, got {tool_call.function.name}"
+            )
             print(f"✓ Real tool call detected: {tool_call.function.name}")
 
     print("✓ Thinking tag buffering test completed successfully")
@@ -782,104 +801,79 @@ def test_streaming_complex_scenario_with_multiple_tools(minimax_tool_parser):
     # Complex scenario: tools inside thinking tags and multiple tools in one group
     test_stages: list[dict[str, Any]] = [
         {
-            'stage': 'Initial content',
-            'previous': '',
-            'current': 'Let me help you with this task.',
-            'delta': 'Let me help you with this task.',
-            'expected_content': 'Let me help you with this task.',
-            'expected_tool_calls': 0,
+            "stage": "Initial content",
+            "previous": "",
+            "current": "Let me help you with this task.",
+            "delta": "Let me help you with this task.",
+            "expected_content": "Let me help you with this task.",
+            "expected_tool_calls": 0,
         },
         {
-            'stage': 'Start thinking tag',
-            'previous': 'Let me help you with this task.',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.',
-            'delta': '<think>I need to analyze this situation first.',
-            'expected_content':
-            '<think>I need to analyze this situation first.',
-            'expected_tool_calls': 0,
+            "stage": "Start thinking tag",
+            "previous": "Let me help you with this task.",
+            "current": "Let me help you with this task.<think>I need to analyze this situation first.",
+            "delta": "<think>I need to analyze this situation first.",
+            "expected_content": "<think>I need to analyze this situation first.",
+            "expected_tool_calls": 0,
         },
         {
-            'stage': 'Tool call inside thinking tag starts',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>',
-            'delta': '<tool_calls>',
-            'expected_content':
-            '<tool_calls>',  # Inside thinking tags, tool tags should be preserved as content
-            'expected_tool_calls': 0,
+            "stage": "Tool call inside thinking tag starts",
+            "previous": "Let me help you with this task.<think>I need to analyze this situation first.",
+            "current": "Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>",
+            "delta": "<tool_calls>",
+            "expected_content": "<tool_calls>",  # Inside thinking tags, tool tags should be preserved as content
+            "expected_tool_calls": 0,
         },
         {
-            'stage': 'Complete tool call inside thinking tag',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
-            'delta':
-            '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
-            'expected_content':
-            '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
-            'expected_tool_calls':
-            0,  # Tools inside thinking tags should be ignored
+            "stage": "Complete tool call inside thinking tag",
+            "previous": "Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>",
+            "current": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
+            "delta": '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
+            "expected_content": '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
+            "expected_tool_calls": 0,  # Tools inside thinking tags should be ignored
         },
         {
-            'stage': 'End thinking tag',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>',
-            'delta': '</think>',
-            'expected_content': '</think>',
-            'expected_tool_calls': 0,
+            "stage": "End thinking tag",
+            "previous": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls>',
+            "current": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>',
+            "delta": "</think>",
+            "expected_content": "</think>",
+            "expected_tool_calls": 0,
         },
         {
-            'stage': 'Multiple tools group starts',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>',
-            'delta':
-            '\nNow I need to get weather information and calculate area.<tool_calls>',
-            'expected_content':
-            '\nNow I need to get weather information and calculate area.',  # <tool_calls> should be filtered
-            'expected_tool_calls': 0,
+            "stage": "Multiple tools group starts",
+            "previous": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>',
+            "current": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>',
+            "delta": "\nNow I need to get weather information and calculate area.<tool_calls>",
+            "expected_content": "\nNow I need to get weather information and calculate area.",  # <tool_calls> should be filtered
+            "expected_tool_calls": 0,
         },
         {
-            'stage': 'First tool in group',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
-            'delta':
-            '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
-            'expected_content':
-            None,  # No content should be output when tool call is in progress
-            'expected_tool_calls': 1,
-            'expected_tool_name': 'get_current_weather',
+            "stage": "First tool in group",
+            "previous": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>',
+            "current": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
+            "delta": '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
+            "expected_content": None,  # No content should be output when tool call is in progress
+            "expected_tool_calls": 1,
+            "expected_tool_name": "get_current_weather",
         },
         {
-            'stage': 'Second tool in group',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}',
-            'delta':
-            '\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}',
-            'expected_content': None,
-            'expected_tool_calls': 1,
-            'expected_tool_name': 'calculate_area',
+            "stage": "Second tool in group",
+            "previous": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}',
+            "current": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}',
+            "delta": '\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}',
+            "expected_content": None,
+            "expected_tool_calls": 1,
+            "expected_tool_name": "calculate_area",
         },
         {
-            'stage': 'Complete tool calls group',
-            'previous':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}',
-            'current':
-            'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}</tool_calls>',
-            'delta': '</tool_calls>',
-            'expected_content': None,
-            'expected_tool_calls': 0,
-        }
+            "stage": "Complete tool calls group",
+            "previous": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}',
+            "current": 'Let me help you with this task.<think>I need to analyze this situation first.<tool_calls>\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n</tool_calls></think>\nNow I need to get weather information and calculate area.<tool_calls>\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}</tool_calls>',
+            "delta": "</tool_calls>",
+            "expected_content": None,
+            "expected_tool_calls": 0,
+        },
     ]
 
     tool_calls_count = 0
@@ -893,9 +887,9 @@ def test_streaming_complex_scenario_with_multiple_tools(minimax_tool_parser):
         print(f"Delta:    {repr(test_case['delta'])}")
 
         result = minimax_tool_parser.extract_tool_calls_streaming(
-            previous_text=test_case['previous'],
-            current_text=test_case['current'],
-            delta_text=test_case['delta'],
+            previous_text=test_case["previous"],
+            current_text=test_case["current"],
+            delta_text=test_case["delta"],
             previous_token_ids=[],
             current_token_ids=[],
             delta_token_ids=[],
@@ -905,53 +899,64 @@ def test_streaming_complex_scenario_with_multiple_tools(minimax_tool_parser):
         print(f"Result: {result}")
 
         # Check expected content
-        if test_case['expected_content'] is None:
-            assert result is None or not getattr(result, 'content', None), \
+        if test_case["expected_content"] is None:
+            assert result is None or not getattr(result, "content", None), (
                 f"Stage {i}: Expected no content output, got {result}"
+            )
             print("✓ No content output as expected")
         else:
-            assert result is not None and hasattr(result, 'content'), \
+            assert result is not None and hasattr(result, "content"), (
                 f"Stage {i}: Expected content output, got {result}"
-            assert result.content == test_case['expected_content'], \
+            )
+            assert result.content == test_case["expected_content"], (
                 f"Stage {i}: Expected content {repr(test_case['expected_content'])}, got {repr(result.content)}"
+            )
             print(f"✓ Content matches: {repr(result.content)}")
 
         # Check tool calls
-        expected_tool_calls = test_case['expected_tool_calls']
-        actual_tool_calls = len(result.tool_calls) if result and hasattr(
-            result, 'tool_calls') and result.tool_calls else 0
+        expected_tool_calls = test_case["expected_tool_calls"]
+        actual_tool_calls = (
+            len(result.tool_calls)
+            if result and hasattr(result, "tool_calls") and result.tool_calls
+            else 0
+        )
 
         if expected_tool_calls > 0:
-            assert actual_tool_calls >= expected_tool_calls, \
+            assert actual_tool_calls >= expected_tool_calls, (
                 f"Stage {i}: Expected at least {expected_tool_calls} tool calls, got {actual_tool_calls}"
+            )
 
-            if 'expected_tool_name' in test_case:
+            if "expected_tool_name" in test_case:
                 # Find the tool call with the expected name
                 found_tool_call = None
                 for tool_call in result.tool_calls:
-                    if tool_call.function.name == test_case[
-                            'expected_tool_name']:
+                    if tool_call.function.name == test_case["expected_tool_name"]:
                         found_tool_call = tool_call
                         break
 
-                assert found_tool_call is not None, \
+                assert found_tool_call is not None, (
                     f"Stage {i}: Expected tool name {test_case['expected_tool_name']} not found in tool calls: {[tc.function.name for tc in result.tool_calls]}"
+                )
                 print(f"✓ Tool call correct: {found_tool_call.function.name}")
 
                 # Ensure tools inside thinking tags are not called
-                assert found_tool_call.function.name != "internal_analysis", \
+                assert found_tool_call.function.name != "internal_analysis", (
                     f"Stage {i}: Tool 'internal_analysis' inside thinking tags should not be called"
+                )
 
             tool_calls_count += actual_tool_calls
             print(f"✓ Detected {actual_tool_calls} tool calls")
         else:
-            assert actual_tool_calls == 0, \
+            assert actual_tool_calls == 0, (
                 f"Stage {i}: Expected no tool calls, got {actual_tool_calls}"
+            )
 
     # Verify overall results
     print("\n=== Test Summary ===")
     print(f"Total tool calls count: {tool_calls_count}")
-    assert tool_calls_count >= 2, f"Expected at least 2 valid tool calls (outside thinking tags), but got {tool_calls_count}"
+    assert tool_calls_count >= 2, (
+        f"Expected at least 2 valid tool calls (outside thinking tags), but got {tool_calls_count}"
+    )
 
     print("✓ Complex streaming test completed:")
     print("  - ✓ Tools inside thinking tags correctly ignored")
@@ -985,8 +990,8 @@ Now I'll get the weather information for you. <tool_calls>
     # Stream character by character
     for i in range(1, len(complete_text) + 1):
         current_text = complete_text[:i]
-        previous_text = complete_text[:i - 1] if i > 1 else ""
-        delta_text = complete_text[i - 1:i]
+        previous_text = complete_text[: i - 1] if i > 1 else ""
+        delta_text = complete_text[i - 1 : i]
 
         # Show progress every 50 characters
         if i % 50 == 0 or i == len(complete_text):
@@ -1005,36 +1010,35 @@ Now I'll get the weather information for you. <tool_calls>
 
         # Collect results
         if result is not None:
-            if hasattr(result, 'content') and result.content:
+            if hasattr(result, "content") and result.content:
                 content_fragments.append(result.content)
                 # Log important content fragments
                 if any(
-                        keyword in result.content for keyword in
-                    ['<think>', '</think>', '<tool_calls>', '</tool_calls>']):
-                    print(
-                        f"  Char {i}: Content fragment: {repr(result.content)}"
-                    )
-
-            if hasattr(result, 'tool_calls') and result.tool_calls:
+                    keyword in result.content
+                    for keyword in [
+                        "<think>",
+                        "</think>",
+                        "<tool_calls>",
+                        "</tool_calls>",
+                    ]
+                ):
+                    print(f"  Char {i}: Content fragment: {repr(result.content)}")
+
+            if hasattr(result, "tool_calls") and result.tool_calls:
                 for tool_call in result.tool_calls:
                     tool_info = {
-                        'character_position':
-                        i,
-                        'function_name':
-                        tool_call.function.name
-                        if tool_call.function else None,
-                        'arguments':
-                        tool_call.function.arguments
-                        if tool_call.function else None,
+                        "character_position": i,
+                        "function_name": tool_call.function.name
+                        if tool_call.function
+                        else None,
+                        "arguments": tool_call.function.arguments
+                        if tool_call.function
+                        else None,
                     }
                     tool_calls_detected.append(tool_info)
-                    print(
-                        f"  Char {i}: Tool call detected: {tool_call.function.name}"
-                    )
+                    print(f"  Char {i}: Tool call detected: {tool_call.function.name}")
                     if tool_call.function.arguments:
-                        print(
-                            f"    Arguments: {repr(tool_call.function.arguments)}"
-                        )
+                        print(f"    Arguments: {repr(tool_call.function.arguments)}")
 
     # Verify results
     print("\n=== Streaming Test Results ===")
@@ -1042,68 +1046,74 @@ Now I'll get the weather information for you. <tool_calls>
     print(f"Total tool calls detected: {len(tool_calls_detected)}")
 
     # Reconstruct content from fragments
-    reconstructed_content = ''.join(content_fragments)
+    reconstructed_content = "".join(content_fragments)
     print(f"Reconstructed content length: {len(reconstructed_content)}")
 
     # Verify thinking tags content is preserved
-    assert '<think>' in reconstructed_content, "Opening thinking tag should be preserved in content"
-    assert '</think>' in reconstructed_content, "Closing thinking tag should be preserved in content"
+    assert "<think>" in reconstructed_content, (
+        "Opening thinking tag should be preserved in content"
+    )
+    assert "</think>" in reconstructed_content, (
+        "Closing thinking tag should be preserved in content"
+    )
 
     # Verify that tool calls inside thinking tags are NOT extracted as actual tool calls
     thinking_tool_calls = [
-        tc for tc in tool_calls_detected
-        if tc['function_name'] == 'internal_analysis'
+        tc for tc in tool_calls_detected if tc["function_name"] == "internal_analysis"
     ]
-    assert len(
-        thinking_tool_calls
-    ) == 0, f"Tool calls inside thinking tags should be ignored, but found: {thinking_tool_calls}"
+    assert len(thinking_tool_calls) == 0, (
+        f"Tool calls inside thinking tags should be ignored, but found: {thinking_tool_calls}"
+    )
 
     # Verify that real tool calls outside thinking tags ARE extracted
     weather_tool_calls = [
-        tc for tc in tool_calls_detected
-        if tc['function_name'] == 'get_current_weather'
+        tc for tc in tool_calls_detected if tc["function_name"] == "get_current_weather"
     ]
     area_tool_calls = [
-        tc for tc in tool_calls_detected
-        if tc['function_name'] == 'calculate_area'
+        tc for tc in tool_calls_detected if tc["function_name"] == "calculate_area"
     ]
     print(tool_calls_detected)
-    assert len(weather_tool_calls
-               ) > 0, "get_current_weather tool call should be detected"
-    assert len(
-        area_tool_calls) > 0, "calculate_area tool call should be detected"
+    assert len(weather_tool_calls) > 0, (
+        "get_current_weather tool call should be detected"
+    )
+    assert len(area_tool_calls) > 0, "calculate_area tool call should be detected"
 
     # Verify tool call arguments are properly streamed
-    weather_args_found = any(tc['arguments'] for tc in weather_tool_calls
-                             if tc['arguments'])
-    area_args_found = any(tc['arguments'] for tc in area_tool_calls
-                          if tc['arguments'])
+    weather_args_found = any(
+        tc["arguments"] for tc in weather_tool_calls if tc["arguments"]
+    )
+    area_args_found = any(tc["arguments"] for tc in area_tool_calls if tc["arguments"])
 
     print(f"Weather tool call with arguments: {weather_args_found}")
     print(f"Area tool call with arguments: {area_args_found}")
 
     # Verify content before and after tool calls
-    assert 'I\'ll help you with the weather analysis.' in reconstructed_content, "Initial content should be preserved"
-    assert 'Here are the results.' in reconstructed_content, "Final content should be preserved"
+    assert "I'll help you with the weather analysis." in reconstructed_content, (
+        "Initial content should be preserved"
+    )
+    assert "Here are the results." in reconstructed_content, (
+        "Final content should be preserved"
+    )
 
     # Verify that <tool_calls> and </tool_calls> tags are not included in the final content
     # (they should be filtered out when not inside thinking tags)
     content_outside_thinking = reconstructed_content
     # Remove thinking tag content to check content outside
-    if '<think>' in content_outside_thinking and '</think>' in content_outside_thinking:
-        start_think = content_outside_thinking.find('<think>')
-        end_think = content_outside_thinking.find('</think>') + len('</think>')
-        content_outside_thinking = content_outside_thinking[:
-                                                            start_think] + content_outside_thinking[
-                                                                end_think:]
+    if "<think>" in content_outside_thinking and "</think>" in content_outside_thinking:
+        start_think = content_outside_thinking.find("<think>")
+        end_think = content_outside_thinking.find("</think>") + len("</think>")
+        content_outside_thinking = (
+            content_outside_thinking[:start_think]
+            + content_outside_thinking[end_think:]
+        )
 
     # Outside thinking tags, tool_calls tags should be filtered
-    tool_calls_in_content = content_outside_thinking.count('<tool_calls>')
-    assert tool_calls_in_content == 0, f"<tool_calls> tags should be filtered from content outside thinking tags, but found {tool_calls_in_content}"
-
-    print(
-        "\n=== Character-by-character streaming test completed successfully ==="
+    tool_calls_in_content = content_outside_thinking.count("<tool_calls>")
+    assert tool_calls_in_content == 0, (
+        f"<tool_calls> tags should be filtered from content outside thinking tags, but found {tool_calls_in_content}"
     )
+
+    print("\n=== Character-by-character streaming test completed successfully ===")
     print("✓ Tool calls inside thinking tags correctly ignored")
     print("✓ Tool calls outside thinking tags correctly detected")
     print("✓ Content properly streamed and reconstructed")
@@ -1111,8 +1121,7 @@ Now I'll get the weather information for you. <tool_calls>
     print("✓ Character-level streaming works correctly")
 
 
-def test_streaming_character_by_character_simple_tool_call(
-        minimax_tool_parser):
+def test_streaming_character_by_character_simple_tool_call(minimax_tool_parser):
     """Test character-by-character streaming for a simple tool call scenario."""
     # Reset streaming state
     reset_streaming_state(minimax_tool_parser)
@@ -1129,8 +1138,8 @@ def test_streaming_character_by_character_simple_tool_call(
 
     for i in range(1, len(simple_text) + 1):
         current_text = simple_text[:i]
-        previous_text = simple_text[:i - 1] if i > 1 else ""
-        delta_text = simple_text[i - 1:i]
+        previous_text = simple_text[: i - 1] if i > 1 else ""
+        delta_text = simple_text[i - 1 : i]
 
         result = minimax_tool_parser.extract_tool_calls_streaming(
             previous_text=previous_text,
@@ -1143,19 +1152,17 @@ def test_streaming_character_by_character_simple_tool_call(
         )
 
         if result:
-            if hasattr(result, 'content') and result.content:
+            if hasattr(result, "content") and result.content:
                 content_parts.append(result.content)
                 print(
                     f"  Char {i} ({repr(delta_text)}): Content: {repr(result.content)}"
                 )
 
-            if hasattr(result, 'tool_calls') and result.tool_calls:
+            if hasattr(result, "tool_calls") and result.tool_calls:
                 for tool_call in result.tool_calls:
                     if tool_call.function and tool_call.function.name:
                         tool_name_sent = True
-                        print(
-                            f"  Char {i}: Tool name: {tool_call.function.name}"
-                        )
+                        print(f"  Char {i}: Tool name: {tool_call.function.name}")
                     if tool_call.function and tool_call.function.arguments:
                         tool_args_sent = True
                         print(
@@ -1163,12 +1170,14 @@ def test_streaming_character_by_character_simple_tool_call(
                         )
 
     # Verify basic expectations
-    reconstructed_content = ''.join(content_parts)
+    reconstructed_content = "".join(content_parts)
     print(f"Final reconstructed content: {repr(reconstructed_content)}")
 
     assert tool_name_sent, "Tool name should be sent during streaming"
     assert tool_args_sent, "Tool arguments should be sent during streaming"
-    assert "Let me check the weather." in reconstructed_content, "Initial content should be preserved"
+    assert "Let me check the weather." in reconstructed_content, (
+        "Initial content should be preserved"
+    )
 
     print("✓ Simple character-by-character test passed")
 
@@ -1188,8 +1197,8 @@ def test_streaming_character_by_character_with_buffering(minimax_tool_parser):
 
     for i in range(1, len(buffering_text) + 1):
         current_text = buffering_text[:i]
-        previous_text = buffering_text[:i - 1] if i > 1 else ""
-        delta_text = buffering_text[i - 1:i]
+        previous_text = buffering_text[: i - 1] if i > 1 else ""
+        delta_text = buffering_text[i - 1 : i]
 
         result = minimax_tool_parser.extract_tool_calls_streaming(
             previous_text=previous_text,
@@ -1201,16 +1210,18 @@ def test_streaming_character_by_character_with_buffering(minimax_tool_parser):
             request=None,
         )
 
-        if result and hasattr(result, 'content') and result.content:
+        if result and hasattr(result, "content") and result.content:
             all_content.append(result.content)
             print(f"  Char {i} ({repr(delta_text)}): {repr(result.content)}")
 
-    final_content = ''.join(all_content)
+    final_content = "".join(all_content)
     print(f"Final content: {repr(final_content)}")
 
     # The parser should handle the edge case where </tool_calls> appears before <tool_calls>
     assert "Hello" in final_content, "Initial 'Hello' should be preserved"
-    assert "world" in final_content, "Content after false closing tag should be preserved"
+    assert "world" in final_content, (
+        "Content after false closing tag should be preserved"
+    )
     assert "done" in final_content, "Final content should be preserved"
 
     print("✓ Buffering character-by-character test passed")
diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py
index 2551c41c627541b9e406bb91c8001f09fd72cf90..c874a9601ae707a35410978c0af9f93fa547f706 100644
--- a/tests/tool_use/test_openai_tool_parser.py
+++ b/tests/tool_use/test_openai_tool_parser.py
@@ -4,12 +4,18 @@
 import json
 
 import pytest
-from openai_harmony import (Conversation, DeveloperContent,
-                            HarmonyEncodingName, Message, Role, SystemContent,
-                            load_harmony_encoding)
+from openai_harmony import (
+    Conversation,
+    DeveloperContent,
+    HarmonyEncodingName,
+    Message,
+    Role,
+    SystemContent,
+    load_harmony_encoding,
+)
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
+from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MODEL = "gpt2"
@@ -37,8 +43,9 @@ def assert_tool_calls(
 ):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         assert isinstance(actual_tool_call.id, str)
         assert len(actual_tool_call.id) > 16  # Default from protocol.py
         assert actual_tool_call.type == "function"
@@ -46,20 +53,25 @@ def assert_tool_calls(
 
 
 def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
-    convo = Conversation.from_messages([
-        Message.from_role_and_content(
-            Role.SYSTEM,
-            SystemContent.new(),
-        ),
-        Message.from_role_and_content(
-            Role.DEVELOPER,
-            DeveloperContent.new().with_instructions("Talk like a pirate!")),
-        Message.from_role_and_content(Role.USER, "Arrr, how be you?"),
-        Message.from_role_and_content(Role.ASSISTANT,
-                                      "This is a test").with_channel("final")
-    ])
+    convo = Conversation.from_messages(
+        [
+            Message.from_role_and_content(
+                Role.SYSTEM,
+                SystemContent.new(),
+            ),
+            Message.from_role_and_content(
+                Role.DEVELOPER,
+                DeveloperContent.new().with_instructions("Talk like a pirate!"),
+            ),
+            Message.from_role_and_content(Role.USER, "Arrr, how be you?"),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "This is a test"
+            ).with_channel("final"),
+        ]
+    )
     token_ids = harmony_encoding.render_conversation_for_completion(
-        convo, Role.ASSISTANT)
+        convo, Role.ASSISTANT
+    )
     extracted_info = openai_tool_parser.extract_tool_calls(
         "",
         request=None,
@@ -70,26 +82,32 @@ def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
     assert extracted_info.content == "This is a test"
 
 
-@pytest.mark.parametrize("tool_args", [
-    '{"location": "Tokyo"}',
-    '{\n"location": "Tokyo"\n}',
-])
-def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding,
-                                        tool_args):
-    convo = Conversation.from_messages([
-        Message.from_role_and_content(Role.USER,
-                                      "What is the weather in Tokyo?"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            'User asks: "What is the weather in Tokyo?" We need to use get_current_weather tool.',  #  noqa: E501
-        ).with_channel("analysis"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            tool_args).with_channel("commentary").with_recipient(
-                "functions.get_current_weather").with_content_type("json"),
-    ])
+@pytest.mark.parametrize(
+    "tool_args",
+    [
+        '{"location": "Tokyo"}',
+        '{\n"location": "Tokyo"\n}',
+    ],
+)
+def test_extract_tool_calls_single_tool(
+    openai_tool_parser, harmony_encoding, tool_args
+):
+    convo = Conversation.from_messages(
+        [
+            Message.from_role_and_content(Role.USER, "What is the weather in Tokyo?"),
+            Message.from_role_and_content(
+                Role.ASSISTANT,
+                'User asks: "What is the weather in Tokyo?" We need to use get_current_weather tool.',  #  noqa: E501
+            ).with_channel("analysis"),
+            Message.from_role_and_content(Role.ASSISTANT, tool_args)
+            .with_channel("commentary")
+            .with_recipient("functions.get_current_weather")
+            .with_content_type("json"),
+        ]
+    )
     token_ids = harmony_encoding.render_conversation_for_completion(
-        convo, Role.ASSISTANT)
+        convo, Role.ASSISTANT
+    )
 
     extracted_info = openai_tool_parser.extract_tool_calls(
         "",
@@ -98,10 +116,12 @@ def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding,
     )
     assert extracted_info.tools_called
     expected_tool_calls = [
-        ToolCall(function=FunctionCall(
-            name="get_current_weather",
-            arguments=json.dumps({"location": "Tokyo"}),
-        ))
+        ToolCall(
+            function=FunctionCall(
+                name="get_current_weather",
+                arguments=json.dumps({"location": "Tokyo"}),
+            )
+        )
     ]
     assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
     assert extracted_info.content is None
@@ -111,33 +131,39 @@ def test_extract_tool_calls_multiple_tools(
     openai_tool_parser,
     harmony_encoding,
 ):
-    convo = Conversation.from_messages([
-        Message.from_role_and_content(
-            Role.USER, "What is the weather in Tokyo based on where I'm at?"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  #  noqa: E501
-        ).with_channel("analysis"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            '{"location": "Tokyo"}').with_channel("commentary").with_recipient(
-                "functions.get_current_weather").with_content_type("json"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            '{"location": "Tokyo"}').with_channel("commentary").with_recipient(
-                "functions.get_user_location").with_content_type("json"),
-        Message.from_role_and_content(
-            Role.ASSISTANT, '{"location": "Tokyo"}').with_channel(
-                "commentary").with_recipient("functions.no_content_type"),
-        Message.from_role_and_content(Role.ASSISTANT, "foo").with_channel(
-            "commentary").with_recipient("functions.not_json_no_content_type"),
-        Message.from_role_and_content(
-            Role.ASSISTANT, '{}').with_channel("commentary").with_recipient(
-                "functions.empty_args").with_content_type("json"),
-        Message.from_role_and_content(
-            Role.ASSISTANT, '').with_channel("commentary").with_recipient(
-                "functions.no_args").with_content_type("json"),
-    ])
+    convo = Conversation.from_messages(
+        [
+            Message.from_role_and_content(
+                Role.USER, "What is the weather in Tokyo based on where I'm at?"
+            ),
+            Message.from_role_and_content(
+                Role.ASSISTANT,
+                'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  #  noqa: E501
+            ).with_channel("analysis"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}')
+            .with_channel("commentary")
+            .with_recipient("functions.get_current_weather")
+            .with_content_type("json"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}')
+            .with_channel("commentary")
+            .with_recipient("functions.get_user_location")
+            .with_content_type("json"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}')
+            .with_channel("commentary")
+            .with_recipient("functions.no_content_type"),
+            Message.from_role_and_content(Role.ASSISTANT, "foo")
+            .with_channel("commentary")
+            .with_recipient("functions.not_json_no_content_type"),
+            Message.from_role_and_content(Role.ASSISTANT, "{}")
+            .with_channel("commentary")
+            .with_recipient("functions.empty_args")
+            .with_content_type("json"),
+            Message.from_role_and_content(Role.ASSISTANT, "")
+            .with_channel("commentary")
+            .with_recipient("functions.no_args")
+            .with_content_type("json"),
+        ]
+    )
     token_ids = harmony_encoding.render_conversation_for_completion(
         convo,
         Role.ASSISTANT,
@@ -150,30 +176,42 @@ def test_extract_tool_calls_multiple_tools(
     )
     assert extracted_info.tools_called
     expected_tool_calls = [
-        ToolCall(function=FunctionCall(
-            name="get_current_weather",
-            arguments=json.dumps({"location": "Tokyo"}),
-        )),
-        ToolCall(function=FunctionCall(
-            name="get_user_location",
-            arguments=json.dumps({"location": "Tokyo"}),
-        )),
-        ToolCall(function=FunctionCall(
-            name="no_content_type",
-            arguments=json.dumps({"location": "Tokyo"}),
-        )),
-        ToolCall(function=FunctionCall(
-            name="not_json_no_content_type",
-            arguments="foo",
-        )),
-        ToolCall(function=FunctionCall(
-            name="empty_args",
-            arguments=json.dumps({}),
-        )),
-        ToolCall(function=FunctionCall(
-            name="no_args",
-            arguments="",
-        ))
+        ToolCall(
+            function=FunctionCall(
+                name="get_current_weather",
+                arguments=json.dumps({"location": "Tokyo"}),
+            )
+        ),
+        ToolCall(
+            function=FunctionCall(
+                name="get_user_location",
+                arguments=json.dumps({"location": "Tokyo"}),
+            )
+        ),
+        ToolCall(
+            function=FunctionCall(
+                name="no_content_type",
+                arguments=json.dumps({"location": "Tokyo"}),
+            )
+        ),
+        ToolCall(
+            function=FunctionCall(
+                name="not_json_no_content_type",
+                arguments="foo",
+            )
+        ),
+        ToolCall(
+            function=FunctionCall(
+                name="empty_args",
+                arguments=json.dumps({}),
+            )
+        ),
+        ToolCall(
+            function=FunctionCall(
+                name="no_args",
+                arguments="",
+            )
+        ),
     ]
     assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
     assert extracted_info.content is None
@@ -184,20 +222,24 @@ def test_extract_tool_calls_with_content(
     harmony_encoding,
 ):
     final_content = "This tool call will get the weather."
-    convo = Conversation.from_messages([
-        Message.from_role_and_content(
-            Role.USER, "What is the weather in Tokyo based on where I'm at?"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  #  noqa: E501
-        ).with_channel("analysis"),
-        Message.from_role_and_content(
-            Role.ASSISTANT,
-            '{"location": "Tokyo"}').with_channel("commentary").with_recipient(
-                "functions.get_current_weather").with_content_type("json"),
-        Message.from_role_and_content(Role.ASSISTANT,
-                                      final_content).with_channel("final"),
-    ])
+    convo = Conversation.from_messages(
+        [
+            Message.from_role_and_content(
+                Role.USER, "What is the weather in Tokyo based on where I'm at?"
+            ),
+            Message.from_role_and_content(
+                Role.ASSISTANT,
+                'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  #  noqa: E501
+            ).with_channel("analysis"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}')
+            .with_channel("commentary")
+            .with_recipient("functions.get_current_weather")
+            .with_content_type("json"),
+            Message.from_role_and_content(Role.ASSISTANT, final_content).with_channel(
+                "final"
+            ),
+        ]
+    )
     token_ids = harmony_encoding.render_conversation_for_completion(
         convo,
         Role.ASSISTANT,
@@ -210,10 +252,12 @@ def test_extract_tool_calls_with_content(
     )
     assert extracted_info.tools_called
     expected_tool_calls = [
-        ToolCall(function=FunctionCall(
-            name="get_current_weather",
-            arguments=json.dumps({"location": "Tokyo"}),
-        )),
+        ToolCall(
+            function=FunctionCall(
+                name="get_current_weather",
+                arguments=json.dumps({"location": "Tokyo"}),
+            )
+        ),
     ]
     assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
     assert extracted_info.content == final_content
diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py
index fff20c68d6212f95efe74803da76297b52131e1d..9af94a6a64a25e59b53a0b06ea1b6ea2d08f7602 100644
--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -2,14 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
-from typing import Optional
 
 import openai
 import pytest
 
-from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
-                    MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL,
-                    WEATHER_TOOL, ServerConfig)
+from .utils import (
+    MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
+    MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
+    SEARCH_TOOL,
+    WEATHER_TOOL,
+    ServerConfig,
+)
 
 
 # test: getting the model to generate parallel tool calls (streaming/not)
@@ -17,12 +20,15 @@ from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
 # may be added in the future. e.g. llama 3.1 models are not designed to support
 # parallel tool calls.
 @pytest.mark.asyncio
-async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
-                                   server_config: ServerConfig):
-
+async def test_parallel_tool_calls(
+    client: openai.AsyncOpenAI, server_config: ServerConfig
+):
     if not server_config.get("supports_parallel", True):
-        pytest.skip("The {} model doesn't support parallel tool calls".format(
-            server_config["model"]))
+        pytest.skip(
+            "The {} model doesn't support parallel tool calls".format(
+                server_config["model"]
+            )
+        )
 
     models = await client.models.list()
     model_name: str = models.data[0].id
@@ -32,7 +38,8 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
         max_completion_tokens=200,
         model=model_name,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
-        logprobs=False)
+        logprobs=False,
+    )
 
     choice = chat_completion.choices[0]
     stop_reason = chat_completion.choices[0].finish_reason
@@ -69,9 +76,10 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
         max_completion_tokens=200,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
         logprobs=False,
-        stream=True)
+        stream=True,
+    )
 
-    role_name: Optional[str] = None
+    role_name: str | None = None
     finish_reason_count: int = 0
 
     tool_call_names: list[str] = []
@@ -80,24 +88,22 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
     tool_call_id_count: int = 0
 
     async for chunk in stream:
-
         # if there's a finish reason make sure it's tools
         if chunk.choices[0].finish_reason:
             finish_reason_count += 1
-            assert chunk.choices[0].finish_reason == 'tool_calls'
+            assert chunk.choices[0].finish_reason == "tool_calls"
 
         # if a role is being streamed make sure it wasn't already set to
         # something else
         if chunk.choices[0].delta.role:
-            assert not role_name or role_name == 'assistant'
-            role_name = 'assistant'
+            assert not role_name or role_name == "assistant"
+            role_name = "assistant"
 
         # if a tool call is streamed make sure there's exactly one
         # (based on the request parameters
         streamed_tool_calls = chunk.choices[0].delta.tool_calls
 
         if streamed_tool_calls and len(streamed_tool_calls) > 0:
-
             # make sure only one diff is present - correct even for parallel
             assert len(streamed_tool_calls) == 1
             tool_call = streamed_tool_calls[0]
@@ -110,8 +116,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
             # if a tool call ID is streamed, make sure one hasn't been already
             if tool_call.id:
                 tool_call_id_count += 1
-                assert (isinstance(tool_call.id, str)
-                        and (len(tool_call.id) >= 9))
+                assert isinstance(tool_call.id, str) and (len(tool_call.id) >= 9)
 
             # if parts of the function start being streamed
             if tool_call.function:
@@ -125,32 +130,32 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
                     # make sure they're a string and then add them to the list
                     assert isinstance(tool_call.function.arguments, str)
 
-                    tool_call_args[
-                        tool_call.index] += tool_call.function.arguments
+                    tool_call_args[tool_call.index] += tool_call.function.arguments
 
     assert finish_reason_count == 1
-    assert role_name == 'assistant'
+    assert role_name == "assistant"
 
-    assert (len(non_streamed_tool_calls) == len(tool_call_names) ==
-            len(tool_call_args))
+    assert len(non_streamed_tool_calls) == len(tool_call_names) == len(tool_call_args)
 
     for i in range(2):
         assert non_streamed_tool_calls[i].function.name == tool_call_names[i]
         streamed_args = json.loads(tool_call_args[i])
-        non_streamed_args = json.loads(
-            non_streamed_tool_calls[i].function.arguments)
+        non_streamed_args = json.loads(non_streamed_tool_calls[i].function.arguments)
         assert streamed_args == non_streamed_args
 
 
 # test: providing parallel tool calls back to the model to get a response
 # (streaming/not)
 @pytest.mark.asyncio
-async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
-                                                server_config: ServerConfig):
-
+async def test_parallel_tool_calls_with_results(
+    client: openai.AsyncOpenAI, server_config: ServerConfig
+):
     if not server_config.get("supports_parallel", True):
-        pytest.skip("The {} model doesn't support parallel tool calls".format(
-            server_config["model"]))
+        pytest.skip(
+            "The {} model doesn't support parallel tool calls".format(
+                server_config["model"]
+            )
+        )
 
     models = await client.models.list()
     model_name: str = models.data[0].id
@@ -160,14 +165,14 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
         max_completion_tokens=200,
         model=model_name,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
-        logprobs=False)
+        logprobs=False,
+    )
 
     choice = chat_completion.choices[0]
 
     assert choice.finish_reason != "tool_calls"  # "stop" or "length"
     assert choice.message.role == "assistant"
-    assert choice.message.tool_calls is None \
-           or len(choice.message.tool_calls) == 0
+    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0
     assert choice.message.content is not None
     assert "98" in choice.message.content  # Dallas temp in tool response
     assert "78" in choice.message.content  # Orlando temp in tool response
@@ -179,7 +184,8 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
         model=model_name,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
         logprobs=False,
-        stream=True)
+        stream=True,
+    )
 
     chunks: list[str] = []
     finish_reason_count = 0
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py
index 57eaf84d36f23df41c5c61d1948dd15cf0fdfa52..93ef1049fc07e661816316c72902d17c13ca0bca 100644
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@@ -3,21 +3,25 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam,
-                                              DeltaMessage, FunctionCall,
-                                              ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+    DeltaMessage,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
-    Qwen3CoderToolParser)
-from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import (
-    Qwen3XMLToolParser)
+    Qwen3CoderToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
 
 
@@ -36,9 +40,8 @@ def qwen3_xml_tool_parser(qwen3_tokenizer):
     return Qwen3XMLToolParser(qwen3_tokenizer)
 
 
-@pytest.fixture(params=["original", "xml"])
-def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser,
-                                   request):
+@pytest.fixture(params=["xml"])
+def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request):
     """Parameterized fixture that provides both parser types for testing"""
     if request.param == "original":
         return qwen3_tool_parser
@@ -49,76 +52,63 @@ def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser,
 @pytest.fixture
 def sample_tools():
     return [
-        ChatCompletionToolsParam(type="function",
-                                 function={
-                                     "name": "get_current_weather",
-                                     "description": "Get the current weather",
-                                     "parameters": {
-                                         "type": "object",
-                                         "properties": {
-                                             "city": {
-                                                 "type": "string",
-                                                 "description": "The city name"
-                                             },
-                                             "state": {
-                                                 "type": "string",
-                                                 "description":
-                                                 "The state code"
-                                             },
-                                             "unit": {
-                                                 "type": "string",
-                                                 "enum":
-                                                 ["fahrenheit", "celsius"]
-                                             }
-                                         },
-                                         "required": ["city", "state"]
-                                     }
-                                 }),
-        ChatCompletionToolsParam(type="function",
-                                 function={
-                                     "name": "calculate_area",
-                                     "description":
-                                     "Calculate area of a shape",
-                                     "parameters": {
-                                         "type": "object",
-                                         "properties": {
-                                             "shape": {
-                                                 "type": "string"
-                                             },
-                                             "dimensions": {
-                                                 "type": "object"
-                                             },
-                                             "precision": {
-                                                 "type": "integer"
-                                             }
-                                         }
-                                     }
-                                 })
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string", "description": "The city name"},
+                        "state": {"type": "string", "description": "The state code"},
+                        "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]},
+                    },
+                    "required": ["city", "state"],
+                },
+            },
+        ),
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "calculate_area",
+                "description": "Calculate area of a shape",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "shape": {"type": "string"},
+                        "dimensions": {"type": "object"},
+                        "precision": {"type": "integer"},
+                    },
+                },
+            },
+        ),
     ]
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         # Qwen3 parser doesn't generate IDs during extraction
         assert actual_tool_call.type == "function"
-        assert (
-            actual_tool_call.function.name == expected_tool_call.function.name)
-        assert (json.loads(actual_tool_call.function.arguments) == json.loads(
-            expected_tool_call.function.arguments))
+        assert actual_tool_call.function.name == expected_tool_call.function.name
+        assert json.loads(actual_tool_call.function.arguments) == json.loads(
+            expected_tool_call.function.arguments
+        )
 
 
 def stream_delta_message_generator(
     qwen3_tool_parser,
     qwen3_tokenizer: AnyTokenizer,
     model_output: str,
-    request: Optional[ChatCompletionRequest] = None
+    request: ChatCompletionRequest | None = None,
 ) -> Generator[DeltaMessage, None, None]:
-    all_token_ids = qwen3_tokenizer.encode(model_output,
-                                           add_special_tokens=False)
+    all_token_ids = qwen3_tokenizer.encode(model_output, add_special_tokens=False)
 
     previous_text = ""
     previous_tokens = None
@@ -127,18 +117,19 @@ def stream_delta_message_generator(
     for i, delta_token in enumerate(all_token_ids):
         delta_token_ids = [delta_token]
         previous_token_ids = all_token_ids[:i]
-        current_token_ids = all_token_ids[:i + 1]
-
-        (new_tokens, delta_text, new_prefix_offset,
-         new_read_offset) = detokenize_incrementally(
-             tokenizer=qwen3_tokenizer,
-             all_input_ids=current_token_ids,
-             prev_tokens=previous_tokens,
-             prefix_offset=prefix_offset,
-             read_offset=read_offset,
-             skip_special_tokens=False,
-             spaces_between_special_tokens=True,
-         )
+        current_token_ids = all_token_ids[: i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
+            detokenize_incrementally(
+                tokenizer=qwen3_tokenizer,
+                all_input_ids=current_token_ids,
+                prev_tokens=previous_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=False,
+                spaces_between_special_tokens=True,
+            )
+        )
 
         current_text = previous_text + delta_text
 
@@ -155,8 +146,9 @@ def stream_delta_message_generator(
             yield delta_message
 
         previous_text = current_text
-        previous_tokens = (previous_tokens +
-                           new_tokens if previous_tokens else new_tokens)
+        previous_tokens = (
+            previous_tokens + new_tokens if previous_tokens else new_tokens
+        )
         prefix_offset = new_prefix_offset
         read_offset = new_read_offset
 
@@ -164,7 +156,8 @@ def stream_delta_message_generator(
 def test_extract_tool_calls_no_tools(qwen3_tool_parser_parametrized):
     model_output = "This is a test response without any tool calls"
     extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
     assert extracted_tool_calls.content == model_output
@@ -180,7 +173,8 @@ def test_extract_tool_calls_no_tools(qwen3_tool_parser_parametrized):
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
-        ('''<tool_call>
+        (
+            """<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -192,16 +186,21 @@ TX
 fahrenheit
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Dallas",
-                                          "state": "TX",
-                                          "unit": "fahrenheit"
-                                      })))
-        ], None),
-        ('''Sure! Let me check the weather for you.<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """Sure! Let me check the weather for you.<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -213,16 +212,21 @@ TX
 fahrenheit
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Dallas",
-                                          "state": "TX",
-                                          "unit": "fahrenheit"
-                                      })))
-        ], "Sure! Let me check the weather for you."),
-        ('''<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
+            ],
+            "Sure! Let me check the weather for you.",
+        ),
+        (
+            """<tool_call>
 <function=calculate_area>
 <parameter=shape>
 rectangle
@@ -235,18 +239,25 @@ rectangle
 2
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(function=FunctionCall(name="calculate_area",
-                                           arguments=json.dumps({
-                                               "shape": "rectangle",
-                                               "dimensions": {
-                                                   "width": 10,
-                                                   "height": 20
-                                               },
-                                               "precision": 2
-                                           })))
-        ], None),
-        ('''<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="calculate_area",
+                        arguments=json.dumps(
+                            {
+                                "shape": "rectangle",
+                                "dimensions": {"width": 10, "height": 20},
+                                "precision": 2,
+                            }
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -271,23 +282,29 @@ FL
 fahrenheit
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Dallas",
-                                          "state": "TX",
-                                          "unit": "fahrenheit"
-                                      }))),
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Orlando",
-                                          "state": "FL",
-                                          "unit": "fahrenheit"
-                                      })))
-        ], None),
-        ('''Let me calculate that area for you.<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
+            ],
+            None,
+        ),
+        (
+            """Let me calculate that area for you.<tool_call>
 <function=calculate_area>
 <parameter=shape>
 circle
@@ -299,26 +316,36 @@ circle
 3
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(function=FunctionCall(name="calculate_area",
-                                           arguments=json.dumps({
-                                               "shape": "circle",
-                                               "dimensions": {
-                                                   "radius": 15.5
-                                               },
-                                               "precision": 3
-                                           })))
-        ], "Let me calculate that area for you."),
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="calculate_area",
+                        arguments=json.dumps(
+                            {
+                                "shape": "circle",
+                                "dimensions": {"radius": 15.5},
+                                "precision": 3,
+                            }
+                        ),
+                    )
+                )
+            ],
+            "Let me calculate that area for you.",
+        ),
     ],
 )
-def test_extract_tool_calls(qwen3_tool_parser_parametrized, sample_tools,
-                            model_output, expected_tool_calls,
-                            expected_content):
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+def test_extract_tool_calls(
+    qwen3_tool_parser_parametrized,
+    sample_tools,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
     extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
-        model_output, request=request)
+        model_output, request=request
+    )
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -326,60 +353,51 @@ def test_extract_tool_calls(qwen3_tool_parser_parametrized, sample_tools,
     assert extracted_tool_calls.content == expected_content
 
 
-def test_extract_tool_calls_fallback_no_tags(qwen3_tool_parser_parametrized,
-                                             sample_tools):
+def test_extract_tool_calls_fallback_no_tags(
+    qwen3_tool_parser_parametrized, sample_tools
+):
     """Test fallback parsing when XML tags are missing"""
-    model_output = '''<function=get_current_weather>
+    model_output = """<function=get_current_weather>
 <parameter=city>
 Dallas
 </parameter>
 <parameter=state>
 TX
 </parameter>
-</function>'''
+</function>"""
 
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
     extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
-        model_output, request=request)
+        model_output, request=request
+    )
 
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 1
-    assert (extracted_tool_calls.tool_calls[0].function.name ==
-            "get_current_weather")
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
 
 
 def test_extract_tool_calls_type_conversion(qwen3_tool_parser_parametrized):
     """Test parameter type conversion based on tool schema"""
     tools = [
-        ChatCompletionToolsParam(type="function",
-                                 function={
-                                     "name": "test_types",
-                                     "parameters": {
-                                         "type": "object",
-                                         "properties": {
-                                             "int_param": {
-                                                 "type": "integer"
-                                             },
-                                             "float_param": {
-                                                 "type": "float"
-                                             },
-                                             "bool_param": {
-                                                 "type": "boolean"
-                                             },
-                                             "str_param": {
-                                                 "type": "string"
-                                             },
-                                             "obj_param": {
-                                                 "type": "object"
-                                             }
-                                         }
-                                     }
-                                 })
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "test_types",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "int_param": {"type": "integer"},
+                        "float_param": {"type": "float"},
+                        "bool_param": {"type": "boolean"},
+                        "str_param": {"type": "string"},
+                        "obj_param": {"type": "object"},
+                    },
+                },
+            },
+        )
     ]
 
-    model_output = '''<tool_call>
+    model_output = """<tool_call>
 <function=test_types>
 <parameter=int_param>
 42
@@ -397,11 +415,12 @@ hello world
 {"key": "value"}
 </parameter>
 </function>
-</tool_call>'''
+</tool_call>"""
 
     request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
     extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
-        model_output, request=request)
+        model_output, request=request
+    )
 
     args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
     assert args["int_param"] == 42
@@ -423,7 +442,8 @@ hello world
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
         ("This is a test without tools", [], "This is a test without tools"),
-        ('''<tool_call>
+        (
+            """<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -435,16 +455,21 @@ TX
 fahrenheit
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Dallas",
-                                          "state": "TX",
-                                          "unit": "fahrenheit"
-                                      })))
-        ], None),
-        ('''Sure! Let me check the weather for you.<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """Sure! Let me check the weather for you.<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -456,16 +481,21 @@ TX
 fahrenheit
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Dallas",
-                                          "state": "TX",
-                                          "unit": "fahrenheit"
-                                      })))
-        ], "Sure! Let me check the weather for you."),
-        ('''<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                )
+            ],
+            "Sure! Let me check the weather for you.",
+        ),
+        (
+            """<tool_call>
 <function=calculate_area>
 <parameter=shape>
 rectangle
@@ -478,18 +508,25 @@ rectangle
 2
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(function=FunctionCall(name="calculate_area",
-                                           arguments=json.dumps({
-                                               "shape": "rectangle",
-                                               "dimensions": {
-                                                   "width": 10,
-                                                   "height": 20
-                                               },
-                                               "precision": 2
-                                           })))
-        ], None),
-        ('''<tool_call>
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="calculate_area",
+                        arguments=json.dumps(
+                            {
+                                "shape": "rectangle",
+                                "dimensions": {"width": 10, "height": 20},
+                                "precision": 2,
+                            }
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -514,24 +551,30 @@ FL
 celsius
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Dallas",
-                                          "state": "TX",
-                                          "unit": "fahrenheit"
-                                      }))),
-            ToolCall(
-                function=FunctionCall(name="get_current_weather",
-                                      arguments=json.dumps({
-                                          "city": "Orlando",
-                                          "state": "FL",
-                                          "unit": "celsius"
-                                      })))
-        ], None),
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "Orlando", "state": "FL", "unit": "celsius"}
+                        ),
+                    )
+                ),
+            ],
+            None,
+        ),
         # Added tool_with_typed_params test case
-        ('''Let me calculate that area for you.<tool_call>
+        (
+            """Let me calculate that area for you.<tool_call>
 <function=calculate_area>
 <parameter=shape>
 circle
@@ -543,33 +586,42 @@ circle
 3
 </parameter>
 </function>
-</tool_call>''', [
-            ToolCall(function=FunctionCall(name="calculate_area",
-                                           arguments=json.dumps({
-                                               "shape": "circle",
-                                               "dimensions": {
-                                                   "radius": 15.5
-                                               },
-                                               "precision": 3
-                                           })))
-        ], "Let me calculate that area for you."),
+</tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="calculate_area",
+                        arguments=json.dumps(
+                            {
+                                "shape": "circle",
+                                "dimensions": {"radius": 15.5},
+                                "precision": 3,
+                            }
+                        ),
+                    )
+                )
+            ],
+            "Let me calculate that area for you.",
+        ),
     ],
 )
-def test_extract_tool_calls_streaming(qwen3_tool_parser_parametrized,
-                                      qwen3_tokenizer, sample_tools,
-                                      model_output, expected_tool_calls,
-                                      expected_content):
+def test_extract_tool_calls_streaming(
+    qwen3_tool_parser_parametrized,
+    qwen3_tokenizer,
+    sample_tools,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
     """Test incremental streaming behavior including typed parameters"""
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
 
-    other_content = ''
+    other_content = ""
     tool_states = {}  # Track state per tool index
 
     for delta_message in stream_delta_message_generator(
-            qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output,
-            request):
+        qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
+    ):
         # role should never be streamed from tool parser
         assert not delta_message.role
 
@@ -586,7 +638,7 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser_parametrized,
                         "id": None,
                         "name": None,
                         "arguments": "",
-                        "type": None
+                        "type": None,
                     }
 
                 # First chunk should have id, name, and type
@@ -605,14 +657,16 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser_parametrized,
 
                     if tool_call.function.arguments is not None:
                         # Accumulate arguments incrementally
-                        tool_states[idx][
-                            "arguments"] += tool_call.function.arguments
+                        tool_states[idx]["arguments"] += tool_call.function.arguments
 
     # Verify final content
     assert other_content == (expected_content or "")  # Handle None case
 
     # Verify we got all expected tool calls
     assert len(tool_states) == len(expected_tool_calls)
+    assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len(
+        expected_tool_calls
+    )
 
     # Verify each tool call
     for idx, expected_tool in enumerate(expected_tool_calls):
@@ -630,10 +684,11 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser_parametrized,
 
 
 def test_extract_tool_calls_missing_closing_parameter_tag(
-        qwen3_tool_parser_parametrized, sample_tools):
+    qwen3_tool_parser_parametrized, sample_tools
+):
     """Test handling of missing closing </parameter> tag"""
     # Using get_current_weather from sample_tools but with malformed XML
-    model_output = '''Let me check the weather for you:
+    model_output = """Let me check the weather for you:
 <tool_call>
 <function=get_current_weather>
 <parameter=city>
@@ -645,21 +700,19 @@ TX
 fahrenheit
 </parameter>
 </function>
-</tool_call>'''
+</tool_call>"""
 
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
     extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
-        model_output, request=request)
+        model_output, request=request
+    )
 
     # The parser should handle the malformed XML gracefully
     assert extracted_tool_calls.tools_called
     assert len(extracted_tool_calls.tool_calls) == 1
 
     # Verify the function name is correct
-    assert extracted_tool_calls.tool_calls[
-        0].function.name == "get_current_weather"
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
 
     # Verify the arguments are parsed despite the missing closing tag
     args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
@@ -673,10 +726,11 @@ fahrenheit
 
 
 def test_extract_tool_calls_streaming_missing_closing_tag(
-        qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools):
+    qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools
+):
     """Test streaming with missing closing </parameter> tag"""
     # Using get_current_weather from sample_tools but with malformed XML
-    model_output = '''Let me check the weather for you:
+    model_output = """Let me check the weather for you:
 <tool_call>
 <function=get_current_weather>
 <parameter=city>
@@ -688,19 +742,16 @@ TX
 fahrenheit
 </parameter>
 </function>
-</tool_call>'''
+</tool_call>"""
 
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
 
-    other_content = ''
+    other_content = ""
     tool_states = {}
 
     for delta_message in stream_delta_message_generator(
-            qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output,
-            request):
-
+        qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
+    ):
         if delta_message.content:
             other_content += delta_message.content
 
@@ -713,7 +764,7 @@ fahrenheit
                         "id": None,
                         "name": None,
                         "arguments": "",
-                        "type": None
+                        "type": None,
                     }
 
                 if tool_call.id:
@@ -728,14 +779,14 @@ fahrenheit
                         tool_states[idx]["name"] = tool_call.function.name
 
                     if tool_call.function.arguments is not None:
-                        tool_states[idx][
-                            "arguments"] += tool_call.function.arguments
+                        tool_states[idx]["arguments"] += tool_call.function.arguments
 
     # Verify content was streamed
     assert "Let me check the weather for you:" in other_content
-
     # Verify we got the tool call
     assert len(tool_states) == 1
+    assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1
+
     state = tool_states[0]
     assert state["id"] is not None
     assert state["type"] == "function"
@@ -750,9 +801,10 @@ fahrenheit
 
 
 def test_extract_tool_calls_streaming_incremental(
-        qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools):
+    qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools
+):
     """Test that streaming is truly incremental"""
-    model_output = '''I'll check the weather.<tool_call>
+    model_output = """I'll check the weather.<tool_call>
 <function=get_current_weather>
 <parameter=city>
 Dallas
@@ -761,16 +813,14 @@ Dallas
 TX
 </parameter>
 </function>
-</tool_call>'''
+</tool_call>"""
 
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
 
     chunks = []
     for delta_message in stream_delta_message_generator(
-            qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output,
-            request):
+        qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
+    ):
         chunks.append(delta_message)
 
     # Should have multiple chunks
@@ -785,7 +835,7 @@ TX
     for chunk in chunks:
         if chunk.tool_calls and chunk.tool_calls[0].id:
             header_found = True
-            assert (chunk.tool_calls[0].function.name == "get_current_weather")
+            assert chunk.tool_calls[0].function.name == "get_current_weather"
             assert chunk.tool_calls[0].type == "function"
             # Empty initially
             assert chunk.tool_calls[0].function.arguments == ""
@@ -809,46 +859,120 @@ TX
 
 
 def test_extract_tool_calls_complex_type_with_single_quote(
-        qwen3_tool_parser_parametrized):
+    qwen3_tool_parser_parametrized,
+):
     """Test parameter type conversion based on tool schema"""
     tools = [
-        ChatCompletionToolsParam(type="function",
-                                 function={
-                                     "name": "test_types",
-                                     "parameters": {
-                                         "type": "object",
-                                         "properties": {
-                                             "int_param": {
-                                                 "type": "integer"
-                                             },
-                                             "float_param": {
-                                                 "type": "float"
-                                             },
-                                             "bool_param": {
-                                                 "type": "boolean"
-                                             },
-                                             "str_param": {
-                                                 "type": "string"
-                                             },
-                                             "obj_param": {
-                                                 "type": "object"
-                                             }
-                                         }
-                                     }
-                                 })
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "test_types",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "int_param": {"type": "integer"},
+                        "float_param": {"type": "float"},
+                        "bool_param": {"type": "boolean"},
+                        "str_param": {"type": "string"},
+                        "obj_param": {"type": "object"},
+                    },
+                },
+            },
+        )
     ]
 
-    model_output = '''<tool_call>
+    model_output = """<tool_call>
 <function=test_types>
 <parameter=obj_param>
 {'key': 'value'}
 </parameter>
 </function>
-</tool_call>'''
+</tool_call>"""
 
     request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
     extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
-        model_output, request=request)
+        model_output, request=request
+    )
 
     args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
     assert args["obj_param"] == {"key": "value"}
+
+
+def test_extract_tool_calls_streaming_missing_opening_tag(
+    qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools
+):
+    """Test streaming with missing opening <tool_call> tag
+
+    This tests that the streaming parser correctly handles
+    tool calls that start directly with <function=...>
+    """
+    model_output = """I'll check the weather for you.
+
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
+
+    other_content = ""
+    tool_states = {}
+
+    for delta_message in stream_delta_message_generator(
+        qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
+    ):
+        if delta_message.content:
+            other_content += delta_message.content
+
+        if delta_message.tool_calls:
+            for tool_call in delta_message.tool_calls:
+                idx = tool_call.index
+
+                if idx not in tool_states:
+                    tool_states[idx] = {
+                        "id": None,
+                        "name": None,
+                        "arguments": "",
+                        "type": None,
+                    }
+
+                if tool_call.id:
+                    tool_states[idx]["id"] = tool_call.id
+
+                if tool_call.type:
+                    assert tool_call.type == "function"
+                    tool_states[idx]["type"] = tool_call.type
+
+                if tool_call.function:
+                    if tool_call.function.name:
+                        tool_states[idx]["name"] = tool_call.function.name
+
+                    if tool_call.function.arguments is not None:
+                        tool_states[idx]["arguments"] += tool_call.function.arguments
+
+    # Verify content was streamed
+    assert "I'll check the weather for you." in other_content
+
+    # Verify we got the tool call
+    assert len(tool_states) == 1
+    assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1
+
+    state = tool_states[0]
+    assert state["id"] is not None
+    assert state["type"] == "function"
+    assert state["name"] == "get_current_weather"
+
+    # Verify arguments were parsed correctly despite missing opening tag
+    assert state["arguments"] is not None
+    args = json.loads(state["arguments"])
+    assert args["city"] == "Dallas"
+    assert args["state"] == "TX"
+    assert args["unit"] == "fahrenheit"
diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py
index 118c7534622e20d9cc6be05052c6a41aa2e394ef..1367ad87cb01926669254d793519b793649326f4 100644
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@@ -4,18 +4,22 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam,
-                                              DeltaMessage, FunctionCall,
-                                              ToolCall)
-from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+    DeltaMessage,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 # Use a common model that is likely to be available
 MODEL = "ByteDance-Seed/Seed-OSS-36B-Instruct"
 
@@ -43,51 +47,56 @@ def sample_tools():
                     "properties": {
                         "location": {
                             "type": "string",
-                            "description":
-                            "City and country e.g. Bogotá, Colombia"
+                            "description": "City and country e.g. Bogotá, Colombia",
                         },
                         "unit": {
                             "type": "string",
-                            "description": "this is the unit of temperature"
-                        }
+                            "description": "this is the unit of temperature",
+                        },
                     },
                     "required": ["location"],
-                    "additionalProperties": False
+                    "additionalProperties": False,
                 },
                 "returns": {
                     "type": "object",
                     "properties": {
                         "temperature": {
                             "type": "number",
-                            "description": "temperature in celsius"
+                            "description": "temperature in celsius",
                         }
                     },
                     "required": ["temperature"],
-                    "additionalProperties": False
+                    "additionalProperties": False,
                 },
-                "strict": True
-            }),
+                "strict": True,
+            },
+        ),
     ]
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         # Seed-OSS tool call will not generate id
         assert actual_tool_call.type == "function"
         assert actual_tool_call.function == expected_tool_call.function
 
         assert actual_tool_call.function.name == expected_tool_call.function.name
-        assert actual_tool_call.function.arguments == expected_tool_call.function.arguments
+        assert (
+            actual_tool_call.function.arguments == expected_tool_call.function.arguments
+        )
 
 
 def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
     model_output = "This is a test response without any tool calls"
     extracted_tool_calls = seed_oss_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
 
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
@@ -102,17 +111,24 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
-        ("""<seed:tool_call>\n<function=get_weather>\n"""
-         """<parameter=location>Barcelona, Spain</parameter>\n</function>\n</seed:tool_call>""",
-         [
-             ToolCall(function=FunctionCall(
-                 name="get_weather",
-                 arguments=json.dumps({
-                     "location": "Barcelona, Spain",
-                 }, ),
-             ),
-                      type='function')
-         ], None),
+        (
+            """<seed:tool_call>\n<function=get_weather>\n"""
+            """<parameter=location>Barcelona, Spain</parameter>\n</function>\n</seed:tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "location": "Barcelona, Spain",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
+            ],
+            None,
+        ),
         (
             """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
             """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
@@ -129,13 +145,17 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
             """<seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, Spain</parameter>\n</function>"""
             """\n</seed:tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_weather",
-                    arguments=json.dumps({
-                        "location": "Barcelona, Spain",
-                    }, ),
-                ),
-                         type='function')
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "location": "Barcelona, Spain",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
             ],
             """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
             """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
@@ -167,15 +187,18 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
             """temperature in Celsius.</seed:think><seed:tool_call>\n<function=get_weather>\n<parameter=location>"""
             """Barcelona, Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_weather",
-                    arguments=json.dumps(
-                        {
-                            "location": "Barcelona, Spain",
-                            "unit": "celsius",
-                        }, ),
-                ),
-                         type='function')
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "location": "Barcelona, Spain",
+                                "unit": "celsius",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
             ],
             """<seed:think>\nGot it, let\'s see. The user asked for the weather in Barcelona, Spain. """
             """First, I need to remember the function I can use: get_weather. The function requires a """
@@ -194,13 +217,17 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
         ),
     ],
 )
-def test_extract_tool_calls(seed_oss_tool_parser, sample_tools, model_output,
-                            expected_tool_calls, expected_content):
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+def test_extract_tool_calls(
+    seed_oss_tool_parser,
+    sample_tools,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
     extracted_tool_calls = seed_oss_tool_parser.extract_tool_calls(
-        model_output, request=request)  # type: ignore[arg-type]
+        model_output, request=request
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -223,7 +250,7 @@ def test_streaming_tool_calls_no_tools(seed_oss_tool_parser):
 
     # Should return the delta text as content
     assert result is not None
-    assert hasattr(result, 'content')
+    assert hasattr(result, "content")
     assert result.content == " without any tool calls."
 
 
@@ -231,10 +258,9 @@ def stream_delta_message_generator(
     seed_oss_tool_parser: SeedOssToolParser,
     seed_oss_tokenizer: AnyTokenizer,
     model_output: str,
-    request: Optional[ChatCompletionRequest] = None
+    request: ChatCompletionRequest | None = None,
 ) -> Generator[DeltaMessage, None, None]:
-    all_token_ids = seed_oss_tokenizer.encode(model_output,
-                                              add_special_tokens=False)
+    all_token_ids = seed_oss_tokenizer.encode(model_output, add_special_tokens=False)
 
     previous_text = ""
     previous_tokens = None
@@ -243,18 +269,19 @@ def stream_delta_message_generator(
     for i, delta_token in enumerate(all_token_ids):
         delta_token_ids = [delta_token]
         previous_token_ids = all_token_ids[:i]
-        current_token_ids = all_token_ids[:i + 1]
-
-        (new_tokens, delta_text, new_prefix_offset,
-         new_read_offset) = detokenize_incrementally(
-             tokenizer=seed_oss_tokenizer,
-             all_input_ids=current_token_ids,
-             prev_tokens=previous_tokens,
-             prefix_offset=prefix_offset,
-             read_offset=read_offset,
-             skip_special_tokens=False,
-             spaces_between_special_tokens=True,
-         )
+        current_token_ids = all_token_ids[: i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
+            detokenize_incrementally(
+                tokenizer=seed_oss_tokenizer,
+                all_input_ids=current_token_ids,
+                prev_tokens=previous_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=False,
+                spaces_between_special_tokens=True,
+            )
+        )
 
         current_text = previous_text + delta_text
 
@@ -271,8 +298,9 @@ def stream_delta_message_generator(
             yield delta_message
 
         previous_text = current_text
-        previous_tokens = (previous_tokens +
-                           new_tokens if previous_tokens else new_tokens)
+        previous_tokens = (
+            previous_tokens + new_tokens if previous_tokens else new_tokens
+        )
         prefix_offset = new_prefix_offset
         read_offset = new_read_offset
 
@@ -285,22 +313,27 @@ def stream_delta_message_generator(
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
-        ("""<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
-         """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
-         """<seed:tool_call>\n<function=get_weather>\n"""
-         """<parameter=location>Barcelona, Spain</parameter>\n</function>\n</seed:tool_call>""",
-         [
-             ToolCall(function=FunctionCall(
-                 name="get_weather",
-                 arguments=json.dumps({
-                     "location": "Barcelona, Spain",
-                 }, ),
-             ),
-                      type='function')
-         ],
-         """<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
-         """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
-         ),
+        (
+            """<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
+            """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
+            """<seed:tool_call>\n<function=get_weather>\n"""
+            """<parameter=location>Barcelona, Spain</parameter>\n</function>\n</seed:tool_call>""",
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "location": "Barcelona, Spain",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
+            ],
+            """<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
+            """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n""",
+        ),
         (
             """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
             """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
@@ -317,13 +350,17 @@ def stream_delta_message_generator(
             """<seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, Spain</parameter>\n</function>"""
             """\n</seed:tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_weather",
-                    arguments=json.dumps({
-                        "location": "Barcelona, Spain",
-                    }, ),
-                ),
-                         type='function')
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "location": "Barcelona, Spain",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
             ],
             """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
             """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
@@ -355,15 +392,18 @@ def stream_delta_message_generator(
             """temperature in Celsius.</seed:think><seed:tool_call>\n<function=get_weather>\n<parameter=location>"""
             """Barcelona, Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>""",
             [
-                ToolCall(function=FunctionCall(
-                    name="get_weather",
-                    arguments=json.dumps(
-                        {
-                            "location": "Barcelona, Spain",
-                            "unit": "celsius",
-                        }, ),
-                ),
-                         type='function')
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_weather",
+                        arguments=json.dumps(
+                            {
+                                "location": "Barcelona, Spain",
+                                "unit": "celsius",
+                            },
+                        ),
+                    ),
+                    type="function",
+                )
             ],
             """<seed:think>\nGot it, let\'s see. The user asked for the weather in Barcelona, Spain. """
             """First, I need to remember the function I can use: get_weather. The function requires a """
@@ -382,19 +422,23 @@ def stream_delta_message_generator(
         ),
     ],
 )
-def test_streaming_tool_calls(seed_oss_tool_parser, seed_oss_tokenizer,
-                              sample_tools, model_output, expected_tool_calls,
-                              expected_content):
+def test_streaming_tool_calls(
+    seed_oss_tool_parser,
+    seed_oss_tokenizer,
+    sample_tools,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
     """Test incremental streaming behavior"""
-    request = ChatCompletionRequest(model=MODEL,
-                                    messages=[],
-                                    tools=sample_tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
 
-    other_content = ''
+    other_content = ""
     tool_states = {}  # Track state per tool index
 
     for delta_message in stream_delta_message_generator(
-            seed_oss_tool_parser, seed_oss_tokenizer, model_output, request):
+        seed_oss_tool_parser, seed_oss_tokenizer, model_output, request
+    ):
         # role should never be streamed from tool parser
         assert not delta_message.role
 
@@ -411,7 +455,7 @@ def test_streaming_tool_calls(seed_oss_tool_parser, seed_oss_tokenizer,
                         "id": None,
                         "name": None,
                         "arguments": "",
-                        "type": None
+                        "type": None,
                     }
 
                 # First chunk should have id, name, and type
@@ -430,8 +474,7 @@ def test_streaming_tool_calls(seed_oss_tool_parser, seed_oss_tokenizer,
 
                     if tool_call.function.arguments is not None:
                         # Accumulate arguments incrementally
-                        tool_states[idx][
-                            "arguments"] += tool_call.function.arguments
+                        tool_states[idx]["arguments"] += tool_call.function.arguments
 
     # Verify final content
     assert other_content == expected_content
diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py
index 53ba03a0ae1091d83c675a0c61371ebddfb92c31..6614b6415a04feeee2e7f0c9659d8389efe7b474 100644
--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -2,13 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
-from typing import Optional
 
 import openai
 import pytest
 
-from .utils import (MESSAGES_ASKING_FOR_TOOLS, MESSAGES_WITH_TOOL_RESPONSE,
-                    SEARCH_TOOL, WEATHER_TOOL)
+from .utils import (
+    MESSAGES_ASKING_FOR_TOOLS,
+    MESSAGES_WITH_TOOL_RESPONSE,
+    SEARCH_TOOL,
+    WEATHER_TOOL,
+)
 
 
 # test: request a chat completion that should return tool calls, so we know they
@@ -23,17 +26,18 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
         max_completion_tokens=100,
         model=model_name,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
-        logprobs=False)
+        logprobs=False,
+    )
 
     choice = chat_completion.choices[0]
     stop_reason = chat_completion.choices[0].finish_reason
     tool_calls = chat_completion.choices[0].message.tool_calls
 
     # make sure a tool call is present
-    assert choice.message.role == 'assistant'
+    assert choice.message.role == "assistant"
     assert tool_calls is not None
     assert len(tool_calls) == 1
-    assert tool_calls[0].type == 'function'
+    assert tool_calls[0].type == "function"
     assert tool_calls[0].function is not None
     assert isinstance(tool_calls[0].id, str)
     assert len(tool_calls[0].id) >= 9
@@ -53,10 +57,10 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
 
     assert stop_reason == "tool_calls"
 
-    function_name: Optional[str] = None
-    function_args_str: str = ''
-    tool_call_id: Optional[str] = None
-    role_name: Optional[str] = None
+    function_name: str | None = None
+    function_args_str: str = ""
+    tool_call_id: str | None = None
+    role_name: str | None = None
     finish_reason_count: int = 0
 
     # make the same request, streaming
@@ -67,20 +71,21 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
         max_completion_tokens=100,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
         logprobs=False,
-        stream=True)
+        stream=True,
+    )
 
     async for chunk in stream:
         assert chunk.choices[0].index == 0
 
         if chunk.choices[0].finish_reason:
             finish_reason_count += 1
-            assert chunk.choices[0].finish_reason == 'tool_calls'
+            assert chunk.choices[0].finish_reason == "tool_calls"
 
         # if a role is being streamed make sure it wasn't already set to
         # something else
         if chunk.choices[0].delta.role:
-            assert not role_name or role_name == 'assistant'
-            role_name = 'assistant'
+            assert not role_name or role_name == "assistant"
+            role_name = "assistant"
 
         # if a tool call is streamed make sure there's exactly one
         # (based on the request parameters
@@ -108,7 +113,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
                     function_args_str += tool_call.function.arguments
 
     assert finish_reason_count == 1
-    assert role_name == 'assistant'
+    assert role_name == "assistant"
     assert isinstance(tool_call_id, str) and (len(tool_call_id) >= 9)
 
     # validate the name and arguments
@@ -148,14 +153,14 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI):
         max_completion_tokens=100,
         model=model_name,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
-        logprobs=False)
+        logprobs=False,
+    )
 
     choice = chat_completion.choices[0]
 
     assert choice.finish_reason != "tool_calls"  # "stop" or "length"
     assert choice.message.role == "assistant"
-    assert choice.message.tool_calls is None \
-           or len(choice.message.tool_calls) == 0
+    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0
     assert choice.message.content is not None
     assert "98" in choice.message.content  # the temperature from the response
 
@@ -166,7 +171,8 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI):
         model=model_name,
         tools=[WEATHER_TOOL, SEARCH_TOOL],
         logprobs=False,
-        stream=True)
+        stream=True,
+    )
 
     chunks: list[str] = []
     finish_reason_count = 0
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index 130e9547bdccb8c288164cb7e4a18808cc484214..d5572cfbebe3c74b98f07149100628927fd44013 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -8,9 +8,13 @@ import pytest
 import regex as re
 from pydantic import TypeAdapter
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionToolsParam,
+)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
+
+pytestmark = pytest.mark.cpu_test
 
 EXAMPLE_TOOLS = [
     {
@@ -22,18 +26,16 @@ EXAMPLE_TOOLS = [
                 "type": "object",
                 "properties": {
                     "city": {
-                        "type":
-                        "string",
-                        "description":
-                        "The city to find the weather for"
+                        "type": "string",
+                        "description": "The city to find the weather for"
                         ", e.g. 'San Francisco'",
                     },
                 },
                 "required": ["city"],
-                "additionalProperties": False
+                "additionalProperties": False,
             },
         },
-        "strict": True
+        "strict": True,
     },
     {
         "type": "function",
@@ -44,35 +46,35 @@ EXAMPLE_TOOLS = [
                 "type": "object",
                 "properties": {
                     "city": {
-                        "type":
-                        "string",
-                        "description":
-                        "The city to get the forecast for, e.g. 'New York'",
+                        "type": "string",
+                        "description": "The city to get the forecast for, e.g. "
+                        "'New York'",
                     },
                     "days": {
-                        "type":
-                        "integer",
-                        "description":
-                        "Number of days to get the forecast for (1-7)",
+                        "type": "integer",
+                        "description": "Number of days to get the forecast for (1-7)",
                     },
                 },
                 "required": ["city", "days"],
-                "additionalProperties": False
+                "additionalProperties": False,
             },
         },
-        "strict": True
+        "strict": True,
     },
 ]
 
 
-def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
-                       should_match: bool):
-    self = MagicMock(tool_choice="required", tools=tools)
-    schema = ChatCompletionRequest._get_json_schema_from_tool(self)
+def _compile_and_check(
+    tools: list[ChatCompletionToolsParam], sample_output, should_match: bool
+):
+    # self = MagicMock(tool_choice="required", tools=tools)
+    # schema = ChatCompletionRequest._get_json_schema_from_tool(self)
+    schema = get_json_schema_from_tools(tools=tools, tool_choice="required")
     assert isinstance(schema, dict)
 
     # use build_regex_from_schema used in JSONLogitsProcessor to create Guide
     from outlines_core.json_schema import build_regex_from_schema
+
     regex = build_regex_from_schema(json.dumps(schema))
     compiled = re.compile(regex)
     matches = compiled.fullmatch(json.dumps(sample_output)) is not None
@@ -81,65 +83,31 @@ def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
 
 
 VALID_TOOL_OUTPUTS = [
-    ([{
-        "name": "get_current_weather",
-        "parameters": {
-            "city": "Vienna"
-        }
-    }], True),
-    ([{
-        "name": "get_current_weather",
-        "parameters": {
-            "city": "Vienna"
-        }
-    }, {
-        "name": "get_current_weather",
-        "parameters": {
-            "city": "Berlin"
-        }
-    }], True),
-    ([{
-        "name": "get_forecast",
-        "parameters": {
-            "city": "Vienna",
-            "days": 7
-        }
-    }], True),
-    ([{
-        "name": "get_forecast",
-        "parameters": {
-            "city": "Vienna",
-            "days": 7
-        }
-    }, {
-        "name": "get_current_weather",
-        "parameters": {
-            "city": "Vienna"
-        }
-    }], True),
-    ([{
-        "name": "get_forecast",
-        "parameters": {
-            "city": "Vienna",
-            "days": 7
-        }
-    }, {
-        "name": "get_current_weather",
-        "parameters": {
-            "city": "Vienna"
-        }
-    }, {
-        "name": "get_forecast",
-        "parameters": {
-            "city": "Berlin",
-            "days": 7
-        }
-    }, {
-        "name": "get_current_weather",
-        "parameters": {
-            "city": "Berlin"
-        }
-    }], True),
+    ([{"name": "get_current_weather", "parameters": {"city": "Vienna"}}], True),
+    (
+        [
+            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
+            {"name": "get_current_weather", "parameters": {"city": "Berlin"}},
+        ],
+        True,
+    ),
+    ([{"name": "get_forecast", "parameters": {"city": "Vienna", "days": 7}}], True),
+    (
+        [
+            {"name": "get_forecast", "parameters": {"city": "Vienna", "days": 7}},
+            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
+        ],
+        True,
+    ),
+    (
+        [
+            {"name": "get_forecast", "parameters": {"city": "Vienna", "days": 7}},
+            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
+            {"name": "get_forecast", "parameters": {"city": "Berlin", "days": 7}},
+            {"name": "get_current_weather", "parameters": {"city": "Berlin"}},
+        ],
+        True,
+    ),
 ]
 
 VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
@@ -147,92 +115,100 @@ VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
 
 @pytest.mark.parametrize(
     "sample_output, should_match",
-    VALID_TOOL_OUTPUTS + [
+    VALID_TOOL_OUTPUTS
+    + [
         (None, False),
         ([], False),  # empty list cannot be generated
         ({}, False),  # empty object cannot be generated
         ([{}], False),  # list with empty object cannot be generated
         (
-            [{  # function without required parameters cannot be generated
-                "name": "get_current_weather"
-            }],
-            False),
+            [
+                {  # function without required parameters cannot be generated
+                    "name": "get_current_weather"
+                }
+            ],
+            False,
+        ),
         (
-            [{  # function without required parameters cannot be generated
-                "name": "get_current_weather",
-                "parameters": {}
-            }],
-            False),
+            [
+                {  # function without required parameters cannot be generated
+                    "name": "get_current_weather",
+                    "parameters": {},
+                }
+            ],
+            False,
+        ),
         (
-            [{  # function without required parameters cannot be generated
-                "name": "get_current_weather",
-                "parameters": None
-            }],
-            False),
+            [
+                {  # function without required parameters cannot be generated
+                    "name": "get_current_weather",
+                    "parameters": None,
+                }
+            ],
+            False,
+        ),
         (
             {  # tool call without lists cannot be generated
                 "name": "get_current_weather",
-                "parameters": {
-                    "city": "Vienna"
-                }
+                "parameters": {"city": "Vienna"},
             },
-            False),
+            False,
+        ),
         (
-            [{  # tool call with extra parameters cannot be generated
-                "name": "get_current_weather",
-                "parameters": {
-                    "city": "Vienna",
-                    "extra": "value"
+            [
+                {  # tool call with extra parameters cannot be generated
+                    "name": "get_current_weather",
+                    "parameters": {"city": "Vienna", "extra": "value"},
                 }
-            }],
-            False),
+            ],
+            False,
+        ),
         (
-            [{  # tool call where parameters are first cannot be generated
-                "parameters": {
-                    "city": "Vienna"
-                },
-                "name": "get_current_weather"
-            }],
-            False),
+            [
+                {  # tool call where parameters are first cannot be generated
+                    "parameters": {"city": "Vienna"},
+                    "name": "get_current_weather",
+                }
+            ],
+            False,
+        ),
         (
-            [{  # tool call without all required parameters cannot be generated
-                "name": "get_forecast",
-                "parameters": {
-                    "city": "Vienna"
+            [
+                {  # tool call without all required parameters cannot be generated
+                    "name": "get_forecast",
+                    "parameters": {"city": "Vienna"},
                 }
-            }],
-            False),
+            ],
+            False,
+        ),
         (  # tool call with incorrect name/parameters cannot be generated
-            [{
-                "name": "get_weather",
-                "parameters": {
-                    "city": "Vienna",
-                    "days": 7
-                }
-            }], False),
+            [{"name": "get_weather", "parameters": {"city": "Vienna", "days": 7}}],
+            False,
+        ),
         (  #  tool call with both valid and empty function cannot be generated
-            [{
-                "name": "get_current_weather",
-                "parameters": {
-                    "city": "Vienna"
-                }
-            }, {}], False),
-    ])
+            [{"name": "get_current_weather", "parameters": {"city": "Vienna"}}, {}],
+            False,
+        ),
+    ],
+)
 def test_structured_outputs_json(sample_output, should_match):
-    _compile_and_check(tools=TypeAdapter(
-        list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS),
-                       sample_output=sample_output,
-                       should_match=should_match)
+    _compile_and_check(
+        tools=TypeAdapter(list[ChatCompletionToolsParam]).validate_python(
+            EXAMPLE_TOOLS
+        ),
+        sample_output=sample_output,
+        should_match=should_match,
+    )
 
 
-def update_parameters_none(
-        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
+def update_parameters_none(tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
     tool.function.parameters = None
     return tool
 
 
 def update_parameters_empty_dict(
-        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
+    tool: ChatCompletionToolsParam,
+) -> ChatCompletionToolsParam:
     tool.function.parameters = {}
     return tool
 
@@ -245,48 +221,60 @@ def update_parameters_empty_dict(
         ({}, False),  # empty object cannot be generated
         ([{}], False),  # list with empty object cannot be generated
         (
-            [{  # function without required parameters cannot be generated
-                "name": "get_current_weather"
-            }],
-            False),
+            [
+                {  # function without required parameters cannot be generated
+                    "name": "get_current_weather"
+                }
+            ],
+            False,
+        ),
         (
-            [{  # function without required parameters cannot be generated
-                "name": "get_current_weather",
-                "parameters": None
-            }],
-            False),
+            [
+                {  # function without required parameters cannot be generated
+                    "name": "get_current_weather",
+                    "parameters": None,
+                }
+            ],
+            False,
+        ),
         (
-            [{  # function with extra parameters cannot be generated
-                "name": "get_current_weather",
-                "parameters": {
-                    "extra": "value"
+            [
+                {  # function with extra parameters cannot be generated
+                    "name": "get_current_weather",
+                    "parameters": {"extra": "value"},
                 }
-            }],
-            False),
+            ],
+            False,
+        ),
         (
-            [{  # only function with empty parameters object is valid
-                "name": "get_current_weather",
-                "parameters": {}
-            }],
-            True),
-    ])
+            [
+                {  # only function with empty parameters object is valid
+                    "name": "get_current_weather",
+                    "parameters": {},
+                }
+            ],
+            True,
+        ),
+    ],
+)
 @pytest.mark.parametrize(
-    "update_parameters",
-    [update_parameters_none, update_parameters_empty_dict])
-def test_structured_outputs_json_without_parameters(sample_output,
-                                                    should_match,
-                                                    update_parameters):
+    "update_parameters", [update_parameters_none, update_parameters_empty_dict]
+)
+def test_structured_outputs_json_without_parameters(
+    sample_output, should_match, update_parameters
+):
     updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
-    tools = TypeAdapter(
-        list[ChatCompletionToolsParam]).validate_python(updated_tools)
+    tools = TypeAdapter(list[ChatCompletionToolsParam]).validate_python(updated_tools)
     tools = list(map(update_parameters, tools))
-    assert all([
-        tool.function.parameters is None or tool.function.parameters == {}
-        for tool in tools
-    ])
-    _compile_and_check(tools=tools,
-                       sample_output=sample_output,
-                       should_match=should_match)
+    assert all(
+        [
+            tool.function.parameters is None or tool.function.parameters == {}
+            for tool in tools
+        ]
+    )
+    _compile_and_check(
+        tools=tools, sample_output=sample_output, should_match=should_match
+    )
 
 
 @pytest.mark.parametrize("output", VALID_TOOLS)
@@ -304,7 +292,7 @@ def test_streaming_output_valid(output, empty_params, delta_len):
     function_name_returned = False
     messages = []
     for i in range(0, len(output_json), delta_len):
-        delta_text = output_json[i:i + delta_len]
+        delta_text = output_json[i : i + delta_len]
         current_text = previous_text + delta_text
 
         delta_message, function_name_returned = (
@@ -313,7 +301,9 @@ def test_streaming_output_valid(output, empty_params, delta_len):
                 previous_text=previous_text,
                 current_text=current_text,
                 delta_text=delta_text,
-                function_name_returned=function_name_returned))
+                function_name_returned=function_name_returned,
+            )
+        )
 
         if delta_message:
             messages.append(delta_message)
@@ -327,10 +317,12 @@ def test_streaming_output_valid(output, empty_params, delta_len):
             if len(combined_messages) > 1:
                 combined_messages += "},"
 
-            combined_messages += '{"name": "' + \
-                message.tool_calls[0].function.name  + \
-                    '", "parameters": ' + \
-                        message.tool_calls[0].function.arguments
+            combined_messages += (
+                '{"name": "'
+                + message.tool_calls[0].function.name
+                + '", "parameters": '
+                + message.tool_calls[0].function.arguments
+            )
         else:
             combined_messages += message.tool_calls[0].function.arguments
     combined_messages += "}]"
diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py
index c07ca0f56d6b09b455e21f60709d7650a1bd3ed2..122b427d604098c891d67b805d6951ab7f63a789 100644
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@@ -3,17 +3,21 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage, FunctionCall,
-                                              ToolCall)
-from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+pytestmark = pytest.mark.cpu_test
+
 # Use a common model that is likely to be available
 MODEL = "Salesforce/Llama-xLAM-2-8B-fc-r"
 
@@ -28,12 +32,14 @@ def xlam_tool_parser(xlam_tokenizer):
     return xLAMToolParser(xlam_tokenizer)
 
 
-def assert_tool_calls(actual_tool_calls: list[ToolCall],
-                      expected_tool_calls: list[ToolCall]):
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
     assert len(actual_tool_calls) == len(expected_tool_calls)
 
-    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
-                                                    expected_tool_calls):
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
         assert isinstance(actual_tool_call.id, str)
         assert len(actual_tool_call.id) > 16
 
@@ -45,10 +51,9 @@ def stream_delta_message_generator(
     xlam_tool_parser: xLAMToolParser,
     xlam_tokenizer: AnyTokenizer,
     model_output: str,
-    request: Optional[ChatCompletionRequest] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> Generator[DeltaMessage, None, None]:
-    all_token_ids = xlam_tokenizer.encode(model_output,
-                                          add_special_tokens=False)
+    all_token_ids = xlam_tokenizer.encode(model_output, add_special_tokens=False)
 
     previous_text = ""
     previous_tokens = None
@@ -57,18 +62,19 @@ def stream_delta_message_generator(
     for i, delta_token in enumerate(all_token_ids):
         delta_token_ids = [delta_token]
         previous_token_ids = all_token_ids[:i]
-        current_token_ids = all_token_ids[:i + 1]
-
-        (new_tokens, delta_text, new_prefix_offset,
-         new_read_offset) = (detokenize_incrementally(
-             tokenizer=xlam_tokenizer,
-             all_input_ids=current_token_ids,
-             prev_tokens=previous_tokens,
-             prefix_offset=prefix_offset,
-             read_offset=read_offset,
-             skip_special_tokens=False,
-             spaces_between_special_tokens=True,
-         ))
+        current_token_ids = all_token_ids[: i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
+            detokenize_incrementally(
+                tokenizer=xlam_tokenizer,
+                all_input_ids=current_token_ids,
+                prev_tokens=previous_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=False,
+                spaces_between_special_tokens=True,
+            )
+        )
 
         current_text = previous_text + delta_text
 
@@ -85,8 +91,9 @@ def stream_delta_message_generator(
             yield delta_message
 
         previous_text = current_text
-        previous_tokens = (previous_tokens +
-                           new_tokens if previous_tokens else new_tokens)
+        previous_tokens = (
+            previous_tokens + new_tokens if previous_tokens else new_tokens
+        )
         prefix_offset = new_prefix_offset
         read_offset = new_read_offset
 
@@ -94,7 +101,8 @@ def stream_delta_message_generator(
 def test_extract_tool_calls_no_tools(xlam_tool_parser):
     model_output = "This is a test"
     extracted_tool_calls = xlam_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert not extracted_tool_calls.tools_called
     assert extracted_tool_calls.tool_calls == []
     assert extracted_tool_calls.content == model_output
@@ -113,87 +121,113 @@ def test_extract_tool_calls_no_tools(xlam_tool_parser):
         (
             """[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}, {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                )),
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Orlando",
-                        "state": "FL",
-                        "unit": "fahrenheit",
-                    }),
-                )),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Orlando",
+                                "state": "FL",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
             ],
             None,
         ),
         (
             """<think>I'll help you with that.</think>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "<think>I'll help you with that.</think>",
         ),
         (
             """I'll help you with that.\n```json\n[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]\n```""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "I'll help you with that.",
         ),
         (
             """I'll check the weather for you.[TOOL_CALLS][{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "I'll check the weather for you.",
         ),
         (
             """I'll help you check the weather.<tool_call>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]</tool_call>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "I'll help you check the weather.",
         ),
     ],
 )
-def test_extract_tool_calls(xlam_tool_parser, model_output,
-                            expected_tool_calls, expected_content):
+def test_extract_tool_calls(
+    xlam_tool_parser, model_output, expected_tool_calls, expected_content
+):
     extracted_tool_calls = xlam_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -208,25 +242,30 @@ def test_extract_tool_calls(xlam_tool_parser, model_output,
         (
             """[{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Seattle",
-                        "state": "WA",
-                        "unit": "celsius",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Seattle",
+                                "state": "WA",
+                                "unit": "celsius",
+                            }
+                        ),
+                    )
+                )
             ],
             None,
         ),
     ],
 )
-def test_extract_tool_calls_list_structure(xlam_tool_parser, model_output,
-                                           expected_tool_calls,
-                                           expected_content):
+def test_extract_tool_calls_list_structure(
+    xlam_tool_parser, model_output, expected_tool_calls, expected_content
+):
     """Test extraction of tool calls when the model outputs a list-structured tool call."""  # noqa: E501
     extracted_tool_calls = xlam_tool_parser.extract_tool_calls(
-        model_output, request=None)  # type: ignore[arg-type]
+        model_output, request=None
+    )  # type: ignore[arg-type]
     assert extracted_tool_calls.tools_called
 
     assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
@@ -237,20 +276,25 @@ def test_extract_tool_calls_list_structure(xlam_tool_parser, model_output,
 # Test for preprocess_model_output method
 def test_preprocess_model_output(xlam_tool_parser):
     # Test with list structure
-    model_output = """[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]"""  # noqa: E501
+    model_output = (
+        """[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]"""  # noqa: E501
+    )
     content, potential_tool_calls = xlam_tool_parser.preprocess_model_output(
-        model_output)
+        model_output
+    )
     assert content is None
     assert potential_tool_calls == model_output
 
     # Test with thinking tag
     model_output = """<think>I'll help you with that.</think>[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]"""  # noqa: E501
     content, potential_tool_calls = xlam_tool_parser.preprocess_model_output(
-        model_output)
+        model_output
+    )
     assert content == "<think>I'll help you with that.</think>"
     assert (
-        potential_tool_calls ==
-        '[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]')
+        potential_tool_calls
+        == '[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]'
+    )
 
     # Test with JSON code block
     model_output = """I'll help you with that.
@@ -258,14 +302,16 @@ def test_preprocess_model_output(xlam_tool_parser):
 [{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]
 ```"""
     content, potential_tool_calls = xlam_tool_parser.preprocess_model_output(
-        model_output)
+        model_output
+    )
     assert content == "I'll help you with that."
     assert "get_current_weather" in potential_tool_calls
 
     # Test with no tool calls
     model_output = """I'll help you with that."""
     content, potential_tool_calls = xlam_tool_parser.preprocess_model_output(
-        model_output)
+        model_output
+    )
     assert content == model_output
     assert potential_tool_calls is None
 
@@ -279,7 +325,9 @@ def test_streaming_with_list_structure(xlam_tool_parser):
     xlam_tool_parser.current_tool_id = -1
 
     # Simulate receiving a message with list structure
-    current_text = """[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]"""  # noqa: E501
+    current_text = (
+        """[{"name": "get_current_weather", "arguments": {"city": "Seattle"}}]"""  # noqa: E501
+    )
 
     # First call to set up the tool
     xlam_tool_parser.extract_tool_calls_streaming(
@@ -293,8 +341,7 @@ def test_streaming_with_list_structure(xlam_tool_parser):
     )
 
     # Make sure the tool is set up correctly
-    assert (xlam_tool_parser.current_tool_id
-            >= 0), "Tool index should be initialized"
+    assert xlam_tool_parser.current_tool_id >= 0, "Tool index should be initialized"
 
     # Manually set up the state for sending the tool name
     xlam_tool_parser.current_tools_sent = [False]
@@ -330,78 +377,102 @@ def test_streaming_with_list_structure(xlam_tool_parser):
         (
             """[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}, {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                )),
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Orlando",
-                        "state": "FL",
-                        "unit": "fahrenheit",
-                    }),
-                )),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Orlando",
+                                "state": "FL",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                ),
             ],
             "",
         ),
         (
             """<think>I'll help you with that.</think>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "<think>I'll help you with that.</think>",
         ),
         (
             """```json\n[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]\n```""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "",
         ),
         (
             """[TOOL_CALLS][{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "",
         ),
         (
             """I can help with that.<tool_call>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]</tool_call>""",  # noqa: E501
             [
-                ToolCall(function=FunctionCall(
-                    name="get_current_weather",
-                    arguments=json.dumps({
-                        "city": "Dallas",
-                        "state": "TX",
-                        "unit": "fahrenheit",
-                    }),
-                ))
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {
+                                "city": "Dallas",
+                                "state": "TX",
+                                "unit": "fahrenheit",
+                            }
+                        ),
+                    )
+                )
             ],
             "I can help with that.",
         ),
@@ -419,7 +490,8 @@ def test_extract_tool_calls_streaming_incremental(
 
     chunks = []
     for delta_message in stream_delta_message_generator(
-            xlam_tool_parser, xlam_tokenizer, model_output, request):
+        xlam_tool_parser, xlam_tokenizer, model_output, request
+    ):
         chunks.append(delta_message)
 
     # Should have multiple chunks
@@ -431,8 +503,9 @@ def test_extract_tool_calls_streaming_incremental(
     for chunk in chunks:
         if chunk.tool_calls and chunk.tool_calls[0].id:
             header_found = True
-            assert (chunk.tool_calls[0].function.name ==
-                    expected_first_tool.function.name)
+            assert (
+                chunk.tool_calls[0].function.name == expected_first_tool.function.name
+            )
             assert chunk.tool_calls[0].type == "function"
             # Arguments may be empty initially or None
             if chunk.tool_calls[0].function.arguments is not None:
@@ -444,11 +517,13 @@ def test_extract_tool_calls_streaming_incremental(
     # Should have chunks with incremental arguments
     arg_chunks = []
     for chunk in chunks:
-        if (chunk.tool_calls and chunk.tool_calls[0].function.arguments
-                and chunk.tool_calls[0].function.arguments != ""
-                and chunk.tool_calls[0].index ==
-                0  # Only collect arguments from the first tool call
-            ):
+        if (
+            chunk.tool_calls
+            and chunk.tool_calls[0].function.arguments
+            and chunk.tool_calls[0].function.arguments != ""
+            and chunk.tool_calls[0].index
+            == 0  # Only collect arguments from the first tool call
+        ):
             arg_chunks.append(chunk.tool_calls[0].function.arguments)
 
     # Arguments should be streamed incrementally
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index a17fab9aecbcab5e37be5175912ae2675af60fee..38def6f874d7df8f67a9e5a333eabd0f7f706bc1 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -2,10 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
-from typing import Any, Optional
+from typing import Any
 
-from openai.types.chat import (ChatCompletionMessageParam,
-                               ChatCompletionToolParam)
+from openai.types.chat import ChatCompletionMessageParam, ChatCompletionToolParam
 from typing_extensions import TypedDict
 
 from tests.utils import VLLM_PATH
@@ -14,14 +13,15 @@ from tests.utils import VLLM_PATH
 class ServerConfig(TypedDict, total=False):
     model: str
     arguments: list[str]
-    system_prompt: Optional[str]
-    supports_parallel: Optional[bool]
-    supports_rocm: Optional[bool]
-    extended: Optional[bool]  # tests do not run in CI automatically
+    system_prompt: str | None
+    supports_parallel: bool | None
+    supports_rocm: bool | None
+    extended: bool | None  # tests do not run in CI automatically
 
 
-def patch_system_prompt(messages: list[dict[str, Any]],
-                        system_prompt: str) -> list[dict[str, Any]]:
+def patch_system_prompt(
+    messages: list[dict[str, Any]], system_prompt: str
+) -> list[dict[str, Any]]:
     new_messages = deepcopy(messages)
     if new_messages[0]["role"] == "system":
         new_messages[0]["content"] = system_prompt
@@ -30,8 +30,9 @@ def patch_system_prompt(messages: list[dict[str, Any]],
     return new_messages
 
 
-def ensure_system_prompt(messages: list[dict[str, Any]],
-                         config: ServerConfig) -> list[dict[str, Any]]:
+def ensure_system_prompt(
+    messages: list[dict[str, Any]], config: ServerConfig
+) -> list[dict[str, Any]]:
     prompt = config.get("system_prompt")
     if prompt:
         return patch_system_prompt(messages, prompt)
@@ -42,92 +43,102 @@ def ensure_system_prompt(messages: list[dict[str, Any]],
 # universal args for all models go here. also good if you need to test locally
 # and change type or KV cache quantization or something.
 ARGS: list[str] = [
-    "--enable-auto-tool-choice", "--max-model-len", "1024", "--max-num-seqs",
-    "256"
+    "--enable-auto-tool-choice",
+    "--max-model-len",
+    "1024",
+    "--max-num-seqs",
+    "256",
 ]
 
 CONFIGS: dict[str, ServerConfig] = {
     "hermes": {
-        "model":
-        "NousResearch/Hermes-3-Llama-3.1-8B",
+        "model": "NousResearch/Hermes-3-Llama-3.1-8B",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "hermes", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "hermes",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja"),
         ],
-        "system_prompt":
-        "You are a helpful assistant with access to tools. If a tool"
+        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
         " that you have would be helpful to answer a user query, "
         "call the tool. Otherwise, answer the user's query directly "
         "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
-        "to the user's question - just respond to it normally."
+        "to the user's question - just respond to it normally.",
     },
     "llama": {
-        "model":
-        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "llama3_json", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja")
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "llama3_json",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja"),
         ],
-        "supports_parallel":
-        False,
+        "supports_parallel": False,
     },
     "llama3.2": {
-        "model":
-        "meta-llama/Llama-3.2-3B-Instruct",
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "llama3_json", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja")
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "llama3_json",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja"),
         ],
-        "supports_parallel":
-        False,
+        "supports_parallel": False,
     },
     "llama4": {
-        "model":
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "llama4_pythonic", "--chat-template",
-            str(VLLM_PATH /
-                "examples/tool_chat_template_llama4_pythonic.jinja"), "-tp",
-            "4"
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "llama4_pythonic",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_llama4_pythonic.jinja"),
+            "-tp",
+            "4",
         ],
-        "supports_parallel":
-        False,
-        "extended":
-        True
+        "supports_parallel": False,
+        "extended": True,
     },
     "llama4_json": {
-        "model":
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
-            "--distributed-executor-backend", "mp", "--tool-call-parser",
-            "llama4_json", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_llama4_json.jinja")
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "-tp",
+            "4",
+            "--distributed-executor-backend",
+            "mp",
+            "--tool-call-parser",
+            "llama4_json",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_llama4_json.jinja"),
         ],
-        "supports_parallel":
-        True,
-        "extended":
-        True
+        "supports_parallel": True,
+        "extended": True,
     },
     "mistral": {
-        "model":
-        "mistralai/Mistral-7B-Instruct-v0.3",
+        "model": "mistralai/Mistral-7B-Instruct-v0.3",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "mistral", "--chat-template",
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "mistral",
+            "--chat-template",
             str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
-            "--ignore-patterns=\"consolidated.safetensors\""
+            '--ignore-patterns="consolidated.safetensors"',
         ],
-        "system_prompt":
-        "You are a helpful assistant with access to tools. If a tool"
+        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
         " that you have would be helpful to answer a user query, "
         "call the tool. Otherwise, answer the user's query directly "
         "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
-        "to the user's question - just respond to it normally."
+        "to the user's question - just respond to it normally.",
     },
     # V1 Test: Passing locally but failing in CI. This runs the
     # V0 Engine because of CPU offloading. Need to debug why.
@@ -146,49 +157,50 @@ CONFIGS: dict[str, ServerConfig] = {
     #     False,
     # },
     "granite-3.0-8b": {
-        "model":
-        "ibm-granite/granite-3.0-8b-instruct",
+        "model": "ibm-granite/granite-3.0-8b-instruct",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "granite", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_granite.jinja")
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "granite",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_granite.jinja"),
         ],
     },
     "granite-3.1-8b": {
-        "model":
-        "ibm-granite/granite-3.1-8b-instruct",
+        "model": "ibm-granite/granite-3.1-8b-instruct",
         "arguments": [
             "--enforce-eager",
             "--no-enable-prefix-caching",
             "--tool-call-parser",
             "granite",
         ],
-        "supports_parallel":
-        True,
+        "supports_parallel": True,
     },
     "internlm": {
-        "model":
-        "internlm/internlm2_5-7b-chat",
+        "model": "internlm/internlm2_5-7b-chat",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "internlm", "--chat-template",
-            str(VLLM_PATH /
-                "examples/tool_chat_template_internlm2_tool.jinja"),
-            "--trust_remote_code"
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "internlm",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_internlm2_tool.jinja"),
+            "--trust_remote_code",
         ],
-        "supports_parallel":
-        False,
+        "supports_parallel": False,
     },
     "toolACE": {
-        "model":
-        "Team-ACE/ToolACE-8B",
+        "model": "Team-ACE/ToolACE-8B",
         "arguments": [
-            "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "pythonic", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja")
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "pythonic",
+            "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja"),
         ],
-        "supports_parallel":
-        True,
+        "supports_parallel": True,
     },
 }
 
@@ -201,37 +213,31 @@ WEATHER_TOOL: ChatCompletionToolParam = {
             "type": "object",
             "properties": {
                 "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, "
-                    "e.g. 'San Francisco'"
+                    "type": "string",
+                    "description": "The city to find the weather for, "
+                    "e.g. 'San Francisco'",
                 },
                 "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "must the two-letter abbreviation for the state "
+                    "type": "string",
+                    "description": "must the two-letter abbreviation for the state "
                     "that the city is in, e.g. 'CA' which would "
-                    "mean 'California'"
+                    "mean 'California'",
                 },
                 "unit": {
                     "type": "string",
                     "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
-            }
-        }
-    }
+                    "enum": ["celsius", "fahrenheit"],
+                },
+            },
+        },
+    },
 }
 
 SEARCH_TOOL: ChatCompletionToolParam = {
     "type": "function",
     "function": {
-        "name":
-        "web_search",
-        "description":
-        "Search the internet and get a summary of the top "
+        "name": "web_search",
+        "description": "Search the internet and get a summary of the top "
         "10 webpages. Should only be used if you don't know "
         "the answer to a user query, and the results are likely"
         "to be able to be found with a web search",
@@ -239,124 +245,98 @@ SEARCH_TOOL: ChatCompletionToolParam = {
             "type": "object",
             "properties": {
                 "search_term": {
-                    "type":
-                    "string",
-                    "description":
-                    "The term to use in the search. This should"
+                    "type": "string",
+                    "description": "The term to use in the search. This should"
                     "ideally be keywords to search for, not a"
-                    "natural-language question"
+                    "natural-language question",
                 }
             },
-            "required": ["search_term"]
-        }
-    }
+            "required": ["search_term"],
+        },
+    },
 }
 
-MESSAGES_WITHOUT_TOOLS: list[ChatCompletionMessageParam] = [{
-    "role":
-    "user",
-    "content":
-    "Hi! How are you?"
-}, {
-    "role":
-    "assistant",
-    "content":
-    "I'm doing great! How can I assist you?"
-}, {
-    "role":
-    "user",
-    "content":
-    "Can you tell me a joke please?"
-}]
+MESSAGES_WITHOUT_TOOLS: list[ChatCompletionMessageParam] = [
+    {"role": "user", "content": "Hi! How are you?"},
+    {"role": "assistant", "content": "I'm doing great! How can I assist you?"},
+    {"role": "user", "content": "Can you tell me a joke please?"},
+]
 
-MESSAGES_ASKING_FOR_TOOLS: list[ChatCompletionMessageParam] = [{
-    "role":
-    "user",
-    "content":
-    "What is the weather in Dallas, Texas in Fahrenheit?"
-}]
+MESSAGES_ASKING_FOR_TOOLS: list[ChatCompletionMessageParam] = [
+    {"role": "user", "content": "What is the weather in Dallas, Texas in Fahrenheit?"}
+]
 
-MESSAGES_WITH_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{
-    "role":
-    "user",
-    "content":
-    "What is the weather in Dallas, Texas in Fahrenheit?"
-}, {
-    "role":
-    "assistant",
-    "tool_calls": [{
-        "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
-        "type": "function",
-        "function": {
-            "name":
-            WEATHER_TOOL["function"]["name"],
-            "arguments":
-            '{"city": "Dallas", "state": "TX", '
-            '"unit": "fahrenheit"}'
-        }
-    }]
-}, {
-    "role":
-    "tool",
-    "tool_call_id":
-    "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
-    "content":
-    "The weather in Dallas is 98 degrees fahrenheit, with partly"
-    "cloudy skies and a low chance of rain."
-}]
+MESSAGES_WITH_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [
+    {"role": "user", "content": "What is the weather in Dallas, Texas in Fahrenheit?"},
+    {
+        "role": "assistant",
+        "tool_calls": [
+            {
+                "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
+                "type": "function",
+                "function": {
+                    "name": WEATHER_TOOL["function"]["name"],
+                    "arguments": '{"city": "Dallas", "state": "TX", '
+                    '"unit": "fahrenheit"}',
+                },
+            }
+        ],
+    },
+    {
+        "role": "tool",
+        "tool_call_id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
+        "content": "The weather in Dallas is 98 degrees fahrenheit, with partly"
+        "cloudy skies and a low chance of rain.",
+    },
+]
 
-MESSAGES_ASKING_FOR_PARALLEL_TOOLS: list[ChatCompletionMessageParam] = [{
-    "role":
-    "user",
-    "content":
-    "What is the weather in Dallas, Texas and Orlando, Florida in "
-    "Fahrenheit?"
-}]
+MESSAGES_ASKING_FOR_PARALLEL_TOOLS: list[ChatCompletionMessageParam] = [
+    {
+        "role": "user",
+        "content": "What is the weather in Dallas, Texas and Orlando, Florida in "
+        "Fahrenheit?",
+    }
+]
 
-MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{
-    "role":
-    "user",
-    "content":
-    "What is the weather in Dallas, Texas and Orlando, Florida in "
-    "Fahrenheit?"
-}, {
-    "role":
-    "assistant",
-    "tool_calls": [{
-        "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
-        "type": "function",
-        "function": {
-            "name":
-            WEATHER_TOOL["function"]["name"],
-            "arguments":
-            '{"city": "Dallas", "state": "TX", '
-            '"unit": "fahrenheit"}'
-        }
-    }, {
-        "id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b",
-        "type": "function",
-        "function": {
-            "name":
-            WEATHER_TOOL["function"]["name"],
-            "arguments":
-            '{"city": "Orlando", "state": "Fl", '
-            '"unit": "fahrenheit"}'
-        }
-    }]
-}, {
-    "role":
-    "tool",
-    "tool_call_id":
-    "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
-    "content":
-    "The weather in Dallas TX is 98 degrees fahrenheit with mostly "
-    "cloudy skies and a chance of rain in the evening."
-}, {
-    "role":
-    "tool",
-    "tool_call_id":
-    "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b",
-    "content":
-    "The weather in Orlando FL is 78 degrees fahrenheit with clear"
-    "skies."
-}]
+MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [
+    {
+        "role": "user",
+        "content": "What is the weather in Dallas, Texas and Orlando, Florida in "
+        "Fahrenheit?",
+    },
+    {
+        "role": "assistant",
+        "tool_calls": [
+            {
+                "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
+                "type": "function",
+                "function": {
+                    "name": WEATHER_TOOL["function"]["name"],
+                    "arguments": '{"city": "Dallas", "state": "TX", '
+                    '"unit": "fahrenheit"}',
+                },
+            },
+            {
+                "id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b",
+                "type": "function",
+                "function": {
+                    "name": WEATHER_TOOL["function"]["name"],
+                    "arguments": '{"city": "Orlando", "state": "Fl", '
+                    '"unit": "fahrenheit"}',
+                },
+            },
+        ],
+    },
+    {
+        "role": "tool",
+        "tool_call_id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
+        "content": "The weather in Dallas TX is 98 degrees fahrenheit with mostly "
+        "cloudy skies and a chance of rain in the evening.",
+    },
+    {
+        "role": "tool",
+        "tool_call_id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b",
+        "content": "The weather in Orlando FL is 78 degrees fahrenheit with clear"
+        "skies.",
+    },
+]
diff --git a/tests/tools/test_config_validator.py b/tests/tools/test_config_validator.py
index b0475894a114e2f1b765894f549103db24b47142..d6104dc6d2eb19a9ee32b1d90bb5d23b4b44ff76 100644
--- a/tests/tools/test_config_validator.py
+++ b/tests/tools/test_config_validator.py
@@ -5,13 +5,13 @@ import ast
 
 import pytest
 
-from tools.validate_config import validate_ast
+from tools.pre_commit.validate_config import validate_ast
 
-_TestConfig1 = '''
+_TestConfig1 = """
 @config
 class _TestConfig1:
     pass
-'''
+"""
 
 _TestConfig2 = '''
 @config
@@ -21,12 +21,12 @@ class _TestConfig2:
     """docstring"""
 '''
 
-_TestConfig3 = '''
+_TestConfig3 = """
 @config
 @dataclass
 class _TestConfig3:
     a: int = 1
-'''
+"""
 
 _TestConfig4 = '''
 @config
@@ -37,12 +37,15 @@ class _TestConfig4:
 '''
 
 
-@pytest.mark.parametrize(("test_config", "expected_error"), [
-    (_TestConfig1, "must be a dataclass"),
-    (_TestConfig2, "must have a default"),
-    (_TestConfig3, "must have a docstring"),
-    (_TestConfig4, "must use a single Literal"),
-])
+@pytest.mark.parametrize(
+    ("test_config", "expected_error"),
+    [
+        (_TestConfig1, "must be a dataclass"),
+        (_TestConfig2, "must have a default"),
+        (_TestConfig3, "must have a docstring"),
+        (_TestConfig4, "must use a single Literal"),
+    ],
+)
 def test_config(test_config, expected_error):
     tree = ast.parse(test_config)
     with pytest.raises(Exception, match=expected_error):
diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py
index 5196a92cb727f6f14b7c65ec34754531cb82b0d9..9780092b25e66033e488d33ccc4419204899d5bb 100644
--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@@ -17,29 +17,21 @@ from vllm.lora.request import LoRARequest
 # 100 training iterations with a training batch size of 100.
 
 
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
-    for all tests in this file
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        yield
-
-
 def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
-    return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
-                    max_model_len=256,
-                    max_num_seqs=8,
-                    tensor_parallel_size=tp,
-                    enable_lora=True,
-                    max_loras=num_loras,
-                    max_lora_rank=8)
+    return vllm.LLM(
+        model="Qwen/Qwen2.5-3B-Instruct",
+        max_model_len=256,
+        max_num_seqs=8,
+        tensor_parallel_size=tp,
+        enable_lora=True,
+        max_loras=num_loras,
+        max_lora_rank=8,
+    )
 
 
-TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips()
-                             ] if tpu.num_available_chips() > 1 else [1]
+TPU_TENSOR_PARALLEL_SIZES = (
+    [1, tpu.num_available_chips()] if tpu.num_available_chips() > 1 else [1]
+)
 
 
 @pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
@@ -55,12 +47,19 @@ def test_single_lora(tp: int):
     prompt = "What is 1+1? \n"
 
     lora_request = LoRARequest(
-        "lora_adapter_1", 1,
-        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter")
-    output = llm.generate(prompt,
-                          sampling_params=vllm.SamplingParams(max_tokens=256,
-                                                              temperature=0),
-                          lora_request=lora_request)[0].outputs[0].text
+        "lora_adapter_1",
+        1,
+        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
+    )
+    output = (
+        llm.generate(
+            prompt,
+            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
+            lora_request=lora_request,
+        )[0]
+        .outputs[0]
+        .text
+    )
 
     answer = output.strip()[0]
 
@@ -73,13 +72,12 @@ def test_lora_hotswapping(tp: int):
     """
     This test ensures we can run multiple LoRA adapters on the TPU backend, even
     if we only have space to store 1.
-    
+
     We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
     will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
     """
 
-    lora_name_template = \
-        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
+    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
     lora_requests = [
         LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
         for i in range(1, 5)
@@ -90,10 +88,15 @@ def test_lora_hotswapping(tp: int):
     prompt = "What is 1+1? \n"
 
     for i, req in enumerate(lora_requests):
-        output = llm.generate(prompt,
-                              sampling_params=vllm.SamplingParams(
-                                  max_tokens=256, temperature=0),
-                              lora_request=req)[0].outputs[0].text
+        output = (
+            llm.generate(
+                prompt,
+                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
+                lora_request=req,
+            )[0]
+            .outputs[0]
+            .text
+        )
         answer = output.strip()[0]
 
         assert answer.isdigit()
@@ -105,12 +108,11 @@ def test_multi_lora(tp: int):
     """
     This test ensures we can run multiple LoRA adapters on the TPU backend, when
     we have enough space to store all of them.
-    
+
     We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
     will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
     """
-    lora_name_template = \
-        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
+    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
     lora_requests = [
         LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
         for i in range(1, 5)
@@ -121,10 +123,15 @@ def test_multi_lora(tp: int):
     prompt = "What is 1+1? \n"
 
     for i, req in enumerate(lora_requests):
-        output = llm.generate(prompt,
-                              sampling_params=vllm.SamplingParams(
-                                  max_tokens=256, temperature=0),
-                              lora_request=req)[0].outputs[0].text
+        output = (
+            llm.generate(
+                prompt,
+                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
+                lora_request=req,
+            )[0]
+            .outputs[0]
+            .text
+        )
 
         answer = output.strip()[0]
 
diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py
index 448b8b2bc094f1f3edd065ed8403a6340f9c35d7..5acfa484f0c1382307fee66c54645615f7eb480b 100644
--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -26,16 +26,15 @@ def test_tpu_compilation():
 
         # Currently, top-p sampling is disabled. `top_p` should be 1.0.
         N = 1
-        sampling_params = SamplingParams(temperature=0.7,
-                                         top_p=1.0,
-                                         n=N,
-                                         max_tokens=16)
+        sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=N, max_tokens=16)
 
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-                  max_num_batched_tokens=256,
-                  max_model_len=256,
-                  max_num_seqs=32,
-                  enforce_eager=False)
+        llm = LLM(
+            model="Qwen/Qwen2-1.5B-Instruct",
+            max_num_batched_tokens=256,
+            max_model_len=256,
+            max_num_seqs=32,
+            enforce_eager=False,
+        )
 
         outputs = llm.generate(prompts, sampling_params)
         for output, answer in zip(outputs, answers):
@@ -45,7 +44,8 @@ def test_tpu_compilation():
             assert generated_text.startswith(answer)
 
     compiled_codes = sorted(
-        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py")))
+        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py"))
+    )
 
     for i, compiled_code in enumerate(compiled_codes):
         print("{} file: {}".format(i + 1, compiled_code))
@@ -66,9 +66,10 @@ def test_tpu_compilation():
 
     # Check all the compilations are as expected. The dump files include the
     # captured graph for the forward function of the nn.Module.
-    compiled_fns = sorted(glob.glob(
-        os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
-                          key=lambda s: extract_compiled_index(s))
+    compiled_fns = sorted(
+        glob.glob(os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
+        key=lambda s: extract_compiled_index(s),
+    )
 
     for i, compiled_fn in enumerate(compiled_fns):
         print("{} file: {}".format(i + 1, compiled_fn))
@@ -82,4 +83,4 @@ def test_tpu_compilation():
     # ragged_paged_attention
     with open(compiled_fns[1]) as f:
         content = f.read()
-        assert (kv_cache_prefix in content and attn_prefix in content)
+        assert kv_cache_prefix in content and attn_prefix in content
diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py
index 9c90df1b77010dee144f334748ab8a8a7b8c41e4..cf455ff3edbd3415f7c594fd915cc294404f1847 100644
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 
 from ..utils import compare_two_settings
 
@@ -15,17 +15,20 @@ from ..utils import compare_two_settings
 def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_RPC_TIMEOUT", "30000")
-        compare_two_settings("Qwen/Qwen2.5-1.5B-Instruct",
-                             arg1=[
-                                 "--max-model-len=256",
-                                 "--max-num-seqs=32",
-                                 "--enforce-eager",
-                                 f"-O{CompilationLevel.DYNAMO_ONCE}",
-                             ],
-                             arg2=[
-                                 "--max-model-len=256", "--max-num-seqs=32",
-                                 "--enforce-eager",
-                                 f"-O{CompilationLevel.DYNAMO_AS_IS}"
-                             ],
-                             env1={},
-                             env2={})
+        compare_two_settings(
+            "Qwen/Qwen2.5-1.5B-Instruct",
+            arg1=[
+                "--max-model-len=256",
+                "--max-num-seqs=32",
+                "--enforce-eager",
+                f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
+            ],
+            arg2=[
+                "--max-model-len=256",
+                "--max-num-seqs=32",
+                "--enforce-eager",
+                f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
+            ],
+            env1={},
+            env2={},
+        )
diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py
index 1e5d9d923d00473dd88f7d75ece5c95df4a5f017..e3236d20bf673a0549d81191bca820fd6fef433d 100644
--- a/tests/tpu/test_moe_pallas.py
+++ b/tests/tpu/test_moe_pallas.py
@@ -4,17 +4,15 @@
 
 Run `pytest tests/kernels/moe/test_moe_pallas.py`.
 """
+
 import pytest
 import torch
 import torch_xla
 
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.model_executor.layers.fused_moe.moe_pallas import (
-    fused_moe as pallas_moe)
+from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
 from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
-    fused_moe as torch_moe)
-# yapf: enable
+    fused_moe as torch_moe,
+)
 from vllm.platforms import current_platform
 
 if not current_platform.is_tpu():
@@ -43,6 +41,7 @@ def test_pallas_moe(
     dtype: torch.dtype,
 ):
     import torch_xla.core.xla_model as xm
+
     with torch.device(xm.xla_device()):
         a = torch.randn((m, k), dtype=dtype) / 10
         w1 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
index 8d9fbd280317ca34a21c198a21aee67c52e747b0..151be5f17fe89cb6786a01d92c0dde7e2d9c0674 100644
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -17,15 +17,15 @@ class GSM8KAccuracyTestConfig:
     expected_value: float
 
     def get_model_args(self) -> str:
-        return (f"pretrained={self.model_name},"
-                "max_model_len=4096,max_num_seqs=32")
+        return f"pretrained={self.model_name},max_model_len=4096,max_num_seqs=32"
 
 
 # NOTE: Accuracy scores measured on GPUs.
 ACCURACY_CONFIGS = [
     GSM8KAccuracyTestConfig(
         model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        expected_value=0.76),  # no bias
+        expected_value=0.76,
+    ),  # no bias
     # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
     # so only one of these tests can run in a single call to pytest. As
     # a follow-up, move this into the LM-EVAL section of the CI.
@@ -37,7 +37,6 @@ ACCURACY_CONFIGS = [
 
 @pytest.mark.parametrize("config", ACCURACY_CONFIGS)
 def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
-
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=config.get_model_args(),
@@ -47,6 +46,7 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
 
     EXPECTED_VALUE = config.expected_value
     measured_value = results["results"][TASK][FILTER]
-    assert (measured_value - RTOL < EXPECTED_VALUE
-            and measured_value + RTOL > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+    assert (
+        measured_value - RTOL < EXPECTED_VALUE
+        and measured_value + RTOL > EXPECTED_VALUE
+    ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
diff --git a/tests/transformers_utils/test_config_parser_registry.py b/tests/transformers_utils/test_config_parser_registry.py
index 13c654e05d2aca67ba0e90c540011afc10ba42e7..0931bd734f8f0dadd5b8d812814d4eea29a22e47 100644
--- a/tests/transformers_utils/test_config_parser_registry.py
+++ b/tests/transformers_utils/test_config_parser_registry.py
@@ -2,31 +2,29 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from pathlib import Path
-from typing import Optional, Union
 
 import pytest
 from transformers import PretrainedConfig
 
-from vllm.transformers_utils.config import (get_config_parser,
-                                            register_config_parser)
+from vllm.transformers_utils.config import get_config_parser, register_config_parser
 from vllm.transformers_utils.config_parser_base import ConfigParserBase
 
 
 @register_config_parser("custom_config_parser")
 class CustomConfigParser(ConfigParserBase):
-
-    def parse(self,
-              model: Union[str, Path],
-              trust_remote_code: bool,
-              revision: Optional[str] = None,
-              code_revision: Optional[str] = None,
-              **kwargs) -> tuple[dict, PretrainedConfig]:
+    def parse(
+        self,
+        model: str | Path,
+        trust_remote_code: bool,
+        revision: str | None = None,
+        code_revision: str | None = None,
+        **kwargs,
+    ) -> tuple[dict, PretrainedConfig]:
         raise NotImplementedError
 
 
 def test_register_config_parser():
-    assert isinstance(get_config_parser("custom_config_parser"),
-                      CustomConfigParser)
+    assert isinstance(get_config_parser("custom_config_parser"), CustomConfigParser)
 
 
 def test_invalid_config_parser():
diff --git a/tests/transformers_utils/test_get_processor_kwargs_from_processor.py b/tests/transformers_utils/test_get_processor_kwargs_from_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ff9a557fa05a09d4c82d931f9c62aef6408918
--- /dev/null
+++ b/tests/transformers_utils/test_get_processor_kwargs_from_processor.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+
+from transformers.processing_utils import ProcessingKwargs
+from typing_extensions import Unpack
+
+from vllm.transformers_utils.processor import (
+    get_processor_kwargs_from_processor,
+)
+
+
+class _FakeProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore
+    pass
+
+
+def _assert_has_all_expected(keys: set[str]) -> None:
+    # text
+    for k in ("text_pair", "text_target", "text_pair_target"):
+        assert k in keys
+    # image
+    for k in ("do_convert_rgb", "do_resize"):
+        assert k in keys
+    # audio
+    for k in (
+        "fps",
+        "do_sample_frames",
+        "input_data_format",
+        "default_to_square",
+    ):
+        assert k in keys
+    # audio
+    for k in ("padding", "return_attention_mask"):
+        assert k in keys
+
+
+# Path 1: __call__ method has kwargs: Unpack[*ProcessingKwargs]
+class _ProcWithUnpack:
+    def __call__(self, *args, **kwargs: Unpack[_FakeProcessorKwargs]):  # type: ignore
+        return None
+
+
+def test_get_processor_kwargs_from_processor_unpack_path_returns_full_union():
+    proc = _ProcWithUnpack()
+    keys = get_processor_kwargs_from_processor(proc)
+    _assert_has_all_expected(keys)
+
+
+# ---- Path 2: No Unpack, fallback to scanning *ProcessingKwargs in module ----
+
+
+class _ProcWithoutUnpack:
+    def __call__(self, *args, **kwargs):
+        return None
+
+
+def test_get_processor_kwargs_from_processor_module_scan_returns_full_union():
+    # ensure the module scanned by fallback is this test module
+    module_name = _ProcWithoutUnpack.__module__
+    mod = importlib.import_module(module_name)
+    assert hasattr(mod, "_FakeProcessorKwargs")
+
+    proc = _ProcWithoutUnpack()
+    keys = get_processor_kwargs_from_processor(proc)
+    _assert_has_all_expected(keys)
diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..beaef04d766bf2e5efc56cabfb1931fefc2044c0
--- /dev/null
+++ b/tests/transformers_utils/test_utils.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from vllm.transformers_utils.utils import is_cloud_storage, is_gcs, is_s3
+
+
+def test_is_gcs():
+    assert is_gcs("gs://model-path")
+    assert not is_gcs("s3://model-path/path-to-model")
+    assert not is_gcs("/unix/local/path")
+    assert not is_gcs("nfs://nfs-fqdn.local")
+
+
+def test_is_s3():
+    assert is_s3("s3://model-path/path-to-model")
+    assert not is_s3("gs://model-path")
+    assert not is_s3("/unix/local/path")
+    assert not is_s3("nfs://nfs-fqdn.local")
+
+
+def test_is_cloud_storage():
+    assert is_cloud_storage("gs://model-path")
+    assert is_cloud_storage("s3://model-path/path-to-model")
+    assert not is_cloud_storage("/unix/local/path")
+    assert not is_cloud_storage("nfs://nfs-fqdn.local")
diff --git a/tests/utils.py b/tests/utils.py
index f630c57f46d8061940cf3899febcbefd15444e41..c8f18384c51146cc541fb973e0a036cffb48ab86 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -6,6 +6,7 @@ import contextlib
 import copy
 import functools
 import importlib
+import itertools
 import json
 import os
 import random
@@ -15,12 +16,14 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Callable, Iterable
 from contextlib import ExitStack, contextmanager, suppress
 from multiprocessing import Process
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Union
+from typing import Any, Literal
 from unittest.mock import patch
 
+import anthropic
 import cloudpickle
 import httpx
 import openai
@@ -33,20 +36,27 @@ from typing_extensions import ParamSpec
 
 import vllm.envs as envs
 from tests.models.utils import TextTextLogprobs
-from vllm.distributed import (ensure_model_parallel_initialized,
-                              init_distributed_environment)
+from vllm.distributed import (
+    ensure_model_parallel_initialized,
+    init_distributed_environment,
+)
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import (FlexibleArgumentParser, GB_bytes,
-                        cuda_device_count_stateless, get_open_port)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.mem_constants import GB_bytes
+from vllm.utils.network_utils import get_open_port
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 if current_platform.is_rocm():
-    from amdsmi import (amdsmi_get_gpu_vram_usage,
-                        amdsmi_get_processor_handles, amdsmi_init,
-                        amdsmi_shut_down)
+    from amdsmi import (
+        amdsmi_get_gpu_vram_usage,
+        amdsmi_get_processor_handles,
+        amdsmi_init,
+        amdsmi_shut_down,
+    )
 
     @contextmanager
     def _nvml():
@@ -56,9 +66,12 @@ if current_platform.is_rocm():
         finally:
             amdsmi_shut_down()
 elif current_platform.is_cuda():
-    from vllm.third_party.pynvml import (nvmlDeviceGetHandleByIndex,
-                                         nvmlDeviceGetMemoryInfo, nvmlInit,
-                                         nvmlShutdown)
+    from vllm.third_party.pynvml import (
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlInit,
+        nvmlShutdown,
+    )
 
     @contextmanager
     def _nvml():
@@ -81,58 +94,61 @@ VLLM_PATH = Path(__file__).parent.parent
 class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
 
-    def _start_server(self, model: str, vllm_serve_args: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(
+        self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
+    ) -> None:
+        """Subclasses override this method to customize server process launch"""
         env = os.environ.copy()
         # the current process might initialize cuda,
         # to be safe, we should use spawn method
-        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
         if env_dict is not None:
             env.update(env_dict)
+        serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
+        print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
         self.proc: subprocess.Popen = subprocess.Popen(
-            ["vllm", "serve", model, *vllm_serve_args],
+            serve_cmd,
             env=env,
             stdout=sys.stdout,
             stderr=sys.stderr,
         )
 
-    def __init__(self,
-                 model: str,
-                 vllm_serve_args: list[str],
-                 *,
-                 env_dict: Optional[dict[str, str]] = None,
-                 seed: Optional[int] = 0,
-                 auto_port: bool = True,
-                 max_wait_seconds: Optional[float] = None,
-                 override_hf_configs: Optional[dict[str, Any]] = None) -> None:
+    def __init__(
+        self,
+        model: str,
+        vllm_serve_args: list[str],
+        *,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = 0,
+        auto_port: bool = True,
+        max_wait_seconds: float | None = None,
+        override_hf_configs: dict[str, Any] | None = None,
+    ) -> None:
         if auto_port:
             if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError("You have manually specified the port "
-                                 "when `auto_port=True`.")
+                raise ValueError(
+                    "You have manually specified the port when `auto_port=True`."
+                )
 
             # No need for a port if using unix sockets
             if "--uds" not in vllm_serve_args:
                 # Don't mutate the input args
-                vllm_serve_args = vllm_serve_args + [
-                    "--port", str(get_open_port())
-                ]
+                vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
         if seed is not None:
             if "--seed" in vllm_serve_args:
-                raise ValueError("You have manually specified the seed "
-                                 f"when `seed={seed}`.")
+                raise ValueError(
+                    f"You have manually specified the seed when `seed={seed}`."
+                )
 
             vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
 
         if override_hf_configs is not None:
             vllm_serve_args = vllm_serve_args + [
                 "--hf-overrides",
-                json.dumps(override_hf_configs)
+                json.dumps(override_hf_configs),
             ]
 
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
+        parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
         subparsers = parser.add_subparsers(required=False, dest="subparser")
         parser = ServeSubcommand().subparser_init(subparsers)
         args = parser.parse_args(["--model", model, *vllm_serve_args])
@@ -141,11 +157,10 @@ class RemoteOpenAIServer:
             self.host = None
             self.port = None
         else:
-            self.host = str(args.host or 'localhost')
+            self.host = str(args.host or "127.0.0.1")
             self.port = int(args.port)
 
-        self.show_hidden_metrics = \
-            args.show_hidden_metrics_for_version is not None
+        self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None
 
         # download the model before starting the server to avoid timeout
         is_local = os.path.isdir(model)
@@ -159,8 +174,7 @@ class RemoteOpenAIServer:
 
         self._start_server(model, vllm_serve_args, env_dict)
         max_wait_seconds = max_wait_seconds or 240
-        self._wait_for_server(url=self.url_for("health"),
-                              timeout=max_wait_seconds)
+        self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)
 
     def __enter__(self):
         return self
@@ -173,15 +187,18 @@ class RemoteOpenAIServer:
             # force kill if needed
             self.proc.kill()
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         """Subclasses override this method to customize process polling"""
         return self.proc.poll()
 
     def _wait_for_server(self, *, url: str, timeout: float):
         # run health check
         start = time.time()
-        client = (httpx.Client(transport=httpx.HTTPTransport(
-            uds=self.uds)) if self.uds else requests)
+        client = (
+            httpx.Client(transport=httpx.HTTPTransport(uds=self.uds))
+            if self.uds
+            else requests
+        )
         while True:
             try:
                 if client.get(url).status_code == 200:
@@ -197,13 +214,15 @@ class RemoteOpenAIServer:
 
                 time.sleep(0.5)
                 if time.time() - start > timeout:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from None
+                    raise RuntimeError("Server failed to start in time.") from None
 
     @property
     def url_root(self) -> str:
-        return (f"http://{self.uds.split('/')[-1]}"
-                if self.uds else f"http://{self.host}:{self.port}")
+        return (
+            f"http://{self.uds.split('/')[-1]}"
+            if self.uds
+            else f"http://{self.host}:{self.port}"
+        )
 
     def url_for(self, *parts: str) -> str:
         return self.url_root + "/" + "/".join(parts)
@@ -221,44 +240,66 @@ class RemoteOpenAIServer:
     def get_async_client(self, **kwargs):
         if "timeout" not in kwargs:
             kwargs["timeout"] = 600
-        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
-                                  api_key=self.DUMMY_API_KEY,
-                                  max_retries=0,
-                                  **kwargs)
+        return openai.AsyncOpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
+        )
+
+    def get_client_anthropic(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return anthropic.Anthropic(
+            base_url=self.url_for(),
+            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
+        )
+
+    def get_async_client_anthropic(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return anthropic.AsyncAnthropic(
+            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
+        )
 
 
 class RemoteOpenAIServerCustom(RemoteOpenAIServer):
     """Launch test server with custom child process"""
 
-    def _start_server(self, model: str, vllm_serve_args: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
+    def _start_server(
+        self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
+    ) -> None:
         self.proc: Process = Process(
-            target=self.child_process_fxn,
-            args=(env_dict, model,
-                  vllm_serve_args))  # type: ignore[assignment]
+            target=self.child_process_fxn, args=(env_dict, model, vllm_serve_args)
+        )  # type: ignore[assignment]
         self.proc.start()
 
-    def __init__(self,
-                 model: str,
-                 vllm_serve_args: list[str],
-                 child_process_fxn: Callable[
-                     [Optional[dict[str, str]], str, list[str]], None],
-                 *,
-                 env_dict: Optional[dict[str, str]] = None,
-                 seed: Optional[int] = 0,
-                 auto_port: bool = True,
-                 max_wait_seconds: Optional[float] = None) -> None:
+    def __init__(
+        self,
+        model: str,
+        vllm_serve_args: list[str],
+        child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None],
+        *,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = 0,
+        auto_port: bool = True,
+        max_wait_seconds: float | None = None,
+    ) -> None:
         """Store custom child process function then invoke superclass
         constructor which will indirectly launch it."""
         self.child_process_fxn = child_process_fxn
-        super().__init__(model=model,
-                         vllm_serve_args=vllm_serve_args,
-                         env_dict=env_dict,
-                         seed=seed,
-                         auto_port=auto_port,
-                         max_wait_seconds=max_wait_seconds)
-
-    def _poll(self) -> Optional[int]:
+        super().__init__(
+            model=model,
+            vllm_serve_args=vllm_serve_args,
+            env_dict=env_dict,
+            seed=seed,
+            auto_port=auto_port,
+            max_wait_seconds=max_wait_seconds,
+        )
+
+    def _poll(self) -> int | None:
         return self.proc.exitcode
 
     def __exit__(self, exc_type, exc_value, traceback):
@@ -278,17 +319,18 @@ def _test_completion(
     results = []
 
     # test with text prompt
-    completion = client.completions.create(model=model,
-                                           prompt=prompt,
-                                           max_tokens=5,
-                                           temperature=0.0)
-
-    results.append({
-        "test": "single_completion",
-        "text": completion.choices[0].text,
-        "finish_reason": completion.choices[0].finish_reason,
-        "usage": completion.usage,
-    })
+    completion = client.completions.create(
+        model=model, prompt=prompt, max_tokens=5, temperature=0.0
+    )
+
+    results.append(
+        {
+            "test": "single_completion",
+            "text": completion.choices[0].text,
+            "finish_reason": completion.choices[0].finish_reason,
+            "usage": completion.usage,
+        }
+    )
 
     # test using token IDs
     completion = client.completions.create(
@@ -298,43 +340,42 @@ def _test_completion(
         temperature=0.0,
     )
 
-    results.append({
-        "test": "token_ids",
-        "text": completion.choices[0].text,
-        "finish_reason": completion.choices[0].finish_reason,
-        "usage": completion.usage,
-    })
+    results.append(
+        {
+            "test": "token_ids",
+            "text": completion.choices[0].text,
+            "finish_reason": completion.choices[0].finish_reason,
+            "usage": completion.usage,
+        }
+    )
 
     # test seeded random sampling
-    completion = client.completions.create(model=model,
-                                           prompt=prompt,
-                                           max_tokens=5,
-                                           seed=33,
-                                           temperature=1.0)
-
-    results.append({
-        "test": "seeded_sampling",
-        "text": completion.choices[0].text,
-        "finish_reason": completion.choices[0].finish_reason,
-        "usage": completion.usage,
-    })
+    completion = client.completions.create(
+        model=model, prompt=prompt, max_tokens=5, seed=33, temperature=1.0
+    )
+
+    results.append(
+        {
+            "test": "seeded_sampling",
+            "text": completion.choices[0].text,
+            "finish_reason": completion.choices[0].finish_reason,
+            "usage": completion.usage,
+        }
+    )
 
     # test seeded random sampling with multiple prompts
-    completion = client.completions.create(model=model,
-                                           prompt=[prompt, prompt],
-                                           max_tokens=5,
-                                           seed=33,
-                                           temperature=1.0)
-
-    results.append({
-        "test":
-        "seeded_sampling",
-        "text": [choice.text for choice in completion.choices],
-        "finish_reason":
-        [choice.finish_reason for choice in completion.choices],
-        "usage":
-        completion.usage,
-    })
+    completion = client.completions.create(
+        model=model, prompt=[prompt, prompt], max_tokens=5, seed=33, temperature=1.0
+    )
+
+    results.append(
+        {
+            "test": "seeded_sampling",
+            "text": [choice.text for choice in completion.choices],
+            "finish_reason": [choice.finish_reason for choice in completion.choices],
+            "usage": completion.usage,
+        }
+    )
 
     # test simple list
     batch = client.completions.create(
@@ -344,11 +385,13 @@ def _test_completion(
         temperature=0.0,
     )
 
-    results.append({
-        "test": "simple_list",
-        "text0": batch.choices[0].text,
-        "text1": batch.choices[1].text,
-    })
+    results.append(
+        {
+            "test": "simple_list",
+            "text0": batch.choices[0].text,
+            "text1": batch.choices[1].text,
+        }
+    )
 
     # test streaming
     batch = client.completions.create(
@@ -365,10 +408,12 @@ def _test_completion(
         choice = chunk.choices[0]
         texts[choice.index] += choice.text
 
-    results.append({
-        "test": "streaming",
-        "texts": texts,
-    })
+    results.append(
+        {
+            "test": "streaming",
+            "texts": texts,
+        }
+    )
 
     return results
 
@@ -381,19 +426,19 @@ def _test_completion_close(
     results = []
 
     # test with text prompt
-    completion = client.completions.create(model=model,
-                                           prompt=prompt,
-                                           max_tokens=1,
-                                           logprobs=5,
-                                           temperature=0.0)
+    completion = client.completions.create(
+        model=model, prompt=prompt, max_tokens=1, logprobs=5, temperature=0.0
+    )
 
     logprobs = completion.choices[0].logprobs.top_logprobs[0]
     logprobs = {k: round(v, 2) for k, v in logprobs.items()}
 
-    results.append({
-        "test": "completion_close",
-        "logprobs": logprobs,
-    })
+    results.append(
+        {
+            "test": "completion_close",
+            "logprobs": logprobs,
+        }
+    )
 
     return results
 
@@ -405,26 +450,21 @@ def _test_chat(
 ):
     results = []
 
-    messages = [{
-        "role": "user",
-        "content": [{
-            "type": "text",
-            "text": prompt
-        }]
-    }]
+    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
 
     # test with text prompt
-    chat_response = client.chat.completions.create(model=model,
-                                                   messages=messages,
-                                                   max_tokens=5,
-                                                   temperature=0.0)
-
-    results.append({
-        "test": "completion_close",
-        "text": chat_response.choices[0].message.content,
-        "finish_reason": chat_response.choices[0].finish_reason,
-        "usage": chat_response.usage,
-    })
+    chat_response = client.chat.completions.create(
+        model=model, messages=messages, max_tokens=5, temperature=0.0
+    )
+
+    results.append(
+        {
+            "test": "completion_close",
+            "text": chat_response.choices[0].message.content,
+            "finish_reason": chat_response.choices[0].finish_reason,
+            "usage": chat_response.usage,
+        }
+    )
 
     return results
 
@@ -443,11 +483,13 @@ def _test_embeddings(
         encoding_format="float",
     )
 
-    results.append({
-        "test": "single_embedding",
-        "embedding": embeddings.data[0].embedding,
-        "usage": embeddings.usage,
-    })
+    results.append(
+        {
+            "test": "single_embedding",
+            "embedding": embeddings.data[0].embedding,
+            "usage": embeddings.usage,
+        }
+    )
 
     return results
 
@@ -460,74 +502,75 @@ def _test_image_text(
     results = []
 
     # test pure text input
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "How do you feel today?"
-            },
-        ],
-    }]
-
-    chat_completion = client.chat.completions.create(model=model_name,
-                                                     messages=messages,
-                                                     temperature=0.0,
-                                                     max_tokens=1,
-                                                     logprobs=True,
-                                                     top_logprobs=5)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "How do you feel today?"},
+            ],
+        }
+    ]
+
+    chat_completion = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        temperature=0.0,
+        max_tokens=1,
+        logprobs=True,
+        top_logprobs=5,
+    )
     top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
 
     for x in top_logprobs:
         x.logprob = round(x.logprob, 2)
 
-    results.append({
-        "test": "pure_text",
-        "logprobs": top_logprobs,
-    })
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
-
-    chat_completion = client.chat.completions.create(model=model_name,
-                                                     messages=messages,
-                                                     temperature=0.0,
-                                                     max_tokens=1,
-                                                     logprobs=True,
-                                                     top_logprobs=5)
+    results.append(
+        {
+            "test": "pure_text",
+            "logprobs": top_logprobs,
+        }
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": "What's in this image?"},
+            ],
+        }
+    ]
+
+    chat_completion = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        temperature=0.0,
+        max_tokens=1,
+        logprobs=True,
+        top_logprobs=5,
+    )
     top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
 
-    results.append({
-        "test": "text_image",
-        "logprobs": top_logprobs,
-    })
+    results.append(
+        {
+            "test": "text_image",
+            "logprobs": top_logprobs,
+        }
+    )
 
     return results
 
 
-def compare_two_settings(model: str,
-                         arg1: list[str],
-                         arg2: list[str],
-                         env1: Optional[dict[str, str]] = None,
-                         env2: Optional[dict[str, str]] = None,
-                         *,
-                         method: str = "generate",
-                         max_wait_seconds: Optional[float] = None) -> None:
+def compare_two_settings(
+    model: str,
+    arg1: list[str],
+    arg2: list[str],
+    env1: dict[str, str] | None = None,
+    env2: dict[str, str] | None = None,
+    *,
+    method: str = "generate",
+    max_wait_seconds: float | None = None,
+) -> None:
     """
     Launch API server with two different sets of arguments/environments
     and compare the results of the API calls.
@@ -549,12 +592,14 @@ def compare_two_settings(model: str,
     )
 
 
-def compare_all_settings(model: str,
-                         all_args: list[list[str]],
-                         all_envs: list[Optional[dict[str, str]]],
-                         *,
-                         method: str = "generate",
-                         max_wait_seconds: Optional[float] = None) -> None:
+def compare_all_settings(
+    model: str,
+    all_args: list[list[str]],
+    all_envs: list[dict[str, str] | None],
+    *,
+    method: str = "generate",
+    max_wait_seconds: float | None = None,
+) -> None:
     """
     Launch API server with several different sets of arguments/environments
     and compare the results of the API calls with the first set of arguments.
@@ -604,21 +649,22 @@ def compare_all_settings(model: str,
             args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT]
         compare_results: list = []
         results = ref_results if i == 0 else compare_results
-        with RemoteOpenAIServer(model,
-                                args,
-                                env_dict=env,
-                                max_wait_seconds=max_wait_seconds) as server:
+        with RemoteOpenAIServer(
+            model, args, env_dict=env, max_wait_seconds=max_wait_seconds
+        ) as server:
             client = server.get_client()
 
             # test models list
             models = client.models.list()
             models = models.data
             served_model = models[0]
-            results.append({
-                "test": "models_list",
-                "id": served_model.id,
-                "root": served_model.root,
-            })
+            results.append(
+                {
+                    "test": "models_list",
+                    "id": served_model.id,
+                    "root": served_model.root,
+                }
+            )
 
             if method == "generate":
                 results += _test_completion(client, model, prompt, token_ids)
@@ -628,8 +674,9 @@ def compare_all_settings(model: str,
                 results += _test_chat(client, model, prompt)
             elif method == "generate_with_image":
                 results += _test_image_text(
-                    client, model,
-                    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
+                    client,
+                    model,
+                    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
                 )
             elif method == "encode":
                 results += _test_embeddings(client, model, prompt)
@@ -642,8 +689,7 @@ def compare_all_settings(model: str,
                 ref_envs = all_envs[0]
                 compare_args = all_args[i]
                 compare_envs = all_envs[i]
-                for ref_result, compare_result in zip(ref_results,
-                                                      compare_results):
+                for ref_result, compare_result in zip(ref_results, compare_results):
                     ref_result = copy.deepcopy(ref_result)
                     compare_result = copy.deepcopy(compare_result)
                     if "embedding" in ref_result and method == "encode":
@@ -654,7 +700,8 @@ def compare_all_settings(model: str,
                         )
                         assert sim >= 0.999, (
                             f"Embedding for {model=} are not the same.\n"
-                            f"cosine_similarity={sim}\n")
+                            f"cosine_similarity={sim}\n"
+                        )
                         del ref_result["embedding"]
                         del compare_result["embedding"]
                     assert ref_result == compare_result, (
@@ -662,7 +709,8 @@ def compare_all_settings(model: str,
                         f"{ref_args=} {ref_envs=}\n"
                         f"{compare_args=} {compare_envs=}\n"
                         f"{ref_result=}\n"
-                        f"{compare_result=}\n")
+                        f"{compare_result=}\n"
+                    )
 
 
 def init_test_distributed_environment(
@@ -677,7 +725,8 @@ def init_test_distributed_environment(
         world_size=pp_size * tp_size,
         rank=rank,
         distributed_init_method=distributed_init_method,
-        local_rank=local_rank)
+        local_rank=local_rank,
+    )
     ensure_model_parallel_initialized(tp_size, pp_size)
 
 
@@ -699,13 +748,17 @@ def multi_process_parallel(
     os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1"
     ray.init(
         runtime_env={
-            "working_dir":
-            VLLM_PATH,
+            "working_dir": VLLM_PATH,
             "excludes": [
-                "build", ".git", "cmake-build-*", "shellcheck", "dist",
-                "ep_kernels_workspace"
-            ]
-        })
+                "build",
+                ".git",
+                "cmake-build-*",
+                "shellcheck",
+                "dist",
+                "ep_kernels_workspace",
+            ],
+        }
+    )
 
     distributed_init_port = get_open_port()
     refs = []
@@ -717,7 +770,8 @@ def multi_process_parallel(
                 pp_size,
                 rank,
                 distributed_init_port,
-            ), )
+            ),
+        )
     ray.get(refs)
 
     ray.shutdown()
@@ -746,11 +800,13 @@ def get_physical_device_indices(devices):
 
 
 @_nvml()
-def wait_for_gpu_memory_to_clear(*,
-                                 devices: list[int],
-                                 threshold_bytes: Optional[int] = None,
-                                 threshold_ratio: Optional[float] = None,
-                                 timeout_s: float = 120) -> None:
+def wait_for_gpu_memory_to_clear(
+    *,
+    devices: list[int],
+    threshold_bytes: int | None = None,
+    threshold_ratio: float | None = None,
+    timeout_s: float = 120,
+) -> None:
     assert threshold_bytes is not None or threshold_ratio is not None
     # Use nvml instead of pytorch to reduce measurement error from torch cuda
     # context.
@@ -771,29 +827,33 @@ def wait_for_gpu_memory_to_clear(*,
                 gb_used = mem_info.used / 2**30
                 gb_total = mem_info.total / 2**30
             output_raw[device] = (gb_used, gb_total)
-            output[device] = f'{gb_used:.02f}/{gb_total:.02f}'
+            output[device] = f"{gb_used:.02f}/{gb_total:.02f}"
 
-        print('gpu memory used/total (GiB): ', end='')
+        print("gpu memory used/total (GiB): ", end="")
         for k, v in output.items():
-            print(f'{k}={v}; ', end='')
-        print('')
+            print(f"{k}={v}; ", end="")
+        print("")
 
         if threshold_bytes is not None:
             is_free = lambda used, total: used <= threshold_bytes / 2**30
-            threshold = f"{threshold_bytes/2**30} GiB"
+            threshold = f"{threshold_bytes / 2**30} GiB"
         else:
             is_free = lambda used, total: used / total <= threshold_ratio
             threshold = f"{threshold_ratio:.2f}"
 
         dur_s = time.time() - start_time
         if all(is_free(used, total) for used, total in output_raw.values()):
-            print(f'Done waiting for free GPU memory on devices {devices=} '
-                  f'({threshold=}) {dur_s=:.02f}')
+            print(
+                f"Done waiting for free GPU memory on devices {devices=} "
+                f"({threshold=}) {dur_s=:.02f}"
+            )
             break
 
         if dur_s >= timeout_s:
-            raise ValueError(f'Memory of devices {devices=} not free after '
-                             f'{dur_s=:.02f} ({threshold=})')
+            raise ValueError(
+                f"Memory of devices {devices=} not free after "
+                f"{dur_s=:.02f} ({threshold=})"
+            )
 
         time.sleep(5)
 
@@ -801,8 +861,7 @@ def wait_for_gpu_memory_to_clear(*,
 _P = ParamSpec("_P")
 
 
-def fork_new_process_for_each_test(
-        func: Callable[_P, None]) -> Callable[_P, None]:
+def fork_new_process_for_each_test(func: Callable[_P, None]) -> Callable[_P, None]:
     """Decorator to fork a new process for each test function.
     See https://github.com/vllm-project/vllm/issues/7053 for more details.
     """
@@ -816,11 +875,15 @@ def fork_new_process_for_each_test(
 
         # Create a unique temporary file to store exception info from child
         # process. Use test function name and process ID to avoid collisions.
-        with tempfile.NamedTemporaryFile(
+        with (
+            tempfile.NamedTemporaryFile(
                 delete=False,
-                mode='w+b',
+                mode="w+b",
                 prefix=f"vllm_test_{func.__name__}_{os.getpid()}_",
-                suffix=".exc") as exc_file, ExitStack() as delete_after:
+                suffix=".exc",
+            ) as exc_file,
+            ExitStack() as delete_after,
+        ):
             exc_file_path = exc_file.name
             delete_after.callback(os.remove, exc_file_path)
 
@@ -838,6 +901,7 @@ def fork_new_process_for_each_test(
                     os._exit(0)
                 except Exception as e:
                     import traceback
+
                     tb_string = traceback.format_exc()
 
                     # Try to serialize the exception object first
@@ -845,18 +909,18 @@ def fork_new_process_for_each_test(
                     try:
                         # First, try to pickle the actual exception with
                         # its traceback.
-                        exc_to_serialize = {'pickled_exception': e}
+                        exc_to_serialize = {"pickled_exception": e}
                         # Test if it can be pickled
                         cloudpickle.dumps(exc_to_serialize)
                     except (Exception, KeyboardInterrupt):
                         # Fall back to string-based approach.
                         exc_to_serialize = {
-                            'exception_type': type(e).__name__,
-                            'exception_msg': str(e),
-                            'traceback': tb_string,
+                            "exception_type": type(e).__name__,
+                            "exception_msg": str(e),
+                            "traceback": tb_string,
                         }
                     try:
-                        with open(exc_file_path, 'wb') as f:
+                        with open(exc_file_path, "wb") as f:
                             cloudpickle.dump(exc_to_serialize, f)
                     except Exception:
                         # Fallback: just print the traceback.
@@ -868,8 +932,7 @@ def fork_new_process_for_each_test(
                 pgid = os.getpgid(pid)
                 _pid, _exitcode = os.waitpid(pid, 0)
                 # ignore SIGTERM signal itself
-                old_signal_handler = signal.signal(signal.SIGTERM,
-                                                   signal.SIG_IGN)
+                old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
                 # kill all child processes
                 os.killpg(pgid, signal.SIGTERM)
                 # restore the signal handler
@@ -878,12 +941,15 @@ def fork_new_process_for_each_test(
                     # Try to read the exception from the child process
                     exc_info = {}
                     if os.path.exists(exc_file_path):
-                        with contextlib.suppress(Exception), \
-                            open(exc_file_path, 'rb') as f:
+                        with (
+                            contextlib.suppress(Exception),
+                            open(exc_file_path, "rb") as f,
+                        ):
                             exc_info = cloudpickle.load(f)
 
-                    if (original_exception :=
-                            exc_info.get('pickled_exception')) is not None:
+                    if (
+                        original_exception := exc_info.get("pickled_exception")
+                    ) is not None:
                         # Re-raise the actual exception object if it was
                         # successfully pickled.
                         assert isinstance(original_exception, Exception)
@@ -901,33 +967,33 @@ def fork_new_process_for_each_test(
                     raise AssertionError(
                         f"function {func.__name__} failed when called with"
                         f" args {args} and kwargs {kwargs}"
-                        f" (exit code: {_exitcode})") from None
+                        f" (exit code: {_exitcode})"
+                    ) from None
 
     return wrapper
 
 
-def spawn_new_process_for_each_test(
-        f: Callable[_P, None]) -> Callable[_P, None]:
-    """Decorator to spawn a new process for each test function.
-    """
+def spawn_new_process_for_each_test(f: Callable[_P, None]) -> Callable[_P, None]:
+    """Decorator to spawn a new process for each test function."""
 
     @functools.wraps(f)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
         # Check if we're already in a subprocess
-        if os.environ.get('RUNNING_IN_SUBPROCESS') == '1':
+        if os.environ.get("RUNNING_IN_SUBPROCESS") == "1":
             # If we are, just run the function directly
             return f(*args, **kwargs)
 
         import torch.multiprocessing as mp
+
         with suppress(RuntimeError):
-            mp.set_start_method('spawn')
+            mp.set_start_method("spawn")
 
         # Get the module
         module_name = f.__module__
 
         # Create a process with environment variable set
         env = os.environ.copy()
-        env['RUNNING_IN_SUBPROCESS'] = '1'
+        env["RUNNING_IN_SUBPROCESS"] = "1"
 
         with tempfile.TemporaryDirectory() as tempdir:
             output_filepath = os.path.join(tempdir, "new_process.tmp")
@@ -935,31 +1001,36 @@ def spawn_new_process_for_each_test(
             # `cloudpickle` allows pickling complex functions directly
             input_bytes = cloudpickle.dumps((f, output_filepath))
 
+            repo_root = str(VLLM_PATH.resolve())
+
+            env = dict(env or os.environ)
+            env["PYTHONPATH"] = repo_root + os.pathsep + env.get("PYTHONPATH", "")
+
             cmd = [sys.executable, "-m", f"{module_name}"]
 
-            returned = subprocess.run(cmd,
-                                      input=input_bytes,
-                                      capture_output=True,
-                                      env=env)
+            returned = subprocess.run(
+                cmd, input=input_bytes, capture_output=True, env=env
+            )
 
             # check if the subprocess is successful
             try:
                 returned.check_returncode()
             except Exception as e:
                 # wrap raised exception to provide more information
-                raise RuntimeError(f"Error raised in subprocess:\n"
-                                   f"{returned.stderr.decode()}") from e
+                raise RuntimeError(
+                    f"Error raised in subprocess:\n{returned.stderr.decode()}"
+                ) from e
 
     return wrapper
 
 
 def create_new_process_for_each_test(
-    method: Optional[Literal["spawn", "fork"]] = None
+    method: Literal["spawn", "fork"] | None = None,
 ) -> Callable[[Callable[_P, None]], Callable[_P, None]]:
     """Creates a decorator that runs each test function in a new process.
 
     Args:
-        method: The process creation method. Can be either "spawn" or "fork". 
+        method: The process creation method. Can be either "spawn" or "fork".
                If not specified, it defaults to "spawn" on ROCm and XPU
                platforms and "fork" otherwise.
 
@@ -970,8 +1041,7 @@ def create_new_process_for_each_test(
         use_spawn = current_platform.is_rocm() or current_platform.is_xpu()
         method = "spawn" if use_spawn else "fork"
 
-    assert method in ["spawn",
-                      "fork"], "Method must be either 'spawn' or 'fork'"
+    assert method in ["spawn", "fork"], "Method must be either 'spawn' or 'fork'"
 
     if method == "fork":
         return fork_new_process_for_each_test
@@ -1051,11 +1121,11 @@ async def completions_with_server_args(
     prompts: list[str],
     model_name: str,
     server_cli_args: list[str],
-    num_logprobs: Optional[int],
+    num_logprobs: int | None,
     max_wait_seconds: int = 240,
-    max_tokens: Union[int, list] = 5,
+    max_tokens: int | list = 5,
 ) -> list[Completion]:
-    '''Construct a remote OpenAI server, obtain an async client to the
+    """Construct a remote OpenAI server, obtain an async client to the
     server & invoke the completions API to obtain completions.
 
     Args:
@@ -1071,7 +1141,7 @@ async def completions_with_server_args(
 
     Returns:
       OpenAI Completion instance
-    '''
+    """
 
     if isinstance(max_tokens, int):
         max_tokens = [max_tokens] * len(prompts)
@@ -1079,17 +1149,21 @@ async def completions_with_server_args(
     assert len(max_tokens) == len(prompts)
 
     outputs = None
-    with RemoteOpenAIServer(model_name,
-                            server_cli_args,
-                            max_wait_seconds=max_wait_seconds) as server:
+    with RemoteOpenAIServer(
+        model_name, server_cli_args, max_wait_seconds=max_wait_seconds
+    ) as server:
         client = server.get_async_client()
-        outputs = [ client.completions.create(model=model_name,
-                                              prompt=[p],
-                                              temperature=0,
-                                              stream=False,
-                                              max_tokens=max_tok,
-                                              logprobs=num_logprobs) \
-                    for p, max_tok in zip(prompts, max_tokens) ]
+        outputs = [
+            client.completions.create(
+                model=model_name,
+                prompt=[p],
+                temperature=0,
+                stream=False,
+                max_tokens=max_tok,
+                logprobs=num_logprobs,
+            )
+            for p, max_tok in zip(prompts, max_tokens)
+        ]
         outputs = await asyncio.gather(*outputs)
 
     assert outputs is not None, "Completion API call failed."
@@ -1098,24 +1172,31 @@ async def completions_with_server_args(
 
 
 def get_client_text_generations(completions: list[Completion]) -> list[str]:
-    '''Extract generated tokens from the output of a
+    """Extract generated tokens from the output of a
     request made to an Open-AI-protocol completions endpoint.
-    '''
+    """
     assert all([len(x.choices) == 1 for x in completions])
     return [x.choices[0].text for x in completions]
 
 
 def get_client_text_logprob_generations(
-        completions: list[Completion]) -> list[TextTextLogprobs]:
-    '''Operates on the output of a request made to an Open-AI-protocol
+    completions: list[Completion],
+) -> list[TextTextLogprobs]:
+    """Operates on the output of a request made to an Open-AI-protocol
     completions endpoint; obtains top-rank logprobs for each token in
     each {class}`SequenceGroup`
-    '''
+    """
     text_generations = get_client_text_generations(completions)
-    text = ''.join(text_generations)
-    return [(text_generations, text,
-             (None if x.logprobs is None else x.logprobs.top_logprobs))
-            for completion in completions for x in completion.choices]
+    text = "".join(text_generations)
+    return [
+        (
+            text_generations,
+            text,
+            (None if x.logprobs is None else x.logprobs.top_logprobs),
+        )
+        for completion in completions
+        for x in completion.choices
+    ]
 
 
 def has_module_attribute(module_name, attribute_name):
@@ -1136,11 +1217,14 @@ def get_attn_backend_list_based_on_platform() -> list[str]:
         attn_backend_list = ["TRITON_ATTN"]
         try:
             import aiter  # noqa: F401
+
             attn_backend_list.append("FLASH_ATTN")
         except Exception:
             print("Skip FLASH_ATTN on ROCm as aiter is not installed")
 
         return attn_backend_list
+    elif current_platform.is_xpu():
+        return ["FLASH_ATTN", "TRITON_ATTN"]
     else:
         raise ValueError("Unsupported platform")
 
@@ -1148,8 +1232,9 @@ def get_attn_backend_list_based_on_platform() -> list[str]:
 @contextmanager
 def override_cutlass_fp8_supported(value: bool):
     with patch(
-            "vllm.model_executor.layers.quantization.utils.w8a8_utils.cutlass_fp8_supported",
-            return_value=value):
+        "vllm.model_executor.layers.quantization.utils.w8a8_utils.cutlass_fp8_supported",
+        return_value=value,
+    ):
         yield
 
 
@@ -1170,8 +1255,10 @@ def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
     for _ in range(batch_size):
         idx = random.randint(30, 90)
         indices.append(idx)
-        prompt = "```python\n# We set a number of variables, " + \
-                 f"x{idx} will be important later\n"
+        prompt = (
+            "```python\n# We set a number of variables, "
+            + f"x{idx} will be important later\n"
+        )
         ln = random.randint(*ln_range)
         for k in range(30, ln):
             v = random.randint(10, 99)
@@ -1184,10 +1271,9 @@ def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
     return prompts, answer, indices
 
 
-def check_answers(indices: list[int],
-                  answer: list[int],
-                  outputs: list[str],
-                  accept_rate: float = 0.7):
+def check_answers(
+    indices: list[int], answer: list[int], outputs: list[str], accept_rate: float = 0.7
+):
     answer2 = [int(text[0:2].strip()) for text in outputs]
     print(list(zip(indices, zip(answer, answer2))))
     numok = 0
@@ -1197,3 +1283,23 @@ def check_answers(indices: list[int],
     frac_ok = numok / len(answer)
     print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
     assert frac_ok >= accept_rate
+
+
+def flat_product(*iterables: Iterable[Any]):
+    """
+    Flatten lists of tuples of the cartesian product.
+    Useful when we want to avoid nested tuples to allow
+    test params to be unpacked directly from the decorator.
+
+    Example:
+    flat_product([(1, 2), (3, 4)], ["a", "b"]) ->
+    [
+      (1, 2, "a"),
+      (1, 2, "b"),
+      (3, 4, "a"),
+      (3, 4, "b"),
+    ]
+    """
+    for element in itertools.product(*iterables):
+        normalized = (e if isinstance(e, tuple) else (e,) for e in element)
+        yield tuple(itertools.chain(*normalized))
diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3310753d2b6d6973073393647aa57180e73e1b0d
--- /dev/null
+++ b/tests/utils_/test_argparse_utils.py
@@ -0,0 +1,458 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+
+import json
+import os
+
+import pytest
+import yaml
+from transformers import AutoTokenizer
+from pydantic import ValidationError
+
+from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
+
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product
+
+
+# Tests for FlexibleArgumentParser
+@pytest.fixture
+def parser():
+    parser = FlexibleArgumentParser()
+    parser.add_argument(
+        "--image-input-type", choices=["pixel_values", "image_features"]
+    )
+    parser.add_argument("--model-name")
+    parser.add_argument("--batch-size", type=int)
+    parser.add_argument("--enable-feature", action="store_true")
+    parser.add_argument("--hf-overrides", type=json.loads)
+    parser.add_argument("-O", "--compilation-config", type=json.loads)
+    return parser
+
+
+@pytest.fixture
+def parser_with_config():
+    parser = FlexibleArgumentParser()
+    parser.add_argument("serve")
+    parser.add_argument("model_tag", nargs="?")
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--served-model-name", type=str)
+    parser.add_argument("--config", type=str)
+    parser.add_argument("--port", type=int)
+    parser.add_argument("--tensor-parallel-size", type=int)
+    parser.add_argument("--trust-remote-code", action="store_true")
+    return parser
+
+
+def test_underscore_to_dash(parser):
+    args = parser.parse_args(["--image_input_type", "pixel_values"])
+    assert args.image_input_type == "pixel_values"
+
+
+def test_mixed_usage(parser):
+    args = parser.parse_args(
+        ["--image_input_type", "image_features", "--model-name", "facebook/opt-125m"]
+    )
+    assert args.image_input_type == "image_features"
+    assert args.model_name == "facebook/opt-125m"
+
+
+def test_with_equals_sign(parser):
+    args = parser.parse_args(
+        ["--image_input_type=pixel_values", "--model-name=facebook/opt-125m"]
+    )
+    assert args.image_input_type == "pixel_values"
+    assert args.model_name == "facebook/opt-125m"
+
+
+def test_with_int_value(parser):
+    args = parser.parse_args(["--batch_size", "32"])
+    assert args.batch_size == 32
+    args = parser.parse_args(["--batch-size", "32"])
+    assert args.batch_size == 32
+
+
+def test_with_bool_flag(parser):
+    args = parser.parse_args(["--enable_feature"])
+    assert args.enable_feature is True
+    args = parser.parse_args(["--enable-feature"])
+    assert args.enable_feature is True
+
+
+def test_invalid_choice(parser):
+    with pytest.raises(SystemExit):
+        parser.parse_args(["--image_input_type", "invalid_choice"])
+
+
+def test_missing_required_argument(parser):
+    parser.add_argument("--required-arg", required=True)
+    with pytest.raises(SystemExit):
+        parser.parse_args([])
+
+
+def test_cli_override_to_config(parser_with_config, cli_config_file):
+    args = parser_with_config.parse_args(
+        ["serve", "mymodel", "--config", cli_config_file, "--tensor-parallel-size", "3"]
+    )
+    assert args.tensor_parallel_size == 3
+    args = parser_with_config.parse_args(
+        ["serve", "mymodel", "--tensor-parallel-size", "3", "--config", cli_config_file]
+    )
+    assert args.tensor_parallel_size == 3
+    assert args.port == 12312
+    args = parser_with_config.parse_args(
+        [
+            "serve",
+            "mymodel",
+            "--tensor-parallel-size",
+            "3",
+            "--config",
+            cli_config_file,
+            "--port",
+            "666",
+        ]
+    )
+    assert args.tensor_parallel_size == 3
+    assert args.port == 666
+
+
+def test_config_args(parser_with_config, cli_config_file):
+    args = parser_with_config.parse_args(
+        ["serve", "mymodel", "--config", cli_config_file]
+    )
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code
+
+
+def test_config_file(parser_with_config):
+    with pytest.raises(FileNotFoundError):
+        parser_with_config.parse_args(
+            ["serve", "mymodel", "--config", "test_config.yml"]
+        )
+
+    with pytest.raises(ValueError):
+        parser_with_config.parse_args(
+            ["serve", "mymodel", "--config", "./data/test_config.json"]
+        )
+
+    with pytest.raises(ValueError):
+        parser_with_config.parse_args(
+            [
+                "serve",
+                "mymodel",
+                "--tensor-parallel-size",
+                "3",
+                "--config",
+                "--batch-size",
+                "32",
+            ]
+        )
+
+
+def test_no_model_tag(parser_with_config, cli_config_file):
+    with pytest.raises(ValueError):
+        parser_with_config.parse_args(["serve", "--config", cli_config_file])
+
+
+def test_dict_args(parser):
+    args = [
+        "--model-name=something.something",
+        "--hf-overrides.key1",
+        "val1",
+        # Test nesting
+        "--hf-overrides.key2.key3",
+        "val2",
+        "--hf-overrides.key2.key4",
+        "val3",
+        # Test compile config and compilation mode
+        "-O.use_inductor=true",
+        "-O.backend",
+        "custom",
+        "-O1",
+        # Test = sign
+        "--hf-overrides.key5=val4",
+        # Test underscore to dash conversion
+        "--hf_overrides.key_6",
+        "val5",
+        "--hf_overrides.key-7.key_8",
+        "val6",
+        # Test data type detection
+        "--hf_overrides.key9",
+        "100",
+        "--hf_overrides.key10",
+        "100.0",
+        "--hf_overrides.key11",
+        "true",
+        "--hf_overrides.key12.key13",
+        "null",
+        # Test '-' and '.' in value
+        "--hf_overrides.key14.key15",
+        "-minus.and.dot",
+        # Test array values
+        "-O.custom_ops+",
+        "-quant_fp8",
+        "-O.custom_ops+=+silu_mul,-rms_norm",
+    ]
+    parsed_args = parser.parse_args(args)
+    assert parsed_args.model_name == "something.something"
+    assert parsed_args.hf_overrides == {
+        "key1": "val1",
+        "key2": {
+            "key3": "val2",
+            "key4": "val3",
+        },
+        "key5": "val4",
+        "key_6": "val5",
+        "key-7": {
+            "key_8": "val6",
+        },
+        "key9": 100,
+        "key10": 100.0,
+        "key11": True,
+        "key12": {
+            "key13": None,
+        },
+        "key14": {
+            "key15": "-minus.and.dot",
+        },
+    }
+    assert parsed_args.compilation_config == {
+        "mode": 1,
+        "use_inductor": True,
+        "backend": "custom",
+        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
+    }
+
+
+def test_duplicate_dict_args(caplog_vllm, parser):
+    args = [
+        "--model-name=something.something",
+        "--hf-overrides.key1",
+        "val1",
+        "--hf-overrides.key1",
+        "val2",
+        "-O1",
+        "-O.mode",
+        "2",
+        "-O3",
+    ]
+
+    parsed_args = parser.parse_args(args)
+    # Should be the last value
+    assert parsed_args.hf_overrides == {"key1": "val2"}
+    assert parsed_args.compilation_config == {"mode": 3}
+
+    assert len(caplog_vllm.records) == 1
+    assert "duplicate" in caplog_vllm.text
+    assert "--hf-overrides.key1" in caplog_vllm.text
+    assert "-O.mode" in caplog_vllm.text
+
+
+def test_model_specification(
+    parser_with_config, cli_config_file, cli_config_file_with_model
+):
+    # Test model in CLI takes precedence over config
+    args = parser_with_config.parse_args(
+        ["serve", "cli-model", "--config", cli_config_file_with_model]
+    )
+    assert args.model_tag == "cli-model"
+    assert args.served_model_name == "mymodel"
+
+    # Test model from config file works
+    args = parser_with_config.parse_args(
+        [
+            "serve",
+            "--config",
+            cli_config_file_with_model,
+        ]
+    )
+    assert args.model == "config-model"
+    assert args.served_model_name == "mymodel"
+
+    # Test no model specified anywhere raises error
+    with pytest.raises(ValueError, match="No model specified!"):
+        parser_with_config.parse_args(["serve", "--config", cli_config_file])
+
+    # Test using --model option raises error
+    # with pytest.raises(
+    #         ValueError,
+    #         match=
+    #     ("With `vllm serve`, you should provide the model as a positional "
+    #      "argument or in a config file instead of via the `--model` option."),
+    # ):
+    #     parser_with_config.parse_args(['serve', '--model', 'my-model'])
+
+    # Test using --model option back-compatibility
+    # (when back-compatibility ends, the above test should be uncommented
+    # and the below test should be removed)
+    args = parser_with_config.parse_args(
+        [
+            "serve",
+            "--tensor-parallel-size",
+            "2",
+            "--model",
+            "my-model",
+            "--trust-remote-code",
+            "--port",
+            "8001",
+        ]
+    )
+    assert args.model is None
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code is True
+    assert args.port == 8001
+
+    args = parser_with_config.parse_args(
+        [
+            "serve",
+            "--tensor-parallel-size=2",
+            "--model=my-model",
+            "--trust-remote-code",
+            "--port=8001",
+        ]
+    )
+    assert args.model is None
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code is True
+    assert args.port == 8001
+
+    # Test other config values are preserved
+    args = parser_with_config.parse_args(
+        [
+            "serve",
+            "cli-model",
+            "--config",
+            cli_config_file_with_model,
+        ]
+    )
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code is True
+    assert args.port == 12312
+
+
+def test_convert_ids_list_to_tokens():
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+    token_ids = tokenizer.encode("Hello, world!")
+    # token_ids = [9707, 11, 1879, 0]
+    assert tokenizer.convert_ids_to_tokens(token_ids) == ["Hello", ",", "Ġworld", "!"]
+    tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
+    assert tokens == ["Hello", ",", " world", "!"]
+
+
+def test_load_config_file(tmp_path):
+    # Define the configuration data
+    config_data = {
+        "enable-logging": True,
+        "list-arg": ["item1", "item2"],
+        "port": 12323,
+        "tensor-parallel-size": 4,
+    }
+
+    # Write the configuration data to a temporary YAML file
+    config_file_path = tmp_path / "config.yaml"
+    with open(config_file_path, "w") as config_file:
+        yaml.dump(config_data, config_file)
+
+    # Initialize the parser
+    parser = FlexibleArgumentParser()
+
+    # Call the function with the temporary file path
+    processed_args = parser.load_config_file(str(config_file_path))
+
+    # Expected output
+    expected_args = [
+        "--enable-logging",
+        "--list-arg",
+        "item1",
+        "item2",
+        "--port",
+        "12323",
+        "--tensor-parallel-size",
+        "4",
+    ]
+
+    # Assert that the processed arguments match the expected output
+    assert processed_args == expected_args
+    os.remove(str(config_file_path))
+
+
+def test_compilation_mode_string_values(parser):
+    """Test that -O.mode accepts both integer and string mode values."""
+    args = parser.parse_args(["-O.mode", "0"])
+    assert args.compilation_config == {"mode": 0}
+
+    args = parser.parse_args(["-O3"])
+    assert args.compilation_config == {"mode": 3}
+
+    args = parser.parse_args(["-O.mode=NONE"])
+    assert args.compilation_config == {"mode": "NONE"}
+
+    args = parser.parse_args(["-O.mode", "STOCK_TORCH_COMPILE"])
+    assert args.compilation_config == {"mode": "STOCK_TORCH_COMPILE"}
+
+    args = parser.parse_args(["-O.mode=DYNAMO_TRACE_ONCE"])
+    assert args.compilation_config == {"mode": "DYNAMO_TRACE_ONCE"}
+
+    args = parser.parse_args(["-O.mode", "VLLM_COMPILE"])
+    assert args.compilation_config == {"mode": "VLLM_COMPILE"}
+
+    args = parser.parse_args(["-O.mode=none"])
+    assert args.compilation_config == {"mode": "none"}
+
+    args = parser.parse_args(["-O.mode=vllm_compile"])
+    assert args.compilation_config == {"mode": "vllm_compile"}
+
+
+def test_compilation_config_mode_validator():
+    """Test that CompilationConfig.mode field validator converts strings to integers."""
+    from vllm.config.compilation import CompilationConfig, CompilationMode
+
+    config = CompilationConfig(mode=0)
+    assert config.mode == CompilationMode.NONE
+
+    config = CompilationConfig(mode=3)
+    assert config.mode == CompilationMode.VLLM_COMPILE
+
+    config = CompilationConfig(mode="NONE")
+    assert config.mode == CompilationMode.NONE
+
+    config = CompilationConfig(mode="STOCK_TORCH_COMPILE")
+    assert config.mode == CompilationMode.STOCK_TORCH_COMPILE
+
+    config = CompilationConfig(mode="DYNAMO_TRACE_ONCE")
+    assert config.mode == CompilationMode.DYNAMO_TRACE_ONCE
+
+    config = CompilationConfig(mode="VLLM_COMPILE")
+    assert config.mode == CompilationMode.VLLM_COMPILE
+
+    config = CompilationConfig(mode="none")
+    assert config.mode == CompilationMode.NONE
+
+    config = CompilationConfig(mode="vllm_compile")
+    assert config.mode == CompilationMode.VLLM_COMPILE
+
+    with pytest.raises(ValidationError, match="Invalid compilation mode"):
+        CompilationConfig(mode="INVALID_MODE")
+
+
+def test_flat_product():
+    # Check regular itertools.product behavior
+    result1 = list(flat_product([1, 2, 3], ["a", "b"]))
+    assert result1 == [
+        (1, "a"),
+        (1, "b"),
+        (2, "a"),
+        (2, "b"),
+        (3, "a"),
+        (3, "b"),
+    ]
+
+    # check that the tuples get flattened
+    result2 = list(flat_product([(1, 2), (3, 4)], ["a", "b"], [(5, 6)]))
+    assert result2 == [
+        (1, 2, "a", 5, 6),
+        (1, 2, "b", 5, 6),
+        (3, 4, "a", 5, 6),
+        (3, 4, "b", 5, 6),
+    ]
diff --git a/tests/utils_/test_async_utils.py b/tests/utils_/test_async_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d116bdfd814061135823cd6ab302e70b99a325
--- /dev/null
+++ b/tests/utils_/test_async_utils.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+from collections.abc import AsyncIterator
+
+import pytest
+
+from vllm.utils.async_utils import merge_async_iterators
+
+
+async def _mock_async_iterator(idx: int):
+    try:
+        while True:
+            yield f"item from iterator {idx}"
+            await asyncio.sleep(0.1)
+    except asyncio.CancelledError:
+        print(f"iterator {idx} cancelled")
+
+
+@pytest.mark.asyncio
+async def test_merge_async_iterators():
+    iterators = [_mock_async_iterator(i) for i in range(3)]
+    merged_iterator = merge_async_iterators(*iterators)
+
+    async def stream_output(generator: AsyncIterator[tuple[int, str]]):
+        async for idx, output in generator:
+            print(f"idx: {idx}, output: {output}")
+
+    task = asyncio.create_task(stream_output(merged_iterator))
+    await asyncio.sleep(0.5)
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+
+    for iterator in iterators:
+        try:
+            await asyncio.wait_for(anext(iterator), 1)
+        except StopAsyncIteration:
+            # All iterators should be cancelled and print this message.
+            print("Iterator was cancelled normally")
+        except (Exception, asyncio.CancelledError) as e:
+            raise AssertionError() from e
diff --git a/tests/utils_/test_cache.py b/tests/utils_/test_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..e361006fd8e66dd2a0a329cbd828888ad6987100
--- /dev/null
+++ b/tests/utils_/test_cache.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.utils.cache import CacheInfo, LRUCache
+
+
+class TestLRUCache(LRUCache):
+    def _on_remove(self, key, value):
+        if not hasattr(self, "_remove_counter"):
+            self._remove_counter = 0
+        self._remove_counter += 1
+
+
+def test_lru_cache():
+    cache = TestLRUCache(3)
+    assert cache.stat() == CacheInfo(hits=0, total=0)
+    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
+
+    cache.put(1, 1)
+    assert len(cache) == 1
+
+    cache.put(1, 1)
+    assert len(cache) == 1
+
+    cache.put(2, 2)
+    assert len(cache) == 2
+
+    cache.put(3, 3)
+    assert len(cache) == 3
+    assert set(cache.cache) == {1, 2, 3}
+
+    cache.put(4, 4)
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 3, 4}
+    assert cache._remove_counter == 1
+
+    assert cache.get(2) == 2
+    assert cache.stat() == CacheInfo(hits=1, total=1)
+    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
+
+    assert cache[2] == 2
+    assert cache.stat() == CacheInfo(hits=2, total=2)
+    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
+
+    cache.put(5, 5)
+    assert set(cache.cache) == {2, 4, 5}
+    assert cache._remove_counter == 2
+
+    assert cache.pop(5) == 5
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    assert cache.get(-1) is None
+    assert cache.stat() == CacheInfo(hits=2, total=3)
+    assert cache.stat(delta=True) == CacheInfo(hits=0, total=1)
+
+    cache.pop(10)
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.get(10)
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.put(6, 6)
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 4, 6}
+    assert 2 in cache
+    assert 4 in cache
+    assert 6 in cache
+
+    cache.remove_oldest()
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 6}
+    assert cache._remove_counter == 4
+
+    cache.clear()
+    assert len(cache) == 0
+    assert cache._remove_counter == 6
+    assert cache.stat() == CacheInfo(hits=0, total=0)
+    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
+
+    cache._remove_counter = 0
+
+    cache[1] = 1
+    assert len(cache) == 1
+
+    cache[1] = 1
+    assert len(cache) == 1
+
+    cache[2] = 2
+    assert len(cache) == 2
+
+    cache[3] = 3
+    assert len(cache) == 3
+    assert set(cache.cache) == {1, 2, 3}
+
+    cache[4] = 4
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 3, 4}
+    assert cache._remove_counter == 1
+    assert cache[2] == 2
+
+    cache[5] = 5
+    assert set(cache.cache) == {2, 4, 5}
+    assert cache._remove_counter == 2
+
+    del cache[5]
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.pop(10)
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache[6] = 6
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 4, 6}
+    assert 2 in cache
+    assert 4 in cache
+    assert 6 in cache
diff --git a/tests/utils_/test_collection_utils.py b/tests/utils_/test_collection_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..19f4a3d1c95f226b9253a01ce198ce3331d33ffe
--- /dev/null
+++ b/tests/utils_/test_collection_utils.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.utils.collection_utils import swap_dict_values
+
+
+@pytest.mark.parametrize(
+    "obj,key1,key2",
+    [
+        # Tests for both keys exist
+        ({1: "a", 2: "b"}, 1, 2),
+        # Tests for one key does not exist
+        ({1: "a", 2: "b"}, 1, 3),
+        # Tests for both keys do not exist
+        ({1: "a", 2: "b"}, 3, 4),
+    ],
+)
+def test_swap_dict_values(obj, key1, key2):
+    original_obj = obj.copy()
+
+    swap_dict_values(obj, key1, key2)
+
+    if key1 in original_obj:
+        assert obj[key2] == original_obj[key1]
+    else:
+        assert key2 not in obj
+    if key2 in original_obj:
+        assert obj[key1] == original_obj[key2]
+    else:
+        assert key1 not in obj
diff --git a/tests/utils_/test_func_utils.py b/tests/utils_/test_func_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ce1ada095f184f1fa902cd25f369fc5ddb9fa56
--- /dev/null
+++ b/tests/utils_/test_func_utils.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+
+import pytest
+
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw
+
+from ..utils import error_on_warning
+
+
+def test_deprecate_kwargs_always():
+    @deprecate_kwargs("old_arg", is_deprecated=True)
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="'old_arg'"):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+
+def test_deprecate_kwargs_never():
+    @deprecate_kwargs("old_arg", is_deprecated=False)
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with error_on_warning(DeprecationWarning):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+
+def test_deprecate_kwargs_dynamic():
+    is_deprecated = True
+
+    @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="'old_arg'"):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+    is_deprecated = False
+
+    with error_on_warning(DeprecationWarning):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+
+def test_deprecate_kwargs_additional_message():
+    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="abcd"):
+        dummy(old_arg=1)
+
+
+@pytest.mark.parametrize(
+    ("callable", "kw_name", "requires_kw_only", "allow_var_kwargs", "is_supported"),
+    [
+        # Tests for positional argument support
+        (lambda foo: None, "foo", True, True, False),
+        (lambda foo: None, "foo", False, True, True),
+        # Tests for positional or keyword / keyword only
+        (lambda foo=100: None, "foo", True, True, False),
+        (lambda *, foo: None, "foo", False, True, True),
+        # Tests to make sure the names of variadic params are NOT supported
+        (lambda *args: None, "args", False, True, False),
+        (lambda **kwargs: None, "kwargs", False, True, False),
+        # Tests for if we allow var kwargs to add support
+        (lambda foo: None, "something_else", False, True, False),
+        (lambda foo, **kwargs: None, "something_else", False, True, True),
+        (lambda foo, **kwargs: None, "kwargs", True, True, False),
+        (lambda foo, **kwargs: None, "foo", True, True, False),
+    ],
+)
+def test_supports_kw(
+    callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
+):
+    assert (
+        supports_kw(
+            callable=callable,
+            kw_name=kw_name,
+            requires_kw_only=requires_kw_only,
+            allow_var_kwargs=allow_var_kwargs,
+        )
+        == is_supported
+    )
diff --git a/tests/utils_/test_gc_utils.py b/tests/utils_/test_gc_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1d0de87c81bad724880787867e5affa9be9bf12
--- /dev/null
+++ b/tests/utils_/test_gc_utils.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+from typing import Any
+
+from vllm.utils.gc_utils import (
+    GCDebugConfig,
+    _compute_detailed_type,
+    _compute_top_gc_collected_objects,
+)
+
+
+@dataclass
+class Normal:
+    v: int
+
+
+@dataclass
+class ListWrapper:
+    vs: list[int]
+
+    def __len__(self) -> int:
+        return len(self.vs)
+
+
+def test_compute_detailed_type():
+    assert (
+        _compute_detailed_type(Normal(v=8))
+        == "<class 'tests.utils_.test_gc_utils.Normal'>"
+    )
+
+    assert _compute_detailed_type([1, 2, 3]) == "<class 'list'>(size:3)"
+    assert _compute_detailed_type({4, 5}) == "<class 'set'>(size:2)"
+    assert _compute_detailed_type({6: 7}) == "<class 'dict'>(size:1)"
+    assert (
+        _compute_detailed_type(ListWrapper(vs=[]))
+        == "<class 'tests.utils_.test_gc_utils.ListWrapper'>(size:0)"
+    )
+
+
+def test_compute_top_gc_collected_objects():
+    objects: list[Any] = [
+        [1, 2, 3],
+        [4, 5, 6],
+        [7, 8, 9],
+        [10, 11, 12],
+        {13, 14},
+        {15: 16, 17: 18},
+        Normal(v=19),
+        Normal(v=20),
+        Normal(v=21),
+    ]
+    assert _compute_top_gc_collected_objects(objects, top=-1) == ""
+    assert _compute_top_gc_collected_objects(objects, top=0) == ""
+    assert (
+        _compute_top_gc_collected_objects(objects, top=1)
+        == "    4:<class 'list'>(size:3)"
+    )
+    assert _compute_top_gc_collected_objects(objects, top=2) == "\n".join(
+        [
+            "    4:<class 'list'>(size:3)",
+            "    3:<class 'tests.utils_.test_gc_utils.Normal'>",
+        ]
+    )
+    assert _compute_top_gc_collected_objects(objects, top=3) == "\n".join(
+        [
+            "    4:<class 'list'>(size:3)",
+            "    3:<class 'tests.utils_.test_gc_utils.Normal'>",
+            "    1:<class 'set'>(size:2)",
+        ]
+    )
+
+
+def test_gc_debug_config():
+    assert not GCDebugConfig(None).enabled
+    assert not GCDebugConfig("").enabled
+    assert not GCDebugConfig("0").enabled
+
+    config = GCDebugConfig("1")
+    assert config.enabled
+    assert config.top_objects == -1
+
+    config = GCDebugConfig('{"top_objects":5}')
+    assert config.enabled
+    assert config.top_objects == 5
diff --git a/tests/utils_/test_hashing.py b/tests/utils_/test_hashing.py
new file mode 100644
index 0000000000000000000000000000000000000000..484627a547d0d8644d1fbc3212aff092f7709cd8
--- /dev/null
+++ b/tests/utils_/test_hashing.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import hashlib
+import pickle
+
+import pytest
+
+from vllm.utils.hashing import sha256
+
+
+@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
+def test_sha256(input: tuple):
+    digest = sha256(input)
+    assert digest is not None
+    assert isinstance(digest, bytes)
+    assert digest != b""
+
+    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
+    assert digest == hashlib.sha256(input_bytes).digest()
+
+    # hashing again, returns the same value
+    assert digest == sha256(input)
+
+    # hashing different input, returns different value
+    assert digest != sha256(input + (1,))
diff --git a/tests/utils_/test_import_utils.py b/tests/utils_/test_import_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42685b3fc9a24c4a6dc4a6e537a514de89b3284
--- /dev/null
+++ b/tests/utils_/test_import_utils.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.utils.import_utils import PlaceholderModule
+
+
+def _raises_module_not_found():
+    return pytest.raises(ModuleNotFoundError, match="No module named")
+
+
+def test_placeholder_module_error_handling():
+    placeholder = PlaceholderModule("placeholder_1234")
+
+    with _raises_module_not_found():
+        int(placeholder)
+
+    with _raises_module_not_found():
+        placeholder()
+
+    with _raises_module_not_found():
+        _ = placeholder.some_attr
+
+    with _raises_module_not_found():
+        # Test conflict with internal __name attribute
+        _ = placeholder.name
+
+    # OK to print the placeholder or use it in a f-string
+    _ = repr(placeholder)
+    _ = str(placeholder)
+
+    # No error yet; only error when it is used downstream
+    placeholder_attr = placeholder.placeholder_attr("attr")
+
+    with _raises_module_not_found():
+        int(placeholder_attr)
+
+    with _raises_module_not_found():
+        placeholder_attr()
+
+    with _raises_module_not_found():
+        _ = placeholder_attr.some_attr
+
+    with _raises_module_not_found():
+        # Test conflict with internal __module attribute
+        _ = placeholder_attr.module
diff --git a/tests/utils_/test_jsontree.py b/tests/utils_/test_jsontree.py
new file mode 100644
index 0000000000000000000000000000000000000000..0af2751b2638cd44a6682b3ca75087ae4f41412d
--- /dev/null
+++ b/tests/utils_/test_jsontree.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.utils.jsontree import json_count_leaves
+
+
+def test_json_count_leaves():
+    """Test json_count_leaves function from jsontree utility."""
+
+    # Single leaf values
+    assert json_count_leaves(42) == 1
+    assert json_count_leaves("hello") == 1
+    assert json_count_leaves(None) == 1
+
+    # Empty containers
+    assert json_count_leaves([]) == 0
+    assert json_count_leaves({}) == 0
+    assert json_count_leaves(()) == 0
+
+    # Flat structures
+    assert json_count_leaves([1, 2, 3]) == 3
+    assert json_count_leaves({"a": 1, "b": 2}) == 2
+    assert json_count_leaves((1, 2, 3)) == 3
+
+    # Nested structures
+    nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
+    assert json_count_leaves(nested_dict) == 3
+
+    nested_list = [1, [2, 3], 4]
+    assert json_count_leaves(nested_list) == 4
+
+    mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
+    assert json_count_leaves(mixed_nested) == 4
diff --git a/tests/utils_/test_mem_utils.py b/tests/utils_/test_mem_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b1058be412d83cd71df004628d940e87a44122c
--- /dev/null
+++ b/tests/utils_/test_mem_utils.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm_test_utils.monitor import monitor
+
+from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
+
+from ..utils import create_new_process_for_each_test
+
+
+@create_new_process_for_each_test()
+def test_memory_profiling():
+    # Fake out some model loading + inference memory usage to test profiling
+    # Memory used by other processes will show up as cuda usage outside of torch
+    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+
+    lib = CudaRTLibrary()
+    # 512 MiB allocation outside of this instance
+    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
+
+    baseline_snapshot = MemorySnapshot()
+
+    # load weights
+
+    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
+
+    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB
+
+    def measure_current_non_torch():
+        free, total = torch.cuda.mem_get_info()
+        current_used = total - free
+        current_torch = torch.cuda.memory_reserved()
+        current_non_torch = current_used - current_torch
+        return current_non_torch
+
+    with (
+        memory_profiling(
+            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
+        ) as result,
+        monitor(measure_current_non_torch) as monitored_values,
+    ):
+        # make a memory spike, 1 GiB
+        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
+        del spike
+
+        # Add some extra non-torch memory 256 MiB (simulate NCCL)
+        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
+
+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
+    # Check that the memory usage is within 5% of the expected values
+    # 5% tolerance is caused by cuda runtime.
+    # we cannot control cuda runtime in the granularity of bytes,
+    # which causes a small error (<10 MiB in practice)
+    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
+    assert abs(non_torch_ratio - 1) <= 0.05
+    assert result.torch_peak_increase == 1024 * 1024 * 1024
+    del weights
+    lib.cudaFree(handle1)
+    lib.cudaFree(handle2)
diff --git a/tests/utils_/test_network_utils.py b/tests/utils_/test_network_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc274f0679b8849e4638591deee8692b7f978ec2
--- /dev/null
+++ b/tests/utils_/test_network_utils.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import socket
+
+import pytest
+import zmq
+
+from vllm.utils.network_utils import (
+    get_open_port,
+    get_tcp_uri,
+    join_host_port,
+    make_zmq_path,
+    make_zmq_socket,
+    split_host_port,
+    split_zmq_path,
+)
+
+
+def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PORT", "5678")
+        # make sure we can get multiple ports, even if the env var is set
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
+            s1.bind(("localhost", get_open_port()))
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
+                s2.bind(("localhost", get_open_port()))
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
+                    s3.bind(("localhost", get_open_port()))
+
+
+@pytest.mark.parametrize(
+    "path,expected",
+    [
+        ("ipc://some_path", ("ipc", "some_path", "")),
+        ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
+        ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
+        ("inproc://some_identifier", ("inproc", "some_identifier", "")),
+    ],
+)
+def test_split_zmq_path(path, expected):
+    assert split_zmq_path(path) == expected
+
+
+@pytest.mark.parametrize(
+    "invalid_path",
+    [
+        "invalid_path",  # Missing scheme
+        "tcp://127.0.0.1",  # Missing port
+        "tcp://[::1]",  # Missing port for IPv6
+        "tcp://:5555",  # Missing host
+    ],
+)
+def test_split_zmq_path_invalid(invalid_path):
+    with pytest.raises(ValueError):
+        split_zmq_path(invalid_path)
+
+
+def test_make_zmq_socket_ipv6():
+    # Check if IPv6 is supported by trying to create an IPv6 socket
+    try:
+        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+        sock.close()
+    except OSError:
+        pytest.skip("IPv6 is not supported on this system")
+
+    ctx = zmq.Context()
+    ipv6_path = "tcp://[::]:5555"  # IPv6 loopback address
+    socket_type = zmq.REP  # Example socket type
+
+    # Create the socket
+    zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
+
+    # Verify that the IPV6 option is set
+    assert zsock.getsockopt(zmq.IPV6) == 1, (
+        "IPV6 option should be enabled for IPv6 addresses"
+    )
+
+    # Clean up
+    zsock.close()
+    ctx.term()
+
+
+def test_make_zmq_path():
+    assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
+    assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"
+
+
+def test_get_tcp_uri():
+    assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
+    assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"
+
+
+def test_split_host_port():
+    # valid ipv4
+    assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
+    # invalid ipv4
+    with pytest.raises(ValueError):
+        # multi colon
+        assert split_host_port("127.0.0.1::5555")
+    with pytest.raises(ValueError):
+        # tailing colon
+        assert split_host_port("127.0.0.1:5555:")
+    with pytest.raises(ValueError):
+        # no colon
+        assert split_host_port("127.0.0.15555")
+    with pytest.raises(ValueError):
+        # none int port
+        assert split_host_port("127.0.0.1:5555a")
+
+    # valid ipv6
+    assert split_host_port("[::1]:5555") == ("::1", 5555)
+    # invalid ipv6
+    with pytest.raises(ValueError):
+        # multi colon
+        assert split_host_port("[::1]::5555")
+    with pytest.raises(IndexError):
+        # no colon
+        assert split_host_port("[::1]5555")
+    with pytest.raises(ValueError):
+        # none int port
+        assert split_host_port("[::1]:5555a")
+
+
+def test_join_host_port():
+    assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
+    assert join_host_port("::1", 5555) == "[::1]:5555"
diff --git a/tests/utils_/test_serial_utils.py b/tests/utils_/test_serial_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..51b2e4de0269377e6186fccb4682ef6a910848e2
--- /dev/null
+++ b/tests/utils_/test_serial_utils.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from tests.models.utils import check_embeddings_close
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    binary2tensor,
+    tensor2binary,
+)
+
+
+@pytest.mark.parametrize("endianness", ENDIANNESS)
+@pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
+@torch.inference_mode()
+def test_encode_and_decode(embed_dtype: str, endianness: str):
+    for i in range(10):
+        tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
+        shape = tensor.shape
+        binary = tensor2binary(tensor, embed_dtype, endianness)
+        new_tensor = binary2tensor(binary, shape, embed_dtype, endianness).to(
+            torch.float32
+        )
+
+        if embed_dtype in ["float32", "float16"]:
+            torch.testing.assert_close(tensor, new_tensor, atol=0.001, rtol=0.001)
+        elif embed_dtype == "bfloat16":
+            torch.testing.assert_close(tensor, new_tensor, atol=0.01, rtol=0.01)
+        else:  # for fp8
+            torch.testing.assert_close(tensor, new_tensor, atol=0.1, rtol=0.1)
+
+        check_embeddings_close(
+            embeddings_0_lst=tensor.view(1, -1),
+            embeddings_1_lst=new_tensor.view(1, -1),
+            name_0="gt",
+            name_1="new",
+            tol=1e-2,
+        )
diff --git a/tests/utils_/test_system_utils.py b/tests/utils_/test_system_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b1fc4ce37d087f0cacde4a3fb3f58fe49e58a
--- /dev/null
+++ b/tests/utils_/test_system_utils.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import tempfile
+from pathlib import Path
+
+from vllm.utils.system_utils import unique_filepath
+
+
+def test_unique_filepath():
+    temp_dir = tempfile.mkdtemp()
+    path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
+    paths = set()
+    for i in range(10):
+        path = unique_filepath(path_fn)
+        path.write_text("test")
+        paths.add(path)
+    assert len(paths) == 10
+    assert len(list(Path(temp_dir).glob("*.txt"))) == 10
diff --git a/tests/utils_/test_tensor_schema.py b/tests/utils_/test_tensor_schema.py
index 6aa781c1564de7e0d5e5c3754be16be7e077443b..c86bed75472c96b73401b1de73d68219fa0ba75f 100644
--- a/tests/utils_/test_tensor_schema.py
+++ b/tests/utils_/test_tensor_schema.py
@@ -6,37 +6,38 @@ import torch
 
 from vllm.model_executor.models.glm4_1v import Glm4vImageEmbeddingInputs
 from vllm.model_executor.models.granite_speech import GraniteSpeechAudioInputs
+from vllm.model_executor.models.hyperclovax_vision import HCXVisionVideoPixelInputs
 from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
 
 
 def test_tensor_schema_valid_tensor():
     Phi3VImagePixelInputs(
-        data=torch.randn(16, 64, 3, 32, 32),
+        pixel_values=torch.randn(16, 64, 3, 32, 32),
         image_sizes=torch.randint(0, 256, (16, 2)),
     )
 
 
 def test_tensor_schema_optional_fields():
     Phi3VImagePixelInputs(
-        data=torch.randn(16, 64, 3, 32, 32),
+        pixel_values=torch.randn(16, 64, 3, 32, 32),
         image_sizes=None,
     )
 
-    Phi3VImagePixelInputs(data=torch.randn(16, 64, 3, 32, 32), )
+    Phi3VImagePixelInputs(pixel_values=torch.randn(16, 64, 3, 32, 32))
 
 
 def test_tensor_schema_constant_dim_failure():
     with pytest.raises(ValueError, match="dim\\[2\\] expected 3, got 4"):
         Phi3VImagePixelInputs(
-            data=torch.randn(16, 64, 4, 32, 32),  # dim[2] = 4
+            pixel_values=torch.randn(16, 64, 4, 32, 32),  # dim[2] = 4
             image_sizes=torch.randint(0, 256, (16, 2)),
         )
 
 
 def test_tensor_schema_invalid_types_in_list():
-    with pytest.raises(ValueError, match="is not a torch.Tensor"):
+    with pytest.raises(TypeError, match="is not one of the expected types"):
         Phi3VImagePixelInputs(
-            data=[
+            pixel_values=[
                 torch.randn(64, 3, 32, 32),
                 "not_a_tensor",
                 torch.randn(64, 3, 32, 32),
@@ -48,27 +49,29 @@ def test_tensor_schema_invalid_types_in_list():
 def test_tensor_schema_rank_mismatch():
     with pytest.raises(ValueError, match="has rank 3 but expected 5"):
         Phi3VImagePixelInputs(
-            data=torch.randn(16, 64, 3),
+            pixel_values=torch.randn(16, 64, 3),
             image_sizes=torch.randint(0, 256, (16, 2)),
         )
 
 
 def test_tensor_schema_missing_required_field():
-    with pytest.raises(ValueError, match="Required field 'data' is missing"):
-        Phi3VImagePixelInputs(image_sizes=torch.randint(0, 256, (16, 2)), )
+    with pytest.raises(ValueError, match="Required field 'pixel_values' is missing"):
+        Phi3VImagePixelInputs(
+            image_sizes=torch.randint(0, 256, (16, 2)),
+        )
 
 
 def test_tensor_schema_symbolic_dim_mismatch():
     with pytest.raises(ValueError, match="expected 'bn'=12, got 16"):
         Phi3VImagePixelInputs(
-            data=torch.randn(12, 64, 3, 32, 32),
+            pixel_values=torch.randn(12, 64, 3, 32, 32),
             image_sizes=torch.randint(0, 256, (16, 2)),
         )
 
 
 def test_tensor_schema_list_tensor_valid():
     Phi3VImagePixelInputs(
-        data=[torch.randn(64, 3, 32, 32) for _ in range(16)],
+        pixel_values=[torch.randn(64, 3, 32, 32) for _ in range(16)],
         image_sizes=torch.randint(0, 256, (16, 2)),
     )
 
@@ -76,39 +79,46 @@ def test_tensor_schema_list_tensor_valid():
 def test_tensor_schema_variable_patch_counts_valid():
     # Each image has a different number of patches (p)
     # Each tensor has shape (p, 3, 32, 32)
-    data = [
-        torch.randn(16, 3, 32, 32),  # p = 16
-        torch.randn(32, 3, 32, 32),  # p = 32
-        torch.randn(64, 3, 32, 32),  # p = 64
-    ]
-    image_sizes = torch.randint(0, 256, (3, 2))  # bn = 3
     Phi3VImagePixelInputs(
-        data=data,
-        image_sizes=image_sizes,
+        pixel_values=[
+            torch.randn(16, 3, 32, 32),  # p = 16
+            torch.randn(32, 3, 32, 32),  # p = 32
+            torch.randn(64, 3, 32, 32),  # p = 64
+        ],
+        image_sizes=torch.randint(0, 256, (3, 2)),  # bn = 3
     )
 
 
 def test_tensor_schema_tuple_tensor_valid():
     Phi3VImagePixelInputs(
-        data=tuple(torch.randn(64, 3, 32, 32) for _ in range(16)),
+        pixel_values=tuple(torch.randn(64, 3, 32, 32) for _ in range(16)),
         image_sizes=torch.randint(0, 256, (16, 2)),
     )
 
 
+def test_tensor_schema_double_nested_tensors():
+    x = torch.rand(4, 3, 32, 32)
+    y = torch.rand(2, 3, 32, 32)
+
+    HCXVisionVideoPixelInputs(pixel_values_videos=([x, y, x], [y], [x, y]))
+
+
 def test_tensor_schema_inconsistent_shapes_in_list():
     with pytest.raises(ValueError, match="contains inconsistent shapes"):
         Phi3VImagePixelInputs(
-            data=[torch.randn(64, 3, 32, 32),
-                  torch.randn(64, 3, 16, 16)] +
-            [torch.randn(64, 3, 32, 32) for _ in range(14)],
+            pixel_values=[
+                torch.randn(64, 3, 32, 32),
+                torch.randn(64, 3, 16, 16),
+                *(torch.randn(64, 3, 32, 32) for _ in range(14)),
+            ],
             image_sizes=torch.randint(0, 256, (16, 2)),
         )
 
 
 def test_tensor_schema_empty_list():
-    with pytest.raises(ValueError, match="is an empty list"):
+    with pytest.raises(ValueError, match="is an empty sequence"):
         Phi3VImagePixelInputs(
-            data=[],
+            pixel_values=[],
             image_sizes=torch.randint(0, 256, (0, 2)),
         )
 
@@ -117,39 +127,33 @@ def test_tensor_schema_validation_disabled_skips_shape_check():
     # This should NOT raise, because validation is turned off
     # This would normally fail (dim[2] should be 3, not 4)
     Phi3VImagePixelInputs(
-        data=torch.randn(16, 64, 4, 32, 32),
+        pixel_values=torch.randn(16, 64, 4, 32, 32),
         image_sizes=torch.randint(0, 256, (16, 2)),
         validate=False,
     )
 
 
 def test_tensor_schema_with_valid_resolve_binding_dims():
-    data = torch.randn(16, 64, 3, 336, 336)  # h=336, w=336
+    pixel_values = torch.randn(16, 64, 3, 336, 336)  # h=336, w=336
     image_sizes = torch.randint(0, 256, (16, 2))
 
     Phi3VImagePixelInputs(
-        data=data,
+        pixel_values=pixel_values,
         image_sizes=image_sizes,
-        resolve_bindings={
-            "h": 336,
-            "w": 336
-        },
+        resolve_bindings={"h": 336, "w": 336},
     )
 
 
 def test_tensor_schema_with_invalid_resolve_binding_dims():
-    data = torch.randn(16, 64, 3, 36, 36)  # h=36, w=36
+    pixel_values = torch.randn(16, 64, 3, 36, 36)  # h=36, w=36
     image_sizes = torch.randint(0, 256, (16, 2))
 
     # Should raise because 'h' and 'w' don't match resolve bindings
     with pytest.raises(ValueError, match="dim\\[3\\] expected 336, got 36"):
         Phi3VImagePixelInputs(
-            data=data,
+            pixel_values=pixel_values,
             image_sizes=image_sizes,
-            resolve_bindings={
-                "h": 336,
-                "w": 336
-            },
+            resolve_bindings={"h": 336, "w": 336},
         )
 
 
diff --git a/tests/utils_/test_torch_utils.py b/tests/utils_/test_torch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a30b9727f4de895eb32c2d84a9340bc025f2698
--- /dev/null
+++ b/tests/utils_/test_torch_utils.py
@@ -0,0 +1,128 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm.utils.torch_utils import (
+    common_broadcastable_dtype,
+    current_stream,
+    is_lossless_cast,
+)
+
+
+@pytest.mark.parametrize(
+    ("src_dtype", "tgt_dtype", "expected_result"),
+    [
+        # Different precision_levels
+        (torch.bool, torch.int8, True),
+        (torch.bool, torch.float16, True),
+        (torch.bool, torch.complex32, True),
+        (torch.int64, torch.bool, False),
+        (torch.int64, torch.float16, True),
+        (torch.int64, torch.complex32, True),
+        (torch.float64, torch.bool, False),
+        (torch.float64, torch.int8, False),
+        (torch.float64, torch.complex32, True),
+        (torch.complex128, torch.bool, False),
+        (torch.complex128, torch.int8, False),
+        (torch.complex128, torch.float16, False),
+        # precision_level=0
+        (torch.bool, torch.bool, True),
+        # precision_level=1
+        (torch.int8, torch.int16, True),
+        (torch.int16, torch.int8, False),
+        (torch.uint8, torch.int8, False),
+        (torch.int8, torch.uint8, False),
+        # precision_level=2
+        (torch.float16, torch.float32, True),
+        (torch.float32, torch.float16, False),
+        (torch.bfloat16, torch.float32, True),
+        (torch.float32, torch.bfloat16, False),
+        # precision_level=3
+        (torch.complex32, torch.complex64, True),
+        (torch.complex64, torch.complex32, False),
+    ],
+)
+def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
+    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
+
+
+@pytest.mark.parametrize(
+    ("dtypes", "expected_result"),
+    [
+        ([torch.bool], torch.bool),
+        ([torch.bool, torch.int8], torch.int8),
+        ([torch.bool, torch.int8, torch.float16], torch.float16),
+        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
+    ],
+)
+def test_common_broadcastable_dtype(dtypes, expected_result):
+    assert common_broadcastable_dtype(dtypes) == expected_result
+
+
+def _test_stream_thread(main_expected_stream: torch.cuda.Stream):
+    import threading
+
+    child_stream = torch.cuda.Stream()
+    thread_stream_ready = threading.Event()
+    thread_can_exit = threading.Event()
+
+    def child_thread_func():
+        with torch.cuda.stream(child_stream):
+            thread_stream_ready.set()
+            thread_can_exit.wait(timeout=10)
+
+    child_thread = threading.Thread(target=child_thread_func)
+    child_thread.start()
+
+    try:
+        assert thread_stream_ready.wait(timeout=5), (
+            "Child thread failed to enter stream context in time"
+        )
+
+        main_current_stream = current_stream()
+
+        assert main_current_stream != child_stream, (
+            "Main thread's current_stream was contaminated by child thread"
+        )
+        assert main_current_stream == main_expected_stream, (
+            f"Main thread's stream changed unexpectedly. "
+            f"Expected {main_expected_stream}, got {main_current_stream}"
+        )
+
+        thread_can_exit.set()
+
+    finally:
+        child_thread.join(timeout=5)
+        if child_thread.is_alive():
+            pytest.fail("Child thread failed to exit properly")
+
+
+def test_current_stream_multithread():
+    from vllm.platforms import current_platform
+
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    if current_platform.is_rocm():
+        main_dedicated_stream = current_stream()
+
+        assert main_dedicated_stream.cuda_stream != 0, (
+            "ROCm should create a dedicated stream, not use default stream (0x0)"
+        )
+
+        main_stream_again = current_stream()
+        assert main_stream_again == main_dedicated_stream, (
+            "Multiple calls to current_stream should return the same dedicated stream"
+        )
+
+        _test_stream_thread(main_dedicated_stream)
+    else:
+        main_default_stream = torch.cuda.default_stream()
+        main_initial_stream = current_stream()
+
+        assert main_initial_stream == main_default_stream, (
+            "First call to current_stream should return default stream on CUDA"
+        )
+
+        _test_stream_thread(main_default_stream)
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
deleted file mode 100644
index 658ae7e7451aea40108a0e93dda4e38aec5c67f7..0000000000000000000000000000000000000000
--- a/tests/utils_/test_utils.py
+++ /dev/null
@@ -1,1047 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa
-
-import asyncio
-import hashlib
-import json
-import os
-import pickle
-import socket
-import tempfile
-from collections.abc import AsyncIterator
-from pathlib import Path
-from unittest.mock import patch
-
-import pytest
-import torch
-import yaml
-import zmq
-from transformers import AutoTokenizer
-from vllm_test_utils.monitor import monitor
-
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.transformers_utils.detokenizer_utils import (
-    convert_ids_list_to_tokens)
-
-# isort: off
-from vllm.utils import (
-    CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
-    PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
-    current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
-    is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
-    memory_profiling, merge_async_iterators, sha256, split_host_port,
-    split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
-# isort: on
-from ..utils import create_new_process_for_each_test, error_on_warning
-
-
-@pytest.mark.asyncio
-async def test_merge_async_iterators():
-
-    async def mock_async_iterator(idx: int):
-        try:
-            while True:
-                yield f"item from iterator {idx}"
-                await asyncio.sleep(0.1)
-        except asyncio.CancelledError:
-            print(f"iterator {idx} cancelled")
-
-    iterators = [mock_async_iterator(i) for i in range(3)]
-    merged_iterator = merge_async_iterators(*iterators)
-
-    async def stream_output(generator: AsyncIterator[tuple[int, str]]):
-        async for idx, output in generator:
-            print(f"idx: {idx}, output: {output}")
-
-    task = asyncio.create_task(stream_output(merged_iterator))
-    await asyncio.sleep(0.5)
-    task.cancel()
-    with pytest.raises(asyncio.CancelledError):
-        await task
-
-    for iterator in iterators:
-        try:
-            # Can use anext() in python >= 3.10
-            await asyncio.wait_for(iterator.__anext__(), 1)
-        except StopAsyncIteration:
-            # All iterators should be cancelled and print this message.
-            print("Iterator was cancelled normally")
-        except (Exception, asyncio.CancelledError) as e:
-            raise AssertionError() from e
-
-
-def test_deprecate_kwargs_always():
-
-    @deprecate_kwargs("old_arg", is_deprecated=True)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with pytest.warns(DeprecationWarning, match="'old_arg'"):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-
-def test_deprecate_kwargs_never():
-
-    @deprecate_kwargs("old_arg", is_deprecated=False)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with error_on_warning(DeprecationWarning):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-
-def test_deprecate_kwargs_dynamic():
-    is_deprecated = True
-
-    @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with pytest.warns(DeprecationWarning, match="'old_arg'"):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-    is_deprecated = False
-
-    with error_on_warning(DeprecationWarning):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-
-def test_deprecate_kwargs_additional_message():
-
-    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with pytest.warns(DeprecationWarning, match="abcd"):
-        dummy(old_arg=1)
-
-
-def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_PORT", "5678")
-        # make sure we can get multiple ports, even if the env var is set
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
-            s1.bind(("localhost", get_open_port()))
-            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
-                s2.bind(("localhost", get_open_port()))
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
-                    s3.bind(("localhost", get_open_port()))
-
-
-# Tests for FlexibleArgumentParser
-@pytest.fixture
-def parser():
-    parser = FlexibleArgumentParser()
-    parser.add_argument('--image-input-type',
-                        choices=['pixel_values', 'image_features'])
-    parser.add_argument('--model-name')
-    parser.add_argument('--batch-size', type=int)
-    parser.add_argument('--enable-feature', action='store_true')
-    parser.add_argument('--hf-overrides', type=json.loads)
-    parser.add_argument('-O', '--compilation-config', type=json.loads)
-    return parser
-
-
-@pytest.fixture
-def parser_with_config():
-    parser = FlexibleArgumentParser()
-    parser.add_argument('serve')
-    parser.add_argument('model_tag', nargs='?')
-    parser.add_argument('--model', type=str)
-    parser.add_argument('--served-model-name', type=str)
-    parser.add_argument('--config', type=str)
-    parser.add_argument('--port', type=int)
-    parser.add_argument('--tensor-parallel-size', type=int)
-    parser.add_argument('--trust-remote-code', action='store_true')
-    return parser
-
-
-def test_underscore_to_dash(parser):
-    args = parser.parse_args(['--image_input_type', 'pixel_values'])
-    assert args.image_input_type == 'pixel_values'
-
-
-def test_mixed_usage(parser):
-    args = parser.parse_args([
-        '--image_input_type', 'image_features', '--model-name',
-        'facebook/opt-125m'
-    ])
-    assert args.image_input_type == 'image_features'
-    assert args.model_name == 'facebook/opt-125m'
-
-
-def test_with_equals_sign(parser):
-    args = parser.parse_args(
-        ['--image_input_type=pixel_values', '--model-name=facebook/opt-125m'])
-    assert args.image_input_type == 'pixel_values'
-    assert args.model_name == 'facebook/opt-125m'
-
-
-def test_with_int_value(parser):
-    args = parser.parse_args(['--batch_size', '32'])
-    assert args.batch_size == 32
-    args = parser.parse_args(['--batch-size', '32'])
-    assert args.batch_size == 32
-
-
-def test_with_bool_flag(parser):
-    args = parser.parse_args(['--enable_feature'])
-    assert args.enable_feature is True
-    args = parser.parse_args(['--enable-feature'])
-    assert args.enable_feature is True
-
-
-def test_invalid_choice(parser):
-    with pytest.raises(SystemExit):
-        parser.parse_args(['--image_input_type', 'invalid_choice'])
-
-
-def test_missing_required_argument(parser):
-    parser.add_argument('--required-arg', required=True)
-    with pytest.raises(SystemExit):
-        parser.parse_args([])
-
-
-def test_cli_override_to_config(parser_with_config, cli_config_file):
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--config', cli_config_file,
-        '--tensor-parallel-size', '3'
-    ])
-    assert args.tensor_parallel_size == 3
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        cli_config_file
-    ])
-    assert args.tensor_parallel_size == 3
-    assert args.port == 12312
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        cli_config_file, '--port', '666'
-    ])
-    assert args.tensor_parallel_size == 3
-    assert args.port == 666
-
-
-def test_config_args(parser_with_config, cli_config_file):
-    args = parser_with_config.parse_args(
-        ['serve', 'mymodel', '--config', cli_config_file])
-    assert args.tensor_parallel_size == 2
-    assert args.trust_remote_code
-
-
-def test_config_file(parser_with_config):
-    with pytest.raises(FileNotFoundError):
-        parser_with_config.parse_args(
-            ['serve', 'mymodel', '--config', 'test_config.yml'])
-
-    with pytest.raises(ValueError):
-        parser_with_config.parse_args(
-            ['serve', 'mymodel', '--config', './data/test_config.json'])
-
-    with pytest.raises(ValueError):
-        parser_with_config.parse_args([
-            'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-            '--batch-size', '32'
-        ])
-
-
-def test_no_model_tag(parser_with_config, cli_config_file):
-    with pytest.raises(ValueError):
-        parser_with_config.parse_args(['serve', '--config', cli_config_file])
-
-
-def test_dict_args(parser):
-    args = [
-        "--model-name=something.something",
-        "--hf-overrides.key1",
-        "val1",
-        # Test nesting
-        "--hf-overrides.key2.key3",
-        "val2",
-        "--hf-overrides.key2.key4",
-        "val3",
-        # Test compile config and compilation level
-        "-O.use_inductor=true",
-        "-O.backend",
-        "custom",
-        "-O1",
-        # Test = sign
-        "--hf-overrides.key5=val4",
-        # Test underscore to dash conversion
-        "--hf_overrides.key_6",
-        "val5",
-        "--hf_overrides.key-7.key_8",
-        "val6",
-        # Test data type detection
-        "--hf_overrides.key9",
-        "100",
-        "--hf_overrides.key10",
-        "100.0",
-        "--hf_overrides.key11",
-        "true",
-        "--hf_overrides.key12.key13",
-        "null",
-        # Test '-' and '.' in value
-        "--hf_overrides.key14.key15",
-        "-minus.and.dot",
-        # Test array values
-        "-O.custom_ops+",
-        "-quant_fp8",
-        "-O.custom_ops+=+silu_mul,-rms_norm",
-    ]
-    parsed_args = parser.parse_args(args)
-    assert parsed_args.model_name == "something.something"
-    assert parsed_args.hf_overrides == {
-        "key1": "val1",
-        "key2": {
-            "key3": "val2",
-            "key4": "val3",
-        },
-        "key5": "val4",
-        "key_6": "val5",
-        "key-7": {
-            "key_8": "val6",
-        },
-        "key9": 100,
-        "key10": 100.0,
-        "key11": True,
-        "key12": {
-            "key13": None,
-        },
-        "key14": {
-            "key15": "-minus.and.dot",
-        }
-    }
-    assert parsed_args.compilation_config == {
-        "level": 1,
-        "use_inductor": True,
-        "backend": "custom",
-        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
-    }
-
-
-def test_duplicate_dict_args(caplog_vllm, parser):
-    args = [
-        "--model-name=something.something",
-        "--hf-overrides.key1",
-        "val1",
-        "--hf-overrides.key1",
-        "val2",
-        "-O1",
-        "-O.level",
-        "2",
-        "-O3",
-    ]
-
-    parsed_args = parser.parse_args(args)
-    # Should be the last value
-    assert parsed_args.hf_overrides == {"key1": "val2"}
-    assert parsed_args.compilation_config == {"level": 3}
-
-    assert len(caplog_vllm.records) == 1
-    assert "duplicate" in caplog_vllm.text
-    assert "--hf-overrides.key1" in caplog_vllm.text
-    assert "-O.level" in caplog_vllm.text
-
-
-# yapf: enable
-@pytest.mark.parametrize(
-    "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
-    [
-        # Tests for positional argument support
-        (lambda foo: None, "foo", True, True, False),
-        (lambda foo: None, "foo", False, True, True),
-        # Tests for positional or keyword / keyword only
-        (lambda foo=100: None, "foo", True, True, False),
-        (lambda *, foo: None, "foo", False, True, True),
-        # Tests to make sure the names of variadic params are NOT supported
-        (lambda *args: None, "args", False, True, False),
-        (lambda **kwargs: None, "kwargs", False, True, False),
-        # Tests for if we allow var kwargs to add support
-        (lambda foo: None, "something_else", False, True, False),
-        (lambda foo, **kwargs: None, "something_else", False, True, True),
-        (lambda foo, **kwargs: None, "kwargs", True, True, False),
-        (lambda foo, **kwargs: None, "foo", True, True, False),
-    ])
-# yapf: disable
-def test_supports_kw(callable,kw_name,requires_kw_only,
-                     allow_var_kwargs,is_supported):
-    assert supports_kw(
-            callable=callable,
-            kw_name=kw_name,
-            requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs
-    ) == is_supported
-
-
-@create_new_process_for_each_test()
-def test_memory_profiling():
-    # Fake out some model loading + inference memory usage to test profiling
-    # Memory used by other processes will show up as cuda usage outside of torch
-    from vllm.distributed.device_communicators.cuda_wrapper import (
-        CudaRTLibrary)
-    lib = CudaRTLibrary()
-    # 512 MiB allocation outside of this instance
-    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
-
-    baseline_snapshot = MemorySnapshot()
-
-    # load weights
-
-    weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
-
-    weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
-
-    def measure_current_non_torch():
-        free, total = torch.cuda.mem_get_info()
-        current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
-        current_non_torch = current_used - current_torch
-        return current_non_torch
-
-    with memory_profiling(baseline_snapshot=baseline_snapshot,
-    weights_memory=weights_memory) as result, \
-        monitor(measure_current_non_torch) as monitored_values:
-        # make a memory spike, 1 GiB
-        spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
-        del spike
-
-        # Add some extra non-torch memory 256 MiB (simulate NCCL)
-        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
-
-    # this is an analytic value, it is exact,
-    # we only have 256 MiB non-torch memory increase
-    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
-    assert measured_diff == 256 * 1024 * 1024
-
-    # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by cuda runtime.
-    # we cannot control cuda runtime in the granularity of bytes,
-    # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
-    assert abs(non_torch_ratio - 1) <= 0.05
-    assert result.torch_peak_increase == 1024 * 1024 * 1024
-    del weights
-    lib.cudaFree(handle1)
-    lib.cudaFree(handle2)
-
-
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-    ]
-    shared_kv_cache_layers = {
-        'layers.2.self_attn': 'layers.1.self_attn',
-        'layers.3.self_attn': 'layers.0.self_attn'
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(
-            parallel_config=ParallelConfig(pipeline_parallel_size=2))
-    with set_current_vllm_config(cfg):
-        from vllm.attention import Attention
-
-        ctx = {
-            'layers.0.self_attn': Attention(32, 128, 0.1),
-        }
-        kv_cache = [
-            [torch.zeros((1, ))],
-            [torch.zeros((1, ))]
-        ]
-        bind_kv_cache(ctx, kv_cache)
-        assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
-        assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
-
-
-class TestLRUCache(LRUCache):
-
-    def _on_remove(self, key, value):
-        if not hasattr(self, "_remove_counter"):
-            self._remove_counter = 0
-        self._remove_counter += 1
-
-
-def test_lru_cache():
-    cache = TestLRUCache(3)
-    assert cache.stat() == CacheInfo(hits=0, total=0)
-    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
-
-    cache.put(1, 1)
-    assert len(cache) == 1
-
-    cache.put(1, 1)
-    assert len(cache) == 1
-
-    cache.put(2, 2)
-    assert len(cache) == 2
-
-    cache.put(3, 3)
-    assert len(cache) == 3
-    assert set(cache.cache) == {1, 2, 3}
-
-    cache.put(4, 4)
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 3, 4}
-    assert cache._remove_counter == 1
-
-    assert cache.get(2) == 2
-    assert cache.stat() == CacheInfo(hits=1, total=1)
-    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
-
-    assert cache[2] == 2
-    assert cache.stat() == CacheInfo(hits=2, total=2)
-    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
-
-    cache.put(5, 5)
-    assert set(cache.cache) == {2, 4, 5}
-    assert cache._remove_counter == 2
-
-    assert cache.pop(5) == 5
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    assert cache.get(-1) is None
-    assert cache.stat() == CacheInfo(hits=2, total=3)
-    assert cache.stat(delta=True) == CacheInfo(hits=0, total=1)
-
-    cache.pop(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.get(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.put(6, 6)
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 4, 6}
-    assert 2 in cache
-    assert 4 in cache
-    assert 6 in cache
-
-    cache.remove_oldest()
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 6}
-    assert cache._remove_counter == 4
-
-    cache.clear()
-    assert len(cache) == 0
-    assert cache._remove_counter == 6
-    assert cache.stat() == CacheInfo(hits=0, total=0)
-    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
-
-    cache._remove_counter = 0
-
-    cache[1] = 1
-    assert len(cache) == 1
-
-    cache[1] = 1
-    assert len(cache) == 1
-
-    cache[2] = 2
-    assert len(cache) == 2
-
-    cache[3] = 3
-    assert len(cache) == 3
-    assert set(cache.cache) == {1, 2, 3}
-
-    cache[4] = 4
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 3, 4}
-    assert cache._remove_counter == 1
-    assert cache[2] == 2
-
-    cache[5] = 5
-    assert set(cache.cache) == {2, 4, 5}
-    assert cache._remove_counter == 2
-
-    del cache[5]
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.pop(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache[6] = 6
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 4, 6}
-    assert 2 in cache
-    assert 4 in cache
-    assert 6 in cache
-
-
-# yapf: disable
-@pytest.mark.parametrize(
-    ("src_dtype", "tgt_dtype", "expected_result"),
-    [
-        # Different precision_levels
-        (torch.bool, torch.int8, True),
-        (torch.bool, torch.float16, True),
-        (torch.bool, torch.complex32, True),
-        (torch.int64, torch.bool, False),
-        (torch.int64, torch.float16, True),
-        (torch.int64, torch.complex32, True),
-        (torch.float64, torch.bool, False),
-        (torch.float64, torch.int8, False),
-        (torch.float64, torch.complex32, True),
-        (torch.complex128, torch.bool, False),
-        (torch.complex128, torch.int8, False),
-        (torch.complex128, torch.float16, False),
-        # precision_level=0
-        (torch.bool, torch.bool, True),
-        # precision_level=1
-        (torch.int8, torch.int16, True),
-        (torch.int16, torch.int8, False),
-        (torch.uint8, torch.int8, False),
-        (torch.int8, torch.uint8, False),
-        # precision_level=2
-        (torch.float16, torch.float32, True),
-        (torch.float32, torch.float16, False),
-        (torch.bfloat16, torch.float32, True),
-        (torch.float32, torch.bfloat16, False),
-        # precision_level=3
-        (torch.complex32, torch.complex64, True),
-        (torch.complex64, torch.complex32, False),
-    ],
-)
-# yapf: enable
-def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
-    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
-
-
-# yapf: disable
-@pytest.mark.parametrize(
-    ("dtypes", "expected_result"),
-    [
-        ([torch.bool], torch.bool),
-        ([torch.bool, torch.int8], torch.int8),
-        ([torch.bool, torch.int8, torch.float16], torch.float16),
-        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
-    ],
-)
-# yapf: enable
-def test_common_broadcastable_dtype(dtypes, expected_result):
-    assert common_broadcastable_dtype(dtypes) == expected_result
-
-
-def test_placeholder_module_error_handling():
-    placeholder = PlaceholderModule("placeholder_1234")
-
-    def build_ctx():
-        return pytest.raises(ModuleNotFoundError, match="No module named")
-
-    with build_ctx():
-        int(placeholder)
-
-    with build_ctx():
-        placeholder()
-
-    with build_ctx():
-        _ = placeholder.some_attr
-
-    with build_ctx():
-        # Test conflict with internal __name attribute
-        _ = placeholder.name
-
-    # OK to print the placeholder or use it in a f-string
-    _ = repr(placeholder)
-    _ = str(placeholder)
-
-    # No error yet; only error when it is used downstream
-    placeholder_attr = placeholder.placeholder_attr("attr")
-
-    with build_ctx():
-        int(placeholder_attr)
-
-    with build_ctx():
-        placeholder_attr()
-
-    with build_ctx():
-        _ = placeholder_attr.some_attr
-
-    with build_ctx():
-        # Test conflict with internal __module attribute
-        _ = placeholder_attr.module
-
-
-# yapf: disable
-@pytest.mark.parametrize(
-    "obj,key1,key2",
-    [
-        # Tests for both keys exist
-        ({1: "a", 2: "b"}, 1, 2),
-        # Tests for one key does not exist
-        ({1: "a", 2: "b"}, 1, 3),
-        # Tests for both keys do not exist
-        ({1: "a", 2: "b"}, 3, 4),
-    ])
-# yapf: enable
-def test_swap_dict_values(obj, key1, key2):
-    original_obj = obj.copy()
-    swap_dict_values(obj, key1, key2)
-    if key1 in original_obj:
-        assert obj[key2] == original_obj[key1]
-    else:
-        assert key2 not in obj
-    if key2 in original_obj:
-        assert obj[key1] == original_obj[key2]
-    else:
-        assert key1 not in obj
-
-
-def test_model_specification(parser_with_config, cli_config_file,
-                             cli_config_file_with_model):
-    # Test model in CLI takes precedence over config
-    args = parser_with_config.parse_args(
-        ['serve', 'cli-model', '--config', cli_config_file_with_model])
-    assert args.model_tag == 'cli-model'
-    assert args.served_model_name == 'mymodel'
-
-    # Test model from config file works
-    args = parser_with_config.parse_args([
-        'serve',
-        '--config',
-        cli_config_file_with_model,
-    ])
-    assert args.model == 'config-model'
-    assert args.served_model_name == 'mymodel'
-
-    # Test no model specified anywhere raises error
-    with pytest.raises(ValueError, match="No model specified!"):
-        parser_with_config.parse_args(['serve', '--config', cli_config_file])
-
-    # Test using --model option raises error
-    with pytest.raises(
-            ValueError,
-            match=
-        ("With `vllm serve`, you should provide the model as a positional "
-         "argument or in a config file instead of via the `--model` option."),
-    ):
-        parser_with_config.parse_args(['serve', '--model', 'my-model'])
-
-    # Test other config values are preserved
-    args = parser_with_config.parse_args([
-        'serve',
-        'cli-model',
-        '--config',
-        cli_config_file_with_model,
-    ])
-    assert args.tensor_parallel_size == 2
-    assert args.trust_remote_code is True
-    assert args.port == 12312
-
-
-@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
-                                   (None, bool, [1, 2, 3])])
-def test_sha256(input: tuple):
-    digest = sha256(input)
-    assert digest is not None
-    assert isinstance(digest, bytes)
-    assert digest != b""
-
-    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert digest == hashlib.sha256(input_bytes).digest()
-
-    # hashing again, returns the same value
-    assert digest == sha256(input)
-
-    # hashing different input, returns different value
-    assert digest != sha256(input + (1, ))
-
-
-@pytest.mark.parametrize(
-    "path,expected",
-    [
-        ("ipc://some_path", ("ipc", "some_path", "")),
-        ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
-        ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
-        ("inproc://some_identifier", ("inproc", "some_identifier", "")),
-    ])
-def test_split_zmq_path(path, expected):
-    assert split_zmq_path(path) == expected
-
-
-@pytest.mark.parametrize(
-    "invalid_path",
-    [
-        "invalid_path",  # Missing scheme
-        "tcp://127.0.0.1",  # Missing port
-        "tcp://[::1]",  # Missing port for IPv6
-        "tcp://:5555",  # Missing host
-    ])
-def test_split_zmq_path_invalid(invalid_path):
-    with pytest.raises(ValueError):
-        split_zmq_path(invalid_path)
-
-
-def test_make_zmq_socket_ipv6():
-    # Check if IPv6 is supported by trying to create an IPv6 socket
-    try:
-        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
-        sock.close()
-    except socket.error:
-        pytest.skip("IPv6 is not supported on this system")
-
-    ctx = zmq.Context()
-    ipv6_path = "tcp://[::]:5555"  # IPv6 loopback address
-    socket_type = zmq.REP  # Example socket type
-
-    # Create the socket
-    zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
-
-    # Verify that the IPV6 option is set
-    assert zsock.getsockopt(
-        zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
-
-    # Clean up
-    zsock.close()
-    ctx.term()
-
-
-def test_make_zmq_path():
-    assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
-    assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"
-
-
-def test_get_tcp_uri():
-    assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
-    assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"
-
-
-def test_split_host_port():
-    # valid ipv4
-    assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
-    # invalid ipv4
-    with pytest.raises(ValueError):
-        # multi colon
-        assert split_host_port("127.0.0.1::5555")
-    with pytest.raises(ValueError):
-        # tailing colon
-        assert split_host_port("127.0.0.1:5555:")
-    with pytest.raises(ValueError):
-        # no colon
-        assert split_host_port("127.0.0.15555")
-    with pytest.raises(ValueError):
-        # none int port
-        assert split_host_port("127.0.0.1:5555a")
-
-    # valid ipv6
-    assert split_host_port("[::1]:5555") == ("::1", 5555)
-    # invalid ipv6
-    with pytest.raises(ValueError):
-        # multi colon
-        assert split_host_port("[::1]::5555")
-    with pytest.raises(IndexError):
-        # no colon
-        assert split_host_port("[::1]5555")
-    with pytest.raises(ValueError):
-        # none int port
-        assert split_host_port("[::1]:5555a")
-
-
-def test_join_host_port():
-    assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
-    assert join_host_port("::1", 5555) == "[::1]:5555"
-
-
-def test_json_count_leaves():
-    """Test json_count_leaves function from jsontree utility."""
-    from vllm.utils.jsontree import json_count_leaves
-
-    # Single leaf values
-    assert json_count_leaves(42) == 1
-    assert json_count_leaves("hello") == 1
-    assert json_count_leaves(None) == 1
-
-    # Empty containers
-    assert json_count_leaves([]) == 0
-    assert json_count_leaves({}) == 0
-    assert json_count_leaves(()) == 0
-
-    # Flat structures
-    assert json_count_leaves([1, 2, 3]) == 3
-    assert json_count_leaves({"a": 1, "b": 2}) == 2
-    assert json_count_leaves((1, 2, 3)) == 3
-
-    # Nested structures
-    nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
-    assert json_count_leaves(nested_dict) == 3
-
-    nested_list = [1, [2, 3], 4]
-    assert json_count_leaves(nested_list) == 4
-
-    mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
-    assert json_count_leaves(mixed_nested) == 4
-
-
-def test_convert_ids_list_to_tokens():
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
-    token_ids = tokenizer.encode("Hello, world!")
-    # token_ids = [9707, 11, 1879, 0]
-    assert tokenizer.convert_ids_to_tokens(token_ids) == [
-        'Hello', ',', 'Ġworld', '!'
-    ]
-    tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
-    assert tokens == ['Hello', ',', ' world', '!']
-
-
-def test_current_stream_multithread():
-    import threading
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA not available")
-
-    main_default_stream = torch.cuda.current_stream()
-    child_stream = torch.cuda.Stream()
-
-    thread_stream_ready = threading.Event()
-    thread_can_exit = threading.Event()
-
-    def child_thread_func():
-        with torch.cuda.stream(child_stream):
-            thread_stream_ready.set()
-            thread_can_exit.wait(timeout=10)
-
-    child_thread = threading.Thread(target=child_thread_func)
-    child_thread.start()
-
-    try:
-        assert thread_stream_ready.wait(
-            timeout=5), "Child thread failed to enter stream context in time"
-
-        main_current_stream = current_stream()
-
-        assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
-        assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
-
-        # Notify child thread it can exit
-        thread_can_exit.set()
-
-    finally:
-        # Ensure child thread exits properly
-        child_thread.join(timeout=5)
-        if child_thread.is_alive():
-            pytest.fail("Child thread failed to exit properly")
-
-
-def test_load_config_file(tmp_path):
-    # Define the configuration data
-    config_data = {
-        "enable-logging": True,
-        "list-arg": ["item1", "item2"],
-        "port": 12323,
-        "tensor-parallel-size": 4
-    }
-
-    # Write the configuration data to a temporary YAML file
-    config_file_path = tmp_path / "config.yaml"
-    with open(config_file_path, "w") as config_file:
-        yaml.dump(config_data, config_file)
-
-    # Initialize the parser
-    parser = FlexibleArgumentParser()
-
-    # Call the function with the temporary file path
-    processed_args = parser.load_config_file(str(config_file_path))
-
-    # Expected output
-    expected_args = [
-        "--enable-logging",
-        "--list-arg",
-        "item1",
-        "item2",
-        "--port",
-        "12323",
-        "--tensor-parallel-size",
-        "4",
-    ]
-
-    # Assert that the processed arguments match the expected output
-    assert processed_args == expected_args
-    os.remove(str(config_file_path))
-
-
-def test_unique_filepath():
-    temp_dir = tempfile.mkdtemp()
-    path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
-    paths = set()
-    for i in range(10):
-        path = unique_filepath(path_fn)
-        path.write_text("test")
-        paths.add(path)
-    assert len(paths) == 10
-    assert len(list(Path(temp_dir).glob("*.txt"))) == 10
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py
index 6c17be759ab6b68c83f732ac25d2c38453c94b35..b46002c5fa8ff5903d67d72abb93f43ed8e13e24 100644
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -1,35 +1,45 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for v1 attention backends without GPUModelRunner dependency."""
+
 from functools import partial
-from typing import Optional, Union
 
 import pytest
 import torch
 from torch.nn.attention.flex_attention import create_block_mask, flex_attention
 
-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      create_vllm_config,
-                                      get_attention_backend)
+from tests.v1.attention.utils import (
+    BatchSpec,
+    create_common_attn_metadata,
+    create_standard_kv_cache_spec,
+    create_vllm_config,
+    try_get_attention_backend,
+)
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import ModelConfig
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer
-from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
-                                              set_kv_cache_layout)
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer
+from vllm.v1.attention.backends.utils import (
+    CommonAttentionMetadata,
+    set_kv_cache_layout,
+)
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
-    _Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
+    AttentionBackendEnum.FLASH_ATTN,
+    AttentionBackendEnum.FLASHINFER,
+    AttentionBackendEnum.FLEX_ATTENTION,
+    AttentionBackendEnum.TRITON_ATTN,
+    AttentionBackendEnum.TREE_ATTN,
+    "FLEX_ATTENTION_SLOW",
 ]
 
 # Remove flashinfer from the list if it's not available
 try:
     import flashinfer  # noqa: F401
 except ImportError:
-    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)
+    BACKENDS_TO_TEST.remove(AttentionBackendEnum.FLASHINFER)
 
 
 def _convert_dtype_to_torch(dtype):
@@ -49,42 +59,38 @@ def _convert_dtype_to_torch(dtype):
 
 # Define common batch configurations
 BATCH_SPECS = {
-    "small_decode":
-    BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]),
-    "small_prefill":
-    BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]),
-    "mixed_small":
-    BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]),
-    "medium_decode":
-    BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024],
-              query_lens=[1, 1, 1, 1, 1, 1, 1, 1]),
-    "medium_prefill":
-    BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]),
-    "mixed_medium":
-    BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048],
-              query_lens=[1, 1, 1, 7, 7, 7]),
-    "large_decode":
-    BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32),
-    "large_prefill":
-    BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8),
-    "single_decode":
-    BatchSpec(seq_lens=[1024], query_lens=[1]),
-    "single_prefill":
-    BatchSpec(seq_lens=[1024], query_lens=[64]),
+    "small_decode": BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]),
+    "small_prefill": BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]),
+    "mixed_small": BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]),
+    "medium_decode": BatchSpec(
+        seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024],
+        query_lens=[1, 1, 1, 1, 1, 1, 1, 1],
+    ),
+    "medium_prefill": BatchSpec(
+        seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]
+    ),
+    "mixed_medium": BatchSpec(
+        seq_lens=[512, 1024, 2048, 512, 1024, 2048], query_lens=[1, 1, 1, 7, 7, 7]
+    ),
+    "large_decode": BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32),
+    "large_prefill": BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8),
+    "single_decode": BatchSpec(seq_lens=[1024], query_lens=[1]),
+    "single_prefill": BatchSpec(seq_lens=[1024], query_lens=[64]),
 }
 
 
 def create_and_prepopulate_kv_cache(
-        k_contexts: list[torch.Tensor],
-        v_contexts: list[torch.Tensor],
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        num_blocks: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        randomize_blocks: bool = True) -> torch.Tensor:
+    k_contexts: list[torch.Tensor],
+    v_contexts: list[torch.Tensor],
+    block_size: int,
+    num_kv_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    num_blocks: int,
+    common_attn_metadata: CommonAttentionMetadata,
+    randomize_blocks: bool = True,
+) -> torch.Tensor:
     """Create and prepopulate a KV cache with context data.
 
     Args:
@@ -106,20 +112,18 @@ def create_and_prepopulate_kv_cache(
     """
     batch_size = len(k_contexts)
     seq_lens = common_attn_metadata.seq_lens_cpu
-    query_lens = common_attn_metadata.query_start_loc_cpu[
-        1:] - common_attn_metadata.query_start_loc_cpu[:-1]
+    query_lens = (
+        common_attn_metadata.query_start_loc_cpu[1:]
+        - common_attn_metadata.query_start_loc_cpu[:-1]
+    )
     context_lens = common_attn_metadata.num_computed_tokens_cpu
     block_table = common_attn_metadata.block_table_tensor
     slot_mapping = common_attn_metadata.slot_mapping
 
     # Create KV cache
-    kv_cache = torch.empty(2,
-                           num_blocks,
-                           block_size,
-                           num_kv_heads,
-                           head_size,
-                           dtype=dtype,
-                           device=device)
+    kv_cache = torch.empty(
+        2, num_blocks, block_size, num_kv_heads, head_size, dtype=dtype, device=device
+    )
     kv_cache_flat = kv_cache.view(2, -1, num_kv_heads, head_size)
 
     # Populate the cache with the context tokens
@@ -168,8 +172,8 @@ def create_and_prepopulate_kv_cache(
         start = common_attn_metadata.query_start_loc_cpu[i]
         end = common_attn_metadata.query_start_loc_cpu[i + 1]
         slot_mapping[start:end] = block_table[
-            i,
-            block_indices] * block_size + token_inter_block_offsets.to(device)
+            i, block_indices
+        ] * block_size + token_inter_block_offsets.to(device)
 
     return kv_cache
 
@@ -188,7 +192,7 @@ class MockAttentionLayer:
 
 
 def run_attention_backend(
-    backend: _Backend,
+    backend: AttentionBackendEnum,
     kv_cache_spec: FullAttentionSpec,
     layer_names: list[str],
     vllm_config,
@@ -198,7 +202,7 @@ def run_attention_backend(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    sliding_window: Optional[int] = None,
+    sliding_window: int | None = None,
 ) -> torch.Tensor:
     """Run attention computation using the specified backend's AttentionImpl."""
 
@@ -207,13 +211,13 @@ def run_attention_backend(
 
     use_direct_block_mask = is_torch_equal_or_newer("2.9.0.dev0")
     if backend == "FLEX_ATTENTION_SLOW":
-        actual_backend = _Backend.FLEX_ATTENTION
+        actual_backend = AttentionBackendEnum.FLEX_ATTENTION
         use_direct_block_mask = False
 
-    builder_cls, impl_cls = get_attention_backend(actual_backend)
+    builder_cls, impl_cls = try_get_attention_backend(actual_backend)
 
     # Mock flashinfer's get_per_layer_parameters if needed
-    if actual_backend == _Backend.FLASHINFER:
+    if actual_backend == AttentionBackendEnum.FLASHINFER:
         import unittest.mock
 
         from vllm.v1.attention.backends.utils import PerLayerParameters
@@ -222,20 +226,19 @@ def run_attention_backend(
             # Return mock parameters for a single layer
             head_size = vllm_config.model_config.get_head_size()
             return {
-                layer_name:
-                PerLayerParameters(
+                layer_name: PerLayerParameters(
                     window_left=-1,  # No sliding window
                     logits_soft_cap=0.0,  # No soft cap
-                    sm_scale=1.0 / (head_size**0.5)  # Standard scale
+                    sm_scale=1.0 / (head_size**0.5),  # Standard scale
                 )
                 for layer_name in layer_names
             }
 
         with unittest.mock.patch(
-                'vllm.v1.attention.backends.flashinfer.get_per_layer_parameters',
-                mock_get_per_layer_parameters):
-            builder = builder_cls(kv_cache_spec, layer_names, vllm_config,
-                                  device)
+            "vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
+            mock_get_per_layer_parameters,
+        ):
+            builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device)
             attn_metadata = builder.build(
                 common_prefix_len=0,
                 common_attn_metadata=common_attn_metadata,
@@ -243,7 +246,7 @@ def run_attention_backend(
     else:
         # Build metadata
         builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device)
-        if actual_backend == _Backend.FLEX_ATTENTION:
+        if actual_backend == AttentionBackendEnum.FLEX_ATTENTION:
             builder.direct_build = use_direct_block_mask
         attn_metadata = builder.build(
             common_prefix_len=0,
@@ -252,9 +255,11 @@ def run_attention_backend(
 
     # Instantiate implementation
     num_heads = vllm_config.model_config.get_num_attention_heads(
-        vllm_config.parallel_config)
+        vllm_config.parallel_config
+    )
     num_kv_heads = vllm_config.model_config.get_num_kv_heads(
-        vllm_config.parallel_config)
+        vllm_config.parallel_config
+    )
     head_size = vllm_config.model_config.get_head_size()
     scale = 1.0 / (head_size**0.5)
     impl = impl_cls(
@@ -274,13 +279,9 @@ def run_attention_backend(
     # Run forward pass
     # NOTE: The query, key, and value are already shaped correctly
     # in the calling test function.
-    output = impl.forward(mock_layer,
-                          query,
-                          key,
-                          value,
-                          kv_cache,
-                          attn_metadata,
-                          output=output)
+    output = impl.forward(
+        mock_layer, query, key, value, kv_cache, attn_metadata, output=output
+    )
 
     return output
 
@@ -288,12 +289,13 @@ def run_attention_backend(
 def _test_backend_correctness(
     batch_spec: BatchSpec,
     model: str,
-    backend_to_test: list[Union[_Backend, str]],
+    backend_to_test: list[AttentionBackendEnum | str],
     mask_mod,
     *,
     block_size: int = 16,
     atol: float = 1e-2,
     rtol: float = 1e-2,
+    tensor_parallel_size: int = 1,
 ):
     """
     Test that all backends produce similar outputs to a reference implementation
@@ -309,12 +311,39 @@ def _test_backend_correctness(
     4. Running each vLLM attention backend with the new queries and the
        simulated paged KV cache.
     5. Comparing the vLLM backend's output to the ground-truth SDPA output.
+
+    Note: When tensor_parallel_size > 1, we simulate the head partitioning
+    by overriding the model config to use fewer heads, without requiring
+    multiple GPUs. This tests that backends work correctly with different
+    head counts.
     """
     current_platform.seed_everything(42)
-    vllm_config = create_vllm_config(model_name=model,
-                                     max_model_len=max(batch_spec.seq_lens),
-                                     block_size=block_size,
-                                     num_gpu_blocks=8192)
+
+    hf_config_override = None
+    if tensor_parallel_size > 1:
+        from vllm.config import ModelConfig
+
+        temp_config = ModelConfig(model=model, max_model_len=1)
+        original_num_heads = temp_config.hf_text_config.num_attention_heads
+        original_num_kv_heads = getattr(
+            temp_config.hf_text_config, "num_key_value_heads", None
+        )
+        hf_config_override = {
+            "num_attention_heads": original_num_heads // tensor_parallel_size,
+        }
+        if original_num_kv_heads is not None:
+            hf_config_override["num_key_value_heads"] = max(
+                1, original_num_kv_heads // tensor_parallel_size
+            )
+
+    vllm_config = create_vllm_config(
+        model_name=model,
+        tensor_parallel_size=1,  # Always use TP=1 to avoid multi-GPU requirements
+        max_model_len=max(batch_spec.seq_lens),
+        block_size=block_size,
+        num_gpu_blocks=8192,
+        hf_config_override=hf_config_override,
+    )
     device = torch.device("cuda:0")
 
     kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
@@ -324,9 +353,11 @@ def _test_backend_correctness(
     seq_lens = batch_spec.seq_lens
     query_lens = batch_spec.query_lens
     num_q_heads = vllm_config.model_config.get_num_attention_heads(
-        vllm_config.parallel_config)
+        vllm_config.parallel_config
+    )
     num_kv_heads = vllm_config.model_config.get_num_kv_heads(
-        vllm_config.parallel_config)
+        vllm_config.parallel_config
+    )
     head_size = vllm_config.model_config.get_head_size()
     sliding_window = vllm_config.model_config.get_sliding_window()
     dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
@@ -344,21 +375,9 @@ def _test_backend_correctness(
         context_len = s_len - q_len
 
         # Generate Q, K, V for the whole sequence to be used in SDPA
-        q = torch.randn(q_len,
-                        num_q_heads,
-                        head_size,
-                        dtype=dtype,
-                        device=device)
-        k_full = torch.randn(s_len,
-                             num_kv_heads,
-                             head_size,
-                             dtype=dtype,
-                             device=device)
-        v_full = torch.randn(s_len,
-                             num_kv_heads,
-                             head_size,
-                             dtype=dtype,
-                             device=device)
+        q = torch.randn(q_len, num_q_heads, head_size, dtype=dtype, device=device)
+        k_full = torch.randn(s_len, num_kv_heads, head_size, dtype=dtype, device=device)
+        v_full = torch.randn(s_len, num_kv_heads, head_size, dtype=dtype, device=device)
 
         # SDPA expects (N, H, L, D), so unsqueeze batch and permute
         q_sdpa_in = q.unsqueeze(0).transpose(1, 2)
@@ -368,7 +387,8 @@ def _test_backend_correctness(
         if num_q_heads != num_kv_heads:
             assert num_q_heads % num_kv_heads == 0, (
                 f"num_q_heads ({num_q_heads}) must be divisible by "
-                f"num_kv_heads ({num_kv_heads})")
+                f"num_kv_heads ({num_kv_heads})"
+            )
             repeats = num_q_heads // num_kv_heads
             k_sdpa_in = k_sdpa_in.repeat_interleave(repeats, dim=1)
             v_sdpa_in = v_sdpa_in.repeat_interleave(repeats, dim=1)
@@ -378,18 +398,17 @@ def _test_backend_correctness(
         kv_len = s_len
 
         final_mask_mod = partial(mask_mod, context_len=context_len)
-        block_mask = create_block_mask(final_mask_mod,
-                                       B=None,
-                                       H=None,
-                                       Q_LEN=q_len,
-                                       KV_LEN=kv_len,
-                                       device=device)
-        sdpa_out_i = flex_attention(q_sdpa_in,
-                                    k_sdpa_in,
-                                    v_sdpa_in,
-                                    block_mask=block_mask,
-                                    scale=scale,
-                                    enable_gqa=True)
+        block_mask = create_block_mask(
+            final_mask_mod, B=None, H=None, Q_LEN=q_len, KV_LEN=kv_len, device=device
+        )
+        sdpa_out_i = flex_attention(
+            q_sdpa_in,
+            k_sdpa_in,
+            v_sdpa_in,
+            block_mask=block_mask,
+            scale=scale,
+            enable_gqa=True,
+        )
 
         all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0))
 
@@ -408,7 +427,8 @@ def _test_backend_correctness(
     sdpa_output = torch.cat(all_sdpa_outputs, dim=0)
 
     common_attn_metadata = create_common_attn_metadata(
-        batch_spec, vllm_config.cache_config.block_size, device)
+        batch_spec, vllm_config.cache_config.block_size, device
+    )
 
     # 3. Simulate Paged KV Cache and a realistic slot_mapping
     kv_cache = create_and_prepopulate_kv_cache(
@@ -421,7 +441,8 @@ def _test_backend_correctness(
         device=device,
         num_blocks=vllm_config.cache_config.num_gpu_blocks or 1000,
         common_attn_metadata=common_attn_metadata,
-        randomize_blocks=True)
+        randomize_blocks=True,
+    )
 
     # 4. Run vLLM backends and compare
     # Note: flex_attention has known Triton kernel compatibility issues
@@ -429,63 +450,92 @@ def _test_backend_correctness(
     for backend_name in backend_to_test:
         # FlashAttentionm + FlexAttention:
         #   [2, num_blocks, block_size, num_kv_heads, head_size]
-        # FlashInfer:
+        # FlashInfer + Triton:
         #   [num_blocks, 2, block_size, num_kv_heads, head_size]
         # Select the appropriate KV cache format for each backend
         kv_cache_for_backend = kv_cache
-        if backend_name == _Backend.FLASHINFER:
+        reset_kv_cache_layout = False
+        if backend_name in (
+            AttentionBackendEnum.FLASHINFER,
+            AttentionBackendEnum.TRITON_ATTN,
+        ):
             kv_cache_for_backend = kv_cache.transpose(0, 1)
 
+        if backend_name == AttentionBackendEnum.FLASHINFER:
             # For FlashInfer default to HND layout and
-            kv_cache_for_backend = kv_cache_for_backend.transpose(
-                2, 3).contiguous().transpose(2, 3)
+            kv_cache_for_backend = (
+                kv_cache_for_backend.transpose(2, 3).contiguous().transpose(2, 3)
+            )
             set_kv_cache_layout("HND")
-
-        backend_output = run_attention_backend(
-            backend_name,
-            kv_cache_spec,
-            ["placeholder"],
-            vllm_config,
-            device,
-            common_attn_metadata,
-            query_vllm,
-            key_vllm,
-            value_vllm,
-            kv_cache_for_backend,
-            sliding_window=sliding_window,
-        )
+            reset_kv_cache_layout = True
+        elif backend_name == AttentionBackendEnum.TRITON_ATTN:
+            kv_cache_for_backend = kv_cache_for_backend.contiguous()
+
+        try:
+            backend_output = run_attention_backend(
+                backend_name,
+                kv_cache_spec,
+                ["placeholder"],
+                vllm_config,
+                device,
+                common_attn_metadata,
+                query_vllm,
+                key_vllm,
+                value_vllm,
+                kv_cache_for_backend,
+                sliding_window=sliding_window,
+            )
+        finally:
+            if reset_kv_cache_layout:
+                set_kv_cache_layout(None)
 
         # Check shape and dtype consistency
         assert backend_output.shape == sdpa_output.shape, (
             f"[{backend_name}] shape {backend_output.shape} != "
-            f"SDPA shape {sdpa_output.shape}")
+            f"SDPA shape {sdpa_output.shape}"
+        )
         assert backend_output.dtype == sdpa_output.dtype, (
             f"[{backend_name}] dtype {backend_output.dtype} != "
-            f"SDPA dtype {sdpa_output.dtype}")
+            f"SDPA dtype {sdpa_output.dtype}"
+        )
 
         assert torch.isfinite(backend_output).all(), (
-            f"[{backend_name}] produced non-finite values")
+            f"[{backend_name}] produced non-finite values"
+        )
 
         # Check numerical similarity
         def error_msg(msg: str, backend_name: str):
-            return (f"[{backend_name}] output differs from SDPA baseline. "
-                    f"{msg}")
-
-        torch.testing.assert_close(backend_output,
-                                   sdpa_output,
-                                   rtol=rtol,
-                                   atol=atol,
-                                   msg=partial(error_msg,
-                                               backend_name=backend_name))
-
-
-@pytest.mark.parametrize("batch_spec_name", [
-    "small_decode", "small_prefill", "mixed_small", "medium_decode",
-    "medium_prefill", "mixed_medium", "large_decode", "large_prefill",
-    "single_decode", "single_prefill"
-])
+            return f"[{backend_name}] output differs from SDPA baseline. {msg}"
+
+        torch.testing.assert_close(
+            backend_output,
+            sdpa_output,
+            rtol=rtol,
+            atol=atol,
+            msg=partial(error_msg, backend_name=backend_name),
+        )
+
+
+@pytest.mark.parametrize(
+    "batch_spec_name",
+    [
+        "small_decode",
+        "small_prefill",
+        "mixed_small",
+        "medium_decode",
+        "medium_prefill",
+        "mixed_medium",
+        "large_decode",
+        "large_prefill",
+        "single_decode",
+        "single_prefill",
+    ],
+)
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_causal_backend_correctness(batch_spec_name: str, model: str):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_causal_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """Test backend's correctness with causal attention."""
 
     def causal_mask_mod(
@@ -499,35 +549,51 @@ def test_causal_backend_correctness(batch_spec_name: str, model: str):
         return (q_idx + context_len) >= kv_idx
 
     batch_spec = BATCH_SPECS[batch_spec_name]
-    LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
-                            if is_torch_equal_or_newer("2.9.0.dev0") else [])
+    LARGE_BLOCK_BACKENDS = (
+        [AttentionBackendEnum.FLEX_ATTENTION]
+        if is_torch_equal_or_newer("2.9.0.dev0")
+        else []
+    )
     SMALL_BLOCK_BACKENDS = [
         x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
     ]
-    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
-                              causal_mask_mod)
+    _test_backend_correctness(
+        batch_spec,
+        model,
+        SMALL_BLOCK_BACKENDS,
+        causal_mask_mod,
+        tensor_parallel_size=tensor_parallel_size,
+    )
 
     # Fast FlexAttention needs to run with block_size=128
     if LARGE_BLOCK_BACKENDS:
-        _test_backend_correctness(batch_spec,
-                                  model,
-                                  LARGE_BLOCK_BACKENDS,
-                                  causal_mask_mod,
-                                  block_size=128)
+        _test_backend_correctness(
+            batch_spec,
+            model,
+            LARGE_BLOCK_BACKENDS,
+            causal_mask_mod,
+            block_size=128,
+            tensor_parallel_size=tensor_parallel_size,
+        )
 
 
 SLIDING_WINDOW_BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
-    "FLEX_ATTENTION_SLOW"
+    AttentionBackendEnum.FLASH_ATTN,
+    AttentionBackendEnum.FLEX_ATTENTION,
+    AttentionBackendEnum.TRITON_ATTN,
+    "FLEX_ATTENTION_SLOW",
 ]
 
 
-@pytest.mark.parametrize("batch_spec_name", [
-    "small_decode", "small_prefill", "mixed_medium", "large_decode",
-    "large_prefill"
-])
+@pytest.mark.parametrize(
+    "batch_spec_name",
+    ["small_decode", "small_prefill", "mixed_medium", "large_decode", "large_prefill"],
+)
 @pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"])
-def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_sliding_window_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """Test backend's correctness with sliding window attention."""
 
     def sliding_window_mask_mod(
@@ -544,25 +610,35 @@ def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
         return causal_mask & window_mask
 
     batch_spec = BATCH_SPECS[batch_spec_name]
-    model_config = ModelConfig(model=model,
-                               max_model_len=max(batch_spec.seq_lens))
+    model_config = ModelConfig(model=model, max_model_len=max(batch_spec.seq_lens))
     sliding_window = model_config.get_sliding_window()
-    sliding_window_mask_mod_fn = partial(sliding_window_mask_mod,
-                                         sliding_window=sliding_window)
+    sliding_window_mask_mod_fn = partial(
+        sliding_window_mask_mod, sliding_window=sliding_window
+    )
 
-    LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
-                            if is_torch_equal_or_newer("2.9.0.dev0") else [])
+    LARGE_BLOCK_BACKENDS = (
+        [AttentionBackendEnum.FLEX_ATTENTION]
+        if is_torch_equal_or_newer("2.9.0.dev0")
+        else []
+    )
     SMALL_BLOCK_BACKENDS = [
-        x for x in SLIDING_WINDOW_BACKENDS_TO_TEST
-        if x not in LARGE_BLOCK_BACKENDS
+        x for x in SLIDING_WINDOW_BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
     ]
-    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
-                              sliding_window_mask_mod_fn)
+    _test_backend_correctness(
+        batch_spec,
+        model,
+        SMALL_BLOCK_BACKENDS,
+        sliding_window_mask_mod_fn,
+        tensor_parallel_size=tensor_parallel_size,
+    )
 
     # Fast FlexAttention needs to run with block_size=128
     if LARGE_BLOCK_BACKENDS:
-        _test_backend_correctness(batch_spec,
-                                  model,
-                                  LARGE_BLOCK_BACKENDS,
-                                  sliding_window_mask_mod_fn,
-                                  block_size=128)
+        _test_backend_correctness(
+            batch_spec,
+            model,
+            LARGE_BLOCK_BACKENDS,
+            sliding_window_mask_mod_fn,
+            block_size=128,
+            tensor_parallel_size=tensor_parallel_size,
+        )
diff --git a/tests/v1/attention/test_attention_backends_selection.py b/tests/v1/attention/test_attention_backends_selection.py
index 59e56281494680a893e29bbae160b5d51c7b5f15..6464bb52a4eaa56e4e03e6da672253aea1bacd47 100644
--- a/tests/v1/attention/test_attention_backends_selection.py
+++ b/tests/v1/attention/test_attention_backends_selection.py
@@ -9,17 +9,16 @@ import pytest
 from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.short_conv import ShortConv
-from vllm.model_executor.models.minimax_text_01 import (
-    MiniMaxText01LinearAttention)
+from vllm.model_executor.models.minimax_text_01 import MiniMaxText01LinearAttention
 from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend
-from vllm.v1.attention.backends.short_conv_attn import (
-    ShortConvAttentionBackend)
+from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend
 
 
 @pytest.mark.parametrize(
-    "layer_class, init_kwargs, expected_backend, expected_mamba_type", [
+    "layer_class, init_kwargs, expected_backend, expected_mamba_type",
+    [
         (
             MambaMixer,
             dict(
@@ -77,9 +76,11 @@ from vllm.v1.attention.backends.short_conv_attn import (
             ShortConvAttentionBackend,
             "short_conv",
         ),
-    ])
-def test_mamba_layers_get_attn_backend(dist_init, layer_class, init_kwargs,
-                                       expected_backend, expected_mamba_type):
+    ],
+)
+def test_mamba_layers_get_attn_backend(
+    dist_init, layer_class, init_kwargs, expected_backend, expected_mamba_type
+):
     """Test that Mamba-like layers return the correct attention backend."""
     layer = layer_class(**init_kwargs)
 
@@ -88,17 +89,23 @@ def test_mamba_layers_get_attn_backend(dist_init, layer_class, init_kwargs,
     assert layer.mamba_type == expected_mamba_type
 
 
-@pytest.mark.parametrize("layer_class,expected_backend,expected_mamba_type", [
-    (MambaMixer, Mamba1AttentionBackend, "mamba1"),
-    (MambaMixer2, Mamba2AttentionBackend, "mamba2"),
-    (MiniMaxText01LinearAttention, LinearAttentionBackend, "linear_attention"),
-    (ShortConv, ShortConvAttentionBackend, "short_conv"),
-])
-def test_mamba_layers_have_unified_interface(layer_class, expected_backend,
-                                             expected_mamba_type):
-    """Test that all Mamba layers have the unified get_attn_backend 
+@pytest.mark.parametrize(
+    "layer_class,expected_backend,expected_mamba_type",
+    [
+        (MambaMixer, Mamba1AttentionBackend, "mamba1"),
+        (MambaMixer2, Mamba2AttentionBackend, "mamba2"),
+        (MiniMaxText01LinearAttention, LinearAttentionBackend, "linear_attention"),
+        (ShortConv, ShortConvAttentionBackend, "short_conv"),
+    ],
+)
+def test_mamba_layers_have_unified_interface(
+    layer_class, expected_backend, expected_mamba_type
+):
+    """Test that all Mamba layers have the unified get_attn_backend
     interface."""
-    assert hasattr(layer_class, 'get_attn_backend'), (
-        f"{layer_class.__name__} should have get_attn_backend method")
-    assert hasattr(layer_class, 'mamba_type'), (
-        f"{layer_class.__name__} should have mamba_type property")
+    assert hasattr(layer_class, "get_attn_backend"), (
+        f"{layer_class.__name__} should have get_attn_backend method"
+    )
+    assert hasattr(layer_class, "mamba_type"), (
+        f"{layer_class.__name__} should have mamba_type property"
+    )
diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py
index d81f3da7e9cd9fe15e7ae642b9a063d24e355797..1cbd0fe56be6de93c53db642d12b655e9070a9ea 100644
--- a/tests/v1/attention/test_attention_splitting.py
+++ b/tests/v1/attention/test_attention_splitting.py
@@ -6,12 +6,14 @@ import torch
 
 from tests.v1.attention.test_attention_backends import BATCH_SPECS
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
-from vllm.v1.attention.backends.utils import (UBatchSlice,
-                                              _make_metadata_with_slice,
-                                              slice_query_start_locs,
-                                              split_attn_metadata,
-                                              split_decodes_and_prefills)
-from vllm.v1.worker.ubatch_splitting import create_ubatch_slices
+from vllm.v1.attention.backends.utils import (
+    UBatchSlice,
+    _make_metadata_with_slice,
+    slice_query_start_locs,
+    split_attn_metadata,
+    split_decodes_and_prefills,
+)
+from vllm.v1.worker.ubatch_utils import create_ubatch_slices
 
 
 @pytest.fixture
@@ -79,9 +81,7 @@ def small_decode_metadata():
     """Create metadata for small decode batch"""
     batch_spec = BATCH_SPECS["small_decode"]
     device = torch.device("cpu")
-    return create_common_attn_metadata(batch_spec,
-                                       block_size=16,
-                                       device=device)
+    return create_common_attn_metadata(batch_spec, block_size=16, device=device)
 
 
 @pytest.fixture
@@ -89,9 +89,7 @@ def large_decode_metadata():
     """Create metadata for small decode batch"""
     batch_spec = BATCH_SPECS["large_decode"]
     device = torch.device("cpu")
-    return create_common_attn_metadata(batch_spec,
-                                       block_size=16,
-                                       device=device)
+    return create_common_attn_metadata(batch_spec, block_size=16, device=device)
 
 
 @pytest.fixture
@@ -99,9 +97,7 @@ def mixed_small_metadata():
     """Create metadata for mixed small batch"""
     batch_spec = BATCH_SPECS["mixed_small"]
     device = torch.device("cpu")
-    return create_common_attn_metadata(batch_spec,
-                                       block_size=16,
-                                       device=device)
+    return create_common_attn_metadata(batch_spec, block_size=16, device=device)
 
 
 # Tests for _make_metadata_with_slice
@@ -122,8 +118,7 @@ def test_make_metadata_with_slice_decode_batch(small_decode_metadata):
 
 def test_make_metadata_with_slice_mixed_batch(mixed_small_metadata):
     """Test slicing mixed batch metadata"""
-    ubatch_slice = UBatchSlice(slice(1, 3),
-                               slice(1, 7))  # Requests 1-3, tokens 1-7
+    ubatch_slice = UBatchSlice(slice(1, 3), slice(1, 7))  # Requests 1-3, tokens 1-7
 
     result = _make_metadata_with_slice(ubatch_slice, mixed_small_metadata)
 
@@ -140,8 +135,7 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata):
     mid_point = num_tokens // 2
     ubatch_slices = [
         UBatchSlice(slice(0, mid_point), slice(0, mid_point)),
-        UBatchSlice(slice(mid_point, num_tokens), slice(mid_point,
-                                                        num_tokens)),
+        UBatchSlice(slice(mid_point, num_tokens), slice(mid_point, num_tokens)),
     ]
 
     results = split_attn_metadata(ubatch_slices, large_decode_metadata)
@@ -159,26 +153,30 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata):
     assert torch.equal(results[1].seq_lens, torch.tensor([2048] * mid_point))
 
 
-def apply_split_decodes_and_prefills(query_lens: list[int],
-                                     decode_threshold: int,
-                                     require_uniform: bool):
+def apply_split_decodes_and_prefills(
+    query_lens: list[int], decode_threshold: int, require_uniform: bool
+):
     """Helper function to apply split_decodes_and_prefills and return
     the results."""
     device = torch.device("cpu")
     seq_lens = [10 * (i + 1) for i in range(len(query_lens))]
-    common_metadata = create_common_attn_metadata(BatchSpec(
-        seq_lens=seq_lens, query_lens=query_lens),
-                                                  block_size=16,
-                                                  device=device)
-    return split_decodes_and_prefills(common_metadata,
-                                      decode_threshold=decode_threshold,
-                                      require_uniform=require_uniform)
+    common_metadata = create_common_attn_metadata(
+        BatchSpec(seq_lens=seq_lens, query_lens=query_lens),
+        block_size=16,
+        device=device,
+    )
+    return split_decodes_and_prefills(
+        common_metadata,
+        decode_threshold=decode_threshold,
+        require_uniform=require_uniform,
+    )
 
 
 def test_split_decodes_and_prefills_nonuniform_all_ones():
     query_lens = [1, 1, 1]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 1, False))
+        apply_split_decodes_and_prefills(query_lens, 1, False)
+    )
     assert num_decodes == 3
     assert num_prefills == 0
     assert num_decode_tokens == 3
@@ -188,7 +186,8 @@ def test_split_decodes_and_prefills_nonuniform_all_ones():
 def test_split_decodes_and_prefills_nonuniform_all_short_decodes():
     query_lens = [1, 2, 1, 3, 2, 1, 2]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 3, False))
+        apply_split_decodes_and_prefills(query_lens, 3, False)
+    )
     assert num_decodes == 7
     assert num_prefills == 0
     assert num_decode_tokens == sum(query_lens)
@@ -198,7 +197,8 @@ def test_split_decodes_and_prefills_nonuniform_all_short_decodes():
 def test_split_decodes_and_prefills_nonuniform_all_prefills():
     query_lens = [4, 5, 6, 7]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 3, False))
+        apply_split_decodes_and_prefills(query_lens, 3, False)
+    )
     assert num_decodes == 0
     assert num_prefills == 4
     assert num_decode_tokens == 0
@@ -208,7 +208,8 @@ def test_split_decodes_and_prefills_nonuniform_all_prefills():
 def test_split_decodes_and_prefills_nonuniform_mixed_batch():
     query_lens = [2, 1, 3, 4, 5, 6, 7, 8]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 4, False))
+        apply_split_decodes_and_prefills(query_lens, 4, False)
+    )
     assert num_decodes == 4  # 2, 1, 3, 4 are all <= 4
     assert num_prefills == 4  # 5, 6, 7, 8 are all > 4
     assert num_decode_tokens == 10  # 2 + 1 + 3 + 4
@@ -218,7 +219,8 @@ def test_split_decodes_and_prefills_nonuniform_mixed_batch():
 def test_split_decodes_and_prefills_uniform_all_ones():
     query_lens = [1, 1, 1]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 1, True))
+        apply_split_decodes_and_prefills(query_lens, 1, True)
+    )
     assert num_decodes == 3
     assert num_prefills == 0
     assert num_decode_tokens == 3
@@ -228,7 +230,8 @@ def test_split_decodes_and_prefills_uniform_all_ones():
 def test_split_decodes_and_prefills_uniform_all_short_decodes():
     query_lens = [2, 2, 1, 3, 2, 1, 2]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 3, True))
+        apply_split_decodes_and_prefills(query_lens, 3, True)
+    )
     assert num_decodes == 2
     assert num_prefills == 5
     assert num_decode_tokens == 4
@@ -238,7 +241,8 @@ def test_split_decodes_and_prefills_uniform_all_short_decodes():
 def test_split_decodes_and_prefills_uniform_all_prefills():
     query_lens = [4, 5, 6, 7]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 3, True))
+        apply_split_decodes_and_prefills(query_lens, 3, True)
+    )
     assert num_decodes == 0
     assert num_prefills == 4
     assert num_decode_tokens == 0
@@ -248,7 +252,8 @@ def test_split_decodes_and_prefills_uniform_all_prefills():
 def test_split_decodes_and_prefills_uniform_mixed_batch_all_uniform_decodes():
     query_lens = [2, 2, 2, 4, 5, 6, 7, 8]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 4, True))
+        apply_split_decodes_and_prefills(query_lens, 4, True)
+    )
     assert num_decodes == 3  # 2, 2, 2 are all <= 4 and uniform
     assert num_prefills == 5  # 4, 5, 6, 7, 8 are all > 4
     assert num_decode_tokens == 6  # 2 + 2 + 2
@@ -258,7 +263,8 @@ def test_split_decodes_and_prefills_uniform_mixed_batch_all_uniform_decodes():
 def test_split_decodes_and_prefills_uniform_mixed_batch_non_uniform_decodes():
     query_lens = [2, 1, 2, 4, 5, 6, 7, 8]
     num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-        apply_split_decodes_and_prefills(query_lens, 4, True))
+        apply_split_decodes_and_prefills(query_lens, 4, True)
+    )
     assert num_decodes == 1  # only the first 2 is taken as decode
     assert num_prefills == 7  # 1, 2, 4, 5, 6, 7, 8 are all > 4 or non-uniform
     assert num_decode_tokens == 2  # only the first 2
@@ -274,17 +280,15 @@ def test_split_decodes_and_prefills_uniform_mixed_batch_non_uniform_decodes():
         ([32, 40], [8, 8], 4, 1, 2),
     ],
 )
-def test_prefill_split_across_ubatches(seq_lens, query_lens, split_point,
-                                       expected_first_reqs,
-                                       expected_second_reqs):
+def test_prefill_split_across_ubatches(
+    seq_lens, query_lens, split_point, expected_first_reqs, expected_second_reqs
+):
     """Test splitting a prefill across ubatches"""
     import numpy as np
 
     device = torch.device("cpu")
     batch_spec = BatchSpec(seq_lens=seq_lens, query_lens=query_lens)
-    common = create_common_attn_metadata(batch_spec,
-                                         block_size=16,
-                                         device=device)
+    common = create_common_attn_metadata(batch_spec, block_size=16, device=device)
 
     num_scheduled_tokens = np.array(query_lens, dtype=np.int32)
     qsl_np = common.query_start_loc_cpu.numpy()
@@ -307,19 +311,19 @@ def test_prefill_split_across_ubatches(seq_lens, query_lens, split_point,
     # Identify which request is split and how many tokens are in the first chunk
     split_req_idx = int(np.searchsorted(qsl_np, split_point, side="right") - 1)
     tokens_in_first_chunk = split_point - int(qsl_np[split_req_idx])
-    orig_q_lens = (common.query_start_loc_cpu[1:] -
-                   common.query_start_loc_cpu[:-1])
+    orig_q_lens = common.query_start_loc_cpu[1:] - common.query_start_loc_cpu[:-1]
 
     # Check query length continuity: first-chunk + second-chunk == original qlen
     # First ubatch last request query length
-    qlen_first_last = int(first_meta.query_start_loc_cpu[-1] -
-                          first_meta.query_start_loc_cpu[-2])
+    qlen_first_last = int(
+        first_meta.query_start_loc_cpu[-1] - first_meta.query_start_loc_cpu[-2]
+    )
     # Second ubatch first request query length
-    qlen_second_first = int(second_meta.query_start_loc_cpu[1] -
-                            second_meta.query_start_loc_cpu[0])
+    qlen_second_first = int(
+        second_meta.query_start_loc_cpu[1] - second_meta.query_start_loc_cpu[0]
+    )
     assert qlen_first_last == tokens_in_first_chunk
-    assert qlen_first_last + qlen_second_first == int(
-        orig_q_lens[split_req_idx])
+    assert qlen_first_last + qlen_second_first == int(orig_q_lens[split_req_idx])
 
     # Check seq_lens adjustments
     # Context lengths per original request
diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py
new file mode 100644
index 0000000000000000000000000000000000000000..e37219454222bcf0e69ad1d9fb7b8b25b350ac6a
--- /dev/null
+++ b/tests/v1/attention/test_batch_reordering.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+from vllm.v1.attention.backends.utils import reorder_batch_to_split_decodes_and_prefills
+
+
+class MockInputBatch:
+    def __init__(self, req_ids, num_computed_tokens_cpu):
+        self.req_ids = req_ids
+        self.num_computed_tokens_cpu = num_computed_tokens_cpu
+
+    def swap_states(self, i, j):
+        self.req_ids[i], self.req_ids[j] = self.req_ids[j], self.req_ids[i]
+        self.num_computed_tokens_cpu[i], self.num_computed_tokens_cpu[j] = (
+            self.num_computed_tokens_cpu[j],
+            self.num_computed_tokens_cpu[i],
+        )
+
+
+class MockSchedulerOutput:
+    def __init__(self, num_scheduled_tokens):
+        self.num_scheduled_tokens = num_scheduled_tokens
+
+
+@dataclass
+class ReorderTestCase:
+    requests: list[tuple[int, int]]  # (num_scheduled_tokens, num_computed_tokens)
+    expected_order: list[int]
+    expected_modified: bool
+    decode_threshold: int = 1
+
+
+# Test cases for batch reordering
+REORDER_TEST_CASES = {
+    "all_decodes": ReorderTestCase(
+        requests=[(1, 10), (1, 20), (1, 30)],
+        expected_order=[0, 1, 2],
+        expected_modified=False,
+    ),
+    "all_prefills": ReorderTestCase(
+        requests=[(100, 100), (200, 200), (300, 300)],
+        expected_order=[0, 1, 2],
+        expected_modified=False,
+    ),
+    "mixed_interleaved": ReorderTestCase(
+        requests=[(100, 100), (1, 10), (200, 200), (1, 20)],
+        expected_order=[3, 1, 2, 0],  # Only swap 0↔3, keep 1 and 2 in place
+        expected_modified=True,
+    ),
+    "already_ordered": ReorderTestCase(
+        requests=[(1, 10), (1, 20), (100, 100), (200, 0)],
+        expected_order=[0, 1, 2, 3],
+        expected_modified=False,
+    ),
+    "single_request": ReorderTestCase(
+        requests=[(1, 10)],
+        expected_order=[0],
+        expected_modified=False,
+    ),
+    "higher_threshold": ReorderTestCase(
+        requests=[(2, 10), (3, 20), (5, 30), (6, 40)],
+        expected_order=[0, 1, 2, 3],
+        expected_modified=False,
+        decode_threshold=4,
+    ),
+    "decodes_at_end": ReorderTestCase(
+        requests=[(100, 100), (200, 200), (1, 10), (1, 20)],
+        expected_order=[2, 3, 0, 1],
+        expected_modified=True,
+    ),
+    "decode_extend_prefill": ReorderTestCase(
+        requests=[(100, 0), (10, 50), (1, 10)],
+        expected_order=[2, 1, 0],
+        expected_modified=True,
+    ),
+    "extend_prefill_only": ReorderTestCase(
+        requests=[(100, 0), (10, 50), (200, 0), (20, 75)],
+        expected_order=[3, 1, 2, 0],  # Only swap 0↔3, keep 1 and 2 in place
+        expected_modified=True,
+    ),
+    "complicated_mixed_interleaved": ReorderTestCase(
+        requests=[
+            (1, 20),
+            (1, 50),
+            (374, 0),
+            (300, 20),
+            (1, 20),
+            (256, 0),
+            (1, 5),
+            (27, 0),
+            (1, 4),
+        ],
+        expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
+        expected_modified=True,
+    ),
+}
+
+
+@pytest.mark.parametrize(
+    "test_case", REORDER_TEST_CASES.values(), ids=REORDER_TEST_CASES.keys()
+)
+def test_reorder_batch_to_split_decodes_and_prefills(test_case: ReorderTestCase):
+    req_ids = [f"r{i}" for i in range(len(test_case.requests))]
+    num_computed_tokens = np.array([r[1] for r in test_case.requests], dtype=np.int32)
+    num_scheduled_tokens = {f"r{i}": r[0] for i, r in enumerate(test_case.requests)}
+
+    input_batch = MockInputBatch(req_ids, num_computed_tokens)
+    scheduler_output = MockSchedulerOutput(num_scheduled_tokens)
+
+    modified = reorder_batch_to_split_decodes_and_prefills(
+        input_batch, scheduler_output, decode_threshold=test_case.decode_threshold
+    )
+
+    expected_req_ids = [f"r{i}" for i in test_case.expected_order]
+
+    assert modified == test_case.expected_modified, (
+        f"Expected modified={test_case.expected_modified}, got {modified}"
+    )
+    assert input_batch.req_ids == expected_req_ids, (
+        f"Expected order {expected_req_ids}, got {input_batch.req_ids}"
+    )
diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py
index be77256a0d2f01547e87ab0723cf417b0f92cfc7..faace3473a281f071c69a073eae9160e8c601923 100644
--- a/tests/v1/attention/test_chunked_local_attention.py
+++ b/tests/v1/attention/test_chunked_local_attention.py
@@ -7,8 +7,7 @@ import pytest
 import torch
 
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
-from vllm.v1.attention.backends.utils import (
-    make_local_attention_virtual_batches)
+from vllm.v1.attention.backends.utils import make_local_attention_virtual_batches
 
 
 @dataclass
@@ -46,21 +45,24 @@ test_data_list = [
             [17, 17],  # local-batch 5, (batch 1, starting from k[16])
             [20, 21],  # local-batch 6, (batch 2, starting from k[4])
             [22, 23],  # local-batch 7, (batch 2, starting from k[8])
-        ]),
+        ],
+    ),
     # Case where block indices are not clipped to block table ncols-1
     # because tokens_in_last_block == attn_chunk_size
-    LocalAttentionTestData(batch_spec=BatchSpec(
-        query_lens=[8],
-        seq_lens=[12],
+    LocalAttentionTestData(
+        batch_spec=BatchSpec(
+            query_lens=[8],
+            seq_lens=[12],
+        ),
+        attn_chunk_size=4,
+        block_size=2,
+        expected_q_seqlens=[4, 4],
+        expected_k_seqlens=[4, 4],
+        expected_local_block_table=[
+            [2, 3],
+            [4, 5],
+        ],
     ),
-                           attn_chunk_size=4,
-                           block_size=2,
-                           expected_q_seqlens=[4, 4],
-                           expected_k_seqlens=[4, 4],
-                           expected_local_block_table=[
-                               [2, 3],
-                               [4, 5],
-                           ]),
     # Case where all kv_seq positions are involved in attn
     LocalAttentionTestData(
         batch_spec=BatchSpec(
@@ -76,7 +78,8 @@ test_data_list = [
             [0, 1],
             [2, 3],
             [4, 4],
-        ]),
+        ],
+    ),
     # Case where attn_chunk_size > kv_seq_len
     # so no extra mini virtual batches are created
     LocalAttentionTestData(
@@ -97,7 +100,8 @@ test_data_list = [
         # is calculated as (attn_chunk_size // block_size)
         expected_local_block_table=[
             [0, 1, 2, 2, 2],
-        ]),
+        ],
+    ),
     # Block size equal to chunk size
     # Expect single page per batch in local batch table
     LocalAttentionTestData(
@@ -118,7 +122,8 @@ test_data_list = [
             [1],  # local-batch 1, (batch 0, starting from k[4])
             [2],  # local-batch 1, (batch 0, starting from k[0])
             [3],  # local-batch 1, (batch 0, starting from k[4])
-        ]),
+        ],
+    ),
     # Case where query falls in the second attention chunk
     #  k_toks >   0 1 2 3 4
     #  q_toks v  _____________
@@ -128,17 +133,19 @@ test_data_list = [
     #         3 | 1 1 1 1
     #         4 |         1
     #  where tokens 0,1,2,3 have been pre-computed
-    LocalAttentionTestData(batch_spec=BatchSpec(
-        query_lens=[1],
-        seq_lens=[5],
+    LocalAttentionTestData(
+        batch_spec=BatchSpec(
+            query_lens=[1],
+            seq_lens=[5],
+        ),
+        attn_chunk_size=4,
+        block_size=2,
+        expected_q_seqlens=[1],
+        expected_k_seqlens=[1],
+        expected_local_block_table=[
+            [2, 2],
+        ],
     ),
-                           attn_chunk_size=4,
-                           block_size=2,
-                           expected_q_seqlens=[1],
-                           expected_k_seqlens=[1],
-                           expected_local_block_table=[
-                               [2, 2],
-                           ]),
 ]
 
 
@@ -165,9 +172,9 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
     )
 
     # Call the function
-    result = make_local_attention_virtual_batches(attn_chunk_size,
-                                                  common_attn_metadata,
-                                                  block_size)
+    result = make_local_attention_virtual_batches(
+        attn_chunk_size, common_attn_metadata, block_size
+    )
 
     # Convert to numpy for easier comparison
     actual_q_seqlens = np.diff(result.query_start_loc_cpu.numpy())
@@ -184,13 +191,11 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
     np.testing.assert_array_equal(actual_q_seqlens, expected_q_seqlens)
     np.testing.assert_array_equal(actual_k_seqlens, expected_k_seqlens)
 
-    expected_block_table_tensor =\
-        torch.tensor(expected_local_block_table,
-        dtype=torch.int32,
-        device=device)
+    expected_block_table_tensor = torch.tensor(
+        expected_local_block_table, dtype=torch.int32, device=device
+    )
 
     print(f"Expected block table:\n{expected_block_table_tensor}")
     print(f"Actual block table:\n{result.block_table_tensor}")
 
-    torch.testing.assert_close(result.block_table_tensor,
-                               expected_block_table_tensor)
+    torch.testing.assert_close(result.block_table_tensor, expected_block_table_tensor)
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index 228551573ba892d3804a723931d47647b7ad5325..1bd05e6183dc2ef1ce1b04975171077d164bd1a4 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -1,30 +1,75 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for v1 MLA backends without GPUModelRunner dependency."""
-from typing import Optional, Union
+"""Tests for v1 MLA backends without GPUModelRunner dependency.
+
+Known Issues:
+- FLASH_ATTN_MLA backend occasionally produces NaN values in
+  test_backend_correctness[mixed_small] when run after
+  test_backend_correctness[small_prefill], but passes when run alone.
+"""
 
 import pytest
 import torch
 
-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      create_vllm_config,
-                                      get_attention_backend)
+from tests.v1.attention.utils import (
+    BatchSpec,
+    create_common_attn_metadata,
+    create_vllm_config,
+    try_get_attention_backend,
+)
 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+from vllm.attention.utils.fa_utils import flash_attn_supports_mla
+from vllm.config.vllm import set_current_vllm_config
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.mla.common import QueryLenSupport
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 BACKENDS_TO_TEST = [
-    _Backend.CUTLASS_MLA, _Backend.FLASHMLA, _Backend.FLASH_ATTN_MLA,
-    _Backend.TRITON_MLA
+    AttentionBackendEnum.CUTLASS_MLA,
+    AttentionBackendEnum.FLASHMLA,
+    AttentionBackendEnum.FLASH_ATTN_MLA,
+    AttentionBackendEnum.FLASHINFER_MLA,
+    AttentionBackendEnum.TRITON_MLA,
 ]
 
-# Remove CUTLASS_MLA from the list if not using sm100
-if not torch.cuda.is_available() or torch.cuda.get_device_properties(
-        0).major < 10:
-    BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA)
+# Remove sm100 backends from the list if not using sm100
+if not torch.cuda.is_available() or torch.cuda.get_device_properties(0).major < 10:
+    BACKENDS_TO_TEST.remove(AttentionBackendEnum.CUTLASS_MLA)
+    BACKENDS_TO_TEST.remove(AttentionBackendEnum.FLASHINFER_MLA)
+
+# Remove FLASH_ATTN_MLA from the list if not supported
+if not flash_attn_supports_mla():
+    BACKENDS_TO_TEST.remove(AttentionBackendEnum.FLASH_ATTN_MLA)
+
+# Remove FLASHMLA from the list if not supported
+if not is_flashmla_dense_supported()[0]:
+    BACKENDS_TO_TEST.remove(AttentionBackendEnum.FLASHMLA)
+
+SPEC_DECODE_BACKENDS = []
+for backend in BACKENDS_TO_TEST:
+    builder_cls, _ = try_get_attention_backend(backend)
+    query_len_support = getattr(
+        builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY
+    )
+    if query_len_support != QueryLenSupport.SINGLE_ONLY:
+        SPEC_DECODE_BACKENDS.append(backend)
+
+BACKEND_BLOCK_SIZES = {}
+for backend in BACKENDS_TO_TEST:
+    supported_sizes = backend.get_class().supported_kernel_block_sizes
+    if supported_sizes:
+        default_size = supported_sizes[0]
+        block_size = (
+            default_size if isinstance(default_size, int) else default_size.base
+        )
+    else:
+        block_size = 16
+    BACKEND_BLOCK_SIZES[backend] = block_size
 
 torch.manual_seed(42)
 
@@ -46,45 +91,47 @@ def _convert_dtype_to_torch(dtype):
 
 # Define common batch configurations
 BATCH_SPECS = {
-    "small_decode":
-    BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]),
-    "small_prefill":
-    BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]),
-    "mixed_small":
-    BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]),
-    "medium_decode":
-    BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024],
-              query_lens=[1, 1, 1, 1, 1, 1, 1, 1]),
-    "medium_prefill":
-    BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]),
-    "mixed_medium":
-    BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048],
-              query_lens=[1, 1, 1, 7, 7, 7]),
-    "large_decode":
-    BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32),
-    "large_prefill":
-    BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8),
-    "single_decode":
-    BatchSpec(seq_lens=[1024], query_lens=[1]),
-    "single_prefill":
-    BatchSpec(seq_lens=[1024], query_lens=[64]),
+    "small_decode": BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]),
+    "small_prefill": BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]),
+    "mixed_small": BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]),
+    "medium_decode": BatchSpec(
+        seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024],
+        query_lens=[1, 1, 1, 1, 1, 1, 1, 1],
+    ),
+    "medium_prefill": BatchSpec(
+        seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]
+    ),
+    "mixed_medium": BatchSpec(
+        seq_lens=[512, 1024, 2048, 512, 1024, 2048], query_lens=[1, 1, 1, 7, 7, 7]
+    ),
+    "large_decode": BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32),
+    "large_prefill": BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8),
+    "single_decode": BatchSpec(seq_lens=[1024], query_lens=[1]),
+    "single_prefill": BatchSpec(seq_lens=[1024], query_lens=[64]),
+    "spec_decode_small": BatchSpec(
+        seq_lens=[128, 256, 512, 1024], query_lens=[4, 4, 4, 4]
+    ),
+    "spec_decode_medium": BatchSpec(
+        seq_lens=[512, 1024, 2048, 512, 1024, 2048], query_lens=[8, 8, 8, 8, 8, 8]
+    ),
 }
 
 
 def create_and_prepopulate_kv_cache(
-        kv_c_contexts: list[torch.Tensor],
-        k_pe_contexts: list[torch.Tensor],
-        block_size: int,
-        head_size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        num_blocks: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        randomize_blocks: bool = True,
-        kv_cache_dtype: Optional[str] = None,
-        scale: Union[float, torch.Tensor] = 1.0) -> torch.Tensor:
+    kv_c_contexts: list[torch.Tensor],
+    k_pe_contexts: list[torch.Tensor],
+    block_size: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    num_blocks: int,
+    common_attn_metadata: CommonAttentionMetadata,
+    randomize_blocks: bool = True,
+    kv_cache_dtype: str | None = None,
+    scale: float | torch.Tensor = 1.0,
+) -> torch.Tensor:
     """Create and prepopulate an MLA KV cache with context data.
-    
+
     Args:
         kv_c_contexts: List of latent KV context tensors for each sequence
         k_pe_contexts: List of key positional embedding context tensors
@@ -95,21 +142,23 @@ def create_and_prepopulate_kv_cache(
         device: Device to create the cache on
         num_blocks: Total number of blocks in the cache
         common_attn_metadata: Common attention metadata
-        randomize_blocks: Whether to randomly permute blocks 
+        randomize_blocks: Whether to randomly permute blocks
                           or use sequential order
         kv_cache_dtype: Optional kv cache dtype string. When set to
                         "fp8_ds_mla" the cache is populated using the
                         fp8 DeepSeek MLA layout via concat_and_cache_mla.
         scale: Scaling factor forwarded to concat_and_cache_mla when the
                fp8 cache layout is requested.
-        
+
     Returns:
         MLA KV cache tensor
     """
     batch_size = len(kv_c_contexts)
     seq_lens = common_attn_metadata.seq_lens_cpu
-    query_lens = common_attn_metadata.query_start_loc_cpu[
-        1:] - common_attn_metadata.query_start_loc_cpu[:-1]
+    query_lens = (
+        common_attn_metadata.query_start_loc_cpu[1:]
+        - common_attn_metadata.query_start_loc_cpu[:-1]
+    )
     context_lens = common_attn_metadata.num_computed_tokens_cpu
     block_table = common_attn_metadata.block_table_tensor
     slot_mapping = common_attn_metadata.slot_mapping
@@ -118,27 +167,26 @@ def create_and_prepopulate_kv_cache(
 
     if use_fp8_ds_mla:
         if not kv_c_contexts:
-            raise ValueError("kv_c_contexts cannot be empty when using"
-                             " fp8_ds_mla cache dtype")
+            raise ValueError(
+                "kv_c_contexts cannot be empty when using fp8_ds_mla cache dtype"
+            )
         kv_lora_rank = kv_c_contexts[0].shape[-1]
         rope_dim = k_pe_contexts[0].shape[-1]
         entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
-        kv_cache = torch.zeros(num_blocks,
-                               block_size,
-                               entry_size,
-                               dtype=torch.uint8,
-                               device=device)
-        scale_tensor = (scale
-                        if isinstance(scale, torch.Tensor) else torch.tensor(
-                            scale, dtype=torch.float32, device=device))
+        kv_cache = torch.zeros(
+            num_blocks, block_size, entry_size, dtype=torch.uint8, device=device
+        )
+        scale_tensor = (
+            scale
+            if isinstance(scale, torch.Tensor)
+            else torch.tensor(scale, dtype=torch.float32, device=device)
+        )
         scale_tensor = scale_tensor.to(device=device, dtype=torch.float32)
     else:
         # Create MLA KV cache: (num_blocks, block_size, head_size)
-        kv_cache = torch.empty(num_blocks,
-                               block_size,
-                               head_size,
-                               dtype=dtype,
-                               device=device)
+        kv_cache = torch.zeros(
+            num_blocks, block_size, head_size, dtype=dtype, device=device
+        )
         kv_cache_flat = kv_cache.view(-1, head_size)
 
     # Populate the cache with the context tokens
@@ -154,8 +202,7 @@ def create_and_prepopulate_kv_cache(
         start = start_block_idx * block_size
 
         if use_fp8_ds_mla:
-            slots = torch.arange(context_len, device=device,
-                                 dtype=torch.long) + start
+            slots = torch.arange(context_len, device=device, dtype=torch.long) + start
             ops.concat_and_cache_mla(
                 kv_c_context,
                 k_pe_context.squeeze(1),
@@ -165,8 +212,7 @@ def create_and_prepopulate_kv_cache(
                 scale=scale_tensor,
             )
         else:
-            kv_context = torch.cat(
-                [kv_c_context, k_pe_context.squeeze(1)], dim=-1)
+            kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1)
             end = start + kv_context.shape[0]
             kv_cache_flat[start:end, ...] = kv_context
 
@@ -177,15 +223,14 @@ def create_and_prepopulate_kv_cache(
 
     # Permute the context blocks (excluding block 0 which is null)
     if randomize_blocks:
-        perm = torch.randperm(
-            blocks_end - 1) + 1  # Random permutation starting from block 1
+        perm = (
+            torch.randperm(blocks_end - 1) + 1
+        )  # Random permutation starting from block 1
     else:
-        perm = torch.arange(
-            1, blocks_end)  # Sequential order starting from block 1
+        perm = torch.arange(1, blocks_end)  # Sequential order starting from block 1
 
     inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device)
-    inv_perm[1:] = torch.argsort(
-        perm) + 1  # Add 1 to account for starting from block 1
+    inv_perm[1:] = torch.argsort(perm) + 1  # Add 1 to account for starting from block 1
     kv_cache[1:blocks_end, ...] = kv_cache[perm, ...]
 
     # Construct the right block table
@@ -196,6 +241,7 @@ def create_and_prepopulate_kv_cache(
         start = start_block_idx
         end = start + num_blocks_for_seq
         block_table[i, :num_blocks_for_seq] = inv_perm[start:end]
+        block_table[i, num_blocks_for_seq:] = 0
         start_block_idx += num_blocks_for_seq
 
         # Create a realistic slot mapping that corresponds to the block table
@@ -206,8 +252,8 @@ def create_and_prepopulate_kv_cache(
         start = common_attn_metadata.query_start_loc_cpu[i]
         end = common_attn_metadata.query_start_loc_cpu[i + 1]
         slot_mapping[start:end] = block_table[
-            i,
-            block_indices] * block_size + token_inter_block_offsets.to(device)
+            i, block_indices
+        ] * block_size + token_inter_block_offsets.to(device)
 
     return kv_cache
 
@@ -219,88 +265,137 @@ class MockAttentionLayer:
         self._q_scale = torch.tensor(1.0, device=device)
         self._k_scale = torch.tensor(1.0, device=device)
         self._v_scale = torch.tensor(1.0, device=device)
-
-
-def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
-                          layer_names: list[str], vllm_config,
-                          device: torch.device,
-                          common_attn_metadata: CommonAttentionMetadata,
-                          query: torch.Tensor, kv_c: torch.Tensor,
-                          k_pe: torch.Tensor, kv_cache: torch.Tensor,
-                          kv_lora_rank: int, qk_nope_head_dim: int,
-                          qk_rope_head_dim: int, v_head_dim: int,
-                          mock_kv_b_proj) -> torch.Tensor:
+        self._prob_scale = torch.tensor(1.0, device=device)
+        self._q_scale_float = 1.0
+        self._k_scale_float = 1.0
+        self._v_scale_float = 1.0
+
+    def forward(self, *_args, **_kwargs):
+        raise NotImplementedError
+
+
+class MockMLAAttentionLayer(AttentionLayerBase):
+    """A mock MLA attention layer for populating static_forward_context."""
+
+    def __init__(self, impl):
+        self.impl = impl
+
+    def get_attn_backend(self):
+        raise NotImplementedError
+
+    def get_kv_cache_spec(self, vllm_config):
+        raise NotImplementedError
+
+
+def run_attention_backend(
+    backend: AttentionBackendEnum,
+    kv_cache_spec: FullAttentionSpec,
+    layer_names: list[str],
+    vllm_config,
+    device: torch.device,
+    common_attn_metadata: CommonAttentionMetadata,
+    query: torch.Tensor,
+    kv_c: torch.Tensor,
+    k_pe: torch.Tensor,
+    kv_cache: torch.Tensor,
+    kv_lora_rank: int,
+    qk_nope_head_dim: int,
+    qk_rope_head_dim: int,
+    v_head_dim: int,
+    mock_kv_b_proj,
+) -> torch.Tensor:
     """Run attention computation using the specified backend's AttentionImpl."""
 
-    builder_cls, impl_cls = get_attention_backend(backend)
-
-    # Build metadata
-    builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device)
-    attn_metadata = builder.build(
-        common_prefix_len=0,
-        common_attn_metadata=common_attn_metadata,
-    )
-
-    # Instantiate MLA implementation
-    num_heads = vllm_config.model_config.get_num_attention_heads(
-        vllm_config.parallel_config)
-    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
-        vllm_config.parallel_config)
-    head_size = vllm_config.model_config.get_head_size()
-    scale = 1.0 / (head_size**0.5)
-    impl = impl_cls(
-        num_heads=num_heads,
-        head_size=head_size,
-        scale=scale,
-        num_kv_heads=num_kv_heads,
-        alibi_slopes=None,
-        sliding_window=None,
-        kv_cache_dtype="auto",
-        logits_soft_cap=None,
-        attn_type="decoder",
-        kv_sharing_target_layer_name=None,
-        q_lora_rank=None,
-        kv_lora_rank=kv_lora_rank,
-        qk_nope_head_dim=qk_nope_head_dim,
-        qk_rope_head_dim=qk_rope_head_dim,
-        qk_head_dim=qk_nope_head_dim + qk_rope_head_dim,
-        v_head_dim=v_head_dim,
-        kv_b_proj=mock_kv_b_proj,
-    )
+    builder_cls, impl_cls = try_get_attention_backend(backend)
+
+    # Set the current vllm config so that get_current_vllm_config() works
+    # in the backend implementations
+    with set_current_vllm_config(vllm_config):
+        # Instantiate MLA implementation
+        num_heads = vllm_config.model_config.get_num_attention_heads(
+            vllm_config.parallel_config
+        )
+        num_kv_heads = vllm_config.model_config.get_num_kv_heads(
+            vllm_config.parallel_config
+        )
+        head_size = vllm_config.model_config.get_head_size()
+        scale = 1.0 / (head_size**0.5)
+        impl = impl_cls(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=None,
+            sliding_window=None,
+            kv_cache_dtype="auto",
+            logits_soft_cap=None,
+            attn_type="decoder",
+            kv_sharing_target_layer_name=None,
+            q_lora_rank=None,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            qk_head_dim=qk_nope_head_dim + qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            kv_b_proj=mock_kv_b_proj,
+        )
+
+        # Process weights to create W_UK_T and W_UV attributes needed by MLA
+        act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
+        impl.process_weights_after_loading(act_dtype)
+
+        # Populate static_forward_context with mock attention layers
+        for layer_name in layer_names:
+            vllm_config.compilation_config.static_forward_context[layer_name] = (
+                MockMLAAttentionLayer(impl)
+            )
 
-    # Process weights to create W_UK_T and W_UV attributes needed by MLA
-    act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
-    impl.process_weights_after_loading(act_dtype)
-
-    # Create mock layer and output buffer
-    mock_layer = MockAttentionLayer(device)
-    num_tokens = query.shape[0]
-    output = torch.empty(num_tokens,
-                         num_heads * v_head_dim,
-                         dtype=query.dtype,
-                         device=query.device)
-
-    # Run forward pass
-    # NOTE: The query, key, and value are already shaped correctly
-    # in the calling test function.
-    output = impl.forward(mock_layer,
-                          query,
-                          kv_c,
-                          k_pe,
-                          kv_cache,
-                          attn_metadata,
-                          output=output)
-
-    return output
-
-
-@pytest.mark.parametrize("batch_spec_name", [
-    "small_decode", "small_prefill", "mixed_small", "medium_decode",
-    "medium_prefill", "mixed_medium", "large_decode", "large_prefill",
-    "single_decode", "single_prefill"
-])
-@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
-def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
+        # Build metadata
+        builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device)
+        attn_metadata = builder.build(
+            common_prefix_len=0,
+            common_attn_metadata=common_attn_metadata,
+        )
+
+        # Create mock layer and output buffer
+        mock_layer = MockAttentionLayer(device)
+        num_tokens = query.shape[0]
+        output = torch.empty(
+            num_tokens, num_heads * v_head_dim, dtype=query.dtype, device=query.device
+        )
+
+        # Run forward pass
+        # NOTE: The query, key, and value are already shaped correctly
+        # in the calling test function.
+        output = impl.forward(
+            mock_layer, query, kv_c, k_pe, kv_cache, attn_metadata, output=output
+        )
+
+        return output
+
+
+@pytest.mark.parametrize(
+    "batch_spec_name",
+    [
+        "small_decode",
+        "small_prefill",
+        "mixed_small",
+        "medium_decode",
+        "medium_prefill",
+        "mixed_medium",
+        "large_decode",
+        "large_prefill",
+        "single_decode",
+        "single_prefill",
+        "spec_decode_small",
+        "spec_decode_medium",
+    ],
+)
+@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"])
+@pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16])
+def test_backend_correctness(
+    dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """
     Test that all backends produce similar outputs to a reference implementation
     using torch.nn.functional.scaled_dot_product_attention.
@@ -315,31 +410,82 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     4. Running each vLLM attention backend with the new queries and the
        simulated paged KV cache.
     5. Comparing the vLLM backend's output to the ground-truth SDPA output.
+
+    Note: When tensor_parallel_size > 1, we simulate the head partitioning
+    by overriding the model config to use fewer heads, without requiring
+    multiple GPUs. This tests that backends work correctly with different
+    head counts.
     """
+
     batch_spec = BATCH_SPECS[batch_spec_name]
-    vllm_config = create_vllm_config(model_name=model,
-                                     max_model_len=max(batch_spec.seq_lens),
-                                     num_gpu_blocks=2048)
-    device = torch.device("cuda:0")
+    is_spec_decode_test = batch_spec_name.startswith("spec_decode")
+    unique_block_sizes = sorted(set(BACKEND_BLOCK_SIZES.values()))
+    default_block_size = unique_block_sizes[0]
+    required_blocks = sum(
+        (seq_len + default_block_size - 1) // default_block_size
+        for seq_len in batch_spec.seq_lens
+    )
+    # Add 1 for null block at index 0, and some buffer
+    num_gpu_blocks = required_blocks + 1 + 100
+
+    hf_config_override = None
+    if tensor_parallel_size > 1:
+        from vllm.config import ModelConfig
+
+        temp_config = ModelConfig(model=model, max_model_len=1)
+        original_num_heads = temp_config.hf_text_config.num_attention_heads
+        original_num_kv_heads = getattr(
+            temp_config.hf_text_config, "num_key_value_heads", None
+        )
+        hf_config_override = {
+            "num_attention_heads": original_num_heads // tensor_parallel_size,
+        }
+        if original_num_kv_heads is not None:
+            hf_config_override["num_key_value_heads"] = max(
+                1, original_num_kv_heads // tensor_parallel_size
+            )
+
+    vllm_config = create_vllm_config(
+        model_name=model,
+        tensor_parallel_size=1,  # Always use TP=1 to avoid multi-GPU requirements
+        max_model_len=max(batch_spec.seq_lens),
+        num_gpu_blocks=num_gpu_blocks,
+        block_size=default_block_size,
+        hf_config_override=hf_config_override,
+    )
+
+    # For spec decode tests, add a speculative_config to set the reorder_batch_threshold
+    if is_spec_decode_test:
+        from vllm.config import SpeculativeConfig
 
-    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
+        # Get the query length from the batch spec (they should all be uniform)
+        query_len = batch_spec.query_lens[0]
+        # Set num_speculative_tokens to query_len - 1
+        # (since threshold is 1 + num_spec_tokens)
+        # Use ngram method which doesn't require a draft model
+        vllm_config.speculative_config = SpeculativeConfig(
+            method="ngram", num_speculative_tokens=query_len - 1
+        )
+
+    device = torch.device("cuda:0")
 
     # 1. Setup
     batch_size = batch_spec.batch_size
     seq_lens = batch_spec.seq_lens
     query_lens = batch_spec.query_lens
     num_q_heads = vllm_config.model_config.get_num_attention_heads(
-        vllm_config.parallel_config)
+        vllm_config.parallel_config
+    )
     head_size = vllm_config.model_config.get_head_size()
     dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
-    block_size = vllm_config.cache_config.block_size
     kv_lora_rank = 512
     qk_rope_head_dim = 64
     qk_nope_head_dim = 128
     v_head_dim = 128
     total_head_size = kv_lora_rank + qk_rope_head_dim
-    assert kv_lora_rank + qk_rope_head_dim == head_size, \
+    assert kv_lora_rank + qk_rope_head_dim == head_size, (
         f"MLA dimensions don't match: {total_head_size} != {head_size}"
+    )
     scale = 1.0 / (total_head_size**0.5)
 
     # 2. Generate data and compute SDPA reference output for MLA
@@ -348,16 +494,12 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     kv_c_contexts, k_pe_contexts = [], []
 
     # Create shared MLA weight matrices for consistency across all sequences
-    W_UK = torch.randn(kv_lora_rank,
-                       num_q_heads,
-                       qk_nope_head_dim,
-                       dtype=dtype,
-                       device=device)
-    W_UV = torch.randn(kv_lora_rank,
-                       num_q_heads,
-                       v_head_dim,
-                       dtype=dtype,
-                       device=device)
+    W_UK = torch.randn(
+        kv_lora_rank, num_q_heads, qk_nope_head_dim, dtype=dtype, device=device
+    )
+    W_UV = torch.randn(
+        kv_lora_rank, num_q_heads, v_head_dim, dtype=dtype, device=device
+    )
     kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1)
 
     for i, backend in enumerate(BACKENDS_TO_TEST):
@@ -371,30 +513,51 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
         # Generate MLA tensors
         # Q has both nope and rope components:
         # [q_len, num_heads, qk_nope_head_dim + qk_rope_head_dim]
-        q_c = torch.randn(q_len,
-                          num_q_heads,
-                          qk_nope_head_dim + qk_rope_head_dim,
-                          dtype=dtype,
-                          device=device)
+        q_c = torch.randn(
+            q_len,
+            num_q_heads,
+            qk_nope_head_dim + qk_rope_head_dim,
+            dtype=dtype,
+            device=device,
+        )
 
         # KV_C (latent K/V): [s_len, kv_lora_rank]
-        kv_c_full = torch.randn(s_len,
-                                kv_lora_rank,
-                                dtype=dtype,
-                                device=device)
+        kv_c_full = torch.randn(s_len, kv_lora_rank, dtype=dtype, device=device)
 
         # K_PE (rope component): [s_len, 1, qk_rope_head_dim]
-        k_pe_full = torch.randn(s_len,
-                                1,
-                                qk_rope_head_dim,
-                                dtype=dtype,
-                                device=device)
-
-        # Determine if this is decode or prefill
+        k_pe_full = torch.randn(s_len, 1, qk_rope_head_dim, dtype=dtype, device=device)
+
+        # Determine if this sequence uses the decode pipeline or prefill
+        # pipeline for each backend
+        # NOTE: For spec decode tests with uniform query_len > 1, backends that
+        # support spec decode (FLASH_ATTN_MLA with varlen support, FLASHMLA with
+        # uniform support) will use the decode pipeline (MQA-style), while
+        # backends that only support single-token queries will use the prefill
+        # pipeline (MHA-style). This ensures the reference implementation
+        # matches each backend's actual decode/prefill pipeline path.
         is_decode = []
-        for i, backend in enumerate(BACKENDS_TO_TEST):
-            builder_cls, _ = get_attention_backend(backend)
-            is_decode.append(q_len <= builder_cls.reorder_batch_threshold)
+        for backend_idx, backend in enumerate(BACKENDS_TO_TEST):
+            builder_cls, _ = try_get_attention_backend(backend)
+            if is_spec_decode_test:
+                query_len_support = getattr(
+                    builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY
+                )
+                supports_spec = query_len_support != QueryLenSupport.SINGLE_ONLY
+                is_decode.append(supports_spec)
+            else:
+                threshold = getattr(builder_cls, "reorder_batch_threshold", None)
+                query_len_support = getattr(
+                    builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY
+                )
+                within_threshold = q_len <= threshold if threshold else False
+                if (
+                    within_threshold
+                    and query_len_support == QueryLenSupport.UNIFORM
+                    and i > 0
+                ):
+                    first_q_len = query_lens[0]
+                    within_threshold = q_len == first_q_len
+                is_decode.append(within_threshold)
 
         # Split q into nope and rope components
         q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
@@ -404,8 +567,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
         # Transform q_nope to latent space: q_nope @ W_UK
         # q_nope: [1, num_heads, qk_nope_head_dim]
         # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim]
-        ql_nope = torch.einsum("qnh,lnh->qnl", q_nope,
-                               W_UK)  # [1, num_heads, kv_lora_rank]
+        ql_nope = torch.einsum(
+            "qnh,lnh->qnl", q_nope, W_UK
+        )  # [1, num_heads, kv_lora_rank]
 
         # Build MQA attention inputs
         # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim]
@@ -431,25 +595,24 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
         v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2)
 
         sdpa_out_i_decode = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale)
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale
+        )
         sdpa_out_i_decode = sdpa_out_i_decode.transpose(1, 2).squeeze(
-            0)  # [1, num_heads, kv_lora_rank]
+            0
+        )  # [1, num_heads, kv_lora_rank]
 
         # Project back to output space: sdpa_out @ W_UV
-        sdpa_out_i_decode = torch.einsum("qnl,lnv->qnv", sdpa_out_i_decode,
-                                         W_UV)
+        sdpa_out_i_decode = torch.einsum("qnl,lnv->qnv", sdpa_out_i_decode, W_UV)
         sdpa_out_i_decode = sdpa_out_i_decode.flatten(start_dim=-2)
 
         #######################################################
         # Prefill path: MHA-style attention with full sequence
         # Apply kv_b_proj to the full kv_c tensor
         kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, kv_b_proj_weight)
-        k_nope_full, v_full = kv_nope_full.split(
-            [qk_nope_head_dim, v_head_dim], dim=-1)
+        k_nope_full, v_full = kv_nope_full.split([qk_nope_head_dim, v_head_dim], dim=-1)
 
         # Build attention inputs for full sequence
-        q_mha = torch.cat([q_nope, q_pe],
-                          dim=-1)  # [q_len, num_heads, total_dim]
+        q_mha = torch.cat([q_nope, q_pe], dim=-1)  # [q_len, num_heads, total_dim]
         k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1)
         k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1)
 
@@ -468,15 +631,16 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
 
         # Single attention call with custom mask
         sdpa_out_i_prefill = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale)
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale
+        )
         sdpa_out_i_prefill = sdpa_out_i_prefill.transpose(1, 2).squeeze(0)
         sdpa_out_i_prefill = sdpa_out_i_prefill.flatten(start_dim=-2)
 
-        for i, backend in enumerate(BACKENDS_TO_TEST):
-            if is_decode[i]:
-                all_sdpa_outputs[i].append(sdpa_out_i_decode)
+        for backend_idx, backend in enumerate(BACKENDS_TO_TEST):
+            if is_decode[backend_idx]:
+                all_sdpa_outputs[backend_idx].append(sdpa_out_i_decode)
             else:
-                all_sdpa_outputs[i].append(sdpa_out_i_prefill)
+                all_sdpa_outputs[backend_idx].append(sdpa_out_i_prefill)
 
         # Inputs for vLLM MLA backends are just the new tokens
         all_q_vllm.append(q_c)
@@ -491,74 +655,165 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     query_vllm = torch.cat(all_q_vllm, dim=0)
     kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0)
     k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0)
-    sdpa_outputs = []
-    for i, backend in enumerate(BACKENDS_TO_TEST):
-        sdpa_outputs.append(torch.cat(all_sdpa_outputs[i], dim=0))
+    sdpa_outputs = {}
+    for backend_idx, backend in enumerate(BACKENDS_TO_TEST):
+        sdpa_outputs[backend] = torch.cat(all_sdpa_outputs[backend_idx], dim=0)
 
     # Create mock kv_b_proj using the same weights as reference implementation
     from vllm.model_executor.layers.linear import ColumnParallelLinear
-    mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank,
-                                          output_size=num_q_heads *
-                                          (qk_nope_head_dim + v_head_dim),
-                                          bias=False).to(device=device,
-                                                         dtype=dtype)
+
+    mock_kv_b_proj = ColumnParallelLinear(
+        input_size=kv_lora_rank,
+        output_size=num_q_heads * (qk_nope_head_dim + v_head_dim),
+        bias=False,
+    ).to(device=device, dtype=dtype)
 
     # Set the mock weights to match our reference implementation
     # Reshape W_UK and W_UV to match the expected kv_b_proj format
     # [kv_lora_rank, num_heads, qk_nope_head_dim + v_head_dim]
     kv_b_proj_weight = kv_b_proj_weight.view(
-        kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim))
-    mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T)
-
-    # Create metadata using original batch spec
-    common_attn_metadata = create_common_attn_metadata(
-        batch_spec, vllm_config.cache_config.block_size, device)
-
-    # 3. Simulate Paged KV Cache and a realistic slot_mapping
-    kv_cache = create_and_prepopulate_kv_cache(
-        kv_c_contexts=kv_c_contexts,
-        k_pe_contexts=k_pe_contexts,
-        block_size=block_size,
-        head_size=head_size,
-        dtype=dtype,
-        device=device,
-        num_blocks=vllm_config.cache_config.num_gpu_blocks,
-        common_attn_metadata=common_attn_metadata,
-        randomize_blocks=True)
+        kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim)
+    )
+    mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T, requires_grad=False)
+
+    # 3. Create metadata and KV caches for each block size
+    # Group backends by block size and test each group
+    metadata_per_block_size = {}
+    kv_cache_per_block_size = {}
+
+    for block_size in unique_block_sizes:
+        # Create metadata for this block size
+        common_attn_metadata = create_common_attn_metadata(
+            batch_spec, block_size, device
+        )
+
+        # Pad block table to meet requirement:
+        # block_num % (128 / block_size) == 0
+        required_divisor = int(128 / block_size)
+        current_block_num = common_attn_metadata.block_table_tensor.shape[1]
+        if current_block_num % required_divisor != 0:
+            # Pad to next multiple of required_divisor
+            padded_block_num = (
+                (current_block_num + required_divisor - 1) // required_divisor
+            ) * required_divisor
+            padding_cols = padded_block_num - current_block_num
+            padding = torch.zeros(
+                (common_attn_metadata.block_table_tensor.shape[0], padding_cols),
+                dtype=torch.int32,
+                device=device,
+            )
+            common_attn_metadata.block_table_tensor = torch.cat(
+                [common_attn_metadata.block_table_tensor, padding], dim=1
+            )
+
+        metadata_per_block_size[block_size] = common_attn_metadata
+
+        # Create KV cache for this block size
+        required_blocks_for_size = sum(
+            (seq_len + block_size - 1) // block_size for seq_len in batch_spec.seq_lens
+        )
+        num_blocks_for_size = required_blocks_for_size + 1 + 100
+
+        kv_cache = create_and_prepopulate_kv_cache(
+            kv_c_contexts=kv_c_contexts,
+            k_pe_contexts=k_pe_contexts,
+            block_size=block_size,
+            head_size=head_size,
+            dtype=dtype,
+            device=device,
+            num_blocks=num_blocks_for_size,
+            common_attn_metadata=common_attn_metadata,
+            randomize_blocks=True,
+        )
+        kv_cache_per_block_size[block_size] = kv_cache
 
     # 4. Run vLLM backends and compare
-    for i, backend_name in enumerate(BACKENDS_TO_TEST):
+    failures = []
+    for backend_idx, backend_name in enumerate(BACKENDS_TO_TEST):
+        # Skip backends that don't support spec decode for spec decode tests
+        if is_spec_decode_test and backend_name not in SPEC_DECODE_BACKENDS:
+            continue
+
+        # Get the appropriate block_size, metadata, and cache for this backend
+        block_size = BACKEND_BLOCK_SIZES[backend_name]
+        common_attn_metadata = metadata_per_block_size[block_size]
+        kv_cache = kv_cache_per_block_size[block_size]
+
+        # Create kv_cache_spec with the correct block_size for this backend
+        backend_kv_cache_spec = FullAttentionSpec(
+            block_size=block_size,
+            num_kv_heads=vllm_config.model_config.get_num_kv_heads(
+                vllm_config.parallel_config
+            ),
+            head_size=vllm_config.model_config.get_head_size(),
+            dtype=vllm_config.model_config.dtype,
+            sliding_window=vllm_config.model_config.get_sliding_window(),
+        )
+
         backend_output = run_attention_backend(
-            backend_name, kv_cache_spec, ["placeholder"], vllm_config, device,
-            common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache,
-            kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim,
-            mock_kv_b_proj)
+            backend_name,
+            backend_kv_cache_spec,
+            ["placeholder"],
+            vllm_config,
+            device,
+            common_attn_metadata,
+            query_vllm,
+            kv_c_vllm,
+            k_pe_vllm,
+            kv_cache,
+            kv_lora_rank,
+            qk_nope_head_dim,
+            qk_rope_head_dim,
+            v_head_dim,
+            mock_kv_b_proj,
+        )
+
+        # Use backend_idx to get the correct SDPA output for this backend
+        expected_output = sdpa_outputs[backend_name]
 
         # Check shape and dtype consistency
-        assert backend_output.shape == sdpa_outputs[i].shape, (
-            f"[{backend_name}] shape {backend_output.shape} != "
-            f"SDPA shape {sdpa_outputs[i].shape}")
-        assert backend_output.dtype == sdpa_outputs[i].dtype, (
-            f"[{backend_name}] dtype {backend_output.dtype} != "
-            f"SDPA dtype {sdpa_outputs[i].dtype}")
-
-        assert torch.isfinite(backend_output).all(), (
-            f"[{backend_name}] produced non-finite values")
-
-        # Check numerical similarity
-        rtol = 1e-2
-        atol = 5e-1
-
-        max_diff = torch.max(torch.abs(backend_output -
-                                       sdpa_outputs[i])).item()
-        max_rel_diff = torch.max(
-            torch.abs(backend_output - sdpa_outputs[i]) /
-            torch.abs(sdpa_outputs[i])).item()
-        all_close = torch.allclose(backend_output,
-                                   sdpa_outputs[i],
-                                   rtol=rtol,
-                                   atol=atol)
-
-        assert all_close, (
-            f"[{backend_name}] output differs from SDPA baseline. "
-            f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})")
+        try:
+            assert backend_output.shape == expected_output.shape, (
+                f"[{backend_name}] shape {backend_output.shape} != "
+                f"SDPA shape {expected_output.shape}"
+            )
+            assert backend_output.dtype == expected_output.dtype, (
+                f"[{backend_name}] dtype {backend_output.dtype} != "
+                f"SDPA dtype {expected_output.dtype}"
+            )
+
+            assert torch.isfinite(backend_output).all(), (
+                f"[{backend_name}] produced non-finite values"
+            )
+
+            # Check numerical similarity
+            rtol = 1e-2
+            atol = 5e-1
+
+            max_diff = torch.max(torch.abs(backend_output - expected_output)).item()
+            max_rel_diff = torch.max(
+                torch.abs(backend_output - expected_output) / torch.abs(expected_output)
+            ).item()
+            all_close = torch.allclose(
+                backend_output, expected_output, rtol=rtol, atol=atol
+            )
+
+            assert all_close, (
+                f"[{backend_name}] output differs from SDPA baseline. "
+                f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})"
+            )
+        except AssertionError as e:
+            failures.append(str(e))
+
+    # Report all failures at once
+    if failures:
+        # Create a summary for the single-line failure message
+        backend_names = []
+        for f in failures:
+            if "[AttentionBackendEnum." in f:
+                backend_name = f.split("[")[1].split("]")[0]
+                backend_names.append(backend_name)
+
+        summary = f"{len(failures)} backend(s) failed: {', '.join(backend_names)}"
+        detailed_msg = "\n".join(failures)
+        pytest.fail(f"{summary}\n{detailed_msg}")
diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py
index ddad9342fad0d00ce6d1b258c5f84bd27b2d0f45..b34d587eb362dac651a147b0c0ad73d9d76b1314 100644
--- a/tests/v1/attention/test_sparse_mla_backends.py
+++ b/tests/v1/attention/test_sparse_mla_backends.py
@@ -10,18 +10,21 @@ import pytest
 import torch
 
 from tests.v1.attention.test_mla_backends import (
-    BATCH_SPECS, BatchSpec, MockAttentionLayer,
-    create_and_prepopulate_kv_cache)
-from tests.v1.attention.utils import (create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      create_vllm_config)
+    BATCH_SPECS,
+    BatchSpec,
+    MockAttentionLayer,
+    create_and_prepopulate_kv_cache,
+)
+from tests.v1.attention.utils import (
+    create_common_attn_metadata,
+    create_standard_kv_cache_spec,
+    create_vllm_config,
+)
 from vllm import _custom_ops as ops
 from vllm.attention.ops import flashmla
 from vllm.model_executor.layers.linear import ColumnParallelLinear
-from vllm.utils import cdiv
-from vllm.v1.attention.backends.mla.flashmla_sparse import (
-    FlashMLASparseBackend, FlashMLASparseDecodeAndContextMetadata,
-    FlashMLASparseImpl, FlashMLASparseMetadata)
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend
 from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks
 
 SPARSE_BACKEND_BATCH_SPECS = {
@@ -35,41 +38,42 @@ SPARSE_BACKEND_BATCH_SPECS = {
     ]
 }
 
-SPARSE_BACKEND_BATCH_SPECS["large_q_prefill"] = BatchSpec(seq_lens=[1024] * 2,
-                                                          query_lens=[256] * 2)
+SPARSE_BACKEND_BATCH_SPECS["large_q_prefill"] = BatchSpec(
+    seq_lens=[1024] * 2, query_lens=[256] * 2
+)
 SPARSE_BACKEND_BATCH_SPECS["large_q_pure_prefill"] = BatchSpec(
-    seq_lens=[256] * 2, query_lens=[256] * 2)
+    seq_lens=[256] * 2, query_lens=[256] * 2
+)
 
 
 def _dequantize_fp8_ds_mla_entry(
-        cache_slice: torch.Tensor, kv_lora_rank: int, rope_dim: int,
-        dtype: torch.dtype) -> tuple[torch.Tensor, torch.Tensor]:
+    cache_slice: torch.Tensor, kv_lora_rank: int, rope_dim: int, dtype: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Dequantize a single fp8_ds_mla cache entry back to latent + rope."""
 
     # The first kv_lora_rank bytes store FP8 latent values with one scale per
     # 128 element tile written as float32 right after the latent payload.
-    scales = cache_slice.view(torch.float32)[kv_lora_rank //
-                                             4:kv_lora_rank // 4 + 4]
-    latent = torch.empty(kv_lora_rank,
-                         dtype=torch.float16,
-                         device=cache_slice.device)
+    scales = cache_slice.view(torch.float32)[kv_lora_rank // 4 : kv_lora_rank // 4 + 4]
+    latent = torch.empty(kv_lora_rank, dtype=torch.float16, device=cache_slice.device)
     for tile_idx in range(4):
         tile_start = tile_idx * 128
         tile_end = tile_start + 128
-        ops.convert_fp8(latent[tile_start:tile_end],
-                        cache_slice[tile_start:tile_end],
-                        float(scales[tile_idx].item()),
-                        kv_dtype="fp8")
+        ops.convert_fp8(
+            latent[tile_start:tile_end],
+            cache_slice[tile_start:tile_end],
+            float(scales[tile_idx].item()),
+            kv_dtype="fp8",
+        )
     latent = latent.to(dtype)
 
     rope_offset = kv_lora_rank // 2 + 8
-    rope_vals = cache_slice.view(dtype)[rope_offset:rope_offset + rope_dim]
+    rope_vals = cache_slice.view(dtype)[rope_offset : rope_offset + rope_dim]
     return latent, rope_vals.clone()
 
 
 def _quantize_dequantize_fp8_ds_mla(
-        kv_c: torch.Tensor, k_pe: torch.Tensor, block_size: int,
-        scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    kv_c: torch.Tensor, k_pe: torch.Tensor, block_size: int, scale: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Round-trip kv_c/k_pe though the fp8_ds_mla cache layout."""
 
     if kv_c.numel() == 0:
@@ -81,21 +85,14 @@ def _quantize_dequantize_fp8_ds_mla(
     num_blocks = max(1, math.ceil(num_tokens / block_size))
     entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
 
-    tmp_cache = torch.zeros(num_blocks,
-                            block_size,
-                            entry_size,
-                            dtype=torch.uint8,
-                            device=kv_c.device)
-    slot_mapping = torch.arange(num_tokens,
-                                dtype=torch.long,
-                                device=kv_c.device)
-
-    ops.concat_and_cache_mla(kv_c,
-                             k_pe,
-                             tmp_cache,
-                             slot_mapping,
-                             kv_cache_dtype="fp8_ds_mla",
-                             scale=scale)
+    tmp_cache = torch.zeros(
+        num_blocks, block_size, entry_size, dtype=torch.uint8, device=kv_c.device
+    )
+    slot_mapping = torch.arange(num_tokens, dtype=torch.long, device=kv_c.device)
+
+    ops.concat_and_cache_mla(
+        kv_c, k_pe, tmp_cache, slot_mapping, kv_cache_dtype="fp8_ds_mla", scale=scale
+    )
 
     dequant_kv_c = torch.empty_like(kv_c)
     dequant_k_pe = torch.empty_like(k_pe)
@@ -106,79 +103,20 @@ def _quantize_dequantize_fp8_ds_mla(
         block_offset = slot % block_size
         cache_slice = tmp_cache[block_idx, block_offset]
         latent, rope_vals = _dequantize_fp8_ds_mla_entry(
-            cache_slice, kv_lora_rank, rope_dim, kv_c.dtype)
+            cache_slice, kv_lora_rank, rope_dim, kv_c.dtype
+        )
         dequant_kv_c[token_idx] = latent
         dequant_k_pe[token_idx] = rope_vals
 
     return dequant_kv_c, dequant_k_pe
 
 
-def test_sparse_backend_metadata_registration():
-    backend = FlashMLASparseBackend
-
-    assert backend.get_name() == "FLASHMLA_SPARSE_VLLM_V1"
-    assert backend.get_metadata_cls() is FlashMLASparseMetadata
-    assert backend.get_impl_cls() is FlashMLASparseImpl
-
-    dtype_list = backend.get_supported_dtypes()
-    assert torch.bfloat16 in dtype_list
-
-    shape = backend.get_kv_cache_shape(num_blocks=2,
-                                       block_size=64,
-                                       num_kv_heads=1,
-                                       head_size=576)
-    assert shape == (2, 64, 576)
-
-
-def test_sparse_decode_metadata_filters_prefill_indices():
-    prefill_context_lengths = torch.tensor([4, 2], dtype=torch.int32)
-    metadata = FlashMLASparseDecodeAndContextMetadata(
-        scheduler_metadata=torch.tensor([[0]], dtype=torch.int32),
-        num_splits=torch.tensor([1, 1], dtype=torch.int32),
-        cache_lens=torch.tensor([10, 12], dtype=torch.int32),
-        prefill_context_lengths=prefill_context_lengths,
-    )
-
-    indices = torch.tensor([[0, 3, 5], [1, 2, 4]], dtype=torch.int32)
-
-    context_indices, new_token_indices = metadata.filter_prefill_indices(
-        indices)
-
-    expected_context = torch.tensor([[-1, -1, 5], [-1, -1, 4]],
-                                    dtype=torch.int32)
-    expected_new_tokens = torch.tensor([[-1, -1, 1], [-1, 0, 2]],
-                                       dtype=torch.int32)
-
-    assert torch.equal(context_indices, expected_context)
-    assert torch.equal(new_token_indices, expected_new_tokens)
-
-
-def test_sparse_impl_zero_fills_when_metadata_missing():
-    impl = FlashMLASparseImpl.__new__(FlashMLASparseImpl)
-    dummy_layer = object()
-    q = torch.zeros((2, 1, 3))
-    k_c = torch.zeros((2, 3))
-    k_pe = torch.zeros((2, 1, 1))
-    kv_cache = torch.zeros((1, 1, 1))
-    output = torch.ones((2, 4))
-
-    result = FlashMLASparseImpl.forward(impl,
-                                        dummy_layer,
-                                        q,
-                                        k_c,
-                                        k_pe,
-                                        kv_cache,
-                                        attn_metadata=None,
-                                        output=output)
-
-    assert result is output
-    assert torch.all(result == 0)
-
-
 @pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys()))
 @pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"])
-def test_sparse_backend_decode_correctness(dist_init, batch_name,
-                                           kv_cache_dtype):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_sparse_backend_decode_correctness(
+    dist_init, batch_name, kv_cache_dtype, tensor_parallel_size
+):
     if not torch.cuda.is_available():
         pytest.skip("CUDA is required for sparse MLA decode test")
 
@@ -200,17 +138,20 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
     total_cache_tokens = sum(batch_spec.seq_lens)
     block_size = 64
 
+    # Note: We use TP=1 to avoid multi-GPU requirements in CI.
+    # The test simulates head partitioning via mocked methods below.
     vllm_config = create_vllm_config(
         model_name="deepseek-ai/DeepSeek-V2-Lite-Chat",
+        tensor_parallel_size=1,
         max_model_len=max_seqlen,
-        num_gpu_blocks=max(2048,
-                           cdiv(total_cache_tokens, block_size) + 1),
-        block_size=block_size)
+        num_gpu_blocks=max(2048, cdiv(total_cache_tokens, block_size) + 1),
+        block_size=block_size,
+        hf_config_override={
+            "index_topk": topk_tokens,
+            "attn_module_list_cfg": [{"topk_tokens": topk_tokens}],
+        },
+    )
     model_config = vllm_config.model_config
-    model_config.hf_config = SimpleNamespace(
-        attn_module_list_cfg=[{
-            "topk_tokens": topk_tokens
-        }])
     model_config.hf_text_config = SimpleNamespace(
         q_lora_rank=None,
         kv_lora_rank=kv_lora_rank,
@@ -221,13 +162,14 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
     )
     model_config.dtype = dtype
     model_config.get_num_attention_heads = MethodType(
-        lambda self, parallel_config: num_heads, model_config)
-    model_config.get_num_kv_heads = MethodType(lambda self, parallel_config: 1,
-                                               model_config)
-    model_config.get_head_size = MethodType(lambda self: head_size,
-                                            model_config)
-    model_config.get_sliding_window = MethodType(lambda self: None,
-                                                 model_config)
+        lambda self, parallel_config: max(1, num_heads // tensor_parallel_size),
+        model_config,
+    )
+    model_config.get_num_kv_heads = MethodType(
+        lambda self, parallel_config: 1, model_config
+    )
+    model_config.get_head_size = MethodType(lambda self: head_size, model_config)
+    model_config.get_sliding_window = MethodType(lambda self: None, model_config)
 
     kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
 
@@ -236,16 +178,10 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
     scale = 1.0 / math.sqrt(head_size)
 
     # Shared MLA projection weights to keep reference and backend in sync
-    W_UK = torch.randn(kv_lora_rank,
-                       num_heads,
-                       qk_nope_head_dim,
-                       dtype=dtype,
-                       device=device)
-    W_UV = torch.randn(kv_lora_rank,
-                       num_heads,
-                       v_head_dim,
-                       dtype=dtype,
-                       device=device)
+    W_UK = torch.randn(
+        kv_lora_rank, num_heads, qk_nope_head_dim, dtype=dtype, device=device
+    )
+    W_UV = torch.randn(kv_lora_rank, num_heads, v_head_dim, dtype=dtype, device=device)
 
     # Build synthetic decode-only workload
     seq_lens = batch_spec.seq_lens
@@ -262,17 +198,15 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
         q_len = query_lens[i]
         ctx_len = s_len - q_len
 
-        q_c = torch.rand(q_len,
-                         num_heads,
-                         qk_nope_head_dim + qk_rope_head_dim,
-                         dtype=dtype,
-                         device=device)
+        q_c = torch.rand(
+            q_len,
+            num_heads,
+            qk_nope_head_dim + qk_rope_head_dim,
+            dtype=dtype,
+            device=device,
+        )
         kv_c_full = torch.rand(s_len, kv_lora_rank, dtype=dtype, device=device)
-        k_pe_full = torch.rand(s_len,
-                               1,
-                               qk_rope_head_dim,
-                               dtype=dtype,
-                               device=device)
+        k_pe_full = torch.rand(s_len, 1, qk_rope_head_dim, dtype=dtype, device=device)
 
         kv_c_full, k_pe_full = _quantize_dequantize_fp8_ds_mla(
             kv_c_full,
@@ -298,7 +232,8 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
         v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2)
 
         sdpa_out = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale)
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale
+        )
         sdpa_out = sdpa_out.transpose(1, 2).squeeze(0)
 
         sdpa_out = torch.einsum("qnl,lnv->qnv", sdpa_out, W_UV)
@@ -307,8 +242,8 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
         all_q_vllm.append(q_c)
         all_kv_c_vllm.append(kv_c_full[ctx_len:])
         all_k_pe_vllm.append(k_pe_full[ctx_len:])
-        kv_c_contexts.append(kv_c_full[:ctx_len + 1])
-        k_pe_contexts.append(k_pe_full[:ctx_len + 1])
+        kv_c_contexts.append(kv_c_full[: ctx_len + 1])
+        k_pe_contexts.append(k_pe_full[: ctx_len + 1])
 
     query_vllm = torch.cat(all_q_vllm, dim=0)
     kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0)
@@ -316,12 +251,14 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
     sdpa_reference = torch.cat(reference_outputs, dim=0)
 
     vllm_config.cache_config.cache_dtype = kv_cache_dtype
+    vllm_config.model_config.hf_config.index_topk = topk_tokens
 
     common_attn_metadata = create_common_attn_metadata(
         batch_spec,
         vllm_config.cache_config.block_size,
         device,
-        arange_block_indices=True)
+        arange_block_indices=True,
+    )
 
     kv_cache = create_and_prepopulate_kv_cache(
         kv_c_contexts=kv_c_contexts,
@@ -339,92 +276,94 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
 
     builder_cls = FlashMLASparseBackend.get_builder_cls()
     builder = builder_cls(kv_cache_spec, ["placeholder"], vllm_config, device)
-    metadata = builder.build(common_prefix_len=0,
-                             common_attn_metadata=common_attn_metadata)
+    metadata = builder.build(
+        common_prefix_len=0, common_attn_metadata=common_attn_metadata
+    )
 
-    starts = np.asarray(common_attn_metadata.query_start_loc_cpu,
-                        dtype=np.int32)
+    starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32)
     seg_lengths = np.diff(starts)
     positions = np.arange(starts[-1], dtype=np.int32) - np.repeat(
-        starts[:-1], seg_lengths)
+        starts[:-1], seg_lengths
+    )
     seq_lengths = np.asarray(common_attn_metadata.seq_lens_cpu, dtype=np.int32)
     prefix_lengths = seq_lengths - seg_lengths
     positions += np.repeat(prefix_lengths, seg_lengths)
 
     pos_gpu = torch.as_tensor(positions, device=device, dtype=torch.int32)
     topk = metadata.topk_tokens
-    debug_indices = torch.arange(topk, device=device,
-                                 dtype=torch.int32).unsqueeze(0)
+    debug_indices = torch.arange(topk, device=device, dtype=torch.int32).unsqueeze(0)
     token_positions = pos_gpu.unsqueeze(1)
-    causal_mask = (debug_indices <= token_positions)
-    debug_indices = torch.where(causal_mask, debug_indices,
-                                torch.full_like(debug_indices, -1))
+    causal_mask = debug_indices <= token_positions
+    debug_indices = torch.where(
+        causal_mask, debug_indices, torch.full_like(debug_indices, -1)
+    )
 
     # FlashMLASparseImpl now reads top-k indices from the indexer-provided
     # buffer, so emulate that contract with a simple namespace mock.
-    debug_indices = debug_indices.expand(metadata.num_actual_tokens,
-                                         -1).clone()
+    debug_indices = debug_indices.expand(metadata.num_actual_tokens, -1).clone()
     mock_indexer = SimpleNamespace(topk_indices_buffer=debug_indices)
 
-    ok, reason = flashmla.is_flashmla_supported()
+    ok, reason = flashmla.is_flashmla_sparse_supported()
     if not ok:
         pytest.skip(reason)
 
     kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1)
     kv_b_proj_weight = kv_b_proj_weight.view(
-        kv_lora_rank, num_heads * (qk_nope_head_dim + v_head_dim))
+        kv_lora_rank, num_heads * (qk_nope_head_dim + v_head_dim)
+    )
 
-    mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank,
-                                          output_size=num_heads *
-                                          (qk_nope_head_dim + v_head_dim),
-                                          bias=False).to(device=device,
-                                                         dtype=dtype)
+    mock_kv_b_proj = ColumnParallelLinear(
+        input_size=kv_lora_rank,
+        output_size=num_heads * (qk_nope_head_dim + v_head_dim),
+        bias=False,
+    ).to(device=device, dtype=dtype)
     mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T.contiguous())
 
     impl_cls = FlashMLASparseBackend.get_impl_cls()
-    impl = impl_cls(num_heads=num_heads,
-                    head_size=head_size,
-                    scale=scale,
-                    num_kv_heads=1,
-                    alibi_slopes=None,
-                    sliding_window=None,
-                    kv_cache_dtype=vllm_config.cache_config.cache_dtype,
-                    logits_soft_cap=None,
-                    attn_type="decoder",
-                    kv_sharing_target_layer_name=None,
-                    q_lora_rank=None,
-                    kv_lora_rank=kv_lora_rank,
-                    qk_nope_head_dim=qk_nope_head_dim,
-                    qk_rope_head_dim=qk_rope_head_dim,
-                    qk_head_dim=qk_nope_head_dim + qk_rope_head_dim,
-                    v_head_dim=v_head_dim,
-                    kv_b_proj=mock_kv_b_proj,
-                    indexer=mock_indexer)
+    impl = impl_cls(
+        num_heads=num_heads,
+        head_size=head_size,
+        scale=scale,
+        num_kv_heads=1,
+        alibi_slopes=None,
+        sliding_window=None,
+        kv_cache_dtype=vllm_config.cache_config.cache_dtype,
+        logits_soft_cap=None,
+        attn_type="decoder",
+        kv_sharing_target_layer_name=None,
+        q_lora_rank=None,
+        kv_lora_rank=kv_lora_rank,
+        qk_nope_head_dim=qk_nope_head_dim,
+        qk_rope_head_dim=qk_rope_head_dim,
+        qk_head_dim=qk_nope_head_dim + qk_rope_head_dim,
+        v_head_dim=v_head_dim,
+        kv_b_proj=mock_kv_b_proj,
+        indexer=mock_indexer,
+    )
 
     impl.process_weights_after_loading(dtype)
 
     layer = MockAttentionLayer(device)
-    out_buffer = torch.empty(metadata.num_actual_tokens,
-                             num_heads * v_head_dim,
-                             dtype=dtype,
-                             device=device)
-
-    backend_output = impl.forward(layer,
-                                  query_vllm,
-                                  kv_c_vllm,
-                                  k_pe_vllm,
-                                  kv_cache,
-                                  metadata,
-                                  output=out_buffer)
+    out_buffer = torch.empty(
+        metadata.num_actual_tokens, num_heads * v_head_dim, dtype=dtype, device=device
+    )
+
+    with torch.inference_mode():
+        backend_output = impl.forward(
+            layer,
+            query_vllm,
+            kv_c_vllm,
+            k_pe_vllm,
+            kv_cache,
+            metadata,
+            output=out_buffer,
+        )
 
     assert backend_output.shape == sdpa_reference.shape
     assert backend_output.dtype == sdpa_reference.dtype
     assert torch.isfinite(backend_output).all()
 
-    torch.testing.assert_close(backend_output,
-                               sdpa_reference,
-                               rtol=0.5,
-                               atol=0.5)
+    torch.testing.assert_close(backend_output, sdpa_reference, rtol=0.5, atol=0.5)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 8873a7761e95cff60b3def09cb5c49684cd44a65..dea89babd4b478f4750481097c59e8f44fee7389 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -3,23 +3,34 @@
 """Utility functions for attention-related v1 tests."""
 
 from dataclasses import dataclass
-from typing import Union
 
 import pytest
 import torch
 
-from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
-                         LoadConfig, ModelConfig, ModelDType, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
-from vllm.platforms import _Backend, current_platform
-from vllm.utils import resolve_obj_by_qualname
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.attention.backends.abstract import AttentionImpl
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
+from vllm.config.model import ModelDType
+from vllm.v1.attention.backends.utils import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 
 @dataclass
 class BatchSpec:
     """Specification for a batch configuration (workload shape only)."""
+
     seq_lens: list[int]
     query_lens: list[int]
 
@@ -37,26 +48,25 @@ class BatchSpec:
 
 
 def create_common_attn_metadata(
-        batch_spec: BatchSpec,
-        block_size: int,
-        device: torch.device,
-        max_block_idx: int = 1000,
-        arange_block_indices: bool = False) -> CommonAttentionMetadata:
+    batch_spec: BatchSpec,
+    block_size: int,
+    device: torch.device,
+    max_block_idx: int = 1000,
+    arange_block_indices: bool = False,
+) -> CommonAttentionMetadata:
     """Create CommonAttentionMetadata from a BatchSpec and ModelParams."""
     # Create query start locations
-    query_start_loc = torch.zeros(batch_spec.batch_size + 1,
-                                  dtype=torch.int32,
-                                  device=device)
-    query_start_loc[1:] = torch.tensor(batch_spec.query_lens,
-                                       dtype=torch.int32,
-                                       device=device).cumsum(0)
+    query_start_loc = torch.zeros(
+        batch_spec.batch_size + 1, dtype=torch.int32, device=device
+    )
+    query_start_loc[1:] = torch.tensor(
+        batch_spec.query_lens, dtype=torch.int32, device=device
+    ).cumsum(0)
     query_start_loc_cpu = query_start_loc.cpu()
     num_tokens = batch_spec.compute_num_tokens()
 
     # Create sequence lengths
-    seq_lens = torch.tensor(batch_spec.seq_lens,
-                            dtype=torch.int32,
-                            device=device)
+    seq_lens = torch.tensor(batch_spec.seq_lens, dtype=torch.int32, device=device)
     seq_lens_cpu = seq_lens.cpu()
     max_seq_len = int(seq_lens_cpu.max())
 
@@ -71,24 +81,23 @@ def create_common_attn_metadata(
     max_blocks = (max(batch_spec.seq_lens) + block_size - 1) // block_size
     if arange_block_indices:
         num_blocks = batch_spec.batch_size * max_blocks
-        block_table_tensor = torch.arange(num_blocks,
-                                          dtype=torch.int32,
-                                          device=device).view(
-                                              batch_spec.batch_size,
-                                              max_blocks)
-        slot_mapping = torch.arange(num_tokens,
-                                    dtype=torch.int64,
-                                    device=device).view(num_tokens)
+        block_table_tensor = torch.arange(
+            num_blocks, dtype=torch.int32, device=device
+        ).view(batch_spec.batch_size, max_blocks)
+        slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device).view(
+            num_tokens
+        )
     else:
-        block_table_tensor = torch.randint(0,
-                                           max_block_idx,
-                                           (batch_spec.batch_size, max_blocks),
-                                           dtype=torch.int32,
-                                           device=device)
-        slot_mapping = torch.randint(0,
-                                     max_block_idx, (num_tokens, ),
-                                     dtype=torch.int64,
-                                     device=device)
+        block_table_tensor = torch.randint(
+            0,
+            max_block_idx,
+            (batch_spec.batch_size, max_blocks),
+            dtype=torch.int32,
+            device=device,
+        )
+        slot_mapping = torch.randint(
+            0, max_block_idx, (num_tokens,), dtype=torch.int64, device=device
+        )
 
     # Calculate max query length
     max_query_len = max(batch_spec.query_lens)
@@ -109,79 +118,44 @@ def create_common_attn_metadata(
     )
 
 
-def get_attention_backend(backend_name: _Backend):
-    """Set up attention backend classes for testing.
-
-    Args:
-        backend_name: Name of the backend ("flash_attn", "flashinfer", etc.)
-        vllm_config: VllmConfig instance
-
-    Returns:
-        Tuple of (backend_builder_class, backend_impl_class)
-    """
-    backend_map = {
-        _Backend.FLASH_ATTN:
-        ("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
-         if current_platform.is_cuda() else
-         "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
-         ),
-        _Backend.FLASHINFER:
-        "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
-        _Backend.FLEX_ATTENTION:
-        "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
-        _Backend.TRITON_ATTN:
-        "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
-        _Backend.TREE_ATTN:
-        "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
-        _Backend.XFORMERS:
-        "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
-        _Backend.CUTLASS_MLA:
-        "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
-        _Backend.FLASHMLA:
-        "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
-        _Backend.FLASH_ATTN_MLA:
-        "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
-        _Backend.FLASHINFER_MLA:
-        "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
-        _Backend.TRITON_MLA:
-        "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
-    }
-
-    if backend_name not in backend_map:
-        raise ValueError(f"Unknown backend: {backend_name}")
-
-    backend_class_name = backend_map[backend_name]
-
+def try_get_attention_backend(
+    backend: AttentionBackendEnum,
+) -> tuple[type[AttentionMetadataBuilder], type[AttentionImpl]]:
+    """Try to get the attention backend class, skipping test if not found."""
     try:
-        backend_class = resolve_obj_by_qualname(backend_class_name)
+        backend_class = backend.get_class()
         return backend_class.get_builder_cls(), backend_class.get_impl_cls()
     except ImportError as e:
-        pytest.skip(f"{backend_name} not available: {e}")
+        pytest.skip(f"{backend.name} not available: {e}")
+        raise AssertionError("unreachable") from None
 
 
-def create_standard_kv_cache_spec(
-        vllm_config: VllmConfig) -> FullAttentionSpec:
+def create_standard_kv_cache_spec(vllm_config: VllmConfig) -> FullAttentionSpec:
     """Create a FullAttentionSpec from ModelParams only."""
     return FullAttentionSpec(
         block_size=vllm_config.cache_config.block_size,
         num_kv_heads=vllm_config.model_config.get_num_kv_heads(
-            vllm_config.parallel_config),
+            vllm_config.parallel_config
+        ),
         head_size=vllm_config.model_config.get_head_size(),
         dtype=vllm_config.model_config.dtype,
         sliding_window=vllm_config.model_config.get_sliding_window(),
     )
 
 
-def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
-                       tensor_parallel_size: int = 1,
-                       max_model_len: int = 1024,
-                       dtype: Union[ModelDType, torch.dtype] = "auto",
-                       num_gpu_blocks: int = 1000,
-                       block_size: int = 16,
-                       max_num_seqs: int = 256,
-                       max_num_batched_tokens: int = 8192,
-                       enable_chunked_prefill: bool = True,
-                       add_mock_model_methods: bool = True) -> VllmConfig:
+def create_vllm_config(
+    model_name: str = "meta-llama/Meta-Llama-3-8B",
+    tensor_parallel_size: int = 1,
+    max_model_len: int = 1024,
+    dtype: ModelDType | torch.dtype = "auto",
+    num_gpu_blocks: int = 1000,
+    block_size: int = 16,
+    max_num_seqs: int = 256,
+    max_num_batched_tokens: int = 8192,
+    enable_chunked_prefill: bool = True,
+    add_mock_model_methods: bool = True,
+    hf_config_override: dict | None = None,
+) -> VllmConfig:
     """Create a VllmConfig for testing with reasonable defaults."""
 
     model_config = ModelConfig(
@@ -204,7 +178,8 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
     cache_config.num_cpu_blocks = 0
 
     parallel_config = ParallelConfig(
-        tensor_parallel_size=tensor_parallel_size, )
+        tensor_parallel_size=tensor_parallel_size,
+    )
 
     scheduler_config = SchedulerConfig(
         max_num_seqs=max_num_seqs,
@@ -222,15 +197,20 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
         # but some backends expect to query the model for layer-specific
         # parameters
         import types
-        model_config.get_num_layers = types.MethodType(lambda self: 1,
-                                                       model_config)
+
+        model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
         model_config.get_sliding_window_for_layer = types.MethodType(
-            lambda self, i: None, model_config)
+            lambda self, i: None, model_config
+        )
         model_config.get_logits_soft_cap_for_layer = types.MethodType(
-            lambda self, i: 0.0, model_config)
+            lambda self, i: 0.0, model_config
+        )
         model_config.get_sm_scale_for_layer = types.MethodType(
-            lambda self, i: 1.0 / model_config.get_head_size()**0.5,
-            model_config)
+            lambda self, i: 1.0 / model_config.get_head_size() ** 0.5, model_config
+        )
+
+    if hf_config_override:
+        model_config.hf_config.update(hf_config_override)
 
     return VllmConfig(
         model_config=model_config,
@@ -243,12 +223,14 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
     )
 
 
-def create_dummy_kv_cache(block_size: int,
-                          num_kv_heads: int,
-                          head_size: int,
-                          dtype: torch.dtype,
-                          device: torch.device,
-                          num_blocks: int = 100) -> torch.Tensor:
+def create_dummy_kv_cache(
+    block_size: int,
+    num_kv_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    num_blocks: int = 100,
+) -> torch.Tensor:
     """Create a dummy KV cache tensor for testing."""
     kv_cache = torch.randn(
         num_blocks,
@@ -257,5 +239,105 @@ def create_dummy_kv_cache(block_size: int,
         num_kv_heads,
         head_size,
         dtype=dtype,
-        device=device)
+        device=device,
+    )
     return kv_cache
+
+
+@dataclass
+class BackendConfig:
+    name: str
+    env_vars: dict
+    comp_config: dict  # compilation config
+    specific_gpu_arch: tuple | None = None
+
+
+# Define all backend configurations of full cudagraph to be tested
+full_cg_backend_configs = {
+    # FA3 on Hopper
+    "FA3": BackendConfig(
+        name="FA3",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+            "VLLM_FLASH_ATTN_VERSION": "3",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL",
+        },
+        specific_gpu_arch=(9, 0),
+    ),
+    # FlashMLA on Hopper
+    "FlashMLA": BackendConfig(
+        name="FlashMLA",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+        specific_gpu_arch=(9, 0),
+    ),
+    # Cutlass MLA on Blackwell
+    "CutlassMLA": BackendConfig(
+        name="CutlassMLA",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+        specific_gpu_arch=(10, 0),
+    ),
+    # FlashInfer MLA on Blackwell
+    "FlashInferMLA": BackendConfig(
+        name="FlashInferMLA",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+        specific_gpu_arch=(10, 0),
+    ),
+    # FlashAttention MLA on Hopper
+    "FlashAttentionMLA": BackendConfig(
+        name="FlashAttentionMLA",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+        },
+        specific_gpu_arch=(9, 0),
+    ),
+    # FA2
+    "FA2": BackendConfig(
+        name="FA2",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+            "VLLM_FLASH_ATTN_VERSION": "2",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+    ),
+    # Triton Attention
+    "TritonAttn": BackendConfig(
+        name="TritonAttn",
+        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+    ),
+    # FlashInfer
+    "FlashInfer": BackendConfig(
+        name="FlashInfer",
+        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+    ),
+}
diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py
index c153e38fe3df36966c0c9b82cb57a848d3f27176..1d80ee98759135d2e3bc40abcc1c6ed762297dfc 100644
--- a/tests/v1/core/test_async_scheduler.py
+++ b/tests/v1/core/test_async_scheduler.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import deque
 
+import numpy as np
 import pytest
 
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -11,17 +12,17 @@ from vllm.v1.utils import ConstantList
 
 from .utils import create_requests, create_scheduler
 
+pytestmark = pytest.mark.cpu_test
+
 
 def _make_model_runner_output(
-    scheduler_output: SchedulerOutput, ) -> ModelRunnerOutput:
+    scheduler_output: SchedulerOutput,
+) -> ModelRunnerOutput:
     req_ids = list(scheduler_output.num_scheduled_tokens.keys())
     return ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index={
-            req_id: i
-            for i, req_id in enumerate(req_ids)
-        },
-        sampled_token_ids=[[i] for i in range(len(req_ids))],
+        req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)},
+        sampled_token_ids=[np.array([i]) for i in range(len(req_ids))],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -34,15 +35,20 @@ def test_stop_by_max_tokens(max_tokens: int):
     requests = create_requests(num_requests=2, max_tokens=max_tokens)
     req0, req1 = requests
 
+    expected_total_num_scheduled_tokens = 0
     sched_outputs: deque[SchedulerOutput] = deque()
     scheduler.add_request(req0)
     sched_outputs.append(scheduler.schedule())
+    expected_total_num_scheduled_tokens += req0.num_prompt_tokens + max_tokens - 1
 
     scheduler.add_request(req1)
     sched_outputs.append(scheduler.schedule())
+    expected_total_num_scheduled_tokens += req1.num_prompt_tokens + max_tokens - 1
 
+    total_num_scheduled_tokens = 0
     while sched_outputs:
         sched_output = sched_outputs.popleft()
+        total_num_scheduled_tokens += sched_output.total_num_scheduled_tokens
         model_runner_output = _make_model_runner_output(sched_output)
         scheduler.update_from_output(sched_output, model_runner_output)
 
@@ -53,6 +59,8 @@ def test_stop_by_max_tokens(max_tokens: int):
     assert scheduler.get_num_unfinished_requests() == 0
     assert req0.num_output_tokens == max_tokens
     assert req1.num_output_tokens == max_tokens
+    # Ensure we aren't scheduling more tokens than necessary.
+    assert total_num_scheduled_tokens == expected_total_num_scheduled_tokens
 
 
 def test_abort():
@@ -73,8 +81,7 @@ def test_abort():
         if not abort_order:
             return
         req = requests[abort_order.pop(0)]
-        scheduler.finish_requests(req.request_id,
-                                  RequestStatus.FINISHED_ABORTED)
+        scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
 
     while sched_outputs:
         # Abort a scheduled request.
@@ -110,8 +117,7 @@ def test_preempt():
         if not abort_order:
             return
         req = requests[abort_order.pop(0)]
-        scheduler.finish_requests(req.request_id,
-                                  RequestStatus.FINISHED_ABORTED)
+        scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
 
     while sched_outputs:
         # Abort a scheduled request.
@@ -133,15 +139,19 @@ def test_prefix_caching_for_prefill_dedup():
     CHUNK_SIZE = 1000
     BLOCK_SIZE = 16
     num_prompt_tokens = 100
-    scheduler = create_scheduler(async_scheduling=True,
-                                 max_num_batched_tokens=CHUNK_SIZE,
-                                 enable_prefix_caching=True,
-                                 block_size=BLOCK_SIZE)
-    requests = create_requests(num_requests=5,
-                               num_tokens=num_prompt_tokens,
-                               max_tokens=3,
-                               same_prompt=True,
-                               block_size=BLOCK_SIZE)
+    scheduler = create_scheduler(
+        async_scheduling=True,
+        max_num_batched_tokens=CHUNK_SIZE,
+        enable_prefix_caching=True,
+        block_size=BLOCK_SIZE,
+    )
+    requests = create_requests(
+        num_requests=5,
+        num_tokens=num_prompt_tokens,
+        max_tokens=3,
+        same_prompt=True,
+        block_size=BLOCK_SIZE,
+    )
     requests_copy = requests.copy()
 
     # Two requests with the same prompt.
@@ -183,14 +193,18 @@ def test_prefix_caching_for_multi_turn():
     BLOCK_SIZE = 16
     num_prompt_tokens = 100
     num_output_tokens = 200
-    scheduler = create_scheduler(async_scheduling=True,
-                                 max_num_batched_tokens=CHUNK_SIZE,
-                                 enable_prefix_caching=True,
-                                 block_size=BLOCK_SIZE)
-    requests = create_requests(num_requests=5,
-                               num_tokens=num_prompt_tokens,
-                               max_tokens=num_output_tokens,
-                               block_size=BLOCK_SIZE)
+    scheduler = create_scheduler(
+        async_scheduling=True,
+        max_num_batched_tokens=CHUNK_SIZE,
+        enable_prefix_caching=True,
+        block_size=BLOCK_SIZE,
+    )
+    requests = create_requests(
+        num_requests=5,
+        num_tokens=num_prompt_tokens,
+        max_tokens=num_output_tokens,
+        block_size=BLOCK_SIZE,
+    )
 
     for req in requests:
         scheduler.add_request(req)
@@ -210,14 +224,16 @@ def test_prefix_caching_for_multi_turn():
 
     # Create next-turn requests whose prompts are the full output of the
     # previous turn.
-    next_turn_requests = create_requests(num_requests=5,
-                                         num_tokens=num_prompt_tokens +
-                                         num_output_tokens,
-                                         max_tokens=num_output_tokens,
-                                         block_size=BLOCK_SIZE)
+    next_turn_requests = create_requests(
+        num_requests=5,
+        num_tokens=num_prompt_tokens + num_output_tokens,
+        max_tokens=num_output_tokens,
+        block_size=BLOCK_SIZE,
+    )
     for i, req in enumerate(next_turn_requests):
-        req.prompt_token_ids = (requests[i].prompt_token_ids +
-                                list(requests[i].output_token_ids))
+        req.prompt_token_ids = requests[i].prompt_token_ids + list(
+            requests[i].output_token_ids
+        )
         req._all_token_ids = req.prompt_token_ids.copy()
         req.all_token_ids = ConstantList(req._all_token_ids)
         req.block_hashes = []
@@ -231,5 +247,4 @@ def test_prefix_caching_for_multi_turn():
     # Make sure the next-turn requests get prefix cache hit by the previous
     # requests.
     for req in next_turn_requests:
-        assert (req.num_cached_tokens == req.num_prompt_tokens // BLOCK_SIZE *
-                BLOCK_SIZE)
+        assert req.num_cached_tokens == req.num_prompt_tokens // BLOCK_SIZE * BLOCK_SIZE
diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py
index 4e3cace86be6ab7b01a9e43a641c7064cf1fc59f..8a52b5bd78977c473dad05edee9923d1ba324e31 100644
--- a/tests/v1/core/test_encoder_cache_manager.py
+++ b/tests/v1/core/test_encoder_cache_manager.py
@@ -1,13 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
 
 from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
 from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
 
+pytestmark = pytest.mark.cpu_test
+
 
 # ------------------ Mock Classes ------------------ #
 class MockRequest:
-
     def __init__(self, request_id, mm_hashes, token_counts):
         self.request_id = request_id
         self._token_counts = token_counts
@@ -17,8 +19,7 @@ class MockRequest:
                 data=None,
                 modality="image",
                 identifier=mm_hash,
-                mm_position=PlaceholderRange(offset=0,
-                                             length=self._token_counts[i]),
+                mm_position=PlaceholderRange(offset=0, length=self._token_counts[i]),
             )
             self.mm_features.append(feature)
 
@@ -164,8 +165,7 @@ def test_schedule_request_multi_images_respect_space_limit():
     num_tokens_to_schedule += req.get_num_encoder_tokens(0)
     compute_budget -= req.get_num_encoder_tokens(0)
 
-    assert not manager.can_allocate(req, 1, compute_budget,
-                                    num_tokens_to_schedule)
+    assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule)
 
 
 def test_schedule_request_multi_images_respect_compute_limit():
@@ -177,5 +177,4 @@ def test_schedule_request_multi_images_respect_compute_limit():
     num_tokens_to_schedule += req.get_num_encoder_tokens(0)
     compute_budget -= req.get_num_encoder_tokens(0)
 
-    assert not manager.can_allocate(req, 1, compute_budget,
-                                    num_tokens_to_schedule)
+    assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule)
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 452b16ef4a910a0d466c4b25e96fc6be735bf50c..24611a4aaa1b85df9dc59b44eb5a19af6106dbdf 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,43 +1,61 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
-from typing import Callable, Optional
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 import torch
 
 import vllm.v1.core.kv_cache_utils as kv_cache_utils
 from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
-from vllm.multimodal.inputs import (MultiModalFeatureSpec,
-                                    MultiModalKwargsItem, PlaceholderRange)
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import GiB_bytes, sha256, sha256_cbor
+from vllm.utils.hashing import sha256, sha256_cbor
+from vllm.utils.mem_constants import GiB_bytes
 from vllm.v1.core.kv_cache_manager import KVCacheManager
-# disable yapf here as it formats differently than isort such that both fail
-# yapf: disable
 from vllm.v1.core.kv_cache_utils import (
-    BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
-    estimate_max_model_len, generate_block_hash_extra_keys,
-    generate_scheduler_kv_cache_config, get_kv_cache_configs,
-    get_max_concurrency_for_kv_cache_config, get_request_block_hasher,
-    hash_block_tokens, init_none_hash, is_kv_cache_spec_uniform,
-    make_block_hash_with_group_id)
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheSpec,
-                                        KVCacheTensor, MLAAttentionSpec,
-                                        SlidingWindowSpec,
-                                        UniformTypeKVCacheSpecs)
-from vllm.v1.metrics.stats import PrefixCacheStats
+    BlockHash,
+    FreeKVCacheBlockQueue,
+    KVCacheBlock,
+    estimate_max_model_len,
+    generate_block_hash_extra_keys,
+    generate_scheduler_kv_cache_config,
+    get_kv_cache_configs,
+    get_max_concurrency_for_kv_cache_config,
+    get_request_block_hasher,
+    hash_block_tokens,
+    init_none_hash,
+    is_kv_cache_spec_uniform,
+    make_block_hash_with_group_id,
+    tensor_data,
+)
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+    KVCacheSpec,
+    KVCacheTensor,
+    MLAAttentionSpec,
+    SlidingWindowSpec,
+    UniformTypeKVCacheSpecs,
+)
+from vllm.v1.metrics.stats import CachingMetrics, PrefixCacheStats
 from vllm.v1.request import Request
 
-# yapf: enable
+pytestmark = pytest.mark.cpu_test
 
 
 @pytest.fixture(autouse=True)
 def _auto_init_hash_fn(request):
     hash_fn: Callable
     if "hash_fn" in request.fixturenames:
-        hash_fn = init_none_hash(request.getfixturevalue("hash_fn"))
+        hash_fn = request.getfixturevalue("hash_fn")
     else:
         hash_fn = sha256
     init_none_hash(hash_fn)
@@ -45,12 +63,13 @@ def _auto_init_hash_fn(request):
 
 def make_request(
     request_id: str,
-    prompt_token_ids: list[int],
+    prompt_token_ids: list[int] | None,
     block_size: int = 3,
     hash_fn: Callable = hash,
-    mm_positions: Optional[list[PlaceholderRange]] = None,
-    mm_hashes: Optional[list[str]] = None,
-    cache_salt: Optional[str] = None,
+    mm_positions: list[PlaceholderRange] | None = None,
+    mm_hashes: list[str] | None = None,
+    cache_salt: str | None = None,
+    prompt_embeds: torch.Tensor | None = None,
 ):
     mm_features = []
     if mm_positions is not None:
@@ -60,42 +79,50 @@ def make_request(
                 data=MultiModalKwargsItem.dummy("dummy_m"),
                 mm_position=position,
                 identifier=identifier,
-                modality="image")
+                modality="image",
+            )
             mm_features.append(mm_feature)
 
-    return Request(request_id=request_id,
-                   prompt_token_ids=prompt_token_ids,
-                   mm_features=mm_features if mm_features else None,
-                   sampling_params=SamplingParams(max_tokens=17),
-                   pooling_params=None,
-                   eos_token_id=100,
-                   lora_request=None,
-                   cache_salt=cache_salt,
-                   block_hasher=get_request_block_hasher(block_size, hash_fn))
-
-
-def new_kv_cache_spec(block_size=16,
-                      num_kv_heads=2,
-                      head_size=64,
-                      dtype=torch.float32,
-                      sliding_window=None):
-    return FullAttentionSpec(block_size=block_size,
-                             num_kv_heads=num_kv_heads,
-                             head_size=head_size,
-                             dtype=dtype,
-                             sliding_window=sliding_window)
-
-
-def new_sliding_window_spec(block_size=16,
-                            num_kv_heads=2,
-                            head_size=64,
-                            dtype=torch.float32,
-                            sliding_window=1):
-    return SlidingWindowSpec(block_size=block_size,
-                             num_kv_heads=num_kv_heads,
-                             head_size=head_size,
-                             dtype=dtype,
-                             sliding_window=sliding_window)
+    return Request(
+        request_id=request_id,
+        prompt_token_ids=prompt_token_ids,
+        mm_features=mm_features if mm_features else None,
+        sampling_params=SamplingParams(max_tokens=17),
+        pooling_params=None,
+        eos_token_id=100,
+        lora_request=None,
+        cache_salt=cache_salt,
+        block_hasher=get_request_block_hasher(block_size, hash_fn),
+        prompt_embeds=prompt_embeds,
+    )
+
+
+def new_kv_cache_spec(
+    block_size=16,
+    num_kv_heads=2,
+    head_size=64,
+    dtype=torch.float32,
+    sliding_window=None,
+):
+    return FullAttentionSpec(
+        block_size=block_size,
+        num_kv_heads=num_kv_heads,
+        head_size=head_size,
+        dtype=dtype,
+        sliding_window=sliding_window,
+    )
+
+
+def new_sliding_window_spec(
+    block_size=16, num_kv_heads=2, head_size=64, dtype=torch.float32, sliding_window=1
+):
+    return SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=num_kv_heads,
+        head_size=head_size,
+        dtype=dtype,
+        sliding_window=sliding_window,
+    )
 
 
 @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
@@ -104,7 +131,7 @@ def test_none_hash(monkeypatch, hash_fn):
 
     # case 1: PYTHONHASHSEED is not set, use random
     with monkeypatch.context() as m:
-        m.delenv('PYTHONHASHSEED', raising=False)
+        m.delenv("PYTHONHASHSEED", raising=False)
         reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils)
         reloaded_kv_cache_utils.init_none_hash(hash_fn)
         assert reloaded_kv_cache_utils.NONE_HASH is not None
@@ -113,16 +140,15 @@ def test_none_hash(monkeypatch, hash_fn):
 
     # case 2: PYTHONHASHSEED is set, use the seed and hash_fn
     with monkeypatch.context() as m:
-        m.setenv('PYTHONHASHSEED', 'python hash seed')
+        m.setenv("PYTHONHASHSEED", "python hash seed")
         reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils)
         reloaded_kv_cache_utils.init_none_hash(hash_fn)
         assert reloaded_kv_cache_utils.NONE_HASH is not None
         assert isinstance(reloaded_kv_cache_utils.NONE_HASH, bytes)
-        assert hash_fn('python hash seed') == reloaded_kv_cache_utils.NONE_HASH
+        assert hash_fn("python hash seed") == reloaded_kv_cache_utils.NONE_HASH
 
 
 def test_kv_cache_block():
-
     # Test KVCacheBlock initialization
     block = KVCacheBlock(block_id=0)
     assert block.block_id == 0
@@ -190,10 +216,8 @@ def test_free_kv_cache_block_queue_operations():
     for _ in range(4):
         queue.popleft()
     assert queue.num_free_blocks == 0
-    assert (queue.fake_free_list_head.next_free_block
-            is queue.fake_free_list_tail)
-    assert (queue.fake_free_list_tail.prev_free_block
-            is queue.fake_free_list_head)
+    assert queue.fake_free_list_head.next_free_block is queue.fake_free_list_tail
+    assert queue.fake_free_list_tail.prev_free_block is queue.fake_free_list_head
 
     # Attempt to pop from an empty queue
     with pytest.raises(ValueError) as e:
@@ -209,10 +233,8 @@ def test_free_kv_cache_block_queue_append_n():
     # fake_head->fake_tail
     queue.append_n([])
     assert queue.num_free_blocks == 0
-    assert (queue.fake_free_list_head.next_free_block
-            is queue.fake_free_list_tail)
-    assert (queue.fake_free_list_tail.prev_free_block
-            is queue.fake_free_list_head)
+    assert queue.fake_free_list_head.next_free_block is queue.fake_free_list_tail
+    assert queue.fake_free_list_tail.prev_free_block is queue.fake_free_list_head
     # Append 1 block
     # fake_head->b0->fake_tail
     queue.append_n(blocks[0:1])
@@ -261,15 +283,18 @@ def test_free_kv_cache_block_queue_append_n():
         # fake_head->fake_tail
         invalid_queue.append_n(blocks[0:1])
     assert invalid_queue.num_free_blocks == 0
-    assert (invalid_queue.fake_free_list_head.next_free_block ==
-            invalid_queue.fake_free_list_tail)
+    assert (
+        invalid_queue.fake_free_list_head.next_free_block
+        == invalid_queue.fake_free_list_tail
+    )
 
 
 def test_free_kv_cache_block_queue_popleft_n():
     blocks = [KVCacheBlock(block_id=i) for i in range(6)]
     # Create an empty FreeKVCacheBlockQueue with these blocks
     queue = FreeKVCacheBlockQueue(
-        [blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]])
+        [blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]]
+    )
     assert queue.num_free_blocks == 6
     assert queue.fake_free_list_head.next_free_block is blocks[1]
     assert blocks[1].prev_free_block is queue.fake_free_list_head
@@ -343,8 +368,7 @@ def test_free_kv_cache_block_queue_get_all_free_blocks():
 
     # Append a block back and check again
     queue.append(block_to_remove)
-    assert queue.get_all_free_blocks() == \
-        blocks[1:2] + blocks[3:] + [block_to_remove]
+    assert queue.get_all_free_blocks() == blocks[1:2] + blocks[3:] + [block_to_remove]
 
 
 def test_generate_block_hash_extra_keys():
@@ -360,12 +384,12 @@ def test_generate_block_hash_extra_keys():
 
     # Test with no extra keys
     extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
-    assert extra_keys == ("hash1", )
+    assert extra_keys == ("hash1",)
     assert next_mm_idx == 1
 
     # Test with partial overlap
     extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0)
-    assert extra_keys == ("hash1", )
+    assert extra_keys == ("hash1",)
     assert next_mm_idx == 1
 
     # Test with no overlap
@@ -375,7 +399,7 @@ def test_generate_block_hash_extra_keys():
 
     # Test with multiple extra keys
     extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0)
-    assert extra_keys == ('hash1', 'hash2')
+    assert extra_keys == ("hash1", "hash2")
     assert next_mm_idx == 2
 
 
@@ -403,9 +427,9 @@ def test_generate_block_hash_extra_keys_cache_salt():
 
     # salt is added for the first token
     extra_keys, _ = generate_block_hash_extra_keys(request, 0, 1, 0)
-    assert extra_keys == ('salt', )
+    assert extra_keys == ("salt",)
     extra_keys, _ = generate_block_hash_extra_keys(request, 0, 10, 0)
-    assert extra_keys == ('salt', )
+    assert extra_keys == ("salt",)
 
     # no salt added for other tokens
     extra_keys, _ = generate_block_hash_extra_keys(request, 1, 2, 0)
@@ -425,20 +449,84 @@ def test_generate_block_hash_extra_keys_cache_salt():
     )
 
     # Test with no extra keys
-    extra_keys, next_mm_idx = generate_block_hash_extra_keys(
-        request_mm, 0, 5, 0)
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request_mm, 0, 5, 0)
     assert extra_keys == ("hash1", "salt")
     assert next_mm_idx == 1
 
 
+def test_generate_block_hash_extra_keys_prompt_embeds():
+    prompt_embeds = torch.randn(10, 3)
+    request = make_request(
+        request_id="0",
+        prompt_token_ids=None,
+        mm_positions=None,
+        mm_hashes=None,
+        prompt_embeds=prompt_embeds,
+    )
+
+    # Test with prompt embeds for the first block
+    extra_keys, _ = generate_block_hash_extra_keys(request, 0, 5, 0)
+    expected_embeds = prompt_embeds[0:5]
+    expected_bytes = kv_cache_utils.tensor_data(expected_embeds).tobytes()
+    assert extra_keys == (expected_bytes,)
+
+    # Test with prompt embeds for the second block
+    extra_keys, _ = generate_block_hash_extra_keys(request, 5, 10, 0)
+    expected_embeds = prompt_embeds[5:10]
+    expected_bytes = kv_cache_utils.tensor_data(expected_embeds).tobytes()
+    assert extra_keys == (expected_bytes,)
+
+
+def test_generate_block_hash_extra_keys_different_prompt_embeds():
+    prompt_embeds1 = torch.randn(10, 3)
+    prompt_embeds2 = torch.randn(10, 3)
+    request1 = make_request(
+        request_id="0",
+        prompt_token_ids=None,
+        mm_positions=None,
+        mm_hashes=None,
+        prompt_embeds=prompt_embeds1,
+    )
+    request2 = make_request(
+        request_id="1",
+        prompt_token_ids=None,
+        mm_positions=None,
+        mm_hashes=None,
+        prompt_embeds=prompt_embeds2,
+    )
+
+    extra_keys1, _ = generate_block_hash_extra_keys(request1, 0, 5, 0)
+    extra_keys2, _ = generate_block_hash_extra_keys(request2, 0, 5, 0)
+    assert extra_keys1 != extra_keys2
+
+
+def test_generate_block_hash_extra_keys_lora():
+    request = make_request(
+        request_id="0",
+        prompt_token_ids=[_ for _ in range(6)],
+    )
+
+    request.lora_request = LoRARequest(
+        lora_name="test_lora_adapter", lora_int_id=1, lora_path="/path/to/lora"
+    )
+
+    extra_keys, _ = generate_block_hash_extra_keys(request, 0, 3, 0)
+    assert extra_keys == ("test_lora_adapter",)
+
+    request.lora_request = None
+    extra_keys, _ = generate_block_hash_extra_keys(request, 0, 3, 0)
+    assert extra_keys is None
+
+
 @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_hash_block_tokens(hash_fn):
     parent_block_hash = BlockHash(b"123")
     curr_block_token_ids = (1, 2, 3)
     extra_keys = ("key1", "key2")
 
-    block_hash = hash_block_tokens(hash_fn, parent_block_hash,
-                                   curr_block_token_ids, extra_keys)
+    block_hash = hash_block_tokens(
+        hash_fn, parent_block_hash, curr_block_token_ids, extra_keys
+    )
     expected = hash_fn((parent_block_hash, curr_block_token_ids, extra_keys))
     assert block_hash == expected
 
@@ -459,10 +547,8 @@ def test_request_block_hasher(hash_fn):
 
     block_hashes = request.block_hashes
     assert len(block_hashes) == 2
-    assert block_hashes[0] == hash_fn(
-        (kv_cache_utils.NONE_HASH, (0, 1, 2), ("hash1", )))
-    assert block_hashes[1] == hash_fn(
-        (block_hashes[0], (3, 4, 5), ("hash2", )))
+    assert block_hashes[0] == hash_fn((kv_cache_utils.NONE_HASH, (0, 1, 2), ("hash1",)))
+    assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), ("hash2",)))
 
 
 @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
@@ -507,8 +593,7 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
     block_hashes = request.block_hashes
 
     assert len(block_hashes) == 2
-    assert block_hashes[0] == hash_fn(
-        (kv_cache_utils.NONE_HASH, (0, 1, 2), None))
+    assert block_hashes[0] == hash_fn((kv_cache_utils.NONE_HASH, (0, 1, 2), None))
     assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None))
 
 
@@ -520,7 +605,7 @@ def test_metrics():
     """
     Test the prefix caching metrics.
     """
-    metrics = PrefixCachingMetrics(max_recent_requests=5)
+    metrics = CachingMetrics(max_recent_requests=5)
     assert metrics.hit_rate == 0.0
 
     metrics.observe(_stats(1, 20, 9))
@@ -552,7 +637,7 @@ def test_metrics_empty_stats():
     """
     Test the prefix caching metrics with empty stats.
     """
-    metrics = PrefixCachingMetrics(max_recent_requests=5)
+    metrics = CachingMetrics(max_recent_requests=5)
     metrics.observe(_stats(0, 0, 0))
     metrics.observe(_stats(1, 20, 9))
     metrics.observe(_stats(0, 0, 0))
@@ -585,27 +670,36 @@ def test_get_kv_cache_configs_multiple_workers():
     vllm_config = VllmConfig(model_config=model_config)
 
     ref_kv_cache_spec = new_kv_cache_spec()
-    same_kv_cache_specs = [{
-        "layer1": new_kv_cache_spec(),
-        "layer2": new_kv_cache_spec(),
-    }, {
-        "layer1": new_kv_cache_spec(),
-        "layer2": new_kv_cache_spec(),
-    }]
+    same_kv_cache_specs = [
+        {
+            "layer1": new_kv_cache_spec(),
+            "layer2": new_kv_cache_spec(),
+        },
+        {
+            "layer1": new_kv_cache_spec(),
+            "layer2": new_kv_cache_spec(),
+        },
+    ]
 
     # Basic case. All things are the same.
-    kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [
-        ref_kv_cache_spec.page_size_bytes * 2 * 10,
-        ref_kv_cache_spec.page_size_bytes * 2 * 10
-    ])
+    kv_cache_configs = get_kv_cache_configs(
+        vllm_config,
+        same_kv_cache_specs,
+        [
+            ref_kv_cache_spec.page_size_bytes * 2 * 10,
+            ref_kv_cache_spec.page_size_bytes * 2 * 10,
+        ],
+    )
     assert kv_cache_configs == [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -614,10 +708,12 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -627,18 +723,24 @@ def test_get_kv_cache_configs_multiple_workers():
 
     # Different available memory. This is the case for TP.
     # Use the smallest memory available.
-    kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [
-        ref_kv_cache_spec.page_size_bytes * 2 * 10,
-        ref_kv_cache_spec.page_size_bytes * 2 * 20
-    ])
+    kv_cache_configs = get_kv_cache_configs(
+        vllm_config,
+        same_kv_cache_specs,
+        [
+            ref_kv_cache_spec.page_size_bytes * 2 * 10,
+            ref_kv_cache_spec.page_size_bytes * 2 * 20,
+        ],
+    )
     assert kv_cache_configs == [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -647,10 +749,12 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -659,25 +763,32 @@ def test_get_kv_cache_configs_multiple_workers():
     ]
 
     # Different KV cache specs. This is the case for PP.
-    different_layer_specs = [{
-        "layer1": new_kv_cache_spec(),
-    }, {
-        "layer2": new_kv_cache_spec(),
-        "layer3": new_kv_cache_spec(),
-    }]
+    different_layer_specs = [
+        {
+            "layer1": new_kv_cache_spec(),
+        },
+        {
+            "layer2": new_kv_cache_spec(),
+            "layer3": new_kv_cache_spec(),
+        },
+    ]
 
     # Different workers have different layers.
     kv_cache_configs = get_kv_cache_configs(
-        vllm_config, different_layer_specs, [
+        vllm_config,
+        different_layer_specs,
+        [
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
-            ref_kv_cache_spec.page_size_bytes * 2 * 10
-        ])
+            ref_kv_cache_spec.page_size_bytes * 2 * 10,
+        ],
+    )
     assert kv_cache_configs == [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
-                              shared_by=["layer1"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
@@ -686,10 +797,12 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer3"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer2", "layer3"], new_kv_cache_spec()),
@@ -698,33 +811,43 @@ def test_get_kv_cache_configs_multiple_workers():
     ]
 
     # Some layers are the same, some are different. This is the case for TP+PP
-    tp_pp_kv_cache_specs = [{
-        "layer1": new_kv_cache_spec(),
-        "layer2": new_kv_cache_spec(),
-    }, {
-        "layer1": new_kv_cache_spec(),
-        "layer2": new_kv_cache_spec(),
-    }, {
-        "layer3": new_kv_cache_spec(),
-    }, {
-        "layer3": new_kv_cache_spec(),
-    }]
+    tp_pp_kv_cache_specs = [
+        {
+            "layer1": new_kv_cache_spec(),
+            "layer2": new_kv_cache_spec(),
+        },
+        {
+            "layer1": new_kv_cache_spec(),
+            "layer2": new_kv_cache_spec(),
+        },
+        {
+            "layer3": new_kv_cache_spec(),
+        },
+        {
+            "layer3": new_kv_cache_spec(),
+        },
+    ]
 
     kv_cache_configs = get_kv_cache_configs(
-        vllm_config, tp_pp_kv_cache_specs, [
+        vllm_config,
+        tp_pp_kv_cache_specs,
+        [
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
-        ])
+        ],
+    )
     assert kv_cache_configs == [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -733,10 +856,12 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -745,8 +870,9 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
-                              shared_by=["layer3"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer3"], ref_kv_cache_spec),
@@ -755,8 +881,9 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
-                              shared_by=["layer3"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer3"], ref_kv_cache_spec),
@@ -766,26 +893,34 @@ def test_get_kv_cache_configs_multiple_workers():
 
     # Different workers have different types of layers. This is the case for
     # hybrid models + PP.
-    different_type_layer_specs = [{
-        "layer1": new_kv_cache_spec(),
-        "layer2": new_kv_cache_spec(),
-    }, {
-        "layer3": new_sliding_window_spec(),
-        "layer4": new_sliding_window_spec(),
-    }]
+    different_type_layer_specs = [
+        {
+            "layer1": new_kv_cache_spec(),
+            "layer2": new_kv_cache_spec(),
+        },
+        {
+            "layer3": new_sliding_window_spec(),
+            "layer4": new_sliding_window_spec(),
+        },
+    ]
     kv_cache_configs = get_kv_cache_configs(
-        vllm_config, different_type_layer_specs, [
+        vllm_config,
+        different_type_layer_specs,
+        [
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
             ref_kv_cache_spec.page_size_bytes * 2 * 10,
-        ])
+        ],
+    )
     assert kv_cache_configs == [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer2"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
@@ -795,41 +930,50 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer3"]),
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer4"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"]
+                ),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer4"]
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec([], ref_kv_cache_spec),
-                KVCacheGroupSpec(["layer3", "layer4"],
-                                 new_sliding_window_spec()),
+                KVCacheGroupSpec(["layer3", "layer4"], new_sliding_window_spec()),
             ],
         ),
     ]
 
     # When divided into multiple KVCacheGroups, need to ensure the number of
     # layers per group is similar.
-    different_type_layer_specs = [{
-        "layer1": new_kv_cache_spec(),
-        "layer2": new_sliding_window_spec(),
-        "layer3": new_sliding_window_spec(),
-    }, {
-        "layer4": new_kv_cache_spec(),
-        "layer5": new_sliding_window_spec(),
-        "layer6": new_sliding_window_spec(),
-    }]
+    different_type_layer_specs = [
+        {
+            "layer1": new_kv_cache_spec(),
+            "layer2": new_sliding_window_spec(),
+            "layer3": new_sliding_window_spec(),
+        },
+        {
+            "layer4": new_kv_cache_spec(),
+            "layer5": new_sliding_window_spec(),
+            "layer6": new_sliding_window_spec(),
+        },
+    ]
     kv_cache_configs = get_kv_cache_configs(
-        vllm_config, different_type_layer_specs, [
+        vllm_config,
+        different_type_layer_specs,
+        [
             ref_kv_cache_spec.page_size_bytes * 10,
             ref_kv_cache_spec.page_size_bytes * 10,
-        ])
+        ],
+    )
     assert kv_cache_configs == [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer1", "layer2", "layer3"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10,
+                    shared_by=["layer1", "layer2", "layer3"],
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], ref_kv_cache_spec),
@@ -840,8 +984,10 @@ def test_get_kv_cache_configs_multiple_workers():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[
-                KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
-                              shared_by=["layer4", "layer5", "layer6"]),
+                KVCacheTensor(
+                    size=ref_kv_cache_spec.page_size_bytes * 10,
+                    shared_by=["layer4", "layer5", "layer6"],
+                ),
             ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer4"], ref_kv_cache_spec),
@@ -852,16 +998,23 @@ def test_get_kv_cache_configs_multiple_workers():
     ]
 
     # Have conflicting layers. Need to raise an error.
-    conflicting_layer_specs = [{
-        "layer1": new_kv_cache_spec(),
-    }, {
-        "layer1": new_sliding_window_spec(),
-    }]
+    conflicting_layer_specs = [
+        {
+            "layer1": new_kv_cache_spec(),
+        },
+        {
+            "layer1": new_sliding_window_spec(),
+        },
+    ]
     with pytest.raises(AssertionError):
-        get_kv_cache_configs(vllm_config, conflicting_layer_specs, [
-            ref_kv_cache_spec.page_size_bytes * 2 * 10,
-            ref_kv_cache_spec.page_size_bytes * 2 * 10,
-        ])
+        get_kv_cache_configs(
+            vllm_config,
+            conflicting_layer_specs,
+            [
+                ref_kv_cache_spec.page_size_bytes * 2 * 10,
+                ref_kv_cache_spec.page_size_bytes * 2 * 10,
+            ],
+        )
 
 
 def test_merge_kv_cache_spec():
@@ -906,14 +1059,16 @@ def test_merge_kv_cache_spec():
     ]
     with pytest.raises(ValueError):
         different_sliding_window_layer_specs[0].merge(
-            different_sliding_window_layer_specs)
+            different_sliding_window_layer_specs
+        )
 
     same_sliding_window_layer_specs = [
         new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
         new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
     ]
     merged_layer_spec = same_sliding_window_layer_specs[0].merge(
-        same_sliding_window_layer_specs)
+        same_sliding_window_layer_specs
+    )
     assert merged_layer_spec.sliding_window == 1
 
     same_sliding_window_layer_spec_with_none = [
@@ -921,7 +1076,8 @@ def test_merge_kv_cache_spec():
         new_kv_cache_spec(num_kv_heads=32, sliding_window=None),
     ]
     merged_layer_spec = same_sliding_window_layer_spec_with_none[0].merge(
-        same_sliding_window_layer_spec_with_none)
+        same_sliding_window_layer_spec_with_none
+    )
     assert merged_layer_spec.sliding_window == 1
 
 
@@ -958,12 +1114,13 @@ def test_is_kv_cache_spec_uniform():
 
 
 @pytest.mark.parametrize(
-    ("model_id", "max_model_len", "want_estimated_max_len"), [
+    ("model_id", "max_model_len", "want_estimated_max_len"),
+    [
         ("Qwen/Qwen1.5-7B", 16385, 16384),
         ("Qwen/Qwen1.5-7B", 16383, 16383),
-    ])
-def test_estimate_max_model_len(model_id, max_model_len,
-                                want_estimated_max_len):
+    ],
+)
+def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len):
     # Create a VllmConfig
     model_config = ModelConfig(
         model_id,
@@ -989,8 +1146,9 @@ def test_estimate_max_model_len(model_id, max_model_len,
             dtype=torch.float16,
         )
     # Estimate the maximum model length, 16384 model_len need 8GB
-    estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
-                                               8 * GiB_bytes)
+    estimated_max_len = estimate_max_model_len(
+        vllm_config, kv_cache_spec, 8 * GiB_bytes
+    )
     assert estimated_max_len == want_estimated_max_len
 
 
@@ -1004,8 +1162,9 @@ def test_get_max_concurrency_for_kv_cache_config():
         dtype="float16",
         max_model_len=max_model_len,
     )
-    scheduler_config = SchedulerConfig(max_num_batched_tokens=1024,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens=1024, enable_chunked_prefill=True
+    )
 
     vllm_config = VllmConfig(
         model_config=model_config,
@@ -1031,38 +1190,39 @@ def test_get_max_concurrency_for_kv_cache_config():
         num_blocks=int(1024 * 1.5),
         kv_cache_tensors=[],
         kv_cache_groups=[
-            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
-                             full_attention_spec),
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)], full_attention_spec),
         ],
     )
     max_concurrency_full_attention = get_max_concurrency_for_kv_cache_config(
-        vllm_config, kv_cache_config_full_attention)
+        vllm_config, kv_cache_config_full_attention
+    )
     assert max_concurrency_full_attention == 1.5
 
     kv_cache_config_sliding_window = KVCacheConfig(
         num_blocks=129 * 3,
         kv_cache_tensors=[],
         kv_cache_groups=[
-            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
-                             sliding_window_spec),
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)], sliding_window_spec),
         ],
     )
     max_concurrency_sliding_window = get_max_concurrency_for_kv_cache_config(
-        vllm_config, kv_cache_config_sliding_window)
+        vllm_config, kv_cache_config_sliding_window
+    )
     assert max_concurrency_sliding_window == 3
 
     kv_cache_config_hybrid_model = KVCacheConfig(
         num_blocks=(1024 + 129) * 3,
         kv_cache_tensors=[],
         kv_cache_groups=[
-            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
-                             full_attention_spec),
-            KVCacheGroupSpec([f"layer_{i}" for i in range(32, 64)],
-                             sliding_window_spec),
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)], full_attention_spec),
+            KVCacheGroupSpec(
+                [f"layer_{i}" for i in range(32, 64)], sliding_window_spec
+            ),
         ],
     )
     max_concurrency_hybrid_model = get_max_concurrency_for_kv_cache_config(
-        vllm_config, kv_cache_config_hybrid_model)
+        vllm_config, kv_cache_config_hybrid_model
+    )
     assert max_concurrency_hybrid_model == 3
 
 
@@ -1075,8 +1235,7 @@ def test_allocate_with_lookahead():
             KVCacheTensor(size=100, shared_by=["layer1"]),
         ],
         kv_cache_groups=[
-            KVCacheGroupSpec(["layer1"],
-                             new_kv_cache_spec(block_size=block_size)),
+            KVCacheGroupSpec(["layer1"], new_kv_cache_spec(block_size=block_size)),
         ],
     )
 
@@ -1089,8 +1248,7 @@ def test_allocate_with_lookahead():
     )
 
     # Test case 1: Requires additional lookahead tokens
-    kv_cache_manager = KVCacheManager(kv_cache_config=config,
-                                      max_model_len=100)
+    kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
     blocks = kv_cache_manager.allocate_slots(
         request,
         num_new_tokens=3,
@@ -1099,8 +1257,7 @@ def test_allocate_with_lookahead():
     assert len(blocks.get_block_ids()[0]) == 2  # ceil(5/4)=2 blocks
 
     # Test case 2: With precomputed blocks
-    kv_cache_manager = KVCacheManager(kv_cache_config=config,
-                                      max_model_len=100)
+    kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
     # required_blocks = ceil((3 + 2) /4) = 2
     blocks = kv_cache_manager.allocate_slots(
         request,
@@ -1111,8 +1268,7 @@ def test_allocate_with_lookahead():
 
     # Test case 3: With precomputed blocks
     # required_blocks = ceil((3 + 4) / 4) = 2
-    kv_cache_manager = KVCacheManager(kv_cache_config=config,
-                                      max_model_len=100)
+    kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
     blocks = kv_cache_manager.allocate_slots(
         request,
         num_new_tokens=3,
@@ -1129,82 +1285,78 @@ def test_get_kv_cache_config_one_worker():
     mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
     # all layers are full attention -> single group
     kv_cache_specs_full = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_kv_cache_spec(),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
     }
     kv_cache_config_full = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_full],
-        [mem_per_block_per_layer * 2 * 32])[0]
+        vllm_config, [kv_cache_specs_full], [mem_per_block_per_layer * 2 * 32]
+    )[0]
     print(kv_cache_config_full)
     assert kv_cache_config_full == KVCacheConfig(
         num_blocks=32,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_1"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_2"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_2"]),
         ],
-        kv_cache_groups=[
-            KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
-        ])
+        kv_cache_groups=[KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())],
+    )
 
     # all layers are sliding window -> single group
     kv_cache_specs_sliding = {
-        'layer_1': new_sliding_window_spec(),
-        'layer_2': new_sliding_window_spec(),
+        "layer_1": new_sliding_window_spec(),
+        "layer_2": new_sliding_window_spec(),
     }
     kv_cache_config_sliding = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_sliding],
-        [mem_per_block_per_layer * 2 * 32])[0]
+        vllm_config, [kv_cache_specs_sliding], [mem_per_block_per_layer * 2 * 32]
+    )[0]
     assert kv_cache_config_sliding == KVCacheConfig(
         num_blocks=32,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_1"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_2"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_2"]),
         ],
         kv_cache_groups=[
             KVCacheGroupSpec(["layer_1", "layer_2"], new_sliding_window_spec())
-        ])
+        ],
+    )
 
     # full + sliding, but disable_hybrid_kv_cache_manager
     vllm_config.scheduler_config.disable_hybrid_kv_cache_manager = True
     kv_cache_specs_hybrid = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_sliding_window_spec(),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_sliding_window_spec(),
     }
     kv_cache_config_hybrid = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_hybrid],
-        [mem_per_block_per_layer * 2 * 32])[0]
+        vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32]
+    )[0]
     assert kv_cache_config_hybrid == KVCacheConfig(
         num_blocks=32,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_1"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_2"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_2"]),
         ],
         kv_cache_groups=[
-            KVCacheGroupSpec(["layer_1", "layer_2"],
-                             new_kv_cache_spec(sliding_window=1)),
+            KVCacheGroupSpec(
+                ["layer_1", "layer_2"], new_kv_cache_spec(sliding_window=1)
+            ),
         ],
     )
     vllm_config.scheduler_config.disable_hybrid_kv_cache_manager = False
 
     # full + sliding, with hybrid_kv_cache_manager
     kv_cache_specs_hybrid = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_sliding_window_spec(),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_sliding_window_spec(),
     }
     kv_cache_config_hybrid = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_hybrid],
-        [mem_per_block_per_layer * 2 * 32])[0]
+        vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32]
+    )[0]
     assert kv_cache_config_hybrid == KVCacheConfig(
         num_blocks=64,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 64,
-                          shared_by=["layer_1", "layer_2"]),
+            KVCacheTensor(
+                size=mem_per_block_per_layer * 64, shared_by=["layer_1", "layer_2"]
+            ),
         ],
         kv_cache_groups=[
             KVCacheGroupSpec(["layer_1"], new_kv_cache_spec()),
@@ -1214,112 +1366,113 @@ def test_get_kv_cache_config_one_worker():
 
     # 2 full + 4 sliding, 2 layers per group
     kv_cache_specs_hybrid = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_kv_cache_spec(),
-        'layer_3': new_sliding_window_spec(),
-        'layer_4': new_sliding_window_spec(),
-        'layer_5': new_sliding_window_spec(),
-        'layer_6': new_sliding_window_spec(),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+        "layer_3": new_sliding_window_spec(),
+        "layer_4": new_sliding_window_spec(),
+        "layer_5": new_sliding_window_spec(),
+        "layer_6": new_sliding_window_spec(),
     }
     kv_cache_config_hybrid = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_hybrid],
-        [mem_per_block_per_layer * 2 * 32])[0]
+        vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32]
+    )[0]
     assert kv_cache_config_hybrid == KVCacheConfig(
         num_blocks=32,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_1", "layer_3", "layer_4"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_2", "layer_5", "layer_6"]),
+            KVCacheTensor(
+                size=mem_per_block_per_layer * 32,
+                shared_by=["layer_1", "layer_3", "layer_4"],
+            ),
+            KVCacheTensor(
+                size=mem_per_block_per_layer * 32,
+                shared_by=["layer_2", "layer_5", "layer_6"],
+            ),
         ],
         kv_cache_groups=[
             KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()),
-            KVCacheGroupSpec(["layer_3", "layer_5"],
-                             new_sliding_window_spec()),
-            KVCacheGroupSpec(["layer_4", "layer_6"],
-                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_3", "layer_5"], new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_4", "layer_6"], new_sliding_window_spec()),
         ],
     )
 
     # 3 full + 7 sliding, pad to 3 full + 9 sliding
     kv_cache_specs_hybrid = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_kv_cache_spec(),
-        'layer_3': new_kv_cache_spec(),
-        'layer_4': new_sliding_window_spec(),
-        'layer_5': new_sliding_window_spec(),
-        'layer_6': new_sliding_window_spec(),
-        'layer_7': new_sliding_window_spec(),
-        'layer_8': new_sliding_window_spec(),
-        'layer_9': new_sliding_window_spec(),
-        'layer_10': new_sliding_window_spec(),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+        "layer_3": new_kv_cache_spec(),
+        "layer_4": new_sliding_window_spec(),
+        "layer_5": new_sliding_window_spec(),
+        "layer_6": new_sliding_window_spec(),
+        "layer_7": new_sliding_window_spec(),
+        "layer_8": new_sliding_window_spec(),
+        "layer_9": new_sliding_window_spec(),
+        "layer_10": new_sliding_window_spec(),
     }
     kv_cache_config_hybrid = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_hybrid],
-        [mem_per_block_per_layer * 3 * 32])[0]
+        vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 3 * 32]
+    )[0]
     assert kv_cache_config_hybrid == KVCacheConfig(
         num_blocks=32,
         kv_cache_tensors=[
             KVCacheTensor(
                 size=mem_per_block_per_layer * 32,
-                shared_by=["layer_1", "layer_4", "layer_5", "layer_6"]),
+                shared_by=["layer_1", "layer_4", "layer_5", "layer_6"],
+            ),
             KVCacheTensor(
                 size=mem_per_block_per_layer * 32,
-                shared_by=["layer_2", "layer_7", "layer_8", "layer_9"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_3", "layer_10"]),
+                shared_by=["layer_2", "layer_7", "layer_8", "layer_9"],
+            ),
+            KVCacheTensor(
+                size=mem_per_block_per_layer * 32, shared_by=["layer_3", "layer_10"]
+            ),
         ],
         kv_cache_groups=[
-            KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"],
-                             new_kv_cache_spec()),
-            KVCacheGroupSpec(["layer_4", "layer_7", "layer_10"],
-                             new_sliding_window_spec()),
-            KVCacheGroupSpec(["layer_5", "layer_8"],
-                             new_sliding_window_spec()),
-            KVCacheGroupSpec(["layer_6", "layer_9"],
-                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"], new_kv_cache_spec()),
+            KVCacheGroupSpec(
+                ["layer_4", "layer_7", "layer_10"], new_sliding_window_spec()
+            ),
+            KVCacheGroupSpec(["layer_5", "layer_8"], new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_6", "layer_9"], new_sliding_window_spec()),
         ],
     )
 
     # different hidden size
     kv_cache_specs_hybrid = {
-        'layer_1': new_kv_cache_spec(head_size=128),
-        'layer_2': new_kv_cache_spec(head_size=64),
+        "layer_1": new_kv_cache_spec(head_size=128),
+        "layer_2": new_kv_cache_spec(head_size=64),
     }
     kv_cache_config_hybrid = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_hybrid],
-        [mem_per_block_per_layer * 3 * 32])[0]
+        vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 3 * 32]
+    )[0]
     assert kv_cache_config_hybrid == KVCacheConfig(
         num_blocks=32,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 32 * 2,
-                          shared_by=["layer_1"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 32,
-                          shared_by=["layer_2"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32 * 2, shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32, shared_by=["layer_2"]),
         ],
         kv_cache_groups=[
-            KVCacheGroupSpec(["layer_1", "layer_2"],
-                             UniformTypeKVCacheSpecs(
-                                 block_size=16,
-                                 kv_cache_specs=kv_cache_specs_hybrid))
-        ])
+            KVCacheGroupSpec(
+                ["layer_1", "layer_2"],
+                UniformTypeKVCacheSpecs(
+                    block_size=16, kv_cache_specs=kv_cache_specs_hybrid
+                ),
+            )
+        ],
+    )
 
     # Test num_gpu_blocks_override
     vllm_config.cache_config.num_gpu_blocks_override = 16
     kv_cache_config_override_blocks = get_kv_cache_configs(
-        vllm_config, [kv_cache_specs_full],
-        [mem_per_block_per_layer * 2 * 32])[0]
+        vllm_config, [kv_cache_specs_full], [mem_per_block_per_layer * 2 * 32]
+    )[0]
     assert kv_cache_config_override_blocks == KVCacheConfig(
         num_blocks=16,
         kv_cache_tensors=[
-            KVCacheTensor(size=mem_per_block_per_layer * 16,
-                          shared_by=["layer_1"]),
-            KVCacheTensor(size=mem_per_block_per_layer * 16,
-                          shared_by=["layer_2"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 16, shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 16, shared_by=["layer_2"]),
         ],
-        kv_cache_groups=[
-            KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
-        ])
+        kv_cache_groups=[KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())],
+    )
 
 
 def test_get_kv_cache_configs_attention_free():
@@ -1338,42 +1491,44 @@ def test_get_kv_cache_configs_attention_free():
 def test_generate_uniform_type_kv_cache_specs():
     # All layers are full attention, can be merged
     kv_cache_specs = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_kv_cache_spec(head_size=128),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(head_size=128),
     }
     uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
     assert uniform_spec == UniformTypeKVCacheSpecs(
-        block_size=16, kv_cache_specs=kv_cache_specs)
+        block_size=16, kv_cache_specs=kv_cache_specs
+    )
 
     # Full attention + sliding window, cannot be merged
     kv_cache_specs = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_sliding_window_spec(sliding_window=1),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_sliding_window_spec(sliding_window=1),
     }
     uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
     assert uniform_spec is None
 
     # different order of full attention + sliding window, cannot be merged
     kv_cache_specs = {
-        'layer_1': new_sliding_window_spec(sliding_window=1),
-        'layer_2': new_kv_cache_spec(),
+        "layer_1": new_sliding_window_spec(sliding_window=1),
+        "layer_2": new_kv_cache_spec(),
     }
     uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
     assert uniform_spec is None
 
     # Same-size sliding window, can be merged
     kv_cache_specs = {
-        'layer_1': new_sliding_window_spec(sliding_window=1),
-        'layer_2': new_sliding_window_spec(sliding_window=1, head_size=128),
+        "layer_1": new_sliding_window_spec(sliding_window=1),
+        "layer_2": new_sliding_window_spec(sliding_window=1, head_size=128),
     }
     uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
     assert uniform_spec == UniformTypeKVCacheSpecs(
-        block_size=16, kv_cache_specs=kv_cache_specs)
+        block_size=16, kv_cache_specs=kv_cache_specs
+    )
 
     # different block sizes, cannot be merged
     kv_cache_specs = {
-        'layer_1': new_kv_cache_spec(block_size=16),
-        'layer_2': new_kv_cache_spec(block_size=32),
+        "layer_1": new_kv_cache_spec(block_size=16),
+        "layer_2": new_kv_cache_spec(block_size=32),
     }
     uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
     assert uniform_spec is None
@@ -1381,38 +1536,39 @@ def test_generate_uniform_type_kv_cache_specs():
 
 def test_generate_scheduler_kv_cache_config():
     kv_cache_specs = {
-        'layer_1': new_kv_cache_spec(),
-        'layer_2': new_kv_cache_spec(head_size=128),
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(head_size=128),
     }
     kv_cache_configs = [
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[],
             kv_cache_groups=[
-                KVCacheGroupSpec(['layer_1', 'layer_2'],
-                                 UniformTypeKVCacheSpecs(
-                                     block_size=16,
-                                     kv_cache_specs=kv_cache_specs)),
+                KVCacheGroupSpec(
+                    ["layer_1", "layer_2"],
+                    UniformTypeKVCacheSpecs(
+                        block_size=16, kv_cache_specs=kv_cache_specs
+                    ),
+                ),
             ],
         )
     ]
-    scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
-        kv_cache_configs)
+    scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
     assert scheduler_kv_cache_config == KVCacheConfig(
         num_blocks=10,
         kv_cache_tensors=[],
-        kv_cache_groups=[
-            KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec())
-        ],
+        kv_cache_groups=[KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())],
     )
 
 
 def new_mla_spec(cache_dtype_str=None):
-    return MLAAttentionSpec(block_size=16,
-                            num_kv_heads=16,
-                            head_size=64,
-                            dtype=torch.float32,
-                            cache_dtype_str=cache_dtype_str)
+    return MLAAttentionSpec(
+        block_size=16,
+        num_kv_heads=16,
+        head_size=64,
+        dtype=torch.float32,
+        cache_dtype_str=cache_dtype_str,
+    )
 
 
 def test_merge_mla_spec():
@@ -1450,3 +1606,88 @@ def test_merge_mla_spec():
     ]
     with pytest.raises(AssertionError):
         kv_cache_specs[0].merge(kv_cache_specs)
+
+
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
+def test_request_block_hasher_with_prompt_embeds(hash_fn: Callable[[Any], bytes]):
+    block_size = 3
+    num_tokens = 2 * block_size
+    prompt_token_ids = [_ for _ in range(num_tokens)]
+    hidden_size = 5
+    prompt_embeds = torch.randn((num_tokens, hidden_size))
+
+    request = make_request(
+        request_id="0",
+        prompt_token_ids=prompt_token_ids,
+        block_size=block_size,
+        hash_fn=hash_fn,
+        prompt_embeds=prompt_embeds,
+    )
+
+    block_hashes = request.block_hashes
+    assert len(block_hashes) == 2
+
+    block1_embeds_bytes = tensor_data(prompt_embeds[:block_size]).tobytes()
+    expected_hash1 = hash_fn(
+        (
+            kv_cache_utils.NONE_HASH,
+            tuple(prompt_token_ids[:block_size]),
+            (block1_embeds_bytes,),
+        )
+    )
+    assert block_hashes[0] == expected_hash1
+
+    block2_embeds_bytes = tensor_data(prompt_embeds[block_size:num_tokens]).tobytes()
+    expected_hash2 = hash_fn(
+        (
+            block_hashes[0],
+            tuple(prompt_token_ids[block_size:num_tokens]),
+            (block2_embeds_bytes,),
+        )
+    )
+    assert block_hashes[1] == expected_hash2
+
+
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
+def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes]):
+    block_size = 3
+    num_tokens = 2 * block_size
+    prompt_token_ids = [_ for _ in range(num_tokens)]
+    hidden_size = 5
+    prompt_embeds = torch.randn((num_tokens, hidden_size))
+
+    request = make_request(
+        request_id="0",
+        prompt_token_ids=prompt_token_ids,
+        block_size=block_size,
+        hash_fn=hash_fn,
+        mm_positions=[
+            PlaceholderRange(offset=0, length=3),
+            PlaceholderRange(offset=3, length=3),
+        ],
+        mm_hashes=["hash1", "hash2"],
+        prompt_embeds=prompt_embeds,
+    )
+
+    block_hashes = request.block_hashes
+    assert len(block_hashes) == 2
+
+    block1_embeds_bytes = tensor_data(prompt_embeds[:block_size]).tobytes()
+    expected_hash1 = hash_fn(
+        (
+            kv_cache_utils.NONE_HASH,
+            tuple(prompt_token_ids[:block_size]),
+            ("hash1", block1_embeds_bytes),
+        )
+    )
+    assert block_hashes[0] == expected_hash1
+
+    block2_embeds_bytes = tensor_data(prompt_embeds[block_size:num_tokens]).tobytes()
+    expected_hash2 = hash_fn(
+        (
+            block_hashes[0],
+            tuple(prompt_token_ids[block_size:num_tokens]),
+            ("hash2", block2_embeds_bytes),
+        )
+    )
+    assert block_hashes[1] == expected_hash2
diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/core/test_kv_sharing.py
similarity index 82%
rename from tests/v1/test_kv_sharing.py
rename to tests/v1/core/test_kv_sharing.py
index 31a74101faf92cecde0402bf31b56a6b1cfaaad4..e6d37b1d63c8c5637ea515304ea895dd8d29cfb1 100644
--- a/tests/v1/test_kv_sharing.py
+++ b/tests/v1/core/test_kv_sharing.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
 import torch
 
 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
 from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
 
+pytestmark = pytest.mark.cpu_test
+
 
 def new_kv_cache_spec():
     return FullAttentionSpec(16, 1, 1, torch.float32, False)
@@ -26,8 +29,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
     # However, if they have different attention backends, they will be
     # placed in different attention groups for KV cache group 0
     kv_cache_groups = [
-        KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
-                         new_kv_cache_spec()),
+        KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
     ]
 
     add_kv_sharing_layers_to_kv_cache_groups(
@@ -38,7 +40,10 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
     # Check that the layers were added to the correct KV cache group
     assert len(kv_cache_groups) == 1
     assert kv_cache_groups[0].layer_names == [
-        "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
+        "model.layers.0",
+        "model.layers.1",
+        "model.layers.2",
+        "model.layers.3",
     ]
 
 
@@ -53,8 +58,7 @@ def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
     }
 
     kv_cache_groups = [
-        KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
-                         new_kv_cache_spec()),
+        KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
     ]
 
     add_kv_sharing_layers_to_kv_cache_groups(
@@ -65,14 +69,17 @@ def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
     # Check that the layers were added to the correct KV cache group
     assert len(kv_cache_groups) == 1
     assert kv_cache_groups[0].layer_names == [
-        "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
+        "model.layers.0",
+        "model.layers.1",
+        "model.layers.2",
+        "model.layers.3",
     ]
 
 
 def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
     """
     Test KV sharing set up when no attention groups are provided.
-    This is the case for the TPU model runner, which doesn't have 
+    This is the case for the TPU model runner, which doesn't have
     support for attention groups yet.
     """
     shared_kv_cache_layers = {
@@ -92,9 +99,5 @@ def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
 
     # Check that the layers were added to the correct KV cache group
     assert len(kv_cache_groups) == 2
-    assert kv_cache_groups[0].layer_names == [
-        "model.layers.0", "model.layers.2"
-    ]
-    assert kv_cache_groups[1].layer_names == [
-        "model.layers.1", "model.layers.3"
-    ]
+    assert kv_cache_groups[0].layer_names == ["model.layers.0", "model.layers.2"]
+    assert kv_cache_groups[1].layer_names == ["model.layers.1", "model.layers.3"]
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 5769099e0af103b169c7c1fb7e0c0186c8e2b482..2291f363731f251c02312616e030b667fcf08aa4 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -3,27 +3,42 @@
 """Compare the with and without prefix caching."""
 
 import copy
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import pytest
 import torch
 
 import vllm.v1.core.kv_cache_utils as kv_cache_utils
-from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
-from vllm.multimodal.inputs import (MultiModalFeatureSpec,
-                                    MultiModalKwargsItem, PlaceholderRange)
+from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved, BlockStored
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256, sha256_cbor
+from vllm.utils.hashing import sha256, sha256_cbor
 from vllm.v1.core.block_pool import BlockHashToBlockMap, BlockPool
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
-from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
-                                         KVCacheBlock, get_block_hash,
-                                         get_group_id,
-                                         get_request_block_hasher,
-                                         hash_block_tokens, init_none_hash,
-                                         make_block_hash_with_group_id)
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, SlidingWindowSpec)
+from vllm.v1.core.kv_cache_utils import (
+    BlockHash,
+    BlockHashWithGroupId,
+    KVCacheBlock,
+    get_block_hash,
+    get_group_id,
+    get_request_block_hasher,
+    hash_block_tokens,
+    init_none_hash,
+    make_block_hash_with_group_id,
+)
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+    SlidingWindowSpec,
+)
+
+pytestmark = pytest.mark.cpu_test
 
 
 @pytest.fixture(autouse=True)
@@ -41,10 +56,11 @@ def make_request(
     prompt_token_ids: list[int],
     block_size: int,
     hash_fn: Callable,
-    mm_positions: Optional[list[PlaceholderRange]] = None,
-    mm_hashes: Optional[list[str]] = None,
-    prompt_logprobs: Optional[int] = None,
-    cache_salt: Optional[str] = None,
+    mm_positions: list[PlaceholderRange] | None = None,
+    mm_hashes: list[str] | None = None,
+    prompt_logprobs: int | None = None,
+    cache_salt: str | None = None,
+    lora_request: LoRARequest | None = None,
 ):
     mm_features = []
     if mm_positions is not None:
@@ -54,19 +70,21 @@ def make_request(
                 data=MultiModalKwargsItem.dummy("dummy_m"),
                 mm_position=position,
                 identifier=identifier,
-                modality="image")
+                modality="image",
+            )
             mm_features.append(mm_feature)
 
-    return Request(request_id=request_id,
-                   prompt_token_ids=prompt_token_ids,
-                   mm_features=mm_features if mm_features else None,
-                   sampling_params=SamplingParams(
-                       max_tokens=17, prompt_logprobs=prompt_logprobs),
-                   pooling_params=None,
-                   eos_token_id=100,
-                   lora_request=None,
-                   cache_salt=cache_salt,
-                   block_hasher=get_request_block_hasher(block_size, hash_fn))
+    return Request(
+        request_id=request_id,
+        prompt_token_ids=prompt_token_ids,
+        mm_features=mm_features if mm_features else None,
+        sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs),
+        pooling_params=None,
+        eos_token_id=100,
+        lora_request=lora_request,
+        cache_salt=cache_salt,
+        block_hasher=get_request_block_hasher(block_size, hash_fn),
+    )
 
 
 def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
@@ -82,8 +100,9 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
     )
 
 
-def make_kv_cache_config_hybrid_model(block_size: int,
-                                      num_blocks: int) -> KVCacheConfig:
+def make_kv_cache_config_hybrid_model(
+    block_size: int, num_blocks: int
+) -> KVCacheConfig:
     return KVCacheConfig(
         num_blocks=num_blocks,
         kv_cache_tensors=[],
@@ -94,19 +113,15 @@ def make_kv_cache_config_hybrid_model(block_size: int,
             ),
             KVCacheGroupSpec(
                 ["layer2"],
-                SlidingWindowSpec(block_size,
-                                  1,
-                                  1,
-                                  torch.float32,
-                                  sliding_window=2 * block_size),
+                SlidingWindowSpec(
+                    block_size, 1, 1, torch.float32, sliding_window=2 * block_size
+                ),
             ),
             KVCacheGroupSpec(
                 ["layer3"],
-                SlidingWindowSpec(block_size,
-                                  1,
-                                  1,
-                                  torch.float32,
-                                  sliding_window=2 * block_size),
+                SlidingWindowSpec(
+                    block_size, 1, 1, torch.float32, sliding_window=2 * block_size
+                ),
             ),
         ],
     )
@@ -114,7 +129,6 @@ def make_kv_cache_config_hybrid_model(block_size: int,
 
 @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_prefill(hash_fn):
-
     block_size = 16
     manager = KVCacheManager(
         make_kv_cache_config(block_size, 11),
@@ -134,17 +148,16 @@ def test_prefill(hash_fn):
     assert len(req0.block_hashes) == 3
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], )
+    blocks = manager.allocate_slots(
+        req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4],)
 
     # Check full block metadata
     parent_block_hash = None
     for block_id in (1, 2, 3):
-        block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
-        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
-                                       block_tokens)
+        block_tokens = tuple(all_token_ids[(block_id - 1) * 16 : block_id * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash, block_tokens)
         blk_hash = manager.block_pool.blocks[block_id].block_hash
         assert blk_hash is not None
         assert get_block_hash(blk_hash) == block_hash
@@ -153,24 +166,23 @@ def test_prefill(hash_fn):
         parent_block_hash = block_hash
 
     # Check partial block metadata
-    for block_id in (4, ):
+    for block_id in (4,):
         assert manager.block_pool.blocks[block_id].block_hash is None
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
 
     # Cache hit in the common prefix when the original block is still in use.
     # Incomplete 1 block (5 tokens)
     unique_token_ids = [3] * 5
-    req1 = make_request("1", common_token_ids + unique_token_ids, block_size,
-                        hash_fn)
+    req1 = make_request("1", common_token_ids + unique_token_ids, block_size, hash_fn)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(req1.block_hashes) == 3
-    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
+    assert computed_blocks.get_block_ids() == ([1, 2, 3],)
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
-    blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([5], )
+    blocks = manager.allocate_slots(
+        req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([5],)
     for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
@@ -189,30 +201,27 @@ def test_prefill(hash_fn):
     # [unique_req1 (5)]
     # [common (3, 2, 1)]
     assert [
-        b.block_id
-        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
+        b.block_id for b in manager.block_pool.free_block_queue.get_all_free_blocks()
     ] == [6, 7, 8, 9, 10, 4, 5, 3, 2, 1]
 
     # Cache hit in the common prefix when the original block is already free.
     # Incomplete 1 block (6 tokens)
     unique_token_ids = [3] * 6
-    req2 = make_request("2", common_token_ids + unique_token_ids, block_size,
-                        hash_fn)
+    req2 = make_request("2", common_token_ids + unique_token_ids, block_size, hash_fn)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(req2.block_hashes) == 3
-    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
+    assert computed_blocks.get_block_ids() == ([1, 2, 3],)
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
-    blocks = manager.allocate_slots(req2, num_new_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([6], )
+    blocks = manager.allocate_slots(
+        req2, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([6],)
 
     # Although we only have 6 free blocks, we have 8 blocks in
     # the free block queue due to lazy removal.
     assert free_block_queue.num_free_blocks == 6
-    assert all(
-        [b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()])
+    assert all([b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()])
     assert len([b for b in free_block_queue.get_all_free_blocks()]) == 6
 
     manager.free(req2)
@@ -222,19 +231,23 @@ def test_prefill(hash_fn):
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req3, 16 * 10,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req3, 16 * 10, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     # This block ID order also checks the eviction order.
-    assert blocks is not None and blocks.get_block_ids() == ([
-        7, 8, 9, 10, 4, 5, 6, 3, 2, 1
-    ], )
+    assert blocks is not None and blocks.get_block_ids() == (
+        [7, 8, 9, 10, 4, 5, 6, 3, 2, 1],
+    )
 
     assert free_block_queue.num_free_blocks == 0
-    assert (free_block_queue.fake_free_list_head.next_free_block
-            is free_block_queue.fake_free_list_tail)
-    assert (free_block_queue.fake_free_list_tail.prev_free_block
-            is free_block_queue.fake_free_list_head)
+    assert (
+        free_block_queue.fake_free_list_head.next_free_block
+        is free_block_queue.fake_free_list_tail
+    )
+    assert (
+        free_block_queue.fake_free_list_tail.prev_free_block
+        is free_block_queue.fake_free_list_head
+    )
 
 
 def test_prefill_hybrid_model():
@@ -259,20 +272,20 @@ def test_prefill_hybrid_model():
     assert len(req0.block_hashes) == 3
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], [
-        5, 6, 7, 8
-    ], [9, 10, 11, 12])
+    blocks = manager.allocate_slots(
+        req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == (
+        [1, 2, 3, 4],
+        [5, 6, 7, 8],
+        [9, 10, 11, 12],
+    )
 
     # Check full block metadata
     parent_block_hash = None
-    for length, block_ids in zip((1, 2, 3),
-                                 ((1, 5, 9), (2, 6, 10), (3, 7, 11))):
-        block_tokens = tuple(all_token_ids[(length - 1) * 16:length * 16])
-        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
-                                       block_tokens)
+    for length, block_ids in zip((1, 2, 3), ((1, 5, 9), (2, 6, 10), (3, 7, 11))):
+        block_tokens = tuple(all_token_ids[(length - 1) * 16 : length * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash, block_tokens)
         for group_id, block_id in enumerate(block_ids):
             blk_hash = manager.block_pool.blocks[block_id].block_hash
             assert blk_hash is not None
@@ -289,17 +302,15 @@ def test_prefill_hybrid_model():
     # Cache hit in the common prefix
     # Incomplete 1 block (5 tokens)
     unique_token_ids = [3] * 5
-    req1 = make_request("1", common_token_ids + unique_token_ids, block_size,
-                        hash_fn)
+    req1 = make_request("1", common_token_ids + unique_token_ids, block_size, hash_fn)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(req1.block_hashes) == 3
-    assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6,
-                                                           7], [0, 10, 11])
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6, 7], [0, 10, 11])
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
-    blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and blocks.get_block_ids() == ([13], [14], [15])
     for block_per_group in computed_blocks.blocks:
         for block in block_per_group:
@@ -311,55 +322,70 @@ def test_prefill_hybrid_model():
     manager.free(req1)
 
     cached_block_hash_to_block_bak = copy.copy(
-        manager.block_pool.cached_block_hash_to_block._cache)
+        manager.block_pool.cached_block_hash_to_block._cache
+    )
 
-    def test_partial_request_hit(request_id: str,
-                                 hash_to_evict: list[BlockHashWithGroupId],
-                                 expect_hit_length: int):
-        req = make_request(request_id, common_token_ids + unique_token_ids,
-                           block_size, sha256)
+    def test_partial_request_hit(
+        request_id: str,
+        hash_to_evict: list[BlockHashWithGroupId],
+        expect_hit_length: int,
+    ):
+        req = make_request(
+            request_id, common_token_ids + unique_token_ids, block_size, sha256
+        )
         for hash_with_group_id in hash_to_evict:
-            manager.block_pool.cached_block_hash_to_block._cache.pop(
-                hash_with_group_id)
+            manager.block_pool.cached_block_hash_to_block._cache.pop(hash_with_group_id)
         computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
         assert len(req.block_hashes) == 3
         assert num_computed_tokens == expect_hit_length * block_size
         for block_per_group in computed_blocks.blocks:
             assert len(block_per_group) == num_computed_tokens // block_size
         for hash_with_group_id in hash_to_evict:
-            manager.block_pool.cached_block_hash_to_block._cache[
-                hash_with_group_id] = cached_block_hash_to_block_bak[
-                    hash_with_group_id]
+            manager.block_pool.cached_block_hash_to_block._cache[hash_with_group_id] = (
+                cached_block_hash_to_block_bak[hash_with_group_id]
+            )
         manager.free(req)
 
     # Evict the blocks outside sliding window, does not affect the hit length.
-    test_partial_request_hit("2", [
-        make_block_hash_with_group_id(block_hashes[0], 1),
-        make_block_hash_with_group_id(block_hashes[0], 2)
-    ], 3)
+    test_partial_request_hit(
+        "2",
+        [
+            make_block_hash_with_group_id(block_hashes[0], 1),
+            make_block_hash_with_group_id(block_hashes[0], 2),
+        ],
+        3,
+    )
 
     # Evict the first block of full attention, makes total cache miss.
     test_partial_request_hit(
-        "3", [make_block_hash_with_group_id(block_hashes[0], 0)], 0)
+        "3", [make_block_hash_with_group_id(block_hashes[0], 0)], 0
+    )
 
     # Evict the last block of all layers, reduces the hit length to 2.
-    test_partial_request_hit("4", [
-        make_block_hash_with_group_id(block_hashes[2], 0),
-        make_block_hash_with_group_id(block_hashes[2], 1),
-        make_block_hash_with_group_id(block_hashes[2], 2),
-    ], 2)
+    test_partial_request_hit(
+        "4",
+        [
+            make_block_hash_with_group_id(block_hashes[2], 0),
+            make_block_hash_with_group_id(block_hashes[2], 1),
+            make_block_hash_with_group_id(block_hashes[2], 2),
+        ],
+        2,
+    )
 
     # Evict the last block of full attention, reduces the hit length to 2.
     test_partial_request_hit(
-        "5", [make_block_hash_with_group_id(block_hashes[2], 0)], 2)
+        "5", [make_block_hash_with_group_id(block_hashes[2], 0)], 2
+    )
 
     # Evict the last block of sliding window, reduces the hit length to 2.
     test_partial_request_hit(
-        "6", [make_block_hash_with_group_id(block_hashes[2], 1)], 2)
+        "6", [make_block_hash_with_group_id(block_hashes[2], 1)], 2
+    )
 
     # Evict the last block of sliding window, reduces the hit length to 2.
     test_partial_request_hit(
-        "7", [make_block_hash_with_group_id(block_hashes[2], 2)], 2)
+        "7", [make_block_hash_with_group_id(block_hashes[2], 2)], 2
+    )
 
     # Evict different set of blocks for full attention and sliding window makes
     # total cache miss.
@@ -367,20 +393,24 @@ def test_prefill_hybrid_model():
     # The cache hit length of sliding window is 2 * block_size.
     # Then it is cache miss as the two type of layers
     # have different hit length.
-    test_partial_request_hit("8", [
-        make_block_hash_with_group_id(block_hashes[2], 0),
-        make_block_hash_with_group_id(block_hashes[0], 1),
-        make_block_hash_with_group_id(block_hashes[0], 2),
-    ], 0)
+    test_partial_request_hit(
+        "8",
+        [
+            make_block_hash_with_group_id(block_hashes[2], 0),
+            make_block_hash_with_group_id(block_hashes[0], 1),
+            make_block_hash_with_group_id(block_hashes[0], 2),
+        ],
+        0,
+    )
 
 
 def test_prefill_plp():
-    '''Test prefill with APC and some prompt logprobs (plp) requests.
+    """Test prefill with APC and some prompt logprobs (plp) requests.
 
     1. Schedule plp request and validate APC block allocation
     2. Schedule non-plp request and validate blocks
     3. Schedule plp request; no hit should occur; validate blocks
-    '''
+    """
     block_size = 16
     manager = KVCacheManager(
         make_kv_cache_config(block_size, 11),
@@ -398,28 +428,23 @@ def test_prefill_plp():
     # Incomplete 1 block (7 tokens)
     unique_token_ids = [3] * 7
     all_token_ids = common_token_ids + unique_token_ids
-    req0 = make_request("0",
-                        all_token_ids,
-                        block_size,
-                        hash_fn,
-                        prompt_logprobs=5)
+    req0 = make_request("0", all_token_ids, block_size, hash_fn, prompt_logprobs=5)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert len(req0.block_hashes) == 3
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], )
+    blocks = manager.allocate_slots(
+        req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4],)
     req0_block_hashes = [b.block_hash for b in blocks.blocks[0]]
 
     # Check full block metadata
     parent_block_hash = None
     for block_id in (1, 2, 3):
-        block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
-        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
-                                       block_tokens)
-        blk_hash = (manager.block_pool.blocks[block_id].block_hash)
+        block_tokens = tuple(all_token_ids[(block_id - 1) * 16 : block_id * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash, block_tokens)
+        blk_hash = manager.block_pool.blocks[block_id].block_hash
         assert blk_hash is not None
         assert get_block_hash(blk_hash) == block_hash
         assert get_group_id(blk_hash) == 0
@@ -427,7 +452,7 @@ def test_prefill_plp():
         parent_block_hash = block_hash
 
     # Check partial block metadata
-    for block_id in (4, ):
+    for block_id in (4,):
         assert manager.block_pool.blocks[block_id].block_hash is None
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
 
@@ -435,17 +460,16 @@ def test_prefill_plp():
     # Cache hit in the common prefix when the original block is still in use.
     # Incomplete 1 block (5 tokens)
     unique_token_ids = [3] * 5
-    req1 = make_request("1", common_token_ids + unique_token_ids, block_size,
-                        hash_fn)
+    req1 = make_request("1", common_token_ids + unique_token_ids, block_size, hash_fn)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(req1.block_hashes) == 3
-    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
+    assert computed_blocks.get_block_ids() == ([1, 2, 3],)
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
-    blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([5], )
+    blocks = manager.allocate_slots(
+        req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([5],)
     for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
@@ -463,30 +487,27 @@ def test_prefill_plp():
     # [unique_req1 (5)]
     # [common (3, 2, 1)]
     assert [
-        b.block_id
-        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
+        b.block_id for b in manager.block_pool.free_block_queue.get_all_free_blocks()
     ] == [6, 7, 8, 9, 10, 4, 5, 3, 2, 1]
 
     # Request #2 is a prompt-logprobs request:
     # NO cache hit in the common prefix; duplicates request #0 cached blocks
     unique_token_ids = [3] * 6
-    req2 = make_request("2",
-                        common_token_ids + unique_token_ids,
-                        block_size,
-                        hash_fn,
-                        prompt_logprobs=5)
+    req2 = make_request(
+        "2", common_token_ids + unique_token_ids, block_size, hash_fn, prompt_logprobs=5
+    )
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(req2.block_hashes) == 3
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req2, 55,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req2, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None
     block_ids = blocks.get_block_ids()
     # Duplicate cached blocks have different ids but same hashes vs request #0
     assert [b.block_hash for b in blocks.blocks[0]] == req0_block_hashes
-    assert block_ids != ([1, 2, 3, 4], )
+    assert block_ids != ([1, 2, 3, 4],)
 
     # Request #2 block hashes are valid since request #0 hashes are.
     # Check block reference counts.
@@ -510,26 +531,29 @@ def test_decode():
     # Fully cache miss
     # Incomplete 1 block (7 tokens)
     unique_token_ids = [3] * 7
-    req0 = make_request("0", common_token_ids + unique_token_ids, block_size,
-                        sha256)
+    req0 = make_request("0", common_token_ids + unique_token_ids, block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], )
+    blocks = manager.allocate_slots(
+        req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4],)
 
     # Append slots without allocating a new block.
     req0.num_computed_tokens = 55
     for _ in range(4):
         req0.append_output_token_ids(8)
-    new_blocks = manager.allocate_slots(req0, 4,
-                                        len(computed_blocks.blocks[0]) * 16,
-                                        computed_blocks)
+    new_blocks = manager.allocate_slots(
+        req0, 4, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
-    assert manager.coordinator.single_type_managers[0].req_to_blocks[
-        req0.request_id][-1].block_hash is None
+    assert (
+        manager.coordinator.single_type_managers[0]
+        .req_to_blocks[req0.request_id][-1]
+        .block_hash
+        is None
+    )
 
     # Append slots with allocating a new block.
     req0.num_computed_tokens = 59
@@ -537,14 +561,22 @@ def test_decode():
     # the preallocated block.
     for _ in range(9 + 10):
         req0.append_output_token_ids(7)
-    new_blocks = manager.allocate_slots(req0, 19,
-                                        len(computed_blocks.blocks[0]) * 16,
-                                        computed_blocks)
+    new_blocks = manager.allocate_slots(
+        req0, 19, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert new_blocks is not None and len(new_blocks.blocks[0]) == 1
-    assert manager.coordinator.single_type_managers[0].req_to_blocks[
-        req0.request_id][-2].block_hash is not None
-    assert manager.coordinator.single_type_managers[0].req_to_blocks[
-        req0.request_id][-1].block_hash is None
+    assert (
+        manager.coordinator.single_type_managers[0]
+        .req_to_blocks[req0.request_id][-2]
+        .block_hash
+        is not None
+    )
+    assert (
+        manager.coordinator.single_type_managers[0]
+        .req_to_blocks[req0.request_id][-1]
+        .block_hash
+        is None
+    )
 
 
 def test_evict():
@@ -560,22 +592,22 @@ def test_evict():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req0, 5 * 16 + 7,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req0, 5 * 16 + 7, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     # 5 full + 1 partial
     assert blocks is not None and len(blocks.blocks[0]) == 6
 
     # 3 blocks.
-    req1 = make_request("1", list(range(last_token_id,
-                                        last_token_id + 3 * 16)), block_size,
-                        sha256)
+    req1 = make_request(
+        "1", list(range(last_token_id, last_token_id + 3 * 16)), block_size, sha256
+    )
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req1, 3 * 16,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req1, 3 * 16, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 3  # 3 full blocks
     last_token_id += 3 * 16
 
@@ -586,19 +618,18 @@ def test_evict():
     manager.free(req1)
     assert manager.block_pool.free_block_queue.num_free_blocks == 10
     assert [
-        b.block_id
-        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
+        b.block_id for b in manager.block_pool.free_block_queue.get_all_free_blocks()
     ] == [10, 6, 5, 4, 3, 2, 1, 9, 8, 7]
 
     # Touch the first 2 blocks.
     req2 = make_request("2", list(range(2 * 16 + 3)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert computed_blocks.get_block_ids() == ([1, 2], )
+    assert computed_blocks.get_block_ids() == ([1, 2],)
     assert num_computed_tokens == 2 * 16
-    blocks = manager.allocate_slots(req2, 3,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([10], )
+    blocks = manager.allocate_slots(
+        req2, 3, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([10],)
     assert manager.block_pool.free_block_queue.num_free_blocks == 7
 
 
@@ -620,9 +651,9 @@ def test_hash_block_correct_reuse():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req, num_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req, num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 1
 
     # Deallocate the block.
@@ -634,13 +665,12 @@ def test_hash_block_correct_reuse():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req, num_tokens - 1,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req, num_tokens - 1, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 1
 
-    assert manager.block_pool.blocks[blocks.blocks[0]
-                                     [0].block_id].block_hash is None
+    assert manager.block_pool.blocks[blocks.blocks[0][0].block_id].block_hash is None
 
 
 def test_computed_blocks_not_evicted():
@@ -661,21 +691,22 @@ def test_computed_blocks_not_evicted():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req0, num_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req0, num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 1
     assert blocks.blocks[0][0].block_id == 1
 
     # Allocate another block.
-    req1 = make_request("1", list(range(num_tokens, num_tokens * 2)),
-                        block_size, sha256)
+    req1 = make_request(
+        "1", list(range(num_tokens, num_tokens * 2)), block_size, sha256
+    )
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req1, num_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req1, num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 1
     assert blocks.blocks[0][0].block_id == 2
 
@@ -691,9 +722,12 @@ def test_computed_blocks_not_evicted():
     assert computed_blocks.blocks[0][0].block_id == 1
     assert num_computed_tokens == block_size
 
-    blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req2,
+        num_tokens * 2 - num_tokens,
+        len(computed_blocks.blocks[0]) * 16,
+        computed_blocks,
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 1
     assert blocks.blocks[0][0].block_id == 2
 
@@ -709,29 +743,29 @@ def test_basic_prefix_caching_disabled():
         enable_caching=False,
     )
 
-    req1 = make_request("1", list(range(10)), block_size,
-                        sha256)  # 2 blocks and some more
+    req1 = make_request(
+        "1", list(range(10)), block_size, sha256
+    )  # 2 blocks and some more
 
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req1, 10,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req1, 10, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 3
 
     # Free the blocks.
     manager.free(req1)
 
     # No caching.
-    req2 = make_request("2", list(range(16)), block_size,
-                        sha256)  # shared prefix
+    req2 = make_request("2", list(range(16)), block_size, sha256)  # shared prefix
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req2, 16,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req2, 16, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None and len(blocks.blocks[0]) == 4
 
     # New requests should not have any blocks.
@@ -739,9 +773,9 @@ def test_basic_prefix_caching_disabled():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    blocks = manager.allocate_slots(req3, 4,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req3, 4, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert not blocks
 
 
@@ -840,24 +874,41 @@ def test_cache_blocks_multi_group():
     # Block hash 1: hit for group 0 and 1
     # Block hash 2: hit for group 1
 
-    assert block_pool.get_cached_block(req.block_hashes[0],
-                                       kv_cache_group_ids=[0]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[1],
-                                       kv_cache_group_ids=[0]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[2],
-                                       kv_cache_group_ids=[0]) is None
-    assert block_pool.get_cached_block(req.block_hashes[0],
-                                       kv_cache_group_ids=[1]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[1],
-                                       kv_cache_group_ids=[1]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[2],
-                                       kv_cache_group_ids=[1]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[0],
-                                       kv_cache_group_ids=[0, 1]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[1],
-                                       kv_cache_group_ids=[0, 1]) is not None
-    assert block_pool.get_cached_block(req.block_hashes[2],
-                                       kv_cache_group_ids=[0, 1]) is None
+    assert (
+        block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[0])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[0])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[0]) is None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[1])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[1])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[1])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[0, 1])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[0, 1])
+        is not None
+    )
+    assert (
+        block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[0, 1])
+        is None
+    )
 
 
 def test_mm_prefix_caching():
@@ -887,16 +938,16 @@ def test_mm_prefix_caching():
     # A unique image plus some text tokens.
     unique_token_ids = [-1] * 7 + [100] * 4
     all_token_ids = common_token_ids + unique_token_ids
-    mm_positions = common_mm_positions + [
-        PlaceholderRange(offset=48, length=7)
-    ]
+    mm_positions = common_mm_positions + [PlaceholderRange(offset=48, length=7)]
     mm_hashes = common_mm_hashes + ["ccc"]
-    req0 = make_request("0",
-                        all_token_ids,
-                        block_size,
-                        sha256,
-                        mm_positions=mm_positions,
-                        mm_hashes=mm_hashes)
+    req0 = make_request(
+        "0",
+        all_token_ids,
+        block_size,
+        sha256,
+        mm_positions=mm_positions,
+        mm_hashes=mm_hashes,
+    )
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
     # Completed block should have hashes
@@ -905,47 +956,55 @@ def test_mm_prefix_caching():
     block_hashes = req0.block_hashes
     assert len(block_hashes) == 3
     assert block_hashes[0] == sha256(
-        (kv_cache_utils.NONE_HASH, tuple(all_token_ids[:block_size]),
-         ("aaa", )))
+        (kv_cache_utils.NONE_HASH, tuple(all_token_ids[:block_size]), ("aaa",))
+    )
     assert block_hashes[1] == sha256(
-        (block_hashes[0], tuple(all_token_ids[block_size:block_size * 2]),
-         ("aaa", "bbb")))
+        (
+            block_hashes[0],
+            tuple(all_token_ids[block_size : block_size * 2]),
+            ("aaa", "bbb"),
+        )
+    )
     assert block_hashes[2] == sha256(
-        (block_hashes[1], tuple(all_token_ids[block_size * 2:block_size * 3]),
-         ("bbb", )))
+        (
+            block_hashes[1],
+            tuple(all_token_ids[block_size * 2 : block_size * 3]),
+            ("bbb",),
+        )
+    )
 
-    blocks = manager.allocate_slots(req0, 59,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req0, 59, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None
-    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
+    assert blocks.get_block_ids() == ([1, 2, 3, 4],)
     req0.num_computed_tokens = 59
 
     # Append slots without allocating a new block.
     for _ in range(5):
         req0.append_output_token_ids(8)
-    new_blocks = manager.allocate_slots(req0, 5,
-                                        len(computed_blocks.blocks[0]) * 16,
-                                        computed_blocks)
+    new_blocks = manager.allocate_slots(
+        req0, 5, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
     assert len(block_hashes) == 4
     assert block_hashes[3] == sha256(
-        (block_hashes[2], tuple(all_token_ids[3 * block_size:] + [8] * 5),
-         ("ccc", )))
+        (block_hashes[2], tuple(all_token_ids[3 * block_size :] + [8] * 5), ("ccc",))
+    )
 
     # Cache hit.
     unique_token_ids = [-1] * 7 + [200] * 5
     all_token_ids = common_token_ids + unique_token_ids
-    mm_positions = common_mm_positions + [
-        PlaceholderRange(offset=48, length=7)
-    ]
+    mm_positions = common_mm_positions + [PlaceholderRange(offset=48, length=7)]
     mm_hashes = common_mm_hashes + ["ccc"]
-    req1 = make_request("1",
-                        all_token_ids,
-                        block_size,
-                        sha256,
-                        mm_positions=mm_positions,
-                        mm_hashes=mm_hashes)
+    req1 = make_request(
+        "1",
+        all_token_ids,
+        block_size,
+        sha256,
+        mm_positions=mm_positions,
+        mm_hashes=mm_hashes,
+    )
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(computed_blocks.blocks[0]) == 3
     assert num_computed_tokens == 3 * 16
@@ -975,30 +1034,33 @@ def test_cache_key_salting():
     block_hashes = req0.block_hashes
     assert len(block_hashes) == 3
     assert block_hashes[0] == sha256(
-        (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt1", )))
+        (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt1",))
+    )
     assert block_hashes[1] == sha256(
-        (block_hashes[0], tuple(token_ids[block_size:block_size * 2]), None))
+        (block_hashes[0], tuple(token_ids[block_size : block_size * 2]), None)
+    )
     assert block_hashes[2] == sha256(
-        (block_hashes[1], tuple(token_ids[block_size * 2:block_size * 3]),
-         None))
+        (block_hashes[1], tuple(token_ids[block_size * 2 : block_size * 3]), None)
+    )
 
-    blocks = manager.allocate_slots(req0, 59,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
+    blocks = manager.allocate_slots(
+        req0, 59, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert blocks is not None
-    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
+    assert blocks.get_block_ids() == ([1, 2, 3, 4],)
     req0.num_computed_tokens = 59
 
     # Append slots without allocating a new block.
     for _ in range(5):
         req0.append_output_token_ids(8)
-    new_blocks = manager.allocate_slots(req0, 5,
-                                        len(computed_blocks.blocks[0]) * 16,
-                                        computed_blocks)
+    new_blocks = manager.allocate_slots(
+        req0, 5, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
     assert len(block_hashes) == 4
     assert block_hashes[3] == sha256(
-        (block_hashes[2], tuple(token_ids[3 * block_size:] + [8] * 5), None))
+        (block_hashes[2], tuple(token_ids[3 * block_size :] + [8] * 5), None)
+    )
 
     # Test cache hit with a new request that has the same salt.
     token_ids = common_token_ids + [4] * 11
@@ -1017,12 +1079,14 @@ def test_cache_key_salting():
     block_hashes = req2.block_hashes
     assert len(block_hashes) == 3
     assert block_hashes[0] == sha256(
-        (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt2", )))
+        (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt2",))
+    )
     assert block_hashes[1] == sha256(
-        (block_hashes[0], tuple(token_ids[block_size:block_size * 2]), None))
+        (block_hashes[0], tuple(token_ids[block_size : block_size * 2]), None)
+    )
     assert block_hashes[2] == sha256(
-        (block_hashes[1], tuple(token_ids[block_size * 2:block_size * 3]),
-         None))
+        (block_hashes[1], tuple(token_ids[block_size * 2 : block_size * 3]), None)
+    )
 
 
 def test_prefill_not_enough_free_blocks_with_computed_blocks():
@@ -1045,22 +1109,24 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    manager.allocate_slots(req0, 48,
-                           len(computed_blocks.blocks[0]) * 16,
-                           computed_blocks)
+    manager.allocate_slots(
+        req0, 48, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     block_part0 = manager.coordinator.single_type_managers[0].req_to_blocks[
-        req0.request_id]
+        req0.request_id
+    ]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
     req1 = make_request("1", common_token_ids * 2, block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert computed_blocks.blocks[0] == block_part0
     assert num_computed_tokens == 3 * 16
-    manager.allocate_slots(req1, 48,
-                           len(computed_blocks.blocks[0]) * 16,
-                           computed_blocks)
+    manager.allocate_slots(
+        req1, 48, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     block_part1 = manager.coordinator.single_type_managers[0].req_to_blocks[
-        req1.request_id]
+        req1.request_id
+    ]
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| ... |
     manager.free(req1)
@@ -1073,9 +1139,12 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    manager.allocate_slots(req2, block_size * 2,
-                           len(computed_blocks.blocks[0]) * block_size,
-                           computed_blocks)
+    manager.allocate_slots(
+        req2,
+        block_size * 2,
+        len(computed_blocks.blocks[0]) * block_size,
+        computed_blocks,
+    )
 
     # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
     # but it cannot be allocated due to insufficient free blocks (2).
@@ -1086,9 +1155,12 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     assert computed_blocks.blocks[0] == block_part1
     assert num_computed_tokens == 6 * 16
     # Req3 cannot be allocated.
-    assert manager.allocate_slots(req3, 48,
-                                  len(computed_blocks.blocks[0]) * 16,
-                                  computed_blocks) is None
+    assert (
+        manager.allocate_slots(
+            req3, 48, len(computed_blocks.blocks[0]) * 16, computed_blocks
+        )
+        is None
+    )
     # Block 0-2 are used by Req 1.
     assert {block.ref_cnt for block in block_part1[:3]} == {1}
     # Block 3-5 are free.
@@ -1108,7 +1180,7 @@ def test_reset_prefix_cache():
     all_token_ids = full_block_token_ids + unique_token_ids
     req0 = make_request("0", all_token_ids, block_size, sha256)
     blocks = manager.allocate_slots(req0, 55)
-    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], )
+    assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4],)
 
     unique_token_ids = [4] * 7
     all_token_ids = full_block_token_ids + unique_token_ids
@@ -1116,10 +1188,10 @@ def test_reset_prefix_cache():
     computed_blocks, _ = manager.get_computed_blocks(req1)
     assert len(req1.block_hashes) == 3
     assert len(computed_blocks.blocks[0]) == 3
-    blocks = manager.allocate_slots(req1, 7,
-                                    len(computed_blocks.blocks[0]) * 16,
-                                    computed_blocks)
-    assert blocks is not None and blocks.get_block_ids() == ([5], )
+    blocks = manager.allocate_slots(
+        req1, 7, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
+    assert blocks is not None and blocks.get_block_ids() == ([5],)
 
     # Failed to reset prefix cache because some blocks are not freed yet.
     assert not manager.reset_prefix_cache()
@@ -1150,9 +1222,9 @@ def test_prefix_cache_stats_disabled():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
-    manager.allocate_slots(req, 16,
-                           len(computed_blocks.blocks[0]) * 16,
-                           computed_blocks)
+    manager.allocate_slots(
+        req, 16, len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     manager.reset_prefix_cache()
 
     # Ensure prefix_cache_stats remains None
@@ -1189,19 +1261,14 @@ def test_maybe_evict_cached_block():
     # Evict block1
     pool._maybe_evict_cached_block(block1)
     assert pool.cached_block_hash_to_block._cache == {
-        block_hash0: {
-            block0.block_id: block0,
-            block3.block_id: block3
-        },
+        block_hash0: {block0.block_id: block0, block3.block_id: block3},
         block_hash2: block2,
     }
     # Evict block0: block_hash0 entry should NOT be removed, as block3
     # also use the same hash
     pool._maybe_evict_cached_block(block0)
     assert pool.cached_block_hash_to_block._cache == {
-        block_hash0: {
-            block3.block_id: block3
-        },
+        block_hash0: {block3.block_id: block3},
         block_hash2: block2,
     }
     # Evict block2
@@ -1234,8 +1301,11 @@ def test_kv_cache_events(blocks_to_cache: int):
     events = manager.take_events()
 
     block = events[-1]
-    assert (len(block.block_hashes) == blocks_to_cache == len(
-        manager.block_pool.cached_block_hash_to_block))
+    assert (
+        len(block.block_hashes)
+        == blocks_to_cache
+        == len(manager.block_pool.cached_block_hash_to_block)
+    )
     assert len(block.token_ids) == block.block_size * len(block.block_hashes)
     assert len(manager.block_pool.kv_event_queue) == 0
 
@@ -1252,9 +1322,12 @@ def test_kv_cache_events(blocks_to_cache: int):
     for blocks in events[:-1]:
         assert blocks.block_hashes[0] in stored_block_hash
     assert len(events) == blocks_to_cache + 1
-    assert (isinstance(events[-2], BlockRemoved))
-    assert (len(events[-1].block_hashes) == blocks_to_cache == len(
-        manager.block_pool.cached_block_hash_to_block))
+    assert isinstance(events[-2], BlockRemoved)
+    assert (
+        len(events[-1].block_hashes)
+        == blocks_to_cache
+        == len(manager.block_pool.cached_block_hash_to_block)
+    )
 
     # All Blocks Cleared
     # Should see a single all blocks cleared event
@@ -1266,8 +1339,65 @@ def test_kv_cache_events(blocks_to_cache: int):
     assert len(manager.block_pool.cached_block_hash_to_block) == 0
 
 
+@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
+def test_kv_cache_events_with_lora(blocks_to_cache: int):
+    """Test BlockStored events contain correct lora_id when using LoRA requests."""
+    block_size = 16
+    num_blocks = blocks_to_cache + 1
+
+    # Create KVCacheManager with events enabled
+    manager = KVCacheManager(
+        make_kv_cache_config(block_size, num_blocks),
+        max_model_len=8192,
+        enable_caching=True,
+        enable_kv_cache_events=True,
+    )
+
+    # Test with LoRA request
+    lora_request = LoRARequest(
+        lora_name="test_lora", lora_int_id=42, lora_path="/test/path"
+    )
+
+    num_tokens = block_size * blocks_to_cache
+    req_with_lora = make_request(
+        "lora_req",
+        list(range(num_tokens)),
+        block_size,
+        sha256,
+        lora_request=lora_request,
+    )
+
+    # Allocate slots and get events
+    _ = manager.allocate_slots(req_with_lora, num_tokens)
+    events = manager.take_events()
+
+    # Verify BlockStored event contains correct lora_id
+    block_stored_event = events[-1]
+    assert isinstance(block_stored_event, BlockStored)
+    assert block_stored_event.lora_id == 42  # Should match lora_request.adapter_id
+    assert len(block_stored_event.block_hashes) == blocks_to_cache
+    assert block_stored_event.block_size == block_size
+
+    # Clean up
+    manager.free(req_with_lora)
+
+    # Test without LoRA request (should have lora_id=None)
+    req_without_lora = make_request(
+        "no_lora_req", list(range(num_tokens)), block_size, sha256
+    )
+
+    _ = manager.allocate_slots(req_without_lora, num_tokens)
+    events = manager.take_events()
+
+    block_stored_event = events[-1]
+    assert isinstance(block_stored_event, BlockStored)
+    assert block_stored_event.lora_id is None  # Should be None when no LoRA request
+    assert len(block_stored_event.block_hashes) == blocks_to_cache
+    assert block_stored_event.block_size == block_size
+
+
 def test_eagle_enabled_removes_last_block():
-    """Verify Eagle does NOT remove blocks when request 
+    """Verify Eagle does NOT remove blocks when request
     length is divisible by block size."""
     block_size = 16
     manager = KVCacheManager(
@@ -1283,9 +1413,9 @@ def test_eagle_enabled_removes_last_block():
 
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
-    manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks[0]) * 16,
-                           computed_blocks)
+    manager.allocate_slots(
+        req, len(token_ids), len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     manager.free(req)
 
     # New request with same tokens + Eagle enabled
@@ -1314,9 +1444,9 @@ def test_eagle_with_partial_blocks():
 
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
-    manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks[0]) * 16,
-                           computed_blocks)
+    manager.allocate_slots(
+        req, len(token_ids), len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     manager.free(req)
 
     # New request with Eagle enabled
@@ -1341,7 +1471,7 @@ def test_eagle_with_sliding_window():
         KVCacheConfig(
             num_blocks=10,
             kv_cache_tensors=[],
-            kv_cache_groups=[KVCacheGroupSpec(['layer'], sliding_window_spec)],
+            kv_cache_groups=[KVCacheGroupSpec(["layer"], sliding_window_spec)],
         ),
         max_model_len=8192,
         enable_caching=True,
@@ -1354,9 +1484,9 @@ def test_eagle_with_sliding_window():
 
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
-    manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks[0]) * 16,
-                           computed_blocks)
+    manager.allocate_slots(
+        req, len(token_ids), len(computed_blocks.blocks[0]) * 16, computed_blocks
+    )
     # record the block hash of the first block in the request for later use
     block_hash_first_block = req.block_hashes[0]
     assert block_hash_first_block is not None
@@ -1370,14 +1500,20 @@ def test_eagle_with_sliding_window():
     assert num_tokens == 1 * block_size
 
     # Evict the first block in the request
-    assert manager.block_pool.get_cached_block(
-        block_hash_first_block, kv_cache_group_ids=[0]) is not None
+    assert (
+        manager.block_pool.get_cached_block(
+            block_hash_first_block, kv_cache_group_ids=[0]
+        )
+        is not None
+    )
     manager.block_pool.cached_block_hash_to_block._cache.pop(
-        make_block_hash_with_group_id(block_hash_first_block, 0))
+        make_block_hash_with_group_id(block_hash_first_block, 0)
+    )
 
     # New request
-    req_after_evict = make_request("partial_eagle_after_evict", token_ids,
-                                   block_size, sha256)
+    req_after_evict = make_request(
+        "partial_eagle_after_evict", token_ids, block_size, sha256
+    )
     computed_blocks, num_tokens = manager.get_computed_blocks(req_after_evict)
     # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is
     # not considered. But after dropping the last matched block due to eagle,
diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0b703302e3836893c632b2b35cf154a63608ed
--- /dev/null
+++ b/tests/v1/core/test_priority_scheduler_random.py
@@ -0,0 +1,254 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import uuid
+
+import numpy as np
+import pytest
+
+from vllm.config import VllmConfig
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
+from vllm.sampling_params import SamplingParams
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
+from vllm.v1.request import Request
+
+from .test_scheduler import create_scheduler_with_priority
+from .utils import EOS_TOKEN_ID
+
+pytestmark = pytest.mark.cpu_test
+
+
+def _create_random_request(
+    max_tokens_range: tuple[int, int],
+    num_tokens_range: tuple[int, int],
+    arrival_time_range: tuple[float, float],
+    priority_range: tuple[int, int],
+    num_mm_item_range: tuple[int, int],
+    vllm_config: VllmConfig,
+):
+    max_tokens = random.randint(*max_tokens_range)
+    num_tokens = random.randint(*num_tokens_range)
+    priority = random.randint(*priority_range)
+    arrival_time = random.uniform(*arrival_time_range)
+    num_mm_item = random.randint(*num_mm_item_range)
+
+    mm_positions: list[PlaceholderRange] = []
+    for mm_start in sorted(
+        random.sample(range(num_tokens), min(num_mm_item, num_tokens))
+    ):
+        if mm_start + 10 > num_tokens:
+            continue
+        mm_positions.append(PlaceholderRange(offset=mm_start, length=10))
+
+    request_id = uuid.uuid4().hex
+
+    sampling_params = SamplingParams(
+        ignore_eos=False,
+        max_tokens=max_tokens,
+    )
+    mm_features = []
+    for j, position in enumerate(mm_positions):
+        identifier = f"{request_id}_hash_{j}"
+        mm_feature = MultiModalFeatureSpec(
+            data=MultiModalKwargsItem.dummy("dummy_m"),
+            mm_position=position,
+            identifier=identifier,
+            modality="image",
+        )
+        mm_features.append(mm_feature)
+
+    prompt_token_ids = random.choices(range(100), k=num_tokens)
+
+    caching_hash_fn = get_hash_fn_by_name(
+        vllm_config.cache_config.prefix_caching_hash_algo
+    )
+    init_none_hash(caching_hash_fn)
+    block_hasher = get_request_block_hasher(
+        vllm_config.cache_config.block_size, caching_hash_fn
+    )
+
+    request = Request(
+        request_id=request_id,
+        prompt_token_ids=prompt_token_ids,
+        sampling_params=sampling_params,
+        pooling_params=None,
+        mm_features=mm_features if mm_features else None,
+        eos_token_id=EOS_TOKEN_ID,
+        arrival_time=arrival_time,
+        priority=priority,
+        block_hasher=block_hasher,
+    )
+    return request
+
+
+def _mock_execute_model(
+    scheduler_output: SchedulerOutput, num_output_tokens_range: tuple[int, int]
+) -> ModelRunnerOutput:
+    request_ids: list[str] = []
+    request_ids.extend(req.req_id for req in scheduler_output.scheduled_new_reqs)
+    request_ids.extend(scheduler_output.scheduled_cached_reqs.req_ids)
+    random.shuffle(request_ids)
+
+    num_output_tokens = [
+        random.randint(*num_output_tokens_range) for _ in range(len(request_ids))
+    ]
+    sampled_token_ids = [
+        np.random.randint(0, 100, size=num_tokens) for num_tokens in num_output_tokens
+    ]
+
+    return ModelRunnerOutput(
+        req_ids=request_ids,
+        req_id_to_index={req_id: i for i, req_id in enumerate(request_ids)},
+        sampled_token_ids=sampled_token_ids,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+
+
+def _mock_draft_token_ids(
+    scheduler_output: SchedulerOutput,
+    num_output_tokens_range: tuple[int, int],
+    seen_request_prompt_length: dict[str, int],
+) -> DraftTokenIds:
+    request_ids: list[str] = []
+    sampled_token_ids: list[list[int]] = []
+    for request in scheduler_output.scheduled_new_reqs:
+        assert request.req_id not in seen_request_prompt_length
+        seen_request_prompt_length[request.req_id] = len(request.prompt_token_ids or [])
+        if request.num_computed_tokens >= seen_request_prompt_length[request.req_id]:
+            num_tokens = random.randint(*num_output_tokens_range)
+            request_ids.append(request.req_id)
+            sampled_token_ids.append(
+                [random.randint(0, 100) for _ in range(num_tokens)]
+            )
+    for req_id, num_computed_tokens in zip(
+        scheduler_output.scheduled_cached_reqs.req_ids,
+        scheduler_output.scheduled_cached_reqs.num_computed_tokens,
+    ):
+        if num_computed_tokens >= seen_request_prompt_length[req_id]:
+            num_tokens = random.randint(*num_output_tokens_range)
+            request_ids.append(req_id)
+            sampled_token_ids.append(
+                [random.randint(0, 100) for _ in range(num_tokens)]
+            )
+    return DraftTokenIds(req_ids=request_ids, draft_token_ids=sampled_token_ids)
+
+
+def _chech_valid_scheduler_output(
+    scheduler_output: SchedulerOutput,
+    seen_request_ids: set[str],
+    seen_mm_hashes: set[str],
+):
+    for req in scheduler_output.scheduled_new_reqs:
+        assert req.req_id not in seen_request_ids
+        seen_request_ids.add(req.req_id)
+    for req_id in scheduler_output.scheduled_cached_reqs.req_ids:
+        assert req_id in seen_request_ids
+
+    req_ids = set[str]()
+    req_ids.update(req.req_id for req in scheduler_output.scheduled_new_reqs)
+    req_ids.update(scheduler_output.scheduled_cached_reqs.req_ids)
+
+    assert set(scheduler_output.num_scheduled_tokens.keys()) == req_ids
+    assert (
+        sum(scheduler_output.num_scheduled_tokens.values())
+        == scheduler_output.total_num_scheduled_tokens
+    )
+
+    assert set(scheduler_output.scheduled_spec_decode_tokens.keys()) <= req_ids
+    assert set(scheduler_output.scheduled_encoder_inputs.keys()) <= req_ids
+
+    for req in scheduler_output.scheduled_new_reqs:
+        for mm_feature in req.mm_features:
+            seen_mm_hashes.add(mm_feature.identifier)
+    for mm_hash in scheduler_output.free_encoder_mm_hashes:
+        assert mm_hash in seen_mm_hashes
+
+    assert scheduler_output.finished_req_ids <= seen_request_ids
+
+
+@pytest.mark.parametrize("enable_prefix_caching", [True, False])
+@pytest.mark.parametrize("num_speculative_tokens", [None, 1, 5])
+@pytest.mark.parametrize(
+    ("max_input_tokens", "max_output_tokens", "max_num_seqs", "num_blocks"),
+    [
+        # Standard profile
+        (5000, 500, 256, 10000),
+        # Generation heavy + high max_num_seqs + low num_blocks -> Many preemptions
+        (500, 5000, 1024, 1000),
+    ],
+    ids=["standard", "preemption"],
+)
+def test_priority_scheduling_blast(
+    enable_prefix_caching: bool,
+    num_speculative_tokens: int | None,
+    max_input_tokens: int,
+    max_output_tokens: int,
+    max_num_seqs: int,
+    num_blocks: int,
+):
+    random.seed(42)
+    np.random.seed(42)
+
+    seen_request_prompt_length = dict[str, int]()
+    seen_request_ids = set[str]()
+    seen_mm_hashes = set[str]()
+
+    scheduler = create_scheduler_with_priority(
+        model="Qwen/Qwen2.5-VL-3B-Instruct",
+        max_num_seqs=max_num_seqs,
+        enable_prefix_caching=enable_prefix_caching,
+        num_blocks=num_blocks,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+
+    num_initial_requests = 10
+    for _ in range(num_initial_requests):
+        req = _create_random_request(
+            max_tokens_range=(1, max_output_tokens),
+            num_tokens_range=(1, max_input_tokens),
+            arrival_time_range=(0, 1),
+            priority_range=(-3, 3),
+            num_mm_item_range=(0, 2),
+            vllm_config=scheduler.vllm_config,
+        )
+        scheduler.add_request(req)
+
+    for _ in range(20000):
+        if len(scheduler.waiting) == 0:
+            num_new_requests = random.randint(0, 2)
+            for _ in range(num_new_requests):
+                req = _create_random_request(
+                    max_tokens_range=(1, max_output_tokens),
+                    num_tokens_range=(1, max_input_tokens),
+                    arrival_time_range=(0, 1),
+                    priority_range=(-3, 3),
+                    num_mm_item_range=(0, 2),
+                    vllm_config=scheduler.vllm_config,
+                )
+                scheduler.add_request(req)
+        scheduler_output = scheduler.schedule()
+        _chech_valid_scheduler_output(
+            scheduler_output, seen_request_ids, seen_mm_hashes
+        )
+        model_output = _mock_execute_model(
+            scheduler_output,
+            num_output_tokens_range=(1, 1 + (num_speculative_tokens or 0)),
+        )
+        scheduler.update_from_output(scheduler_output, model_output)
+        if num_speculative_tokens is not None:
+            scheduler.update_draft_token_ids(
+                _mock_draft_token_ids(
+                    scheduler_output,
+                    (0, num_speculative_tokens),
+                    seen_request_prompt_length,
+                )
+            )
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index f6fc1e6d37d142d879dc03cc6c7c0f8ec2c0c4e5..0570c0854c678a2232258fc1b29a3ac1f6b73d06 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1,26 +1,44 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+import dataclasses
 from unittest.mock import Mock
 
+import numpy as np
 import pytest
 import torch
 
-from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
-                         SchedulerConfig, SpeculativeConfig, VllmConfig)
-from vllm.multimodal.inputs import (MultiModalFeatureSpec,
-                                    MultiModalKwargsItem, PlaceholderRange)
+from vllm.config import (
+    CacheConfig,
+    ECTransferConfig,
+    KVTransferConfig,
+    ModelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
 from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.utils.hashing import sha256
+from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec)
-from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+)
+from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
-from vllm.v1.structured_output.request import StructuredOutputRequest
 
-from .utils import EOS_TOKEN_ID, create_requests, create_scheduler
+from .utils import EOS_TOKEN_ID, create_requests, create_scheduler, mock_kv
+
+pytestmark = pytest.mark.cpu_test
 
 
 def test_add_requests():
@@ -40,8 +58,7 @@ def test_finish_request():
         scheduler.add_request(request)
 
     for i, request in enumerate(requests):
-        scheduler.finish_requests(request.request_id,
-                                  RequestStatus.FINISHED_ABORTED)
+        scheduler.finish_requests(request.request_id, RequestStatus.FINISHED_ABORTED)
         assert request.request_id not in scheduler.requests
         assert len(scheduler.waiting) == 9 - i
 
@@ -53,23 +70,23 @@ def test_get_num_unfinished_requests():
         scheduler.add_request(request)
 
     for i, request in enumerate(requests):
-        scheduler.finish_requests(request.request_id,
-                                  RequestStatus.FINISHED_STOPPED)
+        scheduler.finish_requests(request.request_id, RequestStatus.FINISHED_STOPPED)
         assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1
 
 
-@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [
-    (None, None),
-    (True, 5),
-])
-def test_schedule(enable_prefix_caching: Optional[bool],
-                  prompt_logprobs: Optional[int]):
-    '''Test scheduling.
+@pytest.mark.parametrize(
+    "enable_prefix_caching, prompt_logprobs",
+    [
+        (None, None),
+        (True, 5),
+    ],
+)
+def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None):
+    """Test scheduling.
     Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
-    '''
+    """
     scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
-    requests = create_requests(num_requests=10,
-                               prompt_logprobs=prompt_logprobs)
+    requests = create_requests(num_requests=10, prompt_logprobs=prompt_logprobs)
     for request in requests:
         scheduler.add_request(request)
 
@@ -91,8 +108,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
 
 def test_schedule_multimodal_requests():
     scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
-    mm_positions = [[PlaceholderRange(offset=i, length=100)]
-                    for i in range(10)]
+    mm_positions = [[PlaceholderRange(offset=i, length=100)] for i in range(10)]
     requests = create_requests(
         num_requests=10,
         num_tokens=200,
@@ -125,8 +141,7 @@ def test_schedule_partial_requests():
         model="llava-hf/llava-1.5-7b-hf",
         max_num_batched_tokens=1024,
     )
-    mm_positions = [[PlaceholderRange(offset=100, length=600)]
-                    for _ in range(3)]
+    mm_positions = [[PlaceholderRange(offset=100, length=600)] for _ in range(3)]
     requests = create_requests(
         num_requests=3,
         num_tokens=800,
@@ -149,16 +164,13 @@ def test_schedule_partial_requests():
     # The third request is also scheduled partially.
     # The <img> tokens are not scheduled because of the encoder budget.
     assert output.num_scheduled_tokens[requests[2].request_id] == 100
-    req_to_index = {
-        request.request_id: i
-        for i, request in enumerate(requests)
-    }
+    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
         # Only the first request has a sampled token id because
         # the rest requests are still being prefilled.
-        sampled_token_ids=[[0], [], []],
+        sampled_token_ids=[np.array([0]), np.array([]), np.array([])],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -188,9 +200,9 @@ def test_no_mm_input_chunking():
         max_model_len=2048,
     )
     mm_positions = [[PlaceholderRange(offset=400, length=800)]]
-    requests = create_requests(num_requests=1,
-                               num_tokens=1200,
-                               mm_positions=mm_positions)
+    requests = create_requests(
+        num_requests=1, num_tokens=1200, mm_positions=mm_positions
+    )
     for request in requests:
         scheduler.add_request(request)
 
@@ -201,14 +213,11 @@ def test_no_mm_input_chunking():
     # We want to only see the 400 text tokens at the start scheduled
     assert output.num_scheduled_tokens[requests[0].request_id] == 400
 
-    req_to_index = {
-        request.request_id: i
-        for i, request in enumerate(requests)
-    }
+    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[] for _ in range(len(requests))],
+        sampled_token_ids=[np.array([]) for _ in range(len(requests))],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -264,14 +273,11 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
     assert output.num_scheduled_tokens[requests[1].request_id] == 400
     # The third request is also scheduled partially - 1024 - 400 - 400 = 224.
     assert output.num_scheduled_tokens[requests[2].request_id] == 224
-    req_to_index = {
-        request.request_id: i
-        for i, request in enumerate(requests)
-    }
+    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[] for _ in range(len(requests))],
+        sampled_token_ids=[np.array([]) for _ in range(len(requests))],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -295,7 +301,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)],
+        sampled_token_ids=[np.array([0]), np.array([0])]
+        + [np.array([]) for _ in range(len(requests) - 2)],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -308,8 +315,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
     assert len(output2.finished_req_ids) == 0
     assert output2.num_scheduled_tokens[requests[0].request_id] == 1
     assert output2.num_scheduled_tokens[requests[1].request_id] == 1
-    assert output2.num_scheduled_tokens[
-        requests[2].request_id] == 800 - 224 - 224
+    assert output2.num_scheduled_tokens[requests[2].request_id] == 800 - 224 - 224
 
 
 def test_stop_via_update_from_output():
@@ -327,34 +333,29 @@ def test_stop_via_update_from_output():
     scheduler_output = SchedulerOutput(
         scheduled_new_reqs=[],
         scheduled_cached_reqs=CachedRequestData.make_empty(),
-        num_scheduled_tokens={
-            requests[0].request_id: 1,
-            requests[1].request_id: 2
-        },
+        num_scheduled_tokens={requests[0].request_id: 1, requests[1].request_id: 2},
         total_num_scheduled_tokens=3,
         scheduled_encoder_inputs={},
         scheduled_spec_decode_tokens={
             requests[0].request_id: [],
-            requests[1].request_id: [10]
+            requests[1].request_id: [10],
         },
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None)
+    )
 
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
-        sampled_token_ids=[[EOS_TOKEN_ID],
-                           [10,
-                            11]],  # First request hits EOS, second continues
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[
+            np.array([EOS_TOKEN_ID]),
+            np.array([10, 11]),
+        ],  # First request hits EOS, second continues
         logprobs=None,
         prompt_logprobs_dict={},
-        pooler_output=[])
+        pooler_output=[],
+    )
 
     scheduler.update_from_output(scheduler_output, model_output)
 
@@ -368,9 +369,7 @@ def test_stop_via_update_from_output():
 
     # Test case 2: Stop on custom stop token
     scheduler = create_scheduler(num_speculative_tokens=2)
-    requests = create_requests(num_requests=2,
-                               max_tokens=10,
-                               stop_token_ids=[42, 43])
+    requests = create_requests(num_requests=2, max_tokens=10, stop_token_ids=[42, 43])
     for req in requests:
         req.num_computed_tokens = req.num_tokens
         scheduler.requests[req.request_id] = req
@@ -380,34 +379,29 @@ def test_stop_via_update_from_output():
     scheduler_output = SchedulerOutput(
         scheduled_new_reqs=[],
         scheduled_cached_reqs=CachedRequestData.make_empty(),
-        num_scheduled_tokens={
-            requests[0].request_id: 3,
-            requests[1].request_id: 2
-        },
+        num_scheduled_tokens={requests[0].request_id: 3, requests[1].request_id: 2},
         total_num_scheduled_tokens=5,
         scheduled_encoder_inputs={},
         scheduled_spec_decode_tokens={
             requests[0].request_id: [10, 42],
-            requests[1].request_id: [13]
+            requests[1].request_id: [13],
         },
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
-        sampled_token_ids=[[10, 42, 12],
-                           [13, 14]],  # First request hits stop token
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[
+            np.array([10, 42, 12]),
+            np.array([13, 14]),
+        ],  # First request hits stop token
         logprobs=None,
         prompt_logprobs_dict={},
-        pooler_output=[])
+        pooler_output=[],
+    )
 
     scheduler.update_from_output(scheduler_output, model_output)
 
@@ -432,34 +426,29 @@ def test_stop_via_update_from_output():
     scheduler_output = SchedulerOutput(
         scheduled_new_reqs=[],
         scheduled_cached_reqs=CachedRequestData.make_empty(),
-        num_scheduled_tokens={
-            requests[0].request_id: 3,
-            requests[1].request_id: 1
-        },
+        num_scheduled_tokens={requests[0].request_id: 3, requests[1].request_id: 1},
         total_num_scheduled_tokens=4,
         scheduled_encoder_inputs={},
         scheduled_spec_decode_tokens={
             requests[0].request_id: [10, 11],
-            requests[1].request_id: []
+            requests[1].request_id: [],
         },
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
-        sampled_token_ids=[[10, 11, 12],
-                           [13]],  # First request exceeds max_tokens
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[
+            np.array([10, 11, 12]),
+            np.array([13]),
+        ],  # First request exceeds max_tokens
         logprobs=None,
         prompt_logprobs_dict={},
-        pooler_output=[])
+        pooler_output=[],
+    )
 
     scheduler.update_from_output(scheduler_output, model_output)
 
@@ -468,8 +457,7 @@ def test_stop_via_update_from_output():
     assert scheduler.running[0].request_id == requests[1].request_id
     assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED
     assert requests[0].request_id in scheduler.finished_req_ids
-    assert list(requests[0].output_token_ids) == [10, 11
-                                                  ]  # Truncated to max_tokens
+    assert list(requests[0].output_token_ids) == [10, 11]  # Truncated to max_tokens
     assert list(requests[1].output_token_ids) == [13]
 
     # Test case 4: Ignore EOS flag
@@ -486,22 +474,20 @@ def test_stop_via_update_from_output():
         num_scheduled_tokens={requests[0].request_id: 3},
         total_num_scheduled_tokens=3,
         scheduled_encoder_inputs={},
-        scheduled_spec_decode_tokens={
-            requests[0].request_id: [EOS_TOKEN_ID, 10]
-        },
-        num_common_prefix_blocks=0,
+        scheduled_spec_decode_tokens={requests[0].request_id: [EOS_TOKEN_ID, 10]},
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None)
+    )
 
     model_output = ModelRunnerOutput(
         req_ids=[requests[0].request_id],
         req_id_to_index={requests[0].request_id: 0},
-        sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
+        sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
         logprobs=None,
         prompt_logprobs_dict={},
-        pooler_output=[])
+        pooler_output=[],
+    )
 
     scheduler.update_from_output(scheduler_output, model_output)
 
@@ -511,12 +497,106 @@ def test_stop_via_update_from_output():
     assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11]
 
 
-@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [
-    (None, None),
-    (True, 5),
-])
-def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
-                                     prompt_logprobs: Optional[int]):
+def test_check_stop_min_tokens():
+    """Test that requests don't stop when min_tokens requirement isn't met."""
+    from vllm.v1.core.sched.utils import check_stop
+
+    # Test case 1: num_output_tokens < min_tokens
+    # Should return False (don't stop)
+    sampling_params = SamplingParams(
+        ignore_eos=False,
+        max_tokens=20,
+        min_tokens=5,
+    )
+    request = Request(
+        request_id="0",
+        prompt_token_ids=[0, 1, 2],
+        sampling_params=sampling_params,
+        pooling_params=None,
+        eos_token_id=EOS_TOKEN_ID,
+    )
+    # Simulate having generated 3 output tokens (less than min_tokens=5)
+    request.append_output_token_ids([10, 11, EOS_TOKEN_ID])  # EOS token present
+
+    result = check_stop(request, max_model_len=100)
+    assert result is False, "Should not stop when num_output_tokens<min_tokens"
+
+    # Test case 2: num_output_tokens >= min_tokens
+    # Should follow normal stopping logic (stop on EOS)
+    request.append_output_token_ids(
+        [
+            10,
+            11,
+            12,
+            13,
+            14,
+            EOS_TOKEN_ID,
+        ]
+    )  # 6 tokens > min_tokens
+
+    result = check_stop(request, max_model_len=100)
+    assert result is True, "Should stop on EOS when min_tokens met"
+    assert request.status == RequestStatus.FINISHED_STOPPED
+
+    # Test case 3: min_tokens = 0, should follow normal stopping logic
+    sampling_params_no_min = SamplingParams(
+        ignore_eos=False,
+        max_tokens=20,
+        min_tokens=0,
+    )
+    request_no_min = Request(
+        request_id="1",
+        prompt_token_ids=[0, 1, 2],
+        sampling_params=sampling_params_no_min,
+        pooling_params=None,
+        eos_token_id=EOS_TOKEN_ID,
+    )
+    request_no_min.append_output_token_ids([10, EOS_TOKEN_ID])
+
+    result = check_stop(request_no_min, max_model_len=100)
+    assert result is True, "Should stop on EOS when min_tokens=0"
+    assert request_no_min.status == RequestStatus.FINISHED_STOPPED
+
+    # Test case 4: min_tokens > 0 with stop token (not EOS)
+    sampling_params_stop = SamplingParams(
+        ignore_eos=False,
+        max_tokens=20,
+        min_tokens=5,
+        stop_token_ids=[42],
+    )
+    request_stop = Request(
+        request_id="2",
+        prompt_token_ids=[0, 1, 2],
+        sampling_params=sampling_params_stop,
+        pooling_params=None,
+        eos_token_id=EOS_TOKEN_ID,
+    )
+    # Only 3 output tokens, less than min_tokens=5, but has stop token
+    request_stop.append_output_token_ids([10, 11, 42])
+    result = check_stop(request_stop, max_model_len=100)
+    assert result is False, "Should not stop when num_output_tokens<min_tokens"
+
+    # Test case 5: min_tokens met, should stop on stop token
+    request_stop.append_output_token_ids(
+        [10, 11, 12, 13, 14, 42]
+    )  # 6 tokens >= min_tokens=5
+
+    result = check_stop(request_stop, max_model_len=100)
+    assert result is True, "Should stop on stop token when min_tokens met"
+    assert request_stop.status == RequestStatus.FINISHED_STOPPED
+    assert request_stop.stop_reason == 42
+
+
+@pytest.mark.parametrize(
+    "enable_prefix_caching, prompt_logprobs",
+    [
+        (None, None),
+        (True, 5),
+    ],
+)
+def test_schedule_concurrent_batches(
+    enable_prefix_caching: bool | None, prompt_logprobs: int | None
+):
     scheduler = create_scheduler(
         max_num_batched_tokens=1024,
         max_num_seqs=2,
@@ -532,21 +612,19 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
     scheduler.add_request(requests[0])
     scheduler_output0 = scheduler.schedule()
     assert len(scheduler_output0.scheduled_new_reqs) == 1
-    assert scheduler_output0.num_scheduled_tokens[
-        requests[0].request_id] == 512
+    assert scheduler_output0.num_scheduled_tokens[requests[0].request_id] == 512
 
     # The first request is still running, so only schedule the second request.
     scheduler.add_request(requests[1])
     scheduler_output1 = scheduler.schedule()
     assert len(scheduler_output1.scheduled_new_reqs) == 1
-    assert scheduler_output1.num_scheduled_tokens[
-        requests[1].request_id] == 512
+    assert scheduler_output1.num_scheduled_tokens[requests[1].request_id] == 512
 
     # Model output of the first request.
     model_runner_output = ModelRunnerOutput(
         req_ids=[requests[0].request_id],
         req_id_to_index={requests[0].request_id: 0},
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -563,7 +641,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
     model_runner_output = ModelRunnerOutput(
         req_ids=[requests[1].request_id],
         req_id_to_index={requests[1].request_id: 0},
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -574,10 +652,12 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
 def test_preempt_during_execution():
     # NOTE(woosuk): The actual number of available blocks is 10 instead of 11
     # because block 0 is reserved as the null block.
-    scheduler = create_scheduler(max_num_batched_tokens=100,
-                                 block_size=16,
-                                 num_blocks=11,
-                                 enable_prefix_caching=False)
+    scheduler = create_scheduler(
+        max_num_batched_tokens=100,
+        block_size=16,
+        num_blocks=11,
+        enable_prefix_caching=False,
+    )
     requests = create_requests(num_requests=2, num_tokens=80, block_size=16)
 
     # Schedule the first request.
@@ -598,7 +678,7 @@ def test_preempt_during_execution():
     model_runner_output0 = ModelRunnerOutput(
         req_ids=[requests[0].request_id],
         req_id_to_index={requests[0].request_id: 0},
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -615,7 +695,7 @@ def test_preempt_during_execution():
     model_runner_output1 = ModelRunnerOutput(
         req_ids=[requests[1].request_id],
         req_id_to_index={requests[1].request_id: 0},
-        sampled_token_ids=[[42]],
+        sampled_token_ids=[np.array([42])],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -632,15 +712,22 @@ def test_preempt_during_execution():
 @pytest.mark.parametrize(
     "spec_tokens,output_tokens,expected",
     [
-        ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])),  # perfect match
-        ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])),  # early mismatch
-        ([[1, 2], [3]], [[1, 2, 5], [3, 4]],
-         (2, 3, 3, [2, 1])),  # multiple sequences
-        ([[1]], [[1, 2]], (1, 1, 1, [1])),  # single token sequence
-        ([[]], [[5]], (0, 0, 0, [0])),  # empty sequence
-        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]],
-         (2, 6, 3, [2, 1, 0])),  # multiple mismatches
-    ])
+        ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])),  # perfect match
+        ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])),  # early mismatch
+        (
+            [[1, 2], [3]],
+            [np.array([1, 2, 5]), np.array([3, 4])],
+            (2, 3, 3, [2, 1]),
+        ),  # multiple sequences
+        ([[1]], [np.array([1, 2])], (1, 1, 1, [1])),  # single token sequence
+        ([[]], [np.array([5])], (0, 0, 0, [0])),  # empty sequence
+        (
+            [[1, 2, 3], [4, 5, 6]],
+            [np.array([1, 2, 7]), np.array([4, 8])],
+            (2, 6, 3, [2, 1, 0]),
+        ),  # multiple mismatches
+    ],
+)
 def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     """Test scheduling behavior with speculative decoding.
 
@@ -670,13 +757,12 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     model_runner_output = ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[0] for _ in range(len(requests))],
+        sampled_token_ids=[np.array([0]) for _ in range(len(requests))],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
     )
-    engine_core_outputs = scheduler.update_from_output(output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(output, model_runner_output)
     draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
     scheduler.update_draft_token_ids(draft_token_ids)
 
@@ -691,20 +777,23 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
 
     # No draft or accepted tokens counted yet
     assert not engine_core_outputs or (
-        engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None)
+        engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None
+    )
 
     # Schedule the speculated tokens for validation
     output = scheduler.schedule()
     assert len(output.scheduled_new_reqs) == 0
     # The sampled token and speculated tokens
-    assert output.total_num_scheduled_tokens == \
-        len(requests) + sum(len(ids) for ids in spec_tokens)
+    assert output.total_num_scheduled_tokens == len(requests) + sum(
+        len(ids) for ids in spec_tokens
+    )
     for i in range(len(requests)):
         req_id = requests[i].request_id
         assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i])
         if spec_tokens[i]:
-            assert len(output.scheduled_spec_decode_tokens[req_id]) == \
-                len(spec_tokens[i])
+            assert len(output.scheduled_spec_decode_tokens[req_id]) == len(
+                spec_tokens[i]
+            )
         else:
             assert req_id not in output.scheduled_spec_decode_tokens
 
@@ -716,14 +805,16 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
         prompt_logprobs_dict={},
         pooler_output=[],
     )
-    engine_core_outputs = scheduler.update_from_output(output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(output, model_runner_output)
 
-    scheduler_stats = engine_core_outputs[0].scheduler_stats \
-        if engine_core_outputs else None
+    scheduler_stats = (
+        engine_core_outputs[0].scheduler_stats if engine_core_outputs else None
+    )
     if expected[0] == 0:
+        assert scheduler_stats is not None
         assert scheduler_stats.spec_decoding_stats is None
     else:
+        assert scheduler_stats is not None
         assert scheduler_stats.spec_decoding_stats is not None
         stats = scheduler_stats.spec_decoding_stats
         assert stats.num_drafts == expected[0]
@@ -760,18 +851,25 @@ def _assert_right_kv_cache_manager(
     # Make sure the request stats are right.
     EXPECTED_TOTAL_BLOCKS = num_tokens // block_size
     for req in requests:
-        blocks = (scheduler.kv_cache_manager.coordinator.
-                  single_type_managers[0].req_to_blocks[req.request_id])
+        blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+            0
+        ].req_to_blocks[req.request_id]
         hashes = req.block_hashes
-        assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-                num_cached_block[req.request_id] == EXPECTED_TOTAL_BLOCKS)
+        assert (
+            scheduler.kv_cache_manager.coordinator.single_type_managers[
+                0
+            ].num_cached_block[req.request_id]
+            == EXPECTED_TOTAL_BLOCKS
+        )
         assert len(blocks) == EXPECTED_TOTAL_BLOCKS
         assert len(hashes) == EXPECTED_TOTAL_BLOCKS
 
     # Make sure we actually touched all the blocks.
     BLOCKS_PER_REQ = num_tokens / block_size
-    assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() ==
-            num_total_blocks - num_requests * BLOCKS_PER_REQ)
+    assert (
+        scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
+        == num_total_blocks - num_requests * BLOCKS_PER_REQ
+    )
 
 
 def _step_until_done(
@@ -790,7 +888,10 @@ def _step_until_done(
         for _, num_scheduled_tokens in output.num_scheduled_tokens.items():
             # We should be in the decode phase now.
             assert num_scheduled_tokens == 1
-        assert len(output.kv_connector_metadata.requests) == 0
+        if scheduler.connector is not None:
+            assert len(output.kv_connector_metadata.requests) == 0
+        if scheduler.ec_connector is not None:
+            assert len(output.ec_connector_metadata.mm_datas) == 0
         ecos = scheduler.update_from_output(output, model_runner_output)[0]
         all_done = True
         for eco in ecos.outputs:
@@ -799,36 +900,77 @@ def _step_until_done(
         all_finished = all_done
 
 
-def test_kv_connector_basic():
+def _step_until_kv_transfer_finished(scheduler: Scheduler, req_ids: list[str]):
+    """Cycle requests through a KV transfer cyle."""
+
+    # Requests should first transition to WAITING_FOR_REMOTE_KVS
+    output = scheduler.schedule()
+    assert len(scheduler.waiting) == len(req_ids)
+    assert len(scheduler.running) == 0
+    assert len(output.scheduled_new_reqs) == 0
+    for req in scheduler.requests.values():
+        assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+
+    # No model execution yet
+    EMPTY_OUTPUT = ModelRunnerOutput(
+        req_ids=[],
+        req_id_to_index={},
+        sampled_token_ids=[],
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    scheduler.update_from_output(output, EMPTY_OUTPUT)
+
+    # Simulate KV transfer completion using KVConnectorOutput.finished_recving
+    output = scheduler.schedule()
+    assert len(scheduler.waiting) == len(req_ids)
+    assert len(scheduler.running) == 0
+
+    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
+        req_ids=[],
+        req_id_to_index={},
+        sampled_token_ids=[],
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+        kv_connector_output=KVConnectorOutput(finished_recving=req_ids),
+    )
+    scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
+    for req_id in req_ids:
+        assert req_id in scheduler.finished_recving_kv_req_ids
+
+
+@pytest.mark.parametrize("is_async", [False, True])
+def test_kv_connector_basic(is_async: bool):
     """
     Test whether Scheduler with KVConnector schedules tokens, allocates
     memory, and cleans up requests as expected under normal operation.
     """
 
     # Setup Scheduler.
+    BLOCK_SIZE = 16
+    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler = create_scheduler(
         enable_prefix_caching=True,
-        use_kv_connector=True,
+        use_kv_connector=mock_kv(
+            matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async
+        ),
+        block_size=BLOCK_SIZE,
     )
-    NUM_TOTAL_BLOCKS = (
-        scheduler.kv_cache_manager.block_pool.get_num_free_blocks())
-    BLOCK_SIZE = scheduler.cache_config.block_size
-
-    # Mock External Cache Hit.
-    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
-    scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
-    scheduler.connector.get_num_new_matched_tokens.return_value = (
-        NUM_MATCHED_NEW_TOKENS, False)
+    NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
 
     ######################################################
     # FIRST SET OF REQUESTS - External Hit Only
     NUM_REQUESTS = 2
     NUM_TOKENS = NUM_MATCHED_NEW_TOKENS * 2
     MAX_TOKENS = 3
-    requests = create_requests(num_requests=NUM_REQUESTS,
-                               num_tokens=NUM_TOKENS,
-                               max_tokens=MAX_TOKENS,
-                               block_size=BLOCK_SIZE)
+    requests = create_requests(
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+        max_tokens=MAX_TOKENS,
+        block_size=BLOCK_SIZE,
+    )
     req_ids = []
     req_to_index = {}
     for i, request in enumerate(requests):
@@ -836,10 +978,13 @@ def test_kv_connector_basic():
         req_ids.append(request.request_id)
         req_to_index[request.request_id] = i
 
+    if is_async:
+        _step_until_kv_transfer_finished(scheduler, req_ids)
+
     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[1000]] * len(req_ids),
+        sampled_token_ids=[np.array([1000])] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -855,15 +1000,17 @@ def test_kv_connector_basic():
     )
 
     # Ensure KVCacheManager is correct.
-    _assert_right_kv_cache_manager(scheduler, requests, NUM_TOKENS, BLOCK_SIZE,
-                                   NUM_REQUESTS, NUM_TOTAL_BLOCKS)
+    _assert_right_kv_cache_manager(
+        scheduler, requests, NUM_TOKENS, BLOCK_SIZE, NUM_REQUESTS, NUM_TOTAL_BLOCKS
+    )
 
     # Continue Generation until done.
     _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT)
     _ = scheduler.schedule()
     # Confirm we clean up the memory properly.
-    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
-        == NUM_TOTAL_BLOCKS
+    assert (
+        scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_TOTAL_BLOCKS
+    )
 
     ######################################################
     # SECOND SET OF REQUESTS - Local And External Hit
@@ -871,10 +1018,12 @@ def test_kv_connector_basic():
     # We will get a local prefix cache hit for the first
     # NUM_TOKENS_PREFIX tokens since they are used above.
     NUM_TOKENS = NUM_TOKENS_PREFIX * 2
-    requests = create_requests(num_requests=NUM_REQUESTS,
-                               num_tokens=NUM_TOKENS,
-                               max_tokens=MAX_TOKENS,
-                               block_size=BLOCK_SIZE)
+    requests = create_requests(
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+        max_tokens=MAX_TOKENS,
+        block_size=BLOCK_SIZE,
+    )
     req_ids = []
     req_to_index = {}
     for i, request in enumerate(requests):
@@ -882,10 +1031,13 @@ def test_kv_connector_basic():
         req_ids.append(request.request_id)
         req_to_index[request.request_id] = i
 
+    if is_async:
+        _step_until_kv_transfer_finished(scheduler, req_ids)
+
     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[1000]] * len(req_ids),
+        sampled_token_ids=[np.array([1000])] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -898,22 +1050,82 @@ def test_kv_connector_basic():
         output=output,
         num_requests=NUM_REQUESTS,
         # Just the incremental tokens after local + remote cache hit.
-        expected_num_scheduled_tokens=(NUM_TOKENS - NUM_TOKENS_PREFIX -
-                                       NUM_MATCHED_NEW_TOKENS))
+        expected_num_scheduled_tokens=(
+            NUM_TOKENS - NUM_TOKENS_PREFIX - NUM_MATCHED_NEW_TOKENS
+        ),
+    )
 
     # Ensure KVCacheManager is correct.
-    _assert_right_kv_cache_manager(scheduler, requests, NUM_TOKENS, BLOCK_SIZE,
-                                   NUM_REQUESTS, NUM_TOTAL_BLOCKS)
+    _assert_right_kv_cache_manager(
+        scheduler, requests, NUM_TOKENS, BLOCK_SIZE, NUM_REQUESTS, NUM_TOTAL_BLOCKS
+    )
 
     # Continue Generation until done.
     _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT)
     _ = scheduler.schedule()
     # Confirm we clean up the memory properly.
-    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
-        == NUM_TOTAL_BLOCKS
+    assert (
+        scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_TOTAL_BLOCKS
+    )
+
+
+def test_external_prefix_cache_metrics():
+    """
+    Verify connector prefix cache metrics are updated
+    correctly when the scheduler processes requests with KV connector hits.
+    """
+
+    # Setup Scheduler.
+    NUM_MATCHED_NEW_TOKENS = 4
+    scheduler = create_scheduler(
+        enable_prefix_caching=False,
+        use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
+    )
+
+    # --- Prepare simple requests ---
+    NUM_REQUESTS = 2
+    NUM_TOKENS = 8
+    MAX_TOKENS = 2
+    requests = create_requests(
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+        max_tokens=MAX_TOKENS,
+    )
+
+    for req in requests:
+        scheduler.add_request(req)
+
+    # --- Trigger scheduling and simulate model output ---
+    output = scheduler.schedule()
+    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
+        req_ids=[r.request_id for r in requests],
+        req_id_to_index={r.request_id: i for i, r in enumerate(requests)},
+        sampled_token_ids=[np.array([1000])] * NUM_REQUESTS,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+
+    # Update scheduler stats
+    ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
+
+    # --- Assertions ---
+    assert ecos is not None and len(ecos) > 0
+    assert ecos[0].scheduler_stats is not None
 
+    external_stats = ecos[0].scheduler_stats.connector_prefix_cache_stats
+    assert external_stats is not None
 
-def test_kv_connector_unable_to_allocate():
+    assert external_stats.queries == NUM_TOKENS * NUM_REQUESTS
+    assert external_stats.hits == NUM_MATCHED_NEW_TOKENS * NUM_REQUESTS
+    assert external_stats.requests == NUM_REQUESTS
+    assert external_stats.preempted_requests == 0
+
+
+@pytest.mark.parametrize(
+    "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
+)
+def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
     """
     Test whether scheduler with KVConnector is able to handle
     unable to allocate (run out of blocks in allocate_slots().
@@ -922,26 +1134,28 @@ def test_kv_connector_unable_to_allocate():
     # Setup Scheduler With Mock External Cache Hit.
     BLOCK_SIZE = 4
     NUM_BLOCKS = 10
+    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler = create_scheduler(
         enable_prefix_caching=True,
-        use_kv_connector=True,
+        use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        # encoder connector should not affect test results
+        use_ec_connector=use_ec_connector,
+        ec_role=ec_role,
     )
-    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
-    scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
-    scheduler.connector.get_num_new_matched_tokens.return_value = (
-        NUM_MATCHED_NEW_TOKENS, False)
 
     # Create two requests. The second request will not be able to
     # allocate slots because it will not have enough blocks.
     NUM_REQUESTS = 2
     NUM_TOKENS = (NUM_BLOCKS // 2 + 1) * BLOCK_SIZE
     MAX_TOKENS = 2
-    requests = create_requests(num_requests=NUM_REQUESTS,
-                               num_tokens=NUM_TOKENS,
-                               max_tokens=MAX_TOKENS,
-                               block_size=BLOCK_SIZE)
+    requests = create_requests(
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+        max_tokens=MAX_TOKENS,
+        block_size=BLOCK_SIZE,
+    )
     req_ids = []
     req_to_index = {}
     for i, request in enumerate(requests):
@@ -952,7 +1166,7 @@ def test_kv_connector_unable_to_allocate():
     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[1000]] * len(req_ids),
+        sampled_token_ids=[np.array([1000])] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -960,38 +1174,41 @@ def test_kv_connector_unable_to_allocate():
 
     # Just one request should be running.
     output = scheduler.schedule()
-    _assert_right_scheduler_output(output,
-                                   num_requests=1,
-                                   expected_num_scheduled_tokens=NUM_TOKENS -
-                                   NUM_MATCHED_NEW_TOKENS)
+    _assert_right_scheduler_output(
+        output,
+        num_requests=1,
+        expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS,
+    )
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 1
 
     # All memory should be freed, with one request waiting.
     _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT)
-    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
-        == NUM_BLOCKS - 1
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
     assert len(scheduler.running) == 0
     assert len(scheduler.waiting) == 1
 
     # Just one request should be running.
     output = scheduler.schedule()
-    _assert_right_scheduler_output(output,
-                                   num_requests=1,
-                                   expected_num_scheduled_tokens=NUM_TOKENS -
-                                   NUM_MATCHED_NEW_TOKENS)
+    _assert_right_scheduler_output(
+        output,
+        num_requests=1,
+        expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS,
+    )
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 0
 
     # All memory should be freed, with no requests waiting / running.
     _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT)
-    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
-        == NUM_BLOCKS - 1
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
     assert len(scheduler.running) == 0
     assert len(scheduler.waiting) == 0
 
 
-def test_kv_connector_handles_preemption():
+@pytest.mark.parametrize(
+    "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
+)
+def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
     """
     Test whether scheduler with KVConnector is able to handle
     unable to allocate (run out of blocks in allocate_slots().
@@ -1001,28 +1218,29 @@ def test_kv_connector_handles_preemption():
     BLOCK_SIZE = 2
     # NOTE: there is 1 null block, so this is 6 blocks.
     NUM_BLOCKS = 7
+    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
     scheduler = create_scheduler(
         enable_prefix_caching=True,
-        use_kv_connector=True,
+        use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        # encoder connector should not affect test results
+        use_ec_connector=use_ec_connector,
+        ec_role=ec_role,
     )
 
-    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
-    scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
-    scheduler.connector.get_num_new_matched_tokens.return_value = (
-        NUM_MATCHED_NEW_TOKENS, False)
-
     # Create two requests.
     # Both can be scheduled at first, but the second request
     # will be preempted and re-scheduled.
     NUM_REQUESTS = 2
     NUM_TOKENS = BLOCK_SIZE * 2 + 1
     MAX_TOKENS = BLOCK_SIZE * 2
-    requests = create_requests(num_requests=NUM_REQUESTS,
-                               num_tokens=NUM_TOKENS,
-                               max_tokens=MAX_TOKENS,
-                               block_size=BLOCK_SIZE)
+    requests = create_requests(
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+        max_tokens=MAX_TOKENS,
+        block_size=BLOCK_SIZE,
+    )
     req_ids = []
     req_to_index = {}
     for i, request in enumerate(requests):
@@ -1033,7 +1251,7 @@ def test_kv_connector_handles_preemption():
     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_to_index,
-        sampled_token_ids=[[1000]] * len(req_ids),
+        sampled_token_ids=[np.array([1000])] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1045,7 +1263,8 @@ def test_kv_connector_handles_preemption():
         output,
         # 2 remote kv cache hits.
         num_requests=2,
-        expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
+        expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS,
+    )
     assert len(scheduler.running) == 2
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
 
@@ -1055,7 +1274,8 @@ def test_kv_connector_handles_preemption():
         output,
         # no connector_metadata
         num_requests=0,
-        expected_num_scheduled_tokens=1)
+        expected_num_scheduled_tokens=1,
+    )
     assert len(scheduler.running) == 2
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
 
@@ -1065,7 +1285,8 @@ def test_kv_connector_handles_preemption():
         output,
         # no connector_metadata
         num_requests=0,
-        expected_num_scheduled_tokens=1)
+        expected_num_scheduled_tokens=1,
+    )
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 1
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
@@ -1078,14 +1299,14 @@ def test_kv_connector_handles_preemption():
         output,
         # no connector_metadata
         num_requests=0,
-        expected_num_scheduled_tokens=1)
+        expected_num_scheduled_tokens=1,
+    )
     assert len(scheduler.waiting) == 1
     assert len(scheduler.running) == 1
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
     assert len(scheduler.running) == 0
     # All memory should be freed since nothing is running.
-    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
-        == NUM_BLOCKS - 1
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
 
     # Restarts the preempted request - generate 3rd token.
     # This will have a local and remote cache hit.
@@ -1110,23 +1331,20 @@ def test_kv_connector_handles_preemption():
         output,
         # no connector_metadata
         num_requests=0,
-        expected_num_scheduled_tokens=1)
+        expected_num_scheduled_tokens=1,
+    )
     assert len(scheduler.running) == 1
     _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
     assert len(scheduler.running) == 0
     # All memory should be freed since nothing is running.
-    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
-        == NUM_BLOCKS - 1
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
 
 
 def make_output(scheduler: Scheduler):
     return ModelRunnerOutput(
         req_ids=[req.request_id for req in scheduler.running],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(scheduler.running)
-        },
-        sampled_token_ids=[[1000]] * len(scheduler.running),
+        req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)},
+        sampled_token_ids=[np.array([1000])] * len(scheduler.running),
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1146,14 +1364,24 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               req_to_blocks) == 0
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               num_cached_block) == 0
+    assert (
+        len(
+            scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks
+        )
+        == 0
+    )
+    assert (
+        len(
+            scheduler.kv_cache_manager.coordinator.single_type_managers[
+                0
+            ].num_cached_block
+        )
+        == 0
+    )
     num_free_blocks = (
-        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
-    assert num_free_blocks == (
-        scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1)
+        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks
+    )
+    assert num_free_blocks == (scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1)
 
     # NOTE(rob): just the ref count on blocks will be 0. The hash
     # value, etc will remain since we lazily evict for prefix cache.
@@ -1173,9 +1401,9 @@ def test_memory_leak():
     NUM_REQUESTS = 5
     NUM_TOKENS = 10
     MAX_TOKENS = 10
-    requests = create_requests(num_requests=NUM_REQUESTS,
-                               num_tokens=NUM_TOKENS,
-                               max_tokens=MAX_TOKENS)
+    requests = create_requests(
+        num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, max_tokens=MAX_TOKENS
+    )
 
     # Add each request.
     for request in requests:
@@ -1200,16 +1428,18 @@ def create_scheduler_with_priority(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: Optional[bool] = None,
+    enable_prefix_caching: bool | None = None,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: bool = False,
     num_blocks: int = 10000,
     block_size: int = 16,
-    max_model_len: Optional[int] = None,
-    num_speculative_tokens: Optional[int] = None,
+    max_model_len: int | None = None,
+    num_speculative_tokens: int | None = None,
+    use_ec_connector: bool = False,
+    ec_role: str | None = None,
 ) -> Scheduler:
-    '''Create scheduler with priority policy enabled.
+    """Create scheduler with priority policy enabled.
 
     Args:
       model: model under test
@@ -1221,7 +1451,7 @@ def create_scheduler_with_priority(
 
     Returns:
       {class}`Scheduler` instance with priority scheduling
-    '''
+    """
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -1240,9 +1470,11 @@ def create_scheduler_with_priority(
         seed=42,
     )
     # Cache config, optionally force APC
-    kwargs_cache = ({} if enable_prefix_caching is None else {
-        'enable_prefix_caching': enable_prefix_caching
-    })
+    kwargs_cache = (
+        {}
+        if enable_prefix_caching is None
+        else {"enable_prefix_caching": enable_prefix_caching}
+    )
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
@@ -1250,16 +1482,31 @@ def create_scheduler_with_priority(
         cache_dtype="auto",
         **kwargs_cache,
     )
-    kv_transfer_config = KVTransferConfig(
-        kv_connector="SharedStorageConnector",
-        kv_role="kv_both",
-        kv_connector_extra_config={"shared_storage_path": "local_storage"},
-    ) if use_kv_connector else None
+    kv_transfer_config = (
+        KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        )
+        if use_kv_connector
+        else None
+    )
 
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig | None = None
     if num_speculative_tokens is not None:
         speculative_config = SpeculativeConfig(
-            model="ngram", num_speculative_tokens=num_speculative_tokens)
+            model="ngram", num_speculative_tokens=num_speculative_tokens
+        )
+
+    ec_transfer_config = (
+        ECTransferConfig(
+            ec_connector="ECSharedStorageConnector",
+            ec_role=ec_role,
+            ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"},
+        )
+        if use_ec_connector
+        else None
+    )
 
     vllm_config = VllmConfig(
         scheduler_config=scheduler_config,
@@ -1267,14 +1514,15 @@ def create_scheduler_with_priority(
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         speculative_config=speculative_config,
+        ec_transfer_config=ec_transfer_config,
     )
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
         kv_cache_tensors=[],
         kv_cache_groups=[
-            KVCacheGroupSpec(['layer'],
-                             FullAttentionSpec(block_size, 1, 1, torch.float32,
-                                               False))
+            KVCacheGroupSpec(
+                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+            )
         ],
     )
     cache_config.num_gpu_blocks = num_blocks
@@ -1283,19 +1531,28 @@ def create_scheduler_with_priority(
         kv_cache_config=kv_cache_config,
         log_stats=True,
         structured_output_manager=StructuredOutputManager(vllm_config),
+        block_size=block_size,
     )
 
 
+_none_hash_initialized = False
+
+
 def create_requests_with_priority(
-        num_requests: int,
-        priorities: list[int],
-        arrival_times: Optional[list[float]] = None,
-        num_tokens: int = 10,
-        mm_positions: Optional[list[list[PlaceholderRange]]] = None,
-        max_tokens: int = 16,
-        stop_token_ids: Optional[list[int]] = None,
-        prompt_logprobs: Optional[int] = None,
-        starting_idx: int = 0):
+    num_requests: int,
+    priorities: list[int],
+    arrival_times: list[float] | None = None,
+    num_tokens: int = 10,
+    mm_hashes_list: list[list[str]] | None = None,
+    mm_positions: list[list[PlaceholderRange]] | None = None,
+    max_tokens: int = 16,
+    stop_token_ids: list[int] | None = None,
+    prompt_logprobs: int | None = None,
+    starting_idx: int = 0,
+    same_prompt: bool = False,
+    block_size: int = 16,
+    req_ids: list[str] | None = None,
+):
     """Create requests with specified priorities and arrival times."""
     assert len(priorities) == num_requests
     if arrival_times is not None:
@@ -1303,33 +1560,83 @@ def create_requests_with_priority(
     else:
         arrival_times = [float(i) for i in range(num_requests)]
 
-    sampling_params = SamplingParams(ignore_eos=False,
-                                     max_tokens=max_tokens,
-                                     stop_token_ids=stop_token_ids,
-                                     prompt_logprobs=prompt_logprobs)
+    global _none_hash_initialized
+    if not _none_hash_initialized:
+        init_none_hash(sha256)
+        _none_hash_initialized = True
+
+    block_hasher = get_request_block_hasher(block_size, sha256)
+    sampling_params = SamplingParams(
+        ignore_eos=False,
+        max_tokens=max_tokens,
+        stop_token_ids=stop_token_ids,
+        prompt_logprobs=prompt_logprobs,
+    )
     requests = []
+
+    if mm_hashes_list is not None:
+        # NOTE: allow manual input; some mm items can have the same identifier
+        # no. of mm_hashes and mm_positions for each request should be identical
+        assert mm_positions is not None, (
+            "mm_positions must be provided when mm_hashes_list is provided"
+        )
+        assert len(mm_hashes_list) == len(mm_positions) == num_requests
+        assert [len(h) for h in mm_hashes_list] == [len(p) for p in mm_positions]
+
+        # Since same identifier would imply they are identical encoder output
+        # Verify mm items with identical identifier are having mm_position.length
+        seen_hashes: dict[str, int] = {}
+
+    if req_ids:
+        assert len(req_ids) == num_requests
+    else:
+        req_ids = [f"{i + starting_idx}" for i in range(num_requests)]
+
     for i in range(num_requests):
         mm_features = []
-        if mm_positions is not None:
-            mm_position = mm_positions[i]
-            for j, position in enumerate(mm_position):
+
+        for j, position in enumerate(
+            mm_positions[i] if mm_positions is not None else []
+        ):
+            if mm_hashes_list is not None:
+                identifier = mm_hashes_list[i][j]
+
+                # Verify if position length is identical
+                position_length = position.length
+                if identifier in seen_hashes:
+                    assert seen_hashes[identifier] == position_length, (
+                        f"mm_hash '{identifier}' has inconsistent position lengths: "
+                        f"previously {seen_hashes[identifier]}, now {position_length} "
+                        f"at request {i}, position {j}"
+                    )
+                else:
+                    seen_hashes[identifier] = position_length
+            else:
+                # Unique dummy hash for each mm item
                 identifier = f"hash{i}_{j}"
-                mm_feature = MultiModalFeatureSpec(
-                    data=MultiModalKwargsItem.dummy("dummy_m"),
-                    mm_position=position,
-                    identifier=identifier,
-                    modality="image")
-                mm_features.append(mm_feature)
+            mm_feature = MultiModalFeatureSpec(
+                data=MultiModalKwargsItem.dummy("dummy_m"),
+                mm_position=position,
+                identifier=identifier,
+                modality="image",
+            )
+            mm_features.append(mm_feature)
 
+        prompt_token_ids = (
+            [starting_idx] * num_tokens
+            if same_prompt
+            else [i + starting_idx] * num_tokens
+        )
         request = Request(
-            request_id=f"{i + starting_idx}",
-            prompt_token_ids=[i + starting_idx] * num_tokens,
+            request_id=req_ids[i],
+            prompt_token_ids=prompt_token_ids,
             sampling_params=sampling_params,
             pooling_params=None,
             mm_features=mm_features if mm_features else None,
             eos_token_id=EOS_TOKEN_ID,
             arrival_time=arrival_times[i],
             priority=priorities[i],
+            block_hasher=block_hasher,
         )
         requests.append(request)
     return requests
@@ -1344,9 +1651,9 @@ def test_priority_scheduling_basic_ordering():
     # Priority 0 (highest), 1, 2 (lowest)
     priorities = [2, 0, 1]  # Add in non-priority order
     arrival_times = [1.0, 2.0, 3.0]  # All different arrival times
-    requests = create_requests_with_priority(num_requests=3,
-                                             priorities=priorities,
-                                             arrival_times=arrival_times)
+    requests = create_requests_with_priority(
+        num_requests=3, priorities=priorities, arrival_times=arrival_times
+    )
 
     # Add requests in non-priority order
     for request in requests:
@@ -1372,9 +1679,9 @@ def test_priority_scheduling_arrival_time_tiebreaker():
     # Create requests with same priority but different arrival times
     priorities = [1, 1, 1]  # All same priority
     arrival_times = [3.0, 1.0, 2.0]  # Different arrival times
-    requests = create_requests_with_priority(num_requests=3,
-                                             priorities=priorities,
-                                             arrival_times=arrival_times)
+    requests = create_requests_with_priority(
+        num_requests=3, priorities=priorities, arrival_times=arrival_times
+    )
 
     # Add requests in non-arrival order
     for request in requests:
@@ -1399,9 +1706,9 @@ def test_priority_scheduling_mixed_priority_and_arrival():
     # Create requests with mixed priorities and arrival times
     priorities = [2, 1, 1, 0]  # Mixed priorities
     arrival_times = [1.0, 3.0, 2.0, 4.0]  # Mixed arrival times
-    requests = create_requests_with_priority(num_requests=4,
-                                             priorities=priorities,
-                                             arrival_times=arrival_times)
+    requests = create_requests_with_priority(
+        num_requests=4, priorities=priorities, arrival_times=arrival_times
+    )
 
     # Add requests
     for request in requests:
@@ -1438,7 +1745,7 @@ def test_priority_scheduling_preemption():
         num_requests=2,
         priorities=[5, 5],  # Low priority
         arrival_times=[1.0, 2.0],
-        num_tokens=30  # Large enough to consume significant memory
+        num_tokens=30,  # Large enough to consume significant memory
     )
 
     # Add and schedule low priority requests
@@ -1452,10 +1759,9 @@ def test_priority_scheduling_preemption():
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in low_priority_requests],
         req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(low_priority_requests)
+            req.request_id: i for i, req in enumerate(low_priority_requests)
         },
-        sampled_token_ids=[[100] for _ in low_priority_requests],
+        sampled_token_ids=[np.array([100]) for _ in low_priority_requests],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1471,7 +1777,7 @@ def test_priority_scheduling_preemption():
         num_requests=1,
         priorities=[0],  # High priority
         arrival_times=[3.0],
-        num_tokens=30  # Large enough to require significant memory
+        num_tokens=30,  # Large enough to require significant memory
     )[0]
 
     scheduler.add_request(high_priority_request)
@@ -1512,10 +1818,8 @@ def test_priority_scheduling_no_preemption_when_space_available():
 
     # Add two low-priority running requests
     low_priority_requests = create_requests_with_priority(
-        num_requests=2,
-        priorities=[5, 5],
-        arrival_times=[1.0, 2.0],
-        num_tokens=30)
+        num_requests=2, priorities=[5, 5], arrival_times=[1.0, 2.0], num_tokens=30
+    )
 
     for request in low_priority_requests:
         scheduler.add_request(request)
@@ -1524,10 +1828,9 @@ def test_priority_scheduling_no_preemption_when_space_available():
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in low_priority_requests],
         req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(low_priority_requests)
+            req.request_id: i for i, req in enumerate(low_priority_requests)
         },
-        sampled_token_ids=[[100] for _ in low_priority_requests],
+        sampled_token_ids=[np.array([100]) for _ in low_priority_requests],
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1535,10 +1838,9 @@ def test_priority_scheduling_no_preemption_when_space_available():
     scheduler.update_from_output(output, model_output)
 
     # Add high-priority request
-    high_priority_request = create_requests_with_priority(num_requests=1,
-                                                          priorities=[0],
-                                                          arrival_times=[3.0],
-                                                          num_tokens=30)[0]
+    high_priority_request = create_requests_with_priority(
+        num_requests=1, priorities=[0], arrival_times=[3.0], num_tokens=30
+    )[0]
 
     scheduler.add_request(high_priority_request)
 
@@ -1566,7 +1868,8 @@ def test_priority_scheduling_preemption_victim_selection():
         num_requests=3,
         priorities=[3, 2, 0],  # Different priorities: low, medium, high
         arrival_times=[1.0, 2.0, 3.0],
-        num_tokens=10)
+        num_tokens=10,
+    )
 
     # Add all requests
     for request in requests:
@@ -1605,7 +1908,8 @@ def test_priority_scheduling_equal_priority_preemption():
         num_requests=3,
         priorities=[2, 2, 2],  # Same priority
         arrival_times=[3.0, 1.0, 2.0],  # Different arrival times
-        num_tokens=10)
+        num_tokens=10,
+    )
 
     # Add all requests
     for request in requests:
@@ -1641,7 +1945,8 @@ def test_priority_scheduling_waiting_queue_order():
         num_requests=4,
         priorities=[3, 1, 2, 0],  # Mixed priorities
         arrival_times=[1.0, 2.0, 3.0, 4.0],
-        num_tokens=10)
+        num_tokens=10,
+    )
 
     # Add all requests
     for request in requests:
@@ -1676,9 +1981,9 @@ def test_priority_scheduling_fcfs_fallback():
     # Create requests with same priority but different arrival times
     priorities = [1, 1, 1, 1]  # All same priority
     arrival_times = [4.0, 1.0, 3.0, 2.0]  # Different arrival times
-    requests = create_requests_with_priority(num_requests=4,
-                                             priorities=priorities,
-                                             arrival_times=arrival_times)
+    requests = create_requests_with_priority(
+        num_requests=4, priorities=priorities, arrival_times=arrival_times
+    )
 
     # Add requests
     for request in requests:
@@ -1708,7 +2013,8 @@ def test_priority_scheduling_with_limited_slots():
         num_requests=4,
         priorities=[3, 1, 2, 0],  # Mixed priorities
         arrival_times=[1.0, 2.0, 3.0, 4.0],
-        num_tokens=10)
+        num_tokens=10,
+    )
 
     # Add all requests
     for request in requests:
@@ -1746,10 +2052,12 @@ def test_priority_scheduling_heap_property():
     # Add requests in random priority order
     priorities = [5, 1, 8, 3, 2, 7, 4, 6]
     arrival_times = [float(i) for i in range(len(priorities))]
-    requests = create_requests_with_priority(num_requests=len(priorities),
-                                             priorities=priorities,
-                                             arrival_times=arrival_times,
-                                             num_tokens=10)
+    requests = create_requests_with_priority(
+        num_requests=len(priorities),
+        priorities=priorities,
+        arrival_times=arrival_times,
+        num_tokens=10,
+    )
 
     # Add all requests
     for request in requests:
@@ -1768,7 +2076,7 @@ def test_priority_scheduling_heap_property():
             model_output = ModelRunnerOutput(
                 req_ids=[req.req_id],
                 req_id_to_index={req.req_id: 0},
-                sampled_token_ids=[[100]],
+                sampled_token_ids=[np.array([100])],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[],
@@ -1776,8 +2084,7 @@ def test_priority_scheduling_heap_property():
             scheduler.update_from_output(output, model_output)
 
             # Finish the request to make room for the next one
-            scheduler.finish_requests(req.req_id,
-                                      RequestStatus.FINISHED_STOPPED)
+            scheduler.finish_requests(req.req_id, RequestStatus.FINISHED_STOPPED)
 
     # Verify requests were scheduled in priority order (lowest value first)
     expected_priorities = sorted(priorities)
@@ -1791,7 +2098,6 @@ def test_schedule_skip_tokenizer_init():
         scheduler.add_request(request)
     output = scheduler.schedule()
     assert len(output.scheduled_new_reqs) == len(requests)
-    assert output.grammar_bitmask is None
 
 
 def test_schedule_skip_tokenizer_init_structured_output_request():
@@ -1809,7 +2115,6 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
         sampling_params=sampling_params,
         pooling_params=None,
         eos_token_id=EOS_TOKEN_ID,
-        structured_output_request=StructuredOutputRequest(sampling_params),
     )
     scheduler.add_request(request)
     output = scheduler.schedule()
@@ -1818,7 +2123,12 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
     assert len(scheduler.waiting) == 1
 
 
-def test_priority_scheduling_preemption_when_out_of_kv():
+@pytest.mark.parametrize(
+    "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
+)
+def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
+    use_ec_connector, ec_role
+):
     """Test that priority scheduling preempts lower priority requests
     when out of KV cache space."""
     # Create scheduler with very limited memory to force preemption
@@ -1827,6 +2137,10 @@ def test_priority_scheduling_preemption_when_out_of_kv():
         max_num_batched_tokens=200,
         num_blocks=5,  # Can hold 64 tokens (first block is null)
         block_size=16,  # Standard block size
+        use_kv_connector=True,
+        # encoder connector should not affect test results
+        use_ec_connector=use_ec_connector,
+        ec_role=ec_role,
     )
 
     # Create a request and schedule it
@@ -1838,16 +2152,17 @@ def test_priority_scheduling_preemption_when_out_of_kv():
         starting_idx=0,
     )[0]
     scheduler.add_request(request_low)
+    # 1st schedule
     output = scheduler.schedule()
     assert len(output.scheduled_new_reqs) == 1
     assert len(scheduler.waiting) == 0
     assert len(scheduler.running) == 1
 
-    # Simulate model execution
+    # Simulate model execution - 1st decode
     model_output = ModelRunnerOutput(
         req_ids=[request_low.request_id],
         req_id_to_index={request_low.request_id: 0},
-        sampled_token_ids=[[100]],
+        sampled_token_ids=[np.array([100])],
         # spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1864,6 +2179,7 @@ def test_priority_scheduling_preemption_when_out_of_kv():
         starting_idx=1,
     )[0]
     scheduler.add_request(request_high)
+    # 2nd schedule
     output = scheduler.schedule()
     # KV cache should be full at this point
     assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == 0
@@ -1872,15 +2188,12 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     assert len(scheduler.waiting) == 0
     assert len(scheduler.running) == 2
 
-    # Simulate model execution
+    # Simulate model execution - 2nd decode
     requests = [request_low, request_high]
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
-        sampled_token_ids=[[100] for _ in requests],
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[np.array([100]) for _ in requests],
         # spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1888,7 +2201,7 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     )
     scheduler.update_from_output(output, model_output)
 
-    # Schedule again - this should trigger preemption
+    # 3rd schedule - this should trigger preemption
     # req_low needs 32 tokens = 2 blocks
     # req_high needs 33 tokens = 3 blocks
     # so doesn't fit in 4 blocks.
@@ -1898,5 +2211,1066 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     assert len(output.scheduled_new_reqs) == 0
     assert output.scheduled_cached_reqs.num_reqs == 1
     assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id
+    assert scheduler.requests[request_low.request_id].status == RequestStatus.PREEMPTED
     assert len(scheduler.waiting) == 1
-    assert len(scheduler.running) == 1
\ No newline at end of file
+    assert len(scheduler.running) == 1
+
+    # Simulate model execution - 3rd decode
+    model_output = ModelRunnerOutput(
+        req_ids=[req.request_id for req in requests],
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[np.array([]), np.array([100])],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    # Finish the requests to make room for the preempted requests to resume
+    scheduler.update_from_output(output, model_output)
+    scheduler.finish_requests(request_high.request_id, RequestStatus.FINISHED_STOPPED)
+
+    # 4th Schedule - this should trigger the resumption
+    output = scheduler.schedule()
+    scheduled_cached_reqs = output.scheduled_cached_reqs
+    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
+
+    assert len(output.scheduled_new_reqs) == 0
+    assert scheduled_cached_reqs.num_reqs == 1
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == 1
+
+    # Preempted request resumed in scheduled_cached_reqs
+    assert len(resumed_from_preemption) == 1
+    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
+    assert resumed_from_preemption[0]
+    assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
+    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    # Resumed tokens include 30 prompt tokens and 2 decoded tokens
+    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 32
+    assert scheduled_cached_reqs.resumed_req_token_ids[0][31] == 100
+
+
+@pytest.mark.parametrize(
+    ("enable_chunked_prefill", "is_encoder_decoder", "expect_enabled"),
+    [
+        (True, False, True),
+        (False, False, False),
+        # Encoder-decoder models should always have it disabled
+        (False, True, False),
+        (True, True, False),
+    ],
+)
+def test_chunked_prefill_disabled_for_encoder_decoder(
+    enable_chunked_prefill: bool, is_encoder_decoder: bool, expect_enabled: bool
+) -> None:
+    """Validate that chunked prefill is appropriately disabled for
+    encoder-decoder models."""
+    scheduler_config = SchedulerConfig(
+        enable_chunked_prefill=enable_chunked_prefill,
+        is_encoder_decoder=is_encoder_decoder,
+        # Must <= max_num_batched_tokens if chunked prefill is disabled
+        max_model_len=SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
+    )
+
+    # `is_encoder_decoder` should only be used during construction
+    # of the config, and otherwise stored in the model config.
+    assert "is_encoder_decoder" not in vars(scheduler_config)
+    assert "is_encoder_decoder" not in [
+        f.name for f in dataclasses.fields(scheduler_config)
+    ]
+    _validate_chunked_prefill_settings_for_encoder_decoder(
+        scheduler_config, is_encoder_decoder, expect_enabled
+    )
+
+    # Ensure it is retained in VllmConfig, even after its post-init.
+    vllm_config = VllmConfig(scheduler_config=scheduler_config)
+    _validate_chunked_prefill_settings_for_encoder_decoder(
+        vllm_config.scheduler_config, is_encoder_decoder, expect_enabled
+    )
+
+
+def _validate_chunked_prefill_settings_for_encoder_decoder(
+    scheduler_config: SchedulerConfig, is_encoder_decoder: bool, expect_enabled: bool
+) -> None:
+    """Validate chunked prefill settings in the scheduler config for
+    encoder-decoder models."""
+    assert scheduler_config.enable_chunked_prefill is expect_enabled
+    if is_encoder_decoder:
+        # Encoder-decoder models should automatically disable chunked multimodal
+        # inputs as well
+        assert scheduler_config.disable_chunked_mm_input is not expect_enabled
+    if is_encoder_decoder and not expect_enabled:
+        assert scheduler_config.long_prefill_token_threshold == 0
+
+
+# ==============================================================================
+# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests start
+# NOTE: In E->P->D disagg case, both KV and EC Connector works in P instance
+# Unless specify, the existence of KV Connector should not affect any test results
+# ==============================================================================
+
+
+def _assert_right_encoder_cache_allocated(
+    scheduler: Scheduler,
+    hashes_to_check: list[str] | None = None,
+    requests: list[Request] | None = None,
+    expected_total_allocated: int | None = None,
+):
+    """Check whether encoder cache is allocated correctly."""
+    encoder_cache_manager = scheduler.encoder_cache_manager
+
+    # Verify encoder cache manager exists
+    assert encoder_cache_manager is not None, "Encoder cache manager should exist"
+
+    # Verify number of cache
+    if expected_total_allocated is not None:
+        assert len(encoder_cache_manager.cached) == expected_total_allocated
+        if expected_total_allocated == 0:
+            return
+
+    # Verify each request with MM data is in cache
+    cached_hashes = set(encoder_cache_manager.cached.keys())
+
+    if hashes_to_check:
+        missed_hashes = set(hashes_to_check) - cached_hashes
+        assert not missed_hashes, (
+            f"Miss hashes: {missed_hashes} "
+            f"Existing encoder cache: {encoder_cache_manager.cached}"
+        )
+
+    for req in requests if requests is not None else []:
+        if req.mm_features:
+            mm_hashes = [f.identifier for f in req.mm_features]
+            req_hashes = set(mm_hashes)  # unique hashes set
+            missed_hashes = req_hashes - cached_hashes
+            assert not missed_hashes, (
+                f"Miss hashes in cache for request {req.request_id}: {missed_hashes} "
+                f"Existing encoder cache: {encoder_cache_manager.cached}"
+            )
+
+
+def _assert_right_ec_connector_metadata(
+    output: SchedulerOutput,
+    mm_features_list: list[MultiModalFeatureSpec],
+):
+    """Verify that ECConnector metadata EXACTLY matches the input MM data"""
+    # Get the connector metadata
+    metadata = output.ec_connector_metadata
+
+    # Create lookup dictionaries for efficient access
+    metadata_dict = {mm_data.mm_hash: mm_data for mm_data in metadata.mm_datas}
+
+    # Check all required identifiers exist in metadata; and no extra
+    # In ECSharedStorageConnector format
+    # NOTE: even having same identifier, the mm_features can be different
+    # since their mm_position can be in different offsets, etc
+    identifiers_dict = {f.identifier for f in mm_features_list}
+    assert set(metadata_dict.keys()) == identifiers_dict
+
+    # Verify the info matches
+    for i, mm_feature in enumerate(mm_features_list):
+        identifier = mm_feature.identifier
+        assert metadata_dict[identifier].mm_hash == identifier
+        assert metadata_dict[identifier].num_token == mm_feature.mm_position.length
+
+
+def _assert_right_encoder_inputs(
+    output: SchedulerOutput,
+    check_exist: bool | None = True,
+    requests: list[Request] | None = None,
+    expected_encoder_inputs: list[list[int]] | None = None,
+    expected_total_reqs: int | None = None,
+):
+    """Verify that requests/mm_hashes should (not) in scheduled encoder input
+    If check_exist is False, this function returns True
+    if requests are NOT in encoder inputs"""
+
+    # Get the scheduled encoder inputs
+    # NOTE: scheduled_encoder_inputs is a dictionary with request id as key
+    scheduled_encoder_inputs = output.scheduled_encoder_inputs
+
+    # Check if scheduled_encoder_inputs is empty as expected
+    if expected_total_reqs is not None:
+        assert len(scheduled_encoder_inputs) == expected_total_reqs
+        if expected_total_reqs == 0:
+            return
+
+    # Number of expected enocder inputs should match number of requests
+    if expected_encoder_inputs:
+        assert check_exist and requests is not None  # only support expect input exist
+        assert len(requests) == len(expected_encoder_inputs)
+
+    # Check request (not) exist as expected
+    for i, request in enumerate(requests if requests is not None else []):
+        assert (request.request_id in scheduled_encoder_inputs) is check_exist, (
+            f"Request {request.id} presence mismatch: expected {check_exist}, "
+            f"got {request.id in scheduled_encoder_inputs}"
+        )
+        if expected_encoder_inputs:
+            scheduled_encoder_input = scheduled_encoder_inputs[request.request_id]
+            assert scheduled_encoder_input == expected_encoder_inputs[i]
+
+
+def test_scheduler_no_ec_connector_by_default():
+    """Test scheduler doesn't have EC connector by default."""
+    scheduler = create_scheduler()
+    assert scheduler.ec_connector is None
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_text_only_request(use_kv_connector):
+    """Test text-only requests don't allocate encoder cache."""
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        use_kv_connector=use_kv_connector,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    NUM_PROMPT_TOKENS = 100
+
+    # Create text-only request (no mm_positions)
+    requests = create_requests(
+        num_requests=1,
+        num_tokens=NUM_PROMPT_TOKENS,
+    )
+    assert not requests[0].mm_features  # No MM data
+
+    scheduler.add_request(requests[0])
+    output = scheduler.schedule()
+
+    # Should schedule
+    assert len(output.scheduled_new_reqs) == 1
+
+    # Scheduled tokens should equal prompt tokens exactly
+    scheduled = output.num_scheduled_tokens[requests[0].request_id]
+    assert scheduled == NUM_PROMPT_TOKENS, (
+        f"Text-only should schedule {NUM_PROMPT_TOKENS}, got {scheduled}"
+    )
+
+    # Encoder cache should be empty
+    _assert_right_encoder_cache_allocated(scheduler, expected_total_allocated=0)
+
+    # ECConnector should carry no metadata
+    _assert_right_ec_connector_metadata(output, mm_features_list=[])
+
+    # Scheduled encoder input should be empty; no mm to compute
+    _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_cache_hit_external_load(use_kv_connector):
+    """Test ec_consumer loads from external cache when hit.
+    A normal basic operation for EPD disaggrgation"""
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        enable_prefix_caching=True,
+        # kv connector should not effect test results
+        use_kv_connector=use_kv_connector,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    # Create MM request
+    NUM_TOKENS = 200  # NOTE: includes mm tokens
+    NUM_ENCODER_TOKENS = 100
+    mm_hashes_list = [["hash_test1"]]
+    mm_positions = [[PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS)]]
+
+    request = create_requests(
+        num_requests=1,
+        num_tokens=NUM_TOKENS,
+        mm_hashes_list=mm_hashes_list,
+        mm_positions=mm_positions,
+    )[0]
+
+    # Mock cache hit - encoder cache exists externally
+    scheduler.ec_connector.has_caches = Mock(return_value=[True])
+    scheduler.ec_connector.update_state_after_alloc = Mock(
+        wraps=scheduler.ec_connector.update_state_after_alloc
+    )
+
+    scheduler.add_request(request)
+    output = scheduler.schedule()
+
+    # Should schedule prompt tokens
+    scheduled_tokens = output.num_scheduled_tokens[request.request_id]
+    assert scheduled_tokens == NUM_TOKENS
+
+    # Should called update_state_after_alloc for external load
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request, 0)
+
+    # Encoder cache should contain mm items from request
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request])
+
+    # ECConnector should carry metadata of request
+    _assert_right_ec_connector_metadata(output, mm_features_list=request.mm_features)
+
+    # Scheduled encoder input should be empty; no mm to compute
+    _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_cache_miss_computes_locally(use_kv_connector):
+    """Test consumer can compute encoder locally when cache miss (fallback)."""
+    # encoder cache itself if it doesn't receive it from external storage
+
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        enable_prefix_caching=True,
+        use_kv_connector=use_kv_connector,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    # Verify consumer role
+    assert scheduler.ec_connector is not None
+    assert not scheduler.ec_connector.is_producer
+
+    # Create MM request
+    request_mm_missed = create_requests(
+        num_requests=1,
+        num_tokens=200,  # Total (including 100 MM)
+        mm_positions=[[PlaceholderRange(offset=0, length=100)]],  # 100 MM tokens
+    )[0]
+
+    # Mock cache miss - encoder cache doesn't exist externally
+    scheduler.ec_connector.has_caches = Mock(return_value=[False])
+
+    scheduler.add_request(request_mm_missed)
+    output = scheduler.schedule()
+
+    # SCHEDULER should decide to compute encoder locally (fallback)
+    assert len(output.scheduled_new_reqs) == 1
+
+    # Should schedule full prompt tokens
+    scheduled_tokens = output.num_scheduled_tokens[request_mm_missed.request_id]
+    assert scheduled_tokens == 200, (
+        f"Expected 200 tokens on cache miss, got {scheduled_tokens}"
+    )
+
+    # Encoder cache should contain mm items from request
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request_mm_missed])
+
+    # ECConnector should carry no metadata (missed cache)
+    _assert_right_ec_connector_metadata(output, mm_features_list=[])
+
+    # Scheduled encoder input contain mm for request_mm_missed
+    _assert_right_encoder_inputs(
+        output,
+        requests=[request_mm_missed],
+        expected_encoder_inputs=[[0]],  # index 0 of the mm item
+        expected_total_reqs=1,
+    )
+
+    # Then MODEL_RUNNER will execute the encoder and cache the result
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
+    """Test consumer with partial cache hit (local & connector) with 2 requests."""
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        enable_prefix_caching=True,
+        use_kv_connector=use_kv_connector,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    # Create MM request
+    NUM_TOKENS_1 = 300  # NOTE: includes mm tokens
+    NUM_ENCODER_TOKENS_1 = 50
+    mm_hashes_list_1 = [["hash1_A", "hash1_B", "hash1_A", "hash1_F"]]
+    mm_positions_1 = [
+        [
+            PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_1),
+            PlaceholderRange(offset=100, length=NUM_ENCODER_TOKENS_1),
+            PlaceholderRange(offset=200, length=NUM_ENCODER_TOKENS_1),
+            PlaceholderRange(offset=250, length=NUM_ENCODER_TOKENS_1),
+        ]
+    ]
+
+    # Create request with 4 MM items, with 2 identical items
+    request1 = create_requests(
+        num_requests=1,
+        num_tokens=NUM_TOKENS_1,
+        mm_hashes_list=mm_hashes_list_1,
+        mm_positions=mm_positions_1,
+        max_tokens=1,  # For simplicity
+    )[0]
+
+    # Mock partial cache hit: 1st and 3rd missing, 2nd and 4th exist
+    scheduler.ec_connector.has_caches = Mock(return_value=[False, True, False, True])
+    scheduler.ec_connector.update_state_after_alloc = Mock(
+        wraps=scheduler.ec_connector.update_state_after_alloc
+    )
+
+    scheduler.add_request(request1)
+    output = scheduler.schedule()
+
+    # Should schedule all tokens
+    scheduled_tokens = output.num_scheduled_tokens[request1.request_id]
+    assert scheduled_tokens == NUM_TOKENS_1
+
+    # Encoder cache should contain all mm items from request
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request1])
+
+    # Should have called update_state_after_alloc for external load
+    scheduler.ec_connector.update_state_after_alloc.assert_called()
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # ECConnector should carry metadata for 2nd and 4th mm item
+    _assert_right_ec_connector_metadata(
+        output, mm_features_list=[request1.mm_features[1], request1.mm_features[3]]
+    )
+
+    # Should schedule ONLY 1 encoder input (index 0), no repeat for identical items
+    _assert_right_encoder_inputs(
+        output,
+        requests=[request1],
+        expected_encoder_inputs=[[0]],  # index 0 of the mm item ONLY
+        expected_total_reqs=1,
+    )
+
+    # Simulate model execution 1 step
+    model_output = ModelRunnerOutput(
+        req_ids=[request1.request_id],
+        req_id_to_index={request1.request_id: 0},
+        sampled_token_ids=[np.array([100])],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    scheduler.update_from_output(output, model_output)
+
+    # request1 is finished after outputing 1 token
+    # Finish request
+    scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED)
+
+    # Create another request with 4 MM items
+    NUM_TOKENS_2 = 400
+    NUM_ENCODER_TOKENS_2 = 50
+    mm_hashes_list_2 = [["hash1_C", "hash1_D", "hash1_E", "hash1_A"]]
+    mm_positions_2 = [
+        [
+            PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_2),
+            PlaceholderRange(offset=100, length=NUM_ENCODER_TOKENS_2),
+            PlaceholderRange(offset=200, length=NUM_ENCODER_TOKENS_2),
+            PlaceholderRange(offset=250, length=NUM_ENCODER_TOKENS_2),
+        ]
+    ]
+
+    request2 = create_requests(
+        num_requests=1,
+        num_tokens=NUM_TOKENS_2,
+        mm_hashes_list=mm_hashes_list_2,
+        mm_positions=mm_positions_2,
+        max_tokens=1,  # For simplicity
+    )[0]
+
+    # Mock partial cache hit: only hash1_A and hash1_C exist in connector
+    scheduler.ec_connector.has_caches = Mock(return_value=[True, False, False, True])
+
+    scheduler.add_request(request2)
+    output = scheduler.schedule()
+
+    # Check
+    # Should schedule all tokens
+    scheduled_tokens = output.num_scheduled_tokens[request2.request_id]
+    assert scheduled_tokens == 400
+
+    # Encoder cache should contain all mm items from request2
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request2])
+
+    # Should call update_state_after_alloc for hash1_C, ONLY
+    # hash1_A should not be loaded from connector
+    # since it's computed in last request & exist in local cache
+    # Order of getting encoder cache should be: local cache -> connector-> compute
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request2, 0)
+    scheduler.ec_connector.update_state_after_alloc.assert_called_once()
+
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # ECConnector should carry metadata for hash1_C only (index 0)
+    _assert_right_ec_connector_metadata(
+        output, mm_features_list=[request2.mm_features[0]]
+    )
+
+    # Should schedule 2 encoder input hash1_D and hash1_E (index 1, 2)
+    _assert_right_encoder_inputs(
+        output,
+        requests=[request2],
+        expected_encoder_inputs=[[1, 2]],
+        expected_total_reqs=1,
+    )
+
+
+@pytest.mark.parametrize("cache_exist", ["local", "connector_only", "no_where"])
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_num_seqs=10,  # allow multiple requests
+        max_num_batched_tokens=2048,
+        enable_prefix_caching=True,
+        use_kv_connector=use_kv_connector,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+    mm_hashes_list = [[f"hash_{i}"] for i in range(10)]
+    mm_positions = [[PlaceholderRange(offset=i, length=100)] for i in range(10)]
+    requests = create_requests(
+        num_requests=10,
+        num_tokens=200,
+        mm_hashes_list=mm_hashes_list,
+        mm_positions=mm_positions,
+    )
+    for request in requests:
+        scheduler.add_request(request)
+
+    # Set up to test different encoder cache exsistence scenario after preemption
+    # Order of getting encoder cache should be: local cache -> connector-> compute
+    scheduler.ec_connector.update_state_after_alloc = Mock(
+        wraps=scheduler.ec_connector.update_state_after_alloc
+    )
+
+    if cache_exist == "local":
+        # Allocate cache to cache manager manually to mimick
+        for req in requests:
+            scheduler.encoder_cache_manager.allocate(req, 0)
+    else:
+        # Make sure local encoder cache empty
+        scheduler.encoder_cache_manager.cached = {}
+
+    if cache_exist == "connector_only":
+        # Cache exist in ec_connector
+        scheduler.ec_connector.has_caches = Mock(return_value=[True])
+    elif cache_exist == "no_where":
+        scheduler.ec_connector.has_caches = Mock(return_value=[False])
+
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == len(requests)
+    assert output.scheduled_cached_reqs.num_reqs == 0
+    assert len(output.finished_req_ids) == 0
+    for req_id, num_tokens in output.num_scheduled_tokens.items():
+        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
+
+    ## Encoder-cache-specific checks:
+    # mm_hashes of requests exist in cache after scheduling for all scenario
+    _assert_right_encoder_cache_allocated(scheduler, requests=requests)
+
+    # Should only call update_state_after_alloc when loaded externally
+    if cache_exist == "connector_only":
+        scheduler.ec_connector.update_state_after_alloc.assert_called_with(
+            requests[-1], 0
+        )
+
+        # Concat mm_features for the 10 requests together
+        mm_features_list = [feature for req in requests for feature in req.mm_features]
+
+        # Check metadata should contain mm data for all 10 requests
+        _assert_right_ec_connector_metadata(output, mm_features_list=mm_features_list)
+    else:
+        scheduler.ec_connector.update_state_after_alloc.assert_not_called()
+        # ECConnector should carry no metadata
+        _assert_right_ec_connector_metadata(output, mm_features_list=[])
+
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # Should only schedule encoder input when cache is not found anywhere
+    if cache_exist == "no_where":
+        _assert_right_encoder_inputs(
+            output,
+            requests=requests,
+            expected_encoder_inputs=[[0] for _ in range(10)],
+            expected_total_reqs=10,
+        )
+    else:
+        _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_unable_to_allocate(use_kv_connector):
+    """
+    Test whether scheduler with ECConnector is able to handle
+    unable to allocate (run out of blocks).
+    """
+
+    # Setup Scheduler With Mock External Cache Hit.
+    BLOCK_SIZE = 4
+    NUM_BLOCKS = 10
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        enable_prefix_caching=True,
+        use_kv_connector=use_kv_connector,
+        block_size=BLOCK_SIZE,
+        num_blocks=NUM_BLOCKS,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    # Mock ec_connector load external cache behavior
+    scheduler.ec_connector.has_caches = Mock(return_value=[True])
+    scheduler.ec_connector.update_state_after_alloc = Mock(
+        wraps=scheduler.ec_connector.update_state_after_alloc
+    )
+
+    # Create two requests. The second request will not be able to
+    # allocate slots because it will not have enough blocks.
+    NUM_REQUESTS = 2
+    NUM_TOKENS = (NUM_BLOCKS // 2 + 1) * BLOCK_SIZE
+    MAX_TOKENS = 2
+    requests = create_requests(
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+        mm_hashes_list=[["hash_1"], ["hash_2"]],
+        mm_positions=[
+            [PlaceholderRange(offset=1, length=10)] for _ in range(NUM_REQUESTS)
+        ],
+        max_tokens=MAX_TOKENS,
+        block_size=BLOCK_SIZE,
+    )
+    req_ids = []
+    req_to_index = {}
+    for i, request in enumerate(requests):
+        scheduler.add_request(request)
+        req_ids.append(request.request_id)
+        req_to_index[request.request_id] = i
+
+    # Setup MODEL_RUNNER_OUTPUT to be run in _step_until_done later
+    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
+        req_ids=req_ids,
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[np.array([1000])] * len(req_ids),
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+
+    # Just one request should be running.
+    output = scheduler.schedule()
+    scheduled_tokens = output.num_scheduled_tokens[scheduler.running[0].request_id]
+    assert scheduled_tokens == NUM_TOKENS
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 1
+
+    # Should have called update_state_after_alloc for external load
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(
+        scheduler.running[0], 0
+    )
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # All memory should be freed, with one request waiting.
+    _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT)
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
+    assert len(scheduler.running) == 0
+    assert len(scheduler.waiting) == 1
+
+    # Just one request should be running.
+    output = scheduler.schedule()
+    scheduled_tokens = output.num_scheduled_tokens[scheduler.running[0].request_id]
+    assert scheduled_tokens == NUM_TOKENS
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 0
+
+    # update_state_after_alloc should be called for loading external cache
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(
+        scheduler.running[0], 0
+    )
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # All memory should be freed, with no requests waiting / running.
+    _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT)
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
+    assert len(scheduler.running) == 0
+    assert len(scheduler.waiting) == 0
+
+
+@pytest.mark.parametrize("cache_exist", ["local", "connector_only", "no_where"])
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_priority_scheduling_ec_connector_preemption_and_resumption(
+    cache_exist, use_kv_connector
+):
+    """Test that priority scheduling preempts lower priority requests
+    when out of KV cache space."""
+    # Create scheduler with very limited memory to force preemption
+    scheduler = create_scheduler_with_priority(
+        model="llava-hf/llava-1.5-7b-hf",
+        enable_prefix_caching=True,
+        max_num_seqs=2,  # allow multiple requests
+        # kv connector should not effect test results
+        use_kv_connector=use_kv_connector,
+        num_blocks=15,  # can hold 244 tokens with 14 blocks (first block is null)
+        block_size=16,  # standard block size
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    # Mock cache hit: Both cache exist in connector (at E->PD initially)
+    scheduler.ec_connector.has_caches = Mock(return_value=[True])
+    scheduler.ec_connector.update_state_after_alloc = Mock(
+        wraps=scheduler.ec_connector.update_state_after_alloc
+    )
+
+    # Create a request and schedule it (and to be preempted)
+    request_low = create_requests_with_priority(
+        num_requests=1,
+        priorities=[1],
+        arrival_times=[0.0],
+        num_tokens=94,
+        mm_hashes_list=[["hash_low"]],
+        # NOTE: this test only preempt the last block.
+        # Setting mm_position at the last block can force to recompute encoding
+        mm_positions=[[PlaceholderRange(offset=82, length=10)]],
+        starting_idx=0,
+    )[0]
+    scheduler.add_request(request_low)
+    # 1st schedule
+    output = scheduler.schedule()
+
+    assert len(output.scheduled_new_reqs) == 1
+    scheduled_tokens = output.num_scheduled_tokens[request_low.request_id]
+    assert scheduled_tokens == 94
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == 1
+
+    ## Encoder-cache-specific checks:
+    # Encoder cache should contain mm items from request
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request_low])
+
+    # Verify update_state_after_alloc called (external load)
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request_low, 0)
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # ECConnector should carry metadata of request
+    _assert_right_ec_connector_metadata(
+        output, mm_features_list=request_low.mm_features
+    )
+
+    # Scheduled encoder input should be empty; no mm to compute
+    _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+    # Simulate model execution - 1st decode
+    model_output = ModelRunnerOutput(
+        req_ids=[request_low.request_id],
+        req_id_to_index={request_low.request_id: 0},
+        sampled_token_ids=[np.array([100])],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    scheduler.update_from_output(output, model_output)
+
+    # Create a high priority request and schedule it
+    request_high = create_requests_with_priority(
+        num_requests=1,
+        priorities=[0],
+        arrival_times=[1.0],
+        num_tokens=128,
+        mm_hashes_list=[["hash_high"]],
+        mm_positions=[[PlaceholderRange(offset=1, length=10)]],
+        max_tokens=2,
+        starting_idx=1,
+    )[0]
+    scheduler.add_request(request_high)
+    # 2nd schedule
+    output = scheduler.schedule()
+
+    # KV cache should be full at this point
+    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == 0
+    assert len(output.scheduled_new_reqs) == 1
+    assert output.scheduled_cached_reqs.num_reqs == 1
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == 2
+
+    ## Encoder-cache-specific checks:
+    # Encoder cache should contain mm items from request
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request_high])
+
+    # Verify update_state_after_alloc called (external load)
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request_high, 0)
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # ECConnector should carry metadata of request
+    _assert_right_ec_connector_metadata(
+        output, mm_features_list=request_high.mm_features
+    )
+
+    # Scheduled encoder input should be empty; no mm to compute
+    _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+    # Simulate model execution - 2nd decode
+    requests = [request_low, request_high]
+    model_output = ModelRunnerOutput(
+        req_ids=[req.request_id for req in requests],
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[np.array([100]) for _ in requests],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    scheduler.update_from_output(output, model_output)
+
+    # 3rd schedule - - this should trigger preemption
+    # req_low needs 96 tokens = 6 blocks
+    # req_high needs 129 tokens = 9 blocks
+    # so doesn't fit in 14 blocks.
+    output = scheduler.schedule()
+
+    # Should have preempted req_low
+    assert len(output.scheduled_new_reqs) == 0
+    assert output.scheduled_cached_reqs.num_reqs == 1
+    assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id
+    assert scheduler.requests[request_low.request_id].status == RequestStatus.PREEMPTED
+    assert len(scheduler.waiting) == 1
+    assert len(scheduler.running) == 1
+
+    ## Encoder-cache-specific checks:
+    # request_high is in decode phase now
+    # ECConnector should carry no metadata
+    _assert_right_ec_connector_metadata(output, mm_features_list=[])
+
+    # Scheduled encoder input should be empty; no mm to compute
+    _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+    # Simulate model execution - 3rd decode, after req_low was preempted
+    requests = [request_low, request_high]
+    model_output = ModelRunnerOutput(
+        req_ids=[req.request_id for req in requests],
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
+        sampled_token_ids=[np.array([100]), np.array([100, 200])],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    # Finish the requests to make room for the preempted requests to resume
+    # req_high is finished after outputing 2 tokens
+    scheduler.update_from_output(output, model_output)
+    scheduler.finish_requests(
+        request_high.request_id, RequestStatus.FINISHED_LENGTH_CAPPED
+    )
+
+    # Set up to test different encoder cache exsistence scenario after preemption
+    # Order of getting encoder cache should be: local cache -> connector-> compute
+    # By default, the cache should still exist in local in this test case
+    if cache_exist != "local":
+        # Make local encoder cache empty
+        scheduler.encoder_cache_manager.cached = {}
+
+    if cache_exist == "connector_only":
+        # Cache exist in ec_connector
+        scheduler.ec_connector.has_caches = Mock(return_value=[True])
+    elif cache_exist == "no_where":
+        scheduler.ec_connector.has_caches = Mock(return_value=[False])
+
+    # 4th Schedule - this should trigger req_low resumption from waiting
+    output = scheduler.schedule()
+    scheduled_cached_reqs = output.scheduled_cached_reqs
+    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
+
+    assert len(output.scheduled_new_reqs) == 0
+    assert scheduled_cached_reqs.num_reqs == 1
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == 1
+
+    # Preempted request resumed in scheduled_cached_reqs
+    assert len(resumed_from_preemption) == 1
+    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
+    assert resumed_from_preemption[0]
+    assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
+    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    ## Resumed tokens include 94 prompt tokens and 2 decoded tokens
+    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 96
+    assert scheduled_cached_reqs.resumed_req_token_ids[0][95] == 100
+    assert scheduler.running[0].request_id == request_low.request_id
+    assert request_high.request_id in output.finished_req_ids
+
+    ## Encoder-cache-specific checks:
+    # mm_hash of request_low exists in cache after scheduling for all scenario
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request_low])
+
+    # Should only call update_state_after_alloc when loaded externally
+    if cache_exist == "connector_only":
+        scheduler.ec_connector.update_state_after_alloc.assert_called_with(
+            request_low, 0
+        )
+        _assert_right_ec_connector_metadata(
+            output, mm_features_list=request_low.mm_features
+        )
+    else:
+        scheduler.ec_connector.update_state_after_alloc.assert_not_called()
+        # ECConnector should carry no metadata
+        _assert_right_ec_connector_metadata(output, mm_features_list=[])
+
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # Should only schedule encoder input when cache is not found anywhere
+    if cache_exist == "no_where":
+        _assert_right_encoder_inputs(
+            output,
+            requests=[request_low],
+            expected_encoder_inputs=[[0]],
+            expected_total_reqs=1,
+        )
+    else:
+        _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connector):
+    """
+    Scenario:
+      - Encoder cache size: 32
+      - Request A: 1 feature (12 tokens) → NOT cached remotely.
+      - Request B: 3 features (3 x 10 tokens) → ALL cached remotely.
+
+    Steps:
+      1. Schedule Request A (locally uses 12 tokens).
+      2. Schedule Request B (remote cache) - only schedule 1st and 2nd
+      3. Free A's cache, then schedule B again (continuation) - schedule 3rd image
+    """
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_num_batched_tokens=1024,
+        enable_prefix_caching=True,
+        use_kv_connector=use_kv_connector,
+        block_size=16,
+        num_blocks=11,  # Can hold 160 tokens (first block is null)
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    # Limit the number of availiable slots of EncoderCacheManager
+    scheduler.encoder_cache_manager = EncoderCacheManager(cache_size=32)
+
+    # Create MM request1
+    NUM_TOKENS_1 = 50  # NOTE: includes mm tokens
+    NUM_ENCODER_TOKENS_1 = 12
+    mm_hashes_list_1 = [["hash1_1"]]
+    mm_positions_1 = [[PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_1)]]
+
+    request1 = create_requests(
+        num_requests=1,
+        num_tokens=NUM_TOKENS_1,
+        mm_hashes_list=mm_hashes_list_1,
+        mm_positions=mm_positions_1,
+        max_tokens=1,  # For simplicity
+        req_ids=["req1"],
+    )[0]
+
+    # Create MM request1 with 3 MM items
+    NUM_TOKENS_2 = 40
+    NUM_ENCODER_TOKENS_2 = 10
+    mm_hashes_list_2 = [["hash2_1", "hash2_2", "hash2_3"]]
+    mm_positions_2 = [
+        [
+            PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_2),
+            PlaceholderRange(offset=12, length=NUM_ENCODER_TOKENS_2),
+            PlaceholderRange(offset=24, length=NUM_ENCODER_TOKENS_2),
+        ]
+    ]
+
+    request2 = create_requests(
+        num_requests=1,
+        num_tokens=NUM_TOKENS_2,
+        mm_hashes_list=mm_hashes_list_2,
+        mm_positions=mm_positions_2,
+        max_tokens=10,
+        req_ids=["req2"],
+    )[0]
+
+    # Mock cache hit: MM of request1 NOT cached remotely, request2 cached remotely
+    scheduler.ec_connector.has_caches = Mock(
+        side_effect=lambda req: [True, True, True] if req == request2 else [False]
+    )
+    scheduler.ec_connector.update_state_after_alloc = Mock(
+        wraps=scheduler.ec_connector.update_state_after_alloc
+    )
+
+    scheduler.add_request(request1)
+    scheduler.add_request(request2)
+    output = scheduler.schedule()
+
+    # Now, since encoder cache manager can only store 32 tokens
+    # It should allocated mm item hash1_1, hash2_1 and hash2_2
+    scheduled_tokens = output.num_scheduled_tokens[request1.request_id]
+    assert scheduled_tokens == NUM_TOKENS_1
+    assert scheduler.get_num_unfinished_requests() == 2
+
+    # Encoder cache should contain mm item from request1
+    _assert_right_encoder_cache_allocated(
+        scheduler, hashes_to_check=["hash1_1", "hash2_1", "hash2_2"]
+    )
+
+    # request2's 2nd mm item is the last call of update_state_after_alloc
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request2, 1)
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # ECConnector should carry metadata of hash2_1 and hash2_2 ONLY
+    _assert_right_ec_connector_metadata(
+        output, mm_features_list=[request2.mm_features[0], request2.mm_features[1]]
+    )
+
+    # Should schedule ONLY 1 encoder input
+    _assert_right_encoder_inputs(
+        output,
+        requests=[request1],
+        expected_encoder_inputs=[[0]],  # index 0 of the mm item of request1
+        expected_total_reqs=1,
+    )
+
+    # Simulate model execution 1 step
+    model_output = ModelRunnerOutput(
+        req_ids=[request1.request_id, request2.request_id],
+        req_id_to_index={request1.request_id: 0, request2.request_id: 1},
+        sampled_token_ids=[np.array([100]), np.array([121])],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    scheduler.update_from_output(output, model_output)
+
+    # request1 is finished after outputing 1 token
+    # Finish request
+    scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED)
+    assert scheduler.get_num_unfinished_requests() == 1
+
+    # Schedule again; Now request1's encoder cache should be freed
+    # -> hash2_3 can be scheduled and allocated
+    output = scheduler.schedule()
+
+    # Check
+    # Should schedule all tokens
+    scheduled_tokens = output.num_scheduled_tokens[request2.request_id]
+    print(f"Hero: scheduled_tokens for req2: {scheduled_tokens}")
+    print(f"hero: num_scheduled_tokens 2: {output.num_scheduled_tokens}")
+
+    # Encoder cache should contain all mm items from request2
+    _assert_right_encoder_cache_allocated(scheduler, requests=[request2])
+
+    # request2's 3rd mm item is the ONLY call of update_state_after_alloc
+    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request2, 2)
+    scheduler.ec_connector.update_state_after_alloc.assert_called_once()
+
+    scheduler.ec_connector.update_state_after_alloc.reset_mock()
+
+    # ECConnector should carry metadata for hash2_3 ONLY
+    _assert_right_ec_connector_metadata(
+        output, mm_features_list=[request2.mm_features[2]]
+    )
+
+    # Should schedule no encoder input
+    _assert_right_encoder_inputs(
+        output,
+        expected_total_reqs=0,
+    )
+
+
+# ==============================================================================
+# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end
+# ==============================================================================
diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py
index bd0320baef87118059fcace65224c41eb8c279f1..f1df4e95d5f4963d578b16edbdacd09dd99da292 100644
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -1,27 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 
 import pytest
 
 from vllm import LLM
 
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 
 
 @pytest.fixture(scope="module")
 def llm() -> LLM:
-    return LLM(MODEL,
-               enforce_eager=True,
-               enable_prefix_caching=True,
-               long_prefill_token_threshold=2,
-               max_num_batched_tokens=6,
-               max_num_seqs=3,
-               block_size=16)
+    return LLM(
+        MODEL,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        long_prefill_token_threshold=2,
+        max_num_batched_tokens=6,
+        max_num_seqs=3,
+        block_size=16,
+    )
 
 
 def test_concurrent_partial_prefill(llm):
diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py
index 1f6825b6d24e591ab122bdde94a8d6d7157ffca6..a27f32938c08b6915398a730e7720e77be512ec4 100644
--- a/tests/v1/core/test_single_type_kv_cache_manager.py
+++ b/tests/v1/core/test_single_type_kv_cache_manager.py
@@ -3,28 +3,32 @@
 
 import random
 
+import pytest
 import torch
 
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
-                                         make_block_hash_with_group_id)
+from vllm.v1.core.kv_cache_utils import (
+    BlockHash,
+    KVCacheBlock,
+    make_block_hash_with_group_id,
+)
 from vllm.v1.core.single_type_kv_cache_manager import (
-    ChunkedLocalAttentionManager, SlidingWindowManager)
-from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
-                                        SlidingWindowSpec)
+    ChunkedLocalAttentionManager,
+    SlidingWindowManager,
+)
+from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowSpec
+
+pytestmark = pytest.mark.cpu_test
 
 
 def get_sliding_window_manager(sliding_window_spec, block_pool):
-    return SlidingWindowManager(sliding_window_spec,
-                                block_pool,
-                                kv_cache_group_id=0)
+    return SlidingWindowManager(sliding_window_spec, block_pool, kv_cache_group_id=0)
 
 
-def get_chunked_local_attention_manager(chunked_local_attention_spec,
-                                        block_pool):
-    return ChunkedLocalAttentionManager(chunked_local_attention_spec,
-                                        block_pool,
-                                        kv_cache_group_id=0)
+def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
+    return ChunkedLocalAttentionManager(
+        chunked_local_attention_spec, block_pool, kv_cache_group_id=0
+    )
 
 
 def test_chunked_local_attention_possible_cached_prefix():
@@ -38,8 +42,9 @@ def test_chunked_local_attention_possible_cached_prefix():
     )
 
     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
-    manager = get_chunked_local_attention_manager(chunked_local_attention_spec,
-                                                  block_pool)
+    manager = get_chunked_local_attention_manager(
+        chunked_local_attention_spec, block_pool
+    )
 
     def run_one_case(block_is_cached, tail_token, expect_length):
         block_hash_list = [
@@ -49,12 +54,14 @@ def test_chunked_local_attention_possible_cached_prefix():
         block_pool.cached_block_hash_to_block._cache.clear()
 
         # Mock the block pool with the cached blocks
-        for i, (block_hash,
-                is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
+        for i, (block_hash, is_cached) in enumerate(
+            zip(block_hash_list, block_is_cached)
+        ):
             if is_cached:
                 block_pool.cached_block_hash_to_block.insert(
                     make_block_hash_with_group_id(block_hash, 0),
-                    block_pool.blocks[i + 10])
+                    block_pool.blocks[i + 10],
+                )
 
         computed_blocks = manager.find_longest_cache_hit(
             block_hashes=block_hash_list,
@@ -62,11 +69,14 @@ def test_chunked_local_attention_possible_cached_prefix():
             kv_cache_group_ids=[0],
             block_pool=block_pool,
             kv_cache_spec=chunked_local_attention_spec,
-            use_eagle=False)[0]
+            use_eagle=False,
+        )[0]
         assert len(computed_blocks) == expect_length
 
-        assert all(block == block_pool.null_block
-                   for block in computed_blocks[:(expect_length - 1) // 2])
+        assert all(
+            block == block_pool.null_block
+            for block in computed_blocks[: (expect_length - 1) // 2]
+        )
 
     run_one_case([True], 0, 1)
     run_one_case([True], 1, 1)
@@ -112,12 +122,14 @@ def test_sliding_window_possible_cached_prefix():
         block_pool.cached_block_hash_to_block._cache.clear()
 
         # Mock the block pool with the cached blocks
-        for i, (block_hash,
-                is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
+        for i, (block_hash, is_cached) in enumerate(
+            zip(block_hash_list, block_is_cached)
+        ):
             if is_cached:
                 block_pool.cached_block_hash_to_block.insert(
                     make_block_hash_with_group_id(block_hash, 0),
-                    block_pool.blocks[i + 10])
+                    block_pool.blocks[i + 10],
+                )
 
         computed_blocks = manager.find_longest_cache_hit(
             block_hashes=block_hash_list,
@@ -125,16 +137,18 @@ def test_sliding_window_possible_cached_prefix():
             kv_cache_group_ids=[0],
             block_pool=block_pool,
             kv_cache_spec=sliding_window_spec,
-            use_eagle=False)[0]
+            use_eagle=False,
+        )[0]
         assert len(computed_blocks) == expect_length
 
-        assert all(block == block_pool.null_block
-                   for block in computed_blocks[:expect_length - 2])
+        assert all(
+            block == block_pool.null_block
+            for block in computed_blocks[: expect_length - 2]
+        )
         for i in range(2):
             if i < expect_length:
                 block_index = expect_length - i - 1
-                assert computed_blocks[
-                    block_index].block_id == block_index + 10
+                assert computed_blocks[block_index].block_id == block_index + 10
 
     run_one_case([False] * 10, 0)
     run_one_case([True], 1)
@@ -143,17 +157,16 @@ def test_sliding_window_possible_cached_prefix():
     run_one_case([True, True, False], 2)
     run_one_case([True, True, True], 3)
     run_one_case([True, True, True, False], 3)
-    run_one_case([
-        True, True, False, True, False, False, True, True, False, True, True,
-        True
-    ], 12)
-    run_one_case([
-        True, True, False, True, False, False, True, True, False, False, False
-    ], 8)
-    run_one_case([
-        True, True, False, True, False, False, True, True, False, False, False,
-        True
-    ], 8)
+    run_one_case(
+        [True, True, False, True, False, False, True, True, False, True, True, True], 12
+    )
+    run_one_case(
+        [True, True, False, True, False, False, True, True, False, False, False], 8
+    )
+    run_one_case(
+        [True, True, False, True, False, False, True, True, False, False, False, True],
+        8,
+    )
 
 
 def test_chunked_local_attention_remove_skipped_blocks():
@@ -173,8 +186,8 @@ def test_chunked_local_attention_remove_skipped_blocks():
 
     def id_to_block_table(ids) -> list[KVCacheBlock]:
         return [
-            KVCacheBlock(id_)
-            if id_ != null_block_id else block_pool.null_block for id_ in ids
+            KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
+            for id_ in ids
         ]
 
     def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
@@ -185,7 +198,17 @@ def test_chunked_local_attention_remove_skipped_blocks():
                 assert block.block_id == id_
 
     original_block_ids = [
-        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
+        1000,
+        1001,
+        1002,
+        1003,
+        1004,
+        1005,
+        1006,
+        1007,
+        1008,
+        1009,
+        1010,
     ]
     block_table = id_to_block_table(original_block_ids)
     manager.req_to_blocks["test"] = block_table
@@ -224,8 +247,8 @@ def test_sliding_window_remove_skipped_blocks():
 
     def id_to_block_table(ids) -> list[KVCacheBlock]:
         return [
-            KVCacheBlock(id_)
-            if id_ != null_block_id else block_pool.null_block for id_ in ids
+            KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
+            for id_ in ids
         ]
 
     def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
@@ -236,7 +259,17 @@ def test_sliding_window_remove_skipped_blocks():
                 assert block.block_id == id_
 
     original_block_ids = [
-        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
+        1000,
+        1001,
+        1002,
+        1003,
+        1004,
+        1005,
+        1006,
+        1007,
+        1008,
+        1009,
+        1010,
     ]
     block_table = id_to_block_table(original_block_ids)
     manager.req_to_blocks["test"] = block_table
@@ -286,13 +319,16 @@ def test_get_num_blocks_to_allocate():
     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
     manager = get_sliding_window_manager(sliding_window_spec, block_pool)
     cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
-    cached_blocks_2 = [block_pool.null_block for _ in range(5)
-                       ] + [KVCacheBlock(i + 1) for i in range(5)]
+    cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
+        KVCacheBlock(i + 1) for i in range(5)
+    ]
 
-    assert manager.get_num_blocks_to_allocate("1", 20 * block_size,
-                                              cached_blocks_1) == 20
-    assert manager.get_num_blocks_to_allocate("2", 20 * block_size,
-                                              cached_blocks_2) == 15
+    assert (
+        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
+    )
+    assert (
+        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
+    )
 
 
 def test_chunked_local_attention_get_num_blocks_to_allocate():
@@ -308,10 +344,13 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
     manager = get_chunked_local_attention_manager(attention_spec, block_pool)
     cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
-    cached_blocks_2 = [block_pool.null_block for _ in range(5)
-                       ] + [KVCacheBlock(i + 1) for i in range(5)]
+    cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
+        KVCacheBlock(i + 1) for i in range(5)
+    ]
 
-    assert manager.get_num_blocks_to_allocate("1", 20 * block_size,
-                                              cached_blocks_1) == 20
-    assert manager.get_num_blocks_to_allocate("2", 20 * block_size,
-                                              cached_blocks_2) == 15
+    assert (
+        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
+    )
+    assert (
+        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
+    )
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index d343141cdf4cb9e2fd1013435ffce4f4cd85bff6..65511c17473b223c9ecd21dcce955bc6df6a7c05 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -1,43 +1,61 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
 
 import torch
 
-from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
-                         SchedulerConfig, SpeculativeConfig, VllmConfig)
-from vllm.multimodal.inputs import (MultiModalFeatureSpec,
-                                    MultiModalKwargsItem, PlaceholderRange)
+from tests.v1.kv_connector.unit.utils import MockKVConfig
+from vllm.config import (
+    CacheConfig,
+    ECTransferConfig,
+    KVTransferConfig,
+    ModelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256
-from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
-                                         init_none_hash)
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
 from vllm.v1.core.sched.async_scheduler import AsyncScheduler
 from vllm.v1.core.sched.scheduler import Scheduler
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec)
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+)
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
 
 EOS_TOKEN_ID = 50256
 
 
+def mock_kv(matched_tokens: int, is_async: bool):
+    return MockKVConfig(matched_tokens=matched_tokens, is_async=is_async)
+
+
 def create_scheduler(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: Optional[bool] = None,
+    enable_prefix_caching: bool | None = None,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
-    use_kv_connector: bool = False,
+    use_kv_connector: None | bool | MockKVConfig = None,
     num_blocks: int = 10000,
     block_size: int = 16,
-    max_model_len: Optional[int] = None,
-    num_speculative_tokens: Optional[int] = None,
+    max_model_len: int | None = None,
+    num_speculative_tokens: int | None = None,
     skip_tokenizer_init: bool = False,
     async_scheduling: bool = False,
-) -> Union[Scheduler, AsyncScheduler]:
-    '''Create scheduler under test.
+    use_ec_connector: bool = False,
+    ec_role: str | None = None,
+) -> Scheduler | AsyncScheduler:
+    """Create scheduler under test.
 
     Args:
       model: model under test
@@ -49,7 +67,7 @@ def create_scheduler(
 
     Returns:
       {class}`Scheduler` instance
-    '''
+    """
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -69,9 +87,11 @@ def create_scheduler(
         skip_tokenizer_init=skip_tokenizer_init,
     )
     # Cache config, optionally force APC
-    kwargs_cache = ({} if enable_prefix_caching is None else {
-        'enable_prefix_caching': enable_prefix_caching
-    })
+    kwargs_cache = (
+        {}
+        if enable_prefix_caching is None
+        else {"enable_prefix_caching": enable_prefix_caching}
+    )
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
@@ -79,16 +99,38 @@ def create_scheduler(
         cache_dtype="auto",
         **kwargs_cache,
     )
-    kv_transfer_config = KVTransferConfig(
-        kv_connector="SharedStorageConnector",
-        kv_role="kv_both",
-        kv_connector_extra_config={"shared_storage_path": "local_storage"},
-    ) if use_kv_connector else None
+    kv_transfer_config = None
+    if isinstance(use_kv_connector, MockKVConfig):
+        kv_transfer_config = KVTransferConfig(
+            kv_connector="MockKVConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={
+                "matched_tokens": use_kv_connector.matched_tokens,
+                "is_async": use_kv_connector.is_async,
+            },
+        )
+    elif use_kv_connector:
+        kv_transfer_config = KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        )
 
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig | None = None
     if num_speculative_tokens is not None:
         speculative_config = SpeculativeConfig(
-            model="ngram", num_speculative_tokens=num_speculative_tokens)
+            model="ngram", num_speculative_tokens=num_speculative_tokens
+        )
+
+    ec_transfer_config = (
+        ECTransferConfig(
+            ec_connector="ECSharedStorageConnector",
+            ec_role=ec_role,
+            ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"},
+        )
+        if use_ec_connector
+        else None
+    )
 
     vllm_config = VllmConfig(
         scheduler_config=scheduler_config,
@@ -96,14 +138,15 @@ def create_scheduler(
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         speculative_config=speculative_config,
+        ec_transfer_config=ec_transfer_config,
     )
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
         kv_cache_tensors=[],
         kv_cache_groups=[
-            KVCacheGroupSpec(['layer'],
-                             FullAttentionSpec(block_size, 1, 1, torch.float32,
-                                               False))
+            KVCacheGroupSpec(
+                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+            )
         ],
     )
     cache_config.num_gpu_blocks = num_blocks
@@ -111,6 +154,7 @@ def create_scheduler(
     return scheduler_cls(
         vllm_config=vllm_config,
         kv_cache_config=kv_cache_config,
+        block_size=block_size,
         log_stats=True,
         structured_output_manager=StructuredOutputManager(vllm_config),
     )
@@ -122,12 +166,14 @@ _none_hash_initialized = False
 def create_requests(
     num_requests: int,
     num_tokens: int = 10,
-    mm_positions: Optional[list[list[PlaceholderRange]]] = None,
+    mm_hashes_list: list[list[str]] | None = None,
+    mm_positions: list[list[PlaceholderRange]] | None = None,
     max_tokens: int = 16,
-    stop_token_ids: Optional[list[int]] = None,
-    prompt_logprobs: Optional[int] = None,
+    stop_token_ids: list[int] | None = None,
+    prompt_logprobs: int | None = None,
     same_prompt: bool = False,
     block_size: int = 16,
+    req_ids: list[str] | None = None,
 ) -> list[Request]:
     global _none_hash_initialized
     if not _none_hash_initialized:
@@ -135,30 +181,65 @@ def create_requests(
         _none_hash_initialized = True
 
     block_hasher = get_request_block_hasher(block_size, sha256)
-    sampling_params = SamplingParams(ignore_eos=False,
-                                     max_tokens=max_tokens,
-                                     stop_token_ids=stop_token_ids,
-                                     prompt_logprobs=prompt_logprobs)
+    sampling_params = SamplingParams(
+        ignore_eos=False,
+        max_tokens=max_tokens,
+        stop_token_ids=stop_token_ids,
+        prompt_logprobs=prompt_logprobs,
+    )
     requests = []
+
+    if mm_hashes_list is not None:
+        # NOTE: allow manual input; some mm items can have the same identifier
+        # no. of mm_hashes and mm_positions for each request should be identical
+        assert mm_positions is not None, (
+            "mm_positions must be provided when mm_hashes_list is provided"
+        )
+        assert len(mm_hashes_list) == len(mm_positions) == num_requests
+        assert [len(h) for h in mm_hashes_list] == [len(p) for p in mm_positions]
+
+        # Since same identifier would imply they are identical encoder output
+        # Verify mm items with identical identifier are having mm_position.length
+        seen_hashes: dict[str, int] = {}
+
+    if req_ids:
+        assert len(req_ids) == num_requests
+    else:
+        req_ids = [f"{i}" for i in range(num_requests)]
+
     for i in range(num_requests):
         mm_features = []
-        if mm_positions is not None:
-            mm_position = mm_positions[i]
-            for j, position in enumerate(mm_position):
-                # Dummy hash for each mm item should be unique
-                # since encoder cache tracks entries by hash
+
+        for j, position in enumerate(
+            mm_positions[i] if mm_positions is not None else []
+        ):
+            if mm_hashes_list is not None:
+                identifier = mm_hashes_list[i][j]
+
+                # Verify if position length is identical
+                position_length = position.length
+                if identifier in seen_hashes:
+                    assert seen_hashes[identifier] == position_length, (
+                        f"mm_hash '{identifier}' has inconsistent position lengths: "
+                        f"previously {seen_hashes[identifier]}, now {position_length} "
+                        f"at request {i}, position {j}"
+                    )
+                else:
+                    seen_hashes[identifier] = position_length
+            else:
+                # Unique dummy hash for each mm item
                 identifier = f"hash{i}_{j}"
-                mm_feature = MultiModalFeatureSpec(
-                    data=MultiModalKwargsItem.dummy("dummy_m"),
-                    mm_position=position,
-                    identifier=identifier,
-                    modality="image")
-                mm_features.append(mm_feature)
-
-        prompt_token_ids = ([0] * num_tokens if same_prompt else [i] *
-                            num_tokens)
+            mm_feature = MultiModalFeatureSpec(
+                data=MultiModalKwargsItem.dummy("dummy_m"),
+                mm_position=position,
+                identifier=identifier,
+                modality="image",
+            )
+            mm_features.append(mm_feature)
+
+        prompt_token_ids = [0] * num_tokens if same_prompt else [i] * num_tokens
         request = Request(
-            request_id=f"{i}",
+            request_id=req_ids[i],
             prompt_token_ids=prompt_token_ids,
             sampling_params=sampling_params,
             pooling_params=None,
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index 64f2fa462802f1fb4d23a7600988f40d545e671c..bb953e5c70c8caaa51bc66e512f1a49e5e5cf7bd 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -9,8 +9,14 @@ import torch.nn as nn
 from tests.utils import create_new_process_for_each_test
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    CUDAGraphMode,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.platforms import current_platform
 from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
@@ -18,7 +24,6 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 
 # Helper MLP for testing
 class SimpleMLP(nn.Module):
-
     def __init__(self):
         super().__init__()
         self.fc1 = nn.Linear(10, 10)
@@ -28,81 +33,89 @@ class SimpleMLP(nn.Module):
         return self.fc2(self.fc1(x))
 
 
-def _create_vllm_config(compilation_config: CompilationConfig,
-                        max_num_seqs: int = 8) -> MagicMock:
+def _create_vllm_config(
+    compilation_config: CompilationConfig,
+    max_num_seqs: int = 8,
+    lora_config: bool = False,
+) -> MagicMock:
     mock_config = MagicMock(spec=VllmConfig)
     mock_config.compilation_config = compilation_config
     mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs)
     mock_config.parallel_config = ParallelConfig()
-
+    if not lora_config:
+        mock_config.lora_config = None
     # Mimic the behavior of VllmConfig.__post_init__()
-    if compilation_config.level == CompilationLevel.PIECEWISE:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
         compilation_config.set_splitting_ops_for_v1()
 
     return mock_config
 
 
 class TestCudagraphDispatcher:
-
     @pytest.mark.parametrize(
-        "params",
+        "cudagraph_mode_str,compilation_mode,lora_config",
         [
             # Test case 0: Full CG for mixed batches, no separate routine
-            {
-                "case_id": 0,
-                "cudagraph_mode": "FULL",
-                "compilation_level": CompilationLevel.NO_COMPILATION,
-            },
+            ("FULL", CompilationMode.NONE, False),
             # Test case 1: Full CG for uniform batches, piecewise for mixed
-            {
-                "case_id": 1,
-                "cudagraph_mode": "FULL_AND_PIECEWISE",
-                "compilation_level": CompilationLevel.PIECEWISE,
-            },
+            ("FULL_AND_PIECEWISE", CompilationMode.NONE, False),
             # Test case 2: Full CG for uniform batches, no CG for mixed
-            {
-                "case_id": 2,
-                "cudagraph_mode": "FULL_DECODE_ONLY",
-                "compilation_level": CompilationLevel.NO_COMPILATION,
-            },
-            # Test case 3: Piecewise for all
-            {
-                "case_id": 3,
-                "cudagraph_mode": "PIECEWISE",
-                "compilation_level": CompilationLevel.PIECEWISE,
-            },
-        ])
-    def test_dispatcher(self, params):
+            ("FULL_DECODE_ONLY", CompilationMode.NONE, False),
+            # Test case 3: PIECEWISE for all
+            ("PIECEWISE", CompilationMode.VLLM_COMPILE, False),
+            # Test case 4: PIECEWISE for all, specialize LoRA cases
+            ("PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+        ],
+    )
+    def test_dispatcher(self, cudagraph_mode_str, compilation_mode, lora_config):
         # Setup dispatcher
         comp_config = CompilationConfig(
-            cudagraph_mode=params["cudagraph_mode"],
-            level=params["compilation_level"],
-            cudagraph_capture_sizes=[1, 8])
+            cudagraph_mode=cudagraph_mode_str,
+            mode=compilation_mode,
+            cudagraph_capture_sizes=[1, 8],
+        )
+
+        config = _create_vllm_config(
+            comp_config, max_num_seqs=8, lora_config=lora_config
+        )
+        if (
+            cudagraph_mode_str == "FULL_AND_PIECEWISE"
+            and compilation_mode == CompilationMode.NONE
+        ):
+            with pytest.raises(AssertionError):
+                dispatcher = CudagraphDispatcher(config)
+            return
 
-        config = _create_vllm_config(comp_config, max_num_seqs=8)
         dispatcher = CudagraphDispatcher(config)
         dispatcher.initialize_cudagraph_keys(
-            cudagraph_mode=comp_config.cudagraph_mode,
-            uniform_decode_query_len=1)
+            cudagraph_mode=comp_config.cudagraph_mode, uniform_decode_query_len=1
+        )
 
         # Verify the key is initialized correctly
-        if params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
-            assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 2
+        if cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
+            assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == (
+                4 if lora_config else 2
+            )
         else:
             assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 0
-        if params["cudagraph_mode"] not in ["NONE", "PIECEWISE"]:
-            assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 2
+        if cudagraph_mode_str not in ["NONE", "PIECEWISE"]:
+            assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == (
+                4 if lora_config else 2
+            )
         else:
             assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 0
 
         # Test dispatch logic
         # 1. non-uniform batch, size in cudagraph size list
-        desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False)
+        desc_full_exact = BatchDescriptor(
+            num_tokens=8,
+            uniform_decode=False,
+        )
         rt_mode, key = dispatcher.dispatch(desc_full_exact)
-        if params["cudagraph_mode"] == "FULL":
+        if cudagraph_mode_str == "FULL":
             assert rt_mode == CUDAGraphMode.FULL
             assert key == desc_full_exact
-        elif params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
+        elif cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
             assert rt_mode == CUDAGraphMode.PIECEWISE
             assert key == desc_full_exact
         else:
@@ -111,15 +124,13 @@ class TestCudagraphDispatcher:
         # 2. uniform decode batch, size in cudagraph size list
         desc_uniform_exact = BatchDescriptor(num_tokens=8, uniform_decode=True)
         rt_mode, key = dispatcher.dispatch(desc_uniform_exact)
-        if params["cudagraph_mode"] == "FULL":
+        if cudagraph_mode_str == "FULL":
             assert rt_mode == CUDAGraphMode.FULL
             assert key == desc_uniform_exact.non_uniform
-        elif params["cudagraph_mode"] in [
-                "FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"
-        ]:
+        elif cudagraph_mode_str in ["FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"]:
             assert rt_mode == CUDAGraphMode.FULL
             assert key == desc_uniform_exact
-        elif params["cudagraph_mode"] == "PIECEWISE":
+        elif cudagraph_mode_str == "PIECEWISE":
             assert rt_mode == CUDAGraphMode.PIECEWISE
             assert key == desc_uniform_exact.non_uniform
         else:
@@ -131,38 +142,49 @@ class TestCudagraphDispatcher:
         assert rt_mode == CUDAGraphMode.NONE
         assert key is None
 
+        # 4. Cascade attention should have a fall back mode
+        desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False)
+        rt_mode, key = dispatcher.dispatch(desc_full_exact, use_cascade_attn=True)
+        if "PIECEWISE" in cudagraph_mode_str:  # string contains check
+            assert rt_mode == CUDAGraphMode.PIECEWISE
+            assert key == desc_full_exact.non_uniform
+        else:
+            assert rt_mode == CUDAGraphMode.NONE
+
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 class TestCUDAGraphWrapper:
-
     def setup_method(self):
         self.vllm_config = _create_vllm_config(CompilationConfig())
         self.model = SimpleMLP().to("cuda")
         self.persistent_input_buffer = torch.zeros(1, 10, device="cuda")
         self.input_tensor = torch.randn(1, 10, device="cuda")
 
-    @create_new_process_for_each_test("spawn")
     def test_capture_and_replay(self):
-        wrapper = CUDAGraphWrapper(self.model,
-                                   self.vllm_config,
-                                   runtime_mode=CUDAGraphMode.FULL)
+        wrapper = CUDAGraphWrapper(
+            self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
+        )
         batch_descriptor = BatchDescriptor(num_tokens=10)
 
         # 0. global warmup
-        with set_forward_context(attn_metadata=None,
-                                 vllm_config=self.vllm_config,
-                                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
-                                 batch_descriptor=None):
+        with set_forward_context(
+            attn_metadata=None,
+            vllm_config=self.vllm_config,
+            cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            batch_descriptor=None,
+        ):
             wrapper(self.input_tensor)
 
         # 1. Capture
-        with set_forward_context(
+        with (
+            set_forward_context(
                 attn_metadata=None,
                 vllm_config=self.vllm_config,
                 cudagraph_runtime_mode=CUDAGraphMode.FULL,
-                batch_descriptor=batch_descriptor),\
-            patch("torch.cuda.graph",
-                       wraps=torch.cuda.graph) as mock_cuda_graph:
+                batch_descriptor=batch_descriptor,
+            ),
+            patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_cuda_graph,
+        ):
             output1 = wrapper(self.input_tensor)
             # capturing phase should generate a zero output
             assert torch.allclose(output1, torch.zeros_like(output1))
@@ -173,13 +195,17 @@ class TestCUDAGraphWrapper:
         assert entry.cudagraph is not None
 
         # 2. Replay
-        with set_forward_context(
+        with (
+            set_forward_context(
                 attn_metadata=None,
                 vllm_config=self.vllm_config,
                 cudagraph_runtime_mode=CUDAGraphMode.FULL,
-                batch_descriptor=batch_descriptor),\
-            patch.object(entry.cudagraph, 'replay',
-                         wraps=entry.cudagraph.replay) as mock_replay:
+                batch_descriptor=batch_descriptor,
+            ),
+            patch.object(
+                entry.cudagraph, "replay", wraps=entry.cudagraph.replay
+            ) as mock_replay,
+        ):
             output2 = wrapper(self.input_tensor)
             mock_replay.assert_called_once()
 
@@ -187,41 +213,44 @@ class TestCUDAGraphWrapper:
         eager_output = self.model(self.input_tensor)
         torch.testing.assert_close(eager_output, output2)
 
-    @create_new_process_for_each_test("spawn")
     def test_bypass_on_mode_mismatch(self):
-        wrapper = CUDAGraphWrapper(self.model,
-                                   self.vllm_config,
-                                   runtime_mode=CUDAGraphMode.FULL)
+        wrapper = CUDAGraphWrapper(
+            self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
+        )
         batch_descriptor = BatchDescriptor(num_tokens=10)
 
-        with set_forward_context(
+        with (
+            set_forward_context(
                 attn_metadata=None,
                 vllm_config=self.vllm_config,
                 cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-                batch_descriptor=batch_descriptor), \
-            patch('torch.cuda.graph',
-                  wraps=torch.cuda.graph) as mock_cuda_graph, \
-            patch.object(self.model, 'forward',
-                         wraps=self.model.forward) as mock_forward:
+                batch_descriptor=batch_descriptor,
+            ),
+            patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_cuda_graph,
+            patch.object(
+                self.model, "forward", wraps=self.model.forward
+            ) as mock_forward,
+        ):
             wrapper(self.input_tensor)
             mock_cuda_graph.assert_not_called()
             mock_forward.assert_called_once()
         assert not wrapper.concrete_cudagraph_entries
 
-    @create_new_process_for_each_test("spawn")
     def test_bypass_on_mode_none(self):
-        wrapper = CUDAGraphWrapper(self.model,
-                                   self.vllm_config,
-                                   runtime_mode=CUDAGraphMode.FULL)
+        wrapper = CUDAGraphWrapper(
+            self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
+        )
         batch_descriptor = BatchDescriptor(num_tokens=10)
 
-        with set_forward_context(
+        with (
+            set_forward_context(
                 attn_metadata=None,
                 vllm_config=self.vllm_config,
                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
-                batch_descriptor=batch_descriptor), \
-            patch('torch.cuda.graph',
-                  wraps=torch.cuda.graph) as mock_cuda_graph:
+                batch_descriptor=batch_descriptor,
+            ),
+            patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_cuda_graph,
+        ):
             wrapper(self.input_tensor)
             mock_cuda_graph.assert_not_called()
         assert not wrapper.concrete_cudagraph_entries
@@ -229,38 +258,44 @@ class TestCUDAGraphWrapper:
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 class TestCudagraphIntegration:
-
     def setup_method(self):
         # only FULL mode for non-uniform batches
-        self.comp_config = CompilationConfig(level=CompilationLevel.PIECEWISE,
-                                             cudagraph_mode="FULL",
-                                             cudagraph_capture_sizes=[10, 20])
+        self.comp_config = CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            cudagraph_mode="FULL",
+            cudagraph_capture_sizes=[10, 20],
+        )
         self.vllm_config = _create_vllm_config(self.comp_config)
         self.dispatcher = CudagraphDispatcher(self.vllm_config)
         self.dispatcher.initialize_cudagraph_keys(
-            self.comp_config.cudagraph_mode, uniform_decode_query_len=1)
+            self.comp_config.cudagraph_mode, uniform_decode_query_len=1
+        )
 
-    def _run_and_monitor_call(self, wrapper, input_tensor, runtime_mode,
-                              batch_descriptor):
+    def _run_and_monitor_call(
+        self, wrapper, input_tensor, runtime_mode, batch_descriptor
+    ):
         """Helper to run a single call and monitor the action."""
 
-        with patch('torch.cuda.graph',
-                wraps=torch.cuda.graph) as mock_graph_context, \
-            patch.object(wrapper, 'runnable',
-                        wraps=wrapper.runnable) as mock_runnable:
-
-            entry = wrapper.concrete_cudagraph_entries.get(
-                batch_descriptor, None)
+        with (
+            patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_graph_context,
+            patch.object(wrapper, "runnable", wraps=wrapper.runnable) as mock_runnable,
+        ):
+            entry = wrapper.concrete_cudagraph_entries.get(batch_descriptor, None)
 
-            context = set_forward_context(attn_metadata=None,
-                                          vllm_config=self.vllm_config,
-                                          cudagraph_runtime_mode=runtime_mode,
-                                          batch_descriptor=batch_descriptor)
+            context = set_forward_context(
+                attn_metadata=None,
+                vllm_config=self.vllm_config,
+                cudagraph_runtime_mode=runtime_mode,
+                batch_descriptor=batch_descriptor,
+            )
             mock_replay = MagicMock()
             if entry and entry.cudagraph:
-                with context, \
-                    patch.object(entry.cudagraph, 'replay',
-                                new_callable=MagicMock) as mock_replay:
+                with (
+                    context,
+                    patch.object(
+                        entry.cudagraph, "replay", new_callable=MagicMock
+                    ) as mock_replay,
+                ):
                     wrapper(input_tensor)
             else:
                 with context:
@@ -281,8 +316,7 @@ class TestCudagraphIntegration:
     @create_new_process_for_each_test("spawn")
     def test_capture_replay_bypass_logic(self):
         model = SimpleMLP().to("cuda")
-        full_wrapper = CUDAGraphWrapper(model, self.vllm_config,
-                                        CUDAGraphMode.FULL)
+        full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL)
         max_bs = 16
         persistent_input_buffer = torch.zeros(max_bs, 10, device="cuda")
         input_1 = persistent_input_buffer[:1]
@@ -294,75 +328,79 @@ class TestCudagraphIntegration:
         desc_3_unseen = BatchDescriptor(num_tokens=3)
 
         # 0. global warmup
-        with set_forward_context(attn_metadata=None,
-                                 vllm_config=self.vllm_config,
-                                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
-                                 batch_descriptor=None):
+        with set_forward_context(
+            attn_metadata=None,
+            vllm_config=self.vllm_config,
+            cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            batch_descriptor=None,
+        ):
             full_wrapper(input_1)
 
         rt_mode, key = self.dispatcher.dispatch(desc_1)
         # 1. Capture first shape
-        action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode,
-                                            key)
+        action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode, key)
         assert action == "capture_global"
 
         # 2. Replay first shape
-        action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode,
-                                            key)
+        action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode, key)
         assert action == "replay"
 
         rt_mode, key = self.dispatcher.dispatch(desc_2)
         # 3. Capture second shape
-        action = self._run_and_monitor_call(full_wrapper, input_2, rt_mode,
-                                            key)
+        action = self._run_and_monitor_call(full_wrapper, input_2, rt_mode, key)
         assert action == "capture_global"
 
         # 4. Replay second shape
-        action = self._run_and_monitor_call(full_wrapper, input_2,
-                                            CUDAGraphMode.FULL, desc_2)
+        action = self._run_and_monitor_call(
+            full_wrapper, input_2, CUDAGraphMode.FULL, desc_2
+        )
         assert action == "replay"
 
         # 5. Bypass if no key match
         rt_mode, key = self.dispatcher.dispatch(desc_3_unseen)
         assert rt_mode == CUDAGraphMode.NONE
-        action = self._run_and_monitor_call(full_wrapper, input_3, rt_mode,
-                                            key)
+        action = self._run_and_monitor_call(full_wrapper, input_3, rt_mode, key)
         assert action == "bypass"
 
         # capture unseen shape is not allowed after disable
         set_cudagraph_capturing_enabled(False)
         with pytest.raises(RuntimeError):
-            self._run_and_monitor_call(full_wrapper, input_3,
-                                       CUDAGraphMode.FULL, desc_3_unseen)
+            self._run_and_monitor_call(
+                full_wrapper, input_3, CUDAGraphMode.FULL, desc_3_unseen
+            )
         set_cudagraph_capturing_enabled(True)
 
     @create_new_process_for_each_test("spawn")
     def test_nested_wrappers(self):
         """Tests a scenario with a PIECEWISE wrapper inside a FULL one."""
         model = SimpleMLP().to("cuda")
-        full_wrapper = CUDAGraphWrapper(model, self.vllm_config,
-                                        CUDAGraphMode.FULL)
+        full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL)
         input_1 = torch.randn(1, 10, device="cuda")
 
         # Setup: Inner model is wrapped with PIECEWISE, outer with FULL
         inner_model = SimpleMLP().to("cuda")
-        piecewise_wrapper = CUDAGraphWrapper(inner_model, self.vllm_config,
-                                             CUDAGraphMode.PIECEWISE)
+        piecewise_wrapper = CUDAGraphWrapper(
+            inner_model, self.vllm_config, CUDAGraphMode.PIECEWISE
+        )
         inner_model.forward = MagicMock(wraps=inner_model.forward)
         outer_model = SimpleMLP().to("cuda")
         # When outer model is called, it calls the piecewise_wrapper
-        outer_model.forward = MagicMock(wraps=outer_model.forward,
-                                        side_effect=piecewise_wrapper)
-        full_wrapper = CUDAGraphWrapper(outer_model, self.vllm_config,
-                                        CUDAGraphMode.FULL)
+        outer_model.forward = MagicMock(
+            wraps=outer_model.forward, side_effect=piecewise_wrapper
+        )
+        full_wrapper = CUDAGraphWrapper(
+            outer_model, self.vllm_config, CUDAGraphMode.FULL
+        )
 
         desc_1 = BatchDescriptor(num_tokens=1)
 
         # 0. global warmup
-        with set_forward_context(attn_metadata=None,
-                                 vllm_config=self.vllm_config,
-                                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
-                                 batch_descriptor=None):
+        with set_forward_context(
+            attn_metadata=None,
+            vllm_config=self.vllm_config,
+            cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            batch_descriptor=None,
+        ):
             full_wrapper(input_1)
 
         # --- Test runtime mode FULL---
@@ -370,8 +408,9 @@ class TestCudagraphIntegration:
         # The inner mock should be called once inside the graph capture.
         outer_model.forward.reset_mock()
         inner_model.forward.reset_mock()
-        action = self._run_and_monitor_call(full_wrapper, input_1,
-                                            CUDAGraphMode.FULL, desc_1)
+        action = self._run_and_monitor_call(
+            full_wrapper, input_1, CUDAGraphMode.FULL, desc_1
+        )
         assert action == "capture_global"
         assert outer_model.forward.call_count == 1
         assert inner_model.forward.call_count == 1
@@ -379,8 +418,9 @@ class TestCudagraphIntegration:
         # Run again. Expect outer wrapper to replay.
         # The outer model should NOT be called because the whole graph
         # is replayed.
-        action = self._run_and_monitor_call(full_wrapper, input_1,
-                                            CUDAGraphMode.FULL, desc_1)
+        action = self._run_and_monitor_call(
+            full_wrapper, input_1, CUDAGraphMode.FULL, desc_1
+        )
         assert action == "replay"
         assert outer_model.forward.call_count == 1  # No new call
         assert inner_model.forward.call_count == 1
@@ -391,16 +431,18 @@ class TestCudagraphIntegration:
         # Run with PIECEWISE mode context.
         # Expect outer wrapper to bypass and call inner wrapper.
         # Inner wrapper should capture.
-        action = self._run_and_monitor_call(full_wrapper, input_1,
-                                            CUDAGraphMode.PIECEWISE, desc_1)
+        action = self._run_and_monitor_call(
+            full_wrapper, input_1, CUDAGraphMode.PIECEWISE, desc_1
+        )
         assert action == "capture_global"
         assert outer_model.forward.call_count == 1
         assert inner_model.forward.call_count == 1
 
         # Run again with PIECEWISE.
         # Outer bypasses, inner replays.
-        action = self._run_and_monitor_call(full_wrapper, input_1,
-                                            CUDAGraphMode.PIECEWISE, desc_1)
+        action = self._run_and_monitor_call(
+            full_wrapper, input_1, CUDAGraphMode.PIECEWISE, desc_1
+        )
         assert action == "bypass"
         assert outer_model.forward.call_count == 2
         assert inner_model.forward.call_count == 1
diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index 41a9493cbe585a9d05306caef00c5394379c225b..d6bde16eba36b5ff36b58fa34f36364cdc811b87 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -4,14 +4,13 @@ import contextlib
 import os
 import weakref
 from contextlib import ExitStack
-from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 
 from tests.utils import wait_for_gpu_memory_to_clear
+from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM
-from vllm.config import CompilationConfig
+from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform
 
 
@@ -34,74 +33,6 @@ def temporary_environ(env_vars):
                 os.environ[k] = v
 
 
-@dataclass
-class BackendConfig:
-    name: str
-    env_vars: dict
-    comp_config: dict
-    specific_gpu_arch: Optional[tuple] = None
-
-
-# Define all backend configurations of full cudagraph to be tested
-backend_configs = {
-    # FA3 on Hopper
-    "FA3":
-    BackendConfig(name="FA3",
-                  env_vars={
-                      "VLLM_FLASH_ATTN_VERSION": "3",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FlashMLA on Hopper
-    "FlashMLA":
-    BackendConfig(name="FlashMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FlashAttention MLA on Hopper
-    "FlashAttentionMLA":
-    BackendConfig(name="FlashAttentionMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_DECODE_ONLY",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FA2
-    "FA2":
-    BackendConfig(name="FA2",
-                  env_vars={
-                      "VLLM_FLASH_ATTN_VERSION": "2",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-    # Triton Attention
-    "TritonAttn":
-    BackendConfig(name="TritonAttn",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-    # FlashInfer
-    "FlashInfer":
-    BackendConfig(name="FlashInfer",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-}
-
 # test attention backend and cudagraph_mode combo
 # (backend_name, cudagraph_mode, supported)
 combo_cases_1 = [
@@ -114,9 +45,8 @@ combo_cases_1 = [
 ]
 
 
-@pytest.mark.parametrize("combo_case", combo_cases_1)
-def test_backend_and_cudagraph_mode_combo(combo_case):
-    backend_name, cudagraph_mode, supported = combo_case
+@pytest.mark.parametrize("backend_name, cudagraph_mode, supported", combo_cases_1)
+def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supported):
     if backend_name == "FlashInfer":
         try:
             import flashinfer  # noqa: F401
@@ -124,25 +54,30 @@ def test_backend_and_cudagraph_mode_combo(combo_case):
             pytest.skip("FlashInfer is not installed")
     backend_config = backend_configs[backend_name]
     # Dynamically skip test if GPU capability is not met
-    if backend_config.specific_gpu_arch and backend_config.specific_gpu_arch\
-        != current_platform.get_device_capability():
+    if (
+        backend_config.specific_gpu_arch
+        and backend_config.specific_gpu_arch != current_platform.get_device_capability()
+    ):
         pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
 
-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars
 
     with temporary_environ(env_vars), ExitStack() as stack:
         if not supported:
             stack.enter_context(pytest.raises(Exception))
 
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-                  max_num_seqs=256,
-                  trust_remote_code=True,
-                  gpu_memory_utilization=0.45,
-                  max_model_len=1024,
-                  compilation_config=CompilationConfig(
-                      level=3, cudagraph_mode=cudagraph_mode))
+        llm = LLM(
+            model="Qwen/Qwen2-1.5B-Instruct",
+            max_num_seqs=256,
+            trust_remote_code=True,
+            gpu_memory_utilization=0.45,
+            max_model_len=1024,
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
+            ),
+        )
         llm.generate(["Hello, my name is"] * 10)
-
+    # when above code raises, `llm` may be undefined, so we need to catch that
     try:
         llm = weakref.proxy(llm)
         del llm
@@ -155,43 +90,46 @@ def test_backend_and_cudagraph_mode_combo(combo_case):
     )
 
 
-# test cudagraph_mode with different compilation level.
-# (backend_name, cudagraph_mode, compilation_level, supported)
+# test cudagraph_mode with different compilation mode.
+# (backend_name, cudagraph_mode, compilation_mode, supported)
 combo_cases_2 = [
-    ("FA2", "FULL", 0, True),  # no compilation + full cudagraph
-    ("FA2", "FULL", 3, True),  # piecewise compilation + full cudagraph
-    ("FA2", "PIECEWISE", 0, False),  # no compilation + piecewise cudagraph
-    ("FA2", "PIECEWISE", 3,
-     True),  # piecewise compilation + piecewise cudagraph
-    ("FA2", "FULL_AND_PIECEWISE", 0,
-     False),  # piecewise cudagraph not supported without piecewise compilation
-    ("FA2", "FULL_AND_PIECEWISE", 3, True),
-    ("FA2", "FULL_DECODE_ONLY", 0, True),
-    ("FA2", "FULL_DECODE_ONLY", 3, True),
-    ("FA2", "NONE", 0, True),  # no compilation + no cudagraph
-    ("FA2", "NONE", 3, True),  # piecewise compilation + no cudagraph
+    ("FA2", "FULL", CompilationMode.NONE, True),
+    ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "PIECEWISE", CompilationMode.NONE, False),
+    ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
+    ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
+    ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "NONE", CompilationMode.NONE, True),
+    ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
 ]
 
 
-@pytest.mark.parametrize("combo_case", combo_cases_2)
-def test_cudagraph_compilation_combo(combo_case):
-    backend_name, cudagraph_mode, compilation_level, supported\
-        = combo_case
-
-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+@pytest.mark.parametrize(
+    "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2
+)
+def test_cudagraph_compilation_combo(
+    backend_name, cudagraph_mode, compilation_mode, supported
+):
+    env_vars = backend_configs[backend_name].env_vars
 
     with temporary_environ(env_vars), ExitStack() as stack:
         if not supported:
             stack.enter_context(pytest.raises(Exception))
 
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-                  max_num_seqs=256,
-                  trust_remote_code=True,
-                  gpu_memory_utilization=0.45,
-                  max_model_len=1024,
-                  compilation_config=CompilationConfig(
-                      level=compilation_level, cudagraph_mode=cudagraph_mode))
+        llm = LLM(
+            model="Qwen/Qwen2-1.5B-Instruct",
+            max_num_seqs=256,
+            trust_remote_code=True,
+            gpu_memory_utilization=0.45,
+            max_model_len=1024,
+            compilation_config=CompilationConfig(
+                mode=compilation_mode, cudagraph_mode=cudagraph_mode
+            ),
+        )
         llm.generate(["Hello, my name is"] * 10)
+    # when above code raises, `llm` may be undefined, so we need to catch that
     try:
         llm = weakref.proxy(llm)
         del llm
diff --git a/tests/v1/determinism/conftest.py b/tests/v1/determinism/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c2136e005849af02a891c0044055da94ee43ccf
--- /dev/null
+++ b/tests/v1/determinism/conftest.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch):
+    """Automatically enable batch invariant kernel overrides for all tests."""
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
+    yield
diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
new file mode 100644
index 0000000000000000000000000000000000000000..f018ee551dbfe397867474bb16603c1261df21b0
--- /dev/null
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -0,0 +1,929 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import os
+import random
+
+import pytest
+import torch
+from utils import _extract_step_logprobs, _random_prompt, skip_unsupported
+
+from vllm import LLM, SamplingParams
+
+
+@skip_unsupported
+@pytest.mark.timeout(1000)
+@pytest.mark.parametrize(
+    "backend",
+    ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+)
+def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
+    backend, monkeypatch: pytest.MonkeyPatch
+):
+    """
+    Ensures that the same request (the 'needle' prompt) yields identical output
+    whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
+    using the high-level v1 LLM() API only (no manual batching).
+
+    Strategy:
+    - Create two LLM engines with identical config except max_num_seqs: 1 vs N.
+    - Compute a baseline output for the needle prompt with the bs=1 engine.
+    - For many trials, generate a batch (size N) where the needle appears at a
+      random position among random filler prompts using the bs=N engine.
+    - Track how many trials match vs mismatch, and report totals at the end.
+      The test fails if any mismatches occur, but we still dump pass/fail
+      counts.
+
+    Notes:
+    - Use seeded stochastic sampling with a fixed seed to test determinism.
+    - Outputs are intentionally longer and sampled at higher temperature/top_p
+      to produce a more random-sounding phrase, yet remain deterministic by
+      seed.
+    - Keep max_tokens and max_model_len bounded for speed and memory use.
+    """
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
+
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+    # Allow overrides from environment (useful for CI tuning)
+    # "facebook/opt-125m" is too small, doesn't reliably test determinism
+    model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
+    max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128"))
+    min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024"))
+    max_random_prompt = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
+    assert max_batch_size >= 2, "Batch size should be >= 2 to mix needle."
+
+    # Keep GPU memory usage low to avoid startup allocation failures.
+    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.4"))
+    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "5120"))
+
+    # Sampling parameters: longer outputs with a more random-sounding
+    # continuation,but still deterministic due to fixed seed.
+    temperature = float(os.getenv("VLLM_NEEDLE_TEMPERATURE", "0.0"))
+    top_p = float(os.getenv("VLLM_NEEDLE_TOP_P", "0.95"))
+    max_tokens = int(os.getenv("VLLM_NEEDLE_MAX_TOKENS", "128"))
+
+    sampling = SamplingParams(
+        temperature=temperature,
+        top_p=top_p,
+        max_tokens=max_tokens,
+        seed=20240919,
+    )
+
+    needle_prompt = "There once was a "
+
+    llm_bs1 = None
+    llm_bsN = None
+    try:
+        # Engine with bs=1 behavior
+        llm_bs1 = LLM_with_max_seqs(
+            model=model,
+            max_num_seqs=max_batch_size,
+            gpu_memory_utilization=gpu_mem_util,
+            max_model_len=max_model_len,
+        )
+
+        # Baseline generation for the needle prompt alone.
+        baseline_out = llm_bs1.generate([needle_prompt], sampling)
+        assert len(baseline_out) == 1
+        assert len(baseline_out[0].outputs) >= 1
+        baseline_text = baseline_out[0].outputs[0].text
+
+        # Engine with larger batch limit (e.g., 64)
+        llm_bsN = LLM_with_max_seqs(
+            model=model,
+            max_num_seqs=max_batch_size,
+            gpu_memory_utilization=gpu_mem_util,
+            max_model_len=max_model_len,
+        )
+
+        mismatches = 0
+
+        for trial in range(num_trials):
+            # Create a batch of size `max_batch_size` and insert the needle at
+            # a random index
+            prompts: list[str] = []
+            batch_size = random.randint(max_batch_size // 2, max_batch_size)
+            needle_pos = random.randint(0, batch_size - 1)
+            for i in range(batch_size):
+                if i == needle_pos:
+                    prompts.append(needle_prompt)
+                else:
+                    prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
+
+            # Generate with the larger-batch engine
+            outputs = llm_bsN.generate(prompts, sampling)
+            # Find the needle output by position
+            needle_output = outputs[needle_pos]
+            assert needle_output.prompt == needle_prompt
+            assert len(needle_output.outputs) >= 1
+            text = needle_output.outputs[0].text
+
+            if text != baseline_text:
+                print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
+                mismatches += 1
+
+        passes = num_trials - mismatches
+        # Dump how many passed vs failed
+        print(
+            f"[determinism] total={num_trials}, passed={passes}, "
+            f"failed={mismatches}, max_batch_size={max_batch_size}"
+        )
+
+        if mismatches > 0:
+            pytest.fail(
+                f"Nondeterministic outputs detected: {mismatches} failed out "
+                f"of {num_trials} trials (max_batch_size={max_batch_size})."
+            )
+
+    finally:
+        # Ensure engines are shutdown to free GPU/VRAM across test sessions
+        if llm_bs1 is not None:
+            with contextlib.suppress(Exception):
+                llm_bs1.shutdown()
+        if llm_bsN is not None:
+            with contextlib.suppress(Exception):
+                llm_bsN.shutdown()
+
+
+@skip_unsupported
+@pytest.mark.parametrize(
+    "backend",
+    ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+)
+@pytest.mark.forked
+def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
+    backend, monkeypatch: pytest.MonkeyPatch
+):
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
+    model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
+
+    # For batch invariance, disable custom all-reduce to ensure deterministic
+    # all-reduce operations (custom all-reduce may not be deterministic)
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
+
+    disable_custom_ar = vllm_is_batch_invariant()
+
+    if disable_custom_ar:
+        print(f"\n{'=' * 80}")
+        print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
+        print(f"{'=' * 80}\n")
+
+    llm = LLM(
+        model=model_name,
+        tensor_parallel_size=tp_size,
+        enable_prefix_caching=False,
+        max_num_seqs=32,
+        max_model_len=8192,
+        dtype="bfloat16",  # not everything is supported
+    )
+
+    # Use more realistic prompts for better token generation
+    prompts = [_random_prompt(10, 50) for i in range(32)]
+
+    sp = SamplingParams(
+        temperature=0.6,
+        top_p=1.0,
+        max_tokens=8,
+        seed=1234,
+        logprobs=5,
+    )
+
+    # BS=1: run prompts individually and collect logprobs per step.
+    print("\n" + "=" * 80)
+    print("STARTING BS=1 RUNS (each prompt individually)")
+    print("=" * 80 + "\n")
+
+    bs1_logprobs_per_prompt = []
+    bs1_tokens_per_prompt = []
+    for idx, p in enumerate(prompts):
+        print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
+        outs = llm.generate([p], sp, use_tqdm=False)
+        assert len(outs) == 1
+        step_logprobs, token_ids = _extract_step_logprobs(outs[0])
+        if step_logprobs is None:
+            pytest.skip(
+                "Logits are not available on RequestOutput; "
+                "enable logprobs return to run this test."
+            )
+        bs1_logprobs_per_prompt.append(step_logprobs)
+        bs1_tokens_per_prompt.append(token_ids)
+        print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
+
+    # BS=N: run prompts in a batch and collect logprobs per step for each
+    # prompt.
+    print("\n" + "=" * 80)
+    print(f"STARTING BS={len(prompts)} RUN (all prompts batched)")
+    print("=" * 80 + "\n")
+
+    outs_batched = llm.generate(prompts, sp, use_tqdm=False)
+    assert len(outs_batched) == len(prompts)
+    bsN_logprobs_per_prompt = []
+    bsN_tokens_per_prompt = []
+
+    print(f"\n[BS={len(prompts)}] Processing batched outputs...")
+    for idx, o in enumerate(outs_batched):
+        tokens = o.outputs[0].token_ids if o.outputs else "N/A"
+        print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
+        step_logprobs, token_ids = _extract_step_logprobs(o)
+        if step_logprobs is None:
+            pytest.skip(
+                "Logits are not available on RequestOutput; "
+                "enable logprobs return to run this test."
+            )
+        bsN_logprobs_per_prompt.append(step_logprobs)
+        bsN_tokens_per_prompt.append(token_ids)
+
+    # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
+    failed_prompts = []
+    for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
+        zip(
+            bs1_logprobs_per_prompt,
+            bsN_logprobs_per_prompt,
+            bs1_tokens_per_prompt,
+            bsN_tokens_per_prompt,
+        )
+    ):
+        if len(logprobs_bs1) != len(logprobs_bsN):
+            reason = (
+                f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
+                f"vs {len(logprobs_bsN)} (BS=N)"
+            )
+            failed_prompts.append(
+                {
+                    "prompt_idx": i,
+                    "step": "all",
+                    "reason": reason,
+                    "prompt_preview": prompts[i][:100],
+                    "bs1_tokens": tokens_bs1,
+                    "bsN_tokens": tokens_bsN,
+                }
+            )
+            continue
+
+        # Check if tokens match first
+        if tokens_bs1 != tokens_bsN:
+            failed_prompts.append(
+                {
+                    "prompt_idx": i,
+                    "step": "sampling",
+                    "reason": "Different tokens sampled",
+                    "prompt_preview": prompts[i][:100],
+                    "bs1_tokens": tokens_bs1,
+                    "bsN_tokens": tokens_bsN,
+                    "bs1_all_logprobs": [
+                        logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))
+                    ],
+                    "bsN_all_logprobs": [
+                        logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))
+                    ],
+                }
+            )
+            continue
+
+        for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
+            if a.shape != b.shape:
+                failed_prompts.append(
+                    {
+                        "prompt_idx": i,
+                        "step": t,
+                        "reason": f"Shape mismatch: {a.shape} vs {b.shape}",
+                        "prompt_preview": prompts[i][:100],
+                        "bs1_tokens": tokens_bs1,
+                        "bsN_tokens": tokens_bsN,
+                    }
+                )
+                break
+
+            if not torch.equal(a, b):
+                max_diff = torch.abs(a - b).max().item()
+                # Print which token failed
+                print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
+                bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
+                bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
+                print(f"  Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
+                print(f"  BS=1 logprob: {a.tolist()}")
+                print(f"  BS=N logprob: {b.tolist()}")
+                failed_prompts.append(
+                    {
+                        "prompt_idx": i,
+                        "step": t,
+                        "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
+                        "prompt_preview": prompts[i][:100],
+                        "bs1_tokens": tokens_bs1,
+                        "bsN_tokens": tokens_bsN,
+                        "bs1_all_logprobs": [
+                            logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))
+                        ],
+                        "bsN_all_logprobs": [
+                            logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))
+                        ],
+                    }
+                )
+                break
+
+    # Print summary of all failures
+    if failed_prompts:
+        print(f"\n{'=' * 80}")
+        fail_msg = (
+            f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/"
+            f"{len(prompts)} prompts failed"
+        )
+        print(fail_msg)
+        print(f"{'=' * 80}")
+        for fail in failed_prompts:
+            print(f"\nPrompt {fail['prompt_idx']} (step {fail['step']}):")
+            print(f"  Reason: {fail['reason']}")
+            print(f"  Preview: {fail['prompt_preview']}...")
+
+            # Always show the tokens
+            if "bs1_tokens" in fail:
+                print(f"  BS=1 tokens: {fail['bs1_tokens']}")
+            if "bsN_tokens" in fail:
+                print(f"  BS=N tokens: {fail['bsN_tokens']}")
+
+            if "bs1_all_logprobs" in fail:
+                print(f"  BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
+                for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
+                    print(f"    Step {step_idx}: {logprobs}")
+                print(f"  BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
+                for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
+                    print(f"    Step {step_idx}: {logprobs}")
+        print(f"{'=' * 80}\n")
+
+        # Fail the test with summary
+        msg = (
+            f"Batch invariance violated in {len(failed_prompts)}/"
+            f"{len(prompts)} prompts. See output above for details."
+        )
+        pytest.fail(msg)
+
+
+@skip_unsupported
+@pytest.mark.parametrize(
+    "backend",
+    ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+)
+def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
+    """
+    Simple test that runs the model with a basic prompt and prints the output.
+    Useful for quick smoke testing and debugging.
+    """
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+    model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+
+    llm = LLM(
+        model=model,
+        max_num_seqs=1,
+        tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
+        gpu_memory_utilization=0.9,
+        max_model_len=2048,
+        dtype="bfloat16",
+        enable_prefix_caching=False,
+    )
+
+    prompt = "the capital of france is"
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=20,
+    )
+
+    print(f"\n{'=' * 80}")
+    print("Running simple generation test")
+    print(f"Prompt: '{prompt}'")
+    print(f"{'=' * 80}\n")
+
+    try:
+        outputs = llm.generate([prompt], sampling_params)
+
+        assert len(outputs) == 1
+        output_text = outputs[0].outputs[0].text
+
+        print(f"Output: '{output_text}'")
+        print(f"\n{'=' * 80}")
+        print(f"Full completion: '{prompt}{output_text}'")
+        print(f"{'=' * 80}\n")
+
+    finally:
+        with contextlib.suppress(Exception):
+            llm.shutdown()
+
+
+@skip_unsupported
+@pytest.mark.parametrize(
+    "backend",
+    ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+)
+@pytest.mark.forked
+def test_logprobs_without_batch_invariance_should_fail(
+    backend, monkeypatch: pytest.MonkeyPatch
+):
+    """
+    This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
+    It DISABLES batch invariance mode and expects to see non-deterministic behavior
+    between BS=1 and BS=N runs. This demonstrates that batch invariance is actually
+    doing something useful.
+
+    The test will PASS if we detect differences (proving batch invariance matters).
+    The test will FAIL if everything matches (suggesting batch invariance isn't needed).
+    """
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+
+    # CRITICAL: Disable batch invariance for this test
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
+
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
+    model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
+
+    print(f"\n{'=' * 80}")
+    print("BATCH INVARIANCE DISABLED: Expecting non-deterministic behavior")
+    print(f"{'=' * 80}\n")
+
+    llm = LLM(
+        model=model_name,
+        tensor_parallel_size=tp_size,
+        enable_prefix_caching=False,
+        max_num_seqs=32,
+        max_model_len=8192,
+        dtype="bfloat16",
+    )
+
+    # build ragged prompts to change shapes significantly across BS=1 vs BS=N
+    long_min = int(os.getenv("VLLM_MIN_PROMPT", "768"))
+    long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
+    prompts: list[str] = []
+    options = [
+        (max(long_min, 1536), max(long_max, 3072)),  # very long
+        (max(1024, long_min), max(2048, long_max)),  # long
+        (256, 512),  # mid
+        (10, 20),  # short
+    ]
+
+    for _ in range(32):
+        lo, hi = random.choice(options)
+        prompts.append(_random_prompt(lo, hi))
+
+    sp = SamplingParams(
+        temperature=0.6,
+        top_p=1.0,
+        max_tokens=8,
+        seed=1234,
+        logprobs=5,
+    )
+
+    # BS=1: run prompts individually and collect logprobs per step.
+    print("\n" + "=" * 80)
+    print("STARTING BS=1 RUNS (each prompt individually)")
+    print("=" * 80 + "\n")
+
+    bs1_logprobs_per_prompt = []
+    bs1_tokens_per_prompt = []
+    for idx, p in enumerate(prompts):
+        print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
+        outs = llm.generate([p], sp, use_tqdm=False)
+        assert len(outs) == 1
+        step_logprobs, token_ids = _extract_step_logprobs(outs[0])
+        if step_logprobs is None:
+            pytest.skip(
+                "Logits are not available on RequestOutput; "
+                "enable logprobs return to run this test."
+            )
+        bs1_logprobs_per_prompt.append(step_logprobs)
+        bs1_tokens_per_prompt.append(token_ids)
+        print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
+
+    # BS=N: run prompts in a batch and collect logprobs per step for each prompt.
+    print("\n" + "=" * 80)
+    print(f"STARTING BS={len(prompts)} RUN (all prompts batched)")
+    print("=" * 80 + "\n")
+
+    outs_batched = llm.generate(prompts, sp, use_tqdm=False)
+    assert len(outs_batched) == len(prompts)
+    bsN_logprobs_per_prompt = []
+    bsN_tokens_per_prompt = []
+
+    print(f"\n[BS={len(prompts)}] Processing batched outputs...")
+    for idx, o in enumerate(outs_batched):
+        tokens = o.outputs[0].token_ids if o.outputs else "N/A"
+        print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
+        step_logprobs, token_ids = _extract_step_logprobs(o)
+        if step_logprobs is None:
+            pytest.skip(
+                "Logits are not available on RequestOutput; "
+                "enable logprobs return to run this test."
+            )
+        bsN_logprobs_per_prompt.append(step_logprobs)
+        bsN_tokens_per_prompt.append(token_ids)
+
+    # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
+    differences_found = []
+    for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
+        zip(
+            bs1_logprobs_per_prompt,
+            bsN_logprobs_per_prompt,
+            bs1_tokens_per_prompt,
+            bsN_tokens_per_prompt,
+        )
+    ):
+        if len(logprobs_bs1) != len(logprobs_bsN):
+            reason = (
+                f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
+                f"vs {len(logprobs_bsN)} (BS=N)"
+            )
+            differences_found.append(
+                {
+                    "prompt_idx": i,
+                    "step": "all",
+                    "reason": reason,
+                    "prompt_preview": prompts[i][:100],
+                    "bs1_tokens": tokens_bs1,
+                    "bsN_tokens": tokens_bsN,
+                }
+            )
+            continue
+
+        # Check if tokens match first
+        if tokens_bs1 != tokens_bsN:
+            differences_found.append(
+                {
+                    "prompt_idx": i,
+                    "step": "sampling",
+                    "reason": "Different tokens sampled",
+                    "prompt_preview": prompts[i][:100],
+                    "bs1_tokens": tokens_bs1,
+                    "bsN_tokens": tokens_bsN,
+                }
+            )
+            continue
+
+        for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
+            if a.shape != b.shape:
+                differences_found.append(
+                    {
+                        "prompt_idx": i,
+                        "step": t,
+                        "reason": f"Shape mismatch: {a.shape} vs {b.shape}",
+                        "prompt_preview": prompts[i][:100],
+                        "bs1_tokens": tokens_bs1,
+                        "bsN_tokens": tokens_bsN,
+                    }
+                )
+                break
+
+            if not torch.equal(a, b):
+                max_diff = torch.abs(a - b).max().item()
+                print(
+                    f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, "
+                    f"Token {t}: max_diff={max_diff:.6e}"
+                )
+                bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
+                bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
+                print(f"  Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
+                print(f"  BS=1 logprob: {a.tolist()}")
+                print(f"  BS=N logprob: {b.tolist()}")
+                differences_found.append(
+                    {
+                        "prompt_idx": i,
+                        "step": t,
+                        "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
+                        "prompt_preview": prompts[i][:100],
+                        "bs1_tokens": tokens_bs1,
+                        "bsN_tokens": tokens_bsN,
+                    }
+                )
+                break
+
+    # Print summary
+    print(f"\n{'=' * 80}")
+    if differences_found:
+        success_msg = (
+            f"✓ SUCCESS: Batch invariance is doing something! "
+            f"Found {len(differences_found)}/{len(prompts)} prompts "
+            f"with differences when batch invariance was DISABLED."
+        )
+        print(success_msg)
+        print(f"{'=' * 80}")
+        for diff in differences_found:
+            print(f"\nPrompt {diff['prompt_idx']} (step {diff['step']}):")
+            print(f"  Reason: {diff['reason']}")
+            print(f"  Preview: {diff['prompt_preview']}...")
+            if "bs1_tokens" in diff:
+                print(f"  BS=1 tokens: {diff['bs1_tokens']}")
+            if "bsN_tokens" in diff:
+                print(f"  BS=N tokens: {diff['bsN_tokens']}")
+        print(f"{'=' * 80}\n")
+        # Test PASSES because we found differences (batch invariance matters!)
+        return
+    else:
+        # Test FAILS because everything matched even without batch invariance
+        fail_msg = (
+            f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
+            f"between BS=1 and BS=N even with batch invariance DISABLED. "
+            f"This suggests batch invariance might not be necessary, "
+            f"or the test needs more sensitive prompts."
+        )
+        print(fail_msg)
+        print(f"{'=' * 80}\n")
+        pytest.fail(fail_msg)
+
+
+@skip_unsupported
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
+@pytest.mark.forked
+def test_decode_logprobs_match_prefill_logprobs(
+    backend, monkeypatch: pytest.MonkeyPatch
+):
+    """
+    Test that verifies decode logprobs match prefill logprobs.
+
+    For each decoded token at position i:
+    1. Run decode to generate N tokens and collect their logprobs
+    2. For each position i in [0, N):
+       - Take prefix = prompt + tokens[0:i]
+       - Run prefill(prefix + tokens[i]) to get logprob of tokens[i]
+       - Verify prefill logprob matches decode logprob bitwise
+
+    This ensures that the logprobs from decode are consistent with what
+    we would get if we ran prefill on each prefix.
+    """
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
+    model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
+
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
+
+    disable_custom_ar = vllm_is_batch_invariant()
+
+    if disable_custom_ar:
+        print(f"\n{'=' * 80}")
+        print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
+        print(f"{'=' * 80}\n")
+
+    llm = LLM(
+        model=model_name,
+        tensor_parallel_size=tp_size,
+        enable_prefix_caching=False,
+        max_num_seqs=32,
+        max_model_len=8192,
+        dtype="bfloat16",
+    )
+
+    # Use a few test prompts
+    num_test_prompts = int(os.getenv("VLLM_DECODE_PREFILL_NUM_PROMPTS", "4"))
+    prompts = [_random_prompt(10, 50) for _ in range(num_test_prompts)]
+
+    # Generate longer sequences to test multiple decode steps
+    max_tokens = int(os.getenv("VLLM_DECODE_PREFILL_MAX_TOKENS", "16"))
+
+    sp = SamplingParams(
+        temperature=0.0,  # Greedy for determinism
+        max_tokens=max_tokens,
+        logprobs=5,
+    )
+
+    print("\n" + "=" * 80)
+    print("STEP 1: Running decode to generate tokens and collect logprobs")
+    print("=" * 80 + "\n")
+
+    # Step 1: Run decode and collect logprobs
+    decode_outputs = llm.generate(prompts, sp, use_tqdm=False)
+
+    failed_comparisons = []
+
+    for prompt_idx, (prompt, decode_output) in enumerate(zip(prompts, decode_outputs)):
+        print(f"\n[Prompt {prompt_idx}] Testing: {prompt[:80]}...")
+
+        # Extract decode logprobs and tokens
+        decode_logprobs, token_ids = _extract_step_logprobs(decode_output)
+        if decode_logprobs is None:
+            pytest.skip(
+                "Logprobs are not available on RequestOutput; "
+                "enable logprobs return to run this test."
+            )
+
+        print(f"[Prompt {prompt_idx}] Generated {len(token_ids)} tokens: {token_ids}")
+        print(f"[Prompt {prompt_idx}] Decode logprobs: {decode_logprobs.tolist()}")
+
+        # Step 2: For each token position, run prefill and compare
+        print(f"\n[Prompt {prompt_idx}] Verifying each token via prefill...")
+
+        for token_idx in range(len(token_ids)):
+            # Construct the prefix up to (but not including) this token
+            current_token = token_ids[token_idx]
+
+            # We need to detokenize to get the text prefix
+            # For this, we'll use the tokenizer from the LLM
+            # However, the LLM API doesn't expose tokenizer easily, so we'll
+            # construct the prefix by decoding from the original prompt
+
+            # Get text up to this point by using the output text
+            # This is approximate but should work for verification
+            if token_idx == 0:
+                prefix_prompt = prompt
+            else:
+                # Use the partial output text up to this token
+                # We'll need to construct this from the full output
+                prefix_output = decode_output.outputs[0]
+                # Get the text for tokens 0 to token_idx-1
+                # Unfortunately, we don't have per-token text, so we'll use
+                # a different approach: run prefill with prompt + tokens[0:token_idx]
+
+                # Actually, we need to get the actual text. Let's use a workaround:
+                # Run a generation with max_tokens = token_idx to get that prefix
+                prefix_sp = SamplingParams(
+                    temperature=0.0,
+                    max_tokens=token_idx,
+                    logprobs=1,
+                )
+                prefix_output = llm.generate([prompt], prefix_sp, use_tqdm=False)[0]
+                prefix_prompt = prompt + prefix_output.outputs[0].text
+
+            # Now run prefill with max_tokens=1 to get the logprob of the next token
+            prefill_sp = SamplingParams(
+                temperature=0.0,
+                max_tokens=1,
+                logprobs=5,
+            )
+
+            print(
+                f"  [Token {token_idx}] Running prefill for prefix "
+                f"(len={len(prefix_prompt)})..."
+            )
+            prefill_output = llm.generate([prefix_prompt], prefill_sp, use_tqdm=False)[
+                0
+            ]
+            prefill_logprobs, prefill_token_ids = _extract_step_logprobs(prefill_output)
+
+            if prefill_logprobs is None:
+                print(f"  [Token {token_idx}] Warning: No prefill logprobs available")
+                continue
+
+            # The first token from prefill should match the current token
+            prefill_token = prefill_token_ids[0]
+            prefill_logprob = prefill_logprobs[0].item()
+            decode_logprob = decode_logprobs[token_idx].item()
+
+            print(
+                f"  [Token {token_idx}] Decode token: {current_token}, "
+                f"logprob: {decode_logprob:.8f}"
+            )
+            print(
+                f"  [Token {token_idx}] Prefill token: {prefill_token}, "
+                f"logprob: {prefill_logprob:.8f}"
+            )
+
+            # Check if tokens match
+            if current_token != prefill_token:
+                failed_comparisons.append(
+                    {
+                        "prompt_idx": prompt_idx,
+                        "token_idx": token_idx,
+                        "reason": "Token mismatch",
+                        "decode_token": current_token,
+                        "prefill_token": prefill_token,
+                        "decode_logprob": decode_logprob,
+                        "prefill_logprob": prefill_logprob,
+                        "prompt_text": prompt[:100],
+                        "prefix_text": prefix_prompt[:100],
+                    }
+                )
+                print(f"  [Token {token_idx}] ✗ TOKEN MISMATCH!")
+                continue
+
+            # Check if logprobs match bitwise
+            if decode_logprob != prefill_logprob:
+                diff = abs(decode_logprob - prefill_logprob)
+                failed_comparisons.append(
+                    {
+                        "prompt_idx": prompt_idx,
+                        "token_idx": token_idx,
+                        "reason": "Logprob mismatch",
+                        "decode_token": current_token,
+                        "prefill_token": prefill_token,
+                        "decode_logprob": decode_logprob,
+                        "prefill_logprob": prefill_logprob,
+                        "diff": diff,
+                        "prompt_text": prompt[:100],
+                        "prefix_text": prefix_prompt[:100],
+                        "decode_all_tokens": token_ids,
+                        "decode_all_logprobs": decode_logprobs.tolist(),
+                    }
+                )
+                print(f"  [Token {token_idx}] ✗ LOGPROB MISMATCH! diff={diff:.8e}")
+            else:
+                print(f"  [Token {token_idx}] ✓ Match (bitwise equal)")
+
+    # Print summary
+    print(f"\n{'=' * 80}")
+    if failed_comparisons:
+        print(f"DECODE-PREFILL MISMATCH: {len(failed_comparisons)} failures detected")
+        print(f"{'=' * 80}")
+
+        # Group failures by prompt for better readability
+        failures_by_prompt: dict[int, list[dict]] = {}
+        for fail in failed_comparisons:
+            pid = fail["prompt_idx"]
+            if pid not in failures_by_prompt:
+                failures_by_prompt[pid] = []
+            failures_by_prompt[pid].append(fail)
+
+        for prompt_idx, failures in failures_by_prompt.items():
+            print(f"\n{'=' * 80}")
+            print(f"PROMPT {prompt_idx}: {failures[0]['prompt_text']}...")
+            print(f"{'=' * 80}")
+            print(f"Total failures for this prompt: {len(failures)}")
+
+            # Show where mismatches occur (which token positions)
+            mismatch_positions = [f["token_idx"] for f in failures]
+            print(f"Mismatch at token positions: {mismatch_positions}")
+
+            # Show first few failures in detail
+            for i, fail in enumerate(failures[:5]):  # Show first 5 failures per prompt
+                print(f"\n  [Failure {i + 1}] Token position {fail['token_idx']}:")
+                print(f"    Reason: {fail['reason']}")
+                print(f"    Prefix text: '{fail['prefix_text']}...'")
+                print(
+                    f"    Decode:  token={fail['decode_token']}, "
+                    f"logprob={fail['decode_logprob']:.10f}"
+                )
+                print(
+                    f"    Prefill: token={fail['prefill_token']}, "
+                    f"logprob={fail['prefill_logprob']:.10f}"
+                )
+                if "diff" in fail:
+                    print(f"    Difference: {fail['diff']:.10e}")
+                    # Show in hex to see bitwise difference
+                    import struct
+
+                    decode_hex = struct.pack("f", fail["decode_logprob"]).hex()
+                    prefill_hex = struct.pack("f", fail["prefill_logprob"]).hex()
+                    print(f"    Decode logprob (hex):  0x{decode_hex}")
+                    print(f"    Prefill logprob (hex): 0x{prefill_hex}")
+
+                # If we have all tokens/logprobs, show the context
+                if "decode_all_tokens" in fail and "decode_all_logprobs" in fail:
+                    token_idx = fail["token_idx"]
+                    all_tokens = fail["decode_all_tokens"]
+                    all_logprobs = fail["decode_all_logprobs"]
+
+                    # Show context: 2 tokens before and after
+                    start = max(0, token_idx - 2)
+                    end = min(len(all_tokens), token_idx + 3)
+
+                    print(f"    Context (tokens {start} to {end - 1}):")
+                    for j in range(start, end):
+                        marker = " <-- MISMATCH" if j == token_idx else ""
+                        print(
+                            f"      [{j}] token={all_tokens[j]}, "
+                            f"logprob={all_logprobs[j]:.8f}{marker}"
+                        )
+
+            if len(failures) > 5:
+                print(f"\n  ... and {len(failures) - 5} more failures for this prompt")
+
+        print(f"\n{'=' * 80}\n")
+
+        pytest.fail(
+            f"Decode logprobs do not match prefill logprobs: "
+            f"{len(failed_comparisons)} mismatches found."
+        )
+    else:
+        print("✓ SUCCESS: All decode logprobs match prefill logprobs bitwise!")
+        print(f"{'=' * 80}\n")
+
+
+def LLM_with_max_seqs(
+    model: str,
+    max_num_seqs: int,
+    gpu_memory_utilization: float,
+    max_model_len: int,
+) -> LLM:
+    """
+    Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
+    using the high-level v1 LLM API, while constraining memory usage.
+    """
+    return LLM(
+        model=model,
+        max_num_seqs=max_num_seqs,
+        gpu_memory_utilization=gpu_memory_utilization,
+        max_model_len=max_model_len,
+        dtype="bfloat16",
+        tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
+        enable_prefix_caching=False,
+        # Enable for MOE models
+        # enable_expert_parallel=True,
+    )
diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py
new file mode 100644
index 0000000000000000000000000000000000000000..23f47863dd23fd7f3327db922b83e5fb38af8425
--- /dev/null
+++ b/tests/v1/determinism/test_online_batch_invariance.py
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+HTTP-based batch invariance test: send requests to a running
+vLLM server and compare BS=1 vs BS=N results (tokens and per-step logprobs).
+
+Environment variables:
+  - VLLM_TEST_MODEL: served model name (e.g., Qwen/Qwen3-1.7B / DeepSeek-R1)
+  - VLLM_TP_SIZE: tensor parallelism size (e.g., 4)
+
+"""
+
+import os
+import random
+import sys
+from typing import Any
+
+import openai
+from utils import _random_prompt, skip_unsupported
+
+from tests.utils import RemoteOpenAIServer
+
+
+def _request_completion(
+    client: openai.OpenAI,
+    model: str,
+    prompt: Any,
+    sp: dict[str, Any],
+    max_retries: int = 3,
+    retry_backoff: float = 0.5,
+) -> dict[str, Any] | None:
+    payload: dict[str, Any] = {"model": model, "prompt": prompt}
+    payload.update(sp)
+
+    for attempt in range(max_retries + 1):
+        try:
+            completion = client.completions.create(**payload)
+            # Convert to plain dict so downstream logic can keep using
+            # dict-style access just like with raw HTTP JSON.
+            return completion.model_dump()
+        except Exception as e:  # pragma: no cover
+            if attempt < max_retries:
+                import time as _t
+
+                _t.sleep(retry_backoff * (2**attempt))
+                continue
+            sys.stderr.write(f"Error: {e}\n")
+            return None
+    return None
+
+
+def _extract_tokens_and_logprobs(
+    choice: dict[str, Any],
+) -> tuple[list[Any], list[float] | None]:
+    tokens: list[Any] = []
+    token_logprobs: list[float] | None = None
+    lp = choice.get("logprobs")
+    if lp and isinstance(lp, dict):
+        tokens = lp.get("token_ids") or lp.get("tokens") or []
+        token_logprobs = lp.get("token_logprobs", None)
+    return tokens, token_logprobs
+
+
+def _compare_bs1_vs_bsn_single_process(
+    prompts: list[str],
+    sp_kwargs: dict[str, Any],
+    client: openai.OpenAI,
+    model_name: str,
+) -> None:
+    # BS=1
+    bs1_tokens_per_prompt: list[list[Any]] = []
+    bs1_logprobs_per_prompt: list[list[float] | None] = []
+    for p in prompts:
+        resp = _request_completion(client, model_name, p, sp_kwargs)
+        if resp is None or not resp.get("choices"):
+            raise AssertionError("BS=1 empty/failed response")
+        choice = resp["choices"][0]
+        toks, lps = _extract_tokens_and_logprobs(choice)
+        if lps is None:
+            raise AssertionError(
+                "logprobs not returned; ensure server supports 'logprobs'"
+            )
+        bs1_tokens_per_prompt.append(list(toks))
+        bs1_logprobs_per_prompt.append(list(lps))
+
+    # BS=N
+    bsN_tokens_per_prompt: list[list[Any]] = [None] * len(prompts)  # type: ignore[list-item]
+    bsN_logprobs_per_prompt: list[list[float] | None] = [None] * len(prompts)
+    resp = _request_completion(client, model_name, prompts, sp_kwargs)
+    if resp is None or not resp.get("choices"):
+        raise AssertionError("BS=N empty/failed batched response")
+    choices = resp.get("choices", [])
+    if len(choices) != len(prompts):
+        raise AssertionError(
+            f"BS=N choices length {len(choices)} != num prompts {len(prompts)}"
+        )
+    for idx, choice in enumerate(choices):
+        toks, lps = _extract_tokens_and_logprobs(choice)
+        if lps is None:
+            raise AssertionError(f"BS=N missing logprobs for prompt {idx}")
+        bsN_tokens_per_prompt[idx] = list(toks)
+        bsN_logprobs_per_prompt[idx] = list(lps)
+
+    # compare
+    for i, (tokens_bs1, tokens_bsN, logprobs_bs1, logprobs_bsN) in enumerate(
+        zip(
+            bs1_tokens_per_prompt,
+            bsN_tokens_per_prompt,
+            bs1_logprobs_per_prompt,
+            bsN_logprobs_per_prompt,
+        )
+    ):
+        if tokens_bs1 != tokens_bsN:
+            raise AssertionError(
+                f"Prompt {i} (sampling): Different tokens sampled. "
+                f"BS=1 tokens: {tokens_bs1} BS=N tokens: {tokens_bsN}"
+            )
+        if logprobs_bs1 is None or logprobs_bsN is None:
+            raise AssertionError(f"Prompt {i}: Missing logprobs in one of the runs")
+        if len(logprobs_bs1) != len(logprobs_bsN):
+            raise AssertionError(
+                f"Prompt {i}: Different number of steps: "
+                f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)."
+            )
+        for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
+            if a != b:
+                diff = abs(a - b)
+                raise AssertionError(
+                    f"Prompt {i} Step {t}: Bitwise mismatch "
+                    f"(abs diff={diff:.6e}). "
+                    f"BS=1 tokens: {tokens_bs1} BS=N tokens: {tokens_bsN}"
+                )
+
+
+@skip_unsupported
+def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN():
+    random.seed(int(os.getenv("VLLM_TEST_SEED", "12345")))
+    model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    prompts_all = [_random_prompt(10, 50) for _ in range(32)]
+
+    sp_kwargs: dict[str, Any] = {
+        "temperature": 0.6,
+        "top_p": 1.0,
+        "max_tokens": 8,
+        "seed": 42,
+        "logprobs": 5,
+    }
+
+    tp_size = os.getenv("VLLM_TP_SIZE", "1")
+    server_args: list[str] = []
+    if tp_size:
+        server_args += ["-tp", tp_size]
+
+    with RemoteOpenAIServer(model_name, server_args) as server:
+        client = server.get_client()
+        _compare_bs1_vs_bsn_single_process(
+            prompts=prompts_all,
+            sp_kwargs=sp_kwargs,
+            client=client,
+            model_name=model_name,
+        )
diff --git a/tests/v1/determinism/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py
new file mode 100644
index 0000000000000000000000000000000000000000..390872519528cc4ed9a1dcbc6f7f3cf2568d2a77
--- /dev/null
+++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py
@@ -0,0 +1,310 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test batch-invariant RMS normalization against standard implementations.
+
+This test compares the Triton-based batch-invariant RMS norm implementation
+with the standard CUDA-based implementation to ensure numerical accuracy.
+"""
+
+import pytest
+import torch
+from utils import skip_unsupported
+
+from vllm.model_executor.layers.batch_invariant import rms_norm as triton_rms_norm
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+@skip_unsupported
+@pytest.mark.parametrize("batch_size", [1, 4, 16, 64])
+@pytest.mark.parametrize("hidden_size", [512, 2048, 4096, 8192])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("eps", [1e-6, 1e-5])
+def test_rms_norm_batch_invariant_vs_standard(
+    batch_size: int, hidden_size: int, dtype: torch.dtype, eps: float
+):
+    """
+    Compare batch-invariant Triton RMS norm against standard CUDA implementation.
+
+    Tests that the Triton-based batch-invariant RMS norm produces numerically
+    equivalent results to the standard CUDA implementation across various
+    configurations.
+    """
+    device = torch.device("cuda")
+
+    # Create test input and weight
+    torch.manual_seed(42)
+    input_tensor = torch.randn(batch_size, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+
+    # Standard implementation (CUDA ops)
+    rms_norm_layer = RMSNorm(hidden_size, eps=eps, dtype=dtype).to(device)
+    rms_norm_layer.weight.data = weight.clone()
+
+    standard_output = rms_norm_layer.forward_cuda(input_tensor)
+
+    # Batch-invariant implementation (Triton)
+    triton_output = triton_rms_norm(input_tensor, weight, eps=eps)
+
+    # Compare outputs
+    # Use looser tolerance for bfloat16 due to its lower precision
+    if dtype == torch.bfloat16:
+        rtol, atol = 1e-1, 1e-1  # 10% relative tolerance for bfloat16
+    else:
+        rtol, atol = 1e-2, 1e-2  # 1% for float16/float32
+
+    torch.testing.assert_close(
+        triton_output,
+        standard_output,
+        rtol=rtol,
+        atol=atol,
+        msg=f"RMS norm mismatch for batch_size={batch_size}, "
+        f"hidden_size={hidden_size}, "
+        f"dtype={dtype}, eps={eps}",
+    )
+
+
+@skip_unsupported
+@pytest.mark.parametrize("batch_size", [1, 16, 128])
+@pytest.mark.parametrize("seq_len", [1, 32, 512])
+@pytest.mark.parametrize("hidden_size", [2048, 4096])
+def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
+    """
+    Test RMS norm with 3D input tensors (batch, seq_len, hidden_size).
+
+    Ensures that the batch-invariant RMS norm correctly handles multi-dimensional
+    inputs that are common in transformer models.
+    """
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    eps = 1e-6
+
+    torch.manual_seed(42)
+    input_tensor = torch.randn(
+        batch_size, seq_len, hidden_size, dtype=dtype, device=device
+    )
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+
+    # Standard implementation
+    rms_norm_layer = RMSNorm(hidden_size, eps=eps, dtype=dtype).to(device)
+    rms_norm_layer.weight.data = weight.clone()
+    standard_output = rms_norm_layer.forward_cuda(input_tensor)
+
+    # Batch-invariant implementation
+    triton_output = triton_rms_norm(input_tensor, weight, eps=eps)
+
+    # Use looser tolerance for bfloat16
+    rtol, atol = 1e-1, 1e-1  # 10% tolerance for bfloat16
+
+    torch.testing.assert_close(
+        triton_output,
+        standard_output,
+        rtol=rtol,
+        atol=atol,
+        msg=f"RMS norm mismatch for 3D input with batch_size={batch_size}, "
+        f"seq_len={seq_len}, hidden_size={hidden_size}",
+    )
+
+
+@skip_unsupported
+def test_rms_norm_numerical_stability():
+    """
+    Test RMS norm numerical stability with extreme values.
+
+    Ensures that both implementations handle edge cases like very small or large
+    values without producing NaN or Inf.
+    """
+    device = torch.device("cuda")
+    dtype = torch.float16
+    eps = 1e-6
+    hidden_size = 2048
+
+    # Test cases with extreme values
+    test_cases = [
+        # Very small values
+        torch.ones(4, hidden_size, dtype=dtype, device=device) * 1e-5,
+        # Very large values
+        torch.ones(4, hidden_size, dtype=dtype, device=device) * 1e4,
+        # Mixed small and large
+        torch.randn(4, hidden_size, dtype=dtype, device=device) * 100,
+        # Values near zero
+        torch.randn(4, hidden_size, dtype=dtype, device=device) * 1e-6,
+    ]
+
+    weight = torch.ones(hidden_size, dtype=dtype, device=device)
+
+    for idx, input_tensor in enumerate(test_cases):
+        # Standard implementation
+        rms_norm_layer = RMSNorm(hidden_size, eps=eps, dtype=dtype).to(device)
+        rms_norm_layer.weight.data = weight.clone()
+        standard_output = rms_norm_layer.forward_cuda(input_tensor)
+
+        # Batch-invariant implementation
+        triton_output = triton_rms_norm(input_tensor, weight, eps=eps)
+
+        # Check for NaN or Inf
+        assert not torch.isnan(standard_output).any(), (
+            f"Standard RMS norm produced NaN for test case {idx}"
+        )
+        assert not torch.isinf(standard_output).any(), (
+            f"Standard RMS norm produced Inf for test case {idx}"
+        )
+        assert not torch.isnan(triton_output).any(), (
+            f"Triton RMS norm produced NaN for test case {idx}"
+        )
+        assert not torch.isinf(triton_output).any(), (
+            f"Triton RMS norm produced Inf for test case {idx}"
+        )
+
+        # Compare outputs - very lenient for extreme values with float16
+        torch.testing.assert_close(
+            triton_output,
+            standard_output,
+            rtol=2e-1,  # 20% tolerance for extreme values
+            atol=2e-1,
+            msg=f"RMS norm mismatch for extreme value test case {idx}",
+        )
+
+
+@skip_unsupported
+def test_rms_norm_formula():
+    """
+    Test that RMS norm follows the correct mathematical formula.
+
+    Verifies: output = input / sqrt(mean(input^2) + eps) * weight
+    """
+    device = torch.device("cuda")
+    dtype = torch.float32  # Use float32 for higher precision in formula check
+    eps = 1e-6
+    hidden_size = 1024
+
+    torch.manual_seed(42)
+    input_tensor = torch.randn(8, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+
+    # Compute expected output using the formula
+    variance = (input_tensor.pow(2).mean(dim=-1, keepdim=True)).to(dtype)
+    expected_output = input_tensor * torch.rsqrt(variance + eps) * weight
+
+    # Batch-invariant implementation
+    triton_output = triton_rms_norm(input_tensor, weight, eps=eps)
+
+    # Compare against formula
+    torch.testing.assert_close(
+        triton_output,
+        expected_output,
+        rtol=1e-4,
+        atol=1e-4,
+        msg="Triton RMS norm doesn't match expected formula",
+    )
+
+
+@skip_unsupported
+@pytest.mark.parametrize("hidden_size", [128, 1024, 4096, 16384])
+def test_rms_norm_different_hidden_sizes(hidden_size: int):
+    """
+    Test RMS norm with various hidden sizes to ensure block size handling.
+
+    The Triton kernel uses a fixed BLOCK_SIZE=1024, so this tests that it
+    correctly handles hidden sizes both smaller and larger than the block size.
+    """
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    eps = 1e-6
+    batch_size = 16
+
+    torch.manual_seed(42)
+    input_tensor = torch.randn(batch_size, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+
+    # Standard implementation
+    rms_norm_layer = RMSNorm(hidden_size, eps=eps, dtype=dtype).to(device)
+    rms_norm_layer.weight.data = weight.clone()
+    standard_output = rms_norm_layer.forward_cuda(input_tensor)
+
+    # Batch-invariant implementation
+    triton_output = triton_rms_norm(input_tensor, weight, eps=eps)
+
+    # Use looser tolerance for bfloat16
+    rtol, atol = 1e-1, 1e-1  # 10% tolerance for bfloat16
+
+    torch.testing.assert_close(
+        triton_output,
+        standard_output,
+        rtol=rtol,
+        atol=atol,
+        msg=f"RMS norm mismatch for hidden_size={hidden_size}",
+    )
+
+
+@skip_unsupported
+def test_rms_norm_determinism():
+    """
+    Test that batch-invariant RMS norm produces deterministic results.
+
+    Runs the same input through the kernel multiple times and verifies
+    identical outputs.
+    """
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    eps = 1e-6
+    hidden_size = 4096
+    batch_size = 32
+
+    torch.manual_seed(42)
+    input_tensor = torch.randn(batch_size, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+
+    # Run multiple times
+    outputs = []
+    for _ in range(5):
+        output = triton_rms_norm(input_tensor.clone(), weight, eps=eps)
+        outputs.append(output)
+
+    # All outputs should be identical
+    reference = outputs[0]
+    for idx, output in enumerate(outputs[1:], start=1):
+        torch.testing.assert_close(
+            output,
+            reference,
+            rtol=0.0,
+            atol=0.0,
+            msg=f"RMS norm not deterministic: run {idx} differs from reference",
+        )
+
+
+if __name__ == "__main__":
+    # Run a quick smoke test
+    print("Running quick smoke test of RMS norm implementations...")
+
+    device = torch.device("cuda")
+    batch_size = 8
+    hidden_size = 4096
+    dtype = torch.bfloat16
+    eps = 1e-6
+
+    torch.manual_seed(42)
+    input_tensor = torch.randn(batch_size, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+
+    # Standard implementation
+    rms_norm_layer = RMSNorm(hidden_size, eps=eps, dtype=dtype).to(device)
+    rms_norm_layer.weight.data = weight.clone()
+    standard_output = rms_norm_layer.forward_cuda(input_tensor)
+
+    # Batch-invariant implementation
+    triton_output = triton_rms_norm(input_tensor, weight, eps=eps)
+
+    # Compare
+    max_diff = (triton_output - standard_output).abs().max().item()
+    mean_diff = (triton_output - standard_output).abs().mean().item()
+
+    print(f"Max difference: {max_diff:.6e}")
+    print(f"Mean difference: {mean_diff:.6e}")
+    print(f"Standard output sample: {standard_output[0, :5].tolist()}")
+    print(f"Triton output sample: {triton_output[0, :5].tolist()}")
+
+    if max_diff < 1e-3:
+        print("✓ Smoke test passed!")
+    else:
+        print("✗ Smoke test failed - differences too large")
diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5141837faea043ec7271b12c1be7d24d7025935d
--- /dev/null
+++ b/tests/v1/determinism/utils.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+skip_unsupported = pytest.mark.skipif(
+    not (current_platform.is_cuda() and current_platform.has_device_capability(90)),
+    reason="Requires CUDA and >= Hopper (SM90)",
+)
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+    # Generate more realistic prompts that will actually produce varied tokens
+    # Use a mix of common English text patterns
+
+    prompt_templates = [
+        # Question-answer style
+        "Question: What is the capital of France?\nAnswer: The capital of France is",
+        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+        # Story/narrative style
+        "Once upon a time in a distant galaxy, there lived",
+        "The old man walked slowly down the street, remembering",
+        "In the year 2157, humanity finally discovered",
+        # Technical/code style
+        "To implement a binary search tree in Python, first we need to",
+        "The algorithm works by iterating through the array and",
+        "Here's how to optimize database queries using indexing:",
+        # Factual/informative style
+        "The Renaissance was a period in European history that",
+        "Climate change is caused by several factors including",
+        "The human brain contains approximately 86 billion neurons which",
+        # Conversational style
+        "I've been thinking about getting a new laptop because",
+        "Yesterday I went to the store and bought",
+        "My favorite thing about summer is definitely",
+    ]
+
+    # Pick a random template
+    base_prompt = random.choice(prompt_templates)
+
+    if max_words < min_words:
+        max_words = min_words
+    target_words = random.randint(min_words, max_words)
+
+    if target_words > 50:
+        # For longer prompts, repeat context
+        padding_text = (
+            " This is an interesting topic that deserves more explanation. "
+            * (target_words // 50)
+        )
+        base_prompt = base_prompt + padding_text
+
+    return base_prompt
+
+
+def _extract_step_logprobs(request_output):
+    if getattr(request_output, "outputs", None):
+        inner = request_output.outputs[0]
+        if hasattr(inner, "logprobs") and inner.logprobs is not None:
+            t = torch.tensor(
+                [
+                    inner.logprobs[i][tid].logprob
+                    for i, tid in enumerate(inner.token_ids)
+                ],
+                dtype=torch.float32,
+            )
+            return t, inner.token_ids
+
+    return None, None
diff --git a/vllm/executor/__init__.py b/tests/v1/distributed/__init__.py
similarity index 100%
rename from vllm/executor/__init__.py
rename to tests/v1/distributed/__init__.py
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py
similarity index 58%
rename from tests/v1/test_async_llm_dp.py
rename to tests/v1/distributed/test_async_llm_dp.py
index cef0f362cff86a0079d400e20f826374d326cdfb..60f9017184ea0ebd76a69949f65c093339d57e6c 100644
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/distributed/test_async_llm_dp.py
@@ -5,7 +5,6 @@ import asyncio
 import os
 from contextlib import ExitStack
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 
@@ -17,51 +16,55 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import DPAsyncMPClient
 from vllm.v1.metrics.loggers import StatLoggerBase
-from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, MultiModalCacheStats, SchedulerStats
 
 DP_SIZE = int(os.getenv("DP_SIZE", 2))
 
-engine_args = AsyncEngineArgs(
-    model="ibm-research/PowerMoE-3b",
-    enforce_eager=True,
-    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
-    data_parallel_size=DP_SIZE,
-)
-
 
 async def generate(
-        engine: AsyncLLM,
-        request_id: str,
-        prompt: PromptType,
-        output_kind: RequestOutputKind,
-        max_tokens: int,
-        prompt_logprobs: Optional[int] = None,
-        data_parallel_rank: Optional[int] = None) -> tuple[int, str]:
+    engine: AsyncLLM,
+    request_id: str,
+    prompt: PromptType,
+    output_kind: RequestOutputKind,
+    max_tokens: int,
+    prompt_logprobs: int | None = None,
+    data_parallel_rank: int | None = None,
+) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
 
     count = 0
-    sampling_params = SamplingParams(max_tokens=max_tokens,
-                                     ignore_eos=True,
-                                     output_kind=output_kind,
-                                     temperature=0,
-                                     prompt_logprobs=prompt_logprobs)
-    async for out in engine.generate(request_id=request_id,
-                                     prompt=prompt,
-                                     sampling_params=sampling_params,
-                                     data_parallel_rank=data_parallel_rank):
-
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        ignore_eos=True,
+        output_kind=output_kind,
+        temperature=0,
+        prompt_logprobs=prompt_logprobs,
+    )
+    async for out in engine.generate(
+        request_id=request_id,
+        prompt=prompt,
+        sampling_params=sampling_params,
+        data_parallel_rank=data_parallel_rank,
+    ):
         num_tokens = len(out.outputs[0].token_ids)
         if output_kind == RequestOutputKind.DELTA:
             count += num_tokens
         else:
             count = num_tokens
 
-        await asyncio.sleep(0.)
+        await asyncio.sleep(0.0)
 
     return count, request_id
 
 
+@pytest.mark.parametrize(
+    "model",
+    [
+        "ibm-research/PowerMoE-3b",
+        "hmellor/tiny-random-LlamaForCausalLM",
+    ],
+)
 @pytest.mark.parametrize(
     "output_kind",
     [
@@ -72,9 +75,15 @@ async def generate(
 @pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"])
 @pytest.mark.parametrize("async_scheduling", [True, False])
 @pytest.mark.asyncio
-async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str,
-                    async_scheduling: bool):
-
+async def test_load(
+    model: str,
+    output_kind: RequestOutputKind,
+    data_parallel_backend: str,
+    async_scheduling: bool,
+):
+    if async_scheduling and data_parallel_backend == "ray":
+        # TODO(NickLucche) Re-enable when async scheduling is supported
+        pytest.skip("Async scheduling is not supported with ray")
     stats_loggers = {}
 
     @dataclass
@@ -85,25 +94,33 @@ async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str,
         def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
             stats_loggers[engine_index] = self
 
-        def record(self,
-                   scheduler_stats: Optional[SchedulerStats],
-                   iteration_stats: Optional[IterationStats],
-                   engine_idx: int = 0):
+        def record(
+            self,
+            scheduler_stats: SchedulerStats | None,
+            iteration_stats: IterationStats | None,
+            mm_cache_stats: MultiModalCacheStats | None = None,
+            engine_idx: int = 0,
+        ):
             if iteration_stats:
-                self.finished_req_count += len(
-                    iteration_stats.finished_requests)
+                self.finished_req_count += len(iteration_stats.finished_requests)
 
         def log_engine_initialized(self):
             self.init_count += 1
 
     with ExitStack() as after:
-
         prompt = "This is a test of data parallel"
 
-        engine_args.data_parallel_backend = data_parallel_backend
-        engine_args.async_scheduling = async_scheduling
-        engine = AsyncLLM.from_engine_args(engine_args,
-                                           stat_loggers=[SimpleStatsLogger])
+        engine_args = AsyncEngineArgs(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
+            data_parallel_size=DP_SIZE,
+            data_parallel_backend=data_parallel_backend,
+            async_scheduling=async_scheduling,
+        )
+        engine = AsyncLLM.from_engine_args(
+            engine_args, stat_loggers=[SimpleStatsLogger]
+        )
         after.callback(engine.shutdown)
 
         NUM_REQUESTS = 100
@@ -116,20 +133,23 @@ async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str,
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             NUM_EXPECTED_TOKENS)))
+                    generate(
+                        engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS
+                    )
+                )
+            )
             # Short sleep to ensure that requests are distributed.
             await asyncio.sleep(0.01)
         # Confirm that we got all the EXPECTED tokens from the requests.
-        done, pending = await asyncio.wait(tasks,
-                                           return_when=asyncio.FIRST_EXCEPTION)
+        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
         for task in pending:
             task.cancel()
         for task in done:
             num_generated_tokens, request_id = await task
             assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                 f"{request_id} generated {num_generated_tokens} but "
-                f"expected {NUM_EXPECTED_TOKENS}")
+                f"expected {NUM_EXPECTED_TOKENS}"
+            )
 
         assert not engine.output_processor.has_unfinished_requests()
 
@@ -153,5 +173,6 @@ async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str,
         for sl in stats_loggers.values():
             slogger: SimpleStatsLogger = sl
 
-            assert slogger.finished_req_count > NUM_REQUESTS // (
-                DP_SIZE + 1), f"requests are imbalanced: {stats_loggers}"
+            assert slogger.finished_req_count > NUM_REQUESTS // (DP_SIZE + 1), (
+                f"requests are imbalanced: {stats_loggers}"
+            )
diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py
new file mode 100644
index 0000000000000000000000000000000000000000..866ae742bf3c029195f46130f2174356780bbcce
--- /dev/null
+++ b/tests/v1/distributed/test_dbo.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.
+
+DBO is specifically designed for DP+EP scenarios to hide communication latency
+by overlapping computation of two batches. This test validates that DBO works
+correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
+"""
+
+import pytest
+
+from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+DP_SIZE = 2
+
+# GSM8K eval configuration
+NUM_QUESTIONS = 256  # Fast eval for CI; but must be large enough to hit dbo thresholds
+NUM_SHOTS = 5  # Few-shot examples
+MIN_ACCURACY = 0.62  # Expected 0.64 with 2% buffer (based on vLLM test data)
+
+# Increase max_num_seqs to trigger DBO for decode batches
+# With 64 seqs, decode batches should exceed the 32 token threshold
+MAX_NUM_SEQS = 64  # Increased from 16 to trigger decode DBO
+
+# DeepEP backends to test
+DEEPEP_BACKENDS = [
+    "deepep_low_latency",
+    "deepep_high_throughput",
+]
+
+
+@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
+def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
+    """
+    Test DBO with DP+EP using GSM8K evaluation.
+    """
+    required_gpus = DP_SIZE
+
+    if num_gpus_available < required_gpus:
+        pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")
+
+    # Server arguments for DBO + DP + EP
+    server_args = [
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),  # Use larger batch to trigger decode DBO
+        "--trust-remote-code",
+        # Note: Not using --enforce-eager to test DBO's alternate CUDA graph dispatching
+        "--data-parallel-size",
+        str(DP_SIZE),
+        "--enable-expert-parallel",
+        "--enable-dbo",
+        # Fix threshold so we know we trigger DBO
+        "--dbo-decode-token-threshold",
+        "16",
+        "--dbo-prefill-token-threshold",
+        "256",
+        "--all2all-backend",
+        all2all_backend,
+    ]
+
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        server_args,
+        max_wait_seconds=600,  # Allow time for model loading with DP+EP
+    ) as remote_server:
+        # Use host and port directly from RemoteOpenAIServer
+        host = f"http://{remote_server.host}"
+        port = remote_server.port
+
+        # Run GSM8K evaluation
+        results = evaluate_gsm8k(
+            num_questions=NUM_QUESTIONS,
+            num_shots=NUM_SHOTS,
+            host=host,
+            port=port,
+        )
+
+        # Validate accuracy is reasonable
+        accuracy = results["accuracy"]
+        assert accuracy >= MIN_ACCURACY, (
+            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
+            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
+            f"(correct: {results['num_correct']}/{results['num_questions']})"
+        )
diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/distributed/test_external_lb_dp.py
similarity index 69%
rename from tests/v1/test_external_lb_dp.py
rename to tests/v1/distributed/test_external_lb_dp.py
index 862a76f3c4e2d243fa0115e44f0178395817a32d..912f8cffe7f6d7737cf848f9d6c9312cdf0f6430 100644
--- a/tests/v1/test_external_lb_dp.py
+++ b/tests/v1/distributed/test_external_lb_dp.py
@@ -26,12 +26,14 @@ class ExternalLBServerManager:
     """Manages data parallel vLLM server instances for external
     load balancer testing."""
 
-    def __init__(self,
-                 model_name: str,
-                 dp_size: int,
-                 api_server_count: int,
-                 base_server_args: list,
-                 tp_size: int = TP_SIZE):
+    def __init__(
+        self,
+        model_name: str,
+        dp_size: int,
+        api_server_count: int,
+        base_server_args: list,
+        tp_size: int = TP_SIZE,
+    ):
         self.model_name = model_name
         self.dp_size = dp_size
         self.tp_size = tp_size
@@ -47,20 +49,22 @@ class ExternalLBServerManager:
             server_args = self.base_server_args.copy()
 
             # Add external LB specific arguments
-            server_args.extend([
-                "--data-parallel-size",
-                str(self.dp_size),
-                "--data-parallel-rank",
-                str(rank),
-                "--data-parallel-size-local",
-                "1",
-                "--tensor-parallel-size",
-                str(self.tp_size),
-                "--port",
-                str(8000 + rank),  # Different port for each rank
-                "--api-server-count",
-                str(self.api_server_count),
-            ])
+            server_args.extend(
+                [
+                    "--data-parallel-size",
+                    str(self.dp_size),
+                    "--data-parallel-rank",
+                    str(rank),
+                    "--data-parallel-size-local",
+                    "1",
+                    "--tensor-parallel-size",
+                    str(self.tp_size),
+                    "--port",
+                    str(8000 + rank),  # Different port for each rank
+                    "--api-server-count",
+                    str(self.api_server_count),
+                ]
+            )
 
             # Use a thread to start each server to allow parallel initialization
             def start_server(r: int, sargs: list[str]):
@@ -71,25 +75,24 @@ class ExternalLBServerManager:
                         sargs,
                         auto_port=False,
                         env_dict={
-                            "VLLM_SERVER_DEV_MODE":
-                            "1",
-                            current_platform.device_control_env_var:
-                            ",".join(
-                                str(
-                                    current_platform.
-                                    device_id_to_physical_device_id(i))
-                                for i in range(r * TP_SIZE, (r + 1) * TP_SIZE))
-                        })
+                            "VLLM_SERVER_DEV_MODE": "1",
+                            current_platform.device_control_env_var: ",".join(
+                                str(current_platform.device_id_to_physical_device_id(i))
+                                for i in range(r * TP_SIZE, (r + 1) * TP_SIZE)
+                            ),
+                        },
+                    )
                     server.__enter__()
-                    print(f"Server rank {r} started successfully with "
-                          f"{self.api_server_count} API servers")
+                    print(
+                        f"Server rank {r} started successfully with "
+                        f"{self.api_server_count} API servers"
+                    )
                     self.servers.append((server, sargs))
                 except Exception as e:
                     print(f"Failed to start server rank {r}: {e}")
                     raise
 
-            thread = threading.Thread(target=start_server,
-                                      args=(rank, server_args))
+            thread = threading.Thread(target=start_server, args=(rank, server_args))
             thread.start()
 
             self.server_threads.append(thread)
@@ -132,9 +135,9 @@ def default_server_args():
 @pytest.fixture(scope="module", params=[1, 4])
 def server_manager(request, default_server_args):
     api_server_count = request.param
-    server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE,
-                                             api_server_count,
-                                             default_server_args)
+    server_manager = ExternalLBServerManager(
+        MODEL_NAME, DP_SIZE, api_server_count, default_server_args
+    )
 
     with server_manager:
         yield server_manager
@@ -174,18 +177,16 @@ def test_external_lb_server_info(server_manager):
         # `n_reqs` is set so that there is a good chance each server
         # receives at least one request
         n_reqs = 2 * api_server_count * api_server_count
-        parallel_configs = [
-            _get_parallel_config(server) for _ in range(n_reqs)
-        ]
-        api_process_counts = [
-            c["_api_process_count"] for c in parallel_configs
-        ]
+        parallel_configs = [_get_parallel_config(server) for _ in range(n_reqs)]
+        api_process_counts = [c["_api_process_count"] for c in parallel_configs]
         api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
 
-        assert all(c == api_server_count
-                   for c in api_process_counts), api_process_counts
-        assert all(0 <= r < api_server_count
-                   for r in api_process_ranks), api_process_ranks
+        assert all(c == api_server_count for c in api_process_counts), (
+            api_process_counts
+        )
+        assert all(0 <= r < api_server_count for r in api_process_ranks), (
+            api_process_ranks
+        )
 
 
 @pytest.mark.asyncio
@@ -193,16 +194,15 @@ def test_external_lb_server_info(server_manager):
     "model_name",
     [MODEL_NAME],
 )
-async def test_external_lb_single_completion(clients: list[
-    openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
-                                             model_name: str) -> None:
-
+async def test_external_lb_single_completion(
+    clients: list[openai.AsyncOpenAI],
+    servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     async def make_request(client: openai.AsyncOpenAI):
         completion = await client.completions.create(
-            model=model_name,
-            prompt="Hello, my name is",
-            max_tokens=10,
-            temperature=1.0)
+            model=model_name, prompt="Hello, my name is", max_tokens=10, temperature=1.0
+        )
 
         assert completion.id is not None
         assert completion.choices is not None and len(completion.choices) == 1
@@ -256,11 +256,14 @@ async def test_external_lb_single_completion(clients: list[
 
     _, server_args = servers[0]
     api_server_count = (
-        server_args.count('--api-server-count')
-        and server_args[server_args.index('--api-server-count') + 1] or 1)
+        server_args.count("--api-server-count")
+        and server_args[server_args.index("--api-server-count") + 1]
+        or 1
+    )
     print(
         f"Successfully completed external LB test with {len(clients)} servers "
-        f"(API server count: {api_server_count})")
+        f"(API server count: {api_server_count})"
+    )
 
 
 @pytest.mark.asyncio
@@ -268,9 +271,11 @@ async def test_external_lb_single_completion(clients: list[
     "model_name",
     [MODEL_NAME],
 )
-async def test_external_lb_completion_streaming(clients: list[
-    openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
-                                                model_name: str) -> None:
+async def test_external_lb_completion_streaming(
+    clients: list[openai.AsyncOpenAI],
+    servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     prompt = "What is an LLM?"
 
     async def make_streaming_request(client: openai.AsyncOpenAI):
@@ -284,11 +289,9 @@ async def test_external_lb_completion_streaming(clients: list[
         single_output = single_completion.choices[0].text
 
         # Perform the streaming request
-        stream = await client.completions.create(model=model_name,
-                                                 prompt=prompt,
-                                                 max_tokens=5,
-                                                 temperature=0.0,
-                                                 stream=True)
+        stream = await client.completions.create(
+            model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True
+        )
         chunks: list[str] = []
         finish_reason_count = 0
         last_chunk = None
@@ -299,16 +302,15 @@ async def test_external_lb_completion_streaming(clients: list[
             last_chunk = chunk  # Keep track of the last chunk
 
         # finish reason should only return in the last block for OpenAI API
-        assert finish_reason_count == 1, (
-            "Finish reason should appear exactly once.")
-        assert last_chunk is not None, (
-            "Stream should have yielded at least one chunk.")
-        assert last_chunk.choices[
-            0].finish_reason == "length", "Finish reason should be 'length'."
+        assert finish_reason_count == 1, "Finish reason should appear exactly once."
+        assert last_chunk is not None, "Stream should have yielded at least one chunk."
+        assert last_chunk.choices[0].finish_reason == "length", (
+            "Finish reason should be 'length'."
+        )
         # Check that the combined text matches the non-streamed version.
-        assert "".join(
-            chunks
-        ) == single_output, "Streamed output should match non-streamed output."
+        assert "".join(chunks) == single_output, (
+            "Streamed output should match non-streamed output."
+        )
         return True  # Indicate success for this request
 
     # Test single request to each server
@@ -324,10 +326,7 @@ async def test_external_lb_completion_streaming(clients: list[
     all_tasks = []
 
     for i, client in enumerate(clients):
-        tasks = [
-            make_streaming_request(client)
-            for _ in range(num_requests_per_server)
-        ]
+        tasks = [make_streaming_request(client) for _ in range(num_requests_per_server)]
         all_tasks.extend(tasks)
 
     results = await asyncio.gather(*all_tasks)
@@ -339,10 +338,7 @@ async def test_external_lb_completion_streaming(clients: list[
     # Second burst of streaming requests
     all_tasks = []
     for i, client in enumerate(clients):
-        tasks = [
-            make_streaming_request(client)
-            for _ in range(num_requests_per_server)
-        ]
+        tasks = [make_streaming_request(client) for _ in range(num_requests_per_server)]
         all_tasks.extend(tasks)
 
     results = await asyncio.gather(*all_tasks)
@@ -351,7 +347,11 @@ async def test_external_lb_completion_streaming(clients: list[
 
     _, server_args = servers[0]
     api_server_count = (
-        server_args.count('--api-server-count')
-        and server_args[server_args.index('--api-server-count') + 1] or 1)
-    print(f"Successfully completed external LB streaming test with "
-          f"{len(clients)} servers (API server count: {api_server_count})")
+        server_args.count("--api-server-count")
+        and server_args[server_args.index("--api-server-count") + 1]
+        or 1
+    )
+    print(
+        f"Successfully completed external LB streaming test with "
+        f"{len(clients)} servers (API server count: {api_server_count})"
+    )
diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/distributed/test_hybrid_lb_dp.py
similarity index 69%
rename from tests/v1/test_hybrid_lb_dp.py
rename to tests/v1/distributed/test_hybrid_lb_dp.py
index 552436f818d7765af1248db16918b59b27c6e2b4..aa25130752a49de1e3bcfe61b213f815a0fc7b4c 100644
--- a/tests/v1/test_hybrid_lb_dp.py
+++ b/tests/v1/distributed/test_hybrid_lb_dp.py
@@ -12,7 +12,7 @@ import pytest_asyncio
 import requests
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
@@ -28,17 +28,19 @@ DP_SIZE_LOCAL = DP_SIZE // NUM_NODES  # 2 ranks per node
 
 
 class HybridLBServerManager:
-    """Manages hybrid data parallel vLLM server instances where each node 
-    runs a single logical API server that balances requests only to the 
+    """Manages hybrid data parallel vLLM server instances where each node
+    runs a single logical API server that balances requests only to the
     DP engines running on that same node."""
 
-    def __init__(self,
-                 model_name: str,
-                 dp_size: int,
-                 api_server_count: int,
-                 base_server_args: list,
-                 dp_size_local: int = DP_SIZE_LOCAL,
-                 tp_size: int = TP_SIZE):
+    def __init__(
+        self,
+        model_name: str,
+        dp_size: int,
+        api_server_count: int,
+        base_server_args: list,
+        dp_size_local: int = DP_SIZE_LOCAL,
+        tp_size: int = TP_SIZE,
+    ):
         self.model_name = model_name
         self.dp_size = dp_size
         self.dp_size_local = dp_size_local
@@ -59,25 +61,27 @@ class HybridLBServerManager:
             start_rank = node_id * self.dp_size_local
 
             # Add hybrid LB specific arguments
-            server_args.extend([
-                "--data-parallel-size",
-                str(self.dp_size),
-                "--data-parallel-size-local",
-                str(self.dp_size_local),
-                "--data-parallel-start-rank",
-                str(start_rank),
-                "--data-parallel-hybrid-lb",  # Enable hybrid LB mode
-                "--tensor-parallel-size",
-                str(self.tp_size),
-                "--port",
-                str(8000 + node_id),  # Different port for each node
-                "--api-server-count",
-                str(self.api_server_count),
-                "--data-parallel-address",
-                "127.0.0.1",
-                "--data-parallel-rpc-port",
-                "13345",
-            ])
+            server_args.extend(
+                [
+                    "--data-parallel-size",
+                    str(self.dp_size),
+                    "--data-parallel-size-local",
+                    str(self.dp_size_local),
+                    "--data-parallel-start-rank",
+                    str(start_rank),
+                    "--data-parallel-hybrid-lb",  # Enable hybrid LB mode
+                    "--tensor-parallel-size",
+                    str(self.tp_size),
+                    "--port",
+                    str(8000 + node_id),  # Different port for each node
+                    "--api-server-count",
+                    str(self.api_server_count),
+                    "--data-parallel-address",
+                    "127.0.0.1",
+                    "--data-parallel-rpc-port",
+                    "13345",
+                ]
+            )
 
             # Use a thread to start each server to allow parallel initialization
             def start_server(node: int, sargs: list[str]):
@@ -93,26 +97,25 @@ class HybridLBServerManager:
                         sargs,
                         auto_port=False,
                         env_dict={
-                            "VLLM_SERVER_DEV_MODE":
-                            "1",
-                            current_platform.device_control_env_var:
-                            ",".join(
-                                str(
-                                    current_platform.
-                                    device_id_to_physical_device_id(i))
-                                for i in range(gpu_start, gpu_end))
-                        })
+                            "VLLM_SERVER_DEV_MODE": "1",
+                            current_platform.device_control_env_var: ",".join(
+                                str(current_platform.device_id_to_physical_device_id(i))
+                                for i in range(gpu_start, gpu_end)
+                            ),
+                        },
+                    )
                     server.__enter__()
-                    print(f"Hybrid LB node {node} started successfully with "
-                          f"{self.dp_size_local} local DP ranks and "
-                          f"{self.api_server_count} API servers")
+                    print(
+                        f"Hybrid LB node {node} started successfully with "
+                        f"{self.dp_size_local} local DP ranks and "
+                        f"{self.api_server_count} API servers"
+                    )
                     self.servers.append((server, sargs))
                 except Exception as e:
                     print(f"Failed to start hybrid LB node {node}: {e}")
                     raise
 
-            thread = threading.Thread(target=start_server,
-                                      args=(node_id, server_args))
+            thread = threading.Thread(target=start_server, args=(node_id, server_args))
             thread.start()
 
             self.server_threads.append(thread)
@@ -155,10 +158,14 @@ def default_server_args():
 @pytest.fixture(scope="module", params=[1, 4])
 def server_manager(request, default_server_args):
     api_server_count = request.param
-    server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE,
-                                           api_server_count,
-                                           default_server_args, DP_SIZE_LOCAL,
-                                           TP_SIZE)
+    server_manager = HybridLBServerManager(
+        MODEL_NAME,
+        DP_SIZE,
+        api_server_count,
+        default_server_args,
+        DP_SIZE_LOCAL,
+        TP_SIZE,
+    )
 
     with server_manager:
         yield server_manager
@@ -198,18 +205,16 @@ def test_hybrid_dp_server_info(server_manager):
         # `n_reqs` is set so that there is a good chance each server
         # receives at least one request
         n_reqs = 2 * api_server_count * api_server_count
-        parallel_configs = [
-            _get_parallel_config(server) for _ in range(n_reqs)
-        ]
-        api_process_counts = [
-            c["_api_process_count"] for c in parallel_configs
-        ]
+        parallel_configs = [_get_parallel_config(server) for _ in range(n_reqs)]
+        api_process_counts = [c["_api_process_count"] for c in parallel_configs]
         api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
 
-        assert all(c == api_server_count
-                   for c in api_process_counts), api_process_counts
-        assert all(0 <= r < api_server_count
-                   for r in api_process_ranks), api_process_ranks
+        assert all(c == api_server_count for c in api_process_counts), (
+            api_process_counts
+        )
+        assert all(0 <= r < api_server_count for r in api_process_ranks), (
+            api_process_ranks
+        )
 
 
 @pytest.mark.asyncio
@@ -217,17 +222,15 @@ def test_hybrid_dp_server_info(server_manager):
     "model_name",
     [MODEL_NAME],
 )
-async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
-                                    servers: list[tuple[RemoteOpenAIServer,
-                                                        list[str]]],
-                                    model_name: str) -> None:
-
+async def test_hybrid_lb_completion(
+    clients: list[openai.AsyncOpenAI],
+    servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     async def make_request(client: openai.AsyncOpenAI):
         completion = await client.completions.create(
-            model=model_name,
-            prompt="Hello, my name is",
-            max_tokens=5,
-            temperature=1.0)
+            model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=1.0
+        )
 
         assert completion.id is not None
         assert completion.choices is not None and len(completion.choices) == 1
@@ -251,9 +254,7 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
     for i, client in enumerate(clients):
         result = await make_request(client)
         assert result is not None
-        print(
-            f"Hybrid LB node {i} handled single completion request successfully"
-        )
+        print(f"Hybrid LB node {i} handled single completion request successfully")
 
     await asyncio.sleep(0.5)
 
@@ -284,8 +285,10 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
 
     _, server_args = servers[0]
     api_server_count = (
-        server_args.count('--api-server-count')
-        and server_args[server_args.index('--api-server-count') + 1] or 1)
+        server_args.count("--api-server-count")
+        and server_args[server_args.index("--api-server-count") + 1]
+        or 1
+    )
     print(
         f"Successfully completed hybrid LB test with {len(clients)} nodes "
         f"({DP_SIZE_LOCAL} DP ranks each, API server count: {api_server_count})"
@@ -302,9 +305,11 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
     "model_name",
     [MODEL_NAME],
 )
-async def test_hybrid_lb_completion_streaming(clients: list[
-    openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
-                                              model_name: str) -> None:
+async def test_hybrid_lb_completion_streaming(
+    clients: list[openai.AsyncOpenAI],
+    servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     prompt = "What is an LLM?"
 
     async def make_streaming_request(client: openai.AsyncOpenAI):
@@ -318,11 +323,9 @@ async def test_hybrid_lb_completion_streaming(clients: list[
         single_output = single_completion.choices[0].text
 
         # Perform the streaming request
-        stream = await client.completions.create(model=model_name,
-                                                 prompt=prompt,
-                                                 max_tokens=5,
-                                                 temperature=0.0,
-                                                 stream=True)
+        stream = await client.completions.create(
+            model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True
+        )
         chunks: list[str] = []
         finish_reason_count = 0
         last_chunk = None
@@ -333,25 +336,22 @@ async def test_hybrid_lb_completion_streaming(clients: list[
             last_chunk = chunk  # Keep track of the last chunk
 
         # finish reason should only return in the last block for OpenAI API
-        assert finish_reason_count == 1, (
-            "Finish reason should appear exactly once.")
-        assert last_chunk is not None, (
-            "Stream should have yielded at least one chunk.")
-        assert last_chunk.choices[
-            0].finish_reason == "length", "Finish reason should be 'length'."
+        assert finish_reason_count == 1, "Finish reason should appear exactly once."
+        assert last_chunk is not None, "Stream should have yielded at least one chunk."
+        assert last_chunk.choices[0].finish_reason == "length", (
+            "Finish reason should be 'length'."
+        )
         # Check that the combined text matches the non-streamed version.
-        assert "".join(
-            chunks
-        ) == single_output, "Streamed output should match non-streamed output."
+        assert "".join(chunks) == single_output, (
+            "Streamed output should match non-streamed output."
+        )
         return True  # Indicate success for this request
 
     # Test single request to each node
     for i, client in enumerate(clients):
         result = await make_streaming_request(client)
         assert result is not None
-        print(
-            f"Hybrid LB node {i} handled single streaming request successfully"
-        )
+        print(f"Hybrid LB node {i} handled single streaming request successfully")
 
     await asyncio.sleep(0.5)
 
@@ -382,11 +382,15 @@ async def test_hybrid_lb_completion_streaming(clients: list[
 
     _, server_args = servers[0]
     api_server_count = (
-        server_args.count('--api-server-count')
-        and server_args[server_args.index('--api-server-count') + 1] or 1)
-    print(f"Successfully completed hybrid LB streaming test with "
-          f"{len(clients)} nodes ({DP_SIZE_LOCAL} DP ranks each, "
-          f"API server count: {api_server_count})")
+        server_args.count("--api-server-count")
+        and server_args[server_args.index("--api-server-count") + 1]
+        or 1
+    )
+    print(
+        f"Successfully completed hybrid LB streaming test with "
+        f"{len(clients)} nodes ({DP_SIZE_LOCAL} DP ranks each, "
+        f"API server count: {api_server_count})"
+    )
 
     # Check request balancing within each node
     for i, (server, _) in enumerate(servers):
diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/distributed/test_internal_lb_dp.py
similarity index 66%
rename from tests/v1/test_internal_lb_dp.py
rename to tests/v1/distributed/test_internal_lb_dp.py
index e965645711ee636c16cc689694f1eb8710dbde79..8f7459e95ef67fcb80e00705cb68fb80168e3699 100644
--- a/tests/v1/test_internal_lb_dp.py
+++ b/tests/v1/distributed/test_internal_lb_dp.py
@@ -5,7 +5,7 @@ import os
 import threading
 import time
 import traceback
-from typing import Optional, cast
+from typing import cast
 
 import openai  # use the official client for correctness check
 import pytest
@@ -13,7 +13,7 @@ import pytest_asyncio
 import requests
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
@@ -31,66 +31,71 @@ class MultinodeInternalLBServerManager:
     """Manages multi-node data parallel vLLM server instances for internal
     load balancer testing using --headless mode."""
 
-    def __init__(self,
-                 model_name: str,
-                 dp_size: int,
-                 api_server_count: int,
-                 base_server_args: list,
-                 dp_per_node: int = 1,
-                 tp_size: int = TP_SIZE):
+    def __init__(
+        self,
+        model_name: str,
+        dp_size: int,
+        api_server_count: int,
+        base_server_args: list,
+        dp_per_node: int = 1,
+        tp_size: int = TP_SIZE,
+    ):
         self.model_name = model_name
         self.dp_size = dp_size
         self.dp_per_node = dp_per_node
         self.tp_size = tp_size
         self.api_server_count = api_server_count
         self.base_server_args = base_server_args
-        self.servers: list[Optional[tuple[RemoteOpenAIServer,
-                                          list[str]]]] = [None] * (dp_size //
-                                                                   dp_per_node)
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]] | None] = [None] * (
+            dp_size // dp_per_node
+        )
         self.server_threads: list[threading.Thread] = []
 
     def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
         """Start all server instances for multi-node internal LB mode."""
-        for server_idx, rank in enumerate(
-                range(0, self.dp_size, self.dp_per_node)):
+        for server_idx, rank in enumerate(range(0, self.dp_size, self.dp_per_node)):
             # Create server args for this specific rank
             server_args = self.base_server_args.copy()
 
             if rank == 0:
                 # Head node - runs API server and first DP rank
-                server_args.extend([
-                    "--data-parallel-size",
-                    str(self.dp_size),
-                    "--data-parallel-size-local",
-                    str(self.dp_per_node),
-                    "--tensor-parallel-size",
-                    str(self.tp_size),
-                    "--port",
-                    "8000",  # Single endpoint for all requests
-                    "--api-server-count",
-                    str(self.api_server_count),
-                    "--data-parallel-address",
-                    "127.0.0.1",
-                    "--data-parallel-rpc-port",
-                    "13345",
-                ])
+                server_args.extend(
+                    [
+                        "--data-parallel-size",
+                        str(self.dp_size),
+                        "--data-parallel-size-local",
+                        str(self.dp_per_node),
+                        "--tensor-parallel-size",
+                        str(self.tp_size),
+                        "--port",
+                        "8000",  # Single endpoint for all requests
+                        "--api-server-count",
+                        str(self.api_server_count),
+                        "--data-parallel-address",
+                        "127.0.0.1",
+                        "--data-parallel-rpc-port",
+                        "13345",
+                    ]
+                )
             else:
                 # Secondary nodes - run in headless mode
-                server_args.extend([
-                    "--headless",
-                    "--data-parallel-size",
-                    str(self.dp_size),
-                    "--data-parallel-size-local",
-                    str(self.dp_per_node),
-                    "--data-parallel-start-rank",
-                    str(rank),
-                    "--tensor-parallel-size",
-                    str(self.tp_size),
-                    "--data-parallel-address",
-                    "127.0.0.1",
-                    "--data-parallel-rpc-port",
-                    "13345",
-                ])
+                server_args.extend(
+                    [
+                        "--headless",
+                        "--data-parallel-size",
+                        str(self.dp_size),
+                        "--data-parallel-size-local",
+                        str(self.dp_per_node),
+                        "--data-parallel-start-rank",
+                        str(rank),
+                        "--tensor-parallel-size",
+                        str(self.tp_size),
+                        "--data-parallel-address",
+                        "127.0.0.1",
+                        "--data-parallel-rpc-port",
+                        "13345",
+                    ]
+                )
 
             # Use a thread to start each server to allow parallel initialization
             def start_server(sidx: int, r: int, sargs: list[str]):
@@ -102,20 +107,19 @@ class MultinodeInternalLBServerManager:
                         sargs,
                         auto_port=False,
                         env_dict={
-                            "VLLM_SERVER_DEV_MODE":
-                            "1",
-                            current_platform.device_control_env_var:
-                            ",".join(
-                                str(
-                                    current_platform.
-                                    device_id_to_physical_device_id(i))
-                                for i in range(r, r + gpus_per_node))
-                        })
+                            "VLLM_SERVER_DEV_MODE": "1",
+                            current_platform.device_control_env_var: ",".join(
+                                str(current_platform.device_id_to_physical_device_id(i))
+                                for i in range(r, r + gpus_per_node)
+                            ),
+                        },
+                    )
                     server.__enter__()
                     if r == 0:
                         print(
                             f"Head node (rank {r}) started successfully with "
-                            f"{self.api_server_count} API servers")
+                            f"{self.api_server_count} API servers"
+                        )
                     else:
                         print(f"Headless node (rank {r}) started successfully")
                     self.servers[sidx] = (server, sargs)
@@ -124,8 +128,9 @@ class MultinodeInternalLBServerManager:
                     traceback.print_exc()
                     raise
 
-            thread = threading.Thread(target=start_server,
-                                      args=(server_idx, rank, server_args))
+            thread = threading.Thread(
+                target=start_server, args=(server_idx, rank, server_args)
+            )
             thread.start()
 
             self.server_threads.append(thread)
@@ -157,19 +162,20 @@ class APIOnlyServerManager:
     """Manages API-only server (Node 0) and headless engines server (Node 1)
     for testing separated API server and engine configuration."""
 
-    def __init__(self,
-                 model_name: str,
-                 dp_size: int,
-                 api_server_count: int,
-                 base_server_args: list,
-                 tp_size: int = TP_SIZE):
+    def __init__(
+        self,
+        model_name: str,
+        dp_size: int,
+        api_server_count: int,
+        base_server_args: list,
+        tp_size: int = TP_SIZE,
+    ):
         self.model_name = model_name
         self.dp_size = dp_size
         self.tp_size = tp_size
         self.api_server_count = api_server_count
         self.base_server_args = base_server_args
-        self.servers: list[Optional[tuple[RemoteOpenAIServer,
-                                          list[str]]]] = [None] * 2
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]] | None] = [None] * 2
         self.server_threads: list[threading.Thread] = []
 
     def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
@@ -177,38 +183,42 @@ class APIOnlyServerManager:
 
         # Start API-only server (Node 0) - no engines, only API server
         api_server_args = self.base_server_args.copy()
-        api_server_args.extend([
-            "--data-parallel-size",
-            str(self.dp_size),
-            "--data-parallel-size-local",
-            "0",  # No engines on this node
-            "--tensor-parallel-size",
-            str(self.tp_size),
-            "--port",
-            "8000",
-            "--api-server-count",
-            str(self.api_server_count),
-            "--data-parallel-address",
-            "127.0.0.1",
-            "--data-parallel-rpc-port",
-            "13345",
-        ])
+        api_server_args.extend(
+            [
+                "--data-parallel-size",
+                str(self.dp_size),
+                "--data-parallel-size-local",
+                "0",  # No engines on this node
+                "--tensor-parallel-size",
+                str(self.tp_size),
+                "--port",
+                "8000",
+                "--api-server-count",
+                str(self.api_server_count),
+                "--data-parallel-address",
+                "127.0.0.1",
+                "--data-parallel-rpc-port",
+                "13345",
+            ]
+        )
 
         # Start headless engines server (Node 1) - all engines, no API server
         engines_server_args = self.base_server_args.copy()
-        engines_server_args.extend([
-            "--headless",
-            "--data-parallel-size",
-            str(self.dp_size),
-            "--data-parallel-size-local",
-            str(self.dp_size),  # All engines on this node
-            "--tensor-parallel-size",
-            str(self.tp_size),
-            "--data-parallel-address",
-            "127.0.0.1",
-            "--data-parallel-rpc-port",
-            "13345",
-        ])
+        engines_server_args.extend(
+            [
+                "--headless",
+                "--data-parallel-size",
+                str(self.dp_size),
+                "--data-parallel-size-local",
+                str(self.dp_size),  # All engines on this node
+                "--tensor-parallel-size",
+                str(self.tp_size),
+                "--data-parallel-address",
+                "127.0.0.1",
+                "--data-parallel-rpc-port",
+                "13345",
+            ]
+        )
 
         # Use threads to start both servers in parallel
         def start_api_server():
@@ -220,10 +230,13 @@ class APIOnlyServerManager:
                     env_dict={
                         "VLLM_SERVER_DEV_MODE": "1",
                         # No GPUs needed for API-only server
-                    })
+                    },
+                )
                 server.__enter__()
-                print(f"API-only server started successfully with "
-                      f"{self.api_server_count} API servers")
+                print(
+                    f"API-only server started successfully with "
+                    f"{self.api_server_count} API servers"
+                )
                 self.servers[0] = (server, api_server_args)
             except Exception as e:
                 print(f"Failed to start API-only server: {e}")
@@ -236,16 +249,17 @@ class APIOnlyServerManager:
                     engines_server_args,
                     auto_port=False,
                     env_dict={
-                        current_platform.device_control_env_var:
-                        ",".join(
-                            str(
-                                current_platform.
-                                device_id_to_physical_device_id(i))
-                            for i in range(self.dp_size * self.tp_size))
-                    })
+                        current_platform.device_control_env_var: ",".join(
+                            str(current_platform.device_id_to_physical_device_id(i))
+                            for i in range(self.dp_size * self.tp_size)
+                        )
+                    },
+                )
                 server.__enter__()
-                print(f"Headless engines server started successfully with "
-                      f"{self.dp_size} engines")
+                print(
+                    f"Headless engines server started successfully with "
+                    f"{self.dp_size} engines"
+                )
                 self.servers[1] = (server, engines_server_args)
             except Exception as e:
                 print(f"Failed to start headless engines server: {e}")
@@ -301,11 +315,14 @@ def default_server_args():
 @pytest.fixture(scope="module", params=[1, 4])
 def server_manager(request, default_server_args):
     api_server_count = request.param
-    server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
-                                                      api_server_count,
-                                                      default_server_args,
-                                                      DP_SIZE // NUM_NODES,
-                                                      TP_SIZE)
+    server_manager = MultinodeInternalLBServerManager(
+        MODEL_NAME,
+        DP_SIZE,
+        api_server_count,
+        default_server_args,
+        DP_SIZE // NUM_NODES,
+        TP_SIZE,
+    )
 
     with server_manager:
         yield server_manager
@@ -320,8 +337,9 @@ def servers(server_manager):
 def api_only_servers(request, default_server_args):
     """Fixture for API-only server + headless engines configuration."""
     api_server_count = request.param
-    with APIOnlyServerManager(MODEL_NAME, DP_SIZE, api_server_count,
-                              default_server_args, TP_SIZE) as server_list:
+    with APIOnlyServerManager(
+        MODEL_NAME, DP_SIZE, api_server_count, default_server_args, TP_SIZE
+    ) as server_list:
         yield server_list
 
 
@@ -335,8 +353,7 @@ async def client(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
 
 
 @pytest_asyncio.fixture
-async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
-                                                       list[str]]]):
+async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer, list[str]]]):
     """Client fixture for API-only server configuration."""
     # Connect to the API-only server (first server in the list)
     api_server = api_only_servers[0][0]
@@ -360,16 +377,12 @@ def test_multinode_dp_server_info(server_manager):
     # `n_reqs` is set so that there is a good chance each server
     # receives at least one request
     n_reqs = 2 * api_server_count * api_server_count
-    parallel_configs = [
-        _get_parallel_config(head_server) for _ in range(n_reqs)
-    ]
+    parallel_configs = [_get_parallel_config(head_server) for _ in range(n_reqs)]
     api_process_counts = [c["_api_process_count"] for c in parallel_configs]
     api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
 
-    assert all(c == api_server_count
-               for c in api_process_counts), api_process_counts
-    assert all(0 <= r < api_server_count
-               for r in api_process_ranks), api_process_ranks
+    assert all(c == api_server_count for c in api_process_counts), api_process_counts
+    assert all(0 <= r < api_server_count for r in api_process_ranks), api_process_ranks
 
 
 @pytest.mark.asyncio
@@ -377,17 +390,15 @@ def test_multinode_dp_server_info(server_manager):
     "model_name",
     [MODEL_NAME],
 )
-async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
-                                       servers: list[tuple[RemoteOpenAIServer,
-                                                           list[str]]],
-                                       model_name: str) -> None:
-
+async def test_multinode_dp_completion(
+    client: openai.AsyncOpenAI,
+    servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     async def make_request():
         completion = await client.completions.create(
-            model=model_name,
-            prompt="Hello, my name is",
-            max_tokens=5,
-            temperature=1.0)
+            model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=1.0
+        )
 
         assert completion.id is not None
         assert completion.choices is not None and len(completion.choices) == 1
@@ -410,9 +421,7 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
     # Test single request
     result = await make_request()
     assert result is not None
-    print(
-        "Multi-node internal LB handled single completion request successfully"
-    )
+    print("Multi-node internal LB handled single completion request successfully")
 
     await asyncio.sleep(0.5)
 
@@ -441,10 +450,14 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
 
     _, server_args = servers[0]
     api_server_count = (
-        server_args.count('--api-server-count')
-        and server_args[server_args.index('--api-server-count') + 1] or 1)
-    print(f"Successfully completed multi-node internal LB test with "
-          f"{len(servers)} DP ranks (API server count: {api_server_count})")
+        server_args.count("--api-server-count")
+        and server_args[server_args.index("--api-server-count") + 1]
+        or 1
+    )
+    print(
+        f"Successfully completed multi-node internal LB test with "
+        f"{len(servers)} DP ranks (API server count: {api_server_count})"
+    )
 
     # Check request balancing via Prometheus metrics
     head_server = servers[0][0]
@@ -456,11 +469,11 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME],
 )
-async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
-                                                 servers: list[
-                                                     tuple[RemoteOpenAIServer,
-                                                           list[str]]],
-                                                 model_name: str) -> None:
+async def test_multinode_dp_completion_streaming(
+    client: openai.AsyncOpenAI,
+    servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     prompt = "What is an LLM?"
 
     async def make_streaming_request():
@@ -474,11 +487,9 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
         single_output = single_completion.choices[0].text
 
         # Perform the streaming request
-        stream = await client.completions.create(model=model_name,
-                                                 prompt=prompt,
-                                                 max_tokens=5,
-                                                 temperature=0.0,
-                                                 stream=True)
+        stream = await client.completions.create(
+            model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True
+        )
         chunks: list[str] = []
         finish_reason_count = 0
         last_chunk = None
@@ -489,23 +500,21 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
             last_chunk = chunk  # Keep track of the last chunk
 
         # finish reason should only return in the last block for OpenAI API
-        assert finish_reason_count == 1, (
-            "Finish reason should appear exactly once.")
-        assert last_chunk is not None, (
-            "Stream should have yielded at least one chunk.")
-        assert last_chunk.choices[
-            0].finish_reason == "length", "Finish reason should be 'length'."
+        assert finish_reason_count == 1, "Finish reason should appear exactly once."
+        assert last_chunk is not None, "Stream should have yielded at least one chunk."
+        assert last_chunk.choices[0].finish_reason == "length", (
+            "Finish reason should be 'length'."
+        )
         # Check that the combined text matches the non-streamed version.
-        assert "".join(
-            chunks
-        ) == single_output, "Streamed output should match non-streamed output."
+        assert "".join(chunks) == single_output, (
+            "Streamed output should match non-streamed output."
+        )
         return True  # Indicate success for this request
 
     # Test single streaming request
     result = await make_streaming_request()
     assert result is not None
-    print(
-        "Multi-node internal LB handled single streaming request successfully")
+    print("Multi-node internal LB handled single streaming request successfully")
 
     await asyncio.sleep(0.5)
 
@@ -535,10 +544,14 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
 
     _, server_args = servers[0]
     api_server_count = (
-        server_args.count('--api-server-count')
-        and server_args[server_args.index('--api-server-count') + 1] or 1)
-    print(f"Successfully completed multi-node internal LB streaming test with "
-          f"{len(servers)} DP ranks (API server count: {api_server_count})")
+        server_args.count("--api-server-count")
+        and server_args[server_args.index("--api-server-count") + 1]
+        or 1
+    )
+    print(
+        f"Successfully completed multi-node internal LB streaming test with "
+        f"{len(servers)} DP ranks (API server count: {api_server_count})"
+    )
 
     # Check request balancing via Prometheus metrics
     head_server = servers[0][0]
@@ -551,17 +564,16 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
     [MODEL_NAME],
 )
 async def test_api_only_multinode_dp_completion(
-        api_only_client: openai.AsyncOpenAI,
-        api_only_servers: list[tuple[RemoteOpenAIServer,
-                                     list[str]]], model_name: str) -> None:
+    api_only_client: openai.AsyncOpenAI,
+    api_only_servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     """Test API-only server with all engines on separate headless server."""
 
     async def make_request():
         completion = await api_only_client.completions.create(
-            model=model_name,
-            prompt="Hello, my name is",
-            max_tokens=5,
-            temperature=1.0)
+            model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=1.0
+        )
 
         assert completion.id is not None
         assert completion.choices is not None and len(completion.choices) == 1
@@ -614,11 +626,14 @@ async def test_api_only_multinode_dp_completion(
 
     api_server, api_server_args = api_only_servers[0]
     api_server_count = (
-        api_server_args.count('--api-server-count')
-        and api_server_args[api_server_args.index('--api-server-count') + 1]
-        or 1)
-    print(f"Successfully completed API-only multi-node test with {DP_SIZE} "
-          f"engines on headless server (API server count: {api_server_count})")
+        api_server_args.count("--api-server-count")
+        and api_server_args[api_server_args.index("--api-server-count") + 1]
+        or 1
+    )
+    print(
+        f"Successfully completed API-only multi-node test with {DP_SIZE} "
+        f"engines on headless server (API server count: {api_server_count})"
+    )
 
     # Check request balancing via Prometheus metrics
     check_request_balancing(api_server, DP_SIZE)
@@ -630,9 +645,10 @@ async def test_api_only_multinode_dp_completion(
     [MODEL_NAME],
 )
 async def test_api_only_multinode_dp_completion_streaming(
-        api_only_client: openai.AsyncOpenAI,
-        api_only_servers: list[tuple[RemoteOpenAIServer,
-                                     list[str]]], model_name: str) -> None:
+    api_only_client: openai.AsyncOpenAI,
+    api_only_servers: list[tuple[RemoteOpenAIServer, list[str]]],
+    model_name: str,
+) -> None:
     """Test API-only server streaming with all engines on separate
     headless server."""
     prompt = "What is an LLM?"
@@ -648,11 +664,9 @@ async def test_api_only_multinode_dp_completion_streaming(
         single_output = single_completion.choices[0].text
 
         # Perform the streaming request
-        stream = await api_only_client.completions.create(model=model_name,
-                                                          prompt=prompt,
-                                                          max_tokens=5,
-                                                          temperature=0.0,
-                                                          stream=True)
+        stream = await api_only_client.completions.create(
+            model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True
+        )
         chunks: list[str] = []
         finish_reason_count = 0
         last_chunk = None
@@ -663,16 +677,15 @@ async def test_api_only_multinode_dp_completion_streaming(
             last_chunk = chunk  # Keep track of the last chunk
 
         # finish reason should only return in the last block for OpenAI API
-        assert finish_reason_count == 1, (
-            "Finish reason should appear exactly once.")
-        assert last_chunk is not None, (
-            "Stream should have yielded at least one chunk.")
-        assert last_chunk.choices[
-            0].finish_reason == "length", "Finish reason should be 'length'."
+        assert finish_reason_count == 1, "Finish reason should appear exactly once."
+        assert last_chunk is not None, "Stream should have yielded at least one chunk."
+        assert last_chunk.choices[0].finish_reason == "length", (
+            "Finish reason should be 'length'."
+        )
         # Check that the combined text matches the non-streamed version.
-        assert "".join(
-            chunks
-        ) == single_output, "Streamed output should match non-streamed output."
+        assert "".join(chunks) == single_output, (
+            "Streamed output should match non-streamed output."
+        )
         return True  # Indicate success for this request
 
     # Test single streaming request
@@ -707,11 +720,14 @@ async def test_api_only_multinode_dp_completion_streaming(
 
     _, api_server_args = api_only_servers[0]
     api_server_count = (
-        api_server_args.count('--api-server-count')
-        and api_server_args[api_server_args.index('--api-server-count') + 1]
-        or 1)
-    print(f"Successfully completed API-only streaming test with {DP_SIZE} "
-          f"engines on headless server (API server count: {api_server_count})")
+        api_server_args.count("--api-server-count")
+        and api_server_args[api_server_args.index("--api-server-count") + 1]
+        or 1
+    )
+    print(
+        f"Successfully completed API-only streaming test with {DP_SIZE} "
+        f"engines on headless server (API server count: {api_server_count})"
+    )
 
     # Check request balancing via Prometheus metrics
     api_server = api_only_servers[0][0]
diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
new file mode 100644
index 0000000000000000000000000000000000000000..00d93e1ba0b53e185a9fa46e5898a050e1a312f0
--- /dev/null
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -0,0 +1,315 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from itertools import repeat
+from typing import Any
+
+import pytest
+import torch._dynamo.config as dynamo_config
+
+from vllm import SamplingParams
+from vllm.logprobs import Logprob
+from vllm.sampling_params import StructuredOutputsParams
+from vllm.v1.metrics.reader import Metric
+
+from ...conftest import VllmRunner
+from ...models.utils import check_outputs_equal
+
+MODEL = "Qwen/Qwen3-0.6B"
+MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+
+
+first_prompt = (
+    "The following numbers of the sequence "
+    + ", ".join(str(i) for i in range(10))
+    + " are:"
+)
+example_prompts = [first_prompt, "In one word, the capital of France is "] + [
+    f"Tell me about the number {i}: " for i in range(32)
+]
+
+default_params = dict(
+    temperature=0.0,  # greedy
+    max_tokens=23,
+    min_tokens=18,
+)
+
+
+def test_without_spec_decoding(
+    sample_json_schema,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Test consistency of combos of async scheduling, preemption,
+    uni/multiproc executor, prefill chunking."""
+    struct_outputs = StructuredOutputsParams(json=sample_json_schema)
+    test_sampling_params: list[dict[str, Any]] = [
+        dict(),
+        # dict(min_tokens=20),
+        dict(presence_penalty=-1.0),
+        dict(bad_words=["the", " the"]),
+        dict(logprobs=2),
+        dict(logprobs=2, presence_penalty=-1.0),
+        dict(structured_outputs=struct_outputs),
+        dict(
+            structured_outputs=struct_outputs,
+            logprobs=2,
+            presence_penalty=-1.0,
+        ),
+    ]
+
+    # test_preemption, executor, async_scheduling,
+    # spec_config, test_prefill_chunking
+    test_configs = [
+        (False, "mp", False, None, False),
+        (True, "mp", False, None, True),
+        (False, "mp", True, None, False),
+        (False, "uni", True, None, False),
+        (True, "mp", True, None, False),
+        (True, "uni", True, None, False),
+        (False, "mp", True, None, True),
+        (True, "mp", True, None, True),
+        (True, "uni", True, None, True),
+    ]
+
+    run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
+
+
+def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+    """Test consistency and acceptance rates with some different combos of
+    preemption, executor, async scheduling, prefill chunking,
+    spec decoding model length.
+    """
+
+    spec_config = {
+        "method": "eagle3",
+        "num_speculative_tokens": 2,
+        "model": "nm-testing/Llama3_2_1B_speculator.eagle3",
+    }
+    # Set small draft model len to force doesn't-fit-in-drafter case.
+    spec_config_short = spec_config | {"max_model_len": 50}
+
+    # test_preemption, executor, async_scheduling,
+    # spec_config, test_prefill_chunking
+    test_configs = [
+        (False, "mp", False, None, False),
+        (False, "mp", False, spec_config, False),
+        (True, "mp", False, spec_config, True),
+        (True, "uni", False, spec_config_short, True),
+        (False, "mp", True, spec_config, False),
+        (True, "mp", True, spec_config, False),
+        (False, "mp", True, spec_config_short, True),
+        (True, "uni", True, spec_config, False),
+        (True, "uni", True, spec_config_short, False),
+        (True, "mp", True, spec_config, True),
+        (True, "uni", True, spec_config_short, True),
+    ]
+
+    run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])
+
+
+@dynamo_config.patch(cache_size_limit=16)
+def run_tests(
+    monkeypatch: pytest.MonkeyPatch,
+    model: str,
+    test_configs: list[tuple],
+    test_sampling_params: list[dict[str, Any]],
+):
+    """Test consistency of combos of async scheduling, preemption,
+    uni/multiproc executor with spec decoding."""
+
+    with monkeypatch.context() as m:
+        # avoid precision errors
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        # m.setenv("VLLM_BATCH_INVARIANT", "1")
+        outputs: list[tuple[str, list, list]] = []
+        for n, (
+            test_preemption,
+            executor,
+            async_scheduling,
+            spec_config,
+            test_prefill_chunking,
+        ) in enumerate(test_configs, 1):
+            test_str = f"{n}/{len(test_configs)}"
+            test_results = run_test(
+                model,
+                test_str,
+                test_sampling_params,
+                test_preemption,
+                executor,
+                async_scheduling,
+                spec_config,
+                test_prefill_chunking=test_prefill_chunking,
+            )
+            outputs.append(test_results)
+
+    baseline_config, baseline_tests, _ = outputs[0]
+    _, _, baseline_acceptances = next(
+        (o for o in outputs if o[2] is not None), (None, None, None)
+    )
+
+    print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}")
+
+    failure = None
+    for test_config, test_outputs, test_acceptance_rates in outputs[1:]:
+        for (base_outs, base_logprobs), base_acceptance_rate, (
+            test_outs,
+            test_logprobs,
+        ), test_acceptance_rate, params in zip(
+            baseline_tests,
+            baseline_acceptances or repeat(None),
+            test_outputs,
+            test_acceptance_rates or repeat(None),
+            test_sampling_params,
+        ):
+            try:
+                check_outputs_equal(
+                    outputs_0_lst=base_outs,
+                    outputs_1_lst=test_outs,
+                    name_0=f"baseline=[{baseline_config}], params={params}",
+                    name_1=f"config=[{test_config}], params={params}",
+                )
+                assert _all_logprobs_match(base_logprobs, test_logprobs)
+
+                if (
+                    base_acceptance_rate is not None
+                    and test_acceptance_rate is not None
+                ):
+                    if "spec_mml=None" in test_config:
+                        assert (
+                            test_acceptance_rate > base_acceptance_rate
+                            or test_acceptance_rate
+                            == pytest.approx(base_acceptance_rate, rel=5e-2)
+                        )
+                    else:
+                        # Currently the reported acceptance rate is expected to be
+                        # lower when we sometimes skip drafting altogether.
+                        assert test_acceptance_rate > 0.1
+                print(
+                    f"PASSED: config=[{test_config}], params={params}"
+                    f" accept_rate={test_acceptance_rate}"
+                )
+            except AssertionError as e:
+                print(
+                    f"FAILED: config=[{test_config}], params={params}"
+                    f" accept_rate={test_acceptance_rate}"
+                )
+                if failure is None:
+                    failure = e
+
+    if failure is not None:
+        raise failure
+
+
+def run_test(
+    model: str,
+    test_str: str,
+    sampling_param_tests: list[dict[str, Any]],
+    test_preemption: bool,
+    executor: str,
+    async_scheduling: bool,
+    spec_config: dict[str, Any] | None,
+    test_prefill_chunking: bool,
+):
+    spec_decoding = spec_config is not None
+    cache_arg: dict[str, Any] = (
+        # Force preemptions
+        dict(num_gpu_blocks_override=32)
+        if test_preemption
+        else dict(gpu_memory_utilization=0.9)
+    )
+    spec_mml = (spec_config or {}).get("max_model_len")
+    test_config = (
+        f"executor={executor}, preemption={test_preemption}, "
+        f"async_sched={async_scheduling}, "
+        f"chunk_prefill={test_prefill_chunking}, "
+        f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
+    )
+    print("-" * 80)
+    print(f"---- TESTING {test_str}: {test_config}")
+    print("-" * 80)
+    with VllmRunner(
+        model,
+        max_model_len=512,
+        enable_chunked_prefill=test_prefill_chunking,
+        # Force prefill chunking
+        max_num_batched_tokens=48 if test_prefill_chunking else None,
+        # enforce_eager=True,
+        async_scheduling=async_scheduling,
+        distributed_executor_backend=executor,
+        dtype="float32",  # avoid precision errors
+        speculative_config=spec_config,
+        disable_log_stats=False,
+        **cache_arg,
+    ) as vllm_model:
+        results = []
+        acceptance_rates: list[float] | None = [] if spec_decoding else None
+        for override_params in sampling_param_tests:
+            metrics_before = vllm_model.llm.get_metrics()
+            print(f"----------- RUNNING PARAMS: {override_params}")
+            results.append(
+                vllm_model.generate(
+                    example_prompts,
+                    sampling_params=SamplingParams(**default_params, **override_params),
+                    return_logprobs=True,
+                )
+            )
+            metrics_after = vllm_model.llm.get_metrics()
+            if acceptance_rates is not None:
+                acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after)
+                acceptance_rates.append(acceptance_rate)
+                print(f"ACCEPTANCE RATE {acceptance_rate}")
+
+            if test_preemption:
+                preemptions = _get_count(
+                    metrics_before, metrics_after, "vllm:num_preemptions"
+                )
+                assert preemptions > 0, "preemption test had no preemptions"
+
+    if len(results) > 1:
+        # First check that the different parameter configs
+        # actually result in different output.
+        for (other_test_outs, other_test_logprobs), params in zip(
+            results[1:], sampling_param_tests[1:]
+        ):
+            with pytest.raises(AssertionError):
+                check_outputs_equal(
+                    outputs_0_lst=results[0][0],
+                    outputs_1_lst=other_test_outs,
+                    name_0=f"baseline params={params}",
+                    name_1=f"other params={params}",
+                )
+                assert _all_logprobs_match(results[0][1], other_test_logprobs)
+
+    return test_config, results, acceptance_rates
+
+
+def _all_logprobs_match(req_a, req_b) -> bool:
+    return (
+        req_a == req_b
+        or len(req_a) == len(req_b)
+        and all(
+            len(seq_a) == len(seq_b)
+            and all(_logprobs_match(a, b) for a, b in zip(seq_a, seq_b))
+            for seq_a, seq_b in zip(req_a, req_b)
+        )
+    )
+
+
+def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
+    return len(lps_a) == len(lps_b) and all(
+        a.decoded_token == b.decoded_token
+        and a.rank == b.rank
+        and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6)
+        for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
+    )
+
+
+def _get_acceptance_rate(before: list[Metric], after: list[Metric]) -> float:
+    draft = _get_count(before, after, "vllm:spec_decode_num_draft_tokens")
+    accept = _get_count(before, after, "vllm:spec_decode_num_accepted_tokens")
+    return accept / draft if draft > 0 else 0.0
+
+
+def _get_count(before: list[Metric], after: list[Metric], name: str) -> int:
+    before_val = next(m.value for m in before if m.name == name)
+    after_val = next(m.value for m in after if m.name == name)
+    return after_val - before_val
diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py
index 5022347a87a4d5e814e97a36cf44ffe6bf334e66..0fcb97fe630552ac914ca51d8960faf89751bf89 100644
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -14,11 +14,12 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
     prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
 
     if attn_backend == "FLASHINFER":
-        pytest.skip("This test is failing with FlashInfer backend and "
-                    "needs investigation. See issue #25679.")
+        pytest.skip(
+            "This test is failing with FlashInfer backend and "
+            "needs investigation. See issue #25679."
+        )
 
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
 
         llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
diff --git a/tests/v1/e2e/test_context_length.py b/tests/v1/e2e/test_context_length.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ac40bec35fe2b3143b260fd0a74217eae9130c4
--- /dev/null
+++ b/tests/v1/e2e/test_context_length.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for vLLM `vllm/v1/engine/processor.Processor._validate_model_input()`
+handling of maximum context length for decoder models.
+
+This test ensures:
+- A prompt that is one token shorter than the model's maximum context length
+  can be processed successfully when requesting one additional token.
+- A prompt that reaches the model's maximum context length throws a
+  `ValueError` when requesting at least one additional token.
+"""
+
+import pytest
+
+from tests.conftest import VllmRunner
+from tests.utils import create_new_process_for_each_test
+
+
+@create_new_process_for_each_test()
+@pytest.mark.parametrize("model, max_model_len", [("JackFram/llama-160m", 2048)])
+@pytest.mark.parametrize(
+    "prompt_len, max_tokens",
+    [
+        (2047, 1),  # prompt_len = max_model_len - 1 -> allowed
+        (2048, 1),  # prompt_len = max_model_len -> not allowed
+    ],
+)
+def test_decoder_max_context_length_validation(
+    model: str,
+    max_model_len: int,
+    vllm_runner: type[VllmRunner],
+    prompt_len: int,
+    max_tokens: int,
+) -> None:
+    """Check vLLM decoder model input validation for edge cases where
+    the prompt length is (almost) equal to the max model length."""
+
+    prompt_ids = [[43] * prompt_len]
+
+    with vllm_runner(
+        model_name=model,
+        tokenizer_name=model,
+        max_model_len=max_model_len,
+        max_num_seqs=1,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        if prompt_len + max_tokens <= max_model_len:
+            # Should succeed as constraints are met
+            vllm_model.generate_greedy(prompt_ids, max_tokens)
+        else:
+            # Should raise the ValueError defined in
+            # vllm/v1/engine/processor.Processor_validate_model_input()
+            expected_msg = (
+                f"The decoder prompt (length {prompt_len}) plus the number of "
+                f"requested output tokens (at least 1) is longer than "
+                f"the maximum model length of {max_model_len}. "
+                "Make sure that `max_model_len` is no smaller than the number of "
+                "text tokens (prompt + requested output tokens)."
+            )
+            with pytest.raises(ValueError) as excinfo:
+                vllm_model.generate_greedy(prompt_ids, max_tokens)
+            assert expected_msg in str(excinfo.value)
diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py
index 5b0c1547225100a7005faf7edf900c93320818e5..71b0e86c75c184c1b127ca2c18f03c0cfedb5464 100644
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -26,51 +26,53 @@ model_config = {
     [
         "bigcode/starcoder2-3b",  # sliding window only
         "google/gemma-3-1b-it",  # sliding window + full attention
-    ])
+    ],
+)
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
-def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed,
-                                  disable_hybrid_kv_cache_manager):
+def test_sliding_window_retrieval(
+    model, batch_size, seed, disable_hybrid_kv_cache_manager
+):
     """
     The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
     asks for value of one of them (which is outside the sliding window).
     If we tell it upfront which we are going to be looking for, then
     it answers correctly (mostly).
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    test_config = model_config[model]
 
-        test_config = model_config[model]
+    llm = LLM(
+        model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
+    )
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
 
-        llm = LLM(
-            model=model,
-            disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager)
-        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+    prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
 
-        prompts, answer, indices = prep_prompts(batch_size,
-                                                ln_range=test_config.ln_range)
+    check_length(prompts, llm, test_config.sliding_window)
 
-        check_length(prompts, llm, test_config.sliding_window)
+    # Fresh generation
+    responses = llm.generate(prompts, sampling_params)
+    check_answers(
+        indices,
+        answer,
+        [response.outputs[0].text for response in responses],
+        accept_rate=1.0,
+    )
 
-        # Fresh generation
-        responses = llm.generate(prompts, sampling_params)
-        check_answers(indices,
-                      answer,
-                      [response.outputs[0].text for response in responses],
-                      accept_rate=1.0)
-
-        # Re-generate with the same prompts to test prefix caching
-        responses = llm.generate(prompts, sampling_params)
-        check_answers(indices,
-                      answer,
-                      [response.outputs[0].text for response in responses],
-                      accept_rate=1.0)
+    # Re-generate with the same prompts to test prefix caching
+    responses = llm.generate(prompts, sampling_params)
+    check_answers(
+        indices,
+        answer,
+        [response.outputs[0].text for response in responses],
+        accept_rate=1.0,
+    )
 
 
 def check_length(prompts: list[str], llm: LLM, sliding_window: int):
     """
-    Check if the prompt length is valid, i.e., longer than the sliding window 
+    Check if the prompt length is valid, i.e., longer than the sliding window
     size and shorter than the model's max length.
 
     Args:
@@ -80,9 +82,9 @@ def check_length(prompts: list[str], llm: LLM, sliding_window: int):
     """
     tokenizer = llm.get_tokenizer()
     max_model_len = llm.llm_engine.model_config.max_model_len
-    assert any(
-        len(tokenizer.encode(prompt)) > sliding_window
-        for prompt in prompts), "Prompt is too short for test"
-    assert all(
-        len(tokenizer.encode(prompt)) <= max_model_len
-        for prompt in prompts), "Prompt is too long for test"
+    assert any(len(tokenizer.encode(prompt)) > sliding_window for prompt in prompts), (
+        "Prompt is too short for test"
+    )
+    assert all(len(tokenizer.encode(prompt)) <= max_model_len for prompt in prompts), (
+        "Prompt is too long for test"
+    )
diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
index 6bc9b2b1d82d24fc4bf54c3870c309e98b4eb403..2778b0c5e5670227997251e831eba9d44db590c0 100644
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -4,13 +4,11 @@
 import random
 
 import pytest
-import torch
 
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationConfig, CompilationLevel
-from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.config import CompilationConfig, CompilationMode
 
-from ...utils import fork_new_process_for_each_test
+from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts
 
 # global seed
 SEED = 42
@@ -45,28 +43,12 @@ def test_prompts():
     return prompts
 
 
-def cleanup(llm: LLM, compilation_config: CompilationConfig):
-    # hacky: below lines are required to free up memory for the next test
-    # when setting VLLM_ENABLE_V1_MULTIPROCESSING=0, del llm is not sufficient
-    # TODO(sarckk): when enforce_eager=False, memory is not freed:
-    # find out why and re-enable test for enforce_eager=False case
-    llm_engine = llm.llm_engine.engine_core.engine_core
-    model_runner = llm_engine.model_executor.driver_worker.worker.model_runner
-    del model_runner.model
-    del model_runner.kv_caches
-    del compilation_config.static_forward_context
-    compilation_config.static_forward_context = {}
-
-    del llm
-    torch.cuda.empty_cache()
-    cleanup_dist_env_and_memory()
-
-
 @fork_new_process_for_each_test
-@pytest.mark.parametrize("enforce_eager", [True])
-@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill")
+@pytest.mark.parametrize("kv_sharing_fast_prefill", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True, False])
 def test_kv_sharing_fast_prefill(
     monkeypatch: pytest.MonkeyPatch,
+    kv_sharing_fast_prefill: bool,
     enforce_eager: bool,
     test_prompts: list[str],
 ):
@@ -75,40 +57,29 @@ def test_kv_sharing_fast_prefill(
         # This allows vLLM compilation backend to handle allocating and
         # managing buffers for cudagraph
         cudagraph_copy_inputs=True,
-        level=CompilationLevel.PIECEWISE
-        if not enforce_eager else CompilationLevel.NO_COMPILATION)
+        mode=CompilationMode.VLLM_COMPILE
+        if not enforce_eager
+        else CompilationMode.NONE,
+    )
+    batch_size = 10
 
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Make scheduling deterministic for reproducibility
         m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
+        prompts, answer, indices = prep_prompts(batch_size)
+
         llm = LLM(
             model="google/gemma-3n-E2B-it",
             enforce_eager=enforce_eager,
             compilation_config=compilation_config,
             seed=SEED,
+            kv_sharing_fast_prefill=kv_sharing_fast_prefill,
+        )
+        responses = llm.generate(prompts, sampling_params)
+        check_answers(
+            indices,
+            answer,
+            [response.outputs[0].text for response in responses],
+            accept_rate=1.0,
         )
-        ref_responses = llm.generate(test_prompts, sampling_params)
-
-        cleanup(llm, compilation_config)
-
-        llm = LLM(model="google/gemma-3n-E2B-it",
-                  enforce_eager=enforce_eager,
-                  compilation_config=compilation_config,
-                  seed=SEED,
-                  kv_sharing_fast_prefill=True)
-        optimized_responses = llm.generate(test_prompts, sampling_params)
-
-        cleanup(llm, compilation_config)
-
-        misses = 0
-
-        for ref_response, optimized_response in zip(ref_responses,
-                                                    optimized_responses):
-            if ref_response.outputs[0].text != optimized_response.outputs[
-                    0].text:
-                misses += 1
-
-        assert misses == 0
diff --git a/tests/v1/e2e/test_lora_with_spec_decode.py b/tests/v1/e2e/test_lora_with_spec_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..14532f27954439c9868e9f1e1d1599b39a6dd877
--- /dev/null
+++ b/tests/v1/e2e/test_lora_with_spec_decode.py
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script contains:
+1. test lora with speculative decoding for batch inference
+"""
+
+import random
+
+import numpy as np
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
+
+LORA_TEST_PROMPT_MAP: dict[str, str] = {}
+
+LORA_TEST_PROMPT_MAP["premjatin/qwen-linear-algebra-coder"] = """
+### INSTRUCTION:
+You are an AI assistant that generates Python code to solve linear
+algebra problems.
+
+### PROBLEM:
+Find the eigenvalues and eigenvectors of the following 3x3 matrix:
+[[3, 2, 0],
+ [2, 3, 0],
+ [0, 0, 2]]
+
+### OUTPUT FORMAT (STRICT):
+Numbers should be represented as integers only.
+
+### PYTHON SOLUTION:
+"""
+
+SEED = 42
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
+@pytest.mark.parametrize(
+    "model_setup",
+    [
+        (
+            "eagle3",
+            "Qwen/Qwen3-1.7B",
+            "AngelSlim/Qwen3-1.7B_eagle3",
+            "premjatin/qwen-linear-algebra-coder",
+            1,
+        )
+    ],
+)
+def test_batch_inference_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    model_setup: tuple[str, str, str, str, int],
+):
+    """
+    Compare the outputs of a LLM with only Lora and a LLM with both SD and Lora.
+    Should be the same and no failure when doing batch inference.
+    model_setup: (method, model_name, spec_model_name, lora_path, tp_size)
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        # Disable randomness
+        m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
+        torch.manual_seed(SEED)
+        np.random.seed(SEED)
+        random.seed(SEED)
+        torch.cuda.manual_seed_all(SEED)
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
+
+        method, model_name, spec_model_name, lora_path, tp_size = model_setup
+
+        # without speculative decoding
+        ref_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            max_model_len=2048,
+            max_num_seqs=4,
+            enable_lora=True,
+            max_loras=1,
+            max_cpu_loras=1,
+            max_lora_rank=16,
+        )
+
+        prompts = [LORA_TEST_PROMPT_MAP[lora_path]] * 100
+        lora_request = LoRARequest("adapter", 1, lora_path)
+        sampling_params = SamplingParams(
+            temperature=0.0, top_p=1.0, top_k=-1, seed=SEED, max_tokens=128
+        )
+
+        ref_outputs = ref_llm.generate(
+            prompts, sampling_params, lora_request=lora_request
+        )
+        del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+        lora_spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "model": spec_model_name,
+                "num_speculative_tokens": 3,
+                "max_model_len": 2048,
+            },
+            max_model_len=2048,
+            max_num_seqs=4,
+            enable_lora=True,
+            max_loras=1,
+            max_cpu_loras=1,
+            max_lora_rank=16,
+        )
+
+        lora_spec_outputs = lora_spec_llm.generate(
+            prompts, sampling_params, lora_request=lora_request
+        )
+
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, lora_spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect at least 90% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        print(f"match ratio: {matches}/{len(ref_outputs)}")
+        assert matches > int(0.90 * len(ref_outputs))
+        del lora_spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
diff --git a/tests/v1/e2e/test_min_tokens.py b/tests/v1/e2e/test_min_tokens.py
index f013425cb59dfe6635b6fecdd8768ea4e483e853..ec7ee0c3ebe645d2599f0bea4dd33176cc18579a 100644
--- a/tests/v1/e2e/test_min_tokens.py
+++ b/tests/v1/e2e/test_min_tokens.py
@@ -13,9 +13,6 @@ Covers:
 5) Multiple stop conditions
 """
 
-import os
-from typing import Optional, Union
-
 import pytest
 
 from vllm import LLM, SamplingParams
@@ -34,9 +31,9 @@ class MinTokensTestCase:
         name: str,
         min_tokens: int,
         max_tokens: int,
-        stop: Optional[Union[str, list[str]]] = None,
-        expected_min_len: Optional[int] = None,
-        expected_exact_len: Optional[int] = None,
+        stop: str | list[str] | None = None,
+        expected_min_len: int | None = None,
+        expected_exact_len: int | None = None,
     ):
         self.name = name
         self.min_tokens = min_tokens
@@ -46,29 +43,36 @@ class MinTokensTestCase:
         self.expected_exact_len = expected_exact_len
 
     def __str__(self):
-        return (f"{self.name}: min={self.min_tokens}, "
-                f"max={self.max_tokens}, stop={self.stop}")
+        return (
+            f"{self.name}: min={self.min_tokens}, "
+            f"max={self.max_tokens}, stop={self.stop}"
+        )
 
 
 # Test scenarios covering all critical cases
 MIN_TOKENS_TEST_CASES = [
     # === BASIC FUNCTIONALITY (should work) ===
-    MinTokensTestCase(name="basic_min_tokens_no_stop",
-                      min_tokens=8,
-                      max_tokens=20,
-                      stop=None,
-                      expected_min_len=8),
-    MinTokensTestCase(name="min_tokens_zero",
-                      min_tokens=0,
-                      max_tokens=10,
-                      stop=None,
-                      expected_min_len=0),
-    MinTokensTestCase(name="min_equals_max_no_stop",
-                      min_tokens=15,
-                      max_tokens=15,
-                      stop=None,
-                      expected_exact_len=15),
-
+    MinTokensTestCase(
+        name="basic_min_tokens_no_stop",
+        min_tokens=8,
+        max_tokens=20,
+        stop=None,
+        expected_min_len=8,
+    ),
+    MinTokensTestCase(
+        name="min_tokens_zero",
+        min_tokens=0,
+        max_tokens=10,
+        stop=None,
+        expected_min_len=0,
+    ),
+    MinTokensTestCase(
+        name="min_equals_max_no_stop",
+        min_tokens=15,
+        max_tokens=15,
+        stop=None,
+        expected_exact_len=15,
+    ),
     # === STOP STRINGS WITH MIN_TOKENS ===
     # These tests expose the detokenizer bug where stop strings
     # bypass min_tokens
@@ -94,9 +98,11 @@ MIN_TOKENS_TEST_CASES = [
             expected_min_len=5,
         ),
         marks=pytest.mark.xfail(
-            reason=("Known bug #21987: stop strings bypass min_tokens "
-                    "(fixed by PR #22014)"),
-            strict=False),
+            reason=(
+                "Known bug #21987: stop strings bypass min_tokens (fixed by PR #22014)"
+            ),
+            strict=False,
+        ),
         id="min_tokens_with_comprehensive_stops",
     ),
     pytest.param(
@@ -108,12 +114,13 @@ MIN_TOKENS_TEST_CASES = [
             expected_min_len=3,
         ),
         marks=pytest.mark.xfail(
-            reason=("Known bug #21987: stop strings bypass min_tokens "
-                    "(fixed by PR #22014)"),
-            strict=False),
+            reason=(
+                "Known bug #21987: stop strings bypass min_tokens (fixed by PR #22014)"
+            ),
+            strict=False,
+        ),
         id="min_tokens_with_simple_char_stop",
     ),
-
     # === EOS TOKEN WITH MIN_TOKENS (potential LogitsProcessor bug) ===
     # These test the MinTokensLogitsProcessor handling of EOS tokens
     pytest.param(
@@ -125,35 +132,32 @@ MIN_TOKENS_TEST_CASES = [
             expected_exact_len=20,
         ),
         marks=pytest.mark.xfail(
-            reason=
-            ("Potential logits-processor bug: EOS tokens may bypass min_tokens"
-             ),
+            reason=("Potential logits-processor bug: EOS tokens may bypass min_tokens"),
             strict=False,
         ),
         id="min_equals_max_eos_only",
     ),
-
     # === EDGE CASES ===
-    MinTokensTestCase(name="large_min_tokens",
-                      min_tokens=50,
-                      max_tokens=60,
-                      stop=None,
-                      expected_min_len=50),
+    MinTokensTestCase(
+        name="large_min_tokens",
+        min_tokens=50,
+        max_tokens=60,
+        stop=None,
+        expected_min_len=50,
+    ),
     MinTokensTestCase(
         name="min_tokens_with_empty_stop_list",
         min_tokens=5,
         max_tokens=15,
         stop=[],  # Empty stop list
-        expected_min_len=5),
+        expected_min_len=5,
+    ),
 ]
 
 
 @pytest.fixture(scope="module")
 def llm_v1():
     """Create V1 LLM instance for testing"""
-    # Ensure V1 engine is used
-    os.environ["VLLM_USE_V1"] = "1"
-
     llm = LLM(
         model=TEST_MODEL,
         tensor_parallel_size=1,
@@ -170,25 +174,27 @@ def get_token_count(output: RequestOutput) -> int:
     return len(output.outputs[0].token_ids)
 
 
-def assert_min_tokens_satisfied(output: RequestOutput,
-                                test_case: MinTokensTestCase) -> None:
+def assert_min_tokens_satisfied(
+    output: RequestOutput, test_case: MinTokensTestCase
+) -> None:
     """Assert that min_tokens requirement is satisfied"""
     token_count = get_token_count(output)
-    stop_reason = (output.outputs[0].stop_reason
-                   if output.outputs else "no output")
+    stop_reason = output.outputs[0].stop_reason if output.outputs else "no output"
 
     if test_case.expected_exact_len is not None:
         # Exact length requirement
         assert token_count == test_case.expected_exact_len, (
             f"Expected exactly {test_case.expected_exact_len} tokens, "
             f"got {token_count} tokens. "
-            f"Stop reason: {stop_reason}")
+            f"Stop reason: {stop_reason}"
+        )
     else:
         # Minimum length requirement
         assert token_count >= (test_case.expected_min_len or 0), (
             f"Expected at least {test_case.expected_min_len} tokens, "
             f"got {token_count} tokens. "
-            f"Stop reason: {stop_reason}")
+            f"Stop reason: {stop_reason}"
+        )
 
 
 @pytest.mark.parametrize(
@@ -199,13 +205,13 @@ def assert_min_tokens_satisfied(output: RequestOutput,
 def test_min_tokens_comprehensive(llm_v1: LLM, test_case: MinTokensTestCase):
     """
     Comprehensive test for min_tokens functionality in V1 engine.
-    
+
     This test covers all critical scenarios for min_tokens:
     - Basic functionality (should work)
     - Stop strings with min_tokens (known bug)
     - EOS tokens with min_tokens (potential bug)
     - Edge cases
-    
+
     Args:
         llm_v1: V1 LLM instance
         test_case: Test scenario parameters
@@ -218,7 +224,7 @@ def test_min_tokens_comprehensive(llm_v1: LLM, test_case: MinTokensTestCase):
         max_tokens=test_case.max_tokens,
         stop=test_case.stop,
         temperature=GREEDY,
-        include_stop_str_in_output=True  # Include stop strings for debugging
+        include_stop_str_in_output=True,  # Include stop strings for debugging
     )
 
     # Use simple prompt. Comprehensive stop lists should catch any generation
@@ -250,13 +256,11 @@ def test_min_tokens_comprehensive(llm_v1: LLM, test_case: MinTokensTestCase):
 def test_min_tokens_basic_functionality(llm_v1: LLM):
     """
     Test basic min_tokens functionality without stop conditions.
-    
+
     This is a baseline test that should always pass and validates
     that min_tokens works correctly in the simple case.
     """
-    sampling_params = SamplingParams(min_tokens=10,
-                                     max_tokens=20,
-                                     temperature=GREEDY)
+    sampling_params = SamplingParams(min_tokens=10, max_tokens=20, temperature=GREEDY)
 
     prompt = "Once upon a time"
     outputs = llm_v1.generate([prompt], sampling_params)
@@ -269,17 +273,16 @@ def test_min_tokens_basic_functionality(llm_v1: LLM):
 
 
 @pytest.mark.xfail(
-    reason=("Known bug #21987: stop strings bypass min_tokens "
-            "(fixed by PR #22014)"),
+    reason=("Known bug #21987: stop strings bypass min_tokens (fixed by PR #22014)"),
     strict=False,
 )
 def test_min_tokens_stop_strings_bug(llm_v1: LLM):
     """
     Test the specific bug where stop strings bypass min_tokens.
-    
+
     This test specifically reproduces the bug Calvin is fixing in PR #22014.
     It should fail until that fix is merged.
-    
+
     Strategy: Use guaranteed stop characters that will appear
     in any generated text.
     """
@@ -291,7 +294,8 @@ def test_min_tokens_stop_strings_bug(llm_v1: LLM):
         # Common letter; likely appears early
         stop=["e"],
         temperature=GREEDY,
-        include_stop_str_in_output=True)
+        include_stop_str_in_output=True,
+    )
 
     # Simple prompt that will generate text containing "e"
     prompt = "The quick brown fox"
@@ -308,23 +312,25 @@ def test_min_tokens_stop_strings_bug(llm_v1: LLM):
 
     # This assertion should fail due to the bug - if stop string is found early,
     # the model should still continue generating until min_tokens is reached
-    stop_reason = (outputs[0].outputs[0].stop_reason
-                   if outputs[0].outputs else "no output")
-    assert token_count >= 15, ("Bug confirmed: "
-                               f"{token_count} tokens < min_tokens=15. "
-                               f"Reason: {stop_reason}. "
-                               f"Text: {repr(generated_text)}")
+    stop_reason = (
+        outputs[0].outputs[0].stop_reason if outputs[0].outputs else "no output"
+    )
+    assert token_count >= 15, (
+        "Bug confirmed: "
+        f"{token_count} tokens < min_tokens=15. "
+        f"Reason: {stop_reason}. "
+        f"Text: {repr(generated_text)}"
+    )
 
 
 @pytest.mark.xfail(
-    reason=("Known bug #21987: stop strings bypass min_tokens "
-            "(fixed by PR #22014)"),
+    reason=("Known bug #21987: stop strings bypass min_tokens (fixed by PR #22014)"),
     strict=False,
 )
 def test_min_tokens_stop_strings_guaranteed_early_trigger(llm_v1: LLM):
     """
     Guaranteed test for stop strings bypassing min_tokens bug.
-    
+
     Strategy: Use very low temperature and multiple common stop strings
     to virtually guarantee early detection, combined with long min_tokens
     to ensure the bug is exposed regardless of model behavior.
@@ -337,7 +343,8 @@ def test_min_tokens_stop_strings_guaranteed_early_trigger(llm_v1: LLM):
         # Use multiple very common patterns - at least one will appear
         stop=["e", "a", "i", "o", "u", " ", "t", "n", "s", "r"],
         temperature=GREEDY,
-        include_stop_str_in_output=True)
+        include_stop_str_in_output=True,
+    )
 
     # Simple prompt that will generate some text
     prompt = "The cat"
@@ -346,8 +353,7 @@ def test_min_tokens_stop_strings_guaranteed_early_trigger(llm_v1: LLM):
     assert len(outputs) == 1
     token_count = get_token_count(outputs[0])
     generated_text = outputs[0].outputs[0].text if outputs[0].outputs else ""
-    stop_reason = (outputs[0].outputs[0].stop_reason
-                   if outputs[0].outputs else "unknown")
+    stop_reason = outputs[0].outputs[0].stop_reason if outputs[0].outputs else "unknown"
 
     print(f"Generated text: {repr(generated_text)}")
     print(f"Token count: {token_count}")
@@ -357,21 +363,23 @@ def test_min_tokens_stop_strings_guaranteed_early_trigger(llm_v1: LLM):
     # will trigger early termination before min_tokens=50 is reached
     # It's virtually impossible to generate 50 tokens without hitting
     # at least one of: e, a, i, o, u, space, t, n, s, r
-    finish_reason = (outputs[0].outputs[0].finish_reason
-                     if outputs[0].outputs else "unknown")
+    finish_reason = (
+        outputs[0].outputs[0].finish_reason if outputs[0].outputs else "unknown"
+    )
 
     print(f"Finish reason: {finish_reason}")
 
     if finish_reason == "stop":
-        assert token_count >= 50, ("Bug confirmed: "
-                                   f"{token_count} tokens < min_tokens=50. "
-                                   f"Reason: {finish_reason}. "
-                                   f"Text: {repr(generated_text)}")
+        assert token_count >= 50, (
+            "Bug confirmed: "
+            f"{token_count} tokens < min_tokens=50. "
+            f"Reason: {finish_reason}. "
+            f"Text: {repr(generated_text)}"
+        )
 
 
 @pytest.mark.xfail(
-    reason=(
-        "Potential logits-processor bug: EOS tokens may bypass min_tokens"),
+    reason=("Potential logits-processor bug: EOS tokens may bypass min_tokens"),
     strict=False,
 )
 def test_min_tokens_eos_behavior(llm_v1: LLM):
@@ -404,8 +412,14 @@ def test_min_tokens_eos_behavior(llm_v1: LLM):
     finish_no_min = choice_no_min.finish_reason
     stop_no_min = choice_no_min.stop_reason
 
-    print("[no-min] tokens=", len(ids_no_min), " finish=", finish_no_min,
-          " stop_reason=", stop_no_min)
+    print(
+        "[no-min] tokens=",
+        len(ids_no_min),
+        " finish=",
+        finish_no_min,
+        " stop_reason=",
+        stop_no_min,
+    )
 
     assert finish_no_min == "stop", (
         f"Expected finish_reason 'stop' without min_tokens, got {finish_no_min}"
@@ -414,7 +428,8 @@ def test_min_tokens_eos_behavior(llm_v1: LLM):
         "For EOS-based stop (no user stop strings), stop_reason should be None."
     )
     assert len(ids_no_min) < max_toks, (
-        f"Expected early EOS with < {max_toks} tokens, got {len(ids_no_min)}")
+        f"Expected early EOS with < {max_toks} tokens, got {len(ids_no_min)}"
+    )
 
     # Case 2: WITH min_tokens
     sp_with_min = SamplingParams(
@@ -430,23 +445,31 @@ def test_min_tokens_eos_behavior(llm_v1: LLM):
     finish_with_min = choice_with_min.finish_reason
     stop_with_min = choice_with_min.stop_reason
 
-    print("[with-min] tokens=", len(ids_with_min), " finish=", finish_with_min,
-          " stop_reason=", stop_with_min)
+    print(
+        "[with-min] tokens=",
+        len(ids_with_min),
+        " finish=",
+        finish_with_min,
+        " stop_reason=",
+        stop_with_min,
+    )
 
     # Exact length reached; EOS should have been blocked
     assert len(ids_with_min) == max_toks, (
-        f"Expected exactly {max_toks} tokens with min_tokens; "
-        f"got {len(ids_with_min)}")
+        f"Expected exactly {max_toks} tokens with min_tokens; got {len(ids_with_min)}"
+    )
     assert finish_with_min == "length", (
-        f"Expected finish_reason 'length'; got {finish_with_min}")
+        f"Expected finish_reason 'length'; got {finish_with_min}"
+    )
     assert eos_token_id not in ids_with_min, (
-        "EOS token id should not appear when min_tokens prevents early EOS.")
+        "EOS token id should not appear when min_tokens prevents early EOS."
+    )
 
 
 def test_min_tokens_validation():
     """
     Test that SamplingParams correctly validates min_tokens parameters.
-    
+
     This tests the parameter validation logic in SamplingParams.
     """
     # Valid cases
@@ -456,14 +479,14 @@ def test_min_tokens_validation():
 
     # Invalid cases
     with pytest.raises(
-            ValueError,
-            match="min_tokens must be greater than or equal to 0",
+        ValueError,
+        match="min_tokens must be greater than or equal to 0",
     ):
         SamplingParams(min_tokens=-1, max_tokens=10)
 
     with pytest.raises(
-            ValueError,
-            match="min_tokens must be less than or equal to max_tokens",
+        ValueError,
+        match="min_tokens must be less than or equal to max_tokens",
     ):
         SamplingParams(min_tokens=15, max_tokens=10)
 
@@ -474,6 +497,6 @@ if __name__ == "__main__":
     
     Usage:
         cd vllm/
-        VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
+        python -m pytest tests/v1/e2e/test_min_tokens.py -v
     """
     pytest.main([__file__, "-v"])
diff --git a/tests/v1/e2e/test_pooling_chunked_prefill.py b/tests/v1/e2e/test_pooling_chunked_prefill.py
new file mode 100644
index 0000000000000000000000000000000000000000..a196e359920de3a474567d52be440c30102b5d9f
--- /dev/null
+++ b/tests/v1/e2e/test_pooling_chunked_prefill.py
@@ -0,0 +1,167 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch.nn as nn
+
+from vllm.platforms import current_platform
+
+prompt = """
+Generals gathered in their masses
+Just like witches at black masses
+Evil minds that plot destruction
+Sorcerer of death's construction
+In the fields, the bodies burning
+As the war machine keeps turning
+Death and hatred to mankind
+Poisoning their brainwashed minds
+Oh, Lord, yeah
+
+Politicians hide themselves away
+They only started the war
+Why should they go out to fight?
+They leave that all to the poor, yeah
+Time will tell on their power minds
+Making war just for fun
+Treating people just like pawns in chess
+Wait till their judgment day comes, yeah
+
+Now, in darkness, world stops turning
+Ashes where their bodies burning
+No more war pigs have the power
+Hand of God has struck the hour
+Day of Judgment, God is calling
+On their knees, the war pigs crawling
+Begging mercies for their sins
+Satan, laughing, spreads his wings
+Oh, Lord, yeah
+"""
+
+
+class WrapperPooler(nn.Module):
+    def __init__(self, pooler):
+        super().__init__()
+        self.pooler = pooler
+        self.chunks = []
+
+    def get_pooling_updates(self, task):
+        return self.pooler.get_pooling_updates(task)
+
+    def forward(
+        self,
+        hidden_states,
+        pooling_metadata,
+    ):
+        self.chunks.append(hidden_states.shape[0])
+        return self.pooler(hidden_states, pooling_metadata)
+
+
+def inject_pooler(self):
+    model = self.get_model()
+    wrapper = WrapperPooler(model.pooler)
+    model.pooler = wrapper
+
+
+def retrieve_chunks(self):
+    model = self.get_model()
+    chunks = model.pooler.chunks
+    model.pooler.chunks = []
+    return chunks
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
+def test_pooling_chunked_prefill(vllm_runner, monkeypatch):
+    """Test chunked prefill for pooling models with LastPool."""
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        model_id = "Qwen/Qwen3-Embedding-0.6B"
+
+        chunk_size = 10
+
+        # Set chunking parameters to force chunked prefill
+        # Note: Chunked prefill is automatically handled by vLLM
+        # internally based on the model size and prompt
+        with vllm_runner(
+            model_id,
+            runner="pooling",
+            long_prefill_token_threshold=chunk_size,
+            tensor_parallel_size=1,
+            enforce_eager=True,
+            enable_chunked_prefill=True,
+        ) as llm:
+            llm.get_llm().llm_engine.collective_rpc(inject_pooler)
+
+            tokenizer = llm.get_llm().get_tokenizer()
+            tokens = tokenizer(prompt)["input_ids"]
+            prompt_len = len(tokens)
+            full_chunks, last_chunk = divmod(prompt_len, chunk_size)
+            expected_chunks = [chunk_size] * full_chunks
+            if last_chunk:
+                expected_chunks.append(last_chunk)
+            llm.embed([prompt])
+            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]
+
+        # Check that PoolerWrapper was called and chunks were received
+        assert len(chunks) > 1
+        assert chunks == expected_chunks
+
+        # Disable chunked prefill
+        with vllm_runner(
+            model_id,
+            runner="pooling",
+            tensor_parallel_size=1,
+            enforce_eager=True,
+        ) as llm:
+            llm.get_llm().llm_engine.collective_rpc(inject_pooler)
+            llm.embed([prompt])
+            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]
+
+        # Check that PoolerWrapper was called and no chunks were received
+        assert len(chunks) == 1
+        assert chunks[0] == prompt_len
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
+def test_pooling_prefix_cache(vllm_runner, monkeypatch):
+    """Test chunked prefill for pooling models with LastPool."""
+
+    verses = prompt.split("\n\n")
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        model_id = "Qwen/Qwen3-Embedding-0.6B"
+
+        with vllm_runner(
+            model_id,
+            runner="pooling",
+            enable_prefix_caching=True,
+            tensor_parallel_size=1,
+            enforce_eager=True,
+        ) as llm:
+            llm.get_llm().llm_engine.collective_rpc(inject_pooler)
+            tokenizer = llm.get_llm().get_tokenizer()
+
+            prompt1 = "\n\n".join([verses[0], verses[1]])
+            prompt2 = "\n\n".join([verses[0], verses[2]])
+            tokens1 = tokenizer(prompt1)["input_ids"]
+            tokens2 = tokenizer(prompt2)["input_ids"]
+            prompt1_len = len(tokens1)
+            prompt2_len = len(tokens2)
+
+            llm.embed([prompt1])
+            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]
+
+            assert len(chunks) == 1
+            assert chunks[0] == prompt1_len
+
+            llm.embed([prompt2])
+            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]
+
+            assert len(chunks) == 1
+            assert chunks[0] <= prompt1_len
+            assert chunks[0] < prompt2_len
+
+            cache_config = llm.get_llm().llm_engine.cache_config
+            print(f"{cache_config=}")
+            # Prefixes are cached in blocks
+            assert (prompt2_len - chunks[0]) % cache_config.block_size == 0
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index c4efd7548b81bacc57edc10489cd5122cbccf4cf..22fe88fabab973864b21214debbcdb1441a70f5f 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
-from typing import Any, Union
+from typing import Any
 
 import pytest
 import torch
 
-from tests.utils import get_attn_backend_list_based_on_platform
+from tests.utils import get_attn_backend_list_based_on_platform, large_gpu_mark
 from vllm import LLM, SamplingParams
 from vllm.assets.base import VLLM_S3_BUCKET_URL
 from vllm.assets.image import VLM_IMAGES_DIR
@@ -34,7 +32,7 @@ def get_test_prompts(mm_enabled: bool):
     for kind in random_prompt_type_choices:
         word_choices = ["test", "temp", "hello", "where"]
         word = random.choice(word_choices)
-        prompt: Union[str, list[dict[str, Any]]] = ""
+        prompt: str | list[dict[str, Any]] = ""
         if kind == "repeat":
             prompt = f"""
             please repeat the word '{word}' 10 times.
@@ -48,19 +46,17 @@ def get_test_prompts(mm_enabled: bool):
             give no other output than that simple sentence without quotes.
             """
         elif kind == "mm":
-            placeholders = [{
-                "type": "image_url",
-                "image_url": {
-                    "url":
-                    f"{VLLM_S3_BUCKET_URL}/{VLM_IMAGES_DIR}/stop_sign.jpg"
-                },
-            }]
+            placeholders = [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"{VLLM_S3_BUCKET_URL}/{VLM_IMAGES_DIR}/stop_sign.jpg"
+                    },
+                }
+            ]
             prompt = [
                 *placeholders,
-                {
-                    "type": "text",
-                    "text": "The meaning of the image is"
-                },
+                {"type": "text", "text": "The meaning of the image is"},
             ]
         else:
             raise ValueError(f"Unknown prompt type: {kind}")
@@ -79,20 +75,340 @@ def model_name():
     return "meta-llama/Llama-3.1-8B-Instruct"
 
 
-def test_ngram_correctness(
+@pytest.fixture(autouse=True)
+def reset_torch_dynamo():
+    """Reset torch dynamo cache before each test"""
+    yield
+    # Cleanup after test
+    torch._dynamo.reset()
+
+
+@pytest.mark.parametrize(
+    "speculative_config",
+    [
+        {
+            "method": "ngram",
+            "prompt_lookup_max": 5,
+            "prompt_lookup_min": 3,
+            "num_speculative_tokens": 3,
+        },
+        {
+            "method": "suffix",
+            "suffix_decoding_max_spec_factor": 2.0,
+        },
+    ],
+)
+def test_ngram_and_suffix_correctness(
+    speculative_config: dict,
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
     model_name: str,
 ):
-    '''
+    """
     Compare the outputs of an original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
-    '''
+    """
+    test_prompts = get_test_prompts(mm_enabled=False)
+
+    ref_llm = LLM(model=model_name, max_model_len=1024)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    spec_llm = LLM(
+        model=model_name,
+        speculative_config=speculative_config,
+        max_model_len=1024,
+    )
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
+
+    # Heuristic: expect at least 66% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches >= int(0.66 * len(ref_outputs))
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+
+def test_suffix_decoding_acceptance(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_name: str,
+):
+    """
+    Check that suffix decoding caching takes effect and improves acceptance
+    lengths and acceptance rates over multiple runs of the same prompts.
+    """
+    test_prompts = get_test_prompts(mm_enabled=False)
+
+    spec_llm = LLM(
+        model=model_name,
+        speculative_config={
+            "method": "suffix",
+            "suffix_decoding_max_spec_factor": 2.0,
+            "suffix_decoding_max_cached_requests": 1000,
+        },
+        max_model_len=1024,
+        disable_log_stats=False,
+    )
+
+    # Run several times and check that the accepted tokens increase.
+    num_draft = []
+    num_accept = []
+    for i in range(10):  # Run multiple times to warm up the cache.
+        spec_llm.chat(test_prompts, sampling_config)
+        # Collect draft and acceptance stats.
+        metrics = spec_llm.get_metrics()
+        for metric in metrics:
+            if metric.name == "vllm:spec_decode_num_draft_tokens":
+                num_draft.append(metric.value)
+            if metric.name == "vllm:spec_decode_num_accepted_tokens":
+                num_accept.append(metric.value)
+
+    # Calculate the acceptance rates for the first and last runs.
+    first_accept_tokens = num_accept[0]
+    first_draft_tokens = num_draft[0]
+    first_accept_rate = first_accept_tokens / first_draft_tokens
+
+    # Take the diff since the stats are cumulative.
+    last_accept_tokens = num_accept[-1] - num_accept[-2]
+    last_draft_tokens = num_draft[-1] - num_draft[-2]
+    last_accept_rate = last_accept_tokens / last_draft_tokens
+
+    # Expect the acceptance length to improve.
+    assert first_accept_tokens < last_accept_tokens
+
+    # Expect the acceptance rate to improve.
+    assert first_accept_rate < last_accept_rate
+
+    # Heuristic: expect at least 85% acceptance rate at the end.
+    assert last_accept_rate > 0.85
+
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3",
+        "RedHatAI/Qwen3-8B-speculator.eagle3",
+    ],
+    ids=["llama3_eagle3_speculator", "qwen3_eagle3_speculator"],
+)
+def test_speculators_model_integration(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_path: str,
+):
+    """
+    Test that speculators models work with the simplified integration.
+
+    This verifies the `vllm serve <speculator-model>` use case where
+    speculative config is automatically detected from the model config
+    without requiring explicit --speculative-config argument.
+
+    Tests:
+    1. Speculator model is correctly detected
+    2. Verifier model is extracted from speculator config
+    3. Speculative decoding is automatically enabled
+    4. Text generation works correctly
+    5. Output matches reference (non-speculative) generation
+    """
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+
+    # Generate test prompts
+    test_prompts = get_test_prompts(mm_enabled=False)
+
+    # First run: Direct speculator model (simplified integration)
+    spec_llm = LLM(model=model_path, max_model_len=1024)
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+
+    # Verify speculative config was auto-detected
+    assert spec_llm.llm_engine.vllm_config.speculative_config is not None, (
+        f"Speculative config should be auto-detected for {model_path}"
+    )
+
+    spec_config = spec_llm.llm_engine.vllm_config.speculative_config
+    assert spec_config.num_speculative_tokens > 0, (
+        f"Expected positive speculative tokens, "
+        f"got {spec_config.num_speculative_tokens}"
+    )
+
+    # Verify draft model is set to the speculator model
+    assert spec_config.model == model_path, (
+        f"Draft model should be {model_path}, got {spec_config.model}"
+    )
+
+    # Extract verifier model for reference run
+    verifier_model = spec_llm.llm_engine.vllm_config.model_config.model
+
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Second run: Reference without speculative decoding
+    ref_llm = LLM(model=verifier_model, max_model_len=1024)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Compare outputs
+    matches = sum(
+        1
+        for ref, spec in zip(ref_outputs, spec_outputs)
+        if ref.outputs[0].text == spec.outputs[0].text
+    )
+
+    # Heuristic: expect at least 66% of prompts to match exactly
+    assert matches >= int(0.66 * len(ref_outputs)), (
+        f"Only {matches}/{len(ref_outputs)} outputs matched. "
+        f"Expected at least {int(0.66 * len(ref_outputs))} matches."
+    )
+
+
+@pytest.mark.parametrize(
+    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
+    [
+        (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
+        pytest.param(
+            (
+                "eagle3",
+                "Qwen/Qwen2.5-VL-7B-Instruct",
+                "Rayzl/qwen2.5-vl-7b-eagle3-sgl",
+                1,
+            ),
+            False,
+            False,
+            marks=pytest.mark.skip(
+                reason="Skipping due to its head_dim not being a a multiple of 32"
+            ),
+        ),
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-3.1-8B-Instruct",
+                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
+                1,
+            ),
+            False,
+            True,
+            marks=large_gpu_mark(min_gb=40),
+        ),  # works on 4x H100
+        (
+            (
+                "eagle3",
+                "meta-llama/Llama-3.1-8B-Instruct",
+                "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
+                1,
+            ),
+            False,
+            False,
+        ),
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
+                4,
+            ),
+            False,
+            False,
+            marks=large_gpu_mark(min_gb=80),
+        ),  # works on 4x H100
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
+                4,
+            ),
+            True,
+            True,
+            marks=large_gpu_mark(min_gb=80),
+        ),  # works on 4x H100
+        (
+            (
+                "eagle",
+                "eagle618/deepseek-v3-random",
+                "eagle618/eagle-deepseek-v3-random",
+                1,
+            ),
+            False,
+            False,
+        ),
+    ],
+    ids=[
+        "qwen3_eagle3",
+        "qwen2_5_vl_eagle3",
+        "llama3_eagle",
+        "llama3_eagle3",
+        "llama4_eagle",
+        "llama4_eagle_mm",
+        "deepseek_eagle",
+    ],
+)
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
+def test_eagle_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    enable_chunked_prefill: bool,
+    attn_backend: str,
+):
+    if attn_backend == "TREE_ATTN":
+        # TODO: Fix this flaky test
+        pytest.skip(
+            "TREE_ATTN is flaky in the test disable for now until it can be "
+            "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
+        )
+
+    # Generate test prompts inside the function instead of using fixture
+    test_prompts = get_test_prompts(mm_enabled)
+    """
+    Compare the outputs of a original LLM and a speculative LLM
+    should be the same when using eagle speculative decoding.
+    model_setup: (method, model_name, eagle_model_name, tp_size)
+    """
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        test_prompts = get_test_prompts(mm_enabled=False)
+        if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+            # Scout requires default backend selection
+            # because vision encoder has head_dim 88 being incompatible
+            #  with FLASH_ATTN and needs to fall back to Flex Attn
+            pass
+        else:
+            m.setenv("VLLM_MLA_DISABLE", "1")
+            m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+
+        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+            pytest.skip(
+                "TRITON_ATTN does not support "
+                "multi-token eagle spec decode on current platform"
+            )
+
+        if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
+            m.setenv("VLLM_ROCM_USE_AITER", "1")
+
+        method, model_name, spec_model_name, tp_size = model_setup
+        max_model_len = 2048
+        max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
 
-        ref_llm = LLM(model=model_name, max_model_len=1024)
+        ref_llm = LLM(
+            model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
+        )
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
         torch.cuda.empty_cache()
@@ -100,13 +416,17 @@ def test_ngram_correctness(
 
         spec_llm = LLM(
             model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
             speculative_config={
-                "method": "ngram",
-                "prompt_lookup_max": 5,
-                "prompt_lookup_min": 3,
+                "method": method,
+                "model": spec_model_name,
                 "num_speculative_tokens": 3,
+                "max_model_len": max_model_len,
             },
-            max_model_len=1024,
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
@@ -119,77 +439,46 @@ def test_ngram_correctness(
                 print(f"ref_output: {ref_output.outputs[0].text}")
                 print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 66% of the prompts to match exactly
+        # Heuristic: expect at least 60% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches >= int(0.66 * len(ref_outputs))
+        assert matches > int(0.6 * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
         cleanup_dist_env_and_memory()
 
 
-@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
-    (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
-    (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
-      "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
-    (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
-      "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
-    pytest.param(
-        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-        False,
-        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-    pytest.param(
-        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-        True,
-        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-    (("eagle", "eagle618/deepseek-v3-random",
-      "eagle618/eagle-deepseek-v3-random", 1), False),
-],
-                         ids=[
-                             "qwen3_eagle3", "llama3_eagle", "llama3_eagle3",
-                             "llama4_eagle", "llama4_eagle_mm",
-                             "deepseek_eagle"
-                         ])
-@pytest.mark.parametrize("attn_backend",
-                         get_attn_backend_list_based_on_platform())
-def test_eagle_correctness(
+@pytest.mark.parametrize(
+    ["model_setup", "mm_enabled"],
+    [
+        (("mtp", "XiaomiMiMo/MiMo-7B-Base", 1), False),
+        (("mtp", "ZixiQi/DeepSeek-V3-4layers-MTP-FP8", 1), False),
+    ],
+    ids=["mimo", "deepseek"],
+)
+def test_mtp_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
-    model_setup: tuple[str, str, str, int],
+    model_setup: tuple[str, str, int],
     mm_enabled: bool,
-    attn_backend: str,
 ):
-    if attn_backend == "TREE_ATTN":
-        # TODO: Fix this flaky test
-        pytest.skip(
-            "TREE_ATTN is flaky in the test disable for now until it can be "
-            "resolved (see https://github.com/vllm-project/vllm/issues/22922)")
-
     # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
-    '''
+    """
     Compare the outputs of a original LLM and a speculative LLM
-    should be the same when using eagle speculative decoding.
-    model_setup: (method, model_name, eagle_model_name, tp_size)
-    '''
+    should be the same when using MTP speculative decoding.
+    model_setup: (method, model_name, tp_size)
+    """
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_MLA_DISABLE", "1")
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
 
-        if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
-            pytest.skip("TRITON_ATTN does not support "
-                        "multi-token eagle spec decode on current platform")
-
-        if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
-            m.setenv("VLLM_ROCM_USE_AITER", "1")
-
-        method, model_name, spec_model_name, tp_size = model_setup
+        method, model_name, tp_size = model_setup
 
-        ref_llm = LLM(model=model_name,
-                      max_model_len=2048,
-                      tensor_parallel_size=tp_size)
+        ref_llm = LLM(
+            model=model_name,
+            max_model_len=2048,
+            tensor_parallel_size=tp_size,
+            trust_remote_code=True,
+        )
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
         torch.cuda.empty_cache()
@@ -201,8 +490,7 @@ def test_eagle_correctness(
             tensor_parallel_size=tp_size,
             speculative_config={
                 "method": method,
-                "model": spec_model_name,
-                "num_speculative_tokens": 3,
+                "num_speculative_tokens": 1,
                 "max_model_len": 2048,
             },
             max_model_len=2048,
@@ -218,9 +506,9 @@ def test_eagle_correctness(
                 print(f"ref_output: {ref_output.outputs[0].text}")
                 print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 66% of the prompts to match exactly
+        # Heuristic: expect at least 80% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
+        assert matches > int(MTP_SIMILARITY_RATE * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
         cleanup_dist_env_and_memory()
diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..30426e055ade85a2d4683389c005c09565811f9d
--- /dev/null
+++ b/tests/v1/ec_connector/integration/README.md
@@ -0,0 +1,171 @@
+# EPD Correctness Test
+
+This test verifies that EPD (Encoder-Prefill-Decode) disaggregation produces identical outputs to a baseline single instance.
+
+## What It Tests
+
+- **Baseline**: Single vLLM instance serving a multimodal model
+- **EPD (1E+1PD)**: 1 Encoder + 1 Prefill-Decode instance
+- **Baseline (1P+1D)**: 1 Prefill + 1 Decode instance
+- **EPD (1E+1P+1D)**: 1 Encoder + 1 Prefill + 1 Decode instance
+
+The test ensures that disaggregated encoding produces **identical** outputs to the baseline.
+
+Note that currently PD disaggregation set up may give slightly different results from a single instance. Therefore, we need the result from 1P+1D as the baseline for 1E+1P+1D
+
+Please refer to [Disaggregated Encoder Feature](../../../docs/features/disagg_encoder.md) for the detailed explanation for the EPD features.
+
+## Files
+
+- `run_epd_correctness_test.sh` - Main test script (starts all instances and runs tests)
+- `test_epd_correctness.py` - Python test script (compares outputs)
+
+## Usage
+
+### Multimodal Prompts (Default)
+
+```bash
+cd vllm
+./tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+```
+
+This runs the test with actual multimodal (image) prompts.
+
+### Text-Only Prompts
+
+```bash
+cd vllm
+USE_MM_PROMPTS=0 ./tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+```
+
+This runs a quick test with text-only prompts to verify the setup works.
+
+### Custom Configuration
+
+```bash
+# Use specific GPUs
+GPU_E=0 GPU_PD=1 GPU_P=1 GPU_D=2 bash ./tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+
+# Use specific ports
+ENDPOINT_PORT=10001 bash ./tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+
+# Use specific model
+MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash ./tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+
+# Use specific storage path
+EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash ./tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+```
+
+## How It Works
+
+### Step 1: Baseline
+
+1. Start single vLLM instance on GPU
+2. Run test prompts (multimodal or text-only)
+3. Save outputs to `.vllm_epd_baseline.txt`
+4. Shutdown instance
+
+### Step 2: EPD (1E + 1PD)
+
+1. Clear encoder cache storage
+2. Start instances and proxy
+3. Run same test prompts
+4. Assert outputs match baseline exactly
+5. Shutdown instances
+
+### Step 3: EPD (1E + 1P + 1D)
+
+1. Clear encoder cache storage
+2. Start instances and proxy
+3. Run same test prompts
+4. Assert outputs match baseline exactly
+5. Shutdown instances
+
+## Test Scenarios
+
+### Multimodal Prompts (--use_mm_prompts)
+
+Tests encoder cache transfer:
+
+- Single image query
+- Multiple images in one request
+- Mixed image and text
+- Image with detailed questions
+
+### Text-Only Prompts (default)
+
+Quick sanity check:
+
+- Simple text queries
+- Text-only explanations
+- Verifies proxy routing works
+
+## Expected Behavior
+
+### ✅ Test Passes When
+
+- All disagg outputs match baseline outputs exactly
+- No errors during instance startup
+- Encoder cache is properly saved and loaded
+- Proxy correctly routes requests
+
+### ❌ Test Fails When
+
+- Outputs differ between baseline and disagg
+- Server startup fails
+- Encoder cache not found (should fallback to local execution)
+- Proxy routing errors
+
+## Notes
+
+- The test uses deterministic generation (`temperature=0.0`, `seed=42`)
+- Encoder cache should enable exact output reproduction
+- Test cleans up all instances and cache files after completion
+- Safe to run multiple times (idempotent)
+- We setup the PD disagg part with NixlConnector. Please read details about EPD in `examples/online_serving/disaggregated_encoder/README.md`
+
+## Requirements
+
+- Multiple GPUs (3 for 1E+1P+1D, 2 for 1E+1PD, 1 for baseline)
+    - 1E+1P+1D is runnable with 2 GPU by assign E and P on the same GPU now.
+- Multimodal model (e.g., Qwen2.5-VL-3B-Instruct)
+- Internet access (for accessing vllm test images)
+
+## Debugging
+
+### Check Logs
+
+Logs and baseline output are saved in `/tmp/` by default.
+Can be customized by changing the environment variables.
+
+### Check Encoder Cache
+
+```bash
+# Verify cache files are created
+ls -la $EC_SHARED_STORAGE_PATH/
+
+# Should see directories with mm_hash names
+# Each containing encoder_cache.safetensors
+```
+
+### Manual Testing
+
+Run individual components:
+
+```bash
+# Baseline only
+python test_epd_correctness.py \
+    --service_url http://localhost:8000 \
+    --model_name Qwen/Qwen2.5-VL-3B-Instruct \
+    --mode baseline \
+    --baseline_file test_output.txt \
+    --use_mm_prompts
+
+# Disagg only (requires baseline output file!)
+python test_epd_correctness.py \
+    --service_url http://localhost:8000 \
+    --model_name Qwen/Qwen2.5-VL-3B-Instruct \
+    --mode disagg \
+    --baseline_file test_output.txt \
+    --use_mm_prompts
+```
diff --git a/tests/v1/ec_connector/integration/hato.jpg b/tests/v1/ec_connector/integration/hato.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9c7e390e7d7f68824cfa198d8ceb57f9174a1137
Binary files /dev/null and b/tests/v1/ec_connector/integration/hato.jpg differ
diff --git a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..55dd39c0a957f38dfb466523bd6bd079ace63ec6
--- /dev/null
+++ b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
@@ -0,0 +1,476 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# EPD (Encoder-Prefill-Decode) Correctness Test
+# 
+# This script tests that EPD disaggregation produces the same outputs as baseline.
+# It runs:
+# 1. Baseline: Single vLLM instance
+# 2. EPD: 1E + 1PD setup
+# 3. Baseline for (E + P + D): 1P + 1D vLLM instances disagg
+# 4. EPD: 1E + 1P + 1D setup
+
+# For GPU usage
+
+# set -xe
+
+# Find the git repository root directory
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+# Model to test
+MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
+
+# Set 1 to use multimodal prompts; else to use text-only
+USE_MM_PROMPTS="${USE_MM_PROMPTS:-1}"
+MM_FLAG=""
+if [ $USE_MM_PROMPTS = "1" ]; then
+    MM_FLAG="--use_mm_prompts"
+fi
+
+# GPU configuration
+GPU_E="${GPU_E:-0}"
+GPU_P="${GPU_P:-1}"
+GPU_D="${GPU_D:-2}"
+GPU_SINGLE="${GPU_SINGLE:-$GPU_P}"
+GPU_PD="${GPU_PD:-$GPU_P}"
+
+# Port
+ENCODE_PORT="${ENCODE_PORT:-19534}"
+PREFILL_PORT="${PREFILL_PORT:-19535}"
+DECODE_PORT="${DECODE_PORT:-19536}"
+PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19537}"
+ENDPOINT_PORT="${ENDPOINT_PORT:-10001}"
+
+# Storage path for encoder cache
+EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache_test}"
+TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-600}"
+
+# Output file for baseline comparison and logs
+LOG_PATH="${LOG_PATH:-/tmp}"
+BASELINE_FILE="${BASELINE_FILE:-/tmp/vllm_baseline.txt}"
+BASELINE_PD_FILE="${BASELINE_PD_FILE:-/tmp/vllm_epd_baseline.txt}"
+
+mkdir -p $LOG_PATH
+
+# Trap the SIGINT signal (triggered by Ctrl+C)
+trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
+
+# Wait for server to be ready
+wait_for_server() {
+    local port=$1
+    timeout "$TIMEOUT_SECONDS" bash -c "
+        until curl -s localhost:${port}/v1/chat/completions > /dev/null; do
+            sleep 1
+        done" && return 0 || return 1
+}
+
+# Cleanup function
+cleanup_instances() {
+    echo "Cleaning up any running vLLM instances..."
+    pkill -f "vllm serve" || true
+    pkill -f "disagg_epd_proxy.py" || true
+    sleep 2
+}
+
+# Function to run baseline (single instance)
+run_baseline() {
+    echo "================================"
+    echo "Running BASELINE (single instance)"
+    echo "================================"
+    
+    cleanup_instances
+    rm -rf "$EC_SHARED_STORAGE_PATH"
+    
+    local PORT=$ENDPOINT_PORT
+    
+    # Start baseline instance
+    echo "Starting baseline instance on GPU $GPU_SINGLE, port $PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_SINGLE" vllm serve "$MODEL" \
+        --port $PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.7 \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        > $LOG_PATH/baseline.log 2>&1 &
+    
+    local BASELINE_PID=$!
+    
+    # Wait for baseline to start
+    echo "Waiting for baseline instance to start..."
+    wait_for_server $PORT
+
+    curl http://127.0.0.1:$PORT/v1/models
+    echo ""
+    
+    # Run test in baseline mode
+    echo "Running baseline..."
+
+    python "${GIT_ROOT}/tests/v1/ec_connector/integration/test_epd_correctness.py" \
+        --service_url "http://localhost:$PORT" \
+        --model_name "$MODEL" \
+        --mode baseline \
+        --baseline_file "$BASELINE_FILE" \
+        $MM_FLAG
+    
+    # Cleanup baseline
+    echo "Stopping baseline instance..."
+    kill $BASELINE_PID 2>/dev/null || true
+    sleep 2
+    cleanup_instances
+}
+
+# Function to run EPD with 1E + 1PD
+run_epd_1e_1pd() {
+    echo "================================"
+    echo "Running EPD (1E + 1PD)"
+    echo "================================"
+    
+    cleanup_instances
+    rm -rf "$EC_SHARED_STORAGE_PATH"
+    mkdir -p "$EC_SHARED_STORAGE_PATH"
+    
+    local ENCODE_PORT=$ENCODE_PORT
+    local PREFILL_DECODE_PORT=$PREFILL_DECODE_PORT
+    local PROXY_PORT=$ENDPOINT_PORT
+    
+    declare -a PIDS=()
+    
+    # Start encoder instance
+    echo "Starting encoder instance on GPU $GPU_E, port $ENCODE_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
+        --port $ENCODE_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.01 \
+        --enable-request-id-headers \
+        --no-enable-prefix-caching \
+        --max-num-batched-tokens 114688 \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --ec-transfer-config '{
+            "ec_connector": "ECSharedStorageConnector",
+            "ec_role": "ec_producer",
+            "ec_connector_extra_config": {
+                "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+            }
+        }' \
+        > $LOG_PATH/1e1pd_encoder.log 2>&1 &
+    PIDS+=($!)
+    
+    # Start prefill+decode instance
+    echo "Starting PD instance on GPU $GPU_PD, port $PREFILL_DECODE_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
+        --port $PREFILL_DECODE_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.7 \
+        --enable-request-id-headers \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --ec-transfer-config '{
+            "ec_connector": "ECSharedStorageConnector",
+            "ec_role": "ec_consumer",
+            "ec_connector_extra_config": {
+                "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+            }
+        }' \
+        > $LOG_PATH/1e1pd_pd.log 2>&1 &
+    PIDS+=($!)
+    
+    # Wait for instances to start
+    echo "Waiting for encoder instance..."
+    wait_for_server $ENCODE_PORT
+    echo "Waiting for PD instance..."
+    wait_for_server $PREFILL_DECODE_PORT
+
+    # Start proxy
+    echo "Starting EPD proxy on port $PROXY_PORT"
+    python "${GIT_ROOT}/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py" \
+        --host "0.0.0.0" \
+        --port $PROXY_PORT \
+        --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+        --prefill-servers-urls "disable" \
+        --decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
+        > $LOG_PATH/1e1pd_proxy.log 2>&1 &
+    PIDS+=($!)
+    
+    # Wait for proxy
+    echo "Waiting for proxy..."
+    wait_for_server $PROXY_PORT
+
+    curl http://127.0.0.1:$PROXY_PORT/v1/models
+    curl http://127.0.0.1:$PROXY_PORT/health
+    echo ""
+
+    echo "All EPD (1E+1PD) services are up!"
+    
+    # Run test in disagg mode
+    echo "Running EPD (1E+1PD) correctness test..."
+    
+    python "${GIT_ROOT}/tests/v1/ec_connector/integration/test_epd_correctness.py" \
+        --service_url "http://localhost:$PROXY_PORT" \
+        --model_name "$MODEL" \
+        --mode disagg \
+        --baseline_file "$BASELINE_FILE" \
+        $MM_FLAG
+    
+    # Cleanup
+    echo "✓✓ 1E+1PD Correctness Test finished"
+    echo "Stopping EPD (1E+1PD) instances..."
+    for pid in "${PIDS[@]}"; do
+        kill $pid 2>/dev/null || true
+    done
+    sleep 2
+    cleanup_instances
+}
+
+# Function to run baseline for 1E + 1P + 1D (PD disagg)
+run_baseline_1p_1d() {
+    echo "================================"
+    echo "Running PD BASELINE (1P + 1D)"
+    echo "================================"
+    
+    cleanup_instances
+    rm -rf "$EC_SHARED_STORAGE_PATH"
+    mkdir -p "$EC_SHARED_STORAGE_PATH"
+    
+    local PREFILL_PORT=$PREFILL_PORT
+    local DECODE_PORT=$DECODE_PORT
+    local PROXY_PORT=$ENDPOINT_PORT
+    
+    declare -a PIDS=()
+    
+    # Start prefill instance
+    echo "Starting prefill instance on GPU $GPU_P, port $PREFILL_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_P" \
+    VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
+    vllm serve "$MODEL" \
+        --port $PREFILL_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.7 \
+        --enable-request-id-headers \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --kv-transfer-config '{
+            "kv_connector": "NixlConnector",
+            "kv_role": "kv_producer"
+        }' \
+        > $LOG_PATH/1p1d_prefill.log 2>&1 &
+    PIDS+=($!)
+    
+    # Start decode instance
+    echo "Starting decode instance on GPU $GPU_D, port $DECODE_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_D" \
+    VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
+    vllm serve "$MODEL" \
+        --port $DECODE_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.7 \
+        --enable-request-id-headers \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --kv-transfer-config '{
+            "kv_connector": "NixlConnector",
+            "kv_role": "kv_consumer"
+        }' \
+        > $LOG_PATH/1p1d_decode.log 2>&1 &
+    PIDS+=($!)
+    
+    # Wait for instances to start
+    echo "Waiting for prefill instance..."
+    wait_for_server $PREFILL_PORT
+    echo "Waiting for decode instance..."
+    wait_for_server $DECODE_PORT
+    
+    # Start proxy
+    echo "Starting EPD proxy on port $PROXY_PORT"
+    python "${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
+        --host "0.0.0.0" \
+        --port $PROXY_PORT \
+        --prefiller-ports $PREFILL_PORT \
+        --decoder-ports $DECODE_PORT \
+        > $LOG_PATH/1p1d_proxy.log 2>&1 &
+    PIDS+=($!)
+    
+    # Wait for proxy
+    echo "Waiting for proxy..."
+    wait_for_server $PROXY_PORT
+
+    curl http://127.0.0.1:$PROXY_PORT/healthcheck
+    echo ""
+
+    echo "All PD (1P+1D) services are up!"
+    
+    # Run test in baseline mode
+    echo "Running PD disagg baseline..."
+    
+    python "${GIT_ROOT}/tests/v1/ec_connector/integration/test_epd_correctness.py" \
+        --service_url "http://localhost:$PROXY_PORT" \
+        --model_name "$MODEL" \
+        --mode baseline_pd \
+        --baseline_file "$BASELINE_PD_FILE" \
+        $MM_FLAG
+    
+    # Cleanup
+    echo "Stopping PD (1P+1D) instances..."
+    for pid in "${PIDS[@]}"; do
+        kill $pid 2>/dev/null || true
+    done
+    sleep 2
+    cleanup_instances
+}
+
+# Function to run EPD with 1E + 1P + 1D
+run_epd_1e_1p_1d() {
+    echo "================================"
+    echo "Running EPD (1E + 1P + 1D)"
+    echo "================================"
+    
+    cleanup_instances
+    rm -rf "$EC_SHARED_STORAGE_PATH"
+    mkdir -p "$EC_SHARED_STORAGE_PATH"
+    
+    local ENCODE_PORT=$ENCODE_PORT
+    local PREFILL_PORT=$PREFILL_PORT
+    local DECODE_PORT=$DECODE_PORT
+    local PROXY_PORT=$ENDPOINT_PORT
+    
+    declare -a PIDS=()
+    
+    # Start encoder instance
+    echo "Starting encoder instance on GPU $GPU_E, port $ENCODE_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
+        --port $ENCODE_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.01 \
+        --enable-request-id-headers \
+        --no-enable-prefix-caching \
+        --max-num-batched-tokens 114688 \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --ec-transfer-config '{
+            "ec_connector": "ECSharedStorageConnector",
+            "ec_role": "ec_producer",
+            "ec_connector_extra_config": {
+                "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+            }
+        }' \
+        > $LOG_PATH/1e1p1d_encoder.log 2>&1 &
+    PIDS+=($!)
+    
+    # Start prefill instance
+    echo "Starting prefill instance on GPU $GPU_P, port $PREFILL_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_P" \
+    VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
+    vllm serve "$MODEL" \
+        --port $PREFILL_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.7 \
+        --enable-request-id-headers \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --ec-transfer-config '{
+            "ec_connector": "ECSharedStorageConnector",
+            "ec_role": "ec_consumer",
+            "ec_connector_extra_config": {
+                "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+            }
+        }' \
+        --kv-transfer-config '{
+            "kv_connector": "NixlConnector",
+            "kv_role": "kv_producer"
+        }' \
+        > $LOG_PATH/1e1p1d_prefill.log 2>&1 &
+    PIDS+=($!)
+    
+    # Start decode instance
+    echo "Starting decode instance on GPU $GPU_D, port $DECODE_PORT"
+    CUDA_VISIBLE_DEVICES="$GPU_D" \
+    VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
+    vllm serve "$MODEL" \
+        --port $DECODE_PORT \
+        --enforce-eager \
+        --gpu-memory-utilization 0.7 \
+        --enable-request-id-headers \
+        --max-num-seqs 128 \
+        --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+        --kv-transfer-config '{
+            "kv_connector": "NixlConnector",
+            "kv_role": "kv_consumer"
+        }' \
+        > $LOG_PATH/1e1p1d_decode.log 2>&1 &
+    PIDS+=($!)
+    
+    # Wait for instances to start
+    echo "Waiting for encoder instance..."
+    wait_for_server $ENCODE_PORT
+    echo "Waiting for prefill instance..."
+    wait_for_server $PREFILL_PORT
+    echo "Waiting for decode instance..."
+    wait_for_server $DECODE_PORT
+    
+    # Start proxy
+    echo "Starting EPD proxy on port $PROXY_PORT"
+    python "${GIT_ROOT}/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py" \
+        --host "0.0.0.0" \
+        --port $PROXY_PORT \
+        --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+        --prefill-servers-urls "http://localhost:$PREFILL_PORT" \
+        --decode-servers-urls "http://localhost:$DECODE_PORT" \
+        > $LOG_PATH/1e1p1d_proxy.log 2>&1 &
+    PIDS+=($!)
+    
+    # Wait for proxy
+    echo "Waiting for proxy..."
+    wait_for_server $PROXY_PORT
+
+    curl http://127.0.0.1:$PROXY_PORT/v1/models
+    curl http://127.0.0.1:$PROXY_PORT/health
+    echo ""
+
+    echo "All EPD (1E+1P+1D) services are up!"
+    
+    # Run test in disagg mode
+    echo "Running EPD (1E+1P+1D) correctness test..."
+    
+    python "${GIT_ROOT}/tests/v1/ec_connector/integration/test_epd_correctness.py" \
+        --service_url "http://localhost:$PROXY_PORT" \
+        --model_name "$MODEL" \
+        --mode disagg \
+        --baseline_file "$BASELINE_PD_FILE" \
+        $MM_FLAG
+    
+    # Cleanup
+    echo "✓✓ 1E+1P+1D Correctness Test finished"
+    echo "Stopping EPD (1E+1P+1D) instances..."
+    for pid in "${PIDS[@]}"; do
+        kill $pid 2>/dev/null || true
+    done
+    sleep 2
+    cleanup_instances
+}
+
+# Main execution
+echo "================================"
+echo "EPD Correctness Test Suite"
+echo "Model: $MODEL"
+echo "================================"
+
+# Step 1: Run baseline
+run_baseline
+
+# Step 2: Test 1E + 1PD
+run_epd_1e_1pd
+
+# Step 3: Test baseline 1P + 1D
+run_baseline_1p_1d
+
+# Step 4: Test 1E + 1P + 1D
+run_epd_1e_1p_1d
+
+# Cleanup output file
+rm -f "$BASELINE_FILE"
+rm -f "$BASELINE_PD_FILE"
+
+echo "================================"
+echo "✓✓ All EPD correctness tests finished!"
+echo "================================"
diff --git a/tests/v1/ec_connector/integration/test_epd_correctness.py b/tests/v1/ec_connector/integration/test_epd_correctness.py
new file mode 100644
index 0000000000000000000000000000000000000000..69c4c58e349b90e310e4cf22d2462f9458c2ae2c
--- /dev/null
+++ b/tests/v1/ec_connector/integration/test_epd_correctness.py
@@ -0,0 +1,305 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+EPD Correctness Test
+
+Tests that EPD (Encoder-Prefill-Decode) disaggregation produces the same
+outputs as a baseline single instance.
+
+Usage:
+    # Baseline mode (saves outputs):
+    python test_epd_correctness.py \
+        --service_url http://localhost:8000 \
+        --model_name Qwen/Qwen2.5-VL-3B-Instruct \
+        --mode baseline \
+        --baseline_file .vllm_epd_baseline.txt
+
+    # Disagg mode (compares outputs):
+    python test_epd_correctness.py \
+        --service_url http://localhost:8000 \
+        --model_name Qwen/Qwen2.5-VL-3B-Instruct \
+        --mode disagg \
+        --baseline_file .vllm_epd_baseline.txt
+"""
+
+import argparse
+import json
+import os
+import time
+
+import openai
+import requests
+
+from vllm.assets.image import ImageAsset
+from vllm.multimodal.utils import encode_image_base64
+
+MAX_OUTPUT_LEN = 256
+
+# Sample prompts with multimodal content
+image_1 = ImageAsset("stop_sign").pil_image.resize((1280, 720))
+image_2 = ImageAsset("cherry_blossom").pil_image.resize((1280, 720))
+
+image_local_path = f"{os.path.dirname(os.path.abspath(__file__))}/hato.jpg"
+
+SAMPLE_PROMPTS_MM: list[dict] = [
+    {
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image;base64,{encode_image_base64(image_1)}"
+                        },
+                    },
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ],
+        "description": "Single image query",
+    },
+    {
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image;base64,{encode_image_base64(image_2)}"
+                        },
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"file://{image_local_path}"},
+                    },
+                    {"type": "text", "text": "Describe these 2 images in detail."},
+                ],
+            }
+        ],
+        "description": "2 images with detailed query",
+    },
+]
+
+# Text-only prompts for mixed testing
+SAMPLE_PROMPTS_TEXT: list[dict] = [
+    {
+        "messages": [{"role": "user", "content": "What is the capital of France?"}],
+        "description": "Simple text-only query",
+    },
+    {
+        "messages": [
+            {"role": "user", "content": "Explain quantum computing in simple terms."}
+        ],
+        "description": "Text-only explanation request",
+    },
+]
+
+
+def check_vllm_server(url: str, timeout=5, retries=10) -> bool:
+    """Check if the vLLM server is ready.
+
+    Args:
+        url: The URL to check (usually /health or /healthcheck endpoint)
+        timeout: Timeout in seconds for each request
+        retries: Number of retries if the server is not ready
+
+    Returns:
+        True if the server is ready, False otherwise
+    """
+    for attempt in range(retries):
+        try:
+            response = requests.get(url, timeout=timeout)
+            if response.status_code == 200:
+                print(f"Server is ready at {url}")
+                return True
+            else:
+                print(
+                    f"Attempt {attempt + 1}/{retries}: Server returned "
+                    f"status code {response.status_code}"
+                )
+        except requests.exceptions.RequestException as e:
+            print(f"Attempt {attempt + 1}/{retries}: Error connecting: {e}")
+        time.sleep(2)  # Wait before retrying
+    return False
+
+
+def run_chat_completion(
+    base_url: str,
+    model_name: str,
+    messages: list,
+    max_tokens: int = MAX_OUTPUT_LEN,
+) -> str:
+    """Run a chat completion request.
+
+    Args:
+        base_url: Base URL of the vLLM server
+        model_name: Name of the model
+        messages: Messages for chat completion
+        max_tokens: Maximum tokens to generate
+
+    Returns:
+        Generated text content
+    """
+    client = openai.OpenAI(api_key="EMPTY", base_url=base_url)
+
+    completion = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        seed=42,
+    )
+
+    return completion.choices[0].message.content
+
+
+def main():
+    """Main test function."""
+    parser = argparse.ArgumentParser(
+        description="EPD correctness test - compare disagg vs baseline"
+    )
+
+    parser.add_argument(
+        "--service_url",
+        type=str,
+        required=True,
+        help="The vLLM service URL (e.g., http://localhost:8000)",
+    )
+
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        required=True,
+        help="Model name",
+    )
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        default="baseline",
+        choices=["baseline", "baseline_pd", "disagg"],
+        help="Mode: baseline/baseline_pd (saves outputs) or disagg (compares outputs)",
+    )
+
+    parser.add_argument(
+        "--baseline_file",
+        type=str,
+        default=".vllm_epd_baseline.txt",
+        help="File to save/load baseline outputs",
+    )
+
+    parser.add_argument(
+        "--use_mm_prompts",
+        action="store_true",
+        help="Use multimodal prompts (default: use text-only for quick testing)",
+    )
+
+    args = parser.parse_args()
+
+    print(f"Service URL: {args.service_url}")
+    print(f"Model: {args.model_name}")
+    print(f"Mode: {args.mode}")
+    print(f"Output file: {args.baseline_file}")
+    print(f"Use MM prompts: {args.use_mm_prompts}")
+
+    # Determine health check endpoint
+    if args.mode == "baseline":
+        health_check_url = f"{args.service_url}/health"
+    elif args.mode == "baseline_pd":
+        # Nixl toy proxy use /healthcheck
+        health_check_url = f"{args.service_url}/healthcheck"
+    else:
+        # Disagg EPD proxy uses /health
+        health_check_url = f"{args.service_url}/health"
+        if not os.path.exists(args.baseline_file):
+            raise ValueError(
+                f"In disagg mode, the output file {args.baseline_file} from "
+                "baseline does not exist. Run baseline mode first."
+            )
+
+    # Check if server is ready
+    if not check_vllm_server(health_check_url):
+        raise RuntimeError(f"vLLM server at {args.service_url} is not ready!")
+
+    # Select prompts to use
+    if args.use_mm_prompts:
+        test_prompts = SAMPLE_PROMPTS_MM
+        print("Using multimodal prompts")
+    else:
+        test_prompts = SAMPLE_PROMPTS_TEXT
+        print("Using text-only prompts for quick testing")
+
+    # Run completions
+    service_url = f"{args.service_url}/v1"
+    output_strs = {}
+
+    for i, prompt_data in enumerate(test_prompts):
+        print(
+            f"\nRunning prompt {i + 1}/{len(test_prompts)}: {
+                prompt_data['description']
+            }"
+        )
+
+        output_str = run_chat_completion(
+            base_url=service_url,
+            model_name=args.model_name,
+            messages=prompt_data["messages"],
+            max_tokens=MAX_OUTPUT_LEN,
+        )
+
+        # Use description as key for comparison
+        key = prompt_data["description"]
+        output_strs[key] = output_str
+        print(f"Output: {output_str}")
+
+    if args.mode in ("baseline", "baseline_pd"):
+        # Baseline mode: Save outputs
+        print(f"\nSaving baseline outputs to {args.baseline_file}")
+        try:
+            with open(args.baseline_file, "w") as json_file:
+                json.dump(output_strs, json_file, indent=4)
+            print("✅ Baseline outputs saved successfully")
+        except OSError as e:
+            print(f"Error writing to file: {e}")
+            raise
+    else:
+        # Disagg mode: Load and compare outputs
+        print(f"\nLoading baseline outputs from {args.baseline_file}")
+        baseline_outputs = None
+        try:
+            with open(args.baseline_file) as json_file:
+                baseline_outputs = json.load(json_file)
+        except OSError as e:
+            print(f"Error reading from file: {e}")
+            raise
+
+        # Verify outputs match
+        print("\nComparing disagg outputs with baseline...")
+        assert isinstance(baseline_outputs, dict), "Baseline outputs should be a dict"
+        assert len(baseline_outputs) == len(output_strs), (
+            f"Length mismatch: baseline has {len(baseline_outputs)}, "
+            f"disagg has {len(output_strs)}"
+        )
+
+        all_match = True
+        for key, baseline_output in baseline_outputs.items():
+            assert key in output_strs, f"{key} not in disagg outputs"
+
+            disagg_output = output_strs[key]
+            if baseline_output == disagg_output:
+                print(f"✅ {key}: MATCH")
+            else:
+                print(f"❌ {key}: MISMATCH")
+                print(f"  Baseline: {baseline_output}")
+                print(f"  Disagg:   {disagg_output}")
+                all_match = False
+
+        assert all_match, "❌❌Disagg outputs do not match baseline!❌❌"
+        if all_match:
+            print("\n✅ All outputs match! Test PASSED")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py b/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..a58daa2628e210482883ba098b269d74bf1e0e31
--- /dev/null
+++ b/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py
@@ -0,0 +1,609 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for ECSharedStorageConnector.
+"""
+
+import os
+from unittest.mock import Mock, patch
+
+import pytest
+import safetensors
+import torch
+
+from vllm.config import VllmConfig
+from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole
+from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (
+    ECSharedStorageConnector,
+    ECSharedStorageConnectorMetadata,
+    MMMeta,
+)
+from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
+from vllm.v1.core.sched.output import SchedulerOutput
+
+
+# ------------------ Mock Classes ------------------ #
+class MockRequest:
+    def __init__(self, request_id, mm_hashes: list[str], token_counts: list[int]):
+        assert len(mm_hashes) == len(token_counts)
+        self.request_id = request_id
+        self._token_counts = token_counts
+        self.mm_features = []
+        for i, mm_hash in enumerate(mm_hashes):
+            feature = MultiModalFeatureSpec(
+                data=None,
+                modality="image",
+                identifier=mm_hash,
+                mm_position=PlaceholderRange(offset=0, length=self._token_counts[i]),
+            )
+            self.mm_features.append(feature)
+
+    def get_num_encoder_tokens(self, input_id: int) -> int:
+        assert input_id < len(self._token_counts)
+        return self._token_counts[input_id]
+
+
+@pytest.fixture
+def temp_storage(tmp_path):
+    """Fixture providing temporary storage path."""
+    return str(tmp_path)
+
+
+@pytest.fixture
+def mock_vllm_config_producer(temp_storage):
+    """Fixture providing mock VllmConfig for producer role."""
+    config = Mock(spec=VllmConfig)
+    config.ec_transfer_config = Mock()
+    config.ec_transfer_config.get_from_extra_config = Mock(return_value=temp_storage)
+    config.ec_transfer_config.is_ec_producer = True
+    return config
+
+
+@pytest.fixture
+def mock_vllm_config_consumer(temp_storage):
+    """Fixture providing mock VllmConfig for consumer role."""
+    config = Mock(spec=VllmConfig)
+    config.ec_transfer_config = Mock()
+    config.ec_transfer_config.get_from_extra_config = Mock(return_value=temp_storage)
+    config.ec_transfer_config.is_ec_producer = False
+    return config
+
+
+@pytest.fixture
+def mock_request_with_3_mm():
+    """Fixture providing mock Request with 3 multimodal items."""
+    request_id = "test_req_123"
+    mm_hashes = ["img_hash_1", "img_hash_2", "img_hash_3"]
+    token_counts = [100, 150, 200]
+
+    request = MockRequest(request_id, mm_hashes, token_counts)
+    return request
+
+
+# ------------------ Unit Tests ------------------ #
+class TestECSharedStorageConnectorBasics:
+    """Test basic EC connector functionality."""
+
+    def test_initialization_producer(self, mock_vllm_config_producer, temp_storage):
+        """Test connector initializes correctly as producer."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        assert connector.role == ECConnectorRole.SCHEDULER
+        assert connector.is_producer
+        assert connector._storage_path == temp_storage
+        assert connector._mm_datas_need_loads == {}
+
+    def test_initialization_consumer(self, mock_vllm_config_consumer, temp_storage):
+        """Test connector initializes correctly as consumer."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        assert connector.role == ECConnectorRole.WORKER
+        assert not connector.is_producer
+        assert connector._storage_path == temp_storage
+
+    def test_role_assignment(self, mock_vllm_config_producer):
+        """Test role is correctly assigned."""
+        scheduler_connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+        worker_connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        assert scheduler_connector.role == ECConnectorRole.SCHEDULER
+        assert worker_connector.role == ECConnectorRole.WORKER
+
+
+class TestCacheExistence:
+    """Test cache existence checking using has_caches() API."""
+
+    def test_has_caches_all_exist_3_items(
+        self,
+        mock_vllm_config_producer,
+        mock_vllm_config_consumer,
+        mock_request_with_3_mm,
+    ):
+        """Test has_caches returns True when all 3 caches exist."""
+        # Test for producer first
+        producer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Create cache files using save_caches (proper way)
+        encoder_cache: dict[str, torch.Tensor] = {}
+
+        for mm_feature in mock_request_with_3_mm.mm_features:
+            mm_hash = mm_feature.identifier
+            encoder_cache[mm_hash] = torch.randn(10, 768)
+            producer.save_caches(encoder_cache, mm_hash)
+
+        # Test using has_caches API
+        producer_result = producer.has_caches(mock_request_with_3_mm)
+
+        # Assert
+        assert len(producer_result) == 3
+        assert all(producer_result), f"Expected all True, got {producer_result}"
+
+        # Also test consumer can check if cache exists
+        consumer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Test using has_caches API
+        consumer_result = consumer.has_caches(mock_request_with_3_mm)
+
+        # Assert
+        assert len(consumer_result) == 3
+        assert all(consumer_result), f"Expected all True, got {consumer_result}"
+
+    def test_has_caches_none_exist(
+        self, mock_vllm_config_producer, mock_request_with_3_mm
+    ):
+        """Test has_caches returns False when no caches exist."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Test without creating any files
+        result = connector.has_caches(mock_request_with_3_mm)
+
+        # Assert
+        assert len(result) == 3
+        assert not any(result), f"Expected all False, got {result}"
+
+    def test_has_caches_partial_exist(
+        self, mock_vllm_config_producer, mock_request_with_3_mm
+    ):
+        """Test has_caches with some caches existing (1 of 3)."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Create only the second cache file
+        mm_hash_second = mock_request_with_3_mm.mm_features[1].identifier
+        encoder_cache = {mm_hash_second: torch.randn(10, 768)}
+        connector.save_caches(encoder_cache, mm_hash_second)
+
+        # Test
+        result = connector.has_caches(mock_request_with_3_mm)
+
+        # Assert
+        assert len(result) == 3
+        assert not result[0]  # First doesn't exist
+        assert result[1]  # Second exists
+        assert not result[2]  # Third doesn't exist
+
+
+class TestStateManagement:
+    """Test connector state management."""
+
+    def test_update_state_after_alloc_3_items(
+        self, mock_vllm_config_producer, mock_request_with_3_mm
+    ):
+        """Test state update after allocation for 3 MM items."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Initial state should be empty
+        assert len(connector._mm_datas_need_loads) == 0
+
+        # Update state for all 3 items
+        for i in range(3):
+            connector.update_state_after_alloc(mock_request_with_3_mm, index=i)
+
+        # Check state updated for all 3
+        assert len(connector._mm_datas_need_loads) == 3
+        assert "img_hash_1" in connector._mm_datas_need_loads
+        assert "img_hash_2" in connector._mm_datas_need_loads
+        assert "img_hash_3" in connector._mm_datas_need_loads
+        assert connector._mm_datas_need_loads["img_hash_1"] == 100
+        assert connector._mm_datas_need_loads["img_hash_2"] == 150
+        assert connector._mm_datas_need_loads["img_hash_3"] == 200
+
+    def test_build_connector_meta_3_items(
+        self, mock_vllm_config_producer, mock_request_with_3_mm
+    ):
+        """Test metadata building for 3 MM items."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Setup state for all 3 items
+        for i in range(3):
+            connector.update_state_after_alloc(mock_request_with_3_mm, index=i)
+
+        # Build metadata
+        scheduler_output = Mock(spec=SchedulerOutput)
+        metadata = connector.build_connector_meta(scheduler_output)
+
+        # Assert
+        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert len(metadata.mm_datas) == 3
+        assert metadata.mm_datas[0].mm_hash == "img_hash_1"
+        assert metadata.mm_datas[0].num_token == 100
+        assert metadata.mm_datas[1].mm_hash == "img_hash_2"
+        assert metadata.mm_datas[1].num_token == 150
+        assert metadata.mm_datas[2].mm_hash == "img_hash_3"
+        assert metadata.mm_datas[2].num_token == 200
+
+        # State should be cleared after building
+        assert len(connector._mm_datas_need_loads) == 0
+
+    def test_build_connector_meta_empty(self, mock_vllm_config_producer):
+        """Test metadata building with empty state."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        scheduler_output = Mock(spec=SchedulerOutput)
+        metadata = connector.build_connector_meta(scheduler_output)
+
+        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert len(metadata.mm_datas) == 0
+
+    def test_state_cleared_after_metadata_build(
+        self, mock_vllm_config_producer, mock_request_with_3_mm
+    ):
+        """Test that state is properly cleared after building metadata."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        # Add state
+        for i in range(3):
+            connector.update_state_after_alloc(mock_request_with_3_mm, index=i)
+        assert len(connector._mm_datas_need_loads) == 3
+
+        # Build metadata (should clear state)
+        scheduler_output = Mock(spec=SchedulerOutput)
+        connector.build_connector_meta(scheduler_output)
+
+        # State should be empty
+        assert len(connector._mm_datas_need_loads) == 0
+
+        # Build again should return empty metadata
+        metadata2 = connector.build_connector_meta(scheduler_output)
+        assert len(metadata2.mm_datas) == 0
+
+
+class TestCacheSaving:
+    """Test encoder cache saving (producer only)."""
+
+    def test_save_caches_producer_3_items(
+        self, mock_vllm_config_producer, mock_request_with_3_mm, temp_storage
+    ):
+        """Test cache saving as producer for 3 different MM items."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        # Create and save 3 different caches
+        mm_hashes = [f.identifier for f in mock_request_with_3_mm.mm_features]
+        encoder_cache: dict[str, torch.Tensor] = {}
+
+        for mm_hash in mm_hashes:
+            encoder_cache[mm_hash] = torch.randn(10, 768)
+            connector.save_caches(encoder_cache, mm_hash)
+
+        # Verify all files exist using has_caches
+        result = connector.has_caches(mock_request_with_3_mm)
+        assert all(result), f"Not all caches were saved: {result}"
+
+        # Verify each file's content
+        for mm_hash in mm_hashes:
+            filename = connector._generate_filename_debug(mm_hash)
+            loaded = safetensors.torch.load_file(filename)
+            assert "ec_cache" in loaded
+            assert torch.allclose(loaded["ec_cache"], encoder_cache[mm_hash].cpu())
+
+    def test_save_caches_consumer_skips(self, mock_vllm_config_consumer):
+        """Test cache saving is skipped for consumer."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "test_hash_consumer"
+        encoder_cache = {mm_hash: torch.randn(10, 768)}
+
+        # Save should not raise but also not create file
+        connector.save_caches(encoder_cache, mm_hash)
+
+        # Verify file doesn't exist using has_caches
+        mock_request = MockRequest("req_consumer", [mm_hash], [10])
+        result = connector.has_caches(mock_request)
+        assert not result[0], "Consumer should not save caches"
+
+
+class TestCacheLoading:
+    """Test encoder cache loading (consumer)."""
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_start_load_caches_consumer_3_items(
+        self,
+        mock_vllm_config_producer,
+        mock_vllm_config_consumer,
+        mock_request_with_3_mm,
+        temp_storage,
+    ):
+        """Test consumer loads 3 caches from storage."""
+        # First, create producer to save caches
+        producer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        # Producer saves 3 caches
+        mm_hashes = [f.identifier for f in mock_request_with_3_mm.mm_features]
+        saved_caches = {}
+        for mm_hash in mm_hashes:
+            saved_caches[mm_hash] = torch.randn(10, 768)
+            producer.save_caches(saved_caches, mm_hash)
+
+        # Now consumer loads
+        consumer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        # Setup metadata for all 3
+        metadata = ECSharedStorageConnectorMetadata()
+        for mm_hash in mm_hashes:
+            metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100))
+        consumer.bind_connector_metadata(metadata)
+
+        # Load
+        encoder_cache: dict[str, torch.Tensor] = {}
+        consumer.start_load_caches(encoder_cache=encoder_cache)
+
+        # Verify all 3 loaded
+        assert len(encoder_cache) == 3
+        for mm_hash in mm_hashes:
+            assert mm_hash in encoder_cache, f"{mm_hash} missing in encoder_cache"
+            assert encoder_cache[mm_hash].is_cuda, (
+                f"{mm_hash} cache is in {encoder_cache[mm_hash].device}"
+            )
+            assert torch.allclose(
+                encoder_cache[mm_hash].cpu(), saved_caches[mm_hash]
+            ), f"{mm_hash} cache saved and loaded tesnor are not the same"
+
+    def test_start_load_caches_skip_existing(
+        self, mock_vllm_config_producer, mock_vllm_config_consumer, temp_storage
+    ):
+        """Test cache loading skips already cached items."""
+        # Setup: producer saves cache
+        producer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "existing_hash"
+        saved_cache = torch.randn(10, 768)
+        producer.save_caches({mm_hash: saved_cache}, mm_hash)
+
+        # Consumer setup
+        consumer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        metadata = ECSharedStorageConnectorMetadata()
+        metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100))
+        consumer.bind_connector_metadata(metadata)
+
+        # Pre-populate encoder_cache with different value
+        existing_cache = torch.randn(5, 512)
+        encoder_cache = {mm_hash: existing_cache}
+
+        # Load (should skip since already exists)
+        with patch("safetensors.torch.load_file") as mock_load:
+            consumer.start_load_caches(encoder_cache=encoder_cache)
+            # Should not call load_file since cache exists
+            mock_load.assert_not_called()
+
+        # Verify original cache unchanged
+        assert torch.equal(encoder_cache[mm_hash], existing_cache)
+
+    def test_start_load_caches_empty_metadata(self, mock_vllm_config_consumer):
+        """Test loading with empty metadata does nothing."""
+        consumer = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        # Setup empty metadata
+        metadata = ECSharedStorageConnectorMetadata()
+        consumer.bind_connector_metadata(metadata)
+
+        # Load (should not raise)
+        encoder_cache: dict[str, torch.Tensor] = {}
+        consumer.start_load_caches(encoder_cache=encoder_cache)
+
+        # Cache should remain empty
+        assert len(encoder_cache) == 0
+
+
+class TestFilenameGeneration:
+    """Test filename and path generation."""
+
+    def test_generate_foldername(self, mock_vllm_config_producer, temp_storage):
+        """Test folder name generation."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "test_folder_hash"
+        folder = connector._generate_foldername_debug(mm_hash)
+
+        assert folder == os.path.join(temp_storage, mm_hash)
+        assert os.path.isdir(folder)  # Should be created
+
+    def test_generate_filename(self, mock_vllm_config_producer, temp_storage):
+        """Test filename generation."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "test_file_hash"
+        filename = connector._generate_filename_debug(mm_hash)
+
+        expected = os.path.join(temp_storage, mm_hash, "encoder_cache.safetensors")
+        assert filename == expected
+        assert os.path.isdir(os.path.dirname(filename))  # Folder created
+
+    def test_generate_filename_consistency(self, mock_vllm_config_producer):
+        """Test filename generation is consistent."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "consistency_hash"
+        filename1 = connector._generate_filename_debug(mm_hash)
+        filename2 = connector._generate_filename_debug(mm_hash)
+
+        assert filename1 == filename2
+
+
+class TestMetadataBindingLifecycle:
+    """Test metadata binding and clearing lifecycle."""
+
+    def test_bind_connector_metadata(self, mock_vllm_config_consumer):
+        """Test binding connector metadata."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        metadata = ECSharedStorageConnectorMetadata()
+        metadata.add_mm_data(MMMeta.make_meta("hash_1", 100))
+
+        connector.bind_connector_metadata(metadata)
+
+        assert connector._connector_metadata is metadata
+
+    def test_clear_connector_metadata(self, mock_vllm_config_consumer):
+        """Test clearing connector metadata."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        metadata = ECSharedStorageConnectorMetadata()
+        connector.bind_connector_metadata(metadata)
+
+        connector.clear_connector_metadata()
+
+        assert connector._connector_metadata is None
+
+    def test_get_connector_metadata(self, mock_vllm_config_consumer):
+        """Test getting connector metadata."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        metadata = ECSharedStorageConnectorMetadata()
+        connector.bind_connector_metadata(metadata)
+
+        retrieved = connector._get_connector_metadata()
+
+        assert retrieved is metadata
+
+    def test_get_connector_metadata_not_set(self, mock_vllm_config_consumer):
+        """Test getting metadata when not set raises."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        with pytest.raises(AssertionError):
+            connector._get_connector_metadata()
+
+
+class TestEdgeCases:
+    """Test edge cases and error handling."""
+
+    def test_save_empty_cache(self, mock_vllm_config_producer):
+        """Test saving empty tensor."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "empty_hash"
+        encoder_cache = {mm_hash: torch.empty(0)}
+
+        # Should not raise
+        connector.save_caches(encoder_cache, mm_hash)
+
+    def test_load_nonexistent_cache(self, mock_vllm_config_consumer):
+        """Test loading cache that doesn't exist raises error."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        metadata = ECSharedStorageConnectorMetadata()
+        metadata.add_mm_data(MMMeta.make_meta("nonexistent_hash", 100))
+        connector.bind_connector_metadata(metadata)
+
+        encoder_cache: dict[str, torch.Tensor] = {}
+
+        # Should raise FileNotFoundError
+        with pytest.raises(FileNotFoundError):
+            connector.start_load_caches(encoder_cache=encoder_cache)
+
+    def test_has_caches_empty_request(self, mock_vllm_config_producer):
+        """Test has_caches with request that has no MM data."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        mock_request = MockRequest("req_empty", [], [])
+
+        result = connector.has_caches(mock_request)
+
+        assert len(result) == 0
+        assert result == []
diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py
index a73a9a6999f7237107453f1d0420c8fa723ec5cb..283a76dab67235ea4d8a1303706f57b1e83e9abb 100644
--- a/tests/v1/engine/conftest.py
+++ b/tests/v1/engine/conftest.py
@@ -5,18 +5,20 @@ import pytest
 import torch
 from transformers import AutoTokenizer
 
-from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
-                                   NUM_SAMPLE_LOGPROBS_UNDER_TEST, PROMPT_LEN,
-                                   TOKENIZER_NAME,
-                                   DummyOutputProcessorTestVectors,
-                                   generate_dummy_prompt_logprobs_tensors,
-                                   generate_dummy_sample_logprobs)
+from tests.v1.engine.utils import (
+    FULL_STRINGS,
+    NUM_PROMPT_LOGPROBS_UNDER_TEST,
+    NUM_SAMPLE_LOGPROBS_UNDER_TEST,
+    PROMPT_LEN,
+    TOKENIZER_NAME,
+    DummyOutputProcessorTestVectors,
+    generate_dummy_prompt_logprobs_tensors,
+    generate_dummy_sample_logprobs,
+)
 from vllm.engine.arg_utils import EngineArgs
 
 from ...distributed.conftest import publisher_config, random_port  # noqa: F401
 
-from tests.v1.engine.utils import FULL_STRINGS  # isort: skip
-
 EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]]
 EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor]
 
@@ -31,9 +33,7 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors:
     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
     vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
     # Tokenize prompts under test & create dummy generated tokens
-    prompt_tokens = [
-        tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS
-    ]
+    prompt_tokens = [tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS]
     generation_tokens = [
         tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS
     ]
@@ -42,9 +42,7 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors:
         tokenizer.decode(prompt_tokens, skip_special_tokens=True)
         for prompt_tokens in prompt_tokens
     ]
-    prompt_strings_len = [
-        len(prompt_string) for prompt_string in prompt_strings
-    ]
+    prompt_strings_len = [len(prompt_string) for prompt_string in prompt_strings]
     return DummyOutputProcessorTestVectors(
         tokenizer=tokenizer,
         vllm_config=vllm_config,
@@ -58,7 +56,8 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors:
             for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len)
         ],
         prompt_logprobs=[],
-        generation_logprobs=[])
+        generation_logprobs=[],
+    )
 
 
 @pytest.fixture
@@ -76,12 +75,16 @@ def dummy_test_vectors() -> DummyOutputProcessorTestVectors:
         generate_dummy_sample_logprobs(
             sampled_tokens_list=tokens_list,
             num_logprobs=NUM_SAMPLE_LOGPROBS_UNDER_TEST,
-            tokenizer=dtv.tokenizer) for tokens_list in dtv.generation_tokens
+            tokenizer=dtv.tokenizer,
+        )
+        for tokens_list in dtv.generation_tokens
     ]
     dtv.prompt_logprobs = [
         generate_dummy_prompt_logprobs_tensors(
             prompt_tokens_list=tokens_list,
             num_logprobs=NUM_PROMPT_LOGPROBS_UNDER_TEST,
-            tokenizer=dtv.tokenizer) for tokens_list in dtv.prompt_tokens
+            tokenizer=dtv.tokenizer,
+        )
+        for tokens_list in dtv.prompt_tokens
     ]
     return dtv
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index aca546600d0b56d15b04e5207e3b52bd48fff4fd..25af55baa91f461a79a5dd424776f5bba35f9c57 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -3,7 +3,6 @@
 
 import asyncio
 from contextlib import ExitStack
-from typing import Optional
 from unittest.mock import MagicMock
 
 import pytest
@@ -16,21 +15,26 @@ from vllm.inputs import PromptType
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
-from vllm.utils import set_default_torch_num_threads
+from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine.async_llm import AsyncLLM
-from vllm.v1.metrics.loggers import LoggingStatLogger
+from vllm.v1.metrics.loggers import (
+    AggregatedLoggingStatLogger,
+    LoggingStatLogger,
+    PerEngineStatLoggerAdapter,
+    PrometheusStatLogger,
+)
 
 if not current_platform.is_cuda():
-    pytest.skip(reason="V1 currently only supported on CUDA.",
-                allow_module_level=True)
+    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
 
 TEXT_ENGINE_ARGS = AsyncEngineArgs(
     model="meta-llama/Llama-3.2-1B-Instruct",
     enforce_eager=True,
 )
 
-VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
-                                     enforce_eager=True)
+VISION_ENGINE_ARGS = AsyncEngineArgs(
+    model="Qwen/Qwen2-VL-2B-Instruct", enforce_eager=True
+)
 
 TEXT_PROMPT = "Hello my name is Robert and"
 
@@ -38,12 +42,11 @@ VISION_PROMPT_TEMPLATE = (
     "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
     "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
     "What is in the image?<|im_end|>\n"
-    "<|im_start|>assistant\n")
+    "<|im_start|>assistant\n"
+)
 VISION_PROMPT = {
     "prompt": VISION_PROMPT_TEMPLATE,
-    "multi_modal_data": {
-        "image": ImageAsset("stop_sign").pil_image
-    },
+    "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
 }
 
 
@@ -54,8 +57,8 @@ async def generate(
     output_kind: RequestOutputKind,
     max_tokens: int,
     n: int = 1,
-    prompt_logprobs: Optional[int] = None,
-    cancel_after: Optional[int] = None,
+    prompt_logprobs: int | None = None,
+    cancel_after: int | None = None,
 ) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
@@ -70,10 +73,9 @@ async def generate(
         n=n,
         prompt_logprobs=prompt_logprobs,
     )
-    async for out in engine.generate(request_id=request_id,
-                                     prompt=prompt,
-                                     sampling_params=sampling_params):
-
+    async for out in engine.generate(
+        request_id=request_id, prompt=prompt, sampling_params=sampling_params
+    ):
         num_tokens = sum(len(output.token_ids) for output in out.outputs)
         if output_kind == RequestOutputKind.DELTA:
             count += num_tokens
@@ -89,24 +91,19 @@ async def generate(
 
 
 @pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
 @pytest.mark.parametrize(
     "engine_args,prompt",
     [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
 )
 @pytest.mark.asyncio
 async def test_load(
-    monkeypatch: pytest.MonkeyPatch,
     output_kind: RequestOutputKind,
     engine_args: AsyncEngineArgs,
     prompt: PromptType,
 ):
-    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
-    # so that in the future when we switch, we don't have to change all the
-    # tests.
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -121,40 +118,40 @@ async def test_load(
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             NUM_EXPECTED_TOKENS)))
+                    generate(
+                        engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS
+                    )
+                )
+            )
 
         # Confirm that we got all the EXPECTED tokens from the requests.
-        done, pending = await asyncio.wait(tasks,
-                                           return_when=asyncio.FIRST_EXCEPTION)
+        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
         for task in pending:
             task.cancel()
         for task in done:
             num_generated_tokens, request_id = await task
             assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                 f"{request_id} generated {num_generated_tokens} but "
-                f"expected {NUM_EXPECTED_TOKENS}")
+                f"expected {NUM_EXPECTED_TOKENS}"
+            )
 
         assert not engine.output_processor.has_unfinished_requests()
 
 
 @pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
 @pytest.mark.parametrize(
     "engine_args,prompt",
     [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
 )
 @pytest.mark.asyncio
 async def test_abort(
-    monkeypatch: pytest.MonkeyPatch,
     output_kind: RequestOutputKind,
     engine_args: AsyncEngineArgs,
     prompt: PromptType,
 ):
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -170,14 +167,17 @@ async def test_abort(
         # Create concurrent requests.
         tasks: list[asyncio.Task] = []
         for idx, request_id in enumerate(request_ids):
-            max_tokens = (NUM_EXPECTED_TOKENS_LONG if
-                          (idx
-                           in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS)
+            max_tokens = (
+                NUM_EXPECTED_TOKENS_LONG
+                if (idx in REQUEST_IDS_TO_ABORT)
+                else NUM_EXPECTED_TOKENS
+            )
             n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             max_tokens, n)))
+                    generate(engine, request_id, prompt, output_kind, max_tokens, n)
+                )
+            )
 
         # API server cancels requests when they disconnect.
         for idx in REQUEST_IDS_TO_ABORT:
@@ -197,7 +197,8 @@ async def test_abort(
                 expected_tokens = NUM_EXPECTED_TOKENS * n
                 assert num_generated_tokens == expected_tokens, (
                     f"{request_id} generated {num_generated_tokens} but "
-                    f"expected {expected_tokens}")
+                    f"expected {expected_tokens}"
+                )
 
         # Make sure all aborted requests were really aborted.
         assert not engine.output_processor.has_unfinished_requests()
@@ -205,24 +206,19 @@ async def test_abort(
         # Confirm we can do another generation.
         request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
         task = asyncio.create_task(
-            generate(engine, request_id, prompt, output_kind,
-                     NUM_EXPECTED_TOKENS))
+            generate(engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS)
+        )
         num_generated_tokens, request_id = await task
         assert num_generated_tokens == NUM_EXPECTED_TOKENS
         assert not engine.output_processor.has_unfinished_requests()
 
 
 @pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
 @pytest.mark.asyncio
-async def test_multi_abort(
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+async def test_multi_abort(output_kind: RequestOutputKind):
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
@@ -238,14 +234,19 @@ async def test_multi_abort(
         # Create concurrent requests.
         tasks: list[asyncio.Task] = []
         for idx, request_id in enumerate(request_ids):
-            max_tokens = (NUM_EXPECTED_TOKENS_LONG if
-                          (idx
-                           in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS)
+            max_tokens = (
+                NUM_EXPECTED_TOKENS_LONG
+                if (idx in REQUEST_IDS_TO_ABORT)
+                else NUM_EXPECTED_TOKENS
+            )
             n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, TEXT_PROMPT, output_kind,
-                             max_tokens, n)))
+                    generate(
+                        engine, request_id, TEXT_PROMPT, output_kind, max_tokens, n
+                    )
+                )
+            )
 
         # Let requests start
         await asyncio.sleep(0.5)
@@ -261,25 +262,26 @@ async def test_multi_abort(
         for idx, result in enumerate(results):
             if idx in REQUEST_IDS_TO_ABORT:
                 # Aborted requests should return partial results
-                assert isinstance(
-                    result, tuple
-                ), f"Request {idx} should have completed with partial results"
+                assert isinstance(result, tuple), (
+                    f"Request {idx} should have completed with partial results"
+                )
                 num_generated_tokens, request_id = result
                 # Should have generated some tokens before abort
                 assert num_generated_tokens > 0, (
-                    f"Aborted request "
-                    f"{request_id} should have generated some tokens")
+                    f"Aborted request {request_id} should have generated some tokens"
+                )
             else:
                 # Non-aborted requests should complete normally
-                assert isinstance(
-                    result,
-                    tuple), f"Request {idx} should have completed successfully"
+                assert isinstance(result, tuple), (
+                    f"Request {idx} should have completed successfully"
+                )
                 num_generated_tokens, request_id = result
                 n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
                 expected_tokens = NUM_EXPECTED_TOKENS * n
                 assert num_generated_tokens == expected_tokens, (
                     f"{request_id} generated {num_generated_tokens} but "
-                    f"expected {expected_tokens}")
+                    f"expected {expected_tokens}"
+                )
 
         # Make sure all aborted requests were cleaned up
         assert not engine.output_processor.has_unfinished_requests()
@@ -292,15 +294,11 @@ async def test_multi_abort(
 )
 @pytest.mark.asyncio
 async def test_finished_flag(
-    monkeypatch: pytest.MonkeyPatch,
     n: int,
     engine_args: AsyncEngineArgs,
     prompt: PromptType,
 ):
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -314,9 +312,9 @@ async def test_finished_flag(
         )
         outputs = [
             out
-            async for out in engine.generate(request_id="request-33",
-                                             prompt=prompt,
-                                             sampling_params=sampling_params)
+            async for out in engine.generate(
+                request_id="request-33", prompt=prompt, sampling_params=sampling_params
+            )
         ]
 
         # Assert only the last output has the finished flag set
@@ -329,13 +327,11 @@ async def test_finished_flag(
     [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
 )
 @pytest.mark.asyncio
-async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
-                                       engine_args: AsyncEngineArgs,
-                                       prompt: PromptType):
+async def test_mid_stream_cancellation(
+    engine_args: AsyncEngineArgs, prompt: PromptType
+):
     """Test that requests can be cancelled mid-stream."""
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -358,7 +354,9 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
                         RequestOutputKind.DELTA,
                         NUM_TOKENS,
                         cancel_after=NUM_EXPECTED_TOKENS,
-                    )))
+                    )
+                )
+            )
 
         # Wait for all tasks to complete
         results = await asyncio.gather(*tasks)
@@ -367,7 +365,8 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
         for num_generated_tokens, request_id in results:
             assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                 f"{request_id} generated {num_generated_tokens} tokens but "
-                f"expected to cancel after {NUM_EXPECTED_TOKENS}")
+                f"expected to cancel after {NUM_EXPECTED_TOKENS}"
+            )
 
         # Make sure no requests are left hanging
         assert not engine.output_processor.has_unfinished_requests()
@@ -375,20 +374,27 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
         # Confirm we can reuse the request id after the cancellations.
         request_id = request_ids[0]
         task = asyncio.create_task(
-            generate(engine, request_id, prompt, RequestOutputKind.DELTA,
-                     NUM_EXPECTED_TOKENS))
+            generate(
+                engine, request_id, prompt, RequestOutputKind.DELTA, NUM_EXPECTED_TOKENS
+            )
+        )
         num_generated_tokens, request_id = await task
         assert num_generated_tokens == NUM_EXPECTED_TOKENS
         assert not engine.output_processor.has_unfinished_requests()
 
 
 class MockLoggingStatLogger(LoggingStatLogger):
-
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         super().__init__(vllm_config, engine_index)
         self.log = MagicMock()
 
 
+class MockAggregatedStatLogger(AggregatedLoggingStatLogger):
+    def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]):
+        super().__init__(vllm_config, engine_indexes)
+        self.log = MagicMock()
+
+
 @pytest.mark.asyncio
 async def test_customize_loggers(monkeypatch):
     """Test that we can customize the loggers.
@@ -396,9 +402,7 @@ async def test_customize_loggers(monkeypatch):
     be added to the default loggers.
     """
 
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(
                 TEXT_ENGINE_ARGS,
@@ -408,45 +412,80 @@ async def test_customize_loggers(monkeypatch):
 
         await engine.do_log_stats()
 
-        stat_loggers = engine.logger_manager.per_engine_logger_dict
-        assert len(stat_loggers) == 1
-        assert len(
-            stat_loggers[0]) == 2  # LoggingStatLogger + MockLoggingStatLogger
-        stat_loggers[0][0].log.assert_called_once()
+        stat_loggers = engine.logger_manager.stat_loggers
+        assert (
+            len(stat_loggers) == 3
+        )  # MockLoggingStatLogger + LoggingStatLogger +  Promethus Logger
+        print(f"{stat_loggers=}")
+        stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once()
+        assert isinstance(stat_loggers[1], PerEngineStatLoggerAdapter)
+        assert isinstance(stat_loggers[1].per_engine_stat_loggers[0], LoggingStatLogger)
+        assert isinstance(stat_loggers[2], PrometheusStatLogger)
 
 
-@pytest.mark.asyncio(scope="module")
-async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
+@pytest.mark.asyncio
+async def test_customize_aggregated_loggers():
+    """Test that we can customize the aggregated loggers.
+    If a customized logger is provided at the init, it should
+    be added to the default loggers.
+    """
+    with ExitStack() as after:
+        with set_default_torch_num_threads(1):
+            engine = AsyncLLM.from_engine_args(
+                TEXT_ENGINE_ARGS,
+                stat_loggers=[MockLoggingStatLogger, MockAggregatedStatLogger],
+            )
+        after.callback(engine.shutdown)
+
+        await engine.do_log_stats()
+
+        stat_loggers = engine.logger_manager.stat_loggers
+        assert len(stat_loggers) == 4
+        #  MockLoggingStatLogger + MockAggregatedStatLogger
+        # + LoggingStatLogger + PrometheusStatLogger
+        stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once()
+        stat_loggers[1].log.assert_called_once()
+        assert isinstance(stat_loggers[2], PerEngineStatLoggerAdapter)
+        assert isinstance(stat_loggers[2].per_engine_stat_loggers[0], LoggingStatLogger)
+        assert isinstance(stat_loggers[3], PrometheusStatLogger)
 
+
+@pytest.mark.asyncio(scope="module")
+async def test_dp_rank_argument():
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
 
-        sampling_params = SamplingParams(max_tokens=100,
-                                         output_kind=RequestOutputKind.DELTA,
-                                         temperature=1.0,
-                                         seed=33)
+        sampling_params = SamplingParams(
+            max_tokens=100,
+            output_kind=RequestOutputKind.DELTA,
+            temperature=1.0,
+            seed=33,
+        )
 
         # Test with valid DP rank.
-        async for _ in engine.generate(request_id="request-34",
-                                       prompt=TEXT_PROMPT,
-                                       sampling_params=sampling_params,
-                                       data_parallel_rank=0):
+        async for _ in engine.generate(
+            request_id="request-34",
+            prompt=TEXT_PROMPT,
+            sampling_params=sampling_params,
+            data_parallel_rank=0,
+        ):
             pass
 
         # Test with out-of-range DP rank.
         with pytest.raises(ValueError):
-            async for _ in engine.generate(request_id="request-35",
-                                           prompt=TEXT_PROMPT,
-                                           sampling_params=sampling_params,
-                                           data_parallel_rank=1):
+            async for _ in engine.generate(
+                request_id="request-35",
+                prompt=TEXT_PROMPT,
+                sampling_params=sampling_params,
+                data_parallel_rank=1,
+            ):
                 pass
 
 
 @pytest.mark.asyncio
-async def test_check_health(monkeypatch: pytest.MonkeyPatch):
+async def test_check_health():
     """Test that check_health returns normally for healthy engine
     and raises EngineDeadError when the engine is dead.
     """
@@ -454,9 +493,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
 
     from vllm.v1.engine.exceptions import EngineDeadError
 
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
@@ -465,10 +502,14 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
         await engine.check_health()
 
         # Test 2: Mock the errored property to simulate a dead engine
-        with patch.object(type(engine),
-                          'errored',
-                          new_callable=lambda: property(lambda self: True)
-                          ), pytest.raises(EngineDeadError):
+        with (
+            patch.object(
+                type(engine),
+                "errored",
+                new_callable=lambda: property(lambda self: True),
+            ),
+            pytest.raises(EngineDeadError),
+        ):
             await engine.check_health()
 
         # Test 3: Verify healthy engine still works after mock
@@ -476,17 +517,13 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
 
 
 @pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
 @pytest.mark.asyncio
-async def test_abort_final_output(
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
+async def test_abort_final_output(output_kind: RequestOutputKind):
     """Test that abort() returns a final output with correct information."""
 
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
@@ -504,8 +541,8 @@ async def test_abort_final_output(
 
         outputs: list[RequestOutput] = []
         generated = asyncio.create_task(
-            collect_outputs(engine, request_id, TEXT_PROMPT, sampling_params,
-                            outputs))
+            collect_outputs(engine, request_id, TEXT_PROMPT, sampling_params, outputs)
+        )
 
         # Let it generate some tokens
         await asyncio.sleep(0.5)
@@ -525,14 +562,13 @@ async def test_abort_final_output(
         assert final_output.outputs[0].stop_reason is None
 
         # Verify num_cached_tokens is set correctly
-        assert hasattr(final_output, 'num_cached_tokens')
+        assert hasattr(final_output, "num_cached_tokens")
         assert final_output.num_cached_tokens >= 0
 
         # If we got intermediate outputs, verify they are consistent
         if output_kind == RequestOutputKind.DELTA:
             # For DELTA, sum all intermediate tokens should <= final tokens
-            token_count = sum(
-                len(output.outputs[0].token_ids) for output in outputs)
+            token_count = sum(len(output.outputs[0].token_ids) for output in outputs)
             assert token_count > 0
             # This would ordinarily be 0, but could end up > 0 if the
             # final abort is coalesced with another chunk in the output queue.
@@ -551,12 +587,12 @@ async def collect_outputs(
     prompt: PromptType,
     sampling_params: SamplingParams,
     outputs_list: list[RequestOutput],
-) -> Optional[RequestOutput]:
+) -> RequestOutput | None:
     """Helper to collect outputs and return the final one."""
-    final_output: Optional[RequestOutput] = None
-    async for output in engine.generate(request_id=request_id,
-                                        prompt=prompt,
-                                        sampling_params=sampling_params):
+    final_output: RequestOutput | None = None
+    async for output in engine.generate(
+        request_id=request_id, prompt=prompt, sampling_params=sampling_params
+    ):
         if not output.finished:
             outputs_list.append(output)
         final_output = output
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index 23ec3673b10b43b9b1f65dbf1ea78d40099aab4e..e96759ed66a79f790bd9a8ecc487d4c3b3023b37 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -5,25 +5,19 @@ from argparse import ArgumentError
 
 import pytest
 
-from vllm import envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
-
-if not envs.VLLM_USE_V1:
-    pytest.skip(
-        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
-        allow_module_level=True,
-    )
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def test_prefix_caching_from_cli():
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     args = parser.parse_args([])
     vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
-    assert (vllm_config.cache_config.enable_prefix_caching
-            ), "V1 turns on prefix caching by default."
+    assert vllm_config.cache_config.enable_prefix_caching, (
+        "V1 turns on prefix caching by default."
+    )
 
     # Turn it off possible with flag.
     args = parser.parse_args(["--no-enable-prefix-caching"])
@@ -41,8 +35,7 @@ def test_prefix_caching_from_cli():
     # set hash algorithm to sha256_cbor
     args = parser.parse_args(["--prefix-caching-hash-algo", "sha256_cbor"])
     vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
-    assert vllm_config.cache_config.prefix_caching_hash_algo == \
-        "sha256_cbor"
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256_cbor"
 
     # set hash algorithm to sha256
     args = parser.parse_args(["--prefix-caching-hash-algo", "sha256"])
@@ -57,13 +50,15 @@ def test_prefix_caching_from_cli():
 
 def test_defaults_with_usage_context():
     engine_args = EngineArgs(model="facebook/opt-125m")
-    vllm_config: VllmConfig = engine_args.create_engine_config(
-        UsageContext.LLM_CLASS)
+    vllm_config: VllmConfig = engine_args.create_engine_config(UsageContext.LLM_CLASS)
 
     from vllm.platforms import current_platform
+    from vllm.utils.mem_constants import GiB_bytes
+
+    device_memory = current_platform.get_device_total_memory()
     device_name = current_platform.get_device_name().lower()
-    if "h100" in device_name or "h200" in device_name:
-        # For H100 and H200, we use larger default values.
+    if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
+        # For GPUs like H100, H200, and MI300x with >= 70GB memory
         default_llm_tokens = 16384
         default_server_tokens = 8192
         default_max_num_seqs = 1024
@@ -76,7 +71,6 @@ def test_defaults_with_usage_context():
     assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens  # noqa: E501
 
     engine_args = EngineArgs(model="facebook/opt-125m")
-    vllm_config = engine_args.create_engine_config(
-        UsageContext.OPENAI_API_SERVER)
+    vllm_config = engine_args.create_engine_config(UsageContext.OPENAI_API_SERVER)
     assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
     assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens  # noqa: E501
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 17b136aa427312549fdd4bb5c74aeef4cc11a088..3ba8ab26f5522158ee698fb64ceedc1bb40288f7 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -10,24 +10,34 @@ import pytest
 from transformers import AutoTokenizer
 
 from vllm import SamplingParams
+from vllm.config import (
+    CacheConfig,
+    ECTransferConfig,
+    KVTransferConfig,
+    ModelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
-from vllm.utils import set_default_torch_num_threads
+from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core import EngineCore
-from vllm.v1.executor.abstract import Executor, UniProcExecutor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor.uniproc_executor import UniProcExecutor
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.outputs import ModelRunnerOutput
 
 from ...utils import create_new_process_for_each_test, multi_gpu_test
 
 if not current_platform.is_cuda():
-    pytest.skip(reason="V1 currently only supported on CUDA.",
-                allow_module_level=True)
+    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
-PROMPT = "Hello my name is Robert and I love quantization kernels"
+# test_engine_core_concurrent_batches assumes exactly 12 tokens per prompt.
+# Adjust prompt if changing model to maintain 12-token length.
+PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
 
@@ -47,208 +57,196 @@ def make_request() -> EngineCoreRequest:
 
 
 @create_new_process_for_each_test()
-def test_engine_core(monkeypatch: pytest.MonkeyPatch):
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
-
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(vllm_config=vllm_config,
-                                     executor_class=executor_class,
-                                     log_stats=True)
-        """Test basic request lifecycle."""
-
-        # First request.
-        engine_core.add_request(
-            *engine_core.preprocess_add_request(make_request()))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 1
-
-        # Second request.
-        engine_core.add_request(
-            *engine_core.preprocess_add_request(make_request()))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 1
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-
-        # Add two requests in a row.
-        engine_core.add_request(
-            *engine_core.preprocess_add_request(make_request()))
-        engine_core.add_request(
-            *engine_core.preprocess_add_request(make_request()))
-        assert len(engine_core.scheduler.waiting) == 2
-        assert len(engine_core.scheduler.running) == 2
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 4
-
-        # Loop through until they are all done.
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
-
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        """Test abort cycle."""
-
-        # Basic abort.
-        req = make_request()
-        request_id = req.request_id
-
-        engine_core.add_request(*engine_core.preprocess_add_request(req))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
-        assert engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 1
-        assert engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
-
-        engine_core.abort_requests([request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        assert not engine_core.scheduler.has_unfinished_requests()
-        assert engine_core.scheduler.has_finished_requests()
-
-        _ = engine_core.step()
-        assert not engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
-
-        # Add, step, abort 1 of the 3.
-        req0 = make_request()
-        req1 = make_request()
-        req2 = make_request()
-
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-        assert len(engine_core.scheduler.waiting) == 2
-        assert len(engine_core.scheduler.running) == 0
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-
-        engine_core.add_request(*engine_core.preprocess_add_request(req2))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 2
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 3
-
-        # Abort just one.
-        engine_core.abort_requests([req1.request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-
-        # Abort the other requests at the same time.
-        engine_core.abort_requests([req2.request_id, req0.request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-
-        # Sending duplicate requests with same request_id
-        req0 = make_request()
-        req1 = make_request()
-        req0.request_id = req1.request_id = "test"
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
-
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
-
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
+def test_engine_core():
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)
+
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
+    """Test basic request lifecycle."""
+
+    # First request.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 1
+
+    # Second request.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 1
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+
+    # Add two requests in a row.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 2
+    assert len(engine_core.scheduler.running) == 2
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 4
+
+    # Loop through until they are all done.
+    while (outs := engine_core.step_fn()[0].get(0)) and outs.outputs:
+        pass
+
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    """Test abort cycle."""
+
+    # Basic abort.
+    req = make_request()
+    request_id = req.request_id
+
+    engine_core.add_request(*engine_core.preprocess_add_request(req))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0
+    assert engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 1
+    assert engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()
+
+    engine_core.abort_requests([request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    assert not engine_core.scheduler.has_unfinished_requests()
+    assert engine_core.scheduler.has_finished_requests()
+
+    _ = engine_core.step_fn()
+    assert not engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()
+
+    # Add, step, abort 1 of the 3.
+    req0 = make_request()
+    req1 = make_request()
+    req2 = make_request()
+
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    assert len(engine_core.scheduler.waiting) == 2
+    assert len(engine_core.scheduler.running) == 0
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+
+    engine_core.add_request(*engine_core.preprocess_add_request(req2))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 2
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 3
+
+    # Abort just one.
+    engine_core.abort_requests([req1.request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+
+    _ = engine_core.step_fn()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+
+    # Abort the other requests at the same time.
+    engine_core.abort_requests([req2.request_id, req0.request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+
+    # Sending duplicate requests with same request_id
+    req0 = make_request()
+    req1 = make_request()
+    req0.request_id = req1.request_id = "test"
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+
+    while engine_core.scheduler.has_requests():
+        engine_core.step_fn()
+
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    while engine_core.scheduler.has_requests():
+        engine_core.step_fn()
+
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
 
 
 @create_new_process_for_each_test()
-def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_advanced_sampling():
     """
     A basic end-to-end test to verify that the engine functions correctly
     when additional sampling parameters, such as top_p, min_tokens, and
     presence_penalty, are set.
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
-
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(vllm_config=vllm_config,
-                                     executor_class=executor_class,
-                                     log_stats=True)
-        """Test basic request lifecycle."""
-        # First request.
-        request: EngineCoreRequest = make_request()
-        request.sampling_params = SamplingParams(
-            min_tokens=4,
-            presence_penalty=1.0,
-            frequency_penalty=1.0,
-            repetition_penalty=0.1,
-            stop_token_ids=[1001, 1002],
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)
+
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
         )
-        engine_core.add_request(*engine_core.preprocess_add_request(request))
-
-        def _check_engine_state():
-            assert len(engine_core.scheduler.waiting) == 1
-            assert len(engine_core.scheduler.running) == 0
-            # Loop through until they are all done.
-            while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-                pass
-            assert len(engine_core.scheduler.waiting) == 0
-            assert len(engine_core.scheduler.running) == 0
-
-        _check_engine_state()
-
-        # Second request.
-        request2 = make_request()
-        request2.sampling_params = SamplingParams(
-            top_p=0.99,
-            top_k=50,
-        )
-        engine_core.add_request(*engine_core.preprocess_add_request(request2))
-        _check_engine_state()
+    """Test basic request lifecycle."""
+    # First request.
+    request: EngineCoreRequest = make_request()
+    request.sampling_params = SamplingParams(
+        min_tokens=4,
+        presence_penalty=1.0,
+        frequency_penalty=1.0,
+        repetition_penalty=0.1,
+        stop_token_ids=[1001, 1002],
+    )
+    engine_core.add_request(*engine_core.preprocess_add_request(request))
+
+    def _check_engine_state():
+        assert len(engine_core.scheduler.waiting) == 1
+        assert len(engine_core.scheduler.running) == 0
+        # Loop through until they are all done.
+        while engine_core.scheduler.has_requests():
+            engine_core.step_fn()
+        assert len(engine_core.scheduler.waiting) == 0
+        assert len(engine_core.scheduler.running) == 0
+
+    _check_engine_state()
+
+    # Second request.
+    request2 = make_request()
+    request2.sampling_params = SamplingParams(
+        top_p=0.99,
+        top_k=50,
+    )
+    engine_core.add_request(*engine_core.preprocess_add_request(request2))
+    _check_engine_state()
 
 
 @create_new_process_for_each_test()
-def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_concurrent_batches():
     """
     Test that the engine can handle multiple concurrent batches.
     """
 
-    def make_request_with_max_tokens(req_id: str,
-                                     max_tokens: int) -> EngineCoreRequest:
+    def make_request_with_max_tokens(req_id: str, max_tokens: int) -> EngineCoreRequest:
         request = make_request()
         request.request_id = req_id
         request.sampling_params.max_tokens = max_tokens
         return request
 
     class DummyExecutor(UniProcExecutor):
-
-        def initialize_from_config(
-                self, kv_cache_configs: list[KVCacheConfig]) -> None:
+        def initialize_from_config(self, kv_cache_configs: list[KVCacheConfig]) -> None:
             super().initialize_from_config(kv_cache_configs)
 
             # Create a thread pool with a single worker
@@ -258,15 +256,31 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             self,
             scheduler_output,
             non_block=False,
-        ) -> Future[ModelRunnerOutput]:
+        ) -> Future[ModelRunnerOutput | None]:
             """Make execute_model non-blocking."""
 
             # DummyExecutor used only for testing async case.
             assert non_block
 
             def _execute():
-                output = self.collective_rpc("execute_model",
-                                             args=(scheduler_output, ))
+                output = self.collective_rpc("execute_model", args=(scheduler_output,))
+                # Make a copy because output[0] may be reused
+                # by the next batch.
+                return copy.deepcopy(output[0])
+
+            # Use the thread pool instead of creating a new thread
+            return self.thread_pool.submit(_execute)
+
+        def sample_tokens(
+            self, grammar_output, non_block=False
+        ) -> Future[ModelRunnerOutput]:
+            """Make sample_tokens non-blocking."""
+
+            # DummyExecutor used only for testing async case.
+            assert non_block
+
+            def _execute():
+                output = self.collective_rpc("sample_tokens", args=(grammar_output,))
                 # Make a copy because output[0] may be reused
                 # by the next batch.
                 return copy.deepcopy(output[0])
@@ -279,178 +293,306 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             return 2
 
         def shutdown(self):
-            if hasattr(self, 'thread_pool'):
+            if hasattr(self, "thread_pool"):
                 self.thread_pool.shutdown(wait=False)
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            # To test concurrent batches.
-            max_num_seqs=2,
-            # Avoid all requests being scheduled once.
-            enable_prefix_caching=False,
-            max_num_batched_tokens=10,
-            # Reduce startup time.
-            enforce_eager=True,
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        # To test concurrent batches.
+        max_num_seqs=2,
+        # Avoid all requests being scheduled once.
+        enable_prefix_caching=False,
+        max_num_batched_tokens=10,
+        # Reduce startup time.
+        enforce_eager=True,
+        # Test concurrent batch behaviour independently of async scheduling.
+        async_scheduling=False,
+    )
+    vllm_config = engine_args.create_engine_config()
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
         )
-        vllm_config = engine_args.create_engine_config()
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(vllm_config=vllm_config,
-                                     log_stats=False,
-                                     executor_class=DummyExecutor)
-        assert engine_core.batch_queue is not None
-
-        # Add two requests in a row. Each request have 12 prompt tokens.
-        req0 = make_request_with_max_tokens("0", 5)
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        req1 = make_request_with_max_tokens("1", 5)
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-
-        # Schedule Batch 1: (10, req0)
-        assert engine_core.step_with_batch_queue()[0] is None
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["0"] == 10
-        # num_computed_tokens should have been updated immediately.
-        assert engine_core.scheduler.requests[
-            req0.request_id].num_computed_tokens == 10
-
-        # Schedule Batch 2: (2, req0), (8, req1)
-        assert engine_core.step_with_batch_queue()[0] == {}
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["0"] == 2
-        assert scheduler_output.num_scheduled_tokens["1"] == 8
-        # num_computed_tokens should have been updated immediately.
-        assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
-        assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
-
-        assert engine_core.scheduler.get_num_unfinished_requests() == 2
-
-        # Finish Batch 1 and schedule Batch 3: (4, req1).
-        # Note that req0 cannot be scheduled
-        # because it is in the decoding stage now.
-        engine_core.step_with_batch_queue()
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["1"] == 4
-
-        # Finish Batch 2. Get first token of req0.
-        # Schedule Batch 4: (1, req0).
-        output = engine_core.step_with_batch_queue()[0].get(0)
-        assert output is not None
-        assert len(output.outputs) == 1
-        assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["0"] == 1
-
-        # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
-        output = engine_core.step_with_batch_queue()[0].get(0)
+    assert engine_core.batch_queue is not None
+
+    # Add two requests in a row. Each request have 12 prompt tokens.
+    req0 = make_request_with_max_tokens("0", 5)
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    req1 = make_request_with_max_tokens("1", 5)
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+
+    # Schedule Batch 1: (10, req0)
+    assert engine_core.step_with_batch_queue()[0] is None
+    assert len(engine_core.batch_queue) == 1
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 10
+    # num_computed_tokens should have been updated immediately.
+    assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
+
+    # Schedule Batch 2: (2, req0), (8, req1)
+    assert engine_core.step_with_batch_queue()[0] == {}
+    assert len(engine_core.batch_queue) == 1
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 2
+    assert scheduler_output.num_scheduled_tokens["1"] == 8
+    # num_computed_tokens should have been updated immediately.
+    assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
+    assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
+
+    assert engine_core.scheduler.get_num_unfinished_requests() == 2
+
+    # Finish Batch 1 and schedule Batch 3: (4, req1).
+    # Note that req0 cannot be scheduled
+    # because it is in the decoding stage now.
+    engine_core.step_with_batch_queue()
+    assert len(engine_core.batch_queue) == 1
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["1"] == 4
+
+    # Finish Batch 2. Get first token of req0.
+    # Schedule Batch 4: (1, req0).
+    output = engine_core.step_with_batch_queue()[0].get(0)
+    assert output is not None
+    assert len(output.outputs) == 1
+    assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 1
+
+    # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
+    output = engine_core.step_with_batch_queue()[0].get(0)
+    assert output is not None
+    assert len(output.outputs) == 1
+    assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["1"] == 1
+
+    # Loop until req0 is finished.
+    req_id = 0
+    expected_num_tokens = [
+        engine_core.scheduler.requests["0"].num_tokens + 1,
+        engine_core.scheduler.requests["1"].num_tokens + 1,
+    ]
+    while engine_core.scheduler.get_num_unfinished_requests() == 2:
+        output = engine_core.step_with_batch_queue()[0]
+        # Every step consumes an output.
         assert output is not None
-        assert len(output.outputs) == 1
-        assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["1"] == 1
-
-        # Loop until req0 is finished.
-        req_id = 0
-        expected_num_tokens = [
-            engine_core.scheduler.requests["0"].num_tokens + 1,
-            engine_core.scheduler.requests["1"].num_tokens + 1,
-        ]
-        while engine_core.scheduler.get_num_unfinished_requests() == 2:
-            output = engine_core.step_with_batch_queue()[0]
-            # Every step consumes an output.
-            assert output is not None
-            assert len(output[0].outputs) == 1
-            if req_id in engine_core.scheduler.requests:
-                assert engine_core.scheduler.requests[
-                    req_id].num_tokens == expected_num_tokens[req_id]
-            expected_num_tokens[req_id] += 1
-            req_id = (req_id + 1) % 2
+        assert len(output[0].outputs) == 1
+        if req_id in engine_core.scheduler.requests:
+            assert (
+                engine_core.scheduler.requests[req_id].num_tokens
+                == expected_num_tokens[req_id]
+            )
+        expected_num_tokens[req_id] += 1
+        req_id = (req_id + 1) % 2
 
 
 @multi_gpu_test(num_gpus=2)
-def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_tp():
     """
     Test engine can initialize worker in tp properly
     """
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            tensor_parallel_size=2,
-            # Reduce startup time.
-            enforce_eager=True,
-        )
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        tensor_parallel_size=2,
+        # Reduce startup time.
+        enforce_eager=True,
+    )
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)
 
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(vllm_config=vllm_config,
-                                     executor_class=executor_class,
-                                     log_stats=True)
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
 
-        def get_worker_cache_config_field(worker, key: str):
-            return getattr(worker.cache_config, key)
+    def get_worker_cache_config_field(worker, key: str):
+        return getattr(worker.cache_config, key)
 
-        num_gpu_blocks = engine_core.collective_rpc(
-            get_worker_cache_config_field, args=("num_gpu_blocks", ))
-        num_cpu_blocks = engine_core.collective_rpc(
-            get_worker_cache_config_field, args=("num_cpu_blocks", ))
-        assert all(x is not None for x in num_gpu_blocks)
-        assert all(x is not None for x in num_cpu_blocks)
+    num_gpu_blocks = engine_core.collective_rpc(
+        get_worker_cache_config_field, args=("num_gpu_blocks",)
+    )
+    num_cpu_blocks = engine_core.collective_rpc(
+        get_worker_cache_config_field, args=("num_cpu_blocks",)
+    )
+    assert all(x is not None for x in num_gpu_blocks)
+    assert all(x is not None for x in num_cpu_blocks)
 
 
 @create_new_process_for_each_test()
-def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_invalid_request_id_type():
     """Test that engine raises TypeError for non-string request_id."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
-
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(vllm_config=vllm_config,
-                                     executor_class=executor_class,
-                                     log_stats=True)
-
-        # Test with UUID object (common mistake)
-        uuid_request = make_request()
-        uuid_request.request_id = uuid.uuid4()  # UUID object instead of string
-
-        with pytest.raises(TypeError,
-                           match="request_id must be a string, got.*UUID"):
-            engine_core.add_request(
-                *engine_core.preprocess_add_request(uuid_request))
-
-        # Test with integer
-        int_request = make_request()
-        int_request.request_id = 12345
-
-        with pytest.raises(TypeError,
-                           match="request_id must be a string, got.*int"):
-            engine_core.add_request(
-                *engine_core.preprocess_add_request(int_request))
-
-        # Test with None
-        none_request = make_request()
-        none_request.request_id = None
-
-        with pytest.raises(TypeError,
-                           match="request_id must be a string, got.*NoneType"):
-            engine_core.add_request(
-                *engine_core.preprocess_add_request(none_request))
-
-        # Verify engine is still functional after errors
-        valid_request = make_request()
-        engine_core.add_request(
-            *engine_core.preprocess_add_request(valid_request))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)
+
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
+
+    # Test with UUID object (common mistake)
+    uuid_request = make_request()
+    uuid_request.request_id = uuid.uuid4()  # UUID object instead of string
+
+    with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
+        engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
+
+    # Test with integer
+    int_request = make_request()
+    int_request.request_id = 12345
+
+    with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
+        engine_core.add_request(*engine_core.preprocess_add_request(int_request))
+
+    # Test with None
+    none_request = make_request()
+    none_request.request_id = None
+
+    with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
+        engine_core.add_request(*engine_core.preprocess_add_request(none_request))
+
+    # Verify engine is still functional after errors
+    valid_request = make_request()
+    engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0
+
+
+@create_new_process_for_each_test()
+@pytest.mark.parametrize(
+    ("ec_role", "gpu_memory_utilization", "enable_prefix_caching"),
+    [
+        ("ec_producer", 0.01, False),
+        # NOTE: ec_producer never allows prefix caching
+        ("ec_consumer", 0.7, True),
+        ("ec_consumer", 0.7, False),
+    ],
+)
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_encoder_instance_zero_kv_cache(
+    ec_role: str,
+    gpu_memory_utilization: float,
+    enable_prefix_caching: bool,
+    use_kv_connector: bool,
+):
+    """EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests
+
+    This test verifies encoder-only instance initializes with 0 KV cache blocks.
+    Under EPD disagg mode, Encoder instances (EC producer role) only execute
+    vision encoder, so they don't need KV cache for text generation.
+    """
+    # Form vllm config
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        disable_hybrid_kv_cache_manager=True,
+    )
+    model_config = ModelConfig(
+        model="llava-hf/llava-1.5-7b-hf",  # Multimodal model
+        enforce_eager=True,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+    )
+    cache_config = CacheConfig(
+        block_size=16,
+        gpu_memory_utilization=gpu_memory_utilization,
+        swap_space=0,
+        cache_dtype="auto",
+        enable_prefix_caching=enable_prefix_caching,
+    )
+    kv_transfer_config = (
+        KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        )
+        if use_kv_connector
+        else None
+    )
+    ec_transfer_config = ECTransferConfig(
+        ec_connector="ECSharedStorageConnector",
+        ec_role=ec_role,
+        ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test_encoder"},
+    )
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=cache_config,
+        scheduler_config=scheduler_config,
+        kv_transfer_config=kv_transfer_config,
+        ec_transfer_config=ec_transfer_config,
+    )
+
+    executor_class = Executor.get_class(vllm_config)
+    print(f"executor_class: {executor_class}")
+
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
+
+    # Check encoder cache manager exists
+    assert engine_core.scheduler.encoder_cache_manager is not None, (
+        "encoder_cache_manager should exist"
+    )
+
+    if ec_role == "ec_producer":
+        # Check 1: num_blocks should be 0
+        # NOTE: num_blocks=1 as BlockPool always needs a null_block.
+        kv_cache_config = engine_core.scheduler.kv_cache_manager.kv_cache_config
+        print(f"kv_cache_config: {kv_cache_config}")
+        assert kv_cache_config.num_blocks == 1, (
+            f"ec_producer should only have 1 KV blocks, "
+            f"got {kv_cache_config.num_blocks}"
+        )
+
+        # Check 2: kv_cache_groups should be empty
+        assert len(kv_cache_config.kv_cache_groups) == 0, (
+            f"ec_producer should have 0 KV cache groups, "
+            f"got {len(kv_cache_config.kv_cache_groups)}"
+        )
+
+        # Check 3: kv_cache_tensors should be empty
+        assert len(kv_cache_config.kv_cache_tensors) == 0, (
+            f"Encoder instance should have 0 KV cache tensors, "
+            f"got {len(kv_cache_config.kv_cache_tensors)}"
+        )
+
+        # Check 4: Verify EC connector is initialized and is producer
+        assert engine_core.scheduler.ec_connector is not None, (
+            "Encoder instance should have EC connector"
+        )
+        assert engine_core.scheduler.ec_connector.is_producer, (
+            "Encoder instance EC connector should be producer"
+        )
+
+        # Check 5: Verify chunked prefill is disabled
+        assert not vllm_config.scheduler_config.enable_chunked_prefill, (
+            "Encoder instance should disable chunked prefill (no KV cache)"
+        )
+
+    elif ec_role == "ec_consumer":
+        # Check 1: num_blocks should be > 1
+        kv_cache_config = engine_core.scheduler.kv_cache_manager.kv_cache_config
+        print(f"kv_cache_config: {kv_cache_config}")
+        assert kv_cache_config.num_blocks > 1, (
+            f"ec_consumer should have >1 KV blocks, got {kv_cache_config.num_blocks}"
+        )
+
+        # Check 2: kv_cache_groups should NOT be empty
+        assert len(kv_cache_config.kv_cache_groups) > 0, (
+            f"ec_consumer should have KV cache groups, "
+            f"got {len(kv_cache_config.kv_cache_groups)}"
+        )
+
+        # Check 3: Verify EC connector is consumer
+        assert engine_core.scheduler.ec_connector is not None, (
+            "Consumer instance should have EC connector"
+        )
+        assert not engine_core.scheduler.ec_connector.is_producer, (
+            "Consumer instance EC connector should be consumer"
+        )
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 10adac9bab5f79dea2f2bed0061af0838e493997..770560a5e549ef3e1e5e1d337cb1f225660fb896 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -8,7 +8,7 @@ import time
 import uuid
 from dataclasses import dataclass
 from threading import Thread
-from typing import Any, Optional, Union
+from typing import Any
 from unittest.mock import MagicMock
 
 import pytest
@@ -17,16 +17,14 @@ from transformers import AutoTokenizer
 
 from tests.utils import multi_gpu_test
 from vllm import SamplingParams
-from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
-                                        ZmqEventPublisher)
+from vllm.distributed.kv_events import BlockStored, KVEventBatch, ZmqEventPublisher
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import set_default_torch_num_threads
+from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core import EngineCore
-from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
-                                        SyncMPClient)
+from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
 from vllm.v1.engine.utils import CoreEngineProcManager
 from vllm.v1.executor.abstract import Executor
 
@@ -34,8 +32,7 @@ from ...distributed.conftest import MockSubscriber
 from ...utils import create_new_process_for_each_test
 
 if not current_platform.is_cuda():
-    pytest.skip(reason="V1 currently only supported on CUDA.",
-                allow_module_level=True)
+    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
@@ -44,8 +41,8 @@ PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
 
 def make_request(
-        params: SamplingParams,
-        prompt_tokens_ids: Optional[list[int]] = None) -> EngineCoreRequest:
+    params: SamplingParams, prompt_tokens_ids: list[int] | None = None
+) -> EngineCoreRequest:
     if not prompt_tokens_ids:
         prompt_tokens_ids = PROMPT_TOKENS
 
@@ -64,7 +61,6 @@ def make_request(
 
 
 def loop_until_done(client: EngineCoreClient, outputs: dict):
-
     while True:
         engine_core_outputs = client.get_output().outputs
 
@@ -82,7 +78,6 @@ def loop_until_done(client: EngineCoreClient, outputs: dict):
 
 
 async def loop_until_done_async(client: EngineCoreClient, outputs: dict):
-
     while True:
         engine_core_outputs = (await client.get_output_async()).outputs
 
@@ -100,7 +95,6 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: dict):
 
 
 async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict):
-
     while True:
         engine_core_outputs = (await client.get_output_async()).outputs
 
@@ -119,10 +113,7 @@ async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict):
 
 
 # Dummy utility function to monkey-patch into engine core.
-def echo(self,
-         msg: str,
-         err_msg: Optional[str] = None,
-         sleep: Optional[float] = None) -> str:
+def echo(self, msg: str, err_msg: str | None = None, sleep: float | None = None) -> str:
     print(f"echo util function called: {msg}, {err_msg}")
     if sleep is not None:
         time.sleep(sleep)
@@ -133,18 +124,15 @@ def echo(self,
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("multiprocessing_mode", [True, False])
-def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
-                            multiprocessing_mode: bool):
-
+def test_engine_core_client(
+    monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
+):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Monkey-patch core engine utility function to test.
         m.setattr(EngineCore, "echo", echo, raising=False)
 
         engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
-        vllm_config = engine_args.create_engine_config(
-            UsageContext.UNKNOWN_CONTEXT)
+        vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
         executor_class = Executor.get_class(vllm_config)
 
         with set_default_torch_num_threads(1):
@@ -172,7 +160,8 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
 
         for req_id in request_ids:
             assert len(outputs[req_id]) == MAX_TOKENS, (
-                f"{outputs[req_id]=}, {MAX_TOKENS=}")
+                f"{outputs[req_id]=}, {MAX_TOKENS=}"
+            )
         """Abort Request Cycle."""
 
         # Note: this code pathway will only work for multiprocessing
@@ -191,10 +180,12 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
         for idx, req_id in enumerate(request_ids):
             if idx % 2 == 0:
                 assert len(outputs[req_id]) < MAX_TOKENS, (
-                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
+                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}"
+                )
             else:
                 assert len(outputs[req_id]) == MAX_TOKENS, (
-                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
+                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}"
+                )
         """Abort after request is finished."""
 
         # Note: this code pathway will only work for multiprocessing
@@ -202,7 +193,7 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
 
         request = requests[0]
         client.add_request(request)
-        time.sleep(10.)
+        time.sleep(10.0)
 
         client.abort_requests([request.request_id])
 
@@ -222,16 +213,14 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
 
 @pytest.mark.asyncio(loop_scope="function")
 async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
-
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Monkey-patch core engine utility function to test.
         m.setattr(EngineCore, "echo", echo, raising=False)
 
         engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
         vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+            usage_context=UsageContext.UNKNOWN_CONTEXT
+        )
         executor_class = Executor.get_class(vllm_config)
 
         with set_default_torch_num_threads(1):
@@ -261,7 +250,8 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
 
             for req_id in request_ids:
                 assert len(outputs[req_id]) == MAX_TOKENS, (
-                    f"{outputs[req_id]=}, {MAX_TOKENS=}")
+                    f"{outputs[req_id]=}, {MAX_TOKENS=}"
+                )
             """Abort Request Cycle."""
 
             # Add requests to the engine.
@@ -277,10 +267,12 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
             for idx, req_id in enumerate(request_ids):
                 if idx % 2 == 0:
                     assert len(outputs[req_id]) < MAX_TOKENS, (
-                        f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
+                        f"{len(outputs[req_id])=}, {MAX_TOKENS=}"
+                    )
                 else:
                     assert len(outputs[req_id]) == MAX_TOKENS, (
-                        f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
+                        f"{len(outputs[req_id])=}, {MAX_TOKENS=}"
+                    )
             """Utility method invocation"""
 
             core_client: AsyncMPClient = client
@@ -296,8 +288,8 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
             # Test that cancelling the utility call doesn't destabilize the
             # engine.
             util_task = asyncio.create_task(
-                core_client.call_utility_async("echo", "testarg2", None,
-                                               0.5))  # sleep for 0.5 sec
+                core_client.call_utility_async("echo", "testarg2", None, 0.5)
+            )  # sleep for 0.5 sec
             await asyncio.sleep(0.05)
             cancelled = util_task.cancel()
             assert cancelled
@@ -305,9 +297,9 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
             # Ensure client is still functional. The engine runs utility
             # methods in a single thread so this request won't be processed
             # until the cancelled sleeping one is complete.
-            result = await asyncio.wait_for(core_client.call_utility_async(
-                "echo", "testarg3"),
-                                            timeout=1.0)
+            result = await asyncio.wait_for(
+                core_client.call_utility_async("echo", "testarg3"), timeout=1.0
+            )
             assert result == "testarg3"
         finally:
             client.shutdown()
@@ -323,7 +315,7 @@ def echo_dc(
     self,
     msg: str,
     return_list: bool = False,
-) -> Union[MyDataclass, list[MyDataclass]]:
+) -> MyDataclass | list[MyDataclass]:
     print(f"echo dc util function called: {msg}")
     val = None if msg is None else MyDataclass(msg)
     # Return dataclass to verify support for returning custom types
@@ -336,7 +328,7 @@ def echo_dc_dict(
     self,
     msg: str,
     return_dict: bool = False,
-) -> Union[MyDataclass, dict[str, MyDataclass]]:
+) -> MyDataclass | dict[str, MyDataclass]:
     print(f"echo dc dict util function called: {msg}")
     val = None if msg is None else MyDataclass(msg)
     # Return dict of dataclasses to verify support for returning dicts
@@ -353,8 +345,7 @@ def echo_dc_nested(
     msg: str,
     structure_type: str = "list_of_dicts",
 ) -> Any:
-    print(f"echo dc nested util function called: {msg}, "
-          f"structure: {structure_type}")
+    print(f"echo dc nested util function called: {msg}, structure: {structure_type}")
     val = None if msg is None else MyDataclass(msg)
 
     if structure_type == "list_of_dicts":  # noqa
@@ -373,11 +364,9 @@ def echo_dc_nested(
 
 @pytest.mark.asyncio(loop_scope="function")
 async def test_engine_core_client_util_method_custom_return(
-        monkeypatch: pytest.MonkeyPatch):
-
+    monkeypatch: pytest.MonkeyPatch,
+):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Must set insecure serialization to allow returning custom types.
         m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
@@ -386,7 +375,8 @@ async def test_engine_core_client_util_method_custom_return(
 
         engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
         vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+            usage_context=UsageContext.UNKNOWN_CONTEXT
+        )
         executor_class = Executor.get_class(vllm_config)
 
         with set_default_torch_num_threads(1):
@@ -402,22 +392,17 @@ async def test_engine_core_client_util_method_custom_return(
             # Test utility method returning custom / non-native data type.
             core_client: AsyncMPClient = client
 
-            result = await core_client.call_utility_async(
-                "echo_dc", "testarg2", False)
-            assert isinstance(result,
-                              MyDataclass) and result.message == "testarg2"
-            result = await core_client.call_utility_async(
-                "echo_dc", "testarg2", True)
+            result = await core_client.call_utility_async("echo_dc", "testarg2", False)
+            assert isinstance(result, MyDataclass) and result.message == "testarg2"
+            result = await core_client.call_utility_async("echo_dc", "testarg2", True)
             assert isinstance(result, list) and all(
-                isinstance(r, MyDataclass) and r.message == "testarg2"
-                for r in result)
+                isinstance(r, MyDataclass) and r.message == "testarg2" for r in result
+            )
 
             # Test returning None and list of Nones
-            result = await core_client.call_utility_async(
-                "echo_dc", None, False)
+            result = await core_client.call_utility_async("echo_dc", None, False)
             assert result is None
-            result = await core_client.call_utility_async(
-                "echo_dc", None, True)
+            result = await core_client.call_utility_async("echo_dc", None, True)
             assert isinstance(result, list) and all(r is None for r in result)
 
         finally:
@@ -426,11 +411,9 @@ async def test_engine_core_client_util_method_custom_return(
 
 @pytest.mark.asyncio(loop_scope="function")
 async def test_engine_core_client_util_method_custom_dict_return(
-        monkeypatch: pytest.MonkeyPatch):
-
+    monkeypatch: pytest.MonkeyPatch,
+):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Must set insecure serialization to allow returning custom types.
         m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
@@ -439,7 +422,8 @@ async def test_engine_core_client_util_method_custom_dict_return(
 
         engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
         vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+            usage_context=UsageContext.UNKNOWN_CONTEXT
+        )
         executor_class = Executor.get_class(vllm_config)
 
         with set_default_torch_num_threads(1):
@@ -457,22 +441,21 @@ async def test_engine_core_client_util_method_custom_dict_return(
 
             # Test single object return
             result = await core_client.call_utility_async(
-                "echo_dc_dict", "testarg3", False)
-            assert isinstance(result,
-                              MyDataclass) and result.message == "testarg3"
+                "echo_dc_dict", "testarg3", False
+            )
+            assert isinstance(result, MyDataclass) and result.message == "testarg3"
 
             # Test dict return with custom value types
             result = await core_client.call_utility_async(
-                "echo_dc_dict", "testarg3", True)
+                "echo_dc_dict", "testarg3", True
+            )
             assert isinstance(result, dict) and len(result) == 3
             for key, val in result.items():
                 assert key in ["key1", "key2", "key3"]
-                assert isinstance(val,
-                                  MyDataclass) and val.message == "testarg3"
+                assert isinstance(val, MyDataclass) and val.message == "testarg3"
 
             # Test returning dict with None values
-            result = await core_client.call_utility_async(
-                "echo_dc_dict", None, True)
+            result = await core_client.call_utility_async("echo_dc_dict", None, True)
             assert isinstance(result, dict) and len(result) == 3
             for key, val in result.items():
                 assert key in ["key1", "key2", "key3"]
@@ -484,11 +467,9 @@ async def test_engine_core_client_util_method_custom_dict_return(
 
 @pytest.mark.asyncio(loop_scope="function")
 async def test_engine_core_client_util_method_nested_structures(
-        monkeypatch: pytest.MonkeyPatch):
-
+    monkeypatch: pytest.MonkeyPatch,
+):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Must set insecure serialization to allow returning custom types.
         m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
@@ -497,7 +478,8 @@ async def test_engine_core_client_util_method_nested_structures(
 
         engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
         vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+            usage_context=UsageContext.UNKNOWN_CONTEXT
+        )
         executor_class = Executor.get_class(vllm_config)
 
         with set_default_torch_num_threads(1):
@@ -514,42 +496,48 @@ async def test_engine_core_client_util_method_nested_structures(
 
             # Test list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}]
             result = await core_client.call_utility_async(
-                "echo_dc_nested", "nested1", "list_of_dicts")
+                "echo_dc_nested", "nested1", "list_of_dicts"
+            )
             assert isinstance(result, list) and len(result) == 2
             for i, item in enumerate(result):
                 assert isinstance(item, dict)
                 if i == 0:
                     assert "a" in item and "b" in item
-                    assert isinstance(
-                        item["a"],
-                        MyDataclass) and item["a"].message == "nested1"
-                    assert isinstance(
-                        item["b"],
-                        MyDataclass) and item["b"].message == "nested1"
+                    assert (
+                        isinstance(item["a"], MyDataclass)
+                        and item["a"].message == "nested1"
+                    )
+                    assert (
+                        isinstance(item["b"], MyDataclass)
+                        and item["b"].message == "nested1"
+                    )
                 else:
                     assert "c" in item and "d" in item
-                    assert isinstance(
-                        item["c"],
-                        MyDataclass) and item["c"].message == "nested1"
-                    assert isinstance(
-                        item["d"],
-                        MyDataclass) and item["d"].message == "nested1"
+                    assert (
+                        isinstance(item["c"], MyDataclass)
+                        and item["c"].message == "nested1"
+                    )
+                    assert (
+                        isinstance(item["d"], MyDataclass)
+                        and item["d"].message == "nested1"
+                    )
 
             # Test dict of lists: {"list1": [val, val], "list2": [val, val]}
             result = await core_client.call_utility_async(
-                "echo_dc_nested", "nested2", "dict_of_lists")
+                "echo_dc_nested", "nested2", "dict_of_lists"
+            )
             assert isinstance(result, dict) and len(result) == 2
             assert "list1" in result and "list2" in result
             for key, lst in result.items():
                 assert isinstance(lst, list) and len(lst) == 2
                 for item in lst:
-                    assert isinstance(
-                        item, MyDataclass) and item.message == "nested2"
+                    assert isinstance(item, MyDataclass) and item.message == "nested2"
 
             # Test deeply nested: {"outer": [{"inner": [val, val]},
             # {"inner": [val]}]}
             result = await core_client.call_utility_async(
-                "echo_dc_nested", "nested3", "deep_nested")
+                "echo_dc_nested", "nested3", "deep_nested"
+            )
             assert isinstance(result, dict) and "outer" in result
             outer_list = result["outer"]
             assert isinstance(outer_list, list) and len(outer_list) == 2
@@ -560,21 +548,22 @@ async def test_engine_core_client_util_method_nested_structures(
             inner_list1 = inner_dict1["inner"]
             assert isinstance(inner_list1, list) and len(inner_list1) == 2
             for item in inner_list1:
-                assert isinstance(item,
-                                  MyDataclass) and item.message == "nested3"
+                assert isinstance(item, MyDataclass) and item.message == "nested3"
 
             # Second dict in outer list should have "inner" with 1 item
             inner_dict2 = outer_list[1]
             assert isinstance(inner_dict2, dict) and "inner" in inner_dict2
             inner_list2 = inner_dict2["inner"]
             assert isinstance(inner_list2, list) and len(inner_list2) == 1
-            assert isinstance(
-                inner_list2[0],
-                MyDataclass) and inner_list2[0].message == "nested3"
+            assert (
+                isinstance(inner_list2[0], MyDataclass)
+                and inner_list2[0].message == "nested3"
+            )
 
             # Test with None values in nested structures
             result = await core_client.call_utility_async(
-                "echo_dc_nested", None, "list_of_dicts")
+                "echo_dc_nested", None, "list_of_dicts"
+            )
             assert isinstance(result, list) and len(result) == 2
             for item in result:
                 assert isinstance(item, dict)
@@ -591,75 +580,71 @@ async def test_engine_core_client_util_method_nested_structures(
     indirect=["publisher_config"],
 )
 def test_kv_cache_events(
-    monkeypatch: pytest.MonkeyPatch,
     multiprocessing_mode: bool,
     publisher_config,
 ):
+    block_size = 16
+    num_blocks = 2
+
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        block_size=block_size,
+    )
+    engine_args.kv_events_config = publisher_config
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        block_size = 16
-        num_blocks = 2
-
-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            enforce_eager=True,
-            enable_prefix_caching=True,
-            block_size=block_size,
+    vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+
+    executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
         )
-        engine_args.kv_events_config = publisher_config
+    endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+    subscriber = MockSubscriber(
+        endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
+    )
 
-        vllm_config = engine_args.create_engine_config(
-            UsageContext.UNKNOWN_CONTEXT)
+    try:
+        custom_tokens = list(range(num_blocks * block_size))
+        sampling_params = SamplingParams(max_tokens=1)
+        request = make_request(sampling_params, custom_tokens)
+        client.add_request(request)
 
-        executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
-            client = EngineCoreClient.make_client(
-                multiprocess_mode=multiprocessing_mode,
-                asyncio_mode=False,
-                vllm_config=vllm_config,
-                executor_class=executor_class,
-                log_stats=False,
-            )
-        endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
-        subscriber = MockSubscriber(endpoint,
-                                    topic=publisher_config.topic,
-                                    decode_type=KVEventBatch)
+        outputs: dict[str, list] = {request.request_id: []}
+        loop_until_done(client, outputs)
 
-        try:
-            custom_tokens = list(range(num_blocks * block_size))
-            sampling_params = SamplingParams(max_tokens=1)
-            request = make_request(sampling_params, custom_tokens)
-            client.add_request(request)
+        result = subscriber.receive_one(timeout=1000)
+        assert result is not None, "No message received"
 
-            outputs: dict[str, list] = {request.request_id: []}
-            loop_until_done(client, outputs)
-
-            result = subscriber.receive_one(timeout=1000)
-            assert result is not None, "No message received"
-
-            seq, received = result
-
-            assert seq == 0, "Sequence number mismatch"
-            assert (len(received.events) == 1
-                    ), "We should have exactly one BlockStored event"
-            event = received.events[0]
-            assert isinstance(
-                event, BlockStored), "We should have a BlockStored event"
-            assert (len(event.block_hashes) == num_blocks
-                    ), "We should have a BlockStored event with 2 block_hashes"
-            assert (event.block_size == block_size
-                    ), "Block size should be the same as the block size"
-            assert (event.parent_block_hash
-                    is None), "Parent block hash should be None"
-            assert event.lora_id is None, "Lora id should be None"
-            assert (len(event.token_ids) == num_blocks * block_size
-                    ), "Token ids should be the same as the custom tokens"
-            assert (event.token_ids == custom_tokens
-                    ), "Token ids should be the same as the custom tokens"
-        finally:
-            client.shutdown()
-            subscriber.close()
+        seq, received = result
+
+        assert seq == 0, "Sequence number mismatch"
+        assert len(received.events) == 1, "We should have exactly one BlockStored event"
+        event = received.events[0]
+        assert isinstance(event, BlockStored), "We should have a BlockStored event"
+        assert len(event.block_hashes) == num_blocks, (
+            "We should have a BlockStored event with 2 block_hashes"
+        )
+        assert event.block_size == block_size, (
+            "Block size should be the same as the block size"
+        )
+        assert event.parent_block_hash is None, "Parent block hash should be None"
+        assert event.lora_id is None, "Lora id should be None"
+        assert len(event.token_ids) == num_blocks * block_size, (
+            "Token ids should be the same as the custom tokens"
+        )
+        assert event.token_ids == custom_tokens, (
+            "Token ids should be the same as the custom tokens"
+        )
+    finally:
+        client.shutdown()
+        subscriber.close()
 
 
 @pytest.mark.asyncio
@@ -670,110 +655,96 @@ def test_kv_cache_events(
 )
 @multi_gpu_test(num_gpus=4)
 async def test_kv_cache_events_dp(
-    monkeypatch: pytest.MonkeyPatch,
     multiprocessing_mode: bool,
     publisher_config,
 ):
+    block_size = 16
+    num_blocks = 2
+    dp_size = 2
+    tp_size = 2
+
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        data_parallel_size=dp_size,
+        tensor_parallel_size=tp_size,
+        block_size=block_size,
+    )
+    engine_args.kv_events_config = publisher_config
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        block_size = 16
-        num_blocks = 2
-        dp_size = 2
-        tp_size = 2
-
-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            enforce_eager=True,
-            enable_prefix_caching=True,
-            data_parallel_size=dp_size,
-            tensor_parallel_size=tp_size,
-            block_size=block_size,
+    vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+
+    executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
         )
-        engine_args.kv_events_config = publisher_config
+    await asyncio.sleep(1)
 
-        vllm_config = engine_args.create_engine_config(
-            UsageContext.UNKNOWN_CONTEXT)
+    # Build endpoints for all DP ranks
+    base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+    endpoints = []
+    for i in range(dp_size):
+        offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
+        endpoints.append(offset_endpoint)
 
-        executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
-            client = EngineCoreClient.make_client(
-                multiprocess_mode=multiprocessing_mode,
-                asyncio_mode=True,
-                vllm_config=vllm_config,
-                executor_class=executor_class,
-                log_stats=False,
-            )
-        await asyncio.sleep(1)
+    subscriber = MockSubscriber(
+        endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
+    )
 
-        # Build endpoints for all DP ranks
-        base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
-        endpoints = []
-        for i in range(dp_size):
-            offset_endpoint = ZmqEventPublisher.offset_endpoint_port(
-                base_endpoint, i)
-            endpoints.append(offset_endpoint)
+    try:
+        custom_tokens = list(range(num_blocks * block_size))
+        sampling_params = SamplingParams(max_tokens=1)
+        all_request_ids = []
 
-        subscriber = MockSubscriber(endpoints,
-                                    topic=publisher_config.topic,
-                                    decode_type=KVEventBatch)
+        # Create and add 25 requests
+        # NOTE: attempts to force routing to both dp groups but can be flaky
+        for i in range(25):
+            await asyncio.sleep(0.01)
+            request = make_request(sampling_params, custom_tokens)
+            await client.add_request_async(request)
+            all_request_ids.append(request.request_id)
 
-        try:
-            custom_tokens = list(range(num_blocks * block_size))
-            sampling_params = SamplingParams(max_tokens=1)
-            all_request_ids = []
+        await asyncio.sleep(0.1)
 
-            # Create and add 25 requests
-            # NOTE: attempts to force routing to both dp groups but can be flaky
-            for i in range(25):
-                await asyncio.sleep(0.01)
-                request = make_request(sampling_params, custom_tokens)
-                await client.add_request_async(request)
-                all_request_ids.append(request.request_id)
-
-            await asyncio.sleep(0.1)
-
-            # Initialize outputs dict for all requests
-            outputs: dict[str, list] = {
-                req_id: []
-                for req_id in all_request_ids
-            }
-
-            print("processing requests...")
-            await asyncio.wait_for(loop_until_fully_done_async(
-                client, outputs),
-                                   timeout=20.0)
-
-            # Receive from subscriber until no more messages
-            print("collecting results...")
-            results = []
-            while True:
-                result = subscriber.receive_one(timeout=1)
-                print(result)
-                if result is None:
-                    break
-                results.append(result)
-
-            # Collect all events and data_parallel_ranks from all results
-            all_dp_ranks = [
-                received.data_parallel_rank for (_, received) in results
-            ]
-            unique_dps = set(all_dp_ranks)
-            assert (
-                len(unique_dps) == 2
-            ), f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
+        # Initialize outputs dict for all requests
+        outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
 
-        finally:
-            client.shutdown()
-            subscriber.close()
+        print("processing requests...")
+        await asyncio.wait_for(
+            loop_until_fully_done_async(client, outputs), timeout=20.0
+        )
+
+        # Receive from subscriber until no more messages
+        print("collecting results...")
+        results = []
+        while True:
+            result = subscriber.receive_one(timeout=1)
+            print(result)
+            if result is None:
+                break
+            results.append(result)
+
+        # Collect all events and data_parallel_ranks from all results
+        all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
+        unique_dps = set(all_dp_ranks)
+        assert len(unique_dps) == 2, (
+            f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
+        )
+
+    finally:
+        client.shutdown()
+        subscriber.close()
 
 
 @pytest.mark.timeout(20)
 def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
-
     with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Monkey-patch to extract core process pid while it's starting.
         core_proc_pid = [None]
         cepm_ctor = CoreEngineProcManager.__init__
@@ -787,7 +758,8 @@ def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
         t = time.time()
         engine_args = EngineArgs(model=MODEL_NAME)
         vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+            usage_context=UsageContext.UNKNOWN_CONTEXT
+        )
         executor_class = Executor.get_class(vllm_config)
         print(f"VllmConfig creation took {time.time() - t:.2f} seconds.")
 
@@ -815,8 +787,7 @@ def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
 
 
 @create_new_process_for_each_test()
-def test_engine_core_proc_instantiation_cuda_empty(
-        monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPatch):
     """
     Test that EngineCoreProc can be instantiated when CUDA_VISIBLE_DEVICES
     is empty. This ensures the engine frontend does not need access to GPUs.
@@ -833,17 +804,13 @@ def test_engine_core_proc_instantiation_cuda_empty(
 
         # Only implement the methods that are actually called during init
         from vllm.v1.kv_cache_interface import FullAttentionSpec
-        mock_spec = FullAttentionSpec(block_size=16,
-                                      num_kv_heads=1,
-                                      head_size=64,
-                                      dtype=torch.float16)
-
-        mock_executor.get_kv_cache_specs.return_value = [{
-            "default": mock_spec
-        }]
-        mock_executor.determine_available_memory.return_value = [
-            1024 * 1024 * 1024
-        ]
+
+        mock_spec = FullAttentionSpec(
+            block_size=16, num_kv_heads=1, head_size=64, dtype=torch.float16
+        )
+
+        mock_executor.get_kv_cache_specs.return_value = [{"default": mock_spec}]
+        mock_executor.determine_available_memory.return_value = [1024 * 1024 * 1024]
         mock_executor.initialize_from_config.return_value = None
         mock_executor.max_concurrent_batches = 1
 
@@ -852,24 +819,26 @@ def test_engine_core_proc_instantiation_cuda_empty(
     mock_executor_class.side_effect = create_mock_executor
 
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("CUDA_VISIBLE_DEVICES", "")  # No CUDA devices
 
         from vllm.v1.engine.utils import EngineZmqAddresses
 
-        def mock_startup_handshake(self, handshake_socket, local_client,
-                                   headless, parallel_config):
-            return EngineZmqAddresses(inputs=["tcp://127.0.0.1:5555"],
-                                      outputs=["tcp://127.0.0.1:5556"],
-                                      coordinator_input=None,
-                                      coordinator_output=None)
+        def mock_startup_handshake(
+            self, handshake_socket, local_client, headless, parallel_config
+        ):
+            return EngineZmqAddresses(
+                inputs=["tcp://127.0.0.1:5555"],
+                outputs=["tcp://127.0.0.1:5556"],
+                coordinator_input=None,
+                coordinator_output=None,
+            )
 
         # Background processes are not important here
         m.setattr(EngineCoreProc, "startup_handshake", mock_startup_handshake)
 
         vllm_config = EngineArgs(
-            model="deepseek-ai/DeepSeek-V2-Lite",
-            trust_remote_code=True).create_engine_config()
+            model="deepseek-ai/DeepSeek-V2-Lite", trust_remote_code=True
+        ).create_engine_config()
         engine_core_proc = EngineCoreProc(
             vllm_config=vllm_config,
             local_client=True,
diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py
index f3d8e13088b039e47efb832f60ef4b2ec780f25b..77e67d54e587e1326044eaaacaf5454bffa0ad5e 100644
--- a/tests/v1/engine/test_fast_incdec_prefix_err.py
+++ b/tests/v1/engine/test_fast_incdec_prefix_err.py
@@ -40,23 +40,139 @@ def test_fast_inc_detok_invalid_utf8_err_case():
 
     detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
 
-    assert detokenizer.__class__.__name__ == "FastIncrementalDetokenizer", \
+    assert detokenizer.__class__.__name__ == "FastIncrementalDetokenizer", (
         "Should use FastIncrementalDetokenizer by default"
+    )
 
     # Process tokens incrementally
     test_tokens = [
-        236840, 107, 138, 236782, 107, 140, 236775, 6265, 1083, 623, 121908,
-        147418, 827, 107, 140, 236775, 6265, 236779, 2084, 1083, 623, 203292,
-        827, 107, 140, 236775, 6265, 236779, 7777, 1083, 623, 121908, 147418,
-        569, 537, 236789, 65880, 569, 537, 236789, 62580, 853, 115693, 210118,
-        35178, 16055, 1270, 759, 215817, 4758, 1925, 1117, 827, 107, 140,
-        236775, 5654, 1083, 623, 110733, 46291, 827, 107, 140, 236775, 5654,
-        236779, 2084, 1083, 623, 136955, 56731, 827, 107, 140, 236775, 5654,
-        236779, 7777, 1083, 623, 194776, 2947, 496, 109811, 1608, 890, 215817,
-        4758, 1925, 1117, 2789, 432, 398, 602, 31118, 569, 124866, 134772, 509,
-        19478, 1640, 33779, 236743, 236770, 236819, 236825, 236771, 432, 398,
-        432, 237167, 827, 107, 140, 236775, 77984, 1083, 623, 2709, 236745,
-        2555, 513, 236789, 602, 31118, 569
+        236840,
+        107,
+        138,
+        236782,
+        107,
+        140,
+        236775,
+        6265,
+        1083,
+        623,
+        121908,
+        147418,
+        827,
+        107,
+        140,
+        236775,
+        6265,
+        236779,
+        2084,
+        1083,
+        623,
+        203292,
+        827,
+        107,
+        140,
+        236775,
+        6265,
+        236779,
+        7777,
+        1083,
+        623,
+        121908,
+        147418,
+        569,
+        537,
+        236789,
+        65880,
+        569,
+        537,
+        236789,
+        62580,
+        853,
+        115693,
+        210118,
+        35178,
+        16055,
+        1270,
+        759,
+        215817,
+        4758,
+        1925,
+        1117,
+        827,
+        107,
+        140,
+        236775,
+        5654,
+        1083,
+        623,
+        110733,
+        46291,
+        827,
+        107,
+        140,
+        236775,
+        5654,
+        236779,
+        2084,
+        1083,
+        623,
+        136955,
+        56731,
+        827,
+        107,
+        140,
+        236775,
+        5654,
+        236779,
+        7777,
+        1083,
+        623,
+        194776,
+        2947,
+        496,
+        109811,
+        1608,
+        890,
+        215817,
+        4758,
+        1925,
+        1117,
+        2789,
+        432,
+        398,
+        602,
+        31118,
+        569,
+        124866,
+        134772,
+        509,
+        19478,
+        1640,
+        33779,
+        236743,
+        236770,
+        236819,
+        236825,
+        236771,
+        432,
+        398,
+        432,
+        237167,
+        827,
+        107,
+        140,
+        236775,
+        77984,
+        1083,
+        623,
+        2709,
+        236745,
+        2555,
+        513,
+        236789,
+        602,
+        31118,
+        569,
     ]
 
     output = ""
@@ -66,9 +182,9 @@ def test_fast_inc_detok_invalid_utf8_err_case():
         finished = i == len(test_tokens) - 1
         output += detokenizer.get_next_output_text(finished, delta=True)
 
-
-# fmt: off
-    assert output == r'''[
+    assert (
+        output
+        == r"""[
   {
     "source": "Résultats",
     "source_type": "CONCEPT",
@@ -76,4 +192,5 @@ def test_fast_inc_detok_invalid_utf8_err_case():
     "target": "Israël",
     "target_type": "ORGANIZATION",
     "target_description": "Pays qui a obtenu à sa frontière libanaise « un niveau de calme inédit depuis les années 1960 »",
-    "relationship": "Obtention d'un niveau de'''
+    "relationship": "Obtention d'un niveau de"""
+    )
diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index 7529c3780ec256f921571047df66560113f53aed..c1d5f8af791774f0763dbc1bcb9b7e89a2b51a14 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import pytest
 
@@ -13,6 +11,8 @@ from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
 
 MODEL = "facebook/opt-125m"
 DTYPE = "half"
@@ -21,12 +21,10 @@ DTYPE = "half"
 def _vllm_model(
     apc: bool,
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     *,
     skip_tokenizer_init: bool = False,
 ):
     """Set up VllmRunner instance."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
     return vllm_runner(
         MODEL,
         dtype=DTYPE,
@@ -43,17 +41,18 @@ def _vllm_model(
     # env var adjustment via monkeypatch
     scope="function",
     # Prefix caching
-    params=[False, True])
-def vllm_model(vllm_runner, request, monkeypatch):
+    params=[False, True],
+)
+def vllm_model(vllm_runner, request):
     """VllmRunner test fixture parameterized by APC True/False."""
-    with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
+    with _vllm_model(request.param, vllm_runner) as vllm_model:
         yield vllm_model
 
 
 @pytest.fixture(scope="function")
-def vllm_model_apc(vllm_runner, monkeypatch):
+def vllm_model_apc(vllm_runner):
     """VllmRunner test fixture with APC."""
-    with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
+    with _vllm_model(True, vllm_runner) as vllm_model:
         yield vllm_model
 
 
@@ -62,21 +61,21 @@ def vllm_model_apc(vllm_runner, monkeypatch):
     # env var adjustment via monkeypatch
     scope="function",
     # Prefix caching
-    params=[False, True])
-def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
+    params=[False, True],
+)
+def vllm_model_skip_tokenizer_init(vllm_runner, request):
     """VllmRunner test fixture with APC."""
     with _vllm_model(
-            request.param,
-            vllm_runner,
-            monkeypatch,
-            skip_tokenizer_init=True,
+        request.param,
+        vllm_runner,
+        skip_tokenizer_init=True,
     ) as vllm_model:
         yield vllm_model
 
 
 def _get_test_sampling_params(
     prompt_list: list[str],
-    seed: Optional[int] = 42,
+    seed: int | None = 42,
     structured_outputs: bool = False,
 ) -> tuple[list[SamplingParams], list[int]]:
     """Generate random sampling params for a batch."""
@@ -97,9 +96,11 @@ def _get_test_sampling_params(
             top_p=0.95,
             n=n,
             seed=seed,
-            structured_outputs=StructuredOutputsParams(
-                regex="[0-9]+") if structured_outputs else None,
-        ) for n in n_list
+            structured_outputs=StructuredOutputsParams(regex="[0-9]+")
+            if structured_outputs
+            else None,
+        )
+        for n in n_list
     ], n_list
 
 
@@ -132,26 +133,23 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
     for out, n in zip(outputs, n_list):
         completion_counts: dict[str, int] = {}
         # Assert correct number of completions
-        assert len(out.outputs) == n, (
-            f"{len(out.outputs)} completions; {n} expected.")
+        assert len(out.outputs) == n, f"{len(out.outputs)} completions; {n} expected."
         for idx in range(n):
             comp = out.outputs[idx]
             # Assert correct completion indices
-            assert comp.index == idx, (f"Index {comp.index}; expected {idx}.")
+            assert comp.index == idx, f"Index {comp.index}; expected {idx}."
             text = comp.text
             completion_counts[text] = completion_counts.get(text, 0) + 1
         # Assert unique completions
         if len(completion_counts) != n:
-            repeats = {
-                txt: num
-                for (txt, num) in completion_counts.items() if num > 1
-            }
+            repeats = {txt: num for (txt, num) in completion_counts.items() if num > 1}
             raise AssertionError(
                 f"{len(completion_counts)} unique completions; expected"
-                f" {n}. Repeats: {repeats}")
+                f" {n}. Repeats: {repeats}"
+            )
 
 
-def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
+def test_engine_metrics(vllm_runner, example_prompts):
     max_tokens = 100
     # Use spec decoding to test num_accepted_tokens_per_pos
     speculative_config = {
@@ -160,15 +158,14 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
         "prompt_lookup_min": 3,
         "num_speculative_tokens": 5,
     }
-    monkeypatch.setenv("VLLM_USE_V1", "1")
+
     with vllm_runner(
-            MODEL,
-            speculative_config=speculative_config,
-            disable_log_stats=False,
+        MODEL,
+        speculative_config=speculative_config,
+        disable_log_stats=False,
     ) as vllm_model:
         llm: LLM = vllm_model.llm
-        sampling_params = SamplingParams(temperature=0.0,
-                                         max_tokens=max_tokens)
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = llm.generate(example_prompts, sampling_params)
 
         n_prompts = len(example_prompts)
@@ -192,15 +189,14 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
         num_requests_running = find_metric("vllm:num_requests_running")
         assert len(num_requests_running) == 1
         assert isinstance(num_requests_running[0], Gauge)
-        assert num_requests_running[0].value == .0
+        assert num_requests_running[0].value == 0.0
 
         generation_tokens = find_metric("vllm:generation_tokens")
         assert len(generation_tokens) == 1
         assert isinstance(generation_tokens[0], Counter)
         assert generation_tokens[0].value == total_tokens
 
-        request_generation_tokens = find_metric(
-            "vllm:request_generation_tokens")
+        request_generation_tokens = find_metric("vllm:request_generation_tokens")
         assert len(request_generation_tokens) == 1
         assert isinstance(request_generation_tokens[0], Histogram)
         assert "+Inf" in request_generation_tokens[0].buckets
@@ -209,16 +205,15 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
         assert request_generation_tokens[0].sum == total_tokens
 
         num_accepted_tokens_per_pos = find_metric(
-            "vllm:spec_decode_num_accepted_tokens_per_pos")
+            "vllm:spec_decode_num_accepted_tokens_per_pos"
+        )
         assert len(num_accepted_tokens_per_pos) == 1
         assert isinstance(num_accepted_tokens_per_pos[0], Vector)
         assert len(num_accepted_tokens_per_pos[0].values) == 5
 
 
 @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
-def test_skip_tokenizer_initialization(model: str,
-                                       monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
+def test_skip_tokenizer_initialization(model: str):
     # This test checks if the flag skip_tokenizer_init skips the initialization
     # of tokenizer and detokenizer. The generated output is expected to contain
     # token ids.
@@ -232,8 +227,9 @@ def test_skip_tokenizer_initialization(model: str,
     with pytest.raises(ValueError, match="cannot pass text prompts when"):
         llm.generate("abc", sampling_params)
 
-    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
-                           sampling_params=sampling_params)
+    outputs = llm.generate(
+        {"prompt_token_ids": [1, 2, 3]}, sampling_params=sampling_params
+    )
     assert len(outputs) > 0
     completions = outputs[0].outputs
     assert len(completions) > 0
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 72c0a9a13e231816cf91ecff380bb7a94afac2e6..8e1198b315bd179aae6b57224a7fbd70f16aef05 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -3,24 +3,31 @@
 
 import math
 import time
-from typing import Optional
 
 import pytest
 
-from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
-                                   NUM_SAMPLE_LOGPROBS_UNDER_TEST,
-                                   STOP_STRINGS,
-                                   DummyOutputProcessorTestVectors,
-                                   MockEngineCore)
+from tests.v1.engine.utils import (
+    NUM_PROMPT_LOGPROBS_UNDER_TEST,
+    NUM_SAMPLE_LOGPROBS_UNDER_TEST,
+    STOP_STRINGS,
+    DummyOutputProcessorTestVectors,
+    MockEngineCore,
+)
 from vllm import PoolingParams
 from vllm.logprobs import PromptLogprobs, SampleLogprobs
+from vllm.lora.request import LoRARequest
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.output_processor import (OutputProcessor,
-                                             RequestOutputCollector)
-from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.engine import (
+    EngineCoreEvent,
+    EngineCoreEventType,
+    EngineCoreOutputs,
+    EngineCoreRequest,
+    FinishReason,
+)
+from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 
 def _ref_convert_id_to_token(
@@ -40,33 +47,39 @@ def _ref_convert_id_to_token(
 
 
 @pytest.mark.parametrize(
-    "request_output_kind",
-    [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
-def test_incremental_detokenization(request_output_kind: RequestOutputKind,
-                                    dummy_test_vectors):
-    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
-                                       log_stats=False)
-    engine_core = MockEngineCore(
-        tokens_list=dummy_test_vectors.generation_tokens)
+    "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
+@pytest.mark.parametrize("stream_interval", [1, 5, 10])
+def test_incremental_detokenization(
+    request_output_kind: RequestOutputKind,
+    stream_interval: int,
+    dummy_test_vectors,
+):
+    output_processor = OutputProcessor(
+        dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
+    )
+    engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
 
     # Make N requests.
     requests = [
-        EngineCoreRequest(request_id=f"request-{idx}",
-                          prompt_token_ids=prompt_tokens,
-                          mm_features=None,
-                          eos_token_id=None,
-                          arrival_time=0,
-                          lora_request=None,
-                          cache_salt=None,
-                          data_parallel_rank=None,
-                          sampling_params=SamplingParams(
-                              skip_special_tokens=False,
-                              spaces_between_special_tokens=False,
-                              output_kind=request_output_kind,
-                              stop=[],
-                              include_stop_str_in_output=False,
-                          ),
-                          pooling_params=None)
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt_token_ids=prompt_tokens,
+            mm_features=None,
+            eos_token_id=None,
+            arrival_time=0,
+            lora_request=None,
+            cache_salt=None,
+            data_parallel_rank=None,
+            sampling_params=SamplingParams(
+                skip_special_tokens=False,
+                spaces_between_special_tokens=False,
+                output_kind=request_output_kind,
+                stop=[],
+                include_stop_str_in_output=False,
+            ),
+            pooling_params=None,
+        )
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
@@ -96,14 +109,23 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
             if request_id not in gen_strings:
                 gen_strings[request_id] = new_text
                 gen_tokens[request_id] = new_tokens
+                if request_output_kind == RequestOutputKind.DELTA:
+                    assert len(new_tokens) == 1, f"{len(new_tokens)=}"
             else:
                 gen_strings[request_id] += new_text
                 gen_tokens[request_id].extend(new_tokens)
+                if (
+                    request_output_kind == RequestOutputKind.DELTA
+                    and not request_output.finished
+                ):
+                    assert len(new_tokens) >= stream_interval, (
+                        f"{len(new_tokens)=}, {stream_interval=}"
+                    )
 
     # Confirmed tracked values matches what we expected.
     for idx, (ref_gen_str, ref_gen_toks) in enumerate(
-            zip(dummy_test_vectors.generation_strings,
-                dummy_test_vectors.generation_tokens)):
+        zip(dummy_test_vectors.generation_strings, dummy_test_vectors.generation_tokens)
+    ):
         gen_str = gen_strings[f"request-{idx}"]
         gen_toks = gen_tokens[f"request-{idx}"]
 
@@ -116,13 +138,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
 
 def _validate_logprobs(
     gen_tokens: dict[str, list[int]],
-    gen_logprobs: dict[str, Optional[SampleLogprobs]],
-    gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]],
+    gen_logprobs: dict[str, SampleLogprobs | None],
+    gen_prompt_logprobs: dict[str, PromptLogprobs | None],
     gen_cumulative_logprob: dict[str, float],
     dtv: DummyOutputProcessorTestVectors,
     request_id_list: list[str],
-    num_sample_logprobs: Optional[int],
-    num_prompt_logprobs: Optional[int],
+    num_sample_logprobs: int | None,
+    num_prompt_logprobs: int | None,
 ) -> None:
     for req_idx, req_id in enumerate(request_id_list):
         new_tokens = gen_tokens[req_id]
@@ -134,9 +156,11 @@ def _validate_logprobs(
         ref_prompt_logprobs = dtv.prompt_logprobs[req_idx]
         if num_sample_logprobs is not None:
             # Validate sample logprobs
-            assert logprobs is not None, (f"Request {req_id} requires sample"
-                                          " logprobs but sample logprobs are"
-                                          " None.")
+            assert logprobs is not None, (
+                f"Request {req_id} requires sample"
+                " logprobs but sample logprobs are"
+                " None."
+            )
             # Require num sampled tokens to match num
             # sampled logprobs - especially important
             # to check since the detokenizer can cause
@@ -147,44 +171,51 @@ def _validate_logprobs(
             assert num_new_tokens == len_sample_logprobs, (
                 f"Request {req_id} has {num_new_tokens}"
                 " completion tokens but has"
-                f" {len_sample_logprobs} sample logprobs.")
+                f" {len_sample_logprobs} sample logprobs."
+            )
             ref_cumulative_logprob = 0.0
-            for idx, (sampled_token,
-                      pos_logprob_dict) in enumerate(zip(new_tokens,
-                                                         logprobs)):
+            for idx, (sampled_token, pos_logprob_dict) in enumerate(
+                zip(new_tokens, logprobs)
+            ):
                 # Break out the reference log probability value &
                 # logprob token id tensors associated with this
                 # position in the completion. Also break out the
                 # sampled token ranks
-                (ref_pos_logprob_toks, ref_pos_logprob_vals,
-                 ref_sampled_token_rank) = ref_logprobs[idx]
+                (ref_pos_logprob_toks, ref_pos_logprob_vals, ref_sampled_token_rank) = (
+                    ref_logprobs[idx]
+                )
                 # For each position in the completion sequence,
                 # ensure the actual sampled token is among the
                 # logprobs
                 assert sampled_token in pos_logprob_dict, (
                     f"Sampled token {sampled_token} not"
-                    f" present in logprob at index {idx}")
+                    f" present in logprob at index {idx}"
+                )
 
                 # Validate number of sample logprobs
                 num_lp_toks = len(pos_logprob_dict)
-                assert (num_lp_toks == num_sample_logprobs
-                        or num_lp_toks == num_sample_logprobs +
-                        1), ("Valid numbers of sample logprobs are"
-                             f" {num_sample_logprobs} or"
-                             f" {num_sample_logprobs+1} but"
-                             f" {num_lp_toks} logprobs found at"
-                             f" position {idx}. Logprobs dict:"
-                             f" {pos_logprob_dict}")
+                assert (
+                    num_lp_toks == num_sample_logprobs
+                    or num_lp_toks == num_sample_logprobs + 1
+                ), (
+                    "Valid numbers of sample logprobs are"
+                    f" {num_sample_logprobs} or"
+                    f" {num_sample_logprobs + 1} but"
+                    f" {num_lp_toks} logprobs found at"
+                    f" position {idx}. Logprobs dict:"
+                    f" {pos_logprob_dict}"
+                )
 
                 # Validate sampled token logprob rank
                 smp_lp = pos_logprob_dict[sampled_token]
                 smp_lp_rank = smp_lp.rank
-                assert (ref_sampled_token_rank == smp_lp_rank), (
+                assert ref_sampled_token_rank == smp_lp_rank, (
                     "Sampled token logprob rank"
                     f" {smp_lp_rank} does not match"
                     " correct value"
                     f" {ref_sampled_token_rank}"
-                    f" in Logprob {smp_lp}")
+                    f" in Logprob {smp_lp}"
+                )
 
                 # Validate that the logprob processor yields
                 # the correct log probabilities and valid
@@ -198,7 +229,8 @@ def _validate_logprobs(
                     ref_tok_id = ref_pos_logprob_toks[jdx]
                     assert ref_tok_id in pos_logprob_dict, (
                         f"Expected token {ref_tok_id} to be"
-                        f" in logprob dict but it is not.")
+                        f" in logprob dict but it is not."
+                    )
 
                     # Extract actually-generated logprob
                     # info
@@ -208,40 +240,43 @@ def _validate_logprobs(
 
                     # A "top" (rank 1) logprob must be
                     # present
-                    rank_one_appears = (True
-                                        if lp_rank == 1 else rank_one_appears)
+                    rank_one_appears = True if lp_rank == 1 else rank_one_appears
 
                     # Rank must be >= 1
-                    assert lp_rank >= 1, (f"Logprob {lp} has invalid"
-                                          f" rank {lp_rank} < 1."
-                                          f" Logprob dict: {pos_logprob_dict}")
+                    assert lp_rank >= 1, (
+                        f"Logprob {lp} has invalid"
+                        f" rank {lp_rank} < 1."
+                        f" Logprob dict: {pos_logprob_dict}"
+                    )
 
                     # Validate log probability
                     assert math.isclose(lp_val, ref_lp_val), (
                         f"Token id {ref_tok_id} appears in logprobs dict"
                         f" at position {idx} in completion with log"
                         f" probability {lp_val} but {ref_lp_val} was"
-                        f" expected. Logprob: {lp}")
+                        f" expected. Logprob: {lp}"
+                    )
 
-                assert rank_one_appears, (f"No Logprob has rank 1"
-                                          " in the following Logprob"
-                                          f" dict: {pos_logprob_dict}")
+                assert rank_one_appears, (
+                    f"No Logprob has rank 1"
+                    " in the following Logprob"
+                    f" dict: {pos_logprob_dict}"
+                )
 
                 # Validate logprobs detokenization
                 for lp_tok in pos_logprob_dict:
                     # Confirm that sample logprob decoded token matches
                     # the logprob token id at this sequence position
                     decoded_token = pos_logprob_dict[lp_tok].decoded_token
-                    ref_decoded_token = _ref_convert_id_to_token(
-                        dtv.tokenizer, lp_tok)
+                    ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok)
                     assert decoded_token == ref_decoded_token, (
                         f"Sampled logprob token id {lp_tok} decodes to"
                         f" {ref_decoded_token} but Logprob decoded"
                         f" token is {decoded_token} instead"
-                        f" (at position {idx})")
+                        f" (at position {idx})"
+                    )
 
-                ref_cumulative_logprob += pos_logprob_dict[
-                    sampled_token].logprob
+                ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob
             # Assert that cumulative logprobs are correct
             assert math.isclose(cumulative_logprob, ref_cumulative_logprob)
         else:
@@ -254,7 +289,8 @@ def _validate_logprobs(
             assert prompt_logprobs is not None, (
                 f"Request {req_id} requires prompt"
                 " logprobs but prompt logprobs are"
-                " None.")
+                " None."
+            )
             # Require num prompt tokens to match num
             # prompt logprobs
             num_prompt_tokens = len(prompt_token_ids)
@@ -262,56 +298,70 @@ def _validate_logprobs(
             assert num_prompt_tokens == len_prompt_logprobs, (
                 f"Request {req_id} has {num_prompt_tokens}"
                 " prompt tokens but has"
-                f" {len_prompt_logprobs} prompt logprobs.")
+                f" {len_prompt_logprobs} prompt logprobs."
+            )
             # First prompt logprob is None
             first_plp_dict = prompt_logprobs[0]
             assert first_plp_dict is None, (
                 f"Request {req_id} first prompt logprob"
                 f" should be None but has following value"
-                f" instead: {first_plp_dict}")
+                f" instead: {first_plp_dict}"
+            )
             # Break out the reference prompt log prob value &
             # logprob token id matrices for the whole prompt.
             # Also break out the prompt token rank vector
-            (ref_prompt_logprob_toks, ref_prompt_logprob_vals,
-             ref_prompt_token_ranks) = ref_prompt_logprobs
+            (
+                ref_prompt_logprob_toks,
+                ref_prompt_logprob_vals,
+                ref_prompt_token_ranks,
+            ) = ref_prompt_logprobs
             for idx, (prompt_token, pos_logprob_dict) in enumerate(
-                    zip(prompt_token_ids[1:], prompt_logprobs[1:])):
-
+                zip(prompt_token_ids[1:], prompt_logprobs[1:])
+            ):
                 # Break out the reference prompt log prob value
                 # vector, prompt logprob token id vector, and
                 # prompt token rank at the current position.
-                (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals,
-                 ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :],
-                                               ref_prompt_logprob_vals[idx, :],
-                                               ref_prompt_token_ranks[idx])
+                (
+                    ref_pos_prompt_logprob_toks,
+                    ref_pos_prompt_logprob_vals,
+                    ref_pos_prompt_token_rank,
+                ) = (
+                    ref_prompt_logprob_toks[idx, :],
+                    ref_prompt_logprob_vals[idx, :],
+                    ref_prompt_token_ranks[idx],
+                )
 
                 # For each position in the prompt sequence,
                 # ensure the actual prompt token is among the
                 # logprobs
                 assert prompt_token in pos_logprob_dict, (
-                    f"Prompt token {prompt_token} not"
-                    f" present in logprob at index {idx}")
+                    f"Prompt token {prompt_token} not present in logprob at index {idx}"
+                )
                 # Validate number of prompt logprobs
                 num_plp_toks = len(pos_logprob_dict)
-                assert (num_plp_toks == num_prompt_logprobs
-                        or num_plp_toks == num_prompt_logprobs +
-                        1), ("Valid numbers of prompt logprobs are"
-                             f" {num_prompt_logprobs} or"
-                             f" {num_prompt_logprobs+1} but"
-                             f" {num_plp_toks} logprobs found at"
-                             f" position {idx}. Logprobs dict:"
-                             f" {pos_logprob_dict}")
+                assert (
+                    num_plp_toks == num_prompt_logprobs
+                    or num_plp_toks == num_prompt_logprobs + 1
+                ), (
+                    "Valid numbers of prompt logprobs are"
+                    f" {num_prompt_logprobs} or"
+                    f" {num_prompt_logprobs + 1} but"
+                    f" {num_plp_toks} logprobs found at"
+                    f" position {idx}. Logprobs dict:"
+                    f" {pos_logprob_dict}"
+                )
 
                 # Validate prompt token logprob rank
                 prmpt_tok_lp = pos_logprob_dict[prompt_token]
                 prmpt_tok_lp_rank = prmpt_tok_lp.rank
                 ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank
-                assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), (
+                assert ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank, (
                     "Prompt token logprob rank"
                     f" {prmpt_tok_lp_rank} does not match"
                     " correct value"
                     f" {ref_prmpt_tok_lp_rank}"
-                    f" in Logprob {prmpt_tok_lp}")
+                    f" in Logprob {prmpt_tok_lp}"
+                )
 
                 # Validate that the logprob processor yields
                 # the correct prompt log probs and valid
@@ -325,7 +375,8 @@ def _validate_logprobs(
                     ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx])
                     assert ref_tok_id in pos_logprob_dict, (
                         f"Expected token {ref_tok_id} to be"
-                        f" in logprob dict but it is not.")
+                        f" in logprob dict but it is not."
+                    )
 
                     # Extract actually-generated logprob
                     # info
@@ -335,87 +386,93 @@ def _validate_logprobs(
 
                     # A "top" (rank 1) logprob must be
                     # present
-                    rank_one_appears = (True
-                                        if plp_rank == 1 else rank_one_appears)
+                    rank_one_appears = True if plp_rank == 1 else rank_one_appears
 
                     # Rank must be >= 1
                     assert plp_rank >= 1, (
                         f"Logprob {plp} has invalid"
                         f" rank {plp_rank} < 1."
-                        f" Logprob dict: {pos_logprob_dict}")
+                        f" Logprob dict: {pos_logprob_dict}"
+                    )
 
                     # Validate log probability
                     assert math.isclose(plp_val, ref_plp_val), (
                         f"Token id {ref_tok_id} appears in logprobs dict"
                         f" at position {idx} in completion with log"
                         f" probability {plp_val} but {ref_plp_val} was"
-                        f" expected. Logprob: {plp}")
+                        f" expected. Logprob: {plp}"
+                    )
 
-                assert rank_one_appears, (f"No Logprob has rank 1"
-                                          " in the following Logprob"
-                                          f" dict: {pos_logprob_dict}")
+                assert rank_one_appears, (
+                    f"No Logprob has rank 1"
+                    " in the following Logprob"
+                    f" dict: {pos_logprob_dict}"
+                )
 
                 # Validate prompt logprob detokenization
                 for plp_tok in pos_logprob_dict:
                     # Confirm that prompt logprob decoded token matches
                     # the logprob token id at this sequence position
                     decoded_token = pos_logprob_dict[plp_tok].decoded_token
-                    ref_decoded_token = _ref_convert_id_to_token(
-                        dtv.tokenizer, plp_tok)
+                    ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok)
                     assert decoded_token == ref_decoded_token, (
                         f"Prompt logprob token id {plp_tok} decodes to"
                         f" {ref_decoded_token} but Logprob decoded"
                         f" token is {decoded_token} instead"
-                        f" (at position {idx})")
+                        f" (at position {idx})"
+                    )
         else:
             # Prompt logprobs disabled for this request
             assert prompt_logprobs is None
 
 
 @pytest.mark.parametrize(
-    "request_output_kind",
-    [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
-@pytest.mark.parametrize("num_sample_logprobs",
-                         [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
-@pytest.mark.parametrize("num_prompt_logprobs",
-                         [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
-def test_logprobs_processor(request_output_kind: RequestOutputKind,
-                            num_sample_logprobs: Optional[int],
-                            num_prompt_logprobs: Optional[int],
-                            dummy_test_vectors):
-    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
-                                       log_stats=False)
+    "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
+@pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
+@pytest.mark.parametrize("num_prompt_logprobs", [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
+def test_logprobs_processor(
+    request_output_kind: RequestOutputKind,
+    num_sample_logprobs: int | None,
+    num_prompt_logprobs: int | None,
+    dummy_test_vectors,
+):
+    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
     engine_core = MockEngineCore(
         tokens_list=dummy_test_vectors.generation_tokens,
-        generated_logprobs_raw=None if num_sample_logprobs is None else
-        dummy_test_vectors.generation_logprobs,
+        generated_logprobs_raw=None
+        if num_sample_logprobs is None
+        else dummy_test_vectors.generation_logprobs,
         prompt_logprobs_raw=None
-        if num_prompt_logprobs is None else dummy_test_vectors.prompt_logprobs)
+        if num_prompt_logprobs is None
+        else dummy_test_vectors.prompt_logprobs,
+    )
 
     # Make N requests.
     request_id_list = [
-        f"request-{idx}"
-        for idx in range(len(dummy_test_vectors.prompt_strings))
+        f"request-{idx}" for idx in range(len(dummy_test_vectors.prompt_strings))
     ]
     requests = [
-        EngineCoreRequest(request_id=request_id_list[idx],
-                          prompt_token_ids=prompt_tokens,
-                          mm_features=None,
-                          eos_token_id=None,
-                          arrival_time=0,
-                          lora_request=None,
-                          cache_salt=None,
-                          data_parallel_rank=None,
-                          sampling_params=SamplingParams(
-                              skip_special_tokens=False,
-                              spaces_between_special_tokens=False,
-                              output_kind=request_output_kind,
-                              stop=[],
-                              include_stop_str_in_output=False,
-                              logprobs=num_sample_logprobs,
-                              prompt_logprobs=num_prompt_logprobs,
-                          ),
-                          pooling_params=None)
+        EngineCoreRequest(
+            request_id=request_id_list[idx],
+            prompt_token_ids=prompt_tokens,
+            mm_features=None,
+            eos_token_id=None,
+            arrival_time=0,
+            lora_request=None,
+            cache_salt=None,
+            data_parallel_rank=None,
+            sampling_params=SamplingParams(
+                skip_special_tokens=False,
+                spaces_between_special_tokens=False,
+                output_kind=request_output_kind,
+                stop=[],
+                include_stop_str_in_output=False,
+                logprobs=num_sample_logprobs,
+                prompt_logprobs=num_prompt_logprobs,
+            ),
+            pooling_params=None,
+        )
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
@@ -446,7 +503,8 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
             prompt_logprobs = request_output.prompt_logprobs
             logprobs = request_output.outputs[0].logprobs
             gen_cumulative_logprobs[request_id] = request_output.outputs[
-                0].cumulative_logprob
+                0
+            ].cumulative_logprob
             if request_id not in gen_logprobs:
                 # Start tracking sample and prompt logprobs for this request
                 gen_tokens[request_id] = new_tokens
@@ -463,10 +521,16 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
                     plp.extend(prompt_logprobs)
 
     # Confirmed tracked logprobs match what we expect
-    _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs,
-                       gen_cumulative_logprobs, dummy_test_vectors,
-                       request_id_list, num_sample_logprobs,
-                       num_prompt_logprobs)
+    _validate_logprobs(
+        gen_tokens,
+        gen_logprobs,
+        gen_prompt_logprobs,
+        gen_cumulative_logprobs,
+        dummy_test_vectors,
+        request_id_list,
+        num_sample_logprobs,
+        num_prompt_logprobs,
+    )
 
     assert output_processor.get_num_unfinished_requests() == 0
     assert not output_processor.has_unfinished_requests()
@@ -474,15 +538,23 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
 
 @pytest.mark.parametrize(
     "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs",
-    [(False, "stop_token_ids", False, None),
-     (True, "stop_token_ids", False, None),
-     (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
-     (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
-     (False, "eos_token_id", False, None), (True, "eos_token_id", False, None),
-     (False, "eos_token_id", True, None)])
-def test_stop_token(include_stop_str_in_output: bool,
-                    num_sample_logprobs: Optional[int], stop_token_type: str,
-                    ignore_eos: bool, dummy_test_vectors):
+    [
+        (False, "stop_token_ids", False, None),
+        (True, "stop_token_ids", False, None),
+        (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
+        (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
+        (False, "eos_token_id", False, None),
+        (True, "eos_token_id", False, None),
+        (False, "eos_token_id", True, None),
+    ],
+)
+def test_stop_token(
+    include_stop_str_in_output: bool,
+    num_sample_logprobs: int | None,
+    stop_token_type: str,
+    ignore_eos: bool,
+    dummy_test_vectors,
+):
     """Test output processor EOS/stop token handling.
 
     Send mock engine core request to mock engine core and pass core outputs
@@ -523,9 +595,10 @@ def test_stop_token(include_stop_str_in_output: bool,
         dummy_test_vectors: dummy engine core outputs and other data structures
     """
     model_id = dummy_test_vectors.tokenizer.name_or_path
-    if model_id != 'meta-llama/Llama-3.2-1B':
-        raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
-                             f"{model_id} is in use.")
+    if model_id != "meta-llama/Llama-3.2-1B":
+        raise AssertionError(
+            f"Test requires meta-llama/Llama-3.2-1B but {model_id} is in use."
+        )
     do_logprobs = num_sample_logprobs is not None
     # EOS under test; if False, stop_token_ids under test
     is_eos_test = stop_token_type == "eos_token_id"
@@ -536,18 +609,16 @@ def test_stop_token(include_stop_str_in_output: bool,
     )  # '<|end_of_text|>'
     stop_token_ids = [128009] if not is_eos_test else None  # '<|eot_id|>'
 
-    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
-                                       log_stats=False)
+    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
     # Dummy engine core outputs, with control tokens suffixed to test stops
-    suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids)
+    suffix_token = [eos_token_id] if is_eos_test else stop_token_ids
     assert suffix_token is not None and isinstance(suffix_token[0], int)
     generation_string = dummy_test_vectors.generation_strings[0]
-    generation_tokens = (dummy_test_vectors.generation_tokens[0] +
-                         2 * suffix_token)
+    generation_tokens = dummy_test_vectors.generation_tokens[0] + 2 * suffix_token
     if do_logprobs:
-        generation_logprobs = (
-            dummy_test_vectors.generation_logprobs[0] +
-            2 * [dummy_test_vectors.generation_logprobs[0][-1]])
+        generation_logprobs = dummy_test_vectors.generation_logprobs[0] + 2 * [
+            dummy_test_vectors.generation_logprobs[0][-1]
+        ]
     prompt_string = dummy_test_vectors.prompt_strings[0]
     prompt_tokens = dummy_test_vectors.prompt_tokens[0]
     engine_core = MockEngineCore(
@@ -556,7 +627,8 @@ def test_stop_token(include_stop_str_in_output: bool,
         prompt_logprobs_raw=None,
         eos_token_id=eos_token_id,
         stop_token_ids=stop_token_ids,
-        ignore_eos=ignore_eos)
+        ignore_eos=ignore_eos,
+    )
 
     # Make request.
     request_id = "request-0"
@@ -580,7 +652,8 @@ def test_stop_token(include_stop_str_in_output: bool,
             prompt_logprobs=None,
             ignore_eos=ignore_eos,
         ),
-        pooling_params=None)
+        pooling_params=None,
+    )
 
     # Add request to the detokenizer.
     output_processor.add_request(request, prompt_string)
@@ -605,7 +678,7 @@ def test_stop_token(include_stop_str_in_output: bool,
         # Update tracking.
         request_output = request_outputs[0]
         if request_output.finished:
-            finish_reason = ("length" if is_eos_ignore_test else "stop")
+            finish_reason = "length" if is_eos_ignore_test else "stop"
             assert request_output.outputs[0].finish_reason == finish_reason
 
         gen_string += request_output.outputs[0].text
@@ -614,7 +687,7 @@ def test_stop_token(include_stop_str_in_output: bool,
             gen_logprobs.extend(request_output.outputs[0].logprobs)
 
     # Validate generated text
-    control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>'
+    control_token = "<|end_of_text|>" if is_eos_test else "<|eot_id|>"
     if is_eos_ignore_test:
         # Length-based stop; expect full string
         ref_str = generation_string + 2 * control_token
@@ -624,14 +697,15 @@ def test_stop_token(include_stop_str_in_output: bool,
     else:
         # Stop token triggered but not in output
         ref_str = generation_string
-    assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}")
+    assert gen_string == ref_str, f"{gen_string=}, {ref_str=}"
 
     if do_logprobs:
         # Validate number of sample logprobs
         num_tokens = len(gen_tokens)
         num_logprobs = len(gen_logprobs)
         assert num_tokens == num_logprobs, (
-            f"Token count ({num_tokens}) != logprobs count ({num_logprobs})")
+            f"Token count ({num_tokens}) != logprobs count ({num_logprobs})"
+        )
 
     # Check requests are finished
     assert output_processor.get_num_unfinished_requests() == 0
@@ -639,22 +713,24 @@ def test_stop_token(include_stop_str_in_output: bool,
 
 
 @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
-@pytest.mark.parametrize("num_sample_logprobs",
-                         [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
-def test_stop_string(include_stop_str_in_output: bool,
-                     num_sample_logprobs: Optional[int], dummy_test_vectors):
-    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
-                                       log_stats=False)
+@pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
+def test_stop_string(
+    include_stop_str_in_output: bool,
+    num_sample_logprobs: int | None,
+    dummy_test_vectors,
+):
+    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
     engine_core = MockEngineCore(
         tokens_list=dummy_test_vectors.generation_tokens,
         generated_logprobs_raw=dummy_test_vectors.generation_logprobs
-        if num_sample_logprobs else None,
-        prompt_logprobs_raw=None)
+        if num_sample_logprobs
+        else None,
+        prompt_logprobs_raw=None,
+    )
 
     # Make N requests.
     request_id_list = [
-        f"request-{idx}"
-        for idx in range(len(dummy_test_vectors.prompt_strings))
+        f"request-{idx}" for idx in range(len(dummy_test_vectors.prompt_strings))
     ]
     requests = [
         EngineCoreRequest(
@@ -675,7 +751,8 @@ def test_stop_string(include_stop_str_in_output: bool,
                 logprobs=num_sample_logprobs,
                 prompt_logprobs=None,
             ),
-            pooling_params=None)
+            pooling_params=None,
+        )
         for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
@@ -715,7 +792,8 @@ def test_stop_string(include_stop_str_in_output: bool,
             prompt_logprobs = request_output.prompt_logprobs
             logprobs = request_output.outputs[0].logprobs
             gen_cumulative_logprobs[request_id] = request_output.outputs[
-                0].cumulative_logprob
+                0
+            ].cumulative_logprob
             if request_id not in gen_strings:
                 gen_strings[request_id] = new_text
                 gen_tokens[request_id] = new_tokens
@@ -733,8 +811,8 @@ def test_stop_string(include_stop_str_in_output: bool,
 
     # Confirmed tracked values matches what we expected.
     for idx, (ref_gen_str, stop_str) in enumerate(
-            zip(dummy_test_vectors.generation_strings, STOP_STRINGS)):
-
+        zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
+    ):
         # Request should be aborted.
         request_id = f"request-{idx}"
         assert request_id in aborted
@@ -748,24 +826,28 @@ def test_stop_string(include_stop_str_in_output: bool,
         ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str
 
         if include_stop_str_in_output:
-            assert gen_str == ref_str_inc_stop, (
-                f"{gen_str=}, {ref_str_inc_stop=}")
+            assert gen_str == ref_str_inc_stop, f"{gen_str=}, {ref_str_inc_stop=}"
         else:
-            assert gen_str == ref_str_exc_stop, (
-                f"{gen_str=}, {ref_str_exc_stop=}")
+            assert gen_str == ref_str_exc_stop, f"{gen_str=}, {ref_str_exc_stop=}"
 
     # Confirmed tracked logprobs match what we expect
-    _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs,
-                       gen_cumulative_logprobs, dummy_test_vectors,
-                       request_id_list, num_sample_logprobs, None)
+    _validate_logprobs(
+        gen_tokens,
+        gen_logprobs,
+        gen_prompt_logprobs,
+        gen_cumulative_logprobs,
+        dummy_test_vectors,
+        request_id_list,
+        num_sample_logprobs,
+        None,
+    )
 
     assert output_processor.get_num_unfinished_requests() == 0
     assert not output_processor.has_unfinished_requests()
 
 
 def test_iteration_stats(dummy_test_vectors):
-    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
-                                       log_stats=True)
+    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
     engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
     engine_core_timestamp = time.monotonic()
 
@@ -782,7 +864,8 @@ def test_iteration_stats(dummy_test_vectors):
             data_parallel_rank=None,
             sampling_params=SamplingParams(),
             pooling_params=None,
-        ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
+        )
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
     # Add all requests except one to the OutputProcessor.
@@ -794,12 +877,13 @@ def test_iteration_stats(dummy_test_vectors):
     # First iteration has 2 prefills.
     outputs = engine_core.get_outputs()[:num_active]
     iteration_stats = IterationStats()
-    output_processor.process_outputs(outputs, engine_core_timestamp,
-                                     iteration_stats)
-    total_prompt_tokens = sum([
-        len(prompt_tokens)
-        for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active]
-    ])
+    output_processor.process_outputs(outputs, engine_core_timestamp, iteration_stats)
+    total_prompt_tokens = sum(
+        [
+            len(prompt_tokens)
+            for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active]
+        ]
+    )
 
     assert iteration_stats.num_prompt_tokens == total_prompt_tokens
     assert iteration_stats.num_generation_tokens == num_active
@@ -807,8 +891,7 @@ def test_iteration_stats(dummy_test_vectors):
     # Just decodes in this step.
     outputs = engine_core.get_outputs()[:num_active]
     iteration_stats = IterationStats()
-    output_processor.process_outputs(outputs, engine_core_timestamp,
-                                     iteration_stats)
+    output_processor.process_outputs(outputs, engine_core_timestamp, iteration_stats)
 
     assert iteration_stats.num_prompt_tokens == 0
     assert iteration_stats.num_generation_tokens == num_active
@@ -818,8 +901,7 @@ def test_iteration_stats(dummy_test_vectors):
     num_active += 1
     outputs = engine_core.get_outputs()[:num_active]
     iteration_stats = IterationStats()
-    output_processor.process_outputs(outputs, engine_core_timestamp,
-                                     iteration_stats)
+    output_processor.process_outputs(outputs, engine_core_timestamp, iteration_stats)
     total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1])
 
     assert iteration_stats.num_prompt_tokens == total_prompt_tokens
@@ -828,13 +910,176 @@ def test_iteration_stats(dummy_test_vectors):
     # Just decodes in this step.
     outputs = engine_core.get_outputs()[:num_active]
     iteration_stats = IterationStats()
-    output_processor.process_outputs(outputs, engine_core_timestamp,
-                                     iteration_stats)
+    output_processor.process_outputs(outputs, engine_core_timestamp, iteration_stats)
 
     assert iteration_stats.num_prompt_tokens == 0
     assert iteration_stats.num_generation_tokens == num_active
 
 
+@pytest.mark.parametrize("log_stats", [True, False])
+def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
+    """Test LoRA request lifecycle tracking through waiting -> running -> finished."""
+    output_processor = OutputProcessor(
+        dummy_test_vectors.tokenizer, log_stats=log_stats
+    )
+    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
+    engine_core_timestamp = time.monotonic()
+
+    # Create LoRA requests
+    lora1 = LoRARequest(lora_name="lora-1", lora_int_id=1, lora_path="/path/to/lora1")
+    lora2 = LoRARequest(lora_name="lora-2", lora_int_id=2, lora_path="/path/to/lora2")
+
+    # Create requests with different LoRA adapters:
+    # - request-0: lora-1
+    # - request-1: lora-2
+    # - request-2: None (no LoRA)
+    lora_assignments = [lora1, lora2, None]
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt_token_ids=prompt_tokens,
+            mm_features=None,
+            eos_token_id=None,
+            arrival_time=0,
+            lora_request=lora_assignments[idx],
+            cache_salt=None,
+            data_parallel_rank=None,
+            sampling_params=SamplingParams(),
+            pooling_params=None,
+        )
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
+    ]
+
+    # Add all requests to the OutputProcessor
+    for request in requests:
+        output_processor.add_request(request, None)
+
+    # First iteration: process outputs with QUEUED events
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    for output in outputs.outputs:
+        output.events = [
+            EngineCoreEvent.new_event(EngineCoreEventType.QUEUED, engine_core_timestamp)
+        ]
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # Verify waiting counts
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-1") == 1
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-2") == 1
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-1") == 0
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 0
+        # Verify internal state
+        assert len(output_processor.lora_states.requests) == 2
+        assert "lora-1" in output_processor.lora_states.requests
+        assert "lora-2" in output_processor.lora_states.requests
+    else:
+        # When log_stats=False, no tracking should occur
+        assert iteration_stats is None
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Second iteration: process outputs with SCHEDULED events
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    for output in outputs.outputs:
+        output.events = [
+            EngineCoreEvent.new_event(
+                EngineCoreEventType.SCHEDULED, engine_core_timestamp
+            )
+        ]
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # Verify running counts
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-1") == 0
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-2") == 0
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-1") == 1
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 1
+    else:
+        assert iteration_stats is None
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Third iteration: finish request-0 (lora-1)
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    # Find and mark request-0 as finished (it uses lora-1)
+    for output in outputs.outputs:
+        if output.request_id == "request-0":
+            output.finish_reason = FinishReason.LENGTH
+            break
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # lora-1 should be removed since no requests remain
+        assert "lora-1" not in output_processor.lora_states.requests
+        # lora-2 should still be running
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 1
+        assert len(output_processor.lora_states.requests) == 1
+    else:
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Fourth iteration: finish request-1 (lora-2)
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    # Find and mark request-1 as finished (it uses lora-2)
+    for output in outputs.outputs:
+        if output.request_id == "request-1":
+            output.finish_reason = FinishReason.LENGTH
+            break
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # lora-2 should be removed since no requests remain
+        assert "lora-2" not in output_processor.lora_states.requests
+        assert len(outputs.scheduler_stats.running_lora_adapters) == 0
+        assert len(output_processor.lora_states.requests) == 0
+    else:
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Finish the last request (no LoRA)
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    # Find and mark request-2 as finished (it has no LoRA)
+    for output in outputs.outputs:
+        if output.request_id == "request-2":
+            output.finish_reason = FinishReason.LENGTH
+            break
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    # Verify all requests are finished
+    assert output_processor.get_num_unfinished_requests() == 0
+
+
 @pytest.mark.asyncio
 async def test_request_output_collector():
     NUM_REQS = 3
@@ -853,16 +1098,13 @@ async def test_request_output_collector():
                         text=TEXT,
                         token_ids=[idx],
                         cumulative_logprob=(idx + 1 * 1.0),
-                        logprobs=[{
-                            "a": idx,
-                            "b": idx
-                        }],
-                        finish_reason="length" if
-                        (idx == NUM_REQS - 1) else None,
+                        logprobs=[{"a": idx, "b": idx}],
+                        finish_reason="length" if (idx == NUM_REQS - 1) else None,
                     )
                 ],
                 finished=(idx == NUM_REQS - 1),
-            ) for idx in range(NUM_REQS)
+            )
+            for idx in range(NUM_REQS)
         ]
 
     collector = RequestOutputCollector(RequestOutputKind.DELTA)
@@ -888,8 +1130,7 @@ async def test_request_output_collector():
     assert not output.finished
     # Text, token_ids, and logprobs should get merged.
     assert output.outputs[0].text == TEXT * num_to_put
-    for tok_0, tok_1 in zip(output.outputs[0].token_ids,
-                            list(range(num_to_put))):
+    for tok_0, tok_1 in zip(output.outputs[0].token_ids, list(range(num_to_put))):
         assert tok_0 == tok_1
     assert len(output.outputs[0].logprobs) == num_to_put
 
@@ -910,8 +1151,7 @@ async def test_request_output_collector():
     assert output.outputs[0].finish_reason == "length"
     # Text, token_ids, and logprobs should get merged.
     assert output.outputs[0].text == TEXT * num_to_put
-    for tok_0, tok_1 in zip(output.outputs[0].token_ids,
-                            list(range(num_to_put))):
+    for tok_0, tok_1 in zip(output.outputs[0].token_ids, list(range(num_to_put))):
         assert tok_0 == tok_1
     assert len(output.outputs[0].logprobs) == num_to_put
 
@@ -1003,8 +1243,7 @@ async def test_cumulative_output_collector_n():
 
 @pytest.mark.parametrize("runner", ["generate", "pooling"])
 def test_abort_requests(runner: str, dummy_test_vectors):
-    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
-                                       log_stats=True)
+    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
     requests = [
         EngineCoreRequest(
             request_id=f"request-{idx}",
@@ -1016,9 +1255,9 @@ def test_abort_requests(runner: str, dummy_test_vectors):
             cache_salt=None,
             data_parallel_rank=None,
             sampling_params=SamplingParams() if runner == "generate" else None,
-            pooling_params=PoolingParams(
-                task="embed") if runner == "pooling" else None,
-        ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
+            pooling_params=PoolingParams(task="embed") if runner == "pooling" else None,
+        )
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
     for request in requests:
diff --git a/tests/v1/engine/test_parallel_sampling.py b/tests/v1/engine/test_parallel_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..736c0e54837feaf947142193b362786a5f09570e
--- /dev/null
+++ b/tests/v1/engine/test_parallel_sampling.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import SamplingParams
+from vllm.outputs import CompletionOutput
+from vllm.sampling_params import RequestOutputKind
+from vllm.v1.engine.parallel_sampling import ParentRequest
+
+
+def test_parent_request_to_output_stream() -> None:
+    parent_request = ParentRequest("parent_id", SamplingParams(n=2))
+    parent_request.child_requests = {"child_id_0", "child_id_1"}
+    output_0 = CompletionOutput(
+        index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
+    )
+    output_1 = CompletionOutput(
+        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
+    )
+    # Request not finished
+    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+        "child_id_0", output_0
+    )
+    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
+        "child_id_1", output_1
+    )
+    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+        "child_id_0", output_0
+    )
+    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
+        "child_id_1", output_1
+    )
+
+    # output_1 finished
+    output_1.finish_reason = "ended"
+    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+        "child_id_0", output_0
+    )
+    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
+        "child_id_1", output_1
+    )
+    # Finished output_1 had already returned, DO NOT returned again
+    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+        "child_id_0", output_0
+    )
+    assert parent_request.get_outputs("child_id_1", output_1) == (
+        "parent_id",
+        [],
+        False,
+    )
+
+    # output_0 finished
+    output_0.finish_reason = "ended"
+    assert ("parent_id", [output_0], True) == parent_request.get_outputs(
+        "child_id_0", output_0
+    )
+    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+    # Finished output_0 had already returned, DO NOT returned again
+    assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
+    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+
+
+def test_parent_request_to_output_final_only() -> None:
+    parent_request = ParentRequest(
+        "parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
+    )
+    parent_request.child_requests = {"child_id_0", "child_id_1"}
+    output_0 = CompletionOutput(
+        index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
+    )
+    output_1 = CompletionOutput(
+        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
+    )
+    # Request not finished, return nothing
+    assert parent_request.get_outputs("child_id_0", output_0) == (
+        "parent_id",
+        [],
+        False,
+    )
+    assert parent_request.get_outputs("child_id_1", output_1) == (
+        "parent_id",
+        [],
+        False,
+    )
+    # output_1 finished, but outputs won't be returned until all child requests finished
+    output_1.finish_reason = "ended"
+    assert parent_request.get_outputs("child_id_0", output_0) == (
+        "parent_id",
+        [],
+        False,
+    )
+    assert parent_request.get_outputs("child_id_1", output_1) == (
+        "parent_id",
+        [],
+        False,
+    )
+    # output_0 finished, as all child requests finished, the output would be returned
+    output_0.finish_reason = "ended"
+    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+        "child_id_0", output_0
+    )
+    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+        "child_id_1", output_1
+    )
diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py
index 3a7bcb95718259c99939424d0adb06f7ef68a920..cb6865e42ef8b3704563c12e3d7300dba1497e7b 100644
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@@ -16,35 +16,33 @@ baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
 
 
 # Mock processor for testing
-def _mk_processor(monkeypatch,
-                  *,
-                  mm_cache_gb: float = 4.0,
-                  enable_prefix_caching: bool = True) -> Processor:
+def _mk_processor(
+    monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
+) -> Processor:
     """
     Create a Processor instance with minimal configuration suitable for unit
     tests without accessing external resources.
     """
-    monkeypatch.setattr(ModelConfig,
-                        "try_get_generation_config",
-                        lambda self: {},
-                        raising=True)
-    monkeypatch.setattr(ModelConfig,
-                        "__post_init__",
-                        lambda self, *args: None,
-                        raising=True)
-    monkeypatch.setattr(ModelConfig,
-                        "verify_with_parallel_config",
-                        lambda self, parallel_config: None,
-                        raising=True)
-    monkeypatch.setattr(processor_mod,
-                        "processor_cache_from_config",
-                        lambda vllm_config, mm_registry: None,
-                        raising=True)
-
-    monkeypatch.setattr(VllmConfig,
-                        "__post_init__",
-                        lambda self: None,
-                        raising=True)
+    monkeypatch.setattr(
+        ModelConfig, "try_get_generation_config", lambda self: {}, raising=True
+    )
+    monkeypatch.setattr(
+        ModelConfig, "__post_init__", lambda self, *args: None, raising=True
+    )
+    monkeypatch.setattr(
+        ModelConfig,
+        "verify_with_parallel_config",
+        lambda self, parallel_config: None,
+        raising=True,
+    )
+    monkeypatch.setattr(
+        processor_mod,
+        "processor_cache_from_config",
+        lambda vllm_config, mm_registry: None,
+        raising=True,
+    )
+
+    monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
 
     model_config = ModelConfig(
         skip_tokenizer_init=True,
@@ -57,21 +55,17 @@ def _mk_processor(monkeypatch,
     # Minimal multimodal_config to satisfy references in
     # Processor.process_inputs.
     class _MockMMConfig:
-
         def __init__(self, gb: float):
             self.mm_processor_cache_gb = gb
 
-    model_config.multimodal_config = _MockMMConfig(
-        mm_cache_gb)  # type: ignore[attr-defined]
+    model_config.multimodal_config = _MockMMConfig(mm_cache_gb)  # type: ignore[attr-defined]
     vllm_config = VllmConfig(
         model_config=model_config,
         cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
         device_config=DeviceConfig(device="cpu"),
     )
 
-    # Pass tokenizer=None; InputPreprocessor handles None when
-    # skip_tokenizer_init is True.
-    return Processor(vllm_config, tokenizer=None)  # type: ignore[arg-type]
+    return Processor(vllm_config, tokenizer=None)
 
 
 def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
@@ -79,13 +73,9 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
 
     prompt = {
         "prompt": "USER: <image>\nDescribe\nASSISTANT:",
-        "multi_modal_data": {
-            "image": [cherry_pil_image, stop_pil_image]
-        },
+        "multi_modal_data": {"image": [cherry_pil_image, stop_pil_image]},
         # Mismatch: 2 items but only 1 uuid provided
-        "multi_modal_uuids": {
-            "image": ["hash_cherry"]
-        },
+        "multi_modal_uuids": {"image": ["hash_cherry"]},
     }
 
     with pytest.raises(ValueError, match="must have same length as data"):
@@ -104,16 +94,13 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
         # Two modalities provided in data
         "multi_modal_data": {
             "image": [cherry_pil_image],
-            "video": [baby_reading_np_ndarrays]
+            "video": [baby_reading_np_ndarrays],
         },
         # Only image uuids provided; video missing should raise
-        "multi_modal_uuids": {
-            "image": ["hash_cherry"]
-        },
+        "multi_modal_uuids": {"image": ["hash_cherry"]},
     }
 
-    with pytest.raises(ValueError,
-                       match="must be provided if multi_modal_data"):
+    with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
         processor.process_inputs(
             request_id="req-2",
             prompt=prompt,  # type: ignore[arg-type]
@@ -130,28 +117,28 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
     ],
 )
 def test_multi_modal_uuids_accepts_none_and_passes_through(
-        monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
-    processor = _mk_processor(monkeypatch,
-                              mm_cache_gb=mm_cache_gb,
-                              enable_prefix_caching=enable_prefix_caching)
+    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
+):
+    processor = _mk_processor(
+        monkeypatch,
+        mm_cache_gb=mm_cache_gb,
+        enable_prefix_caching=enable_prefix_caching,
+    )
 
     # Capture the overrides passed to InputPreprocessor.preprocess
     captured: dict[str, object] = {}
 
-    def fake_preprocess(prompt,
-                        *,
-                        tokenization_kwargs=None,
-                        lora_request=None,
-                        mm_uuids=None):
+    def fake_preprocess(
+        prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
+    ):
         captured["mm_uuids"] = mm_uuids
         # Minimal processed inputs for decoder-only flow
         return {"type": "token", "prompt_token_ids": [1]}
 
     # Monkeypatch only the bound preprocess method on this instance
-    monkeypatch.setattr(processor.input_preprocessor,
-                        "preprocess",
-                        fake_preprocess,
-                        raising=True)
+    monkeypatch.setattr(
+        processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
+    )
 
     # Use a consistent two-image scenario across all configurations
     mm_uuids = {"image": [None, "hash_stop"], "video": None}
@@ -176,24 +163,19 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
 def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
     # When both processor cache is 0 and prefix caching disabled, the
     # processor builds overrides from request id instead of using user UUIDs.
-    processor = _mk_processor(monkeypatch,
-                              mm_cache_gb=0.0,
-                              enable_prefix_caching=False)
+    processor = _mk_processor(monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False)
 
     captured: dict[str, object] = {}
 
-    def fake_preprocess(prompt,
-                        *,
-                        tokenization_kwargs=None,
-                        lora_request=None,
-                        mm_uuids=None):
+    def fake_preprocess(
+        prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
+    ):
         captured["mm_uuids"] = mm_uuids
         return {"type": "token", "prompt_token_ids": [1]}
 
-    monkeypatch.setattr(processor.input_preprocessor,
-                        "preprocess",
-                        fake_preprocess,
-                        raising=True)
+    monkeypatch.setattr(
+        processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
+    )
 
     request_id = "req-42"
     mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py
index 689b2c95f927e0b98fe5a875ed2318f0c72ab652..3541ef89bfc14c2018a88582ca80f49f1517188c 100644
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -3,8 +3,9 @@
 
 import random
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import TypeAlias
 
+import numpy as np
 import torch
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
@@ -12,7 +13,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.v1.engine import EngineCoreOutput, FinishReason
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors
 
-GeneralTokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+GeneralTokenizerType: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
 
 # Number of sample logprobs to request when testing sample logprobs
 NUM_SAMPLE_LOGPROBS_UNDER_TEST = 5
@@ -82,11 +83,12 @@ def _create_random_top_logprob_test_matrix(
 
 
 def _create_random_top_token_test_vector(
-        num_logprobs: int,
-        lower: int,
-        upper: int,
-        sampled_token_id: int,
-        adjust_num_logprobs: bool = True) -> tuple[torch.Tensor, int]:
+    num_logprobs: int,
+    lower: int,
+    upper: int,
+    sampled_token_id: int,
+    adjust_num_logprobs: bool = True,
+) -> tuple[torch.Tensor, int]:
     """Create a random vector of top logprob token indices
 
     Use to create fake sample logprobs for testing. The sampled token
@@ -127,8 +129,9 @@ def _create_random_top_token_test_vector(
 
     # Check if the sampled_token_id occurs in choice_tensor[1:]
     if sampled_token_id in choice_tensor[1:]:
-        sampled_token_rank = (choice_tensor[1:] == sampled_token_id).nonzero(
-            as_tuple=True)[0].item()
+        sampled_token_rank = (
+            (choice_tensor[1:] == sampled_token_id).nonzero(as_tuple=True)[0].item()
+        )
     else:
         # If not found, assign a random int between num_logprobs and 50700
         sampled_token_rank = random.randint(num_logprobs, 50700)
@@ -164,9 +167,12 @@ def _create_random_top_token_test_matrix(
     num_elements = shape[0] * shape[1]
     choice_tensor = torch.randperm(upper - lower)[:num_elements] + lower
     matrix = torch.cat(
-        (torch.tensor(tokens_list, dtype=torch.int).unsqueeze(-1),
-         choice_tensor.view(shape)),
-        dim=1)
+        (
+            torch.tensor(tokens_list, dtype=torch.int).unsqueeze(-1),
+            choice_tensor.view(shape),
+        ),
+        dim=1,
+    )
 
     # Initialize the tensor for storing the ranks
     prompt_token_ranks = torch.empty(shape[0], dtype=torch.int)
@@ -174,8 +180,7 @@ def _create_random_top_token_test_matrix(
     # Iterate over each row to check presence of
     # tokens_list[rdx] and determine its index
     for rdx in range(shape[0]):
-        row = matrix[rdx,
-                     1:]  # Skip the first column as it contains the token list
+        row = matrix[rdx, 1:]  # Skip the first column as it contains the token list
         token_index = (row == tokens_list[rdx]).nonzero(as_tuple=True)[0]
         if token_index.numel() > 0:
             prompt_token_ranks[rdx] = token_index.item()
@@ -229,19 +234,21 @@ def generate_dummy_sample_logprobs(
         (
             token_vector,
             sampled_token_rank,
-        ) = _create_random_top_token_test_vector(num_logprobs, 0,
-                                                 len(tokenizer.vocab) - 1,
-                                                 sampled_token_id)
+        ) = _create_random_top_token_test_vector(
+            num_logprobs, 0, len(tokenizer.vocab) - 1, sampled_token_id
+        )
 
         res.append(
-            (token_vector,
-             _create_random_top_logprob_test_vector(num_logprobs + 1, -100,
-                                                    0), sampled_token_rank))
+            (
+                token_vector,
+                _create_random_top_logprob_test_vector(num_logprobs + 1, -100, 0),
+                sampled_token_rank,
+            )
+        )
 
     # Convert tensors in the list tuples to Python lists
     res_list_format = [
-        (log_probs_tensor.tolist(), token_ids_tensor.tolist(),
-         sampled_token_rank)
+        (log_probs_tensor.tolist(), token_ids_tensor.tolist(), sampled_token_rank)
         for log_probs_tensor, token_ids_tensor, sampled_token_rank in res
     ]
 
@@ -282,18 +289,24 @@ def generate_dummy_prompt_logprobs_tensors(
         token_vector,
         prompt_token_ranks,
     ) = _create_random_top_token_test_matrix(
-        (num_prompt_logprobs, num_logprobs), 0,
-        len(tokenizer.vocab) - 1, prompt_tokens_list[1:])
+        (num_prompt_logprobs, num_logprobs),
+        0,
+        len(tokenizer.vocab) - 1,
+        prompt_tokens_list[1:],
+    )
     return LogprobsTensors(
         token_vector,
         _create_random_top_logprob_test_matrix(
-            (num_prompt_logprobs, num_logprobs + 1), -100, 0),
-        prompt_token_ranks)
+            (num_prompt_logprobs, num_logprobs + 1), -100, 0
+        ),
+        prompt_token_ranks,
+    )
 
 
 @dataclass
 class DummyOutputProcessorTestVectors:
     """Dummy test vectors for output processor tests"""
+
     tokenizer: GeneralTokenizerType
     vllm_config: EngineArgs
     full_tokens: list[list[int]]  # Prompt + generated tokens
@@ -320,16 +333,15 @@ class MockEngineCore:
         # For each request, for each sampled token offset,
         # a tuple of
         # (list of topk token ids, list of sample logprob vals, rank)
-        generated_logprobs_raw: Optional[list[list[tuple[list[int],
-                                                         list[float],
-                                                         int]]]] = None,
+        generated_logprobs_raw: list[list[tuple[list[int], list[float], int]]]
+        | None = None,
         # For each request, a tuple of
         # (prompt logprob val matrix, prompt logprob tok id matrix);
         # each matrix has dimensions
         # (num prompt toks) x (num prompt logprobs+1)
-        prompt_logprobs_raw: Optional[list[LogprobsTensors]] = None,
-        eos_token_id: Optional[int] = None,
-        stop_token_ids: Optional[list[int]] = None,
+        prompt_logprobs_raw: list[LogprobsTensors] | None = None,
+        eos_token_id: int | None = None,
+        stop_token_ids: list[int] | None = None,
         ignore_eos: bool = False,
     ) -> None:
         self.num_requests = len(tokens_list)
@@ -355,11 +367,12 @@ class MockEngineCore:
                 if do_logprobs:
                     assert self.generated_logprobs_raw is not None
                     (logprobs_token_ids_, logprobs_, sampled_token_ranks_) = (
-                        self.generated_logprobs_raw[req_idx][token_idx])
+                        self.generated_logprobs_raw[req_idx][token_idx]
+                    )
                     logprobs = LogprobsLists(
-                        [logprobs_token_ids_],
-                        [logprobs_],
-                        [sampled_token_ranks_],
+                        np.array([logprobs_token_ids_]),
+                        np.array([logprobs_]),
+                        np.array([sampled_token_ranks_]),
                     )
                 else:
                     logprobs = None
diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py
index 46b953fe37433fd5dccabd3f418bc4446512a8e8..40b9d1fe850c67a19ea52c18e2cd4970d998b895 100644
--- a/tests/v1/entrypoints/conftest.py
+++ b/tests/v1/entrypoints/conftest.py
@@ -26,8 +26,10 @@ def sample_token_ids():
 
 @pytest.fixture
 def sample_regex():
-    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+    return (
+        r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+        r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
+    )
 
 
 # Note: Ensure this only uses attributes compatible with xgrammar
@@ -36,53 +38,44 @@ def sample_json_schema():
     return {
         "type": "object",
         "properties": {
-            "name": {
-                "type": "string"
-            },
-            "age": {
-                "type": "integer"
-            },
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
             "skills": {
                 "type": "array",
                 "items": {
                     "type": "string",
-                }
+                },
             },
             "grade": {
                 "type": "string",
-                "pattern": "^[A-D]$"  # Regex pattern
+                "pattern": "^[A-D]$",  # Regex pattern
             },
             "email": {
                 "type": "string",
-                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
+                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
             },
             "work_history": {
                 "type": "array",
                 "items": {
                     "type": "object",
                     "properties": {
-                        "company": {
-                            "type": "string"
-                        },
+                        "company": {"type": "string"},
                         "duration": {
                             "type": "number",
                             "minimum": 0.0,
                             "maximum": 100.0,  # Numeric range
                         },
-                        "position": {
-                            "type": "string"
-                        }
+                        "position": {"type": "string"},
                     },
                     "required": ["company", "duration", "position"],
-                    "additionalProperties": False
+                    "additionalProperties": False,
                 },
                 "minItems": 0,
-                "maxItems": 3
-            }
+                "maxItems": 3,
+            },
         },
-        "required":
-        ["name", "age", "skills", "grade", "email", "work_history"],
-        "additionalProperties": False
+        "required": ["name", "age", "skills", "grade", "email", "work_history"],
+        "additionalProperties": False,
     }
 
 
@@ -94,67 +87,60 @@ def unsupported_json_schema():
         "properties": {
             "score": {
                 "type": "integer",
-                "multipleOf": 5  # Numeric multiple
+                "multipleOf": 5,  # Numeric multiple
             },
             "tags": {
                 "type": "array",
-                "items": {
-                    "type": "string",
-                    "minLength": 10,
-                    "maxLength": 20
-                }
-            }
+                "items": {"type": "string", "minLength": 10, "maxLength": 20},
+            },
         },
         "required": ["score", "tags"],
-        "additionalProperties": False
+        "additionalProperties": False,
     }
 
 
 @pytest.fixture
 def sample_definition_json_schema():
     return {
-        '$defs': {
-            'Step': {
-                'properties': {
-                    'explanation': {
-                        'title': 'Explanation',
-                        'type': 'string'
-                    },
-                    'output': {
-                        'title': 'Output',
-                        'type': 'string'
-                    }
+        "$defs": {
+            "Step": {
+                "properties": {
+                    "explanation": {"title": "Explanation", "type": "string"},
+                    "output": {"title": "Output", "type": "string"},
                 },
-                'required': ['explanation', 'output'],
-                'title': 'Step',
-                'type': 'object'
+                "required": ["explanation", "output"],
+                "title": "Step",
+                "type": "object",
             }
         },
-        'properties': {
-            'steps': {
-                'items': {
-                    '$ref': '#/$defs/Step'
-                },
-                'title': 'Steps',
-                'type': 'array'
+        "properties": {
+            "steps": {
+                "items": {"$ref": "#/$defs/Step"},
+                "title": "Steps",
+                "type": "array",
             },
-            'final_answer': {
-                'title': 'Final Answer',
-                'type': 'string'
-            }
+            "final_answer": {"title": "Final Answer", "type": "string"},
         },
-        'required': ['steps', 'final_answer'],
-        'title': 'MathReasoning',
-        'type': 'object',
-        "additionalProperties": False
+        "required": ["steps", "final_answer"],
+        "title": "MathReasoning",
+        "type": "object",
+        "additionalProperties": False,
     }
 
 
 @pytest.fixture
 def sample_structured_outputs_choices():
     return [
-        "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
-        "Ruby", "Swift", "Kotlin"
+        "Python",
+        "Java",
+        "JavaScript",
+        "C++",
+        "C#",
+        "PHP",
+        "TypeScript",
+        "Ruby",
+        "Swift",
+        "Kotlin",
     ]
 
 
@@ -172,11 +158,11 @@ number ::= "1" | "2"
 
 @pytest.fixture
 def sample_sql_lark():
-    return ("""
+    return """
 start: select_statement
 select_statement: "SELECT" column "from" table "where" condition
 column: "col_1" | "col_2"
 table: "table_1" | "table_2"
 condition: column "=" number
 number: "1" | "2"
-""")
+"""
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 5017c83025ba1101d2d74245d4fb3b35fd58e5d8..a7d769c8542a985e9c70ef261d49084aa52acea8 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -2,8 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import json
 from dataclasses import fields
 from enum import Enum
@@ -22,11 +20,16 @@ from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
-from vllm.sampling_params import (GuidedDecodingParams, SamplingParams,
-                                  StructuredOutputsParams)
+from vllm.sampling_params import (
+    GuidedDecodingParams,
+    SamplingParams,
+    StructuredOutputsParams,
+)
 
 if TYPE_CHECKING:
-    from vllm.config import TokenizerMode
+    from vllm.config.model import TokenizerMode
+else:
+    TokenizerMode = str
 
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
@@ -44,22 +47,18 @@ EAGLE_SPEC_CONFIG = {
 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
     ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None),
     ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None),
-    ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto",
-     None),
+    ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None),
     ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None),
     ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None),
     ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None),
-    #FIXME: This tests are flaky on CI thus disabled. Tracking in Issue #24402
+    # FIXME: This tests are flaky on CI thus disabled. Tracking in Issue #24402
     # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
     # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
-    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto",
-     NGRAM_SPEC_CONFIG),
-    ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto",
-     NGRAM_SPEC_CONFIG),
+    # ("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
+    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG),
+    ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", NGRAM_SPEC_CONFIG),
     ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG),
-    ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto",
-     EAGLE_SPEC_CONFIG)
+    ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", EAGLE_SPEC_CONFIG),
 ]
 
 PARAMS_MODELS_TOKENIZER_MODE = [
@@ -82,19 +81,16 @@ class CarDescription(BaseModel):
 
 
 def test_guided_decoding_deprecated():
-    with pytest.warns(DeprecationWarning,
-                      match="GuidedDecodingParams is deprecated.*"):
+    with pytest.warns(DeprecationWarning, match="GuidedDecodingParams is deprecated.*"):
         guided_decoding = GuidedDecodingParams(json_object=True)
 
     structured_outputs = StructuredOutputsParams(json_object=True)
     assert fields(guided_decoding) == fields(structured_outputs)
 
-    with pytest.warns(DeprecationWarning,
-                      match="guided_decoding is deprecated.*"):
+    with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"):
         sp1 = SamplingParams(guided_decoding=guided_decoding)
 
-    with pytest.warns(DeprecationWarning,
-                      match="guided_decoding is deprecated.*"):
+    with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"):
         sp2 = SamplingParams.from_optional(guided_decoding=guided_decoding)
 
     assert sp1 == sp2
@@ -104,9 +100,9 @@ def test_guided_decoding_deprecated():
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize(
     "model_name, backend, tokenizer_mode, speculative_config",
-    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
+    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
+)
 def test_structured_output(
-    monkeypatch: pytest.MonkeyPatch,
     sample_json_schema: dict[str, Any],
     unsupported_json_schema: dict[str, Any],
     sample_sql_ebnf: str,
@@ -118,25 +114,22 @@ def test_structured_output(
     model_name: str,
     speculative_config: dict[str, Any],
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     if current_platform.is_tpu() and speculative_config:
         pytest.skip("TPU does not support speculative decoding")
 
-    # Don't use eager execution on TPUs because we want to test for no
-    # recompilation at runtime
-    enforce_eager = bool(not current_platform.is_tpu())
     # Use a single LLM instance for several scenarios to
     # speed up the test suite.
-    llm = LLM(model=model_name,
-              enforce_eager=enforce_eager,
-              max_model_len=1024,
-              structured_outputs_config=dict(backend=backend,
-                                             disable_any_whitespace=backend
-                                             in {"xgrammar", "guidance"}),
-              seed=120,
-              tokenizer_mode=tokenizer_mode,
-              speculative_config=speculative_config)
+    llm = LLM(
+        model=model_name,
+        enforce_eager=True,
+        max_model_len=1024,
+        structured_outputs_config=dict(
+            backend=backend, disable_any_whitespace=backend in {"xgrammar", "guidance"}
+        ),
+        seed=120,
+        tokenizer_mode=tokenizer_mode,
+        speculative_config=speculative_config,
+    )
 
     #
     # Test 1: Generate JSON output based on a provided schema
@@ -144,11 +137,14 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=4096,
-        structured_outputs=StructuredOutputsParams(json=sample_json_schema))
+        structured_outputs=StructuredOutputsParams(json=sample_json_schema),
+    )
 
-    prompt = ("Give an example JSON for an employee profile that fits this "
-              "schema. Make the response as short as possible. Schema: "
-              f"{sample_json_schema}")
+    prompt = (
+        "Give an example JSON for an employee profile that fits this "
+        "schema. Make the response as short as possible. Schema: "
+        f"{sample_json_schema}"
+    )
     outputs = llm.generate(
         [prompt] * 2,
         sampling_params=sampling_params,
@@ -164,7 +160,7 @@ def test_structured_output(
 
         generated_text = output.outputs[0].text
         assert generated_text is not None
-        if backend != 'lm-format-enforcer':
+        if backend != "lm-format-enforcer":
             assert "\n" not in generated_text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         try:
@@ -172,7 +168,8 @@ def test_structured_output(
         except json.JSONDecodeError as e:
             pytest.fail(
                 f"Invalid JSON from backend={backend}: {generated_text!r}\n"
-                f"Schema: {sample_json_schema}\nError: {e}")
+                f"Schema: {sample_json_schema}\nError: {e}"
+            )
         jsonschema.validate(instance=output_json, schema=sample_json_schema)
 
     #
@@ -183,14 +180,18 @@ def test_structured_output(
             temperature=1.0,
             max_tokens=4096,
             n=2,
-            structured_outputs=StructuredOutputsParams(json_object=True))
+            structured_outputs=StructuredOutputsParams(json_object=True),
+        )
 
-        outputs = llm.generate(prompts=(
-            "Generate a JSON object with curly braces for a person with "
-            "name and age fields for John Smith who is 31 years old. "
-            "Make the response as short as possible."),
-                               sampling_params=sampling_params,
-                               use_tqdm=True)
+        outputs = llm.generate(
+            prompts=(
+                "Generate a JSON object with curly braces for a person with "
+                "name and age fields for John Smith who is 31 years old. "
+                "Make the response as short as possible."
+            ),
+            sampling_params=sampling_params,
+            use_tqdm=True,
+        )
 
         assert outputs is not None
         for output in outputs:
@@ -212,25 +213,30 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=4096,
-        structured_outputs=StructuredOutputsParams(
-            json=unsupported_json_schema))
+        structured_outputs=StructuredOutputsParams(json=unsupported_json_schema),
+    )
     if backend.startswith("xgrammar"):
-        with pytest.raises(ValueError,
-                           match="The provided JSON schema contains features "
-                           "not supported by xgrammar."):
-
-            prompt = (f"Give an example JSON for an employee profile that "
-                      f"fits this schema: {unsupported_json_schema}. "
-                      f"Make the response as short as possible.")
+        with pytest.raises(
+            ValueError,
+            match="The provided JSON schema contains features "
+            "not supported by xgrammar.",
+        ):
+            prompt = (
+                f"Give an example JSON for an employee profile that "
+                f"fits this schema: {unsupported_json_schema}. "
+                f"Make the response as short as possible."
+            )
             llm.generate(
                 [prompt] * 2,
                 sampling_params=sampling_params,
                 use_tqdm=True,
             )
     else:
-        prompt = (f"Give an example JSON object for a grade that "
-                  f"fits this schema: {unsupported_json_schema}. "
-                  f"Make the response as short as possible.")
+        prompt = (
+            f"Give an example JSON object for a grade that "
+            f"fits this schema: {unsupported_json_schema}. "
+            f"Make the response as short as possible."
+        )
         outputs = llm.generate(
             prompt,
             sampling_params=sampling_params,
@@ -256,12 +262,14 @@ def test_structured_output(
             temperature=0.8,
             top_p=0.95,
             max_tokens=1000,
-            structured_outputs=StructuredOutputsParams(
-                grammar=sample_sql_ebnf))
+            structured_outputs=StructuredOutputsParams(grammar=sample_sql_ebnf),
+        )
         outputs = llm.generate(
-            ("Generate a sql statement that selects col_1 from "
-             "table_1 where it is equal to 1. Make the response as short as "
-             "possible."),
+            (
+                "Generate a sql statement that selects col_1 from "
+                "table_1 where it is equal to 1. Make the response as short as "
+                "possible."
+            ),
             sampling_params=sampling_params,
             use_tqdm=True,
         )
@@ -276,8 +284,7 @@ def test_structured_output(
             assert generated_text is not None
 
             # remove spaces for comparison b/c we removed them in the grammar
-            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(
-                " ", "")
+            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
 
             assert generated_text.strip() == ground_truth
 
@@ -290,12 +297,14 @@ def test_structured_output(
             temperature=0.8,
             top_p=0.95,
             max_tokens=1000,
-            structured_outputs=StructuredOutputsParams(
-                grammar=sample_sql_lark))
+            structured_outputs=StructuredOutputsParams(grammar=sample_sql_lark),
+        )
         outputs = llm.generate(
-            ("Generate a sql statement that selects col_1 from "
-             "table_1 where it is equal to 1. Make the response as short as "
-             "possible."),
+            (
+                "Generate a sql statement that selects col_1 from "
+                "table_1 where it is equal to 1. Make the response as short as "
+                "possible."
+            ),
             sampling_params=sampling_params,
             use_tqdm=True,
         )
@@ -311,12 +320,12 @@ def test_structured_output(
 
             # use Lark to parse the output, and make sure it's a valid parse tree
             from lark import Lark
+
             parser = Lark(sample_sql_lark)
             parser.parse(generated_text)
 
             # remove spaces for comparison b/c we removed them in the grammar
-            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(
-                " ", "")
+            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
 
             assert generated_text.strip() == ground_truth
 
@@ -329,13 +338,15 @@ def test_structured_output(
             temperature=0.8,
             top_p=0.95,
             max_tokens=1000,
-            structured_outputs=StructuredOutputsParams(
-                grammar="not a grammar"))
+            structured_outputs=StructuredOutputsParams(grammar="not a grammar"),
+        )
         with pytest.raises(ValueError, match="Failed to convert the grammar "):
             llm.generate(
-                ("Generate a sql statement that selects col_1 from "
-                 "table_1 where it is equal to 1. Make the response as short "
-                 "as possible."),
+                (
+                    "Generate a sql statement that selects col_1 from "
+                    "table_1 where it is equal to 1. Make the response as short "
+                    "as possible."
+                ),
                 sampling_params=sampling_params,
                 use_tqdm=True,
             )
@@ -346,10 +357,13 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=0.8,
         top_p=0.95,
-        structured_outputs=StructuredOutputsParams(regex=sample_regex))
+        structured_outputs=StructuredOutputsParams(regex=sample_regex),
+    )
 
-    prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
-              f"Make the response as short as possible.")
+    prompt = (
+        f"Give an example IPv4 address with this regex: {sample_regex}. "
+        f"Make the response as short as possible."
+    )
     outputs = llm.generate(
         [prompt] * 2,
         sampling_params=sampling_params,
@@ -374,11 +388,15 @@ def test_structured_output(
         temperature=0.8,
         top_p=0.95,
         structured_outputs=StructuredOutputsParams(
-            choice=sample_structured_outputs_choices))
+            choice=sample_structured_outputs_choices
+        ),
+    )
 
     outputs = llm.generate(
-        ("The best language for type-safe systems programming is "
-         "(Make the response as short as possible.) "),
+        (
+            "The best language for type-safe systems programming is "
+            "(Make the response as short as possible.) "
+        ),
         sampling_params=sampling_params,
         use_tqdm=True,
     )
@@ -400,12 +418,15 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=1000,
-        structured_outputs=StructuredOutputsParams(json=json_schema))
+        structured_outputs=StructuredOutputsParams(json=json_schema),
+    )
 
     outputs = llm.generate(
-        ("Generate a JSON with the brand, model and car_type of the most "
-         "iconic car from the 90's. Make the response as short as "
-         "possible."),
+        (
+            "Generate a JSON with the brand, model and car_type of the most "
+            "iconic car from the 90's. Make the response as short as "
+            "possible."
+        ),
         sampling_params=sampling_params,
         use_tqdm=True,
     )
@@ -425,7 +446,8 @@ def test_structured_output(
         except json.JSONDecodeError as e:
             pytest.fail(
                 f"Invalid JSON from backend={backend}: {generated_text!r}\n"
-                f"Schema: {json_schema}\nError: {e}")
+                f"Schema: {json_schema}\nError: {e}"
+            )
         jsonschema.validate(instance=output_json, schema=json_schema)
 
     #
@@ -439,21 +461,24 @@ def test_structured_output(
             "description": {
                 "type": "string",
                 "maxLength": max_length,
-                "minLength": min_length
+                "minLength": min_length,
             }
         },
         "required": ["description"],
-        "additionalProperties": False
+        "additionalProperties": False,
     }
 
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=4096,
-        structured_outputs=StructuredOutputsParams(json=json_schema))
+        structured_outputs=StructuredOutputsParams(json=json_schema),
+    )
 
     outputs = llm.generate(
-        ("Generate a description of a frog using 50 characters. "
-         "Make the response as short as possible."),
+        (
+            "Generate a description of a frog using 50 characters. "
+            "Make the response as short as possible."
+        ),
         sampling_params=sampling_params,
         use_tqdm=True,
     )
@@ -473,7 +498,8 @@ def test_structured_output(
         except json.JSONDecodeError as e:
             pytest.fail(
                 f"Invalid JSON from backend={backend}: {generated_text!r}\n"
-                f"Schema: {json_schema}\nError: {e}")
+                f"Schema: {json_schema}\nError: {e}"
+            )
         jsonschema.validate(instance=output_json, schema=json_schema)
 
     if backend not in ["outlines", "lm-format-enforcer"]:
@@ -481,29 +507,28 @@ def test_structured_output(
         # Test 11: Generate structured output using structural_tag format
         #
         structural_tag_config = {
-            "type":
-            "structural_tag",
-            "structures": [{
-                "begin": "<function=get_weather>",
-                "schema": {
-                    "type": "object",
-                    "properties": {
-                        "city": {
-                            "type": "string"
-                        }
+            "type": "structural_tag",
+            "structures": [
+                {
+                    "begin": "<function=get_weather>",
+                    "schema": {
+                        "type": "object",
+                        "properties": {"city": {"type": "string"}},
+                        "additionalProperties": False,
                     },
-                    "additionalProperties": False
-                },
-                "end": "</function>"
-            }],
-            "triggers": ["<function="]
+                    "end": "</function>",
+                }
+            ],
+            "triggers": ["<function="],
         }
 
         sampling_params = SamplingParams(
             temperature=0.0,
             max_tokens=4096,
             structured_outputs=StructuredOutputsParams(
-                structural_tag=json.dumps(structural_tag_config)))
+                structural_tag=json.dumps(structural_tag_config)
+            ),
+        )
 
         prompt = """
 You have access to the following function to retrieve the weather in a city:
@@ -545,9 +570,7 @@ Make the response as short as possible.
 """
 
         # Change this once other backends support structural_tag
-        outputs = llm.generate(prompt,
-                               sampling_params=sampling_params,
-                               use_tqdm=True)
+        outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
         assert outputs is not None
 
         for output in outputs:
@@ -557,12 +580,13 @@ Make the response as short as possible.
             assert generated_text is not None
 
             # Search for function call pattern in the response
-            function_call_pattern = r'<function=get_weather>(.*?)</function>'
+            function_call_pattern = r"<function=get_weather>(.*?)</function>"
             matches = re.findall(function_call_pattern, generated_text)
 
             if not matches:
-                print(f"Warning: No function calls found in response: "
-                      f"{generated_text!r}")
+                print(
+                    f"Warning: No function calls found in response: {generated_text!r}"
+                )
                 continue
 
             # Take the first function call if multiple are found
@@ -573,29 +597,32 @@ Make the response as short as possible.
                 assert isinstance(json_content["city"], str)
                 print(f"Found valid function call: {generated_text!r}")
             except (json.JSONDecodeError, AssertionError) as e:
-                pytest.fail("Invalid function call format: "
-                            f"{generated_text!r}\nError: {str(e)}")
+                pytest.fail(
+                    f"Invalid function call format: {generated_text!r}\nError: {str(e)}"
+                )
 
 
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize(
     "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
     [
-        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
-         "deepseek_r1", NGRAM_SPEC_CONFIG),
+        (
+            "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+            "xgrammar",
+            "auto",
+            "deepseek_r1",
+            NGRAM_SPEC_CONFIG,
+        ),
         ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
     ],
 )
 def test_structured_output_with_reasoning_matrices(
-    monkeypatch: pytest.MonkeyPatch,
     backend: str,
     tokenizer_mode: TokenizerMode,
     reasoning_parser: str,
     model_name: str,
     speculative_config: dict[str, Any] | None,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     if current_platform.is_tpu() and speculative_config:
         pytest.skip("TPU does not support speculative decoding")
 
@@ -608,27 +635,25 @@ def test_structured_output_with_reasoning_matrices(
         enforce_eager=bool(not current_platform.is_tpu()),
         max_model_len=1024,
         max_num_seqs=16,
-        structured_outputs_config=dict(backend=backend,
-                                       disable_any_whitespace=backend
-                                       in {"xgrammar", "guidance"},
-                                       reasoning_parser=reasoning_parser),
+        structured_outputs_config=dict(
+            backend=backend,
+            disable_any_whitespace=backend in {"xgrammar", "guidance"},
+            reasoning_parser=reasoning_parser,
+        ),
         tokenizer_mode=tokenizer_mode,
         speculative_config=speculative_config,
     )
     tokenizer = llm.get_tokenizer()
     reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
-        tokenizer=tokenizer)
+        tokenizer=tokenizer
+    )
 
     reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as JSON object with a single key 'result'. Make sure to correct your reasoning if there are any issue should it arise.\nProblem: What is 5 * 8 + 2?"  # noqa: E501
     reasoning_schema = {
         "type": "object",
-        "properties": {
-            "result": {
-                "type": "integer"
-            }
-        },
+        "properties": {"result": {"type": "integer"}},
         "required": ["result"],
-        "additionalProperties": False
+        "additionalProperties": False,
     }
     if "Qwen3" in model_name:
         reasoning_prompt += "<think>\n"
@@ -649,52 +674,52 @@ def test_structured_output_with_reasoning_matrices(
     assert output is not None and isinstance(output, RequestOutput)
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    reasoning_content, content = run_reasoning_extraction(
-        reasoner, [generated_text])
-    print(
-        f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}"
-    )
+    reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
+    print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")
+
+    if "Qwen3" in model_name:
+        assert content is not None
+
+    assert reasoning is not None
 
-    assert content is not None and reasoning_content is not None
-    output_json = json.loads(content)
-    jsonschema.validate(instance=output_json, schema=reasoning_schema)
+    if content is not None:
+        output_json = json.loads(content)
+        jsonschema.validate(instance=output_json, schema=reasoning_schema)
 
 
 @pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("model_name, tokenizer_mode",
-                         PARAMS_MODELS_TOKENIZER_MODE)
+@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
 def test_structured_output_auto_mode(
-    monkeypatch: pytest.MonkeyPatch,
     unsupported_json_schema: dict[str, Any],
     model_name: str,
     tokenizer_mode: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              structured_outputs_config=dict(backend="auto"),
-              tokenizer_mode=tokenizer_mode)
+    llm = LLM(
+        model=model_name,
+        max_model_len=1024,
+        structured_outputs_config=dict(backend="auto"),
+        tokenizer_mode=tokenizer_mode,
+    )
 
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=1000,
-        structured_outputs=StructuredOutputsParams(
-            json=unsupported_json_schema))
+        structured_outputs=StructuredOutputsParams(json=unsupported_json_schema),
+    )
 
     prompts = (
         "Give an example JSON object for a grade "
         "that fits this schema: "
-        f"{unsupported_json_schema}. Make the response as short as possible.")
+        f"{unsupported_json_schema}. Make the response as short as possible."
+    )
     # This would fail with the default of "xgrammar", but in "auto"
     # we will handle fallback automatically.
-    outputs = llm.generate(prompts,
-                           sampling_params=sampling_params,
-                           use_tqdm=True)
+    outputs = llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True)
     # Make sure `auto` backend handling doesn't mess up sampling_params
     # and that we can reuse it without error.
     outputs.extend(
-        llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True))
+        llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True)
+    )
 
     assert outputs is not None
     for output in outputs:
@@ -710,30 +735,25 @@ def test_structured_output_auto_mode(
 
 
 @pytest.mark.skip_global_cleanup
-def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
-              max_model_len=1024,
-              structured_outputs_config=dict(
-                  backend="guidance",
-                  disable_any_whitespace=True,
-                  disable_additional_properties=True))
+def test_guidance_no_additional_properties():
+    llm = LLM(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        max_model_len=1024,
+        structured_outputs_config=dict(
+            backend="guidance",
+            disable_any_whitespace=True,
+            disable_additional_properties=True,
+        ),
+    )
 
     schema = {
-        'type': 'object',
-        'properties': {
-            'a1': {
-                'type': 'string'
-            },
-            'a2': {
-                'type': 'string'
-            },
-            'a3': {
-                'type': 'string'
-            }
+        "type": "object",
+        "properties": {
+            "a1": {"type": "string"},
+            "a2": {"type": "string"},
+            "a3": {"type": "string"},
         },
-        'required': ['a1', 'a2', 'a3'],
+        "required": ["a1", "a2", "a3"],
     }
 
     prompt = (
@@ -741,18 +761,19 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
         "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a "
         "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20. "
         "Make the response as short as possible."
-        "<|im_end|>\n<|im_start|>assistant\n")
+        "<|im_end|>\n<|im_start|>assistant\n"
+    )
 
     def generate_with_backend(backend):
         structured_outputs_params = StructuredOutputsParams(
             json=schema,
             backend=backend,
             disable_any_whitespace=True,
-            disable_additional_properties=True)
+            disable_additional_properties=True,
+        )
         sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=256,
-            structured_outputs=structured_outputs_params)
+            temperature=0, max_tokens=256, structured_outputs=structured_outputs_params
+        )
 
         outputs = llm.generate(prompt, sampling_params=sampling_params)
         assert outputs is not None
@@ -774,12 +795,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
 
 @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
 def test_structured_output_batched_with_non_structured_outputs_requests(
-    monkeypatch: pytest.MonkeyPatch,
     sample_json_schema: dict[str, Any],
     backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     # Don't use eager execution on TPUs because we want to test for no
     # recompilation at runtime
     enforce_eager = bool(not current_platform.is_tpu())
@@ -797,16 +815,18 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
     structured_outputs_prompt = (
         "Give an example JSON for an employee profile that fits this "
         "schema. Make the response as short as possible. Schema: "
-        f"{sample_json_schema}")
+        f"{sample_json_schema}"
+    )
 
     non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "
 
     prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
     sampling_params = [
-        SamplingParams(temperature=1.0,
-                       max_tokens=400,
-                       structured_outputs=StructuredOutputsParams(
-                           json=sample_json_schema)),
+        SamplingParams(
+            temperature=1.0,
+            max_tokens=400,
+            structured_outputs=StructuredOutputsParams(json=sample_json_schema),
+        ),
         # No max tokens, temp=0 to assert on contents
         SamplingParams(
             seed=42,
@@ -815,9 +835,9 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
         ),
     ]
 
-    outputs = llm.generate(prompts=prompts,
-                           sampling_params=sampling_params,
-                           use_tqdm=True)
+    outputs = llm.generate(
+        prompts=prompts, sampling_params=sampling_params, use_tqdm=True
+    )
 
     assert outputs is not None
 
@@ -840,8 +860,7 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
             # First prompt is structured outputs, expect valid JSON
             assert "\n" not in generated_text
             output_json = json.loads(generated_text)
-            jsonschema.validate(instance=output_json,
-                                schema=sample_json_schema)
+            jsonschema.validate(instance=output_json, schema=sample_json_schema)
         else:
             # Second prompt is not structured outputs, expect valid output
             # Cannot assert on exact output, but we can expect it to be factual
@@ -850,3 +869,46 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
             # non-structured outputs requests should not return a valid JSON here
             with pytest.raises(ValueError):
                 output_json = json.loads(generated_text)
+
+
+@pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
+def test_structured_output_with_structural_tag(
+    guided_decoding_backend: str,
+):
+    llm = LLM(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        guided_decoding_backend=guided_decoding_backend,
+    )
+
+    structural_tag_config = {
+        "type": "structural_tag",
+        "format": {
+            "type": "triggered_tags",
+            "tags": [
+                {"begin": "hello_flag", "content": {"type": "any_text"}, "end": "hello"}
+            ],
+            "triggers": ["hello"],
+            "stop_after_first": False,
+        },
+    }
+
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=500,
+        guided_decoding=StructuredOutputsParams(
+            structural_tag=json.dumps(structural_tag_config)
+        ),
+    )
+
+    prompt = "Hello and repete hello 10 times, do not say anything else. Only say hello hello hello, now start"
+    outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        assert "hello_flag" in generated_text, (
+            f"Expected 'hello_flag' to be in generated text, but got: {generated_text}"
+        )
diff --git a/vllm/worker/__init__.py b/tests/v1/entrypoints/openai/serving_responses/__init__.py
similarity index 100%
rename from vllm/worker/__init__.py
rename to tests/v1/entrypoints/openai/serving_responses/__init__.py
diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/serving_responses/conftest.py
similarity index 67%
rename from tests/v1/entrypoints/openai/responses/conftest.py
rename to tests/v1/entrypoints/openai/serving_responses/conftest.py
index 2d677a00b646a83b318e9fdee03107090098f540..b948b6d058a5d5a234f59c6e41f111148b8d84c4 100644
--- a/tests/v1/entrypoints/openai/responses/conftest.py
+++ b/tests/v1/entrypoints/openai/serving_responses/conftest.py
@@ -6,7 +6,7 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 
 # Use a small reasoning model to test the responses API.
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME = "Qwen/Qwen3-1.7B"
 
 
 @pytest.fixture(scope="module")
@@ -15,17 +15,25 @@ def default_server_args():
         "--max-model-len",
         "8192",
         "--enforce-eager",  # For faster startup.
+        "--enable-auto-tool-choice",
+        "--structured-outputs-config.backend",
+        "xgrammar",
+        "--tool-call-parser",
+        "hermes",
         "--reasoning-parser",
-        "deepseek_r1",
+        "qwen3",
     ]
 
 
 @pytest.fixture(scope="module")
 def server_with_store(default_server_args):
     with RemoteOpenAIServer(
-            MODEL_NAME,
-            default_server_args,
-            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+        MODEL_NAME,
+        default_server_args,
+        env_dict={
+            "VLLM_ENABLE_RESPONSES_API_STORE": "1",
+            "VLLM_SERVER_DEV_MODE": "1",
+        },
     ) as remote_server:
         yield remote_server
 
diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/serving_responses/test_basic.py
similarity index 73%
rename from tests/v1/entrypoints/openai/responses/test_basic.py
rename to tests/v1/entrypoints/openai/serving_responses/test_basic.py
index 2ee1004493a16a1e916fca14fd80978f4f8bd2b4..dd3a563e9570a9627b4bb4a61c1a0d3f8ba307c1 100644
--- a/tests/v1/entrypoints/openai/responses/test_basic.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_basic.py
@@ -36,24 +36,14 @@ async def test_instructions(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 async def test_chat(client: openai.AsyncOpenAI):
-    response = await client.responses.create(input=[
-        {
-            "role": "system",
-            "content": "Finish the answer with QED."
-        },
-        {
-            "role": "user",
-            "content": "What is 5 * 3?"
-        },
-        {
-            "role": "assistant",
-            "content": "15. QED."
-        },
-        {
-            "role": "user",
-            "content": "Multiply the result by 2."
-        },
-    ], )
+    response = await client.responses.create(
+        input=[
+            {"role": "system", "content": "Finish the answer with QED."},
+            {"role": "user", "content": "What is 5 * 3?"},
+            {"role": "assistant", "content": "15. QED."},
+            {"role": "user", "content": "Multiply the result by 2."},
+        ],
+    )
     print(response)
 
     output_text = response.output[-1].content[0].text
@@ -63,15 +53,14 @@ async def test_chat(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 async def test_chat_with_input_type(client: openai.AsyncOpenAI):
-    response = await client.responses.create(input=[
-        {
-            "role": "user",
-            "content": [{
-                "type": "input_text",
-                "text": "Hello!"
-            }],
-        },
-    ], )
+    response = await client.responses.create(
+        input=[
+            {
+                "role": "user",
+                "content": [{"type": "input_text", "text": "Hello!"}],
+            },
+        ],
+    )
     print(response)
     assert response.status == "completed"
 
@@ -99,6 +88,6 @@ async def test_streaming(client: openai.AsyncOpenAI):
     assert isinstance(events[0], openai_responses_types.ResponseCreatedEvent)
     assert any(
         isinstance(event, openai_responses_types.ResponseTextDeltaEvent)
-        for event in events)
-    assert isinstance(events[-1],
-                      openai_responses_types.ResponseCompletedEvent)
+        for event in events
+    )
+    assert isinstance(events[-1], openai_responses_types.ResponseCompletedEvent)
diff --git a/tests/v1/entrypoints/openai/serving_responses/test_function_call.py b/tests/v1/entrypoints/openai/serving_responses/test_function_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..90161e7c221b7748b567bbe22680591b972e8351
--- /dev/null
+++ b/tests/v1/entrypoints/openai/serving_responses/test_function_call.py
@@ -0,0 +1,199 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import openai  # use the official client for correctness check
+import pytest
+
+MODEL_NAME = "Qwen/Qwen3-1.7B"
+tools = [
+    {
+        "type": "function",
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type": "string",
+                    "description": "The city to find the weather for, e.g. 'Vienna'",
+                    "default": "Vienna",
+                },
+                "country": {
+                    "type": "string",
+                    "description": "The country that the city is in, e.g. 'Austria'",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"],
+                },
+                "options": {
+                    "$ref": "#/$defs/WeatherOptions",
+                    "description": "Optional parameters for weather query",
+                },
+            },
+            "required": ["country", "unit"],
+            "$defs": {
+                "WeatherOptions": {
+                    "title": "WeatherOptions",
+                    "type": "object",
+                    "additionalProperties": False,
+                    "properties": {
+                        "unit": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "default": "celsius",
+                            "description": "Temperature unit",
+                            "title": "Temperature Unit",
+                        },
+                        "include_forecast": {
+                            "type": "boolean",
+                            "default": False,
+                            "description": "Whether to include a 24-hour forecast",
+                            "title": "Include Forecast",
+                        },
+                        "language": {
+                            "type": "string",
+                            "default": "zh-CN",
+                            "description": "Language of the response",
+                            "title": "Language",
+                            "enum": ["zh-CN", "en-US", "ja-JP"],
+                        },
+                    },
+                },
+            },
+        },
+    },
+    {
+        "type": "function",
+        "name": "get_forecast",
+        "description": "Get the weather forecast for a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type": "string",
+                    "description": "The city to get the forecast for, e.g. 'Vienna'",
+                    "default": "Vienna",
+                },
+                "country": {
+                    "type": "string",
+                    "description": "The country that the city is in, e.g. 'Austria'",
+                },
+                "days": {
+                    "type": "integer",
+                    "description": "Number of days to get the forecast for (1-7)",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"],
+                },
+            },
+            "required": ["country", "days", "unit"],
+        },
+    },
+]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("tool_choice", ["auto", "required"])
+async def test_function_tool_use(
+    client: openai.AsyncOpenAI, model_name: str, tool_choice: str
+):
+    prompt = [
+        {
+            "role": "user",
+            "content": "Can you tell me what the current weather is in Berlin and the "
+            "forecast for the next 5 days, in fahrenheit?",
+        },
+    ]
+    response = await client.responses.create(
+        model=model_name,
+        input=prompt,
+        tools=tools,
+        tool_choice=tool_choice,
+        temperature=0.0,
+    )
+
+    assert len(response.output) >= 1
+    tool_call = None
+    reasoning = None
+    for out in response.output:
+        if out.type == "function_call":
+            tool_call = out
+        if out.type == "reasoning":
+            reasoning = out
+    assert tool_call is not None
+    assert tool_call.type == "function_call"
+    assert json.loads(tool_call.arguments) is not None
+    assert reasoning is not None
+    assert reasoning.type == "reasoning"
+
+
+@pytest.mark.asyncio
+async def test_named_tool_use(client: openai.AsyncOpenAI):
+    def get_weather(latitude: float, longitude: float) -> str:
+        """
+        Mock function to simulate getting weather data.
+        In a real application, this would call an external weather API.
+        """
+        return f"Current temperature at ({latitude}, {longitude}) is 20°C."
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": (
+                "Get current temperature for provided coordinates in celsius."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "latitude": {"type": "number"},
+                    "longitude": {"type": "number"},
+                },
+                "required": ["latitude", "longitude"],
+                "additionalProperties": False,
+            },
+            "strict": True,
+        }
+    ]
+
+    input_messages = [
+        {"role": "user", "content": "What's the weather like in Paris today?"}
+    ]
+
+    response = await client.responses.create(
+        model=MODEL_NAME,
+        input=input_messages,
+        tools=tools,
+        tool_choice={"type": "function", "name": "get_weather"},
+    )
+    assert len(response.output) >= 1
+    for out in response.output:
+        if out.type == "function_call":
+            tool_call = out
+    assert tool_call is not None
+    assert tool_call.type == "function_call"
+    assert tool_call.name == "get_weather"
+    args = json.loads(tool_call.arguments)
+    assert args["latitude"] is not None
+    assert args["longitude"] is not None
+    # call the tool
+    result = get_weather(args["latitude"], args["longitude"])
+    input_messages.append(tool_call)  # append model's function call message
+    input_messages.append(
+        {  # append result message
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }
+    )
+    # create a new response with the tool call result
+    response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages)
+    # check the output
+    assert len(response_2.output_text) > 0
diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py
similarity index 65%
rename from tests/v1/entrypoints/openai/responses/test_image.py
rename to tests/v1/entrypoints/openai/serving_responses/test_image.py
index 3ed36ca678c0c6d234dabee8830163f4fd85612c..980d83b787e7abb5259b2a122e0641e60d34a34c 100644
--- a/tests/v1/entrypoints/openai/responses/test_image.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py
@@ -38,9 +38,9 @@ def default_image_server_args():
 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
     with RemoteOpenAIServer(
-            MODEL_NAME,
-            default_image_server_args,
-            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+        MODEL_NAME,
+        default_image_server_args,
+        env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
     ) as remote_server:
         yield remote_server
 
@@ -54,8 +54,7 @@ async def client(image_server):
 @pytest.fixture(scope="session")
 def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url:
-        encode_image_base64(local_asset_server.get_image_asset(image_url))
+        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
         for image_url in TEST_IMAGE_ASSETS
     }
 
@@ -63,24 +62,23 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_single_chat_session_image(client: openai.AsyncOpenAI,
-                                         model_name: str, image_url: str):
+async def test_single_chat_session_image(
+    client: openai.AsyncOpenAI, model_name: str, image_url: str
+):
     content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "input_image",
-                "image_url": image_url,
-                "detail": "auto",
-            },
-            {
-                "type": "input_text",
-                "text": content_text
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_image",
+                    "image_url": image_url,
+                    "detail": "auto",
+                },
+                {"type": "input_text", "text": content_text},
+            ],
+        }
+    ]
 
     # test image url
     response = await client.responses.create(
@@ -100,22 +98,19 @@ async def test_single_chat_session_image_base64encoded(
     base64_encoded_image: dict[str, str],
 ):
     content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "input_image",
-                "image_url":
-                f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
-                "detail": "auto",
-            },
-            {
-                "type": "input_text",
-                "text": content_text
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_image",
+                    "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",  # noqa: E501
+                    "detail": "auto",
+                },
+                {"type": "input_text", "text": content_text},
+            ],
+        }
+    ]
     # test image base64
     response = await client.responses.create(
         model=model_name,
@@ -129,24 +124,27 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.parametrize(
     "image_urls",
     [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
-    indirect=True)
-async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 image_urls: list[str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "input_image",
-                "image_url": image_url,
-                "detail": "auto",
-            } for image_url in image_urls),
-            {
-                "type": "input_text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+    indirect=True,
+)
+async def test_multi_image_input(
+    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
+):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {
+                        "type": "input_image",
+                        "image_url": image_url,
+                        "detail": "auto",
+                    }
+                    for image_url in image_urls
+                ),
+                {"type": "input_text", "text": "What's in this image?"},
+            ],
+        }
+    ]
 
     if len(image_urls) > MAXIMUM_IMAGES:
         with pytest.raises(openai.BadRequestError):  # test multi-image input
@@ -157,10 +155,12 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
         # the server should still work afterwards
         response = await client.responses.create(
             model=model_name,
-            input=[{
-                "role": "user",
-                "content": "What's the weather like in Paris today?",
-            }],
+            input=[
+                {
+                    "role": "user",
+                    "content": "What's the weather like in Paris today?",
+                }
+            ],
         )
         assert len(response.output_text) > 0
     else:
diff --git a/tests/v1/entrypoints/openai/responses/test_stateful.py b/tests/v1/entrypoints/openai/serving_responses/test_stateful.py
similarity index 90%
rename from tests/v1/entrypoints/openai/responses/test_stateful.py
rename to tests/v1/entrypoints/openai/serving_responses/test_stateful.py
index a2d581ef7ced86703889d0db5734d27b2dfedf73..6f7edb6bd7e78ca94b0f404dbf474b5e86eb8a07 100644
--- a/tests/v1/entrypoints/openai/responses/test_stateful.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_stateful.py
@@ -24,8 +24,7 @@ async def test_store(client: openai.AsyncOpenAI):
     assert response.status == "completed"
 
     # The response should not be found.
-    with pytest.raises(openai.NotFoundError,
-                       match="Response with id .* not found."):
+    with pytest.raises(openai.NotFoundError, match="Response with id .* not found."):
         await client.responses.retrieve(response.id)
 
 
@@ -53,8 +52,8 @@ async def test_background(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 async def test_background_error(client: openai.AsyncOpenAI):
     with pytest.raises(
-            openai.BadRequestError,
-            match="background can only be used when `store` is true"):
+        openai.BadRequestError, match="background can only be used when `store` is true"
+    ):
         _ = await client.responses.create(
             input="What is 13 * 24?",
             background=True,
@@ -87,8 +86,9 @@ async def test_cancel_completed(client: openai.AsyncOpenAI):
     response = await client.responses.create(input="Hello")
     assert response.status == "completed"
 
-    with pytest.raises(openai.BadRequestError,
-                       match="Cannot cancel a synchronous response."):
+    with pytest.raises(
+        openai.BadRequestError, match="Cannot cancel a synchronous response."
+    ):
         await client.responses.cancel(response.id)
 
 
@@ -97,7 +97,8 @@ async def test_previous_response_id(client: openai.AsyncOpenAI):
     response1 = await client.responses.create(
         instructions="You are tested on your ability to retrieve the correct "
         "information from the previous response.",
-        input="Hello, my name is John.")
+        input="Hello, my name is John.",
+    )
 
     response2 = await client.responses.create(
         input="Actually, my name is not John. My real name is Mark.",
@@ -118,7 +119,8 @@ async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
     response1 = await client.responses.create(
         instructions="You are tested on your ability to retrieve the correct "
         "information from the previous response.",
-        input="Hello, my name is John.")
+        input="Hello, my name is John.",
+    )
 
     # Both response 2 and 3 use response 1 as the previous response.
     response2 = client.responses.create(
diff --git a/tests/v1/entrypoints/openai/responses/test_structured_output.py b/tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
similarity index 76%
rename from tests/v1/entrypoints/openai/responses/test_structured_output.py
rename to tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
index c4c43a87b601a5221949a7ef089105aa9ac3d029..db8b87768e44f3cd43619b0c9aef851f6bc9692d 100644
--- a/tests/v1/entrypoints/openai/responses/test_structured_output.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
@@ -11,14 +11,10 @@ from pydantic import BaseModel
 async def test_structured_output(client: openai.AsyncOpenAI):
     response = await client.responses.create(
         input=[
-            {
-                "role": "system",
-                "content": "Extract the event information."
-            },
+            {"role": "system", "content": "Extract the event information."},
             {
                 "role": "user",
-                "content":
-                "Alice and Bob are going to a science fair on Friday.",
+                "content": "Alice and Bob are going to a science fair on Friday.",
             },
         ],
         text={
@@ -28,18 +24,9 @@ async def test_structured_output(client: openai.AsyncOpenAI):
                 "schema": {
                     "type": "object",
                     "properties": {
-                        "event_name": {
-                            "type": "string"
-                        },
-                        "date": {
-                            "type": "string"
-                        },
-                        "participants": {
-                            "type": "array",
-                            "items": {
-                                "type": "string"
-                            }
-                        },
+                        "event_name": {"type": "string"},
+                        "date": {"type": "string"},
+                        "participants": {"type": "array", "items": {"type": "string"}},
                     },
                     "required": ["event_name", "date", "participants"],
                     "additionalProperties": False,
@@ -65,7 +52,6 @@ async def test_structured_output(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 async def test_structured_output_with_parse(client: openai.AsyncOpenAI):
-
     class CalendarEvent(BaseModel):
         event_name: str
         date: str
diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py
index 9aa285aa9b18d0cbab37c232bf415bd41ef1f415..b5aa20448dfcbe8224f4146b540d254009d40995 100644
--- a/tests/v1/entrypoints/openai/test_chat_completion.py
+++ b/tests/v1/entrypoints/openai/test_chat_completion.py
@@ -40,8 +40,7 @@ async def client(server):
     "model_name",
     [MODEL_NAME],
 )
-async def test_invalid_json_schema(client: openai.AsyncOpenAI,
-                                   model_name: str) -> None:
+async def test_invalid_json_schema(client: openai.AsyncOpenAI, model_name: str) -> None:
     invalid_json_schema = {
         "$defs": {
             "CarType": {
@@ -51,35 +50,29 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
             }
         },
         "properties": {
-            "brand": {
-                "title": "Brand",
-                "type": "string"
-            },
-            "model": {
-                "title": "Model",
-                "type": "string"
-            },
-            "car_type": {
-                "$ref": "#/$defs/CarType"
-            },
+            "brand": {"title": "Brand", "type": "string"},
+            "model": {"title": "Model", "type": "string"},
+            "car_type": {"$ref": "#/$defs/CarType"},
             "foo": "bar",
         },
         "required": ["brand", "model", "car_type"],
         "title": "CarDescription",
         "type": "object",
     }
-    prompt = ("Generate a JSON with the brand, model and car_type of"
-              "the most iconic car from the 90's")
+    prompt = (
+        "Generate a JSON with the brand, model and car_type of"
+        "the most iconic car from the 90's"
+    )
     with pytest.raises((openai.BadRequestError, openai.APIError)):
         await client.chat.completions.create(
             model=model_name,
-            messages=[{
-                "role": "user",
-                "content": prompt,
-            }],
-            extra_body={"structured_outputs": {
-                "json": invalid_json_schema
-            }},
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            extra_body={"structured_outputs": {"json": invalid_json_schema}},
         )
 
 
@@ -89,23 +82,22 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
     [MODEL_NAME],
 )
 async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
-    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
-              "End in .com and new line. Example result:"
-              "alan.turing@enigma.com\n")
+    prompt = (
+        "Generate an email address for Alan Turing, who works in Enigma."
+        "End in .com and new line. Example result:"
+        "alan.turing@enigma.com\n"
+    )
 
     with pytest.raises((openai.BadRequestError, openai.APIError)):
         await client.chat.completions.create(
             model=model_name,
-            messages=[{
-                "role": "user",
-                "content": prompt,
-            }],
-            extra_body={
-                "structured_outputs": {
-                    "regex": r"[.*"
-                },
-                "stop": ["\n"]
-            },
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            extra_body={"structured_outputs": {"regex": r"[.*"}, "stop": ["\n"]},
         )
 
 
@@ -129,18 +121,40 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
         number ::= "1 " | "2 "
     """
 
-    prompt = ("Generate an SQL query to show the 'username' and 'email'"
-              "from the 'users' table.")
+    prompt = (
+        "Generate an SQL query to show the 'username' and 'email'"
+        "from the 'users' table."
+    )
     with pytest.raises((openai.BadRequestError, openai.APIError)):
         await client.chat.completions.create(
             model=model_name,
-            messages=[{
-                "role": "user",
-                "content": prompt,
-            }],
-            extra_body={
-                "structured_outputs": {
-                    "grammar": invalid_simplified_sql_grammar
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
                 }
+            ],
+            extra_body={
+                "structured_outputs": {"grammar": invalid_simplified_sql_grammar}
             },
         )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_empty_grammar(client: openai.AsyncOpenAI, model_name: str) -> None:
+    prompt = "Say hello"
+    with pytest.raises((openai.BadRequestError, openai.APIError)):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            extra_body={"structured_outputs": {"grammar": ""}},
+        )
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 9090beb4bbd2a033910d90870d648f62b2e31e94..736ccbefbc4da6125ff8d6a3932075aa5d570d3c 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import regex as re
-import requests
 from openai import BadRequestError
 
 from tests.utils import RemoteOpenAIServer
@@ -31,12 +29,13 @@ def default_server_args():
     ]
 
 
-@pytest.fixture(scope="module",
-                params=[["--no-enable-prefix-caching"],
-                        [
-                            "--no-enable-prefix-caching",
-                            "--disable-frontend-multiprocessing"
-                        ]])
+@pytest.fixture(
+    scope="module",
+    params=[
+        ["--no-enable-prefix-caching"],
+        ["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
+    ],
+)
 def server(default_server_args, request):
     if request.param:
         default_server_args = default_server_args + request.param
@@ -55,12 +54,10 @@ async def client(server):
     "model_name",
     [MODEL_NAME],
 )
-async def test_single_completion(client: openai.AsyncOpenAI,
-                                 model_name: str) -> None:
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str) -> None:
+    completion = await client.completions.create(
+        model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+    )
 
     assert completion.id is not None
     assert completion.choices is not None and len(completion.choices) == 1
@@ -69,7 +66,8 @@ async def test_single_completion(client: openai.AsyncOpenAI,
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+        completion_tokens=5, prompt_tokens=6, total_tokens=11
+    )
 
     # test using token IDs
     completion = await client.completions.create(
@@ -147,11 +145,12 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
     "model_name",
     [MODEL_NAME],
 )
-async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
-                                            model_name: str) -> None:
-
+async def test_too_many_completion_logprobs(
+    client: openai.AsyncOpenAI, model_name: str
+) -> None:
     with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
+        (openai.BadRequestError, openai.APIError)
+    ):  # test using token IDs
         await client.completions.create(
             model=model_name,
             prompt=[0, 0, 0, 0, 0],
@@ -163,7 +162,8 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
         )
         ...
     with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
+        (openai.BadRequestError, openai.APIError)
+    ):  # test using token IDs
         stream = await client.completions.create(
             model=model_name,
             prompt=[0, 0, 0, 0, 0],
@@ -188,13 +188,13 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
-                                                         (MODEL_NAME, 0),
-                                                         (MODEL_NAME, 1),
-                                                         (MODEL_NAME, None)])
-async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
-                                          model_name: str,
-                                          prompt_logprobs: Optional[int]):
+@pytest.mark.parametrize(
+    "model_name, prompt_logprobs",
+    [(MODEL_NAME, -1), (MODEL_NAME, 0), (MODEL_NAME, 1), (MODEL_NAME, None)],
+)
+async def test_prompt_logprobs_completion(
+    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
+):
     params: dict = {
         "prompt": ["A robot may not injure another robot", "My name is"],
         "model": model_name,
@@ -223,8 +223,9 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME],
 )
-async def test_completion_streaming(client: openai.AsyncOpenAI,
-                                    model_name: str) -> None:
+async def test_completion_streaming(
+    client: openai.AsyncOpenAI, model_name: str
+) -> None:
     prompt = "What is an LLM?"
 
     single_completion = await client.completions.create(
@@ -234,11 +235,9 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
         temperature=0.0,
     )
     single_output = single_completion.choices[0].text
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True)
+    stream = await client.completions.create(
+        model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True
+    )
     chunks: list[str] = []
     finish_reason_count = 0
     async for chunk in stream:
@@ -257,8 +256,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME],
 )
-async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
-                                     model_name: str):
+async def test_parallel_no_streaming(client: openai.AsyncOpenAI, model_name: str):
     """Parallel sampling without streaming.
     A single request output contains a list of completions.
     """
@@ -268,27 +266,26 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
     max_tokens = 50  # we want some to finish earlier than others
 
     # High temperature to maximize chance of unique completions.
-    completion = await client.completions.create(model=model_name,
-                                                 prompt=prompt,
-                                                 max_tokens=max_tokens,
-                                                 n=n,
-                                                 temperature=1.0,
-                                                 stream=False,
-                                                 logprobs=0,
-                                                 seed=42)
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        n=n,
+        temperature=1.0,
+        stream=False,
+        logprobs=0,
+        seed=42,
+    )
 
     # Assert `n` completions
     num_completions = len(completion.choices)
-    assert num_completions == n, (
-        f"Num completions {num_completions} but expected {n}.")
+    assert num_completions == n, f"Num completions {num_completions} but expected {n}."
     completion_repeats: dict[str, int] = {}
     output_token_lengths = set()
     for idx, choice in enumerate(completion.choices):
         # Assert correct completion index & some finish reason.
-        assert choice.index == idx, (
-            f"Index {choice.index} but expected {idx}.")
-        assert choice.finish_reason is not None, (
-            "None finish_reason is invalid.")
+        assert choice.index == idx, f"Index {choice.index} but expected {idx}."
+        assert choice.finish_reason is not None, "None finish_reason is invalid."
         text = choice.text
         completion_repeats[text] = completion_repeats.get(text, 0) + 1
         output_token_lengths.add(len(choice.logprobs.tokens))
@@ -297,13 +294,10 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
     # Assert `n` unique completions
     num_unique = len(completion_repeats)
     if num_unique != n:
-        repeats = {
-            txt: num
-            for (txt, num) in completion_repeats.items() if num > 1
-        }
+        repeats = {txt: num for (txt, num) in completion_repeats.items() if num > 1}
         raise AssertionError(
-            f"Expected {n} unique completions, got {num_unique};"
-            f" repeats: {repeats}.")
+            f"Expected {n} unique completions, got {num_unique}; repeats: {repeats}."
+        )
 
 
 @pytest.mark.asyncio
@@ -321,13 +315,15 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
     n = 3
     max_tokens = 50  # we want some to finish earlier than others
 
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=max_tokens,
-                                             n=n,
-                                             temperature=1.0,
-                                             stream=True,
-                                             seed=42)
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        n=n,
+        temperature=1.0,
+        stream=True,
+        seed=42,
+    )
     chunks: list[list[str]] = [[] for _ in range(n)]
     finish_reason_count = 0
     async for chunk in stream:
@@ -338,7 +334,8 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
             finish_reason_count += 1
     # Assert `n` completions with correct finish reasons
     assert finish_reason_count == n, (
-        f"Expected {n} completions with valid indices and finish_reason.")
+        f"Expected {n} completions with valid indices and finish_reason."
+    )
     completion_repeats: dict[str, int] = {}
     chunk_lengths = set()
     for chunk in chunks:
@@ -346,7 +343,8 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
         # Assert correct number of completion tokens
         chunk_lengths.add(chunk_len)
         assert chunk_len <= max_tokens, (
-            f"max_tokens={max_tokens} but chunk len is {chunk_len}.")
+            f"max_tokens={max_tokens} but chunk len is {chunk_len}."
+        )
         text = "".join(chunk)
         completion_repeats[text] = completion_repeats.get(text, 0) + 1
         print(text)
@@ -355,12 +353,10 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
     # Assert `n` unique completions
     num_unique = len(completion_repeats)
     if num_unique != n:
-        repeats = {
-            txt: num
-            for (txt, num) in completion_repeats.items() if num > 1
-        }
-        raise AssertionError(f"{num_unique} unique completions, expected {n};"
-                             f" repeats: {repeats}")
+        repeats = {txt: num for (txt, num) in completion_repeats.items() if num > 1}
+        raise AssertionError(
+            f"{num_unique} unique completions, expected {n}; repeats: {repeats}"
+        )
 
 
 @pytest.mark.asyncio
@@ -368,114 +364,122 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
     "model_name",
     [MODEL_NAME],
 )
-async def test_completion_stream_options(client: openai.AsyncOpenAI,
-                                         model_name: str):
+async def test_completion_stream_options(client: openai.AsyncOpenAI, model_name: str):
     prompt = "What is the capital of France?"
 
     # Test stream=True, stream_options=
     #     {"include_usage": False, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": False,
+            "continuous_usage_stats": False,
+        },
+    )
 
     async for chunk in stream:
         assert chunk.usage is None
 
     # Test stream=True, stream_options=
     #     {"include_usage": False, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": False,
+            "continuous_usage_stats": True,
+        },
+    )
     async for chunk in stream:
         assert chunk.usage is None
 
     # Test stream=True, stream_options=
     #     {"include_usage": True, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": False,
+        },
+    )
     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:
             assert chunk.usage is None
         else:
             assert chunk.usage is None
-            final_chunk = await stream.__anext__()
+            final_chunk = await anext(stream)
             assert final_chunk.usage is not None
             assert final_chunk.usage.prompt_tokens > 0
             assert final_chunk.usage.completion_tokens > 0
             assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
+                final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
+            )
             assert final_chunk.choices == []
 
     # Test stream=True, stream_options=
     #     {"include_usage": True, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+    )
     async for chunk in stream:
         assert chunk.usage is not None
         assert chunk.usage.prompt_tokens > 0
         assert chunk.usage.completion_tokens > 0
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
+        assert chunk.usage.total_tokens == (
+            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+        )
         if chunk.choices[0].finish_reason is not None:
-            final_chunk = await stream.__anext__()
+            final_chunk = await anext(stream)
             assert final_chunk.usage is not None
             assert final_chunk.usage.prompt_tokens > 0
             assert final_chunk.usage.completion_tokens > 0
             assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
+                final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
+            )
             assert final_chunk.choices == []
 
     # Test stream=False, stream_options=
     #     {"include_usage": None}
     with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": None})
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": None},
+        )
 
     # Test stream=False, stream_options=
     #    {"include_usage": True}
     with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": True})
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": True},
+        )
 
     # Test stream=False, stream_options=
     #     {"continuous_usage_stats": None}
@@ -486,7 +490,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
             max_tokens=5,
             temperature=0.0,
             stream=False,
-            stream_options={"continuous_usage_stats": None})
+            stream_options={"continuous_usage_stats": None},
+        )
 
     # Test stream=False, stream_options=
     #    {"continuous_usage_stats": True}
@@ -497,7 +502,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
             max_tokens=5,
             temperature=0.0,
             stream=False,
-            stream_options={"continuous_usage_stats": True})
+            stream_options={"continuous_usage_stats": True},
+        )
 
 
 @pytest.mark.asyncio
@@ -528,15 +534,19 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
             extra_body=dict(
                 # NOTE: this has to be true for n > 1 in vLLM, but
                 # not necessary for official client.
-                use_beam_search=True),
+                use_beam_search=True
+            ),
         )
         assert len(batch.choices) == 4
-        assert batch.choices[0].text != batch.choices[
-            1].text, "beam search should be different"
-        assert batch.choices[0].text == batch.choices[
-            2].text, "two copies of the same prompt should be the same"
-        assert batch.choices[1].text == batch.choices[
-            3].text, "two copies of the same prompt should be the same"
+        assert batch.choices[0].text != batch.choices[1].text, (
+            "beam search should be different"
+        )
+        assert batch.choices[0].text == batch.choices[2].text, (
+            "two copies of the same prompt should be the same"
+        )
+        assert batch.choices[1].text == batch.choices[3].text, (
+            "two copies of the same prompt should be the same"
+        )
 
         # test streaming
         batch = await client.completions.create(
@@ -560,31 +570,30 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
     [MODEL_NAME],
 )
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
-                                       model_name: str, logprobs_arg: int):
+async def test_echo_logprob_completion(
+    client: openai.AsyncOpenAI, model_name: str, logprobs_arg: int
+):
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
     # test using text and token IDs
     for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
-        completion = await client.completions.create(model=model_name,
-                                                     prompt=prompt,
-                                                     max_tokens=5,
-                                                     temperature=0.0,
-                                                     echo=True,
-                                                     logprobs=logprobs_arg)
-
-        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
-                                                             list) else prompt
+        completion = await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            echo=True,
+            logprobs=logprobs_arg,
+        )
+
+        prompt_text = tokenizer.decode(prompt) if isinstance(prompt, list) else prompt
         assert re.search(r"^" + prompt_text, completion.choices[0].text)
         logprobs = completion.choices[0].logprobs
         assert logprobs is not None
         assert len(logprobs.text_offset) > 5
-        assert (len(logprobs.token_logprobs) > 5
-                and logprobs.token_logprobs[0] is None)
-        assert (len(logprobs.top_logprobs) > 5
-                and logprobs.top_logprobs[0] is None)
+        assert len(logprobs.token_logprobs) > 5 and logprobs.token_logprobs[0] is None
+        assert len(logprobs.top_logprobs) > 5 and logprobs.top_logprobs[0] is None
         for top_logprobs in logprobs.top_logprobs[1:]:
-            assert max(logprobs_arg,
-                       1) <= len(top_logprobs) <= logprobs_arg + 1
+            assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
         assert len(logprobs.tokens) > 5
 
 
@@ -593,8 +602,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME],
 )
-async def test_invalid_json_schema(client: openai.AsyncOpenAI,
-                                   model_name: str) -> None:
+async def test_invalid_json_schema(client: openai.AsyncOpenAI, model_name: str) -> None:
     invalid_json_schema = {
         "$defs": {
             "CarType": {
@@ -604,32 +612,24 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
             }
         },
         "properties": {
-            "brand": {
-                "title": "Brand",
-                "type": "string"
-            },
-            "model": {
-                "title": "Model",
-                "type": "string"
-            },
-            "car_type": {
-                "$ref": "#/$defs/CarType"
-            },
+            "brand": {"title": "Brand", "type": "string"},
+            "model": {"title": "Model", "type": "string"},
+            "car_type": {"$ref": "#/$defs/CarType"},
             "foo": "bar",
         },
         "required": ["brand", "model", "car_type"],
         "title": "CarDescription",
         "type": "object",
     }
-    prompt = ("Generate a JSON with the brand, model and car_type of"
-              "the most iconic car from the 90's")
+    prompt = (
+        "Generate a JSON with the brand, model and car_type of"
+        "the most iconic car from the 90's"
+    )
     with pytest.raises((openai.BadRequestError, openai.APIError)):
         await client.completions.create(
             model=model_name,
             prompt=prompt,
-            extra_body={"structured_outputs": {
-                "json": invalid_json_schema
-            }},
+            extra_body={"structured_outputs": {"json": invalid_json_schema}},
         )
 
 
@@ -639,20 +639,17 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
     [MODEL_NAME],
 )
 async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
-    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
-              "End in .com and new line. Example result:"
-              "alan.turing@enigma.com\n")
+    prompt = (
+        "Generate an email address for Alan Turing, who works in Enigma."
+        "End in .com and new line. Example result:"
+        "alan.turing@enigma.com\n"
+    )
 
     with pytest.raises((openai.BadRequestError, openai.APIError)):
         await client.completions.create(
             model=model_name,
             prompt=prompt,
-            extra_body={
-                "structured_outputs": {
-                    "regex": r"[.*"
-                },
-                "stop": ["\n"]
-            },
+            extra_body={"structured_outputs": {"regex": r"[.*"}, "stop": ["\n"]},
         )
 
 
@@ -676,29 +673,15 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
         number ::= "1 " | "2 "
     """
 
-    prompt = ("Generate an SQL query to show the 'username' and 'email'"
-              "from the 'users' table.")
+    prompt = (
+        "Generate an SQL query to show the 'username' and 'email'"
+        "from the 'users' table."
+    )
     with pytest.raises((openai.BadRequestError, openai.APIError)):
         await client.completions.create(
             model=model_name,
             prompt=prompt,
             extra_body={
-                "structured_outputs": {
-                    "grammar": invalid_simplified_sql_grammar
-                }
+                "structured_outputs": {"grammar": invalid_simplified_sql_grammar}
             },
         )
-
-
-@pytest.mark.asyncio
-async def test_completion_with_empty_prompt_embeds(
-        client: openai.AsyncOpenAI) -> None:
-    """Test completion with empty prompt embeds."""
-    payload: dict[str, object] = {"prompt": "Hello", "prompt_embeds": []}
-    headers: dict[str, str] = {"Content-Type": "application/json"}
-    # base_url = http://localhost:8000/v1/completions
-    response = requests.post(f"{client.base_url}completions",
-                             headers=headers,
-                             json=payload)
-    assert response.status_code == 200, (
-        f"Expected status code 200, got {response.status_code}. ")
diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
index 41f1d02bf78706cac7ae3432b19ddf390c2ca01a..276de2ff8e2cdda25d61809f2ef80947b7f693f6 100644
--- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
+++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
@@ -32,14 +32,15 @@ def default_image_embeds_server_args() -> list[str]:
         "--enforce-eager",
         "--limit-mm-per-prompt",
         json.dumps({"image": MAXIMUM_IMAGES}),
+        "--enable-mm-embeds",
     ]
 
 
 @pytest.fixture(scope="module")
 def server_with_image_embeds(default_image_embeds_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_embeds_server_args,
-                            max_wait_seconds=600) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, default_image_embeds_server_args, max_wait_seconds=600
+    ) as remote_server:
         yield remote_server
 
 
@@ -57,7 +58,7 @@ def encode_image_embedding_to_base64(image_embedding) -> str:
     torch.save(image_embedding, buffer)
     buffer.seek(0)
     binary_data = buffer.read()
-    base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
+    base64_image_embedding = base64.b64encode(binary_data).decode("utf-8")
     return base64_image_embedding
 
 
@@ -75,19 +76,13 @@ async def test_completions_with_image_embeds(
     base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
     chat_completion = await client_with_image_embeds.chat.completions.create(
         messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
             {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            },
-            {
-                "role":
-                "user",
+                "role": "user",
                 "content": [
                     {
-                        "type":
-                        "text",
-                        "text":
-                        "Describe these images separately. For each image,"
+                        "type": "text",
+                        "text": "Describe these images separately. For each image,"
                         "reply with a short sentence (no more than 10 words).",
                     },
                     {
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
index f7c31b0c43778ff6b2a99d4a6aa8b7c92a55d286..db52aef70f6078dd0406c39991f9782c4b0adf8c 100644
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -8,9 +8,9 @@ import pytest
 import pytest_asyncio
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 
-MODEL_NAME = "ibm-research/PowerMoE-3b"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 DP_SIZE = os.getenv("DP_SIZE", "1")
 
@@ -50,16 +50,13 @@ async def client(server):
     "model_name",
     [MODEL_NAME],
 )
-async def test_single_completion(client: openai.AsyncOpenAI,
-                                 server: RemoteOpenAIServer,
-                                 model_name: str) -> None:
-
+async def test_single_completion(
+    client: openai.AsyncOpenAI, server: RemoteOpenAIServer, model_name: str
+) -> None:
     async def make_request():
         completion = await client.completions.create(
-            model=model_name,
-            prompt="Hello, my name is",
-            max_tokens=10,
-            temperature=1.0)
+            model=model_name, prompt="Hello, my name is", max_tokens=10, temperature=1.0
+        )
 
         assert completion.id is not None
         assert completion.choices is not None and len(completion.choices) == 1
@@ -108,9 +105,9 @@ async def test_single_completion(client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME],
 )
-async def test_completion_streaming(client: openai.AsyncOpenAI,
-                                    server: RemoteOpenAIServer,
-                                    model_name: str) -> None:
+async def test_completion_streaming(
+    client: openai.AsyncOpenAI, server: RemoteOpenAIServer, model_name: str
+) -> None:
     prompt = "What is an LLM?"
 
     async def make_streaming_request():
@@ -124,11 +121,9 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
         single_output = single_completion.choices[0].text
 
         # Perform the streaming request
-        stream = await client.completions.create(model=model_name,
-                                                 prompt=prompt,
-                                                 max_tokens=5,
-                                                 temperature=0.0,
-                                                 stream=True)
+        stream = await client.completions.create(
+            model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True
+        )
         chunks: list[str] = []
         finish_reason_count = 0
         last_chunk = None
@@ -139,16 +134,15 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
             last_chunk = chunk  # Keep track of the last chunk
 
         # finish reason should only return in the last block for OpenAI API
-        assert finish_reason_count == 1, (
-            "Finish reason should appear exactly once.")
-        assert last_chunk is not None, (
-            "Stream should have yielded at least one chunk.")
-        assert last_chunk.choices[
-            0].finish_reason == "length", "Finish reason should be 'length'."
+        assert finish_reason_count == 1, "Finish reason should appear exactly once."
+        assert last_chunk is not None, "Stream should have yielded at least one chunk."
+        assert last_chunk.choices[0].finish_reason == "length", (
+            "Finish reason should be 'length'."
+        )
         # Check that the combined text matches the non-streamed version.
-        assert "".join(
-            chunks
-        ) == single_output, "Streamed output should match non-streamed output."
+        assert "".join(chunks) == single_output, (
+            "Streamed output should match non-streamed output."
+        )
         return True  # Indicate success for this request
 
     # Test single request
@@ -162,9 +156,9 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
     tasks = [make_streaming_request() for _ in range(num_requests)]
     results = await asyncio.gather(*tasks)
 
-    assert len(
-        results
-    ) == num_requests, f"Expected {num_requests} results, got {len(results)}"
+    assert len(results) == num_requests, (
+        f"Expected {num_requests} results, got {len(results)}"
+    )
     assert all(results), "Not all streaming requests completed successfully."
 
     await asyncio.sleep(0.5)
@@ -172,9 +166,9 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
     tasks = [make_streaming_request() for _ in range(num_requests)]
     results = await asyncio.gather(*tasks)
 
-    assert len(
-        results
-    ) == num_requests, f"Expected {num_requests} results, got {len(results)}"
+    assert len(results) == num_requests, (
+        f"Expected {num_requests} results, got {len(results)}"
+    )
     assert all(results), "Not all streaming requests completed successfully."
 
     # Check request balancing via Prometheus metrics if DP_SIZE > 1
diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py
index 4e83e2f9d4b63371b54810365f82953f52fc8539..e9f635378e577654eaa00a09cb5b821772c0e172 100644
--- a/tests/v1/executor/test_executor.py
+++ b/tests/v1/executor/test_executor.py
@@ -3,10 +3,13 @@
 
 import asyncio
 import os
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from concurrent.futures import Future
+from typing import Any
 
 import pytest
 
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncLLM
@@ -14,23 +17,32 @@ from vllm.v1.engine.llm_engine import LLMEngine
 from vllm.v1.executor.multiproc_executor import MultiprocExecutor
 
 
-class Mock:
-    ...
+class Mock: ...
 
 
 class CustomMultiprocExecutor(MultiprocExecutor):
-
-    def collective_rpc(self,
-                       method: Union[str, Callable],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict] = None,
-                       non_block: bool = False,
-                       unique_reply_rank: Optional[int] = None) -> list[Any]:
+    def collective_rpc(
+        self,
+        method: str | Callable,
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+        non_block: bool = False,
+        unique_reply_rank: int | None = None,
+        kv_output_aggregator: KVOutputAggregator = None,
+    ) -> Any | list[Any] | Future[Any | list[Any]]:
         # Drop marker to show that this was run
         with open(".marker", "w"):
             ...
-        return super().collective_rpc(method, timeout, args, kwargs)
+        return super().collective_rpc(
+            method,
+            timeout,
+            args,
+            kwargs,
+            non_block,
+            unique_reply_rank,
+            kv_output_aggregator,
+        )
 
 
 CustomMultiprocExecutorAsync = CustomMultiprocExecutor
@@ -47,17 +59,22 @@ def test_custom_executor_type_checking():
         )
         LLMEngine.from_engine_args(engine_args)
     with pytest.raises(ValueError):
-        engine_args = AsyncEngineArgs(model=MODEL,
-                                      gpu_memory_utilization=0.2,
-                                      max_model_len=8192,
-                                      distributed_executor_backend=Mock)
+        engine_args = AsyncEngineArgs(
+            model=MODEL,
+            gpu_memory_utilization=0.2,
+            max_model_len=8192,
+            distributed_executor_backend=Mock,
+        )
         AsyncLLM.from_engine_args(engine_args)
 
 
-@pytest.mark.parametrize("distributed_executor_backend", [
-    CustomMultiprocExecutor,
-    "tests.v1.executor.test_executor.CustomMultiprocExecutor"
-])
+@pytest.mark.parametrize(
+    "distributed_executor_backend",
+    [
+        CustomMultiprocExecutor,
+        "tests.v1.executor.test_executor.CustomMultiprocExecutor",
+    ],
+)
 def test_custom_executor(distributed_executor_backend, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -82,10 +99,13 @@ def test_custom_executor(distributed_executor_backend, tmp_path):
         os.chdir(cwd)
 
 
-@pytest.mark.parametrize("distributed_executor_backend", [
-    CustomMultiprocExecutorAsync,
-    "tests.v1.executor.test_executor.CustomMultiprocExecutorAsync"
-])
+@pytest.mark.parametrize(
+    "distributed_executor_backend",
+    [
+        CustomMultiprocExecutorAsync,
+        "tests.v1.executor.test_executor.CustomMultiprocExecutorAsync",
+    ],
+)
 def test_custom_executor_async(distributed_executor_backend, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -103,9 +123,9 @@ def test_custom_executor_async(distributed_executor_backend, tmp_path):
         sampling_params = SamplingParams(max_tokens=1)
 
         async def t():
-            stream = engine.generate(request_id="0",
-                                     prompt="foo",
-                                     sampling_params=sampling_params)
+            stream = engine.generate(
+                request_id="0", prompt="foo", sampling_params=sampling_params
+            )
             async for x in stream:
                 ...
 
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index bc88370791096a90c843127b8c014afd2aa45300..ebc8575e5b3901261421268fb5d3e82f5184bae8 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -1,16 +1,56 @@
 #!/bin/bash
 set -xe
 
+# Parse command line arguments
+KV_BUFFER_DEVICE="cuda"  # Default to cuda
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --kv_buffer_device)
+      KV_BUFFER_DEVICE="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option $1"
+      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+      exit 1
+      ;;
+  esac
+done
+
+echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
+
+DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
+if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
+  KV_CONFIG_HETERO_LAYOUT=',"enable_permute_local_kv":"True"'
+else
+  KV_CONFIG_HETERO_LAYOUT=''
+fi
+
+# Build the kv-transfer-config once
+if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
+  KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"'${KV_CONFIG_HETERO_LAYOUT}'}'
+else
+  KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\""${KV_CONFIG_HETERO_LAYOUT}"}"
+fi
+
 # Models to run
-MODELS=(
-    "Qwen/Qwen3-0.6B"
-)
+MODEL_NAMES=${MODEL_NAMES:-}
+if [[ -n "$MODEL_NAMES" ]]; then
+  MODELS=("$MODEL_NAMES")
+else
+  MODELS=(
+      "Qwen/Qwen3-0.6B"
+  )
+fi
 
 # Number of prefill and decode instances to create
 NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1
 NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}   # Default to 1
 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
 DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
+GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
+PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16}
+DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16}
 
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -76,24 +116,32 @@ run_tests_for_model() {
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
     GPU_ID=$((i % $(get_num_gpus)))
+    NEXT_GPU=${GPU_ID}
+    # If PREFILLER_TP_SIZE is more than 1
+    for (( j=1; j < PREFILLER_TP_SIZE; j++ )); do
+      NEXT_GPU=$(((GPU_ID + j) % $(get_num_gpus)))
+      GPU_ID="${GPU_ID},${NEXT_GPU}"
+    done
 
     # Calculate port number (base port + instance number)
     PORT=$((8100 + i))
-    # Calculate side channel port. Avoid clash with with TP workers. 
+    # Calculate side channel port. Avoid clash with with TP workers.
     SIDE_CHANNEL_PORT=$((5559 + i))
 
     echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
 
     # Build the command with or without model-specific args
     BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \
+    VLLM_KV_CACHE_LAYOUT='HND' \
     UCX_NET_DEVICES=all \
     VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \
     vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
-    --gpu-memory-utilization 0.2 \
+    --block-size ${PREFILL_BLOCK_SIZE} \
+    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --tensor-parallel-size $PREFILLER_TP_SIZE \
-    --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+    --kv-transfer-config '$KV_CONFIG'"
 
     if [ -n "$model_args" ]; then
     FULL_CMD="$BASE_CMD $model_args"
@@ -111,7 +159,12 @@ run_tests_for_model() {
   # Start decode instances
   for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
-    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
+    GPU_ID=$(((i + NEXT_GPU + 1) % $(get_num_gpus)))
+    # If DECODER_TP_SIZE is more than 1
+    for (( j=1; j < DECODER_TP_SIZE; j++ )); do
+      NEXT_GPU=$(((GPU_ID + j) % $(get_num_gpus)))
+      GPU_ID="${GPU_ID},${NEXT_GPU}"
+    done
     # Calculate port number (base port + instance number)
     PORT=$((8200 + i))
     # Calculate side channel port
@@ -121,14 +174,24 @@ run_tests_for_model() {
 
     # Build the command with or without model-specific args
     BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \
+    VLLM_KV_CACHE_LAYOUT=$DECODER_KV_LAYOUT \
     UCX_NET_DEVICES=all \
     VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \
     vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
-    --gpu-memory-utilization 0.2 \
-    --tensor-parallel-size $DECODER_TP_SIZE \
-    --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+    --block-size ${DECODE_BLOCK_SIZE} \
+    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+    --kv-transfer-config '$KV_CONFIG'"
+  
+  # DP-EP attention mode
+  if [[ -z "$DP_EP" ]]; then
+    BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"
+  else
+    echo "DP-EP Attention enabled, deploying with dp=DECODER_TP_SIZE and tp=1"
+    BASE_CMD="${BASE_CMD} --data-parallel-size $DECODER_TP_SIZE \
+    --tensor-parallel-size 1 --enable-expert-parallel"
+  fi
 
     if [ -n "$model_args" ]; then
     FULL_CMD="$BASE_CMD $model_args"
@@ -155,7 +218,7 @@ run_tests_for_model() {
   done
 
   # Build the command for the proxy server with all the hosts and ports
-  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"
+  PROXY_CMD="python3 ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"
 
   # Add all prefill hosts and ports
   PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
@@ -174,7 +237,7 @@ run_tests_for_model() {
 
   # Run lm eval for this model
   echo "Running tests for $model_name"
-  TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+  TEST_MODEL=$model_name python3 -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py
 
   # Clean up before running next model
   cleanup_instances
diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
old mode 100644
new mode 100755
index b64461292910d32730a44d769ab787c3cb9608b8..c48b452e24cd48d6751a900801666275c41d970b
--- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
@@ -1,6 +1,33 @@
 #!/bin/bash
 set -xe
 
+# Parse command line arguments
+KV_BUFFER_DEVICE="cuda"  # Default to cuda
+PREFILL_GPU_ID=4         # Default GPU IDs
+DECODE_GPU_ID=5
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --kv_buffer_device)
+      KV_BUFFER_DEVICE="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option $1"
+      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+      exit 1
+      ;;
+  esac
+done
+
+echo "Running edge case tests with kv_buffer_device=$KV_BUFFER_DEVICE (GPUs: $PREFILL_GPU_ID, $DECODE_GPU_ID)"
+
+# Build the kv-transfer-config once
+if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
+  KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+else
+  KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}"
+fi
+
 # Models to run
 MODELS=(
     "Qwen/Qwen3-0.6B"
@@ -50,15 +77,15 @@ run_tests_for_model() {
 
   # Get model-specific arguments
   local model_args=$(get_model_args "$model_name")
-  
+
   # Start prefill instance
   PREFILL_PORT=8001
 
-  BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
+  BASE_CMD="CUDA_VISIBLE_DEVICES=$PREFILL_GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
   --port $PREFILL_PORT \
   --enforce-eager \
   --gpu-memory-utilization 0.2 \
-  --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+  --kv-transfer-config '$KV_CONFIG'"
 
   if [ -n "$model_args" ]; then
   FULL_CMD="$BASE_CMD $model_args"
@@ -72,11 +99,11 @@ run_tests_for_model() {
   DECODE_PORT=8002
 
   # Build the command with or without model-specific args
-  BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
+  BASE_CMD="CUDA_VISIBLE_DEVICES=$DECODE_GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
   --port $DECODE_PORT \
   --enforce-eager \
   --gpu-memory-utilization 0.2 \
-  --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+  --kv-transfer-config '$KV_CONFIG'"
 
   if [ -n "$model_args" ]; then
   FULL_CMD="$BASE_CMD $model_args"
diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
index ea125f99fc42cd8e73cdd9a0525dd80673940783..fa1738bb31940397c1ce9ae8a7dfde21e38d97a2 100644
--- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
@@ -53,7 +53,6 @@ cleanup() {
 launch_baseline() {
   BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
   VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
   PJRT_DEVICE=TPU \
   VLLM_WORKER_MULTIPROC_METHOD=spawn \
   VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
@@ -73,7 +72,6 @@ launch_pd() {
   UCX_TLS=tcp \
   VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
   VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
   VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
   VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
   PJRT_DEVICE=TPU \
@@ -93,7 +91,6 @@ launch_pd() {
   UCX_TLS=tcp \
   VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
   VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
   PJRT_DEVICE=TPU \
   VLLM_WORKER_MULTIPROC_METHOD=spawn \
   VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
index 8ba653770c4f04e429259b6844a8ce42b0763ab4..3d63822371bed37533dbb0c47c3e43e4abbe2714 100644
--- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
@@ -55,7 +55,6 @@ launch_pd() {
   UCX_TLS=tcp \
   VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
   VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
   VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
   VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
   PJRT_DEVICE=TPU \
@@ -75,7 +74,6 @@ launch_pd() {
   UCX_TLS=tcp \
   VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
   VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
   PJRT_DEVICE=TPU \
   VLLM_WORKER_MULTIPROC_METHOD=spawn \
   VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
index e5d66ffeeeb2395b4119dda05b0454bab2254210..a70f4caeb937081ad3476acf1a7657e2dbfd07a9 100644
--- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
@@ -14,10 +14,15 @@ RTOL = 0.03
 # Model-specific expected values
 EXPECTED_VALUES = {
     "Qwen/Qwen3-0.6B": 0.41,
-    "deepseek-ai/deepseek-vl2-small": 0.59
+    "deepseek-ai/deepseek-vl2-small": 0.59,
+    "deepseek-ai/deepseek-vl2-tiny": 0.19,
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": 0.65,
 }
 
-SIMPLE_PROMPT = "The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",  # noqa: E501
+SIMPLE_PROMPT = (
+    "The best part about working on vLLM is that I got to meet so many people across "
+    "various different organizations like UCB, Google, and Meta which means",
+)
 
 # Get model name from environment variable
 MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
@@ -25,8 +30,7 @@ MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
 
 def run_simple_prompt():
     client = openai.OpenAI(api_key="EMPTY", base_url=BASE_URL)
-    completion = client.completions.create(model=MODEL_NAME,
-                                           prompt=SIMPLE_PROMPT)
+    completion = client.completions.create(model=MODEL_NAME, prompt=SIMPLE_PROMPT)
 
     print("-" * 50)
     print(f"Completion results for {MODEL_NAME}:")
@@ -38,9 +42,11 @@ def test_accuracy():
     """Run the end to end accuracy test."""
     run_simple_prompt()
 
-    model_args = (f"model={MODEL_NAME},"
-                  f"base_url={BASE_URL}/completions,"
-                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+    model_args = (
+        f"model={MODEL_NAME},"
+        f"base_url={BASE_URL}/completions,"
+        f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
+    )
 
     results = lm_eval.simple_evaluate(
         model="local-completions",
@@ -52,11 +58,14 @@ def test_accuracy():
     expected_value = EXPECTED_VALUES.get(MODEL_NAME)
 
     if expected_value is None:
-        print(f"Warning: No expected value found for {MODEL_NAME}. "
-              "Skipping accuracy check.")
+        print(
+            f"Warning: No expected value found for {MODEL_NAME}. "
+            "Skipping accuracy check."
+        )
         print(f"Measured value: {measured_value}")
         return
 
-    assert (measured_value - RTOL < expected_value
-            and measured_value + RTOL > expected_value
-            ), f"Expected: {expected_value} | Measured: {measured_value}"
+    assert (
+        measured_value - RTOL < expected_value
+        and measured_value + RTOL > expected_value
+    ), f"Expected: {expected_value} | Measured: {measured_value}"
diff --git a/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py
index 697e101c3592672c15eec6cdfecaac49e07d60f7..caa4aab870abe2e220f041b6609d3eaf471215b3 100644
--- a/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py
@@ -43,37 +43,39 @@ def check_vllm_server(url: str, timeout=5, retries=3) -> bool:
             if response.status_code == 200:
                 return True
             else:
-                print(f"Attempt {attempt + 1}: Server returned status code "
-                      "{response.status_code}")
+                print(
+                    f"Attempt {attempt + 1}: Server returned status code "
+                    "{response.status_code}"
+                )
         except requests.exceptions.RequestException as e:
             print(f"Attempt {attempt + 1}: Error connecting to server: {e}")
         time.sleep(1)  # Wait before retrying
     return False
 
 
-def run_simple_prompt(base_url: str, model_name: str, input_prompt: str,
-                      use_chat_endpoint: bool) -> str:
+def run_simple_prompt(
+    base_url: str, model_name: str, input_prompt: str, use_chat_endpoint: bool
+) -> str:
     client = openai.OpenAI(api_key="EMPTY", base_url=base_url)
     if use_chat_endpoint:
         completion = client.chat.completions.create(
             model=model_name,
-            messages=[{
-                "role": "user",
-                "content": [{
-                    "type": "text",
-                    "text": input_prompt
-                }]
-            }],
+            messages=[
+                {"role": "user", "content": [{"type": "text", "text": input_prompt}]}
+            ],
             max_completion_tokens=MAX_OUTPUT_LEN,
             temperature=0.0,
-            seed=42)
+            seed=42,
+        )
         return completion.choices[0].message.content
     else:
-        completion = client.completions.create(model=model_name,
-                                               prompt=input_prompt,
-                                               max_tokens=MAX_OUTPUT_LEN,
-                                               temperature=0.0,
-                                               seed=42)
+        completion = client.completions.create(
+            model=model_name,
+            prompt=input_prompt,
+            max_tokens=MAX_OUTPUT_LEN,
+            temperature=0.0,
+            seed=42,
+        )
 
         return completion.choices[0].text
 
@@ -90,7 +92,8 @@ def main():
         "--service_url",  # Name of the first argument
         type=str,
         required=True,
-        help="The vLLM service URL.")
+        help="The vLLM service URL.",
+    )
 
     parser.add_argument(
         "--model_name",  # Name of the first argument
@@ -127,28 +130,30 @@ def main():
         if not os.path.exists(args.file_name):
             raise ValueError(
                 f"In disagg mode, the output file {args.file_name} from "
-                "non-disagg. baseline does not exist.")
+                "non-disagg. baseline does not exist."
+            )
 
     service_url = f"{args.service_url}/v1"
 
     if not check_vllm_server(health_check_url):
-        raise RuntimeError(
-            f"vllm server: {args.service_url} is not ready yet!")
+        raise RuntimeError(f"vllm server: {args.service_url} is not ready yet!")
 
     output_strs = dict()
     for i, prompt in enumerate(SAMPLE_PROMPTS):
-        use_chat_endpoint = (i % 2 == 1)
-        output_str = run_simple_prompt(base_url=service_url,
-                                       model_name=args.model_name,
-                                       input_prompt=prompt,
-                                       use_chat_endpoint=use_chat_endpoint)
+        use_chat_endpoint = i % 2 == 1
+        output_str = run_simple_prompt(
+            base_url=service_url,
+            model_name=args.model_name,
+            input_prompt=prompt,
+            use_chat_endpoint=use_chat_endpoint,
+        )
         print(f"Prompt: {prompt}, output: {output_str}")
         output_strs[prompt] = output_str
 
     if args.mode == "baseline":
         # baseline: save outputs
         try:
-            with open(args.file_name, 'w') as json_file:
+            with open(args.file_name, "w") as json_file:
                 json.dump(output_strs, json_file, indent=4)
         except OSError as e:
             print(f"Error writing to file: {e}")
diff --git a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
index 8439e30be154b12cfd8e4ebdd4971e810d9b568f..268a1845a2bbabb8bf668087d1ea2a7e41fb9ce6 100644
--- a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
+++ b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
@@ -12,8 +12,7 @@ PROXY_HOST = os.getenv("PROXY_HOST", "localhost")
 PROXY_PORT = os.getenv("PROXY_PORT", None)
 
 if PREFILL_PORT is None or DECODE_PORT is None or PROXY_PORT is None:
-    raise ValueError(
-        "Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.")
+    raise ValueError("Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.")
 
 LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, "  # noqa: E501
 PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result,"  # noqa: E501
@@ -41,13 +40,13 @@ def test_edge_cases():
 
     # (1) Check that we can handle a very short prompt,
     # less than the length of the block size.
-    completion = proxy_client.completions.create(model=MODEL,
-                                                 prompt=SHORT_PROMPT,
-                                                 temperature=0)
+    completion = proxy_client.completions.create(
+        model=MODEL, prompt=SHORT_PROMPT, temperature=0
+    )
     proxy_response = completion.choices[0].text
-    completion = prefill_client.completions.create(model=MODEL,
-                                                   prompt=SHORT_PROMPT,
-                                                   temperature=0)
+    completion = prefill_client.completions.create(
+        model=MODEL, prompt=SHORT_PROMPT, temperature=0
+    )
     prefill_response = completion.choices[0].text
     print(f"SMALL PROMPT: {proxy_response=}")
     assert proxy_response == prefill_response
@@ -55,27 +54,27 @@ def test_edge_cases():
     # (2) Check that we can handle a full prefix cache
     # hit on the D worker but not on the P worker.
     # (2a): prime the D worker.
-    completion = decode_client.completions.create(model=MODEL,
-                                                  prompt=PROMPT,
-                                                  temperature=0)
+    completion = decode_client.completions.create(
+        model=MODEL, prompt=PROMPT, temperature=0
+    )
     decode_response = completion.choices[0].text
     # (2b): send via the P/D setup
-    completion = proxy_client.completions.create(model=MODEL,
-                                                 prompt=PROMPT,
-                                                 temperature=0)
+    completion = proxy_client.completions.create(
+        model=MODEL, prompt=PROMPT, temperature=0
+    )
     proxy_response = completion.choices[0].text
     print(f"FULL CACHE HIT: {proxy_response=}")
     assert proxy_response == decode_response
 
     # (3) Check that we can handle a partial prefix cache
     # hit on the D worker.
-    completion = proxy_client.completions.create(model=MODEL,
-                                                 prompt=LONG_PROMPT,
-                                                 temperature=0)
+    completion = proxy_client.completions.create(
+        model=MODEL, prompt=LONG_PROMPT, temperature=0
+    )
     proxy_response = completion.choices[0].text
-    completion = prefill_client.completions.create(model=MODEL,
-                                                   prompt=LONG_PROMPT,
-                                                   temperature=0)
+    completion = prefill_client.completions.create(
+        model=MODEL, prompt=LONG_PROMPT, temperature=0
+    )
     prefill_response = completion.choices[0].text
     print(f"PARTIAL CACHE HIT: {proxy_response=}")
     assert proxy_response == prefill_response
diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
index 905ae0ea717225ba54735704806c211f9287ec9f..5768fcdb57ceb89571d22665d70c7ec9aac8fc4e 100644
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@@ -27,49 +27,45 @@ async def lifespan(app: FastAPI):
 
     # Create prefill clients
     for i, (host, port) in enumerate(global_args.prefiller_instances):
-        prefiller_base_url = f'http://{host}:{port}/v1'
-        app.state.prefill_clients.append({
-            'client':
-            httpx.AsyncClient(timeout=None, base_url=prefiller_base_url),
-            'host':
-            host,
-            'port':
-            port,
-            'id':
-            i
-        })
+        prefiller_base_url = f"http://{host}:{port}/v1"
+        app.state.prefill_clients.append(
+            {
+                "client": httpx.AsyncClient(timeout=None, base_url=prefiller_base_url),
+                "host": host,
+                "port": port,
+                "id": i,
+            }
+        )
 
     # Create decode clients
     for i, (host, port) in enumerate(global_args.decoder_instances):
-        decoder_base_url = f'http://{host}:{port}/v1'
-        app.state.decode_clients.append({
-            'client':
-            httpx.AsyncClient(timeout=None, base_url=decoder_base_url),
-            'host':
-            host,
-            'port':
-            port,
-            'id':
-            i
-        })
+        decoder_base_url = f"http://{host}:{port}/v1"
+        app.state.decode_clients.append(
+            {
+                "client": httpx.AsyncClient(timeout=None, base_url=decoder_base_url),
+                "host": host,
+                "port": port,
+                "id": i,
+            }
+        )
 
     # Initialize round-robin iterators
-    app.state.prefill_iterator = itertools.cycle(
-        range(len(app.state.prefill_clients)))
-    app.state.decode_iterator = itertools.cycle(
-        range(len(app.state.decode_clients)))
+    app.state.prefill_iterator = itertools.cycle(range(len(app.state.prefill_clients)))
+    app.state.decode_iterator = itertools.cycle(range(len(app.state.decode_clients)))
 
-    print(f"Initialized {len(app.state.prefill_clients)} prefill clients "
-          f"and {len(app.state.decode_clients)} decode clients.")
+    print(
+        f"Initialized {len(app.state.prefill_clients)} prefill clients "
+        f"and {len(app.state.decode_clients)} decode clients."
+    )
 
     yield
 
     # Shutdown: Close all clients
     for client_info in app.state.prefill_clients:
-        await client_info['client'].aclose()
+        await client_info["client"].aclose()
 
     for client_info in app.state.decode_clients:
-        await client_info['client'].aclose()
+        await client_info["client"].aclose()
 
 
 # Update FastAPI app initialization to use lifespan
@@ -80,46 +76,42 @@ def parse_args():
     parser = argparse.ArgumentParser()
 
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--host", type=str, default="localhost")
+    # Always use 127.0.0.1 as localhost binds to IPv6 which is blocked on CI
+    parser.add_argument("--host", type=str, default="127.0.0.1")
 
     # For prefiller instances
-    parser.add_argument("--prefiller-hosts",
-                        "--prefiller-host",
-                        type=str,
-                        nargs="+",
-                        default=["localhost"])
-    parser.add_argument("--prefiller-ports",
-                        "--prefiller-port",
-                        type=int,
-                        nargs="+",
-                        default=[8100])
+    parser.add_argument(
+        "--prefiller-hosts",
+        "--prefiller-host",
+        type=str,
+        nargs="+",
+        default=["localhost"],
+    )
+    parser.add_argument(
+        "--prefiller-ports", "--prefiller-port", type=int, nargs="+", default=[8100]
+    )
 
     # For decoder instances
-    parser.add_argument("--decoder-hosts",
-                        "--decoder-host",
-                        type=str,
-                        nargs="+",
-                        default=["localhost"])
-    parser.add_argument("--decoder-ports",
-                        "--decoder-port",
-                        type=int,
-                        nargs="+",
-                        default=[8200])
+    parser.add_argument(
+        "--decoder-hosts", "--decoder-host", type=str, nargs="+", default=["localhost"]
+    )
+    parser.add_argument(
+        "--decoder-ports", "--decoder-port", type=int, nargs="+", default=[8200]
+    )
 
     args = parser.parse_args()
 
     # Validate and pair hosts with ports
     if len(args.prefiller_hosts) != len(args.prefiller_ports):
         raise ValueError(
-            "Number of prefiller hosts must match number of prefiller ports")
+            "Number of prefiller hosts must match number of prefiller ports"
+        )
 
     if len(args.decoder_hosts) != len(args.decoder_ports):
-        raise ValueError(
-            "Number of decoder hosts must match number of decoder ports")
+        raise ValueError("Number of decoder hosts must match number of decoder ports")
 
     # Create tuples of (host, port) for each service type
-    args.prefiller_instances = list(
-        zip(args.prefiller_hosts, args.prefiller_ports))
+    args.prefiller_instances = list(zip(args.prefiller_hosts, args.prefiller_ports))
     args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports))
 
     return args
@@ -136,29 +128,30 @@ def get_next_client(app, service_type: str):
     Returns:
         The next client to use
     """
-    if service_type == 'prefill':
+    if service_type == "prefill":
         client_idx = next(app.state.prefill_iterator)
         return app.state.prefill_clients[client_idx]
-    elif service_type == 'decode':
+    elif service_type == "decode":
         client_idx = next(app.state.decode_iterator)
         return app.state.decode_clients[client_idx]
     else:
         raise ValueError(f"Unknown service type: {service_type}")
 
 
-async def send_request_to_service(client_info: dict, endpoint: str,
-                                  req_data: dict, request_id: str):
+async def send_request_to_service(
+    client_info: dict, endpoint: str, req_data: dict, request_id: str
+):
     """
     Send a request to a service using a client from the pool.
     """
     req_data = req_data.copy()
-    req_data['kv_transfer_params'] = {
+    req_data["kv_transfer_params"] = {
         "do_remote_decode": True,
         "do_remote_prefill": False,
         "remote_engine_id": None,
         "remote_block_ids": None,
         "remote_host": None,
-        "remote_port": None
+        "remote_port": None,
     }
     req_data["stream"] = False
     req_data["max_tokens"] = 1
@@ -168,31 +161,31 @@ async def send_request_to_service(client_info: dict, endpoint: str,
         del req_data["stream_options"]
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        "X-Request-Id": request_id
+        "X-Request-Id": request_id,
     }
 
-    response = await client_info['client'].post(endpoint,
-                                                json=req_data,
-                                                headers=headers)
+    response = await client_info["client"].post(
+        endpoint, json=req_data, headers=headers
+    )
     response.raise_for_status()
 
     return response
 
 
-async def stream_service_response(client_info: dict, endpoint: str,
-                                  req_data: dict, request_id: str):
+async def stream_service_response(
+    client_info: dict, endpoint: str, req_data: dict, request_id: str
+):
     """
     Asynchronously stream response from a service using a client from the pool.
     """
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        "X-Request-Id": request_id
+        "X-Request-Id": request_id,
     }
 
-    async with client_info['client'].stream("POST",
-                                            endpoint,
-                                            json=req_data,
-                                            headers=headers) as response:
+    async with client_info["client"].stream(
+        "POST", endpoint, json=req_data, headers=headers
+    ) as response:
         response.raise_for_status()
         async for chunk in response.aiter_bytes():
             yield chunk
@@ -204,40 +197,39 @@ async def _handle_completions(api: str, request: Request):
         request_id = str(uuid.uuid4())
 
         # Get the next prefill client in round-robin fashion
-        prefill_client_info = get_next_client(request.app, 'prefill')
+        prefill_client_info = get_next_client(request.app, "prefill")
 
         # Send request to prefill service
-        response = await send_request_to_service(prefill_client_info, api,
-                                                 req_data, request_id)
+        response = await send_request_to_service(
+            prefill_client_info, api, req_data, request_id
+        )
 
         # Extract the needed fields
         response_json = response.json()
-        kv_transfer_params = response_json.get('kv_transfer_params', {})
+        kv_transfer_params = response_json.get("kv_transfer_params", {})
         if kv_transfer_params:
             req_data["kv_transfer_params"] = kv_transfer_params
 
         # Get the next decode client in round-robin fashion
-        decode_client_info = get_next_client(request.app, 'decode')
+        decode_client_info = get_next_client(request.app, "decode")
 
         logger.debug("Using %s %s", prefill_client_info, decode_client_info)
 
         # Stream response from decode service
         async def generate_stream():
-            async for chunk in stream_service_response(decode_client_info,
-                                                       api,
-                                                       req_data,
-                                                       request_id=request_id):
+            async for chunk in stream_service_response(
+                decode_client_info, api, req_data, request_id=request_id
+            ):
                 yield chunk
 
-        return StreamingResponse(generate_stream(),
-                                 media_type="application/json")
+        return StreamingResponse(generate_stream(), media_type="application/json")
 
     except Exception as e:
         import sys
         import traceback
+
         exc_info = sys.exc_info()
-        print("Error occurred in disagg prefill proxy server"
-              f" - {api} endpoint")
+        print(f"Error occurred in disagg prefill proxy server - {api} endpoint")
         print(e)
         print("".join(traceback.format_exception(*exc_info)))
         raise
@@ -259,13 +251,14 @@ async def healthcheck():
     return {
         "status": "ok",
         "prefill_instances": len(app.state.prefill_clients),
-        "decode_instances": len(app.state.decode_clients)
+        "decode_instances": len(app.state.decode_clients),
     }
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     global global_args
     global_args = parse_args()
 
     import uvicorn
+
     uvicorn.run(app, host=global_args.host, port=global_args.port)
diff --git a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9308c81da06353a299c8633b1e0b01eb62d4c873
--- /dev/null
+++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Utility to run integration tests sequentially with varying TP configurations.
+SCRIPT="v1/kv_connector/nixl_integration/run_accuracy_test.sh"
+
+# Define test configurations
+configs=(
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
+  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
+  "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) 
+)
+
+run_tests() {
+  local label=$1
+  local extra_env=$2
+
+  echo "=== Running tests (${label}) ==="
+  for cfg in "${configs[@]}"; do
+    echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
+    # Use 'env' to safely set variables without eval
+    if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then
+      echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
+      exit 1
+    fi
+  done
+  echo "✅ All ${label} tests passed!"
+}
+
+# Run tests
+run_tests "default backend" ""
+
+# Check if FLASHINFER is set (non-empty)
+if [[ -n "${FLASHINFER:-}" ]]; then
+  echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
+  run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
+else
+  echo "FLASHINFER not set, skipping FLASHINFER runs."
+fi
diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
new file mode 100644
index 0000000000000000000000000000000000000000..f51001a6ec12a94bd7019396f55f33dafe2c5e5f
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
@@ -0,0 +1,275 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for backwards compatibility with external KV connector implementations.
+
+This test ensures that external connectors (loaded via kv_connector_module_path)
+implemented with the old signature continue to work:
+- Old signature: __init__(self, vllm_config, role)
+- New signature: __init__(self, vllm_config, role, kv_cache_config)
+"""
+
+from typing import TYPE_CHECKING
+from unittest.mock import patch
+
+import pytest
+
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorBase_V1,
+    KVConnectorRole,
+)
+from vllm.v1.core.sched.output import SchedulerOutput
+
+from .utils import create_scheduler, create_vllm_config
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.config import VllmConfig
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.request import Request
+
+
+class OldStyleTestConnector(KVConnectorBase_V1):
+    """
+    Test connector using the old signature with 2 required arguments.
+    This simulates external connectors that haven't been updated yet.
+    """
+
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+        # Old-style call to super().__init__ with only 2 arguments
+        super().__init__(vllm_config=vllm_config, role=role)
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int | None, bool]:
+        return 0, False
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        blocks: "KVCacheBlocks",
+        num_external_tokens: int,
+    ):
+        pass
+
+    def build_connector_meta(self, scheduler_output: SchedulerOutput):
+        return None
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        pass
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+class NewStyleTestConnector(KVConnectorBase_V1):
+    """
+    Test connector using the new signature with 3 required arguments.
+    """
+
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        # New-style call to super().__init__ with all 3 arguments
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int | None, bool]:
+        return 0, False
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        blocks: "KVCacheBlocks",
+        num_external_tokens: int,
+    ):
+        pass
+
+    def build_connector_meta(self, scheduler_output: SchedulerOutput):
+        return None
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        pass
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
+def test_external_old_signature_factory_instantiation(role):
+    """
+    Test that external connectors with old signature (2 required args) loaded
+    via kv_connector_module_path are correctly instantiated with backwards
+    compatibility support.
+    """
+    vllm_config = create_vllm_config()
+    vllm_config.kv_transfer_config.kv_connector = "OldStyleTestConnector"
+    vllm_config.kv_transfer_config.kv_connector_module_path = (
+        "tests.v1.kv_connector.unit.test_backwards_compatibility"
+    )
+
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config)
+
+    assert connector is not None
+    assert isinstance(connector, OldStyleTestConnector)
+    assert connector.role == role
+    assert connector._kv_cache_config is None
+
+
+@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
+def test_external_new_signature_factory_instantiation(role):
+    """
+    Test that external connectors with new signature (3 required args) loaded
+    via kv_connector_module_path are correctly instantiated.
+    """
+    vllm_config = create_vllm_config()
+    vllm_config.kv_transfer_config.kv_connector = "NewStyleTestConnector"
+    vllm_config.kv_transfer_config.kv_connector_module_path = (
+        "tests.v1.kv_connector.unit.test_backwards_compatibility"
+    )
+
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config)
+
+    assert connector is not None
+    assert isinstance(connector, NewStyleTestConnector)
+    assert connector.role == role
+    assert connector._kv_cache_config is not None
+    assert connector._kv_cache_config == kv_cache_config
+
+
+@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
+def test_old_signature_super_init(role):
+    """
+    Test that old-style connectors can call super().__init__() without
+    kv_cache_config parameter.
+    """
+    vllm_config = create_vllm_config()
+
+    connector = OldStyleTestConnector(vllm_config, role)
+
+    assert connector is not None
+    assert connector.role == role
+    assert connector._kv_cache_config is None
+
+
+def test_old_signature_super_init_with_kwargs():
+    """
+    Test that old-style connectors can call super().__init__() with keyword
+    arguments in different orders.
+    """
+    vllm_config = create_vllm_config()
+
+    # Test with vllm_config= and role= kwargs
+    connector1 = OldStyleTestConnector(
+        vllm_config=vllm_config, role=KVConnectorRole.SCHEDULER
+    )
+    assert connector1 is not None
+    assert connector1._kv_cache_config is None
+
+    # Test with role= and vllm_config= in reversed order
+    connector2 = OldStyleTestConnector(
+        role=KVConnectorRole.WORKER, vllm_config=vllm_config
+    )
+    assert connector2 is not None
+    assert connector2._kv_cache_config is None
+
+
+def test_internal_connector_uses_new_signature():
+    """
+    Test that internal connectors (registered in factory) always use the new
+    signature and get kv_cache_config.
+    """
+    from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
+        SharedStorageConnector,
+    )
+
+    vllm_config = create_vllm_config()
+    vllm_config.kv_transfer_config.kv_connector = "SharedStorageConnector"
+
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    connector = KVConnectorFactory.create_connector(
+        vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
+    )
+
+    assert connector is not None
+    assert isinstance(connector, SharedStorageConnector)
+    assert connector._kv_cache_config is not None
+    assert connector._kv_cache_config == kv_cache_config
+
+
+def test_signature_detection_with_mocking():
+    """
+    Test that the factory correctly applies compat_sig flag returned from
+    _get_connector_class_with_compat.
+    """
+    vllm_config = create_vllm_config()
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    # Mock _get_connector_class_with_compat to return old-style connector
+    with patch.object(
+        KVConnectorFactory,
+        "_get_connector_class_with_compat",
+        return_value=(OldStyleTestConnector, True),
+    ):
+        old_connector = KVConnectorFactory.create_connector(
+            vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
+        )
+        assert old_connector is not None
+        assert isinstance(old_connector, OldStyleTestConnector)
+        assert old_connector._kv_cache_config is None
+
+    # Mock _get_connector_class_with_compat to return new-style connector
+    with patch.object(
+        KVConnectorFactory,
+        "_get_connector_class_with_compat",
+        return_value=(NewStyleTestConnector, False),
+    ):
+        new_connector = KVConnectorFactory.create_connector(
+            vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
+        )
+        assert new_connector is not None
+        assert isinstance(new_connector, NewStyleTestConnector)
+        assert new_connector._kv_cache_config is not None
+        assert new_connector._kv_cache_config == kv_cache_config
diff --git a/tests/v1/kv_connector/unit/test_config.py b/tests/v1/kv_connector/unit/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf86f3d5c4ac09f8072efa1ac50c0e8ecc92534
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_config.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Tests for KV cache offloading configuration."""
+
+import pytest
+
+from vllm.config import CacheConfig, KVTransferConfig, ParallelConfig, VllmConfig
+
+pytestmark = pytest.mark.cpu_test
+
+
+@pytest.mark.parametrize(
+    "kv_offloading_backend,kv_offloading_size,tp,pp,expected_backend,expected_bytes",
+    [
+        ("native", 4.0, 1, 1, "OffloadingConnector", 4.0 * (1 << 30)),
+        # bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
+        ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30) / 4),
+        ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
+        # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
+        ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
+        (None, None, 1, 1, None, None),
+    ],
+)
+def test_kv_connector(
+    kv_offloading_backend, kv_offloading_size, tp, pp, expected_backend, expected_bytes
+):
+    kv_transfer_config = (
+        KVTransferConfig(kv_connector_extra_config={"existing_key": "existing_value"})
+        if expected_backend is not None
+        else None
+    )
+
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_offloading_backend=kv_offloading_backend,
+            kv_offloading_size=kv_offloading_size,
+        ),
+        kv_transfer_config=kv_transfer_config,
+        parallel_config=ParallelConfig(
+            tensor_parallel_size=tp, pipeline_parallel_size=pp
+        ),
+    )
+
+    # No KV transfer config expected
+    if expected_backend is None:
+        assert vllm_config.kv_transfer_config is expected_backend
+        return
+
+    kv_transfer_config = vllm_config.kv_transfer_config
+    kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
+
+    assert kv_transfer_config.kv_connector == expected_backend
+    assert kv_transfer_config.kv_role == "kv_both"
+
+    if kv_offloading_backend == "native":
+        assert kv_connector_extra_config["kv_bytes_per_rank"] == expected_bytes
+        assert kv_connector_extra_config["num_cpu_blocks"] == 0
+        # Existing config should be preserved
+        assert kv_connector_extra_config["existing_key"] == "existing_value"
+    elif kv_offloading_backend == "lmcache":
+        assert kv_connector_extra_config["lmcache.local_cpu"] is True
+        assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
+        # Existing config should be replaced
+        assert "existing_key" not in kv_connector_extra_config
diff --git a/tests/v1/kv_connector/unit/test_decode_bench_connector.py b/tests/v1/kv_connector/unit/test_decode_bench_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..24802317a2bbc00733e18af547239cc48b5f244c
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_decode_bench_connector.py
@@ -0,0 +1,415 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for DecodeBenchConnector.
+
+Tests the functionality of the DecodeBenchConnector which fills KV cache
+with dummy values for decode performance benchmarking.
+"""
+
+import pytest
+import torch
+
+from vllm import SamplingParams
+from vllm.config import KVTransferConfig
+from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
+
+# ruff: noqa: E501
+from vllm.distributed.kv_transfer.kv_connector.v1.decode_bench_connector import (
+    DecodeBenchConnector,
+    DecodeBenchConnectorMetadata,
+)
+from vllm.forward_context import ForwardContext
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
+from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.request import Request
+
+from .utils import (
+    EOS_TOKEN_ID,
+    create_model_runner_output,
+    create_scheduler,
+    create_vllm_config,
+)
+
+
+class DecodeBenchTestRunner:
+    """Test runner for DecodeBenchConnector."""
+
+    def __init__(self, block_size: int, num_gpu_blocks: int):
+        self.block_size = block_size
+        self.num_gpu_blocks = num_gpu_blocks
+
+        self.req_id = -1
+
+        # Create vllm config with DecodeBenchConnector
+        vllm_config = create_vllm_config(
+            block_size=block_size, max_num_batched_tokens=1000
+        )
+        vllm_config.kv_transfer_config = KVTransferConfig(
+            kv_connector="DecodeBenchConnector",
+            kv_role="kv_both",
+        )
+
+        self.vllm_config = vllm_config
+        self.scheduler: Scheduler = create_scheduler(
+            vllm_config, num_blocks=num_gpu_blocks
+        )
+
+        # Create worker-side connector
+        self.worker_connector = DecodeBenchConnector(
+            vllm_config, KVConnectorRole.WORKER
+        )
+
+        # Create dummy KV caches for testing
+        # Shape: [num_blocks, 2, num_heads, block_size, head_dim]
+        # Using simplified shape for testing
+        num_heads = 4
+        head_dim = 64
+        self.kv_caches = {
+            f"layer_{i}": torch.zeros(
+                num_gpu_blocks, 2, num_heads, block_size, head_dim
+            )
+            for i in range(2)  # 2 layers for testing
+        }
+
+        # Register KV caches with worker connector
+        self.worker_connector.register_kv_caches(self.kv_caches)
+
+        # Extract scheduler-side connector
+        scheduler_connector = self.scheduler.connector
+        assert scheduler_connector is not None
+        assert isinstance(scheduler_connector, DecodeBenchConnector)
+        self.scheduler_connector: DecodeBenchConnector = scheduler_connector
+
+        init_none_hash(sha256)
+        self._block_hasher = get_request_block_hasher(block_size, sha256)
+
+        self._dummy_ctx: ForwardContext = ForwardContext(
+            no_compile_layers={}, attn_metadata={}, virtual_engine=0
+        )
+
+    def new_request(self, token_ids: list[int]) -> Request:
+        """Create a new request with given token IDs."""
+        self.req_id += 1
+
+        req = Request(
+            request_id=str(self.req_id),
+            prompt_token_ids=token_ids,
+            sampling_params=SamplingParams(max_tokens=100),
+            pooling_params=None,
+            eos_token_id=EOS_TOKEN_ID,
+            block_hasher=self._block_hasher,
+        )
+
+        self.scheduler.add_request(req)
+        return req
+
+    def run_single_step(self, token_id: int = 0):
+        """Run a single scheduler + worker step."""
+        scheduler_output = self.scheduler.schedule()
+
+        # Get connector metadata
+        kv_connector_metadata = scheduler_output.kv_connector_metadata
+        assert kv_connector_metadata is not None
+        assert isinstance(kv_connector_metadata, DecodeBenchConnectorMetadata)
+
+        # Bind metadata and load KV
+        self.worker_connector.bind_connector_metadata(kv_connector_metadata)
+        self.worker_connector.start_load_kv(self._dummy_ctx)
+
+        if scheduler_output.total_num_scheduled_tokens > 0:
+            self.worker_connector.wait_for_save()
+
+        self.worker_connector.clear_connector_metadata()
+
+        # Create model runner output
+        model_runner_output = create_model_runner_output(
+            reqs=self.scheduler.running,
+            token_id=token_id,
+        )
+
+        self.scheduler.update_from_output(scheduler_output, model_runner_output)
+
+        return scheduler_output, kv_connector_metadata
+
+
+def test_decode_bench_connector_basic():
+    """Test basic functionality of DecodeBenchConnector."""
+    block_size = 16
+    num_gpu_blocks = 100
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create a request with multiple blocks worth of tokens
+    num_tokens = block_size * 3  # 3 blocks
+    token_ids = [1] * num_tokens
+
+    req = runner.new_request(token_ids)
+
+    # Run first step - should fill KV cache with dummy values
+    scheduler_output, metadata = runner.run_single_step()
+
+    # Check that get_num_new_matched_tokens returned correct value
+    # Should be num_tokens - 1 (all except the last token for decode)
+    expected_fill_tokens = num_tokens - 1
+
+    # Check metadata has the request to fill
+    assert len(metadata.reqs_to_fill) == 1
+    assert req.request_id in metadata.reqs_to_fill
+
+    block_ids_per_group, num_tokens_to_fill = metadata.reqs_to_fill[req.request_id]
+    assert num_tokens_to_fill == expected_fill_tokens
+
+    # For standard attention, there's only one group
+    assert len(block_ids_per_group) == 1
+    block_ids = block_ids_per_group[0]
+
+    # Calculate expected number of blocks
+    expected_num_blocks = (expected_fill_tokens + block_size - 1) // block_size
+    assert len(block_ids) == expected_num_blocks
+
+    # Verify KV caches were filled with constant value
+    for layer_name, kv_cache in runner.kv_caches.items():
+        for block_id in block_ids:
+            # Check that the block was filled
+            block_data = kv_cache[block_id]
+            # Should be filled with constant value 0.015
+            assert torch.allclose(block_data, torch.tensor(0.015))
+
+
+def test_decode_bench_connector_no_refill():
+    """Test that DecodeBenchConnector only fills once per request."""
+    block_size = 16
+    num_gpu_blocks = 100
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create a request
+    num_tokens = block_size * 2
+    token_ids = [1] * num_tokens
+
+    runner.new_request(token_ids)
+
+    # Run first step - should fill KV cache
+    _, metadata1 = runner.run_single_step()
+    assert len(metadata1.reqs_to_fill) == 1
+
+    # Run second step - should NOT fill again (already filled)
+    _, metadata2 = runner.run_single_step()
+    assert len(metadata2.reqs_to_fill) == 0
+
+
+def test_decode_bench_connector_single_token():
+    """Test DecodeBenchConnector with single token request."""
+    block_size = 16
+    num_gpu_blocks = 100
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create a request with just 1 token
+    # Should not fill anything (need at least 2 tokens: 1 to fill, 1 to decode)
+    token_ids = [1]
+
+    runner.new_request(token_ids)
+
+    # Run step - should NOT fill KV cache
+    _, metadata = runner.run_single_step()
+    assert len(metadata.reqs_to_fill) == 0
+
+
+def test_decode_bench_connector_two_tokens():
+    """Test DecodeBenchConnector with two token request."""
+    block_size = 16
+    num_gpu_blocks = 100
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create a request with 2 tokens
+    # Should fill 1 token (first token), decode the second
+    token_ids = [1, 2]
+
+    req = runner.new_request(token_ids)
+
+    # Run step
+    _, metadata = runner.run_single_step()
+
+    assert len(metadata.reqs_to_fill) == 1
+    assert req.request_id in metadata.reqs_to_fill
+
+    block_ids_per_group, num_tokens_to_fill = metadata.reqs_to_fill[req.request_id]
+    assert num_tokens_to_fill == 1
+    # For standard attention, there's only one group
+    assert len(block_ids_per_group) == 1
+    assert len(block_ids_per_group[0]) == 1  # 1 token needs 1 block
+
+
+def test_decode_bench_connector_large_context():
+    """Test DecodeBenchConnector with large context size."""
+    block_size = 16
+    num_gpu_blocks = 1000
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create a request with many blocks
+    num_blocks = 20
+    num_tokens = block_size * num_blocks
+    token_ids = list(range(num_tokens))
+
+    req = runner.new_request(token_ids)
+
+    # Run step
+    _, metadata = runner.run_single_step()
+
+    assert len(metadata.reqs_to_fill) == 1
+    assert req.request_id in metadata.reqs_to_fill
+
+    block_ids_per_group, num_tokens_to_fill = metadata.reqs_to_fill[req.request_id]
+
+    # Should fill all tokens except the last one
+    expected_fill_tokens = num_tokens - 1
+    assert num_tokens_to_fill == expected_fill_tokens
+
+    # For standard attention, there's only one group
+    assert len(block_ids_per_group) == 1
+    block_ids = block_ids_per_group[0]
+
+    # Calculate expected number of blocks
+    expected_num_blocks = (expected_fill_tokens + block_size - 1) // block_size
+    assert len(block_ids) == expected_num_blocks
+
+    # Verify blocks were filled
+    for layer_name, kv_cache in runner.kv_caches.items():
+        for block_id in block_ids:
+            block_data = kv_cache[block_id]
+            assert torch.allclose(block_data, torch.tensor(0.015))
+
+
+def test_decode_bench_connector_multiple_requests():
+    """Test DecodeBenchConnector with multiple sequential requests."""
+    block_size = 16
+    num_gpu_blocks = 100
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # First request
+    req1 = runner.new_request([1] * (block_size * 2))
+    _, metadata1 = runner.run_single_step()
+
+    assert len(metadata1.reqs_to_fill) == 1
+    assert req1.request_id in metadata1.reqs_to_fill
+
+    # Complete first request
+    while runner.scheduler.running:
+        runner.run_single_step()
+
+    # Add EOS to finish
+    scheduler_output = runner.scheduler.schedule()
+    model_runner_output = create_model_runner_output(
+        reqs=runner.scheduler.running,
+        token_id=EOS_TOKEN_ID,
+        use_eos=True,
+    )
+    runner.scheduler.update_from_output(scheduler_output, model_runner_output)
+
+    # Second request - should also get filled
+    req2 = runner.new_request([2] * (block_size * 3))
+    _, metadata2 = runner.run_single_step()
+
+    assert len(metadata2.reqs_to_fill) == 1
+    assert req2.request_id in metadata2.reqs_to_fill
+
+    # Different request should have different metadata
+    _, num_tokens1 = metadata1.reqs_to_fill[req1.request_id]
+    _, num_tokens2 = metadata2.reqs_to_fill[req2.request_id]
+
+    assert num_tokens1 == block_size * 2 - 1
+    assert num_tokens2 == block_size * 3 - 1
+
+
+def test_decode_bench_connector_partial_block():
+    """Test DecodeBenchConnector with partial block filling."""
+    block_size = 16
+    num_gpu_blocks = 100
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create a request that doesn't align to block boundaries
+    # e.g., 2.5 blocks worth of tokens
+    num_tokens = block_size * 2 + block_size // 2
+    token_ids = [1] * num_tokens
+
+    req = runner.new_request(token_ids)
+
+    # Run step
+    _, metadata = runner.run_single_step()
+
+    assert len(metadata.reqs_to_fill) == 1
+    assert req.request_id in metadata.reqs_to_fill
+
+    block_ids_per_group, num_tokens_to_fill = metadata.reqs_to_fill[req.request_id]
+
+    # Should fill all tokens except the last one
+    expected_fill_tokens = num_tokens - 1
+    assert num_tokens_to_fill == expected_fill_tokens
+
+    # For standard attention, there's only one group
+    assert len(block_ids_per_group) == 1
+    block_ids = block_ids_per_group[0]
+
+    # Should allocate 3 blocks to hold the partial data
+    expected_num_blocks = 3
+    assert len(block_ids) == expected_num_blocks
+
+
+def test_decode_bench_connector_concurrent_requests():
+    """Test DecodeBenchConnector with multiple concurrent requests in the same batch."""
+    block_size = 16
+    num_gpu_blocks = 1000
+
+    runner = DecodeBenchTestRunner(block_size=block_size, num_gpu_blocks=num_gpu_blocks)
+
+    # Create multiple requests that will be batched together
+    req1 = runner.new_request([1] * (block_size * 2))
+    req2 = runner.new_request([2] * (block_size * 3))
+    req3 = runner.new_request([3] * (block_size * 1))
+
+    # Run first step - all requests should be filled concurrently
+    _, metadata = runner.run_single_step()
+
+    # All three requests should be in the metadata
+    assert len(metadata.reqs_to_fill) == 3
+    assert req1.request_id in metadata.reqs_to_fill
+    assert req2.request_id in metadata.reqs_to_fill
+    assert req3.request_id in metadata.reqs_to_fill
+
+    # Verify each request has correct fill info
+    block_ids_per_group1, num_tokens1 = metadata.reqs_to_fill[req1.request_id]
+    block_ids_per_group2, num_tokens2 = metadata.reqs_to_fill[req2.request_id]
+    block_ids_per_group3, num_tokens3 = metadata.reqs_to_fill[req3.request_id]
+
+    # Verify token counts (all tokens except last one)
+    assert num_tokens1 == block_size * 2 - 1
+    assert num_tokens2 == block_size * 3 - 1
+    assert num_tokens3 == block_size * 1 - 1
+
+    # Verify block counts for each request
+    assert len(block_ids_per_group1[0]) == 2  # 2 blocks
+    assert len(block_ids_per_group2[0]) == 3  # 3 blocks
+    assert len(block_ids_per_group3[0]) == 1  # 1 block
+
+    # Verify all blocks are filled in KV cache
+    for req_id, (block_ids_per_group, _) in metadata.reqs_to_fill.items():
+        block_ids = block_ids_per_group[0]
+        for layer_name, kv_cache in runner.kv_caches.items():
+            for block_id in block_ids:
+                block_data = kv_cache[block_id]
+                assert torch.allclose(block_data, torch.tensor(0.015))
+
+    # Run second step - should NOT fill again (already filled)
+    _, metadata2 = runner.run_single_step()
+    assert len(metadata2.reqs_to_fill) == 0
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
index fe6296cf12ea0e42e34754c063b139177a04da7b..d0a6eeae6286dde83a46ed2a433a23fd71f19dd6 100644
--- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
+++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa: E501
-    SharedStorageConnectorMetadata)
+    SharedStorageConnectorMetadata,
+)
 from vllm.distributed.kv_transfer.kv_transfer_state import (
-    ensure_kv_transfer_initialized, get_kv_transfer_group)
+    ensure_kv_transfer_initialized,
+    get_kv_transfer_group,
+)
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
-from vllm.v1.worker.kv_connector_model_runner_mixin import (
-    KVConnectorModelRunnerMixin)
+from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 
 # Importing utils registers TestSharedStorageConnector with the factory
 from .utils import create_vllm_config
@@ -24,8 +26,6 @@ def _make_empty_scheduler_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
         kv_connector_metadata=SharedStorageConnectorMetadata(),
     )
 
@@ -34,7 +34,7 @@ def test_kv_connector_mixin_clears_metadata():
     vllm_config = create_vllm_config()
     vllm_config.kv_transfer_config.kv_connector = "TestSharedStorageConnector"
     vllm_config.kv_transfer_config.kv_role = "kv_both"
-    vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = ("unit")
+    vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = "unit"
 
     # Initialize the global connector instance
     ensure_kv_transfer_initialized(vllm_config)
@@ -46,7 +46,8 @@ def test_kv_connector_mixin_clears_metadata():
 
         # Invoke the no-forward path which uses the mixin context manager
         KVConnectorModelRunnerMixin.kv_connector_no_forward(
-            scheduler_output, vllm_config)
+            scheduler_output, vllm_config
+        )
 
         # Verify clear_connector_metadata was called on the connector
         connector = get_kv_transfer_group()
diff --git a/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py b/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b7b2226e758af4cf860ab4f5d3fbd00fa6d69ac
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
@@ -0,0 +1,335 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+from unittest.mock import Mock
+
+import pytest
+
+from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.request import Request, RequestStatus
+
+from .utils import (
+    create_model_runner_output,
+    create_request,
+    create_scheduler,
+    create_vllm_config,
+)
+
+
+def _make_get_num_new_matched_tokens(
+    req_num_new_matched_tokens: dict[str, int],
+    async_load,
+) -> Callable[[Request, int], tuple[int, bool]]:
+    def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
+        value = req_num_new_matched_tokens.get(request.request_id, 0)
+        return value, async_load
+
+    return get_num_new_matched_tokens
+
+
+@pytest.fixture
+def scheduler():
+    vllm_config = create_vllm_config()
+    return create_scheduler(vllm_config)
+
+
+@pytest.mark.parametrize(
+    "num_prompt_blocks,num_external_computed_blocks,invalid_block_idxs",
+    [
+        (100, 99, {0, 98}),
+        (100, 99, {50, 98}),
+        (100, 99, {98}),
+    ],
+)
+def test_async_load_failure(
+    scheduler: Scheduler,
+    num_prompt_blocks: int,
+    num_external_computed_blocks: int,
+    invalid_block_idxs: set[int],
+):
+    assert num_prompt_blocks >= num_external_computed_blocks
+
+    num_prompt_tokens = num_prompt_blocks * scheduler.block_size
+    num_external_computed_tokens = num_external_computed_blocks * scheduler.block_size
+
+    request1 = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request1)
+    request2 = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request2)
+    request3 = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request3)
+
+    # Mock KV connector method.
+    # req_id -> num_external_computed_tokens
+    req_num_new_matched_tokens = {
+        request1.request_id: num_external_computed_tokens,
+        request2.request_id: num_external_computed_tokens,
+        request3.request_id: num_external_computed_tokens,
+    }
+
+    scheduler.connector = Mock()
+    scheduler.connector.get_num_new_matched_tokens.side_effect = (
+        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, async_load=True)
+    )
+    scheduler.connector.take_events.return_value = ()
+
+    scheduler_output = scheduler.schedule()
+
+    assert len(scheduler.waiting) == 3
+    for request in scheduler.waiting:
+        assert request.num_computed_tokens == 0
+        assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
+
+    # Simulate a failure in loading some of request2 blocks.
+    (req2_block_ids,) = scheduler.kv_cache_manager.get_block_ids(request2.request_id)
+    invalid_block_ids = {req2_block_ids[i] for i in invalid_block_idxs}
+    model_runner_output = create_model_runner_output(
+        reqs=[],
+        finished_recving={request1.request_id, request3.request_id},
+        invalid_block_ids=invalid_block_ids,
+        use_eos=True,
+    )
+
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+
+    min_invalid_block_idx = min(invalid_block_idxs)
+
+    assert len(scheduler.waiting) == 3
+    for request in scheduler.waiting:
+        if request.request_id == request2.request_id:
+            assert request.num_computed_tokens == (
+                min_invalid_block_idx * scheduler.block_size
+            )
+        else:
+            assert request.num_computed_tokens == 0
+        assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert scheduler.failed_recving_kv_req_ids == {request2.request_id}
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
+
+
+@pytest.mark.parametrize(
+    "num_prompt_blocks,num_external_computed_blocks,invalid_block_idxs",
+    [
+        (100, 99, {0, 98}),
+        (100, 99, {50, 98}),
+        (100, 99, {98}),
+    ],
+)
+def test_sync_load_failure(
+    scheduler: Scheduler,
+    num_prompt_blocks: int,
+    num_external_computed_blocks: int,
+    invalid_block_idxs: set[int],
+):
+    assert num_prompt_blocks >= num_external_computed_blocks
+
+    num_prompt_tokens = num_prompt_blocks * scheduler.block_size
+    num_external_computed_tokens = num_external_computed_blocks * scheduler.block_size
+
+    request1 = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request1)
+    request2 = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request2)
+    request3 = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request3)
+
+    # Mock KV connector method.
+    # req_id -> num_external_computed_tokens
+    req_num_new_matched_tokens = {
+        request1.request_id: num_external_computed_tokens,
+        request2.request_id: num_external_computed_tokens,
+        request3.request_id: num_external_computed_tokens,
+    }
+
+    scheduler.connector = Mock()
+    scheduler.connector.get_num_new_matched_tokens.side_effect = (
+        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, async_load=False)
+    )
+    scheduler.connector.request_finished.return_value = (False, None)
+    scheduler.connector.take_events.return_value = ()
+
+    scheduler_output = scheduler.schedule()
+
+    # req_id -> num_computed_tokens
+    expected_computed_tokens = {
+        request1.request_id: num_external_computed_tokens,
+        request2.request_id: num_external_computed_tokens,
+        request3.request_id: num_external_computed_tokens,
+    }
+
+    assert len(scheduler.running) == 3
+    assert len(scheduler_output.scheduled_new_reqs) == 3
+    for request in scheduler_output.scheduled_new_reqs:
+        assert request.num_computed_tokens == expected_computed_tokens[request.req_id]
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
+
+    # Simulate a failure in loading some of request2 blocks.
+    req2_block_ids = scheduler_output.scheduled_new_reqs[1].block_ids[0]
+    invalid_block_ids = {req2_block_ids[i] for i in invalid_block_idxs}
+    model_runner_output = create_model_runner_output(
+        [request1, request2, request3],
+        invalid_block_ids=invalid_block_ids,
+        use_eos=True,
+    )
+
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+
+    assert len(scheduler.running) == 1
+    assert scheduler.running[0].request_id == request2.request_id
+    assert scheduler.running[0].num_computed_tokens == (
+        min(invalid_block_idxs) * scheduler.block_size
+    )
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
+    assert scheduler.connector.request_finished.call_count == 2
+
+
+@pytest.mark.parametrize(
+    "num_prompt_blocks,"
+    "num_external_computed_blocks,"
+    "num_common_prefix_blocks,"
+    "invalid_block_idxs",
+    [
+        (100, 99, 50, {0, 49}),
+        (100, 99, 50, {25, 49}),
+        (100, 99, 50, {49}),
+    ],
+)
+def test_sync_load_failure_with_shared_blocks(
+    scheduler: Scheduler,
+    num_prompt_blocks: int,
+    num_external_computed_blocks: int,
+    num_common_prefix_blocks: int,
+    invalid_block_idxs: set[int],
+):
+    assert num_prompt_blocks >= num_external_computed_blocks >= num_common_prefix_blocks
+
+    num_prompt_tokens = num_prompt_blocks * scheduler.block_size
+    num_external_computed_tokens = num_external_computed_blocks * scheduler.block_size
+    common_prefix_len = num_common_prefix_blocks * scheduler.block_size
+
+    request1 = create_request(
+        num_tokens=num_prompt_tokens, common_prefix_len=common_prefix_len
+    )
+    scheduler.add_request(request=request1)
+    request2 = create_request(
+        num_tokens=num_prompt_tokens, common_prefix_len=common_prefix_len
+    )
+    scheduler.add_request(request=request2)
+
+    # Mock KV connector method.
+    # req_id -> num_external_computed_tokens
+    req_num_new_matched_tokens = {
+        request1.request_id: num_external_computed_tokens,
+    }
+
+    scheduler.connector = Mock()
+    scheduler.connector.get_num_new_matched_tokens.side_effect = (
+        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, async_load=False)
+    )
+    scheduler.connector.take_events.return_value = ()
+
+    scheduler_output = scheduler.schedule()
+
+    # req_id -> num_computed_tokens
+    expected_computed_tokens = {
+        request1.request_id: num_external_computed_tokens,
+        request2.request_id: common_prefix_len,
+    }
+
+    assert len(scheduler.running) == 2
+    assert len(scheduler_output.scheduled_new_reqs) == 2
+    for request in scheduler_output.scheduled_new_reqs:
+        assert request.num_computed_tokens == expected_computed_tokens[request.req_id]
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 2
+
+    # Simulate a failure in loading some of the shared blocks.
+    req1_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
+    invalid_block_ids = {req1_block_ids[i] for i in invalid_block_idxs}
+    model_runner_output = create_model_runner_output(
+        [request1, request2], invalid_block_ids=invalid_block_ids, use_eos=True
+    )
+
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+
+    # req_id -> num_computed_tokens
+    # all the common prefix blocks will be computed by request1
+    expected_computed_tokens = {
+        request1.request_id: min(invalid_block_idxs) * scheduler.block_size,
+        request2.request_id: common_prefix_len,
+    }
+
+    assert len(scheduler.running) == 2
+    for request in scheduler.running:
+        assert (
+            request.num_computed_tokens == expected_computed_tokens[request.request_id]
+        )
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 2
+
+
+@pytest.mark.parametrize(
+    "num_prompt_blocks,num_external_computed_blocks,invalid_block_idxs",
+    [
+        (100, 99, {0, 50, 98}),
+        (100, 99, {98, 50, 0}),
+    ],
+)
+def test_async_progressive_load_failure(
+    scheduler: Scheduler,
+    num_prompt_blocks: int,
+    num_external_computed_blocks: int,
+    invalid_block_idxs: set[int],
+):
+    assert num_prompt_blocks >= num_external_computed_blocks
+
+    num_prompt_tokens = num_prompt_blocks * scheduler.block_size
+    num_external_computed_tokens = num_external_computed_blocks * scheduler.block_size
+
+    request = create_request(num_tokens=num_prompt_tokens)
+    scheduler.add_request(request=request)
+
+    # Mock KV connector method.
+    # req_id -> num_external_computed_tokens
+    req_num_new_matched_tokens = {
+        request.request_id: num_external_computed_tokens,
+    }
+
+    scheduler.connector = Mock()
+    scheduler.connector.get_num_new_matched_tokens.side_effect = (
+        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, async_load=True)
+    )
+    scheduler.connector.take_events.return_value = ()
+
+    scheduler_output = scheduler.schedule()
+
+    assert len(scheduler.waiting) == 1
+    assert scheduler.waiting.peek_request().request_id == request.request_id
+    assert request.num_computed_tokens == 0
+    assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert scheduler.connector.get_num_new_matched_tokens.call_count == 1
+
+    min_invalid_block_idx = max(invalid_block_idxs) + 1
+    # Simulate failures when progressively loading request blocks.
+    for invalid_block_idx in invalid_block_idxs:
+        (req_block_ids,) = scheduler.kv_cache_manager.get_block_ids(request.request_id)
+        invalid_block_ids = {req_block_ids[invalid_block_idx]}
+        model_runner_output = create_model_runner_output(
+            reqs=[],
+            finished_recving=set(),
+            invalid_block_ids=invalid_block_ids,
+            use_eos=True,
+        )
+
+        scheduler.update_from_output(scheduler_output, model_runner_output)
+
+        min_invalid_block_idx = min(min_invalid_block_idx, invalid_block_idx)
+
+        assert len(scheduler.waiting) == 1
+        assert scheduler.waiting.peek_request().request_id == request.request_id
+        assert request.num_computed_tokens == (
+            min_invalid_block_idx * scheduler.block_size
+        )
+        assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+        assert scheduler.failed_recving_kv_req_ids == {request.request_id}
+        assert scheduler.connector.get_num_new_matched_tokens.call_count == 1
diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py
new file mode 100644
index 0000000000000000000000000000000000000000..11507d7cd4e7bfbf5251ec06e919b5d358a86a40
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py
@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# NOTE: if your PR has broken one of the tests here (sorry),
+# kindly patch the corresponding integration in
+# /vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+# or reach out to @aposataC for assistance
+
+# Assumption vs. Correctness Tests:
+# these unit tests do *not* test correctness of LMCache-side or vLLM-side logic
+# it is to ensure that assumptions LMCache makes about vLLM's interface are stable
+def assumes(obj, attr, is_callable=False, is_instance_of=None):
+    import inspect
+    from dataclasses import is_dataclass
+
+    assumption_msg = (
+        f"LMCache connector currently assumes that {obj} has a(n) {attr} attribute"
+    )
+    if hasattr(obj, attr):
+        attr_value = getattr(obj, attr)
+    elif is_dataclass(obj) and attr in getattr(obj, "__dataclass_fields__", {}):
+        field = obj.__dataclass_fields__[attr]
+        field_type = field.type
+        origin = getattr(field_type, "__origin__", None)
+        if origin is not None:
+            field_type = origin
+        attr_value = field_type
+    else:
+        raise AssertionError(assumption_msg)
+    if is_callable:
+        assumption_msg += f" and that {obj}.{attr} is a callable"
+        assert callable(attr_value), assumption_msg
+    if is_instance_of:
+        assumption_msg += f" and that {obj}.{attr} is an instance of {is_instance_of}"
+        if isinstance(attr_value, property):
+            fget = attr_value.fget
+            assert fget is not None, f"Property {obj}.{attr} has no fget"
+            sig = inspect.signature(fget)
+            ret_anno = sig.return_annotation
+            assert ret_anno is not inspect._empty, (
+                f"Property {obj}.{attr} has no return annotation"
+            )
+            assert ret_anno == is_instance_of, assumption_msg
+        else:
+            if isinstance(attr_value, type):
+                assert attr_value is is_instance_of, assumption_msg
+            else:
+                assert isinstance(attr_value, is_instance_of), assumption_msg
+
+
+def test_multimodal_interface():
+    # protect against interface changes
+    from vllm.multimodal.inputs import PlaceholderRange
+
+    assumes(PlaceholderRange, "offset")
+    assumes(PlaceholderRange, "length")
+
+    # test a minimal case
+    import torch
+
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+        apply_mm_hashes_to_token_ids,
+    )
+
+    token_ids = torch.arange(10, dtype=torch.long)
+    mm_hashes = ["0000", "1111"]  # hex repr of 0 and 4369
+    mm_positions = [
+        PlaceholderRange(offset=0, length=4),
+        PlaceholderRange(offset=5, length=4),
+    ]
+    apply_mm_hashes_to_token_ids(token_ids, mm_hashes, mm_positions)
+    assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9]
+
+
+def test_config_interface():
+    # protect against interface changes
+    from vllm.config import VllmConfig
+    from vllm.config.cache import CacheConfig
+    from vllm.config.kv_transfer import KVTransferConfig
+    from vllm.config.model import ModelConfig
+    from vllm.config.parallel import ParallelConfig
+
+    assumes(VllmConfig, "model_config")
+    assumes(VllmConfig, "cache_config")
+    assumes(VllmConfig, "parallel_config")
+    assumes(VllmConfig, "kv_transfer_config")
+
+    assumes(KVTransferConfig, "kv_role")
+    assumes(KVTransferConfig, "kv_connector_extra_config")
+
+    assumes(ModelConfig, "use_mla", is_instance_of=bool)
+    assumes(ModelConfig, "dtype")
+    assumes(ModelConfig, "max_model_len")
+    assumes(ModelConfig, "get_vocab_size", is_callable=True)
+    assumes(ModelConfig, "get_num_attention_heads", is_callable=True)
+    assumes(ModelConfig, "get_num_kv_heads", is_callable=True)
+    assumes(ModelConfig, "get_head_size", is_callable=True)
+    assumes(ModelConfig, "get_num_layers", is_callable=True)
+    assumes(ModelConfig, "get_num_kv_heads", is_callable=True)
+    assumes(ModelConfig, "model")
+
+    assumes(ParallelConfig, "world_size")
+    assumes(ParallelConfig, "rank")
+    assumes(ParallelConfig, "tensor_parallel_size")
+    assumes(ParallelConfig, "pipeline_parallel_size")
+    assumes(ParallelConfig, "data_parallel_size_local")
+    assumes(ParallelConfig, "data_parallel_rank_local")
+
+    assumes(CacheConfig, "cache_dtype")
+    assumes(CacheConfig, "block_size")
+    assumes(CacheConfig, "gpu_memory_utilization")
+
+    # mla metadata minimal cases
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+        mla_enabled,
+    )
+
+    model_config = ModelConfig(model="deepseek-ai/DeepSeek-R1")
+    assert mla_enabled(model_config)
+    model_config = ModelConfig(model="Qwen/Qwen3-0.6B")
+    assert not mla_enabled(model_config)
+
+    # kv metadata minimal case
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype
+
+    model_config = ModelConfig(dtype="bfloat16")
+    parallel_config = ParallelConfig()
+    cache_config = CacheConfig(cache_dtype="bfloat16")
+    kv_dtype = get_kv_cache_torch_dtype(cache_config.cache_dtype, model_config.dtype)
+    use_mla = mla_enabled(model_config)
+    chunk_size = 256
+    num_layer = model_config.get_num_layers(parallel_config)
+    num_kv_head = model_config.get_num_kv_heads(parallel_config)
+    head_size = model_config.get_head_size()
+    kv_shape = (num_layer, 1 if use_mla else 2, chunk_size, num_kv_head, head_size)
+
+    # dummy lmcache metadata creation example
+    _ = (
+        model_config.model,
+        parallel_config.world_size,
+        parallel_config.rank,
+        "vllm",
+        kv_dtype,
+        kv_shape,
+        use_mla,
+    )
+
+
+def test_request_interface():
+    # protect against interface changes
+    from types import NoneType
+
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.request import Request
+
+    req = Request(
+        request_id="test_request",
+        prompt_token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        pooling_params=None,
+        eos_token_id=100,
+        lora_request=None,
+    )
+    assumes(req, "mm_features", is_instance_of=(list, NoneType))
+    assumes(req, "request_id")
+    assumes(req, "priority")
+    assumes(req, "prompt_token_ids")
+    assumes(req, "sampling_params")
+    assumes(req, "num_tokens")
+    assumes(req, "kv_transfer_params", is_instance_of=(dict, NoneType))
+
+    from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem
+
+    assumes(MultiModalFeatureSpec, "identifier")
+    assumes(MultiModalFeatureSpec, "mm_position")
+
+    # minimal case:
+    from vllm.multimodal.inputs import PlaceholderRange
+
+    request = Request(
+        request_id="test_request",
+        prompt_token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        pooling_params=None,
+        eos_token_id=100,
+        lora_request=None,
+        mm_features=[
+            MultiModalFeatureSpec(
+                modality="image",
+                identifier="0000",
+                data=MultiModalKwargsItem.dummy("dummy_m"),
+                mm_position=PlaceholderRange(offset=0, length=10),
+            )
+        ],
+    )
+
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+        extract_mm_features,
+    )
+
+    mm_hashes, mm_positions = extract_mm_features(request)
+    assert isinstance(mm_hashes, list)
+    assert len(mm_hashes) == 1
+    assert isinstance(mm_positions, list)
+    assert len(mm_positions) == 1
+    assert mm_positions[0].offset == 0
+    assert mm_positions[0].length == 10
+
+
+def test_new_request_interface():
+    # protect against interface changes
+    from vllm.v1.core.sched.output import NewRequestData
+
+    assumes(NewRequestData, "req_id")
+    assumes(NewRequestData, "block_ids")
+    assumes(NewRequestData, "prompt_token_ids")
+    assumes(NewRequestData, "sampling_params")
+
+
+def test_sampling_params_interface():
+    # protect against interface changes
+    from vllm.sampling_params import SamplingParams
+
+    assumes(SamplingParams, "extra_args")
+
+    # dumb example use case in LMCache
+    kv_transfer_params = {
+        "lmcache.tag.user": "example_user_1",
+        "lmcache.ttl": 60,
+    }
+    sampling_params = SamplingParams(
+        extra_args={"kv_transfer_params": kv_transfer_params}
+    )
+    assert sampling_params.extra_args["kv_transfer_params"] == kv_transfer_params
+
+
+def test_tp_interface():
+    # protect against interface changes
+    import inspect
+
+    from vllm.distributed.parallel_state import get_tp_group
+
+    sig = inspect.signature(get_tp_group)
+    GroupCoordinator = sig.return_annotation
+
+    assumes(GroupCoordinator, "broadcast", is_callable=True)
+    assumes(GroupCoordinator, "broadcast_object", is_callable=True)
+
+
+def test_forward_context_interface():
+    # protect against interface changes
+    from vllm.forward_context import ForwardContext
+
+    assumes(ForwardContext, "no_compile_layers", is_instance_of=dict)
+    assumes(ForwardContext, "virtual_engine")
+    assumes(ForwardContext, "attn_metadata")
+
+
+def test_scheduler_output_interface():
+    # protect against interface changes
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+    assumes(SchedulerOutput, "finished_req_ids")
+    assumes(SchedulerOutput, "scheduled_new_reqs", is_instance_of=list)
+    assumes(SchedulerOutput, "num_scheduled_tokens", is_instance_of=dict)
+    assumes(SchedulerOutput, "scheduled_cached_reqs")
+
+    from vllm.v1.core.sched.output import CachedRequestData
+
+    assumes(CachedRequestData, "req_ids", is_instance_of=list)
+    assumes(CachedRequestData, "new_block_ids", is_instance_of=list)
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index b1780d8a9af801ce627bd8e05822c0bf2a7026c9..1c1ac915c758e3f97707f2d02dbee1cc053174ab 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -4,9 +4,22 @@ import filecmp
 import shutil
 import tempfile
 from pathlib import Path
+from typing import Any
+
+import pytest
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
+from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
+    MultiConnector,
+    MultiKVConnectorStats,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
+    NixlKVConnectorStats,
+)
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
@@ -19,6 +32,27 @@ PROMPTS = [
 SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20)
 
 
+# Test connector with custom stats for testing MultiConnector
+class MockConnectorStats(KVConnectorStats):
+    """Mock stats class for testing."""
+
+    pass
+
+
+class MockConnector(KVConnectorBase_V1):
+    """Mock connector that implements build_kv_connector_stats for testing."""
+
+    @classmethod
+    def build_kv_connector_stats(
+        cls, data: dict[str, Any] | None = None
+    ) -> KVConnectorStats | None:
+        return MockConnectorStats(data=data) if data is not None else None
+
+
+# Register the mock connector
+KVConnectorFactory.register_connector("MockConnector", __name__, MockConnector.__name__)
+
+
 # Helper function to compare directories recursively
 def _compare_directories(dir1: Path, dir2: Path) -> bool:
     """Compares two directories recursively for identical content."""
@@ -52,29 +86,26 @@ def test_multi_shared_storage_connector_consistency():
         kv_connector="MultiConnector",
         kv_role="kv_both",
         kv_connector_extra_config={
-            "connectors": [{
-                "kv_connector":
-                "TestSharedStorageConnector",
-                "kv_role":
-                "kv_both",
-                "kv_connector_extra_config": {
-                    "shared_storage_path": str(storage_1_path),
-                    "name": "storage1",
+            "connectors": [
+                {
+                    "kv_connector": "TestSharedStorageConnector",
+                    "kv_role": "kv_both",
+                    "kv_connector_extra_config": {
+                        "shared_storage_path": str(storage_1_path),
+                        "name": "storage1",
+                    },
+                    "kv_connector_module_path": "tests.v1.kv_connector.unit.utils",
                 },
-                "kv_connector_module_path":
-                "tests.v1.kv_connector.unit.utils",
-            }, {
-                "kv_connector":
-                "TestSharedStorageConnector",
-                "kv_role":
-                "kv_both",
-                "kv_connector_extra_config": {
-                    "shared_storage_path": str(storage_2_path),
-                    "name": "storage2",
+                {
+                    "kv_connector": "TestSharedStorageConnector",
+                    "kv_role": "kv_both",
+                    "kv_connector_extra_config": {
+                        "shared_storage_path": str(storage_2_path),
+                        "name": "storage2",
+                    },
+                    "kv_connector_module_path": "tests.v1.kv_connector.unit.utils",
                 },
-                "kv_connector_module_path":
-                "tests.v1.kv_connector.unit.utils",
-            }]
+            ]
         },
     )
 
@@ -93,14 +124,16 @@ def test_multi_shared_storage_connector_consistency():
     local_subdirs = list(storage_1_path.iterdir())
     external_subdirs = list(storage_2_path.iterdir())
 
-    assert len(
-        local_subdirs
-    ) > 0, f"Local storage path {storage_1_path} is empty after generation."
+    assert len(local_subdirs) > 0, (
+        f"Local storage path {storage_1_path} is empty after generation."
+    )
     assert len(external_subdirs) > 0, (
-        f"External storage path {storage_2_path} is empty after generation.")
+        f"External storage path {storage_2_path} is empty after generation."
+    )
     assert len(local_subdirs) == len(external_subdirs), (
         f"Mismatch in number of cache entries: "
-        f"Local={len(local_subdirs)}, External={len(external_subdirs)}")
+        f"Local={len(local_subdirs)}, External={len(external_subdirs)}"
+    )
 
     # The subdirectories should correspond to the prompt hashes
     # Since prompts are the same, the hash directories should be the same name
@@ -113,29 +146,39 @@ def test_multi_shared_storage_connector_consistency():
     # Compare the contents of each corresponding cache directory
     for subdir_name in local_subdir_names:
         print(f"Comparing contents of cache directory: {subdir_name}")
-        assert _compare_directories(storage_1_path / subdir_name,
-                                    storage_2_path / subdir_name), \
-            (f"Contents differ for cache directory '{subdir_name}' between "
-             f"{storage_1_path} and {storage_2_path}")
+        assert _compare_directories(
+            storage_1_path / subdir_name, storage_2_path / subdir_name
+        ), (
+            f"Contents differ for cache directory '{subdir_name}' between "
+            f"{storage_1_path} and {storage_2_path}"
+        )
 
     events = get_connector_events()
     # get_num_new_matched_tokens and update_state_after_alloc will be called
     # on each connector in turn.
     assert events["storage1-SCHEDULER"][:3] == [
-        'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
+        "get_num_new_matched_tokens 0",
+        "update_state_after_alloc num_blocks=[0] 0",
+        "build_connector_meta",
     ]
     assert events["storage1-WORKER"][:5] == [
-        'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
-        'wait_for_layer_load', 'save_kv_layer'
+        "register_kv_caches",
+        "bind_connector_metadata",
+        "start_load_kv",
+        "wait_for_layer_load",
+        "save_kv_layer",
     ]
     assert events["storage2-SCHEDULER"][:3] == [
-        'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
+        "get_num_new_matched_tokens 0",
+        "update_state_after_alloc num_blocks=[0] 0",
+        "build_connector_meta",
     ]
     assert events["storage2-WORKER"][:5] == [
-        'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
-        'wait_for_layer_load', 'save_kv_layer'
+        "register_kv_caches",
+        "bind_connector_metadata",
+        "start_load_kv",
+        "wait_for_layer_load",
+        "save_kv_layer",
     ]
 
     # Reset prefix cache or else we'll just get the tokens back from there.
@@ -151,12 +194,14 @@ def test_multi_shared_storage_connector_consistency():
     # on that one but with zero blocks for others (first nonzero match is
     # chosen).
     assert events["storage1-SCHEDULER"][:3] == [
-        'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
+        "get_num_new_matched_tokens 0",
+        "update_state_after_alloc num_blocks=[7] 96",
+        "build_connector_meta",
     ]
     assert events["storage2-SCHEDULER"][:3] == [
-        'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
+        "get_num_new_matched_tokens 0",
+        "update_state_after_alloc num_blocks=[0] 0",
+        "build_connector_meta",
     ]
 
     # Delete storage1 connector state
@@ -175,12 +220,14 @@ def test_multi_shared_storage_connector_consistency():
     # a hit, so update_state_after_alloc will only be called with allocated
     # blocks for the second connector.
     assert events["storage1-SCHEDULER"][:3] == [
-        'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
+        "get_num_new_matched_tokens 0",
+        "update_state_after_alloc num_blocks=[0] 0",
+        "build_connector_meta",
     ]
     assert events["storage2-SCHEDULER"][:3] == [
-        'get_num_new_matched_tokens 0',
-        'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
+        "get_num_new_matched_tokens 0",
+        "update_state_after_alloc num_blocks=[7] 96",
+        "build_connector_meta",
     ]
 
     # Clean up
@@ -191,15 +238,14 @@ def test_multi_shared_storage_connector_consistency():
 def get_connector_events() -> dict[str, list[str]]:
     # Read in connector events and reset the files.
     import glob
+
     event_files = glob.glob(tempfile.gettempdir() + "/connector_*_events.log")
     connector_events = {}
     for fname in event_files:
         name = fname.split("connector_")[1].split("_events.log")[0]
         try:
             with open(fname, "r+") as f:
-                connector_events[name] = [
-                    line.strip() for line in f if line.strip()
-                ]
+                connector_events[name] = [line.strip() for line in f if line.strip()]
                 f.truncate(0)
         except Exception as e:
             print(f"[ERROR] Could not read connector events for {name}: {e}")
@@ -211,5 +257,339 @@ def test_engine_id_conflict():
     configs = [KVTransferConfig() for _ in range(2)]
     ids = [config.engine_id for config in configs]
     assert ids[0] != ids[1], (
-        "Engine IDs should be different for different configs. "
-        f"Got {ids}")
+        f"Engine IDs should be different for different configs. Got {ids}"
+    )
+
+
+class TestMultiConnectorStats:
+    """Tests for MultiConnector stats reconstruction and operations."""
+
+    def test_build_kv_connector_stats_with_none(self):
+        """Test that build_kv_connector_stats returns empty stats when given None."""
+        stats = MultiConnector.build_kv_connector_stats(data=None)
+
+        assert stats is not None
+        assert isinstance(stats, MultiKVConnectorStats)
+        assert len(stats.data) == 0
+        assert stats.is_empty()
+
+    def test_build_kv_connector_stats_with_empty_dict(self):
+        """Test that build_kv_connector_stats returns empty stats with empty dict."""
+        stats = MultiConnector.build_kv_connector_stats(data={})
+
+        assert stats is not None
+        assert isinstance(stats, MultiKVConnectorStats)
+        assert len(stats.data) == 0
+        assert stats.is_empty()
+
+    def test_build_kv_connector_stats_reconstructs_nixl_stats(self):
+        """Test that NixlConnector stats are properly reconstructed with
+        correct data."""
+        serialized_data = {
+            "NixlConnector": {
+                "data": {
+                    "transfer_duration": [1.5, 2.3],
+                    "post_duration": [0.1, 0.2],
+                    "bytes_transferred": [1024, 2048],
+                    "num_descriptors": [10, 20],
+                    "num_failed_transfers": [],
+                    "num_failed_notifications": [],
+                }
+            }
+        }
+
+        stats = MultiConnector.build_kv_connector_stats(data=serialized_data)
+
+        assert "NixlConnector" in stats.data
+        nixl_stats = stats.data["NixlConnector"]
+        assert isinstance(nixl_stats, NixlKVConnectorStats)
+        assert nixl_stats.data["transfer_duration"] == [1.5, 2.3]
+        assert nixl_stats.data["post_duration"] == [0.1, 0.2]
+        assert nixl_stats.data["bytes_transferred"] == [1024, 2048]
+        assert nixl_stats.data["num_descriptors"] == [10, 20]
+
+    def test_build_kv_connector_stats_with_multiple_connectors(self):
+        """Test reconstruction with multiple connector types that have custom stats."""
+        serialized_data = {
+            "NixlConnector": {
+                "data": {
+                    "transfer_duration": [1.5],
+                    "post_duration": [0.1],
+                    "bytes_transferred": [1024],
+                    "num_descriptors": [10],
+                    "num_failed_transfers": [],
+                    "num_failed_notifications": [],
+                }
+            },
+            "MockConnector": {"data": {"mock_field": [1, 2, 3]}},
+        }
+
+        stats = MultiConnector.build_kv_connector_stats(data=serialized_data)
+
+        assert stats is not None
+        assert isinstance(stats, MultiKVConnectorStats)
+        # Both connectors should be reconstructed
+        assert len(stats.data) == 2
+        assert "NixlConnector" in stats.data
+        assert "MockConnector" in stats.data
+        assert isinstance(stats.data["NixlConnector"], NixlKVConnectorStats)
+        assert isinstance(stats.data["MockConnector"], MockConnectorStats)
+        # Verify data is preserved
+        assert stats.data["MockConnector"].data == {"mock_field": [1, 2, 3]}
+
+    def test_build_kv_connector_stats_raises_error_for_unknown_connector(self):
+        """Test that unknown connectors raise an error."""
+        serialized_data = {
+            "UnknownConnector": {"data": {"some_field": [1, 2, 3]}},
+            "NixlConnector": {
+                "data": {
+                    "transfer_duration": [1.5],
+                    "post_duration": [0.1],
+                    "bytes_transferred": [1024],
+                    "num_descriptors": [10],
+                    "num_failed_transfers": [],
+                    "num_failed_notifications": [],
+                }
+            },
+        }
+
+        with pytest.raises(
+            ValueError, match="Connector 'UnknownConnector' is not registered."
+        ):
+            MultiConnector.build_kv_connector_stats(data=serialized_data)
+
+    def test_build_kv_connector_stats_with_already_instantiated_objects(self):
+        """Test that already-instantiated stats objects are preserved (same process)."""
+        # This simulates the in-process case where stats are not serialized
+        nixl_stats = NixlKVConnectorStats(
+            data={
+                "transfer_duration": [1.5],
+                "post_duration": [0.1],
+                "bytes_transferred": [1024],
+                "num_descriptors": [10],
+                "num_failed_transfers": [],
+                "num_failed_notifications": [],
+            }
+        )
+        mock_stats = MockConnectorStats(data={"mock_field": [1, 2, 3]})
+
+        data_with_objects = {
+            "NixlConnector": nixl_stats,
+            "MockConnector": mock_stats,
+        }
+
+        stats = MultiConnector.build_kv_connector_stats(data=data_with_objects)
+
+        assert stats is not None
+        assert isinstance(stats, MultiKVConnectorStats)
+        assert len(stats.data) == 2
+        # Verify objects are preserved as-is
+        assert stats.data["NixlConnector"] is nixl_stats
+        assert stats.data["MockConnector"] is mock_stats
+
+    def test_build_kv_connector_stats_with_mixed_objects_and_dicts(self):
+        """Test handling mixed already-instantiated and serialized stats."""
+        # This can happen during transition or partial serialization
+        nixl_stats = NixlKVConnectorStats(
+            data={
+                "transfer_duration": [1.5],
+                "post_duration": [0.1],
+                "bytes_transferred": [1024],
+                "num_descriptors": [10],
+                "num_failed_transfers": [],
+                "num_failed_notifications": [],
+            }
+        )
+
+        mixed_data = {
+            "NixlConnector": nixl_stats,  # Already instantiated
+            "MockConnector": {"data": {"mock_field": [1, 2, 3]}},  # Serialized
+        }
+
+        stats = MultiConnector.build_kv_connector_stats(data=mixed_data)
+
+        assert stats is not None
+        assert isinstance(stats, MultiKVConnectorStats)
+        assert len(stats.data) == 2
+        # Instantiated object preserved
+        assert stats.data["NixlConnector"] is nixl_stats
+        # Serialized object reconstructed
+        assert isinstance(stats.data["MockConnector"], MockConnectorStats)
+        assert stats.data["MockConnector"].data == {"mock_field": [1, 2, 3]}
+
+    def test_build_kv_connector_stats_skips_connectors_without_custom_stats(self):
+        """Test that connectors without custom stats (return None) are skipped."""
+        # SharedStorageConnector doesn't override build_kv_connector_stats,
+        # so it returns None and should be skipped
+        serialized_data = {
+            "NixlConnector": {
+                "data": {
+                    "transfer_duration": [1.5],
+                    "post_duration": [0.1],
+                    "bytes_transferred": [1024],
+                    "num_descriptors": [10],
+                    "num_failed_transfers": [],
+                    "num_failed_notifications": [],
+                }
+            },
+            "SharedStorageConnector": {"data": {"some_field": [1, 2, 3]}},
+        }
+
+        stats = MultiConnector.build_kv_connector_stats(data=serialized_data)
+
+        assert stats is not None
+        assert isinstance(stats, MultiKVConnectorStats)
+        # Only NixlConnector should be reconstructed
+        assert len(stats.data) == 1
+        assert "NixlConnector" in stats.data
+        assert isinstance(stats.data["NixlConnector"], NixlKVConnectorStats)
+        # SharedStorageConnector should be skipped (returns None)
+        assert "SharedStorageConnector" not in stats.data
+
+    def test_build_kv_connector_stats_handles_malformed_data(self):
+        """Test that malformed data raises appropriate errors."""
+        serialized_data = {
+            "NixlConnector": {"wrong_field": {"transfer_duration": [1.5]}}
+        }
+
+        with pytest.raises(AssertionError, match="Expected a dict with a 'data' field"):
+            MultiConnector.build_kv_connector_stats(data=serialized_data)
+
+    def test_aggregate_same_connector(self):
+        """Test aggregating stats from the same connector type."""
+        stats1 = MultiKVConnectorStats(
+            data={
+                "NixlConnector": NixlKVConnectorStats(
+                    data={
+                        "transfer_duration": [1.0],
+                        "post_duration": [0.1],
+                        "bytes_transferred": [1024],
+                        "num_descriptors": [10],
+                        "num_failed_transfers": [],
+                        "num_failed_notifications": [],
+                    }
+                )
+            }
+        )
+
+        stats2 = MultiKVConnectorStats(
+            data={
+                "NixlConnector": NixlKVConnectorStats(
+                    data={
+                        "transfer_duration": [2.0],
+                        "post_duration": [0.2],
+                        "bytes_transferred": [2048],
+                        "num_descriptors": [20],
+                        "num_failed_transfers": [],
+                        "num_failed_notifications": [],
+                    }
+                )
+            }
+        )
+
+        result = stats1.aggregate(stats2)
+
+        assert result is stats1  # Should return self
+        assert "NixlConnector" in result.data
+        nixl_stats = result.data["NixlConnector"]
+        assert nixl_stats.data["transfer_duration"] == [1.0, 2.0]
+        assert nixl_stats.data["post_duration"] == [0.1, 0.2]
+        assert nixl_stats.data["bytes_transferred"] == [1024, 2048]
+        assert nixl_stats.data["num_descriptors"] == [10, 20]
+
+    def test_aggregate_new_connector(self):
+        """Test aggregating stats when a new connector type appears."""
+        from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
+            KVConnectorStats,
+        )
+
+        stats1 = MultiKVConnectorStats(
+            data={
+                "NixlConnector": NixlKVConnectorStats(
+                    data={
+                        "transfer_duration": [1.0],
+                        "post_duration": [0.1],
+                        "bytes_transferred": [1024],
+                        "num_descriptors": [10],
+                        "num_failed_transfers": [],
+                        "num_failed_notifications": [],
+                    }
+                )
+            }
+        )
+
+        stats2 = MultiKVConnectorStats(
+            data={"SharedStorageConnector": KVConnectorStats(data={"field": [1, 2]})}
+        )
+
+        result = stats1.aggregate(stats2)
+
+        assert "NixlConnector" in result.data
+        assert "SharedStorageConnector" in result.data
+
+    def test_reduce(self):
+        """Test that reduce() correctly reduces all nested connector stats."""
+        stats = MultiKVConnectorStats(
+            data={
+                "NixlConnector": NixlKVConnectorStats(
+                    data={
+                        "transfer_duration": [1.0, 2.0],
+                        "post_duration": [0.1, 0.2],
+                        "bytes_transferred": [1024, 2048],
+                        "num_descriptors": [10, 20],
+                        "num_failed_transfers": [],
+                        "num_failed_notifications": [],
+                    }
+                )
+            }
+        )
+
+        reduced = stats.reduce()
+
+        assert "NixlConnector" in reduced
+        assert isinstance(reduced["NixlConnector"], dict)
+        # Check that the stats were reduced (should have aggregated values)
+        assert "Num successful transfers" in reduced["NixlConnector"]
+        assert reduced["NixlConnector"]["Num successful transfers"] == 2
+
+    def test_reset(self):
+        """Test that reset() resets all nested connector stats."""
+        stats = MultiKVConnectorStats(
+            data={
+                "NixlConnector": NixlKVConnectorStats(
+                    data={
+                        "transfer_duration": [1.0, 2.0],
+                        "post_duration": [0.1, 0.2],
+                        "bytes_transferred": [1024, 2048],
+                        "num_descriptors": [10, 20],
+                        "num_failed_transfers": [],
+                        "num_failed_notifications": [],
+                    }
+                )
+            }
+        )
+
+        assert not stats.is_empty()
+
+        stats.reset()
+
+        # After reset, stats should be empty
+        assert stats.is_empty()
+        nixl_stats = stats.data["NixlConnector"]
+        assert len(nixl_stats.data["transfer_duration"]) == 0
+
+    def test_is_empty_with_multiple_connectors(self):
+        """Test is_empty() returns correct value with multiple connectors."""
+        # All empty
+        stats = MultiKVConnectorStats(
+            data={
+                "NixlConnector": NixlKVConnectorStats(data={}),
+            }
+        )
+        # Initialize empty stats
+        stats.data["NixlConnector"].reset()
+        assert stats.is_empty()
+
+        # One non-empty
+        stats.data["NixlConnector"].data["transfer_duration"].append(1.0)
+        assert not stats.is_empty()
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 578bf02eb51924b8b3a857955e5edfbe714fc6d1..b264e5108c16d60d827dbd5aa50ca7b596ece48d 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -9,9 +9,9 @@ import textwrap
 import time
 import uuid
 from collections import defaultdict
-from typing import Optional
 from unittest.mock import patch
 
+import numpy as np
 import pytest
 import ray
 import torch
@@ -19,20 +19,29 @@ import torch
 from vllm import LLM
 from vllm.config import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
-from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-    KVConnectorStats)
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
-    MultiKVConnectorStats)
+    MultiKVConnectorStats,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
-    KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
-    NixlConnectorWorker, NixlKVConnectorStats)
+    KVConnectorRole,
+    NixlAgentMetadata,
+    NixlConnector,
+    NixlConnectorMetadata,
+    NixlConnectorScheduler,
+    NixlConnectorWorker,
+    NixlKVConnectorStats,
+)
 from vllm.distributed.kv_transfer.kv_transfer_state import (
-    ensure_kv_transfer_shutdown, has_kv_transfer_group)
+    ensure_kv_transfer_shutdown,
+    has_kv_transfer_group,
+)
 from vllm.forward_context import ForwardContext
 from vllm.platforms.interface import Platform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
+from vllm.v1.request import RequestStatus
 
 from .utils import create_request, create_scheduler, create_vllm_config
 
@@ -41,14 +50,14 @@ from .utils import create_request, create_scheduler, create_vllm_config
 def clear_kv_transfer():
     """
     The test cases in this file use `VLLM_ENABLE_V1_MULTIPROCESSING=0`,
-    causing the global variable `_KV_CONNECTOR_AGENT` 
+    causing the global variable `_KV_CONNECTOR_AGENT`
     to be assigned but never deleted.
 
-    Since the current pytest process does not terminate and instead 
+    Since the current pytest process does not terminate and instead
     continues running tests from other files,
-    this global variable remains in memory and interferes 
+    this global variable remains in memory and interferes
     with test cases in other modules.
-    
+
     So we use this fixture to ensure that the global variable
     `_KV_CONNECTOR_AGENT` is properly cleaned up after each test.
     """
@@ -57,12 +66,33 @@ def clear_kv_transfer():
         ensure_kv_transfer_shutdown()
 
 
+def get_default_xfer_telemetry(
+    xferDurationS: float = 1,
+    postDurationS: float = 1,
+    totalBytes: int = 1,
+    descCount: int = 1,
+) -> dict:
+    class AttributeDict(dict):
+        __slots__ = ()
+        __getattr__ = dict.__getitem__
+        __setattr__ = dict.__setitem__  # type: ignore[assignment]
+
+    # We can't instantiate nixlXferTelemetry because it's read only and
+    # ray env does not have NIXL, so we must fake it
+    return AttributeDict(
+        xferDuration=xferDurationS * 1e6,  # in us
+        postDuration=postDurationS * 1e6,  # in us
+        totalBytes=totalBytes,
+        descCount=descCount,
+    )
+
+
 class FakeNixlWrapper:
     """Mock implementation of NixlWrapper for testing.
 
     We don't inherit from nixl._api.nixl_agent because nixl may not be
     installed.
-    
+
     Note: The complete source of this class is also used in the
     `_make_fake_nixl_pkg` function to create a fake nixl package
     for Ray workers.
@@ -73,8 +103,7 @@ class FakeNixlWrapper:
 
     def __init__(self, agent_name: str, *args, **kwargs):
         self._cycles_before_xfer_done = 0
-        self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict(
-            lambda: 0)
+        self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict(lambda: 0)
 
     def get_reg_descs(self, caches_data, memory_type: str) -> list:
         return [str(uuid.uuid4()) for _ in caches_data]
@@ -102,8 +131,7 @@ class FakeNixlWrapper:
         return {}
 
     def check_xfer_state(self, handle: int) -> str:
-        if self._check_xfer_state_cycles[
-                handle] >= self._cycles_before_xfer_done:
+        if self._check_xfer_state_cycles[handle] >= self._cycles_before_xfer_done:
             return "DONE"
         self._check_xfer_state_cycles[handle] += 1
         return "PROC"
@@ -120,18 +148,23 @@ class FakeNixlWrapper:
     def send_notif(self, agent_name: str, notif_msg: bytes) -> None:
         pass
 
-    def make_prepped_xfer(self,
-                          xfer_type: str,
-                          local_xfer_side_handle: int,
-                          local_block_descs_ids: list[int],
-                          remote_xfer_side_handle: int,
-                          remote_block_descs_ids: list[int],
-                          notif_msg: Optional[bytes] = None) -> int:
+    def make_prepped_xfer(
+        self,
+        xfer_type: str,
+        local_xfer_side_handle: int,
+        local_block_descs_ids: list[int],
+        remote_xfer_side_handle: int,
+        remote_block_descs_ids: list[int],
+        notif_msg: bytes | None = None,
+    ) -> int:
         return uuid.uuid4().int
 
     def transfer(self, handle: int) -> str:
         return "PROC"
 
+    def get_xfer_telemetry(self, handle: int) -> dict:
+        return get_default_xfer_telemetry()
+
     ############################################################
     # Follow are for changing the behavior during testing.
     ############################################################
@@ -144,7 +177,7 @@ class FakeNixlWrapper:
 def _make_fake_nixl_pkg():
     """Context manager that creates a temporary package making
        `from nixl._api import nixl_agent` resolve to our FakeNixlWrapper.
-       
+
     Automatically cleans up the temporary directory when done.
     """
     with tempfile.TemporaryDirectory() as td:
@@ -159,7 +192,6 @@ def _make_fake_nixl_pkg():
 # Copy of FakeNixlWrapper implementation for Ray workers
 import uuid
 from collections import defaultdict
-from typing import Optional
 
 {fake_nixl_source}
 
@@ -169,6 +201,11 @@ nixl_agent = FakeNixlWrapper
         with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
             f.write(stub)
 
+        # Mock nixlXferTelemetry class
+        pkg_root2 = os.path.join(td, "nixl", "_bindings")
+        os.makedirs(pkg_root2, exist_ok=True)
+        with open(os.path.join(pkg_root2, "__init__.py"), "w") as f:
+            f.write("class nixlXferTelemetry: pass")
         # touch parent package
         open(os.path.join(td, "nixl", "__init__.py"), "w").close()
         yield td
@@ -185,10 +222,12 @@ def test_basic_interface():
     NUM_EXTERNAL_FULL_BLOCKS = 2
     NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
 
-    request = create_request(request_id=1,
-                             block_size=BLOCK_SIZE,
-                             num_tokens=NUM_TOKENS,
-                             do_remote_prefill=True)
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_prefill=True,
+    )
     request_id = request.request_id
 
     scheduler.add_request(request)
@@ -204,8 +243,11 @@ def test_basic_interface():
     req_meta = kv_connector_metadata.reqs_to_recv[request_id]
 
     for block_id, block in zip(
-            req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
-            single_type_managers[0].req_to_blocks[request_id]):
+        req_meta.local_block_ids,
+        scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[
+            request_id
+        ],
+    ):
         assert block_id == block.block_id
 
 
@@ -225,11 +267,13 @@ def test_prompt_less_than_block_size():
     NUM_TOKENS = int(BLOCK_SIZE * 0.5)
 
     # Request will have 1 partial remote block.
-    request = create_request(request_id=1,
-                             block_size=BLOCK_SIZE,
-                             num_tokens=NUM_TOKENS,
-                             do_remote_prefill=True,
-                             num_remote_blocks=1)
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_prefill=True,
+        num_remote_blocks=1,
+    )
     scheduler.add_request(request)
     scheduler_output = scheduler.schedule()
 
@@ -241,16 +285,105 @@ def test_prompt_less_than_block_size():
     assert len(scheduler_output.scheduled_new_reqs) == 0
 
 
-class FakeNixlConnectorWorker(NixlConnectorWorker):
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_kv_transfer_handshake(dist_init):
+    """Unit test for basic NixlConnector interface functionality."""
 
+    # Test setup, we creates a scheduler that contains a NixlConnector
+    # of role SCHEDULER, and expect it to be serving NixlAgentMetadata from
+    # all workers of the instance.
+    vllm_config = create_vllm_config()
+    # in case the test runs on non-GPU machine
+    vllm_config.kv_transfer_config.kv_buffer_device = "cpu"
+    scheduler = create_scheduler(vllm_config)
+
+    # Create two NixlConnector of role WORKER, one is the worker of
+    # the scheduler (prefill), the other is a worker of decode instance.
+
+    # Prefill connector will register KV cache to populate proper handshake
+    # metadata.
+    prefill_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+    kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
+        num_blocks=2, block_size=16, num_kv_heads=4, head_size=64
+    )
+    shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+    unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
+    kv_caches = {
+        "layer0": shared_tensor,
+        "layer1": unique_tensor,
+        "layer2": shared_tensor,
+    }
+    prefill_connector.register_kv_caches(kv_caches)
+
+    # Simulate EngineCore initialization that would
+    # gather connector metadata from all workers, the scheduler connector
+    # expects metadata to be in dict[int, KVConnectorHandshakeMetadata],
+    # where the first key is the dp_rank, the second key is the tp_rank.
+    metadata = {0: prefill_connector.get_handshake_metadata()}
+    scheduler_connector = scheduler.get_kv_connector()
+    scheduler_connector.set_xfer_handshake_metadata(metadata)
+
+    # Simulate a request that finishes prefill, which returns
+    # corresponding NixlConnectorMetadata for decode instance.
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+    NUM_EXTERNAL_FULL_BLOCKS = 2
+    NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
+
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+    )
+    request.status = RequestStatus.FINISHED_LENGTH_CAPPED
+    delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished(
+        request, [0, 1, 2]
+    )
+    assert delay
+
+    # Decode connector will be able to create handshake with the prefill connector.
+    decode_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+
+    # Here we are testing the retrieval of NIXLAgentMetadata.
+    # Knowing the implementation detail, we override the add_remote_agent
+    # to validate the metadata received is the same as the one in prefill_connector.
+    with patch.object(
+        decode_connector.connector_worker, "add_remote_agent"
+    ) as mock_add_remote_agent:
+        mock_add_remote_agent.return_type = "remote_agent"
+
+        decode_connector.connector_worker._nixl_handshake(
+            kv_connector_metadata["remote_host"],
+            kv_connector_metadata["remote_port"],
+            kv_connector_metadata["tp_size"],
+            kv_connector_metadata["remote_engine_id"],
+        )
+
+        received_metadata = mock_add_remote_agent.call_args.args
+        assert received_metadata[1] == 0  # remote_tp_rank
+        assert received_metadata[2] == 1  # remote_tp_size
+        assert metadata[0] == received_metadata[0]
+
+    # Need to shutdown the background thread to release NIXL side channel port
+    scheduler_connector.shutdown()
+
+
+class FakeNixlConnectorWorker(NixlConnectorWorker):
     REMOTE_ENGINE_ID = "remote_engine"
 
-    def __init__(self, *args, hand_shake_latency: float = 1.8, **kwargs):
+    def __init__(
+        self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs
+    ):
         super().__init__(*args, **kwargs)
         self._hand_shake_latency = hand_shake_latency
+        self.kv_cache_layout = kv_cache_layout
 
-    def _nixl_handshake(self, host: str, port: int, remote_tp_size: int,
-                        expected_engine_id: str) -> dict[int, str]:
+    def _nixl_handshake(
+        self, host: str, port: int, remote_tp_size: int, expected_engine_id: str
+    ) -> dict[int, str]:
         # Mimic slow _nixl_handshake, as well as bypass zmq communication.
         time.sleep(self._hand_shake_latency)
         # These should've been done in register_kv_caches(), called by
@@ -268,28 +401,32 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
                 engine_id=self.REMOTE_ENGINE_ID,
                 agent_metadata=FakeNixlWrapper.AGENT_METADATA,
                 kv_caches_base_addr=[0],
+                device_id=0,
                 num_blocks=1,
                 block_lens=self.block_len_per_layer,
                 attn_backend_name=self.backend_name,
                 # `self.kv_cache_layout` is only forced to HND when vllm engine
                 # is started. We mock HND here.
                 kv_cache_layout="HND",
+                block_size=self.block_size,
             ),
-            remote_tp_size=remote_tp_size)
+            remote_tp_size=remote_tp_size,
+        )
         return {0: remote_agent_name}
 
 
 class TestNixlHandshake:
-
     @patch(
         "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-        FakeNixlWrapper)
+        FakeNixlWrapper,
+    )
     def test_multi_xfer_one_engine(
         self,
         # dist_init is a fixture that initializes the distributed environment.
-        dist_init):
+        dist_init,
+    ):
         """Test case where multiple xfers are initiated to the same engine.
-        
+
         This test triggers the connector to load remote KV for the same
         `request_id`. The transfer is not done immediately due to
         `set_cycles_before_xfer_done`, so there is a state where there are
@@ -303,9 +440,9 @@ class TestNixlHandshake:
         # Test worker role in decode server.
         connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
         connector.connector_worker = FakeNixlConnectorWorker(
-            vllm_config, connector.engine_id, hand_shake_latency=0)
-        assert isinstance(connector.connector_worker.nixl_wrapper,
-                          FakeNixlWrapper)
+            vllm_config, connector.engine_id, hand_shake_latency=0
+        )
+        assert isinstance(connector.connector_worker.nixl_wrapper, FakeNixlWrapper)
         connector.connector_worker.nixl_wrapper.set_cycles_before_xfer_done(3)
         num_xfers = 4
         while True:
@@ -316,21 +453,19 @@ class TestNixlHandshake:
                 num_xfers -= 1
                 metadata.add_new_req(
                     request_id=request_id,
-                    local_block_ids=[
-                        num_xfers + 1, num_xfers + 2, num_xfers + 3
-                    ],
+                    local_block_ids=[num_xfers + 1, num_xfers + 2, num_xfers + 3],
                     kv_transfer_params={
-                        "remote_block_ids":
-                        [num_xfers + 4, num_xfers + 5, num_xfers + 6],
-                        "remote_engine_id":
-                        FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
-                        "remote_host":
-                        "localhost",
-                        "remote_port":
-                        1234,
-                        "remote_tp_size":
-                        1,
-                    })
+                        "remote_block_ids": [
+                            num_xfers + 4,
+                            num_xfers + 5,
+                            num_xfers + 6,
+                        ],
+                        "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                        "remote_host": "localhost",
+                        "remote_port": 1234,
+                        "remote_tp_size": 1,
+                    },
+                )
             connector.bind_connector_metadata(metadata)
 
             # Mimic maybe_setup_kv_connector in gpu_model_runner.
@@ -342,8 +477,9 @@ class TestNixlHandshake:
             _before_load = time.perf_counter()
             connector.start_load_kv(dummy_ctx)
             _after_load = time.perf_counter()
-            assert _after_load - _before_load < 0.1, "start_load_kv took " \
-                f"{_after_load - _before_load} seconds"
+            assert _after_load - _before_load < 0.1, (
+                f"start_load_kv took {_after_load - _before_load} seconds"
+            )
 
             # Mimic get_finished_kv_transfers in gpu_model_runner.
             _, done_recving = connector.get_finished(finished_req_ids=set())
@@ -355,20 +491,25 @@ class TestNixlHandshake:
 
     @patch(
         "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-        FakeNixlWrapper)
-    @pytest.mark.parametrize("decode_tp_size, prefill_tp_size", [
-        (1, 1),
-        (2, 1),
-        (4, 2),
-        (4, 4),
-    ])
+        FakeNixlWrapper,
+    )
+    @pytest.mark.parametrize(
+        "decode_tp_size, prefill_tp_size",
+        [
+            (1, 1),
+            (2, 1),
+            (4, 2),
+            (4, 4),
+        ],
+    )
     def test_async_load_kv(
-            self,
-            # Fixture that initializes the distributed environment.
-            dist_init,
-            # Simulate consumer-producer TP sizes.
-            decode_tp_size,
-            prefill_tp_size):
+        self,
+        # Fixture that initializes the distributed environment.
+        dist_init,
+        # Simulate consumer-producer TP sizes.
+        decode_tp_size,
+        prefill_tp_size,
+    ):
         """Test that NixlConnector's start_load_kv should be non-blocking."""
 
         vllm_config = create_vllm_config()
@@ -377,18 +518,20 @@ class TestNixlHandshake:
         # Test worker role in decode server.
         connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
         connector.connector_worker = FakeNixlConnectorWorker(
-            vllm_config, connector.engine_id)
+            vllm_config, connector.engine_id
+        )
         metadata = NixlConnectorMetadata()
-        metadata.add_new_req(request_id="id",
-                             local_block_ids=[1, 2, 3],
-                             kv_transfer_params={
-                                 "remote_block_ids": [4, 5, 6],
-                                 "remote_engine_id":
-                                 FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
-                                 "remote_host": "localhost",
-                                 "remote_port": 1234,
-                                 "remote_tp_size": prefill_tp_size,
-                             })
+        metadata.add_new_req(
+            request_id="id",
+            local_block_ids=[1, 2, 3],
+            kv_transfer_params={
+                "remote_block_ids": [4, 5, 6],
+                "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                "remote_host": "localhost",
+                "remote_port": 1234,
+                "remote_tp_size": prefill_tp_size,
+            },
+        )
         connector.bind_connector_metadata(metadata)
 
         timeout = 2.5
@@ -402,8 +545,9 @@ class TestNixlHandshake:
             _before_load = time.perf_counter()
             connector.start_load_kv(dummy_ctx)
             _after_load = time.perf_counter()
-            assert _after_load - _before_load < 0.1, "start_load_kv took " \
-                f"{_after_load - _before_load} seconds"
+            assert _after_load - _before_load < 0.1, (
+                f"start_load_kv took {_after_load - _before_load} seconds"
+            )
             time.sleep(0.5)  # backoff for the async handshake to complete.
             connector.bind_connector_metadata(NixlConnectorMetadata())
             _, done_recving = connector.get_finished(finished_req_ids=set())
@@ -413,11 +557,13 @@ class TestNixlHandshake:
 
     @patch(
         "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-        FakeNixlWrapper)
+        FakeNixlWrapper,
+    )
     def test_concurrent_load_kv(
         self,
         # dist_init is a fixture that initializes the distributed environment.
-        dist_init):
+        dist_init,
+    ):
         """Test that multiple start_load_kv calls should occur concurrently."""
 
         vllm_config = create_vllm_config()
@@ -425,20 +571,22 @@ class TestNixlHandshake:
         # Test worker role in decode server.
         connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
         connector.connector_worker = FakeNixlConnectorWorker(
-            vllm_config, connector.engine_id)
+            vllm_config, connector.engine_id
+        )
         metadata = NixlConnectorMetadata()
         total_reqs = 5
         for i in range(total_reqs):
-            metadata.add_new_req(request_id=f"id_{i}",
-                                 local_block_ids=[1, 2, 3],
-                                 kv_transfer_params={
-                                     "remote_block_ids": [4, 5, 6],
-                                     "remote_engine_id":
-                                     FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
-                                     "remote_host": "localhost",
-                                     "remote_port": 1234,
-                                     "remote_tp_size": 1,
-                                 })
+            metadata.add_new_req(
+                request_id=f"id_{i}",
+                local_block_ids=[1, 2, 3],
+                kv_transfer_params={
+                    "remote_block_ids": [4, 5, 6],
+                    "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                    "remote_host": "localhost",
+                    "remote_port": 1234,
+                    "remote_tp_size": 1,
+                },
+            )
         connector.bind_connector_metadata(metadata)
 
         timeout = 2.5 * total_reqs
@@ -453,8 +601,9 @@ class TestNixlHandshake:
             _before_load = time.perf_counter()
             connector.start_load_kv(dummy_ctx)
             _after_load = time.perf_counter()
-            assert _after_load - _before_load < 0.1, "start_load_kv took " \
-                f"{_after_load - _before_load} seconds"
+            assert _after_load - _before_load < 0.1, (
+                f"start_load_kv took {_after_load - _before_load} seconds"
+            )
             time.sleep(0.5)  # backoff for the async handshake to complete.
             connector.bind_connector_metadata(NixlConnectorMetadata())
             _, done_recving = connector.get_finished(finished_req_ids=set())
@@ -466,7 +615,8 @@ class TestNixlHandshake:
 
     @patch(
         "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-        FakeNixlWrapper)
+        FakeNixlWrapper,
+    )
     def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init):
         """
         Verify that adding a remote agent fails if kv_cache_layout differs.
@@ -477,12 +627,14 @@ class TestNixlHandshake:
         # Mock TP world size to 2 to force heterogeneous TP when
         # remote_tp_size=1
         with patch(
-                "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size",  # noqa: E501
-                return_value=2):
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size",  # noqa: E501
+            return_value=2,
+        ):
             # Initialize connector and worker (with fake NIXL wrapper)
             connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
             connector.connector_worker = FakeNixlConnectorWorker(
-                vllm_config, connector.engine_id, hand_shake_latency=0)
+                vllm_config, connector.engine_id, hand_shake_latency=0
+            )
             worker = connector.connector_worker
 
             # Minimal local registration params used by add_remote_agent
@@ -492,40 +644,95 @@ class TestNixlHandshake:
             worker.dst_num_blocks[worker.engine_id] = worker.num_blocks
 
             # Metadata with different kv_cache_layout than local worker
-            mismatched_layout = "HND" if worker.kv_cache_layout != "HND" \
-                else "NHD"
+            mismatched_layout = "HND" if worker.kv_cache_layout != "HND" else "NHD"
             meta = NixlAgentMetadata(
                 engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
                 agent_metadata=FakeNixlWrapper.AGENT_METADATA,
                 kv_caches_base_addr=[0],
+                device_id=0,
                 num_blocks=1,
                 block_lens=worker.block_len_per_layer,
                 attn_backend_name=worker.backend_name,
                 kv_cache_layout=mismatched_layout,
+                block_size=worker.block_size,
             )
 
-            # We don't check layout for homogeneous TP and MLA for now, as the
-            # whole block is moved.
-            worker.add_remote_agent(meta, remote_tp_size=2)
+            with pytest.raises(RuntimeError):
+                # mismatched layout is expected to fail
+                worker.add_remote_agent(meta, remote_tp_size=2)
             with pytest.raises(AssertionError):
                 worker.add_remote_agent(meta, remote_tp_size=1)
 
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+        FakeNixlWrapper,
+    )
+    def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental(
+        self, dist_init
+    ):
+        """
+        Verify that adding a remote agent fails if kv_cache_layout differs.
+        This test is only relevant for heterogeneous TP.
+        """
+        vllm_config = create_vllm_config(enable_permute_local_kv=True)
+
+        # Mock TP world size to 2 to force heterogeneous TP when
+        # remote_tp_size=1
+        with patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size",  # noqa: E501
+            return_value=2,
+        ):
+            # Initialize connector and worker (with fake NIXL wrapper)
+            connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+            connector.connector_worker = FakeNixlConnectorWorker(
+                vllm_config,
+                connector.engine_id,
+                hand_shake_latency=0,
+                kv_cache_layout="NHD",
+            )
+            worker = connector.connector_worker
+
+            # Minimal local registration params used by add_remote_agent
+            worker.slot_size_per_layer = [2048]
+            worker.block_len_per_layer = [2048 * worker.block_size]
+            worker.num_blocks = 1
+            worker.dst_num_blocks[worker.engine_id] = worker.num_blocks
+
+            # Metadata with different kv_cache_layout than local worker
+            meta = NixlAgentMetadata(
+                engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                agent_metadata=FakeNixlWrapper.AGENT_METADATA,
+                kv_caches_base_addr=[0],
+                device_id=0,
+                num_blocks=1,
+                # prefill TP=1, decode TP=2, remote block_lens is double to local
+                block_lens=[i * 2 for i in worker.block_len_per_layer],
+                attn_backend_name=worker.backend_name,
+                kv_cache_layout="HND",
+                block_size=worker.block_size,
+            )
+
+            # We don't check layout for homogeneous TP and MLA for now, as the
+            # whole block is moved.
+            worker.add_remote_agent(meta, remote_tp_size=1)
+
 
 # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which
 # we put here is important. First run ray, it will clean up the resources, then
 # the rest of the tests.
 @patch(
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-    FakeNixlWrapper)
+    FakeNixlWrapper,
+)
 def test_kv_connector_stats(dist_init):
     """Test that KV transfer stats are properly recorded and retrieved."""
     vllm_config = create_vllm_config()
 
     # Test worker role in decode server.
     connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
-    connector.connector_worker = FakeNixlConnectorWorker(vllm_config,
-                                                         connector.engine_id,
-                                                         hand_shake_latency=0)
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config, connector.engine_id, hand_shake_latency=0
+    )
 
     # Verify that xfer_stats starts empty
     initial_stats = connector.get_kv_connector_stats()
@@ -534,16 +741,17 @@ def test_kv_connector_stats(dist_init):
     # Create transfer metadata
     request_id = "test_req_for_stats"
     metadata = NixlConnectorMetadata()
-    metadata.add_new_req(request_id=request_id,
-                         local_block_ids=[1, 2, 3],
-                         kv_transfer_params={
-                             "remote_block_ids": [4, 5, 6],
-                             "remote_engine_id":
-                             FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
-                             "remote_host": "localhost",
-                             "remote_port": 1234,
-                             "remote_tp_size": 1,
-                         })
+    metadata.add_new_req(
+        request_id=request_id,
+        local_block_ids=[1, 2, 3],
+        kv_transfer_params={
+            "remote_block_ids": [4, 5, 6],
+            "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_host": "localhost",
+            "remote_port": 1234,
+            "remote_tp_size": 1,
+        },
+    )
     connector.bind_connector_metadata(metadata)
 
     # Start the transfer
@@ -564,8 +772,7 @@ def test_kv_connector_stats(dist_init):
         _, done_recving = connector.get_finished(finished_req_ids=set())
         if len(done_recving) > 0 and request_id in done_recving:
             break
-        time.sleep(
-            0.1)  # Small delay to allow background handshake to complete
+        time.sleep(0.1)  # Small delay to allow background handshake to complete
     else:
         assert "Transfer did not complete within expected iterations"
 
@@ -575,7 +782,7 @@ def test_kv_connector_stats(dist_init):
 
     # Verify stats values are recorded
     assert not stats_after_transfer.is_empty()
-    assert stats_after_transfer.data["num_successful_transfers"] == 1
+    assert stats_after_transfer.num_successful_transfers == 1
 
     # Verify stats are reset after retrieval
     stats_after_reset = connector.get_kv_connector_stats()
@@ -584,13 +791,13 @@ def test_kv_connector_stats(dist_init):
 
 def test_kv_connector_stats_aggregation():
     """
-    Test KV transfer stats aggregation across TP ranks using 
+    Test KV transfer stats aggregation across TP ranks using
     KVOutputAggregator (used by MultiprocExecutor).
     """
 
     # Create KVOutputAggregator for 3 workers (simulating TP=3), same thing
     # done in MultiprocExecutor.execute_model
-    aggregator = KVOutputAggregator(world_size=3)
+    aggregator = KVOutputAggregator(expected_finished_count=3)
 
     # Create stats for multiple workers with different transfer patterns
     worker1_stats = NixlKVConnectorStats()
@@ -599,44 +806,54 @@ def test_kv_connector_stats_aggregation():
 
     # Record different transfers on each worker
     # Worker 1: 2 transfers
-    worker1_stats.record_transfer()
-    worker1_stats.record_transfer()
+    stats = get_default_xfer_telemetry()
+    worker1_stats.record_transfer(stats)
+    worker1_stats.record_transfer(stats)
 
     # Worker 2: 1 transfer
-    worker2_stats.record_transfer()
+    worker2_stats.record_transfer(stats)
 
     # Worker 3: 3 transfers
-    worker3_stats.record_transfer()
-    worker3_stats.record_transfer()
-    worker3_stats.record_transfer()
+    stats = get_default_xfer_telemetry(
+        xferDurationS=2, postDurationS=2, totalBytes=2, descCount=2
+    )
+    worker3_stats.record_transfer(stats)
+    worker3_stats.record_transfer(stats)
+    worker3_stats.record_transfer(stats)
 
     # Create ModelRunnerOutput instances for each worker
     worker_outputs = []
-    for i, worker_stats in enumerate(
-        [worker1_stats, worker2_stats, worker3_stats]):
+    for i, worker_stats in enumerate([worker1_stats, worker2_stats, worker3_stats]):
         output = ModelRunnerOutput(
             req_ids=[f"req_{i}"],
             req_id_to_index={f"req_{i}": 0},
-            sampled_token_ids=[[123]],  # dummy token
+            sampled_token_ids=[np.array([123])],  # dummy token
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[None],
             kv_connector_output=KVConnectorOutput(
                 finished_sending=set([f"req_{i}_send"])
-                if i < 2 else None,  # Workers 0,1 finished sending
+                if i < 2
+                else None,  # Workers 0,1 finished sending
                 finished_recving=set([f"req_{i}_recv"])
-                if i > 0 else None,  # Workers 1,2 finished receiving
+                if i > 0
+                else None,  # Workers 1,2 finished receiving
                 kv_connector_stats=worker_stats,
-            ))
+            ),
+        )
         worker_outputs.append(output)
 
     # Use the real aggregation mechanism (like MultiprocExecutor.execute_model)
     aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
-    kv_connector_stats = \
-        aggregated_output.kv_connector_output.kv_connector_stats
+    kv_connector_stats = aggregated_output.kv_connector_output.kv_connector_stats
     assert isinstance(kv_connector_stats, NixlKVConnectorStats)
     # Number of total transfers across all workers.
-    assert kv_connector_stats.data["num_successful_transfers"] == 6
+    assert kv_connector_stats.num_successful_transfers == 6
+    # Logging proc, call reduce() to get CLI-friendly stats.
+    cli_stats = kv_connector_stats.reduce()
+    assert cli_stats["Avg xfer time (ms)"] == 1500.0
+    assert cli_stats["Avg post time (ms)"] == 1500.0
+    assert cli_stats["Avg number of descriptors"] == 1.5
 
 
 def test_multi_kv_connector_stats_aggregation():
@@ -645,13 +862,13 @@ def test_multi_kv_connector_stats_aggregation():
     KVOutputAggregator (used by MultiprocExecutor).
     """
 
-    aggregator = KVOutputAggregator(world_size=3)
+    aggregator = KVOutputAggregator(expected_finished_count=3)
 
     from dataclasses import dataclass
 
+    # Mock a KVConnectorStats class for testing aggregation over connectors.
     @dataclass
     class FooKVConnectorStats(KVConnectorStats):
-
         def reset(self):
             self.data = {"num_foo_transfers": 0}
 
@@ -663,20 +880,17 @@ def test_multi_kv_connector_stats_aggregation():
         def is_empty(self) -> bool:
             return self.data["num_foo_transfers"] == 0
 
-        def aggregate(self,
-                      other: "FooKVConnectorStats") -> "FooKVConnectorStats":
+        def aggregate(self, other: "FooKVConnectorStats") -> "FooKVConnectorStats":
             if not other.is_empty():
-                self.data["num_foo_transfers"] += other.data[
-                    "num_foo_transfers"]
+                self.data["num_foo_transfers"] += other.data["num_foo_transfers"]
             return self
 
-    def make_multi_stats(nixl_count: int,
-                         foo_count: int) -> MultiKVConnectorStats:
+    def make_multi_stats(nixl_count: int, foo_count: int) -> MultiKVConnectorStats:
         data: dict[str, KVConnectorStats] = {}
         if nixl_count > 0:
             nixl_stats = NixlKVConnectorStats()
             for _ in range(nixl_count):
-                nixl_stats.record_transfer()
+                nixl_stats.record_transfer(get_default_xfer_telemetry())
             data["NixlConnector"] = nixl_stats
         if foo_count > 0:
             foo_stats = FooKVConnectorStats()
@@ -694,7 +908,7 @@ def test_multi_kv_connector_stats_aggregation():
         output = ModelRunnerOutput(
             req_ids=[f"req_{i}"],
             req_id_to_index={f"req_{i}": 0},
-            sampled_token_ids=[[123]],
+            sampled_token_ids=[np.array([123])],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[None],
@@ -707,24 +921,92 @@ def test_multi_kv_connector_stats_aggregation():
         worker_outputs.append(output)
 
     aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
-    kv_connector_stats = \
-        aggregated_output.kv_connector_output.kv_connector_stats
+    kv_connector_stats = aggregated_output.kv_connector_output.kv_connector_stats
     assert isinstance(kv_connector_stats, MultiKVConnectorStats)
 
     # Validate per-connector totals across workers
-    assert kv_connector_stats["NixlConnector"].data[
-        "num_successful_transfers"] == 5
+    assert isinstance(kv_connector_stats["NixlConnector"], NixlKVConnectorStats)
+    assert kv_connector_stats["NixlConnector"].num_successful_transfers == 5
+    assert isinstance(kv_connector_stats["FooConnector"], FooKVConnectorStats)
     assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
 
 
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_scheduler_kv_connector_stats_aggregation():
+    """Test scheduler and worker KV connector stats aggregation."""
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+    scheduler = create_scheduler(create_vllm_config())
+
+    # Worker stats with transfer metrics
+    worker_stats = NixlKVConnectorStats()
+    worker_stats.record_transfer(get_default_xfer_telemetry())
+    worker_stats.data["remote_tokens"] = []
+
+    # Scheduler stats with custom metric (needs dummy transfer to avoid being skipped)
+    scheduler_stats = NixlKVConnectorStats()
+    scheduler_stats.data.update(
+        {  # dummy transfer just for testing, to bypass is_empty() check
+            "transfer_duration": [0],
+            "post_duration": [0],
+            "bytes_transferred": [0],
+            "num_descriptors": [0],
+            "remote_tokens": [128],
+        }
+    )
+
+    # Mock the scheduler connector's stats method
+    scheduler.connector.get_kv_connector_stats = lambda: MultiKVConnectorStats(
+        data={"NixlConnector": scheduler_stats}
+    )
+
+    model_output = ModelRunnerOutput(
+        req_ids=["req_0"],
+        req_id_to_index={"req_0": 0},
+        sampled_token_ids=[np.array([123])],
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[None],
+        kv_connector_output=KVConnectorOutput(
+            kv_connector_stats=MultiKVConnectorStats(
+                data={"NixlConnector": worker_stats}
+            )
+        ),
+    )
+    scheduler_output = SchedulerOutput(
+        scheduled_new_reqs=[],
+        scheduled_cached_reqs=None,
+        num_scheduled_tokens={"req_0": 1},
+        total_num_scheduled_tokens=1,
+        scheduled_spec_decode_tokens={},
+        scheduled_encoder_inputs={},
+        num_common_prefix_blocks=[0],
+        finished_req_ids=set(),
+        free_encoder_mm_hashes=[],
+    )
+
+    engine_core_outputs = scheduler.update_from_output(scheduler_output, model_output)
+
+    final_stats = next(
+        iter(engine_core_outputs.values())
+    ).scheduler_stats.kv_connector_stats
+    nixl_stats = final_stats["NixlConnector"]
+    assert nixl_stats.num_successful_transfers == 2
+    assert nixl_stats.data["remote_tokens"] == [128]
+
+
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
 @patch(
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-    FakeNixlWrapper)
+    FakeNixlWrapper,
+)
 def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
     """
     Test lifecycle of an aborted Remote Prefill request hitting the timeout.
-    -----> P 
+    -----> P
             |  {process request}
      <-/--- |  {result is NOT delivered, eg proxy is down}
             |
@@ -748,6 +1030,13 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     monkeypatch.setenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", str(timeout))
 
+    def run_test_and_cleanup():
+        llm = LLM(**llm_kwargs)
+        try:
+            _run_abort_timeout_test(llm, timeout)
+        finally:
+            llm.llm_engine.engine_core.shutdown()
+
     # Build runtime_env only if we're using Ray
     if distributed_executor_backend == "ray":
         with _make_fake_nixl_pkg() as working_dir:
@@ -755,18 +1044,21 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
                 "working_dir": working_dir,  # ship fake nixl package
                 "env_vars": {
                     "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
+                    # TODO: for ray to carry over, remove once we set
+                    "NIXL_TELEMETRY_ENABLE": "1",
                 },
             }
             ray.init(runtime_env=runtime_env)
-
-            _run_abort_timeout_test(llm_kwargs, timeout)
+            try:
+                run_test_and_cleanup()
+            finally:
+                ray.shutdown()
     else:
-        _run_abort_timeout_test(llm_kwargs, timeout)
+        run_test_and_cleanup()
 
 
-def _run_abort_timeout_test(llm_kwargs: dict, timeout: int):
+def _run_abort_timeout_test(llm: LLM, timeout: int):
     """Helper function to run the abort timeout test logic."""
-    llm = LLM(**llm_kwargs)
     remote_prefill_opts = {
         "do_remote_decode": True,
         "do_remote_prefill": False,
@@ -779,39 +1071,41 @@ def _run_abort_timeout_test(llm_kwargs: dict, timeout: int):
     sampling_params = SamplingParams(
         temperature=0.0,
         max_tokens=1,
-        extra_args={"kv_transfer_params": remote_prefill_opts})
+        extra_args={"kv_transfer_params": remote_prefill_opts},
+    )
     scheduler = llm.llm_engine.engine_core.engine_core.scheduler
     req_to_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
-        0].req_to_blocks
+        0
+    ].req_to_blocks
 
     padding = "Just making this request a little longer so that we're sure "
     "we're not hitting the small-request lower bound beneath which we don't "
     "actually trigger the whole kv transfer, but rather just recompute the "
     "blocks on D."
-    _ = llm.generate([f"What is the capital of Japan? {padding}"],
-                     sampling_params)
+    _ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
 
     # Request finished but not freed
-    assert '0' in scheduler.finished_req_ids and '0' in req_to_blocks
+    assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
     # Some other request, 0 still not freed
-    _ = llm.generate([f"What is the capital of Italy? {padding}"],
-                     sampling_params)
-    assert '0' in req_to_blocks
-    assert '1' in scheduler.finished_req_ids and '1' in req_to_blocks
+    _ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
+    assert "0" in req_to_blocks
+    assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
 
     # Wait for timeout and trigger another scheduler loop
     time.sleep(timeout)
-    _ = llm.generate([f"What is the capital of France? {padding}"],
-                     sampling_params)
+    _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
     # Request-0 times out and is cleared!
-    assert '0' not in req_to_blocks
+    assert "0" not in req_to_blocks
+    # Need to shutdown the background thread to release NIXL side channel port
+    llm.llm_engine.engine_core.shutdown()
 
 
-def test_register_kv_caches(dist_init):
+@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "TRITON_ATTN"])
+def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
     """
     Test that register_kv_caches() properly calls nixl_wrapper methods with
     correct data.
-    
+
     This test verifies:
     1. nixl_wrapper.get_reg_descs() is called with caches_data containing
        tensor metadata
@@ -819,13 +1113,24 @@ def test_register_kv_caches(dist_init):
        block layout info
     """
 
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+
     vllm_config = create_vllm_config()
 
+    # Import the appropriate backend based on the parameter
+    if attn_backend == "FLASH_ATTN":
+        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
+
+        backend_cls = FlashAttentionBackend
+    else:  # TRITON_ATTN
+        from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend
+
+        backend_cls = TritonAttentionBackend
+
     # Create test kv cache tensors using proper backend shape
-    kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(num_blocks=2,
-                                                              block_size=16,
-                                                              num_kv_heads=4,
-                                                              head_size=64)
+    kv_cache_shape = backend_cls.get_kv_cache_shape(
+        num_blocks=2, block_size=16, num_kv_heads=4, head_size=64
+    )
     shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
     unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
     kv_caches = {
@@ -835,26 +1140,38 @@ def test_register_kv_caches(dist_init):
     }
 
     # Store tensor info for validation
-    expected_tensor_size = shared_tensor[0].element_size(
-    ) * shared_tensor[0].numel()
+    expected_tensor_size = shared_tensor[0].element_size() * shared_tensor[0].numel()
     expected_base_addrs = [
-        shared_tensor[0].data_ptr(), shared_tensor[1].data_ptr(),
-        unique_tensor[0].data_ptr(), unique_tensor[1].data_ptr()
+        shared_tensor[0].data_ptr(),
+        shared_tensor[1].data_ptr(),
+        unique_tensor[0].data_ptr(),
+        unique_tensor[1].data_ptr(),
     ]
 
-    with patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper") as mock_nixl_wrapper, \
-         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"), \
-         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"):  # noqa: E501
-
+    with (
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper"
+        ) as mock_nixl_wrapper,
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"
+        ),
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"
+        ) as mock_thread,
+    ):  # noqa: E501
         # Create connector
         connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
         connector.connector_worker = FakeNixlConnectorWorker(
-            vllm_config, connector.engine_id, hand_shake_latency=0)
+            vllm_config, connector.engine_id, hand_shake_latency=0
+        )
 
         # Get the mock instance
         mock_wrapper_instance = mock_nixl_wrapper.return_value
         connector.connector_worker.nixl_wrapper = mock_wrapper_instance
 
+        # Reassure the shutdown() check that the thread is terminated
+        mock_thread.return_value.is_alive.return_value = False
+
         # Execute register_kv_caches
         connector.register_kv_caches(kv_caches)
 
@@ -865,12 +1182,13 @@ def test_register_kv_caches(dist_init):
 
         for i, cache_entry in enumerate(caches_data):
             base_addr, size, _tp_rank, _ = cache_entry
-            assert size == expected_tensor_size, \
-                f"Entry {i}: Expected tensor size {expected_tensor_size}, " \
-                f"got {size}"
-            assert base_addr == expected_base_addrs[i], \
-                f"Entry {i}: Expected base address {expected_base_addrs[i]}, " \
+            assert size == expected_tensor_size, (
+                f"Entry {i}: Expected tensor size {expected_tensor_size}, got {size}"
+            )
+            assert base_addr == expected_base_addrs[i], (
+                f"Entry {i}: Expected base address {expected_base_addrs[i]}, "
                 f"got {base_addr}"
+            )
 
         # Verify get_xfer_descs was called with blocks_data
         assert mock_wrapper_instance.get_xfer_descs.called
@@ -878,16 +1196,17 @@ def test_register_kv_caches(dist_init):
 
         # Validate blocks_data structure and size
         expected_blocks_count = 8
-        assert len(blocks_data) == expected_blocks_count, \
-            f"Expected {expected_blocks_count} blocks, " \
-            f"got {len(blocks_data)}"
+        assert len(blocks_data) == expected_blocks_count, (
+            f"Expected {expected_blocks_count} blocks, got {len(blocks_data)}"
+        )
 
         expected_block_len = expected_tensor_size // 2
         for i, block_entry in enumerate(blocks_data):
             block_start_addr, block_len, tp_rank = block_entry
-            assert block_len == expected_block_len, \
-                f"Block entry {i}: Expected block len {expected_block_len}, " \
+            assert block_len == expected_block_len, (
+                f"Block entry {i}: Expected block len {expected_block_len}, "
                 f"got {block_len}"
+            )
 
 
 class FakePlatform(Platform):
@@ -896,24 +1215,26 @@ class FakePlatform(Platform):
     @classmethod
     def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
         """
-        Returns a mapping from device_type to a tuple of supported 
+        Returns a mapping from device_type to a tuple of supported
         kv_buffer_device for nixl.
         """
-        return {'oot': ('oot', )}
+        return {"oot": ("oot",)}
 
     @classmethod
-    def get_nixl_memory_type(cls) -> Optional[str]:
+    def get_nixl_memory_type(cls) -> str | None:
         """
         Returns the nixl memory type for the current platform.
         """
-        return 'VRAM'
+        return "VRAM"
 
 
-@pytest.mark.parametrize("kv_buffer_device, nixl_memory_type", [
-    ("oot", "VRAM"),
-])
-def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device,
-                                        nixl_memory_type):
+@pytest.mark.parametrize(
+    "kv_buffer_device, nixl_memory_type",
+    [
+        ("oot", "VRAM"),
+    ],
+)
+def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device, nixl_memory_type):
     """
     Test that register_kv_caches() passes the correct memory types from the
     config to the nixl_wrapper.
@@ -922,15 +1243,30 @@ def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device,
     # Override the default memory types in the config
     vllm_config.kv_transfer_config.kv_buffer_device = kv_buffer_device
     from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
-        _NIXL_SUPPORTED_DEVICE)
-    _NIXL_SUPPORTED_DEVICE.update(FakePlatform.get_nixl_supported_devices())
+        _NIXL_SUPPORTED_DEVICE,
+    )
 
-    with patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper"), \
-         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"), \
-         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"), \
-         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.current_platform", FakePlatform), \
-         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector._NIXL_SUPPORTED_DEVICE", _NIXL_SUPPORTED_DEVICE):  # noqa: E501
+    _NIXL_SUPPORTED_DEVICE.update(FakePlatform.get_nixl_supported_devices())
 
+    with (
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper"
+        ),
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"
+        ),
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"
+        ),
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.current_platform",
+            FakePlatform,
+        ),
+        patch(
+            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector._NIXL_SUPPORTED_DEVICE",
+            _NIXL_SUPPORTED_DEVICE,
+        ),
+    ):  # noqa: E501
         # Create connector and replace its worker with a fake one for isolation
         connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
 
@@ -941,28 +1277,34 @@ def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device,
 
 @patch(
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-    FakeNixlWrapper)
+    FakeNixlWrapper,
+)
 def test_shutdown_cleans_up_resources(dist_init):
     """Test that shutdown() properly cleans up all resources."""
     vllm_config = create_vllm_config()
 
-    worker = NixlConnectorWorker(vllm_config,
-                                 vllm_config.kv_transfer_config.engine_id)
+    scheduler = NixlConnectorScheduler(
+        vllm_config, vllm_config.kv_transfer_config.engine_id
+    )
+    worker = NixlConnectorWorker(vllm_config, vllm_config.kv_transfer_config.engine_id)
     nixl_wrapper = worker.nixl_wrapper
 
-    with patch.object(worker, '_handshake_initiation_executor') as mock_exec, \
-         patch.object(worker, '_nixl_handshake_listener_t') as mock_listener, \
-         patch.object(nixl_wrapper, 'release_xfer_handle') as mock_rel_xfer, \
-         patch.object(nixl_wrapper, 'release_dlist_handle') as mock_rel_dlist, \
-         patch.object(nixl_wrapper, 'remove_remote_agent') as mock_rem_agent, \
-         patch.object(nixl_wrapper, 'deregister_memory') as mock_dereg:
-
+    with (
+        patch.object(worker, "_handshake_initiation_executor") as mock_exec,
+        patch.object(scheduler, "_nixl_handshake_listener_t") as mock_listener,
+        patch.object(nixl_wrapper, "release_xfer_handle") as mock_rel_xfer,
+        patch.object(nixl_wrapper, "release_dlist_handle") as mock_rel_dlist,
+        patch.object(nixl_wrapper, "remove_remote_agent") as mock_rem_agent,
+        patch.object(nixl_wrapper, "deregister_memory") as mock_dereg,
+    ):
         worker._recving_transfers = {"req1": [(123, time.perf_counter())]}
         worker.src_xfer_side_handle = 456
         worker.dst_xfer_side_handles = {"engine1": 789}
         worker._remote_agents = {"engine1": {0: "agent1"}}
         worker._registered_descs = ["desc1", "desc2"]
 
+        mock_listener.is_alive.return_value = False
+
         worker.shutdown()
 
         # Test idempotency
@@ -970,7 +1312,12 @@ def test_shutdown_cleans_up_resources(dist_init):
         worker.shutdown()
 
         mock_exec.shutdown.assert_called_with(wait=False)
-        mock_listener.join.assert_called_once_with(timeout=0)
+
+        # Same sequence on scheduler.shutdown()
+        scheduler.shutdown()
+        scheduler.shutdown()
+        scheduler.shutdown()
+        mock_listener.join.assert_called_once()
 
         mock_rel_xfer.assert_called_once_with(123)
         assert mock_rel_dlist.call_count == 2
@@ -980,3 +1327,211 @@ def test_shutdown_cleans_up_resources(dist_init):
         assert mock_dereg.call_count == 2
         mock_dereg.assert_any_call("desc1")
         mock_dereg.assert_any_call("desc2")
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_aborted_request_removed_from_worker_in_batch(dist_init):
+    """
+    Create and schedule a request so that P adds it to in-batch tracking via
+    the real scheduler, then simulate an abort (request not in next scheduler
+    iteration) and verify the worker no longer tracks it as in-batch.
+    """
+    vllm_config = create_vllm_config()
+
+    scheduler = create_scheduler(vllm_config)
+    # KVConnector Worker in P
+    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config, connector.engine_id, hand_shake_latency=0
+    )
+
+    # Create a request that triggers do_remote_decode so that
+    # the scheduler adds it to reqs_in_batch
+    req = create_request(request_id=1, do_remote_decode=True, max_tokens=1)
+    scheduler.add_request(req)
+
+    # First scheduling pass - examinate build_connector_meta output
+    sched_out = scheduler.schedule()
+    kv_meta = sched_out.kv_connector_metadata
+    assert kv_meta is not None
+    assert isinstance(kv_meta, NixlConnectorMetadata)
+    assert req.request_id in kv_meta.reqs_in_batch
+
+    #### Model Runner start ####
+    # Bind scheduler-produced metadata and start worker processing.
+    connector.bind_connector_metadata(kv_meta)
+
+    dummy_ctx = ForwardContext(
+        no_compile_layers={},
+        attn_metadata={},
+        virtual_engine=0,
+    )
+    connector.start_load_kv(dummy_ctx)
+
+    # Ensure it was tracked by the worker
+    assert req.request_id in connector.connector_worker._reqs_to_process
+
+    #### Model Runner end ####
+
+    # Abort request - request_finished call in connector scheduler
+    scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
+    # Second scheduling pass - build metadata with aborted request
+    sched_out2 = scheduler.schedule()
+    kv_meta2 = sched_out2.kv_connector_metadata
+    assert kv_meta2 is not None
+    assert isinstance(kv_meta2, NixlConnectorMetadata)
+    assert req.request_id not in kv_meta2.reqs_in_batch
+
+    # Bind empty/abort metadata and run worker step
+    #### Model Runner start ####
+    connector.bind_connector_metadata(kv_meta2)
+    connector.start_load_kv(dummy_ctx)
+
+    # After abort, the worker should not keep tracking it as "in-batch"
+    assert req.request_id not in connector.connector_worker._reqs_to_process
+    #### Model Runner end ####
+
+
+class FailingNixlWrapper(FakeNixlWrapper):
+    """Mock NixlWrapper that fails on specific operations."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.fail_handshake = False
+        self.fail_transfer_setup = False
+        self.fail_send_notif = False
+
+    def add_remote_agent(self, agent_metadata: bytes) -> str:
+        if self.fail_handshake:
+            from zmq.error import Again
+
+            raise Again("Simulated timeout failure")
+        return super().add_remote_agent(agent_metadata)
+
+    def make_prepped_xfer(
+        self,
+        xfer_type: str,
+        local_xfer_side_handle: int,
+        local_block_descs_ids: list[int],
+        remote_xfer_side_handle: int,
+        remote_block_descs_ids: list[int],
+        notif_msg: bytes | None = None,
+    ) -> int:
+        if self.fail_transfer_setup:
+            # classic RuntimeError to simulate failure
+            raise RuntimeError("BAD STATUS")
+        return super().make_prepped_xfer(
+            xfer_type,
+            local_xfer_side_handle,
+            local_block_descs_ids,
+            remote_xfer_side_handle,
+            remote_block_descs_ids,
+            notif_msg,
+        )
+
+    def send_notif(self, agent_name: str, notif_msg: bytes) -> None:
+        if self.fail_send_notif:
+            raise RuntimeError("Simulated send_notif failure")
+        return super().send_notif(agent_name, notif_msg)
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FailingNixlWrapper,
+)
+def test_handshake_failure_returns_finished(dist_init):
+    """Test that handshake failures mark blocks invalid and return via get_finished."""
+    vllm_config = create_vllm_config()
+
+    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config, connector.engine_id, hand_shake_latency=0.1
+    )
+    connector.connector_worker.nixl_wrapper.fail_handshake = True
+
+    request_id = "test_handshake_fail"
+    metadata = NixlConnectorMetadata()
+    metadata.add_new_req(
+        request_id=request_id,
+        local_block_ids=[1, 2, 3],
+        kv_transfer_params={
+            "remote_block_ids": [4, 5, 6],
+            "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_host": "localhost",
+            "remote_port": 1234,
+            "remote_tp_size": 1,
+        },
+    )
+    connector.bind_connector_metadata(metadata)
+
+    dummy_ctx = ForwardContext(
+        no_compile_layers={},
+        attn_metadata={},
+        virtual_engine=0,
+    )
+    connector.start_load_kv(dummy_ctx)
+
+    # Wait for handshake to fail
+    time.sleep(0.3)
+
+    # Check that blocks were marked invalid
+    invalid_blocks = connector.get_block_ids_with_load_errors()
+    assert invalid_blocks == {1, 2, 3}
+
+    # Check that request appears in get_finished
+    _, done_recving = connector.get_finished(finished_req_ids=set())
+    assert request_id in done_recving
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FailingNixlWrapper,
+)
+def test_transfer_setup_failure_returns_finished(dist_init):
+    """Test that transfer setup failures mark blocks invalid
+    and return via get_finished."""
+    vllm_config = create_vllm_config()
+
+    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config, connector.engine_id, hand_shake_latency=0
+    )
+    connector.connector_worker.nixl_wrapper.fail_transfer_setup = True
+
+    request_id = "test_transfer_fail"
+    metadata = NixlConnectorMetadata()
+    metadata.add_new_req(
+        request_id=request_id,
+        local_block_ids=[7, 8, 9],
+        kv_transfer_params={
+            "remote_block_ids": [10, 11, 12],
+            "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_host": "localhost",
+            "remote_port": 1234,
+            "remote_tp_size": 1,
+        },
+    )
+    connector.bind_connector_metadata(metadata)
+
+    dummy_ctx = ForwardContext(
+        no_compile_layers={},
+        attn_metadata={},
+        virtual_engine=0,
+    )
+    connector.start_load_kv(dummy_ctx)
+
+    # Wait for handshake to complete and process ready_requests
+    connector.bind_connector_metadata(NixlConnectorMetadata())
+    time.sleep(0.1)
+    connector.start_load_kv(dummy_ctx)
+
+    # check that blocks were marked invalid
+    invalid_blocks = connector.get_block_ids_with_load_errors()
+    assert invalid_blocks == {7, 8, 9}
+
+    # ensure request appears in get_finished
+    _, done_recving = connector.get_finished(finished_req_ids=set())
+    assert request_id in done_recving
diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py
index f9a4d2fb4de4448d6d560cb22c8506241cba7fca..23b6c4802d106127510f42e3f272f0ac6cac17ea 100644
--- a/tests/v1/kv_connector/unit/test_offloading_connector.py
+++ b/tests/v1/kv_connector/unit/test_offloading_connector.py
@@ -14,27 +14,42 @@ from vllm.config import KVTransferConfig, VllmConfig
 from vllm.distributed.kv_events import BlockRemoved, BlockStored
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector import (
-    OffloadingConnector, OffloadingConnectorMetadata)
+    OffloadingConnector,
+    OffloadingConnectorMetadata,
+)
 from vllm.forward_context import ForwardContext
-from vllm.utils import sha256
-from vllm.v1.core.kv_cache_utils import (BlockHash, get_request_block_hasher,
-                                         init_none_hash)
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import (
+    BlockHash,
+    get_request_block_hasher,
+    init_none_hash,
+)
 from vllm.v1.core.sched.scheduler import Scheduler
-from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
-                                         OffloadingManager, PrepareStoreOutput)
+from vllm.v1.kv_offload.abstract import (
+    LoadStoreSpec,
+    OffloadingEvent,
+    OffloadingManager,
+    PrepareStoreOutput,
+)
 from vllm.v1.kv_offload.mediums import GPULoadStoreSpec
 from vllm.v1.kv_offload.spec import OffloadingSpec
-from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
-                                              TransferResult, TransferSpec)
+from vllm.v1.kv_offload.worker.worker import (
+    OffloadingHandler,
+    TransferResult,
+    TransferSpec,
+)
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput
 from vllm.v1.request import Request
 
-from .utils import (EOS_TOKEN_ID, create_model_runner_output, create_scheduler,
-                    create_vllm_config)
+from .utils import (
+    EOS_TOKEN_ID,
+    create_model_runner_output,
+    create_scheduler,
+    create_vllm_config,
+)
 
 
 class MockLoadStoreSpec(LoadStoreSpec):
-
     def __init__(self, block_hashes: Iterable[BlockHash]):
         self.block_hashes: list[BlockHash] = list(block_hashes)
 
@@ -47,7 +62,6 @@ class MockLoadStoreSpec(LoadStoreSpec):
 
 
 class MockOffloadingHandler(OffloadingHandler):
-
     def __init__(self):
         self.completed_transfers: list[TransferResult] = []
         self.completed_specs: list[TransferSpec] = []
@@ -64,14 +78,14 @@ class MockOffloadingHandler(OffloadingHandler):
 
 
 class MockOffloadingSpec(OffloadingSpec):
-
     def __init__(self, vllm_config: VllmConfig):
         super().__init__(vllm_config)
 
         self.manager = MagicMock(spec=OffloadingManager)
         self.manager.lookup.return_value = 0
-        self.manager.prepare_load = lambda block_hashes: (MockLoadStoreSpec(
-            block_hashes))
+        self.manager.prepare_load = lambda block_hashes: (
+            MockLoadStoreSpec(block_hashes)
+        )
         self.handler = MockOffloadingHandler()
 
     def get_manager(self) -> OffloadingManager:
@@ -79,9 +93,7 @@ class MockOffloadingSpec(OffloadingSpec):
 
     def get_handlers(
         self, _
-    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
-                        OffloadingHandler]]:
-
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
         yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler
         yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler
 
@@ -98,35 +110,35 @@ class TransferSummary:
 
 
 class RequestRunner:
-
-    def __init__(self, offloaded_block_size: int, gpu_block_size: int,
-                 num_gpu_blocks: int):
+    def __init__(
+        self, offloaded_block_size: int, gpu_block_size: int, num_gpu_blocks: int
+    ):
         self.offloaded_block_size: int = offloaded_block_size
         self.gpu_block_size: int = gpu_block_size
         self.num_gpu_blocks: int = num_gpu_blocks
 
         self.req_id: int = -1
 
-        vllm_config = create_vllm_config(block_size=gpu_block_size,
-                                         max_num_batched_tokens=1000)
+        vllm_config = create_vllm_config(
+            block_size=gpu_block_size, max_num_batched_tokens=1000
+        )
         vllm_config.kv_transfer_config = KVTransferConfig(
             kv_connector="OffloadingConnector",
             kv_role="kv_both",
             kv_connector_extra_config={
                 "spec_name": "MockOffloadingSpec",
-                "spec_module_path":
-                "tests.v1.kv_connector.unit.test_offloading_connector",
+                "spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector",  # noqa: E501
                 "block_size": offloaded_block_size,
-            })
+            },
+        )
 
-        self.scheduler: Scheduler = create_scheduler(vllm_config,
-                                                     num_blocks=num_gpu_blocks)
-        self.worker_connector = OffloadingConnector(vllm_config,
-                                                    KVConnectorRole.WORKER)
+        self.scheduler: Scheduler = create_scheduler(
+            vllm_config, num_blocks=num_gpu_blocks
+        )
+        self.worker_connector = OffloadingConnector(vllm_config, KVConnectorRole.WORKER)
 
         # register worker kv_caches to enable OffloadingWorker creations
-        self.worker_connector.register_kv_caches(
-            kv_caches={"a": torch.empty(0)})
+        self.worker_connector.register_kv_caches(kv_caches={"a": torch.empty(0)})
 
         # extract connector of scheduler
         scheduler_connector = self.scheduler.connector
@@ -166,9 +178,9 @@ class RequestRunner:
         init_none_hash(sha256)
         self._block_hasher = get_request_block_hasher(gpu_block_size, sha256)
 
-        self._dummy_ctx: ForwardContext = ForwardContext(no_compile_layers={},
-                                                         attn_metadata={},
-                                                         virtual_engine=0)
+        self._dummy_ctx: ForwardContext = ForwardContext(
+            no_compile_layers={}, attn_metadata={}, virtual_engine=0
+        )
 
     def new_request(self, token_ids: list[int]):
         assert not self.scheduler.requests
@@ -189,8 +201,7 @@ class RequestRunner:
         block_size_factor = self.offloaded_block_size // self.gpu_block_size
 
         while self.pending_loads_count or self.pending_stores_count:
-            for transfer_spec in (
-                    self.offloading_spec.get_completed_transfers()):
+            for transfer_spec in self.offloading_spec.get_completed_transfers():
                 src_spec, dst_spec = transfer_spec
 
                 if isinstance(src_spec, GPULoadStoreSpec):
@@ -207,8 +218,7 @@ class RequestRunner:
 
                 gpu_block_indices: list[int] = []
                 for block_id in gpu_spec.block_ids:
-                    gpu_block_indices.append(
-                        self.gpu_block_index[block_id.item()])
+                    gpu_block_indices.append(self.gpu_block_index[block_id.item()])
 
                 # list of (block_hash, sub_block_offset)
                 offload_addresses: list[Any] = []
@@ -220,23 +230,26 @@ class RequestRunner:
                     assert len(gpu_block_indices) == len(offload_addresses)
 
                     self.completed_stores.append(
-                        TransferSummary(gpu_block_indices, offload_addresses))
+                        TransferSummary(gpu_block_indices, offload_addresses)
+                    )
                     self.pending_stores_count -= 1
                 else:
-                    remainder_sub_block_count = (len(offload_addresses) -
-                                                 len(gpu_block_indices))
+                    remainder_sub_block_count = len(offload_addresses) - len(
+                        gpu_block_indices
+                    )
                     assert remainder_sub_block_count >= 0
                     assert remainder_sub_block_count < block_size_factor
-                    offload_addresses = offload_addresses[
-                        remainder_sub_block_count:]
+                    offload_addresses = offload_addresses[remainder_sub_block_count:]
 
                     self.completed_loads.append(
-                        TransferSummary(gpu_block_indices, offload_addresses))
+                        TransferSummary(gpu_block_indices, offload_addresses)
+                    )
                     self.pending_loads_count -= 1
 
     def _update_gpu_block_idx(self):
-        for blocks in (self.scheduler.kv_cache_manager.coordinator.
-                       single_type_managers[0].req_to_blocks.values()):
+        for blocks in self.scheduler.kv_cache_manager.coordinator.single_type_managers[
+            0
+        ].req_to_blocks.values():
             for block_idx, block in enumerate(blocks):
                 self.gpu_block_index[block.block_id] = block_idx
 
@@ -259,37 +272,34 @@ class RequestRunner:
 
             kv_connector_metadata = scheduler_output.kv_connector_metadata
             assert kv_connector_metadata is not None
-            assert isinstance(kv_connector_metadata,
-                              OffloadingConnectorMetadata)
+            assert isinstance(kv_connector_metadata, OffloadingConnectorMetadata)
 
             self.pending_loads_count += len(kv_connector_metadata.reqs_to_load)
-            self.pending_stores_count += len(
-                kv_connector_metadata.reqs_to_store)
+            self.pending_stores_count += len(kv_connector_metadata.reqs_to_store)
 
-            self.worker_connector.bind_connector_metadata(
-                kv_connector_metadata)
+            self.worker_connector.bind_connector_metadata(kv_connector_metadata)
             self.worker_connector.start_load_kv(self._dummy_ctx)
 
             if scheduler_output.total_num_scheduled_tokens > 0:
                 self.worker_connector.wait_for_save()
 
-            finished_sending, finished_recving = (
-                self.worker_connector.get_finished(
-                    scheduler_output.finished_req_ids))
+            finished_sending, finished_recving = self.worker_connector.get_finished(
+                scheduler_output.finished_req_ids
+            )
 
             self.worker_connector.clear_connector_metadata()
 
             model_runner_output = create_model_runner_output(
                 reqs=self.scheduler.running,
-                finished_sending=list(finished_sending),
-                finished_recving=list(finished_recving),
-                token_id=token_id)
+                finished_sending=finished_sending,
+                finished_recving=finished_recving,
+                token_id=token_id,
+            )
 
             if self.scheduler.running:
                 token_id = next(tokens_iter, None)
 
-            self.scheduler.update_from_output(scheduler_output,
-                                              model_runner_output)
+            self.scheduler.update_from_output(scheduler_output, model_runner_output)
 
         self._wait_for_transfers()
 
@@ -300,24 +310,24 @@ class RequestRunner:
             while self.scheduler.requests:
                 scheduler_output = self.scheduler.schedule()
 
-                finished_sending, finished_recving = (
-                    self.worker_connector.get_finished(
-                        scheduler_output.finished_req_ids))
+                finished_sending, finished_recving = self.worker_connector.get_finished(
+                    scheduler_output.finished_req_ids
+                )
 
                 assert not finished_recving
 
                 model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
                 model_runner_output.kv_connector_output = KVConnectorOutput(
-                    finished_sending=finished_sending)
+                    finished_sending=finished_sending
+                )
 
-                self.scheduler.update_from_output(scheduler_output,
-                                                  model_runner_output)
+                self.scheduler.update_from_output(scheduler_output, model_runner_output)
 
     def run(
-            self,
-            decoded_tokens: list[int],
-            expected_stored_gpu_block_indexes: tuple[int, ...] = (),
-            expected_loaded_gpu_block_indexes: tuple[int, ...] = (),
+        self,
+        decoded_tokens: list[int],
+        expected_stored_gpu_block_indexes: tuple[int, ...] = (),
+        expected_loaded_gpu_block_indexes: tuple[int, ...] = (),
     ):
         """
         Runs multiple engine (scheduler + worker) steps.
@@ -337,23 +347,23 @@ class RequestRunner:
         loaded_gpu_block_indexes: set[int] = set()
         for transfer in self.completed_loads:
             for gpu_block_idx, offloaded_address in zip(
-                    transfer.gpu_block_indices, transfer.offload_addresses):
+                transfer.gpu_block_indices, transfer.offload_addresses
+            ):
                 loaded_gpu_block_indexes.add(gpu_block_idx)
                 assert gpu_block_idx == self.offloaded[offloaded_address]
 
-        assert (
-            set(expected_loaded_gpu_block_indexes) == loaded_gpu_block_indexes)
+        assert set(expected_loaded_gpu_block_indexes) == loaded_gpu_block_indexes
         self.completed_loads.clear()
 
         stored_gpu_block_indexes: set[int] = set()
         for transfer in self.completed_stores:
             for gpu_block_idx, offloaded_address in zip(
-                    transfer.gpu_block_indices, transfer.offload_addresses):
+                transfer.gpu_block_indices, transfer.offload_addresses
+            ):
                 stored_gpu_block_indexes.add(gpu_block_idx)
                 self.offloaded[offloaded_address] = gpu_block_idx
 
-        assert (
-            set(expected_stored_gpu_block_indexes) == stored_gpu_block_indexes)
+        assert set(expected_stored_gpu_block_indexes) == stored_gpu_block_indexes
         self.completed_stores.clear()
 
 
@@ -362,9 +372,11 @@ def request_runner():
     runners = []
 
     def runner_factory(offloaded_block_size, gpu_block_size, num_gpu_blocks):
-        runner = RequestRunner(offloaded_block_size=offloaded_block_size,
-                               gpu_block_size=gpu_block_size,
-                               num_gpu_blocks=num_gpu_blocks)
+        runner = RequestRunner(
+            offloaded_block_size=offloaded_block_size,
+            gpu_block_size=gpu_block_size,
+            num_gpu_blocks=num_gpu_blocks,
+        )
         runners.append(runner)
         return runner
 
@@ -386,15 +398,18 @@ def test_offloading_connector(request_runner):
     num_gpu_blocks = 100
     block_size_factor = offloaded_block_size // gpu_block_size
 
-    runner = request_runner(offloaded_block_size=offloaded_block_size,
-                            gpu_block_size=gpu_block_size,
-                            num_gpu_blocks=num_gpu_blocks)
+    runner = request_runner(
+        offloaded_block_size=offloaded_block_size,
+        gpu_block_size=gpu_block_size,
+        num_gpu_blocks=num_gpu_blocks,
+    )
 
     # 3 blocks, store just the middle block (skip first and last)
     # blocks = [0, 1, 2], [3, 4, 5], [6, 7, 8]
     runner.new_request(token_ids=[0] * offloaded_block_size * 3)
-    runner.manager.prepare_store.side_effect = \
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output(list(block_hashes)[1:2])
+    )
     runner.run(decoded_tokens=[0], expected_stored_gpu_block_indexes=(3, 4, 5))
 
     # add block missing 1 token -> no offload
@@ -402,21 +417,24 @@ def test_offloading_connector(request_runner):
     runner.manager.prepare_store.assert_not_called()
 
     # +1 token -> single block, fail prepare_store
-    runner.manager.prepare_store.side_effect = \
-        lambda block_hashes: None
+    runner.manager.prepare_store.side_effect = lambda block_hashes: None
     runner.run(decoded_tokens=[0])
     runner.manager.prepare_store.assert_called()
 
     # 1 more block, now set block_hashes_to_store = []
-    runner.manager.prepare_store.side_effect = \
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output([])
+    )
     runner.run(decoded_tokens=[0] * offloaded_block_size)
 
     # 1 more block, now check touch was called with all 6 blocks
-    runner.manager.prepare_store.side_effect = \
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output(block_hashes)
-    runner.run(decoded_tokens=[0] * offloaded_block_size,
-               expected_stored_gpu_block_indexes=(15, 16, 17))
+    )
+    runner.run(
+        decoded_tokens=[0] * offloaded_block_size,
+        expected_stored_gpu_block_indexes=(15, 16, 17),
+    )
     runner.manager.touch.assert_called()
     block_hashes1 = list(runner.manager.touch.call_args.args[0])
     assert len(block_hashes1) == 6
@@ -426,9 +444,10 @@ def test_offloading_connector(request_runner):
 
     # create a new request differing only on the last token
     runner.new_request(token_ids=[0] * (offloaded_block_size * 6 - 1) + [1])
-    runner.run(decoded_tokens=[0],
-               expected_stored_gpu_block_indexes=tuple(
-                   range(6 * block_size_factor)))
+    runner.run(
+        decoded_tokens=[0],
+        expected_stored_gpu_block_indexes=tuple(range(6 * block_size_factor)),
+    )
     runner.manager.touch.assert_called()
     block_hashes2 = list(runner.manager.touch.call_args.args[0])
     assert len(block_hashes2) == 6
@@ -441,17 +460,20 @@ def test_offloading_connector(request_runner):
     runner.run(decoded_tokens=[EOS_TOKEN_ID])
 
     # full_block_tokens - num_computed_tokens < offloaded_block_size
-    runner.new_request(token_ids=[0] * gpu_block_size + [1] *
-                       (offloaded_block_size - gpu_block_size))
-    runner.manager.prepare_store.side_effect = \
+    runner.new_request(
+        token_ids=[0] * gpu_block_size + [1] * (offloaded_block_size - gpu_block_size)
+    )
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output([])
+    )
     runner.run(decoded_tokens=[EOS_TOKEN_ID])
     runner.manager.lookup.assert_not_called()
 
     # single block lookup with no hits
     runner.new_request(token_ids=[1] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = \
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output([])
+    )
     runner.run(decoded_tokens=[EOS_TOKEN_ID])
     runner.manager.lookup.assert_called()
     assert len(list(runner.manager.lookup.call_args.args[0])) == 1
@@ -459,34 +481,37 @@ def test_offloading_connector(request_runner):
     # single block lookup with a hit
     runner.scheduler.reset_prefix_cache()
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = \
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output([])
+    )
     runner.manager.lookup.return_value = 1
-    runner.run(decoded_tokens=[EOS_TOKEN_ID],
-               expected_loaded_gpu_block_indexes=(0, 1, 2))
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(0, 1, 2)
+    )
 
     # single block lookup with a hit in a middle block
-    runner.new_request(token_ids=[0] * offloaded_block_size * 2 +
-                       [1] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = \
+    runner.new_request(
+        token_ids=[0] * offloaded_block_size * 2 + [1] * offloaded_block_size
+    )
+    runner.manager.prepare_store.side_effect = (
         lambda block_hashes: generate_store_output([])
+    )
     runner.manager.lookup.return_value = 1
-    runner.run(decoded_tokens=[EOS_TOKEN_ID],
-               expected_loaded_gpu_block_indexes=(3, 4, 5))
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(3, 4, 5)
+    )
 
     # test take_events
     def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
         return [BlockHash(str(i).encode()) for i in int_hashes]
 
     def take_events() -> Iterable[OffloadingEvent]:
-        yield OffloadingEvent(block_hashes=to_hashes([1, 2, 3]),
-                              block_size=16,
-                              medium="A",
-                              removed=False)
-        yield OffloadingEvent(block_hashes=to_hashes([4, 5, 6]),
-                              block_size=32,
-                              medium="B",
-                              removed=True)
+        yield OffloadingEvent(
+            block_hashes=to_hashes([1, 2, 3]), block_size=16, medium="A", removed=False
+        )
+        yield OffloadingEvent(
+            block_hashes=to_hashes([4, 5, 6]), block_size=32, medium="B", removed=True
+        )
 
     runner.manager.take_events.side_effect = take_events
     events = list(runner.scheduler_connector.take_events())
diff --git a/tests/v1/kv_connector/unit/test_output_aggreagator.py b/tests/v1/kv_connector/unit/test_output_aggreagator.py
deleted file mode 100644
index 5d2b27a9eb4da2ca62ed980ef0abf2b6b07be43d..0000000000000000000000000000000000000000
--- a/tests/v1/kv_connector/unit/test_output_aggreagator.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from concurrent.futures import Future
-from typing import Optional
-
-from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
-from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
-
-
-class DummyModelRunnerOutput(ModelRunnerOutput):
-
-    def __init__(self,
-                 finished_sending: Optional[set[str]] = None,
-                 finished_recving: Optional[set[str]] = None):
-        self.kv_connector_output = KVConnectorOutput(
-            finished_sending=finished_sending,
-            finished_recving=finished_recving,
-        )
-
-    def __repr__(self):
-        return (
-            f"DummyModelRunnerOutput("
-            f"finished_sending={self.kv_connector_output.finished_sending},"
-            f"finished_recving={self.kv_connector_output.finished_recving})")
-
-
-def test_aggregate_workers_output():
-    aggregator = KVOutputAggregator(world_size=2)
-
-    output1 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-    output2 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-
-    aggregated = aggregator.aggregate([output1, output2])
-
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving is None
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving=None)
-
-    aggregated = aggregator.aggregate([output1, output2])
-
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending == {'req1'}
-    assert aggregated.finished_recving is None
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-
-    aggregated = aggregator.aggregate([output1, output2])
-
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving == {'req2'}
-
-
-def test_async_aggregate_workers_output():
-    aggregator = KVOutputAggregator(world_size=2)
-
-    future1: Future[DummyModelRunnerOutput] = Future()
-    future2: Future[DummyModelRunnerOutput] = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-    output2 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving is None
-
-    future1 = Future()
-    future2 = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving=None)
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending == {'req1'}
-    assert aggregated.finished_recving is None
-
-    future1 = Future()
-    future2 = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving == {'req2'}
diff --git a/tests/v1/kv_connector/unit/test_output_aggregator.py b/tests/v1/kv_connector/unit/test_output_aggregator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b083ccef9819ba6b16089d9218563183137dbb21
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_output_aggregator.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
+from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
+
+pytestmark = pytest.mark.cpu_test
+
+
+class DummyModelRunnerOutput(ModelRunnerOutput):
+    def __init__(
+        self,
+        finished_sending: set[str] | None = None,
+        finished_recving: set[str] | None = None,
+        invalid_block_ids: set[int] | None = None,
+        expected_finished_count: int = 0,
+    ):
+        self.kv_connector_output = KVConnectorOutput(
+            finished_sending=finished_sending,
+            finished_recving=finished_recving,
+            invalid_block_ids=invalid_block_ids or set(),
+            expected_finished_count=expected_finished_count,
+        )
+
+    def __repr__(self):
+        return (
+            f"DummyModelRunnerOutput("
+            f"finished_sending={self.kv_connector_output.finished_sending},"
+            f"finished_recving={self.kv_connector_output.finished_recving})"
+            f"invalid_block_ids={self.kv_connector_output.invalid_block_ids})"
+        )
+
+
+def test_aggregate_workers_output():
+    aggregator = KVOutputAggregator(expected_finished_count=2)
+
+    output1 = DummyModelRunnerOutput()
+    output2 = DummyModelRunnerOutput()
+
+    aggregated = aggregator.aggregate([output1, output2])
+
+    assert aggregated is output1
+    aggregated = aggregated.kv_connector_output
+    assert aggregated.finished_sending is None
+    assert aggregated.finished_recving is None
+    assert not aggregated.invalid_block_ids
+
+    output1 = DummyModelRunnerOutput(
+        finished_sending={"req1"}, finished_recving={"req2"}
+    )
+    output2 = DummyModelRunnerOutput(invalid_block_ids={1})
+
+    aggregated = aggregator.aggregate([output1, output2])
+
+    assert aggregated is output1
+    aggregated = aggregated.kv_connector_output
+    assert aggregated.finished_sending is None
+    assert aggregated.finished_recving is None
+    assert aggregated.invalid_block_ids == {1}
+
+    output1 = DummyModelRunnerOutput(invalid_block_ids={2})
+    output2 = DummyModelRunnerOutput(finished_sending={"req1"})
+
+    aggregated = aggregator.aggregate([output1, output2])
+
+    assert aggregated is output1
+    aggregated = aggregated.kv_connector_output
+    assert aggregated.finished_sending == {"req1"}
+    assert aggregated.finished_recving is None
+    assert aggregated.invalid_block_ids == {2}
+
+    output1 = DummyModelRunnerOutput(invalid_block_ids={3, 4})
+    output2 = DummyModelRunnerOutput(
+        finished_recving={"req2"}, invalid_block_ids={4, 5}
+    )
+
+    aggregated = aggregator.aggregate([output1, output2])
+
+    assert aggregated is output1
+    aggregated = aggregated.kv_connector_output
+    assert aggregated.finished_sending is None
+    assert aggregated.finished_recving == {"req2"}
+    assert aggregated.invalid_block_ids == {3, 4, 5}
+
+
+def test_aggregate_workers_output_with_expected_finished_count():
+    # We create the aggregator expecting to collect from 4 workers
+    aggregator = KVOutputAggregator(expected_finished_count=4)
+    assert aggregator._expected_finished_count == 4
+    # Some request with default expected finished requests
+    output1 = DummyModelRunnerOutput(finished_sending={"req1"})
+    aggregated = aggregator.aggregate([output1])
+    # still expecting to collect from 4 workers
+    assert aggregator._send_remaining_count["req1"] == 3
+    assert not aggregated.kv_connector_output.finished_sending
+    assert not aggregated.kv_connector_output.finished_recving
+
+    # Workers discover and find that in this setup they only need to
+    # collect from 2
+    output1 = DummyModelRunnerOutput(
+        finished_sending={"req1"}, expected_finished_count=2
+    )
+    output2 = DummyModelRunnerOutput(
+        finished_recving={"req2"}, expected_finished_count=2
+    )
+    output3 = DummyModelRunnerOutput(finished_recving={"req2"})
+    # Req2 only needs 2 acks
+    aggregated = aggregator.aggregate([output1, output2, output3])
+    assert aggregated.kv_connector_output.expected_finished_count == 2
+
+    assert not aggregated.kv_connector_output.finished_sending
+
+    # Req2 is finished
+    assert "req2" not in aggregator._recv_remaining_count
+    assert aggregated.kv_connector_output.finished_recving == {"req2"}
+
+    # Req1 is still waiting for 2 more acks (expected_finished_count has no effect)
+    # NOTE: This is to showcase dynamic update. Workers are responsible for
+    # ensuring "req1" termination in this case
+    assert aggregator._send_remaining_count["req1"] == 2
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index 380e72a156336797c72ca7be6d9058713a2b741b..b2ec2ddfb64da3ca206f40e32709dfae9e0d801a 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -2,11 +2,20 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
+import pytest
+
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput
 from vllm.v1.request import FinishReason, RequestStatus
 
-from .utils import (assert_scheduler_empty, create_model_runner_output,
-                    create_request, create_scheduler, create_vllm_config)
+from .utils import (
+    assert_scheduler_empty,
+    create_model_runner_output,
+    create_request,
+    create_scheduler,
+    create_vllm_config,
+)
+
+pytestmark = pytest.mark.cpu_test
 
 
 def test_basic_lifecycle():
@@ -20,11 +29,13 @@ def test_basic_lifecycle():
     NUM_EXTERNAL_FULL_BLOCKS = 2
     NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
 
-    request = create_request(request_id=1,
-                             block_size=BLOCK_SIZE,
-                             max_tokens=1,
-                             num_tokens=NUM_TOKENS,
-                             do_remote_decode=True)
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        max_tokens=1,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+    )
 
     scheduler.add_request(request)
     request_id = request.request_id
@@ -32,6 +43,7 @@ def test_basic_lifecycle():
     # STEP (1): Prefill.
     # (1a): schedule()
     scheduler_output = scheduler.schedule()
+    assert len(scheduler.requests) == 1
     assert len(scheduler.running) == 1
     assert len(scheduler_output.scheduled_new_reqs) == 1
 
@@ -39,8 +51,9 @@ def test_basic_lifecycle():
     model_runner_output = create_model_runner_output(reqs=[request])
 
     # (1c): update_from_output()
-    engine_core_outputs = scheduler.update_from_output(scheduler_output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(
+        scheduler_output, model_runner_output
+    )
 
     # Ensure the request is finished after 1 token.
     assert request.is_finished()
@@ -55,14 +68,17 @@ def test_basic_lifecycle():
     assert len(scheduler.waiting) == 0
 
     # ... but blocks should not be freed.
+    assert len(scheduler.requests) == 1
     blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
-        0].req_to_blocks[request_id]
+        0
+    ].req_to_blocks[request_id]
     for block in blocks:
         assert block.ref_cnt == 1
 
     # STEP (2): Send Finished to PB.
     # (2a): schedule() - pass finished request to PB.
     scheduler_output = scheduler.schedule()
+    assert len(scheduler.requests) == 1
     assert len(scheduler.running) == 0
     assert len(scheduler_output.finished_req_ids) == 1
     assert request_id in scheduler_output.finished_req_ids
@@ -79,6 +95,7 @@ def test_basic_lifecycle():
     # STEP (3): Finished sending.
     # (3a): schedule() - pass finished request to PB.
     scheduler_output = scheduler.schedule()
+    assert len(scheduler.requests) == 1
     assert len(scheduler.running) == 0
     assert len(scheduler_output.finished_req_ids) == 0
     assert len(scheduler_output.scheduled_new_reqs) == 0
@@ -88,7 +105,8 @@ def test_basic_lifecycle():
     # (3b): execute_model()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
     model_runner_output.kv_connector_output = KVConnectorOutput(
-        finished_sending=[request_id])
+        finished_sending={request_id}
+    )
 
     # (3c): update_from_output()
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -106,17 +124,20 @@ def test_short_prompt_lifecycle():
     # Not enough tokens for full block.
     BLOCK_SIZE = vllm_config.cache_config.block_size
     NUM_TOKENS = BLOCK_SIZE // 2
-    request = create_request(request_id=1,
-                             block_size=BLOCK_SIZE,
-                             max_tokens=1,
-                             num_tokens=NUM_TOKENS,
-                             do_remote_decode=True)
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        max_tokens=1,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+    )
 
     scheduler.add_request(request)
 
     # STEP (1): Prefill.
     # (1a): schedule()
     scheduler_output = scheduler.schedule()
+    assert len(scheduler.requests) == 1
     assert len(scheduler.running) == 1
     assert len(scheduler_output.scheduled_new_reqs) == 1
 
@@ -128,14 +149,15 @@ def test_short_prompt_lifecycle():
     eco = scheduler.update_from_output(scheduler_output, model_runner_output)
     kv_transfer_params = eco[0].outputs[0].kv_transfer_params
 
-    assert (len(kv_transfer_params["remote_block_ids"]) == 1)
+    assert len(kv_transfer_params["remote_block_ids"]) == 1
 
     # Confirm we do not have any memory leaks after req lifecycle.
     # We need to mark sending finish to clear data for persistent batch.
     scheduler_output = scheduler.schedule()
     # Use create_model_runner_output to pass kv_connector_output along
     model_runner_output = create_model_runner_output(
-        reqs=[request], finished_sending=[request.request_id])
+        reqs=[request], finished_sending={request.request_id}
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert_scheduler_empty(scheduler)
 
@@ -151,16 +173,17 @@ def test_prefix_cache_lifecycle():
     NUM_EXTERNAL_FULL_BLOCKS = 3
     NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
 
-    request_normal = create_request(request_id=1,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS)
+    request_normal = create_request(
+        request_id=1, block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS
+    )
 
     scheduler.add_request(request_normal)
     scheduler_output = scheduler.schedule()
-    model_runner_output = create_model_runner_output(reqs=[request_normal],
-                                                     use_eos=True)
+    model_runner_output = create_model_runner_output(
+        reqs=[request_normal], use_eos=True
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
-    scheduler.schedule()
+    scheduler_output = scheduler.schedule()
     scheduler.update_from_output(scheduler_output, EMPTY_MODEL_RUNNER_OUTPUT)
 
     #####################
@@ -170,10 +193,12 @@ def test_prefix_cache_lifecycle():
     NUM_EXTERNAL_FULL_BLOCKS -= 1
     NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
 
-    request_remote = create_request(request_id=1,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS,
-                                    do_remote_decode=True)
+    request_remote = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+    )
 
     scheduler.add_request(request_remote)
     scheduler_output = scheduler.schedule()
@@ -183,14 +208,55 @@ def test_prefix_cache_lifecycle():
 
     # Ensure we send all block ids, including the partial blocks,
     # even if there is a cache hit.
-    assert (len(
-        kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS +
-                                                    1))
+    assert len(kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS + 1)
 
     # STEP (2): Ensure it is freed.
     scheduler_output = scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
     model_runner_output.kv_connector_output = KVConnectorOutput(
-        finished_sending=[request_remote.request_id])
+        finished_sending={request_remote.request_id}
+    )
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert_scheduler_empty(scheduler)
+
+
+def test_abort_during_kv_transfer():
+    """Test aborting request does not release blocks for remote decode."""
+
+    vllm_config = create_vllm_config()
+    scheduler = create_scheduler(vllm_config)
+
+    # Prime the KVCache.
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+    NUM_EXTERNAL_FULL_BLOCKS = 2
+    NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
+
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_decode=True,
+    )
+
+    scheduler.add_request(request)
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request])
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    scheduler_output = scheduler.schedule()
+    scheduler.update_from_output(scheduler_output, EMPTY_MODEL_RUNNER_OUTPUT)
+
+    # Request removed from PB but blocks should not be freed.
+    assert len(scheduler.requests) == 1
+
+    # Abort the request, and check the blocks are still not freed
+    scheduler.finish_requests([request.request_id], RequestStatus.FINISHED_ABORTED)
+    assert len(scheduler.requests) == 1
+
+    # Simulate a finished sending notification
+    scheduler_output = scheduler.schedule()
+    model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request.request_id]
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert_scheduler_empty(scheduler)
diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
index 21fec5344255c755e2cf31025657bf8b9d5f6ab9..b9588ebcd211098df131b8fbb7e36e4cfb012a4f 100644
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
@@ -2,11 +2,20 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
+import pytest
+
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput
 from vllm.v1.request import FinishReason, RequestStatus
 
-from .utils import (assert_scheduler_empty, create_model_runner_output,
-                    create_request, create_scheduler, create_vllm_config)
+from .utils import (
+    assert_scheduler_empty,
+    create_model_runner_output,
+    create_request,
+    create_scheduler,
+    create_vllm_config,
+)
+
+pytestmark = pytest.mark.cpu_test
 
 
 def test_basic_lifecycle():
@@ -20,12 +29,15 @@ def test_basic_lifecycle():
     NUM_EXTERNAL_FULL_BLOCKS = 2
     NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
     START_FREE_BLOCK_QUEUE_SIZE = (
-        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
+        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks
+    )
 
-    request = create_request(request_id=1,
-                             block_size=BLOCK_SIZE,
-                             num_tokens=NUM_TOKENS,
-                             do_remote_prefill=True)
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_prefill=True,
+    )
 
     scheduler.add_request(request)
     request_id = request.request_id
@@ -44,16 +56,16 @@ def test_basic_lifecycle():
     # Req waiting for KVs with no computed/scheduled toks ...
     assert len(scheduler.waiting) == 1
     assert request in scheduler.waiting
-    assert (request.status == RequestStatus.WAITING_FOR_REMOTE_KVS)
-    assert (request.num_computed_tokens == 0)
+    assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert request.num_computed_tokens == 0
 
     # ... but should have (uncached) blocks allocated to it.
     block_pool = scheduler.kv_cache_manager.block_pool
-    assert (block_pool.free_block_queue.num_free_blocks
-            < START_FREE_BLOCK_QUEUE_SIZE)
+    assert block_pool.free_block_queue.num_free_blocks < START_FREE_BLOCK_QUEUE_SIZE
     assert len(block_pool.cached_block_hash_to_block) == 0
     blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
-        0].req_to_blocks[request_id]
+        0
+    ].req_to_blocks[request_id]
     for block in blocks:
         assert block._block_hash is None
 
@@ -61,8 +73,9 @@ def test_basic_lifecycle():
     model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT
 
     # (1c): update_from_output()
-    engine_core_outputs = scheduler.update_from_output(scheduler_output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(
+        scheduler_output, model_runner_output
+    )
     assert not engine_core_outputs or not engine_core_outputs[0].outputs
 
     # STEP (2):
@@ -74,13 +87,15 @@ def test_basic_lifecycle():
     # (2b): forward(): request finishes recv.
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
     model_runner_output.kv_connector_output = KVConnectorOutput(
-        finished_recving=[request_id])
+        finished_recving={request_id}
+    )
 
     # (2c): update_from_output():
-    engine_core_outputs = scheduler.update_from_output(scheduler_output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(
+        scheduler_output, model_runner_output
+    )
     assert len(scheduler.waiting) == 1
-    assert (request_id in scheduler.finished_recving_kv_req_ids)
+    assert request_id in scheduler.finished_recving_kv_req_ids
 
     # STEP (3):
     # (3a): schedule(): this should actually schedule.
@@ -90,10 +105,11 @@ def test_basic_lifecycle():
     # Confirm the block are actually allocated.
     num_hashed_blocks = 0
     blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
-        0].req_to_blocks[request_id]
+        0
+    ].req_to_blocks[request_id]
     for block in blocks:
         assert block.ref_cnt == 1
-        num_hashed_blocks += (1 if block._block_hash is not None else 0)
+        num_hashed_blocks += 1 if block._block_hash is not None else 0
     assert num_hashed_blocks == NUM_EXTERNAL_FULL_BLOCKS
 
     # Confirm the rest of the prompt is scheduled in this step.
@@ -101,7 +117,7 @@ def test_basic_lifecycle():
     num_scheduled_tokens = scheduler_output.num_scheduled_tokens[request_id]
     num_computed_tokens = scheduled_req.num_computed_tokens
     total_prompt_tokens = len(scheduled_req.prompt_token_ids)
-    assert (num_scheduled_tokens == total_prompt_tokens - num_computed_tokens)
+    assert num_scheduled_tokens == total_prompt_tokens - num_computed_tokens
 
     # (3b): execute_model()
     model_runner_output = create_model_runner_output([request])
@@ -111,8 +127,9 @@ def test_basic_lifecycle():
     # Step (4): Hit EOS.
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output([request], use_eos=True)
-    engine_core_outputs = scheduler.update_from_output(scheduler_output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(
+        scheduler_output, model_runner_output
+    )
     scheduler.schedule()
 
     outputs = engine_core_outputs[0].outputs
@@ -133,10 +150,12 @@ def test_interleaved_lifecycle():
     NUM_EXTERNAL_FULL_BLOCKS = 2
     NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
 
-    request_remote = create_request(request_id=1,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS,
-                                    do_remote_prefill=True)
+    request_remote = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_prefill=True,
+    )
     request_local_a = create_request(
         request_id=2,
         block_size=BLOCK_SIZE,
@@ -165,8 +184,7 @@ def test_interleaved_lifecycle():
     assert len(scheduler_output.scheduled_new_reqs) == 1
     assert scheduler_output.scheduled_cached_reqs.num_reqs == 1
 
-    model_runner_output = create_model_runner_output(
-        [request_local_a, request_local_b])
+    model_runner_output = create_model_runner_output([request_local_a, request_local_b])
     scheduler.update_from_output(scheduler_output, model_runner_output)
 
     # STEP 3: continue running, KVs not arrived yet.
@@ -177,7 +195,8 @@ def test_interleaved_lifecycle():
     assert scheduler_output.scheduled_cached_reqs.num_reqs == 2
 
     model_runner_output = create_model_runner_output(
-        reqs=[request_local_a, request_local_b])
+        reqs=[request_local_a, request_local_b]
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 2
     assert len(scheduler.waiting) == 1
@@ -192,8 +211,8 @@ def test_interleaved_lifecycle():
     assert scheduler_output.scheduled_cached_reqs.num_reqs == 2
 
     model_runner_output = create_model_runner_output(
-        [request_local_a, request_local_b],
-        finished_recving=[request_remote.request_id])
+        [request_local_a, request_local_b], finished_recving={request_remote.request_id}
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
 
     # STEP 5: RECVed KVs are sent to ModelRunner.
@@ -204,7 +223,8 @@ def test_interleaved_lifecycle():
     assert scheduler_output.scheduled_cached_reqs.num_reqs == 2
 
     model_runner_output = create_model_runner_output(
-        [request_local_a, request_local_b, request_remote])
+        [request_local_a, request_local_b, request_remote]
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
 
     # STEP 6: Hit EOS and free.
@@ -242,16 +262,16 @@ def test_no_spurious_prefix_caching():
         request_id=1,
         block_size=BLOCK_SIZE,
         num_tokens=NUM_TOKENS,
+        common_prefix_len=NUM_TOKENS,
         do_remote_prefill=True,
-        use_all_1s_for_prompt_tokens=True,
     )
 
     request_local = create_request(
         request_id=2,
         block_size=BLOCK_SIZE,
         num_tokens=NUM_TOKENS,
+        common_prefix_len=NUM_TOKENS,
         do_remote_prefill=False,
-        use_all_1s_for_prompt_tokens=True,
     )
 
     # Schedule the remote prefill request. This should not
@@ -269,15 +289,17 @@ def test_no_spurious_prefix_caching():
     assert len(scheduler.waiting) == 1
 
     local_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
-        0].req_to_blocks[request_local.request_id]
+        0
+    ].req_to_blocks[request_local.request_id]
     remote_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
-        0].req_to_blocks[request_remote.request_id]
+        0
+    ].req_to_blocks[request_remote.request_id]
 
     # Local should have cached blocks (but not all due to preallocate).
     num_hashed_blocks = 0
     for block in local_blocks:
         assert block.ref_cnt == 1
-        num_hashed_blocks += (1 if block._block_hash is not None else 0)
+        num_hashed_blocks += 1 if block._block_hash is not None else 0
     assert num_hashed_blocks > 0
 
     # Remote blocks should not be cached.
@@ -297,10 +319,12 @@ def test_full_block_prompt():
     NUM_EXTERNAL_FULL_BLOCKS = 2
     NUM_TOKENS = int(BLOCK_SIZE * NUM_EXTERNAL_FULL_BLOCKS)
 
-    request = create_request(request_id=1,
-                             block_size=BLOCK_SIZE,
-                             num_tokens=NUM_TOKENS,
-                             do_remote_prefill=True)
+    request = create_request(
+        request_id=1,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS,
+        do_remote_prefill=True,
+    )
 
     scheduler.add_request(request)
     request_id = request.request_id
@@ -308,8 +332,11 @@ def test_full_block_prompt():
     # STEP (1): Initialize a recv.
     scheduler_output = scheduler.schedule()
     # All blocks should be allocated.
-    num_blocks = len(scheduler.kv_cache_manager.coordinator.
-                     single_type_managers[0].req_to_blocks[request_id])
+    num_blocks = len(
+        scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[
+            request_id
+        ]
+    )
     assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS
     model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -318,22 +345,25 @@ def test_full_block_prompt():
     scheduler_output = scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
     model_runner_output.kv_connector_output = KVConnectorOutput(
-        finished_recving=[request_id])
+        finished_recving={request_id}
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.waiting) == 1
-    assert (request_id in scheduler.finished_recving_kv_req_ids)
+    assert request_id in scheduler.finished_recving_kv_req_ids
 
     # # STEP (3): Run as usual.
     scheduler_output = scheduler.schedule()
 
     # We need to recompute the final token of the prompt to generate
     # the first new token, so we should not have a new block.
-    num_blocks = len(scheduler.kv_cache_manager.coordinator.
-                     single_type_managers[0].req_to_blocks[request_id])
+    num_blocks = len(
+        scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[
+            request_id
+        ]
+    )
     assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS
-    assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens ==
-            NUM_TOKENS - 1)
-    assert (scheduler_output.num_scheduled_tokens[request_id] == 1)
+    assert scheduler_output.scheduled_new_reqs[0].num_computed_tokens == NUM_TOKENS - 1
+    assert scheduler_output.num_scheduled_tokens[request_id] == 1
 
     model_runner_output = create_model_runner_output([request])
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -341,8 +371,9 @@ def test_full_block_prompt():
     # # Step (4): Hit EOS.
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output([request], use_eos=True)
-    engine_core_outputs = scheduler.update_from_output(scheduler_output,
-                                                       model_runner_output)
+    engine_core_outputs = scheduler.update_from_output(
+        scheduler_output, model_runner_output
+    )
     scheduler.schedule()
 
     outputs = engine_core_outputs[0].outputs
@@ -371,13 +402,15 @@ def test_cannot_schedule_after_recv():
     NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS)
     NUM_TOKENS_REMOTE = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS)
 
-    request_normal = create_request(request_id=1,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS_LOCAL)
-    request_remote = create_request(request_id=2,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS_REMOTE,
-                                    do_remote_prefill=True)
+    request_normal = create_request(
+        request_id=1, block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS_LOCAL
+    )
+    request_remote = create_request(
+        request_id=2,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS_REMOTE,
+        do_remote_prefill=True,
+    )
 
     # STEP 1: 3 blocks are in use (2 for prompt, 1 for decode).
     scheduler.add_request(request_normal)
@@ -398,7 +431,8 @@ def test_cannot_schedule_after_recv():
     # Step 3: finish recving (5 blocks in use)
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output(
-        reqs=[request_normal], finished_recving=[request_remote.request_id])
+        reqs=[request_normal], finished_recving={request_remote.request_id}
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 1
@@ -407,7 +441,8 @@ def test_cannot_schedule_after_recv():
     # because the transfer is completed.
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output(
-        reqs=[request_normal, request_remote])
+        reqs=[request_normal, request_remote]
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 2
     assert len(scheduler.waiting) == 0
@@ -422,8 +457,9 @@ def test_cannot_schedule_after_recv():
 
     # Step 6: finish the request, free it.
     scheduler_output = scheduler.schedule()
-    model_runner_output = create_model_runner_output(reqs=[request_normal],
-                                                     use_eos=True)
+    model_runner_output = create_model_runner_output(
+        reqs=[request_normal], use_eos=True
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 0
     assert len(scheduler.waiting) == 1
@@ -432,16 +468,19 @@ def test_cannot_schedule_after_recv():
     # request is retrieved from preempted list.
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output(reqs=[request_remote])
-    assert (scheduler_output.scheduled_cached_reqs.num_computed_tokens[0] ==
-            NUM_PROMPT_BLOCKS * BLOCK_SIZE)
+    assert (
+        scheduler_output.scheduled_cached_reqs.num_computed_tokens[0]
+        == NUM_PROMPT_BLOCKS * BLOCK_SIZE
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 0
 
     # Step 8: free everything.
     scheduler_output = scheduler.schedule()
-    model_runner_output = create_model_runner_output(reqs=[request_remote],
-                                                     use_eos=True)
+    model_runner_output = create_model_runner_output(
+        reqs=[request_remote], use_eos=True
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     _ = scheduler.schedule()
     assert_scheduler_empty(scheduler)
@@ -466,13 +505,15 @@ def test_cannot_recv():
     NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS)
     NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5))
 
-    request_normal = create_request(request_id=1,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS_LOCAL)
-    request_remote = create_request(request_id=2,
-                                    block_size=BLOCK_SIZE,
-                                    num_tokens=NUM_TOKENS_REMOTE,
-                                    do_remote_prefill=True)
+    request_normal = create_request(
+        request_id=1, block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS_LOCAL
+    )
+    request_remote = create_request(
+        request_id=2,
+        block_size=BLOCK_SIZE,
+        num_tokens=NUM_TOKENS_REMOTE,
+        do_remote_prefill=True,
+    )
 
     # STEP 1: 3 blocks are in use (2 for prompt, 1 for decode).
     scheduler.add_request(request_normal)
@@ -491,12 +532,13 @@ def test_cannot_recv():
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 1
     # Should not have KV transfer in progress.
-    assert (request_remote.status != RequestStatus.WAITING_FOR_REMOTE_KVS)
+    assert request_remote.status != RequestStatus.WAITING_FOR_REMOTE_KVS
 
     # Step 3: finish the request, free it.
     scheduler_output = scheduler.schedule()
-    model_runner_output = create_model_runner_output(reqs=[request_normal],
-                                                     use_eos=True)
+    model_runner_output = create_model_runner_output(
+        reqs=[request_normal], use_eos=True
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 0
     assert len(scheduler.waiting) == 1
@@ -507,12 +549,13 @@ def test_cannot_recv():
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 0
     assert len(scheduler.waiting) == 1
-    assert (request_remote.status == RequestStatus.WAITING_FOR_REMOTE_KVS)
+    assert request_remote.status == RequestStatus.WAITING_FOR_REMOTE_KVS
 
     # Step 5: finish recving (5 blocks in use)
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output(
-        reqs=[], finished_recving=[request_remote.request_id])
+        reqs=[], finished_recving={request_remote.request_id}
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.running) == 0
     assert len(scheduler.waiting) == 1
@@ -526,8 +569,9 @@ def test_cannot_recv():
 
     # Step 7: free everything.
     scheduler_output = scheduler.schedule()
-    model_runner_output = create_model_runner_output(reqs=[request_remote],
-                                                     use_eos=True)
+    model_runner_output = create_model_runner_output(
+        reqs=[request_remote], use_eos=True
+    )
     scheduler.update_from_output(scheduler_output, model_runner_output)
     _ = scheduler.schedule()
     assert_scheduler_empty(scheduler)
diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
index 6be261e45cb00143b5b2059be60fdbfc2dcea8e2..e7013a794a8c612d4607c0fa44e709580c6ad329 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -37,16 +37,22 @@ def _list_path(path):
     return list(path.iterdir())
 
 
-def run_test(tmp_path, processor, llm: LLM, question: str,
-             image_urls: list[Image], expected_len: int, info: str):
+def run_test(
+    tmp_path,
+    processor,
+    llm: LLM,
+    question: str,
+    image_urls: list[Image],
+    expected_len: int,
+    info: str,
+):
     """
     One individual test to process the prompt and output base on 1 set of input
     Then check if the length in the storage path matches the expected length
     `info` introduces details or purpose of the individual test
     """
     print(f"***info: {info}***")
-    print(
-        f"**Expected storage path length after llm generate: {expected_len}**")
+    print(f"**Expected storage path length after llm generate: {expected_len}**")
     process_prompt(processor, llm, question, image_urls)
 
     print(f"Path matched expected length: {_check_path_len(tmp_path)}")
@@ -54,51 +60,42 @@ def run_test(tmp_path, processor, llm: LLM, question: str,
 
     assert _check_path_len(tmp_path) == expected_len, (
         f"Expect storage path length {expected_len} ;",
-        f"but end up {_check_path_len(tmp_path)} instead. ", f"Info: {info}")
+        f"but end up {_check_path_len(tmp_path)} instead. ",
+        f"Info: {info}",
+    )
 
 
-def process_prompt(processor, llm: LLM, question: str,
-                   image_urls: list[Image]):
+def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
     """
     Form the prompt based on the text and image input, then llm generate output
     """
-    placeholders = [{
-        "type": "image_url",
-        "image_url": {
-            "url": f"data:image;base64,{encode_image_base64(image_pil)}"
+    placeholders = [
+        {
+            "type": "image_url",
+            "image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"},
         }
-    } for image_pil in image_urls]
+        for image_pil in image_urls
+    ]
 
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant."
-        },
+        {"role": "system", "content": "You are a helpful assistant."},
         {
             "role": "user",
             "content": [
                 *placeholders,
-                {
-                    "type": "text",
-                    "text": question
-                },
+                {"type": "text", "text": question},
             ],
         },
     ]
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     outputs = llm.generate(
         {
-            "prompt":
-            prompt,
-            **({
-                "multi_modal_data": {
-                    "image": [*image_urls]
-                }
-            } if image_urls else {})
+            "prompt": prompt,
+            **({"multi_modal_data": {"image": [*image_urls]}} if image_urls else {}),
         },
         sampling_params=SAMPLING_PARAMS,
     )
@@ -114,7 +111,7 @@ def process_prompt(processor, llm: LLM, question: str,
 def test_shared_storage_connector_hashes(tmp_path):
     """
     Tests that SharedStorageConnector saves KV to the storage locations
-    with proper hashes; that are unique for inputs with identical text but 
+    with proper hashes; that are unique for inputs with identical text but
     different images (same size), or same multiple images but different orders.
     """
     # Using tmp_path as the storage path to store KV
@@ -124,7 +121,8 @@ def test_shared_storage_connector_hashes(tmp_path):
     kv_transfer_config = KVTransferConfig(
         kv_connector="SharedStorageConnector",
         kv_role="kv_both",
-        kv_connector_extra_config={"shared_storage_path": str(tmp_path)})
+        kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
+    )
 
     engine_args = EngineArgs(
         model=MODEL_NAME,
@@ -157,56 +155,88 @@ def test_shared_storage_connector_hashes(tmp_path):
 
     # Prepare the input cases
     input_cases = [
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_1],
-                  expected_len=1,
-                  info="image_1 single input the first time."),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_2],
-                  expected_len=2,
-                  info=("image_2 single input the first time. "
-                        "It is in same pixel size with image_1, yet it "
-                        "should be able to form a new unique hash.")),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_1],
-                  expected_len=2,
-                  info=("image_1 single input the 2nd time. "
-                        "It should not form another new hash.")),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_2],
-                  expected_len=2,
-                  info=("image_2 single input the 2nd time. "
-                        "It should not form another new hash.")),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_1, image_2],
-                  expected_len=3,
-                  info="image_1 with image_2 input the first time."),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_2, image_1],
-                  expected_len=4,
-                  info="The image order is swapped. Should form new hash."),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_1, image_2],
-                  expected_len=4,
-                  info=("[image_1, image_2] input the 2nd time. "
-                        "It should not form another new hash.")),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[image_2, image_1],
-                  expected_len=4,
-                  info=("[image_2, image_1] input the 2nd time. "
-                        "It should not form another new hash.")),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[],
-                  expected_len=5,
-                  info="Pure text input test as a case-control"),
-        InputCase(text=TEXT_PROMPTS[0],
-                  img=[],
-                  expected_len=5,
-                  info="Identical pure text input as a case-control"),
-        InputCase(text=TEXT_PROMPTS[1],
-                  img=[],
-                  expected_len=6,
-                  info="Another pure text input as a case-control"),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_1],
+            expected_len=1,
+            info="image_1 single input the first time.",
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_2],
+            expected_len=2,
+            info=(
+                "image_2 single input the first time. "
+                "It is in same pixel size with image_1, yet it "
+                "should be able to form a new unique hash."
+            ),
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_1],
+            expected_len=2,
+            info=(
+                "image_1 single input the 2nd time. "
+                "It should not form another new hash."
+            ),
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_2],
+            expected_len=2,
+            info=(
+                "image_2 single input the 2nd time. "
+                "It should not form another new hash."
+            ),
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_1, image_2],
+            expected_len=3,
+            info="image_1 with image_2 input the first time.",
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_2, image_1],
+            expected_len=4,
+            info="The image order is swapped. Should form new hash.",
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_1, image_2],
+            expected_len=4,
+            info=(
+                "[image_1, image_2] input the 2nd time. "
+                "It should not form another new hash."
+            ),
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[image_2, image_1],
+            expected_len=4,
+            info=(
+                "[image_2, image_1] input the 2nd time. "
+                "It should not form another new hash."
+            ),
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[],
+            expected_len=5,
+            info="Pure text input test as a case-control",
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[0],
+            img=[],
+            expected_len=5,
+            info="Identical pure text input as a case-control",
+        ),
+        InputCase(
+            text=TEXT_PROMPTS[1],
+            img=[],
+            expected_len=6,
+            info="Another pure text input as a case-control",
+        ),
     ]
 
     # Run tests
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index de52668e3dcf121d1666e960210d3af51a5c550d..c248104d5b5eac25d56a169d84009ee5db09cf12 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -2,24 +2,41 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import tempfile
 from collections import defaultdict
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from dataclasses import dataclass
+from itertools import chain, count
+from typing import Any
 
+import numpy as np
 import torch
 
 from vllm import SamplingParams
-from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
-                         ModelConfig, SchedulerConfig, VllmConfig)
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    KVTransferConfig,
+    ModelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
-    SharedStorageConnector)
-from vllm.utils import sha256
+    SharedStorageConnector,
+)
+from vllm.utils.hashing import sha256
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
-from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
-                                         init_none_hash)
-from vllm.v1.core.sched.scheduler import Scheduler
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec)
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
+from vllm.v1.core.sched.scheduler import Scheduler, SchedulerOutput
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+)
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
@@ -41,14 +58,24 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               req_to_blocks) == 0
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               num_cached_block) == 0
+    assert (
+        len(
+            scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks
+        )
+        == 0
+    )
+    assert (
+        len(
+            scheduler.kv_cache_manager.coordinator.single_type_managers[
+                0
+            ].num_cached_block
+        )
+        == 0
+    )
     num_free_blocks = (
-        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
-    assert num_free_blocks == (
-        scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1)
+        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks
+    )
+    assert num_free_blocks == (scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1)
 
     # NOTE(rob): just the ref count on blocks will be 0. The hash
     # value, etc will remain since we lazily evict for prefix cache.
@@ -61,12 +88,16 @@ def create_vllm_config(
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 64,
     block_size: int = 16,
+    max_model_len: int = 10000,
+    enable_chunked_prefill: bool = True,
+    enable_permute_local_kv: bool = False,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
     scheduler_config = SchedulerConfig(
         max_num_seqs=max_num_seqs,
         max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_num_batched_tokens,
+        max_model_len=max_model_len,
+        enable_chunked_prefill=enable_chunked_prefill,
     )
     model_config = ModelConfig(
         model=model,
@@ -85,12 +116,15 @@ def create_vllm_config(
     kv_transfer_config = KVTransferConfig(
         kv_connector="NixlConnector",
         kv_role="kv_both",
+        enable_permute_local_kv=enable_permute_local_kv,
+    )
+    return VllmConfig(
+        scheduler_config=scheduler_config,
+        model_config=model_config,
+        cache_config=cache_config,
+        kv_transfer_config=kv_transfer_config,
+        device_config=DeviceConfig("cpu"),
     )
-    return VllmConfig(scheduler_config=scheduler_config,
-                      model_config=model_config,
-                      cache_config=cache_config,
-                      kv_transfer_config=kv_transfer_config,
-                      device_config=DeviceConfig("cpu"))
 
 
 def create_scheduler(
@@ -103,9 +137,9 @@ def create_scheduler(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
         kv_cache_tensors=[],
         kv_cache_groups=[
-            KVCacheGroupSpec(['layer'],
-                             FullAttentionSpec(block_size, 1, 1, torch.float32,
-                                               False))
+            KVCacheGroupSpec(
+                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+            )
         ],
     )
     vllm_config.cache_config.num_gpu_blocks = num_blocks
@@ -114,49 +148,57 @@ def create_scheduler(
         kv_cache_config=kv_cache_config,
         log_stats=True,
         structured_output_manager=StructuredOutputManager(vllm_config),
+        block_size=block_size,
     )
 
 
+_request_count = count(1)
 _none_hash_initialized = False
 
 
-def create_request(request_id: int,
-                   num_tokens: int = 10,
-                   max_tokens: int = 16,
-                   do_remote_decode: bool = False,
-                   do_remote_prefill: bool = False,
-                   use_all_1s_for_prompt_tokens: bool = False,
-                   num_remote_blocks: int = 3,
-                   block_size: int = 16,
-                   hash_fn: Callable = sha256) -> Request:
+def create_request(
+    request_id: int | None = None,
+    num_tokens: int = 10,
+    common_prefix_len=0,
+    max_tokens: int = 16,
+    do_remote_decode: bool = False,
+    do_remote_prefill: bool = False,
+    num_remote_blocks: int = 3,
+    block_size: int = 16,
+    hash_fn: Callable = sha256,
+) -> Request:
     """Make dummy request for testing."""
+    assert num_tokens >= common_prefix_len >= 0
+
+    if request_id is None:
+        request_id = next(_request_count)
+
     global _none_hash_initialized
     if not _none_hash_initialized:
         init_none_hash(hash_fn)
         _none_hash_initialized = True
 
-    kv_transfer_params: Optional[dict[str, Any]] = None
+    kv_transfer_params: dict[str, Any] | None = None
 
     if do_remote_decode:
         assert not do_remote_prefill
-        kv_transfer_params = dict(do_remote_prefill=False,
-                                  do_remote_decode=True)
+        kv_transfer_params = dict(do_remote_prefill=False, do_remote_decode=True)
     elif do_remote_prefill:
-        kv_transfer_params = dict(do_remote_prefill=True,
-                                  do_remote_decode=False,
-                                  remote_engine_id="my-engine-id",
-                                  remote_block_ids=list(
-                                      range(num_remote_blocks)),
-                                  remote_host="my-host",
-                                  remote_port=1234)
+        kv_transfer_params = dict(
+            do_remote_prefill=True,
+            do_remote_decode=False,
+            remote_engine_id="my-engine-id",
+            remote_block_ids=list(range(num_remote_blocks)),
+            remote_host="my-host",
+            remote_port=1234,
+        )
 
     max_tokens = 1 if do_remote_decode else max_tokens
     sampling_params = SamplingParams(max_tokens=max_tokens)
 
-    if use_all_1s_for_prompt_tokens:
-        prompt_token_ids = [1] * num_tokens
-    else:
-        prompt_token_ids = [i * request_id for i in range(num_tokens)]
+    common_prefix = [1] * common_prefix_len if common_prefix_len > 0 else []
+    suffix = [i * request_id for i in range(num_tokens - common_prefix_len)]
+    prompt_token_ids = common_prefix + suffix
 
     req = Request(
         request_id=f"id-{request_id}",
@@ -173,8 +215,9 @@ def create_request(request_id: int,
 
 def create_model_runner_output(
     reqs: list[Request],
-    finished_sending: Optional[list[str]] = None,
-    finished_recving: Optional[list[str]] = None,
+    finished_sending: set[str] | None = None,
+    finished_recving: set[str] | None = None,
+    invalid_block_ids: set[int] | None = None,
     use_eos: bool = False,
     token_id: int = 0,
 ) -> ModelRunnerOutput:
@@ -186,14 +229,21 @@ def create_model_runner_output(
 
     # Make sampled tokens.
     sampled_token = EOS_TOKEN_ID if use_eos else token_id
-    sampled_token_ids = [[sampled_token] for _ in req_ids]
-
-    kv_connector_output = None if (
-        finished_sending is None
-        and finished_recving is None) else KVConnectorOutput(
+    sampled_token_ids = [np.array([sampled_token]) for _ in req_ids]
+
+    kv_connector_output = (
+        None
+        if (
+            finished_sending is None
+            and finished_recving is None
+            and invalid_block_ids is None
+        )
+        else KVConnectorOutput(
             finished_sending=finished_sending,
             finished_recving=finished_recving,
+            invalid_block_ids=invalid_block_ids or set(),
         )
+    )
 
     # Make output data structure.
     return ModelRunnerOutput(
@@ -208,22 +258,30 @@ def create_model_runner_output(
 
 
 class TestSharedStorageConnector(SharedStorageConnector):
-
-    def __init__(self, config: VllmConfig, role):
+    def __init__(self, config: VllmConfig, role, kv_cache_config):
         self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
         self._connector = SharedStorageConnector(config, role)
         self.call_record: dict[str, int] = defaultdict(int)
         # Use a unique temp file per connector
-        self._event_file = tempfile.gettempdir(
-        ) + f"/connector_{self.name}-{self.role.name}_events.log"
+        self._event_file = (
+            tempfile.gettempdir()
+            + f"/connector_{self.name}-{self.role.name}_events.log"
+        )
         # Start with an empty file
         with open(self._event_file, "w") as _:
             pass
 
     def __getattribute__(self, name):
-        if name in ("_connector", "call_record", "name", "_event_file",
-                    "__class__", "__dict__", "__getattribute__",
-                    "__init__"):  # avoid recursion
+        if name in (
+            "_connector",
+            "call_record",
+            "name",
+            "_event_file",
+            "__class__",
+            "__dict__",
+            "__getattribute__",
+            "__init__",
+        ):  # avoid recursion
             return object.__getattribute__(self, name)
         if not hasattr(self._connector, name):
             return object.__getattribute__(self, name)
@@ -242,21 +300,96 @@ class TestSharedStorageConnector(SharedStorageConnector):
                     if isinstance(arg, int):
                         to_log.append(str(arg))
                     elif isinstance(arg, KVCacheBlocks):
-                        to_log.append(
-                            f"num_blocks={[len(b) for b in arg.blocks]}")
+                        to_log.append(f"num_blocks={[len(b) for b in arg.blocks]}")
 
                 # Log the event as a line to the file
                 try:
                     with open(self._event_file, "a") as f:
-                        f.write(' '.join(to_log) + "\n")
+                        f.write(" ".join(to_log) + "\n")
                 except Exception as e:
-                    print(f"[ERROR] Could not log event {name} "
-                          f"for {self.name}: {e}")
+                    print(f"[ERROR] Could not log event {name} for {self.name}: {e}")
                 return attr(*args, **kwargs)
 
             return wrapper
         return attr
 
 
-KVConnectorFactory.register_connector("TestSharedStorageConnector", __name__,
-                                      TestSharedStorageConnector.__name__)
+@dataclass(frozen=True)
+class MockKVConfig:
+    matched_tokens: int = 0
+    is_async: bool = False
+
+
+class MockKVConnectorMetadata(KVConnectorMetadata):
+    def __init__(self):
+        # Scheduler tests check metadata.requests
+        self.requests: list = []
+
+
+class MockKVConnector(KVConnectorBase_V1):
+    """Mock KV connector for scheduler tests, supporting both sync and async mode."""
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: KVCacheConfig | None = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
+        extra_config = self._kv_transfer_config.kv_connector_extra_config
+        self.config = MockKVConfig(
+            matched_tokens=extra_config["matched_tokens"],
+            is_async=extra_config["is_async"],
+        )
+
+    def get_num_new_matched_tokens(
+        self,
+        request: Request,
+        num_computed_tokens: int,
+    ) -> tuple[int | None, bool]:
+        return (self.config.matched_tokens, self.config.is_async)
+
+    def update_state_after_alloc(
+        self,
+        request: Request,
+        blocks: KVCacheBlocks,
+        num_external_tokens: int,
+    ):
+        pass
+
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
+        metadata = MockKVConnectorMetadata()
+        cached_reqs = scheduler_output.scheduled_cached_reqs
+        for req_id in chain(
+            (req.req_id for req in scheduler_output.scheduled_new_reqs),
+            (
+                req_id
+                for req_id in cached_reqs.req_ids
+                if req_id in cached_reqs.resumed_req_ids
+            ),
+        ):
+            metadata.requests.append({"req_id": req_id})
+        return metadata
+
+    def start_load_kv(self, kv_caches, finished_req_ids):
+        pass
+
+    def wait_for_layer_load(self, layer_name):
+        pass
+
+    def save_kv_layer(self, layer_name, kv_layer, attn_metadata, **kwargs):
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+KVConnectorFactory.register_connector(
+    "TestSharedStorageConnector", __name__, TestSharedStorageConnector.__name__
+)
+
+KVConnectorFactory.register_connector(
+    "MockKVConnector", __name__, MockKVConnector.__name__
+)
diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py
index 0edb9513e3ff988b69c8c99a67b46146f4a739ca..0d4fa344d298c15d3657275ec78506b080773620 100644
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/test_cpu_gpu.py
@@ -8,11 +8,20 @@ import torch
 
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
-from vllm.v1.attention.backends.flashinfer import FlashInferBackend
-from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
 
+BACKENDS_TO_TEST = [FlashAttentionBackend]
+
+if not current_platform.is_rocm():
+    from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+
+    BACKENDS_TO_TEST.append(FlashInferBackend)
+
+    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
+
+    BACKENDS_TO_TEST.append(FlashAttnMLABackend)
+
 NUM_GPU_BLOCKS = [64]
 NUM_CPU_BLOCKS = [256]
 GPU_BLOCK_SIZES = [16]
@@ -22,7 +31,7 @@ NUM_HEADS = [8]
 NUM_LAYERS = [4]
 DTYPES = [torch.bfloat16]
 SEEDS = [0]
-CUDA_DEVICES = ['cuda:0']
+CUDA_DEVICES = ["cuda:0"]
 NUM_MAPPINGS = [3]
 
 
@@ -55,36 +64,36 @@ def test_transfer(
 ) -> None:
     current_platform.seed_everything(seed)
 
-    # create per-layer GPU KV caches
-    attn_backends_list = [
-        FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend
-    ]
+    # create per-layer GPU KV caches based on available attn_backends
+    attn_backends_list = BACKENDS_TO_TEST
 
     gpu_caches = {}
     attn_backends = {}
     for i in range(num_layers):
-        layer_name = f'layer {i}'
+        layer_name = f"layer {i}"
 
         attn_backend = attn_backends_list[i % len(attn_backends_list)]
         attn_backends[layer_name] = attn_backend
 
         gpu_cache_shape = attn_backend.get_kv_cache_shape(
-            num_gpu_blocks, gpu_block_size, num_heads, head_size)
-        gpu_caches[layer_name] = torch.rand(gpu_cache_shape,
-                                            dtype=dtype,
-                                            device=device)
+            num_gpu_blocks, gpu_block_size, num_heads, head_size
+        )
+        gpu_caches[layer_name] = torch.rand(gpu_cache_shape, dtype=dtype, device=device)
 
     # create handler
     cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
-    handler = CpuGpuOffloadingHandler(attn_backends=attn_backends,
-                                      gpu_block_size=gpu_block_size,
-                                      cpu_block_size=cpu_block_size,
-                                      num_cpu_blocks=num_cpu_blocks,
-                                      gpu_caches=gpu_caches)
+    handler = CpuGpuOffloadingHandler(
+        attn_backends=attn_backends,
+        gpu_block_size=gpu_block_size,
+        cpu_block_size=cpu_block_size,
+        num_cpu_blocks=num_cpu_blocks,
+        gpu_caches=gpu_caches,
+    )
 
     # select block mappings
-    gpu_blocks = random.sample(range(num_gpu_blocks),
-                               num_mappings * gpu_blocks_per_cpu_block)
+    gpu_blocks = random.sample(
+        range(num_gpu_blocks), num_mappings * gpu_blocks_per_cpu_block
+    )
     cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)
 
     # convert cpu blocks to gpu block size
@@ -96,9 +105,10 @@ def test_transfer(
 
     # maybe skip a GPU block to test writing to the middle of a CPU block
     if gpu_to_cpu:
-        gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1:]
+        gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :]
         cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
-            gpu_blocks_per_cpu_block - 1:]
+            gpu_blocks_per_cpu_block - 1 :
+        ]
 
     # set transfer direction
     if gpu_to_cpu:
@@ -124,8 +134,9 @@ def test_transfer(
 
     # build dst -> src mapping
     dst_to_src = {}
-    for src_block, dst_block in zip(src_blocks_in_gpu_block_size,
-                                    dst_blocks_in_gpu_block_size):
+    for src_block, dst_block in zip(
+        src_blocks_in_gpu_block_size, dst_blocks_in_gpu_block_size
+    ):
         dst_to_src[dst_block] = src_block
 
     # build transfer specs
@@ -157,8 +168,11 @@ def test_transfer(
     for dst_block in range(dst_size_in_gpu_blocks):
         src_block_candidate = dst_to_src.get(dst_block)
         for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
-                src_kv_caches, dst_kv_caches, orig_dst_caches,
-                handler.kv_dim_before_num_blocks):
+            src_kv_caches,
+            dst_kv_caches,
+            orig_dst_caches,
+            handler.kv_dim_before_num_blocks,
+        ):
             if kv_dim:
                 # iterate over key, value
                 for i in range(2):
@@ -166,12 +180,14 @@ def test_transfer(
                         expected_value = src_cache[i][src_block_candidate]
                     else:
                         expected_value = orig_dst_cache[i][dst_block]
-                    torch.testing.assert_close(dst_cache[i][dst_block].cpu(),
-                                               expected_value.cpu())
+                    torch.testing.assert_close(
+                        dst_cache[i][dst_block].cpu(), expected_value.cpu()
+                    )
             else:
                 if src_block_candidate is not None:
                     expected_value = src_cache[src_block_candidate]
                 else:
                     expected_value = orig_dst_cache[dst_block]
-                torch.testing.assert_close(dst_cache[dst_block].cpu(),
-                                           expected_value.cpu())
+                torch.testing.assert_close(
+                    dst_cache[dst_block].cpu(), expected_value.cpu()
+                )
diff --git a/tests/v1/kv_offload/test_cpu_manager.py b/tests/v1/kv_offload/test_cpu_manager.py
index cdee7811d85b3f0985750cd7cbf273c6f2699e13..839cd9b6dc55c8aaf534089cb6890e9fa94d7581 100644
--- a/tests/v1/kv_offload/test_cpu_manager.py
+++ b/tests/v1/kv_offload/test_cpu_manager.py
@@ -2,13 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Optional
 
 import numpy as np
 
 from vllm.v1.core.kv_cache_utils import BlockHash
-from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
-                                         PrepareStoreOutput)
+from vllm.v1.kv_offload.abstract import (
+    LoadStoreSpec,
+    OffloadingEvent,
+    PrepareStoreOutput,
+)
+from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
 from vllm.v1.kv_offload.backends.cpu import CPUBackend
 from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
@@ -26,31 +29,38 @@ def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
 
 
 def verify_store_output(
-        prepare_store_output: Optional[PrepareStoreOutput],
-        expected_prepare_store_output: ExpectedPrepareStoreOutput):
+    prepare_store_output: PrepareStoreOutput | None,
+    expected_prepare_store_output: ExpectedPrepareStoreOutput,
+):
     assert prepare_store_output is not None
-    assert (prepare_store_output.block_hashes_to_store == to_hashes(
-        expected_prepare_store_output.block_hashes_to_store))
-    assert (prepare_store_output.block_hashes_evicted == to_hashes(
-        expected_prepare_store_output.block_hashes_evicted))
+    assert prepare_store_output.block_hashes_to_store == to_hashes(
+        expected_prepare_store_output.block_hashes_to_store
+    )
+    assert prepare_store_output.block_hashes_evicted == to_hashes(
+        expected_prepare_store_output.block_hashes_evicted
+    )
     store_spec = prepare_store_output.store_spec
     assert isinstance(store_spec, CPULoadStoreSpec)
-    expected_array = np.array(expected_prepare_store_output.store_block_ids,
-                              dtype=np.int64)
+    expected_array = np.array(
+        expected_prepare_store_output.store_block_ids, dtype=np.int64
+    )
     assert np.array_equal(expected_array, store_spec.block_ids)
 
 
-def verify_load_output(prepare_load_output: LoadStoreSpec,
-                       expected_prepare_load_output: list[int]):
+def verify_load_output(
+    prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
+):
     assert isinstance(prepare_load_output, CPULoadStoreSpec)
     expected_array = np.array(expected_prepare_load_output, dtype=np.int64)
     assert np.array_equal(expected_array, prepare_load_output.block_ids)
 
 
-def verify_events(events: Iterable[OffloadingEvent],
-                  block_size: int,
-                  expected_stores: tuple[set[int], ...] = (),
-                  expected_evictions: tuple[set[int], ...] = ()):
+def verify_events(
+    events: Iterable[OffloadingEvent],
+    block_size: int,
+    expected_stores: tuple[set[int], ...] = (),
+    expected_evictions: tuple[set[int], ...] = (),
+):
     stores: list[set[BlockHash]] = []
     evictions: list[set[BlockHash]] = []
     for event in events:
@@ -61,8 +71,7 @@ def verify_events(events: Iterable[OffloadingEvent],
         else:
             stores.append(set(event.block_hashes))
 
-    def to_hash_sets(
-            int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
+    def to_hash_sets(int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
         return tuple([set(to_hashes(list(int_set))) for int_set in int_sets])
 
     assert tuple(evictions) == to_hash_sets(expected_evictions)
@@ -86,7 +95,8 @@ def test_cpu_manager():
             block_hashes_to_store=[1, 2],
             store_block_ids=[0, 1],
             block_hashes_evicted=[],
-        ))
+        ),
+    )
 
     # lookup [1, 2] -> not ready
     assert cpu_manager.lookup(to_hashes([1, 2])) == 0
@@ -96,9 +106,9 @@ def test_cpu_manager():
 
     # complete store [1, 2]
     cpu_manager.complete_store(to_hashes([1, 2]))
-    verify_events(cpu_manager.take_events(),
-                  block_size=block_size,
-                  expected_stores=({1, 2}, ))
+    verify_events(
+        cpu_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
+    )
 
     # lookup [1, 2]
     assert cpu_manager.lookup(to_hashes([1])) == 1
@@ -113,12 +123,13 @@ def test_cpu_manager():
             block_hashes_to_store=[3, 4, 5],
             store_block_ids=[2, 3, 0],
             block_hashes_evicted=[1],
-        ))
+        ),
+    )
 
     # verify eviction event
-    verify_events(cpu_manager.take_events(),
-                  block_size=block_size,
-                  expected_evictions=({1}, ))
+    verify_events(
+        cpu_manager.take_events(), block_size=block_size, expected_evictions=({1},)
+    )
 
     # prepare store with no space
     assert cpu_manager.prepare_store(to_hashes([1, 6])) is None
@@ -144,7 +155,8 @@ def test_cpu_manager():
             block_hashes_to_store=[6, 7, 8],
             store_block_ids=[3, 2, 1],
             block_hashes_evicted=[2, 3, 4],
-        ))
+        ),
+    )
 
     # complete store [6, 7, 8]
     cpu_manager.complete_store(to_hashes([6, 7, 8]))
@@ -160,7 +172,8 @@ def test_cpu_manager():
             block_hashes_to_store=[9],
             store_block_ids=[1],
             block_hashes_evicted=[8],
-        ))
+        ),
+    )
 
     # complete store [7, 9] with failure
     cpu_manager.complete_store(to_hashes([7, 9]), success=False)
@@ -169,7 +182,316 @@ def test_cpu_manager():
     assert cpu_manager.lookup(to_hashes([7])) == 1
     assert cpu_manager.lookup(to_hashes([9])) == 0
 
-    verify_events(cpu_manager.take_events(),
-                  block_size=block_size,
-                  expected_stores=({3, 4, 5}, {6, 7, 8}),
-                  expected_evictions=({2, 3, 4}, {8}))
+    verify_events(
+        cpu_manager.take_events(),
+        block_size=block_size,
+        expected_stores=({3, 4, 5}, {6, 7, 8}),
+        expected_evictions=({2, 3, 4}, {8}),
+    )
+
+
+def test_arc_manager_basic():
+    """
+    Tests ARCOffloadingManager basic operations with a CPUBackend.
+    Verifies that ARC handles store, load, and lookup operations correctly.
+    """
+    # initialize a CPU backend with a capacity of 4 blocks
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)
+
+    # prepare store [1, 2]
+    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[1, 2],
+            store_block_ids=[0, 1],
+            block_hashes_evicted=[],
+        ),
+    )
+
+    # lookup [1, 2] -> not ready
+    assert arc_manager.lookup(to_hashes([1, 2])) == 0
+
+    # no events so far
+    assert list(arc_manager.take_events()) == []
+
+    # complete store [1, 2]
+    arc_manager.complete_store(to_hashes([1, 2]))
+    verify_events(
+        arc_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
+    )
+
+    # lookup [1, 2]
+    assert arc_manager.lookup(to_hashes([1])) == 1
+    assert arc_manager.lookup(to_hashes([1, 2])) == 2
+    assert arc_manager.lookup(to_hashes([1, 2, 3])) == 2
+
+    # blocks should be in T1 (recent)
+    assert len(arc_manager.t1) == 2
+    assert len(arc_manager.t2) == 0
+
+
+def test_arc_manager_t1_to_t2_promotion():
+    """
+    Tests that accessing a block in T1 promotes it to T2 (frequent).
+    This is a key feature of ARC's adaptive behavior.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)
+
+    # store and complete block 1
+    arc_manager.prepare_store(to_hashes([1]))
+    arc_manager.complete_store(to_hashes([1]))
+
+    # block 1 starts in T1 (recent)
+    assert to_hashes([1])[0] in arc_manager.t1
+    assert to_hashes([1])[0] not in arc_manager.t2
+
+    # touch block 1 (simulate second access)
+    arc_manager.touch(to_hashes([1]))
+
+    # block 1 should now be in T2 (frequent)
+    assert to_hashes([1])[0] not in arc_manager.t1
+    assert to_hashes([1])[0] in arc_manager.t2
+
+
+def test_arc_manager_eviction_with_load():
+    """
+    Tests ARC eviction behavior similar to LRU test.
+    Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)
+
+    # prepare and complete store [1, 2, 3, 4]
+    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[1, 2, 3, 4],
+            store_block_ids=[0, 1, 2, 3],
+            block_hashes_evicted=[],
+        ),
+    )
+    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))
+
+    # prepare load [2, 3] (increases ref_cnt)
+    prepare_load_output = arc_manager.prepare_load(to_hashes([2, 3]))
+    verify_load_output(prepare_load_output, [1, 2])
+
+    # prepare store [5, 6, 7] with [2, 3] being loaded
+    # should fail because [2, 3] have ref_cnt > 0
+    assert arc_manager.prepare_store(to_hashes([5, 6, 7])) is None
+
+    # complete load [2, 3]
+    arc_manager.complete_load(to_hashes([2, 3]))
+
+    # now prepare store [5, 6, 7] should succeed
+    # ARC will evict blocks one at a time from T1 as needed
+    prepare_store_output = arc_manager.prepare_store(to_hashes([5, 6, 7]))
+    assert prepare_store_output is not None
+    # Should successfully evict enough blocks to make room (at least 1)
+    assert len(prepare_store_output.block_hashes_evicted) >= 1
+
+
+def test_arc_manager_adaptive_target():
+    """
+    Tests ARC's adaptive target adjustment via ghost lists.
+    When a block in B1 (ghost list) is accessed, target_t1_size increases.
+    When a block in B2 is accessed, target_t1_size decreases.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)
+
+    # store blocks 1, 2 (fills cache)
+    arc_manager.prepare_store(to_hashes([1, 2]))
+    arc_manager.complete_store(to_hashes([1, 2]))
+
+    initial_target = arc_manager.target_t1_size
+
+    # store block 3, evicting block 1 (moves to B1 ghost list)
+    arc_manager.prepare_store(to_hashes([3]))
+    arc_manager.complete_store(to_hashes([3]))
+
+    # block 1 should be in B1 (ghost list)
+    assert to_hashes([1])[0] in arc_manager.b1
+
+    # touch block 1 (cache miss, but in B1)
+    # this should increase target_t1_size (favor recency)
+    arc_manager.touch(to_hashes([1]))
+
+    # target should have increased
+    assert arc_manager.target_t1_size > initial_target
+
+
+def test_arc_manager_t1_t2_eviction_policy():
+    """
+    Tests that ARC evicts from T1 or T2 based on target_t1_size.
+    If |T1| >= target_t1_size, evict from T1, otherwise from T2.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)
+
+    # store blocks 1, 2, 3, 4
+    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
+    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))
+
+    # promote blocks 3, 4 to T2 by touching them
+    arc_manager.touch(to_hashes([3, 4]))
+
+    # now: T1 = {1, 2}, T2 = {3, 4}
+    assert len(arc_manager.t1) == 2
+    assert len(arc_manager.t2) == 2
+
+    # set target_t1_size to prefer evicting from T1
+    # (when |T1| >= target, evict from T1)
+    arc_manager.target_t1_size = 1
+
+    # store block 5, should evict from T1 (block 1, LRU in T1)
+    output = arc_manager.prepare_store(to_hashes([5]))
+    assert output is not None
+    assert to_hashes([1]) == output.block_hashes_evicted
+
+    arc_manager.complete_store(to_hashes([5]))
+
+    # block 1 should be in B1 (ghost list)
+    assert to_hashes([1])[0] in arc_manager.b1
+    # block 5 should be in T1
+    assert to_hashes([5])[0] in arc_manager.t1
+
+
+def test_arc_manager_ghost_list_bounds():
+    """
+    Tests that ghost lists (B1, B2) don't grow unbounded.
+    They should be capped at cache_capacity.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)
+
+    # fill cache with blocks 1, 2
+    arc_manager.prepare_store(to_hashes([1, 2]))
+    arc_manager.complete_store(to_hashes([1, 2]))
+
+    # store many blocks to fill ghost lists
+    for i in range(3, 20):
+        arc_manager.prepare_store(to_hashes([i]))
+        arc_manager.complete_store(to_hashes([i]))
+
+    # ghost lists should not exceed cache_capacity
+    assert len(arc_manager.b1) <= arc_manager.cache_capacity
+    assert len(arc_manager.b2) <= arc_manager.cache_capacity
+
+
+def test_arc_manager_touch_ordering():
+    """
+    Tests that touch() correctly updates access patterns.
+    Similar to LRU test but verifies T1/T2 ordering.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)
+
+    # store blocks 1, 2, 3, 4
+    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
+    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))
+
+    # promote 3, 4 to T2
+    arc_manager.touch(to_hashes([3, 4]))
+
+    # T1 = {1, 2}, T2 = {3, 4}
+    # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
+    arc_manager.touch(to_hashes([1, 3, 4]))
+
+    # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
+    assert len(arc_manager.t1) == 1
+    assert len(arc_manager.t2) == 3
+
+    # store block 5, should evict from T1 (block 2, only one in T1)
+    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[5],
+            store_block_ids=[1],  # reuses block 2's storage
+            block_hashes_evicted=[2],
+        ),
+    )
+
+
+def test_arc_manager_failed_store():
+    """
+    Tests that failed store operations clean up correctly.
+    Similar to LRU test but for ARC.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)
+
+    # store blocks 1, 2, 3, 4
+    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
+    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))
+
+    # prepare store block 5 (will evict block 1)
+    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
+    assert prepare_store_output is not None
+    assert len(prepare_store_output.block_hashes_evicted) == 1
+
+    # complete store with failure
+    arc_manager.complete_store(to_hashes([5]), success=False)
+
+    # block 5 should not be in cache
+    assert arc_manager.lookup(to_hashes([5])) == 0
+    # block 5 should not be in T1 or T2
+    assert to_hashes([5])[0] not in arc_manager.t1
+    assert to_hashes([5])[0] not in arc_manager.t2
+
+    # evicted block should still be gone (in B1 ghost list)
+    evicted_hash = prepare_store_output.block_hashes_evicted[0]
+    assert evicted_hash in arc_manager.b1
+
+
+def test_arc_manager_full_scenario():
+    """
+    Comprehensive test covering multiple ARC operations in sequence.
+    Similar to the full LRU test but adapted for ARC behavior.
+    """
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)
+
+    # store [1, 2]
+    arc_manager.prepare_store(to_hashes([1, 2]))
+    arc_manager.complete_store(to_hashes([1, 2]))
+
+    # store [3, 4, 5] -> evicts [1]
+    prepare_store_output = arc_manager.prepare_store(to_hashes([3, 4, 5]))
+    assert prepare_store_output is not None
+    assert len(prepare_store_output.block_hashes_evicted) == 1
+    arc_manager.complete_store(to_hashes([3, 4, 5]))
+
+    # promote some blocks to T2
+    arc_manager.touch(to_hashes([2, 3]))
+
+    # T1 has {4, 5}, T2 has {2, 3}
+    assert len(arc_manager.t1) == 2
+    assert len(arc_manager.t2) == 2
+
+    # store [6] -> should evict from T1 (4 is oldest in T1)
+    prepare_store_output = arc_manager.prepare_store(to_hashes([6]))
+    assert prepare_store_output is not None
+    arc_manager.complete_store(to_hashes([6]))
+
+    # verify blocks 2, 3 (in T2) are still present
+    assert arc_manager.lookup(to_hashes([2])) == 1
+    assert arc_manager.lookup(to_hashes([3])) == 1
+
+    # verify events
+    events = list(arc_manager.take_events())
+    assert len(events) > 0  # should have store and eviction events
diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py
index fc8ca09bea3de184708eda9b1cb6653ad6126603..b654ea4298dbb0e0827e2e3fdfd9db4847ec1d93 100644
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_offload/test_cpu_offloading.py
@@ -1,15 +1,68 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import socket
 import time
 
+import msgspec
+import msgspec.msgpack
 import pytest
+import zmq
+from tqdm import tqdm
 
-from vllm import LLM, SamplingParams
-from vllm.config import KVTransferConfig
+from vllm import LLM, SamplingParams, TokensPrompt
+from vllm.config import KVEventsConfig, KVTransferConfig
+from vllm.distributed.kv_events import BlockStored, KVEventBatch
 
 CPU_BLOCK_SIZES = [16, 48]
 
 
+class MockSubscriber:
+    """Helper class to receive and verify published events"""
+
+    def __init__(
+        self,
+        endpoint: str,
+        topic: str,
+    ):
+        self.ctx = zmq.Context.instance()
+        self.topic_bytes = topic.encode("utf-8")
+
+        # Set up subscriber socket
+        self.sub = self.ctx.socket(zmq.SUB)
+        self.sub.setsockopt(zmq.SUBSCRIBE, self.topic_bytes)
+        self.sub.connect(endpoint)
+
+        self.decoder = msgspec.msgpack.Decoder(type=KVEventBatch)
+
+    def get_new_cpu_stored_events(self) -> list[BlockStored]:
+        cpu_stored_events: list[BlockStored] = []
+
+        poller = zmq.Poller()
+        poller.register(self.sub, zmq.POLLIN)
+
+        timeout = 1000  # 1 second
+        while True:
+            events = dict(poller.poll(timeout))
+
+            if events.get(self.sub) != zmq.POLLIN:
+                return cpu_stored_events
+
+            topic_bytes, _, payload = self.sub.recv_multipart()
+
+            assert topic_bytes == self.topic_bytes
+
+            event_batch = self.decoder.decode(payload)
+            assert isinstance(event_batch, KVEventBatch)
+            for event in event_batch.events:
+                if isinstance(event, BlockStored) and event.medium == "CPU":
+                    cpu_stored_events.append(event)
+                    timeout = 100
+
+    def close(self):
+        """Clean up resources"""
+        self.sub.close()
+
+
 @pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES)
 def test_cpu_offloading(cpu_block_size: int) -> None:
     """
@@ -21,42 +74,79 @@ def test_cpu_offloading(cpu_block_size: int) -> None:
         kv_connector="OffloadingConnector",
         kv_role="kv_both",
         kv_connector_extra_config={
-            "num_cpu_blocks": 100,
-            "block_size": cpu_block_size
+            "num_cpu_blocks": 1000,
+            "block_size": cpu_block_size,
         },
     )
 
+    port: int
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("0.0.0.0", 0))
+        port = s.getsockname()[1]
+
+    events_endpoint = f"tcp://*:{port}"
+    kv_events_config = KVEventsConfig(
+        enable_kv_cache_events=True,
+        publisher="zmq",
+        endpoint=events_endpoint,
+        topic="test",
+    )
+
     llm = LLM(
         model="meta-llama/Llama-3.2-1B-Instruct",
         gpu_memory_utilization=0.5,
+        kv_events_config=kv_events_config,
         kv_transfer_config=kv_transfer_config,
     )
 
-    prompts = ["Hi " * 100]
-    sampling_params = SamplingParams(temperature=0, max_tokens=20)
+    sampling_params = SamplingParams(temperature=0, max_tokens=1)
+
+    events_endpoint = events_endpoint.replace("*", "127.0.0.1")
+    subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)
+
+    try:
+        num_times_cpu_better_than_cold = 0
+        num_tests = 10
+        total_cold_time = 0.0
+        total_gpu_hit_time = 0.0
+        total_cpu_hit_time = 0.0
+        prompt_token_ids = [0] * 10001
+        for i in tqdm(range(num_tests), desc="Running tests"):
+            prompt_token_ids[0] = i
+            prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)]
+
+            # run generation - this should trigger saving KV cache
+            start_time = time.time()
+            llm.generate(prompts, sampling_params, use_tqdm=False)
+            cold_time = time.time() - start_time
+            total_cold_time += cold_time
+
+            # run generation again - should hit the GPU prefix cache
+            start_time = time.time()
+            llm.generate(prompts, sampling_params, use_tqdm=False)
+            gpu_hit_time = time.time() - start_time
+            total_gpu_hit_time += gpu_hit_time
 
-    # run generation - this should trigger saving KV cache
-    start_time = time.time()
-    llm.generate(prompts, sampling_params, use_tqdm=False)
-    cold_time = time.time() - start_time
+            # reset prefix cache to avoid GPU hit.
+            llm.reset_prefix_cache()
 
-    # run generation again - should hit the GPU prefix cache
-    start_time = time.time()
-    llm.generate(prompts, sampling_params, use_tqdm=False)
-    gpu_hit_time = time.time() - start_time
+            assert subscriber.get_new_cpu_stored_events()
 
-    # reset prefix cache to avoid GPU hit.
-    llm.reset_prefix_cache()
+            # run generation again - this should trigger loading from CPU
+            start_time = time.time()
+            llm.generate(prompts, sampling_params, use_tqdm=False)
+            cpu_hit_time = time.time() - start_time
+            total_cpu_hit_time += cpu_hit_time
 
-    # sleep for a sec to make sure CPU finished storing
-    time.sleep(1)
+            if cpu_hit_time < cold_time:
+                num_times_cpu_better_than_cold += 1
 
-    # run generation again - this should trigger loading from CPU
-    start_time = time.time()
-    llm.generate(prompts, sampling_params, use_tqdm=False)
-    cpu_hit_time = time.time() - start_time
+        print("Average times:")
+        print(f"    Cold: {total_cold_time * 1000 / num_tests:.2f}ms")
+        print(f"    GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms")
+        print(f"    CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms")
 
-    print("Generation times:")
-    print(f"    Cold: {cold_time * 1000:.2f}ms")
-    print(f"    GPU hit: {gpu_hit_time * 1000:.2f}ms")
-    print(f"    CPU hit: {cpu_hit_time * 1000:.2f}ms")
+        assert num_times_cpu_better_than_cold >= 0.8 * num_tests
+    finally:
+        subscriber.close()
+        del llm
diff --git a/tests/v1/kv_offload/test_worker.py b/tests/v1/kv_offload/test_worker.py
index 6cf8aa0875d62693a28aa9862512212b878f967c..6fcd408f3c593ea225cb1b985430a17577ee99b7 100644
--- a/tests/v1/kv_offload/test_worker.py
+++ b/tests/v1/kv_offload/test_worker.py
@@ -1,17 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.v1.kv_offload.abstract import LoadStoreSpec
-from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
-                                              OffloadingWorker, TransferResult,
-                                              TransferSpec)
+from vllm.v1.kv_offload.worker.worker import (
+    OffloadingHandler,
+    OffloadingWorker,
+    TransferResult,
+    TransferSpec,
+)
 
 
 class LoadStoreSpec1(LoadStoreSpec):
-
-    def __init__(self,
-                 submit_success: bool = True,
-                 async_success: bool = True,
-                 exception: bool = False):
+    def __init__(
+        self,
+        submit_success: bool = True,
+        async_success: bool = True,
+        exception: bool = False,
+    ):
         self.finished = False
         self.submit_success = submit_success
         self.async_success = async_success
@@ -26,7 +30,6 @@ class LoadStoreSpec1(LoadStoreSpec):
 
 
 class LoadStoreSpec2(LoadStoreSpec):
-
     @staticmethod
     def medium() -> str:
         return "2"
@@ -36,7 +39,6 @@ class LoadStoreSpec2(LoadStoreSpec):
 
 
 class OffloadingHandler1To2(OffloadingHandler):
-
     def __init__(self):
         self.transfers: dict[int, LoadStoreSpec1] = {}
 
@@ -63,7 +65,6 @@ class OffloadingHandler1To2(OffloadingHandler):
 
 
 class OffloadingHandler2To1(OffloadingHandler):
-
     def __init__(self):
         self.transfers: dict[int, LoadStoreSpec1] = {}
 
@@ -144,9 +145,9 @@ def test_offloading_worker():
     assert 7 in handler2to1.transfers
 
     # verify result of 3rd and 4th transfers
-    assert (sorted(worker.get_finished()) == [(3, False), (4, True)])
+    assert sorted(worker.get_finished()) == [(3, False), (4, True)]
 
     # complete 6th and 7th transfers
     src6.finished = True
     dst7.finished = True
-    assert (sorted(worker.get_finished()) == [(6, True), (7, True)])
+    assert sorted(worker.get_finished()) == [(6, True), (7, True)]
diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py
index 43caef79b02f7ce296605878b0ad5b9a5bbe911e..dac7ffed69d4a0cb93cc272935d6ad2338ee88ee 100644
--- a/tests/v1/logits_processors/test_correctness.py
+++ b/tests/v1/logits_processors/test_correctness.py
@@ -3,31 +3,35 @@
 
 import random
 from collections.abc import Callable
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple, TypeAlias
 
 import numpy as np
 import pytest
 import torch
 
 from tests.utils import create_new_process_for_each_test
-from tests.v1.sample.utils import (LogitsprocsTestFakes, create_fake_logits,
-                                   create_penalty_tensor,
-                                   create_prompt_tokens_tensor,
-                                   fake_apply_logitsprocs,
-                                   fake_update_logitsprocs_state)
+from tests.v1.sample.utils import (
+    LogitsprocsTestFakes,
+    create_fake_logits,
+    create_penalty_tensor,
+    create_prompt_tokens_tensor,
+    fake_apply_logitsprocs,
+    fake_update_logitsprocs_state,
+)
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
-# yapf: disable
-from vllm.v1.sample.logits_processor import (BatchUpdate, BatchUpdateBuilder,
-                                             LogitBiasLogitsProcessor,
-                                             LogitsProcessor,
-                                             MinPLogitsProcessor,
-                                             MinTokensLogitsProcessor,
-                                             MoveDirectionality,
-                                             build_logitsprocs)
-# yapf: enable
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.sample.logits_processor import (
+    BatchUpdate,
+    BatchUpdateBuilder,
+    LogitBiasLogitsProcessor,
+    LogitsProcessor,
+    MinPLogitsProcessor,
+    MinTokensLogitsProcessor,
+    MoveDirectionality,
+    build_logitsprocs,
+)
 from vllm.v1.sample.metadata import SamplingMetadata
 
 PIN_MEMORY_AVAILABLE = is_pin_memory_available()
@@ -44,14 +48,15 @@ REQS_PER_LOGITPROC = 50
 STR_NO_LOGITPROC = "none"
 
 # LogitsProcessor subclass or "none"
-LogitprocType = Union[type[LogitsProcessor], str]
+LogitprocType: TypeAlias = type[LogitsProcessor] | str
 
 
 class LogitsProcsRequestParams:
     """Encapsulates key params for a single request in a batch.
-    
+
     Params can be customized based on the enabled logitproc
     """
+
     workload_index: int
     logitproc_type: LogitprocType  # Logitproc enabled, specified by str id
     out_tokens: list[int]  # Output tokens required for min tokens test
@@ -64,14 +69,13 @@ class LogitsProcsRequestParams:
         # Number of output tokens is randomly 0 or twice the min-tokens
         # threshold which will be used in testing. Output token values
         # don't matter *for these tests* so use 0 as a dummy value
-        self.out_tokens = ([0] *
-                           (MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2)))
+        self.out_tokens = [0] * (MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2))
         self.prompt_tokens = []
         self.params = _sampling_params_from_logitproc(logitproc_type)
 
     def __str__(self):
         """For debugging"""
-        summ = ', '.join(f'{k}={v}' for k, v in vars(self).items())
+        summ = ", ".join(f"{k}={v}" for k, v in vars(self).items())
         return f"MyClass({summ})"
 
 
@@ -86,12 +90,13 @@ def _generate_fake_sampling_metadata(
     prompt_token_ids: list[list[int]] = []
     for _ in range(batch_size):
         output_token_ids.append(
-            np.random.randint(0, vocab_size, size=num_output_tokens).tolist())
+            np.random.randint(0, vocab_size, size=num_output_tokens).tolist()
+        )
         prompt_token_ids.append(
-            np.random.randint(0,
-                              vocab_size,
-                              size=np.random.randint(
-                                  1, MAX_NUM_PROMPT_TOKENS)).tolist())
+            np.random.randint(
+                0, vocab_size, size=np.random.randint(1, MAX_NUM_PROMPT_TOKENS)
+            ).tolist()
+        )
     logitsprocs = build_logitsprocs(
         vllm_config=VllmConfig(),
         device=device,
@@ -99,15 +104,16 @@ def _generate_fake_sampling_metadata(
         is_pooling_model=False,
     )
     fake_sampling_metadata = SamplingMetadata(
-        temperature=torch.full((batch_size, ), 0.0),
+        temperature=torch.full((batch_size,), 0.0),
         all_greedy=True,
         all_random=False,
         top_p=None,
         top_k=None,
         generators={},
         max_num_logprobs=0,
-        prompt_token_ids=create_prompt_tokens_tensor(prompt_token_ids,
-                                                     vocab_size, device),
+        prompt_token_ids=create_prompt_tokens_tensor(
+            prompt_token_ids, vocab_size, device
+        ),
         output_token_ids=output_token_ids,
         frequency_penalties=create_penalty_tensor(batch_size, 0.0, device),
         presence_penalties=create_penalty_tensor(batch_size, 0.0, device),
@@ -115,7 +121,8 @@ def _generate_fake_sampling_metadata(
         no_penalties=True,
         allowed_token_ids_mask=None,
         bad_words_token_ids={},
-        logitsprocs=logitsprocs)
+        logitsprocs=logitsprocs,
+    )
     return fake_sampling_metadata
 
 
@@ -127,15 +134,15 @@ def _generate_test_fakes(batch_size: int, device: str) -> LogitsprocsTestFakes:
         fake_logits[i, 0] = 10.0  # High logit for first token
         fake_logits[i, 1:] = 1e-2  # Others remain low
     sampling_metadata = _generate_fake_sampling_metadata(
-        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
+    )
     return LogitsprocsTestFakes(
         logits=fake_logits,
         sampling_metadata=sampling_metadata,
     )
 
 
-def _sampling_params_from_logitproc(
-        logitproc_type: LogitprocType) -> SamplingParams:
+def _sampling_params_from_logitproc(logitproc_type: LogitprocType) -> SamplingParams:
     """Customize request SamplingParams for a specified logitproc"""
     # SamplingParams for req with no logitproc
     kwargs = {"min_p": 0.0, "logit_bias": None, "min_tokens": 0}
@@ -150,7 +157,7 @@ def _generate_mixed_logitsprocs_batch_params(
 ) -> list[LogitsProcsRequestParams]:
     """Define key params for a batch of requests with a different
     logitproc enabled per request.
-    
+
     The batch will have `reqs_per_logitproc` repeats for all
     `logitsprocs_types` under test, including the case where
     no logitsproc is enabled. The batch is randomly shuffled. The
@@ -173,7 +180,8 @@ def _generate_mixed_logitsprocs_batch_params(
     return [
         LogitsProcsRequestParams(
             workload_index=idx,
-            logitproc_type=logitsprocs_types[pdx // reqs_per_logitproc])
+            logitproc_type=logitsprocs_types[pdx // reqs_per_logitproc],
+        )
         for idx, pdx in enumerate(batch_perm)
     ]
 
@@ -185,10 +193,12 @@ def _raise_error_invalid(
     step_idx: int,
     err_cls: type[Exception] = ValueError,
 ) -> None:
-    raise err_cls(f"Validation failed for step={step_idx}, "
-                  f"batch_index={batch_index}, "
-                  f"workload_index={request_params.workload_index}, "
-                  f"req_params={request_params}. Reason: {msg_suffix}")
+    raise err_cls(
+        f"Validation failed for step={step_idx}, "
+        f"batch_index={batch_index}, "
+        f"workload_index={request_params.workload_index}, "
+        f"req_params={request_params}. Reason: {msg_suffix}"
+    )
 
 
 def _logit_bias_params(kwargs: dict) -> None:
@@ -208,8 +218,7 @@ def _logit_bias_validate(
 ) -> None:
     """Validate logit bias logitproc applied correctly"""
     logit_bias = request_params.params.logit_bias
-    logits_old = (
-        test_fakes.logits[persistent_batch[batch_index].workload_index].cpu())
+    logits_old = test_fakes.logits[persistent_batch[batch_index].workload_index].cpu()
     logits_new = logits_new[batch_index].cpu()
     for token_id in range(VOCAB_SIZE):
         logit_old_value = logits_old[token_id]
@@ -218,22 +227,28 @@ def _logit_bias_validate(
             bias_value = logit_bias[token_id]
             exp_value = bias_value + logit_old_value
             if logit_new_value != pytest.approx(exp_value):
-                _raise_error_invalid(msg_suffix=(
-                    f"Biased token {token_id} logit value {logit_new_value} "
-                    f"does not match expected value {exp_value} "
-                    f"given bias {bias_value}"),
-                                     batch_index=batch_index,
-                                     request_params=request_params,
-                                     step_idx=step_idx)
+                _raise_error_invalid(
+                    msg_suffix=(
+                        f"Biased token {token_id} logit value {logit_new_value} "
+                        f"does not match expected value {exp_value} "
+                        f"given bias {bias_value}"
+                    ),
+                    batch_index=batch_index,
+                    request_params=request_params,
+                    step_idx=step_idx,
+                )
 
         else:
             if logit_new_value != pytest.approx(logit_old_value):
-                _raise_error_invalid(msg_suffix=(
-                    f"Unbiased token {token_id} logit value {logit_new_value} "
-                    f"does not match expected value {logit_old_value}"),
-                                     batch_index=batch_index,
-                                     request_params=request_params,
-                                     step_idx=step_idx)
+                _raise_error_invalid(
+                    msg_suffix=(
+                        f"Unbiased token {token_id} logit value {logit_new_value} "
+                        f"does not match expected value {logit_old_value}"
+                    ),
+                    batch_index=batch_index,
+                    request_params=request_params,
+                    step_idx=step_idx,
+                )
 
 
 def _min_p_params(kwargs: dict) -> None:
@@ -259,26 +274,27 @@ def _min_p_validate(
                     msg_suffix="Invalid: dominant token 0 masked (-inf)",
                     batch_index=batch_index,
                     request_params=request_params,
-                    step_idx=step_idx)
+                    step_idx=step_idx,
+                )
         else:
             if request_params.params.min_p > 0.0:
                 # Non-dominant tokens should be masked when min_p > 0
                 if logits_for_token != -float("inf"):
                     _raise_error_invalid(
-                        msg_suffix=
-                        f"Invalid: non-dominant token {token_id} not masked",
+                        msg_suffix=f"Invalid: non-dominant token {token_id} not masked",
                         batch_index=batch_index,
                         request_params=request_params,
-                        step_idx=step_idx)
+                        step_idx=step_idx,
+                    )
             else:
                 # No masking when min_p is 0
                 if logits_for_token == -float("inf"):
                     _raise_error_invalid(
-                        msg_suffix=
-                        f"Invalid: token {token_id} masked when min_p=0.0",
+                        msg_suffix=f"Invalid: token {token_id} masked when min_p=0.0",
                         batch_index=batch_index,
                         request_params=request_params,
-                        step_idx=step_idx)
+                        step_idx=step_idx,
+                    )
 
 
 def _min_tokens_params(kwargs: dict) -> None:
@@ -303,7 +319,8 @@ def _min_tokens_validate(
     min_reached = ref_num_out_tokens >= MIN_TOKENS_LEN_THRESHOLD
     ref_all_stop_token_ids = request_params.params.all_stop_token_ids
     mt_lp: MinTokensLogitsProcessor = next(
-        test_fakes.get_logitsprocs_by_cls(MinTokensLogitsProcessor))
+        test_fakes.get_logitsprocs_by_cls(MinTokensLogitsProcessor)
+    )
     assert isinstance(mt_lp, MinTokensLogitsProcessor)
     min_tok = mt_lp.min_toks.get(batch_index, None)
 
@@ -312,38 +329,50 @@ def _min_tokens_validate(
         (_, out_tok, all_stop_token_ids) = min_tok
         num_out_tokens = len(out_tok)
         if num_out_tokens != ref_num_out_tokens:
-            _raise_error_invalid(msg_suffix=(
-                "Number of output tokens in min-token logit processor "
-                f"request metadata ({num_out_tokens}) does not match "
-                f"reference ({ref_num_out_tokens})."),
-                                 batch_index=batch_index,
-                                 request_params=request_params,
-                                 step_idx=step_idx)
+            _raise_error_invalid(
+                msg_suffix=(
+                    "Number of output tokens in min-token logit processor "
+                    f"request metadata ({num_out_tokens}) does not match "
+                    f"reference ({ref_num_out_tokens})."
+                ),
+                batch_index=batch_index,
+                request_params=request_params,
+                step_idx=step_idx,
+            )
         if ref_all_stop_token_ids != all_stop_token_ids:
-            _raise_error_invalid(msg_suffix=(
-                "Stop token ids do not match reference; all_stop_token_ids: "
-                f"{sorted(all_stop_token_ids)}, ref_all_stop_token_ids: "
-                f"{sorted(ref_all_stop_token_ids)}"),
-                                 batch_index=batch_index,
-                                 request_params=request_params,
-                                 step_idx=step_idx)
+            _raise_error_invalid(
+                msg_suffix=(
+                    "Stop token ids do not match reference; all_stop_token_ids: "
+                    f"{sorted(all_stop_token_ids)}, ref_all_stop_token_ids: "
+                    f"{sorted(ref_all_stop_token_ids)}"
+                ),
+                batch_index=batch_index,
+                request_params=request_params,
+                step_idx=step_idx,
+            )
         if min_reached:
-            _raise_error_invalid(msg_suffix=(
-                "Expected min-tokens request with min reached, but batch "
-                "index is recognized by min-tokens logits processor."),
-                                 batch_index=batch_index,
-                                 request_params=request_params,
-                                 step_idx=step_idx,
-                                 err_cls=RuntimeError)
+            _raise_error_invalid(
+                msg_suffix=(
+                    "Expected min-tokens request with min reached, but batch "
+                    "index is recognized by min-tokens logits processor."
+                ),
+                batch_index=batch_index,
+                request_params=request_params,
+                step_idx=step_idx,
+                err_cls=RuntimeError,
+            )
 
     elif not min_reached:
-        _raise_error_invalid(msg_suffix=(
-            "Expected min-tokens request with min not reached, but batch "
-            "index is not recognized by min-tokens logits processor."),
-                             batch_index=batch_index,
-                             request_params=request_params,
-                             step_idx=step_idx,
-                             err_cls=RuntimeError)
+        _raise_error_invalid(
+            msg_suffix=(
+                "Expected min-tokens request with min not reached, but batch "
+                "index is not recognized by min-tokens logits processor."
+            ),
+            batch_index=batch_index,
+            request_params=request_params,
+            step_idx=step_idx,
+            err_cls=RuntimeError,
+        )
 
     # Validate min-token logits
     for token_id in range(VOCAB_SIZE):
@@ -351,21 +380,27 @@ def _min_tokens_validate(
         if token_id in ref_all_stop_token_ids and not min_reached:
             if logits_for_token != -float("inf"):
                 _raise_error_invalid(
-                    msg_suffix=(f"Token {token_id} is a stop token and "
-                                "the sequence has not reached min length, "
-                                "but the token is not masked "
-                                f"(logit={logits_for_token})"),
+                    msg_suffix=(
+                        f"Token {token_id} is a stop token and "
+                        "the sequence has not reached min length, "
+                        "but the token is not masked "
+                        f"(logit={logits_for_token})"
+                    ),
                     batch_index=batch_index,
                     request_params=request_params,
-                    step_idx=step_idx)
+                    step_idx=step_idx,
+                )
         else:
             if logits_for_token == -float("inf"):
                 _raise_error_invalid(
-                    msg_suffix=(f"Token {token_id} should not be masked but "
-                                f"is (output len={ref_num_out_tokens})"),
+                    msg_suffix=(
+                        f"Token {token_id} should not be masked but "
+                        f"is (output len={ref_num_out_tokens})"
+                    ),
                     batch_index=batch_index,
                     request_params=request_params,
-                    step_idx=step_idx)
+                    step_idx=step_idx,
+                )
 
 
 def _none_validate(
@@ -377,52 +412,58 @@ def _none_validate(
     step_idx: int,
 ) -> None:
     """Validate that no logits processors are applied"""
-    logits = (
-        test_fakes.logits[persistent_batch[batch_index].workload_index].cpu())
+    logits = test_fakes.logits[persistent_batch[batch_index].workload_index].cpu()
     ref_logits = logits_new[batch_index]
     if not torch.all(ref_logits == logits):
-        mismatch_toks = (ref_logits
-                         != logits).nonzero(as_tuple=True)[0].tolist()
+        mismatch_toks = (ref_logits != logits).nonzero(as_tuple=True)[0].tolist()
         mismatch_strs = []
         for token in mismatch_toks:
             val = float(logits[token])
             ref_val = float(ref_logits[token])
             mismatch_strs.append(f"({token=},{val=},{ref_val=})")
-        _raise_error_invalid(msg_suffix=(
-            f"Unexpected modification of logits: {','.join(mismatch_strs)}"),
-                             batch_index=batch_index,
-                             request_params=request_params,
-                             step_idx=step_idx)
+        _raise_error_invalid(
+            msg_suffix=(
+                f"Unexpected modification of logits: {','.join(mismatch_strs)}"
+            ),
+            batch_index=batch_index,
+            request_params=request_params,
+            step_idx=step_idx,
+        )
 
 
 class LogitsprocTestHelpers(NamedTuple):
     """Supports setting up and validating logitsprocs unit tests."""
+
     eval_fxn: Callable
-    gen_request_fxn: Optional[Callable] = None
+    gen_request_fxn: Callable | None = None
 
 
 logitsprocs_test_mapping = {
-    STR_NO_LOGITPROC:
-    LogitsprocTestHelpers(eval_fxn=_none_validate),
-    LogitBiasLogitsProcessor:
-    LogitsprocTestHelpers(gen_request_fxn=_logit_bias_params,
-                          eval_fxn=_logit_bias_validate),
-    MinPLogitsProcessor:
-    LogitsprocTestHelpers(gen_request_fxn=_min_p_params,
-                          eval_fxn=_min_p_validate),
-    MinTokensLogitsProcessor:
-    LogitsprocTestHelpers(gen_request_fxn=_min_tokens_params,
-                          eval_fxn=_min_tokens_validate),
+    STR_NO_LOGITPROC: LogitsprocTestHelpers(eval_fxn=_none_validate),
+    LogitBiasLogitsProcessor: LogitsprocTestHelpers(
+        gen_request_fxn=_logit_bias_params, eval_fxn=_logit_bias_validate
+    ),
+    MinPLogitsProcessor: LogitsprocTestHelpers(
+        gen_request_fxn=_min_p_params, eval_fxn=_min_p_validate
+    ),
+    MinTokensLogitsProcessor: LogitsprocTestHelpers(
+        gen_request_fxn=_min_tokens_params, eval_fxn=_min_tokens_validate
+    ),
 }
 
 
 def _get_test_cases() -> list[list[str]]:
     """Each test case is a set of logitsprocs"""
     logitsprocs_types = list(logitsprocs_test_mapping.keys())
-    return [[STR_NO_LOGITPROC]] + [[logitproc_type, STR_NO_LOGITPROC]
-                                   for logitproc_type in logitsprocs_types
-                                   if logitproc_type != STR_NO_LOGITPROC
-                                   ] + [logitsprocs_types]
+    return (
+        [[STR_NO_LOGITPROC]]
+        + [
+            [logitproc_type, STR_NO_LOGITPROC]
+            for logitproc_type in logitsprocs_types
+            if logitproc_type != STR_NO_LOGITPROC
+        ]
+        + [logitsprocs_types]
+    )
 
 
 def _generate_fake_step_update(
@@ -430,7 +471,7 @@ def _generate_fake_step_update(
     workload_params: list[LogitsProcsRequestParams],
     wdx: int,
     batch_update_builder: BatchUpdateBuilder,
-) -> tuple[Optional[BatchUpdate], int, int]:
+) -> tuple[BatchUpdate | None, int, int]:
     batch_size = len(persistent_batch)
     workload_size = len(workload_params)
     workload_reqs_remaining = workload_size - wdx
@@ -440,11 +481,18 @@ def _generate_fake_step_update(
     # Other 50%: add a limited number of reqs (less than the number
     # of workload reqs remaining, less than an arbitrary max)
     # If no workload reqs remain: 100% of steps have 0 adds
-    num_step_add = random.choice([
-        0,
-        random.randint(1, min(max_add_remove_per_step,
-                              workload_reqs_remaining))
-    ]) if workload_reqs_remaining else 0
+    num_step_add = (
+        random.choice(
+            [
+                0,
+                random.randint(
+                    1, min(max_add_remove_per_step, workload_reqs_remaining)
+                ),
+            ]
+        )
+        if workload_reqs_remaining
+        else 0
+    )
 
     # 50% of steps: remove no requests
     # Other 50%: remove a limited number of reqs (less than the number
@@ -452,9 +500,11 @@ def _generate_fake_step_update(
     # If persistent batch is empty: 100% of steps have 0 removals until
     # more requests are added. Assume that removed requests are always
     # drawn from the current batch, before new adds
-    num_step_remove = random.choice([
-        0, random.randint(1, min(max_add_remove_per_step, batch_size))
-    ]) if batch_size else 0
+    num_step_remove = (
+        random.choice([0, random.randint(1, min(max_add_remove_per_step, batch_size))])
+        if batch_size
+        else 0
+    )
 
     num_step_add_replace = min(num_step_add, num_step_remove)
 
@@ -463,23 +513,34 @@ def _generate_fake_step_update(
         batch_update_builder.removed_append(removal)
 
     # Get added requests from workload
-    for add_req_params in workload_params[wdx:(wdx + num_step_add_replace)]:
+    for add_req_params in workload_params[wdx : (wdx + num_step_add_replace)]:
         # Replace as many removed requests as possible with added requests
         add_remove_idx = batch_update_builder.pop_removed()
         batch_update_builder.added.append(
-            (add_remove_idx, add_req_params.params,
-             add_req_params.prompt_tokens, add_req_params.out_tokens))
+            (
+                add_remove_idx,
+                add_req_params.params,
+                add_req_params.prompt_tokens,
+                add_req_params.out_tokens,
+            )
+        )
         persistent_batch[add_remove_idx] = add_req_params
 
     # Append remaining added requests to end of batch
-    add_reqs_append = workload_params[(wdx +
-                                       num_step_add_replace):(wdx +
-                                                              num_step_add)]
-    batch_update_builder.added.extend([
-        (adx + batch_size, add_req_params.params, add_req_params.prompt_tokens,
-         add_req_params.out_tokens)
-        for adx, add_req_params in enumerate(add_reqs_append)
-    ])
+    add_reqs_append = workload_params[
+        (wdx + num_step_add_replace) : (wdx + num_step_add)
+    ]
+    batch_update_builder.added.extend(
+        [
+            (
+                adx + batch_size,
+                add_req_params.params,
+                add_req_params.prompt_tokens,
+                add_req_params.out_tokens,
+            )
+            for adx, add_req_params in enumerate(add_reqs_append)
+        ]
+    )
     persistent_batch.extend(add_reqs_append)
     pre_condense_batch_size = len(persistent_batch)
     wdx += num_step_add  # Update workload offset
@@ -488,8 +549,10 @@ def _generate_fake_step_update(
     last_nonempty_index = pre_condense_batch_size - 1
     condensed_to_idxs = set()
     while batch_update_builder.removed:
-        if (last_nonempty_index in batch_update_builder.removed
-                or last_nonempty_index in condensed_to_idxs):
+        if (
+            last_nonempty_index in batch_update_builder.removed
+            or last_nonempty_index in condensed_to_idxs
+        ):
             last_nonempty_index -= 1
             continue
         # last_nonempty_index is the highest persistent batch index that was
@@ -504,11 +567,10 @@ def _generate_fake_step_update(
         # move last_nonempty_index -> first_empty_index
         batch_update_builder.pop_removed()
         condensed_to_idxs.add(first_empty_index)
-        persistent_batch[first_empty_index] = persistent_batch[
-            last_nonempty_index]
+        persistent_batch[first_empty_index] = persistent_batch[last_nonempty_index]
         batch_update_builder.moved.append(
-            (last_nonempty_index, first_empty_index,
-             MoveDirectionality.UNIDIRECTIONAL))
+            (last_nonempty_index, first_empty_index, MoveDirectionality.UNIDIRECTIONAL)
+        )
 
         last_nonempty_index -= 1
 
@@ -519,23 +581,26 @@ def _generate_fake_step_update(
     persistent_batch[:] = persistent_batch[0:condensed_batch_size]
 
     if condensed_batch_size > 1:
-        # Simulate arbitrary reorder_batch() in the kernel backend
+        # Simulate arbitrary batch ordering in the kernel backend
         # Generate a random number k of non-overlapping swap tuples
         k = random.randint(0, condensed_batch_size // 2)
         idxs = list(range(condensed_batch_size))
         random.shuffle(idxs)
-        swaps = [
-            tuple(sorted([idxs[2 * i], idxs[2 * i + 1]])) for i in range(k)
-        ]
-        batch_update_builder.moved.extend([
-            (sw[0], sw[1], MoveDirectionality.SWAP) for sw in swaps
-        ])
+        swaps = [tuple(sorted([idxs[2 * i], idxs[2 * i + 1]])) for i in range(k)]
+        batch_update_builder.moved.extend(
+            [(sw[0], sw[1], MoveDirectionality.SWAP) for sw in swaps]
+        )
         for adx, bdx in swaps:
-            persistent_batch[adx], persistent_batch[bdx] = persistent_batch[
-                bdx], persistent_batch[adx]
-
-    return (batch_update_builder.get_and_reset(condensed_batch_size), wdx,
-            workload_size - wdx)
+            persistent_batch[adx], persistent_batch[bdx] = (
+                persistent_batch[bdx],
+                persistent_batch[adx],
+            )
+
+    return (
+        batch_update_builder.get_and_reset(condensed_batch_size),
+        wdx,
+        workload_size - wdx,
+    )
 
 
 def _assert_valid(
@@ -550,8 +615,10 @@ def _assert_valid(
         # Trivial case of empty persistent batch
         assert len(persistent_batch) == 0
         if logits_w_lp.shape[0] != 0:
-            raise ValueError("Fake persistent batch is empty but logitsprocs "
-                             f"output batch has shape {logits_w_lp.shape}")
+            raise ValueError(
+                "Fake persistent batch is empty but logitsprocs "
+                f"output batch has shape {logits_w_lp.shape}"
+            )
         return
 
     # Validate logits for each fake request
@@ -560,36 +627,40 @@ def _assert_valid(
         # Invoke the appropriate validation function for
         # the logitproc employed by this request
         fxn = logitsprocs_test_mapping[request_params.logitproc_type].eval_fxn
-        fxn(test_fakes=test_fakes,
+        fxn(
+            test_fakes=test_fakes,
             persistent_batch=persistent_batch,
             logits_new=logits_w_lp,
             batch_index=batch_index,
             request_params=request_params,
-            step_idx=step_idx)
+            step_idx=step_idx,
+        )
 
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC])
 @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases())
-def test_logitsprocs(device: str, reqs_per_logitproc: int,
-                     logitsprocs_under_test: list[str]):
+def test_logitsprocs(
+    device: str, reqs_per_logitproc: int, logitsprocs_under_test: list[str]
+):
     random.seed(40)
     torch.set_default_device(device)
 
     # Define a shuffled batch of requests which individually use a different
     # logitproc, or no logitproc at all
     workload_params = _generate_mixed_logitsprocs_batch_params(
-        reqs_per_logitproc=reqs_per_logitproc,
-        logitsprocs_types=logitsprocs_under_test)
+        reqs_per_logitproc=reqs_per_logitproc, logitsprocs_types=logitsprocs_under_test
+    )
     workload_size = len(workload_params)
 
     # Create fake test data structures for testing.
     test_fakes = _generate_test_fakes(workload_size, device)
 
     wdx = 0  # Next request index in workload to add
-    persistent_batch: list[LogitsProcsRequestParams] = [
-    ]  # Persistent batch state, as list of workload indices
+    persistent_batch: list[
+        LogitsProcsRequestParams
+    ] = []  # Persistent batch state, as list of workload indices
 
     # Generate fake removed request indices from current persistent
     # batch before adds
diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index 891f55a14633bdbaefa13fb87a9e7e7bdf379572..1899737737f4b9f4b8a56428ce990d6547a485e6 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -2,37 +2,46 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 import sys
-from typing import Union
+from typing import Any
 
 import pytest
 
 from tests.utils import create_new_process_for_each_test
-# yapf: disable
-from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
-                                              DUMMY_LOGITPROC_FQCN,
-                                              DUMMY_LOGITPROC_MODULE,
-                                              MAX_TOKENS, MODEL_NAME,
-                                              POOLING_MODEL_NAME, TEMP_GREEDY,
-                                              CustomLogitprocSource,
-                                              DummyLogitsProcessor,
-                                              WrappedPerReqLogitsProcessor,
-                                              dummy_module)
+from tests.v1.logits_processors.utils import (
+    DUMMY_LOGITPROC_ARG,
+    DUMMY_LOGITPROC_FQCN,
+    DUMMY_LOGITPROC_MODULE,
+    MAX_TOKENS,
+    MODEL_NAME,
+    POOLING_MODEL_NAME,
+    TEMP_GREEDY,
+    CustomLogitprocSource,
+    DummyLogitsProcessor,
+    WrappedPerReqLogitsProcessor,
+    dummy_module,
+    prompts,
+)
 from tests.v1.logits_processors.utils import entry_points as fake_entry_points
-from tests.v1.logits_processors.utils import prompts
-# yapf: enable
 from vllm import LLM, SamplingParams
-from vllm.v1.sample.logits_processor import (STR_POOLING_REJECTS_LOGITSPROCS,
-                                             LogitsProcessor)
+from vllm.v1.sample.logits_processor import (
+    STR_POOLING_REJECTS_LOGITSPROCS,
+    STR_SPEC_DEC_REJECTS_LOGITSPROCS,
+    LogitsProcessor,
+)
 
 # Create a mixture of requests which do and don't utilize the dummy logitproc
 sampling_params_list = [
-    SamplingParams(temperature=TEMP_GREEDY,
-                   max_tokens=MAX_TOKENS,
-                   extra_args={DUMMY_LOGITPROC_ARG: 128}),
+    SamplingParams(
+        temperature=TEMP_GREEDY,
+        max_tokens=MAX_TOKENS,
+        extra_args={DUMMY_LOGITPROC_ARG: 128},
+    ),
     SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS),
-    SamplingParams(temperature=TEMP_GREEDY,
-                   max_tokens=MAX_TOKENS,
-                   extra_args={DUMMY_LOGITPROC_ARG: 67}),
+    SamplingParams(
+        temperature=TEMP_GREEDY,
+        max_tokens=MAX_TOKENS,
+        extra_args={DUMMY_LOGITPROC_ARG: 67},
+    ),
     SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS),
 ]
 
@@ -49,7 +58,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
     2. Server has *not* loaded dummy logitproc; test that all requests
        behave as if logitproc is *not* operating (output matches reference
        `LLM` output.)
-    
+
     Args:
       kwargs: `LLM` constructor kwargs
       logitproc_loaded: server has loaded dummy logitproc if True
@@ -73,7 +82,8 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
 
     # Validate outputs
     for bdx, (out_lp, out_ref, params) in enumerate(
-            zip(outputs_logitproc, outputs_ref, sampling_params_list)):
+        zip(outputs_logitproc, outputs_ref, sampling_params_list)
+    ):
         lp_toks = out_lp.outputs[0].token_ids
         if logitproc_loaded and params.extra_args:
             # This request exercises custom logitproc; validate that logitproc
@@ -81,8 +91,8 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
             target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
             if not all(x == target_token for x in lp_toks):
                 raise AssertionError(
-                    f"Request {bdx} generated {lp_toks}, should all be "
-                    f"{target_token}")
+                    f"Request {bdx} generated {lp_toks}, should all be {target_token}"
+                )
         else:
             # This request does not exercise custom logitproc (or custom
             # logitproc is not enabled on this server); validate against
@@ -90,16 +100,15 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
             ref_toks = out_ref.outputs[0].token_ids
             if lp_toks != ref_toks:
                 raise AssertionError(
-                    f"Request {bdx} generated {lp_toks}, should match "
-                    f"{ref_toks}")
+                    f"Request {bdx} generated {lp_toks}, should match {ref_toks}"
+                )
 
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("logitproc_source", list(CustomLogitprocSource))
-def test_custom_logitsprocs(monkeypatch,
-                            logitproc_source: CustomLogitprocSource):
+def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource):
     """Test offline Python interface for passing custom logitsprocs
-    
+
     Construct an `LLM` instance which loads a custom logitproc that has a
     well-defined behavior (mask out all tokens except one `target_token`)
 
@@ -118,7 +127,7 @@ def test_custom_logitsprocs(monkeypatch,
       instance output
     * Logitproc passed in via {entrypoint, class object, fully-qualified class
       name (FQCN)} - test that dummy logitproc is utilized correctly when
-      provided via any of these three possible sources 
+      provided via any of these three possible sources
 
     Args:
       monkeypatch: for setting env vars
@@ -142,6 +151,7 @@ def test_custom_logitsprocs(monkeypatch,
         # Scenario: vLLM loads a logitproc from a preconfigured entrypoint
         # To that end, mock a dummy logitproc entrypoint
         import importlib.metadata
+
         importlib.metadata.entry_points = fake_entry_points  # type: ignore
 
         # fork is required for workers to see entrypoint patch
@@ -149,7 +159,7 @@ def test_custom_logitsprocs(monkeypatch,
         _run_test({}, logitproc_loaded=True)
         return
 
-    kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
+    kwargs: dict[str, list[str | type[LogitsProcessor]]] = {}
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
         # Scenario: load logitproc based on fully-qualified class name (FQCN)
         # Inject dummy module which defines logitproc
@@ -165,7 +175,7 @@ def test_custom_logitsprocs(monkeypatch,
 @create_new_process_for_each_test()
 def test_custom_logitsprocs_req(monkeypatch):
     """Test passing request-level logits processor to offline Python interface
-    
+
     Wrap a request-level logits processor to create a batch level logits
     processor that has a well-defined behavior (mask out all tokens except one
     `target_token`)
@@ -190,20 +200,27 @@ def test_custom_logitsprocs_req(monkeypatch):
     # Test that logitproc info is passed to workers
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
     random.seed(40)
-    _run_test({"logits_processors": [WrappedPerReqLogitsProcessor]},
-              logitproc_loaded=True)
+    _run_test(
+        {"logits_processors": [WrappedPerReqLogitsProcessor]}, logitproc_loaded=True
+    )
 
 
 @create_new_process_for_each_test()
-@pytest.mark.parametrize("logitproc_source", [
-    CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT,
-    CustomLogitprocSource.LOGITPROC_SOURCE_FQCN,
-    CustomLogitprocSource.LOGITPROC_SOURCE_CLASS,
-])
-def test_pooling_rejects_custom_logitsprocs(
-        monkeypatch, logitproc_source: CustomLogitprocSource):
+@pytest.mark.parametrize("model_scenario", ["pooling", "spec_dec"])
+@pytest.mark.parametrize(
+    "logitproc_source",
+    [
+        CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT,
+        CustomLogitprocSource.LOGITPROC_SOURCE_FQCN,
+        CustomLogitprocSource.LOGITPROC_SOURCE_CLASS,
+    ],
+)
+def test_rejects_custom_logitsprocs(
+    monkeypatch, model_scenario: str, logitproc_source: CustomLogitprocSource
+):
     """Validate that vLLM engine initialization properly rejects custom
-    logitsprocs when the model is a pooling model.
+    logitsprocs when the model is a pooling model or speculative decoding
+    enabled.
 
     Use `LLM` entrypoint. We expect `LLM` initialization to fail before the
     logitproc is actually loaded.
@@ -227,44 +244,57 @@ def test_pooling_rejects_custom_logitsprocs(
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     random.seed(40)
 
+    test_params: dict[str, dict[str, Any]] = {
+        "pooling": {
+            "runner": "pooling",
+            "model": POOLING_MODEL_NAME,
+            "error_message": STR_POOLING_REJECTS_LOGITSPROCS,
+            "speculative_config": None,
+        },
+        "spec_dec": {
+            "runner": "auto",
+            "model": MODEL_NAME,
+            "error_message": STR_SPEC_DEC_REJECTS_LOGITSPROCS,
+            "speculative_config": {"model": "ngram", "num_speculative_tokens": 1},
+        },
+    }
+
+    config = test_params[model_scenario]
+
+    llm_kwargs: dict[str, Any] = {
+        "runner": config["runner"],
+        "model": config["model"],
+        "gpu_memory_utilization": 0.1,
+        "speculative_config": config["speculative_config"],
+    }
+
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT:
-        # Scenario: vLLM loads a pooling model and ignores a logitproc that is
+        # Scenario: vLLM loads a model and ignores a logitproc that is
         # available at a preconfigured entrypoint
 
         # Patch in dummy logitproc entrypoint
         import importlib.metadata
+
         importlib.metadata.entry_points = fake_entry_points  # type: ignore
 
         # fork is required for entrypoint patch to be visible to workers,
         # although they should ignore the entrypoint patch anyway
         monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
 
-        llm = LLM(
-            runner="pooling",
-            model=POOLING_MODEL_NAME,
-            gpu_memory_utilization=0.1,
-        )
+        llm = LLM(**llm_kwargs)
         # Require that no logitsprocs have been loaded
-        assert sum([
-            1 for _ in llm.llm_engine.model_executor.driver_worker.worker.
-            model_runner.input_batch.logitsprocs.all
-        ]) == 0
+        worker = llm.llm_engine.model_executor.driver_worker.worker
+        assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
         return
 
-    kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
         # Scenario: load logitproc based on fully-qualified class name (FQCN)
-        kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
+        llm_kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
     elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
         # Scenario: load logitproc from provided class object
-        kwargs["logits_processors"] = [DummyLogitsProcessor]
+        llm_kwargs["logits_processors"] = [DummyLogitsProcessor]
 
-    with pytest.raises(ValueError, match=STR_POOLING_REJECTS_LOGITSPROCS):
-        # Require that loading a pooling model alongside the logitproc raises
+    with pytest.raises(ValueError, match=config["error_message"]):
+        # Require that loading a model alongside the logitproc raises
         # the appropriate exception.
-        LLM(
-            runner="pooling",
-            model=POOLING_MODEL_NAME,
-            gpu_memory_utilization=0.1,
-            **kwargs,
-        )
+        LLM(**llm_kwargs)
diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py
index a01a479e5b24813c9c74af749f082ed8f869f6bd..3e0bb02ed68bed2a00cc5759449920e8ed64ad86 100644
--- a/tests/v1/logits_processors/test_custom_online.py
+++ b/tests/v1/logits_processors/test_custom_online.py
@@ -4,28 +4,28 @@
 import os
 import random
 import sys
-from typing import Any, Optional
+from typing import Any
 
 import openai
 import pytest
 import pytest_asyncio
 
-from tests.utils import (RemoteOpenAIServerCustom,
-                         create_new_process_for_each_test)
-# yapf: disable
-from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
-                                              DUMMY_LOGITPROC_FQCN,
-                                              DUMMY_LOGITPROC_MODULE,
-                                              MAX_TOKENS, MODEL_NAME,
-                                              TEMP_GREEDY, dummy_module)
+from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test
+from tests.v1.logits_processors.utils import (
+    DUMMY_LOGITPROC_ARG,
+    DUMMY_LOGITPROC_FQCN,
+    DUMMY_LOGITPROC_MODULE,
+    MAX_TOKENS,
+    MODEL_NAME,
+    TEMP_GREEDY,
+    dummy_module,
+    prompts,
+)
 from tests.v1.logits_processors.utils import entry_points as fake_entry_points
-from tests.v1.logits_processors.utils import prompts
-
-# yapf: enable
 
 
 def _server_with_logitproc_entrypoint(
-    env_dict: Optional[dict[str, str]],
+    env_dict: dict[str, str] | None,
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
@@ -33,11 +33,12 @@ def _server_with_logitproc_entrypoint(
 
     # Patch `entry_points` to inject logitproc entrypoint
     import importlib.metadata
+
     importlib.metadata.entry_points = fake_entry_points  # type: ignore
     from vllm.entrypoints.cli import main
 
     # fork is required for workers to see entrypoint patch
-    os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork"
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
     if env_dict is not None:
         os.environ.update(env_dict)
 
@@ -47,7 +48,7 @@ def _server_with_logitproc_entrypoint(
 
 
 def _server_with_logitproc_module(
-    env_dict: Optional[dict[str, str]],
+    env_dict: dict[str, str] | None,
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
@@ -55,10 +56,11 @@ def _server_with_logitproc_module(
 
     # Patch `modules` to inject dummy logitproc module
     from vllm.entrypoints.cli import main
+
     sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
 
     # fork is required for workers to see entrypoint patch
-    os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork"
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
     if env_dict is not None:
         os.environ.update(env_dict)
 
@@ -80,8 +82,9 @@ def default_server_args():
     ]
 
 
-@pytest.fixture(scope="function",
-                params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]])
+@pytest.fixture(
+    scope="function", params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]]
+)
 def server(default_server_args, request, monkeypatch):
     """Consider two server configurations:
     (1) --logits-processors cli arg specifies dummy logits processor via fully-
@@ -102,8 +105,7 @@ def server(default_server_args, request, monkeypatch):
         args = default_server_args
         _server_fxn = _server_with_logitproc_entrypoint
 
-    with RemoteOpenAIServerCustom(MODEL_NAME, args,
-                                  _server_fxn) as remote_server:
+    with RemoteOpenAIServerCustom(MODEL_NAME, args, _server_fxn) as remote_server:
         yield remote_server
 
 
@@ -133,7 +135,7 @@ api_keyword_args = {
 )
 async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str):
     """Test custom logitsprocs when starting OpenAI server from CLI
-    
+
     Launch vLLM OpenAI-compatible server, configured to load a custom logitproc
     that has a well-defined behavior (mask out all tokens except one
     `target_token`).
@@ -157,9 +159,7 @@ async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str):
             # For requests which activate the dummy logitproc, choose one of
             # two `target_token` values which are known not to be EOS tokens
             request_keyword_args["extra_body"] = {
-                "vllm_xargs": {
-                    DUMMY_LOGITPROC_ARG: target_token
-                }
+                "vllm_xargs": {DUMMY_LOGITPROC_ARG: target_token}
             }
         batch = await client.completions.create(
             model=model_name,
@@ -173,8 +173,36 @@ async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str):
             choices: openai.types.CompletionChoice = batch.choices
             toks = choices[0].logprobs.tokens
             if not all([x == toks[0] for x in toks]):
-                raise AssertionError(
-                    f"Generated {toks} should all be {toks[0]}")
+                raise AssertionError(f"Generated {toks} should all be {toks[0]}")
 
         # Alternate whether to activate dummy logitproc for each request
         use_dummy_logitproc = not use_dummy_logitproc
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_invalid_custom_logitsproc_arg(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    """Test that request with invalid custom logitsproc is rejected"""
+
+    prompt = "Hello, my name is"
+    # Pass invalid (non-int) target_token value to dummy logits processor
+    request_keyword_args: dict[str, Any] = {
+        **api_keyword_args,
+        "extra_body": {
+            "vllm_xargs": {DUMMY_LOGITPROC_ARG: "invalid_target_token_value"}
+        },
+    }
+
+    with pytest.raises(openai.OpenAIError) as exc_info:
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            **request_keyword_args,
+        )
+
+    assert "is not int" in str(exc_info.value)
diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py
index d3b7f314da099395e2b32c4639c410354db2cd71..b8548bc319554dc2d1e60007718593d5ff767f96 100644
--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -3,17 +3,20 @@
 
 import types
 from enum import Enum, auto
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP,
-                                             AdapterLogitsProcessor,
-                                             BatchUpdate, LogitsProcessor,
-                                             RequestLogitsProcessor)
+from vllm.v1.sample.logits_processor import (
+    LOGITSPROCS_GROUP,
+    AdapterLogitsProcessor,
+    BatchUpdate,
+    LogitsProcessor,
+    RequestLogitsProcessor,
+)
 from vllm.v1.sample.logits_processor.builtin import process_dict_updates
 
 logger = init_logger(__name__)
@@ -30,6 +33,7 @@ DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor"
 
 class CustomLogitprocSource(Enum):
     """How to source a logitproc for testing purposes"""
+
     LOGITPROC_SOURCE_NONE = auto()  # No custom logitproc
     LOGITPROC_SOURCE_ENTRYPOINT = auto()  # Via entrypoint
     LOGITPROC_SOURCE_FQCN = auto()  # Via fully-qualified class name (FQCN)
@@ -48,20 +52,34 @@ prompts = [
 class DummyLogitsProcessor(LogitsProcessor):
     """Fake logit processor to support unit testing and examples"""
 
-    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
-                 is_pin_memory: bool):
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token: int | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(
+                f"target_token value {target_token} {type(target_token)} is not int"
+            )
+
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ):
         self.req_info: dict[int, int] = {}
 
     def is_argmax_invariant(self) -> bool:
         """Never impacts greedy sampling"""
         return False
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
+        def extract_extra_arg(params: SamplingParams) -> int | None:
+            self.validate_params(params)
+            return params.extra_args and params.extra_args.get("target_token")
+
         process_dict_updates(
             self.req_info,
             batch_update,
-            lambda params, _, __: params.extra_args and
-            (params.extra_args.get("target_token")),
+            lambda params, _, __: extract_extra_arg(params),
         )
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
@@ -69,16 +87,16 @@ class DummyLogitsProcessor(LogitsProcessor):
             return logits
 
         # Save target values before modification
-        cols = torch.tensor(list(self.req_info.values()),
-                            dtype=torch.long,
-                            device=logits.device)
-        rows = torch.tensor(list(self.req_info.keys()),
-                            dtype=torch.long,
-                            device=logits.device)
+        cols = torch.tensor(
+            list(self.req_info.values()), dtype=torch.long, device=logits.device
+        )
+        rows = torch.tensor(
+            list(self.req_info.keys()), dtype=torch.long, device=logits.device
+        )
         values_to_keep = logits[rows, cols].clone()
 
         # Mask all but target tokens
-        logits[rows] = float('-inf')
+        logits[rows] = float("-inf")
         logits[rows, cols] = values_to_keep
 
         return logits
@@ -140,7 +158,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """This method returns a new request-level logits processor, customized
         to the `target_token` value associated with a particular request.
 
@@ -154,14 +172,17 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
         Returns:
           `Callable` request logits processor, or None
         """
-        target_token: Optional[
-            Any] = params.extra_args and params.extra_args.get("target_token")
+        target_token: Any | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
         if target_token is None:
             return None
         if not isinstance(target_token, int):
             logger.warning(
                 "target_token value %s is not int; not applying logits"
-                " processor to request.", target_token)
+                " processor to request.",
+                target_token,
+            )
             return None
         return DummyPerReqLogitsProcessor(target_token)
 
diff --git a/tests/v1/metrics/test_engine_logger_apis.py b/tests/v1/metrics/test_engine_logger_apis.py
index e6a4d0a2a2e8bdbcd408735911d680c0cda62cfd..2e243c23cbf9a311255b02acebeee4de80e65c92 100644
--- a/tests/v1/metrics/test_engine_logger_apis.py
+++ b/tests/v1/metrics/test_engine_logger_apis.py
@@ -4,33 +4,13 @@ import copy
 
 import pytest
 
+from tests.plugins.vllm_add_dummy_stat_logger.dummy_stat_logger.dummy_stat_logger import (  # noqa E501
+    DummyStatLogger,
+)
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
 
 
-class DummyStatLogger:
-    """
-    A dummy stat logger for testing purposes.
-    Implements the minimal interface expected by StatLoggerManager.
-    """
-
-    def __init__(self, vllm_config, engine_idx):
-        self.vllm_config = vllm_config
-        self.engine_idx = engine_idx
-        self.recorded = []
-        self.logged = False
-        self.engine_initialized = False
-
-    def record(self, scheduler_stats, iteration_stats, engine_idx):
-        self.recorded.append((scheduler_stats, iteration_stats, engine_idx))
-
-    def log(self):
-        self.logged = True
-
-    def log_engine_initialized(self):
-        self.engine_initialized = True
-
-
 @pytest.fixture
 def log_stats_enabled_engine_args():
     """
@@ -46,23 +26,22 @@ def log_stats_enabled_engine_args():
 
 
 @pytest.mark.asyncio
-async def test_async_llm_replace_default_loggers(
-        log_stats_enabled_engine_args):
+async def test_async_llm_replace_default_loggers(log_stats_enabled_engine_args):
     """
     RayPrometheusStatLogger should replace the default PrometheusStatLogger
     """
 
-    engine = AsyncLLM.from_engine_args(log_stats_enabled_engine_args,
-                                       stat_loggers=[RayPrometheusStatLogger])
-    assert isinstance(engine.logger_manager.prometheus_logger,
-                      RayPrometheusStatLogger)
+    engine = AsyncLLM.from_engine_args(
+        log_stats_enabled_engine_args, stat_loggers=[RayPrometheusStatLogger]
+    )
+    assert isinstance(engine.logger_manager.stat_loggers[0], RayPrometheusStatLogger)
     engine.shutdown()
 
 
 @pytest.mark.asyncio
 async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
     """
-    It's still possible to use custom stat loggers exclusively by passing 
+    It's still possible to use custom stat loggers exclusively by passing
     disable_log_stats=True in addition to a list of custom stat loggers.
     """
     # Create engine_args with disable_log_stats=True for this test
@@ -70,12 +49,16 @@ async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
     disabled_log_engine_args.disable_log_stats = True
 
     # Disable default loggers; pass custom stat logger to the constructor
-    engine = AsyncLLM.from_engine_args(disabled_log_engine_args,
-                                       stat_loggers=[DummyStatLogger])
+    engine = AsyncLLM.from_engine_args(
+        disabled_log_engine_args, stat_loggers=[DummyStatLogger]
+    )
 
-    assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1
-    assert isinstance(engine.logger_manager.per_engine_logger_dict[0][0],
-                      DummyStatLogger)
+    assert len(engine.logger_manager.stat_loggers) == 2
+    assert len(engine.logger_manager.stat_loggers[0].per_engine_stat_loggers) == 1
+    assert isinstance(
+        engine.logger_manager.stat_loggers[0].per_engine_stat_loggers[0],
+        DummyStatLogger,
+    )
 
     # log_stats is still True, since custom stat loggers are used
     assert engine.log_stats
diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/metrics/test_metrics_reader.py
similarity index 78%
rename from tests/v1/test_metrics_reader.py
rename to tests/v1/metrics/test_metrics_reader.py
index c05de5e4cb645494c022508e7d55a87b3719358a..1c90e6d335274d8808682fd6c13d1bfbce29670b 100644
--- a/tests/v1/test_metrics_reader.py
+++ b/tests/v1/metrics/test_metrics_reader.py
@@ -4,8 +4,15 @@
 import prometheus_client
 import pytest
 
-from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector,
-                                    get_metrics_snapshot)
+from vllm.v1.metrics.reader import (
+    Counter,
+    Gauge,
+    Histogram,
+    Vector,
+    get_metrics_snapshot,
+)
+
+pytestmark = pytest.mark.cpu_test
 
 
 @pytest.fixture(autouse=True)
@@ -18,10 +25,12 @@ def test_registry(monkeypatch):
 
 @pytest.mark.parametrize("num_engines", [1, 4])
 def test_gauge_metric(test_registry, num_engines):
-    g = prometheus_client.Gauge("vllm:test_gauge",
-                                "Test gauge metric",
-                                labelnames=["model", "engine_index"],
-                                registry=test_registry)
+    g = prometheus_client.Gauge(
+        "vllm:test_gauge",
+        "Test gauge metric",
+        labelnames=["model", "engine_index"],
+        registry=test_registry,
+    )
     for i in range(num_engines):
         g.labels(model="foo", engine_index=str(i)).set(98.5)
 
@@ -39,10 +48,12 @@ def test_gauge_metric(test_registry, num_engines):
 
 @pytest.mark.parametrize("num_engines", [1, 4])
 def test_counter_metric(test_registry, num_engines):
-    c = prometheus_client.Counter("vllm:test_counter",
-                                  "Test counter metric",
-                                  labelnames=["model", "engine_index"],
-                                  registry=test_registry)
+    c = prometheus_client.Counter(
+        "vllm:test_counter",
+        "Test counter metric",
+        labelnames=["model", "engine_index"],
+        registry=test_registry,
+    )
     for i in range(num_engines):
         c.labels(model="bar", engine_index=str(i)).inc(19)
 
@@ -60,11 +71,13 @@ def test_counter_metric(test_registry, num_engines):
 
 @pytest.mark.parametrize("num_engines", [1, 4])
 def test_histogram_metric(test_registry, num_engines):
-    h = prometheus_client.Histogram("vllm:test_histogram",
-                                    "Test histogram metric",
-                                    labelnames=["model", "engine_index"],
-                                    buckets=[10, 20, 30, 40, 50],
-                                    registry=test_registry)
+    h = prometheus_client.Histogram(
+        "vllm:test_histogram",
+        "Test histogram metric",
+        labelnames=["model", "engine_index"],
+        buckets=[10, 20, 30, 40, 50],
+        registry=test_registry,
+    )
     for i in range(num_engines):
         hist = h.labels(model="blaa", engine_index=str(i))
         hist.observe(42)
@@ -95,7 +108,8 @@ def test_vector_metric(test_registry, num_engines):
         "vllm:spec_decode_num_accepted_tokens_per_pos",
         "Vector-like counter metric",
         labelnames=["position", "model", "engine_index"],
-        registry=test_registry)
+        registry=test_registry,
+    )
     for i in range(num_engines):
         c.labels(position="0", model="llama", engine_index=str(i)).inc(10)
         c.labels(position="1", model="llama", engine_index=str(i)).inc(5)
diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py
index 0c9f83f049e4d80791a324c0da363432bb581dec..f08d9f684921d67b474a4428c965eecb92e5cad0 100644
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
@@ -1,24 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 
 import pytest
 import ray
 
-from vllm.config import ModelDType
+from vllm.config.model import ModelDType
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
-from vllm.v1.metrics.ray_wrappers import (RayPrometheusMetric,
-                                          RayPrometheusStatLogger)
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    """
-    The change relies on V1 APIs, so set VLLM_USE_V1=1.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '1')
-
+from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
 
 MODELS = [
     "distilbert/distilgpt2",
@@ -34,24 +23,19 @@ def test_engine_log_metrics_ray(
     dtype: ModelDType,
     max_tokens: int,
 ) -> None:
-    """ Simple smoke test, verifying this can be used without exceptions.
+    """Simple smoke test, verifying this can be used without exceptions.
     Need to start a Ray cluster in order to verify outputs."""
 
     @ray.remote(num_gpus=1)
     class EngineTestActor:
-
         async def run(self):
-            # Set environment variable inside the Ray actor since environment
-            # variables from pytest fixtures don't propagate to Ray actors
-            os.environ['VLLM_USE_V1'] = '1'
-
-            engine_args = AsyncEngineArgs(model=model,
-                                          dtype=dtype,
-                                          disable_log_stats=False,
-                                          enforce_eager=True)
+            engine_args = AsyncEngineArgs(
+                model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
+            )
 
             engine = AsyncLLM.from_engine_args(
-                engine_args, stat_loggers=[RayPrometheusStatLogger])
+                engine_args, stat_loggers=[RayPrometheusStatLogger]
+            )
 
             for i, prompt in enumerate(example_prompts):
                 results = engine.generate(
@@ -73,32 +57,40 @@ def test_sanitized_opentelemetry_name():
 
     # Only a-z, A-Z, 0-9, _, test valid characters are preserved
     valid_name = "valid_metric_123_abcDEF"
-    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
-        valid_name) == valid_name
+    assert (
+        RayPrometheusMetric._get_sanitized_opentelemetry_name(valid_name) == valid_name
+    )
 
     # Test dash, dot, are replaced
     name_with_dash_dot = "metric-name.test"
     expected = "metric_name_test"
-    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
-        name_with_dash_dot) == expected
+    assert (
+        RayPrometheusMetric._get_sanitized_opentelemetry_name(name_with_dash_dot)
+        == expected
+    )
 
     # Test colon is replaced with underscore
     name_with_colon = "metric:name"
     expected = "metric_name"
-    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
-        name_with_colon) == expected
+    assert (
+        RayPrometheusMetric._get_sanitized_opentelemetry_name(name_with_colon)
+        == expected
+    )
 
     # Test multiple invalid characters are replaced
     name_with_invalid = "metric:name@with#special%chars"
     expected = "metric_name_with_special_chars"
-    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
-        name_with_invalid) == expected
+    assert (
+        RayPrometheusMetric._get_sanitized_opentelemetry_name(name_with_invalid)
+        == expected
+    )
 
     # Test mixed valid and invalid characters
     complex_name = "vllm:engine_stats/time.latency_ms-99p"
     expected = "vllm_engine_stats_time_latency_ms_99p"
-    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
-        complex_name) == expected
+    assert (
+        RayPrometheusMetric._get_sanitized_opentelemetry_name(complex_name) == expected
+    )
 
     # Test empty string
     assert RayPrometheusMetric._get_sanitized_opentelemetry_name("") == ""
diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..48067def8357e8a77527c408892e07374412fdf6
--- /dev/null
+++ b/tests/v1/metrics/test_stats.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.metrics.stats import IterationStats
+
+
+def test_iteration_stats_repr():
+    iteration_stats = IterationStats()
+    assert repr(iteration_stats).startswith("IterationStats(")
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 71aa9e3d379cfa877a03119b0d922393f693ef0f..42584938bc06ff8c9dbd8a3f2b38b0b2d71eaea6 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -2,18 +2,24 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
+import math
 from collections.abc import Generator
 from typing import get_args
 
 import pytest
 import torch
 
+from tests.utils import large_gpu_mark
 from tests.v1.sample.utils import (
-    BatchLogprobsComposition, BatchLogprobsSpecType,
+    BatchLogprobsComposition,
+    BatchLogprobsSpecType,
     assert_incr_detok_str_matches_non_incr_detok_str,
-    compute_correct_cumulative_logprob, get_test_batch)
+    compute_correct_cumulative_logprob,
+    get_test_batch,
+)
 from vllm import SamplingParams
-from vllm.config import LogprobsMode
+from vllm.config.model import LogprobsMode
+from vllm.distributed import cleanup_dist_env_and_memory
 
 from ...conftest import HfRunner, VllmRunner
 
@@ -29,22 +35,24 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT
 @pytest.fixture(
     scope="module",
     # Parameterize APC
-    params=[False, True])
+    params=[False, True],
+)
 def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
     with vllm_runner(
-            MODEL,
-            dtype=DTYPE,
-            max_logprobs=7,
-            # Very small number of batched tokens to ensure
-            # that we test chunking.
-            max_num_batched_tokens=16,
-            max_num_seqs=16,
-            max_model_len=128,
-            enforce_eager=True,
-            #TODO: enable this once we support it for
-            # prompt logprobs.
-            enable_prefix_caching=request.param,
-            gpu_memory_utilization=0.4,  # up to 2 alive concurrently
+        MODEL,
+        dtype=DTYPE,
+        max_logprobs=7,
+        # Very small number of batched tokens to ensure
+        # that we test chunking.
+        max_num_batched_tokens=16,
+        max_num_seqs=16,
+        max_model_len=128,
+        enable_chunked_prefill=True,
+        enforce_eager=True,
+        # TODO: enable this once we support it for
+        # prompt logprobs.
+        enable_prefix_caching=request.param,
+        gpu_memory_utilization=0.4,  # up to 2 alive concurrently
     ) as vllm_model:
         yield vllm_model
 
@@ -96,8 +104,8 @@ def _repeat_logprob_config(
     num_test_prompts = len(test_prompts)
     # Make sure there is a logprobs configuration for each test prompt
     logprob_prompt_logprob_list = list(
-        itertools.islice(itertools.cycle(logprob_prompt_logprob_list),
-                         num_test_prompts))
+        itertools.islice(itertools.cycle(logprob_prompt_logprob_list), num_test_prompts)
+    )
     # Now the number of prompts should match the number of sample params combos
     assert num_test_prompts == len(logprob_prompt_logprob_list)
     return logprob_prompt_logprob_list
@@ -115,24 +123,28 @@ def _run_and_validate(
     do_apc: bool,
 ) -> None:
     vllm_results = vllm_model.llm.generate(
-        test_prompts, sampling_params=vllm_sampling_params)
+        test_prompts, sampling_params=vllm_sampling_params
+    )
 
     for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
-            vllm_results, hf_logprobs, hf_outputs,
-            logprob_prompt_logprob_list):
-
+        vllm_results, hf_logprobs, hf_outputs, logprob_prompt_logprob_list
+    ):
         # Extract request-level (prompt)logprobs config
         num_top_logprobs, num_top_prompt_logprobs = logprob_prompt_logprob
 
         # Test whether sampled token output is consistent between vLLM and HF
         # vLLM prompt+completion should match HF output
         if temperature == 0.0:
-            assert (vllm_result.prompt_token_ids +
-                    vllm_result.outputs[0].token_ids == hf_output[0])
+            assert (
+                vllm_result.prompt_token_ids + vllm_result.outputs[0].token_ids
+                == hf_output[0]
+            )
         else:
             # Sampled tokens won't match if not greedy
-            assert (vllm_result.prompt_token_ids == hf_output[0]
-                    [:len(vllm_result.prompt_token_ids)])
+            assert (
+                vllm_result.prompt_token_ids
+                == hf_output[0][: len(vllm_result.prompt_token_ids)]
+            )
 
         # Validate sample logprobs
         if num_top_logprobs is not None:
@@ -141,8 +153,9 @@ def _run_and_validate(
             # correct
             assert vllm_result.outputs[0].logprobs is not None
             assert len(vllm_result.outputs[0].logprobs) == max_tokens
-            for logprobs, token_id in zip(vllm_result.outputs[0].logprobs,
-                                          vllm_result.outputs[0].token_ids):
+            for logprobs, token_id in zip(
+                vllm_result.outputs[0].logprobs, vllm_result.outputs[0].token_ids
+            ):
                 assert logprobs is not None
 
                 # Confirm that the output token appears among the logprobs
@@ -159,23 +172,26 @@ def _run_and_validate(
                 if num_top_logprobs > 0:
                     # We should have an entry for each of the topk ranks
                     all_ranks = {lp.rank for lp in logprobs.values()}
-                    assert all(r in all_ranks
-                               for r in range(1, num_top_logprobs + 1))
+                    assert all(r in all_ranks for r in range(1, num_top_logprobs + 1))
 
             output_text = vllm_result.outputs[0].text
             output_string_from_most_likely_tokens_lst: list[str] = []
             for top_logprobs in vllm_result.outputs[0].logprobs:
                 top_logprob = next(iter(top_logprobs.values()))
                 output_string_from_most_likely_tokens_lst.append(
-                    top_logprob.decoded_token)
+                    top_logprob.decoded_token
+                )
 
             output_string_from_most_likely_tokens = "".join(
-                output_string_from_most_likely_tokens_lst)
+                output_string_from_most_likely_tokens_lst
+            )
             assert_incr_detok_str_matches_non_incr_detok_str(
-                output_text, output_string_from_most_likely_tokens,
+                output_text,
+                output_string_from_most_likely_tokens,
                 "The output text from the top logprob for each token "
                 "position should be the same as the output text in the "
-                "result.")
+                "result.",
+            )
 
             # Compare vLLM sample logprobs to HF
             vllm_sample_logprobs = vllm_result.outputs[0].logprobs
@@ -187,11 +203,12 @@ def _run_and_validate(
                             logprob,
                             hf_logprob[i][-1][token_id].item(),
                             atol=1e-2,
-                            rtol=1e-2)
-                    assert isinstance(
-                        sample_logprob.decoded_token,
-                        str), ("The token should be decoded by the time it is"
-                               " returned to the user.")
+                            rtol=1e-2,
+                        )
+                    assert isinstance(sample_logprob.decoded_token, str), (
+                        "The token should be decoded by the time it is"
+                        " returned to the user."
+                    )
 
             # At this point we know the sample logprobs are correct for this
             # request. Validate that cumulative_logprob is actually the sum.
@@ -201,7 +218,8 @@ def _run_and_validate(
                 vllm_result.outputs[0].cumulative_logprob,
                 compute_correct_cumulative_logprob(vllm_result.outputs[0]),
                 atol=1e-6,
-                rtol=1e-6)
+                rtol=1e-6,
+            )
         else:
             # Logprobs disabled for this request; should be None
             assert vllm_result.outputs[0].logprobs is None
@@ -214,17 +232,17 @@ def _run_and_validate(
             assert vllm_result.prompt_logprobs[0] is None
             # - Prompt logprobs are returned for all indices in
             #   the prompt
-            assert len(vllm_result.prompt_logprobs) == len(
-                vllm_result.prompt_token_ids)
+            assert len(vllm_result.prompt_logprobs) == len(vllm_result.prompt_token_ids)
             for prompt_logprobs, prompt_token_id in zip(
-                    vllm_result.prompt_logprobs[1:],
-                    vllm_result.prompt_token_ids[1:]):
+                vllm_result.prompt_logprobs[1:], vllm_result.prompt_token_ids[1:]
+            ):
                 assert prompt_logprobs is not None
 
                 # Confirm that the prompt token appears among the logprobs
                 assert prompt_token_id in prompt_logprobs
-                token_in_topk = prompt_logprobs[
-                    prompt_token_id].rank <= num_top_prompt_logprobs
+                token_in_topk = (
+                    prompt_logprobs[prompt_token_id].rank <= num_top_prompt_logprobs
+                )
 
                 # If the prompt token is not included in the top K
                 # logprob, it can return 1 more data
@@ -236,8 +254,9 @@ def _run_and_validate(
                 if num_top_prompt_logprobs > 0:
                     # We should have an entry for each of the topk ranks
                     all_ranks = {lp.rank for lp in prompt_logprobs.values()}
-                    assert all(r in all_ranks
-                               for r in range(1, num_top_prompt_logprobs + 1))
+                    assert all(
+                        r in all_ranks for r in range(1, num_top_prompt_logprobs + 1)
+                    )
 
             # Compare prompt logprobs to HF
             # The first prompt logprob is always None, so we compare it from
@@ -249,19 +268,23 @@ def _run_and_validate(
                         logprob.logprob,
                         hf_logprob[0][i][token_id].item(),
                         atol=2e-2,
-                        rtol=2e-2)
+                        rtol=2e-2,
+                    )
         else:
             assert vllm_result.prompt_logprobs is None
 
 
-@pytest.mark.parametrize("batch_logprobs_composition",
-                         [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
+@pytest.mark.parametrize(
+    "batch_logprobs_composition", [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]
+)
 @pytest.mark.parametrize("temperature", [0.0, 2.0])
 def test_get_logprobs_and_prompt_logprobs(
-        hf_model, vllm_model,
-        batch_logprobs_composition: BatchLogprobsComposition,
-        temperature: float, example_prompts: list[str],
-        monkeypatch: pytest.MonkeyPatch) -> None:
+    hf_model,
+    vllm_model,
+    batch_logprobs_composition: BatchLogprobsComposition,
+    temperature: float,
+    example_prompts: list[str],
+) -> None:
     """Test V1 Engine logprobs & prompt logprobs
 
     Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
@@ -288,218 +311,292 @@ def test_get_logprobs_and_prompt_logprobs(
       temperature: "temperature" sampling parameter
       example_prompts: example prompt fixture
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
-        if do_apc and (temperature < 2.0
-                       or batch_logprobs_composition != SAMPLE_PROMPT):
-            # Skip some test-cases to save time.
-            pytest.skip()
-        test_prompts = example_prompts
-
-        max_tokens = 5
-        hf_outputs = hf_model.generate_greedy(
-            test_prompts,
+    do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
+    if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
+        # Skip some test-cases to save time.
+        pytest.skip()
+    test_prompts = example_prompts
+
+    max_tokens = 5
+    hf_outputs = hf_model.generate_greedy(
+        test_prompts,
+        max_tokens=max_tokens,
+    )
+    hf_logprobs = hf_model.generate_greedy_logprobs(
+        test_prompts,
+        max_tokens=max_tokens,
+    )
+
+    # Batch has mixed sample params
+    # (different logprobs/prompt logprobs combos)
+    logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
+
+    # Ensure that each test prompt has a logprob config for testing
+    logprob_prompt_logprob_list = _repeat_logprob_config(
+        test_prompts, logprob_prompt_logprob_list
+    )
+    # Generate SamplingParams
+    vllm_sampling_params = [
+        SamplingParams(
             max_tokens=max_tokens,
+            logprobs=num_lp,
+            prompt_logprobs=num_plp,
+            temperature=temperature,
+            seed=1984,
         )
-        hf_logprobs = hf_model.generate_greedy_logprobs(
-            test_prompts,
+        for num_lp, num_plp in logprob_prompt_logprob_list
+    ]
+    for _ in range(2 if do_apc else 1):
+        _run_and_validate(
+            vllm_model=vllm_model,
+            test_prompts=test_prompts,
+            vllm_sampling_params=vllm_sampling_params,
+            hf_logprobs=hf_logprobs,
+            hf_outputs=hf_outputs,
+            logprob_prompt_logprob_list=logprob_prompt_logprob_list,
+            temperature=temperature,
             max_tokens=max_tokens,
+            do_apc=do_apc,
         )
 
-        # Batch has mixed sample params
-        # (different logprobs/prompt logprobs combos)
-        logprob_prompt_logprob_list = get_test_batch(
-            batch_logprobs_composition)
-
-        # Ensure that each test prompt has a logprob config for testing
-        logprob_prompt_logprob_list = _repeat_logprob_config(
-            test_prompts, logprob_prompt_logprob_list)
-        # Generate SamplingParams
-        vllm_sampling_params = [
-            SamplingParams(max_tokens=max_tokens,
-                           logprobs=num_lp,
-                           prompt_logprobs=num_plp,
-                           temperature=temperature,
-                           seed=1984)
-            for num_lp, num_plp in logprob_prompt_logprob_list
-        ]
-        for _ in range(2 if do_apc else 1):
-            _run_and_validate(
-                vllm_model=vllm_model,
-                test_prompts=test_prompts,
-                vllm_sampling_params=vllm_sampling_params,
-                hf_logprobs=hf_logprobs,
-                hf_outputs=hf_outputs,
-                logprob_prompt_logprob_list=logprob_prompt_logprob_list,
-                temperature=temperature,
-                max_tokens=max_tokens,
-                do_apc=do_apc)
-
-
-def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
+
+def test_max_logprobs():
     """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
     Should also fail for `prompt_logprobs > max_logprobs`
     APC should not matter as this test checks basic request validation.
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        runner = VllmRunner(
-            "facebook/opt-125m",
-            max_logprobs=1,
-            enable_prefix_caching=False,
-            # 2 other llms alive during whole session
-            gpu_memory_utilization=0.15,
-            max_model_len=256)
-        vllm_sampling_params = SamplingParams(logprobs=1)
-        # should pass
-        runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
-
-        bad_sampling_params = SamplingParams(logprobs=2)
-        with pytest.raises(ValueError):
-            runner.generate(["Hello world"],
-                            sampling_params=bad_sampling_params)
-
-
-def test_none_logprobs(vllm_model, example_prompts,
-                       monkeypatch: pytest.MonkeyPatch):
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=1,
+        enable_prefix_caching=False,
+        # 2 other llms alive during whole session
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )
+    vllm_sampling_params = SamplingParams(logprobs=1)
+    # should pass
+    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
+
+    bad_sampling_params = SamplingParams(logprobs=2)
+    with pytest.raises(ValueError):
+        runner.generate(["Hello world"], sampling_params=bad_sampling_params)
+
+
+def test_none_logprobs(vllm_model, example_prompts):
     """Engine should return `logprobs` and `prompt_logprobs` as `None`
 
     Args:
       vllm_model: vLLM model fixture
       example_prompts: list of example prompts (test fixture)
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        max_tokens = 5
-
-        sampling_params_logprobs_none = SamplingParams(
-            max_tokens=max_tokens,
-            logprobs=None,
-            prompt_logprobs=None,
-            temperature=0.0,
-        )
-        results_logprobs_none = vllm_model.llm.generate(
-            example_prompts,
-            sampling_params=sampling_params_logprobs_none,
-        )
-
-        for i in range(len(results_logprobs_none)):
-            # Check sample logprobs are None
-            assert results_logprobs_none[i].outputs[0].logprobs is None
-            assert results_logprobs_none[i].outputs[
-                0].cumulative_logprob is None
-            # Check prompt logprobs are None
-            assert results_logprobs_none[i].prompt_logprobs is None
-
-
-def test_zero_logprobs(vllm_model, example_prompts,
-                       monkeypatch: pytest.MonkeyPatch):
+    max_tokens = 5
+
+    sampling_params_logprobs_none = SamplingParams(
+        max_tokens=max_tokens,
+        logprobs=None,
+        prompt_logprobs=None,
+        temperature=0.0,
+    )
+    results_logprobs_none = vllm_model.llm.generate(
+        example_prompts,
+        sampling_params=sampling_params_logprobs_none,
+    )
+
+    for i in range(len(results_logprobs_none)):
+        # Check sample logprobs are None
+        assert results_logprobs_none[i].outputs[0].logprobs is None
+        assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
+        # Check prompt logprobs are None
+        assert results_logprobs_none[i].prompt_logprobs is None
+
+
+def test_zero_logprobs(vllm_model, example_prompts):
     """Engine should return sampled token and prompt token logprobs
 
     Args:
       vllm_model: vLLM model fixture
       example_prompts: list of example prompts (test fixture)
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        max_tokens = 5
-
-        sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
-                                                       logprobs=0,
-                                                       prompt_logprobs=0,
-                                                       temperature=0.0)
-        results_logprobs_zero = vllm_model.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_zero)
-
-        for i in range(len(results_logprobs_zero)):
-            # Check that there is one sample logprob dict for each
-            # sample token
-            logprobs = results_logprobs_zero[i].outputs[0].logprobs
-            prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
-            sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
-            prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
-            assert logprobs is not None
-            assert len(sampled_token_ids) == len(logprobs)
-            assert results_logprobs_zero[i].outputs[
-                0].cumulative_logprob is not None
-            # Check that there is one prompt logprob dict for each
-            # prompt token
-            assert prompt_logprobs is not None
-            assert len(prompt_token_ids) == len(prompt_logprobs)
-
-
-def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
+    max_tokens = 5
+
+    sampling_params_logprobs_zero = SamplingParams(
+        max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
+    )
+    results_logprobs_zero = vllm_model.llm.generate(
+        example_prompts, sampling_params=sampling_params_logprobs_zero
+    )
+
+    for i in range(len(results_logprobs_zero)):
+        # Check that there is one sample logprob dict for each
+        # sample token
+        logprobs = results_logprobs_zero[i].outputs[0].logprobs
+        prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
+        sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
+        prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
+        assert logprobs is not None
+        assert len(sampled_token_ids) == len(logprobs)
+        assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
+        # Check that there is one prompt logprob dict for each
+        # prompt token
+        assert prompt_logprobs is not None
+        assert len(prompt_token_ids) == len(prompt_logprobs)
+
+
+def test_all_logprobs(example_prompts):
     """Engine should return all vocabulary logprobs and prompt logprobs
 
     Args:
       example_prompts: list of example prompts (test fixture)
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        runner = VllmRunner(
-            "facebook/opt-125m",
-            max_logprobs=-1,
-            enable_prefix_caching=False,
-            # 2 other llms alive during whole session
-            gpu_memory_utilization=0.15,
-            max_model_len=256)
-
-        sampling_params_logprobs_all = SamplingParams(max_tokens=5,
-                                                      logprobs=-1,
-                                                      prompt_logprobs=-1)
-        results_logprobs_all = runner.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_all)
-        vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
-
-        for i in range(len(results_logprobs_all)):
-            logprobs = results_logprobs_all[i].outputs[0].logprobs
-            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
-            assert logprobs is not None
-            for logprob in logprobs:
-                assert len(logprob) == vocab_size
-            assert prompt_logprobs is not None
-            assert prompt_logprobs[0] is None
-            for prompt_logprob in prompt_logprobs[1:]:
-                assert len(prompt_logprob) == vocab_size
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=-1,
+        enable_prefix_caching=False,
+        # 2 other llms alive during whole session
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )
+
+    sampling_params_logprobs_all = SamplingParams(
+        max_tokens=5, logprobs=-1, prompt_logprobs=-1
+    )
+    results_logprobs_all = runner.llm.generate(
+        example_prompts, sampling_params=sampling_params_logprobs_all
+    )
+    vocab_size = runner.llm.llm_engine.model_config.get_vocab_size()
+
+    for i in range(len(results_logprobs_all)):
+        logprobs = results_logprobs_all[i].outputs[0].logprobs
+        prompt_logprobs = results_logprobs_all[i].prompt_logprobs
+        assert logprobs is not None
+        for logprob in logprobs:
+            assert len(logprob) == vocab_size
+        assert prompt_logprobs is not None
+        assert prompt_logprobs[0] is None
+        for prompt_logprob in prompt_logprobs[1:]:
+            assert len(prompt_logprob) == vocab_size
 
 
 @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
-def test_logprobs_mode(logprobs_mode: LogprobsMode,
-                       monkeypatch: pytest.MonkeyPatch):
+def test_logprobs_mode(logprobs_mode: LogprobsMode):
     """Test with LLM engine with different logprobs_mode.
     For logprobs, we should have non-positive values.
     For logits, we should expect at least one positive values.
     """
     from vllm import LLM
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        llm = LLM(
-            "facebook/opt-125m",
-            max_logprobs=5,
-            enable_prefix_caching=False,
-            # 2 other llms alive during whole session
-            gpu_memory_utilization=0.05,
-            max_model_len=16,
-            logprobs_mode=logprobs_mode)
-        vllm_sampling_params = SamplingParams(logprobs=1)
-        results = llm.generate(["Hello world"],
-                               sampling_params=vllm_sampling_params)
-
-        total_token_with_logprobs = 0
-        positive_values = 0
-        for output in results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    logprob = logprobs[token_id]
-                    if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
-                        assert logprob.logprob <= 0
-                    if logprob.logprob > 0:
-                        positive_values = positive_values + 1
-                    total_token_with_logprobs = total_token_with_logprobs + 1
-        assert total_token_with_logprobs >= len(results[0].outputs)
-        if logprobs_mode in ("raw_logits", "processed_logits"):
-            assert positive_values > 0
-        del llm
+
+    llm = LLM(
+        "facebook/opt-125m",
+        max_logprobs=5,
+        enable_prefix_caching=False,
+        # 2 other llms alive during whole session
+        gpu_memory_utilization=0.05,
+        max_model_len=16,
+        logprobs_mode=logprobs_mode,
+    )
+    vllm_sampling_params = SamplingParams(logprobs=1)
+    results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
+
+    total_token_with_logprobs = 0
+    positive_values = 0
+    for output in results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                logprob = logprobs[token_id]
+                if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
+                    assert logprob.logprob <= 0
+                if logprob.logprob > 0:
+                    positive_values = positive_values + 1
+                total_token_with_logprobs = total_token_with_logprobs + 1
+    assert total_token_with_logprobs >= len(results[0].outputs)
+    if logprobs_mode in ("raw_logits", "processed_logits"):
+        assert positive_values > 0
+    del llm
+
+
+@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
+@pytest.mark.parametrize(
+    "model_setup",
+    [
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-3.1-8B-Instruct",
+                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
+            ),
+            marks=large_gpu_mark(min_gb=32),
+        ),
+    ],
+)
+def test_spec_decode_logprobs(
+    logprobs_mode: LogprobsMode,
+    model_setup: tuple[str, str, str],
+):
+    """Spec decode logprobs should match those of the base model.
+
+    Args:
+        logprobs_mode: logprobs mode.
+        model_setup: Spec decode method, base model name, and
+        draft model name.
+    """
+    from vllm import LLM
+
+    prompt = "Hello world"
+    sampling_params = SamplingParams(
+        temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
+    )
+    method, model_name, spec_model_name = model_setup
+    max_model_len = 256
+
+    # Run base LLM.
+    ref_llm = LLM(
+        model=model_name,
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    ref_results = ref_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from reference LLM.
+    ref_logprobs = []
+    for output in ref_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                ref_logprobs.append(logprobs[token_id])
+    del ref_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Run spec decode LLM.
+    spec_llm = LLM(
+        model_name,
+        speculative_config={
+            "method": method,
+            "model": spec_model_name,
+            "num_speculative_tokens": 3,
+            "max_model_len": max_model_len,
+        },
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    spec_results = spec_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from spec decode LLM.
+    spec_logprobs = []
+    for output in spec_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                spec_logprobs.append(logprobs[token_id])
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Per-token logprobs are expected to be the same.
+    assert len(ref_logprobs) == len(spec_logprobs)
+    for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
+        assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
+        assert ref_logprob.rank == spec_logprob.rank
+        assert ref_logprob.decoded_token == spec_logprob.decoded_token
diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py
index 7f41355ff7ce4e7068ac1fa846e3ab0a5436cfa9..b3233e50fbf18639f8b32ec9eaac4bc73fceaacb 100644
--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
@@ -15,22 +15,23 @@ EXPECTED_VALUE = 0.62
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8"  # noqa: E501
 SERVER_ARGS = [
-    "--enforce_eager", "--no_enable_prefix_caching",
-    "--gpu-memory-utilization=0.8"
+    "--enforce_eager",
+    "--no_enable_prefix_caching",
+    "--gpu-memory-utilization=0.8",
 ]
 NUM_CONCURRENT = 100
 
 
 def test_prompt_logprobs_e2e():
-    results = lm_eval.simple_evaluate(model="vllm",
-                                      model_args=MODEL_ARGS,
-                                      tasks=TASK,
-                                      batch_size="auto")
+    results = lm_eval.simple_evaluate(
+        model="vllm", model_args=MODEL_ARGS, tasks=TASK, batch_size="auto"
+    )
 
     measured_value = results["results"][TASK][FILTER]
-    assert (measured_value - RTOL < EXPECTED_VALUE
-            and measured_value + RTOL > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+    assert (
+        measured_value - RTOL < EXPECTED_VALUE
+        and measured_value + RTOL > EXPECTED_VALUE
+    ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
 
 
 def test_prompt_logprobs_e2e_server():
@@ -40,7 +41,8 @@ def test_prompt_logprobs_e2e_server():
         model_args = (
             f"model={MODEL},"
             f"base_url={url},"
-            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
+        )
 
         results = lm_eval.simple_evaluate(
             model="local-completions",
@@ -49,6 +51,7 @@ def test_prompt_logprobs_e2e_server():
         )
 
         measured_value = results["results"][TASK][FILTER]
-        assert (measured_value - RTOL < EXPECTED_VALUE
-                and measured_value + RTOL > EXPECTED_VALUE
-                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+        assert (
+            measured_value - RTOL < EXPECTED_VALUE
+            and measured_value + RTOL > EXPECTED_VALUE
+        ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py
index 4e912f98f376f50b315d8d830e1379d87bfb4969..bf7726ebf907feacef5da5c9f0da13bf2969d1ce 100644
--- a/tests/v1/sample/test_rejection_sampler.py
+++ b/tests/v1/sample/test_rejection_sampler.py
@@ -1,16 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
+from unittest.mock import Mock
 
 import pytest
 import torch
 import torch.nn.functional as F
 
+from tests.v1.sample.utils import create_allowed_token_ids
 from vllm.platforms import current_platform
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID,
-                                              RejectionSampler)
+from vllm.v1.sample.rejection_sampler import PLACEHOLDER_TOKEN_ID, RejectionSampler
+from vllm.v1.sample.sampler import Sampler, SamplerOutput
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 
 DEVICE = current_platform.device_type
@@ -18,13 +20,37 @@ DEVICE = current_platform.device_type
 
 @pytest.fixture
 def rejection_sampler():
-    return RejectionSampler()
+    mock_sampler = Mock(spec=Sampler)
+    mock_sampler.logprobs_mode = "raw_logprobs"
+    return RejectionSampler(mock_sampler)
 
 
-def create_logits_tensor(output_token_ids: list[list[int]],
-                         vocab_size: int = 100) -> torch.Tensor:
+def mock_sampler_output(
+    rejection_sampler: RejectionSampler, bonus_token_ids: torch.Tensor
+):
+    rejection_sampler.sampler.return_value = SamplerOutput(
+        sampled_token_ids=bonus_token_ids, logprobs_tensors=None
+    )
+
+
+def create_spec_decode_metadata(
+    spec_tokens: list[list[int]], logits: torch.Tensor
+) -> SpecDecodeMetadata:
+    metadata = SpecDecodeMetadata.make_dummy(spec_tokens, device=logits.device)
+    metadata.target_logits_indices = torch.arange(logits.shape[0])
+    # Output bonus token ids are mocked, so the bonus logit indices should
+    # be empty.
+    metadata.bonus_logits_indices = torch.empty(0, dtype=torch.int32)
+    return metadata
+
+
+def create_logits_tensor(
+    output_token_ids: list[list[int]],
+    vocab_size: int = 100,
+    token_idx_to_override: int | None = None,
+) -> torch.Tensor:
     """Helper function to create logits tensor that
-       will produce desired token ids on argmax"""
+    will produce desired token ids on argmax"""
     token_ids = [tokens[:-1] for tokens in output_token_ids]
     num_total_tokens = sum(len(tokens) for tokens in token_ids)
     logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
@@ -33,19 +59,29 @@ def create_logits_tensor(output_token_ids: list[list[int]],
         for j, token_id in enumerate(tokens):
             logits[start_loc + j, token_id] = 100.0
         start_loc += len(tokens)
+    if token_idx_to_override:
+        logits[:, token_idx_to_override] = 99.0
     return logits
 
 
 def create_sampling_metadata(
     all_greedy: bool,
-    temperature: Optional[torch.Tensor] = None,
-    top_k: Optional[torch.Tensor] = None,
-    top_p: Optional[torch.Tensor] = None,
-    generators: Optional[dict[int, Any]] = None,
+    output_token_ids: list[list[int]] | None = None,
+    prompt_token_ids: torch.Tensor | None = None,
+    spec_token_ids: torch.Tensor | None = None,
+    temperature: torch.Tensor | None = None,
+    top_k: torch.Tensor | None = None,
+    top_p: torch.Tensor | None = None,
+    generators: dict[int, Any] | None = None,
+    frequency_penalties: list[float] | None = None,
+    presence_penalties: list[float] | None = None,
+    repetition_penalties: list[float] | None = None,
+    bad_words_token_ids: dict[int, list[list[int]]] | None = None,
+    allowed_token_ids_mask: torch.Tensor | None = None,
 ) -> SamplingMetadata:
     """Create a v1 sampling metadata object with all_greedy set
-        to the given value. Either all greedy or all random sampling
-        is used.
+    to the given value. Either all greedy or all random sampling
+    is used.
     """
     generators = generators or {}
     if all_greedy:
@@ -53,6 +89,21 @@ def create_sampling_metadata(
     else:
         assert temperature is not None
 
+    if any([frequency_penalties, presence_penalties, repetition_penalties]):
+        no_penalties = False
+
+        assert output_token_ids
+        assert len(output_token_ids) > 0
+
+        frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE)
+        presence_penalties = torch.tensor(presence_penalties, device=DEVICE)
+        repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE)
+    else:
+        no_penalties = True
+        frequency_penalties = torch.tensor([])
+        presence_penalties = torch.tensor([])
+        repetition_penalties = torch.tensor([])
+
     return SamplingMetadata(
         temperature=temperature,
         all_greedy=all_greedy,
@@ -61,14 +112,15 @@ def create_sampling_metadata(
         top_k=top_k,
         generators=generators,
         max_num_logprobs=0,
-        no_penalties=False,
-        prompt_token_ids=None,
-        frequency_penalties=torch.tensor([]),
-        presence_penalties=torch.tensor([]),
-        repetition_penalties=torch.tensor([]),
-        output_token_ids=[],
-        allowed_token_ids_mask=None,
-        bad_words_token_ids={},
+        no_penalties=no_penalties,
+        prompt_token_ids=prompt_token_ids,
+        frequency_penalties=frequency_penalties,
+        presence_penalties=presence_penalties,
+        repetition_penalties=repetition_penalties,
+        output_token_ids=[] if output_token_ids is None else output_token_ids,
+        spec_token_ids=[] if spec_token_ids is None else spec_token_ids,
+        allowed_token_ids_mask=allowed_token_ids_mask,
+        bad_words_token_ids={} if bad_words_token_ids is None else bad_words_token_ids,
         logitsprocs=LogitsProcessors(),
     )
 
@@ -81,22 +133,18 @@ def test_perfect_match(rejection_sampler):
 
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
-    bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
-                                      device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+    bonus_token_tensor = torch.tensor([output_tokens[0][-1]], device=logits.device)
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
-    expected = torch.tensor([[1, 2, 3, 4]],
-                            dtype=torch.int,
-                            device=logits.device)
-    assert torch.equal(output, expected)
+    expected = torch.tensor([[1, 2, 3, 4]], dtype=torch.int, device=logits.device)
+    assert torch.equal(output.sampled_token_ids, expected)
 
 
 def test_early_mismatch(rejection_sampler):
@@ -106,16 +154,14 @@ def test_early_mismatch(rejection_sampler):
 
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
-    bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
-                                      device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+    bonus_token_tensor = torch.tensor([output_tokens[0][-1]], device=logits.device)
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
     expected = torch.tensor(
@@ -123,33 +169,32 @@ def test_early_mismatch(rejection_sampler):
         dtype=torch.int,
         device=logits.device,
     )
-    assert torch.equal(output, expected)
+    assert torch.equal(output.sampled_token_ids, expected)
 
 
 def test_multiple_sequences(rejection_sampler):
     """Test handling multiple sequences of speculated tokens"""
     spec_tokens = [[1, 2], [3]]
-    output_tokens = [[1, 2, 5], [3,
-                                 4]]  # Two sequences with bonus tokens 5 and 4
+    output_tokens = [[1, 2, 5], [3, 4]]  # Two sequences with bonus tokens 5 and 4
 
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
     bonus_token_tensor = torch.tensor(
-        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device
+    )
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
-    expected = torch.tensor([[1, 2, 5], [3, 4, PLACEHOLDER_TOKEN_ID]],
-                            dtype=torch.int,
-                            device=logits.device)
-    assert torch.equal(output, expected)
+    expected = torch.tensor(
+        [[1, 2, 5], [3, 4, PLACEHOLDER_TOKEN_ID]], dtype=torch.int, device=logits.device
+    )
+    assert torch.equal(output.sampled_token_ids, expected)
 
 
 def test_single_token_sequence(rejection_sampler):
@@ -159,20 +204,18 @@ def test_single_token_sequence(rejection_sampler):
 
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
-    bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
-                                      device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+    bonus_token_tensor = torch.tensor([output_tokens[0][-1]], device=logits.device)
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
     expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
-    assert torch.equal(output, expected)
+    assert torch.equal(output.sampled_token_ids, expected)
 
 
 def test_empty_sequence(rejection_sampler):
@@ -182,49 +225,48 @@ def test_empty_sequence(rejection_sampler):
 
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
-    bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
-                                      device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+    bonus_token_tensor = torch.tensor([output_tokens[0][-1]], device=logits.device)
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
     expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
-    assert torch.equal(output, expected)
+    assert torch.equal(output.sampled_token_ids, expected)
 
 
 def test_multiple_mismatches(rejection_sampler):
     """Test handling multiple sequences with mismatches"""
     spec_tokens = [[1, 2, 3], [4, 5, 6]]
-    output_tokens = [[1, 2, 7, 6], [4, 8, 6,
-                                    9]]  # Mismatches in both sequences
+    output_tokens = [[1, 2, 7, 6], [4, 8, 6, 9]]  # Mismatches in both sequences
 
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
     bonus_token_tensor = torch.tensor(
-        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device
+    )
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
     expected = torch.tensor(
-        [[1, 2, 7, PLACEHOLDER_TOKEN_ID],
-         [4, 8, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
+        [
+            [1, 2, 7, PLACEHOLDER_TOKEN_ID],
+            [4, 8, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID],
+        ],
         dtype=torch.int,
         device=logits.device,
     )
-    assert torch.equal(output, expected)
+    assert torch.equal(output.sampled_token_ids, expected)
 
 
 @pytest.mark.parametrize(
@@ -232,30 +274,31 @@ def test_multiple_mismatches(rejection_sampler):
     [
         ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]),  # Perfect match with bonus
         ([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]]),  # First mismatch
-        ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
-         [[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]]),  # Mixed matches
-    ])
-def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens,
-                            expected):
+        (
+            [[1, 2], [3, 4]],
+            [[1, 5, 6], [3, 4, 7]],
+            [[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]],
+        ),  # Mixed matches
+    ],
+)
+def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens, expected):
     """Parametrized test for various matching scenarios"""
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)
-    bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens],
-                                      device=logits.device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
-                                                         device=logits.device)
+    bonus_token_tensor = torch.tensor(
+        [tokens[-1] for tokens in output_tokens], device=logits.device
+    )
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
 
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
     output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=None,
-        target_logits=logits,
-        bonus_token_ids=bonus_token_tensor,
+        logits=logits,
         sampling_metadata=metadata,
     )
-    expected_tensor = torch.tensor(expected,
-                                   dtype=torch.int,
-                                   device=logits.device)
-    assert torch.equal(output, expected_tensor)
+    expected_tensor = torch.tensor(expected, dtype=torch.int, device=logits.device)
+    assert torch.equal(output.sampled_token_ids, expected_tensor)
 
 
 ########################### Tests for Random Sampling ###################
@@ -273,22 +316,15 @@ def test_deterministic_when_seeded(
     n_rep: int,
 ):
     num_tokens = batch_size * k
-    draft_probs = torch.rand(num_tokens,
-                             vocab_size,
-                             dtype=torch.float32,
-                             device=DEVICE)
+    draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE)
     draft_probs = F.softmax(draft_probs, dim=-1)
     target_logits = torch.rand_like(draft_probs)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64,
-                                    device=DEVICE)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64,
-                                    device=DEVICE)
+    bonus_token_ids = torch.randint(
+        low=0, high=vocab_size, size=(batch_size, 1), dtype=torch.int64, device=DEVICE
+    )
+    draft_token_ids = torch.randint(
+        low=0, high=vocab_size, size=(batch_size, k), dtype=torch.int64, device=DEVICE
+    )
 
     seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
 
@@ -296,26 +332,27 @@ def test_deterministic_when_seeded(
     for _ in range(n_rep):
         seeded_seqs = {
             i: torch.Generator(device=DEVICE).manual_seed(i)
-            for i in range(batch_size) if seeded_mask[i]
+            for i in range(batch_size)
+            if seeded_mask[i]
         }
 
-        temperature = torch.ones(batch_size,
-                                 dtype=torch.float32,
-                                 device=DEVICE)
-        sampling_metadata = create_sampling_metadata(all_greedy=False,
-                                                     temperature=temperature,
-                                                     generators=seeded_seqs)
-        spec_decode_metadata = SpecDecodeMetadata.make_dummy(
-            draft_token_ids.tolist(), device=DEVICE)
+        temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
+        sampling_metadata = create_sampling_metadata(
+            all_greedy=False, temperature=temperature, generators=seeded_seqs
+        )
+        spec_decode_metadata = create_spec_decode_metadata(
+            draft_token_ids.tolist(), target_logits
+        )
+
+        mock_sampler_output(rejection_sampler, bonus_token_ids)
         rep_result = rejection_sampler(
             spec_decode_metadata,
-            draft_probs=draft_probs,
-            target_logits=target_logits,
-            bonus_token_ids=bonus_token_ids,
+            draft_probs=None,
+            logits=target_logits,
             sampling_metadata=sampling_metadata,
         )
 
-        results.append(rep_result)
+        results.append(rep_result.sampled_token_ids)
 
     for i in range(batch_size):
         if seeded_mask[i]:
@@ -352,8 +389,7 @@ def test_rejection_sampling_approximates_target_distribution():
     num_reference_probs = 100
 
     # Prepare draft, target, and reference probability distributions
-    draft_probs = F.softmax(torch.rand(vocab_size, dtype=torch.float32),
-                            dim=-1)
+    draft_probs = F.softmax(torch.rand(vocab_size, dtype=torch.float32), dim=-1)
     target_logits = torch.rand(vocab_size, dtype=torch.float32)
     target_probs = F.softmax(target_logits, dim=-1)
     reference_probs = F.softmax(
@@ -368,38 +404,48 @@ def test_rejection_sampling_approximates_target_distribution():
     for num_samples in sample_sizes:
         # Sample using rejection sampling.
         rej_sample_probs = estimate_rejection_sampling_pdf(
-            draft_probs, target_logits, k, vocab_size, num_samples)
+            draft_probs, target_logits, k, vocab_size, num_samples
+        )
         rej_sample_probs = rej_sample_probs.to(DEVICE)
 
         # Average distance from reference probs.
-        reference_vs_rejsample_dist = torch.dist(
-            reference_probs,
-            rej_sample_probs).item() / reference_probs.shape[0]
-        target_vs_rejsample_dist = torch.dist(target_probs,
-                                              rej_sample_probs).item()
+        reference_vs_rejsample_dist = (
+            torch.dist(reference_probs, rej_sample_probs).item()
+            / reference_probs.shape[0]
+        )
+        target_vs_rejsample_dist = torch.dist(target_probs, rej_sample_probs).item()
 
         distance_wrt_reference.append(reference_vs_rejsample_dist)
         distance_wrt_target.append(target_vs_rejsample_dist)
 
         relative_change_in_distance_wrt_target = get_ratio_first_to_last(
-            distance_wrt_target)
+            distance_wrt_target
+        )
         relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
-            distance_wrt_reference)
+            distance_wrt_reference
+        )
 
-        print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
-              f"{reference_vs_rejsample_dist=:.05f}")
-        print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
-              f"{relative_change_in_distance_wrt_reference=:.02f}")
+        print(
+            f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
+            f"{reference_vs_rejsample_dist=:.05f}"
+        )
+        print(
+            f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
+            f"{relative_change_in_distance_wrt_reference=:.02f}"
+        )
 
     relative_change_in_distance_wrt_target = get_ratio_first_to_last(
-        distance_wrt_target)
+        distance_wrt_target
+    )
     relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
-        distance_wrt_reference)
+        distance_wrt_reference
+    )
 
     expected_improvement_multiplier = 20
-    assert (relative_change_in_distance_wrt_target
-            > relative_change_in_distance_wrt_reference *
-            expected_improvement_multiplier)
+    assert (
+        relative_change_in_distance_wrt_target
+        > relative_change_in_distance_wrt_reference * expected_improvement_multiplier
+    )
 
 
 def get_ratio_first_to_last(elements: list[float]) -> float:
@@ -424,45 +470,50 @@ def estimate_rejection_sampling_pdf(
     Returns:
         Estimated probability distribution of the output tokens.
     """
-    rejection_sampler = RejectionSampler()
+    mock_sampler = Mock(spec=Sampler)
+    mock_sampler.logprobs_mode = "raw_logprobs"
+    rejection_sampler = RejectionSampler(mock_sampler)
     num_tokens = num_samples * k
     # Repeat draft probs num_samples * k times.
-    draft_probs = draft_probs.reshape(1, 1,
-                                      vocab_size).repeat(num_samples, k, 1)
+    draft_probs = draft_probs.reshape(1, 1, vocab_size).repeat(num_samples, k, 1)
 
     # Repeat target probs num_tokens times.
     target_logits = target_logits.reshape(1, vocab_size).repeat(num_tokens, 1)
 
     # Randomly sample draft token ids from draft probs.
-    draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
-                                        num_samples=k,
-                                        replacement=True).reshape(
-                                            num_samples, k)
+    draft_token_ids = torch.multinomial(
+        draft_probs[:, 0, :], num_samples=k, replacement=True
+    ).reshape(num_samples, k)
     draft_probs = draft_probs.view(num_tokens, vocab_size)
 
     # Bonus tokens not used but required.
-    bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64,
-                                  device=DEVICE).repeat(num_samples, 1)
+    bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE).repeat(
+        num_samples, 1
+    )
 
     temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE)
-    sampling_metadata = create_sampling_metadata(all_greedy=False,
-                                                 temperature=temperature)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
-        draft_token_ids.tolist(), device=bonus_token_ids.device)
-    output_token_ids = rejection_sampler(
+    sampling_metadata = create_sampling_metadata(
+        all_greedy=False, temperature=temperature
+    )
+    spec_decode_metadata = create_spec_decode_metadata(
+        draft_token_ids.tolist(), target_logits
+    )
+
+    mock_sampler_output(rejection_sampler, bonus_token_ids)
+    sampler_output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=draft_probs,
-        target_logits=target_logits,
-        bonus_token_ids=bonus_token_ids,
+        logits=target_logits,
         sampling_metadata=sampling_metadata,
     )
-    output_token_ids = output_token_ids[:, :-1].flatten()
+    output_token_ids = sampler_output.sampled_token_ids[:, :-1].flatten()
 
-    hist = torch.histogram(output_token_ids.to(dtype=torch.float,
-                                               device="cpu"),
-                           bins=vocab_size,
-                           range=(0, vocab_size),
-                           density=True)
+    hist = torch.histogram(
+        output_token_ids.to(dtype=torch.float, device="cpu"),
+        bins=vocab_size,
+        range=(0, vocab_size),
+        density=True,
+    )
 
     return hist.hist
 
@@ -480,9 +531,9 @@ def _test_masked_logits(
     num_tokens = batch_size * num_draft_tokens
 
     # Create random draft probabilities.
-    draft_probs = torch.rand((num_tokens, vocab_size),
-                             dtype=torch.float32,
-                             device=DEVICE)
+    draft_probs = torch.rand(
+        (num_tokens, vocab_size), dtype=torch.float32, device=DEVICE
+    )
     draft_probs = F.softmax(draft_probs, dim=-1)
 
     # Randomly sample draft token ids from draft probs
@@ -491,27 +542,22 @@ def _test_masked_logits(
     draft_token_ids = draft_token_ids.tolist()
 
     # Bonus tokens not used but required
-    bonus_token_ids = torch.zeros((batch_size, 1),
-                                  dtype=torch.int64,
-                                  device=DEVICE)
+    bonus_token_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=DEVICE)
 
     # Create spec decode metadata
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
-        draft_token_ids,
-        device=DEVICE,
-    )
+    spec_decode_metadata = create_spec_decode_metadata(draft_token_ids, target_logits)
 
     # Run rejection sampling
-    output_token_ids = rejection_sampler(
+    mock_sampler_output(rejection_sampler, bonus_token_ids)
+    output = rejection_sampler(
         spec_decode_metadata,
         draft_probs=draft_probs,
-        target_logits=target_logits,
-        bonus_token_ids=bonus_token_ids,
+        logits=target_logits,
         sampling_metadata=sampling_metadata,
     )
 
     # Remove bonus tokens and reshape
-    output_token_ids = output_token_ids[:, :-1].flatten().tolist()
+    output_token_ids = output.sampled_token_ids[:, :-1].flatten().tolist()
 
     # Check that all sampled tokens are within the unmasked indices.
     for i in range(num_tokens):
@@ -531,8 +577,7 @@ def test_top_k(rejection_sampler, top_k):
 
     # Randomly create top-k indices.
     top_k_indices = [
-        torch.randperm(vocab_size, device=DEVICE)[:top_k]
-        for _ in range(num_tokens)
+        torch.randperm(vocab_size, device=DEVICE)[:top_k] for _ in range(num_tokens)
     ]
     top_k_indices = torch.stack(top_k_indices)
 
@@ -550,9 +595,7 @@ def test_top_k(rejection_sampler, top_k):
     sampling_metadata = create_sampling_metadata(
         all_greedy=False,
         temperature=temperature,
-        top_k=torch.tensor([top_k] * batch_size,
-                           device=DEVICE,
-                           dtype=torch.int64),
+        top_k=torch.tensor([top_k] * batch_size, device=DEVICE, dtype=torch.int64),
     )
 
     _test_masked_logits(
@@ -595,9 +638,7 @@ def test_top_p(rejection_sampler, top_p):
     sampling_metadata = create_sampling_metadata(
         all_greedy=False,
         temperature=temperature,
-        top_p=torch.tensor([top_p] * batch_size,
-                           device=DEVICE,
-                           dtype=torch.float32),
+        top_p=torch.tensor([top_p] * batch_size, device=DEVICE, dtype=torch.float32),
     )
 
     _test_masked_logits(
@@ -609,3 +650,132 @@ def test_top_p(rejection_sampler, top_p):
         unmasked_indices=top_p_indices,
         sampling_metadata=sampling_metadata,
     )
+
+
+########################### Tests for Logit Processors ###################
+def test_frequency_penalties(rejection_sampler):
+    """Test rejection sampling with frequency penalties"""
+    spec_tokens = [[1, 1, 1], [], [1, 1, 1]]
+    output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]]  # 1, 7 and 1 are the bonus tokens
+
+    num_requsts = len(spec_tokens)
+    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
+    metadata = create_sampling_metadata(
+        all_greedy=True,
+        output_token_ids=[[2], [3], [4]],
+        spec_token_ids=spec_tokens,
+        prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE),
+        frequency_penalties=[1.5, 1.5, 0.7],
+        presence_penalties=[0.0] * num_requsts,
+        repetition_penalties=[1.0] * num_requsts,
+    )
+    bonus_token_tensor = torch.tensor(
+        [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
+    )
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
+        spec_tokens, device=logits.device
+    )
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
+    output = rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        logits=logits,
+        sampling_metadata=metadata,
+    )
+    expected = torch.tensor(
+        [[1, 15, -1, -1], [7, -1, -1, -1], [1, 1, 15, -1]],
+        dtype=torch.int,
+        device=logits.device,
+    )
+    assert torch.equal(output.sampled_token_ids, expected)
+
+
+def test_bad_words(rejection_sampler):
+    """Test rejection sampling with bad words constraints"""
+    spec_tokens = [[1, 2, 3], [1, 15, 3], [1, 2, 3]]
+    output_tokens = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
+
+    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
+    metadata = create_sampling_metadata(
+        all_greedy=True,
+        output_token_ids=[[2], [3], [4]],
+        spec_token_ids=spec_tokens,
+        bad_words_token_ids={
+            0: [
+                [
+                    2,
+                ]
+            ],
+            1: [
+                [
+                    2,
+                ]
+            ],
+            # Do not apply bad words to the last request
+        },
+    )
+    bonus_token_tensor = torch.tensor(
+        [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
+    )
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
+    output = rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        logits=logits,
+        sampling_metadata=metadata,
+    )
+
+    expected = torch.tensor(
+        [[1, 15, -1, -1], [1, 15, 3, 4], [1, 2, 3, 4]],
+        dtype=torch.int,
+        device=logits.device,
+    )
+    assert torch.equal(output.sampled_token_ids, expected)
+
+
+def test_allowed_token_ids(rejection_sampler):
+    """Test rejection sampling with allowed token ids"""
+    spec_tokens = [[1, 2, 10], [10, 5, 3], [7, 10, 12]]
+    output_tokens = [[1, 2, 10, 5], [10, 5, 10, 5], [7, 10, 12, 5]]
+    # Not allowed tokens:
+    # 0: 0-4
+    # 1: 1-5
+    # 2: 2-6
+    num_allowed_token_ids = 5
+
+    # Use the token 15 as the sampler choose if a token rejected
+    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
+
+    batch_size = len(output_tokens)
+    _, vocab_size = logits.size()
+    mask = create_allowed_token_ids(
+        batch_size=batch_size,
+        vocab_size=vocab_size,
+        num_allowed_token_ids=num_allowed_token_ids,
+        device=logits.device,
+    )
+    metadata = create_sampling_metadata(
+        all_greedy=True,
+        output_token_ids=[[], [], []],
+        spec_token_ids=spec_tokens,
+        allowed_token_ids_mask=mask,
+    )
+    bonus_token_tensor = torch.tensor(
+        [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
+    )
+    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
+    mock_sampler_output(rejection_sampler, bonus_token_tensor)
+    output = rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        logits=logits,
+        sampling_metadata=metadata,
+    )
+
+    expected = torch.tensor(
+        [[15, -1, -1, -1], [10, 5, 10, -1], [7, 10, 12, 5]],
+        dtype=torch.int,
+        device=logits.device,
+    )
+    assert torch.equal(output.sampled_token_ids, expected)
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
index 6ff000043265eb4951bfff0680f67184edbf036c..51f2bf5e753c0234563ac8350d6a195dbc2ddcc5 100644
--- a/tests/v1/sample/test_sampler.py
+++ b/tests/v1/sample/test_sampler.py
@@ -1,14 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
-
 import numpy as np
 import pytest
 import torch
 
+from tests.v1.sample.utils import create_allowed_token_ids
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.sampler import Sampler
@@ -29,12 +29,12 @@ def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
     return fake_logits
 
 
-def _create_penalty_tensor(batch_size: int, penalty_value: float,
-                           device: torch.device) -> torch.Tensor:
-    return torch.full((batch_size, ),
-                      fill_value=penalty_value,
-                      dtype=torch.float,
-                      device=device)
+def _create_penalty_tensor(
+    batch_size: int, penalty_value: float, device: torch.device
+) -> torch.Tensor:
+    return torch.full(
+        (batch_size,), fill_value=penalty_value, dtype=torch.float, device=device
+    )
 
 
 def _create_prompt_tokens_tensor(
@@ -51,26 +51,6 @@ def _create_prompt_tokens_tensor(
     )
 
 
-def _create_allowed_token_ids(
-    batch_size: int,
-    vocab_size: int,
-    num_allowed_token_ids: int,
-    device: torch.device,
-) -> Optional[torch.Tensor]:
-    mask: Optional[torch.Tensor] = None
-    for i in range(batch_size):
-        if i % 2 == 1:
-            continue
-        if mask is None:
-            mask = torch.zeros((batch_size, vocab_size),
-                               dtype=torch.bool,
-                               device=device)
-        start = min(i, vocab_size - 1)
-        end = min(i + num_allowed_token_ids, vocab_size - 1)
-        mask[i, start:end] = True
-    return mask
-
-
 def _create_bad_words_token_ids(
     batch_size: int,
     vocab_size: int,
@@ -80,9 +60,9 @@ def _create_bad_words_token_ids(
     for batch_idx in range(batch_size):
         token_ids_single_batch = []
         for bad_words_length in bad_words_lengths:
-            token_ids = np.random.choice(vocab_size,
-                                         size=bad_words_length,
-                                         replace=True).tolist()
+            token_ids = np.random.choice(
+                vocab_size, size=bad_words_length, replace=True
+            ).tolist()
             token_ids_single_batch.append(token_ids)
         bad_words_token_ids[batch_idx] = token_ids_single_batch
     if batch_size >= 2:
@@ -95,26 +75,27 @@ def _create_bad_words_token_ids(
 # Returns all last tokens of bad word sequences that share the same prefix
 # as `given_prefix` (excluding the last token).
 def _collect_suffixes_with_same_prefix(
-        given_prefix: list[int],
-        bad_words_token_ids: list[list[int]]) -> list[int]:
+    given_prefix: list[int], bad_words_token_ids: list[list[int]]
+) -> list[int]:
     return [bwt[-1] for bwt in bad_words_token_ids if bwt[:-1] == given_prefix]
 
 
 # generate a valid token id that is not in bad_words_token_ids
-def _generate_valid_token_id(bad_words_token_ids: list[list[int]],
-                             vocab_size: int) -> int:
+def _generate_valid_token_id(
+    bad_words_token_ids: list[list[int]], vocab_size: int
+) -> int:
     forbidden_start_tokens = set()
     for bad_word in bad_words_token_ids:
         forbidden_start_tokens.add(bad_word[0])
     # Get a safe token that's not in forbidden starts
-    safe_token_candidates = list(
-        set(range(vocab_size)) - forbidden_start_tokens)
+    safe_token_candidates = list(set(range(vocab_size)) - forbidden_start_tokens)
     # Pick a random safe token
     return np.random.choice(safe_token_candidates)
 
 
 def _update_output_token_ids_for_bad_words(
-        metadata: SamplingMetadata, vocab_size: int) -> dict[int, list[int]]:
+    metadata: SamplingMetadata, vocab_size: int
+) -> dict[int, list[int]]:
     bad_words_last_tokens = {}
     for batch_idx, bad_words_token_ids in metadata.bad_words_token_ids.items():
         output_token_ids = metadata.output_token_ids[batch_idx]
@@ -132,12 +113,13 @@ def _update_output_token_ids_for_bad_words(
                     # Collect all last tokens from other bad words
                     # that share this prefix
                     bad_words_last_token.extend(
-                        _collect_suffixes_with_same_prefix(
-                            prefix, bad_words_token_ids))
+                        _collect_suffixes_with_same_prefix(prefix, bad_words_token_ids)
+                    )
                     break  # Maximum one update to output_token_ids
                 else:  # Make sure no accidental match to bad words
                     output_token_ids[-1] = _generate_valid_token_id(
-                        bad_words_token_ids, vocab_size)
+                        bad_words_token_ids, vocab_size
+                    )
         bad_words_last_tokens[batch_idx] = bad_words_last_token
     return bad_words_last_tokens
 
@@ -152,23 +134,26 @@ def _create_default_sampling_metadata(
     prompt_token_ids: list[list[int]] = []
     for _ in range(batch_size):
         output_token_ids.append(
-            np.random.randint(0, vocab_size, size=num_output_tokens).tolist())
+            np.random.randint(0, vocab_size, size=num_output_tokens).tolist()
+        )
         prompt_token_ids.append(
-            np.random.randint(0,
-                              vocab_size,
-                              size=np.random.randint(
-                                  1, MAX_NUM_PROMPT_TOKENS)).tolist())
+            np.random.randint(
+                0, vocab_size, size=np.random.randint(1, MAX_NUM_PROMPT_TOKENS)
+            ).tolist()
+        )
     fake_sampling_metadata = SamplingMetadata(
-        temperature=torch.full((batch_size, ), 0.0),
+        temperature=torch.full((batch_size,), 0.0),
         all_greedy=True,
         all_random=False,
         top_p=None,
         top_k=None,
         generators={},
         max_num_logprobs=0,
-        prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids,
-                                                      vocab_size, device),
+        prompt_token_ids=_create_prompt_tokens_tensor(
+            prompt_token_ids, vocab_size, device
+        ),
         output_token_ids=output_token_ids,
+        spec_token_ids=[[] for _ in range(batch_size)],
         frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
         presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
         repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
@@ -181,8 +166,8 @@ def _create_default_sampling_metadata(
 
 
 def _create_weighted_output_token_list(
-        batch_size: int,
-        vocab_size: int) -> tuple[list[list[int]], list[list[int]]]:
+    batch_size: int, vocab_size: int
+) -> tuple[list[list[int]], list[list[int]]]:
     """
     Creates an output token list where each token occurs a distinct
     number of times.
@@ -203,14 +188,13 @@ def _create_weighted_output_token_list(
     output_token_ids: list[list[int]] = []
     sorted_token_ids_in_output: list[list[int]] = []
     for _ in range(batch_size):
-        distinct_token_ids = np.random.choice(vocab_size,
-                                              size=np.random.randint(1, 10),
-                                              replace=False).tolist()
+        distinct_token_ids = np.random.choice(
+            vocab_size, size=np.random.randint(1, 10), replace=False
+        ).tolist()
         sorted_token_ids_in_output.append(distinct_token_ids)
         output_token_ids_for_batch = []
         for index, token_id in enumerate(distinct_token_ids):
-            output_token_ids_for_batch.extend(
-                [token_id for _ in range(index + 1)])
+            output_token_ids_for_batch.extend([token_id for _ in range(index + 1)])
         output_token_ids.append(output_token_ids_for_batch)
     return output_token_ids, sorted_token_ids_in_output
 
@@ -218,8 +202,9 @@ def _create_weighted_output_token_list(
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
-def test_sampler_presence_penalty(device: str, batch_size: int,
-                                  presence_penalty: float):
+def test_sampler_presence_penalty(
+    device: str, batch_size: int, presence_penalty: float
+):
     """
     Test to verify that if presence penalty is enabled then tokens
     are penalized as per their presence in the existing output.
@@ -229,13 +214,17 @@ def test_sampler_presence_penalty(device: str, batch_size: int,
     # logit value.
     fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
     sampling_metadata = _create_default_sampling_metadata(
-        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
+    )
     output_token_ids = sampling_metadata.output_token_ids
     sampling_metadata.presence_penalties = _create_penalty_tensor(
-        batch_size, presence_penalty, torch.device(device))
+        batch_size, presence_penalty, torch.device(device)
+    )
     sampling_metadata.no_penalties = False
     sampler = Sampler()
-    logits = sampler.apply_penalties(fake_logits, sampling_metadata)
+    logits = sampler.apply_penalties(
+        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
+    )
     logits = logits.cpu()
     for batch_idx in range(batch_size):
         # Since all tokens initially have the same logits, the non-penalized
@@ -263,8 +252,9 @@ def test_sampler_presence_penalty(device: str, batch_size: int,
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
-def test_sampler_frequency_penalty(device: str, batch_size: int,
-                                   frequency_penalty: float):
+def test_sampler_frequency_penalty(
+    device: str, batch_size: int, frequency_penalty: float
+):
     """
     Test to verify that if frequency penalty is enabled then tokens are
     penalized as per their frequency of occurrence.
@@ -274,34 +264,36 @@ def test_sampler_frequency_penalty(device: str, batch_size: int,
     # logit value.
     fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
     sampling_metadata = _create_default_sampling_metadata(
-        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
+    )
     sampling_metadata.frequency_penalties = _create_penalty_tensor(
-        batch_size, frequency_penalty, torch.device(device))
-    output_token_ids, sorted_token_ids_in_output = \
-        _create_weighted_output_token_list(
-            batch_size,
-            VOCAB_SIZE,
-        )
+        batch_size, frequency_penalty, torch.device(device)
+    )
+    output_token_ids, sorted_token_ids_in_output = _create_weighted_output_token_list(
+        batch_size,
+        VOCAB_SIZE,
+    )
     sampling_metadata.output_token_ids = output_token_ids
     sampling_metadata.no_penalties = False
     sampler = Sampler()
-    logits = sampler.apply_penalties(fake_logits, sampling_metadata)
+    logits = sampler.apply_penalties(
+        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
+    )
     logits = logits.cpu()
     for batch_idx in range(batch_size):
         non_penalized_token_id = logits[batch_idx].argmax().item()
         penalized_token_id = logits[batch_idx].argmin().item()
-        distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[
-            batch_idx]
+        distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[batch_idx]
         most_frequent_token_id = distinct_sorted_token_ids_in_output[
-            len(distinct_sorted_token_ids_in_output) - 1]
+            len(distinct_sorted_token_ids_in_output) - 1
+        ]
         if frequency_penalty > 0:
             # If `frequency_penalty` is set to > 0, it indicates
             # a preference for new tokens over existing ones. Verify that the
             # non-penalized token ID is not present in the output, while the
             # most penalized token is the one that occurs most frequently in
             # the output.
-            assert (non_penalized_token_id
-                    not in distinct_sorted_token_ids_in_output)
+            assert non_penalized_token_id not in distinct_sorted_token_ids_in_output
             assert penalized_token_id == most_frequent_token_id
         elif frequency_penalty < 0:
             # If `frequency_penalty` is set to < 0, it indicates
@@ -316,8 +308,9 @@ def test_sampler_frequency_penalty(device: str, batch_size: int,
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
-def test_sampler_repetition_penalty(device: str, batch_size: int,
-                                    repetition_penalty: float):
+def test_sampler_repetition_penalty(
+    device: str, batch_size: int, repetition_penalty: float
+):
     """
     Test to verify that when the repetition penalty is enabled, tokens
     are penalized based on their presence in the prompt or the existing
@@ -328,42 +321,54 @@ def test_sampler_repetition_penalty(device: str, batch_size: int,
     # logit value.
     fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
     sampling_metadata = _create_default_sampling_metadata(
-        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
+    )
     sampling_metadata.repetition_penalties = _create_penalty_tensor(
-        batch_size, repetition_penalty, torch.device(device))
+        batch_size, repetition_penalty, torch.device(device)
+    )
     sampling_metadata.no_penalties = False
     sampler = Sampler()
-    logits = sampler.apply_penalties(fake_logits, sampling_metadata)
+    logits = sampler.apply_penalties(
+        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
+    )
     logits = logits.cpu()
     for batch_idx in range(batch_size):
         non_penalized_token_id = logits[batch_idx].argmax().item()
         penalized_token_id = logits[batch_idx].argmin().item()
-        prompt_tokens = sampling_metadata.prompt_token_ids[
-            batch_idx][:].tolist()
+        prompt_tokens = sampling_metadata.prompt_token_ids[batch_idx][:].tolist()
         output_tokens = sampling_metadata.output_token_ids[batch_idx]
         if repetition_penalty > 1.0:
             # If `repetition_penalty` > 1.0, verify that the non-penalized
             # token ID has not been seen before, while the penalized token ID
             # exists either in the prompt or the output.
-            assert (non_penalized_token_id not in prompt_tokens
-                    and non_penalized_token_id not in output_tokens)
-            assert (penalized_token_id in prompt_tokens
-                    or penalized_token_id in output_tokens)
+            assert (
+                non_penalized_token_id not in prompt_tokens
+                and non_penalized_token_id not in output_tokens
+            )
+            assert (
+                penalized_token_id in prompt_tokens
+                or penalized_token_id in output_tokens
+            )
         elif repetition_penalty < 1.0:
             # If `repetition_penalty` < 1.0, verify that the penalized
             # token ID has not been seen before, while the non-penalized
             # token ID exists either in the prompt or the output.
-            assert (penalized_token_id not in prompt_tokens
-                    and penalized_token_id not in output_tokens)
-            assert (non_penalized_token_id in prompt_tokens
-                    or non_penalized_token_id in output_tokens)
+            assert (
+                penalized_token_id not in prompt_tokens
+                and penalized_token_id not in output_tokens
+            )
+            assert (
+                non_penalized_token_id in prompt_tokens
+                or non_penalized_token_id in output_tokens
+            )
 
 
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2])
-def test_sampler_allowed_token_ids(device: str, batch_size: int,
-                                   num_allowed_token_ids: int):
+def test_sampler_allowed_token_ids(
+    device: str, batch_size: int, num_allowed_token_ids: int
+):
     """
     Test to verify that when the repetition penalty is enabled, tokens
     are penalized based on their presence in the prompt or the existing
@@ -374,8 +379,9 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int,
     # logit value.
     fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
     sampling_metadata = _create_default_sampling_metadata(
-        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
-    mask = _create_allowed_token_ids(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
+    )
+    mask = create_allowed_token_ids(
         batch_size=batch_size,
         vocab_size=VOCAB_SIZE,
         num_allowed_token_ids=num_allowed_token_ids,
@@ -383,7 +389,9 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int,
     )
     sampling_metadata.allowed_token_ids_mask = mask
     sampler = Sampler()
-    logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata)
+    logits = sampler.apply_logits_processors(
+        fake_logits, sampling_metadata, predict_bonus_token=False
+    )
     logits = logits.cpu()
     for batch_idx in range(batch_size):
         logits_for_req = logits[batch_idx]
@@ -394,17 +402,19 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int,
             start = min(batch_idx, VOCAB_SIZE - 1)
             end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1)
             if token_id >= start and token_id < end:
-                assert logits_for_req[token_id] == -float(
-                    "inf"), f"{batch_idx}, {token_id}"
+                assert logits_for_req[token_id] == -float("inf"), (
+                    f"{batch_idx}, {token_id}"
+                )
             else:
                 assert logits_for_req[token_id] != -float("inf")
 
 
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
-@pytest.mark.parametrize("bad_words_lengths", [(1, ), (1, 3), (2, 2)])
-def test_sampler_bad_words(device: str, batch_size: int,
-                           bad_words_lengths: tuple[int, ...]):
+@pytest.mark.parametrize("bad_words_lengths", [(1,), (1, 3), (2, 2)])
+def test_sampler_bad_words(
+    device: str, batch_size: int, bad_words_lengths: tuple[int, ...]
+):
     """
     Test to verify that when the bad words restriction is present, tokens
     are penalized based on their match with the bad words.
@@ -414,19 +424,26 @@ def test_sampler_bad_words(device: str, batch_size: int,
     # logit value.
     fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
     sampling_metadata = _create_default_sampling_metadata(
-        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
+    )
     sampling_metadata.bad_words_token_ids = _create_bad_words_token_ids(
-        batch_size, VOCAB_SIZE, bad_words_lengths)
+        batch_size, VOCAB_SIZE, bad_words_lengths
+    )
     bad_words_last_tokens = _update_output_token_ids_for_bad_words(
-        sampling_metadata, VOCAB_SIZE)
+        sampling_metadata, VOCAB_SIZE
+    )
     sampler = Sampler()
-    logits = sampler.apply_bad_words(fake_logits, sampling_metadata)
+    logits = sampler.apply_logits_processors(
+        fake_logits, sampling_metadata, predict_bonus_token=False
+    )
     logits = logits.cpu()
     for batch_idx in range(batch_size):
         logits_for_req = logits[batch_idx]
         for token_id in range(VOCAB_SIZE):
-            if (batch_idx in bad_words_last_tokens
-                    and token_id in bad_words_last_tokens[batch_idx]):
+            if (
+                batch_idx in bad_words_last_tokens
+                and token_id in bad_words_last_tokens[batch_idx]
+            ):
                 assert logits_for_req[token_id] == -float("inf")
             else:
                 assert logits_for_req[token_id] != -float("inf")
diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index f53e1e1c485d6a1d1c4f396652120927e6f57fc3..915b9957031d8a224b2152c7ef80e3d74cc83fcf 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -1,24 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 
 import pytest
 
 from vllm import LLM, SamplingParams
 
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 
 
 @pytest.fixture(scope="module")
 def llm() -> LLM:
-    # Disable prefix caching so that we can test prompt logprobs.
-    # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
-    # is merged
-    return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
+    return LLM(MODEL, enforce_eager=True)
 
 
 def test_n_gt_1(llm):
@@ -66,9 +59,9 @@ def test_stop(llm):
     # Output should not contain the stop word.
     assert len(new_split_text) == STOP_IDX
 
-    params = SamplingParams(temperature=0,
-                            stop=split_text[STOP_IDX],
-                            include_stop_str_in_output=True)
+    params = SamplingParams(
+        temperature=0, stop=split_text[STOP_IDX], include_stop_str_in_output=True
+    )
     output = llm.generate(PROMPT, params)
     new_split_text = output[0].outputs[0].text.split()
 
@@ -103,8 +96,8 @@ def test_detokenize_false(llm):
     assert len(output[0].outputs[0].text) == 0
 
     output = llm.generate(
-        PROMPT, SamplingParams(detokenize=False, logprobs=3,
-                               prompt_logprobs=3))
+        PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)
+    )
     assert len(output[0].outputs[0].token_ids) > 0
     assert len(output[0].outputs[0].text) == 0
 
@@ -131,8 +124,7 @@ def test_bad_words(llm):
     assert bad_words_1 not in new_text
 
     bad_words_2 = new_text.split()[-1]
-    params = SamplingParams(temperature=0,
-                            bad_words=[bad_words_1, bad_words_2])
+    params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2])
     output = llm.generate(PROMPT, params)
     new_text = output[0].outputs[0].text
     assert bad_words_1 not in new_text
@@ -158,8 +150,7 @@ def test_allowed_token_ids(llm):
 
     TOKEN_ID = 10
     allowed_token_ids = [TOKEN_ID]
-    output = llm.generate(PROMPT,
-                          SamplingParams(allowed_token_ids=allowed_token_ids))
+    output = llm.generate(PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids))
     assert output[0].outputs[0].token_ids[-1] == TOKEN_ID
 
     # Reject empty allowed_token_ids.
@@ -175,14 +166,6 @@ def test_allowed_token_ids(llm):
         _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
 
 
-def test_priority(llm):
-    """Check that we reject requests with priority."""
-
-    # Reject all allowed token ids
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, priority=[1])
-
-
 def test_seed(llm):
     """Check that seed impacts randomness."""
 
diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py
index ccf38c31d39e6f640387187f1a61e666162afafb..f50ef610220409916863497dd4abd2dfdf0a75e6 100644
--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
@@ -5,18 +5,13 @@ import torch
 from torch import Generator
 
 from vllm.platforms import current_platform
-from vllm.v1.sample.ops.topk_topp_sampler import (apply_top_k_top_p,
-                                                  is_flashinfer_available)
+from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
 
 DEVICE = current_platform.device_type
 
 BATCH_SIZE = 1024
 VOCAB_SIZE = 128 * 1024
 
-FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available
-if is_flashinfer_available:
-    from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
-
 
 @pytest.fixture(autouse=True)
 def reset_default_device():
@@ -30,19 +25,18 @@ def reset_default_device():
 
 
 def test_topk_impl_equivalence():
-
     torch.set_default_device(DEVICE)
     generator = Generator(device=DEVICE).manual_seed(33)
 
     logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
 
     # Random top-k values between 1 and 9.
-    k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator)
+    k = torch.randint(1, 10, (BATCH_SIZE,), generator=generator)
 
     # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
     k.masked_fill_(
-        torch.randint(0, 2, (BATCH_SIZE, ), generator=generator, dtype=bool),
-        VOCAB_SIZE)
+        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=bool), VOCAB_SIZE
+    )
 
     # Top-k only implementation
     result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
@@ -55,7 +49,7 @@ def test_topk_impl_equivalence():
 
 
 def test_flashinfer_sampler():
-    '''
+    """
     This test verifies that the FlashInfer top-k and top-p sampling
     implementation produces the same results as the Python implementation.
 
@@ -63,11 +57,18 @@ def test_flashinfer_sampler():
     top-p prob renorm (it did provide fused sampling but we cannot compare
     sampling results due to randomness), so we will compare the probability
     renormed consequently by top-k and then top-p of FlashInfer implementation.
-    '''
+    """
+    try:
+        from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
+
+        is_flashinfer_available = True
+    except ImportError:
+        is_flashinfer_available = False
+
+    FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available
 
     if not FLASHINFER_ENABLED:
-        pytest.skip(
-            "FlashInfer not installed or not available on this platform.")
+        pytest.skip("FlashInfer not installed or not available on this platform.")
 
     torch.set_default_device(DEVICE)
     generator = Generator(device=DEVICE).manual_seed(42)
@@ -76,23 +77,21 @@ def test_flashinfer_sampler():
     logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
 
     # Generate various top-k and top-p values
-    k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator)
-    p_values = torch.rand(
-        (BATCH_SIZE, ), generator=generator) * 0.5 + 0.5  # range in [0.5, 1.0]
+    k_values = torch.randint(1, 1000, (BATCH_SIZE,), generator=generator)
+    p_values = (
+        torch.rand((BATCH_SIZE,), generator=generator) * 0.5 + 0.5
+    )  # range in [0.5, 1.0]
 
     # Sometimes disable top-k (k=vocab_size)
     k_values.masked_fill_(
-        torch.randint(0,
-                      2, (BATCH_SIZE, ),
-                      generator=generator,
-                      dtype=torch.bool), VOCAB_SIZE)
+        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=torch.bool),
+        VOCAB_SIZE,
+    )
 
     # Sometimes disable top-p (p=1.0)
     p_values.masked_fill_(
-        torch.randint(0,
-                      2, (BATCH_SIZE, ),
-                      generator=generator,
-                      dtype=torch.bool), 1.0)
+        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=torch.bool), 1.0
+    )
 
     python_logits = apply_top_k_top_p(
         logits=logits.clone(),
@@ -113,5 +112,6 @@ def test_flashinfer_sampler():
     )
 
     # Compare the results
-    assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \
+    assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), (
         "FlashInfer and Python sampling implementations do not match!"
+    )
diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py
index e33efb413d0260bf78357204885565194f721a9f..a0abb3b4c6ce23f498b5f1f78edec2268e46e53f 100644
--- a/tests/v1/sample/utils.py
+++ b/tests/v1/sample/utils.py
@@ -3,33 +3,34 @@
 
 from collections.abc import Iterator
 from enum import Enum
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 import regex as re
 import torch
 
 from vllm import CompletionOutput
-from vllm.utils import make_tensor_with_pad
+from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor
 from vllm.v1.sample.metadata import SamplingMetadata
 
 
 class BatchLogprobsComposition(Enum):
     """Types of logprobs configs to include in test batch"""
+
     NONE = 0
     SAMPLE = 1
     PROMPT = 2
     SAMPLE_PROMPT = 3
 
 
-BatchLogprobsSpecType = list[tuple[Optional[int], Optional[int]]]
+BatchLogprobsSpecType = list[tuple[int | None, int | None]]
 
 
 def get_test_batch(
-    batch_logprobs_composition: BatchLogprobsComposition
+    batch_logprobs_composition: BatchLogprobsComposition,
 ) -> BatchLogprobsSpecType:
     """Generate logprobs configs for a batch of requests
-    
+
     A given request's logprobs configuration is (1) num_sample_logprobs and (2)
     num_prompt_logprobs. The batch logprobs configuration is the list of request
     logprobs configs.
@@ -101,7 +102,7 @@ def assert_incr_detok_str_matches_non_incr_detok_str(
     msg: str,
 ) -> None:
     """Compare incrementally detok. text to non-incrementally detok. text
-    
+
     Fail if the strings mismatch after non-alphanumeric characters are stripped
     out.
 
@@ -120,15 +121,15 @@ def assert_incr_detok_str_matches_non_incr_detok_str(
                                           tokens
       msg: error message if `assert` fails
     """
-    rgx = r'[^a-zA-Z0-9]+'
-    assert (re.sub(rgx, '', incremental_detokenization_str) == re.sub(
-        rgx, '', non_incremental_detokenization_str)), (msg)
+    rgx = r"[^a-zA-Z0-9]+"
+    assert re.sub(rgx, "", incremental_detokenization_str) == re.sub(
+        rgx, "", non_incremental_detokenization_str
+    ), msg
 
 
-def compute_correct_cumulative_logprob(
-        completion_output: CompletionOutput) -> float:
+def compute_correct_cumulative_logprob(completion_output: CompletionOutput) -> float:
     """Compute known-good value for evaluating cumulative logprob
-    
+
     Args:
       completion_output: completion output from engine
 
@@ -146,12 +147,12 @@ def create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
     return fake_logits
 
 
-def create_penalty_tensor(batch_size: int, penalty_value: float,
-                          device: torch.device) -> torch.Tensor:
-    return torch.full((batch_size, ),
-                      fill_value=penalty_value,
-                      dtype=torch.float,
-                      device=device)
+def create_penalty_tensor(
+    batch_size: int, penalty_value: float, device: torch.device
+) -> torch.Tensor:
+    return torch.full(
+        (batch_size,), fill_value=penalty_value, dtype=torch.float, device=device
+    )
 
 
 def create_prompt_tokens_tensor(
@@ -170,6 +171,7 @@ def create_prompt_tokens_tensor(
 
 class LogitsprocsTestFakes(NamedTuple):
     """Wraps fake data structures to support testing"""
+
     logits: torch.Tensor
     sampling_metadata: SamplingMetadata
 
@@ -178,15 +180,16 @@ class LogitsprocsTestFakes(NamedTuple):
         cls: type[LogitsProcessor],
     ) -> Iterator[LogitsProcessor]:
         """Yield logits processors of a specific class.
-        
+
         Args:
           cls: :class:`LogitsProcessor` subclass
 
         Returns:
           Iterator over logits processors
         """
-        return (lp for lp in self.sampling_metadata.logitsprocs.all
-                if isinstance(lp, cls))
+        return (
+            lp for lp in self.sampling_metadata.logitsprocs.all if isinstance(lp, cls)
+        )
 
     def get_logitsprocs(self) -> Iterator[LogitsProcessor]:
         """Iterator over all logits processors."""
@@ -208,8 +211,27 @@ def fake_apply_logitsprocs(
     slice_indices: list[int],
 ) -> torch.Tensor:
     """Imitate application of logits processors in engine core"""
-    logits = test_fakes.logits[torch.tensor(slice_indices,
-                                            dtype=torch.long)].clone()
+    logits = test_fakes.logits[torch.tensor(slice_indices, dtype=torch.long)].clone()
     for processor in test_fakes.get_logitsprocs():
         logits = processor.apply(logits)
     return logits
+
+
+def create_allowed_token_ids(
+    batch_size: int,
+    vocab_size: int,
+    num_allowed_token_ids: int,
+    device: torch.device,
+) -> torch.Tensor | None:
+    mask: torch.Tensor | None = None
+    for i in range(batch_size):
+        if i % 2 == 1:
+            continue
+        if mask is None:
+            mask = torch.zeros(
+                (batch_size, vocab_size), dtype=torch.bool, device=device
+            )
+        start = min(i, vocab_size - 1)
+        end = min(i + num_allowed_token_ids, vocab_size - 1)
+        mask[i, start:end] = True
+    return mask
diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py
index 682d84dc23d12fda6e3f208b2b44444840787ec6..ee04dfad39066e4628775494dd9684522cb6d8ab 100644
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
@@ -5,15 +5,17 @@
 import pytest
 
 from tests.utils import wait_for_gpu_memory_to_clear
-from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES,
-                                     SHUTDOWN_TEST_TIMEOUT_SEC)
+from tests.v1.shutdown.utils import (
+    SHUTDOWN_TEST_THRESHOLD_BYTES,
+    SHUTDOWN_TEST_TIMEOUT_SEC,
+)
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.sampling_params import RequestOutputKind
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 @pytest.mark.asyncio
@@ -21,8 +23,9 @@ MODELS = ["meta-llama/Llama-3.2-1B"]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("send_one_request", [False, True])
-async def test_async_llm_delete(model: str, tensor_parallel_size: int,
-                                send_one_request: bool) -> None:
+async def test_async_llm_delete(
+    model: str, tensor_parallel_size: int, send_one_request: bool
+) -> None:
     """Test that AsyncLLM frees GPU memory upon deletion.
     AsyncLLM always uses an MP client.
 
@@ -34,19 +37,21 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int,
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(reason="Not enough CUDA devices")
 
-    engine_args = AsyncEngineArgs(model=model,
-                                  enforce_eager=True,
-                                  tensor_parallel_size=tensor_parallel_size)
+    engine_args = AsyncEngineArgs(
+        model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
+    )
 
     # Instantiate AsyncLLM; make request to complete any deferred
     # initialization; then delete instance
     async_llm = AsyncLLM.from_engine_args(engine_args)
     if send_one_request:
         async for _ in async_llm.generate(
-                "Hello my name is",
-                request_id="abc",
-                sampling_params=SamplingParams(
-                    max_tokens=1, output_kind=RequestOutputKind.DELTA)):
+            "Hello my name is",
+            request_id="abc",
+            sampling_params=SamplingParams(
+                max_tokens=1, output_kind=RequestOutputKind.DELTA
+            ),
+        ):
             pass
     del async_llm
 
@@ -62,9 +67,13 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int,
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("enable_multiprocessing", [True])
 @pytest.mark.parametrize("send_one_request", [False, True])
-def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
-                    enable_multiprocessing: bool,
-                    send_one_request: bool) -> None:
+def test_llm_delete(
+    monkeypatch,
+    model: str,
+    tensor_parallel_size: int,
+    enable_multiprocessing: bool,
+    send_one_request: bool,
+) -> None:
     """Test that LLM frees GPU memory upon deletion.
     TODO(andy) - LLM without multiprocessing.
 
@@ -83,12 +92,13 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
 
         # Instantiate LLM; make request to complete any deferred
         # initialization; then delete instance
-        llm = LLM(model=model,
-                  enforce_eager=True,
-                  tensor_parallel_size=tensor_parallel_size)
+        llm = LLM(
+            model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
+        )
         if send_one_request:
-            llm.generate("Hello my name is",
-                         sampling_params=SamplingParams(max_tokens=1))
+            llm.generate(
+                "Hello my name is", sampling_params=SamplingParams(max_tokens=1)
+            )
         del llm
 
         # Confirm all the processes are cleaned up.
diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py
index 523b7ee231151b05dd960c51514bd6b5f2c752f4..a751b2d919e1a85643e5ba95c46f066b23946ea7 100644
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -7,16 +7,18 @@ import asyncio
 import pytest
 
 from tests.utils import wait_for_gpu_memory_to_clear
-from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES,
-                                     SHUTDOWN_TEST_TIMEOUT_SEC)
+from tests.v1.shutdown.utils import (
+    SHUTDOWN_TEST_THRESHOLD_BYTES,
+    SHUTDOWN_TEST_TIMEOUT_SEC,
+)
 from vllm import LLM, AsyncEngineArgs, SamplingParams
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.models.llama import LlamaForCausalLM
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 def evil_forward(self, *args, **kwargs):
@@ -26,8 +28,10 @@ def evil_forward(self, *args, **kwargs):
     if not hasattr(self, "num_calls"):
         self.num_calls = 0
 
-    if (self.num_calls == NUMBER_OF_GOOD_PASSES
-            and get_tensor_model_parallel_rank() == 0):
+    if (
+        self.num_calls == NUMBER_OF_GOOD_PASSES
+        and get_tensor_model_parallel_rank() == 0
+    ):
         raise Exception("Simulated illegal memory access on Rank 0!")
     self.num_calls += 1
 
@@ -37,10 +41,11 @@ def evil_forward(self, *args, **kwargs):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("model", MODELS)
-async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int,
-                                     model: str) -> None:
+async def test_async_llm_model_error(
+    monkeypatch, tensor_parallel_size: int, model: str
+) -> None:
     """Test that AsyncLLM propagates a forward pass error and frees memory.
-    
+
     AsyncLLM always uses an MP client.
     """
     if cuda_device_count_stateless() < tensor_parallel_size:
@@ -49,15 +54,15 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int,
     # Monkeypatch an error in the model.
     monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward)
 
-    engine_args = AsyncEngineArgs(model=model,
-                                  enforce_eager=True,
-                                  tensor_parallel_size=tensor_parallel_size)
+    engine_args = AsyncEngineArgs(
+        model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
+    )
     async_llm = AsyncLLM.from_engine_args(engine_args)
 
     async def generate(request_id: str):
-        generator = async_llm.generate("Hello my name is",
-                                       request_id=request_id,
-                                       sampling_params=SamplingParams())
+        generator = async_llm.generate(
+            "Hello my name is", request_id=request_id, sampling_params=SamplingParams()
+        )
         try:
             async for _ in generator:
                 pass
@@ -77,9 +82,9 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int,
 
     # We should not be able to make another request.
     with pytest.raises(EngineDeadError):
-        async for _ in async_llm.generate("Hello my name is",
-                                          request_id="abc",
-                                          sampling_params=SamplingParams()):
+        async for _ in async_llm.generate(
+            "Hello my name is", request_id="abc", sampling_params=SamplingParams()
+        ):
             raise Exception("We should not get here.")
 
     # Confirm all the processes are cleaned up.
@@ -98,8 +103,9 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int,
 @pytest.mark.parametrize("enable_multiprocessing", [True])
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("model", MODELS)
-def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
-                         enable_multiprocessing: bool, model: str) -> None:
+def test_llm_model_error(
+    monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str
+) -> None:
     """Test that LLM propagates a forward pass error and frees memory.
     TODO(andy) - LLM without multiprocessing; LLM with multiprocessing
     and >1 rank
@@ -108,19 +114,17 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
         pytest.skip(reason="Not enough CUDA devices")
 
     with monkeypatch.context() as m:
-
         MP_VALUE = "1" if enable_multiprocessing else "0"
         m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE)
 
         # Monkeypatch an error in the model.
         m.setattr(LlamaForCausalLM, "forward", evil_forward)
 
-        llm = LLM(model=model,
-                  enforce_eager=True,
-                  tensor_parallel_size=tensor_parallel_size)
+        llm = LLM(
+            model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
+        )
 
-        with pytest.raises(
-                EngineDeadError if enable_multiprocessing else Exception):
+        with pytest.raises(EngineDeadError if enable_multiprocessing else Exception):
             llm.generate("Hello my name is Robert and I")
 
         # Confirm all the processes are cleaned up.
diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py
index a077d48fecbba2bad4a1f1813beabf557e590199..013b929e3df658796594c5c4634b6e369c578c31 100644
--- a/tests/v1/shutdown/test_processor_error.py
+++ b/tests/v1/shutdown/test_processor_error.py
@@ -30,9 +30,9 @@ async def test_async_llm_processor_error(model: str) -> None:
 
     async def generate(request_id: str):
         # [] is not allowed and will raise a ValueError in Processor.
-        generator = async_llm.generate(TokensPrompt([]),
-                                       request_id=request_id,
-                                       sampling_params=SamplingParams())
+        generator = async_llm.generate(
+            TokensPrompt([]), request_id=request_id, sampling_params=SamplingParams()
+        )
         try:
             async for _ in generator:
                 pass
@@ -55,11 +55,12 @@ async def test_async_llm_processor_error(model: str) -> None:
     EXPECTED_TOKENS = 5
     outputs = []
     async for out in async_llm.generate(
-            "Hello my name is",
-            request_id="abc",
-            sampling_params=SamplingParams(
-                max_tokens=EXPECTED_TOKENS,
-                output_kind=RequestOutputKind.DELTA)):
+        "Hello my name is",
+        request_id="abc",
+        sampling_params=SamplingParams(
+            max_tokens=EXPECTED_TOKENS, output_kind=RequestOutputKind.DELTA
+        ),
+    ):
         outputs.append(out)
 
     generated_tokens = []
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py
index 88fc5297aaf5054fb6a926568d5ef6cda59114eb..c1594cc2e8b760df3663f46e672a4a2e36678a94 100644
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
@@ -5,16 +5,18 @@
 import pytest
 
 from tests.utils import wait_for_gpu_memory_to_clear
-from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES,
-                                     SHUTDOWN_TEST_TIMEOUT_SEC)
+from tests.v1.shutdown.utils import (
+    SHUTDOWN_TEST_THRESHOLD_BYTES,
+    SHUTDOWN_TEST_TIMEOUT_SEC,
+)
 from vllm import LLM
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.model_executor.models.llama import LlamaForCausalLM
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 def evil_method(self, *args, **kwargs):
@@ -30,9 +32,9 @@ def evil_method(self, *args, **kwargs):
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("failing_method", ["forward", "load_weights"])
-def test_async_llm_startup_error(monkeypatch, model: str,
-                                 tensor_parallel_size: int,
-                                 failing_method: str) -> None:
+def test_async_llm_startup_error(
+    monkeypatch, model: str, tensor_parallel_size: int, failing_method: str
+) -> None:
     """Test that AsyncLLM propagates an __init__ error & frees memory.
     Test profiling (forward()) and load weights failures.
     AsyncLLM always uses an MP client.
@@ -43,9 +45,9 @@ def test_async_llm_startup_error(monkeypatch, model: str,
     # Monkeypatch an error in the model.
     monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method)
 
-    engine_args = AsyncEngineArgs(model=model,
-                                  enforce_eager=True,
-                                  tensor_parallel_size=tensor_parallel_size)
+    engine_args = AsyncEngineArgs(
+        model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
+    )
 
     # Confirm we get an exception.
     with pytest.raises(Exception, match="initialization failed"):
@@ -63,20 +65,25 @@ def test_async_llm_startup_error(monkeypatch, model: str,
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("enable_multiprocessing", [True])
 @pytest.mark.parametrize("failing_method", ["forward", "load_weights"])
-def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
-                           enable_multiprocessing: bool,
-                           failing_method: str) -> None:
+def test_llm_startup_error(
+    monkeypatch,
+    model: str,
+    tensor_parallel_size: int,
+    enable_multiprocessing: bool,
+    failing_method: str,
+) -> None:
     """Test that LLM propagates an __init__ error and frees memory.
     Test profiling (forward()) and load weights failures.
     TODO(andy) - LLM without multiprocessing.
     """
-    if model != "meta-llama/Llama-3.2-1B":
-        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
+    # Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
+    # If MODELS list grows, each architecture needs its own test variant.
+    if model != "JackFram/llama-68m":
+        pytest.skip(reason="Only test JackFram/llama-68m")
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(reason="Not enough CUDA devices")
 
     with monkeypatch.context() as m:
-
         MP_VALUE = "1" if enable_multiprocessing else "0"
         m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE)
 
@@ -84,12 +91,16 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
         monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method)
 
         with pytest.raises(
-                Exception,
-                match="initialization failed"
-                if enable_multiprocessing else "Simulated Error in startup!"):
-            _ = LLM(model=model,
-                    enforce_eager=True,
-                    tensor_parallel_size=tensor_parallel_size)
+            Exception,
+            match="initialization failed"
+            if enable_multiprocessing
+            else "Simulated Error in startup!",
+        ):
+            _ = LLM(
+                model=model,
+                enforce_eager=True,
+                tensor_parallel_size=tensor_parallel_size,
+            )
 
         # Confirm all the processes are cleaned up.
         wait_for_gpu_memory_to_clear(
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 49311c0005e751d4a7877c7c87e29ec58a17c3e9..e7da11f3c02810490b8bfe131498e7e9c3391e4f 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -1,20 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 from unittest import mock
 
+import numpy as np
 import pytest
 import torch
 
 from tests.utils import get_attn_backend_list_based_on_platform
-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      get_attention_backend)
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VllmConfig)
+from tests.v1.attention.utils import (
+    BatchSpec,
+    create_common_attn_metadata,
+    create_standard_kv_cache_spec,
+    try_get_attention_backend,
+)
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
 from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
@@ -30,11 +39,9 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
 def _create_proposer(
     method: str,
     num_speculative_tokens: int,
-    speculative_token_tree: Optional[list[tuple[int, ...]]] = None,
+    speculative_token_tree: list[tuple[int, ...]] | None = None,
 ) -> EagleProposer:
-    model_config = ModelConfig(model=model_dir,
-                               runner="generate",
-                               max_model_len=100)
+    model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100)
 
     # Choose model directory based on method
     draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
@@ -60,10 +67,10 @@ def _create_proposer(
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig())
+        scheduler_config=SchedulerConfig(),
+    )
 
-    return EagleProposer(vllm_config=vllm_config,
-                         device=current_platform.device_type)
+    return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
 
 
 def test_prepare_next_token_ids():
@@ -82,7 +89,7 @@ def test_prepare_next_token_ids():
         query_lens=[num_speculative_tokens + 1] * num_requests,
     )
 
-    req_ids = [f"req_{i+1}" for i in range(num_requests)]
+    req_ids = [f"req_{i + 1}" for i in range(num_requests)]
     mock_input_batch = mock.MagicMock(spec=InputBatch)
     mock_input_batch.req_ids = req_ids
     mock_input_batch.num_reqs = num_requests
@@ -101,24 +108,28 @@ def test_prepare_next_token_ids():
         [0, 1, -1, -1, -1],  # 1 accepted, 3 rejected, "1" sampled
         [0, 1, 2, 3, 4],  # all accepted, "4" sampled
         [-1, -1, -1, -1, -1],  # sampling skipped, use backup token "30"
-        [-1, -1, -1, -1, -1]  # this request will be discarded
+        [-1, -1, -1, -1, -1],  # this request will be discarded
+    ]
+    sampled_token_ids_tensor = torch.tensor(
+        sampled_token_ids, dtype=torch.int32, device=device
+    )
+    sampled_token_ids_cpu = [
+        np.array([i for i in seq if i != -1]) for seq in sampled_token_ids
     ]
-    sampled_token_ids_tensor = torch.tensor(sampled_token_ids,
-                                            dtype=torch.int32,
-                                            device=device)
-    sampled_token_ids_cpu = [[i for i in seq if i != -1]
-                             for seq in sampled_token_ids]
 
     expected_next_token_ids_cpu = [1, 4, 30, 40]
-    expected_next_token_ids_tensor = torch.tensor(expected_next_token_ids_cpu,
-                                                  dtype=torch.int32,
-                                                  device=device)
+    expected_next_token_ids_tensor = torch.tensor(
+        expected_next_token_ids_cpu, dtype=torch.int32, device=device
+    )
 
     proposer = _create_proposer("eagle", num_speculative_tokens)
 
     next_token_ids_from_cpu = proposer.prepare_next_token_ids_cpu(
-        sampled_token_ids_cpu, mock_requests, mock_input_batch,
-        mock_num_scheduled_tokens)
+        sampled_token_ids_cpu,
+        mock_requests,
+        mock_input_batch,
+        mock_num_scheduled_tokens,
+    )
 
     assert torch.equal(next_token_ids_from_cpu, expected_next_token_ids_tensor)
 
@@ -131,19 +142,23 @@ def test_prepare_next_token_ids():
     discarded_req_indices = torch.tensor([3], dtype=torch.int64, device=device)
     num_discarded_reqs = 1
 
-    expected_valid_sampled_tokens_count = torch.tensor([2, 5, 0, 0],
-                                                       dtype=torch.int32,
-                                                       device=device)
+    expected_valid_sampled_tokens_count = torch.tensor(
+        [2, 5, 0, 0], dtype=torch.int32, device=device
+    )
 
-    next_token_ids_from_padded, valid_sampled_tokens_count = \
+    next_token_ids_from_padded, valid_sampled_tokens_count = (
         proposer.prepare_next_token_ids_padded(
-            common_attn_metadata, sampled_token_ids_tensor, mock_requests,
-            mock_input_batch, discarded_req_indices, num_discarded_reqs)
+            common_attn_metadata,
+            sampled_token_ids_tensor,
+            mock_requests,
+            mock_input_batch,
+            discarded_req_indices,
+            num_discarded_reqs,
+        )
+    )
 
-    assert torch.equal(next_token_ids_from_padded,
-                       expected_next_token_ids_tensor)
-    assert torch.equal(valid_sampled_tokens_count,
-                       expected_valid_sampled_tokens_count)
+    assert torch.equal(next_token_ids_from_padded, expected_next_token_ids_tensor)
+    assert torch.equal(valid_sampled_tokens_count, expected_valid_sampled_tokens_count)
 
 
 def test_prepare_inputs():
@@ -183,21 +198,27 @@ def test_prepare_inputs():
     sampled_token_ids = [
         [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, BONUS_TOKEN],
         [
-            ACCEPT_TOKEN, ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN,
-            REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN
+            ACCEPT_TOKEN,
+            ACCEPT_TOKEN,
+            ACCEPT_TOKEN,
+            REJECT_TOKEN,
+            REJECT_TOKEN,
+            REJECT_TOKEN,
+            BONUS_TOKEN,
         ],
-        [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN]
+        [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN],
+    ]
+    sampled_token_ids = [
+        [i for i in seq if i != REJECT_TOKEN] for seq in sampled_token_ids
     ]
-    sampled_token_ids = [[i for i in seq if i != REJECT_TOKEN]
-                         for seq in sampled_token_ids]
 
     # Expected calculations:
     # query_len_per_req = [4, 7, 5]
     # num_tokens_per_req = [3, 4, 3]  (after subtracting rejected tokens)
     # Expected cumulative counts: [0, 3, 7, 10]
-    expected_cu_num_tokens = torch.tensor([0, 3, 7, 10],
-                                          dtype=torch.int32,
-                                          device=device)
+    expected_cu_num_tokens = torch.tensor(
+        [0, 3, 7, 10], dtype=torch.int32, device=device
+    )
 
     # Expected token indices (mapped from original positions):
     # First request: indices 0, 1, 2      (keeping first 3 from positions 0-3)
@@ -214,17 +235,18 @@ def test_prepare_inputs():
             7,  # Second request: 4 tokens (7-3)
             11,
             12,
-            13  # Third request: 3 tokens (5-2)
+            13,  # Third request: 3 tokens (5-2)
         ],
         dtype=torch.int32,
-        device=device)
+        device=device,
+    )
     proposer = _create_proposer("eagle", 1)
 
     updated_metadata, token_indices = proposer.prepare_inputs(
-        common_attn_metadata, sampled_token_ids, num_draft_tokens)
+        common_attn_metadata, sampled_token_ids, num_draft_tokens
+    )
 
-    assert torch.equal(updated_metadata.query_start_loc,
-                       expected_cu_num_tokens)
+    assert torch.equal(updated_metadata.query_start_loc, expected_cu_num_tokens)
     assert token_indices.shape[0] == expected_cu_num_tokens[-1].item()
     assert torch.equal(token_indices, expected_token_indices)
 
@@ -249,12 +271,12 @@ def test_prepare_inputs_padded():
 
     device = torch.device(current_platform.device_type)
 
-    expected_token_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8],
-                                          dtype=torch.int32,
-                                          device=device)
-    expected_token_indices_to_sample = torch.tensor([1, 5, 6],
-                                                    dtype=torch.int32,
-                                                    device=device)
+    expected_token_indices = torch.tensor(
+        [0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.int32, device=device
+    )
+    expected_token_indices_to_sample = torch.tensor(
+        [1, 5, 6], dtype=torch.int32, device=device
+    )
 
     num_speculative_tokens = 2
     batch_spec = BatchSpec(
@@ -269,9 +291,9 @@ def test_prepare_inputs_padded():
     )
 
     # Needed for cu_num_draft_tokens, which is expected to be [3, 6, 9]
-    expected_query_start_loc = torch.tensor([0, 3, 6, 9],
-                                            dtype=torch.int32,
-                                            device=device)
+    expected_query_start_loc = torch.tensor(
+        [0, 3, 6, 9], dtype=torch.int32, device=device
+    )
     spec_decode_metadata = SpecDecodeMetadata.make_dummy(
         draft_token_ids=[[0] * num_speculative_tokens] * 3,
         device=device,
@@ -280,75 +302,85 @@ def test_prepare_inputs_padded():
     # num_rejected_tokens = [1, 0, 2]
     # num_draft_tokens = [2, 2, 2]
     # valid_sampled_tokens_count = num_draft_tokens + 1 - num_rejected_tokens
-    valid_sampled_tokens_count = torch.tensor([2, 3, 1],
-                                              dtype=torch.int32,
-                                              device=device)
+    valid_sampled_tokens_count = torch.tensor(
+        [2, 3, 1], dtype=torch.int32, device=device
+    )
 
     proposer = _create_proposer("eagle", num_speculative_tokens)
 
-    output_metadata, token_indices, token_indices_to_sample = \
+    output_metadata, token_indices, token_indices_to_sample = (
         proposer.prepare_inputs_padded(
-            common_attn_metadata,
-            spec_decode_metadata,
-            valid_sampled_tokens_count)
+            common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count
+        )
+    )
 
     assert output_metadata.max_query_len == 3
-    assert torch.equal(output_metadata.query_start_loc,
-                       expected_query_start_loc)
+    assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc)
     assert torch.equal(token_indices, expected_token_indices)
-    assert torch.equal(token_indices_to_sample,
-                       expected_token_indices_to_sample)
+    assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample)
 
 
 @pytest.mark.parametrize("method", ["eagle", "eagle3"])
-@pytest.mark.parametrize("attn_backend",
-                         get_attn_backend_list_based_on_platform())
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
 @pytest.mark.parametrize("pp_size", [1, 2])
 @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
-@mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
-@mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
-@mock.patch('vllm.v1.spec_decode.eagle.get_model')
-def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
-                    attn_backend, pp_size, use_distinct_embed_tokens,
-                    monkeypatch):
-
+@pytest.mark.parametrize("use_distinct_lm_head", [True, False])
+@mock.patch("vllm.v1.spec_decode.eagle.get_pp_group")
+@mock.patch("vllm.v1.spec_decode.eagle.get_layers_from_vllm_config")
+@mock.patch("vllm.v1.spec_decode.eagle.get_model")
+def test_load_model(
+    mock_get_model,
+    mock_get_layers,
+    mock_get_pp_group,
+    method,
+    attn_backend,
+    pp_size,
+    use_distinct_embed_tokens,
+    use_distinct_lm_head,
+    monkeypatch,
+):
     monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
 
-    if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
-        pytest.skip("TRITON_ATTN does not support "
-                    "multi-token eagle spec decode on current platform")
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
 
     if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
     # Setup draft model mock
     mock_model = mock.MagicMock()
+    mock_model.model = mock.MagicMock()
+    mock_model.has_own_embed_tokens = use_distinct_embed_tokens
     if use_distinct_embed_tokens:
-        # Some models can have a different hidden size than the target model,
-        # so we test that their embed_tokens doesn't get overwritten
-        mock_model.model.embed_tokens.weight.shape = (131072, 2048)
-    else:
-        mock_model.model.embed_tokens.weight.shape = (131072, 4096)
+        mock_model.model.embed_tokens = mock.MagicMock()
+    mock_model.has_own_lm_head = use_distinct_lm_head
+    if use_distinct_lm_head:
+        mock_model.lm_head = mock.MagicMock()
 
     mock_get_model.return_value = mock_model
 
     # Setup mocks for attention layers
     target_attn_layers = {
         "target_attn_1": mock.MagicMock(),
-        "target_attn_2": mock.MagicMock()
+        "target_attn_2": mock.MagicMock(),
     }
     target_indx_layers: dict[str, mock.MagicMock] = {}
     # Draft model has one extra attention layer compared to target model
-    all_attn_layers = {
-        **target_attn_layers, "draft_extra_attn": mock.MagicMock()
-    }
+    all_attn_layers = {**target_attn_layers, "draft_extra_attn": mock.MagicMock()}
+
+    all_indx_layers: dict[str, mock.MagicMock] = {}
 
     all_indx_layers: dict[str, mock.MagicMock] = {}
 
     # Make mock_get_layers return different values for each call
     mock_get_layers.side_effect = [
-        target_attn_layers, target_indx_layers, all_attn_layers,
-        all_indx_layers
+        target_attn_layers,
+        target_indx_layers,
+        all_attn_layers,
+        all_indx_layers,
     ]
 
     # Setup mock for pp group to return the appropriate value for world size
@@ -364,13 +396,12 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
 
     target_model = mock.create_autospec(_TargetModelStub, instance=True)
     target_model.model = mock.MagicMock()
-    target_model.model.embed_tokens.weight.shape = (131072, 4096)
+    target_model.lm_head = mock.MagicMock()
+    target_model.model.embed_tokens = mock.MagicMock()
 
     from vllm.model_executor.models import SupportsMultiModal
-    assert not isinstance(target_model, SupportsMultiModal)
 
-    if method == "eagle":
-        target_model.lm_head = mock.MagicMock()
+    assert not isinstance(target_model, SupportsMultiModal)
 
     # Create proposer using the helper function
     proposer = _create_proposer(method, num_speculative_tokens=8)
@@ -381,37 +412,37 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
     # Verify common interactions
     mock_get_model.assert_called_once()
 
-    # Verify that EAGLE models gain the lm head from the target model
-    if method == "eagle":
-        assert proposer.model.lm_head == target_model.lm_head
+    # Verify that the lm head is set correctly
+    if use_distinct_lm_head:
+        assert proposer.model.lm_head is not target_model.lm_head
+    else:
+        assert proposer.model.lm_head is target_model.lm_head
 
     # Verify that the embed tokens are set correctly
     # If pp_size is > 1, the embed tokens should be distinct
     if pp_size > 1 or use_distinct_embed_tokens:
-        assert proposer.model.model.embed_tokens != \
-            target_model.model.embed_tokens
+        assert proposer.model.model.embed_tokens is not target_model.model.embed_tokens
     else:
-        # When pp_size is 1 and the draft and target models have
-        # embed_tokens of the same shape, they should be shared.
-        assert proposer.model.model.embed_tokens == \
-            target_model.model.embed_tokens
+        assert proposer.model.model.embed_tokens is target_model.model.embed_tokens
 
 
 @pytest.mark.parametrize("method", ["eagle", "eagle3"])
-@pytest.mark.parametrize("attn_backend",
-                         get_attn_backend_list_based_on_platform())
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
 @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
 def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
-
     monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
 
-    if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
-        pytest.skip("TRITON_ATTN does not support "
-                    "multi-token eagle spec decode on current platform")
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
 
-    if (attn_backend == "TREE_ATTN"):
-        pytest.skip("TREE_ATTN is tested separately in test_propose_tree"
-                    "because it requires special input mocking.")
+    if attn_backend == "TREE_ATTN":
+        pytest.skip(
+            "TREE_ATTN is tested separately in test_propose_tree"
+            "because it requires special input mocking."
+        )
 
     if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -498,31 +529,28 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
         device=device,
     )
 
-    target_token_ids = torch.randint(0,
-                                     vocab_size, (total_tokens, ),
-                                     device=device)
-    target_positions = torch.cat([
-        torch.arange(seq_len_1, device=device),
-        torch.arange(seq_len_2, device=device)
-    ])
-    target_hidden_states = torch.randn(total_tokens,
-                                       hidden_size,
-                                       device=device)
-    next_token_ids = torch.randint(0,
-                                   vocab_size, (batch_size, ),
-                                   dtype=torch.int32,
-                                   device=device)
+    target_token_ids = torch.randint(0, vocab_size, (total_tokens,), device=device)
+    target_positions = torch.cat(
+        [torch.arange(seq_len_1, device=device), torch.arange(seq_len_2, device=device)]
+    )
+    target_hidden_states = torch.randn(total_tokens, hidden_size, device=device)
+    next_token_ids = torch.randint(
+        0, vocab_size, (batch_size,), dtype=torch.int32, device=device
+    )
     sampling_metadata = mock.MagicMock()
 
     if attn_backend == "FLASH_ATTN":
-        attn_metadata_builder_cls, _ = get_attention_backend(
-            _Backend.FLASH_ATTN)
+        attn_metadata_builder_cls, _ = try_get_attention_backend(
+            AttentionBackendEnum.FLASH_ATTN
+        )
     elif attn_backend == "TRITON_ATTN":
-        attn_metadata_builder_cls, _ = get_attention_backend(
-            _Backend.TRITON_ATTN)
+        attn_metadata_builder_cls, _ = try_get_attention_backend(
+            AttentionBackendEnum.TRITON_ATTN
+        )
     elif attn_backend == "TREE_ATTN":
-        attn_metadata_builder_cls, _ = get_attention_backend(
-            _Backend.TREE_ATTN)
+        attn_metadata_builder_cls, _ = try_get_attention_backend(
+            AttentionBackendEnum.TREE_ATTN
+        )
     else:
         raise ValueError(f"Unsupported attention backend: {attn_backend}")
 
@@ -536,18 +564,22 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
     # Mock runner for attention metadata building
     proposer.runner = mock.MagicMock()
     proposer.runner.attn_groups.append([mock.MagicMock()])
-    proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
-        attn_metadata_builder
+    proposer.runner.attn_groups[0][
+        0
+    ].get_metadata_builder.return_value = attn_metadata_builder
     proposer._get_attention_metadata_builder = mock.MagicMock(
-        return_value=attn_metadata_builder)
+        return_value=attn_metadata_builder
+    )
 
-    result = proposer.propose(target_token_ids=target_token_ids,
-                              target_positions=target_positions,
-                              target_hidden_states=target_hidden_states,
-                              next_token_ids=next_token_ids,
-                              last_token_indices=None,
-                              common_attn_metadata=common_attn_metadata,
-                              sampling_metadata=sampling_metadata)
+    result = proposer.propose(
+        target_token_ids=target_token_ids,
+        target_positions=target_positions,
+        target_hidden_states=target_hidden_states,
+        next_token_ids=next_token_ids,
+        last_token_indices=None,
+        common_attn_metadata=common_attn_metadata,
+        sampling_metadata=sampling_metadata,
+    )
 
     assert result.shape == (batch_size, num_speculative_tokens)
 
@@ -556,13 +588,14 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
         # Example for num_speculative_tokens=1:
         # [[42], [60]]
         expected_tokens = torch.tensor(
-            [[base_token_ids[0]], [base_token_ids[1]]], device=device)
+            [[base_token_ids[0]], [base_token_ids[1]]], device=device
+        )
     else:
         # Example for num_speculative_tokens=3:
         # [[42, 43, 44], [60, 61, 62]]
-        expected_tokens = torch.zeros((batch_size, num_speculative_tokens),
-                                      dtype=torch.int64,
-                                      device=device)
+        expected_tokens = torch.zeros(
+            (batch_size, num_speculative_tokens), dtype=torch.int64, device=device
+        )
         for i in range(batch_size):
             for j in range(num_speculative_tokens):
                 expected_tokens[i, j] = base_token_ids[i] + j
@@ -574,12 +607,12 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
 @pytest.mark.parametrize(
     "spec_token_tree",
     [
-        [(0, )],  # A single token
-        [(0, ), (0, 0), (0, 0, 0)],  # Chain
-        [(0, ), (1, ), (2, )],  # Parallel
-        [(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0),
-         (2, 1)],  # Tree
-    ])
+        [(0,)],  # A single token
+        [(0,), (0, 0), (0, 0, 0)],  # Chain
+        [(0,), (1,), (2,)],  # Parallel
+        [(0,), (1,), (2,), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)],  # Tree
+    ],
+)
 def test_propose_tree(spec_token_tree):
     # Get GPU device.
     device = torch.device(current_platform.device_type)
@@ -594,9 +627,9 @@ def test_propose_tree(spec_token_tree):
     num_speculative_tokens = len(spec_token_tree)
 
     # Create proposer first so we can use its actual hidden_size.
-    proposer = _create_proposer("eagle",
-                                num_speculative_tokens,
-                                speculative_token_tree=spec_token_tree)
+    proposer = _create_proposer(
+        "eagle", num_speculative_tokens, speculative_token_tree=spec_token_tree
+    )
     # Get the hidden_size from the proposer to ensure consistency.
     hidden_size = proposer.hidden_size
 
@@ -617,32 +650,31 @@ def test_propose_tree(spec_token_tree):
     model_mock = mock.MagicMock()
 
     # Mock the model forward calls.
-    forward_returns = [(torch.zeros(total_tokens, hidden_size, device=device),
-                        torch.zeros(total_tokens, hidden_size, device=device))]
+    forward_returns = [
+        (
+            torch.zeros(total_tokens, hidden_size, device=device),
+            torch.zeros(total_tokens, hidden_size, device=device),
+        )
+    ]
     for cu_num_drafts in proposer.cu_drafts_per_level:
-        h_logits = torch.zeros(batch_size * cu_num_drafts,
-                               hidden_size,
-                               device=device)
-        h_states = torch.zeros(batch_size * cu_num_drafts,
-                               hidden_size,
-                               device=device)
+        h_logits = torch.zeros(batch_size * cu_num_drafts, hidden_size, device=device)
+        h_states = torch.zeros(batch_size * cu_num_drafts, hidden_size, device=device)
         forward_returns.append((h_logits, h_states))
     model_mock.side_effect = forward_returns
 
     # Mock the compute_logits calls.
-    cu_num_drafts_tensor = torch.tensor([0] + proposer.cu_drafts_per_level,
-                                        dtype=torch.int32,
-                                        device=device)
+    cu_num_drafts_tensor = torch.tensor(
+        [0] + proposer.cu_drafts_per_level, dtype=torch.int32, device=device
+    )
     logits_returns = []
     for level, num_children in enumerate(proposer.child_drafts_per_level):
         token_ids = base_token_ids + cu_num_drafts_tensor[level]
-        level_num_drafts = cu_num_drafts_tensor[
-            level + 1] - cu_num_drafts_tensor[level]
+        level_num_drafts = cu_num_drafts_tensor[level + 1] - cu_num_drafts_tensor[level]
         level_logits = []
         for i in range(level_num_drafts // num_children):
             level_logits.append(
-                create_deterministic_logits(token_ids + i * num_children,
-                                            num_children))
+                create_deterministic_logits(token_ids + i * num_children, num_children)
+            )
         logits_returns.append(torch.stack(level_logits, dim=1))
     model_mock.compute_logits.side_effect = logits_returns
 
@@ -653,7 +685,9 @@ def test_propose_tree(spec_token_tree):
     proposer.attn_layer_names = ["layer.0"]
 
     # Get the tree attention metadata builder.
-    attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TREE_ATTN)
+    attn_metadata_builder_cls, _ = try_get_attention_backend(
+        AttentionBackendEnum.TREE_ATTN
+    )
     attn_metadata_builder = attn_metadata_builder_cls(
         kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
         layer_names=proposer.attn_layer_names,
@@ -664,29 +698,23 @@ def test_propose_tree(spec_token_tree):
     # Mock runner for attention metadata building.
     proposer.runner = mock.MagicMock()
     proposer.runner.attn_groups.append([mock.MagicMock()])
-    proposer.runner.attn_groups[0][0].metadata_builders = [
-        attn_metadata_builder
-    ]
-    proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
-        attn_metadata_builder
+    proposer.runner.attn_groups[0][0].metadata_builders = [attn_metadata_builder]
+    proposer.runner.attn_groups[0][
+        0
+    ].get_metadata_builder.return_value = attn_metadata_builder
     proposer._get_attention_metadata_builder = mock.MagicMock(
-        return_value=attn_metadata_builder)
+        return_value=attn_metadata_builder
+    )
 
     # Setup inputs for the proposer.
-    target_token_ids = torch.randint(0,
-                                     vocab_size, (total_tokens, ),
-                                     device=device)
-    target_positions = torch.cat([
-        torch.arange(seq_len_1, device=device),
-        torch.arange(seq_len_2, device=device)
-    ])
-    target_hidden_states = torch.randn(total_tokens,
-                                       hidden_size,
-                                       device=device)
-    next_token_ids = torch.randint(0,
-                                   vocab_size, (batch_size, ),
-                                   dtype=torch.int32,
-                                   device=device)
+    target_token_ids = torch.randint(0, vocab_size, (total_tokens,), device=device)
+    target_positions = torch.cat(
+        [torch.arange(seq_len_1, device=device), torch.arange(seq_len_2, device=device)]
+    )
+    target_hidden_states = torch.randn(total_tokens, hidden_size, device=device)
+    next_token_ids = torch.randint(
+        0, vocab_size, (batch_size,), dtype=torch.int32, device=device
+    )
     batch_spec = BatchSpec(
         seq_lens=seq_lens,
         query_lens=seq_lens,
@@ -699,19 +727,22 @@ def test_propose_tree(spec_token_tree):
     sampling_metadata = mock.MagicMock()
 
     # Propose draft tokens.
-    result = proposer.propose(target_token_ids=target_token_ids,
-                              target_positions=target_positions,
-                              target_hidden_states=target_hidden_states,
-                              next_token_ids=next_token_ids,
-                              last_token_indices=None,
-                              common_attn_metadata=common_attn_metadata,
-                              sampling_metadata=sampling_metadata)
+    result = proposer.propose(
+        target_token_ids=target_token_ids,
+        target_positions=target_positions,
+        target_hidden_states=target_hidden_states,
+        next_token_ids=next_token_ids,
+        last_token_indices=None,
+        common_attn_metadata=common_attn_metadata,
+        sampling_metadata=sampling_metadata,
+    )
     assert result.shape == (batch_size, num_speculative_tokens)
 
     # The tokens are expected to be consecutive integers starting
     # from the base token IDs.
     expected_tokens = base_token_ids[:, None] + torch.arange(
-        num_speculative_tokens, dtype=torch.int64, device=device)
+        num_speculative_tokens, dtype=torch.int64, device=device
+    )
 
     # Verify that the draft tokens match our expectations.
     assert torch.equal(result, expected_tokens)
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index f93593f2d482141baa4029569fa0fdf5d6727f15..fa1d0437f7c71c440641d1afaa08367f8395e24e 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -7,6 +7,7 @@ import pytest
 from tests.utils import get_attn_backend_list_based_on_platform
 from vllm import LLM, SamplingParams
 from vllm.platforms import current_platform
+from vllm.sampling_params import StructuredOutputsParams
 
 _PROMPTS = [
     "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
@@ -33,17 +34,18 @@ def test_ngram_max_len(num_speculative_tokens: int):
 
 
 @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 10])
-@pytest.mark.parametrize("attn_backend",
-                         get_attn_backend_list_based_on_platform())
-def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch,
-                       num_speculative_tokens: int, attn_backend: str):
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
+def test_eagle_max_len(
+    monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
+):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
 
-        if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
-            pytest.skip("TRITON_ATTN does not support "
-                        "multi-token eagle spec decode on current platform")
+        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+            pytest.skip(
+                "TRITON_ATTN does not support "
+                "multi-token eagle spec decode on current platform"
+            )
 
         if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
             m.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -55,8 +57,34 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch,
                 "method": "eagle",
                 "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
                 "num_speculative_tokens": num_speculative_tokens,
+                "max_model_len": 80,
             },
-            max_model_len=100,
+            max_model_len=200,
         )
-        sampling_params = SamplingParams(max_tokens=100, ignore_eos=True)
-        llm.generate(_PROMPTS, sampling_params)
+        sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
+        outputs = llm.generate(_PROMPTS, sampling_params)
+        for o in outputs:
+            assert o.outputs[0].finish_reason == "length", (
+                "This test is only meaningful if the output "
+                "is truncated due to max length"
+            )
+
+        sampling_params = SamplingParams(
+            max_tokens=200,
+            structured_outputs=StructuredOutputsParams(
+                regex="^" + "a b c d e " * 15 + "$"
+            ),
+        )
+        output = llm.generate(_PROMPTS, sampling_params)
+        for o in output:
+            assert o.prompt_token_ids is not None
+            assert (
+                len(o.prompt_token_ids)
+                < 80
+                < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
+                < 200
+            ), (
+                "This test is only meaningful if the output "
+                "is longer than the eagle max length"
+            )
+            assert o.outputs[0].text == "a b c d e " * 15
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index 5b9ccfc3f48b1ff3d9591345621a3447ed7f2143..c5c0491abaf7c89d5f265f9bc9835ed5bd7dec88 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -6,13 +6,22 @@ from unittest import mock
 import pytest
 import torch
 
-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      get_attention_backend)
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VllmConfig)
+from tests.v1.attention.utils import (
+    BatchSpec,
+    create_common_attn_metadata,
+    create_standard_kv_cache_spec,
+    try_get_attention_backend,
+)
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
 from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
@@ -23,10 +32,9 @@ mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base"
 
 def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
     """Create an MTP proposer with unified model configuration."""
-    model_config = ModelConfig(model=mimo_7b_dir,
-                               runner="generate",
-                               max_model_len=100,
-                               trust_remote_code=True)
+    model_config = ModelConfig(
+        model=mimo_7b_dir, runner="generate", max_model_len=100, trust_remote_code=True
+    )
 
     speculative_config = SpeculativeConfig(
         target_model_config=model_config,
@@ -43,23 +51,26 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig())
+        scheduler_config=SchedulerConfig(),
+    )
 
-    return EagleProposer(vllm_config=vllm_config,
-                         device=current_platform.device_type)
+    return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
 
 
-@mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
-@mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
-@mock.patch('vllm.v1.spec_decode.eagle.get_model')
-def test_mtp_load_model_unified(mock_get_model, mock_get_layers,
-                                mock_get_pp_group):
+@mock.patch("vllm.v1.spec_decode.eagle.get_pp_group")
+@mock.patch("vllm.v1.spec_decode.eagle.get_layers_from_vllm_config")
+@mock.patch("vllm.v1.spec_decode.eagle.get_model")
+def test_mtp_load_model_unified(mock_get_model, mock_get_layers, mock_get_pp_group):
     """Test MTP-specific model loading with unified model approach."""
 
     # Setup mocks
     mock_model = mock.MagicMock()
     mock_model.model.embed_tokens.weight.shape = (131072, 4096)
     mock_get_model.return_value = mock_model
+    # MTP does not have its own embed_tokens or lm_head
+    # so it should share them with the target model
+    mock_model.has_own_embed_tokens = False
+    mock_model.has_own_lm_head = False
 
     target_attn_layers = {"target_attn_1": mock.MagicMock()}
     all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()}
@@ -67,8 +78,10 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers,
     all_indexer_layers: dict = {}
 
     mock_get_layers.side_effect = [
-        target_attn_layers, target_indexer_layers, all_attn_layers,
-        all_indexer_layers
+        target_attn_layers,
+        target_indexer_layers,
+        all_attn_layers,
+        all_indexer_layers,
     ]
 
     mock_pp_group = mock.MagicMock()
@@ -116,17 +129,13 @@ def test_mtp_propose(num_speculative_tokens, monkeypatch):
 
     # MTP returns hidden states directly
     if num_speculative_tokens == 1:
-        model_mock.return_value = torch.zeros(total_tokens,
-                                              hidden_size,
-                                              device=device)
+        model_mock.return_value = torch.zeros(total_tokens, hidden_size, device=device)
     else:
         # Multiple forward passes for multi-token speculation
         forward_returns = []
         for i in range(num_speculative_tokens):
             if i == 0:
-                h_states = torch.zeros(total_tokens,
-                                       hidden_size,
-                                       device=device)
+                h_states = torch.zeros(total_tokens, hidden_size, device=device)
             else:
                 h_states = torch.zeros(batch_size, hidden_size, device=device)
             forward_returns.append(h_states)
@@ -140,7 +149,8 @@ def test_mtp_propose(num_speculative_tokens, monkeypatch):
 
     if num_speculative_tokens == 1:
         model_mock.compute_logits.return_value = create_deterministic_logits(
-            batch_size, vocab_size, 42)
+            batch_size, vocab_size, 42
+        )
     else:
         logits_returns = [
             create_deterministic_logits(batch_size, vocab_size, 42 + i)
@@ -153,28 +163,27 @@ def test_mtp_propose(num_speculative_tokens, monkeypatch):
 
     # Prepare inputs
     batch_spec = BatchSpec(seq_lens=seq_lens, query_lens=seq_lens)
-    common_attn_metadata = create_common_attn_metadata(batch_spec,
-                                                       block_size=16,
-                                                       device=device)
-
-    target_token_ids = torch.randint(0,
-                                     vocab_size, (total_tokens, ),
-                                     device=device)
-    target_positions = torch.cat([
-        torch.arange(seq_lens[0], device=device),
-        torch.arange(seq_lens[1], device=device)
-    ])
-    target_hidden_states = torch.randn(total_tokens,
-                                       hidden_size,
-                                       device=device)
-    next_token_ids = torch.randint(0,
-                                   vocab_size, (batch_size, ),
-                                   dtype=torch.int32,
-                                   device=device)
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec, block_size=16, device=device
+    )
+
+    target_token_ids = torch.randint(0, vocab_size, (total_tokens,), device=device)
+    target_positions = torch.cat(
+        [
+            torch.arange(seq_lens[0], device=device),
+            torch.arange(seq_lens[1], device=device),
+        ]
+    )
+    target_hidden_states = torch.randn(total_tokens, hidden_size, device=device)
+    next_token_ids = torch.randint(
+        0, vocab_size, (batch_size,), dtype=torch.int32, device=device
+    )
     sampling_metadata = mock.MagicMock()
 
     # Setup attention metadata
-    attn_metadata_builder_cls, _ = get_attention_backend(_Backend.FLASH_ATTN)
+    attn_metadata_builder_cls, _ = try_get_attention_backend(
+        AttentionBackendEnum.FLASH_ATTN
+    )
 
     attn_metadata_builder = attn_metadata_builder_cls(
         kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
@@ -187,13 +196,15 @@ def test_mtp_propose(num_speculative_tokens, monkeypatch):
     proposer.attn_metadata_builder = attn_metadata_builder
 
     # Run propose
-    result = proposer.propose(target_token_ids=target_token_ids,
-                              target_positions=target_positions,
-                              target_hidden_states=target_hidden_states,
-                              next_token_ids=next_token_ids,
-                              last_token_indices=None,
-                              common_attn_metadata=common_attn_metadata,
-                              sampling_metadata=sampling_metadata)
+    result = proposer.propose(
+        target_token_ids=target_token_ids,
+        target_positions=target_positions,
+        target_hidden_states=target_hidden_states,
+        next_token_ids=next_token_ids,
+        last_token_indices=None,
+        common_attn_metadata=common_attn_metadata,
+        sampling_metadata=sampling_metadata,
+    )
 
     # Verify the model was called correctly
     assert model_mock.called
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 344d19c60db7d74ea96283fd7552c33ca3632710..563bc1d957f412cbaf9be4c8f76cdf47deb1dd8d 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -4,82 +4,80 @@ import numpy as np
 
 from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
 from vllm.v1.spec_decode.ngram_proposer import (
-    NgramProposer, _find_longest_matched_ngram_and_propose_tokens)
+    NgramProposer,
+    _find_longest_matched_ngram_and_propose_tokens,
+)
 
 
 def test_find_longest_matched_ngram_and_propose_tokens():
     tokens = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
     result = _find_longest_matched_ngram_and_propose_tokens(
-        origin_tokens=tokens,
-        min_ngram=2,
-        max_ngram=2,
-        max_model_len=1024,
-        k=2)
+        origin_tokens=tokens, min_ngram=2, max_ngram=2, max_model_len=1024, k=2
+    )
     assert len(result) == 0
 
     tokens = np.array([1, 2, 3, 4, 1, 2, 3])
     np.testing.assert_array_equal(
-        _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
-                                                       min_ngram=2,
-                                                       max_ngram=2,
-                                                       max_model_len=1024,
-                                                       k=3),
-        np.array([4, 1, 2]))
+        _find_longest_matched_ngram_and_propose_tokens(
+            origin_tokens=tokens, min_ngram=2, max_ngram=2, max_model_len=1024, k=3
+        ),
+        np.array([4, 1, 2]),
+    )
     np.testing.assert_array_equal(
-        _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
-                                                       min_ngram=2,
-                                                       max_ngram=2,
-                                                       max_model_len=1024,
-                                                       k=2), np.array([4, 1]))
+        _find_longest_matched_ngram_and_propose_tokens(
+            origin_tokens=tokens, min_ngram=2, max_ngram=2, max_model_len=1024, k=2
+        ),
+        np.array([4, 1]),
+    )
     np.testing.assert_array_equal(
-        _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
-                                                       min_ngram=1,
-                                                       max_ngram=1,
-                                                       max_model_len=1024,
-                                                       k=3),
-        np.array([4, 1, 2]))
+        _find_longest_matched_ngram_and_propose_tokens(
+            origin_tokens=tokens, min_ngram=1, max_ngram=1, max_model_len=1024, k=3
+        ),
+        np.array([4, 1, 2]),
+    )
     np.testing.assert_array_equal(
-        _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
-                                                       min_ngram=1,
-                                                       max_ngram=1,
-                                                       max_model_len=1024,
-                                                       k=2), np.array([4, 1]))
+        _find_longest_matched_ngram_and_propose_tokens(
+            origin_tokens=tokens, min_ngram=1, max_ngram=1, max_model_len=1024, k=2
+        ),
+        np.array([4, 1]),
+    )
 
     tokens = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
     np.testing.assert_array_equal(
-        _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
-                                                       min_ngram=2,
-                                                       max_ngram=2,
-                                                       max_model_len=1024,
-                                                       k=3),
-        np.array([4, 1, 2]))
+        _find_longest_matched_ngram_and_propose_tokens(
+            origin_tokens=tokens, min_ngram=2, max_ngram=2, max_model_len=1024, k=3
+        ),
+        np.array([4, 1, 2]),
+    )
     # Return on the first match
     np.testing.assert_array_equal(
-        _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
-                                                       min_ngram=1,
-                                                       max_ngram=1,
-                                                       max_model_len=1024,
-                                                       k=2), np.array([6, 2]))
+        _find_longest_matched_ngram_and_propose_tokens(
+            origin_tokens=tokens, min_ngram=1, max_ngram=1, max_model_len=1024, k=2
+        ),
+        np.array([6, 2]),
+    )
 
 
 def test_ngram_proposer():
-
     def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
         # Dummy model config. Just to set max_model_len.
         model_config = ModelConfig(model="facebook/opt-125m")
         return NgramProposer(
-            vllm_config=VllmConfig(model_config=model_config,
-                                   speculative_config=SpeculativeConfig(
-                                       prompt_lookup_min=min_n,
-                                       prompt_lookup_max=max_n,
-                                       num_speculative_tokens=k,
-                                       method="ngram",
-                                   )))
+            vllm_config=VllmConfig(
+                model_config=model_config,
+                speculative_config=SpeculativeConfig(
+                    prompt_lookup_min=min_n,
+                    prompt_lookup_max=max_n,
+                    num_speculative_tokens=k,
+                    method="ngram",
+                ),
+            )
+        )
 
     # No match.
     token_ids_cpu = np.array([[1, 2, 3, 4, 5]])
     result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -90,7 +88,7 @@ def test_ngram_proposer():
     # No match for 4-gram.
     token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]])
     result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -101,7 +99,7 @@ def test_ngram_proposer():
     # No match for 4-gram but match for 3-gram.
     token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]])
     result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -113,7 +111,7 @@ def test_ngram_proposer():
     # In this case, the proposer should return the 4-gram match.
     token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]])
     result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -124,7 +122,7 @@ def test_ngram_proposer():
     # Match for 2-gram and 3-gram, but not 4-gram.
     token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]])
     result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -133,10 +131,9 @@ def test_ngram_proposer():
     assert np.array_equal(result, np.array([[1, 2]]))  # Not [5, 2]]
 
     # Multiple 3-gram matched, but always pick the first one.
-    token_ids_cpu = np.array(
-        [[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]])
+    token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]])
     result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -147,7 +144,7 @@ def test_ngram_proposer():
     # check empty input
     token_ids_cpu = np.array([[]])
     result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
-        sampled_token_ids=[[0]],
+        sampled_token_ids=[np.array([0])],
         req_ids=["0"],
         num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
         token_ids_cpu=token_ids_cpu,
@@ -160,7 +157,7 @@ def test_ngram_proposer():
     # second request has 3 tokens and no match. Padded with -1 for max len 5
     token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]])
     result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
-        sampled_token_ids=[[0], [1]],
+        sampled_token_ids=[np.array([0]), np.array([1])],
         req_ids=["0", "1"],
         num_tokens_no_spec=np.array([5, 3]),
         token_ids_cpu=token_ids_cpu,
@@ -184,13 +181,12 @@ def test_ngram_proposer():
     input_2[:3] = [4, 5, 6]
     token_ids_cpu = np.array([input_1, input_2])
     result = ngram_proposer.propose(
-        sampled_token_ids=[[0], [1]],
+        sampled_token_ids=[np.array([0]), np.array([1])],
         req_ids=["0", "1"],
         num_tokens_no_spec=np.array([len(input_1), 3]),
         token_ids_cpu=token_ids_cpu,
         spec_decode_unsupported_reqs=(),
     )
     assert len(result[0]) == 2
-    assert np.array_equal(result[0],
-                          np.array([middle_integer + 2, middle_integer + 3]))
+    assert np.array_equal(result[0], np.array([middle_integer + 2, middle_integer + 3]))
     assert np.array_equal(result[1], np.array([]))
diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/v1/spec_decode/test_speculators_eagle3.py
similarity index 58%
rename from tests/speculative_decoding/speculators/test_eagle3.py
rename to tests/v1/spec_decode/test_speculators_eagle3.py
index 368238b3a7200c7438ba7800e30f0dd03af143f4..5ce6e1593b5c148a8f752dd3f87c68fe312ebc37 100644
--- a/tests/speculative_decoding/speculators/test_eagle3.py
+++ b/tests/v1/spec_decode/test_speculators_eagle3.py
@@ -7,16 +7,26 @@ from vllm.config import SpeculativeConfig
 from vllm.model_executor.models.interfaces import supports_eagle3
 
 
-@pytest.mark.parametrize("model_path", [
-    pytest.param(
-        "nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized",
-        id="llama3-eagle3-speculator"),
-    pytest.param(
-        "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized",
-        id="qwen3-eagle3-speculator"),
-])
-def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path,
-                                  monkeypatch):
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        pytest.param(
+            "nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized",
+            id="llama3-eagle3-speculator",
+        ),
+        pytest.param(
+            "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized",
+            id="qwen3-eagle3-speculator",
+        ),
+        pytest.param(
+            "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16",
+            id="qwen3-eagle3-speculator-w4a16-verifier",
+        ),
+    ],
+)
+def test_eagle3_speculators_model(
+    vllm_runner, example_prompts, model_path, monkeypatch
+):
     """
     Test Eagle3 speculators models properly initialize speculative decoding.
 
@@ -37,18 +47,19 @@ def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path,
 
         vllm_config = vllm_model.llm.llm_engine.vllm_config
 
-        assert isinstance(vllm_config.speculative_config, SpeculativeConfig), \
+        assert isinstance(vllm_config.speculative_config, SpeculativeConfig), (
             "Speculative config should be initialized for speculators model"
+        )
 
         spec_config = vllm_config.speculative_config
-        assert spec_config.num_speculative_tokens > 0, \
-            (f"Expected positive speculative tokens, "
-             f"got {spec_config.num_speculative_tokens}")
+        assert spec_config.num_speculative_tokens > 0, (
+            f"Expected positive speculative tokens, "
+            f"got {spec_config.num_speculative_tokens}"
+        )
 
-        assert spec_config.model == model_path, \
+        assert spec_config.model == model_path, (
             f"Draft model should be {model_path}, got {spec_config.model}"
+        )
 
-        vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                  max_tokens=20)
-        assert vllm_outputs, \
-            f"No outputs generated for speculators model {model_path}"
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20)
+        assert vllm_outputs, f"No outputs generated for speculators model {model_path}"
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index 51a737496dff3e4b9dc31b7513416e60d7ec3f2a..6958d62dc7e90fddbb0085b9205a0488171e64d8 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -2,13 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 
-from tests.v1.attention.utils import (_Backend, create_standard_kv_cache_spec,
-                                      create_vllm_config,
-                                      get_attention_backend)
+from tests.v1.attention.utils import (
+    create_standard_kv_cache_spec,
+    create_vllm_config,
+    try_get_attention_backend,
+)
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import ParallelConfig, SpeculativeConfig
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 
@@ -33,18 +35,19 @@ def forward_attention(
     block_table: torch.Tensor,
     slot_mapping: torch.Tensor,
     seqlen_k: int,
-    backend: _Backend,
-    spec_token_tree: Optional[str] = None,
+    backend: AttentionBackendEnum,
+    spec_token_tree: str | None = None,
     num_spec_tokens: int = 0,
 ) -> torch.Tensor:
     batch_size, q_len, num_heads, dim_per_head = q.shape
     num_kv_heads = k.shape[-2]
     # Initialize the query and KV sequence lengths.
     query_start_loc = q_len * torch.arange(
-        batch_size + 1, device=q.device, dtype=torch.int32)
+        batch_size + 1, device=q.device, dtype=torch.int32
+    )
     query_lens = torch.diff(query_start_loc)
     seq_lens = torch.full(
-        (batch_size, ),
+        (batch_size,),
         seqlen_k,
         device=q.device,
         dtype=torch.int32,
@@ -54,14 +57,13 @@ def forward_attention(
     max_query_len = q_len
     num_actual_tokens = query_start_loc[-1]
 
-    softmax_scale = q.shape[-1]**(-0.5)
+    softmax_scale = q.shape[-1] ** (-0.5)
     layer = MockAttentionLayer()
 
     # Build common metadata.
     model_name = "meta-llama/Meta-Llama-3-8B"
-    builder_cls, impl_cls = get_attention_backend(backend)
-    vllm_config = create_vllm_config(model_name=model_name,
-                                     max_model_len=max(seq_lens))
+    builder_cls, impl_cls = try_get_attention_backend(backend)
+    vllm_config = create_vllm_config(model_name=model_name, max_model_len=max(seq_lens))
     if spec_token_tree is not None:
         # Create speculative config if token tree is specified.
         vllm_config.speculative_config = SpeculativeConfig(
@@ -70,7 +72,8 @@ def forward_attention(
             model=model_name,
             method="eagle",
             num_speculative_tokens=num_spec_tokens,
-            speculative_token_tree=spec_token_tree)
+            speculative_token_tree=spec_token_tree,
+        )
     kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
     builder = builder_cls(kv_cache_spec, [], vllm_config, q.device)
     common_attn_metadata = CommonAttentionMetadata(
@@ -127,8 +130,7 @@ def test_tree_attn_correctness() -> None:
     device = "cuda"
     tree_attn_masks = {
         # Chain.
-        "[(0,), (0, 0), (0, 0, 0)]":
-        torch.tensor(
+        "[(0,), (0, 0), (0, 0, 0)]": torch.tensor(
             [
                 [1, 0, 0, 0],
                 [1, 1, 0, 0],
@@ -139,8 +141,7 @@ def test_tree_attn_correctness() -> None:
             dtype=torch.int32,
         ),
         # Tree.
-        "[(0,), (1,), (0, 0), (0, 1), (1, 0), (1, 1)]":
-        torch.tensor(
+        "[(0,), (1,), (0, 0), (0, 1), (1, 0), (1, 1)]": torch.tensor(
             [
                 [1, 0, 0, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0, 0],
@@ -201,8 +202,7 @@ def test_tree_attn_correctness() -> None:
                         device=q.device,
                         dtype=torch.bfloat16,
                     )
-                    num_alloc_blocks_per_batch = math.ceil(seqlen_k /
-                                                           block_size)
+                    num_alloc_blocks_per_batch = math.ceil(seqlen_k / block_size)
                     block_table = torch.zeros(
                         (batch_size, max_blocks_per_batch),
                         device=q.device,
@@ -216,11 +216,10 @@ def test_tree_attn_correctness() -> None:
                     )
                     if randomize_blocks:
                         # Randomize the block ids.
-                        block_ids = block_ids[torch.randperm(
-                            block_ids.numel())]
-                    block_table[:, :
-                                num_alloc_blocks_per_batch] = block_ids.view(
-                                    -1, num_alloc_blocks_per_batch)
+                        block_ids = block_ids[torch.randperm(block_ids.numel())]
+                    block_table[:, :num_alloc_blocks_per_batch] = block_ids.view(
+                        -1, num_alloc_blocks_per_batch
+                    )
 
                     # Set up the slot mapping for the input KVs.
                     tree_positions = sequence_position + torch.arange(
@@ -230,7 +229,8 @@ def test_tree_attn_correctness() -> None:
                         dtype=torch.int64,
                     ).repeat(batch_size, 1)
                     tree_slot_mapping = _gen_slot_mapping(
-                        tree_positions, block_table, block_size)
+                        tree_positions, block_table, block_size
+                    )
 
                     # Compute attention for the tree.
                     tree_attn_output = forward_attention(
@@ -241,7 +241,7 @@ def test_tree_attn_correctness() -> None:
                         block_table=block_table,
                         slot_mapping=tree_slot_mapping,
                         seqlen_k=seqlen_k,
-                        backend=_Backend.TREE_ATTN,
+                        backend=AttentionBackendEnum.TREE_ATTN,
                         spec_token_tree=spec_token_tree,
                         num_spec_tokens=tree_size_q - 1,
                     ).view(batch_size, -1, num_heads, dim_per_head)
@@ -252,8 +252,7 @@ def test_tree_attn_correctness() -> None:
                     for q_index in range(tree_size_q):
                         # Get the q, k, and v for the branch.
                         branch_mask = tree_attn_mask[q_index, :]
-                        branch_indices = torch.nonzero(branch_mask,
-                                                       as_tuple=True)[0]
+                        branch_indices = torch.nonzero(branch_mask, as_tuple=True)[0]
                         q_len = branch_indices.shape[0]
                         q_branch = q[:, branch_indices]
                         k_branch = k[:, branch_indices]
@@ -267,7 +266,8 @@ def test_tree_attn_correctness() -> None:
                             dtype=torch.int64,
                         ).repeat(batch_size, 1)
                         branch_slot_mapping = _gen_slot_mapping(
-                            branch_positions, block_table, block_size)
+                            branch_positions, block_table, block_size
+                        )
 
                         # Compute flash attention for the branch.
                         flash_attn_output = forward_attention(
@@ -278,7 +278,7 @@ def test_tree_attn_correctness() -> None:
                             block_table=block_table,
                             slot_mapping=branch_slot_mapping,
                             seqlen_k=sequence_position + q_len,
-                            backend=_Backend.FLASH_ATTN,
+                            backend=AttentionBackendEnum.FLASH_ATTN,
                         ).view(batch_size, -1, num_heads, dim_per_head)
 
                         # Compare the outputs.
@@ -286,16 +286,19 @@ def test_tree_attn_correctness() -> None:
                             tree_attn_output[:, branch_indices],
                             flash_attn_output,
                             atol=7.81e-3,
-                        ), (f"outputs are not close for "
+                        ), (
+                            f"outputs are not close for "
                             f"batch_size: {batch_size}, "
                             f"num_heads: {num_heads}, "
                             f"sequence_position: {sequence_position}, "
                             f"tree_attn_mask: {tree_attn_mask}, "
-                            f"q_index: {q_index}.")
+                            f"q_index: {q_index}."
+                        )
 
 
-def _gen_slot_mapping(positions: torch.Tensor, block_table: torch.Tensor,
-                      block_size: int):
+def _gen_slot_mapping(
+    positions: torch.Tensor, block_table: torch.Tensor, block_size: int
+):
     block_indices = positions // block_size
     blocks = block_table.gather(dim=1, index=block_indices)
     return (blocks * block_size + positions % block_size).view(-1)
diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py
new file mode 100644
index 0000000000000000000000000000000000000000..771076186a3b4e0dcad38ec22a0daa8e0ed80744
--- /dev/null
+++ b/tests/v1/structured_output/test_backend_guidance.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import AutoTokenizer
+
+from vllm.config import StructuredOutputsConfig, VllmConfig
+from vllm.config.model import ModelConfig
+from vllm.config.speculative import SpeculativeConfig
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.v1.request import Request
+from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.structured_output.backend_guidance import GuidanceBackend
+from vllm.v1.structured_output.backend_types import StructuredOutputOptions
+
+TOKENIZER = "gpt2"
+
+
+def test_backend_guidance_rollback_terminated():
+    # Test that the backend guidance successfully rollbacks from a
+    # terminated state. This can happen with speculative decoding,
+    # where the draft model proposes EOS and it is verified by the
+    # guidance backend. In that case we are in a stopped state, but
+    # it should be reverted in case EOS is not accepted by the target
+    # model.
+    vllm_config = VllmConfig(
+        decoding_config=StructuredOutputsConfig(
+            backend="guidance",
+        )
+    )
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
+
+    backend = GuidanceBackend(
+        vllm_config,
+        tokenizer=tokenizer,
+        vocab_size=50257,
+    )
+
+    grammar = backend.compile_grammar(
+        StructuredOutputOptions.JSON, '{"type": "object"}'
+    )
+
+    prompt = tokenizer.encode('{"a": "b"}')
+    assert len(prompt) > 1
+    dummy_wrong = tokenizer.encode('{"a"}')
+    for token in prompt:
+        assert grammar.accept_tokens("", [token])
+    assert not grammar.is_terminated()
+    assert grammar.accept_tokens("", [tokenizer.eos_token_id])
+    assert grammar.is_terminated()
+    # Giving any other token should also be accepted
+    assert grammar.accept_tokens("", dummy_wrong)
+    # Rollback is done from where state was terminated, so from '}' not EOS
+    grammar.rollback(len(prompt) - 1)
+    assert not grammar.is_terminated()
+    assert grammar.validate_tokens([tokenizer.eos_token_id]) == []
+    assert grammar.validate_tokens(dummy_wrong) != dummy_wrong
+    assert grammar.accept_tokens("", prompt[1:])
+    assert not grammar.is_terminated()
+    assert grammar.accept_tokens("", [tokenizer.eos_token_id])
+    assert grammar.is_terminated()
+    # Rollback of <= 0 should not change the terminated state
+    grammar.rollback(0)
+    assert grammar.is_terminated()
+    grammar.rollback(-1)
+    assert grammar.is_terminated()
+
+
+def test_grammar_bitmask_with_specdec():
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
+    prompt = tokenizer.encode('{"a": "b"}')
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(tokenizer=TOKENIZER),
+        structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
+        speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
+    )
+    structured_output_manager = StructuredOutputManager(vllm_config)
+
+    for i in range(1, 2):
+        sampling_params = SamplingParams(
+            structured_outputs=StructuredOutputsParams(
+                json='{"type": "object"}',
+            ),
+        )
+        sampling_params.structured_outputs._backend = "guidance"
+
+        my_req_id = f"my_req_id_{i}"
+        request = Request(
+            my_req_id,
+            prompt_token_ids=prompt[:i],
+            sampling_params=sampling_params,
+            pooling_params=None,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+
+        structured_output_manager.grammar_init(request)
+
+        def grammar_bitmask(req: Request, tokens: list[int]) -> None:
+            structured_output_manager.grammar_bitmask(
+                requests={req.request_id: req},
+                structured_output_request_ids={req.request_id: 0},
+                scheduled_spec_decode_tokens={req.request_id: tokens},
+            )
+            # At this point, we rolled-back, so should not be terminated
+            assert not req.structured_output_request.grammar.is_terminated()
+
+        # The grammar might not yet be compiled, so we wait for it
+        while not request.structured_output_request._check_grammar_completion():
+            continue
+
+        assert request.structured_output_request.grammar.accept_tokens(
+            request.request_id, prompt[:i]
+        )
+
+        grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
+        grammar_bitmask(
+            request, prompt[i:] + [tokenizer.eos_token_id] + prompt
+        )  # EOS not the final token
+        grammar_bitmask(request, prompt[i:])  # EOS not present
+        grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
diff --git a/tests/v1/structured_output/test_gptoss_structural_tags.py b/tests/v1/structured_output/test_gptoss_structural_tags.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0feabfb99ab72dc1557253d77c28056f2ca3836
--- /dev/null
+++ b/tests/v1/structured_output/test_gptoss_structural_tags.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Unit tests for GPT-OSS structural tag support in reasoning (PR #25515)."""
+
+import json
+from unittest.mock import Mock
+
+import pytest
+
+from vllm.entrypoints.tool_server import ToolServer
+from vllm.reasoning.gptoss_reasoning_parser import (
+    GptOssReasoningParser,
+    from_builtin_tool_to_tag,
+    no_func_reaonsing_tag,
+    tag_with_builtin_funcs,
+)
+
+
+class TestGptOssReasoningParser:
+    """Test cases for GptOssReasoningParser structural tag functionality."""
+
+    @pytest.fixture
+    def mock_tokenizer(self):
+        """Create a mock tokenizer for testing."""
+        tokenizer = Mock()
+        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
+        return tokenizer
+
+    @pytest.fixture
+    def reasoning_parser(self, mock_tokenizer):
+        """Create a GptOssReasoningParser instance."""
+        return GptOssReasoningParser(mock_tokenizer)
+
+    @pytest.fixture
+    def mock_tool_server_empty(self):
+        """Create a mock ToolServer with no tools."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(return_value=False)
+        return tool_server
+
+    @pytest.fixture
+    def mock_tool_server_with_browser(self):
+        """Create a mock ToolServer with browser tool."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "browser")
+        return tool_server
+
+    @pytest.fixture
+    def mock_tool_server_with_all_tools(self):
+        """Create a mock ToolServer with all builtin tools."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(
+            side_effect=lambda tool: tool in ["browser", "python", "container"]
+        )
+        return tool_server
+
+    def test_prepare_structured_tag_no_tool_server(self, reasoning_parser):
+        """Test prepare_structured_tag with no tool server."""
+        result = reasoning_parser.prepare_structured_tag(None, None)
+        expected = json.dumps(no_func_reaonsing_tag)
+
+        assert result == expected
+
+        # Verify the structure is correct
+        parsed = json.loads(result)
+        assert parsed["type"] == "structural_tag"
+        assert parsed["format"]["type"] == "triggered_tags"
+        assert len(parsed["format"]["tags"]) == 1
+        assert parsed["format"]["tags"][0]["begin"] == "<|channel|>analysis<|message|>"
+        assert parsed["format"]["triggers"] == ["<|channel|>analysis"]
+
+    def test_prepare_structured_tag_with_all_tools(
+        self, reasoning_parser, mock_tool_server_with_all_tools
+    ):
+        """Test prepare_structured_tag with all builtin tools."""
+        result = reasoning_parser.prepare_structured_tag(
+            None, mock_tool_server_with_all_tools
+        )
+        parsed = json.loads(result)
+
+        # Should have analysis tag + tags for all 3 tools (2 tags each)
+        assert len(parsed["format"]["tags"]) == 7  # 1 analysis + 6 tool tags
+
+        # Check all tool tags are present
+        tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]]
+        for tool in ["browser", "python", "container"]:
+            assert f"<|channel|>commentary to={tool}" in tag_begins
+            assert f"<|channel|>analysis to={tool}" in tag_begins
+
+    def test_prepare_structured_tag_with_original_tag(self, reasoning_parser):
+        """Test prepare_structured_tag when original_tag is provided."""
+        original_tag = '{"custom": "tag"}'
+        result = reasoning_parser.prepare_structured_tag(original_tag, None)
+
+        # Should return the original tag unchanged
+        assert result == original_tag
+
+    def test_from_builtin_tool_to_tag(self):
+        """Test from_builtin_tool_to_tag function."""
+        tags = from_builtin_tool_to_tag("python")
+
+        assert len(tags) == 2
+        assert tags[0]["begin"] == "<|channel|>commentary to=python"
+        assert tags[0]["content"]["type"] == "any_text"
+        assert tags[0]["end"] == "<|end|>"
+
+        assert tags[1]["begin"] == "<|channel|>analysis to=python"
+        assert tags[1]["content"]["type"] == "any_text"
+        assert tags[1]["end"] == "<|end|>"
+
+    def test_tag_with_builtin_funcs(self):
+        """Test tag_with_builtin_funcs function."""
+        builtin_tools = ["browser", "python"]
+        result = tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tools)
+
+        assert result["type"] == "structural_tag"
+        # Should have original analysis tag + 2 tags per tool
+        assert len(result["format"]["tags"]) == 5  # 1 + 2*2
+
+        # Should have added commentary trigger
+        assert "<|channel|>commentary to=" in result["format"]["triggers"]
+        assert "<|channel|>analysis" in result["format"]["triggers"]
+
+    def test_tag_structure_invariants(self):
+        """Test that the basic tag structure follows expected format."""
+        # Test the base no_func_reaonsing_tag structure
+        assert no_func_reaonsing_tag["type"] == "structural_tag"
+        assert no_func_reaonsing_tag["format"]["type"] == "triggered_tags"
+        assert no_func_reaonsing_tag["format"]["stop_after_first"] is False
+
+        # Verify analysis tag structure
+        analysis_tag = no_func_reaonsing_tag["format"]["tags"][0]
+        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
+        assert analysis_tag["content"]["type"] == "any_text"
+        assert analysis_tag["end"] == "<|end|>"
+
+    def test_json_serialization_valid(
+        self, reasoning_parser, mock_tool_server_with_all_tools
+    ):
+        """Test that all generated tags produce valid JSON."""
+        # Test with no tool server
+        result1 = reasoning_parser.prepare_structured_tag(None, None)
+        json.loads(result1)  # Should not raise
+
+        # Test with empty tool server
+        empty_server = Mock(spec=ToolServer)
+        empty_server.has_tool = Mock(return_value=False)
+        result2 = reasoning_parser.prepare_structured_tag(None, empty_server)
+        json.loads(result2)  # Should not raise
+
+        # Test with tools
+        result3 = reasoning_parser.prepare_structured_tag(
+            None, mock_tool_server_with_all_tools
+        )
+        json.loads(result3)  # Should not raise
+
+    @pytest.mark.parametrize("tool_name", ["browser", "python", "container"])
+    def test_single_tool_integration(self, reasoning_parser, tool_name):
+        """Test integration with individual tools."""
+        tool_server = Mock(spec=ToolServer)
+        tool_server.has_tool = Mock(side_effect=lambda tool: tool == tool_name)
+
+        result = reasoning_parser.prepare_structured_tag(None, tool_server)
+        parsed = json.loads(result)
+
+        # Should have 1 analysis + 2 tool-specific tags
+        assert len(parsed["format"]["tags"]) == 3
+
+        tag_begins = [tag["begin"] for tag in parsed["format"]["tags"]]
+        assert f"<|channel|>commentary to={tool_name}" in tag_begins
+        assert f"<|channel|>analysis to={tool_name}" in tag_begins
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..70047a993c3f902233db485e34efd15c4b91027f
--- /dev/null
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Unit tests for reasoning-aware structured output functionality (PR #25515)."""
+
+from unittest.mock import Mock
+
+import pytest
+
+from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.reasoning import ReasoningParser
+from vllm.v1.request import Request
+from vllm.v1.structured_output import StructuredOutputManager
+
+
+class TestReasoningStructuredOutput:
+    """Test reasoning-aware structured output functionality."""
+
+    @pytest.fixture
+    def mock_model_config(self):
+        """Create a mock ModelConfig."""
+        config = Mock(spec=ModelConfig)
+        config.skip_tokenizer_init = True  # Skip tokenizer init to avoid network calls
+        config.get_vocab_size = Mock(return_value=50000)
+        # Add missing runner_type attribute that tokenizer initialization expects
+        config.runner_type = "generate"
+        # Add other attributes that tokenizer initialization might need
+        config.tokenizer = "test-tokenizer"
+        config.tokenizer_mode = "auto"
+        config.trust_remote_code = False
+        config.tokenizer_revision = None
+        return config
+
+    @pytest.fixture
+    def mock_scheduler_config(self):
+        """Create a mock SchedulerConfig."""
+        config = Mock(spec=SchedulerConfig)
+        config.max_num_seqs = 128
+        return config
+
+    @pytest.fixture
+    def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
+        """Create a mock VllmConfig."""
+        config = Mock(spec=VllmConfig)
+        config.model_config = mock_model_config
+        config.scheduler_config = mock_scheduler_config
+        config.structured_outputs_config = Mock()
+        config.structured_outputs_config.reasoning_parser = None
+        config.structured_outputs_config.enable_in_reasoning = False
+        config.speculative_config = None
+        return config
+
+    @pytest.fixture
+    def mock_reasoning_parser(self):
+        """Create a mock ReasoningParser."""
+        parser = Mock(spec=ReasoningParser)
+        parser.is_reasoning_end = Mock(return_value=False)
+        return parser
+
+    @pytest.fixture
+    def mock_request_with_structured_output(self):
+        """Create a mock request with structured output."""
+        request = Mock(spec=Request)
+        request.structured_output_request = Mock()
+        request.structured_output_request.reasoning_ended = None
+        request.structured_output_request.grammar = Mock()
+        request.structured_output_request.grammar.is_terminated = Mock(
+            return_value=False
+        )
+        request.use_structured_output = True
+        request.prompt_token_ids = [1, 2, 3, 4, 5]
+        request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
+        return request
+
+    def test_should_fill_bitmask_with_enable_in_reasoning(
+        self, mock_vllm_config, mock_request_with_structured_output
+    ):
+        """Test should_fill_bitmask when enable_in_reasoning is True."""
+        # Enable enable_in_reasoning
+        mock_vllm_config.structured_outputs_config.enable_in_reasoning = True
+
+        manager = StructuredOutputManager(mock_vllm_config)
+
+        # Should always return True when enable_in_reasoning is enabled
+        result = manager.should_fill_bitmask(mock_request_with_structured_output)
+        assert result is True
+
+    def test_should_fill_bitmask_without_enable_in_reasoning(
+        self,
+        mock_vllm_config,
+        mock_request_with_structured_output,
+        mock_reasoning_parser,
+    ):
+        """Test should_fill_bitmask when enable_in_reasoning is False."""
+        # Keep enable_in_reasoning as False (default)
+        config = mock_vllm_config.structured_outputs_config
+        assert config.enable_in_reasoning is False
+
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner = mock_reasoning_parser
+
+        # Mock reasoning not ended
+        mock_reasoning_parser.is_reasoning_end.return_value = False
+
+        result = manager.should_fill_bitmask(mock_request_with_structured_output)
+
+        # Should set reasoning_ended and return its value
+        assert (
+            mock_request_with_structured_output.structured_output_request.reasoning_ended
+            is False
+        )
+        assert result is False
+
+    def test_should_fill_bitmask_no_reasoner(
+        self, mock_vllm_config, mock_request_with_structured_output
+    ):
+        """Test should_fill_bitmask when no reasoner is configured."""
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner = None
+
+        result = manager.should_fill_bitmask(mock_request_with_structured_output)
+
+        # Should default to True when no reasoner
+        assert result is True
+
+    def test_should_advance_with_enable_in_reasoning(
+        self,
+        mock_vllm_config,
+        mock_request_with_structured_output,
+        mock_reasoning_parser,
+    ):
+        """Test should_advance when enable_in_reasoning is True."""
+        # Enable enable_in_reasoning
+        mock_vllm_config.structured_outputs_config.enable_in_reasoning = True
+
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner = mock_reasoning_parser
+
+        # Should always return True when enable_in_reasoning is enabled
+        result = manager.should_advance(mock_request_with_structured_output)
+        assert result is True
+
+    def test_should_advance_reasoning_not_ended(
+        self,
+        mock_vllm_config,
+        mock_request_with_structured_output,
+        mock_reasoning_parser,
+    ):
+        """Test should_advance when reasoning has not ended."""
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner = mock_reasoning_parser
+
+        # Set reasoning as not ended
+        (
+            mock_request_with_structured_output.structured_output_request
+        ).reasoning_ended = False
+        mock_reasoning_parser.is_reasoning_end.return_value = False
+
+        result = manager.should_advance(mock_request_with_structured_output)
+
+        # Should return False since reasoning hasn't ended
+        assert result is False
+
+    def test_should_advance_reasoning_just_ended(
+        self,
+        mock_vllm_config,
+        mock_request_with_structured_output,
+        mock_reasoning_parser,
+    ):
+        """Test should_advance when reasoning ends in current step."""
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner = mock_reasoning_parser
+
+        # Set reasoning as not ended initially, but ends in this step
+        (
+            mock_request_with_structured_output.structured_output_request
+        ).reasoning_ended = False
+        mock_reasoning_parser.is_reasoning_end.return_value = True
+
+        result = manager.should_advance(mock_request_with_structured_output)
+
+        # Should set reasoning_ended to True but return False for this step
+        assert (
+            mock_request_with_structured_output.structured_output_request.reasoning_ended
+            is True
+        )
+        assert result is False
+
+    def test_should_advance_reasoning_already_ended(
+        self,
+        mock_vllm_config,
+        mock_request_with_structured_output,
+        mock_reasoning_parser,
+    ):
+        """Test should_advance when reasoning has already ended."""
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner = mock_reasoning_parser
+
+        # Set reasoning as already ended
+        (
+            mock_request_with_structured_output.structured_output_request
+        ).reasoning_ended = True
+
+        result = manager.should_advance(mock_request_with_structured_output)
+
+        # Should return True since reasoning has ended
+        assert result is True
diff --git a/tests/v1/structured_output/test_utils.py b/tests/v1/structured_output/test_utils.py
index 4e7c4b33e8c47b846c34307e1999ed10165a01f3..513a21dd6bb39a931efba344538ee29cc8d3d3f5 100644
--- a/tests/v1/structured_output/test_utils.py
+++ b/tests/v1/structured_output/test_utils.py
@@ -4,88 +4,50 @@
 import pytest
 
 from vllm.v1.structured_output.backend_xgrammar import (
-    has_xgrammar_unsupported_json_features)
+    has_xgrammar_unsupported_json_features,
+)
+
+pytestmark = pytest.mark.cpu_test
 
 
 @pytest.fixture
 def unsupported_string_schemas():
     return [
-        {
-            "type": "string",
-            "format": "email"
-        },
+        {"type": "string", "format": "non_existing_format"},
     ]
 
 
 @pytest.fixture
 def unsupported_integer_schemas():
     return [
-        {
-            "type": "integer",
-            "multipleOf": 120
-        },
+        {"type": "integer", "multipleOf": 120},
     ]
 
 
 @pytest.fixture
 def unsupported_number_schemas():
     return [
-        {
-            "type": "number",
-            "multipleOf": 120
-        },
+        {"type": "number", "multipleOf": 120},
     ]
 
 
 @pytest.fixture
 def unsupported_array_schemas():
     return [
-        {
-            "type": "array",
-            "uniqueItems": True
-        },
-        {
-            "type": "array",
-            "contains": {
-                "type": "string"
-            }
-        },
-        {
-            "type": "array",
-            "minContains": 1
-        },
-        {
-            "type": "array",
-            "maxContains": 5
-        },
+        {"type": "array", "uniqueItems": True},
+        {"type": "array", "contains": {"type": "string"}},
+        {"type": "array", "minContains": 1},
+        {"type": "array", "maxContains": 5},
     ]
 
 
 @pytest.fixture
 def unsupported_object_schemas():
     return [
-        {
-            "type": "object",
-            "minProperties": 1
-        },
-        {
-            "type": "object",
-            "maxProperties": 5
-        },
-        {
-            "type": "object",
-            "propertyNames": {
-                "pattern": "^[a-z]+$"
-            }
-        },
-        {
-            "type": "object",
-            "patternProperties": {
-                "^S": {
-                    "type": "string"
-                }
-            }
-        },
+        {"type": "object", "minProperties": 1},
+        {"type": "object", "maxProperties": 5},
+        {"type": "object", "propertyNames": {"pattern": "^[a-z]+$"}},
+        {"type": "object", "patternProperties": {"^S": {"type": "string"}}},
     ]
 
 
@@ -94,75 +56,51 @@ def supported_schema():
     return {
         "type": "object",
         "properties": {
-            "name": {
-                "type": "string"
-            },
-            "age": {
-                "type": "integer"
-            },
-            "status": {
-                "type": "string"
-            },
-            "scores": {
-                "type": "array",
-                "items": {
-                    "type": "number"
-                }
-            },
-            "car_type": {
-                "type": "string",
-                "enum": ["sedan", "suv", "truck"]
-            },
-            "car_brand": {
-                "type": "string",
-                "pattern": "^[a-zA-Z]+$"
-            },
-            "short_description": {
-                "type": "string",
-                "maxLength": 50
-            },
-            "mileage": {
-                "type": "number",
-                "minimum": 0,
-                "maximum": 1000000
-            },
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+            "email": {"type": "string", "format": "email"},
+            "status": {"type": "string"},
+            "scores": {"type": "array", "items": {"type": "number"}},
+            "car_type": {"type": "string", "enum": ["sedan", "suv", "truck"]},
+            "car_brand": {"type": "string", "pattern": "^[a-zA-Z]+$"},
+            "short_description": {"type": "string", "maxLength": 50},
+            "mileage": {"type": "number", "minimum": 0, "maximum": 1000000},
             "model_year": {
                 "type": "integer",
                 "exclusiveMinimum": 1900,
-                "exclusiveMaximum": 2100
-            },
-            "long_description": {
-                "type": "string",
-                "minLength": 50,
-                "maxLength": 2000
+                "exclusiveMaximum": 2100,
             },
+            "long_description": {"type": "string", "minLength": 50, "maxLength": 2000},
             "address": {
                 "type": "object",
                 "properties": {
-                    "street": {
-                        "type": "string"
-                    },
-                    "city": {
-                        "type": "string"
-                    }
-                }
-            }
-        }
+                    "street": {"type": "string"},
+                    "city": {"type": "string"},
+                },
+            },
+        },
     }
 
 
-@pytest.mark.parametrize("schema_type", [
-    "unsupported_string_schemas", "unsupported_integer_schemas",
-    "unsupported_number_schemas", "unsupported_array_schemas",
-    "unsupported_object_schemas"
-])
+@pytest.mark.parametrize(
+    "schema_type",
+    [
+        "unsupported_string_schemas",
+        "unsupported_integer_schemas",
+        "unsupported_number_schemas",
+        "unsupported_array_schemas",
+        "unsupported_object_schemas",
+    ],
+)
 def test_unsupported_json_features_by_type(schema_type, request):
     schemas = request.getfixturevalue(schema_type)
     for schema in schemas:
-        assert has_xgrammar_unsupported_json_features(
-            schema), f"Schema should be unsupported: {schema}"
+        assert has_xgrammar_unsupported_json_features(schema), (
+            f"Schema should be unsupported: {schema}"
+        )
 
 
 def test_supported_json_features(supported_schema):
-    assert not has_xgrammar_unsupported_json_features(
-        supported_schema), "Schema should be supported"
+    assert not has_xgrammar_unsupported_json_features(supported_schema), (
+        "Schema should be supported"
+    )
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 74aa20a2f7f9c2c9cebad3bdebe41c4d01f4897c..f989f0744166cf7f6aa4cb7cc5b102c4d6716349 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -1,56 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-
 import pytest
 
-import vllm.envs as envs
-from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 
 
-def test_reject_bad_config(monkeypatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-
-
-def test_unsupported_configs(monkeypatch):
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                speculative_config={
-                    "model": MODEL,
-                },
-            ).create_engine_config()
-
-
-def test_enable_by_default_fallback(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Should default to V1 for supported config.
-        _ = AsyncEngineArgs(
+def test_unsupported_configs():
+    with pytest.raises(NotImplementedError):
+        AsyncEngineArgs(
             model=MODEL,
-            enforce_eager=True,
+            speculative_config={
+                "model": MODEL,
+            },
         ).create_engine_config()
-        assert envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
-
-
-def test_v1_llm_by_default(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Should default to V1 for supported config.
-        llm = LLM(MODEL, enforce_eager=True, enable_lora=True)
-        print(llm.generate("Hello my name is"))
-        assert hasattr(llm.llm_engine, "engine_core")
-        m.delenv("VLLM_USE_V1")
diff --git a/tests/v1/test_outputs.py b/tests/v1/test_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..af9df844249ef711d4f606245884ce6477e2a623
--- /dev/null
+++ b/tests/v1/test_outputs.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest import TestCase
+
+from vllm.v1.outputs import LogprobsLists
+
+
+class TestLogprobsLists(TestCase):
+    def setUp(self):
+        self.logprobsLists = LogprobsLists(
+            logprob_token_ids=[
+                [1, 2],  # Request 0 token 0
+                [3, 4],  # Request 0 token 1
+                [5, 6],  # Request 1 token 0
+                [7, 8],  # Request 1 token 1
+                [9, 10],  # Request 1 token 2
+                [11, 12],  # Request 2 token 0
+                [13, 14],  # Request 2 token 1
+                [15, 16],  # Request 2 token 2
+                [17, 18],  # Request 2 token 3
+            ],
+            logprobs=[
+                [0.1, 0.2],
+                [0.3, 0.4],
+                [0.5, 0.6],
+                [0.7, 0.8],
+                [0.9, 1.0],
+                [1.1, 1.2],
+                [1.3, 1.4],
+                [1.5, 1.6],
+                [1.7, 1.8],
+            ],
+            sampled_token_ranks=[1, 3, 5, 7, 9, 11, 13, 15, 17],
+            cu_num_generated_tokens=[0, 2, 5, 9],
+        )
+
+    def test_slice_without_cu_num_generated_tokens(self):
+        """Test slicing without cu_num_generated_tokens"""
+        logprobsLists = LogprobsLists(
+            logprob_token_ids=[[1], [2], [3]],
+            logprobs=[[0.1], [0.2], [0.3]],
+            sampled_token_ranks=[1, 2, 3],
+            cu_num_generated_tokens=None,
+        )
+
+        sliced = logprobsLists.slice(1, 3)
+        assert sliced.logprob_token_ids == [[2], [3]]
+        assert sliced.logprobs == [[0.2], [0.3]]
+        assert sliced.sampled_token_ranks == [2, 3]
+        assert sliced.cu_num_generated_tokens is None
+
+    def test_slice_from_start(self):
+        """Test slicing from the start position"""
+        sliced = self.logprobsLists.slice(0, 2)
+        assert len(sliced.logprob_token_ids) == 5
+        assert sliced.logprob_token_ids == [
+            [1, 2],
+            [3, 4],
+            [5, 6],
+            [7, 8],
+            [9, 10],
+        ]
+        assert sliced.cu_num_generated_tokens == [0, 2, 5]
+
+    def test_slice_from_middle(self):
+        """Test slicing from the middle position"""
+        sliced = self.logprobsLists.slice(1, 3)
+        assert len(sliced.logprob_token_ids) == 7
+        assert sliced.logprob_token_ids == [
+            [5, 6],
+            [7, 8],
+            [9, 10],
+            [11, 12],
+            [13, 14],
+            [15, 16],
+            [17, 18],
+        ]
+        assert sliced.cu_num_generated_tokens == [0, 3, 7]
+
+    def test_slice_single_request(self):
+        """Test slicing a single request"""
+        sliced = self.logprobsLists.slice(1, 2)
+        assert len(sliced.logprob_token_ids) == 3
+        assert sliced.logprob_token_ids == [[5, 6], [7, 8], [9, 10]]
+        assert sliced.cu_num_generated_tokens == [0, 3]
+
+    def test_slice_last_request(self):
+        """Test slicing the last request"""
+        sliced = self.logprobsLists.slice(2, 3)
+        assert len(sliced.logprob_token_ids) == 4
+        assert sliced.logprob_token_ids == [[11, 12], [13, 14], [15, 16], [17, 18]]
+        assert sliced.cu_num_generated_tokens == [0, 4]
+
+    def test_slice_all_requests(self):
+        """Test slicing all requests (full slice)"""
+        sliced = self.logprobsLists.slice(0, 3)
+        assert len(sliced.logprob_token_ids) == 9  # All tokens
+        assert sliced.logprob_token_ids == self.logprobsLists.logprob_token_ids
+        assert (
+            sliced.cu_num_generated_tokens == self.logprobsLists.cu_num_generated_tokens
+        )
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index 118b40d0ef4188e0e914a4ac9f0041f1598a2e8f..00749c5415c8e843814915c19279f759660c68f1 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -2,23 +2,27 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import UserDict
 from dataclasses import dataclass
-from typing import Optional
 
 import msgspec
 import numpy as np
 import pytest
 import torch
 
-from vllm.multimodal.inputs import (MultiModalBatchedField,
-                                    MultiModalFieldElem, MultiModalFlatField,
-                                    MultiModalKwargsItem,
-                                    MultiModalKwargsItems,
-                                    MultiModalSharedField, NestedTensors)
+from vllm.multimodal.inputs import (
+    MultiModalBatchedField,
+    MultiModalFieldElem,
+    MultiModalFlatField,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalSharedField,
+    NestedTensors,
+)
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 
+pytestmark = pytest.mark.cpu_test
 
-class UnrecognizedType(UserDict):
 
+class UnrecognizedType(UserDict):
     def __init__(self, an_int: int):
         super().__init__()
         self.an_int = an_int
@@ -45,10 +49,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):
         m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
         obj = MyType(
-            tensor1=torch.randint(low=0,
-                                  high=100,
-                                  size=(1024, ),
-                                  dtype=torch.int32),
+            tensor1=torch.randint(low=0, high=100, size=(1024,), dtype=torch.int32),
             a_string="hello",
             list_of_tensors=[
                 torch.rand((1, 10), dtype=torch.float32),
@@ -56,8 +57,9 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):
                 torch.tensor(1984),  # test scalar too
                 # Make sure to test bf16 which numpy doesn't support.
                 torch.rand((3, 5, 1000), dtype=torch.bfloat16),
-                torch.tensor([float("-inf"), float("inf")] * 1024,
-                             dtype=torch.bfloat16),
+                torch.tensor(
+                    [float("-inf"), float("inf")] * 1024, dtype=torch.bfloat16
+                ),
             ],
             numpy_array=np.arange(512),
             unrecognized=UnrecognizedType(33),
@@ -97,26 +99,28 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):
 
 
 class MyRequest(msgspec.Struct):
-    mm: Optional[list[MultiModalKwargsItems]]
+    mm: list[MultiModalKwargsItems] | None
 
 
 def test_multimodal_kwargs():
-    e1 = MultiModalFieldElem("audio", "a0",
-                             torch.zeros(1000, dtype=torch.bfloat16),
-                             MultiModalBatchedField())
+    e1 = MultiModalFieldElem(
+        "audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField()
+    )
     e2 = MultiModalFieldElem(
         "video",
         "v0",
         [torch.zeros(1000, dtype=torch.int8) for _ in range(4)],
-        MultiModalFlatField(
-            [[slice(1, 2, 3), slice(4, 5, 6)], [slice(None, 2)]], 0),
+        MultiModalFlatField([[slice(1, 2, 3), slice(4, 5, 6)], [slice(None, 2)]], 0),
+    )
+    e3 = MultiModalFieldElem(
+        "image", "i0", torch.zeros(1000, dtype=torch.int32), MultiModalSharedField(4)
     )
-    e3 = MultiModalFieldElem("image", "i0", torch.zeros(1000,
-                                                        dtype=torch.int32),
-                             MultiModalSharedField(4))
     e4 = MultiModalFieldElem(
-        "image", "i1", torch.zeros(1000, dtype=torch.int32),
-        MultiModalFlatField([slice(1, 2, 3), slice(4, 5, 6)], 2))
+        "image",
+        "i1",
+        torch.zeros(1000, dtype=torch.int32),
+        MultiModalFlatField([slice(1, 2, 3), slice(4, 5, 6)], 2),
+    )
     audio = MultiModalKwargsItem.from_elems([e1])
     video = MultiModalKwargsItem.from_elems([e2])
     image = MultiModalKwargsItem.from_elems([e3, e4])
@@ -162,16 +166,14 @@ def assert_equal(obj1: MyType, obj2: MyType):
     assert torch.equal(obj1.tensor1, obj2.tensor1)
     assert obj1.a_string == obj2.a_string
     assert all(
-        torch.equal(a, b)
-        for a, b in zip(obj1.list_of_tensors, obj2.list_of_tensors))
+        torch.equal(a, b) for a, b in zip(obj1.list_of_tensors, obj2.list_of_tensors)
+    )
     assert np.array_equal(obj1.numpy_array, obj2.numpy_array)
     assert obj1.unrecognized.an_int == obj2.unrecognized.an_int
     assert torch.equal(obj1.small_f_contig_tensor, obj2.small_f_contig_tensor)
     assert torch.equal(obj1.large_f_contig_tensor, obj2.large_f_contig_tensor)
-    assert torch.equal(obj1.small_non_contig_tensor,
-                       obj2.small_non_contig_tensor)
-    assert torch.equal(obj1.large_non_contig_tensor,
-                       obj2.large_non_contig_tensor)
+    assert torch.equal(obj1.small_non_contig_tensor, obj2.small_non_contig_tensor)
+    assert torch.equal(obj1.large_non_contig_tensor, obj2.large_non_contig_tensor)
     assert torch.equal(obj1.empty_tensor, obj2.empty_tensor)
 
 
@@ -208,8 +210,9 @@ def test_tensor_serialization():
     decoded = decoder.decode(encoded)
 
     # Verify the decoded tensor matches the original
-    assert torch.allclose(
-        tensor, decoded), "Decoded tensor does not match the original tensor."
+    assert torch.allclose(tensor, decoded), (
+        "Decoded tensor does not match the original tensor."
+    )
 
 
 def test_numpy_array_serialization():
@@ -227,13 +230,12 @@ def test_numpy_array_serialization():
     decoded = decoder.decode(encoded)
 
     # Verify the decoded array matches the original
-    assert np.allclose(
-        array,
-        decoded), "Decoded numpy array does not match the original array."
+    assert np.allclose(array, decoded), (
+        "Decoded numpy array does not match the original array."
+    )
 
 
 class CustomClass:
-
     def __init__(self, value):
         self.value = value
 
@@ -242,7 +244,8 @@ class CustomClass:
 
 
 def test_custom_class_serialization_allowed_with_pickle(
-        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch: pytest.MonkeyPatch,
+):
     """Test that serializing a custom class succeeds when allow_pickle=True."""
 
     with monkeypatch.context() as m:
@@ -259,8 +262,7 @@ def test_custom_class_serialization_allowed_with_pickle(
         decoded = decoder.decode(encoded)
 
         # Verify the decoded object matches the original
-        assert obj == decoded, (
-            "Decoded object does not match the original object.")
+        assert obj == decoded, "Decoded object does not match the original object."
 
 
 def test_custom_class_serialization_disallowed_without_pickle():
diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index 865b58bc7f4b0c65d928dfa685b13adc9d8af76d..0d53a02476fabaf33eac300638fd6280ce6acf30 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -4,7 +4,6 @@
 
 Run `pytest tests/v1/tpu/test_basic.py`.
 """
-from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
@@ -15,6 +14,8 @@ from vllm.platforms import current_platform
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
 
 MODELS = [
     "Qwen/Qwen2.5-1.5B-Instruct",
@@ -32,51 +33,51 @@ MAX_NUM_REQS = [16, 1024]
 # TENSOR_PARALLEL_SIZES = [1, 4]
 
 
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This is a basic test for TPU only")
+@pytest.mark.skipif(
+    not current_platform.is_tpu(), reason="This is a basic test for TPU only"
+)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
 @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     model: str,
     max_tokens: int,
     tensor_parallel_size: int,
     max_num_seqs: int,
 ) -> None:
-    prompt = "The next numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
+    prompt = (
+        "The next numbers of the sequence "
+        + ", ".join(str(i) for i in range(1024))
+        + " are:"
+    )
     example_prompts = [prompt]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+        model,
+        # Note: max_num_batched_tokens == 1024 is needed here to
+        # actually test chunked prompt
+        max_num_batched_tokens=1024,
+        max_model_len=8192,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]
 
-        with vllm_runner(
-                model,
-                # Note: max_num_batched_tokens == 1024 is needed here to
-                # actually test chunked prompt
-                max_num_batched_tokens=1024,
-                max_model_len=8192,
-                gpu_memory_utilization=0.7,
-                max_num_seqs=max_num_seqs,
-                tensor_parallel_size=tensor_parallel_size) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-        output = vllm_outputs[0][1]
-
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output
 
 
 @pytest.mark.skip(reason="Temporarily disabled due to timeout")
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This is a basic test for TPU only")
+@pytest.mark.skipif(
+    not current_platform.is_tpu(), reason="This is a basic test for TPU only"
+)
 @pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     max_tokens: int,
     max_num_seqs: int,
 ) -> None:
@@ -93,30 +94,27 @@ def test_phi3(
     # test head dim = 96
     model = "microsoft/Phi-3-mini-128k-instruct"
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(model,
-                         max_num_batched_tokens=256,
-                         max_num_seqs=max_num_seqs) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-        # vllm_outputs is a list of tuples whose first element is the token id
-        # and the second element is the output (including the prompt).
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
+    with vllm_runner(
+        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    # vllm_outputs is a list of tuples whose first element is the token id
+    # and the second element is the output (including the prompt).
+    for output, answer in zip(vllm_outputs, answers):
+        generated_text = output[1]
+        assert answer in generated_text
 
 
 TP_SIZE_8 = 8
 
 
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This is a test for TPU only")
-@pytest.mark.skipif(tpu.num_available_chips() < TP_SIZE_8,
-                    reason=f"This test requires {TP_SIZE_8} TPU chips.")
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This is a test for TPU only")
+@pytest.mark.skipif(
+    tpu.num_available_chips() < TP_SIZE_8,
+    reason=f"This test requires {TP_SIZE_8} TPU chips.",
+)
 def test_gemma3_27b_with_text_input_and_tp(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     model = "google/gemma-3-27b-it"
     max_tokens = 16
@@ -133,49 +131,47 @@ def test_gemma3_27b_with_text_input_and_tp(
         " but in rising every time we fall.",
     ]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-                model,
-                max_num_batched_tokens=256,
-                max_num_seqs=max_num_seqs,
-                tensor_parallel_size=tensor_parallel_size) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-        # vllm_outputs is a list of tuples whose first element is the token id
-        # and the second element is the output (including the prompt).
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
-
-
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This is a basic test for TPU only")
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=256,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    # vllm_outputs is a list of tuples whose first element is the token id
+    # and the second element is the output (including the prompt).
+    for output, answer in zip(vllm_outputs, answers):
+        generated_text = output[1]
+        assert answer in generated_text
+
+
+@pytest.mark.skipif(
+    not current_platform.is_tpu(), reason="This is a basic test for TPU only"
+)
 def test_w8a8_quantization(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
     max_tokens = 5
     tensor_parallel_size = 1
     max_num_seqs = 4
 
-    prompt = "The next numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
+    prompt = (
+        "The next numbers of the sequence "
+        + ", ".join(str(i) for i in range(1024))
+        + " are:"
+    )
     example_prompts = [prompt]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-                model,
-                max_num_batched_tokens=64,
-                max_model_len=4096,
-                gpu_memory_utilization=0.7,
-                max_num_seqs=max_num_seqs,
-                tensor_parallel_size=tensor_parallel_size) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-        output = vllm_outputs[0][1]
-
-        assert "1024" in output or "0, 1" in output
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=64,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]
+
+    assert "1024" in output or "0, 1" in output
diff --git a/tests/v1/tpu/test_kv_cache_update_kernel.py b/tests/v1/tpu/test_kv_cache_update_kernel.py
index acb607247d754e07e4a9b531087d82db81348797..99d5f98351ad2a6eee493ed7c9708fd77fc2f02f 100644
--- a/tests/v1/tpu/test_kv_cache_update_kernel.py
+++ b/tests/v1/tpu/test_kv_cache_update_kernel.py
@@ -10,61 +10,69 @@ import vllm.v1.attention.backends.pallas  # noqa: F401
 from vllm.platforms import current_platform
 
 
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This is a test for TPU only")
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This is a test for TPU only")
 @pytest.mark.parametrize("page_size", [32, 33])
 @pytest.mark.parametrize("combined_kv_head_num", [2, 16])
 @pytest.mark.parametrize("head_dim", [128, 256])
 @pytest.mark.parametrize("num_slices_per_block", [4, 8])
-def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int,
-                                head_dim: int, num_slices_per_block: int):
+def test_kv_cache_update_kernel(
+    page_size: int, combined_kv_head_num: int, head_dim: int, num_slices_per_block: int
+):
     page_num = 1000
     padded_num_tokens = 128
     kv_cache_cpu = torch.zeros(
         (page_num * page_size, combined_kv_head_num, head_dim),
         dtype=torch.bfloat16,
-        device="cpu")
+        device="cpu",
+    )
     kv_cache_xla = kv_cache_cpu.to(torch_xla.device())
     new_kv_cpu = torch.randn(
         (padded_num_tokens, combined_kv_head_num, head_dim),
         dtype=torch.bfloat16,
-        device="cpu")
+        device="cpu",
+    )
     new_kv_xla = new_kv_cpu.to(torch_xla.device())
-    slice_lens = np.array([7, page_size, page_size, 1, 1, 1, 9],
-                          dtype=np.int32)
+    slice_lens = np.array([7, page_size, page_size, 1, 1, 1, 9], dtype=np.int32)
     num_kv_update_slices = len(slice_lens)
-    kv_cache_start_indices = np.array([
-        page_size * 2 - 7, page_size * 2, page_size * 3, page_size * 4 + 6,
-        page_size * 5 + 7, page_size * 6 + 8, page_size * 15 + 3
-    ],
-                                      dtype=np.int32)
+    kv_cache_start_indices = np.array(
+        [
+            page_size * 2 - 7,
+            page_size * 2,
+            page_size * 3,
+            page_size * 4 + 6,
+            page_size * 5 + 7,
+            page_size * 6 + 8,
+            page_size * 15 + 3,
+        ],
+        dtype=np.int32,
+    )
     new_kv_cache_indices = np.concatenate(
-        [np.array([0], dtype=np.int32),
-         np.cumsum(slice_lens[:-1])])
+        [np.array([0], dtype=np.int32), np.cumsum(slice_lens[:-1])]
+    )
     slot_mapping = np.stack(
-        [kv_cache_start_indices, new_kv_cache_indices, slice_lens], axis=1)
+        [kv_cache_start_indices, new_kv_cache_indices, slice_lens], axis=1
+    )
     slot_mapping = np.transpose(slot_mapping)
-    slot_mapping_cpu = torch.tensor(slot_mapping,
-                                    device="cpu",
-                                    dtype=torch.int32)
+    slot_mapping_cpu = torch.tensor(slot_mapping, device="cpu", dtype=torch.int32)
     slot_mapping_xla = slot_mapping_cpu.to(torch_xla.device())
-    num_kv_update_slices_xla = torch.tensor([num_kv_update_slices],
-                                            device=torch_xla.device(),
-                                            dtype=torch.int32)
+    num_kv_update_slices_xla = torch.tensor(
+        [num_kv_update_slices], device=torch_xla.device(), dtype=torch.int32
+    )
     torch_xla.sync()
 
     torch.ops.xla.dynamo_set_buffer_donor_(kv_cache_xla, True)
     new_kv_cache_xla = torch.ops.xla.kv_cache_update_op(
-        new_kv_xla, slot_mapping_xla, kv_cache_xla, num_kv_update_slices_xla,
-        page_size, num_slices_per_block)
+        new_kv_xla,
+        slot_mapping_xla,
+        kv_cache_xla,
+        num_kv_update_slices_xla,
+        page_size,
+        num_slices_per_block,
+    )
     kv_cache_xla.copy_(new_kv_cache_xla)
     torch_xla.sync()
 
-    for ni, ci, sl in zip(new_kv_cache_indices, kv_cache_start_indices,
-                          slice_lens):
-        kv_cache_cpu[ci:ci + sl, :, :] = new_kv_cpu[ni:ni + sl, :, :]
+    for ni, ci, sl in zip(new_kv_cache_indices, kv_cache_start_indices, slice_lens):
+        kv_cache_cpu[ci : ci + sl, :, :] = new_kv_cpu[ni : ni + sl, :, :]
 
-    assert torch.allclose(kv_cache_xla.cpu(),
-                          kv_cache_cpu,
-                          atol=1e-4,
-                          rtol=1e-4)
+    assert torch.allclose(kv_cache_xla.cpu(), kv_cache_cpu, atol=1e-4, rtol=1e-4)
diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py
index 9d690851b70eb13acc060f23d1c6de6a3774072b..5debdf85bea8de41c3bc38f48171e41d82b492fa 100644
--- a/tests/v1/tpu/test_mha_attn.py
+++ b/tests/v1/tpu/test_mha_attn.py
@@ -19,8 +19,7 @@ from vllm.platforms import current_platform
 
 @pytest.fixture(autouse=True)
 def clear_cache():
-    """Clear lru cache to ensure each test case runs without caching.
-    """
+    """Clear lru cache to ensure each test case runs without caching."""
     _cached_get_attn_backend.cache_clear()
 
 
@@ -49,8 +48,7 @@ NUM_KV_HEADS = [1]
 HEAD_SIZES = [64, 80]
 
 
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This test needs a TPU")
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("seq_len", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -68,19 +66,12 @@ def test_mha_attn_forward(
     current_platform.seed_everything(0)
     # These are expected to be f32
     q = torch.randn(batch_size, seq_len, num_heads * head_size, device=device)
-    k = torch.randn(batch_size,
-                    seq_len,
-                    num_kv_heads * head_size,
-                    device=device)
-    v = torch.randn(batch_size,
-                    seq_len,
-                    num_kv_heads * head_size,
-                    device=device)
+    k = torch.randn(batch_size, seq_len, num_kv_heads * head_size, device=device)
+    v = torch.randn(batch_size, seq_len, num_kv_heads * head_size, device=device)
     scale = 1.0 / head_size**0.5
-    attn = MultiHeadAttention(num_heads,
-                              head_size,
-                              scale=scale,
-                              num_kv_heads=num_kv_heads)
+    attn = MultiHeadAttention(
+        num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
+    )
     output = attn(q, k, v)
 
     assert num_heads % num_kv_heads == 0
diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py
index 9947fcbe73135168788579e02e3f14350a3eeea6..5bf823417d4dc7b4775116fc88d6be8beec5c9d7 100644
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
@@ -14,38 +14,32 @@ from ...utils import RemoteOpenAIServer
 @pytest.fixture(scope="session")
 def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_asset:
-        encode_image_base64(local_asset_server.get_image_asset(image_asset))
+        image_asset: encode_image_base64(
+            local_asset_server.get_image_asset(image_asset)
+        )
         for image_asset in TEST_IMAGE_ASSETS
     }
 
 
 @pytest.mark.asyncio
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This test needs a TPU")
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
 @pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
-async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
-                                                                        str]):
-
+async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]):
     pytest.skip("Skip this test until it's fixed.")
 
     def whats_in_this_image_msg(b64):
-        return [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{b64}"
+        return [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                     },
-                },
-            ],
-        }]
+                ],
+            }
+        ]
 
     server_args = [
         "--max-model-len",
@@ -62,19 +56,20 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
     ]
 
     # Server will pre-compile on first startup (takes a long time).
-    with RemoteOpenAIServer(model_name, server_args,
-                            max_wait_seconds=600) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, max_wait_seconds=600
+    ) as remote_server:
         client: openai.AsyncOpenAI = remote_server.get_async_client()
 
         # Other requests now should be much faster
         for image_url in TEST_IMAGE_ASSETS:
             image_base64 = base64_encoded_image[image_url]
-            chat_completion_from_base64 = await client.chat.completions\
-                .create(
+            chat_completion_from_base64 = await client.chat.completions.create(
                 model=model_name,
                 messages=whats_in_this_image_msg(image_base64),
                 max_completion_tokens=24,
-                temperature=0.0)
+                temperature=0.0,
+            )
             result = chat_completion_from_base64
             assert result
             choice = result.choices[0]
diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py
index 1bc8dff317a7474e2e11c58fa43389b6d9a551a1..0a994e99bade13cac0e98c15cd34dc23b9af3ea4 100644
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
@@ -5,8 +5,7 @@ from unittest.mock import ANY, patch
 import torch
 
 from vllm.attention.backends.abstract import AttentionType
-from vllm.v1.attention.backends.pallas import (PallasAttentionBackendImpl,
-                                               PallasMetadata)
+from vllm.v1.attention.backends.pallas import PallasAttentionBackendImpl, PallasMetadata
 
 
 def test_ragged_paged_attention():
@@ -53,14 +52,14 @@ def test_ragged_paged_attention():
     max_num_reqs = 8
     max_num_blocks_per_req = 8
     num_kv_update_slices = torch.tensor([num_tokens], dtype=torch.int32)
-    block_tables = torch.zeros((max_num_reqs, max_num_blocks_per_req),
-                               dtype=torch.int32)
-    context_lens = torch.ones((max_num_reqs, ), dtype=torch.int32)
+    block_tables = torch.zeros(
+        (max_num_reqs, max_num_blocks_per_req), dtype=torch.int32
+    )
+    context_lens = torch.ones((max_num_reqs,), dtype=torch.int32)
     query_lens = [1] * max_num_reqs
-    query_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
-                                                dtype=torch.int32),
-                                   dim=0,
-                                   dtype=torch.int32)
+    query_start_loc = torch.cumsum(
+        torch.tensor([0] + query_lens, dtype=torch.int32), dim=0, dtype=torch.int32
+    )
     num_seqs = torch.tensor([max_num_reqs], dtype=torch.int32)
     attn_metadata = PallasMetadata(
         slot_mapping=slot_mapping,
@@ -72,8 +71,7 @@ def test_ragged_paged_attention():
         num_slices_per_kv_cache_update_block=8,
     )
 
-    with patch("torch.ops.xla.ragged_paged_attention"
-               ) as mock_ragged_paged_attention:
+    with patch("torch.ops.xla.ragged_paged_attention") as mock_ragged_paged_attention:
         attn_impl.forward(
             layer=layer,
             query=query,
diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index f4a2d5ac853a8895c4700e543beb143f00c97cd9..e230491cddb012a66587216007efdaf747683843 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -4,7 +4,6 @@
 
 Run `pytest tests/v1/tpu/test_perf.py`.
 """
-from __future__ import annotations
 
 import time
 from dataclasses import dataclass
@@ -19,6 +18,8 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
 
 
 @dataclass
@@ -37,7 +38,6 @@ TEST_PARAMS = [
     #   open(/dev/vfio/0): Device or resource busy: Device or resource busy;
     #   Couldn't open iommu group /dev/vfio/0
     # => Investigate
-
     # TestParams(
     #     model="Qwen/Qwen2.5-1.5B-Instruct",
     #     num_prompts=1,
@@ -59,16 +59,14 @@ TEST_PARAMS = [
         num_prompts=64,
         prefix_len=500,
         decode_len=50,
-
         # commit id: ccb246776d93ef105904a8ec015b3587240a1183
         # tpu: v5lite (old vllm CI/CD)
         # expected_avg_time=1.4,
         # err_tol=0.30,
-
         # (This is the active CI/CD instance)
         # commit id: ccb246776d93ef105904a8ec015b3587240a1183
         # tpu: v6e (current vllm CI/CD)
-        expected_avg_time=1.7,  # measured with VLLM_XLA_CACHE_PATH=  
+        expected_avg_time=1.7,  # measured with VLLM_XLA_CACHE_PATH=
         err_tol=0.20,
     ),
 ]
@@ -81,66 +79,72 @@ MAX_NUM_SEQS = 32
 GPU_UTIL = 0.9
 
 
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This is a basic performance test for TPU only")
+@pytest.mark.skipif(
+    not current_platform.is_tpu(),
+    reason="This is a basic performance test for TPU only",
+)
 @pytest.mark.parametrize("params", TEST_PARAMS)
 def test_perf(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     params: TestParams,
 ) -> None:
-    tokenizer = get_tokenizer(params.model,
-                              tokenizer_mode="auto",
-                              trust_remote_code=True)
+    tokenizer = get_tokenizer(
+        params.model, tokenizer_mode="auto", trust_remote_code=True
+    )
 
     prompts = []
     for i in range(params.num_prompts):
-        prefix_token_ids = np.random.randint(0,
-                                             tokenizer.vocab_size,
-                                             size=params.prefix_len).tolist()
+        prefix_token_ids = np.random.randint(
+            0, tokenizer.vocab_size, size=params.prefix_len
+        ).tolist()
         prompt = tokenizer.decode(prefix_token_ids)
         prompts.append(prompt)
 
     print(
         "-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(
-            len(prompts), params.prefix_len, params.decode_len))
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        sampling_params = SamplingParams(max_tokens=params.decode_len,
-                                         temperature=1.0,
-                                         min_p=0.0)
-
-        with vllm_runner(params.model,
-                         max_num_batched_tokens=MAX_MODEL_LEN,
-                         max_model_len=MAX_MODEL_LEN,
-                         max_num_seqs=MAX_NUM_SEQS,
-                         gpu_memory_utilization=GPU_UTIL,
-                         enforce_eager=False,
-                         tensor_parallel_size=1) as vllm_model:
-            print("  -- Warmup / Compile")
-            for i in range(NUM_WARMUPS):
-                _ = vllm_model.generate(prompts, sampling_params)
-
-            print("  -- Benchmarking... ")
-            times = []
-            for i in range(NUM_RUNS):
-                start_time = time.time()
-                _ = vllm_model.generate(prompts, sampling_params)
-                times.append(time.time() - start_time)
-
-            avg_time = sum(times) / len(times)
-
-            print("  -- avg_time = {}".format(avg_time))
-            print("  -- expected_avg_time = {} with err_tol = {}".format(
-                params.expected_avg_time, params.err_tol))
-            diff = avg_time - params.expected_avg_time
-            ok = diff < params.err_tol
-            if diff < -params.err_tol:
-                print("  !! WARNING !! Performance has improved by {}, "
-                      "it may be necessary to fine-tune the "
-                      "expected_avg_time = {}".format(
-                          -diff, params.expected_avg_time))
-
-            assert ok, " !! ERROR !! Regression detected"
+            len(prompts), params.prefix_len, params.decode_len
+        )
+    )
+
+    sampling_params = SamplingParams(
+        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+    )
+
+    with vllm_runner(
+        params.model,
+        max_num_batched_tokens=MAX_MODEL_LEN,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=MAX_NUM_SEQS,
+        gpu_memory_utilization=GPU_UTIL,
+        enforce_eager=False,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        print("  -- Warmup / Compile")
+        for i in range(NUM_WARMUPS):
+            _ = vllm_model.generate(prompts, sampling_params)
+
+        print("  -- Benchmarking... ")
+        times = []
+        for i in range(NUM_RUNS):
+            start_time = time.time()
+            _ = vllm_model.generate(prompts, sampling_params)
+            times.append(time.time() - start_time)
+
+        avg_time = sum(times) / len(times)
+
+        print("  -- avg_time = {}".format(avg_time))
+        print(
+            "  -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol
+            )
+        )
+        diff = avg_time - params.expected_avg_time
+        ok = diff < params.err_tol
+        if diff < -params.err_tol:
+            print(
+                "  !! WARNING !! Performance has improved by {}, "
+                "it may be necessary to fine-tune the "
+                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
+            )
+
+        assert ok, " !! ERROR !! Regression detected"
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py
index fa950e5f7f85b2519e2d2b24a0690cfd4346237a..58f6292b05a7249e3e8e2f782ffcaa174d007ffb 100644
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -10,21 +10,20 @@ from vllm.sampling_params import SamplingParams
 
 
 @pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This test needs a TPU")
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
 def test_sampler_different(model_name: str):
     """
-    Test significantly different sampling params to assert the model produces 
+    Test significantly different sampling params to assert the model produces
     different results.
     """
-    llm = LLM(model_name,
-              enforce_eager=False,
-              max_num_seqs=1,
-              max_model_len=512,
-              max_num_batched_tokens=256)
-    prompts = [
-        "Write a short story about a robot that dreams for the first time."
-    ]
+    llm = LLM(
+        model_name,
+        enforce_eager=False,
+        max_num_seqs=1,
+        max_model_len=512,
+        max_num_batched_tokens=256,
+    )
+    prompts = ["Write a short story about a robot that dreams for the first time."]
     sampling_params = SamplingParams(temperature=0.9, min_p=0.2, max_tokens=64)
     output = llm.generate(prompts, sampling_params)
 
@@ -47,7 +46,9 @@ def test_sampler_different(model_name: str):
                 max_tokens=64,
                 # Vary number of ks
                 top_k=random.randint(4, 12),
-                top_p=random.random()) for _ in range(B)
+                top_p=random.random(),
+            )
+            for _ in range(B)
         ]
         # Make sure first two reqs have the same K/P
         sampling_params[0] = sampling_params[1]
@@ -61,20 +62,18 @@ def test_sampler_different(model_name: str):
 @pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
 # TODO TPU will appear busy if we fan-out test params here
 @pytest.mark.parametrize("n_prompts", [1])
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This test needs a TPU")
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
 def test_logprobs(model_name: str, n_prompts: int):
     """
     Request top logprobs with different sampling settings and check
-    that results contains the requested number, ordered ascendingly.  
+    that results contains the requested number, ordered ascendingly.
     """
 
     def check_num_logprobs(logprobs, expected_num: int):
         for step in logprobs:
             prev_logp = 1.0
             # order by rank
-            sorted_step = dict(
-                sorted(step.items(), key=lambda item: item[1].rank))
+            sorted_step = dict(sorted(step.items(), key=lambda item: item[1].rank))
 
             # Can contain the sampled token
             assert len(step) == expected_num or len(step) == expected_num + 1
@@ -84,23 +83,23 @@ def test_logprobs(model_name: str, n_prompts: int):
                 prev_logp = logp.logprob
                 assert logp.rank == rankno + 1
 
-    llm = LLM(model_name,
-              enforce_eager=False,
-              max_num_seqs=1,
-              max_model_len=128,
-              max_num_batched_tokens=128)
+    llm = LLM(
+        model_name,
+        enforce_eager=False,
+        max_num_seqs=1,
+        max_model_len=128,
+        max_num_batched_tokens=128,
+    )
     prompts = [
         "Write a short story about a robot that dreams for the first time."
     ] * n_prompts
-    greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64,\
-         logprobs=4)
-    regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\
-         logprobs=4)
-    topkp_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\
-         logprobs=4, top_k=12, top_p=0.5)
+    greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64, logprobs=4)
+    regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64, logprobs=4)
+    topkp_sampling_params = SamplingParams(
+        temperature=0.4, max_tokens=64, logprobs=4, top_k=12, top_p=0.5
+    )
 
-    for sp in [greedy_sampling_params, regular_sampling_params, \
-               topkp_sampling_params]:
+    for sp in [greedy_sampling_params, regular_sampling_params, topkp_sampling_params]:
         output = llm.generate(prompts, sp)
         for o in output:
             check_num_logprobs(o.outputs[0].logprobs, 4)
diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py
index ad234df0c8ed752ac748e2944e170ef1a3ec0ab0..be866bf90a7922142f865a88621ad6acf50572c3 100644
--- a/tests/v1/tpu/test_spmd_model_weight_loading.py
+++ b/tests/v1/tpu/test_spmd_model_weight_loading.py
@@ -9,14 +9,18 @@ import torch_xla.distributed.spmd as xs
 import torch_xla.runtime as xr
 
 from vllm.config import set_current_vllm_config
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             init_distributed_environment)
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized,
+    init_distributed_environment,
+)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.model_loader.tpu import TPUModelLoader
 
 
 def _setup_environment(model):
-    engine_args = EngineArgs(model=model, )
+    engine_args = EngineArgs(
+        model=model,
+    )
     vllm_config = engine_args.create_engine_config()
     with set_current_vllm_config(vllm_config):
         temp_file = tempfile.mkstemp()[1]
@@ -25,7 +29,8 @@ def _setup_environment(model):
             0,
             local_rank=0,
             distributed_init_method=f"file://{temp_file}",
-            backend="gloo")
+            backend="gloo",
+        )
         # Under single worker mode, full model is init first and then
         # partitioned using GSPMD.
         ensure_model_parallel_initialized(1, 1)
@@ -42,7 +47,7 @@ def _get_spmd_mesh():
         num_devices = xr.global_runtime_device_count()
         mesh_shape = (num_devices, 1)
         device_ids = np.array(range(num_devices))
-        MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+        MESH = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
     return MESH
 
 
@@ -53,15 +58,17 @@ def _get_spmd_mesh():
         # Skip large models due to CI runner disk space limitations
         # "meta-llama/Llama-3.1-8B-Instruct",
         # "meta-llama/Llama-3.1-70B-Instruct",
-    ])
+    ],
+)
 def test_tpu_model_loader(model):
     # Skip the 70B test if there are less than 8 chips
     # TODO: Query using torch xla API, the query API is not working
     # with SPMD now. However, This test is running under SPMD mode.
-    if '70B' in model and xr.global_runtime_device_count() < 8:
+    if "70B" in model and xr.global_runtime_device_count() < 8:
         pytest.skip(
             "Skipping 70B model if the TPU VM has less than 8 chips to \
-                     avoid OOM.")
+                     avoid OOM."
+        )
 
     vllm_config = _setup_environment(model)
     loader = TPUModelLoader(load_config=vllm_config.load_config)
diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py
index 665cf8cd2629efdd1b97236330451fc732b15d6d..c6634395bb167724f61705f3f8738c9385e767ef 100644
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
@@ -8,11 +8,7 @@ import torch_xla
 
 from vllm.platforms import current_platform
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
-
-# isort: off
-from vllm.v1.sample.tpu.sampler import (apply_top_k_top_p as
-                                        apply_top_k_top_p_tpu)
-# isort: on
+from vllm.v1.sample.tpu.sampler import apply_top_k_top_p as apply_top_k_top_p_tpu
 
 if not current_platform.is_tpu():
     pytest.skip("This test needs a TPU.", allow_module_level=True)
@@ -30,11 +26,10 @@ def test_topk_equivalence_to_native_impl():
         logits = torch.rand((BATCH_SIZE, VOCAB_SIZE))
 
         # Random top-k values between 1 and 10.
-        k = torch.randint(1, 10, (BATCH_SIZE, ))
+        k = torch.randint(1, 10, (BATCH_SIZE,))
 
         # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
-        k.masked_fill_(torch.randint(0, 2, (BATCH_SIZE, ), dtype=bool),
-                       VOCAB_SIZE)
+        k.masked_fill_(torch.randint(0, 2, (BATCH_SIZE,), dtype=bool), VOCAB_SIZE)
 
         result_tpu = apply_top_k_top_p_tpu(logits=logits.clone(), k=k, p=None)
 
@@ -50,15 +45,13 @@ def test_topp_result_sums_past_p():
         probs = logits.softmax(dim=-1)
 
         # Random top-p values between 0 and 1.
-        p = torch.rand((BATCH_SIZE, ))
+        p = torch.rand((BATCH_SIZE,))
 
         # Set p=1 for ~50% of requests in the batch (top-p disabled).
-        p.masked_fill_(torch.randint(0, 2, (BATCH_SIZE, ), dtype=bool), 1)
+        p.masked_fill_(torch.randint(0, 2, (BATCH_SIZE,), dtype=bool), 1)
 
         no_op_k = torch.tensor([VOCAB_SIZE])
-        logits_masked = apply_top_k_top_p_tpu(logits=logits.clone(),
-                                              k=no_op_k,
-                                              p=p)
+        logits_masked = apply_top_k_top_p_tpu(logits=logits.clone(), k=no_op_k, p=p)
 
         # Verify that the masked logit's probability sums to at least p.
         probs.masked_fill_(logits_masked.isinf(), 0)
@@ -72,16 +65,16 @@ def test_topp_result_sums_past_p():
 
 def test_topp_basic():
     with torch.device(xm.xla_device()):
-        logits = torch.tensor([[math.log(0.2),
-                                math.log(0.3),
-                                math.log(0.5)],
-                               [math.log(0.5),
-                                math.log(0.1),
-                                math.log(0.4)]])
+        logits = torch.tensor(
+            [
+                [math.log(0.2), math.log(0.3), math.log(0.5)],
+                [math.log(0.5), math.log(0.1), math.log(0.4)],
+            ]
+        )
 
-        result = apply_top_k_top_p_tpu(logits=logits.clone(),
-                                       k=torch.tensor([3, 3]),
-                                       p=torch.tensor([0.79, 0.79]))
+        result = apply_top_k_top_p_tpu(
+            logits=logits.clone(), k=torch.tensor([3, 3]), p=torch.tensor([0.79, 0.79])
+        )
 
         torch_xla.sync()
 
@@ -94,16 +87,16 @@ def test_topp_basic():
 
 def test_topp_select_all():
     with torch.device(xm.xla_device()):
-        logits = torch.tensor([[math.log(0.2),
-                                math.log(0.3),
-                                math.log(0.5)],
-                               [math.log(0.5),
-                                math.log(0.1),
-                                math.log(0.4)]])
+        logits = torch.tensor(
+            [
+                [math.log(0.2), math.log(0.3), math.log(0.5)],
+                [math.log(0.5), math.log(0.1), math.log(0.4)],
+            ]
+        )
 
-        result = apply_top_k_top_p_tpu(logits=logits.clone(),
-                                       k=torch.tensor([3, 3]),
-                                       p=torch.tensor([1.0, 1.0]))
+        result = apply_top_k_top_p_tpu(
+            logits=logits.clone(), k=torch.tensor([3, 3]), p=torch.tensor([1.0, 1.0])
+        )
 
         torch_xla.sync()
 
@@ -114,14 +107,12 @@ def test_topp_with_ties():
     with torch.device(xm.xla_device()):
         # Input has multiple math.log(0.3).
         logits = torch.tensor(
-            [[math.log(0.3),
-              math.log(0.3),
-              math.log(0.3),
-              math.log(0.1)]])
+            [[math.log(0.3), math.log(0.3), math.log(0.3), math.log(0.1)]]
+        )
 
-        result = apply_top_k_top_p_tpu(logits=logits.clone(),
-                                       k=torch.tensor([4]),
-                                       p=torch.tensor([0.2]))
+        result = apply_top_k_top_p_tpu(
+            logits=logits.clone(), k=torch.tensor([4]), p=torch.tensor([0.2])
+        )
 
         torch_xla.sync()
 
@@ -135,17 +126,17 @@ def test_topp_with_ties():
 
 def test_both_topk_topp():
     with torch.device(xm.xla_device()):
-        logits = torch.tensor([[math.log(0.2),
-                                math.log(0.3),
-                                math.log(0.5)],
-                               [math.log(0.5),
-                                math.log(0.1),
-                                math.log(0.4)]])
+        logits = torch.tensor(
+            [
+                [math.log(0.2), math.log(0.3), math.log(0.5)],
+                [math.log(0.5), math.log(0.1), math.log(0.4)],
+            ]
+        )
 
         # Set k=1 for the first batch.
-        result = apply_top_k_top_p_tpu(logits=logits.clone(),
-                                       k=torch.tensor([1, 3]),
-                                       p=torch.tensor([0.79, 0.79]))
+        result = apply_top_k_top_p_tpu(
+            logits=logits.clone(), k=torch.tensor([1, 3]), p=torch.tensor([0.79, 0.79])
+        )
 
         torch_xla.sync()
 
diff --git a/tests/v1/tpu/test_tpu_int8.py b/tests/v1/tpu/test_tpu_int8.py
index f39a8021a29ef2cf73931436ebc51962783669e7..50001567a9588fefd1e93733cd720a6ef66c26e5 100644
--- a/tests/v1/tpu/test_tpu_int8.py
+++ b/tests/v1/tpu/test_tpu_int8.py
@@ -4,11 +4,11 @@
 
 Run `pytest tests/quantization/test_tpu_int8.py`.
 """
+
 import pytest
 
 from vllm.model_executor.layers.linear import LinearBase
-from vllm.model_executor.layers.quantization.tpu_int8 import (
-    TPUInt8LinearMethod)
+from vllm.model_executor.layers.quantization.tpu_int8 import TPUInt8LinearMethod
 from vllm.platforms import current_platform
 
 from ...models.registry import HF_EXAMPLE_MODELS
@@ -16,8 +16,9 @@ from ...models.registry import HF_EXAMPLE_MODELS
 MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
 
 
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="TPU Int8 is only enabled for TPUs.")
+@pytest.mark.skipif(
+    not current_platform.is_tpu(), reason="TPU Int8 is only enabled for TPUs."
+)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [10])
@@ -26,20 +27,28 @@ MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
     [
         # w8a8 dynamic activation
         {
-            'quantization_config': {
-                'quant_method': 'tpu_int8',
-                'activation_scheme': 'dynamic'
+            "quantization_config": {
+                "quant_method": "tpu_int8",
+                "activation_scheme": "dynamic",
             }
         }
-    ])
-def test_model_tpu_int8(vllm_runner, model: str, dtype: str, max_tokens: int,
-                        hf_overrides: dict, monkeypatch) -> None:
+    ],
+)
+def test_model_tpu_int8(
+    vllm_runner,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    hf_overrides: dict,
+    monkeypatch,
+) -> None:
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_transformers_version(on_fail="skip")
 
-    activation_scheme = hf_overrides.get('quantization_config',
-                                         {}).get('activation_scheme')
-    quantize_activation = activation_scheme == 'dynamic'
+    activation_scheme = hf_overrides.get("quantization_config", {}).get(
+        "activation_scheme"
+    )
+    quantize_activation = activation_scheme == "dynamic"
 
     # Allows using apply_model
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
diff --git a/tests/v1/tpu/test_tpu_qkv_linear.py b/tests/v1/tpu/test_tpu_qkv_linear.py
index 46fa1193881fa65b924c53599247ac60c73e7115..098d925505424662e17daf7edcff60f37b4b0bc0 100644
--- a/tests/v1/tpu/test_tpu_qkv_linear.py
+++ b/tests/v1/tpu/test_tpu_qkv_linear.py
@@ -9,8 +9,10 @@ import torch_xla.distributed.spmd as xs
 import torch_xla.runtime as xr
 
 from vllm.config import set_current_vllm_config
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             init_distributed_environment)
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized,
+    init_distributed_environment,
+)
 from vllm.distributed.tpu_distributed_utils import XlaQKVParallelLinear
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.layers.linear import QKVParallelLinear
@@ -36,7 +38,8 @@ def setup_environment():
             0,
             local_rank=0,
             distributed_init_method=f"file://{temp_file}",
-            backend="gloo")
+            backend="gloo",
+        )
         ensure_model_parallel_initialized(1, 1)
         yield
 
@@ -51,7 +54,7 @@ def _get_spmd_mesh():
         num_devices = xr.global_runtime_device_count()
         mesh_shape = (num_devices, 1)
         device_ids = np.array(range(num_devices))
-        MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+        MESH = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
     return MESH
 
 
@@ -59,7 +62,7 @@ def _get_spmd_mesh():
 # `xr.use_spmd()` will set a global state, and this state is not reversible.
 # Therefore, non-SPMD tests should be run before SPMD tests.
 @pytest.mark.parametrize("mesh", [None, _get_spmd_mesh()])
-@pytest.mark.parametrize("device", ['cpu', 'xla'])
+@pytest.mark.parametrize("device", ["cpu", "xla"])
 @torch.no_grad()
 def test_xla_qkv_linear(bias, mesh, device):
     torch.manual_seed(123)
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 4f4a9c7db88a3359f3f9411f367de4f967d89ebe..7b3a07b4e12a589f9f1cc4499dff1d24efc77478 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -4,18 +4,25 @@
 import pytest
 
 from vllm.attention.layer import Attention
-from vllm.config import (CacheConfig, ModelConfig, SchedulerConfig, VllmConfig,
-                         set_current_vllm_config)
+from vllm.config import (
+    CacheConfig,
+    ModelConfig,
+    SchedulerConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-from vllm.utils import GiB_bytes
-from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
-                                         get_kv_cache_configs)
-from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
-                                       SchedulerOutput)
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
+from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 from vllm.v1.worker.tpu_model_runner import (
-    TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
-    _get_padded_token_len, _get_req_paddings, _get_token_paddings)
+    TPUModelRunner,
+    _get_padded_num_reqs_with_upper_limit,
+    _get_padded_token_len,
+    _get_req_paddings,
+    _get_token_paddings,
+)
 
 
 def get_vllm_config():
@@ -67,10 +74,11 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_features=[],
                 sampling_params=SamplingParams(),
                 pooling_params=PoolingParams(),
-                block_ids=([0], ),  # block_ids should be tuple[list[int]]
+                block_ids=([0],),  # block_ids should be tuple[list[int]]
                 num_computed_tokens=0,
                 lora_request=None,
-            ))
+            )
+        )
         num_scheduled_tokens[req_id] = 3
         total_num_scheduled_tokens += num_scheduled_tokens[req_id]
 
@@ -81,11 +89,9 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
         total_num_scheduled_tokens=total_num_scheduled_tokens,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
 
@@ -99,7 +105,7 @@ def _is_req_added(model_runner, req_id: str) -> bool:
 
 def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
     """Check if the request state block IDs match the block table.
-    
+
     This function handles both legacy BlockTable and new MultiGroupBlockTable
     structures for backward compatibility.
     """
@@ -160,11 +166,9 @@ def test_update_states_request_finished(model_runner):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids={req_id},
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -190,11 +194,9 @@ def test_update_states_request_resumed(model_runner):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -204,10 +206,12 @@ def test_update_states_request_resumed(model_runner):
     # resume req
     cached_req_data = CachedRequestData(
         req_ids=[req_id],
-        resumed_from_preemption=[False],
+        resumed_req_ids={req_id},
         new_token_ids=[[]],
-        new_block_ids=[([], )],
+        all_token_ids={req_id: scheduler_output.scheduled_new_reqs[0].prompt_token_ids},
+        new_block_ids=[([],)],
         num_computed_tokens=[0],
+        num_output_tokens=[0],
     )
 
     scheduler_output = SchedulerOutput(
@@ -217,11 +221,9 @@ def test_update_states_request_resumed(model_runner):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -248,11 +250,9 @@ def test_update_states_no_changes(model_runner):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -283,11 +283,9 @@ def test_update_states_request_unscheduled(model_runner):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -303,27 +301,23 @@ def test_get_paddings():
     # Bucketed padding
     min_token_size, max_token_size, padding_gap = 16, 512, 64
     expected_paddings = [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]
-    actual_paddings = _get_token_paddings(min_token_size, max_token_size,
-                                          padding_gap)
+    actual_paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
 
     # Bucketed padding with max_token_size not a power of two.
     max_token_size = 317
     expected_paddings = [16, 32, 64, 128, 192, 256, 320]
-    actual_paddings = _get_token_paddings(min_token_size, max_token_size,
-                                          padding_gap)
+    actual_paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
     assert actual_paddings == expected_paddings
 
     # Exponential padding.
     max_token_size, padding_gap = 1024, 0
     expected_paddings = [16, 32, 64, 128, 256, 512, 1024]
-    actual_paddings = _get_token_paddings(min_token_size, max_token_size,
-                                          padding_gap)
+    actual_paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
     assert actual_paddings == expected_paddings
     # Exponential padding with max_token_size not a power of two.
     max_token_size = 317
     expected_paddings = [16, 32, 64, 128, 256, 512]
-    actual_paddings = _get_token_paddings(min_token_size, max_token_size,
-                                          padding_gap)
+    actual_paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
     assert actual_paddings == expected_paddings
 
 
@@ -350,32 +344,31 @@ def test_get_req_paddings():
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
 
 
-def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
-        model_runner):
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
     vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-        set_current_vllm_config(vllm_config):
+    with (
+        pytest.raises(ValueError, match=error_msg),
+        set_current_vllm_config(vllm_config),
+    ):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_0,
                 kv_sharing_target_layer_name=layer_1,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_1,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -387,25 +380,25 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
     invalid_layer = "model.layers.0.cross_attn.attn"
     error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
     vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-        set_current_vllm_config(vllm_config):
+    with (
+        pytest.raises(ValueError, match=error_msg),
+        set_current_vllm_config(vllm_config),
+    ):
         fwd_context = {
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_1,
                 # invalid layer: cross_attn.atn doesn't exist!
                 kv_sharing_target_layer_name=invalid_layer,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -416,26 +409,26 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
     vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-        set_current_vllm_config(vllm_config):
+    with (
+        pytest.raises(ValueError, match=error_msg),
+        set_current_vllm_config(vllm_config),
+    ):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_1,
                 kv_sharing_target_layer_name=layer_1,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -447,20 +440,18 @@ def test_init_kv_cache_without_kv_sharing():
     vllm_config = get_vllm_config()
     with set_current_vllm_config(vllm_config):
         fwd_context = {
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_1,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -475,17 +466,17 @@ def test_init_kv_cache_without_kv_sharing():
     available_memory = 20 * GiB_bytes
     # page size for each layer KV can be calculated as
     # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
-    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
+    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
     num_expected_blocks = 20480  # 20GB / 512KB / 2 (num layers)
-    kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
-                                           [available_memory])[0]
+    kv_cache_config = get_kv_cache_configs(
+        vllm_config, [kv_cache_spec], [available_memory]
+    )[0]
     assert kv_cache_config.num_blocks == num_expected_blocks
     assert len(kv_cache_config.kv_cache_tensors) == 2
     assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
     assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
 
-    max_context_len =\
-        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    max_context_len = estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
     # max_context_len = available_memory / (page_size / block_size) / num_caches
     # max_context_len = 5GB / (512KB / 128) / 2 = 655360
@@ -495,8 +486,9 @@ def test_init_kv_cache_without_kv_sharing():
     # this will only allocate 2 block worth of memory (2 * 512kb)
     kv_cache_config.num_blocks = 1
     for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-        kv_cache_tensor.size = (
-            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
+        kv_cache_tensor.size = kv_cache_spec[
+            kv_cache_tensor.shared_by[0]
+        ].page_size_bytes
 
     model_runner.initialize_kv_cache(kv_cache_config)
 
@@ -518,21 +510,19 @@ def test_init_kv_cache_with_kv_sharing_valid():
     vllm_config = get_vllm_config()
     with set_current_vllm_config(vllm_config):
         fwd_context = {
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=128,
                 scale=1.0,
                 prefix=layer_1,
                 kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -550,24 +540,23 @@ def test_init_kv_cache_with_kv_sharing_valid():
     # with KV sharing, we can allocate (available_mem//page_size//1) blocks
     # which is twice as many as without KV sharing
     num_expected_blocks = 2 * 20480  # 20GB / 512KB
-    kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
-                                           [available_memory])[0]
+    kv_cache_config = get_kv_cache_configs(
+        vllm_config, [kv_cache_spec], [available_memory]
+    )[0]
     assert kv_cache_config.num_blocks == num_expected_blocks
     assert len(kv_cache_config.kv_cache_tensors) == 1
     # Each layer now has twice the available memory for KV cache
     # compared to no KV sharing
     assert kv_cache_config.kv_cache_tensors[0].size == available_memory
 
-    max_context_len =\
-        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    max_context_len = estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
     assert max_context_len == (2 * 655360)
 
     # important: override tensor size to prevent large mem alloc during test
     # this will only allocate 1 block worth of memory (512kb)
     kv_cache_config.num_blocks = 1
-    kv_cache_config.kv_cache_tensors[0].size =\
-        kv_cache_spec[layer_0].page_size_bytes
+    kv_cache_config.kv_cache_tensors[0].size = kv_cache_spec[layer_0].page_size_bytes
 
     model_runner.initialize_kv_cache(kv_cache_config)
 
diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
index da8655f95e195b80747f0456448dca15f03f3b0f..11d9d18ead7db6a2df80a703d74edbeda69aac22 100644
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 # type: ignore
-from __future__ import annotations
-
 import threading
 from collections.abc import Iterable
 from concurrent import futures
@@ -12,20 +10,23 @@ from typing import Callable, Generator, Literal
 import grpc
 import pytest
 from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
-    ExportTraceServiceResponse)
+    ExportTraceServiceResponse,
+)
 from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
-    TraceServiceServicer, add_TraceServiceServicer_to_server)
+    TraceServiceServicer,
+    add_TraceServiceServicer_to_server,
+)
 from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
-from opentelemetry.sdk.environment_variables import (
-    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+from opentelemetry.sdk.environment_variables import OTEL_EXPORTER_OTLP_TRACES_INSECURE
 
 from vllm import LLM, SamplingParams
 from vllm.tracing import SpanAttributes
 
 FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
 
-FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
-                    'array_value']
+FieldName = Literal[
+    "bool_value", "string_value", "int_value", "double_value", "array_value"
+]
 
 
 def decode_value(value: AnyValue):
@@ -34,8 +35,9 @@ def decode_value(value: AnyValue):
         "string_value": (lambda v: v.string_value),
         "int_value": (lambda v: v.int_value),
         "double_value": (lambda v: v.double_value),
-        "array_value":
-        (lambda v: [decode_value(item) for item in v.array_value.values]),
+        "array_value": (
+            lambda v: [decode_value(item) for item in v.array_value.values]
+        ),
     }
     for field, decoder in field_decoders.items():
         if value.HasField(field):
@@ -48,7 +50,6 @@ def decode_attributes(attributes: Iterable[KeyValue]):
 
 
 class FakeTraceService(TraceServiceServicer):
-
     def __init__(self):
         self.request = None
         self.evt = threading.Event()
@@ -79,17 +80,19 @@ def test_traces(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
             max_tokens=256,
         )
         model = "facebook/opt-125m"
-        llm = LLM(model=model,
-                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-                  gpu_memory_utilization=0.3,
-                  disable_log_stats=False)
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+            gpu_memory_utilization=0.3,
+            disable_log_stats=False,
+        )
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
         print(f"test_traces outputs is : {outputs}")
@@ -98,40 +101,48 @@ def test_traces(
         if not trace_service.evt.wait(timeout):
             raise TimeoutError(
                 f"The fake trace service didn't receive a trace within "
-                f"the {timeout} seconds timeout")
+                f"the {timeout} seconds timeout"
+            )
 
         request = trace_service.request
         assert len(request.resource_spans) == 1, (
-            f"Expected 1 resource span, "
-            f"but got {len(request.resource_spans)}")
+            f"Expected 1 resource span, but got {len(request.resource_spans)}"
+        )
         assert len(request.resource_spans[0].scope_spans) == 1, (
             f"Expected 1 scope span, "
-            f"but got {len(request.resource_spans[0].scope_spans)}")
+            f"but got {len(request.resource_spans[0].scope_spans)}"
+        )
         assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
             f"Expected 1 span, "
-            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}"
+        )
 
         attributes = decode_attributes(
-            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+            request.resource_spans[0].scope_spans[0].spans[0].attributes
+        )
         # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
-                              ) == sampling_params.temperature
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
-                              ) == sampling_params.max_tokens
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
-        assert attributes.get(
-            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
-                outputs[0].prompt_token_ids)
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert (
+            attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE)
+            == sampling_params.temperature
+        )
+        assert (
+            attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        )
+        assert (
+            attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS)
+            == sampling_params.max_tokens
+        )
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+            outputs[0].prompt_token_ids
+        )
         completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
-        assert attributes.get(
-            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        assert (
+            attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS)
+            == completion_tokens
+        )
 
         assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
-        assert attributes.get(
-            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
         assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
diff --git a/tests/v1/test_utils.py b/tests/v1/utils.py
similarity index 56%
rename from tests/v1/test_utils.py
rename to tests/v1/utils.py
index 00d98a873a310b1a8cac2fa5a2f8b774a7471e0f..993ad8a947d030bb2c9cfe3984bc0368fdec63e3 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/utils.py
@@ -1,79 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import pytest
 import regex as re
 import requests
-import torch
 
 from tests.utils import RemoteOpenAIServer
-from vllm.v1.worker.utils import bind_kv_cache
-
-
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'layers.0.self_attn': torch.zeros((1, )),
-        'layers.1.self_attn': torch.zeros((1, )),
-        'layers.2.self_attn': torch.zeros((1, )),
-        'layers.3.self_attn': torch.zeros((1, )),
-    }
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
-        'layers.0.self_attn']
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
-        'layers.1.self_attn']
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
-        'layers.2.self_attn']
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
-        'layers.3.self_attn']
-
-    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
-    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
-    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
-    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'model.layers.20.attn': torch.zeros((1, )),
-        'model.layers.28.attn': torch.zeros((1, )),
-    }
-
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
-        'model.layers.20.attn']
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
-        'model.layers.28.attn']
-
-    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
-    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
-
 
 # Prometheus metrics utilities for testing
 
 
-def get_prometheus_metrics(
-        server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
+def get_prometheus_metrics(server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
     """Fetch and parse Prometheus metrics from the /metrics endpoint.
-    
+
     Returns:
         Dict mapping metric names to their values grouped by labels.
         For example: {"vllm:request_success": {
@@ -88,14 +26,14 @@ def get_prometheus_metrics(
 
         # Regex patterns for Prometheus metrics
         metric_with_labels = re.compile(
-            r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\{([^}]*)\}\s+([\d\.\-\+e]+)$')
-        metric_simple = re.compile(
-            r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+([\d\.\-\+e]+)$')
+            r"^([a-zA-Z_:][a-zA-Z0-9_:]*)\{([^}]*)\}\s+([\d\.\-\+e]+)$"
+        )
+        metric_simple = re.compile(r"^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+([\d\.\-\+e]+)$")
 
-        for line in response.text.split('\n'):
+        for line in response.text.split("\n"):
             line = line.strip()
             # Skip comments and empty lines
-            if not line or line.startswith('#'):
+            if not line or line.startswith("#"):
                 continue
 
             # Try to match metric with labels first
@@ -106,7 +44,7 @@ def get_prometheus_metrics(
                     value = float(value_str)
                     if metric_name not in metrics:
                         metrics[metric_name] = {}
-                    metrics[metric_name][f'{{{labels_part}}}'] = value
+                    metrics[metric_name][f"{{{labels_part}}}"] = value
                 except ValueError:
                     continue
             else:
@@ -118,7 +56,7 @@ def get_prometheus_metrics(
                         value = float(value_str)
                         if metric_name not in metrics:
                             metrics[metric_name] = {}
-                        metrics[metric_name][''] = value
+                        metrics[metric_name][""] = value
                     except ValueError:
                         continue
 
@@ -128,10 +66,9 @@ def get_prometheus_metrics(
         return {}
 
 
-def get_engine_request_counts(
-        metrics: dict[str, dict[str, float]]) -> dict[str, float]:
+def get_engine_request_counts(metrics: dict[str, dict[str, float]]) -> dict[str, float]:
     """Extract request counts per engine from Prometheus metrics.
-    
+
     Returns:
         Dict mapping engine indices to request counts.
         For example: {"0": 15.0, "1": 12.0}
@@ -156,7 +93,7 @@ def get_engine_request_counts(
 
 def check_request_balancing(server: RemoteOpenAIServer, dp_size: int):
     """Check request balancing via Prometheus metrics if dp_size > 1.
-    
+
     Args:
         server: The RemoteOpenAIServer instance
         dp_size: Number of data parallel ranks
@@ -175,7 +112,8 @@ def check_request_balancing(server: RemoteOpenAIServer, dp_size: int):
     assert len(engines_with_requests) == dp_size, (
         f"Expected requests to be distributed across multiple engines,"
         f" but only engine(s) {engines_with_requests} received "
-        f"requests. Engine counts: {engine_counts}")
+        f"requests. Engine counts: {engine_counts}"
+    )
 
     # Verify that the load is reasonably balanced
     # (no engine should handle all requests)
@@ -183,4 +121,5 @@ def check_request_balancing(server: RemoteOpenAIServer, dp_size: int):
 
     for count in engine_counts.values():
         assert count > total_requests // (dp_size + 1), (
-            f"requests are imbalanced: {engine_counts}")
+            f"requests are imbalanced: {engine_counts}"
+        )
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 98700ff73fd1fc3a1a32cbbbd7a16e2a0086b494..6ea65c6944b05e9a09984eaffc51b9e7ac26a5f9 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -3,7 +3,6 @@
 
 import inspect
 from collections.abc import Sequence
-from typing import Optional
 
 import numpy as np
 import pytest
@@ -11,7 +10,8 @@ import torch
 
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -29,14 +29,11 @@ CUDA_DEVICES = [
 MAX_NUM_PROMPT_TOKENS = 64
 
 
-def _compare_objs(obj1,
-                  obj2,
-                  skip: Sequence = ("logitsprocs", "batch_update_builder")):
+def _compare_objs(obj1, obj2, skip: Sequence = ("logitsprocs", "batch_update_builder")):
     attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a)))
-    attr_names = set([
-        a[0] for a in attrs
-        if not (a[0].startswith('__') and a[0].endswith('__'))
-    ])
+    attr_names = set(
+        [a[0] for a in attrs if not (a[0].startswith("__") and a[0].endswith("__"))]
+    )
     for attr_name in attr_names:
         if attr_name in skip:
             continue
@@ -47,7 +44,7 @@ def _compare_objs(obj1,
         is_same = False
         if isinstance(a, torch.Tensor):
             if a.numel() == 0 or b.numel() == 0:
-                is_same = (a.numel() == 0 and b.numel() == 0)
+                is_same = a.numel() == 0 and b.numel() == 0
             elif torch.allclose(a, b):
                 is_same = True
         elif isinstance(a, np.ndarray):
@@ -64,12 +61,14 @@ def _compare_objs(obj1,
             is_same = True
         elif isinstance(a, CpuGpuBuffer):
             is_same = np.allclose(a.np, b.np) and torch.allclose(a.gpu, b.gpu)
-        assert is_same, f"Attribute {attr_name} is different"\
-            f" in {obj1} and {obj2}: {a} != {b}"
+        assert is_same, (
+            f"Attribute {attr_name} is different in {obj1} and {obj2}: {a} != {b}"
+        )
 
 
-def _remove_requests(input_batch: InputBatch, batch_size: int,
-                     reqs: list[CachedRequestState]) -> set[str]:
+def _remove_requests(
+    input_batch: InputBatch, batch_size: int, reqs: list[CachedRequestState]
+) -> set[str]:
     """
     Remove some requests randomly from the batch and returns
     set of request removed
@@ -109,10 +108,9 @@ def _construct_expected_sampling_metadata(
     temperature = [0.0 for _ in range(num_reqs)]
     min_tokens = {}
     logit_bias = [None] * num_reqs
-    allowed_token_ids_mask = torch.zeros(num_reqs,
-                                         VOCAB_SIZE,
-                                         dtype=torch.bool,
-                                         device=device)
+    allowed_token_ids_mask = torch.zeros(
+        num_reqs, VOCAB_SIZE, dtype=torch.bool, device=device
+    )
     bad_words_token_ids = {}
     for req in reqs:
         if req.req_id not in req_ids_retained:
@@ -120,35 +118,40 @@ def _construct_expected_sampling_metadata(
         index_in_input_batch = req_id_index_in_input_batch[req.req_id]
         output_token_ids[index_in_input_batch] = req.output_token_ids
         prompt_token_ids[index_in_input_batch] = req.prompt_token_ids
-        presence_penalties[
-            index_in_input_batch] = req.sampling_params.presence_penalty
+        presence_penalties[index_in_input_batch] = req.sampling_params.presence_penalty
         frequency_penalties[index_in_input_batch] = (
-            req.sampling_params.frequency_penalty)
+            req.sampling_params.frequency_penalty
+        )
         repetition_penalties[index_in_input_batch] = (
-            req.sampling_params.repetition_penalty)
+            req.sampling_params.repetition_penalty
+        )
         top_k[index_in_input_batch] = req.sampling_params.top_k
         top_p[index_in_input_batch] = req.sampling_params.top_p
         temperature[index_in_input_batch] = req.sampling_params.temperature
         min_tokens[index_in_input_batch] = (
             req.sampling_params.min_tokens,
-            req.sampling_params.all_stop_token_ids)
+            req.sampling_params.all_stop_token_ids,
+        )
         logit_bias[index_in_input_batch] = req.sampling_params.logit_bias
         if req.sampling_params.allowed_token_ids:
             allowed_token_ids_mask[index_in_input_batch][
-                req.sampling_params.allowed_token_ids] = True
+                req.sampling_params.allowed_token_ids
+            ] = True
         if req.sampling_params.bad_words_token_ids:
-            bad_words_token_ids[
-                index_in_input_batch] = req.sampling_params.bad_words_token_ids
+            bad_words_token_ids[index_in_input_batch] = (
+                req.sampling_params.bad_words_token_ids
+            )
 
     return SamplingMetadata(
-        temperature=torch.tensor(temperature, dtype=torch.float,
-                                 device=device),
+        temperature=torch.tensor(temperature, dtype=torch.float, device=device),
         all_greedy=False,
         all_random=True,
-        top_p=None if all(x == 1.0 for x in top_p) else torch.tensor(
-            top_p, dtype=torch.float, device=device),
-        top_k=None if all(x == 0 for x in top_k) else torch.tensor(
-            top_k, dtype=torch.int, device=device),
+        top_p=None
+        if all(x == 1.0 for x in top_p)
+        else torch.tensor(top_p, dtype=torch.float, device=device),
+        top_k=None
+        if all(x == 0 for x in top_k)
+        else torch.tensor(top_k, dtype=torch.int, device=device),
         generators={},
         max_num_logprobs=0,
         prompt_token_ids=make_tensor_with_pad(
@@ -157,19 +160,22 @@ def _construct_expected_sampling_metadata(
             device=torch.device(device),
             dtype=torch.int64,
         ),
-        frequency_penalties=torch.tensor(frequency_penalties,
-                                         dtype=torch.float,
-                                         device=device),
-        presence_penalties=torch.tensor(presence_penalties,
-                                        dtype=torch.float,
-                                        device=device),
-        repetition_penalties=torch.tensor(repetition_penalties,
-                                          dtype=torch.float,
-                                          device=device),
+        frequency_penalties=torch.tensor(
+            frequency_penalties, dtype=torch.float, device=device
+        ),
+        presence_penalties=torch.tensor(
+            presence_penalties, dtype=torch.float, device=device
+        ),
+        repetition_penalties=torch.tensor(
+            repetition_penalties, dtype=torch.float, device=device
+        ),
         output_token_ids=output_token_ids,
-        no_penalties=(all(x == 0 for x in presence_penalties)
-                      and all(x == 0 for x in frequency_penalties)
-                      and all(x == 1 for x in repetition_penalties)),
+        spec_token_ids=[[] for _ in range(len(output_token_ids))],
+        no_penalties=(
+            all(x == 0 for x in presence_penalties)
+            and all(x == 0 for x in frequency_penalties)
+            and all(x == 1 for x in repetition_penalties)
+        ),
         allowed_token_ids_mask=allowed_token_ids_mask,
         bad_words_token_ids=bad_words_token_ids,
         logitsprocs=LogitsProcessors(),
@@ -185,8 +191,7 @@ def _create_sampling_params():
         frequency_penalty=np.random.uniform(-2.0, 2.0),
         min_tokens=np.random.randint(1, 10),
         stop_token_ids=[
-            np.random.randint(0, VOCAB_SIZE)
-            for _ in range(np.random.randint(10))
+            np.random.randint(0, VOCAB_SIZE) for _ in range(np.random.randint(10))
         ],
         logit_bias={0: np.random.uniform(-3.0, 3.0)},
     )
@@ -207,7 +212,7 @@ def _construct_cached_request_state(req_id_suffix: int):
         sampling_params=_create_sampling_params(),
         pooling_params=None,
         mm_features=[],
-        block_ids=([], ),
+        block_ids=([],),
         generator=None,
         num_computed_tokens=len(output_token_ids),
         output_token_ids=output_token_ids,
@@ -236,6 +241,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
         block_sizes=[1],
+        kernel_block_sizes=[1],
     )
     reqs: list[CachedRequestState] = []
     req_id_reqs = {}
@@ -262,19 +268,18 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
 
     # Create expected output.
     expected_sampling_metadata = _construct_expected_sampling_metadata(
-        reqs,
-        req_ids_retained,
-        input_batch.req_id_to_index,
-        device=torch.device(device))
+        reqs, req_ids_retained, input_batch.req_id_to_index, device=torch.device(device)
+    )
 
-    def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool:
-        return (t1 is None
-                and t2 is None) or (t1 is not None and t2 is not None
-                                    and torch.allclose(t1, t2))
+    def same(t1: torch.Tensor | None, t2: torch.Tensor | None) -> bool:
+        return (t1 is None and t2 is None) or (
+            t1 is not None and t2 is not None and torch.allclose(t1, t2)
+        )
 
     # Assert the actual and expected output.
-    assert torch.allclose(expected_sampling_metadata.temperature,
-                          sampling_metadata.temperature)
+    assert torch.allclose(
+        expected_sampling_metadata.temperature, sampling_metadata.temperature
+    )
     assert same(expected_sampling_metadata.top_p, sampling_metadata.top_p)
     assert same(expected_sampling_metadata.top_k, sampling_metadata.top_k)
     assert torch.allclose(
@@ -289,25 +294,29 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
         expected_sampling_metadata.repetition_penalties,
         sampling_metadata.repetition_penalties,
     )
-    assert torch.allclose(expected_sampling_metadata.prompt_token_ids,
-                          sampling_metadata.prompt_token_ids)
-    assert (expected_sampling_metadata.output_token_ids ==
-            sampling_metadata.output_token_ids)
-    assert expected_sampling_metadata.no_penalties == \
-           sampling_metadata.no_penalties
+    assert torch.allclose(
+        expected_sampling_metadata.prompt_token_ids, sampling_metadata.prompt_token_ids
+    )
+    assert (
+        expected_sampling_metadata.output_token_ids
+        == sampling_metadata.output_token_ids
+    )
+    assert expected_sampling_metadata.no_penalties == sampling_metadata.no_penalties
     if sampling_metadata.allowed_token_ids_mask:
         assert torch.allclose(
             expected_sampling_metadata.allowed_token_ids_mask,
-            sampling_metadata.allowed_token_ids_mask)
-    assert expected_sampling_metadata.bad_words_token_ids == \
-        sampling_metadata.bad_words_token_ids
+            sampling_metadata.allowed_token_ids_mask,
+        )
+    assert (
+        expected_sampling_metadata.bad_words_token_ids
+        == sampling_metadata.bad_words_token_ids
+    )
 
 
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("batch_size", [32])
-@pytest.mark.parametrize("swap_list", [((0, 1), )])
-def test_swap_states_in_input_batch(device: str, batch_size: int,
-                                    swap_list: list):
+@pytest.mark.parametrize("swap_list", [((0, 1),)])
+def test_swap_states_in_input_batch(device: str, batch_size: int, swap_list: list):
     """
     Tests the logic for managing sampling metadata in the InputBatch.
 
@@ -327,6 +336,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
         block_sizes=[1],
+        kernel_block_sizes=[1],
     )
     ref_input_batch: InputBatch = InputBatch(
         max_num_reqs=batch_size,
@@ -336,6 +346,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
         block_sizes=[1],
+        kernel_block_sizes=[1],
     )
 
     reqs: list[CachedRequestState] = []
@@ -352,8 +363,10 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
 
     reordered_reqs = reqs.copy()
     for swap_pair in swap_list:
-        reordered_reqs[swap_pair[0]], reordered_reqs[swap_pair[1]] = \
-            reordered_reqs[swap_pair[1]], reordered_reqs[swap_pair[0]]
+        reordered_reqs[swap_pair[0]], reordered_reqs[swap_pair[1]] = (
+            reordered_reqs[swap_pair[1]],
+            reordered_reqs[swap_pair[0]],
+        )
         input_batch.swap_states(swap_pair[0], swap_pair[1])
 
     for req_index in range(batch_size):
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 49a7a61e18890a4dc05996dc91bf9b141338e8fe..b95c8df3469b3c305b19e329dcf7ccb33d41a485 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -6,23 +6,36 @@ import pytest
 import torch
 
 from vllm.attention import Attention
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VllmConfig, set_current_vllm_config)
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.attention.backends.abstract import MultipleOf
+from vllm.config import (
+    CacheConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import GiB_bytes, update_environment_variables
-from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
-                                         get_kv_cache_configs)
-from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
-                                       SchedulerOutput)
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheTensor)
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.system_utils import update_environment_variables
+from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
+from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+    KVCacheTensor,
+)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+from vllm.v1.worker.utils import AttentionGroup
 
 BLOCK_SIZE = 16
 NUM_BLOCKS = 10
@@ -35,8 +48,7 @@ def initialize_kv_cache(runner: GPUModelRunner):
     """
     attn_spec = FullAttentionSpec(
         block_size=BLOCK_SIZE,
-        num_kv_heads=runner.model_config.get_num_kv_heads(
-            runner.parallel_config),
+        num_kv_heads=runner.model_config.get_num_kv_heads(runner.parallel_config),
         head_size=runner.model_config.get_head_size(),
         dtype=runner.kv_cache_dtype,
     )
@@ -58,7 +70,8 @@ def initialize_kv_cache(runner: GPUModelRunner):
         device=runner.device,
         pin_memory=runner.pin_memory,
         vocab_size=runner.model_config.get_vocab_size(),
-        block_sizes=[
+        block_sizes=[kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size],
+        kernel_block_sizes=[
             kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
         ],
     )
@@ -98,8 +111,9 @@ def model_runner():
     model_config = vllm_config.model_config
     num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
     head_size = model_config.get_head_size()
-    vllm_config.compilation_config.static_forward_context[
-        "layer.0"] = Attention(num_heads, head_size, 0.1)
+    vllm_config.compilation_config.static_forward_context["layer.0"] = Attention(
+        num_heads, head_size, 0.1
+    )
     runner = GPUModelRunner(vllm_config, DEVICE)
     initialize_kv_cache(runner)
     return runner
@@ -120,10 +134,11 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_features=[],
                 sampling_params=SamplingParams(),
                 pooling_params=None,
-                block_ids=([0], ),
+                block_ids=([0],),
                 num_computed_tokens=0,
                 lora_request=None,
-            ))
+            )
+        )
         num_scheduled_tokens[req_id] = 3
         total_num_scheduled_tokens += num_scheduled_tokens[req_id]
 
@@ -134,11 +149,9 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
         total_num_scheduled_tokens=total_num_scheduled_tokens,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
 
@@ -150,22 +163,71 @@ def _is_req_added(model_runner, req_id: str) -> bool:
     return req_id in model_runner.requests
 
 
-def _is_sampling_metadata_changed(model_runner,
-                                  sampling_metadata_before: SamplingMetadata):
-    return model_runner.input_batch.sampling_metadata is not (
-        sampling_metadata_before)
+def _is_sampling_metadata_changed(
+    model_runner, sampling_metadata_before: SamplingMetadata
+):
+    return model_runner.input_batch.sampling_metadata is not (sampling_metadata_before)
 
 
 def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
     req_index = model_runner.input_batch.req_id_to_index[req_id]
     block_table = model_runner.input_batch.block_table[0]
     req_state = model_runner.requests[req_id]
-    if block_table.num_blocks_per_row[req_index] != len(
-            req_state.block_ids[0]):
+    if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids[0]):
         return False
     num_blocks = block_table.num_blocks_per_row[req_index]
-    return (block_table.block_table.np[req_index, :num_blocks] ==
-            req_state.block_ids[0]).all()
+    return (
+        block_table.block_table.np[req_index, :num_blocks] == req_state.block_ids[0]
+    ).all()
+
+
+def _make_mock_backend_for_kernel_block_size(
+    supported_sizes: list[int | MultipleOf],
+):
+    class _MockBackend:
+        supported_kernel_block_sizes = supported_sizes
+
+    return _MockBackend()
+
+
+def _make_kv_cache_spec() -> FullAttentionSpec:
+    return FullAttentionSpec(block_size=1, num_kv_heads=1, head_size=1, dtype="float16")
+
+
+def test_select_common_block_size_prefers_manager_block_size():
+    backend_a = _make_mock_backend_for_kernel_block_size([MultipleOf(32)])
+    backend_b = _make_mock_backend_for_kernel_block_size([64, MultipleOf(16)])
+    attn_groups = [
+        AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0),
+        AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
+    ]
+
+    selected_size = GPUModelRunner.select_common_block_size(128, attn_groups)
+    assert selected_size == 128
+
+
+def test_select_common_block_size_uses_largest_shared_int():
+    backend_a = _make_mock_backend_for_kernel_block_size([128, 64])
+    backend_b = _make_mock_backend_for_kernel_block_size([64, 32])
+    attn_groups = [
+        AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0),
+        AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
+    ]
+
+    selected_size = GPUModelRunner.select_common_block_size(256, attn_groups)
+    assert selected_size == 64
+
+
+def test_select_common_block_size_no_valid_option():
+    backend_a = _make_mock_backend_for_kernel_block_size([64])
+    backend_b = _make_mock_backend_for_kernel_block_size([MultipleOf(16)])
+    attn_groups = [
+        AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0),
+        AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
+    ]
+
+    with pytest.raises(ValueError):
+        GPUModelRunner.select_common_block_size(48, attn_groups)
 
 
 def test_update_states_new_request(model_runner, dist_init):
@@ -200,11 +262,9 @@ def test_update_states_request_finished(model_runner, dist_init):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids={req_id},
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner.input_batch.sampling_metadata
@@ -232,11 +292,9 @@ def test_update_states_request_resumed(model_runner, dist_init):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -246,10 +304,12 @@ def test_update_states_request_resumed(model_runner, dist_init):
     # resume req
     cached_req_data = CachedRequestData(
         req_ids=[req_id],
-        resumed_from_preemption=[False],
+        resumed_req_ids=set(),
         new_token_ids=[[]],
-        new_block_ids=([[0]], ),
+        all_token_ids={},
+        new_block_ids=[([0],)],
         num_computed_tokens=[0],
+        num_output_tokens=[0],
     )
 
     scheduler_output = SchedulerOutput(
@@ -259,11 +319,9 @@ def test_update_states_request_resumed(model_runner, dist_init):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner.input_batch.sampling_metadata
@@ -280,46 +338,58 @@ def test_get_nans_in_logits(model_runner, dist_init):
     scheduler_output = _schedule_new_request(*req_ids)
     model_runner._update_states(scheduler_output)
 
-    logits = torch.tensor([
-        [1.0, 2.0, 3.0],
-        [3.0, 2.0, 1.0],
-    ], device=DEVICE)
+    logits = torch.tensor(
+        [
+            [1.0, 2.0, 3.0],
+            [3.0, 2.0, 1.0],
+        ],
+        device=DEVICE,
+    )
     result = model_runner._get_nans_in_logits(logits)
     assert result == {"req_0": 0, "req_1": 0}
 
-    logits = torch.tensor([
-        [1.0, float('nan'), 3.0],
-        [4.0, float('nan'), float('nan')],
-    ],
-                          device=DEVICE)
+    logits = torch.tensor(
+        [
+            [1.0, float("nan"), 3.0],
+            [4.0, float("nan"), float("nan")],
+        ],
+        device=DEVICE,
+    )
     result = model_runner._get_nans_in_logits(logits)
     assert result == {"req_0": 1, "req_1": 2}
 
-    logits = torch.tensor([
-        [1.0, 2.0, 3.0],
-        [4.0, float('nan'), float('nan')],
-    ],
-                          device=DEVICE)
+    logits = torch.tensor(
+        [
+            [1.0, 2.0, 3.0],
+            [4.0, float("nan"), float("nan")],
+        ],
+        device=DEVICE,
+    )
     result = model_runner._get_nans_in_logits(logits)
     assert result == {"req_0": 0, "req_1": 2}
 
     result = model_runner._get_nans_in_logits(logits=None)
     assert result == {"req_0": 0, "req_1": 0}
 
-    logits = torch.tensor([
-        [1.0, float('nan'), 3.0],
-    ], device=DEVICE)
+    logits = torch.tensor(
+        [
+            [1.0, float("nan"), 3.0],
+        ],
+        device=DEVICE,
+    )
     result = model_runner._get_nans_in_logits(logits)
-    assert result == {'req_0': 1, 'req_1': 0}
-
-    logits = torch.tensor([
-        [float('nan'), float('nan'), 2.0],
-        [1.0, 2.0, 3.0],
-        [float('nan'), 2.0, 3.0],
-    ],
-                          device=DEVICE)
+    assert result == {"req_0": 1, "req_1": 0}
+
+    logits = torch.tensor(
+        [
+            [float("nan"), float("nan"), 2.0],
+            [1.0, 2.0, 3.0],
+            [float("nan"), 2.0, 3.0],
+        ],
+        device=DEVICE,
+    )
     result = model_runner._get_nans_in_logits(logits)
-    assert result == {'req_0': 2, 'req_1': 0}
+    assert result == {"req_0": 2, "req_1": 0}
 
 
 def test_update_states_no_changes(model_runner, dist_init):
@@ -340,11 +410,9 @@ def test_update_states_no_changes(model_runner, dist_init):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner.input_batch.sampling_metadata
@@ -377,11 +445,9 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner._update_states(scheduler_output)
@@ -397,12 +463,21 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
 def test_kv_cache_stride_order(monkeypatch, model_runner):
     # This test checks if GPUModelRunner initializes correctly when an attention
     # backend enforces a non-default KV cache stride order.
-    n_heads = model_runner.model_config.get_num_kv_heads(
-        model_runner.parallel_config)
-    expected_kv_cache_shape = [
-        2, NUM_BLOCKS, BLOCK_SIZE, n_heads,
-        model_runner.model_config.get_head_size()
-    ]
+    n_heads = model_runner.model_config.get_num_kv_heads(model_runner.parallel_config)
+    head_size = model_runner.model_config.get_head_size()
+
+    # Get the expected shape from the backend's get_kv_cache_shape method
+    # to ensure compatibility with different backends (triton vs flexattention)
+    attn_backend = None
+    for attn_group in model_runner._attn_group_iterator():
+        attn_backend = attn_group.backend
+        break
+
+    assert attn_backend is not None, "No attention backend found"
+    expected_kv_cache_shape = list(
+        attn_backend.get_kv_cache_shape(NUM_BLOCKS, BLOCK_SIZE, n_heads, head_size)
+    )
+
     # TODO mla test
     default_stride = tuple(range(5))
     # Permutation that gets you back to expected kv shape
@@ -414,8 +489,9 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
         # Patch the attention backend class and re-trigger the KV cache creation
         for attn_group in model_runner._attn_group_iterator():
             attn_backend = attn_group.backend
-            monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order",
-                                rnd_stride_order)
+            monkeypatch.setattr(
+                attn_backend, "get_kv_cache_stride_order", rnd_stride_order
+            )
 
         model_runner.attn_groups = []
         model_runner.kv_caches = []
@@ -447,14 +523,13 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
     model_runner_2.update_config({"load_config": {"load_format": "dummy"}})
     model_runner_2.load_model()  # Initial model loading with dummy weights
     assert str(model_runner.get_model().state_dict()) != str(
-        model_runner_2.get_model().state_dict())
-    model_runner_2.update_config(
-        {"load_config": {
-            "load_format": original_load_format
-        }})
+        model_runner_2.get_model().state_dict()
+    )
+    model_runner_2.update_config({"load_config": {"load_format": original_load_format}})
     model_runner_2.reload_weights()  # Load real weights inplace
     assert str(model_runner.get_model().state_dict()) == str(
-        model_runner_2.get_model().state_dict())
+        model_runner_2.get_model().state_dict()
+    )
 
 
 def test_reload_weights_before_load_model(model_runner):
@@ -471,21 +546,19 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_0,
                 kv_sharing_target_layer_name=layer_1,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_1,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -499,22 +572,20 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
     error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
     with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_1,
                 # invalid layer: cross_attn.atn doesn't exist!
                 kv_sharing_target_layer_name=invalid_layer,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -529,21 +600,19 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_1,
                 kv_sharing_target_layer_name=layer_1,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -556,20 +625,18 @@ def test_init_kv_cache_without_kv_sharing():
     vllm_config = get_vllm_config()
     with set_current_vllm_config(vllm_config):
         fwd_context = {
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_1,
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -584,15 +651,15 @@ def test_init_kv_cache_without_kv_sharing():
     available_memory = 20 * GiB_bytes
     # page size for layer 0's kv_cache_spec is 32KB
     num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
-    kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
-                                           [available_memory])[0]
+    kv_cache_config = get_kv_cache_configs(
+        vllm_config, [kv_cache_spec], [available_memory]
+    )[0]
     assert kv_cache_config.num_blocks == num_expected_blocks
     assert len(kv_cache_config.kv_cache_tensors) == 2
     assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
     assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
 
-    max_context_len =\
-        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    max_context_len = estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
     assert max_context_len == 1310720
 
@@ -600,8 +667,9 @@ def test_init_kv_cache_without_kv_sharing():
     # this will only allocate 2 block worth of memory (2 * 32kb)
     kv_cache_config.num_blocks = 1
     for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-        kv_cache_tensor.size = (
-            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
+        kv_cache_tensor.size = kv_cache_spec[
+            kv_cache_tensor.shared_by[0]
+        ].page_size_bytes
 
     runner.initialize_kv_cache(kv_cache_config)
 
@@ -624,21 +692,19 @@ def test_init_kv_cache_with_kv_sharing_valid():
     vllm_config = get_vllm_config()
     with set_current_vllm_config(vllm_config):
         fwd_context = {
-            layer_0:
-            Attention(
+            layer_0: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_0,
             ),
-            layer_1:
-            Attention(
+            layer_1: Attention(
                 num_heads=8,
                 head_size=64,
                 scale=1.0,
                 prefix=layer_1,
                 kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
-            )
+            ),
         }
         # suppress var not used error
         assert fwd_context is not None
@@ -656,24 +722,23 @@ def test_init_kv_cache_with_kv_sharing_valid():
     # with KV sharing, we can allocate (available_mem//page_size//1) blocks
     # which is twice as many as without KV sharing
     num_expected_blocks = 655360  # 20GB / 32KB
-    kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
-                                           [available_memory])[0]
+    kv_cache_config = get_kv_cache_configs(
+        vllm_config, [kv_cache_spec], [available_memory]
+    )[0]
     assert kv_cache_config.num_blocks == num_expected_blocks
     assert len(kv_cache_config.kv_cache_tensors) == 1
     # Each layer now has twice the available memory for KV cache
     # compared to no KV sharing
     assert kv_cache_config.kv_cache_tensors[0].size == available_memory
 
-    max_context_len =\
-        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    max_context_len = estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
     assert max_context_len == 2 * 1310720
 
     # important: override tensor size to prevent large mem alloc during test
     # this will only allocate 1 block worth of memory (32kb)
     kv_cache_config.num_blocks = 1
-    kv_cache_config.kv_cache_tensors[0].size =\
-        kv_cache_spec[layer_0].page_size_bytes
+    kv_cache_config.kv_cache_tensors[0].size = kv_cache_spec[layer_0].page_size_bytes
 
     runner.initialize_kv_cache(kv_cache_config)
     kv_cache_config_after_init = runner.kv_cache_config
@@ -686,30 +751,30 @@ def test_init_kv_cache_with_kv_sharing_valid():
     # check layer 1 added to kv cache group's layer names
     assert len(kv_cache_config_after_init.kv_cache_groups) == 1
     assert len(kv_cache_config_after_init.kv_cache_groups[0].layer_names) == 2
-    assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[
-        0] == layer_0
-    assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[
-        1] == layer_1
+    assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[1] == layer_1
 
 
 def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
-    '''
+    """
     The GPU model runner creates different views into the
     KVCacheTensors for the attention and mamba layers
     (via _reshape_kv_cache_tensors function). This test verifies
     that the views are compatible: writing a mamba block
     will not corrupt an attention block and vice versa
-    '''
+    """
 
     current_platform.seed_everything(42)
 
-    update_environment_variables({
-        'RANK': "0",
-        'LOCAL_RANK': "0",
-        'WORLD_SIZE': "1",
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )
     init_distributed_environment()
     initialize_model_parallel(tensor_model_parallel_size=1)
     torch.set_default_dtype(torch.float16)
@@ -750,8 +815,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
         fwd_context = {}
         for key in [layer_0, layer_1]:
             fwd_context[key] = Attention(
-                num_heads=model_config.get_num_attention_heads(
-                    parallel_config),
+                num_heads=model_config.get_num_attention_heads(parallel_config),
                 num_kv_heads=model_config.get_num_kv_heads(parallel_config),
                 head_size=model_config.get_head_size(),
                 scale=1.0,
@@ -759,13 +823,12 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
             )
         for key in [layer_2, layer_3, layer_4, layer_5]:
             fwd_context[key] = MambaMixer2(
-                hidden_size = hf_config.hidden_size,
-                ssm_state_size = hf_config.mamba_d_state,
-                conv_kernel_size = hf_config.mamba_d_conv,
-                intermediate_size = hf_config.mamba_expand *\
-                                    hf_config.hidden_size,
-                use_conv_bias = hf_config.mamba_conv_bias,
-                use_bias = hf_config.mamba_proj_bias,
+                hidden_size=hf_config.hidden_size,
+                ssm_state_size=hf_config.mamba_d_state,
+                conv_kernel_size=hf_config.mamba_d_conv,
+                intermediate_size=hf_config.mamba_expand * hf_config.hidden_size,
+                use_conv_bias=hf_config.mamba_conv_bias,
+                use_bias=hf_config.mamba_proj_bias,
                 n_groups=hf_config.mamba_n_groups,
                 num_heads=hf_config.mamba_n_heads,
                 head_dim=hf_config.mamba_d_head,
@@ -780,15 +843,15 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     vllm_ctx = vllm_config.compilation_config.static_forward_context
 
     with monkeypatch.context() as m:
-
         m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
 
         runner = GPUModelRunner(vllm_config, DEVICE)
         kv_cache_spec = runner.get_kv_cache_spec()
 
         available_memory = 5 * GiB_bytes
-        kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
-                                               [available_memory])[0]
+        kv_cache_config = get_kv_cache_configs(
+            vllm_config, [kv_cache_spec], [available_memory]
+        )[0]
         runner.initialize_kv_cache(kv_cache_config)
 
         # random partition of blocks
@@ -797,43 +860,242 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
         num_blocks = kv_cache_config.num_blocks
         ind = np.arange(num_blocks)
         np.random.shuffle(ind)
-        blocks0, blocks1 = ind[:(num_blocks // 2)], ind[(num_blocks // 2):]
+        blocks0, blocks1 = ind[: (num_blocks // 2)], ind[(num_blocks // 2) :]
 
         attn_shape = vllm_ctx[layer_0].kv_cache[0].shape
         conv_shape = vllm_ctx[layer_2].kv_cache[0][0].shape
         ssm_shape = vllm_ctx[layer_2].kv_cache[0][1].shape
 
         # assert we are using FlashInfer
-        assert attn_shape[0] == num_blocks
-
-        attn_blocks_constant = torch.full((len(blocks0), *attn_shape[1:]),
-                                          device=DEVICE,
-                                          fill_value=3.33)
-        conv_blocks_constant = torch.full((len(blocks1), *conv_shape[1:]),
-                                          device=DEVICE,
-                                          fill_value=6.66)
-        ssm_blocks_constant = torch.full((len(blocks1), *ssm_shape[1:]),
-                                         device=DEVICE,
-                                         fill_value=9.99)
-
-        # fill all attention blocks with constant
+        assert attn_shape[0] % num_blocks == 0
+        block_split_ratio = attn_shape[0] // num_blocks
+
+        # use small blocks for testing to avoid memory issues
+        test_block_size = min(2, len(blocks0), len(blocks1))
+
+        # use non-overlapping blocks to avoid data contamination
+        # Split kernel blocks: first half for attention, second half for mamba
+        mid_point = num_blocks // 2
+
+        # attention uses kernel blocks from first half (mapped to logical blocks)
+        kv_blocks_for_attention = np.array([0, 1])[:test_block_size]
+
+        # mamba uses kernel blocks from second half
+        kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size]
+
+        # create small constant tensors for testing with corrected shapes
+        # attention: [block_size, ...] starting from dimension 2
+        attn_constant_shape = attn_shape[2:]
+        conv_constant_shape = conv_shape[1:]
+        ssm_constant_shape = ssm_shape[1:]
+
+        attn_blocks_constant = torch.full(
+            (test_block_size, *attn_constant_shape), device=DEVICE, fill_value=3.33
+        )
+        conv_blocks_constant = torch.full(
+            (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66
+        )
+        ssm_blocks_constant = torch.full(
+            (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99
+        )
+
+        # Fill attention blocks with constants using kv block indices
+        kernel_blocks_for_attention = kv_blocks_for_attention * block_split_ratio
+
         for layer in [layer_0, layer_1]:
-            vllm_ctx[layer].kv_cache[0][
-                blocks0, :] = attn_blocks_constant.detach().clone()
+            # attention: kv_cache[0][kernel_block_idx, kv_idx, ...]
+            for i, kernel_block in enumerate(kernel_blocks_for_attention):
+                vllm_ctx[layer].kv_cache[0][kernel_block, :] = attn_blocks_constant[i]
 
-        # fill all mamba blocks with constant
+        # fill mamba blocks with constants using kernel block indices
         for layer in [layer_2, layer_3, layer_4, layer_5]:
-            vllm_ctx[layer].kv_cache[0][0][
-                blocks1, :] = conv_blocks_constant.detach().clone()
-            vllm_ctx[layer].kv_cache[0][1][
-                blocks1, :] = ssm_blocks_constant.detach().clone()
+            # mamba: kv_cache[0][component][kernel_block_idx, ...]
+            for i, kv_block in enumerate(kv_blocks_for_mamba):
+                vllm_ctx[layer].kv_cache[0][0][kv_block, :] = conv_blocks_constant[i]
+                vllm_ctx[layer].kv_cache[0][1][kv_block, :] = ssm_blocks_constant[i]
 
         # verify attention and mamba contents are correct
         for layer in [layer_0, layer_1]:
-            assert torch.equal(vllm_ctx[layer].kv_cache[0][blocks0, :],
-                               attn_blocks_constant)
+            for i, kernel_block in enumerate(kernel_blocks_for_attention):
+                actual_kv = vllm_ctx[layer].kv_cache[0][kernel_block, :]
+                expected = attn_blocks_constant[i]
+
+                # Check K and V separately
+                assert torch.equal(actual_kv[0], expected)
+                assert torch.equal(actual_kv[1], expected)
+
+        for layer in [layer_2, layer_3, layer_4, layer_5]:
+            for i, kv_block in enumerate(kv_blocks_for_mamba):
+                actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :]
+                actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :]
+                expected_conv = conv_blocks_constant[i]
+                expected_ssm = ssm_blocks_constant[i]
+
+                assert torch.equal(actual_conv, expected_conv)
+                assert torch.equal(actual_ssm, expected_ssm)
+
         for layer in [layer_2, layer_3, layer_4, layer_5]:
-            assert torch.equal(vllm_ctx[layer].kv_cache[0][0][blocks1, :],
-                               conv_blocks_constant)
-            assert torch.equal(vllm_ctx[layer].kv_cache[0][1][blocks1, :],
-                               ssm_blocks_constant)
+            for i, kv_block in enumerate(kv_blocks_for_mamba):
+                actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :]
+                actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :]
+                expected_conv = conv_blocks_constant[i]
+                expected_ssm = ssm_blocks_constant[i]
+                assert torch.equal(actual_conv, expected_conv)
+                assert torch.equal(actual_ssm, expected_ssm)
+
+
+def test_hybrid_block_table_initialization():
+    """Test hybrid block table with different kernel and kvcache_manager block
+    sizes."""
+    from vllm.v1.worker.block_table import BlockTable
+
+    # Test configuration: kvcache_manager block size = 32,
+    # kernel block size = 16
+    block_size = 32
+    kernel_block_sizes = [16]
+    max_num_reqs = 10
+    max_num_blocks_per_req = 20
+    max_num_batched_tokens = 512
+    dcp_kv_cache_interleave_size = 8
+
+    block_table = BlockTable(
+        block_size=block_size,
+        max_num_reqs=max_num_reqs,
+        max_num_blocks_per_req=max_num_blocks_per_req,
+        max_num_batched_tokens=max_num_batched_tokens,
+        pin_memory=False,
+        device=torch.device(DEVICE),
+        kernel_block_size=kernel_block_sizes[0],
+        dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+    )
+
+    # Verify hybrid block configuration
+    assert block_table.use_hybrid_blocks is True
+    assert block_table.block_size == kernel_block_sizes[0]
+    assert block_table.blocks_per_kv_block == (
+        block_size // kernel_block_sizes[0]
+    )  # Changed to use first element
+
+    # Test block table conversion logic
+    # One kvcache_manager block should map to multiple kernel blocks
+    kvcache_manager_blocks = [0, 1, 2]
+
+    # Verify that kvcache_manager blocks can be converted to kernel blocks
+    # and that block table operations work correctly.
+    req_index = 0
+    block_table.append_row(kvcache_manager_blocks, req_index)
+    # Get expected kernel blocks from the implementation for verification.
+    expected_kernel_blocks = block_table.map_to_kernel_blocks(
+        np.array(kvcache_manager_blocks),
+        block_table.blocks_per_kv_block,
+        block_table._kernel_block_arange,
+    )
+    # Verify block table state
+    assert block_table.num_blocks_per_row[req_index] == len(expected_kernel_blocks)
+    assert np.array_equal(
+        block_table.block_table.np[req_index, : len(expected_kernel_blocks)],
+        expected_kernel_blocks,
+    )
+
+
+def test_input_batch_with_kernel_block_sizes():
+    """Test InputBatch initialization with kernel_block_sizes parameter."""
+    max_num_reqs = 10
+    max_model_len = 512
+    max_num_batched_tokens = 512
+    device = torch.device(DEVICE)
+    pin_memory = False
+    vocab_size = 50272
+
+    # Test with different kernel block sizes
+    block_sizes = [32, 64]
+    kernel_block_sizes = [16, 32]
+
+    input_batch = InputBatch(
+        max_num_reqs=max_num_reqs,
+        max_model_len=max_model_len,
+        max_num_batched_tokens=max_num_batched_tokens,
+        device=device,
+        pin_memory=pin_memory,
+        vocab_size=vocab_size,
+        block_sizes=block_sizes,
+        kernel_block_sizes=kernel_block_sizes,
+    )
+
+    # Verify that block tables were created with kernel block sizes
+    assert len(input_batch.block_table.block_tables) == len(block_sizes)
+
+    for i, (kv_size, kernel_size) in enumerate(zip(block_sizes, kernel_block_sizes)):
+        block_table = input_batch.block_table.block_tables[i]
+        if kv_size != kernel_size:
+            assert block_table.use_hybrid_blocks is True
+            assert block_table.block_size == kernel_size
+        else:
+            assert block_table.use_hybrid_blocks is False
+            assert block_table.block_size == kernel_size
+
+
+def test_hybrid_cache_integration(model_runner, dist_init):
+    """Test hybrid cache architecture integration with GPUModelRunner."""
+    # Create a new model runner with hybrid cache configuration
+    vllm_config = get_vllm_config()
+
+    # Configure hybrid cache with different kvcache_manager block size
+    vllm_config.cache_config.block_size = 32
+
+    model_config = vllm_config.model_config
+    num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
+    head_size = model_config.get_head_size()
+    vllm_config.compilation_config.static_forward_context["layer.0"] = Attention(
+        num_heads, head_size, 0.1
+    )
+
+    runner = GPUModelRunner(vllm_config, DEVICE)
+
+    # Initialize KV cache with configuration
+    attn_spec = FullAttentionSpec(
+        block_size=16,  # Use kernel block size directly
+        num_kv_heads=runner.model_config.get_num_kv_heads(runner.parallel_config),
+        head_size=runner.model_config.get_head_size(),
+        dtype=runner.kv_cache_dtype,
+    )
+    tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS
+    kv_cache_config = KVCacheConfig(
+        num_blocks=NUM_BLOCKS,
+        kv_cache_tensors=[
+            KVCacheTensor(size=tensor_size, shared_by=["layer.0"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(layer_names=["layer.0"], kv_cache_spec=attn_spec)
+        ],
+    )
+    runner.kv_cache_config = kv_cache_config
+
+    # Initialize input batch with kernel block sizes
+    runner.input_batch = InputBatch(
+        max_num_reqs=runner.max_num_reqs,
+        max_model_len=runner.max_model_len,
+        max_num_batched_tokens=runner.max_num_tokens,
+        device=runner.device,
+        pin_memory=runner.pin_memory,
+        vocab_size=runner.model_config.get_vocab_size(),
+        block_sizes=[kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size],
+        kernel_block_sizes=[16],
+    )  # Use kernel block size
+
+    runner.initialize_attn_backend(kv_cache_config)
+
+    # Verify hybrid block table configuration
+    block_table = runner.input_batch.block_table.block_tables[0]
+    assert block_table.block_size == (
+        kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+    )
+
+    # Test request processing with hybrid blocks
+    req_id = "hybrid_req_0"
+    scheduler_output = _schedule_new_request(req_id)
+
+    # Update states should work with hybrid blocks
+    runner._update_states(scheduler_output)
+    assert _is_req_scheduled(runner, req_id)
+    assert _is_req_state_block_table_match(runner, req_id)
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f987b09e603e77752e97207dd4481574e34589d1
--- /dev/null
+++ b/tests/v1/worker/test_utils.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.v1.worker.utils import bind_kv_cache
+
+
+def test_bind_kv_cache():
+    from vllm.attention import Attention
+
+    ctx = {
+        "layers.0.self_attn": Attention(32, 128, 0.1),
+        "layers.1.self_attn": Attention(32, 128, 0.1),
+        "layers.2.self_attn": Attention(32, 128, 0.1),
+        "layers.3.self_attn": Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        "layers.0.self_attn": torch.zeros((1,)),
+        "layers.1.self_attn": torch.zeros((1,)),
+        "layers.2.self_attn": torch.zeros((1,)),
+        "layers.3.self_attn": torch.zeros((1,)),
+    }
+    runner_kv_caches: list[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache["layers.0.self_attn"]
+    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache["layers.1.self_attn"]
+    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache["layers.2.self_attn"]
+    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache["layers.3.self_attn"]
+
+    assert runner_kv_caches[0] is kv_cache["layers.0.self_attn"]
+    assert runner_kv_caches[1] is kv_cache["layers.1.self_attn"]
+    assert runner_kv_caches[2] is kv_cache["layers.2.self_attn"]
+    assert runner_kv_caches[3] is kv_cache["layers.3.self_attn"]
+
+
+def test_bind_kv_cache_non_attention():
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    ctx = {
+        "model.layers.20.attn": Attention(32, 128, 0.1),
+        "model.layers.28.attn": Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        "model.layers.20.attn": torch.zeros((1,)),
+        "model.layers.28.attn": torch.zeros((1,)),
+    }
+
+    runner_kv_caches: list[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache["model.layers.20.attn"]
+    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache["model.layers.28.attn"]
+
+    assert runner_kv_caches[0] is kv_cache["model.layers.20.attn"]
+    assert runner_kv_caches[1] is kv_cache["model.layers.28.attn"]
diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py
index 6faa6bcc591cbe923759d5604007d1685073a20f..66330127b5ec77c43ca6deb09ec76f9d5b26c8bd 100644
--- a/tests/v1/worker/test_worker_memory_snapshot.py
+++ b/tests/v1/worker/test_worker_memory_snapshot.py
@@ -4,20 +4,18 @@
 import multiprocessing as mp
 import os
 import tempfile
-from multiprocessing import Queue
-from typing import Optional
+from multiprocessing.queues import Queue
 from unittest.mock import patch
 
 import pytest
 import torch
 
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import MemorySnapshot
-from vllm.v1.worker.gpu_worker import (Worker,
-                                       init_worker_distributed_environment)
+from vllm.utils.mem_utils import MemorySnapshot
+from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 
 # Global queue to track operation order across processes
-_QUEUE: Optional[Queue] = None
+_QUEUE: Queue | None = None
 
 
 def track_operation(operation: str, rank: int):
@@ -28,11 +26,11 @@ def track_operation(operation: str, rank: int):
 
 def make_operation_tracker(operation_name: str, original_func):
     """Create a mock function that tracks when an operation is called.
-    
+
     Args:
         operation_name: Name to use when tracking this operation
         original_func: The original function to wrap
-    
+
     Returns:
         A wrapper function that tracks the operation and calls the original
     """
@@ -45,8 +43,13 @@ def make_operation_tracker(operation_name: str, original_func):
     return wrapper
 
 
-def worker_process(rank: int, world_size: int, distributed_init_method: str,
-                   queue: Queue, error_queue: Queue):
+def worker_process(
+    rank: int,
+    world_size: int,
+    distributed_init_method: str,
+    queue: Queue,
+    error_queue: Queue,
+):
     """Worker process that initializes a GPU worker with proper tracking."""
     global _QUEUE
     _QUEUE = queue
@@ -58,9 +61,9 @@ def worker_process(rank: int, world_size: int, distributed_init_method: str,
         os.environ["WORLD_SIZE"] = str(world_size)
 
         # Create vLLM config with small model
-        vllm_config = EngineArgs(model="facebook/opt-125m",
-                                 tensor_parallel_size=2,
-                                 load_format="dummy").create_engine_config()
+        vllm_config = EngineArgs(
+            model="facebook/opt-125m", tensor_parallel_size=2, load_format="dummy"
+        ).create_engine_config()
 
         # Create worker
         worker = Worker(
@@ -77,19 +80,22 @@ def worker_process(rank: int, world_size: int, distributed_init_method: str,
 
         # Apply minimal patches to track operation order
         init_patch = patch(
-            'vllm.v1.worker.gpu_worker.init_worker_distributed_environment',
-            side_effect=make_operation_tracker("init_distributed",
-                                               original_init_worker))
+            "vllm.v1.worker.gpu_worker.init_worker_distributed_environment",
+            side_effect=make_operation_tracker(
+                "init_distributed", original_init_worker
+            ),
+        )
         memory_patch = patch.object(
-            MemorySnapshot, '__init__',
-            make_operation_tracker("memory_snapshot",
-                                   original_memory_snapshot_init))
-        all_reduce_patch = patch('torch.distributed.all_reduce',
-                                 side_effect=make_operation_tracker(
-                                     "nccl_all_reduce", original_all_reduce))
+            MemorySnapshot,
+            "__init__",
+            make_operation_tracker("memory_snapshot", original_memory_snapshot_init),
+        )
+        all_reduce_patch = patch(
+            "torch.distributed.all_reduce",
+            side_effect=make_operation_tracker("nccl_all_reduce", original_all_reduce),
+        )
 
         with init_patch, memory_patch, all_reduce_patch:
-
             # Initialize device (this is where we test the order)
             worker.init_device()
 
@@ -104,13 +110,14 @@ def worker_process(rank: int, world_size: int, distributed_init_method: str,
         raise
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs for tensor parallelism")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs for tensor parallelism"
+)
 def test_init_distributed_is_called_before_memory_snapshot():
     """Test that distributed env is setup before memory snapshot.
-    
-    This test makes sure during worker initialization, the initial memory 
-    snapshot is taken after distributed env is setup to include all the buffers 
+
+    This test makes sure during worker initialization, the initial memory
+    snapshot is taken after distributed env is setup to include all the buffers
     allocated by distributed env.
     """
     world_size = 2
@@ -127,9 +134,16 @@ def test_init_distributed_is_called_before_memory_snapshot():
     # Start worker processes
     processes = []
     for rank in range(world_size):
-        p = ctx.Process(target=worker_process,
-                        args=(rank, world_size, distributed_init_method,
-                              operation_queue, error_queue))
+        p = ctx.Process(
+            target=worker_process,
+            args=(
+                rank,
+                world_size,
+                distributed_init_method,
+                operation_queue,
+                error_queue,
+            ),
+        )
         p.start()
         processes.append(p)
 
@@ -168,7 +182,8 @@ def test_init_distributed_is_called_before_memory_snapshot():
         assert init_distributed < nccl_all_reduce < memory_snapshot, (
             f"Rank {rank}: init_distributed (index {init_distributed}) "
             f"must happen before nccl_all_reduce (index {nccl_all_reduce}) "
-            f"and memory_snapshot (index {memory_snapshot})")
+            f"and memory_snapshot (index {memory_snapshot})"
+        )
 
     # Clean up
     os.unlink(distributed_init_method.replace("file://", ""))
diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py
index 83be8bdce85cf9b05e4f84187520b70917044993..4cb66b556e5a76311a195dbe27f45ee513891ef6 100644
--- a/tests/vllm_test_utils/setup.py
+++ b/tests/vllm_test_utils/setup.py
@@ -4,7 +4,7 @@
 from setuptools import setup
 
 setup(
-    name='vllm_test_utils',
-    version='0.1',
-    packages=['vllm_test_utils'],
+    name="vllm_test_utils",
+    version="0.1",
+    packages=["vllm_test_utils"],
 )
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py
index 49fd083ef19c88c17c7b30dcc4ace15b5140dba1..9746c3964e21fc511de051023eb5bd59bdcbcca6 100644
--- a/tests/vllm_test_utils/vllm_test_utils/blame.py
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
@@ -5,8 +5,7 @@ import contextlib
 import dataclasses
 import sys
 import traceback
-from collections.abc import Generator
-from typing import Callable
+from collections.abc import Callable, Generator
 
 
 @dataclasses.dataclass
@@ -26,7 +25,7 @@ def blame(func: Callable) -> Generator[BlameResult, None, None]:
     ```python
     with blame(lambda: some_condition()) as result:
         # do something
-    
+
     if result.found:
         print(result.trace_stack)
     """
@@ -34,7 +33,7 @@ def blame(func: Callable) -> Generator[BlameResult, None, None]:
 
     def _trace_calls(frame, event, arg=None):
         nonlocal result
-        if event in ['call', 'return']:
+        if event in ["call", "return"]:
             # for every function call or return
             try:
                 # Temporarily disable the trace function
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
index 9454221b273e6296cf4b289ca7c86d0b81aca359..ba22bde8795b3b1aff59218787eb5e003cddbbb2 100644
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -5,8 +5,8 @@ import contextlib
 import dataclasses
 import sys
 import traceback
-from collections.abc import Generator
-from typing import Callable, Generic, TypeVar
+from collections.abc import Callable, Generator
+from typing import Generic, TypeVar
 
 _T = TypeVar("_T")
 
@@ -19,8 +19,8 @@ class MonitoredValues(Generic[_T]):
 
 @contextlib.contextmanager
 def monitor(
-    measure_func: Callable[[],
-                           _T]) -> Generator[MonitoredValues[_T], None, None]:
+    measure_func: Callable[[], _T],
+) -> Generator[MonitoredValues[_T], None, None]:
     """
     Trace the function calls to continuously monitor the change of
     a value.
@@ -28,23 +28,23 @@ def monitor(
     Usage:
 
     ```python
-
     def measure_func():
-        ... # measure the current value
+        ...  # measure the current value
         return current_value
 
+
     with monitor(measure_func) as monitored_values:
         # do something
-    
-        monitored_values.values # all changes of the values
-        monitored_values.trace_stacks # trace stacks of every change
+
+        monitored_values.values  # all changes of the values
+        monitored_values.trace_stacks  # trace stacks of every change
     ```
     """
     monitored_values = MonitoredValues[_T]()
 
     def _trace_calls(frame, event, arg=None):
         nonlocal monitored_values
-        if event in ['line']:
+        if event in ["line"]:
             # triggered by every line of Python code.
             # only Python functions will trigger it,
             # c/cpp functions will not trigger it.
@@ -53,11 +53,14 @@ def monitor(
                 sys.settrace(None)
                 # do a measurement
                 current_value = measure_func()
-                if len(monitored_values.values
-                       ) == 0 or current_value != monitored_values.values[-1]:
+                if (
+                    len(monitored_values.values) == 0
+                    or current_value != monitored_values.values[-1]
+                ):
                     monitored_values.values.append(current_value)
-                    monitored_values.trace_stacks.append("".join(
-                        traceback.format_stack()))
+                    monitored_values.trace_stacks.append(
+                        "".join(traceback.format_stack())
+                    )
                 # Re-enable the trace function
                 sys.settrace(_trace_calls)
             except NameError:
diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py
index 3aabae099073ee2065bd95ed20c8ae4e9351ac1d..6587730682088bded8d7158fe81928be84b6aa56 100644
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -9,35 +9,39 @@ import torch
 from vllm.platforms import current_platform
 
 MAX_MODEL_LEN = 1024
-MODEL_NAME = os.environ.get("MODEL_NAME",
-                            "robertgshaw2/zephyr-7b-beta-channelwise-gptq")
+MODEL_NAME = os.environ.get(
+    "MODEL_NAME", "robertgshaw2/zephyr-7b-beta-channelwise-gptq"
+)
 REVISION = os.environ.get("REVISION", "main")
 QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
 MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "80")
 
 
 @pytest.mark.skipif(
-    MODEL_NAME == "casperhansen/deepseek-coder-v2-instruct-awq",
-    reason="OOM in the CI")
+    MODEL_NAME == "casperhansen/deepseek-coder-v2-instruct-awq", reason="OOM in the CI"
+)
 @pytest.mark.skipif(
     not current_platform.has_device_capability(int(MIN_CAPABILITY)),
-    reason="Current system does not have minimum capability.")
+    reason="Current system does not have minimum capability.",
+)
 def test_weight_loading(vllm_runner):
     """
     Test parameter weight loading with tp>1.
     """
 
     # MoE models need fp16.
-    NEEDS_FP16 = (QUANTIZATION == "gptq" or MODEL_NAME
-                  == "nm-testing/test-w4a16-mixtral-actorder-group")
+    NEEDS_FP16 = (
+        QUANTIZATION == "gptq"
+        or MODEL_NAME == "nm-testing/test-w4a16-mixtral-actorder-group"
+    )
     with vllm_runner(
-            model_name=MODEL_NAME,
-            revision=REVISION,
-            dtype=torch.half if NEEDS_FP16 else "auto",
-            quantization=None if QUANTIZATION == "None" else QUANTIZATION,
-            max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=2) as model:
-
+        model_name=MODEL_NAME,
+        revision=REVISION,
+        dtype=torch.half if NEEDS_FP16 else "auto",
+        quantization=None if QUANTIZATION == "None" else QUANTIZATION,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=2,
+    ) as model:
         output = model.generate_greedy("Hello world!", max_tokens=20)
         print(output)
         assert output
diff --git a/tools/check_repo.sh b/tools/check_repo.sh
old mode 100644
new mode 100755
diff --git a/tools/ep_kernels/configure_system_drivers.sh b/tools/ep_kernels/configure_system_drivers.sh
old mode 100644
new mode 100755
diff --git a/tools/ep_kernels/elastic_ep/install_eep_libraries.sh b/tools/ep_kernels/elastic_ep/install_eep_libraries.sh
old mode 100644
new mode 100755
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
old mode 100644
new mode 100755
index 59bfe69dc0dd6b453295b3b098321a07b2dff457..5ea543f4cb1e8f7188f465f03ba10be2560ad13a
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -1,3 +1,4 @@
+#!/usr/bin/env bash
 set -ex
 
 # prepare workspace directory
@@ -10,8 +11,12 @@ if [ ! -d "$WORKSPACE" ]; then
     mkdir -p $WORKSPACE
 fi
 
+# configurable pip command (default: pip3)
+PIP_CMD=${PIP_CMD:-pip3}
+CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
+
 # install dependencies if not installed
-pip3 install cmake torch ninja
+$PIP_CMD install cmake torch ninja
 
 # build nvshmem
 pushd $WORKSPACE
@@ -110,15 +115,13 @@ clone_repo() {
 pushd $WORKSPACE
 clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
 cd pplx-kernels
-# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
-# PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e  .
+$PIP_CMD install --no-build-isolation -vvv -e .
 popd
 
 # build and install deepep, require pytorch installed
 pushd $WORKSPACE
-clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf"
+clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "73b6ea4"
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e  .
+$PIP_CMD install --no-build-isolation -vvv -e .
 popd
diff --git a/tools/flashinfer-build.sh b/tools/flashinfer-build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6c14d87348c3ad3b363703dba3b3eca31f38e86a
--- /dev/null
+++ b/tools/flashinfer-build.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# This script is used to build FlashInfer wheels with AOT kernels
+
+set -ex
+
+# FlashInfer configuration
+FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
+FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}"
+CUDA_VERSION="${CUDA_VERSION}"
+BUILD_WHEEL="${BUILD_WHEEL:-true}"
+
+if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
+    echo "❌ FLASHINFER_GIT_REF must be specified" >&2
+    exit 1
+fi
+
+if [[ -z "${CUDA_VERSION}" ]]; then
+    echo "❌ CUDA_VERSION must be specified" >&2
+    exit 1
+fi
+
+echo "🏗️  Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
+
+# Clone FlashInfer
+git clone --depth 1 --recursive --shallow-submodules \
+    --branch ${FLASHINFER_GIT_REF} \
+    ${FLASHINFER_GIT_REPO} flashinfer
+
+# Set CUDA arch list based on CUDA version
+# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+if [[ "${CUDA_VERSION}" == 11.* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+else
+    # CUDA 12.8+ supports 10.0a and 12.0
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+fi
+
+echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+
+pushd flashinfer
+    # Make sure the wheel is built for the correct CUDA version
+    export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+    # Build AOT kernels
+    export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
+    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
+    python3 -m flashinfer.aot
+    
+    if [[ "${BUILD_WHEEL}" == "true" ]]; then
+        # Build wheel for distribution
+        uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
+        echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
+    else
+        # Install directly (for Dockerfile)
+        uv pip install --system --no-build-isolation --force-reinstall .
+        echo "✅ FlashInfer installed successfully"
+    fi
+popd
+
+# Cleanup
+rm -rf flashinfer
\ No newline at end of file
diff --git a/tools/generate_cmake_presets.py b/tools/generate_cmake_presets.py
index 4869a71307e4e210c5a6aa2cf080c26159ffd5b1..85847c2c0fe80632a39de0a2df74440e73ce75be 100644
--- a/tools/generate_cmake_presets.py
+++ b/tools/generate_cmake_presets.py
@@ -12,8 +12,7 @@ try:
     # most reliable source of truth for vLLM's build.
     from torch.utils.cpp_extension import CUDA_HOME
 except ImportError:
-    print("Warning: PyTorch not found. "
-          "Falling back to CUDA_HOME environment variable.")
+    print("Warning: PyTorch not found. Falling back to CUDA_HOME environment variable.")
     CUDA_HOME = os.environ.get("CUDA_HOME")
 
 
@@ -27,8 +26,7 @@ def get_cpu_cores():
     return multiprocessing.cpu_count()
 
 
-def generate_presets(output_path="CMakeUserPresets.json",
-                     force_overwrite=False):
+def generate_presets(output_path="CMakeUserPresets.json", force_overwrite=False):
     """Generates the CMakeUserPresets.json file."""
 
     print("Attempting to detect your system configuration...")
@@ -39,8 +37,7 @@ def generate_presets(output_path="CMakeUserPresets.json",
         prospective_path = os.path.join(CUDA_HOME, "bin", "nvcc")
         if os.path.exists(prospective_path):
             nvcc_path = prospective_path
-            print("Found nvcc via torch.utils.cpp_extension.CUDA_HOME: "
-                  f"{nvcc_path}")
+            print(f"Found nvcc via torch.utils.cpp_extension.CUDA_HOME: {nvcc_path}")
 
     if not nvcc_path:
         nvcc_path = which("nvcc")
@@ -50,7 +47,8 @@ def generate_presets(output_path="CMakeUserPresets.json",
     if not nvcc_path:
         nvcc_path_input = input(
             "Could not automatically find 'nvcc'. Please provide the full "
-            "path to nvcc (e.g., /usr/local/cuda/bin/nvcc): ")
+            "path to nvcc (e.g., /usr/local/cuda/bin/nvcc): "
+        )
         nvcc_path = nvcc_path_input.strip()
     print(f"Using NVCC path: {nvcc_path}")
 
@@ -63,12 +61,13 @@ def generate_presets(output_path="CMakeUserPresets.json",
             "Could not automatically find Python executable. Please provide "
             "the full path to your Python executable for vLLM development "
             "(typically from your virtual environment, e.g., "
-            "/home/user/venvs/vllm/bin/python): ")
+            "/home/user/venvs/vllm/bin/python): "
+        )
         python_executable = input(python_executable_prompt).strip()
         if not python_executable:
             raise ValueError(
-                "Could not determine Python executable. Please provide it "
-                "manually.")
+                "Could not determine Python executable. Please provide it manually."
+            )
 
     print(f"Using Python executable: {python_executable}")
 
@@ -76,20 +75,23 @@ def generate_presets(output_path="CMakeUserPresets.json",
     cpu_cores = get_cpu_cores()
     nvcc_threads = min(4, cpu_cores)
     cmake_jobs = max(1, cpu_cores // nvcc_threads)
-    print(f"Detected {cpu_cores} CPU cores. "
-          f"Setting NVCC_THREADS={nvcc_threads} and CMake jobs={cmake_jobs}.")
+    print(
+        f"Detected {cpu_cores} CPU cores. "
+        f"Setting NVCC_THREADS={nvcc_threads} and CMake jobs={cmake_jobs}."
+    )
 
     # Get vLLM project root (assuming this script is in vllm/tools/)
-    project_root = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), ".."))
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
     print(f"VLLM project root detected as: {project_root}")
 
     # Ensure python_executable path is absolute or resolvable
     if not os.path.isabs(python_executable) and which(python_executable):
         python_executable = os.path.abspath(which(python_executable))
     elif not os.path.isabs(python_executable):
-        print(f"Warning: Python executable '{python_executable}' is not an "
-              "absolute path and not found in PATH. CMake might not find it.")
+        print(
+            f"Warning: Python executable '{python_executable}' is not an "
+            "absolute path and not found in PATH. CMake might not find it."
+        )
 
     cache_variables = {
         "CMAKE_CUDA_COMPILER": nvcc_path,
@@ -122,24 +124,20 @@ def generate_presets(output_path="CMakeUserPresets.json",
         configure_preset["generator"] = "Ninja"
         cache_variables["CMAKE_JOB_POOLS"] = f"compile={cmake_jobs}"
     else:
-        print("Ninja not found, using default generator. "
-              "Build may be slower.")
+        print("Ninja not found, using default generator. Build may be slower.")
 
     presets = {
-        "version":
-        6,
+        "version": 6,
         # Keep in sync with CMakeLists.txt and requirements/build.txt
-        "cmakeMinimumRequired": {
-            "major": 3,
-            "minor": 26,
-            "patch": 1
-        },
+        "cmakeMinimumRequired": {"major": 3, "minor": 26, "patch": 1},
         "configurePresets": [configure_preset],
-        "buildPresets": [{
-            "name": "release",
-            "configurePreset": "release",
-            "jobs": cmake_jobs,
-        }],
+        "buildPresets": [
+            {
+                "name": "release",
+                "configurePreset": "release",
+                "jobs": cmake_jobs,
+            }
+        ],
     }
 
     output_file_path = os.path.join(project_root, output_path)
@@ -148,10 +146,12 @@ def generate_presets(output_path="CMakeUserPresets.json",
         if force_overwrite:
             print(f"Overwriting existing file '{output_file_path}'")
         else:
-            overwrite = input(
-                f"'{output_file_path}' already exists. Overwrite? (y/N): "
-            ).strip().lower()
-            if overwrite != 'y':
+            overwrite = (
+                input(f"'{output_file_path}' already exists. Overwrite? (y/N): ")
+                .strip()
+                .lower()
+            )
+            if overwrite != "y":
                 print("Generation cancelled.")
                 return
 
@@ -160,11 +160,9 @@ def generate_presets(output_path="CMakeUserPresets.json",
             json.dump(presets, f, indent=4)
         print(f"Successfully generated '{output_file_path}'")
         print("\nTo use this preset:")
-        print(
-            f"1. Ensure you are in the vLLM root directory: cd {project_root}")
+        print(f"1. Ensure you are in the vLLM root directory: cd {project_root}")
         print("2. Initialize CMake: cmake --preset release")
-        print("3. Build+install: cmake --build --preset release "
-              "--target install")
+        print("3. Build+install: cmake --build --preset release --target install")
 
     except OSError as e:
         print(f"Error writing file: {e}")
@@ -175,7 +173,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--force-overwrite",
         action="store_true",
-        help="Force overwrite existing CMakeUserPresets.json without prompting"
+        help="Force overwrite existing CMakeUserPresets.json without prompting",
     )
 
     args = parser.parse_args()
diff --git a/tools/install_deepgemm.sh b/tools/install_deepgemm.sh
index 98427f1835ec2341e0a37c002ebc94ee85dcf7f9..4f2cd302c3eff1820461a4e329fab5848ab90b9a 100755
--- a/tools/install_deepgemm.sh
+++ b/tools/install_deepgemm.sh
@@ -6,7 +6,7 @@ set -e
 
 # Default values
 DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
-DEEPGEMM_GIT_REF="ea9c5d9270226c5dd7a577c212e9ea385f6ef048"
+DEEPGEMM_GIT_REF="594953acce41793ae00a1233eb516044d604bcb6"
 
 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
diff --git a/tools/install_gdrcopy.sh b/tools/install_gdrcopy.sh
index 481723320c63b6728f968d6b0ffd49ce454aa3bb..d8a756879978bafb087822a8cbd54684391effa7 100755
--- a/tools/install_gdrcopy.sh
+++ b/tools/install_gdrcopy.sh
@@ -7,18 +7,15 @@ set -euo pipefail
 # Requires: curl, apt-get, root privileges
 if [[ $(id -u) -ne 0 ]]; then
   echo "Must be run as root" >&2
-  
   exit 1
 fi
 if [[ $# -ne 3 ]]; then
   echo "Usage: $0 <GDRCOPY_OS_VERSION> <GDRCOPY_CUDA_VERSION> <uuarch(x64|aarch64)>" >&2
   exit 1
 fi
-
 OS_VER="$1"
 CUDA_VER="$2"
 UUARCH_RAW="$3"
-
 # Normalize/validate arch
 case "${UUARCH_RAW,,}" in
   aarch64|arm64)
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
new file mode 100644
index 0000000000000000000000000000000000000000..a786abba95ad9575408eb3f6c1342ec3b5ab55d7
--- /dev/null
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -0,0 +1,253 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# install_prerequisites.py
+import argparse
+import glob
+import json
+import os
+import subprocess
+import sys
+import urllib.request
+
+# --- Configuration ---
+WHEELS_CACHE_HOME = os.environ.get("WHEELS_CACHE_HOME", "/tmp/wheels_cache")
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+UCX_DIR = os.path.join("/tmp", "ucx_source")
+NIXL_DIR = os.path.join("/tmp", "nixl_source")
+UCX_INSTALL_DIR = os.path.join("/tmp", "ucx_install")
+UCX_REPO_URL = "https://github.com/openucx/ucx.git"
+NIXL_REPO_URL = "https://github.com/ai-dynamo/nixl.git"
+
+
+# --- Helper Functions ---
+def get_latest_nixl_version():
+    """Helper function to get latest release version of NIXL"""
+    try:
+        nixl_release_url = "https://api.github.com/repos/ai-dynamo/nixl/releases/latest"
+        with urllib.request.urlopen(nixl_release_url) as response:
+            data = json.load(response)
+            return data.get("tag_name", "0.7.0")
+    except Exception:
+        return "0.7.0"
+
+
+NIXL_VERSION = os.environ.get("NIXL_VERSION", get_latest_nixl_version())
+
+
+def run_command(command, cwd=".", env=None):
+    """Helper function to run a shell command and check for errors."""
+    print(f"--> Running command: {' '.join(command)} in '{cwd}'", flush=True)
+    subprocess.check_call(command, cwd=cwd, env=env)
+
+
+def is_pip_package_installed(package_name):
+    """Checks if a package is installed via pip without raising an exception."""
+    result = subprocess.run(
+        [sys.executable, "-m", "pip", "show", package_name],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    return result.returncode == 0
+
+
+def find_nixl_wheel_in_cache(cache_dir):
+    """Finds a nixl wheel file in the specified cache directory."""
+    # The repaired wheel will have a 'manylinux' tag, but this glob still works.
+    search_pattern = os.path.join(cache_dir, f"nixl*{NIXL_VERSION}*.whl")
+    wheels = glob.glob(search_pattern)
+    if wheels:
+        # Sort to get the most recent/highest version if multiple exist
+        wheels.sort()
+        return wheels[-1]
+    return None
+
+
+def install_system_dependencies():
+    """Installs required system packages using apt-get if run as root."""
+    if os.geteuid() != 0:
+        print("\n---", flush=True)
+        print(
+            "WARNING: Not running as root. \
+            Skipping system dependency installation.",
+            flush=True,
+        )
+        print(
+            "Please ensure the listed packages are installed on your system:",
+            flush=True,
+        )
+        print(
+            "  patchelf build-essential git cmake ninja-build \
+            autotools-dev automake meson libtool libtool-bin",
+            flush=True,
+        )
+        print("---\n", flush=True)
+        return
+
+    print("--- Running as root. Installing system dependencies... ---", flush=True)
+    apt_packages = [
+        "patchelf",  # <-- Add patchelf here
+        "build-essential",
+        "git",
+        "cmake",
+        "ninja-build",
+        "autotools-dev",
+        "automake",
+        "meson",
+        "libtool",
+        "libtool-bin",
+    ]
+    run_command(["apt-get", "update"])
+    run_command(["apt-get", "install", "-y"] + apt_packages)
+    print("--- System dependencies installed successfully. ---\n", flush=True)
+
+
+def build_and_install_prerequisites(args):
+    """Builds UCX and NIXL from source, creating a self-contained wheel."""
+
+    if not args.force_reinstall and is_pip_package_installed("nixl"):
+        print("--> NIXL is already installed. Nothing to do.", flush=True)
+        return
+
+    cached_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
+    if not args.force_reinstall and cached_wheel:
+        print(
+            f"\n--> Found self-contained wheel: \
+                {os.path.basename(cached_wheel)}.",
+            flush=True,
+        )
+        print("--> Installing from cache, skipping all source builds.", flush=True)
+        install_command = [sys.executable, "-m", "pip", "install", cached_wheel]
+        run_command(install_command)
+        print("\n--- Installation from cache complete. ---", flush=True)
+        return
+
+    print(
+        "\n--> No installed package or cached wheel found. \
+         Starting full build process...",
+        flush=True,
+    )
+    print("\n--> Installing auditwheel...", flush=True)
+    run_command([sys.executable, "-m", "pip", "install", "auditwheel"])
+    install_system_dependencies()
+    ucx_install_path = os.path.abspath(UCX_INSTALL_DIR)
+    print(f"--> Using wheel cache directory: {WHEELS_CACHE_HOME}", flush=True)
+    os.makedirs(WHEELS_CACHE_HOME, exist_ok=True)
+
+    # -- Step 1: Build UCX from source --
+    print("\n[1/3] Configuring and building UCX from source...", flush=True)
+    if not os.path.exists(UCX_DIR):
+        run_command(["git", "clone", UCX_REPO_URL, UCX_DIR])
+    ucx_source_path = os.path.abspath(UCX_DIR)
+    run_command(["git", "checkout", "v1.19.x"], cwd=ucx_source_path)
+    run_command(["./autogen.sh"], cwd=ucx_source_path)
+    configure_command = [
+        "./configure",
+        f"--prefix={ucx_install_path}",
+        "--enable-shared",
+        "--disable-static",
+        "--disable-doxygen-doc",
+        "--enable-optimizations",
+        "--enable-cma",
+        "--enable-devel-headers",
+        "--with-verbs",
+        "--enable-mt",
+        "--with-ze=no",
+    ]
+    run_command(configure_command, cwd=ucx_source_path)
+    run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
+    run_command(["make", "install"], cwd=ucx_source_path)
+    print("--- UCX build and install complete ---", flush=True)
+
+    # -- Step 2: Build NIXL wheel from source --
+    print("\n[2/3] Building NIXL wheel from source...", flush=True)
+    if not os.path.exists(NIXL_DIR):
+        run_command(["git", "clone", NIXL_REPO_URL, NIXL_DIR])
+    else:
+        run_command(["git", "fetch", "--tags"], cwd=NIXL_DIR)
+    run_command(["git", "checkout", NIXL_VERSION], cwd=NIXL_DIR)
+    print(f"--> Checked out NIXL version: {NIXL_VERSION}", flush=True)
+
+    build_env = os.environ.copy()
+    build_env["PKG_CONFIG_PATH"] = os.path.join(ucx_install_path, "lib", "pkgconfig")
+    ucx_lib_path = os.path.join(ucx_install_path, "lib")
+    ucx_plugin_path = os.path.join(ucx_lib_path, "ucx")
+    existing_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+    build_env["LD_LIBRARY_PATH"] = (
+        f"{ucx_lib_path}:{ucx_plugin_path}:{existing_ld_path}".strip(":")
+    )
+    build_env["LDFLAGS"] = "-Wl,-rpath,$ORIGIN"
+    print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}", flush=True)
+
+    temp_wheel_dir = os.path.join(ROOT_DIR, "temp_wheelhouse")
+    run_command(
+        [
+            sys.executable,
+            "-m",
+            "pip",
+            "wheel",
+            ".",
+            "--no-deps",
+            f"--wheel-dir={temp_wheel_dir}",
+        ],
+        cwd=os.path.abspath(NIXL_DIR),
+        env=build_env,
+    )
+
+    # -- Step 3: Repair the wheel by copying UCX libraries --
+    print("\n[3/3] Repairing NIXL wheel to include UCX libraries...", flush=True)
+    unrepaired_wheel = find_nixl_wheel_in_cache(temp_wheel_dir)
+    if not unrepaired_wheel:
+        raise RuntimeError("Failed to find the NIXL wheel after building it.")
+
+    # We tell auditwheel to ignore the plugin that mesonpy already handled.
+    auditwheel_command = [
+        "auditwheel",
+        "repair",
+        "--exclude",
+        "libplugin_UCX.so",  # <-- Exclude because mesonpy already includes it
+        unrepaired_wheel,
+        f"--wheel-dir={WHEELS_CACHE_HOME}",
+    ]
+    run_command(auditwheel_command, env=build_env)
+
+    # --- CLEANUP ---
+    # No more temporary files to remove, just the temp wheelhouse
+    run_command(["rm", "-rf", temp_wheel_dir])
+    # --- END CLEANUP ---
+
+    newly_built_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
+    if not newly_built_wheel:
+        raise RuntimeError("Failed to find the repaired NIXL wheel.")
+
+    print(
+        f"--> Successfully built self-contained wheel: \
+            {os.path.basename(newly_built_wheel)}. Now installing...",
+        flush=True,
+    )
+    install_command = [
+        sys.executable,
+        "-m",
+        "pip",
+        "install",
+        "--no-deps",  # w/o "no-deps", it will install cuda-torch
+        newly_built_wheel,
+    ]
+    if args.force_reinstall:
+        install_command.insert(-1, "--force-reinstall")
+
+    run_command(install_command)
+    print("--- NIXL installation complete ---", flush=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Build and install UCX and NIXL dependencies."
+    )
+    parser.add_argument(
+        "--force-reinstall",
+        action="store_true",
+        help="Force rebuild and reinstall of UCX and NIXL \
+        even if they are already installed.",
+    )
+    args = parser.parse_args()
+    build_and_install_prerequisites(args)
diff --git a/tools/check_init_lazy_imports.py b/tools/pre_commit/check_init_lazy_imports.py
similarity index 63%
rename from tools/check_init_lazy_imports.py
rename to tools/pre_commit/check_init_lazy_imports.py
index e8e6f07cc33fcba376180bbc8e0e6b43c805c211..ab2ef8b3aa5bad4fb664f2ce304f11f170abde8b 100644
--- a/tools/check_init_lazy_imports.py
+++ b/tools/pre_commit/check_init_lazy_imports.py
@@ -1,28 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Ensure we perform lazy loading in vllm/__init__.py.
-i.e: appears only within the ``if typing.TYPE_CHECKING:`` guard,
+i.e: appears only within the `if typing.TYPE_CHECKING:` guard,
 **except** for a short whitelist.
 """
 
-from __future__ import annotations
-
 import ast
-import pathlib
 import sys
 from collections.abc import Iterable
+from pathlib import Path
 from typing import Final
 
-REPO_ROOT: Final = pathlib.Path(__file__).resolve().parent.parent
-INIT_PATH: Final = REPO_ROOT / "vllm" / "__init__.py"
+INIT_PATH: Final = Path("vllm/__init__.py")
 
 # If you need to add items to whitelist, do it here.
-ALLOWED_IMPORTS: Final[frozenset[str]] = frozenset({
-    "vllm.env_override",
-})
-ALLOWED_FROM_MODULES: Final[frozenset[str]] = frozenset({
-    ".version",
-})
+ALLOWED_IMPORTS: Final[frozenset[str]] = frozenset(
+    {
+        "vllm.env_override",
+    }
+)
+ALLOWED_FROM_MODULES: Final[frozenset[str]] = frozenset(
+    {
+        ".version",
+    }
+)
 
 
 def _is_internal(name: str | None, *, level: int = 0) -> bool:
@@ -34,8 +35,7 @@ def _is_internal(name: str | None, *, level: int = 0) -> bool:
 
 
 def _fail(violations: Iterable[tuple[int, str]]) -> None:
-    print("ERROR: Disallowed eager imports in vllm/__init__.py:\n",
-          file=sys.stderr)
+    print("ERROR: Disallowed eager imports in vllm/__init__.py:\n", file=sys.stderr)
     for lineno, msg in violations:
         print(f"  Line {lineno}: {msg}", file=sys.stderr)
     sys.exit(1)
@@ -48,7 +48,6 @@ def main() -> None:
     violations: list[tuple[int, str]] = []
 
     class Visitor(ast.NodeVisitor):
-
         def __init__(self) -> None:
             super().__init__()
             self._in_type_checking = False
@@ -56,10 +55,10 @@ def main() -> None:
         def visit_If(self, node: ast.If) -> None:
             guard_is_type_checking = False
             test = node.test
-            if isinstance(test, ast.Attribute) and isinstance(
-                    test.value, ast.Name):
-                guard_is_type_checking = (test.value.id == "typing"
-                                          and test.attr == "TYPE_CHECKING")
+            if isinstance(test, ast.Attribute) and isinstance(test.value, ast.Name):
+                guard_is_type_checking = (
+                    test.value.id == "typing" and test.attr == "TYPE_CHECKING"
+                )
             elif isinstance(test, ast.Name):
                 guard_is_type_checking = test.id == "TYPE_CHECKING"
 
@@ -79,24 +78,28 @@ def main() -> None:
                 return
             for alias in node.names:
                 module_name = alias.name
-                if _is_internal(
-                        module_name) and module_name not in ALLOWED_IMPORTS:
-                    violations.append((
-                        node.lineno,
-                        f"import '{module_name}' must be inside typing.TYPE_CHECKING",  # noqa: E501
-                    ))
+                if _is_internal(module_name) and module_name not in ALLOWED_IMPORTS:
+                    violations.append(
+                        (
+                            node.lineno,
+                            f"import '{module_name}' must be inside typing.TYPE_CHECKING",  # noqa: E501
+                        )
+                    )
 
         def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
             if self._in_type_checking:
                 return
             module_as_written = ("." * node.level) + (node.module or "")
-            if _is_internal(
-                    node.module, level=node.level
-            ) and module_as_written not in ALLOWED_FROM_MODULES:
-                violations.append((
-                    node.lineno,
-                    f"from '{module_as_written}' import ... must be inside typing.TYPE_CHECKING",  # noqa: E501
-                ))
+            if (
+                _is_internal(node.module, level=node.level)
+                and module_as_written not in ALLOWED_FROM_MODULES
+            ):
+                violations.append(
+                    (
+                        node.lineno,
+                        f"from '{module_as_written}' import ... must be inside typing.TYPE_CHECKING",  # noqa: E501
+                    )
+                )
 
     Visitor().visit(tree)
 
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py
index acbbc1f181d69311374eb45fefc374fc751ac6a4..b96a6701333def41c59d10f417a3c8d80f228637 100644
--- a/tools/pre_commit/check_pickle_imports.py
+++ b/tools/pre_commit/check_pickle_imports.py
@@ -17,45 +17,48 @@ import regex as re
 #  add to this list if absolutely necessary and after careful security review.
 ALLOWED_FILES = {
     # pickle
-    'vllm/v1/serial_utils.py',
-    'vllm/v1/executor/multiproc_executor.py',
-    'vllm/multimodal/hasher.py',
-    'vllm/transformers_utils/config.py',
-    'vllm/model_executor/models/registry.py',
-    'tests/utils_/test_utils.py',
-    'tests/tokenization/test_cached_tokenizer.py',
-    'vllm/distributed/utils.py',
-    'vllm/distributed/parallel_state.py',
-    'vllm/distributed/device_communicators/all_reduce_utils.py',
-    'vllm/distributed/device_communicators/shm_broadcast.py',
-    'vllm/distributed/device_communicators/shm_object_storage.py',
-    'benchmarks/kernels/graph_machete_bench.py',
-    'benchmarks/kernels/benchmark_lora.py',
-    'benchmarks/kernels/benchmark_machete.py',
-    'benchmarks/fused_kernels/layernorm_rms_benchmarks.py',
-    'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py',
-    'benchmarks/cutlass_benchmarks/sparse_benchmarks.py',
+    "vllm/multimodal/hasher.py",
+    "vllm/transformers_utils/config.py",
+    "vllm/model_executor/models/registry.py",
+    "vllm/compilation/caching.py",
+    "vllm/distributed/utils.py",
+    "vllm/distributed/parallel_state.py",
+    "vllm/distributed/device_communicators/all_reduce_utils.py",
+    "vllm/distributed/device_communicators/shm_broadcast.py",
+    "vllm/distributed/device_communicators/shm_object_storage.py",
+    "vllm/utils/hashing.py",
+    "tests/utils_/test_hashing.py",
+    "tests/tokenization/test_cached_tokenizer.py",
+    "benchmarks/kernels/graph_machete_bench.py",
+    "benchmarks/kernels/benchmark_lora.py",
+    "benchmarks/kernels/benchmark_machete.py",
+    "benchmarks/fused_kernels/layernorm_rms_benchmarks.py",
+    "benchmarks/cutlass_benchmarks/w8a8_benchmarks.py",
+    "benchmarks/cutlass_benchmarks/sparse_benchmarks.py",
     # cloudpickle
-    'vllm/worker/worker_base.py',
-    'vllm/executor/mp_distributed_executor.py',
-    'vllm/executor/ray_distributed_executor.py',
-    'vllm/entrypoints/llm.py',
-    'tests/utils.py',
+    "vllm/v1/executor/multiproc_executor.py",
+    "vllm/v1/executor/ray_executor.py",
+    "vllm/entrypoints/llm.py",
+    "tests/utils.py",
     # pickle and cloudpickle
-    'vllm/utils/__init__.py',
+    "vllm/v1/serial_utils.py",
 }
 
-PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)"
-                       r"|from\s+(pickle|cloudpickle)\s+import\b)")
+PICKLE_RE = re.compile(
+    r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)"
+    r"|from\s+(pickle|cloudpickle)\s+import\b)"
+)
 
 
 def scan_file(path: str) -> int:
-    with open(path, encoding='utf-8') as f:
+    with open(path, encoding="utf-8") as f:
         for i, line in enumerate(f, 1):
             if PICKLE_RE.match(line):
-                print(f"{path}:{i}: "
-                      "\033[91merror:\033[0m "  # red color
-                      "Found pickle/cloudpickle import")
+                print(
+                    f"{path}:{i}: "
+                    "\033[91merror:\033[0m "  # red color
+                    "Found pickle/cloudpickle import"
+                )
                 return 1
     return 0
 
@@ -93,13 +96,13 @@ def test_regex():
     for i, (line, should_match) in enumerate(test_cases):
         result = bool(PICKLE_RE.match(line))
         assert result == should_match, (
-            f"Test case {i} failed: '{line}' "
-            f"(expected {should_match}, got {result})")
+            f"Test case {i} failed: '{line}' (expected {should_match}, got {result})"
+        )
     print("All regex tests passed.")
 
 
-if __name__ == '__main__':
-    if '--test-regex' in sys.argv:
+if __name__ == "__main__":
+    if "--test-regex" in sys.argv:
         test_regex()
     else:
         sys.exit(main())
diff --git a/tools/check_spdx_header.py b/tools/pre_commit/check_spdx_header.py
similarity index 97%
rename from tools/check_spdx_header.py
rename to tools/pre_commit/check_spdx_header.py
index ced10ba9097bcec7284da60276619084d220f223..1fcca12519ffac509a4cec18e6841b83928ae27d 100644
--- a/tools/check_spdx_header.py
+++ b/tools/pre_commit/check_spdx_header.py
@@ -7,6 +7,7 @@ from enum import Enum
 
 class SPDXStatus(Enum):
     """SPDX header status enumeration"""
+
     EMPTY = "empty"  # empty __init__.py
     COMPLETE = "complete"
     MISSING_LICENSE = "missing_license"  # Only has copyright line
@@ -16,7 +17,8 @@ class SPDXStatus(Enum):
 
 FULL_SPDX_HEADER = (
     "# SPDX-License-Identifier: Apache-2.0\n"
-    "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
+    "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project"
+)
 
 LICENSE_LINE = "# SPDX-License-Identifier: Apache-2.0"
 COPYRIGHT_LINE = "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project"  # noqa: E501
@@ -123,8 +125,9 @@ def main():
             continue
 
     # Collect all files that need fixing
-    all_files_to_fix = (files_missing_both + files_missing_copyright +
-                        files_missing_license)
+    all_files_to_fix = (
+        files_missing_both + files_missing_copyright + files_missing_license
+    )
     if all_files_to_fix:
         print("The following files are missing the SPDX header:")
         if files_missing_both:
diff --git a/tools/check_triton_import.py b/tools/pre_commit/check_triton_import.py
similarity index 84%
rename from tools/check_triton_import.py
rename to tools/pre_commit/check_triton_import.py
index c01d9d4ab079a40715e1588977ba2c30eee2eb21..1b83074fe0d208093a76562548472e7300f8cb00 100644
--- a/tools/check_triton_import.py
+++ b/tools/pre_commit/check_triton_import.py
@@ -23,8 +23,7 @@ def is_allowed_file(current_file: str) -> bool:
 
 def is_forbidden_import(line: str) -> bool:
     stripped = line.strip()
-    return bool(
-        FORBIDDEN_IMPORT_RE.match(stripped)) and stripped not in ALLOWED_LINES
+    return bool(FORBIDDEN_IMPORT_RE.match(stripped)) and stripped not in ALLOWED_LINES
 
 
 def parse_diff(diff: str) -> list[str]:
@@ -42,24 +41,24 @@ def parse_diff(diff: str) -> list[str]:
         elif line.startswith("@@"):
             match = re.search(r"\+(\d+)", line)
             if match:
-                current_lineno = int(
-                    match.group(1)) - 1  # next "+ line" is here
+                current_lineno = int(match.group(1)) - 1  # next "+ line" is here
         elif line.startswith("+") and not line.startswith("++"):
             current_lineno += 1
             code_line = line[1:]
             if is_forbidden_import(code_line):
                 violations.append(
-                    f"{current_file}:{current_lineno}: {code_line.strip()}")
+                    f"{current_file}:{current_lineno}: {code_line.strip()}"
+                )
     return violations
 
 
 def get_diff(diff_type: str) -> str:
     if diff_type == "staged":
         return subprocess.check_output(
-            ["git", "diff", "--cached", "--unified=0"], text=True)
+            ["git", "diff", "--cached", "--unified=0"], text=True
+        )
     elif diff_type == "unstaged":
-        return subprocess.check_output(["git", "diff", "--unified=0"],
-                                       text=True)
+        return subprocess.check_output(["git", "diff", "--unified=0"], text=True)
     else:
         raise ValueError(f"Unknown diff_type: {diff_type}")
 
@@ -75,8 +74,10 @@ def main():
             print(f"[{diff_type}] Git diff failed: {e}", file=sys.stderr)
 
     if all_violations:
-        print("❌ Forbidden direct `import triton` detected."
-              " ➤ Use `from vllm.triton_utils import triton` instead.\n")
+        print(
+            "❌ Forbidden direct `import triton` detected."
+            " ➤ Use `from vllm.triton_utils import triton` instead.\n"
+        )
         for v in all_violations:
             print(f"❌ {v}")
         return 1
diff --git a/tools/enforce_regex_import.py b/tools/pre_commit/enforce_regex_import.py
similarity index 73%
rename from tools/enforce_regex_import.py
rename to tools/pre_commit/enforce_regex_import.py
index 63ceee5829aba3d93bb080e57072a9569c1594c4..a29952e92264d7085f1deca937c3210269471eca 100644
--- a/tools/enforce_regex_import.py
+++ b/tools/pre_commit/enforce_regex_import.py
@@ -1,30 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import subprocess
 from pathlib import Path
 
 import regex as re
 
-FORBIDDEN_PATTERNS = re.compile(
-    r'^\s*(?:import\s+re(?:$|\s|,)|from\s+re\s+import)')
+FORBIDDEN_PATTERNS = re.compile(r"^\s*(?:import\s+re(?:$|\s|,)|from\s+re\s+import)")
 ALLOWED_PATTERNS = [
-    re.compile(r'^\s*import\s+regex\s+as\s+re\s*$'),
-    re.compile(r'^\s*import\s+regex\s*$'),
+    re.compile(r"^\s*import\s+regex\s+as\s+re\s*$"),
+    re.compile(r"^\s*import\s+regex\s*$"),
 ]
 
 
 def get_staged_python_files() -> list[str]:
     try:
         result = subprocess.run(
-            ['git', 'diff', '--cached', '--name-only', '--diff-filter=AM'],
+            ["git", "diff", "--cached", "--name-only", "--diff-filter=AM"],
             capture_output=True,
             text=True,
-            check=True)
-        files = result.stdout.strip().split(
-            '\n') if result.stdout.strip() else []
-        return [f for f in files if f.endswith('.py')]
+            check=True,
+        )
+        files = result.stdout.strip().split("\n") if result.stdout.strip() else []
+        return [f for f in files if f.endswith(".py")]
     except subprocess.CalledProcessError:
         return []
 
@@ -33,13 +30,14 @@ def is_forbidden_import(line: str) -> bool:
     line = line.strip()
     return bool(
         FORBIDDEN_PATTERNS.match(line)
-        and not any(pattern.match(line) for pattern in ALLOWED_PATTERNS))
+        and not any(pattern.match(line) for pattern in ALLOWED_PATTERNS)
+    )
 
 
 def check_file(filepath: str) -> list[tuple[int, str]]:
     violations = []
     try:
-        with open(filepath, encoding='utf-8') as f:
+        with open(filepath, encoding="utf-8") as f:
             for line_num, line in enumerate(f, 1):
                 if is_forbidden_import(line):
                     violations.append((line_num, line.strip()))
@@ -72,9 +70,7 @@ def main() -> int:
     if total_violations > 0:
         print(f"\n💡 Found {total_violations} violation(s).")
         print("❌ Please replace 'import re' with 'import regex as re'")
-        print(
-            "   Also replace 'from re import ...' with 'from regex import ...'"
-        )  # noqa: E501
+        print("   Also replace 'from re import ...' with 'from regex import ...'")  # noqa: E501
         print("✅ Allowed imports:")
         print("   - import regex as re")
         print("   - import regex")  # noqa: E501
diff --git a/tools/generate_nightly_torch_test.py b/tools/pre_commit/generate_nightly_torch_test.py
similarity index 100%
rename from tools/generate_nightly_torch_test.py
rename to tools/pre_commit/generate_nightly_torch_test.py
diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 039cf6075f6312b2fdf8a4dd60f478482a636797..8d04848f8f780d52577cddd99598af475d20923f 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -20,14 +20,15 @@ Args:
 
 import subprocess
 import sys
-from typing import Optional
 
 import regex as re
 
 FILES = [
     "vllm/*.py",
     "vllm/assets",
+    "vllm/distributed",
     "vllm/entrypoints",
+    "vllm/executor",
     "vllm/inputs",
     "vllm/logging_utils",
     "vllm/multimodal",
@@ -35,23 +36,33 @@ FILES = [
     "vllm/transformers_utils",
     "vllm/triton_utils",
     "vllm/usage",
+    "vllm/v1/core",
+    "vllm/v1/engine",
 ]
 
 # After fixing errors resulting from changing follow_imports
 # from "skip" to "silent", move the following directories to FILES
 SEPARATE_GROUPS = [
     "tests",
+    # v0 related
     "vllm/attention",
     "vllm/compilation",
-    "vllm/distributed",
     "vllm/engine",
-    "vllm/executor",
     "vllm/inputs",
     "vllm/lora",
     "vllm/model_executor",
     "vllm/plugins",
     "vllm/worker",
-    "vllm/v1",
+    # v1 related
+    "vllm/v1/attention",
+    "vllm/v1/executor",
+    "vllm/v1/kv_offload",
+    "vllm/v1/metrics",
+    "vllm/v1/pool",
+    "vllm/v1/sample",
+    "vllm/v1/spec_decode",
+    "vllm/v1/structured_output",
+    "vllm/v1/worker",
 ]
 
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
@@ -94,11 +105,15 @@ def group_files(changed_files: list[str]) -> dict[str, list[str]]:
     return file_groups
 
 
-def mypy(targets: list[str], python_version: Optional[str],
-         follow_imports: Optional[str], file_group: str) -> int:
+def mypy(
+    targets: list[str],
+    python_version: str | None,
+    follow_imports: str | None,
+    file_group: str,
+) -> int:
     """
     Run mypy on the given targets.
-    
+
     Args:
         targets: List of files or directories to check.
         python_version: Python version to use (e.g., "3.10") or None to use
@@ -131,8 +146,9 @@ def main():
     for file_group, changed_files in file_groups.items():
         follow_imports = None if ci and file_group == "" else "skip"
         if changed_files:
-            returncode |= mypy(changed_files, python_version, follow_imports,
-                               file_group)
+            returncode |= mypy(
+                changed_files, python_version, follow_imports, file_group
+            )
     return returncode
 
 
diff --git a/tools/png-lint.sh b/tools/pre_commit/png-lint.sh
similarity index 100%
rename from tools/png-lint.sh
rename to tools/pre_commit/png-lint.sh
diff --git a/tools/shellcheck.sh b/tools/pre_commit/shellcheck.sh
similarity index 100%
rename from tools/shellcheck.sh
rename to tools/pre_commit/shellcheck.sh
diff --git a/tools/update-dockerfile-graph.sh b/tools/pre_commit/update-dockerfile-graph.sh
similarity index 100%
rename from tools/update-dockerfile-graph.sh
rename to tools/pre_commit/update-dockerfile-graph.sh
diff --git a/tools/validate_config.py b/tools/pre_commit/validate_config.py
similarity index 67%
rename from tools/validate_config.py
rename to tools/pre_commit/validate_config.py
index f6439fa9ada5f493cecf609019d3d7b19710825a..fb6f0e6a9285075e45f348d1d150f3803c7ea7f5 100644
--- a/tools/validate_config.py
+++ b/tools/pre_commit/validate_config.py
@@ -8,6 +8,7 @@ and that each field has a docstring.
 import ast
 import inspect
 import sys
+from itertools import pairwise
 
 import regex as re
 
@@ -20,28 +21,17 @@ def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]:
     https://davidism.com/mit-license/
     """
 
-    def pairwise(iterable):
-        """
-        Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise
-
-        Can be removed when Python 3.9 support is dropped.
-        """
-        iterator = iter(iterable)
-        a = next(iterator, None)
-
-        for b in iterator:
-            yield a, b
-            a = b
-
     out = {}
 
     # Consider each pair of nodes.
     for a, b in pairwise(cls_node.body):
         # Must be an assignment then a constant string.
-        if (not isinstance(a, (ast.Assign, ast.AnnAssign))
-                or not isinstance(b, ast.Expr)
-                or not isinstance(b.value, ast.Constant)
-                or not isinstance(b.value.value, str)):
+        if (
+            not isinstance(a, (ast.Assign, ast.AnnAssign))
+            or not isinstance(b, ast.Expr)
+            or not isinstance(b.value, ast.Constant)
+            or not isinstance(b.value.value, str)
+        ):
             continue
 
         doc = inspect.cleandoc(b.value.value)
@@ -61,25 +51,27 @@ def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]:
 
 
 class ConfigValidator(ast.NodeVisitor):
-
-    def __init__(self):
-        ...
+    def __init__(self): ...
 
     def visit_ClassDef(self, node):
         # Validate class with both @config and @dataclass decorators
         decorators = [
-            id for d in node.decorator_list if (isinstance(d, ast.Name) and (
-                (id := d.id) == 'config' or id == 'dataclass')) or
-            (isinstance(d, ast.Call) and (isinstance(d.func, ast.Name) and
-                                          (id := d.func.id) == 'dataclass'))
+            id
+            for d in node.decorator_list
+            if (
+                isinstance(d, ast.Name)
+                and ((id := d.id) == "config" or id == "dataclass")
+            )
+            or (
+                isinstance(d, ast.Call)
+                and (isinstance(d.func, ast.Name) and (id := d.func.id) == "dataclass")
+            )
         ]
 
-        if set(decorators) == {'config', 'dataclass'}:
+        if set(decorators) == {"config", "dataclass"}:
             validate_class(node)
-        elif set(decorators) == {'config'}:
-            fail(
-                f"Class {node.name} with config decorator must be a dataclass.",
-                node)
+        elif set(decorators) == {"config"}:
+            fail(f"Class {node.name} with config decorator must be a dataclass.", node)
 
         self.generic_visit(node)
 
@@ -93,9 +85,11 @@ def validate_class(class_node: ast.ClassDef):
             # Skip ClassVar and InitVar
             # see https://docs.python.org/3/library/dataclasses.html#class-variables
             # and https://docs.python.org/3/library/dataclasses.html#init-only-variables
-            if (isinstance(stmt.annotation, ast.Subscript)
-                    and isinstance(stmt.annotation.value, ast.Name)
-                    and stmt.annotation.value.id in {"ClassVar", "InitVar"}):
+            if (
+                isinstance(stmt.annotation, ast.Subscript)
+                and isinstance(stmt.annotation.value, ast.Name)
+                and stmt.annotation.value.id in {"ClassVar", "InitVar"}
+            ):
                 continue
 
             if isinstance(stmt.target, ast.Name):
@@ -103,22 +97,30 @@ def validate_class(class_node: ast.ClassDef):
                 if stmt.value is None:
                     fail(
                         f"Field '{field_name}' in {class_node.name} must have "
-                        "a default value.", stmt)
+                        "a default value.",
+                        stmt,
+                    )
 
                 if field_name not in attr_docs:
                     fail(
                         f"Field '{field_name}' in {class_node.name} must have "
-                        "a docstring.", stmt)
+                        "a docstring.",
+                        stmt,
+                    )
 
-                if isinstance(stmt.annotation, ast.Subscript) and \
-                   isinstance(stmt.annotation.value, ast.Name) \
-                    and stmt.annotation.value.id == "Union" and \
-                        isinstance(stmt.annotation.slice, ast.Tuple):
+                if (
+                    isinstance(stmt.annotation, ast.Subscript)
+                    and isinstance(stmt.annotation.value, ast.Name)
+                    and stmt.annotation.value.id == "Union"
+                    and isinstance(stmt.annotation.slice, ast.Tuple)
+                ):
                     args = stmt.annotation.slice.elts
                     literal_args = [
-                        arg for arg in args
-                        if isinstance(arg, ast.Subscript) and isinstance(
-                            arg.value, ast.Name) and arg.value.id == "Literal"
+                        arg
+                        for arg in args
+                        if isinstance(arg, ast.Subscript)
+                        and isinstance(arg.value, ast.Name)
+                        and arg.value.id == "Literal"
                     ]
                     if len(literal_args) > 1:
                         fail(
@@ -126,7 +128,9 @@ def validate_class(class_node: ast.ClassDef):
                             "use a single "
                             "Literal type. Please use 'Literal[Literal1, "
                             "Literal2]' instead of 'Union[Literal1, Literal2]'"
-                            ".", stmt)
+                            ".",
+                            stmt,
+                        )
 
 
 def validate_ast(tree: ast.stmt):
diff --git a/tools/profiler/nsys_profile_tools/gputrc2graph.py b/tools/profiler/nsys_profile_tools/gputrc2graph.py
index 42dfede9e98705efc46fbc4c8d74a3ffe1c4bb69..fd237c0b214a43b7d9eefb15f1ef45bfda5cbc54 100755
--- a/tools/profiler/nsys_profile_tools/gputrc2graph.py
+++ b/tools/profiler/nsys_profile_tools/gputrc2graph.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-    This generates gpu kernel analysis output from nsys rep. Will call nsys
-    stats  -r cuda_gpu_kern_trace, get non-overlapped gpu cycles, then generate
-    csv and html output for analysis
+This generates gpu kernel analysis output from nsys rep. Will call nsys
+stats  -r cuda_gpu_kern_trace, get non-overlapped gpu cycles, then generate
+csv and html output for analysis
 """
+
 import argparse
 import logging
 import os
@@ -16,13 +17,13 @@ logger = logging.getLogger(__name__)
 
 # helper data class for annotating kernels
 def load_engine_model():
-    """ returns engine_model built from all json files in the current dir """
+    """returns engine_model built from all json files in the current dir"""
     import glob
     import json
+
     engine_model = {}
 
-    json_files = glob.glob(
-        os.path.join(os.path.dirname(__file__) or ".", "*.json"))
+    json_files = glob.glob(os.path.join(os.path.dirname(__file__) or ".", "*.json"))
     for fname in json_files:
         with open(fname, encoding="utf-8") as f:
             engine_model.update(json.load(f))
@@ -30,54 +31,54 @@ def load_engine_model():
 
 
 class GPUTrace2Graph:
-    """ 
-        Parses output of nsys report, generates csv and bar chart output
+    """
+    Parses output of nsys report, generates csv and bar chart output
     """
 
     def __init__(self):
         import pandas as pd  # avoid importing till needed
+
         self.pd = pd
         self.pd.options.mode.copy_on_write = True
 
     # helper functions for generating trace->summary csvs
     def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
-        logger.info('loading %s', in_file)
+        logger.info("loading %s", in_file)
         df = self.pd.read_csv(
-            in_file,
-            usecols=['Start (ns)', 'Duration (ns)', 'Device', 'Strm', 'Name'])
-        df['End (ns)'] = df['Start (ns)'] + df['Duration (ns)']
+            in_file, usecols=["Start (ns)", "Duration (ns)", "Device", "Strm", "Name"]
+        )
+        df["End (ns)"] = df["Start (ns)"] + df["Duration (ns)"]
         df = self.sum_non_overlapping_intervals(df)
         # get ready to print table with elapsed times per kernel
-        df['Instances'] = 1
-        df_sum = df.groupby('Name', as_index=False).agg({
-            'Elapsed Time (ns)': 'sum',
-            'Duration (ns)': 'sum',
-            'Instances': 'size'
-        })
+        df["Instances"] = 1
+        df_sum = df.groupby("Name", as_index=False).agg(
+            {"Elapsed Time (ns)": "sum", "Duration (ns)": "sum", "Instances": "size"}
+        )
 
         # generate csv
-        df_sum['Total Time (sec)'] = df_sum['Duration (ns)'] / 1e9
-        df_sum['Elapsed Time (sec)'] = df_sum['Elapsed Time (ns)'] / 1e9
-        df_sum = df_sum.sort_values(by='Elapsed Time (sec)', ascending=False)
-        df_sum[['Elapsed Time (sec)', 'Total Time (sec)', 'Instances',
-                'Name']].to_csv(out_file, index=False)
+        df_sum["Total Time (sec)"] = df_sum["Duration (ns)"] / 1e9
+        df_sum["Elapsed Time (sec)"] = df_sum["Elapsed Time (ns)"] / 1e9
+        df_sum = df_sum.sort_values(by="Elapsed Time (sec)", ascending=False)
+        df_sum[["Elapsed Time (sec)", "Total Time (sec)", "Instances", "Name"]].to_csv(
+            out_file, index=False
+        )
 
     def sum_non_overlapping_intervals(self, df):
-        """ 
-            returns new sorted df with Elapsed Time (ns) column using 
-            vectorized operations 
+        """
+        returns new sorted df with Elapsed Time (ns) column using
+        vectorized operations
         """
         logger.info("sorting %s trace records by start time", str(df.shape))
 
         # Sort by start time and reset index
-        df = df.sort_values(by='Start (ns)').reset_index(drop=True)
+        df = df.sort_values(by="Start (ns)").reset_index(drop=True)
 
         # Initialize elapsed time as duration
-        df['Elapsed Time (ns)'] = df['Duration (ns)']
+        df["Elapsed Time (ns)"] = df["Duration (ns)"]
 
         # Get numpy arrays for faster operations
-        starts = df['Start (ns)'].values
-        ends = df['End (ns)'].values
+        starts = df["Start (ns)"].values
+        ends = df["End (ns)"].values
 
         # Keep track of current interval end
         current_end = ends[0]
@@ -85,16 +86,17 @@ class GPUTrace2Graph:
         # Update current_end for overlapping intervals
         for i in range(1, len(df)):
             if i % display_units == 0:
-                print(f'processing trace: {int(i/len(df) * 100)} %', end="\r")
+                print(f"processing trace: {int(i / len(df) * 100)} %", end="\r")
             if starts[i] <= current_end:
                 if ends[i] > current_end:
                     # Partial overlap
-                    df.iloc[i, df.columns.get_loc('Elapsed Time (ns)'
-                                                  )] = ends[i] - current_end
+                    df.iloc[i, df.columns.get_loc("Elapsed Time (ns)")] = (
+                        ends[i] - current_end
+                    )
                     current_end = ends[i]
                 else:
                     # Complete overlap
-                    df.iloc[i, df.columns.get_loc('Elapsed Time (ns)')] = 0
+                    df.iloc[i, df.columns.get_loc("Elapsed Time (ns)")] = 0
             else:
                 # No overlap
                 current_end = ends[i]
@@ -103,147 +105,167 @@ class GPUTrace2Graph:
 
     # functions for generating html files
     def make_html(self, df, output_dir, title):
-        """ make html graph from df """
+        """make html graph from df"""
         import plotly.express as px
+
         if df.empty:
             return
-        output_name = output_dir + '/result'
+        output_name = output_dir + "/result"
         if not title:
-            title = 'Model_Engine'
-        x = 'Model_Engine'
-        y = 'Elapsed Time (sec)'
-        color = 'Category'
+            title = "Model_Engine"
+        x = "Model_Engine"
+        y = "Elapsed Time (sec)"
+        color = "Category"
         """ generate kernel mapping table  """
         # Sort Model_Engine categories by last field after underscore
-        df['Model_Engine'] = self.pd.Categorical(
-            df['Model_Engine'],
-            sorted(df['Model_Engine'].unique(),
-                   key=lambda x: x.split('_')[-1]))
-        df[['Model_Engine', color, 'Instances', 'Name',
-            y]].sort_values(by=color).to_csv(f'{output_name}.csv', index=False)
-        graph = px.histogram(df.round(2),
-                             x=x,
-                             y=y,
-                             title=(f'{y} for {title}'),
-                             color=color,
-                             text_auto=True)
+        df["Model_Engine"] = self.pd.Categorical(
+            df["Model_Engine"],
+            sorted(df["Model_Engine"].unique(), key=lambda x: x.split("_")[-1]),
+        )
+        df[["Model_Engine", color, "Instances", "Name", y]].sort_values(
+            by=color
+        ).to_csv(f"{output_name}.csv", index=False)
+        graph = px.histogram(
+            df.round(2),
+            x=x,
+            y=y,
+            title=(f"{y} for {title}"),
+            color=color,
+            text_auto=True,
+        )
         # wrap x axis labels
         graph.update_xaxes(automargin=True)
-        graph.write_html(f'{output_name}.html')
+        graph.write_html(f"{output_name}.html")
         """
             Generate data table with columns per Model_Engine into result.html
         """
-        pivot_df = df.pivot_table(values='Elapsed Time (sec)',
-                                  index='Category',
-                                  columns='Model_Engine',
-                                  aggfunc='sum',
-                                  observed=False).round(2)
+        pivot_df = df.pivot_table(
+            values="Elapsed Time (sec)",
+            index="Category",
+            columns="Model_Engine",
+            aggfunc="sum",
+            observed=False,
+        ).round(2)
         # Add sum row at bottom
-        pivot_df.loc['total_elapsed_sec'] = pivot_df.sum()
-        pivot_df.fillna('').to_html('temp.html')
-        with (open(f'{output_name}.html', 'a', encoding='utf-8') as
-              outfile, open('temp.html', encoding='utf-8') as infile):
+        pivot_df.loc["total_elapsed_sec"] = pivot_df.sum()
+        pivot_df.fillna("").to_html("temp.html")
+        with (
+            open(f"{output_name}.html", "a", encoding="utf-8") as outfile,
+            open("temp.html", encoding="utf-8") as infile,
+        ):
             outfile.write(infile.read())
-        os.remove('temp.html')
+        os.remove("temp.html")
 
-        print(f'Finished generating: \n'
-              f' {output_name}.html for stack bar chart \n'
-              f' {output_name}.csv for Kernel-Category mapping')
+        print(
+            f"Finished generating: \n"
+            f" {output_name}.html for stack bar chart \n"
+            f" {output_name}.csv for Kernel-Category mapping"
+        )
 
     def anno_gpu_kernname(self, df, mapping):
-        """ add "Category" column """
+        """add "Category" column"""
 
         def anno_gpu_kernname_helper(name):
             for kern_name, val in mapping.items():
                 if re.search(kern_name, name):
                     return val
 
-        df['Category'] = df['Name'].apply(anno_gpu_kernname_helper)
+        df["Category"] = df["Name"].apply(anno_gpu_kernname_helper)
 
     def make_nongpu_row(self, df, nongpu_sec):
-        """ this will append non-gpu time entry at end of df """
+        """this will append non-gpu time entry at end of df"""
         nongpu_row = self.pd.DataFrame([df.iloc[-1]])
-        nongpu_row['Category'] = nongpu_row['Name'] = 'CPU(non-GPU)'
-        nongpu_row['Instances'] = 1
-        nongpu_row['Elapsed Time (sec)'] = nongpu_sec
-        return (nongpu_row)
+        nongpu_row["Category"] = nongpu_row["Name"] = "CPU(non-GPU)"
+        nongpu_row["Instances"] = 1
+        nongpu_row["Elapsed Time (sec)"] = nongpu_sec
+        return nongpu_row
 
     def is_valid_file(self, base_file):
-        """ asserts if base_file is non-existent or is empty """
-        assert os.path.isfile(base_file) and os.path.getsize(base_file) > 0, \
-           f"{base_file} doesn't exist or is empty"
+        """asserts if base_file is non-existent or is empty"""
+        assert os.path.isfile(base_file) and os.path.getsize(base_file) > 0, (
+            f"{base_file} doesn't exist or is empty"
+        )
 
     def should_gen_file(self, new_file, base_file):
-        """ figure out if new file should be generated from base_file """
+        """figure out if new file should be generated from base_file"""
         self.is_valid_file(base_file)
-        if (os.path.exists(new_file)
-                and (os.path.getmtime(new_file) > os.path.getmtime(base_file))
-                and (os.path.getsize(base_file) > 0)):
-            logger.info('reusing %s', new_file)
+        if (
+            os.path.exists(new_file)
+            and (os.path.getmtime(new_file) > os.path.getmtime(base_file))
+            and (os.path.getsize(base_file) > 0)
+        ):
+            logger.info("reusing %s", new_file)
             return False
         else:
-            logger.info('generating %s', new_file)
+            logger.info("generating %s", new_file)
             return True
 
     def gen_sum_file(self, file, nsys_cmd):
-        """ 
-            generates sum file from nsys trace with times per kernel and
-            returns the name of the sum file
+        """
+        generates sum file from nsys trace with times per kernel and
+        returns the name of the sum file
         """
         import subprocess
+
         file_dir = os.path.dirname(file)
         file_name = os.path.basename(file)
 
         if not file_dir:
-            file_dir = '.'
+            file_dir = "."
         # Walk through trace and get the total non-overlapped time
-        nsys_stats_file = f'{file_dir}/{file_name}_cuda_gpu_trace.csv'
-        sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv'
+        nsys_stats_file = f"{file_dir}/{file_name}_cuda_gpu_trace.csv"
+        sum_file = f"{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv"
         if self.should_gen_file(nsys_stats_file, file):
             cmd = [
-                nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o',
-                f'{file_dir}/{file_name}'
+                nsys_cmd,
+                "stats",
+                "-r",
+                "cuda_gpu_trace",
+                file,
+                "-o",
+                f"{file_dir}/{file_name}",
             ]
-            cmd_str = ' '.join(cmd)
-            logger.info('+ %s', cmd_str)
+            cmd_str = " ".join(cmd)
+            logger.info("+ %s", cmd_str)
             # estimate time based on calibrated 240M/min
             file_size_mb = os.path.getsize(file) / 1e6
             logger.info(
-                'nsys stats for %.2f MB file expected to take %.2f min',
-                file_size_mb, file_size_mb / 240)
+                "nsys stats for %.2f MB file expected to take %.2f min",
+                file_size_mb,
+                file_size_mb / 240,
+            )
             try:
                 subprocess.run(cmd, check=True)
             except Exception:
-                logger.error("%s failed; Use --nsys_cmd to specify nsys path",
-                             cmd_str)
+                logger.error("%s failed; Use --nsys_cmd to specify nsys path", cmd_str)
                 exit(1)
-            logger.info('generating non-overalapped sum %s', sum_file)
+            logger.info("generating non-overalapped sum %s", sum_file)
             self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
         self.is_valid_file(sum_file)
-        logger.info('Finished generating %s', sum_file)
+        logger.info("Finished generating %s", sum_file)
         return sum_file
 
     def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
-        """ generates graph and csv file from in_file into out_dir """
+        """generates graph and csv file from in_file into out_dir"""
         # Initialize an empty DataFrame to store combined data
         combined_df = self.pd.DataFrame()
         for idx, (file, engine, model, total_sec) in enumerate(in_file):
             file_dir = os.path.dirname(file)
             file_name = os.path.basename(file)
             if not file_dir:
-                file_dir = '.'
+                file_dir = "."
             sum_file = self.gen_sum_file(file, nsys_cmd)
             # read kernel summary file
             df = self.pd.read_csv(sum_file)
             # annotate kernel to their categories
-            assert engine_model.get(engine), f'engine {engine} unknown'
-            assert engine_model[engine].get(model), f'model {model} unknown'
+            assert engine_model.get(engine), f"engine {engine} unknown"
+            assert engine_model[engine].get(model), f"model {model} unknown"
             # remove nsys-rep from file_name for shorter x-label
-            file_name = file_name.replace('.nsys-rep', '')
-            df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}'
+            file_name = file_name.replace(".nsys-rep", "")
+            df["Model_Engine"] = f"{model}_{engine}_{file_name}_{idx}"
             self.anno_gpu_kernname(df, engine_model[engine][model])
             # patch in non-gpu time
-            gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1)
+            gpu_sec = round(df["Elapsed Time (sec)"].sum(), 1)
             total_sec = round(float(total_sec), 1)
             if total_sec < gpu_sec:
                 logger.warning(
@@ -256,7 +278,7 @@ class GPUTrace2Graph:
             df = self.pd.concat([df, nongpu_row], ignore_index=True)
             combined_df = self.pd.concat([combined_df, df], ignore_index=True)
         if out_dir is None:
-            out_dir = '.'
+            out_dir = "."
         else:
             os.makedirs(out_dir, exist_ok=True)
         # generate html file
@@ -264,50 +286,59 @@ class GPUTrace2Graph:
 
 
 def parse_tuple(s):
-    return tuple(s.split(','))
+    return tuple(s.split(","))
 
 
 def main():
-    logging.basicConfig(format=('%(asctime)s - %(levelname)s - %(message)s'),
-                        level=logging.INFO)
+    logging.basicConfig(
+        format=("%(asctime)s - %(levelname)s - %(message)s"), level=logging.INFO
+    )
     parser = argparse.ArgumentParser(
         description=(
-            'Process nsys rep and generate kernel non-overlapped cycles. \n'
-            'Example:\n'
+            "Process nsys rep and generate kernel non-overlapped cycles. \n"
+            "Example:\n"
             "gputrc2graph.py --in_file d1.nsys-rep,vllm,llama,100 \n"
             "d2.nsys-rep,vllm,gpt-oss,102 "
-            "--out_dir results/ --title \"Model=gpt-oss vLLM chart\""),
-        formatter_class=argparse.RawDescriptionHelpFormatter)
+            '--out_dir results/ --title "Model=gpt-oss vLLM chart"'
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
 
     # load supported engine_model
     engine_model_supported = load_engine_model()
     # Get a string representation of supported engine/model combinations
-    engine_model_supported_str = ', '.join(
+    engine_model_supported_str = ", ".join(
         f"{engine}:[{', '.join(models.keys())}]"
-        for engine, models in engine_model_supported.items())
+        for engine, models in engine_model_supported.items()
+    )
     parser.add_argument(
-        '--in_file',
+        "--in_file",
         type=parse_tuple,
-        nargs='+',
+        nargs="+",
         help=(
-            'list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) '
-            'separated by space. Elapsed_nonprofiled_sec is runtime without '
-            'profiling used to calculate non-gpu time. Specify 0 to use '
-            'elapsed time from nsys-rep but that might inflate non-gpu time. '
-            f'Available engine:[model] are: {engine_model_supported_str} '
-            f'Example: --infile d1.nsys-rep,vllm,llama,100 '
-            'd2.nsys-rep,vllm,gpt-oss,102'),
-        required=True)
-    parser.add_argument('--out_dir', help=('output dir for result.csv/html'))
-    parser.add_argument('--title', help=('title for html chart'))
-    parser.add_argument('--nsys_cmd',
-                        help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'),
-                        default="nsys")
+            "list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) "
+            "separated by space. Elapsed_nonprofiled_sec is runtime without "
+            "profiling used to calculate non-gpu time. Specify 0 to use "
+            "elapsed time from nsys-rep but that might inflate non-gpu time. "
+            f"Available engine:[model] are: {engine_model_supported_str} "
+            f"Example: --infile d1.nsys-rep,vllm,llama,100 "
+            "d2.nsys-rep,vllm,gpt-oss,102"
+        ),
+        required=True,
+    )
+    parser.add_argument("--out_dir", help=("output dir for result.csv/html"))
+    parser.add_argument("--title", help=("title for html chart"))
+    parser.add_argument(
+        "--nsys_cmd",
+        help=("nsys cmd, e.g. /usr/bin/nsys, Default: nsys"),
+        default="nsys",
+    )
     args = parser.parse_args()
     gputrace = GPUTrace2Graph()
-    gputrace.gen_graph(args.in_file, args.out_dir, args.title, args.nsys_cmd,
-                       engine_model_supported)
+    gputrace.gen_graph(
+        args.in_file, args.out_dir, args.title, args.nsys_cmd, engine_model_supported
+    )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 209c3a576aeed77b7babdcf469cf5dd14895c258..d7a24a598593d92951511cbf24816a599f747f57 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -29,48 +29,50 @@ def flatten_entries(entry_cls, profile_dict: dict):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--json-trace",
-                        type=str,
-                        required=True,
-                        help="json trace file output by "
-                        "examples/offline_inference/profiling.py")
-    parser.add_argument("--phase",
-                        type=str,
-                        required=True,
-                        help="The phase to print the table for. This is either"
-                        "prefill or decode_n, where n is the decode step "
-                        "number")
-    parser.add_argument("--table",
-                        type=str,
-                        choices=["summary", "model"],
-                        default="summary",
-                        help="Which table to print, the summary table or the "
-                        "layerwise model table")
+    parser.add_argument(
+        "--json-trace",
+        type=str,
+        required=True,
+        help="json trace file output by examples/offline_inference/profiling.py",
+    )
+    parser.add_argument(
+        "--phase",
+        type=str,
+        required=True,
+        help="The phase to print the table for. This is either"
+        "prefill or decode_n, where n is the decode step "
+        "number",
+    )
+    parser.add_argument(
+        "--table",
+        type=str,
+        choices=["summary", "model"],
+        default="summary",
+        help="Which table to print, the summary table or the layerwise model table",
+    )
 
     args = parser.parse_args()
 
     with open(args.json_trace) as f:
         profile_data = json.load(f)
 
-    assert args.phase in profile_data, \
-       (f"Cannot find phase {args.phase} in profile data. Choose one among"
-        f'{[x for x in profile_data.keys() if "prefill" in x or "decode" in x]}') #noqa
+    assert args.phase in profile_data, (
+        f"Cannot find phase {args.phase} in profile data. Choose one among"
+        f"{[x for x in profile_data if 'prefill' in x or 'decode' in x]}"
+    )  # noqa
 
     if args.table == "summary":
         entries_and_depths = flatten_entries(
-            SummaryStatsEntry, profile_data[args.phase]["summary_stats"])
-        column_widths = dict(name=80,
-                             cuda_time_us=12,
-                             pct_cuda_time=12,
-                             invocations=15)
+            SummaryStatsEntry, profile_data[args.phase]["summary_stats"]
+        )
+        column_widths = dict(name=80, cuda_time_us=12, pct_cuda_time=12, invocations=15)
     elif args.table == "model":
         entries_and_depths = flatten_entries(
-            ModelStatsEntry, profile_data[args.phase]["model_stats"])
-        column_widths = dict(name=60,
-                             cpu_time_us=12,
-                             cuda_time_us=12,
-                             pct_cuda_time=12,
-                             trace=60)
+            ModelStatsEntry, profile_data[args.phase]["model_stats"]
+        )
+        column_widths = dict(
+            name=60, cpu_time_us=12, cuda_time_us=12, pct_cuda_time=12, trace=60
+        )
 
     # indent entry names based on the depth
     entries = []
@@ -78,7 +80,8 @@ if __name__ == "__main__":
         entry.name = indent_string(
             entry.name,
             indent=depth,
-            indent_style=lambda indent: "|" + "-" * indent + " ")
+            indent_style=lambda indent: "|" + "-" * indent + " ",
+        )
         entries.append(entry)
 
     TablePrinter(type(entries[0]), column_widths).print_table(entries)
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index 30d6547073d38df6c3d325aed9318d4e0976deae..ed4bf0beb716b1f47e04d9e46644de977ab752a5 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -7,7 +7,7 @@ import json
 import math
 import os
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any
 
 import matplotlib.pyplot as plt
 import pandas as pd
@@ -18,17 +18,18 @@ import pandas as pd
 def largest_dist_from_leaf(node: dict, depth: int = 0):
     if len(node["children"]) == 0:
         return depth
-    return max([
-        largest_dist_from_leaf(child, depth=depth + 1)
-        for child in node["children"]
-    ])
-
-
-def get_entries_at_depth(depth: int,
-                         entries_and_traces: list[tuple[Any, Any]],
-                         node: dict,
-                         curr_depth: int = 0,
-                         trace=()):
+    return max(
+        [largest_dist_from_leaf(child, depth=depth + 1) for child in node["children"]]
+    )
+
+
+def get_entries_at_depth(
+    depth: int,
+    entries_and_traces: list[tuple[Any, Any]],
+    node: dict,
+    curr_depth: int = 0,
+    trace=(),
+):
     # assert that the query is at kernel or module level
     assert depth == -1 or depth == -2
 
@@ -40,21 +41,18 @@ def get_entries_at_depth(depth: int,
     if largest_dist_from_leaf(node) == (abs(depth) - 1):
         entries_and_traces.append((node["entry"], trace))
 
-    trace = (node["entry"]["name"], ) + trace
+    trace = (node["entry"]["name"],) + trace
     for child in node["children"]:
-        get_entries_at_depth(depth,
-                             entries_and_traces,
-                             child,
-                             curr_depth=curr_depth + 1,
-                             trace=trace)
+        get_entries_at_depth(
+            depth, entries_and_traces, child, curr_depth=curr_depth + 1, trace=trace
+        )
 
 
 def fold_nodes(root: dict, nodes_to_fold: list[str]):
-
     stack: list[dict] = [root]
     while len(stack) != 0:
         node = stack.pop()
-        if node['entry']['name'] in nodes_to_fold:
+        if node["entry"]["name"] in nodes_to_fold:
             node["children"] = []
             continue
         for child in node["children"]:
@@ -76,9 +74,7 @@ def trim_string_back(string: str, width: int) -> str:
 
 def shorten_plot_legend_strings(legend, max_char_len: int):
     for t in legend.get_texts():
-        t.set_text(
-            trim_string_back(abbreviate_known_names(t.get_text()),
-                             max_char_len))
+        t.set_text(trim_string_back(abbreviate_known_names(t.get_text()), max_char_len))
 
 
 def abbreviate_known_names(name: str) -> str:
@@ -108,15 +104,21 @@ def attempt_to_make_names_unique(entries_and_traces):
             names.add(entry["name"])
 
     for name in non_unique_names:
-        entries_and_traces_with_name = [(entry, trace)
-                                        for entry, trace in entries_and_traces
-                                        if entry["name"] == name]
+        entries_and_traces_with_name = [
+            (entry, trace)
+            for entry, trace in entries_and_traces
+            if entry["name"] == name
+        ]
 
-        zipped_traces = list(
-            zip(*[trace for _, trace in entries_and_traces_with_name]))
+        zipped_traces = list(zip(*[trace for _, trace in entries_and_traces_with_name]))
         first_trace_difference = next(
-            (i for i, trace_eles in enumerate(zipped_traces)
-             if not all_the_same(trace_eles)), None)
+            (
+                i
+                for i, trace_eles in enumerate(zipped_traces)
+                if not all_the_same(trace_eles)
+            ),
+            None,
+        )
 
         if first_trace_difference is None:
             # can't create a unique name, leave the names as they
@@ -124,34 +126,32 @@ def attempt_to_make_names_unique(entries_and_traces):
             continue
 
         for entry, trace in entries_and_traces_with_name:
-            entry["name"] = " <- ".join((entry["name"], ) +
-                                        trace[:first_trace_difference + 1])
+            entry["name"] = " <- ".join(
+                (entry["name"],) + trace[: first_trace_difference + 1]
+            )
 
 
 ## Operation grouping utils ####
-'''
+"""
     Group operations in the given dataframe by some high-level ops like,
     - gemms
     - attention
     - rms_norm 
     etc.
-'''
-
+"""
 
-def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
 
+def group_trace_by_operations(trace_df: "pd.DataFrame") -> "pd.DataFrame":
     def is_rms_norm(op_name: str):
         if "rms_norm_kernel" in op_name:
             return True
 
     def is_attention_block(op_name: str):
-        if "flash_fwd" in op_name or \
-            "reshape_and_cache_flash_kernel" in op_name:
+        if "flash_fwd" in op_name or "reshape_and_cache_flash_kernel" in op_name:
             return True
 
     def is_quant(op_name: str):
-        if "scaled_fp8_quant" in op_name or \
-           "scaled_int8_quant" in op_name:
+        if "scaled_fp8_quant" in op_name or "scaled_int8_quant" in op_name:
             return True
 
     # LoRA ops
@@ -168,24 +168,27 @@ def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
         return "bgmv_expand" in op_name
 
     def is_cutlass_gemm_op(op_name: str):
-        return "void cutlass::Kernel" in op_name or \
-           "void cutlass::device_kernel" in op_name
+        return (
+            "void cutlass::Kernel" in op_name
+            or "void cutlass::device_kernel" in op_name
+        )
 
     def is_gemm_op(op_name: str):
         if is_quant(op_name):
             return False
-        return is_cutlass_gemm_op(op_name) or \
-           "xmma_gemm" in op_name  or \
-           "gemv2T_kernel" in op_name or \
-           "splitKreduce" in op_name or \
-           "s16816gemm" in op_name
+        return (
+            is_cutlass_gemm_op(op_name)
+            or "xmma_gemm" in op_name
+            or "gemv2T_kernel" in op_name
+            or "splitKreduce" in op_name
+            or "s16816gemm" in op_name
+        )
 
     def is_elementwise_op(op_name: str):
         return "elementwise_kernel" in op_name
 
     def is_mem_op(op_name: str):
-        return "memcpy" in op_name.lower() or \
-               "memset" in op_name.lower()
+        return "memcpy" in op_name.lower() or "memset" in op_name.lower()
 
     def is_vocab_embedding_op(op_name: str):
         return "vocabparallelembed" in op_name.lower()
@@ -195,17 +198,15 @@ def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
         return "nccl" in op_name.lower()
 
     def is_nccl_all_reduce(op_name: str):
-        return is_nccl_op(op_name) and \
-                ("all_reduce" in op_name.lower() or \
-                "allreduce" in op_name.lower())
+        return is_nccl_op(op_name) and (
+            "all_reduce" in op_name.lower() or "allreduce" in op_name.lower()
+        )
 
     def is_nccl_gather(op_name: str):
-        return is_nccl_op(op_name) and \
-                "gather" in op_name.lower()
+        return is_nccl_op(op_name) and "gather" in op_name.lower()
 
     def is_nccl_broadcast(op_name: str):
-        return is_nccl_op(op_name) and \
-                "broadcast" in op_name.lower()
+        return is_nccl_op(op_name) and "broadcast" in op_name.lower()
 
     # Reduce ops types
     def is_cross_device_reduce_1stage(op_name: str):
@@ -269,114 +270,122 @@ def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
     ops = list(filter(lambda x: x not in nccl_other_ops, ops))
 
     cross_device_reduce_1stage_ops = list(
-        filter(lambda x: is_cross_device_reduce_1stage(x), ops))
+        filter(lambda x: is_cross_device_reduce_1stage(x), ops)
+    )
     ops = list(filter(lambda x: x not in cross_device_reduce_1stage_ops, ops))
 
     cross_device_reduce_2stage_ops = list(
-        filter(lambda x: is_cross_device_reduce_2stage(x), ops))
+        filter(lambda x: is_cross_device_reduce_2stage(x), ops)
+    )
     ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops))
 
-    custom_ar_all_reduce_ops = list(
-        filter(lambda x: is_custom_ar_all_reduce(x), ops))
+    custom_ar_all_reduce_ops = list(filter(lambda x: is_custom_ar_all_reduce(x), ops))
     ops = list(filter(lambda x: x not in custom_ar_all_reduce_ops, ops))
 
     reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops))
     ops = list(filter(lambda x: x not in reduce_kernel_ops, ops))
 
     if len(attention_ops):
-        trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1)
+        trace_df["attention"] = trace_df[attention_ops].agg("sum", axis=1)
     if len(quant_ops):
-        trace_df['quant_ops'] = trace_df[quant_ops].agg("sum", axis=1)
+        trace_df["quant_ops"] = trace_df[quant_ops].agg("sum", axis=1)
 
     if len(sgmv_shrink_ops):
-        trace_df['sgmv_shrink_ops'] = trace_df[sgmv_shrink_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["sgmv_shrink_ops"] = trace_df[sgmv_shrink_ops].agg("sum", axis=1)
     if len(sgmv_expand_ops):
-        trace_df['sgmv_expand_ops'] = trace_df[sgmv_expand_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["sgmv_expand_ops"] = trace_df[sgmv_expand_ops].agg("sum", axis=1)
     if len(bgmv_shrink_ops):
-        trace_df['bgmv_shrink_ops'] = trace_df[bgmv_shrink_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["bgmv_shrink_ops"] = trace_df[bgmv_shrink_ops].agg("sum", axis=1)
     if len(bgmv_expand_ops):
-        trace_df['bgmv_expand_ops'] = trace_df[bgmv_expand_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["bgmv_expand_ops"] = trace_df[bgmv_expand_ops].agg("sum", axis=1)
 
     if len(cutlass_gemm_ops):
-        trace_df['cutlass_gemm_ops'] = trace_df[cutlass_gemm_ops].agg("sum",
-                                                                      axis=1)
+        trace_df["cutlass_gemm_ops"] = trace_df[cutlass_gemm_ops].agg("sum", axis=1)
 
     if len(gemm_ops):
-        trace_df['gemm_ops'] = trace_df[gemm_ops].agg("sum", axis=1)
+        trace_df["gemm_ops"] = trace_df[gemm_ops].agg("sum", axis=1)
     if len(rms_norm_ops):
-        trace_df['rms_norm_ops'] = trace_df[rms_norm_ops].agg("sum", axis=1)
+        trace_df["rms_norm_ops"] = trace_df[rms_norm_ops].agg("sum", axis=1)
     if len(vocab_embed_ops):
-        trace_df['vocab_embed_ops'] = trace_df[vocab_embed_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["vocab_embed_ops"] = trace_df[vocab_embed_ops].agg("sum", axis=1)
     if len(mem_ops):
-        trace_df['mem_ops'] = trace_df[mem_ops].agg("sum", axis=1)
+        trace_df["mem_ops"] = trace_df[mem_ops].agg("sum", axis=1)
     if len(elementwise_ops):
-        trace_df['elementwise_ops'] = trace_df[elementwise_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["elementwise_ops"] = trace_df[elementwise_ops].agg("sum", axis=1)
 
     if len(nccl_all_reduce_ops):
-        trace_df['nccl_all_reduce_ops'] = trace_df[nccl_all_reduce_ops].agg(
-            "sum", axis=1)
+        trace_df["nccl_all_reduce_ops"] = trace_df[nccl_all_reduce_ops].agg(
+            "sum", axis=1
+        )
     if len(nccl_gather_ops):
-        trace_df['nccl_gather_ops'] = trace_df[nccl_gather_ops].agg("sum",
-                                                                    axis=1)
+        trace_df["nccl_gather_ops"] = trace_df[nccl_gather_ops].agg("sum", axis=1)
     if len(nccl_broadcast_ops):
-        trace_df['nccl_broadcast_ops'] = trace_df[nccl_broadcast_ops].agg(
-            "sum", axis=1)
+        trace_df["nccl_broadcast_ops"] = trace_df[nccl_broadcast_ops].agg("sum", axis=1)
     if len(nccl_other_ops):
-        trace_df['nccl_other_ops'] = trace_df[nccl_other_ops].agg("sum",
-                                                                  axis=1)
+        trace_df["nccl_other_ops"] = trace_df[nccl_other_ops].agg("sum", axis=1)
 
     if len(cross_device_reduce_1stage_ops):
-        trace_df['cross_device_reduce_1stage_ops'] = trace_df[
-            cross_device_reduce_1stage_ops].agg("sum", axis=1)
+        trace_df["cross_device_reduce_1stage_ops"] = trace_df[
+            cross_device_reduce_1stage_ops
+        ].agg("sum", axis=1)
     if len(cross_device_reduce_2stage_ops):
-        trace_df['cross_device_reduce_2stage_ops'] = trace_df[
-            cross_device_reduce_2stage_ops].agg("sum", axis=1)
+        trace_df["cross_device_reduce_2stage_ops"] = trace_df[
+            cross_device_reduce_2stage_ops
+        ].agg("sum", axis=1)
     if len(custom_ar_all_reduce_ops):
-        trace_df['custom_ar_all_reduce_ops'] = trace_df[
-            custom_ar_all_reduce_ops].agg("sum", axis=1)
+        trace_df["custom_ar_all_reduce_ops"] = trace_df[custom_ar_all_reduce_ops].agg(
+            "sum", axis=1
+        )
     if len(reduce_kernel_ops):
-        trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum",
-                                                                        axis=1)
-
-    trace_df.drop(attention_ops + quant_ops + sgmv_shrink_ops +
-                  sgmv_expand_ops + bgmv_shrink_ops + bgmv_expand_ops +
-                  cutlass_gemm_ops + gemm_ops + rms_norm_ops +
-                  vocab_embed_ops + mem_ops + elementwise_ops +
-                  nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops +
-                  nccl_other_ops + cross_device_reduce_1stage_ops +
-                  cross_device_reduce_2stage_ops + custom_ar_all_reduce_ops +
-                  reduce_kernel_ops,
-                  axis=1,
-                  inplace=True)
+        trace_df["reduce_kernel_ops"] = trace_df[reduce_kernel_ops].agg("sum", axis=1)
+
+    trace_df.drop(
+        attention_ops
+        + quant_ops
+        + sgmv_shrink_ops
+        + sgmv_expand_ops
+        + bgmv_shrink_ops
+        + bgmv_expand_ops
+        + cutlass_gemm_ops
+        + gemm_ops
+        + rms_norm_ops
+        + vocab_embed_ops
+        + mem_ops
+        + elementwise_ops
+        + nccl_all_reduce_ops
+        + nccl_gather_ops
+        + nccl_broadcast_ops
+        + nccl_other_ops
+        + cross_device_reduce_1stage_ops
+        + cross_device_reduce_2stage_ops
+        + custom_ar_all_reduce_ops
+        + reduce_kernel_ops,
+        axis=1,
+        inplace=True,
+    )
     return trace_df
 
 
 ## Data plotting utils ####
 
 
-def plot_trace_df(traces_df: pd.DataFrame,
-                  plot_metric: str,
-                  plot_title: str,
-                  output: Optional[Path] = None):
-
-    def get_phase_description(traces_df: pd.DataFrame, phase: str) -> str:
+def plot_trace_df(
+    traces_df: "pd.DataFrame",
+    plot_metric: str,
+    plot_title: str,
+    output: Path | None = None,
+):
+    def get_phase_description(traces_df: "pd.DataFrame", phase: str) -> str:
         phase_df = traces_df.query(f'phase == "{phase}"')
-        descs = phase_df['phase_desc'].to_list()
+        descs = phase_df["phase_desc"].to_list()
         assert all([desc == descs[0] for desc in descs])
         return descs[0]
 
-    phases = traces_df['phase'].unique()
+    phases = traces_df["phase"].unique()
     phase_descs = [get_phase_description(traces_df, p) for p in phases]
-    traces_df = traces_df.pivot_table(index="phase",
-                                      columns="name",
-                                      values=plot_metric,
-                                      aggfunc="sum")
+    traces_df = traces_df.pivot_table(
+        index="phase", columns="name", values=plot_metric, aggfunc="sum"
+    )
 
     traces_df = group_trace_by_operations(traces_df)
 
@@ -396,20 +405,19 @@ def plot_trace_df(traces_df: pd.DataFrame,
     # Write the values as text on the bars
     for bar in ax.patches:
         if bar.get_height() != 0:
-            ax.text(bar.get_x() + bar.get_width() / 2,
-                    bar.get_height() / 2 + bar.get_y(),
-                    f"{round(bar.get_height(), 2)}",
-                    ha='center',
-                    color='w',
-                    weight='bold',
-                    size=5)
+            ax.text(
+                bar.get_x() + bar.get_width() / 2,
+                bar.get_height() / 2 + bar.get_y(),
+                f"{round(bar.get_height(), 2)}",
+                ha="center",
+                color="w",
+                weight="bold",
+                size=5,
+            )
 
     # Setup legend
     handles, labels = plt.gca().get_legend_handles_labels()
-    legend = fig.legend(handles,
-                        labels,
-                        loc='center left',
-                        bbox_to_anchor=(1, 1))
+    legend = fig.legend(handles, labels, loc="center left", bbox_to_anchor=(1, 1))
     shorten_plot_legend_strings(legend, 50)
 
     # Setup labels and title
@@ -417,21 +425,20 @@ def plot_trace_df(traces_df: pd.DataFrame,
     ax.set_ylabel(plot_metric)
     plt.suptitle(plot_title)
 
-    plt.savefig(output, bbox_inches='tight')
+    plt.savefig(output, bbox_inches="tight")
     print("Created: ", output)
 
 
 def main(
-        json_trace: Path,
-        output_directory: Path,
-        depth: int,  # Fetch/Plot operations at this depth of the Json tree
-        plot_metric: str,
-        make_names_unique: bool,
-        top_k: int,
-        json_nodes_to_fold: list[str]):
-
-    def prepare_data(profile_json: dict, step_keys: list[str]) -> pd.DataFrame:
-
+    json_trace: Path,
+    output_directory: Path,
+    depth: int,  # Fetch/Plot operations at this depth of the Json tree
+    plot_metric: str,
+    make_names_unique: bool,
+    top_k: int,
+    json_nodes_to_fold: list[str],
+):
+    def prepare_data(profile_json: dict, step_keys: list[str]) -> "pd.DataFrame":
         def get_entries_and_traces(key: str):
             entries_and_traces: list[tuple[Any, Any]] = []
             for root in profile_json[key]["summary_stats"]:
@@ -441,16 +448,14 @@ def main(
                 get_entries_at_depth(depth, entries_and_traces, root)
             return entries_and_traces
 
-        def keep_only_top_entries(df: pd.DataFrame,
-                                  metric: str,
-                                  top_k: int = 9) -> pd.DataFrame:
-            df.loc[df.nsmallest(len(df) - top_k + 1, metric).index,
-                   ["name"]] = "others"
+        def keep_only_top_entries(
+            df: "pd.DataFrame", metric: str, top_k: int = 9
+        ) -> "pd.DataFrame":
+            df.loc[df.nsmallest(len(df) - top_k + 1, metric).index, ["name"]] = "others"
             return df
 
         def get_phase_description(key: str) -> str:
-            num_running_seqs = profile_json[key]['metadata'][
-                'num_running_seqs']
+            num_running_seqs = profile_json[key]["metadata"]["num_running_seqs"]
             if num_running_seqs is not None:
                 return f"{key}-seqs-{num_running_seqs}"
             else:
@@ -466,20 +471,24 @@ def main(
 
         # To pandas dataframe
         trace_dfs = list(
-            map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0),
-                traces))
+            map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0), traces)
+        )
 
         # Respect top_k
         if top_k:
             trace_dfs = list(
                 map(
                     lambda trace_df: keep_only_top_entries(
-                        trace_df, "cuda_time_us", top_k), trace_dfs))
+                        trace_df, "cuda_time_us", top_k
+                    ),
+                    trace_dfs,
+                )
+            )
 
         # Fill in information about the step-keys
         for trace_df, step_key in zip(trace_dfs, step_keys):
-            trace_df['phase'] = step_key
-            trace_df['phase_desc'] = get_phase_description(step_key)
+            trace_df["phase"] = step_key
+            trace_df["phase_desc"] = get_phase_description(step_key)
 
         # Combine all data frames so they can be put in a single plot
         traces_df = pd.concat(trace_dfs)
@@ -492,17 +501,23 @@ def main(
 
     def make_plot_title_suffix(profile_json: dict) -> str:
         context = profile_json["context"]
-        sparsity = context.get('sparsity', None)
-        run_type = \
-            f'Run {context["num_steps"]} steps' if context['num_steps'] else \
-                (f'Complete {context["complete_num_requests_per_step"]} per '
-                 f'step; Run till completion')
-        return (f"{context['engine_args']['model']}\n"
-                f"Batch={context['batch_size']}, "
-                f"PromptLen={context['prompt_len']}, "
-                f"NumGpus={context['engine_args']['tensor_parallel_size']}"
-                f"{', Sparsity ' + sparsity if sparsity else ''}\n"
-                f"Run Type: {run_type}")
+        sparsity = context.get("sparsity", None)
+        run_type = (
+            f"Run {context['num_steps']} steps"
+            if context["num_steps"]
+            else (
+                f"Complete {context['complete_num_requests_per_step']} per "
+                f"step; Run till completion"
+            )
+        )
+        return (
+            f"{context['engine_args']['model']}\n"
+            f"Batch={context['batch_size']}, "
+            f"PromptLen={context['prompt_len']}, "
+            f"NumGpus={context['engine_args']['tensor_parallel_size']}"
+            f"{', Sparsity ' + sparsity if sparsity else ''}\n"
+            f"Run Type: {run_type}"
+        )
 
     profile_json = None
     with open(json_trace) as f:
@@ -511,14 +526,14 @@ def main(
 
     # Get all `llm.generate.step()` profile
     step_traces = list(profile_json.keys())
-    assert (step_traces[0] == 'context')
+    assert step_traces[0] == "context"
     step_traces = step_traces[1:]  # have only prefill and decodes
     prefills = list(filter(lambda x: "prefill" in x, step_traces))
     all_decodes = list(filter(lambda x: "decode" in x, step_traces))
     assert len(prefills) + len(all_decodes) == len(step_traces)
     assert len(prefills) == 1
 
-    decodes = all_decodes[::args.step_plot_interval]
+    decodes = all_decodes[:: args.step_plot_interval]
     if decodes[-1] != all_decodes[-1]:
         # Always have the last decode
         decodes.append(all_decodes[-1])
@@ -528,48 +543,63 @@ def main(
 
     plot_title_suffix = make_plot_title_suffix(profile_json)
 
-    plot_trace_df(prefill_traces, plot_metric, "prefill " + plot_title_suffix,
-                  output_directory / Path("prefill.png"))
-    plot_trace_df(decode_traces, plot_metric, "decodes " + plot_title_suffix,
-                  output_directory / Path("decode_steps.png"))
+    plot_trace_df(
+        prefill_traces,
+        plot_metric,
+        "prefill " + plot_title_suffix,
+        output_directory / Path("prefill.png"),
+    )
+    plot_trace_df(
+        decode_traces,
+        plot_metric,
+        "decodes " + plot_title_suffix,
+        output_directory / Path("decode_steps.png"),
+    )
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--json-trace",
-                        type=str,
-                        required=True,
-                        help="json trace file output by \
-                              examples/offline_inference/profiling.py")
-    parser.add_argument("--output-directory",
-                        type=str,
-                        required=False,
-                        help="Directory to output plots")
-    parser.add_argument("--level",
-                        type=str,
-                        default="module",
-                        choices=["module", "kernel"])
-    parser.add_argument("--top-k",
-                        type=int,
-                        default=12,
-                        help="Only graph the top `top_k` entries by time.")
-    parser.add_argument("--fold-json-node",
-                        nargs='+',
-                        default=['Sampler', 'LogitsProcessor'],
-                        help='Do not plot the children of these nodes. Let, \
+    parser.add_argument(
+        "--json-trace",
+        type=str,
+        required=True,
+        help="json trace file output by \
+                              examples/offline_inference/profiling.py",
+    )
+    parser.add_argument(
+        "--output-directory", type=str, required=False, help="Directory to output plots"
+    )
+    parser.add_argument(
+        "--level", type=str, default="module", choices=["module", "kernel"]
+    )
+    parser.add_argument(
+        "--top-k",
+        type=int,
+        default=12,
+        help="Only graph the top `top_k` entries by time.",
+    )
+    parser.add_argument(
+        "--fold-json-node",
+        nargs="+",
+        default=["Sampler", "LogitsProcessor"],
+        help="Do not plot the children of these nodes. Let, \
                               the node represent the aggregate of all its \
-                              children')
-    parser.add_argument("--plot-metric",
-                        type=str,
-                        default="cuda_time_ms",
-                        help='Metric to plot. some options are cuda_time_ms, \
-                                pct_cuda_time')
+                              children",
+    )
+    parser.add_argument(
+        "--plot-metric",
+        type=str,
+        default="cuda_time_ms",
+        help="Metric to plot. some options are cuda_time_ms, \
+                                pct_cuda_time",
+    )
     parser.add_argument(
         "--step-plot-interval",
         type=int,
         default=4,
-        help="For every `step_plot_interval` steps, plot 1 step")
+        help="For every `step_plot_interval` steps, plot 1 step",
+    )
 
     args = parser.parse_args()
 
@@ -583,11 +613,19 @@ if __name__ == "__main__":
     else:
         raise Exception(f"Unexpected level value ({args.level})")
 
-    output_directory = args.output_directory if args.output_directory else Path(
-        args.json_trace).parent
+    output_directory = (
+        args.output_directory if args.output_directory else Path(args.json_trace).parent
+    )
 
     if not os.path.exists(output_directory):
         os.makedirs(output_directory)
 
-    main(Path(args.json_trace), output_directory, depth, args.plot_metric,
-         make_names_unique, args.top_k, args.fold_json_node)
+    main(
+        Path(args.json_trace),
+        output_directory,
+        depth,
+        args.plot_metric,
+        make_names_unique,
+        args.top_k,
+        args.fold_json_node,
+    )
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 7386cdd9f724583d5d958bb1091e2d1107bce5b6..fe3f352fe153e2dfcdead0196c6a2944568f657e 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -83,9 +83,9 @@ class Target:
         """
         # Allow for modest floating-point errors
         epsilon = 0.000002
-        if (self.weighted_duration > self.Duration() + epsilon):
-            print('{} > {}?'.format(self.weighted_duration, self.Duration()))
-        assert (self.weighted_duration <= self.Duration() + epsilon)
+        if self.weighted_duration > self.Duration() + epsilon:
+            print("{} > {}?".format(self.weighted_duration, self.Duration()))
+        assert self.weighted_duration <= self.Duration() + epsilon
         return self.weighted_duration
 
     def DescribeTargets(self):
@@ -93,10 +93,10 @@ class Target:
         # Some build steps generate dozens of outputs - handle them sanely.
         # The max_length was chosen so that it can fit most of the long
         # single-target names, while minimizing word wrapping.
-        result = ', '.join(self.targets)
+        result = ", ".join(self.targets)
         max_length = 65
         if len(result) > max_length:
-            result = result[:max_length] + '...'
+            result = result[:max_length] + "..."
         return result
 
 
@@ -106,12 +106,13 @@ def ReadTargets(log, show_all):
 
     The result is a list of Target objects."""
     header = log.readline()
-    assert header == '# ninja log v5\n', \
-           'unrecognized ninja log version {!r}'.format(header)
+    assert header == "# ninja log v5\n", "unrecognized ninja log version {!r}".format(
+        header
+    )
     targets_dict = {}
     last_end_seen = 0.0
     for line in log:
-        parts = line.strip().split('\t')
+        parts = line.strip().split("\t")
         if len(parts) != 5:
             # If ninja.exe is rudely halted then the .ninja_log file may be
             # corrupt. Silently continue.
@@ -150,17 +151,17 @@ def ReadTargets(log, show_all):
 def GetExtension(target, extra_patterns):
     """Return the file extension that best represents a target.
 
-  For targets that generate multiple outputs it is important to return a
-  consistent 'canonical' extension. Ultimately the goal is to group build steps
-  by type."""
+    For targets that generate multiple outputs it is important to return a
+    consistent 'canonical' extension. Ultimately the goal is to group build steps
+    by type."""
     for output in target.targets:
         if extra_patterns:
-            for fn_pattern in extra_patterns.split(';'):
-                if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
+            for fn_pattern in extra_patterns.split(";"):
+                if fnmatch.fnmatch(output, "*" + fn_pattern + "*"):
                     return fn_pattern
         # Not a true extension, but a good grouping.
-        if output.endswith('type_mappings'):
-            extension = 'type_mappings'
+        if output.endswith("type_mappings"):
+            extension = "type_mappings"
             break
 
         # Capture two extensions if present. For example: file.javac.jar should
@@ -170,26 +171,26 @@ def GetExtension(target, extra_patterns):
         extension = ext2 + ext1  # Preserve the order in the file name.
 
         if len(extension) == 0:
-            extension = '(no extension found)'
+            extension = "(no extension found)"
 
-        if ext1 in ['.pdb', '.dll', '.exe']:
-            extension = 'PEFile (linking)'
+        if ext1 in [".pdb", ".dll", ".exe"]:
+            extension = "PEFile (linking)"
             # Make sure that .dll and .exe are grouped together and that the
             # .dll.lib files don't cause these to be listed as libraries
             break
-        if ext1 in ['.so', '.TOC']:
-            extension = '.so (linking)'
+        if ext1 in [".so", ".TOC"]:
+            extension = ".so (linking)"
             # Attempt to identify linking, avoid identifying as '.TOC'
             break
         # Make sure .obj files don't get categorized as mojo files
-        if ext1 in ['.obj', '.o']:
+        if ext1 in [".obj", ".o"]:
             break
         # Jars are the canonical output of java targets.
-        if ext1 == '.jar':
+        if ext1 == ".jar":
             break
         # Normalize all mojo related outputs to 'mojo'.
-        if output.count('.mojom') > 0:
-            extension = 'mojo'
+        if output.count(".mojom") > 0:
+            extension = "mojo"
             break
     return extension
 
@@ -214,8 +215,8 @@ def SummarizeEntries(entries, extra_step_types):
         if target.end > latest:
             latest = target.end
         total_cpu_time += target.Duration()
-        task_start_stop_times.append((target.start, 'start', target))
-        task_start_stop_times.append((target.end, 'stop', target))
+        task_start_stop_times.append((target.start, "start", target))
+        task_start_stop_times.append((target.end, "stop", target))
     length = latest - earliest
     weighted_total = 0.0
 
@@ -241,10 +242,10 @@ def SummarizeEntries(entries, extra_step_types):
         if num_running > 0:
             # Update the total weighted time up to this moment.
             last_weighted_time += (time - last_time) / float(num_running)
-        if action_name == 'start':
+        if action_name == "start":
             # Record the total weighted task time when this task starts.
             running_tasks[target] = last_weighted_time
-        if action_name == 'stop':
+        if action_name == "stop":
             # Record the change in the total weighted task time while this task
             # ran.
             weighted_duration = last_weighted_time - running_tasks[target]
@@ -252,13 +253,16 @@ def SummarizeEntries(entries, extra_step_types):
             weighted_total += weighted_duration
             del running_tasks[target]
         last_time = time
-    assert (len(running_tasks) == 0)
+    assert len(running_tasks) == 0
 
     # Warn if the sum of weighted times is off by more than half a second.
     if abs(length - weighted_total) > 500:
-        print('Warning: Possible corrupt ninja log, results may be '
-              'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
-                  length, weighted_total))
+        print(
+            "Warning: Possible corrupt ninja log, results may be "
+            "untrustworthy. Length = {:.3f}, weighted total = {:.3f}".format(
+                length, weighted_total
+            )
+        )
 
     entries_by_ext = defaultdict(list)
     for target in entries:
@@ -266,32 +270,38 @@ def SummarizeEntries(entries, extra_step_types):
         entries_by_ext[extension].append(target)
 
     for key, values in entries_by_ext.items():
-        print('    Longest build steps for {}:'.format(key))
+        print("    Longest build steps for {}:".format(key))
         values.sort(key=lambda x: x.WeightedDuration())
         for target in values[-long_count:]:
             print(
-                '      {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
-                format(target.WeightedDuration(), target.DescribeTargets(),
-                       target.Duration()))
-
-    print('    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
-          'parallelism)'.format(length, total_cpu_time,
-                                total_cpu_time * 1.0 / length))
-    print('    {} build steps completed, average of {:1.2f}/s'.format(
-        len(entries),
-        len(entries) / (length)))
+                "      {:8.1f} weighted s to build {} ({:.1f} s elapsed time)".format(
+                    target.WeightedDuration(),
+                    target.DescribeTargets(),
+                    target.Duration(),
+                )
+            )
+
+    print(
+        "    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x "
+        "parallelism)".format(length, total_cpu_time, total_cpu_time * 1.0 / length)
+    )
+    print(
+        "    {} build steps completed, average of {:1.2f}/s".format(
+            len(entries), len(entries) / (length)
+        )
+    )
 
 
 def main():
-    log_file = '.ninja_log'
+    log_file = ".ninja_log"
     parser = argparse.ArgumentParser()
-    parser.add_argument('-C', dest='build_directory', help='Build directory.')
+    parser.add_argument("-C", dest="build_directory", help="Build directory.")
     parser.add_argument(
-        '-s',
-        '--step-types',
-        help='semicolon separated fnmatch patterns for build-step grouping')
-    parser.add_argument('--log-file',
-                        help="specific ninja log file to analyze.")
+        "-s",
+        "--step-types",
+        help="semicolon separated fnmatch patterns for build-step grouping",
+    )
+    parser.add_argument("--log-file", help="specific ninja log file to analyze.")
     args, _extra_args = parser.parse_known_args()
     if args.build_directory:
         log_file = os.path.join(args.build_directory, log_file)
@@ -300,17 +310,16 @@ def main():
     if args.step_types:
         # Make room for the extra build types.
         global long_ext_count
-        long_ext_count += len(args.step_types.split(';'))
+        long_ext_count += len(args.step_types.split(";"))
 
     try:
         with open(log_file) as log:
             entries = ReadTargets(log, False)
             SummarizeEntries(entries, args.step_types)
     except OSError:
-        print('Log file {!r} not found, no build summary created.'.format(
-            log_file))
+        print("Log file {!r} not found, no build summary created.".format(log_file))
         return errno.ENOENT
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
diff --git a/tools/vllm-tpu/build.sh b/tools/vllm-tpu/build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..45ef8dfcb1db642f6aaf176187aa1756457470bf
--- /dev/null
+++ b/tools/vllm-tpu/build.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -e # Exit immediately if a command exits with a non-zero status.
+# Script to build VLLM wheel for TPU with an optional version override.
+
+SCRIPT_PATH_PARAM="$0"
+TOOLS_DIR=$(cd "$(dirname "$SCRIPT_PATH_PARAM")" && pwd) # Absolute path to the script's directory
+REPO_ROOT=$(cd "$TOOLS_DIR/../../" && pwd) # Absolute path to the repo root
+VLLM_DIR="$REPO_ROOT/" # Path to the vllm sources
+
+CHANGE_FILE_LIST=(
+  "vllm/entrypoints/cli/main.py"
+  "vllm/entrypoints/cli/run_batch.py"
+  "vllm/utils/__init__.py"
+  "vllm/platforms/__init__.py"
+)
+
+# Ensure we are not running from within the vllm directory if SCRIPT_PATH_PARAM is relative like "."
+if [ "$TOOLS_DIR" = "$VLLM_DIR" ]; then
+    echo "Error: This script should not be run from the vllm directory directly if using relative paths."
+    echo "Place it in a subdirectory like 'tools/vllm-tpu' and run it from the repository root or via its full path."
+    exit 1
+fi
+
+# Optional version argument
+if [ -n "$1" ]; then
+    USER_VERSION="$1"
+    export VLLM_VERSION_OVERRIDE="$USER_VERSION"
+    echo "User defined version: $USER_VERSION"
+else
+    echo "No version override supplied. Using default version from source."
+fi
+
+PYPROJECT_FILE="$VLLM_DIR/pyproject.toml"
+
+# Backup and update the project name.
+if ! grep -q "name = \"vllm-tpu\"" "$PYPROJECT_FILE"; then
+    echo "Patching pyproject.toml project name to vllm-tpu..."
+    cp "$PYPROJECT_FILE" "${PYPROJECT_FILE}.bak"
+    sed -i '0,/^name = "vllm"/s//name = "vllm-tpu"/' "$PYPROJECT_FILE"
+
+    echo "Patching ${CHANGE_FILE_LIST[@]} vllm to vllm-tpu..."
+    # patching
+    #   importlib.metadata.version('vllm') -> importlib.metadata.version('vllm-tpu')
+    #   importlib.metadata.version("vllm") -> importlib.metadata.version("vllm-tpu")
+    #   importlib.metadata.metadata('vllm') -> importlib.metadata.metadata('vllm-tpu')
+    #   importlib.metadata.metadata("vllm") -> importlib.metadata.metadata("vllm-tpu")
+    #   version('vllm') -> version('vllm-tpu')
+    #   version("vllm") -> version("vllm-tpu")
+    sed -i \
+        -e "s/importlib.metadata.version(\(['\"]\)vllm\1)/importlib.metadata.version(\1vllm-tpu\1)/" \
+        -e "s/importlib.metadata.metadata(\(['\"]\)vllm\1)/importlib.metadata.metadata(\1vllm-tpu\1)/" \
+        -e "s/version(\(['\"]\)vllm\1)/version(\1vllm-tpu\1)/" \
+        "${CHANGE_FILE_LIST[@]}"
+    PATCHED=true
+else
+    PATCHED=false
+fi
+
+# Navigate to the vllm directory
+cd "$VLLM_DIR"
+
+# Cleanup function to be called on exit or error
+cleanup() {
+    echo "Cleaning up..."
+    if [ "$PATCHED" = true ]; then
+        echo "Restoring original pyproject.toml..."
+        cp "${PYPROJECT_FILE}.bak" "$PYPROJECT_FILE"
+        rm -f "${PYPROJECT_FILE}.bak"
+
+        echo "Restoring vllm code..."
+        sed -i \
+            -e "s/importlib.metadata.version(\(['\"]\)vllm-tpu\1)/importlib.metadata.version(\1vllm\1)/" \
+            -e "s/importlib.metadata.metadata(\(['\"]\)vllm-tpu\1)/importlib.metadata.metadata(\1vllm\1)/" \
+            -e "s/version(\(['\"]\)vllm-tpu\1)/version(\1vllm\1)/" \
+            "${CHANGE_FILE_LIST[@]}"
+    fi
+}
+trap cleanup EXIT HUP INT QUIT PIPE TERM # Register cleanup function to run on script exit and various signals
+
+echo "Updating pyproject.toml completed. Proceeding with build..."
+
+echo "Building wheel for TPU..."
+rm -rf dist/
+mkdir -p dist/
+
+# User confirmed to use 'python -m build' directly
+if ! VLLM_TARGET_DEVICE=tpu python -m build; then
+    echo "Error: Python build command failed. Check if 'python -m build' works and the 'build' module is installed."
+    exit 1
+fi
+
+trap - EXIT HUP INT QUIT PIPE TERM
+cleanup
+
+exit 0 
\ No newline at end of file
diff --git a/use_existing_torch.py b/use_existing_torch.py
index 76480f3e58fee21f33099a9e6bf7984e1ff21c43..fd4caa69ec9c1c2c6aa742a387afda74ece6b981 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -3,7 +3,7 @@
 
 import glob
 
-requires_files = glob.glob('requirements/*.txt')
+requires_files = glob.glob("requirements/*.txt")
 requires_files += ["pyproject.toml"]
 for file in requires_files:
     print(f">>> cleaning {file}")
@@ -11,11 +11,11 @@ for file in requires_files:
         lines = f.readlines()
     if "torch" in "".join(lines).lower():
         print("removed:")
-        with open(file, 'w') as f:
+        with open(file, "w") as f:
             for line in lines:
-                if 'torch' not in line.lower():
+                if "torch" not in line.lower():
                     f.write(line)
                 else:
                     print(line.strip())
     print(f"<<< done cleaning {file}")
-    print()
\ No newline at end of file
+    print()
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 3a5c1b1ce0dafa7f96615601253d3b7e5c2cc506..19b2cdc673c4741ea5a21a0e6c8de09c1c135057 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -21,7 +21,7 @@ MODULE_ATTRS = {
     "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
     "LLMEngine": ".engine.llm_engine:LLMEngine",
     "LLM": ".entrypoints.llm:LLM",
-    "initialize_ray_cluster": ".executor.ray_utils:initialize_ray_cluster",
+    "initialize_ray_cluster": ".v1.executor.ray_utils:initialize_ray_cluster",
     "PromptType": ".inputs:PromptType",
     "TextPrompt": ".inputs:TextPrompt",
     "TokensPrompt": ".inputs:TokensPrompt",
@@ -45,17 +45,23 @@ if typing.TYPE_CHECKING:
     from vllm.engine.async_llm_engine import AsyncLLMEngine
     from vllm.engine.llm_engine import LLMEngine
     from vllm.entrypoints.llm import LLM
-    from vllm.executor.ray_utils import initialize_ray_cluster
     from vllm.inputs import PromptType, TextPrompt, TokensPrompt
     from vllm.model_executor.models import ModelRegistry
-    from vllm.outputs import (ClassificationOutput,
-                              ClassificationRequestOutput, CompletionOutput,
-                              EmbeddingOutput, EmbeddingRequestOutput,
-                              PoolingOutput, PoolingRequestOutput,
-                              RequestOutput, ScoringOutput,
-                              ScoringRequestOutput)
+    from vllm.outputs import (
+        ClassificationOutput,
+        ClassificationRequestOutput,
+        CompletionOutput,
+        EmbeddingOutput,
+        EmbeddingRequestOutput,
+        PoolingOutput,
+        PoolingRequestOutput,
+        RequestOutput,
+        ScoringOutput,
+        ScoringRequestOutput,
+    )
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
+    from vllm.v1.executor.ray_utils import initialize_ray_cluster
 
     from ._bc_linter import bc_linter_include, bc_linter_skip
 else:
@@ -68,8 +74,7 @@ else:
             module = import_module(module_name, __package__)
             return getattr(module, attr_name)
         else:
-            raise AttributeError(
-                f'module {__package__} has no attribute {name}')
+            raise AttributeError(f"module {__package__} has no attribute {name}")
 
 
 __all__ = [
diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e53e4ae6e5296a1bc6f87f52a7d3128776d754ae
--- /dev/null
+++ b/vllm/_aiter_ops.py
@@ -0,0 +1,983 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+from collections.abc import Callable
+
+import torch
+
+import vllm.envs as envs
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
+
+
+def is_aiter_found() -> bool:
+    from importlib.util import find_spec
+
+    return find_spec("aiter") is not None
+
+
+# `find_spec` is not torch.compile compatible.
+# In cases where aiter availability might have
+# been checked in forward passes that are torch compiled.
+# we keep this global outside to not cause torch compile breaks.
+IS_AITER_FOUND = is_aiter_found()
+
+
+def if_aiter_supported(func: Callable) -> Callable:
+    """Decorator that only executes the function if
+    ROCm AITER package is supported on gfx9 archs.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # checks the platform, device arch and aiter library existence.
+
+        if current_platform.is_rocm() and IS_AITER_FOUND:
+            from vllm.platforms.rocm import on_gfx9
+
+            if on_gfx9():
+                return func(*args, **kwargs)
+
+        return None
+
+    return wrapper
+
+
+def _rocm_aiter_group_fp8_quant_impl(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size"
+    from aiter import QuantType, dtypes, get_hip_quant
+
+    aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128)
+    return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8)
+
+
+def _rocm_aiter_group_fp8_quant_fake(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter import dtypes
+
+    M, N = x.shape
+    x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device)
+    out_bs = torch.empty(
+        (
+            M,
+            (N + group_size - 1) // group_size,
+        ),
+        dtype=torch.float32,
+        device=x.device,
+    )
+    return x_fp8, out_bs
+
+
+def _rocm_aiter_fused_moe_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+    quant_method: int = 0,
+    doweight_stage1: bool = False,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+) -> torch.Tensor:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+
+    activation = ActivationType(activation_method)
+    quant_type = QuantType(quant_method)
+
+    return fused_moe(
+        hidden_states,
+        w1,
+        w2,
+        topk_weight,
+        topk_ids,
+        expert_mask,
+        activation,
+        quant_type,
+        doweight_stage1,
+        w1_scale,
+        w2_scale,
+        a1_scale,
+        a2_scale,
+    )
+
+
+def _rocm_aiter_fused_moe_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+    quant_method: int = 0,
+    doweight_stage1: bool = False,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+def _rocm_aiter_asm_moe_tkw1_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: torch.Tensor | None = None,
+    fc2_scale: torch.Tensor | None = None,
+    fc1_smooth_scale: torch.Tensor | None = None,
+    fc2_smooth_scale: torch.Tensor | None = None,
+    a16: bool = False,
+    per_tensor_quant_scale: torch.Tensor | None = None,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+) -> torch.Tensor:
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
+
+    activation = ActivationType(activation_method)
+
+    return asm_moe_tkw1(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        fc1_scale=fc1_scale,
+        fc2_scale=fc2_scale,
+        fc1_smooth_scale=fc1_smooth_scale,
+        fc2_smooth_scale=fc2_smooth_scale,
+        a16=a16,
+        per_tensor_quant_scale=per_tensor_quant_scale,
+        expert_mask=expert_mask,
+        activation=activation,
+    )
+
+
+def _rocm_aiter_asm_moe_tkw1_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: torch.Tensor | None = None,
+    fc2_scale: torch.Tensor | None = None,
+    fc1_smooth_scale: torch.Tensor | None = None,
+    fc2_smooth_scale: torch.Tensor | None = None,
+    a16: bool = False,
+    per_tensor_quant_scale: torch.Tensor | None = None,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+def _rocm_aiter_topk_softmax_impl(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool,
+) -> None:
+    from aiter import topk_softmax
+
+    topk_softmax(
+        topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
+    )
+
+
+def _rocm_aiter_topk_softmax_fake(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool,
+) -> None:
+    pass
+
+
+def _rocm_aiter_biased_grouped_topk_impl(
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    from aiter import biased_grouped_topk
+
+    biased_grouped_topk(
+        gating_output,
+        correction_bias,
+        topk_weights,
+        topk_ids,
+        num_expert_group,
+        topk_group,
+        need_renorm,
+        routed_scaling_factor,
+    )
+
+
+def _rocm_aiter_biased_grouped_topk_fake(
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    pass
+
+
+def _rocm_aiter_grouped_topk_impl(
+    gating_output: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    scoring_func: str = "softmax",
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    is_softmax = scoring_func == "softmax"
+    from aiter import grouped_topk
+
+    grouped_topk(
+        gating_output,
+        topk_weights,
+        topk_ids,
+        num_expert_group,
+        topk_group,
+        need_renorm,
+        is_softmax,
+        routed_scaling_factor,
+    )
+
+
+def _rocm_aiter_grouped_topk_fake(
+    gating_output: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    scoring_func: str = "softmax",
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    pass
+
+
+def _rocm_aiter_mla_decode_fwd_impl(
+    q: torch.Tensor,
+    kv_buffer: torch.Tensor,
+    o: torch.Tensor,
+    qo_indptr: torch.Tensor,
+    max_seqlen_qo: int,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
+    sm_scale: float = 1.0,
+    logit_cap: float = 0.0,
+) -> None:
+    from aiter.mla import mla_decode_fwd
+
+    mla_decode_fwd(
+        q,
+        kv_buffer.view(-1, 1, 1, q.shape[-1]),
+        o,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        max_seqlen_qo,
+        sm_scale=sm_scale,
+        logit_cap=logit_cap,
+    )
+
+
+def _rocm_aiter_mla_decode_fwd_fake(
+    q: torch.Tensor,
+    kv_buffer: torch.Tensor,
+    o: torch.Tensor,
+    qo_indptr: torch.Tensor,
+    max_seqlen_qo: int,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
+    sm_scale: float = 1.0,
+    logit_cap: float = 0.0,
+) -> None:
+    pass
+
+
+def _rocm_aiter_gemm_a8w8_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    from aiter import gemm_a8w8_CK
+
+    # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
+    # a to be [M, K]
+    # b to be [N, K]
+    # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
+    return gemm_a8w8_CK(A, B, As, Bs, bias, output_dtype)
+
+
+def _rocm_aiter_gemm_a8w8_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    m = A.shape[0]
+    n = B.shape[0]
+    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
+    return Y
+
+
+def _rocm_aiter_gemm_a8w8_blockscale_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    from aiter import gemm_a8w8_blockscale
+
+    return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
+
+
+def _rocm_aiter_gemm_a8w8_blockscale_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    m = A.shape[0]
+    n = B.shape[0]
+    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
+    return Y
+
+
+def _rocm_aiter_rms_norm_impl(
+    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+) -> torch.Tensor:
+    from aiter import rms_norm
+
+    if x.dim() > 2:
+        x_original_shape = x.shape
+        x = x.reshape(-1, x_original_shape[-1])
+        x = rms_norm(x, weight, variance_epsilon)
+        return x.reshape(x_original_shape)
+
+    return rms_norm(x, weight, variance_epsilon)
+
+
+def _rocm_aiter_rms_norm_fake(
+    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter import rmsnorm2d_fwd_with_add
+
+    residual_out = torch.empty_like(residual)
+    output = torch.empty_like(x)
+    rmsnorm2d_fwd_with_add(
+        output,  # output
+        x,  # input
+        residual,  # residual input
+        residual_out,  # residual output
+        weight,
+        variance_epsilon,
+    )
+    return output, residual_out
+
+
+def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty_like(x), torch.empty_like(residual)
+
+
+# Global flag to ensure ops are registered only once
+_OPS_REGISTERED = False
+
+
+class rocm_aiter_ops:
+    _AITER_ENABLED = envs.VLLM_ROCM_USE_AITER
+    _LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR
+    _RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM
+    _FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE
+    _MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA
+    _PG_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
+    _MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA
+    _TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
+    _FP8BMM_ENABLED = envs.VLLM_ROCM_USE_AITER_FP8BMM
+    _FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM
+    _TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE
+    _MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS
+    _TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM
+
+    @classmethod
+    @if_aiter_supported
+    def is_enabled(cls) -> bool:
+        """Verifies device specs and availability of aiter main env variable."""
+        return cls._AITER_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_linear_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._LINEAR_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_linear_fp8_enaled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls.is_linear_enabled() and current_platform.is_fp8_fnuz()
+
+    @classmethod
+    @if_aiter_supported
+    def is_rmsnorm_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._RMSNORM_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_fused_moe_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._FMOE_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_fusion_moe_shared_experts_enabled(cls) -> bool:
+        return cls.is_fused_moe_enabled() and cls._MOE_SHARED_EXPERTS_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_mla_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._MLA_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_mha_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._MHA_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_pa_attn_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._PG_ATTN_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_triton_unified_attn_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._TRITON_UNIFIED_ATTN_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_fp8bmm_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._FP8BMM_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_asm_fp4_gemm_dynamic_quant_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._FP4_GEMM_DYNAMIC_QUANT_ASM
+
+    @classmethod
+    @if_aiter_supported
+    def is_triton_rotary_embed_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._TRITON_ROTARY_EMBED
+
+    @classmethod
+    @if_aiter_supported
+    def is_triton_gemm_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._TRITON_UNQUANT_GEMM
+
+    @staticmethod
+    @if_aiter_supported
+    def register_ops_once() -> None:
+        global _OPS_REGISTERED
+        if not _OPS_REGISTERED:
+            tags = (
+                tuple()
+                if is_torch_equal_or_newer("2.7.0")
+                else (torch.Tag.needs_fixed_stride_order,)
+            )
+
+            # register all the custom ops here
+            direct_register_custom_op(
+                op_name="rocm_aiter_group_fp8_quant",
+                op_func=_rocm_aiter_group_fp8_quant_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_group_fp8_quant_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_asm_moe_tkw1",
+                op_func=_rocm_aiter_asm_moe_tkw1_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_asm_moe_tkw1_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_fused_moe",
+                op_func=_rocm_aiter_fused_moe_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_fused_moe_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_topk_softmax",
+                op_func=_rocm_aiter_topk_softmax_impl,
+                mutates_args=["topk_weights", "topk_indices", "token_expert_indices"],
+                fake_impl=_rocm_aiter_topk_softmax_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_biased_grouped_topk",
+                op_func=_rocm_aiter_biased_grouped_topk_impl,
+                mutates_args=["topk_weights", "topk_ids"],
+                fake_impl=_rocm_aiter_biased_grouped_topk_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_grouped_topk",
+                op_func=_rocm_aiter_grouped_topk_impl,
+                mutates_args=["topk_weights", "topk_ids"],
+                fake_impl=_rocm_aiter_grouped_topk_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_mla_decode_fwd",
+                op_func=_rocm_aiter_mla_decode_fwd_impl,
+                mutates_args=["o"],
+                fake_impl=_rocm_aiter_mla_decode_fwd_fake,
+                tags=tags,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_gemm_a8w8",
+                op_func=_rocm_aiter_gemm_a8w8_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_gemm_a8w8_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_gemm_a8w8_blockscale",
+                op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_gemm_a8w8_blockscale_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_rms_norm",
+                op_func=_rocm_aiter_rms_norm_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_rms_norm_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
+                op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            _OPS_REGISTERED = True
+
+    @staticmethod
+    def rms_norm2d_with_add(
+        x: torch.Tensor,
+        residual: torch.Tensor,
+        weight: torch.Tensor,
+        variance_epsilon: float,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add(
+            x, residual, weight, variance_epsilon
+        )
+
+    @staticmethod
+    def rms_norm(
+        x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
+
+    @staticmethod
+    def gemm_a8w8(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        bias: torch.Tensor | None = None,
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
+
+    @staticmethod
+    def gemm_a8w8_blockscale(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        block_size: list[int],
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_gemm_a8w8_blockscale(
+            A, B, As, Bs, output_dtype
+        )
+
+    @staticmethod
+    def fused_moe(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weight: torch.Tensor,
+        topk_ids: torch.Tensor,
+        expert_mask: torch.Tensor | None = None,
+        activation_method: int = 0,
+        quant_method: int = 0,
+        doweight_stage1: bool = False,
+        w1_scale: torch.Tensor | None = None,
+        w2_scale: torch.Tensor | None = None,
+        a1_scale: torch.Tensor | None = None,
+        a2_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_fused_moe(
+            hidden_states,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            expert_mask,
+            activation_method,
+            quant_method,
+            doweight_stage1,
+            w1_scale,
+            w2_scale,
+            a1_scale,
+            a2_scale,
+        )
+
+    @staticmethod
+    def asm_moe_tkw1(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        fc1_scale: torch.Tensor | None = None,
+        fc2_scale: torch.Tensor | None = None,
+        fc1_smooth_scale: torch.Tensor | None = None,
+        fc2_smooth_scale: torch.Tensor | None = None,
+        a16: bool = False,
+        per_tensor_quant_scale: torch.Tensor | None = None,
+        expert_mask: torch.Tensor | None = None,
+        activation_method: int = 0,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_asm_moe_tkw1(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            fc1_scale,
+            fc2_scale,
+            fc1_smooth_scale,
+            fc2_smooth_scale,
+            a16,
+            per_tensor_quant_scale,
+            expert_mask,
+            activation_method,
+        )
+
+    @staticmethod
+    def topk_softmax(
+        topk_weights: torch.Tensor,
+        topk_indices: torch.Tensor,
+        token_expert_indices: torch.Tensor,
+        gating_output: torch.Tensor,
+        renormalize: bool,
+    ) -> tuple[torch.Tensor, ...]:
+        torch.ops.vllm.rocm_aiter_topk_softmax(
+            topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
+        )
+        return topk_weights, topk_indices
+
+    @staticmethod
+    def biased_grouped_topk(
+        gating_output: torch.Tensor,
+        correction_bias: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        routed_scaling_factor: float = 1.0,
+    ) -> None:
+        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
+            gating_output,
+            correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            need_renorm,
+            routed_scaling_factor,
+        )
+
+    @staticmethod
+    def grouped_topk(
+        gating_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+    ) -> None:
+        torch.ops.vllm.rocm_aiter_grouped_topk(
+            gating_output,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            need_renorm,
+            scoring_func,
+            routed_scaling_factor,
+        )
+
+    @staticmethod
+    def mla_decode_fwd(
+        q: torch.Tensor,
+        kv_buffer: torch.Tensor,
+        o: torch.Tensor,
+        sm_scale: float,
+        qo_indptr: torch.Tensor,
+        max_seqlen_qo: int,
+        kv_indptr: torch.Tensor | None = None,
+        kv_indices: torch.Tensor | None = None,
+        kv_last_page_lens: torch.Tensor | None = None,
+        logit_cap: float = 0.0,
+    ):
+        torch.ops.vllm.rocm_aiter_mla_decode_fwd(
+            q,
+            kv_buffer.view(-1, 1, 1, q.shape[-1]),
+            o,
+            qo_indptr,
+            max_seqlen_qo,
+            kv_indptr,
+            kv_indices,
+            kv_last_page_lens,
+            sm_scale=sm_scale,
+            logit_cap=logit_cap,
+        )
+
+    @staticmethod
+    def triton_fp4_gemm_dynamic_qaunt(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        out_dtype: torch.dtype | None = torch.bfloat16,
+        x_scales: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
+        from aiter.ops.triton.quant import dynamic_mxfp4_quant
+
+        if x_scales is None:
+            x_q, x_s = dynamic_mxfp4_quant(x)
+        else:
+            x_q = x
+            x_s = x_scales
+
+        y = torch.empty(
+            x_q.shape[0], weight.shape[0], device=x_q.device, dtype=out_dtype
+        )
+
+        gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y)
+        return y
+
+    @staticmethod
+    def triton_rotary_embed(
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos_sin_cache: torch.Tensor,
+        head_size: int,
+        rotary_dim: int,
+        is_neox_style: bool,
+    ):
+        from aiter.ops.triton.rope import rope_cached_thd_positions_2c_fwd_inplace
+
+        num_tokens = positions.numel()
+        cos, sin = cos_sin_cache.chunk(2, dim=-1)
+        query_shape = query.shape
+        key_shape = key.shape
+        rotate_style = 0 if is_neox_style else 1
+
+        query = query.view(num_tokens, -1, head_size)
+        key = key.view(num_tokens, -1, head_size)
+        query_ = query[..., :rotary_dim]
+        key_ = key[..., :rotary_dim]
+        positions = positions.view(*query.shape[:1])
+        rope_cached_thd_positions_2c_fwd_inplace(
+            positions,
+            sin,
+            cos,
+            query_,
+            key_,
+            rotate_style,
+            reuse_freqs_front_part=True,
+            is_nope_first=False,
+        )
+        query = query.view(query_shape)
+        key = key.view(key_shape)
+
+    @staticmethod
+    def triton_fp8_bmm(
+        X: torch.Tensor,
+        WQ: torch.Tensor,
+        w_scale: torch.Tensor,
+        group_size: int = 128,
+        bias: torch.Tensor | None = None,
+        dtype: torch.dtype | None = torch.bfloat16,
+        splitK: int | None = None,
+        YQ: torch.Tensor | None = None,
+        transpose_bm: bool | None = False,
+        config: dict | None = None,
+    ) -> torch.Tensor:
+        # ruff: noqa: E501 # isort: skip
+        from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import (
+            batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm,
+        )
+
+        return aiter_triton_fp8_bmm(
+            X,
+            WQ,
+            w_scale,
+            group_size=group_size,
+            bias=bias,
+            dtype=dtype,
+            splitK=splitK,
+            YQ=YQ,
+            transpose_bm=transpose_bm,
+            config=config,
+        )
+
+    @staticmethod
+    def triton_gemm_a8w8_blockscale(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        block_size: list[int],
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
+
+        return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
+
+    @staticmethod
+    def group_fp8_quant(
+        input_2d: torch.Tensor,
+        group_size: int = 128,
+    ) -> tuple[torch.Tensor, ...]:
+        assert group_size == 128, "Group size must be 128"
+        return torch.ops.vllm.rocm_aiter_group_fp8_quant(input_2d, group_size)
+
+    @staticmethod
+    def is_triton_gemm_w8a8_tuned(n: int, k: int) -> bool:
+        return (n, k) in [
+            (1024, 8192),
+            (2112, 7168),
+            (3072, 1536),
+            (32768, 8192),
+            (4096, 7168),
+            (4608, 7168),
+            (512, 7168),
+            (7168, 2048),
+            (7168, 256),
+            (8192, 1024),
+            (8192, 32768),
+        ]
+
+    @staticmethod
+    def shuffle_weight(
+        self, tensor: torch.Tensor, layout: tuple[int, int] = (16, 16)
+    ) -> torch.Tensor:
+        from aiter.ops.shuffle import shuffle_weight
+
+        return shuffle_weight(tensor, layout=layout)
+
+    @staticmethod
+    def shuffle_weights(
+        *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Applies shuffle_weight function from AITER to each
+        input tensor and returns them.
+
+        Rearranges (shuffles) the input tensor/s
+        into a specified block layout for optimized computation.
+
+        Args:
+            *tensors: Variable number of torch.Tensor objects.
+            layout: A pair of integers specifying the block sizes used to divide
+                the tensors during shuffling. Default is (16, 16).
+
+        Returns:
+        A Tuple of shuffled tensors.
+        """
+        from aiter.ops.shuffle import shuffle_weight
+
+        return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
+
+
+rocm_aiter_ops.register_ops_once()
diff --git a/vllm/_bc_linter.py b/vllm/_bc_linter.py
index 52a95dbee18663525fc8375ac0bb8889f0e6057d..2929a8bce85ac156b0124dd90d5482636a769347 100644
--- a/vllm/_bc_linter.py
+++ b/vllm/_bc_linter.py
@@ -1,21 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # vllm/_bc_linter.py
-from __future__ import annotations
-
-from typing import Any, Callable, TypeVar, overload
+from collections.abc import Callable
+from typing import Any, TypeVar, overload
 
 T = TypeVar("T")
 
 
 @overload
-def bc_linter_skip(obj: T) -> T:
-    ...
+def bc_linter_skip(obj: T) -> T: ...
 
 
 @overload
-def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]:
-    ...
+def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]: ...
 
 
 def bc_linter_skip(obj: Any = None, *, reason: str | None = None):
@@ -34,13 +31,11 @@ def bc_linter_skip(obj: Any = None, *, reason: str | None = None):
 
 
 @overload
-def bc_linter_include(obj: T) -> T:
-    ...
+def bc_linter_include(obj: T) -> T: ...
 
 
 @overload
-def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]:
-    ...
+def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]: ...
 
 
 def bc_linter_include(obj: Any = None, *, reason: str | None = None):
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 93956579eb698d5494ff12368491356ad4cbce9f..ba0cc6c54d291206cc168d4a0e969fcaefe29836 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import contextlib
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Literal
 
 import torch
 
@@ -13,16 +12,7 @@ from vllm.scalar_type import ScalarType
 
 logger = init_logger(__name__)
 
-if not current_platform.is_tpu() and not current_platform.is_xpu():
-    try:
-        import vllm._C
-    except ImportError as e:
-        logger.warning("Failed to import from vllm._C with %r", e)
-
-supports_moe_ops = False
-with contextlib.suppress(ImportError):
-    import vllm._moe_C  # noqa: F401
-    supports_moe_ops = True
+current_platform.import_kernels()
 
 if TYPE_CHECKING:
 
@@ -47,7 +37,7 @@ def paged_attention_v1(
     seq_lens: torch.Tensor,
     block_size: int,
     max_seq_len: int,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     kv_cache_dtype: str,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
@@ -58,11 +48,26 @@ def paged_attention_v1(
     blocksparse_head_sliding_step: int = 0,
 ) -> None:
     torch.ops._C.paged_attention_v1(
-        out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
-        seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
-        k_scale, v_scale, tp_rank, blocksparse_local_blocks,
-        blocksparse_vert_stride, blocksparse_block_size,
-        blocksparse_head_sliding_step)
+        out,
+        query,
+        key_cache,
+        value_cache,
+        num_kv_heads,
+        scale,
+        block_tables,
+        seq_lens,
+        block_size,
+        max_seq_len,
+        alibi_slopes,
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+        tp_rank,
+        blocksparse_local_blocks,
+        blocksparse_vert_stride,
+        blocksparse_block_size,
+        blocksparse_head_sliding_step,
+    )
 
 
 def paged_attention_v2(
@@ -79,7 +84,7 @@ def paged_attention_v2(
     seq_lens: torch.Tensor,
     block_size: int,
     max_seq_len: int,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     kv_cache_dtype: str,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
@@ -90,11 +95,29 @@ def paged_attention_v2(
     blocksparse_head_sliding_step: int = 0,
 ) -> None:
     torch.ops._C.paged_attention_v2(
-        out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
-        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
-        alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
-        blocksparse_local_blocks, blocksparse_vert_stride,
-        blocksparse_block_size, blocksparse_head_sliding_step)
+        out,
+        exp_sum,
+        max_logits,
+        tmp_out,
+        query,
+        key_cache,
+        value_cache,
+        num_kv_heads,
+        scale,
+        block_tables,
+        seq_lens,
+        block_size,
+        max_seq_len,
+        alibi_slopes,
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+        tp_rank,
+        blocksparse_local_blocks,
+        blocksparse_vert_stride,
+        blocksparse_block_size,
+        blocksparse_head_sliding_step,
+    )
 
 
 # def paged_attention_rocm(
@@ -109,22 +132,38 @@ def paged_attention_v2(
 #     scale: float,
 #     block_tables: torch.Tensor,
 #     seq_lens: torch.Tensor,
-#     query_start_loc: Optional[torch.Tensor],
+#     query_start_loc: torch.Tensor | None,
 #     block_size: int,
 #     max_seq_len: int,
-#     alibi_slopes: Optional[torch.Tensor],
+#     alibi_slopes: torch.Tensor | None,
 #     kv_cache_dtype: str,
 #     k_scale: torch.Tensor,
 #     v_scale: torch.Tensor,
-#     fp8_out_scale: Optional[torch.Tensor] = None,
+#     fp8_out_scale: torch.Tensor | None = None,
 #     mfma_type: str = "fp8" if envs.VLLM_ROCM_FP8_MFMA_PAGE_ATTN else "f16",
 # ) -> None:
-#     torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
-#                                       key_cache, value_cache, num_kv_heads,
-#                                       scale, block_tables, seq_lens,
-#                                       query_start_loc, block_size, max_seq_len,
-#                                       alibi_slopes, kv_cache_dtype, k_scale,
-#                                       v_scale, fp8_out_scale, mfma_type)
+#     torch.ops._rocm_C.paged_attention(
+#         out,
+#         exp_sum,
+#         max_logits,
+#         tmp_out,
+#         query,
+#         key_cache,
+#         value_cache,
+#         num_kv_heads,
+#         scale,
+#         block_tables,
+#         seq_lens,
+#         query_start_loc,
+#         block_size,
+#         max_seq_len,
+#         alibi_slopes,
+#         kv_cache_dtype,
+#         k_scale,
+#         v_scale,
+#         fp8_out_scale,
+#         mfma_type,
+#     )
 
 
 def mla_decode_kvcache_cpu(
@@ -135,19 +174,23 @@ def mla_decode_kvcache_cpu(
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
 ) -> None:
-    torch.ops._C_cpu.mla_decode_kvcache(out, query, kv_cache, scale,
-                                        block_tables, seq_lens)
+    torch.ops._C_cpu.mla_decode_kvcache(
+        out, query, kv_cache, scale, block_tables, seq_lens
+    )
 
 
 # merge attn states ops
-def merge_attn_states(output: torch.Tensor,
-                      prefix_output: torch.Tensor,
-                      prefix_lse: torch.Tensor,
-                      suffix_output: torch.Tensor,
-                      suffix_lse: torch.Tensor,
-                      output_lse: Optional[torch.Tensor] = None) -> None:
-    torch.ops._C.merge_attn_states(output, output_lse, prefix_output,
-                                   prefix_lse, suffix_output, suffix_lse)
+def merge_attn_states(
+    output: torch.Tensor,
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output_lse: torch.Tensor | None = None,
+) -> None:
+    torch.ops._C.merge_attn_states(
+        output, output_lse, prefix_output, prefix_lse, suffix_output, suffix_lse
+    )
 
 
 def convert_vertical_slash_indexes(
@@ -166,33 +209,43 @@ def convert_vertical_slash_indexes(
     nnz_vertical = vertical_indexes.size(2)
     num_rows = (context_size + block_size_M - 1) // block_size_M
 
-    block_count = torch.zeros(batch_size,
-                              num_heads,
-                              num_rows,
-                              dtype=q_seqlens.dtype,
-                              device=q_seqlens.device)
-    block_offset = torch.zeros(batch_size,
-                               num_heads,
-                               num_rows,
-                               nnz_slash,
-                               dtype=q_seqlens.dtype,
-                               device=q_seqlens.device)
-    column_count = torch.zeros(batch_size,
-                               num_heads,
-                               num_rows,
-                               dtype=q_seqlens.dtype,
-                               device=q_seqlens.device)
-    column_index = torch.zeros(batch_size,
-                               num_heads,
-                               num_rows,
-                               nnz_vertical,
-                               dtype=q_seqlens.dtype,
-                               device=q_seqlens.device)
+    block_count = torch.zeros(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    block_offset = torch.zeros(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_slash,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
+    column_count = torch.zeros(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    column_index = torch.zeros(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_vertical,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
 
     torch.ops._C.convert_vertical_slash_indexes(
-        block_count, block_offset, column_count, column_index, q_seqlens,
-        kv_seqlens, vertical_indexes, slash_indexes, context_size,
-        block_size_M, block_size_N, causal)
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        q_seqlens,
+        kv_seqlens,
+        vertical_indexes,
+        slash_indexes,
+        context_size,
+        block_size_M,
+        block_size_N,
+        causal,
+    )
     return block_count, block_offset, column_count, column_index
 
 
@@ -215,33 +268,45 @@ def convert_vertical_slash_indexes_mergehead(
     nnz_vertical = vertical_indexes.size(2)
     num_rows = (context_size + block_size_M - 1) // block_size_M
 
-    block_count = torch.empty(batch_size,
-                              num_heads,
-                              num_rows,
-                              dtype=q_seqlens.dtype,
-                              device=q_seqlens.device)
-    block_offset = torch.empty(batch_size,
-                               num_heads,
-                               num_rows,
-                               nnz_slash,
-                               dtype=q_seqlens.dtype,
-                               device=q_seqlens.device)
-    column_count = torch.empty(batch_size,
-                               num_heads,
-                               num_rows,
-                               dtype=q_seqlens.dtype,
-                               device=q_seqlens.device)
-    column_index = torch.empty(batch_size,
-                               num_heads,
-                               num_rows,
-                               nnz_vertical,
-                               dtype=q_seqlens.dtype,
-                               device=q_seqlens.device)
+    block_count = torch.empty(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    block_offset = torch.empty(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_slash,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
+    column_count = torch.empty(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    column_index = torch.empty(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_vertical,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
 
     torch.ops._C.convert_vertical_slash_indexes_mergehead(
-        block_count, block_offset, column_count, column_index, q_seqlens,
-        kv_seqlens, vertical_indexes, slash_indexes, vertical_indices_count,
-        slash_indices_count, context_size, block_size_M, block_size_N, causal)
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        q_seqlens,
+        kv_seqlens,
+        vertical_indexes,
+        slash_indexes,
+        vertical_indices_count,
+        slash_indices_count,
+        context_size,
+        block_size_M,
+        block_size_N,
+        causal,
+    )
     return block_count, block_offset, column_count, column_index
 
 
@@ -249,58 +314,93 @@ def convert_vertical_slash_indexes_mergehead(
 def rotary_embedding(
     positions: torch.Tensor,
     query: torch.Tensor,
-    key: Optional[torch.Tensor],
+    key: torch.Tensor | None,
     head_size: int,
     cos_sin_cache: torch.Tensor,
     is_neox: bool,
 ) -> None:
-    torch.ops._C.rotary_embedding(positions, query, key, head_size,
-                                  cos_sin_cache, is_neox)
+    torch.ops._C.rotary_embedding(
+        positions, query, key, head_size, cos_sin_cache, is_neox
+    )
 
 
 # layer norm ops
-def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
-             epsilon: float) -> None:
+def rms_norm(
+    out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float
+) -> None:
     # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input
+    # If removed, also need to remove contiguous in MatcherRMSNorm
     input_contiguous = input.contiguous()
     torch.ops._C.rms_norm(out, input_contiguous, weight, epsilon)
 
 
-def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
-                       weight: torch.Tensor, epsilon: float) -> None:
+def fused_add_rms_norm(
+    input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, epsilon: float
+) -> None:
     torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
 
 
-def poly_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
-              bias: torch.Tensor, epsilon: float) -> None:
-    # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input
-    input_contiguous = input.contiguous()
-    torch.ops._C.poly_norm(out, input_contiguous, weight, bias, epsilon)
+def fused_qk_norm_rope(
+    qkv: torch.Tensor,
+    num_heads_q: int,
+    num_heads_k: int,
+    num_heads_v: int,
+    head_dim: int,
+    eps: float,
+    q_weight: torch.Tensor,
+    k_weight: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+    position_ids: torch.Tensor,
+) -> None:
+    torch.ops._C.fused_qk_norm_rope(
+        qkv,
+        num_heads_q,
+        num_heads_k,
+        num_heads_v,
+        head_dim,
+        eps,
+        q_weight,
+        k_weight,
+        cos_sin_cache,
+        is_neox,
+        position_ids,
+    )
 
 
 def apply_repetition_penalties_torch(
-        logits: torch.Tensor, prompt_mask: torch.Tensor,
-        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
+    logits: torch.Tensor,
+    prompt_mask: torch.Tensor,
+    output_mask: torch.Tensor,
+    repetition_penalties: torch.Tensor,
+) -> None:
     repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(
-        1, logits.size(1))
+        1, logits.size(1)
+    )
     # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
-    penalties = torch.where(prompt_mask | output_mask, repetition_penalties,
-                            1.0)
+    penalties = torch.where(prompt_mask | output_mask, repetition_penalties, 1.0)
     # If logits are positive, divide by penalty, otherwise multiply by penalty.
     scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
     logits *= scaling
 
 
 def apply_repetition_penalties_cuda(
-        logits: torch.Tensor, prompt_mask: torch.Tensor,
-        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
-    torch.ops._C.apply_repetition_penalties_(logits, prompt_mask, output_mask,
-                                             repetition_penalties)
+    logits: torch.Tensor,
+    prompt_mask: torch.Tensor,
+    output_mask: torch.Tensor,
+    repetition_penalties: torch.Tensor,
+) -> None:
+    torch.ops._C.apply_repetition_penalties_(
+        logits, prompt_mask, output_mask, repetition_penalties
+    )
 
 
-def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
-                               output_mask: torch.Tensor,
-                               repetition_penalties: torch.Tensor) -> None:
+def apply_repetition_penalties(
+    logits: torch.Tensor,
+    prompt_mask: torch.Tensor,
+    output_mask: torch.Tensor,
+    repetition_penalties: torch.Tensor,
+) -> None:
     """Apply repetition penalties to logits in-place.
 
     Args:
@@ -310,11 +410,13 @@ def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
         repetition_penalties: The repetition penalties of shape (num_seqs, ).
     """
     if logits.is_cuda and logits.is_contiguous():
-        apply_repetition_penalties_cuda(logits, prompt_mask, output_mask,
-                                        repetition_penalties)
+        apply_repetition_penalties_cuda(
+            logits, prompt_mask, output_mask, repetition_penalties
+        )
     else:
-        apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
-                                         repetition_penalties)
+        apply_repetition_penalties_torch(
+            logits, prompt_mask, output_mask, repetition_penalties
+        )
 
 # trans_w16
 def trans_w16_gemm(dst: torch.Tensor, src: torch.Tensor,
@@ -328,129 +430,182 @@ def rms_norm_dynamic_per_token_quant(
     weight: torch.Tensor,
     epsilon: float,
     quant_dtype: torch.dtype,
-    scale_ub: Optional[torch.Tensor] = None,
-    residual: Optional[torch.Tensor] = None
+    scale_ub: torch.Tensor | None = None,
+    residual: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     output = torch.empty_like(input, dtype=quant_dtype)
-    scales = torch.empty((input.numel() // input.shape[-1], 1),
-                         device=input.device,
-                         dtype=torch.float32)
+    scales = torch.empty(
+        (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32
+    )
 
-    torch.ops._C.rms_norm_dynamic_per_token_quant(output, input, weight,
-                                                  scales, epsilon, scale_ub,
-                                                  residual)
+    torch.ops._C.rms_norm_dynamic_per_token_quant(
+        output, input, weight, scales, epsilon, scale_ub, residual
+    )
     return output, scales
 
 
 # quantization ops
 # awq
-def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
-                   zeros: torch.Tensor, split_k_iters: int, thx: int,
-                   thy: int) -> torch.Tensor:
+def awq_dequantize(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+    split_k_iters: int,
+    thx: int,
+    thy: int,
+) -> torch.Tensor:
     if envs.VLLM_USE_TRITON_AWQ:
         from vllm.model_executor.layers.quantization.awq_triton import (
-            awq_dequantize_triton)
+            awq_dequantize_triton,
+        )
+
         return awq_dequantize_triton(qweight, scales, zeros)
-    return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters,
-                                       thx, thy)
+    return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters, thx, thy)
 
 
-def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
-             scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
+def awq_gemm(
+    input: torch.Tensor,
+    qweight: torch.Tensor,
+    qzeros: torch.Tensor,
+    scales: torch.Tensor,
+    split_k_iters: int,
+) -> torch.Tensor:
     if envs.VLLM_USE_TRITON_AWQ:
-        from vllm.model_executor.layers.quantization.awq_triton import (
-            awq_gemm_triton)
+        from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
+
         return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters)
     return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
 
 
 # gptq
-def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
-              b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
-              b_g_idx: torch.Tensor, use_exllama: bool,
-              bit: int) -> torch.Tensor:
-    return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
-                                  b_g_idx, use_exllama, bit)
+def gptq_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_gptq_qzeros: torch.Tensor,
+    b_gptq_scales: torch.Tensor,
+    b_g_idx: torch.Tensor,
+    use_exllama: bool,
+    use_v2_format: bool,
+    bit: int,
+) -> torch.Tensor:
+    return torch.ops._C.gptq_gemm(
+        a,
+        b_q_weight,
+        b_gptq_qzeros,
+        b_gptq_scales,
+        b_g_idx,
+        use_exllama,
+        use_v2_format,
+        bit,
+    )
 
 
 if hasattr(torch.ops._C, "gptq_gemm"):
 
     @register_fake("_C::gptq_gemm")
-    def _gptq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                        b_gptq_qzeros: torch.Tensor,
-                        b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor,
-                        use_exllama: bool, bit: int) -> torch.Tensor:
-        return torch.empty((a.size(0), b_q_weight.size(1)),
-                           dtype=a.dtype,
-                           device=a.device)
+    def _gptq_gemm_fake(
+        a: torch.Tensor,
+        b_q_weight: torch.Tensor,
+        b_gptq_qzeros: torch.Tensor,
+        b_gptq_scales: torch.Tensor,
+        b_g_idx: torch.Tensor,
+        use_exllama: bool,
+        use_v2_format: bool,
+        bit: int,
+    ) -> torch.Tensor:
+        return torch.empty(
+            (a.size(0), b_q_weight.size(1)), dtype=a.dtype, device=a.device
+        )
 
 
-def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
-                 bit: int) -> None:
+def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None:
     torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
 
 
 # marlin_24
-def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
-                        b_meta: torch.Tensor, b_scales: torch.Tensor,
-                        workspace: torch.Tensor, b_q_type: ScalarType,
-                        size_m: int, size_n: int, size_k: int) -> torch.Tensor:
-    return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
-                                            workspace, b_q_type.id, size_m,
-                                            size_n, size_k)
+def gptq_marlin_24_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_meta: torch.Tensor,
+    b_scales: torch.Tensor,
+    workspace: torch.Tensor,
+    b_q_type: ScalarType,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_24_gemm(
+        a, b_q_weight, b_meta, b_scales, workspace, b_q_type.id, size_m, size_n, size_k
+    )
 
 
 if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
 
     @register_fake("_C::gptq_marlin_24_gemm")
-    def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                                  b_meta: torch.Tensor, b_scales: torch.Tensor,
-                                  workspace: torch.Tensor,
-                                  b_q_type: ScalarType, size_m: torch.SymInt,
-                                  size_n: torch.SymInt,
-                                  size_k: torch.SymInt) -> torch.Tensor:
+    def _gptq_marlin_24_gemm_fake(
+        a: torch.Tensor,
+        b_q_weight: torch.Tensor,
+        b_meta: torch.Tensor,
+        b_scales: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+    ) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
 
     @register_fake("_C::gptq_marlin_gemm")
-    def _gptq_marlin_gemm_fake(a: torch.Tensor,
-                               c: Optional[torch.Tensor],
-                               b_q_weight: torch.Tensor,
-                               b_bias: Optional[torch.Tensor],
-                               b_scales: torch.Tensor,
-                               global_scale: Optional[torch.Tensor],
-                               b_zeros: Optional[torch.Tensor],
-                               g_idx: Optional[torch.Tensor],
-                               perm: Optional[torch.Tensor],
-                               workspace: torch.Tensor,
-                               b_q_type_id: int,
-                               size_m: torch.SymInt,
-                               size_n: torch.SymInt,
-                               size_k: torch.SymInt,
-                               is_k_full: bool = True,
-                               use_atomic_add: bool = False,
-                               use_fp32_reduce: bool = False,
-                               is_zp_float: bool = False) -> torch.Tensor:
+    def _gptq_marlin_gemm_fake(
+        a: torch.Tensor,
+        c: torch.Tensor | None,
+        b_q_weight: torch.Tensor,
+        b_bias: torch.Tensor | None,
+        b_scales: torch.Tensor,
+        global_scale: torch.Tensor | None,
+        b_zeros: torch.Tensor | None,
+        g_idx: torch.Tensor | None,
+        perm: torch.Tensor | None,
+        workspace: torch.Tensor,
+        b_q_type_id: int,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool = True,
+        use_atomic_add: bool = False,
+        use_fp32_reduce: bool = False,
+        is_zp_float: bool = False,
+    ) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
 
     @register_fake("_C::awq_dequantize")
-    def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor,
-                             zeros: torch.Tensor, split_k_iters: torch.SymInt,
-                             thx: int, thy: int) -> torch.Tensor:
+    def _awq_dequantize_fake(
+        qweight: torch.Tensor,
+        scales: torch.Tensor,
+        zeros: torch.Tensor,
+        split_k_iters: torch.SymInt,
+        thx: int,
+        thy: int,
+    ) -> torch.Tensor:
         in_c = qweight.size(0)
         qout_c = qweight.size(1)
         out_c = qout_c * 8
-        return torch.empty((in_c, out_c),
-                           dtype=scales.dtype,
-                           device=scales.device)
+        return torch.empty((in_c, out_c), dtype=scales.dtype, device=scales.device)
 
     @register_fake("_C::awq_gemm")
-    def _awq_gemm_fake(input: torch.Tensor, qweight: torch.Tensor,
-                       qzeros: torch.Tensor, scales: torch.Tensor,
-                       split_k_iters: torch.SymInt) -> torch.Tensor:
+    def _awq_gemm_fake(
+        input: torch.Tensor,
+        qweight: torch.Tensor,
+        qzeros: torch.Tensor,
+        scales: torch.Tensor,
+        split_k_iters: torch.SymInt,
+    ) -> torch.Tensor:
         num_in_feats = input.size(0)
-        return torch.empty((split_k_iters, num_in_feats, qweight.size(1) * 8),
-                           dtype=input.dtype,
-                           device=input.device).sum(0)
+        return torch.empty(
+            (split_k_iters, num_in_feats, qweight.size(1) * 8),
+            dtype=input.dtype,
+            device=input.device,
+        ).sum(0)
 
     @register_fake("_C::machete_mm")
     def machete_mm_fake(
@@ -458,13 +613,13 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
         # b_q Should be the tensor returned by machete_prepack_B
         b_q: torch.Tensor,
         b_type: ScalarType,
-        out_type: Optional[torch.dtype] = None,
-        b_group_scales: Optional[torch.Tensor] = None,
-        b_group_zeros: Optional[torch.Tensor] = None,
-        b_group_size: Optional[int] = None,
-        b_channel_scales: Optional[torch.Tensor] = None,
-        a_token_scales: Optional[torch.Tensor] = None,
-        schedule: Optional[str] = None,
+        out_type: torch.dtype | None = None,
+        b_group_scales: torch.Tensor | None = None,
+        b_group_zeros: torch.Tensor | None = None,
+        b_group_size: int | None = None,
+        b_channel_scales: torch.Tensor | None = None,
+        a_token_scales: torch.Tensor | None = None,
+        schedule: str | None = None,
     ) -> torch.Tensor:
         m = a.size(0)
         n = b_q.size(1)
@@ -472,22 +627,25 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
 
     @register_fake("_C::machete_prepack_B")
     def machete_prepack_B_fake(
-            b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType,
-            group_scales_type: Optional[torch.dtype]) -> torch.Tensor:
-        return torch.empty_like(b_q_weight,
-                                memory_format=torch.contiguous_format)
+        b_q_weight: torch.Tensor,
+        a_type: torch.dtype,
+        b_type: ScalarType,
+        group_scales_type: torch.dtype | None,
+    ) -> torch.Tensor:
+        return torch.empty_like(b_q_weight, memory_format=torch.contiguous_format)
 
     @register_fake("_C::cutlass_w4a8_mm")
     def cutlass_w4a8_mm_fake(
-            a: torch.Tensor,
-            # b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
-            b_q: torch.Tensor,
-            b_group_scales: torch.Tensor,
-            b_group_size: int,
-            b_channel_scales: torch.Tensor,
-            a_token_scales: torch.Tensor,
-            out_type: Optional[torch.dtype] = None,
-            maybe_schedule: Optional[str] = None) -> torch.Tensor:
+        a: torch.Tensor,
+        # b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
+        b_q: torch.Tensor,
+        b_group_scales: torch.Tensor,
+        b_group_size: int,
+        b_channel_scales: torch.Tensor,
+        a_token_scales: torch.Tensor,
+        out_type: torch.dtype | None = None,
+        maybe_schedule: str | None = None,
+    ) -> torch.Tensor:
         m = a.size(0)
         n = b_q.size(1)
         out_dtype = out_type if out_type is not None else torch.bfloat16
@@ -505,15 +663,19 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
 if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
 
     @register_fake("_C::allspark_w8a16_gemm")
-    def _allspark_w8a16_gemm_fake(a: torch.Tensor, b_qweight: torch.Tensor,
-                                  b_scales: torch.Tensor,
-                                  b_qzeros: Optional[torch.Tensor],
-                                  n: torch.SymInt, group_size: torch.SymInt,
-                                  sm_count: torch.SymInt,
-                                  sm_version: torch.SymInt,
-                                  CUBLAS_M_THRESHOLD: torch.SymInt,
-                                  has_zp: bool,
-                                  n32k16_reorder: bool) -> torch.Tensor:
+    def _allspark_w8a16_gemm_fake(
+        a: torch.Tensor,
+        b_qweight: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_qzeros: torch.Tensor | None,
+        n: torch.SymInt,
+        group_size: torch.SymInt,
+        sm_count: torch.SymInt,
+        sm_version: torch.SymInt,
+        CUBLAS_M_THRESHOLD: torch.SymInt,
+        has_zp: bool,
+        n32k16_reorder: bool,
+    ) -> torch.Tensor:
         m = a.size(0)
         return torch.empty((m, n), device=a.device, dtype=a.dtype)
 
@@ -522,11 +684,12 @@ if hasattr(torch.ops._C, "ggml_dequantize"):
 
     @register_fake("_C::ggml_dequantize")
     def _ggml_dequantize_fake(
-            W: torch.Tensor,
-            quant_type: int,
-            m: torch.SymInt,
-            n: torch.SymInt,
-            dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+        W: torch.Tensor,
+        quant_type: int,
+        m: torch.SymInt,
+        n: torch.SymInt,
+        dtype: torch.dtype | None = None,
+    ) -> torch.Tensor:
         return torch.empty((m, n), dtype=torch.float16, device=W.device)
 
     @register_fake("_C::ggml_mul_mat_vec_a8")
@@ -561,9 +724,7 @@ if hasattr(torch.ops._C, "ggml_dequantize"):
         tokens: torch.SymInt,
     ) -> torch.Tensor:
         tokens = X.size(0)
-        return torch.empty((tokens * top_k, row),
-                           dtype=torch.float16,
-                           device=W.device)
+        return torch.empty((tokens * top_k, row), dtype=torch.float16, device=W.device)
 
 
 if hasattr(torch.ops._C, "ggml_moe_a8_vec"):
@@ -579,9 +740,7 @@ if hasattr(torch.ops._C, "ggml_moe_a8_vec"):
         tokens: torch.SymInt,
     ) -> torch.Tensor:
         tokens = X.size(0)
-        return torch.empty((tokens * top_k, row),
-                           dtype=X.dtype,
-                           device=W.device)
+        return torch.empty((tokens * top_k, row), dtype=X.dtype, device=W.device)
 
 
 # cutlass
@@ -598,20 +757,23 @@ def cutlass_blockwise_scaled_grouped_mm(
     problem_sizes: torch.Tensor,
     expert_offsets: torch.Tensor,
 ):
-    torch.ops._C.cutlass_blockwise_scaled_grouped_mm(output, a, b, scales_a,
-                                                     scales_b, problem_sizes,
-                                                     expert_offsets)
+    torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
+        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets
+    )
 
 
-def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
-                          block_scale_a: torch.Tensor,
-                          block_scale_b: torch.Tensor, alpha: torch.Tensor,
-                          out_dtype: torch.dtype) -> torch.Tensor:
+def cutlass_scaled_fp4_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    block_scale_a: torch.Tensor,
+    block_scale_b: torch.Tensor,
+    alpha: torch.Tensor,
+    out_dtype: torch.dtype,
+) -> torch.Tensor:
     assert a.ndim == 2 and b.ndim == 2
     m, n = a.shape[0], b.shape[0]
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
-    torch.ops._C.cutlass_scaled_fp4_mm(out, a, b, block_scale_a, block_scale_b,
-                                       alpha)
+    torch.ops._C.cutlass_scaled_fp4_mm(out, a, b, block_scale_a, block_scale_b, alpha)
     return out
 
 
@@ -620,16 +782,17 @@ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
 
 
 def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
-    return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(
-        cuda_device_capability)
+    return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
 
 
-def cutlass_scaled_mm(a: torch.Tensor,
-                      b: torch.Tensor,
-                      scale_a: torch.Tensor,
-                      scale_b: torch.Tensor,
-                      out_dtype: torch.dtype,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+def cutlass_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: torch.dtype,
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
     """
     `cutlass_scaled_mm` implements a fused version of
         `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
@@ -652,70 +815,69 @@ def cutlass_scaled_mm(a: torch.Tensor,
         scale_a.shape * [1, 128] == a.shape
         scale_b.shape * [128, 128] == b.shape
     """
-    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
-    assert bias is None or bias.numel(
-    ) == b.shape[1] and bias.dtype == out_dtype
+    assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
+    assert bias is None or bias.numel() == b.shape[1] and bias.dtype == out_dtype
 
     # Massage the input to be 2D
     target_shape = (*a.shape[:-1], b.shape[1])
     a = a.view(-1, a.shape[-1])
 
-    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
     if current_platform.is_rocm() or not cutlass_compatible_b:
         from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import (  # noqa
-            triton_scaled_mm)
+            triton_scaled_mm,
+        )
+
         out = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
     else:
-        out = torch.empty((a.shape[0], b.shape[1]),
-                          dtype=out_dtype,
-                          device=a.device)
+        out = torch.empty((a.shape[0], b.shape[1]), dtype=out_dtype, device=a.device)
         torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
 
     return out.view(*target_shape)
 
 
-def cutlass_scaled_mm_azp(a: torch.Tensor,
-                          b: torch.Tensor,
-                          scale_a: torch.Tensor,
-                          scale_b: torch.Tensor,
-                          out_dtype: torch.dtype,
-                          azp_adj: torch.Tensor,
-                          azp: Optional[torch.Tensor] = None,
-                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+def cutlass_scaled_mm_azp(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: torch.dtype,
+    azp_adj: torch.Tensor,
+    azp: torch.Tensor | None = None,
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
     """
     :param azp_adj: In the per-tensor case, this should include the azp.
     Always per-channel.
     :param azp: Only set in the per-token case. Per-token if set.
     """
-    assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
-    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
-    assert bias is None or bias.numel(
-    ) == b.shape[1] and bias.dtype == out_dtype
+    assert b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
+    assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
+    assert bias is None or bias.numel() == b.shape[1] and bias.dtype == out_dtype
 
     # Massage the input to be 2D
     target_shape = (*a.shape[:-1], b.shape[1])
     a = a.view(-1, a.shape[-1])
     assert azp is None or azp.numel() == a.shape[0]
 
-    out = torch.empty((a.shape[0], b.shape[1]),
-                      dtype=out_dtype,
-                      device=a.device)
-    torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj,
-                                       azp, bias)
+    out = torch.empty((a.shape[0], b.shape[1]), dtype=out_dtype, device=a.device)
+    torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj, azp, bias)
     return out.view(*target_shape)
 
 
 def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
-    return torch.ops._C.cutlass_sparse_scaled_mm_supported(
-        cuda_device_capability)
+    return torch.ops._C.cutlass_sparse_scaled_mm_supported(cuda_device_capability)
 
 
 def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool:
-    return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
+    try:
+        return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
+    except AttributeError:
+        # Return False on non-CUDA platforms where it is not available
+        return False
 
 
-def cutlass_sparse_compress(a: torch.Tensor) \
-    -> tuple[torch.Tensor, torch.Tensor]:
+def cutlass_sparse_compress(a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Compresses a sparse matrix for use with Cutlass sparse operations.
 
@@ -746,26 +908,25 @@ def cutlass_sparse_compress(a: torch.Tensor) \
         - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
         - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
     """
-    assert (a.dtype in [
-        torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16
-    ])
-    assert (a.is_contiguous())
+    assert a.dtype in [torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16]
+    assert a.is_contiguous()
 
     # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
     elemsPerMetaElem = 4
-    assert (a.shape[1] % (2 * elemsPerMetaElem) == 0)
+    assert a.shape[1] % (2 * elemsPerMetaElem) == 0
 
     return torch.ops._C.cutlass_sparse_compress(a)
 
 
 def cutlass_scaled_sparse_mm(
-        a: torch.Tensor,
-        bt_nzs: torch.Tensor,
-        bt_meta: torch.Tensor,
-        scale_a: torch.Tensor,
-        scale_b: torch.Tensor,
-        out_dtype: torch.dtype,
-        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    a: torch.Tensor,
+    bt_nzs: torch.Tensor,
+    bt_meta: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: torch.dtype,
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
     """
     Performs a scaled sparse matrix multiplication using Cutlass.
 
@@ -789,31 +950,33 @@ def cutlass_scaled_sparse_mm(
     Returns:
     - The result of the scaled sparse matrix multiplication.
     """
-    assert (bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0)
-    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
-    assert bias is None or bias.shape[0] == bt_nzs.shape[0] \
-        and bias.dtype == out_dtype
+    assert bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0
+    assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
+    assert bias is None or bias.shape[0] == bt_nzs.shape[0] and bias.dtype == out_dtype
 
     m = a.shape[0]
     n = bt_nzs.shape[0]
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
-    torch.ops._C.cutlass_scaled_sparse_mm(out, a, bt_nzs, bt_meta, scale_a,
-                                          scale_b, bias)
+    torch.ops._C.cutlass_scaled_sparse_mm(
+        out, a, bt_nzs, bt_meta, scale_a, scale_b, bias
+    )
 
     return out
 
 
-def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
-                            expert_offsets: torch.Tensor,
-                            problem_sizes1: torch.Tensor,
-                            problem_sizes2: torch.Tensor,
-                            input_permutation: torch.Tensor,
-                            output_permutation: torch.Tensor,
-                            num_experts: int,
-                            n: int,
-                            k: int,
-                            blockscale_offsets: Optional[torch.Tensor] = None):
+def get_cutlass_moe_mm_data(
+    topk_ids: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    input_permutation: torch.Tensor,
+    output_permutation: torch.Tensor,
+    num_experts: int,
+    n: int,
+    k: int,
+    blockscale_offsets: torch.Tensor | None = None,
+):
     """
     Prepare data necessary to perform CUTLASS grouped matrix multiplications
     used in CUTLASS-based fused MoE.
@@ -837,22 +1000,29 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
                           computed with expert E is blockscale_offsets[E + 1] -
                           blockscale_offsets[E]
     """
-    return torch.ops._C.get_cutlass_moe_mm_data(topk_ids, expert_offsets,
-                                                problem_sizes1, problem_sizes2,
-                                                input_permutation,
-                                                output_permutation,
-                                                num_experts, n, k,
-                                                blockscale_offsets)
+    return torch.ops._C.get_cutlass_moe_mm_data(
+        topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        input_permutation,
+        output_permutation,
+        num_experts,
+        n,
+        k,
+        blockscale_offsets,
+    )
 
 
 def get_cutlass_moe_mm_problem_sizes(
-        topk_ids: torch.Tensor,
-        problem_sizes1: torch.Tensor,
-        problem_sizes2: torch.Tensor,
-        num_experts: int,
-        n: int,
-        k: int,
-        blockscale_offsets: Optional[torch.Tensor] = None):
+    topk_ids: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    num_experts: int,
+    n: int,
+    k: int,
+    blockscale_offsets: torch.Tensor | None = None,
+):
     """
     Compute only the per-expert problem sizes needed by the two grouped matrix
     multiplications used in CUTLASS-based fused MoE.
@@ -863,8 +1033,8 @@ def get_cutlass_moe_mm_problem_sizes(
                                     used in the fused MoE operation.
     """
     return torch.ops._C.get_cutlass_moe_mm_problem_sizes(
-        topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k,
-        blockscale_offsets)
+        topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, blockscale_offsets
+    )
 
 
 def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
@@ -873,25 +1043,31 @@ def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
     This is used in MoE to permute the input tensor before performing grouped matrix multiplications.
     """
     num_tokens_permuted = dst2src_map.shape[0]
-    output_tensor = torch.empty((num_tokens_permuted, input_tensor.shape[1]),
-                                device=input_tensor.device,
-                                dtype=input_tensor.dtype)
+    output_tensor = torch.empty(
+        (num_tokens_permuted, input_tensor.shape[1]),
+        device=input_tensor.device,
+        dtype=input_tensor.dtype,
+    )
     torch.ops._moe_C.shuffle_rows(input_tensor, dst2src_map, output_tensor)
     return output_tensor
 
 
-def get_cutlass_pplx_moe_mm_data(expert_offsets: torch.Tensor,
-                                 problem_sizes1: torch.Tensor,
-                                 problem_sizes2: torch.Tensor,
-                                 expert_num_tokens: torch.Tensor,
-                                 num_local_experts: int, padded_m: int, n: int,
-                                 k: int):
+def get_cutlass_pplx_moe_mm_data(
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    expert_num_tokens: torch.Tensor,
+    num_local_experts: int,
+    padded_m: int,
+    n: int,
+    k: int,
+):
     """
     Prepare data necessary to perform CUTLASS grouped matrix multiplications
     used in CUTLASS-based fused MoE.
 
     The function takes in expert_num_tokens (token count per expert) and
-    non_zero_expert_idxs (consecutive indices of experts with non-zero token 
+    non_zero_expert_idxs (consecutive indices of experts with non-zero token
     counts) and uses them to compute:
     - expert_offsets: Indices that mark at which token index each expert begins
                       its computation.
@@ -900,16 +1076,31 @@ def get_cutlass_pplx_moe_mm_data(expert_offsets: torch.Tensor,
                                       the fused MoE operation.
     """
     return torch.ops._C.get_cutlass_pplx_moe_mm_data(
-        expert_offsets, problem_sizes1, problem_sizes2, expert_num_tokens,
-        num_local_experts, padded_m, n, k)
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        expert_num_tokens,
+        num_local_experts,
+        padded_m,
+        n,
+        k,
+    )
 
 
-def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
-                   b_tensors: torch.Tensor, a_scales: torch.Tensor,
-                   b_scales: torch.Tensor, expert_offsets: torch.Tensor,
-                   problem_sizes: torch.Tensor, a_strides: torch.Tensor,
-                   b_strides: torch.Tensor, c_strides: torch.Tensor,
-                   per_act_token: bool, per_out_ch: bool):
+def cutlass_moe_mm(
+    out_tensors: torch.Tensor,
+    a_tensors: torch.Tensor,
+    b_tensors: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    a_strides: torch.Tensor,
+    b_strides: torch.Tensor,
+    c_strides: torch.Tensor,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
     """
     A single grouped matrix multiplication used in CUTLASS-based fused MoE.
     The function executes fp8-quantized OUT = AB matrix multiplication.
@@ -921,17 +1112,33 @@ def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
                      MMs used in the fused MoE operation.
     - a/b/c_strides: The data strides passed to grouped matrix multiplication.
     """
-    return torch.ops._C.cutlass_moe_mm(out_tensors, a_tensors, b_tensors,
-                                       a_scales, b_scales, expert_offsets,
-                                       problem_sizes, a_strides, b_strides,
-                                       c_strides, per_act_token, per_out_ch)
+    return torch.ops._C.cutlass_moe_mm(
+        out_tensors,
+        a_tensors,
+        b_tensors,
+        a_scales,
+        b_scales,
+        expert_offsets,
+        problem_sizes,
+        a_strides,
+        b_strides,
+        c_strides,
+        per_act_token,
+        per_out_ch,
+    )
 
 
-def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
-                       b_tensors: torch.Tensor, a_scales: torch.Tensor,
-                       b_scales: torch.Tensor, alphas: torch.Tensor,
-                       problem_sizes: torch.Tensor,
-                       expert_offsets: torch.Tensor, sf_offsets: torch.Tensor):
+def cutlass_fp4_moe_mm(
+    out_tensors: torch.Tensor,
+    a_tensors: torch.Tensor,
+    b_tensors: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    alphas: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sf_offsets: torch.Tensor,
+):
     """
     An FP4 Blockscaled Group Gemm that takes in  a_tensors, b_tensors and runs
     the gemms for each combination based on the specified problem sizes.
@@ -948,132 +1155,239 @@ def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
     - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                      MMs used in the fused MoE operation.
     """
-    return torch.ops._C.cutlass_fp4_group_mm(out_tensors, a_tensors, b_tensors,
-                                             a_scales, b_scales, alphas,
-                                             problem_sizes, expert_offsets,
-                                             sf_offsets)
+    return torch.ops._C.cutlass_fp4_group_mm(
+        out_tensors,
+        a_tensors,
+        b_tensors,
+        a_scales,
+        b_scales,
+        alphas,
+        problem_sizes,
+        expert_offsets,
+        sf_offsets,
+    )
 
 
 # gptq_marlin
-def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
-                       size_k: int, size_n: int,
-                       num_bits: int) -> torch.Tensor:
-    return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
-                                           num_bits)
+def gptq_marlin_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits)
 
 
-# gptq_marlin
-def awq_marlin_repack(b_q_weight: torch.Tensor, size_k: int, size_n: int,
-                      num_bits: int) -> torch.Tensor:
+if hasattr(torch.ops._C, "gptq_marlin_repack"):
+
+    @register_fake("_C::gptq_marlin_repack")
+    def _gptq_marlin_repack_fake(
+        b_q_weight: torch.Tensor,
+        perm: torch.Tensor,
+        size_k: torch.SymInt,
+        size_n: torch.SymInt,
+        num_bits: int,
+    ) -> torch.Tensor:
+        pack_factor = 32 // num_bits
+        marlin_tile_size = 16
+        return torch.empty(
+            (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor),
+            dtype=b_q_weight.dtype,
+            device=b_q_weight.device,
+        )
+
+
+# awq_marlin
+def awq_marlin_repack(
+    b_q_weight: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
     return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits)
 
 
-def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
-                           size_k: int, size_n: int,
-                           num_bits: int) -> torch.Tensor:
+if hasattr(torch.ops._C, "awq_marlin_repack"):
+
+    @register_fake("_C::awq_marlin_repack")
+    def _awq_marlin_repack_fake(
+        b_q_weight: torch.Tensor,
+        size_k: torch.SymInt,
+        size_n: torch.SymInt,
+        num_bits: int,
+    ) -> torch.Tensor:
+        pack_factor = 32 // num_bits
+        marlin_tile_size = 16
+        return torch.empty(
+            (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor),
+            dtype=b_q_weight.dtype,
+            device=b_q_weight.device,
+        )
+
+
+def gptq_marlin_moe_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     assert size_k % 16 == 0
-    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
-                         device=b_q_weight.device,
-                         dtype=b_q_weight.dtype)
+    output = torch.empty(
+        (num_experts, size_k // 16, size_n * (num_bits // 2)),
+        device=b_q_weight.device,
+        dtype=b_q_weight.dtype,
+    )
     for e in range(num_experts):
-        output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e],
-                                                    size_k, size_n, num_bits)
+        output[e] = torch.ops._C.gptq_marlin_repack(
+            b_q_weight[e], perm[e], size_k, size_n, num_bits
+        )
     return output
 
 
-def awq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
-                          size_k: int, size_n: int,
-                          num_bits: int) -> torch.Tensor:
+def awq_marlin_moe_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     assert size_k % 16 == 0
-    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
-                         device=b_q_weight.device,
-                         dtype=b_q_weight.dtype)
+    output = torch.empty(
+        (num_experts, size_k // 16, size_n * (num_bits // 2)),
+        device=b_q_weight.device,
+        dtype=b_q_weight.dtype,
+    )
     for e in range(num_experts):
-        output[e] = torch.ops._C.awq_marlin_repack(b_q_weight[e], size_k,
-                                                   size_n, num_bits)
+        output[e] = torch.ops._C.awq_marlin_repack(
+            b_q_weight[e], size_k, size_n, num_bits
+        )
     return output
 
 
-def gptq_marlin_gemm(a: torch.Tensor,
-                     c: Optional[torch.Tensor],
-                     b_q_weight: torch.Tensor,
-                     b_bias: Optional[torch.Tensor],
-                     b_scales: torch.Tensor,
-                     global_scale: Optional[torch.Tensor],
-                     b_zeros: Optional[torch.Tensor],
-                     g_idx: Optional[torch.Tensor],
-                     perm: Optional[torch.Tensor],
-                     workspace: torch.Tensor,
-                     b_q_type: ScalarType,
-                     size_m: int,
-                     size_n: int,
-                     size_k: int,
-                     is_k_full: bool = True,
-                     use_atomic_add: bool = False,
-                     use_fp32_reduce: bool = False,
-                     is_zp_float: bool = False) -> torch.Tensor:
-    return torch.ops._C.gptq_marlin_gemm(a, c, b_q_weight, b_bias, b_scales,
-                                         global_scale, b_zeros, g_idx, perm,
-                                         workspace, b_q_type.id, size_m,
-                                         size_n, size_k, is_k_full,
-                                         use_atomic_add, use_fp32_reduce,
-                                         is_zp_float)
+def gptq_marlin_gemm(
+    a: torch.Tensor,
+    c: torch.Tensor | None,
+    b_q_weight: torch.Tensor,
+    b_bias: torch.Tensor | None,
+    b_scales: torch.Tensor,
+    global_scale: torch.Tensor | None,
+    b_zeros: torch.Tensor | None,
+    g_idx: torch.Tensor | None,
+    perm: torch.Tensor | None,
+    workspace: torch.Tensor,
+    b_q_type: ScalarType,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+    is_k_full: bool = True,
+    use_atomic_add: bool = False,
+    use_fp32_reduce: bool = False,
+    is_zp_float: bool = False,
+) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_gemm(
+        a,
+        c,
+        b_q_weight,
+        b_bias,
+        b_scales,
+        global_scale,
+        b_zeros,
+        g_idx,
+        perm,
+        workspace,
+        b_q_type.id,
+        size_m,
+        size_n,
+        size_k,
+        is_k_full,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float,
+    )
 
 
 # machete
 def machete_supported_schedules(
-        a_type: torch.dtype,
-        b_type: ScalarType,
-        group_scales_type: Optional[torch.dtype],
-        group_zeros_type: Optional[torch.dtype] = None,
-        channel_scales_type: Optional[torch.dtype] = None,
-        token_scales_type: Optional[torch.dtype] = None,
-        out_type: Optional[torch.dtype] = None) -> list[str]:
+    a_type: torch.dtype,
+    b_type: ScalarType,
+    group_scales_type: torch.dtype | None,
+    group_zeros_type: torch.dtype | None = None,
+    channel_scales_type: torch.dtype | None = None,
+    token_scales_type: torch.dtype | None = None,
+    out_type: torch.dtype | None = None,
+) -> list[str]:
     return torch.ops._C.machete_supported_schedules(
-        a_type, b_type.id, group_scales_type, group_zeros_type,
-        channel_scales_type, token_scales_type, out_type)
+        a_type,
+        b_type.id,
+        group_scales_type,
+        group_zeros_type,
+        channel_scales_type,
+        token_scales_type,
+        out_type,
+    )
 
 
 def machete_mm(
-        a: torch.Tensor,
-        # b_q Should be the tensor returned by machete_prepack_B
-        b_q: torch.Tensor,
-        b_type: ScalarType,
-        out_type: Optional[torch.dtype] = None,
-        b_group_scales: Optional[torch.Tensor] = None,
-        b_group_zeros: Optional[torch.Tensor] = None,
-        b_group_size: Optional[int] = None,
-        b_channel_scales: Optional[torch.Tensor] = None,
-        a_token_scales: Optional[torch.Tensor] = None,
-        schedule: Optional[str] = None) -> torch.Tensor:
-    return torch.ops._C.machete_mm(a, b_q, b_type.id, out_type, b_group_scales,
-                                   b_group_zeros, b_group_size,
-                                   b_channel_scales, a_token_scales, schedule)
+    a: torch.Tensor,
+    # b_q Should be the tensor returned by machete_prepack_B
+    b_q: torch.Tensor,
+    b_type: ScalarType,
+    out_type: torch.dtype | None = None,
+    b_group_scales: torch.Tensor | None = None,
+    b_group_zeros: torch.Tensor | None = None,
+    b_group_size: int | None = None,
+    b_channel_scales: torch.Tensor | None = None,
+    a_token_scales: torch.Tensor | None = None,
+    schedule: str | None = None,
+) -> torch.Tensor:
+    return torch.ops._C.machete_mm(
+        a,
+        b_q,
+        b_type.id,
+        out_type,
+        b_group_scales,
+        b_group_zeros,
+        b_group_size,
+        b_channel_scales,
+        a_token_scales,
+        schedule,
+    )
 
 
 def machete_prepack_B(
-        b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType,
-        group_scales_type: Optional[torch.dtype]) -> torch.Tensor:
-    return torch.ops._C.machete_prepack_B(b_q_weight, a_type, b_type.id,
-                                          group_scales_type)
+    b_q_weight: torch.Tensor,
+    a_type: torch.dtype,
+    b_type: ScalarType,
+    group_scales_type: torch.dtype | None,
+) -> torch.Tensor:
+    return torch.ops._C.machete_prepack_B(
+        b_q_weight, a_type, b_type.id, group_scales_type
+    )
 
 
 # CUTLASS W4A8
 def cutlass_w4a8_mm(
-        a: torch.Tensor,
-        # b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
-        b_q: torch.Tensor,
-        b_group_scales: torch.Tensor,
-        b_group_size: int,
-        b_channel_scales: torch.Tensor,
-        a_token_scales: torch.Tensor,
-        out_type: Optional[torch.dtype] = None,
-        maybe_schedule: Optional[str] = None) -> torch.Tensor:
-    return torch.ops._C.cutlass_w4a8_mm(a, b_q, b_group_scales, b_group_size,
-                                        b_channel_scales, a_token_scales,
-                                        out_type, maybe_schedule)
+    a: torch.Tensor,
+    # b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
+    b_q: torch.Tensor,
+    b_group_scales: torch.Tensor,
+    b_group_size: int,
+    b_channel_scales: torch.Tensor,
+    a_token_scales: torch.Tensor,
+    out_type: torch.dtype | None = None,
+    maybe_schedule: str | None = None,
+) -> torch.Tensor:
+    return torch.ops._C.cutlass_w4a8_mm(
+        a,
+        b_q,
+        b_group_scales,
+        b_group_size,
+        b_channel_scales,
+        a_token_scales,
+        out_type,
+        maybe_schedule,
+    )
 
 
 def cutlass_pack_scale_fp8(scales: torch.Tensor) -> torch.Tensor:
@@ -1087,8 +1401,7 @@ def cutlass_encode_and_reorder_int4b(b: torch.Tensor) -> torch.Tensor:
 if hasattr(torch.ops._C, "permute_cols"):
 
     @register_fake("_C::permute_cols")
-    def _permute_cols_fake(a: torch.Tensor,
-                           perm: torch.Tensor) -> torch.Tensor:
+    def _permute_cols_fake(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
         return torch.empty_like(a)
 
 
@@ -1098,8 +1411,8 @@ def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
 
 # fp4
 def scaled_fp4_quant(
-        input: torch.Tensor,
-        input_global_scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    input: torch.Tensor, input_global_scale: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP4 and return quantized tensor and scale.
 
@@ -1119,18 +1432,17 @@ def scaled_fp4_quant(
             in the sizzled layout.
     """
     assert not current_platform.is_rocm()
-    assert input.ndim >= 1, (
-        f'input.ndim needs to be >= 1, but got {input.ndim}.')
+    assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."
     other_dims = 1 if input.ndim == 1 else -1
     input = input.reshape(other_dims, input.shape[-1])
     m, n = input.shape
     block_size = 16
     device = input.device
 
-    assert n % block_size == 0, (
-        f'last dim has to be multiple of 16, but got {n}.')
+    assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
     assert input.dtype in (torch.float16, torch.bfloat16), (
-        f'input.dtype needs to be fp16 or bf16 but got {input.dtype}.')
+        f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."
+    )
 
     # Two fp4 values will be packed into an uint8.
     output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
@@ -1144,12 +1456,11 @@ def scaled_fp4_quant(
     rounded_m = round_up(m, 128)
     scale_n = n // block_size
     rounded_n = round_up(scale_n, 4)
-    output_scale = torch.empty((rounded_m, rounded_n // 4),
-                               device=device,
-                               dtype=torch.int32)
+    output_scale = torch.empty(
+        (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+    )
 
-    torch.ops._C.scaled_fp4_quant(output, input, output_scale,
-                                  input_global_scale)
+    torch.ops._C.scaled_fp4_quant(output, input, output_scale, input_global_scale)
     output_scale = output_scale.view(torch.float8_e4m3fn)
     return output, output_scale
 
@@ -1175,7 +1486,8 @@ def scaled_fp4_experts_quant(
     """
     assert not current_platform.is_rocm()
     assert input_tensor.ndim == 2, (
-        f'input.ndim needs to be == 2, but got {input_tensor.ndim}.')
+        f"input.ndim needs to be == 2, but got {input_tensor.ndim}."
+    )
 
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
@@ -1184,26 +1496,33 @@ def scaled_fp4_experts_quant(
     MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
     m_numtopk, k = input_tensor.shape
 
-    assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), (
+    assert m_numtopk <= MAX_TOKENS_PER_EXPERT * topk, (
         f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
         f"{MAX_TOKENS_PER_EXPERT})"
         f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use"
-        f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value.")
+        f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value."
+    )
     scales_k = k // 16
     padded_k = (scales_k + (4 - 1)) // 4
 
     # output is uint8 and packed fp4 values
-    output = torch.empty(m_numtopk,
-                         k // 2,
-                         device=input_tensor.device,
-                         dtype=torch.uint8)
-    output_scales = torch.empty(MAX_TOKENS_PER_EXPERT * topk,
-                                padded_k,
-                                dtype=torch.int32,
-                                device=input_tensor.device)
-    torch.ops._C.scaled_fp4_experts_quant(output, output_scales, input_tensor,
-                                          input_global_scale, expert_offsets,
-                                          blockscale_offsets)
+    output = torch.empty(
+        m_numtopk, k // 2, device=input_tensor.device, dtype=torch.uint8
+    )
+    output_scales = torch.empty(
+        MAX_TOKENS_PER_EXPERT * topk,
+        padded_k,
+        dtype=torch.int32,
+        device=input_tensor.device,
+    )
+    torch.ops._C.scaled_fp4_experts_quant(
+        output,
+        output_scales,
+        input_tensor,
+        input_global_scale,
+        expert_offsets,
+        blockscale_offsets,
+    )
     output_scales = output_scales.view(torch.float8_e4m3fn)
     return output, output_scales
 
@@ -1211,11 +1530,11 @@ def scaled_fp4_experts_quant(
 # fp8
 def scaled_fp8_quant(
     input: torch.Tensor,
-    scale: Optional[torch.Tensor] = None,
-    num_token_padding: Optional[int] = None,
-    scale_ub: Optional[torch.Tensor] = None,
+    scale: torch.Tensor | None = None,
+    num_token_padding: int | None = None,
+    scale_ub: torch.Tensor | None = None,
     use_per_token_if_dynamic: bool = False,
-    output: Optional[torch.Tensor] = None,
+    output: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -1241,8 +1560,8 @@ def scaled_fp8_quant(
             scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
-    shape: Union[tuple[int, int], torch.Size] = input.shape
+    assert input.ndim == 2
+    shape: tuple[int, int] | torch.Size = input.shape
     # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
     out_dtype: torch.dtype = current_platform.fp8_dtype()
     if num_token_padding:
@@ -1250,19 +1569,17 @@ def scaled_fp8_quant(
     if output is None:
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
     else:
-        assert num_token_padding is None, \
-            "padding not supported if output passed in"
+        assert num_token_padding is None, "padding not supported if output passed in"
         assert output.dtype == out_dtype
 
     if scale is None:
         if use_per_token_if_dynamic:
-            scale = torch.empty((shape[0], 1),
-                                device=input.device,
-                                dtype=torch.float32)
+            scale = torch.empty((shape[0], 1), device=input.device, dtype=torch.float32)
             torch.ops._C.dynamic_per_token_scaled_fp8_quant(
-                output, input, scale, scale_ub)
+                output, input, scale, scale_ub
+            )
         else:
-            scale = torch.empty(1, device=input.device, dtype=torch.float32)
+            scale = torch.empty((1, 1), device=input.device, dtype=torch.float32)
             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         assert scale.numel() == 1, f"{scale.shape}"
@@ -1273,10 +1590,10 @@ def scaled_fp8_quant(
 
 # gptq allspark
 def allspark_repack_weight(
-        qweight: torch.Tensor,
-        scale: torch.Tensor,
-        zero_point: Optional[torch.Tensor] = None,
-        has_zp: bool = False
+    qweight: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor | None = None,
+    has_zp: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Rearrange qweight, scale, and zero_point(if asymmetric) to n32k16 format
@@ -1298,47 +1615,70 @@ def allspark_repack_weight(
     N = qweight.shape[1]
     N_32align = (N + 32 - 1) // 32 * 32
 
-    qweight_reorder = torch.empty((N_32align, K),
-                                  device=qweight.device,
-                                  dtype=qweight.dtype)
-    scale_reorder = torch.empty((1, N_32align),
-                                device=scale.device,
-                                dtype=scale.dtype)
+    qweight_reorder = torch.empty(
+        (N_32align, K), device=qweight.device, dtype=qweight.dtype
+    )
+    scale_reorder = torch.empty((1, N_32align), device=scale.device, dtype=scale.dtype)
     zero_point_reorder = None
     if has_zp:
         assert zero_point is not None, (
-            "zero_point must be provided for asymmetric quantization.")
-        zero_point_reorder = torch.empty((1, N_32align),
-                                         device=zero_point.device,
-                                         dtype=zero_point.dtype)
+            "zero_point must be provided for asymmetric quantization."
+        )
+        zero_point_reorder = torch.empty(
+            (1, N_32align), device=zero_point.device, dtype=zero_point.dtype
+        )
 
     torch.ops._C.rearrange_kn_weight_as_n32k16_order(
-        qweight, scale, zero_point, has_zp, qweight_reorder, scale_reorder,
-        zero_point_reorder, K, N, N_32align)
+        qweight,
+        scale,
+        zero_point,
+        has_zp,
+        qweight_reorder,
+        scale_reorder,
+        zero_point_reorder,
+        K,
+        N,
+        N_32align,
+    )
 
     return qweight_reorder, scale_reorder, zero_point_reorder
 
 
-def allspark_w8a16_gemm(a: torch.Tensor, b_qweight: torch.Tensor,
-                        b_scales: torch.Tensor,
-                        b_qzeros: Optional[torch.Tensor], n: int,
-                        group_size: int, sm_count: int, sm_version: int,
-                        CUBLAS_M_THRESHOLD: int, has_zp: bool,
-                        n32k16_reorder: bool) -> torch.Tensor:
-
-    return torch.ops._C.allspark_w8a16_gemm(a, b_qweight, b_scales, b_qzeros,
-                                            n, group_size, sm_count,
-                                            sm_version, CUBLAS_M_THRESHOLD,
-                                            has_zp, n32k16_reorder)
+def allspark_w8a16_gemm(
+    a: torch.Tensor,
+    b_qweight: torch.Tensor,
+    b_scales: torch.Tensor,
+    b_qzeros: torch.Tensor | None,
+    n: int,
+    group_size: int,
+    sm_count: int,
+    sm_version: int,
+    CUBLAS_M_THRESHOLD: int,
+    has_zp: bool,
+    n32k16_reorder: bool,
+) -> torch.Tensor:
+    return torch.ops._C.allspark_w8a16_gemm(
+        a,
+        b_qweight,
+        b_scales,
+        b_qzeros,
+        n,
+        group_size,
+        sm_count,
+        sm_version,
+        CUBLAS_M_THRESHOLD,
+        has_zp,
+        n32k16_reorder,
+    )
 
 
 # int8
 def scaled_int8_quant(
     input: torch.Tensor,
-    scale: Optional[torch.Tensor] = None,
-    azp: Optional[torch.Tensor] = None,
-    symmetric: bool = True
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    scale: torch.Tensor | None = None,
+    azp: torch.Tensor | None = None,
+    symmetric: bool = True,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
 
@@ -1356,26 +1696,27 @@ def scaled_int8_quant(
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
-        assert symmetric == (
-            azp
-            is None), "azp must only be provided for asymmetric quantization."
+        assert symmetric == (azp is None), (
+            "azp must only be provided for asymmetric quantization."
+        )
         torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp
 
     # dynamic-per-token quantization.
-    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
-                               device=input.device,
-                               dtype=torch.float32)
-    input_azp = None if symmetric else torch.empty_like(input_scales,
-                                                        dtype=torch.int32)
-    torch.ops._C.dynamic_scaled_int8_quant(output, input.contiguous(),
-                                           input_scales, input_azp)
+    input_scales = torch.empty(
+        (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32
+    )
+    input_azp = None if symmetric else torch.empty_like(input_scales, dtype=torch.int32)
+    torch.ops._C.dynamic_scaled_int8_quant(
+        output, input.contiguous(), input_scales, input_azp
+    )
     return output, input_scales, input_azp
 
 
 # gguf
-def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, n: int,
-                    dtype: Optional[torch.dtype]) -> torch.Tensor:
+def ggml_dequantize(
+    W: torch.Tensor, quant_type: int, m: int, n: int, dtype: torch.dtype | None
+) -> torch.Tensor:
     return torch.ops._C.ggml_dequantize(W, quant_type, m, n, dtype)
 
 
@@ -1408,9 +1749,17 @@ def ggml_moe_a8(
     top_k: int,
     tokens: int,
 ) -> torch.Tensor:
-    return torch.ops._C.ggml_moe_a8(X, W, sorted_token_ids, expert_ids,
-                                    num_tokens_post_padded, quant_type, row,
-                                    top_k, tokens)
+    return torch.ops._C.ggml_moe_a8(
+        X,
+        W,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        quant_type,
+        row,
+        top_k,
+        tokens,
+    )
 
 
 def ggml_moe_a8_vec(
@@ -1422,8 +1771,7 @@ def ggml_moe_a8_vec(
     row: torch.SymInt,
     tokens: torch.SymInt,
 ) -> torch.Tensor:
-    return torch.ops._C.ggml_moe_a8_vec(X, W, topk_ids, top_k, quant_type, row,
-                                        tokens)
+    return torch.ops._C.ggml_moe_a8_vec(X, W, topk_ids, top_k, quant_type, row, tokens)
 
 
 def ggml_moe_get_block_size(quant_type: int) -> int:
@@ -1431,44 +1779,69 @@ def ggml_moe_get_block_size(quant_type: int) -> int:
 
 
 # mamba
-def selective_scan_fwd(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
-                       B: torch.Tensor, C: torch.Tensor,
-                       D_: Optional[torch.Tensor], z_: Optional[torch.Tensor],
-                       delta_bias_: Optional[torch.Tensor],
-                       delta_softplus: bool,
-                       query_start_loc: Optional[torch.Tensor],
-                       cache_indices: Optional[torch.Tensor],
-                       has_initial_state: Optional[torch.Tensor],
-                       ssm_states: torch.Tensor, pad_slot_id: int):
-    torch.ops._C.selective_scan_fwd(u, delta, A, B, C, D_, z_, delta_bias_,
-                                    delta_softplus, query_start_loc,
-                                    cache_indices, has_initial_state,
-                                    ssm_states, pad_slot_id)
+def selective_scan_fwd(
+    u: torch.Tensor,
+    delta: torch.Tensor,
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    D_: torch.Tensor | None,
+    z_: torch.Tensor | None,
+    delta_bias_: torch.Tensor | None,
+    delta_softplus: bool,
+    query_start_loc: torch.Tensor | None,
+    cache_indices: torch.Tensor | None,
+    has_initial_state: torch.Tensor | None,
+    ssm_states: torch.Tensor,
+    pad_slot_id: int,
+    block_size: int = 1024,
+    block_idx_first_scheduled_token: torch.Tensor | None = None,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
+):
+    torch.ops._C.selective_scan_fwd(
+        u,
+        delta,
+        A,
+        B,
+        C,
+        D_,
+        z_,
+        delta_bias_,
+        delta_softplus,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        ssm_states,
+        pad_slot_id,
+        block_size,
+        block_idx_first_scheduled_token,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
+    )
 
 
 # ROCm skinny gemms
-def LLMM1(a: torch.Tensor, b: torch.Tensor,
-          rows_per_block: int) -> torch.Tensor:
+def LLMM1(a: torch.Tensor, b: torch.Tensor, rows_per_block: int) -> torch.Tensor:
     return torch.ops._rocm_C.LLMM1(a, b, rows_per_block)
 
 
-def wvSplitK(a: torch.Tensor,
-             b: torch.Tensor,
-             cu_count: int,
-             bias: torch.Tensor = None) -> torch.Tensor:
+def wvSplitK(
+    a: torch.Tensor, b: torch.Tensor, cu_count: int, bias: torch.Tensor = None
+) -> torch.Tensor:
     return torch.ops._rocm_C.wvSplitK(a, b, bias, cu_count)
 
 
-def wvSplitKQ(a: torch.Tensor,
-              b: torch.Tensor,
-              out_dtype: torch.dtype,
-              scale_a: torch.Tensor,
-              scale_b: torch.Tensor,
-              cu_count: int,
-              bias: torch.Tensor = None) -> torch.Tensor:
-    out = torch.empty((b.shape[0], a.shape[0]),
-                      dtype=out_dtype,
-                      device=b.device)
+def wvSplitKQ(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    cu_count: int,
+    bias: torch.Tensor = None,
+) -> torch.Tensor:
+    out = torch.empty((b.shape[0], a.shape[0]), dtype=out_dtype, device=b.device)
     torch.ops._rocm_C.wvSplitKQ(a, b, bias, out, scale_a, scale_b, cu_count)
     return out
 
@@ -1478,118 +1851,276 @@ def moe_sum(input: torch.Tensor, output: torch.Tensor):
     torch.ops._moe_C.moe_sum(input, output)
 
 
-def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
-                         block_size: int, sorted_token_ids: torch.Tensor,
-                         experts_ids: torch.Tensor,
-                         num_tokens_post_pad: torch.Tensor) -> None:
-    torch.ops._moe_C.moe_align_block_size(topk_ids, num_experts, block_size,
-                                          sorted_token_ids, experts_ids,
-                                          num_tokens_post_pad)
+def moe_align_block_size(
+    topk_ids: torch.Tensor,
+    num_experts: int,
+    block_size: int,
+    sorted_token_ids: torch.Tensor,
+    experts_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+) -> None:
+    torch.ops._moe_C.moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_token_ids,
+        experts_ids,
+        num_tokens_post_pad,
+    )
 
 
-def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
-                   b_qweight: torch.Tensor, b_scales: torch.Tensor,
-                   b_qzeros: Optional[torch.Tensor],
-                   topk_weights: Optional[torch.Tensor],
-                   sorted_token_ids: torch.Tensor, experts_ids: torch.Tensor,
-                   num_tokens_post_pad: torch.Tensor, top_k: int,
-                   BLOCK_SIZE_M: int, BLOCK_SIZE_N: int, BLOCK_SIZE_K: int,
-                   bit: int) -> torch.Tensor:
+def batched_moe_align_block_size(
+    max_tokens_per_batch: int,
+    block_size: int,
+    expert_num_tokens: torch.Tensor,
+    sorted_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+) -> None:
+    torch.ops._moe_C.batched_moe_align_block_size(
+        max_tokens_per_batch,
+        block_size,
+        expert_num_tokens,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+    )
+
+
+def moe_lora_align_block_size(
+    topk_ids: torch.Tensor,
+    token_lora_mapping: torch.Tensor,
+    num_experts: int,
+    block_size: int,
+    max_loras: int,
+    max_num_tokens_padded: int,
+    max_num_m_blocks: int,
+    sorted_token_ids: torch.Tensor,
+    experts_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    lora_ids: torch.Tensor,
+) -> None:
+    torch.ops._moe_C.moe_lora_align_block_size(
+        topk_ids,
+        token_lora_mapping,
+        num_experts,
+        block_size,
+        max_loras,
+        max_num_tokens_padded,
+        max_num_m_blocks,
+        sorted_token_ids,
+        experts_ids,
+        num_tokens_post_pad,
+        adapter_enabled,
+        lora_ids,
+    )
+
+
+def moe_wna16_gemm(
+    input: torch.Tensor,
+    output: torch.Tensor,
+    b_qweight: torch.Tensor,
+    b_scales: torch.Tensor,
+    b_qzeros: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
+    sorted_token_ids: torch.Tensor,
+    experts_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+    top_k: int,
+    BLOCK_SIZE_M: int,
+    BLOCK_SIZE_N: int,
+    BLOCK_SIZE_K: int,
+    bit: int,
+) -> torch.Tensor:
     if not current_platform.is_cuda():
         raise NotImplementedError(
-            "The optimized moe_wna16_gemm kernel is only "
-            "available on CUDA platforms")
-    torch.ops._moe_C.moe_wna16_gemm(input, output, b_qweight, b_scales,
-                                    b_qzeros, topk_weights, sorted_token_ids,
-                                    experts_ids, num_tokens_post_pad, top_k,
-                                    BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K,
-                                    bit)
+            "The optimized moe_wna16_gemm kernel is only available on CUDA platforms"
+        )
+    torch.ops._moe_C.moe_wna16_gemm(
+        input,
+        output,
+        b_qweight,
+        b_scales,
+        b_qzeros,
+        topk_weights,
+        sorted_token_ids,
+        experts_ids,
+        num_tokens_post_pad,
+        top_k,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+        BLOCK_SIZE_K,
+        bit,
+    )
 
 
-def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                 token_expert_indices: torch.Tensor,
-                 gating_output: torch.Tensor) -> None:
-    torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices,
-                                  gating_output)
+def topk_softmax(
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool = False,
+) -> None:
+    torch.ops._moe_C.topk_softmax(
+        topk_weights, topk_ids, token_expert_indices, gating_output, renormalize
+    )
 
 
-def grouped_topk(scores: torch.Tensor, scores_with_bias: torch.Tensor,
-                 num_expert_group: int, topk_group: int, topk: int,
-                 renormalize: bool, routed_scaling_factor: float):
+def grouped_topk(
+    scores: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    topk: int,
+    renormalize: bool,
+    routed_scaling_factor: float,
+    bias: torch.Tensor,
+    scoring_func: int = 0,
+):
+    """
+    Perform grouped top-k routing for mixture of experts.
+
+    Args:
+        scores: Raw inputs (logits if scoring_func=1, scores if scoring_func=0)
+        num_expert_group: Number of expert groups
+        topk_group: Number of groups to select
+        topk: Number of experts to select per token
+        renormalize: Whether to renormalize the output weights
+        routed_scaling_factor: Scaling factor for routing weights
+        bias: Bias tensor (e_score_correction_bias). Always fused in kernel.
+        scoring_func: 0=none (no activation), 1=sigmoid
+    """
     if not current_platform.is_cuda():
-        raise NotImplementedError("The fused grouped_topk kernel is only "
-                                  "available on CUDA platforms")
-    return torch.ops._moe_C.grouped_topk(scores, scores_with_bias,
-                                         num_expert_group, topk_group, topk,
-                                         renormalize, routed_scaling_factor)
-
-
-def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor],
-                          b_qweight: torch.Tensor,
-                          b_bias: Optional[torch.Tensor],
-                          b_scales: torch.Tensor,
-                          global_scale: Optional[torch.Tensor],
-                          b_qzeros: Optional[torch.Tensor],
-                          g_idx: Optional[torch.Tensor],
-                          perm: Optional[torch.Tensor],
-                          workspace: torch.Tensor,
-                          sorted_token_ids: torch.Tensor,
-                          expert_ids: torch.Tensor,
-                          num_tokens_past_padded: torch.Tensor,
-                          topk_weights: torch.Tensor, moe_block_size: int,
-                          top_k: int, mul_topk_weights: bool, is_ep: bool,
-                          b_q_type: ScalarType, size_m: int, size_n: int,
-                          size_k: int, is_k_full: bool, use_atomic_add: bool,
-                          use_fp32_reduce: bool,
-                          is_zp_float: bool) -> torch.Tensor:
+        raise NotImplementedError(
+            "The fused grouped_topk kernel is only available on CUDA platforms"
+        )
+    return torch.ops._moe_C.grouped_topk(
+        scores,
+        num_expert_group,
+        topk_group,
+        topk,
+        renormalize,
+        routed_scaling_factor,
+        bias,
+        scoring_func,
+    )
+
+
+def moe_wna16_marlin_gemm(
+    input: torch.Tensor,
+    output: torch.Tensor | None,
+    b_qweight: torch.Tensor,
+    b_bias: torch.Tensor | None,
+    b_scales: torch.Tensor,
+    global_scale: torch.Tensor | None,
+    b_qzeros: torch.Tensor | None,
+    g_idx: torch.Tensor | None,
+    perm: torch.Tensor | None,
+    workspace: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_past_padded: torch.Tensor,
+    topk_weights: torch.Tensor,
+    moe_block_size: int,
+    top_k: int,
+    mul_topk_weights: bool,
+    is_ep: bool,
+    b_q_type: ScalarType,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+    is_k_full: bool,
+    use_atomic_add: bool,
+    use_fp32_reduce: bool,
+    is_zp_float: bool,
+) -> torch.Tensor:
     return torch.ops._moe_C.moe_wna16_marlin_gemm(
-        input, output, b_qweight, b_bias, b_scales, global_scale, b_qzeros,
-        g_idx, perm, workspace, sorted_token_ids, expert_ids,
-        num_tokens_past_padded, topk_weights, moe_block_size, top_k,
-        mul_topk_weights, is_ep, b_q_type.id, size_m, size_n, size_k,
-        is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float)
+        input,
+        output,
+        b_qweight,
+        b_bias,
+        b_scales,
+        global_scale,
+        b_qzeros,
+        g_idx,
+        perm,
+        workspace,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_past_padded,
+        topk_weights,
+        moe_block_size,
+        top_k,
+        mul_topk_weights,
+        is_ep,
+        b_q_type.id,
+        size_m,
+        size_n,
+        size_k,
+        is_k_full,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float,
+    )
 
 
-if supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe"):
+if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "marlin_gemm_moe"):
 
     @register_fake("_moe_C::marlin_gemm_moe")
-    def marlin_gemm_moe_fake(a: torch.Tensor, b_q_weights: torch.Tensor,
-                             sorted_ids: torch.Tensor,
-                             topk_weights: torch.Tensor,
-                             topk_ids: torch.Tensor, b_scales: torch.Tensor,
-                             b_zero_points: torch.Tensor, g_idx: torch.Tensor,
-                             perm: torch.Tensor, workspace: torch.Tensor,
-                             b_q_type: ScalarType, size_m: torch.SymInt,
-                             size_n: torch.SymInt, size_k: torch.SymInt,
-                             is_k_full: bool, num_experts: int, topk: int,
-                             moe_block_size: int, replicate_input: bool,
-                             apply_weights: bool) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n),
-                           dtype=a.dtype,
-                           device=a.device)
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
 
     @register_fake("_moe_C::moe_wna16_marlin_gemm")
-    def moe_wna16_marlin_gemm_fake(input: torch.Tensor,
-                                   output: Optional[torch.Tensor],
-                                   b_qweight: torch.Tensor,
-                                   b_scales: torch.Tensor,
-                                   b_qzeros: Optional[torch.Tensor],
-                                   g_idx: Optional[torch.Tensor],
-                                   perm: Optional[torch.Tensor],
-                                   workspace: torch.Tensor,
-                                   sorted_token_ids: torch.Tensor,
-                                   expert_ids: torch.Tensor,
-                                   num_tokens_past_padded: torch.Tensor,
-                                   topk_weights: torch.Tensor,
-                                   moe_block_size: int, top_k: int,
-                                   mul_topk_weights: bool, is_ep: bool,
-                                   b_q_type: ScalarType, size_m: int,
-                                   size_n: int, size_k: int, is_k_full: bool,
-                                   use_atomic_add: bool, use_fp32_reduce: bool,
-                                   is_zp_float: bool) -> torch.Tensor:
-        return torch.empty((size_m * top_k, size_n),
-                           dtype=input.dtype,
-                           device=input.device)
+    def moe_wna16_marlin_gemm_fake(
+        input: torch.Tensor,
+        output: torch.Tensor | None,
+        b_qweight: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_qzeros: torch.Tensor | None,
+        g_idx: torch.Tensor | None,
+        perm: torch.Tensor | None,
+        workspace: torch.Tensor,
+        sorted_token_ids: torch.Tensor,
+        expert_ids: torch.Tensor,
+        num_tokens_past_padded: torch.Tensor,
+        topk_weights: torch.Tensor,
+        moe_block_size: int,
+        top_k: int,
+        mul_topk_weights: bool,
+        is_ep: bool,
+        b_q_type: ScalarType,
+        size_m: int,
+        size_n: int,
+        size_k: int,
+        is_k_full: bool,
+        use_atomic_add: bool,
+        use_fp32_reduce: bool,
+        is_zp_float: bool,
+    ) -> torch.Tensor:
+        return torch.empty(
+            (size_m * top_k, size_n), dtype=input.dtype, device=input.device
+        )
 
 
 def reshape_and_cache(
@@ -1602,9 +2133,16 @@ def reshape_and_cache(
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
 ) -> None:
-    torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
-                                             value_cache, slot_mapping,
-                                             kv_cache_dtype, k_scale, v_scale)
+    torch.ops._C_cache_ops.reshape_and_cache(
+        key,
+        value,
+        key_cache,
+        value_cache,
+        slot_mapping,
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
 
 
 def reshape_and_cache_flash(
@@ -1617,10 +2155,16 @@ def reshape_and_cache_flash(
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
 ) -> None:
-    torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
-                                                   value_cache, slot_mapping,
-                                                   kv_cache_dtype, k_scale,
-                                                   v_scale)
+    torch.ops._C_cache_ops.reshape_and_cache_flash(
+        key,
+        value,
+        key_cache,
+        value_cache,
+        slot_mapping,
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
 
 
 def concat_and_cache_mla(
@@ -1631,56 +2175,92 @@ def concat_and_cache_mla(
     kv_cache_dtype: str,
     scale: torch.Tensor,
 ) -> None:
-    torch.ops._C_cache_ops.concat_and_cache_mla(kv_c, k_pe, kv_cache,
-                                                slot_mapping, kv_cache_dtype,
-                                                scale)
+    torch.ops._C_cache_ops.concat_and_cache_mla(
+        kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale
+    )
 
 
-def copy_blocks(key_caches: list[torch.Tensor],
-                value_caches: list[torch.Tensor],
-                block_mapping: torch.Tensor) -> None:
+def copy_blocks(
+    key_caches: list[torch.Tensor],
+    value_caches: list[torch.Tensor],
+    block_mapping: torch.Tensor,
+) -> None:
     torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
 
 
-def copy_blocks_mla(kv_caches: list[torch.Tensor],
-                    block_mapping: torch.Tensor) -> None:
+def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
     torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
 
 
-def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
-                block_mapping: torch.Tensor) -> None:
+def swap_blocks(
+    src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
+) -> None:
     torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
 
 
-def convert_fp8(output: torch.Tensor,
-                input: torch.Tensor,
-                scale: float = 1.0,
-                kv_dtype: str = "fp8") -> None:
+def convert_fp8(
+    output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8"
+) -> None:
     torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype)
 
 
 def gather_and_maybe_dequant_cache(
-        src_cache: torch.Tensor,
-        dst: torch.Tensor,
-        block_table: torch.Tensor,
-        cu_seq_lens: torch.Tensor,
-        batch_size: int,
-        kv_cache_dtype: str,
-        scale: torch.Tensor,
-        seq_starts: Optional[torch.Tensor] = None) -> None:
+    src_cache: torch.Tensor,
+    dst: torch.Tensor,
+    block_table: torch.Tensor,
+    cu_seq_lens: torch.Tensor,
+    batch_size: int,
+    kv_cache_dtype: str,
+    scale: torch.Tensor,
+    seq_starts: torch.Tensor | None = None,
+) -> None:
     torch.ops._C_cache_ops.gather_and_maybe_dequant_cache(
-        src_cache, dst, block_table, cu_seq_lens, batch_size, kv_cache_dtype,
-        scale, seq_starts)
+        src_cache,
+        dst,
+        block_table,
+        cu_seq_lens,
+        batch_size,
+        kv_cache_dtype,
+        scale,
+        seq_starts,
+    )
+
 
+def cp_gather_cache(
+    src_cache: torch.Tensor,
+    dst: torch.Tensor,
+    block_table: torch.Tensor,
+    cu_seq_lens: torch.Tensor,
+    batch_size: int,
+    seq_starts: torch.Tensor | None = None,
+) -> None:
+    torch.ops._C_cache_ops.cp_gather_cache(
+        src_cache, dst, block_table, cu_seq_lens, batch_size, seq_starts
+    )
 
-def cp_gather_cache(src_cache: torch.Tensor,
-                    dst: torch.Tensor,
-                    block_table: torch.Tensor,
-                    cu_seq_lens: torch.Tensor,
-                    batch_size: int,
-                    seq_starts: Optional[torch.Tensor] = None) -> None:
-    torch.ops._C_cache_ops.cp_gather_cache(src_cache, dst, block_table,
-                                           cu_seq_lens, batch_size, seq_starts)
+
+def indexer_k_quant_and_cache(
+    k: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    quant_block_size: int,
+    kv_cache_dtype: str,
+) -> None:
+    torch.ops._C_cache_ops.indexer_k_quant_and_cache(
+        k, kv_cache, slot_mapping, quant_block_size, kv_cache_dtype
+    )
+
+
+def cp_gather_indexer_k_quant_cache(
+    kv_cache: torch.Tensor,
+    dst_k: torch.Tensor,
+    dst_scale: torch.Tensor,
+    block_table: torch.Tensor,
+    cu_seq_lens: torch.Tensor,
+) -> None:
+    torch.ops._C_cache_ops.cp_gather_indexer_k_quant_cache(
+        kv_cache, dst_k, dst_scale, block_table, cu_seq_lens
+    )
 
 
 def indexer_k_quant_and_cache(k: torch.Tensor, kv_cache: torch.Tensor,
@@ -1699,20 +2279,30 @@ def get_device_attribute(attribute: int, device: int) -> int:
 def get_max_shared_memory_per_block_device_attribute(device: int) -> int:
     # ruff: noqa: E501
     return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute(
-        device)
+        device
+    )
 
 
 # custom ar
-def init_custom_ar(ipc_tensors: list[torch.Tensor], rank_data: torch.Tensor,
-                   rank: int, fully_connected: bool) -> int:
-    return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank,
-                                                 fully_connected)
+def init_custom_ar(
+    ipc_tensors: list[torch.Tensor],
+    rank_data: torch.Tensor,
+    rank: int,
+    fully_connected: bool,
+) -> int:
+    return torch.ops._C_custom_ar.init_custom_ar(
+        ipc_tensors, rank_data, rank, fully_connected
+    )
 
 
-def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor, reg_buffer: int,
-               reg_buffer_sz_bytes: int) -> None:
-    torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer,
-                                      reg_buffer_sz_bytes)
+def all_reduce(
+    fa: int,
+    inp: torch.Tensor,
+    out: torch.Tensor,
+    reg_buffer: int,
+    reg_buffer_sz_bytes: int,
+) -> None:
+    torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
 
 
 def dispose(fa: int) -> None:
@@ -1731,8 +2321,9 @@ def get_graph_buffer_ipc_meta(fa: int) -> tuple[list[int], list[int]]:
     return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
 
 
-def register_graph_buffers(fa: int, handles: list[list[int]],
-                           offsets: list[list[int]]) -> None:
+def register_graph_buffers(
+    fa: int, handles: list[list[int]], offsets: list[list[int]]
+) -> None:
     torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
 
 
@@ -1749,9 +2340,7 @@ def free_shared_buffer(ptr: int) -> None:
 
 
 # quick all reduce
-def init_custom_qr(rank: int,
-                   world_size: int,
-                   qr_max_size: Optional[int] = None) -> int:
+def init_custom_qr(rank: int, world_size: int, qr_max_size: int | None = None) -> int:
     return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size)
 
 
@@ -1759,13 +2348,14 @@ def qr_destroy(fa: int) -> None:
     torch.ops._C_custom_ar.qr_destroy(fa)
 
 
-def qr_all_reduce(fa: int,
-                  inp: torch.Tensor,
-                  out: torch.Tensor,
-                  quant_level: int,
-                  cast_bf2half: bool = False) -> None:
-    torch.ops._C_custom_ar.qr_all_reduce(fa, inp, out, quant_level,
-                                         cast_bf2half)
+def qr_all_reduce(
+    fa: int,
+    inp: torch.Tensor,
+    out: torch.Tensor,
+    quant_level: int,
+    cast_bf2half: bool = False,
+) -> None:
+    torch.ops._C_custom_ar.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
 
 
 def qr_get_handle(fa: int) -> torch.Tensor:
@@ -1795,9 +2385,9 @@ def get_flash_mla_metadata(
         tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), dtype torch.int32.
         num_splits: (batch_size + 1), dtype torch.int32.
     """
-    return torch.ops._C.get_flash_mla_metadata(cache_seqlens,
-                                               num_heads_per_head_k,
-                                               num_heads_k)
+    return torch.ops._C.get_flash_mla_metadata(
+        cache_seqlens, num_heads_per_head_k, num_heads_k
+    )
 
 
 def flash_mla_with_kvcache(
@@ -1808,7 +2398,7 @@ def flash_mla_with_kvcache(
     head_dim_v: int,
     tile_scheduler_metadata: torch.Tensor,
     num_splits: torch.Tensor,
-    softmax_scale: Optional[float] = None,
+    softmax_scale: float | None = None,
     causal: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
@@ -1828,7 +2418,7 @@ def flash_mla_with_kvcache(
         softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
     """
     if softmax_scale is None:
-        softmax_scale = q.shape[-1]**(-0.5)
+        softmax_scale = q.shape[-1] ** (-0.5)
     out, softmax_lse = torch.ops._C.flash_mla_fwd_kvcache(
         q,
         k_cache,
@@ -1844,35 +2434,53 @@ def flash_mla_with_kvcache(
     return out, softmax_lse
 
 
-def sm100_cutlass_mla_decode(out: torch.Tensor, lse: torch.Tensor,
-                             q_nope: torch.Tensor, q_pe: torch.Tensor,
-                             kv_c_and_k_pe_cache: torch.Tensor,
-                             seq_lens: torch.Tensor, page_table: torch.Tensor,
-                             workspace: torch.Tensor, scale: float,
-                             num_kv_splits: int) -> torch.Tensor:
-    torch.ops._C.sm100_cutlass_mla_decode(out, lse, q_nope, q_pe,
-                                          kv_c_and_k_pe_cache, seq_lens,
-                                          page_table, workspace, scale,
-                                          num_kv_splits)
+def sm100_cutlass_mla_decode(
+    out: torch.Tensor,
+    lse: torch.Tensor,
+    q_nope: torch.Tensor,
+    q_pe: torch.Tensor,
+    kv_c_and_k_pe_cache: torch.Tensor,
+    seq_lens: torch.Tensor,
+    page_table: torch.Tensor,
+    workspace: torch.Tensor,
+    scale: float,
+    num_kv_splits: int,
+) -> torch.Tensor:
+    torch.ops._C.sm100_cutlass_mla_decode(
+        out,
+        lse,
+        q_nope,
+        q_pe,
+        kv_c_and_k_pe_cache,
+        seq_lens,
+        page_table,
+        workspace,
+        scale,
+        num_kv_splits,
+    )
     return out
 
 
-def sm100_cutlass_mla_get_workspace_size(max_seq_len: int, num_batches: int,
-                                         sm_count: int,
-                                         num_kv_splits: int) -> int:
+def sm100_cutlass_mla_get_workspace_size(
+    max_seq_len: int, num_batches: int, sm_count: int, num_kv_splits: int
+) -> int:
     return torch.ops._C.sm100_cutlass_mla_get_workspace_size(
-        max_seq_len, num_batches, sm_count, num_kv_splits)
+        max_seq_len, num_batches, sm_count, num_kv_splits
+    )
 
 
 if hasattr(torch.ops._C, "weight_packed_linear"):
 
     @register_fake("_C::weight_packed_linear")
-    def weight_packed_linear_fake(mat1: torch.Tensor, mat2: torch.Tensor,
-                                  bias: Optional[torch.Tensor],
-                                  is_vnni: bool) -> torch.Tensor:
-        return torch.empty((mat1.size(0), mat2.size(0)),
-                           dtype=mat1.dtype,
-                           device=mat2.device)
+    def weight_packed_linear_fake(
+        mat1: torch.Tensor,
+        mat2: torch.Tensor,
+        bias: torch.Tensor | None,
+        is_vnni: bool,
+    ) -> torch.Tensor:
+        return torch.empty(
+            (mat1.size(0), mat2.size(0)), dtype=mat1.dtype, device=mat2.device
+        )
 
 
 if hasattr(torch.ops._C, "fused_experts_cpu"):
@@ -1887,11 +2495,11 @@ if hasattr(torch.ops._C, "fused_experts_cpu"):
         inplace: bool,
         use_int8_w8a8: bool,
         use_fp8_w8a16: bool,
-        w1_scale: Optional[torch.Tensor],
-        w2_scale: Optional[torch.Tensor],
-        block_size: Optional[list[int]],
-        a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        w1_scale: torch.Tensor | None,
+        w2_scale: torch.Tensor | None,
+        block_size: list[int] | None,
+        a1_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         is_vnni: bool,
     ) -> torch.Tensor:
         return torch.empty_like(hidden_states)
@@ -1904,7 +2512,7 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
         mat1: torch.Tensor,
         mat2: torch.Tensor,
         scales2: torch.Tensor,
-        bias: Optional[torch.Tensor],
+        bias: torch.Tensor | None,
         out_dtype: torch.dtype,
         is_vnni: bool,
     ) -> torch.Tensor:
@@ -1914,9 +2522,8 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
 
 
 class CPUDNNLGEMMHandler:
-
     def __init__(self) -> None:
-        self.handler: Optional[int] = None
+        self.handler: int | None = None
         self.n = -1
         self.k = -1
 
@@ -1925,10 +2532,11 @@ class CPUDNNLGEMMHandler:
             torch.ops._C.release_dnnl_matmul_handler(self.handler)
 
 
-if hasattr(torch.ops._C, "create_onednn_mm_handler"):
-    _supports_onednn = True
-else:
-    _supports_onednn = False
+_supports_onednn = bool(hasattr(torch.ops._C, "create_onednn_mm_handler"))
+
+
+def is_onednn_acl_supported():
+    return torch.ops._C.is_onednn_acl_supported()
 
 
 def create_onednn_mm(
@@ -1938,18 +2546,20 @@ def create_onednn_mm(
     handler = CPUDNNLGEMMHandler()
     handler.k, handler.n = weight.size()
     handler.handler = torch.ops._C.create_onednn_mm_handler(
-        weight, primitive_cache_size)
+        weight, primitive_cache_size
+    )
     return handler
 
 
 def onednn_mm(
     dnnl_handler: CPUDNNLGEMMHandler,
     x: torch.Tensor,
-    bias: Optional[torch.Tensor],
+    bias: torch.Tensor | None,
 ) -> torch.Tensor:
     output = torch.empty((*x.shape[0:-1], dnnl_handler.n), dtype=x.dtype)
-    torch.ops._C.onednn_mm(output, x.reshape(-1, dnnl_handler.k), bias,
-                           dnnl_handler.handler)
+    torch.ops._C.onednn_mm(
+        output, x.reshape(-1, dnnl_handler.k), bias, dnnl_handler.handler
+    )
 
     return output
 
@@ -1965,15 +2575,17 @@ def create_onednn_scaled_mm(
     handler = CPUDNNLGEMMHandler()
     handler.k, handler.n = weight.size()
     handler.handler = torch.ops._C.create_onednn_scaled_mm_handler(
-        weight, weight_scales, output_type, dynamic_quant, use_azp,
-        primitive_cache_size)
+        weight, weight_scales, output_type, dynamic_quant, use_azp, primitive_cache_size
+    )
     return handler
 
 
-def onednn_scaled_int8_quant(input: torch.Tensor,
-                             scale: Optional[torch.Tensor] = None,
-                             azp: Optional[torch.Tensor] = None,
-                             symmetric: bool = True):
+def onednn_scaled_int8_quant(
+    input: torch.Tensor,
+    scale: torch.Tensor | None = None,
+    azp: torch.Tensor | None = None,
+    symmetric: bool = True,
+):
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
 
@@ -1993,20 +2605,16 @@ def onednn_scaled_int8_quant(input: torch.Tensor,
     input = input.view((token_num, input.shape[-1]))
     if scale is not None:
         # static-per-tensor quantization.
-        assert symmetric == (
-            azp
-            is None), "azp must only be provided for asymmetric quantization."
+        assert symmetric == (azp is None), (
+            "azp must only be provided for asymmetric quantization."
+        )
         torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp
 
     # dynamic-per-token quantization.
-    input_scales = torch.empty((token_num, 1),
-                               device=input.device,
-                               dtype=torch.float32)
-    input_azp = None if symmetric else torch.empty_like(input_scales,
-                                                        dtype=torch.int32)
-    torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales,
-                                           input_azp)
+    input_scales = torch.empty((token_num, 1), device=input.device, dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales, dtype=torch.int32)
+    torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, input_azp)
     return output, input_scales, input_azp
 
 
@@ -2014,17 +2622,238 @@ def onednn_scaled_mm(
     dnnl_handler: CPUDNNLGEMMHandler,
     x: torch.Tensor,
     output: torch.Tensor,
-    input_scale: Optional[torch.Tensor],
-    input_zp: Optional[torch.Tensor],
-    input_zp_adj: Optional[torch.Tensor],
-    bias: Optional[torch.Tensor],
+    input_scale: torch.Tensor | None,
+    input_zp: torch.Tensor | None,
+    input_zp_adj: torch.Tensor | None,
+    bias: torch.Tensor | None,
 ) -> torch.Tensor:
-    torch.ops._C.onednn_scaled_mm(output, x, input_scale, input_zp,
-                                  input_zp_adj, bias, dnnl_handler.handler)
+    torch.ops._C.onednn_scaled_mm(
+        output, x, input_scale, input_zp, input_zp_adj, bias, dnnl_handler.handler
+    )
 
     return output
 
 
+def cpu_attn_get_scheduler_metadata(
+    num_reqs: int,
+    num_heads: int,
+    num_kv_heads: int,
+    head_dim: int,
+    seq_lens: torch.Tensor,
+    dtype: torch.dtype,
+    query_start_loc: torch.Tensor,
+    causal: bool,
+    sliding_window_size: int,
+    isa: str,
+    enable_kv_split: bool,
+) -> torch.Tensor:
+    sheduler_metadata = torch.ops._C.get_scheduler_metadata(
+        num_reqs,
+        num_heads,
+        num_kv_heads,
+        head_dim,
+        seq_lens,
+        dtype,
+        query_start_loc,
+        causal,
+        sliding_window_size,
+        isa,
+        enable_kv_split,
+    )
+    return sheduler_metadata
+
+
+def cpu_attn_reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    isa: str,
+) -> None:
+    torch.ops._C.cpu_attn_reshape_and_cache(
+        key,
+        value,
+        key_cache,
+        value_cache,
+        slot_mapping,
+        isa,
+    )
+
+
+def cpu_attention_with_kv_cache(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    output: torch.Tensor,
+    query_start_loc: torch.Tensor,
+    seq_lens: torch.Tensor,
+    scale: float,
+    causal: bool,
+    alibi_slopes: torch.Tensor | None,
+    sliding_window: tuple[int, int],
+    block_table: torch.Tensor,
+    softcap: float,
+    scheduler_metadata: torch.Tensor,
+    s_aux: torch.Tensor | None,
+) -> None:
+    torch.ops._C.cpu_attention_with_kv_cache(
+        query,
+        key_cache,
+        value_cache,
+        output,
+        query_start_loc,
+        seq_lens,
+        scale,
+        causal,
+        alibi_slopes,
+        sliding_window[0],
+        sliding_window[1],
+        block_table,
+        softcap,
+        scheduler_metadata,
+        s_aux,
+    )
+
+
+if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"):
+
+    @register_fake("_qutlass_C::matmul_mxf4_bf16_tn")
+    def _fake_matmul_mxf4_bf16_tn(
+        a: torch.Tensor,
+        b: torch.Tensor,
+        a_sf: torch.Tensor,
+        b_sf: torch.Tensor,
+        alpha: torch.Tensor,
+    ):
+        return a.new_empty(*a.shape[:-1], b.shape[0], dtype=torch.bfloat16)
+
+
+def matmul_mxf4_bf16_tn(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_sf: torch.Tensor,
+    b_sf: torch.Tensor,
+    alpha: torch.Tensor,
+) -> torch.Tensor:
+    return torch.ops._qutlass_C.matmul_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
+
+
+if hasattr(torch.ops._qutlass_C, "matmul_ada_mxf4_bf16_tn"):
+
+    @register_fake("_qutlass_C::matmul_ada_mxf4_bf16_tn")
+    def _fake_matmul_ada_mxf4_bf16_tn(
+        a: torch.Tensor,
+        b: torch.Tensor,
+        a_sf: torch.Tensor,
+        b_sf: torch.Tensor,
+        alpha: torch.Tensor,
+    ):
+        return a.new_empty(*a.shape[:-1], b.shape[0], dtype=torch.bfloat16)
+
+
+def matmul_ada_mxf4_bf16_tn(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_sf: torch.Tensor,
+    b_sf: torch.Tensor,
+    alpha: torch.Tensor,
+) -> torch.Tensor:
+    return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
+
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
+
+    @register_fake("_qutlass_C::fusedQuantizeMxQuest")
+    def _fake_fused_quantize_mx_quest(
+        a: torch.Tensor, b: torch.Tensor, xh_e2m1: torch.Tensor, xh_e8m0: torch.Tensor
+    ):
+        return xh_e2m1, xh_e8m0
+
+
+if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxAbsMax"):
+
+    @register_fake("_qutlass_C::fusedQuantizeMxAbsMax")
+    def _fake_fused_quantize_mx_absmax(
+        a: torch.Tensor, b: torch.Tensor, xh_e2m1: torch.Tensor, xh_e8m0: torch.Tensor
+    ):
+        return xh_e2m1, xh_e8m0
+
+
+def fusedQuantizeMx(
+    a: torch.Tensor, b: torch.Tensor, *, method: Literal["quest", "abs_max"] = "quest"
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if a.dim() == 0:
+        raise ValueError("`a` must have at least 1 dimension.")
+    if a.size(-1) % 32 != 0:
+        raise ValueError(f"last dim of `a` must be divisible by 32, got {a.size(-1)}.")
+    if b.device != a.device:
+        raise ValueError("`a` and `b` must be on the same device.")
+
+    xh_e2m1 = torch.empty(
+        *a.shape[:-1], a.size(-1) // 2, dtype=torch.uint8, device=a.device
+    )
+
+    rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+
+    xh_e8m0 = torch.empty(
+        padded_rows, padded_cols, dtype=torch.float8_e8m0fnu, device=a.device
+    )
+
+    if not hasattr(torch.ops, "_qutlass_C"):
+        raise RuntimeError(
+            "The `_qutlass_C` extension is not loaded. "
+            "Make sure your custom op library is imported before calling fusedQuantizeMx."
+        )
+
+    if method == "quest":
+        return torch.ops._qutlass_C.fusedQuantizeMxQuest(a, b, xh_e2m1, xh_e8m0)
+    elif method == "abs_max":
+        return torch.ops._qutlass_C.fusedQuantizeMxAbsMax(a, b, xh_e2m1, xh_e8m0)
+    else:
+        raise ValueError(f"invalid method {method!r}, must be 'quest' or 'abs_max'")
+
+
+if hasattr(torch.ops._qutlass_C, "fusedQuantizeNv"):
+
+    @register_fake("_qutlass_C::fusedQuantizeNv")
+    def _fake_fused_quantize_nv(
+        a: torch.Tensor,
+        b: torch.Tensor,
+        xh_e2m1: torch.Tensor,
+        xh_e4m3: torch.Tensor,
+        global_scale: torch.Tensor,
+    ):
+        return xh_e2m1, xh_e4m3
+
+
+def fusedQuantizeNv(
+    a: torch.Tensor, b: torch.Tensor, global_scale: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    xh_e2m1 = torch.empty(
+        *a.shape[:-1], a.size(-1) // 2, dtype=torch.uint8, device=a.device
+    )
+
+    rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+    xh_e4m3 = torch.empty(
+        padded_rows, padded_cols, dtype=torch.float8_e4m3fn, device=a.device
+    )
+
+    return torch.ops._qutlass_C.fusedQuantizeNv(a, b, xh_e2m1, xh_e4m3, global_scale)
+
+
 def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
     """
     Perform Hadamard transforms using [Hadacore](https://arxiv.org/abs/2412.08832)
@@ -2033,7 +2862,7 @@ def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
 
     Note that sylvester hadamard transforms are also symmetric, which means that
     this function is also applies the (transpose <=> inverse) transform.
-    
+
     :param x: value to be transformed inplace
     :param inplace: modify value in place
     :return: value after transformation
@@ -2044,6 +2873,5 @@ def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
 if hasattr(torch.ops._C, "hadacore_transform"):
 
     @register_fake("_C::hadacore_transform")
-    def _hadacore_transform_fake(x: torch.Tensor,
-                                 inplace: bool) -> torch.Tensor:
+    def _hadacore_transform_fake(x: torch.Tensor, inplace: bool) -> torch.Tensor:
         return torch.empty_like(x) if not inplace else x
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 9d2eda482fcf8bb3ac19eacf8b0a732374a821e4..95c17cb331f67a0f70319138bf42d23e39bb71b0 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 
@@ -17,10 +16,10 @@ except ImportError as e:
 
 
 class ipex_ops:
-
     @staticmethod
     def _reshape_activation_tensor(
-            x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        x: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         num = x.size(0)
         d = x.size(1) // 2
         x = x.reshape(num, 2, d)
@@ -65,7 +64,7 @@ class ipex_ops:
         context_lens: torch.Tensor,
         block_size: int,
         max_context_len: int,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         kv_cache_dtype: str,
         k_scale: float,
         v_scale: float,
@@ -107,7 +106,7 @@ class ipex_ops:
         context_lens: torch.Tensor,
         block_size: int,
         max_context_len: int,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         kv_cache_dtype: str,
         k_scale: float,
         v_scale: float,
@@ -144,21 +143,26 @@ class ipex_ops:
         is_neox: bool,
     ) -> None:
         rot_dim = cos_sin_cache.size(1)
-        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
-                                                     head_size, cos_sin_cache,
-                                                     is_neox, rot_dim)
+        ipex.llm.functional.rotary_embedding_batched(
+            positions, query, key, head_size, cos_sin_cache, is_neox, rot_dim
+        )
 
     @staticmethod
-    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
-                 epsilon: float) -> torch.Tensor:
-        return ipex.llm.functional.rms_norm(input, weight, epsilon)
+    def rms_norm(
+        input: torch.Tensor, weight: torch.Tensor, epsilon: float
+    ) -> torch.Tensor:
+        out = torch.empty_like(input)
+        torch.ops.torch_ipex.rms_norm_vllm(out, input.contiguous(), weight, epsilon)
+        return out
 
     @staticmethod
-    def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
-                           weight: torch.Tensor, epsilon: float) -> None:
-        tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None,
-                                               epsilon, True)
-        input.copy_(tmp)
+    def fused_add_rms_norm(
+        input: torch.Tensor,
+        residual: torch.Tensor,
+        weight: torch.Tensor,
+        epsilon: float,
+    ) -> None:
+        torch.ops.torch_ipex.fused_add_rms_norm_vllm(input, residual, weight, epsilon)
 
     @staticmethod
     def varlen_attention(
@@ -168,7 +172,7 @@ class ipex_ops:
         out: torch.Tensor,
         seqlen_q: torch.Tensor,
         seqlen_k: torch.Tensor,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         max_seqlen_q: int,
         max_seqlen_k: int,
         pdropout: float,
@@ -186,22 +190,43 @@ class ipex_ops:
                 raise ValueError("IPEX CPU does not support logits_soft_cap")
             assert alibi_slopes is None
             assert window_size_left < 0 and window_size_right < 0
-            ipex.llm.functional.varlen_attention(query.contiguous(),
-                                                 key.contiguous(),
-                                                 value.contiguous(), out,
-                                                 seqlen_q.int(),
-                                                 seqlen_k.int(), max_seqlen_q,
-                                                 max_seqlen_k, pdropout,
-                                                 softmax_scale, zero_tensors,
-                                                 is_causal, return_softmax,
-                                                 gen_)
+            ipex.llm.functional.varlen_attention(
+                query.contiguous(),
+                key.contiguous(),
+                value.contiguous(),
+                out,
+                seqlen_q.int(),
+                seqlen_k.int(),
+                max_seqlen_q,
+                max_seqlen_k,
+                pdropout,
+                softmax_scale,
+                zero_tensors,
+                is_causal,
+                return_softmax,
+                gen_,
+            )
         else:  # XPU build
             ipex.llm.functional.varlen_attention(
-                query.contiguous(), key.contiguous(), value.contiguous(), out,
-                seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q,
-                max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal,
-                return_softmax, gen_, window_size_left, window_size_right,
-                logits_soft_cap)
+                query.contiguous(),
+                key.contiguous(),
+                value.contiguous(),
+                out,
+                seqlen_q.int(),
+                seqlen_k.int(),
+                alibi_slopes,
+                max_seqlen_q,
+                max_seqlen_k,
+                pdropout,
+                softmax_scale,
+                zero_tensors,
+                is_causal,
+                return_softmax,
+                gen_,
+                window_size_left,
+                window_size_right,
+                logits_soft_cap,
+            )
 
     @staticmethod
     def reshape_and_cache(
@@ -216,7 +241,8 @@ class ipex_ops:
     ) -> None:
         assert kv_cache_dtype == "auto"
         ipex.llm.modules.PagedAttention.reshape_and_cache(
-            key, value, key_cache, value_cache, slot_mapping)
+            key, value, key_cache, value_cache, slot_mapping
+        )
 
     @staticmethod
     def reshape_and_cache_flash(
@@ -226,32 +252,41 @@ class ipex_ops:
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
         kv_cache_dtype: str,
-        k_scale: Optional[torch.Tensor] = None,
-        v_scale: Optional[torch.Tensor] = None,
+        k_scale: torch.Tensor | None = None,
+        v_scale: torch.Tensor | None = None,
         k_scale_float: float = 1.0,
         v_scale_float: float = 1.0,
     ) -> None:
         ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
-            key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
-            k_scale_float, v_scale_float)
+            key,
+            value,
+            key_cache,
+            value_cache,
+            slot_mapping,
+            kv_cache_dtype,
+            k_scale_float,
+            v_scale_float,
+        )
 
     @staticmethod
     def flash_attn_varlen_func(
-        out: torch.Tensor,
         q: torch.Tensor,
         k: torch.Tensor,
         v: torch.Tensor,
         cu_seqlens_q: torch.Tensor,
-        seqused_k: torch.Tensor,  # we don't support this in ipex kernel
         max_seqlen_q: int,
         max_seqlen_k: int,
-        softmax_scale: float,
-        causal: bool,
-        block_table: torch.Tensor,
-        alibi_slopes: Optional[torch.Tensor],
-        window_size: Optional[list[int]] = None,
-        softcap: Optional[float] = 0.0,
-        cu_seqlens_k: Optional[torch.Tensor] = None,
+        softmax_scale: float | None = None,
+        causal: bool = False,
+        out: torch.Tensor | None = None,
+        block_table: torch.Tensor | None = None,
+        alibi_slopes: torch.Tensor | None = None,
+        window_size: list[int] | None = None,
+        softcap: float | None = 0.0,
+        seqused_k: torch.Tensor | None = None,
+        cu_seqlens_k: torch.Tensor | None = None,
+        # passed in qwen vl
+        dropout_p: float = 0.0,
         # The following parameters are not used in ipex kernel currently,
         # we keep API compatible to CUDA's.
         scheduler_metadata=None,
@@ -260,74 +295,100 @@ class ipex_ops:
         k_descale=None,
         v_descale=None,
         num_splits=0,
-        s_aux: Optional[torch.Tensor] = None,
+        s_aux: torch.Tensor | None = None,
     ):
-        if cu_seqlens_k is None:
-            # cu_seqlens_k is not used in ipex kernel.
-            cu_seqlens_k = torch.cumsum(seqused_k, dim=0)
-            cu_seqlens_k = torch.cat([
-                torch.tensor([0], device=seqused_k.device, dtype=torch.int32),
-                cu_seqlens_k
-            ]).to(torch.int32)
-
+        if out is None:
+            out = torch.empty(q.shape, dtype=q.dtype, device=q.device)
         real_window_size: tuple[int, int]
         if window_size is None:
             real_window_size = (-1, -1)
         else:
             assert len(window_size) == 2
             real_window_size = (window_size[0], window_size[1])
-        return ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
-            out,
-            q.contiguous(),
-            k,
-            v,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            max_seqlen_q,
-            max_seqlen_k,
-            softmax_scale,
-            causal,
-            block_table,
-            alibi_slopes,
-            softcap=softcap,
-            window_size_left=real_window_size[0],
-            window_size_right=real_window_size[1],
-            k_scale=1.0,
-            v_scale=1.0,
-        )
+
+        if block_table is None:
+            assert cu_seqlens_k is not None, (
+                "cu_seqlens_k can't be None when calling varlen_attention."
+            )
+            if softmax_scale is None:
+                softmax_scale = q.shape[-1] ** (-0.5)
+            ipex_ops.varlen_attention(
+                q.contiguous(),
+                k.contiguous(),
+                v.contiguous(),
+                out,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                None,
+                max_seqlen_q,
+                max_seqlen_k,
+                0.0,
+                softmax_scale,
+                False,
+                causal,
+                False,
+                None,
+                real_window_size[0],
+                real_window_size[1],
+                -1,
+            )
+            return out
+        else:
+            return ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
+                out,
+                q.contiguous(),
+                k,
+                v,
+                cu_seqlens_q,
+                seqused_k,
+                max_seqlen_q,
+                max_seqlen_k,
+                softmax_scale,
+                causal,
+                block_table,
+                alibi_slopes,
+                sink=s_aux,
+                softcap=softcap,
+                window_size_left=real_window_size[0],
+                window_size_right=real_window_size[1],
+                k_scale=1.0,
+                v_scale=1.0,
+            )
 
     @staticmethod
     def get_scheduler_metadata(
-            batch_size,
-            max_seqlen_q,
-            max_seqlen_k,
-            num_heads_q,
-            num_heads_kv,
-            headdim,
-            cache_seqlens: torch.Tensor,
-            qkv_dtype=torch.bfloat16,
-            headdim_v=None,
-            cu_seqlens_q: Optional[torch.Tensor] = None,
-            cu_seqlens_k_new: Optional[torch.Tensor] = None,
-            cache_leftpad: Optional[torch.Tensor] = None,
-            page_size: Optional[int] = None,
-            max_seqlen_k_new=0,
-            causal=False,
-            window_size=(-1, -1),  # -1 means infinite context window
-            has_softcap=False,
-            num_splits=0,  # Can be tuned for speed
-            pack_gqa=None,  # Can be tuned for speed
-            sm_margin=0,  # Can be tuned if some SMs are used for communication
+        batch_size,
+        max_seqlen_q,
+        max_seqlen_k,
+        num_heads_q,
+        num_heads_kv,
+        headdim,
+        cache_seqlens: torch.Tensor,
+        qkv_dtype=torch.bfloat16,
+        headdim_v=None,
+        cu_seqlens_q: torch.Tensor | None = None,
+        cu_seqlens_k_new: torch.Tensor | None = None,
+        cache_leftpad: torch.Tensor | None = None,
+        page_size: int | None = None,
+        max_seqlen_k_new=0,
+        causal=False,
+        window_size=(-1, -1),  # -1 means infinite context window
+        has_softcap=False,
+        num_splits=0,  # Can be tuned for speed
+        pack_gqa=None,  # Can be tuned for speed
+        sm_margin=0,  # Can be tuned if some SMs are used for communication
     ) -> None:
         logger.warning_once(
-            "get_scheduler_metadata is not implemented for ipex_ops, "
-            "returning None.")
+            "get_scheduler_metadata is not implemented for ipex_ops, returning None."
+        )
         return None
 
     @staticmethod
-    def copy_blocks(key_caches: list[torch.Tensor],
-                    value_caches: list[torch.Tensor],
-                    block_mapping: torch.Tensor) -> None:
+    def copy_blocks(
+        key_caches: list[torch.Tensor],
+        value_caches: list[torch.Tensor],
+        block_mapping: torch.Tensor,
+    ) -> None:
         torch.xpu.copy_blocks(  # type: ignore
             key_caches,
             value_caches,
@@ -335,22 +396,23 @@ class ipex_ops:
         )
 
     @staticmethod
-    def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
-                    block_mapping: torch.Tensor) -> None:
+    def swap_blocks(
+        src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
+    ) -> None:
         torch.xpu.swap_blocks(src, dst, block_mapping)  # type: ignore
 
     @staticmethod
     def scaled_fp8_quant(
         input: torch.Tensor,
-        scale: Optional[torch.Tensor] = None,
-        num_token_padding: Optional[int] = None,
-        scale_ub: Optional[torch.Tensor] = None,
+        scale: torch.Tensor | None = None,
+        num_token_padding: int | None = None,
+        scale_ub: torch.Tensor | None = None,
         use_per_token_if_dynamic: bool = False,
-        output: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Quantize input tensor to FP8 and return quantized tensor and scale.
-        
+
         This function is designed for both static and dynamic quantization:
         If you provide the scale, it will use static scaling and if you omit
         it, the scale will be determined dynamically. Currently, XPU platform
@@ -367,26 +429,28 @@ class ipex_ops:
                 of the output to at least this value.
             use_per_token_if_dynamic: Whether to do per_tensor or per_token
                 in the dynamic quantization case.
-    
+
         Returns:
             tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
                 scaling factor.
         """
         # This code assumes batch_dim and num_tokens are flattened
-        assert (input.ndim == 2)
-        shape: Union[tuple[int, int], torch.Size] = input.shape
+        assert input.ndim == 2
+        shape: tuple[int, int] | torch.Size = input.shape
         out_dtype: torch.dtype = current_platform.fp8_dtype()
         if num_token_padding:
             shape = (max(num_token_padding, input.shape[0]), shape[1])
         if output is None:
             output = torch.empty(shape, device=input.device, dtype=out_dtype)
         else:
-            assert num_token_padding is None, \
+            assert num_token_padding is None, (
                 "padding not supported if output passed in"
+            )
             assert output.dtype == out_dtype
         assert scale is None, "only dynamic fp8 quantization supported on XPU"
         assert not use_per_token_if_dynamic, (
-            "per token dynamic fp8 quantization not supported on XPU")
+            "per token dynamic fp8 quantization not supported on XPU"
+        )
         scale = torch.zeros(1, device=input.device, dtype=torch.float32)
         torch.ops.torch_ipex.dynamic_scaled_fp8_quant(output, input, scale)
 
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index 1c16230849bca6f64ca6b181707ae35307c84111..b527ffcf9b18bf30936f9600e491eb766803ed91 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -8,7 +8,7 @@ from urllib.parse import urljoin
 
 import numpy.typing as npt
 
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
 
@@ -32,13 +32,11 @@ class AudioAsset:
 
     @property
     def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
-        audio_path = get_vllm_public_assets(filename=self.filename,
-                                            s3_prefix=ASSET_DIR)
+        audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
         return librosa.load(audio_path, sr=None)
 
     def get_local_path(self) -> Path:
-        return get_vllm_public_assets(filename=self.filename,
-                                      s3_prefix=ASSET_DIR)
+        return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
 
     @property
     def url(self) -> str:
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index 31cde431b5b6ad890841f535ffcf0aac654c79c8..5ca9de4076ad067c2090595f5a394718efbc5a4e 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -3,7 +3,6 @@
 
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
@@ -20,10 +19,9 @@ def get_cache_dir() -> Path:
 
 
 @lru_cache
-def get_vllm_public_assets(filename: str,
-                           s3_prefix: Optional[str] = None) -> Path:
+def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
     """
-    Download an asset file from ``s3://vllm-public-assets``
+    Download an asset file from `s3://vllm-public-assets`
     and return the path to the downloaded file.
     """
     asset_directory = get_cache_dir() / "vllm_public_assets"
@@ -36,6 +34,7 @@ def get_vllm_public_assets(filename: str,
         global_http_connection.download_file(
             f"{VLLM_S3_BUCKET_URL}/{filename}",
             asset_path,
-            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)
+            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+        )
 
     return asset_path
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 4639a11187d03df3018ad5e991b68b35dc102a11..c1a0f2b9cc29419ddcba018389fe68aea996cd19 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -12,12 +12,21 @@ from .base import get_vllm_public_assets
 
 VLM_IMAGES_DIR = "vision_model_images"
 
-ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato",
-                         "2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
-                         "Grayscale_8bits_palette_sample_image",
-                         "1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300",
-                         "231-200x300", "27-500x500", "17-150x600",
-                         "handelsblatt-preview", "paper-11"]
+ImageAssetName = Literal[
+    "stop_sign",
+    "cherry_blossom",
+    "hato",
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
+    "Grayscale_8bits_palette_sample_image",
+    "1280px-Venn_diagram_rgb",
+    "RGBA_comp",
+    "237-400x300",
+    "231-200x300",
+    "27-500x500",
+    "17-150x600",
+    "handelsblatt-preview",
+    "paper-11",
+]
 
 
 @dataclass(frozen=True)
@@ -28,12 +37,12 @@ class ImageAsset:
         """
         Return s3 path for given image.
         """
-        return get_vllm_public_assets(filename=f"{self.name}.{ext}",
-                                      s3_prefix=VLM_IMAGES_DIR)
+        return get_vllm_public_assets(
+            filename=f"{self.name}.{ext}", s3_prefix=VLM_IMAGES_DIR
+        )
 
     @property
     def pil_image(self, ext="jpg") -> Image.Image:
-
         image_path = self.get_path(ext)
         return Image.open(image_path)
 
@@ -42,7 +51,7 @@ class ImageAsset:
         """
         Image embeddings, only used for testing purposes with llava 1.5.
         """
-        image_path = self.get_path('pt')
+        image_path = self.get_path("pt")
         return torch.load(image_path, map_location="cpu", weights_only=True)
 
     def read_bytes(self, ext: str) -> bytes:
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index 5c9e403c4b91fc6c21bfc6c42dc457f8231423ec..d025368cbd43d24bfbea1e7365afc76a812f36cb 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -3,15 +3,14 @@
 
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, ClassVar, Literal, Optional
+from typing import Any, ClassVar, Literal
 
-import cv2
 import numpy as np
 import numpy.typing as npt
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 from .base import get_cache_dir
 
@@ -43,6 +42,8 @@ def download_video_asset(filename: str) -> str:
 
 
 def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
+    import cv2
+
     cap = cv2.VideoCapture(path)
     if not cap.isOpened():
         raise ValueError(f"Could not open video file {path}")
@@ -65,18 +66,21 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
 
     frames = np.stack(frames)
     if len(frames) < num_frames:
-        raise ValueError(f"Could not read enough frames from video file {path}"
-                         f" (expected {num_frames} frames, got {len(frames)})")
+        raise ValueError(
+            f"Could not read enough frames from video file {path}"
+            f" (expected {num_frames} frames, got {len(frames)})"
+        )
     return frames
 
 
-def video_to_pil_images_list(path: str,
-                             num_frames: int = -1) -> list[Image.Image]:
+def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Image]:
     frames = video_to_ndarrays(path, num_frames)
     return [Image.fromarray(frame) for frame in frames]
 
 
 def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
+    import cv2
+
     cap = cv2.VideoCapture(path)
     if not cap.isOpened():
         raise ValueError(f"Could not open video file {path}")
@@ -90,7 +94,7 @@ def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
 
     metadata = {
         "total_num_frames": num_frames,
-        "fps": fps,
+        "fps": duration / num_frames,
         "duration": duration,
         "video_backend": "opencv",
         "frames_indices": list(range(num_frames)),
@@ -136,10 +140,10 @@ class VideoAsset:
         ret = video_get_metadata(self.video_path, self.num_frames)
         return ret
 
-    def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
+    def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray:
         """
         Read audio data from the video asset, used in Qwen2.5-Omni examples.
-        
+
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
         return librosa.load(self.video_path, sr=sampling_rate)[0]
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py
index 1b37bd1f6100a00d420cea732c4653cc2c4b7620..dd35165d5415e0f5b1facd5d5d909a2d77327911 100644
--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -1,8 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+)
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index b49e1c007c57ea37136aea9ea14dd83bc95ce135..9275d70fd86a41ee9d79e6b9a41f736246e647d3 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -2,18 +2,25 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from typing import Generic, List, Optional, Protocol, Tuple, Type, TypeVar
+from typing import TYPE_CHECKING, ClassVar, Generic, Protocol, TypeVar, get_args
 
 import torch
 
+from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
 
+if TYPE_CHECKING:
+    from vllm.config.cache import CacheDType
+    from vllm.platforms.interface import DeviceCapability
+    from vllm.v1.attention.backends.utils import KVCacheLayoutType
+
 
 class AttentionType:
     """
     Attention type.
     Use string to be compatible with `torch.compile`.
     """
+
     DECODER = "decoder"
     """Decoder attention between previous layer Q/K/V."""
     ENCODER = "encoder"
@@ -24,20 +31,23 @@ class AttentionType:
     """Attention between dec. Q and enc. K/V for encoder-decoder."""
 
 
+class MultipleOf:
+    base: int
+
+    def __init__(self, base: int):
+        self.base = base
+
+
 class AttentionBackend(ABC):
     """Abstract class for attention backends."""
+
     # For some attention backends, we allocate an output tensor before
     # calling the custom op. When piecewise cudagraph is enabled, this
     # makes sure the output tensor is allocated inside the cudagraph.
     accept_output_buffer: bool = False
-
-    # Whether this backend supports receiving pre-quantized query input.
-    # If True, the attention layer will handle query quantization instead
-    # of the backend, allowing torch.compile to fuse quantization with
-    # previous operations.
-    # Needs to be worked through for all backends
-    # https://github.com/vllm-project/vllm/issues/25584
-    supports_quant_query_input: bool = False
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(1)]
+    supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = ["auto"]
 
     @staticmethod
     @abstractmethod
@@ -46,18 +56,9 @@ class AttentionBackend(ABC):
 
     @staticmethod
     @abstractmethod
-    def get_impl_cls() -> Type["AttentionImpl"]:
+    def get_impl_cls() -> type["AttentionImpl"]:
         raise NotImplementedError
 
-    @staticmethod
-    @abstractmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        raise NotImplementedError
-
-    @classmethod
-    def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
-        return cls.get_metadata_cls()(*args, **kwargs)
-
     @staticmethod
     @abstractmethod
     def get_builder_cls():  # -> Type["AttentionMetadataBuilder"]:
@@ -71,17 +72,161 @@ class AttentionBackend(ABC):
         num_kv_heads: int,
         head_size: int,
         cache_dtype_str: str = "auto",
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         raise NotImplementedError
 
     @staticmethod
-    def get_kv_cache_stride_order() -> Tuple[int, ...]:
+    def get_kv_cache_stride_order() -> tuple[int, ...]:
         raise NotImplementedError
 
     @classmethod
     def full_cls_name(cls) -> tuple[str, str]:
         return (cls.__module__, cls.__qualname__)
 
+    @classmethod
+    def get_supported_head_sizes(cls) -> list[int]:
+        return []
+
+    @classmethod
+    def supports_head_size(cls, head_size: int) -> bool:
+        supported_head_sizes = cls.get_supported_head_sizes()
+        return (not supported_head_sizes) or head_size in supported_head_sizes
+
+    @classmethod
+    def supports_dtype(cls, dtype: torch.dtype) -> bool:
+        return dtype in cls.supported_dtypes
+
+    @classmethod
+    def supports_kv_cache_dtype(cls, kv_cache_dtype: "CacheDType | None") -> bool:
+        if kv_cache_dtype is None:
+            return True
+        return (not cls.supported_kv_cache_dtypes) or (
+            kv_cache_dtype in cls.supported_kv_cache_dtypes
+        )
+
+    @classmethod
+    def supports_block_size(cls, block_size: int | None) -> bool:
+        from vllm.config.cache import BlockSize
+
+        if block_size is None:
+            return True
+
+        valid_sizes = get_args(BlockSize)
+        if block_size not in valid_sizes:
+            return False
+
+        if not cls.supported_kernel_block_sizes:
+            return True
+
+        for supported_size in cls.supported_kernel_block_sizes:
+            is_multiple_of = (
+                isinstance(supported_size, MultipleOf)
+                and block_size % supported_size.base == 0
+            )
+            is_int_equal = (
+                isinstance(supported_size, int) and block_size == supported_size
+            )
+            if is_multiple_of or is_int_equal:
+                return True
+        return False
+
+    @classmethod
+    def is_mla(cls) -> bool:
+        return False
+
+    @classmethod
+    def supports_sink(cls) -> bool:
+        return False
+
+    @classmethod
+    def is_sparse(cls) -> bool:
+        return False
+
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """Check if backend supports a given attention type.
+
+        By default, only supports decoder attention.
+        Backends should override this to support other attention types.
+        """
+        from vllm.attention import AttentionType
+
+        return attn_type == AttentionType.DECODER
+
+    @classmethod
+    def supports_compute_capability(cls, capability: "DeviceCapability") -> bool:
+        return True
+
+    @classmethod
+    def supports_combination(
+        cls,
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: "CacheDType | None",
+        block_size: int | None,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        device_capability: "DeviceCapability",
+    ) -> str | None:
+        return None
+
+    @classmethod
+    def validate_configuration(
+        cls,
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: "CacheDType | None",
+        block_size: int | None,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        device_capability: "DeviceCapability",
+        attn_type: str,
+    ) -> list[str]:
+        invalid_reasons = []
+        if not cls.supports_head_size(head_size):
+            invalid_reasons.append("head_size not supported")
+        if not cls.supports_dtype(dtype):
+            invalid_reasons.append("dtype not supported")
+        if not cls.supports_kv_cache_dtype(kv_cache_dtype):
+            invalid_reasons.append("kv_cache_dtype not supported")
+        if not cls.supports_block_size(block_size):
+            invalid_reasons.append("block_size not supported")
+        if use_mla != cls.is_mla():
+            if use_mla:
+                invalid_reasons.append("MLA not supported")
+            else:
+                invalid_reasons.append("non-MLA not supported")
+        if has_sink and not cls.supports_sink():
+            invalid_reasons.append("sink setting not supported")
+        if use_sparse != cls.is_sparse():
+            if use_sparse:
+                invalid_reasons.append("sparse not supported")
+            else:
+                invalid_reasons.append("non-sparse not supported")
+        if not cls.supports_compute_capability(device_capability):
+            invalid_reasons.append("compute capability not supported")
+        if not cls.supports_attn_type(attn_type):
+            invalid_reasons.append(f"attention type {attn_type} not supported")
+        combination_reason = cls.supports_combination(
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            use_mla,
+            has_sink,
+            use_sparse,
+            device_capability,
+        )
+        if combination_reason is not None:
+            invalid_reasons.append(combination_reason)
+        return invalid_reasons
+
+    @classmethod
+    def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
+        return None
+
 
 class AttentionMetadata:
     pass
@@ -91,7 +236,6 @@ T = TypeVar("T", bound=AttentionMetadata)
 
 
 class AttentionLayer(Protocol):
-
     _q_scale: torch.Tensor
     _k_scale: torch.Tensor
     _v_scale: torch.Tensor
@@ -107,12 +251,10 @@ class AttentionLayer(Protocol):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
-    ) -> torch.Tensor:
-        ...
+    ) -> torch.Tensor: ...
 
 
 class AttentionImpl(ABC, Generic[T]):
-
     # Whether the attention impl can return the softmax lse for decode.
     # Some features like decode context parallelism require the softmax lse.
     can_return_lse_for_decode: bool = False
@@ -129,14 +271,16 @@ class AttentionImpl(ABC, Generic[T]):
         self = super().__new__(cls)
         try:
             from vllm.distributed.parallel_state import get_dcp_group
+
             self.dcp_world_size = get_dcp_group().world_size
             self.dcp_rank = get_dcp_group().rank_in_group
         except AssertionError:
             # DCP might not be initialized in testing
             self.dcp_world_size = 1
             self.dcp_rank = 0
-        self.need_to_return_lse_for_decode = self.dcp_world_size > 1 \
-            and self.can_return_lse_for_decode
+        self.need_to_return_lse_for_decode = (
+            self.dcp_world_size > 1 and self.can_return_lse_for_decode
+        )
         return self
 
     @abstractmethod
@@ -145,13 +289,13 @@ class AttentionImpl(ABC, Generic[T]):
         num_heads: int,
         head_size: int,
         scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[List[float]] = None,
-        sliding_window: Optional[int] = None,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        sliding_window: int | None = None,
         kv_cache_dtype: str = "auto",
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         raise NotImplementedError
 
@@ -164,9 +308,9 @@ class AttentionImpl(ABC, Generic[T]):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: T,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError
 
@@ -181,8 +325,51 @@ class AttentionImpl(ABC, Generic[T]):
         """
         return False
 
+    def supports_quant_query_input(self) -> bool:
+        """
+        Check if this attention implementation supports pre-quantized query input.
+
+        When True, the attention layer will quantize queries before passing them
+        to this backend, allowing torch.compile to fuse the quantization with
+        previous operations. This is typically supported when using FP8 KV cache
+        with compatible attention kernels (e.g., TRT-LLM).
+        TODO add support to more backends:
+        https://github.com/vllm-project/vllm/issues/25584
+
+        Returns:
+            bool: True if the implementation can accept pre-quantized queries.
+        """
+        return False
+
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        pass
+
 
 class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
+    @abstractmethod
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        qk_head_dim: int,
+        v_head_dim: int,
+        kv_b_proj: ColumnParallelLinear,
+        indexer: object | None = None,
+    ) -> None:
+        raise NotImplementedError
 
     @abstractmethod
     def forward(
@@ -193,9 +380,9 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
         k_pe: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: T,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError
 
diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..f07a6059be377468a7b5c611eed6e4a8f3ee81a8
--- /dev/null
+++ b/vllm/attention/backends/registry.py
@@ -0,0 +1,195 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Attention backend registry"""
+
+import enum
+from collections.abc import Callable
+from typing import TYPE_CHECKING, cast
+
+from vllm.logger import init_logger
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
+
+logger = init_logger(__name__)
+
+
+class _AttentionBackendEnumMeta(enum.EnumMeta):
+    """Metaclass for AttentionBackendEnum to provide better error messages."""
+
+    def __getitem__(cls, name: str):
+        """Get backend by name with helpful error messages."""
+        try:
+            return super().__getitem__(name)
+        except KeyError:
+            members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values()
+            valid_backends = ", ".join(m.name for m in members)
+            raise ValueError(
+                f"Unknown attention backend: '{name}'. "
+                f"Valid options are: {valid_backends}"
+            ) from None
+
+
+class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
+    """Enumeration of all supported attention backends.
+
+    The enum value is the default class path, but this can be overridden
+    at runtime using register_backend().
+
+    To get the actual backend class (respecting overrides), use:
+        backend.get_class()
+    """
+
+    FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+    TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
+    XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"
+    ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
+    ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"
+    ROCM_AITER_FA = (
+        "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
+    )
+    TORCH_SDPA = ""  # this tag is only used for ViT
+    FLASHINFER = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
+    FLASHINFER_MLA = (
+        "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend"
+    )
+    TRITON_MLA = "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend"
+    CUTLASS_MLA = "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
+    FLASHMLA = "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend"
+    FLASHMLA_SPARSE = (
+        "vllm.v1.attention.backends.mla.flashmla_sparse.FlashMLASparseBackend"
+    )
+    FLASH_ATTN_MLA = "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend"
+    PALLAS = "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
+    IPEX = "vllm.v1.attention.backends.ipex.IpexAttentionBackend"
+    NO_ATTENTION = "vllm.v1.attention.backends.no_attention.NoAttentionBackend"
+    FLEX_ATTENTION = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
+    TREE_ATTN = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"
+    ROCM_AITER_UNIFIED_ATTN = (
+        "vllm.v1.attention.backends.rocm_aiter_unified_attn."
+        "RocmAiterUnifiedAttentionBackend"
+    )
+    CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
+    # Placeholder for third-party/custom backends - must be registered before use
+    CUSTOM = ""
+
+    def get_path(self, include_classname: bool = True) -> str:
+        """Get the class path for this backend (respects overrides).
+
+        Returns:
+            The fully qualified class path string
+
+        Raises:
+            ValueError: If Backend.CUSTOM is used without being registered
+        """
+        path = _OVERRIDES.get(self, self.value)
+        if not path:
+            raise ValueError(
+                f"Backend {self.name} must be registered before use. "
+                f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')"
+            )
+        if not include_classname:
+            path = path.rsplit(".", 1)[0]
+        return path
+
+    def get_class(self) -> "type[AttentionBackend]":
+        """Get the backend class (respects overrides).
+
+        Returns:
+            The backend class
+
+        Raises:
+            ImportError: If the backend class cannot be imported
+            ValueError: If Backend.CUSTOM is used without being registered
+        """
+        return resolve_obj_by_qualname(self.get_path())
+
+    def is_overridden(self) -> bool:
+        """Check if this backend has been overridden.
+
+        Returns:
+            True if the backend has a registered override
+        """
+        return self in _OVERRIDES
+
+    def clear_override(self) -> None:
+        """Clear any override for this backend, reverting to the default."""
+        _OVERRIDES.pop(self, None)
+
+
+_OVERRIDES: dict[AttentionBackendEnum, str] = {}
+
+
+def register_backend(
+    backend: AttentionBackendEnum, class_path: str | None = None
+) -> Callable[[type], type]:
+    """Register or override a backend implementation.
+
+    Args:
+        backend: The AttentionBackendEnum member to register
+        class_path: Optional class path. If not provided and used as
+            decorator, will be auto-generated from the class.
+
+    Returns:
+        Decorator function if class_path is None, otherwise a no-op
+
+    Examples:
+        # Override an existing backend
+        @register_backend(AttentionBackendEnum.FLASH_ATTN)
+        class MyCustomFlashAttn:
+            ...
+
+        # Register a custom third-party backend
+        @register_backend(AttentionBackendEnum.CUSTOM)
+        class MyCustomBackend:
+            ...
+
+        # Direct registration
+        register_backend(
+            AttentionBackendEnum.CUSTOM,
+            "my.module.MyCustomBackend"
+        )
+    """
+
+    def decorator(cls: type) -> type:
+        _OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}"
+        return cls
+
+    if class_path is not None:
+        _OVERRIDES[backend] = class_path
+        return lambda x: x
+
+    return decorator
+
+
+# Backwards compatibility alias for plugins
+class _BackendMeta(type):
+    """Metaclass to provide deprecation warnings when accessing _Backend."""
+
+    def __getattribute__(cls, name: str):
+        if name not in ("__class__", "__mro__", "__name__"):
+            logger.warning(
+                "_Backend has been renamed to AttentionBackendEnum. "
+                "Please update your code to use AttentionBackendEnum instead. "
+                "_Backend will be removed in a future release."
+            )
+        return getattr(AttentionBackendEnum, name)
+
+    def __getitem__(cls, name: str):
+        logger.warning(
+            "_Backend has been renamed to AttentionBackendEnum. "
+            "Please update your code to use AttentionBackendEnum instead. "
+            "_Backend will be removed in a future release."
+        )
+        return AttentionBackendEnum[name]
+
+
+class _Backend(metaclass=_BackendMeta):
+    """Deprecated: Use AttentionBackendEnum instead.
+
+    This class is provided for backwards compatibility with plugins
+    and will be removed in a future release.
+    """
+
+    pass
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 6b8d97be705079dbe18704e3b7b69eee0561be86..4c7fa477b52ba58fcb1d3484de8157f1709f1841 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention backend utils"""
+
 from dataclasses import dataclass
-from typing import Optional
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
@@ -14,7 +14,7 @@ PAD_SLOT_ID = -1
 
 @dataclass
 class MLADims:
-    q_lora_rank: Optional[int]
+    q_lora_rank: int | None
     kv_lora_rank: int
     qk_nope_head_dim: int
     qk_rope_head_dim: int
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 79879b6805bcbe8f153e94e8e324e3583b17933a..37f9a4b383ce95dd9cff879b2e91f0c03d8b34f8 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer."""
-from typing import List, Optional
+
+from collections.abc import Callable
+from typing import cast
 
 import torch
 import torch.nn as nn
@@ -9,33 +11,47 @@ import torch.nn.functional as F
 
 import vllm.envs as envs
 from vllm.attention import AttentionType
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import backend_name_to_enum, get_attn_backend
+from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.selector import get_attn_backend
 from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
+from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer
 from vllm.config import CacheConfig, get_current_vllm_config
-from vllm.distributed.kv_transfer import (get_kv_transfer_group,
-                                          has_kv_transfer_group,
-                                          is_v1_kv_transfer_group)
+from vllm.config.multimodal import MultiModalConfig
+from vllm.config.vllm import VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.models.vision import get_vit_attn_backend
-from vllm.platforms import _Backend, current_platform
-from vllm.utils import GiB_bytes, direct_register_custom_op
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import (
+    direct_register_custom_op,
+    kv_cache_dtype_str_to_dtype,
+)
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheSpec,
+    MLAAttentionSpec,
+    SlidingWindowSpec,
+)
 
+if current_platform.is_rocm():
+    from vllm.platforms.rocm import on_gfx9
+else:
+    on_gfx9 = lambda *args, **kwargs: False
+
+
+FP8_DTYPE = current_platform.fp8_dtype()
 logger = init_logger(__name__)
 USE_XFORMERS_OPS = None
-try:
-    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
-except AttributeError:
-    tag_cudagraph_unsafe = ()  # type: ignore[assignment]
 
 
 def check_xformers_availability():
@@ -43,8 +59,7 @@ def check_xformers_availability():
     if USE_XFORMERS_OPS is not None:
         return USE_XFORMERS_OPS
 
-    if current_platform.is_cuda() and current_platform.has_device_capability(
-            100):
+    if current_platform.is_cuda() and current_platform.has_device_capability(100):
         # Xformers FA is not compatible with B200
         USE_XFORMERS_OPS = False
     else:
@@ -64,13 +79,135 @@ def check_xformers_availability():
 
 
 def check_upstream_fa_availability(dtype: torch.dtype):
-    if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda(
-    ) and current_platform.has_device_capability(80):
+    if (
+        dtype in (torch.float16, torch.bfloat16)
+        and current_platform.is_cuda()
+        and current_platform.has_device_capability(80)
+    ):
         from transformers.utils import is_flash_attn_2_available
+
         return is_flash_attn_2_available()
+    if current_platform.is_rocm():
+        from importlib.util import find_spec
+
+        return find_spec("flash_attn") is not None
     return False
 
 
+def maybe_get_vit_flash_attn_backend(
+    attn_backend: AttentionBackendEnum,
+    use_upstream_fa: bool,
+    attn_backend_override: AttentionBackendEnum | None = None,
+) -> tuple[AttentionBackendEnum, Callable | None]:
+    if current_platform.is_rocm():
+        if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
+            attn_backend = AttentionBackendEnum.ROCM_AITER_FA
+
+        elif (
+            check_upstream_fa_availability(torch.get_default_dtype())
+            and on_gfx9()
+            and attn_backend_override is None
+        ):
+            attn_backend = AttentionBackendEnum.FLASH_ATTN
+            use_upstream_fa = True
+        else:
+            return AttentionBackendEnum.TORCH_SDPA, None
+
+    elif current_platform.is_cuda():
+        if (
+            attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            attn_backend = AttentionBackendEnum.FLASH_ATTN
+            use_upstream_fa = True
+    elif current_platform.is_xpu():
+        assert attn_backend == AttentionBackendEnum.FLASH_ATTN, (
+            "XPU platform only supports FLASH_ATTN as vision attention backend."
+        )
+        use_upstream_fa = False
+    else:
+        return AttentionBackendEnum.TORCH_SDPA, None
+
+    if attn_backend in {
+        AttentionBackendEnum.FLASH_ATTN,
+        AttentionBackendEnum.ROCM_AITER_FA,
+    }:
+        if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
+            from aiter import flash_attn_varlen_func
+        else:
+            if use_upstream_fa:
+                from flash_attn import flash_attn_varlen_func
+            else:
+                from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+    else:
+        flash_attn_varlen_func = None
+
+    return attn_backend, flash_attn_varlen_func
+
+
+def _init_kv_cache_quant(
+    layer: nn.Module,
+    quant_config: QuantizationConfig | None,
+    prefix: str,
+    kv_cache_dtype: str,
+    calculate_kv_scales: bool,
+) -> None:
+    """Initializes KV cache scaling factors and quantization method.
+
+    This helper function sets up the KV cache quantization attributes that are
+    shared between Attention and MLAAttention layers. It initializes scale
+    tensors for query, key, value, and probability, and configures the
+    quantization method if applicable.
+
+    Args:
+        layer: The attention layer instance to initialize.
+        quant_config: Optional quantization configuration.
+        prefix: Layer name prefix for quantization method lookup.
+        kv_cache_dtype: The KV cache data type string.
+        calculate_kv_scales: Whether to calculate KV scales dynamically.
+    """
+    # The default k/v_scale is set to 1.0. This is ignored
+    # when kv-cache is not fp8, and should be used with
+    # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
+    # expect the pre-quantized k/v_scale to be loaded along
+    # with the model weights.
+    layer.kv_cache_dtype = kv_cache_dtype
+    layer.calculate_kv_scales = calculate_kv_scales
+    layer._k_scale = torch.tensor(1.0, dtype=torch.float32)
+    layer._v_scale = torch.tensor(1.0, dtype=torch.float32)
+    layer._q_scale = torch.tensor(1.0, dtype=torch.float32)
+    layer._prob_scale = torch.tensor(1.0, dtype=torch.float32)
+
+    # We also keep q/k/v_scale on host (cpu) memory for attention
+    # backends that require the scales to be on host instead of on device.
+    # e.g. Flashinfer
+    layer._q_scale_float = 1.0
+    layer._k_scale_float = 1.0
+    layer._v_scale_float = 1.0
+
+    # The output scale on host memory. This should be the input scale of
+    # the quant op after this attention layer.
+    layer._o_scale_float = None
+
+    quant_method = (
+        quant_config.get_quant_method(layer, prefix=prefix) if quant_config else None
+    )
+    if quant_method is not None and not isinstance(
+        quant_method, UnquantizedLinearMethod
+    ):
+        assert isinstance(quant_method, BaseKVCacheMethod)
+        # TODO (mgoin): kv cache dtype should be specified in the FP8
+        # checkpoint config and become the "auto" behavior
+        if kv_cache_dtype == "fp8_e5m2":
+            raise ValueError("fp8_e5m2 kv-cache is not supported with fp8 checkpoints.")
+        # If quantization is enabled, we make "k_scale" and "v_scale"
+        # parameters so that it can be loaded from the model checkpoint.
+        # The k/v_scale will then be converted back to native float32
+        # values after weight loading.
+        layer.quant_method = quant_method
+        layer.quant_method.create_weights(layer)
+
+
 class Attention(nn.Module, AttentionLayerBase):
     """Attention layer.
 
@@ -88,18 +225,16 @@ class Attention(nn.Module, AttentionLayerBase):
         num_heads: int,
         head_size: int,
         scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[List[float]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        logits_soft_cap: Optional[float] = None,
-        per_layer_sliding_window: Optional[int] = None,
-        use_mla: bool = False,
-        use_sparse: bool = False,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        logits_soft_cap: float | None = None,
+        per_layer_sliding_window: int | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        attn_backend: Optional[type[AttentionBackend]] = None,
+        kv_sharing_target_layer_name: str | None = None,
+        attn_backend: type[AttentionBackend] | None = None,
         **extra_impl_args,
     ) -> None:
         """
@@ -116,6 +251,7 @@ class Attention(nn.Module, AttentionLayerBase):
         else:
             sliding_window = None
 
+        vllm_config = get_current_vllm_config()
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
@@ -124,82 +260,57 @@ class Attention(nn.Module, AttentionLayerBase):
             kv_cache_dtype = "auto"
             block_size = 16
             calculate_kv_scales = False
+        self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype(
+            kv_cache_dtype, vllm_config.model_config
+        )
         if num_kv_heads is None:
             num_kv_heads = num_heads
-        assert num_heads % num_kv_heads == 0, \
-            f"num_heads ({num_heads}) is not " \
-            f"divisible by num_kv_heads ({num_kv_heads})"
-
-        # The default k/v_scale is set to 1.0. This is ignored
-        # when kv-cache is not fp8, and should be used with
-        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
-        # expect the pre-quantized k/v_scale to be loaded along
-        # with the model weights.
-        self.kv_cache_dtype = kv_cache_dtype
-        self.calculate_kv_scales = calculate_kv_scales
-        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
-        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
-        # FlashAttn doesn't support quantizing the kv-cache only
-        # but requires q to be quantized as well.
-        self._q_scale = torch.tensor(1.0, dtype=torch.float32)
-        self._prob_scale = torch.tensor(1.0, dtype=torch.float32)
-
-        # We also keep q/k/v_scale on host (cpu) memory for attention
-        # backends that require the scales to be on host instead of on device.
-        # e.g. Flashinfer
-        self._q_scale_float = 1.0
-        self._k_scale_float = 1.0
-        self._v_scale_float = 1.0
-
-        # The output scale on host memory. This should be the input scale of
-        # the quant op after this attention layer.
-        self._o_scale_float: Optional[float] = None
-
-        self.use_mla = use_mla
-        self.use_sparse = use_sparse
+        assert num_heads % num_kv_heads == 0, (
+            f"num_heads ({num_heads}) is not divisible by num_kv_heads ({num_kv_heads})"
+        )
+
+        # Initialize KV cache quantization attributes
+        _init_kv_cache_quant(
+            self, quant_config, prefix, kv_cache_dtype, calculate_kv_scales
+        )
+
         self.num_heads = num_heads
         self.head_size = head_size
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
         self.has_sink = extra_impl_args.get("sinks") is not None
 
-        quant_method = quant_config.get_quant_method(
-            self, prefix=prefix) if quant_config else None
-        if quant_method is not None and not isinstance(
-                quant_method, UnquantizedLinearMethod):
-            assert isinstance(quant_method, BaseKVCacheMethod)
-            # TODO (mgoin): kv cache dtype should be specified in the FP8
-            # checkpoint config and become the "auto" behavior
-            if self.kv_cache_dtype == "fp8_e5m2":
-                raise ValueError("fp8_e5m2 kv-cache is not supported with "
-                                 "fp8 checkpoints.")
-            # If quantization is enabled, we make "k_scale" and "v_scale"
-            # parameters so that it can be loaded from the model checkpoint.
-            # The k/v_scale will then be converted back to native float32
-            # values after weight loading.
-            self.quant_method = quant_method
-            self.quant_method.create_weights(self)
-
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         if attn_backend is None:
-            self.attn_backend = get_attn_backend(head_size,
-                                                 dtype,
-                                                 kv_cache_dtype,
-                                                 block_size,
-                                                 use_mla=use_mla,
-                                                 has_sink=self.has_sink,
-                                                 use_sparse=use_sparse)
+            self.attn_backend = get_attn_backend(
+                head_size,
+                dtype,
+                kv_cache_dtype,
+                block_size,
+                use_mla=False,
+                has_sink=self.has_sink,
+                attn_type=attn_type,
+            )
         else:
             self.attn_backend = attn_backend
 
         impl_cls = self.attn_backend.get_impl_cls()
-        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
-                             alibi_slopes, sliding_window, kv_cache_dtype,
-                             logits_soft_cap, attn_type,
-                             kv_sharing_target_layer_name, **extra_impl_args)
-        self.backend = backend_name_to_enum(self.attn_backend.get_name())
+        self.impl = impl_cls(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **extra_impl_args,
+        )
+        self.backend = AttentionBackendEnum[self.attn_backend.get_name()]
         self.dtype = dtype
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
@@ -209,7 +320,7 @@ class Attention(nn.Module, AttentionLayerBase):
         self.use_direct_call = not current_platform.opaque_attention_op()
 
         self.use_output = self.attn_backend.accept_output_buffer
-        compilation_config = get_current_vllm_config().compilation_config
+        compilation_config = vllm_config.compilation_config
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
@@ -228,37 +339,22 @@ class Attention(nn.Module, AttentionLayerBase):
         # by bind_kv_cache
         # this variable will not be accessed if use_direct_call is True
         self.kv_cache = [
-            torch.tensor([]) for _ in range(get_current_vllm_config(
-            ).parallel_config.pipeline_parallel_size)
+            torch.tensor([])
+            for _ in range(vllm_config.parallel_config.pipeline_parallel_size)
         ]
 
-        try:
-            self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT,
-                                        dtype=torch.float32)
-            self.k_range = torch.tensor(envs.K_SCALE_CONSTANT,
-                                        dtype=torch.float32)
-            self.v_range = torch.tensor(envs.V_SCALE_CONSTANT,
-                                        dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error(
-                "Failed to initialize attention q/k/v range constants: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug("Allocated: %.2f GiB",
-                             torch.cuda.memory_allocated() / GiB_bytes)
-                logger.debug("Reserved: %.2f GiB",
-                             torch.cuda.memory_reserved() / GiB_bytes)
-            raise RuntimeError(
-                "Failed to initialize q/k/v range constants. "
-                "This may be caused by insufficient memory to allocate "
-                "kv cache.") from e
+        # Initialize q/k/v range constants.
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
 
         # for attn backends supporting query quantization
         self.query_quant = None
-        if self.kv_cache_dtype.startswith(
-                "fp8") and self.attn_backend.supports_quant_query_input:
-            self.query_quant = QuantFP8(static=True,
-                                        group_shape=GroupShape.PER_TENSOR)
+        if (
+            self.kv_cache_dtype.startswith("fp8")
+            and self.impl.supports_quant_query_input()
+        ):
+            self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR)
 
     def forward(
         self,
@@ -268,7 +364,7 @@ class Attention(nn.Module, AttentionLayerBase):
         # For some alternate attention backends like MLA the attention output
         # shape does not match the query shape, so we optionally let the model
         # definition specify the output tensor shape.
-        output_shape: Optional[torch.Size] = None,
+        output_shape: torch.Size | None = None,
     ) -> torch.Tensor:
         """
         The KV cache is stored inside this class and is accessed via
@@ -280,10 +376,7 @@ class Attention(nn.Module, AttentionLayerBase):
         `vllm.forward_context.get_forward_context().attn_metadata`.
         """
         if self.calculate_kv_scales:
-            attn_metadata = get_forward_context().attn_metadata
-            if attn_metadata.enable_kv_scales_calculation:
-                self.calc_kv_scales(query, key, value)
-
+            torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name)
         output_dtype = query.dtype
         if self.query_quant is not None:
             # quantizing with a simple torch operation enables
@@ -292,44 +385,37 @@ class Attention(nn.Module, AttentionLayerBase):
             # Otherwise queries are quantized using custom ops
             # which causes decoding overheads
             assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
-            query, _ = self.query_quant(query, self._q_scale)
+
+            # check if query quantization is supported
+            if self.impl.supports_quant_query_input():
+                query, _ = self.query_quant(query, self._q_scale)
 
         if self.use_output:
-            output_shape = (output_shape
-                            if output_shape is not None else query.shape)
-            output = torch.zeros(output_shape,
-                                 dtype=output_dtype,
-                                 device=query.device)
+            output_shape = output_shape if output_shape is not None else query.shape
+            output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
             hidden_size = output_shape[-1]
-            # We skip reshaping query, key and value tensors for the MLA
-            # backend since these tensors have different semantics and are
-            # processed differently.
-            if not self.use_mla:
-                # Reshape the query, key, and value tensors.
-                # NOTE(woosuk): We do this outside the custom op to minimize the
-                # CPU overheads from the non-CUDA-graph regions.
-                query = query.view(-1, self.num_heads, self.head_size)
-                output = output.view(-1, self.num_heads, self.head_size)
-                if key is not None:
-                    key = key.view(-1, self.num_kv_heads, self.head_size)
-                if value is not None:
-                    value = value.view(-1, self.num_kv_heads, self.head_size)
+            # Reshape the query, key, and value tensors.
+            # NOTE(woosuk): We do this outside the custom op to minimize the
+            # CPU overheads from the non-CUDA-graph regions.
+            query = query.view(-1, self.num_heads, self.head_size)
+            output = output.view(-1, self.num_heads, self.head_size)
+            if key is not None:
+                key = key.view(-1, self.num_kv_heads, self.head_size)
+            if value is not None:
+                value = value.view(-1, self.num_kv_heads, self.head_size)
             if self.use_direct_call:
                 forward_context: ForwardContext = get_forward_context()
                 attn_metadata = forward_context.attn_metadata
                 if isinstance(attn_metadata, dict):
                     attn_metadata = attn_metadata[self.layer_name]
                 self_kv_cache = self.kv_cache[forward_context.virtual_engine]
-                self.impl.forward(self,
-                                  query,
-                                  key,
-                                  value,
-                                  self_kv_cache,
-                                  attn_metadata,
-                                  output=output)
+                self.impl.forward(
+                    self, query, key, value, self_kv_cache, attn_metadata, output=output
+                )
             else:
                 torch.ops.vllm.unified_attention_with_output(
-                    query, key, value, output, self.layer_name)
+                    query, key, value, output, self.layer_name
+                )
             return output.view(-1, hidden_size)
         else:
             if self.use_direct_call:
@@ -338,11 +424,13 @@ class Attention(nn.Module, AttentionLayerBase):
                 if isinstance(attn_metadata, dict):
                     attn_metadata = attn_metadata[self.layer_name]
                 self_kv_cache = self.kv_cache[forward_context.virtual_engine]
-                return self.impl.forward(self, query, key, value,
-                                         self_kv_cache, attn_metadata)
+                return self.impl.forward(
+                    self, query, key, value, self_kv_cache, attn_metadata
+                )
             else:
                 return torch.ops.vllm.unified_attention(
-                    query, key, value, self.layer_name)
+                    query, key, value, self.layer_name
+                )
 
     def calc_kv_scales(self, query, key, value):
         self._q_scale.copy_(torch.abs(query).max() / self.q_range)
@@ -363,21 +451,35 @@ class Attention(nn.Module, AttentionLayerBase):
         return s
 
     def process_weights_after_loading(self, act_dtype: torch.dtype):
-        if hasattr(self.impl, "process_weights_after_loading"):
-            self.impl.process_weights_after_loading(act_dtype)
-
-        # FlashInfer requires attention sinks to be float32
-        if (self.backend == _Backend.FLASHINFER
-                and hasattr(self.impl, 'sinks')):
-            from vllm.v1.attention.backends.flashinfer import FlashInferImpl
-            assert isinstance(self.impl, FlashInferImpl)
-            if (self.impl.sinks is not None
-                    and self.impl.sinks.dtype != torch.float32):
-                self.impl.sinks = self.impl.sinks.to(torch.float32)
+        self.impl.process_weights_after_loading(act_dtype)
 
     def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend
 
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Block size may get updated after model loading, refresh it
+        block_size = vllm_config.cache_config.block_size
+        # Should not be called for enc-dec or encoder-only attention.
+        assert self.attn_type == AttentionType.DECODER
+        if self.sliding_window is not None:
+            assert not vllm_config.model_config.use_mla, (
+                "MLA is not supported for slidingwindow"
+            )
+            return SlidingWindowSpec(
+                block_size=block_size,
+                num_kv_heads=self.num_kv_heads,
+                head_size=self.head_size,
+                dtype=self.kv_cache_torch_dtype,
+                sliding_window=self.sliding_window,
+            )
+        else:
+            return FullAttentionSpec(
+                block_size=block_size,
+                num_kv_heads=self.num_kv_heads,
+                head_size=self.head_size,
+                dtype=self.kv_cache_torch_dtype,
+            )
+
 
 class MultiHeadAttention(nn.Module):
     """Multi-headed attention without any cache, used for ViT."""
@@ -387,17 +489,23 @@ class MultiHeadAttention(nn.Module):
         num_heads: int,
         head_size: int,
         scale: float,
-        num_kv_heads: Optional[int] = None,
-    ):
+        num_kv_heads: int | None = None,
+        # This has no effect, it is only here to make it easier to swap
+        # between Attention and MultiHeadAttention
+        prefix: str = "",
+        multimodal_config: MultiModalConfig | None = None,
+    ) -> None:
         super().__init__()
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = scale
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+        self.layer_name = prefix
 
-        assert self.num_heads % self.num_kv_heads == 0, \
-            f"num_heads ({self.num_heads}) is not " \
+        assert self.num_heads % self.num_kv_heads == 0, (
+            f"num_heads ({self.num_heads}) is not "
             f"divisible by num_kv_heads ({self.num_kv_heads})"
+        )
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
         # During model initialization, the default dtype is set as the model
@@ -405,45 +513,64 @@ class MultiHeadAttention(nn.Module):
         dtype = torch.get_default_dtype()
 
         # Determine the attention backend
-        backend = get_vit_attn_backend(head_size=head_size, dtype=dtype)
+        attn_backend_override = None
+        if multimodal_config is not None:
+            attn_backend_override = multimodal_config.mm_encoder_attn_backend
+        backend = get_vit_attn_backend(
+            head_size=head_size,
+            dtype=dtype,
+            attn_backend_override=attn_backend_override,
+        )
 
         # Some auto-selected backends can be upgraded
         # to upstream flash attention if available.
         # If vllm native fa is selected, we use it directly.
         use_upstream_fa = False
-        if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability(
-                dtype):
-            backend = _Backend.FLASH_ATTN
-            use_upstream_fa = True
 
-        if current_platform.is_rocm() or current_platform.is_xpu():
-            # currently, only torch_sdpa is supported on rocm/xpu
-            self.attn_backend = _Backend.TORCH_SDPA
-        else:
-
-            self.attn_backend = backend if backend in {
-                _Backend.TORCH_SDPA,
-                _Backend.XFORMERS,
-                _Backend.PALLAS,
-                _Backend.ROCM_AITER_FA,
-                _Backend.FLASH_ATTN,
-            } else _Backend.TORCH_SDPA
-
-        if (self.attn_backend == _Backend.XFORMERS
-                and not check_xformers_availability()):
-            self.attn_backend = _Backend.TORCH_SDPA
-
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            if use_upstream_fa:
-                from flash_attn import flash_attn_varlen_func
-                self._flash_attn_varlen_func = flash_attn_varlen_func
-            else:
-                from vllm.vllm_flash_attn import flash_attn_varlen_func
-                self._flash_attn_varlen_func = flash_attn_varlen_func
+        self.attn_backend = (
+            backend
+            if backend
+            in {
+                AttentionBackendEnum.TORCH_SDPA,
+                AttentionBackendEnum.XFORMERS,
+                AttentionBackendEnum.PALLAS,
+                AttentionBackendEnum.ROCM_AITER_FA,
+                AttentionBackendEnum.FLASH_ATTN,
+            }
+            else AttentionBackendEnum.TORCH_SDPA
+        )
+
+        self.attn_backend, self._flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
+
+        if (
+            self.attn_backend == AttentionBackendEnum.XFORMERS
+            and not check_xformers_availability()
+        ):
+            self.attn_backend = AttentionBackendEnum.TORCH_SDPA
+
+        self.is_flash_attn_backend = self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }
+
+        # this condition is just to make sure that the
+        # use_upstream_fa in the log is correct
+        if (
+            current_platform.is_rocm()
+            and self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+        ):
+            use_upstream_fa = True
 
         logger.info_once(
             f"MultiHeadAttention attn_backend: {self.attn_backend}, "
-            f"use_upstream_fa: {use_upstream_fa}")
+            f"use_upstream_fa: {use_upstream_fa}"
+        )
 
     def forward(
         self,
@@ -451,7 +578,7 @@ class MultiHeadAttention(nn.Module):
         key: torch.Tensor,
         value: torch.Tensor,
     ) -> torch.Tensor:
-        """Input shape: 
+        """Input shape:
         (batch_size x seq_len x hidden_size) or
         (batch_size x seq_len x num_heads x head_size)
         """
@@ -467,15 +594,14 @@ class MultiHeadAttention(nn.Module):
             key = torch.repeat_interleave(key, num_repeat, dim=2)
             value = torch.repeat_interleave(value, num_repeat, dim=2)
 
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
-                                        step=q_len,
-                                        dtype=torch.int32,
-                                        device=query.device)
-            cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len,
-                                        step=kv_len,
-                                        dtype=torch.int32,
-                                        device=key.device)
+        if self.is_flash_attn_backend:
+            assert self._flash_attn_varlen_func is not None
+            cu_seqlens_q = torch.arange(
+                0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=query.device
+            )
+            cu_seqlens_k = torch.arange(
+                0, (bsz + 1) * kv_len, step=kv_len, dtype=torch.int32, device=key.device
+            )
 
             out = self._flash_attn_varlen_func(
                 query.flatten(0, 1),
@@ -487,94 +613,306 @@ class MultiHeadAttention(nn.Module):
                 max_seqlen_k=kv_len,
                 softmax_scale=self.scale,
             )
-        elif self.attn_backend == _Backend.XFORMERS:
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             from xformers import ops as xops
 
-            out = xops.memory_efficient_attention_forward(query,
-                                                          key,
-                                                          value,
-                                                          scale=self.scale)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
-            query, key, value = (x.transpose(1, 2)
-                                 for x in (query, key, value))
-            out = F.scaled_dot_product_attention(query,
-                                                 key,
-                                                 value,
-                                                 scale=self.scale)
+            out = xops.memory_efficient_attention_forward(
+                query, key, value, scale=self.scale
+            )
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
+            query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+            out = F.scaled_dot_product_attention(query, key, value, scale=self.scale)
             out = out.transpose(1, 2)
-        elif self.attn_backend == _Backend.PALLAS:
-            query, key, value = (x.transpose(1, 2)
-                                 for x in (query, key, value))
+        elif self.attn_backend == AttentionBackendEnum.PALLAS:
+            query, key, value = (x.transpose(1, 2) for x in (query, key, value))
             from torch_xla.experimental.custom_kernel import flash_attention
+
             out = flash_attention(query, key, value, sm_scale=self.scale)
             out = out.transpose(1, 2)
-        elif self.attn_backend == _Backend.ROCM_AITER_FA:
-            from aiter import flash_attn_varlen_func
-
-            # ROCm Flash Attention expects (batch, seq, heads, head_dim)
-            out = flash_attn_varlen_func(query,
-                                         key,
-                                         value,
-                                         softmax_scale=self.scale)
         else:
             # ViT attention hasn't supported this backend yet
             raise NotImplementedError(
-                f"ViT attention hasn't supported {self.attn_backend} "
-                f"backend yet.")
+                f"ViT attention hasn't supported {self.attn_backend} backend yet."
+            )
 
         return out.reshape(bsz, q_len, -1)
 
 
-def wait_for_kv_layer_from_connector(layer_name: str):
-    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
-        return
+class MLAAttention(nn.Module, AttentionLayerBase):
+    """Multi-Head Latent Attention layer.
 
-    connector = get_kv_transfer_group()
+    This class takes query, and compressed key/value tensors as input.
+    The class does the following:
 
-    forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
-    if attn_metadata is None:
-        return
-    assert isinstance(attn_metadata, dict)
-    connector.wait_for_layer_load(layer_name)
+    1. Store the input key and value tensors in the KV cache.
+    2. Perform (multi-head/multi-query/grouped-query) attention.
+    3. Return the output tensor.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        kv_b_proj: ColumnParallelLinear,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_sparse: bool = False,
+        indexer: object | None = None,
+        **extra_impl_args,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = scale
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.head_size = kv_lora_rank + qk_rope_head_dim
+        self.layer_name = prefix
+
+        if cache_config is not None:
+            kv_cache_dtype = cache_config.cache_dtype
+            block_size = cache_config.block_size
+            calculate_kv_scales = cache_config.calculate_kv_scales
+        else:
+            kv_cache_dtype = "auto"
+            block_size = 16
+            calculate_kv_scales = False
 
+        # Initialize KV cache quantization attributes
+        _init_kv_cache_quant(
+            self, quant_config, prefix, kv_cache_dtype, calculate_kv_scales
+        )
 
-def maybe_save_kv_layer_to_connector(
-    layer_name: str,
-    kv_cache_layer: List[torch.Tensor],
-):
-    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
-        return
+        dtype = torch.get_default_dtype()
+        self.attn_backend = get_attn_backend(
+            self.head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            use_mla=True,
+            use_sparse=use_sparse,
+        )
+        impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls())
+        self.impl = impl_cls(
+            num_heads=self.num_heads,
+            head_size=self.head_size,
+            scale=self.scale,
+            num_kv_heads=1,
+            alibi_slopes=None,
+            sliding_window=None,
+            kv_cache_dtype=self.kv_cache_dtype,
+            logits_soft_cap=None,
+            attn_type=AttentionType.DECODER,
+            kv_sharing_target_layer_name=None,
+            # MLA Args
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim,
+            v_head_dim=self.v_head_dim,
+            kv_b_proj=kv_b_proj,
+            indexer=indexer,
+            **extra_impl_args,
+        )
 
-    connector = get_kv_transfer_group()
+        self.use_direct_call = not current_platform.opaque_attention_op()
 
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+
+        self.kv_cache = [
+            torch.tensor([])
+            for _ in range(
+                get_current_vllm_config().parallel_config.pipeline_parallel_size
+            )
+        ]
+
+        self.use_sparse = use_sparse
+
+        # Initialize q/k/v range constants.
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        kv_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        output_shape: torch.Size | None = None,
+    ) -> torch.Tensor:
+        if self.calculate_kv_scales:
+            torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name)
+
+        if self.use_direct_call:
+            forward_context: ForwardContext = get_forward_context()
+            attn_metadata = forward_context.attn_metadata
+            if isinstance(attn_metadata, dict):
+                attn_metadata = attn_metadata[self.layer_name]
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+
+            if self.attn_backend.accept_output_buffer:
+                output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
+                self.impl.forward(
+                    self,
+                    q,
+                    kv_c_normed,
+                    k_pe,
+                    self_kv_cache,
+                    attn_metadata,
+                    output=output,
+                )
+                return output
+            else:
+                return self.impl.forward(
+                    self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata
+                )
+        else:
+            if self.attn_backend.accept_output_buffer:
+                output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
+                torch.ops.vllm.unified_mla_attention_with_output(
+                    q,
+                    kv_c_normed,
+                    k_pe,
+                    output,
+                    self.layer_name,
+                )
+                return output
+            else:
+                return torch.ops.vllm.unified_mla_attention(
+                    q,
+                    kv_c_normed,
+                    k_pe,
+                    self.layer_name,
+                )
+
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        if hasattr(self.impl, "process_weights_after_loading"):
+            self.impl.process_weights_after_loading(act_dtype)
+
+    def calc_kv_scales(
+        self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor
+    ) -> None:
+        """Optional scale calculation for MLA inputs.
+
+        Mirrors Attention.calc_kv_scales. Not all MLA backends require this
+        """
+        # Use safe defaults if ranges are not present
+        q_range = getattr(self, "q_range", torch.tensor(1.0))
+        k_range = getattr(self, "k_range", torch.tensor(1.0))
+        v_range = getattr(self, "v_range", torch.tensor(1.0))
+
+        self._q_scale.copy_(torch.abs(q).max() / q_range)
+        # kv_c_normed is the compressed KV representation; use it for k/v
+        kv_abs_max = torch.abs(kv_c_normed).max()
+        self._k_scale.copy_(kv_abs_max / k_range)
+        self._v_scale.copy_(kv_abs_max / v_range)
+        self._q_scale_float = self._q_scale.item()
+        self._k_scale_float = self._k_scale.item()
+        self._v_scale_float = self._v_scale.item()
+        self.calculate_kv_scales = False
+
+    def get_attn_backend(self) -> type[AttentionBackend]:
+        return self.attn_backend
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        kv_cache_dtype = kv_cache_dtype_str_to_dtype(
+            self.kv_cache_dtype, vllm_config.model_config
+        )
+        return MLAAttentionSpec(
+            block_size=vllm_config.cache_config.block_size,
+            num_kv_heads=1,
+            head_size=self.head_size,
+            dtype=kv_cache_dtype,
+            cache_dtype_str=vllm_config.cache_config.cache_dtype,
+        )
+
+
+def maybe_calc_kv_scales(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    layer_name: str,
+) -> None:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
-    if attn_metadata is None:
+    self = forward_context.no_compile_layers[layer_name]
+
+    # Only calculate if the layer's calculate_kv_scales flag is True
+    # This flag gets set to False after the first forward pass
+    if not self.calculate_kv_scales:
         return
-    assert isinstance(attn_metadata, dict)
-    connector.save_kv_layer(layer_name, kv_cache_layer,
-                            attn_metadata[layer_name])
 
+    self.calc_kv_scales(query, key, value)
 
-def unified_attention(
+
+def maybe_calc_kv_scales_fake(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     layer_name: str,
-) -> torch.Tensor:
-    wait_for_kv_layer_from_connector(layer_name)
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="maybe_calc_kv_scales",
+    op_func=maybe_calc_kv_scales,
+    mutates_args=["query", "key", "value"],
+    fake_impl=maybe_calc_kv_scales_fake,
+)
 
+
+def get_attention_context(
+    layer_name: str,
+) -> tuple[dict | object | None, Attention | MLAAttention, torch.Tensor]:
+    """Extract attention context for a given layer.
+
+    This helper function extracts the attention metadata, attention layer
+    instance, and KV cache tensor for a specific layer.
+
+    Args:
+        layer_name: The name/identifier of the attention layer.
+
+    Returns:
+        A tuple containing:
+        - attn_metadata: Attention metadata for this specific layer, or None if
+            no metadata available
+        - attn_layer: The attention layer instance (Attention or MLAAttention)
+        - kv_cache: The KV cache tensor for current virtual engine
+
+        Note: attn_metadata may be None, but attn_layer and kv_cache are always
+        extracted from the forward context.
+    """
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
     if isinstance(attn_metadata, dict):
         attn_metadata = attn_metadata[layer_name]
-    self = forward_context.no_compile_layers[layer_name]
-    kv_cache = self.kv_cache[forward_context.virtual_engine]
-    output = self.impl.forward(self, query, key, value, kv_cache,
-                               attn_metadata)
+    attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name]
+    kv_cache = attn_layer.kv_cache[forward_context.virtual_engine]
+    return attn_metadata, attn_layer, kv_cache
+
+
+@maybe_transfer_kv_layer
+def unified_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    attn_metadata, self, kv_cache = get_attention_context(layer_name)
+    output = self.impl.forward(self, query, key, value, kv_cache, attn_metadata)
 
-    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
     return output
 
 
@@ -591,37 +929,31 @@ direct_register_custom_op(
     op_name="unified_attention",
     op_func=unified_attention,
     fake_impl=unified_attention_fake,
-    tags=tag_cudagraph_unsafe,
 )
 
 
+@maybe_transfer_kv_layer
 def unified_attention_with_output(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    output_scale: Optional[torch.Tensor] = None,
-    output_block_scale: Optional[torch.Tensor] = None,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
 ) -> None:
-    wait_for_kv_layer_from_connector(layer_name)
-    forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
-    if isinstance(attn_metadata, dict):
-        attn_metadata = attn_metadata[layer_name]
-    self = forward_context.no_compile_layers[layer_name]
-    kv_cache = self.kv_cache[forward_context.virtual_engine]
-    self.impl.forward(self,
-                      query,
-                      key,
-                      value,
-                      kv_cache,
-                      attn_metadata,
-                      output=output,
-                      output_scale=output_scale,
-                      output_block_scale=output_block_scale)
-
-    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
+    attn_metadata, self, kv_cache = get_attention_context(layer_name)
+    self.impl.forward(
+        self,
+        query,
+        key,
+        value,
+        kv_cache,
+        attn_metadata,
+        output=output,
+        output_scale=output_scale,
+        output_block_scale=output_block_scale,
+    )
 
 
 def unified_attention_with_output_fake(
@@ -630,8 +962,8 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    output_scale: Optional[torch.Tensor] = None,
-    output_block_scale: Optional[torch.Tensor] = None,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
 ) -> None:
     return
 
@@ -641,5 +973,80 @@ direct_register_custom_op(
     op_func=unified_attention_with_output,
     mutates_args=["output", "output_block_scale"],
     fake_impl=unified_attention_with_output_fake,
-    tags=tag_cudagraph_unsafe,
+)
+
+
+@maybe_transfer_kv_layer
+def unified_mla_attention(
+    q: torch.Tensor,
+    kv_c_normed: torch.Tensor,
+    k_pe: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    attn_metadata, self, kv_cache = get_attention_context(layer_name)
+    output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata)
+
+    return output
+
+
+def unified_mla_attention_fake(
+    q: torch.Tensor,
+    kv_c_normed: torch.Tensor,
+    k_pe: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    return torch.empty_like(q).contiguous()
+
+
+direct_register_custom_op(
+    op_name="unified_mla_attention",
+    op_func=unified_mla_attention,
+    mutates_args=[],
+    fake_impl=unified_mla_attention_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
+
+
+@maybe_transfer_kv_layer
+def unified_mla_attention_with_output(
+    q: torch.Tensor,
+    kv_c_normed: torch.Tensor,
+    k_pe: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
+) -> None:
+    attn_metadata, self, kv_cache = get_attention_context(layer_name)
+    self.impl.forward(
+        self,
+        q,
+        kv_c_normed,
+        k_pe,
+        kv_cache,
+        attn_metadata,
+        output=output,
+        output_scale=output_scale,
+        output_block_scale=output_block_scale,
+    )
+
+
+def unified_mla_attention_with_output_fake(
+    q: torch.Tensor,
+    kv_c_normed: torch.Tensor,
+    k_pe: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="unified_mla_attention_with_output",
+    op_func=unified_mla_attention_with_output,
+    mutates_args=["output", "output_block_scale"],
+    fake_impl=unified_mla_attention_with_output_fake,
+    dispatch_key=current_platform.dispatch_key,
 )
diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py
index e13f77c18c45aa1dd17ce8e75a6586d22029aa33..48fcc6fa736bb1cc02392216bf15cf1593cf416a 100644
--- a/vllm/attention/layers/chunked_local_attention.py
+++ b/vllm/attention/layers/chunked_local_attention.py
@@ -1,18 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
-from typing import ClassVar, List, Optional
 
 import torch
 
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata)
+from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.selector import get_attn_backend
-from vllm.config import CacheConfig, QuantizationConfig
+from vllm.config import CacheConfig
+from vllm.config.vllm import VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.v1.attention.backends.utils import (
-    AttentionCGSupport, CommonAttentionMetadata,
-    make_local_attention_virtual_batches, subclass_attention_backend)
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    make_local_attention_virtual_batches,
+    subclass_attention_backend,
+)
+from vllm.v1.kv_cache_interface import (
+    AttentionSpec,
+    ChunkedLocalAttentionSpec,
+    KVCacheSpec,
+)
 
 from ..layer import Attention
 
@@ -26,41 +34,54 @@ def create_chunked_local_attention_backend(
     prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
 
     underlying_builder = underlying_attn_backend.get_builder_cls()
+    assert issubclass(underlying_builder, AttentionMetadataBuilder)
 
     class ChunkedLocalAttentionBuilder(underlying_builder):  # type: ignore
-        cudagraph_support: ClassVar[AttentionCGSupport] = \
-            AttentionCGSupport.NEVER
-
-        def build(self,
-                  common_prefix_len: int,
-                  common_attn_metadata: CommonAttentionMetadata,
-                  fast_build: bool = False) -> AttentionMetadata:
+        @classmethod
+        def get_cudagraph_support(
+            cls: type["AttentionMetadataBuilder"],
+            vllm_config: VllmConfig,
+            kv_cache_spec: AttentionSpec,
+        ) -> AttentionCGSupport:
+            # Explicit override in case the underlying builder specialized this getter.
+            # @override omitted only because of mypy limitation due to type variable.
+            return AttentionCGSupport.NEVER
+
+        def build(
+            self,
+            common_prefix_len: int,
+            common_attn_metadata: CommonAttentionMetadata,
+            fast_build: bool = False,
+        ) -> AttentionMetadata:
             common_attn_metadata = make_local_attention_virtual_batches(
-                attention_chunk_size, common_attn_metadata, block_size)
-            return super().build(common_prefix_len, common_attn_metadata,
-                                 fast_build)
+                attention_chunk_size, common_attn_metadata, block_size
+            )
+            return super().build(common_prefix_len, common_attn_metadata, fast_build)
 
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
         attention_backend_cls=underlying_attn_backend,
-        builder_cls=ChunkedLocalAttentionBuilder)
+        builder_cls=ChunkedLocalAttentionBuilder,
+    )
 
     return attn_backend
 
 
 class ChunkedLocalAttention(Attention):
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 attention_chunk_size: int,
-                 num_kv_heads: Optional[int] = None,
-                 alibi_slopes: Optional[List[float]] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 kv_sharing_target_layer_name: Optional[str] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        attention_chunk_size: int,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        kv_sharing_target_layer_name: str | None = None,
+        prefix: str = "",
+    ):
+        self.attention_chunk_size = attention_chunk_size
         dtype = torch.get_default_dtype()
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
@@ -69,16 +90,12 @@ class ChunkedLocalAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(head_size, dtype,
-                                                       kv_cache_dtype,
-                                                       block_size)
-
-            attn_backend = create_chunked_local_attention_backend(
-                underlying_attn_backend, attention_chunk_size, block_size)
-        else:
-            # in v0 the local attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_chunked_local_attention_backend(
+            underlying_attn_backend, attention_chunk_size, block_size
+        )
 
         super().__init__(
             num_heads=num_heads,
@@ -90,4 +107,15 @@ class ChunkedLocalAttention(Attention):
             quant_config=quant_config,
             prefix=prefix,
             kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            attn_backend=attn_backend)
+            attn_backend=attn_backend,
+        )
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        assert self.attention_chunk_size
+        return ChunkedLocalAttentionSpec(
+            block_size=vllm_config.cache_config.block_size,
+            num_kv_heads=self.num_kv_heads,
+            head_size=self.head_size,
+            dtype=self.kv_cache_torch_dtype,
+            attention_chunk_size=self.attention_chunk_size,
+        )
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
index 9400c5bffa3807235c44ca1077fbf4223c497c49..5b44c7e3e7ec84354e4ba842693d70275b4c4a8e 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -2,39 +2,44 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from copy import copy
-from typing import Optional
 
 import numpy as np
 import torch
 
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+)
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv
-from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
-                                              subclass_attention_backend)
-from vllm.v1.kv_cache_interface import CrossAttentionSpec
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.utils import (
+    CommonAttentionMetadata,
+    subclass_attention_backend,
+)
+from vllm.v1.kv_cache_interface import CrossAttentionSpec, KVCacheSpec
 
 logger = init_logger(__name__)
 
 
 def _get_max_encoder_len(vllm_config: "VllmConfig") -> int:
-    """Gets the max number of encoder input tokens from the config.
-    """
+    """Gets the max number of encoder input tokens from the config."""
     sc = vllm_config.scheduler_config
-    assert sc and isinstance(sc.max_num_encoder_input_tokens, int), \
+    assert sc and isinstance(sc.max_num_encoder_input_tokens, int), (
         "max_num_encoder_input_tokens must be int for enc-dec models"
+    )
     return sc.max_num_encoder_input_tokens
 
 
-def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,
-                            block_table_tensor: torch.Tensor,
-                            kv_cache_spec: CrossAttentionSpec,
-                            device: torch.device) -> torch.Tensor:
+def _get_cross_slot_mapping(
+    encoder_seq_lens: np.ndarray,
+    block_table_tensor: torch.Tensor,
+    kv_cache_spec: CrossAttentionSpec,
+    device: torch.device,
+) -> torch.Tensor:
     """Get cross-attention slot mappings."""
 
     block_size = kv_cache_spec.block_size
@@ -58,9 +63,7 @@ def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,
         needed_block_ids = req_block_ids[:num_blocks_needed]
 
         # All needed blocks are allocated
-        i_values = torch.arange(encoder_seq_len,
-                                dtype=torch.int64,
-                                device=device)
+        i_values = torch.arange(encoder_seq_len, dtype=torch.int64, device=device)
         block_indices = i_values // block_size
         block_offsets = i_values % block_size
         block_numbers = needed_block_ids[block_indices]
@@ -76,42 +79,48 @@ def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,
 
 @functools.lru_cache
 def create_cross_attention_backend(
-    underlying_attn_backend: AttentionBackend, ) -> type[AttentionBackend]:
+    underlying_attn_backend: AttentionBackend,
+) -> type[AttentionBackend]:
     prefix = "CrossAttention_"
     underlying_builder = underlying_attn_backend.get_builder_cls()
 
     class CrossAttentionBuilder(underlying_builder):  # type: ignore
-
-        def build(self,
-                  common_prefix_len: int,
-                  common_attn_metadata: CommonAttentionMetadata,
-                  fast_build: bool = False) -> AttentionMetadata:
+        def build(
+            self,
+            common_prefix_len: int,
+            common_attn_metadata: CommonAttentionMetadata,
+            fast_build: bool = False,
+        ) -> AttentionMetadata:
             new_metadata = copy(common_attn_metadata)
             new_metadata.causal = False
             max_encoder_len = _get_max_encoder_len(self.vllm_config)
             new_metadata.max_seq_len = max_encoder_len
 
             new_metadata.seq_lens = torch.full(
-                (new_metadata.num_reqs, ),
+                (new_metadata.num_reqs,),
                 max_encoder_len,
                 dtype=torch.int32,
                 device=self.device,
             )
             new_metadata.seq_lens_cpu = torch.full(
-                (new_metadata.num_reqs, ),
+                (new_metadata.num_reqs,),
                 max_encoder_len,
                 dtype=torch.int32,
                 device="cpu",
             )
             new_metadata.slot_mapping = _get_cross_slot_mapping(
-                new_metadata.encoder_seq_lens, new_metadata.block_table_tensor,
-                self.kv_cache_spec, self.device)
+                new_metadata.encoder_seq_lens,
+                new_metadata.block_table_tensor,
+                self.kv_cache_spec,
+                self.device,
+            )
             return super().build(common_prefix_len, new_metadata, fast_build)
 
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
         attention_backend_cls=underlying_attn_backend,
-        builder_cls=CrossAttentionBuilder)
+        builder_cls=CrossAttentionBuilder,
+    )
 
     return attn_backend
 
@@ -122,13 +131,15 @@ class CrossAttention(Attention):
     Handles attention between decoder queries and encoder keys/values.
     """
 
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 cache_config: Optional[CacheConfig] = None,
-                 attn_type: Optional[str] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        cache_config: CacheConfig | None = None,
+        attn_type: str | None = None,
+        **kwargs,
+    ):
         dtype = torch.get_default_dtype()
 
         if cache_config is not None:
@@ -138,25 +149,30 @@ class CrossAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(head_size, dtype,
-                                                       kv_cache_dtype,
-                                                       block_size)
-
-            attn_backend = create_cross_attention_backend(
-                underlying_attn_backend)
-        else:
-            # in v0 cross attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_cross_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_DECODER, (
-                "CrossAttention only supports AttentionType.ENCODER_DECODER")
-
-        super().__init__(num_heads=num_heads,
-                         head_size=head_size,
-                         scale=scale,
-                         cache_config=cache_config,
-                         attn_backend=attn_backend,
-                         attn_type=AttentionType.ENCODER_DECODER,
-                         **kwargs)
+                "CrossAttention only supports AttentionType.ENCODER_DECODER"
+            )
+
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            cache_config=cache_config,
+            attn_backend=attn_backend,
+            attn_type=AttentionType.ENCODER_DECODER,
+            **kwargs,
+        )
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        return CrossAttentionSpec(
+            block_size=vllm_config.cache_config.block_size,
+            num_kv_heads=self.num_kv_heads,
+            head_size=self.head_size,
+            dtype=self.kv_cache_torch_dtype,
+        )
diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/attention/layers/encoder_only_attention.py
index cea05df5b96d2ae2014c5108dcb440a03f313f94..5e99c990100347875b9dc9cbdd8d2757ad220c55 100644
--- a/vllm/attention/layers/encoder_only_attention.py
+++ b/vllm/attention/layers/encoder_only_attention.py
@@ -2,41 +2,50 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from copy import copy
-from typing import Optional
 
 import torch
 
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+)
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
-from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
-                                              subclass_attention_backend)
+from vllm.config.vllm import VllmConfig
+from vllm.v1.attention.backends.utils import (
+    CommonAttentionMetadata,
+    subclass_attention_backend,
+)
+from vllm.v1.kv_cache_interface import KVCacheSpec
 
 
 @functools.lru_cache
 def create_encoder_only_attention_backend(
-    underlying_attn_backend: AttentionBackend, ) -> type[AttentionBackend]:
+    underlying_attn_backend: AttentionBackend,
+) -> type[AttentionBackend]:
     prefix = "EncoderOnlyAttention_"
     underlying_builder = underlying_attn_backend.get_builder_cls()
 
     class EncoderOnlyAttentionBuilder(underlying_builder):  # type: ignore
-
-        def build(self,
-                  common_prefix_len: int,
-                  common_attn_metadata: CommonAttentionMetadata,
-                  fast_build: bool = False) -> AttentionMetadata:
+        def build(
+            self,
+            common_prefix_len: int,
+            common_attn_metadata: CommonAttentionMetadata,
+            fast_build: bool = False,
+        ) -> AttentionMetadata:
             new_common_attn_metadata = copy(common_attn_metadata)
             new_common_attn_metadata.causal = False
-            return super().build(common_prefix_len, new_common_attn_metadata,
-                                 fast_build)
+            return super().build(
+                common_prefix_len, new_common_attn_metadata, fast_build
+            )
 
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
         attention_backend_cls=underlying_attn_backend,
-        builder_cls=EncoderOnlyAttentionBuilder)
+        builder_cls=EncoderOnlyAttentionBuilder,
+    )
 
     return attn_backend
 
@@ -46,13 +55,15 @@ class EncoderOnlyAttention(Attention):
     Encoder attention is a special case that doesn't need a KV Cache.
     """
 
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 cache_config: Optional[CacheConfig] = None,
-                 attn_type: Optional[str] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        cache_config: CacheConfig | None = None,
+        attn_type: str | None = None,
+        **kwargs,
+    ):
         dtype = torch.get_default_dtype()
 
         if cache_config is not None:
@@ -62,25 +73,31 @@ class EncoderOnlyAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(head_size, dtype,
-                                                       kv_cache_dtype,
-                                                       block_size)
+        underlying_attn_backend = get_attn_backend(
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            attn_type=AttentionType.ENCODER_ONLY,
+        )
 
-            attn_backend = create_encoder_only_attention_backend(
-                underlying_attn_backend)
-        else:
-            # in v0 encoder only attention is handled inside the backends
-            attn_backend = None
+        attn_backend = create_encoder_only_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
-            assert attn_type == AttentionType.ENCODER_ONLY, \
+            assert attn_type == AttentionType.ENCODER_ONLY, (
                 "EncoderOnlyAttention only supports AttentionType.ENCODER_ONLY"
-
-        super().__init__(num_heads=num_heads,
-                         head_size=head_size,
-                         scale=scale,
-                         cache_config=cache_config,
-                         attn_backend=attn_backend,
-                         attn_type=AttentionType.ENCODER_ONLY,
-                         **kwargs)
+            )
+
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            cache_config=cache_config,
+            attn_backend=attn_backend,
+            attn_type=AttentionType.ENCODER_ONLY,
+            **kwargs,
+        )
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Does not need KV cache
+        return None
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
index bf4b06512a3c11e698fe69c4f039ee25f4e707bc..aa791fe97006390f719e4c2c53d43c0078ed555a 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -25,72 +25,73 @@ def cdiv_fn(x, y):
 
 @triton.jit
 def kernel_paged_attention_2d(
-        output_ptr,  # [num_tokens, num_query_heads, head_size]
-        query_ptr,  # [num_tokens, num_query_heads, head_size]
-        key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
-        value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
-        sink_ptr,  # [num_query_heads]
-        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
-        seq_lens_ptr,  # [num_seqs]
-        alibi_slopes_ptr,  # [num_query_heads]
-        scale,  # float32
-        k_scale,  # float32
-        v_scale,  # float32
-        out_scale_inv,
-        num_query_heads: tl.constexpr,  # int
-        num_queries_per_kv: tl.constexpr,  # int
-        num_queries_per_kv_padded: tl.constexpr,  # int
-        block_table_stride: tl.int64,  # int
-        query_stride_0: tl.int64,  # int
-        query_stride_1: tl.int64,  # int, should be equal to head_size
-        output_stride_0: tl.int64,  # int
-        output_stride_1: tl.int64,  # int, should be equal to head_size
-        BLOCK_SIZE: tl.constexpr,  # int
-        HEAD_SIZE: tl.constexpr,  # int
-        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-        USE_ALIBI_SLOPES: tl.constexpr,  # bool
-        SLIDING_WINDOW: tl.constexpr,  # int
-        x: tl.constexpr,  # int
-        stride_k_cache_0: tl.int64,  # int
-        stride_k_cache_1: tl.int64,  # int
-        stride_k_cache_2: tl.int64,  # int
-        stride_k_cache_3: tl.int64,  # int
-        stride_k_cache_4: tl.int64,  # int
-        stride_v_cache_0: tl.int64,  # int
-        stride_v_cache_1: tl.int64,  # int
-        stride_v_cache_2: tl.int64,  # int
-        stride_v_cache_3: tl.int64,  # int
-        filter_by_query_len: tl.constexpr,  # bool
-        query_start_len_ptr,  # [num_seqs+1]
-        USE_SINKS: tl.constexpr,  # bool
-        USE_FP8: tl.constexpr,
-        FP8_MIN: tl.constexpr = float8_info.min,
-        FP8_MAX: tl.constexpr = float8_info.max):
+    output_ptr,  # [num_tokens, num_query_heads, head_size]
+    query_ptr,  # [num_tokens, num_query_heads, head_size]
+    key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
+    value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
+    sink_ptr,  # [num_query_heads]
+    block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
+    seq_lens_ptr,  # [num_seqs]
+    alibi_slopes_ptr,  # [num_query_heads]
+    scale,  # float32
+    k_scale,  # float32
+    v_scale,  # float32
+    out_scale_inv,
+    num_query_heads: tl.constexpr,  # int
+    num_queries_per_kv: tl.constexpr,  # int
+    num_queries_per_kv_padded: tl.constexpr,  # int
+    block_table_stride: tl.int64,  # int
+    query_stride_0: tl.int64,  # int
+    query_stride_1: tl.int64,  # int, should be equal to head_size
+    output_stride_0: tl.int64,  # int
+    output_stride_1: tl.int64,  # int, should be equal to head_size
+    BLOCK_SIZE: tl.constexpr,  # int
+    HEAD_SIZE: tl.constexpr,  # int
+    HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+    USE_ALIBI_SLOPES: tl.constexpr,  # bool
+    SLIDING_WINDOW: tl.constexpr,  # int
+    x: tl.constexpr,  # int
+    stride_k_cache_0: tl.int64,  # int
+    stride_k_cache_1: tl.int64,  # int
+    stride_k_cache_2: tl.int64,  # int
+    stride_k_cache_3: tl.int64,  # int
+    stride_k_cache_4: tl.int64,  # int
+    stride_v_cache_0: tl.int64,  # int
+    stride_v_cache_1: tl.int64,  # int
+    stride_v_cache_2: tl.int64,  # int
+    stride_v_cache_3: tl.int64,  # int
+    filter_by_query_len: tl.constexpr,  # bool
+    query_start_len_ptr,  # [num_seqs+1]
+    USE_SINKS: tl.constexpr,  # bool
+    USE_FP8: tl.constexpr,
+    FP8_MIN: tl.constexpr = float8_info.min,
+    FP8_MAX: tl.constexpr = float8_info.max,
+):
     seq_idx = tl.program_id(0)
     kv_head_idx = tl.program_id(1)
 
     if filter_by_query_len:
         cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx)
-        cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx +
-                                              1)
-        cur_batch_query_len = cur_batch_in_all_stop_index \
-            - cur_batch_in_all_start_index
+        cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1)
+        cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index
         if cur_batch_query_len > 1:
             return
     else:
         cur_batch_in_all_start_index = seq_idx
 
     query_head_idx = kv_head_idx * num_queries_per_kv + tl.arange(
-        0, num_queries_per_kv_padded)
+        0, num_queries_per_kv_padded
+    )
 
-    query_offset = (cur_batch_in_all_start_index * query_stride_0 +
-                    query_head_idx[:, None] * query_stride_1)
+    query_offset = (
+        cur_batch_in_all_start_index * query_stride_0
+        + query_head_idx[:, None] * query_stride_1
+    )
 
     head_mask = query_head_idx < (kv_head_idx + 1) * num_queries_per_kv
     head_mask = head_mask & (query_head_idx < num_query_heads)
 
-    dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1,
-                        0).to(tl.int1)
+    dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1)
 
     # Q : (num_queries_per_kv, HEAD_SIZE,)
     Q = tl.load(
@@ -102,9 +103,7 @@ def kernel_paged_attention_2d(
     block_table_offset = seq_idx * block_table_stride
 
     if not USE_SINKS:
-        M = tl.full([num_queries_per_kv_padded],
-                    float("-inf"),
-                    dtype=tl.float32)
+        M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32)
     else:
         M = tl.load(
             sink_ptr + query_head_idx,
@@ -113,43 +112,43 @@ def kernel_paged_attention_2d(
         ).to(dtype=tl.float32)
 
     L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32)
-    acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED],
-                   dtype=tl.float32)
+    acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], dtype=tl.float32)
 
     # sequence len for this particular sequence
     seq_len = tl.load(seq_lens_ptr + seq_idx)
 
     # alibi slope for this head
     if USE_ALIBI_SLOPES:
-        alibi_slope = tl.load(alibi_slopes_ptr + query_head_idx,
-                              mask=head_mask,
-                              other=0.0)
+        alibi_slope = tl.load(
+            alibi_slopes_ptr + query_head_idx, mask=head_mask, other=0.0
+        )
 
     num_blocks = cdiv_fn(seq_len, BLOCK_SIZE)
 
     # iterate through tiles
     for j in range(0, num_blocks):
-
         physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j)
 
         offs_n = tl.arange(0, BLOCK_SIZE)
         offs_d = tl.arange(0, HEAD_SIZE_PADDED)
 
-        v_offset = (physical_block_idx * stride_v_cache_0 +
-                    kv_head_idx * stride_v_cache_1 +
-                    offs_d[None, :] * stride_v_cache_2 +
-                    offs_n[:, None] * stride_v_cache_3)
+        v_offset = (
+            physical_block_idx * stride_v_cache_0
+            + kv_head_idx * stride_v_cache_1
+            + offs_d[None, :] * stride_v_cache_2
+            + offs_n[:, None] * stride_v_cache_3
+        )
 
-        k_offset = (physical_block_idx * stride_k_cache_0 +
-                    kv_head_idx * stride_k_cache_1 +
-                    (offs_d[:, None] // x) * stride_k_cache_2 +
-                    offs_n[None, :] * stride_k_cache_3 +
-                    (offs_d[:, None] % x) * stride_k_cache_4)
+        k_offset = (
+            physical_block_idx * stride_k_cache_0
+            + kv_head_idx * stride_k_cache_1
+            + (offs_d[:, None] // x) * stride_k_cache_2
+            + offs_n[None, :] * stride_k_cache_3
+            + (offs_d[:, None] % x) * stride_k_cache_4
+        )
 
         # K : (HEAD_SIZE, BLOCK_SIZE)
-        K_load = tl.load(key_cache_ptr + k_offset,
-                         mask=dim_mask[:, None],
-                         other=0.0)
+        K_load = tl.load(key_cache_ptr + k_offset, mask=dim_mask[:, None], other=0.0)
 
         if K_load.dtype.is_fp8():
             K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype)
@@ -157,9 +156,7 @@ def kernel_paged_attention_2d(
             K = K_load
 
         # V : (BLOCK_SIZE, HEAD_SIZE)
-        V_load = tl.load(value_cache_ptr + v_offset,
-                         mask=dim_mask[None, :],
-                         other=0.0)
+        V_load = tl.load(value_cache_ptr + v_offset, mask=dim_mask[None, :], other=0.0)
 
         if V_load.dtype.is_fp8():
             V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype)
@@ -171,15 +168,13 @@ def kernel_paged_attention_2d(
         seq_mask = seq_offset[None, :] < boundary
 
         # S : (num_queries_per_kv, BLOCK_SIZE,)
-        S = tl.where(head_mask[:, None] & seq_mask, 0.0,
-                     float("-inf")).to(tl.float32)
+        S = tl.where(head_mask[:, None] & seq_mask, 0.0, float("-inf")).to(tl.float32)
         S += scale * tl.dot(Q, K)
 
         context_len = seq_len - 1
 
         if SLIDING_WINDOW > 0:
-            S = tl.where((context_len - seq_offset) < SLIDING_WINDOW, S,
-                         -10000)
+            S = tl.where((context_len - seq_offset) < SLIDING_WINDOW, S, -10000)
 
         if USE_ALIBI_SLOPES:
             S += alibi_slope[:, None] * (seq_offset - context_len)
@@ -213,12 +208,13 @@ def kernel_paged_attention_2d(
         acc = acc * tl.load(out_scale_inv)
         acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
 
-    output_offset = (cur_batch_in_all_start_index * output_stride_0 +
-                     query_head_idx * output_stride_1)
+    output_offset = (
+        cur_batch_in_all_start_index * output_stride_0
+        + query_head_idx * output_stride_1
+    )
 
     tl.store(
-        output_ptr + output_offset[:, None] +
-        tl.arange(0, HEAD_SIZE_PADDED)[None, :],
+        output_ptr + output_offset[:, None] + tl.arange(0, HEAD_SIZE_PADDED)[None, :],
         acc,
         mask=dim_mask[None, :] & head_mask[:, None],
     )
@@ -246,9 +242,8 @@ def chunked_prefill_paged_decode(
     # Optional tensor for sinks
     sinks=None,
 ):
-
     if sm_scale is None:
-        sm_scale = 1.0 / (query.shape[1]**0.5)
+        sm_scale = 1.0 / (query.shape[1] ** 0.5)
 
     use_alibi_slopes = alibi_slopes is not None
 
@@ -302,10 +297,10 @@ def chunked_prefill_paged_decode(
         key_cache = key_cache.view(target_dtype)
         value_cache = value_cache.view(target_dtype)
 
-    num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv),
-                                    16)
+    num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16)
 
     from vllm.platforms.rocm import use_rocm_custom_paged_attention
+
     use_custom = use_rocm_custom_paged_attention(
         query.dtype,
         head_size,
@@ -319,13 +314,13 @@ def chunked_prefill_paged_decode(
     )
     if use_custom:
         _PARTITION_SIZE_ROCM = 256
-        max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) //
-                              _PARTITION_SIZE_ROCM)
+        max_num_partitions = (
+            max_seq_len + _PARTITION_SIZE_ROCM - 1
+        ) // _PARTITION_SIZE_ROCM
         assert _PARTITION_SIZE_ROCM % block_size == 0
         total_num_seq = block_table.shape[0]
         tmp_output = torch.empty(
-            size=(total_num_seq, num_query_heads, max_num_partitions,
-                  head_size),
+            size=(total_num_seq, num_query_heads, max_num_partitions, head_size),
             dtype=query.dtype,
             device=output.device,
         )
@@ -358,10 +353,12 @@ def chunked_prefill_paged_decode(
             fp8_out_scale=output_scale,
         )
     else:
-        kernel_paged_attention_2d[(
-            num_seqs,
-            num_kv_heads,
-        )](
+        kernel_paged_attention_2d[
+            (
+                num_seqs,
+                num_kv_heads,
+            )
+        ](
             output_ptr=output,
             query_ptr=query,
             key_cache_ptr=key_cache,
@@ -373,8 +370,7 @@ def chunked_prefill_paged_decode(
             scale=sm_scale,
             k_scale=k_scale,
             v_scale=v_scale,
-            out_scale_inv=1.0 /
-            output_scale if output_scale is not None else 1.0,
+            out_scale_inv=1.0 / output_scale if output_scale is not None else 1.0,
             num_query_heads=num_query_heads,
             num_queries_per_kv=num_queries_per_kv,
             num_queries_per_kv_padded=num_queries_per_kv_padded,
diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py
index e659f1f3eae93f9f103017a8065d221fbd8c57ba..1d06ab7da68070a68868527ea120cbc8f6b1379e 100644
--- a/vllm/attention/ops/common.py
+++ b/vllm/attention/ops/common.py
@@ -7,11 +7,21 @@ from vllm.triton_utils import tl, triton
 
 
 @triton.jit
-def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
-                                vlse_ptr, outputs_stride_B, outputs_stride_H,
-                                outputs_stride_D, lses_stride_N, lses_stride_B,
-                                lses_stride_H, lse_idx, HEAD_DIM: tl.constexpr,
-                                N_ROUNDED: tl.constexpr):
+def _correct_attn_cp_out_kernel(
+    outputs_ptr,
+    new_output_ptr,
+    lses_ptr,
+    vlse_ptr,
+    outputs_stride_B,
+    outputs_stride_H,
+    outputs_stride_D,
+    lses_stride_N,
+    lses_stride_B,
+    lses_stride_H,
+    lse_idx,
+    HEAD_DIM: tl.constexpr,
+    N_ROUNDED: tl.constexpr,
+):
     """
     Apply the all-gathered lses to correct each local rank's attention
     output. we still need perform a cross-rank reduction to obtain the
@@ -33,13 +43,17 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
     num_n_offsets = tl.arange(0, N_ROUNDED)
 
     # shape = [N]
-    lse_offsets = num_n_offsets * lses_stride_N + batch_idx * \
-        lses_stride_B + head_idx * lses_stride_H
+    lse_offsets = (
+        num_n_offsets * lses_stride_N
+        + batch_idx * lses_stride_B
+        + head_idx * lses_stride_H
+    )
 
     # calc final lse
     lse = tl.load(lses_ptr + lse_offsets)
-    lse = tl.where((lse != lse) | (lse == float('inf')), -float('inf'), lse)
+    lse = tl.where((lse != lse) | (lse == float("inf")), -float("inf"), lse)
     lse_max = tl.max(lse, axis=0)
+    lse_max = tl.where(lse_max == -float("inf"), 0, lse_max)
     lse -= lse_max
     lse_exp = tl.exp(lse)
     lse_acc = tl.sum(lse_exp, axis=0)
@@ -50,18 +64,23 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
     tl.store(vlse_ptr + lse_offsets, lse)
 
     # shape = [D]
-    output_offsets = batch_idx * outputs_stride_B + \
-                    head_idx * outputs_stride_H + \
-                    d_offsets * outputs_stride_D
+    output_offsets = (
+        batch_idx * outputs_stride_B
+        + head_idx * outputs_stride_H
+        + d_offsets * outputs_stride_D
+    )
 
     # correct output
-    lse_offset = lse_idx * lses_stride_N + batch_idx * \
-        lses_stride_B + head_idx * lses_stride_H
+    lse_offset = (
+        lse_idx * lses_stride_N + batch_idx * lses_stride_B + head_idx * lses_stride_H
+    )
     lse_tmp = tl.load(lses_ptr + lse_offset)
     lse_finally = lse_tmp - lse
     lse_finally = tl.where(
-        (lse_finally != lse_finally) | (lse_finally == float('inf')),
-        -float('inf'), lse_finally)
+        (lse_finally != lse_finally) | (lse_finally == float("inf")),
+        -float("inf"),
+        lse_finally,
+    )
     factor = tl.exp(lse_finally)
     output = tl.load(outputs_ptr + output_offsets)
     output = output * factor
@@ -70,8 +89,7 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
 
 
 class CPTritonContext:
-    """ The CPTritonContext is used to avoid recompilation of the Triton JIT.
-    """
+    """The CPTritonContext is used to avoid recompilation of the Triton JIT."""
 
     def __init__(self):
         self.inner_kernel = None
@@ -84,8 +102,8 @@ class CPTritonContext:
 
 
 def correct_attn_out(
-        out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
-        ctx: CPTritonContext) -> tuple[torch.Tensor, torch.Tensor]:
+    out: torch.Tensor, lses: torch.Tensor, cp_rank: int, ctx: CPTritonContext
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Correct the attention output using the all-gathered lses.
 
     Args:
@@ -100,25 +118,64 @@ def correct_attn_out(
     if ctx is None:
         ctx = CPTritonContext()
 
-    lse = torch.empty_like(lses[0])
-
-    grid = (out.shape[0], out.shape[1], 1)
-    regular_args = (out, out, lses, lse, *out.stride(), *lses.stride(),
-                    cp_rank)
-    const_args = {
-        "HEAD_DIM": out.shape[-1],
-        "N_ROUNDED": lses.shape[0],
-    }
-
-    ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args,
-                    **const_args)
+    # --- Normalize to 3D views ---
+    if out.ndim == 4 and out.shape[1] == 1:
+        out = out.squeeze(1)
+    assert out.ndim == 3, f"expected out [B,H,D] or [B,1,H,D], got {tuple(out.shape)}"
+
+    if lses.ndim == 4 and lses.shape[-1] == 1:
+        lses = lses.squeeze(-1)
+    if lses.ndim == 4 and lses.shape[1] == 1:
+        lses = lses.squeeze(1)
+    assert lses.ndim == 3, (
+        f"expected lses [N,B,H] (optionally with a 1-sized extra dim), "
+        f"got {tuple(lses.shape)}"
+    )
+
+    B, H, D = out.shape
+    N = lses.shape[0]
+
+    # Strides after we normalized shapes to 3-D views.  The kernel computes
+    # offsets for `vlse_ptr` using lses_stride_B/H, so the output buffer must
+    # have the same B/H stride layout as a slice of `lses`.
+    o_sB, o_sH, o_sD = out.stride()
+    l_sN, l_sB, l_sH = lses.stride()
+
+    # Allocate LSE with the same B/H strides as `lses` so writes land correctly
+    # even when `lses` is a non-contiguous view (e.g., 4-D to 3-D squeeze).
+    lse = torch.empty_strided(
+        (B, H), (l_sB, l_sH), device=lses.device, dtype=lses.dtype
+    )
+
+    # Kernel launch config
+    grid = (B, H, 1)
+
+    regular_args = (
+        out,
+        out,
+        lses,
+        lse,
+        o_sB,
+        o_sH,
+        o_sD,
+        l_sN,
+        l_sB,
+        l_sH,
+        cp_rank,
+    )
+    const_args = {"HEAD_DIM": D, "N_ROUNDED": N}
+
+    ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args, **const_args)
     return out, lse
 
 
-def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor,
-                     cp_attn_lse: torch.Tensor,
-                     cp_group: GroupCoordinator,
-                     ctx: CPTritonContext = None):
+def cp_lse_ag_out_rs(
+    cp_attn_out: torch.Tensor,
+    cp_attn_lse: torch.Tensor,
+    cp_group: GroupCoordinator,
+    ctx: CPTritonContext = None,
+    return_lse=False,
+):
     """
     cp_attn_out: [ B, H, D ]
     cp_attn_lse: [ B, H ]
@@ -129,14 +186,231 @@ def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor,
     if ctx is None:
         ctx = CPTritonContext()
 
-    lses = torch.empty((cp_group.world_size, ) + cp_attn_lse.shape,
-                       dtype=cp_attn_lse.dtype,
-                       device=cp_attn_lse.device)
+    lses = torch.empty(
+        (cp_group.world_size,) + cp_attn_lse.shape,
+        dtype=cp_attn_lse.dtype,
+        device=cp_attn_lse.device,
+    )
 
     cp_attn_lse = cp_attn_lse.contiguous()
     lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
-    out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
+    out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
     out = cp_group.reduce_scatter(out, dim=1)
+
+    if return_lse:
+        cp_num_heads = lse.shape[1] // cp_group.world_size
+        cp_rank = cp_group.rank_in_group
+        lse = lse[:, cp_num_heads * cp_rank : cp_num_heads * (cp_rank + 1)]
+        return out, lse
+    return out
+
+
+@triton.jit
+def _pack_seq_kernel(
+    x_ptr,  # [N, D]
+    out_ptr,  # [B, Lmax, D]
+    lengths_ptr,  # *i32, [B]
+    N: tl.constexpr,
+    D: tl.constexpr,
+    Lmax: tl.constexpr,
+    PAD_VALUE: tl.constexpr,
+    BLOCK_T: tl.constexpr,  # timesteps per program
+    BLOCK_D: tl.constexpr,  # features per program
+):
+    pid_b = tl.program_id(0)  # batch id
+    pid_t = tl.program_id(1)  # block over time dimension
+    pid_d = tl.program_id(2)  # block over feature dimension
+    off_t = pid_t * BLOCK_T + tl.arange(0, BLOCK_T)  # [BLOCK_T]
+    off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)  # [BLOCK_D]
+
+    # Compute start index and sequence length from cumulative lengths
+    in_start = 0
+    for i in range(pid_b):
+        in_start += tl.load(lengths_ptr + i)
+    seq_len = tl.load(lengths_ptr + pid_b)
+
+    # valid time positions for this block
+    t_mask = off_t < Lmax
+
+    # compute input row indices for valid (b, t)
+    in_row = in_start + off_t
+    valid_row = (off_t < seq_len) & t_mask
+
+    # Pointers
+    # x_ptr: row-major [N, D]
+    x_row_ptr = x_ptr + in_row[:, None] * D + off_d[None, :]
+
+    # out_ptr: row-major [B, Lmax, D]
+    out_row_ptr = out_ptr + (pid_b * Lmax + off_t)[:, None] * D + off_d[None, :]
+
+    # Initialize with PAD (cast will occur as needed based on out_ptr dtype)
+    d_mask = off_d[None, :] < D
+    pad_vals = tl.full([BLOCK_T, BLOCK_D], PAD_VALUE, tl.float32)
+    tl.store(out_row_ptr, pad_vals, mask=t_mask[:, None] & d_mask)
+
+    # Load & write only where within seq_len
+    x_vals = tl.load(x_row_ptr, mask=valid_row[:, None] & d_mask)
+    tl.store(out_row_ptr, x_vals, mask=valid_row[:, None] & d_mask)
+
+
+def pack_seq_triton(
+    x: torch.Tensor,
+    lengths: torch.Tensor,
+    pad_value: float = -float("inf"),
+    block_t: int = 64,
+    block_d: int = 64,
+) -> torch.Tensor:
+    """
+    Pack sequences of different lengths into a batched tensor.
+
+    Args:
+        x: [N, ...] - input tensor where N is total number of tokens
+        lengths: [B] - sequence lengths for each batch
+        pad_value: value to use for padding
+        block_t: block size for time dimension
+        block_d: block size for feature dimension
+
+    Returns:
+        packed: [B, Lmax, ...] - packed tensor
+    """
+
+    # Handle multi-dimensional input by reshaping to (N, -1)
+    original_shape = x.shape
+    if len(original_shape) > 2:
+        N = original_shape[0]
+        x_reshaped = x.reshape(N, -1)
+        D = x_reshaped.shape[1]
+    else:
+        N, D = x.shape
+        x_reshaped = x
+
+    B = lengths.numel()
+    Lmax = int(lengths.max().item())
+
+    # Starts are computed inside the kernel from lengths
+
+    out = torch.empty((B, Lmax, D), device=x.device, dtype=x.dtype)
+
+    grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d))
+    _pack_seq_kernel[grid](
+        x_reshaped,
+        out,
+        lengths.int(),
+        N,
+        D,
+        Lmax,
+        PAD_VALUE=float(pad_value),
+        BLOCK_T=block_t,
+        BLOCK_D=block_d,
+        num_warps=4,
+        num_stages=2,
+    )
+
+    # Reshape output back to original dimensions (except first dimension)
+    if len(original_shape) > 2:
+        output_shape = (B, Lmax) + original_shape[1:]
+        out = out.reshape(output_shape)
+
+    return out
+
+
+@triton.jit
+def _unpack_seq_triton_kernel(
+    packed_ptr,  # [B, Lmax, D]
+    out_ptr,  # [N, D]
+    lengths_ptr,  # *i32, [B]
+    B: tl.constexpr,
+    Lmax: tl.constexpr,
+    D: tl.constexpr,
+    BLOCK_T: tl.constexpr,  # timesteps per program
+    BLOCK_D: tl.constexpr,  # features per program
+):
+    pid_b = tl.program_id(0)  # batch id
+    pid_t = tl.program_id(1)  # block over time dimension
+    pid_d = tl.program_id(2)  # block over feature dimension
+    off_t = pid_t * BLOCK_T + tl.arange(0, BLOCK_T)  # [BLOCK_T]
+    off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)  # [BLOCK_D]
+
+    # bounds: compute start from cumulative lengths
+    in_start = 0
+    for i in range(pid_b):
+        in_start += tl.load(lengths_ptr + i)
+    seq_len = tl.load(lengths_ptr + pid_b)
+
+    # valid time positions for this block
+    t_mask = off_t < Lmax
+    valid_row = (off_t < seq_len) & t_mask
+
+    # compute output row indices for valid (b, t)
+    out_row = in_start + off_t
+
+    # Pointers
+    # packed_ptr: row-major [B, Lmax, D]
+    packed_row_ptr = packed_ptr + (pid_b * Lmax + off_t)[:, None] * D + off_d[None, :]
+
+    # out_ptr: row-major [N, D]
+    out_row_ptr = out_ptr + out_row[:, None] * D + off_d[None, :]
+
+    # Load from packed tensor and store to output
+    d_mask = off_d[None, :] < D
+    packed_vals = tl.load(packed_row_ptr, mask=valid_row[:, None] & d_mask)
+    tl.store(out_row_ptr, packed_vals, mask=valid_row[:, None] & d_mask)
+
+
+def unpack_seq_triton(
+    packed_tensor: torch.Tensor,
+    lengths: torch.Tensor,
+    block_t: int = 64,
+    block_d: int = 64,
+) -> torch.Tensor:
+    """
+    Unpack a packed decode query tensor back to the original format.
+    Efficient Triton implementation.
+
+    Args:
+        packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton
+        lengths: [B] - sequence lengths for each batch
+        block_t: block size for time dimension
+        block_d: block size for feature dimension
+
+    Returns:
+        unpacked_tensor: [N, ...] where N = sum(lengths)
+    """
+
+    # Handle multi-dimensional input by reshaping to (B, Lmax, -1)
+    original_shape = packed_tensor.shape
+    if len(original_shape) > 3:
+        B, Lmax = original_shape[:2]
+        packed_reshaped = packed_tensor.reshape(B, Lmax, -1)
+        D = packed_reshaped.shape[2]
+    else:
+        B, Lmax, D = packed_tensor.shape
+        packed_reshaped = packed_tensor
+
+    # Calculate total number of elements
+    N = int(lengths.sum().item())
+
+    out = torch.empty((N, D), device=packed_tensor.device, dtype=packed_tensor.dtype)
+
+    grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d))
+    _unpack_seq_triton_kernel[grid](
+        packed_reshaped,
+        out,
+        lengths.int(),
+        B,
+        Lmax,
+        D,
+        BLOCK_T=block_t,
+        BLOCK_D=block_d,
+        num_warps=4,
+        num_stages=2,
+    )
+
+    # Reshape output back to original dimensions (except first dimension)
+    if len(original_shape) > 3:
+        output_shape = (N,) + original_shape[2:]
+        out = out.reshape(output_shape)
+
     return out
 
 
diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py
index cb54bfa51d989e94d2fd168120a769c3544be872..499e145ddbb74dac46eed5f82c7bc97c705fdf8f 100644
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py
-from typing import Optional, Tuple
 
 import torch
 
@@ -13,6 +12,7 @@ logger = init_logger(__name__)
 if current_platform.is_cuda():
     try:
         import vllm._flashmla_C  # noqa: F401
+
         _flashmla_C_AVAILABLE = True
     except ImportError:
         _flashmla_C_AVAILABLE = False
@@ -33,58 +33,99 @@ if current_platform.is_rocm():
     _flashmla_C_AVAILABLE = True
 
 
-def is_flashmla_supported() -> Tuple[bool, Optional[str]]:
+def _is_flashmla_available() -> tuple[bool, str | None]:
+    if not _flashmla_C_AVAILABLE:
+        return (
+            False,
+            "vllm._flashmla_C is not available, likely was not "
+            "compiled due to insufficient nvcc version or a supported arch "
+            "was not in the list of target arches to compile for.",
+        )
+    if not _flashmla_extension_C_AVAILABLE:
+        return (
+            False,
+            "vllm._flashmla_extension_C is not available, likely "
+            "was not compiled due to a build error.",
+        )
+
+    return True, None
+
+
+def is_flashmla_dense_supported() -> tuple[bool, str | None]:
     """
     Return: is_supported_flag, unsupported_reason (optional).
     """
-    if not (current_platform.is_cuda() or current_platform.is_rocm()):
-        return False, "FlashMLA is supported on CUDA and ROCM devices."
+    is_availble, maybe_reason = _is_flashmla_available()
+    if not is_availble:
+        return False, maybe_reason
     if current_platform.get_device_capability()[0] != 9:
-        return False, "FlashMLA is only supported on Hopper devices."
-    if not _flashmla_C_AVAILABLE:
-        return False, "vllm._flashmla_C is not available, likely was not "\
-            "compiled due to insufficient nvcc version or a supported arch "\
-            "(only sm90a currently) was not in the list of target arches to "\
-            "compile for."
+        return False, "FlashMLA Dense is only supported on Hopper devices."
+    return True, None
+
+
+def is_flashmla_sparse_supported() -> tuple[bool, str | None]:
+    """
+    Return: is_supported_flag, unsupported_reason (optional).
+    """
+    is_availble, maybe_reason = _is_flashmla_available()
+    if not is_availble:
+        return False, maybe_reason
+    if current_platform.get_device_capability()[0] not in (9, 10):
+        return (
+            False,
+            "FlashMLA Sparse is only supported on Hopper and Blackwell devices.",
+        )
     return True, None
 
 
 def get_mla_metadata(
-        cache_seqlens: torch.Tensor,
-        num_q_tokens_per_head_k: int,
-        num_heads_k: int,
-        num_heads_q: Optional[int] = None,
-        is_fp8_kvcache: bool = False,
-        topk: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+    cache_seqlens: torch.Tensor,
+    num_q_tokens_per_head_k: int,
+    num_heads_k: int,
+    num_heads_q: int | None = None,
+    is_fp8_kvcache: bool = False,
+    topk: int | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Arguments:
     - cache_seqlens: (batch_size), dtype torch.int32.
-    - num_q_tokens_per_head_k: 
+    - num_q_tokens_per_head_k:
             Equals to num_q_tokens_per_q_seq * num_heads_q // num_heads_k.
     - num_heads_k: The number of k heads.
-    - num_heads_q: 
-            The number of q heads. 
+    - num_heads_q:
+            The number of q heads.
             This argument is optional when sparse attention is not enabled
     - is_fp8_kvcache: Whether the k_cache and v_cache are in fp8 format.
-    - topk: If not None, sparse attention will be enabled, 
-            and only tokens in the `indices` array 
+    - topk: If not None, sparse attention will be enabled,
+            and only tokens in the `indices` array
             passed to `flash_mla_with_kvcache_sm90` will be attended to.
 
     Returns:
-    - tile_scheduler_metadata: 
+    - tile_scheduler_metadata:
             (num_sm_parts, TileSchedulerMetaDataSize), dtype torch.int32.
     - num_splits: (batch_size + 1), dtype torch.int32.
     """
-        
+    if is_fp8_kvcache and topk is None:
+        return torch.ops._flashmla_extension_C.get_mla_decoding_metadata_dense_fp8(
+            cache_seqlens,
+            num_q_tokens_per_head_k,
+            num_heads_k,
+        )
     if current_platform.is_rocm():
-        return flash_mla_cuda.get_mla_metadata(cache_seqlens,
-                                num_q_tokens_per_head_k,
-                                num_heads_k)
+        return flash_mla_cuda.get_mla_metadata(
+            cache_seqlens,
+            num_q_tokens_per_head_k,
+            num_heads_k,
+        )
     else:
         return torch.ops._flashmla_C.get_mla_decoding_metadata(
-        cache_seqlens, num_q_tokens_per_head_k, num_heads_k, num_heads_q,
-        is_fp8_kvcache, topk)
-
+        cache_seqlens,
+        num_q_tokens_per_head_k,
+        num_heads_k,
+        num_heads_q,
+        is_fp8_kvcache,
+        topk,
+    )
 
 
 def flash_mla_with_kvcache(
@@ -95,13 +136,13 @@ def flash_mla_with_kvcache(
     head_dim_v: int,
     tile_scheduler_metadata: torch.Tensor,
     num_splits: torch.Tensor,
-    softmax_scale: Optional[float] = None,
+    softmax_scale: float | None = None,
     causal: bool = False,
-    descale_q: Optional[torch.Tensor] = None,
-    descale_k: Optional[torch.Tensor] = None,
+    descale_q: torch.Tensor | None = None,
+    descale_k: torch.Tensor | None = None,
     is_fp8_kvcache: bool = False,
-    indices: Optional[torch.Tensor] = None,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+    indices: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Arguments:
     - q: (batch_size, seq_len_q, num_heads_q, head_dim).
@@ -109,26 +150,26 @@ def flash_mla_with_kvcache(
     - block_table: (batch_size, max_num_blocks_per_seq), torch.int32.
     - cache_seqlens: (batch_size), torch.int32.
     - head_dim_v: Head dimension of v.
-    - tile_scheduler_metadata: 
-        (num_sm_parts, TileSchedulerMetaDataSize), torch.int32, 
+    - tile_scheduler_metadata:
+        (num_sm_parts, TileSchedulerMetaDataSize), torch.int32,
         returned by get_mla_metadata.
-    - num_splits: 
+    - num_splits:
         (batch_size + 1), torch.int32, returned by get_mla_metadata.
-    - softmax_scale: float. 
-        The scale of QK^T before applying softmax. 
+    - softmax_scale: float.
+        The scale of QK^T before applying softmax.
         Default to 1 / sqrt(head_dim).
     - causal: bool. Whether to apply causal attention mask.
-    - descale_q: (batch_size), 
+    - descale_q: (batch_size),
         torch.float32. Descaling factors for Q, used for fp8 quantization.
-    - descale_k: (batch_size), 
+    - descale_k: (batch_size),
         torch.float32. Descaling factors for K, used for fp8 quantization.
-    - is_fp8_kvcache: bool. 
-        Whether the k_cache and v_cache are in fp8 format. 
+    - is_fp8_kvcache: bool.
+        Whether the k_cache and v_cache are in fp8 format.
         For the format of FP8 KV cache, please refer to README.md
-    - indices: (batch_size, seq_len_q, topk), torch.int32. 
-        If not None, sparse attention will be enabled, 
-        and only tokens in the `indices` array will be attended to. 
-        Invalid indices should be set to -1 or numbers >= total_seq_len_kv. 
+    - indices: (batch_size, seq_len_q, topk), torch.int32.
+        If not None, sparse attention will be enabled,
+        and only tokens in the `indices` array will be attended to.
+        Invalid indices should be set to -1 or numbers >= total_seq_len_kv.
         For details about how to set up `indices`, please refer to README.md.
 
     Returns:
@@ -136,31 +177,57 @@ def flash_mla_with_kvcache(
     - softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
     """
     if softmax_scale is None:
-        softmax_scale = q.shape[-1]**(-0.5)
+        softmax_scale = q.shape[-1] ** (-0.5)
     if indices is not None:
         # NOTE (zyongye): sparse attention is also causal
         # since it only attend to the tokens before
         # but here `causal` should not be specified
-        assert not causal, \
-            "causal must be `false` if sparse attention is enabled."
-    assert (descale_q is None) == (
-        descale_k is None
-    ), "descale_q and descale_k should be both None or both not None"
+        assert not causal, "causal must be `false` if sparse attention is enabled."
+    assert (descale_q is None) == (descale_k is None), (
+        "descale_q and descale_k should be both None or both not None"
+    )
 
     if indices is None and q.element_size() == 1:
         out, softmax_lse = torch.ops._flashmla_extension_C.fwd_kvcache_mla_fp8(
-            q, k_cache, head_dim_v, cache_seqlens, block_table, softmax_scale,
-            causal, tile_scheduler_metadata, num_splits, descale_q, descale_k)
+            q,
+            k_cache,
+            head_dim_v,
+            cache_seqlens,
+            block_table,
+            softmax_scale,
+            causal,
+            tile_scheduler_metadata,
+            num_splits,
+            descale_q,
+            descale_k,
+        )
     else:
         if current_platform.is_rocm():
             out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
-                q, k_cache, None, head_dim_v, cache_seqlens, block_table, softmax_scale,
-                causal, tile_scheduler_metadata, num_splits)
+                q, 
+                k_cache, 
+                None, 
+                head_dim_v, 
+                cache_seqlens, 
+                block_table, 
+                softmax_scale,
+                causal, 
+                tile_scheduler_metadata, 
+                num_splits)
         else:
             out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(
-                q, k_cache, head_dim_v, cache_seqlens, block_table, softmax_scale,
-                causal, tile_scheduler_metadata, num_splits, is_fp8_kvcache,
-                indices)
+            q,
+            k_cache,
+            head_dim_v,
+            cache_seqlens,
+            block_table,
+            softmax_scale,
+            causal,
+            tile_scheduler_metadata,
+            num_splits,
+            is_fp8_kvcache,
+            indices,
+        )
     return out, softmax_lse
 
 
@@ -170,28 +237,27 @@ def flash_mla_sparse_prefill(
     indices: torch.Tensor,
     sm_scale: float,
     d_v: int = 512,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Sparse attention prefill kernel
 
     Args:
     - q: [s_q, h_q, d_qk], bfloat16
     - kv: [s_kv, h_kv, d_qk], bfloat16
-    - indices: [s_q, h_kv, topk], int32. 
+    - indices: [s_q, h_kv, topk], int32.
         Invalid indices should be set to -1 or numbers >= s_kv
     - sm_scale: float
     - d_v: The dimension of value vectors. Can only be 512
 
     Returns:
     - (output, max_logits, lse)
-        About the definition of output, 
+        About the definition of output,
         max_logits and lse, please refer to README.md
     - output: [s_q, h_q, d_v], bfloat16
     - max_logits:  [s_q, h_q], float
     - lse: [s_q, h_q], float, 2-based log-sum-exp
     """
-    results = torch.ops._flashmla_C.sparse_prefill_fwd(q, kv, indices,
-                                                       sm_scale, d_v)
+    results = torch.ops._flashmla_C.sparse_prefill_fwd(q, kv, indices, sm_scale, d_v)
     return results
 
 
diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py
index 5cb1a47394cf6bb04ec13a8810e54143ba167c53..16106f3c93a6a99626fa8275829ba4241d1d95e2 100644
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/attention/ops/merge_attn_states.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -13,9 +12,8 @@ def merge_attn_states(
     prefix_lse: torch.Tensor,
     suffix_output: torch.Tensor,
     suffix_lse: torch.Tensor,
-    output_lse: Optional[torch.Tensor] = None,
+    output_lse: torch.Tensor | None = None,
 ) -> None:
-
     # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
     # is not support for FP8 dtype, fallback to use Triton kernel.
     def supported_dtypes(o: torch.Tensor) -> bool:
@@ -31,13 +29,19 @@ def merge_attn_states(
             return headdim % 4 == 0
         return headdim % 8 == 0
 
-    if (current_platform.is_cuda() and supported_dtypes(output)
-            and supported_headdim(output)):
+    if (
+        current_platform.is_cuda()
+        and supported_dtypes(output)
+        and supported_headdim(output)
+    ):
         from vllm._custom_ops import merge_attn_states
-        return merge_attn_states(output, prefix_output, prefix_lse,
-                                 suffix_output, suffix_lse, output_lse)
+
+        return merge_attn_states(
+            output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
+        )
     else:
-        from vllm.attention.ops.triton_merge_attn_states import (
-            merge_attn_states)
-        return merge_attn_states(output, prefix_output, prefix_lse,
-                                 suffix_output, suffix_lse, output_lse)
+        from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
+
+        return merge_attn_states(
+            output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
+        )
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 539b57e41de7c40a288bb3e6f85d1c1d71378941..8e010ffba32ecc02d1cb9a95e911be10e437a054 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import List, Optional, Tuple
 
 import torch
 
@@ -24,9 +23,10 @@ _PARTITION_SIZE = 512
 @dataclass
 class PagedAttentionMetadata:
     """Metadata for PagedAttention."""
+
     # (batch_size,). The length of sequences (entire tokens seen so far) per
     # sequence.
-    seq_lens_tensor: Optional[torch.Tensor]
+    seq_lens_tensor: torch.Tensor | None
     # Maximum sequence length in the batch. 0 if it is prefill-only batch.
     max_decode_seq_len: int
     # (batch_size, max_blocks_per_seq).
@@ -35,13 +35,12 @@ class PagedAttentionMetadata:
     # in the kv cache. Each block can contain up to block_size tokens.
     # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
     # captured.
-    block_tables: Optional[torch.Tensor]
+    block_tables: torch.Tensor | None
 
 
 class PagedAttention:
-
     @staticmethod
-    def get_supported_head_sizes() -> List[int]:
+    def get_supported_head_sizes() -> list[int]:
         return [32, 64, 80, 96, 112, 120, 128, 192, 256]
 
     @staticmethod
@@ -51,7 +50,7 @@ class PagedAttention:
         num_kv_heads: int,
         head_size: int,
         cache_dtype_str: str = "auto",
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (2, num_blocks, block_size * num_kv_heads * head_size)
 
     @staticmethod
@@ -59,13 +58,12 @@ class PagedAttention:
         kv_cache: torch.Tensor,
         num_kv_heads: int,
         head_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         x = 16 // kv_cache.element_size()
         num_blocks = kv_cache.shape[1]
 
         key_cache = kv_cache[0]
-        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
-                                   -1, x)
+        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, -1, x)
         value_cache = kv_cache[1]
         value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
         return key_cache, value_cache
@@ -103,7 +101,7 @@ class PagedAttention:
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         tp_rank: int = 0,
@@ -115,16 +113,17 @@ class PagedAttention:
         if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1:
             # use blocksparse paged attention
             block_size = value_cache.size(-1)
-            assert (blocksparse_block_size > 0 and
-                    blocksparse_block_size % block_size == 0), \
-                (f"{blocksparse_block_size=} needs to be a multiple of"
-                 f"{block_size=} used in block_tables.")
+            assert (
+                blocksparse_block_size > 0 and blocksparse_block_size % block_size == 0
+            ), (
+                f"{blocksparse_block_size=} needs to be a multiple of"
+                f"{block_size=} used in block_tables."
+            )
 
         output = torch.empty_like(query)
         block_size = value_cache.shape[3]
         num_seqs, num_heads, head_size = query.shape
-        max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
-                              _PARTITION_SIZE)
+        max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
         # NOTE(woosuk): We use a simple heuristic to decide whether to use
         # PagedAttention V1 or V2. If the number of partitions is 1, we use
         # V1 to avoid the overhead of reduction. Also, if the number of
@@ -132,8 +131,9 @@ class PagedAttention:
         # to parallelize.
         # TODO(woosuk): Tune this heuristic.
         # For context len > 8192, use V2 kernel to avoid shared memory shortage.
-        use_v1 = (max_seq_len <= 8192
-                  and (max_num_partitions == 1 or num_seqs * num_heads > 512))
+        use_v1 = max_seq_len <= 8192 and (
+            max_num_partitions == 1 or num_seqs * num_heads > 512
+        )
 
         if use_v1:
             # Run PagedAttention V1.
@@ -210,8 +210,8 @@ class PagedAttention:
         query_start_loc: torch.Tensor,
         seq_lens_tensor: torch.Tensor,
         max_query_len: int,
-        alibi_slopes: Optional[torch.Tensor],
-        sliding_window: Optional[int],
+        alibi_slopes: torch.Tensor | None,
+        sliding_window: int | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
     ) -> torch.Tensor:
@@ -254,7 +254,7 @@ class PagedAttention:
 
     @staticmethod
     def copy_blocks(
-        kv_caches: List[torch.Tensor],
+        kv_caches: list[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
         key_caches = [kv_cache[0] for kv_cache in kv_caches]
diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/attention/ops/pallas_kv_cache_update.py
index d75983bd407d08627642878bcdfa0451cfce0640..51214b02271af919a43aef516d112f18a358086e 100644
--- a/vllm/attention/ops/pallas_kv_cache_update.py
+++ b/vllm/attention/ops/pallas_kv_cache_update.py
@@ -7,7 +7,7 @@ import jax
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 
 def _kv_cache_update_kernel(
@@ -33,10 +33,12 @@ def _kv_cache_update_kernel(
     # Copy from new_kv_hbm_ref to scratch
     for i in range(num_slices_per_block):
         offset_i = i + block_idx * num_slices_per_block
-        new_kv_start = jax.lax.select(offset_i < num_slices_ref[0],
-                                      slices_ref[1, offset_i], 0)
-        length = jax.lax.select(offset_i < num_slices_ref[0],
-                                slices_ref[2, offset_i], 0)
+        new_kv_start = jax.lax.select(
+            offset_i < num_slices_ref[0], slices_ref[1, offset_i], 0
+        )
+        length = jax.lax.select(
+            offset_i < num_slices_ref[0], slices_ref[2, offset_i], 0
+        )
         async_copy = pltpu.make_async_copy(
             new_kv_hbm_ref.at[pl.ds(new_kv_start, length), ...],
             scratch.at[i, pl.ds(0, length), ...],
@@ -52,10 +54,12 @@ def _kv_cache_update_kernel(
     async_copies.clear()
     for i in range(num_slices_per_block):
         offset_i = i + block_idx * num_slices_per_block
-        kv_cache_start = jax.lax.select(offset_i < num_slices_ref[0],
-                                        slices_ref[0, offset_i], 0)
-        length = jax.lax.select(offset_i < num_slices_ref[0],
-                                slices_ref[2, offset_i], 0)
+        kv_cache_start = jax.lax.select(
+            offset_i < num_slices_ref[0], slices_ref[0, offset_i], 0
+        )
+        length = jax.lax.select(
+            offset_i < num_slices_ref[0], slices_ref[2, offset_i], 0
+        )
         async_copy = pltpu.make_async_copy(
             scratch.at[i, pl.ds(0, length), ...],
             kv_cache_hbm_ref.at[pl.ds(kv_cache_start, length), ...],
@@ -72,12 +76,14 @@ def _kv_cache_update_kernel(
     static_argnames=["page_size", "num_slices_per_block"],
 )
 def kv_cache_update(
-    new_kv: jax.Array,  # [total_num_token, num_combined_kv_heads, head_dim]
-    slices: jax.
-    Array,  # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
-    kv_cache: jax.
-    Array,  # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
-    num_kv_update_slices: jax.Array,  # [1]
+    # [total_num_token, num_combined_kv_heads, head_dim]
+    new_kv: jax.Array,
+    # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
+    slices: jax.Array,
+    # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
+    kv_cache: jax.Array,
+    # [1]
+    num_kv_update_slices: jax.Array,
     *,
     page_size: int = 32,
     num_slices_per_block: int = 8,
@@ -114,7 +120,7 @@ def kv_cache_update(
             num_scalar_prefetch=len(scalar_prefetches),
             in_specs=in_specs,
             out_specs=out_specs,
-            grid=(cdiv(num_kv_update_slices[0], num_slices_per_block), ),
+            grid=(cdiv(num_kv_update_slices[0], num_slices_per_block),),
             scratch_shapes=scratch_shapes,
         ),
         out_shape=out_shape,
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 7e5c2b6c62e9be8d0b70cf70f316dc46b4644521..f101d5c4a9278956ca543e883cc6696dd1835f8a 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -34,62 +34,63 @@ float8_info = torch.finfo(current_platform.fp8_dtype())
 #     key=["BLOCK_SIZE", "MAX_Q_LEN", "MAX_CTX_LEN"]
 # )
 @triton.jit
-def _fwd_kernel(Q,
-                K,
-                V,
-                K_cache,
-                V_cache,
-                sink_ptr,
-                B_Loc,
-                sm_scale,
-                k_scale,
-                v_scale,
-                out_scale_inv,
-                B_Start_Loc,
-                B_Seqlen,
-                x: tl.constexpr,
-                Out,
-                stride_b_loc_b,
-                stride_b_loc_s,
-                stride_qbs,
-                stride_qh,
-                stride_qd,
-                stride_kbs,
-                stride_kh,
-                stride_kd,
-                stride_vbs,
-                stride_vh,
-                stride_vd,
-                stride_obs,
-                stride_oh,
-                stride_od,
-                stride_k_cache_bs,
-                stride_k_cache_h,
-                stride_k_cache_d,
-                stride_k_cache_bl: tl.constexpr,
-                stride_k_cache_x,
-                stride_v_cache_bs,
-                stride_v_cache_h,
-                stride_v_cache_d,
-                stride_v_cache_bl,
-                num_queries_per_kv: tl.constexpr,
-                IN_PRECISION: tl.constexpr,
-                BLOCK_M: tl.constexpr,
-                BLOCK_DMODEL: tl.constexpr,
-                BLOCK_DMODEL_PADDED: tl.constexpr,
-                BLOCK_SIZE: tl.constexpr,
-                BLOCK_N: tl.constexpr,
-                SLIDING_WINDOW: tl.constexpr,
-                num_unroll_cache: tl.constexpr,
-                num_unroll_request: tl.constexpr,
-                SKIP_DECODE: tl.constexpr,
-                USE_SINKS: tl.constexpr,
-                USE_FP8: tl.constexpr,
-                MAX_Q_LEN: tl.constexpr = 0,
-                MAX_CTX_LEN: tl.constexpr = 0,
-                FP8_MIN: tl.constexpr = float8_info.min,
-                FP8_MAX: tl.constexpr = float8_info.max):
-
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    K_cache,
+    V_cache,
+    sink_ptr,
+    B_Loc,
+    sm_scale,
+    k_scale,
+    v_scale,
+    out_scale_inv,
+    B_Start_Loc,
+    B_Seqlen,
+    x: tl.constexpr,
+    Out,
+    stride_b_loc_b,
+    stride_b_loc_s,
+    stride_qbs,
+    stride_qh,
+    stride_qd,
+    stride_kbs,
+    stride_kh,
+    stride_kd,
+    stride_vbs,
+    stride_vh,
+    stride_vd,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    stride_k_cache_bs,
+    stride_k_cache_h,
+    stride_k_cache_d,
+    stride_k_cache_bl: tl.constexpr,
+    stride_k_cache_x,
+    stride_v_cache_bs,
+    stride_v_cache_h,
+    stride_v_cache_d,
+    stride_v_cache_bl,
+    num_queries_per_kv: tl.constexpr,
+    IN_PRECISION: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DMODEL_PADDED: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SLIDING_WINDOW: tl.constexpr,
+    num_unroll_cache: tl.constexpr,
+    num_unroll_request: tl.constexpr,
+    SKIP_DECODE: tl.constexpr,
+    USE_SINKS: tl.constexpr,
+    USE_FP8: tl.constexpr,
+    MAX_Q_LEN: tl.constexpr = 0,
+    MAX_CTX_LEN: tl.constexpr = 0,
+    FP8_MIN: tl.constexpr = float8_info.min,
+    FP8_MAX: tl.constexpr = float8_info.max,
+):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
     start_m = tl.program_id(2)
@@ -99,8 +100,7 @@ def _fwd_kernel(Q,
     cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
     cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
     cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1)
-    cur_batch_query_len = (cur_batch_in_all_stop_index -
-                           cur_batch_in_all_start_index)
+    cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index
     cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len
 
     if SKIP_DECODE and cur_batch_query_len == 1:
@@ -120,17 +120,21 @@ def _fwd_kernel(Q,
     # [M]; starts at current position in query
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
     # [M,D]
-    off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-             cur_head * stride_qh + offs_d[None, :] * stride_qd)
-
-    dim_mask = tl.where(
-        tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1,
-        0).to(tl.int1)  # [D]
-
-    q = tl.load(Q + off_q,
-                mask=dim_mask[None, :] &
-                (offs_m[:, None] < cur_batch_query_len),
-                other=0.0)  # [M,D]
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :] * stride_qd
+    )
+
+    dim_mask = tl.where(tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(
+        tl.int1
+    )  # [D]
+
+    q = tl.load(
+        Q + off_q,
+        mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len),
+        other=0.0,
+    )  # [M,D]
 
     # initialize pointer to m and l
     if not USE_SINKS:
@@ -146,32 +150,43 @@ def _fwd_kernel(Q,
     acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32)  # [M,D]
 
     # compute query against context (no causal mask here)
-    for start_n in tl.range(0, cur_batch_ctx_len, BLOCK_SIZE, \
-                            loop_unroll_factor=num_unroll_cache):
+    for start_n in tl.range(
+        0, cur_batch_ctx_len, BLOCK_SIZE, loop_unroll_factor=num_unroll_cache
+    ):
         start_n = tl.multiple_of(start_n, BLOCK_SIZE)
         # -- compute qk ----
-        bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                     (start_n // BLOCK_SIZE) * stride_b_loc_s).to(tl.int64)
+        bn = tl.load(
+            B_Loc
+            + cur_batch * stride_b_loc_b
+            + (start_n // BLOCK_SIZE) * stride_b_loc_s
+        ).to(tl.int64)
         # [D,BLOCK_SIZE]
         off_k = (
-            bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
-            (offs_d[:, None] // x) * stride_k_cache_d +
-            ((start_n + offs_bs_n[None, :]) % BLOCK_SIZE) * stride_k_cache_bl +
-            (offs_d[:, None] % x) * stride_k_cache_x)
+            bn[None, :] * stride_k_cache_bs
+            + cur_kv_head * stride_k_cache_h
+            + (offs_d[:, None] // x) * stride_k_cache_d
+            + ((start_n + offs_bs_n[None, :]) % BLOCK_SIZE) * stride_k_cache_bl
+            + (offs_d[:, None] % x) * stride_k_cache_x
+        )
 
         # [BLOCK_SIZE,D]
-        off_v = (bn[:, None] * stride_v_cache_bs +
-                 cur_kv_head * stride_v_cache_h +
-                 offs_d[None, :] * stride_v_cache_d +
-                 offs_bs_n[:, None] * stride_v_cache_bl)
+        off_v = (
+            bn[:, None] * stride_v_cache_bs
+            + cur_kv_head * stride_v_cache_h
+            + offs_d[None, :] * stride_v_cache_d
+            + offs_bs_n[:, None] * stride_v_cache_bl
+        )
 
-        if start_n + BLOCK_SIZE > cur_batch_ctx_len or \
-            BLOCK_DMODEL != BLOCK_DMODEL_PADDED:
+        if (
+            start_n + BLOCK_SIZE > cur_batch_ctx_len
+            or BLOCK_DMODEL != BLOCK_DMODEL_PADDED
+        ):
             k_load = tl.load(
                 K_cache + off_k,
-                mask=dim_mask[:, None] &
-                ((start_n + offs_bs_n[None, :]) < cur_batch_ctx_len),
-                other=0.0)  # [D,N]
+                mask=dim_mask[:, None]
+                & ((start_n + offs_bs_n[None, :]) < cur_batch_ctx_len),
+                other=0.0,
+            )  # [D,N]
         else:
             k_load = tl.load(K_cache + off_k)
 
@@ -182,8 +197,9 @@ def _fwd_kernel(Q,
 
         qk = tl.zeros([BLOCK_M, BLOCK_SIZE], dtype=tl.float32)  # [M,N]
         qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
-        qk = tl.where((start_n + offs_bs_n[None, :]) < cur_batch_ctx_len, qk,
-                      float("-inf"))
+        qk = tl.where(
+            (start_n + offs_bs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")
+        )
         qk *= sm_scale
         if SLIDING_WINDOW > 0:
             # (cur_batch_ctx_len + offs_m[:, None]) are the positions of
@@ -197,9 +213,12 @@ def _fwd_kernel(Q,
             # sliding window may lead to the entire row being masked.
             # This then makes m_ij contain -inf, which causes NaNs in
             # exp().
-            qk = tl.where((cur_batch_ctx_len + offs_m[:, None]) -
-                          (start_n + offs_bs_n[None, :]) < SLIDING_WINDOW, qk,
-                          -10000)
+            qk = tl.where(
+                (cur_batch_ctx_len + offs_m[:, None]) - (start_n + offs_bs_n[None, :])
+                < SLIDING_WINDOW,
+                qk,
+                -10000,
+            )
 
         # compute running maximum
         m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
@@ -209,13 +228,16 @@ def _fwd_kernel(Q,
         acc = acc * alpha[:, None]
 
         # update acc
-        if start_n + BLOCK_SIZE > cur_batch_ctx_len or \
-            BLOCK_DMODEL != BLOCK_DMODEL_PADDED:
+        if (
+            start_n + BLOCK_SIZE > cur_batch_ctx_len
+            or BLOCK_DMODEL != BLOCK_DMODEL_PADDED
+        ):
             v_load = tl.load(
                 V_cache + off_v,
-                mask=dim_mask[None, :] &
-                ((start_n + offs_bs_n[:, None]) < cur_batch_ctx_len),
-                other=0.0)  # [N,D]
+                mask=dim_mask[None, :]
+                & ((start_n + offs_bs_n[:, None]) < cur_batch_ctx_len),
+                other=0.0,
+            )  # [N,D]
         else:
             v_load = tl.load(V_cache + off_v)
 
@@ -230,10 +252,16 @@ def _fwd_kernel(Q,
         l_i = l_i * alpha + l_ij
         m_i = m_ij
 
-    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-             offs_d[:, None] * stride_kd)
-    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-             offs_d[None, :] * stride_vd)
+    off_k = (
+        offs_n[None, :] * stride_kbs
+        + cur_kv_head * stride_kh
+        + offs_d[:, None] * stride_kd
+    )
+    off_v = (
+        offs_n[:, None] * stride_vbs
+        + cur_kv_head * stride_vh
+        + offs_d[None, :] * stride_vd
+    )
     k_ptrs = K + off_k
     v_ptrs = V + off_v
 
@@ -241,27 +269,32 @@ def _fwd_kernel(Q,
     block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)
 
     # compute query against itself (with causal mask)
-    for start_n in tl.range(0, \
-                        block_mask * (start_m + 1) * BLOCK_M, BLOCK_N, \
-                        loop_unroll_factor=num_unroll_request):
+    for start_n in tl.range(
+        0,
+        block_mask * (start_m + 1) * BLOCK_M,
+        BLOCK_N,
+        loop_unroll_factor=num_unroll_request,
+    ):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
-        k = tl.load(k_ptrs +
-                    (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                    mask=dim_mask[:, None] &
-                    ((start_n + offs_n[None, :]) < cur_batch_query_len),
-                    other=0.0)
+        k = tl.load(
+            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
+            mask=dim_mask[:, None]
+            & ((start_n + offs_n[None, :]) < cur_batch_query_len),
+            other=0.0,
+        )
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
         qk *= sm_scale
         # apply causal mask
-        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                      float("-inf"))
+        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf"))
         if SLIDING_WINDOW > 0:
             qk = tl.where(
                 offs_m[:, None] - (start_n + offs_n[None, :]) < SLIDING_WINDOW,
-                qk, -10000)
+                qk,
+                -10000,
+            )
 
         # compute running maximum
         m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
@@ -271,11 +304,12 @@ def _fwd_kernel(Q,
         acc = acc * alpha[:, None]
 
         # update acc
-        v = tl.load(v_ptrs +
-                    (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                    mask=dim_mask[None, :] &
-                    ((start_n + offs_n[:, None]) < cur_batch_query_len),
-                    other=0.0)
+        v = tl.load(
+            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
+            mask=dim_mask[None, :]
+            & ((start_n + offs_n[:, None]) < cur_batch_query_len),
+            other=0.0,
+        )
         p = p.to(v.dtype)
 
         acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION)
@@ -286,201 +320,18 @@ def _fwd_kernel(Q,
     acc = acc / l_i[:, None]
 
     # initialize pointers to output
-    off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-             cur_head * stride_oh + offs_d[None, :] * stride_od)
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
+        + cur_head * stride_oh
+        + offs_d[None, :] * stride_od
+    )
     out_ptrs = Out + off_o
     if USE_FP8:
         acc = acc * tl.load(out_scale_inv)
         acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
-    tl.store(out_ptrs,
-             acc,
-             mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len))
-    return
-
-
-@triton.jit
-def _fwd_kernel_flash_attn_v2(
-    Q,
-    K,
-    V,
-    K_cache,
-    V_cache,
-    B_Loc,
-    sm_scale,
-    B_Start_Loc,
-    B_Seqlen,
-    B_Ctxlen,
-    block_size,
-    x,
-    Out,
-    stride_b_loc_b,
-    stride_b_loc_s,
-    stride_qbs,
-    stride_qh,
-    stride_qd,
-    stride_kbs,
-    stride_kh,
-    stride_kd,
-    stride_vbs,
-    stride_vh,
-    stride_vd,
-    stride_obs,
-    stride_oh,
-    stride_od,
-    stride_k_cache_bs,
-    stride_k_cache_h,
-    stride_k_cache_d,
-    stride_k_cache_bl,
-    stride_k_cache_x,
-    stride_v_cache_bs,
-    stride_v_cache_h,
-    stride_v_cache_d,
-    stride_v_cache_bl,
-    num_queries_per_kv: int,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-):
-    cur_batch = tl.program_id(0)
-    cur_head = tl.program_id(1)
-    start_m = tl.program_id(2)
-
-    cur_kv_head = cur_head // num_queries_per_kv
-
-    cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
-    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
-    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
-
-    block_start_loc = BLOCK_M * start_m
-
-    # initialize offsets
-    offs_n = tl.arange(0, BLOCK_N)
-    offs_d = tl.arange(0, BLOCK_DMODEL)
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-             cur_head * stride_qh + offs_d[None, :] * stride_qd)
-
-    q = tl.load(Q + off_q,
-                mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-                other=0.0)
-
-    # # initialize pointer to m and l
-    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-
-    for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
-        start_n = tl.multiple_of(start_n, BLOCK_N)
-        # -- compute qk ----
-        bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                     ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                     mask=(start_n + offs_n) < cur_batch_ctx_len,
-                     other=0).to(tl.int64)
-        off_k = (
-            bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
-            (offs_d[:, None] // x) * stride_k_cache_d +
-            ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl +
-            (offs_d[:, None] % x) * stride_k_cache_x)
-        off_v = (bn[:, None] * stride_v_cache_bs +
-                 cur_kv_head * stride_v_cache_h +
-                 offs_d[None, :] * stride_v_cache_d +
-                 (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
-        k = tl.load(K_cache + off_k,
-                    mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
-                    other=0.0)
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
-        qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                      float("-inf"))
-        qk *= sm_scale
-
-        # -- compute m_ij, p, l_ij
-        m_ij = tl.max(qk, 1)
-        m_i_new = tl.maximum(m_i, m_ij)
-        p = tl.math.exp(qk - m_i_new[:, None])
-        l_ij = tl.sum(p, 1)
-        # -- update m_i and l_i
-
-        alpha = tl.math.exp(m_i - m_i_new)
-        l_i_new = alpha * l_i + l_ij
-        # -- update output accumulator --
-        # scale p
-        # scale acc
-        acc_scale = alpha
-        # acc_scale = l_i / l_i_new * alpha
-        acc = acc * acc_scale[:, None]
-        # update acc
-        v = tl.load(V_cache + off_v,
-                    mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
-                    other=0.0)
-
-        p = p.to(v.dtype)
-        acc += tl.dot(p, v)
-        # update m_i and l_i
-        l_i = l_i_new
-        m_i = m_i_new
-
-    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-             offs_d[:, None] * stride_kd)
-    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-             offs_d[None, :] * stride_vd)
-    k_ptrs = K + off_k
-    v_ptrs = V + off_v
-
-    block_mask = tl.where(
-        block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
-
-    for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
-        start_n = tl.multiple_of(start_n, BLOCK_N)
-        # -- compute qk ----
-        k = tl.load(k_ptrs +
-                    (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                    mask=(start_n + offs_n[None, :])
-                    < cur_batch_seq_len - cur_batch_ctx_len,
-                    other=0.0)
-
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
-        qk *= sm_scale
-        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                      float("-inf"))
-
-        # -- compute m_ij, p, l_ij
-        m_ij = tl.max(qk, 1)
-        m_i_new = tl.maximum(m_i, m_ij)
-        p = tl.math.exp(qk - m_i_new[:, None])
-        l_ij = tl.sum(p, 1)
-        # -- update m_i and l_i
-
-        alpha = tl.math.exp(m_i - m_i_new)
-        l_i_new = alpha * l_i + l_ij
-        # -- update output accumulator --
-        # scale p
-        # scale acc
-        acc_scale = alpha
-        # acc_scale = l_i / l_i_new * alpha
-        acc = acc * acc_scale[:, None]
-        # update acc
-        v = tl.load(v_ptrs +
-                    (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                    mask=(start_n + offs_n[:, None])
-                    < cur_batch_seq_len - cur_batch_ctx_len,
-                    other=0.0)
-
-        p = p.to(v.dtype)
-        acc += tl.dot(p, v)
-        # update m_i and l_i
-        l_i = l_i_new
-        m_i = m_i_new
-
-    # acc /= l_i[:, None]
-    # initialize pointers to output
-    off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-             cur_head * stride_oh + offs_d[None, :] * stride_od)
-    out_ptrs = Out + off_o
-    tl.store(out_ptrs,
-             acc,
-             mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+    tl.store(
+        out_ptrs, acc, mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len)
+    )
     return
 
 
@@ -545,8 +396,7 @@ def _fwd_kernel_alibi(
     cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
     cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
     cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1)
-    cur_batch_query_len = (cur_batch_in_all_stop_index -
-                           cur_batch_in_all_start_index)
+    cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index
     cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len
 
     if SKIP_DECODE and cur_batch_query_len == 1:
@@ -558,16 +408,22 @@ def _fwd_kernel_alibi(
     offs_n = tl.arange(0, BLOCK_N)
     offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
-             cur_head * stride_qh + offs_d[None, :] * stride_qd)
-
-    dim_mask = tl.where(
-        tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1)
-
-    q = tl.load(Q + off_q,
-                mask=dim_mask[None, :] &
-                (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len),
-                other=0.0)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :] * stride_qd
+    )
+
+    dim_mask = tl.where(tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(
+        tl.int1
+    )
+
+    q = tl.load(
+        Q + off_q,
+        mask=dim_mask[None, :]
+        & (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len),
+        other=0.0,
+    )
 
     # # initialize pointer to m and l
     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -580,23 +436,31 @@ def _fwd_kernel_alibi(
     for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
-        bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                     ((start_n + offs_n) // block_size) * stride_b_loc_s,
-                     mask=(start_n + offs_n) < cur_batch_ctx_len,
-                     other=0).to(tl.int64)
+        bn = tl.load(
+            B_Loc
+            + cur_batch * stride_b_loc_b
+            + ((start_n + offs_n) // block_size) * stride_b_loc_s,
+            mask=(start_n + offs_n) < cur_batch_ctx_len,
+            other=0,
+        ).to(tl.int64)
         off_k = (
-            bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
-            (offs_d[:, None] // x) * stride_k_cache_d +
-            ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl +
-            (offs_d[:, None] % x) * stride_k_cache_x)
-        off_v = (bn[:, None] * stride_v_cache_bs +
-                 cur_kv_head * stride_v_cache_h +
-                 offs_d[None, :] * stride_v_cache_d +
-                 (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
-        k_load = tl.load(K_cache + off_k,
-                         mask=dim_mask[:, None] &
-                         ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
-                         other=0.0)  # [D,N]
+            bn[None, :] * stride_k_cache_bs
+            + cur_kv_head * stride_k_cache_h
+            + (offs_d[:, None] // x) * stride_k_cache_d
+            + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl
+            + (offs_d[:, None] % x) * stride_k_cache_x
+        )
+        off_v = (
+            bn[:, None] * stride_v_cache_bs
+            + cur_kv_head * stride_v_cache_h
+            + offs_d[None, :] * stride_v_cache_d
+            + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl
+        )
+        k_load = tl.load(
+            K_cache + off_k,
+            mask=dim_mask[:, None] & ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
+            other=0.0,
+        )  # [D,N]
 
         if k_load.dtype.is_fp8():
             k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype)
@@ -605,16 +469,20 @@ def _fwd_kernel_alibi(
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
-        qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
-                      float("-inf"))
+        qk = tl.where(
+            (start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")
+        )
         qk *= sm_scale
 
         # load alibi
-        alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
-                 alibi_start_q[:, None]) * alibi_slope
+        alibi = (
+            tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - alibi_start_q[:, None]
+        ) * alibi_slope
         alibi = tl.where(
-            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), alibi,
-            float("-inf"))
+            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+            alibi,
+            float("-inf"),
+        )
         qk += alibi
         alibi_start_k += BLOCK_N
 
@@ -634,30 +502,36 @@ def _fwd_kernel_alibi(
         # acc_scale = l_i / l_i_new * alpha
         acc = acc * acc_scale[:, None]
         # update acc
-        v_load = tl.load(V_cache + off_v,
-                         mask=dim_mask[None, :] &
-                         ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
-                         other=0.0)
+        v_load = tl.load(
+            V_cache + off_v,
+            mask=dim_mask[None, :] & ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
+            other=0.0,
+        )
         if v_load.dtype.is_fp8():
             v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype)
         else:
             v = v_load
         p = p.to(v.dtype)
 
-        acc = tl.dot(p, v, acc=acc, input_precision='ieee')
+        acc = tl.dot(p, v, acc=acc, input_precision="ieee")
         # update m_i and l_i
         l_i = l_i_new
         m_i = m_i_new
 
-    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
-             offs_d[:, None] * stride_kd)
-    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
-             offs_d[None, :] * stride_vd)
+    off_k = (
+        offs_n[None, :] * stride_kbs
+        + cur_kv_head * stride_kh
+        + offs_d[:, None] * stride_kd
+    )
+    off_v = (
+        offs_n[:, None] * stride_vbs
+        + cur_kv_head * stride_vh
+        + offs_d[None, :] * stride_vd
+    )
     k_ptrs = K + off_k
     v_ptrs = V + off_v
 
-    block_mask = tl.where(
-        block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+    block_mask = tl.where(block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
 
     # init alibi
     alibi_slope = tl.load(Alibi_slopes + cur_head)
@@ -672,22 +546,25 @@ def _fwd_kernel_alibi(
         # -- compute qk ----
         k = tl.load(
             k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
-            mask=dim_mask[:, None] & ((start_n + offs_n[None, :])
-                                      < cur_batch_seq_len - cur_batch_ctx_len),
-            other=0.0)
+            mask=dim_mask[:, None]
+            & ((start_n + offs_n[None, :]) < cur_batch_seq_len - cur_batch_ctx_len),
+            other=0.0,
+        )
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk = tl.dot(q, k, acc=qk, input_precision='ieee')
+        qk = tl.dot(q, k, acc=qk, input_precision="ieee")
         qk *= sm_scale
-        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
-                      float("-inf"))
+        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf"))
 
         # load alibi
-        alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
-                 alibi_start_q[:, None]) * alibi_slope
+        alibi = (
+            tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - alibi_start_q[:, None]
+        ) * alibi_slope
         alibi = tl.where(
-            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), alibi,
-            float("-inf"))
+            (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+            alibi,
+            float("-inf"),
+        )
         qk += alibi
         alibi_start_k += BLOCK_N
 
@@ -709,12 +586,13 @@ def _fwd_kernel_alibi(
         # update acc
         v = tl.load(
             v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
-            mask=dim_mask[None, :] & ((start_n + offs_n[:, None])
-                                      < cur_batch_seq_len - cur_batch_ctx_len),
-            other=0.0)
+            mask=dim_mask[None, :]
+            & ((start_n + offs_n[:, None]) < cur_batch_seq_len - cur_batch_ctx_len),
+            other=0.0,
+        )
         p = p.to(v.dtype)
 
-        acc = tl.dot(p, v, acc=acc, input_precision='ieee')
+        acc = tl.dot(p, v, acc=acc, input_precision="ieee")
         # update m_i and l_i
         l_i = l_i_new
         m_i = m_i_new
@@ -722,45 +600,51 @@ def _fwd_kernel_alibi(
     acc = acc / l_i[:, None]
 
     # initialize pointers to output
-    off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
-             cur_head * stride_oh + offs_d[None, :] * stride_od)
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
+        + cur_head * stride_oh
+        + offs_d[None, :] * stride_od
+    )
     out_ptrs = Out + off_o
-    tl.store(out_ptrs,
-             acc,
-             mask=dim_mask[None, :] &
-             (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len))
+    tl.store(
+        out_ptrs,
+        acc,
+        mask=dim_mask[None, :]
+        & (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len),
+    )
     return
 
 
 @torch.inference_mode()
-def context_attention_fwd(q,
-                          k,
-                          v,
-                          o,
-                          kv_cache_dtype: str,
-                          k_cache,
-                          v_cache,
-                          b_loc,
-                          b_start_loc,
-                          b_seq_len,
-                          max_seq_len,
-                          max_input_len,
-                          k_scale: torch.Tensor,
-                          v_scale: torch.Tensor,
-                          alibi_slopes=None,
-                          sliding_window=None,
-                          sm_scale=None,
-                          skip_decode=False,
-                          fp8_out_scale=None,
-                          sinks=None):
-
+def context_attention_fwd(
+    q,
+    k,
+    v,
+    o,
+    kv_cache_dtype: str,
+    k_cache,
+    v_cache,
+    b_loc,
+    b_start_loc,
+    b_seq_len,
+    max_seq_len,
+    max_input_len,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    alibi_slopes=None,
+    sliding_window=None,
+    sm_scale=None,
+    skip_decode=False,
+    fp8_out_scale=None,
+    sinks=None,
+):
     q_dtype_is_f32 = q.dtype is torch.float32
 
     # Turing does have tensor core for float32 multiplication
     # use ieee as fallback for triton kernels work. There is also
     # warning on vllm/config.py to inform users this fallback
     # implementation
-    IN_PRECISION = 'ieee' if IS_TURING and q_dtype_is_f32 else None
+    IN_PRECISION = "ieee" if IS_TURING and q_dtype_is_f32 else None
 
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
@@ -778,10 +662,15 @@ def context_attention_fwd(q,
         k_cache = k_cache.view(target_dtype)
         v_cache = v_cache.view(target_dtype)
 
-    if (k_cache.dtype == torch.uint8
-            or v_cache.dtype == torch.uint8 and kv_cache_dtype == "auto"):
-        raise ValueError("kv_cache_dtype='auto' unsupported for\
-            FP8 KV Cache prefill kernel")
+    if (
+        k_cache.dtype == torch.uint8
+        or v_cache.dtype == torch.uint8
+        and kv_cache_dtype == "auto"
+    ):
+        raise ValueError(
+            "kv_cache_dtype='auto' unsupported for\
+            FP8 KV Cache prefill kernel"
+        )
 
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
@@ -843,13 +732,11 @@ def context_attention_fwd(q,
             k_cache.stride(1),
             k_cache.stride(2),
             k_cache.stride(3),
-            k_cache.stride(
-                4),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+            k_cache.stride(4),  # [num_blocks, num_kv_heads, head_size/x, block_size, x]
             v_cache.stride(0),
             v_cache.stride(1),
             v_cache.stride(2),
-            v_cache.stride(
-                3),  #[num_blocks, num_kv_heads, head_size, block_size]
+            v_cache.stride(3),  # [num_blocks, num_kv_heads, head_size, block_size]
             num_queries_per_kv=num_queries_per_kv,
             IN_PRECISION=IN_PRECISION,
             BLOCK_M=BLOCK,
@@ -867,8 +754,7 @@ def context_attention_fwd(q,
     if current_platform.is_rocm():
         extra_kargs = {"kpack": 1, "waves_per_eu": 2}
 
-    grid = lambda META: (batch, head,
-                         triton.cdiv(max_input_len, META["BLOCK_M"]))
+    grid = lambda META: (batch, head, triton.cdiv(max_input_len, META["BLOCK_M"]))
     _fwd_kernel[grid](
         q,
         k,
@@ -903,12 +789,11 @@ def context_attention_fwd(q,
         k_cache.stride(1),
         k_cache.stride(2),
         k_cache.stride(3),
-        k_cache.stride(
-            4),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+        k_cache.stride(4),  # [num_blocks, num_kv_heads, head_size/x, block_size, x]
         v_cache.stride(0),
         v_cache.stride(1),
         v_cache.stride(2),
-        v_cache.stride(3),  #[num_blocks, num_kv_heads, head_size, block_size]
+        v_cache.stride(3),  # [num_blocks, num_kv_heads, head_size, block_size]
         BLOCK_SIZE=v_cache.shape[3],
         num_queries_per_kv=num_queries_per_kv,
         IN_PRECISION=IN_PRECISION,
@@ -924,5 +809,6 @@ def context_attention_fwd(q,
         num_warps=4,
         num_stages=1,
         USE_SINKS=sinks is not None,
-        **extra_kargs)
+        **extra_kargs,
+    )
     return
diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py
deleted file mode 100644
index d91cda255ff31aa31adbe49d6eb7eee5de219db8..0000000000000000000000000000000000000000
--- a/vllm/attention/ops/rocm_aiter_mla.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Optional
-
-import torch
-
-from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
-
-
-def get_aiter_mla_metadata(max_batch_size: int, block_size: int,
-                           max_block_per_batch: int,
-                           device: torch.device) -> tuple[torch.Tensor, ...]:
-    paged_kv_indices = torch.zeros(max_batch_size * max_block_per_batch,
-                                   dtype=torch.int32,
-                                   device=device)
-    paged_kv_indptr = torch.zeros(max_batch_size + 1,
-                                  dtype=torch.int32,
-                                  device=device)
-    paged_kv_last_page_lens = torch.full((max_batch_size, ),
-                                         block_size,
-                                         dtype=torch.int32)
-    qo_indptr = torch.zeros(max_batch_size + 1, dtype=torch.int, device=device)
-    return paged_kv_indices, paged_kv_indptr, paged_kv_last_page_lens, qo_indptr
-
-
-def aiter_mla_decode_fwd(
-    q: torch.Tensor,
-    kv_buffer: torch.Tensor,
-    o: torch.Tensor,
-    sm_scale: float,
-    qo_indptr: torch.Tensor,
-    max_seqlen_qo: int,
-    kv_indptr: Optional[torch.Tensor] = None,
-    kv_indices: Optional[torch.Tensor] = None,
-    kv_last_page_lens: Optional[torch.Tensor] = None,
-    logit_cap: float = 0.0,
-):
-
-    torch.ops.vllm.rocm_aiter_mla_decode_fwd(q,
-                                             kv_buffer.view(
-                                                 -1, 1, 1, q.shape[-1]),
-                                             o,
-                                             qo_indptr,
-                                             max_seqlen_qo,
-                                             kv_indptr,
-                                             kv_indices,
-                                             kv_last_page_lens,
-                                             sm_scale=sm_scale,
-                                             logit_cap=logit_cap)
-
-
-def mla_decode_fwd_impl(
-    q: torch.Tensor,
-    kv_buffer: torch.Tensor,
-    o: torch.Tensor,
-    qo_indptr: torch.Tensor,
-    max_seqlen_qo: int,
-    kv_indptr: Optional[torch.Tensor] = None,
-    kv_indices: Optional[torch.Tensor] = None,
-    kv_last_page_lens: Optional[torch.Tensor] = None,
-    sm_scale: float = 1.0,
-    logit_cap: float = 0.0,
-) -> None:
-    from aiter.mla import mla_decode_fwd
-
-    mla_decode_fwd(q,
-                   kv_buffer.view(-1, 1, 1, q.shape[-1]),
-                   o,
-                   qo_indptr,
-                   kv_indptr,
-                   kv_indices,
-                   kv_last_page_lens,
-                   max_seqlen_qo,
-                   sm_scale=sm_scale,
-                   logit_cap=logit_cap)
-
-
-def mla_decode_fwd_fake(
-    q: torch.Tensor,
-    kv_buffer: torch.Tensor,
-    o: torch.Tensor,
-    qo_indptr: torch.Tensor,
-    max_seqlen_qo: int,
-    kv_indptr: Optional[torch.Tensor] = None,
-    kv_indices: Optional[torch.Tensor] = None,
-    kv_last_page_lens: Optional[torch.Tensor] = None,
-    sm_scale: float = 1.0,
-    logit_cap: float = 0.0,
-) -> None:
-    pass
-
-
-if current_platform.is_rocm():
-    if is_torch_equal_or_newer("2.7.0"):
-        tags = ()
-    else:
-        tags = (torch.Tag.needs_fixed_stride_order, ),
-    direct_register_custom_op(op_name="rocm_aiter_mla_decode_fwd",
-                              op_func=mla_decode_fwd_impl,
-                              mutates_args=["o"],
-                              fake_impl=mla_decode_fwd_fake,
-                              tags=tags)
diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py
index 2a0336de8cf72138da8669cfe4c6de5999cd4bc9..bcd1e2cd5644167de0e7d3e753808fece620f24c 100644
--- a/vllm/attention/ops/rocm_aiter_paged_attn.py
+++ b/vllm/attention/ops/rocm_aiter_paged_attn.py
@@ -1,19 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import aiter as rocm_aiter
 import torch
 
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
 class AITERPagedAttention(PagedAttention):
-
     @staticmethod
     def write_to_paged_cache(
         key: torch.Tensor,
@@ -26,19 +24,31 @@ class AITERPagedAttention(PagedAttention):
         v_scale: torch.Tensor,
     ) -> None:
         if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]:
-            PagedAttention.write_to_paged_cache(key, value, key_cache,
-                                                value_cache, slot_mapping,
-                                                kv_cache_dtype, k_scale,
-                                                v_scale)
+            PagedAttention.write_to_paged_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
         else:
-            kv_cache_torch_dtype = (FP8_DTYPE
-                                    if "fp8" in kv_cache_dtype else torch.int8)
+            kv_cache_torch_dtype = FP8_DTYPE if "fp8" in kv_cache_dtype else torch.int8
             key_cache = key_cache.view(kv_cache_torch_dtype)
             value_cache = value_cache.view(kv_cache_torch_dtype)
 
             rocm_aiter.reshape_and_cache_with_pertoken_quant(
-                key, value, key_cache, value_cache, k_scale, v_scale,
-                slot_mapping.flatten(), True)
+                key,
+                value,
+                key_cache,
+                value_cache,
+                k_scale,
+                v_scale,
+                slot_mapping.flatten(),
+                True,
+            )
 
     @staticmethod
     def forward_decode(
@@ -51,7 +61,7 @@ class AITERPagedAttention(PagedAttention):
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         tp_rank: int = 0,
@@ -78,7 +88,8 @@ class AITERPagedAttention(PagedAttention):
                 blocksparse_local_blocks=blocksparse_local_blocks,
                 blocksparse_vert_stride=blocksparse_vert_stride,
                 blocksparse_block_size=blocksparse_block_size,
-                blocksparse_head_sliding_step=blocksparse_head_sliding_step)
+                blocksparse_head_sliding_step=blocksparse_head_sliding_step,
+            )
 
         if "fp8" in kv_cache_dtype:
             key_cache = key_cache.view(current_platform.fp8_dtype())
@@ -87,16 +98,26 @@ class AITERPagedAttention(PagedAttention):
         if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1:
             # use blocksparse paged attention
             block_size = value_cache.size(-1)
-            assert (blocksparse_block_size > 0 and
-                    blocksparse_block_size % block_size == 0), \
-                (f"{blocksparse_block_size=} needs to be a multiple of"
-                 f"{block_size=} used in block_tables.")
+            assert (
+                blocksparse_block_size > 0 and blocksparse_block_size % block_size == 0
+            ), (
+                f"{blocksparse_block_size=} needs to be a multiple of"
+                f"{block_size=} used in block_tables."
+            )
 
         output = torch.empty_like(query)
         block_size = value_cache.shape[3]
         max_num_blocks_per_seq = cdiv(max_seq_len, block_size)
 
-        rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables,
-                              seq_lens, max_num_blocks_per_seq, k_scale,
-                              v_scale, output)
+        rocm_aiter.pa_fwd_asm(
+            query,
+            key_cache,
+            value_cache,
+            block_tables,
+            seq_lens,
+            max_num_blocks_per_seq,
+            k_scale,
+            v_scale,
+            output,
+        )
         return output
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
index 7f5a678615cf0330cc77e850f776b9b96e760de3..aebc2e63cff6946d959f94c2dc7b58f0cd9beef3 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -42,10 +42,11 @@ logger = logging.getLogger(__name__)
 
 # Only print the following warnings when triton version < 3.2.0.
 # The issue won't affect performance or accuracy.
-if version.parse(triton.__version__) < version.parse('3.2.0'):
+if version.parse(triton.__version__) < version.parse("3.2.0"):
     logger.warning(
         "The following error message 'operation scheduled before its operands' "
-        "can be ignored.")
+        "can be ignored."
+    )
 
 
 @triton.jit
@@ -101,8 +102,7 @@ def _fwd_kernel_stage1(
 
     kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
     split_kv_start = kv_len_per_split * split_kv_id
-    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
-                              cur_batch_seq_len)
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
 
     e_max = -float("inf")
     e_sum = 0.0
@@ -112,14 +112,18 @@ def _fwd_kernel_stage1(
         for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
             offs_n = start_n + tl.arange(0, BLOCK_N)
             kv_page_number = tl.load(
-                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
-                offs_n // PAGE_SIZE,
+                Req_to_tokens
+                + stride_req_to_tokens_b * cur_batch_req_idx
+                + offs_n // PAGE_SIZE,
                 mask=offs_n < split_kv_end,
                 other=0,
             )
             kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
-            offs_buf_k = (kv_loc[:, None] * stride_buf_kbs +
-                          cur_kv_head * stride_buf_kh + offs_d[None, :])
+            offs_buf_k = (
+                kv_loc[:, None] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[None, :]
+            )
             k = tl.load(
                 K_Buffer + offs_buf_k,
                 mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]),
@@ -133,8 +137,11 @@ def _fwd_kernel_stage1(
 
             qk = tl.where(offs_n < split_kv_end, qk, float("-inf"))
 
-            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
-                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
+            offs_buf_v = (
+                kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
             v = tl.load(
                 V_Buffer + offs_buf_v,
                 mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
@@ -150,8 +157,12 @@ def _fwd_kernel_stage1(
             e_sum = e_sum * re_scale + tl.sum(p, 0)
             e_max = n_e_max
 
-        offs_mid_o = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
-                      split_kv_id * stride_mid_os + offs_dv)
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_dv
+        )
 
         tl.store(
             Att_Out + offs_mid_o,
@@ -159,8 +170,12 @@ def _fwd_kernel_stage1(
             mask=(mask_dv),
         )
 
-        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
-                        split_kv_id * stride_mid_os + Lv)
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + Lv
+        )
 
         tl.store(
             Att_Out + offs_mid_o_1,
@@ -282,25 +297,22 @@ def _fwd_grouped_kernel_stage1(
     cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
     cur_batch_req_idx = cur_batch
 
-    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[
-        None, :]
-    q = tl.load(Q + offs_q,
-                mask=(mask_h[:, None]) & (mask_d[None, :]),
-                other=0.0)
+    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
+    q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_d[None, :]), other=0.0)
 
     if BLOCK_DPE > 0:
         offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
         mask_dpe = offs_dpe < Lk
-        off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh +
-                   offs_dpe[None, :])
-        qpe = tl.load(Q + off_qpe,
-                      mask=(mask_h[:, None]) & (mask_dpe[None, :]),
-                      other=0.0)
+        off_qpe = (
+            cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :]
+        )
+        qpe = tl.load(
+            Q + off_qpe, mask=(mask_h[:, None]) & (mask_dpe[None, :]), other=0.0
+        )
 
     kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
     split_kv_start = kv_len_per_split * split_kv_id
-    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
-                              cur_batch_seq_len)
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
 
     e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
     e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
@@ -310,14 +322,18 @@ def _fwd_grouped_kernel_stage1(
         for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
             offs_n = start_n + tl.arange(0, BLOCK_N)
             kv_page_number = tl.load(
-                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
-                offs_n // PAGE_SIZE,
+                Req_to_tokens
+                + stride_req_to_tokens_b * cur_batch_req_idx
+                + offs_n // PAGE_SIZE,
                 mask=offs_n < split_kv_end,
                 other=0,
             )
             kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
-            offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
-                          cur_kv_head * stride_buf_kh + offs_d[:, None])
+            offs_buf_k = (
+                kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[:, None]
+            )
             k = tl.load(
                 K_Buffer + offs_buf_k,
                 mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
@@ -325,13 +341,14 @@ def _fwd_grouped_kernel_stage1(
             )
             qk = tl.dot(q, k.to(q.dtype))
             if BLOCK_DPE > 0:
-                offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs +
-                                cur_kv_head * stride_buf_kh +
-                                offs_dpe[:, None])
+                offs_buf_kpe = (
+                    kv_loc[None, :] * stride_buf_kbs
+                    + cur_kv_head * stride_buf_kh
+                    + offs_dpe[:, None]
+                )
                 kpe = tl.load(
                     K_Buffer + offs_buf_kpe,
-                    mask=(offs_n[None, :] < split_kv_end) &
-                    (mask_dpe[:, None]),
+                    mask=(offs_n[None, :] < split_kv_end) & (mask_dpe[:, None]),
                     other=0.0,
                 )
                 qk += tl.dot(qpe, kpe.to(qpe.dtype))
@@ -340,11 +357,15 @@ def _fwd_grouped_kernel_stage1(
             if logit_cap > 0:
                 qk = logit_cap * tanh(qk / logit_cap)
 
-            qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end),
-                          qk, float("-inf"))
+            qk = tl.where(
+                mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
+            )
 
-            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
-                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
+            offs_buf_v = (
+                kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
             v = tl.load(
                 V_Buffer + offs_buf_v,
                 mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
@@ -360,9 +381,12 @@ def _fwd_grouped_kernel_stage1(
             e_sum = e_sum * re_scale + tl.sum(p, 1)
             e_max = n_e_max
 
-        offs_mid_o = (cur_batch * stride_mid_ob +
-                      cur_head[:, None] * stride_mid_oh +
-                      split_kv_id * stride_mid_os + offs_dv[None, :])
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head[:, None] * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_dv[None, :]
+        )
 
         tl.store(
             Att_Out + offs_mid_o,
@@ -370,8 +394,12 @@ def _fwd_grouped_kernel_stage1(
             mask=(mask_h[:, None]) & (mask_dv[None, :]),
         )
 
-        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
-                        split_kv_id * stride_mid_os + Lv)
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + Lv
+        )
 
         tl.store(
             Att_Out + offs_mid_o_1,
@@ -427,11 +455,7 @@ def _decode_grouped_att_m_fwd(
     if is_hip_:
         # https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/workload.html#mi300x-triton-kernel-performance-optimization
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
-        extra_kargs = {
-            "waves_per_eu": 1,
-            "matrix_instr_nonkdim": 16,
-            "kpack": 2
-        }
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
         num_stages = 1
 
     _fwd_grouped_kernel_stage1[grid](
@@ -504,13 +528,12 @@ def _fwd_kernel_stage2(
     for split_kv_id in range(0, NUM_KV_SPLITS):
         kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
         split_kv_start = kv_len_per_split * split_kv_id
-        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
-                                  cur_batch_seq_len)
+        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
 
         if split_kv_end > split_kv_start:
-            tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os,
-                         mask=mask_d,
-                         other=0.0)
+            tv = tl.load(
+                Mid_O + offs_v + split_kv_id * stride_mid_os, mask=mask_d, other=0.0
+            )
             tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os)
             n_e_max = tl.maximum(tlogic, e_max)
 
@@ -553,11 +576,7 @@ def _decode_softmax_reducev_fwd(
     if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
-        extra_kargs = {
-            "waves_per_eu": 4,
-            "matrix_instr_nonkdim": 16,
-            "kpack": 2
-        }
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
 
     grid = (batch, head_num)
     _fwd_kernel_stage2[grid](
@@ -606,8 +625,9 @@ def decode_attention_fwd_normal(
         page_size,
         logit_cap,
     )
-    _decode_softmax_reducev_fwd(attn_logits, q, o, lse, v_buffer, b_seq_len,
-                                num_kv_splits)
+    _decode_softmax_reducev_fwd(
+        attn_logits, q, o, lse, v_buffer, b_seq_len, num_kv_splits
+    )
 
 
 def decode_attention_fwd_grouped(
@@ -636,8 +656,9 @@ def decode_attention_fwd_grouped(
         page_size,
         logit_cap,
     )
-    _decode_softmax_reducev_fwd(attn_logits, q, o, lse, v_buffer, b_seq_len,
-                                num_kv_splits)
+    _decode_softmax_reducev_fwd(
+        attn_logits, q, o, lse, v_buffer, b_seq_len, num_kv_splits
+    )
 
 
 def decode_attention_fwd(
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
deleted file mode 100644
index 49070e4c7ae6a9d45ec405522795044132a3f276..0000000000000000000000000000000000000000
--- a/vllm/attention/ops/triton_flash_attention.py
+++ /dev/null
@@ -1,984 +0,0 @@
-#!/usr/bin/env python
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Fused Attention
-===============
-
-This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
-(https://tridao.me/publications/flash2/flash2.pdf)
-Credits: OpenAI kernel team, AMD ML Frameworks Triton team
-
-Features supported:
-
-1) Fwd with causal masking
-2) Any sequence lengths without padding (currently fwd kernel only)
-3) Support for different sequence lengths for q and k
-4) Nested tensor API currently does not support dropout or bias.
-
-Not currently supported:
-
-1) Non power of two head dims
-
-"""
-
-import torch
-
-from vllm.platforms import current_platform
-from vllm.triton_utils import tl, triton
-
-# Avoid misleading ROCm warning.
-if current_platform.is_rocm():
-    from vllm.platforms.rocm import on_gfx1x
-else:
-    on_gfx1x = lambda *args, **kwargs: False
-
-torch_dtype: tl.constexpr = torch.float16
-
-
-@triton.jit
-def cdiv_fn(x, y):
-    return (x + y - 1) // y
-
-
-@triton.jit
-def max_fn(x, y):
-    return tl.math.max(x, y)
-
-
-@triton.jit
-def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
-    ms = tl.arange(0, m)
-    ns = tl.arange(0, n)
-    return philox_offset + ms[:, None] * stride + ns[None, :]
-
-
-@triton.jit
-def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n,
-                                  stride).to(tl.uint32)
-    # TODO: use tl.randint for better performance
-    return tl.rand(philox_seed, rng_offsets)
-
-
-@triton.jit
-def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n,
-                             stride)
-    rng_keep = rng_output > dropout_p
-    return rng_keep
-
-
-@triton.jit
-def load_fn(block_ptr, first, second, pad):
-    if first and second:
-        tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad)
-    elif first:
-        tensor = tl.load(block_ptr, boundary_check=(0, ), padding_option=pad)
-    elif second:
-        tensor = tl.load(block_ptr, boundary_check=(1, ), padding_option=pad)
-    else:
-        tensor = tl.load(block_ptr)
-    return tensor
-
-
-@triton.jit
-def _attn_fwd_inner(
-    acc,
-    l_i,
-    m_i,
-    q,
-    K_block_ptr,
-    V_block_ptr,
-    start_m,
-    actual_seqlen_k,
-    dropout_p,
-    philox_seed,
-    batch_philox_offset,
-    encoded_softmax_block_ptr,
-    block_min,
-    block_max,
-    offs_n_causal,
-    masked_blocks,
-    n_extra_tokens,
-    bias_ptr,
-    IS_CAUSAL: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    OFFS_M: tl.constexpr,
-    OFFS_N: tl.constexpr,
-    PRE_LOAD_V: tl.constexpr,
-    MASK_STEPS: tl.constexpr,
-    ENABLE_DROPOUT: tl.constexpr,
-    RETURN_ENCODED_SOFTMAX: tl.constexpr,
-    PADDED_HEAD: tl.constexpr,
-    USE_FP8: tl.constexpr,
-    qk_scale,
-    p_descale,
-):
-    # loop over k, v, and update accumulator
-    for start_n in range(block_min, block_max, BLOCK_N):
-        # For padded blocks, we will overrun the tensor size if
-        # we load all BLOCK_N. For others, the blocks are all within range.
-        k = load_fn(
-            K_block_ptr,
-            PADDED_HEAD,
-            MASK_STEPS and (n_extra_tokens != 0),
-            "zero",
-        )
-        if PRE_LOAD_V:
-            v = load_fn(
-                V_block_ptr,
-                MASK_STEPS and (n_extra_tokens != 0),
-                PADDED_HEAD,
-                "zero",
-            )
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        # We start from end of seqlen_k so only the first iteration would need
-        # to be checked for padding if it is not a multiple of block_n
-        # TODO: This can be optimized to only be true for the padded block.
-        if MASK_STEPS:  # noqa: SIM102
-            # If this is the last block / iteration, we want to
-            # mask if the sequence length is not a multiple of block size
-            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
-            # if not is_modulo_mn. last step might get wasted but that is okay.
-            # check if this masking works for that case.
-            if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
-                boundary_m = tl.full([BLOCK_M],
-                                     actual_seqlen_k,
-                                     dtype=tl.int32)
-                size_n = start_n + OFFS_N[None, :]
-                mask = size_n < boundary_m[:, None]
-                qk = tl.where(mask, qk, float("-inf"))
-        if IS_CAUSAL:
-            causal_boundary = start_n + offs_n_causal
-            causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
-            qk = tl.where(causal_mask, qk, float("-inf"))
-        # -- compute qk ----
-        qk += tl.dot(q, k)
-        if USE_FP8:
-            qk *= qk_scale
-        if bias_ptr is not None:
-            bias = load_fn(bias_ptr, False, MASK_STEPS
-                           and (n_extra_tokens != 0), "zero")
-            # While bias is added after multiplying qk with sm_scale, our
-            # optimization to use 2^x instead of e^x results in an additional
-            # scale factor of log2(e) which we must also multiply the bias with.
-            qk += bias * 1.44269504089
-        m_ij = tl.maximum(m_i, tl.max(qk, 1))
-        qk = qk - m_ij[:, None]
-        p = tl.math.exp2(qk)
-
-        # CAVEAT: Must update l_ij before applying dropout
-        l_ij = tl.sum(p, 1)
-        if ENABLE_DROPOUT:
-            philox_offset = (batch_philox_offset +
-                             start_m * BLOCK_M * actual_seqlen_k + start_n -
-                             BLOCK_N)
-            keep = dropout_mask(
-                philox_seed,
-                philox_offset,
-                dropout_p,
-                BLOCK_M,
-                BLOCK_N,
-                actual_seqlen_k,
-            )
-            if RETURN_ENCODED_SOFTMAX:
-                tl.store(
-                    encoded_softmax_block_ptr,
-                    tl.where(keep, p,
-                             -p).to(encoded_softmax_block_ptr.type.element_ty),
-                )
-            p = tl.where(keep, p, 0.0)
-        elif RETURN_ENCODED_SOFTMAX:
-            tl.store(
-                encoded_softmax_block_ptr,
-                p.to(encoded_softmax_block_ptr.type.element_ty),
-            )
-        # -- update output accumulator --
-        alpha = tl.math.exp2(m_i - m_ij)
-        acc = acc * alpha[:, None]
-        if not PRE_LOAD_V:
-            v = load_fn(
-                V_block_ptr,
-                MASK_STEPS and (n_extra_tokens != 0),
-                PADDED_HEAD,
-                "zero",
-            )
-        # -- update m_i and l_i
-        l_i = l_i * alpha + l_ij
-        # update m_i and l_i
-        m_i = m_ij
-
-        if USE_FP8:
-            p *= p_descale
-
-        acc += tl.dot(p.to(V_block_ptr.type.element_ty), v)
-
-        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
-        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
-        if bias_ptr is not None:
-            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
-        if RETURN_ENCODED_SOFTMAX:
-            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
-                                                   (0, BLOCK_N))
-    return acc, l_i, m_i
-
-
-def get_cdna_autotune_configs():
-    return [
-        triton.Config(
-            {
-                'BLOCK_M': 256,
-                'BLOCK_N': 64,
-                'waves_per_eu': 2,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=8),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 128,
-                'waves_per_eu': 2,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 256,
-                'BLOCK_N': 128,
-                'waves_per_eu': 2,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=8),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
-                'waves_per_eu': 1,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
-                'waves_per_eu': 3,
-                'PRE_LOAD_V': True
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
-                'waves_per_eu': 3,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 64,
-                'BLOCK_N': 64,
-                'waves_per_eu': 4,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=8),
-        triton.Config(
-            {
-                'BLOCK_M': 32,
-                'BLOCK_N': 32,
-                'waves_per_eu': 4,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=8),
-        # TODO: This config fails with head_size not pow2 with data mismatches.
-        #    triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1,
-        #                   'PRE_LOAD_V': False}, num_stages=1, num_warps=4),
-
-        # Fails in AccelerateAMDMatmul (Triton) assert when using FP8:
-        # triton.Config(
-        #     {
-        #         "BLOCK_M": 16,
-        #         "BLOCK_N": 16,
-        #         "waves_per_eu": 1,
-        #         "PRE_LOAD_V": False,
-        #     },
-        #     num_stages=1,
-        #     num_warps=4,
-        # ),
-    ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8']
-
-
-def get_rdna_autotune_configs():
-    return [
-        triton.Config(
-            {
-                'BLOCK_M': 32,
-                'BLOCK_N': 32,
-                'waves_per_eu': 4,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=2),
-        triton.Config(
-            {
-                'BLOCK_M': 32,
-                'BLOCK_N': 32,
-                'waves_per_eu': 2,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=2),
-        triton.Config(
-            {
-                'BLOCK_M': 32,
-                'BLOCK_N': 16,
-                'waves_per_eu': 4,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=2),
-        triton.Config(
-            {
-                'BLOCK_M': 32,
-                'BLOCK_N': 16,
-                'waves_per_eu': 2,
-                'PRE_LOAD_V': False
-            },
-            num_stages=1,
-            num_warps=2),
-        # Fails in AccelerateAMDMatmul (Triton) assert when using FP8:
-        # triton.Config(
-        #     {
-        #         'BLOCK_M': 16,
-        #         'BLOCK_N': 16,
-        #         'waves_per_eu': 4,
-        #         'PRE_LOAD_V': False
-        #     },
-        #     num_stages=1,
-        #     num_warps=2),
-        # triton.Config(
-        #     {
-        #         'BLOCK_M': 16,
-        #         'BLOCK_N': 16,
-        #         'waves_per_eu': 2,
-        #         'PRE_LOAD_V': False
-        #     },
-        #     num_stages=1,
-        #     num_warps=2),
-        # # Fall-back config.
-        # triton.Config(
-        #     {
-        #         'BLOCK_M': 16,
-        #         'BLOCK_N': 16,
-        #         'waves_per_eu': 1,
-        #         'PRE_LOAD_V': False
-        #     },
-        #     num_stages=1,
-        #     num_warps=2),
-    ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8']
-
-
-def get_autotune_configs():
-    if on_gfx1x():
-        return get_rdna_autotune_configs()
-    else:
-        return get_cdna_autotune_configs()
-
-
-autotune_configs, autotune_keys = get_autotune_configs()
-
-float8_info = torch.finfo(current_platform.fp8_dtype())
-
-
-@triton.autotune(
-    configs=autotune_configs,
-    key=autotune_keys,
-)
-@triton.jit
-def attn_fwd(
-    Q,
-    K,
-    V,
-    bias,
-    sm_scale,
-    q_scale,
-    k_scale,
-    v_scale,
-    p_scale,
-    p_descale,
-    o_descale,
-    L,
-    Out,
-    stride_qz: tl.int64,
-    stride_qh: tl.int64,
-    stride_qm: tl.int64,
-    stride_qk: tl.int64,
-    stride_kz: tl.int64,
-    stride_kh: tl.int64,
-    stride_kn: tl.int64,
-    stride_kk: tl.int64,
-    stride_vz: tl.int64,
-    stride_vh: tl.int64,
-    stride_vk: tl.int64,
-    stride_vn: tl.int64,
-    stride_oz: tl.int64,
-    stride_oh: tl.int64,
-    stride_om: tl.int64,
-    stride_on: tl.int64,
-    stride_bz: tl.int64,
-    stride_bh: tl.int64,
-    stride_bm: tl.int64,
-    stride_bn: tl.int64,
-    cu_seqlens_q,
-    cu_seqlens_k,
-    dropout_p,
-    philox_seed,
-    philox_offset_base,
-    encoded_softmax,
-    HQ: tl.constexpr,
-    HK: tl.constexpr,
-    ACTUAL_BLOCK_DMODEL: tl.constexpr,
-    MAX_SEQLENS_Q: tl.constexpr,
-    MAX_SEQLENS_K: tl.constexpr,
-    VARLEN: tl.constexpr,
-    IS_CAUSAL: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    USE_FP8: tl.constexpr,
-    USE_FP8_OUT: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    PRE_LOAD_V: tl.constexpr,
-    BIAS_TYPE: tl.constexpr,
-    ENABLE_DROPOUT: tl.constexpr,
-    RETURN_ENCODED_SOFTMAX: tl.constexpr,
-    FP8_MIN: tl.constexpr = float8_info.min,
-    FP8_MAX: tl.constexpr = float8_info.max,
-):
-    start_m = tl.program_id(0)
-    off_h_q = tl.program_id(1)
-    off_z = tl.program_id(2)
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_n = tl.arange(0, BLOCK_N)
-    if VARLEN:
-        cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
-        cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
-        seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start
-        # We have a one-size-fits-all grid in id(0). Some seqlens might be too
-        # small for all start_m so for those we return early.
-        if start_m * BLOCK_M > seqlen_q:
-            return
-        cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
-        cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
-        seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start
-    else:
-        cu_seqlens_q_start = 0
-        cu_seqlens_k_start = 0
-        seqlen_q = MAX_SEQLENS_Q
-        seqlen_k = MAX_SEQLENS_K
-
-    # Now we compute whether we need to exit early due to causal masking.
-    # This is because for seqlen_q > seqlen_k, M rows of the attn scores
-    # are completely masked, resulting in 0s written to the output, and
-    # inf written to LSE. We don't need to do any GEMMs in this case.
-    # This block of code determines what N is, and if this WG is operating
-    # on those M rows.
-    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
-    if IS_CAUSAL:
-        # If seqlen_q == seqlen_k, the attn scores are a square matrix.
-        # If seqlen_q != seqlen_k, attn scores are rectangular which means
-        # the causal mask boundary is bottom right aligned, and ends at either
-        # the top edge (seqlen_q < seqlen_k) or left edge.
-        # This captures the decrease in n_blocks if we have a rectangular attn
-        # matrix
-        n_blocks_seqlen = cdiv_fn(
-            (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
-        # This is what adjusts the block_max for the current WG, only
-        # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
-        n_blocks = min(n_blocks, n_blocks_seqlen)
-        # If we have no blocks after adjusting for seqlen deltas, this WG is
-        # part of the blocks that are all 0. We exit early.
-        if n_blocks <= 0:
-            o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
-                        off_h_q * stride_oh)
-            O_block_ptr = tl.make_block_ptr(
-                base=Out + o_offset,
-                shape=(seqlen_q, BLOCK_DMODEL),
-                strides=(stride_om, stride_on),
-                offsets=(start_m * BLOCK_M, 0),
-                block_shape=(BLOCK_M, BLOCK_DMODEL),
-                order=(1, 0),
-            )
-            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty)
-            # We still need to write 0s to the result
-            # tl.store(O_block_ptr,
-            # acc.to(Out.type.element_ty), boundary_check=(0,1))
-            # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q
-            #          + offs_m
-            # We store inf to LSE, not -inf because in the bwd pass,
-            # we subtract this
-            # from qk which makes it -inf, such that exp(qk - inf) = 0
-            # for these masked blocks.
-            # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32)
-            # tl.store(l_ptrs, l)
-            # TODO: Should dropout and return encoded softmax be handled here?
-            return
-
-    # If MQA / GQA, set the K and V head offsets appropriately.
-    GROUP_SIZE: tl.constexpr = HQ // HK
-    off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q
-
-    n_extra_tokens = 0
-    if seqlen_k < BLOCK_N:
-        n_extra_tokens = BLOCK_N - seqlen_k
-    elif seqlen_k % BLOCK_N:
-        n_extra_tokens = seqlen_k % BLOCK_N
-    padded_head = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL
-
-    # Compute pointers for all the tensors used in this kernel.
-    q_offset = (off_z * stride_qz + off_h_q * stride_qh +
-                cu_seqlens_q_start * stride_qm)
-    Q_block_ptr = tl.make_block_ptr(
-        base=Q + q_offset,
-        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
-        strides=(stride_qm, stride_qk),
-        offsets=(start_m * BLOCK_M, 0),
-        block_shape=(BLOCK_M, BLOCK_DMODEL),
-        order=(1, 0),
-    )
-    k_offset = (off_z * stride_kz + off_h_k * stride_kh +
-                cu_seqlens_k_start * stride_kn)
-    K_block_ptr = tl.make_block_ptr(
-        base=K + k_offset,
-        shape=(ACTUAL_BLOCK_DMODEL, seqlen_k),
-        strides=(stride_kk, stride_kn),
-        offsets=(0, 0),
-        block_shape=(BLOCK_DMODEL, BLOCK_N),
-        order=(0, 1),
-    )
-    v_offset = (off_z * stride_vz + off_h_k * stride_vh +
-                cu_seqlens_k_start * stride_vk)
-    V_block_ptr = tl.make_block_ptr(
-        base=V + v_offset,
-        shape=(seqlen_k, ACTUAL_BLOCK_DMODEL),
-        strides=(stride_vk, stride_vn),
-        offsets=(0, 0),
-        block_shape=(BLOCK_N, BLOCK_DMODEL),
-        order=(1, 0),
-    )
-    if BIAS_TYPE != 0:
-        bias_ptr = tl.make_block_ptr(
-            base=bias + off_h_q * stride_bh,
-            shape=(seqlen_q, seqlen_k),
-            strides=(stride_bm, stride_bn),
-            offsets=(start_m * BLOCK_M, 0),
-            block_shape=(BLOCK_M, BLOCK_N),
-            order=(1, 0),
-        )
-    else:
-        bias_ptr = None
-    if ENABLE_DROPOUT:
-        batch_philox_offset = philox_offset_base \
-                              + (off_z * HQ + off_h_q) \
-                              * seqlen_q * seqlen_k
-    else:
-        batch_philox_offset = 0
-    # We can ask to return the dropout mask without actually doing any dropout.
-    # In this case, we return an invalid pointer so indicate the mask is not i
-    # valid.
-    # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
-    if RETURN_ENCODED_SOFTMAX:
-        encoded_softmax_block_ptr = tl.make_block_ptr(
-            base=encoded_softmax + off_h_q * seqlen_q * seqlen_k,
-            shape=(seqlen_q, seqlen_k),
-            strides=(seqlen_k, 1),
-            offsets=(start_m * BLOCK_M, 0),
-            block_shape=(BLOCK_M, BLOCK_N),
-            order=(1, 0),
-        )
-    else:
-        encoded_softmax_block_ptr = 0
-    # initialize pointer to m and l
-    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
-    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-    # scale sm_scale by log_2(e) and use 2^x in the loop as we do not
-    # have native e^x support in HW.
-    qk_scale = sm_scale * 1.44269504089
-    # Q is loaded once at the beginning and shared by all N blocks.
-    q = load_fn(Q_block_ptr, True, padded_head, "zero")
-    if not USE_FP8:
-        q = (q * qk_scale).to(Q_block_ptr.type.element_ty)
-        acc_scale = 1.0
-    else:
-        qk_scale *= q_scale * k_scale
-        acc_scale = p_scale * v_scale
-
-    # Here we compute how many full and masked blocks we have.
-    padded_block_k = n_extra_tokens != 0
-    is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
-    if IS_CAUSAL:
-        # There are always at least BLOCK_M // BLOCK_N masked blocks.
-        # Additionally there might be one more due to dissimilar seqlens.
-        masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
-    else:
-        # Padding on Q does not need to be masked in the FA loop.
-        masked_blocks = padded_block_k
-    # if IS_CAUSAL, not is_modulo_mn does not always result in an additional
-    # block. In this case we might exceed n_blocks so pick the min.
-    masked_blocks = min(masked_blocks, n_blocks)
-    n_full_blocks = n_blocks - masked_blocks
-    block_min = 0
-    block_max = n_blocks * BLOCK_N
-    # Compute for full blocks. Here we set causal to false regardless of its
-    # value because there is no masking. Similarly we do not need padding.
-    if n_full_blocks > 0:
-        block_max = (n_blocks - masked_blocks) * BLOCK_N
-        acc, l_i, m_i = _attn_fwd_inner(
-            acc,
-            l_i,
-            m_i,
-            q,
-            K_block_ptr,
-            V_block_ptr,
-            start_m,
-            seqlen_k,
-            dropout_p,
-            philox_seed,
-            batch_philox_offset,
-            encoded_softmax_block_ptr,
-            # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
-            block_min,
-            block_max,
-            0,
-            0,
-            0,
-            bias_ptr,
-            # IS_CAUSAL, ....
-            False,
-            BLOCK_M,
-            BLOCK_DMODEL,
-            BLOCK_N,
-            offs_m,
-            offs_n,
-            # _, MASK_STEPS, ...
-            PRE_LOAD_V,
-            False,
-            ENABLE_DROPOUT,
-            RETURN_ENCODED_SOFTMAX,
-            padded_head,
-            USE_FP8,
-            qk_scale,
-            p_descale,
-        )
-        block_min = block_max
-        block_max = n_blocks * BLOCK_N
-
-    tl.debug_barrier()
-    # Remaining blocks, if any, are full / not masked.
-    if masked_blocks > 0:
-        offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0
-        K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
-        V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))
-        if bias_ptr is not None:
-            bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N))
-        if RETURN_ENCODED_SOFTMAX:
-            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
-                                                   (0, n_full_blocks))
-        acc, l_i, m_i = _attn_fwd_inner(
-            acc,
-            l_i,
-            m_i,
-            q,
-            K_block_ptr,
-            V_block_ptr,
-            start_m,
-            seqlen_k,
-            dropout_p,
-            philox_seed,
-            batch_philox_offset,
-            encoded_softmax_block_ptr,
-            block_min,
-            block_max,
-            offs_n_causal,
-            masked_blocks,
-            n_extra_tokens,
-            bias_ptr,
-            IS_CAUSAL,
-            BLOCK_M,
-            BLOCK_DMODEL,
-            BLOCK_N,
-            offs_m,
-            offs_n,
-            # _, MASK_STEPS, ...
-            PRE_LOAD_V,
-            True,
-            ENABLE_DROPOUT,
-            RETURN_ENCODED_SOFTMAX,
-            padded_head,
-            USE_FP8,
-            qk_scale,
-            p_descale,
-        )
-    # epilogue
-
-    if USE_FP8:
-        acc *= acc_scale
-    acc = acc / l_i[:, None]
-    if ENABLE_DROPOUT:
-        acc = acc / (1 - dropout_p)
-    # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
-    # then we have one block with a row of all NaNs which come from computing
-    # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
-    # and store 0s where there are NaNs as these rows should've been zeroed out.
-    end_m_idx = (start_m + 1) * BLOCK_M
-    start_m_idx = start_m * BLOCK_M
-    causal_start_idx = seqlen_q - seqlen_k
-    if USE_FP8_OUT:
-        acc *= o_descale
-        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
-    acc = acc.to(Out.type.element_ty)
-    if IS_CAUSAL:  # noqa: SIM102
-        if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx:
-            out_mask_boundary = tl.full((BLOCK_DMODEL, ),
-                                        causal_start_idx,
-                                        dtype=tl.int32)
-            mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
-            out_ptrs_mask = (mask_m_offsets[:, None]
-                             >= out_mask_boundary[None, :])
-            z = tl.zeros((1, ), tl.float32)
-            acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
-    # write back LSE
-    # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m
-    # If seqlen_q not multiple of BLOCK_M, we need to mask out the last
-    # few rows. This is only true for the last M block. For others,
-    # overflow_size will be -ve
-    # overflow_size = end_m_idx - seqlen_q
-    # if overflow_size > 0:
-    #    boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32)
-    #    # This is a > check because mask being 0 blocks the store.
-    #    l_ptrs_mask = boundary > tl.arange(0, BLOCK_M)
-    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
-    # else:
-    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i))
-
-    # write back O
-    o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
-                off_h_q * stride_oh)
-    O_block_ptr = tl.make_block_ptr(
-        base=Out + o_offset,
-        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
-        strides=(stride_om, stride_on),
-        offsets=(start_m * BLOCK_M, 0),
-        block_shape=(BLOCK_M, BLOCK_DMODEL),
-        order=(1, 0),
-    )
-    # Need boundary check on this to make sure the padding from the
-    # Q and KV tensors in both dims are not part of what we store back.
-    # TODO: Do the boundary check optionally.
-    tl.store(O_block_ptr, acc, boundary_check=(0, 1))
-
-
-def check_args(
-    q,
-    k,
-    v,
-    o,
-    varlen=True,
-    max_seqlens=None,
-    cu_seqlens_q=None,
-    cu_seqlens_k=None,
-):
-    assert q.dim() == k.dim() and q.dim() == v.dim()
-    if varlen:
-        assert q.dim() == 3
-        total_q, nheads_q, head_size = q.shape
-        total_k, nheads_k, _ = k.shape
-        assert cu_seqlens_q is not None
-        assert cu_seqlens_k is not None
-        assert len(cu_seqlens_q) == len(cu_seqlens_k)
-    else:
-        assert q.dim() == 4
-        batch, nheads_q, seqlen_q, head_size = q.shape
-        _, nheads_k, seqlen_k, _ = k.shape
-        assert max_seqlens > 0
-    assert k.shape == v.shape
-    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
-    # TODO: Change assert if we support qkl f8 and v f16
-    assert q.dtype == k.dtype and q.dtype == v.dtype
-    assert head_size <= 256
-    assert o.shape == q.shape
-    assert (nheads_q % nheads_k) == 0
-
-
-class _attention(torch.autograd.Function):
-
-    @staticmethod
-    def forward(
-        ctx,
-        q,
-        k,
-        v,
-        o,
-        cu_seqlens_q,
-        cu_seqlens_k,
-        max_seqlens_q,
-        max_seqlens_k,
-        causal=False,
-        sm_scale=1.0,
-        bias=None,
-        fp8_scales=None,
-        fp8_out_scale=None,
-    ):
-        if fp8_scales is not None:
-            use_fp8 = True
-            (q_scale, k_scale, v_scale, p_scale) = fp8_scales
-            float8 = current_platform.fp8_dtype()
-
-            def check_and_convert(t, scale):
-                if t.dtype != float8:
-                    descale = 1.0 / scale
-                    ts = (t * descale).clamp(min=float8_info.min,
-                                             max=float8_info.max)
-                    return ts.to(float8)
-                else:
-                    return t
-
-            q = check_and_convert(q, q_scale)
-            k = check_and_convert(k, k_scale)
-            v = check_and_convert(v, v_scale)
-        else:
-            use_fp8 = False
-            q_scale = k_scale = v_scale = p_scale = 1.0
-
-        if o is None:
-            o = torch.empty_like(q, dtype=v.dtype)
-
-        check_args(
-            q,
-            k,
-            v,
-            o,
-            varlen=True,
-            cu_seqlens_q=cu_seqlens_q,
-            cu_seqlens_k=cu_seqlens_k,
-        )
-        if True:  # varlen
-            total_q, nheads_q, head_size = q.shape
-            total_k, nheads_k, _ = k.shape
-            batch = len(cu_seqlens_q) - 1
-            q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
-            k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
-            v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
-            o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
-        else:
-            batch, seqlen_q, nheads_q, head_size = q.shape
-            _, seqlen_k, nheads_k, _ = k.shape
-            q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3))
-            k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3))
-            v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3))
-            o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))
-
-        # Get closest power of 2 over or equal to 32.
-        unpadded_head_dims = {32, 64, 128, 256}
-        if head_size not in unpadded_head_dims:
-            padded_d_model = None
-            for i in unpadded_head_dims:
-                if i > head_size:
-                    padded_d_model = i
-                    break
-            assert padded_d_model is not None
-        else:
-            padded_d_model = head_size
-
-        grid = lambda META: (
-            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
-            nheads_q,
-            batch,
-        )
-
-        encoded_softmax = None
-
-        # Seed the RNG so we get reproducible results for testing.
-        philox_seed = 0x1BF52
-        philox_offset = 0x1D4B42
-
-        if bias is not None:
-            bias_strides = (
-                bias.stride(0),
-                bias.stride(1),
-                bias.stride(2),
-                bias.stride(3),
-            )
-        else:
-            bias_strides = (0, 0, 0, 0)
-
-        p_descale = 1.0 / p_scale
-        o_descale = 1.0 / fp8_out_scale.item(
-        ) if fp8_out_scale is not None else 1.0
-
-        arg_max_seqlens_q = 0 if on_gfx1x() else max_seqlens_q
-        arg_max_seqlens_k = 0 if on_gfx1x() else max_seqlens_k
-
-        attn_fwd[grid](
-            q,
-            k,
-            v,
-            bias,
-            sm_scale,
-            q_scale,
-            k_scale,
-            v_scale,
-            p_scale,
-            p_descale,
-            o_descale,
-            None,
-            o,
-            *q_strides,
-            *k_strides,
-            *v_strides,
-            *o_strides,
-            *bias_strides,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            dropout_p=0.0,
-            philox_seed=philox_seed,
-            philox_offset_base=philox_offset,
-            encoded_softmax=encoded_softmax,
-            HQ=nheads_q,
-            HK=nheads_k,
-            ACTUAL_BLOCK_DMODEL=head_size,
-            MAX_SEQLENS_Q=arg_max_seqlens_q,
-            MAX_SEQLENS_K=arg_max_seqlens_k,
-            IS_CAUSAL=causal,
-            VARLEN=True,
-            BLOCK_DMODEL=padded_d_model,
-            BIAS_TYPE=0 if bias is None else 1,
-            ENABLE_DROPOUT=False,
-            RETURN_ENCODED_SOFTMAX=False,
-            USE_FP8=use_fp8,
-            USE_FP8_OUT=fp8_out_scale is not None,
-        )
-
-        ctx.grid = grid
-        ctx.sm_scale = sm_scale
-        ctx.BLOCK_DMODEL = head_size
-        ctx.causal = causal
-        ctx.dropout_p = 0.0
-        ctx.philox_seed = philox_seed
-        ctx.philox_offset = philox_offset
-        ctx.encoded_softmax = encoded_softmax
-        ctx.return_encoded_softmax = False
-        return o, encoded_softmax
-
-
-triton_attention = _attention.apply
diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py
index 56d78ed5ea6ee0a024e58c733589189f2f0855f3..3c87a24afd9c7ddbaba1b152629b1e961f564b43 100644
--- a/vllm/attention/ops/triton_merge_attn_states.py
+++ b/vllm/attention/ops/triton_merge_attn_states.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -15,7 +14,7 @@ def merge_attn_states(
     prefix_lse: torch.Tensor,
     suffix_output: torch.Tensor,
     suffix_lse: torch.Tensor,
-    output_lse: Optional[torch.Tensor] = None,
+    output_lse: torch.Tensor | None = None,
 ) -> None:
     num_tokens = output.shape[0]
     num_query_heads = output.shape[1]
@@ -61,8 +60,8 @@ def merge_attn_states_kernel(
     # If we see an inf assume FA2 and convert inf to -inf for consistency
     # and correctness. Inf generally doesn't make sense in this context outside
     # of undefined-behavior/FA2-case, so I think this a safe assumption.
-    p_lse = float('-inf') if p_lse == float('inf') else p_lse
-    s_lse = float('-inf') if s_lse == float('inf') else s_lse
+    p_lse = float("-inf") if p_lse == float("inf") else p_lse
+    s_lse = float("-inf") if s_lse == float("inf") else s_lse
 
     max_lse = tl.maximum(p_lse, s_lse)
     p_lse = p_lse - max_lse
@@ -70,7 +69,7 @@ def merge_attn_states_kernel(
     # Will reuse precomputed Exp values for scale factor computation.
     p_se = tl.exp(p_lse)
     s_se = tl.exp(s_lse)
-    out_se = (p_se + s_se)
+    out_se = p_se + s_se
 
     if OUTPUT_LSE:
         out_lse = tl.log(out_se) + max_lse
@@ -78,12 +77,20 @@ def merge_attn_states_kernel(
 
     head_arange = tl.arange(0, PADDED_HEAD_SIZE)
     head_mask = head_arange < HEAD_SIZE
-    p_out = tl.load(prefix_output + token_idx * num_heads * HEAD_SIZE +
-                    head_idx * HEAD_SIZE + head_arange,
-                    mask=head_mask)
-    s_out = tl.load(suffix_output + token_idx * num_heads * HEAD_SIZE +
-                    head_idx * HEAD_SIZE + head_arange,
-                    mask=head_mask)
+    p_out = tl.load(
+        prefix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+    s_out = tl.load(
+        suffix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
 
     # NOTE(woosuk): Be careful with the numerical stability.
     # We should compute the scale first, and then multiply it with the output.
@@ -91,7 +98,8 @@ def merge_attn_states_kernel(
     p_scale = p_se / out_se
     s_scale = s_se / out_se
     out = p_out * p_scale + s_out * s_scale
-    tl.store(output + token_idx * num_heads * HEAD_SIZE +
-             head_idx * HEAD_SIZE + head_arange,
-             out,
-             mask=head_mask)
+    tl.store(
+        output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange,
+        out,
+        mask=head_mask,
+    )
diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/attention/ops/triton_reshape_and_cache_flash.py
index cc9e1bb23b9b313cd02a904dd64fcee62499ce13..5d2ba154ae018c5d81e29109931eed7382e8d3d1 100644
--- a/vllm/attention/ops/triton_reshape_and_cache_flash.py
+++ b/vllm/attention/ops/triton_reshape_and_cache_flash.py
@@ -29,7 +29,6 @@ def reshape_and_cache_kernel_flash(
     # tune parameters
     TILE_SIZE: tl.constexpr,
 ):
-
     token_idx = tl.program_id(axis=0)
     slot_idx = tl.load(slot_mapping_ptr + token_idx).to(tl.int64)
     if slot_idx < 0:
@@ -49,21 +48,20 @@ def reshape_and_cache_kernel_flash(
     tgt_idx = block_idx * block_stride + block_offset * page_stride
 
     # [TILE_SIZE]
-    key_load = tl.load(key_ptr + src_key_idx + tile_pos,
-                       mask=tile_pos < (num_heads * head_size))
+    key_load = tl.load(
+        key_ptr + src_key_idx + tile_pos, mask=tile_pos < (num_heads * head_size)
+    )
     if FP8_KV_CACHE:
-        if key_load.dtype.is_fp8():
-            key_tile = key_load
-        else:
-            # tl.store will do the correct implicit cast to fp8,
-            #  based on the key_cache_ptr.dtype.element_ty
-            key_tile = key_load / tl.load(k_scale)
+        # tl.store will do the correct implicit cast to fp8,
+        # based on the key_cache_ptr.dtype.element_ty
+        key_tile = key_load if key_load.dtype.is_fp8() else key_load / tl.load(k_scale)
     else:
         key_tile = key_load
 
     # [TILE_SIZE]
-    value_load = tl.load(value_ptr + src_value_idx + tile_pos,
-                         mask=tile_pos < (num_heads * head_size))
+    value_load = tl.load(
+        value_ptr + src_value_idx + tile_pos, mask=tile_pos < (num_heads * head_size)
+    )
     if FP8_KV_CACHE:
         if value_load.dtype.is_fp8():
             value_tile = value_load
@@ -88,18 +86,17 @@ def reshape_and_cache_kernel_flash(
 
 
 def triton_reshape_and_cache_flash(
-        key: torch.Tensor,  # [num_tokens, num_heads, head_size]
-        value: torch.Tensor,  # [num_tokens, num_heads, head_size]
-        # [num_blocks, block_size, num_heads, head_size]
+    key: torch.Tensor,  # [num_tokens, num_heads, head_size]
+    value: torch.Tensor,  # [num_tokens, num_heads, head_size]
+    # [num_blocks, block_size, num_heads, head_size]
     key_cache: torch.Tensor,
-        # [num_blocks, block_size, num_heads, head_size]
-        value_cache: torch.Tensor,
-        slot_mapping: torch.Tensor,  # [num_tokens]
-        kv_cache_dtype: str,  # "auto", "fp8"
-        k_scale: torch.Tensor,  # float32
-        v_scale: torch.Tensor,  # float32
+    # [num_blocks, block_size, num_heads, head_size]
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,  # [num_tokens]
+    kv_cache_dtype: str,  # "auto", "fp8"
+    k_scale: torch.Tensor,  # float32
+    v_scale: torch.Tensor,  # float32
 ):
-    num_tokens = key.shape[0]
     num_heads = key.shape[1]
     head_size = key.shape[2]
     block_size = key_cache.shape[1]
@@ -113,31 +110,40 @@ def triton_reshape_and_cache_flash(
     head_stride = key_cache.stride()[2]
     assert head_stride == head_size, "only continous heads are supported"
 
-    assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), \
+    assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), (
         f"unsupported kv_cache_dtype (str), got {kv_cache_dtype}."
-    kv_cache_torch_dtype = current_platform.fp8_dtype() if \
-        kv_cache_dtype.startswith("fp8") else key_cache.dtype
+    )
+    kv_cache_torch_dtype = (
+        current_platform.fp8_dtype()
+        if kv_cache_dtype.startswith("fp8")
+        else key_cache.dtype
+    )
 
-    if key_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith(
-            "fp8"):
+    if key_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith("fp8"):
         # to avoid erounous implicit cast in triton kernel (tl.store to uint8)
         # (e.g. explicit cast to fp8e4m3fnuz is not supported in triton 3.4)
         key_cache = key_cache.view(kv_cache_torch_dtype)
         value_cache = value_cache.view(kv_cache_torch_dtype)
-    assert kv_cache_dtype != torch.uint8, "explicit fp8 cast and store to "\
+    assert kv_cache_dtype != torch.uint8, (
+        "explicit fp8 cast and store to "
         "uint8 is not supported by triton reshape_and_cache_flash"
+    )
 
     FP8_KV_CACHE = kv_cache_dtype.startswith("fp8")
     assert (not FP8_KV_CACHE) or kv_cache_torch_dtype in [
-        torch.float8_e4m3fn, torch.float8_e5m2, torch.uint8,
-        torch.float8_e4m3fnuz], \
-            "unsupported dtype of KV cache tensor, got "\
-            "{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, " \
-            "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz."
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+        torch.uint8,
+        torch.float8_e4m3fnuz,
+    ], (
+        "unsupported dtype of KV cache tensor, got "
+        "{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, "
+        "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz."
+    )
 
     # heuristics instead of autotuning
     TILE_SIZE = min(2048, triton.next_power_of_2(n))
-    if torch.version.hip or torch.version.xpu:
+    if current_platform.is_rocm() or current_platform.is_xpu():
         num_stages = 4
         num_warps = 8
     else:  # cuda
@@ -148,7 +154,10 @@ def triton_reshape_and_cache_flash(
 
     # TODO(ngl): maybe replace with static launch grid to avoid overhead if
     #   using cudagraphs
-    grid = lambda meta: (int(num_tokens), triton.cdiv(n, meta["TILE_SIZE"]))
+    grid = lambda meta: (
+        slot_mapping.shape[0],
+        triton.cdiv(n, meta["TILE_SIZE"]),
+    )
 
     reshape_and_cache_kernel_flash[grid](
         key_ptr=key,
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 9e7cafc17428753faa71778b071a7fd64260f141..565be1c39bec179b56f60e179bf27357565e61f5 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -31,8 +31,13 @@ def apply_softcap(S, x):
 
 
 @triton.jit
-def find_seq_idx(query_start_len_ptr, target_idx, num_seqs,
-                 BLOCK_Q: tl.constexpr, use_q_block_mode: tl.constexpr):
+def find_seq_idx(
+    query_start_len_ptr,
+    target_idx,
+    num_seqs,
+    BLOCK_Q: tl.constexpr,
+    use_q_block_mode: tl.constexpr,
+):
     left: tl.int32 = 0
     right = num_seqs
     while left < right:
@@ -100,19 +105,18 @@ def kernel_unified_attention_2d(
     q_block_global_idx = tl.program_id(0)
     kv_head_idx = tl.program_id(1)
 
-    seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs,
-                           BLOCK_Q, True)
+    seq_idx = find_seq_idx(
+        query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True
+    )
 
-    q_block_start_idx = tl.load(query_start_len_ptr +
-                                seq_idx) // BLOCK_Q + seq_idx
+    q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx
 
     q_block_local_idx = q_block_global_idx - q_block_start_idx
 
     cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx)
     cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1)
 
-    cur_batch_query_len = cur_batch_in_all_stop_index \
-        - cur_batch_in_all_start_index
+    cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index
 
     if q_block_local_idx * BLOCK_Q >= cur_batch_query_len:
         return
@@ -123,10 +127,12 @@ def kernel_unified_attention_2d(
     query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv
 
     query_offset_0 = cur_batch_in_all_start_index + query_pos
-    query_offset_1 = kv_head_idx * num_queries_per_kv + \
-        offs_m % num_queries_per_kv
-    query_offset = (query_offset_0[:, None] * query_stride_0 +
-                    query_offset_1[:, None] * query_stride_1 + offs_d[None, :])
+    query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv
+    query_offset = (
+        query_offset_0[:, None] * query_stride_0
+        + query_offset_1[:, None] * query_stride_1
+        + offs_d[None, :]
+    )
 
     dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1)
     query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1)
@@ -161,19 +167,24 @@ def kernel_unified_attention_2d(
 
     # alibi slope for this head
     if USE_ALIBI_SLOPES:
-        alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1,
-                              mask=query_mask_1,
-                              other=0.0)
+        alibi_slope = tl.load(
+            alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0
+        )
 
     # query-query attention bias
     if USE_QQ_BIAS:
-        qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0
-                            )  # shape: [BLOCK_M]
+        qq_bias_row_ptrs = (
+            qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0
+        )  # shape: [BLOCK_M]
 
     # compute the length of the longest sequence prefix spanned by any
     # query token in the current q_block (q_block_local_idx)
-    max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + (
-        BLOCK_M - 1) // num_queries_per_kv + 1
+    max_seq_prefix_len = (
+        context_len
+        + q_block_local_idx * BLOCK_Q
+        + (BLOCK_M - 1) // num_queries_per_kv
+        + 1
+    )
 
     # adjust for potential padding in the last q_block by considering the
     # actual sequence length
@@ -211,23 +222,30 @@ def kernel_unified_attention_2d(
         seq_offset = j * TILE_SIZE + offs_t
         tile_mask = seq_offset < max_seq_prefix_len
 
-        physical_block_idx = tl.load(block_tables_ptr + block_table_offset +
-                                     seq_offset // BLOCK_SIZE).to(tl.int64)
+        physical_block_idx = tl.load(
+            block_tables_ptr + block_table_offset + seq_offset // BLOCK_SIZE
+        ).to(tl.int64)
 
-        v_offset = (physical_block_idx[:, None] * stride_v_cache_0 +
-                    kv_head_idx * stride_v_cache_2 +
-                    offs_d[None, :] * stride_v_cache_3 +
-                    (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1)
+        v_offset = (
+            physical_block_idx[:, None] * stride_v_cache_0
+            + kv_head_idx * stride_v_cache_2
+            + offs_d[None, :] * stride_v_cache_3
+            + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1
+        )
 
-        k_offset = (physical_block_idx[None, :] * stride_k_cache_0 +
-                    kv_head_idx * stride_k_cache_2 +
-                    offs_d[:, None] * stride_k_cache_3 +
-                    (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1)
+        k_offset = (
+            physical_block_idx[None, :] * stride_k_cache_0
+            + kv_head_idx * stride_k_cache_2
+            + offs_d[:, None] * stride_k_cache_3
+            + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1
+        )
 
         # K : (HEAD_SIZE, TILE_SIZE)
-        K_load = tl.load(key_cache_ptr + k_offset,
-                         mask=dim_mask[:, None] & tile_mask[None, :],
-                         other=0.0)
+        K_load = tl.load(
+            key_cache_ptr + k_offset,
+            mask=dim_mask[:, None] & tile_mask[None, :],
+            other=0.0,
+        )
 
         if K_load.dtype.is_fp8():
             if Q.dtype.is_fp8():
@@ -238,9 +256,11 @@ def kernel_unified_attention_2d(
             K = K_load
 
         # V : (TILE_SIZE, HEAD_SIZE)
-        V_load = tl.load(value_cache_ptr + v_offset,
-                         mask=dim_mask[None, :] & tile_mask[:, None],
-                         other=0.0)
+        V_load = tl.load(
+            value_cache_ptr + v_offset,
+            mask=dim_mask[None, :] & tile_mask[:, None],
+            other=0.0,
+        )
 
         if V_load.dtype.is_fp8():
             if Q.dtype.is_fp8():
@@ -260,12 +280,16 @@ def kernel_unified_attention_2d(
         if USE_SOFTCAP:
             S = apply_softcap(S, softcap)
 
-        S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask,
-                     S, float("-inf"))
+        S = tl.where(
+            query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf")
+        )
 
         if SLIDING_WINDOW > 0:
-            S = tl.where((context_len + query_pos[:, None] - seq_offset)
-                         < SLIDING_WINDOW, S, float("-inf"))
+            S = tl.where(
+                (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW,
+                S,
+                float("-inf"),
+            )
 
         if USE_ALIBI_SLOPES:
             S += alibi_slope[:, None] * (seq_offset - context_len)
@@ -315,9 +339,11 @@ def kernel_unified_attention_2d(
         acc = acc * tl.load(out_scale)
         acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
 
-    output_offset = (query_offset_0[:, None] * output_stride_0 +
-                     query_offset_1[:, None] * output_stride_1 +
-                     offs_d[None, :])
+    output_offset = (
+        query_offset_0[:, None] * output_stride_0
+        + query_offset_1[:, None] * output_stride_1
+        + offs_d[None, :]
+    )
 
     tl.store(
         output_ptr + output_offset,
@@ -328,68 +354,67 @@ def kernel_unified_attention_2d(
 
 @triton.jit
 def kernel_unified_attention_3d(
-        segm_output_ptr,
-        # [num_tokens, num_query_heads, num_segments, head_size]
-        segm_max_ptr,  # [num_tokens, num_query_heads, num_segments]
-        segm_expsum_ptr,  # [num_tokens, num_query_heads, num_segments]
-        query_ptr,  # [num_tokens, num_query_heads, head_size]
-        key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
-        value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
-        sink_ptr,  # [num_query_heads]
-        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
-        seq_lens_ptr,  # [num_seqs]
-        alibi_slopes_ptr,  # [num_query_heads]
-        qq_bias_ptr,  # [num_query_tokens, num_query_tokens]
-        scale,  # float32
-        k_scale,  # float32
-        v_scale,  # float32
-        softcap,  # float32
-        num_query_heads: tl.constexpr,  # int
-        num_queries_per_kv: tl.constexpr,  # int
-        block_table_stride: tl.int64,  # int
-        query_stride_0: tl.int64,  # int
-        query_stride_1: tl.int64,  # int, should be equal to head_size
-        qq_bias_stride_0: tl.int64,  # int
-        BLOCK_SIZE: tl.constexpr,  # int
-        TILE_SIZE: tl.constexpr,  # int, must be power of 2
-        HEAD_SIZE: tl.constexpr,  # int
-        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-        USE_ALIBI_SLOPES: tl.constexpr,  # bool
-        USE_QQ_BIAS: tl.constexpr,  # bool
-        USE_SOFTCAP: tl.constexpr,  # bool
-        USE_SINKS: tl.constexpr,  # bool
-        SLIDING_WINDOW: tl.constexpr,  # int
-        stride_k_cache_0: tl.int64,  # int
-        stride_k_cache_1: tl.int64,  # int
-        stride_k_cache_2: tl.int64,  # int
-        stride_k_cache_3: tl.constexpr,  # int
-        stride_v_cache_0: tl.int64,  # int
-        stride_v_cache_1: tl.int64,  # int
-        stride_v_cache_2: tl.int64,  # int
-        stride_v_cache_3: tl.constexpr,  # int
-        query_start_len_ptr,  # [num_seqs+1]
-        BLOCK_Q: tl.constexpr,  # int
-        num_seqs: tl.int32,
-        BLOCK_M: tl.constexpr,  # int
-        NUM_SEGMENTS_PER_SEQ: tl.constexpr,  # int
+    segm_output_ptr,
+    # [num_tokens, num_query_heads, num_segments, head_size]
+    segm_max_ptr,  # [num_tokens, num_query_heads, num_segments]
+    segm_expsum_ptr,  # [num_tokens, num_query_heads, num_segments]
+    query_ptr,  # [num_tokens, num_query_heads, head_size]
+    key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
+    value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
+    sink_ptr,  # [num_query_heads]
+    block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
+    seq_lens_ptr,  # [num_seqs]
+    alibi_slopes_ptr,  # [num_query_heads]
+    qq_bias_ptr,  # [num_query_tokens, num_query_tokens]
+    scale,  # float32
+    k_scale,  # float32
+    v_scale,  # float32
+    softcap,  # float32
+    num_query_heads: tl.constexpr,  # int
+    num_queries_per_kv: tl.constexpr,  # int
+    block_table_stride: tl.int64,  # int
+    query_stride_0: tl.int64,  # int
+    query_stride_1: tl.int64,  # int, should be equal to head_size
+    qq_bias_stride_0: tl.int64,  # int
+    BLOCK_SIZE: tl.constexpr,  # int
+    TILE_SIZE: tl.constexpr,  # int, must be power of 2
+    HEAD_SIZE: tl.constexpr,  # int
+    HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+    USE_ALIBI_SLOPES: tl.constexpr,  # bool
+    USE_QQ_BIAS: tl.constexpr,  # bool
+    USE_SOFTCAP: tl.constexpr,  # bool
+    USE_SINKS: tl.constexpr,  # bool
+    SLIDING_WINDOW: tl.constexpr,  # int
+    stride_k_cache_0: tl.int64,  # int
+    stride_k_cache_1: tl.int64,  # int
+    stride_k_cache_2: tl.int64,  # int
+    stride_k_cache_3: tl.constexpr,  # int
+    stride_v_cache_0: tl.int64,  # int
+    stride_v_cache_1: tl.int64,  # int
+    stride_v_cache_2: tl.int64,  # int
+    stride_v_cache_3: tl.constexpr,  # int
+    query_start_len_ptr,  # [num_seqs+1]
+    BLOCK_Q: tl.constexpr,  # int
+    num_seqs: tl.int32,
+    BLOCK_M: tl.constexpr,  # int
+    NUM_SEGMENTS_PER_SEQ: tl.constexpr,  # int
 ):
     q_block_global_idx = tl.program_id(0)
     kv_head_idx = tl.program_id(1)
     segm_idx = tl.program_id(2)
 
-    seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs,
-                           BLOCK_Q, True)
+    seq_idx = find_seq_idx(
+        query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True
+    )
 
-    q_block_start_idx = tl.load(query_start_len_ptr +
-                                seq_idx) // BLOCK_Q + seq_idx
+    q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx
 
     q_block_local_idx = q_block_global_idx - q_block_start_idx
 
     cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx)
     cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1)
 
-    cur_batch_query_len = cur_batch_in_all_stop_index \
-        - cur_batch_in_all_start_index
+    cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index
 
     if q_block_local_idx * BLOCK_Q >= cur_batch_query_len:
         return
@@ -410,10 +435,12 @@ def kernel_unified_attention_3d(
     query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv
 
     query_offset_0 = cur_batch_in_all_start_index + query_pos
-    query_offset_1 = kv_head_idx * num_queries_per_kv + \
-        offs_m % num_queries_per_kv
-    query_offset = (query_offset_0[:, None] * query_stride_0 +
-                    query_offset_1[:, None] * query_stride_1 + offs_d[None, :])
+    query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv
+    query_offset = (
+        query_offset_0[:, None] * query_stride_0
+        + query_offset_1[:, None] * query_stride_1
+        + offs_d[None, :]
+    )
 
     dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1)
     query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1)
@@ -448,19 +475,24 @@ def kernel_unified_attention_3d(
 
     # alibi slope for this head
     if USE_ALIBI_SLOPES:
-        alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1,
-                              mask=query_mask_1,
-                              other=0.0)
+        alibi_slope = tl.load(
+            alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0
+        )
 
     # query-query attention bias
     if USE_QQ_BIAS:
-        qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0
-                            )  # shape: [BLOCK_M]
+        qq_bias_row_ptrs = (
+            qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0
+        )  # shape: [BLOCK_M]
 
     # compute the length of the longest sequence prefix spanned by any
     # query token in the current q_block (q_block_local_idx)
-    max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + (
-        BLOCK_M - 1) // num_queries_per_kv + 1
+    max_seq_prefix_len = (
+        context_len
+        + q_block_local_idx * BLOCK_Q
+        + (BLOCK_M - 1) // num_queries_per_kv
+        + 1
+    )
 
     # adjust for potential padding in the last q_block by considering the
     # actual sequence length
@@ -473,29 +505,36 @@ def kernel_unified_attention_3d(
 
     # iterate through tiles within current segment
     for j in range(
-            segm_idx * tiles_per_segment,
-            min((segm_idx + 1) * tiles_per_segment, num_tiles),
+        segm_idx * tiles_per_segment,
+        min((segm_idx + 1) * tiles_per_segment, num_tiles),
     ):
         seq_offset = j * TILE_SIZE + offs_t
         tile_mask = seq_offset < max_seq_prefix_len
 
-        physical_block_idx = tl.load(block_tables_ptr + block_table_offset +
-                                     seq_offset // BLOCK_SIZE).to(tl.int64)
+        physical_block_idx = tl.load(
+            block_tables_ptr + block_table_offset + seq_offset // BLOCK_SIZE
+        ).to(tl.int64)
 
-        v_offset = (physical_block_idx[:, None] * stride_v_cache_0 +
-                    kv_head_idx * stride_v_cache_2 +
-                    offs_d[None, :] * stride_v_cache_3 +
-                    (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1)
+        v_offset = (
+            physical_block_idx[:, None] * stride_v_cache_0
+            + kv_head_idx * stride_v_cache_2
+            + offs_d[None, :] * stride_v_cache_3
+            + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1
+        )
 
-        k_offset = (physical_block_idx[None, :] * stride_k_cache_0 +
-                    kv_head_idx * stride_k_cache_2 +
-                    offs_d[:, None] * stride_k_cache_3 +
-                    (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1)
+        k_offset = (
+            physical_block_idx[None, :] * stride_k_cache_0
+            + kv_head_idx * stride_k_cache_2
+            + offs_d[:, None] * stride_k_cache_3
+            + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1
+        )
 
         # K : (HEAD_SIZE, TILE_SIZE)
-        K_load = tl.load(key_cache_ptr + k_offset,
-                         mask=dim_mask[:, None] & tile_mask[None, :],
-                         other=0.0)
+        K_load = tl.load(
+            key_cache_ptr + k_offset,
+            mask=dim_mask[:, None] & tile_mask[None, :],
+            other=0.0,
+        )
 
         if K_load.dtype.is_fp8():
             if Q.dtype.is_fp8():
@@ -506,9 +545,11 @@ def kernel_unified_attention_3d(
             K = K_load
 
         # V : (TILE_SIZE, HEAD_SIZE)
-        V_load = tl.load(value_cache_ptr + v_offset,
-                         mask=dim_mask[None, :] & tile_mask[:, None],
-                         other=0.0)
+        V_load = tl.load(
+            value_cache_ptr + v_offset,
+            mask=dim_mask[None, :] & tile_mask[:, None],
+            other=0.0,
+        )
 
         if V_load.dtype.is_fp8():
             if Q.dtype.is_fp8():
@@ -527,12 +568,16 @@ def kernel_unified_attention_3d(
         if USE_SOFTCAP:
             S = apply_softcap(S, softcap)
 
-        S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask,
-                     S, float("-inf"))
+        S = tl.where(
+            query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf")
+        )
 
         if SLIDING_WINDOW > 0:
-            S = tl.where((context_len + query_pos[:, None] - seq_offset)
-                         < SLIDING_WINDOW, S, float("-inf"))
+            S = tl.where(
+                (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW,
+                S,
+                float("-inf"),
+            )
 
         if USE_ALIBI_SLOPES:
             S += alibi_slope[:, None] * (seq_offset - context_len)
@@ -577,29 +622,31 @@ def kernel_unified_attention_3d(
         acc += tl.dot(P.to(V.dtype), V)
 
     segm_output_offset = (
-        query_offset_0[:, None].to(tl.int64) *
-        (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) +
-        query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) +
-        segm_idx * HEAD_SIZE_PADDED + tl.arange(0, HEAD_SIZE_PADDED)[None, :])
+        query_offset_0[:, None].to(tl.int64)
+        * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED)
+        + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED)
+        + segm_idx * HEAD_SIZE_PADDED
+        + tl.arange(0, HEAD_SIZE_PADDED)[None, :]
+    )
     tl.store(
         segm_output_ptr + segm_output_offset,
         acc,
         mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None],
     )
-    segm_offset = (query_offset_0.to(tl.int64) *
-                   (num_query_heads * NUM_SEGMENTS_PER_SEQ) +
-                   query_offset_1 * NUM_SEGMENTS_PER_SEQ + segm_idx)
+    segm_offset = (
+        query_offset_0.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ)
+        + query_offset_1 * NUM_SEGMENTS_PER_SEQ
+        + segm_idx
+    )
     tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1)
-    tl.store(segm_expsum_ptr + segm_offset,
-             L,
-             mask=query_mask_0 & query_mask_1)
+    tl.store(segm_expsum_ptr + segm_offset, L, mask=query_mask_0 & query_mask_1)
 
 
 @triton.jit
 def reduce_segments(
     output_ptr,  # [num_tokens, num_query_heads, head_size]
     segm_output_ptr,
-    #[num_tokens, num_query_heads, max_num_segments, head_size]
+    # [num_tokens, num_query_heads, max_num_segments, head_size]
     segm_max_ptr,  # [num_tokens, num_query_heads, max_num_segments]
     segm_expsum_ptr,  # [num_tokens, num_query_heads, max_num_segments]
     seq_lens_ptr,  # [num_seqs]
@@ -622,8 +669,9 @@ def reduce_segments(
     query_token_idx = tl.program_id(0)
     query_head_idx = tl.program_id(1)
 
-    seq_idx = find_seq_idx(query_start_len_ptr, query_token_idx, num_seqs,
-                           BLOCK_Q, False)
+    seq_idx = find_seq_idx(
+        query_start_len_ptr, query_token_idx, num_seqs, BLOCK_Q, False
+    )
 
     # sequence len for this particular sequence
     seq_len = tl.load(seq_lens_ptr + seq_idx)
@@ -635,34 +683,32 @@ def reduce_segments(
     # create masks for subsequent loads
     act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE)
     segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full(
-        [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32)
-    dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1,
-                        0).to(tl.int1)
+        [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32
+    )
+    dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1)
 
     # load segment maxima
-    segm_offset = (query_token_idx.to(tl.int64) *
-                   (num_query_heads * NUM_SEGMENTS_PER_SEQ) +
-                   query_head_idx * NUM_SEGMENTS_PER_SEQ +
-                   tl.arange(0, NUM_SEGMENTS_PER_SEQ))
-    segm_max = tl.load(segm_max_ptr + segm_offset,
-                       mask=segm_mask,
-                       other=float("-inf"))
+    segm_offset = (
+        query_token_idx.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ)
+        + query_head_idx * NUM_SEGMENTS_PER_SEQ
+        + tl.arange(0, NUM_SEGMENTS_PER_SEQ)
+    )
+    segm_max = tl.load(segm_max_ptr + segm_offset, mask=segm_mask, other=float("-inf"))
     overall_max = tl.max(segm_max)
 
     # load and rescale segment exp sums
-    segm_expsum = tl.load(segm_expsum_ptr + segm_offset,
-                          mask=segm_mask,
-                          other=0.0)
+    segm_expsum = tl.load(segm_expsum_ptr + segm_offset, mask=segm_mask, other=0.0)
     segm_expsum = segm_expsum * tl.exp(segm_max - overall_max)
     overall_expsum = tl.sum(segm_expsum)
 
     # load, rescale, and add segment attention outputs
     segm_output_offset = (
-        query_token_idx.to(tl.int64) *
-        (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) +
-        query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) +
-        tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED +
-        tl.arange(0, HEAD_SIZE_PADDED)[None, :])
+        query_token_idx.to(tl.int64)
+        * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED)
+        + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED)
+        + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED
+        + tl.arange(0, HEAD_SIZE_PADDED)[None, :]
+    )
     segm_output = tl.load(
         segm_output_ptr + segm_output_offset,
         mask=segm_mask[:, None] & dim_mask[None, :],
@@ -678,9 +724,11 @@ def reduce_segments(
         acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
 
     # write result
-    output_offset = (query_token_idx * output_stride_0 +
-                     query_head_idx * output_stride_1 +
-                     tl.arange(0, HEAD_SIZE_PADDED))
+    output_offset = (
+        query_token_idx * output_stride_0
+        + query_head_idx * output_stride_1
+        + tl.arange(0, HEAD_SIZE_PADDED)
+    )
     tl.store(output_ptr + output_offset, acc, mask=dim_mask)
 
 
@@ -707,13 +755,11 @@ def unified_attention(
     # Optional tensor for sinks
     sinks=None,
 ):
-
     assert causal, "Only causal attention is supported"
     assert q_descale is None, "Q scales not supported"
 
     if sinks is not None:
-        assert sinks.shape[0] == q.shape[1], \
-        "Sinks must be num_query_heads size"
+        assert sinks.shape[0] == q.shape[1], "Sinks must be num_query_heads size"
 
     use_alibi_slopes = alibi_slopes is not None
     use_qq_bias = qq_bias is not None
@@ -725,8 +771,9 @@ def unified_attention(
     num_queries_per_kv = num_query_heads // num_kv_heads
     head_size = q.shape[2]
 
-    BLOCK_M = 16 if num_queries_per_kv <= 16 else triton.next_power_of_2(
-        num_queries_per_kv)
+    BLOCK_M = (
+        16 if num_queries_per_kv <= 16 else triton.next_power_of_2(num_queries_per_kv)
+    )
     BLOCK_Q = BLOCK_M // num_queries_per_kv
 
     # Ideally we would launch with kernel with:
@@ -748,10 +795,12 @@ def unified_attention(
 
     # if batch contains a prefill
     if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128:
-        kernel_unified_attention_2d[(
-            total_num_q_blocks,
-            num_kv_heads,
-        )](
+        kernel_unified_attention_2d[
+            (
+                total_num_q_blocks,
+                num_kv_heads,
+            )
+        ](
             output_ptr=out,
             query_ptr=q,
             key_cache_ptr=k,
@@ -825,52 +874,51 @@ def unified_attention(
             device=q.device,
         )
 
-        kernel_unified_attention_3d[(
-            total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)](
-                segm_output_ptr=segm_output,
-                segm_max_ptr=segm_max,
-                segm_expsum_ptr=segm_expsum,
-                query_ptr=q,
-                key_cache_ptr=k,
-                value_cache_ptr=v,
-                sink_ptr=sinks,
-                block_tables_ptr=block_table,
-                seq_lens_ptr=seqused_k,
-                alibi_slopes_ptr=alibi_slopes,
-                qq_bias_ptr=qq_bias,
-                scale=softmax_scale,
-                k_scale=k_descale,
-                v_scale=v_descale,
-                softcap=softcap,
-                num_query_heads=num_query_heads,
-                num_queries_per_kv=num_queries_per_kv,
-                block_table_stride=block_table.stride(0),
-                query_stride_0=q.stride(0),
-                query_stride_1=q.stride(1),
-                qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0,
-                BLOCK_SIZE=block_size,
-                TILE_SIZE=TILE_SIZE_DECODE,
-                HEAD_SIZE=head_size,
-                HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
-                USE_ALIBI_SLOPES=use_alibi_slopes,
-                USE_QQ_BIAS=use_qq_bias,
-                USE_SOFTCAP=(softcap > 0),
-                USE_SINKS=(sinks is not None),
-                SLIDING_WINDOW=(1 + window_size[0]),
-                stride_k_cache_0=k.stride(0),
-                stride_k_cache_1=k.stride(1),
-                stride_k_cache_2=k.stride(2),
-                stride_k_cache_3=k.stride(3),
-                stride_v_cache_0=v.stride(0),
-                stride_v_cache_1=v.stride(1),
-                stride_v_cache_2=v.stride(2),
-                stride_v_cache_3=v.stride(3),
-                query_start_len_ptr=cu_seqlens_q,
-                BLOCK_Q=BLOCK_Q,
-                num_seqs=num_seqs,
-                BLOCK_M=BLOCK_M,
-                NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
-            )
+        kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)](
+            segm_output_ptr=segm_output,
+            segm_max_ptr=segm_max,
+            segm_expsum_ptr=segm_expsum,
+            query_ptr=q,
+            key_cache_ptr=k,
+            value_cache_ptr=v,
+            sink_ptr=sinks,
+            block_tables_ptr=block_table,
+            seq_lens_ptr=seqused_k,
+            alibi_slopes_ptr=alibi_slopes,
+            qq_bias_ptr=qq_bias,
+            scale=softmax_scale,
+            k_scale=k_descale,
+            v_scale=v_descale,
+            softcap=softcap,
+            num_query_heads=num_query_heads,
+            num_queries_per_kv=num_queries_per_kv,
+            block_table_stride=block_table.stride(0),
+            query_stride_0=q.stride(0),
+            query_stride_1=q.stride(1),
+            qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0,
+            BLOCK_SIZE=block_size,
+            TILE_SIZE=TILE_SIZE_DECODE,
+            HEAD_SIZE=head_size,
+            HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
+            USE_ALIBI_SLOPES=use_alibi_slopes,
+            USE_QQ_BIAS=use_qq_bias,
+            USE_SOFTCAP=(softcap > 0),
+            USE_SINKS=(sinks is not None),
+            SLIDING_WINDOW=(1 + window_size[0]),
+            stride_k_cache_0=k.stride(0),
+            stride_k_cache_1=k.stride(1),
+            stride_k_cache_2=k.stride(2),
+            stride_k_cache_3=k.stride(3),
+            stride_v_cache_0=v.stride(0),
+            stride_v_cache_1=v.stride(1),
+            stride_v_cache_2=v.stride(2),
+            stride_v_cache_3=v.stride(3),
+            query_start_len_ptr=cu_seqlens_q,
+            BLOCK_Q=BLOCK_Q,
+            num_seqs=num_seqs,
+            BLOCK_M=BLOCK_M,
+            NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
+        )
         reduce_segments[(q.shape[0], num_query_heads)](
             output_ptr=out,
             segm_output_ptr=segm_output,
@@ -879,8 +927,7 @@ def unified_attention(
             seq_lens_ptr=seqused_k,
             num_seqs=num_seqs,
             num_query_heads=num_query_heads,
-            out_scale_inv=1 /
-            output_scale if output_scale is not None else 1.0,
+            out_scale_inv=1 / output_scale if output_scale is not None else 1.0,
             output_stride_0=out.stride(0),
             output_stride_1=out.stride(1),
             block_table_stride=block_table.stride(0),
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..06a9f7cd82266e8b229176db5676f0b34e7c189b
--- /dev/null
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file contains ops for ViT attention to be compatible with torch.compile
+as there are operations here not supported by torch.compile (for instance,
+`to_list` in xformers attn, or `.item()` in flash attention)
+
+Using these ops and wrapping vision blocks with `torch.compile` can speed up
+throughput in vision models by ~5% relative on H100, and improve token
+latencies by ~7% (see qwen2_5_vl for example usage)
+
+To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0)
+"""
+
+import einops
+import torch
+import torch.nn.functional as F
+
+from vllm.utils.torch_utils import direct_register_custom_op
+
+
+def xformers_attn_seqlens_wrapper(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
+) -> torch.Tensor:
+    from xformers import ops as xops
+    from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+    attn_bias = BlockDiagonalMask.from_seqlens(
+        q_seqlen=seqlens.tolist(), kv_seqlen=None, device=q.device
+    )
+    context_layer = xops.memory_efficient_attention_forward(
+        q, k, v, attn_bias=attn_bias, p=0, scale=None
+    )
+    context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+    return context_layer
+
+
+def xformers_attn_seqlens_wrapper_fake(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
+) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="xformers_attn_seqlens_wrapper",
+    op_func=xformers_attn_seqlens_wrapper,
+    fake_impl=xformers_attn_seqlens_wrapper_fake,
+)
+
+
+def vit_xformers_attn_wrapper(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
+) -> torch.Tensor:
+    return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens)
+
+
+def flash_attn_maxseqlen_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    max_seqlen: torch.Tensor,
+    batch_size: int,
+    is_rocm_aiter: bool,
+    use_upstream_fa: bool,
+) -> torch.Tensor:
+    if is_rocm_aiter:
+        from aiter import flash_attn_varlen_func
+    else:
+        if use_upstream_fa:
+            from flash_attn import flash_attn_varlen_func
+        else:
+            from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+    q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
+    output = flash_attn_varlen_func(
+        q,
+        k,
+        v,
+        cu_seqlens_q=cu_seqlens,
+        cu_seqlens_k=cu_seqlens,
+        max_seqlen_q=max_seqlen.item(),
+        max_seqlen_k=max_seqlen.item(),
+        dropout_p=0.0,
+        causal=False,
+    )
+    context_layer = einops.rearrange(
+        output, "(b s) h d -> s b (h d)", b=batch_size
+    ).contiguous()
+    return context_layer
+
+
+def flash_attn_maxseqlen_wrapper_fake(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    max_seqlen: torch.Tensor,
+    batch_size: int,
+    is_rocm_aiter: bool,
+    use_upstream_fa: bool,
+) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="flash_attn_maxseqlen_wrapper",
+    op_func=flash_attn_maxseqlen_wrapper,
+    fake_impl=flash_attn_maxseqlen_wrapper_fake,
+)
+
+
+def vit_flash_attn_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    max_seqlen: torch.Tensor,
+    batch_size: int,
+    is_rocm_aiter: bool,
+    use_upstream_fa: bool,
+) -> torch.Tensor:
+    return torch.ops.vllm.flash_attn_maxseqlen_wrapper(
+        q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa
+    )
+
+
+# TODO: Once we have a torch 2.10, we can use tensor slices
+# so we won't need to wrap this in custom ops
+def torch_sdpa_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    outputs = []
+    for i in range(1, len(cu_seqlens)):
+        start_idx = cu_seqlens[i - 1]
+        end_idx = cu_seqlens[i]
+        q_i = q[:, start_idx:end_idx]
+        k_i = k[:, start_idx:end_idx]
+        v_i = v[:, start_idx:end_idx]
+        q_i, k_i, v_i = (
+            einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
+        )
+        output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+        output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
+        outputs.append(output_i)
+    context_layer = torch.cat(outputs, dim=1)
+    context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+    return context_layer
+
+
+def torch_sdpa_wrapper_fake(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="torch_sdpa_wrapper",
+    op_func=torch_sdpa_wrapper,
+    fake_impl=torch_sdpa_wrapper_fake,
+)
+
+
+def vit_torch_sdpa_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, cu_seqlens)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 6f048e589f7f23c6eecc515fc0bdf8d83a65ffd1..1a092db9ce3782c315b3a6abc67075fda5ce105a 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -1,50 +1,38 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import inspect
 import os
+from collections.abc import Generator
 from contextlib import contextmanager
-from dataclasses import dataclass
 from functools import cache
-from typing import Generator, Optional, Union
+from typing import cast, get_args
 
 import torch
 
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.platforms import _Backend, current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
+from vllm.utils import STR_BACKEND_ENV_VAR
+from vllm.utils.import_utils import resolve_obj_by_qualname
 
 logger = init_logger(__name__)
 
 
-def backend_name_to_enum(backend_name: str) -> Optional[_Backend]:
+def get_env_variable_attn_backend() -> AttentionBackendEnum | None:
     """
-    Convert a string backend name to a _Backend enum value.
-
-    Returns:
-    * _Backend: enum value if backend_name is a valid in-tree type
-    * None: otherwise it's an invalid in-tree type or an out-of-tree platform is
-            loaded.
-    """
-    assert backend_name is not None
-    return _Backend[backend_name] if backend_name in _Backend.__members__ else \
-          None
-
-
-def get_env_variable_attn_backend() -> Optional[_Backend]:
-    '''
     Get the backend override specified by the vLLM attention
     backend environment variable, if one is specified.
 
     Returns:
 
-    * _Backend enum value if an override is specified
+    * AttentionBackendEnum value if an override is specified
     * None otherwise
-    '''
+    """
     backend_name = os.environ.get(STR_BACKEND_ENV_VAR)
-    return (None
-            if backend_name is None else backend_name_to_enum(backend_name))
+    return None if backend_name is None else AttentionBackendEnum[backend_name]
 
 
 # Global state allows a particular choice of backend
@@ -54,11 +42,11 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
 #
 # THIS SELECTION TAKES PRECEDENCE OVER THE
 # VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
-forced_attn_backend: Optional[_Backend] = None
+forced_attn_backend: AttentionBackendEnum | None = None
 
 
-def global_force_attn_backend(attn_backend: Optional[_Backend]) -> None:
-    '''
+def global_force_attn_backend(attn_backend: AttentionBackendEnum | None) -> None:
+    """
     Force all attention operations to use a specified backend.
 
     Passing `None` for the argument re-enables automatic
@@ -67,99 +55,47 @@ def global_force_attn_backend(attn_backend: Optional[_Backend]) -> None:
     Arguments:
 
     * attn_backend: backend selection (None to revert to auto)
-    '''
+    """
     global forced_attn_backend
     forced_attn_backend = attn_backend
 
 
-def get_global_forced_attn_backend() -> Optional[_Backend]:
-    '''
+def get_global_forced_attn_backend() -> AttentionBackendEnum | None:
+    """
     Get the currently-forced choice of attention backend,
     or None if auto-selection is currently enabled.
-    '''
+    """
     return forced_attn_backend
 
 
-@dataclass(frozen=True)
-class _IsSupported:
-    can_import: bool
-    head_size: bool
-    dtype: bool
-
-    def __bool__(self) -> bool:
-        return self.can_import and self.head_size and self.dtype
-
-
-def is_attn_backend_supported(
-    attn_backend: Union[str, type[AttentionBackend]],
-    head_size: int,
-    dtype: torch.dtype,
-    *,
-    allow_import_error: bool = True,
-) -> _IsSupported:
-    if isinstance(attn_backend, str):
-        try:
-            attn_backend = resolve_obj_by_qualname(attn_backend)
-        except ImportError:
-            if not allow_import_error:
-                raise
-
-            return _IsSupported(can_import=False, head_size=False, dtype=False)
-
-    assert isinstance(attn_backend, type)
-
-    # TODO: Update the interface once V0 is removed
-    if get_supported_head_sizes := getattr(attn_backend,
-                                           "get_supported_head_sizes", None):
-        is_head_size_supported = head_size in get_supported_head_sizes()
-    elif validate_head_size := getattr(attn_backend, "validate_head_size",
-                                       None):
-        try:
-            validate_head_size(head_size)
-            is_head_size_supported = True
-        except Exception:
-            is_head_size_supported = False
-    else:
-        raise NotImplementedError(f"{attn_backend.__name__} does not support "
-                                  "head size validation")
-
-    if get_supported_dtypes := getattr(attn_backend, "get_supported_dtypes",
-                                       None):
-        is_dtype_supported = dtype in get_supported_dtypes()
-    else:
-        raise NotImplementedError(f"{attn_backend.__name__} does not support "
-                                  "dtype validation")
-
-    return _IsSupported(
-        can_import=True,
-        head_size=is_head_size_supported,
-        dtype=is_dtype_supported,
-    )
-
-
 def get_attn_backend(
     head_size: int,
     dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
-    block_size: int,
+    kv_cache_dtype: str | None,
+    block_size: int | None,
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
+    attn_type: str | None = None,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
+
+    if kv_cache_dtype is not None:
+        valid_cache_dtypes = get_args(CacheDType)
+        assert kv_cache_dtype in valid_cache_dtypes, (
+            f"Invalid kv_cache_dtype: {kv_cache_dtype}. "
+            f"Valid values are: {valid_cache_dtypes}"
+        )
+
     return _cached_get_attn_backend(
         head_size=head_size,
         dtype=dtype,
-        kv_cache_dtype=kv_cache_dtype,
+        kv_cache_dtype=cast(CacheDType | None, kv_cache_dtype),
         block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
         has_sink=has_sink,
         use_sparse=use_sparse,
+        attn_type=attn_type,
     )
 
 
@@ -167,56 +103,105 @@ def get_attn_backend(
 def _cached_get_attn_backend(
     head_size: int,
     dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
-    block_size: int,
-    use_v1: bool = False,
+    kv_cache_dtype: CacheDType | None,
+    block_size: int | None,
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
+    attn_type: str | None = None,
 ) -> type[AttentionBackend]:
-
     # Check whether a particular choice of backend was
     # previously forced.
     #
     # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
     # ENVIRONMENT VARIABLE.
     selected_backend = None
-    backend_by_global_setting: Optional[_Backend] = (
-        get_global_forced_attn_backend())
+    backend_by_global_setting: AttentionBackendEnum | None = (
+        get_global_forced_attn_backend()
+    )
     if backend_by_global_setting is not None:
         selected_backend = backend_by_global_setting
     else:
         # Check the environment variable and override if specified
-        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        backend_by_env_var: str | None = envs.VLLM_ATTENTION_BACKEND
         if backend_by_env_var is not None:
             if backend_by_env_var.endswith("_VLLM_V1"):
                 logger.warning(
                     "The suffix '_VLLM_V1' in the environment variable "
                     "%s is no longer necessary as V0 backends have been "
                     "deprecated. Please remove this suffix from your "
-                    "environment variable setting.", STR_BACKEND_ENV_VAR)
-                backend_by_env_var = backend_by_env_var.removesuffix(
-                    "_VLLM_V1")
-            selected_backend = backend_name_to_enum(backend_by_env_var)
-            if selected_backend is None:
+                    "environment variable setting.",
+                    STR_BACKEND_ENV_VAR,
+                )
+                backend_by_env_var = backend_by_env_var.removesuffix("_VLLM_V1")
+            try:
+                selected_backend = AttentionBackendEnum[backend_by_env_var]
+            except KeyError as e:
                 raise ValueError(
-                    f"Invalid attention backend: '{backend_by_env_var}'. "
-                    f"Valid backends are: {list(_Backend.__members__.keys())}")
+                    f"Invalid attention backend: '{backend_by_env_var}'. Valid "
+                    f"backends are: {list(AttentionBackendEnum.__members__.keys())}"
+                ) from e
 
     # get device-specific attn_backend
-    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
-        use_mla, has_sink, use_sparse)
+    from vllm.platforms import current_platform
+
+    sig = inspect.signature(current_platform.get_attn_backend_cls)
+    if "use_v1" in sig.parameters:
+        logger.warning_once(
+            "use_v1 parameter for get_attn_backend_cls is deprecated and will "
+            "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please "
+            "remove it from your plugin code."
+        )
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            True,  # use_v1
+            use_mla,
+            has_sink,
+            use_sparse,
+            attn_type,
+        )
+    else:
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            use_mla,
+            has_sink,
+            use_sparse,
+            attn_type,
+        )
     if not attention_cls:
         raise ValueError(
-            f"Invalid attention backend for {current_platform.device_name}")
-    return resolve_obj_by_qualname(attention_cls)
+            f"Invalid attention backend for {current_platform.device_name}"
+        )
+    backend = resolve_obj_by_qualname(attention_cls)
+
+    # Adjust kv cache layout if the selected backend requires a specific one
+    required_layout = backend.get_required_kv_cache_layout()
+    if required_layout is not None:
+        from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+        set_kv_cache_layout(required_layout)
+        logger.info(
+            "Using %s KV cache layout for %s backend.",
+            required_layout,
+            backend.get_name(),
+        )
+
+    return backend
 
 
 @contextmanager
 def global_force_attn_backend_context_manager(
-        attn_backend: _Backend) -> Generator[None, None, None]:
-    '''
+    attn_backend: AttentionBackendEnum,
+) -> Generator[None, None, None]:
+    """
     Globally force a vLLM attention backend override within a
     context manager, reverting the global attention backend
     override to its prior state upon exiting the context
@@ -229,7 +214,7 @@ def global_force_attn_backend_context_manager(
     Returns:
 
     * Generator
-    '''
+    """
 
     # Save the current state of the global backend override (if any)
     original_value = get_global_forced_attn_backend()
@@ -243,3 +228,4 @@ def global_force_attn_backend_context_manager(
     finally:
         # Revert the original global backend override, if any
         global_force_attn_backend(original_value)
+        _cached_get_attn_backend.cache_clear()
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index dc0af7e28e3e25b5b9948b2c5eeb7b9a3698acd1..adb9b08a65735928f0df49ae13e48dd2a506dc1e 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 from vllm import envs
 from vllm.logger import init_logger
@@ -10,31 +9,37 @@ logger = init_logger(__name__)
 
 if current_platform.is_cuda():
     from vllm import _custom_ops as ops
+
     reshape_and_cache_flash = ops.reshape_and_cache_flash
-    from vllm.vllm_flash_attn import (flash_attn_varlen_func,
-                                      get_scheduler_metadata)
+    from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
 elif current_platform.is_xpu():
     from vllm._ipex_ops import ipex_ops as ops
+
     reshape_and_cache_flash = ops.reshape_and_cache_flash
     flash_attn_varlen_func = ops.flash_attn_varlen_func
     get_scheduler_metadata = ops.get_scheduler_metadata
 
 
-def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
+def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
     # import here to avoid circular dependencies
     from vllm.platforms import current_platform
+
     if current_platform.is_xpu():
         return 2
     try:
         from vllm.vllm_flash_attn.flash_attn_interface import (
-            fa_version_unsupported_reason, is_fa_version_supported)
+            fa_version_unsupported_reason,
+            is_fa_version_supported,
+        )
+
         device_capability = current_platform.get_device_capability()
 
         assert device_capability is not None
 
         # 1. default version depending on platform
-        fa_version = 3 if (device_capability.major == 9
-                           and is_fa_version_supported(3)) else 2
+        fa_version = (
+            3 if (device_capability.major == 9 and is_fa_version_supported(3)) else 2
+        )
 
         # 2. override if passed by environment
         if envs.VLLM_FLASH_ATTN_VERSION is not None:
@@ -45,17 +50,22 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
         if device_capability.major == 10 and fa_version == 3:
             logger.warning_once(
                 "Cannot use FA version 3 on Blackwell platform "
-                "defaulting to FA version 2.")
+                "defaulting to FA version 2."
+            )
             fa_version = 2
 
         if requires_alibi and fa_version == 3:
-            logger.warning_once("Cannot use FA version 3 with ALiBi, "
-                                "defaulting to FA version 2.")
+            logger.warning_once(
+                "Cannot use FA version 3 with ALiBi, defaulting to FA version 2."
+            )
             fa_version = 2
 
         if not is_fa_version_supported(fa_version):
-            logger.error("Cannot use FA version %d is not supported due to %s",
-                         fa_version, fa_version_unsupported_reason(fa_version))
+            logger.error(
+                "Cannot use FA version %d is not supported due to %s",
+                fa_version,
+                fa_version_unsupported_reason(fa_version),
+            )
 
         assert is_fa_version_supported(fa_version)
         return fa_version
@@ -64,18 +74,32 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
 
 
 def flash_attn_supports_fp8() -> bool:
-    return get_flash_attn_version() == 3 and \
-        current_platform.get_device_capability().major == 9
+    return (
+        get_flash_attn_version() == 3
+        and current_platform.get_device_capability().major == 9
+    )
+
+
+def flash_attn_supports_sinks() -> bool:
+    if current_platform.is_xpu():
+        return True
+    else:
+        return get_flash_attn_version() == 3
 
 
 def flash_attn_supports_mla():
     from vllm.platforms import current_platform
+
     if current_platform.is_cuda():
         try:
             from vllm.vllm_flash_attn.flash_attn_interface import (
-                is_fa_version_supported)
-            return is_fa_version_supported(3) \
+                is_fa_version_supported,
+            )
+
+            return (
+                is_fa_version_supported(3)
                 and current_platform.get_device_capability()[0] == 9
+            )
         except (ImportError, AssertionError):
             pass
     return False
diff --git a/vllm/attention/utils/kv_sharing_utils.py b/vllm/attention/utils/kv_sharing_utils.py
index b4ae8bdf4d76251a633f76a5d45df26ce238916d..93af5bf7e13fe920103dae9bc9f81816805726da 100644
--- a/vllm/attention/utils/kv_sharing_utils.py
+++ b/vllm/attention/utils/kv_sharing_utils.py
@@ -1,13 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-def validate_kv_sharing_target(current_layer_name, target_layer_name,
-                               static_forward_context):
-    error_msg = (f"Specified KV sharing target layer for {current_layer_name} "
-                 f"is not valid: target layer {target_layer_name} ")
+def validate_kv_sharing_target(
+    current_layer_name, target_layer_name, static_forward_context
+):
+    error_msg = (
+        f"Specified KV sharing target layer for {current_layer_name} "
+        f"is not valid: target layer {target_layer_name} "
+    )
 
     if current_layer_name == target_layer_name:
-        raise ValueError(error_msg +
-                         "cannot be the same as the current layer.")
+        raise ValueError(error_msg + "cannot be the same as the current layer.")
 
     if target_layer_name not in static_forward_context:
         from vllm.model_executor.models.utils import extract_layer_index
@@ -20,14 +22,12 @@ def validate_kv_sharing_target(current_layer_name, target_layer_name,
         if current_layer_idx <= target_layer_idx:
             raise ValueError(error_msg + "must come before the current layer.")
         else:
-            raise ValueError(error_msg +
-                             "is not a valid Attention layer in the model.")
+            raise ValueError(error_msg + "is not a valid Attention layer in the model.")
 
     # Currently KV sharing is only supported between layers of the same type
-    target_layer_attn_type = static_forward_context[
-        target_layer_name].attn_type
+    target_layer_attn_type = static_forward_context[target_layer_name].attn_type
     expected = static_forward_context[current_layer_name].attn_type
     if target_layer_attn_type != expected:
         raise ValueError(
-            error_msg +
-            f"must be the same type as the current layer ({expected}).")
+            error_msg + f"must be the same type as the current layer ({expected})."
+        )
diff --git a/vllm/attention/utils/kv_transfer_utils.py b/vllm/attention/utils/kv_transfer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..210be55feb2faf04436bfc85eebe61afa9fa4767
--- /dev/null
+++ b/vllm/attention/utils/kv_transfer_utils.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import inspect
+from collections.abc import Callable
+from functools import wraps
+
+from vllm.distributed.kv_transfer import (
+    get_kv_transfer_group,
+    has_kv_transfer_group,
+    is_v1_kv_transfer_group,
+)
+
+
+def maybe_transfer_kv_layer(func: Callable) -> Callable:
+    """Decorator that handles KV layer transfer prior and after execution of
+    an attention layer, if enabled. Otherwise, the wrapper is a no-op.
+
+    On entry: waits for the KV layer from the connector.
+    On exit: saves the KV layer to the connector.
+    """
+    # Import at runtime to avoid circular dependency
+    from vllm.attention.layer import get_attention_context
+
+    # Inspect the signature ONCE when the decorator is applied.
+    sig = inspect.signature(func)
+    param_names = list(sig.parameters.keys())
+
+    # Find the index of 'layer_name' parameter.
+    try:
+        layer_name_index = param_names.index("layer_name")
+    except ValueError as e:
+        raise TypeError(
+            f"Function {func.__name__} must have a 'layer_name' parameter"
+        ) from e
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
+            return func(*args, **kwargs)
+
+        layer_name: str = args[layer_name_index]
+
+        # Extract attention context (layer-specific metadata, layer, and kv_cache)
+        attn_metadata, attn_layer, kv_cache = get_attention_context(layer_name)
+        connector = get_kv_transfer_group()
+        if attn_metadata is None or not connector.has_connector_metadata():
+            return func(*args, **kwargs)
+
+        # Wait for KV layer on entry
+        connector.wait_for_layer_load(layer_name)
+
+        # Execute the function
+        result = func(*args, **kwargs)
+
+        # Save KV cache layer on exit
+        connector.save_kv_layer(layer_name, kv_cache, attn_metadata)
+
+        return result
+
+    return wrapper
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index 01124872e98c06e501659e8a44fbeabb30d83d6d..fcd2d1f0e01ab8c15c56b0a0f22384577d6d34d4 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 from vllm.logprobs import Logprob
 from vllm.lora.request import LoRARequest
@@ -18,16 +18,17 @@ class BeamSearchSequence:
     The text field is optional and will only be filled when the sequence is
     about to be returned to the user.
     """
+
     # The tokens include the prompt.
     tokens: list[int]
     logprobs: list[dict[int, Logprob]]
-    lora_request: Optional[LoRARequest] = None
+    lora_request: LoRARequest | None = None
     cum_logprob: float = 0.0
-    text: Optional[str] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Union[int, str, None] = None
+    text: str | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
     multi_modal_data: Optional["MultiModalDataDict"] = None
-    mm_processor_kwargs: Optional[dict[str, Any]] = None
+    mm_processor_kwargs: dict[str, Any] | None = None
 
 
 @dataclass
@@ -36,16 +37,16 @@ class BeamSearchOutput:
     It contains the list of the best beam search sequences.
     The length of the list is equal to the beam width.
     """
+
     sequences: list[BeamSearchSequence]
 
 
 class BeamSearchInstance:
-
     def __init__(
         self,
         prompt_tokens: list[int],
-        lora_request: Optional[LoRARequest] = None,
-        logprobs: Optional[list[dict[int, Logprob]]] = None,
+        lora_request: LoRARequest | None = None,
+        logprobs: list[dict[int, Logprob]] | None = None,
         **kwargs,
     ):
         self.beams: list[BeamSearchSequence] = [
@@ -79,9 +80,9 @@ def get_beam_search_score(
 
 
 def create_sort_beams_key_function(eos_token_id: int, length_penalty: float):
-
     def sort_beams_key(x: BeamSearchSequence) -> float:
-        return get_beam_search_score(x.tokens, x.cum_logprob, eos_token_id,
-                                     length_penalty)
+        return get_beam_search_score(
+            x.tokens, x.cum_logprob, eos_token_id, length_penalty
+        )
 
     return sort_beams_key
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 68a937d5750ecb00894fb06f0e5318791ed88323..5411ecbb27b276282b040a6865fa589fb74f7a4b 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -11,6 +11,7 @@ generation. Supported dataset types include:
   - HuggingFace
   - VisionArena
 """
+
 import argparse
 import ast
 import base64
@@ -20,13 +21,14 @@ import logging
 import math
 import random
 from abc import ABC, abstractmethod
-from collections.abc import Iterator, Mapping
+from collections.abc import Callable, Iterator, Mapping
 from contextlib import suppress
 from copy import deepcopy
 from dataclasses import dataclass
 from functools import cache
 from io import BytesIO
-from typing import Any, Callable, Optional, Union, cast
+from tempfile import NamedTemporaryFile
+from typing import Any, cast
 
 import numpy as np
 from PIL import Image
@@ -38,7 +40,7 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 try:
     from datasets import load_dataset
@@ -57,7 +59,7 @@ except ImportError:
     librosa = PlaceholderModule("librosa")
 
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
@@ -74,14 +76,12 @@ class SampleRequest:
     Represents a single inference request for benchmarking.
     """
 
-    prompt: Union[str, list[str]]
+    prompt: str | list[str]
     prompt_len: int
     expected_output_len: int
-    multi_modal_data: Optional[
-        Union[MultiModalDataDict, dict, list[dict]]
-    ] = None
-    lora_request: Optional[LoRARequest] = None
-    request_id: Optional[str] = None
+    multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None
+    lora_request: LoRARequest | None = None
+    request_id: str | None = None
 
 
 # -----------------------------------------------------------------------------
@@ -95,8 +95,10 @@ class BenchmarkDataset(ABC):
 
     def __init__(
         self,
-        dataset_path: Optional[str] = None,
+        dataset_path: str | None = None,
         random_seed: int = DEFAULT_SEED,
+        disable_shuffle: bool = False,
+        **kwargs,
     ) -> None:
         """
         Initialize the BenchmarkDataset with an optional dataset path and random
@@ -111,16 +113,15 @@ class BenchmarkDataset(ABC):
         self.dataset_path = dataset_path
         # Set the random seed, ensuring that a None value is replaced with the
         # default seed.
-        self.random_seed = (random_seed
-                            if random_seed is not None else self.DEFAULT_SEED)
+        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
+        self.disable_shuffle = disable_shuffle
         self.data = None
 
     def apply_multimodal_chat_transformation(
-            self,
-            prompt: str,
-            mm_content: Optional[
-                        Union[MultiModalDataDict, dict, list[dict]]
-                             ] = None) -> list[dict]:
+        self,
+        prompt: str,
+        mm_content: MultiModalDataDict | dict | list[dict] | None = None,
+    ) -> list[dict]:
         """
         Transform a prompt and optional multimodal content into a chat format.
         This method is used for chat models that expect a specific conversation
@@ -134,8 +135,8 @@ class BenchmarkDataset(ABC):
                 content.append(mm_content)
             else:
                 raise TypeError(
-                    "Could not process multimodal content of type: " +
-                    f"{type(mm_content)}"
+                    "Could not process multimodal content of type: "
+                    + f"{type(mm_content)}"
                 )
         return [{"role": "user", "content": content}]
 
@@ -150,14 +151,13 @@ class BenchmarkDataset(ABC):
             NotImplementedError: If a subclass does not implement this method.
         """
         # TODO (jenniferzhao): add support for downloading data
-        raise NotImplementedError(
-            "load_data must be implemented in subclasses.")
+        raise NotImplementedError("load_data must be implemented in subclasses.")
 
     def get_random_lora_request(
         self,
-        max_loras: Optional[int] = None,
-        lora_path: Optional[str] = None,
-    ) -> Optional[LoRARequest]:
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+    ) -> LoRARequest | None:
         """
         Optionally select a random LoRA request.
 
@@ -187,10 +187,13 @@ class BenchmarkDataset(ABC):
         return lora_request
 
     @abstractmethod
-    def sample(self, tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               request_id_prefix: str = "",
-               no_oversample: bool = False) -> list[SampleRequest]:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+    ) -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.
 
@@ -229,21 +232,27 @@ class BenchmarkDataset(ABC):
 
         """
         if no_oversample:
-            logger.info("Skipping oversampling. " \
-            "Total samples: %d.", len(requests))
+            logger.info("Skipping oversampling. Total samples: %d.", len(requests))
             return
 
         if len(requests) < num_requests:
             random.seed(self.random_seed)
-            additional = deepcopy(
-                random.choices(requests, k=num_requests - len(requests))
-            )
-            for i in range(len(additional)):
-                req = additional[i]
+            needed = num_requests - len(requests)
+            additional = []
+            for i in range(needed):
+                req = deepcopy(random.choice(requests))
                 req.request_id = request_id_prefix + str(len(requests) + i)
+                additional.append(req)
             requests.extend(additional)
-            logger.info("Oversampled requests to reach %d total samples.",
-                        num_requests)
+            logger.info("Oversampled requests to reach %d total samples.", num_requests)
+
+        ids = [req.request_id for req in requests]
+        if len(ids) != len(set(ids)):
+            raise ValueError(
+                "Duplicate request_id found in the sampled "
+                "requests. Please ensure that each request_id "
+                "is unique."
+            )
 
 
 # -----------------------------------------------------------------------------
@@ -268,14 +277,14 @@ def is_valid_sequence(
     """
     # Check for invalid conditions
     prompt_too_short = prompt_len < min_len
-    output_too_short = (not skip_min_output_len_check) and (output_len
-                                                            < min_len)
+    output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
     prompt_too_long = prompt_len > max_prompt_len
     combined_too_long = (prompt_len + output_len) > max_total_len
 
     # Return True if none of the invalid conditions are met
-    return not (prompt_too_short or output_too_short or prompt_too_long
-                or combined_too_long)
+    return not (
+        prompt_too_short or output_too_short or prompt_too_long or combined_too_long
+    )
 
 
 @cache
@@ -307,28 +316,30 @@ def process_image(image: Any) -> Mapping[str, Any]:
     Raises:
         ValueError: If the input is not a supported type.
     """
-    if isinstance(image, dict) and 'bytes' in image:
-        image = Image.open(BytesIO(image['bytes']))
+    if isinstance(image, dict) and "bytes" in image:
+        image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
         image = convert_image_mode(image, "RGB")
         with io.BytesIO() as image_data:
             image.save(image_data, format="JPEG")
-            image_base64 = base64.b64encode(
-                image_data.getvalue()).decode("utf-8")
+            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
         return {
             "type": "image_url",
-            "image_url": {
-                "url": f"data:image/jpeg;base64,{image_base64}"
-            },
+            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
         }
 
     if isinstance(image, str):
-        image_url = (image if image.startswith(
-            ("http://", "https://", "file://")) else f"file://{image}")
+        image_url = (
+            image
+            if image.startswith(("http://", "https://", "file://"))
+            else f"file://{image}"
+        )
         return {"type": "image_url", "image_url": {"url": image_url}}
 
-    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
-                     " or str or dictionary with raw image bytes.")
+    raise ValueError(
+        f"Invalid image input {image}. Must be a PIL.Image.Image"
+        " or str or dictionary with raw image bytes."
+    )
 
 
 def process_video(video: Any) -> Mapping[str, Any]:
@@ -347,25 +358,82 @@ def process_video(video: Any) -> Mapping[str, Any]:
     Raises:
         ValueError: If the input is not a supported type.
     """
-    if isinstance(video, dict) and 'bytes' in video:
-        video_bytes = video['bytes']
+    if isinstance(video, dict) and "bytes" in video:
+        video_bytes = video["bytes"]
         video_base64 = base64.b64encode(video_bytes).decode("utf-8")
         return {
             "type": "video_url",
-            "video_url": {
-                "url": f"data:video/mp4;base64,{video_base64}"
-            },
+            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
         }
 
     if isinstance(video, str):
-        video_url = (video if video.startswith(
-            ("http://", "https://", "file://")) else f"file://{video}")
+        video_url = (
+            video
+            if video.startswith(("http://", "https://", "file://"))
+            else f"file://{video}"
+        )
         return {"type": "video_url", "video_url": {"url": video_url}}
 
     raise ValueError(
         f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
     )
 
+
+def gen_prompt_decode_to_target_len(
+    tokenizer: PreTrainedTokenizerBase,
+    token_sequence: list[int],
+    target_token_len: int,
+    max_retry: int = 10,
+    add_special_tokens: bool = False,
+    rng: np.random.Generator | None = None,
+) -> tuple[str, list[int]]:
+    """
+    Ensure decoded-then-encoded prompt length matches the target token length.
+
+    This function decodes an initial token sequence to text and re-encodes it
+    , iteratively adjusting the token sequence length to match a target.
+    This is necessary because some tokenizers do not guarantee a 1:1 mapping
+    between consecutive tokens and the decoded-then-encoded sequence length.
+    For example, for GPT2Tokenizer:
+    [6880, 6881] -> ['Ġcalls', 'here'] ->
+    [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+
+    Returns a tuple of the final prompt string and the adjusted token sequence.
+    """
+    remain_num_try = max_retry
+    token_mismatch = 0
+    while True:
+        prompt = tokenizer.decode(token_sequence)
+        token_sequence = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
+        if remain_num_try <= 0:
+            if len(token_sequence) != target_token_len:
+                token_mismatch = len(token_sequence) - target_token_len
+            break
+
+        if len(token_sequence) == target_token_len:
+            break
+        elif len(token_sequence) < target_token_len:
+            if rng is not None:
+                extra_tokens = rng.integers(
+                    0,
+                    tokenizer.vocab_size,
+                    size=target_token_len - len(token_sequence),
+                ).tolist()
+            else:
+                extra_tokens = np.random.randint(
+                    0,
+                    tokenizer.vocab_size,
+                    size=target_token_len - len(token_sequence),
+                ).tolist()
+            token_sequence.extend(extra_tokens)
+        elif len(token_sequence) > target_token_len:
+            token_sequence = token_sequence[:target_token_len]
+
+        remain_num_try -= 1
+
+    return prompt, token_sequence, token_mismatch
+
+
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@@ -384,6 +452,7 @@ class RandomDataset(BenchmarkDataset):
     - Decode then re-encode/truncate to ensure prompt token counts match.
     - Uses numpy.default_rng seeded with random_seed for reproducible sampling.
     """
+
     # Default values copied from benchmark_serving.py for the random dataset.
     DEFAULT_PREFIX_LEN = 0
     DEFAULT_RANGE_RATIO = 0.0
@@ -410,18 +479,38 @@ class RandomDataset(BenchmarkDataset):
         batchsize: int = 1,
         **kwargs,
     ) -> list[SampleRequest]:
+        # validate total input tokens (prefix + sampled) is at least 1.
+        num_special = int(tokenizer.num_special_tokens_to_add())
+        real_input_len = max(0, int(input_len) - num_special)
+        min_sampled_input = math.floor(real_input_len * (1.0 - float(range_ratio)))
+        min_total_input = int(prefix_len) + min_sampled_input
+        if min_total_input < 1:
+            raise ValueError(
+                "--random-input-len is too small: with tokenizer special "
+                f"tokens {num_special} and --random-range-ratio {range_ratio}, "
+                "the minimum possible total input tokens (prefix + sampled) is "
+                f"{min_total_input}. Increase --random-input-len and/or "
+                "--random-prefix-len, or decrease --random-range-ratio so that "
+                "prefix_len + floor(max(0, random_input_len - num_special)) "
+                "* (1 - range_ratio) >= 1."
+            )
 
         input_lens, output_lens, offsets = self.get_sampling_params(
             num_requests, range_ratio, input_len, output_len, tokenizer
         )
 
-        # Generate prefix once
-        prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
         vocab_size = tokenizer.vocab_size
+        prohibited_tokens = tokenizer.all_special_ids
+        all_tokens = np.arange(vocab_size)
+        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
+
+        # Generate prefix once
+        prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)
 
         requests = []
+        token_mismatch_total = 0
         for i in range(num_requests):
-            prompt, total_input_len = self.generate_token_sequence(
+            prompt, total_input_len, token_mismatch = self.generate_token_sequence(  # noqa: E501
                 tokenizer=tokenizer,
                 prefix_token_ids=prefix_token_ids,
                 prefix_len=prefix_len,
@@ -429,7 +518,9 @@ class RandomDataset(BenchmarkDataset):
                 input_len=int(input_lens[i]),
                 offset=int(offsets[i]),
                 index=i,
+                allowed_tokens=allowed_tokens,
             )
+            token_mismatch_total += token_mismatch
             requests.append(
                 SampleRequest(
                     prompt=prompt,
@@ -453,17 +544,32 @@ class RandomDataset(BenchmarkDataset):
                     )
                 )
             requests = batch_requests
+
+        if token_mismatch_total != 0:
+            sign = "more" if token_mismatch_total > 0 else "fewer"
+            logger.warning(
+                "Across all generated prompts, there were %d %s tokens "
+                "than expected after decoding and re-encoding. This is "
+                "expected due to the imperfect nature of the sampling "
+                "procedure.",
+                abs(token_mismatch_total),
+                sign,
+            )
+
         return requests
 
     def get_prefix(
-        self, tokenizer: PreTrainedTokenizerBase, prefix_len: int
+        self,
+        allowed_tokens: np.ndarray,
+        prefix_len: int,
     ) -> list[int]:
         """
         Get the prefix for the dataset.
         """
         return (
-            self._rng.integers(
-                0, tokenizer.vocab_size, size=prefix_len).tolist()
+            allowed_tokens[
+                self._rng.integers(0, len(allowed_tokens), size=prefix_len)
+            ].tolist()
             if prefix_len > 0
             else []
         )
@@ -492,11 +598,11 @@ class RandomDataset(BenchmarkDataset):
         # Ensure the lower bound for output length is at least 1 to
         # prevent sampling 0 tokens.
         output_low = max(output_low, 1)
+        output_high = max(output_high, 1)
 
         if input_low > input_high:
             raise ValueError(
-                "Invalid input sampling interval: "
-                f"low={input_low} > high={input_high}"
+                f"Invalid input sampling interval: low={input_low} > high={input_high}"
             )
         if output_low > output_high:
             raise ValueError(
@@ -512,12 +618,9 @@ class RandomDataset(BenchmarkDataset):
             output_high,
         )
 
-        input_lens = self._rng.integers(input_low, input_high + 1,
-                                           size=num_requests)
-        output_lens = self._rng.integers(output_low, output_high + 1,
-                                            size=num_requests)
-        offsets = self._rng.integers(0, tokenizer.vocab_size,
-                                        size=num_requests)
+        input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests)
+        output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests)
+        offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests)
         return input_lens, output_lens, offsets
 
     def generate_token_sequence(
@@ -530,7 +633,8 @@ class RandomDataset(BenchmarkDataset):
         input_len: int,
         offset: int,
         index: int,
-    ) -> tuple[str, int]:
+        allowed_tokens: np.ndarray,
+    ) -> tuple[str, int, int]:
         """
         Returns (prompt, total_input_len).
 
@@ -543,34 +647,146 @@ class RandomDataset(BenchmarkDataset):
         To avoid uncontrolled change of the prompt length,
         the encoded sequence is truncated before being decoded again.
         """
-        # Build the inner sequence by sampling sequentially from the vocab
-        inner_seq = ((offset + index + np.arange(input_len))
-                    % vocab_size).tolist()
+        # Build the inner sequence by sampling
+        # sequentially from the allowed tokens
+        inner_seq = allowed_tokens[
+            (offset + index + np.arange(input_len)) % len(allowed_tokens)
+        ].tolist()
         token_sequence = prefix_token_ids + inner_seq
 
         # Decode, then re-encode and truncate to preserve token count invariants
-        prompt = tokenizer.decode(token_sequence)
         total_input_len = prefix_len + int(input_len)
+        prompt, adjusted_token_sequence, token_mismatch = (
+            gen_prompt_decode_to_target_len(
+                tokenizer=tokenizer,
+                token_sequence=token_sequence,
+                target_token_len=total_input_len,
+                add_special_tokens=False,
+                rng=self._rng,
+            )
+        )
+        total_input_len = len(adjusted_token_sequence)
+        return prompt, total_input_len, token_mismatch
+
+
+# -----------------------------------------------------------------------------
+# Random Dataset Implementation (Synthetic Data)
+# -----------------------------------------------------------------------------
 
-        re_encoded_sequence = tokenizer.encode(
-            prompt, add_special_tokens=False)[:total_input_len]
-        prompt = tokenizer.decode(re_encoded_sequence)
-        total_input_len = len(re_encoded_sequence)
 
-        return prompt, total_input_len
+class RandomDatasetForReranking(RandomDataset):
+    """
+    Random dataset specialized for the needs of scoring:
+    - Batches of inputs
+    - Inputs composed of pairs
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        request_id_prefix: str = "",
+        range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
+        input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
+        batchsize: int = 1,
+        is_reranker: bool = True,
+        **kwargs,
+    ) -> list[SampleRequest]:
+        n_sep_tokens = int(is_reranker)
+
+        query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len
+
+        query_lens, _, query_offsets = self.get_sampling_params(
+            1, range_ratio, query_len_param, 0, tokenizer
+        )
+
+        query_len = int(query_lens[0])
+
+        if not is_reranker:
+            assert num_requests > 1 and batchsize > 1
+            num_requests -= 1
+            batchsize -= 1
+            doc_len_param = input_len
+        else:
+            doc_len_param = input_len - query_len - n_sep_tokens
+
+        doc_lens, _, doc_offsets = self.get_sampling_params(
+            num_requests, range_ratio, doc_len_param, 0, tokenizer
+        )
+        vocab_size = tokenizer.vocab_size
+
+        query_prompt, query_input_len, token_mismatch_total = (
+            self.generate_token_sequence(
+                tokenizer=tokenizer,
+                prefix_token_ids=[],
+                prefix_len=0,
+                vocab_size=vocab_size,
+                input_len=query_len,
+                offset=int(query_offsets[0]),
+                index=0,
+            )
+        )
+
+        requests = []
+        for i in range(num_requests):
+            prompt, total_input_len, token_mismatch = self.generate_token_sequence(  # noqa: E501
+                tokenizer=tokenizer,
+                prefix_token_ids=[],
+                prefix_len=0,
+                vocab_size=vocab_size,
+                input_len=int(doc_lens[i]),
+                offset=int(doc_offsets[i]),
+                index=i + 1,
+            )
+            token_mismatch_total += token_mismatch
+            requests.append((prompt, total_input_len))
+
+        batch_requests = []
+        # Create batched requests
+        for i in range(0, num_requests, batchsize):
+            batch = requests[i : i + batchsize]
+            query_contrib = (
+                (query_input_len + n_sep_tokens) * len(batch)
+                if is_reranker
+                else query_input_len
+            )
+            batch_requests.append(
+                SampleRequest(
+                    prompt=[query_prompt] + [req[0] for req in batch],
+                    prompt_len=query_contrib + sum(req[1] for req in batch),
+                    expected_output_len=0,
+                    request_id=request_id_prefix + str(i // batchsize),
+                )
+            )
+
+        if token_mismatch_total != 0:
+            logger.warning(
+                "Across all generated prompts, there were %d %s tokens "
+                "than expected after decoding and re-encoding. This is "
+                "expected due to the imperfect nature of the sampling "
+                "procedure.",
+                abs(token_mismatch_total),
+                "more" if token_mismatch_total > 0 else "fewer",
+            )
+
+        return batch_requests
 
 
 # -----------------------------------------------------------------------------
 # MultiModalDataset Implementation
 # -----------------------------------------------------------------------------
 
+
 class RandomMultiModalDataset(RandomDataset):
     """
     Synthetic multimodal dataset (text + images) that extends RandomDataset.
 
     Status:
     - Images: supported via synthetic RGB data.
-    - Video: not yet supported (TODO: implement video generation method).
+    - Video: supported via synthetic RGB data.
     - Audio: not yet supported.
 
     Sampling overview:
@@ -580,7 +796,7 @@ class RandomMultiModalDataset(RandomDataset):
        The maximum is further clamped to the sum of per-modality limits.
     2) Each item’s modality and shape is sampled from `bucket_config`, a dict
        mapping (height, width, num_frames) → probability. We treat
-       `num_frames`=1 as image and and `num_frames` > 1 as video.
+       `num_frames`=1 as image and `num_frames` > 1 as video.
        Entries with zero probability are removed and the rest are renormalized
        to sum to 1.
     3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
@@ -595,8 +811,7 @@ class RandomMultiModalDataset(RandomDataset):
     """
 
     IS_MULTIMODAL = True
-    # NOTE: video sampling is WIP. Setting it to 0.
-    DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 0}
+    DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 1}
 
     DEFAULT_BASE_ITEMS_PER_REQUEST = 1
     DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0
@@ -610,7 +825,6 @@ class RandomMultiModalDataset(RandomDataset):
     def __init__(self, **kwargs) -> None:
         super().__init__(**kwargs)
 
-
     def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
         """Generate synthetic PIL image with random RGB values.
 
@@ -627,14 +841,49 @@ class RandomMultiModalDataset(RandomDataset):
         )
         return Image.fromarray(random_pixels)
 
-    def generate_synthetic_video(self, width: int,
-                                    height: int,
-                                    num_frames: int) -> Any:
+    def generate_synthetic_video(
+        self, width: int, height: int, num_frames: int
+    ) -> dict:
         """Generate synthetic video with random values.
 
-        TODO: Finish this method.
+        Creates a video with random pixel values, encodes it to MP4 format,
+        and returns the content as bytes.
         """
-        raise NotImplementedError("Video sampling is WIP.")
+        import cv2
+
+        random_pixels = self._rng.integers(
+            0,
+            256,
+            (num_frames, height, width, 3),
+            dtype=np.uint8,
+        )
+
+        # Create a temporary video file in memory
+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+        fps = 30  # frames per second
+
+        with NamedTemporaryFile(suffix=".mp4", delete_on_close=False) as temp_file:
+            temp_path = temp_file.name
+
+            # Create video writer
+            video_writer = cv2.VideoWriter(
+                temp_path, fourcc=fourcc, fps=fps, frameSize=(width, height)
+            )
+
+            if not video_writer.isOpened():
+                raise RuntimeError("Failed to create video writer")
+
+            for frame in random_pixels:
+                video_writer.write(frame)
+
+            video_writer.release()
+            temp_file.close()
+
+            # Read the video file content
+            with open(temp_path, "rb") as f:
+                video_content = f.read()
+
+            return {"bytes": video_content}
 
     def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
         """Map the configuration to the modality."""
@@ -645,8 +894,9 @@ class RandomMultiModalDataset(RandomDataset):
         else:
             raise ValueError(f"Invalid multimodal item configuration: {config}")
 
-    def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int],
-                                float]) -> dict[tuple[int, int, int], float]:
+    def normalize_bucket_config(
+        self, bucket_config: dict[tuple[int, int, int], float]
+    ) -> dict[tuple[int, int, int], float]:
         """
         Remove zero probability entries
         and normalize the bucket config to sum to 1.
@@ -658,16 +908,17 @@ class RandomMultiModalDataset(RandomDataset):
         bucket_config = {k: v for k, v in bucket_config.items() if v > 0}
         # if bucket config is empty, raise error
         if not bucket_config:
-            raise ValueError("Got invalid bucket config. "
-                             "Bucket config values must be non-zero.")
+            raise ValueError(
+                "Got invalid bucket config. Bucket config values must be non-zero."
+            )
         # Normalize the remaining bucket config to sum to 1
         total = sum(bucket_config.values())
         return {k: v / total for k, v in bucket_config.items()}
 
-
-    def generate_mm_item(self,
-                         mm_item_config: tuple[int, int, int],
-                         ) -> Mapping[str, Any]:
+    def generate_mm_item(
+        self,
+        mm_item_config: tuple[int, int, int],
+    ) -> Mapping[str, Any]:
         """
         Create synthetic images and videos and
         apply process_image/process_video respectively.
@@ -676,18 +927,17 @@ class RandomMultiModalDataset(RandomDataset):
         """
 
         if self.map_config_to_modality(mm_item_config) == "image":
-            return process_image(self.generate_synthetic_image(
-                                                            mm_item_config[1],
-                                                            mm_item_config[0]))
+            return process_image(
+                self.generate_synthetic_image(mm_item_config[1], mm_item_config[0])
+            )
         elif self.map_config_to_modality(mm_item_config) == "video":
-            return process_video(self.generate_synthetic_video(
-                                                            mm_item_config[1],
-                                                            mm_item_config[0],
-                                                            mm_item_config[2]))
+            return process_video(
+                self.generate_synthetic_video(
+                    mm_item_config[1], mm_item_config[0], mm_item_config[2]
+                )
+            )
         else:
-            raise ValueError(f"Invalid multimodal item configuration: "
-                             f"{mm_item_config}")
-
+            raise ValueError(f"Invalid multimodal item configuration: {mm_item_config}")
 
     def get_mm_item_sampling_params(
         self,
@@ -708,49 +958,53 @@ class RandomMultiModalDataset(RandomDataset):
             # get modality from bucket config
             modality = self.map_config_to_modality(k)
             if modality not in limit_mm_per_prompt:
-                raise ValueError(f"Modality {modality} is not in "
-                                 f"limit_mm_per_prompt: "
-                                 f"{limit_mm_per_prompt.keys()}")
+                raise ValueError(
+                    f"Modality {modality} is not in "
+                    f"limit_mm_per_prompt: "
+                    f"{limit_mm_per_prompt.keys()}"
+                )
 
         # Remove zero probability entries
         # and normalize bucket config to sum to 1
         bucket_config = self.normalize_bucket_config(bucket_config)
         logger.info(
-            "Normalized bucket config: %s", bucket_config,
+            "Normalized bucket config: %s",
+            bucket_config,
         )
         # Only consider limit per prompt for modalities in bucket config
-        allowed_modalities = {self.map_config_to_modality(cfg)
-                              for cfg in bucket_config}
+        allowed_modalities = {self.map_config_to_modality(cfg) for cfg in bucket_config}
         limit_mm_per_prompt = {
-            k: v for k, v in limit_mm_per_prompt.items()
-            if k in allowed_modalities}
+            k: v for k, v in limit_mm_per_prompt.items() if k in allowed_modalities
+        }
         if not limit_mm_per_prompt:
-            raise ValueError("No valid limits for modalities present in "
-                             "bucket_config.")
+            raise ValueError("No valid limits for modalities present in bucket_config.")
 
         logger.info(
-            "Updated mm-limit-per-prompt: %s", limit_mm_per_prompt,
+            "Updated mm-limit-per-prompt: %s",
+            limit_mm_per_prompt,
         )
 
         # Get max and min num mm items and ensure
         # it is at most the sum of limit_mm_per_prompt for all modalities
         max_num_mm_items = min(
             sum(limit_mm_per_prompt.values()),
-            math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio))
+            math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio)),
         )
         # Ensure min num mm items is at least 0
         min_num_mm_items = max(
-            0,
-            math.floor(base_items_per_request * (1 - num_mm_items_range_ratio))
+            0, math.floor(base_items_per_request * (1 - num_mm_items_range_ratio))
         )
         # Raise error if min num mm items is greater than max num mm items
         if min_num_mm_items > max_num_mm_items:
-            raise ValueError(f"Min num mm items is greater than max mm items: "
-                             f"{min_num_mm_items} > {max_num_mm_items}")
+            raise ValueError(
+                f"Min num mm items is greater than max mm items: "
+                f"{min_num_mm_items} > {max_num_mm_items}"
+            )
 
         logger.info(
             "Sampling number of multimodal items from [%s, %s]",
-            min_num_mm_items, max_num_mm_items,
+            min_num_mm_items,
+            max_num_mm_items,
         )
 
         return (
@@ -766,7 +1020,7 @@ class RandomMultiModalDataset(RandomDataset):
         max_num_mm_items: int,
         bucket_config: dict[tuple[int, int, int], float],
         limit_mm_per_prompt: dict[str, int],
-    ) -> Iterator[tuple[int,int, int]]:
+    ) -> Iterator[tuple[int, int, int]]:
         """
         Iterator over the multimodal items for each request
         whose size is between min_num_mm_items and max_num_mm_items.
@@ -790,22 +1044,20 @@ class RandomMultiModalDataset(RandomDataset):
         if request_num_mm_items == 0:
             return
         # Initialize modality counters
-        modality_counter = {self.map_config_to_modality(k): 0
-                            for k in bucket_config}
+        modality_counter = {self.map_config_to_modality(k): 0 for k in bucket_config}
         # Copy the bucket config to avoid modifying the original
         bucket_config_copy = bucket_config.copy()
         # Loop over the number of multimodal items to sample
         while sum(modality_counter.values()) < request_num_mm_items:
             # Sample a multimodal item config
-            mm_item_config = self._rng.choice(list(bucket_config_copy.keys()),
-                                                p=list(bucket_config_copy.values()))
+            mm_item_config = self._rng.choice(
+                list(bucket_config_copy.keys()), p=list(bucket_config_copy.values())
+            )
             modality = self.map_config_to_modality(mm_item_config)
             # Check that modality count is less than limit per prompt
             if modality_counter[modality] < limit_mm_per_prompt[modality]:
                 modality_counter[modality] += 1
-                yield (
-                    mm_item_config
-                )
+                yield (mm_item_config)
             else:
                 # If the counter is greater than the limit per prompt
                 # set all multimodal items of this modality to 0
@@ -816,14 +1068,12 @@ class RandomMultiModalDataset(RandomDataset):
                 # This should not happen as request_num_mm_items is at most
                 # the sum of limit_mm_per_prompt for all modalities
                 if all(v == 0 for v in bucket_config_copy.values()):
-                    logger.warning("Exhausted all multimodal items "
-                                   "of modality %s",
-                                   modality)
+                    logger.warning(
+                        "Exhausted all multimodal items of modality %s", modality
+                    )
                     break
                 # Renormalize the bucket config
-                bucket_config_copy = self.normalize_bucket_config(
-                                        bucket_config_copy)
-
+                bucket_config_copy = self.normalize_bucket_config(bucket_config_copy)
 
     def sample(
         self,
@@ -838,19 +1088,12 @@ class RandomMultiModalDataset(RandomDataset):
         limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
         base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
         num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
-        bucket_config: dict[tuple[int, int, int], float] =
-                                        DEFAULT_MM_ITEM_BUCKET_CONFIG,
+        bucket_config: dict[
+            tuple[int, int, int], float
+        ] = DEFAULT_MM_ITEM_BUCKET_CONFIG,
         enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
         **kwargs,
     ) -> list[SampleRequest]:
-
-        # NOTE: Video sampling is WIP. Raise error if video is in bucket config
-        # and probability is non-zero.
-        if any(self.map_config_to_modality(cfg) == "video" and p > 0
-                for cfg, p in bucket_config.items()):
-            raise NotImplementedError("Video sampling not implemented; "
-                                      "set its probability to 0.")
-
         # Get the sampling parameters for the dataset
         input_lens, output_lens, offsets = self.get_sampling_params(
             num_requests, range_ratio, input_len, output_len, tokenizer
@@ -868,13 +1111,29 @@ class RandomMultiModalDataset(RandomDataset):
             bucket_config,
         )
 
-        # Generate prefix once
-        prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
         vocab_size = tokenizer.vocab_size
+        # Can't use tokenizer.all_special_ids since
+        # it returns ONLY ids from special_tokens_map.json
+        # We want to exclude placeholder tokens and all
+        # tokens that indicate start/end of image as it
+        # may break prompt replacement logic.
+        prohibited_tokens = list(
+            tok_id
+            for tok_id, token in tokenizer.added_tokens_decoder.items()
+            if token.special
+        )
+        all_tokens = np.arange(vocab_size)
+        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
+        logger.debug(
+            "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size
+        )
+        # Generate prefix once
+        prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)
         # Add synthetic multimodal items to each request
         mm_requests = []
+        token_mismatch_total = 0
         for i in range(num_requests):
-            prompt, total_input_len = self.generate_token_sequence(
+            prompt, total_input_len, token_mismatch = self.generate_token_sequence(  # noqa: E501
                 tokenizer=tokenizer,
                 prefix_token_ids=prefix_token_ids,
                 prefix_len=prefix_len,
@@ -882,7 +1141,9 @@ class RandomMultiModalDataset(RandomDataset):
                 input_len=int(input_lens[i]),
                 offset=int(offsets[i]),
                 index=i,
+                allowed_tokens=allowed_tokens,
             )
+            token_mismatch_total += token_mismatch
             # Get multimodal item iterator for a given request
             mm_item_iterator = self.get_mm_item_iterator(
                 min_num_mm_items,
@@ -891,17 +1152,21 @@ class RandomMultiModalDataset(RandomDataset):
                 limit_mm_per_prompt,
             )
 
-            mm_content = cast(list[dict[str, Any]], [
-                self.generate_mm_item(mm_item_config)
-                for mm_item_config in mm_item_iterator
-            ])
+            mm_content = cast(
+                list[dict[str, Any]],
+                [
+                    self.generate_mm_item(mm_item_config)
+                    for mm_item_config in mm_item_iterator
+                ],
+            )
 
             if enable_multimodal_chat:
                 # NOTE: For now this option is only provided for completeness
                 # given that the serve.py benchmark currently does not use it.
                 mm_chat_prompt: Any = prompt
                 mm_chat_prompt = self.apply_multimodal_chat_transformation(
-                    prompt, mm_content)
+                    prompt, mm_content
+                )
                 sample_request = SampleRequest(
                     prompt=mm_chat_prompt,
                     prompt_len=total_input_len,
@@ -918,8 +1183,21 @@ class RandomMultiModalDataset(RandomDataset):
                     request_id=request_id_prefix + str(i),
                 )
             mm_requests.append(sample_request)
+
+        if token_mismatch_total != 0:
+            sign = "more" if token_mismatch_total > 0 else "fewer"
+            logger.warning(
+                "Across all generated prompts, there were %d %s tokens "
+                "than expected after decoding and re-encoding. This is "
+                "expected due to the imperfect nature of the sampling "
+                "procedure.",
+                abs(token_mismatch_total),
+                sign,
+            )
+
         return mm_requests
 
+
 # -----------------------------------------------------------------------------
 # ShareGPT Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -943,19 +1221,21 @@ class ShareGPTDataset(BenchmarkDataset):
             self.data = json.load(f)
         # Filter entries with at least two conversation turns.
         self.data = [
-            entry for entry in self.data
+            entry
+            for entry in self.data
             if "conversations" in entry and len(entry["conversations"]) >= 2
         ]
         random.seed(self.random_seed)
-        random.shuffle(self.data)
+        if not getattr(self, "disable_shuffle", False):
+            random.shuffle(self.data)
 
     def sample(
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        lora_path: Optional[str] = None,
-        max_loras: Optional[int] = None,
-        output_len: Optional[int] = None,
+        lora_path: str | None = None,
+        max_loras: int | None = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -972,16 +1252,17 @@ class ShareGPTDataset(BenchmarkDataset):
             )
 
             lora_request = self.get_random_lora_request(
-                max_loras=max_loras, lora_path=lora_path)
+                max_loras=max_loras, lora_path=lora_path
+            )
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
             prompt_len = len(prompt_ids)
-            new_output_len = (len(completion_ids)
-                              if output_len is None else output_len)
-            if not is_valid_sequence(prompt_len,
-                                     new_output_len,
-                                     skip_min_output_len_check=output_len
-                                     is not None):
+            new_output_len = len(completion_ids) if output_len is None else output_len
+            if not is_valid_sequence(
+                prompt_len,
+                new_output_len,
+                skip_min_output_len_check=output_len is not None,
+            ):
                 continue
             if image_path := entry.get("image"):
                 mm_content = process_image(image_path)
@@ -990,8 +1271,7 @@ class ShareGPTDataset(BenchmarkDataset):
             else:
                 mm_content = None
             if enable_multimodal_chat:
-                prompt = self.apply_multimodal_chat_transformation(
-                    prompt, mm_content)
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
             samples.append(
                 SampleRequest(
                     prompt=prompt,
@@ -1000,23 +1280,24 @@ class ShareGPTDataset(BenchmarkDataset):
                     lora_request=lora_request,
                     multi_modal_data=mm_content,
                     request_id=request_id_prefix + str(ind),
-                ))
+                )
+            )
             ind += 1
-        self.maybe_oversample_requests(samples,
-                                       num_requests,
-                                       request_id_prefix,
-                                       no_oversample)
+        self.maybe_oversample_requests(
+            samples, num_requests, request_id_prefix, no_oversample
+        )
         return samples
 
 
 class _ValidateDatasetArgs(argparse.Action):
     """Argparse action to validate dataset name and path compatibility."""
+
     def __call__(self, parser, namespace, values, option_string=None):
         setattr(namespace, self.dest, values)
 
         # Get current values of both dataset_name and dataset_path
-        dataset_name = getattr(namespace, 'dataset_name', 'random')
-        dataset_path = getattr(namespace, 'dataset_path', None)
+        dataset_name = getattr(namespace, "dataset_name", "random")
+        dataset_path = getattr(namespace, "dataset_path", None)
 
         # Validate the combination
         if dataset_name == "random" and dataset_path is not None:
@@ -1042,8 +1323,16 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         default="random",
         action=_ValidateDatasetArgs,
         choices=[
-            "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
-            "custom", "prefix_repetition", "spec_bench"
+            "sharegpt",
+            "burstgpt",
+            "sonnet",
+            "random",
+            "random-mm",
+            "random-rerank",
+            "hf",
+            "custom",
+            "prefix_repetition",
+            "spec_bench",
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -1063,8 +1352,17 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "--no-oversample",
         action="store_true",
-        help="Do not oversample if the dataset has " \
-        "fewer samples than num-prompts.",
+        help="Do not oversample if the dataset has fewer samples than num-prompts.",
+    )
+    parser.add_argument(
+        "--skip-chat-template",
+        action="store_true",
+        help="Skip applying chat template to prompt for datasets that support it.",
+    )
+    parser.add_argument(
+        "--disable-shuffle",
+        action="store_true",
+        help="Disable shuffling of dataset samples for deterministic ordering.",
     )
 
     # group for dataset specific arguments
@@ -1073,14 +1371,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--custom-output-len",
         type=int,
         default=256,
-        help=
-        "Number of output tokens per request, used only for custom dataset.",
-    )
-    custom_group.add_argument(
-        "--custom-skip-chat-template",
-        action="store_true",
-        help=
-        "Skip applying chat template to prompt, used only for custom dataset.",
+        help="Number of output tokens per request, used only for custom dataset.",
     )
 
     spec_bench_group = parser.add_argument_group("spec bench dataset options")
@@ -1088,15 +1379,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--spec-bench-output-len",
         type=int,
         default=256,
-        help=
-        "Num of output tokens per request, used only for spec bench dataset.",
+        help="Num of output tokens per request, used only for spec bench dataset.",
     )
     spec_bench_group.add_argument(
         "--spec-bench-category",
         type=str,
         default=None,
-        help=
-        "Category for spec bench dataset. If None, use all categories.",
+        help="Category for spec bench dataset. If None, use all categories.",
     )
 
     sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -1104,22 +1393,19 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--sonnet-input-len",
         type=int,
         default=550,
-        help=
-        "Number of input tokens per request, used only for sonnet dataset.",
+        help="Number of input tokens per request, used only for sonnet dataset.",
     )
     sonnet_group.add_argument(
         "--sonnet-output-len",
         type=int,
         default=150,
-        help=
-        "Number of output tokens per request, used only for sonnet dataset.",
+        help="Number of output tokens per request, used only for sonnet dataset.",
     )
     sonnet_group.add_argument(
         "--sonnet-prefix-len",
         type=int,
         default=200,
-        help=
-        "Number of prefix tokens per request, used only for sonnet dataset.",
+        help="Number of prefix tokens per request, used only for sonnet dataset.",
     )
 
     sharegpt_group = parser.add_argument_group("sharegpt dataset options")
@@ -1136,15 +1422,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--blazedit-min-distance",
         type=float,
         default=0.0,
-        help=
-        "Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
+        help="Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
     )
     blazedit_group.add_argument(
         "--blazedit-max-distance",
         type=float,
         default=1.0,
-        help=
-        "Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
+        help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
     )
 
     random_group = parser.add_argument_group("random dataset options")
@@ -1152,15 +1436,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--random-input-len",
         type=int,
         default=1024,
-        help=
-        "Number of input tokens per request, used only for random sampling.",
+        help="Number of input tokens per request, used only for random sampling.",
     )
     random_group.add_argument(
         "--random-output-len",
         type=int,
         default=128,
-        help=
-        "Number of output tokens per request, used only for random sampling.",
+        help="Number of output tokens per request, used only for random sampling.",
     )
     random_group.add_argument(
         "--random-range-ratio",
@@ -1175,24 +1457,34 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "--random-prefix-len",
         type=int,
         default=0,
-        help=("Number of fixed prefix tokens before the random context "
-              "in a request. "
-              "The total input length is the sum of `random-prefix-len` and "
-              "a random "
-              "context length sampled from [input_len * (1 - range_ratio), "
-              "input_len * (1 + range_ratio)]."),
+        help=(
+            "Number of fixed prefix tokens before the random context "
+            "in a request. "
+            "The total input length is the sum of `random-prefix-len` and "
+            "a random "
+            "context length sampled from [input_len * (1 - range_ratio), "
+            "input_len * (1 + range_ratio)]."
+        ),
     )
     random_group.add_argument(
         "--random-batch-size",
         type=int,
         default=1,
-        help=("Batch size for random sampling. "
-              "Only used for embeddings benchmark."),
+        help=("Batch size for random sampling. Only used for embeddings benchmark."),
+    )
+    random_group.add_argument(
+        "--no-reranker",
+        action="store_true",
+        help=(
+            "Whether the model supports reranking natively."
+            " Only used for reranker benchmark."
+        ),
     )
 
     # random multimodal dataset options
     random_mm_group = parser.add_argument_group(
-        "random multimodal dataset options extended from random dataset")
+        "random multimodal dataset options extended from random dataset"
+    )
     random_mm_group.add_argument(
         "--random-mm-base-items-per-request",
         type=int,
@@ -1224,7 +1516,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
         help=(
             "Per-modality hard caps for items attached per request, e.g. "
-            "'{\"image\": 3, \"video\": 0}'. The sampled per-request item "
+            '\'{"image": 3, "video": 0}\'. The sampled per-request item '
             "count is clamped to the sum of these limits. When a modality "
             "reaches its cap, its buckets are excluded and probabilities are "
             "renormalized."
@@ -1241,8 +1533,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
                 if isinstance(key, str):
                     with suppress(Exception):
                         key = ast.literal_eval(key)
-                if not (isinstance(key, tuple) and len(key) == 3
-                        and all(isinstance(x, int) for x in key)):
+                if not (
+                    isinstance(key, tuple)
+                    and len(key) == 3
+                    and all(isinstance(x, int) for x in key)
+                ):
                     raise ValueError(
                         f"Invalid bucket key {k!r}. Expected tuple (H, W, T)."
                     )
@@ -1281,14 +1576,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     )
 
     hf_group = parser.add_argument_group("hf dataset options")
-    hf_group.add_argument("--hf-subset",
-                          type=str,
-                          default=None,
-                          help="Subset of the HF dataset.")
-    hf_group.add_argument("--hf-split",
-                          type=str,
-                          default=None,
-                          help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+    )
     hf_group.add_argument(
         "--hf-name",
         type=str,
@@ -1308,7 +1601,8 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     )
 
     prefix_repetition_group = parser.add_argument_group(
-        "prefix repetition dataset options")
+        "prefix repetition dataset options"
+    )
     prefix_repetition_group.add_argument(
         "--prefix-repetition-prefix-len",
         type=int,
@@ -1340,23 +1634,26 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
 
 
 def get_samples(args, tokenizer) -> list[SampleRequest]:
-
     if not hasattr(args, "request_id_prefix"):
         args.request_id_prefix = ""
 
     if args.dataset_name == "custom":
-        dataset = CustomDataset(dataset_path=args.dataset_path)
+        dataset = CustomDataset(
+            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+        )
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.custom_output_len,
-            skip_chat_template=args.custom_skip_chat_template,
+            skip_chat_template=args.skip_chat_template,
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
         )
 
     elif args.dataset_name == "sonnet":
-        dataset = SonnetDataset(dataset_path=args.dataset_path)
+        dataset = SonnetDataset(
+            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+        )
         # For the "sonnet" dataset, formatting depends on the backend.
         if args.backend == "openai-chat":
             input_requests = dataset.sample(
@@ -1371,7 +1668,8 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             )
         else:
             assert tokenizer.chat_template or tokenizer.default_chat_template, (
-                "Tokenizer/model must have chat template for sonnet dataset.")
+                "Tokenizer/model must have chat template for sonnet dataset."
+            )
             input_requests = dataset.sample(
                 num_requests=args.num_prompts,
                 input_len=args.sonnet_input_len,
@@ -1413,6 +1711,11 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
         ):
             dataset_class = MTBenchDataset
             args.hf_split = "train"
+        elif (
+            args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
+            dataset_class = MultiModalConversationDataset
         elif (
             args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
             or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
@@ -1425,8 +1728,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             dataset_class = AIMODataset
             args.hf_split = "train"
         elif (
-            args.dataset_path
-            in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS  # noqa: E501
+            args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS  # noqa: E501
             or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
         ):
             dataset_class = NextEditPredictionDataset
@@ -1450,27 +1752,39 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
         ):
             dataset_class = MLPerfDataset
             args.hf_split = "train"
+        elif (
+            args.dataset_path in MMStarDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS
+        ):
+            dataset_class = MMStarDataset
+            args.hf_split = "val"
+            args.hf_subset = None
         else:
-            supported_datasets = set([
-                dataset_name for cls in HuggingFaceDataset.__subclasses__()
-                for dataset_name in cls.SUPPORTED_DATASET_PATHS
-            ])
+            supported_datasets = set(
+                [
+                    dataset_name
+                    for cls in HuggingFaceDataset.__subclasses__()
+                    for dataset_name in cls.SUPPORTED_DATASET_PATHS
+                ]
+            )
             raise ValueError(
                 f"Unsupported dataset path: {args.dataset_path}. "
                 "Huggingface dataset only supports dataset_path"
                 f" from one of following: {supported_datasets}. "
                 "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
+                "like to add support for additional dataset formats."
+            )
 
-        if dataset_class.IS_MULTIMODAL and args.backend not in [
-                "openai-chat",
-                "openai-audio",
-        ]:
+        if dataset_class.IS_MULTIMODAL and not (
+            args.backend in ("openai-chat", "openai-audio")
+            or "embeddings-" in args.backend
+        ):
             # multi-modal benchmark is only available on OpenAI Chat
             # endpoint-type.
             raise ValueError(
                 "Multi-modal content is only supported on 'openai-chat' and "
-                "'openai-audio' backends.")
+                "'openai-audio' backends."
+            )
         input_requests = dataset_class(
             dataset_path=args.dataset_path,
             dataset_subset=args.hf_subset,
@@ -1478,21 +1792,25 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             random_seed=args.seed,
             no_stream=args.no_stream,
             hf_name=args.hf_name,
+            disable_shuffle=args.disable_shuffle,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.hf_output_len,
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
-            **hf_kwargs
+            skip_chat_template=args.skip_chat_template,
+            **hf_kwargs,
         )
 
     else:
         # For datasets that follow a similar structure, use a mapping.
         dataset_mapping = {
-            "spec_bench":
-            lambda: SpecBench(dataset_path=args.dataset_path,
-                              category=args.spec_bench_category).sample(
+            "spec_bench": lambda: SpecBench(
+                dataset_path=args.dataset_path,
+                category=args.spec_bench_category,
+                disable_shuffle=args.disable_shuffle,
+            ).sample(
                 num_requests=args.num_prompts,
                 tokenizer=tokenizer,
                 output_len=args.spec_bench_output_len,
@@ -1500,7 +1818,9 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 no_oversample=args.no_oversample,
             ),
             "sharegpt": lambda: ShareGPTDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
+                random_seed=args.seed,
+                dataset_path=args.dataset_path,
+                disable_shuffle=args.disable_shuffle,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
@@ -1509,7 +1829,9 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 no_oversample=args.no_oversample,
             ),
             "burstgpt": lambda: BurstGPTDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
+                random_seed=args.seed,
+                dataset_path=args.dataset_path,
+                disable_shuffle=args.disable_shuffle,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
@@ -1517,7 +1839,9 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 no_oversample=args.no_oversample,
             ),
             "random": lambda: RandomDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
+                random_seed=args.seed,
+                dataset_path=args.dataset_path,
+                disable_shuffle=args.disable_shuffle,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
@@ -1529,9 +1853,10 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 batchsize=args.random_batch_size,
                 no_oversample=args.no_oversample,
             ),
-            "random-mm":
-            lambda: RandomMultiModalDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
+            "random-mm": lambda: RandomMultiModalDataset(
+                random_seed=args.seed,
+                dataset_path=args.dataset_path,
+                disable_shuffle=args.disable_shuffle,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
@@ -1546,9 +1871,23 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 request_id_prefix=args.request_id_prefix,
                 no_oversample=args.no_oversample,
             ),
-            "prefix_repetition":
-            lambda: PrefixRepetitionRandomDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
+            "random-rerank": lambda: RandomDatasetForReranking(
+                random_seed=args.seed,
+                dataset_path=args.dataset_path,
+                disable_shuffle=args.disable_shuffle,
+            ).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                input_len=args.random_input_len,
+                range_ratio=args.random_range_ratio,
+                request_id_prefix=args.request_id_prefix,
+                batchsize=args.random_batch_size,
+                is_reranker=not args.no_reranker,
+            ),
+            "prefix_repetition": lambda: PrefixRepetitionRandomDataset(
+                random_seed=args.seed,
+                dataset_path=args.dataset_path,
+                disable_shuffle=args.disable_shuffle,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
@@ -1563,8 +1902,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
 
         try:
             # Enforce endpoint compatibility for multimodal datasets.
-            if args.dataset_name == "random-mm" and args.backend not in [
-                    "openai-chat"]:
+            if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
                 raise ValueError(
                     "Multi-modal content (images) is only supported on "
                     "'openai-chat' backend."
@@ -1609,8 +1947,7 @@ class CustomDataset(BenchmarkDataset):
 
         # Load the JSONL file
         if self.dataset_path.endswith(".jsonl"):
-            jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
-                                      lines=True)
+            jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
 
             # check if the JSONL file has a 'prompt' column
             if "prompt" not in jsonl_data.columns:
@@ -1624,18 +1961,20 @@ class CustomDataset(BenchmarkDataset):
                 self.data.append(row.to_dict())
         else:
             raise NotImplementedError(
-                "Only JSONL format is supported for CustomDataset.")
+                "Only JSONL format is supported for CustomDataset."
+            )
 
         random.seed(self.random_seed)
-        random.shuffle(self.data)
+        if not getattr(self, "disable_shuffle", False):
+            random.shuffle(self.data)
 
     def sample(
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        lora_path: Optional[str] = None,
-        max_loras: Optional[int] = None,
-        output_len: Optional[int] = None,
+        lora_path: str | None = None,
+        max_loras: int | None = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
         request_id_prefix: str = "",
@@ -1646,9 +1985,11 @@ class CustomDataset(BenchmarkDataset):
         self.num_available_samples = len(self.data)
         if num_requests <= 0:
             num_requests = self.num_available_samples
-            logger.info("num_requests is set to 0 or negative, "
-                        "so using all available samples: %d",
-                        num_requests)
+            logger.info(
+                "num_requests is set to 0 or negative, "
+                "so using all available samples: %d",
+                num_requests,
+            )
 
         sampled_requests = []
         for i, item in enumerate(self.data):
@@ -1659,10 +2000,7 @@ class CustomDataset(BenchmarkDataset):
             # apply template
             if not skip_chat_template:
                 prompt = tokenizer.apply_chat_template(
-                    [{
-                        "role": "user",
-                        "content": prompt
-                    }],
+                    [{"role": "user", "content": prompt}],
                     add_generation_prompt=True,
                     tokenize=False,
                 )
@@ -1674,9 +2012,11 @@ class CustomDataset(BenchmarkDataset):
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     request_id=request_id_prefix + str(i),
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
 
         return sampled_requests
 
@@ -1691,7 +2031,7 @@ class SpecBench(CustomDataset):
     Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
     Download the dataset using:
     wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
-    """ # noqa: E501
+    """  # noqa: E501
 
     def __init__(self, **kwargs) -> None:
         self.category = kwargs.pop("category", None)
@@ -1705,8 +2045,7 @@ class SpecBench(CustomDataset):
         self.data = []
 
         # Load the JSONL file
-        jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
-                                    lines=True)
+        jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
 
         # check if the JSONL file has a 'turns' column
         if "turns" not in jsonl_data.columns:
@@ -1714,16 +2053,16 @@ class SpecBench(CustomDataset):
 
         for _, row in jsonl_data.iterrows():
             # sample only from a specific category if specified
-            if (not self.category) or (self.category == row['category']):
+            if (not self.category) or (self.category == row["category"]):
                 prompt = row["turns"][0]
                 self.data.append({"prompt": prompt})
 
         random.seed(self.random_seed)
-        random.shuffle(self.data)
+        if not getattr(self, "disable_shuffle", False):
+            random.shuffle(self.data)
 
     def sample(self, **kwargs) -> list:
         # leverage CustomDataset sample
-        kwargs["skip_chat_template"] = False
         return super().sample(**kwargs)
 
 
@@ -1731,6 +2070,7 @@ class SpecBench(CustomDataset):
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
 
+
 @deprecated(
     "SonnetDataset is deprecated and will be removed in a future version.",
 )
@@ -1772,20 +2112,20 @@ class SonnetDataset(BenchmarkDataset):
     ) -> list:
         # Calculate average token length for a poem line.
         tokenized_lines = [tokenizer(line).input_ids for line in self.data]
-        avg_len = sum(len(tokens)
-                      for tokens in tokenized_lines) / len(tokenized_lines)
+        avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
 
         # Build the base prompt.
         base_prompt = "Pick as many lines as you can from these poem lines:\n"
         base_msg = [{"role": "user", "content": base_prompt}]
-        base_fmt = tokenizer.apply_chat_template(base_msg,
-                                                 add_generation_prompt=True,
-                                                 tokenize=False)
+        base_fmt = tokenizer.apply_chat_template(
+            base_msg, add_generation_prompt=True, tokenize=False
+        )
         base_offset = len(tokenizer(base_fmt).input_ids)
         if input_len <= base_offset:
             raise ValueError(
                 f"'input_len' must be higher than the base prompt length "
-                f"({base_offset}).")
+                f"({base_offset})."
+            )
 
         # Determine how many poem lines to use.
         num_input_lines = round((input_len - base_offset) / avg_len)
@@ -1795,22 +2135,24 @@ class SonnetDataset(BenchmarkDataset):
         samples = []
         ind = 0
         while len(samples) < num_requests:
-            extra_lines = random.choices(self.data,
-                                         k=num_input_lines - num_prefix_lines)
+            extra_lines = random.choices(
+                self.data, k=num_input_lines - num_prefix_lines
+            )
             prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
             msg = [{"role": "user", "content": prompt}]
             prompt_formatted = tokenizer.apply_chat_template(
-                msg, add_generation_prompt=True, tokenize=False)
+                msg, add_generation_prompt=True, tokenize=False
+            )
             prompt_len = len(tokenizer(prompt_formatted).input_ids)
             if prompt_len <= input_len:
                 samples.append(
                     SampleRequest(
-                        prompt=prompt_formatted
-                        if return_prompt_formatted else prompt,
+                        prompt=prompt_formatted if return_prompt_formatted else prompt,
                         prompt_len=prompt_len,
                         expected_output_len=output_len,
-                         request_id=request_id_prefix + str(ind),
-                    ))
+                        request_id=request_id_prefix + str(ind),
+                    )
+                )
                 ind += 1
         return samples
 
@@ -1831,7 +2173,9 @@ class BurstGPTDataset(BenchmarkDataset):
         super().__init__(**kwargs)
         self.load_data()
 
-    def load_data(self, ):
+    def load_data(
+        self,
+    ):
         if self.dataset_path is None:
             raise ValueError("dataset_path must be provided for loading data.")
 
@@ -1845,8 +2189,7 @@ class BurstGPTDataset(BenchmarkDataset):
 
     def _sample_loaded_data(self, num_requests: int) -> list:
         if num_requests <= len(self.data):
-            data = self.data.sample(n=num_requests,
-                                    random_state=self.random_seed)
+            data = self.data.sample(n=num_requests, random_state=self.random_seed)
         else:
             data = self.data.sample(
                 n=num_requests,
@@ -1860,8 +2203,8 @@ class BurstGPTDataset(BenchmarkDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        max_loras: Optional[int] = None,
-        lora_path: Optional[str] = None,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -1872,7 +2215,8 @@ class BurstGPTDataset(BenchmarkDataset):
             input_len = int(data[i][2])
             output_len = int(data[i][3])
             lora_req = self.get_random_lora_request(
-                max_loras=max_loras, lora_path=lora_path)
+                max_loras=max_loras, lora_path=lora_path
+            )
             vocab_size = tokenizer.vocab_size
             # Generate a synthetic prompt: a list of token IDs computed as (i +
             # j) modulo vocab_size.
@@ -1885,7 +2229,8 @@ class BurstGPTDataset(BenchmarkDataset):
                     expected_output_len=output_len,
                     lora_request=lora_req,
                     request_id=request_id_prefix + str(i),
-                ))
+                )
+            )
         return samples
 
 
@@ -1895,15 +2240,15 @@ class BurstGPTDataset(BenchmarkDataset):
 class HuggingFaceDataset(BenchmarkDataset):
     """Base class for datasets hosted on HuggingFace."""
 
-    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+    SUPPORTED_DATASET_PATHS: set[str] | dict[str, Callable] = set()
 
     def __init__(
         self,
         dataset_path: str,
         dataset_split: str,
         no_stream: bool = False,
-        dataset_subset: Optional[str] = None,
-        hf_name: Optional[str] = None,
+        dataset_subset: str | None = None,
+        hf_name: str | None = None,
         **kwargs,
     ) -> None:
         super().__init__(dataset_path=dataset_path, **kwargs)
@@ -1922,7 +2267,8 @@ class HuggingFaceDataset(BenchmarkDataset):
             split=self.dataset_split,
             streaming=self.load_stream,
         )
-        self.data = self.data.shuffle(seed=self.random_seed)
+        if not getattr(self, "disable_shuffle", False):
+            self.data = self.data.shuffle(seed=self.random_seed)
 
 
 # -----------------------------------------------------------------------------
@@ -1931,23 +2277,85 @@ class HuggingFaceDataset(BenchmarkDataset):
 
 
 class ConversationDataset(HuggingFaceDataset):
-    """Dataset for conversation data with multimodal support."""
+    """Dataset for text-only conversation data."""
+
+    SUPPORTED_DATASET_PATHS = {
+        "Aeala/ShareGPT_Vicuna_unfiltered",
+    }
+    IS_MULTIMODAL = False
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: int | None = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
+        # Filter examples with at least 2 conversations
+        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
+        sampled_requests = []
+        ind = 0
+        dynamic_output = output_len is None
+
+        for item in filtered_data:
+            if len(sampled_requests) >= num_requests:
+                break
+            conv = item["conversations"]
+            prompt, completion = conv[0]["value"], conv[1]["value"]
+
+            prompt_ids = tokenizer(prompt).input_ids
+            completion_ids = tokenizer(completion).input_ids
+            prompt_len = len(prompt_ids)
+            completion_len = len(completion_ids)
+            output_len = completion_len if dynamic_output else output_len
+            assert isinstance(output_len, int) and output_len > 0
+            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
+                continue
+            mm_content = process_image(item["image"]) if "image" in item else None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len and output len
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(ind),
+                )
+            )
+            ind += 1
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
+        return sampled_requests
+
+
+class MultiModalConversationDataset(HuggingFaceDataset):
+    """Dataset for multimodal conversation data."""
+
     SUPPORTED_DATASET_PATHS = {
-        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
+        "lmms-lab/LLaVA-OneVision-Data",
     }
     IS_MULTIMODAL = True
 
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               request_id_prefix: str = "",
-               no_oversample: bool = False,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: int | None = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
         # Filter examples with at least 2 conversations
-        filtered_data = self.data.filter(
-            lambda x: len(x["conversations"]) >= 2)
+        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
         sampled_requests = []
         ind = 0
         dynamic_output = output_len is None
@@ -1964,17 +2372,14 @@ class ConversationDataset(HuggingFaceDataset):
             completion_len = len(completion_ids)
             output_len = completion_len if dynamic_output else output_len
             assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(
-                    prompt_len, completion_len):
+            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
                 continue
-            mm_content = process_image(
-                item["image"]) if "image" in item else None
+            mm_content = process_image(item["image"]) if "image" in item else None
             if enable_multimodal_chat:
                 # Note: when chat is enabled the request prompt_len is no longer
                 # accurate and we will be using request output to count the
                 # actual prompt len and output len
-                prompt = self.apply_multimodal_chat_transformation(
-                    prompt, mm_content)
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
@@ -1982,10 +2387,12 @@ class ConversationDataset(HuggingFaceDataset):
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
                     request_id=request_id_prefix + str(ind),
-                ))
+                )
+            )
             ind += 1
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2001,10 +2408,8 @@ class VisionArenaDataset(HuggingFaceDataset):
 
     DEFAULT_OUTPUT_LEN = 128
     SUPPORTED_DATASET_PATHS = {
-        "lmarena-ai/VisionArena-Chat":
-        lambda x: x["conversation"][0][0]["content"],
-        "lmarena-ai/vision-arena-bench-v0.1":
-        lambda x: x["turns"][0][0]["content"]
+        "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
+        "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
     }
     IS_MULTIMODAL = True
 
@@ -2012,14 +2417,13 @@ class VisionArenaDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
     ) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
@@ -2034,8 +2438,7 @@ class VisionArenaDataset(HuggingFaceDataset):
                 # Note: when chat is enabled the request prompt_len is no longer
                 # accurate and we will be using request output to count the
                 # actual prompt len
-                prompt = self.apply_multimodal_chat_transformation(
-                    prompt, mm_content)
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
@@ -2043,9 +2446,11 @@ class VisionArenaDataset(HuggingFaceDataset):
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
                     request_id=request_id_prefix + str(i),
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2057,24 +2462,22 @@ class MMVUDataset(HuggingFaceDataset):
 
     DEFAULT_OUTPUT_LEN = 128
     SUPPORTED_DATASET_PATHS = {
-        "yale-nlp/MMVU":
-        lambda x: x["question"] + " " + (
-            " ".join(f"{k}.{v}" for k, v in x["choices"].items())
-        ),
+        "yale-nlp/MMVU": lambda x: x["question"]
+        + " "
+        + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())),
     }
 
     def sample(
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
     ) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
@@ -2089,8 +2492,7 @@ class MMVUDataset(HuggingFaceDataset):
                 # Note: when chat is enabled the request prompt_len is no longer
                 # accurate and we will be using request output to count the
                 # actual prompt len
-                prompt = self.apply_multimodal_chat_transformation(
-                    prompt, mm_content)
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
@@ -2098,9 +2500,11 @@ class MMVUDataset(HuggingFaceDataset):
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
                     request_id=request_id_prefix + str(i),
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2124,16 +2528,18 @@ class InstructCoderDataset(HuggingFaceDataset):
         "likaixin/InstructCoder",
     }
 
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               request_id_prefix: str = "",
-               no_oversample: bool = False,
-               **kwargs) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: int | None = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
@@ -2144,14 +2550,12 @@ class InstructCoderDataset(HuggingFaceDataset):
             )
 
             # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{
-                    "role": "user",
-                    "content": prompt
-                }],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": prompt}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
 
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
@@ -2160,9 +2564,11 @@ class InstructCoderDataset(HuggingFaceDataset):
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     request_id=request_id_prefix + str(i),
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2190,14 +2596,14 @@ class MTBenchDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
     ) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         sampled_requests = []
 
         for i, item in enumerate(self.data):
@@ -2206,14 +2612,12 @@ class MTBenchDataset(HuggingFaceDataset):
             prompt = item["turns"][0]
 
             # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{
-                    "role": "user",
-                    "content": prompt
-                }],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": prompt}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
 
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
@@ -2222,9 +2626,11 @@ class MTBenchDataset(HuggingFaceDataset):
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     request_id=request_id_prefix + str(i),
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2256,15 +2662,15 @@ class BlazeditDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
+        skip_chat_template: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         min_distance: float = 0.0,
         max_distance: float = 1.0,
         **kwargs,
     ) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         sampled_requests = []
 
         for i, item in enumerate(self.data):
@@ -2280,7 +2686,7 @@ class BlazeditDataset(HuggingFaceDataset):
 
             # template copied from
             # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
-            instruction = f"""Given a code file, please apply the change requests and generate the new file.
+            prompt = f"""Given a code file, please apply the change requests and generate the new file.
 
 Original file:
 ```python
@@ -2290,17 +2696,15 @@ Original file:
 Change request:
 {change_request}
 
-Please generate the new code file in the "New file" section below.""" # noqa: E501
+Please generate the new code file in the "New file" section below."""  # noqa: E501
 
             # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{
-                    "role": "user",
-                    "content": instruction
-                }],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": prompt}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
 
             prompt_len = len(tokenizer(prompt).input_ids)
 
@@ -2310,9 +2714,11 @@ Please generate the new code file in the "New file" section below.""" # noqa: E5
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     request_id=request_id_prefix + str(i),
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
 
         return sampled_requests
 
@@ -2326,18 +2732,22 @@ class AIMODataset(HuggingFaceDataset):
     """
     Dataset class for processing a AIMO dataset with reasoning questions.
     """
+
     SUPPORTED_DATASET_PATHS = {
-        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
-        "AI-MO/NuminaMath-CoT"
+        "AI-MO/aimo-validation-aime",
+        "AI-MO/NuminaMath-1.5",
+        "AI-MO/NuminaMath-CoT",
     }
 
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               request_id_prefix: str = "",
-               no_oversample: bool = False,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: int | None = None,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
         sampled_requests = []
         ind = 0
         dynamic_output = output_len is None
@@ -2345,7 +2755,7 @@ class AIMODataset(HuggingFaceDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt, completion = item['problem'], item["solution"]
+            prompt, completion = item["problem"], item["solution"]
 
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
@@ -2353,10 +2763,9 @@ class AIMODataset(HuggingFaceDataset):
             completion_len = len(completion_ids)
             output_len = completion_len if dynamic_output else output_len
             assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(prompt_len,
-                                                        completion_len,
-                                                        max_prompt_len=2048,
-                                                        max_total_len=32000):
+            if dynamic_output and not is_valid_sequence(
+                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
+            ):
                 continue
             sampled_requests.append(
                 SampleRequest(
@@ -2365,10 +2774,12 @@ class AIMODataset(HuggingFaceDataset):
                     expected_output_len=output_len,
                     multi_modal_data=None,
                     request_id=request_id_prefix + str(ind),
-                ))
+                )
+            )
             ind += 1
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2390,12 +2801,12 @@ You are a code completion assistant and your task is to analyze user edits and t
 
 ### Response:
 
-""" # noqa: E501
+"""  # noqa: E501
 
 
 def _format_zeta_prompt(
-        sample: dict,
-        original_start_marker: str = "<|editable_region_start|>") -> dict:
+    sample: dict, original_start_marker: str = "<|editable_region_start|>"
+) -> dict:
     """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
 
     This function formats examples from the NEP dataset
@@ -2438,10 +2849,14 @@ class NextEditPredictionDataset(HuggingFaceDataset):
         "zed-industries/zeta": _format_zeta_prompt,
     }
 
-    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
-               request_id_prefix: str = "",
-               no_oversample: bool = False,
-               **kwargs):
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ):
         formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
         if formatting_prompt_func is None:
             raise ValueError(f"Unsupported dataset path: {self.hf_name}")
@@ -2453,15 +2868,16 @@ class NextEditPredictionDataset(HuggingFaceDataset):
                     prompt=sample["prompt"],
                     prompt_len=len(tokenizer(sample["prompt"]).input_ids),
                     expected_output_len=len(
-                        tokenizer(sample["expected_output"]).input_ids),
+                        tokenizer(sample["expected_output"]).input_ids
+                    ),
                     request_id=request_id_prefix + str(i),
-                ))
+                )
+            )
             if len(samples) >= num_requests:
                 break
-        self.maybe_oversample_requests(samples,
-                                       num_requests,
-                                       request_id_prefix,
-                                       no_oversample)
+        self.maybe_oversample_requests(
+            samples, num_requests, request_id_prefix, no_oversample
+        )
         return samples
 
 
@@ -2502,21 +2918,19 @@ class ASRDataset(HuggingFaceDataset):
     IS_MULTIMODAL = True
 
     # TODO Whisper-specific. Abstract interface when more models are supported.
-    TRANSCRIPTION_PREAMBLE = (
-        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
+    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
     skip_long_audios: bool = True
 
     def sample(
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
     ) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
         prompt_len = len(tokenizer(prompt).input_ids)
         sampled_requests = []
@@ -2541,7 +2955,8 @@ class ASRDataset(HuggingFaceDataset):
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
                     request_id=request_id_prefix + str(ind),
-                ))
+                )
+            )
             ind += 1
         if skipped:
             logger.warning(
@@ -2550,8 +2965,9 @@ class ASRDataset(HuggingFaceDataset):
                 " what Whisper supports.",
                 skipped,
             )
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2587,7 +3003,7 @@ class MLPerfDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -2635,8 +3051,9 @@ class MLPerfDataset(HuggingFaceDataset):
             )
             ind += 1
 
-        self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix, no_oversample)
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
         return sampled_requests
 
 
@@ -2685,29 +3102,27 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
             """Generate tokens that decode and re-encode to exactly
             target_length."""
             # Generate random tokens
-            tokens = np.random.randint(
-                0, vocab_size, size=target_length).tolist()
-            text = tokenizer.decode(tokens)
-            re_encoded = tokenizer.encode(text, add_special_tokens=False)
-
-            if len(re_encoded) == target_length:
-                return re_encoded
-            elif len(re_encoded) < target_length:
-                # Recursively generate additional consistent tokens
-                needed = target_length - len(re_encoded)
-                extra_tokens = _generate_exact_length_tokens(needed)
-                return re_encoded + extra_tokens
-            else:
-                # Truncate to target length
-                return re_encoded[:target_length]
+            tokens = np.random.randint(0, vocab_size, size=target_length).tolist()
+
+            _, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len(  # noqa: E501
+                tokenizer=tokenizer,
+                token_sequence=tokens,
+                target_token_len=target_length,
+                add_special_tokens=False,
+            )
+            return adjusted_tokens, token_mismatch
 
         requests = []
+        token_mismatch_total = 0
         for _ in range(num_prefixes):
-            prefix_tokens = _generate_exact_length_tokens(prefix_len)
+            prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len)
+            token_mismatch_total += prefix_mismatch
 
             for _ in range(prompts_per_prefix):
-                suffix_tokens = _generate_exact_length_tokens(suffix_len)
-
+                suffix_tokens, suffix_mismatch = _generate_exact_length_tokens(
+                    suffix_len
+                )
+                token_mismatch_total += suffix_mismatch
                 combined_tokens = prefix_tokens + suffix_tokens
                 prompt = tokenizer.decode(combined_tokens)
                 prompt_len = len(combined_tokens)
@@ -2719,5 +3134,89 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
                     )
                 )
 
-        random.shuffle(requests)
+        if token_mismatch_total != 0:
+            sign = "more" if token_mismatch_total > 0 else "fewer"
+            logger.warning(
+                "Across all generated prompts, there were %d %s tokens "
+                "than expected after decoding and re-encoding. This is "
+                "expected due to the imperfect nature of the sampling "
+                "procedure.",
+                abs(token_mismatch_total),
+                sign,
+            )
+        if not getattr(self, "disable_shuffle", False):
+            random.shuffle(requests)
         return requests
+
+
+# -----------------------------------------------------------------------------
+# MMStar Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class MMStarDataset(HuggingFaceDataset):
+    """
+    Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar
+    refer to: https://github.com/sgl-project/SpecForge/pull/106
+    """
+
+    DEFAULT_OUTPUT_LEN = 128
+    SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"}
+    IS_MULTIMODAL = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: int | None = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list[SampleRequest]:
+        # If --hf-output-len is not set, use the default output length.
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+        sampled_requests: list[SampleRequest] = []
+
+        for ind, item in enumerate(self.data):
+            if len(sampled_requests) >= num_requests:
+                break
+            # Split the question text from options
+            # (keep only the part before "Options:").
+            full_q: str = item.get("question", "")
+            question_text = full_q.split("Options:", 1)[0].strip()
+
+            # Multimodal image content.
+            mm_content = process_image(item["image"])
+
+            # Compute prompt token length (note: this is plain text length
+            # if enable_multimodal_chat is False).
+            prompt_len = len(tokenizer(question_text).input_ids)
+
+            if enable_multimodal_chat:
+                # If multimodal content should be embedded in the chat message,
+                # convert to [{"role":"user","content":[...]}]
+                prompt = self.apply_multimodal_chat_transformation(
+                    question_text, mm_content
+                )
+                mm_for_request = None  # Already embedded in chat content.
+            else:
+                # Default: prompt is plain text,
+                # image is in mm_content for the bench to assemble.
+                prompt = question_text
+                mm_for_request = mm_content
+
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_for_request,
+                    request_id=request_id_prefix + str(ind),
+                )
+            )
+
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
+        return sampled_requests
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 05378ec74d2fa2adcc5b8846b3bbce446a959ef4..b4f1751837f48ef143684b241a706693df8f0c68 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -7,26 +7,26 @@ import dataclasses
 import json
 import os
 import time
-from typing import Any, Optional
+from typing import Any
 
 import numpy as np
 from tqdm import tqdm
 
 import vllm.envs as envs
-from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
-                                       write_to_json)
+from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams
 
 
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: dict[str, Any]) -> None:
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={"latency": results["latencies"]},
-        extra_info={k: results[k]
-                    for k in ["avg_latency", "percentiles"]})
+        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
+    )
     if pt_records:
         pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
         write_to_json(pt_file, pt_records)
@@ -49,10 +49,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default=10,
         help="Number of iterations to run for warmup.",
     )
-    parser.add_argument("--num-iters",
-                        type=int,
-                        default=30,
-                        help="Number of iterations to run.")
+    parser.add_argument(
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
@@ -67,8 +66,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--disable-detokenize",
         action="store_true",
-        help=("Do not detokenize responses (i.e. do not include "
-              "detokenization time in the latency measurement)"),
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
     )
 
     parser = EngineArgs.add_cli_args(parser)
@@ -81,7 +82,8 @@ def main(args: argparse.Namespace):
     if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
         raise OSError(
             "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler.")
+            "Please set it to a valid path to use torch profiler."
+        )
     engine_args = EngineArgs.from_cli_args(args)
 
     # Lazy import to avoid importing LLM when the bench command is not selected.
@@ -91,9 +93,11 @@ def main(args: argparse.Namespace):
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
     assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len +
-        args.output_len), ("Please ensure that max_model_len is greater than"
-                           " the sum of input_len and output_len.")
+        args.input_len + args.output_len
+    ), (
+        "Please ensure that max_model_len is greater than"
+        " the sum of input_len and output_len."
+    )
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -103,18 +107,16 @@ def main(args: argparse.Namespace):
         max_tokens=args.output_len,
         detokenize=not args.disable_detokenize,
     )
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]
 
     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
@@ -125,7 +127,7 @@ def main(args: argparse.Namespace):
                 ),
             )
 
-    def run_to_completion(profile_dir: Optional[str] = None):
+    def run_to_completion(profile_dir: str | None = None):
         if profile_dir:
             llm.start_profile()
             llm_generate()
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 725b7df8b18713a29be81906bbf153da3f725764..ed0fdec2518639945f0299ab3515ca9c4d7bc61e 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -10,9 +10,10 @@ import time
 import traceback
 from collections.abc import Awaitable
 from dataclasses import dataclass, field
-from typing import Optional, Protocol, Union
+from typing import Any, Literal, Protocol
 
 import aiohttp
+import regex as re
 from tqdm.asyncio import tqdm
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -62,31 +63,32 @@ class StreamedResponseHandler:
 @dataclass
 class RequestFuncInput:
     """The input for the request function."""
-    prompt: str
+
+    prompt: str | list[str]
     api_url: str
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_headers: Optional[dict] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[Union[dict, list[dict]]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_headers: dict | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
 class RequestFuncOutput:
     """The output of the request function including metrics."""
+
     generated_text: str = ""
     success: bool = False
     latency: float = 0.0
     output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
-    itl: list[float] = field(
-        default_factory=list)  # list of inter-token latencies
+    itl: list[float] = field(default_factory=list)  # list of inter-token latencies
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
@@ -98,15 +100,48 @@ class RequestFunc(Protocol):
         self,
         request_func_input: RequestFuncInput,
         session: aiohttp.ClientSession,
-        pbar: Optional[tqdm] = None,
-    ) -> Awaitable[RequestFuncOutput]:
-        ...
+        pbar: tqdm | None = None,
+    ) -> Awaitable[RequestFuncOutput]: ...
+
+
+def _validate_api_url(
+    api_url: str,
+    api_name: str,
+    expected_suffixes: str | set[str],
+) -> None:
+    if isinstance(expected_suffixes, str):
+        expected_suffixes = {expected_suffixes}
+
+    expected_suffixes = {*expected_suffixes, "profile"}
+
+    if not api_url.endswith(tuple(expected_suffixes)):
+        raise ValueError(f"{api_name} URL must end with one of: {expected_suffixes}.")
+
+
+def _update_payload_common(
+    payload: dict[str, Any],
+    request_func_input: RequestFuncInput,
+) -> None:
+    if request_func_input.ignore_eos:
+        payload["ignore_eos"] = request_func_input.ignore_eos
+    if request_func_input.extra_body:
+        payload.update(request_func_input.extra_body)
+
+
+def _update_headers_common(
+    headers: dict[str, Any],
+    request_func_input: RequestFuncInput,
+) -> None:
+    if request_func_input.extra_headers:
+        headers |= request_func_input.extra_headers
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
 
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     """The async request function for the OpenAI Completions API.
 
@@ -118,13 +153,12 @@ async def async_request_openai_completions(
         The output of the request function.
     """
     api_url = request_func_input.api_url
-    assert api_url.endswith(
-        ("completions", "profile")
-    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    _validate_api_url(api_url, "OpenAI Completions API", "completions")
 
     payload = {
         "model": request_func_input.model_name
-        if request_func_input.model_name else request_func_input.model,
+        if request_func_input.model_name
+        else request_func_input.model,
         "prompt": request_func_input.prompt,
         "temperature": 0.0,
         "repetition_penalty": 1.0,
@@ -135,17 +169,12 @@ async def async_request_openai_completions(
             "include_usage": True,
         },
     }
-    if request_func_input.ignore_eos:
-        payload["ignore_eos"] = request_func_input.ignore_eos
-    if request_func_input.extra_body:
-        payload.update(request_func_input.extra_body)
+    _update_payload_common(payload, request_func_input)
+
     headers = {
-        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
-    if request_func_input.extra_headers:
-        headers |= request_func_input.extra_headers
-    if request_func_input.request_id:
-        headers["x-request-id"] = request_func_input.request_id
+    _update_headers_common(headers, request_func_input)
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -155,8 +184,7 @@ async def async_request_openai_completions(
     output.start_time = st
     most_recent_timestamp = st
     try:
-        async with session.post(url=api_url, json=payload,
-                                headers=headers) as response:
+        async with session.post(url=api_url, json=payload, headers=headers) as response:
             if response.status == 200:
                 first_chunk_received = False
                 handler = StreamedResponseHandler()
@@ -195,21 +223,20 @@ async def async_request_openai_completions(
 
                                 # Decoding phase
                                 else:
-                                    output.itl.append(timestamp -
-                                                    most_recent_timestamp)
+                                    output.itl.append(timestamp - most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
                             elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
+                                output.output_tokens = usage.get("completion_tokens")
                 if first_chunk_received:
                     output.success = True
                 else:
                     output.success = False
                     output.error = (
                         "Never received a valid chunk to calculate TTFT."
-                        "This response will be marked as failed!")
+                        "This response will be marked as failed!"
+                    )
                 output.generated_text = generated_text
                 output.latency = most_recent_timestamp - st
             else:
@@ -225,59 +252,62 @@ async def async_request_openai_completions(
     return output
 
 
-async def async_request_openai_chat_completions(
+def _get_chat_content(
     request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(("chat/completions", "profile")), (
-        "OpenAI Chat Completions API URL must end with 'chat/completions'.")
+    mm_position: Literal["first", "last"] = "last",
+) -> list[dict[str, Any]]:
+    text_contents = [{"type": "text", "text": request_func_input.prompt}]
 
-    content = [{"type": "text", "text": request_func_input.prompt}]
+    mm_contents = []
     if request_func_input.multi_modal_content:
         mm_content = request_func_input.multi_modal_content
         if isinstance(mm_content, list):
-            content.extend(mm_content)
+            mm_contents.extend(request_func_input.multi_modal_content)
         elif isinstance(mm_content, dict):
-            content.append(mm_content)
+            mm_contents.append(request_func_input.multi_modal_content)
         else:
             raise TypeError(
-                "multi_modal_content must be a dict or list[dict] "
-                "for openai-chat"
+                "multi_modal_content must be a dict or list[dict] for openai-chat"
             )
+
+    if mm_position == "first":
+        return mm_contents + text_contents
+
+    return text_contents + mm_contents
+
+
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+    mm_position: Literal["first", "last"] = "last",
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "OpenAI Chat Completions API", "chat/completions")
+
+    content = _get_chat_content(request_func_input, mm_position=mm_position)
+
     payload = {
-        "model":
-        request_func_input.model_name
-        if request_func_input.model_name else request_func_input.model,
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
         "messages": [
-            {
-                "role": "user",
-                "content": content
-            },
+            {"role": "user", "content": content},
         ],
-        "temperature":
-        0.0,
-        "max_completion_tokens":
-        request_func_input.output_len,
-        "stream":
-        True,
+        "temperature": 0.0,
+        "max_completion_tokens": request_func_input.output_len,
+        "stream": True,
         "stream_options": {
             "include_usage": True,
         },
     }
-    if request_func_input.ignore_eos:
-        payload["ignore_eos"] = request_func_input.ignore_eos
-    if request_func_input.extra_body:
-        payload.update(request_func_input.extra_body)
+    _update_payload_common(payload, request_func_input)
+
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
-    if request_func_input.extra_headers:
-        headers |= request_func_input.extra_headers
-    if request_func_input.request_id:
-        headers["x-request-id"] = request_func_input.request_id
+    _update_headers_common(headers, request_func_input)
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -288,8 +318,7 @@ async def async_request_openai_chat_completions(
     output.start_time = st
     most_recent_timestamp = st
     try:
-        async with session.post(url=api_url, json=payload,
-                                headers=headers) as response:
+        async with session.post(url=api_url, json=payload, headers=headers) as response:
             if response.status == 200:
                 handler = StreamedResponseHandler()
                 async for chunk_bytes in response.content.iter_any():
@@ -320,13 +349,11 @@ async def async_request_openai_chat_completions(
 
                                 # Decoding phase
                                 else:
-                                    output.itl.append(timestamp -
-                                                    most_recent_timestamp)
+                                    output.itl.append(timestamp - most_recent_timestamp)
 
                                 generated_text += content or ""
                             elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
+                                output.output_tokens = usage.get("completion_tokens")
 
                             most_recent_timestamp = timestamp
 
@@ -349,44 +376,33 @@ async def async_request_openai_chat_completions(
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
 
     api_url = request_func_input.api_url
-    assert api_url.endswith(("transcriptions", "translations")), (
-        "OpenAI Chat Completions API URL must end with 'transcriptions' ")
-    "or `translations`."
+    _validate_api_url(api_url, "OpenAI Audio API", {"transcriptions", "translations"})
 
     content = [{"type": "text", "text": request_func_input.prompt}]
     payload = {
-        "model":
-        request_func_input.model_name
-        if request_func_input.model_name else request_func_input.model,
-        "temperature":
-        0.0,
-        "max_completion_tokens":
-        request_func_input.output_len,
-        "stream":
-        True,
-        "language":
-        "en",
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+        "temperature": 0.0,
+        "max_completion_tokens": request_func_input.output_len,
+        "stream": True,
+        "language": "en",
         # Flattened due to multipart/form-data
-        "stream_include_usage":
-        True,
-        "stream_continuous_usage_stats":
-        True,
+        "stream_include_usage": True,
+        "stream_continuous_usage_stats": True,
     }
-    if request_func_input.extra_body:
-        payload.update(request_func_input.extra_body)
+    _update_payload_common(payload, request_func_input)
+
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
-    if request_func_input.extra_headers:
-        headers |= request_func_input.extra_headers
-    if request_func_input.request_id:
-        headers["x-request-id"] = request_func_input.request_id
+    _update_headers_common(headers, request_func_input)
 
     # Send audio file
     def to_bytes(y, sr):
@@ -413,9 +429,9 @@ async def async_request_openai_audio(
         output.start_time = st
         most_recent_timestamp = st
         try:
-            async with session.post(url=api_url,
-                                    data=form,
-                                    headers=headers) as response:
+            async with session.post(
+                url=api_url, data=form, headers=headers
+            ) as response:
                 if response.status == 200:
                     handler = StreamedResponseHandler()
 
@@ -426,15 +442,13 @@ async def async_request_openai_audio(
 
                         messages = handler.add_chunk(chunk_bytes)
                         for message in messages:
-                            chunk = message.decode("utf-8").removeprefix(
-                                "data: ")
+                            chunk = message.decode("utf-8").removeprefix("data: ")
                             if chunk != "[DONE]":
                                 timestamp = time.perf_counter()
                                 data = json.loads(chunk)
 
                                 if choices := data.get("choices"):
-                                    content = choices[0]["delta"].get(
-                                        "content")
+                                    content = choices[0]["delta"].get("content")
                                     # First token
                                     if ttft == 0.0:
                                         ttft = timestamp - st
@@ -443,12 +457,14 @@ async def async_request_openai_audio(
                                     # Decoding phase
                                     else:
                                         output.itl.append(
-                                            timestamp - most_recent_timestamp)
+                                            timestamp - most_recent_timestamp
+                                        )
 
                                     generated_text += content or ""
                                 elif usage := data.get("usage"):
                                     output.output_tokens = usage.get(
-                                        "completion_tokens")
+                                        "completion_tokens"
+                                    )
 
                                 most_recent_timestamp = timestamp
 
@@ -468,43 +484,31 @@ async def async_request_openai_audio(
     return output
 
 
-async def async_request_openai_embeddings(
-    request_func_input: RequestFuncInput,
+async def _run_pooling_request(
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-):
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "embeddings"
-    ), "OpenAI Embeddings API URL must end with 'embeddings'."
-
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-    }
-
-    payload = {
-        "model": request_func_input.model,
-        "input": request_func_input.prompt,
-    }
-
+    api_url: str,
+    payload: dict[str, Any],
+    headers: dict[str, Any],
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
     output = RequestFuncOutput()
     st = time.perf_counter()
     output.start_time = st
     try:
-        async with session.post(
-            url=api_url,
-            headers=headers,
-            json=payload
-        ) as response:
+        async with session.post(url=api_url, headers=headers, json=payload) as response:
             if response.status == 200:
-                output.latency = time.perf_counter() - st
-                data = await response.json()
+                output.ttft = output.latency = time.perf_counter() - st
+
+                if payload.get("encoding_format", "float") == "bytes":
+                    metadata = json.loads(response.headers["metadata"])
+                    usage = metadata.get("usage", {})
+                else:
+                    data = await response.json()
+                    usage = data.get("usage", {})
+
                 output.success = True
                 output.generated_text = ""
-                output.prompt_len = data.get(
-                    "usage", {}).get(
-                    "prompt_tokens", 0)
+                output.prompt_len = usage.get("prompt_tokens", 0)
             else:
                 output.success = False
                 output.error = response.reason or ""
@@ -517,6 +521,238 @@ async def async_request_openai_embeddings(
     return output
 
 
+async def async_request_openai_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+        "input": request_func_input.prompt,
+        # Many embedding models have short context length,
+        # this is to avoid dropping some of the requests.
+        "truncate_prompt_tokens": -1,
+    }
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_pooling_request(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_vllm_rerank(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "vLLM score API", "rerank")
+
+    assert (
+        isinstance(request_func_input.prompt, list)
+        and len(request_func_input.prompt) > 1
+    )
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+        "query": request_func_input.prompt[0],
+        "documents": request_func_input.prompt[1:],
+        # Many reranker models have short context length,
+        # this is to avoid dropping some of the requests.
+        "truncate_prompt_tokens": -1,
+    }
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_pooling_request(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_openai_embeddings_chat(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+    mm_position: Literal["first", "last"] = "last",
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings")
+
+    content = _get_chat_content(request_func_input, mm_position=mm_position)
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+        "messages": [
+            {"role": "user", "content": content},
+        ],
+        # Many embedding models have short context length,
+        # this is to avoid dropping some of the requests.
+        "truncate_prompt_tokens": -1,
+    }
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_pooling_request(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+def _try_extract_request_idx(request_func_input: RequestFuncInput):
+    if request_func_input.request_id:
+        match = re.search(r"(\d+)$", request_func_input.request_id)
+        if match:
+            try:
+                return int(match.group(1))
+            except ValueError:
+                pass
+
+    return None
+
+
+def _preprocess_clip(request_func_input: RequestFuncInput):
+    if request_func_input.multi_modal_content:
+        # Image input
+        request_func_input.prompt = ""
+
+
+def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
+    if request_func_input.multi_modal_content:
+        request_idx = _try_extract_request_idx(request_func_input)
+
+        # Adjust the ratio manually if needed.
+        use_image_only_prompt = request_idx is None or request_idx % 2 == 0
+
+        if use_image_only_prompt:
+            # Image input
+            request_func_input.prompt = "Represent the given image."
+        else:
+            # Text+Image input
+            request_func_input.prompt = (
+                f"Represent the given image with the following question: "
+                f"{request_func_input.prompt}"
+            )
+
+
+async def async_request_openai_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_openai_embeddings_chat(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
+async def async_request_openai_embeddings_vlm2vec(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    _preprocess_vlm2vec(request_func_input)
+
+    return await async_request_openai_embeddings_chat(
+        request_func_input,
+        session,
+        pbar=pbar,
+        mm_position="first",
+    )
+
+
+async def async_request_infinity_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "Infinity Embeddings API", "embeddings")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+    }
+
+    if request_func_input.prompt:
+        payload["input"] = request_func_input.prompt
+    else:
+        mm_content = request_func_input.multi_modal_content
+        assert isinstance(mm_content, dict)
+
+        mm_type = mm_content["type"]
+        payload["input"] = mm_content[mm_type]["url"]
+        payload["modality"] = mm_type.split("_", 1)[0]
+
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_pooling_request(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_infinity_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_infinity_embeddings(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "vllm": async_request_openai_completions,
@@ -524,10 +760,18 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "openai-chat": async_request_openai_chat_completions,
     "openai-audio": async_request_openai_audio,
     "openai-embeddings": async_request_openai_embeddings,
+    "openai-embeddings-chat": async_request_openai_embeddings_chat,
+    "openai-embeddings-clip": async_request_openai_embeddings_clip,
+    "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec,
+    # Infinity embedding server: https://github.com/michaelfeil/infinity
+    "infinity-embeddings": async_request_infinity_embeddings,
+    "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
+    # (Infinity embedding server does not support vlm2vec)
+    "vllm-rerank": async_request_vllm_rerank,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
-    k for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions,
-             async_request_openai_chat_completions)
+    k
+    for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions, async_request_openai_chat_completions)
 ]
diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py
index 87fc16b55012eb30dde7c3cdca8afaeef455afc2..5649faf05597612426e105817fec71091eb6be37 100644
--- a/vllm/benchmarks/lib/ready_checker.py
+++ b/vllm/benchmarks/lib/ready_checker.py
@@ -8,8 +8,7 @@ import time
 import aiohttp
 from tqdm.asyncio import tqdm
 
-from .endpoint_request_func import (RequestFunc, RequestFuncInput,
-                                    RequestFuncOutput)
+from .endpoint_request_func import RequestFunc, RequestFuncInput, RequestFuncOutput
 
 
 async def wait_for_endpoint(
@@ -21,30 +20,29 @@ async def wait_for_endpoint(
 ) -> RequestFuncOutput:
     """
     Wait for an endpoint to become available before starting benchmarks.
-    
+
     Args:
         request_func: The async request function to call
         test_input: The RequestFuncInput to test with
         timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
         retry_interval: Time between retries in seconds (default: 5 seconds)
-        
+
     Returns:
         RequestFuncOutput: The successful response
-        
+
     Raises:
         ValueError: If the endpoint doesn't become available within the timeout
     """
     deadline = time.perf_counter() + timeout_seconds
     output = RequestFuncOutput(success=False)
     print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
-    
+
     with tqdm(
-        total=timeout_seconds, 
+        total=timeout_seconds,
         bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
         unit="s",
     ) as pbar:
-
-        while True:            
+        while True:
             # update progress bar
             remaining = deadline - time.perf_counter()
             elapsed = timeout_seconds - remaining
@@ -58,16 +56,17 @@ async def wait_for_endpoint(
             # ping the endpoint using request_func
             try:
                 output = await request_func(
-                    request_func_input=test_input, session=session)
+                    request_func_input=test_input, session=session
+                )
                 if output.success:
                     pbar.close()
                     return output
             except aiohttp.ClientConnectorError:
                 pass
-            
+
             # retry after a delay
             sleep_duration = min(retry_interval, remaining)
             if sleep_duration > 0:
                 await asyncio.sleep(sleep_duration)
-    
+
     return output
diff --git a/vllm/benchmarks/lib/utils.py b/vllm/benchmarks/lib/utils.py
index 0c27687dcf16db0d1b84b5b1c1728ef7a793199d..32e9db49900783ea97963778ef9dbae0c8651886 100644
--- a/vllm/benchmarks/lib/utils.py
+++ b/vllm/benchmarks/lib/utils.py
@@ -8,9 +8,9 @@ import os
 from typing import Any
 
 
-def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                        metrics: dict[str, list],
-                                        extra_info: dict[str, Any]) -> list:
+def convert_to_pytorch_benchmark_format(
+    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
+) -> list:
     """
     Save the benchmark results in the format used by PyTorch OSS benchmark with
     on metric per record
@@ -38,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
             },
         }
 
-        tp = record["benchmark"]["extra_info"]["args"].get(
-            "tensor_parallel_size")
+        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
         # Save tensor_parallel_size parameter if it's part of the metadata
         if not tp and "tensor_parallel_size" in extra_info:
-            record["benchmark"]["extra_info"]["args"][
-                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]
+            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
+                extra_info["tensor_parallel_size"]
+            )
 
         records.append(record)
 
@@ -51,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
 
 
 class InfEncoder(json.JSONEncoder):
-
     def clear_inf(self, o: Any):
         if isinstance(o, dict):
             return {
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 2a042802d0d54e12f5e7ae776be5308598adefd7..dddb050ec180e0311b01b57066593e0149da73e3 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -15,66 +15,58 @@ On the client side, run:
         --request-rate <request_rate. Default inf> \
         --num-prompts <num_prompts. Default 1000>
 """
+
 import argparse
 import asyncio
-import gc
+import contextlib
 import importlib.util
 import json
 import os
 import random
 import shutil
 import time
+import uuid
 import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Any, Literal, Optional
+from typing import Any, Literal
 
 import aiohttp
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
-                                      get_samples)
+from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples
 from vllm.benchmarks.lib.endpoint_request_func import (
-    ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
-    RequestFuncOutput)
+    ASYNC_REQUEST_FUNCS,
+    OPENAI_COMPATIBLE_BACKENDS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
-from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
-                                       write_to_json)
+from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.gc_utils import freeze_gc_heap
+from vllm.utils.network_utils import join_host_port
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
-TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
-                          and (shutil.which("gnuplot") is not None))
-
-
-# TODO: Remove this in v0.11.0
-class DeprecatedEndpointTypeAction(argparse.Action):
-    """Argparse action for the deprecated --endpoint-type flag.
-    """
-
-    def __call__(self, _, namespace, values, option_string=None):
-        warnings.warn(
-            "'--endpoint-type' is deprecated and will be removed in v0.11.0. "
-            "Please use '--backend' instead or remove this argument if you "
-            "have already set it.",
-            stacklevel=1,
-        )
-        setattr(namespace, self.dest, values)
+TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) and (
+    shutil.which("gnuplot") is not None
+)
 
 
 class TaskType(Enum):
     GENERATION = "generation"
-    EMBEDDING = "embedding"
+    POOLING = "pooling"
 
 
 @dataclass
 class BenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     total_output: int
     request_throughput: float
@@ -108,6 +100,7 @@ class BenchmarkMetrics:
 @dataclass
 class EmbedBenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     request_throughput: float
     total_token_throughput: float
@@ -118,15 +111,18 @@ class EmbedBenchmarkMetrics:
 
 
 def _get_current_request_rate(
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-    ramp_up_start_rps: Optional[int],
-    ramp_up_end_rps: Optional[int],
+    ramp_up_strategy: Literal["linear", "exponential"] | None,
+    ramp_up_start_rps: int | None,
+    ramp_up_end_rps: int | None,
     request_index: int,
     total_requests: int,
     request_rate: float,
 ) -> float:
-    if (ramp_up_strategy and ramp_up_start_rps is not None
-            and ramp_up_end_rps is not None):
+    if (
+        ramp_up_strategy
+        and ramp_up_start_rps is not None
+        and ramp_up_end_rps is not None
+    ):
         progress = request_index / max(total_requests - 1, 1)
         if ramp_up_strategy == "linear":
             increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
@@ -143,9 +139,9 @@ async def get_request(
     input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
+    ramp_up_strategy: Literal["linear", "exponential"] | None = None,
+    ramp_up_start_rps: int | None = None,
+    ramp_up_end_rps: int | None = None,
 ) -> AsyncGenerator[tuple[SampleRequest, float], None]:
     """
     Asynchronously generates requests at a specified rate
@@ -173,10 +169,10 @@ async def get_request(
             The ending request rate for ramp-up.
     """
     assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}.")
+        f"A positive burstiness factor is expected, but given {burstiness}."
+    )
     # Convert to list to get length for ramp-up calculations
-    if isinstance(input_requests,
-                  Iterable) and not isinstance(input_requests, list):
+    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
         input_requests = list(input_requests)
 
     total_requests = len(input_requests)
@@ -187,11 +183,23 @@ async def get_request(
     delay_ts = []
     for request_index, request in enumerate(input_requests):
         current_request_rate = _get_current_request_rate(
-            ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps,
-            request_index, total_requests, request_rate)
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            request_index,
+            total_requests,
+            request_rate,
+        )
+        assert current_request_rate > 0.0, (
+            f"Obtained non-positive request rate {current_request_rate}."
+        )
         request_rates.append(current_request_rate)
         if current_request_rate == float("inf"):
             delay_ts.append(0)
+        elif burstiness == float("inf"):
+            # when burstiness tends to infinity, the delay time becomes constant
+            # and tends to the inverse of the request rate
+            delay_ts.append(1.0 / current_request_rate)
         else:
             theta = 1.0 / (current_request_rate * burstiness)
 
@@ -228,8 +236,8 @@ async def get_request(
 
 
 def calculate_metrics_for_embeddings(
-        outputs: list[RequestFuncOutput], dur_s: float,
-        selected_percentiles: list[float]) -> EmbedBenchmarkMetrics:
+    outputs: list[RequestFuncOutput], dur_s: float, selected_percentiles: list[float]
+) -> EmbedBenchmarkMetrics:
     """Calculate the metrics for the embedding requests.
 
     Args:
@@ -242,28 +250,34 @@ def calculate_metrics_for_embeddings(
     """
     total_input = 0
     completed = 0
+    failed = 0
     e2els: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             e2els.append(outputs[i].latency)
             completed += 1
             total_input += outputs[i].prompt_len
+        else:
+            failed += 1
 
     if completed == 0:
         warnings.warn(
             "All requests failed. This is likely due to a misconfiguration "
             "on the benchmark arguments.",
-            stacklevel=2)
+            stacklevel=2,
+        )
     metrics = EmbedBenchmarkMetrics(
         completed=completed,
+        failed=failed,
         total_input=total_input,
         request_throughput=completed / dur_s,
         total_token_throughput=total_input / dur_s,
         mean_e2el_ms=np.mean(e2els or 0) * 1000,
         std_e2el_ms=np.std(e2els or 0) * 1000,
         median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
-                             for p in selected_percentiles],
+        percentiles_e2el_ms=[
+            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
+        ],
     )
     return metrics
 
@@ -309,8 +323,10 @@ def calculate_metrics(
                 # bundled together
                 # Note : this may inflate the output token count slightly
                 output_len = len(
-                    tokenizer(outputs[i].generated_text,
-                              add_special_tokens=False).input_ids)
+                    tokenizer(
+                        outputs[i].generated_text, add_special_tokens=False
+                    ).input_ids
+                )
             actual_output_lens.append(output_len)
             total_input += input_requests[i].prompt_len
             tpot = 0
@@ -333,16 +349,19 @@ def calculate_metrics(
 
         if "ttft" in goodput_config_dict:
             valid_metrics.append(ttfts)
-            slo_values.append(goodput_config_dict["ttft"] /
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+            slo_values.append(
+                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
         if "tpot" in goodput_config_dict:
             valid_metrics.append(all_tpots)
-            slo_values.append(goodput_config_dict["tpot"] /
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+            slo_values.append(
+                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
         if "e2el" in goodput_config_dict:
             valid_metrics.append(e2els)
-            slo_values.append(goodput_config_dict["e2el"] /
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+            slo_values.append(
+                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
 
         for req_metric in zip(*valid_metrics):
             is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -353,7 +372,8 @@ def calculate_metrics(
         warnings.warn(
             "All requests failed. This is likely due to a misconfiguration "
             "on the benchmark arguments.",
-            stacklevel=2)
+            stacklevel=2,
+        )
 
     # Calculate max output tokens per second metric
     max_output_tokens_per_s = 0.0
@@ -361,11 +381,12 @@ def calculate_metrics(
 
     # Find the time range across all successful requests
     successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
     if successful_outputs:
-        min_start_time = min(output.start_time
-                             for output in successful_outputs)
-        max_end_time = max(output.start_time + output.latency
-                           for output in successful_outputs)
+        min_start_time = min(output.start_time for output in successful_outputs)
+        max_end_time = max(
+            output.start_time + output.latency for output in successful_outputs
+        )
 
         # Create second buckets (ceiling to ensure we capture all time)
         duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1
@@ -389,8 +410,9 @@ def calculate_metrics(
 
             # Track concurrent requests for each second this request was active
             request_start_second = int(output.start_time - min_start_time)
-            request_end_second = int((output.start_time + output.latency) -
-                                     min_start_time)
+            request_end_second = int(
+                (output.start_time + output.latency) - min_start_time
+            )
 
             for second in range(request_start_second, request_end_second + 1):
                 concurrent_requests_per_second[second] += 1
@@ -399,51 +421,60 @@ def calculate_metrics(
         # concurrent requests
         if len(tokens_per_second) > 0:
             max_output_tokens_per_s = float(np.max(tokens_per_second))
-            max_concurrent_requests = int(
-                np.max(concurrent_requests_per_second))
+            max_concurrent_requests = int(np.max(concurrent_requests_per_second))
 
         if TERM_PLOTLIB_AVAILABLE:
             import termplotlib as tpl
+
             fig = tpl.figure()
-            fig.plot(np.arange(len(tokens_per_second)),
-                     tokens_per_second,
-                     title="Output tokens per second")
-            fig.plot(np.arange(len(concurrent_requests_per_second)),
-                     concurrent_requests_per_second,
-                     title="Concurrent requests per second")
+            fig.plot(
+                np.arange(len(tokens_per_second)),
+                tokens_per_second,
+                title="Output tokens per second",
+            )
+            fig.plot(
+                np.arange(len(concurrent_requests_per_second)),
+                concurrent_requests_per_second,
+                title="Concurrent requests per second",
+            )
             fig.show()
         else:
             print("tip: install termplotlib and gnuplot to plot the metrics")
 
     metrics = BenchmarkMetrics(
         completed=completed,
+        failed=len(failed_outputs),
         total_input=total_input,
         total_output=sum(actual_output_lens),
         request_throughput=completed / dur_s,
         request_goodput=good_completed / dur_s,
         output_throughput=sum(actual_output_lens) / dur_s,
         total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
-        mean_ttft_ms=np.mean(ttfts or 0) *
-        1000,  # ttfts is empty if streaming is not supported by the endpoint
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by the endpoint
         std_ttft_ms=np.std(ttfts or 0) * 1000,
         median_ttft_ms=np.median(ttfts or 0) * 1000,
-        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
-                             for p in selected_percentiles],
+        percentiles_ttft_ms=[
+            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
+        ],
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         std_tpot_ms=np.std(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
-        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
-                             for p in selected_percentiles],
+        percentiles_tpot_ms=[
+            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
+        ],
         mean_itl_ms=np.mean(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
-        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
-                            for p in selected_percentiles],
+        percentiles_itl_ms=[
+            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
+        ],
         mean_e2el_ms=np.mean(e2els or 0) * 1000,
         std_e2el_ms=np.std(e2els or 0) * 1000,
         median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
-                             for p in selected_percentiles],
+        percentiles_e2el_ms=[
+            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
+        ],
         max_output_tokens_per_s=max_output_tokens_per_s,
         max_concurrent_requests=max_concurrent_requests,
     )
@@ -452,6 +483,7 @@ def calculate_metrics(
 
 
 async def benchmark(
+    task_type: TaskType,
     endpoint_type: str,
     api_url: str,
     base_url: str,
@@ -459,33 +491,29 @@ async def benchmark(
     model_name: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: list[SampleRequest],
-    logprobs: Optional[int],
+    logprobs: int | None,
     request_rate: float,
     burstiness: float,
     disable_tqdm: bool,
+    num_warmups: int,
     profile: bool,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     ignore_eos: bool,
     goodput_config_dict: dict[str, float],
-    max_concurrency: Optional[int],
-    lora_modules: Optional[Iterable[str]],
-    extra_headers: Optional[dict],
-    extra_body: Optional[dict],
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
+    max_concurrency: int | None,
+    lora_modules: Iterable[str] | None,
+    extra_headers: dict | None,
+    extra_body: dict | None,
+    ramp_up_strategy: Literal["linear", "exponential"] | None = None,
+    ramp_up_start_rps: int | None = None,
+    ramp_up_end_rps: int | None = None,
     ready_check_timeout_sec: int = 600,
 ):
-    task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else
-                 TaskType.GENERATION)
-    if endpoint_type in ASYNC_REQUEST_FUNCS:
-        if task_type == TaskType.EMBEDDING:
-            request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
-        else:
-            request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
-    else:
-        raise ValueError(f"Unknown backend: {endpoint_type}")
+    try:
+        request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
+    except KeyError:
+        raise ValueError(f"Unknown backend: {endpoint_type}") from None
 
     # Reuses connections across requests to reduce TLS handshake overhead.
     connector = aiohttp.TCPConnector(
@@ -513,10 +541,14 @@ async def benchmark(
         input_requests[0].multi_modal_data,
     )
 
-    assert (test_mm_content is None or isinstance(test_mm_content, dict)
-            or (isinstance(test_mm_content, list)
-                and all(isinstance(item, dict) for item in test_mm_content))
-            ), "multi_modal_data must be a dict or list[dict]"
+    assert (
+        test_mm_content is None
+        or isinstance(test_mm_content, dict)
+        or (
+            isinstance(test_mm_content, list)
+            and all(isinstance(item, dict) for item in test_mm_content)
+        )
+    ), "multi_modal_data must be a dict or list[dict]"
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,
@@ -542,42 +574,75 @@ async def benchmark(
             raise ValueError(
                 "Initial test run failed - Please make sure benchmark "
                 "arguments are correctly specified. "
-                f"Error: {test_output.error}")
+                f"Error: {test_output.error}"
+            )
         else:
-            print("Initial test run completed. Starting main benchmark run...")
+            print("Initial test run completed.")
     else:
         print("Skipping endpoint ready check.")
 
+    if num_warmups > 0:
+        print(f"Warming up with {num_warmups} requests...")
+        warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups)
+        warmup_semaphore = (
+            asyncio.Semaphore(max_concurrency)
+            if max_concurrency
+            else contextlib.nullcontext()
+        )
+        warmup_tasks = []
+
+        async def warmup_limited_request_func():
+            async with warmup_semaphore:
+                return await request_func(
+                    request_func_input=test_input, session=session, pbar=warmup_pbar
+                )
+
+        for _ in range(num_warmups):
+            request_task = asyncio.create_task(warmup_limited_request_func())
+            warmup_tasks.append(request_task)
+        _ = await asyncio.gather(*warmup_tasks)
+
+        if warmup_pbar is not None:
+            warmup_pbar.close()
+        print("Warmup run completed.")
+
+    print("Starting main benchmark run...")
+
     if lora_modules:
         # For each input request, choose a LoRA module at random.
         lora_modules = iter(
-            [random.choice(lora_modules) for _ in range(len(input_requests))])
+            [random.choice(lora_modules) for _ in range(len(input_requests))]
+        )
 
     if profile:
         print("Starting profiler...")
-        profile_input = RequestFuncInput(model=model_id,
-                                         model_name=model_name,
-                                         prompt=test_prompt,
-                                         api_url=base_url + "/start_profile",
-                                         prompt_len=test_prompt_len,
-                                         output_len=test_output_len,
-                                         logprobs=logprobs,
-                                         multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos,
-                                         extra_headers=extra_headers,
-                                         extra_body=extra_body)
-        profile_output = await request_func(request_func_input=profile_input,
-                                            session=session)
+        profile_input = RequestFuncInput(
+            model=model_id,
+            model_name=model_name,
+            prompt=test_prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+            multi_modal_content=test_mm_content,
+            ignore_eos=ignore_eos,
+            extra_headers=extra_headers,
+            extra_body=extra_body,
+        )
+        profile_output = await request_func(
+            request_func_input=profile_input, session=session
+        )
         if profile_output.success:
             print("Profiler started")
 
-    distribution = ("Poisson process"
-                    if burstiness == 1.0 else "Gamma distribution")
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
 
     if ramp_up_strategy is not None:
         print(f"Traffic ramp-up strategy: {ramp_up_strategy}.")
-        print(f"Will increase RPS from {ramp_up_start_rps} to "
-              f"{ramp_up_end_rps} RPS over the duration of the benchmark.")
+        print(
+            f"Will increase RPS from {ramp_up_start_rps} to "
+            f"{ramp_up_end_rps} RPS over the duration of the benchmark."
+        )
     else:
         print(f"Traffic request rate: {request_rate}")
 
@@ -586,22 +651,17 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = (asyncio.Semaphore(max_concurrency)
-                 if max_concurrency else None)
+    semaphore = (
+        asyncio.Semaphore(max_concurrency)
+        if max_concurrency
+        else contextlib.nullcontext()
+    )
 
     async def limited_request_func(request_func_input, session, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input,
-                                      session=session,
-                                      pbar=pbar)
         async with semaphore:
-            return await request_func(request_func_input=request_func_input,
-                                      session=session,
-                                      pbar=pbar)
+            return await request_func(
+                request_func_input=request_func_input, session=session, pbar=pbar
+            )
 
     benchmark_start_time = time.perf_counter()
     tasks: list[asyncio.Task] = []
@@ -610,23 +670,27 @@ async def benchmark(
     last_int_rps = -1
     if ramp_up_strategy is not None and ramp_up_start_rps is not None:
         last_int_rps = ramp_up_start_rps
-        rps_change_events.append({
-            "rps": last_int_rps,
-            "timestamp": datetime.now().isoformat(),
-        })
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
 
     async for request, current_request_rate in get_request(
-            input_requests, request_rate, burstiness, ramp_up_strategy,
-            ramp_up_start_rps, ramp_up_end_rps):
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
         if ramp_up_strategy is not None:
             current_int_rps = int(current_request_rate)
             if current_int_rps > last_int_rps:
                 timestamp = datetime.now().isoformat()
                 for rps_val in range(last_int_rps + 1, current_int_rps + 1):
-                    rps_change_events.append({
-                        "rps": rps_val,
-                        "timestamp": timestamp
-                    })
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
                 last_int_rps = current_int_rps
         prompt, prompt_len, output_len, mm_content, request_id = (
             request.prompt,
@@ -656,9 +720,11 @@ async def benchmark(
         )
         tasks.append(
             asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input,
-                                     session=session,
-                                     pbar=pbar)))
+                limited_request_func(
+                    request_func_input=request_func_input, session=session, pbar=pbar
+                )
+            )
+        )
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
 
     if pbar is not None:
@@ -683,45 +749,59 @@ async def benchmark(
         )
         actual_output_lens = 0
 
-    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
     if max_concurrency is not None:
-        print("{:<40} {:<10}".format("Maximum request concurrency:",
-                                     max_concurrency))
-    if request_rate != float('inf'):
-        print("{:<40} {:<10.2f}".format("Request rate configured (RPS):",
-                                        request_rate))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
-                                    benchmark_duration))
+        print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
+    if request_rate != float("inf"):
+        print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
     if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10}".format("Total generated tokens:",
-                                     metrics.total_output))
-    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
-                                    metrics.request_throughput))
+        print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
     if goodput_config_dict:
-        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
-                                        metrics.request_goodput))
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Request goodput (req/s):", metrics.request_goodput
+            )
+        )
     if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
-                                        metrics.output_throughput))
-        print("{:<40} {:<10.2f}".format(
-            "Peak output token throughput (tok/s):",
-            metrics.max_output_tokens_per_s))
-        print("{:<40} {:<10.2f}".format("Peak concurrent requests:",
-                                        metrics.max_concurrent_requests))
-    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
-                                    metrics.total_token_throughput))
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Output token throughput (tok/s):", metrics.output_throughput
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Peak concurrent requests:", metrics.max_concurrent_requests
+            )
+        )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total Token throughput (tok/s):", metrics.total_token_throughput
+        )
+    )
 
     if isinstance(metrics, BenchmarkMetrics):
         result = {
             "duration": benchmark_duration,
             "completed": metrics.completed,
+            "failed": metrics.failed,
             "total_input_tokens": metrics.total_input,
             "total_output_tokens": metrics.total_output,
             "request_throughput": metrics.request_throughput,
-            "request_goodput":
-            metrics.request_goodput if goodput_config_dict else None,
+            "request_goodput": metrics.request_goodput if goodput_config_dict else None,
             "output_throughput": metrics.output_throughput,
             "total_token_throughput": metrics.total_token_throughput,
             "input_lens": [output.prompt_len for output in outputs],
@@ -759,30 +839,36 @@ async def benchmark(
         # metric.
         if metric_attribute_name not in selected_percentile_metrics:
             return
-        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
-        print("{:<40} {:<10.2f}".format(
-            f"Mean {metric_name} (ms):",
-            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
-        print("{:<40} {:<10.2f}".format(
-            f"Median {metric_name} (ms):",
-            getattr(metrics, f"median_{metric_attribute_name}_ms")))
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Mean {metric_name} (ms):",
+                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Median {metric_name} (ms):",
+                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+            )
+        )
         result[f"mean_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"mean_{metric_attribute_name}_ms")
+            metrics, f"mean_{metric_attribute_name}_ms"
+        )
         result[f"median_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"median_{metric_attribute_name}_ms")
+            metrics, f"median_{metric_attribute_name}_ms"
+        )
         result[f"std_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"std_{metric_attribute_name}_ms")
-        for p, value in getattr(metrics,
-                                f"percentiles_{metric_attribute_name}_ms"):
+            metrics, f"std_{metric_attribute_name}_ms"
+        )
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
             p_word = str(int(p)) if int(p) == p else str(p)
-            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
-                                            value))
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
             result[f"p{p_word}_{metric_attribute_name}_ms"] = value
 
     if task_type == TaskType.GENERATION:
         process_one_metric("ttft", "TTFT", "Time to First Token")
-        process_one_metric("tpot", "TPOT",
-                           "Time per Output Token (excl. 1st token)")
+        process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
         process_one_metric("itl", "ITL", "Inter-token Latency")
     process_one_metric("e2el", "E2EL", "End-to-end Latency")
 
@@ -798,8 +884,9 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
         )
-        profile_output = await request_func(request_func_input=profile_input,
-                                            session=session)
+        profile_output = await request_func(
+            request_func_input=profile_input, session=session
+        )
         if profile_output.success:
             print("Profiler stopped")
 
@@ -818,12 +905,14 @@ def check_goodput_args(args):
                 raise ValueError(
                     f"Invalid metric name found, {slo_name}: {slo_val}. "
                     "The service level objective name should be one of "
-                    f"{str(VALID_NAMES)}. ")
+                    f"{str(VALID_NAMES)}. "
+                )
             if slo_val < 0:
                 raise ValueError(
                     f"Invalid value found, {slo_name}: {slo_val}. "
                     "The service level objective value should be "
-                    "non-negative.")
+                    "non-negative."
+                )
     return goodput_config_dict
 
 
@@ -836,31 +925,42 @@ def parse_goodput(slo_pairs):
     except ValueError as err:
         raise argparse.ArgumentTypeError(
             "Invalid format found for service level objectives. "
-            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            'Specify service level objectives for goodput as "KEY:VALUE" '
             "pairs, where the key is a metric name, and the value is a "
-            "number in milliseconds.") from err
+            "number in milliseconds."
+        ) from err
     return goodput_config_dict
 
 
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: dict[str, Any],
-                                     file_name: str) -> None:
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any], file_name: str
+) -> None:
     metrics = [
-        "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
-        "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "median_ttft_ms",
+        "mean_ttft_ms",
+        "std_ttft_ms",
+        "p99_ttft_ms",
+        "mean_tpot_ms",
+        "median_tpot_ms",
+        "std_tpot_ms",
+        "p99_tpot_ms",
+        "median_itl_ms",
+        "mean_itl_ms",
+        "std_itl_ms",
+        "p99_itl_ms",
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
     ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
-        metrics={k: [results[k]]
-                 for k in metrics if k in results},
+        metrics={k: [results[k]] for k in metrics if k in results},
         extra_info={
             k: results[k]
-            for k in results if k not in metrics and k not in ignored_metrics
-        })
+            for k in results
+            if k not in metrics and k not in ignored_metrics
+        },
+    )
     if pt_records:
         # Don't use json suffix here as we don't want CI to pick it up
         pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
@@ -881,16 +981,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         type=str,
         default="openai",
         choices=list(ASYNC_REQUEST_FUNCS.keys()),
-        help="The type of backend or endpoint to use for the benchmark."
-    )
-    parser.add_argument(
-        "--endpoint-type",
-        type=str,
-        default=None,
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-        action=DeprecatedEndpointTypeAction,
-        help="'--endpoint-type' is deprecated and will be removed in v0.11.0. "
-        "Please use '--backend' instead.",
+        help="The type of backend or endpoint to use for the benchmark.",
     )
     parser.add_argument(
         "--base-url",
@@ -912,9 +1003,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
         metavar="KEY=VALUE",
         nargs="*",
         help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
-        "for headers to be passed with each request. These headers override " \
-        "per backend constants and values set via environment variable, and " \
-        "will be overriden by other arguments (such as request ids)."
+        "for headers to be passed with each request. These headers override "
+        "per backend constants and values set via environment variable, and "
+        "will be overriden by other arguments (such as request ids).",
     )
     parser.add_argument(
         "--max-concurrency",
@@ -939,19 +1030,20 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument(
         "--logprobs",
         type=int,
         default=None,
-        help=("Number of logprobs-per-token to compute & return as part of "
-              "the request. If unspecified, then either (1) if beam search "
-              "is disabled, no logprobs are computed & a single dummy "
-              "logprob is returned for each token; or (2) if beam search "
-              "is enabled 1 logprob per token is computed"),
+        help=(
+            "Number of logprobs-per-token to compute & return as part of "
+            "the request. If unspecified, then either (1) if beam search "
+            "is disabled, no logprobs are computed & a single dummy "
+            "logprob is returned for each token; or (2) if beam search "
+            "is enabled 1 logprob per token is computed"
+        ),
     )
     parser.add_argument(
         "--request-rate",
@@ -984,6 +1076,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="Specify to disable tqdm progress bar.",
     )
+    parser.add_argument(
+        "--num-warmups",
+        type=int,
+        default=0,
+        help="Number of warmup requests.",
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
@@ -1034,32 +1132,36 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--ignore-eos",
         action="store_true",
         help="Set ignore_eos flag when sending the benchmark request."
-        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+    )
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl",
+        default=None,
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
+        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
+        'If not specified, defaults to "ttft,tpot,itl" for generative models '
+        'and "e2el" for pooling models.',
+    )
     parser.add_argument(
         "--metric-percentiles",
         type=str,
         default="99",
         help="Comma-separated list of percentiles for selected metrics. "
-        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
-        "Default value is \"99\"."
-        "Use \"--percentile-metrics\" to select metrics.",
+        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
+        'Default value is "99".'
+        'Use "--percentile-metrics" to select metrics.',
     )
     parser.add_argument(
         "--goodput",
         nargs="+",
         required=False,
-        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        help='Specify service level objectives for goodput as "KEY:VALUE" '
         "pairs, where the key is a metric name, and the value is in "
-        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
         "separated by spaces. Allowed request level metric names are "
-        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        '"ttft", "tpot", "e2el". For more context on the definition of '
         "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
         "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
     )
@@ -1067,7 +1169,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--request-id-prefix",
         type=str,
         required=False,
-        default="benchmark-serving",
+        default=f"bench-{uuid.uuid4().hex[:8]}-",
         help="Specify the prefix of request id.",
     )
 
@@ -1076,22 +1178,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--top-p",
         type=float,
         default=None,
-        help="Top-p sampling parameter. Only has effect on "
-        "openai-compatible backends.",
+        help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
     )
     sampling_group.add_argument(
         "--top-k",
         type=int,
         default=None,
-        help="Top-k sampling parameter. Only has effect on "
-        "openai-compatible backends.",
+        help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
     )
     sampling_group.add_argument(
         "--min-p",
         type=float,
         default=None,
-        help="Min-p sampling parameter. Only has effect on "
-        "openai-compatible backends.",
+        help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
     )
     sampling_group.add_argument(
         "--temperature",
@@ -1101,31 +1200,57 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "openai-compatible backends. If not specified, default to greedy "
         "decoding (i.e. temperature==0.0).",
     )
+    sampling_group.add_argument(
+        "--frequency-penalty",
+        type=float,
+        default=None,
+        help="Frequency penalty sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--presence-penalty",
+        type=float,
+        default=None,
+        help="Presence penalty sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--repetition-penalty",
+        type=float,
+        default=None,
+        help="Repetition penalty sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
 
     parser.add_argument(
-        '--tokenizer-mode',
+        "--tokenizer-mode",
         type=str,
         default="auto",
-        choices=['auto', 'slow', 'mistral', 'custom'],
+        choices=["auto", "slow", "mistral", "custom"],
         help='The tokenizer mode.\n\n* "auto" will use the '
         'fast tokenizer if available.\n* "slow" will '
-        'always use the slow tokenizer. \n* '
+        "always use the slow tokenizer. \n* "
         '"mistral" will always use the `mistral_common` tokenizer. \n*'
-        '"custom" will use --tokenizer to select the preregistered tokenizer.')
-
-    parser.add_argument("--served-model-name",
-                        type=str,
-                        default=None,
-                        help="The model name used in the API. "
-                        "If not specified, the model name will be the "
-                        "same as the ``--model`` argument. ")
-
-    parser.add_argument("--lora-modules",
-                        nargs='+',
-                        default=None,
-                        help="A subset of LoRA module names passed in when "
-                        "launching the server. For each request, the "
-                        "script chooses a LoRA module at random.")
+        '"custom" will use --tokenizer to select the preregistered tokenizer.',
+    )
+
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        default=None,
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
+        "same as the `--model` argument. ",
+    )
+
+    parser.add_argument(
+        "--lora-modules",
+        nargs="+",
+        default=None,
+        help="A subset of LoRA module names passed in when "
+        "launching the server. For each request, the "
+        "script chooses a LoRA module at random.",
+    )
 
     parser.add_argument(
         "--ramp-up-strategy",
@@ -1135,7 +1260,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ramp-up strategy. This would be used to "
         "ramp up the request rate from initial RPS to final "
         "RPS rate (specified by --ramp-up-start-rps and "
-        "--ramp-up-end-rps.) over the duration of the benchmark.")
+        "--ramp-up-end-rps.) over the duration of the benchmark.",
+    )
     parser.add_argument(
         "--ramp-up-start-rps",
         type=int,
@@ -1156,7 +1282,16 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default=600,
         help="Maximum time to wait for the endpoint to become ready "
         "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
-        "the ready check will be skipped."
+        "the ready check will be skipped.",
+    )
+
+    parser.add_argument(
+        "--extra-body",
+        help="A JSON string representing extra body parameters to include "
+        "in each request."
+        'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'',
+        type=json.loads,
+        default=None,
     )
 
 
@@ -1175,19 +1310,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             raise ValueError(
                 "When using ramp-up, do not specify --request-rate. "
                 "The request rate will be controlled by ramp-up parameters. "
-                "Please remove the --request-rate argument.")
+                "Please remove the --request-rate argument."
+            )
         if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
             raise ValueError(
                 "When using --ramp-up-strategy, both --ramp-up-start-rps and "
-                "--ramp-up-end-rps must be specified")
+                "--ramp-up-end-rps must be specified"
+            )
         if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
             raise ValueError("Ramp-up start and end RPS must be non-negative")
         if args.ramp_up_start_rps > args.ramp_up_end_rps:
             raise ValueError("Ramp-up start RPS must be less than end RPS")
-        if (args.ramp_up_strategy == "exponential"
-                and args.ramp_up_start_rps == 0):
-            raise ValueError(
-                "For exponential ramp-up, the start RPS cannot be 0.")
+        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
+            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
 
     label = args.label
     model_id = args.model
@@ -1199,8 +1334,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         api_url = f"{args.base_url}{args.endpoint}"
         base_url = f"{args.base_url}"
     else:
-        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
-        base_url = f"http://{args.host}:{args.port}"
+        host_port = join_host_port(args.host, args.port)
+        api_url = f"http://{host_port}{args.endpoint}"
+        base_url = f"http://{host_port}"
 
     # Headers
     headers = None
@@ -1211,47 +1347,80 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
                 kvstring = item.split("=", 1)
                 headers[kvstring[0].strip()] = kvstring[1].strip()
             else:
-                raise ValueError(
-                    "Invalid header format. Please use KEY=VALUE format.")
+                raise ValueError("Invalid header format. Please use KEY=VALUE format.")
 
-    tokenizer = get_tokenizer(tokenizer_id,
-                              tokenizer_mode=tokenizer_mode,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        tokenizer_mode=tokenizer_mode,
+        trust_remote_code=args.trust_remote_code,
+    )
 
     if args.dataset_name is None:
         raise ValueError(
             "Please specify '--dataset-name' and the corresponding "
-            "'--dataset-path' if required.")
+            "'--dataset-path' if required."
+        )
+
+    # when using random datasets, default to ignoring EOS
+    # so generation runs to the requested length
+    if (
+        args.dataset_name in ("random", "random-mm")
+        and args.backend in OPENAI_COMPATIBLE_BACKENDS
+    ):
+        args.ignore_eos = True
 
     # Load the dataset.
     input_requests = get_samples(args, tokenizer)
     goodput_config_dict = check_goodput_args(args)
 
+    backend = args.backend
+    task_type = (
+        TaskType.POOLING
+        if "embeddings" in backend or "rerank" in backend
+        else TaskType.GENERATION
+    )
+
     # Collect the sampling parameters.
-    sampling_params = {
-        k: v
-        for k, v in {
-            "top_p": args.top_p,
-            "top_k": args.top_k,
-            "min_p": args.min_p,
-            "temperature": args.temperature,
-        }.items() if v is not None
-    }
-
-    # Sampling parameters are only supported by openai-compatible backend.
-    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
-        raise ValueError("Sampling parameters are only supported by "
-                         "openai-compatible backends.")
-
-    if "temperature" not in sampling_params:
-        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+    if task_type == TaskType.GENERATION:
+        sampling_params = {
+            k: v
+            for k, v in {
+                "top_p": args.top_p,
+                "top_k": args.top_k,
+                "min_p": args.min_p,
+                "temperature": args.temperature,
+                "frequency_penalty": args.frequency_penalty,
+                "presence_penalty": args.presence_penalty,
+                "repetition_penalty": args.repetition_penalty,
+            }.items()
+            if v is not None
+        }
+
+        # Sampling parameters are only supported by openai-compatible backend.
+        if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
+            raise ValueError(
+                "Sampling parameters are only supported by openai-compatible backends."
+            )
+
+        if "temperature" not in sampling_params:
+            sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+
+        default_percentile_metrics = "ttft,tpot,itl"
+    else:
+        sampling_params = {}
+        default_percentile_metrics = "e2el"
+
+    extra_body = args.extra_body or {}
+    extra_body = {**sampling_params, **extra_body}
+
+    percentile_metrics: str = args.percentile_metrics or default_percentile_metrics
 
     # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
+    freeze_gc_heap()
 
     benchmark_result = await benchmark(
-        endpoint_type=args.backend,
+        task_type=task_type,
+        endpoint_type=backend,
         api_url=api_url,
         base_url=base_url,
         model_id=model_id,
@@ -1262,17 +1431,16 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         request_rate=args.request_rate,
         burstiness=args.burstiness,
         disable_tqdm=args.disable_tqdm,
+        num_warmups=args.num_warmups,
         profile=args.profile,
-        selected_percentile_metrics=args.percentile_metrics.split(","),
-        selected_percentiles=[
-            float(p) for p in args.metric_percentiles.split(",")
-        ],
+        selected_percentile_metrics=percentile_metrics.split(","),
+        selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
         ignore_eos=args.ignore_eos,
         goodput_config_dict=goodput_config_dict,
         max_concurrency=args.max_concurrency,
         lora_modules=args.lora_modules,
         extra_headers=headers,
-        extra_body=sampling_params,
+        extra_body=extra_body,
         ramp_up_strategy=args.ramp_up_strategy,
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
@@ -1285,7 +1453,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     # Setup
     current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
     result_json["date"] = current_dt
-    result_json["endpoint_type"] = args.backend # for backward compatibility
+    result_json["endpoint_type"] = args.backend  # for backward compatibility
     result_json["backend"] = args.backend
     result_json["label"] = label
     result_json["model_id"] = model_id
@@ -1300,11 +1468,13 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
                 result_json[kvstring[0].strip()] = kvstring[1].strip()
             else:
                 raise ValueError(
-                    "Invalid metadata format. Please use KEY=VALUE format.")
+                    "Invalid metadata format. Please use KEY=VALUE format."
+                )
 
     # Traffic
-    result_json["request_rate"] = (args.request_rate if args.request_rate
-                                   < float("inf") else "inf")
+    result_json["request_rate"] = (
+        args.request_rate if args.request_rate < float("inf") else "inf"
+    )
     result_json["burstiness"] = args.burstiness
     result_json["max_concurrency"] = args.max_concurrency
 
@@ -1319,12 +1489,12 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     if not args.save_detailed:
         # Remove fields with too many data points
         for field in [
-                "input_lens",
-                "output_lens",
-                "ttfts",
-                "itls",
-                "generated_texts",
-                "errors",
+            "input_lens",
+            "output_lens",
+            "ttfts",
+            "itls",
+            "generated_texts",
+            "errors",
         ]:
             if field in result_json:
                 del result_json[field]
@@ -1334,8 +1504,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         # Save to file
     if args.save_result or args.append_result:
         base_model_id = model_id.split("/")[-1]
-        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
-                               if args.max_concurrency is not None else "")
+        max_concurrency_str = (
+            f"-concurrency{args.max_concurrency}"
+            if args.max_concurrency is not None
+            else ""
+        )
         label = label or args.backend
         if args.ramp_up_strategy is not None:
             file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
@@ -1346,9 +1519,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         if args.result_dir:
             os.makedirs(args.result_dir, exist_ok=True)
             file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name,
-                  mode="a+" if args.append_result else "w",
-                  encoding="utf-8") as outfile:
+        with open(
+            file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
+        ) as outfile:
             # Append a newline.
             if args.append_result and outfile.tell() != 0:
                 outfile.write("\n")
diff --git a/vllm/benchmarks/sweep/__init__.py b/vllm/benchmarks/sweep/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..108cd75690864c810cfbcaeab09c444d7ba47d8c
--- /dev/null
+++ b/vllm/benchmarks/sweep/cli.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
+
+from .plot import SweepPlotArgs
+from .plot import main as plot_main
+from .serve import SweepServeArgs
+from .serve import main as serve_main
+from .serve_sla import SweepServeSLAArgs
+from .serve_sla import main as serve_sla_main
+
+SUBCOMMANDS = (
+    (SweepServeArgs, serve_main),
+    (SweepServeSLAArgs, serve_sla_main),
+    (SweepPlotArgs, plot_main),
+)
+
+
+def add_cli_args(parser: argparse.ArgumentParser):
+    subparsers = parser.add_subparsers(required=True, dest="sweep_type")
+
+    for cmd, entrypoint in SUBCOMMANDS:
+        cmd_subparser = subparsers.add_parser(
+            cmd.parser_name,
+            description=cmd.parser_help,
+            usage=f"vllm bench sweep {cmd.parser_name} [options]",
+        )
+        cmd_subparser.set_defaults(dispatch_function=entrypoint)
+        cmd.add_cli_args(cmd_subparser)
+        cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
+            subcmd=f"sweep {cmd.parser_name}"
+        )
+
+
+def main(args: argparse.Namespace):
+    args.dispatch_function(args)
diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
new file mode 100644
index 0000000000000000000000000000000000000000..986561ed8502aa4f3c052c9c493d212dd4cbf42f
--- /dev/null
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import os
+from typing import Any
+
+
+class ParameterSweep(list["ParameterSweepItem"]):
+    @classmethod
+    def read_json(cls, filepath: os.PathLike):
+        with open(filepath, "rb") as f:
+            records = json.load(f)
+
+        return cls.from_records(records)
+
+    @classmethod
+    def from_records(cls, records: list[dict[str, object]]):
+        if not isinstance(records, list):
+            raise TypeError(
+                f"The parameter sweep should be a list of dictionaries, "
+                f"but found type: {type(records)}"
+            )
+
+        return cls(ParameterSweepItem.from_record(record) for record in records)
+
+
+class ParameterSweepItem(dict[str, object]):
+    @classmethod
+    def from_record(cls, record: dict[str, object]):
+        if not isinstance(record, dict):
+            raise TypeError(
+                f"Each item in the parameter sweep should be a dictionary, "
+                f"but found type: {type(record)}"
+            )
+
+        return cls(record)
+
+    def __or__(self, other: dict[str, Any]):
+        return type(self)(super().__or__(other))
+
+    # In JSON, we prefer "_"
+    def _iter_param_key_candidates(self, param_key: str):
+        # Inner config arguments are not converted by the CLI
+        if "." in param_key:
+            prefix, rest = param_key.split(".", 1)
+            for prefix_candidate in self._iter_param_key_candidates(prefix):
+                yield prefix_candidate + "." + rest
+
+            return
+
+        yield param_key
+        yield param_key.replace("-", "_")
+        yield param_key.replace("_", "-")
+
+    # In CLI, we prefer "-"
+    def _iter_cmd_key_candidates(self, param_key: str):
+        for k in reversed(tuple(self._iter_param_key_candidates(param_key))):
+            yield "--" + k
+
+    def _normalize_cmd_key(self, param_key: str):
+        return next(self._iter_cmd_key_candidates(param_key))
+
+    def has_param(self, param_key: str) -> bool:
+        return any(k in self for k in self._iter_param_key_candidates(param_key))
+
+    def apply_to_cmd(self, cmd: list[str]) -> list[str]:
+        cmd = list(cmd)
+
+        for k, v in self.items():
+            for k_candidate in self._iter_cmd_key_candidates(k):
+                try:
+                    k_idx = cmd.index(k_candidate)
+
+                    if isinstance(v, bool):
+                        cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k)
+                    else:
+                        cmd[k_idx + 1] = str(v)
+
+                    break
+                except ValueError:
+                    continue
+            else:
+                if isinstance(v, bool):
+                    cmd.append(self._normalize_cmd_key(k if v else "no-" + k))
+                else:
+                    cmd.extend([self._normalize_cmd_key(k), str(v)])
+
+        return cmd
+
+    def as_text(self, sep: str = ", ") -> str:
+        return sep.join(f"{k}={v}" for k, v in self.items())
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..9947d6170d891d36271c4d5f0d6cc66276cdb34c
--- /dev/null
+++ b/vllm/benchmarks/sweep/plot.py
@@ -0,0 +1,580 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import json
+from abc import ABC, abstractmethod
+from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass
+from functools import partial
+from pathlib import Path
+from types import TracebackType
+from typing import ClassVar
+
+from typing_extensions import Self, override
+
+from vllm.utils.collection_utils import full_groupby
+from vllm.utils.import_utils import PlaceholderModule
+
+from .utils import sanitize_filename
+
+try:
+    import matplotlib.pyplot as plt
+    import pandas as pd
+    import seaborn as sns
+except ImportError:
+    plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
+    pd = PlaceholderModule("pandas")
+    seaborn = PlaceholderModule("seaborn")
+
+
+@dataclass
+class PlotFilterBase(ABC):
+    var: str
+    target: str
+
+    @classmethod
+    def parse_str(cls, s: str):
+        for op_key in PLOT_FILTERS:
+            if op_key in s:
+                key, value = s.split(op_key)
+                return PLOT_FILTERS[op_key](
+                    key,
+                    value.removeprefix(op_key).strip("'").strip('"'),
+                )
+        else:
+            raise ValueError(
+                f"Invalid operator for plot filter '{s}'. "
+                f"Valid operators are: {sorted(PLOT_FILTERS)}",
+            )
+
+    @abstractmethod
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        """Applies this filter to a DataFrame."""
+        raise NotImplementedError
+
+
+@dataclass
+class PlotEqualTo(PlotFilterBase):
+    @override
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        try:
+            target = float(self.target)
+        except ValueError:
+            target = self.target
+
+        return df[df[self.var] == target]
+
+
+@dataclass
+class PlotLessThan(PlotFilterBase):
+    @override
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        return df[df[self.var] < float(self.target)]
+
+
+@dataclass
+class PlotLessThanOrEqualTo(PlotFilterBase):
+    @override
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        return df[df[self.var] <= float(self.target)]
+
+
+@dataclass
+class PlotGreaterThan(PlotFilterBase):
+    @override
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        return df[df[self.var] > float(self.target)]
+
+
+@dataclass
+class PlotGreaterThanOrEqualTo(PlotFilterBase):
+    @override
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        return df[df[self.var] >= float(self.target)]
+
+
+# NOTE: The ordering is important! Match longer op_keys first
+PLOT_FILTERS: dict[str, type[PlotFilterBase]] = {
+    "==": PlotEqualTo,
+    "<=": PlotLessThanOrEqualTo,
+    ">=": PlotGreaterThanOrEqualTo,
+    "<": PlotLessThan,
+    ">": PlotGreaterThan,
+}
+
+
+class PlotFilters(list[PlotFilterBase]):
+    @classmethod
+    def parse_str(cls, s: str):
+        if not s:
+            return cls()
+
+        return cls(PlotFilterBase.parse_str(e) for e in s.split(","))
+
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        for item in self:
+            df = item.apply(df)
+
+        return df
+
+
+@dataclass
+class PlotBinner:
+    var: str
+    bin_size: float
+
+    @classmethod
+    def parse_str(cls, s: str):
+        for op_key in PLOT_BINNERS:
+            if op_key in s:
+                key, value = s.split(op_key)
+                return PLOT_BINNERS[op_key](key, float(value.removeprefix(op_key)))
+        else:
+            raise ValueError(
+                f"Invalid operator for plot binner '{s}'. "
+                f"Valid operators are: {sorted(PLOT_BINNERS)}",
+            )
+
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        """Applies this binner to a DataFrame."""
+        df = df.copy()
+        df[self.var] = df[self.var] // self.bin_size * self.bin_size
+        return df
+
+
+PLOT_BINNERS: dict[str, type[PlotBinner]] = {
+    "%": PlotBinner,
+}
+
+
+class PlotBinners(list[PlotBinner]):
+    @classmethod
+    def parse_str(cls, s: str):
+        if not s:
+            return cls()
+
+        return cls(PlotBinner.parse_str(e) for e in s.split(","))
+
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        for item in self:
+            df = item.apply(df)
+
+        return df
+
+
+def _json_load_bytes(path: Path) -> list[dict[str, object]]:
+    with path.open("rb") as f:
+        return json.load(f)
+
+
+def _get_metric(run_data: dict[str, object], metric_key: str):
+    try:
+        return run_data[metric_key]
+    except KeyError as exc:
+        raise ValueError(f"Cannot find metric {metric_key!r} in {run_data=}") from exc
+
+
+def _get_group(run_data: dict[str, object], group_keys: list[str]):
+    return tuple((k, str(_get_metric(run_data, k))) for k in group_keys)
+
+
+def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]):
+    parts = list[str]()
+    if group:
+        parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group)))
+    else:
+        parts.append("figure")
+
+    return fig_dir / sanitize_filename("-".join(parts) + ".png")
+
+
+class DummyExecutor:
+    map = map
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        return None
+
+
+def _plot_fig(
+    fig_dir: Path,
+    fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]],
+    row_by: list[str],
+    col_by: list[str],
+    curve_by: list[str],
+    *,
+    var_x: str,
+    var_y: str,
+    filter_by: PlotFilters,
+    bin_by: PlotBinners,
+    scale_x: str | None,
+    scale_y: str | None,
+    dry_run: bool,
+):
+    fig_group, fig_data = fig_group_data
+
+    row_groups = full_groupby(
+        fig_data,
+        key=lambda item: _get_group(item, row_by),
+    )
+    num_rows = len(row_groups)
+    num_cols = max(
+        len(full_groupby(row_data, key=lambda item: _get_group(item, col_by)))
+        for _, row_data in row_groups
+    )
+
+    fig_path = _get_fig_path(fig_dir, fig_group)
+
+    print("[BEGIN FIGURE]")
+    print(f"Group: {dict(fig_group)}")
+    print(f"Grid: {num_rows} rows x {num_cols} cols")
+    print(f"Output file: {fig_path}")
+
+    if dry_run:
+        print("[END FIGURE]")
+        return
+
+    df = pd.DataFrame.from_records(fig_data)
+
+    if var_x not in df.columns:
+        raise ValueError(
+            f"Cannot find {var_x=!r} in parameter sweep results. "
+            f"Available variables: {df.columns.tolist()}"
+        )
+    if var_y not in df.columns:
+        raise ValueError(
+            f"Cannot find {var_y=!r} in parameter sweep results. "
+            f"Available variables: {df.columns.tolist()}"
+        )
+    for k in row_by:
+        if k not in df.columns:
+            raise ValueError(
+                f"Cannot find row_by={k!r} in parameter sweep results. "
+                f"Available variables: {df.columns.tolist()}"
+            )
+    for k in col_by:
+        if k not in df.columns:
+            raise ValueError(
+                f"Cannot find col_by={k!r} in parameter sweep results. "
+                f"Available variables: {df.columns.tolist()}"
+            )
+    for k in curve_by:
+        if k not in df.columns:
+            raise ValueError(
+                f"Cannot find curve_by={k!r} in parameter sweep results. "
+                f"Available variables: {df.columns.tolist()}"
+            )
+
+    df = filter_by.apply(df)
+    df = bin_by.apply(df)
+
+    df["row_group"] = (
+        pd.concat(
+            [k + "=" + df[k].astype(str) for k in row_by],
+            axis=1,
+        ).agg("\n".join, axis=1)
+        if row_by
+        else "(All)"
+    )
+
+    df["col_group"] = (
+        pd.concat(
+            [k + "=" + df[k].astype(str) for k in col_by],
+            axis=1,
+        ).agg("\n".join, axis=1)
+        if col_by
+        else "(All)"
+    )
+
+    g = sns.FacetGrid(df, row="row_group", col="col_group")
+
+    if row_by and col_by:
+        g.set_titles("{row_name}\n{col_name}")
+    elif row_by:
+        g.set_titles("{row_name}")
+    elif col_by:
+        g.set_titles("{col_name}")
+    else:
+        g.set_titles("")
+
+    if scale_x:
+        g.set(xscale=scale_x)
+    if scale_y:
+        g.set(yscale=scale_y)
+
+    if len(curve_by) <= 3:
+        hue, style, size, *_ = (*curve_by, None, None, None)
+
+        g.map_dataframe(
+            sns.lineplot,
+            x=var_x,
+            y=var_y,
+            hue=hue,
+            style=style,
+            size=size,
+            markers=True,
+        )
+
+        g.add_legend(title=hue)
+    else:
+        df["curve_group"] = (
+            pd.concat(
+                [k + "=" + df[k].astype(str) for k in curve_by],
+                axis=1,
+            ).agg("\n".join, axis=1)
+            if curve_by
+            else "(All)"
+        )
+
+        g.map_dataframe(
+            sns.lineplot,
+            x=var_x,
+            y=var_y,
+            hue="curve_group",
+            markers=True,
+        )
+
+        g.add_legend()
+
+    g.savefig(fig_path)
+    plt.close(g.figure)
+
+    print("[END FIGURE]")
+
+
+def plot(
+    output_dir: Path,
+    fig_dir: Path,
+    fig_by: list[str],
+    row_by: list[str],
+    col_by: list[str],
+    curve_by: list[str],
+    *,
+    var_x: str,
+    var_y: str,
+    filter_by: PlotFilters,
+    bin_by: PlotBinners,
+    scale_x: str | None,
+    scale_y: str | None,
+    dry_run: bool,
+):
+    all_data = [
+        run_data
+        for path in output_dir.rglob("**/summary.json")
+        for run_data in _json_load_bytes(path)
+    ]
+
+    if not all_data:
+        raise ValueError(f"Did not find any parameter sweep results under {output_dir}")
+
+    fig_dir.mkdir(parents=True, exist_ok=True)
+
+    fig_groups = full_groupby(
+        all_data,
+        key=lambda item: _get_group(item, fig_by),
+    )
+
+    with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor:
+        # Resolve the iterable to ensure that the workers are run
+        all(
+            executor.map(
+                partial(
+                    _plot_fig,
+                    fig_dir,
+                    row_by=row_by,
+                    col_by=col_by,
+                    curve_by=curve_by,
+                    var_x=var_x,
+                    var_y=var_y,
+                    filter_by=filter_by,
+                    bin_by=bin_by,
+                    scale_x=scale_x,
+                    scale_y=scale_y,
+                    dry_run=dry_run,
+                ),
+                fig_groups,
+            )
+        )
+
+
+@dataclass
+class SweepPlotArgs:
+    output_dir: Path
+    fig_dir: Path
+    fig_by: list[str]
+    row_by: list[str]
+    col_by: list[str]
+    curve_by: list[str]
+    var_x: str
+    var_y: str
+    filter_by: PlotFilters
+    bin_by: PlotBinners
+    scale_x: str | None
+    scale_y: str | None
+    dry_run: bool
+
+    parser_name: ClassVar[str] = "plot"
+    parser_help: ClassVar[str] = "Plot performance curves from parameter sweep results."
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        output_dir = Path(args.OUTPUT_DIR)
+        if not output_dir.exists():
+            raise ValueError(f"No parameter sweep results under {output_dir}")
+
+        curve_by = [] if not args.curve_by else args.curve_by.split(",")
+        row_by = [] if not args.row_by else args.row_by.split(",")
+        col_by = [] if not args.col_by else args.col_by.split(",")
+        fig_by = [] if not args.fig_by else args.fig_by.split(",")
+
+        return cls(
+            output_dir=output_dir,
+            fig_dir=output_dir / args.fig_dir,
+            fig_by=fig_by,
+            row_by=row_by,
+            col_by=col_by,
+            curve_by=curve_by,
+            var_x=args.var_x,
+            var_y=args.var_y,
+            filter_by=PlotFilters.parse_str(args.filter_by),
+            bin_by=PlotBinners.parse_str(args.bin_by),
+            scale_x=args.scale_x,
+            scale_y=args.scale_y,
+            dry_run=args.dry_run,
+        )
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        parser.add_argument(
+            "OUTPUT_DIR",
+            type=str,
+            default="results",
+            help="The directory containing the results to plot, "
+            "i.e., the `--output-dir` argument to the parameter sweep script.",
+        )
+        parser.add_argument(
+            "--fig-dir",
+            type=str,
+            default="",
+            help="The directory to save the figures, relative to `OUTPUT_DIR`. "
+            "By default, the same directory is used.",
+        )
+        parser.add_argument(
+            "--fig-by",
+            type=str,
+            default="",
+            help="A comma-separated list of variables, such that a separate figure "
+            "is created for each combination of these variables.",
+        )
+        parser.add_argument(
+            "--row-by",
+            type=str,
+            default="",
+            help="A comma-separated list of variables, such that a separate row "
+            "is created for each combination of these variables.",
+        )
+        parser.add_argument(
+            "--col-by",
+            type=str,
+            default="",
+            help="A comma-separated list of variables, such that a separate column "
+            "is created for each combination of these variables.",
+        )
+        parser.add_argument(
+            "--curve-by",
+            type=str,
+            default=None,
+            help="A comma-separated list of variables, such that a separate curve "
+            "is created for each combination of these variables.",
+        )
+        parser.add_argument(
+            "--var-x",
+            type=str,
+            default="request_throughput",
+            help="The variable for the x-axis.",
+        )
+        parser.add_argument(
+            "--var-y",
+            type=str,
+            default="p99_e2el_ms",
+            help="The variable for the y-axis",
+        )
+        parser.add_argument(
+            "--filter-by",
+            type=str,
+            default="",
+            help="A comma-separated list of statements indicating values to filter by. "
+            "This is useful to remove outliers. "
+            "Example: `max_concurrency<1000,max_num_batched_tokens<=4096` means "
+            "plot only the points where `max_concurrency` is less than 1000 and "
+            "`max_num_batched_tokens` is no greater than 4096.",
+        )
+        parser.add_argument(
+            "--bin-by",
+            type=str,
+            default="",
+            help="A comma-separated list of statements indicating values to bin by. "
+            "This is useful to avoid plotting points that are too close together. "
+            "Example: `request_throughput%%1` means "
+            "use a bin size of 1 for the `request_throughput` variable.",
+        )
+        parser.add_argument(
+            "--scale-x",
+            type=str,
+            default=None,
+            help="The scale to use for the x-axis. "
+            "Currently only accepts string values such as 'log' and 'sqrt'. "
+            "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html",
+        )
+        parser.add_argument(
+            "--scale-y",
+            type=str,
+            default=None,
+            help="The scale to use for the y-axis. "
+            "Currently only accepts string values such as 'log' and 'sqrt'. "
+            "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="If set, prints the information about each figure to plot, "
+            "then exits without drawing them.",
+        )
+
+        return parser
+
+
+def run_main(args: SweepPlotArgs):
+    return plot(
+        output_dir=args.output_dir,
+        fig_dir=args.fig_dir,
+        fig_by=args.fig_by,
+        row_by=args.row_by,
+        col_by=args.col_by,
+        curve_by=args.curve_by,
+        var_x=args.var_x,
+        var_y=args.var_y,
+        filter_by=args.filter_by,
+        bin_by=args.bin_by,
+        scale_x=args.scale_x,
+        scale_y=args.scale_y,
+        dry_run=args.dry_run,
+    )
+
+
+def main(args: argparse.Namespace):
+    run_main(SweepPlotArgs.from_cli_args(args))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=SweepPlotArgs.parser_help)
+    SweepPlotArgs.add_cli_args(parser)
+
+    main(parser.parse_args())
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
new file mode 100644
index 0000000000000000000000000000000000000000..45ac446a7aedfd48af83b376bc5e4953312ea890
--- /dev/null
+++ b/vllm/benchmarks/sweep/serve.py
@@ -0,0 +1,416 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import contextlib
+import json
+import shlex
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import ClassVar
+
+from vllm.utils.import_utils import PlaceholderModule
+
+from .param_sweep import ParameterSweep, ParameterSweepItem
+from .server import ServerProcess
+from .utils import sanitize_filename
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
+
+
+@contextlib.contextmanager
+def run_server(
+    serve_cmd: list[str],
+    after_bench_cmd: list[str],
+    *,
+    show_stdout: bool,
+    serve_overrides: ParameterSweepItem,
+    dry_run: bool,
+):
+    server_cmd = serve_overrides.apply_to_cmd(serve_cmd)
+
+    print("[BEGIN SERVER]")
+    print(f"Server overrides: {serve_overrides}")
+    print(f"Server command: {server_cmd}")
+
+    if dry_run:
+        yield None
+        print("[END SERVER]")
+        return
+
+    with ServerProcess(server_cmd, after_bench_cmd, show_stdout=show_stdout) as server:
+        yield server
+
+    print("[END SERVER]")
+
+
+def _update_run_data(
+    run_data: dict[str, object],
+    serve_overrides: ParameterSweepItem,
+    bench_overrides: ParameterSweepItem,
+    run_number: int,
+):
+    run_data["run_number"] = run_number
+    run_data.update(serve_overrides)
+    run_data.update(bench_overrides)
+
+    return run_data
+
+
+def run_benchmark(
+    server: ServerProcess | None,
+    bench_cmd: list[str],
+    *,
+    serve_overrides: ParameterSweepItem,
+    bench_overrides: ParameterSweepItem,
+    run_number: int,
+    output_path: Path,
+    dry_run: bool,
+):
+    benchmark_cmd = [
+        *bench_overrides.apply_to_cmd(bench_cmd),
+        "--percentile-metrics",
+        "ttft,tpot,itl,e2el",
+        "--save-result",
+        "--result-dir",
+        str(output_path.parent),
+        "--result-filename",
+        output_path.name,
+    ]
+
+    print("[BEGIN BENCHMARK]")
+    print(f"Benchmark overrides: {bench_overrides}")
+    print(f"Run Number: {run_number}")
+    print(f"Benchmark command: {benchmark_cmd}")
+    print(f"Output file: {output_path}")
+
+    run_data: dict[str, object]
+
+    if output_path.exists():
+        print("Found existing results. Skipping.")
+
+        with output_path.open("rb") as f:
+            run_data = json.load(f)
+            return _update_run_data(
+                run_data,
+                serve_overrides,
+                bench_overrides,
+                run_number,
+            )
+
+    if server is None:
+        if not dry_run:
+            raise ValueError(f"Cannot find results at {output_path}")
+
+        print("[END BENCHMARK]")
+        return None
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    server.run_subcommand(benchmark_cmd)
+    server.after_bench()
+
+    with output_path.open("rb") as f:
+        run_data = json.load(f)
+
+    run_data = _update_run_data(
+        run_data,
+        serve_overrides,
+        bench_overrides,
+        run_number,
+    )
+
+    with output_path.open("w") as f:
+        json.dump(run_data, f, indent=4)
+
+    print("[END BENCHMARK]")
+
+    return run_data
+
+
+def _get_comb_base_path(
+    output_dir: Path,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+):
+    parts = list[str]()
+    if serve_comb:
+        parts.extend(("SERVE-", serve_comb.as_text(sep="-")))
+    if bench_comb:
+        parts.extend(("BENCH-", bench_comb.as_text(sep="-")))
+
+    return output_dir / sanitize_filename("-".join(parts))
+
+
+def _get_comb_run_path(base_path: Path, run_number: int | None):
+    if run_number is None:
+        return base_path / "summary.json"
+
+    return base_path / f"run={run_number}.json"
+
+
+def _comb_needs_server(
+    serve_comb: ParameterSweepItem,
+    bench_combs: ParameterSweep,
+    output_dir: Path,
+):
+    for bench_comb in bench_combs:
+        base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
+        if not _get_comb_run_path(base_path, run_number=None).exists():
+            return True
+
+    return False
+
+
+def run_comb(
+    server: ServerProcess | None,
+    bench_cmd: list[str],
+    *,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+    base_path: Path,
+    num_runs: int,
+    dry_run: bool,
+):
+    comb_data = list[dict[str, object]]()
+
+    for run_number in range(num_runs):
+        run_data = run_benchmark(
+            server,
+            bench_cmd,
+            serve_overrides=serve_comb,
+            bench_overrides=bench_comb,
+            run_number=run_number,
+            output_path=_get_comb_run_path(base_path, run_number),
+            dry_run=dry_run,
+        )
+
+        if run_data is not None:
+            comb_data.append(run_data)
+
+    if dry_run:
+        return None
+
+    with _get_comb_run_path(base_path, run_number=None).open("w") as f:
+        json.dump(comb_data, f, indent=4)
+
+    return comb_data
+
+
+def run_combs(
+    serve_cmd: list[str],
+    bench_cmd: list[str],
+    after_bench_cmd: list[str],
+    *,
+    show_stdout: bool,
+    serve_params: ParameterSweep,
+    bench_params: ParameterSweep,
+    output_dir: Path,
+    num_runs: int,
+    dry_run: bool,
+):
+    all_data = list[dict[str, object]]()
+    for serve_comb in serve_params:
+        with (
+            run_server(
+                serve_cmd,
+                after_bench_cmd,
+                show_stdout=show_stdout,
+                serve_overrides=serve_comb,
+                dry_run=dry_run,
+            )
+            if _comb_needs_server(serve_comb, bench_params, output_dir)
+            else contextlib.nullcontext()
+        ) as server:
+            for bench_comb in bench_params:
+                base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
+
+                comb_data = run_comb(
+                    server,
+                    bench_cmd,
+                    serve_comb=serve_comb,
+                    bench_comb=bench_comb,
+                    base_path=base_path,
+                    num_runs=num_runs,
+                    dry_run=dry_run,
+                )
+
+                if comb_data is not None:
+                    all_data.extend(comb_data)
+
+    if dry_run:
+        return None
+
+    combined_df = pd.DataFrame.from_records(all_data)
+    combined_df.to_csv(output_dir / "summary.csv")
+
+    return combined_df
+
+
+@dataclass
+class SweepServeArgs:
+    serve_cmd: list[str]
+    bench_cmd: list[str]
+    after_bench_cmd: list[str]
+    show_stdout: bool
+    serve_params: ParameterSweep
+    bench_params: ParameterSweep
+    output_dir: Path
+    num_runs: int
+    dry_run: bool
+    resume: str | None
+
+    parser_name: ClassVar[str] = "serve"
+    parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings."
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        serve_cmd = shlex.split(args.serve_cmd)
+        bench_cmd = shlex.split(args.bench_cmd)
+        after_bench_cmd = (
+            [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd)
+        )
+
+        if args.serve_params:
+            serve_params = ParameterSweep.read_json(args.serve_params)
+        else:
+            # i.e.: run serve_cmd without any modification
+            serve_params = ParameterSweep.from_records([{}])
+
+        if args.bench_params:
+            bench_params = ParameterSweep.read_json(args.bench_params)
+        else:
+            # i.e.: run bench_cmd without any modification
+            bench_params = ParameterSweep.from_records([{}])
+
+        num_runs = args.num_runs
+        if num_runs < 1:
+            raise ValueError("`num_runs` should be at least 1.")
+
+        return cls(
+            serve_cmd=serve_cmd,
+            bench_cmd=bench_cmd,
+            after_bench_cmd=after_bench_cmd,
+            show_stdout=args.show_stdout,
+            serve_params=serve_params,
+            bench_params=bench_params,
+            output_dir=Path(args.output_dir),
+            num_runs=num_runs,
+            dry_run=args.dry_run,
+            resume=args.resume,
+        )
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        parser.add_argument(
+            "--serve-cmd",
+            type=str,
+            required=True,
+            help="The command used to run the server: `vllm serve ...`",
+        )
+        parser.add_argument(
+            "--bench-cmd",
+            type=str,
+            required=True,
+            help="The command used to run the benchmark: `vllm bench serve ...`",
+        )
+        parser.add_argument(
+            "--after-bench-cmd",
+            type=str,
+            default=None,
+            help="After a benchmark run is complete, invoke this command instead of "
+            "the default `ServerWrapper.clear_cache()`.",
+        )
+        parser.add_argument(
+            "--show-stdout",
+            action="store_true",
+            help="If set, logs the standard output of subcommands. "
+            "Useful for debugging but can be quite spammy.",
+        )
+        parser.add_argument(
+            "--serve-params",
+            type=str,
+            default=None,
+            help="Path to JSON file containing a list of parameter combinations "
+            "for the `vllm serve` command. "
+            "If both `serve_params` and `bench_params` are given, "
+            "this script will iterate over their Cartesian product.",
+        )
+        parser.add_argument(
+            "--bench-params",
+            type=str,
+            default=None,
+            help="Path to JSON file containing a list of parameter combinations "
+            "for the `vllm bench serve` command. "
+            "If both `serve_params` and `bench_params` are given, "
+            "this script will iterate over their Cartesian product.",
+        )
+        parser.add_argument(
+            "-o",
+            "--output-dir",
+            type=str,
+            default="results",
+            help="The directory to which results are written.",
+        )
+        parser.add_argument(
+            "--num-runs",
+            type=int,
+            default=3,
+            help="Number of runs per parameter combination.",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="If set, prints the commands to run, "
+            "then exits without executing them.",
+        )
+        parser.add_argument(
+            "--resume",
+            type=str,
+            default=None,
+            help="Set this to the name of a directory under `output_dir` (which is a "
+            "timestamp) to resume a previous execution of this script, i.e., only run "
+            "parameter combinations for which there are still no output files.",
+        )
+
+        return parser
+
+
+def run_main(args: SweepServeArgs):
+    timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = args.output_dir / timestamp
+
+    if args.resume and not output_dir.exists():
+        raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")
+
+    try:
+        return run_combs(
+            serve_cmd=args.serve_cmd,
+            bench_cmd=args.bench_cmd,
+            after_bench_cmd=args.after_bench_cmd,
+            show_stdout=args.show_stdout,
+            serve_params=args.serve_params,
+            bench_params=args.bench_params,
+            output_dir=output_dir,
+            num_runs=args.num_runs,
+            dry_run=args.dry_run,
+        )
+    except BaseException as exc:
+        raise RuntimeError(
+            f"The script was terminated early. Use `--resume {timestamp}` "
+            f"to continue the script from its last checkpoint."
+        ) from exc
+
+
+def main(args: argparse.Namespace):
+    run_main(SweepServeArgs.from_cli_args(args))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=SweepServeArgs.parser_help)
+    SweepServeArgs.add_cli_args(parser)
+
+    main(parser.parse_args())
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py
new file mode 100644
index 0000000000000000000000000000000000000000..0403d1ddfd6c115f14e94f6c342e750b6e5064c8
--- /dev/null
+++ b/vllm/benchmarks/sweep/serve_sla.py
@@ -0,0 +1,492 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import contextlib
+import json
+import math
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import ClassVar, Literal, get_args
+
+from typing_extensions import assert_never
+
+from vllm.utils.import_utils import PlaceholderModule
+
+from .param_sweep import ParameterSweep, ParameterSweepItem
+from .serve import SweepServeArgs, run_benchmark, run_server
+from .server import ServerProcess
+from .sla_sweep import SLASweep, SLASweepItem
+from .utils import sanitize_filename
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
+
+
+def _get_sla_base_path(
+    output_dir: Path,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+):
+    parts = list[str]()
+    if serve_comb:
+        parts.extend(("SERVE-", serve_comb.as_text(sep="-")))
+    if bench_comb:
+        parts.extend(("BENCH-", bench_comb.as_text(sep="-")))
+
+    return output_dir / sanitize_filename("-".join(parts))
+
+
+def _get_sla_iter_path(
+    base_path: Path,
+    sla_comb: SLASweepItem,
+    sla_variable: str,
+    sla_value: int | None,
+):
+    if sla_value is None:
+        prefix = sla_comb.as_text(sep="-")
+        return base_path / f"SLA--{prefix}.json"
+
+    return base_path / f"{sla_variable}={sla_value}"
+
+
+def _get_sla_run_path(iter_path: Path, run_number: int | None):
+    if run_number is None:
+        return iter_path / "summary.json"
+
+    return iter_path / f"run={run_number}.json"
+
+
+def _sla_needs_server(
+    serve_comb: ParameterSweepItem,
+    bench_combs: ParameterSweep,
+    sla_combs: SLASweep,
+    sla_variable: str,
+    output_dir: Path,
+):
+    for bench_comb in bench_combs:
+        base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb)
+        for sla_comb in sla_combs:
+            if not _get_sla_iter_path(
+                base_path,
+                sla_comb,
+                sla_variable,
+                sla_value=None,
+            ).exists():
+                return True
+
+    return False
+
+
+def run_sla(
+    server: ServerProcess | None,
+    bench_cmd: list[str],
+    *,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+    iter_path: Path,
+    num_runs: int,
+    dry_run: bool,
+):
+    iter_data = list[dict[str, object]]()
+
+    for run_number in range(num_runs):
+        run_data = run_benchmark(
+            server,
+            bench_cmd,
+            serve_overrides=serve_comb,
+            bench_overrides=bench_comb,
+            run_number=run_number,
+            output_path=_get_sla_run_path(iter_path, run_number),
+            dry_run=dry_run,
+        )
+
+        if run_data is not None:
+            iter_data.append(run_data)
+
+    if dry_run:
+        return None
+
+    with _get_sla_run_path(iter_path, run_number=None).open("w") as f:
+        json.dump(iter_data, f, indent=4)
+
+    return iter_data
+
+
+SLAVariable = Literal["request_rate", "max_concurrency"]
+
+
+def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable):
+    request_throughput = float(run_data["request_throughput"])  # type: ignore
+    if sla_variable == "request_rate":
+        return request_throughput
+    if sla_variable == "max_concurrency":
+        mean_latency_ms = float(run_data["mean_e2el_ms"])  # type: ignore
+        return request_throughput * mean_latency_ms / 1000
+
+    assert_never(sla_variable)
+
+
+def _estimate_sla_bounds(
+    server: ServerProcess | None,
+    bench_cmd: list[str],
+    *,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+    sla_comb: SLASweepItem,
+    base_path: Path,
+    num_runs: int,
+    dry_run: bool,
+    sla_variable: SLAVariable,
+    init_value: int,
+    max_value: int,
+):
+    sla_data = list[dict[str, object]]()
+
+    max_passing: int = 0
+    min_failing: int = 0
+
+    val: int = init_value
+    assert val > 0
+
+    while True:
+        print(f"Testing {sla_variable}: {val} req/s")
+
+        iter_data = run_sla(
+            server,
+            bench_cmd,
+            serve_comb=serve_comb,
+            bench_comb=bench_comb | {sla_variable: val},
+            iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val),
+            num_runs=num_runs,
+            dry_run=dry_run,
+        )
+
+        assert iter_data is not None
+        sla_data.extend(iter_data)
+
+        iter_data_mean = {
+            k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data)  # type: ignore
+            for k in sla_comb
+        }
+
+        sla_results = [
+            criterion.print_and_validate(iter_data_mean, k)
+            for k, criterion in sla_comb.items()
+        ]
+
+        if all(sla_results):
+            print("SLA criteria are met.")
+            max_passing = val
+            val *= 2
+        else:
+            print("SLA criteria are not met.")
+            min_failing = val
+            break
+
+        if val >= max_value:
+            break
+
+    return sla_data, (max_passing, min_failing)
+
+
+def _find_sla_value(
+    server: ServerProcess | None,
+    bench_cmd: list[str],
+    *,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+    sla_comb: SLASweepItem,
+    base_path: Path,
+    num_runs: int,
+    dry_run: bool,
+    sla_variable: SLAVariable,
+    min_value: int,
+    max_value: int,
+):
+    sla_data = list[dict[str, object]]()
+
+    left: int = min_value
+    right: int = max_value
+
+    while True:
+        val = (left + right) // 2
+        print(f"Testing {sla_variable}: {val} req/s")
+
+        iter_data = run_sla(
+            server,
+            bench_cmd,
+            serve_comb=serve_comb,
+            bench_comb=bench_comb | {sla_variable: val},
+            iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val),
+            num_runs=num_runs,
+            dry_run=dry_run,
+        )
+
+        assert iter_data is not None
+        sla_data.extend(iter_data)
+
+        iter_data_mean = {
+            k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data)  # type: ignore
+            for k in sla_comb
+        }
+
+        sla_results = [
+            criterion.print_and_validate(iter_data_mean, k)
+            for k, criterion in sla_comb.items()
+        ]
+
+        if all(sla_results):
+            print("SLA criteria are met.")
+            left = val
+        else:
+            print("SLA criteria are not met.")
+            right = val
+
+        if right - left <= 1:
+            break
+
+    return sla_data, left
+
+
+def search_sla(
+    server: ServerProcess | None,
+    bench_cmd: list[str],
+    *,
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+    sla_comb: SLASweepItem,
+    sla_variable: SLAVariable,
+    sla_inf_value: int = 65536,  # The value that represents infinite QPS
+    base_path: Path,
+    num_runs: int,
+    dry_run: bool,
+):
+    print("[SLA START]")
+    print(f"SLA criteria: {sla_comb.as_text()}")
+
+    sla_data_0 = run_sla(
+        server,
+        bench_cmd,
+        serve_comb=serve_comb,
+        bench_comb=bench_comb | {sla_variable: sla_inf_value},
+        iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value),
+        num_runs=num_runs,
+        dry_run=dry_run,
+    )
+    if sla_data_0 is None:
+        assert dry_run
+        print("Omitting SLA search.")
+        print("[SLA END]")
+        return None
+
+    sla_init_value = math.ceil(
+        sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0)
+        / len(sla_data_0)
+    )
+    print(f"Initial {sla_variable} to search: {sla_init_value} req/s.")
+
+    sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds(
+        server,
+        bench_cmd,
+        serve_comb=serve_comb,
+        bench_comb=bench_comb,
+        sla_comb=sla_comb,
+        base_path=base_path,
+        num_runs=num_runs,
+        dry_run=dry_run,
+        sla_variable=sla_variable,
+        init_value=sla_init_value,
+        max_value=sla_inf_value,
+    )
+    print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.")
+
+    sla_data_2, sla_value = _find_sla_value(
+        server,
+        bench_cmd,
+        serve_comb=serve_comb,
+        bench_comb=bench_comb,
+        sla_comb=sla_comb,
+        base_path=base_path,
+        num_runs=num_runs,
+        dry_run=dry_run,
+        sla_variable=sla_variable,
+        min_value=sla_min,
+        max_value=sla_max,
+    )
+
+    sla_data = sla_data_0 + sla_data_1 + sla_data_2
+    print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.")
+
+    with _get_sla_iter_path(
+        base_path,
+        sla_comb,
+        sla_variable,
+        sla_value=None,
+    ).open("w") as f:
+        json.dump(sla_data, f, indent=4)
+
+    print("[SLA END]")
+
+    return sla_data
+
+
+def run_slas(
+    serve_cmd: list[str],
+    bench_cmd: list[str],
+    after_bench_cmd: list[str],
+    *,
+    show_stdout: bool,
+    serve_params: ParameterSweep,
+    bench_params: ParameterSweep,
+    sla_params: SLASweep,
+    sla_variable: SLAVariable,
+    output_dir: Path,
+    num_runs: int,
+    dry_run: bool,
+):
+    if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params):
+        raise ValueError(
+            f"You should not override `{sla_variable}` in `bench_params` in SLA mode, "
+            "since it is supposed to be determined automatically."
+        )
+
+    all_data = list[dict[str, object]]()
+    for serve_comb in serve_params:
+        with (
+            run_server(
+                serve_cmd,
+                after_bench_cmd,
+                show_stdout=show_stdout,
+                serve_overrides=serve_comb,
+                dry_run=dry_run,
+            )
+            if _sla_needs_server(
+                serve_comb,
+                bench_params,
+                sla_params,
+                sla_variable,
+                output_dir,
+            )
+            else contextlib.nullcontext()
+        ) as server:
+            for bench_comb in bench_params:
+                for sla_comb in sla_params:
+                    base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb)
+
+                    comb_data = search_sla(
+                        server,
+                        bench_cmd,
+                        serve_comb=serve_comb,
+                        bench_comb=bench_comb,
+                        sla_comb=sla_comb,
+                        sla_variable=sla_variable,
+                        base_path=base_path,
+                        num_runs=num_runs,
+                        dry_run=dry_run,
+                    )
+
+                    if comb_data is not None:
+                        all_data.extend(comb_data)
+
+    if dry_run:
+        return None
+
+    combined_df = pd.DataFrame.from_records(all_data)
+    combined_df.to_csv(output_dir / "summary.csv")
+
+    return combined_df
+
+
+@dataclass
+class SweepServeSLAArgs(SweepServeArgs):
+    sla_params: SLASweep
+    sla_variable: SLAVariable
+
+    parser_name: ClassVar[str] = "serve_sla"
+    parser_help: ClassVar[str] = "Tune a variable to meet SLAs under multiple settings."
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # NOTE: Don't use super() as `from_cli_args` calls `cls()`
+        base_args = SweepServeArgs.from_cli_args(args)
+
+        if args.sla_params:
+            sla_params = SLASweep.read_json(args.sla_params)
+        else:
+            sla_params = SLASweep.from_records([])
+
+        return cls(
+            **asdict(base_args),
+            sla_params=sla_params,
+            sla_variable=args.sla_variable,
+        )
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        parser = super().add_cli_args(parser)
+
+        sla_group = parser.add_argument_group("sla options")
+        sla_group.add_argument(
+            "--sla-params",
+            type=str,
+            required=True,
+            help="Path to JSON file containing a list of SLA constraints to satisfy. "
+            'Each constraint is expressed in `{"<KEY>": "<OP><VALUE>"}` format, '
+            'e.g.: `{"p99_e2el_ms": "<=500"}` means that '
+            "the E2E latency should be less than 500ms 99%% of the time. "
+            "Setting this option runs this script in SLA mode, which searches for "
+            "the maximum `sla_variable` that satisfies the constraints for "
+            "each combination of `serve_params`, `bench_params`, and `sla_params`.",
+        )
+        sla_group.add_argument(
+            "--sla-variable",
+            type=str,
+            choices=get_args(SLAVariable),
+            default="request_rate",
+            help="Whether to tune request rate or maximum concurrency to satisfy "
+            "the SLA constraints.",
+        )
+
+        return parser
+
+
+def run_main(args: SweepServeSLAArgs):
+    timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = args.output_dir / timestamp
+
+    if args.resume and not output_dir.exists():
+        raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")
+
+    try:
+        return run_slas(
+            serve_cmd=args.serve_cmd,
+            bench_cmd=args.bench_cmd,
+            after_bench_cmd=args.after_bench_cmd,
+            show_stdout=args.show_stdout,
+            serve_params=args.serve_params,
+            bench_params=args.bench_params,
+            sla_params=args.sla_params,
+            sla_variable=args.sla_variable,
+            output_dir=output_dir,
+            num_runs=args.num_runs,
+            dry_run=args.dry_run,
+        )
+    except BaseException as exc:
+        raise RuntimeError(
+            f"The script was terminated early. Use `--resume {timestamp}` "
+            f"to continue the script from its last checkpoint."
+        ) from exc
+
+
+def main(args: argparse.Namespace):
+    run_main(SweepServeSLAArgs.from_cli_args(args))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=SweepServeSLAArgs.parser_help)
+    SweepServeSLAArgs.add_cli_args(parser)
+
+    main(parser.parse_args())
diff --git a/vllm/benchmarks/sweep/server.py b/vllm/benchmarks/sweep/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..f17578726415f34e680e0033539af8fefd472b55
--- /dev/null
+++ b/vllm/benchmarks/sweep/server.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import os
+import signal
+import subprocess
+from types import TracebackType
+
+import requests
+from typing_extensions import Self
+
+
+class ServerProcess:
+    def __init__(
+        self,
+        server_cmd: list[str],
+        after_bench_cmd: list[str],
+        *,
+        show_stdout: bool,
+    ) -> None:
+        super().__init__()
+
+        self.server_cmd = server_cmd
+        self.after_bench_cmd = after_bench_cmd
+        self.show_stdout = show_stdout
+
+    def __enter__(self) -> Self:
+        self.start()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        self.stop()
+
+    def start(self):
+        # Create new process for clean termination
+        self._server_process = subprocess.Popen(
+            self.server_cmd,
+            start_new_session=True,
+            stdout=None if self.show_stdout else subprocess.DEVNULL,
+            # Need `VLLM_SERVER_DEV_MODE=1` for `_reset_caches`
+            env=os.environ | {"VLLM_SERVER_DEV_MODE": "1"},
+        )
+
+    def stop(self):
+        server_process = self._server_process
+
+        if server_process.poll() is None:
+            # In case only some processes have been terminated
+            with contextlib.suppress(ProcessLookupError):
+                # We need to kill both API Server and Engine processes
+                os.killpg(os.getpgid(server_process.pid), signal.SIGKILL)
+
+    def run_subcommand(self, cmd: list[str]):
+        return subprocess.run(
+            cmd,
+            stdout=None if self.show_stdout else subprocess.DEVNULL,
+            check=True,
+        )
+
+    def after_bench(self) -> None:
+        if not self.after_bench_cmd:
+            self.reset_caches()
+            return
+
+        self.run_subcommand(self.after_bench_cmd)
+
+    def _get_vllm_server_address(self) -> str:
+        server_cmd = self.server_cmd
+
+        for host_key in ("--host",):
+            if host_key in server_cmd:
+                host = server_cmd[server_cmd.index(host_key) + 1]
+                break
+        else:
+            host = "localhost"
+
+        for port_key in ("-p", "--port"):
+            if port_key in server_cmd:
+                port = int(server_cmd[server_cmd.index(port_key) + 1])
+                break
+        else:
+            port = 8000  # The default value in vllm serve
+
+        return f"http://{host}:{port}"
+
+    def reset_caches(self) -> None:
+        server_cmd = self.server_cmd
+
+        # Use `.endswith()` to match `/bin/...`
+        if server_cmd[0].endswith("vllm"):
+            server_address = self._get_vllm_server_address()
+            print(f"Resetting caches at {server_address}")
+
+            res = requests.post(f"{server_address}/reset_prefix_cache")
+            res.raise_for_status()
+
+            res = requests.post(f"{server_address}/reset_mm_cache")
+            res.raise_for_status()
+        elif server_cmd[0].endswith("infinity_emb"):
+            if "--vector-disk-cache" in server_cmd:
+                raise NotImplementedError(
+                    "Infinity server uses caching but does not expose a method "
+                    "to reset the cache"
+                )
+        else:
+            raise NotImplementedError(
+                f"No implementation of `reset_caches` for `{server_cmd[0]}` server. "
+                "Please specify a custom command via `--after-bench-cmd`."
+            )
diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py
new file mode 100644
index 0000000000000000000000000000000000000000..327e3c7c5897a7ce3f512243ab7d501cecea38d1
--- /dev/null
+++ b/vllm/benchmarks/sweep/sla_sweep.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import os
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from typing_extensions import override
+
+
+@dataclass
+class SLACriterionBase(ABC):
+    target: float
+
+    @abstractmethod
+    def validate(self, actual: float) -> bool:
+        """Return `True` if this criterion is met; otherwise `False`."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_cond(self, lhs: str) -> str:
+        raise NotImplementedError
+
+    def print_and_validate(
+        self,
+        metrics: dict[str, float],
+        metrics_key: str,
+    ) -> bool:
+        metric = metrics[metrics_key]
+        result = self.validate(metric)
+
+        cond = self.format_cond(f"{metrics_key} = {metric:.2f}")
+        print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED"))
+
+        return result
+
+
+@dataclass
+class SLALessThan(SLACriterionBase):
+    @override
+    def validate(self, actual: float) -> bool:
+        return actual < self.target
+
+    @override
+    def format_cond(self, lhs: str) -> str:
+        return f"{lhs}<{self.target:.2f}"
+
+
+@dataclass
+class SLALessThanOrEqualTo(SLACriterionBase):
+    @override
+    def validate(self, actual: float) -> bool:
+        return actual <= self.target
+
+    @override
+    def format_cond(self, lhs: str) -> str:
+        return f"{lhs}<={self.target:.2f}"
+
+
+@dataclass
+class SLAGreaterThan(SLACriterionBase):
+    @override
+    def validate(self, actual: float) -> bool:
+        return actual > self.target
+
+    @override
+    def format_cond(self, lhs: str) -> str:
+        return f"{lhs}>{self.target:.2f}"
+
+
+@dataclass
+class SLAGreaterThanOrEqualTo(SLACriterionBase):
+    @override
+    def validate(self, actual: float) -> bool:
+        return actual >= self.target
+
+    @override
+    def format_cond(self, lhs: str) -> str:
+        return f"{lhs}>={self.target:.2f}"
+
+
+# NOTE: The ordering is important! Match longer op_keys first
+SLA_CRITERIA: dict[str, type[SLACriterionBase]] = {
+    "<=": SLALessThanOrEqualTo,
+    ">=": SLAGreaterThanOrEqualTo,
+    "<": SLALessThan,
+    ">": SLAGreaterThan,
+}
+
+
+class SLASweep(list["SLASweepItem"]):
+    @classmethod
+    def read_json(cls, filepath: os.PathLike):
+        with open(filepath, "rb") as f:
+            records = json.load(f)
+
+        return cls.from_records(records)
+
+    @classmethod
+    def from_records(cls, records: list[dict[str, str]]):
+        if not isinstance(records, list):
+            raise TypeError(
+                f"The SLA sweep should be a list of dictionaries, "
+                f"but found type: {type(records)}"
+            )
+
+        return cls(SLASweepItem.from_record(record) for record in records)
+
+
+class SLASweepItem(dict[str, SLACriterionBase]):
+    @classmethod
+    def from_record(cls, record: dict[str, str]):
+        sla_criteria: dict[str, SLACriterionBase] = {}
+
+        for metric_key, metric_value in record.items():
+            for op_key in SLA_CRITERIA:
+                if metric_value.startswith(op_key):
+                    sla_criteria[metric_key] = SLA_CRITERIA[op_key](
+                        float(metric_value.removeprefix(op_key))
+                    )
+                    break
+            else:
+                raise ValueError(
+                    f"Invalid operator for "
+                    f"SLA constraint '{metric_key}={metric_value}'. "
+                    f"Valid operators are: {sorted(SLA_CRITERIA)}",
+                )
+
+        return cls(sla_criteria)
+
+    def as_text(self, sep: str = ", ") -> str:
+        return sep.join(v.format_cond(k) for k, v in self.items())
diff --git a/vllm/benchmarks/sweep/utils.py b/vllm/benchmarks/sweep/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..49d7867eaf483b5717dcebf3210f0a8f23d2864f
--- /dev/null
+++ b/vllm/benchmarks/sweep/utils.py
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+def sanitize_filename(filename: str) -> str:
+    return filename.replace("/", "_").replace("..", "__").strip("'").strip('"')
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 96e39fd92eba0784db246176737c1808e274fd56..23b5faa1b2c3287cc76fe17cb72797de6aba261d 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""
+
 import argparse
 import dataclasses
 import json
@@ -8,29 +9,33 @@ import os
 import random
 import time
 import warnings
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import uvloop
 from tqdm import tqdm
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          PreTrainedTokenizerBase)
-
-from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
-                                      ConversationDataset,
-                                      InstructCoderDataset,
-                                      PrefixRepetitionRandomDataset,
-                                      RandomDataset, SampleRequest,
-                                      ShareGPTDataset, SonnetDataset,
-                                      VisionArenaDataset)
-from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
-                                       write_to_json)
+from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.benchmarks.datasets import (
+    AIMODataset,
+    BurstGPTDataset,
+    ConversationDataset,
+    InstructCoderDataset,
+    MultiModalConversationDataset,
+    PrefixRepetitionRandomDataset,
+    RandomDataset,
+    SampleRequest,
+    ShareGPTDataset,
+    SonnetDataset,
+    VisionArenaDataset,
+)
+from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 
 
 def run_vllm(
@@ -39,25 +44,32 @@ def run_vllm(
     engine_args: EngineArgs,
     do_profile: bool,
     disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
+) -> tuple[float, list[RequestOutput] | None]:
     from vllm import LLM, SamplingParams
+
     llm = LLM(**dataclasses.asdict(engine_args))
     assert all(
-        llm.llm_engine.model_config.max_model_len >= (
-            request.prompt_len + request.expected_output_len)
-        for request in requests), (
-            "Please ensure that max_model_len is greater than the sum of"
-            " prompt_len and expected_output_len for all requests.")
+        llm.llm_engine.model_config.max_model_len
+        >= (request.prompt_len + request.expected_output_len)
+        for request in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of"
+        " prompt_len and expected_output_len for all requests."
+    )
     # Add the requests to the engine.
-    prompts: list[Union[TextPrompt, TokensPrompt]] = []
+    prompts: list[TextPrompt | TokensPrompt] = []
     sampling_params: list[SamplingParams] = []
     for request in requests:
-        prompts.append(
-            TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
-                       multi_modal_data=request.multi_modal_data)
-            if "prompt_token_ids" in request.prompt else \
-            TextPrompt(prompt=request.prompt,
-                       multi_modal_data=request.multi_modal_data))
+        prompt = (
+            TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"])
+            if "prompt_token_ids" in request.prompt
+            else TextPrompt(prompt=request.prompt)
+        )
+        if request.multi_modal_data:
+            assert isinstance(request.multi_modal_data, dict)
+            prompt["multi_modal_data"] = request.multi_modal_data
+        prompts.append(prompt)
+
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -66,8 +78,9 @@ def run_vllm(
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
                 detokenize=not disable_detokenize,
-            ))
-    lora_requests: Optional[list[LoRARequest]] = None
+            )
+        )
+    lora_requests: list[LoRARequest] | None = None
     if engine_args.enable_lora:
         lora_requests = [request.lora_request for request in requests]
 
@@ -78,10 +91,9 @@ def run_vllm(
         start = time.perf_counter()
         if do_profile:
             llm.start_profile()
-        outputs = llm.generate(prompts,
-                               sampling_params,
-                               lora_request=lora_requests,
-                               use_tqdm=True)
+        outputs = llm.generate(
+            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
+        )
         if do_profile:
             llm.stop_profile()
         end = time.perf_counter()
@@ -101,7 +113,8 @@ def run_vllm(
                 beam_width=n,
                 max_tokens=output_len,
                 ignore_eos=True,
-            ))
+            ),
+        )
         if do_profile:
             llm.stop_profile()
         end = time.perf_counter()
@@ -109,25 +122,29 @@ def run_vllm(
 
 
 def run_vllm_chat(
-        requests: list[SampleRequest],
-        n: int,
-        engine_args: EngineArgs,
-        do_profile: bool,
-        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
+    requests: list[SampleRequest],
+    n: int,
+    engine_args: EngineArgs,
+    do_profile: bool,
+    disable_detokenize: bool = False,
+) -> tuple[float, list[RequestOutput]]:
     """
     Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
     multimodal models as it properly handles multimodal inputs and chat
     formatting. For non-multimodal models, use run_vllm() instead.
     """
     from vllm import LLM, SamplingParams
+
     llm = LLM(**dataclasses.asdict(engine_args))
 
     assert all(
-        llm.llm_engine.model_config.max_model_len >= (
-            request.prompt_len + request.expected_output_len)
-        for request in requests), (
-            "Please ensure that max_model_len is greater than the sum of "
-            "prompt_len and expected_output_len for all requests.")
+        llm.llm_engine.model_config.max_model_len
+        >= (request.prompt_len + request.expected_output_len)
+        for request in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of "
+        "prompt_len and expected_output_len for all requests."
+    )
 
     prompts = []
     sampling_params: list[SamplingParams] = []
@@ -141,7 +158,8 @@ def run_vllm_chat(
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
                 detokenize=not disable_detokenize,
-            ))
+            )
+        )
     start = time.perf_counter()
     if do_profile:
         llm.start_profile()
@@ -162,31 +180,38 @@ async def run_vllm_async(
 ) -> float:
     from vllm import SamplingParams
     from vllm.entrypoints.openai.api_server import (
-        build_async_engine_client_from_engine_args)
+        build_async_engine_client_from_engine_args,
+    )
 
     async with build_async_engine_client_from_engine_args(
         engine_args,
         disable_frontend_multiprocessing=disable_frontend_multiprocessing,
     ) as llm:
-        model_config = await llm.get_model_config()
+        model_config = llm.model_config
         assert all(
-            model_config.max_model_len >= (request.prompt_len +
-                                           request.expected_output_len)
-            for request in requests), (
-                "Please ensure that max_model_len is greater than the sum of"
-                " prompt_len and expected_output_len for all requests.")
+            model_config.max_model_len
+            >= (request.prompt_len + request.expected_output_len)
+            for request in requests
+        ), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " prompt_len and expected_output_len for all requests."
+        )
 
         # Add the requests to the engine.
-        prompts: list[Union[TextPrompt, TokensPrompt]] = []
+        prompts: list[TextPrompt | TokensPrompt] = []
         sampling_params: list[SamplingParams] = []
-        lora_requests: list[Optional[LoRARequest]] = []
+        lora_requests: list[LoRARequest | None] = []
         for request in requests:
-            prompts.append(
-                TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
-                        multi_modal_data=request.multi_modal_data)
-                if "prompt_token_ids" in request.prompt else \
-                TextPrompt(prompt=request.prompt,
-                           multi_modal_data=request.multi_modal_data))
+            prompt = (
+                TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"])
+                if "prompt_token_ids" in request.prompt
+                else TextPrompt(prompt=request.prompt)
+            )
+
+            if request.multi_modal_data:
+                assert isinstance(request.multi_modal_data, dict)
+                prompt["multi_modal_data"] = request.multi_modal_data
+
             sampling_params.append(
                 SamplingParams(
                     n=n,
@@ -195,19 +220,19 @@ async def run_vllm_async(
                     ignore_eos=True,
                     max_tokens=request.expected_output_len,
                     detokenize=not disable_detokenize,
-                ))
+                )
+            )
+            prompts.append(prompt)
             lora_requests.append(request.lora_request)
 
         generators = []
         start = time.perf_counter()
         if do_profile:
             await llm.start_profile()
-        for i, (prompt, sp,
-                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
-            generator = llm.generate(prompt,
-                                     sp,
-                                     lora_request=lr,
-                                     request_id=f"test{i}")
+        for i, (prompt, sp, lr) in enumerate(
+            zip(prompts, sampling_params, lora_requests)
+        ):
+            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
             generators.append(generator)
         all_gens = merge_async_iterators(*generators)
         async for i, res in all_gens:
@@ -228,7 +253,8 @@ def run_hf(
     disable_detokenize: bool = False,
 ) -> float:
     llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+        model, dtype=torch.float16, trust_remote_code=trust_remote_code
+    )
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
@@ -251,14 +277,15 @@ def run_hf(
             # Check if we can add more requests to the batch.
             next_prompt_len = requests[i + 1].prompt_len
             next_output_len = requests[i + 1].expected_output_len
-            if (max(max_prompt_len, next_prompt_len) +
-                    max(max_output_len, next_output_len)) <= 2048:
+            if (
+                max(max_prompt_len, next_prompt_len)
+                + max(max_output_len, next_output_len)
+            ) <= 2048:
                 # We can add more requests to the batch.
                 continue
 
         # Generate the sequences.
-        input_ids = tokenizer(batch, return_tensors="pt",
-                              padding=True).input_ids
+        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
         llm_outputs = llm.generate(
             input_ids=input_ids.cuda(),
             do_sample=True,
@@ -281,8 +308,9 @@ def run_hf(
     return end - start
 
 
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: dict[str, Any]) -> None:
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={
@@ -290,9 +318,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
             "tokens_per_second": [results["tokens_per_second"]],
         },
         extra_info={
-            k: results[k]
-            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
-        })
+            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+        },
+    )
     if pt_records:
         # Don't use json suffix here as we don't want CI to pick it up
         pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
@@ -324,7 +352,8 @@ def get_requests(args, tokenizer):
             sample_kwargs["enable_multimodal_chat"] = True
     elif args.dataset_name == "sonnet":
         assert tokenizer.chat_template or tokenizer.default_chat_template, (
-            "Tokenizer/model must have chat template for sonnet dataset.")
+            "Tokenizer/model must have chat template for sonnet dataset."
+        )
         dataset_cls = SonnetDataset
         sample_kwargs["prefix_len"] = args.prefix_len
         sample_kwargs["return_prompt_formatted"] = True
@@ -333,21 +362,26 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "hf":
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
+            common_kwargs["dataset_subset"] = None
+            common_kwargs["dataset_split"] = "train"
             sample_kwargs["enable_multimodal_chat"] = True
         elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = InstructCoderDataset
-            common_kwargs['dataset_split'] = "train"
+            common_kwargs["dataset_split"] = "train"
+        elif args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = MultiModalConversationDataset
+            common_kwargs["dataset_subset"] = args.hf_subset
+            common_kwargs["dataset_split"] = args.hf_split
+            sample_kwargs["enable_multimodal_chat"] = True
         elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = ConversationDataset
-            common_kwargs['dataset_subset'] = args.hf_subset
-            common_kwargs['dataset_split'] = args.hf_split
+            common_kwargs["dataset_subset"] = args.hf_subset
+            common_kwargs["dataset_split"] = args.hf_split
             sample_kwargs["enable_multimodal_chat"] = True
         elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = AIMODataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
+            common_kwargs["dataset_subset"] = None
+            common_kwargs["dataset_split"] = "train"
     elif args.dataset_name == "prefix_repetition":
         dataset_cls = PrefixRepetitionRandomDataset
         sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len
@@ -358,7 +392,26 @@ def get_requests(args, tokenizer):
         raise ValueError(f"Unknown dataset name: {args.dataset_name}")
     # Remove None values
     sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
-    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
+    requests = dataset_cls(**common_kwargs).sample(**sample_kwargs)
+    requests = filter_requests_for_dp(requests, args.data_parallel_size)
+    return requests
+
+
+def filter_requests_for_dp(requests, data_parallel_size):
+    # Note(zhuohan): The way we get data_parallel_rank is hacky and only
+    # works for external launcher mode. Should be cleaned up and deprecated
+    # in the future with a better vLLM distributed process design.
+    if data_parallel_size == 1:
+        return requests
+
+    global_rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    data_parallel_rank = global_rank // (world_size // data_parallel_size)
+    return [
+        r
+        for i, r in enumerate(requests)
+        if i % data_parallel_size == data_parallel_rank
+    ]
 
 
 def validate_args(args):
@@ -371,7 +424,8 @@ def validate_args(args):
         warnings.warn(
             "The '--dataset' argument will be deprecated in the next release. "
             "Please use '--dataset-name' and '--dataset-path' instead.",
-            stacklevel=2)
+            stacklevel=2,
+        )
         args.dataset_path = args.dataset
 
     if not getattr(args, "tokenizer", None):
@@ -388,9 +442,8 @@ def validate_args(args):
         and not args.dataset_path
         and args.dataset_name not in {"prefix_repetition"}
     ):
-        print(
-            "When dataset path is not set, it will default to random dataset")
-        args.dataset_name = 'random'
+        print("When dataset path is not set, it will default to random dataset")
+        args.dataset_name = "random"
         if args.input_len is None:
             raise ValueError("input_len must be provided for a random dataset")
 
@@ -398,41 +451,56 @@ def validate_args(args):
     # --hf-subset and --hf-split: only used
     # when dataset_name is 'hf'
     if args.dataset_name != "hf" and (
-            getattr(args, "hf_subset", None) is not None
-            or getattr(args, "hf_split", None) is not None):
-        warnings.warn("--hf-subset and --hf-split will be ignored \
+        getattr(args, "hf_subset", None) is not None
+        or getattr(args, "hf_split", None) is not None
+    ):
+        warnings.warn(
+            "--hf-subset and --hf-split will be ignored \
                 since --dataset-name is not 'hf'.",
-                      stacklevel=2)
+            stacklevel=2,
+        )
     elif args.dataset_name == "hf":
         if args.dataset_path in (
-                VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
-                | ConversationDataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend."  #noqa: E501
-        elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
-                                   | AIMODataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend."  #noqa: E501
+            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
+            | MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+            | ConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
+            assert args.backend == "vllm-chat", (
+                f"{args.dataset_path} needs to use vllm-chat as the backend."
+            )
+        elif args.dataset_path in (
+            InstructCoderDataset.SUPPORTED_DATASET_PATHS
+            | AIMODataset.SUPPORTED_DATASET_PATHS
+        ):
+            assert args.backend == "vllm", (
+                f"{args.dataset_path} needs to use vllm as the backend."
+            )
         else:
-            raise ValueError(
-                f"{args.dataset_path} is not supported by hf dataset.")
+            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
 
     # --random-range-ratio: only used when dataset_name is 'random'
-    if args.dataset_name != 'random' and args.random_range_ratio is not None:
-        warnings.warn("--random-range-ratio will be ignored since \
+    if args.dataset_name != "random" and args.random_range_ratio is not None:
+        warnings.warn(
+            "--random-range-ratio will be ignored since \
                 --dataset-name is not 'random'.",
-                      stacklevel=2)
+            stacklevel=2,
+        )
 
     # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
     # set.
-    if args.dataset_name not in {"random", "sonnet", None
-                                 } and args.prefix_len is not None:
-        warnings.warn("--prefix-len will be ignored since --dataset-name\
+    if (
+        args.dataset_name not in {"random", "sonnet", None}
+        and args.prefix_len is not None
+    ):
+        warnings.warn(
+            "--prefix-len will be ignored since --dataset-name\
                  is not 'random', 'sonnet', or not set.",
-                      stacklevel=2)
+            stacklevel=2,
+        )
 
     # === LoRA Settings ===
     if getattr(args, "enable_lora", False) and args.backend != "vllm":
-        raise ValueError(
-            "LoRA benchmarking is only supported for vLLM backend")
+        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
     if getattr(args, "enable_lora", False) and args.lora_path is None:
         raise ValueError("LoRA path must be provided when enable_lora is True")
 
@@ -442,8 +510,10 @@ def validate_args(args):
     if args.backend != "hf" and args.hf_max_batch_size is not None:
         raise ValueError("HF max batch size is only for HF backend.")
 
-    if args.backend in {"hf", "mii"} and getattr(args, "quantization",
-                                                 None) is not None:
+    if (
+        args.backend in {"hf", "mii"}
+        and getattr(args, "quantization", None) is not None
+    ):
         raise ValueError("Quantization is only for vLLM backend.")
 
     if args.backend == "mii" and args.dtype != "auto":
@@ -451,32 +521,36 @@ def validate_args(args):
     if args.backend == "mii" and args.n != 1:
         raise ValueError("n must be 1 for MII backend.")
     if args.backend == "mii" and args.tokenizer != args.model:
+        raise ValueError("Tokenizer must be the same as the model for MII backend.")
+
+    if args.data_parallel_size > 1 and (
+        args.distributed_executor_backend != "external_launcher" or args.async_engine
+    ):
+        # --data-parallel is not supported fully.
+        # Old issue: https://github.com/vllm-project/vllm/issues/16222
+        # Currently we only support data parallel with external launcher
+        # mode (i.e., launch with toruchrun).
         raise ValueError(
-            "Tokenizer must be the same as the model for MII backend.")
-    
-    # --data-parallel is not supported currently.
-    # https://github.com/vllm-project/vllm/issues/16222
-    if args.data_parallel_size > 1:
-        raise ValueError(
-            "Data parallel is not supported in offline benchmark, "
+            "Data parallel is only supported with external launcher mode "
+            "with synchronous engine in offline benchmark, "
             "please use benchmark serving instead"
         )
 
 
 def add_cli_args(parser: argparse.ArgumentParser):
-    parser.add_argument("--backend",
-                        type=str,
-                        choices=["vllm", "hf", "mii", "vllm-chat"],
-                        default="vllm")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=["vllm", "hf", "mii", "vllm-chat"],
+        default="vllm",
+    )
     parser.add_argument(
         "--dataset-name",
         type=str,
-        choices=[
-            "sharegpt", "random", "sonnet", "burstgpt", "hf",
-            "prefix_repetition"
-        ],
+        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"],
         help="Name of the dataset to benchmark on.",
-        default="sharegpt")
+        default="sharegpt",
+    )
     parser.add_argument(
         "--dataset",
         type=str,
@@ -484,57 +558,70 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Path to the ShareGPT dataset, will be deprecated in\
             the next release. The dataset is expected to "
         "be a json in form of list[dict[..., conversations: "
-        "list[dict[..., value: <prompt_or_response>]]]]")
-    parser.add_argument("--dataset-path",
-                        type=str,
-                        default=None,
-                        help="Path to the dataset")
-    parser.add_argument("--input-len",
-                        type=int,
-                        default=None,
-                        help="Input prompt length for each request")
-    parser.add_argument("--output-len",
-                        type=int,
-                        default=None,
-                        help="Output length for each request. Overrides the "
-                        "output length from the dataset.")
-    parser.add_argument("--n",
-                        type=int,
-                        default=1,
-                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--num-prompts",
-                        type=int,
-                        default=1000,
-                        help="Number of prompts to process.")
-    parser.add_argument("--hf-max-batch-size",
-                        type=int,
-                        default=None,
-                        help="Maximum batch size for HF backend.")
+        "list[dict[..., value: <prompt_or_response>]]]]",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default=None, help="Path to the dataset"
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="Input prompt length for each request",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the "
+        "output length from the dataset.",
+    )
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
+    )
+    parser.add_argument(
+        "--hf-max-batch-size",
+        type=int,
+        default=None,
+        help="Maximum batch size for HF backend.",
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the throughput results in JSON format.')
-    parser.add_argument("--async-engine",
-                        action='store_true',
-                        default=False,
-                        help="Use vLLM async engine rather than LLM class.")
-    parser.add_argument("--disable-frontend-multiprocessing",
-                        action='store_true',
-                        default=False,
-                        help="Disable decoupled async engine frontend.")
+        help="Path to save the throughput results in JSON format.",
+    )
+    parser.add_argument(
+        "--async-engine",
+        action="store_true",
+        default=False,
+        help="Use vLLM async engine rather than LLM class.",
+    )
+    parser.add_argument(
+        "--disable-frontend-multiprocessing",
+        action="store_true",
+        default=False,
+        help="Disable decoupled async engine frontend.",
+    )
     parser.add_argument(
         "--disable-detokenize",
         action="store_true",
-        help=("Do not detokenize the response (i.e. do not include "
-              "detokenization time in the measurement)"))
+        help=(
+            "Do not detokenize the response (i.e. do not include "
+            "detokenization time in the measurement)"
+        ),
+    )
     # LoRA
     parser.add_argument(
         "--lora-path",
         type=str,
         default=None,
         help="Path to the lora adapters to use. This can be an absolute path, "
-        "a relative path, or a Hugging Face model identifier.")
+        "a relative path, or a Hugging Face model identifier.",
+    )
     parser.add_argument(
         "--prefix-len",
         type=int,
@@ -554,24 +641,24 @@ def add_cli_args(parser: argparse.ArgumentParser):
     )
 
     # hf dtaset
-    parser.add_argument("--hf-subset",
-                        type=str,
-                        default=None,
-                        help="Subset of the HF dataset.")
-    parser.add_argument("--hf-split",
-                        type=str,
-                        default=None,
-                        help="Split of the HF dataset.")
+    parser.add_argument(
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+    )
+    parser.add_argument(
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
         default=False,
         help="Use Torch Profiler. The env variable "
-        "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
+        "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
+    )
 
     # prefix repetition dataset
     prefix_repetition_group = parser.add_argument_group(
-        "prefix repetition dataset options")
+        "prefix repetition dataset options"
+    )
     prefix_repetition_group.add_argument(
         "--prefix-repetition-prefix-len",
         type=int,
@@ -613,11 +700,11 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)
     # Sample the requests.
     tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code)
+        args.tokenizer, trust_remote_code=args.trust_remote_code
+    )
     requests = get_requests(args, tokenizer)
-    is_multi_modal = any(request.multi_modal_data is not None
-                         for request in requests)
-    request_outputs: Optional[list[RequestOutput]] = None
+    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
+    request_outputs: list[RequestOutput] | None = None
     if args.backend == "vllm":
         if args.async_engine:
             elapsed_time = uvloop.run(
@@ -628,24 +715,37 @@ def main(args: argparse.Namespace):
                     disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
                     disable_detokenize=args.disable_detokenize,
                     do_profile=args.profile,
-                ))
+                )
+            )
         else:
             elapsed_time, request_outputs = run_vllm(
-                requests, args.n, EngineArgs.from_cli_args(args),
+                requests,
+                args.n,
+                EngineArgs.from_cli_args(args),
                 disable_detokenize=args.disable_detokenize,
-                do_profile=args.profile)
+                do_profile=args.profile,
+            )
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         if args.profile:
-            raise NotImplementedError(
-                "Profiling not implemented yet for backend='hf'.")
-        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.hf_max_batch_size, args.trust_remote_code,
-                              args.disable_detokenize)
+            raise NotImplementedError("Profiling not implemented yet for backend='hf'.")
+        elapsed_time = run_hf(
+            requests,
+            args.model,
+            tokenizer,
+            args.n,
+            args.hf_max_batch_size,
+            args.trust_remote_code,
+            args.disable_detokenize,
+        )
     elif args.backend == "vllm-chat":
         elapsed_time, request_outputs = run_vllm_chat(
-            requests, args.n, EngineArgs.from_cli_args(args),
-            disable_detokenize=args.disable_detokenize, do_profile=args.profile)
+            requests,
+            args.n,
+            EngineArgs.from_cli_args(args),
+            disable_detokenize=args.disable_detokenize,
+            do_profile=args.profile,
+        )
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
 
@@ -657,28 +757,31 @@ def main(args: argparse.Namespace):
         for ro in request_outputs:
             if not isinstance(ro, RequestOutput):
                 continue
-            total_prompt_tokens += len(
-                ro.prompt_token_ids) if ro.prompt_token_ids else 0
-            total_output_tokens += sum(
-                len(o.token_ids) for o in ro.outputs if o)
+            total_prompt_tokens += (
+                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
+            )
+            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
         total_num_tokens = total_prompt_tokens + total_output_tokens
     else:
-        total_num_tokens = sum(r.prompt_len + r.expected_output_len
-                               for r in requests)
+        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
         total_output_tokens = sum(r.expected_output_len for r in requests)
         total_prompt_tokens = total_num_tokens - total_output_tokens
 
     if is_multi_modal and args.backend != "vllm-chat":
-        print("\033[91mWARNING\033[0m: Multi-modal request with "
-              f"{args.backend} backend detected. The "
-              "following metrics are not accurate because image tokens are not"
-              " counted. See vllm-project/vllm/issues/9778 for details.")
+        print(
+            "\033[91mWARNING\033[0m: Multi-modal request with "
+            f"{args.backend} backend detected. The "
+            "following metrics are not accurate because image tokens are not"
+            " counted. See vllm-project/vllm/issues/9778 for details."
+        )
         # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
         # vllm-chat backend counts the image tokens now
 
-    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
-          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
+    print(
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
+    )
     print(f"Total num prompt tokens:  {total_prompt_tokens}")
     print(f"Total num output tokens:  {total_output_tokens}")
 
diff --git a/vllm/collect_env.py b/vllm/collect_env.py
index fb9d3657790cffca3e7e0cf5d88cccf9b1abbea0..4ca0852e3998f7ebc40c366228b9bd63feafaaf2 100644
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@@ -9,6 +9,7 @@ import locale
 import os
 import subprocess
 import sys
+
 # Unlike the rest of the PyTorch this file must be python2 compliant.
 # This script outputs relevant system environment info
 # Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
@@ -20,45 +21,47 @@ from vllm.envs import environment_variables
 
 try:
     import torch
+
     TORCH_AVAILABLE = True
 except (ImportError, NameError, AttributeError, OSError):
     TORCH_AVAILABLE = False
 
 # System Environment Information
 SystemEnv = namedtuple(
-    'SystemEnv',
+    "SystemEnv",
     [
-        'torch_version',
-        'is_debug_build',
-        'cuda_compiled_version',
-        'gcc_version',
-        'clang_version',
-        'cmake_version',
-        'os',
-        'libc_version',
-        'python_version',
-        'python_platform',
-        'is_cuda_available',
-        'cuda_runtime_version',
-        'cuda_module_loading',
-        'nvidia_driver_version',
-        'nvidia_gpu_models',
-        'cudnn_version',
-        'pip_version',  # 'pip' or 'pip3'
-        'pip_packages',
-        'conda_packages',
-        'hip_compiled_version',
-        'hip_runtime_version',
-        'miopen_runtime_version',
-        'caching_allocator_config',
-        'is_xnnpack_available',
-        'cpu_info',
-        'rocm_version',  # vllm specific field
-        'vllm_version',  # vllm specific field
-        'vllm_build_flags',  # vllm specific field
-        'gpu_topo',  # vllm specific field
-        'env_vars',
-    ])
+        "torch_version",
+        "is_debug_build",
+        "cuda_compiled_version",
+        "gcc_version",
+        "clang_version",
+        "cmake_version",
+        "os",
+        "libc_version",
+        "python_version",
+        "python_platform",
+        "is_cuda_available",
+        "cuda_runtime_version",
+        "cuda_module_loading",
+        "nvidia_driver_version",
+        "nvidia_gpu_models",
+        "cudnn_version",
+        "pip_version",  # 'pip' or 'pip3'
+        "pip_packages",
+        "conda_packages",
+        "hip_compiled_version",
+        "hip_runtime_version",
+        "miopen_runtime_version",
+        "caching_allocator_config",
+        "is_xnnpack_available",
+        "cpu_info",
+        "rocm_version",  # vllm specific field
+        "vllm_version",  # vllm specific field
+        "vllm_build_flags",  # vllm specific field
+        "gpu_topo",  # vllm specific field
+        "env_vars",
+    ],
+)
 
 DEFAULT_CONDA_PATTERNS = {
     "torch",
@@ -98,18 +101,17 @@ def run(command):
     """Return (return-code, stdout, stderr)."""
     shell = True if type(command) is str else False
     try:
-        p = subprocess.Popen(command,
-                             stdout=subprocess.PIPE,
-                             stderr=subprocess.PIPE,
-                             shell=shell)
+        p = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell
+        )
         raw_output, raw_err = p.communicate()
         rc = p.returncode
-        if get_platform() == 'win32':
-            enc = 'oem'
+        if get_platform() == "win32":
+            enc = "oem"
         else:
             enc = locale.getpreferredencoding()
         output = raw_output.decode(enc)
-        if command == 'nvidia-smi topo -m':
+        if command == "nvidia-smi topo -m":
             # don't remove the leading whitespace of `nvidia-smi topo -m`
             #   because they are meaningful
             output = output.rstrip()
@@ -120,7 +122,7 @@ def run(command):
 
     except FileNotFoundError:
         cmd_str = command if isinstance(command, str) else command[0]
-        return 127, '', f"Command not found: {cmd_str}"
+        return 127, "", f"Command not found: {cmd_str}"
 
 
 def run_and_read_all(run_lambda, command):
@@ -147,49 +149,54 @@ def run_and_return_first_line(run_lambda, command):
     rc, out, _ = run_lambda(command)
     if rc != 0:
         return None
-    return out.split('\n')[0]
+    return out.split("\n")[0]
 
 
 def get_conda_packages(run_lambda, patterns=None):
     if patterns is None:
         patterns = DEFAULT_CONDA_PATTERNS
-    conda = os.environ.get('CONDA_EXE', 'conda')
-    out = run_and_read_all(run_lambda, [conda, 'list'])
+    conda = os.environ.get("CONDA_EXE", "conda")
+    out = run_and_read_all(run_lambda, [conda, "list"])
     if out is None:
         return out
 
-    return "\n".join(line for line in out.splitlines()
-                     if not line.startswith("#") and any(name in line
-                                                         for name in patterns))
+    return "\n".join(
+        line
+        for line in out.splitlines()
+        if not line.startswith("#") and any(name in line for name in patterns)
+    )
 
 
 def get_gcc_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
+    return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
 
 
 def get_clang_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'clang --version',
-                                     r'clang version (.*)')
+    return run_and_parse_first_match(
+        run_lambda, "clang --version", r"clang version (.*)"
+    )
 
 
 def get_cmake_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'cmake --version',
-                                     r'cmake (.*)')
+    return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
 
 
 def get_nvidia_driver_version(run_lambda):
-    if get_platform() == 'darwin':
-        cmd = 'kextstat | grep -i cuda'
-        return run_and_parse_first_match(run_lambda, cmd,
-                                         r'com[.]nvidia[.]CUDA [(](.*?)[)]')
+    if get_platform() == "darwin":
+        cmd = "kextstat | grep -i cuda"
+        return run_and_parse_first_match(
+            run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]"
+        )
     smi = get_nvidia_smi()
-    return run_and_parse_first_match(run_lambda, smi,
-                                     r'Driver Version: (.*?) ')
+    return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
 
 
 def get_gpu_info(run_lambda):
-    if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(
-            torch.version, 'hip') and torch.version.hip is not None):
+    if get_platform() == "darwin" or (
+        TORCH_AVAILABLE
+        and hasattr(torch.version, "hip")
+        and torch.version.hip is not None
+    ):
         if TORCH_AVAILABLE and torch.cuda.is_available():
             if torch.version.hip is not None:
                 prop = torch.cuda.get_device_properties(0)
@@ -202,43 +209,42 @@ def get_gpu_info(run_lambda):
             return torch.cuda.get_device_name(None) + gcnArch
         return None
     smi = get_nvidia_smi()
-    uuid_regex = re.compile(r' \(UUID: .+?\)')
-    rc, out, _ = run_lambda(smi + ' -L')
+    uuid_regex = re.compile(r" \(UUID: .+?\)")
+    rc, out, _ = run_lambda(smi + " -L")
     if rc != 0:
         return None
     # Anonymize GPUs by removing their UUID
-    return re.sub(uuid_regex, '', out)
+    return re.sub(uuid_regex, "", out)
 
 
 def get_running_cuda_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'nvcc --version',
-                                     r'release .+ V(.*)')
+    return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
 
 
 def get_cudnn_version(run_lambda):
     """Return a list of libcudnn.so; it's hard to tell which one is being used."""
-    if get_platform() == 'win32':
-        system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
-        cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%")
-        where_cmd = os.path.join(system_root, 'System32', 'where')
+    if get_platform() == "win32":
+        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
+        cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
+        where_cmd = os.path.join(system_root, "System32", "where")
         cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
-    elif get_platform() == 'darwin':
+    elif get_platform() == "darwin":
         # CUDA libraries and drivers can be found in /usr/local/cuda/. See
         # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
         # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
         # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
-        cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*'
+        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
     else:
         cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
     rc, out, _ = run_lambda(cudnn_cmd)
     # find will return 1 if there are permission errors or if not found
     if len(out) == 0 or (rc != 1 and rc != 0):
-        l = os.environ.get('CUDNN_LIBRARY')
+        l = os.environ.get("CUDNN_LIBRARY")
         if l is not None and os.path.isfile(l):
             return os.path.realpath(l)
         return None
     files_set = set()
-    for fn in out.split('\n'):
+    for fn in out.split("\n"):
         fn = os.path.realpath(fn)  # eliminate symbolic links
         if os.path.isfile(fn):
             files_set.add(fn)
@@ -248,20 +254,20 @@ def get_cudnn_version(run_lambda):
     files = sorted(files_set)
     if len(files) == 1:
         return files[0]
-    result = '\n'.join(files)
-    return 'Probably one of the following:\n{}'.format(result)
+    result = "\n".join(files)
+    return "Probably one of the following:\n{}".format(result)
 
 
 def get_nvidia_smi():
     # Note: nvidia-smi is currently available only on Windows and Linux
-    smi = 'nvidia-smi'
-    if get_platform() == 'win32':
-        system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
-        program_files_root = os.environ.get('PROGRAMFILES',
-                                            'C:\\Program Files')
-        legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation',
-                                   'NVSMI', smi)
-        new_path = os.path.join(system_root, 'System32', smi)
+    smi = "nvidia-smi"
+    if get_platform() == "win32":
+        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
+        program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
+        legacy_path = os.path.join(
+            program_files_root, "NVIDIA Corporation", "NVSMI", smi
+        )
+        new_path = os.path.join(system_root, "System32", smi)
         smis = [new_path, legacy_path]
         for candidate_smi in smis:
             if os.path.exists(candidate_smi):
@@ -272,8 +278,9 @@ def get_nvidia_smi():
 
 def get_rocm_version(run_lambda):
     """Returns the ROCm version if available, otherwise 'N/A'."""
-    return run_and_parse_first_match(run_lambda, 'hipcc --version',
-                                     r'HIP version: (\S+)')
+    return run_and_parse_first_match(
+        run_lambda, "hipcc --version", r"HIP version: (\S+)"
+    )
 
 
 def get_vllm_version():
@@ -282,12 +289,12 @@ def get_vllm_version():
     if __version__ == "dev":
         return "N/A (dev)"
     version_str = __version_tuple__[-1]
-    if isinstance(version_str, str) and version_str.startswith('g'):
+    if isinstance(version_str, str) and version_str.startswith("g"):
         # it's a dev build
-        if '.' in version_str:
+        if "." in version_str:
             # it's a dev build containing local changes
-            git_sha = version_str.split('.')[0][1:]
-            date = version_str.split('.')[-1][1:]
+            git_sha = version_str.split(".")[0][1:]
+            date = version_str.split(".")[-1][1:]
             return f"{__version__} (git sha: {git_sha}, date: {date})"
         else:
             # it's a dev build without local changes
@@ -298,19 +305,19 @@ def get_vllm_version():
 
 def summarize_vllm_build_flags():
     # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
-    return 'CUDA Archs: {}; ROCm: {}'.format(
-        os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
-        'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
+    return "CUDA Archs: {}; ROCm: {}".format(
+        os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"),
+        "Enabled" if os.environ.get("ROCM_HOME") else "Disabled",
     )
 
 
 def get_gpu_topo(run_lambda):
     output = None
 
-    if get_platform() == 'linux':
-        output = run_and_read_all(run_lambda, 'nvidia-smi topo -m')
+    if get_platform() == "linux":
+        output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
         if output is None:
-            output = run_and_read_all(run_lambda, 'rocm-smi --showtopo')
+            output = run_and_read_all(run_lambda, "rocm-smi --showtopo")
 
     return output
 
@@ -392,17 +399,17 @@ def get_gpu_topo(run_lambda):
 
 
 def get_cpu_info(run_lambda):
-    rc, out, err = 0, '', ''
-    if get_platform() == 'linux':
-        rc, out, err = run_lambda('lscpu')
-    elif get_platform() == 'win32':
+    rc, out, err = 0, "", ""
+    if get_platform() == "linux":
+        rc, out, err = run_lambda("lscpu")
+    elif get_platform() == "win32":
         rc, out, err = run_lambda(
-            'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
-        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
+            "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
+        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
         )
-    elif get_platform() == 'darwin':
+    elif get_platform() == "darwin":
         rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
-    cpu_info = 'None'
+    cpu_info = "None"
     if rc == 0:
         cpu_info = out
     else:
@@ -411,67 +418,69 @@ def get_cpu_info(run_lambda):
 
 
 def get_platform():
-    if sys.platform.startswith('linux'):
-        return 'linux'
-    elif sys.platform.startswith('win32'):
-        return 'win32'
-    elif sys.platform.startswith('cygwin'):
-        return 'cygwin'
-    elif sys.platform.startswith('darwin'):
-        return 'darwin'
+    if sys.platform.startswith("linux"):
+        return "linux"
+    elif sys.platform.startswith("win32"):
+        return "win32"
+    elif sys.platform.startswith("cygwin"):
+        return "cygwin"
+    elif sys.platform.startswith("darwin"):
+        return "darwin"
     else:
         return sys.platform
 
 
 def get_mac_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
-                                     r'(.*)')
+    return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
 
 
 def get_windows_version(run_lambda):
-    system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
-    wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
-    findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
+    system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
+    wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic")
+    findstr_cmd = os.path.join(system_root, "System32", "findstr")
     return run_and_read_all(
-        run_lambda,
-        '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))
+        run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd)
+    )
 
 
 def get_lsb_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'lsb_release -a',
-                                     r'Description:\t(.*)')
+    return run_and_parse_first_match(
+        run_lambda, "lsb_release -a", r"Description:\t(.*)"
+    )
 
 
 def check_release_file(run_lambda):
-    return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
-                                     r'PRETTY_NAME="(.*)"')
+    return run_and_parse_first_match(
+        run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"'
+    )
 
 
 def get_os(run_lambda):
     from platform import machine
+
     platform = get_platform()
 
-    if platform == 'win32' or platform == 'cygwin':
+    if platform == "win32" or platform == "cygwin":
         return get_windows_version(run_lambda)
 
-    if platform == 'darwin':
+    if platform == "darwin":
         version = get_mac_version(run_lambda)
         if version is None:
             return None
-        return 'macOS {} ({})'.format(version, machine())
+        return "macOS {} ({})".format(version, machine())
 
-    if platform == 'linux':
+    if platform == "linux":
         # Ubuntu/Debian based
         desc = get_lsb_version(run_lambda)
         if desc is not None:
-            return '{} ({})'.format(desc, machine())
+            return "{} ({})".format(desc, machine())
 
         # Try reading /etc/*-release
         desc = check_release_file(run_lambda)
         if desc is not None:
-            return '{} ({})'.format(desc, machine())
+            return "{} ({})".format(desc, machine())
 
-        return '{} ({})'.format(platform, machine())
+        return "{} ({})".format(platform, machine())
 
     # Unknown platform
     return platform
@@ -479,23 +488,25 @@ def get_os(run_lambda):
 
 def get_python_platform():
     import platform
+
     return platform.platform()
 
 
 def get_libc_version():
     import platform
-    if get_platform() != 'linux':
-        return 'N/A'
-    return '-'.join(platform.libc_ver())
+
+    if get_platform() != "linux":
+        return "N/A"
+    return "-".join(platform.libc_ver())
 
 
 def is_uv_venv():
     if os.environ.get("UV"):
         return True
-    pyvenv_cfg_path = os.path.join(sys.prefix, 'pyvenv.cfg')
+    pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
     if os.path.exists(pyvenv_cfg_path):
-        with open(pyvenv_cfg_path, 'r') as f:
-            return any(line.startswith('uv = ') for line in f)
+        with open(pyvenv_cfg_path, "r") as f:
+            return any(line.startswith("uv = ") for line in f)
     return False
 
 
@@ -507,13 +518,14 @@ def get_pip_packages(run_lambda, patterns=None):
     def run_with_pip():
         try:
             import importlib.util
-            pip_spec = importlib.util.find_spec('pip')
+
+            pip_spec = importlib.util.find_spec("pip")
             pip_available = pip_spec is not None
         except ImportError:
             pip_available = False
 
         if pip_available:
-            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
+            cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
         elif is_uv_venv():
             print("uv is set")
             cmd = ["uv", "pip", "list", "--format=freeze"]
@@ -523,23 +535,24 @@ def get_pip_packages(run_lambda, patterns=None):
             )
 
         out = run_and_read_all(run_lambda, cmd)
-        return "\n".join(line for line in out.splitlines()
-                         if any(name in line for name in patterns))
+        return "\n".join(
+            line for line in out.splitlines() if any(name in line for name in patterns)
+        )
 
-    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
+    pip_version = "pip3" if sys.version[0] == "3" else "pip"
     out = run_with_pip()
     return pip_version, out
 
 
 def get_cachingallocator_config():
-    ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '')
+    ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
     return ca_config
 
 
 def get_cuda_module_loading_config():
     if TORCH_AVAILABLE and torch.cuda.is_available():
         torch.cuda.init()
-        config = os.environ.get('CUDA_MODULE_LOADING', '')
+        config = os.environ.get("CUDA_MODULE_LOADING", "")
         return config
     else:
         return "N/A"
@@ -548,17 +561,26 @@ def get_cuda_module_loading_config():
 def is_xnnpack_available():
     if TORCH_AVAILABLE:
         import torch.backends.xnnpack
-        return str(
-            torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
+
+        return str(torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
     else:
         return "N/A"
 
 
 def get_env_vars():
-    env_vars = ''
-    secret_terms = ('secret', 'token', 'api', 'access', 'password')
-    report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
-                     "OMP_", "MKL_", "NVIDIA")
+    env_vars = ""
+    secret_terms = ("secret", "token", "api", "access", "password")
+    report_prefix = (
+        "TORCH",
+        "NCCL",
+        "PYTORCH",
+        "CUDA",
+        "CUBLAS",
+        "CUDNN",
+        "OMP_",
+        "MKL_",
+        "NVIDIA",
+    )
     for k, v in os.environ.items():
         if any(term in k.lower() for term in secret_terms):
             continue
@@ -579,23 +601,24 @@ def get_env_info():
         debug_mode_str = str(torch.version.debug)
         cuda_available_str = str(torch.cuda.is_available())
         cuda_version_str = torch.version.cuda
-        if not hasattr(torch.version,
-                       'hip') or torch.version.hip is None:  # cuda version
-            hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
+        if (
+            not hasattr(torch.version, "hip") or torch.version.hip is None
+        ):  # cuda version
+            hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
         else:  # HIP version
 
             def get_version_or_na(cfg, prefix):
                 _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
-                return _lst[0] if _lst else 'N/A'
+                return _lst[0] if _lst else "N/A"
 
-            cfg = torch._C._show_config().split('\n')
-            hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime')
-            miopen_runtime_version = get_version_or_na(cfg, 'MIOpen')
-            cuda_version_str = 'N/A'
+            cfg = torch._C._show_config().split("\n")
+            hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
+            miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
+            cuda_version_str = "N/A"
             hip_compiled_version = torch.version.hip
     else:
-        version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A'
-        hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
+        version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"
+        hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
 
     sys_version = sys.version.replace("\n", " ")
 
@@ -609,9 +632,9 @@ def get_env_info():
     return SystemEnv(
         torch_version=version_str,
         is_debug_build=debug_mode_str,
-        python_version='{} ({}-bit runtime)'.format(
-            sys_version,
-            sys.maxsize.bit_length() + 1),
+        python_version="{} ({}-bit runtime)".format(
+            sys_version, sys.maxsize.bit_length() + 1
+        ),
         python_platform=get_python_platform(),
         is_cuda_available=cuda_available_str,
         cuda_compiled_version=cuda_version_str,
@@ -715,15 +738,14 @@ GPU Topology:
 
 
 def pretty_str(envinfo):
-
-    def replace_nones(dct, replacement='Could not collect'):
+    def replace_nones(dct, replacement="Could not collect"):
         for key in dct.keys():
             if dct[key] is not None:
                 continue
             dct[key] = replacement
         return dct
 
-    def replace_bools(dct, true='Yes', false='No'):
+    def replace_bools(dct, true="Yes", false="No"):
         for key in dct.keys():
             if dct[key] is True:
                 dct[key] = true
@@ -731,43 +753,48 @@ def pretty_str(envinfo):
                 dct[key] = false
         return dct
 
-    def prepend(text, tag='[prepend]'):
-        lines = text.split('\n')
+    def prepend(text, tag="[prepend]"):
+        lines = text.split("\n")
         updated_lines = [tag + line for line in lines]
-        return '\n'.join(updated_lines)
+        return "\n".join(updated_lines)
 
-    def replace_if_empty(text, replacement='No relevant packages'):
+    def replace_if_empty(text, replacement="No relevant packages"):
         if text is not None and len(text) == 0:
             return replacement
         return text
 
     def maybe_start_on_next_line(string):
         # If `string` is multiline, prepend a \n to it.
-        if string is not None and len(string.split('\n')) > 1:
-            return '\n{}\n'.format(string)
+        if string is not None and len(string.split("\n")) > 1:
+            return "\n{}\n".format(string)
         return string
 
     mutable_dict = envinfo._asdict()
 
     # If nvidia_gpu_models is multiline, start on the next line
-    mutable_dict['nvidia_gpu_models'] = \
-        maybe_start_on_next_line(envinfo.nvidia_gpu_models)
+    mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(
+        envinfo.nvidia_gpu_models
+    )
 
     # If the machine doesn't have CUDA, report some fields as 'No CUDA'
     dynamic_cuda_fields = [
-        'cuda_runtime_version',
-        'nvidia_gpu_models',
-        'nvidia_driver_version',
+        "cuda_runtime_version",
+        "nvidia_gpu_models",
+        "nvidia_driver_version",
     ]
-    all_cuda_fields = dynamic_cuda_fields + ['cudnn_version']
-    all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None
-                                          for field in dynamic_cuda_fields)
-    if TORCH_AVAILABLE and not torch.cuda.is_available(
-    ) and all_dynamic_cuda_fields_missing:
+    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
+    all_dynamic_cuda_fields_missing = all(
+        mutable_dict[field] is None for field in dynamic_cuda_fields
+    )
+    if (
+        TORCH_AVAILABLE
+        and not torch.cuda.is_available()
+        and all_dynamic_cuda_fields_missing
+    ):
         for field in all_cuda_fields:
-            mutable_dict[field] = 'No CUDA'
+            mutable_dict[field] = "No CUDA"
         if envinfo.cuda_compiled_version is None:
-            mutable_dict['cuda_compiled_version'] = 'None'
+            mutable_dict["cuda_compiled_version"] = "None"
 
     # Replace True with Yes, False with No
     mutable_dict = replace_bools(mutable_dict)
@@ -776,20 +803,20 @@ def pretty_str(envinfo):
     mutable_dict = replace_nones(mutable_dict)
 
     # If either of these are '', replace with 'No relevant packages'
-    mutable_dict['pip_packages'] = replace_if_empty(
-        mutable_dict['pip_packages'])
-    mutable_dict['conda_packages'] = replace_if_empty(
-        mutable_dict['conda_packages'])
+    mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
+    mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
 
     # Tag conda and pip packages with a prefix
     # If they were previously None, they'll show up as ie '[conda] Could not collect'
-    if mutable_dict['pip_packages']:
-        mutable_dict['pip_packages'] = prepend(
-            mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
-    if mutable_dict['conda_packages']:
-        mutable_dict['conda_packages'] = prepend(
-            mutable_dict['conda_packages'], '[conda] ')
-    mutable_dict['cpu_info'] = envinfo.cpu_info
+    if mutable_dict["pip_packages"]:
+        mutable_dict["pip_packages"] = prepend(
+            mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version)
+        )
+    if mutable_dict["conda_packages"]:
+        mutable_dict["conda_packages"] = prepend(
+            mutable_dict["conda_packages"], "[conda] "
+        )
+    mutable_dict["cpu_info"] = envinfo.cpu_info
     return env_info_fmt.format(**mutable_dict)
 
 
@@ -802,22 +829,29 @@ def main():
     output = get_pretty_env_info()
     print(output)
 
-    if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
-            torch.utils, '_crash_handler'):
+    if (
+        TORCH_AVAILABLE
+        and hasattr(torch, "utils")
+        and hasattr(torch.utils, "_crash_handler")
+    ):
         minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
         if sys.platform == "linux" and os.path.exists(minidump_dir):
             dumps = [
-                os.path.join(minidump_dir, dump)
-                for dump in os.listdir(minidump_dir)
+                os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)
             ]
             latest = max(dumps, key=os.path.getctime)
             ctime = os.path.getctime(latest)
             creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
-                '%Y-%m-%d %H:%M:%S')
-            msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \
-                  "if this is related to your bug please include it when you file a report ***"
+                "%Y-%m-%d %H:%M:%S"
+            )
+            msg = (
+                "\n*** Detected a minidump at {} created on {}, ".format(
+                    latest, creation_time
+                )
+                + "if this is related to your bug please include it when you file a report ***"
+            )
             print(msg, file=sys.stderr)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py
index a9d0228c29e050be49be3e6c9defd937ef18eb19..ad894d6e82768dad5a8ddd9535a9bcffd6f81bcf 100644
--- a/vllm/compilation/activation_quant_fusion.py
+++ b/vllm/compilation/activation_quant_fusion.py
@@ -5,18 +5,25 @@ from abc import ABC, abstractmethod
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
-from torch._inductor.pattern_matcher import (PatternMatcherPass, fwd_only,
-                                             register_replacement)
+from torch._inductor.pattern_matcher import (
+    PatternMatcherPass,
+    fwd_only,
+    register_replacement,
+)
 from torch._ops import OpOverload
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym, kNvfp4Quant, kStaticTensorScale)
+    QuantKey,
+    kFp8StaticTensorSym,
+    kNvfp4Quant,
+)
 from vllm.platforms import current_platform
 
 from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32
 from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherQuantFP8, MatcherSiluAndMul
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
@@ -29,11 +36,11 @@ SILU_MUL_OP = torch.ops._C.silu_and_mul.default
 # FUSED_OPS: dict[QuantKey, OpOverload] = {
 #     kFp8StaticTensorSym: torch.ops._C.silu_and_mul_quant.default,  # noqa: E501
 # }
-# silu_and_mul_nvfp4_quant_supported = (current_platform.is_cuda() and hasattr(
-#     torch.ops._C, "silu_and_mul_nvfp4_quant"))
+# silu_and_mul_nvfp4_quant_supported = current_platform.is_cuda() and hasattr(
+#     torch.ops._C, "silu_and_mul_nvfp4_quant"
+# )
 # if silu_and_mul_nvfp4_quant_supported:
-#     FUSED_OPS[
-#         kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default  # noqa: E501
+#     FUSED_OPS[kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default  # noqa: E501
 
 
 class ActivationQuantPattern(ABC):
@@ -49,16 +56,20 @@ class ActivationQuantPattern(ABC):
         self.quant_key = quant_key
         self.quant_dtype = quant_key.dtype
 
-        assert self.quant_key in QUANT_OPS, \
+        assert self.quant_key in QUANT_OPS, (
             f"unsupported quantization scheme {self.quant_key}"
+        )
         self.QUANT_OP = QUANT_OPS[self.quant_key]
 
-        assert self.quant_key in FUSED_OPS, \
+        assert self.quant_key in FUSED_OPS, (
             f"unsupported fusion scheme {self.quant_key}"
+        )
         self.FUSED_OP = FUSED_OPS[self.quant_key]
 
+        self.silu_and_mul_matcher = MatcherSiluAndMul()
+
     def empty_quant(self, *args, **kwargs):
-        kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs}
+        kwargs = {"dtype": self.quant_dtype, "device": "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
 
     @abstractmethod
@@ -71,39 +82,38 @@ class SiluMulFp8StaticQuantPattern(ActivationQuantPattern):
     Fusion for SiluMul+Fp8StaticQuant Pattern
     """
 
-    def __init__(self, symmetric: bool = True):
-        quant_key = QuantKey(dtype=FP8_DTYPE,
-                             scale=kStaticTensorScale,
-                             symmetric=symmetric)
-        super().__init__(quant_key)
+    def __init__(self):
+        super().__init__(kFp8StaticTensorSym)
+        self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(result: torch.Tensor, result_silu_mul: torch.Tensor,
-                    input: torch.Tensor, scale: torch.Tensor):
-            at1 = auto_functionalized(SILU_MUL_OP,
-                                      result=result_silu_mul,
-                                      input=input)
-            at2 = auto_functionalized(self.QUANT_OP,
-                                      result=result,
-                                      input=at1[1],
-                                      scale=scale)
-            return at2[1]
-
-        def replacement(result: torch.Tensor, result_silu_mul: torch.Tensor,
-                        input: torch.Tensor, scale: torch.Tensor):
-            at = auto_functionalized(self.FUSED_OP,
-                                     result=result,
-                                     input=input,
-                                     scale=scale)
+        def pattern(
+            input: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            result_silu_mul = self.silu_and_mul_matcher(input)
+            result_quant = self.quant_matcher(result_silu_mul, scale)
+            return result_quant[0]
+
+        def replacement(
+            input: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            d = input.shape[-1] // 2
+            output_shape = input.shape[:-1] + (d,)
+            result = torch.empty(
+                output_shape, device=input.device, dtype=self.quant_dtype
+            )
+            at = auto_functionalized(
+                self.FUSED_OP, result=result, input=input, scale=scale
+            )
             return at[1]
 
         inputs = [
-            self.empty_quant(5, 4),  # result
-            empty_bf16(5, 4),  # result_silu_mul
-            empty_bf16(5, 4),  # input
-            empty_fp32(1, 1)  # scale
+            *self.silu_and_mul_matcher.inputs(),  # input
+            self.quant_matcher.inputs()[1],  # scale
         ]
+        pattern(*inputs)
 
         register_replacement(pattern, replacement, inputs, fwd_only, pm_pass)
 
@@ -117,36 +127,42 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern):
         super().__init__(kNvfp4Quant)
 
     def register(self, pm_pass: PatternMatcherPass):
+        def pattern(
+            result: torch.Tensor,
+            output_scale: torch.Tensor,
+            input: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            result_silu_mul = self.silu_and_mul_matcher(input)
+            at = auto_functionalized(
+                self.QUANT_OP,
+                output=result,
+                input=result_silu_mul,
+                output_scale=output_scale,
+                input_scale=scale,
+            )
+            return at[1], at[2]
 
-        def pattern(result: torch.Tensor, output_scale: torch.Tensor,
-                    result_silu_mul: torch.Tensor, input: torch.Tensor,
-                    scale: torch.Tensor):
-            at1 = auto_functionalized(SILU_MUL_OP,
-                                      result=result_silu_mul,
-                                      input=input)
-            at2 = auto_functionalized(self.QUANT_OP,
-                                      output=result,
-                                      input=at1[1],
-                                      output_scale=output_scale,
-                                      input_scale=scale)
-            return at2[1], at2[2]
-
-        def replacement(result: torch.Tensor, output_scale: torch.Tensor,
-                        result_silu_mul: torch.Tensor, input: torch.Tensor,
-                        scale: torch.Tensor):
-            at = auto_functionalized(self.FUSED_OP,
-                                     result=result,
-                                     result_block_scale=output_scale,
-                                     input=input,
-                                     input_global_scale=scale)
+        def replacement(
+            result: torch.Tensor,
+            output_scale: torch.Tensor,
+            input: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                result_block_scale=output_scale,
+                input=input,
+                input_global_scale=scale,
+            )
             return at[1], at[2]
 
         inputs = [
             self.empty_quant(5, 32),  # result
             empty_i32(128, 4),  # output_scale
-            empty_bf16(5, 64),  # result_silu_mul
             empty_bf16(5, 64),  # input
-            empty_fp32(1, 1)  # scale
+            empty_fp32(1, 1),  # scale
         ]
 
         register_replacement(pattern, replacement, inputs, fwd_only, pm_pass)
@@ -167,7 +183,8 @@ class ActivationQuantFusionPass(VllmPatternMatcherPass):
         super().__init__(config)
 
         self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="activation_quant_fusion_pass")
+            pass_name="activation_quant_fusion_pass"
+        )
 
         pattern_silu_mul_fp8 = SiluMulFp8StaticQuantPattern()
         pattern_silu_mul_fp8.register(self.patterns)
@@ -184,6 +201,9 @@ class ActivationQuantFusionPass(VllmPatternMatcherPass):
         logger.debug("Replaced %s patterns", self.matched_count)
 
     def uuid(self):
-        return VllmInductorPass.hash_source(self, ActivationQuantPattern,
-                                            SiluMulFp8StaticQuantPattern,
-                                            SiluMulNvfp4QuantPattern)
+        return VllmInductorPass.hash_source(
+            self,
+            ActivationQuantPattern,
+            SiluMulFp8StaticQuantPattern,
+            SiluMulNvfp4QuantPattern,
+        )
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 17fc727b8fc70f305b26b4eb1e085c466f312452..60ef6eef216639f74d79fd677df71e897cbb2879 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -3,25 +3,39 @@
 
 import ast
 import dataclasses
+import hashlib
+import operator
 import os
 import pprint
 import time
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from contextlib import contextmanager
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.fx as fx
 from torch._dispatch.python import enable_python_dispatcher
 
 import vllm.envs as envs
+from vllm.compilation.inductor_pass import pass_context
+from vllm.compilation.partition_rules import (
+    inductor_partition_rule_context,
+    should_split,
+)
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
-
-from .compiler_interface import (CompilerInterface, EagerAdaptor,
-                                 InductorAdaptor, InductorStandaloneAdaptor)
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+from .caching import VllmSerializableFunction
+from .compiler_interface import (
+    CompilerInterface,
+    EagerAdaptor,
+    InductorAdaptor,
+    InductorStandaloneAdaptor,
+    is_compile_cache_enabled,
+)
 from .counter import compilation_counter
 from .inductor_pass import InductorPass
 from .pass_manager import PostGradPassManager
@@ -30,18 +44,26 @@ logger = init_logger(__name__)
 
 
 def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
-    if compilation_config.use_inductor:
+    if compilation_config.backend == "inductor":
         # Use standalone compile only if requested, version is new enough,
         # and the symbol actually exists in this PyTorch build.
-        if (envs.VLLM_USE_STANDALONE_COMPILE
-                and is_torch_equal_or_newer("2.8.0.dev")
-                and hasattr(torch._inductor, "standalone_compile")):
+        if (
+            envs.VLLM_USE_STANDALONE_COMPILE
+            and is_torch_equal_or_newer("2.8.0.dev")
+            and hasattr(torch._inductor, "standalone_compile")
+        ):
             logger.debug("Using InductorStandaloneAdaptor")
-            return InductorStandaloneAdaptor()
+            return InductorStandaloneAdaptor(
+                compilation_config.compile_cache_save_format
+            )
         else:
             logger.debug("Using InductorAdaptor")
             return InductorAdaptor()
     else:
+        assert compilation_config.backend == "eager", (
+            "Custom backends not supported with CompilationMode.VLLM_COMPILE"
+        )
+
         logger.debug("Using EagerAdaptor")
         return EagerAdaptor()
 
@@ -62,7 +84,7 @@ class CompilerManager:
     """
 
     def __init__(self, compilation_config: CompilationConfig):
-        self.cache: dict[tuple[Optional[int], int, str], Any] = dict()
+        self.cache: dict[tuple[int | None, int, str], Any] = dict()
         self.is_cache_updated = False
         self.compilation_config = compilation_config
         self.compiler = make_compiler(compilation_config)
@@ -70,10 +92,23 @@ class CompilerManager:
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         return self.compiler.compute_hash(vllm_config)
 
-    def initialize_cache(self,
-                         cache_dir: str,
-                         disable_cache: bool = False,
-                         prefix: str = ""):
+    @contextmanager
+    def compile_context(self, runtime_shape: int | None = None):
+        """Provide compilation context for the duration of compilation to set
+        any torch global properties we want to scope to a single Inductor
+        compilation (e.g. partition rules, pass context)."""
+        with pass_context(runtime_shape):
+            if self.compilation_config.use_inductor_graph_partition:
+                with inductor_partition_rule_context(
+                    self.compilation_config.splitting_ops
+                ):
+                    yield
+            else:
+                yield
+
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
         """
         Initialize the cache directory for the compiler.
 
@@ -101,9 +136,9 @@ class CompilerManager:
                 # do not use eval(), it is unsafe.
                 self.cache = ast.literal_eval(f.read())
 
-        self.compiler.initialize_cache(cache_dir=cache_dir,
-                                       disable_cache=disable_cache,
-                                       prefix=prefix)
+        self.compiler.initialize_cache(
+            cache_dir=cache_dir, disable_cache=disable_cache, prefix=prefix
+        )
 
     def save_to_file(self):
         if self.disable_cache or not self.is_cache_updated:
@@ -113,35 +148,46 @@ class CompilerManager:
         with open(self.cache_file_path, "w") as f:
             f.write(data)
 
-    def load(self,
-             graph: fx.GraphModule,
-             example_inputs: list[Any],
-             graph_index: int,
-             runtime_shape: Optional[int] = None) -> Optional[Callable]:
+    def load(
+        self,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: int | None = None,
+    ) -> Callable | None:
         if (runtime_shape, graph_index, self.compiler.name) not in self.cache:
             return None
         handle = self.cache[(runtime_shape, graph_index, self.compiler.name)]
-        compiled_graph = self.compiler.load(handle, graph, example_inputs,
-                                            graph_index, runtime_shape)
+        compiled_graph = self.compiler.load(
+            handle, graph, example_inputs, graph_index, runtime_shape
+        )
         if runtime_shape is None:
             logger.debug(
-                "Directly load the %s-th graph for dynamic shape from %s via "
-                "handle %s", graph_index, self.compiler.name, handle)
+                "Directly load the %s-th graph for dynamic shape from %s via handle %s",
+                graph_index,
+                self.compiler.name,
+                handle,
+            )
         else:
             logger.debug(
-                "Directly load the %s-th graph for shape %s from %s via "
-                "handle %s", graph_index, str(runtime_shape),
-                self.compiler.name, handle)
+                "Directly load the %s-th graph for shape %s from %s via handle %s",
+                graph_index,
+                str(runtime_shape),
+                self.compiler.name,
+                handle,
+            )
         return compiled_graph
 
-    def compile(self,
-                graph: fx.GraphModule,
-                example_inputs,
-                additional_inductor_config,
-                compilation_config: CompilationConfig,
-                graph_index: int = 0,
-                num_graphs: int = 1,
-                runtime_shape: Optional[int] = None) -> Any:
+    def compile(
+        self,
+        graph: fx.GraphModule,
+        example_inputs,
+        additional_inductor_config,
+        compilation_config: CompilationConfig,
+        graph_index: int = 0,
+        num_graphs: int = 1,
+        runtime_shape: int | None = None,
+    ) -> Any:
         if graph_index == 0:
             # before compiling the first graph, record the start time
             global compilation_start_time
@@ -152,23 +198,27 @@ class CompilerManager:
         compiled_graph = None
 
         # try to load from the cache
-        compiled_graph = self.load(graph, example_inputs, graph_index,
-                                   runtime_shape)
+        compiled_graph = self.load(graph, example_inputs, graph_index, runtime_shape)
         if compiled_graph is not None:
             if graph_index == num_graphs - 1:
                 # after loading the last graph for this shape, record the time.
                 # there can be multiple graphs due to piecewise compilation.
                 now = time.time()
                 elapsed = now - compilation_start_time
+                compilation_config.compilation_time += elapsed
                 if runtime_shape is None:
                     logger.info(
                         "Directly load the compiled graph(s) for dynamic shape "
-                        "from the cache, took %.3f s", elapsed)
+                        "from the cache, took %.3f s",
+                        elapsed,
+                    )
                 else:
                     logger.info(
                         "Directly load the compiled graph(s) for shape %s "
-                        "from the cache, took %.3f s", str(runtime_shape),
-                        elapsed)
+                        "from the cache, took %.3f s",
+                        str(runtime_shape),
+                        elapsed,
+                    )
             return compiled_graph
 
         # no compiler cached the graph, or the cache is disabled,
@@ -177,37 +227,51 @@ class CompilerManager:
             # Let compile_fx generate a key for us
             maybe_key = None
         else:
-            maybe_key = \
-                f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
-        compiled_graph, handle = self.compiler.compile(
-            graph, example_inputs, additional_inductor_config, runtime_shape,
-            maybe_key)
+            maybe_key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
+
+        with self.compile_context(runtime_shape):
+            compiled_graph, handle = self.compiler.compile(
+                graph,
+                example_inputs,
+                additional_inductor_config,
+                runtime_shape,
+                maybe_key,
+            )
 
         assert compiled_graph is not None, "Failed to compile the graph"
 
         # store the artifact in the cache
-        if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
-            self.cache[(runtime_shape, graph_index,
-                        self.compiler.name)] = handle
+        if is_compile_cache_enabled(additional_inductor_config) and handle is not None:
+            self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle
             compilation_counter.num_cache_entries_updated += 1
             self.is_cache_updated = True
             if graph_index == 0:
                 # adds some info logging for the first graph
                 if runtime_shape is None:
-                    logger.info(
-                        "Cache the graph for dynamic shape for later use")
+                    logger.info_once(
+                        "Cache the graph for dynamic shape for later use", scope="local"
+                    )
                 else:
-                    logger.info("Cache the graph of shape %s for later use",
-                                str(runtime_shape))
+                    logger.info_once(
+                        "Cache the graph of shape %s for later use",
+                        str(runtime_shape),
+                        scope="local",
+                    )
             if runtime_shape is None:
                 logger.debug(
-                    "Store the %s-th graph for dynamic shape from %s via "
-                    "handle %s", graph_index, self.compiler.name, handle)
+                    "Store the %s-th graph for dynamic shape from %s via handle %s",
+                    graph_index,
+                    self.compiler.name,
+                    handle,
+                )
             else:
                 logger.debug(
                     "Store the %s-th graph for shape %s from %s via handle %s",
-                    graph_index, str(runtime_shape), self.compiler.name,
-                    handle)
+                    graph_index,
+                    str(runtime_shape),
+                    self.compiler.name,
+                    handle,
+                )
 
         # after compiling the last graph, record the end time
         if graph_index == num_graphs - 1:
@@ -215,11 +279,18 @@ class CompilerManager:
             elapsed = now - compilation_start_time
             compilation_config.compilation_time += elapsed
             if runtime_shape is None:
-                logger.info("Compiling a graph for dynamic shape takes %.2f s",
-                            elapsed)
+                logger.info_once(
+                    "Compiling a graph for dynamic shape takes %.2f s",
+                    elapsed,
+                    scope="local",
+                )
             else:
-                logger.info("Compiling a graph for shape %s takes %.2f s",
-                            runtime_shape, elapsed)
+                logger.info_once(
+                    "Compiling a graph for shape %s takes %.2f s",
+                    runtime_shape,
+                    elapsed,
+                    scope="local",
+                )
 
         return compiled_graph
 
@@ -232,16 +303,30 @@ class SplitItem:
     graph: fx.GraphModule
 
 
-def split_graph(graph: fx.GraphModule,
-                ops: list[str]) -> tuple[fx.GraphModule, list[SplitItem]]:
+def split_graph(
+    graph: fx.GraphModule, splitting_ops: list[str]
+) -> tuple[fx.GraphModule, list[SplitItem]]:
     # split graph by ops
     subgraph_id = 0
-    node_to_subgraph_id = {}
-    split_op_graphs = []
+    node_to_subgraph_id: dict[fx.Node, int] = {}
+    split_op_graphs: list[int] = []
     for node in graph.graph.nodes:
         if node.op in ("output", "placeholder"):
             continue
-        if node.op == 'call_function' and str(node.target) in ops:
+
+        # Check if this is a getitem operation on a node from an earlier subgraph.
+        # If so, assign it to the same subgraph as its input to avoid passing entire
+        # tuple as input to submodules, which is against standalone_compile and
+        # AoTAutograd input requirement.
+        if node.op == "call_function" and node.target == operator.getitem:
+            # Assign this getitem to the same subgraph as its input
+            input_node = node.args[0]
+            if input_node.op != "placeholder":
+                assert input_node in node_to_subgraph_id
+                node_to_subgraph_id[node] = node_to_subgraph_id[input_node]
+                continue
+
+        if should_split(node, splitting_ops):
             subgraph_id += 1
             node_to_subgraph_id[node] = subgraph_id
             split_op_graphs.append(subgraph_id)
@@ -254,10 +339,8 @@ def split_graph(graph: fx.GraphModule,
     # the semantics of the graph will change when we
     # have mutations in the graph
     split_gm = torch.fx.passes.split_module.split_module(
-        graph,
-        None,
-        lambda node: node_to_subgraph_id[node],
-        keep_original_order=True)
+        graph, None, lambda node: node_to_subgraph_id[node], keep_original_order=True
+    )
 
     outputs = []
 
@@ -271,8 +354,7 @@ def split_graph(graph: fx.GraphModule,
         module = getattr(split_gm, name)
 
         graph_id = int(name.replace("submod_", ""))
-        outputs.append(
-            SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
+        outputs.append(SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
 
     # sort by integer graph_id, rather than string name
     outputs.sort(key=lambda x: x.graph_id)
@@ -295,11 +377,16 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
     has some special cudagraph output handling.
     """
 
-    def __init__(self, module: torch.fx.GraphModule,
-                 compile_submod_names: list[str], vllm_config: VllmConfig,
-                 vllm_backend: "VllmBackend"):
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        compile_submod_names: list[str],
+        vllm_config: VllmConfig,
+        vllm_backend: "VllmBackend",
+    ):
         super().__init__(module)
         from torch._guards import detect_fake_mode
+
         self.fake_mode = detect_fake_mode()
         self.compile_submod_names = compile_submod_names
         self.compilation_config = vllm_config.compilation_config
@@ -316,9 +403,12 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
         with self.fake_mode, enable_python_dispatcher():
             return super().run(*fake_args)
 
-    def call_module(self, target: torch.fx.node.Target,
-                    args: tuple[torch.fx.node.Argument,
-                                ...], kwargs: dict[str, Any]) -> Any:
+    def call_module(
+        self,
+        target: torch.fx.node.Target,
+        args: tuple[torch.fx.node.Argument, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
         assert isinstance(target, str)
         output = super().call_module(target, args, kwargs)
 
@@ -330,26 +420,34 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
             ]
             global compilation_start_time
 
-            compiled_graph_for_dynamic_shape = self.vllm_backend.\
-                compiler_manager.compile(
-                submod,
-                args,
-                self.compilation_config.inductor_compile_config,
-                self.compilation_config,
-                graph_index=index,
-                num_graphs=len(self.compile_submod_names),
-                runtime_shape=None)
+            compiled_graph_for_dynamic_shape = (
+                self.vllm_backend.compiler_manager.compile(
+                    submod,
+                    args,
+                    self.compilation_config.inductor_compile_config,
+                    self.compilation_config,
+                    graph_index=index,
+                    num_graphs=len(self.compile_submod_names),
+                    runtime_shape=None,
+                )
+            )
             # Lazy import here to avoid circular import
-            from .cuda_piecewise_backend import PiecewiseBackend
+            from .piecewise_backend import PiecewiseBackend
 
             piecewise_backend = PiecewiseBackend(
-                submod, self.vllm_config, index,
-                len(self.compile_submod_names), sym_shape_indices,
-                compiled_graph_for_dynamic_shape, self.vllm_backend)
+                submod,
+                self.vllm_config,
+                index,
+                len(self.compile_submod_names),
+                sym_shape_indices,
+                compiled_graph_for_dynamic_shape,
+                self.vllm_backend,
+            )
 
-            if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                    and
-                    not self.compilation_config.use_inductor_graph_partition):
+            if (
+                self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
+                and not self.compilation_config.use_inductor_graph_partition
+            ):
                 # We're using Dynamo-based piecewise splitting, so we wrap
                 # the whole subgraph with a static graph wrapper.
                 from .cuda_graph import CUDAGraphOptions
@@ -357,7 +455,8 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
                 # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
                 # class) as platform dependent.
                 static_graph_wrapper_class = resolve_obj_by_qualname(
-                    current_platform.get_static_graph_wrapper_cls())
+                    current_platform.get_static_graph_wrapper_cls()
+                )
 
                 # Always assign PIECEWISE runtime mode to the
                 # CUDAGraphWrapper for piecewise_backend, to distinguish
@@ -370,7 +469,9 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
                     cudagraph_options=CUDAGraphOptions(
                         debug_log_enable=piecewise_backend.is_first_graph,
                         gc_disable=not piecewise_backend.is_first_graph,
-                        weak_ref_output=piecewise_backend.is_last_graph))
+                        weak_ref_output=piecewise_backend.is_last_graph,
+                    ),
+                )
             else:
                 self.module.__dict__[target] = piecewise_backend
 
@@ -388,8 +489,9 @@ model_tag: str = "backbone"
 def set_model_tag(tag: str):
     """Context manager to set the model tag."""
     global model_tag
-    assert tag != model_tag, \
+    assert tag != model_tag, (
         f"Model tag {tag} is the same as the current tag {model_tag}."
+    )
     old_tag = model_tag
     model_tag = tag
     try:
@@ -400,7 +502,7 @@ def set_model_tag(tag: str):
 
 class VllmBackend:
     """The compilation backend for `torch.compile` with vLLM.
-    It is used for compilation level of `CompilationLevel.PIECEWISE`,
+    It is used for compilation mode of `CompilationMode.VLLM_COMPILE`,
     where we customize the compilation.
 
     The major work of this backend is to split the graph into
@@ -430,7 +532,6 @@ class VllmBackend:
         vllm_config: VllmConfig,
         prefix: str = "",
     ):
-
         # if the model is initialized with a non-empty prefix,
         # then usually it's enough to use that prefix,
         # e.g. language_model, vision_model, etc.
@@ -449,7 +550,8 @@ class VllmBackend:
         self.compilation_config = vllm_config.compilation_config
 
         self.compiler_manager: CompilerManager = CompilerManager(
-            self.compilation_config)
+            self.compilation_config
+        )
 
         # `torch.compile` is JIT compiled, so we don't need to
         # do anything here
@@ -465,15 +567,20 @@ class VllmBackend:
         if PASS_KEY in inductor_config:
             if isinstance(inductor_config[PASS_KEY], PostGradPassManager):
                 # PassManager already added to config, make sure it's correct
-                assert (inductor_config[PASS_KEY].uuid() ==
-                        self.post_grad_pass_manager.uuid())
+                assert (
+                    inductor_config[PASS_KEY].uuid()
+                    == self.post_grad_pass_manager.uuid()
+                )
             else:
                 # Config should automatically wrap all inductor passes
                 assert isinstance(inductor_config[PASS_KEY], InductorPass)
                 self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
 
-    def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+    def __call__(
+        self, graph: fx.GraphModule, example_inputs
+    ) -> VllmSerializableFunction:
+        from .caching import _compute_code_hash, compilation_config_hash_factors
 
         vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
@@ -482,37 +589,11 @@ class VllmBackend:
             # the cache dir will be the same so that we can reuse the compiled
             # graph.
 
-            factors = []
-            # 0. factors come from the env, for example, The values of
-            # VLLM_PP_LAYER_PARTITION will affect the computation graph.
-            env_hash = envs.compute_hash()
-            factors.append(env_hash)
-
-            # 1. factors come from the vllm_config (it mainly summarizes how the
-            #    model is created)
-            config_hash = vllm_config.compute_hash()
-            factors.append(config_hash)
-
+            factors = compilation_config_hash_factors(vllm_config)
             # 2. factors come from the code files that are traced by Dynamo (
             #    it mainly summarizes how the model is used in forward pass)
-            forward_code_files = list(
-                sorted(self.compilation_config.traced_files))
+            code_hash = _compute_code_hash(self.compilation_config.traced_files)
             self.compilation_config.traced_files.clear()
-            logger.debug(
-                "Traced files (to be considered for compilation cache):\n%s",
-                "\n".join(forward_code_files))
-            hash_content = []
-            for filepath in forward_code_files:
-                hash_content.append(filepath)
-                if filepath == "<string>":
-                    # This means the function was dynamically generated, with
-                    # e.g. exec(). We can't actually check these.
-                    continue
-                with open(filepath) as f:
-                    hash_content.append(f.read())
-            import hashlib
-            code_hash = hashlib.md5("\n".join(hash_content).encode(),
-                                    usedforsecurity=False).hexdigest()
             factors.append(code_hash)
 
             # 3. compiler hash
@@ -520,8 +601,9 @@ class VllmBackend:
             factors.append(compiler_hash)
 
             # combine all factors to generate the cache dir
-            hash_key = hashlib.md5(str(factors).encode(),
-                                   usedforsecurity=False).hexdigest()[:10]
+            hash_key = hashlib.md5(
+                str(factors).encode(), usedforsecurity=False
+            ).hexdigest()[:10]
 
             cache_dir = os.path.join(
                 envs.VLLM_CACHE_ROOT,
@@ -535,28 +617,36 @@ class VllmBackend:
         self.compilation_config.cache_dir = cache_dir
         rank = vllm_config.parallel_config.rank
         dp_rank = vllm_config.parallel_config.data_parallel_rank
-        local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}",
-                                       self.prefix)
+        local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}", self.prefix)
         os.makedirs(local_cache_dir, exist_ok=True)
         self.compilation_config.local_cache_dir = local_cache_dir
 
-        disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE
+        disable_cache = not is_compile_cache_enabled(
+            self.compilation_config.inductor_compile_config
+        )
 
         if disable_cache:
-            logger.info("vLLM's torch.compile cache is disabled.")
+            logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
         else:
-            logger.info("Using cache directory: %s for vLLM's torch.compile",
-                        local_cache_dir)
+            logger.info_once(
+                "Using cache directory: %s for vLLM's torch.compile",
+                local_cache_dir,
+                scope="local",
+            )
 
-        self.compiler_manager.initialize_cache(local_cache_dir, disable_cache,
-                                               self.prefix)
+        self.compiler_manager.initialize_cache(
+            local_cache_dir, disable_cache, self.prefix
+        )
 
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
         compilation_counter.num_graphs_seen += 1
         from .monitor import torch_compile_start_time
+
         dynamo_time = time.time() - torch_compile_start_time
-        logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time)
+        logger.info_once(
+            "Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local"
+        )
         self.compilation_config.compilation_time += dynamo_time
 
         # we control the compilation process, each instance can only be
@@ -566,8 +656,13 @@ class VllmBackend:
         self.graph = graph
         self.configure_post_pass()
 
-        self.split_gm, self.piecewise_graphs = split_graph(
-            graph, self.compilation_config.splitting_ops)
+        if self.compilation_config.use_inductor_graph_partition:
+            # Let Inductor decide partitioning; avoid FX-level pre-splitting.
+            fx_split_ops: list[str] = []
+        else:
+            fx_split_ops = self.compilation_config.splitting_ops or []
+
+        self.split_gm, self.piecewise_graphs = split_graph(graph, fx_split_ops)
 
         from torch._dynamo.utils import lazy_format_graph_code
 
@@ -576,39 +671,49 @@ class VllmBackend:
         lazy_format_graph_code("before split", self.graph)
         lazy_format_graph_code("after split", self.split_gm)
 
-        compilation_counter.num_piecewise_graphs_seen += len(
-            self.piecewise_graphs)
+        compilation_counter.num_piecewise_graphs_seen += len(self.piecewise_graphs)
         submod_names_to_compile = [
-            item.submod_name for item in self.piecewise_graphs
+            item.submod_name
+            for item in self.piecewise_graphs
             if not item.is_splitting_graph
         ]
 
         # propagate the split graph to the piecewise backend,
         # compile submodules with symbolic shapes
-        PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
-                                    self.vllm_config,
-                                    self).run(*example_inputs)
+        PiecewiseCompileInterpreter(
+            self.split_gm, submod_names_to_compile, self.vllm_config, self
+        ).run(*example_inputs)
 
         graph_path = os.path.join(local_cache_dir, "computation_graph.py")
         if not os.path.exists(graph_path):
-            # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
+            # code adapted from
+            # https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30
             # use `print_readable` because it can include submodules
-            src = "from __future__ import annotations\nimport torch\n" + \
-                self.split_gm.print_readable(print_output=False)
+            src = (
+                "from __future__ import annotations\nimport torch\n"
+                + self.split_gm.print_readable(print_output=False)
+            )
             src = src.replace("<lambda>", "GraphModule")
             with open(graph_path, "w") as f:
                 f.write(src)
 
-            logger.debug("Computation graph saved to %s", graph_path)
+            logger.debug_once(
+                "Computation graph saved to %s", graph_path, scope="local"
+            )
 
         self._called = True
 
-        if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
-            not self.compilation_config.cudagraph_copy_inputs:
-            return self.split_gm
+        if (
+            self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
+            or not self.compilation_config.cudagraph_copy_inputs
+        ):
+            return VllmSerializableFunction(
+                graph, example_inputs, self.prefix, self.split_gm
+            )
 
         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode
+
         fake_mode = detect_fake_mode()
         fake_args = [
             fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
@@ -619,10 +724,12 @@ class VllmBackend:
         # for weights and static buffers, they will have concrete shapes.
         # symbolic shape only happens for input tensors.
         from torch.fx.experimental.symbolic_shapes import is_symbolic
+
         self.sym_tensor_indices = [
-            i for i, x in enumerate(fake_args)
-            if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \
-                any(is_symbolic(d) for d in x.size())
+            i
+            for i, x in enumerate(fake_args)
+            if isinstance(x, torch._subclasses.fake_tensor.FakeTensor)
+            and any(is_symbolic(d) for d in x.size())
         ]
 
         # compiler managed cudagraph input buffers
@@ -647,4 +754,6 @@ class VllmBackend:
                 list_args[index] = static_tensor
             return self.split_gm(*list_args)
 
-        return copy_and_call
+        return VllmSerializableFunction(
+            graph, example_inputs, self.prefix, copy_and_call
+        )
diff --git a/vllm/compilation/base_static_graph.py b/vllm/compilation/base_static_graph.py
index 6ee82e74963d96e12fb9c2a909687c6fa81fbd6b..12f1ff5bc04478f97e2b9b5a12c8c39ad0e1961d 100644
--- a/vllm/compilation/base_static_graph.py
+++ b/vllm/compilation/base_static_graph.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Protocol
+from collections.abc import Callable
+from typing import Any, Protocol
 
 from vllm.config import CUDAGraphMode, VllmConfig
 
diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e34c2711e9f37f58705064aaf3ea44c77c15a4
--- /dev/null
+++ b/vllm/compilation/caching.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+import inspect
+import os
+import pickle
+from unittest.mock import patch
+
+import torch
+from torch.utils import _pytree as pytree
+
+import vllm.envs as envs
+from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
+
+try:
+    from torch._dynamo.aot_compile import SerializableCallable
+except ImportError:
+    SerializableCallable = object
+
+assert isinstance(SerializableCallable, type)
+
+logger = init_logger(__name__)
+
+
+class VllmSerializableFunction(SerializableCallable):
+    """
+    A wrapper around a compiled function by vllm. It will forward the tensor
+    inputs to the compiled function and return the result.
+    It also implements a serialization interface to support PyTorch's precompile
+    with custom backend, so that we can save and load the compiled function on
+    disk. There's no need to wrap around the compiled function if we don't want
+    to serialize them in particular cases.
+    Right now serialization for the custom backend is done via
+    serializing the Dynamo fx graph plus example inputs.
+    """
+
+    def __init__(self, graph_module, example_inputs, prefix, optimized_call):
+        assert isinstance(graph_module, torch.fx.GraphModule)
+        self.graph_module = graph_module
+        self.example_inputs = example_inputs
+        self.prefix = prefix
+        self.optimized_call = optimized_call
+        self.shape_env = None
+        sym_input = next(
+            (i for i in self.example_inputs if isinstance(i, torch.SymInt)), None
+        )
+        if sym_input is not None:
+            self.shape_env = sym_input.node.shape_env
+
+    def __call__(self, *args, **kwargs):
+        return self.optimized_call(*args, **kwargs)
+
+    @classmethod
+    def serialize_compile_artifacts(
+        cls, compiled_fn: "VllmSerializableFunction"
+    ) -> bytes:
+        import sympy
+        from torch._subclasses import FakeTensorMode
+        from torch.fx._graph_pickler import GraphPickler, Options
+
+        state = compiled_fn.__dict__.copy()
+        state.pop("optimized_call")
+        state.pop("shape_env")
+        for node in state["graph_module"].graph.nodes:
+            node.meta.pop("source_fn_stack", None)
+            node.meta.pop("nn_module_stack", None)
+
+        graph_reducer_override = GraphPickler.reducer_override
+
+        def _graph_reducer_override(self, obj):
+            if (
+                inspect.isclass(obj)
+                and issubclass(obj, sympy.Function)
+                and hasattr(obj, "_torch_unpickler")
+            ):
+                return obj._torch_unpickler, (obj._torch_handler_name,)
+            if isinstance(obj, FakeTensorMode):
+                return type(None), ()
+            return graph_reducer_override(self, obj)
+
+        # Mask off tensor inputs since they are large and not needed.
+        state["example_inputs"] = pytree.tree_map_only(
+            torch.Tensor, lambda _: None, state["example_inputs"]
+        )
+        with patch.object(GraphPickler, "reducer_override", _graph_reducer_override):
+            state["graph_module"] = GraphPickler.dumps(
+                state["graph_module"], Options(ops_filter=None)
+            )
+            state["example_inputs"] = GraphPickler.dumps(state["example_inputs"])
+        return pickle.dumps(state)
+
+    @classmethod
+    def deserialize_compile_artifacts(cls, data: bytes) -> "VllmSerializableFunction":
+        from torch._guards import TracingContext, tracing
+        from torch._subclasses import FakeTensorMode
+        from torch.fx._graph_pickler import GraphPickler
+        from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+        from vllm.compilation.backends import VllmBackend
+
+        state = pickle.loads(data)
+        fake_mode = FakeTensorMode(shape_env=ShapeEnv())
+        state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
+        state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode)
+        vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"])
+
+        def optimized_call(*example_inputs):
+            """
+            On the first run of the optimized call, we rerun the compiler
+            backend which should result in a cache hit. After the backend
+            call returns, we just do a one-time replacement of the optimized
+            call with the compiled function, so that subsequent calls are on
+            the AOT compiled path.
+            """
+            compile_inputs = [
+                inp or example_inputs[i] for i, inp in enumerate(fn.example_inputs)
+            ]
+            with tracing(TracingContext(fake_mode)):
+                fn.optimized_call = vllm_backend(
+                    state["graph_module"], compile_inputs
+                ).optimized_call
+            return fn.optimized_call(*example_inputs)
+
+        fn = cls(**state, optimized_call=optimized_call)
+        return fn
+
+    @property
+    def co_name(self):
+        """
+        Used for depyf debugging.
+        """
+        return "VllmSerializableFunction"
+
+
+def compilation_config_hash_factors(vllm_config: VllmConfig) -> list[str]:
+    factors = []
+    # 0. factors come from the env, for example, The values of
+    # VLLM_PP_LAYER_PARTITION will affect the computation graph.
+    env_hash = envs.compute_hash()
+    factors.append(env_hash)
+
+    # 1. factors come from the vllm_config (it mainly summarizes how the
+    #    model is created)
+    config_hash = vllm_config.compute_hash()
+    factors.append(config_hash)
+    return factors
+
+
+def _compute_code_hash_with_content(file_contents: dict[str, str]) -> str:
+    items = list(sorted(file_contents.items(), key=lambda x: x[0]))
+    hash_content = []
+    for filepath, content in items:
+        hash_content.append(filepath)
+        if filepath == "<string>":
+            # This means the function was dynamically generated, with
+            # e.g. exec(). We can't actually check these.
+            continue
+        hash_content.append(content)
+    return hashlib.md5(
+        "\n".join(hash_content).encode(), usedforsecurity=False
+    ).hexdigest()
+
+
+def _compute_code_hash(files: set[str]) -> str:
+    logger.debug(
+        "Traced files (to be considered for compilation cache):\n%s", "\n".join(files)
+    )
+    file_contents = {}
+    for filepath in files:
+        # Skip files that don't exist (e.g., <string>, <frozen modules>, etc.)
+        if not os.path.isfile(filepath):
+            file_contents[filepath] = ""
+        else:
+            with open(filepath) as f:
+                file_contents[filepath] = f.read()
+    return _compute_code_hash_with_content(file_contents)
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 04b76a9c2d228ae84fbf899bb56a19e3211a504f..69d4606d73ebda9e293e947a89bffd3e560e5bc5 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from importlib.util import find_spec
-from typing import Optional
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -10,16 +9,21 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch.distributed._symmetric_memory import enable_symm_mem_for_group
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8StaticTensorSym,
+)
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 
 from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -27,8 +31,12 @@ FP8_DTYPE = current_platform.fp8_dtype()
 if find_spec("flashinfer"):
     try:
         import flashinfer.comm as flashinfer_comm
-        flashinfer_comm = (flashinfer_comm if hasattr(
-            flashinfer_comm, "trtllm_allreduce_fusion") else None)
+
+        flashinfer_comm = (
+            flashinfer_comm
+            if hasattr(flashinfer_comm, "trtllm_allreduce_fusion")
+            else None
+        )
     except ImportError:
         flashinfer_comm = None
 else:
@@ -36,15 +44,11 @@ else:
 
 logger = init_logger(__name__)
 
-ALLREDUCE_OP = torch.ops.vllm.all_reduce.default
-RMS_OP = torch.ops._C.rms_norm.default
-RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
-STATIC_FP8_QUANT_OP = torch.ops._C.static_scaled_fp8_quant.default
-STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default
+if hasattr(torch.ops._C, "scaled_fp4_quant"):
+    STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default
 
 
 class BasePattern:
-
     def __init__(self, dtype: torch.dtype, device: str):
         self.dtype = dtype
         self.device = device
@@ -53,14 +57,12 @@ class BasePattern:
 
 
 class GEMMReduceScatterPattern(BasePattern):
-
     def get_inputs(self):
         mul = torch.empty([16, 4], device=self.device, dtype=self.dtype)
         mm_weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         return [mul, mm_weight]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(mul: torch.Tensor, mm_weight: torch.Tensor):
             mm = torch.ops.aten.mm.default(mul, mm_weight)
             reduce_scatter = torch.ops.vllm.reduce_scatter.default(
@@ -82,12 +84,12 @@ class GEMMReduceScatterPattern(BasePattern):
 
             return gemm_rs
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllGatherGEMMPattern(BasePattern):
-
     def get_inputs(self):
         x = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
@@ -95,7 +97,6 @@ class AllGatherGEMMPattern(BasePattern):
         return [x, weight]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
             x: torch.Tensor,
             weight: torch.Tensor,
@@ -110,8 +111,8 @@ class AllGatherGEMMPattern(BasePattern):
             return torch.ops.aten.mm.default(all_gather, weight)
 
         def replacement(
-                x: torch.Tensor,
-                weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            x: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
             ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_matmul(
                 x,
                 [weight],
@@ -120,65 +121,87 @@ class AllGatherGEMMPattern(BasePattern):
             )
             return mm_outputs
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class ScaledMMReduceScatterPattern(BasePattern):
-
     def get_inputs(self):
         input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
-        mm_weight = torch.empty([16, 16], device=self.device,
-                                dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+        mm_weight = (
+            torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
         scale_a = torch.empty([16, 1], device=self.device, dtype=torch.float32)
         scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
         return [input, mm_weight, scale_a, scale_b]
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(input: torch.Tensor, mat2: torch.Tensor,
-                    scale_a: torch.Tensor,
-                    scale_b: torch.Tensor) -> torch.Tensor:
-            scaled_mm = torch.ops.aten._scaled_mm.default(input,
-                                                          mat2=mat2,
-                                                          scale_a=scale_a,
-                                                          scale_b=scale_b,
-                                                          bias=None,
-                                                          scale_result=None,
-                                                          out_dtype=self.dtype)
+        def pattern(
+            input: torch.Tensor,
+            mat2: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+        ) -> torch.Tensor:
+            scaled_mm = torch.ops.aten._scaled_mm.default(
+                input,
+                mat2=mat2,
+                scale_a=scale_a,
+                scale_b=scale_b,
+                bias=None,
+                scale_result=None,
+                out_dtype=self.dtype,
+            )
             reduce_scatter = torch.ops.vllm.reduce_scatter.default(
                 scaled_mm,
                 dim=0,
                 world_size=self.tp_size,
-                group_name=self.tp.unique_name)
+                group_name=self.tp.unique_name,
+            )
             return reduce_scatter
 
-        def replacement(input: torch.Tensor, mat2: torch.Tensor,
-                        scale_a: torch.Tensor,
-                        scale_b: torch.Tensor) -> torch.Tensor:
-            gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
+        def replacement(
+            input: torch.Tensor,
+            mat2: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+        ) -> torch.Tensor:
+            # Calculate output shape: input @ mat2 with scatter_dim reduced
+            output_shape = [*input.shape[:-1], mat2.shape[1]]
+            scatter_dim = 0
+            gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter(
                 input,
                 mat2,
                 scale_a,
                 scale_b,
                 "avg",
-                scatter_dim=0,
-                out_dtype=self.dtype,
-                group_name=self.tp.device_group.group_name,
+                scatter_dim,  # orig_scatter_dim
+                scatter_dim,  # scatter_dim_after_maybe_reshape
+                self.tp.device_group.group_name,
+                output_shape,
+                None,  # bias
+                None,  # result_scale
+                self.dtype,  # out_dtype
+                False,  # use_fast_accum
             )
 
             return gemm_rs
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllGatherScaledMMPattern(BasePattern):
-
     def get_inputs(self):
         x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
-        weight = torch.empty([16, 16], device=self.device,
-                             dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+        weight = (
+            torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
 
         s1 = x.shape[0] * self.tp_size
 
@@ -188,7 +211,6 @@ class AllGatherScaledMMPattern(BasePattern):
         return [x, weight, scale_a, scale_b]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
             x: torch.Tensor,
             weight: torch.Tensor,
@@ -196,22 +218,25 @@ class AllGatherScaledMMPattern(BasePattern):
             scale_b: torch.Tensor,
         ) -> torch.Tensor:
             all_gather = torch.ops.vllm.all_gather.default(
-                x,
-                dim=0,
-                world_size=self.tp_size,
-                group_name=self.tp.unique_name)
-
-            return torch.ops.aten._scaled_mm.default(all_gather,
-                                                     mat2=weight,
-                                                     scale_a=scale_a,
-                                                     scale_b=scale_b,
-                                                     bias=None,
-                                                     scale_result=None,
-                                                     out_dtype=self.dtype)
-
-        def replacement(x: torch.Tensor, weight: torch.Tensor,
-                        scale_a: torch.Tensor,
-                        scale_b: torch.Tensor) -> torch.Tensor:
+                x, dim=0, world_size=self.tp_size, group_name=self.tp.unique_name
+            )
+
+            return torch.ops.aten._scaled_mm.default(
+                all_gather,
+                mat2=weight,
+                scale_a=scale_a,
+                scale_b=scale_b,
+                bias=None,
+                scale_result=None,
+                out_dtype=self.dtype,
+            )
+
+        def replacement(
+            x: torch.Tensor,
+            weight: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+        ) -> torch.Tensor:
             ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul(  # noqa
                 x,
                 [weight],
@@ -226,29 +251,33 @@ class AllGatherScaledMMPattern(BasePattern):
             )
             return mm_outputs
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class CutlassScaledMMReduceScatterPattern(BasePattern):
-
     def get_inputs(self):
         input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
-        mm_weight = torch.empty([16, 16], device=self.device,
-                                dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+        mm_weight = (
+            torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
         scale_a = torch.empty([16, 1], device=self.device, dtype=torch.float32)
         scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
 
-        cutlass_mm_output = torch.empty([16, 16],
-                                        device=self.device,
-                                        dtype=self.dtype)
+        cutlass_mm_output = torch.empty([16, 16], device=self.device, dtype=self.dtype)
         return [input, mm_weight, scale_a, scale_b, cutlass_mm_output]
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(input: torch.Tensor, weight: torch.Tensor,
-                    scale_a: torch.Tensor, scale_b: torch.Tensor,
-                    cutlass_mm_output: torch.Tensor) -> torch.Tensor:
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+            cutlass_mm_output: torch.Tensor,
+        ) -> torch.Tensor:
             cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized(
                 torch.ops._C.cutlass_scaled_mm.default,
                 out=cutlass_mm_output,
@@ -256,41 +285,58 @@ class CutlassScaledMMReduceScatterPattern(BasePattern):
                 b=weight,
                 a_scales=scale_a,
                 b_scales=scale_b,
-                bias=None)
+                bias=None,
+            )
 
             reduce_scatter = torch.ops.vllm.reduce_scatter.default(
                 cutlass_scaled_mm[1],
                 dim=0,
                 world_size=self.tp_size,
-                group_name=self.tp.unique_name)
+                group_name=self.tp.unique_name,
+            )
             return reduce_scatter
 
-        def replacement(input: torch.Tensor, mat2: torch.Tensor,
-                        scale_a: torch.Tensor, scale_b: torch.Tensor,
-                        cutlass_mm_output: torch.Tensor) -> torch.Tensor:
-            gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
+        def replacement(
+            input: torch.Tensor,
+            mat2: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+            cutlass_mm_output: torch.Tensor,
+        ) -> torch.Tensor:
+            # Calculate output shape: input @ mat2 with scatter_dim reduced
+            output_shape = [*input.shape[:-1], mat2.shape[1]]
+            scatter_dim = 0
+            gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter(
                 input,
                 mat2,
                 scale_a,
                 scale_b,
                 "avg",
-                scatter_dim=0,
-                out_dtype=self.dtype,
-                group_name=self.tp.device_group.group_name,
+                scatter_dim,  # orig_scatter_dim
+                scatter_dim,  # scatter_dim_after_maybe_reshape
+                self.tp.device_group.group_name,
+                output_shape,
+                None,  # bias
+                None,  # result_scale
+                self.dtype,  # out_dtype
+                False,  # use_fast_accum
             )
 
             return gemm_rs
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllGatherCutlassScaledMMPattern(BasePattern):
-
     def get_inputs(self):
         x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
-        weight = torch.empty([16, 16], device=self.device,
-                             dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+        weight = (
+            torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
 
         s1 = x.shape[0] * self.tp_size
 
@@ -303,7 +349,6 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
         return [x, weight, scale_a, scale_b, output]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
             x: torch.Tensor,
             weight: torch.Tensor,
@@ -312,10 +357,8 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
             output: torch.Tensor,
         ) -> torch.Tensor:
             all_gather = torch.ops.vllm.all_gather.default(
-                x,
-                dim=0,
-                world_size=self.tp_size,
-                group_name=self.tp.unique_name)
+                x, dim=0, world_size=self.tp_size, group_name=self.tp.unique_name
+            )
 
             cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized(
                 torch.ops._C.cutlass_scaled_mm.default,
@@ -324,12 +367,17 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
                 b=weight,
                 a_scales=scale_a,
                 b_scales=scale_b,
-                bias=None)
+                bias=None,
+            )
             return cutlass_scaled_mm[1]
 
-        def replacement(x: torch.Tensor, weight: torch.Tensor,
-                        scale_a: torch.Tensor, scale_b: torch.Tensor,
-                        output: torch.Tensor) -> torch.Tensor:
+        def replacement(
+            x: torch.Tensor,
+            weight: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+            output: torch.Tensor,
+        ) -> torch.Tensor:
             ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul(  # noqa
                 x,
                 [weight],
@@ -344,12 +392,12 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
             )
             return mm_outputs
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AsyncTPPass(VllmPatternMatcherPass):
-
     @enable_fake_mode
     def __init__(self, config: VllmConfig):
         super().__init__(config)
@@ -357,31 +405,41 @@ class AsyncTPPass(VllmPatternMatcherPass):
         # Enable symmetric memory for the TP process group
         enable_symm_mem_for_group(get_tp_group().device_group.group_name)
         self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="async_tp_pass")
-        GEMMReduceScatterPattern(self.model_dtype,
-                                 self.device).register(self.patterns)
+            pass_name="async_tp_pass"
+        )
+        GEMMReduceScatterPattern(self.model_dtype, self.device).register(self.patterns)
 
-        AllGatherGEMMPattern(self.model_dtype,
-                             self.device).register(self.patterns)
+        AllGatherGEMMPattern(self.model_dtype, self.device).register(self.patterns)
 
         # These fusions are enabled only for bfloat16 models because
         # `scaled_mm` or `cutlass_scaled_mm` with per-token (row-wise) scaling
         # only supports bfloat16 as the output dtype.
         if self.model_dtype == torch.bfloat16:
-            ScaledMMReduceScatterPattern(self.model_dtype,
-                                         self.device).register(self.patterns)
-            AllGatherScaledMMPattern(self.model_dtype,
-                                     self.device).register(self.patterns)
+            ScaledMMReduceScatterPattern(self.model_dtype, self.device).register(
+                self.patterns
+            )
+            AllGatherScaledMMPattern(self.model_dtype, self.device).register(
+                self.patterns
+            )
 
-            CutlassScaledMMReduceScatterPattern(
-                self.model_dtype, self.device).register(self.patterns)
-            AllGatherCutlassScaledMMPattern(
-                self.model_dtype, self.device).register(self.patterns)
+            CutlassScaledMMReduceScatterPattern(self.model_dtype, self.device).register(
+                self.patterns
+            )
+            AllGatherCutlassScaledMMPattern(self.model_dtype, self.device).register(
+                self.patterns
+            )
 
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
-        # only do replace for specific shapes
+    def is_applicable(self, shape: int | None) -> bool:
+        # This pass is applied on top of the sequence parallelism pass.
+        # It inherits the same applicability condition as `SequenceParallelismPass`.
+        # See `SequenceParallelismPass.is_applicable` for more details.
+        if (
+            not self.compilation_config.splitting_ops
+            or self.compilation_config.use_inductor_graph_partition
+        ):
+            return True
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
 
@@ -391,33 +449,41 @@ class AsyncTPPass(VllmPatternMatcherPass):
         logger.debug("Replaced %s patterns", self.matched_count)
 
 
+# Max size of the input tensor per world size per device capability
+# to use flashinfer fused allreduce
+FI_ALLREDUCE_FUSION_MAX_SIZE_MB: dict[int, dict[int, float]] = {
+    90: {
+        2: 64,  # 64MB
+        4: 2,  # 2MB
+        8: 0.5,  # 0.5MB
+    },
+    100: {
+        2: 64,  # 64MB
+        4: 32,  # 32MB
+        8: 1,  # 1MB
+    },
+}
+
+# Max size of the input tensor per world size per device capability
+# to use flashinfer one shot fused allreduce
+# OneShot max size is at most 64MB / world size (FlashInfer restriction)
+_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB: dict[int, dict[int, float]] = {
+    90: {
+        2: 32,  # 32MB
+        4: 2,  # 2MB
+        8: 0.5,  # 0.5MB
+    },
+    100: {
+        2: 32,  # 32MB
+        4: 4,  # 4MB
+        8: 1,  # 1MB
+    },
+}
+
+
 if flashinfer_comm is not None:
     _FI_WORKSPACE_TENSOR = None
-
     MiB = 1024 * 1024
-    # Max size of the input tensor per world size
-    # to use flashinfer fused allreduce
-    _FI_MAX_SIZES = {
-        2: 64 * MiB,  # 64MB
-        4: MiB,  # 1MB
-        6: MiB // 2,  # 512KB
-        8: MiB // 2,  # 512KB
-    }
-
-    try:
-        _FI_MAX_SIZES.update({
-            int(k): int(float(v) * MiB)
-            for k, v in
-            envs.VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB.items()
-        })
-    except Exception as e:
-        raise ValueError(
-            "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: "
-            + str(e)) from e
-
-    # opt for a more conservative default value
-    # when world size is not in _FI_MAX_SIZES
-    _DEFAULT_FI_MAX_SIZE = MiB // 2
 
     def call_trtllm_fused_allreduce_norm(
         allreduce_in: torch.Tensor,
@@ -431,23 +497,31 @@ if flashinfer_comm is not None:
         fp32_acc: bool,
         max_token_num: int,
         pattern_code: int,
-        fuse_rms_quant: bool,
-        norm_out: Optional[torch.Tensor] = None,
-        quant_out: Optional[torch.Tensor] = None,
-        scale_out: Optional[torch.Tensor] = None,
-        scale_factor: Optional[torch.Tensor] = None,
+        norm_out: torch.Tensor | None = None,
+        quant_out: torch.Tensor | None = None,
+        scale_out: torch.Tensor | None = None,
+        scale_factor: torch.Tensor | None = None,
     ) -> None:
         num_tokens, hidden_size = allreduce_in.shape
         element_size = allreduce_in.element_size()
         current_tensor_size = num_tokens * hidden_size * element_size
-        max_fusion_size = max_token_num * hidden_size * element_size
-        use_flashinfer = current_tensor_size <= min(
-            _FI_MAX_SIZES.get(world_size, _DEFAULT_FI_MAX_SIZE),
-            max_fusion_size,
-        )
-        if use_flashinfer:
-            assert (_FI_WORKSPACE_TENSOR is not None
-                    ), "Flashinfer must be enabled when using flashinfer"
+
+        if num_tokens <= max_token_num:
+            device_capability = current_platform.get_device_capability().to_int()
+            # Get one shot input size limit for the current world size
+            # for the current device capability
+            max_one_shot_size_mb = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get(
+                device_capability, {}
+            ).get(world_size, None)
+            # Use one shot if no max size for one shot is specified
+            use_oneshot = (
+                max_one_shot_size_mb is None
+                or current_tensor_size <= max_one_shot_size_mb * MiB
+            )
+
+            assert _FI_WORKSPACE_TENSOR is not None, (
+                "Flashinfer must be enabled when using flashinfer"
+            )
             if norm_out is None:
                 norm_out = allreduce_in
                 residual_out = residual
@@ -471,7 +545,7 @@ if flashinfer_comm is not None:
                 hidden_dim=allreduce_in.shape[-1],
                 workspace_ptrs=_FI_WORKSPACE_TENSOR,
                 launch_with_pdl=launch_with_pdl,
-                use_oneshot=True,
+                use_oneshot=use_oneshot,
                 trigger_completion_at_end=trigger_completion_at_end,
                 fp32_acc=fp32_acc,
                 pattern_code=pattern_code,
@@ -479,38 +553,38 @@ if flashinfer_comm is not None:
                 quant_out=quant_out,
                 scale_out=scale_out,
                 # in vllm we only support swizzled layout
-                layout_code=flashinfer_comm.QuantizationSFLayout.
-                SWIZZLED_128x4,
+                layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
                 scale_factor=scale_factor,
             )
         else:
             allreduce_out = tensor_model_parallel_all_reduce(allreduce_in)
-            if (scale_factor is not None and scale_out is None
-                    and fuse_rms_quant):
+            if scale_factor is not None and scale_out is None:
                 # Do fused rms norm static fp8 quant fused op
                 if norm_out is None:
                     torch.ops._C.fused_add_rms_norm_static_fp8_quant(
-                        quant_out, allreduce_out, residual, rms_gamma,
-                        scale_factor, rms_eps)
+                        quant_out,
+                        allreduce_out,
+                        residual,
+                        rms_gamma,
+                        scale_factor,
+                        rms_eps,
+                    )
                 else:
                     torch.ops._C.rms_norm_static_fp8_quant(
-                        quant_out, allreduce_out, rms_gamma, scale_factor,
-                        rms_eps)
+                        quant_out, allreduce_out, rms_gamma, scale_factor, rms_eps
+                    )
             else:
                 if norm_out is None:
-                    torch.ops._C.fused_add_rms_norm(allreduce_out, residual,
-                                                    rms_gamma, rms_eps)
+                    torch.ops._C.fused_add_rms_norm(
+                        allreduce_out, residual, rms_gamma, rms_eps
+                    )
                     norm_out = allreduce_out
                 else:
-                    torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma,
-                                          rms_eps)
-                if scale_factor is not None:
-                    if scale_out is not None:
-                        torch.ops._C.scaled_fp4_quant(quant_out, norm_out,
-                                                      scale_out, scale_factor)
-                    else:
-                        torch.ops._C.static_scaled_fp8_quant(
-                            quant_out, norm_out, scale_factor)
+                    torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, rms_eps)
+                if scale_factor is not None and scale_out is not None:
+                    torch.ops._C.scaled_fp4_quant(
+                        quant_out, norm_out, scale_out, scale_factor
+                    )
             if scale_factor is None or norm_out is not None:
                 # we need to return allreduce output
                 # in cases of non quant fused AR + RMS norm
@@ -518,22 +592,22 @@ if flashinfer_comm is not None:
                 allreduce_in.copy_(allreduce_out)
 
     def call_trtllm_fused_allreduce_norm_fake(
-            allreduce_in: torch.Tensor,
-            residual: torch.Tensor,
-            rms_gamma: torch.Tensor,
-            rms_eps: float,
-            world_rank: int,
-            world_size: int,
-            launch_with_pdl: bool,
-            trigger_completion_at_end: bool,
-            fp32_acc: bool,
-            max_token_num: int,
-            pattern_code: int,
-            fuse_rms_quant: bool,
-            norm_out: Optional[torch.Tensor] = None,
-            quant_out: Optional[torch.Tensor] = None,
-            scale_out: Optional[torch.Tensor] = None,
-            scale_factor: Optional[torch.Tensor] = None) -> None:
+        allreduce_in: torch.Tensor,
+        residual: torch.Tensor,
+        rms_gamma: torch.Tensor,
+        rms_eps: float,
+        world_rank: int,
+        world_size: int,
+        launch_with_pdl: bool,
+        trigger_completion_at_end: bool,
+        fp32_acc: bool,
+        max_token_num: int,
+        pattern_code: int,
+        norm_out: torch.Tensor | None = None,
+        quant_out: torch.Tensor | None = None,
+        scale_out: torch.Tensor | None = None,
+        scale_factor: torch.Tensor | None = None,
+    ) -> None:
         pass
 
     direct_register_custom_op(
@@ -549,7 +623,8 @@ if flashinfer_comm is not None:
         fake_impl=call_trtllm_fused_allreduce_norm_fake,
     )
     flashinfer_trtllm_fused_allreduce_norm = (
-        torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default)
+        torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default
+    )
 
 
 class FlashInferFusedAllReduceParams:
@@ -561,7 +636,6 @@ class FlashInferFusedAllReduceParams:
         world_size: int,
         use_fp32_lamport: bool = False,
         max_token_num: int = 1024,
-        fuse_rms_quant: bool = False,
     ):
         self.rank = rank
         self.world_size = world_size
@@ -569,9 +643,7 @@ class FlashInferFusedAllReduceParams:
         self.trigger_completion_at_end = True
         self.launch_with_pdl = True
         self.fp32_acc = True
-        self.use_oneshot = False
         self.max_token_num = max_token_num
-        self.fuse_rms_quant = fuse_rms_quant
 
     def get_trtllm_fused_allreduce_kwargs(self):
         return {
@@ -581,13 +653,12 @@ class FlashInferFusedAllReduceParams:
             "trigger_completion_at_end": self.trigger_completion_at_end,
             "fp32_acc": self.fp32_acc,
             "max_token_num": self.max_token_num,
-            "fuse_rms_quant": self.fuse_rms_quant,
         }
 
 
 class AllReduceRMSNormPattern(BasePattern):
     """
-    This pattern replaces the allreduce + rms norm (without residual) 
+    This pattern replaces the allreduce + rms norm (without residual)
     with fused flashinfer implementation.
     Applies to allreduce + rmsnorm before attn in the first Transformer block.
     """
@@ -602,34 +673,24 @@ class AllReduceRMSNormPattern(BasePattern):
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
 
     def get_inputs(self):
-        input = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype)
-        rms_result = torch.empty([1, 8, 4],
-                                 device=self.device,
-                                 dtype=self.dtype)
-        weight = torch.empty([4], device=self.device, dtype=self.dtype)
+        input, weight = self.rmsnorm_matcher.inputs()
 
-        return [input, rms_result, weight]
+        # input goes through allreduce first, always 16-bit
+        return [input.to(self.dtype), weight]
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(input: torch.Tensor, rms_result: torch.Tensor,
-                    weight: torch.Tensor):
+        def pattern(input: torch.Tensor, weight: torch.Tensor):
             allreduce_output = tensor_model_parallel_all_reduce(input)
-            rms = auto_functionalized(
-                RMS_OP,
-                result=rms_result,
-                input=allreduce_output,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-            # rms_result, allreduce_output
-            return rms[1], allreduce_output
+            rms = self.rmsnorm_matcher(allreduce_output, weight)
+
+            return rms, allreduce_output
 
-        def replacement(input: torch.Tensor, rms_result: torch.Tensor,
-                        weight: torch.Tensor):
+        def replacement(input: torch.Tensor, weight: torch.Tensor):
             residual = torch.zeros_like(input)
+            rms_result = torch.empty_like(input)
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
@@ -639,20 +700,20 @@ class AllReduceRMSNormPattern(BasePattern):
                 scale_out=None,
                 rms_gamma=weight,
                 rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.
-                kARResidualRMSNorm,
+                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
                 **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
             )
             # rms_result, allreduce_in
             return allreduce[3], allreduce[1]
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllReduceFusedAddRMSNormPattern(BasePattern):
     """
-    This pattern replaces the allreduce + rms norm (with residual) 
+    This pattern replaces the allreduce + rms norm (with residual)
     with fused flashinfer implementation.
     Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn.
     """
@@ -667,34 +728,23 @@ class AllReduceFusedAddRMSNormPattern(BasePattern):
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
+        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
     def get_inputs(self):
-        input = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        return [
-            residual,
-            input,
-            weight,
-        ]
+        input, residual, weight = self.rmsnorm_matcher.inputs()
 
-    def register(self, pm_pass: PatternMatcherPass):
+        # input goes through allreduce first, always 16-bit
+        return [residual, input.to(self.dtype), weight]
 
-        def pattern(residual: torch.Tensor, input: torch.Tensor,
-                    weight: torch.Tensor):
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor):
             allreduce_output = tensor_model_parallel_all_reduce(input)
-            rms = auto_functionalized(
-                RMS_ADD_OP,
-                input=allreduce_output,
-                residual=residual,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-            # input, residual
-            return rms[1], rms[2]
+            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
+            return rms, residual
 
-        def replacement(residual: torch.Tensor, input: torch.Tensor,
-                        weight: torch.Tensor):
+        def replacement(
+            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
+        ):
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
@@ -704,85 +754,86 @@ class AllReduceFusedAddRMSNormPattern(BasePattern):
                 scale_out=None,
                 rms_gamma=weight,
                 rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.
-                kARResidualRMSNorm,
+                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
                 **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
             )
             # allreduce_in, residual
             return allreduce[1], allreduce[2]
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
+
+        # Same pattern, but only return the output and not residual
+        # (helpful for end of graph where residual is not used again)
+        first_return_only = lambda fn: lambda a, b, c: fn(a, b, c)[0]
+
+        pm.register_replacement(
+            first_return_only(pattern),
+            first_return_only(replacement),
+            self.get_inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
 
 
 class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
     """
-    This pattern replaces the allreduce + rms norm (without residual) 
+    This pattern replaces the allreduce + rms norm (without residual)
     + static fp8 quant with fused flashinfer implementation.
-    Applies to allreduce + rmsnorm + quant before attn 
+    Applies to allreduce + rmsnorm + quant before attn
     in the first Transformer block.
     """
 
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 allreduce_params: FlashInferFusedAllReduceParams):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str,
+        allreduce_params: FlashInferFusedAllReduceParams,
+    ):
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
         self.quant_dtype = torch.float8_e4m3fn
+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
+        self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def get_inputs():
-            input = torch.zeros([1, 8, 4],
-                                device=self.device,
-                                dtype=self.dtype)
-            rmsnorm_result = torch.empty([1, 8, 4],
-                                         device=self.device,
-                                         dtype=self.dtype)
-            quant_result = torch.empty([1, 8, 4],
-                                       device=self.device,
-                                       dtype=self.quant_dtype)
-            weight = torch.empty([4], device=self.device, dtype=self.dtype)
-            scale = torch.tensor(1.0, device=self.device, dtype=torch.float32)
-            return [input, rmsnorm_result, quant_result, weight, scale]
+            input, weight = self.rmsnorm_matcher.inputs()
+            _, scale = self.quant_matcher.inputs()
+
+            # input goes through allreduce first, always 16-bit
+            return [input.to(self.dtype), weight, scale]
 
         def pattern(
             input: torch.Tensor,
-            rmsnorm_result: torch.Tensor,
-            quant_result: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
         ):
             all_reduce = tensor_model_parallel_all_reduce(input)
-            rmsnorm_out_tuple = auto_functionalized(RMS_OP,
-                                                    result=rmsnorm_result,
-                                                    input=all_reduce,
-                                                    weight=weight,
-                                                    epsilon=self.epsilon)
+            rms = self.rmsnorm_matcher(all_reduce, weight)
+            quant, _ = self.quant_matcher(rms, scale)
+            return quant, all_reduce
 
-            quant_out_tuple = auto_functionalized(STATIC_FP8_QUANT_OP,
-                                                  result=quant_result,
-                                                  input=rmsnorm_out_tuple[1],
-                                                  scale=scale)
-
-            # quant_out, allreduce_output
-            return quant_out_tuple[1], all_reduce
-
-        def replacement(input: torch.Tensor, result_rms: torch.Tensor,
-                        quant_result: torch.Tensor, weight: torch.Tensor,
-                        scale: torch.Tensor):
+        def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
             residual = torch.zeros_like(input)
+            result_rms = torch.empty_like(input)
+            result_quant = torch.empty_like(input, dtype=self.quant_dtype)
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
                 residual=residual,
                 norm_out=result_rms,
-                quant_out=quant_result,
+                quant_out=result_quant,
                 scale_out=None,
                 rms_gamma=weight,
                 rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.
-                kARResidualRMSNormFP8Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
+                ),
                 scale_factor=scale,
                 **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
             )
@@ -790,168 +841,146 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
             # quant_out, allreduce_output
             return allreduce[4], allreduce[1]
 
-        pm.register_replacement(pattern, replacement, get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
     """
     This pattern replaces the allreduce + rms norm (with residual)
     + static fp8 quant with fused flashinfer implementation.
-    Applies to o_proj + rmsnorm after attn + quant and 
+    Applies to o_proj + rmsnorm after attn + quant and
     mlp + rmsnorm + quant before attn.
     """
 
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 allreduce_params: FlashInferFusedAllReduceParams):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str,
+        allreduce_params: FlashInferFusedAllReduceParams,
+    ):
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
         self.quant_dtype = torch.float8_e4m3fn
 
-    def register(self, pm_pass: PatternMatcherPass):
+        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
+        self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
+    def register(self, pm_pass: PatternMatcherPass):
         def get_inputs():
-            input = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-
-            residual = torch.empty([4, 4],
-                                   device=self.device,
-                                   dtype=self.dtype)
-            weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-            quant_result = torch.empty([4, 4],
-                                       device=self.device,
-                                       dtype=self.quant_dtype)
-            scale = torch.empty([1, 1],
-                                device=self.device,
-                                dtype=torch.float32)
+            input, residual, weight = self.rmsnorm_matcher.inputs()
+            _, scale = self.quant_matcher.inputs()
 
-            return [
-                quant_result,
-                residual,
-                input,
-                weight,
-                scale,
-            ]
+            # input goes through allreduce first, always 16-bit
+            return [residual, input.to(self.dtype), weight, scale]
 
         def pattern(
-            quant_result: torch.Tensor,
             residual: torch.Tensor,
             input: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
         ):
             allreduce_output = tensor_model_parallel_all_reduce(input)
+            rms, res = self.rmsnorm_matcher(allreduce_output, weight, residual)
+            quant, _ = self.quant_matcher(rms, scale)
 
-            fused_add_rmsnorm_out_tuple = \
-            auto_functionalized(
-                RMS_ADD_OP,
-                input=allreduce_output,
-                residual=residual,
-                weight=weight,
-                epsilon=self.epsilon)
-            quant_out_tuple = auto_functionalized(
-                STATIC_FP8_QUANT_OP,
-                result=quant_result,
-                input=fused_add_rmsnorm_out_tuple[1],
-                scale=scale)
-
-            # quant_out, allreduce_output
-            return quant_out_tuple[1], fused_add_rmsnorm_out_tuple[2]
+            return quant, res
 
-        def replacement(quant_result: torch.Tensor, residual: torch.Tensor,
-                        input: torch.Tensor, weight: torch.Tensor,
-                        scale: torch.Tensor):
+        def replacement(
+            residual: torch.Tensor,
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            result_quant = torch.empty_like(input, dtype=self.quant_dtype)
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
                 residual=residual,
                 norm_out=None,
-                quant_out=quant_result,
+                quant_out=result_quant,
                 scale_out=None,
                 rms_gamma=weight,
                 rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.
-                kARResidualRMSNormFP8Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
+                ),
                 scale_factor=scale,
                 **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
             )
-            # # quant_out, rms_norm_residual
+            # quant_out, rms_norm_residual
             return allreduce[4], allreduce[2]
 
-        pm.register_replacement(pattern, replacement, get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
     """
-    This pattern replaces the allreduce + rms norm (without residual) 
+    This pattern replaces the allreduce + rms norm (without residual)
     + static nvfp4 quant with fused flashinfer implementation.
-    Applies to allreduce + rmsnorm + quant before attn 
+    Applies to allreduce + rmsnorm + quant before attn
     in the first Transformer block.
     """
 
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 allreduce_params: FlashInferFusedAllReduceParams):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str,
+        allreduce_params: FlashInferFusedAllReduceParams,
+    ):
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def get_inputs():
-            input = torch.empty([1, 16, 16],
-                                device=self.device,
-                                dtype=self.dtype)
-
-            rmsnorm_result = torch.empty([1, 16, 16],
-                                         device=self.device,
-                                         dtype=self.dtype)
-            quant_result = torch.empty((16, 8),
-                                       device=self.device,
-                                       dtype=torch.uint8)
-            input_global_scale = torch.empty([1, 1],
-                                             device=self.device,
-                                             dtype=torch.float32)
+            input = torch.empty([1, 16, 16], device=self.device, dtype=self.dtype)
+            quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
+            input_global_scale = torch.empty(
+                [1, 1], device=self.device, dtype=torch.float32
+            )
             weight = torch.empty([16], device=self.device, dtype=self.dtype)
-            output_scale = torch.empty([128, 4],
-                                       device=self.device,
-                                       dtype=torch.int32)
+            output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
 
-            return [
-                input, rmsnorm_result, quant_result, weight,
-                input_global_scale, output_scale
-            ]
+            return [input, quant_result, weight, input_global_scale, output_scale]
 
         def pattern(
             input: torch.Tensor,
-            rmsnorm_result: torch.Tensor,
             quant_result: torch.Tensor,
             weight: torch.Tensor,
             input_global_scale: torch.Tensor,
             output_scale: torch.Tensor,
         ):
             all_reduce = tensor_model_parallel_all_reduce(input)
-            rmsnorm_out_tuple = auto_functionalized(RMS_OP,
-                                                    result=rmsnorm_result,
-                                                    input=all_reduce,
-                                                    weight=weight,
-                                                    epsilon=self.epsilon)
-
+            rms = self.rmsnorm_matcher(all_reduce, weight)
             quant_out_tuple = auto_functionalized(
                 STATIC_FP4_QUANT_OP,
                 output=quant_result,
-                input=rmsnorm_out_tuple[1],
+                input=rms,
                 output_scale=output_scale,
-                input_scale=input_global_scale)
+                input_scale=input_global_scale,
+            )
 
             # quant_out, allreduce_output, output_scale
             return quant_out_tuple[1], all_reduce, quant_out_tuple[2]
 
-        def replacement(input: torch.Tensor, result_rms: torch.Tensor,
-                        quant_result: torch.Tensor, weight: torch.Tensor,
-                        input_global_scale: torch.Tensor,
-                        output_scale: torch.Tensor):
+        def replacement(
+            input: torch.Tensor,
+            quant_result: torch.Tensor,
+            weight: torch.Tensor,
+            input_global_scale: torch.Tensor,
+            output_scale: torch.Tensor,
+        ):
             residual = torch.zeros_like(input)
+            result_rms = torch.empty_like(input)
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
@@ -961,8 +990,10 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
                 scale_out=output_scale,
                 rms_gamma=weight,
                 rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.
-                kARResidualRMSNormFP4Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
+                ),
                 scale_factor=input_global_scale,
                 **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
             )
@@ -970,44 +1001,42 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
             # quant_out, allreduce_output, output_scale
             return allreduce[4], allreduce[1], allreduce[5]
 
-        pm.register_replacement(pattern, replacement, get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
     """
     This pattern replaces the allreduce + rms norm (with residual)
     + static nvfp4 quant with fused flashinfer implementation.
-    Applies to o_proj + rmsnorm after attn + quant and 
+    Applies to o_proj + rmsnorm after attn + quant and
     mlp + rmsnorm + quant before attn.
     """
 
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 allreduce_params: FlashInferFusedAllReduceParams):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str,
+        allreduce_params: FlashInferFusedAllReduceParams,
+    ):
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
+        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def get_inputs():
             input = torch.empty([16, 16], device=self.device, dtype=self.dtype)
 
-            residual = torch.empty([16, 16],
-                                   device=self.device,
-                                   dtype=self.dtype)
-            weight = torch.empty([16, 16],
-                                 device=self.device,
-                                 dtype=self.dtype)
-            quant_result = torch.empty((16, 8),
-                                       device=self.device,
-                                       dtype=torch.uint8)
-            input_global_scale = torch.empty([1, 1],
-                                             device=self.device,
-                                             dtype=torch.float32)
-            output_scale = torch.empty([128, 4],
-                                       device=self.device,
-                                       dtype=torch.int32)
+            residual = torch.empty([16, 16], device=self.device, dtype=self.dtype)
+            weight = torch.empty([16, 16], device=self.device, dtype=self.dtype)
+            quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
+            input_global_scale = torch.empty(
+                [1, 1], device=self.device, dtype=torch.float32
+            )
+            output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
 
             return [
                 quant_result,
@@ -1018,33 +1047,35 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
                 input_global_scale,
             ]
 
-        def pattern(quant_result: torch.Tensor, residual: torch.Tensor,
-                    input: torch.Tensor, output_scale: torch.Tensor,
-                    weight: torch.Tensor, input_global_scale: torch.Tensor):
+        def pattern(
+            quant_result: torch.Tensor,
+            residual: torch.Tensor,
+            input: torch.Tensor,
+            output_scale: torch.Tensor,
+            weight: torch.Tensor,
+            input_global_scale: torch.Tensor,
+        ):
             allreduce_output = tensor_model_parallel_all_reduce(input)
-
-            fused_add_rmsnorm_out_tuple = \
-            auto_functionalized(
-                RMS_ADD_OP,
-                input=allreduce_output,
-                residual=residual,
-                weight=weight,
-                epsilon=self.epsilon)
+            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
             quant_out_tuple = auto_functionalized(
                 STATIC_FP4_QUANT_OP,
                 output=quant_result,
-                input=fused_add_rmsnorm_out_tuple[1],
+                input=rms,
                 output_scale=output_scale,
-                input_scale=input_global_scale)
+                input_scale=input_global_scale,
+            )
 
             # quant_out, allreduce_output, output_scale
-            return quant_out_tuple[1], fused_add_rmsnorm_out_tuple[
-                2], quant_out_tuple[2]
+            return quant_out_tuple[1], residual, quant_out_tuple[2]
 
-        def replacement(quant_result: torch.Tensor, residual: torch.Tensor,
-                        input: torch.Tensor, output_scale: torch.Tensor,
-                        weight: torch.Tensor,
-                        input_global_scale: torch.Tensor):
+        def replacement(
+            quant_result: torch.Tensor,
+            residual: torch.Tensor,
+            input: torch.Tensor,
+            output_scale: torch.Tensor,
+            weight: torch.Tensor,
+            input_global_scale: torch.Tensor,
+        ):
             allreduce = auto_functionalized(
                 flashinfer_trtllm_fused_allreduce_norm,
                 allreduce_in=input,
@@ -1054,20 +1085,22 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
                 scale_out=output_scale,
                 rms_gamma=weight,
                 rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.
-                kARResidualRMSNormFP4Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
+                ),
                 scale_factor=input_global_scale,
                 **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
             )
             # quant_out, rms_norm_residual, output_scale
             return allreduce[4], allreduce[2], allreduce[5]
 
-        pm.register_replacement(pattern, replacement, get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class AllReduceFusionPass(VllmPatternMatcherPass):
-
     def __init__(self, config: VllmConfig):
         super().__init__(config)
         self.disabled = True
@@ -1075,7 +1108,8 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         if self.tp_size <= 1:
             return
         self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="all_reduce_fusion_pass")
+            pass_name="all_reduce_fusion_pass"
+        )
         if config.model_config is None:
             return
         self.hidden_dim = config.model_config.get_hidden_size()
@@ -1085,30 +1119,43 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         if flashinfer_comm is None:
             logger.warning(
                 "Flashinfer is not installed or comm module not found, "
-                "skipping allreduce fusion pass")
+                "skipping allreduce fusion pass"
+            )
             return
-        # Check if the world size is supported
-        if self.tp_size not in _FI_MAX_SIZES:
+        max_size = config.compilation_config.pass_config.flashinfer_max_size(
+            self.tp_size
+        )
+        if max_size is None:
+            # Flashinfer doesn't support current world size
             logger.warning(
-                "Flashinfer allreduce fusion is not "
-                "supported for world size %s",
+                "Flashinfer allreduce fusion is not supported for world size %s",
                 self.tp_size,
             )
             return
-        max_num_token = min(
-            _FI_MAX_SIZES.get(self.tp_size, _DEFAULT_FI_MAX_SIZE) //
-            (self.hidden_dim * self.tp_size * (4 if use_fp32_lamport else 2)),
-            config.compilation_config.pass_config.
-            fi_allreduce_fusion_max_token_num)
+        element_size = 4 if use_fp32_lamport else 2
+        self.max_token_num = max_size // (self.hidden_dim * element_size)
+        # take the min to save workspace size and we'll never use more
+        # than max_num_batched_tokens anyways
+        self.max_token_num = min(
+            self.max_token_num, config.scheduler_config.max_num_batched_tokens
+        )
+        logger.debug_once(
+            f"Flashinfer max size: {max_size // (1024 * 1024)} MB,"
+            "Maximal number of tokens used by "
+            f"Flashinfer Allreduce Fusion: {self.max_token_num}",
+            scope="global",
+        )
+
         self.ipc_handles, workspace_tensor = (
             flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
                 tp_rank=rank,
                 tp_size=self.tp_size,
-                max_token_num=max_num_token,
+                max_token_num=self.max_token_num,
                 hidden_dim=self.hidden_dim,
                 group=self.group,
                 use_fp32_lamport=use_fp32_lamport,
-            ))
+            )
+        )
 
         global _FI_WORKSPACE_TENSOR
         _FI_WORKSPACE_TENSOR = workspace_tensor
@@ -1116,10 +1163,8 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
             rank=rank,
             world_size=self.tp_size,
             use_fp32_lamport=use_fp32_lamport,
-            max_token_num=max_num_token,
-            # fuse rms norm static fp8 quant fused op
-            # in fallback path, when we don't use flashinfer
-            fuse_rms_quant=config.compilation_config.pass_config.enable_fusion)
+            max_token_num=self.max_token_num,
+        )
 
         self.register_patterns()
         self.dump_patterns(config, self.patterns)
@@ -1185,4 +1230,5 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
             return
         if flashinfer_comm is not None:
             flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(
-                self.ipc_handles, self.group)
+                self.ipc_handles, self.group
+            )
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index eeca14d1296f3c5c5af9d4e6a2c2af16c732d614..11cf0f85c17877f8dfb5d092bbdbcc42be95d745 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -4,8 +4,9 @@ import contextlib
 import copy
 import hashlib
 import os
+from collections.abc import Callable
 from contextlib import ExitStack
-from typing import Any, Callable, Optional
+from typing import Any, Literal
 from unittest.mock import patch
 
 import torch
@@ -15,23 +16,21 @@ import torch.fx as fx
 import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
-from vllm.utils import is_torch_equal_or_newer
-
-from .inductor_pass import pass_context
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 
 class CompilerInterface:
     """
     The interface for a compiler that can be used by vLLM.
     """
+
     # The name of the compiler, e.g. inductor.
     # This is a class-level attribute.
     name: str
 
-    def initialize_cache(self,
-                         cache_dir: str,
-                         disable_cache: bool = False,
-                         prefix: str = ""):
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
         """
         when the vLLM process uses `cache_dir` as the cache directory,
         the compiler should initialize itself with the cache directory,
@@ -64,9 +63,9 @@ class CompilerInterface:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         """
         Compile the graph with the given example inputs and compiler config,
         with a runtime shape. If the `runtime_shape` is None, it means
@@ -93,12 +92,14 @@ class CompilerInterface:
         """
         return None, None
 
-    def load(self,
-             handle: Any,
-             graph: fx.GraphModule,
-             example_inputs: list[Any],
-             graph_index: int,
-             runtime_shape: Optional[int] = None) -> Callable:
+    def load(
+        self,
+        handle: Any,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: int | None = None,
+    ) -> Callable:
         """
         Load the compiled function from the handle.
         Raises an error if the handle is invalid.
@@ -150,16 +151,35 @@ def get_inductor_factors() -> list[Any]:
     factors: list[Any] = []
     # summarize system state
     from torch._inductor.codecache import CacheBase
+
     system_factors = CacheBase.get_system()
     factors.append(system_factors)
 
     # summarize pytorch state
     from torch._inductor.codecache import torch_key
+
     torch_factors = torch_key()
     factors.append(torch_factors)
     return factors
 
 
+def is_compile_cache_enabled(
+    vllm_additional_inductor_config: dict[str, Any],
+) -> bool:
+    vllm_inductor_config_disable_cache = vllm_additional_inductor_config.get(
+        "force_disable_caches", False
+    )
+
+    # TODO(gmagogsfm): Replace torch._inductor.config.force_disable_caches
+    # with torch.compiler.config.force_disable_caches when minimum PyTorch
+    # version reaches 2.10
+    return (
+        not envs.VLLM_DISABLE_COMPILE_CACHE
+        and not torch._inductor.config.force_disable_caches
+        and not vllm_inductor_config_disable_cache
+    )
+
+
 class InductorStandaloneAdaptor(CompilerInterface):
     """
     The adaptor for the Inductor compiler.
@@ -169,18 +189,22 @@ class InductorStandaloneAdaptor(CompilerInterface):
 
     Use VLLM_USE_STANDALONE_COMPILE to toggle this on or off.
     """
+
     name = "inductor_standalone"
 
+    def __init__(self, save_format: Literal["binary", "unpacked"]):
+        self.save_format = save_format
+
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         factors = get_inductor_factors()
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()[:10]
+        hash_str = hashlib.md5(
+            str(factors).encode(), usedforsecurity=False
+        ).hexdigest()[:10]
         return hash_str
 
-    def initialize_cache(self,
-                         cache_dir: str,
-                         disable_cache: bool = False,
-                         prefix: str = ""):
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
         self.cache_dir = cache_dir
 
     def compile(
@@ -188,14 +212,15 @@ class InductorStandaloneAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
         set_inductor_config(current_config, runtime_shape)
+        set_functorch_config()
 
         if isinstance(runtime_shape, int):
             dynamic_shapes = "from_example_inputs"
@@ -203,34 +228,40 @@ class InductorStandaloneAdaptor(CompilerInterface):
             dynamic_shapes = "from_tracing_context"
 
         from torch._inductor import standalone_compile
-        with pass_context(runtime_shape):
-            compiled_graph = standalone_compile(
-                graph,
-                example_inputs,
-                dynamic_shapes=dynamic_shapes,
-                options={"config_patches": current_config})
+
+        compiled_graph = standalone_compile(
+            graph,
+            example_inputs,
+            dynamic_shapes=dynamic_shapes,
+            options={"config_patches": current_config},
+        )
 
         # Save the compiled artifact to disk in the specified path
         assert key is not None
         path = os.path.join(self.cache_dir, key)
-        if not envs.VLLM_DISABLE_COMPILE_CACHE:
-            compiled_graph.save(path=path, format="unpacked")
+
+        if is_compile_cache_enabled(compiler_config):
+            compiled_graph.save(path=path, format=self.save_format)
             compilation_counter.num_compiled_artifacts_saved += 1
         return compiled_graph, (key, path)
 
-    def load(self,
-             handle: Any,
-             graph: fx.GraphModule,
-             example_inputs: list[Any],
-             graph_index: int,
-             runtime_shape: Optional[int] = None) -> Callable:
+    def load(
+        self,
+        handle: Any,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: int | None = None,
+    ) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
         assert isinstance(handle[1], str)
         path = handle[1]
         inductor_compiled_graph = torch._inductor.CompiledArtifact.load(
-            path=path, format="unpacked")
+            path=path, format=self.save_format
+        )
         from torch._inductor.compile_fx import graph_returns_tuple
+
         returns_tuple = graph_returns_tuple(graph)
 
         def compiled_graph_wrapper(*args):
@@ -250,24 +281,25 @@ class InductorAdaptor(CompilerInterface):
     """
     The adaptor for the Inductor compiler, version 2.5, 2.6, 2.7.
     """
+
     name = "inductor"
 
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         factors = get_inductor_factors()
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()[:10]
+        hash_str = hashlib.md5(
+            str(factors).encode(), usedforsecurity=False
+        ).hexdigest()[:10]
         return hash_str
 
-    def initialize_cache(self,
-                         cache_dir: str,
-                         disable_cache: bool = False,
-                         prefix: str = ""):
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
         self.cache_dir = cache_dir
         self.prefix = prefix
-        self.base_cache_dir = cache_dir[:-len(prefix)] if prefix else cache_dir
+        self.base_cache_dir = cache_dir[: -len(prefix)] if prefix else cache_dir
         if disable_cache:
             return
-        # redirect the cache directory to a sub-directory
+        # redirect the cache directory to a subdirectory
         # set flags so that Inductor and Triton store their cache
         # in the cache_dir, then users only need to copy the cache_dir
         # to another machine to reuse the cache.
@@ -283,11 +315,12 @@ class InductorAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         from torch._inductor.compile_fx import compile_fx
+
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
@@ -297,6 +330,7 @@ class InductorAdaptor(CompilerInterface):
         current_config["fx_graph_remote_cache"] = False
 
         set_inductor_config(current_config, runtime_shape)
+        set_functorch_config()
 
         # inductor can inplace modify the graph, so we need to copy it
         # see https://github.com/pytorch/pytorch/issues/138980
@@ -308,8 +342,8 @@ class InductorAdaptor(CompilerInterface):
         # it to get the hash of the compiled graph directly.
 
         hash_str, file_path = None, None
-        from torch._inductor.codecache import (FxGraphCache,
-                                               compiled_fx_graph_hash)
+        from torch._inductor.codecache import FxGraphCache, compiled_fx_graph_hash
+
         if torch.__version__.startswith("2.5"):
             original_load = FxGraphCache.load
             original_load_name = "torch._inductor.codecache.FxGraphCache.load"
@@ -319,14 +353,18 @@ class InductorAdaptor(CompilerInterface):
                 nonlocal file_path
                 compiled_fn = inductor_compiled_graph.current_callable
                 file_path = compiled_fn.__code__.co_filename  # noqa
-                if not file_path.startswith(self.base_cache_dir):
+                if (
+                    not file_path.startswith(self.base_cache_dir)
+                    and compiled_fn.__closure__ is not None
+                ):
                     # hooked in the align_inputs_from_check_idxs function
                     # in torch/_inductor/utils.py
                     for cell in compiled_fn.__closure__:
                         if not callable(cell.cell_contents):
                             continue
                         if cell.cell_contents.__code__.co_filename.startswith(
-                                self.base_cache_dir):
+                            self.base_cache_dir
+                        ):
                             # this is the real file path compiled from Inductor
                             file_path = cell.cell_contents.__code__.co_filename
                             break
@@ -338,23 +376,24 @@ class InductorAdaptor(CompilerInterface):
             original_load_name = None
 
             def hijacked_compile_fx_inner(*args, **kwargs):
-                output = torch._inductor.compile_fx.compile_fx_inner(
-                    *args, **kwargs)
+                output = torch._inductor.compile_fx.compile_fx_inner(*args, **kwargs)
                 nonlocal hash_str
                 inductor_compiled_graph = output
                 if inductor_compiled_graph is not None:
                     nonlocal file_path
                     compiled_fn = inductor_compiled_graph.current_callable
                     file_path = compiled_fn.__code__.co_filename  # noqa
-                    if not file_path.startswith(self.base_cache_dir):
+                    if (
+                        not file_path.startswith(self.base_cache_dir)
+                        and compiled_fn.__closure__ is not None
+                    ):
                         # hooked in the align_inputs_from_check_idxs function
                         # in torch/_inductor/utils.py
                         for cell in compiled_fn.__closure__:
                             if not callable(cell.cell_contents):
                                 continue
                             code = cell.cell_contents.__code__
-                            if code.co_filename.startswith(
-                                    self.base_cache_dir):
+                            if code.co_filename.startswith(self.base_cache_dir):
                                 # this is the real file path
                                 # compiled from Inductor
                                 file_path = code.co_filename
@@ -387,29 +426,38 @@ class InductorAdaptor(CompilerInterface):
 
             # for hijacking the hash of the compiled graph
             stack.enter_context(
-                patch("torch._inductor.codecache.compiled_fx_graph_hash",
-                      hijack_compiled_fx_graph_hash))
+                patch(
+                    "torch._inductor.codecache.compiled_fx_graph_hash",
+                    hijack_compiled_fx_graph_hash,
+                )
+            )
 
             # for providing a dummy shape environment
             stack.enter_context(
-                patch("torch._inductor.codecache.FxGraphCache._get_shape_env",
-                      _get_shape_env))
+                patch(
+                    "torch._inductor.codecache.FxGraphCache._get_shape_env",
+                    _get_shape_env,
+                )
+            )
 
-            from torch._functorch._aot_autograd.autograd_cache import (
-                AOTAutogradCache)
+            from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
 
             # torch 2.8+ on main uses _get_shape_env in AOTAutogradCache
             if hasattr(AOTAutogradCache, "_get_shape_env"):
                 stack.enter_context(
                     patch(
                         "torch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_env",
-                        _get_shape_env))
+                        _get_shape_env,
+                    )
+                )
 
             # for forcing the graph to be cached
             stack.enter_context(
                 patch(
                     "torch._inductor.codecache.FxGraphCache._check_can_cache",
-                    _check_can_cache))
+                    _check_can_cache,
+                )
+            )
 
             # Dynamo metrics context, see method for more details.
             stack.enter_context(self.metrics_context())
@@ -422,28 +470,28 @@ class InductorAdaptor(CompilerInterface):
             # standalone_compile sometime.
             if is_torch_equal_or_newer("2.6"):
                 stack.enter_context(
-                    torch._inductor.config.patch(fx_graph_remote_cache=False))
+                    torch._inductor.config.patch(fx_graph_remote_cache=False)
+                )
                 # InductorAdaptor (unfortunately) requires AOTAutogradCache
                 # to be turned off to run. It will fail to acquire the hash_str
                 # and error if not.
                 # StandaloneInductorAdaptor (PyTorch 2.8+) fixes this problem.
                 stack.enter_context(
-                    torch._functorch.config.patch(enable_autograd_cache=False))
+                    torch._functorch.config.patch(enable_autograd_cache=False)
+                )
                 stack.enter_context(
-                    torch._functorch.config.patch(
-                        enable_remote_autograd_cache=False))
-
-            with pass_context(runtime_shape):
-                compiled_graph = compile_fx(
-                    graph,
-                    example_inputs,
-                    inner_compile=hijacked_compile_fx_inner,
-                    config_patches=current_config)
-
-        # We treat VLLM_DISABLE_COMPILE_CACHE as the overall switch for torch
-        # compilation cache. So turn off the checks if we disable the
-        # compilation cache.
-        if not envs.VLLM_DISABLE_COMPILE_CACHE:
+                    torch._functorch.config.patch(enable_remote_autograd_cache=False)
+                )
+
+            compiled_graph = compile_fx(
+                graph,
+                example_inputs,
+                inner_compile=hijacked_compile_fx_inner,
+                config_patches=current_config,
+            )
+
+        # Turn off the checks if we disable the compilation cache.
+        if is_compile_cache_enabled(compiler_config):
             if hash_str is None:
                 raise RuntimeError(
                     "vLLM failed to compile the model. The most "
@@ -451,52 +499,63 @@ class InductorAdaptor(CompilerInterface):
                     "failed, leading to a corrupted compilation artifact. "
                     "We recommend trying to "
                     "remove ~/.cache/vllm/torch_compile_cache and try again "
-                    "to see the real issue. ")
+                    "to see the real issue. "
+                )
             assert file_path is not None, (
-                "failed to get the file path of the compiled graph")
+                "failed to get the file path of the compiled graph"
+            )
         return compiled_graph, (hash_str, file_path)
 
-    def load(self,
-             handle: Any,
-             graph: fx.GraphModule,
-             example_inputs: list[Any],
-             graph_index: int,
-             runtime_shape: Optional[int] = None) -> Callable:
+    def load(
+        self,
+        handle: Any,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: int | None = None,
+    ) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
         assert isinstance(handle[1], str)
         hash_str = handle[0]
 
-        from torch._functorch._aot_autograd.autograd_cache import (
-            AOTAutogradCache)
+        from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
         from torch._inductor.codecache import FxGraphCache
+
         with ExitStack() as exit_stack:
             exit_stack.enter_context(
-                patch("torch._inductor.codecache.FxGraphCache._get_shape_env",
-                      lambda *args, **kwargs: AlwaysHitShapeEnv()))
+                patch(
+                    "torch._inductor.codecache.FxGraphCache._get_shape_env",
+                    lambda *args, **kwargs: AlwaysHitShapeEnv(),
+                )
+            )
             # torch 2.8+ on main uses _get_shape_env in AOTAutogradCache
             if hasattr(AOTAutogradCache, "_get_shape_env"):
                 exit_stack.enter_context(
                     patch(
                         "torch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_env",
-                        lambda *args, **kwargs: AlwaysHitShapeEnv()))
+                        lambda *args, **kwargs: AlwaysHitShapeEnv(),
+                    )
+                )
 
             # Dynamo metrics context, see method for more details.
             exit_stack.enter_context(self.metrics_context())
 
             if torch.__version__.startswith("2.5"):
                 inductor_compiled_graph = FxGraphCache._lookup_graph(
-                    hash_str, example_inputs, True, False)
+                    hash_str, example_inputs, True, False
+                )
                 assert inductor_compiled_graph is not None, (
                     "Inductor cache lookup failed. Please remove"
                     f"the cache directory and try again."  # noqa
                 )
             elif torch.__version__ >= "2.6":
-                from torch._inductor.output_code import (
-                    CompiledFxGraphConstantsWithGm)
+                from torch._inductor.output_code import CompiledFxGraphConstantsWithGm
+
                 constants = CompiledFxGraphConstantsWithGm(graph)
                 inductor_compiled_graph, _ = FxGraphCache._lookup_graph(
-                    hash_str, example_inputs, True, None, constants)
+                    hash_str, example_inputs, True, None, constants
+                )
                 assert inductor_compiled_graph is not None, (
                     "Inductor cache lookup failed. Please remove"
                     f"the cache directory and try again."  # noqa
@@ -509,6 +568,7 @@ class InductorAdaptor(CompilerInterface):
 
         # need to know if the graph returns a tuple
         from torch._inductor.compile_fx import graph_returns_tuple
+
         returns_tuple = graph_returns_tuple(graph)
 
         # this is the callable we return to Dynamo to run
@@ -534,7 +594,7 @@ class InductorAdaptor(CompilerInterface):
 
         Because it is re-entrant, we always set it (even if entering via Dynamo
         and the context was already entered). We might want to revisit if it
-        should be set at a different level of compilation.
+        should be set at a different mode of compilation.
 
         This is likely a bug in PyTorch: public APIs should not rely on
         manually setting up internal contexts. But we also rely on non-public
@@ -542,6 +602,7 @@ class InductorAdaptor(CompilerInterface):
         """
         if is_torch_equal_or_newer("2.6"):
             import torch._dynamo.utils
+
             return torch._dynamo.utils.get_metrics_context()
         else:
             return contextlib.nullcontext()
@@ -553,7 +614,12 @@ def set_inductor_config(config, runtime_shape):
         # can be beneficial
         config["max_autotune"] = envs.VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE
         config["coordinate_descent_tuning"] = (
-            envs.VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING)
+            envs.VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING
+        )
+
+
+def set_functorch_config():
+    torch._functorch.config.bundled_autograd_cache = False
 
 
 class EagerAdaptor(CompilerInterface):
@@ -564,9 +630,9 @@ class EagerAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_eager_compiles += 1
         # we don't need to compile the graph, just return the graph itself.
         # It does not support caching, return None for the handle.
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index e01dd3915a3a165b79d3a419dbec01a425b08154..20918099f169d8c6af0dfc1c018614dde7b93296 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -27,8 +27,8 @@ class CompilationCounter:
     num_cache_entries_updated: int = 0
     # The number of standalone_compile compiled artifacts saved
     num_compiled_artifacts_saved: int = 0
-    # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS
-    dynamo_as_is_count: int = 0
+    # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE
+    stock_torch_compile_count: int = 0
 
     def clone(self) -> "CompilationCounter":
         return copy.deepcopy(self)
@@ -41,7 +41,8 @@ class CompilationCounter:
             assert getattr(self, k) - getattr(old, k) == v, (
                 f"{k} not as expected, before it is {getattr(old, k)}"
                 f", after it is {getattr(self, k)}, "
-                f"expected diff is {v}")
+                f"expected diff is {v}"
+            )
 
 
 compilation_counter = CompilationCounter()
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index befb7736d75af37a37c9d28f4ecaa9abb8d29ac6..a2e0abfebc2c9cd523466b63ab65778a35796bd4 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
+from collections.abc import Callable
 from contextlib import ExitStack
-from typing import Any, Callable, Optional
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -12,12 +13,11 @@ import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.monitor import validate_cudagraph_capturing_enabled
 from vllm.config import CUDAGraphMode, VllmConfig
-from vllm.distributed.device_communicators.pynccl_allocator import (
-    set_graph_pool_id)
+from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import weak_ref_tensors
+from vllm.utils.torch_utils import weak_ref_tensors
 
 logger = init_logger(__name__)
 
@@ -25,12 +25,12 @@ logger = init_logger(__name__)
 @dataclasses.dataclass
 class CUDAGraphEntry:
     batch_descriptor: BatchDescriptor
-    cudagraph: Optional[torch.cuda.CUDAGraph] = None
-    output: Optional[Any] = None
+    cudagraph: torch.cuda.CUDAGraph | None = None
+    output: Any | None = None
 
     # for cudagraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
-    input_addresses: Optional[list[int]] = None
+    input_addresses: list[int] | None = None
 
 
 @dataclasses.dataclass
@@ -46,10 +46,10 @@ class CUDAGraphWrapper:
 
     The workflow of this wrapper in the cudagraph dispatching is as follows:
     1. At initialization, a runtime mode is assigned to the wrapper (FULL or
-    PIECEWISE). 
-    2. At runtime, the wrapper receives a runtime_mode and a 
+    PIECEWISE).
+    2. At runtime, the wrapper receives a runtime_mode and a
     batch_descriptor(key) from the forward context and blindly trust them
-    for cudagraph dispatching. 
+    for cudagraph dispatching.
     3. If runtime_mode is NONE or runtime_mode does not match the mode of the
     wrapper, just call the runnable directly.
     4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
@@ -58,18 +58,20 @@ class CUDAGraphWrapper:
 
     Note: CUDAGraphWrapper does not store persistent buffers or copy any
     runtime inputs into that buffers for replay. We assume implementing them
-    is done outside of the wrapper. That is because we do not make any 
+    is done outside of the wrapper. That is because we do not make any
     assumption on the dynamic shape (batch size) of the runtime inputs, as a
-    trade-off for staying orthogonal to compilation logic. Nevertheless, 
+    trade-off for staying orthogonal to compilation logic. Nevertheless,
     tracing and checking the input addresses to be consistent during replay is
     guaranteed when VLLM_LOGGING_LEVEL == "DEBUG".
     """
 
-    def __init__(self,
-                 runnable: Callable,
-                 vllm_config: VllmConfig,
-                 runtime_mode: CUDAGraphMode,
-                 cudagraph_options: Optional[CUDAGraphOptions] = None):
+    def __init__(
+        self,
+        runnable: Callable,
+        vllm_config: VllmConfig,
+        runtime_mode: CUDAGraphMode,
+        cudagraph_options: CUDAGraphOptions | None = None,
+    ):
         self.runnable = runnable
         self.vllm_config = vllm_config
         self.runtime_mode = runtime_mode
@@ -91,15 +93,16 @@ class CUDAGraphWrapper:
         self.cudagraph_options = cudagraph_options
         # the entries for different batch descriptors that we need to capture
         # cudagraphs for.
-        self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry]\
-                                                                        = {}
+        self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry] = {}
 
     def __getattr__(self, key: str):
         # allow accessing the attributes of the runnable.
         if hasattr(self.runnable, key):
             return getattr(self.runnable, key)
-        raise AttributeError(f"Attribute {key} not exists in the runnable of "
-                             f"cudagraph wrapper: {self.runnable}")
+        raise AttributeError(
+            f"Attribute {key} not exists in the runnable of "
+            f"cudagraph wrapper: {self.runnable}"
+        )
 
     def unwrap(self) -> Callable:
         # in case we need to access the original runnable.
@@ -110,8 +113,10 @@ class CUDAGraphWrapper:
         batch_descriptor = forward_context.batch_descriptor
         cudagraph_runtime_mode = forward_context.cudagraph_runtime_mode
 
-        if cudagraph_runtime_mode == CUDAGraphMode.NONE or \
-                            cudagraph_runtime_mode != self.runtime_mode:
+        if (
+            cudagraph_runtime_mode == CUDAGraphMode.NONE
+            or cudagraph_runtime_mode != self.runtime_mode
+        ):
             # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
             # running without cudagraphs.
             # We do not trigger capture/replay if the runtime mode is not
@@ -122,8 +127,9 @@ class CUDAGraphWrapper:
 
         if batch_descriptor not in self.concrete_cudagraph_entries:
             # create a new entry for this batch descriptor
-            self.concrete_cudagraph_entries[batch_descriptor] = \
-                CUDAGraphEntry(batch_descriptor=batch_descriptor)
+            self.concrete_cudagraph_entries[batch_descriptor] = CUDAGraphEntry(
+                batch_descriptor=batch_descriptor
+            )
 
         entry = self.concrete_cudagraph_entries[batch_descriptor]
 
@@ -133,8 +139,11 @@ class CUDAGraphWrapper:
                 # capturing is fast, we don't need to log it for every
                 # shape. E.g. we only log it for the first subgraph in
                 # piecewise mode.
-                logger.debug("Capturing a cudagraph on (%s,%s)",
-                             self.runtime_mode.name, entry.batch_descriptor)
+                logger.debug(
+                    "Capturing a cudagraph on (%s,%s)",
+                    self.runtime_mode.name,
+                    entry.batch_descriptor,
+                )
             # validate that cudagraph capturing is legal at this point.
             validate_cudagraph_capturing_enabled()
 
@@ -153,8 +162,7 @@ class CUDAGraphWrapper:
                     # therefore, we only run gc for the first graph,
                     # and disable gc for the rest of the graphs.
                     stack.enter_context(patch("gc.collect", lambda: None))
-                    stack.enter_context(
-                        patch("torch.cuda.empty_cache", lambda: None))
+                    stack.enter_context(patch("torch.cuda.empty_cache", lambda: None))
 
                 if self.graph_pool is not None:
                     set_graph_pool_id(self.graph_pool)
@@ -193,7 +201,8 @@ class CUDAGraphWrapper:
             assert new_input_addresses == entry.input_addresses, (
                 f"Input addresses for cudagraphs are different "
                 f"during replay. Expected {entry.input_addresses}, "
-                f"got {new_input_addresses}")
+                f"got {new_input_addresses}"
+            )
 
         entry.cudagraph.replay()
         return entry.output
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 6e9a36a2b0b99c34e6b28bcdf276cad9e122678c..11a18c0e6bb78a13bf9bf9e9d6a375fb90723398 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -2,8 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
+import hashlib
 import inspect
-from typing import Callable, Optional, TypeVar, Union, overload
+import os
+import sys
+from collections.abc import Callable
+from typing import TypeVar, overload
 from unittest.mock import patch
 
 import torch
@@ -11,12 +15,19 @@ import torch.nn as nn
 from packaging import version
 from torch._dynamo.symbolic_convert import InliningInstructionTranslator
 
+import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
-from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel, VllmConfig
+from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
+from vllm.config import (
+    CompilationMode,
+    VllmConfig,
+    get_current_vllm_config,
+    set_current_vllm_config,
+)
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
-from vllm.utils import resolve_obj_by_qualname, supports_dynamo
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.torch_utils import supports_dynamo
 
 from .monitor import start_monitoring_torch_compile
 
@@ -34,11 +45,11 @@ def ignore_torch_compile(cls: _T) -> _T:
     a support_torch_compile decorator, but we don't want to
     compile the class `cls` that inherits the parent class.
     This only ignores compiling the forward of the class the
-    decorator is applied to. 
+    decorator is applied to.
 
     If the parent has ignore_torch_compile but the child has
     support_torch_compile, the child will still be compiled.
-    
+
     If the class has one or more submodules
     that have support_torch_compile decorator applied, compile will
     not be ignored for those submodules.
@@ -57,30 +68,43 @@ def _should_ignore_torch_compile(cls) -> bool:
 @overload
 def support_torch_compile(
     *,
-    enable_if: Optional[Callable[[VllmConfig], bool]] = None,
-) -> Callable[[_T], _T]:
-    ...
+    enable_if: Callable[[VllmConfig], bool] | None = None,
+) -> Callable[[_T], _T]: ...
 
 
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]],
-) -> Callable[[_T], _T]:
-    ...
+    dynamic_arg_dims: dict[str, int | list[int]] | None,
+) -> Callable[[_T], _T]: ...
 
 
 @overload
-def support_torch_compile(cls: _T) -> _T:
-    ...
+def support_torch_compile(
+    *,
+    mark_unbacked_dims: dict[str, int | list[int]] | None,
+) -> Callable[[_T], _T]: ...
 
 
+@overload
 def support_torch_compile(
-    cls: Optional[_T] = None,
     *,
-    dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None,
-    enable_if: Optional[Callable[[VllmConfig], bool]] = None,
-) -> Union[Callable[[_T], _T], _T]:
+    dynamic_arg_dims: dict[str, int | list[int]] | None,
+    mark_unbacked_dims: dict[str, int | list[int]] | None,
+) -> Callable[[_T], _T]: ...
+
+
+@overload
+def support_torch_compile(cls: _T) -> _T: ...
+
+
+def support_torch_compile(
+    cls: _T | None = None,
+    *,
+    dynamic_arg_dims: dict[str, int | list[int]] | None = None,
+    mark_unbacked_dims: dict[str, int | list[int]] | None = None,
+    enable_if: Callable[[VllmConfig], bool] | None = None,
+) -> Callable[[_T], _T] | _T:
     """
     A decorator to add support for compiling the forward method of a class.
 
@@ -89,8 +113,7 @@ def support_torch_compile(
     ```python
     @support_torch_compile
     class MyModel(nn.Module):
-        def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]):
-            ...
+        def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
     ```
 
     Usage 2: use as a decorator with arguments:
@@ -98,8 +121,7 @@ def support_torch_compile(
     ```python
     @support_torch_compile(dynamic_arg_dims={"x": 0, "y": 0})
     class MyModel(nn.Module):
-        def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]):
-            ...
+        def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
     ```
 
     `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic
@@ -134,12 +156,17 @@ def support_torch_compile(
     returns a boolean value indicating whether to compile the model or not.
     This is useful if you want to compile the model only when certain
     conditions are met.
+
+    `mark_unbacked_dims` is a dictionary that maps argument names with a dynamic
+    dim to be decorated with `mark_unbacked`.  This is useful if we would like to
+    enforce that dynamo does not specialize on 0/1 values in the case of dummy input
+    such as for vision model compilation
     """
 
     def cls_decorator_helper(cls: _T) -> _T:
-        # helper to pass `dynamic_arg_dims`` to `_support_torch_compile``
-        # to avoid too much indentation for `_support_torch_compile``
-        if not hasattr(cls, 'forward'):
+        # helper to pass `dynamic_arg_dims` to `_support_torch_compile`
+        # to avoid too much indentation for `_support_torch_compile`
+        if not hasattr(cls, "forward"):
             raise TypeError("decorated class should have a forward method.")
         sig = inspect.signature(cls.forward)
         inferred_dynamic_arg_dims = dynamic_arg_dims
@@ -147,26 +174,33 @@ def support_torch_compile(
             inferred_dynamic_arg_dims = {}
             for k, v in sig.parameters.items():
                 if v.annotation in [
-                        torch.Tensor, Optional[torch.Tensor],
-                        IntermediateTensors, Optional[IntermediateTensors]
+                    torch.Tensor,
+                    torch.Tensor | None,
+                    IntermediateTensors,
+                    IntermediateTensors | None,
                 ]:
                     inferred_dynamic_arg_dims[k] = 0
 
-            logger.debug(("Inferred dynamic dimensions for "
-                          "forward method of %s: %s"), cls,
-                         list(inferred_dynamic_arg_dims.keys()))
+            logger.debug(
+                ("Inferred dynamic dimensions for forward method of %s: %s"),
+                cls,
+                list(inferred_dynamic_arg_dims.keys()),
+            )
 
         if len(inferred_dynamic_arg_dims) == 0:
             raise ValueError(
                 "No dynamic dimensions found in the forward method of "
-                f"{cls}. Please provide dynamic_arg_dims explicitly.")
+                f"{cls}. Please provide dynamic_arg_dims explicitly."
+            )
 
         for k in inferred_dynamic_arg_dims:
             if k not in sig.parameters:
                 raise ValueError(
-                    f"Argument {k} not found in the forward method of {cls}")
-        return _support_torch_compile(cls, inferred_dynamic_arg_dims,
-                                      enable_if)
+                    f"Argument {k} not found in the forward method of {cls}"
+                )
+        return _support_torch_compile(
+            cls, inferred_dynamic_arg_dims, mark_unbacked_dims, enable_if
+        )
 
     if cls is not None:
         # use `support_torch_compile` as a decorator without arguments
@@ -176,47 +210,123 @@ def support_torch_compile(
     return cls_decorator_helper
 
 
+def _model_hash_key(fn) -> str:
+    import vllm
+
+    sha256_hash = hashlib.sha256()
+    sha256_hash.update(vllm.__version__.encode())
+    sha256_hash.update(fn.__qualname__.encode())
+    sha256_hash.update(str(fn.__code__.co_firstlineno).encode())
+    return sha256_hash.hexdigest()
+
+
+def _verify_source_unchanged(source_info, vllm_config) -> None:
+    from .caching import _compute_code_hash, _compute_code_hash_with_content
+
+    file_contents = {}
+    for source in source_info.inlined_sources:
+        module = sys.modules[source.module]
+        file = inspect.getfile(module)
+        vllm_config.compilation_config.traced_files.add(file)
+        file_contents[file] = source.content
+    expected_checksum = _compute_code_hash_with_content(file_contents)
+    actual_checksum = _compute_code_hash(set(file_contents.keys()))
+    if expected_checksum != actual_checksum:
+        raise RuntimeError(
+            "Source code has changed since the last compilation. Recompiling the model."
+        )
+
+
 def _support_torch_compile(
     cls: _T,
-    dynamic_arg_dims: dict[str, Union[int, list[int]]],
-    enable_if: Optional[Callable[[VllmConfig], bool]] = None,
+    dynamic_arg_dims: dict[str, int | list[int]],
+    mark_unbacked_dims: dict[str, int | list[int]] | None = None,
+    enable_if: Callable[[VllmConfig], bool] | None = None,
 ) -> _T:
     """
     A decorator to add support for compiling the forward method of a class.
     """
-    if TorchCompileWrapperWithCustomDispatcher in cls.__bases__:
+    if TorchCompileWithNoGuardsWrapper in cls.__bases__:
         # support decorating multiple times
         return cls
 
     # take care of method resolution order
     # make sure super().__init__ is called on the base class
-    #  other than TorchCompileWrapperWithCustomDispatcher
-    cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, )
+    #  other than TorchCompileWithNoGuardsWrapper
+    cls.__bases__ = cls.__bases__ + (TorchCompileWithNoGuardsWrapper,)
 
     old_init = cls.__init__
 
     setattr(cls, IGNORE_COMPILE_KEY, False)
 
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
-        old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
+    def __init__(
+        self, *, vllm_config: VllmConfig | None = None, prefix: str = "", **kwargs
+    ):
+        if vllm_config is None:
+            vllm_config = get_current_vllm_config()
+
+        # NOTE: to support multimodal models (such as encoder),
+        # we may not have vllm_config so we may need to patch
+        # it
+        sig = inspect.signature(old_init)
+        if "vllm_config" in sig.parameters:
+            kwargs["vllm_config"] = vllm_config
+        if "prefix" in sig.parameters:
+            kwargs["prefix"] = prefix
+        old_init(self, **kwargs)
+
         self.vllm_config = vllm_config
         enable_compile = enable_if is None or enable_if(vllm_config)
-        # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
+        # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner
         # will handle the compilation, so we don't need to do anything here.
-        self.do_not_compile = \
-            vllm_config.compilation_config.level in [
-            CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
-        ] or not supports_dynamo() or _should_ignore_torch_compile(
-            self.__class__) or not enable_compile
+        self.do_not_compile = (
+            vllm_config.compilation_config.mode
+            in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
+            or not supports_dynamo()
+            or _should_ignore_torch_compile(self.__class__)
+            or not enable_compile
+        )
         if self.do_not_compile:
             return
 
         compilation_counter.num_models_seen += 1
-        TorchCompileWrapperWithCustomDispatcher.__init__(
-            self, compilation_level=vllm_config.compilation_config.level)
+        self.compiled = False
+        TorchCompileWithNoGuardsWrapper.__init__(self)
 
     cls.__init__ = __init__
 
+    def _mark_dynamic_inputs(mod, *args, **kwargs):
+        sig = inspect.signature(mod.__class__.forward)
+        bound_args = sig.bind(mod, *args, **kwargs)
+        bound_args.apply_defaults()
+        for k, dims in dynamic_arg_dims.items():
+            arg = bound_args.arguments.get(k)
+            if arg is not None:
+                dims = [dims] if isinstance(dims, int) else dims
+                if isinstance(arg, torch.Tensor):
+                    # In case dims is specified with negative indexing
+                    dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
+                    torch._dynamo.mark_dynamic(arg, dims)
+                elif isinstance(arg, IntermediateTensors):
+                    for tensor in arg.tensors.values():
+                        # In case dims is specified with negative indexing
+                        dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims]
+                        torch._dynamo.mark_dynamic(tensor, dims)
+                else:
+                    raise ValueError(
+                        "Unsupported dynamic dimensions"
+                        f" {dims} for argument {k} with type {type(arg)}."
+                    )
+        if mark_unbacked_dims:
+            for k, dims in mark_unbacked_dims.items():
+                arg = bound_args.arguments.get(k)
+                if arg is not None:
+                    dims = [dims] if isinstance(dims, int) else dims
+                    if isinstance(arg, torch.Tensor):
+                        # In case dims is specified with negative indexing
+                        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
+                        torch._dynamo.decorators.mark_unbacked(arg, dims)
+
     def __call__(self, *args, **kwargs):
         # torch.compiler.is_compiling() means we are inside the compilation
         # e.g. TPU has the compilation logic in model runner, so we don't
@@ -224,98 +334,144 @@ def _support_torch_compile(
         if self.do_not_compile or torch.compiler.is_compiling():
             return self.forward(*args, **kwargs)
 
-        # the first compilation needs to have dynamic shapes marked
-        if len(self.compiled_codes) < 1:
-            sig = inspect.signature(self.__class__.forward)
-            bound_args = sig.bind(self, *args, **kwargs)
-            bound_args.apply_defaults()
-            for k, dims in dynamic_arg_dims.items():
-                arg = bound_args.arguments.get(k)
-                if arg is not None:
-                    dims = [dims] if isinstance(dims, int) else dims
-                    if isinstance(arg, torch.Tensor):
-                        # In case dims is specified with negative indexing
-                        dims = [
-                            arg.ndim + dim if dim < 0 else dim for dim in dims
-                        ]
-                        torch._dynamo.mark_dynamic(arg, dims)
-                    elif isinstance(arg, IntermediateTensors):
-                        for tensor in arg.tensors.values():
-                            # In case dims is specified with negative indexing
-                            dims = [
-                                tensor.ndim + dim if dim < 0 else dim
-                                for dim in dims
-                            ]
-                            torch._dynamo.mark_dynamic(tensor, dims)
-                    else:
-                        raise ValueError(
-                            "Unsupported dynamic dimensions"
-                            f" {dims} for argument {k} with type {type(arg)}.")
-            # here, it is the starting point of the `torch.compile` process
-            start_monitoring_torch_compile(self.vllm_config)
-            logger.debug("Start compiling function %s",
-                         self.original_code_object)
-
-        # if we don't use custom dispatcher, we can directly call the
-        # compiled function and let torch.compile handle the dispatching,
-        # with the overhead of guard evaluation and recompilation.
-        if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
-            # it seems Dynamo reuse the compilation across instances,
-            # while we need to make sure the compiled code is not reused.
-            # we need to control all the compilation of the model.
-            torch._dynamo.eval_frame.remove_from_cache(
-                self.original_code_object)
-
-            # collect all relevant files traced by Dynamo,
-            # so that the compilation cache can trigger re-compilation
-            # properly when any of these files change.
-
-            # 1. the file containing the top-level forward function
-            self.vllm_config.compilation_config.traced_files.add(
-                self.original_code_object.co_filename)
-
-            # 2. every time Dynamo sees a function call, it will inline
-            # the function by calling InliningInstructionTranslator.inline_call
-            # we hijack this function to know all the functions called
-            # during Dynamo tracing, and their corresponding files
-            inline_call = InliningInstructionTranslator.inline_call
-
-            def patched_inline_call(parent, func, args, kwargs):
-                code = func.get_code()
-                self.vllm_config.compilation_config.traced_files.add(
-                    code.co_filename)
-                return inline_call(parent, func, args, kwargs)
-
-            # Disable the C++ compilation of symbolic shape guards. C++-fication
-            # of symbolic shape guards can improve guard overhead. But, since
-            # vllm skip guards anyways, setting this flag to False can improve
-            # compile time.
-            dynamo_config_patches = {}
+        # if aot_compiled_fn is set, just call it.
+        if getattr(self, "aot_compiled_fn", None) is not None:
+            return self.aot_compiled_fn(self, *args, **kwargs)
+
+        cache_dir = None
+        aot_compilation_path = None
+        if envs.VLLM_USE_AOT_COMPILE:
+            """
+            When using torch.compile in AOT mode, we store the cache artifacts
+            under VLLM_CACHE_ROOT/torch_aot_compile/{hash}/rank_i_j. The {hash}
+            contains all of the factors except for the source files being
+            traced through, because we don't actually know which source files
+            to check at this point (before dynamo runs).
+            On loading we will actually look at the source files being traced
+            through. If any source file have changed (compared with the
+            serialized backend artifacts), then we need to generate a new AOT
+            compile artifact from scratch.
+            """
+            from .caching import compilation_config_hash_factors
+
+            factors: list[str] = compilation_config_hash_factors(self.vllm_config)
+
+            factors.append(_model_hash_key(self.forward))
+            hash_key = hashlib.sha256(str(factors).encode()).hexdigest()
+
+            cache_dir = os.path.join(
+                envs.VLLM_CACHE_ROOT,
+                "torch_aot_compile",
+                hash_key,
+            )
+
+            rank = self.vllm_config.parallel_config.rank
+            dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+            cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}")
+            aot_compilation_path = os.path.join(cache_dir, "model")
             try:
-                _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards
-                dynamo_config_patches[
-                    "enable_cpp_symbolic_shape_guards"] = False
-            except AttributeError:
-                # Note: this config is not available in torch 2.6, we can skip
-                # if the config doesn't exist
-                logger.debug(
-                    "enable_cpp_symbolic_shape_guards config not available")
-
-            with patch.object(
-                    InliningInstructionTranslator, "inline_call",
-                    patched_inline_call), torch._dynamo.config.patch(
-                        **dynamo_config_patches
-                    ), maybe_use_cudagraph_partition_wrapper(
-                        self.vllm_config), _torch27_patch_tensor_subclasses():
-                output = self.compiled_callable(*args, **kwargs)
-            return output
-
-        # usually, capturing the model once is enough, and then we can
-        # dispatch to the compiled code directly, without going through
-        # the Dynamo guard mechanism.
-        with self.dispatch_to_code(0):
-            model_output = self.forward(*args, **kwargs)
-            return model_output
+                with (
+                    set_current_vllm_config(self.vllm_config),
+                    open(aot_compilation_path, "rb") as f,
+                ):
+                    start_monitoring_torch_compile(self.vllm_config)
+                    loaded_fn = torch.compiler.load_compiled_function(f)
+                _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config)
+                loaded_fn.disable_guard_check()
+                self.aot_compiled_fn = loaded_fn
+            except Exception as e:
+                if os.path.exists(aot_compilation_path):
+                    logger.warning(
+                        "Cannot load aot compilation from path %s, error: %s",
+                        aot_compilation_path,
+                        str(e),
+                    )
+                if envs.VLLM_FORCE_AOT_LOAD:
+                    raise e
+            if getattr(self, "aot_compiled_fn", None) is not None:
+                logger.info(
+                    "Directly load AOT compilation from path %s", aot_compilation_path
+                )
+                return self.aot_compiled_fn(self, *args, **kwargs)
+
+        if self.compiled:
+            assert not envs.VLLM_USE_AOT_COMPILE
+            return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
+
+        # This is the path for the first compilation.
+
+        # the first compilation needs to have dynamic shapes marked
+        _mark_dynamic_inputs(self, *args, **kwargs)
+
+        # here, it is the starting point of the `torch.compile` process
+        start_monitoring_torch_compile(self.vllm_config)
+        original_code_object = self.original_code_object()
+        logger.debug("Start compiling function %s", original_code_object)
+
+        # we do not want tp delete the original code object entries since
+        # we depend on them now to look up cached compiled functions.
+        # torch._dynamo.eval_frame.remove_from_cache(original_code_object)
+
+        # collect all relevant files traced by Dynamo,
+        # so that the compilation cache can trigger re-compilation
+        # properly when any of these files change.
+
+        # 1. the file containing the top-level forward function
+        self.vllm_config.compilation_config.traced_files.add(
+            original_code_object.co_filename
+        )
+
+        # 2. every time Dynamo sees a function call, it will inline
+        # the function by calling InliningInstructionTranslator.inline_call_
+        # we hijack this function to know all the functions called
+        # during Dynamo tracing, and their corresponding files
+        inline_call = InliningInstructionTranslator.inline_call_
+
+        def patched_inline_call(self_):
+            code = self_.f_code
+            self.vllm_config.compilation_config.traced_files.add(code.co_filename)
+            return inline_call(self_)
+
+        # Disable the C++ compilation of symbolic shape guards. C++-fication
+        # of symbolic shape guards can improve guard overhead. But, since
+        # vllm skip guards anyways, setting this flag to False can improve
+        # compile time.
+        dynamo_config_patches = {}
+        try:
+            _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards
+            dynamo_config_patches["enable_cpp_symbolic_shape_guards"] = False
+        except AttributeError:
+            # Note: this config is not available in torch 2.6, we can skip
+            # if the config doesn't exist
+            logger.debug("enable_cpp_symbolic_shape_guards config not available")
+
+        with (
+            patch.object(
+                InliningInstructionTranslator, "inline_call_", patched_inline_call
+            ),
+            torch._dynamo.config.patch(**dynamo_config_patches),
+            maybe_use_cudagraph_partition_wrapper(self.vllm_config),
+            _torch27_patch_tensor_subclasses(),
+        ):
+            if envs.VLLM_USE_AOT_COMPILE:
+                self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
+                output = self.aot_compiled_fn(self, *args, **kwargs)
+                assert aot_compilation_path is not None
+                assert cache_dir is not None
+                try:
+                    os.makedirs(cache_dir, exist_ok=True)
+                    self.aot_compiled_fn.save_compiled_function(aot_compilation_path)
+                except Exception as e:
+                    logger.warning(
+                        "Cannot save aot compilation to path %s, error: %s",
+                        aot_compilation_path,
+                        str(e),
+                    )
+            else:
+                output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
+
+        self.compiled = True
+        return output
 
     cls.__call__ = __call__
     return cls
@@ -327,7 +483,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
     Context manager to set/unset customized cudagraph partition wrappers.
 
     If we're using Inductor-based graph partitioning, we currently have the
-    whole `fx.Graph` before Inductor lowering and and the piecewise
+    whole `fx.Graph` before Inductor lowering and the piecewise
     splitting happens after all graph passes and fusions. Here, we add
     a custom hook for Inductor to wrap each partition with our static
     graph wrapper class to maintain more control over static graph
@@ -336,18 +492,20 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
     from vllm.config import CUDAGraphMode
 
     compilation_config = vllm_config.compilation_config
-    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and compilation_config.use_inductor_graph_partition):
+    if (
+        compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
+        and compilation_config.use_inductor_graph_partition
+    ):
         from torch._inductor.utils import CUDAGraphWrapperMetadata
 
         from vllm.compilation.cuda_graph import CUDAGraphOptions
         from vllm.platforms import current_platform
 
         static_graph_wrapper_class = resolve_obj_by_qualname(
-            current_platform.get_static_graph_wrapper_cls())
+            current_platform.get_static_graph_wrapper_cls()
+        )
 
-        def customized_cudagraph_wrapper(f,
-                                         metadata: CUDAGraphWrapperMetadata):
+        def customized_cudagraph_wrapper(f, metadata: CUDAGraphWrapperMetadata):
             partition_id = metadata.partition_index
             num_partitions = metadata.num_partitions
             return static_graph_wrapper_class(
@@ -358,15 +516,19 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
                     debug_log_enable=partition_id == 0,
                     gc_disable=partition_id != 0,
                     weak_ref_output=partition_id == num_partitions - 1,
-                ))
+                ),
+            )
 
         torch._inductor.utils.set_customized_partition_wrappers(
-            customized_cudagraph_wrapper)
+            customized_cudagraph_wrapper
+        )
 
     yield
 
-    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and compilation_config.use_inductor_graph_partition):
+    if (
+        compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
+        and compilation_config.use_inductor_graph_partition
+    ):
         torch._inductor.utils.set_customized_partition_wrappers(None)
 
 
@@ -378,23 +540,32 @@ def _torch27_patch_tensor_subclasses():
     `BasevLLMParameters` without having to replace them with regular tensors
     before `torch.compile`-time.
     """
-    from vllm.model_executor.parameter import (BasevLLMParameter,
-                                               ModelWeightParameter,
-                                               RowvLLMParameter,
-                                               _ColumnvLLMParameter)
+    from vllm.model_executor.parameter import (
+        BasevLLMParameter,
+        ModelWeightParameter,
+        RowvLLMParameter,
+        _ColumnvLLMParameter,
+    )
 
     def return_false(*args, **kwargs):
         return False
 
-    if version.parse("2.7") <= version.parse(
-            torch.__version__) < version.parse("2.8"):
+    if version.parse("2.7") <= version.parse(torch.__version__) < version.parse("2.8"):
         yield
         return
 
-    with (torch._dynamo.config.patch("traceable_tensor_subclasses", [
-            BasevLLMParameter, ModelWeightParameter, _ColumnvLLMParameter,
-            RowvLLMParameter
-    ]),
-          patch("torch._dynamo.variables.torch.can_dispatch_torch_function",
-                return_false)):
+    with (
+        torch._dynamo.config.patch(
+            "traceable_tensor_subclasses",
+            [
+                BasevLLMParameter,
+                ModelWeightParameter,
+                _ColumnvLLMParameter,
+                RowvLLMParameter,
+            ],
+        ),
+        patch(
+            "torch._dynamo.variables.torch.can_dispatch_torch_function", return_false
+        ),
+    ):
         yield
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 6ff388be8077da2d981d1d5556bea0fc10adcf81..736120c2b7cd18c2970685f680d3939c4fc635b9 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -3,7 +3,6 @@
 
 import operator
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
@@ -31,8 +30,9 @@ class FixFunctionalizationPass(VllmInductorPass):
         # XPU does not support auto-functionalization yet.
         # Will enable this when switch to vllm-xpu-kernels.
         if current_platform.is_xpu():
-            logger.debug("XPU platform does not support fix functionalization"
-                         "pass currently.")
+            logger.debug(
+                "XPU platform does not support fix functionalizationpass currently."
+            )
             return
 
         self.nodes_to_remove: list[torch.fx.Node] = []
@@ -45,65 +45,110 @@ class FixFunctionalizationPass(VllmInductorPass):
             at_target = node.args[0]
 
             if at_target == torch.ops._C.rotary_embedding.default:
-                query = kwargs['query']
-                mm_node = query.args[0].args[0]
-
-                # rotary_embedding is a special case: the two mutating inputs
-                # are query and key, which are slices of mm_node.
-                # While functionalized, results at[1] and at[2] are scattered
-                # back into mm_node. After de-functionalization, we can just
-                # use mm_node directly.
-                for idx, user in self.getitem_users(node).items():
-                    for user_of_getitem in user.users:
-                        if is_func(user_of_getitem,
-                                   torch.ops.aten.slice_scatter.default):
-                            user_of_getitem.replace_all_uses_with(mm_node)
-                            self._remove(user_of_getitem)
-                    self._remove(user)
-
-                self.insert_defunctionalized(graph, node)
-                self._remove(node)
+                query = kwargs["query"]
+                key = kwargs["key"]
+                getitem_nodes = self.getitem_users(node)
+
+                if (
+                    is_func(query, operator.getitem)
+                    and is_func(key, operator.getitem)
+                    and query.args[0] == key.args[0]
+                    and is_func(query.args[0], torch.ops.aten.split_with_sizes.default)
+                    and all(
+                        is_func(user, torch.ops.aten.slice_scatter.default)
+                        for getitem_node in getitem_nodes.values()
+                        for user in getitem_node.users
+                    )
+                ):
+                    # Pattern where query and key are slices of an mm_node.
+                    # While functionalized, results at [1] and [2] are scattered
+                    # back into mm_node. So after de-functionalization, we can
+                    # just use mm_node directly.
+
+                    mm_node = query.args[0].args[0]
+                    for user in getitem_nodes.values():
+                        for user_of_getitem in user.users:
+                            if is_func(
+                                user_of_getitem, torch.ops.aten.slice_scatter.default
+                            ):
+                                user_of_getitem.replace_all_uses_with(mm_node)
+                                self._remove(user_of_getitem)
+                        self._remove(user)
+
+                    self.insert_defunctionalized(graph, node)
+                    self._remove(node)
+
+                else:
+                    # Directly replace the auto_functionalize(rotary_embedding)
+                    # with the inplace rotary_embedding. In theory, we shouldn't
+                    # do this blindly, but in practice in vLLM it's ok. The best
+                    # solution is to use auto_functionalization_v2 and then use
+                    # inductor's builtin defunctionalization (reinplacing) pass.
+                    mutated_args = {1: "query", 2: "key"}
+                    self.defunctionalize(graph, node, mutated_args)
 
             # rms_norm replacements avoid the most copies for LLaMa.
             elif at_target == torch.ops._C.fused_add_rms_norm.default:
-                mutated_args = {1: 'input', 2: 'residual'}
+                mutated_args = {1: "input", 2: "residual"}
                 self.defunctionalize(graph, node, mutated_args)
             # elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default:  # noqa: E501
-            #     mutated_args = {1: 'result', 2: 'residual'}
+            #     mutated_args = {1: "result", 2: "residual"}
             #     self.defunctionalize(graph, node, mutated_args)
             elif at_target == torch.ops._C.rms_norm_dynamic_per_token_quant.default:  # noqa: E501
-                mutated_args = {1: 'result', 2: 'scale', 3: 'residual'}
+                mutated_args = {1: "result", 2: "scale", 3: "residual"}
                 self.defunctionalize(graph, node, mutated_args)
             elif at_target in [
-                    torch.ops._C.rms_norm.default,
-                    # torch.ops._C.rms_norm_static_fp8_quant.default,
+                torch.ops._C.rms_norm.default,
+                # torch.ops._C.rms_norm_static_fp8_quant.default,
             ]:
-                mutated_args = {1: 'result'}
+                mutated_args = {1: "result"}
                 self.defunctionalize(graph, node, mutated_args)
             # For some reason we need to specify the args for both
             # silu_and_mul and silu_and_mul_quant. The kwargs
             # pathway gets the wrong answer.
             elif at_target == torch.ops._C.silu_and_mul.default:
-                mutated_args = {1: 'result'}
-                self.defunctionalize(graph,
-                                     node,
-                                     mutated_args,
-                                     args=('result', 'input'))
+                mutated_args = {1: "result"}
+                self.defunctionalize(
+                    graph, node, mutated_args, args=("result", "input")
+                )
             # elif at_target == torch.ops._C.silu_and_mul_quant.default:
-            #     mutated_args = {1: 'result'}
-            #     self.defunctionalize(graph,
-            #                          node,
-            #                          mutated_args,
-            #                          args=('result', 'input', 'scale'))
-            # elif hasattr(
-            #         torch.ops._C, "silu_and_mul_nvfp4_quant"
-            # ) and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default:
-            #     mutated_args = {1: 'result', 2: 'result_block_scale'}
-            #     self.defunctionalize(graph,
-            #                          node,
-            #                          mutated_args,
-            #                          args=('result', 'result_block_scale',
-            #                                'input', 'input_global_scale'))
+            #     mutated_args = {1: "result"}
+            #     self.defunctionalize(
+            #         graph, node, mutated_args, args=("result", "input", "scale")
+            #     )
+            # elif (
+            #     hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")
+            #     and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default
+            # ):
+            #     mutated_args = {1: "result", 2: "result_block_scale"}
+            #     self.defunctionalize(
+            #         graph,
+            #         node,
+            #         mutated_args,
+            #         args=(
+            #             "result",
+            #             "result_block_scale",
+            #             "input",
+            #             "input_global_scale",
+            #         ),
+            #     )
+            # Defunctionalize fused_qk_norm_rope to remove higher-order wrapper.
+            elif at_target == torch.ops._C.fused_qk_norm_rope.default:
+                mutated_args = {1: "qkv"}
+                args = (
+                    "qkv",
+                    "num_heads_q",
+                    "num_heads_k",
+                    "num_heads_v",
+                    "head_dim",
+                    "eps",
+                    "q_weight",
+                    "k_weight",
+                    "cos_sin_cache",
+                    "is_neox",
+                    "position_ids",
+                )
+                self.defunctionalize(graph, node, mutated_args=mutated_args, args=args)
             else:
                 continue  # skip the count
 
@@ -116,12 +161,12 @@ class FixFunctionalizationPass(VllmInductorPass):
         for node in self.nodes_to_remove:
             graph.erase_node(node)
 
-        logger.debug("De-functionalized %s nodes, removed %s nodes", count,
-                     count_removed)
+        logger.debug(
+            "De-functionalized %s nodes, removed %s nodes", count, count_removed
+        )
         self.nodes_to_remove.clear()
 
-    def _remove(self, node_or_nodes: Union[torch.fx.Node,
-                                           Iterable[torch.fx.Node]]):
+    def _remove(self, node_or_nodes: torch.fx.Node | Iterable[torch.fx.Node]):
         """
         Stage a node (or nodes) for removal at the end of the pass.
         """
@@ -130,12 +175,13 @@ class FixFunctionalizationPass(VllmInductorPass):
         else:
             self.nodes_to_remove.extend(node_or_nodes)
 
-    def defunctionalize(self,
-                        graph: torch.fx.Graph,
-                        node: torch.fx.Node,
-                        mutated_args: dict[int, Union[torch.fx.Node, str]],
-                        args: Optional[tuple[Union[torch.fx.Node, str],
-                                             ...]] = None):
+    def defunctionalize(
+        self,
+        graph: torch.fx.Graph,
+        node: torch.fx.Node,
+        mutated_args: dict[int, torch.fx.Node | str],
+        args: tuple[torch.fx.Node | str, ...] | None = None,
+    ):
         """
         De-functionalize a node by replacing it with a call to the original.
         It also replaces the getitem users with the mutated arguments.
@@ -145,10 +191,9 @@ class FixFunctionalizationPass(VllmInductorPass):
         self.insert_defunctionalized(graph, node, args=args)
         self._remove(node)
 
-    def replace_users_with_mutated_args(self, node: torch.fx.Node,
-                                        mutated_args: dict[int,
-                                                           Union[torch.fx.Node,
-                                                                 str]]):
+    def replace_users_with_mutated_args(
+        self, node: torch.fx.Node, mutated_args: dict[int, torch.fx.Node | str]
+    ):
         """
         Replace all getitem users of the auto-functionalized node with the
         mutated arguments.
@@ -174,11 +219,12 @@ class FixFunctionalizationPass(VllmInductorPass):
                 users[idx] = user
         return users
 
-    def insert_defunctionalized(self,
-                                graph: torch.fx.Graph,
-                                node: torch.fx.Node,
-                                args: Optional[tuple[Union[torch.fx.Node, str],
-                                                     ...]] = None):
+    def insert_defunctionalized(
+        self,
+        graph: torch.fx.Graph,
+        node: torch.fx.Node,
+        args: tuple[torch.fx.Node | str, ...] | None = None,
+    ):
         """
         Insert a new defunctionalized node into the graph before node.
         If one of the kwargs is 'out', provide args directly,
@@ -190,8 +236,9 @@ class FixFunctionalizationPass(VllmInductorPass):
         :param args: If we cannot use kwargs, specify args directly.
         If an arg is a string, `node.kwargs[arg]` is used.
         """  # noqa: E501
-        assert is_func(node, auto_functionalized), \
+        assert is_func(node, auto_functionalized), (
             f"node must be auto-functionalized, is {node} instead"
+        )
 
         # Create a new call to the original function
         with graph.inserting_before(node):
@@ -200,6 +247,7 @@ class FixFunctionalizationPass(VllmInductorPass):
                 graph.call_function(function, kwargs=node.kwargs)
             else:
                 # Args passed as strings refer to items in node.kwargs
-                args = tuple(node.kwargs[arg] if isinstance(arg, str) else arg
-                             for arg in args)
+                args = tuple(
+                    node.kwargs[arg] if isinstance(arg, str) else arg for arg in args
+                )
                 graph.call_function(function, args=args)
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index aa51802398d863108e5b9d8468df9c601df0ff6a..87f613d2500d42ca1ff59a89c4be3568293fb154 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -9,14 +9,22 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch._ops import OpOverload
 
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, QuantKey, ScaleDesc, kFp8DynamicTensorSym, kFp8DynamicTokenSym,
-    kFp8StaticTensorSym, kNvfp4Quant, kStaticTensorScale)
+    GroupShape,
+    QuantKey,
+    ScaleDesc,
+    kFp8DynamicTensorSym,
+    kFp8DynamicTokenSym,
+    kFp8StaticTensorSym,
+    kNvfp4Quant,
+    kStaticTensorScale,
+)
 from vllm.platforms import current_platform
 
 from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
@@ -36,17 +44,17 @@ def empty_i32(*args, **kwargs):
     return torch.empty(*args, **kwargs, dtype=torch.int32, device="cuda")
 
 
+def empty_i64(*args, **kwargs):
+    return torch.empty(*args, **kwargs, dtype=torch.int64, device="cuda")
+
 
 RMS_OP = torch.ops._C.rms_norm.default
 RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
 
 QUANT_OPS: dict[QuantKey, OpOverload] = {
-    # kFp8StaticTensorSym:
-    # torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
-    # kFp8DynamicTensorSym:
-    # torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
-    # kFp8DynamicTokenSym:
-    # torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
 }
 if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
     QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default
@@ -58,146 +66,147 @@ class FusedRMSQuantKey(NamedTuple):
     quant: type of quantization
     fused_add: does the op also perform the residual add
     """
+
     quant: QuantKey
     fused_add: bool
 
     def __str__(self):
-        return (f"FusedQuantKey({self.quant}, with"
-                f"{'' if self.fused_add else 'out'} residual)")
+        return (
+            f"FusedQuantKey({self.quant}, with"
+            f"{'' if self.fused_add else 'out'} residual)"
+        )
 
 
 FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = {
-    # FusedRMSQuantKey(kFp8StaticTensorSym, False):
-    # torch.ops._C.rms_norm_static_fp8_quant.default,  # noqa: E501
-    # FusedRMSQuantKey(kFp8StaticTensorSym, True):
-    # torch.ops._C.fused_add_rms_norm_static_fp8_quant.default,  # noqa: E501
-    # FusedRMSQuantKey(kFp8DynamicTokenSym, False):
-    # torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
-    # FusedRMSQuantKey(kFp8DynamicTokenSym, True):
-    # torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
+    # FusedRMSQuantKey(
+    #     kFp8StaticTensorSym, False
+    # ): torch.ops._C.rms_norm_static_fp8_quant.default,  # noqa: E501
+    # FusedRMSQuantKey(
+    #     kFp8StaticTensorSym, True
+    # ): torch.ops._C.fused_add_rms_norm_static_fp8_quant.default,  # noqa: E501
+    # FusedRMSQuantKey(
+    #     kFp8DynamicTokenSym, False
+    # ): torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
+    # FusedRMSQuantKey(
+    #     kFp8DynamicTokenSym, True
+    # ): torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
 }
 
 
 class RMSNormQuantPattern:
-
     def __init__(self, epsilon: float, key: FusedRMSQuantKey):
         self.epsilon = epsilon
         self.quant_dtype = key.quant.dtype
+        config = get_current_vllm_config()
+        self.model_dtype = config.model_config.dtype if config.model_config else None
 
-        assert key.quant in QUANT_OPS, \
-            f"unsupported quantization scheme {key.quant}"
-        self.QUANT_OP = QUANT_OPS[key.quant]
-
-        assert key in FUSED_OPS, \
-            f"unsupported fused rmsnorm+quant op for {key}"
+        assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}"
         self.FUSED_OP = FUSED_OPS[key]
 
+        self.rmsnorm_matcher = (
+            MatcherRMSNorm(epsilon)
+            if not key.fused_add
+            else MatcherFusedAddRMSNorm(epsilon)
+        )
+        self.quant_matcher = MatcherQuantFP8(key.quant)
 
-class RMSNormStaticQuantPattern(RMSNormQuantPattern):
 
-    def __init__(self,
-                 epsilon: float,
-                 quant_dtype: torch.dtype,
-                 symmetric=True):
-        fused_key = FusedRMSQuantKey(fused_add=False,
-                                     quant=QuantKey(dtype=quant_dtype,
-                                                    scale=kStaticTensorScale,
-                                                    symmetric=symmetric))
+class RMSNormStaticQuantPattern(RMSNormQuantPattern):
+    def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True):
+        fused_key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(
+                dtype=quant_dtype, scale=kStaticTensorScale, symmetric=symmetric
+            ),
+        )
         super().__init__(epsilon, fused_key)
 
     def register(self, pm_pass: PatternMatcherPass):
         # Cannot use methods, as the self argument affects tracing
-        def pattern(result: torch.Tensor, result_rms: torch.Tensor,
-                    input: torch.Tensor, weight: torch.Tensor,
-                    scale: torch.Tensor):
-            at1 = auto_functionalized(RMS_OP,
-                                      result=result_rms,
-                                      input=input,
-                                      weight=weight,
-                                      epsilon=self.epsilon)
-            at2 = auto_functionalized(self.QUANT_OP,
-                                      result=result,
-                                      input=at1[1],
-                                      scale=scale)
-
-            # result
-            return at2[1]
-
-        def replacement(result: torch.Tensor, result_rms: torch.Tensor,
-                        input: torch.Tensor, weight: torch.Tensor,
-                        scale: torch.Tensor):
-            at = auto_functionalized(self.FUSED_OP,
-                                     result=result,
-                                     input=input,
-                                     weight=weight,
-                                     scale=scale,
-                                     epsilon=self.epsilon)
+        def pattern(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
+            result_rms = self.rmsnorm_matcher(input, weight)
+            return self.quant_matcher(result_rms, scale)[0]
+
+        def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=self.model_dtype)
+
+            result = torch.empty(
+                input.shape, device=input.device, dtype=self.quant_dtype
+            )
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                input=input,
+                weight=weight,
+                scale=scale,
+                epsilon=self.epsilon,
+            )
 
             # result
             return at[1]
 
         inputs = [
-            torch.empty(5, 4, device="cuda", dtype=self.quant_dtype),  # result
-            empty_bf16(5, 4),  # result_rms
-            empty_bf16(5, 4),  # input
-            empty_bf16(1, 5),  # weight
-            empty_fp32(1, 1)  # scale
+            # input, weight
+            *self.rmsnorm_matcher.inputs(),
+            self.quant_matcher.inputs()[1],  # scale
         ]
+        pattern(*inputs)
 
-        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only,
-                                pm_pass)
+        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
 
 
 class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern):
-
-    def __init__(self,
-                 epsilon: float,
-                 quant_dtype: torch.dtype,
-                 symmetric=True):
-        key = FusedRMSQuantKey(fused_add=True,
-                               quant=QuantKey(dtype=quant_dtype,
-                                              scale=kStaticTensorScale,
-                                              symmetric=symmetric))
+    def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True):
+        key = FusedRMSQuantKey(
+            fused_add=True,
+            quant=QuantKey(
+                dtype=quant_dtype, scale=kStaticTensorScale, symmetric=symmetric
+            ),
+        )
         super().__init__(epsilon, key)
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(result: torch.Tensor, input: torch.Tensor,
-                    residual: torch.Tensor, weight: torch.Tensor,
-                    scale: torch.Tensor):
-            at = auto_functionalized(RMS_ADD_OP,
-                                     input=input,
-                                     residual=residual,
-                                     weight=weight,
-                                     epsilon=self.epsilon)
-            at1 = auto_functionalized(self.QUANT_OP,
-                                      result=result,
-                                      input=at[1],
-                                      scale=scale)
-
-            # result, residual
-            return at1[1], at[2]
-
-        def replacement(result: torch.Tensor, input: torch.Tensor,
-                        residual: torch.Tensor, weight: torch.Tensor,
-                        scale: torch.Tensor):
-            at = auto_functionalized(self.FUSED_OP,
-                                     result=result,
-                                     input=input,
-                                     residual=residual,
-                                     weight=weight,
-                                     scale=scale,
-                                     epsilon=self.epsilon)
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            residual: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result, _ = self.quant_matcher(result_rms, scale)
+
+            return result, residual
+
+        def replacement(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            residual: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=self.model_dtype)
+
+            result = torch.empty_like(input, dtype=self.quant_dtype)
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                input=input,
+                residual=residual,
+                weight=weight,
+                scale=scale,
+                epsilon=self.epsilon,
+            )
 
             # result, residual
             return at[1], at[2]
 
         inputs = [
-            torch.empty(5, 4, device="cuda", dtype=self.quant_dtype),  # result
-            empty_bf16(5, 4),  # input
-            empty_bf16(5, 4),  # residual
-            empty_bf16(1, 5),  # weight
-            empty_fp32(1, 1)  # scale
+            # input, weight, residual
+            *self.rmsnorm_matcher.inputs(),
+            self.quant_matcher.inputs()[1],  # scale
         ]
 
         pm.register_replacement(
@@ -210,130 +219,105 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern):
 
 
 class RMSNormDynamicQuantPattern(RMSNormQuantPattern):
-
-    def __init__(self,
-                 epsilon: float,
-                 quant_dtype: torch.dtype,
-                 group_shape: GroupShape = GroupShape.PER_TOKEN,
-                 symmetric=True):
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape = GroupShape.PER_TOKEN,
+        symmetric=True,
+    ):
         scale = ScaleDesc(torch.float32, False, group_shape)
-        key = FusedRMSQuantKey(fused_add=False,
-                               quant=QuantKey(dtype=quant_dtype,
-                                              scale=scale,
-                                              symmetric=symmetric))
+        key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
         super().__init__(epsilon, key)
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(result: torch.Tensor, result_rms: torch.Tensor,
-                    input: torch.Tensor, weight: torch.Tensor,
-                    scale: torch.Tensor):
-            at1 = auto_functionalized(RMS_OP,
-                                      result=result_rms,
-                                      input=input,
-                                      weight=weight,
-                                      epsilon=self.epsilon)
-            at2 = auto_functionalized(self.QUANT_OP,
-                                      result=result,
-                                      input=at1[1],
-                                      scale=scale,
-                                      scale_ub=None)
-
+        def pattern(input: torch.Tensor, weight: torch.Tensor):
+            result_rms = self.rmsnorm_matcher(input, weight)
             # result, scale
-            return at2[1], at2[2]
-
-        def replacement(result: torch.Tensor, result_rms: torch.Tensor,
-                        input: torch.Tensor, weight: torch.Tensor,
-                        scale: torch.Tensor):
-            at = auto_functionalized(self.FUSED_OP,
-                                     result=result,
-                                     input=input,
-                                     weight=weight,
-                                     scale=scale,
-                                     epsilon=self.epsilon,
-                                     scale_ub=None,
-                                     residual=None)
+            return self.quant_matcher(result_rms)
+
+        def replacement(input: torch.Tensor, weight: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=self.model_dtype)
+
+            result = torch.empty_like(input, dtype=self.quant_dtype)
+            scale = self.quant_matcher.make_scale(input)
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                input=input,
+                weight=weight,
+                scale=scale,
+                epsilon=self.epsilon,
+                scale_ub=None,
+                residual=None,
+            )
 
             # result, scale
             return at[1], at[2]
 
-        inputs = [
-            torch.empty(5, 4, device="cuda", dtype=self.quant_dtype),  # result
-            empty_bf16(5, 4),  # result_rms
-            empty_bf16(5, 4),  # input
-            empty_bf16(1, 5),  # weight
-            empty_fp32(1, 1)  # scale
-        ]
-
         pm.register_replacement(
             pattern,
             replacement,
-            inputs,
+            self.rmsnorm_matcher.inputs(),
             pm.fwd_only,
             pm_pass,
         )
 
 
 class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern):
-
-    def __init__(self,
-                 epsilon: float,
-                 quant_dtype: torch.dtype,
-                 group_shape: GroupShape = GroupShape.PER_TOKEN,
-                 symmetric=True):
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape = GroupShape.PER_TOKEN,
+        symmetric=True,
+    ):
         scale = ScaleDesc(torch.float32, False, group_shape)
-        key = FusedRMSQuantKey(fused_add=True,
-                               quant=QuantKey(dtype=quant_dtype,
-                                              scale=scale,
-                                              symmetric=symmetric))
+        key = FusedRMSQuantKey(
+            fused_add=True,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
         super().__init__(epsilon, key)
 
     def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(result: torch.Tensor, input: torch.Tensor,
-                    residual: torch.Tensor, weight: torch.Tensor,
-                    scale: torch.Tensor):
-            at = auto_functionalized(RMS_ADD_OP,
-                                     input=input,
-                                     residual=residual,
-                                     weight=weight,
-                                     epsilon=self.epsilon)
-            at1 = auto_functionalized(self.QUANT_OP,
-                                      result=result,
-                                      input=at[1],
-                                      scale=scale,
-                                      scale_ub=None)
-
-            # result, residual, scale
-            return at1[1], at[2], at1[2]
-
-        def replacement(result: torch.Tensor, input: torch.Tensor,
-                        residual: torch.Tensor, weight: torch.Tensor,
-                        scale: torch.Tensor):
-            at = auto_functionalized(self.FUSED_OP,
-                                     result=result,
-                                     input=input,
-                                     weight=weight,
-                                     scale=scale,
-                                     epsilon=self.epsilon,
-                                     scale_ub=None,
-                                     residual=residual)
+        def pattern(input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor):
+            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result, scale = self.quant_matcher(result_rms)
+
+            return result, residual, scale
+
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
+        ):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=self.model_dtype)
+
+            result = torch.empty_like(input, dtype=self.quant_dtype)
+            scale = self.quant_matcher.make_scale(input)
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                input=input,
+                weight=weight,
+                scale=scale,
+                epsilon=self.epsilon,
+                scale_ub=None,
+                residual=residual,
+            )
 
             # result, residual, scale
             return at[1], at[3], at[2]
 
-        inputs = [
-            torch.empty(5, 4, device="cuda", dtype=self.quant_dtype),  # result
-            empty_bf16(5, 4),  # input
-            empty_bf16(5, 4),  # residual
-            empty_bf16(1, 5),  # weight
-            empty_fp32(1, 1)  # scale
-        ]
-
         pm.register_replacement(
             pattern,
             replacement,
-            inputs,
+            self.rmsnorm_matcher.inputs(),
             pm.fwd_only,
             pm_pass,
         )
@@ -350,24 +334,27 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
         super().__init__(config)
 
         self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="rmsnorm_quant_fusion_pass")
+            pass_name="rmsnorm_quant_fusion_pass"
+        )
 
+        # Make sure fused add patterns are before simple rms norm,
+        # as the latter is a subset of the former in torch ops
         for epsilon in [1e-5, 1e-6]:
-            # Fuse rms_norm + static fp8 quant
-            RMSNormStaticQuantPattern(epsilon,
-                                      FP8_DTYPE).register(self.patterns)
-
             # Fuse fused_add_rms_norm + static fp8 quant
             FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
-                self.patterns)
+                self.patterns
+            )
 
-            # Fuse rms_norm + dynamic per-token fp8 quant
-            RMSNormDynamicQuantPattern(epsilon,
-                                       FP8_DTYPE).register(self.patterns)
+            # Fuse rms_norm + static fp8 quant
+            RMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(self.patterns)
 
             # Fuse fused_add_rms_norm + dynamic per-token fp8 quant
             FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(
-                self.patterns)
+                self.patterns
+            )
+
+            # Fuse rms_norm + dynamic per-token fp8 quant
+            RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(self.patterns)
 
         self.dump_patterns(config, self.patterns)
 
@@ -377,8 +364,11 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
         logger.debug("Replaced %s patterns", self.matched_count)
 
     def uuid(self) -> Any:
-        return self.hash_source(self, RMSNormQuantPattern,
-                                RMSNormStaticQuantPattern,
-                                RMSNormDynamicQuantPattern,
-                                FusedAddRMSNormStaticQuantPattern,
-                                FusedAddRMSNormDynamicQuantPattern)
+        return self.hash_source(
+            self,
+            RMSNormQuantPattern,
+            RMSNormStaticQuantPattern,
+            RMSNormDynamicQuantPattern,
+            FusedAddRMSNormStaticQuantPattern,
+            FusedAddRMSNormDynamicQuantPattern,
+        )
diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index 2c6cf8f12fdc1f77c4ba4af7ce7f6298e370049b..4f44faece75e5d4a674a3a3fcd29554b6cf40a5b 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -2,9 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 
 import torch
 import torch._inductor.pattern_matcher as pm
+from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
@@ -12,12 +14,17 @@ from vllm.attention import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kNvfp4Quant, kStaticTensorScale)
+    QuantKey,
+    kNvfp4Quant,
+    kStaticTensorScale,
+)
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32
+from .fx_utils import is_func
 from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherQuantFP8
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
@@ -49,31 +56,49 @@ class AttentionQuantPattern(ABC):
         self.quant_dtype = quant_key.dtype
         self.dtype = dtype
 
-        assert self.quant_key in QUANT_OPS, \
+        assert self.quant_key in QUANT_OPS, (
             f"unsupported quantization scheme {self.quant_key}"
+        )
         self.QUANT_OP = QUANT_OPS[self.quant_key]
 
     def empty(self, *args, **kwargs):
-        kwargs = {'dtype': self.dtype, 'device': "cuda", **kwargs}
+        kwargs = {"dtype": self.dtype, "device": "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
 
     def empty_quant(self, *args, **kwargs):
-        kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs}
+        kwargs = {"dtype": self.quant_dtype, "device": "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
 
     @staticmethod
-    def wrap_trace_fn(process_fx, trace_fn):
-
+    def wrap_trace_fn(trace_fn, *process_fx_fns: Callable[[fx.GraphModule], None]):
         def wrapped(*args, **kwargs):
-            return process_fx(trace_fn(*args, **kwargs))
+            gm = trace_fn(*args, **kwargs)
+            for process_fx in process_fx_fns:
+                process_fx(gm)
+
+            return gm
 
         return wrapped
 
     @staticmethod
     def fx_view_to_reshape(gm: torch.fx.GraphModule):
         from torch._inductor.fx_passes.post_grad import view_to_reshape
+
         view_to_reshape(gm)
-        return gm
+
+    @staticmethod
+    def remove_noop_permutes(gm: torch.fx.GraphModule):
+        for node in gm.graph.nodes:
+            if not is_func(node, torch.ops.aten.permute.default):
+                continue
+
+            dims = node.args[1]
+            if any(dim != i for i, dim in enumerate(dims)):
+                continue
+
+            # this is now an identity op, remove
+            node.replace_all_uses_with(node.args[0])
+            gm.graph.erase_node(node)
 
     def register_if_supported(self, pm_pass: PatternMatcherPass):
         if self.layer.impl.fused_output_quant_supported(self.quant_key):
@@ -100,70 +125,81 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
         dtype: torch.dtype,
         symmetric: bool = True,
     ):
-        quant_key = QuantKey(dtype=FP8_DTYPE,
-                             scale=kStaticTensorScale,
-                             symmetric=symmetric)
+        quant_key = QuantKey(
+            dtype=FP8_DTYPE, scale=kStaticTensorScale, symmetric=symmetric
+        )
         super().__init__(layer, quant_key, dtype)
+        self.quant_matcher = MatcherQuantFP8(quant_key)
 
     def _register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    output_attn: torch.Tensor, output_quant: torch.Tensor,
-                    scale: torch.Tensor):
-            at1 = auto_functionalized(ATTN_OP,
-                                      query=q,
-                                      key=k,
-                                      value=v,
-                                      output=output_attn,
-                                      layer_name=self.layer_name,
-                                      output_scale=None,
-                                      output_block_scale=None)
+        def pattern(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            output_attn: torch.Tensor,
+            scale: torch.Tensor,
+        ):
+            at1 = auto_functionalized(
+                ATTN_OP,
+                query=q,
+                key=k,
+                value=v,
+                output=output_attn,
+                layer_name=self.layer_name,
+                output_scale=None,
+                output_block_scale=None,
+            )
             attn_out_view = RESHAPE_OP(
-                at1[1], [q.shape[0], self.num_heads * self.head_size])
-            at2 = auto_functionalized(self.QUANT_OP,
-                                      result=output_quant,
-                                      input=attn_out_view,
-                                      scale=scale)
-            return at2[1]
-
-        def replacement(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                        output_attn: torch.Tensor, output_quant: torch.Tensor,
-                        scale: torch.Tensor):
+                at1[1], [q.shape[0], self.num_heads * self.head_size]
+            )
+
+            return self.quant_matcher(attn_out_view, scale)[0]
+
+        def replacement(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            output_attn: torch.Tensor,
+            scale: torch.Tensor,
+        ):
             # attn output in quant_dtype
             output_attn = torch.ops.aten.full.default(
                 [q.shape[0], self.num_heads, self.head_size],
                 0.0,
                 dtype=self.quant_dtype,
-                device=q.device)
-            at1 = auto_functionalized(ATTN_OP,
-                                      query=q,
-                                      key=k,
-                                      value=v,
-                                      output=output_attn,
-                                      layer_name=self.layer_name,
-                                      output_scale=scale,
-                                      output_block_scale=None)
+                device=q.device,
+            )
+            at1 = auto_functionalized(
+                ATTN_OP,
+                query=q,
+                key=k,
+                value=v,
+                output=output_attn,
+                layer_name=self.layer_name,
+                output_scale=scale,
+                output_block_scale=None,
+            )
             return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])
 
         inputs = [
-            self.empty(5, self.num_heads, self.head_size,
-                       dtype=self.dtype),  # q
-            self.empty(5, self.num_heads, self.head_size,
-                       dtype=self.dtype),  # k
-            self.empty(5, self.num_heads, self.head_size,
-                       dtype=self.dtype),  # v
-            self.empty(5, self.num_heads, self.head_size,
-                       dtype=self.dtype),  # attn_output
-            self.empty_quant(5,
-                             self.num_heads * self.head_size),  # quant_output
-            empty_fp32(1, 1)  # scale
+            self.empty(5, self.num_heads, self.head_size),  # q
+            self.empty(5, self.num_heads, self.head_size),  # k
+            self.empty(5, self.num_heads, self.head_size),  # v
+            self.empty(5, self.num_heads, self.head_size),  # attn_output
+            empty_fp32(1, 1),  # scale
         ]
 
         pm.register_replacement(
-            pattern, replacement, inputs,
+            pattern,
+            replacement,
+            inputs,
             AttentionQuantPattern.wrap_trace_fn(
-                AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only),
-            pm_pass)
+                pm.fwd_only,
+                AttentionQuantPattern.fx_view_to_reshape,
+                AttentionQuantPattern.remove_noop_permutes,
+            ),
+            pm_pass,
+        )
 
 
 class AttentionNvfp4QuantPattern(AttentionQuantPattern):
@@ -180,50 +216,67 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
         super().__init__(layer, kNvfp4Quant, dtype)
 
     def _register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    output_attn: torch.Tensor, output_quant: torch.Tensor,
-                    output_scale: torch.Tensor, input_scale: torch.Tensor):
-            at1 = auto_functionalized(ATTN_OP,
-                                      query=q,
-                                      key=k,
-                                      value=v,
-                                      output=output_attn,
-                                      layer_name=self.layer_name,
-                                      output_scale=None,
-                                      output_block_scale=None)
+        def pattern(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            output_attn: torch.Tensor,
+            output_quant: torch.Tensor,
+            output_scale: torch.Tensor,
+            input_scale: torch.Tensor,
+        ):
+            at1 = auto_functionalized(
+                ATTN_OP,
+                query=q,
+                key=k,
+                value=v,
+                output=output_attn,
+                layer_name=self.layer_name,
+                output_scale=None,
+                output_block_scale=None,
+            )
             attn_out_view = RESHAPE_OP(
-                at1[1], [q.shape[0], self.num_heads * self.head_size])
-            at2 = auto_functionalized(self.QUANT_OP,
-                                      output=output_quant,
-                                      input=attn_out_view,
-                                      output_scale=output_scale,
-                                      input_scale=input_scale)
+                at1[1], [q.shape[0], self.num_heads * self.head_size]
+            )
+            at2 = auto_functionalized(
+                self.QUANT_OP,
+                output=output_quant,
+                input=attn_out_view,
+                output_scale=output_scale,
+                input_scale=input_scale,
+            )
             output_scale_view = torch.ops.aten.view.dtype(at2[2], FP8_DTYPE)
             return at2[1], output_scale_view
 
-        def replacement(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                        output_attn: torch.Tensor, output_quant: torch.Tensor,
-                        output_scale: torch.Tensor, input_scale: torch.Tensor):
+        def replacement(
+            q: torch.Tensor,
+            k: torch.Tensor,
+            v: torch.Tensor,
+            output_attn: torch.Tensor,
+            output_quant: torch.Tensor,
+            output_scale: torch.Tensor,
+            input_scale: torch.Tensor,
+        ):
             # attention output in quant_dtype
             output_attn = torch.ops.aten.full.default(
                 [q.shape[0], self.num_heads, self.head_size // 2],
                 0.0,
                 dtype=self.quant_dtype,
-                device=q.device)
+                device=q.device,
+            )
             # attention output block scale
-            output_scale_view = torch.ops.aten.view.dtype(
-                output_scale, FP8_DTYPE)
-            at2 = auto_functionalized(ATTN_OP,
-                                      query=q,
-                                      key=k,
-                                      value=v,
-                                      output=output_attn,
-                                      layer_name=self.layer_name,
-                                      output_scale=input_scale,
-                                      output_block_scale=output_scale_view)
-            output = RESHAPE_OP(at2[1],
-                                [-1, self.num_heads * self.head_size // 2])
+            output_scale_view = torch.ops.aten.view.dtype(output_scale, FP8_DTYPE)
+            at2 = auto_functionalized(
+                ATTN_OP,
+                query=q,
+                key=k,
+                value=v,
+                output=output_attn,
+                layer_name=self.layer_name,
+                output_scale=input_scale,
+                output_block_scale=output_scale_view,
+            )
+            output = RESHAPE_OP(at2[1], [-1, self.num_heads * self.head_size // 2])
             return output, at2[2]
 
         inputs = [
@@ -231,18 +284,24 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
             empty_bf16(5, self.num_heads, self.head_size),  # k
             empty_bf16(5, self.num_heads, self.head_size),  # v
             empty_bf16(5, self.num_heads, self.head_size),  # output_attn
-            self.empty_quant(5, self.num_heads * self.head_size //
-                             2),  # output_quant
-            empty_i32(128, round_up(self.num_heads * self.head_size // 16,
-                                    4)),  # output_scale
+            self.empty_quant(5, self.num_heads * self.head_size // 2),  # output_quant
+            empty_i32(
+                128, round_up(self.num_heads * self.head_size // 16, 4)
+            ),  # output_scale
             empty_fp32(1, 1),  # input_scale
         ]
 
         pm.register_replacement(
-            pattern, replacement, inputs,
+            pattern,
+            replacement,
+            inputs,
             AttentionQuantPattern.wrap_trace_fn(
-                AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only),
-            pm_pass)
+                pm.fwd_only,
+                AttentionQuantPattern.fx_view_to_reshape,
+                AttentionQuantPattern.remove_noop_permutes,
+            ),
+            pm_pass,
+        )
 
 
 class AttnFusionPass(VllmPatternMatcherPass):
@@ -267,20 +326,22 @@ class AttnFusionPass(VllmPatternMatcherPass):
         attn_layers = get_layers_from_vllm_config(config, Attention)
         for layer_name, layer in attn_layers.items():
             pattern_fp8 = AttentionFp8StaticQuantPattern(
-                layer, config.model_config.dtype)
+                layer, config.model_config.dtype
+            )
             pattern_fp8.register_if_supported(self.patterns)
 
-            if current_platform.is_cuda() and hasattr(torch.ops._C,
-                                                      "scaled_fp4_quant"):
+            if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
                 pattern_nvfp4 = AttentionNvfp4QuantPattern(
-                    layer, config.model_config.dtype)
+                    layer, config.model_config.dtype
+                )
                 pattern_nvfp4.register_if_supported(self.patterns)
 
         if len(attn_layers) == 0:
             logger.warning(
                 "Attention + quant fusion is enabled, but no attention layers "
                 "were found in CompilationConfig.static_forward_context "
-                "so no fusion patterns were registered.")
+                "so no fusion patterns were registered."
+            )
 
         self.dump_patterns(config, self.patterns)
 
@@ -290,6 +351,9 @@ class AttnFusionPass(VllmPatternMatcherPass):
         logger.debug("Fused quant onto %s attention nodes", self.matched_count)
 
     def uuid(self):
-        return VllmInductorPass.hash_source(self, AttentionQuantPattern,
-                                            AttentionFp8StaticQuantPattern,
-                                            AttentionNvfp4QuantPattern)
+        return VllmInductorPass.hash_source(
+            self,
+            AttentionQuantPattern,
+            AttentionFp8StaticQuantPattern,
+            AttentionNvfp4QuantPattern,
+        )
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index 2db8b5441bd6feeec005c5c1e0feb659b0f5104a..f2497950fc22f0c0aafcdb49b8476026b386e9f3 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -3,11 +3,10 @@
 
 import operator
 from collections.abc import Iterable, Iterator
-from typing import Optional
 
 from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
-from torch._ops import OpOverload
+from torch._ops import OpOverload, OpOverloadPacket
 
 
 def is_func(node: fx.Node, target) -> bool:
@@ -19,8 +18,7 @@ def is_auto_func(node: fx.Node, op: OpOverload) -> bool:
 
 
 # Returns the first specified node with the given op (if it exists)
-def find_specified_fn_maybe(nodes: Iterable[fx.Node],
-                            op: OpOverload) -> Optional[fx.Node]:
+def find_specified_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node | None:
     for node in nodes:
         if node.target == op:
             return node
@@ -35,8 +33,7 @@ def find_specified_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node:
 
 
 # Returns the first auto_functionalized node with the given op (if it exists)
-def find_auto_fn_maybe(nodes: Iterable[fx.Node],
-                       op: OpOverload) -> Optional[fx.Node]:
+def find_auto_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node | None:
     for node in nodes:
         if is_func(node, auto_functionalized) and node.args[0] == op:  # noqa
             return node
@@ -52,7 +49,7 @@ def find_auto_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node:
 
 # Returns the getitem node that extracts the idx-th element from node
 # (if it exists)
-def find_getitem_maybe(node: fx.Node, idx: int) -> Optional[fx.Node]:
+def find_getitem_maybe(node: fx.Node, idx: int) -> fx.Node | None:
     for user in node.users:
         if is_func(user, operator.getitem) and user.args[1] == idx:
             return user
@@ -67,7 +64,17 @@ def find_getitem(node: fx.Node, idx: int) -> fx.Node:
 
 
 # An auto-functionalization-aware utility for finding nodes with a specific op
-def find_op_nodes(op: OpOverload, graph: fx.Graph) -> Iterator[fx.Node]:
+# Also handles op overload packets and finds all overloads
+def find_op_nodes(
+    op: OpOverload | OpOverloadPacket, graph: fx.Graph
+) -> Iterator[fx.Node]:
+    if isinstance(op, OpOverloadPacket):
+        for overload in op.overloads():
+            overload_op = getattr(op, overload)
+            yield from find_op_nodes(overload_op, graph)
+        return
+
+    assert isinstance(op, OpOverload)
     if not op._schema.is_mutable:
         yield from graph.find_nodes(op="call_function", target=op)
 
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index e1b691df385d766cda1e955f35c3619213e3b14c..9af635a929b4b30a26e9eacd0c8912e66fce0b51 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -6,29 +6,29 @@ import hashlib
 import inspect
 import json
 import types
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 from torch import fx
-from torch._subclasses.fake_tensor import (FakeTensorMode,
-                                           unset_fake_temporarily)
+from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
 
-from vllm.utils import is_torch_equal_or_newer
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 if is_torch_equal_or_newer("2.6"):
     from torch._inductor.custom_graph_pass import CustomGraphPass
 else:
     # CustomGraphPass is not present in 2.5 or lower, import our version
-    from .torch25_custom_graph_pass import (  # noqa: E501
-        Torch25CustomGraphPass as CustomGraphPass)
+    from .torch25_custom_graph_pass import (
+        Torch25CustomGraphPass as CustomGraphPass,
+    )
 
 _pass_context = None
 
 
 class PassContext:
-
-    def __init__(self, runtime_shape: Optional[int]):
+    def __init__(self, runtime_shape: int | None):
         self.runtime_shape = runtime_shape
 
 
@@ -39,7 +39,7 @@ def get_pass_context() -> PassContext:
 
 
 @contextmanager
-def pass_context(runtime_shape: Optional[int]):
+def pass_context(runtime_shape: int | None):
     """A context manager that stores the current pass context,
     usually it is a list of sizes to specialize.
     """
@@ -68,7 +68,7 @@ class InductorPass(CustomGraphPass):
         return InductorPass.hash_source(self)
 
     @staticmethod
-    def hash_source(*srcs: Union[str, Any]):
+    def hash_source(*srcs: str | Any):
         """
         Utility method to hash the sources of functions or objects.
         :param srcs: strings or objects to add to the hash.
@@ -96,7 +96,7 @@ class InductorPass(CustomGraphPass):
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()
 
-    def is_applicable_for_shape(self, shape: Optional[int]):
+    def is_applicable(self, shape: int | None):
         return True
 
 
@@ -106,9 +106,7 @@ class CallableInductorPass(InductorPass):
     implementation of the UUID.
     """
 
-    def __init__(self,
-                 callable: Callable[[fx.Graph], None],
-                 uuid: Optional[Any] = None):
+    def __init__(self, callable: Callable[[fx.Graph], None], uuid: Any | None = None):
         self.callable = callable
         self._uuid = self.hash_source(callable) if uuid is None else uuid
 
@@ -127,8 +125,7 @@ def enable_fake_mode(fn: Callable[..., Any]) -> Callable[..., Any]:
 
     @functools.wraps(fn)
     def fn_new(*args, **kwargs) -> Any:
-        with torch._guards.tracing(
-                None), unset_fake_temporarily(), FakeTensorMode():
+        with torch._guards.tracing(None), unset_fake_temporarily(), FakeTensorMode():
             result = fn(*args, **kwargs)
 
         return result
diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..38eb4e5301a1876b92edaf9475d6e58f451fd2fb
--- /dev/null
+++ b/vllm/compilation/matcher_utils.py
@@ -0,0 +1,317 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+
+import torch
+from torch._higher_order_ops import auto_functionalized
+from torch._ops import OpOverload
+
+from vllm.config import get_current_vllm_config
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    _normalize_quant_group_shape,
+    kFp8DynamicTensorSym,
+    kFp8DynamicTokenSym,
+    kFp8StaticTensorSym,
+    kNvfp4Quant,
+)
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.platforms import current_platform
+
+RMS_OP = torch.ops._C.rms_norm.default
+RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
+ROTARY_OP = torch.ops._C.rotary_embedding.default
+FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default
+
+QUANT_OPS: dict[QuantKey, OpOverload] = {
+    kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
+    kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
+    kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
+}
+
+if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
+    QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501
+
+SILU_MUL_OP = torch.ops._C.silu_and_mul.default
+
+
+class MatcherCustomOp(ABC):
+    def __init__(self, enabled: bool):
+        config = get_current_vllm_config()
+        self.model_dtype = config.model_config.dtype if config.model_config else None
+        self.device = config.device_config.device if config.device_config else None
+
+        self.enabled = enabled
+        self.forward = self.forward_custom if enabled else self.forward_native
+
+    @abstractmethod
+    def forward_custom(self, *args, **kws):
+        pass
+
+    @abstractmethod
+    def forward_native(self, *args, **kws):
+        pass
+
+    def __call__(self, *args, **kws):
+        return self.forward(*args, **kws)
+
+    def empty(self, *args, **kws):
+        return torch.empty(*args, dtype=self.model_dtype, device=self.device, **kws)
+
+    def empty_int64(self, *args, **kws):
+        return torch.empty(*args, dtype=torch.int64, device=self.device, **kws)
+
+    def empty_f32(self, *args, **kws):
+        return torch.empty(*args, dtype=torch.float32, device=self.device, **kws)
+
+    def inputs(self) -> list[torch.Tensor]:
+        """Utility for inputs to the pattern"""
+        raise NotImplementedError
+
+
+class MatcherRotaryEmbedding(MatcherCustomOp):
+    def __init__(
+        self,
+        is_neox: bool,
+        head_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        use_flashinfer: bool = False,
+        enabled: bool | None = None,
+    ) -> None:
+        if enabled is None:
+            enabled = RotaryEmbedding.enabled()
+
+        super().__init__(enabled)
+        self.is_neox = is_neox
+        self.head_size = head_size
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.q_size = self.num_heads * self.head_size
+        self.kv_size = self.num_kv_heads * self.head_size
+        self.rotary_dim = head_size
+        if use_flashinfer:
+            self.rotary_op = FLASHINFER_ROTARY_OP
+        else:
+            self.rotary_op = ROTARY_OP
+
+    def inputs(self) -> list[torch.Tensor]:
+        positions = self.empty_int64(5)
+        query = self.empty(5, self.q_size)
+        key = self.empty(5, self.kv_size)
+        cos_sin_cache = self.empty(4096, self.rotary_dim)
+        return [positions, query, key, cos_sin_cache]
+
+    def forward_custom(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None,
+        cos_sin_cache: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        result = auto_functionalized(
+            self.rotary_op,
+            positions=positions,
+            query=query,
+            key=key,
+            head_size=self.head_size,
+            cos_sin_cache=cos_sin_cache,
+            is_neox=self.is_neox,
+        )
+        query_out = result[1]
+        key_out = result[2] if len(result) > 2 else None
+        return query_out, key_out
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None,
+        cos_sin_cache: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        return RotaryEmbedding.forward_static(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.rotary_dim,
+            cos_sin_cache,
+            self.is_neox,
+        )
+
+
+class MatcherRMSNorm(MatcherCustomOp):
+    def __init__(self, epsilon: float, enabled: bool | None = None):
+        if enabled is None:
+            enabled = RMSNorm.enabled()
+
+        super().__init__(enabled)
+        self.epsilon = epsilon
+
+    def inputs(self):
+        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
+        weight = self.empty(16)
+        return [input, weight]
+
+    def forward_custom(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+    ) -> torch.Tensor:
+        result = torch.empty_like(input)
+        # TODO: support non-contiguous input for RMSNorm and remove this
+        input_contiguous = input.contiguous()
+        _, result = auto_functionalized(
+            RMS_OP,
+            result=result,
+            input=input_contiguous,
+            weight=weight,
+            epsilon=self.epsilon,
+        )
+
+        return result
+
+    def forward_native(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+    ) -> torch.Tensor:
+        return RMSNorm.forward_static(
+            input, self.epsilon, input.size(-1), self.model_dtype, weight
+        )
+
+
+class MatcherFusedAddRMSNorm(MatcherCustomOp):
+    def __init__(self, epsilon: float, enabled: bool | None = None):
+        if enabled is None:
+            enabled = RMSNorm.enabled()
+
+        super().__init__(enabled)
+        self.epsilon = epsilon
+
+    def inputs(self):
+        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
+        weight = self.empty(16)
+        residual = self.empty(5, 16)
+        return [input, weight, residual]
+
+    def forward_custom(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        _, result, residual = auto_functionalized(
+            RMS_ADD_OP,
+            input=input,
+            residual=residual,
+            weight=weight,
+            epsilon=self.epsilon,
+        )
+
+        return result, residual
+
+    def forward_native(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return RMSNorm.forward_static(
+            input, self.epsilon, input.size(-1), self.model_dtype, weight, residual
+        )
+
+
+class MatcherQuantFP8(MatcherCustomOp):
+    def __init__(self, quant_key: QuantKey, enabled: bool | None = None):
+        if enabled is None:
+            enabled = QuantFP8.enabled()
+
+        super().__init__(enabled)
+        self.quant_key = quant_key
+        assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}"
+        self.QUANT_OP = QUANT_OPS[quant_key]
+
+        assert quant_key.dtype == current_platform.fp8_dtype(), (
+            "Only QuantFP8 supported by"
+        )
+        assert quant_key.scale2 is None
+        self.quant_fp8 = QuantFP8(quant_key.scale.static, quant_key.scale.group_shape)
+
+    def forward_custom(
+        self,
+        input: torch.Tensor,
+        scale: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        result = torch.empty(
+            input.shape, device=input.device, dtype=self.quant_key.dtype
+        )
+
+        if self.quant_key.scale.static:
+            assert scale is not None
+            _, result = auto_functionalized(
+                self.QUANT_OP, result=result, input=input, scale=scale
+            )
+            return result, scale
+        else:
+            assert scale is None
+            scale = self.make_scale(input)
+            _, result, scale = auto_functionalized(
+                self.QUANT_OP, result=result, input=input, scale=scale, scale_ub=None
+            )
+            return result, scale
+
+    def forward_native(
+        self,
+        input: torch.Tensor,
+        scale: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.quant_fp8(input, scale)
+
+    def make_scale(self, input: torch.Tensor):
+        normalized_group_shape = _normalize_quant_group_shape(
+            input, self.quant_key.scale.group_shape
+        )
+        scale_shape = (
+            input.shape[0] // normalized_group_shape[0],
+            input.shape[1] // normalized_group_shape[1],
+        )
+
+        return torch.empty(scale_shape, device=input.device, dtype=torch.float32)
+
+    def inputs(self) -> list[torch.Tensor]:
+        input = self.empty(5, 16)
+        if self.quant_key.scale.static:
+            return [input, self.empty_f32(1, 1)]
+
+        return [input]
+
+
+class MatcherSiluAndMul(MatcherCustomOp):
+    def __init__(self, enabled: bool | None = None):
+        if enabled is None:
+            enabled = SiluAndMul.enabled()
+        super().__init__(enabled)
+
+    def inputs(self) -> list[torch.Tensor]:
+        input = self.empty(5, 4)
+        return [input]
+
+    def forward_custom(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        result = auto_functionalized(SILU_MUL_OP, result=out, input=x)
+        return result[1]
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        return SiluAndMul.forward_native(x)
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index c46721ab2d74667a577bca4ee2ac412ea13de7e2..660fb9887e2cd1ff360b9734270640bf7c0c2cc1 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -1,10 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
 import time
 
-from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, VllmConfig
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -18,21 +17,25 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
     torch_compile_start_time = time.time()
 
     compilation_config: CompilationConfig = vllm_config.compilation_config
-    if compilation_config.level == CompilationLevel.PIECEWISE and \
-        compilation_config.debug_dump_path:
+    path = vllm_config.compile_debug_dump_path()
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE and path:
         import depyf
-        path = os.path.join(compilation_config.debug_dump_path,
-                            f"rank_{vllm_config.parallel_config.rank}")
+
+        path.mkdir(parents=True, exist_ok=True)
+        logger.debug("Dumping depyf output to %s", path)
         global context_manager
-        context_manager = depyf.prepare_debug(path)
+        context_manager = depyf.prepare_debug(path.as_posix())
         context_manager.__enter__()
 
 
 def end_monitoring_torch_compile(vllm_config: VllmConfig):
     compilation_config: CompilationConfig = vllm_config.compilation_config
-    if compilation_config.level == CompilationLevel.PIECEWISE:
-        logger.info("torch.compile takes %.2f s in total",
-                    compilation_config.compilation_time)
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
+        logger.info_once(
+            "torch.compile takes %.2f s in total",
+            compilation_config.compilation_time,
+            scope="local",
+        )
         global context_manager
         if context_manager is not None:
             context_manager.__exit__(None, None, None)
@@ -48,8 +51,10 @@ def validate_cudagraph_capturing_enabled():
     # if an illegal cudagraph capturing happens, raise an error.
     global cudagraph_capturing_enabled
     if not cudagraph_capturing_enabled:
-        raise RuntimeError("CUDA graph capturing detected at an inappropriate "
-                           "time. This operation is currently disabled.")
+        raise RuntimeError(
+            "CUDA graph capturing detected at an inappropriate "
+            "time. This operation is currently disabled."
+        )
 
 
 def set_cudagraph_capturing_enabled(enabled: bool):
diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py
index 2c453daf873d26105fb992cd536920445129cbec..42b8d3daac9855f6a5d3e2cde1389361a82539a8 100644
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Union
 
 import torch.fx
 from torch import SymInt
@@ -81,78 +80,55 @@ class NoOpEliminationPass(VllmInductorPass):
                         graph.erase_node(input)
                         count += 1
 
-                # Case 2: remove this reshape if it produces the original shape
-                input, shape = node.args[:2]
-                input_shape = input.meta["val"].shape
-                if len(shape) != len(input_shape):
-                    # Reshape changing rank, skip
-                    continue
-
-                if shape.count(-1) > 1:
-                    # Invalid reshape args, skip
-                    continue
-
-                if self.reshape_all_dims_equivalent(shape, input_shape):
-                    node.replace_all_uses_with(input)
-                    graph.erase_node(node)
-                    count += 1
-
-            elif is_func(node, torch.ops.aten.slice.Tensor):
-                # python slicing semantics are different from reshape
-                # Don't treat -1 as inferred dimension
-                input, dim_index, start, end = node.args[:4]
+            # remove reshape/slice if it produces the original shape
+            if is_func(node, torch.ops.aten.reshape.default) or is_func(
+                node, torch.ops.aten.slice.Tensor
+            ):
+                input = node.args[0]
                 input_shape = input.meta["val"].shape
                 output_shape = node.meta["val"].shape
-
-                if output_shape == input_shape:
+                if self.all_dims_equivalent(input_shape, output_shape):
                     node.replace_all_uses_with(input)
                     graph.erase_node(node)
                     count += 1
-
             elif is_func(node, torch.ops.aten.slice_scatter.default):
                 base, view, dim_index, start, end = node.args[:5]
                 base_shape = base.meta["val"].shape
                 view_shape = view.meta["val"].shape
 
-                if base_shape == view_shape:
+                if self.all_dims_equivalent(base_shape, view_shape):
                     node.replace_all_uses_with(view)
                     graph.erase_node(node)
                     count += 1
 
         logger.debug("Removed %s no-op reshapes and slices", count)
 
-    # ---------------------- Reshape helpers ----------------------
-    def reshape_dims_equivalent(self, dim: Union[int, torch.fx.Node],
-                                i_dim: Union[int, SymInt]) -> bool:
+    # ---------------------- Shape comparison helpers ----------------------
+    def dims_equivalent(self, dim: int | SymInt, i_dim: int | SymInt) -> bool:
         """
         This function checks if two dimensions are equivalent.
         :param dim: The dimension arg to reshape/slice
         :param i_dim: The corresponding dimension in the input tensor
         :return: Are the dimensions equivalent?
 
-        There are three cases in which the dimensions are equivalent:
+        There are two cases in which the dimensions are equivalent:
         1. The dimensions are equal (both integers)
-        2. The reshape dimension is -1 (i.e. inferred)
-        3. The dimensions both correspond to the same SymInt
-
-        While case 2 does not guarantee the dimensions are equal,
-        they are equal if all other dimensions are equal.
-
-        In case 3, the reshape dimension is a torch.fx.Node,
-        and its value is a SymInt. That value is equal to the
-        input dimension.
+        2. The dimensions both correspond to the same SymInt
         """
-        # Case 1 and 2
-        if dim == i_dim or dim == -1:
-            return True
-        # Case 3
-        return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim
-
-    def reshape_all_dims_equivalent(
-        self,
-        dims: Iterable[Union[int, torch.fx.Node]],
-        i_dims: Iterable[Union[int, SymInt]],
+        # Case 1
+        if isinstance(i_dim, int) and isinstance(dim, int):
+            return dim == i_dim
+        # Case 2
+        if isinstance(i_dim, SymInt) and isinstance(dim, SymInt):
+            return dim == i_dim
+        return False
+
+    def all_dims_equivalent(
+        self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
     ) -> bool:
-        return all(
-            self.reshape_dims_equivalent(s, i_s)
-            for s, i_s in zip(dims, i_dims))
+        dims_ = list(dims)
+        i_dims_ = list(i_dims)
+        if len(dims_) != len(i_dims_):
+            # Different ranks can't be equivalent
+            return False
+        return all(self.dims_equivalent(s, i_s) for s, i_s in zip(dims, i_dims))
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
new file mode 100644
index 0000000000000000000000000000000000000000..08bd27e8095268a1d5a62aeca8090ba05371a6e9
--- /dev/null
+++ b/vllm/compilation/partition_rules.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import contextlib
+
+import torch
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def should_split(node: torch.fx.Node, splitting_ops: list[str]) -> bool:
+    """
+    Check if a node should be split for dynamo graph partition.
+    It operates on dynamo graph, so the node.target can be anything.
+    We need to check and split only on OpOverload and OpOverloadPacket.
+    """
+
+    if node.op != "call_function":
+        return False
+
+    target = node.target
+
+    if isinstance(target, torch._ops.OpOverloadPacket):
+        # Example: "aten::add"
+        return target._qualified_op_name in splitting_ops
+
+    if isinstance(target, torch._ops.OpOverload):
+        # Example: "aten::add"
+        packet_name = target.name()
+
+        # Example: "aten::add.default"
+        op_overload_name = f"{packet_name}.{target._overloadname}"
+        return op_overload_name in splitting_ops or packet_name in splitting_ops
+
+    return False
+
+
+@contextlib.contextmanager
+def inductor_partition_rule_context(splitting_ops: list[str]):
+    """Context manager to temporarily register Inductor partition rules.
+
+    Registers custom partition rules for specified operators, forcing the
+    Inductor scheduler to partition the graph at these operators. The rules
+    are automatically restored to their previous state on exit.
+
+    Args:
+        splitting_ops: List of operator names to partition on.
+    """
+    if not splitting_ops:
+        logger.debug("No partition ops provided; skipping rule registration.")
+        yield
+        return
+
+    # Save current state before registering
+
+    saved_splitting_ops: list[str] = list(
+        torch._inductor.config.custom_should_partition_ops
+    )
+    torch._inductor.config.custom_should_partition_ops = splitting_ops
+
+    logger.debug(
+        "Registered inductor partition rules for %d operators", len(splitting_ops)
+    )
+
+    try:
+        yield
+    finally:
+        # Clear and restore previous state
+        torch._inductor.config.custom_should_partition_ops = saved_splitting_ops
+        logger.debug("Restored previous partition rules state.")
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index e323fa1f77349c39695d970077367d2629652e53..0e8bb2fc973513ce65da8365f8de7f5f32d4366c 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -5,10 +5,10 @@ import functools
 from torch import fx as fx
 
 from vllm import envs
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import set_env_var
+from vllm.utils.system_utils import set_env_var
 
 from .post_cleanup import PostCleanupPass
 from .vllm_inductor_pass import VllmInductorPass
@@ -17,6 +17,8 @@ if current_platform.is_cuda_alike():
     from .activation_quant_fusion import ActivationQuantFusionPass
     from .fusion import RMSNormQuantFusionPass
     from .fusion_attn import AttnFusionPass
+    from .qk_norm_rope_fusion import QKNormRoPEFusionPass
+    from .sequence_parallelism import SequenceParallelismPass
 
 if current_platform.is_cuda():
     from .collective_fusion import AllReduceFusionPass, AsyncTPPass
@@ -24,7 +26,6 @@ if current_platform.is_cuda():
 from .fix_functionalization import FixFunctionalizationPass
 from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context
 from .noop_elimination import NoOpEliminationPass
-from .sequence_parallelism import SequenceParallelismPass
 
 logger = init_logger(__name__)
 
@@ -71,9 +72,11 @@ class PostGradPassManager(CustomGraphPass):
 
         shape = get_pass_context().runtime_shape
         for pass_ in self.passes:
-            if pass_.is_applicable_for_shape(shape):
+            if pass_.is_applicable(shape):
                 pass_(graph)
                 VllmInductorPass.dump_prefix += 1
+            else:
+                logger.debug("Skipping %s with shape %s", pass_, shape)
 
         # post-cleanup goes before fix_functionalization
         # because it requires a functional graph
@@ -86,27 +89,33 @@ class PostGradPassManager(CustomGraphPass):
 
     def configure(self, config: VllmConfig):
         self.pass_config = config.compilation_config.pass_config
-        if self.pass_config.enable_noop:
-            self.passes += [NoOpEliminationPass(config)]
 
-        if self.pass_config.enable_sequence_parallelism:
-            self.passes += [SequenceParallelismPass(config)]
-            if self.pass_config.enable_async_tp:
-                self.passes += [AsyncTPPass(config)]
+        # Set the current vllm config to allow tracing CustomOp instances
+        with set_current_vllm_config(config, check_compile=False):
+            if self.pass_config.enable_noop:
+                self.passes += [NoOpEliminationPass(config)]
 
-        if self.pass_config.enable_fi_allreduce_fusion:
-            self.passes += [AllReduceFusionPass(config)]
+            if self.pass_config.enable_sequence_parallelism:
+                self.passes += [SequenceParallelismPass(config)]
+                if self.pass_config.enable_async_tp:
+                    self.passes += [AsyncTPPass(config)]
 
-        if self.pass_config.enable_fusion:
-            self.passes += [RMSNormQuantFusionPass(config)]
-            self.passes += [ActivationQuantFusionPass(config)]
+            if self.pass_config.enable_fi_allreduce_fusion:
+                self.passes += [AllReduceFusionPass(config)]
 
-        if self.pass_config.enable_attn_fusion:
-            self.passes += [AttnFusionPass(config)]
+            if self.pass_config.enable_fusion:
+                self.passes += [RMSNormQuantFusionPass(config)]
+                self.passes += [ActivationQuantFusionPass(config)]
 
-        # needs a functional graph
-        self.post_cleanup = PostCleanupPass(config)
-        self.fix_functionalization = FixFunctionalizationPass(config)
+            if self.pass_config.enable_attn_fusion:
+                self.passes += [AttnFusionPass(config)]
+
+            if self.pass_config.enable_qk_norm_rope_fusion:
+                self.passes += [QKNormRoPEFusionPass(config)]
+
+            # needs a functional graph
+            self.post_cleanup = PostCleanupPass(config)
+            self.fix_functionalization = FixFunctionalizationPass(config)
 
     def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
@@ -122,4 +131,5 @@ class PostGradPassManager(CustomGraphPass):
         for pass_ in self.passes:
             state["passes"].append(pass_.uuid())
         state["passes"].append(self.fix_functionalization.uuid())
+
         return InductorPass.hash_dict(state)
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/piecewise_backend.py
similarity index 85%
rename from vllm/compilation/cuda_piecewise_backend.py
rename to vllm/compilation/piecewise_backend.py
index ae26e9f1bf2b6c782f29f2ccb4c3decd97359960..2931580afbbb009d91c4e9bdbeaf64bb8c4216cd 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/piecewise_backend.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import torch.fx as fx
 
@@ -23,15 +24,19 @@ class ConcreteSizeEntry:
 
 
 class PiecewiseBackend:
-
-    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
-                 piecewise_compile_index: int, total_piecewise_compiles: int,
-                 sym_shape_indices: list[int],
-                 compiled_graph_for_general_shape: Callable,
-                 vllm_backend: VllmBackend):
+    def __init__(
+        self,
+        graph: fx.GraphModule,
+        vllm_config: VllmConfig,
+        piecewise_compile_index: int,
+        total_piecewise_compiles: int,
+        sym_shape_indices: list[int],
+        compiled_graph_for_general_shape: Callable,
+        vllm_backend: VllmBackend,
+    ):
         """
         The backend for piecewise compilation.
-        It mainly handles the compilation of static shapes and 
+        It mainly handles the compilation of static shapes and
         dispatching based on runtime shape.
 
         We will compile `self.graph` once for the general shape,
@@ -46,13 +51,11 @@ class PiecewiseBackend:
         self.vllm_backend = vllm_backend
 
         self.is_first_graph = piecewise_compile_index == 0
-        self.is_last_graph = (
-            piecewise_compile_index == total_piecewise_compiles - 1)
+        self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
 
         self.is_full_graph = total_piecewise_compiles == 1
 
-        self.compile_sizes: set[int] = set(
-            self.compilation_config.compile_sizes)
+        self.compile_sizes: set[int] = set(self.compilation_config.compile_sizes)
 
         self.first_run_finished = False
 
@@ -108,7 +111,8 @@ class PiecewiseBackend:
                 self.compilation_config,
                 graph_index=self.piecewise_compile_index,
                 num_graphs=self.total_piecewise_compiles,
-                runtime_shape=runtime_shape)
+                runtime_shape=runtime_shape,
+            )
 
             # finished compilations for all required shapes
             if self.is_last_graph and not self.to_be_compiled_sizes:
diff --git a/vllm/compilation/post_cleanup.py b/vllm/compilation/post_cleanup.py
index 6a31f3935da7c335790cbf21052b8afd52276150..55117516838ca9ffb155432d5ca654d7734acec3 100644
--- a/vllm/compilation/post_cleanup.py
+++ b/vllm/compilation/post_cleanup.py
@@ -16,5 +16,6 @@ class PostCleanupPass(VllmInductorPass):
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph) -> None:
         from torch._inductor.pattern_matcher import stable_topological_sort
+
         stable_topological_sort(graph)
         graph.eliminate_dead_code()
diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3c399e079063048b9d97ffcc10f8c7909c20e0b
--- /dev/null
+++ b/vllm/compilation/qk_norm_rope_fusion.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+
+import torch
+import torch._inductor.pattern_matcher as pm
+from torch import fx
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+from torch._inductor.pattern_matcher import PatternMatcherPass
+
+from vllm.attention import Attention
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.logger import init_logger
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+
+from .fusion import empty_bf16, empty_fp32, empty_i64
+from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherRMSNorm, MatcherRotaryEmbedding
+from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
+
+logger = init_logger(__name__)
+
+FUSED_QK_ROPE_OP = torch.ops._C.fused_qk_norm_rope.default
+
+
+class QkNormRopePattern:
+    """
+    Match the unfused sequence in attention blocks and replace with the fused op.
+
+    Unfused (conceptually):
+      q, k, v = split(qkv, [qsz, kvsz, kvsz], -1)
+      qh = reshape(q, [-1, num_heads, head_dim])
+      kh = reshape(k, [-1, num_kv_heads, head_dim])
+      qn = rms_norm(qh, q_weight, eps)
+      kn = rms_norm(kh, k_weight, eps)
+      qf = reshape(qn, [-1, num_heads * head_dim])
+      kf = reshape(kn, [-1, num_kv_heads * head_dim])
+      qf, kf = rotary_embedding(positions, qf, kf, head_dim, cos_sin_cache, is_neox)
+      return qf, kf, v
+
+    Fused replacement:
+      fused_qk_norm_rope(qkv, num_heads, num_kv_heads, num_kv_heads, head_dim,
+                         eps, q_weight, k_weight, cos_sin_cache, is_neox,
+                         positions.view(-1))
+      return split(qkv, [qsz, kvsz, kvsz], -1)
+    """
+
+    def __init__(
+        self,
+        head_dim: int,
+        num_heads: int,
+        num_kv_heads: int,
+        eps: float,
+        is_neox: bool,
+        rope_flashinfer: bool = False,
+    ) -> None:
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.eps = eps
+        self.rmsnorm_matcher = MatcherRMSNorm(eps)
+        self.is_neox = is_neox
+        self.rope_flashinfer = rope_flashinfer
+        self.rope_matcher = MatcherRotaryEmbedding(
+            is_neox=is_neox,
+            head_size=self.head_dim,
+            num_heads=self.num_heads,
+            num_kv_heads=self.num_kv_heads,
+            use_flashinfer=self.rope_flashinfer,
+        )
+
+    def get_inputs(self):
+        # Sample inputs to help pattern tracing
+        T = 5
+        qkv = empty_bf16(T, self.q_size + 2 * self.kv_size)
+        positions = empty_i64(T)
+        q_weight = empty_bf16(1, self.head_dim)
+        k_weight = empty_bf16(1, self.head_dim)
+        if self.rope_flashinfer:
+            cos_sin_cache = empty_fp32(4096, self.head_dim)
+        else:
+            cos_sin_cache = empty_bf16(4096, self.head_dim)
+        return [
+            qkv,
+            positions,
+            q_weight,
+            k_weight,
+            cos_sin_cache,
+        ]
+
+    @staticmethod
+    def wrap_trace_fn(trace_fn, *process_fx_fns: Callable[[fx.GraphModule], None]):
+        def wrapped(*args, **kwargs):
+            gm = trace_fn(*args, **kwargs)
+            for process_fx in process_fx_fns:
+                process_fx(gm)
+
+            return gm
+
+        return wrapped
+
+    @staticmethod
+    def fx_view_to_reshape(gm: torch.fx.GraphModule):
+        from torch._inductor.fx_passes.post_grad import view_to_reshape
+
+        view_to_reshape(gm)
+
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(
+            qkv: torch.Tensor,
+            positions: torch.Tensor,
+            q_weight: torch.Tensor,
+            k_weight: torch.Tensor,
+            cos_sin_cache: torch.Tensor,
+        ):
+            # split qkv -> q,k,v
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+            # Q path: view -> RMS -> view back to q.shape
+            q_by_head = q.view(
+                *q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim
+            )
+            q_normed_by_head = self.rmsnorm_matcher(q_by_head, q_weight)
+            q_flat = q_normed_by_head.view(q.shape)
+
+            # K path: view -> RMS -> view back to k.shape
+            k_by_head = k.view(
+                *k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim
+            )
+            k_normed_by_head = self.rmsnorm_matcher(k_by_head, k_weight)
+            k_flat = k_normed_by_head.view(k.shape)
+
+            # RoPE: apply to flattened q/k
+            q_rope, k_rope = self.rope_matcher(positions, q_flat, k_flat, cos_sin_cache)
+            return q_rope, k_rope, v
+
+        def replacement(
+            qkv: torch.Tensor,
+            positions: torch.Tensor,
+            q_weight: torch.Tensor,
+            k_weight: torch.Tensor,
+            cos_sin_cache: torch.Tensor,
+        ):
+            # Run fused qk_norm_rope op
+            result = auto_functionalized(
+                FUSED_QK_ROPE_OP,
+                qkv=qkv,
+                num_heads_q=self.num_heads,
+                num_heads_k=self.num_kv_heads,
+                num_heads_v=self.num_kv_heads,
+                head_dim=self.head_dim,
+                eps=self.eps,
+                q_weight=q_weight,
+                k_weight=k_weight,
+                cos_sin_cache=cos_sin_cache,
+                is_neox=self.is_neox,
+                position_ids=positions.view(-1),
+            )
+            result_qkv = result[1]
+
+            # Split back to q,k,v and return
+            return result_qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # NOTE: use fx_view_to_reshape to unify view/reshape to simplify
+        # pattern and increase matching opportunities
+        pm.register_replacement(
+            pattern,
+            replacement,
+            self.get_inputs(),
+            QkNormRopePattern.wrap_trace_fn(
+                pm.fwd_only,
+                QkNormRopePattern.fx_view_to_reshape,
+            ),
+            pm_pass,
+        )
+
+
+class QKNormRoPEFusionPass(VllmPatternMatcherPass):
+    """Fuse Q/K RMSNorm + RoPE into fused_qk_norm_rope when the custom op exists."""
+
+    @enable_fake_mode
+    def __init__(self, config: VllmConfig):
+        super().__init__(config)
+        self.patterns: PatternMatcherPass = PatternMatcherPass(
+            pass_name="qk_norm_rope_fusion_pass"
+        )
+
+        dtype = config.model_config.dtype
+        if dtype not in (torch.bfloat16, torch.float16):
+            logger.warning_once(
+                "QK Norm+RoPE fusion not enabled: unsupported dtype %s", dtype
+            )
+            return
+
+        # use one attn layer to get meta (such as head_dim) for QkNormRopePattern
+        attn_layers: dict[str, Attention] = get_layers_from_vllm_config(
+            config, Attention
+        )
+        if len(attn_layers) == 0:
+            logger.warning_once(
+                "QK Norm+RoPE fusion enabled, but no Attention layers were discovered."
+            )
+            return
+        layer = next(iter(attn_layers.values()))
+
+        for epsilon in [1e-5, 1e-6]:
+            for neox in [True, False]:
+                if RotaryEmbedding.enabled():
+                    for rope_flashinfer in [False, True]:
+                        QkNormRopePattern(
+                            head_dim=layer.head_size,
+                            num_heads=layer.num_heads,
+                            num_kv_heads=layer.num_kv_heads,
+                            eps=epsilon,
+                            is_neox=neox,
+                            rope_flashinfer=rope_flashinfer,
+                        ).register(self.patterns)
+                else:
+                    QkNormRopePattern(
+                        head_dim=layer.head_size,
+                        num_heads=layer.num_heads,
+                        num_kv_heads=layer.num_kv_heads,
+                        eps=epsilon,
+                        is_neox=neox,
+                    ).register(self.patterns)
+
+        self.dump_patterns(config, self.patterns)
+
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: fx.Graph) -> None:
+        self.matched_count = self.patterns.apply(graph)
+        logger.debug("Fused QK Norm+RoPE on %s sites", self.matched_count)
+
+    def uuid(self):
+        return VllmInductorPass.hash_source(self, QkNormRopePattern)
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index a6ca50c925a2a3edcf8f0e2f286f60b5cbc24568..bb4dcf12d865da936280099677e7b675ec5fbd4a 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+
+import functools
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -9,92 +10,41 @@ from torch._inductor.pattern_matcher import PatternMatcherPass
 
 from vllm.config import VllmConfig
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
-from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_world_size)
+from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8StaticTensorSym,
+)
 from vllm.platforms import current_platform
 
 from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm
+from .noop_elimination import NoOpEliminationPass
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
 
 
-class _RMSNormAndQuantOpHelper:
-    """Base helper for RMSNorm and RMSNorm + Quantization functionalization."""
+def get_first_out_wrapper(fn):
+    @functools.wraps(fn)
+    def wrapper(*args):
+        return fn(*args)[0]
 
-    def __init__(self,
-                 epsilon: float,
-                 dtype: torch.dtype,
-                 device: str,
-                 quant_op: Optional[torch._ops.OpOverload] = None,
-                 **kwargs):
-        self.epsilon = epsilon
-        self.dtype = dtype
-        self.device = device
-        self.quant_op = quant_op
-
-    def _functional_rmsnorm(self, result_buffer, input_tensor, weight_tensor):
-        return torch.ops.higher_order.auto_functionalized(
-            torch.ops._C.rms_norm.default,
-            result=result_buffer,
-            input=input_tensor,
-            weight=weight_tensor,
-            epsilon=self.epsilon)
-
-    def _functional_fused_add_rmsnorm(self, input_tensor, residual_tensor,
-                                      weight_tensor):
-        return torch.ops.higher_order.auto_functionalized(
-            torch.ops._C.fused_add_rms_norm.default,
-            input=input_tensor,
-            residual=residual_tensor,
-            weight=weight_tensor,
-            epsilon=self.epsilon)
-
-    def _functional_rmsnorm_then_quant(self, rmsnorm_result_buffer,
-                                       quant_result_buffer, input_tensor,
-                                       weight_tensor, scale_tensor):
-        if self.quant_op is None:
-            raise RuntimeError(
-                "_RMSNormAndQuantOpHelper was not initialized with a quant_op."
-            )
-        rmsnorm_out_tuple = self._functional_rmsnorm(rmsnorm_result_buffer,
-                                                     input_tensor,
-                                                     weight_tensor)
-        quant_out_tuple = torch.ops.higher_order.auto_functionalized(
-            self.quant_op,
-            result=quant_result_buffer,
-            input=rmsnorm_out_tuple[1],
-            scale=scale_tensor)
-        return quant_out_tuple
-
-    def _functional_fused_add_rmsnorm_then_quant(self, quant_result_buffer,
-                                                 input_tensor, residual_tensor,
-                                                 weight_tensor, scale_tensor):
-        if self.quant_op is None:
-            raise RuntimeError(
-                "_RMSNormAndQuantOpHelper was not initialized with a quant_op."
-            )
-        fused_add_rmsnorm_out_tuple = self._functional_fused_add_rmsnorm(
-            input_tensor, residual_tensor, weight_tensor)
-        quant_out_tuple = torch.ops.higher_order.auto_functionalized(
-            self.quant_op,
-            result=quant_result_buffer,
-            input=fused_add_rmsnorm_out_tuple[1],
-            scale=scale_tensor)
-        return quant_out_tuple, fused_add_rmsnorm_out_tuple[2]
+    return wrapper
 
 
-class _SequenceParallelPatternHelper(_RMSNormAndQuantOpHelper):
+class _SequenceParallelPatternHelper:
     """Helper for sequence parallelism patterns."""
 
-    def __init__(self,
-                 epsilon: float,
-                 dtype: torch.dtype,
-                 device: str,
-                 quant_op: Optional[torch._ops.OpOverload] = None,
-                 **kwargs):
-        super().__init__(epsilon, dtype, device, quant_op=quant_op, **kwargs)
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str,
+    ):
+        self.epsilon = epsilon
+        self.dtype = dtype
+        self.device = device
         self.tp_group = get_tp_group()
         self.tp_size = get_tensor_model_parallel_world_size()
 
@@ -103,111 +53,61 @@ class _SequenceParallelPatternHelper(_RMSNormAndQuantOpHelper):
 
     def _reduce_scatter(self, x: torch.Tensor) -> torch.Tensor:
         return torch.ops.vllm.reduce_scatter.default(
-            x,
-            dim=0,
-            world_size=self.tp_size,
-            group_name=self.tp_group.unique_name)
+            x, dim=0, world_size=self.tp_size, group_name=self.tp_group.unique_name
+        )
 
     def _all_gather(self, x: torch.Tensor) -> torch.Tensor:
         return torch.ops.vllm.all_gather.default(
-            x,
-            dim=0,
-            world_size=self.tp_size,
-            group_name=self.tp_group.unique_name)
+            x, dim=0, world_size=self.tp_size, group_name=self.tp_group.unique_name
+        )
 
 
 class FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
+    def __init__(self, epsilon: float, dtype: torch.dtype, device: str):
+        super().__init__(epsilon, dtype, device)
+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
 
     def get_inputs(self):
         input = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype)
-        permute = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype)
         arg3_1 = torch.empty([4], device=self.device, dtype=self.dtype)
 
-        return [input, permute, arg3_1]
+        return [input, arg3_1]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
             input: torch.Tensor,
-            permute: torch.Tensor,
             arg3_1: torch.Tensor,
         ):
             all_reduce = self._all_reduce(input)
-            rmsnorm = self._functional_rmsnorm(permute, all_reduce, arg3_1)
+            rmsnorm = self.rmsnorm_matcher(all_reduce, arg3_1)
 
-            return rmsnorm[1], all_reduce
+            return rmsnorm, all_reduce
 
         def replacement(
             input: torch.Tensor,
-            permute: torch.Tensor,
             arg3_1: torch.Tensor,
         ):
             reduce_scatter = self._reduce_scatter(input)
 
-            rmsnorm_result = torch.empty_like(reduce_scatter)
-            rmsnorm = self._functional_rmsnorm(rmsnorm_result, reduce_scatter,
-                                               arg3_1)
-
-            all_gather = self._all_gather(rmsnorm[1])
-
+            rmsnorm = self.rmsnorm_matcher(reduce_scatter, arg3_1)
+            all_gather = self._all_gather(rmsnorm)
             return all_gather, reduce_scatter
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
+    def __init__(self, epsilon: float, dtype: torch.dtype, device: str):
+        super().__init__(epsilon, dtype, device)
+        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
     def get_inputs(self):
         mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
 
         residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        rms_norm_weights = torch.empty([4, 4],
-                                       device=self.device,
-                                       dtype=self.dtype)
-
-        return [
-            residual,
-            mm_1,
-            rms_norm_weights,
-        ]
-
-    def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(
-            residual: torch.Tensor,
-            mm_1: torch.Tensor,
-            rms_norm_weights: torch.Tensor,
-        ) -> tuple[torch.Tensor, torch.Tensor]:
-            all_reduce = self._all_reduce(mm_1)
-            rmsnorm = self._functional_fused_add_rmsnorm(
-                all_reduce, residual, rms_norm_weights)
-            return rmsnorm[1], rmsnorm[2]
-
-        def replacement(
-            residual: torch.Tensor,
-            mm_1: torch.Tensor,
-            rms_norm_weights: torch.Tensor,
-        ) -> tuple[torch.Tensor, torch.Tensor]:
-            reduce_scatter = self._reduce_scatter(mm_1)
-            rmsnorm = self._functional_fused_add_rmsnorm(
-                reduce_scatter, residual, rms_norm_weights)
-            all_gather = self._all_gather(rmsnorm[1])
-            return all_gather, rmsnorm[2]
-
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
-
-
-class LastAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
-
-    def get_inputs(self):
-        mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-
-        residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        rms_norm_weights = torch.empty([4, 4],
-                                       device=self.device,
-                                       dtype=self.dtype)
+        rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype)
 
         return [
             residual,
@@ -216,205 +116,151 @@ class LastAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
         ]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
             residual: torch.Tensor,
             mm_1: torch.Tensor,
             rms_norm_weights: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = self._all_reduce(mm_1)
-            rmsnorm = self._functional_fused_add_rmsnorm(
-                all_reduce, residual, rms_norm_weights)
-            return rmsnorm[1]
+            rmsnorm = self.rmsnorm_matcher(all_reduce, rms_norm_weights, residual)
+            return rmsnorm[0], rmsnorm[1]
 
         def replacement(
             residual: torch.Tensor,
             mm_1: torch.Tensor,
             rms_norm_weights: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
+            # pattern matcher replaces from top-to-bottom,
+            # so residual is still the full size here.
+            # once the seqpar pattern with the previous rmsnorm is replaced
             reduce_scatter = self._reduce_scatter(mm_1)
-            rmsnorm = self._functional_fused_add_rmsnorm(
-                reduce_scatter, residual, rms_norm_weights)
-            normalized = self._all_gather(rmsnorm[1])
-            return normalized
-
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+            residual = residual[0 : reduce_scatter.size(0), ...]
+            rmsnorm = self.rmsnorm_matcher(reduce_scatter, rms_norm_weights, residual)
+            all_gather = self._all_gather(rmsnorm[0])
+            # shape of residual changes but that's fine,
+            # next node is already slicing it, now becomes a noop
+            return all_gather, rmsnorm[1]
+
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
+        pm.register_replacement(
+            get_first_out_wrapper(pattern),
+            get_first_out_wrapper(replacement),
+            self.get_inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
 
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
 class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
-
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 op: torch._ops.OpOverload):
-        super().__init__(epsilon, dtype, device, quant_op=op)
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str,
+    ):
+        super().__init__(epsilon, dtype, device)
+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
+        self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
     def get_inputs(self):
         input = torch.zeros([1, 8, 4], device=self.device, dtype=self.dtype)
-        rmsnorm_result = torch.empty([1, 8, 4],
-                                     device=self.device,
-                                     dtype=self.dtype)
-        quant_result = torch.empty([1, 8, 4],
-                                   device=self.device,
-                                   dtype=FP8_DTYPE)
         weight = torch.empty([4], device=self.device, dtype=self.dtype)
         scale = torch.tensor(1.0, device=self.device, dtype=torch.float32)
-        return [input, rmsnorm_result, quant_result, weight, scale]
+        return [input, weight, scale]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
             input: torch.Tensor,
-            rmsnorm_result: torch.Tensor,
-            quant_result: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
         ):
             all_reduce = self._all_reduce(input)
-            static_fp8 = self._functional_rmsnorm_then_quant(
-                rmsnorm_result, quant_result, all_reduce, weight, scale)
-            return static_fp8[1], all_reduce
+            rms = self.rmsnorm_matcher(all_reduce, weight)
+            quant, _ = self.quant_matcher(rms, scale)
+            return quant, all_reduce
 
         def replacement(
             input: torch.Tensor,
-            rmsnorm_result: torch.Tensor,
-            quant_result: torch.Tensor,
             weight: torch.Tensor,
             scale: torch.Tensor,
         ):
             reduce_scatter = self._reduce_scatter(input)
-
-            rmsnorm_result = torch.empty_like(reduce_scatter,
-                                              dtype=rmsnorm_result.dtype)
-            quant_result = torch.empty_like(
-                rmsnorm_result,  # Output of RMSNorm
-                dtype=quant_result.dtype)
-            static_fp8 = self._functional_rmsnorm_then_quant(
-                rmsnorm_result, quant_result, reduce_scatter, weight, scale)
-            all_gather = self._all_gather(static_fp8[1])
+            rms = self.rmsnorm_matcher(reduce_scatter, weight)
+            quant, _ = self.quant_matcher(rms, scale)
+            all_gather = self._all_gather(quant)
 
             return all_gather, reduce_scatter
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
 
 class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
-
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 op: torch._ops.OpOverload):
-        super().__init__(epsilon, dtype, device, quant_op=op)
+    def __init__(self, epsilon: float, dtype: torch.dtype, device: str):
+        super().__init__(epsilon, dtype, device)
+        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
+        self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
     def get_inputs(self):
         mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-
         residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        rms_norm_weights = torch.empty([4, 4],
-                                       device=self.device,
-                                       dtype=self.dtype)
-        result = torch.empty([4, 4], device=self.device, dtype=FP8_DTYPE)
+        rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype)
         scale = torch.empty([1, 1], device=self.device, dtype=torch.float32)
 
-        return [
-            result,
-            residual,
-            mm_1,
-            rms_norm_weights,
-            scale,
-        ]
+        return [residual, mm_1, rms_norm_weights, scale]
 
     def register(self, pm_pass: PatternMatcherPass):
-
         def pattern(
-            result: torch.Tensor,
             residual: torch.Tensor,
             mm_1: torch.Tensor,
             rms_norm_weights: torch.Tensor,
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = self._all_reduce(mm_1)
-            static_fp8, rmsnorm_residual_out = self._functional_fused_add_rmsnorm_then_quant(  # noqa: E501
-                result, all_reduce, residual, rms_norm_weights, scale)
-            return static_fp8[1], rmsnorm_residual_out
+            rms, residual_out = self.rmsnorm_matcher(
+                all_reduce, rms_norm_weights, residual
+            )
+            quant, _ = self.quant_matcher(rms, scale)
+            return quant, residual_out
 
         def replacement(
-            result: torch.Tensor,
             residual: torch.Tensor,
             mm_1: torch.Tensor,
             rms_norm_weights: torch.Tensor,
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
+            # pattern matcher replaces from top-to-bottom,
+            # so residual is still the full size here.
+            # add a temporary slice which will become a noop
+            # once the seqpar pattern with the previous rmsnorm is replaced
             reduce_scatter = self._reduce_scatter(mm_1)
-            quant_result_buf = torch.empty_like(reduce_scatter,
-                                                dtype=result.dtype)
-            static_fp8, rmsnorm_residual_out = self._functional_fused_add_rmsnorm_then_quant(  # noqa: E501
-                quant_result_buf, reduce_scatter, residual, rms_norm_weights,
-                scale)
-            all_gather = self._all_gather(static_fp8[1])
-            return all_gather, rmsnorm_residual_out
-
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
-
-
-class LastAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
-
-    def __init__(self, epsilon: float, dtype: torch.dtype, device: str,
-                 op: torch._ops.OpOverload):
-        super().__init__(epsilon, dtype, device, quant_op=op)
-
-    def get_inputs(self):
-        mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-
-        residual = torch.empty([4, 4], device=self.device, dtype=self.dtype)
-        rms_norm_weights = torch.empty([4, 4],
-                                       device=self.device,
-                                       dtype=self.dtype)
-        result = torch.empty([4, 4], device=self.device, dtype=FP8_DTYPE)
-        scale = torch.empty([1, 1], device=self.device, dtype=torch.float32)
-
-        return [
-            result,
-            residual,
-            mm_1,
-            rms_norm_weights,
-            scale,
-        ]
-
-    def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(
-            result: torch.Tensor,
-            residual: torch.Tensor,
-            mm_1: torch.Tensor,
-            rms_norm_weights: torch.Tensor,
-            scale: torch.Tensor,
-        ) -> tuple[torch.Tensor, torch.Tensor]:
-            all_reduce = self._all_reduce(mm_1)
-            static_fp8, _ = self._functional_fused_add_rmsnorm_then_quant(
-                result, all_reduce, residual, rms_norm_weights, scale)
-            return static_fp8[1]
+            residual = residual[0 : reduce_scatter.size(0), ...]
+            rms, residual_out = self.rmsnorm_matcher(
+                reduce_scatter, rms_norm_weights, residual
+            )
+            quant, _ = self.quant_matcher(rms, scale)
+            all_gather = self._all_gather(quant)
+            # shape of residual changes but that's fine,
+            # next node is already slicing it, now becomes a noop
+            return all_gather, residual_out
 
-        def replacement(
-            result: torch.Tensor,
-            residual: torch.Tensor,
-            mm_1: torch.Tensor,
-            rms_norm_weights: torch.Tensor,
-            scale: torch.Tensor,
-        ) -> tuple[torch.Tensor, torch.Tensor]:
-            reduce_scatter = self._reduce_scatter(mm_1)
-            quant_result_buf = torch.empty_like(reduce_scatter,
-                                                dtype=result.dtype)
-            static_fp8, _ = self._functional_fused_add_rmsnorm_then_quant(
-                quant_result_buf, reduce_scatter, residual, rms_norm_weights,
-                scale)
-            normalized = self._all_gather(static_fp8[1])
-            return normalized
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
 
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
+        pm.register_replacement(
+            get_first_out_wrapper(pattern),
+            get_first_out_wrapper(replacement),
+            self.get_inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
 
 
 class SequenceParallelismPass(VllmPatternMatcherPass):
@@ -435,40 +281,77 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
     GEMM + ReduceScatter and AllGather + GEMM fusions. These fusions can
     significantly reduce communication overhead and improve overall model
     performance.
+
+
+    This pass splits up the residual tensor across TP ranks and hence divides its size.
+    Because the pattern matcher starts at the end of the graph, the replacement
+    contains a slice that temporarily conforms the input residual to the correct size.
+    After all patterns have been matched, we use a NoOpEliminationPass to clean up
+    what have now become no-op slices.
+
+    Note that an older version of the pass did not need this as it operated only on
+    custom rms_norm and fused_rms_norm_add custom ops which did not complain about
+    mismatched shapes during replacement. So this approach has the same assumption that
+    correctness is only maintained if all rms_norm operations are split across ranks.
+
+    Correctness-wise, this is approach strictly better than before - before,
+    the graph was incorrect semantically and shape-wise during the pass.
+    With this approach there's only semantic incorrectness during the pass.
+    Both approaches restore a correct graph once all patterns are matched.
     """
 
     @enable_fake_mode
     def __init__(self, config: VllmConfig):
         super().__init__(config)
 
+        # Used to cleanup redundant views created temporarily
+        # to circumvent residual shape change issues
+        self.noop_cleanup = NoOpEliminationPass(config)
+        self.noop_cleanup.pass_name = f"{self.pass_name}.{self.noop_cleanup.pass_name}"
+
         self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="sequence_parallelism_pass")
+            pass_name="sequence_parallelism_pass"
+        )
 
         for epsilon in [1e-5, 1e-6]:
             # RMSNorm + Static FP8 quantization patterns
-            fp8_quant_op = torch.ops._C.static_scaled_fp8_quant.default
             FirstAllReduceRMSNormStaticFP8Pattern(
-                epsilon, self.model_dtype, self.device,
-                fp8_quant_op).register(self.patterns)
+                epsilon, self.model_dtype, self.device
+            ).register(self.patterns)
             MiddleAllReduceRMSNormStaticFP8Pattern(
-                epsilon, self.model_dtype, self.device,
-                fp8_quant_op).register(self.patterns)
-            LastAllReduceRMSNormStaticFP8Pattern(
-                epsilon, self.model_dtype, self.device,
-                fp8_quant_op).register(self.patterns)
+                epsilon, self.model_dtype, self.device
+            ).register(self.patterns)
 
             # Normal RMSNorm patterns
-            FirstAllReduceRMSNormPattern(epsilon, self.model_dtype,
-                                         self.device).register(self.patterns)
+            FirstAllReduceRMSNormPattern(
+                epsilon, self.model_dtype, self.device
+            ).register(self.patterns)
 
-            MiddleAllReduceRMSNormPattern(epsilon, self.model_dtype,
-                                          self.device).register(self.patterns)
+            MiddleAllReduceRMSNormPattern(
+                epsilon, self.model_dtype, self.device
+            ).register(self.patterns)
 
-            LastAllReduceRMSNormPattern(epsilon, self.model_dtype,
-                                        self.device).register(self.patterns)
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
+    def is_applicable(self, shape: int | None) -> bool:
+        # When sequence parallelism is enabled, the residual tensor from RMSNorm
+        # needs to be split along the sequence dimension. However, this dimension
+        # is symbolic during piecewise compilation, and splitting symbolic shapes
+        # is not supported.
+        #
+        # This pass is therefore only applied when the sequence dimension is
+        # concrete:
+        # 1. In full-graph compilation mode (no Dynamo splitting ops are used).
+        #   For this case we always pad num_tokens to be a multiple of
+        #   tensor_parallel_size, so there's no need to check shape % tp_size == 0.
+        # 2. For specific shape provided during compilation (e.g., from
+        #    `compile_sizes`), which must be divisible by the tensor-parallel
+        #    size.
+        if (
+            not self.compilation_config.splitting_ops
+            or self.compilation_config.use_inductor_graph_partition
+        ):
+            return True
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
 
@@ -476,3 +359,5 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
     def __call__(self, graph: fx.Graph):
         self.matched_count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", self.matched_count)
+        # Clean up reshape nodes
+        self.noop_cleanup(graph)
diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py
index cd3970657522ed6454c4672771f2acb2e435b7dd..1031856cdf008037732c59675b13013aa256229a 100644
--- a/vllm/compilation/torch25_custom_graph_pass.py
+++ b/vllm/compilation/torch25_custom_graph_pass.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -23,7 +23,7 @@ class Torch25CustomGraphPass(ABC):  # noqa (redefinition)
         """
 
     @abstractmethod
-    def uuid(self) -> Optional[Any]:
+    def uuid(self) -> Any | None:
         """
         Return an ID to uniquely identify your custom pass implementation.
         Return None to skip inductor code caching entirely.
@@ -37,6 +37,8 @@ class Torch25CustomGraphPass(ABC):  # noqa (redefinition)
         return self.uuid()
 
     def __setstate__(self, state):
-        raise ValueError("Cannot unpickle CustomGraphPass because pickling"
-                         " is used for cache key uuid. Use torch>=2.6 with"
-                         " native uuid support for custom passes.")
+        raise ValueError(
+            "Cannot unpickle CustomGraphPass because pickling"
+            " is used for cache key uuid. Use torch>=2.6 with"
+            " native uuid support for custom passes."
+        )
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index 837770d181993026162f2ee98d098f0bd2e21413..08721e3ae4a249c99a19ce86ed9ec30fae2b7f45 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -3,14 +3,13 @@
 import functools
 import operator
 import time
-from pathlib import Path
-from typing import ClassVar, Optional
+from dataclasses import dataclass
+from typing import ClassVar
 
 import regex as re
 import torch
 from torch._dynamo.utils import lazy_format_graph_code
-from torch._inductor.pattern_matcher import (PatternMatcherPass,
-                                             PatternPrettyPrinter)
+from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
@@ -20,25 +19,35 @@ from .inductor_pass import InductorPass
 logger = init_logger(__name__)
 
 
+@dataclass
+class InductorCompilationConfig:
+    splitting_ops: list[str] | None = None
+    use_inductor_graph_partition: bool = False
+
+
 class VllmInductorPass(InductorPass):
     """
     An inductor pass with access to vLLM PassConfig.
     It provides timing, logging, and dumping utilities.
     """
-    dump_prefix: ClassVar[Optional[int]] = None
+
+    dump_prefix: ClassVar[int | None] = None
     """Keep track of pass index for debug dump ordering."""
 
     def __init__(self, config: VllmConfig):
+        # Get only the necessary CompilationConfig for the inductor pass, since
+        # full `CompilationConfig` contains pointer to model which is unsafe.
+        self.compilation_config = InductorCompilationConfig(
+            splitting_ops=config.compilation_config.splitting_ops,
+            use_inductor_graph_partition=config.compilation_config.use_inductor_graph_partition,
+        )
         self.pass_config = config.compilation_config.pass_config
-        self.model_dtype = config.model_config.dtype if config.model_config \
-            else None
-        self.device = config.device_config.device if config.device_config \
-            else None
+        self.model_dtype = config.model_config.dtype if config.model_config else None
+        self.device = config.device_config.device if config.device_config else None
         self.pass_name = self.__class__.__name__
 
     @staticmethod
     def time_and_log(call_fn):
-
         @functools.wraps(call_fn)
         def wrapped(self: VllmInductorPass, graph: torch.fx.Graph):
             self.begin()
@@ -52,8 +61,9 @@ class VllmInductorPass(InductorPass):
     def dump_graph(self, graph: torch.fx.Graph, stage: str):
         i = VllmInductorPass.dump_prefix
         i_str = "" if i is None else f".{i}"
-        lazy_format_graph_code(f"post_grad{i_str}.{self.pass_name}.{stage}",
-                               graph.owning_module)
+        lazy_format_graph_code(
+            f"post_grad{i_str}.{self.pass_name}.{stage}", graph.owning_module
+        )
 
     def begin(self):
         self._start_time = time.perf_counter_ns()
@@ -72,11 +82,13 @@ class VllmPatternMatcherPass(VllmInductorPass):
 
     TODO(luka) move more utilities to this pass.
     """
+
     matched_count: int = 0
     """The number of matched patterns in the pass."""
 
     _OP_OVERLOAD_PATTERN: ClassVar[re.Pattern] = re.compile(
-        r"<OpOverload\(op='([^']*)', overload='([^']*)'\)>")
+        r"<OpOverload\(op='([^']*)', overload='([^']*)'\)>"
+    )
 
     def _replace_op_overloads(self, string: str) -> str:
         """Replace <OpOverload(..., ...)> with nicer formulations"""
@@ -96,28 +108,30 @@ class VllmPatternMatcherPass(VllmInductorPass):
 
         TODO(luka): use pattern object to manually produce pattern graph
         """
-        debug_dump_path = config.compilation_config.debug_dump_path
+        debug_dump_path = config.compile_debug_dump_path()
         if not debug_dump_path:
             return
 
-        rank = config.parallel_config.rank
-        debug_dump_path = Path(debug_dump_path) / f"rank_{rank}"
         debug_dump_path.mkdir(parents=True, exist_ok=True)
 
-        from vllm.utils import unique_filepath
+        from vllm.utils.system_utils import unique_filepath
+
         file_path = unique_filepath(
-            lambda i: debug_dump_path / f"patterns.{self.pass_name}.{i}.py")
+            lambda i: debug_dump_path / f"patterns.{self.pass_name}.{i}.py"
+        )
 
         with file_path.open("w") as f:
             print(
-                f'# This file was produced by VllmPatternMatcherPass.'
-                f'dump_patterns for {self.pass_name}.\n'
-                f'# It does its best to produce valid-Python-looking code but'
-                f' please add to dump_patterns if there are any errors.\n\n'
-                f'from torch._higher_order_ops.auto_functionalize import '
-                f'auto_functionalized as auto_functionalized\n'
-                f'from torch._inductor.pattern_matcher import *',
-                file=f)
+                f"# This file was produced by VllmPatternMatcherPass."
+                f"dump_patterns for {self.pass_name}.\n"
+                f"# It does its best to produce valid-Python-looking code but"
+                f" please add to dump_patterns if there are any errors.\n\n"
+                f"from torch._higher_order_ops.auto_functionalize import "
+                f"auto_functionalized as auto_functionalized\n"
+                f"from torch._inductor.pattern_matcher import *\n"
+                f"vllm = torch.ops.vllm",
+                file=f,
+            )
 
             for node, patterns in pm_pass.patterns.items():
                 # fix the operator.getitem repr
@@ -136,18 +150,21 @@ class VllmPatternMatcherPass(VllmInductorPass):
 
                     # Assemble pattern
                     out_node = pp.pretty_print(pattern.pattern)
-                    pattern_repr = "\n".join([f"def pattern_{i}():"] + [
-                        f"{pp.memoized_objs_names[key]} = "
-                        f"{pp.memoized_objs_pp[key]}"
-                        for key in pp.memoized_objs_names
-                    ] + [f"return {out_node}"]).replace("\n", "\n    ")
+                    pattern_repr = "\n".join(
+                        [f"def pattern_{i}():"]
+                        + [
+                            f"{pp.memoized_objs_names[key]} = "
+                            f"{pp.memoized_objs_pp[key]}"
+                            for key in pp.memoized_objs_names
+                        ]
+                        + [f"return {out_node}"]
+                    ).replace("\n", "\n    ")
 
                     pattern_repr = self._replace_op_overloads(pattern_repr)
                     print(f"{pattern_repr}\n", file=f)
 
 
 class PrinterInductorPass(VllmInductorPass):
-
     def __init__(self, name: str, config: VllmConfig):
         super().__init__(config)
         self.name = name
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 930e4d27b410f8a593b2ad15eb1e1854668eabe9..493e57f97f0f4fbededd5d8a2f06416eee53e436 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -6,76 +6,164 @@ import sys
 from abc import abstractmethod
 from contextlib import contextmanager
 from types import CodeType
-from typing import Callable, Optional
 
 import torch
+import torch._C._dynamo.guards
 
-from vllm.config import (CompilationLevel, CUDAGraphMode,
-                         get_current_vllm_config)
+import vllm.envs as envs
+from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
-class TorchCompileWrapperWithCustomDispatcher:
+def _noop_add_global_state_guard(self, *args, **kwargs):
+    """No-op to skip the GLOBAL_STATE guard entirely"""
+    pass
+
+
+def _noop_add_torch_function_mode_stack_guard(self, *args, **kwargs):
+    """No-op to skip the TORCH_FUNCTION_MODE_STACK guard entirely"""
+    pass
+
+
+@contextmanager
+def _compilation_context():
+    """Context manager for compilation settings and patches.
+
+    This manager:
+    1. Sets higher dynamo cache limits for compilation. (Needed for
+        qwen2_5_vl see test_qwen2_5_vl_evs_functionality).
+        Generally a recompilation can happen whenever we use a new
+        backend instance in torch.compile.
+    2. Patches out add_global_state_guard to skip GLOBAL_STATE guards
+    3. Patches out add_torch_function_mode_stack_guard to skip
+        TORCH_FUNCTION_MODE_STACK guards.
+    4. Restores everything when compilation completes
+    """
+    # Save original values
+    original_global_state_guard = (
+        torch._C._dynamo.guards.GuardManager.add_global_state_guard
+    )
+    original_torch_function_mode_stack_guard = (
+        torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard
+    )
+    original_cache_size = torch._dynamo.config.cache_size_limit
+    original_accumulated_cache = torch._dynamo.config.accumulated_cache_size_limit
+
+    try:
+        # Set higher cache limits for compilation
+        torch._dynamo.config.cache_size_limit = 2048
+        torch._dynamo.config.accumulated_cache_size_limit = 8192
+
+        # Patch guard manager
+        torch._C._dynamo.guards.GuardManager.add_global_state_guard = (
+            _noop_add_global_state_guard
+        )
+        torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = (
+            _noop_add_torch_function_mode_stack_guard
+        )
+        yield
+    finally:
+        # Restore original values
+        torch._C._dynamo.guards.GuardManager.add_global_state_guard = (
+            original_global_state_guard
+        )
+        torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = (
+            original_torch_function_mode_stack_guard
+        )
+        torch._dynamo.config.cache_size_limit = original_cache_size
+        torch._dynamo.config.accumulated_cache_size_limit = original_accumulated_cache
+
+
+class TorchCompileWithNoGuardsWrapper:
     """
-    A wrapper class for torch.compile, with a custom dispatch logic.
-    Subclasses should:
-    1. Implement the forward method
-    2. Implement the dispatch logic in the __call__ method
-        It can use `self.compiled_codes` to access the compiled bytecode,
-        and `with self.dispatch_to_code(index):` to dispatch to
-        the compiled code.
-    3. Implement the `__init__` method to determine how to call
-        `torch.compile` over the forward method.
+    A wrapper class for torch.compile, it ensures that all guards are dropped
+    when CompilationMode is not CompilationMode.STOCK_TORCH_COMPILE.
+    When guards are dropped, the first time __call__ is invoked, a single
+    compilation is triggered. Dynamo should never be traced again after that
+    since we drop all guards.
     """
 
-    def __init__(self,
-                 compiled_callable: Optional[Callable] = None,
-                 compilation_level: int = 0):
+    def __init__(self):
+        self.compiled = False
 
         vllm_config = get_current_vllm_config()
         self.vllm_config = vllm_config
-        if compiled_callable is None:
-            # default compilation settings
-            # compiling the forward method
-
-            backend = vllm_config.compilation_config.init_backend(vllm_config)
-            options = None
-            if isinstance(backend, str) and backend == "inductor":
-                options = get_current_vllm_config(
-                ).compilation_config.inductor_compile_config
-
-            compiled_callable = torch.compile(self.forward,
-                                              fullgraph=True,
-                                              backend=backend,
-                                              options=options)
-
-        self.compiled_callable = compiled_callable
-        self.original_code_object = self.__class__.forward.__code__
-        self.compiled_codes: list[CodeType] = []
-        torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
-
-        # read the env var to determine whether to use the custom dispatcher
-        # subclasses can use this to switch between the custom dispatcher
-        # and the default Dynamo guard mechanism.
-        self.use_custom_dispatcher: bool = \
-            compilation_level >= CompilationLevel.DYNAMO_ONCE
+        mode = vllm_config.compilation_config.mode
+        if mode is None:
+            raise RuntimeError("Compilation mode cannot be NO_COMPILATION")
+
+        backend = vllm_config.compilation_config.init_backend(vllm_config)
+        options = {}
+
+        if isinstance(backend, str) and backend == "inductor":
+            options = vllm_config.compilation_config.inductor_compile_config
+
+        if mode != CompilationMode.STOCK_TORCH_COMPILE:
+            # Drop all the guards.
+            options["guard_filter_fn"] = lambda x: [False for _ in x]
+
+        if envs.VLLM_USE_AOT_COMPILE:
+            if hasattr(torch._dynamo.config, "enable_aot_compile"):
+                torch._dynamo.config.enable_aot_compile = True
+            else:
+                msg = "torch._dynamo.config.enable_aot_compile is not "
+                msg += "available. AOT compile is disabled and please "
+                msg += "upgrade PyTorch version to use AOT compile."
+                logger.warning(msg)
+
+        self._compiled_callable = torch.compile(
+            self.forward,
+            fullgraph=True,
+            dynamic=False,
+            backend=backend,
+            options=options,
+        )
+
+        if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE:
+            torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
+            self._compiled_bytecode = None
+
+    def aot_compile(self, *args, **kwargs):
+        if not hasattr(self._compiled_callable, "aot_compile"):
+            raise RuntimeError(
+                "aot_compile is not supported by the current configuration. "
+                + "Please make sure torch.compile is enabled with the latest "
+                + f"version of PyTorch (current using torch: {torch.__version__})"
+            )
+        return self._compiled_callable.aot_compile((args, kwargs))
 
     def __call__(self, *args, **kwargs):
-        """Implement the dispatch logic here, beyond the torch.compile level.
-        NOTE: this function can have additional arguments beyond the forward
-         method, for directly dispatching to the compiled code.
-        """
-        return self.compiled_callable(*args, **kwargs)
+        if envs.VLLM_USE_BYTECODE_HOOK:
+            if (
+                self.vllm_config.compilation_config.mode
+                == CompilationMode.STOCK_TORCH_COMPILE
+            ):
+                return self._compiled_callable(*args, **kwargs)
+
+            if not self._compiled_bytecode:
+                # Make sure a compilation is triggered by clearing dynamo
+                # cache.
+                torch._dynamo.eval_frame.remove_from_cache(self.original_code_object())
+                return self._compiled_callable(*args, **kwargs)
+            else:
+                with self._dispatch_to_compiled_code():
+                    return self.forward(*args, **kwargs)
+        else:
+            with _compilation_context():
+                return self._compiled_callable(*args, **kwargs)
 
     @abstractmethod
-    def forward(self, *args, **kwargs):
-        ...
+    def forward(self, *args, **kwargs): ...
+
+    def original_code_object(self) -> CodeType:
+        """Return the original code object of the forward method."""
+        return self.__class__.forward.__code__
 
     def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
         """Hook to save the compiled bytecode for direct execution."""
-        if old_code is not self.original_code_object:
+        if old_code is not self.original_code_object():
             return
         # code borrowed from https://github.com/thuml/depyf/blob/f4ad79fadee27ea113b4c75202db1eb1a11c0dbc/depyf/explain/enable_debugging.py#L25
         frame = sys._getframe()
@@ -91,46 +179,60 @@ class TorchCompileWrapperWithCustomDispatcher:
         if frame.f_locals["self"] is not self:
             return
 
-        self.compiled_codes.append(new_code)
-        debug_dump_dir = self.vllm_config.compilation_config.debug_dump_path
-        if isinstance(debug_dump_dir, str) and debug_dump_dir != "":
-            rank = self.vllm_config.parallel_config.rank
-            decompiled_file = os.path.join(debug_dump_dir, f"rank_{rank}",
-                                           "transformed_code.py")
-            if not os.path.exists(decompiled_file):
+        self._compiled_bytecode = new_code
+
+        path = self.vllm_config.compile_debug_dump_path()
+        if path:
+            decompiled_file = path / "transformed_code.py"
+            if not decompiled_file.exists():
                 try:
                     # usually the decompilation will succeed for most models,
                     # as we guarantee a full-graph compilation in Dynamo.
                     # but there's no 100% guarantee, since decompliation is
                     # not a reversible process.
                     import depyf
+
                     src = depyf.decompile(new_code)
 
                     with open(decompiled_file, "w") as f:
                         f.write(src)
 
-                    logger.debug("Dynamo transformed code saved to %s",
-                                 decompiled_file)
+                    logger.debug("Dynamo transformed code saved to %s", decompiled_file)
                 except Exception:
                     pass
 
-        if self.vllm_config.compilation_config.cudagraph_mode != \
-            CUDAGraphMode.NONE and "update" in new_code.co_names:
+        if (
+            self.vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and "update" in new_code.co_names
+        ):
             import depyf
+
             src = depyf.decompile(new_code)
-            msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src  # noqa
+            msg = (
+                "Assigning / modifying buffers of nn.Module during forward pass is not "
+                "allowed when using cudagraph inside the compiler because it will "
+                "cause silent errors. Please use eager mode or fix the code. The "
+                "following code contains clues about which buffer is being modified "
+                f"(please search for the usage of the function `update`):\n{src}"
+            )
             raise RuntimeError(msg)
 
     @contextmanager
-    def dispatch_to_code(self, index: int):
-        """Context manager to dispatch to the compiled code.
+    def _dispatch_to_compiled_code(self):
+        # noqa: E501
+        """
+        Context manager to dispatch to internally compiled code for torch<2.8.
         Why does this work? Because Dynamo guarantees that the compiled
         bytecode has exactly the same arguments, cell variables, and free
         variables as the original code. Therefore we can directly switch
         the code object in the function and call it.
 
         See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
-        """ # noqa
-        self.__class__.forward.__code__ = self.compiled_codes[index]
-        yield
-        self.__class__.forward.__code__ = self.original_code_object
+        """  # noqa: E501 line too long
+        original = self.original_code_object()
+        assert self._compiled_bytecode is not None
+        self.__class__.forward.__code__ = self._compiled_bytecode
+        try:
+            yield
+        finally:
+            self.__class__.forward.__code__ = original
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 4189493218809e30b725c2812a89a67798ddf6a1..0aa3ce5f4e4430473f662c8a6a5e4d81ca73bab2 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1,813 +1,102 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# ruff: noqa: F401
-import ast
-import copy
-import hashlib
-import inspect
-import json
-import os
-import textwrap
-from contextlib import contextmanager
-from dataclasses import field, fields, is_dataclass, replace
-from functools import cached_property, lru_cache
-from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar,
-                    Union, cast)
-
-import regex as re
-import torch
-from pydantic import ConfigDict, SkipValidation
-from pydantic.dataclasses import dataclass
-from typing_extensions import runtime_checkable
-
-import vllm.envs as envs
-from vllm import version
-from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
-                               PrefixCachingHashAlgo)
-from vllm.config.compilation import (CompilationConfig, CompilationLevel,
-                                     CUDAGraphMode, PassConfig)
-from vllm.config.device import Device, DeviceConfig
+from vllm.config.cache import CacheConfig
+from vllm.config.compilation import (
+    CompilationConfig,
+    CompilationMode,
+    CUDAGraphMode,
+    PassConfig,
+)
+from vllm.config.device import DeviceConfig
+from vllm.config.ec_transfer import ECTransferConfig
 from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
-from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode,
-                               ModelConfig, ModelDType, ModelImpl,
-                               RunnerOption, TaskOption, TokenizerMode,
-                               iter_architecture_defaults,
-                               try_match_architecture_defaults)
-from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
-                                    MultiModalConfig)
-from vllm.config.observability import DetailedTraceModules, ObservabilityConfig
-from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
-                                  ParallelConfig)
+from vllm.config.model import (
+    ModelConfig,
+    iter_architecture_defaults,
+    try_match_architecture_defaults,
+)
+from vllm.config.multimodal import MultiModalConfig
+from vllm.config.observability import ObservabilityConfig
+from vllm.config.parallel import EPLBConfig, ParallelConfig
 from vllm.config.pooler import PoolerConfig
-from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy
+from vllm.config.scheduler import SchedulerConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.config.speech_to_text import SpeechToTextConfig
 from vllm.config.structured_outputs import StructuredOutputsConfig
-from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field
-from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.transformers_utils.runai_utils import is_runai_obj_uri
-from vllm.utils import random_uuid
-
-if TYPE_CHECKING:
-    from _typeshed import DataclassInstance
-    from transformers.configuration_utils import PretrainedConfig
-
-    from vllm.model_executor.layers.quantization.base_config import (
-        QuantizationConfig)
-else:
-    DataclassInstance = Any
-    PretrainedConfig = Any
-    QuantizationConfig = Any
-    QuantizationMethods = Any
-    BaseModelLoader = Any
-    LogitsProcessor = Any
-
-logger = init_logger(__name__)
-DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance)
-
-
-@runtime_checkable
-class SupportsHash(Protocol):
-
-    def compute_hash(self) -> str:
-        ...
-
-
-class SupportsMetricsInfo(Protocol):
-
-    def metrics_info(self) -> dict[str, str]:
-        ...
-
-@config
-@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
-class VllmConfig:
-    """Dataclass which contains all vllm-related configuration. This
-    simplifies passing around the distinct configurations in the codebase.
-    """
-
-    # TODO: use default_factory once default constructing ModelConfig doesn't
-    # try to download a model
-    model_config: ModelConfig = None  # type: ignore
-    """Model configuration."""
-    cache_config: CacheConfig = field(default_factory=CacheConfig)
-    """Cache configuration."""
-    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
-    """Parallel configuration."""
-    scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
-    """Scheduler configuration."""
-    device_config: DeviceConfig = field(default_factory=DeviceConfig)
-    """Device configuration."""
-    load_config: LoadConfig = field(default_factory=LoadConfig)
-    """Load configuration."""
-    lora_config: Optional[LoRAConfig] = None
-    """LoRA configuration."""
-    speculative_config: Optional[SpeculativeConfig] = None
-    """Speculative decoding configuration."""
-    structured_outputs_config: StructuredOutputsConfig = field(
-        default_factory=StructuredOutputsConfig)
-    """Structured outputs configuration."""
-    observability_config: Optional[ObservabilityConfig] = None
-    """Observability configuration."""
-    quant_config: Optional[QuantizationConfig] = None
-    """Quantization configuration."""
-    compilation_config: CompilationConfig = field(
-        default_factory=CompilationConfig)
-    """`torch.compile` and cudagraph capture configuration for the model.
-
-    As a shorthand, `-O<n>` can be used to directly specify the compilation
-    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
-    Currently, -O <n> and -O=<n> are supported as well but this will likely be
-    removed in favor of clearer -O<n> syntax in the future.
-
-    NOTE: level 0 is the default level without any optimization. level 1 and 2
-    are for internal testing only. level 3 is the recommended level for
-    production, also default in V1.
-
-    You can specify the full compilation config like so:
-    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
-    """
-    kv_transfer_config: Optional[KVTransferConfig] = None
-    """The configurations for distributed KV cache transfer."""
-    kv_events_config: Optional[KVEventsConfig] = None
-    """The configurations for event publishing."""
-    # some opaque config, only used to provide additional information
-    # for the hash computation, mainly used for testing, debugging or out of
-    # tree config registration.
-    additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
-    """Additional config for specified platform. Different platforms may
-    support different configs. Make sure the configs are valid for the platform
-    you are using. Contents must be hashable."""
-    instance_id: str = ""
-    """The ID of the vLLM instance."""
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        factors: list[Any] = []
-
-        # summarize vllm config
-        vllm_factors: list[Any] = []
-        from vllm import __version__
-        vllm_factors.append(__version__)
-        vllm_factors.append(envs.VLLM_USE_V1)
-        if self.model_config:
-            vllm_factors.append(self.model_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.cache_config:
-            vllm_factors.append(self.cache_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.parallel_config:
-            vllm_factors.append(self.parallel_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.scheduler_config:
-            vllm_factors.append(self.scheduler_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.device_config:
-            vllm_factors.append(self.device_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.load_config:
-            vllm_factors.append(self.load_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.lora_config:
-            vllm_factors.append(self.lora_config.compute_hash())
-            # LoRA creates static buffers based on max_num_batched_tokens.
-            # The tensor sizes and strides get captured in the torch.compile
-            # graph explicitly.
-            vllm_factors.append(
-                str(self.scheduler_config.max_num_batched_tokens))
-        else:
-            vllm_factors.append("None")
-        if self.speculative_config:
-            vllm_factors.append(self.speculative_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.structured_outputs_config:
-            vllm_factors.append(self.structured_outputs_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.observability_config:
-            vllm_factors.append(self.observability_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.quant_config:
-            pass  # should be captured by model_config.quantization
-        if self.compilation_config:
-            vllm_factors.append(self.compilation_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.kv_transfer_config:
-            vllm_factors.append(self.kv_transfer_config.compute_hash())
-        else:
-            vllm_factors.append("None")
-        if self.additional_config:
-            if isinstance(additional_config := self.additional_config, dict):
-                additional_config_hash = hashlib.md5(
-                    json.dumps(additional_config, sort_keys=True).encode(),
-                    usedforsecurity=False,
-                ).hexdigest()
-            else:
-                additional_config_hash = additional_config.compute_hash()
-            vllm_factors.append(additional_config_hash)
-        else:
-            vllm_factors.append("None")
-        factors.append(vllm_factors)
-
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()[:10]
-        return hash_str
-
-    def pad_for_cudagraph(self, batch_size: int) -> int:
-        # if batch_size > self.compilation_config.max_capture_size,
-        # it should raise an IndexError.
-        # the caller should make sure the batch_size is within the range,
-        # i.e., batch_size <= self.compilation_config.max_capture_size
-        return self.compilation_config.bs_to_padded_graph_size[batch_size]
-
-    @staticmethod
-    def _get_quantization_config(
-            model_config: ModelConfig,
-            load_config: LoadConfig) -> Optional[QuantizationConfig]:
-        """Get the quantization config."""
-        from vllm.platforms import current_platform
-        if model_config.quantization is not None:
-            from vllm.model_executor.model_loader.weight_utils import (
-                get_quant_config)
-            quant_config = get_quant_config(model_config, load_config)
-            capability_tuple = current_platform.get_device_capability()
-
-            if capability_tuple is not None:
-                capability = capability_tuple.to_int()
-                if capability < quant_config.get_min_capability():
-                    raise ValueError(
-                        f"The quantization method {model_config.quantization} "
-                        "is not supported for the current GPU. Minimum "
-                        f"capability: {quant_config.get_min_capability()}. "
-                        f"Current capability: {capability}.")
-            supported_dtypes = quant_config.get_supported_act_dtypes()
-            if model_config.dtype not in supported_dtypes:
-                raise ValueError(
-                    f"{model_config.dtype} is not supported for quantization "
-                    f"method {model_config.quantization}. Supported dtypes: "
-                    f"{supported_dtypes}")
-            quant_config.maybe_update_config(model_config.model)
-            return quant_config
-        return None
-
-    @staticmethod
-    def get_quantization_config(
-            model_config: ModelConfig,
-            load_config: LoadConfig) -> Optional[QuantizationConfig]:
-        import copy
-
-        # For some reason, the _ version of this modifies the model_config
-        # object, so using deepcopy to avoid this problem.
-        return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
-                                                   load_config)
-
-    def with_hf_config(
-        self,
-        hf_config: PretrainedConfig,
-        architectures: Optional[list[str]] = None,
-    ) -> "VllmConfig":
-        if architectures is not None:
-            hf_config = copy.deepcopy(hf_config)
-            hf_config.architectures = architectures
-
-        model_config = copy.deepcopy(self.model_config)
-        model_config.hf_config = hf_config
-
-        return replace(self, model_config=model_config)
-
-    def __post_init__(self):
-        """Verify configs are valid & consistent with each other.
-        """
-
-        self.try_verify_and_update_config()
-
-        if self.model_config is not None:
-            self.model_config.verify_with_parallel_config(self.parallel_config)
-            self.model_config.verify_dual_chunk_attention_config(
-                self.load_config)
-
-        self.cache_config.verify_with_parallel_config(self.parallel_config)
-
-        if self.lora_config is not None:
-            self.lora_config.verify_with_cache_config(self.cache_config)
-            self.lora_config.verify_with_model_config(self.model_config)
-
-        if self.quant_config is None and self.model_config is not None:
-            self.quant_config = VllmConfig._get_quantization_config(
-                self.model_config, self.load_config)
-
-        from vllm.platforms import current_platform
-        if self.model_config is not None and \
-            self.scheduler_config.chunked_prefill_enabled and \
-            self.model_config.dtype == torch.float32 and \
-            current_platform.get_device_capability() == (7, 5):
-            logger.warning_once(
-                "Turing devices tensor cores do not support float32 matmul. "
-                "To workaround this limitation, vLLM will set 'ieee' input "
-                "precision for chunked prefill triton kernels.")
-
-        # If the user does not explicitly set a compilation level, then
-        # we use the default level. The default level depends on other
-        # settings (see the below code).
-        if self.compilation_config.level is None:
-            if envs.VLLM_USE_V1:
-                if (self.model_config is not None
-                        and not self.model_config.enforce_eager):
-                    self.compilation_config.level = CompilationLevel.PIECEWISE
-                else:
-                    self.compilation_config.level = \
-                            CompilationLevel.NO_COMPILATION
-
-            else:
-                # NB: Passing both --enforce-eager and a compilation level
-                # in V0 means the compilation level wins out.
-                self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
-        # async tp is built on top of sequence parallelism
-        # and requires it to be enabled.
-        if self.compilation_config.pass_config.enable_async_tp:
-            self.compilation_config.pass_config.enable_sequence_parallelism = \
-                True
-        if self.compilation_config.pass_config.enable_sequence_parallelism:
-            self.compilation_config.custom_ops.append("+rms_norm")
-
-        if current_platform.support_static_graph_mode():
-            # if cudagraph_mode is not explicitly set by users, set default
-            # value
-            if self.compilation_config.cudagraph_mode is None:
-                if envs.VLLM_USE_V1 and self.compilation_config.level \
-                    == CompilationLevel.PIECEWISE:
-                    # default to full and piecewise for most models
-                    self.compilation_config.cudagraph_mode = \
-                        CUDAGraphMode.FULL_AND_PIECEWISE
-
-                    # pooling models and encoder-decoder models
-                    # do not support full cudagraphs
-                    if self.model_config is not None and \
-                        (self.model_config.pooler_config is not None
-                         or self.model_config.is_encoder_decoder):
-                        self.compilation_config.cudagraph_mode = \
-                            CUDAGraphMode.PIECEWISE
-                else:
-                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
-            # disable cudagraph when enforce eager execution
-            if self.model_config is not None and \
-                    self.model_config.enforce_eager:
-                logger.info("Cudagraph is disabled under eager mode")
-                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-            elif envs.VLLM_USE_V1:
-                self.compilation_config.cudagraph_num_of_warmups = 1
-
-            self._set_cudagraph_sizes()
-        else:
-            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
-        if self.cache_config.kv_sharing_fast_prefill:
-
-            if self.speculative_config is not None and \
-                self.speculative_config.use_eagle():
-                raise NotImplementedError(
-                    "Fast prefill optimization for KV sharing is not "
-                    "compatible with EAGLE as EAGLE requires correct logits "
-                    "for all tokens while fast prefill gives incorrect logits "
-                    "for prompt tokens.")
-
-            logger.warning_once(
-                "--kv-sharing-fast-prefill requires changes on model side for "
-                "correctness and to realize prefill savings. ")
-
-        disable_chunked_prefill_reasons: list[str] = []
-
-        if self.model_config:
-            if self.model_config.pooler_config:
-                pooling_type = self.model_config.pooler_config.pooling_type
-                if pooling_type is None or pooling_type.lower() != "last":
-                    disable_chunked_prefill_reasons.append(
-                        "Only \"last\" pooling supports chunked "
-                        "prefill and prefix caching; disabling both.")
-                if not getattr(self.model_config.hf_config, "is_causal", True):
-                    disable_chunked_prefill_reasons.append(
-                        "Only models using causal attention supports chunked "
-                        "prefill and prefix caching; disabling both.")
-            elif self.model_config.is_encoder_decoder:
-                self.scheduler_config.max_num_encoder_input_tokens = \
-                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
-                logger.debug(
-                    "Encoder-decoder model detected: setting "
-                    "`max_num_encoder_input_tokens` to encoder length (%s)",
-                    self.scheduler_config.max_num_encoder_input_tokens)
-                self.scheduler_config.disable_chunked_mm_input = True
-                disable_chunked_prefill_reasons.append(
-                    "Encoder-decoder models do not support chunked prefill nor"
-                    " prefix caching; disabling both.")
-                if (self.model_config.architecture
-                        == "WhisperForConditionalGeneration"
-                        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
-                        != "spawn"):
-                    logger.warning(
-                        "Whisper is known to have issues with "
-                        "forked workers. If startup is hanging, "
-                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
-                        "to 'spawn'.")
-
-        if disable_chunked_prefill_reasons:
-            for reason in disable_chunked_prefill_reasons:
-                logger.info(reason)
-            self.scheduler_config.chunked_prefill_enabled = False
-            self.scheduler_config.long_prefill_token_threshold = 0
-
-            if self.cache_config is not None:
-                self.cache_config.enable_prefix_caching = False
-
-        if (self.kv_events_config is not None
-                and self.kv_events_config.enable_kv_cache_events
-                and not self.cache_config.enable_prefix_caching):
-            logger.warning(
-                "KV cache events are on, but prefix caching is not enabled."
-                "Use --enable-prefix-caching to enable.")
-        if (self.kv_events_config is not None
-                and self.kv_events_config.publisher != "null"
-                and not self.kv_events_config.enable_kv_cache_events):
-            logger.warning("KV cache events are disabled,"
-                           "but the scheduler is configured to publish them."
-                           "Modify KVEventsConfig.enable_kv_cache_events"
-                           "to True to enable.")
-        current_platform.check_and_update_config(self)
-
-        # final check of cudagraph mode after platform-specific update
-        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
-            if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
-                and self.model_config is not None and \
-                not self.model_config.disable_cascade_attn:
-                logger.info("CUDAGraphMode.FULL is not supported with "
-                            "cascade attention currently. Disabling cascade"
-                            "attention.")
-                self.model_config.disable_cascade_attn = True
-
-            if self.compilation_config.cudagraph_mode\
-                .requires_piecewise_compilation():
-                assert self.compilation_config.level == \
-                    CompilationLevel.PIECEWISE, \
-                    "Compilation level should be CompilationLevel.PIECEWISE "\
-                    "when cudagraph_mode piecewise cudagraphs is used, "\
-                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
-
-        if self.parallel_config.enable_dbo:
-            a2a_backend = envs.VLLM_ALL2ALL_BACKEND
-            assert a2a_backend in \
-                ["deepep_low_latency", "deepep_high_throughput"], \
-            "Microbatching currently only supports the deepep_low_latency and "\
-            f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
-            "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
-            "variable to deepep_low_latency or deepep_high_throughput and "\
-            "install the DeepEP kernels."
-
-        if not self.instance_id:
-            self.instance_id = random_uuid()[:5]
-
-        # Do this after all the updates to compilation_config.level
-        if envs.VLLM_USE_V1 and \
-            self.compilation_config.level == CompilationLevel.PIECEWISE:
-            self.compilation_config.set_splitting_ops_for_v1()
-
-        if (envs.VLLM_USE_V1
-                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
-            # logger should only print warning message for hybrid models. As we
-            # can't know whether the model is hybrid or not now, so we don't log
-            # warning message here and will log it later.
-            if not current_platform.support_hybrid_kv_cache():
-                # Hybrid KV cache manager is not supported on non-GPU platforms.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
-            if self.kv_transfer_config is not None:
-                # Hybrid KV cache manager is not compatible with KV transfer.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
-            if self.kv_events_config is not None:
-                # Hybrid KV cache manager is not compatible with KV events.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
-            if self.model_config is not None and \
-                self.model_config.attention_chunk_size is not None:
-                if self.speculative_config is not None and \
-                    self.speculative_config.use_eagle():
-                    # Hybrid KV cache manager is not yet supported with chunked
-                    # local attention + eagle.
-                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
-                elif \
-                    not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
-                    logger.warning(
-                        "There is a latency regression when using chunked local"
-                        " attention with the hybrid KV cache manager. Disabling"
-                        " it, by default. To enable it, set the environment "
-                        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
-                    )
-                    # Hybrid KV cache manager is not yet supported with chunked
-                    # local attention.
-                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
-
-    def update_sizes_for_sequence_parallelism(self,
-                                              possible_sizes: list) -> list:
-        # remove the sizes that not multiple of tp_size when
-        # enable sequence parallelism
-        removed_sizes = [
-            size for size in possible_sizes
-            if size % self.parallel_config.tensor_parallel_size != 0
-        ]
-        if removed_sizes:
-            logger.warning(
-                "Batch sizes %s are removed because they are not "
-                "multiple of tp_size %d when "
-                "sequence parallelism is enabled", removed_sizes,
-                self.parallel_config.tensor_parallel_size)
-
-        return [
-            size for size in possible_sizes
-            if size % self.parallel_config.tensor_parallel_size == 0
-        ]
-
-    def _set_cudagraph_sizes(self):
-        """
-        vLLM defines the default candidate list of batch sizes for CUDA graph
-        capture as:
-
-        ```python
-        max_graph_size = min(max_num_seqs * 2, 512)
-        # 1, 2, 4, then multiples of 8 up to max_graph_size
-        cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
-
-        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
-        will be the final sizes to capture cudagraph (in descending order).
-
-        These sizes are used to capture and reuse CUDA graphs for
-        performance-critical paths (e.g., decoding). Capturing enables
-        significantly faster kernel dispatch by avoiding Python overhead. The
-        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
-        most GPUs), which controls the total allowed number of tokens in a
-        batch. Since each sequence may have a variable number of tokens, the
-        maximum usable batch size will depend on actual sequence lengths.
-
-        Example:
-            With `max_num_batched_tokens = 8192`, and typical sequences
-            averaging ~32 tokens, most practical batch sizes fall below 256.
-            However, the system will still allow capture sizes up to 512 if
-            shape and memory permit.
-
-        Note:
-            If users explicitly specify cudagraph capture sizes in the
-            compilation config, those will override this default logic.
-            At runtime:
-
-            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
-            padded CUDA graph will be used.
-            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
-            not be used.
-        """
-
-        # calculate the default `batch_size_capture_list`
-        batch_size_capture_list = []
-        if self.model_config is not None and \
-            not self.model_config.enforce_eager:
-            cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
-            if len(cuda_graph_sizes) == 1:
-                batch_size_capture_list = [1, 2, 4] + [
-                    i for i in range(8, cuda_graph_sizes[0] + 1, 8)
-                ]
-            elif len(cuda_graph_sizes) > 1:
-                batch_size_capture_list = sorted(cuda_graph_sizes)
-            else:
-                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
-            if self.parallel_config.tensor_parallel_size > 1 and \
-                self.compilation_config.pass_config.enable_sequence_parallelism:
-                batch_size_capture_list = \
-                    self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
-            max_num_tokens = self.scheduler_config.max_num_batched_tokens
-            batch_size_capture_list = [
-                size for size in batch_size_capture_list
-                if size <= max_num_tokens
-            ]
-
-        self.compilation_config.init_with_cudagraph_sizes(
-            batch_size_capture_list)
-
-    def recalculate_max_model_len(self, max_model_len: int):
-        # Can only be called in try_verify_and_update_config
-        model_config = self.model_config
-        max_model_len = model_config.get_and_verify_max_len(max_model_len)
-        self.model_config.max_model_len = max_model_len
-        self.scheduler_config.max_model_len = max_model_len
-
-    def try_verify_and_update_config(self):
-        if self.model_config is None:
-            return
-
-        # Avoid running try_verify_and_update_config multiple times
-        if getattr(self.model_config, "config_updated", False):
-            return
-        self.model_config.config_updated = True
-
-        architecture = self.model_config.architecture
-        if architecture is None:
-            return
-
-        from vllm.model_executor.models.config import (
-            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
-        cls = MODELS_CONFIG_MAP.get(architecture, None)
-        if cls is not None:
-            cls.verify_and_update_config(self)
-
-        if self.model_config.is_hybrid:
-            HybridAttentionMambaModelConfig.verify_and_update_config(self)
-
-        if self.model_config.convert_type == "classify":
-            # Maybe convert ForCausalLM into ForSequenceClassification model.
-            from vllm.model_executor.models.adapters import (
-                SequenceClassificationConfig)
-            SequenceClassificationConfig.verify_and_update_config(self)
-
-        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
-                self.model_config.model_weights):
-            if self.load_config.load_format == "auto":
-                logger.info("Detected Run:ai model config. "
-                            "Overriding `load_format` to 'runai_streamer'")
-                self.load_config.load_format = "runai_streamer"
-            elif self.load_config.load_format != "runai_streamer":
-                raise ValueError(f"To load a model from S3, 'load_format' "
-                                 f"must be 'runai_streamer', "
-                                 f"but got '{self.load_config.load_format}'. "
-                                 f"Model: {self.model_config.model}")
-
-    def __str__(self):
-        return (
-            f"model={self.model_config.model!r}, "
-            f"speculative_config={self.speculative_config!r}, "
-            f"tokenizer={self.model_config.tokenizer!r}, "
-            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
-            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
-            f"revision={self.model_config.revision}, "
-            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
-            f"trust_remote_code={self.model_config.trust_remote_code}, "
-            f"dtype={self.model_config.dtype}, "
-            f"max_seq_len={self.model_config.max_model_len}, "
-            f"download_dir={self.load_config.download_dir!r}, "
-            f"load_format={self.load_config.load_format}, "
-            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
-            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
-            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
-            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
-            f"quantization={self.model_config.quantization}, "
-            f"enforce_eager={self.model_config.enforce_eager}, "
-            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
-            f"device_config={self.device_config.device}, "
-            f"structured_outputs_config={self.structured_outputs_config!r}, "
-            f"observability_config={self.observability_config!r}, "
-            f"seed={self.model_config.seed}, "
-            f"served_model_name={self.model_config.served_model_name}, "
-            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
-            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
-            f"pooler_config={self.model_config.pooler_config!r}, "
-            f"compilation_config={self.compilation_config!r}")
-
-
-_current_vllm_config: Optional[VllmConfig] = None
-_current_prefix: Optional[str] = None
-
-
-@contextmanager
-def set_current_vllm_config(vllm_config: VllmConfig,
-                            check_compile=False,
-                            prefix: Optional[str] = None):
-    """
-    Temporarily set the current vLLM config.
-    Used during model initialization.
-    We save the current vLLM config in a global variable,
-    so that all modules can access it, e.g. custom ops
-    can access the vLLM config to determine how to dispatch.
-    """
-    global _current_vllm_config, _current_prefix
-    old_vllm_config = _current_vllm_config
-    old_prefix = _current_prefix
-    from vllm.compilation.counter import compilation_counter
-    num_models_seen = compilation_counter.num_models_seen
-    try:
-        _current_vllm_config = vllm_config
-        _current_prefix = prefix
-        yield
-    except Exception:
-        raise
-    else:
-        if check_compile:
-            vllm_config.compilation_config.custom_op_log_check()
-
-        if check_compile and \
-            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
-            and compilation_counter.num_models_seen == num_models_seen:
-            # If the model supports compilation,
-            # compilation_counter.num_models_seen should be increased
-            # by at least 1.
-            # If it is not increased, it means the model does not support
-            # compilation (does not have @support_torch_compile decorator).
-            logger.warning(
-                "`torch.compile` is turned on, but the model %s"
-                " does not support it. Please open an issue on GitHub"
-                " if you want it to be supported.",
-                vllm_config.model_config.model)
-    finally:
-        _current_vllm_config = old_vllm_config
-        _current_prefix = old_prefix
-        # Clear the compilation config cache when context changes
-        get_cached_compilation_config.cache_clear()
-
-
-@lru_cache(maxsize=1)
-def get_cached_compilation_config():
-    """Cache config to avoid repeated calls to get_current_vllm_config()"""
-    return get_current_vllm_config().compilation_config
-
-
-def get_current_vllm_config() -> VllmConfig:
-    if _current_vllm_config is None:
-        # in ci, usually when we test custom ops/modules directly,
-        # we don't set the vllm config. In that case, we set a default
-        # config.
-        logger.warning("Current vLLM config is not set.")
-        from vllm.config import VllmConfig
-        return VllmConfig()
-    return _current_vllm_config
-
-
-def get_current_model_prefix() -> str:
-    """
-    Get the prefix of the model that's currently being initialized.
-    """
-    assert _current_prefix is not None, \
-        "Current model prefix is not set. "
-    return _current_prefix
-
-
-T = TypeVar("T")
-
-
-def get_layers_from_vllm_config(
-        vllm_config: VllmConfig,
-        layer_type: type[T],
-        layer_names: Optional[list[str]] = None) -> dict[str, T]:
-    """
-    Get layers from the vLLM config.
-
-    Args:
-        vllm_config: The vLLM config.
-        layer_type: The type of the layer to get.
-        layer_names: The names of the layers to get. If None, return all layers.
-    """
-
-    if layer_names is None:
-        layer_names = list(
-            vllm_config.compilation_config.static_forward_context.keys())
-
-    forward_context = vllm_config.compilation_config.static_forward_context
-
-    return {
-        layer_name: forward_context[layer_name]
-        for layer_name in layer_names
-        if isinstance(forward_context[layer_name], layer_type)
-    }
-
-
-def update_config(config: DataclassInstanceT,
-                  overrides: dict[str, Any]) -> DataclassInstanceT:
-    processed_overrides = {}
-    for field_name, value in overrides.items():
-        assert hasattr(
-            config, field_name), f"{type(config)} has no field `{field_name}`"
-        current_value = getattr(config, field_name)
-        if is_dataclass(current_value) and not is_dataclass(value):
-            assert isinstance(value, dict), (
-                f"Overrides to {type(config)}.{field_name} must be a dict"
-                f"  or {type(current_value)}, but got {type(value)}")
-            value = update_config(
-                current_value,  # type: ignore[type-var]
-                value)
-        processed_overrides[field_name] = value
-    return replace(config, **processed_overrides)
+from vllm.config.utils import (
+    ConfigType,
+    SupportsMetricsInfo,
+    config,
+    get_attr_docs,
+    is_init_field,
+    update_config,
+)
+from vllm.config.vllm import (
+    VllmConfig,
+    get_cached_compilation_config,
+    get_current_vllm_config,
+    get_layers_from_vllm_config,
+    set_current_vllm_config,
+)
+
+# __all__ should only contain classes and functions.
+# Types and globals should be imported from their respective modules.
+__all__ = [
+    # From vllm.config.cache
+    "CacheConfig",
+    # From vllm.config.compilation
+    "CompilationConfig",
+    "CompilationMode",
+    "CUDAGraphMode",
+    "PassConfig",
+    # From vllm.config.device
+    "DeviceConfig",
+    # From vllm.config.ec_transfer
+    "ECTransferConfig",
+    # From vllm.config.kv_events
+    "KVEventsConfig",
+    # From vllm.config.kv_transfer
+    "KVTransferConfig",
+    # From vllm.config.load
+    "LoadConfig",
+    # From vllm.config.lora
+    "LoRAConfig",
+    # From vllm.config.model
+    "ModelConfig",
+    "iter_architecture_defaults",
+    "try_match_architecture_defaults",
+    # From vllm.config.multimodal
+    "MultiModalConfig",
+    # From vllm.config.observability
+    "ObservabilityConfig",
+    # From vllm.config.parallel
+    "EPLBConfig",
+    "ParallelConfig",
+    # From vllm.config.pooler
+    "PoolerConfig",
+    # From vllm.config.scheduler
+    "SchedulerConfig",
+    # From vllm.config.speculative
+    "SpeculativeConfig",
+    # From vllm.config.speech_to_text
+    "SpeechToTextConfig",
+    # From vllm.config.structured_outputs
+    "StructuredOutputsConfig",
+    # From vllm.config.utils
+    "ConfigType",
+    "SupportsMetricsInfo",
+    "config",
+    "get_attr_docs",
+    "is_init_field",
+    "update_config",
+    # From vllm.config.vllm
+    "VllmConfig",
+    "get_cached_compilation_config",
+    "get_current_vllm_config",
+    "set_current_vllm_config",
+    "get_layers_from_vllm_config",
+]
\ No newline at end of file
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 58770649a8af8ff554b2eb83d95e0023cc4541e3..864cf1be81b201f58209e76de66a7de002de5791 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -3,16 +3,15 @@
 
 import hashlib
 from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal, Optional, get_args
+from typing import TYPE_CHECKING, Any, Literal
 
-from pydantic import SkipValidation, model_validator
+from pydantic import Field, SkipValidation, field_validator
 from pydantic.dataclasses import dataclass
-from typing_extensions import Self
 
-import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, get_cpu_memory
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import get_cpu_memory
 
 if TYPE_CHECKING:
     from vllm.config.parallel import ParallelConfig
@@ -21,11 +20,19 @@ else:
 
 logger = init_logger(__name__)
 
-BlockSize = Literal[1, 8, 16, 32, 64, 128]
-CacheDType = Literal["auto", "bfloat16", "fp8", "fp8_e4m3", "fp8_e5m2",
-                     "fp8_inc"]
+BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
+CacheDType = Literal[
+    "auto",
+    "bfloat16",
+    "fp8",
+    "fp8_e4m3",
+    "fp8_e5m2",
+    "fp8_inc",
+    "fp8_ds_mla",
+]
 MambaDType = Literal["auto", "float32"]
 PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
+KVOffloadingBackend = Literal["native", "lmcache"]
 
 
 @config
@@ -40,7 +47,7 @@ class CacheConfig:
     This config has no static default. If left unspecified by the user, it will
     be set in `Platform.check_and_update_config()` based on the current
     platform."""
-    gpu_memory_utilization: float = 0.9
+    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
     utilization. If unspecified, will use the default value of 0.9. This is a
@@ -48,7 +55,7 @@ class CacheConfig:
     not matter if you have another vLLM instance running on the same GPU. For
     example, if you have two vLLM instances running on the same GPU, you can
     set the GPU memory utilization to 0.5 for each instance."""
-    swap_space: float = 4
+    swap_space: float = Field(default=4, ge=0)
     """Size of the CPU swap space per GPU (in GiB)."""
     cache_dtype: CacheDType = "auto"
     """Data type for kv cache storage. If "auto", will use model data type.
@@ -61,20 +68,20 @@ class CacheConfig:
     is_attention_free: bool = False
     """Whether the model is attention-free. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
-    num_gpu_blocks_override: Optional[int] = None
+    num_gpu_blocks_override: int | None = None
     """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
     if specified. Does nothing if `None`. Used for testing preemption."""
-    sliding_window: Optional[int] = None
+    sliding_window: int | None = None
     """Sliding window size for the KV cache. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
-    enable_prefix_caching: Optional[bool] = None
+    enable_prefix_caching: bool | None = None
     """Whether to enable prefix caching. Enabled by default for V1."""
     prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
     """Set the hash algorithm for prefix caching:\n
     - "sha256" uses Pickle for object serialization before hashing.\n
     - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
     serializes objects using canonical CBOR and hashes them with SHA-256."""
-    cpu_offload_gb: float = 0
+    cpu_offload_gb: float = Field(default=0, ge=0)
     """The space in GiB to offload to CPU, per GPU. Default is 0, which means
     no offloading. Intuitively, this argument can be seen as a virtual way to
     increase the GPU memory size. For example, if you have one 24 GB GPU and
@@ -87,12 +94,15 @@ class CacheConfig:
     """This enables dynamic calculation of `k_scale` and `v_scale` when
     kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
     checkpoint if available. Otherwise, the scales will default to 1.0."""
-    cpu_kvcache_space_bytes: Optional[int] = None
+    cpu_kvcache_space_bytes: int | None = None
     """(CPU backend only) CPU key-value cache space."""
-    mamba_page_size_padded: Optional[int] = None
+    mamba_page_size_padded: int | None = None
     """ Optional override for mamba page size; used by hybrid mamba/attention
     models to ensure exact alignment with attention page size."""
-
+    mamba_block_size: int | None = Field(default=None, gt=0)
+    """Size of a contiguous cache block in number of tokens for mamba cache.
+    Can be set only when prefix caching is enabled.
+    Value must be a multiple of 8 to align with causal_conv1d kernel."""
     mamba_cache_dtype: MambaDType = "auto"
     """The data type to use for the Mamba cache (both the conv as well as the
     ssm state). If set to 'auto', the data type will be inferred from the model
@@ -103,9 +113,9 @@ class CacheConfig:
     for the ssm state will be determined by mamba_cache_dtype."""
 
     # Will be set after profiling.
-    num_gpu_blocks: Optional[int] = field(default=None, init=False)
+    num_gpu_blocks: int | None = field(default=None, init=False)
     """The number of blocks to allocate for GPU memory."""
-    num_cpu_blocks: Optional[int] = field(default=None, init=False)
+    num_cpu_blocks: int | None = field(default=None, init=False)
     """The number of blocks to allocate for CPU memory."""
 
     kv_sharing_fast_prefill: bool = False
@@ -118,15 +128,26 @@ class CacheConfig:
     necessary for implementing this optimization in some models (e.g. Gemma3n)
     """
 
-    kv_cache_memory_bytes: Optional[int] = None
+    kv_cache_memory_bytes: int | None = None
     """Size of KV Cache per GPU in bytes. By default, this is set to None
     and vllm can automatically infer the kv cache size based on
     gpu_memory_utilization. However, users may want to manually specify
     the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
     control of how much memory gets used when compared with using
-    gpu_memory_memory_utilization. Note that kv_cache_memory_bytes
+    gpu_memory_utilization. Note that kv_cache_memory_bytes
     (when not-None) ignores gpu_memory_utilization"""
 
+    kv_offloading_size: float | None = None
+    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
+    the total buffer size summed across all TP ranks. By default, this is set
+    to None, which means no KV offloading is enabled. When set with
+    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
+
+    kv_offloading_backend: KVOffloadingBackend | None = None
+    """The backend to use for KV cache offloading. Supported backends include
+    'native' (vLLM native CPU offloading), 'lmcache' This option must be used 
+    together with kv_offloading_size."""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -144,76 +165,42 @@ class CacheConfig:
         factors.append(self.mamba_cache_dtype)
         factors.append(self.mamba_ssm_cache_dtype)
         # `cpu_offload_gb` does not use `torch.compile` yet.
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self) -> None:
-        self.swap_space_bytes = self.swap_space * GiB_bytes
-
-        self._verify_cache_dtype()
-        self._verify_prefix_caching()
-
     def metrics_info(self):
         # convert cache_config to dict(key: str, value: str) for prometheus
         # metrics info
         return {key: str(value) for key, value in self.__dict__.items()}
 
-    @model_validator(mode='after')
-    def _verify_args(self) -> Self:
-        if self.cpu_offload_gb < 0:
-            raise ValueError("CPU offload space must be non-negative"
-                             f", but got {self.cpu_offload_gb}")
-
-        if self.gpu_memory_utilization > 1.0:
-            raise ValueError(
-                "GPU memory utilization must be less than 1.0. Got "
-                f"{self.gpu_memory_utilization}.")
-
-        return self
-
-    def _verify_cache_dtype(self) -> None:
-        if self.cache_dtype == "auto":
-            pass
-        elif self.cache_dtype in get_args(CacheDType):
-            if self.cache_dtype.startswith("fp8"):
-                logger.info(
-                    "Using fp8 data type to store kv cache. It reduces the GPU "
-                    "memory footprint and boosts the performance. "
-                    "Meanwhile, it may cause accuracy drop without a proper "
-                    "scaling factor.")
-        else:
-            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
-
-    def _verify_prefix_caching(self) -> None:
-        if not self.enable_prefix_caching:
-            return
-
-        if self.sliding_window is not None and not envs.VLLM_USE_V1:
-            raise NotImplementedError(
-                "Prefix caching is not supported with sliding window. "
-                "Run with --disable-sliding-window to use prefix caching.")
-
-        if (self.enable_prefix_caching and self.prefix_caching_hash_algo
-                not in get_args(PrefixCachingHashAlgo)):
-            raise ValueError(
-                "Unknown prefix caching hash algorithm: "
-                f"{self.prefix_caching_hash_algo}. Must be one of "
-                f"{get_args(PrefixCachingHashAlgo)}.")
+    @field_validator("cache_dtype", mode="after")
+    @classmethod
+    def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
+        if cache_dtype.startswith("fp8"):
+            logger.info(
+                "Using fp8 data type to store kv cache. It reduces the GPU "
+                "memory footprint and boosts the performance. "
+                "Meanwhile, it may cause accuracy drop without a proper "
+                "scaling factor."
+            )
+        return cache_dtype
 
     def verify_with_parallel_config(
         self,
         parallel_config: ParallelConfig,
     ) -> None:
+        swap_space_bytes = self.swap_space * GiB_bytes
         total_cpu_memory = get_cpu_memory()
         # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
         # group are in the same node. However, the GPUs may span multiple nodes.
         num_gpus_per_node = parallel_config.tensor_parallel_size
-        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
+        cpu_memory_usage = swap_space_bytes * num_gpus_per_node
 
-        msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
-               f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
-               "is allocated for the swap space.")
+        msg = (
+            f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
+            f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
+            "is allocated for the swap space."
+        )
         if cpu_memory_usage > 0.7 * total_cpu_memory:
             raise ValueError("Too large swap space. " + msg)
         elif cpu_memory_usage > 0.4 * total_cpu_memory:
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index ef0b7353e82ecf35985c71c04a7797a97e160648..5a35173623f830142e71606c24fc5746d4f48701 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -4,16 +4,22 @@
 import enum
 import hashlib
 from collections import Counter
+from collections.abc import Callable
 from dataclasses import asdict, field
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
+import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
+from vllm.platforms import current_platform
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -23,47 +29,67 @@ else:
 logger = init_logger(__name__)
 
 
-class CompilationLevel:
-    # constants for the levels of the compilation process
-    NO_COMPILATION = 0
-    DYNAMO_AS_IS = 1
-    DYNAMO_ONCE = 2
-    PIECEWISE = 3
+class CompilationMode(enum.IntEnum):
+    """The compilation approach used for torch.compile-based compilation of the
+    model."""
+
+    NONE = 0
+    """No torch.compile compilation is applied, model runs in fully eager pytorch mode.
+    The model runs as-is."""
+    STOCK_TORCH_COMPILE = 1
+    """The standard `torch.compile` compilation pipeline."""
+    DYNAMO_TRACE_ONCE = 2
+    """Single Dynamo trace through the model, avoiding recompilation."""
+    VLLM_COMPILE = 3
+    """Custom vLLM Inductor-based backend with caching, piecewise compilation,
+    shape specialization, and custom passes."""
 
 
 class CUDAGraphMode(enum.Enum):
-    """ Constants for the cudagraph mode in CompilationConfig.
+    """Constants for the cudagraph mode in CompilationConfig.
     Meanwhile, the subset enum `NONE`, `PIECEWISE` and `FULL` are also
     treated as concrete runtime mode for cudagraph runtime dispatching.
     """
+
     NONE = 0
     PIECEWISE = 1
     FULL = 2
     FULL_DECODE_ONLY = (FULL, NONE)
     FULL_AND_PIECEWISE = (FULL, PIECEWISE)
 
-    def decode_mode(self) -> 'CUDAGraphMode':
-        return CUDAGraphMode(self.value[0]) if \
-            self.separate_routine() else self
+    def decode_mode(self) -> "CUDAGraphMode":
+        return CUDAGraphMode(self.value[0]) if self.separate_routine() else self
+
+    def mixed_mode(self) -> "CUDAGraphMode":
+        return CUDAGraphMode(self.value[1]) if self.separate_routine() else self
 
-    def mixed_mode(self) -> 'CUDAGraphMode':
-        return CUDAGraphMode(self.value[1]) if \
-            self.separate_routine() else self
+    def has_mode(self, mode: "CUDAGraphMode") -> bool:
+        assert not mode.separate_routine()
+        if self.separate_routine():
+            return mode.value in self.value
+        return self == mode
 
     def requires_piecewise_compilation(self) -> bool:
-        return (self.decode_mode() == CUDAGraphMode.PIECEWISE
-                or self.mixed_mode() == CUDAGraphMode.PIECEWISE)
+        return self.has_mode(CUDAGraphMode.PIECEWISE)
 
-    def max_cudagraph_mode(self) -> 'CUDAGraphMode':
-        return CUDAGraphMode(max(
-            self.value)) if self.separate_routine() else self
+    def max_cudagraph_mode(self) -> "CUDAGraphMode":
+        return CUDAGraphMode(max(self.value)) if self.separate_routine() else self
 
     def has_full_cudagraphs(self) -> bool:
         return self.max_cudagraph_mode() == CUDAGraphMode.FULL
 
+    def has_piecewise_cudagraphs(self) -> bool:
+        return self.requires_piecewise_compilation()
+
     def separate_routine(self) -> bool:
         return isinstance(self.value, tuple)
 
+    def valid_runtime_modes(self) -> bool:
+        return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
+
+    def __str__(self) -> str:
+        return self.name
+
 
 @config
 @dataclass
@@ -86,11 +112,54 @@ class PassConfig:
     """Whether to enable async TP."""
     enable_fi_allreduce_fusion: bool = False
     """Whether to enable flashinfer allreduce fusion."""
-    fi_allreduce_fusion_max_token_num: int = 16384
-    """Max number of tokens to used in flashinfer allreduce fusion."""
+    fi_allreduce_fusion_max_size_mb: float | None = None
+    """The threshold of the communicated tensor sizes under which
+    vllm should use flashinfer fused allreduce. Specified as a
+    float in MB.
+    Unspecified will fallback to default values
+    which are compute capability and world size dependent.
+        FI_ALLREDUCE_FUSION_MAX_SIZE_MB = {
+            90: {
+                2: 64,  # 64MB
+                4: 2,  # 2MB
+                8: 1,  # 1MB
+            },
+            100: {
+                2: 64,  # 64MB
+                4: 32,  # 32MB
+                8: 1,  # 1MB
+            },
+        }, where key is the device capability"""
+    enable_qk_norm_rope_fusion: bool = False
+    """Whether to enable the fused Q/K RMSNorm + RoPE pass."""
 
     # TODO(luka) better pass enabling system.
 
+    def flashinfer_max_size(self, world_size: int) -> int | None:
+        """
+        Returns the max communication size in bytes for flashinfer
+        allreduce fusion for the given world size. Returns None if world size
+        is not supported by configs as it's not supported by flashinfer.
+        """
+
+        MiB = 1024 * 1024
+        max_size_mb = self.fi_allreduce_fusion_max_size_mb
+        if max_size_mb is None:
+            max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size)
+
+        return int(max_size_mb * MiB) if max_size_mb is not None else None
+
+    @staticmethod
+    def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]:
+        from vllm.compilation.collective_fusion import FI_ALLREDUCE_FUSION_MAX_SIZE_MB
+        from vllm.platforms import current_platform
+
+        if not current_platform.is_cuda():
+            return {}
+        return FI_ALLREDUCE_FUSION_MAX_SIZE_MB.get(
+            current_platform.get_device_capability().to_int(), {}
+        )
+
     def uuid(self):
         """
         Produces a hash unique to the pass configuration.
@@ -104,11 +173,24 @@ class PassConfig:
             if self.enable_fusion:
                 logger.warning_once(
                     "Fusion enabled but reshape elimination disabled. "
-                    "RMSNorm/SiluMul + quant (fp8) fusion might not work")
+                    "RMSNorm/SiluMul + quant (fp8) fusion might not work"
+                )
             if self.enable_attn_fusion:
                 logger.warning_once(
                     "Fusion enabled but reshape elimination disabled. "
-                    "Attention + quant (fp8) fusion might not work")
+                    "Attention + quant (fp8) fusion might not work"
+                )
+            if self.enable_fi_allreduce_fusion:
+                logger.warning_once(
+                    "Fusion enabled but reshape elimination disabled. "
+                    "Allreduce + rms norm + quant (fp8) fusion might not work"
+                )
+        if self.enable_qk_norm_rope_fusion and not current_platform.is_cuda_alike():
+            logger.warning_once(
+                "QK Norm + RoPE fusion enabled but the current platform is not "
+                "CUDA or ROCm. The fusion will be disabled."
+            )
+            self.enable_qk_norm_rope_fusion = False
 
 
 @config
@@ -117,22 +199,23 @@ class CompilationConfig:
     """Configuration for compilation. It has three parts:
 
     - Top-level Compilation control:
-        - [`level`][vllm.config.CompilationConfig.level]
+        - [`mode`][vllm.config.CompilationConfig.mode]
         - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
         - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
         - [`backend`][vllm.config.CompilationConfig.backend]
         - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
         - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
+        - [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
     - CudaGraph capture:
-        - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
         - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
         - [`cudagraph_capture_sizes`]
         [vllm.config.CompilationConfig.cudagraph_capture_sizes]
+        - [`max_cudagraph_capture_size`]
+        [vllm.config.CompilationConfig.max_cudagraph_capture_size]
         - [`cudagraph_num_of_warmups`]
         [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
         - [`cudagraph_copy_inputs`]
         [vllm.config.CompilationConfig.cudagraph_copy_inputs]
-        - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
     - Inductor compilation:
         - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
         - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
@@ -151,35 +234,63 @@ class CompilationConfig:
         sufficient for most cases. It might be beneficial to compile for
         certain small batchsizes, where inductor is good at optimizing.
     """
+
+    # Top-level Compilation control
+    level: int | None = None
+    """
+    Level is deprecated and will be removed in the next release,
+    either 0.12.0 or 0.11.2 whichever is soonest.
+    Please use mode. Currently all levels are mapped to mode.
+    """
     # Top-level Compilation control
-    level: Optional[int] = None
-    """The level of compilation:
-
-    - None: If None, we will select the default compilation level.
-      For V1 engine this is 3, for V0 engine this is 0.
-    - 0: no compilation.
-    - 1: dynamo as is.
-    - 2: dynamo once.
-    - 3: piecewise compilation."""
-    debug_dump_path: str = ""
+    mode: CompilationMode | None = None
+    """The compilation approach used for torch.compile-based compilation of the
+    model.
+
+    - None: If None, we will select the default compilation mode.
+      For V1 engine this is 3.
+    - 0: NONE: No torch.compile compilation is applied, model runs in fully
+         eager pytorch mode. The model runs as-is.
+    - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
+    - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
+         recompilation by removing guards.
+         Requires no dynamic-shape-dependent control-flow.
+    - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
+         piecewise compilation, shape specialization, and custom passes."""
+    debug_dump_path: Path | None = None
     """The path to dump the debug information."""
     cache_dir: str = ""
     """The directory to store the compiled graph, to accelerate Inductor
     compilation. By default, it will use model-related information to generate
     a cache directory."""
+    compile_cache_save_format: Literal["binary", "unpacked"] = field(
+        default_factory=lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT
+    )
+    """Format for saving torch compile cache:\n
+    - "binary": saves as binary file (multiprocess safe)\n
+    - "unpacked": saves as directory structure for inspection/debugging
+    (NOT multiprocess safe)\n
+    Defaults to `VLLM_COMPILE_CACHE_SAVE_FORMAT` if not specified.
+    """
     backend: str = ""
     """The backend for compilation. It needs to be a string:
 
-    - "" (empty string): use the default backend.
+    - "" (empty string): use the default backend ("inductor" on CUDA-alike
+    platforms).
     - "eager"/"openxla"/...: use the specified backend registered in PyTorch.
     - "full.module.name": a qualified name which can be used to import the
 
     backend function.
     We use string to avoid serialization issues when using compilation in a
-    distributed setting. When the compilation level is 1 or 2, the backend is
+    distributed setting. When the compilation mode is 1 or 2, the backend is
     used for the compilation directly (it sees the whole graph). When the
-    compilation level is 3, the backend is used for the piecewise compilation
-    (it sees a part of the graph)."""
+    compilation mode is 3, the backend is used for the piecewise compilation
+    (it sees a part of the graph). The backend can not be custom for compilation
+    mode 3, i.e. the backend must be either eager or inductor. Furthermore,
+    compilation is only piecewise if splitting ops is set accordingly and
+    use_inductor_graph_partition is off. Note that the default options for
+    splitting ops are sufficient for piecewise compilation.
+    """
     custom_ops: list[str] = field(default_factory=list)
     """Fine-grained control over which custom ops to enable/disable. Use 'all'
     to enable all, 'none' to disable all. Also specify a list of custom op
@@ -190,15 +301,38 @@ class CompilationConfig:
     - 'none,+op1,+op2' to enable only op1 and op2
 
     By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
+    disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
     Inductor generates (fused) Triton kernels for disabled custom ops."""
-    splitting_ops: Optional[list[str]] = None
-    """A list of ops to split the full graph into subgraphs, used in piecewise
-    compilation."""
+    splitting_ops: list[str] | None = None
+    """A list of ops to exclude from cudagraphs, used in piecewise compilation.
+
+    The behavior depends on use_inductor_graph_partition:
+
+    - When use_inductor_graph_partition=False (default):
+        These ops are used for Dynamo FX-level graph splitting. The graph is
+        split at these ops before Inductor compilation, creating separate
+        subgraphs for cudagraph capture.
+
+    - When use_inductor_graph_partition=True:
+        These ops are used to register Inductor partition rules. The graph
+        partitioning happens at Inductor codegen time after all passes and
+        fusions are finished, allowing compilation and custom passes to operate
+        on the full graph while still excluding these ops from cudagraphs.
+
+    If None, defaults to attention ops for piecewise cudagraphs.
+    If empty list [], no ops are excluded (suitable for full cudagraphs)."""
+    compile_mm_encoder: bool = False
+    """Whether or not to compile the multimodal encoder.
+    Currently, this only works for `Qwen2_5_vl` on selected platforms. 
+    Disabled by default until more models are supported/tested to work."""
 
     # Inductor capture
-    use_inductor: bool = True
-    """Whether to use inductor compilation:
+    use_inductor: bool | None = None
+    """
+    Whether to use inductor compilation.
+
+    This flag is deprecated and will be removed in the next release 0.12.0.
+    Please use the 'backend' option instead.
 
     - False: inductor compilation is not used. graph runs in eager
         (custom_ops enabled by default).
@@ -206,8 +340,12 @@ class CompilationConfig:
         One graph for symbolic shape and one graph per size in compile_sizes
         are compiled using configurations in inductor_compile_config.
 
-    This setting is ignored if level<PIECEWISE."""
-    compile_sizes: Optional[list[Union[int, str]]] = None
+    This setting is ignored if mode<VLLM_COMPILE.
+
+    For future compatibility:
+    If use_inductor is True, backend="inductor" otherwise backend="eager".
+    """
+    compile_sizes: list[int | str] | None = None
     """Sizes to compile for inductor. In addition
     to integers, it also supports "cudagraph_capture_sizes" to
     specify the sizes for cudagraph capture."""
@@ -222,7 +360,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: Optional[CUDAGraphMode] = None
+    cudagraph_mode: CUDAGraphMode | None = None
     """
     The mode of the cudagraph:
 
@@ -239,44 +377,31 @@ class CompilationConfig:
     FULL mode: Capture full cudagraph for all batches. Can be good for small
     models or workloads with small prompts; not supported by many backends.
     Generally for performance FULL_AND_PIECEWISE is better.
-    
+
     FULL_DECODE_ONLY mode: Capture full cudagraph for decode batches only.
     Mixed prefill-decode batches are run without cudagraphs. Can be good for
     decode instances in a P/D setup where prefill is not as important so we
     can save some memory.
-    
+
     FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
     piecewise cudagraph for prefill and mixed prefill-decode batches.
     This is the most performant mode for most models and is the default.
 
     Currently, the cudagraph mode is only used for the v1 engine.
-    Note that the cudagraph logic is generally orthogonal to the 
-    compilation logic. While piecewise cudagraphs require piecewise 
-    compilation (level=PIECEWISE and non-empty splitting_ops), full
+    Note that the cudagraph logic is generally orthogonal to the
+    compilation logic. While piecewise cudagraphs require piecewise
+    compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
     cudagraphs are supported with and without compilation.
-    
-    Warning: This flag is new and subject to change in addition 
+
+    Warning: This flag is new and subject to change in addition
     more modes may be added.
     """
-    use_cudagraph: bool = True
-    """Whether to use cudagraph inside compilation.
-    - False: cudagraph inside compilation is not used.
-    - True: cudagraph inside compilation is used. It requires
-        that all input buffers have fixed addresses, and all
-        splitting ops write their outputs to input buffers.
-    In the vLLM V1 Engine, this flag only applies for
-    CompilationLevel.PIECEWISE (aka -O3).
-    Note that this is orthogonal to the cudagraph capture logic
-    outside of compilation.
-    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
-    """
     cudagraph_num_of_warmups: int = 0
     """Number of warmup runs for cudagraph.
     It means the first several runs will be treated as warmup runs.
     Only after that, the execution will be recorded, and the recorded
     cudagraph will be used for subsequent runs."""
-    cudagraph_capture_sizes: Optional[list[int]] = None
+    cudagraph_capture_sizes: list[int] | None = None
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
@@ -285,16 +410,16 @@ class CompilationConfig:
     cudagraph. If the caller can guarantee that the same input buffers
     are always used, it can set this to False. Otherwise, it should
     set this to True, and the compiler will copy the input to an
-    internally managed buffer. Default is False. 
+    internally managed buffer. Default is False.
     Note that this flag is only effective when cudagraph_mode is PIECEWISE.
     """
-    full_cuda_graph: Optional[bool] = False
-    """whether to use a full cuda graph for the entire forward pass rather than
-    splitting certain operations such as attention into subgraphs. Thus this
-    flag cannot be used together with splitting_ops. This may provide
-    performance benefits for smaller models.
-    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
+    cudagraph_specialize_lora: bool = True
+    """Whether to create separate cuda graphs for cases with and without active
+    LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used
+    for all cases, incurring the overhead of running LoRA ops even when no
+    adapters are active. Setting this to True will remove this overhead at the
+    cost of increased startup time and slightly higher memory usage.
+    When `enable_lora` is False, this option has no effect.
     """
 
     use_inductor_graph_partition: bool = False
@@ -305,7 +430,7 @@ class CompilationConfig:
     outside the partition functions. For a graph with N cudagraph-unsafe ops
     (e.g., Attention), there would be N+1 partitions. To mark an op as
     cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when
-    register the custom op. 
+    register the custom op.
 
     This config supports both full cudagraph and piecewise cudagraph without
     compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper
@@ -320,47 +445,63 @@ class CompilationConfig:
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""
 
-    max_capture_size: int = field(default=None, init=False)  # type: ignore
-    """not configurable, computed after init"""
+    max_cudagraph_capture_size: int | None = field(default=None)
+    """The maximum cudagraph capture size.
+
+    If cudagraph_capture_sizes is specified, this will be set to the largest
+    size in that list (or checked for consistency if specified). If
+    cudagraph_capture_sizes is not specified, the list of sizes is generated
+    automatically following the pattern:
+
+        [1, 2, 4] + list(range(8, 256, 8)) + list(
+        range(256, max_cudagraph_capture_size + 1, 16))
+
+    If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2,
+    512) by default. This voids OOM in tight memory scenarios with small
+    max_num_seqs, and prevents capture of many large graphs (>512) that would
+    greatly increase startup time with limited performance benefit.
+    """
     local_cache_dir: str = field(default=None, init=False)  # type: ignore
     """local cache dir for each rank"""
     bs_to_padded_graph_size: list[int] = field(
         default=None,  # type: ignore
-        init=False)
+        init=False,
+    )
     """optimization:
     Intuitively, bs_to_padded_graph_size should be dict[int, int].
-    since we know all keys are in a range [0, max_capture_size],
+    since we know all keys are in a range [0, max_cudagraph_capture_size],
     we can optimize it to list[int] for better lookup performance."""
 
     # keep track of enabled and disabled custom ops
-    enabled_custom_ops: Counter[str] = field(default_factory=Counter,
-                                             init=False)
+    enabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
     """custom ops that are enabled"""
-    disabled_custom_ops: Counter[str] = field(default_factory=Counter,
-                                              init=False)
+    disabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
     """custom ops that are disabled"""
     traced_files: set[str] = field(default_factory=set, init=False)
     """files that are traced for compilation"""
     compilation_time: float = field(default=0.0, init=False)
     """time taken for compilation"""
 
-    static_forward_context: dict[str, Any] = field(default_factory=dict,
-                                                   init=False)
+    static_forward_context: dict[str, Any] = field(default_factory=dict, init=False)
     """Per-model forward context
     Map from layer name to layer objects that need to be accessed outside
     model code, e.g., Attention, FusedMOE when dp_size>1."""
 
     # Attention ops; used for piecewise cudagraphs
+    # Use PyTorch operator format: "namespace::name"
     _attention_ops: ClassVar[list[str]] = [
-        "vllm.unified_attention",
-        "vllm.unified_attention_with_output",
-        "vllm.mamba_mixer2",
-        "vllm.mamba_mixer",
-        "vllm.short_conv",
-        "vllm.linear_attention",
-        "vllm.plamo2_mamba_mixer",
-        "vllm.gdn_attention",
-        "vllm.sparse_attn_indexer",
+        "vllm::unified_attention",
+        "vllm::unified_attention_with_output",
+        "vllm::unified_mla_attention",
+        "vllm::unified_mla_attention_with_output",
+        "vllm::mamba_mixer2",
+        "vllm::mamba_mixer",
+        "vllm::short_conv",
+        "vllm::linear_attention",
+        "vllm::plamo2_mamba_mixer",
+        "vllm::gdn_attention_core",
+        "vllm::kda_attention",
+        "vllm::sparse_attn_indexer",
     ]
 
     def compute_hash(self) -> str:
@@ -376,14 +517,16 @@ class CompilationConfig:
         the final hidden states.
         """
         factors: list[Any] = []
-        factors.append(self.level)
+        factors.append(self.mode)
         factors.append(self.backend)
         factors.append(self.custom_ops)
         factors.append(self.splitting_ops)
         factors.append(self.use_inductor)
+        factors.append(self.use_inductor_graph_partition)
         factors.append(self.inductor_compile_config)
         factors.append(self.inductor_passes)
         factors.append(self.pass_config.uuid())
+        factors.append(self.compile_cache_save_format)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __repr__(self) -> str:
@@ -407,24 +550,73 @@ class CompilationConfig:
         if pass_config_exclude:
             exclude["pass_config"] = pass_config_exclude
 
-        return TypeAdapter(CompilationConfig).dump_json(
-            self,
-            exclude=exclude,  # type: ignore[arg-type]
-            exclude_unset=True).decode()
+        config = TypeAdapter(CompilationConfig).dump_python(
+            self, exclude=exclude, exclude_unset=True
+        )
+
+        return str(config)
 
     __str__ = __repr__
 
-    @field_validator("cudagraph_mode", mode="before")
+    @field_validator("mode", mode="before")
     @classmethod
-    def validate_cudagraph_mode_before(cls, value: Any) -> Any:
+    def validate_mode_before(cls, value: Any) -> Any:
         """
-        enable parse the `cudagraph_mode` enum type from string
+        Enable parsing the `mode` field from string mode names.
+        Accepts both integers (0-3) and string names, like NONE, STOCK_TORCH_COMPILE,
+        DYNAMO_TRACE_ONCE, VLLM_COMPILE.
         """
+        if isinstance(value, str):
+            # Convert string mode name to integer value
+            mode_name = value.upper()
+
+            if mode_name not in CompilationMode.__members__:
+                raise ValueError(
+                    f"Invalid compilation mode: {value}. "
+                    f"Valid modes are: {', '.join(CompilationMode.__members__.keys())}"
+                )
+
+            return CompilationMode[mode_name]
+        return value
+
+    @field_validator("cudagraph_mode", mode="before")
+    @classmethod
+    def validate_cudagraph_mode_before(cls, value: Any) -> Any:
+        """Enable parsing of the `cudagraph_mode` enum type from string."""
         if isinstance(value, str):
             return CUDAGraphMode[value.upper()]
         return value
 
+    @field_validator("pass_config", mode="before")
+    @classmethod
+    def validate_pass_config_before(cls, value: Any) -> Any:
+        """Enable parsing of the `pass_config` field from a dictionary."""
+        if isinstance(value, dict):
+            return PassConfig(**value)
+        return value
+
+    @field_validator("compile_cache_save_format")
+    @classmethod
+    def validate_compile_cache_save_format(cls, value: str) -> str:
+        if value not in ("binary", "unpacked"):
+            raise ValueError(
+                f"compile_cache_save_format must be 'binary' or 'unpacked', "
+                f"got: {value}"
+            )
+        return value
+
     def __post_init__(self) -> None:
+        if self.level is not None:
+            logger.warning(
+                "Level is deprecated and will be removed in the next release,"
+                "either 0.12.0 or 0.11.2 whichever is soonest."
+                "Use mode instead."
+                "If both level and mode are given,"
+                "only mode will be used."
+            )
+            if self.mode is None:
+                self.mode = self.level
+
         count_none = self.custom_ops.count("none")
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
@@ -438,16 +630,16 @@ class CompilationConfig:
         #    https://github.com/vllm-project/vllm/issues/14703
 
         if is_torch_equal_or_newer("2.6"):
-            KEY = 'enable_auto_functionalized_v2'
+            KEY = "enable_auto_functionalized_v2"
             if KEY not in self.inductor_compile_config:
                 self.inductor_compile_config[KEY] = False
 
         for k, v in self.inductor_passes.items():
             if not isinstance(v, str):
-                assert callable(v), (
-                    f"pass {k} should be callable or a qualified name")
-                self.inductor_compile_config[k] = v if isinstance(
-                    v, InductorPass) else CallableInductorPass(v)
+                assert callable(v), f"pass {k} should be callable or a qualified name"
+                self.inductor_compile_config[k] = (
+                    v if isinstance(v, InductorPass) else CallableInductorPass(v)
+                )
                 continue
 
             # resolve function from qualified name
@@ -455,80 +647,111 @@ class CompilationConfig:
             module = ".".join(names[:-1])
             func_name = names[-1]
             func = __import__(module).__dict__[func_name]
-            self.inductor_compile_config[k] = func if isinstance(
-                func, InductorPass) else CallableInductorPass(func)
-
-        if isinstance(self.pass_config, dict):
-            self.pass_config = PassConfig(**self.pass_config)
+            self.inductor_compile_config[k] = (
+                func if isinstance(func, InductorPass) else CallableInductorPass(func)
+            )
+
+        if self.pass_config.enable_qk_norm_rope_fusion:
+            # TODO(zhuhaoran): support rope native forward match and remove this.
+            # Linked issue: https://github.com/vllm-project/vllm/issues/28042
+            self.custom_ops.append("+rotary_embedding")
+
+        if (
+            is_torch_equal_or_newer("2.9.0.dev")
+            and "combo_kernels" not in self.inductor_compile_config
+            and "benchmark_combo_kernel" not in self.inductor_compile_config
+        ):
+            # use horizontal fusion, which is useful for fusing qk-norm and
+            # qk-rope when query and key have different shapes.
+            self.inductor_compile_config["combo_kernels"] = True
+            self.inductor_compile_config["benchmark_combo_kernel"] = True
+
+        if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
+            "2.9.0.dev"
+        ):
+            raise ValueError(
+                "use_inductor_graph_partition is only "
+                "supported with torch>=2.9.0.dev. Set "
+                "use_inductor_graph_partition=False instead."
+            )
 
-        # migrate the deprecated flags
-        if not self.use_cudagraph:
-            logger.warning("use_cudagraph is deprecated, use "
-                           "cudagraph_mode=NONE instead.")
-            if self.cudagraph_mode is not None:
+        for op in self.custom_ops:
+            if op[0] not in {"+", "-"} and op not in {"all", "none"}:
                 raise ValueError(
-                    "use_cudagraph and cudagraph_mode are mutually"
-                    " exclusive, prefer cudagraph_mode since "
-                    "use_cudagraph is deprecated.")
-            self.cudagraph_mode = CUDAGraphMode.NONE
-        if self.full_cuda_graph:
-            logger.warning("full_cuda_graph is deprecated, use "
-                           "cudagraph_mode=FULL instead.")
-            if self.cudagraph_mode is not None:
-                raise ValueError("full_cuda_graph and cudagraph_mode are "
-                                 "mutually exclusive, prefer cudagraph_mode "
-                                 "since full_cuda_graph is deprecated.")
-            self.cudagraph_mode = CUDAGraphMode.FULL
-
-        if (self.use_inductor_graph_partition
-                and not is_torch_equal_or_newer("2.9.0.dev")):
-            raise ValueError("use_inductor_graph_partition is only "
-                             "supported with torch>=2.9.0.dev. Set "
-                             "use_inductor_graph_partition=False instead.")
+                    f"Invalid syntax '{op}' for custom op, "
+                    "must be 'all', 'none', '+op' or '-op' "
+                    "(where 'op' is the registered op name)"
+                )
+
+        # Currently only eager and inductor backend are supported.
+        # for piecewise compilation. Custom backends are not suppported for
+        # piecewise compilation. Update when more backends are supported.
+        if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
+            "",
+            "eager",
+            "inductor",
+        ]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )
 
-        for op in self.custom_ops:
-            if op[0] not in {'+', '-'} and op not in {'all', 'none'}:
-                raise ValueError(f"Invalid syntax '{op}' for custom op, "
-                                 "must be 'all', 'none', '+op' or '-op' "
-                                 "(where 'op' is the registered op name)")
+        if self.use_inductor is not None:
+            logger.warning_once(
+                "The 'use_inductor' flag is deprecated and will be "
+                "removed in the next release (v0.12.0). "
+                "Please use the 'backend' option instead.",
+            )
+            self.backend = "inductor" if self.use_inductor else "eager"
 
-    def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
-        if self.level == CompilationLevel.NO_COMPILATION:
-            raise ValueError("No compilation level is set.")
+        if self.backend == "":
+            self.backend = current_platform.simple_compile_backend
+
+    def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
+        """
+        Initialize the backend for the compilation config from a vllm config.
+        Arguments:
+            vllm_config: The vllm config to initialize the backend from.
+        Returns:
+            The backend for the compilation config.
+        """
+        if self.mode is None:
+            raise ValueError(
+                "No compilation mode is set. This method should only be \
+                called via vllm config where the level is set if none is \
+                provided."
+            )
+        if self.mode == CompilationMode.NONE:
+            raise ValueError("No compilation mode is set.")
 
         from torch._dynamo.backends.registry import list_backends
+
         torch_backends = list_backends(exclude_tags=tuple())
-        if self.level in [
-                CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE
+        if self.mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
         ]:
-            if self.backend == "":
-                return "eager"
             if self.backend in torch_backends:
                 return self.backend
             return resolve_obj_by_qualname(self.backend)
 
-        # TODO: pass user-specified backend to piecewise compilation
-        # merge with the config use_inductor
-        assert self.level == CompilationLevel.PIECEWISE
+        assert self.mode == CompilationMode.VLLM_COMPILE
+        if self.backend not in ["eager", "inductor"]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )
 
         from vllm.compilation.backends import VllmBackend
-        return VllmBackend(vllm_config)
 
-    def init_with_cudagraph_sizes(self,
-                                  cudagraph_capture_sizes: list[int]) -> None:
-        """To complete the initialization of config,
-        we need to know the cudagraph sizes."""
+        # TODO[@lucaskabela]: See if we can forward prefix
+        # https://github.com/vllm-project/vllm/issues/27045
+        return VllmBackend(vllm_config)
 
-        if self.cudagraph_capture_sizes is None:
-            self.cudagraph_capture_sizes = cudagraph_capture_sizes
-        else:
-            # de-duplicate the sizes provided by the config
-            dedup_sizes = list(set(self.cudagraph_capture_sizes))
-            if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
-                logger.info(("cudagraph sizes specified by model runner"
-                             " %s is overridden by config %s"),
-                            cudagraph_capture_sizes, dedup_sizes)
-            self.cudagraph_capture_sizes = dedup_sizes
+    def post_init_cudagraph_sizes(self) -> None:
+        """To complete the initialization after cudagraph related
+        configs are set. This includes:
+        - initialize compile_sizes
+        - pre-compute the mapping bs_to_padded_graph_size
+        """
 
         computed_compile_sizes = []
         if self.compile_sizes is not None:
@@ -536,103 +759,113 @@ class CompilationConfig:
             self.compile_sizes = list(set(self.compile_sizes))
             for x in self.compile_sizes:
                 if isinstance(x, str):
-                    assert x == "cudagraph_capture_sizes", \
-                        "Unrecognized size type in compile_sizes, " \
+                    assert x == "cudagraph_capture_sizes", (
+                        "Unrecognized size type in compile_sizes, "
                         f"expect 'cudagraph_capture_sizes', got {x}"
+                    )
                     computed_compile_sizes.extend(self.cudagraph_capture_sizes)
                 else:
                     assert isinstance(x, int)
                     computed_compile_sizes.append(x)
         self.compile_sizes = computed_compile_sizes  # type: ignore
 
-        # sort to make sure cudagraph capture sizes are in descending order
-        self.cudagraph_capture_sizes.sort(reverse=True)
-        self.max_capture_size = self.cudagraph_capture_sizes[
-            0] if self.cudagraph_capture_sizes else 0
+        # make sure the sizes are in ascending order
+        self.cudagraph_capture_sizes.sort()
+        if self.cudagraph_capture_sizes:
+            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
 
-        # pre-compute the mapping from batch size to padded graph size
-        self.bs_to_padded_graph_size = [
-            0 for i in range(self.max_capture_size + 1)
-        ]
-        for end, start in zip(self.cudagraph_capture_sizes,
-                              self.cudagraph_capture_sizes[1:] + [0]):
-            for bs in range(start, end):
-                if bs == start:
-                    self.bs_to_padded_graph_size[bs] = start
-                else:
-                    self.bs_to_padded_graph_size[bs] = end
-        self.bs_to_padded_graph_size[
-            self.max_capture_size] = self.max_capture_size
+        # May get recomputed in the model runner if adjustment is needed for spec-decode
+        self.compute_bs_to_padded_graph_size()
 
     def set_splitting_ops_for_v1(self):
-        # NOTE: this function needs to be called only when level is
-        # CompilationLevel.PIECEWISE
-        assert self.level == CompilationLevel.PIECEWISE, (
+        # NOTE: this function needs to be called only when mode is
+        # CompilationMode.VLLM_COMPILE
+        assert self.mode == CompilationMode.VLLM_COMPILE, (
             "set_splitting_ops_for_v1 should only be called when "
-            "level is CompilationLevel.PIECEWISE")
+            "mode is CompilationMode.VLLM_COMPILE"
+        )
+
+        if self.use_inductor_graph_partition:
+            self.set_splitting_ops_for_inductor_graph_partition()
+            return
 
-        use_inductor_graph_partition_msg = (
-            "When use_inductor_graph_partition=True, splitting_ops "
-            "are ignored and set to an empty list. Instead, "
-            "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
-            "used to annotate custom ops for graph partition.")
+        if self.pass_config.enable_attn_fusion:
+            # here use_inductor_graph_partition is False
+            self.set_splitting_ops_for_attn_fusion()
+            return
 
         if self.splitting_ops is None:
-            if self.use_inductor_graph_partition:
-                # When using inductor graph partition, we set splitting_ops
-                # to be empty and rely on torch._C.Tag.cudagraph_unsafe to
-                # annotate custom ops as splitting ops.
-                logger.warning_once(use_inductor_graph_partition_msg)
-                self.splitting_ops = []
-            else:
-                # NOTE: When using full cudagraph, instead of setting an empty
-                # list and capture the full cudagraph inside the flattened fx
-                # graph, we keep the piecewise fx graph structure but capture
-                # the full cudagraph outside the fx graph. This reduces some
-                # cpu overhead when the runtime batch_size is not cudagraph
-                # captured. see https://github.com/vllm-project/vllm/pull/20059
-                # for details. make a copy to avoid mutating the class-level
-                # list via reference.
-                self.splitting_ops = list(self._attention_ops)
+            # NOTE: When using full cudagraph, instead of setting an empty
+            # list and capture the full cudagraph inside the flattened fx
+            # graph, we keep the piecewise fx graph structure but capture
+            # the full cudagraph outside the fx graph. This reduces some
+            # cpu overhead when the runtime batch_size is not cudagraph
+            # captured. see https://github.com/vllm-project/vllm/pull/20059
+            # for details. Make a copy to avoid mutating the class-level
+            # list via reference.
+            self.splitting_ops = list(self._attention_ops)
         elif len(self.splitting_ops) == 0:
-            logger.warning_once(
-                "Using piecewise compilation with empty "
-                "splitting_ops and use_inductor_graph_partition"
-                f"={self.use_inductor_graph_partition}.")
-            if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE
-                    and not self.use_inductor_graph_partition):
+            logger.warning_once("Using piecewise compilation with empty splitting_ops")
+            if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+                logger.warning_once(
+                    "Piecewise compilation with empty splitting_ops do not"
+                    "contains piecewise cudagraph. Setting cudagraph_"
+                    "mode to NONE. Hint: If you are using attention backends "
+                    "that support cudagraph, consider manually setting "
+                    "cudagraph_mode to FULL or FULL_DECODE_ONLY to enable "
+                    "full cudagraphs."
+                )
+                self.cudagraph_mode = CUDAGraphMode.NONE
+            elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
                 logger.warning_once(
-                    "When compilation level is piecewise with empty "
-                    "splitting_ops, PIECEWISE cudagraph_mode will be "
-                    "treated as FULL cudagraph_mode. Please ensure you are "
-                    "using attention backends that support cudagraph or set "
-                    "cudagraph_mode to NONE explicitly if encountering "
-                    "any problems.")
+                    "Piecewise compilation with empty splitting_ops do not "
+                    "contains piecewise cudagraph. Setting cudagraph_mode "
+                    "to FULL."
+                )
                 self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
-        elif self.use_inductor_graph_partition:
-            logger.warning_once(use_inductor_graph_partition_msg)
+
+    def set_splitting_ops_for_inductor_graph_partition(self):
+        assert self.use_inductor_graph_partition
+        if self.splitting_ops is None:
+            self.splitting_ops = list(self._attention_ops)
+
+    def set_splitting_ops_for_attn_fusion(self):
+        assert self.pass_config.enable_attn_fusion
+        if self.splitting_ops is None:
             self.splitting_ops = []
+            if self.cudagraph_mode.has_piecewise_cudagraphs():
+                logger.warning_once(
+                    "enable_attn_fusion is incompatible with piecewise "
+                    "cudagraph when use_inductor_graph_partition is off. "
+                    "In this case, splitting_ops will be set to empty "
+                    "list, and cudagraph_mode will be set to FULL. "
+                    "Please ensure you are using attention backends that "
+                    "support cudagraph or set cudagraph_mode to NONE "
+                    "explicitly if encountering any problems."
+                )
+                self.cudagraph_mode = CUDAGraphMode.FULL
+
+        assert not self.splitting_ops_contain_attention(), (
+            "attention ops should not be in splitting_ops "
+            "when enable_attn_fusion is True"
+        )
 
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
-            op in self.splitting_ops for op in self._attention_ops)
+            op in self.splitting_ops for op in self._attention_ops
+        )
 
     def is_attention_compiled_piecewise(self) -> bool:
-        use_fx_graph_piecewise_compilation = (
-            self.level == CompilationLevel.PIECEWISE
-            and self.splitting_ops_contain_attention())
+        if not self.splitting_ops_contain_attention():
+            return False
 
-        inductor_used = (self.level == CompilationLevel.PIECEWISE
-                         and self.use_inductor) or (
-                             self.level >= CompilationLevel.DYNAMO_AS_IS
-                             and self.backend == "inductor")
-        use_inductor_piecewise_compilation = (
-            inductor_used and self.use_inductor_graph_partition
-            and not self.splitting_ops_contain_attention())
+        if not self.use_inductor_graph_partition:
+            # Dynamo-level FX split case
+            return self.mode == CompilationMode.VLLM_COMPILE
 
-        return use_fx_graph_piecewise_compilation or \
-            use_inductor_piecewise_compilation
+        # Inductor partition case
+        return self.backend == "inductor" and self.mode != CompilationMode.NONE
 
     def custom_op_log_check(self):
         """
@@ -649,13 +882,14 @@ class CompilationConfig:
         logger.debug("enabled custom ops: %s", self.enabled_custom_ops)
         logger.debug("disabled custom ops: %s", self.disabled_custom_ops)
 
-        all_ops_in_model = (self.enabled_custom_ops | self.disabled_custom_ops)
+        all_ops_in_model = self.enabled_custom_ops | self.disabled_custom_ops
         for op in self.custom_ops:
             if op in {"all", "none"}:
                 continue
 
-            assert op[0] in {'+', '-'}, "Invalid custom op syntax " \
-                                        "(should be checked during init)"
+            assert op[0] in {"+", "-"}, (
+                "Invalid custom op syntax (should be checked during init)"
+            )
 
             # check if op name exists in model
             op_name = op[1:]
@@ -664,10 +898,78 @@ class CompilationConfig:
 
                 # Does op exist at all or is it just not present in this model?
                 # Note: Only imported op classes appear in the registry.
-                missing_str = "doesn't exist (or wasn't imported/registered)" \
-                    if op_name not in CustomOp.op_registry \
+                missing_str = (
+                    "doesn't exist (or wasn't imported/registered)"
+                    if op_name not in CustomOp.op_registry
                     else "not present in model"
+                )
 
-                enable_str = "enabling" if op[0] == '+' else "disabling"
-                logger.warning_once("Op '%s' %s, %s with '%s' has no effect",
-                                    op_name, missing_str, enable_str, op)
+                enable_str = "enabling" if op[0] == "+" else "disabling"
+                logger.warning_once(
+                    "Op '%s' %s, %s with '%s' has no effect",
+                    op_name,
+                    missing_str,
+                    enable_str,
+                    op,
+                )
+
+    def adjust_cudagraph_sizes_for_spec_decode(
+        self, uniform_decode_query_len: int, tensor_parallel_size: int
+    ):
+        multiple_of = uniform_decode_query_len
+        if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism:
+            multiple_of = max(uniform_decode_query_len, tensor_parallel_size)
+            if (
+                multiple_of % uniform_decode_query_len != 0
+                or multiple_of % tensor_parallel_size != 0
+            ):
+                raise ValueError(
+                    f"Can't determine cudagraph shapes that are both a "
+                    f"multiple of {uniform_decode_query_len} "
+                    f"(num_speculative_tokens + 1) required by spec-decode "
+                    f"and {tensor_parallel_size} (tensor_parallel_size) "
+                    f"required by sequence parallelism please adjust "
+                    f"num_speculative_tokens or disable sequence parallelism"
+                )
+
+        if not self.cudagraph_capture_sizes or multiple_of <= 1:
+            return
+
+        assert self.max_cudagraph_capture_size is not None
+        rounded_sizes = sorted(
+            set(
+                round_up(size, multiple_of)
+                for size in self.cudagraph_capture_sizes
+                if round_up(size, multiple_of) <= self.max_cudagraph_capture_size
+            )
+        )
+
+        if len(rounded_sizes) == 0:
+            logger.warning(
+                "No valid cudagraph sizes after rounding to multiple of "
+                " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens"
+                " or max_cudagraph_capture_size (or cudagraph_capture_sizes)",
+                multiple_of,
+            )
+            return
+
+        self.max_cudagraph_capture_size = rounded_sizes[-1]
+        self.cudagraph_capture_sizes = rounded_sizes
+
+        # Recompute after adjusting the cudagraph sizes
+        self.compute_bs_to_padded_graph_size()
+
+    def compute_bs_to_padded_graph_size(self):
+        # pre-compute the mapping from batch size to padded graph size
+        self.bs_to_padded_graph_size = [
+            0 for i in range(self.max_cudagraph_capture_size + 1)
+        ]
+        for end, start in zip(
+            self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1],
+            [0] + self.cudagraph_capture_sizes,
+        ):
+            for bs in range(start, end):
+                if bs == start:
+                    self.bs_to_padded_graph_size[bs] = start
+                else:
+                    self.bs_to_padded_graph_size[bs] = end
diff --git a/vllm/config/device.py b/vllm/config/device.py
index 4654ac96e0b7b1bfe8772cbaa493729ad905f3a3..e85cd15de8cf4efbb585a90b83d68cf15aaab631 100644
--- a/vllm/config/device.py
+++ b/vllm/config/device.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 import torch
 from pydantic import ConfigDict, SkipValidation
@@ -19,7 +19,7 @@ Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
 class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""
 
-    device: SkipValidation[Optional[Union[Device, torch.device]]] = "auto"
+    device: SkipValidation[Device | torch.device | None] = "auto"
     """Device type for vLLM execution.
     This parameter is deprecated and will be
     removed in a future release.
@@ -45,20 +45,21 @@ class DeviceConfig:
         # the device/platform information will be summarized
         # by torch/vllm automatically.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
     def __post_init__(self):
         if self.device == "auto":
             # Automated device type detection
             from vllm.platforms import current_platform
+
             self.device_type = current_platform.device_type
             if not self.device_type:
                 raise RuntimeError(
                     "Failed to infer device type, please set "
                     "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
-                    "to turn on verbose logging to help debug the issue.")
+                    "to turn on verbose logging to help debug the issue."
+                )
         else:
             # Device type is assigned explicitly
             if isinstance(self.device, str):
diff --git a/vllm/config/ec_transfer.py b/vllm/config/ec_transfer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d95236f818abb8133b96dfd55e3076abef871a73
--- /dev/null
+++ b/vllm/config/ec_transfer.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import hashlib
+import uuid
+from dataclasses import field
+from typing import Any, Literal, get_args
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+ECProducer = Literal["ec_producer"]
+ECConsumer = Literal["ec_consumer"]
+ECRole = Literal[ECProducer, ECConsumer]
+
+
+@config
+@dataclass
+class ECTransferConfig:
+    """Configuration for distributed EC cache transfer."""
+
+    ec_connector: str | None = None
+    """The EC connector for vLLM to transmit EC caches between vLLM instances.
+    """
+
+    engine_id: str | None = None
+    """The engine id for EC transfers."""
+
+    ec_buffer_device: str | None = "cuda"
+    """The device used by ec connector to buffer the EC cache.
+    Currently only support 'cuda'."""
+
+    ec_buffer_size: float = 1e9
+    """The buffer size for TorchDistributedConnector. Measured in number of
+    bytes. Recommended value: 1e9 (about 1GB)."""
+
+    ec_role: ECRole | None = None
+    """Whether this vLLM instance produces, consumes EC cache, or both. Choices
+    are 'ec_producer', 'ec_consumer'."""
+
+    ec_rank: int | None = None
+    """The rank of this vLLM instance in the EC cache transfer. Typical value:
+    0 for encoder, 1 for pd instance.
+    Currently only 1P1D is supported."""
+
+    ec_parallel_size: int = 1
+    """The number of parallel instances for EC cache transfer. For
+    PyNcclConnector, this should be 2."""
+
+    ec_ip: str = "127.0.0.1"
+    """The EC connector ip, used to build distributed connection."""
+
+    ec_port: int = 14579
+    """The EC connector port, used to build distributed connection."""
+
+    ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
+    """any extra config that the connector may need."""
+
+    ec_connector_module_path: str | None = None
+    """The Python module path to dynamically load the EC connector from.
+    Only supported in V1."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self) -> None:
+        if self.engine_id is None:
+            self.engine_id = str(uuid.uuid4())
+
+        if self.ec_role is not None and self.ec_role not in get_args(ECRole):
+            raise ValueError(
+                f"Unsupported ec_role: {self.ec_role}. "
+                f"Supported roles are {get_args(ECRole)}"
+            )
+
+        if self.ec_connector is not None and self.ec_role is None:
+            raise ValueError(
+                "Please specify ec_role when ec_connector "
+                f"is set, supported roles are {get_args(ECRole)}"
+            )
+
+    @property
+    def is_ec_transfer_instance(self) -> bool:
+        return self.ec_connector is not None and self.ec_role in get_args(ECRole)
+
+    @property
+    def is_ec_producer(self) -> bool:
+        return self.ec_connector is not None and self.ec_role in get_args(ECProducer)
+
+    @property
+    def is_ec_consumer(self) -> bool:
+        return self.ec_connector is not None and self.ec_role in get_args(ECConsumer)
+
+    def get_from_extra_config(self, key, default) -> Any:
+        return self.ec_connector_extra_config.get(key, default)
diff --git a/vllm/config/kv_events.py b/vllm/config/kv_events.py
index 1c6bdffa1281d877b201655b2a6c6f83c53e661e..ce46cc03c39fe44c512a38b0f7b77a022cc8b0a7 100644
--- a/vllm/config/kv_events.py
+++ b/vllm/config/kv_events.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
+from typing import Literal
+
+from pydantic import Field
 from pydantic.dataclasses import dataclass
 
 from vllm.config.utils import config
@@ -18,7 +20,7 @@ class KVEventsConfig:
     Events can be published externally by zmq using the event publisher config.
     """
 
-    publisher: str = "null"
+    publisher: Literal["null", "zmq"] = Field(default=None)
     """The publisher to use for publishing kv events. Can be "null", "zmq".
     """
 
@@ -26,7 +28,7 @@ class KVEventsConfig:
     """The zmq endpoint to use for publishing kv events.
     """
 
-    replay_endpoint: Optional[str] = None
+    replay_endpoint: str | None = None
     """The zmq endpoint to use for replaying kv events.
     """
 
@@ -48,3 +50,7 @@ class KVEventsConfig:
     """The topic to use for the event publisher. Consumers can subscribe to
     this topic to receive events.
     """
+
+    def __post_init__(self):
+        if self.publisher is None:
+            self.publisher = "zmq" if self.enable_kv_cache_events else "null"
diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py
index 9abf4acacfe8185c749387e871758d29df4bb490..dfd7ef63712a3ea71da06a0efaefc108a95251dd 100644
--- a/vllm/config/kv_transfer.py
+++ b/vllm/config/kv_transfer.py
@@ -4,7 +4,7 @@
 import hashlib
 import uuid
 from dataclasses import field
-from typing import Any, Literal, Optional, get_args
+from typing import Any, Literal, get_args
 
 from pydantic.dataclasses import dataclass
 
@@ -20,26 +20,26 @@ KVRole = Literal[KVProducer, KVConsumer]
 class KVTransferConfig:
     """Configuration for distributed KV cache transfer."""
 
-    kv_connector: Optional[str] = None
+    kv_connector: str | None = None
     """The KV connector for vLLM to transmit KV caches between vLLM instances.
     """
 
-    engine_id: Optional[str] = None
+    engine_id: str | None = None
     """The engine id for KV transfers."""
 
-    kv_buffer_device: Optional[str] = "cuda"
-    """The device used by kv connector to buffer the KV cache.
-    Currently only support 'cuda'."""
+    kv_buffer_device: str = "cuda"
+    """The device used by kv connector to buffer the KV cache. Choices are 
+    'cuda' and 'cpu'."""
 
     kv_buffer_size: float = 1e9
     """The buffer size for TorchDistributedConnector. Measured in number of
     bytes. Recommended value: 1e9 (about 1GB)."""
 
-    kv_role: Optional[KVRole] = None
+    kv_role: KVRole | None = None
     """Whether this vLLM instance produces, consumes KV cache, or both. Choices
     are 'kv_producer', 'kv_consumer', and 'kv_both'."""
 
-    kv_rank: Optional[int] = None
+    kv_rank: int | None = None
     """The rank of this vLLM instance in the KV cache transfer. Typical value:
     0 for prefill instance, 1 for decode instance.
     Currently only 1P1D is supported."""
@@ -57,10 +57,13 @@ class KVTransferConfig:
     kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
     """any extra config that the connector may need."""
 
-    kv_connector_module_path: Optional[str] = None
+    kv_connector_module_path: str | None = None
     """The Python module path to dynamically load the KV connector from.
     Only supported in V1."""
 
+    enable_permute_local_kv: bool = False
+    """Experiment feature flag to enable HND to NHD KV Transfer"""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -76,8 +79,7 @@ class KVTransferConfig:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
     def __post_init__(self) -> None:
@@ -85,27 +87,28 @@ class KVTransferConfig:
             self.engine_id = str(uuid.uuid4())
 
         if self.kv_role is not None and self.kv_role not in get_args(KVRole):
-            raise ValueError(f"Unsupported kv_role: {self.kv_role}. "
-                             f"Supported roles are {get_args(KVRole)}")
+            raise ValueError(
+                f"Unsupported kv_role: {self.kv_role}. "
+                f"Supported roles are {get_args(KVRole)}"
+            )
 
         if self.kv_connector is not None and self.kv_role is None:
-            raise ValueError("Please specify kv_disagg_role when kv_connector "
-                             f"is set, supported roles are {get_args(KVRole)}")
+            raise ValueError(
+                "Please specify kv_role when kv_connector "
+                f"is set, supported roles are {get_args(KVRole)}"
+            )
 
     @property
     def is_kv_transfer_instance(self) -> bool:
-        return self.kv_connector is not None and \
-            self.kv_role in get_args(KVRole)
+        return self.kv_connector is not None and self.kv_role in get_args(KVRole)
 
     @property
     def is_kv_producer(self) -> bool:
-        return self.kv_connector is not None and \
-            self.kv_role in get_args(KVProducer)
+        return self.kv_connector is not None and self.kv_role in get_args(KVProducer)
 
     @property
     def is_kv_consumer(self) -> bool:
-        return self.kv_connector is not None and \
-            self.kv_role in get_args(KVConsumer)
+        return self.kv_connector is not None and self.kv_role in get_args(KVConsumer)
 
     def get_from_extra_config(self, key, default) -> Any:
         return self.kv_connector_extra_config.get(key, default)
diff --git a/vllm/config/load.py b/vllm/config/load.py
index 26ffec23ad5c67a121c469dcfa3689c8139f96af..e424f8c5edb62ae068d97c1acfdb398aae201108 100644
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from dataclasses import field
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
+from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
 
 from vllm.config.utils import config
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
 class LoadConfig:
     """Configuration for loading the model weights."""
 
-    load_format: Union[str, LoadFormats] = "auto"
+    load_format: str | LoadFormats = "auto"
     """The format of the model weights to load:\n
     - "auto" will try to load the weights in the safetensors format and fall
     back to the pytorch bin format if safetensors format is not available.\n
@@ -40,6 +40,8 @@ class LoadConfig:
     more information.\n
     - "runai_streamer" will load the Safetensors weights using Run:ai Model
     Streamer.\n
+    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
+    files using Run:ai Model Streamer.\n
     - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
     - "sharded_state" will load weights from pre-sharded checkpoint files,
     supporting efficient loading of tensor-parallel models.\n
@@ -48,7 +50,7 @@ class LoadConfig:
     - "mistral" will load weights from consolidated safetensors files used by
     Mistral models.
     - Other custom values can be supported via plugins."""
-    download_dir: Optional[str] = None
+    download_dir: str | None = None
     """Directory to download and load the weights, default to the default
     cache directory of Hugging Face."""
     safetensors_load_strategy: str = "lazy"
@@ -59,21 +61,24 @@ class LoadConfig:
       This is recommended for models on network filesystems (e.g., Lustre, NFS)
       as it avoids inefficient random reads, significantly speeding up model
       initialization. However, it uses more CPU RAM.
+    - "torchao": Weights are loaded in upfront and then reconstructed
+      into torchao tensor subclasses. This is used when the checkpoint
+      was quantized using torchao and saved using safetensors.
+      Needs torchao >= 0.14.0
     """
-    model_loader_extra_config: Union[dict, TensorizerConfig] = field(
-        default_factory=dict)
+    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
     """Extra config for model loader. This will be passed to the model loader
     corresponding to the chosen load_format."""
-    device: Optional[str] = None
+    device: str | None = None
     """Device to which model weights will be loaded, default to
     device_config.device"""
-    ignore_patterns: Optional[Union[list[str], str]] = None
+    ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
     """The list of patterns to ignore when loading the model. Default to
     "original/**/*" to avoid repeated loading of llama's checkpoints."""
     use_tqdm_on_load: bool = True
     """Whether to enable tqdm for showing progress bar when loading model
     weights."""
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu"
+    pt_load_map_location: str | dict[str, str] = "cpu"
     """
     pt_load_map_location: the map location for loading pytorch checkpoint, to
     support loading checkpoints can only be loaded on certain devices like
@@ -99,15 +104,21 @@ class LoadConfig:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self):
-        self.load_format = self.load_format.lower()
-        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+    @field_validator("load_format", mode="after")
+    def _lowercase_load_format(cls, load_format: str) -> str:
+        return load_format.lower()
+
+    @field_validator("ignore_patterns", mode="after")
+    def _validate_ignore_patterns(
+        cls, ignore_patterns: list[str] | str
+    ) -> list[str] | str:
+        if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
             logger.info(
                 "Ignoring the following patterns when downloading weights: %s",
-                self.ignore_patterns)
-        else:
-            self.ignore_patterns = ["original/**/*"]
+                ignore_patterns,
+            )
+
+        return ignore_patterns
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
index 3fe28f5dad4fa7cdcb3986f93d7add34012e02a1..84e92eef400771ad265c58c8ef6c13328ed00a2f 100644
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -2,13 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 import torch
-from pydantic import ConfigDict
+from pydantic import ConfigDict, Field, model_validator
 from pydantic.dataclasses import dataclass
+from typing_extensions import Self
 
-import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -23,6 +23,8 @@ else:
 logger = init_logger(__name__)
 
 LoRADType = Literal["auto", "float16", "bfloat16"]
+MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
+LoRAExtraVocabSize = Literal[256, 512]
 
 
 @config
@@ -30,26 +32,34 @@ LoRADType = Literal["auto", "float16", "bfloat16"]
 class LoRAConfig:
     """Configuration for LoRA."""
 
-    max_lora_rank: int = 16
+    max_lora_rank: MaxLoRARanks = 16
     """Max LoRA rank."""
-    max_loras: int = 1
+    max_loras: int = Field(default=1, ge=1)
     """Max number of LoRAs in a single batch."""
     fully_sharded_loras: bool = False
     """By default, only half of the LoRA computation is sharded with tensor
     parallelism. Enabling this will use the fully sharded layers. At high
     sequence length, max rank or tensor parallel size, this is likely faster.
     """
-    max_cpu_loras: Optional[int] = None
+    max_cpu_loras: int | None = None
     """Maximum number of LoRAs to store in CPU memory. Must be >= than
     `max_loras`."""
-    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+    lora_dtype: torch.dtype | LoRADType = "auto"
     """Data type for LoRA. If auto, will default to base model dtype."""
-    lora_extra_vocab_size: int = 256
+    lora_extra_vocab_size: LoRAExtraVocabSize = Field(
+        default=256,
+        deprecated=(
+            "`lora_extra_vocab_size` is deprecated and will be removed "
+            "in v0.12.0. Additional vocabulary support for "
+            "LoRA adapters is being phased out."
+        ),
+    )
     """(Deprecated) Maximum size of extra vocabulary that can be present in a 
     LoRA adapter. Will be removed in v0.12.0."""
-    lora_vocab_padding_size: ClassVar[int] = current_platform\
-        .get_lora_vocab_padding_size()
-    default_mm_loras: Optional[dict[str, str]] = None
+    lora_vocab_padding_size: ClassVar[int] = (
+        current_platform.get_lora_vocab_padding_size()
+    )
+    default_mm_loras: dict[str, str] | None = None
     """Dictionary mapping specific modalities to LoRA model paths; this field
     is only applicable to multimodal models and should be leveraged when a
     model always expects a LoRA to be active when a given modality is present.
@@ -59,9 +69,6 @@ class LoRAConfig:
     per prompt. When run in offline mode, the lora IDs for n modalities
     will be automatically assigned to 1-n with the names of the modalities
     in alphabetic order."""
-    bias_enabled: bool = False
-    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
-    removed in v0.12.0."""
 
     def compute_hash(self) -> str:
         """
@@ -82,48 +89,21 @@ class LoRAConfig:
         factors.append(self.lora_dtype)
         factors.append(self.lora_extra_vocab_size)
         factors.append(self.lora_vocab_padding_size)
-        factors.append(self.bias_enabled)
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self):
-        # Deprecation warning for lora_extra_vocab_size
-        logger.warning(
-            "`lora_extra_vocab_size` is deprecated and will be removed "
-            "in v0.12.0. Additional vocabulary support for "
-            "LoRA adapters is being phased out.")
 
-        # Deprecation warning for enable_lora_bias
-        if self.bias_enabled:
-            logger.warning("`enable_lora_bias` is deprecated "
-                           "and will be removed in v0.12.0.")
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
+        return hash_str
 
-        # Setting the maximum rank to 512 should be able to satisfy the vast
-        # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
-        possible_lora_extra_vocab_size = (256, 512)
-        if self.max_lora_rank not in possible_max_ranks:
-            raise ValueError(
-                f"max_lora_rank ({self.max_lora_rank}) must be one of "
-                f"{possible_max_ranks}.")
-        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
-            raise ValueError(
-                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
-                f"must be one of {possible_lora_extra_vocab_size}.")
-        if self.max_loras < 1:
-            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
+    @model_validator(mode="after")
+    def _validate_lora_config(self) -> Self:
         if self.max_cpu_loras is None:
             self.max_cpu_loras = self.max_loras
         elif self.max_cpu_loras < self.max_loras:
             raise ValueError(
                 f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
-                f"max_loras ({self.max_loras})")
+                f"max_loras ({self.max_loras})"
+            )
 
-    def verify_with_cache_config(self, cache_config: CacheConfig):
-        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-            raise ValueError(
-                "V0 LoRA does not support CPU offload, please use V1.")
+        return self
 
     def verify_with_model_config(self, model_config: ModelConfig):
         if self.lora_dtype in (None, "auto"):
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 4b1aef1e97523e5ca9703f5b137f3e28f2d8441f..5a303d5bba36e60d491dd971d3355838defb198c 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -4,72 +4,89 @@
 import hashlib
 import json
 import warnings
+from collections.abc import Callable
 from dataclasses import InitVar, field
 from importlib.util import find_spec
-from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Union,
-                    cast, get_args)
+from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 
 import torch
-from pydantic import (ConfigDict, SkipValidation, field_validator,
-                      model_validator)
+from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 
 import vllm.envs as envs
-from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
-                                    MultiModalConfig)
+from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
 from vllm.config.pooler import PoolerConfig
-from vllm.config.utils import assert_hashable, config
+from vllm.config.scheduler import RunnerType
+from vllm.config.utils import assert_hashable, config, getattr_iter
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import (
-    ConfigFormat, get_config, get_hf_image_processor_config,
-    get_hf_text_config, get_pooling_config,
-    get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    is_interleaved, try_get_generation_config, try_get_safetensors_metadata,
-    try_get_tokenizer_config, uses_mrope)
-from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
-                                                 is_runai_obj_uri)
+    ConfigFormat,
+    get_config,
+    get_hf_image_processor_config,
+    get_hf_text_config,
+    get_pooling_config,
+    get_sentence_transformer_tokenizer_config,
+    is_encoder_decoder,
+    try_get_dense_modules,
+    try_get_generation_config,
+    try_get_safetensors_metadata,
+    try_get_tokenizer_config,
+    uses_mrope,
+)
+from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import LayerBlockType, LazyLoader, common_broadcastable_dtype
+from vllm.utils.import_utils import LazyLoader
+from vllm.utils.torch_utils import common_broadcastable_dtype
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
 
     import vllm.model_executor.layers.quantization as me_quant
     import vllm.model_executor.models as me_models
+    from vllm.attention.backends.registry import AttentionBackendEnum
     from vllm.config.load import LoadConfig
     from vllm.config.parallel import ParallelConfig
-    from vllm.config.scheduler import RunnerType
     from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.v1.sample.logits_processor import LogitsProcessor
 else:
     PretrainedConfig = Any
 
-    me_quant = LazyLoader("model_executor", globals(),
-                          "vllm.model_executor.layers.quantization")
-    me_models = LazyLoader("model_executor", globals(),
-                           "vllm.model_executor.models")
+    AttentionBackendEnum = Any
+    me_quant = LazyLoader(
+        "model_executor", globals(), "vllm.model_executor.layers.quantization"
+    )
+    me_models = LazyLoader("model_executor", globals(), "vllm.model_executor.models")
     LoadConfig = Any
     ParallelConfig = Any
-    RunnerType = Any
     QuantizationMethods = Any
     LogitsProcessor = Any
 
 logger = init_logger(__name__)
 
-RunnerOption = Literal["auto", "generate", "pooling", "draft"]
+RunnerOption = Literal["auto", RunnerType]
 ConvertType = Literal["none", "embed", "classify", "reward"]
 ConvertOption = Literal["auto", ConvertType]
-TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
-                     "score", "reward", "transcription", "draft"]
+TaskOption = Literal[
+    "auto",
+    "generate",
+    "embedding",
+    "embed",
+    "classify",
+    "score",
+    "reward",
+    "transcription",
+    "draft",
+]
 TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
-LogprobsMode = Literal["raw_logits", "raw_logprobs", "processed_logits",
-                       "processed_logprobs"]
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
-                                             PretrainedConfig]]
+LogprobsMode = Literal[
+    "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
+]
+HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
 ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]
+LayerBlockType = Literal["attention", "linear_attention", "mamba"]
 
 _RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
     "generate": ["generate", "transcription"],
@@ -100,7 +117,7 @@ class ModelConfig:
     """Convert the model using adapters defined in
     [vllm.model_executor.models.adapters][]. The most common use case is to
     adapt a text generation model to be used for pooling tasks."""
-    task: Optional[TaskOption] = None
+    task: TaskOption | None = None
     """[DEPRECATED] The task to use the model for. If the model supports more
     than one model runner, this is used to select which model runner to run.
 
@@ -118,7 +135,7 @@ class ModelConfig:
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""
-    dtype: Union[ModelDType, torch.dtype] = "auto"
+    dtype: ModelDType | torch.dtype = "auto"
     """Data type for model weights and activations:\n
     - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
     precision for BF16 models.\n
@@ -127,33 +144,31 @@ class ModelConfig:
     - "bfloat16" for a balance between precision and range.\n
     - "float" is shorthand for FP32 precision.\n
     - "float32" for FP32 precision."""
-    seed: Optional[int] = None
+    seed: int | None = None
     """Random seed for reproducibility. Initialized to None in V0, but
     initialized to 0 in V1."""
-    hf_config_path: Optional[str] = None
+    hf_config: PretrainedConfig = field(init=False)
+    """The Hugging Face config of the model."""
+    hf_text_config: PretrainedConfig = field(init=False)
+    """The Hugging Face config of the text model (same as hf_config for text models)."""
+    hf_config_path: str | None = None
     """Name or path of the Hugging Face config to use. If unspecified, model
     name or path will be used."""
     allowed_local_media_path: str = ""
     """Allowing API requests to read local images or videos from directories
     specified by the server file system. This is a security risk. Should only
     be enabled in trusted environments."""
-    allowed_media_domains: Optional[list[str]] = None
-    """If set, only media URLs that belong to this domain can be used for 
+    allowed_media_domains: list[str] | None = None
+    """If set, only media URLs that belong to this domain can be used for
     multi-modal inputs. """
-    revision: Optional[str] = None
+    revision: str | None = None
     """The specific model version to use. It can be a branch name, a tag name,
     or a commit id. If unspecified, will use the default version."""
-    code_revision: Optional[str] = None
+    code_revision: str | None = None
     """The specific revision to use for the model code on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    rope_scaling: dict[str, Any] = field(default_factory=dict)
-    """RoPE scaling configuration. For example,
-    `{"rope_type":"dynamic","factor":2.0}`."""
-    rope_theta: Optional[float] = None
-    """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
-    theta improves the performance of the scaled model."""
-    tokenizer_revision: Optional[str] = None
+    tokenizer_revision: str | None = None
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
@@ -166,9 +181,9 @@ class ModelConfig:
     - 1k -> 1000\n
     - 1K -> 1024\n
     - 25.6k -> 25,600"""
-    spec_target_max_model_len: Optional[int] = None
+    spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
-    quantization: SkipValidation[Optional[QuantizationMethods]] = None
+    quantization: SkipValidation[QuantizationMethods | None] = None
     """Method used to quantize the weights. If `None`, we first check the
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
@@ -207,9 +222,11 @@ class ModelConfig:
     output will contain token ids."""
     enable_prompt_embeds: bool = False
     """If `True`, enables passing text embeddings as inputs via the
-    `prompt_embeds` key. Note that enabling this will double the time required
-    for graph compilation."""
-    served_model_name: Optional[Union[str, list[str]]] = None
+    `prompt_embeds` key.
+
+    WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!"""
+    served_model_name: str | list[str] | None = None
     """The model name(s) used in the API. If multiple names are provided, the
     server will respond to any of the provided names. The model name in the
     model field of a response will be the first name in this list. If not
@@ -217,20 +234,20 @@ class ModelConfig:
     that this name(s) will also be used in `model_name` tag content of
     prometheus metrics, if multiple names provided, metrics tag will take the
     first one."""
-    config_format: Union[str, ConfigFormat] = "auto"
+    config_format: str | ConfigFormat = "auto"
     """The format of the model config to load:\n
     - "auto" will try to load the config in hf format if available else it
     will try to load in mistral format.\n
     - "hf" will load the config in hf format.\n
     - "mistral" will load the config in mistral format."""
-    hf_token: Optional[Union[bool, str]] = None
+    hf_token: bool | str | None = None
     """The token to use as HTTP bearer authorization for remote files . If
     `True`, will use the token generated when running `huggingface-cli login`
     (stored in `~/.huggingface`)."""
     hf_overrides: HfOverrides = field(default_factory=dict)
     """If a dictionary, contains arguments to be forwarded to the Hugging Face
     config. If a callable, it is called to update the HuggingFace config."""
-    logits_processor_pattern: Optional[str] = None
+    logits_processor_pattern: str | None = None
     """Optional regex pattern specifying valid logits processor qualified names
     that can be passed with the `logits_processors` extra completion argument.
     Defaults to `None`, which allows no processors."""
@@ -247,8 +264,9 @@ class ModelConfig:
     merged with the default config from the model. If used with
     `--generation-config vllm`, only the override parameters are used."""
     enable_sleep_mode: bool = False
-    """Enable sleep mode for the engine (only cuda platform is supported)."""
-    model_impl: Union[str, ModelImpl] = "auto"
+    """Enable sleep mode for the engine (only cuda and
+    hip platforms are supported)."""
+    model_impl: str | ModelImpl = "auto"
     """Which implementation of the model to use:\n
     - "auto" will try to use the vLLM implementation, if it exists, and fall
     back to the Transformers implementation if no vLLM implementation is
@@ -257,39 +275,41 @@ class ModelConfig:
     - "transformers" will use the Transformers model implementation.\n
     - "terratorch" will use the TerraTorch model implementation.
     """
-    override_attention_dtype: Optional[str] = None
+    override_attention_dtype: str | None = None
     """Override dtype for attention"""
-    logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
+    logits_processors: list[str | type[LogitsProcessor]] | None = None
     """One or more logits processors' fully-qualified class names or class
     definitions"""
-    io_processor_plugin: Optional[str] = None
+    io_processor_plugin: str | None = None
     """IOProcessor plugin name to load at model startup"""
     enable_chunked_prefill: Optional[bool] = None
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""
 
     # Pooler config
-    pooler_config: Optional[PoolerConfig] = None
+    pooler_config: PoolerConfig | None = None
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
-    override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
+    override_pooler_config: dict | PoolerConfig | None = None
     """[DEPRECATED] Use `pooler_config` instead. This field will be removed in
     v0.12.0 or v1.0.0, whichever is sooner."""
 
     # Multimodal config and init vars
-    multimodal_config: Optional[MultiModalConfig] = None
+    multimodal_config: MultiModalConfig | None = None
     """Configuration for multimodal model. If `None`, this will be inferred
     from the architecture of `self.model`."""
-    limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
-    media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
-    mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
-    mm_processor_cache_gb: InitVar[Optional[float]] = None
-    mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
-    mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
-    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
-    interleave_mm_strings: InitVar[Optional[bool]] = None
-    skip_mm_profiling: InitVar[Optional[bool]] = None
-    video_pruning_rate: InitVar[Optional[float]] = None
+    limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
+    enable_mm_embeds: InitVar[bool | None] = None
+    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
+    mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
+    mm_processor_cache_gb: InitVar[float | None] = None
+    mm_processor_cache_type: InitVar[MMCacheType | None] = None
+    mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
+    mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
+    mm_encoder_attn_backend: InitVar[AttentionBackendEnum | str | None] = None
+    interleave_mm_strings: InitVar[bool | None] = None
+    skip_mm_profiling: InitVar[bool | None] = None
+    video_pruning_rate: InitVar[float | None] = None
 
     def compute_hash(self) -> str:
         """
@@ -316,10 +336,8 @@ class ModelConfig:
         factors.append(self.generation_config)
         factors.append(self.model_impl)
         factors.append(self.override_generation_config)
-        factors.append(self.rope_scaling)
-        factors.append(self.rope_theta)
         factors.append(self.video_pruning_rate)
-        factors.append(self.enable_chunked_prefill)
+        factors.append(self.enable_prompt_embeds)
 
         # hf_config can control how the model looks!
         try:
@@ -330,15 +348,19 @@ class ModelConfig:
             from vllm.utils.jsontree import json_map_leaves
 
             # Handle nested HF configs with unserializable values gracefully
-            hf_config_json = json.dumps(
-                json_map_leaves(
-                    lambda v: v.to_dict()
-                    if isinstance(v, PretrainedConfig) else str(v),
-                    self.hf_config.to_dict(),
-                ),
-                indent=2,
-                sort_keys=True,
-            ) + "\n"
+            hf_config_json = (
+                json.dumps(
+                    json_map_leaves(
+                        lambda v: v.to_dict()
+                        if isinstance(v, PretrainedConfig)
+                        else str(v),
+                        self.hf_config.to_dict(),
+                    ),
+                    indent=2,
+                    sort_keys=True,
+                )
+                + "\n"
+            )
 
         factors.append(hf_config_json)
 
@@ -346,41 +368,88 @@ class ModelConfig:
         assert_hashable(str_factors)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
+    def _update_nested(
+        self,
+        target: PretrainedConfig | dict[str, Any],
+        updates: dict[str, Any],
+    ) -> None:
+        """Recursively updates a config or dict with nested updates."""
+        for key, value in updates.items():
+            if isinstance(value, dict):
+                # Get the nested target
+                if isinstance(target, dict):
+                    nested_target = target.get(key)
+                else:
+                    nested_target = getattr(target, key, None)
+
+                # If nested target exists and can be updated recursively
+                if nested_target is not None and (
+                    isinstance(nested_target, dict)
+                    or hasattr(nested_target, "__dict__")
+                ):
+                    self._update_nested(nested_target, value)
+                    continue
+
+            # Set the value (base case)
+            if isinstance(target, dict):
+                target[key] = value
+            else:
+                setattr(target, key, value)
+
+    def _apply_dict_overrides(
+        self,
+        config: PretrainedConfig,
+        overrides: dict[str, Any],
+    ) -> None:
+        """Apply dict overrides, handling both nested configs and dict values."""
+        from transformers import PretrainedConfig
+
+        for key, value in overrides.items():
+            attr = getattr(config, key, None)
+            if attr is not None and isinstance(attr, PretrainedConfig):
+                # It's a nested config - recursively update it
+                self._update_nested(attr, value)
+            else:
+                # It's a dict-valued parameter - set it directly
+                setattr(config, key, value)
+
     def __post_init__(
         self,
         # Multimodal config init vars
-        limit_mm_per_prompt: Optional[dict[str, int]],
-        media_io_kwargs: Optional[dict[str, dict[str, Any]]],
-        mm_processor_kwargs: Optional[dict[str, Any]],
-        mm_processor_cache_gb: Optional[float],
-        mm_processor_cache_type: Optional[MMCacheType],
-        mm_shm_cache_max_object_size_mb: Optional[int],
-        mm_encoder_tp_mode: Optional[MMEncoderTPMode],
-        interleave_mm_strings: Optional[bool],
-        skip_mm_profiling: Optional[bool],
-        video_pruning_rate: Optional[float],
+        limit_mm_per_prompt: dict[str, int] | None,
+        enable_mm_embeds: bool | None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None,
+        mm_processor_kwargs: dict[str, Any] | None,
+        mm_processor_cache_gb: float | None,
+        mm_processor_cache_type: MMCacheType | None,
+        mm_shm_cache_max_object_size_mb: int | None,
+        mm_encoder_tp_mode: MMEncoderTPMode | None,
+        mm_encoder_attn_backend: AttentionBackendEnum | str | None,
+        interleave_mm_strings: bool | None,
+        skip_mm_profiling: bool | None,
+        video_pruning_rate: float | None,
     ) -> None:
         # Set the default seed to 0 in V1.
-        # NOTE(woosuk): In V0, we set the default seed to None because the
-        # driver worker shares the same process as the user process, and thus
-        # setting a seed affects the user process as well.
-        # In V1, we use separate processes for workers (unless
+        # NOTE(woosuk): In V1, we use separate processes for workers (unless
         # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
         # doesn't affect the user process. However, without a consistent seed,
         # different tensor parallel workers would sample different tokens,
         # leading to inconsistent results.
-        if envs.VLLM_USE_V1 and self.seed is None:
+        if self.seed is None:
             self.seed = 0
             if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
                 logger.warning(
                     "The global random seed is set to %d. Since "
                     "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
                     "affect the random state of the Python process that "
-                    "launched vLLM.", self.seed)
+                    "launched vLLM.",
+                    self.seed,
+                )
 
         # Keep set served_model_name before maybe_model_redirect(self.model)
-        self.served_model_name = get_served_model_name(self.model,
-                                                       self.served_model_name)
+        self.served_model_name = get_served_model_name(
+            self.model, self.served_model_name
+        )
         self.model = maybe_model_redirect(self.model)
         # The tokenizer is consistent with the model by default.
         if self.tokenizer is None:
@@ -395,70 +464,69 @@ class ModelConfig:
         if callable(self.hf_overrides):
             hf_overrides_kw = {}
             hf_overrides_fn = self.hf_overrides
+            dict_overrides: dict[str, Any] = {}
         else:
-            hf_overrides_kw = self.hf_overrides
+            # Separate dict overrides from flat ones
+            # We'll determine how to apply dict overrides after loading the config
+            hf_overrides_kw = {}
+            dict_overrides = {}
+            for key, value in self.hf_overrides.items():
+                if isinstance(value, dict):
+                    dict_overrides[key] = value
+                else:
+                    hf_overrides_kw[key] = value
             hf_overrides_fn = None
 
-        if self.rope_scaling:
-            hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling}
-            hf_overrides_kw.update(hf_override)
-            hf_overrides_str = json.dumps(hf_overrides_kw)
-            msg = (
-                "`--rope-scaling` will be removed in a future release. "
-                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
-            warnings.warn(DeprecationWarning(msg), stacklevel=2)
-        if self.rope_theta is not None:
-            hf_override = {"rope_theta": self.rope_theta}
-            hf_overrides_kw.update(hf_override)
-            hf_overrides_str = json.dumps(hf_overrides_kw)
-            msg = (
-                "`--rope-theta` will be removed in a future release. "
-                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
-            warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
         self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
 
-        if (backend := envs.VLLM_ATTENTION_BACKEND
-            ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
+        if (
+            (backend := envs.VLLM_ATTENTION_BACKEND)
+            and backend == "FLASHINFER"
+            and find_spec("flashinfer") is None
+        ):
             raise ValueError(
                 "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
                 "module was not found. See "
                 "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile "  # noqa: E501
-                "for instructions on how to install it.")
+                "for instructions on how to install it."
+            )
 
         from vllm.platforms import current_platform
 
-        if (self.override_attention_dtype is not None
-                and not current_platform.is_rocm()):
+        if self.override_attention_dtype is not None and not current_platform.is_rocm():
             warnings.warn(
                 "override-attention-dtype is set but not using ROCm platform",
-                stacklevel=2)
-
-        if (self.enable_sleep_mode
-                and not current_platform.is_sleep_mode_available()):
-            raise ValueError(
-                "Sleep mode is not supported on current platform.")
+                stacklevel=2,
+            )
 
-        hf_config = get_config(self.hf_config_path or self.model,
-                               self.trust_remote_code,
-                               self.revision,
-                               self.code_revision,
-                               self.config_format,
-                               hf_overrides_kw=hf_overrides_kw,
-                               hf_overrides_fn=hf_overrides_fn)
+        if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
+            raise ValueError("Sleep mode is not supported on current platform.")
+
+        hf_config = get_config(
+            self.hf_config_path or self.model,
+            self.trust_remote_code,
+            self.revision,
+            self.code_revision,
+            self.config_format,
+            hf_overrides_kw=hf_overrides_kw,
+            hf_overrides_fn=hf_overrides_fn,
+        )
 
         self.hf_config = hf_config
+        if dict_overrides:
+            self._apply_dict_overrides(hf_config, dict_overrides)
         self.hf_text_config = get_hf_text_config(self.hf_config)
-        self.attention_chunk_size = getattr(self.hf_text_config,
-                                            "attention_chunk_size", None)
+        self.attention_chunk_size = getattr(
+            self.hf_text_config, "attention_chunk_size", None
+        )
         self.encoder_config = self._get_encoder_config()
         self.hf_image_processor_config = get_hf_image_processor_config(
-            self.model, hf_token=self.hf_token, revision=self.revision)
+            self.model, hf_token=self.hf_token, revision=self.revision
+        )
 
         architectures = self.architectures
         registry = self.registry
-        is_generative_model = registry.is_text_generation_model(
-            architectures, self)
+        is_generative_model = registry.is_text_generation_model(architectures, self)
         is_pooling_model = registry.is_pooling_model(architectures, self)
 
         def _task_to_convert(task: TaskOption) -> ConvertType:
@@ -477,8 +545,10 @@ class ModelConfig:
         if self.task is not None:
             runner: RunnerOption = "auto"
             convert: ConvertOption = "auto"
-            msg_prefix = ("The 'task' option has been deprecated and will be "
-                          "removed in v0.13.0 or v1.0, whichever comes first.")
+            msg_prefix = (
+                "The 'task' option has been deprecated and will be "
+                "removed in v0.13.0 or v1.0, whichever comes first."
+            )
             msg_hint = "Please remove this option."
 
             is_generative_task = self.task in _RUNNER_TASKS["generate"]
@@ -488,15 +558,19 @@ class ModelConfig:
                 if is_generative_task:
                     runner = "generate"
                     convert = "auto"
-                    msg_hint = ("Please replace this option with `--runner "
-                                "generate` to continue using this model "
-                                "as a generative model.")
+                    msg_hint = (
+                        "Please replace this option with `--runner "
+                        "generate` to continue using this model "
+                        "as a generative model."
+                    )
                 elif is_pooling_task:
                     runner = "pooling"
                     convert = "auto"
-                    msg_hint = ("Please replace this option with `--runner "
-                                "pooling` to continue using this model "
-                                "as a pooling model.")
+                    msg_hint = (
+                        "Please replace this option with `--runner "
+                        "pooling` to continue using this model "
+                        "as a pooling model."
+                    )
                 else:  # task == "auto"
                     pass
             elif is_generative_model or is_pooling_model:
@@ -507,15 +581,24 @@ class ModelConfig:
                 elif is_pooling_task:
                     runner = "pooling"
                     convert = _task_to_convert(self.task)
-                    msg_hint = ("Please replace this option with `--convert "
-                                f"{convert}` to continue using this model "
-                                "as a pooling model.")
+                    msg_hint = (
+                        "Please replace this option with `--convert "
+                        f"{convert}` to continue using this model "
+                        "as a pooling model."
+                    )
                 else:  # task == "auto"
                     pass
             else:
-                raise AssertionError("The model should be a generative or "
-                                     "pooling model when task is set to "
-                                     f"{self.task!r}.")
+                debug_info = {
+                    "architectures": architectures,
+                    "is_generative_model": is_generative_model,
+                    "is_pooling_model": is_pooling_model,
+                }
+                raise AssertionError(
+                    "The model should be a generative or "
+                    "pooling model when task is set to "
+                    f"{self.task!r}. Found: {debug_info}"
+                )
 
             self.runner = runner
             self.convert = convert
@@ -524,16 +607,15 @@ class ModelConfig:
             warnings.warn(msg, DeprecationWarning, stacklevel=2)
 
         self.runner_type = self._get_runner_type(architectures, self.runner)
-        self.convert_type = self._get_convert_type(architectures,
-                                                   self.runner_type,
-                                                   self.convert)
+        self.convert_type = self._get_convert_type(
+            architectures, self.runner_type, self.convert
+        )
 
         if self.runner_type == "generate" and not is_generative_model:
             generate_converts = _RUNNER_CONVERTS["generate"]
             if self.convert_type not in generate_converts:
                 # Currently we don't have any converters for generative models
-                raise ValueError(
-                    "This model does not support `--runner generate`.")
+                raise ValueError("This model does not support `--runner generate`.")
         if self.runner_type == "pooling" and not is_pooling_model:
             pooling_converts = _RUNNER_CONVERTS["pooling"]
             if self.convert_type not in pooling_converts:
@@ -541,7 +623,8 @@ class ModelConfig:
                 raise ValueError(
                     "This model does not support `--runner pooling`. "
                     f"You can pass `--convert {convert_option} to adapt "
-                    "it into a pooling model.")
+                    "it into a pooling model."
+                )
 
         # Note: Initialize these attributes early because transformers fallback
         # may fail to load dynamic modules in child processes
@@ -556,11 +639,11 @@ class ModelConfig:
                 logger.warning_once(
                     "`override_pooler_config` is deprecated and will be "
                     "removed in v0.12.0 or v1.0.0, whichever is sooner. "
-                    "Please use `pooler_config` instead.")
+                    "Please use `pooler_config` instead."
+                )
 
                 if isinstance(self.override_pooler_config, dict):
-                    self.pooler_config = PoolerConfig(
-                        **self.override_pooler_config)
+                    self.pooler_config = PoolerConfig(**self.override_pooler_config)
                 else:
                     self.pooler_config = self.override_pooler_config
 
@@ -586,49 +669,37 @@ class ModelConfig:
             revision=self.revision,
         )
 
-        # Interleaved attention is not supported by some backends in V0
-        if (not self.disable_sliding_window
-                and is_interleaved(self.hf_text_config)
-                and not envs.VLLM_USE_V1
-                and (backend := envs.VLLM_ATTENTION_BACKEND)
-                in ("XFORMERS", "FLASHINFER")):
-            logger.warning_once(
-                "%s has interleaved attention, which is currently not "
-                "supported by the %s backend. Disabling sliding window and "
-                "capping the max length to the sliding window size (%d).",
-                self.hf_text_config.model_type,
-                backend,
-                self.hf_text_config.sliding_window,
-            )
-            self.disable_sliding_window = True
-
         self.original_max_model_len = self.max_model_len
         self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
         # Init multimodal config if needed
         if self._model_info.supports_multimodal:
-            if (mm_encoder_tp_mode == "data" and
-                    not self._model_info.supports_multimodal_encoder_tp_data):
+            if (
+                mm_encoder_tp_mode == "data"
+                and not self._model_info.supports_multimodal_encoder_tp_data
+            ):
                 logger.warning_once(
                     "This model does not support `--mm-encoder-tp-mode data`. "
-                    "Falling back to `--mm-encoder-tp-mode weights`.")
+                    "Falling back to `--mm-encoder-tp-mode weights`."
+                )
                 mm_encoder_tp_mode = "weights"
 
             mm_config_kwargs = dict(
                 limit_per_prompt=limit_mm_per_prompt,
+                enable_mm_embeds=enable_mm_embeds,
                 media_io_kwargs=media_io_kwargs,
                 mm_processor_kwargs=mm_processor_kwargs,
                 mm_processor_cache_gb=mm_processor_cache_gb,
                 mm_processor_cache_type=mm_processor_cache_type,
                 mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb,
                 mm_encoder_tp_mode=mm_encoder_tp_mode,
+                mm_encoder_attn_backend=mm_encoder_attn_backend,
                 interleave_mm_strings=interleave_mm_strings,
                 skip_mm_profiling=skip_mm_profiling,
                 video_pruning_rate=video_pruning_rate,
             )
 
             mm_config_kwargs = {
-                k: v
-                for k, v in mm_config_kwargs.items() if v is not None
+                k: v for k, v in mm_config_kwargs.items() if v is not None
             }
 
             self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
@@ -660,24 +731,41 @@ class ModelConfig:
         if not isinstance(self.tokenizer, str):
             raise ValueError("tokenizer must be a string after __post_init__.")
         if not isinstance(self.max_model_len, int):
-            raise ValueError(
-                "max_model_len must be an integer after __post_init__.")
+            raise ValueError("max_model_len must be an integer after __post_init__.")
         return self
 
     def _get_transformers_backend_cls(self) -> str:
-        """Determine which Transformers backend class will be used if
+        """Determine which Transformers modeling backend class will be used if
         `model_impl` is set to `transformers` or `auto`."""
-        if getattr(self, "runner_type", self.runner) == "pooling":
-            return "TransformersModel"
-        if self.hf_config != self.hf_text_config:
-            # If 'hf_text_config' is the same as 'hf_config'. If not, it is
-            # probably a composite config, i.e. multimodal
-            return "TransformersForMultimodalLM"
-        return "TransformersForCausalLM"
+        cls = "Transformers"
+        # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal
+        cls += "MultiModal" if self.hf_config != self.hf_text_config else ""
+        cls += "MoE" if self.get_num_experts() > 1 else ""
+        # Check if the architecture we're wrapping has defaults
+        runner = None
+        task = None
+        if defaults := try_match_architecture_defaults(self.architectures[0]):
+            _, (runner, task) = defaults
+        # User specified value take precedence
+        if self.runner != "auto":
+            runner = self.runner
+        # Only consider Transformers modeling backend pooling classes if we're wrapping
+        # an architecture that defaults to pooling. Otherwise, we return the LM class
+        # and use adapters.
+        if runner == "pooling" and task in {"embed", "classify"}:
+            if task == "embed":
+                cls += "EmbeddingModel"
+            elif task == "classify":
+                cls += "ForSequenceClassification"
+        else:
+            cls += "ForCausalLM"
+        return cls
 
     def using_transformers_backend(self) -> bool:
-        """Check if the model is using the Transformers backend class."""
-        return self.architecture == self._get_transformers_backend_cls()
+        """Check if the model is using the Transformers modeling backend class."""
+        used_cls = self._model_info.architecture
+        transformers_backend_cls = self._get_transformers_backend_cls()
+        return used_cls == transformers_backend_cls
 
     @property
     def registry(self):
@@ -692,8 +780,7 @@ class ModelConfig:
         """The architecture vllm actually used."""
         return self._architecture
 
-    def maybe_pull_model_tokenizer_for_runai(self, model: str,
-                                             tokenizer: str) -> None:
+    def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
         """Pull model/tokenizer from Object Storage to temporary
         directory when needed.
 
@@ -708,42 +795,45 @@ class ModelConfig:
         if is_runai_obj_uri(model):
             object_storage_model = ObjectStorageModel(url=model)
             object_storage_model.pull_files(
-                model, allow_pattern=["*.model", "*.py", "*.json"])
+                model, allow_pattern=["*.model", "*.py", "*.json"]
+            )
             self.model_weights = model
             self.model = object_storage_model.dir
 
             # If tokenizer is same as model, download to same directory
             if model == tokenizer:
-                object_storage_model.pull_files(model,
-                                                ignore_pattern=[
-                                                    "*.pt", "*.safetensors",
-                                                    "*.bin", "*.tensors",
-                                                    "*.pth"
-                                                ])
+                object_storage_model.pull_files(
+                    model,
+                    ignore_pattern=[
+                        "*.pt",
+                        "*.safetensors",
+                        "*.bin",
+                        "*.tensors",
+                        "*.pth",
+                    ],
+                )
                 self.tokenizer = object_storage_model.dir
                 return
 
         # Only download tokenizer if needed and not already handled
         if is_runai_obj_uri(tokenizer):
             object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
-            object_storage_tokenizer.pull_files(model,
-                                                ignore_pattern=[
-                                                    "*.pt", "*.safetensors",
-                                                    "*.bin", "*.tensors",
-                                                    "*.pth"
-                                                ])
+            object_storage_tokenizer.pull_files(
+                model,
+                ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
+            )
             self.tokenizer = object_storage_tokenizer.dir
 
     def _get_encoder_config(self):
-        return get_sentence_transformer_tokenizer_config(
-            self.model, self.revision)
+        return get_sentence_transformer_tokenizer_config(self.model, self.revision)
 
     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
         if tokenizer_mode not in get_args(TokenizerMode):
             raise ValueError(
                 f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
-                f"one of {get_args(TokenizerMode)}.")
+                f"one of {get_args(TokenizerMode)}."
+            )
         self.tokenizer_mode = tokenizer_mode
 
     def _get_default_runner_type(
@@ -785,7 +875,8 @@ class ModelConfig:
             logger.info(
                 "Resolved `--runner auto` to `--runner %s`. "
                 "Pass the value explicitly to silence this message.",
-                runner_type)
+                runner_type,
+            )
 
         return runner_type
 
@@ -798,16 +889,16 @@ class ModelConfig:
 
         for arch in architectures:
             if arch in registry.get_supported_archs():
-                if (runner_type == "generate"
-                        and registry.is_text_generation_model(
-                            architectures, self)):
+                if runner_type == "generate" and registry.is_text_generation_model(
+                    architectures, self
+                ):
                     return "none"
-                if (runner_type == "pooling"
-                        and registry.is_pooling_model(architectures, self)):
+                if runner_type == "pooling" and registry.is_pooling_model(
+                    architectures, self
+                ):
                     return "none"
 
-            match = try_match_architecture_defaults(arch,
-                                                    runner_type=runner_type)
+            match = try_match_architecture_defaults(arch, runner_type=runner_type)
             if match:
                 _, (_, convert_type) = match
                 return convert_type
@@ -829,15 +920,15 @@ class ModelConfig:
         if convert != "auto":
             return convert
 
-        convert_type = self._get_default_convert_type(architectures,
-                                                      runner_type)
+        convert_type = self._get_default_convert_type(architectures, runner_type)
 
         # Don't log the most common case
         if convert_type != "none":
             logger.info(
                 "Resolved `--convert auto` to `--convert %s`. "
                 "Pass the value explicitly to silence this message.",
-                convert_type)
+                convert_type,
+            )
 
         return convert_type
 
@@ -849,8 +940,7 @@ class ModelConfig:
             return "classify"
 
         for arch in architectures:
-            match = try_match_architecture_defaults(arch,
-                                                    runner_type="pooling")
+            match = try_match_architecture_defaults(arch, runner_type="pooling")
             if match:
                 _, (_, convert_type) = match
                 assert convert_type != "none"
@@ -868,28 +958,26 @@ class ModelConfig:
             # Set quant_method for ModelOpt models.
             producer_name = quant_cfg.get("producer", {}).get("name")
             if producer_name == "modelopt":
-                quant_algo = quant_cfg.get("quantization",
-                                           {}).get("quant_algo")
+                quant_algo = quant_cfg.get("quantization", {}).get("quant_algo")
                 if quant_algo == "FP8":
                     quant_cfg["quant_method"] = "modelopt"
                 elif quant_algo == "NVFP4":
                     quant_cfg["quant_method"] = "modelopt_fp4"
                 elif quant_algo is not None:
-                    raise ValueError(
-                        f"Unknown ModelOpt quant algo: {quant_algo}")
+                    raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")
 
         return quant_cfg
 
     def _verify_quantization(self) -> None:
         supported_quantization = me_quant.QUANTIZATION_METHODS
         if self.quantization is not None:
-            self.quantization = cast(me_quant.QuantizationMethods,
-                                     self.quantization)
+            self.quantization = cast(me_quant.QuantizationMethods, self.quantization)
 
         # Parse quantization method from the HF model config, if available.
         quant_cfg = self._parse_quant_hf_config(self.hf_config)
-        if quant_cfg is None and (text_config := getattr(
-                self.hf_config, "text_config", None)):
+        if quant_cfg is None and (
+            text_config := getattr(self.hf_config, "text_config", None)
+        ):
             # Check the text config as well for multi-modal models.
             quant_cfg = self._parse_quant_hf_config(text_config)
 
@@ -898,8 +986,9 @@ class ModelConfig:
             quant_method = quant_cfg.get("quant_method", "").lower()
 
             # Normalize library names
-            quant_method = quant_method.replace("compressed_tensors",
-                                                "compressed-tensors")
+            quant_method = quant_method.replace(
+                "compressed_tensors", "compressed-tensors"
+            )
 
             quant_cfg["quant_method"] = quant_method
 
@@ -933,18 +1022,22 @@ class ModelConfig:
             for name in quantization_methods:
                 method = me_quant.get_quantization_config(name)
                 quantization_override = method.override_quantization_method(
-                    quant_cfg, self.quantization)
+                    quant_cfg, self.quantization
+                )
                 if quantization_override is not None:
                     # Raise error if the override is not custom (custom would
                     # be in QUANTIZATION_METHODS but not QuantizationMethods)
                     # and hasn't been added to the overrides list.
-                    if (name in get_args(me_quant.QuantizationMethods)
-                            and name not in overrides):
+                    if (
+                        name in get_args(me_quant.QuantizationMethods)
+                        and name not in overrides
+                    ):
                         raise ValueError(
                             f"Quantization method {name} is an override but "
                             "is has not been added to the `overrides` list "
                             "above. This is necessary to ensure that the "
-                            "overrides are checked in order of preference.")
+                            "overrides are checked in order of preference."
+                        )
                     quant_method = quantization_override
                     self.quantization = quantization_override
                     break
@@ -958,24 +1051,28 @@ class ModelConfig:
                     "Quantization method specified in the model config "
                     f"({quant_method}) does not match the quantization "
                     f"method specified in the `quantization` argument "
-                    f"({self.quantization}).")
+                    f"({self.quantization})."
+                )
 
         if self.quantization is not None:
             if self.quantization not in supported_quantization:
                 raise ValueError(
                     f"Unknown quantization method: {self.quantization}. Must "
-                    f"be one of {supported_quantization}.")
+                    f"be one of {supported_quantization}."
+                )
             from vllm.platforms import current_platform
+
             current_platform.verify_quantization(self.quantization)
 
     def _verify_cuda_graph(self) -> None:
         # CUDAGraph capture not supported for encoder-decoder models on ROCm
         unsupported_rocm = self.is_encoder_decoder
-        if (unsupported_rocm and not self.enforce_eager
-                and current_platform.is_rocm()):
+        if unsupported_rocm and not self.enforce_eager and current_platform.is_rocm():
             logger.warning(
                 "CUDA graph is not supported for %s on ROCm yet, fallback "
-                "to eager mode.", self.hf_config.model_type)
+                "to eager mode.",
+                self.hf_config.model_type,
+            )
             self.enforce_eager = True
 
     def _verify_bnb_config(self) -> None:
@@ -985,39 +1082,36 @@ class ModelConfig:
         # TODO Remove this when bitsandbytes supports.
         """
         is_bitsandbytes = self.quantization == "bitsandbytes"
-        has_quantization_config = (getattr(self.hf_config,
-                                           "quantization_config", None)
-                                   is not None)
-        is_8bit = (self.hf_config.quantization_config.get(
-            "load_in_8bit", False) if has_quantization_config else False)
-        if all([
+        has_quantization_config = (
+            getattr(self.hf_config, "quantization_config", None) is not None
+        )
+        is_8bit = (
+            self.hf_config.quantization_config.get("load_in_8bit", False)
+            if has_quantization_config
+            else False
+        )
+        if all(
+            [
                 is_bitsandbytes,
                 has_quantization_config,
                 is_8bit,
                 not self.enforce_eager,
-        ]):
+            ]
+        ):
             logger.warning(
                 "CUDA graph is not supported on BitsAndBytes 8bit yet, "
-                "fallback to the eager mode.")
+                "fallback to the eager mode."
+            )
 
             self.enforce_eager = True
 
     def _verify_with_expert_parallelism(self) -> None:
-        num_expert_names = [
-            "moe_num_experts",  # Dbrx
-            "num_experts",  # Jamba
-            "n_routed_experts",  # DeepSeek
-            "num_local_experts",  # Mixtral
-        ]
-        num_experts = 0
-        for name in num_expert_names:
-            num_experts = getattr(self.hf_text_config, name, 0)
-            if num_experts > 0:
-                break
+        num_experts = self.get_num_experts()
         if num_experts < 1:
             raise ValueError(
                 "Number of experts in the model must be greater than 0 "
-                "when expert parallelism is enabled.")
+                "when expert parallelism is enabled."
+            )
 
     def verify_dual_chunk_attention_config(
         self,
@@ -1026,47 +1120,81 @@ class ModelConfig:
         if hasattr(self.hf_config, "dual_chunk_attention_config"):
             # Try loading the sparse attention config
             from vllm.model_executor.model_loader.weight_utils import (
-                get_sparse_attention_config)
+                get_sparse_attention_config,
+            )
+
             sparse_attn_config = get_sparse_attention_config(self, load_config)
             if sparse_attn_config:
                 self.hf_config.dual_chunk_attention_config[
-                    "sparse_attention_config"] = sparse_attn_config
-                if "sparse_attention_enabled" not in \
-                        self.hf_config.dual_chunk_attention_config:
+                    "sparse_attention_config"
+                ] = sparse_attn_config
+                if (
+                    "sparse_attention_enabled"
+                    not in self.hf_config.dual_chunk_attention_config
+                ):
                     self.hf_config.dual_chunk_attention_config[
-                        "sparse_attention_enabled"] = True
+                        "sparse_attention_enabled"
+                    ] = True
 
     def verify_with_parallel_config(
         self,
         parallel_config: ParallelConfig,
     ) -> None:
-
         if parallel_config.distributed_executor_backend == "external_launcher":
             assert self.seed is not None, (
                 "Seed must be set when using external launcher backend to "
-                "make sure sampling results are the same across workers.")
+                "make sure sampling results are the same across workers."
+            )
 
-        total_num_attention_heads = getattr(self.hf_text_config,
-                                            "num_attention_heads", 0)
+        total_num_attention_heads = getattr(
+            self.hf_text_config, "num_attention_heads", 0
+        )
         tensor_parallel_size = parallel_config.tensor_parallel_size
         if total_num_attention_heads % tensor_parallel_size != 0:
             raise ValueError(
                 f"Total number of attention heads ({total_num_attention_heads})"
                 " must be divisible by tensor parallel size "
-                f"({tensor_parallel_size}).")
+                f"({tensor_parallel_size})."
+            )
 
         if parallel_config.enable_expert_parallel:
             self._verify_with_expert_parallelism()
 
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
-        if (pipeline_parallel_size > 1
-                and not self.registry.is_pp_supported_model(
-                    self.architectures, self)):
+        if pipeline_parallel_size > 1 and not self.registry.is_pp_supported_model(
+            self.architectures, self
+        ):
             raise NotImplementedError(
                 "Pipeline parallelism is not supported for this model. "
-                "Supported models implement the `SupportsPP` interface.")
+                "Supported models implement the `SupportsPP` interface."
+            )
+
+        decode_context_parallel_size = parallel_config.decode_context_parallel_size
+        if decode_context_parallel_size > 1 and not self.use_mla:
+            total_num_kv_heads = self.get_total_num_kv_heads()
+            assert tensor_parallel_size > total_num_kv_heads, (
+                f"tensor parallel size {tensor_parallel_size} must be greater "
+                f"than total num kv heads {total_num_kv_heads} when enable "
+                f"decode context parallel for GQA/MQA"
+            )
+
+            max_dcp_size = tensor_parallel_size // total_num_kv_heads
+            assert decode_context_parallel_size <= max_dcp_size, (
+                f"decode context parallel size must less than or equal to "
+                f"(tensor parallel size {tensor_parallel_size} // total "
+                f"num kv heads {total_num_kv_heads}) = {max_dcp_size}, "
+                f"but got {decode_context_parallel_size}"
+            )
 
-    def get_sliding_window(self) -> Optional[int]:
+            num_q_per_kv = total_num_attention_heads // total_num_kv_heads
+            assert num_q_per_kv % decode_context_parallel_size == 0, (
+                f"Total number of q per kv attn heads ({num_q_per_kv})"
+                " must be divisible by dcp world size when enable "
+                "decode context parallel for GQA "
+                f"({parallel_config.decode_context_parallel_size})."
+            )
+
+    def get_sliding_window(self) -> int | None:
         """Get the sliding window size from the HF text config if present."""
         return getattr(self.hf_text_config, "sliding_window", None)
 
@@ -1080,34 +1208,42 @@ class ModelConfig:
     def is_deepseek_mla(self) -> bool:
         if not hasattr(self.hf_text_config, "model_type"):
             return False
-        elif self.hf_text_config.model_type in \
-            ('deepseek_v2', 'deepseek_v3', 'deepseek_v32', 'deepseek_mtp',
-              'kimi_k2', 'longcat_flash'):
+        elif self.hf_text_config.model_type in (
+            "deepseek_v2",
+            "deepseek_v3",
+            "deepseek_v32",
+            "deepseek_mtp",
+            "kimi_k2",
+            "kimi_linear",
+            "longcat_flash",
+            "pangu_ultra_moe",
+            "pangu_ultra_moe_mtp",
+        ):
             return self.hf_text_config.kv_lora_rank is not None
-        elif self.hf_text_config.model_type == 'eagle':
+        elif self.hf_text_config.model_type == "eagle":
             # if the model is an EAGLE module, check for the
             # underlying architecture
-            return self.hf_text_config.model.model_type in \
-                    ('deepseek_v2', 'deepseek_v3', 'deepseek_v32') \
+            return (
+                self.hf_text_config.model.model_type
+                in ("deepseek_v2", "deepseek_v3", "deepseek_v32")
                 and self.hf_text_config.kv_lora_rank is not None
+            )
         return False
 
     def get_head_size(self) -> int:
         # TODO remove hard code
         if self.is_deepseek_mla:
-            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim",
-                                       0)
+            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0)
             if self.use_mla:
                 return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
             else:
-                qk_nope_head_dim = getattr(self.hf_text_config,
-                                           "qk_nope_head_dim", 0)
+                qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0)
                 if qk_rope_head_dim and qk_nope_head_dim:
                     return qk_rope_head_dim + qk_nope_head_dim
 
-        if hasattr(self.hf_text_config,
-                   "model_type") and (self.hf_text_config.model_type
-                                      == "zamba2"):
+        if hasattr(self.hf_text_config, "model_type") and (
+            self.hf_text_config.model_type == "zamba2"
+        ):
             return self.hf_text_config.attention_head_dim
 
         if self.is_attention_free:
@@ -1118,13 +1254,13 @@ class ModelConfig:
             return self.hf_text_config.head_dim
 
         # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
-        if getattr(self.hf_text_config, "hidden_size_per_head",
-                   None) is not None:
+        if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None:
             return self.hf_text_config.hidden_size_per_head
 
         # FIXME(woosuk): This may not be true for all models.
-        return (self.hf_text_config.hidden_size //
-                self.hf_text_config.num_attention_heads)
+        return (
+            self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads
+        )
 
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
@@ -1135,9 +1271,11 @@ class ModelConfig:
         falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
         new_decoder_arch_falcon = (
             self.hf_config.model_type in falcon_model_types
-            and getattr(self.hf_config, "new_decoder_architecture", False))
-        if not new_decoder_arch_falcon and getattr(self.hf_text_config,
-                                                   "multi_query", False):
+            and getattr(self.hf_config, "new_decoder_architecture", False)
+        )
+        if not new_decoder_arch_falcon and getattr(
+            self.hf_text_config, "multi_query", False
+        ):
             # Multi-query attention, only one KV head.
             # Currently, tensor parallelism is not supported in this case.
             return 1
@@ -1148,14 +1286,19 @@ class ModelConfig:
                 return self.hf_config.attn_config["kv_n_heads"]
             return self.hf_config.num_attention_heads
         if self.hf_config.model_type == "dbrx":
-            return getattr(self.hf_config.attn_config, "kv_n_heads",
-                           self.hf_config.num_attention_heads)
+            return getattr(
+                self.hf_config.attn_config,
+                "kv_n_heads",
+                self.hf_config.num_attention_heads,
+            )
 
         if self.hf_config.model_type == "nemotron-nas":
             for block in self.hf_config.block_configs:
                 if not block.attention.no_op:
-                    return self.hf_config.num_attention_heads \
+                    return (
+                        self.hf_config.num_attention_heads
                         // block.attention.n_heads_in_group
+                    )
 
             raise RuntimeError("Couldn't determine number of kv heads")
 
@@ -1191,32 +1334,56 @@ class ModelConfig:
         # the tensor parallel size. We will replicate the KV heads in the
         # case where the number of KV heads is smaller than the tensor
         # parallel size so each GPU has at least one KV head.
-        return max(1,
-                   total_num_kv_heads // parallel_config.tensor_parallel_size)
+        return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)
 
     def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int:
         num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
         return num_heads // parallel_config.tensor_parallel_size
 
+    def get_num_experts(self) -> int:
+        """Returns the number of experts in the model."""
+        num_expert_names = [
+            "num_experts",  # Jamba
+            "moe_num_experts",  # Dbrx
+            "n_routed_experts",  # DeepSeek
+            "num_local_experts",  # Mixtral
+        ]
+        num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
+        if isinstance(num_experts, list):
+            # Ernie VL's remote code uses list[int]...
+            # The values are always the same so we just take the first one.
+            return num_experts[0]
+        # Coerce to 0 if explicitly set to None
+        return num_experts or 0
+
     def get_layers_start_end_indices(
-            self, parallel_config: ParallelConfig) -> tuple[int, int]:
+        self, parallel_config: ParallelConfig
+    ) -> tuple[int, int]:
         from vllm.distributed.utils import get_pp_indices
-        if (self.hf_text_config.model_type == "deepseek_mtp"
-                or self.hf_config.model_type == "mimo_mtp"
-                or self.hf_config.model_type == "glm4_moe_mtp"
-                or self.hf_config.model_type == "ernie_mtp"
-                or self.hf_config.model_type == "qwen3_next_mtp"):
-            total_num_hidden_layers = getattr(self.hf_text_config,
-                                              "num_nextn_predict_layers", 0)
-        elif (self.hf_config.model_type == "longcat_flash_mtp"):
-            total_num_hidden_layers = getattr(self.hf_text_config,
-                                              "num_nextn_predict_layers", 1)
+
+        if (
+            self.hf_text_config.model_type == "deepseek_mtp"
+            or self.hf_config.model_type == "mimo_mtp"
+            or self.hf_config.model_type == "glm4_moe_mtp"
+            or self.hf_config.model_type == "ernie_mtp"
+            or self.hf_config.model_type == "qwen3_next_mtp"
+            or self.hf_config.model_type == "pangu_ultra_moe_mtp"
+        ):
+            total_num_hidden_layers = getattr(
+                self.hf_text_config, "num_nextn_predict_layers", 0
+            )
+        elif self.hf_config.model_type == "longcat_flash_mtp":
+            total_num_hidden_layers = getattr(
+                self.hf_text_config, "num_nextn_predict_layers", 1
+            )
         else:
-            total_num_hidden_layers = getattr(self.hf_text_config,
-                                              "num_hidden_layers", 0)
+            total_num_hidden_layers = getattr(
+                self.hf_text_config, "num_hidden_layers", 0
+            )
         # the layout order is: DP x PP x TP
-        pp_rank = (parallel_config.rank // parallel_config.tensor_parallel_size
-                   ) % parallel_config.pipeline_parallel_size
+        pp_rank = (
+            parallel_config.rank // parallel_config.tensor_parallel_size
+        ) % parallel_config.pipeline_parallel_size
         pp_size = parallel_config.pipeline_parallel_size
         start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size)
         return start, end
@@ -1228,14 +1395,14 @@ class ModelConfig:
     def get_num_layers_by_block_type(
         self,
         parallel_config: ParallelConfig,
-        block_type: LayerBlockType = LayerBlockType.attention,
+        block_type: LayerBlockType = "attention",
     ) -> int:
         # This function relies on 'layers_block_type' in hf_config,
         # for w/o this attribute, we will need to have workarounds like so
-        attn_block_type = block_type == LayerBlockType.attention
-        is_transformer = not self.is_hybrid and \
-                            not self.has_noops and \
-                            not self.is_attention_free
+        attn_block_type = block_type == "attention"
+        is_transformer = (
+            not self.is_hybrid and not self.has_noops and not self.is_attention_free
+        )
         start, end = self.get_layers_start_end_indices(parallel_config)
 
         if is_transformer:
@@ -1248,23 +1415,23 @@ class ModelConfig:
             return 0 if attn_block_type else end - start
         elif self.has_noops:
             block_configs = self.hf_config.block_configs
-            return sum(not bc.attention.no_op
-                       for bc in block_configs[start:end])
+            return sum(not bc.attention.no_op for bc in block_configs[start:end])
         else:
             # Hybrid model Jamba
-            layers_block_type_value = getattr(self.hf_text_config,
-                                              "layers_block_type", None)
+            layers_block_type_value = getattr(
+                self.hf_text_config, "layers_block_type", None
+            )
             if layers_block_type_value is not None:
-                if hasattr(self.hf_text_config,
-                           "model_type") and (self.hf_text_config.model_type
-                                              == "zamba2"):
+                if hasattr(self.hf_text_config, "model_type") and (
+                    self.hf_text_config.model_type == "zamba2"
+                ):
                     if attn_block_type:
-                        return sum(t == "hybrid"
-                                   for t in layers_block_type_value[start:end])
+                        return sum(
+                            t == "hybrid" for t in layers_block_type_value[start:end]
+                        )
                     else:
                         return self.get_num_layers(parallel_config)
-                return sum(t == block_type.value
-                           for t in layers_block_type_value[start:end])
+                return sum(t == block_type for t in layers_block_type_value[start:end])
 
             # Hybrid model Minimax
             attn_type_list = getattr(self.hf_config, "attn_type_list", None)
@@ -1274,26 +1441,29 @@ class ModelConfig:
             # Hybrid model Qwen3Next
             layer_types_value = getattr(self.hf_config, "layer_types", None)
             if layer_types_value is not None:
-                if getattr(block_type, "value", block_type) == "attention":
-                    return sum(t == "full_attention"
-                               for t in layer_types_value[start:end])
-                elif getattr(block_type, "value",
-                             block_type) == "linear_attention":
-                    return sum(t == "linear_attention"
-                               for t in layer_types_value[start:end])
+                if block_type == "attention":
+                    return sum(
+                        t == "full_attention" for t in layer_types_value[start:end]
+                    )
+                elif block_type == "linear_attention":
+                    return sum(
+                        t == "linear_attention" for t in layer_types_value[start:end]
+                    )
                 else:
-                    return sum(t == getattr(block_type, "value", block_type)
-                               for t in layer_types_value[start:end])
+                    return sum(t == block_type for t in layer_types_value[start:end])
 
-            if (layers_block_type_value is None and attn_type_list is None
-                    and layer_types_value is None):
+            if (
+                layers_block_type_value is None
+                and attn_type_list is None
+                and layer_types_value is None
+            ):
                 raise ValueError(
-                    "The model is an hybrid without a"
-                    "layers_block_type or an attn_type_list, or a layer_types "
-                    "in the hf_config, cannot determine the num of "
-                    f"{block_type.value} layers")
+                    "The model is an hybrid without a layers_block_type or an "
+                    "attn_type_list, or a layer_types in the hf_config, "
+                    f"cannot determine the num of {block_type} layers"
+                )
 
-    def get_mamba_chunk_size(self) -> Optional[int]:
+    def get_mamba_chunk_size(self) -> int | None:
         """
         Returns the mamba chunk size if it exists
         """
@@ -1302,6 +1472,12 @@ class ModelConfig:
         if chunk_size is None:
             # used by e.g. Mamba2, NemotronH, Zamba
             chunk_size = getattr(self.hf_text_config, "chunk_size", None)
+
+        # Since Mamba1 does not have a chunk notion
+        # we use a default chunk size of 1024.
+        if chunk_size is None:
+            chunk_size = 2048
+
         return chunk_size
 
     def get_multimodal_config(self) -> MultiModalConfig:
@@ -1333,11 +1509,13 @@ class ModelConfig:
                 self.hf_config_path or self.model,
                 trust_remote_code=self.trust_remote_code,
                 revision=self.revision,
+                config_format=self.config_format,
             )
         else:
             config = try_get_generation_config(
                 self.generation_config,
                 trust_remote_code=self.trust_remote_code,
+                config_format=self.config_format,
             )
 
         if config is None:
@@ -1378,14 +1556,14 @@ class ModelConfig:
         ]
         if any(p in config for p in available_params):
             diff_sampling_param = {
-                p: config.get(p)
-                for p in available_params if config.get(p) is not None
+                p: config.get(p) for p in available_params if config.get(p) is not None
             }
             # Huggingface definition of max_new_tokens is equivalent
             # to vLLM's max_tokens
             if "max_new_tokens" in diff_sampling_param:
                 diff_sampling_param["max_tokens"] = diff_sampling_param.pop(
-                    "max_new_tokens")
+                    "max_new_tokens"
+                )
         else:
             diff_sampling_param = {}
 
@@ -1394,7 +1572,8 @@ class ModelConfig:
                 "Default sampling parameters have been overridden by the "
                 "model's Hugging Face generation config recommended from the "
                 "model creator. If this is not intended, please relaunch "
-                "vLLM instance with `--generation-config vllm`.")
+                "vLLM instance with `--generation-config vllm`."
+            )
         return diff_sampling_param
 
     @property
@@ -1402,6 +1581,29 @@ class ModelConfig:
         """Extract the HF encoder/decoder model flag."""
         return is_encoder_decoder(self.hf_config)
 
+    @property
+    def uses_alibi(self) -> bool:
+        cfg = self.hf_text_config
+
+        return (
+            getattr(cfg, "alibi", False)  # Falcon
+            or "BloomForCausalLM" in self.architectures  # Bloom
+            or getattr(cfg, "position_encoding_type", "") == "alibi"  # codellm_1b_alibi
+            or (
+                hasattr(cfg, "attn_config")  # MPT
+                and (
+                    (
+                        isinstance(cfg.attn_config, dict)
+                        and cfg.attn_config.get("alibi", False)
+                    )
+                    or (
+                        not isinstance(cfg.attn_config, dict)
+                        and getattr(cfg.attn_config, "alibi", False)
+                    )
+                )
+            )
+        )
+
     @property
     def uses_mrope(self) -> bool:
         return uses_mrope(self.hf_config)
@@ -1416,8 +1618,9 @@ class ModelConfig:
 
     @property
     def is_cross_encoder(self) -> bool:
-        return (self._model_info.supports_cross_encoding
-                or self.convert_type == "classify")
+        return (
+            self._model_info.supports_cross_encoding or self.convert_type == "classify"
+        )
 
     @property
     def is_pp_supported(self) -> bool:
@@ -1429,6 +1632,13 @@ class ModelConfig:
 
     @property
     def is_hybrid(self) -> bool:
+        # Handle granite-4.0-micro case which uses hybrid config but does not
+        # actually contain any non-attention layers.
+        layer_types = getattr(self.hf_config, "layer_types", None)
+        if layer_types is not None and all(
+            layer == "attention" for layer in layer_types
+        ):
+            return False
         return self._model_info.is_hybrid
 
     @property
@@ -1440,8 +1650,8 @@ class ModelConfig:
         return self._model_info.has_inner_state
 
     @property
-    def is_v1_compatible(self) -> bool:
-        return not self._model_info.supports_v0_only
+    def supports_mamba_prefix_caching(self) -> bool:
+        return self._model_info.supports_mamba_prefix_caching
 
     @property
     def use_mla(self) -> bool:
@@ -1449,8 +1659,9 @@ class ModelConfig:
 
     @property
     def is_matryoshka(self) -> bool:
-        return (bool(getattr(self.hf_config, "matryoshka_dimensions", None))
-                or getattr(self.hf_config, "is_matryoshka", False))
+        return bool(getattr(self.hf_config, "matryoshka_dimensions", None)) or getattr(
+            self.hf_config, "is_matryoshka", False
+        )
 
     @property
     def matryoshka_dimensions(self):
@@ -1474,35 +1685,57 @@ class ModelConfig:
         you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
         """
 
-        head_dtype = _get_head_dtype(config=self.hf_config,
-                                     dtype=self.dtype,
-                                     runner_type=self.runner_type)
+        head_dtype = _get_head_dtype(
+            config=self.hf_config, dtype=self.dtype, runner_type=self.runner_type
+        )
 
         if self.runner_type != "pooling" and head_dtype != self.dtype:
             logger.warning_once(
                 "`head_dtype` currently only supports pooling models."
-                "fallback to model dtype [%s].", self.dtype)
+                "fallback to model dtype [%s].",
+                self.dtype,
+            )
             return self.dtype
 
         if head_dtype not in current_platform.supported_dtypes:
             logger.warning_once(
                 "The current platform does not support [%s] head dtype, "
-                "fallback to model dtype [%s].", head_dtype, self.dtype)
+                "fallback to model dtype [%s].",
+                head_dtype,
+                self.dtype,
+            )
             return self.dtype
 
         logger.debug_once("head dtype: %s", head_dtype)
         return head_dtype
 
+    @property
+    def hidden_size(self):
+        if hasattr(self.hf_config, "hidden_size"):
+            return self.hf_config.hidden_size
+        text_config = self.hf_config.get_text_config()
+        return text_config.hidden_size
+
+    @property
+    def embedding_size(self):
+        dense_modules = try_get_dense_modules(self.model, revision=self.revision)
+        if dense_modules is not None:
+            return dense_modules[-1]["out_features"]
+        return self.hidden_size
+
     def get_and_verify_max_len(self, max_model_len: int):
         # Consider max_model_len in tokenizer_config only when
         # pooling models use absolute position_embedding.
         tokenizer_config = None
-        if (self.runner_type == "pooling" and getattr(
-                self.hf_config, "position_embedding_type", "") == "absolute"):
+        if (
+            self.runner_type == "pooling"
+            and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
+        ):
             tokenizer_config = try_get_tokenizer_config(
                 self.tokenizer,
                 trust_remote_code=self.trust_remote_code,
-                revision=self.tokenizer_revision)
+                revision=self.tokenizer_revision,
+            )
         max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
             tokenizer_config=tokenizer_config,
@@ -1510,13 +1743,13 @@ class ModelConfig:
             disable_sliding_window=self.disable_sliding_window,
             sliding_window=self.get_sliding_window(),
             spec_target_max_model_len=self.spec_target_max_model_len,
-            encoder_config=self.encoder_config)
+            encoder_config=self.encoder_config,
+        )
         logger.info("Using max model len %s", max_model_len)
         return max_model_len
 
 
-def get_served_model_name(model: str,
-                          served_model_name: Optional[Union[str, list[str]]]):
+def get_served_model_name(model: str, served_model_name: str | list[str] | None):
     """
     If the input is a non-empty list, the first model_name in
     `served_model_name` is taken.
@@ -1560,14 +1793,18 @@ def iter_architecture_defaults():
 def try_match_architecture_defaults(
     architecture: str,
     *,
-    runner_type: Optional[RunnerType] = None,
-    convert_type: Optional[ConvertType] = None,
-) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
-    for suffix, (default_runner_type,
-                 default_convert_type) in iter_architecture_defaults():
-        if ((runner_type is None or runner_type == default_runner_type) and
-            (convert_type is None or convert_type == default_convert_type)
-                and architecture.endswith(suffix)):
+    runner_type: RunnerType | None = None,
+    convert_type: ConvertType | None = None,
+) -> tuple[str, tuple[RunnerType, ConvertType]] | None:
+    for suffix, (
+        default_runner_type,
+        default_convert_type,
+    ) in iter_architecture_defaults():
+        if (
+            (runner_type is None or runner_type == default_runner_type)
+            and (convert_type is None or convert_type == default_convert_type)
+            and architecture.endswith(suffix)
+        ):
             return suffix, (default_runner_type, default_convert_type)
 
     return None
@@ -1585,8 +1822,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
 _FLOAT16_NOT_SUPPORTED_MODELS = {
     "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
     "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.",
-    "gemma3_text":
-    "Numerical instability. Please use bfloat16 or float32 instead.",
+    "gemma3_text": "Numerical instability. Please use bfloat16 or float32 instead.",
     "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.",
     "glm4": "Numerical instability. Please use bfloat16 or float32 instead.",
 }
@@ -1602,8 +1838,9 @@ def _is_valid_dtype(model_type: str, dtype: torch.dtype):
 def _check_valid_dtype(model_type: str, dtype: torch.dtype):
     if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:
         reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type]
-        raise ValueError(f"The model type {model_type!r} "
-                         f"does not support float16. Reason: {reason}")
+        raise ValueError(
+            f"The model type {model_type!r} does not support float16. Reason: {reason}"
+        )
 
     return True
 
@@ -1612,20 +1849,20 @@ def _find_dtype(
     model_id: str,
     config: PretrainedConfig,
     *,
-    revision: Optional[str],
+    revision: str | None,
 ):
-    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
-    # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    # NOTE: getattr(config, "dtype", torch.float32) is not correct
+    # because config.dtype can be None.
+    config_dtype = getattr(config, "dtype", None)
 
     # Fallbacks for multi-modal models if the root config
-    # does not define torch_dtype
+    # does not define dtype
     if config_dtype is None:
-        config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
+        config_dtype = getattr(config.get_text_config(), "dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
-        config_dtype = getattr(config.vision_config, "torch_dtype", None)
+        config_dtype = getattr(config.vision_config, "dtype", None)
     if config_dtype is None and hasattr(config, "encoder_config"):
-        config_dtype = getattr(config.encoder_config, "torch_dtype", None)
+        config_dtype = getattr(config.encoder_config, "dtype", None)
 
     # Try to read the dtype of the weights if they are in safetensors format
     if config_dtype is None:
@@ -1657,7 +1894,8 @@ def _resolve_auto_dtype(
     from vllm.platforms import current_platform
 
     supported_dtypes = [
-        dtype for dtype in current_platform.supported_dtypes
+        dtype
+        for dtype in current_platform.supported_dtypes
         if _is_valid_dtype(model_type, dtype)
     ]
 
@@ -1684,8 +1922,7 @@ def _resolve_auto_dtype(
         device_str = f"{device_name!r} (with compute capability {version_str})"
 
     logger.warning(
-        "Your device %s doesn't support %s. "
-        "Falling back to %s for compatibility.",
+        "Your device %s doesn't support %s. Falling back to %s for compatibility.",
         device_str,
         config_dtype,
         preferred_dtype,
@@ -1697,10 +1934,10 @@ def _resolve_auto_dtype(
 def _get_and_verify_dtype(
     model_id: str,
     config: PretrainedConfig,
-    dtype: Union[str, torch.dtype],
+    dtype: str | torch.dtype,
     *,
     is_pooling_model: bool,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ) -> torch.dtype:
     config_dtype = _find_dtype(model_id, config, revision=revision)
     model_type = config.model_type
@@ -1739,11 +1976,10 @@ def _get_and_verify_dtype(
     return torch_dtype
 
 
-def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype,
-                    runner_type: str) -> torch.dtype:
-    head_dtype: Optional[Union[str,
-                               torch.dtype]] = getattr(config, "head_dtype",
-                                                       None)
+def _get_head_dtype(
+    config: PretrainedConfig, dtype: torch.dtype, runner_type: str
+) -> torch.dtype:
+    head_dtype: str | torch.dtype | None = getattr(config, "head_dtype", None)
 
     if head_dtype == "model":
         return dtype
@@ -1766,12 +2002,12 @@ def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype,
 
 def _get_and_verify_max_len(
     hf_config: PretrainedConfig,
-    tokenizer_config: Optional[dict],
-    max_model_len: Optional[int],
+    tokenizer_config: dict | None,
+    max_model_len: int | None,
     disable_sliding_window: bool,
-    sliding_window: Optional[int],
-    spec_target_max_model_len: Optional[int] = None,
-    encoder_config: Optional[Any] = None,
+    sliding_window: int | None,
+    spec_target_max_model_len: int | None = None,
+    encoder_config: Any | None = None,
 ) -> int:
     """Get and verify the model's maximum length."""
     derived_max_model_len = float("inf")
@@ -1798,8 +2034,7 @@ def _get_and_verify_max_len(
     for key in possible_keys:
         max_len = getattr(hf_config, key, None)
         if max_len is not None:
-            max_len_key = key if max_len < derived_max_model_len \
-                else max_len_key
+            max_len_key = key if max_len < derived_max_model_len else max_len_key
             derived_max_model_len = min(derived_max_model_len, max_len)
     # For Command-R / Cohere, Cohere2 / Aya Vision models
     if tmp_max_len := getattr(hf_config, "model_max_length", None):
@@ -1808,17 +2043,20 @@ def _get_and_verify_max_len(
 
     # If sliding window is manually disabled, max_length should be less
     # than the sliding window length in the model config.
-    if (disable_sliding_window and sliding_window is not None
-            and sliding_window < derived_max_model_len):
+    if (
+        disable_sliding_window
+        and sliding_window is not None
+        and sliding_window < derived_max_model_len
+    ):
         max_len_key = "sliding_window"
         derived_max_model_len = sliding_window
 
     # Consider model_max_length in tokenizer_config
     if tokenizer_config:
         tokenizer_model_max_length = tokenizer_config.get(
-            "model_max_length", derived_max_model_len)
-        derived_max_model_len = min(derived_max_model_len,
-                                    tokenizer_model_max_length)
+            "model_max_length", derived_max_model_len
+        )
+        derived_max_model_len = min(derived_max_model_len, tokenizer_model_max_length)
 
     # If none of the keys were found in the config, use a default and
     # log a warning.
@@ -1836,8 +2074,10 @@ def _get_and_verify_max_len(
         logger.warning(
             "The model's config.json does not contain any of the following "
             "keys to determine the original maximum length of the model: "
-            "%s. Assuming the model's maximum length is %d.", possible_keys,
-            default_max_len)
+            "%s. Assuming the model's maximum length is %d.",
+            possible_keys,
+            default_max_len,
+        )
         derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
@@ -1855,32 +2095,37 @@ def _get_and_verify_max_len(
                 raise NotImplementedError(
                     "Disabling sliding window is not supported for models "
                     "with rope_scaling. Please raise an issue so we can "
-                    "investigate.")
+                    "investigate."
+                )
 
             # NOTE: rope_type == "default" does not define factor
             # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
             scaling_factor = rope_scaling.get("factor", 1.0)
 
             if rope_type == "yarn":
-                derived_max_model_len = rope_scaling[
-                    "original_max_position_embeddings"]
+                derived_max_model_len = rope_scaling["original_max_position_embeddings"]
             derived_max_model_len *= scaling_factor
 
     if encoder_config and "max_seq_length" in encoder_config:
         derived_max_model_len = encoder_config["max_seq_length"]
 
+    # If the user didn't specify `max_model_len`, then use that derived from
+    # the model config as a default value.
+    if max_model_len is None:
+        # For LongRoPE, default to original_max_position_embeddings to avoid
+        # performance degradation for shorter sequences
+        if rope_scaling is not None and rope_scaling["rope_type"] == "longrope":
+            max_model_len = int(
+                getattr(
+                    hf_config, "original_max_position_embeddings", derived_max_model_len
+                )
+            )
+        else:
+            max_model_len = int(derived_max_model_len)
+        max_model_len = current_platform.check_max_model_len(max_model_len)
+
     # If the user specified a max length, make sure it is smaller than the
     # derived length from the HF model config.
-    if max_model_len is None:
-        max_model_len = int(derived_max_model_len)
-        if current_platform.is_tpu():
-            logger.warning(
-                "--max-model-len is not specified, "
-                "it's currently using model's default length %s, "
-                "which might be too large."
-                "Please input with --max-model-len based on your "
-                "request input length and output length, to avoid "
-                "unnecessary degradation.", max_model_len)
     elif max_model_len > derived_max_model_len:
         # Some models might have a separate key for specifying model_max_length
         # that will be bigger than derived_max_model_len. We compare user input
@@ -1893,24 +2138,28 @@ def _get_and_verify_max_len(
                 raise NotImplementedError(
                     "Disabling sliding window is not supported for models "
                     "model_max_length in the config. Please raise an issue "
-                    "so we can investigate.")
+                    "so we can investigate."
+                )
         else:
             msg = (
                 f"User-specified max_model_len ({max_model_len}) is greater "
                 f"than the derived max_model_len ({max_len_key}="
                 f"{derived_max_model_len} or model_max_length="
-                f"{model_max_length} in model's config.json).")
+                f"{model_max_length} in model's config.json)."
+            )
             warning = (
                 "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
                 "caution. If the model uses relative position encoding (RoPE), "
                 "positions exceeding derived_max_model_len lead to nan. If the "
                 "model uses absolute position encoding, positions exceeding "
                 "derived_max_model_len will cause a CUDA array out-of-bounds "
-                "error.")
+                "error."
+            )
             if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
                 logger.warning_once("%s %s", msg, warning)
             else:
                 raise ValueError(
                     f"{msg} To allow overriding this maximum, set "
-                    f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
+                    f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}"
+                )
     return int(max_model_len)
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 569de957990023b8c83674d5e582da322e44fa19..9f62b35ed515ceaf3b515cbad46cdc25125afec9 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -3,16 +3,55 @@
 
 import hashlib
 from collections.abc import Mapping
-from dataclasses import field
-from typing import Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal, TypeAlias
 
+from pydantic import ConfigDict, Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 
-import vllm.envs as envs
 from vllm.config.utils import config
 
+if TYPE_CHECKING:
+    from vllm.attention.backends.registry import AttentionBackendEnum
+else:
+    AttentionBackendEnum = Any
+
+
+@dataclass
+class BaseDummyOptions:
+    """Base options for generating dummy data during profiling."""
+
+    count: int = Field(999, ge=0)
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class VideoDummyOptions(BaseDummyOptions):
+    """Options for generating dummy video data during profiling."""
+
+    num_frames: int | None = Field(None, gt=0)
+    width: int | None = Field(None, gt=0)
+    height: int | None = Field(None, gt=0)
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class ImageDummyOptions(BaseDummyOptions):
+    """Options for generating dummy image data during profiling."""
+
+    width: int | None = Field(None, gt=0)
+    height: int | None = Field(None, gt=0)
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class AudioDummyOptions(BaseDummyOptions):
+    """Options for generating dummy audio data during profiling."""
+
+    length: int | None = Field(None, gt=0)
+
+
 MMEncoderTPMode = Literal["weights", "data"]
 MMCacheType = Literal["shm", "lru"]
+DummyOptions: TypeAlias = (
+    BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
+)
 
 
 @config
@@ -20,17 +59,35 @@ MMCacheType = Literal["shm", "lru"]
 class MultiModalConfig:
     """Controls the behavior of multimodal models."""
 
-    limit_per_prompt: dict[str, int] = field(default_factory=dict)
-    """The maximum number of input items allowed per prompt for each modality.
-    Defaults to 1 (V0) or 999 (V1) for each modality.
+    limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
+    """The maximum number of input items and options allowed per 
+        prompt for each modality.
+    Defaults to 999 for each modality.
+
+    Legacy format (count only):
+        {"image": 16, "video": 2}
 
-    For example, to allow up to 16 images and 2 videos per prompt:
-    `{"image": 16, "video": 2}`"""
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    Configurable format (with options):
+        {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, 
+        "image": {"count": 5, "width": 512, "height": 512}}
+
+    Mixed format (combining both):
+        {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, 
+        "height": 512}}
+    """
+    enable_mm_embeds: bool = False
+    """If `True`, enables passing multimodal embeddings:
+    for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
+    for the OpenAI-compatible server, this refers to chat messages with content
+    `"type": "*_embeds"`.
+
+    WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!"""
+    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
     """Additional args passed to process media inputs, keyed by modalities.
     For example, to set num_frames for video, set
     `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
-    mm_processor_kwargs: Optional[dict[str, object]] = None
+    mm_processor_kwargs: dict[str, object] | None = None
     """Arguments to be forwarded to the model's processor for multi-modal data,
     e.g., image processor. Overrides for the multi-modal processor obtained
     from `transformers.AutoProcessor.from_pretrained`.
@@ -39,7 +96,7 @@ class MultiModalConfig:
 
     For example, for Phi-3-Vision:
     `{"num_crops": 4}`."""
-    mm_processor_cache_gb: float = 4
+    mm_processor_cache_gb: float = Field(default=4, ge=0)
     """The size (in GiB) of the multi-modal processor cache, which is used to
     avoid re-processing past multi-modal inputs.
 
@@ -51,7 +108,7 @@ class MultiModalConfig:
     mm_processor_cache_type: MMCacheType = "lru"
     """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
     use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-    mm_shm_cache_max_object_size_mb: int = 128
+    mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
     """Size limit (in MiB) for each object stored in the multi-modal processor
     shared memory cache. Only effective when `mm_processor_cache_type` is
     `"shm"`."""
@@ -68,6 +125,10 @@ class MultiModalConfig:
         DP (which is controlled by `--data-parallel-size`).
         This is only supported on a per-model basis and falls back to
         `"weights"` if the encoder does not support DP."""
+    mm_encoder_attn_backend: AttentionBackendEnum | None = None
+    """Optional override for the multi-modal encoder attention backend when
+    using vision transformers. Accepts any value from
+    `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
     interleave_mm_strings: bool = False
     """Enable fully interleaved support for multimodal prompts, while using
     --chat-template-content-format=string."""
@@ -78,12 +139,60 @@ class MultiModalConfig:
     This reduces engine startup time but shifts the responsibility to users for
     estimating the peak memory usage of the activation of multimodal encoder and
     embedding cache."""
-    video_pruning_rate: Optional[float] = None
+    video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
     """Sets pruning rate for video pruning via Efficient Video Sampling.
     Value sits in range [0;1) and determines fraction of media tokens
     from each video to be pruned.
     """
 
+    @field_validator("limit_per_prompt", mode="before")
+    @classmethod
+    def _validate_limit_per_prompt(
+        cls, value: dict[str, int | dict[str, int]]
+    ) -> dict[str, DummyOptions]:
+        for k, v in value.items():
+            # Handle legacy format where only count is specified
+            if isinstance(v, int):
+                v = {"count": v}
+            # Convert to the appropriate DummyOptions subclass
+            if k == "video":
+                value[k] = VideoDummyOptions(**v)
+            elif k == "image":
+                value[k] = ImageDummyOptions(**v)
+            elif k == "audio":
+                value[k] = AudioDummyOptions(**v)
+            else:
+                value[k] = BaseDummyOptions(**v)
+        return value
+
+    @field_validator("mm_encoder_attn_backend", mode="before")
+    @classmethod
+    def _validate_mm_encoder_attn_backend(
+        cls, value: str | AttentionBackendEnum | None
+    ) -> AttentionBackendEnum | None:
+        # We need to import the real type here (deferred to avoid circular import).
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
+        if value is None or isinstance(value, AttentionBackendEnum):
+            return value
+
+        assert isinstance(value, str), (
+            "mm_encoder_attn_backend must be a string or an AttentionBackendEnum."
+        )
+        return AttentionBackendEnum[value.upper()]
+
+    @model_validator(mode="after")
+    def _validate_multimodal_config(self):
+        if self.mm_processor_cache_type != "shm" and (
+            self.mm_shm_cache_max_object_size_mb
+            != MultiModalConfig.mm_shm_cache_max_object_size_mb
+        ):
+            raise ValueError(
+                "'mm_shm_cache_max_object_size_mb' should only be set when "
+                "'mm_processor_cache_type' is 'shm'."
+            )
+        return self
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -96,22 +205,33 @@ class MultiModalConfig:
         excluding anything before input ids/embeddings and after
         the final hidden states.
         """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        factors: list[Any] = [
+            self.mm_encoder_attn_backend.name
+            if self.mm_encoder_attn_backend is not None
+            else None
+        ]
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
     def get_limit_per_prompt(self, modality: str) -> int:
         """
         Get the maximum number of input items allowed per prompt
-        for the given modality.
+        for the given modality (backward compatible).
         """
-        return self.limit_per_prompt.get(
-            modality,
-            999 if envs.VLLM_USE_V1 else 1,
-        )
+        limit_data = self.limit_per_prompt.get(modality)
+
+        if limit_data is None:
+            # Unspecified modality is set to 999 by default
+            return 999
+        return limit_data.count
+
+    def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
+        """
+        Get the configurable dummy data options for a modality.
+        Returns None if no options are configured for this modality.
+        """
+        # All values are now DummyOptions after normalization
+        return self.limit_per_prompt.get(modality)
 
     def merge_mm_processor_kwargs(
         self,
@@ -125,5 +245,4 @@ class MultiModalConfig:
         return kwargs | dict(inference_kwargs)
 
     def is_multimodal_pruning_enabled(self):
-        return (self.video_pruning_rate is not None
-                and self.video_pruning_rate > 0)
+        return self.video_pruning_rate is not None and self.video_pruning_rate > 0
diff --git a/vllm/config/observability.py b/vllm/config/observability.py
index 766d03051e21230c2b7a8a19177b0eee188cc35a..564c4f7aed41999f241261afdc8bc26928248149 100644
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -3,8 +3,10 @@
 
 import hashlib
 from functools import cached_property
-from typing import Any, Literal, Optional, cast
+from typing import Any, Literal, cast
 
+from packaging.version import parse
+from pydantic import field_validator, model_validator
 from pydantic.dataclasses import dataclass
 
 from vllm import version
@@ -18,7 +20,7 @@ DetailedTraceModules = Literal["model", "worker", "all"]
 class ObservabilityConfig:
     """Configuration for observability - metrics and tracing."""
 
-    show_hidden_metrics_for_version: Optional[str] = None
+    show_hidden_metrics_for_version: str | None = None
     """Enable deprecated Prometheus metrics that have been hidden since the
     specified version. For example, if a previously deprecated metric has been
     hidden since the v0.7.0 release, you use
@@ -31,13 +33,12 @@ class ObservabilityConfig:
         """Check if the hidden metrics should be shown."""
         if self.show_hidden_metrics_for_version is None:
             return False
-        return version._prev_minor_version_was(
-            self.show_hidden_metrics_for_version)
+        return version._prev_minor_version_was(self.show_hidden_metrics_for_version)
 
-    otlp_traces_endpoint: Optional[str] = None
+    otlp_traces_endpoint: str | None = None
     """Target URL to which OpenTelemetry traces will be sent."""
 
-    collect_detailed_traces: Optional[list[DetailedTraceModules]] = None
+    collect_detailed_traces: list[DetailedTraceModules] | None = None
     """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
     set, it will collect detailed traces for the specified modules. This
     involves use of possibly costly and or blocking operations and hence might
@@ -49,16 +50,18 @@ class ObservabilityConfig:
     @cached_property
     def collect_model_forward_time(self) -> bool:
         """Whether to collect model forward time for the request."""
-        return (self.collect_detailed_traces is not None
-                and ("model" in self.collect_detailed_traces
-                     or "all" in self.collect_detailed_traces))
+        return self.collect_detailed_traces is not None and (
+            "model" in self.collect_detailed_traces
+            or "all" in self.collect_detailed_traces
+        )
 
     @cached_property
     def collect_model_execute_time(self) -> bool:
         """Whether to collect model execute time for the request."""
-        return (self.collect_detailed_traces is not None
-                and ("worker" in self.collect_detailed_traces
-                     or "all" in self.collect_detailed_traces))
+        return self.collect_detailed_traces is not None and (
+            "worker" in self.collect_detailed_traces
+            or "all" in self.collect_detailed_traces
+        )
 
     def compute_hash(self) -> str:
         """
@@ -75,25 +78,46 @@ class ObservabilityConfig:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self):
-        if (self.collect_detailed_traces is not None
-                and len(self.collect_detailed_traces) == 1
-                and "," in self.collect_detailed_traces[0]):
-            self._parse_collect_detailed_traces()
-
-        from vllm.tracing import is_otel_available, otel_import_error_traceback
-        if not is_otel_available() and self.otlp_traces_endpoint is not None:
+    @field_validator("show_hidden_metrics_for_version")
+    @classmethod
+    def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
+        if value is not None:
+            # Raises an exception if the string is not a valid version.
+            parse(value)
+        return value
+
+    @field_validator("otlp_traces_endpoint")
+    @classmethod
+    def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
+        if value is not None:
+            from vllm.tracing import is_otel_available, otel_import_error_traceback
+
+            if not is_otel_available():
+                raise ValueError(
+                    "OpenTelemetry is not available. Unable to configure "
+                    "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
+                    f"installed. Original error:\n{otel_import_error_traceback}"
+                )
+        return value
+
+    @field_validator("collect_detailed_traces")
+    @classmethod
+    def _validate_collect_detailed_traces(
+        cls, value: list[DetailedTraceModules] | None
+    ) -> list[DetailedTraceModules] | None:
+        """Handle the legacy case where users might provide a comma-separated
+        string instead of a list of strings."""
+        if value is not None and len(value) == 1 and "," in value[0]:
+            value = cast(list[DetailedTraceModules], value[0].split(","))
+        return value
+
+    @model_validator(mode="after")
+    def _validate_tracing_config(self):
+        if self.collect_detailed_traces and not self.otlp_traces_endpoint:
             raise ValueError(
-                "OpenTelemetry is not available. Unable to configure "
-                "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
-                f"installed. Original error:\n{otel_import_error_traceback}")
-
-    def _parse_collect_detailed_traces(self):
-        assert isinstance(self.collect_detailed_traces, list)
-        self.collect_detailed_traces = cast(
-            list[DetailedTraceModules],
-            self.collect_detailed_traces[0].split(","))
+                "collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
+            )
+        return self
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 8b980458ddaff2787f4122e87e73c44a5d8b5074..9a6326d62e82eb1a8f2f15a1468024c1ce6509cc 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -3,11 +3,10 @@
 
 import hashlib
 import os
-from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
-from pydantic import model_validator
+from pydantic import Field, model_validator
 from pydantic.dataclasses import dataclass
 from torch.distributed import ProcessGroup, ReduceOp
 from typing_extensions import Self
@@ -15,23 +14,28 @@ from typing_extensions import Self
 import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.platforms import current_platform
-from vllm.utils import cuda_device_count_stateless, get_open_ports_list
+from vllm.utils.network_utils import get_open_ports_list
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 if TYPE_CHECKING:
     from ray.runtime_env import RuntimeEnv
     from ray.util.placement_group import PlacementGroup
 
-    from vllm.executor.executor_base import ExecutorBase
+    from vllm.v1.executor import Executor
 else:
     RuntimeEnv = Any
     PlacementGroup = Any
-    ExecutorBase = Any
+    Executor = Any
 
 logger = init_logger(__name__)
 
 ExpertPlacementStrategy = Literal["linear", "round_robin"]
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
+DataParallelBackend = Literal["ray", "mp"]
 
 
 @config
@@ -49,7 +53,7 @@ class EPLBConfig:
     of the last `lb_window_size` steps will be used for rearranging experts.
     """
 
-    num_redundant_experts: int = 0
+    num_redundant_experts: int = Field(default=0, ge=0)
     """Number of redundant experts to use for expert parallelism."""
 
     log_balancedness: bool = False
@@ -75,7 +79,7 @@ class ParallelConfig:
     """Number of local data parallel groups."""
     data_parallel_rank: int = 0
     """Rank of the data parallel group."""
-    data_parallel_rank_local: Optional[int] = None
+    data_parallel_rank_local: int | None = None
     """Local rank of the data parallel group,
     set only in SPMD mode."""
     data_parallel_master_ip: str = "127.0.0.1"
@@ -84,7 +88,7 @@ class ParallelConfig:
     """Port for data parallel messaging."""
     data_parallel_master_port: int = 29500
     """Port of the data parallel master."""
-    data_parallel_backend: str = "mp"
+    data_parallel_backend: DataParallelBackend = "mp"
     """Backend to use for data parallel, either "mp" or "ray"."""
     data_parallel_external_lb: bool = False
     """Whether to use "external" DP LB mode. Applies only to online serving
@@ -102,7 +106,7 @@ class ParallelConfig:
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     enable_eplb: bool = False
     """Enable expert parallelism load balancing for MoE layers."""
-    eplb_config: EPLBConfig = field(default_factory=EPLBConfig)
+    eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
     """Expert parallelism configuration."""
     expert_placement_strategy: ExpertPlacementStrategy = "linear"
     """The expert placement strategy for MoE layers:\n
@@ -113,24 +117,43 @@ class ParallelConfig:
       with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
       will have experts [1, 3]. This strategy can help improve load balancing
       for grouped expert models with no redundant experts."""
-    num_redundant_experts: Optional[int] = None
+    all2all_backend: (
+        Literal[
+            "naive",
+            "pplx",
+            "deepep_high_throughput",
+            "deepep_low_latency",
+            "allgather_reducescatter",
+            "flashinfer_all2allv",
+        ]
+        | None
+    ) = None
+    """All2All backend for MoE expert parallel communication. If not set, uses
+    the value from VLLM_ALL2ALL_BACKEND environment variable. Available options:
+    - "naive": Naive all2all implementation using broadcasts
+    - "allgather_reducescatter": All2all based on allgather and reducescatter
+    - "pplx": Use pplx kernels
+    - "deepep_high_throughput": Use deepep high-throughput kernels
+    - "deepep_low_latency": Use deepep low-latency kernels
+    - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
+    num_redundant_experts: int | None = None
     """`num_redundant_experts` is deprecated and has been replaced with
     `eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
     Please use `eplb_config.num_redundant_experts` instead."""
-    eplb_window_size: Optional[int] = None
+    eplb_window_size: int | None = None
     """`eplb_window_size` is deprecated and has been replaced with
     `eplb_config.window_size`. This will be removed in v0.12.0.
     Please use `eplb_config.window_size` instead."""
-    eplb_step_interval: Optional[int] = None
+    eplb_step_interval: int | None = None
     """`eplb_step_interval` is deprecated and has been replaced with
     `eplb_config.step_interval`. This will be removed in v0.12.0.
     Please use `eplb_config.step_interval` instead."""
-    eplb_log_balancedness: Optional[bool] = None
+    eplb_log_balancedness: bool | None = None
     """`eplb_log_balancedness` is deprecated and has been replaced with
     `eplb_config.log_balancedness`. This will be removed in v0.12.0.
     Please use `eplb_config.log_balancedness` instead."""
 
-    max_parallel_loading_workers: Optional[int] = None
+    max_parallel_loading_workers: int | None = None
     """Maximum number of parallel loading workers when loading model
     sequentially in multiple batches. To avoid RAM OOM when using tensor
     parallel and large models."""
@@ -152,18 +175,22 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""
 
+    disable_nccl_for_dp_synchronization: bool = False
+    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py 
+    to use Gloo instead of NCCL for its all reduce"""
+
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
 
-    ray_runtime_env: Optional[RuntimeEnv] = None
+    ray_runtime_env: RuntimeEnv | None = None
     """Ray runtime environment to pass to distributed workers."""
 
-    placement_group: Optional[PlacementGroup] = None
+    placement_group: PlacementGroup | None = None
     """ray distributed model workers placement group."""
 
-    distributed_executor_backend: Optional[Union[str,
-                                                 DistributedExecutorBackend,
-                                                 type[ExecutorBase]]] = None
+    distributed_executor_backend: (
+        str | DistributedExecutorBackend | type[Executor] | None
+    ) = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
     of pipeline_parallel_size and tensor_parallel_size is less than
@@ -183,14 +210,26 @@ class ParallelConfig:
     class is dynamically inherited by the worker class. This is used to inject
     new attributes and methods to the worker class for use in collective_rpc
     calls."""
-
-    world_size: int = field(init=False)
+    master_addr: str = "127.0.0.1"
+    """distributed master address for multi-node distributed 
+    inference when distributed_executor_backend is mp."""
+    master_port: int = 29501
+    """distributed master port for multi-node distributed 
+    inference when distributed_executor_backend is mp."""
+    node_rank: int = 0
+    """distributed node rank for multi-node distributed 
+    inference when distributed_executor_backend is mp."""
+    nnodes: int = 1
+    """num of nodes for multi-node distributed 
+    inference when distributed_executor_backend is mp."""
+
+    world_size: int = Field(init=False)
     """world_size is TPxPP, it affects the number of workers we create."""
 
     rank: int = 0
     """Global rank in distributed setup."""
 
-    _data_parallel_master_port_list: list[int] = field(default_factory=list)
+    _data_parallel_master_port_list: list[int] = Field(default_factory=list)
     """List of open port auto-queried for data parallel messaging.
     Set to be private as it's not intended to be configured by users.
     """
@@ -200,7 +239,18 @@ class ParallelConfig:
     not change by dcp, it simply reuse the GPUs of TP group, and tp_size
     needs to be divisible by dcp_size."""
 
-    _api_process_count: int = 1
+    dcp_kv_cache_interleave_size: int = 1
+    """Interleave size of kv_cache storage while using dcp or cp > 1,
+    store interleave_size tokens on (d)cp i,
+    then store next interleave_size tokens on (d)cp i+1.
+    Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
+    Interleave_size=block_size: block-level align, first fill the block on first rank,
+    token is stored on rank i+1 block j after rank i block j is full.
+    Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
+    Block_size should be divisible by dcp_kv_cache_interleave_size.
+    """
+
+    _api_process_count: int = Field(default=1, gt=0)
     """
     The number of API processes initialized.
 
@@ -209,7 +259,7 @@ class ParallelConfig:
         should only be set by API server scale-out.
     """
 
-    _api_process_rank: int = 0
+    _api_process_rank: int = Field(default=0, ge=-1)
     """
     The rank of this API process, or `-1` for engine core processes
     under API server scale-out.
@@ -219,6 +269,51 @@ class ParallelConfig:
         should only be set by API server scale-out.
     """
 
+    @model_validator(mode="after")
+    def _validate_parallel_config(self) -> Self:
+        if self._api_process_rank >= self._api_process_count:
+            raise ValueError(
+                "Invalid value of `_api_process_rank`. "
+                f"Expected to be `-1` or `[0, {self._api_process_count})`, "
+                f"but found: {self._api_process_rank}"
+            )
+
+        if self.data_parallel_size_local > self.data_parallel_size:
+            raise ValueError(
+                f"data_parallel_size_local ({self.data_parallel_size_local}) "
+                f"must be <= data_parallel_size ({self.data_parallel_size})"
+            )
+
+        if self.data_parallel_size <= 1 and self.data_parallel_external_lb:
+            raise ValueError(
+                "data_parallel_external_lb can only be set when data_parallel_size > 1"
+            )
+
+        if self.enable_eplb:
+            if not current_platform.is_cuda_alike():
+                raise ValueError(
+                    "Expert parallelism load balancing is only supported on "
+                    "CUDA devices or ROCm devices now."
+                )
+            if not self.enable_expert_parallel:
+                raise ValueError("enable_expert_parallel must be True to use EPLB.")
+            if self.tensor_parallel_size * self.data_parallel_size <= 1:
+                raise ValueError(
+                    "EPLB requires tensor_parallel_size or data_parallel_size "
+                    f"to be greater than 1, but got "
+                    f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
+                )
+        else:
+            if self.eplb_config.num_redundant_experts != 0:
+                raise ValueError(
+                    "num_redundant_experts is set to "
+                    f"{self.eplb_config.num_redundant_experts} but EPLB is not "
+                    "enabled. Either enable EPLB or unset "
+                    "num_redundant_experts."
+                )
+
+        return self
+
     @property
     def world_size_across_dp(self) -> int:
         """world_size_across_dp is TPxPPxDP, it is the size of the world
@@ -253,10 +348,11 @@ class ParallelConfig:
         from torch.distributed import DistNetworkError
 
         from vllm.distributed.utils import (
-            stateless_init_torch_distributed_process_group)
+            stateless_init_torch_distributed_process_group,
+        )
 
         max_retries = 5
-        last_exc: Optional[Exception] = None
+        last_exc: Exception | None = None
         for _ in range(max_retries):
             try:
                 # use gloo since the engine process might not have cuda device
@@ -265,12 +361,12 @@ class ParallelConfig:
                     self.get_next_dp_init_port(),
                     self.data_parallel_rank,
                     self.data_parallel_size,
-                    backend="gloo")
+                    backend=current_platform.dist_backend,
+                )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
                 if "EADDRINUSE" in str(e):
-                    logger.warning(
-                        "Address already in use. Retrying with a new port.")
+                    logger.warning("Address already in use. Retrying with a new port.")
                     last_exc = e
                     continue  # try again with a new port
                 raise e
@@ -290,19 +386,39 @@ class ParallelConfig:
     # Not needed for pplx-kernels as it can handle duplicate input tokens.
     @property
     def use_sequence_parallel_moe(self) -> bool:
-        return (envs.VLLM_ALL2ALL_BACKEND
-                in ("allgather_reducescatter", "naive",
-                    "deepep_high_throughput", "deepep_low_latency")
-                and self.enable_expert_parallel
-                and self.tensor_parallel_size > 1
-                and self.data_parallel_size > 1)
+        return (
+            self.all2all_backend
+            in (
+                "allgather_reducescatter",
+                "naive",
+                "deepep_high_throughput",
+                "deepep_low_latency",
+            )
+            and self.enable_expert_parallel
+            and self.tensor_parallel_size > 1
+            and self.data_parallel_size > 1
+        )
+
+    @property
+    def node_rank_within_dp(self) -> int:
+        return self.node_rank % self.nnodes_within_dp
+
+    @property
+    def nnodes_within_dp(self) -> int:
+        if self.nnodes == 1:
+            return 1
+        data_parallel_node_size = (
+            self.data_parallel_size // self.data_parallel_size_local
+        )
+        return self.nnodes // data_parallel_node_size
+
+    @property
+    def local_world_size(self) -> int:
+        return self.world_size // self.nnodes_within_dp
 
     @staticmethod
-    def has_unfinished_dp(dp_group: ProcessGroup,
-                          has_unfinished: bool) -> bool:
-        tensor = torch.tensor([has_unfinished],
-                              dtype=torch.int32,
-                              device="cpu")
+    def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
+        tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
         # dp rank 0: has_unfinished_seqs=True
         # dp rank 1: has_unfinished_seqs=False
         # aggregated: has_unfinished_seqs=True
@@ -312,13 +428,10 @@ class ParallelConfig:
         return aggregated_has_unfinished
 
     @staticmethod
-    def sync_kv_cache_memory_size(dp_group: ProcessGroup,
-                                  kv_cache_memory: int) -> int:
+    def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
         if kv_cache_memory == -1:
             kv_cache_memory = torch.iinfo(torch.int64).max
-        tensor = torch.tensor([kv_cache_memory],
-                              dtype=torch.int64,
-                              device="cpu")
+        tensor = torch.tensor([kv_cache_memory], dtype=torch.int64, device="cpu")
         # we cannot use broadcast for stateless dp group since it depends
         # on global rank
         torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
@@ -331,78 +444,97 @@ class ParallelConfig:
         graph from input ids/embeddings to the final hidden states,
         excluding anything before input ids/embeddings and after
         the final hidden states.
+
+        This hash is also used for DP worker configuration validation
+        to prevent hangs from mismatched collective communication patterns.
         """
         factors: list[Any] = []
         factors.append(self.pipeline_parallel_size)
         factors.append(self.tensor_parallel_size)
         factors.append(self.enable_expert_parallel)
         factors.append(self.data_parallel_size)
-        factors.append(envs.VLLM_ALL2ALL_BACKEND)
+        factors.append(self.all2all_backend)
+        factors.append(self.enable_eplb)
+        if self.enable_eplb:
+            factors.append(self.eplb_config.log_balancedness)
+            factors.append(self.eplb_config.window_size)
+            factors.append(self.eplb_config.step_interval)
+            factors.append(self.eplb_config.num_redundant_experts)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __post_init__(self) -> None:
+        # Set all2all_backend from env var if not specified, with deprecation warning
+        if self.all2all_backend is None:
+            self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
+            if envs.is_set("VLLM_ALL2ALL_BACKEND"):
+                logger.warning_once(
+                    "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
+                    "will be removed in a future release. Please use the "
+                    "--all2all-backend command-line argument instead."
+                )
+
         # Forward deprecated fields to their new location
         if self.num_redundant_experts is not None:
-            self.eplb_config.num_redundant_experts = (
-                self.num_redundant_experts)
+            self.eplb_config.num_redundant_experts = self.num_redundant_experts
             logger.warning_once(
                 "num_redundant_experts is deprecated and has been replaced "
                 "with eplb_config.num_redundant_experts. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_window_size is not None:
             self.eplb_config.window_size = self.eplb_window_size
             logger.warning_once(
                 "eplb_window_size is deprecated and has been replaced "
                 "with eplb_config.window_size. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_step_interval is not None:
             self.eplb_config.step_interval = self.eplb_step_interval
             logger.warning_once(
                 "eplb_step_interval is deprecated and has been replaced "
                 "with eplb_config.step_interval. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_log_balancedness is not None:
             self.eplb_config.log_balancedness = self.eplb_log_balancedness
             logger.warning_once(
                 "eplb_log_balancedness is deprecated and has been replaced "
                 "with eplb_config.log_balancedness. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
 
         # Continue with the rest of the initialization
-        self.world_size = self.pipeline_parallel_size * \
-            self.tensor_parallel_size
+        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
 
         if self.distributed_executor_backend == "external_launcher":
             logger.info("Using external launcher for distributed inference.")
             self.world_size *= self.data_parallel_size
 
-        if self.data_parallel_size_local > self.data_parallel_size:
-            raise ValueError(
-                f"data_parallel_size_local ({self.data_parallel_size_local}) "
-                f"must be <= data_parallel_size ({self.data_parallel_size})")
-
         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
             if self.distributed_executor_backend == "external_launcher":
                 # For external launcher,
                 # we need to set the data parallel rank automatically
-                self.data_parallel_rank = int(os.environ["RANK"]) \
-                    // (self.world_size // self.data_parallel_size)
-                logger.info("Set data_parallel_rank to %d automatically.",
-                            self.data_parallel_rank)
+                self.data_parallel_rank = int(os.environ["RANK"]) // (
+                    self.world_size // self.data_parallel_size
+                )
+                logger.info(
+                    "Set data_parallel_rank to %d automatically.",
+                    self.data_parallel_rank,
+                )
             if not self._data_parallel_master_port_list:
                 self._data_parallel_master_port_list = get_open_ports_list(5)
-            self.data_parallel_master_port = \
-                self._data_parallel_master_port_list.pop()
+            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
 
             if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                 raise ValueError(
                     f"data_parallel_rank ({self.data_parallel_rank})"
-                    f" must be in the range [0, {self.data_parallel_size})")
+                    f" must be in the range [0, {self.data_parallel_size})"
+                )
         else:
             # Otherwise fall back to env vars (e.g. for offline SPMD case).
             self.data_parallel_size = envs.VLLM_DP_SIZE
@@ -411,114 +543,113 @@ class ParallelConfig:
             self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
             self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
 
-            if self.data_parallel_external_lb:
-                raise ValueError("data_parallel_external_lb can only "
-                                 "be set when data_parallel_size > 1")
-
         if self.distributed_executor_backend == "external_launcher":
             os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
             logger.info("Disabling V1 multiprocessing for external launcher.")
 
-        if self.enable_eplb:
-            if not current_platform.is_cuda():
-                raise ValueError(
-                    "Expert parallelism load balancing is only supported on "
-                    "CUDA devices now.")
-            if self.eplb_config.num_redundant_experts < 0:
-                raise ValueError(
-                    "num_redundant_experts must be non-negative, but got "
-                    f"{self.eplb_config.num_redundant_experts}.")
-            if not self.enable_expert_parallel:
-                raise ValueError(
-                    "enable_expert_parallel must be True to use EPLB.")
-            if self.tensor_parallel_size * self.data_parallel_size <= 1:
-                raise ValueError(
-                    "EPLB requires tensor_parallel_size or data_parallel_size "
-                    f"to be greater than 1, but got "
-                    f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
-                )
-        else:
-            if self.eplb_config.num_redundant_experts != 0:
-                raise ValueError(
-                    "num_redundant_experts is set to "
-                    f"{self.eplb_config.num_redundant_experts} but EPLB is not "
-                    "enabled. Either enable EPLB or unset "
-                    "num_redundant_experts.")
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
 
-            from vllm.executor import ray_utils
+            from vllm.v1.executor import ray_utils
+
             backend: DistributedExecutorBackend = "mp"
             ray_found = ray_utils.ray_is_available()
             if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                 backend = "uni"
-            elif (current_platform.is_cuda()
-                  and cuda_device_count_stateless() < self.world_size):
-                if not ray_found:
-                    raise ValueError("Unable to load Ray: "
-                                     f"{ray_utils.ray_import_err}. Ray is "
-                                     "required for multi-node inference, "
-                                     "please install Ray with `pip install "
-                                     "ray`.")
-                backend = "ray"
+            elif current_platform.is_cuda() and self.nnodes > 1:
+                backend = "mp"
+            elif (
+                current_platform.is_cuda()
+                and cuda_device_count_stateless() < self.world_size
+            ):
+                gpu_count = cuda_device_count_stateless()
+                raise ValueError(
+                    f"Tensor parallel size ({self.world_size}) cannot be "
+                    f"larger than the number of available GPUs ({gpu_count})."
+                )
             elif self.data_parallel_backend == "ray":
-                logger.info("Using ray distributed inference because "
-                            "data_parallel_backend is ray")
+                logger.info(
+                    "Using ray distributed inference because "
+                    "data_parallel_backend is ray"
+                )
                 backend = "ray"
             elif ray_found:
                 if self.placement_group:
                     backend = "ray"
                 else:
                     from ray import is_initialized as ray_is_initialized
+
                     if ray_is_initialized():
                         from ray.util import get_current_placement_group
+
                         if get_current_placement_group():
                             backend = "ray"
             self.distributed_executor_backend = backend
-            logger.debug("Defaulting to use %s for distributed inference",
-                         backend)
+            logger.debug("Defaulting to use %s for distributed inference", backend)
 
         if self.distributed_executor_backend is None and self.world_size == 1:
             self.distributed_executor_backend = "uni"
 
-        if not -1 <= self._api_process_rank < self._api_process_count:
+        if self.max_parallel_loading_workers is not None:
+            logger.warning(
+                "max_parallel_loading_workers is currently "
+                "not supported and will be ignored."
+            )
+        if self.distributed_executor_backend != "mp" and self.nnodes > 1:
             raise ValueError(
-                "Invalid value of `_api_process_rank`. "
-                f"Expected to be `-1` or `[0, {self._api_process_count})`, "
-                f"but found: {self._api_process_rank}")
+                "nnodes > 1 can only be set when distributed exectuor backend is mp."
+            )
 
     @property
     def use_ray(self) -> bool:
         return self.distributed_executor_backend == "ray" or (
             isinstance(self.distributed_executor_backend, type)
-            and getattr(self.distributed_executor_backend, "uses_ray", False))
+            and getattr(self.distributed_executor_backend, "uses_ray", False)
+        )
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def _verify_args(self) -> Self:
         # Lazy import to avoid circular import
-        from vllm.executor.executor_base import ExecutorBase
-        from vllm.platforms import current_platform
-        if self.distributed_executor_backend is not None and not isinstance(
-                self.distributed_executor_backend, str) and not (isinstance(
-                    self.distributed_executor_backend, type) and issubclass(
-                        self.distributed_executor_backend, ExecutorBase)):
+        from vllm.v1.executor import Executor
+
+        # Enable batch invariance settings if requested
+        if vllm_is_batch_invariant():
+            self.disable_custom_all_reduce = True
+
+        if (
+            self.distributed_executor_backend is not None
+            and not isinstance(self.distributed_executor_backend, str)
+            and not (
+                isinstance(self.distributed_executor_backend, type)
+                and issubclass(self.distributed_executor_backend, Executor)
+            )
+        ):
             raise ValueError(
                 "Unrecognized distributed executor backend "
                 f"{self.distributed_executor_backend}. Supported "
                 "values are 'ray', 'mp' 'uni', 'external_launcher', "
-                " custom ExecutorBase subclass or its import path.")
+                " custom Executor subclass or its import path."
+            )
         if self.use_ray:
-            from vllm.executor import ray_utils
+            from vllm.v1.executor import ray_utils
+
             ray_utils.assert_ray_available()
 
         if not current_platform.use_custom_allreduce():
             self.disable_custom_all_reduce = True
             logger.debug(
                 "Disabled the custom all-reduce kernel because it is not "
-                "supported on current platform.")
+                "supported on current platform."
+            )
+        if self.nnodes > 1:
+            self.disable_custom_all_reduce = True
+            logger.debug(
+                "Disabled the custom all-reduce since we are running on multi-node."
+            )
         if self.ray_workers_use_nsight and not self.use_ray:
-            raise ValueError("Unable to use nsight profiling unless workers "
-                             "run with Ray.")
+            raise ValueError(
+                "Unable to use nsight profiling unless workers run with Ray."
+            )
 
         return self
diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 85b5a1ace85f6009858e13620de1b95e82833b46..6bece8d0785bdd0f6800ebc44f4398d0137101de 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -2,11 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from typing import Any, Optional
+from typing import Any
 
 from pydantic.dataclasses import dataclass
 
 from vllm.config.utils import config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 @config
@@ -14,23 +17,23 @@ from vllm.config.utils import config
 class PoolerConfig:
     """Controls the behavior of output pooling in pooling models."""
 
-    pooling_type: Optional[str] = None
+    pooling_type: str | None = None
     """
     The pooling method of the pooling model. This should be a key in
     [`vllm.model_executor.layers.pooler.PoolingType`][].
     """
 
     ## for embeddings models
-    normalize: Optional[bool] = None
+    normalize: bool | None = None
     """
     Whether to normalize the embeddings outputs. Defaults to True.
     """
-    dimensions: Optional[int] = None
+    dimensions: int | None = None
     """
     Reduce the dimensions of embeddings if model
     support matryoshka representation. Defaults to None.
     """
-    enable_chunked_processing: Optional[bool] = None
+    enable_chunked_processing: bool | None = None
     """
     Whether to enable chunked processing for long inputs that exceed the model's
     maximum position embeddings. When enabled, long inputs will be split into
@@ -38,7 +41,7 @@ class PoolerConfig:
     This allows embedding models to handle arbitrarily long text without CUDA
     errors. Defaults to False.
     """
-    max_embed_len: Optional[int] = None
+    max_embed_len: int | None = None
     """
     Maximum input length allowed for embedding generation. When set, allows
     inputs longer than max_embed_len to be accepted for embedding models.
@@ -48,35 +51,42 @@ class PoolerConfig:
     """
 
     ## for classification models
-    activation: Optional[bool] = None
+    softmax: float | None = None
+    """
+    softmax will be deprecated, please use use_activation instead.
+    """
+    activation: float | None = None
+    """
+    activation will be deprecated, please use use_activation instead.
+    """
+    use_activation: bool | None = None
     """
     Whether to apply activation function to the classification outputs.
     Defaults to True.
     """
-    logit_bias: Optional[float] = None
+    logit_bias: float | None = None
     """
     If provided, apply classification logit biases. Defaults to None.
     """
 
     ## for reward models
-    softmax: Optional[bool] = None
-    """
-    Whether to apply softmax to the reward outputs.
-    Defaults to True.
+    step_tag_id: int | None = None
     """
-    step_tag_id: Optional[int] = None
-    """
-    If set, only the score corresponding to the ``step_tag_id`` in the
+    If set, only the score corresponding to the `step_tag_id` in the
     generated sentence should be returned. Otherwise, the scores for all tokens
     are returned.
     """
-    returned_token_ids: Optional[list[int]] = None
+    returned_token_ids: list[int] | None = None
     """
     A list of indices for the vocabulary dimensions to be extracted,
-    such as the token IDs of ``good_token`` and ``bad_token`` in the
-    ``math-shepherd-mistral-7b-prm`` model.
+    such as the token IDs of `good_token` and `bad_token` in the
+    `math-shepherd-mistral-7b-prm` model.
     """
 
+    def __post_init__(self):
+        # raise deprecated warning for softmax and activation
+        self.use_activation = get_use_activation(self)
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -92,6 +102,21 @@ class PoolerConfig:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
+
+
+def get_use_activation(o: object):
+    if softmax := getattr(o, "softmax", None) is not None:
+        logger.warning_once(
+            "softmax will be deprecated, please use use_activation instead."
+        )
+        return softmax
+
+    if activation := getattr(o, "activation", None) is not None:
+        logger.warning_once(
+            "activation will be deprecated, please use use_activation instead."
+        )
+        return activation
+
+    return getattr(o, "use_activation", None)
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index daf094d2df5c890923ddc4675030efce9632334d..8194295ffedb606596a1de22e956b857d4494732 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -2,18 +2,20 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from dataclasses import field
-from typing import Any, Literal, Union
+from collections.abc import Callable
+from dataclasses import InitVar
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
 
-from pydantic import SkipValidation, model_validator
+from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
-from typing_extensions import Self
+from typing_extensions import Self, deprecated
 
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
-                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS)
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.interface import SchedulerInterface
 
 logger = init_logger(__name__)
 
@@ -26,31 +28,31 @@ SchedulerPolicy = Literal["fcfs", "priority"]
 class SchedulerConfig:
     """Scheduler configuration."""
 
+    DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
+    DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128
+
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
+    max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1)
     """Maximum number of tokens to be processed in a single iteration.
 
-    This config has no static default. If left unspecified by the user, it will
-    be set in `EngineArgs.create_engine_config` based on the usage context."""
+    The default value here is mainly for convenience when testing.
+    In real usage, this should be set in `EngineArgs.create_engine_config`.
+    """
 
-    max_num_seqs: SkipValidation[int] = None  # type: ignore
+    max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1)
     """Maximum number of sequences to be processed in a single iteration.
 
-    This config has no static default. If left unspecified by the user, it will
-    be set in `EngineArgs.create_engine_config` based on the usage context."""
-
-    max_model_len: SkipValidation[int] = None  # type: ignore
-    """Maximum length of a sequence (including prompt and generated text). This
-    is primarily set in `ModelConfig` and that value should be manually
-    duplicated here."""
+    The default value here is mainly for convenience when testing.
+    In real usage, this should be set in `EngineArgs.create_engine_config`.
+    """
 
-    max_num_partial_prefills: int = 1
+    max_num_partial_prefills: int = Field(default=1, ge=1)
     """For chunked prefill, the maximum number of sequences that can be
     partially prefilled concurrently."""
 
-    max_long_partial_prefills: int = 1
+    max_long_partial_prefills: int = Field(default=1, ge=1)
     """For chunked prefill, the maximum number of prompts longer than
     long_prefill_token_threshold that will be prefilled concurrently. Setting
     this less than max_num_partial_prefills will allow shorter prompts to jump
@@ -60,7 +62,7 @@ class SchedulerConfig:
     """For chunked prefill, a request is considered long if the prompt is
     longer than this number of tokens."""
 
-    num_lookahead_slots: int = 0
+    num_lookahead_slots: int = Field(default=0, ge=0)
     """The number of slots to allocate per sequence per
     step, beyond the known token ids. This is used in speculative
     decoding to store KV activations of tokens which may or may not be
@@ -69,41 +71,44 @@ class SchedulerConfig:
     NOTE: This will be replaced by speculative config in the future; it is
     present to enable correctness tests until then."""
 
-    cuda_graph_sizes: list[int] = field(default_factory=list)
-    """Cuda graph capture sizes
-    1. if none provided, then default set to [min(max_num_seqs * 2, 512)]
-    2. if one value is provided, then the capture list would follow the
-    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    3. more than one value (e.g. 1 2 128) is provided, then the capture list
-    will follow the provided list."""
-
-    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
+    enable_chunked_prefill: bool = True
     """If True, prefill requests can be chunked based
-    on the remaining max_num_batched_tokens."""
+    on the remaining `max_num_batched_tokens`.
+
+    The default value here is mainly for convenience when testing.
+    In real usage, this should be set in `EngineArgs.create_engine_config`.
+    """
 
     is_multimodal_model: bool = False
     """True if the model is multimodal."""
 
+    max_model_len: InitVar[int] = 8192
+    """Maximum length of a sequence (including prompt and generated text).
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    provide fallbacks and validate other attributes."""
+
+    is_encoder_decoder: InitVar[bool] = False
+    """True if the model is an encoder-decoder model.
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    disable chunked prefill and prefix caching for encoder-decoder models.
+    """
+
     # TODO (ywang96): Make this configurable.
-    max_num_encoder_input_tokens: int = field(init=False)
+    max_num_encoder_input_tokens: int = Field(init=False)
     """Multimodal encoder compute budget, only used in V1.
 
     NOTE: This is not currently configurable. It will be overridden by
     max_num_batched_tokens in case max multimodal embedding size is larger."""
 
     # TODO (ywang96): Make this configurable.
-    encoder_cache_size: int = field(init=False)
+    encoder_cache_size: int = Field(init=False)
     """Multimodal encoder cache size, only used in V1.
 
     NOTE: This is not currently configurable. It will be overridden by
     max_num_batched_tokens in case max multimodal embedding size is larger."""
 
-    send_delta_data: bool = False
-    """Private API. If used, scheduler sends delta data to
-    workers instead of an entire data. It should be enabled only
-    when SPMD worker architecture is enabled. I.e.,
-    VLLM_USE_RAY_SPMD_WORKER=1"""
-
     policy: SchedulerPolicy = "fcfs"
     """The scheduling policy to use:\n
     - "fcfs" means first come first served, i.e. requests are handled in order
@@ -111,9 +116,6 @@ class SchedulerConfig:
     - "priority" means requests are handled based on given priority (lower
     value means earlier handling) and time of arrival deciding any ties)."""
 
-    chunked_prefill_enabled: bool = field(init=False)
-    """True if chunked prefill is enabled."""
-
     disable_chunked_mm_input: bool = False
     """If set to true and chunked prefill is enabled, we do not want to
     partially schedule a multimodal item. Only used in V1
@@ -122,12 +124,12 @@ class SchedulerConfig:
     some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
     it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
 
-    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
-    # or "mod.custom_class".
-    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
-    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
-    default scheduler. Can be a class directly or the path to a class of form
-    "mod.custom_class"."""
+    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
+    # (default) or "mod.custom_class".
+    scheduler_cls: str | type[object] = Field(default=None)
+    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
+    the default scheduler. Can be a class directly or the path to a class of
+    form "mod.custom_class"."""
 
     disable_hybrid_kv_cache_manager: bool = False
     """If set to True, KV cache manager will allocate the same size of KV cache
@@ -136,12 +138,40 @@ class SchedulerConfig:
     """
 
     async_scheduling: bool = False
-    """EXPERIMENTAL: If set to True, perform async scheduling. This may help
-    reduce the CPU overheads, leading to better latency and throughput. However,
-    async scheduling is currently not supported with some features such as
-    structured outputs, speculative decoding, and pipeline parallelism.
+    """If set to True, perform async scheduling. This helps to avoid gaps in
+    GPU utilization, leading to better latency and throughput.
+    Async scheduling is currently not supported with some features such as
+    speculative decoding and pipeline parallelism.
     """
 
+    stream_interval: int = Field(default=1, ge=1)
+    """The interval (or buffer size) for streaming in terms of token length.
+    A smaller value (1) makes streaming smoother by sending each token immediately,
+    while a larger value (e.g., 10) reduces host overhead and may increase throughput
+    by batching multiple tokens before sending."""
+
+    def get_scheduler_cls(self) -> type["SchedulerInterface"]:
+        if self.scheduler_cls is None:
+            if self.async_scheduling:
+                from vllm.v1.core.sched.async_scheduler import AsyncScheduler
+
+                return AsyncScheduler
+            from vllm.v1.core.sched.scheduler import Scheduler
+
+            return Scheduler
+
+        # This warning can be removed once the Scheduler interface is
+        # finalized and we can maintain support for scheduler classes that
+        # implement it
+        logger.warning_once(
+            "Using custom scheduler class %s. This scheduler interface is "
+            "not public and compatibility may not be maintained.",
+            self.scheduler_cls,
+        )
+        if not isinstance(self.scheduler_cls, str):
+            return cast(type["SchedulerInterface"], self.scheduler_cls)
+        return resolve_obj_by_qualname(self.scheduler_cls)
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -157,46 +187,27 @@ class SchedulerConfig:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self) -> None:
-        if self.max_model_len is None:
-            self.max_model_len = 8192
-
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 128
-
-        if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
-            else:
-                # If max_model_len is too short, use
-                # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
-                # for higher throughput.
-                self.max_num_batched_tokens = max(
-                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-
-            if self.runner_type == "pooling":
-                # Choose specific value for higher throughput
-                self.max_num_batched_tokens = max(
-                    self.max_num_batched_tokens,
-                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
-            if self.is_multimodal_model:
-                # The value needs to be at least the number of multimodal tokens
-                self.max_num_batched_tokens = max(
-                    self.max_num_batched_tokens,
-                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
-
-            # When using default settings,
-            # Ensure max_num_batched_tokens does not exceed model limit.
-            # Some models (e.g., Whisper) have embeddings tied to max length.
-            self.max_num_batched_tokens = min(
-                self.max_num_seqs * self.max_model_len,
-                self.max_num_batched_tokens)
+    @field_validator("scheduler_cls", "async_scheduling", mode="wrap")
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
+    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
+        if is_encoder_decoder:
+            # Chunked prefill should be disabled for encoder-decoder models.
+            self.disable_chunked_mm_input = True
+            self.enable_chunked_prefill = False
+            self.long_prefill_token_threshold = 0
+            logger.info(
+                "Encoder-decoder models do not support chunked prefill nor"
+                " prefix caching; disabling both."
+            )
 
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
@@ -204,84 +215,84 @@ class SchedulerConfig:
         if self.enable_chunked_prefill:
             logger.info(
                 "Chunked prefill is enabled with max_num_batched_tokens=%d.",
-                self.max_num_batched_tokens)
+                self.max_num_batched_tokens,
+            )
 
-        self.chunked_prefill_enabled = self.enable_chunked_prefill
         if self.max_num_partial_prefills > 1:
             if self.long_prefill_token_threshold == 0:
-                self.long_prefill_token_threshold = int(self.max_model_len *
-                                                        0.04)
+                self.long_prefill_token_threshold = int(max_model_len * 0.04)
 
             logger.info(
                 "Concurrent partial prefills enabled with "
                 "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
                 "long_prefill_token_threshold=%d",
-                self.max_num_partial_prefills, self.max_long_partial_prefills,
-                self.long_prefill_token_threshold)
-
-        # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
-        # This avoids OOM in tight memory scenarios with small max_num_seqs,
-        # and prevents capture of many large graphs (>512) that would greatly
-        # increase startup time with limited performance benefit.
-        if not self.cuda_graph_sizes:
-            self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
-
-        if self.async_scheduling:
-            self.scheduler_cls = (
-                "vllm.v1.core.sched.async_scheduler.AsyncScheduler")
-
-    @model_validator(mode='after')
-    def _verify_args(self) -> Self:
-        if (self.max_num_batched_tokens < self.max_model_len
-                and not self.chunked_prefill_enabled):
+                self.max_num_partial_prefills,
+                self.max_long_partial_prefills,
+                self.long_prefill_token_threshold,
+            )
+
+        self.verify_max_model_len(max_model_len)
+
+    @property
+    @deprecated(
+        "`SchedulerConfig.chunked_prefill_enabled` has been renamed to "
+        "`SchedulerConfig.enable_chunked_prefill`. "
+        "The old name will be removed in v0.12."
+    )
+    def chunked_prefill_enabled(self) -> bool:
+        return self.enable_chunked_prefill
+
+    @chunked_prefill_enabled.setter
+    def chunked_prefill_enabled(self, value: bool):
+        self.enable_chunked_prefill = value
+
+    def verify_max_model_len(self, max_model_len: int) -> Self:
+        if (
+            self.max_num_batched_tokens < max_model_len
+            and not self.enable_chunked_prefill
+        ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
-                f"smaller than max_model_len ({self.max_model_len}). "
+                f"smaller than max_model_len ({max_model_len}). "
                 "This effectively limits the maximum sequence length to "
                 "max_num_batched_tokens and makes vLLM reject longer "
                 "sequences. Please increase max_num_batched_tokens or "
-                "decrease max_model_len.")
+                "decrease max_model_len."
+            )
 
         if self.max_num_batched_tokens < self.max_num_seqs:
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
-                f"({self.max_num_seqs}).")
+                f"({self.max_num_seqs})."
+            )
 
-        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
+        if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
             logger.warning(
                 "max_num_batched_tokens (%d) exceeds max_num_seqs "
                 "* max_model_len (%d). This may lead to unexpected behavior.",
                 self.max_num_batched_tokens,
-                self.max_num_seqs * self.max_model_len)
+                self.max_num_seqs * max_model_len,
+            )
 
-        if self.num_lookahead_slots < 0:
-            raise ValueError(
-                "num_lookahead_slots "
-                f"({self.num_lookahead_slots}) must be greater than or "
-                "equal to 0.")
+        if self.max_num_partial_prefills > 1:
+            if not self.enable_chunked_prefill:
+                raise ValueError(
+                    "Chunked prefill must be enabled to set "
+                    "max_num_partial_prefills > 1."
+                )
 
-        if self.max_num_partial_prefills < 1:
-            raise ValueError(
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
-                "must be greater than or equal to 1.")
-        elif self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
-                raise ValueError("Chunked prefill must be enabled to set "
-                                 "max_num_partial_prefills > 1.")
-
-            if self.long_prefill_token_threshold > self.max_model_len:
+            if self.long_prefill_token_threshold > max_model_len:
                 raise ValueError(
                     "long_prefill_token_threshold "
                     f"({self.long_prefill_token_threshold}) cannot be greater "
-                    f"than the max_model_len ({self.max_model_len}).")
+                    f"than the max_model_len ({max_model_len})."
+                )
 
-        if (self.max_long_partial_prefills
-                < 1) or (self.max_long_partial_prefills
-                         > self.max_num_partial_prefills):
+        if self.max_long_partial_prefills > self.max_num_partial_prefills:
             raise ValueError(
-                f"max_long_partial_prefills ({self.max_long_partial_prefills}) "
-                "must be greater than or equal to 1 and less than or equal to "
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
+                f"{self.max_long_partial_prefills=} must be less than or equal to "
+                f"{self.max_num_partial_prefills=}."
+            )
 
         return self
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index d5c6d1d4d8668ae5508fe5ae23a7df2f2010b7af..13a8632413d91041338123abdcd24d78958fea85 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -3,17 +3,16 @@
 
 import ast
 import hashlib
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal, get_args
 
-from pydantic import SkipValidation, model_validator
+from pydantic import Field, SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
-import vllm.envs as envs
 from vllm.config.parallel import ParallelConfig
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils import LazyLoader
+from vllm.utils.import_utils import LazyLoader, has_arctic_inference
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -24,33 +23,48 @@ else:
     PretrainedConfig = Any
     ModelConfig = Any
 
-    me_quant = LazyLoader("model_executor", globals(),
-                          "vllm.model_executor.layers.quantization")
+    me_quant = LazyLoader(
+        "model_executor", globals(), "vllm.model_executor.layers.quantization"
+    )
 
 logger = init_logger(__name__)
 
-SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
-                            "mlp_speculator", "draft_model", "deepseek_mtp",
-                            "ernie_mtp", "qwen3_next_mtp", "mimo_mtp",
-                            "longcat_flash_mtp", "mtp"]
-MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp",
-                   "qwen3_next_mtp", "longcat_flash_mtp")
+MTPModelTypes = Literal[
+    "deepseek_mtp",
+    "mimo_mtp",
+    "glm4_moe_mtp",
+    "ernie_mtp",
+    "qwen3_next_mtp",
+    "longcat_flash_mtp",
+    "mtp",
+    "pangu_ultra_moe_mtp",
+]
+EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
+SpeculativeMethod = Literal[
+    "ngram",
+    "medusa",
+    "mlp_speculator",
+    "draft_model",
+    "suffix",
+    EagleModelTypes,
+]
 
 
 @config
 @dataclass
 class SpeculativeConfig:
     """Configuration for speculative decoding."""
-    enforce_eager: Optional[bool] = None
+
+    enforce_eager: bool | None = None
     """Override the default enforce_eager from model_config"""
     # General speculative decoding control
-    num_speculative_tokens: SkipValidation[int] = None  # type: ignore
+    num_speculative_tokens: int = Field(default=None, gt=0)
     """The number of speculative tokens, if provided. It will default to the
     number in the draft model config if present, otherwise, it is required."""
-    model: Optional[str] = None
+    model: str | None = None
     """The name of the draft model, eagle head, or additional weights, if
     provided."""
-    method: Optional[SpeculativeMethod] = None
+    method: SpeculativeMethod | None = None
     """The name of the speculative method to use. If users provide and set the
     `model` param, the speculative method type will be detected automatically
     if possible, if `model` param is not provided, the method name must be
@@ -58,33 +72,29 @@ class SpeculativeConfig:
 
     If using `ngram` method, the related configuration `prompt_lookup_max` and
     `prompt_lookup_min` should be considered."""
-    draft_tensor_parallel_size: Optional[int] = None
+    draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
     """The degree of the tensor parallelism for the draft model. Can only be 1
     or the same as the target model's tensor parallel size."""
-    disable_logprobs: bool = True
-    """If set to True, token log probabilities are not returned during
-    speculative decoding. If set to False, token log probabilities are returned
-    according to the log probability settings in SamplingParams."""
 
     # Draft model configuration
-    quantization: Optional[me_quant.QuantizationMethods] = None
+    quantization: me_quant.QuantizationMethods | None = None
     """Quantization method that was used to quantize the draft model weights.
     If `None`, we assume the model weights are not quantized. Note that it only
     takes effect when using the draft model-based speculative method."""
-    max_model_len: Optional[int] = None
+    max_model_len: int | None = Field(default=None, ge=1)
     """The maximum model length of the draft model. Used when testing the
     ability to skip speculation for some sequences."""
-    revision: Optional[str] = None
+    revision: str | None = None
     """The specific model version to use for the draft model. It can be a
     branch name, a tag name, or a commit id. If unspecified, will use the
     default version."""
-    code_revision: Optional[str] = None
+    code_revision: str | None = None
     """The specific revision to use for the draft model code on Hugging Face
     Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
     will use the default version."""
 
     # Advanced control
-    disable_by_batch_size: Optional[int] = None
+    disable_by_batch_size: int | None = Field(default=None, ge=2)
     """Disable speculative decoding for new incoming requests when the number
     of enqueued requests is larger than this value, if provided."""
     disable_padded_drafter_batch: bool = False
@@ -94,36 +104,49 @@ class SpeculativeConfig:
     only affects the EAGLE method of speculation."""
 
     # Ngram proposer configuration
-    prompt_lookup_max: Optional[int] = None
+    prompt_lookup_max: int | None = Field(default=None, ge=1)
     """Maximum size of ngram token window when using Ngram proposer, required
     when method is set to ngram."""
-    prompt_lookup_min: Optional[int] = None
+    prompt_lookup_min: int | None = Field(default=None, ge=1)
     """Minimum size of ngram token window when using Ngram proposer, if
     provided. Defaults to 1."""
 
-    speculative_token_tree: Optional[str] = None
+    speculative_token_tree: str | None = None
     """Specifies the tree structure for speculative token generation.
     """
     # required configuration params passed from engine
     target_model_config: SkipValidation[ModelConfig] = None  # type: ignore
     """The configuration of the target model."""
-    target_parallel_config: SkipValidation[
-        ParallelConfig] = None  # type: ignore
+    target_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
     """The parallel configuration for the target model."""
-    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
-    """Whether vLLM is configured to use chunked prefill or not. Used for
-    raising an error since it's not yet compatible with speculative decode."""
-    disable_log_stats: SkipValidation[bool] = None  # type: ignore
-    """Whether to disable the periodic printing of stage times in speculative
-    decoding."""
 
     # params generated in the post-init stage
     draft_model_config: SkipValidation[ModelConfig] = None  # type: ignore
     """The configuration of the draft model initialized internal."""
-    draft_parallel_config: SkipValidation[
-        ParallelConfig] = None  # type: ignore
+    draft_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
+    # Suffix decoding configuration
+    suffix_decoding_max_tree_depth: int = 24
+    """The maximum depth of the suffix decoding global and prompt trees. The
+    tree depth limits the sum of the prefix match and speculation lengths."""
+
+    suffix_decoding_max_cached_requests: int = 10000
+    """The maximum number of requests to cache in the global suffix tree. If
+    exceeded, will trigger eviction in FIFO order. If set to 0, the global
+    suffix tree is disabled and past responses are not cached (prompt trees
+    are still used)."""
+
+    suffix_decoding_max_spec_factor: float = 1.0
+    """The maximum spec factor for suffix decoding. The spec factor controls
+    speculation lengths based on the prefix match length: max_spec_tokens =
+    max_spec_factor * prefix_match_length."""
+
+    suffix_decoding_min_token_prob: float = 0.1
+    """The minimum token probability for suffix decoding. Will only speculate
+    tokens with estimated probability (based on frequency counts) greater than
+    or equal to this value."""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -140,8 +163,7 @@ class SpeculativeConfig:
         # Eagle3 affects the computation graph because it returns intermediate
         # hidden states in addition to the final hidden state.
         factors.append(self.method == "eagle3")
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
     @staticmethod
@@ -150,58 +172,64 @@ class SpeculativeConfig:
             hf_config.model_type = "deepseek_mtp"
         if hf_config.model_type == "deepseek_mtp":
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-            hf_config.update({
-                "n_predict": n_predict,
-                "architectures": ["DeepSeekMTPModel"]
-            })
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
+            )
+        if hf_config.model_type in ("pangu_ultra_moe"):
+            hf_config.model_type = "pangu_ultra_moe_mtp"
+        if hf_config.model_type == "pangu_ultra_moe_mtp":
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
+            )
 
         if hf_config.architectures[0] == "MiMoForCausalLM":
             hf_config.model_type = "mimo_mtp"
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-            hf_config.update({
-                "num_hidden_layers": 0,
-                "n_predict": n_predict,
-                "architectures": ["MiMoMTPModel"]
-            })
+            hf_config.update(
+                {
+                    "num_hidden_layers": 0,
+                    "n_predict": n_predict,
+                    "architectures": ["MiMoMTPModel"],
+                }
+            )
 
         if hf_config.architectures[0] == "Glm4MoeForCausalLM":
             hf_config.model_type = "glm4_moe_mtp"
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-            hf_config.update({
-                "num_hidden_layers": 0,
-                "n_predict": n_predict,
-                "architectures": ["Glm4MoeMTPModel"]
-            })
+            hf_config.update(
+                {
+                    "num_hidden_layers": 0,
+                    "n_predict": n_predict,
+                    "architectures": ["Glm4MoeMTPModel"],
+                }
+            )
 
         if hf_config.model_type == "ernie4_5_moe":
             hf_config.model_type = "ernie_mtp"
         if hf_config.model_type == "ernie_mtp":
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-            hf_config.update({
-                "n_predict": n_predict,
-                "architectures": ["ErnieMTPModel"]
-            })
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
+            )
 
         if hf_config.model_type == "qwen3_next":
             hf_config.model_type = "qwen3_next_mtp"
         if hf_config.model_type == "qwen3_next_mtp":
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-            hf_config.update({
-                "n_predict": n_predict,
-                "architectures": ["Qwen3NextMTP"]
-            })
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
+            )
         if hf_config.model_type == "longcat_flash":
             hf_config.model_type = "longcat_flash_mtp"
             n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
-            hf_config.update({
-                "n_predict": n_predict,
-                "architectures": ["LongCatFlashMTPModel"]
-            })
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
+            )
 
         return hf_config
 
     def __post_init__(self):
-
         # Note: "method" is a new parameter that helps to extend the
         # configuration of non-model-based proposers, and the "model" parameter
         # will be used to set the draft model, eagle head, or additional weight
@@ -210,18 +238,17 @@ class SpeculativeConfig:
         # can not be detected, it will be considered as the "draft_model" by
         # default.
 
-        if self.method in MTP_MODEL_TYPES:
-            logger.warning("method `%s` is deprecated and replaced with mtp.",
-                           self.method)
+        if self.method in get_args(MTPModelTypes) and self.method != "mtp":
+            logger.warning(
+                "method `%s` is deprecated and replaced with mtp.", self.method
+            )
             self.method = "mtp"
 
         if self.model is None and self.num_speculative_tokens is not None:
             if self.method == "mtp":
-                assert (
-                    self.target_model_config
-                    is not None), "target_model_config must be present for mtp"
-                if self.target_model_config.hf_text_config.model_type \
-                    == "deepseek_v32":
+                if self.target_model_config is None:
+                    raise ValueError("target_model_config must be present for mtp")
+                if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
                     # FIXME(luccafong): cudgraph with v32 MTP is not supported,
                     # remove this when the issue is fixed.
                     self.enforce_eager = True
@@ -233,50 +260,57 @@ class SpeculativeConfig:
                     self.quantization = self.target_model_config.quantization
             elif self.method in ("ngram", "[ngram]"):
                 self.model = "ngram"
+            elif self.method == "suffix":
+                self.model = "suffix"
             else:
                 raise ValueError(
-                    "num_speculative_tokens was provided but without "
-                    "speculative model.")
+                    "num_speculative_tokens was provided but without speculative model."
+                )
 
         # Automatically configure the method for ngram when "model" is used
         # instead of "method"
-        if self.method is None and (self.model is not None
-                                    and self.model in ("ngram", "[ngram]")):
+        if self.method is None and (
+            self.model is not None and self.model in ("ngram", "[ngram]")
+        ):
             self.method = "ngram"
 
         if self.method in ("ngram", "[ngram]"):
             # Unified to "ngram" internally
             self.method = "ngram"
             # Set default values if not provided
-            if (self.prompt_lookup_min is None
-                    and self.prompt_lookup_max is None):
+            if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
                 # TODO(woosuk): Tune these values. They are arbitrarily chosen.
                 self.prompt_lookup_min = 5
                 self.prompt_lookup_max = 5
             elif self.prompt_lookup_min is None:
-                assert self.prompt_lookup_max is not None
+                if self.prompt_lookup_max is None:
+                    raise ValueError(
+                        "Either prompt_lookup_max or prompt_lookup_min must be "
+                        "provided when using the ngram method."
+                    )
                 self.prompt_lookup_min = self.prompt_lookup_max
             elif self.prompt_lookup_max is None:
-                assert self.prompt_lookup_min is not None
+                if self.prompt_lookup_min is None:
+                    raise ValueError(
+                        "Either prompt_lookup_max or prompt_lookup_min must be "
+                        "provided when using the ngram method."
+                    )
                 self.prompt_lookup_max = self.prompt_lookup_min
 
             # Validate values
-            if self.prompt_lookup_min < 1:
-                raise ValueError(
-                    f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
-            if self.prompt_lookup_max < 1:
-                raise ValueError(
-                    f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
             if self.prompt_lookup_min > self.prompt_lookup_max:
                 raise ValueError(
                     f"prompt_lookup_min={self.prompt_lookup_min} must "
-                    f"be <= prompt_lookup_max={self.prompt_lookup_max}")
+                    f"be <= prompt_lookup_max={self.prompt_lookup_max}"
+                )
 
             # TODO: current we still need extract vocab_size from target model
             # config, in future, we may try refactor it out, and set
             # draft related config as None here.
             self.draft_model_config = self.target_model_config
             self.draft_parallel_config = self.target_parallel_config
+        elif self.method == "suffix":
+            self._validate_suffix_decoding()
         else:
             self.prompt_lookup_max = 0
             self.prompt_lookup_min = 0
@@ -285,25 +319,21 @@ class SpeculativeConfig:
                 # TODO: Move this import to the top once `ModelConfig`
                 # lives in `vllm.config.model`.
                 from vllm.config import ModelConfig
+
                 self.draft_model_config = ModelConfig(
                     model=self.model,
                     runner="draft",
                     tokenizer=self.target_model_config.tokenizer,
                     tokenizer_mode=self.target_model_config.tokenizer_mode,
-                    trust_remote_code=self.target_model_config.
-                    trust_remote_code,
-                    allowed_local_media_path=self.target_model_config.
-                    allowed_local_media_path,
-                    allowed_media_domains=self.target_model_config.
-                    allowed_media_domains,
+                    trust_remote_code=self.target_model_config.trust_remote_code,
+                    allowed_local_media_path=self.target_model_config.allowed_local_media_path,
+                    allowed_media_domains=self.target_model_config.allowed_media_domains,
                     dtype=self.target_model_config.dtype,
                     seed=self.target_model_config.seed,
                     revision=self.revision,
                     code_revision=self.code_revision,
-                    tokenizer_revision=self.target_model_config.
-                    tokenizer_revision,
-                    spec_target_max_model_len=self.target_model_config.
-                    max_model_len,
+                    tokenizer_revision=self.target_model_config.tokenizer_revision,
+                    spec_target_max_model_len=self.target_model_config.max_model_len,
                     quantization=self.quantization,
                     enforce_eager=self.target_model_config.enforce_eager,
                     max_logprobs=self.target_model_config.max_logprobs,
@@ -311,7 +341,7 @@ class SpeculativeConfig:
                 )
 
                 # Automatically detect the method
-                if self.method in ('eagle', 'eagle3'):
+                if self.method in ("eagle", "eagle3"):
                     pass
                 # examples:
                 # yuhuili/EAGLE-LLaMA3-Instruct-8B
@@ -323,94 +353,97 @@ class SpeculativeConfig:
                     self.method = "eagle3"
                 elif self.draft_model_config.hf_config.model_type == "medusa":
                     self.method = "medusa"
-                elif (self.draft_model_config.hf_config.model_type ==
-                      "mlp_speculator"):
+                elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
                     self.method = "mlp_speculator"
-                elif (self.draft_model_config.hf_config.model_type
-                      in MTP_MODEL_TYPES):
+                elif self.draft_model_config.hf_config.model_type in get_args(
+                    MTPModelTypes
+                ):
                     self.method = "mtp"
                     if self.num_speculative_tokens > 1:
                         logger.warning(
-                                "Enabling num_speculative_tokens > 1 will run" \
-                                "multiple times of forward on same MTP layer" \
-                                ",which may result in lower acceptance rate" \
-                            )
-                elif (self.draft_model_config.hf_config.model_type
-                      in ("longcat_flash_mtp")):
+                            "Enabling num_speculative_tokens > 1 will run"
+                            "multiple times of forward on same MTP layer"
+                            ",which may result in lower acceptance rate"
+                        )
+                elif self.draft_model_config.hf_config.model_type in (
+                    "longcat_flash_mtp"
+                ):
                     self.method = "longcat_flash_mtp"
                     if self.num_speculative_tokens > 1:
                         logger.warning(
-                                "LongCat MTP models only have " \
-                                "one layer. Might need some code changes " \
-                                "to support multiple layers."
-                            )
+                            "LongCat MTP models only have "
+                            "one layer. Might need some code changes "
+                            "to support multiple layers."
+                        )
                 else:
                     self.method = "draft_model"
                     raise NotImplementedError(
                         "Speculative decoding with draft model is not "
                         "supported yet. Please consider using other "
                         "speculative decoding methods such as ngram, medusa, "
-                        "eagle, or mtp.")
+                        "eagle, or mtp."
+                    )
 
                 # Replace hf_config for EAGLE draft_model
                 if self.method in ("eagle", "eagle3"):
-                    if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
-                        raise ValueError(
-                            "Chunked prefill and EAGLE are not compatible "
-                            "when using V0.")
-
-                    from vllm.transformers_utils.configs import (
-                        SpeculatorsConfig)
-                    from vllm.transformers_utils.configs.eagle import (
-                        EAGLEConfig)
+                    from vllm.transformers_utils.configs import SpeculatorsConfig
+                    from vllm.transformers_utils.configs.eagle import EAGLEConfig
 
-                    if isinstance(self.draft_model_config.hf_config,
-                                  (EAGLEConfig, SpeculatorsConfig)):
+                    if isinstance(
+                        self.draft_model_config.hf_config,
+                        (EAGLEConfig, SpeculatorsConfig),
+                    ):
                         pass
                     else:
                         eagle_config = EAGLEConfig(
                             self.draft_model_config.hf_config,
                             method=self.method,
-                            model_type="eagle")
+                            model_type="eagle",
+                        )
                         self.draft_model_config.hf_config = eagle_config
 
-                if (self.num_speculative_tokens is not None
-                        and hasattr(self.draft_model_config.hf_config,
-                                    "num_lookahead_tokens")):
-                    self.draft_model_config.hf_config.num_lookahead_tokens = \
-                    self.num_speculative_tokens
+                if self.num_speculative_tokens is not None and hasattr(
+                    self.draft_model_config.hf_config, "num_lookahead_tokens"
+                ):
+                    self.draft_model_config.hf_config.num_lookahead_tokens = (
+                        self.num_speculative_tokens
+                    )
 
-                n_predict = getattr(self.draft_model_config.hf_config,
-                                    "n_predict", None)
+                n_predict = getattr(
+                    self.draft_model_config.hf_config, "n_predict", None
+                )
                 if n_predict is not None:
                     if self.num_speculative_tokens is None:
                         # Default to max value defined in draft model config.
                         self.num_speculative_tokens = n_predict
-                    elif self.num_speculative_tokens > n_predict and \
-                            self.num_speculative_tokens % n_predict != 0:
+                    elif (
+                        self.num_speculative_tokens > n_predict
+                        and self.num_speculative_tokens % n_predict != 0
+                    ):
                         # Ensure divisibility for MTP module reuse.
                         raise ValueError(
                             f"num_speculative_tokens:{self.num_speculative_tokens}"
-                            f" must be divisible by {n_predict=}")
+                            f" must be divisible by {n_predict=}"
+                        )
 
                 if self.speculative_token_tree is None:
                     # Generate chain of tokens.
-                    self.speculative_token_tree = str([
-                        (i + 1) * (0, )
-                        for i in range(self.num_speculative_tokens)
-                    ])
+                    self.speculative_token_tree = str(
+                        [(i + 1) * (0,) for i in range(self.num_speculative_tokens)]
+                    )
                 else:
                     # Sort the token tree breadth-first.
-                    tree_choices = ast.literal_eval(
-                        self.speculative_token_tree)
+                    tree_choices = ast.literal_eval(self.speculative_token_tree)
                     self.speculative_token_tree = str(
-                        sorted(tree_choices, key=lambda t: (len(t), t)))
+                        sorted(tree_choices, key=lambda t: (len(t), t))
+                    )
 
-                self.draft_tensor_parallel_size = \
+                self.draft_tensor_parallel_size = (
                     SpeculativeConfig._verify_and_get_draft_tp(
                         self.target_parallel_config,
                         self.draft_tensor_parallel_size,
-                        self.draft_model_config.hf_config
+                        self.draft_model_config.hf_config,
+                    )
                 )
 
                 self.draft_model_config.max_model_len = (
@@ -418,16 +451,55 @@ class SpeculativeConfig:
                         self.max_model_len,
                         self.draft_model_config.max_model_len,
                         self.target_model_config.max_model_len,
-                    ))
+                    )
+                )
 
                 self.draft_parallel_config = (
                     SpeculativeConfig.create_draft_parallel_config(
-                        self.target_parallel_config,
-                        self.draft_tensor_parallel_size))
+                        self.target_parallel_config, self.draft_tensor_parallel_size
+                    )
+                )
+        return self
+
+    def _validate_suffix_decoding(self):
+        if not has_arctic_inference():
+            raise ImportError(
+                "Arctic Inference is required for suffix decoding. "
+                "Install via `pip install arctic-inference==0.1.1`."
+            )
+        if self.num_speculative_tokens is None:
+            # Suffix decoding decides the actual number of speculative tokens
+            # dynamically and treats num_speculative_tokens as a maximum limit.
+            self.num_speculative_tokens = self.suffix_decoding_max_tree_depth
+            logger.warning(
+                "Defaulted num_speculative_tokens to %s for suffix decoding.",
+                self.num_speculative_tokens,
+            )
+        # Validate values
+        if self.suffix_decoding_max_tree_depth < 1:
+            raise ValueError(
+                f"suffix_decoding_max_tree_depth="
+                f"{self.suffix_decoding_max_tree_depth} must be >= 1"
+            )
+        if self.suffix_decoding_max_cached_requests < 0:
+            raise ValueError(
+                f"suffix_decoding_max_cached_requests="
+                f"{self.suffix_decoding_max_cached_requests} must be >= 0"
+            )
+        if self.suffix_decoding_max_spec_factor < 0:
+            raise ValueError(
+                f"suffix_decoding_max_spec_factor="
+                f"{self.suffix_decoding_max_spec_factor} must be >= 0"
+            )
+        if not 0 <= self.suffix_decoding_min_token_prob <= 1:
+            raise ValueError(
+                f"suffix_decoding_min_token_prob="
+                f"{self.suffix_decoding_min_token_prob} must be in [0, 1]"
+            )
 
     @staticmethod
     def _maybe_override_draft_max_model_len(
-        speculative_max_model_len: Optional[int],
+        speculative_max_model_len: int | None,
         draft_max_model_len: int,
         target_max_model_len: int,
     ) -> int:
@@ -444,14 +516,17 @@ class SpeculativeConfig:
         """
 
         if speculative_max_model_len is not None:
-
             if speculative_max_model_len > draft_max_model_len:
-                raise ValueError(f"{speculative_max_model_len=} cannot be "
-                                 f"larger than {draft_max_model_len=}")
+                raise ValueError(
+                    f"{speculative_max_model_len=} cannot be "
+                    f"larger than {draft_max_model_len=}"
+                )
 
             if speculative_max_model_len > target_max_model_len:
-                raise ValueError(f"{speculative_max_model_len=} cannot be "
-                                 f"larger than {target_max_model_len=}")
+                raise ValueError(
+                    f"{speculative_max_model_len=} cannot be "
+                    f"larger than {target_max_model_len=}"
+                )
 
             return speculative_max_model_len
 
@@ -462,9 +537,10 @@ class SpeculativeConfig:
 
     @staticmethod
     def _verify_and_get_draft_tp(
-            target_parallel_config: ParallelConfig,
-            speculative_draft_tensor_parallel_size: Optional[int],
-            draft_hf_config: PretrainedConfig) -> int:
+        target_parallel_config: ParallelConfig,
+        speculative_draft_tensor_parallel_size: int | None,
+        draft_hf_config: PretrainedConfig,
+    ) -> int:
         """
         Verifies and adjusts the tensor parallel size for a draft model
         specified using speculative_draft_tensor_parallel_size.
@@ -478,15 +554,20 @@ class SpeculativeConfig:
                     logger.warning(
                         "%s cannot currently be run with tp>1; "
                         "setting speculative_draft_tensor_parallel_size=1",
-                        draft_hf_config.model_type)
+                        draft_hf_config.model_type,
+                    )
             else:
-                speculative_draft_tensor_parallel_size = \
+                speculative_draft_tensor_parallel_size = (
                     target_parallel_config.tensor_parallel_size
+                )
         elif speculative_draft_tensor_parallel_size not in (
-                1, target_parallel_config.tensor_parallel_size):
+            1,
+            target_parallel_config.tensor_parallel_size,
+        ):
             raise ValueError(
                 f"{speculative_draft_tensor_parallel_size=} cannot be "
-                f"other value than 1 or target model tensor_parallel_size")
+                f"other value than 1 or target model tensor_parallel_size"
+            )
         return speculative_draft_tensor_parallel_size
 
     @staticmethod
@@ -499,52 +580,57 @@ class SpeculativeConfig:
         This is mostly a copy of the target parallel config, except the tp_size.
         """
         draft_parallel_config = ParallelConfig(
-            pipeline_parallel_size=target_parallel_config.
-            pipeline_parallel_size,
+            pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
             tensor_parallel_size=speculative_draft_tensor_parallel_size,
-            distributed_executor_backend=target_parallel_config.
-            distributed_executor_backend,
-            max_parallel_loading_workers=target_parallel_config.
-            max_parallel_loading_workers,
-            disable_custom_all_reduce=target_parallel_config.
-            disable_custom_all_reduce,
-            ray_workers_use_nsight=target_parallel_config.
-            ray_workers_use_nsight,
+            distributed_executor_backend=target_parallel_config.distributed_executor_backend,
+            max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
+            disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce,
+            ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight,
             placement_group=target_parallel_config.placement_group,
         )
 
         return draft_parallel_config
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if self.num_speculative_tokens is None:
             raise ValueError(
                 "num_speculative_tokens must be provided with "
                 "speculative model unless the draft model config contains an "
-                "n_predict parameter.")
+                "n_predict parameter."
+            )
 
         if self.num_speculative_tokens <= 0:
-            raise ValueError("Expected num_speculative_tokens to be greater "
-                             f"than zero ({self.num_speculative_tokens}).")
+            raise ValueError(
+                "Expected num_speculative_tokens to be greater "
+                f"than zero ({self.num_speculative_tokens})."
+            )
 
         if self.draft_model_config:
             self.draft_model_config.verify_with_parallel_config(
-                self.draft_parallel_config)
+                self.draft_parallel_config
+            )
 
-        if (self.disable_by_batch_size is not None
-                and self.disable_by_batch_size < 2):
-            raise ValueError("Expect the batch size threshold of disabling "
-                             "speculative decoding is > 1, but got "
-                             f"{self.disable_by_batch_size=}")
+        if self.disable_by_batch_size is not None and self.disable_by_batch_size < 2:
+            raise ValueError(
+                "Expect the batch size threshold of disabling "
+                "speculative decoding is > 1, but got "
+                f"{self.disable_by_batch_size=}"
+            )
 
         eagle3_target_supported = ["llama", "qwen", "minicpm", "gpt_oss"]
-        if self.method == "eagle3" and self.target_model_config and not any(
-                supported_model in
-                self.target_model_config.hf_text_config.model_type
-                for supported_model in eagle3_target_supported):
+        if (
+            self.method == "eagle3"
+            and self.target_model_config
+            and not any(
+                supported_model in self.target_model_config.hf_text_config.model_type
+                for supported_model in eagle3_target_supported
+            )
+        ):
             raise ValueError(
                 f"Eagle3 is only supported for {eagle3_target_supported} models. "  # noqa: E501
-                f"Got {self.target_model_config.hf_text_config.model_type=}")
+                f"Got {self.target_model_config.hf_text_config.model_type=}"
+            )
 
         return self
 
@@ -563,6 +649,6 @@ class SpeculativeConfig:
 
     def __repr__(self) -> str:
         method = self.method
-        model = None if method == "ngram" else self.draft_model_config.model
+        model = None if method in ("ngram", "suffix") else self.draft_model_config.model
         num_spec_tokens = self.num_speculative_tokens
         return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
diff --git a/vllm/config/speech_to_text.py b/vllm/config/speech_to_text.py
index de9f525efe1851102036718383cdf342b82f5700..3eafff1a30609945f76e6122973b82b88eb104ae 100644
--- a/vllm/config/speech_to_text.py
+++ b/vllm/config/speech_to_text.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from pydantic.dataclasses import dataclass
 
@@ -28,7 +27,7 @@ class SpeechToTextConfig:
     splitting long audio. This helps maintain context across chunk boundaries
     and improves transcription quality at split points."""
 
-    min_energy_split_window_size: Optional[int] = 1600
+    min_energy_split_window_size: int | None = 1600
     """Window size in samples for finding low-energy (quiet) regions to split
     audio chunks. The algorithm looks for the quietest moment within this
     window to minimize cutting through speech. Default 1600 samples ≈ 100ms
diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py
index b1f14294510f8930346930493e4feb2ccbcae802..9530d3d81e15de091f4f1aa625c87d13905dfe42 100644
--- a/vllm/config/structured_outputs.py
+++ b/vllm/config/structured_outputs.py
@@ -4,12 +4,15 @@
 import hashlib
 from typing import Any, Literal
 
+from pydantic import model_validator
 from pydantic.dataclasses import dataclass
+from typing_extensions import Self
 
 from vllm.config.utils import config
 
-StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines",
-                                   "lm-format-enforcer"]
+StructuredOutputsBackend = Literal[
+    "auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
+]
 
 
 @config
@@ -34,6 +37,11 @@ class StructuredOutputsConfig:
     reasoning_parser: str = ""
     """Select the reasoning parser depending on the model that you're using.
     This is used to parse the reasoning content into OpenAI API format."""
+    reasoning_parser_plugin: str = ""
+    """Path to a dynamically reasoning parser plugin that can be dynamically
+    loaded and registered."""
+    enable_in_reasoning: bool = False
+    """Whether to use structured input for reasoning."""
 
     def compute_hash(self) -> str:
         """
@@ -50,15 +58,35 @@ class StructuredOutputsConfig:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self):
-        if (self.disable_any_whitespace
-                and self.backend not in ("xgrammar", "guidance")):
-            raise ValueError("disable_any_whitespace is only supported for "
-                             "xgrammar and guidance backends.")
-        if (self.disable_additional_properties and self.backend != "guidance"):
-            raise ValueError("disable_additional_properties is only supported "
-                             "for the guidance backend.")
+    @model_validator(mode="after")
+    def _validate_structured_output_config(self) -> Self:
+        # Import here to avoid circular import
+        from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
+
+        if self.reasoning_parser_plugin and len(self.reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(self.reasoning_parser_plugin)
+
+        valid_reasoning_parsers = ReasoningParserManager.list_registered()
+        if (
+            self.reasoning_parser != ""
+            and self.reasoning_parser not in valid_reasoning_parsers
+        ):
+            raise ValueError(
+                f"invalid reasoning parser: {self.reasoning_parser} "
+                f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
+            )
+
+        if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"):
+            raise ValueError(
+                "disable_any_whitespace is only supported for "
+                "xgrammar and guidance backends."
+            )
+        if self.disable_additional_properties and self.backend != "guidance":
+            raise ValueError(
+                "disable_additional_properties is only supported "
+                "for the guidance backend."
+            )
+        return self
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 91e61b330273858fa4d5161293f360c1b403338a..7e0878d96bbd6a73589dc87b8a1a141f1eb2cbc9 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -1,21 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Utility functions for vLLM config dataclasses."""
 
 import ast
 import inspect
 import textwrap
-from dataclasses import MISSING, Field, field, fields, is_dataclass
-from typing import TYPE_CHECKING, Any, TypeVar
+from collections.abc import Iterable
+from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
+from itertools import pairwise
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar
 
 import regex as re
+from pydantic.fields import FieldInfo
+from typing_extensions import runtime_checkable
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
-
-    ConfigType = type[DataclassInstance]
 else:
-    ConfigType = type
+    DataclassInstance = Any
 
+ConfigType = type[DataclassInstance]
 ConfigT = TypeVar("ConfigT", bound=ConfigType)
 
 
@@ -29,7 +33,7 @@ def config(cls: ConfigT) -> ConfigT:
     `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)` which treats the
     `cli_arg` as a JSON string which gets validated by `pydantic`.
 
-    Config validation is performed by the tools/validate_config.py
+    Config validation is performed by the tools/pre_commit/validate_config.py
     script, which is invoked during the pre-commit checks.
     """
     return cls
@@ -47,9 +51,29 @@ def get_field(cls: ConfigType, name: str) -> Field:
     if (default_factory := named_field.default_factory) is not MISSING:
         return field(default_factory=default_factory)
     if (default := named_field.default) is not MISSING:
+        if isinstance(default, FieldInfo):
+            # Handle pydantic.Field defaults
+            if default.default_factory is not None:
+                return field(default_factory=default.default_factory)
+            else:
+                default = default.default
         return field(default=default)
+
     raise ValueError(
-        f"{cls.__name__}.{name} must have a default value or default factory.")
+        f"{cls.__name__}.{name} must have a default value or default factory."
+    )
+
+
+def getattr_iter(object: object, names: Iterable[str], default: Any) -> Any:
+    """
+    A helper function that retrieves an attribute from an object which may
+    have multiple possible names. This is useful when fetching attributes from
+    arbitrary `transformers.PretrainedConfig` instances.
+    """
+    for name in names:
+        if hasattr(object, name):
+            return getattr(object, name)
+    return default
 
 
 def contains_object_print(text: str) -> bool:
@@ -65,7 +89,7 @@ def contains_object_print(text: str) -> bool:
     Returns:
         result (bool): `True` if a match is found, `False` otherwise.
     """
-    pattern = r'at 0x[a-fA-F0-9]{2,16}>'
+    pattern = r"at 0x[a-fA-F0-9]{2,16}>"
     match = re.search(pattern, text)
     return match is not None
 
@@ -76,7 +100,8 @@ def assert_hashable(text: str) -> bool:
     raise AssertionError(
         f"vLLM tried to hash some configs that may have Python objects ids "
         f"in them. This is a bug, please file an issue. "
-        f"Text being hashed: {text}")
+        f"Text being hashed: {text}"
+    )
 
 
 def get_attr_docs(cls: type[Any]) -> dict[str, str]:
@@ -86,30 +111,7 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
     https://davidism.com/mit-license/
     """
 
-    def pairwise(iterable):
-        """
-        Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise
-
-        Can be removed when Python 3.9 support is dropped.
-        """
-        iterator = iter(iterable)
-        a = next(iterator, None)
-
-        for b in iterator:
-            yield a, b
-            a = b
-
-    try:
-        cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
-    except (OSError, KeyError, TypeError):
-        # HACK: Python 3.13+ workaround - set missing __firstlineno__
-        # Workaround can be removed after we upgrade to pydantic==2.12.0
-        with open(inspect.getfile(cls)) as f:
-            for i, line in enumerate(f):
-                if f"class {cls.__name__}" in line and ":" in line:
-                    cls.__firstlineno__ = i + 1
-                    break
-        cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
+    cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
 
     if not isinstance(cls_node, ast.ClassDef):
         raise TypeError("Given object was not a class.")
@@ -119,10 +121,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
     # Consider each pair of nodes.
     for a, b in pairwise(cls_node.body):
         # Must be an assignment then a constant string.
-        if (not isinstance(a, (ast.Assign, ast.AnnAssign))
-                or not isinstance(b, ast.Expr)
-                or not isinstance(b.value, ast.Constant)
-                or not isinstance(b.value.value, str)):
+        if (
+            not isinstance(a, (ast.Assign, ast.AnnAssign))
+            or not isinstance(b, ast.Expr)
+            or not isinstance(b.value, ast.Constant)
+            or not isinstance(b.value.value, str)
+        ):
             continue
 
         doc = inspect.cleandoc(b.value.value)
@@ -143,3 +147,32 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
 
 def is_init_field(cls: ConfigType, name: str) -> bool:
     return next(f for f in fields(cls) if f.name == name).init
+
+
+@runtime_checkable
+class SupportsHash(Protocol):
+    def compute_hash(self) -> str: ...
+
+
+class SupportsMetricsInfo(Protocol):
+    def metrics_info(self) -> dict[str, str]: ...
+
+
+def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT:
+    processed_overrides = {}
+    for field_name, value in overrides.items():
+        assert hasattr(config, field_name), (
+            f"{type(config)} has no field `{field_name}`"
+        )
+        current_value = getattr(config, field_name)
+        if is_dataclass(current_value) and not is_dataclass(value):
+            assert isinstance(value, dict), (
+                f"Overrides to {type(config)}.{field_name} must be a dict"
+                f"  or {type(current_value)}, but got {type(value)}"
+            )
+            value = update_config(
+                current_value,  # type: ignore[type-var]
+                value,
+            )
+        processed_overrides[field_name] = value
+    return replace(config, **processed_overrides)
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..672b004c4aa565c9ef2e081ba0147a3dd8e6f273
--- /dev/null
+++ b/vllm/config/vllm.py
@@ -0,0 +1,1166 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import copy
+import getpass
+import hashlib
+import json
+import os
+import tempfile
+import threading
+import time
+from contextlib import contextmanager
+from dataclasses import replace
+from datetime import datetime
+from functools import lru_cache
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, TypeVar, get_args
+
+import torch
+from pydantic import ConfigDict, Field, model_validator
+from pydantic.dataclasses import dataclass
+
+import vllm.envs as envs
+from vllm.config.speculative import EagleModelTypes
+from vllm.logger import enable_trace_function_call, init_logger
+from vllm.transformers_utils.runai_utils import is_runai_obj_uri
+from vllm.utils import random_uuid
+
+from .cache import CacheConfig
+from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
+from .device import DeviceConfig
+from .ec_transfer import ECTransferConfig
+from .kv_events import KVEventsConfig
+from .kv_transfer import KVTransferConfig
+from .load import LoadConfig
+from .lora import LoRAConfig
+from .model import ModelConfig
+from .observability import ObservabilityConfig
+from .parallel import ParallelConfig
+from .scheduler import SchedulerConfig
+from .speculative import SpeculativeConfig
+from .structured_outputs import StructuredOutputsConfig
+from .utils import SupportsHash, config
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+else:
+    PretrainedConfig = Any
+
+    QuantizationConfig = Any
+
+    KVCacheConfig = Any
+
+logger = init_logger(__name__)
+
+
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class VllmConfig:
+    """Dataclass which contains all vllm-related configuration. This
+    simplifies passing around the distinct configurations in the codebase.
+    """
+
+    # TODO: use default_factory once default constructing ModelConfig doesn't
+    # try to download a model
+    model_config: ModelConfig = Field(default=None)
+    """Model configuration."""
+    cache_config: CacheConfig = Field(default_factory=CacheConfig)
+    """Cache configuration."""
+    parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
+    """Parallel configuration."""
+    scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig)
+    """Scheduler configuration."""
+    device_config: DeviceConfig = Field(default_factory=DeviceConfig)
+    """Device configuration."""
+    load_config: LoadConfig = Field(default_factory=LoadConfig)
+    """Load configuration."""
+    lora_config: LoRAConfig | None = None
+    """LoRA configuration."""
+    speculative_config: SpeculativeConfig | None = None
+    """Speculative decoding configuration."""
+    structured_outputs_config: StructuredOutputsConfig = Field(
+        default_factory=StructuredOutputsConfig
+    )
+    """Structured outputs configuration."""
+    observability_config: ObservabilityConfig = Field(
+        default_factory=ObservabilityConfig
+    )
+    """Observability configuration."""
+    quant_config: QuantizationConfig | None = None
+    """Quantization configuration."""
+    compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
+    """`torch.compile` and cudagraph capture configuration for the model.
+
+    As a shorthand, one can append compilation arguments via 
+    -0.parameter=arguement such as `-O.mode=3` (same as `-O='{"mode":3}'`).
+
+    You can specify the full compilation config like so:
+    `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
+    """
+    kv_transfer_config: KVTransferConfig | None = None
+    """The configurations for distributed KV cache transfer."""
+    kv_events_config: KVEventsConfig | None = None
+    """The configurations for event publishing."""
+    ec_transfer_config: ECTransferConfig | None = None
+    """The configurations for distributed EC cache transfer."""
+    # some opaque config, only used to provide additional information
+    # for the hash computation, mainly used for testing, debugging or out of
+    # tree config registration.
+    additional_config: dict | SupportsHash = Field(default_factory=dict)
+    """Additional config for specified platform. Different platforms may
+    support different configs. Make sure the configs are valid for the platform
+    you are using. Contents must be hashable."""
+    instance_id: str = ""
+    """The ID of the vLLM instance."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        factors: list[Any] = []
+
+        # summarize vllm config
+        vllm_factors: list[Any] = []
+        from vllm import __version__
+
+        vllm_factors.append(__version__)
+        if self.model_config:
+            vllm_factors.append(self.model_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.cache_config:
+            vllm_factors.append(self.cache_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.parallel_config:
+            vllm_factors.append(self.parallel_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.scheduler_config:
+            vllm_factors.append(self.scheduler_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.device_config:
+            vllm_factors.append(self.device_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.load_config:
+            vllm_factors.append(self.load_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.lora_config:
+            vllm_factors.append(self.lora_config.compute_hash())
+            # LoRA creates static buffers based on max_num_batched_tokens.
+            # The tensor sizes and strides get captured in the torch.compile
+            # graph explicitly.
+            vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
+        else:
+            vllm_factors.append("None")
+        if self.speculative_config:
+            vllm_factors.append(self.speculative_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.structured_outputs_config:
+            vllm_factors.append(self.structured_outputs_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        vllm_factors.append(self.observability_config.compute_hash())
+        if self.quant_config:
+            pass  # should be captured by model_config.quantization
+        if self.compilation_config:
+            vllm_factors.append(self.compilation_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.kv_transfer_config:
+            vllm_factors.append(self.kv_transfer_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.ec_transfer_config:
+            vllm_factors.append(self.ec_transfer_config.compute_hash())
+        else:
+            vllm_factors.append("None")
+        if self.additional_config:
+            if isinstance(additional_config := self.additional_config, dict):
+                additional_config_hash = hashlib.md5(
+                    json.dumps(additional_config, sort_keys=True).encode(),
+                    usedforsecurity=False,
+                ).hexdigest()
+            else:
+                additional_config_hash = additional_config.compute_hash()
+            vllm_factors.append(additional_config_hash)
+        else:
+            vllm_factors.append("None")
+        factors.append(vllm_factors)
+
+        hash_str = hashlib.md5(
+            str(factors).encode(), usedforsecurity=False
+        ).hexdigest()[:10]
+        return hash_str
+
+    def pad_for_cudagraph(self, batch_size: int) -> int:
+        # if batch_size > self.compilation_config.max_cudagraph_capture_size,
+        # it should raise an IndexError.
+        # the caller should make sure the batch_size is within the range,
+        # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size
+        return self.compilation_config.bs_to_padded_graph_size[batch_size]
+
+    def enable_trace_function_call_for_thread(self) -> None:
+        """
+        Set up function tracing for the current thread,
+        if enabled via the `VLLM_TRACE_FUNCTION` environment variable.
+        """
+        if envs.VLLM_TRACE_FUNCTION:
+            tmp_dir = tempfile.gettempdir()
+            # add username to tmp_dir to avoid permission issues
+            tmp_dir = os.path.join(tmp_dir, getpass.getuser())
+            filename = (
+                f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
+                f"_thread_{threading.get_ident()}_at_{datetime.now()}.log"
+            ).replace(" ", "_")
+            log_path = os.path.join(
+                tmp_dir,
+                "vllm",
+                f"vllm-instance-{self.instance_id}",
+                filename,
+            )
+            os.makedirs(os.path.dirname(log_path), exist_ok=True)
+            enable_trace_function_call(log_path)
+
+    @staticmethod
+    def _get_quantization_config(
+        model_config: ModelConfig, load_config: LoadConfig
+    ) -> QuantizationConfig | None:
+        """Get the quantization config."""
+        from vllm.platforms import current_platform
+
+        if model_config.quantization is not None:
+            from vllm.model_executor.model_loader.weight_utils import get_quant_config
+
+            quant_config = get_quant_config(model_config, load_config)
+            capability_tuple = current_platform.get_device_capability()
+
+            if capability_tuple is not None:
+                capability = capability_tuple.to_int()
+                if capability < quant_config.get_min_capability():
+                    raise ValueError(
+                        f"The quantization method {model_config.quantization} "
+                        "is not supported for the current GPU. Minimum "
+                        f"capability: {quant_config.get_min_capability()}. "
+                        f"Current capability: {capability}."
+                    )
+            supported_dtypes = quant_config.get_supported_act_dtypes()
+            if model_config.dtype not in supported_dtypes:
+                raise ValueError(
+                    f"{model_config.dtype} is not supported for quantization "
+                    f"method {model_config.quantization}. Supported dtypes: "
+                    f"{supported_dtypes}"
+                )
+            quant_config.maybe_update_config(model_config.model)
+            return quant_config
+        return None
+
+    @staticmethod
+    def get_quantization_config(
+        model_config: ModelConfig, load_config: LoadConfig
+    ) -> QuantizationConfig | None:
+        import copy
+
+        # For some reason, the _ version of this modifies the model_config
+        # object, so using deepcopy to avoid this problem.
+        return VllmConfig._get_quantization_config(
+            copy.deepcopy(model_config), load_config
+        )
+
+    def with_hf_config(
+        self,
+        hf_config: PretrainedConfig,
+        architectures: list[str] | None = None,
+    ) -> "VllmConfig":
+        if architectures is not None:
+            hf_config = copy.deepcopy(hf_config)
+            hf_config.architectures = architectures
+
+        model_config = copy.deepcopy(self.model_config)
+        model_config.hf_config = hf_config
+
+        return replace(self, model_config=model_config)
+
+    def _post_init_kv_transfer_config(self) -> None:
+        """Update KVTransferConfig based on top-level configs in VllmConfig.
+
+        Right now, this function reads the offloading settings from
+        CacheConfig and configures the KVTransferConfig accordingly.
+        """
+        if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
+            return
+
+        # If no KVTransferConfig is provided, create a default one.
+        if self.kv_transfer_config is None:
+            self.kv_transfer_config = KVTransferConfig()
+
+        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
+            raise ValueError(
+                "You must set kv_offloading_size when kv_offloading_backend is set."
+            )
+        num_kv_ranks = (
+            self.parallel_config.tensor_parallel_size
+            * self.parallel_config.pipeline_parallel_size
+        )
+
+        if kv_offloading_backend == "native":
+            self.kv_transfer_config.kv_connector = "OffloadingConnector"
+            kv_bytes_per_rank = kv_offloading_size * (1 << 30) / num_kv_ranks
+
+            # NOTE(ApostaC): the actual calculation for num_cpu_blocks should be
+            # done after the model's KV cache is initialized
+            self.kv_transfer_config.kv_connector_extra_config.update(
+                {"kv_bytes_per_rank": kv_bytes_per_rank, "num_cpu_blocks": 0}
+            )
+        elif kv_offloading_backend == "lmcache":
+            self.kv_transfer_config.kv_connector = "LMCacheConnectorV1"
+            kv_gb_per_rank = kv_offloading_size / num_kv_ranks
+            self.kv_transfer_config.kv_connector_extra_config = {
+                "lmcache.local_cpu": True,
+                "lmcache.max_local_cpu_size": kv_gb_per_rank,
+            }
+
+        # This is the same for all backends
+        self.kv_transfer_config.kv_role = "kv_both"
+
+    def __post_init__(self):
+        """Verify configs are valid & consistent with each other."""
+
+        # To give each torch profile run a unique instance name.
+        self.instance_id = f"{time.time_ns()}"
+
+        self.try_verify_and_update_config()
+
+        if self.model_config is not None:
+            self.model_config.verify_with_parallel_config(self.parallel_config)
+            self.model_config.verify_dual_chunk_attention_config(self.load_config)
+
+        self.cache_config.verify_with_parallel_config(self.parallel_config)
+
+        if self.lora_config is not None:
+            self.lora_config.verify_with_model_config(self.model_config)
+
+        if self.quant_config is None and self.model_config is not None:
+            self.quant_config = VllmConfig._get_quantization_config(
+                self.model_config, self.load_config
+            )
+
+        executor_backend = self.parallel_config.distributed_executor_backend
+        executor_supports_async_sched = executor_backend in (
+            "mp",
+            "uni",
+            "external_launcher",
+        )
+
+        if self.scheduler_config.async_scheduling:
+            # Async scheduling explicitly enabled, hard fail any incompatibilities.
+            if self.parallel_config.pipeline_parallel_size > 1:
+                raise ValueError(
+                    "Async scheduling is not yet compatible with "
+                    "pipeline_parallel_size > 1."
+                )
+            # Currently, async scheduling only support eagle speculative
+            # decoding.
+            if self.speculative_config is not None:
+                if self.speculative_config.method not in get_args(EagleModelTypes):
+                    raise ValueError(
+                        "Currently, async scheduling is only supported "
+                        "with EAGLE/MTP kind of speculative decoding"
+                    )
+                if self.speculative_config.disable_padded_drafter_batch:
+                    raise ValueError(
+                        "async scheduling for EAGLE/MTP kind of speculative "
+                        "decoding is enabled, but disable_padded_drafter_batch=True "
+                        "disable_padded_drafter_batch=True is not supported for "
+                        "this situation now. please set "
+                        "disable_padded_drafter_batch=Fasle"
+                    )
+            if not executor_supports_async_sched:
+                raise ValueError(
+                    "Currently, async scheduling only supports `mp`, `uni`, or "
+                    "`external_launcher` distributed executor backend, but you chose "
+                    f"`{executor_backend}`."
+                )
+        elif self.scheduler_config.async_scheduling is None:
+            # Enable async scheduling unless there is an incompatible option.
+            # NOTE: we won't reach here until async scheduling is enabled by default.
+            if (
+                self.parallel_config.pipeline_parallel_size > 1
+                or self.speculative_config is not None
+            ):
+                logger.warning(
+                    "Async scheduling is not yet supported with speculative decoding "
+                    " or pipeline_parallel_size > 1 and will be disabled."
+                )
+                self.scheduler_config.async_scheduling = False
+            elif not executor_supports_async_sched:
+                logger.warning(
+                    "Async scheduling will be disabled because it is not supported "
+                    "with the `%s` distributed executor backend (only `mp`, `uni`, and "
+                    "`external_launcher` are supported).",
+                    executor_backend,
+                )
+                self.scheduler_config.async_scheduling = False
+            else:
+                self.scheduler_config.async_scheduling = True
+
+        from vllm.platforms import current_platform
+
+        if (
+            self.model_config is not None
+            and self.scheduler_config.enable_chunked_prefill
+            and self.model_config.dtype == torch.float32
+            and current_platform.get_device_capability() == (7, 5)
+        ):
+            logger.warning_once(
+                "Turing devices tensor cores do not support float32 matmul. "
+                "To workaround this limitation, vLLM will set 'ieee' input "
+                "precision for chunked prefill triton kernels."
+            )
+
+        # If the user does not explicitly set a compilation mode, then
+        # we use the default mode. The default mode depends on other
+        # settings (see the below code).
+        if self.compilation_config.mode is None:
+            if self.model_config is not None and not self.model_config.enforce_eager:
+                self.compilation_config.mode = CompilationMode.VLLM_COMPILE
+            else:
+                self.compilation_config.mode = CompilationMode.NONE
+
+        # If user does not set custom ops via none or all set it here based on
+        # compilation mode and backend.
+        if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
+            if (
+                self.compilation_config.backend == "inductor"
+                and self.compilation_config.mode != CompilationMode.NONE
+            ):
+                self.compilation_config.custom_ops.append("none")
+            else:
+                self.compilation_config.custom_ops.append("all")
+
+        # async tp is built on top of sequence parallelism
+        # and requires it to be enabled.
+        if self.compilation_config.pass_config.enable_async_tp:
+            self.compilation_config.pass_config.enable_sequence_parallelism = True
+
+        if current_platform.support_static_graph_mode():
+            # if cudagraph_mode is not explicitly set by users, set default
+            # value
+            if self.compilation_config.cudagraph_mode is None:
+                if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+                    # default to full and piecewise for most models
+                    self.compilation_config.cudagraph_mode = (
+                        CUDAGraphMode.FULL_AND_PIECEWISE
+                    )
+                else:
+                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+
+            # if cudagraph_mode has full cudagraphs, we need to check support
+            if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
+                # decode context parallel does not support full cudagraphs
+                if self.parallel_config.decode_context_parallel_size > 1:
+                    logger.warning_once(
+                        "Decode context parallel (DCP) is enabled, which is "
+                        "incompatible with full CUDA graphs. "
+                        "Overriding cudagraph_mode to PIECEWISE."
+                    )
+                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                elif self.model_config is not None:
+                    if self.model_config.pooler_config is not None:
+                        logger.warning_once(
+                            "Pooling models do not support full cudagraphs. "
+                            "Overriding cudagraph_mode to PIECEWISE."
+                        )
+                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                    elif self.model_config.is_encoder_decoder:
+                        logger.warning_once(
+                            "Encoder-decoder models do not support full cudagraphs. "
+                            "Overriding cudagraph_mode to PIECEWISE."
+                        )
+                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
+            # disable cudagraph when enforce eager execution
+            if self.model_config is not None and self.model_config.enforce_eager:
+                logger.info("Cudagraph is disabled under eager mode")
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+                # override related settings when enforce eager
+                self.compilation_config.max_cudagraph_capture_size = 0
+                self.compilation_config.cudagraph_capture_sizes = []
+            else:
+                self.compilation_config.cudagraph_num_of_warmups = 1
+
+            self._set_cudagraph_sizes()
+        else:
+            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+
+        if self.cache_config.kv_sharing_fast_prefill:
+            if (
+                self.speculative_config is not None
+                and self.speculative_config.use_eagle()
+            ):
+                raise ValueError(
+                    "Fast prefill optimization for KV sharing is not "
+                    "compatible with EAGLE as EAGLE requires correct logits "
+                    "for all tokens while fast prefill gives incorrect logits "
+                    "for prompt tokens."
+                )
+
+            logger.warning_once(
+                "--kv-sharing-fast-prefill requires changes on model side for "
+                "correctness and to realize prefill savings. "
+            )
+
+        disable_chunked_prefill_reasons: list[str] = []
+
+        if self.model_config:
+            if self.model_config.pooler_config:
+                pooling_type = self.model_config.pooler_config.pooling_type
+                if pooling_type is None or pooling_type.lower() != "last":
+                    disable_chunked_prefill_reasons.append(
+                        'Only "last" pooling supports chunked '
+                        "prefill and prefix caching; disabling both."
+                    )
+                if not getattr(self.model_config.hf_config, "is_causal", True):
+                    disable_chunked_prefill_reasons.append(
+                        "Only models using causal attention support chunked "
+                        "prefill and prefix caching; disabling both."
+                    )
+            elif self.model_config.is_encoder_decoder:
+                from vllm.multimodal import MULTIMODAL_REGISTRY
+
+                self.scheduler_config.max_num_encoder_input_tokens = (
+                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
+                )
+                logger.debug(
+                    "Encoder-decoder model detected: setting "
+                    "`max_num_encoder_input_tokens` to encoder length (%s)",
+                    self.scheduler_config.max_num_encoder_input_tokens,
+                )
+                if (
+                    self.model_config.architecture == "WhisperForConditionalGeneration"
+                    and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
+                ):
+                    logger.warning(
+                        "Whisper is known to have issues with "
+                        "forked workers. If startup is hanging, "
+                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
+                        "to 'spawn'."
+                    )
+
+        # Final off-switch for CP/APC:
+        # Disable for (a) collected blockers, (b) encoder–decoder, or
+        # (c) explicit CP=False when APC wasn't requested.
+        # Do NOT disable merely because the resolved CP flag is False.
+        apc_requested = (
+            self.cache_config is not None and self.cache_config.enable_prefix_caching
+        )
+        if (
+            disable_chunked_prefill_reasons
+            or (self.model_config is not None and self.model_config.is_encoder_decoder)
+            or (
+                self.scheduler_config.enable_chunked_prefill is False
+                and not apc_requested
+            )
+        ):
+            for reason in disable_chunked_prefill_reasons:
+                logger.info(reason)
+            self.scheduler_config.enable_chunked_prefill = False
+            self.scheduler_config.long_prefill_token_threshold = 0
+
+            if self.cache_config is not None:
+                self.cache_config.enable_prefix_caching = False
+
+        if (
+            self.kv_events_config is not None
+            and self.kv_events_config.enable_kv_cache_events
+            and not self.cache_config.enable_prefix_caching
+        ):
+            logger.warning(
+                "KV cache events are on, but prefix caching is not enabled."
+                "Use --enable-prefix-caching to enable."
+            )
+        if (
+            self.kv_events_config is not None
+            and self.kv_events_config.publisher != "null"
+            and not self.kv_events_config.enable_kv_cache_events
+        ):
+            logger.warning(
+                "KV cache events are disabled,"
+                "but the scheduler is configured to publish them."
+                "Modify KVEventsConfig.enable_kv_cache_events"
+                "to True to enable."
+            )
+        current_platform.check_and_update_config(self)
+
+        # If DCP, ensure the block size is right.
+        if self.parallel_config.decode_context_parallel_size > 1:
+            assert (
+                self.parallel_config.dcp_kv_cache_interleave_size
+                <= self.cache_config.block_size
+                and self.cache_config.block_size
+                % self.parallel_config.dcp_kv_cache_interleave_size
+                == 0
+            ), (
+                f"Block_size({self.cache_config.block_size}) should be greater "
+                "than or equal to and divisible by dcp_kv_cache_interleave_size "
+                f"({self.parallel_config.dcp_kv_cache_interleave_size})."
+            )
+
+        assert (
+            self.parallel_config.dcp_kv_cache_interleave_size == 1
+            or self.speculative_config is None
+        ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now."
+
+        # Do this after all the updates to compilation_config.mode
+        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            self.compilation_config.set_splitting_ops_for_v1()
+
+        if self.compilation_config.pass_config.enable_sequence_parallelism:
+            # With pipeline parallelism or dynamo partitioning,
+            # native rms norm tracing errors due to incorrect residual shape.
+            # Use custom rms norm to unblock. In the future,
+            # the pass will operate on higher-level IR to avoid the issue.
+            # TODO: https://github.com/vllm-project/vllm/issues/27894
+            is_fullgraph = (
+                self.compilation_config.use_inductor_graph_partition
+                or len(self.compilation_config.splitting_ops) == 0
+            )
+            if self.parallel_config.pipeline_parallel_size > 1 or not is_fullgraph:
+                if "-rms_norm" not in self.compilation_config.custom_ops:
+                    self.compilation_config.custom_ops.append("+rms_norm")
+                else:
+                    regime = (
+                        "Dynamo partition"
+                        if not is_fullgraph
+                        else "pipeline parallelism"
+                    )
+                    logger.warning_once(
+                        "Sequence parallelism not supported with"
+                        "native rms_norm when using %s, "
+                        "this will likely lead to an error.",
+                        regime,
+                    )
+
+        # final check of cudagraph mode after all possible updates
+        if current_platform.is_cuda_alike():
+            if (
+                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+                and self.model_config is not None
+                and not self.model_config.disable_cascade_attn
+                and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()  # noqa: E501
+            ):
+                logger.warning_once(
+                    "No piecewise cudagraph for executing cascade attention."
+                    " Will fall back to eager execution if a batch runs "
+                    "into cascade attentions"
+                )
+
+            if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
+                assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, (
+                    "Compilation mode should be CompilationMode.VLLM_COMPILE "
+                    "when cudagraph_mode piecewise cudagraphs is used, "
+                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
+                )
+
+        if self.parallel_config.enable_dbo:
+            a2a_backend = self.parallel_config.all2all_backend
+            assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], (
+                "Microbatching currently only supports the deepep_low_latency and "
+                f"deepep_high_throughput all2all backend. {a2a_backend} is not "
+                "supported. To fix use --all2all-backend=deepep_low_latency or "
+                "--all2all-backend=deepep_high_throughput and install the DeepEP"
+                " kernels."
+            )
+
+            if not self.model_config.disable_cascade_attn:
+                self.model_config.disable_cascade_attn = True
+                logger.warning_once("Disabling cascade attention when DBO is enabled.")
+
+        if not self.instance_id:
+            self.instance_id = random_uuid()[:5]
+
+        if not self.scheduler_config.disable_hybrid_kv_cache_manager:
+            # logger should only print warning message for hybrid models. As we
+            # can't know whether the model is hybrid or not now, so we don't log
+            # warning message here and will log it later.
+            if not current_platform.support_hybrid_kv_cache():
+                # Hybrid KV cache manager is not supported on non-GPU platforms.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            if self.kv_transfer_config is not None:
+                # NOTE(Kuntai): turn HMA off for connector for now.
+                # TODO(Kuntai): have a more elegent solution to check and
+                # turn off HMA for connector that does not support HMA.
+                logger.warning(
+                    "Turning off hybrid kv cache manager because "
+                    "`--kv-transfer-config` is set. This will reduce the "
+                    "performance of vLLM on LLMs with sliding window attention "
+                    "or Mamba attention. If you are a developer of kv connector"
+                    ", please consider supporting hybrid kv cache manager for "
+                    "your connector by making sure your connector is a subclass"
+                    " of `SupportsHMA` defined in kv_connector/v1/base.py."
+                )
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            if self.kv_events_config is not None:
+                # Hybrid KV cache manager is not compatible with KV events.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            if (
+                self.model_config is not None
+                and self.model_config.attention_chunk_size is not None
+            ):
+                if (
+                    self.speculative_config is not None
+                    and self.speculative_config.use_eagle()
+                ):
+                    # Hybrid KV cache manager is not yet supported with chunked
+                    # local attention + eagle.
+                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
+                    logger.warning(
+                        "There is a latency regression when using chunked local"
+                        " attention with the hybrid KV cache manager. Disabling"
+                        " it, by default. To enable it, set the environment "
+                        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
+                    )
+                    # Hybrid KV cache manager is not yet supported with chunked
+                    # local attention.
+                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
+
+        if self.compilation_config.debug_dump_path:
+            self.compilation_config.debug_dump_path = (
+                self.compilation_config.debug_dump_path.absolute().expanduser()
+            )
+        if envs.VLLM_DEBUG_DUMP_PATH is not None:
+            env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser()
+            if self.compilation_config.debug_dump_path:
+                logger.warning(
+                    "Config-specified debug dump path is overridden"
+                    " by VLLM_DEBUG_DUMP_PATH to %s",
+                    env_path,
+                )
+            self.compilation_config.debug_dump_path = env_path
+
+        def has_blocked_weights():
+            if self.quant_config is not None:
+                if hasattr(self.quant_config, "weight_block_size"):
+                    return self.quant_config.weight_block_size is not None
+                elif hasattr(self.quant_config, "has_blocked_weights"):
+                    return self.quant_config.has_blocked_weights()
+            return False
+
+        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
+        # On H100 the CUDA kernel is faster than
+        # native implementation
+        # https://github.com/vllm-project/vllm/issues/25094
+        if has_blocked_weights():
+            custom_ops = self.compilation_config.custom_ops
+            if "-quant_fp8" not in custom_ops:
+                custom_ops.append("+quant_fp8")
+
+        # Handle the KV connector configs
+        self._post_init_kv_transfer_config()
+
+    def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list:
+        # remove the sizes that not multiple of tp_size when
+        # enable sequence parallelism
+        removed_sizes = [
+            size
+            for size in possible_sizes
+            if size % self.parallel_config.tensor_parallel_size != 0
+        ]
+        if removed_sizes:
+            logger.warning(
+                "Batch sizes %s are removed because they are not "
+                "multiple of tp_size %d when "
+                "sequence parallelism is enabled",
+                removed_sizes,
+                self.parallel_config.tensor_parallel_size,
+            )
+
+        return [
+            size
+            for size in possible_sizes
+            if size % self.parallel_config.tensor_parallel_size == 0
+        ]
+
+    def _set_cudagraph_sizes(self):
+        """
+        vLLM defines the default candidate list of batch sizes for CUDA graph
+        capture as:
+
+        ```python
+        max_graph_size = min(max_num_seqs * 2, 512)
+        # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
+        # up to max_graph_size
+        cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
+            range(256, max_graph_size + 1, 16))
+
+        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
+        will be the final sizes to capture cudagraph (in ascending order).
+
+        These sizes are used to capture and reuse CUDA graphs for
+        performance-critical paths (e.g., decoding). Capturing enables
+        significantly faster kernel dispatch by avoiding Python overhead. The
+        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
+        most GPUs), which controls the total allowed number of tokens in a
+        batch. Since each sequence may have a variable number of tokens, the
+        maximum usable batch size will depend on actual sequence lengths.
+
+        Example:
+            With `max_num_batched_tokens = 8192`, and typical sequences
+            averaging ~32 tokens, most practical batch sizes fall below 256.
+            However, the system will still allow capture sizes up to 512 if
+            shape and memory permit.
+
+        Note:
+            If users explicitly specify cudagraph capture sizes in the
+            compilation config, those will override this default logic.
+            At runtime:
+
+            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
+            padded CUDA graph will be used.
+            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
+            not be used.
+        """
+
+        if (
+            self.model_config is not None
+            and not self.model_config.enforce_eager
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            # determine the initial max_cudagraph_capture_size
+            max_cudagraph_capture_size = (
+                self.compilation_config.max_cudagraph_capture_size
+            )
+            if max_cudagraph_capture_size is None:
+                max_cudagraph_capture_size = min(
+                    self.scheduler_config.max_num_seqs * 2, 512
+                )
+            max_num_tokens = self.scheduler_config.max_num_batched_tokens
+            max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
+
+            assert max_cudagraph_capture_size >= 1, (
+                "Maximum cudagraph size should be greater than or equal to 1 "
+                "when using cuda graph."
+            )
+
+            # determine the cudagraph_capture_sizes
+            if self.compilation_config.cudagraph_capture_sizes is not None:
+                assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
+                    "cudagraph_capture_sizes should contain at least one element "
+                    "when using cuda graph."
+                )
+                # de-duplicate the sizes provided by the config
+                dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
+                cudagraph_capture_sizes = [
+                    i for i in dedup_sizes if i <= max_num_tokens
+                ]
+                # sort to make sure the sizes are in ascending order
+                cudagraph_capture_sizes.sort()
+            else:
+                cudagraph_capture_sizes = [
+                    i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
+                ]
+                if max_cudagraph_capture_size >= 8:
+                    # Step size 8 for small batch sizes, up to 256(not included)
+                    cudagraph_capture_sizes += list(
+                        range(8, min(max_cudagraph_capture_size + 1, 256), 8)
+                    )
+                if max_cudagraph_capture_size >= 256:
+                    # Step size 16 for larger batch sizes
+                    cudagraph_capture_sizes += list(
+                        range(256, max_cudagraph_capture_size + 1, 16)
+                    )
+
+            if (
+                self.parallel_config.tensor_parallel_size > 1
+                and self.compilation_config.pass_config.enable_sequence_parallelism
+            ):
+                cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
+                    cudagraph_capture_sizes
+                )
+
+            # user-specific compilation_config.max_cudagraph_capture_size get
+            # truncated to valid_max_size when they are inconsistent.
+            valid_max_size = (
+                cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
+            )
+            if (
+                self.compilation_config.max_cudagraph_capture_size is not None
+                and self.compilation_config.max_cudagraph_capture_size != valid_max_size
+            ):
+                # raise error only when both two flags are user-specified
+                # and they are inconsistent with each other
+                if self.compilation_config.cudagraph_capture_sizes is not None:
+                    raise ValueError(
+                        "customized max_cudagraph_capture_size"
+                        f"(={self.compilation_config.max_cudagraph_capture_size}) "
+                        "should be consistent with the max value of "
+                        f"cudagraph_capture_sizes(={valid_max_size})"
+                    )
+
+                logger.warning(
+                    "Truncating max_cudagraph_capture_size to %d",
+                    valid_max_size,
+                )
+            # always set the final max_cudagraph_capture_size
+            self.compilation_config.max_cudagraph_capture_size = valid_max_size
+
+            if self.compilation_config.cudagraph_capture_sizes is not None and len(
+                cudagraph_capture_sizes
+            ) < len(self.compilation_config.cudagraph_capture_sizes):
+                # If users have specified capture sizes, we only need to
+                # compare the lens before and after modification since the modified
+                # list is only the subset of the original list.
+                logger.warning(
+                    (
+                        "cudagraph_capture_sizes specified in compilation_config"
+                        " %s is overridden by config %s"
+                    ),
+                    self.compilation_config.cudagraph_capture_sizes,
+                    cudagraph_capture_sizes,
+                )
+            # always write back the final sizes
+            self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes
+
+        else:
+            # no cudagraph in use
+            self.compilation_config.max_cudagraph_capture_size = 0
+            self.compilation_config.cudagraph_capture_sizes = []
+
+        # complete the remaining process.
+        self.compilation_config.post_init_cudagraph_sizes()
+
+    def recalculate_max_model_len(self, max_model_len: int):
+        # Can only be called in try_verify_and_update_config
+        model_config = self.model_config
+        max_model_len = model_config.get_and_verify_max_len(max_model_len)
+        self.model_config.max_model_len = max_model_len
+
+    def try_verify_and_update_config(self):
+        if self.model_config is None:
+            return
+
+        # Avoid running try_verify_and_update_config multiple times
+        if getattr(self.model_config, "config_updated", False):
+            return
+        self.model_config.config_updated = True
+
+        architecture = self.model_config.architecture
+        if architecture is None:
+            return
+
+        from vllm.model_executor.models.config import (
+            MODELS_CONFIG_MAP,
+            HybridAttentionMambaModelConfig,
+        )
+
+        cls = MODELS_CONFIG_MAP.get(architecture, None)
+        if cls is not None:
+            cls.verify_and_update_config(self)
+
+        if self.model_config.is_hybrid:
+            HybridAttentionMambaModelConfig.verify_and_update_config(self)
+
+        if self.model_config.convert_type == "classify":
+            # Maybe convert ForCausalLM into ForSequenceClassification model.
+            from vllm.model_executor.models.adapters import SequenceClassificationConfig
+
+            SequenceClassificationConfig.verify_and_update_config(self)
+
+        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
+            self.model_config.model_weights
+        ):
+            if self.load_config.load_format == "auto":
+                logger.info(
+                    "Detected Run:ai model config. "
+                    "Overriding `load_format` to 'runai_streamer'"
+                )
+                self.load_config.load_format = "runai_streamer"
+            elif self.load_config.load_format not in (
+                "runai_streamer",
+                "runai_streamer_sharded",
+            ):
+                raise ValueError(
+                    f"To load a model from S3, 'load_format' "
+                    f"must be 'runai_streamer' or 'runai_streamer_sharded', "
+                    f"but got '{self.load_config.load_format}'. "
+                    f"Model: {self.model_config.model}"
+                )
+
+    def compile_debug_dump_path(self) -> Path | None:
+        """Returns a rank-aware path for dumping
+        torch.compile debug information.
+        """
+        if self.compilation_config.debug_dump_path is None:
+            return None
+        tp_rank = self.parallel_config.rank
+        dp_rank = self.parallel_config.data_parallel_rank
+        data_parallel_size = self.parallel_config.data_parallel_size
+        append_path = (
+            f"rank_{tp_rank}"
+            if data_parallel_size == 1
+            else f"rank_{tp_rank}_dp_{dp_rank}"
+        )
+        path = self.compilation_config.debug_dump_path / append_path
+        return path
+
+    def __str__(self):
+        return (
+            f"model={self.model_config.model!r}, "
+            f"speculative_config={self.speculative_config!r}, "
+            f"tokenizer={self.model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
+            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
+            f"revision={self.model_config.revision}, "
+            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
+            f"trust_remote_code={self.model_config.trust_remote_code}, "
+            f"dtype={self.model_config.dtype}, "
+            f"max_seq_len={self.model_config.max_model_len}, "
+            f"download_dir={self.load_config.download_dir!r}, "
+            f"load_format={self.load_config.load_format}, "
+            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
+            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
+            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
+            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
+            f"quantization={self.model_config.quantization}, "
+            f"enforce_eager={self.model_config.enforce_eager}, "
+            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
+            f"device_config={self.device_config.device}, "
+            f"structured_outputs_config={self.structured_outputs_config!r}, "
+            f"observability_config={self.observability_config!r}, "
+            f"seed={self.model_config.seed}, "
+            f"served_model_name={self.model_config.served_model_name}, "
+            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
+            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, "  # noqa
+            f"pooler_config={self.model_config.pooler_config!r}, "
+            f"compilation_config={self.compilation_config!r}"
+        )
+
+    @model_validator(mode="after")
+    def validate_mamba_block_size(self) -> "VllmConfig":
+        if self.model_config is None:
+            return self
+        mamba_block_size_is_set = (
+            self.cache_config.mamba_block_size is not None
+            and self.cache_config.mamba_block_size != self.model_config.max_model_len
+        )
+        if mamba_block_size_is_set and not self.cache_config.enable_prefix_caching:
+            raise ValueError(
+                "--mamba-block-size can only be set with --enable-prefix-caching"
+            )
+        return self
+
+
+_current_vllm_config: VllmConfig | None = None
+_current_prefix: str | None = None
+
+
+@contextmanager
+def set_current_vllm_config(
+    vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
+):
+    """
+    Temporarily set the current vLLM config.
+    Used during model initialization.
+    We save the current vLLM config in a global variable,
+    so that all modules can access it, e.g. custom ops
+    can access the vLLM config to determine how to dispatch.
+    """
+    global _current_vllm_config, _current_prefix
+    old_vllm_config = _current_vllm_config
+    old_prefix = _current_prefix
+    from vllm.compilation.counter import compilation_counter
+
+    num_models_seen = compilation_counter.num_models_seen
+    try:
+        _current_vllm_config = vllm_config
+        _current_prefix = prefix
+        yield
+    except Exception:
+        raise
+    else:
+        if check_compile:
+            vllm_config.compilation_config.custom_op_log_check()
+
+        if (
+            check_compile
+            and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
+            and compilation_counter.num_models_seen == num_models_seen
+        ):
+            # If the model supports compilation,
+            # compilation_counter.num_models_seen should be increased
+            # by at least 1.
+            # If it is not increased, it means the model does not support
+            # compilation (does not have @support_torch_compile decorator).
+            logger.warning(
+                "`torch.compile` is turned on, but the model %s"
+                " does not support it. Please open an issue on GitHub"
+                " if you want it to be supported.",
+                vllm_config.model_config.model,
+            )
+    finally:
+        _current_vllm_config = old_vllm_config
+        _current_prefix = old_prefix
+        # Clear the compilation config cache when context changes
+        get_cached_compilation_config.cache_clear()
+
+
+@lru_cache(maxsize=1)
+def get_cached_compilation_config():
+    """Cache config to avoid repeated calls to get_current_vllm_config()"""
+    return get_current_vllm_config().compilation_config
+
+
+def get_current_vllm_config() -> VllmConfig:
+    if _current_vllm_config is None:
+        # in ci, usually when we test custom ops/modules directly,
+        # we don't set the vllm config. In that case, we set a default
+        # config.
+        logger.warning("Current vLLM config is not set.")
+        return VllmConfig()
+    return _current_vllm_config
+
+
+T = TypeVar("T")
+
+
+def get_layers_from_vllm_config(
+    vllm_config: VllmConfig,
+    layer_type: type[T],
+    layer_names: list[str] | None = None,
+) -> dict[str, T]:
+    """
+    Get layers from the vLLM config.
+
+    Args:
+        vllm_config: The vLLM config.
+        layer_type: The type of the layer to get.
+        layer_names: The names of the layers to get. If None, return all layers.
+    """
+
+    if layer_names is None:
+        layer_names = list(vllm_config.compilation_config.static_forward_context.keys())
+
+    forward_context = vllm_config.compilation_config.static_forward_context
+
+    return {
+        layer_name: forward_context[layer_name]
+        for layer_name in layer_names
+        if isinstance(forward_context[layer_name], layer_type)
+    }
diff --git a/vllm/connections.py b/vllm/connections.py
index 1f341719ae30a95f8f5619af3c344f96f35937f8..31b0d5e9c702f75b87a969a67e21fda24ec6965c 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
-from typing import Optional
 from urllib.parse import urlparse
 
 import aiohttp
@@ -20,8 +19,8 @@ class HTTPConnection:
 
         self.reuse_client = reuse_client
 
-        self._sync_client: Optional[requests.Session] = None
-        self._async_client: Optional[aiohttp.ClientSession] = None
+        self._sync_client: requests.Session | None = None
+        self._async_client: aiohttp.ClientSession | None = None
 
     def get_sync_client(self) -> requests.Session:
         if self._sync_client is None or not self.reuse_client:
@@ -41,8 +40,9 @@ class HTTPConnection:
         parsed_url = urlparse(url)
 
         if parsed_url.scheme not in ("http", "https"):
-            raise ValueError("Invalid HTTP URL: A valid HTTP URL "
-                             "must have scheme 'http' or 'https'.")
+            raise ValueError(
+                "Invalid HTTP URL: A valid HTTP URL must have scheme 'http' or 'https'."
+            )
 
     def _headers(self, **extras: str) -> MutableMapping[str, str]:
         return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras}
@@ -52,8 +52,8 @@ class HTTPConnection:
         url: str,
         *,
         stream: bool = False,
-        timeout: Optional[float] = None,
-        extra_headers: Optional[Mapping[str, str]] = None,
+        timeout: float | None = None,
+        extra_headers: Mapping[str, str] | None = None,
         allow_redirects: bool = True,
     ):
         self._validate_http_url(url)
@@ -61,18 +61,20 @@ class HTTPConnection:
         client = self.get_sync_client()
         extra_headers = extra_headers or {}
 
-        return client.get(url,
-                          headers=self._headers(**extra_headers),
-                          stream=stream,
-                          timeout=timeout,
-                          allow_redirects=allow_redirects)
+        return client.get(
+            url,
+            headers=self._headers(**extra_headers),
+            stream=stream,
+            timeout=timeout,
+            allow_redirects=allow_redirects,
+        )
 
     async def get_async_response(
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
-        extra_headers: Optional[Mapping[str, str]] = None,
+        timeout: float | None = None,
+        extra_headers: Mapping[str, str] | None = None,
         allow_redirects: bool = True,
     ):
         self._validate_http_url(url)
@@ -80,19 +82,19 @@ class HTTPConnection:
         client = await self.get_async_client()
         extra_headers = extra_headers or {}
 
-        return client.get(url,
-                          headers=self._headers(**extra_headers),
-                          timeout=timeout,
-                          allow_redirects=allow_redirects)
-
-    def get_bytes(self,
-                  url: str,
-                  *,
-                  timeout: Optional[float] = None,
-                  allow_redirects: bool = True) -> bytes:
-        with self.get_response(url,
-                               timeout=timeout,
-                               allow_redirects=allow_redirects) as r:
+        return client.get(
+            url,
+            headers=self._headers(**extra_headers),
+            timeout=timeout,
+            allow_redirects=allow_redirects,
+        )
+
+    def get_bytes(
+        self, url: str, *, timeout: float | None = None, allow_redirects: bool = True
+    ) -> bytes:
+        with self.get_response(
+            url, timeout=timeout, allow_redirects=allow_redirects
+        ) as r:
             r.raise_for_status()
 
             return r.content
@@ -101,16 +103,17 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         allow_redirects: bool = True,
     ) -> bytes:
         async with await self.get_async_response(
-                url, timeout=timeout, allow_redirects=allow_redirects) as r:
+            url, timeout=timeout, allow_redirects=allow_redirects
+        ) as r:
             r.raise_for_status()
 
             return await r.read()
 
-    def get_text(self, url: str, *, timeout: Optional[float] = None) -> str:
+    def get_text(self, url: str, *, timeout: float | None = None) -> str:
         with self.get_response(url, timeout=timeout) as r:
             r.raise_for_status()
 
@@ -120,14 +123,14 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
     ) -> str:
         async with await self.get_async_response(url, timeout=timeout) as r:
             r.raise_for_status()
 
             return await r.text()
 
-    def get_json(self, url: str, *, timeout: Optional[float] = None) -> str:
+    def get_json(self, url: str, *, timeout: float | None = None) -> str:
         with self.get_response(url, timeout=timeout) as r:
             r.raise_for_status()
 
@@ -137,7 +140,7 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
     ) -> str:
         async with await self.get_async_response(url, timeout=timeout) as r:
             r.raise_for_status()
@@ -149,7 +152,7 @@ class HTTPConnection:
         url: str,
         save_path: Path,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         chunk_size: int = 128,
     ) -> Path:
         with self.get_response(url, timeout=timeout) as r:
@@ -166,7 +169,7 @@ class HTTPConnection:
         url: str,
         save_path: Path,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         chunk_size: int = 128,
     ) -> Path:
         async with await self.get_async_response(url, timeout=timeout) as r:
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index af7ca6be1fca8d63d438de81729d370f97da49f2..e9695698bb493784022ebe13b6e6688bcb1b8b55 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -11,24 +11,25 @@
 import dataclasses
 import gc
 import os
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
 
-def find_loaded_library(lib_name) -> Optional[str]:
+def find_loaded_library(lib_name) -> str | None:
     """
     According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
     the file `/proc/self/maps` contains the memory maps of the process, which includes the
     shared libraries loaded by the process. We can use this file to find the path of the
     a loaded library.
-    """ # noqa
+    """  # noqa
     found_line = None
     with open("/proc/self/maps") as f:
         for line in f:
@@ -43,22 +44,26 @@ def find_loaded_library(lib_name) -> Optional[str]:
     start = found_line.index("/")
     path = found_line[start:].strip()
     filename = path.split("/")[-1]
-    assert filename.rpartition(".so")[0].startswith(lib_name), \
+    assert filename.rpartition(".so")[0].startswith(lib_name), (
         f"Unexpected filename: {filename} for library {lib_name}"
+    )
     return path
 
 
 cumem_available = False
 try:
-    from vllm.cumem_allocator import (init_module, python_create_and_map,
-                                      python_unmap_and_release)
-    from vllm.distributed.device_communicators.cuda_wrapper import (
-        CudaRTLibrary)
+    from vllm.cumem_allocator import (
+        init_module,
+        python_create_and_map,
+        python_unmap_and_release,
+    )
+    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+
     lib_name = find_loaded_library("cumem_allocator")
     libcudart = CudaRTLibrary()
     cumem_available = True
 except ModuleNotFoundError:
-    # rocm platform does not support cumem allocator
+    # only cuda and rocm platforms support cumem allocator
     init_module = None
     python_create_and_map = None
     python_unmap_and_release = None
@@ -74,7 +79,7 @@ HandleType = tuple[int, int, int, int]
 class AllocationData:
     handle: HandleType
     tag: str
-    cpu_backup_tensor: Optional[torch.Tensor] = None
+    cpu_backup_tensor: torch.Tensor | None = None
 
 
 def create_and_map(allocation_handle: HandleType) -> None:
@@ -86,20 +91,19 @@ def unmap_and_release(allocation_handle: HandleType) -> None:
 
 
 def get_pluggable_allocator(
-    python_malloc_fn: Callable[[int],
-                               int], python_free_func: Callable[[int, int],
-                                                                None]
+    python_malloc_fn: Callable[[int], int], python_free_func: Callable[[int, int], None]
 ) -> torch.cuda.memory.CUDAPluggableAllocator:
     init_module(python_malloc_fn, python_free_func)
     new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
-        lib_name, 'my_malloc', 'my_free')
+        lib_name, "my_malloc", "my_free"
+    )
     return new_alloc
 
 
 @contextmanager
 def use_memory_pool_with_allocator(
-        python_malloc_fn: Callable[[int], int],
-        python_free_func: Callable[[int, int], None]) -> None:
+    python_malloc_fn: Callable[[int], int], python_free_func: Callable[[int, int], None]
+) -> None:
     new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
     mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator)
     with torch.cuda.memory.use_mem_pool(mem_pool):
@@ -130,6 +134,7 @@ class CuMemAllocator:
     the global variable will be overwritten and the free callback will
     not work as expected.
     """
+
     instance: "CuMemAllocator" = None
     default_tag: str = "default"
 
@@ -147,10 +152,11 @@ class CuMemAllocator:
 
     def __init__(self):
         conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
-        assert "expandable_segments:True" not in conf, \
-            ("Expandable segments are not compatible with memory pool. "
+        assert "expandable_segments:True" not in conf, (
+            "Expandable segments are not compatible with memory pool. "
             "Please track https://github.com/pytorch/pytorch/issues/147851 "
-            "for the latest updates.")
+            "for the latest updates."
+        )
 
         self.pointer_to_data: dict[int, AllocationData] = {}
         self.current_tag: str = CuMemAllocator.default_tag
@@ -167,10 +173,14 @@ class CuMemAllocator:
         when memory is allocated in the memory pool."""
         py_d_mem = allocation_handle[2]
         self.pointer_to_data[py_d_mem] = AllocationData(
-            allocation_handle, self.current_tag)
+            allocation_handle, self.current_tag
+        )
         logger.debug(
             "Allocated %s bytes for %s with address %s from cumem allocator",
-            allocation_handle[1], self.current_tag, py_d_mem)
+            allocation_handle[1],
+            self.current_tag,
+            py_d_mem,
+        )
         return
 
     def _python_free_callback(self, ptr: int) -> HandleType:
@@ -182,13 +192,13 @@ class CuMemAllocator:
             data.cpu_backup_tensor = None
         logger.debug(
             "Freed %s bytes for %s with address %s from cumem allocator",
-            data.handle[1], data.tag, ptr)
+            data.handle[1],
+            data.tag,
+            ptr,
+        )
         return data.handle
 
-    def sleep(
-            self,
-            offload_tags: Optional[Union[tuple[str, ...],
-                                         str]] = None) -> None:
+    def sleep(self, offload_tags: tuple[str, ...] | str | None = None) -> None:
         """
         Put the allocator in sleep mode.
         All data in the memory allocation with the specified tag will be
@@ -200,9 +210,9 @@ class CuMemAllocator:
         if offload_tags is None:
             # by default, allocated tensors are offloaded
             # when the allocator sleeps
-            offload_tags = (CuMemAllocator.default_tag, )
+            offload_tags = (CuMemAllocator.default_tag,)
         elif isinstance(offload_tags, str):
-            offload_tags = (offload_tags, )
+            offload_tags = (offload_tags,)
 
         assert isinstance(offload_tags, tuple)
 
@@ -218,8 +228,9 @@ class CuMemAllocator:
                 cpu_backup_tensor = torch.empty(
                     size_in_bytes,
                     dtype=torch.uint8,
-                    device='cpu',
-                    pin_memory=is_pin_memory_available())
+                    device="cpu",
+                    pin_memory=is_pin_memory_available(),
+                )
                 cpu_ptr = cpu_backup_tensor.data_ptr()
                 libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes)
                 data.cpu_backup_tensor = cpu_backup_tensor
@@ -228,13 +239,16 @@ class CuMemAllocator:
         logger.info(
             "CuMemAllocator: sleep freed %.2f GiB memory in total, of which "
             "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded "
-            "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
-            (total_bytes - backup_bytes) / 1024**3)
+            "directly.",
+            total_bytes / 1024**3,
+            backup_bytes / 1024**3,
+            (total_bytes - backup_bytes) / 1024**3,
+        )
 
         gc.collect()
         torch.cuda.empty_cache()
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         """
         Wake up the allocator from sleep mode.
         All data that is previously offloaded will be loaded back to GPU
@@ -251,14 +265,15 @@ class CuMemAllocator:
                 if data.cpu_backup_tensor is not None:
                     cpu_backup_tensor = data.cpu_backup_tensor
                     if cpu_backup_tensor is not None:
-                        size_in_bytes = cpu_backup_tensor.numel(
-                        ) * cpu_backup_tensor.element_size()
+                        size_in_bytes = (
+                            cpu_backup_tensor.numel() * cpu_backup_tensor.element_size()
+                        )
                         cpu_ptr = cpu_backup_tensor.data_ptr()
                         libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
                         data.cpu_backup_tensor = None
 
     @contextmanager
-    def use_memory_pool(self, tag: Optional[str] = None):
+    def use_memory_pool(self, tag: str | None = None):
         """
         A context manager to use the memory pool.
         All memory allocation created inside the context will be allocated
@@ -274,8 +289,9 @@ class CuMemAllocator:
 
         old_tag = self.current_tag
         self.current_tag = tag
-        with use_memory_pool_with_allocator(self.python_malloc_callback,
-                                            self.python_free_callback) as data:
+        with use_memory_pool_with_allocator(
+            self.python_malloc_callback, self.python_free_callback
+        ) as data:
             # start to hit another PyTorch bug in PyTorch 2.6,
             # possibly because of gc-related issue w.r.t. the allocator and
             # the memory pool.
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index 0a5a95176f7c354825f6f5fac6d5f74892819829..5ad99e4e1592d321fdc67e177664dd92981c4dcd 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import torch.distributed
@@ -14,28 +14,30 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
     return get_tp_group().all_reduce(input_)
 
 
-def tensor_model_parallel_all_gather(input_: torch.Tensor,
-                                     dim: int = -1) -> torch.Tensor:
+def tensor_model_parallel_all_gather(
+    input_: torch.Tensor, dim: int = -1
+) -> torch.Tensor:
     """All-gather the input tensor across model parallel group."""
     return get_tp_group().all_gather(input_, dim)
 
 
-def tensor_model_parallel_reduce_scatter(input_: torch.Tensor,
-                                         dim: int = -1) -> torch.Tensor:
+def tensor_model_parallel_reduce_scatter(
+    input_: torch.Tensor, dim: int = -1
+) -> torch.Tensor:
     """Reduce-Scatter the input tensor across model parallel group."""
     return get_tp_group().reduce_scatter(input_, dim)
 
 
-def tensor_model_parallel_gather(input_: torch.Tensor,
-                                 dst: int = 0,
-                                 dim: int = -1) -> Optional[torch.Tensor]:
+def tensor_model_parallel_gather(
+    input_: torch.Tensor, dst: int = 0, dim: int = -1
+) -> torch.Tensor | None:
     """Gather the input tensor across model parallel group."""
     return get_tp_group().gather(input_, dst, dim)
 
 
-def broadcast_tensor_dict(tensor_dict: Optional[dict[Any, Union[torch.Tensor,
-                                                                Any]]] = None,
-                          src: int = 0):
+def broadcast_tensor_dict(
+    tensor_dict: dict[Any, torch.Tensor | Any] | None = None, src: int = 0
+):
     if not torch.distributed.is_initialized():
         return tensor_dict
     return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index bb3fd657facde03d127eb46a58fa7fb7fd11ca84..c40dde26b741f4878a0604a6a41fda73c0161d5a 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -9,15 +9,17 @@ import vllm.envs as envs
 from vllm.distributed import get_dp_group, get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
-from vllm.utils import has_deep_ep, has_pplx
 from vllm.utils.flashinfer import has_flashinfer_all2all
+from vllm.utils.import_utils import has_deep_ep, has_pplx
 
 from .base_device_communicator import All2AllManagerBase, Cache
 
 if has_flashinfer_all2all():
-    from flashinfer.comm import Mapping
-    from flashinfer.comm.mnnvl import MnnvlConfig
-    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
+    from flashinfer.comm import Mapping  # type: ignore[import-not-found]
+    from flashinfer.comm.mnnvl import MnnvlConfig  # type: ignore[import-not-found]
+    from flashinfer.comm.trtllm_alltoall import (
+        MnnvlMoe,  # type: ignore[import-not-found]
+    )
 
 logger = init_logger(__name__)
 
@@ -33,17 +35,19 @@ class NaiveAll2AllManager(All2AllManagerBase):
     def __init__(self, cpu_group):
         super().__init__(cpu_group)
 
-    def naive_multicast(self, x: torch.Tensor,
-                        cu_tokens_across_sp_cpu: torch.Tensor,
-                        is_sequence_parallel: bool) -> torch.Tensor:
-        assert (len(x.shape) == 2)
-        buffer = torch.empty((cu_tokens_across_sp_cpu[-1], x.size(1)),
-                             device=x.device,
-                             dtype=x.dtype)
+    def naive_multicast(
+        self,
+        x: torch.Tensor,
+        cu_tokens_across_sp_cpu: torch.Tensor,
+        is_sequence_parallel: bool,
+    ) -> torch.Tensor:
+        assert len(x.shape) == 2
+        buffer = torch.empty(
+            (cu_tokens_across_sp_cpu[-1], x.size(1)), device=x.device, dtype=x.dtype
+        )
 
         rank = self.rank if is_sequence_parallel else self.dp_rank
-        world_size = (self.world_size
-                      if is_sequence_parallel else self.dp_world_size)
+        world_size = self.world_size if is_sequence_parallel else self.dp_world_size
 
         start = 0 if rank == 0 else cu_tokens_across_sp_cpu[rank - 1]
         end = cu_tokens_across_sp_cpu[rank]
@@ -59,27 +63,28 @@ class NaiveAll2AllManager(All2AllManagerBase):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         sp_size = self.tp_group.world_size if is_sequence_parallel else 1
         dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
         cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)
 
-        hidden_states = self.naive_multicast(hidden_states,
-                                             cu_tokens_across_sp_cpu,
-                                             is_sequence_parallel)
-        router_logits = self.naive_multicast(router_logits,
-                                             cu_tokens_across_sp_cpu,
-                                             is_sequence_parallel)
+        hidden_states = self.naive_multicast(
+            hidden_states, cu_tokens_across_sp_cpu, is_sequence_parallel
+        )
+        router_logits = self.naive_multicast(
+            router_logits, cu_tokens_across_sp_cpu, is_sequence_parallel
+        )
         return hidden_states, router_logits
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
-
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         ep_rank = self.rank if is_sequence_parallel else self.dp_rank
 
         dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
         sp_size = self.tp_group.world_size if is_sequence_parallel else 1
         cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)
 
@@ -107,13 +112,15 @@ class AgRsAll2AllManager(All2AllManagerBase):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Gather hidden_states and router_logits from all dp ranks.
         """
-        sizes = get_forward_context(
-        ).dp_metadata.get_chunk_sizes_across_dp_rank()
+        dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
+        sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+        assert sizes is not None
 
         dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
         assert sizes[dist_group.rank_in_group] == hidden_states.shape[0]
@@ -124,19 +131,19 @@ class AgRsAll2AllManager(All2AllManagerBase):
         )
         return hidden_states, router_logits
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         """
         Reduce-scatter hidden_states across all dp ranks.
         """
-        sizes = get_forward_context(
-        ).dp_metadata.get_chunk_sizes_across_dp_rank()
+        dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
+        sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+        assert sizes is not None
 
         dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
-        hidden_states = dist_group.reduce_scatterv(hidden_states,
-                                                   dim=0,
-                                                   sizes=sizes)
+        hidden_states = dist_group.reduce_scatterv(hidden_states, dim=0, sizes=sizes)
         return hidden_states
 
     def destroy(self):
@@ -149,46 +156,60 @@ class PPLXAll2AllManager(All2AllManagerBase):
     """
 
     def __init__(self, cpu_group):
-        assert has_pplx(
-        ), "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
+        assert has_pplx(), (
+            "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
+            " to install pplx_kernels."
+        )
         super().__init__(cpu_group)
 
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly
-            from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
-                                              nvshmem_get_unique_id,
-                                              nvshmem_init)
+            from pplx_kernels.nvshmem import (  # type: ignore[import-not-found]
+                nvshmem_alloc_empty_unique_id,
+                nvshmem_get_unique_id,
+                nvshmem_init,
+            )
+
             logger.debug(
-                "Initialize NVSHMEM for pplx_kernels: "
-                "rank=%d, world size=%d", self.rank, self.world_size)
-            uid = nvshmem_get_unique_id(
-            ) if self.rank == 0 else nvshmem_alloc_empty_unique_id()
-            dist.broadcast(uid,
-                           src=dist.get_process_group_ranks(self.cpu_group)[0],
-                           group=self.cpu_group)
+                "Initialize NVSHMEM for pplx_kernels: rank=%d, world size=%d",
+                self.rank,
+                self.world_size,
+            )
+            uid = (
+                nvshmem_get_unique_id()
+                if self.rank == 0
+                else nvshmem_alloc_empty_unique_id()
+            )
+            dist.broadcast(
+                uid,
+                src=dist.get_process_group_ranks(self.cpu_group)[0],
+                group=self.cpu_group,
+            )
             logger.debug("PPLX NVSHMEM UID = %s", uid)
             nvshmem_init(uid, self.rank, self.world_size)
 
         self.handle_cache = Cache()
 
     def get_handle(self, kwargs):
-        import pplx_kernels as pplx
+        import pplx_kernels as pplx  # type: ignore[import-not-found]
+
         return self.handle_cache.get_or_create(
-            kwargs, pplx.AllToAll.internode
-            if self.internode else pplx.AllToAll.intranode)
+            kwargs,
+            pplx.AllToAll.internode if self.internode else pplx.AllToAll.intranode,
+        )
 
     def dispatch(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         raise NotImplementedError
 
     def destroy(self):
@@ -197,7 +218,10 @@ class PPLXAll2AllManager(All2AllManagerBase):
                 handle.destroy()
 
         if self.internode:
-            from pplx_kernels.nvshmem import nvshmem_finalize
+            from pplx_kernels.nvshmem import (
+                nvshmem_finalize,  # type: ignore[import-not-found]
+            )
+
             logger.debug("PPLX NVSHMEM finalize")
             nvshmem_finalize()
 
@@ -208,8 +232,10 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
     """
 
     def __init__(self, cpu_group):
-        assert has_deep_ep(
-        ), "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."  # noqa
+        assert has_deep_ep(), (
+            "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
+            " to install DeepEP kernels."
+        )  # noqa
         super().__init__(cpu_group)
         self.handle_cache = Cache()
 
@@ -224,13 +250,13 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         raise NotImplementedError
 
     def destroy(self):
@@ -251,7 +277,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
         num_rdma_bytes = None
         num_qps_per_rank = None
 
-        if self.internode:
+        if self.internode and not envs.VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE:
             num_rdma_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
             num_qps_per_rank = self.num_sms // 2
         else:
@@ -260,27 +286,31 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
 
         assert num_rdma_bytes is not None
         assert num_qps_per_rank is not None
-        return dict(group=self.cpu_group,
-                    num_nvl_bytes=num_nvl_bytes,
-                    num_rdma_bytes=num_rdma_bytes,
-                    low_latency_mode=False,
-                    num_qps_per_rank=num_qps_per_rank)
+        return dict(
+            group=self.cpu_group,
+            num_nvl_bytes=num_nvl_bytes,
+            num_rdma_bytes=num_rdma_bytes,
+            low_latency_mode=False,
+            num_qps_per_rank=num_qps_per_rank,
+        )
 
     def get_handle(self, kwargs):
-
         assert len(kwargs) == 0, (
             "DeepEPHTAll2AllManager expects no arguments. All the required "
-            "args are computed in the Manager itself.")
+            "args are computed in the Manager itself."
+        )
+
+        import deep_ep  # type: ignore[import-not-found]
 
-        import deep_ep
         buffer_kwargs = self._make_all2all_kwargs()
         logger.debug("DeepEP all2all args %s", buffer_kwargs)
         handle: deep_ep.Buffer = self.handle_cache.get_or_create(
-            buffer_kwargs, deep_ep.Buffer)
+            buffer_kwargs, deep_ep.Buffer
+        )
         return handle
 
     def set_num_sms(self, num_sms: int):
-        import deep_ep
+        import deep_ep  # type: ignore[import-not-found]
 
         # Right now the buffers are sized for only what the kernels were
         # created with. So we can only reduce the number of SMS used
@@ -314,7 +344,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
         num_global_experts: Number of experts in the model.
         num_local_experts: Number of experts in an EP rank.
         """
-        import deep_ep
+        import deep_ep  # type: ignore[import-not-found]
 
         # Defaults for internode and intranode are taken from DeepEP tests.
         num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
@@ -323,29 +353,36 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
             num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
             hidden=token_hidden_size,
             num_ranks=num_ep_ranks,
-            num_experts=num_global_experts)
+            num_experts=num_global_experts,
+        )
 
         assert num_rdma_bytes is not None
-        return dict(group=self.cpu_group,
-                    num_nvl_bytes=num_nvl_bytes,
-                    num_rdma_bytes=num_rdma_bytes,
-                    low_latency_mode=True,
-                    num_qps_per_rank=num_qps_per_rank)
+        return dict(
+            group=self.cpu_group,
+            num_nvl_bytes=num_nvl_bytes,
+            num_rdma_bytes=num_rdma_bytes,
+            low_latency_mode=True,
+            num_qps_per_rank=num_qps_per_rank,
+            allow_nvlink_for_low_latency_mode=True,
+            allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
+        )
 
     def get_handle(self, kwargs):
         """
         The kwargs for DeepEPLLAll2AllManager is dictated by
         _make_all2all_kwargs.
         """
-        import deep_ep
+        import deep_ep  # type: ignore[import-not-found]
+
         buffer_kwargs = self._make_all2all_kwargs(**kwargs)
         logger.debug("DeepEP all2all args %s", buffer_kwargs)
         handle: deep_ep.Buffer = self.handle_cache.get_or_create(
-            buffer_kwargs, deep_ep.Buffer)
+            buffer_kwargs, deep_ep.Buffer
+        )
         return handle
 
     # DeepEP LL uses RDMA so no SMs are used for communication
-    def max_sms_used(self) -> Optional[int]:
+    def max_sms_used(self) -> int | None:
         return 0
 
 
@@ -354,13 +391,21 @@ class FlashInferAllToAllManager(All2AllManagerBase):
     All2All communication based on flashinfer kernels.
     """
 
+    # This type lint could be removed after all of the work in
+    # https://github.com/vllm-project/vllm/issues/26533 done.
+    rank: int
+    world_size: int
+
     def __init__(self, cpu_group):
-        assert has_flashinfer_all2all(
-        ), "flashinfer all2all module not found. Please install/check flashinfer"  # noqa
+        assert has_flashinfer_all2all(), (
+            "flashinfer all2all module not found. Please install/check flashinfer"
+        )  # noqa
         super().__init__(cpu_group)
         logger.debug(
-            "Initialize for flashinfer All2All "
-            "rank=%d, world size=%d", self.rank, self.world_size)
+            "Initialize for flashinfer All2All rank=%d, world size=%d",
+            self.rank,
+            self.world_size,
+        )
         self.initialized = False
         self.alltoall_info = None
 
@@ -375,8 +420,7 @@ class FlashInferAllToAllManager(All2AllManagerBase):
             return
 
         self.cleanup()
-        logger.debug("making map: "
-                     "rank=%d, world size=%d", rank, world_size)
+        logger.debug("making map: rank=%d, world size=%d", rank, world_size)
         self.mapping = Mapping(
             world_size,
             rank,
@@ -385,25 +429,28 @@ class FlashInferAllToAllManager(All2AllManagerBase):
         )
 
         from vllm.distributed.device_communicators.mnnvl_compat import (
-            CustomCommunicator)
+            CustomCommunicator,
+        )
+
         dp_config = MnnvlConfig(
             comm_backend=CustomCommunicator(get_dp_group().cpu_group),
             fabric_page_size=1 << 29,  # 512MB
-            allocation_granularity=0  # Auto-detect
+            allocation_granularity=0,  # Auto-detect
         )
 
-        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(
-            self.mapping, dp_config)
+        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(self.mapping, dp_config)
         self.prepare_workspace_tensor = MnnvlMoe.get_moe_prepare_workspace(
-            self.mapping, dp_config)
+            self.mapping, dp_config
+        )
 
         self.world_size = world_size
         self.rank = rank
         self.gpus_per_node = gpus_per_node
         self.initialized = True
 
-        logger.info("FlashInfer All2All initialized for rank %s, size %s",
-                    rank, world_size)
+        logger.info(
+            "FlashInfer All2All initialized for rank %s, size %s", rank, world_size
+        )
 
     def ensure_alltoall_workspace_initialized(self):
         """Ensure workspace is initialized"""
@@ -426,8 +473,11 @@ class FlashInferAllToAllManager(All2AllManagerBase):
 
     def cleanup(self):
         """Clean up workspace"""
-        if self.initialized and self.workspace_tensor is not None \
-            and self.prepare_workspace_tensor is not None:
+        if (
+            self.initialized
+            and self.workspace_tensor is not None
+            and self.prepare_workspace_tensor is not None
+        ):
             try:
                 del self.workspace_tensor
                 del self.prepare_workspace_tensor
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index 87e0f8e1a96771d346c36718f9c8dd55d76a753a..ff2d7436b27098733117d554ea5b0f1ced4ae936 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -10,7 +10,7 @@ import sys
 import tempfile
 from collections.abc import Sequence
 from itertools import product
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -19,8 +19,11 @@ import torch.multiprocessing as mp
 import vllm.envs as envs
 from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.logger import init_logger
-from vllm.utils import (cuda_device_count_stateless,
-                        update_environment_variables)
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 logger = init_logger(__name__)
 
@@ -39,7 +42,7 @@ CUSTOM_ALL_REDUCE_MAX_SIZES = {
         4: 2 * MiB,  # 2 MB
         6: 1 * MiB,  # 1 MB
         8: 1 * MiB,  # 1 MB
-    }
+    },
 }
 
 SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
@@ -54,7 +57,7 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
         4: 32 * MiB,  # 32 MB
         6: 128 * MiB,  # 128 MB
         8: 128 * MiB,  # 128 MB
-    }
+    },
 }
 
 NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = {
@@ -63,14 +66,18 @@ NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = {
         4: 2 * MiB,  # 2 MB
         8: 1 * MiB,  # 1 MB
     },
-    "always_use_above_world_size": 8  # Always use symm mem for world_size > 8
+    "always_use_above_world_size": 8,  # Always use symm mem for world_size > 8
 }
 
 
-def should_nccl_symm_mem_allreduce(world_size: int,
-                                   input_tensor: torch.Tensor) -> bool:
+def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor) -> bool:
     from vllm.distributed.device_communicators.pynccl_allocator import (
-        is_symmetric_memory_enabled)
+        is_symmetric_memory_enabled,
+    )
+
+    if vllm_is_batch_invariant():
+        return False
+
     if not is_symmetric_memory_enabled():
         return False
     if world_size < NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["min_world_size"]:
@@ -78,18 +85,18 @@ def should_nccl_symm_mem_allreduce(world_size: int,
     threshold = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["thresholds"].get(world_size)
     if threshold is not None and input_tensor.nbytes >= threshold:
         return True
-    return (world_size
-            > NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"])
+    return world_size > NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"]
 
 
-def producer(batch_src: Sequence[int],
-             producer_queue,
-             consumer_queue,
-             result_queue,
-             cuda_visible_devices: Optional[str] = None):
+def producer(
+    batch_src: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: str | None = None,
+):
     if cuda_visible_devices is not None:
-        update_environment_variables(
-            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
 
     lib = CudaRTLibrary()
     for i in batch_src:
@@ -115,14 +122,15 @@ def producer(batch_src: Sequence[int],
         lib.cudaDeviceReset()
 
 
-def consumer(batch_tgt: Sequence[int],
-             producer_queue,
-             consumer_queue,
-             result_queue,
-             cuda_visible_devices: Optional[str] = None):
+def consumer(
+    batch_tgt: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: str | None = None,
+):
     if cuda_visible_devices is not None:
-        update_environment_variables(
-            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
 
     lib = CudaRTLibrary()
     for j in batch_tgt:
@@ -198,12 +206,26 @@ def can_actually_p2p(
     producer_queue = smp.Queue()
     consumer_queue = smp.Queue()
     result_queue = smp.Queue()
-    p_src = smp.Process(target=producer,
-                        args=(batch_src, producer_queue, consumer_queue,
-                              result_queue, cuda_visible_devices))
-    p_tgt = smp.Process(target=consumer,
-                        args=(batch_tgt, producer_queue, consumer_queue,
-                              result_queue, cuda_visible_devices))
+    p_src = smp.Process(
+        target=producer,
+        args=(
+            batch_src,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_tgt = smp.Process(
+        target=consumer,
+        args=(
+            batch_tgt,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
     p_src.start()
     p_tgt.start()
     p_src.join()
@@ -216,7 +238,10 @@ def can_actually_p2p(
         if a != b:
             logger.warning(
                 "Two processes do not agree on the P2P access"
-                " status on %d -> %d, treat as disabled.", src, tgt)
+                " status on %d -> %d, treat as disabled.",
+                src,
+                tgt,
+            )
             result.append(False)
         else:
             result.append(a)
@@ -235,7 +260,7 @@ def can_actually_p2p(
 #  e.g. used by different vllm engines. The device id in the cache file is a
 #  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
 #  of visible devices in the vllm engine.
-_gpu_p2p_access_cache: Optional[dict[str, bool]] = None
+_gpu_p2p_access_cache: dict[str, bool] | None = None
 
 
 def gpu_p2p_access_check(src: int, tgt: int) -> bool:
@@ -255,12 +280,14 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
 
     path = os.path.join(
-        envs.VLLM_CACHE_ROOT,
-        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
+        envs.VLLM_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from vllm.distributed.parallel_state import get_world_group
-    if ((not is_distributed or get_world_group().local_rank == 0)
-            and (not os.path.exists(path))):
+
+    if (not is_distributed or get_world_group().local_rank == 0) and (
+        not os.path.exists(path)
+    ):
         # only the local master process (with local_rank == 0) can
         #  enter this block to calculate the cache
         logger.info("generating GPU P2P access cache in %s", path)
@@ -279,11 +306,10 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         # we don't use the output of the subprocess directly,
         # because the subprocess might produce logging output
         with tempfile.NamedTemporaryFile() as output_file:
-            input_bytes = pickle.dumps(
-                (batch_src, batch_tgt, output_file.name))
-            returned = subprocess.run([sys.executable, __file__],
-                                      input=input_bytes,
-                                      capture_output=True)
+            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
+            returned = subprocess.run(
+                [sys.executable, __file__], input=input_bytes, capture_output=True
+            )
             # check if the subprocess is successful
             try:
                 returned.check_returncode()
@@ -292,7 +318,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
                 raise RuntimeError(
                     f"Error happened when batch testing "
                     f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
-                    f"{returned.stderr.decode()}") from e
+                    f"{returned.stderr.decode()}"
+                ) from e
             with open(output_file.name, "rb") as f:
                 result = pickle.load(f)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index a42081fb0c15861c7912ee07aa05069c10e39f8e..3a849da70e4cb34b52076e9306c62b125212766e 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
-from typing import Optional, Union
 from weakref import WeakValueDictionary
 
 import torch
@@ -10,7 +9,6 @@ from torch.distributed import ProcessGroup
 
 
 class Cache:
-
     def __init__(self):
         self._cache: WeakValueDictionary = WeakValueDictionary()
         self._lock = threading.RLock()  # Reentrant lock for thread safety
@@ -35,9 +33,11 @@ class All2AllManagerBase:
         self.cpu_group = cpu_group
 
         # compute some common properties
-        from vllm.distributed.parallel_state import (get_dp_group,
-                                                     get_tp_group,
-                                                     in_the_same_node_as)
+        from vllm.distributed.parallel_state import (
+            get_dp_group,
+            get_tp_group,
+            in_the_same_node_as,
+        )
 
         # all2all lives in ep group, which is merged from dp and tp group
         self.dp_group = get_dp_group()
@@ -63,21 +63,21 @@ class All2AllManagerBase:
         # and reuse it for the same config.
         raise NotImplementedError
 
-    def dispatch(self,
-                 hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor,
-                 is_sequence_parallel: bool = False):
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ):
         raise NotImplementedError
 
     def set_num_sms(self, num_sms: int):
         pass
 
-    def max_sms_used(self) -> Optional[int]:
+    def max_sms_used(self) -> int | None:
         return None  # None means it could use the whole GPU
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False):
+    def combine(self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False):
         raise NotImplementedError
 
     def destroy(self):
@@ -92,11 +92,13 @@ class DeviceCommunicatorBase:
     communication backend), the `device_group` will also be given.
     """
 
-    def __init__(self,
-                 cpu_group: ProcessGroup,
-                 device: Optional[torch.device] = None,
-                 device_group: Optional[ProcessGroup] = None,
-                 unique_name: str = ""):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
         self.device = device or torch.device("cpu")
         self.cpu_group = cpu_group
         self.device_group = device_group
@@ -106,21 +108,24 @@ class DeviceCommunicatorBase:
         self.ranks = dist.get_process_group_ranks(cpu_group)
         self.global_rank = dist.get_rank()
         self.global_world_size = dist.get_world_size()
-        self.rank_in_group = dist.get_group_rank(self.cpu_group,
-                                                 self.global_rank)
+        self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
 
         use_ep = False
+        all2all_backend = None
         from vllm.config import get_current_vllm_config
+
         config = get_current_vllm_config()
         if config is not None:
             # as long as we use data parallel (coupled data parallel
             # where all data parallel ranks execute forward together),
             # we initialize the all2all manager used in expert parallel.
             use_ep = config.parallel_config.data_parallel_size > 1
+            all2all_backend = config.parallel_config.all2all_backend
 
         self.is_ep_communicator = "ep" in unique_name
         self.use_all2all = self.is_ep_communicator and use_ep
-        self.all2all_manager: Optional[All2AllManagerBase] = None
+        self.all2all_backend = all2all_backend
+        self.all2all_manager: All2AllManagerBase | None = None
 
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         dist.all_reduce(input_, group=self.device_group)
@@ -134,41 +139,39 @@ class DeviceCommunicatorBase:
         # NOTE: we have to use concat-style all-gather here,
         # stack-style all-gather has compatibility issues with
         # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
-        output_size = (input_size[0] * self.world_size, ) + input_size[1:]
+        output_size = (input_size[0] * self.world_size,) + input_size[1:]
         # Allocate output tensor.
-        output_tensor = torch.empty(output_size,
-                                    dtype=input_.dtype,
-                                    device=input_.device)
+        output_tensor = torch.empty(
+            output_size, dtype=input_.dtype, device=input_.device
+        )
         # All-gather.
-        dist.all_gather_into_tensor(output_tensor,
-                                    input_,
-                                    group=self.device_group)
+        dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group)
         # Reshape
-        output_tensor = output_tensor.reshape((self.world_size, ) + input_size)
+        output_tensor = output_tensor.reshape((self.world_size,) + input_size)
         output_tensor = output_tensor.movedim(0, dim)
-        output_tensor = output_tensor.reshape(input_size[:dim] +
-                                              (self.world_size *
-                                               input_size[dim], ) +
-                                              input_size[dim + 1:])
+        output_tensor = output_tensor.reshape(
+            input_size[:dim]
+            + (self.world_size * input_size[dim],)
+            + input_size[dim + 1 :]
+        )
         return output_tensor
 
     def all_gatherv(
         self,
-        input_: Union[torch.Tensor, list[torch.Tensor]],
+        input_: torch.Tensor | list[torch.Tensor],
         dim: int = 0,
-        sizes: Optional[list[int]] = None
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        sizes: list[int] | None = None,
+    ) -> torch.Tensor | list[torch.Tensor]:
         raise NotImplementedError
 
-    def reduce_scatter(self,
-                       input_: torch.Tensor,
-                       dim: int = -1) -> torch.Tensor:
+    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
             return input_
         assert -input_.dim() <= dim < input_.dim(), (
-            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
 
         if dim < 0:
             # Convert negative dim to positive.
@@ -180,30 +183,28 @@ class DeviceCommunicatorBase:
 
         assert input_tensor.shape[0] % world_size == 0
         chunk_size = input_tensor.shape[0] // world_size
-        output_shape = (chunk_size, ) + input_tensor.shape[1:]
+        output_shape = (chunk_size,) + input_tensor.shape[1:]
 
-        output_tensor = torch.empty(output_shape,
-                                    dtype=input_tensor.dtype,
-                                    device=input_tensor.device)
+        output_tensor = torch.empty(
+            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
+        )
 
         # Perform reduce-scatter operation
-        torch.distributed.reduce_scatter_tensor(output_tensor,
-                                                input_tensor,
-                                                group=self.device_group)
+        torch.distributed.reduce_scatter_tensor(
+            output_tensor, input_tensor, group=self.device_group
+        )
 
         # Reshape before returning
         return output_tensor.movedim(0, dim).contiguous()
 
-    def reduce_scatterv(self,
-                        input_: torch.Tensor,
-                        dim: int = -1,
-                        sizes: Optional[list[int]] = None) -> torch.Tensor:
+    def reduce_scatterv(
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
+    ) -> torch.Tensor:
         raise NotImplementedError
 
-    def gather(self,
-               input_: torch.Tensor,
-               dst: int = 0,
-               dim: int = -1) -> Optional[torch.Tensor]:
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
         """
         NOTE: We assume that the input tensor is on the same device across
         all the ranks.
@@ -211,7 +212,8 @@ class DeviceCommunicatorBase:
         """
         world_size = self.world_size
         assert -input_.dim() <= dim < input_.dim(), (
-            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
@@ -222,27 +224,25 @@ class DeviceCommunicatorBase:
         else:
             gather_list = None
         # Gather.
-        torch.distributed.gather(input_,
-                                 gather_list,
-                                 dst=self.ranks[dst],
-                                 group=self.device_group)
+        torch.distributed.gather(
+            input_, gather_list, dst=self.ranks[dst], group=self.device_group
+        )
         if self.rank_in_group == dst:
             output_tensor = torch.cat(gather_list, dim=dim)
         else:
             output_tensor = None
         return output_tensor
 
-    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
         """Sends a tensor to the destination rank in a blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if dst is None:
             dst = (self.rank_in_group + 1) % self.world_size
         torch.distributed.send(tensor, self.ranks[dst], self.device_group)
 
-    def recv(self,
-             size: torch.Size,
-             dtype: torch.dtype,
-             src: Optional[int] = None) -> torch.Tensor:
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
+    ) -> torch.Tensor:
         """Receives a tensor from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
         if src is None:
@@ -255,8 +255,7 @@ class DeviceCommunicatorBase:
     def destroy(self):
         pass
 
-    def prepare_communication_buffer_for_model(self,
-                                               model: torch.nn.Module) -> None:
+    def prepare_communication_buffer_for_model(self, model: torch.nn.Module) -> None:
         """
         Prepare the communication buffer for the model.
         """
@@ -264,20 +263,23 @@ class DeviceCommunicatorBase:
             return
 
         moe_modules = [
-            module for module in model.modules()
+            module
+            for module in model.modules()
             # TODO(bnell): Should use isinstance but can't.  Maybe search for
-            # presence of quant_method.init_prepare_finalize?
-            if (module.__class__.__name__ == "FusedMoE"
-                or module.__class__.__name__ == "SharedFusedMoE")
+            # presence of quant_method.maybe_init_modular_kernel?
+            if (
+                module.__class__.__name__ == "FusedMoE"
+                or module.__class__.__name__ == "SharedFusedMoE"
+            )
         ]
         for module in moe_modules:
-            module.quant_method.init_prepare_finalize(module)
+            module.maybe_init_modular_kernel()
 
     def dispatch(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Dispatch the hidden states and router logits to the appropriate device.
@@ -285,9 +287,9 @@ class DeviceCommunicatorBase:
         """
         return hidden_states, router_logits
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         """
         Combine the hidden states and router logits from the appropriate device.
         This is a no-op in the base class.
diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py
index bda567f8489c515183db9ae0a93c487c8547c517..fdfb74d7a752c040880d3fa4197de3e67f3ee998 100644
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch.distributed import ProcessGroup
@@ -15,30 +15,30 @@ from .base_device_communicator import DeviceCommunicatorBase
 
 
 class CpuCommunicator(DeviceCommunicatorBase):
-
-    def __init__(self,
-                 cpu_group: ProcessGroup,
-                 device: Optional[torch.device] = None,
-                 device_group: Optional[ProcessGroup] = None,
-                 unique_name: str = ""):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
         super().__init__(cpu_group, device, device_group, unique_name)
         self.dist_module = torch.distributed
 
-        if (current_platform.get_cpu_architecture()
-                == CpuArchEnum.X86) and hasattr(
-                    torch.ops._C,
-                    "init_shm_manager") and (unique_name.startswith("tp")
-                                             or unique_name.startswith("pp")):
+        if (
+            (current_platform.get_cpu_architecture() == CpuArchEnum.X86)
+            and hasattr(torch.ops._C, "init_shm_manager")
+            and (unique_name.startswith("tp") or unique_name.startswith("pp"))
+        ):
             self.dist_module = _CPUSHMDistributed(self)
 
     def all_reduce(self, input_):
         self.dist_module.all_reduce(input_, group=self.device_group)
         return input_
 
-    def gather(self,
-               input_: torch.Tensor,
-               dst: int = 0,
-               dim: int = -1) -> Optional[torch.Tensor]:
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
         """
         NOTE: We assume that the input tensor is on the same device across
         all the ranks.
@@ -46,7 +46,8 @@ class CpuCommunicator(DeviceCommunicatorBase):
         """
         world_size = self.world_size
         assert -input_.dim() <= dim < input_.dim(), (
-            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
@@ -58,10 +59,9 @@ class CpuCommunicator(DeviceCommunicatorBase):
             gather_list = None
 
         # Gather.
-        self.dist_module.gather(input_,
-                                gather_list,
-                                dst=self.ranks[dst],
-                                group=self.device_group)
+        self.dist_module.gather(
+            input_, gather_list, dst=self.ranks[dst], group=self.device_group
+        )
 
         if self.rank_in_group == dst:
             output_tensor = torch.cat(gather_list, dim=dim)
@@ -77,28 +77,29 @@ class CpuCommunicator(DeviceCommunicatorBase):
         # NOTE: we have to use concat-style all-gather here,
         # stack-style all-gather has compatibility issues with
         # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
-        output_size = (input_size[0] * self.world_size, ) + input_size[1:]
+        output_size = (input_size[0] * self.world_size,) + input_size[1:]
         # Allocate output tensor.
-        output_tensor = torch.empty(output_size,
-                                    dtype=input_.dtype,
-                                    device=input_.device)
+        output_tensor = torch.empty(
+            output_size, dtype=input_.dtype, device=input_.device
+        )
         # All-gather.
-        self.dist_module.all_gather_into_tensor(output_tensor,
-                                                input_,
-                                                group=self.device_group)
+        self.dist_module.all_gather_into_tensor(
+            output_tensor, input_, group=self.device_group
+        )
 
         # Reshape
-        output_tensor = output_tensor.reshape((self.world_size, ) + input_size)
+        output_tensor = output_tensor.reshape((self.world_size,) + input_size)
         output_tensor = output_tensor.movedim(0, dim)
-        output_tensor = output_tensor.reshape(input_size[:dim] +
-                                              (self.world_size *
-                                               input_size[dim], ) +
-                                              input_size[dim + 1:])
+        output_tensor = output_tensor.reshape(
+            input_size[:dim]
+            + (self.world_size * input_size[dim],)
+            + input_size[dim + 1 :]
+        )
         return output_tensor
 
     def send_tensor_dict(
         self,
-        tensor_dict: dict[str, Union[torch.Tensor, Any]],
+        tensor_dict: dict[str, torch.Tensor | Any],
         dst: int,
     ) -> None:
         return self.dist_module.send_tensor_dict(tensor_dict, dst)
@@ -106,12 +107,11 @@ class CpuCommunicator(DeviceCommunicatorBase):
     def recv_tensor_dict(
         self,
         src: int,
-    ) -> dict[str, Union[torch.Tensor, Any]]:
+    ) -> dict[str, torch.Tensor | Any]:
         return self.dist_module.recv_tensor_dict(src)
 
 
 class _CPUSHMDistributed:
-
     def __init__(self, communicator: CpuCommunicator):
         instance_identifier = os.environ["VLLM_DIST_IDENT"]
         unique_name = communicator.unique_name
@@ -139,29 +139,37 @@ class _CPUSHMDistributed:
 
         return handle
 
-    def all_reduce(self,
-                   input: torch.Tensor,
-                   group: Optional[ProcessGroup] = None) -> None:
+    def all_reduce(
+        self, input: torch.Tensor, group: ProcessGroup | None = None
+    ) -> None:
         torch.ops._C.shm_allreduce(self.handle, input)
 
-    def gather(self,
-               input: torch.Tensor,
-               gather_list: Optional[list[torch.Tensor]],
-               dst: int = -1,
-               group: Optional[ProcessGroup] = None) -> None:
+    def gather(
+        self,
+        input: torch.Tensor,
+        gather_list: list[torch.Tensor] | None,
+        dst: int = -1,
+        group: ProcessGroup | None = None,
+    ) -> None:
         # Note: different from the torch gather, here we use local dst rank.
-        torch.ops._C.shm_gather(self.handle, input, gather_list,
-                                torch.distributed.get_group_rank(group, dst))
+        torch.ops._C.shm_gather(
+            self.handle,
+            input,
+            gather_list,
+            torch.distributed.get_group_rank(group, dst),
+        )
 
-    def all_gather_into_tensor(self,
-                               output: torch.Tensor,
-                               input: torch.Tensor,
-                               group: Optional[ProcessGroup] = None) -> None:
+    def all_gather_into_tensor(
+        self,
+        output: torch.Tensor,
+        input: torch.Tensor,
+        group: ProcessGroup | None = None,
+    ) -> None:
         torch.ops._C.shm_all_gather(self.handle, input, output)
 
     def send_tensor_dict(
         self,
-        tensor_dict: dict[str, Union[torch.Tensor, Any]],
+        tensor_dict: dict[str, torch.Tensor | Any],
         dst: int,
     ) -> None:
         key_list = list(tensor_dict.keys())
@@ -169,11 +177,11 @@ class _CPUSHMDistributed:
         size_list = []
         for v in value_list:
             if not isinstance(v, torch.Tensor):
-                raise RuntimeError(
-                    "CpuCommunicator only supports sending tensors.")
+                raise RuntimeError("CpuCommunicator only supports sending tensors.")
             size_list.append(v.size())
-        key_size_tensor = torch.frombuffer(pickle.dumps([key_list, size_list]),
-                                           dtype=torch.uint8)
+        key_size_tensor = torch.frombuffer(
+            pickle.dumps([key_list, size_list]), dtype=torch.uint8
+        )
         value_list.append(key_size_tensor)
 
         torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst)
@@ -183,7 +191,7 @@ class _CPUSHMDistributed:
     def recv_tensor_dict(
         self,
         src: int,
-    ) -> dict[str, Union[torch.Tensor, Any]]:
+    ) -> dict[str, torch.Tensor | Any]:
         tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src)
 
         value_list: list[torch.Tensor] = tensor_list[:-1]
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 30d1bf10138bbde213264a0fc36b228ae2a2e5fa..2e878eef908aceacdb1e15172d238f007a3c0f27 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -1,18 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 from torch.distributed import ProcessGroup
 
 import vllm.envs as envs
 from vllm.distributed.device_communicators.all_reduce_utils import (
-    should_nccl_symm_mem_allreduce)
-from vllm.distributed.device_communicators.pynccl import (
-    register_nccl_symmetric_ops)
+    should_nccl_symm_mem_allreduce,
+)
+from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
 from vllm.distributed.device_communicators.pynccl_allocator import (
-    is_symmetric_memory_enabled)
+    is_symmetric_memory_enabled,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
@@ -22,20 +22,21 @@ logger = init_logger(__name__)
 
 
 class CudaCommunicator(DeviceCommunicatorBase):
-
-    def __init__(self,
-                 cpu_group: ProcessGroup,
-                 device: Optional[torch.device] = None,
-                 device_group: Optional[ProcessGroup] = None,
-                 unique_name: str = ""):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
         super().__init__(cpu_group, device, device_group, unique_name)
         if "tp" not in unique_name:
             # custom allreduce or torch symm mem can be used only by tp
             use_custom_allreduce = False
             use_torch_symm_mem = False
         else:
-            from vllm.distributed.parallel_state import (
-                _ENABLE_CUSTOM_ALL_REDUCE)
+            from vllm.distributed.parallel_state import _ENABLE_CUSTOM_ALL_REDUCE
+
             use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
             use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM
 
@@ -44,15 +45,15 @@ class CudaCommunicator(DeviceCommunicatorBase):
 
         # lazy import to avoid documentation build error
         from vllm.distributed.device_communicators.custom_all_reduce import (
-            CustomAllreduce)
-        from vllm.distributed.device_communicators.pynccl import (
-            PyNcclCommunicator)
+            CustomAllreduce,
+        )
+        from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
         from vllm.distributed.device_communicators.quick_all_reduce import (
-            QuickAllReduce)
-        from vllm.distributed.device_communicators.symm_mem import (
-            SymmMemCommunicator)
+            QuickAllReduce,
+        )
+        from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 
-        self.pynccl_comm: Optional[PyNcclCommunicator] = None
+        self.pynccl_comm: PyNcclCommunicator | None = None
         if self.world_size > 1:
             self.pynccl_comm = PyNcclCommunicator(
                 group=self.cpu_group,
@@ -61,9 +62,9 @@ class CudaCommunicator(DeviceCommunicatorBase):
             if is_symmetric_memory_enabled():
                 register_nccl_symmetric_ops(self.pynccl_comm)
 
-        self.ca_comm: Optional[CustomAllreduce] = None
-        self.qr_comm: Optional[QuickAllReduce] = None
-        self.symm_mem_comm: Optional[SymmMemCommunicator] = None
+        self.ca_comm: CustomAllreduce | None = None
+        self.qr_comm: QuickAllReduce | None = None
+        self.symm_mem_comm: SymmMemCommunicator | None = None
         if use_torch_symm_mem and current_platform.is_cuda():
             self.symm_mem_comm = SymmMemCommunicator(
                 group=self.cpu_group,
@@ -75,8 +76,9 @@ class CudaCommunicator(DeviceCommunicatorBase):
             self.ca_comm = CustomAllreduce(
                 group=self.cpu_group,
                 device=self.device,
-                symm_mem_enabled=(self.symm_mem_comm is not None
-                                  and not self.symm_mem_comm.disabled),
+                symm_mem_enabled=(
+                    self.symm_mem_comm is not None and not self.symm_mem_comm.disabled
+                ),
             )
 
             if current_platform.is_rocm():
@@ -85,68 +87,81 @@ class CudaCommunicator(DeviceCommunicatorBase):
                 # Based on quickreduce (https://github.com/mk1-project/quickreduce).
                 # If it's a rocm, 'use_custom_allreduce==True' means it must
                 # currently be an MI300 series.
-                self.qr_comm = QuickAllReduce(group=self.cpu_group,
-                                              device=self.device)
+                self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device)
 
         if self.use_all2all:
-            all2all_backend = envs.VLLM_ALL2ALL_BACKEND
-            if all2all_backend == "naive":
+            if self.all2all_backend == "naive":
                 from .all2all import NaiveAll2AllManager
+
                 self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
-                logger.info("Using naive all2all manager.")
-            elif all2all_backend == "allgather_reducescatter":
+            elif self.all2all_backend == "allgather_reducescatter":
                 from .all2all import AgRsAll2AllManager
+
                 self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
-                logger.info("Using AllGather-ReduceScatter all2all manager.")
-            elif all2all_backend == "pplx":
+            elif self.all2all_backend == "pplx":
                 from .all2all import PPLXAll2AllManager
+
                 self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
-                logger.info("Using PPLX all2all manager.")
-            elif all2all_backend == "deepep_high_throughput":
+            elif self.all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager
+
                 self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
-                logger.info("Using DeepEP High-Throughput all2all manager.")
-            elif all2all_backend == "deepep_low_latency":
+            elif self.all2all_backend == "deepep_low_latency":
                 from .all2all import DeepEPLLAll2AllManager
+
                 self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
-                logger.info("Using DeepEP Low-Latency all2all manager.")
-            elif all2all_backend == "flashinfer_all2allv":
+            elif self.all2all_backend == "flashinfer_all2allv":
                 from .all2all import FlashInferAllToAllManager
-                self.all2all_manager = FlashInferAllToAllManager(
-                    self.cpu_group)
-                logger.info("Using Flashinfer all2allv manager.")
+
+                self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
             else:
-                raise ValueError(f"Unknown all2all backend: {all2all_backend}")
+                raise ValueError(f"Unknown all2all backend: {self.all2all_backend}")
+
+            logger.info_once(
+                "Using %s all2all manager.",
+                self.all2all_manager.__class__.__name__,
+                scope="global",
+            )
 
     def all_reduce(self, input_):
         # since currently we perform copy input -> symm_input -> out-of-place AR
         # return symm_output, we don't need to check if input is symmetric
-        if self.pynccl_comm is not None and \
-            should_nccl_symm_mem_allreduce(self.pynccl_comm.world_size,input_):
+        if self.pynccl_comm is not None and should_nccl_symm_mem_allreduce(
+            self.pynccl_comm.world_size, input_
+        ):
             out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_)
             if out is not None:
                 return out
         # always try quick reduce first, then custom allreduce,
         # and then pynccl. (quick reduce just for ROCM MI3*)
         qr_comm = self.qr_comm
-        if qr_comm is not None and not qr_comm.disabled and \
-            qr_comm.should_quick_allreduce(input_):
+        if (
+            qr_comm is not None
+            and not qr_comm.disabled
+            and qr_comm.should_quick_allreduce(input_)
+        ):
             out = qr_comm.quick_all_reduce(input_)
             assert out is not None
             return out
         ca_comm = self.ca_comm
-        if ca_comm is not None and not ca_comm.disabled and \
-            ca_comm.should_custom_ar(input_):
+        if (
+            ca_comm is not None
+            and not ca_comm.disabled
+            and ca_comm.should_custom_ar(input_)
+        ):
             out = ca_comm.custom_all_reduce(input_)
             assert out is not None
             return out
         symm_mem_comm = self.symm_mem_comm
-        if symm_mem_comm is not None and \
-            symm_mem_comm.should_use_symm_mem(input_):
+        if symm_mem_comm is not None and symm_mem_comm.should_use_symm_mem(input_):
             out = symm_mem_comm.all_reduce(input_)
             assert out is not None
             return out
         pynccl_comm = self.pynccl_comm
+        if pynccl_comm is None or pynccl_comm.disabled:
+            out = input_.clone()
+            torch.distributed.all_reduce(out, group=self.device_group)
+            return out
         assert pynccl_comm is not None
         out = pynccl_comm.all_reduce(input_)
         if out is None:
@@ -172,21 +187,20 @@ class CudaCommunicator(DeviceCommunicatorBase):
 
         assert input_tensor.shape[0] % world_size == 0
         chunk_size = input_tensor.shape[0] // world_size
-        output_shape = (chunk_size, ) + input_tensor.shape[1:]
+        output_shape = (chunk_size,) + input_tensor.shape[1:]
 
-        output = torch.empty(output_shape,
-                             dtype=input_tensor.dtype,
-                             device=input_tensor.device)
+        output = torch.empty(
+            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
+        )
 
         pynccl_comm.reduce_scatter(output, input_tensor)
 
         # Reshape before returning
         return output.movedim(0, dim).contiguous()
 
-    def reduce_scatterv(self,
-                        input_: torch.Tensor,
-                        dim: int = -1,
-                        sizes: Optional[list[int]] = None):
+    def reduce_scatterv(
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
+    ):
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
         assert pynccl_comm is not None
@@ -205,11 +219,11 @@ class CudaCommunicator(DeviceCommunicatorBase):
         else:
             assert input_tensor.shape[0] % world_size == 0
             chunk_size = input_tensor.shape[0] // world_size
-        output_shape = (chunk_size, ) + input_tensor.shape[1:]
+        output_shape = (chunk_size,) + input_tensor.shape[1:]
 
-        output = torch.empty(output_shape,
-                             dtype=input_tensor.dtype,
-                             device=input_tensor.device)
+        output = torch.empty(
+            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
+        )
 
         if sizes is not None:
             pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes)
@@ -219,7 +233,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
         # Reshape before returning
         return output.movedim(0, dim).contiguous()
 
-    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
         """Sends a tensor to the destination rank in a blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if dst is None:
@@ -231,10 +245,9 @@ class CudaCommunicator(DeviceCommunicatorBase):
         else:
             torch.distributed.send(tensor, self.ranks[dst], self.device_group)
 
-    def recv(self,
-             size: torch.Size,
-             dtype: torch.dtype,
-             src: Optional[int] = None) -> torch.Tensor:
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
+    ) -> torch.Tensor:
         """Receives a tensor from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
         if src is None:
@@ -257,10 +270,12 @@ class CudaCommunicator(DeviceCommunicatorBase):
             self.all2all_manager.destroy()
             self.all2all_manager = None
 
-    def all_gatherv(self,
-                    input_: Union[torch.Tensor, list[torch.Tensor]],
-                    dim: int = 0,
-                    sizes: Optional[list[int]] = None):
+    def all_gatherv(
+        self,
+        input_: torch.Tensor | list[torch.Tensor],
+        dim: int = 0,
+        sizes: list[int] | None = None,
+    ):
         if dim != 0:
             raise NotImplementedError("only dim 0 all-gatherv is supported")
         world_size = self.world_size
@@ -272,20 +287,20 @@ class CudaCommunicator(DeviceCommunicatorBase):
         if sizes is not None and all(s == sizes[0] for s in sizes):
             sizes = None
 
-        def _all_gather_single(input_: torch.Tensor,
-                               sizes: Optional[list[int]] = None):
+        def _all_gather_single(input_: torch.Tensor, sizes: list[int] | None = None):
             input_size = input_.size()
             if sizes is not None:
                 assert len(sizes) == world_size
                 assert input_.shape[dim] == sizes[self.rank_in_group], (
-                    f"{input_.shape[dim]} != {sizes[self.rank_in_group]}")
-                output_size = (sum(sizes), ) + input_size[1:]
+                    f"{input_.shape[dim]} != {sizes[self.rank_in_group]}"
+                )
+                output_size = (sum(sizes),) + input_size[1:]
             else:
-                output_size = (input_size[0] * world_size, ) + input_size[1:]
+                output_size = (input_size[0] * world_size,) + input_size[1:]
             # Allocate output tensor.
-            output_tensor = torch.empty(output_size,
-                                        dtype=input_.dtype,
-                                        device=input_.device)
+            output_tensor = torch.empty(
+                output_size, dtype=input_.dtype, device=input_.device
+            )
             if sizes is not None:
                 pynccl_comm.all_gatherv(output_tensor, input_, sizes=sizes)
             else:
@@ -307,17 +322,19 @@ class CudaCommunicator(DeviceCommunicatorBase):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         assert self.all2all_manager is not None
         hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits, is_sequence_parallel)
+            hidden_states, router_logits, is_sequence_parallel
+        )
         return hidden_states, router_logits
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         assert self.all2all_manager is not None
-        hidden_states = self.all2all_manager.combine(hidden_states,
-                                                     is_sequence_parallel)
+        hidden_states = self.all2all_manager.combine(
+            hidden_states, is_sequence_parallel
+        )
         return hidden_states
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 2c38e8ed21d7df2e93efb0cf17753f6628bcdde6..6aadab33e31324ba4c3881e6949cb7c0dd7c009a 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -7,13 +7,14 @@ convenient for use when we just need to call a few functions.
 
 import ctypes
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 # this line makes it possible to directly load `libcudart.so` using `ctypes`
 import torch  # noqa
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -36,13 +37,13 @@ class Function:
     argtypes: list[Any]
 
 
-def find_loaded_library(lib_name) -> Optional[str]:
+def find_loaded_library(lib_name) -> str | None:
     """
     According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
     the file `/proc/self/maps` contains the memory maps of the process, which includes the
     shared libraries loaded by the process. We can use this file to find the path of the
     a loaded library.
-    """ # noqa
+    """  # noqa
     found = False
     with open("/proc/self/maps") as f:
         for line in f:
@@ -57,8 +58,9 @@ def find_loaded_library(lib_name) -> Optional[str]:
     start = line.index("/")
     path = line[start:].strip()
     filename = path.split("/")[-1]
-    assert filename.rpartition(".so")[0].startswith(lib_name), \
+    assert filename.rpartition(".so")[0].startswith(lib_name), (
         f"Unexpected filename: {filename} for library {lib_name}"
+    )
     return path
 
 
@@ -70,32 +72,54 @@ class CudaRTLibrary:
         Function("cudaDeviceSynchronize", cudaError_t, []),
         # ​cudaError_t cudaDeviceReset ( void )
         Function("cudaDeviceReset", cudaError_t, []),
-
         # const char* 	cudaGetErrorString ( cudaError_t error )
         Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
-
         # ​cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
-        Function("cudaMalloc", cudaError_t,
-                 [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]),
+        Function(
+            "cudaMalloc",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t],
+        ),
         # ​cudaError_t 	cudaFree ( void* devPtr )
         Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
         # ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
-        Function("cudaMemset", cudaError_t,
-                 [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]),
+        Function(
+            "cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+        ),
         # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
-        Function("cudaMemcpy", cudaError_t, [
-            ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind
-        ]),
-
+        Function(
+            "cudaMemcpy",
+            cudaError_t,
+            [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind],
+        ),
         # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
-        Function("cudaIpcGetMemHandle", cudaError_t,
-                 [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]),
+        Function(
+            "cudaIpcGetMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p],
+        ),
         # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
-        Function("cudaIpcOpenMemHandle", cudaError_t, [
-            ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint
-        ]),
+        Function(
+            "cudaIpcOpenMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint],
+        ),
     ]
 
+    # https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Runtime_API_functions_supported_by_HIP.html # noqa
+    cuda_to_hip_mapping = {
+        "cudaSetDevice": "hipSetDevice",
+        "cudaDeviceSynchronize": "hipDeviceSynchronize",
+        "cudaDeviceReset": "hipDeviceReset",
+        "cudaGetErrorString": "hipGetErrorString",
+        "cudaMalloc": "hipMalloc",
+        "cudaFree": "hipFree",
+        "cudaMemset": "hipMemset",
+        "cudaMemcpy": "hipMemcpy",
+        "cudaIpcGetMemHandle": "hipIpcGetMemHandle",
+        "cudaIpcOpenMemHandle": "hipIpcOpenMemHandle",
+    }
+
     # class attribute to store the mapping from the path to the library
     # to avoid loading the same library multiple times
     path_to_library_cache: dict[str, Any] = {}
@@ -104,16 +128,21 @@ class CudaRTLibrary:
     #  to the corresponding dictionary
     path_to_dict_mapping: dict[str, dict[str, Any]] = {}
 
-    def __init__(self, so_file: Optional[str] = None):
+    def __init__(self, so_file: str | None = None):
         if so_file is None:
             so_file = find_loaded_library("libcudart")
             if so_file is None:
-                so_file = envs.VLLM_CUDART_SO_PATH  # fallback to env var
-            assert so_file is not None, \
-                (
-                    "libcudart is not loaded in the current process, "
-                    "try setting VLLM_CUDART_SO_PATH"
-                )
+                # libcudart is not loaded in the current process, try hip
+                so_file = find_loaded_library("libamdhip64")
+                # should be safe to assume now that we are using ROCm
+                # as the following assertion should error out if the
+                # libhiprtc library is also not loaded
+                if so_file is None:
+                    so_file = envs.VLLM_CUDART_SO_PATH  # fallback to env var
+            assert so_file is not None, (
+                "libcudart is not loaded in the current process, "
+                "try setting VLLM_CUDART_SO_PATH"
+            )
         if so_file not in CudaRTLibrary.path_to_library_cache:
             lib = ctypes.CDLL(so_file)
             CudaRTLibrary.path_to_library_cache[so_file] = lib
@@ -122,7 +151,12 @@ class CudaRTLibrary:
         if so_file not in CudaRTLibrary.path_to_dict_mapping:
             _funcs = {}
             for func in CudaRTLibrary.exported_functions:
-                f = getattr(self.lib, func.name)
+                f = getattr(
+                    self.lib,
+                    CudaRTLibrary.cuda_to_hip_mapping[func.name]
+                    if current_platform.is_rocm()
+                    else func.name,
+                )
                 f.restype = func.restype
                 f.argtypes = func.argtypes
                 _funcs[func.name] = f
@@ -154,27 +188,29 @@ class CudaRTLibrary:
     def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
         self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))
 
-    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int,
-                   count: int) -> None:
+    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, count: int) -> None:
         self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))
 
-    def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p,
-                   count: int) -> None:
+    def cudaMemcpy(
+        self, dst: ctypes.c_void_p, src: ctypes.c_void_p, count: int
+    ) -> None:
         cudaMemcpyDefault = 4
         kind = cudaMemcpyDefault
         self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))
 
-    def cudaIpcGetMemHandle(self,
-                            devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
+    def cudaIpcGetMemHandle(self, devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
         handle = cudaIpcMemHandle_t()
-        self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"](
-            ctypes.byref(handle), devPtr))
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcGetMemHandle"](ctypes.byref(handle), devPtr)
+        )
         return handle
 
-    def cudaIpcOpenMemHandle(self,
-                             handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
+    def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
         cudaIpcMemLazyEnablePeerAccess = 1
         devPtr = ctypes.c_void_p()
-        self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"](
-            ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess))
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcOpenMemHandle"](
+                ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess
+            )
+        )
         return devPtr
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index d6e5218dcb6779ace7da444e27bdbd66ff17dce4..02591805a7962f9a7271cefe326ed2691e1fed5b 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
-from typing import Optional, Union
+from typing import cast
 
 import torch
 import torch.distributed as dist
@@ -11,11 +11,13 @@ from torch.distributed import ProcessGroup
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.all_reduce_utils import (
-    CUSTOM_ALL_REDUCE_MAX_SIZES, gpu_p2p_access_check)
+    CUSTOM_ALL_REDUCE_MAX_SIZES,
+    gpu_p2p_access_check,
+)
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 try:
     ops.meta_size()
@@ -32,8 +34,7 @@ def _can_p2p(rank: int, world_size: int) -> bool:
         if i == rank:
             continue
         if envs.VLLM_SKIP_P2P_CHECK:
-            logger.info(
-                "Skipping P2P check and trusting the driver's P2P report.")
+            logger.debug("Skipping P2P check and trusting the driver's P2P report.")
             return torch.cuda.can_device_access_peer(rank, i)
         if not gpu_p2p_access_check(rank, i):
             return False
@@ -41,21 +42,23 @@ def _can_p2p(rank: int, world_size: int) -> bool:
 
 
 def is_weak_contiguous(inp: torch.Tensor):
-    return inp.is_contiguous() or (inp.storage().nbytes() -
-                                   inp.storage_offset() * inp.element_size()
-                                   == inp.numel() * inp.element_size())
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
 
 
 class CustomAllreduce:
-
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
 
     # max_size: max supported allreduce size
-    def __init__(self,
-                 group: ProcessGroup,
-                 device: Union[int, str, torch.device],
-                 max_size=8192 * 1024,
-                 symm_mem_enabled=False) -> None:
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: int | str | torch.device,
+        max_size=8192 * 1024,
+        symm_mem_enabled=False,
+    ) -> None:
         """
         Args:
             group: the process group to work on. If None, it will use the
@@ -72,20 +75,24 @@ class CustomAllreduce:
         if not custom_ar:
             # disable because of missing custom allreduce library
             # e.g. in a non-GPU environment
-            logger.info("Custom allreduce is disabled because "
-                        "of missing custom allreduce library")
+            logger.info(
+                "Custom allreduce is disabled because "
+                "of missing custom allreduce library"
+            )
             return
 
         self.group = group
 
         assert dist.get_backend(group) != dist.Backend.NCCL, (
-            "CustomAllreduce should be attached to a non-NCCL group.")
+            "CustomAllreduce should be attached to a non-NCCL group."
+        )
 
         if not all(in_the_same_node_as(group, source_rank=0)):
             # No need to initialize custom allreduce for multi-node case.
             logger.warning(
                 "Custom allreduce is disabled because this process group"
-                " spans across nodes.")
+                " spans across nodes."
+            )
             return
 
         rank = dist.get_rank(group=self.group)
@@ -100,7 +107,9 @@ class CustomAllreduce:
                 "Custom allreduce is disabled due to an unsupported world"
                 " size: %d. Supported world sizes: %s. To silence this "
                 "warning, specify disable_custom_all_reduce=True explicitly.",
-                world_size, str(CustomAllreduce._SUPPORTED_WORLD_SIZES))
+                world_size,
+                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
+            )
             return
 
         if isinstance(device, int):
@@ -110,13 +119,18 @@ class CustomAllreduce:
         # now `device` is a `torch.device` object
         assert isinstance(device, torch.device)
         self.device = device
-        device_capability = current_platform.get_device_capability(
-        ).as_version_str()
-        if (current_platform.is_cuda() and symm_mem_enabled
-                and device_capability in CUSTOM_ALL_REDUCE_MAX_SIZES):
-            max_size = min(
-                CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability][world_size],
-                max_size)
+        device_capability = current_platform.get_device_capability()
+        if (
+            current_platform.is_cuda()
+            and symm_mem_enabled
+            and device_capability is not None
+        ):
+            device_capability_str = device_capability.as_version_str()
+            if device_capability_str in CUSTOM_ALL_REDUCE_MAX_SIZES:
+                max_size = min(
+                    CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability_str][world_size],
+                    max_size,
+                )
         cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
@@ -124,12 +138,9 @@ class CustomAllreduce:
             device_ids = list(range(cuda_device_count_stateless()))
 
         physical_device_id = device_ids[device.index]
-        tensor = torch.tensor([physical_device_id],
-                              dtype=torch.int,
-                              device="cpu")
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
         gather_list = [
-            torch.tensor([0], dtype=torch.int, device="cpu")
-            for _ in range(world_size)
+            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
         ]
         dist.all_gather(gather_list, tensor, group=self.group)
         physical_device_ids = [t.item() for t in gather_list]
@@ -138,14 +149,13 @@ class CustomAllreduce:
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
         assert current_platform.is_cuda_alike()
-        fully_connected = current_platform.is_fully_connected(
-            physical_device_ids)
-        # if world_size > 2 and not fully_connected:
-        if not fully_connected:
+        fully_connected = current_platform.is_fully_connected(physical_device_ids)
+        if world_size > 2 and not fully_connected:
             logger.warning(
                 "Custom allreduce is disabled because it's not supported on"
                 " more than two PCIe-only GPUs. To silence this warning, "
-                "specify disable_custom_all_reduce=True explicitly.")
+                "specify disable_custom_all_reduce=True explicitly."
+            )
             return
         # test P2P capability, this checks software/cudaruntime support
         # this is expensive to compute at the first time
@@ -155,16 +165,17 @@ class CustomAllreduce:
             logger.warning(
                 "Custom allreduce is disabled because your platform lacks "
                 "GPU P2P capability or P2P test failed. To silence this "
-                "warning, specify disable_custom_all_reduce=True explicitly.")
+                "warning, specify disable_custom_all_reduce=True explicitly."
+            )
             return
 
         self.disabled = False
         # Buffers memory are owned by this Python class and passed to C++.
         # Metadata composes of two parts: metadata for synchronization and a
         # temporary buffer for storing intermediate allreduce results.
-        self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
-                                                   group=group,
-                                                   uncached=True)
+        self.meta_ptrs = self.create_shared_buffer(
+            ops.meta_size() + max_size, group=group, uncached=True
+        )
         # This is a pre-registered IPC buffer. In eager mode, input tensors
         # are first copied into this buffer before allreduce is performed
         self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
@@ -173,21 +184,22 @@ class CustomAllreduce:
         # 8*world_size bytes where world_size is at most 8. Allocating 8MB
         # is enough for 131072 such tuples. The largest model I've seen only
         # needs less than 10000 of registered tuples.
-        self.rank_data = torch.empty(8 * 1024 * 1024,
-                                     dtype=torch.uint8,
-                                     device=self.device)
+        self.rank_data = torch.empty(
+            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+        )
         self.max_size = max_size
         self.rank = rank
         self.world_size = world_size
         self.fully_connected = fully_connected
-        self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank,
-                                       self.fully_connected)
+        self._ptr = ops.init_custom_ar(
+            self.meta_ptrs, self.rank_data, rank, self.fully_connected
+        )
         ops.register_buffer(self._ptr, self.buffer_ptrs)
 
     @contextmanager
     def capture(self):
         """
-        The main responsibility of this context manager is the 
+        The main responsibility of this context manager is the
         `register_graph_buffers` call at the end of the context.
         It records all the buffer addresses used in the CUDA graph.
         """
@@ -205,18 +217,17 @@ class CustomAllreduce:
         # We cannot directly use `dist.all_gather_object` here
         # because it is incompatible with `gloo` backend under inference mode.
         # see https://github.com/pytorch/pytorch/issues/126032 for details.
-        all_data = [[None, None]
-                    for _ in range(dist.get_world_size(group=self.group))]
+        all_data: list[list[list[int] | None]]
+        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
         all_data[self.rank] = [handle, offset]
         ranks = sorted(dist.get_process_group_ranks(group=self.group))
         for i, rank in enumerate(ranks):
-            dist.broadcast_object_list(all_data[i],
-                                       src=rank,
-                                       group=self.group,
-                                       device="cpu")
+            dist.broadcast_object_list(
+                all_data[i], src=rank, group=self.group, device="cpu"
+            )
         # Unpack list of tuples to tuple of lists.
-        handles = [d[0] for d in all_data]  # type: ignore
-        offsets = [d[1] for d in all_data]  # type: ignore
+        handles = cast(list[list[int]], [d[0] for d in all_data])
+        offsets = cast(list[list[int]], [d[1] for d in all_data])
         ops.register_graph_buffers(self._ptr, handles, offsets)
 
     def should_custom_ar(self, inp: torch.Tensor):
@@ -234,13 +245,11 @@ class CustomAllreduce:
             return inp_size < self.max_size
         return False
 
-    def all_reduce(self,
-                   inp: torch.Tensor,
-                   *,
-                   out: torch.Tensor = None,
-                   registered: bool = False):
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: torch.Tensor = None, registered: bool = False
+    ):
         """Performs an out-of-place all reduce.
-        
+
         If registered is True, this assumes inp's pointer is already
         IPC-registered. Otherwise, inp is first copied into a pre-registered
         buffer.
@@ -250,11 +259,12 @@ class CustomAllreduce:
         if registered:
             ops.all_reduce(self._ptr, inp, out, 0, 0)
         else:
-            ops.all_reduce(self._ptr, inp, out, self.buffer_ptrs[self.rank],
-                           self.max_size)
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
         return out
 
-    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
+    def custom_all_reduce(self, input: torch.Tensor) -> torch.Tensor | None:
         """The main allreduce API that provides support for cuda graph."""
         # When custom allreduce is disabled, this will be None.
         if self.disabled or not self.should_custom_ar(input):
@@ -284,9 +294,11 @@ class CustomAllreduce:
         self.close()
 
     @staticmethod
-    def create_shared_buffer(size_in_bytes: int,
-                             group: Optional[ProcessGroup] = None,
-                             uncached: Optional[bool] = False) -> list[int]:
+    def create_shared_buffer(
+        size_in_bytes: int,
+        group: ProcessGroup | None = None,
+        uncached: bool | None = False,
+    ) -> list[int]:
         pointer, handle = ops.allocate_shared_buffer_and_handle(size_in_bytes)
 
         world_size = dist.get_world_size(group=group)
@@ -303,9 +315,11 @@ class CustomAllreduce:
         return pointers
 
     @staticmethod
-    def free_shared_buffer(pointers: list[int],
-                           group: Optional[ProcessGroup] = None,
-                           rank: Optional[int] = None) -> None:
+    def free_shared_buffer(
+        pointers: list[int],
+        group: ProcessGroup | None = None,
+        rank: int | None = None,
+    ) -> None:
         if rank is None:
             rank = dist.get_rank(group=group)
         if ops is not None:
diff --git a/vllm/distributed/device_communicators/mnnvl_compat.py b/vllm/distributed/device_communicators/mnnvl_compat.py
index 80072c4fa643ffd2a4b8f0efc1ade525fcad0c8a..61aee2db46b843222c58e55a13c66cbcd3bcf3a1 100644
--- a/vllm/distributed/device_communicators/mnnvl_compat.py
+++ b/vllm/distributed/device_communicators/mnnvl_compat.py
@@ -9,7 +9,6 @@ assert has_flashinfer_all2all(), "Flashinfer alltoallv module cannot be found"
 
 
 class CustomCommunicator(CommBackend):
-
     def __init__(self, group):
         self._group = group
 
@@ -24,5 +23,5 @@ class CustomCommunicator(CommBackend):
         dist.all_gather_object(gathered, data, group=self._group)
         return gathered
 
-    def Split(self, color: int, key: int) -> 'CustomCommunicator':
+    def Split(self, color: int, key: int) -> "CustomCommunicator":
         return self
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 76fe9a93259fa24833d00d32e6e6584ce8ad6665..2fc35e80f5919eea3c2ce1de96c2b1defac0d3e5 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,19 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 # ===================== import region =====================
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup, ReduceOp
 
+import vllm.envs as envs
 from vllm.distributed.device_communicators.pynccl_wrapper import (
-    NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
-    ncclRedOpTypeEnum, ncclUniqueId)
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+    ncclRedOpTypeEnum,
+    ncclUniqueId,
+)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.utils import current_stream
+from vllm.utils.torch_utils import current_stream
 
 logger = init_logger(__name__)
 
@@ -22,16 +28,16 @@ _NCCL_SYMM_OPS_REGISTERED = False
 
 def register_nccl_symmetric_ops(pynccl_comm):
     from vllm.distributed.device_communicators.pynccl_allocator import (
-        nccl_symm_mem_context)
-    from vllm.utils import direct_register_custom_op
+        nccl_symm_mem_context,
+    )
+    from vllm.utils.torch_utils import direct_register_custom_op
 
     global _NCCL_SYMM_OPS_REGISTERED
     if _NCCL_SYMM_OPS_REGISTERED:
         return
     _NCCL_SYMM_OPS_REGISTERED = True
 
-    def all_reduce_symmetric_with_copy_impl(
-            input_tensor: torch.Tensor) -> torch.Tensor:
+    def all_reduce_symmetric_with_copy_impl(input_tensor: torch.Tensor) -> torch.Tensor:
         with nccl_symm_mem_context(pynccl_comm):
             symm_input = torch.empty_like(input_tensor)
             symm_output = torch.empty_like(input_tensor)
@@ -39,8 +45,7 @@ def register_nccl_symmetric_ops(pynccl_comm):
         symm_output = pynccl_comm.all_reduce(symm_input, symm_output)
         return symm_output
 
-    def all_reduce_symmetric_with_copy_fake(
-            input_tensor: torch.Tensor) -> torch.Tensor:
+    def all_reduce_symmetric_with_copy_fake(input_tensor: torch.Tensor) -> torch.Tensor:
         return torch.empty_like(input_tensor)
 
     direct_register_custom_op(
@@ -51,12 +56,11 @@ def register_nccl_symmetric_ops(pynccl_comm):
 
 
 class PyNcclCommunicator:
-
     def __init__(
         self,
-        group: Union[ProcessGroup, StatelessProcessGroup],
-        device: Union[int, str, torch.device],
-        library_path: Optional[str] = None,
+        group: ProcessGroup | StatelessProcessGroup,
+        device: int | str | torch.device,
+        library_path: str | None = None,
     ):
         """
         Args:
@@ -72,7 +76,8 @@ class PyNcclCommunicator:
         if not isinstance(group, StatelessProcessGroup):
             assert dist.is_initialized()
             assert dist.get_backend(group) != dist.Backend.NCCL, (
-                "PyNcclCommunicator should be attached to a non-NCCL group.")
+                "PyNcclCommunicator should be attached to a non-NCCL group."
+            )
             # note: this rank is the rank in the group
             self.rank = dist.get_rank(group)
             self.world_size = dist.get_world_size(group)
@@ -83,7 +88,7 @@ class PyNcclCommunicator:
         self.group = group
 
         # if world_size == 1, no need to create communicator
-        if self.world_size == 1:
+        if self.world_size == 1 or envs.VLLM_DISABLE_PYNCCL:
             self.available = False
             self.disabled = True
             return
@@ -100,11 +105,12 @@ class PyNcclCommunicator:
         self.disabled = False
 
         self.nccl_version = self.nccl.ncclGetRawVersion()
-        logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
-
         if self.rank == 0:
             # get the unique id from NCCL
             self.unique_id = self.nccl.ncclGetUniqueId()
+            logger.info_once(
+                "vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
+            )
         else:
             # construct an empty unique id
             self.unique_id = ncclUniqueId()
@@ -131,7 +137,8 @@ class PyNcclCommunicator:
         # current cuda device to the specified one
         with torch.cuda.device(device):
             self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
-                self.world_size, self.unique_id, self.rank)
+                self.world_size, self.unique_id, self.rank
+            )
 
             stream = current_stream()
             # A small all_reduce for warmup.
@@ -140,11 +147,13 @@ class PyNcclCommunicator:
             stream.synchronize()
             del data
 
-    def all_reduce(self,
-                   in_tensor: torch.Tensor,
-                   out_tensor: torch.Tensor = None,
-                   op: ReduceOp = ReduceOp.SUM,
-                   stream=None) -> torch.Tensor:
+    def all_reduce(
+        self,
+        in_tensor: torch.Tensor,
+        out_tensor: torch.Tensor = None,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ) -> torch.Tensor:
         if self.disabled:
             return None
         # nccl communicator created on a specific device
@@ -152,25 +161,28 @@ class PyNcclCommunicator:
         # otherwise it will cause "illegal memory access"
         assert in_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {in_tensor.device}")
+            f"but the input tensor is on {in_tensor.device}"
+        )
 
         if out_tensor is None:
             out_tensor = torch.empty_like(in_tensor)
 
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()),
-                                buffer_type(out_tensor.data_ptr()),
-                                in_tensor.numel(),
-                                ncclDataTypeEnum.from_torch(in_tensor.dtype),
-                                ncclRedOpTypeEnum.from_torch(op), self.comm,
-                                cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclAllReduce(
+            buffer_type(in_tensor.data_ptr()),
+            buffer_type(out_tensor.data_ptr()),
+            in_tensor.numel(),
+            ncclDataTypeEnum.from_torch(in_tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
         return out_tensor
 
-    def all_gather(self,
-                   output_tensor: torch.Tensor,
-                   input_tensor: torch.Tensor,
-                   stream=None):
+    def all_gather(
+        self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, stream=None
+    ):
         if self.disabled:
             return
         # nccl communicator created on a specific device
@@ -178,14 +190,18 @@ class PyNcclCommunicator:
         # otherwise it will cause "illegal memory access"
         assert input_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {input_tensor.device}")
+            f"but the input tensor is on {input_tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         self.nccl.ncclAllGather(
             buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
-            ncclDataTypeEnum.from_torch(input_tensor.dtype), self.comm,
-            cudaStream_t(stream.cuda_stream))
+            buffer_type(output_tensor.data_ptr()),
+            input_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def all_gatherv(
         self,
@@ -201,14 +217,15 @@ class PyNcclCommunicator:
         # otherwise it will cause "illegal memory access"
         assert input_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {input_tensor.device}")
+            f"but the input tensor is on {input_tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         assert output_tensor.shape[0] == sum(sizes)
         split_offset = 0
         self.nccl.ncclGroupStart()
         for root, split_size in enumerate(sizes):
-            dst_slice = output_tensor[split_offset:split_offset + split_size]
+            dst_slice = output_tensor[split_offset : split_offset + split_size]
             self.nccl.ncclBroadcast(
                 buffer_type(input_tensor.data_ptr()),
                 buffer_type(dst_slice.data_ptr()),
@@ -221,11 +238,13 @@ class PyNcclCommunicator:
             split_offset += split_size
         self.nccl.ncclGroupEnd()
 
-    def reduce_scatter(self,
-                       output_tensor: torch.Tensor,
-                       input_tensor: torch.Tensor,
-                       op: ReduceOp = ReduceOp.SUM,
-                       stream=None):
+    def reduce_scatter(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ):
         if self.disabled:
             return
         # nccl communicator created on a specific device
@@ -233,15 +252,19 @@ class PyNcclCommunicator:
         # otherwise it will cause "illegal memory access"
         assert input_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {input_tensor.device}")
+            f"but the input tensor is on {input_tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         self.nccl.ncclReduceScatter(
             buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
+            buffer_type(output_tensor.data_ptr()),
+            output_tensor.numel(),
             ncclDataTypeEnum.from_torch(input_tensor.dtype),
-            ncclRedOpTypeEnum.from_torch(op), self.comm,
-            cudaStream_t(stream.cuda_stream))
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def reduce_scatterv(
         self,
@@ -258,20 +281,25 @@ class PyNcclCommunicator:
         # otherwise it will cause "illegal memory access"
         assert input_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {input_tensor.device}")
+            f"but the input tensor is on {input_tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
 
         split_offset = 0
         self.nccl.ncclGroupStart()
         for root, split_size in enumerate(sizes):
-            chunk = input_tensor[split_offset:split_offset + split_size, ...]
+            chunk = input_tensor[split_offset : split_offset + split_size, ...]
             self.nccl.ncclReduce(
                 buffer_type(chunk.data_ptr()),
-                buffer_type(output_tensor.data_ptr()), chunk.numel(),
+                buffer_type(output_tensor.data_ptr()),
+                chunk.numel(),
                 ncclDataTypeEnum.from_torch(input_tensor.dtype),
-                ncclRedOpTypeEnum.from_torch(op), root, self.comm,
-                cudaStream_t(stream.cuda_stream))
+                ncclRedOpTypeEnum.from_torch(op),
+                root,
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
             split_offset += split_size
         self.nccl.ncclGroupEnd()
 
@@ -280,31 +308,44 @@ class PyNcclCommunicator:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
-                           ncclDataTypeEnum.from_torch(tensor.dtype), dst,
-                           self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclSend(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            dst,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def recv(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
-                           ncclDataTypeEnum.from_torch(tensor.dtype), src,
-                           self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclRecv(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         if src == self.rank:
@@ -314,9 +355,15 @@ class PyNcclCommunicator:
         else:
             sendbuff = buffer_type()
             recvbuff = buffer_type(tensor.data_ptr())
-        self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
-                                ncclDataTypeEnum.from_torch(tensor.dtype), src,
-                                self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclBroadcast(
+            sendbuff,
+            recvbuff,
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def group_start(self):
         self.nccl.ncclGroupStart()
@@ -333,8 +380,7 @@ class PyNcclCommunicator:
         )
 
     def register_comm_window_raw(self, ptr: int, size: int):
-        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr),
-                                                size, 1)
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
 
     def deregister_comm_window(self, window):
         return self.nccl.ncclCommWindowDeregister(self.comm, window)
diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py
index bc874c1e197e771e214bde9f0da79e24800d65db..401b80046f606ab4774af42f573f4ccfd29d9e25 100644
--- a/vllm/distributed/device_communicators/pynccl_allocator.py
+++ b/vllm/distributed/device_communicators/pynccl_allocator.py
@@ -3,7 +3,7 @@
 import atexit
 import contextlib
 import tempfile
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from packaging import version
@@ -14,7 +14,7 @@ from vllm import envs
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import find_nccl_include_paths
+from vllm.utils.nccl import find_nccl_include_paths
 
 logger = init_logger(__name__)
 
@@ -98,7 +98,9 @@ def compile_nccl_allocator():
             "This is expected if NCCL headers are not available. "
             "optionally set VLLM_NCCL_INCLUDE_PATH to point to a directory "
             "containing the NCCL header. "
-            "Error: %s", str(e))
+            "Error: %s",
+            str(e),
+        )
 
 
 def get_nccl_mem_pool():
@@ -125,21 +127,24 @@ atexit.register(_cleanup_nccl_allocator_wrapper)
 
 
 class nccl_symm_mem_context:
-
     def __init__(
         self,
         pynccl_comm: PyNcclCommunicator,
         disabled: bool = False,
     ):
-        self.disabled = (disabled or not is_symmetric_memory_enabled()
-                         or pynccl_comm.world_size == 1
-                         or not current_platform.is_cuda()
-                         or get_nccl_mem_pool() is None or version.parse(
-                             torch.__version__) < version.parse("2.8.0.a0"))
+        self.disabled = (
+            disabled
+            or not is_symmetric_memory_enabled()
+            or pynccl_comm.world_size == 1
+            or not current_platform.is_cuda()
+            or get_nccl_mem_pool() is None
+            or version.parse(torch.__version__) < version.parse("2.8.0.a0")
+        )
         if self.disabled:
-            self.pynccl_comm: Optional[PyNcclCommunicator] = None
-            self._mem_pool_ctx: contextlib.AbstractContextManager[
-                Any] = contextlib.nullcontext()
+            self.pynccl_comm: PyNcclCommunicator | None = None
+            self._mem_pool_ctx: contextlib.AbstractContextManager[Any] = (
+                contextlib.nullcontext()
+            )
             self.is_graph_capture = None
             self.device = None
         else:
@@ -151,16 +156,16 @@ class nccl_symm_mem_context:
     def __enter__(self):
         if self.disabled:
             return self
-        assert (
-            self.pynccl_comm
-            is not None), "Symmetric memory requires pynccl to be initalized"
-        assert (
-            self.pynccl_comm.nccl_version >= 22703
-        ), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        assert self.pynccl_comm is not None, (
+            "Symmetric memory requires pynccl to be initalized"
+        )
+        assert self.pynccl_comm.nccl_version >= 22703, (
+            "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        )
         if self.is_graph_capture:
-            assert (
-                _graph_pool_id
-                is not None), "graph_pool_id is not set under graph capture"
+            assert _graph_pool_id is not None, (
+                "graph_pool_id is not set under graph capture"
+            )
             # Pause graph memory pool to use symmetric memory with cuda graph
             torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
         self._mem_pool_ctx.__enter__()
@@ -179,8 +184,8 @@ class nccl_symm_mem_context:
         for segment in _cached_pool_snapshot:
             if segment["address"] not in _registered_base_addrs:
                 self.pynccl_comm.register_comm_window_raw(
-                    segment["address"], segment["total_size"])
+                    segment["address"], segment["total_size"]
+                )
                 _registered_base_addrs.add(segment["address"])
         if self.is_graph_capture:
-            torch._C._cuda_beginAllocateCurrentThreadToPool(
-                self.device, _graph_pool_id)
+            torch._C._cuda_beginAllocateCurrentThreadToPool(self.device, _graph_pool_id)
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index c3e99e177e2d537dbb0615cb2b87f4df57e519ce..b2433d58dc1f0f1177437a13696aeef0ba8d50ff 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -25,13 +25,15 @@
 import ctypes
 import platform
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.distributed import ReduceOp
 
+from vllm import envs
 from vllm.logger import init_logger
-from vllm.utils import find_nccl_library
+from vllm.platforms import current_platform
+from vllm.utils.nccl import find_nccl_library
 
 logger = init_logger(__name__)
 
@@ -131,88 +133,141 @@ class NCCLLibrary:
         # const char* ncclGetErrorString(ncclResult_t result)
         Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
         # ncclResult_t  ncclGetVersion(int *version);
-        Function("ncclGetVersion", ncclResult_t,
-                 [ctypes.POINTER(ctypes.c_int)]),
+        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
         # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
-        Function("ncclGetUniqueId", ncclResult_t,
-                 [ctypes.POINTER(ncclUniqueId)]),
+        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
         # ncclResult_t  ncclCommInitRank(
         #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
         # note that ncclComm_t is a pointer type, so the first argument
         # is a pointer to a pointer
-        Function("ncclCommInitRank", ncclResult_t, [
-            ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId,
-            ctypes.c_int
-        ]),
+        Function(
+            "ncclCommInitRank",
+            ncclResult_t,
+            [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int],
+        ),
         # ncclResult_t  ncclAllReduce(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
         #   cudaStream_t stream);
         # note that cudaStream_t is a pointer type, so the last argument
         # is a pointer
-        Function("ncclAllReduce", ncclResult_t, [
-            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
-            ncclRedOp_t, ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclAllReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t  ncclReduce(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclRedOp_t op, int root,
         #   ncclComm_t comm,  cudaStream_t stream);
         # note that cudaStream_t is a pointer type, so the last argument
         # is a pointer
-        Function("ncclReduce", ncclResult_t, [
-            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
-            ncclRedOp_t, ctypes.c_int, ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t  ncclAllGather(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclComm_t comm,
         #   cudaStream_t stream);
         # note that cudaStream_t is a pointer type, so the last argument
         # is a pointer
-        Function("ncclAllGather", ncclResult_t, [
-            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
-            ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclAllGather",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t  ncclReduceScatter(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
         #   cudaStream_t stream);
         # note that cudaStream_t is a pointer type, so the last argument
         # is a pointer
-        Function("ncclReduceScatter", ncclResult_t, [
-            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
-            ncclRedOp_t, ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclReduceScatter",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t  ncclSend(
         #   const void* sendbuff, size_t count, ncclDataType_t datatype,
         #   int dest, ncclComm_t comm, cudaStream_t stream);
-        Function("ncclSend", ncclResult_t, [
-            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
-            ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclSend",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t  ncclRecv(
         #   void* recvbuff, size_t count, ncclDataType_t datatype,
         #   int src, ncclComm_t comm, cudaStream_t stream);
-        Function("ncclRecv", ncclResult_t, [
-            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
-            ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclRecv",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t ncclBroadcast(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, int root, ncclComm_t comm,
         #   cudaStream_t stream);
-        Function("ncclBroadcast", ncclResult_t, [
-            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
-            ctypes.c_int, ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclBroadcast",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # be cautious! this is a collective call, it will block until all
         # processes in the communicator have called this function.
         # because Python object destruction can happen in random order,
@@ -239,8 +294,7 @@ class NCCLLibrary:
         ),
         # ncclResult_t ncclCommWindowDeregister(
         #   ncclComm_t comm, ncclWindow_t win);
-        Function("ncclCommWindowDeregister", ncclResult_t,
-                 [ncclComm_t, ncclWindow_t]),
+        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
     ]
 
     # class attribute to store the mapping from the path to the library
@@ -251,8 +305,7 @@ class NCCLLibrary:
     #  to the corresponding dictionary
     path_to_dict_mapping: dict[str, dict[str, Any]] = {}
 
-    def __init__(self, so_file: Optional[str] = None):
-
+    def __init__(self, so_file: str | None = None):
         so_file = so_file or find_nccl_library()
 
         try:
@@ -268,17 +321,39 @@ class NCCLLibrary:
                 "or it does not support the current platform %s. "
                 "If you already have the library, please set the "
                 "environment variable VLLM_NCCL_SO_PATH"
-                " to point to the correct nccl library path.", so_file,
-                platform.platform())
+                " to point to the correct nccl library path.",
+                so_file,
+                platform.platform(),
+            )
             raise e
 
         if so_file not in NCCLLibrary.path_to_dict_mapping:
             _funcs: dict[str, Any] = {}
             for func in NCCLLibrary.exported_functions:
-                f = getattr(self.lib, func.name)
-                f.restype = func.restype
-                f.argtypes = func.argtypes
-                _funcs[func.name] = f
+                try:
+                    f = getattr(self.lib, func.name)
+                    f.restype = func.restype
+                    f.argtypes = func.argtypes
+                    _funcs[func.name] = f
+                except AttributeError:
+                    if func.name in [
+                        "ncclCommWindowRegister",
+                        "ncclCommWindowDeregister",
+                    ]:
+                        if envs.VLLM_USE_NCCL_SYMM_MEM:
+                            logger.warning_once(
+                                "The symbol %s is not found in the NCCL "
+                                "library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
+                                " please update your NCCL version to >= "
+                                "2.27.03.",
+                                func.name,
+                                so_file,
+                            )
+                        if current_platform.is_rocm():
+                            # Having an exception here on ROCm platform is
+                            # not allowed during graph capturing
+                            continue
+                    raise
             NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
         self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]
 
@@ -306,88 +381,153 @@ class NCCLLibrary:
 
     def ncclGetUniqueId(self) -> ncclUniqueId:
         unique_id = ncclUniqueId()
-        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](
-            ctypes.byref(unique_id)))
+        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
         return unique_id
 
     def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
         if len(data) != 128:
             raise ValueError(
-                f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes")
+                f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes"
+            )
         unique_id = ncclUniqueId()
         ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
         return unique_id
 
-    def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId,
-                         rank: int) -> ncclComm_t:
+    def ncclCommInitRank(
+        self, world_size: int, unique_id: ncclUniqueId, rank: int
+    ) -> ncclComm_t:
         comm = ncclComm_t()
-        self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm),
-                                                        world_size, unique_id,
-                                                        rank))
+        self.NCCL_CHECK(
+            self._funcs["ncclCommInitRank"](
+                ctypes.byref(comm), world_size, unique_id, rank
+            )
+        )
         return comm
 
-    def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
-                      count: int, datatype: int, op: int, comm: ncclComm_t,
-                      stream: cudaStream_t) -> None:
+    def ncclAllReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
         # `datatype` actually should be `ncclDataType_t`
         # and `op` should be `ncclRedOp_t`
         # both are aliases of `ctypes.c_int`
         # when we pass int to a function, it will be converted to `ctypes.c_int`
         # by ctypes automatically
-        self.NCCL_CHECK(self._funcs["ncclAllReduce"](sendbuff, recvbuff, count,
-                                                     datatype, op, comm,
-                                                     stream))
-
-    def ncclReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
-                   count: int, datatype: int, op: int, root: int,
-                   comm: ncclComm_t, stream: cudaStream_t) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclAllReduce"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
         # `datatype` actually should be `ncclDataType_t`
         # and `op` should be `ncclRedOp_t`
         # both are aliases of `ctypes.c_int`
         # when we pass int to a function, it will be converted to `ctypes.c_int`
         # by ctypes automatically
-        self.NCCL_CHECK(self._funcs["ncclReduce"](sendbuff, recvbuff, count,
-                                                  datatype, op, root, comm,
-                                                  stream))
-
-    def ncclReduceScatter(self, sendbuff: buffer_type, recvbuff: buffer_type,
-                          count: int, datatype: int, op: int, comm: ncclComm_t,
-                          stream: cudaStream_t) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclReduce"](
+                sendbuff, recvbuff, count, datatype, op, root, comm, stream
+            )
+        )
+
+    def ncclReduceScatter(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
         # `datatype` actually should be `ncclDataType_t`
         # and `op` should be `ncclRedOp_t`
         # both are aliases of `ctypes.c_int`
         # when we pass int to a function, it will be converted to `ctypes.c_int`
         # by ctypes automatically
-        self.NCCL_CHECK(self._funcs["ncclReduceScatter"](sendbuff, recvbuff,
-                                                         count, datatype, op,
-                                                         comm, stream))
-
-    def ncclAllGather(self, sendbuff: buffer_type, recvbuff: buffer_type,
-                      count: int, datatype: int, comm: ncclComm_t,
-                      stream: cudaStream_t) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclReduceScatter"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclAllGather(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
         # `datatype` actually should be `ncclDataType_t`
         # which is an aliases of `ctypes.c_int`
         # when we pass int to a function, it will be converted to `ctypes.c_int`
         # by ctypes automatically
-        self.NCCL_CHECK(self._funcs["ncclAllGather"](sendbuff, recvbuff, count,
-                                                     datatype, comm, stream))
-
-    def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int,
-                 dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
-        self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype,
-                                                dest, comm, stream))
-
-    def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int,
-                 src: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
-        self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
-                                                comm, stream))
-
-    def ncclBroadcast(self, sendbuff: buffer_type, recvbuff: buffer_type,
-                      count: int, datatype: int, root: int, comm: ncclComm_t,
-                      stream: cudaStream_t) -> None:
-        self.NCCL_CHECK(self._funcs["ncclBroadcast"](sendbuff, recvbuff, count,
-                                                     datatype, root, comm,
-                                                     stream))
+        self.NCCL_CHECK(
+            self._funcs["ncclAllGather"](
+                sendbuff, recvbuff, count, datatype, comm, stream
+            )
+        )
+
+    def ncclSend(
+        self,
+        sendbuff: buffer_type,
+        count: int,
+        datatype: int,
+        dest: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
+        )
+
+    def ncclRecv(
+        self,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        src: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
+        )
+
+    def ncclBroadcast(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclBroadcast"](
+                sendbuff, recvbuff, count, datatype, root, comm, stream
+            )
+        )
 
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
@@ -398,19 +538,27 @@ class NCCLLibrary:
     def ncclGroupEnd(self) -> None:
         self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())
 
-    def ncclCommWindowRegister(self, comm: ncclComm_t, buff: buffer_type,
-                               size: int, win_flags: int) -> ncclWindow_t:
+    def ncclCommWindowRegister(
+        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
+    ) -> ncclWindow_t:
         window = ncclWindow_t()
-        self.NCCL_CHECK(self._funcs["ncclCommWindowRegister"](
-            comm, buff, size, ctypes.byref(window), win_flags))
+        self.NCCL_CHECK(
+            self._funcs["ncclCommWindowRegister"](
+                comm, buff, size, ctypes.byref(window), win_flags
+            )
+        )
         return window
 
-    def ncclCommWindowDeregister(self, comm: ncclComm_t,
-                                 window: ncclWindow_t) -> None:
+    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
 
 
 __all__ = [
-    "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",
-    "ncclComm_t", "cudaStream_t", "buffer_type"
+    "NCCLLibrary",
+    "ncclDataTypeEnum",
+    "ncclRedOpTypeEnum",
+    "ncclUniqueId",
+    "ncclComm_t",
+    "cudaStream_t",
+    "buffer_type",
 ]
diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py
index 836241910e2fb7f4b4b1a459376147cb873390dc..9c7765883cfd1340081d2d2d8593c7d7ef90778a 100644
--- a/vllm/distributed/device_communicators/quick_all_reduce.py
+++ b/vllm/distributed/device_communicators/quick_all_reduce.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import Enum
-from typing import Union
 
 import torch
 import torch.distributed as dist
@@ -14,7 +13,7 @@ from vllm.config import get_current_vllm_config
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
 logger = init_logger(__name__)
 
@@ -27,9 +26,10 @@ except Exception:
 
 
 def is_weak_contiguous(inp: torch.Tensor):
-    return inp.is_contiguous() or (inp.storage().nbytes() -
-                                   inp.storage_offset() * inp.element_size()
-                                   == inp.numel() * inp.element_size())
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
 
 
 class QuickReduceRegime(Enum):
@@ -44,7 +44,6 @@ MB = 1024 * 1024
 
 
 class QuickAllReduce:
-
     _SUPPORTED_WORLD_SIZES = [2, 4, 8]
     _SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
     # The following data is based on kernel tests.
@@ -58,20 +57,19 @@ class QuickAllReduce:
         (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
     }
 
-    def __init__(self, group: ProcessGroup,
-                 device: Union[int, str, torch.device]) -> None:
+    def __init__(self, group: ProcessGroup, device: int | str | torch.device) -> None:
         """
-        Custom allreduce provides non-destructive acceleration and is 
+        Custom allreduce provides non-destructive acceleration and is
         available for CUDA and ROCm MI300 series.
 
-        Custom quick allreduce leverages quantization for further 
-        acceleration on ROCm. It currently supports Q8, Q6, and Q4 
+        Custom quick allreduce leverages quantization for further
+        acceleration on ROCm. It currently supports Q8, Q6, and Q4
         quantization formats and FP(float16, bfloat16).
 
-        Quick allreduce is designed as a complement to custom allreduce. 
-        Its initialization requires even stricter conditions. 
+        Quick allreduce is designed as a complement to custom allreduce.
+        Its initialization requires even stricter conditions.
 
-        Only the ROCm MI300 series is supported for quick allreduce at 
+        Only the ROCm MI300 series is supported for quick allreduce at
         this time.
 
         Args:
@@ -93,18 +91,23 @@ class QuickAllReduce:
         if not quick_ar:
             # disable because of missing quick reduce library
             # e.g. in a cuda environment
-            logger.info("Custom quick allreduce is disabled because "
-                        "of missing custom quick allreduce library")
+            logger.info(
+                "Custom quick allreduce is disabled because "
+                "of missing custom quick allreduce library"
+            )
             return
 
         self.group = group
         assert dist.get_backend(group) != dist.Backend.NCCL, (
-            "Custom quick allreduce should be attached to a non-NCCL group.")
+            "Custom quick allreduce should be attached to a non-NCCL group."
+        )
         if not all(in_the_same_node_as(group, source_rank=0)):
             # No need to initialize custom quick allreduce for
             # multi-node case.
-            logger.warning("Custom quick allreduce is disabled because this "
-                           "process group spans across nodes.")
+            logger.warning(
+                "Custom quick allreduce is disabled because this "
+                "process group spans across nodes."
+            )
             return
         rank = dist.get_rank(group=self.group)
         world_size = dist.get_world_size(group=self.group)
@@ -118,7 +121,9 @@ class QuickAllReduce:
             logger.warning(
                 "Custom quick allreduce is disabled due to an "
                 "unsupported world size: %d. Supported world sizes: %s.",
-                world_size, str(QuickAllReduce._SUPPORTED_WORLD_SIZES))
+                world_size,
+                str(QuickAllReduce._SUPPORTED_WORLD_SIZES),
+            )
             return
 
         if isinstance(device, int):
@@ -134,9 +139,7 @@ class QuickAllReduce:
         else:
             device_ids = list(range(cuda_device_count_stateless()))
         physical_device_id = device_ids[device.index]
-        tensor = torch.tensor([physical_device_id],
-                              dtype=torch.int,
-                              device="cpu")
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
         gather_list = [
             torch.tensor([0], dtype=torch.int, device="cpu")
             for _ in range(self.world_size)
@@ -148,12 +151,12 @@ class QuickAllReduce:
         # where custom quick allreduce is not supported
         # this checks hardware and driver support for NVLink
         assert current_platform.is_cuda_alike()
-        self.fully_connected = current_platform.is_fully_connected(
-            physical_device_ids)
+        self.fully_connected = current_platform.is_fully_connected(physical_device_ids)
         if self.world_size > 2 and not self.fully_connected:
             logger.debug(
                 "Custom quick allreduce is disabled because it's not supported "
-                "on more than two PCIe-only GPUs. ")
+                "on more than two PCIe-only GPUs. "
+            )
             return
 
         self.init_quick_all_reduce()
@@ -169,24 +172,31 @@ class QuickAllReduce:
                 "Custom quick allreduce:",
                 f"Invalid quantization level: {regime_str}. "
                 "Supported levels: "
-                f"{list(QuickReduceRegime.__members__.keys())}")
+                f"{list(QuickReduceRegime.__members__.keys())}",
+            )
             return
 
         if regime_str == "NONE":
-            logger.debug("Custom quick allreduce is disabled based "
-                         "on env variable "
-                         "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION='NONE'")
+            logger.debug(
+                "Custom quick allreduce is disabled based "
+                "on env variable "
+                "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION='NONE'"
+            )
             return
         self.qr_quant_level = QuickReduceRegime[regime_str]
         vllm_config = get_current_vllm_config()
-        if vllm_config is not None and \
-            hasattr(vllm_config, "model_config") and \
-            hasattr(vllm_config.model_config, "dtype"):
+        if (
+            vllm_config is not None
+            and hasattr(vllm_config, "model_config")
+            and hasattr(vllm_config.model_config, "dtype")
+        ):
             dtype = vllm_config.model_config.dtype
             if dtype not in [torch.float16, torch.bfloat16]:
                 logger.debug(
                     "Custom quick allreduce disabled: only supports "
-                    "float16 and float16, but get %s.", dtype)
+                    "float16 and float16, but get %s.",
+                    dtype,
+                )
                 return
 
             if dtype == torch.bfloat16 and self.use_fp16_kernels:
@@ -194,7 +204,8 @@ class QuickAllReduce:
                     "Custom quick allreduce: BF16 inputs will be converted "
                     "to FP16 to improve performance. set "
                     "envs.VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0 "
-                    "to turn off.")
+                    "to turn off."
+                )
 
         # VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB is specified in MB
         qr_max_size = envs.VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB
@@ -206,8 +217,7 @@ class QuickAllReduce:
                 )
             qr_max_size = qr_max_size * MB
         self._ptr = ops.init_custom_qr(self.rank, self.world_size, qr_max_size)
-        self.qr_max_size = qr_max_size if qr_max_size is not None \
-            else ops.qr_max_size()
+        self.qr_max_size = qr_max_size if qr_max_size is not None else ops.qr_max_size()
         self.create_shared_buffer()
         self.disabled = False
 
@@ -217,16 +227,15 @@ class QuickAllReduce:
         try:
             props = torch.cuda.get_device_properties(0)
             gcn_arch = getattr(props, "gcnArchName", "")
-            supported_archs = ['gfx94', 'gfx95']
+            supported_archs = ["gfx94", "gfx95"]
             return any(gfx in gcn_arch for gfx in supported_archs)
         except Exception as e:
-            logger.warning("Failed to determine ROCm for quick allreduce: %s",
-                           e)
+            logger.warning("Failed to determine ROCm for quick allreduce: %s", e)
             return False
 
     def create_shared_buffer(self):
         """
-        Creates a shared buffer for quickreduce. 
+        Creates a shared buffer for quickreduce.
         Has to be called after init_custom_qr
         """
         handle = ops.qr_get_handle(self._ptr)
@@ -253,9 +262,11 @@ class QuickAllReduce:
         dtype = inp.dtype
         if self.use_fp16_kernels:
             dtype = torch.float16
-        return inp_size <= self.qr_max_size and \
-            inp_size >= self._QR_MIN_SIZE[(dtype, self.world_size)]\
-                [self.qr_quant_level.value]
+        return (
+            inp_size <= self.qr_max_size
+            and inp_size
+            >= self._QR_MIN_SIZE[(dtype, self.world_size)][self.qr_quant_level.value]
+        )
 
     def quick_all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None):
         """Performs an out-of-place custom quick all reduce."""
@@ -263,8 +274,9 @@ class QuickAllReduce:
         # as QR uses static IPC buffer.
         if out is None:
             out = torch.empty_like(inp)
-        ops.qr_all_reduce(self._ptr, inp, out, self.qr_quant_level.value,
-                          self.use_fp16_kernels)
+        ops.qr_all_reduce(
+            self._ptr, inp, out, self.qr_quant_level.value, self.use_fp16_kernels
+        )
         return out
 
     def close(self):
diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py
index 69efc8b45270285cdea754b1393a14e68a3e8a15..d9517f51acad3490cb9c2f8c494e13dd4e0aec4b 100644
--- a/vllm/distributed/device_communicators/ray_communicator.py
+++ b/vllm/distributed/device_communicators/ray_communicator.py
@@ -1,20 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import uuid
-from typing import Any, Optional
+from typing import Any
 
 import ray
 import torch
 from ray.exceptions import RayChannelError
-from ray.experimental.channel.communicator import (Communicator,
-                                                   TorchTensorAllocator)
+from ray.experimental.channel.communicator import Communicator, TorchTensorAllocator
 from torch.distributed import ReduceOp
 
 from vllm.distributed.device_communicators.base_device_communicator import (
-    DeviceCommunicatorBase)
+    DeviceCommunicatorBase,
+)
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
-from vllm.utils import current_stream
+from vllm.utils.torch_utils import current_stream
 
 logger = init_logger(__name__)
 
@@ -27,15 +27,15 @@ class RayPPCommunicator(Communicator):
     This class is not thread-safe.
     """
 
-    _comm: Optional[DeviceCommunicatorBase]
+    _comm: DeviceCommunicatorBase | None
 
     def __init__(
         self,
         world_size: int,
         comm_id: Any,
-        rank: Optional[int],
+        rank: int | None,
         actor_handles: list["ray.actor.ActorHandle"],
-        cuda_stream: Optional[torch.cuda.Stream],
+        cuda_stream: torch.cuda.Stream | None,
         use_communication_streams: bool = False,
     ):
         """
@@ -56,14 +56,14 @@ class RayPPCommunicator(Communicator):
                 This is not supported.
         """
         self._world_size = world_size
-        self._rank: Optional[int] = None
+        self._rank: int | None = None
         self._actor_handles = actor_handles
         if use_communication_streams:
-            raise NotImplementedError(
-                "use_communication_streams is not supported")
+            raise NotImplementedError("use_communication_streams is not supported")
         if cuda_stream is not None and cuda_stream != current_stream():
             raise ValueError(
-                "cuda_stream other than the current stream is not supported")
+                "cuda_stream other than the current stream is not supported"
+            )
 
         if rank is not None:
             # Rank is not None, this is Ray worker
@@ -99,13 +99,14 @@ class RayPPCommunicator(Communicator):
 
         # Ray actor IDs are 32-character hex strings (128 bits)
         ACTOR_ID_LEN = 32
-        actor_id_bytes = actor_id_str.encode('utf-8')
-        assert len(
-            actor_id_bytes
-        ) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}"
+        actor_id_bytes = bytearray(actor_id_str.encode("utf-8"))
+        assert len(actor_id_bytes) == ACTOR_ID_LEN, (
+            f"Unexpected actor ID length: {len(actor_id_bytes)}"
+        )
 
-        actor_id_tensor = torch.frombuffer(
-            actor_id_bytes, dtype=torch.uint8).to(self._comm.device)
+        actor_id_tensor = torch.frombuffer(actor_id_bytes, dtype=torch.uint8).to(
+            self._comm.device
+        )
 
         # All-gather full actor IDs from all actors
         gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0)
@@ -115,9 +116,8 @@ class RayPPCommunicator(Communicator):
         for rank in range(self._world_size):
             start_idx = rank * ACTOR_ID_LEN
             end_idx = (rank + 1) * ACTOR_ID_LEN
-            actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy(
-            ).tobytes()
-            actor_id = actor_bytes.decode('utf-8')
+            actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy().tobytes()
+            actor_id = actor_bytes.decode("utf-8")
             self._actor_id_to_rank[actor_id] = rank
 
     def initialize(self, rank: int) -> None:
@@ -131,9 +131,10 @@ class RayPPCommunicator(Communicator):
         """
         Return the given actor's rank using device communicator collective ops.
         """
-        assert hasattr(self, '_actor_id_to_rank'), (
+        assert hasattr(self, "_actor_id_to_rank"), (
             "Actor rank mapping not built. "
-            "This should have been done during initialization.")
+            "This should have been done during initialization."
+        )
 
         actor_id_str = actor._actor_id.hex()
 
@@ -142,7 +143,7 @@ class RayPPCommunicator(Communicator):
         else:
             raise ValueError(f"Actor {actor} not found in communicator group")
 
-    def get_self_rank(self) -> Optional[int]:
+    def get_self_rank(self) -> int | None:
         """
         Return this actor's rank.
         """
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 0fc9d1cf4f5185bfbb4d33e4df6dadff5ab8e456..052df19e34d72c396028639f9da1620865e0f7c5 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,35 +1,65 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import functools
 import pickle
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from multiprocessing import shared_memory
+from pickle import PickleBuffer
 from threading import Event
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, cast
 from unittest.mock import patch
 
 import torch
 import torch.distributed as dist
 import zmq
 from torch.distributed import ProcessGroup
-from zmq import IPV6  # type: ignore
-from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
+from zmq import (  # type: ignore
+    IPV6,  # type: ignore
+    SUB,
+    SUBSCRIBE,
+    XPUB,
+    XPUB_VERBOSE,
+    Context,
+)
 
 import vllm.envs as envs
 from vllm.distributed.utils import StatelessProcessGroup, sched_yield
 from vllm.logger import init_logger
-from vllm.utils import (get_ip, get_open_port, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address)
+from vllm.utils.network_utils import (
+    get_ip,
+    get_open_port,
+    get_open_zmq_ipc_path,
+    is_valid_ipv6_address,
+)
+
+if TYPE_CHECKING:
+    from _typeshed import SizedBuffer
 
 VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 
+from_bytes_big = functools.partial(int.from_bytes, byteorder="big")
+
+
+def to_bytes_big(value: int, size: int) -> bytes:
+    return value.to_bytes(size, byteorder="big")
+
+
 logger = init_logger(__name__)
 
 
-class SpinTimer:
+def long_wait_time_msg(threshold: int) -> str:
+    return (
+        "No available shared memory broadcast block found "
+        f"in {threshold} seconds. This typically happens "
+        "when some processes are hanging or doing some "
+        "time-consuming work (e.g. compilation, "
+        "weight/kv cache quantization)."
+    )
+
 
+class SpinTimer:
     def record_activity(self):
         pass
 
@@ -66,12 +96,13 @@ class SpinSleepTimer(SpinTimer):
 
 
 class ShmRingBuffer:
-
-    def __init__(self,
-                 n_reader: int,
-                 max_chunk_bytes: int,
-                 max_chunks: int,
-                 name: Optional[str] = None):
+    def __init__(
+        self,
+        n_reader: int,
+        max_chunk_bytes: int,
+        max_chunks: int,
+        name: str | None = None,
+    ):
         """
         A shared memory ring buffer implementation for broadcast communication.
         Essentially, it is a queue where only one will `enqueue` and multiple
@@ -120,13 +151,14 @@ class ShmRingBuffer:
         created object to other processes by pickling it. The other processes will
         get the name of the shared memory and open it, so that they can access the
         same shared memory buffer.
-        """# noqa
+        """  # noqa
         self.n_reader = n_reader
         self.metadata_size = 1 + n_reader
         self.max_chunk_bytes = max_chunk_bytes
         self.max_chunks = max_chunks
-        self.total_bytes_of_buffer = (self.max_chunk_bytes +
-                                      self.metadata_size) * self.max_chunks
+        self.total_bytes_of_buffer = (
+            self.max_chunk_bytes + self.metadata_size
+        ) * self.max_chunks
         self.data_offset = 0
         self.metadata_offset = self.max_chunk_bytes * self.max_chunks
 
@@ -134,10 +166,10 @@ class ShmRingBuffer:
             # we are creating a buffer
             self.is_creator = True
             self.shared_memory = shared_memory.SharedMemory(
-                create=True, size=self.total_bytes_of_buffer)
+                create=True, size=self.total_bytes_of_buffer
+            )
             # initialize the metadata section to 0
-            with self.shared_memory.buf[self.
-                                        metadata_offset:] as metadata_buffer:
+            with self.shared_memory.buf[self.metadata_offset :] as metadata_buffer:
                 torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0)
         else:
             # we are opening an existing buffer
@@ -145,8 +177,10 @@ class ShmRingBuffer:
             # fix to https://stackoverflow.com/q/62748654/9191338
             # Python incorrectly tracks shared memory even if it is not
             # created by the process. The following patch is a workaround.
-            with patch("multiprocessing.resource_tracker.register",
-                       lambda *args, **kwargs: None):
+            with patch(
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
+            ):
                 try:
                     self.shared_memory = shared_memory.SharedMemory(name=name)
                     # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa
@@ -154,8 +188,7 @@ class ShmRingBuffer:
                     # so the shared memory block size may be larger or equal
                     # to the requested size. The size parameter is ignored
                     # when attaching to an existing block.
-                    assert (self.shared_memory.size
-                            >= self.total_bytes_of_buffer)
+                    assert self.shared_memory.size >= self.total_bytes_of_buffer
                 except FileNotFoundError:
                     # we might deserialize the object in a different node
                     # in this case, this object is not used,
@@ -163,8 +196,12 @@ class ShmRingBuffer:
                     pass
 
     def handle(self):
-        return (self.n_reader, self.max_chunk_bytes, self.max_chunks,
-                self.shared_memory.name)
+        return (
+            self.n_reader,
+            self.max_chunk_bytes,
+            self.max_chunks,
+            self.shared_memory.name,
+        )
 
     def __reduce__(self):
         return (
@@ -197,22 +234,23 @@ class ShmRingBuffer:
 class Handle:
     local_reader_ranks: list[int] = field(default_factory=list)
 
-    buffer_handle: Optional[tuple[int, int, int, str]] = None
-    local_subscribe_addr: Optional[str] = None
-    remote_subscribe_addr: Optional[str] = None
+    buffer_handle: tuple[int, int, int, str] | None = None
+    local_subscribe_addr: str | None = None
+    remote_subscribe_addr: str | None = None
     remote_addr_ipv6: bool = False
 
 
 class MessageQueue:
-
     def __init__(
         self,
         n_reader,  # number of all readers
         n_local_reader,  # number of local readers through shared memory
-        local_reader_ranks: Optional[list[int]] = None,
-        max_chunk_bytes: int = 1024 * 1024 * 10,
+        local_reader_ranks: list[int] | None = None,
+        # Default of 24MiB chosen to be large enough to accommodate grammar
+        # bitmask tensors for large batches (1024 requests).
+        max_chunk_bytes: int = 1024 * 1024 * 24,
         max_chunks: int = 10,
-        connect_ip: Optional[str] = None,
+        connect_ip: str | None = None,
     ):
         if local_reader_ranks is None:
             local_reader_ranks = list(range(n_local_reader))
@@ -228,8 +266,7 @@ class MessageQueue:
             # for local readers, we will:
             # 1. create a shared memory ring buffer to communicate small data
             # 2. create a publish-subscribe socket to communicate large data
-            self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes,
-                                        max_chunks)
+            self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes, max_chunks)
 
             # XPUB is very similar to PUB,
             # except that it can receive subscription messages
@@ -279,14 +316,13 @@ class MessageQueue:
 
         self.handle = Handle(
             local_reader_ranks=local_reader_ranks,
-            buffer_handle=self.buffer.handle()
-            if self.buffer is not None else None,
+            buffer_handle=self.buffer.handle() if self.buffer is not None else None,
             local_subscribe_addr=local_subscribe_addr,
             remote_subscribe_addr=remote_subscribe_addr,
             remote_addr_ipv6=remote_addr_ipv6,
         )
 
-        logger.info("vLLM message queue communication handle: %s", self.handle)
+        logger.debug("vLLM message queue communication handle: %s", self.handle)
 
     def export_handle(self) -> Handle:
         return self.handle
@@ -315,8 +351,9 @@ class MessageQueue:
 
             self.remote_socket = None
 
-            self._read_spin_timer = SpinSleepTimer(
-            ) if envs.VLLM_SLEEP_WHEN_IDLE else SpinTimer()
+            self._read_spin_timer = (
+                SpinSleepTimer() if envs.VLLM_SLEEP_WHEN_IDLE else SpinTimer()
+            )
         else:
             self.buffer = None  # type: ignore
             self.current_idx = -1
@@ -370,7 +407,7 @@ class MessageQueue:
             assert recv == b"READY"
 
     @contextmanager
-    def acquire_write(self, timeout: Optional[float] = None):
+    def acquire_write(self, timeout: float | None = None):
         assert self._is_writer, "Only writers can acquire write"
         start_time = time.monotonic()
         n_warning = 1
@@ -395,11 +432,8 @@ class MessageQueue:
                     # if we wait for a long time, log a message
                     if elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:
                         logger.info(
-                            "No available shared memory broadcast block found"
-                            " in %s seconds. This typically happens when some"
-                            " processes are hanging or doing some"
-                            " time-consuming work (e.g. compilation)",
-                            VLLM_RINGBUFFER_WARNING_INTERVAL)
+                            long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
+                        )
                         n_warning += 1
 
                     continue
@@ -423,15 +457,16 @@ class MessageQueue:
                     metadata_buffer[i] = 0
                 # mark the block as written
                 metadata_buffer[0] = 1
-                self.current_idx = (self.current_idx +
-                                    1) % self.buffer.max_chunks
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
                 break
 
     @contextmanager
-    def acquire_read(self,
-                     timeout: Optional[float] = None,
-                     cancel: Optional[Event] = None,
-                     indefinite: bool = False):
+    def acquire_read(
+        self,
+        timeout: float | None = None,
+        cancel: Event | None = None,
+        indefinite: bool = False,
+    ):
         assert self._is_local_reader, "Only readers can acquire read"
         start_time = time.monotonic()
         n_warning = 1
@@ -460,15 +495,12 @@ class MessageQueue:
                         raise TimeoutError
 
                     # if we wait for a long time, log a message
-                    if not indefinite and (elapsed
-                                           > VLLM_RINGBUFFER_WARNING_INTERVAL *
-                                           n_warning):
+                    if not indefinite and (
+                        elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning
+                    ):
                         logger.info(
-                            "No available shared memory broadcast block found"
-                            " in %s seconds. This typically happens when some"
-                            " processes are hanging or doing some"
-                            " time-consuming work (e.g. compilation).",
-                            VLLM_RINGBUFFER_WARNING_INTERVAL)
+                            long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
+                        )
                         n_warning += 1
 
                     continue
@@ -480,41 +512,74 @@ class MessageQueue:
                 # caller has read from the buffer
                 # set the read flag
                 metadata_buffer[self.local_reader_rank + 1] = 1
-                self.current_idx = (self.current_idx +
-                                    1) % self.buffer.max_chunks
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
 
                 self._read_spin_timer.record_activity()
                 break
 
-    def enqueue(self, obj, timeout: Optional[float] = None):
-        """ Write to message queue with optional timeout (in seconds) """
+    def enqueue(self, obj, timeout: float | None = None):
+        """Write to message queue with optional timeout (in seconds)"""
         assert self._is_writer, "Only writers can enqueue"
-        serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+        all_buffers: list[SizedBuffer] = [b""]
+        total_bytes = 6  # 2 bytes for oob buffer count, 4 for main buffer size
+
+        def oob_callback(buf: PickleBuffer) -> bool:
+            raw_buf = buf.raw()
+            if len(raw_buf) < 1024 * 1024:
+                # In-line buffers smaller than 1MiB.
+                return True
+            all_buffers.append(raw_buf)
+            nonlocal total_bytes
+            total_bytes += len(raw_buf) + 4
+            return False
+
+        all_buffers[0] = pickle.dumps(
+            obj, protocol=pickle.HIGHEST_PROTOCOL, buffer_callback=oob_callback
+        )
         if self.n_local_reader > 0:
-            if len(serialized_obj) >= self.buffer.max_chunk_bytes:
+            if total_bytes + len(all_buffers[0]) >= self.buffer.max_chunk_bytes:
                 with self.acquire_write(timeout) as buf:
                     buf[0] = 1  # overflow
-                self.local_socket.send(serialized_obj)
+                self.local_socket.send_multipart(all_buffers, copy=False)
             else:
+                # Byte 0: 0
+                # Bytes 1-2: Count of buffers
+                # Then each buffer follows, preceded by 4 bytes containing its length:
+                # [4 byte int L][L bytes of buffer content] ...
                 with self.acquire_write(timeout) as buf:
                     buf[0] = 0  # not overflow
-                    buf[1:len(serialized_obj) + 1] = serialized_obj
+                    offset = 3
+                    buf[1:offset] = to_bytes_big(len(all_buffers), 2)  # oob buf count
+                    for buffer in all_buffers:
+                        buf_len = len(buffer)
+                        # prepend each buffer with 4 bytes containing its size.
+                        buf_offset = offset + 4
+                        buf[offset:buf_offset] = to_bytes_big(buf_len, 4)
+                        buf[buf_offset : (offset := buf_offset + buf_len)] = buffer
+
         if self.n_remote_reader > 0:
-            self.remote_socket.send(serialized_obj)
+            self.remote_socket.send_multipart(all_buffers, copy=False)
 
-    def dequeue(self,
-                timeout: Optional[float] = None,
-                cancel: Optional[Event] = None,
-                indefinite: bool = False):
-        """ Read from message queue with optional timeout (in seconds) """
+    def dequeue(
+        self,
+        timeout: float | None = None,
+        cancel: Event | None = None,
+        indefinite: bool = False,
+    ):
+        """Read from message queue with optional timeout (in seconds)"""
         if self._is_local_reader:
             with self.acquire_read(timeout, cancel, indefinite) as buf:
                 overflow = buf[0] == 1
                 if not overflow:
-                    # no need to know the size of serialized object
-                    # pickle format contains the size information internally
-                    # see https://docs.python.org/3/library/pickle.html
-                    obj = pickle.loads(buf[1:])
+                    offset = 3
+                    buf_count = from_bytes_big(buf[1:offset])
+                    all_buffers = []
+                    for i in range(buf_count):
+                        buf_offset = offset + 4
+                        buf_len = from_bytes_big(buf[offset:buf_offset])
+                        offset = buf_offset + buf_len
+                        all_buffers.append(buf[buf_offset:offset])
+                    obj = pickle.loads(all_buffers[0], buffers=all_buffers[1:])
             if overflow:
                 obj = MessageQueue.recv(self.local_socket, timeout)
         elif self._is_remote_reader:
@@ -524,26 +589,100 @@ class MessageQueue:
         return obj
 
     @staticmethod
-    def recv(socket: zmq.Socket, timeout: Optional[float]) -> Any:
+    def recv(socket: zmq.Socket, timeout: float | None) -> Any:
         timeout_ms = None if timeout is None else int(timeout * 1000)
         if not socket.poll(timeout=timeout_ms):
             raise TimeoutError
-        recv = socket.recv(copy=False)
-        return pickle.loads(recv.buffer)
+        recv, *recv_oob = socket.recv_multipart(copy=False)
+        return pickle.loads(recv, buffers=recv_oob)
 
     def broadcast_object(self, obj=None):
         if self._is_writer:
             self.enqueue(obj)
             return obj
-        else:
-            return self.dequeue()
+        return self.dequeue()
+
+    @staticmethod
+    def create_from_process_group_single_reader(
+        pg: ProcessGroup,
+        max_chunk_bytes,
+        max_chunks,
+        reader_rank: int = 0,
+        blocking: bool = False,
+    ) -> tuple["MessageQueue", list[Handle]]:
+        """
+        Creates a MessageQueue for a process group with a single reader.
+
+        This method is designed for scenarios where only one process (the reader)
+        will consume messages, and all other processes are writers. It sets up
+        the shared memory buffer and communication handles accordingly, and
+        gathers the handles from all processes to the reader.
+
+        Args:
+            pg (ProcessGroup): The torch distributed process group.
+            max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer.
+            max_chunks (int): Maximum number of chunks in the buffer.
+            reader_rank (int, optional): The global rank that will act as the reader.
+                Defaults to 0.
+            blocking (bool, optional): If True, blocks until all processes are ready.
+                Defaults to False.
+
+        Returns:
+            tuple[MessageQueue, list[Handle]]:
+            The MessageQueue instance for the calling process,
+            and a list of handles (only non-empty for the reader process).
+        """
+        local_size = torch.cuda.device_count()
+        rank = dist.get_rank()
+        same_node = rank // local_size == reader_rank // local_size
+        buffer_io = MessageQueue(
+            n_reader=1,
+            n_local_reader=1 if same_node else 0,
+            max_chunk_bytes=max_chunk_bytes,
+            max_chunks=max_chunks,
+        )
+        handle = buffer_io.export_handle()
+        handles = [None] * dist.get_world_size(pg) if rank == reader_rank else None
+        dist.gather_object(handle, handles, dst=reader_rank, group=pg)
+        if blocking:
+            buffer_io.wait_until_ready()
+        return buffer_io, cast(list[Handle], handles or [])
 
     @staticmethod
-    def create_from_process_group(pg: Union[ProcessGroup,
-                                            StatelessProcessGroup],
-                                  max_chunk_bytes,
-                                  max_chunks,
-                                  writer_rank=0) -> "MessageQueue":
+    def create_from_process_group(
+        pg: ProcessGroup | StatelessProcessGroup,
+        max_chunk_bytes,
+        max_chunks,
+        writer_rank: int = 0,
+        external_writer_handle=None,
+        blocking: bool = True,
+    ) -> "MessageQueue":
+        """
+        Creates a MessageQueue for a distributed process group with one writer and
+        multiple readers.
+
+        This method is designed for scenarios where one process (the writer) sends
+        messages, and all other processes (the readers) receive messages. It sets up
+        the shared memory buffer and socket communication handles accordingly, and
+        broadcasts the handle from the writer to all readers.
+
+        Args:
+            pg (ProcessGroup | StatelessProcessGroup): The torch distributed process
+                group.
+            max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer.
+            max_chunks (int): Maximum number of chunks in the buffer.
+            writer_rank (int, optional): The global rank that will act as the writer.
+                Defaults to 0.
+            external_writer_handle (Handle, optional): Used when there is a handle
+                from an external Message Queue. If provided, use this handle to init
+                PG writer message queue instead of creating a new one. Defaults to None.
+            blocking (bool, optional): If True, blocks until all processes are ready.
+                Defaults to True.
+
+        Returns:
+            MessageQueue: The MessageQueue instance for the calling process.
+
+        """
         if isinstance(pg, ProcessGroup):
             group_rank = dist.get_rank(pg)
             group_world_size = dist.get_world_size(pg)
@@ -552,38 +691,43 @@ class MessageQueue:
             group_rank = pg.rank
             group_world_size = pg.world_size
             global_ranks = list(range(pg.world_size))
-
         from vllm.distributed.parallel_state import in_the_same_node_as
+
         status = in_the_same_node_as(pg, source_rank=writer_rank)
-        same_node_ranks = [i for i, s in enumerate(status) if s]
-        n_reader = group_world_size - 1
-        n_local_reader = len(same_node_ranks) - 1
-        local_reader_ranks = [i for i in same_node_ranks if i != writer_rank]
-        buffer_io: MessageQueue
         if group_rank == writer_rank:
-            buffer_io = MessageQueue(
-                n_reader=n_reader,
-                n_local_reader=n_local_reader,
-                local_reader_ranks=local_reader_ranks,
-                max_chunk_bytes=max_chunk_bytes,
-                max_chunks=max_chunks,
-            )
+            if external_writer_handle is not None:
+                buffer_io = MessageQueue.create_from_handle(
+                    external_writer_handle, group_rank
+                )
+            else:
+                same_node_ranks = [i for i, s in enumerate(status) if s]
+                n_reader = group_world_size - 1
+                n_local_reader = len(same_node_ranks) - 1
+                local_reader_ranks = [i for i in same_node_ranks if i != writer_rank]
+                buffer_io = MessageQueue(
+                    n_reader=n_reader,
+                    n_local_reader=n_local_reader,
+                    local_reader_ranks=local_reader_ranks,
+                    max_chunk_bytes=max_chunk_bytes,
+                    max_chunks=max_chunks,
+                )
             handle = buffer_io.export_handle()
             if isinstance(pg, ProcessGroup):
-                dist.broadcast_object_list([handle],
-                                           src=global_ranks[writer_rank],
-                                           group=pg)
+                dist.broadcast_object_list(
+                    [handle], src=global_ranks[writer_rank], group=pg
+                )
             else:
                 pg.broadcast_obj(handle, writer_rank)
         else:
             if isinstance(pg, ProcessGroup):
                 recv = [None]
-                dist.broadcast_object_list(recv,
-                                           src=global_ranks[writer_rank],
-                                           group=pg)
+                dist.broadcast_object_list(
+                    recv, src=global_ranks[writer_rank], group=pg
+                )
                 handle = recv[0]  # type: ignore
             else:
                 handle = pg.broadcast_obj(None, writer_rank)
             buffer_io = MessageQueue.create_from_handle(handle, group_rank)
-        buffer_io.wait_until_ready()
+        if blocking:
+            buffer_io.wait_until_ready()
         return buffer_io
diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py
index 0310fc14da256b01a6828057fcedbd16d3f3b24d..4af2caa16b0d62f7d0c04690200e8bdbc5dba9f3 100644
--- a/vllm/distributed/device_communicators/shm_object_storage.py
+++ b/vllm/distributed/device_communicators/shm_object_storage.py
@@ -3,13 +3,13 @@
 
 import pickle
 from abc import ABC, abstractmethod
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from contextlib import contextmanager
 from dataclasses import dataclass
 from itertools import chain
 from multiprocessing import shared_memory
 from multiprocessing.synchronize import Lock as LockType
-from typing import Any, Callable, Optional, Union
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -24,63 +24,63 @@ class SingleWriterShmRingBuffer:
     A single-writer, multiple-reader ring buffer implementation using shared
     memory. This class provides a thread-safe ring buffer where one process
     can write data while multiple processes/threads can read from it.
-    
+
     Architecture:
     - Uses shared memory for cross-process communication
     - Maintains metadata for each allocated buffer chunk in the writer process
     - Supports custom "is_free_fn" functions to determine when buffers can be
       reused
     - Each buffer chunk contains: `[4-byte id][4-byte size][actual_data]`
-    
+
     Key Concepts:
     - monotonic_id_start/end: Track the range of active buffer IDs
     - data_buffer_start/end: Track the physical memory range in use
     - Automatic wraparound when reaching buffer end
     - Lazy garbage collection based on is_free_fn checks
-    
+
     Example Usage Scenarios:
-    
+
     Scenario 1: Simple Linear Allocation
     ```
     Buffer size: 100 bytes
     Initial state: [................................................. ]
                    ^start=end(0)
-    
+
     After allocating 20 bytes (id=0):
     [id:0|size:20|data........][...................................]
     ^start(0)                  ^end(28)
-    
-    After allocating 30 bytes (id=1):  
+
+    After allocating 30 bytes (id=1):
     [id:0|size:20|data........][id:1|size:30|data..............][..]
     ^start(0)                                                   ^end(66)
     ```
-    
+
     Scenario 2: Memory Reclamation
     ```
     Before freeing (both buffers still in use):
     [id:0|size:20|data........][id:1|size:30|data..............][..]
     ^start(0)                                                   ^end(66)
-    
+
     After id:0 is marked free by readers:
     [FREED.................... ][id:1|size:30|data..............][..]
                                 ^start(28)                       ^end(66)
-    
+
     After both are freed:
     [FREED..............................................][..]
                                                          ^start=end(66)
     ```
-    
+
     Scenario 3: Wraparound Allocation (continuing from Scenario 2)
     ```
     Starting from after memory reclamation in Scenario 2:
     [FREED..............................................][..]
                                                          ^start=end(66)
-    
+
     Allocate 40 bytes (id=2) - only 34 bytes available at end, so wraparound:
     [id:2|size:40|data........................][FREED.............][..]
                                               ^end(148)            ^start(66)
     ```
-    
+
     Scenario 4: Error Handling - Out of Space
     ```
     Starting from after wraparound allocation in Scenario 3:
@@ -91,17 +91,17 @@ class SingleWriterShmRingBuffer:
     occupied_size_new = end + size - start = 148 + 28 - 66 > buffer_size(100)
     -> Raises MemoryError: "Not enough space in the data buffer"
     ```
-    
+
     Thread Safety:
     - Single writer: Only one process/thread should write (allocate_buf)
-    - Multiple readers: Multiple processes/threads can read (access_buf) 
+    - Multiple readers: Multiple processes/threads can read (access_buf)
     - Reader synchronization handled by is_free_fn callback
     - Writer handles garbage collection (free_buf) based on reader feedback
-    
+
     Memory Layout per Buffer Chunk:
     `[4-byte monotonic_id][4-byte chunk_size][actual_data...]`
     ^metadata_start                         ^data_start
-    
+
     The monotonic_id ensures data integrity - readers can verify they're
     accessing the correct data even after buffer wraparound or reuse.
     """
@@ -109,7 +109,7 @@ class SingleWriterShmRingBuffer:
     def __init__(
         self,
         data_buffer_size: int,
-        name: Optional[str] = None,
+        name: str | None = None,
         create: bool = False,
     ):
         self.data_buffer_size = data_buffer_size
@@ -127,19 +127,18 @@ class SingleWriterShmRingBuffer:
 
         if create:
             # we are creating a buffer
-            self.metadata = {
-                self.monotonic_id_end: self.data_buffer_end
-            }  # monotonic_id -> start address
+            self.metadata: dict[int, int] = {}  # monotonic_id -> start address
             self.shared_memory = shared_memory.SharedMemory(
-                create=True, size=self.data_buffer_size, name=name)
+                create=True, size=self.data_buffer_size, name=name
+            )
         else:
             # we are opening an existing buffer
             # fix to https://stackoverflow.com/q/62748654/9191338
             # Python incorrectly tracks shared memory even if it is not
             # created by the process. The following patch is a workaround.
             with patch(
-                    "multiprocessing.resource_tracker.register",
-                    lambda *args, **kwargs: None,
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
             ):
                 self.shared_memory = shared_memory.SharedMemory(name=name)
                 # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa
@@ -149,8 +148,11 @@ class SingleWriterShmRingBuffer:
                 # when attaching to an existing block.
                 assert self.shared_memory.size >= self.data_buffer_size
 
-        logger.debug("Shared memory created/opened with name: %s, size: %d",
-                     self.shared_memory.name, self.data_buffer_size)
+        logger.debug(
+            "Shared memory created/opened with name: %s, size: %d",
+            self.shared_memory.name,
+            self.data_buffer_size,
+        )
 
     def handle(self):
         return (
@@ -182,19 +184,20 @@ class SingleWriterShmRingBuffer:
         return int.from_bytes(byte_data, "little", signed=True)
 
     def allocate_buf(self, size: int) -> tuple[int, int]:
-        '''
+        """
         Allocate a buffer `MD_SIZE` + `size` bytes in the shared memory.
         Memory layout:
         `[4-byte monotonic_id][4-byte size][buffer data...]`
-        '''
+        """
         assert self.is_writer, "Only the writer can allocate buffers."
         assert size > 0, "Size must be greater than 0"
         size += self.MD_SIZE  # add metadata size to the buffer size
         # reset to beginning if the buffer does have enough contiguous space
         buffer_end_reset = self.data_buffer_end % self.data_buffer_size
         if buffer_end_reset + size > self.data_buffer_size:
-            buffer_end_reset = (self.data_buffer_end // self.data_buffer_size +
-                                1) * self.data_buffer_size
+            buffer_end_reset = (
+                self.data_buffer_end // self.data_buffer_size + 1
+            ) * self.data_buffer_size
         else:  # no reset needed
             buffer_end_reset = self.data_buffer_end
 
@@ -203,21 +206,24 @@ class SingleWriterShmRingBuffer:
         # exceeds the start of the data buffer
         occupied_size_new = buffer_end_reset + size - self.data_buffer_start
         if occupied_size_new > self.data_buffer_size:
-            raise MemoryError("Not enough space in the data buffer, "
-                              "try calling free_buf() to free up space")
+            raise MemoryError(
+                "Not enough space in the data buffer, "
+                "try calling free_buf() to free up space"
+            )
         self.data_buffer_end = buffer_end_reset
 
         # first 4 bytes as the monotonic id
         buf_idx = self.data_buffer_end % self.data_buffer_size
-        self.shared_memory.buf[buf_idx:buf_idx + self.ID_NBYTES] = \
-            self.int2byte(self.monotonic_id_end)
+        self.shared_memory.buf[buf_idx : buf_idx + self.ID_NBYTES] = self.int2byte(
+            self.monotonic_id_end
+        )
         # next 4 bytes as the size of the data buffer
-        self.shared_memory.buf[buf_idx + self.ID_NBYTES: \
-            buf_idx + self.MD_SIZE] = self.int2byte(size)
+        self.shared_memory.buf[buf_idx + self.ID_NBYTES : buf_idx + self.MD_SIZE] = (
+            self.int2byte(size)
+        )
 
         # record metadata
-        self.metadata[self.monotonic_id_end %
-                      self.ID_MAX] = self.data_buffer_end
+        self.metadata[self.monotonic_id_end % self.ID_MAX] = self.data_buffer_end
         # update buffer and monotonic id indices
         current_buffer_end = self.data_buffer_end
         current_id_end = self.monotonic_id_end
@@ -230,23 +236,26 @@ class SingleWriterShmRingBuffer:
         buf_idx = address % self.data_buffer_size
 
         # read metadata
-        metadata_buff = self.shared_memory.buf[buf_idx:buf_idx + self.MD_SIZE]
-        id = self.byte2int(metadata_buff[:self.ID_NBYTES])
-        size = self.byte2int(metadata_buff[self.ID_NBYTES:self.MD_SIZE])
+        metadata_buff = self.shared_memory.buf[buf_idx : buf_idx + self.MD_SIZE]
+        id = self.byte2int(metadata_buff[: self.ID_NBYTES])
+        size = self.byte2int(metadata_buff[self.ID_NBYTES : self.MD_SIZE])
 
         # yield the data buffer and metadata
-        data_buff = self.shared_memory.buf[buf_idx + self.MD_SIZE:buf_idx +
-                                           size]
-        with (memoryview(data_buff) as data_view, ):
+        data_buff = self.shared_memory.buf[buf_idx + self.MD_SIZE : buf_idx + size]
+        with (
+            memoryview(data_buff) as data_view,
+        ):
             yield data_view, (id, size)
 
-    def free_buf(self,
-                 is_free_fn: Callable[[int, memoryview], bool],
-                 nbytes: Optional[int] = None) -> Iterable[int]:
-        '''
+    def free_buf(
+        self,
+        is_free_fn: Callable[[int, memoryview], bool],
+        nbytes: int | None = None,
+    ) -> Iterable[int]:
+        """
         Free a buffer of the given size. This is a no-op in shared memory,
         but we need to keep track of the metadata.
-        
+
         If freed memory spreads across the end and start of the ring buffer,
         the actual freed memory will be in two segments. In this case there
         still might not be a contiguous space of `nbytes` available.
@@ -254,13 +263,15 @@ class SingleWriterShmRingBuffer:
         Args:
             nbytes (int, optional): The size of the buffer to free. If None,
                 frees the maximum size of the ring buffer.
-        '''
+        """
 
         assert self.is_writer, "Only the writer can free buffers."
         logger.debug(
             "Freeing up space in the ring buffer, "
             "monotonic_id_start: %d, monotonic_id_end: %d",
-            self.monotonic_id_start, self.monotonic_id_end)
+            self.monotonic_id_start,
+            self.monotonic_id_end,
+        )
         monotonic_id_before = self.monotonic_id_start
         # if nbytes is None, free up the maximum size of the ring buffer
         if nbytes is None:
@@ -272,9 +283,18 @@ class SingleWriterShmRingBuffer:
                 if is_free_fn(self.monotonic_id_start, data_buff):
                     # check passed, we can free the buffer
                     del self.metadata[self.monotonic_id_start]
-                    self.monotonic_id_start = ((self.monotonic_id_start + 1) %
-                                               self.ID_MAX)
-                    self.data_buffer_start = address
+                    self.monotonic_id_start = (
+                        self.monotonic_id_start + 1
+                    ) % self.ID_MAX
+                    if self.monotonic_id_start in self.metadata:
+                        # pointing to the start addr of next allocation
+                        self.data_buffer_start += (
+                            self.metadata[self.monotonic_id_start]
+                            - self.data_buffer_start
+                        ) % self.data_buffer_size
+                    else:
+                        # no remaining allocation, reset to zero
+                        self.data_buffer_start = self.data_buffer_end = 0
                     freed_bytes += metadata[1]
                 else:
                     # there are still readers, we cannot free the buffer
@@ -282,8 +302,11 @@ class SingleWriterShmRingBuffer:
 
         logger.debug(
             "Freed %d bytes from the ring buffer, "
-            "monotonic_id_start: %d, monotonic_id_end: %d", freed_bytes,
-            self.monotonic_id_start, self.monotonic_id_end)
+            "monotonic_id_start: %d, monotonic_id_end: %d",
+            freed_bytes,
+            self.monotonic_id_start,
+            self.monotonic_id_end,
+        )
 
         # buffer wrap around
         if self.data_buffer_start >= self.data_buffer_size:
@@ -295,12 +318,12 @@ class SingleWriterShmRingBuffer:
         if monotonic_id_after >= monotonic_id_before:
             return range(monotonic_id_before, monotonic_id_after)
         else:
-            return chain(range(monotonic_id_before, self.ID_MAX),
-                         range(0, monotonic_id_after))
+            return chain(
+                range(monotonic_id_before, self.ID_MAX), range(0, monotonic_id_after)
+            )
 
 
 class ObjectSerde(ABC):
-
     @abstractmethod
     def serialize(self, value: Any) -> tuple[Any, int, bytes, int]:
         """Serialize an object to bytes."""
@@ -313,20 +336,17 @@ class ObjectSerde(ABC):
 
 
 class MsgpackSerde(ObjectSerde):
-
     def __init__(self):
         # Delayed import to avoid circular dependency
         from vllm.multimodal.inputs import MultiModalKwargsItem
         from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 
         self.encoder = MsgpackEncoder()
-        self.tensor_decoder = MsgpackDecoder(torch.Tensor)
-        self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem)
+        self.tensor_decoder = MsgpackDecoder(torch.Tensor, share_mem=False)
+        self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem, share_mem=False)
         self._mm_kwargs_item_cls = MultiModalKwargsItem
 
-    def serialize(
-            self,
-            value: Any) -> tuple[Union[bytes, list[bytes]], int, bytes, int]:
+    def serialize(self, value: Any) -> tuple[bytes | list[bytes], int, bytes, int]:
         len_arr = None
         if isinstance(value, (torch.Tensor, self._mm_kwargs_item_cls)):
             type_name = type(value).__name__
@@ -339,21 +359,22 @@ class MsgpackSerde(ObjectSerde):
             nbytes = len(value)
 
         object_metadata = (type_name, nbytes, len_arr)
-        serialized_metadata = pickle.dumps(object_metadata,
-                                           protocol=pickle.HIGHEST_PROTOCOL)
+        serialized_metadata = pickle.dumps(
+            object_metadata, protocol=pickle.HIGHEST_PROTOCOL
+        )
         return value, nbytes, serialized_metadata, len(serialized_metadata)
 
     def deserialize(self, data_view: memoryview) -> Any:
         # pickle.loads do not read past the end of a pickled object
         # within a large buffer, so we can skip storing the metadata size
         type_name, nbytes, len_arr = pickle.loads(data_view)
-        serialized_data = bytearray(data_view[-nbytes:])
+        serialized_data = data_view[-nbytes:]
 
         if type_name == torch.Tensor.__name__:
             obj = []
             start_idx = 0
             for length in len_arr:
-                item_bytes = serialized_data[start_idx:start_idx + length]
+                item_bytes = serialized_data[start_idx : start_idx + length]
                 obj.append(item_bytes)
                 start_idx += length
             obj = self.tensor_decoder.decode(obj)
@@ -361,15 +382,14 @@ class MsgpackSerde(ObjectSerde):
             obj = []
             start_idx = 0
             for length in len_arr:
-                item_bytes = serialized_data[start_idx:start_idx + length]
+                item_bytes = serialized_data[start_idx : start_idx + length]
                 obj.append(item_bytes)
                 start_idx += length
             obj = self.mm_decoder.decode(obj)
         elif type_name == bytes.__name__:
             obj = pickle.loads(serialized_data)
         else:
-            raise ValueError(
-                f"Unsupported object type '{type_name}' in metadata")
+            raise ValueError(f"Unsupported object type '{type_name}' in metadata")
 
         return obj
 
@@ -380,7 +400,7 @@ class ShmObjectStorageHandle:
     n_readers: int
     ring_buffer_handle: tuple[int, str]
     serde_class: type[ObjectSerde]
-    reader_lock: Optional[LockType]
+    reader_lock: LockType | None
 
 
 class SingleWriterShmObjectStorage:
@@ -388,18 +408,18 @@ class SingleWriterShmObjectStorage:
     A single-writer, multiple-reader object storage system built on top of a
     shared memory ring buffer. Provides key-value storage with automatic memory
     management and cross-process serialization support.
-    
+
     This storage system follows a FIFO (First-In-First-Out) eviction policy
     where the oldest objects are automatically freed when memory runs low.
     Memory is reclaimed based on reader reference counting - objects are only
     freed when all readers have finished accessing them.
-    
+
     Architecture:
     - Single writer process can put(key, value) objects
     - Multiple reader processes can get(address, monotonic_id) objects
     - Built on SingleWriterShmRingBuffer for efficient shared memory management
     - Thread-safe operations with reader synchronization via locks
-    
+
     Key Features:
     - FIFO Eviction: Oldest objects are evicted first when memory is full
     - Reference Counting: Objects are only freed when no readers are
@@ -414,7 +434,7 @@ class SingleWriterShmObjectStorage:
 
     Memory Layout per Object:
     `[4-byte reference_count][metadata_size][serialized_object_data]`
-    
+
     Thread Safety:
     - Writer operations (put, clear) are single-threaded by design
     - Reader operations (get) are thread-safe with lock-based reference
@@ -428,7 +448,7 @@ class SingleWriterShmObjectStorage:
         n_readers: int,
         ring_buffer: SingleWriterShmRingBuffer,
         serde_class: type[ObjectSerde] = MsgpackSerde,
-        reader_lock: Optional[LockType] = None,
+        reader_lock: LockType | None = None,
     ):
         """
         Initialize the object storage.
@@ -476,24 +496,23 @@ class SingleWriterShmObjectStorage:
 
     def copy_to_buffer(
         self,
-        data: Union[bytes, list[bytes]],
+        data: bytes | list[bytes],
         data_bytes: int,
         metadata: bytes,
         md_bytes: int,
         data_view: memoryview,
     ) -> None:
-        data_view[self.flag_bytes:self.flag_bytes + md_bytes] = metadata
+        data_view[self.flag_bytes : self.flag_bytes + md_bytes] = metadata
         if isinstance(data, bytes):
             data_view[-data_bytes:] = data
         elif isinstance(data, list):
             start_idx = self.flag_bytes + md_bytes
             for item_bytes in data:
                 item_size = len(item_bytes)
-                data_view[start_idx:start_idx + item_size] = item_bytes
+                data_view[start_idx : start_idx + item_size] = item_bytes
                 start_idx += item_size
         else:
-            raise ValueError(
-                f"Unsupported data type for serialization: {type(data)}")
+            raise ValueError(f"Unsupported data type for serialization: {type(data)}")
 
     def increment_writer_flag(self, id: int) -> None:
         """Set the in-use flag for the writer."""
@@ -509,8 +528,9 @@ class SingleWriterShmObjectStorage:
         """Free unused buffers in the ring buffer."""
         # try to free up 2*max_object_size bytes of space in the ring buffer,
         # since the buffer might be fragmented
-        freed_ids = self.ring_buffer.free_buf(self.default_is_free_check,
-                                              2 * self.max_object_size)
+        freed_ids = self.ring_buffer.free_buf(
+            self.default_is_free_check, 2 * self.max_object_size
+        )
         # update the metadata after freeing up space
         for freed_id in freed_ids:
             key_to_free = self.id_index[freed_id]
@@ -537,7 +557,7 @@ class SingleWriterShmObjectStorage:
         Store a key-value pair in the object storage.
         Attempts to free max_object_size bytes using FIFO order
         when the ring buffer runs out of space during a put() operation.
-        
+
         Args:
             key: String key to identify the object
             value: Any serializable Python object
@@ -550,15 +570,17 @@ class SingleWriterShmObjectStorage:
         if key in self.key_index:
             raise ValueError(f"Key '{key}' already exists in the storage.")
 
-        object_data, data_bytes, object_metadata, md_bytes = \
-            self.ser_de.serialize(value)
+        object_data, data_bytes, object_metadata, md_bytes = self.ser_de.serialize(
+            value
+        )
         buffer_size = self.flag_bytes + data_bytes + md_bytes
 
         # Sanity checks
         if buffer_size > self.max_object_size:
             raise ValueError(
                 f"Serialized object size ({buffer_size} bytes) exceeds "
-                f"max object size ({self.max_object_size} bytes)")
+                f"max object size ({self.max_object_size} bytes)"
+            )
 
         # Allocate new buffer
         try:
@@ -570,9 +592,10 @@ class SingleWriterShmObjectStorage:
 
         # Write data to buffer
         with self.ring_buffer.access_buf(address) as (data_view, metadata):
-            data_view[:self.flag_bytes] = self.ring_buffer.int2byte(0)
-            self.copy_to_buffer(object_data, data_bytes, object_metadata,
-                                md_bytes, data_view)
+            data_view[: self.flag_bytes] = self.ring_buffer.int2byte(0)
+            self.copy_to_buffer(
+                object_data, data_bytes, object_metadata, md_bytes, data_view
+            )
         self.increment_writer_flag(monotonic_id)
 
         # Update key index
@@ -587,14 +610,15 @@ class SingleWriterShmObjectStorage:
             if buf_metadata[0] != monotonic_id:
                 raise ValueError(
                     f"Data for address:id '{address}:{monotonic_id}'"
-                    " has been modified or is invalid.")
+                    " has been modified or is invalid."
+                )
 
-            obj = self.ser_de.deserialize(data_view[self.flag_bytes:])
+            obj = self.ser_de.deserialize(data_view[self.flag_bytes :])
 
             # decrease the in-use flag for reader reads
             if self._reader_lock is not None:
                 with self._reader_lock:
-                    self.increment_reader_flag(data_view[:self.flag_bytes])
+                    self.increment_reader_flag(data_view[: self.flag_bytes])
             else:
                 # if self._reader_lock is None, it means we are the writer
                 # in this case, we do not need to decrease the reader count
@@ -614,7 +638,8 @@ class SingleWriterShmObjectStorage:
 
     @staticmethod
     def create_from_handle(
-            handle: ShmObjectStorageHandle) -> "SingleWriterShmObjectStorage":
+        handle: ShmObjectStorageHandle,
+    ) -> "SingleWriterShmObjectStorage":
         logger.debug("Creating storage from handle: %s", handle)
         ring_buffer = SingleWriterShmRingBuffer(*handle.ring_buffer_handle)
         return SingleWriterShmObjectStorage(
diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py
index 09012d16978d93773d901ed01c335e8c73cde460..eb1f173b119255a144d1f48186797c945f9e1546 100644
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -1,14 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
 
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
 from vllm.distributed.device_communicators.all_reduce_utils import (
-    SYMM_MEM_ALL_REDUCE_MAX_SIZES)
+    SYMM_MEM_ALL_REDUCE_MAX_SIZES,
+)
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.platforms import current_platform
 
 try:
@@ -28,20 +31,20 @@ class SymmMemCommunicator:
     }
 
     def __init__(
-            self,
-            group: ProcessGroup,
-            device: Union[int, str, torch.device],
-            # add options for testing
-            force_multimem: Optional[bool] = None,
-            max_size_override: Optional[int] = None):
+        self,
+        group: ProcessGroup,
+        device: int | str | torch.device,
+        # add options for testing
+        force_multimem: bool | None = None,
+        max_size_override: int | None = None,
+    ):
         self.disabled = True
 
         if not symm_mem_available:
             return
 
         if not current_platform.is_cuda():
-            logger.warning("SymmMemCommunicator: symmetric "
-                           "memory is not available.")
+            logger.warning("SymmMemCommunicator: symmetric memory is not available.")
             return
         if isinstance(device, int):
             device = torch.device(f"cuda:{device}")
@@ -52,8 +55,14 @@ class SymmMemCommunicator:
         self.device = device
         self.group = group
         self.world_size = dist.get_world_size(self.group)
-        self.device_capability = current_platform.get_device_capability(
-        ).as_version_str()
+        capability = current_platform.get_device_capability()
+        if capability is None:
+            logger.warning(
+                "SymmMemCommunicator: device capability is unknown, "
+                "communicator is not available."
+            )
+            return
+        self.device_capability = capability.as_version_str()
         if self.device_capability not in SYMM_MEM_ALL_REDUCE_MAX_SIZES:
             logger.warning(
                 "SymmMemCommunicator: Device capability %s not supported, "
@@ -61,8 +70,7 @@ class SymmMemCommunicator:
                 self.device_capability,
             )
             return
-        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[
-                self.device_capability]:
+        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]:
             logger.warning(
                 "SymmMemCommunicator: World size %d not supported, "
                 "communicator is not available.",
@@ -77,21 +85,34 @@ class SymmMemCommunicator:
                 self.max_size,
             )
         else:
-            self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[
-                self.device_capability][self.world_size]
-
-        self.buffer = torch_symm_mem.empty(
-            self.max_size // self.dtype.itemsize,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+            self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
+                self.world_size
+            ]
+        try:
+            self.buffer = torch_symm_mem.empty(
+                self.max_size // self.dtype.itemsize,
+                device=self.device,
+                dtype=self.dtype,
+            )
+            handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+        except RuntimeError as e:
+            logger.warning_once(
+                "SymmMemCommunicator: symmetric memory initialization failed: %s "
+                "Communicator is not available. To suppress this warning set "
+                "VLLM_ALLREDUCE_USE_SYMM_MEM=0",
+                str(e),
+            )
+            return
         if handle.multicast_ptr == 0:
-            logger.warning("SymmMemCommunicator: symmetric memory "
-                           "multicast operations are not supported.")
+            logger.warning(
+                "SymmMemCommunicator: symmetric memory "
+                "multicast operations are not supported."
+            )
             return
         self.force_multimem = force_multimem
         self.disabled = False
+        if vllm_is_batch_invariant():
+            self.disabled = True
 
     def should_use_symm_mem(self, inp: torch.Tensor):
         if self.disabled:
@@ -104,15 +125,13 @@ class SymmMemCommunicator:
         return inp_size < self.max_size
 
     def all_reduce(
-            self,
-            inp: torch.Tensor,
-            *,
-            out: Optional[torch.Tensor] = None) -> Optional[torch.Tensor]:
+        self, inp: torch.Tensor, *, out: torch.Tensor | None = None
+    ) -> torch.Tensor | None:
         if not self.should_use_symm_mem(inp):
             return None
         if out is None:
             out = torch.empty_like(inp)
-        self.buffer[:inp.numel()].copy_(inp.view(-1))
+        self.buffer[: inp.numel()].copy_(inp.view(-1))
 
         # Determine which algorithm to use
         use_multimem = False
@@ -121,16 +140,17 @@ class SymmMemCommunicator:
             use_multimem = self.force_multimem
         else:
             # Normal logic: use multimem for supported world sizes
-            use_multimem = self.world_size in self._WORLD_SIZES_MULTIMEM[
-                self.device_capability]
+            use_multimem = (
+                self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability]
+            )
 
         if use_multimem:
-            torch.ops.symm_mem.multimem_all_reduce_(self.buffer[:inp.numel()],
-                                                    "sum",
-                                                    self.group.group_name)
+            torch.ops.symm_mem.multimem_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
         else:
-            torch.ops.symm_mem.two_shot_all_reduce_(self.buffer[:inp.numel()],
-                                                    "sum",
-                                                    self.group.group_name)
-        out.copy_(self.buffer[:inp.numel()].view(out.shape))
+            torch.ops.symm_mem.two_shot_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        out.copy_(self.buffer[: inp.numel()].view(out.shape))
         return out
diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py
index 942dd67f065dcc38230213211fd92936a1f300a3..a7724a86cc6a522975c01633c5a6cbd3b79a4e28 100644
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional
 
 import torch
 from torch.distributed import ProcessGroup
@@ -10,35 +9,39 @@ from torch.distributed import ProcessGroup
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.platforms.tpu import USE_TPU_COMMONS
+from vllm.platforms.tpu import USE_TPU_INFERENCE
 
 from .base_device_communicator import DeviceCommunicatorBase
 
-USE_RAY = parallel_config = get_current_vllm_config(
-).parallel_config.distributed_executor_backend == "ray"
+USE_RAY = parallel_config = (
+    get_current_vllm_config().parallel_config.distributed_executor_backend == "ray"
+)
 
 logger = init_logger(__name__)
 
-if not USE_TPU_COMMONS:
-    logger.info("tpu_commons not found, using vLLM's TpuCommunicator")
+if not USE_TPU_INFERENCE:
+    logger.info("tpu_inference not found, using vLLM's TpuCommunicator")
     if current_platform.is_tpu():
         import torch_xla
         import torch_xla.core.xla_model as xm
         import torch_xla.runtime as xr
         from torch_xla._internal import pjrt
         from torch_xla.distributed.xla_multiprocessing import (
-            create_optimized_replica_groups)
+            create_optimized_replica_groups,
+        )
+
         if USE_RAY:
-            from vllm.executor import ray_utils
+            from vllm.v1.executor import ray_utils
 
 
 class TpuCommunicator(DeviceCommunicatorBase):
-
-    def __init__(self,
-                 cpu_group: ProcessGroup,
-                 device: Optional[torch.device] = None,
-                 device_group: Optional[ProcessGroup] = None,
-                 unique_name: str = ""):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
         super().__init__(cpu_group, device, device_group, unique_name)
 
         # NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node
@@ -96,7 +99,9 @@ class TpuCommunicator(DeviceCommunicatorBase):
         return xm.all_gather(input_, dim=dim)
 
 
-if USE_TPU_COMMONS:
-    from tpu_commons.distributed.device_communicators import (
-        TpuCommunicator as TpuCommonsCommunicator)
-    TpuCommunicator = TpuCommonsCommunicator  # type: ignore
+if USE_TPU_INFERENCE:
+    from tpu_inference.distributed.device_communicators import (
+        TpuCommunicator as TpuInferenceCommunicator,
+    )
+
+    TpuCommunicator = TpuInferenceCommunicator  # type: ignore
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
index 27bd176554afa5752237252a212b045f4386f50d..ad61fdfb8ea526b3bf24247c6081330a886c0501 100644
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
-import vllm.envs as envs
 from vllm.logger import init_logger
 
 from .base_device_communicator import DeviceCommunicatorBase
@@ -16,23 +14,25 @@ logger = init_logger(__name__)
 
 
 class XpuCommunicator(DeviceCommunicatorBase):
-
-    def __init__(self,
-                 cpu_group: ProcessGroup,
-                 device: Optional[torch.device] = None,
-                 device_group: Optional[ProcessGroup] = None,
-                 unique_name: str = ""):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
         super().__init__(cpu_group, device, device_group, unique_name)
         if self.use_all2all:
-            all2all_backend = envs.VLLM_ALL2ALL_BACKEND
-            if all2all_backend != "naive":
+            if self.all2all_backend != "naive":
                 logger.warning(
-                    "`%s` all2all manager is not supported on XPU."
+                    "`%s` all2all manager is not supported on XPU. "
                     "Falling back to `naive` all2all manager for XPU.",
-                    all2all_backend)
-                all2all_backend = "naive"
-            if all2all_backend == "naive":
+                    self.all2all_backend,
+                )
+                self.all2all_backend = "naive"
+            if self.all2all_backend == "naive":
                 from .all2all import NaiveAll2AllManager
+
                 self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
                 logger.info("Using naive all2all manager.")
 
@@ -40,12 +40,12 @@ class XpuCommunicator(DeviceCommunicatorBase):
         dist.all_reduce(input_, group=self.device_group)
         return input_
 
-    def gather(self,
-               input_: torch.Tensor,
-               dst: int = 0,
-               dim: int = -1) -> Optional[torch.Tensor]:
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
         assert -input_.dim() <= dim < input_.dim(), (
-            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
@@ -53,20 +53,19 @@ class XpuCommunicator(DeviceCommunicatorBase):
         # cluster so we use all_gather instead for now.
         input_size = input_.size()
         # Allocate output tensor.
-        output_tensor = torch.empty((self.world_size, ) + input_size,
-                                    dtype=input_.dtype,
-                                    device=input_.device)
+        output_tensor = torch.empty(
+            (self.world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
         # All-gather.
-        dist.all_gather_into_tensor(output_tensor,
-                                    input_,
-                                    group=self.device_group)
+        dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group)
         if self.rank_in_group == dst:
             # Reshape
             output_tensor = output_tensor.movedim(0, dim)
-            output_tensor = output_tensor.reshape(input_size[:dim] +
-                                                  (self.world_size *
-                                                   input_size[dim], ) +
-                                                  input_size[dim + 1:])
+            output_tensor = output_tensor.reshape(
+                input_size[:dim]
+                + (self.world_size * input_size[dim],)
+                + input_size[dim + 1 :]
+            )
         else:
             output_tensor = None
         return output_tensor
@@ -78,17 +77,19 @@ class XpuCommunicator(DeviceCommunicatorBase):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         assert self.all2all_manager is not None
         hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits, is_sequence_parallel)
+            hidden_states, router_logits, is_sequence_parallel
+        )
         return hidden_states, router_logits
 
-    def combine(self,
-                hidden_states: torch.Tensor,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         assert self.all2all_manager is not None
-        hidden_states = self.all2all_manager.combine(hidden_states,
-                                                     is_sequence_parallel)
+        hidden_states = self.all2all_manager.combine(
+            hidden_states, is_sequence_parallel
+        )
         return hidden_states
diff --git a/vllm/distributed/ec_transfer/__init__.py b/vllm/distributed/ec_transfer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0decfd143e3431271fbaedc79624d4c0ece7771c
--- /dev/null
+++ b/vllm/distributed/ec_transfer/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.distributed.ec_transfer.ec_transfer_state import (
+    ensure_ec_transfer_initialized,
+    get_ec_transfer,
+    has_ec_transfer,
+)
+
+__all__ = [
+    "get_ec_transfer",
+    "ensure_ec_transfer_initialized",
+    "has_ec_transfer",
+]
diff --git a/vllm/distributed/ec_transfer/ec_connector/__init__.py b/vllm/distributed/ec_transfer/ec_connector/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/distributed/ec_transfer/ec_connector/base.py b/vllm/distributed/ec_transfer/ec_connector/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b7b14d89b8a13893cd4e12b4437713716266bcd
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_connector/base.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ECConnectorBase Class for Distributed Encoder Cache &
+P2P Encoder cache communication in V1
+
+The class provides the following primitives:
+    Scheduler-side: runs in the scheduler, binds metadata, which
+    is used by the worker-side to load/save Encoder cache.
+        check_caches_exist() - Check whether Encoder cache of requests exist
+        update_state_after_alloc() - update ECConnector state after
+        allocate. This will decide to load the cache or not
+        request_finished() - called when a request is finished,
+        free the cache with the requests
+
+    Worker-side: runs in each worker, loads/saves Encoder Cache to/from
+    the Connector based on the metadata.
+        start_load_ec() - starts loading all ECs (maybe async)
+        wait_for_save() - blocks until all saves are done
+
+        get_finished() - called with ids of finished requests, returns
+            ids of requests that have completed async sending/recving.
+"""
+
+import enum
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import ECConnectorOutput
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+class ECConnectorRole(enum.Enum):
+    # Connector running in the scheduler process
+    SCHEDULER = 0
+
+    # Connector running in the worker process
+    WORKER = 1
+
+
+class ECConnectorMetadata(ABC):  # noqa: B024
+    """
+    Abstract Metadata used to communicate between the
+    Scheduler ECConnector and Worker ECConnector.
+    """
+
+    pass
+
+
+class ECConnectorBase(ABC):
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        self._connector_metadata: ECConnectorMetadata | None = None
+        self._vllm_config = vllm_config
+        self._role = role
+        if vllm_config.ec_transfer_config is not None:
+            self._is_producer = vllm_config.ec_transfer_config.is_ec_producer
+        else:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+
+    @property
+    def role(self) -> ECConnectorRole:
+        return self._role
+
+    @property
+    def is_producer(self) -> bool:
+        return self._is_producer
+
+    # ==============================
+    # Worker-side methods
+    # ==============================
+
+    def bind_connector_metadata(self, connector_metadata: ECConnectorMetadata) -> None:
+        """Set the connector metadata from the scheduler.
+
+        This function should be called by the model runner every time
+        before the model execution. The metadata will be used for runtime
+        EC cache loading.
+
+        Args:
+            connector_metadata (dict): the connector metadata.
+        """
+        self._connector_metadata = connector_metadata
+
+    def clear_connector_metadata(self) -> None:
+        """Clear the connector metadata.
+
+        This function should be called by the model runner every time
+        after the model execution.
+        """
+        self._connector_metadata = None
+
+    def _get_connector_metadata(self) -> ECConnectorMetadata:
+        """Get the connector metadata.
+
+        This function should only be called inside the connector.
+
+        Returns:
+            ConnectorMetadata: the connector metadata.
+        """
+
+        # Should only be called while set to valid metadata.
+        assert self._connector_metadata is not None
+        return self._connector_metadata
+
+    def register_caches(
+        self,
+        ec_caches: dict[str, torch.Tensor],
+    ):
+        """
+        Initialize with the EC caches.
+        Args:
+            ec_caches: dictionary of encoder cache
+        """
+        # TODO: Implement this later for P2P feature
+        return
+
+    @abstractmethod
+    def start_load_caches(
+        self, encoder_cache: dict[str, torch.Tensor], **kwargs
+    ) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        This method loads the encoder cache based on metadata provided by the scheduler.
+        It is called before `_gather_mm_embeddings` for the EC Connector. For EC,
+        the `encoder_cache` and `mm_hash` are stored in `kwargs`.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        pass
+
+    @abstractmethod
+    def save_caches(
+        self, encoder_cache: dict[str, torch.Tensor], mm_hash: str, **kwargs
+    ) -> None:
+        """
+        Save the encoder cache to the connector.
+
+        This method saves the encoder cache from the worker's local storage
+        to shared storage or another external connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            mm_hash (str): The hash of the multimodal data whose cache is being saved.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        pass
+
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        """
+        Notifies worker-side connector ids of requests that have
+        finished generating tokens on the worker.
+        The scheduler process (via the Executors) will use this output
+        to track which workers are done.
+
+        Returns:
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
+            tuple of (sending/saving ids, recving/loading ids).
+            The finished saves/sends req ids must belong to a set provided in a
+            call to this method (this call or a prior one).
+        """
+        return None, None
+
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+
+    @abstractmethod
+    def has_caches(
+        self,
+        request: "Request",
+    ) -> list[bool]:
+        """
+        Check if encoder cache exists for each mm data of requests
+
+        Args:
+            request (Request): the request object.
+
+        Returns:
+            A list bool where ith value is True if cache exist for
+            ith mm_data of requests
+        """
+        pass
+
+    @abstractmethod
+    def update_state_after_alloc(self, request: "Request", index: int):
+        """
+        Update ECConnector state to decide allocate cache for requests
+
+        Args:
+            request (Request): the request object.
+        """
+        pass
+
+    @abstractmethod
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> ECConnectorMetadata:
+        """
+        Build the connector metadata for this step.
+
+        This function should NOT modify fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        pass
+
+    def update_connector_output(self, connector_output: ECConnectorOutput):
+        """
+        Update ECConnector state from worker-side connectors output.
+
+        Args:
+            connector_output (ECConnectorOutput): the worker-side
+                connectors output.
+        """
+        return
+
+    def request_finished(
+        self, request: "Request"
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Called when a request has finished, before its encoder cache is freed.
+
+        Returns:
+            True if the request is being saved/sent asynchronously and cached
+            should not be freed until the request_id is returned from
+            get_finished().
+        """
+        return False, None
diff --git a/vllm/distributed/ec_transfer/ec_connector/factory.py b/vllm/distributed/ec_transfer/ec_connector/factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfdf51d775bda0c965df2e9d4eafa79295470d21
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_connector/factory.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+# yapf: disable
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorRole,
+)
+from vllm.logger import init_logger
+
+# yapf: enable
+
+if TYPE_CHECKING:
+    from vllm.config import ECTransferConfig, VllmConfig
+
+logger = init_logger(__name__)
+
+
+class ECConnectorFactory:
+    _registry: dict[str, Callable[[], type[ECConnectorBase]]] = {}
+
+    @classmethod
+    def register_connector(cls, name: str, module_path: str, class_name: str) -> None:
+        """Register a connector with a lazy-loading module and class name."""
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+
+        def loader() -> type[ECConnectorBase]:
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+
+        cls._registry[name] = loader
+
+    @classmethod
+    def create_connector(
+        cls,
+        config: "VllmConfig",
+        role: ECConnectorRole,
+    ) -> ECConnectorBase:
+        ec_transfer_config = config.ec_transfer_config
+        if ec_transfer_config is None:
+            raise ValueError("ec_transfer_config must be set to create a connector")
+        connector_cls = cls.get_connector_class(ec_transfer_config)
+        logger.info(
+            "Creating connector with name: %s and engine_id: %s",
+            connector_cls.__name__,
+            ec_transfer_config.engine_id,
+        )
+        # Connector is explicitly separated into two roles.
+        # Scheduler connector:
+        # - Co-locate with scheduler process
+        # - Should only be used inside the Scheduler class
+        # Worker connector:
+        # - Co-locate with worker process
+        return connector_cls(config, role)
+
+    @classmethod
+    def get_connector_class(
+        cls, ec_transfer_config: "ECTransferConfig"
+    ) -> type[ECConnectorBase]:
+        """Get the connector class by name."""
+        connector_name = ec_transfer_config.ec_connector
+        if connector_name is None:
+            raise ValueError("EC connect must not be None")
+        elif connector_name in cls._registry:
+            connector_cls = cls._registry[connector_name]()
+        else:
+            connector_module_path = ec_transfer_config.ec_connector_module_path
+            if connector_module_path is None:
+                raise ValueError(f"Unsupported connector type: {connector_name}")
+            connector_module = importlib.import_module(connector_module_path)
+            connector_cls = getattr(connector_module, connector_name)
+        return connector_cls
+
+
+# Register various connectors here.
+# The registration should not be done in each individual file, as we want to
+# only load the files corresponding to the current connector.
+
+ECConnectorFactory.register_connector(
+    "ECSharedStorageConnector",
+    "vllm.distributed.ec_transfer.ec_connector.shared_storage_connector",
+    "ECSharedStorageConnector",
+)
diff --git a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8388141dcc974bcc0c3da8c159c914fbae3baa3
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import safetensors
+
+from vllm.config import VllmConfig
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorMetadata,
+    ECConnectorRole,
+)
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+
+if TYPE_CHECKING:
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class MMMeta:
+    mm_hash: str
+    num_token: int
+
+    @staticmethod
+    def make_meta(mm_hash, num_token) -> "MMMeta":
+        return MMMeta(mm_hash=mm_hash, num_token=num_token)
+
+
+@dataclass
+class ECSharedStorageConnectorMetadata(ECConnectorMetadata):
+    mm_datas: list[MMMeta]
+
+    def __init__(self):
+        self.mm_datas = []
+
+    def add_mm_data(self, mm_data: MMMeta):
+        self.mm_datas.append(mm_data)
+
+
+class ECSharedStorageConnector(ECConnectorBase):
+    # NOTE: This is Simple debug implementation of the EC connector.
+    # It save / load the EC cache to / from the disk.
+
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        # req_id -> index
+        self._mm_datas_need_loads: dict[str, int] = {}
+        transfer_config = vllm_config.ec_transfer_config
+        if transfer_config is not None:
+            self._storage_path = transfer_config.get_from_extra_config(
+                "shared_storage_path", "/tmp"
+            )
+            logger.debug(transfer_config)
+            logger.debug("Shared storage path is %s", self._storage_path)
+        else:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+
+    def start_load_caches(self, encoder_cache, **kwargs) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        This method loads the encoder cache based on metadata provided by the scheduler.
+        It is called before `_gather_mm_embeddings` for the EC Connector. For EC,
+        the `encoder_cache` and `mm_hash` are stored in `kwargs`.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+
+        # Get the metadata
+        metadata: ECConnectorMetadata = self._get_connector_metadata()
+        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert encoder_cache is not None
+        if metadata is None:
+            logger.warning(
+                (
+                    "In connector.start_load_caches, ",
+                    "but the connector metadata is None",
+                )
+            )
+            return
+        # Load the EC for each mm data
+        for mm_data in metadata.mm_datas:
+            if mm_data.mm_hash in encoder_cache:
+                continue
+            filename = self._generate_filename_debug(mm_data.mm_hash)
+            ec_cache = safetensors.torch.load_file(filename)["ec_cache"].cuda()
+            encoder_cache[mm_data.mm_hash] = ec_cache
+            logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash)
+
+    def save_caches(self, encoder_cache, mm_hash, **kwargs) -> None:
+        """
+        Save the encoder cache to the connector.
+
+        This method saves the encoder cache from the worker's local storage
+        to shared storage or another external connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            mm_hash (str): The hash of the multimodal data whose cache is being saved.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        # Return if it is PD Instance
+        if not self.is_producer:
+            return
+        filename = self._generate_filename_debug(mm_hash)
+        ec_cache = encoder_cache[mm_hash]
+        tensors = {"ec_cache": ec_cache.detach().cpu()}
+        safetensors.torch.save_file(tensors, filename)
+        logger.debug("Save cache successful for mm_hash %s", mm_hash)
+
+    def has_caches(
+        self,
+        request: "Request",
+    ) -> list[bool]:
+        """
+        Check if cache exist externally for each mm_data of request
+
+        Args:
+            request (Request): the request object.
+
+        Returns:
+            List of bool indicate that ith mm_data exist in cache or not
+        """
+        result = []
+        for feature in request.mm_features:
+            result.append(self._found_match_for_mm_data(feature.identifier))
+        return result
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        index: int,
+    ) -> None:
+        """
+        Update ECConnector state after encoder cache allocation.
+        """
+        mm_hash = request.mm_features[index].identifier
+        num_encoder_token = request.get_num_encoder_tokens(index)
+        # Insert mm_hash only if this block has not been recorded yet.
+        self._mm_datas_need_loads[mm_hash] = num_encoder_token
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> ECConnectorMetadata:
+        """Build the connector metadata for this step.
+
+        This function should NOT modify any fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+        This only build for load mm_data only
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        meta = ECSharedStorageConnectorMetadata()
+        for mm_hash, num_encoder_token in self._mm_datas_need_loads.items():
+            meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token))
+        self._mm_datas_need_loads.clear()
+        return meta
+
+    # ==============================
+    # Helper functions
+    # ==============================
+
+    def _found_match_for_mm_data(self, mm_hash) -> bool:
+        """Check if the cache is hit for the request."""
+        filename = self._generate_filename_debug(mm_hash)
+        return os.path.exists(filename)
+
+    def _generate_foldername_debug(
+        self,
+        mm_hash: str,
+        create_folder: bool = True,  # <- now defaults to True
+    ) -> str:
+        """
+        Return the folder in which the cache for this mm_hash lives.
+        If `create_folder` is True (default) the directory is created
+        recursively the first time it is needed.
+        """
+        foldername = os.path.join(self._storage_path, mm_hash)
+        if create_folder:
+            os.makedirs(foldername, exist_ok=True)
+        return foldername
+
+    def _generate_filename_debug(self, mm_hash: str) -> str:
+        """
+        Return the full path of the safetensors file for this mm_hash.
+        Ensures the parent directory exists because
+        `_generate_foldername_debug` is called with its default
+        (`create_folder=True`).
+        """
+        foldername = self._generate_foldername_debug(mm_hash)  # <- folder auto-created
+        return os.path.join(foldername, "encoder_cache.safetensors")
diff --git a/vllm/distributed/ec_transfer/ec_transfer_state.py b/vllm/distributed/ec_transfer/ec_transfer_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef3c978b36a3a799afa110ae10cee5c3a840e0e1
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_transfer_state.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING
+
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorRole,
+)
+from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+_EC_CONNECTOR_AGENT: ECConnectorBase | None = None
+
+
+def get_ec_transfer() -> ECConnectorBase:
+    assert _EC_CONNECTOR_AGENT is not None, "disaggregated EC cache is not initialized"
+    return _EC_CONNECTOR_AGENT
+
+
+def has_ec_transfer() -> bool:
+    return _EC_CONNECTOR_AGENT is not None
+
+
+def ensure_ec_transfer_initialized(vllm_config: "VllmConfig") -> None:
+    """
+    Initialize EC cache connector.
+    """
+
+    global _EC_CONNECTOR_AGENT
+
+    if vllm_config.ec_transfer_config is None:
+        return
+
+    if (
+        vllm_config.ec_transfer_config.is_ec_transfer_instance
+        and _EC_CONNECTOR_AGENT is None
+    ):
+        _EC_CONNECTOR_AGENT = ECConnectorFactory.create_connector(
+            config=vllm_config, role=ECConnectorRole.WORKER
+        )
diff --git a/vllm/distributed/eplb/__init__.py b/vllm/distributed/eplb/__init__.py
index 80511024b9304019793b1c43222af6c24bbe9ef4..4cd51dd384ad22a68ecc39d7c8b9fd5c79a80a8b 100644
--- a/vllm/distributed/eplb/__init__.py
+++ b/vllm/distributed/eplb/__init__.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-'''
+"""
 Expert parallelism load balancer (EPLB).
-'''
+"""
 
 from .eplb_state import *
 from .rebalance_algo import *
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 3e318d7848326a77379b2a8278fd86a45d6fd4f7..526d3ceac7b8fe2c6ac0eab0106fbda35f61c207 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -29,14 +29,16 @@ physical experts.
 import time
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 from torch.distributed import ProcessGroup, all_reduce
 
-from vllm.config import ParallelConfig
-from vllm.distributed.parallel_state import (get_ep_group, get_node_count,
-                                             in_the_same_node_as)
+from vllm.config import ModelConfig, ParallelConfig
+from vllm.distributed.parallel_state import (
+    get_ep_group,
+    get_node_count,
+    in_the_same_node_as,
+)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MixtureOfExperts
@@ -48,7 +50,7 @@ logger = init_logger(__name__)
 
 
 @dataclass
-class EplbState:
+class EplbModelState:
     """EPLB metrics."""
 
     physical_to_logical_map: torch.Tensor
@@ -128,34 +130,46 @@ class EplbState:
     See:
     https://github.com/vllm-project/vllm/pull/22167#pullrequestreview-3086143856
     """
-    expert_load_window_step: int = 0
-    """
-    Current step in the sliding window.
+    model_name: str
+    model: MixtureOfExperts
 
-    Different from `expert_rearrangement_step`, each EP rank may have its own
-    `expert_load_window_step`.
-    """
-    expert_load_window_size: int = 0
+
+class EplbState:
     """
-    Size of the expert load sliding window.
-    This is a constant and is taken from the config.
+    EplbState of each expert parallel model. Key is the model config hash.
     """
 
-    expert_rearrangement_step: int = 0
-    """
-    Steps after last rearrangement.
-    Will trigger a rearrangement if it exceeds the threshold.
+    def __init__(self, parallel_config: ParallelConfig, device: torch.device):
+        self.parallel_config = parallel_config
+        self.device = device
+        self.model_states: dict[str, EplbModelState] = {}
+        """
+        Current step in the sliding window.
 
-    NOTE: Keep in mind that all EP ranks need to have the same
-    `expert_rearrangement_step` value to ensure synchronization.
-    Otherwise, the rearrangement will hang at collective
-    communication calls.
-    """
-    expert_rearrangement_step_interval: int = 0
-    """
-    Interval for expert rearrangement steps.
-    This is a constant and is taken from the config.
-    """
+        Different from `expert_rearrangement_step`, 
+        each EP rank may have its own `expert_load_window_step`.
+        """
+        self.expert_load_window_step: int = 0
+        """
+        Size of the expert load sliding window.
+        This is a constant and is taken from the config.
+        """
+        self.expert_load_window_size: int = 0
+        """
+        Steps after last rearrangement.
+        Will trigger a rearrangement if it exceeds the threshold.
+
+        NOTE: Keep in mind that all EP ranks need to have the same
+        `expert_rearrangement_step` value to ensure synchronization.
+        Otherwise, the rearrangement will hang at collective
+        communication calls.
+        """
+        self.expert_rearrangement_step: int = 0
+        """
+        Interval for expert rearrangement steps.
+        This is a constant and is taken from the config.
+        """
+        self.expert_rearrangement_step_interval: int = 0
 
     @staticmethod
     def build_initial_global_physical_to_logical_map(
@@ -177,27 +191,63 @@ class EplbState:
         ]
         return global_physical_to_logical_map
 
-    @classmethod
-    def build(
-        cls,
+    def validate_ep_configuration(self, new_model: MixtureOfExperts):
+        """
+        Validate that the expert parallel configuration of
+        the new model is the same as the existing models.
+        """
+        if len(self.model_states) > 0:
+            model = next(iter(self.model_states.values())).model
+            if (
+                model.num_routed_experts != new_model.num_routed_experts
+                or model.num_redundant_experts != new_model.num_redundant_experts
+                or model.num_physical_experts != new_model.num_physical_experts
+                or model.num_logical_experts != new_model.num_logical_experts
+                or model.num_expert_groups != new_model.num_expert_groups
+            ):
+                raise RuntimeError(
+                    "Model: {} "
+                    "with config {} "
+                    "{} {} {} {} "
+                    "mismatch with new model {} "
+                    "with config {} "
+                    "{} {} {} {}".format(
+                        type(model),
+                        model.num_routed_experts,
+                        model.num_redundant_experts,
+                        model.num_physical_experts,
+                        model.num_logical_experts,
+                        model.num_expert_groups,
+                        type(new_model),
+                        new_model.num_routed_experts,
+                        new_model.num_redundant_experts,
+                        new_model.num_physical_experts,
+                        new_model.num_logical_experts,
+                        new_model.num_expert_groups,
+                    )
+                )
+
+    def add_model(
+        self,
         model: MixtureOfExperts,
-        device: torch.device,
-        parallel_config: ParallelConfig,
-        global_expert_load: Optional[torch.Tensor] = None,
-        old_global_expert_indices: Optional[torch.Tensor] = None,
-        rank_mapping: Optional[dict[int, int]] = None,
-    ) -> "EplbState":
+        model_config: ModelConfig,
+        global_expert_load: torch.Tensor | None = None,
+        old_global_expert_indices: torch.Tensor | None = None,
+        rank_mapping: dict[int, int] | None = None,
+    ):
         """
         Build the initial EPLB state.
         """
+        self.validate_ep_configuration(model)
         physical_to_logical_map_list = (
-            cls.build_initial_global_physical_to_logical_map(
+            EplbState.build_initial_global_physical_to_logical_map(
                 model.num_routed_experts,
                 model.num_redundant_experts,
-            ))
+            )
+        )
         physical_to_logical_map = torch.tensor(
             physical_to_logical_map_list,
-            device=device,
+            device=self.device,
         )
         # Assuming 8 GPUs per node, this supports up to
         # (1023 + 1) / 8 = 128 nodes for now.
@@ -205,62 +255,81 @@ class EplbState:
         MAX_EXPERT_REDUNDANCY = 1023
         assert model.num_redundant_experts <= MAX_EXPERT_REDUNDANCY, (
             f"num_redundant_experts {model.num_redundant_experts} "
-            f"must be less than or equal to {MAX_EXPERT_REDUNDANCY}")
+            f"must be less than or equal to {MAX_EXPERT_REDUNDANCY}"
+        )
         max_slots_per_logical_expert = MAX_EXPERT_REDUNDANCY + 1
         logical_to_physical_map = torch.full(
             (model.num_logical_experts, max_slots_per_logical_expert),
             -1,
-            device=device,
+            device=self.device,
         )
         logical_replica_count = torch.zeros(
-            (model.num_logical_experts, ),
-            device=device,
+            (model.num_logical_experts,),
+            device=self.device,
             dtype=torch.long,
         )
 
         for i in range(model.num_physical_experts):
             logical_idx = physical_to_logical_map[i]
-            logical_to_physical_map[logical_idx,
-                                    logical_replica_count[logical_idx]] = i
+            logical_to_physical_map[logical_idx, logical_replica_count[logical_idx]] = i
             logical_replica_count[logical_idx] += 1
 
         # Duplicate initial mapping for all layers
-        physical_to_logical_map = physical_to_logical_map.unsqueeze(0).expand(
-            model.num_moe_layers,
-            -1,
-        ).contiguous()
-        logical_to_physical_map = logical_to_physical_map.unsqueeze(0).expand(
-            model.num_moe_layers,
-            -1,
-            -1,
-        ).contiguous()
-        logical_replica_count = logical_replica_count.unsqueeze(0).expand(
-            model.num_moe_layers,
-            -1,
-        ).contiguous()
+        physical_to_logical_map = (
+            physical_to_logical_map.unsqueeze(0)
+            .expand(
+                model.num_moe_layers,
+                -1,
+            )
+            .contiguous()
+        )
+        logical_to_physical_map = (
+            logical_to_physical_map.unsqueeze(0)
+            .expand(
+                model.num_moe_layers,
+                -1,
+                -1,
+            )
+            .contiguous()
+        )
+        logical_replica_count = (
+            logical_replica_count.unsqueeze(0)
+            .expand(
+                model.num_moe_layers,
+                -1,
+            )
+            .contiguous()
+        )
 
         expert_load_pass = torch.zeros(
             (model.num_moe_layers, model.num_physical_experts),
             dtype=torch.int32,
-            device=device,
+            device=self.device,
         )
-        expert_load_window_size = parallel_config.eplb_config.window_size
+        self.expert_load_window_size = self.parallel_config.eplb_config.window_size
         expert_load_window = torch.zeros(
-            (expert_load_window_size, model.num_moe_layers,
-             model.num_physical_experts),
+            (
+                self.expert_load_window_size,
+                model.num_moe_layers,
+                model.num_physical_experts,
+            ),
             dtype=torch.int32,
-            device=device,
+            device=self.device,
         )
 
         # Set the initial progress of rearrangement to 3/4
-        eplb_step_interval = parallel_config.eplb_config.step_interval
-        expert_rearrangement_step = max(
-            0, eplb_step_interval - eplb_step_interval // 4)
+        eplb_step_interval = self.parallel_config.eplb_config.step_interval
+        self.expert_rearrangement_step = max(
+            0, eplb_step_interval - eplb_step_interval // 4
+        )
+        self.expert_rearrangement_step_interval = eplb_step_interval
 
         if global_expert_load is not None:
             ep_group = get_ep_group().device_group
-            assert global_expert_load.shape == (model.num_moe_layers,
-                                                model.num_logical_experts)
+            assert global_expert_load.shape == (
+                model.num_moe_layers,
+                model.num_logical_experts,
+            )
             assert global_expert_load.dtype == torch.int64
 
             num_replicas = model.num_physical_experts
@@ -273,20 +342,21 @@ class EplbState:
                 logger.warning_once(
                     f"num_gpus % num_nodes != 0, "
                     "not using hierarchical rearrangement algorithm.\n"
-                    f"{num_gpus=}, {num_nodes=}")
+                    f"{num_gpus=}, {num_nodes=}"
+                )
 
             # Get new expert mappings
             (
                 new_physical_to_logical_map,
                 new_logical_to_physical_map,
                 new_logical_replica_count,
-            ) = (rebalance_experts(
+            ) = rebalance_experts(
                 global_expert_load,
                 num_replicas,
                 num_groups,
                 num_nodes,
                 num_gpus,
-            ))
+            )
 
             max_physical_slots = new_logical_to_physical_map.shape[-1]
             assert max_physical_slots <= logical_to_physical_map.shape[-1]
@@ -295,7 +365,7 @@ class EplbState:
                 (0, logical_to_physical_map.shape[-1] - max_physical_slots),
                 value=-1,
             )
-            physical_to_logical_map = new_physical_to_logical_map.to(device)
+            physical_to_logical_map = new_physical_to_logical_map.to(self.device)
             logical_to_physical_map.copy_(new_logical_to_physical_map)
             logical_replica_count.copy_(new_logical_replica_count)
 
@@ -313,29 +383,28 @@ class EplbState:
                 False,
                 rank_mapping,
             )
-            expert_rearrangement_step = 0
+            self.expert_rearrangement_step = 0
 
-        return cls(
+        self.model_states[model_config.compute_hash()] = EplbModelState(
             physical_to_logical_map,
             logical_to_physical_map,
             logical_replica_count,
             expert_load_pass,
             expert_load_window,
-            expert_load_window_size=expert_load_window_size,
-            expert_rearrangement_step=expert_rearrangement_step,
-            expert_rearrangement_step_interval=eplb_step_interval,
+            model_config.model,
+            model,
         )
 
-    def step(self,
-             model: MixtureOfExperts,
-             is_dummy: bool = False,
-             is_profile: bool = False,
-             log_stats: bool = False) -> None:
+    def step(
+        self,
+        is_dummy: bool = False,
+        is_profile: bool = False,
+        log_stats: bool = False,
+    ) -> None:
         """
         Step the EPLB state.
 
         Args:
-            model (MixtureOfExperts): The MoE model.
             is_dummy (bool): If `True`, this is a dummy step and the load
                 metrics recorded in this forward pass will not count.
                 Defaults to `False`.
@@ -353,73 +422,98 @@ class EplbState:
         """
 
         if is_profile:
-            self.rearrange(model, is_profile=True)
+            self.rearrange(is_profile=True)
             return
 
         if is_dummy:
             # Do not record load metrics for dummy steps
-            self.expert_load_pass.zero_()
+            for eplb_model_state in self.model_states.values():
+                eplb_model_state.expert_load_pass.zero_()
 
         if log_stats:
-            # total_expert_load_pass: (num_moe_layers, num_physical_experts)
-            total_expert_load_pass = self.expert_load_pass.clone()
-
-            # Collect load metrics from all ranks
+            # Sync the expert load pass for each model (main and drafter).
+            # expert_load_pass: (num_moe_layers, num_physical_experts)
+            expert_load_pass_list = self._sync_load_pass()
             ep_group = get_ep_group().device_group
-            all_reduce(total_expert_load_pass, group=ep_group)
-
-            # num_tokens_per_rank: (num_moe_layers, num_ranks)
-            num_tokens_per_rank = total_expert_load_pass.reshape(
-                total_expert_load_pass.shape[0], ep_group.size(),
-                -1).sum(dim=-1).float()
-
-            # Compute balancedness ratio:
-            # for each layer:
-            #   (mean load across ranks) / (max load across ranks)
-            avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0)
-            max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum(
-                dim=0)
-
-            # Just to make type checker happy
-            tokens_tensors: list[float] = torch.stack(
-                [avg_tokens_tensor, max_tokens_tensor]).tolist()
-            avg_tokens, max_tokens = tokens_tensors
-            balancedness = avg_tokens / max_tokens if max_tokens > 0 else 0.0
-
-            if ep_group.rank() == 0:
-                logger.info(
-                    "EPLB step: avg_tokens=%.2f, max_tokens=%d, "
-                    "balancedness=%.4f", avg_tokens, max_tokens, balancedness)
+            for expert_load_pass, eplb_model_state in zip(
+                expert_load_pass_list, self.model_states.values()
+            ):
+                # num_tokens_per_rank: (num_moe_layers, num_ranks)
+                num_tokens_per_rank = (
+                    expert_load_pass.reshape(
+                        expert_load_pass.shape[0], ep_group.size(), -1
+                    )
+                    .sum(dim=-1)
+                    .float()
+                )
+
+                # Compute balancedness ratio:
+                # for each layer:
+                #   (mean load across ranks) / (max load across ranks)
+                avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0)
+                max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum(dim=0)
+
+                # Just to make type checker happy
+                tokens_tensors: list[float] = torch.stack(
+                    [avg_tokens_tensor, max_tokens_tensor]
+                ).tolist()
+                avg_tokens, max_tokens = tokens_tensors
+                balancedness = avg_tokens / max_tokens if max_tokens > 0 else 0.0
+
+                if ep_group.rank() == 0:
+                    logger.info(
+                        "EPLB step: %d for model %s: avg_tokens=%.2f, "
+                        "max_tokens=%d, balancedness=%.4f",
+                        self.expert_rearrangement_step,
+                        eplb_model_state.model_name,
+                        avg_tokens,
+                        max_tokens,
+                        balancedness,
+                    )
 
         # Update the expert load sliding window
         if not is_dummy:
-            self.expert_load_window[self.expert_load_window_step] = (
-                self.expert_load_pass.clone())
+            for eplb_model_state in self.model_states.values():
+                eplb_model_state.expert_load_window[self.expert_load_window_step] = (
+                    eplb_model_state.expert_load_pass.clone()
+                )
+                eplb_model_state.expert_load_pass.zero_()
+
             self.expert_load_window_step += 1
             if self.expert_load_window_step >= self.expert_load_window_size:
                 self.expert_load_window_step = 0
-            self.expert_load_pass.zero_()
 
         # Step the expert rearrangement step
         # Note that even if this is a dummy step, we still increment the
         # rearrangement step and perform rearrangement to ensure all ranks are
         # performing collective communication.
         self.expert_rearrangement_step += 1
-        if (self.expert_rearrangement_step
-                >= self.expert_rearrangement_step_interval):
+        if self.expert_rearrangement_step >= self.expert_rearrangement_step_interval:
             self.expert_rearrangement_step = 0
-            self.rearrange(model)
+            self.rearrange()
 
     def rearrange(
         self,
-        model: MixtureOfExperts,
         is_profile: bool = False,
         execute_shuffle: bool = True,
-        global_expert_load: Optional[torch.Tensor] = None,
-        rank_mapping: Optional[dict[int,
-                                    int]] = None) -> Optional[torch.Tensor]:
+        global_expert_loads: list[torch.Tensor] | None = None,
+        rank_mapping: dict[int, int] | None = None,
+    ) -> torch.Tensor | None:
         """
         Rearrange the experts according to the current load.
+
+        Args:
+            is_profile (bool): If `True`, perform a dummy rearrangement.
+                This is used in `profile_run` to reserve enough memory,
+                no memory movement will be performed. Default is False.
+            execute_shuffle (bool): If `True`, execute the shuffle
+                in elastic expert parallel (EEP). Default is True.
+            global_expert_loads (list[torch.Tensor] | None): The global expert
+                loads when scaling is done in EEP.
+                List of expert loads for the main and drafter
+                (when spec decode is used) models.
+            rank_mapping (dict[int, int] | None): The rank mapping
+                when scaling is done in EEP.
         """
 
         ep_group = get_ep_group().device_group
@@ -430,54 +524,73 @@ class EplbState:
         if is_main_rank:
             torch.cuda.synchronize()
             time_start = time.perf_counter()
-            logger.info("Rearranging experts %s...",
-                        "(profile)" if is_profile else "")
+            logger.info("Rearranging experts %s...", "(profile)" if is_profile else "")
 
-        if global_expert_load is None:
+        if global_expert_loads is None:
             # Map the physical expert load to global logical experts
-            logical_expert_load_window = torch.zeros(
-                self.expert_load_window_size,
-                model.num_moe_layers,
-                model.num_logical_experts,
-                dtype=self.expert_load_window.dtype,
-                device=self.expert_load_window.device,
-            )
-            logical_expert_load_window.scatter_add_(
-                dim=-1,
-                index=self.physical_to_logical_map.unsqueeze(0).expand_as(
-                    self.expert_load_window).long(),
-                src=self.expert_load_window,
-            )
-
+            global_expert_load_windows = []
             if not execute_shuffle:
-                metadata = torch.tensor(
-                    [
-                        model.num_moe_layers, model.num_logical_experts,
-                        self.physical_to_logical_map.shape[1]
-                    ],
-                    dtype=torch.int32,
-                    device="cpu",
+                num_models = torch.tensor(
+                    [len(self.model_states)], dtype=torch.int32, device="cpu"
+                )
+                torch.distributed.broadcast(
+                    num_models, group=get_ep_group().cpu_group, group_src=0
                 )
-                torch.distributed.broadcast(metadata,
-                                            group=get_ep_group().cpu_group,
-                                            group_src=0)
 
-            # Perform all-reduce to get the expert load across all ranks
-            global_expert_load_window = logical_expert_load_window.sum(dim=0)
-            all_reduce(global_expert_load_window, group=ep_group)
+            for eplb_model_state in self.model_states.values():
+                logical_expert_load_window = torch.zeros(
+                    self.expert_load_window_size,
+                    eplb_model_state.model.num_moe_layers,
+                    eplb_model_state.model.num_logical_experts,
+                    dtype=eplb_model_state.expert_load_window.dtype,
+                    device=eplb_model_state.expert_load_window.device,
+                )
+                logical_expert_load_window.scatter_add_(
+                    dim=-1,
+                    index=eplb_model_state.physical_to_logical_map.unsqueeze(0)
+                    .expand_as(eplb_model_state.expert_load_window)
+                    .long(),
+                    src=eplb_model_state.expert_load_window,
+                )
 
+                if not execute_shuffle:
+                    metadata = torch.tensor(
+                        [
+                            eplb_model_state.model.num_moe_layers,
+                            eplb_model_state.model.num_logical_experts,
+                            eplb_model_state.physical_to_logical_map.shape[1],
+                        ],
+                        dtype=torch.int32,
+                        device="cpu",
+                    )
+                    torch.distributed.broadcast(
+                        metadata, group=get_ep_group().cpu_group, group_src=0
+                    )
+
+                global_expert_load_window = logical_expert_load_window.sum(dim=0)
+                global_expert_load_windows.append(global_expert_load_window)
+            # Perform all-reduce to get the expert load across all ranks for each model
+            global_expert_load_windows = self._allreduce_list(
+                global_expert_load_windows
+            )
             if not execute_shuffle:
-                # (num_moe_layers, old_num_physical_experts)
-                old_global_expert_indices = self.physical_to_logical_map
-                torch.distributed.broadcast(old_global_expert_indices,
-                                            group=ep_group,
-                                            group_src=0)
-                return global_expert_load_window
+                for eplb_model_state, global_expert_load_window in zip(
+                    self.model_states.values(), global_expert_load_windows
+                ):
+                    # (num_moe_layers, old_num_physical_experts)
+                    old_global_expert_indices = eplb_model_state.physical_to_logical_map
+                    torch.distributed.broadcast(
+                        old_global_expert_indices, group=ep_group, group_src=0
+                    )
+            if not execute_shuffle:
+                return global_expert_load_windows
         else:
             assert execute_shuffle
-            global_expert_load_window = global_expert_load
+            global_expert_load_windows = global_expert_loads
 
         # TODO(bowen): Treat differently for prefill and decode nodes
+        eplb_model_state = next(iter(self.model_states.values()))
+        model = eplb_model_state.model
         num_replicas = model.num_physical_experts
         num_groups = model.num_expert_groups
         if rank_mapping is not None and len(rank_mapping) == ep_group.size():
@@ -486,10 +599,10 @@ class EplbState:
             # the GPUs to be released.
             cpu_group = get_ep_group().cpu_group
             num_nodes = _node_count_with_rank_mapping(cpu_group, rank_mapping)
-            num_gpus = sum(new_rank != -1
-                           for new_rank in rank_mapping.values())
-            num_replicas = num_replicas // ep_group.size(
-            ) * num_gpus  # handle num replicas change
+            num_gpus = sum(new_rank != -1 for new_rank in rank_mapping.values())
+            num_replicas = (
+                num_replicas // ep_group.size() * num_gpus
+            )  # handle num replicas change
         else:
             num_nodes = get_node_count()
             num_gpus = ep_group.size()
@@ -499,48 +612,67 @@ class EplbState:
             logger.warning_once(
                 f"num_gpus % num_nodes != 0, "
                 "not using hierarchical rearrangement algorithm.\n"
-                f"{num_gpus=}, {num_nodes=}")
-
-        # Get new expert mappings
-        (
-            new_physical_to_logical_map,
-            new_logical_to_physical_map,
-            new_logical_replica_count,
-        ) = (rebalance_experts(
-            global_expert_load_window,
-            num_replicas,
-            num_groups,
-            num_nodes,
-            num_gpus,
-        ))
-
-        # Update expert weights
-        rearrange_expert_weights_inplace(
-            self.physical_to_logical_map,
-            new_physical_to_logical_map,
-            model.expert_weights,
-            ep_group,
-            is_profile,
-            rank_mapping,
-        )
+                f"{num_gpus=}, {num_nodes=}"
+            )
 
-        if not is_profile:
-            if self.physical_to_logical_map.shape[
-                    1] != new_physical_to_logical_map.shape[1]:
-                self.physical_to_logical_map = new_physical_to_logical_map.to(
-                    self.physical_to_logical_map.device)
-            else:
-                self.physical_to_logical_map.copy_(new_physical_to_logical_map)
-            max_physical_slots = new_logical_to_physical_map.shape[-1]
-            assert max_physical_slots <= self.logical_to_physical_map.shape[-1]
-            new_logical_to_physical_map = torch.nn.functional.pad(
+        for eplb_model_state, global_expert_load_window in zip(
+            self.model_states.values(), global_expert_load_windows
+        ):
+            # Get new expert mappings for the model
+            (
+                new_physical_to_logical_map,
                 new_logical_to_physical_map,
-                (0,
-                 self.logical_to_physical_map.shape[-1] - max_physical_slots),
-                value=-1,
+                new_logical_replica_count,
+            ) = rebalance_experts(
+                global_expert_load_window,
+                num_replicas,
+                num_groups,
+                num_nodes,
+                num_gpus,
+            )
+
+            # Update expert weights
+            rearrange_expert_weights_inplace(
+                eplb_model_state.physical_to_logical_map,
+                new_physical_to_logical_map,
+                eplb_model_state.model.expert_weights,
+                ep_group,
+                is_profile,
+                rank_mapping,
             )
-            self.logical_to_physical_map.copy_(new_logical_to_physical_map)
-            self.logical_replica_count.copy_(new_logical_replica_count)
+
+            if not is_profile:
+                if (
+                    eplb_model_state.physical_to_logical_map.shape[1]
+                    != new_physical_to_logical_map.shape[1]
+                ):
+                    eplb_model_state.physical_to_logical_map = (
+                        new_physical_to_logical_map.to(
+                            eplb_model_state.physical_to_logical_map.device
+                        )
+                    )
+                else:
+                    eplb_model_state.physical_to_logical_map.copy_(
+                        new_physical_to_logical_map
+                    )
+                max_physical_slots = new_logical_to_physical_map.shape[-1]
+                assert (
+                    max_physical_slots
+                    <= eplb_model_state.logical_to_physical_map.shape[-1]
+                )
+                new_logical_to_physical_map = torch.nn.functional.pad(
+                    new_logical_to_physical_map,
+                    (
+                        0,
+                        eplb_model_state.logical_to_physical_map.shape[-1]
+                        - max_physical_slots,
+                    ),
+                    value=-1,
+                )
+                eplb_model_state.logical_to_physical_map.copy_(
+                    new_logical_to_physical_map
+                )
+                eplb_model_state.logical_replica_count.copy_(new_logical_replica_count)
 
         if is_main_rank:
             assert time_start is not None
@@ -554,37 +686,122 @@ class EplbState:
         return None
 
     @staticmethod
-    def recv_state() -> tuple[torch.Tensor, torch.Tensor]:
+    def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
         """
         Receive the expert load and old placement from the master rank.
         """
         ep_group = get_ep_group()
-        metadata = torch.empty(3, dtype=torch.int32, device="cpu")
-        torch.distributed.broadcast(metadata,
-                                    group=ep_group.cpu_group,
-                                    group_src=0)
-        num_moe_layers, num_logical_experts, num_old_physical_experts = (
-            metadata.tolist())
-        global_expert_load = torch.zeros(
-            (num_moe_layers, num_logical_experts),
-            dtype=torch.int64,
-            device=ep_group.device,
+        num_models = torch.empty(1, dtype=torch.int32, device="cpu")
+        torch.distributed.broadcast(num_models, group=ep_group.cpu_group, group_src=0)
+        num_models = num_models.item()
+        global_expert_loads = []
+        old_global_expert_indices_per_model = []
+        for _ in range(num_models):
+            metadata = torch.empty(3, dtype=torch.int32, device="cpu")
+            torch.distributed.broadcast(metadata, group=ep_group.cpu_group, group_src=0)
+            num_moe_layers, num_logical_experts, num_old_physical_experts = (
+                metadata.tolist()
+            )
+            global_expert_load = torch.zeros(
+                (num_moe_layers, num_logical_experts),
+                dtype=torch.int64,
+                device=ep_group.device,
+            )
+            all_reduce(global_expert_load, group=ep_group.device_group)
+            old_global_expert_indices = torch.empty(
+                (num_moe_layers, num_old_physical_experts),
+                dtype=torch.int64,
+                device=ep_group.device,
+            )
+            torch.distributed.broadcast(
+                old_global_expert_indices,
+                group=ep_group.device_group,
+                group_src=0,
+            )
+            global_expert_loads.append(global_expert_load)
+            old_global_expert_indices_per_model.append(old_global_expert_indices)
+        return global_expert_loads, old_global_expert_indices_per_model
+
+    @classmethod
+    def get_eep_state(
+        cls, parallel_config: ParallelConfig
+    ) -> tuple[
+        list[torch.Tensor] | None,
+        list[torch.Tensor] | None,
+        dict[int, int] | None,
+    ]:
+        num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu")
+        torch.distributed.broadcast(
+            num_local_physical_experts,
+            group=get_ep_group().cpu_group,
+            group_src=0,
         )
-        all_reduce(global_expert_load, group=ep_group.device_group)
-        old_global_expert_indices = torch.empty(
-            (num_moe_layers, num_old_physical_experts),
-            dtype=torch.int64,
-            device=ep_group.device,
+        num_local_physical_experts = int(num_local_physical_experts.item())
+        new_ep_size = get_ep_group().world_size
+        global_expert_loads, old_global_expert_indices_per_model = (
+            EplbState.recv_state()
         )
-        torch.distributed.broadcast(old_global_expert_indices,
-                                    group=ep_group.device_group,
-                                    group_src=0)
 
-        return global_expert_load, old_global_expert_indices
+        # EP configuration for all models has to be the same so as eplb config
+        num_logical_experts = global_expert_loads[0].shape[1]
+        parallel_config.eplb_config.num_redundant_experts = (
+            num_local_physical_experts * new_ep_size - num_logical_experts
+        )
+        assert (
+            old_global_expert_indices_per_model[0].shape[1] % num_local_physical_experts
+            == 0
+        )
+        old_ep_size = (
+            old_global_expert_indices_per_model[0].shape[1]
+            // num_local_physical_experts
+        )
+        rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
+        return (
+            global_expert_loads,
+            old_global_expert_indices_per_model,
+            rank_mapping,
+        )
+
+    def _allreduce_list(self, tensor_list: list[torch.Tensor]) -> list[torch.Tensor]:
+        """
+        All-reduce a list of tensors.
+        """
+        if len(tensor_list) == 1:
+            all_reduce(tensor_list[0], group=get_ep_group().device_group)
+            return tensor_list
+        assert all(t.dim() == 2 for t in tensor_list), "All tensors must be 2D."
+        assert all(t.shape[1] == tensor_list[0].shape[1] for t in tensor_list), (
+            "All tensors must have the same shape[1]."
+        )
+        # Concatenate, all_reduce, then unpack to original shapes.
+        # We assume all tensors are 2D and shape[1] (num_physical_experts)
+        # is the same across all models.
+        shapes = [t.shape for t in tensor_list]
+        concat_tensor = torch.cat(tensor_list, dim=0)
+
+        ep_group = get_ep_group().device_group
+        all_reduce(concat_tensor, group=ep_group)
+
+        all_reduce_list = []
+        offset = 0
+        for shape in shapes:
+            all_reduce_list.append(concat_tensor[offset : offset + shape[0], :])
+            offset += shape[0]
+        return all_reduce_list
+
+    def _sync_load_pass(self) -> list[torch.Tensor]:
+        """
+        Sync the expert load pass across all ranks for log stats.
+        Doesn't update the expert load pass in eplb_model_state.
+        """
+        load_pass_list = []
+        for eplb_model_state in self.model_states.values():
+            load_pass_list.append(eplb_model_state.expert_load_pass.clone())
+        return self._allreduce_list(load_pass_list)
 
 
 def _node_count_with_rank_mapping(
-    pg: Union[ProcessGroup, StatelessProcessGroup],
+    pg: ProcessGroup | StatelessProcessGroup,
     rank_mapping: dict[int, int],
 ) -> int:
     if isinstance(pg, ProcessGroup):
diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py
index fc43dbe3b6533e725d0168b55c85bf9c4cd202a5..e6645e524cc3e649883441af9858cad40b629376 100644
--- a/vllm/distributed/eplb/rebalance_algo.py
+++ b/vllm/distributed/eplb/rebalance_algo.py
@@ -12,11 +12,13 @@ Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
 on how the EPLB algorithm works.
 """
 
+import numpy as np
 import torch
 
 
-def balanced_packing(weight: torch.Tensor,
-                     num_packs: int) -> tuple[torch.Tensor, torch.Tensor]:
+def balanced_packing(
+    weight: torch.Tensor, num_packs: int
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Pack n weighted objects to m packs, such that each bin contains exactly
     n/m objects and the weights of all packs are as balanced as possible.
@@ -33,39 +35,50 @@ def balanced_packing(weight: torch.Tensor,
     assert num_groups % num_packs == 0
     groups_per_pack = num_groups // num_packs
 
+    device = weight.device
+
     if groups_per_pack == 1:
-        pack_index = torch.arange(weight.size(-1),
-                                  dtype=torch.int64,
-                                  device=weight.device).expand(weight.shape)
-        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+        pack_index = torch.arange(
+            weight.size(-1), dtype=torch.int64, device=device
+        ).expand(weight.shape)
+        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
         return pack_index, rank_in_pack
 
-    indices = weight.float().sort(-1, descending=True).indices.cpu()
-    pack_index = torch.full_like(weight,
-                                 fill_value=-1,
-                                 dtype=torch.int64,
-                                 device="cpu")
-    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+    weight_np = weight.cpu().numpy()
+
+    # Sort and get indices in decending order
+    indices_np = np.argsort(-weight_np, axis=-1)
+
+    pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
+    rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
+
+    # Run the packing algorithm
     for i in range(num_layers):
-        pack_weights = [0] * num_packs
+        pack_weights = [0.0] * num_packs
         pack_items = [0] * num_packs
-        for group in indices[i]:
+
+        for group in indices_np[i]:
+            # Find a pack with capacity that has the lowest weight
             pack = min(
-                (i
-                 for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                (j for j in range(num_packs) if pack_items[j] < groups_per_pack),
                 key=pack_weights.__getitem__,
             )
+
             assert pack_items[pack] < groups_per_pack
-            pack_index[i, group] = pack
-            rank_in_pack[i, group] = pack_items[pack]
-            pack_weights[pack] += weight[i, group]
+            pack_index_np[i, group] = pack
+            rank_in_pack_np[i, group] = pack_items[pack]
+            pack_weights[pack] += weight_np[i, group]
             pack_items[pack] += 1
+
+    pack_index = torch.from_numpy(pack_index_np).to(device)
+    rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device)
+
     return pack_index, rank_in_pack
 
 
 def replicate_experts(
-        weight: torch.Tensor,
-        num_phy: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    weight: torch.Tensor, num_phy: int
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Replicate `num_log` experts to `num_phy` replicas, such that the maximum
     load of all replicas is minimized.
@@ -83,8 +96,7 @@ def replicate_experts(
     num_redundant = num_phy - num_log
     assert num_redundant >= 0
     device = weight.device
-    phy2log = torch.arange(num_phy, dtype=torch.int64,
-                           device=device).repeat(n, 1)
+    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
     rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
     logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
     arangen = torch.arange(n, dtype=torch.int64, device=device)
@@ -108,7 +120,7 @@ def rebalance_experts_hierarchical(
         weight: [num_moe_layers, num_logical_experts]
         num_physical_experts: number of physical experts after replication
         num_groups: number of expert groups
-        num_nodes: number of server nodes, where the intra-node network 
+        num_nodes: number of server nodes, where the intra-node network
             (e.g., NVLink) is faster
         num_gpus: number of GPUs, must be a multiple of `num_nodes`
 
@@ -134,45 +146,51 @@ def rebalance_experts_hierarchical(
         inv.scatter_(
             1,
             perm,
-            torch.arange(perm.size(1), dtype=torch.int64,
-                         device=perm.device).expand(perm.shape),
+            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
+                perm.shape
+            ),
         )
         return inv
 
     # Step 1: pack groups to nodes
     tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
-    group_pack_index, group_rank_in_pack = balanced_packing(
-        tokens_per_group, num_nodes)
-    log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) *
-                 group_size).unsqueeze(-1) +
-                torch.arange(group_size,
-                             dtype=torch.int64,
-                             device=group_pack_index.device)).flatten(-2)
+    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
+    log2mlog = (
+        (
+            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
+        ).unsqueeze(-1)
+        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
+    ).flatten(-2)
     mlog2log = inverse(log2mlog)
 
     # Step 2: construct redundant experts within nodes
     # [num_layers * num_nodes, num_logical_experts // num_nodes]
     tokens_per_mlog = weight.gather(-1, mlog2log).view(
-        -1, num_logical_experts // num_nodes)
+        -1, num_logical_experts // num_nodes
+    )
     phy2mlog, phyrank, mlogcnt = replicate_experts(
-        tokens_per_mlog, num_physical_experts // num_nodes)
+        tokens_per_mlog, num_physical_experts // num_nodes
+    )
 
     # Step 3: pack physical_experts to GPUs
     # [num_layers * num_nodes, num_physical_experts // num_nodes]
     tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
-    pack_index, rank_in_pack = balanced_packing(tokens_per_phy,
-                                                num_gpus // num_nodes)
+    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
     phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
     pphy2phy = inverse(phy2pphy)
 
     pphy2mlog = phy2mlog.gather(
-        -1, pphy2phy)  # [num_layers * num_nodes, num_log_per_nodes]
-    pphy2mlog = (pphy2mlog.view(num_layers, num_nodes, -1) + torch.arange(
-        0,
-        num_logical_experts,
-        num_logical_experts // num_nodes,
-        device=group_pack_index.device,
-    ).view(1, -1, 1)).flatten(-2)
+        -1, pphy2phy
+    )  # [num_layers * num_nodes, num_log_per_nodes]
+    pphy2mlog = (
+        pphy2mlog.view(num_layers, num_nodes, -1)
+        + torch.arange(
+            0,
+            num_logical_experts,
+            num_logical_experts // num_nodes,
+            device=group_pack_index.device,
+        ).view(1, -1, 1)
+    ).flatten(-2)
     pphy2log = mlog2log.gather(-1, pphy2mlog)
     pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
     logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
@@ -210,15 +228,17 @@ def rebalance_experts(
             replicas for each logical expert
     """
     num_layers, num_logical_experts = weight.shape
-    weight = weight.float().cpu()
+    weight = weight.float()
     if num_groups % num_nodes == 0:
         # use hierarchical load-balance policy
         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
-            weight, num_replicas, num_groups, num_nodes, num_gpus)
+            weight, num_replicas, num_groups, num_nodes, num_gpus
+        )
     else:
         # use global load-balance policy
         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
-            weight, num_replicas, 1, 1, num_gpus)
+            weight, num_replicas, 1, 1, num_gpus
+        )
     num_redundant_experts = num_replicas - num_logical_experts
     maxlogcnt = num_redundant_experts + 1
     log2phy: torch.Tensor = torch.full(
@@ -230,8 +250,9 @@ def rebalance_experts(
     log2phy.view(num_layers, -1).scatter_(
         -1,
         phy2log * maxlogcnt + phyrank,
-        torch.arange(num_replicas, dtype=torch.int64,
-                     device=log2phy.device).expand(num_layers, -1),
+        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
+            num_layers, -1
+        ),
     )
     return phy2log, log2phy, logcnt
 
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index f8a7d1170bb010d30d48ece9a5664ed6a2dd7c79..5c1efbaf03babb0fe92eb54ea5398811285476f5 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -8,11 +8,15 @@ This involves the exchange of expert weights between GPUs.
 
 from collections.abc import Iterable, MutableSequence, Sequence
 from functools import partial
-from typing import Optional
 
 import torch
-from torch.distributed import (P2POp, ProcessGroup, all_gather,
-                               batch_isend_irecv, get_global_rank)
+from torch.distributed import (
+    P2POp,
+    ProcessGroup,
+    all_gather,
+    batch_isend_irecv,
+    get_global_rank,
+)
 
 
 def idx_local_to_global(
@@ -132,8 +136,7 @@ def shuffle_layer(
                 continue
             if old_indices[src_global] == new_indices[dst_global]:
                 is_received_locally[dst] = True
-                for weight, buffer in zip(expert_weights,
-                                          expert_weights_buffer):
+                for weight, buffer in zip(expert_weights, expert_weights_buffer):
                     buffer[dst].copy_(weight[src])
 
     p2p_ops: list[P2POp] = []
@@ -177,7 +180,8 @@ def shuffle_layer(
                     torch.distributed.isend,
                     weight[src],
                     dst_global,
-                ) for weight in expert_weights
+                )
+                for weight in expert_weights
             ]
 
     # 3. Initiate receiving of weights.
@@ -216,7 +220,8 @@ def shuffle_layer(
                 torch.distributed.irecv,
                 weight[dst],
                 src_global,
-            ) for weight in expert_weights_buffer
+            )
+            for weight in expert_weights_buffer
         ]
 
     # 4. Execute the P2P operations. The real communication happens here.
@@ -247,7 +252,7 @@ def rearrange_expert_weights_inplace(
     expert_weights: Sequence[Iterable[torch.Tensor]],
     ep_group: ProcessGroup,
     is_profile: bool = False,
-    rank_mapping: Optional[dict[int, int]] = None,
+    rank_mapping: dict[int, int] | None = None,
 ) -> None:
     """
     Rearranges the expert weights in place according to the new expert indices.
@@ -271,29 +276,25 @@ def rearrange_expert_weights_inplace(
     if rank_mapping is not None:
         if len(rank_mapping) == ep_group.size():
             # scale down
-            new_global_expert_indices = \
-                _map_new_expert_indices_with_rank_mapping(
+            new_global_expert_indices = _map_new_expert_indices_with_rank_mapping(
                 new_global_expert_indices,
                 rank_mapping,
             )
         else:
             # scale up
-            old_global_expert_indices = \
-                _map_old_expert_indices_with_rank_mapping(
+            old_global_expert_indices = _map_old_expert_indices_with_rank_mapping(
                 old_global_expert_indices,
                 rank_mapping,
                 ep_group.size(),
             )
 
-    assert old_global_expert_indices.shape[
-        1] == new_global_expert_indices.shape[1]
+    assert old_global_expert_indices.shape[1] == new_global_expert_indices.shape[1]
 
     num_moe_layers, num_physical_experts = old_global_expert_indices.shape
     assert len(expert_weights) == num_moe_layers
 
     num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
-    assert new_global_expert_indices.shape == (num_moe_layers,
-                                               num_physical_experts)
+    assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
 
     ep_rank = ep_group.rank()
     ep_size = ep_group.size()
@@ -320,15 +321,19 @@ def rearrange_expert_weights_inplace(
             )
         return
 
+    old_global_expert_indices_cpu = old_global_expert_indices.cpu()
+    new_global_expert_indices_cpu = new_global_expert_indices.cpu()
+
+    # NOTE(bowen): We need this synchronize to run, but I don't know why.
+    # If you figure out the reason, please let me know -- thank you!
+    torch.cuda.synchronize()
+
     for layer in range(num_moe_layers):
-        # NOTE(bowen): We need this synchronize to run, but I don't know why.
-        # If you figure out the reason, please let me know -- thank you!
-        torch.cuda.synchronize()
         shuffle_layer(
             num_local_physical_experts,
             ep_rank,
-            old_global_expert_indices[layer].tolist(),
-            new_global_expert_indices[layer].tolist(),
+            old_global_expert_indices_cpu[layer].tolist(),
+            new_global_expert_indices_cpu[layer].tolist(),
             expert_weights[layer],
             expert_weights_buffer,
             ep_group,
@@ -342,13 +347,13 @@ def _map_old_expert_indices_with_rank_mapping(
 ) -> torch.Tensor:
     """
     Map the old global expert indices to the new global expert indices.
-    
+
     Args:
         old_global_expert_indices:
             Shape (num_layers, old_ep_size * num_local_physical_experts).
         rank_mapping: Mapping from old rank to new rank.
         new_ep_size: New expert parallelism size.
-    
+
     Returns:
         Mapped expert indices with shape
         (num_layers, new_ep_size * num_local_physical_experts).
@@ -379,8 +384,9 @@ def _map_old_expert_indices_with_rank_mapping(
             new_start_idx = new_rank * num_local_physical_experts
             new_end_idx = (new_rank + 1) * num_local_physical_experts
 
-            mapped_expert_indices[:, new_start_idx:new_end_idx] = \
+            mapped_expert_indices[:, new_start_idx:new_end_idx] = (
                 old_global_expert_indices[:, old_start_idx:old_end_idx]
+            )
         # If new_rank is None or >= new_ep_size, the experts remain -1
         # (scale down case)
 
@@ -415,8 +421,9 @@ def _map_new_expert_indices_with_rank_mapping(
             new_start_idx = new_rank * num_local_physical_experts
             new_end_idx = (new_rank + 1) * num_local_physical_experts
 
-            mapped_expert_indices[:, old_start_idx:old_end_idx] = \
+            mapped_expert_indices[:, old_start_idx:old_end_idx] = (
                 new_global_expert_indices[:, new_start_idx:new_end_idx]
+            )
 
     return mapped_expert_indices
 
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index 46f0cd9289b2305116f9d9961ac8a943e93c627c..7b5cb94cf13ea62d5f8fabbec335da6eb6d1ffc2 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -6,10 +6,11 @@ import threading
 import time
 from abc import ABC, abstractmethod
 from collections import deque
+from collections.abc import Callable
 from dataclasses import asdict
 from itertools import count
 from queue import Queue
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import msgspec
 import zmq
@@ -22,22 +23,23 @@ logger = init_logger(__name__)
 
 
 class EventBatch(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False,  # type: ignore[call-arg]
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,  # type: ignore[call-arg]
 ):
     ts: float
     events: list[Any]
-    data_parallel_rank: Optional[int] = None
+    data_parallel_rank: int | None = None
 
 
 class KVCacheEvent(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False,  # type: ignore[call-arg]
-        tag=True):
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,  # type: ignore[call-arg]
+    tag=True,
+):
     """Base class for all KV cache-related events"""
 
 
@@ -46,16 +48,16 @@ MEDIUM_GPU = "GPU"
 
 class BlockStored(KVCacheEvent):
     block_hashes: list[ExternalBlockHash]
-    parent_block_hash: Optional[ExternalBlockHash]
+    parent_block_hash: ExternalBlockHash | None
     token_ids: list[int]
     block_size: int
-    lora_id: Optional[int]
-    medium: Optional[str]
+    lora_id: int | None
+    medium: str | None
 
 
 class BlockRemoved(KVCacheEvent):
     block_hashes: list[ExternalBlockHash]
-    medium: Optional[str]
+    medium: str | None
 
 
 class AllBlocksCleared(KVCacheEvent):
@@ -63,20 +65,20 @@ class AllBlocksCleared(KVCacheEvent):
 
 
 class KVEventBatch(EventBatch):
-    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+    events: list[BlockStored | BlockRemoved | AllBlocksCleared]
 
 
 class EventPublisher(ABC):
     """Lightweight publisher for EventBatch batches with data parallelism
     support.
-    
+
     In data parallel setups, each DP rank runs its own EventPublisher instance
     to avoid duplicate events and ensure proper event attribution:
-    
+
     - Each DP rank creates a separate publisher
     - Publishers automatically annotate events with their data_parallel_rank
     - This allows consumers to distinguish events from different DP ranks
-    
+
     The publisher is responsible for adding DP metadata since the scheduler
     operates independently of DP topology and shouldn't need DP awareness.
     """
@@ -115,7 +117,7 @@ class ZmqEventPublisher(EventPublisher):
     Parameters
     ----------
     endpoint:
-        PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to
+        PUB address. Use `tcp://*:5557` to bind or `tcp://host:5557` to
         connect.
     replay_endpoint:
         Optional ROUTER address for replay requests. When given, subscribers can
@@ -130,6 +132,7 @@ class ZmqEventPublisher(EventPublisher):
     topic:
         Topic to publish events to.
     """
+
     SHUTDOWN_TIMEOUT: float = 1.0
     END_SEQ = (-1).to_bytes(8, "big", signed=True)
 
@@ -137,7 +140,7 @@ class ZmqEventPublisher(EventPublisher):
         self,
         data_parallel_rank: int,
         endpoint: str = "tcp://*:5557",
-        replay_endpoint: Optional[str] = None,
+        replay_endpoint: str | None = None,
         buffer_steps: int = 10_000,
         hwm: int = 100_000,
         max_queue_size: int = 100_000,
@@ -145,32 +148,33 @@ class ZmqEventPublisher(EventPublisher):
     ) -> None:
         # Storage
         super().__init__(data_parallel_rank)
-        self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size)
+        self._event_queue = Queue[EventBatch | None](maxsize=max_queue_size)
         self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps)
 
         # ZMQ sockets
         self._ctx = zmq.Context.instance()
-        self._pub: Optional[zmq.Socket] = None
-        self._replay: Optional[zmq.Socket] = None
+        self._pub: zmq.Socket | None = None
+        self._replay: zmq.Socket | None = None
         self._dp_rank = data_parallel_rank
 
         self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank)
         self._replay_endpoint = self.offset_endpoint_port(
-            replay_endpoint, self._dp_rank)
+            replay_endpoint, self._dp_rank
+        )
         self._hwm = hwm
         self._socket_setup()
 
         # Payload
         self._seq_gen = count()
-        self._topic_bytes = topic.encode('utf-8')
+        self._topic_bytes = topic.encode("utf-8")
 
         # Thread
         self._running = True
         logger.info("Starting ZMQ publisher thread")
 
-        self._thread = threading.Thread(target=self._publisher_thread,
-                                        daemon=True,
-                                        name="zmq-publisher")
+        self._thread = threading.Thread(
+            target=self._publisher_thread, daemon=True, name="zmq-publisher"
+        )
         self._thread.start()
 
     def publish(self, events: EventBatch) -> None:
@@ -220,10 +224,12 @@ class ZmqEventPublisher(EventPublisher):
             self._pub.set_hwm(self._hwm)
             # Heuristic: bind if wildcard / * present, else connect.
             # bind stable, connect volatile convention
-            if (self._endpoint is not None
-                    and ("*" in self._endpoint or "::" in self._endpoint
-                         or self._endpoint.startswith("ipc://")
-                         or self._endpoint.startswith("inproc://"))):
+            if self._endpoint is not None and (
+                "*" in self._endpoint
+                or "::" in self._endpoint
+                or self._endpoint.startswith("ipc://")
+                or self._endpoint.startswith("inproc://")
+            ):
                 self._pub.bind(self._endpoint)
             elif self._endpoint is not None:
                 self._pub.connect(self._endpoint)
@@ -263,8 +269,7 @@ class ZmqEventPublisher(EventPublisher):
 
                 payload = self._pack.encode(event)
                 seq_bytes = seq.to_bytes(8, "big")
-                self._pub.send_multipart(
-                    (self._topic_bytes, seq_bytes, payload))
+                self._pub.send_multipart((self._topic_bytes, seq_bytes, payload))
 
                 self._buffer.append((seq, payload))
                 self._event_queue.task_done()
@@ -291,24 +296,26 @@ class ZmqEventPublisher(EventPublisher):
                 # (identity, empty_delim) are stripped off by the router
                 # receiving payload is (seq_bytes, payload)
                 self._replay.send_multipart(
-                    (client_id, b"", seq.to_bytes(8, "big"), buf))
+                    (client_id, b"", seq.to_bytes(8, "big"), buf)
+                )
         # Send end of sequence marker
         # receiving payload is (-1, b""")
         self._replay.send_multipart((client_id, b"", self.END_SEQ, b""))
 
     @staticmethod
-    def offset_endpoint_port(endpoint: Optional[str],
-                             data_parallel_rank: int) -> Optional[str]:
-        """Helper function to offset the port in an endpoint by 
+    def offset_endpoint_port(
+        endpoint: str | None, data_parallel_rank: int
+    ) -> str | None:
+        """Helper function to offset the port in an endpoint by
             the data parallel rank.
 
         Args:
-            endpoint: The endpoint string 
+            endpoint: The endpoint string
                 (e.g., "tcp://*:5557" or "inproc://cache")
             data_parallel_rank: The data parallel rank to offset by
 
         Returns:
-            The endpoint with the port offset by data_parallel_rank 
+            The endpoint with the port offset by data_parallel_rank
                 or suffix appended
         """
         # Do nothing if input is None or data_parallel_rank is 0
@@ -322,7 +329,7 @@ class ZmqEventPublisher(EventPublisher):
                 # Get everything after the last colon (the port)
                 last_colon_idx = endpoint.rfind(":")
                 base_addr = endpoint[:last_colon_idx]
-                base_port = int(endpoint[last_colon_idx + 1:])
+                base_port = int(endpoint[last_colon_idx + 1 :])
                 new_port = base_port + data_parallel_rank
                 return f"{base_addr}:{new_port}"
             return endpoint
@@ -336,27 +343,29 @@ class EventPublisherFactory:
     }
 
     @classmethod
-    def register_publisher(cls, name: str,
-                           ctor: Callable[..., EventPublisher]) -> None:
+    def register_publisher(cls, name: str, ctor: Callable[..., EventPublisher]) -> None:
         if name in cls._registry:
             raise KeyError(f"publisher '{name}' already registered")
         cls._registry[name] = ctor
 
     @classmethod
-    def create(cls,
-               config: Optional[KVEventsConfig],
-               data_parallel_rank: int = 0) -> EventPublisher:
+    def create(
+        cls, config: KVEventsConfig | None, data_parallel_rank: int = 0
+    ) -> EventPublisher:
         """Create publisher from a config mapping."""
-        if not config:
+        if (
+            config is None
+            or not config.enable_kv_cache_events
+            or config.publisher == "null"
+        ):
             return NullEventPublisher()
 
         config_dict = asdict(config)
 
-        kind = config_dict.pop("publisher", "null")
+        kind = config_dict.pop("publisher")
         config_dict.pop("enable_kv_cache_events")
         try:
             constructor = cls._registry[kind]
         except KeyError as exc:
             raise ValueError(f"Unknown event publisher '{kind}'") from exc
-        return constructor(data_parallel_rank=data_parallel_rank,
-                           **config_dict)
+        return constructor(data_parallel_rank=data_parallel_rank, **config_dict)
diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py
index cf58e7914972c7b2bc3a3a83b262554bdf2b9191..2bf4e1feb7034d3adcf33201a6a0f84b418928f7 100644
--- a/vllm/distributed/kv_transfer/__init__.py
+++ b/vllm/distributed/kv_transfer/__init__.py
@@ -2,12 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_transfer_state import (
-    KVConnectorBaseType, ensure_kv_transfer_initialized,
-    ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group,
-    is_v1_kv_transfer_group)
+    KVConnectorBaseType,
+    ensure_kv_transfer_initialized,
+    ensure_kv_transfer_shutdown,
+    get_kv_transfer_group,
+    has_kv_transfer_group,
+    is_v1_kv_transfer_group,
+)
 
 __all__ = [
-    "get_kv_transfer_group", "has_kv_transfer_group",
-    "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized",
-    "ensure_kv_transfer_shutdown", "KVConnectorBaseType"
+    "get_kv_transfer_group",
+    "has_kv_transfer_group",
+    "is_v1_kv_transfer_group",
+    "ensure_kv_transfer_initialized",
+    "ensure_kv_transfer_shutdown",
+    "KVConnectorBaseType",
 ]
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 873f130ed827e5fbb274bdc8ebd7c1d7b24d4166..df871dd7cbe4fc7f7c5d131f286978961e67f150 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -2,20 +2,24 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
-from typing import TYPE_CHECKING, Callable
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Optional, cast
 
-# yapf: disable
-import vllm.envs as envs
 from vllm.distributed.kv_transfer.kv_connector.base import (
-    KVConnectorBase, KVConnectorBaseType)
-from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
+    KVConnectorBase,
+    KVConnectorBaseType,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorRole,
+    supports_hma,
+)
 from vllm.logger import init_logger
-
-# yapf: enable
+from vllm.utils.func_utils import supports_kw
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.config.kv_transfer import KVTransferConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
 logger = init_logger(__name__)
 
@@ -24,8 +28,7 @@ class KVConnectorFactory:
     _registry: dict[str, Callable[[], type[KVConnectorBase]]] = {}
 
     @classmethod
-    def register_connector(cls, name: str, module_path: str,
-                           class_name: str) -> None:
+    def register_connector(cls, name: str, module_path: str, class_name: str) -> None:
         """Register a connector with a lazy-loading module and class name."""
         if name in cls._registry:
             raise ValueError(f"Connector '{name}' is already registered.")
@@ -41,15 +44,28 @@ class KVConnectorFactory:
         cls,
         config: "VllmConfig",
         role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
     ) -> KVConnectorBase:
-        if not envs.VLLM_USE_V1:
-            raise ValueError("Attempting to initialize a V1 Connector, "
-                             f"but found {envs.VLLM_USE_V1=}")
-
         kv_transfer_config = config.kv_transfer_config
-        connector_cls = cls.get_connector_class(kv_transfer_config)
-        logger.info("Creating v1 connector with name: %s and engine_id: %s",
-                    connector_cls.__name__, kv_transfer_config.engine_id)
+        if kv_transfer_config is None:
+            raise ValueError("kv_transfer_config must be set to create a connector")
+        connector_cls, compat_sig = cls._get_connector_class_with_compat(
+            kv_transfer_config
+        )
+
+        # check if the connector supports HMA
+        hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager
+        if hma_enabled and not supports_hma(connector_cls):
+            raise ValueError(
+                f"Connector {connector_cls.__name__} does not support HMA but "
+                f"HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`."
+            )
+
+        logger.info(
+            "Creating v1 connector with name: %s and engine_id: %s",
+            connector_cls.__name__,
+            kv_transfer_config.engine_id,
+        )
         # NOTE(Kuntai): v1 connector is explicitly separated into two roles.
         # Scheduler connector:
         # - Co-locate with scheduler process
@@ -58,23 +74,68 @@ class KVConnectorFactory:
         # - Co-locate with worker process
         # - Should only be used inside the forward context & attention layer
         # We build separately to enforce strict separation
-        return connector_cls(config, role)
+        if compat_sig:
+            # Old signature: __init__(self, vllm_config, role)
+            return connector_cls(config, role)
+        else:
+            # New signature: __init__(self, vllm_config, role, kv_cache_config)
+            return connector_cls(config, role, kv_cache_config)
 
     @classmethod
-    def get_connector_class(
-            cls, kv_transfer_config: "KVTransferConfig"
+    def get_connector_class_by_name(
+        cls, connector_name: str
     ) -> type[KVConnectorBaseType]:
-        """Get the connector class by name."""
+        """Get a registered connector class by name.
+
+        Raises ValueError if the connector is not registered.
+
+        Args:
+            connector_name: Name of the registered connector.
+
+        Returns:
+            The connector class.
+        """
+        if connector_name not in cls._registry:
+            raise ValueError(f"Connector '{connector_name}' is not registered.")
+        return cls._registry[connector_name]()
+
+    @classmethod
+    def _get_connector_class_with_compat(
+        cls, kv_transfer_config: "KVTransferConfig"
+    ) -> tuple[type[KVConnectorBaseType], bool]:
         connector_name = kv_transfer_config.kv_connector
+        if connector_name is None:
+            raise ValueError("Connector name is not set in KVTransferConfig")
+        compat_sig = False
         if connector_name in cls._registry:
             connector_cls = cls._registry[connector_name]()
         else:
             connector_module_path = kv_transfer_config.kv_connector_module_path
             if connector_module_path is None:
-                raise ValueError(
-                    f"Unsupported connector type: {connector_name}")
+                raise ValueError(f"Unsupported connector type: {connector_name}")
             connector_module = importlib.import_module(connector_module_path)
-            connector_cls = getattr(connector_module, connector_name)
+            try:
+                connector_cls = getattr(connector_module, connector_name)
+            except AttributeError as e:
+                raise AttributeError(
+                    f"Class {connector_name} not found in {connector_module_path}"
+                ) from e
+            connector_cls = cast(type[KVConnectorBaseType], connector_cls)
+            if not supports_kw(connector_cls, "kv_cache_config"):
+                compat_sig = True
+                logger.warning(
+                    "Connector %s uses deprecated signature with 2 required arguments. "
+                    "Please update to include kv_cache_config as the second argument.",
+                    connector_cls.__name__,
+                )
+        return connector_cls, compat_sig
+
+    @classmethod
+    def get_connector_class(
+        cls, kv_transfer_config: "KVTransferConfig"
+    ) -> type[KVConnectorBaseType]:
+        """Get the connector class by name."""
+        connector_cls, _ = cls._get_connector_class_with_compat(kv_transfer_config)
         return connector_cls
 
 
@@ -85,29 +146,47 @@ class KVConnectorFactory:
 KVConnectorFactory.register_connector(
     "SharedStorageConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector",
-    "SharedStorageConnector")
+    "SharedStorageConnector",
+)
 
 KVConnectorFactory.register_connector(
     "P2pNcclConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_connector",
-    "P2pNcclConnector")
+    "P2pNcclConnector",
+)
 
 KVConnectorFactory.register_connector(
     "LMCacheConnectorV1",
     "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector",
-    "LMCacheConnectorV1")
+    "LMCacheConnectorV1",
+)
+
+KVConnectorFactory.register_connector(
+    "LMCacheMPConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_mp_connector",
+    "LMCacheMPConnector",
+)
 
 KVConnectorFactory.register_connector(
     "NixlConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector",
-    "NixlConnector")
+    "NixlConnector",
+)
 
 KVConnectorFactory.register_connector(
     "MultiConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector",
-    "MultiConnector")
+    "MultiConnector",
+)
 
 KVConnectorFactory.register_connector(
     "OffloadingConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector",
-    "OffloadingConnector")
+    "OffloadingConnector",
+)
+
+KVConnectorFactory.register_connector(
+    "DecodeBenchConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.decode_bench_connector",
+    "DecodeBenchConnector",
+)
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index efa4c9abf47f798fbbc01654a33b7ca2ce9537f4..b8eb5ea3b493934fd2bf3e62cfa748823f930bda 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -3,33 +3,31 @@
 """
 KV cache helper for store.
 """
-from collections import defaultdict
-from collections.abc import Sequence
-from concurrent.futures import CancelledError, Future
-from typing import Literal, Optional, Union, cast
+
+from typing import TYPE_CHECKING, Literal
 
 import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, get_current_vllm_config
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.logger import init_logger
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 
+if TYPE_CHECKING:
+    from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
+
 logger = init_logger(__name__)
 
 
 class model_aware_kv_ops_helper:
-
     def __init__(self, config: VllmConfig):
         self.is_deepseek_mla = config.model_config.is_deepseek_mla
         self.use_mla_opt = not envs.VLLM_MLA_DISABLE
         self.tp_size = config.parallel_config.tensor_parallel_size
 
     def get_model_args(self, model_executable: torch.nn.Module):
-
         model_config = model_executable.model.config
         self.model_executable = model_executable
         num_heads = int(model_config.num_key_value_heads / self.tp_size)
@@ -46,12 +44,10 @@ class model_aware_kv_ops_helper:
         # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim].
         # For more details, see vllm/v1/attention/backends/mla/common.py.
         if self.is_deepseek_mla and self.use_mla_opt:
-            head_size = model_config.kv_lora_rank + \
-                model_config.qk_rope_head_dim
+            head_size = model_config.kv_lora_rank + model_config.qk_rope_head_dim
             num_heads = 1
         elif self.is_deepseek_mla and not self.use_mla_opt:
-            head_size = model_config.qk_nope_head_dim + \
-                model_config.qk_rope_head_dim
+            head_size = model_config.qk_nope_head_dim + model_config.qk_rope_head_dim
         else:
             head_size = getattr(model_config, "head_dim", None)
             if head_size is None:
@@ -68,16 +64,24 @@ class model_aware_kv_ops_helper:
             value_cache = kv_cache[1].reshape(-1, num_heads, head_size)
         return key_cache, value_cache
 
-    def put_kv_to_cache(self, model_executable: torch.nn.Module, keys, values,
-                        layer, kv_cache, slot_mapping, start_pos, end_pos):
-
+    def put_kv_to_cache(
+        self,
+        model_executable: torch.nn.Module,
+        keys,
+        values,
+        layer,
+        kv_cache,
+        slot_mapping,
+        start_pos,
+        end_pos,
+    ):
         model_config = model_executable.model.config
 
         if self.is_deepseek_mla and self.use_mla_opt:
             layer.self_attn.attn = layer.self_attn.mla_attn
             k_c_normed_k_pe = keys.squeeze(1)
-            k_c_normed = k_c_normed_k_pe[:, :model_config.kv_lora_rank]
-            k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank:]
+            k_c_normed = k_c_normed_k_pe[:, : model_config.kv_lora_rank]
+            k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank :]
             ops.concat_and_cache_mla(
                 k_c_normed.to(kv_cache.device),
                 k_pe.to(kv_cache.device),
@@ -107,35 +111,48 @@ def get_kv_connector_cache_layout():
     kv_config = vllm_config.kv_transfer_config
     if kv_config is not None:
         connector_cls = KVConnectorFactory.get_connector_class(kv_config)
-        required_kvcache_layout = connector_cls.get_required_kvcache_layout(
-            vllm_config)
+        required_kvcache_layout = connector_cls.get_required_kvcache_layout(vllm_config)
         if required_kvcache_layout is not None:
             return required_kvcache_layout
-        logger.info_once("Connectors do not specify a " \
-                         "kv cache layout, defaulting to NHD.")
+        logger.info_once(
+            "Connectors do not specify a kv cache layout, defaulting to NHD."
+        )
     return "NHD"
 
 
 class KVOutputAggregator:
-    """Utility class to aggregate the output of all workers into a single 
+    """Utility class to aggregate the output of all workers into a single
     output corresponding to Rank 0 for scheduler."""
 
-    def __init__(self, world_size: int):
+    def __init__(self, expected_finished_count: int):
         # Complete transfer tracker. Used to track finished requests
         # [req_id -> n_remaining_workers]
-        self._recv_remaining_count = defaultdict[str, int](lambda: world_size)
-        self._send_remaining_count = defaultdict[str, int](lambda: world_size)
+        self._recv_remaining_count = dict[str, int]()
+        self._send_remaining_count = dict[str, int]()
+        self._expected_finished_count = expected_finished_count
+
+    @classmethod
+    def from_connector(cls, connector: "KVConnectorBase", world_size: int):
+        return cls(connector.get_finished_count() or world_size)
+
+    def aggregate(
+        self, outputs: list[ModelRunnerOutput | None], output_rank: int = 0
+    ) -> ModelRunnerOutput | None:
+        if not outputs[output_rank]:
+            return None
 
-    def aggregate(self,
-                  outputs: list[ModelRunnerOutput],
-                  output_rank: int = 0) -> ModelRunnerOutput:
         # Aggregate kv_connector_output from all workers
 
-        def update_finished_set(req_ids: Optional[set[str]],
-                                remaining_count_dict: dict[str, int],
-                                finished_set: set[str]) -> None:
+        def update_finished_set(
+            req_ids: set[str] | None,
+            remaining_count_dict: dict[str, int],
+            finished_set: set[str],
+        ) -> None:
             for req_id in req_ids or ():
-                remaining_count_dict[req_id] -= 1
+                remaining_count = remaining_count_dict.get(
+                    req_id, self._expected_finished_count
+                )
+                remaining_count_dict[req_id] = remaining_count - 1
                 if remaining_count_dict[req_id] == 0:
                     finished_set.add(req_id)
                     del remaining_count_dict[req_id]
@@ -143,88 +160,72 @@ class KVOutputAggregator:
         finished_sending = set[str]()
         finished_recving = set[str]()
         aggregated_kv_connector_stats = None
+        invalid_block_ids = set[int]()
         for model_runner_output in outputs:
-            output = model_runner_output.kv_connector_output
-            if not output:
+            assert model_runner_output is not None
+            kv_output = model_runner_output.kv_connector_output
+            if not kv_output:
                 continue
-            update_finished_set(output.finished_sending,
-                                self._send_remaining_count, finished_sending)
-            update_finished_set(output.finished_recving,
-                                self._recv_remaining_count, finished_recving)
+            # Allow the worker to dynamically update the expected number of
+            # finished sending/recving for new requests.
+            if (
+                kv_output.expected_finished_count > 0
+                and kv_output.expected_finished_count != self._expected_finished_count
+            ):
+                logger.debug(
+                    "Expected finished requests updated from %d to %d",
+                    self._expected_finished_count,
+                    kv_output.expected_finished_count,
+                )
+                self._expected_finished_count = kv_output.expected_finished_count
+
+            update_finished_set(
+                kv_output.finished_sending, self._send_remaining_count, finished_sending
+            )
+            update_finished_set(
+                kv_output.finished_recving, self._recv_remaining_count, finished_recving
+            )
 
             # Aggregate kv_connector_stats from all workers.
             if aggregated_kv_connector_stats is None:
                 # Use the first worker's kv_connector_stats as accumulator.
-                aggregated_kv_connector_stats = output.kv_connector_stats
-            elif kv_connector_stats := output.kv_connector_stats:
+                aggregated_kv_connector_stats = kv_output.kv_connector_stats
+            elif kv_connector_stats := kv_output.kv_connector_stats:
                 if aggregated_kv_connector_stats is None:
                     aggregated_kv_connector_stats = kv_connector_stats
                 else:
-                    assert isinstance(aggregated_kv_connector_stats,
-                                      type(kv_connector_stats))
-                    aggregated_kv_connector_stats = \
+                    assert isinstance(
+                        aggregated_kv_connector_stats, type(kv_connector_stats)
+                    )
+                    aggregated_kv_connector_stats = (
                         aggregated_kv_connector_stats.aggregate(kv_connector_stats)
+                    )
+
+            invalid_block_ids |= kv_output.invalid_block_ids
 
         # select output of the worker specified by output_rank
         output = outputs[output_rank]
 
+        assert output is not None
         output.kv_connector_output = KVConnectorOutput(
             finished_sending=finished_sending or None,
             finished_recving=finished_recving or None,
             kv_connector_stats=aggregated_kv_connector_stats or None,
+            invalid_block_ids=invalid_block_ids,
+            expected_finished_count=self._expected_finished_count,
         )
 
         return output
 
-    def async_aggregate(self,
-                        output_futures: Sequence[Future[ModelRunnerOutput]],
-                        output_rank: int = 0) -> Future[ModelRunnerOutput]:
-        """Takes a list of futures and returns a single future which resolves
-        to the respective list of outputs."""
-        result_future: Future[ModelRunnerOutput] = Future()
-
-        outputs: list[Optional[ModelRunnerOutput]] = [None
-                                                      ] * len(output_futures)
-
-        def make_callback(idx):
-
-            def callback(fut):
-                if result_future.done():
-                    return
-
-                try:
-                    outputs[idx] = fut.result()
-                except CancelledError:
-                    result_future.cancel()
-                except Exception as e:
-                    result_future.set_exception(e)
-
-                # this check assumes io_thread_pool uses a single thread
-                if all(outputs):
-                    result_future.set_result(
-                        self.aggregate(cast(list[ModelRunnerOutput], outputs),
-                                       output_rank))
-
-            return callback
-
-        for i, output_future in enumerate(output_futures):
-            output_future.add_done_callback(make_callback(i))
-
-        return result_future
-
 
 def _make_src_and_dst_indices(
     src_block_ids: list[int],
     dst_block_ids: list[int],
-    src_device: Union[torch.device, str],
-    dst_device: Union[torch.device, str],
+    src_device: torch.device | str,
+    dst_device: torch.device | str,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    src_indices = torch.tensor(src_block_ids,
-                               device=src_device,
-                               dtype=torch.int64)
-    dst_indices = torch.tensor(dst_block_ids,
-                               device=dst_device,
-                               dtype=torch.int64)
+    src_indices = torch.tensor(src_block_ids, device=src_device, dtype=torch.int64)
+    dst_indices = torch.tensor(dst_block_ids, device=dst_device, dtype=torch.int64)
     return src_indices, dst_indices
 
 
@@ -236,9 +237,13 @@ def copy_kv_blocks(
     direction: Literal["h2d", "d2h"],
 ) -> None:
     """Copy kv blocks between different buffers."""
-    if not src_kv_caches or not dst_kv_caches or \
-       not src_block_ids or not dst_block_ids or \
-       len(src_block_ids) != len(dst_block_ids):
+    if (
+        not src_kv_caches
+        or not dst_kv_caches
+        or not src_block_ids
+        or not dst_block_ids
+        or len(src_block_ids) != len(dst_block_ids)
+    ):
         return
 
     src_device = next(iter(src_kv_caches.values())).device
@@ -248,9 +253,11 @@ def copy_kv_blocks(
         src_block_ids=src_block_ids,
         dst_block_ids=dst_block_ids,
         src_device=src_device,
-        dst_device=dst_device)
+        dst_device=dst_device,
+    )
 
     from vllm.platforms import current_platform
+
     if direction == "h2d":
         copy_fn = current_platform.insert_blocks_to_device
     else:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
index f00f31dde915ad0527965a72e9332ec8fb0bd3a9..0e16bc5cc685ce590b555a8227ab7eef5b1c47cd 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
@@ -1,6 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorBase_V1, KVConnectorRole)
+    KVConnectorBase_V1,
+    KVConnectorRole,
+    SupportsHMA,
+    supports_hma,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.decode_bench_connector import (  # noqa E:501
+    DecodeBenchConnector,
+)
 
-__all__ = ["KVConnectorRole", "KVConnectorBase_V1"]
+__all__ = [
+    "KVConnectorRole",
+    "KVConnectorBase_V1",
+    "supports_hma",
+    "SupportsHMA",
+    "DecodeBenchConnector",
+]
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 184d0a62f2c3027dd5095510912c801ba516b38b..f85eb414b2222e9451107abbb2376ecfc594d4a2 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -7,18 +7,19 @@ communication in vLLM v1
 The class provides the following primitives:
     Scheduler-side: runs in the scheduler, binds metadata, which
     is used by the worker-side to load/save KV cache.
-        get_num_new_matched_tokens() - get number of new tokens 
+        get_num_new_matched_tokens() - get number of new tokens
             that exist in the remote KV cache. Might be called multiple
             times for a given request and should be side-effect free.
         update_state_after_alloc() - update KVConnector state after
             temporary buffer alloc by the CacheManager.
         update_connector_output() - update KVConnector state after
             output is received from worker-side connectors.
-        request_finished() - called when a request is finished, with
-            the computed kv cache blocks for the request.
-            Returns whether KV cache should be freed now or will be
-            freed asynchronously and optionally returns KV transfer
-            params.
+        request_finished() - called once when a request is finished,
+            with the computed kv cache blocks for the request.
+            Returns whether KV cache should be freed now or if the
+            connector now assumes responsibility for freeing the
+            the blocks asynchronously. Also optionally returns KV
+            transfer params.
         take_events() - returns new KV events that were collected
             by the connector since the last call.
 
@@ -36,8 +37,8 @@ The class provides the following primitives:
 
 import enum
 from abc import ABC, abstractmethod
-from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Callable, Literal, Optional
+from collections.abc import Callable, Iterable
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
 import torch
 
@@ -50,20 +51,70 @@ if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.distributed.kv_events import KVCacheEvent
     from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-        KVConnectorStats)
+        KVConnectorPromMetrics,
+        KVConnectorStats,
+        PromMetric,
+        PromMetricT,
+    )
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 # s_tensor_list, d_tensor_list, s_indices, d_indices, direction
-CopyBlocksOp = Callable[[
-    dict[str, torch.Tensor], dict[
-        str, torch.Tensor], list[int], list[int], Literal["h2d", "d2h"]
-], None]
+CopyBlocksOp = Callable[
+    [
+        dict[str, torch.Tensor],
+        dict[str, torch.Tensor],
+        list[int],
+        list[int],
+        Literal["h2d", "d2h"],
+    ],
+    None,
+]
 
 logger = init_logger(__name__)
 
 
+class SupportsHMA(ABC):
+    """
+    The class that indicates the corresponding connector supports hybrid memory
+    allocator (HMA).
+    This is required to use the connector together with hybrid memory allocator.
+    """
+
+    @abstractmethod
+    def request_finished_all_groups(
+        self,
+        request: "Request",
+        block_ids: tuple[list[int], ...],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Called exactly once when a request has finished for all kv cache groups,
+        before its blocks are freed for each group.
+
+        NOTE(Kuntai): This function is only supported by connectors that support HMA.
+
+        The connector may assumes responsibility for freeing the blocks
+        asynchronously by returning True.
+
+        Returns:
+            True if the request is being saved/sent asynchronously and blocks
+            should not be freed until the request_id is returned from
+            get_finished().
+            Optional KVTransferParams to be included in the request outputs
+            returned by the engine.
+        """
+        raise NotImplementedError
+
+
+def supports_hma(connector: Any) -> bool:
+    if isinstance(connector, type):
+        return issubclass(connector, SupportsHMA)
+    else:
+        return isinstance(connector, SupportsHMA)
+
+
 class KVConnectorRole(enum.Enum):
     # Connector running in the scheduler process
     SCHEDULER = 0
@@ -72,22 +123,49 @@ class KVConnectorRole(enum.Enum):
     WORKER = 1
 
 
+class KVConnectorHandshakeMetadata(ABC):  # noqa: B024
+    """
+    Metadata used for out of band connector handshake between
+    P/D workers. This needs to serializeable.
+    """
+
+    pass
+
+
 class KVConnectorMetadata(ABC):  # noqa: B024
     """
     Abstract Metadata used to communicate between the
     Scheduler KVConnector and Worker KVConnector.
     """
+
     pass
 
 
 class KVConnectorBase_V1(ABC):
-
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
         logger.warning(
             "Initializing KVConnectorBase_V1. This API is experimental and "
-            "subject to change in the future as we iterate the design.")
-        self._connector_metadata: Optional[KVConnectorMetadata] = None
+            "subject to change in the future as we iterate the design."
+        )
+        self._connector_metadata: KVConnectorMetadata | None = None
         self._vllm_config = vllm_config
+        if vllm_config.kv_transfer_config is not None:
+            self._kv_transfer_config = vllm_config.kv_transfer_config
+        else:
+            raise ValueError("kv_transfer_config must be set for KVConnectorBase_V1")
+        self._kv_cache_config = kv_cache_config
+        if self._kv_cache_config is None:
+            logger.warning(
+                "KVConnectorBase_V1 initialized without kv_cache_config. "
+                "This is deprecated - please update your connector to accept "
+                "kv_cache_config as the third constructor argument and pass it "
+                "to super().__init__()."
+            )
         self._role = role
 
     @property
@@ -98,11 +176,10 @@ class KVConnectorBase_V1(ABC):
     # Worker-side methods
     # ==============================
 
-    def bind_connector_metadata(
-            self, connector_metadata: KVConnectorMetadata) -> None:
+    def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
         """Set the connector metadata from the scheduler.
 
-        This function should be called by the model runner every time 
+        This function should be called by the model runner every time
         before the model execution. The metadata will be used for runtime
         KV cache loading and saving.
 
@@ -114,7 +191,7 @@ class KVConnectorBase_V1(ABC):
     def clear_connector_metadata(self) -> None:
         """Clear the connector metadata.
 
-        This function should be called by the model runner every time 
+        This function should be called by the model runner every time
         after the model execution.
         """
         self._connector_metadata = None
@@ -127,17 +204,24 @@ class KVConnectorBase_V1(ABC):
         Returns:
             ConnectorMetadata: the connector metadata.
         """
-
         # Should only be called while set to valid metadata.
         assert self._connector_metadata is not None
         return self._connector_metadata
 
+    def has_connector_metadata(self) -> bool:
+        """Check whether the connector metadata is currently set.
+
+        Returns:
+            bool: True if connector metadata exists, False otherwise.
+        """
+        return self._connector_metadata is not None
+
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """
         Initialize with the KV caches. Useful for pre-registering the
         KV Caches in the KVConnector (e.g. for NIXL).
 
-        Args: 
+        Args:
             kv_caches: dictionary of layer names, kv cache
         """
         return
@@ -150,8 +234,7 @@ class KVConnectorBase_V1(ABC):
         return
 
     @abstractmethod
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs: Any) -> None:
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
         """
         Start loading the KV cache from the connector to vLLM's paged
         KV buffer. This is called from the forward context before the
@@ -162,9 +245,9 @@ class KVConnectorBase_V1(ABC):
             **kwargs: additional arguments for the load operation
 
         Note:
-            The number of elements in kv_caches and layer_names should be 
+            The number of elements in kv_caches and layer_names should be
             the same.
-            
+
         """
         pass
 
@@ -174,7 +257,7 @@ class KVConnectorBase_V1(ABC):
         Block until the KV for a specific layer is loaded into vLLM's
         paged buffer. This is called from within attention layer to ensure
         async copying from start_load_kv is complete.
-        
+
         This interface will be useful for layer-by-layer pipelining.
 
         Args:
@@ -183,17 +266,21 @@ class KVConnectorBase_V1(ABC):
         pass
 
     @abstractmethod
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata",
-                      **kwargs: Any) -> None:
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs: Any,
+    ) -> None:
         """
-        Start saving a layer of KV cache from vLLM's paged buffer 
+        Start saving a layer of KV cache from vLLM's paged buffer
         to the connector. This is called from within attention layer to
         enable async copying during execution.
 
         Args:
             layer_name (str): the name of the layer.
-            kv_layer (torch.Tensor): the paged KV buffer of the current 
+            kv_layer (torch.Tensor): the paged KV buffer of the current
                 layer in vLLM.
             attn_metadata (AttentionMetadata): the attention metadata.
             **kwargs: additional arguments for the save operation.
@@ -213,7 +300,7 @@ class KVConnectorBase_V1(ABC):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens on the worker.
@@ -229,6 +316,26 @@ class KVConnectorBase_V1(ABC):
         """
         return None, None
 
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        """
+        Get the set of block IDs that failed to load.
+
+        Returns:
+            Set of block IDs that encountered load errors.
+            Empty set if no load errors occurred.
+
+        Notes:
+            - Applies to both sync- and async-loading requests.
+            - Async loading: failed blocks may be reported in any forward pass
+              up to and including the pass where the request ID is returned by
+              `get_finished()`. Even if failures occur, the request must still
+              be reported via `get_finished()`, and the failed block IDs must
+              appear here no later than that same pass.
+            - Sync loading: failed blocks should be reported in the forward
+              pass in which they are detected.
+        """
+        return set()
+
     def shutdown(self):
         """
         Shutdown the connector. This is called when the worker process
@@ -243,6 +350,18 @@ class KVConnectorBase_V1(ABC):
         """
         return None
 
+    def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
+        """
+        Get the KVConnector handshake metadata for this connector.
+        This metadata is used for out-of-band connector handshake
+        between P/D workers.
+
+        Returns:
+            KVConnectorHandshakeMetadata: the handshake metadata.
+            None if no handshake metadata is available.
+        """
+        return None
+
     # ==============================
     # Scheduler-side methods
     # ==============================
@@ -252,11 +371,11 @@ class KVConnectorBase_V1(ABC):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
-        
+
         Args:
             request (Request): the request object.
             num_computed_tokens (int): the number of locally
@@ -264,21 +383,28 @@ class KVConnectorBase_V1(ABC):
 
         Returns:
             A tuple with the following elements:
-                - An optional number of tokens that can be loaded from the 
-                  external KV cache beyond what is already computed. 
+                - An optional number of tokens that can be loaded from the
+                  external KV cache beyond what is already computed.
                   If None, it means that the connector needs more time to
                   determine the number of matched tokens, and the scheduler
                   should query for this request again later.
                 - `True` if external KV cache tokens will be loaded
                   asynchronously (between scheduler steps). Must be
                   'False' if the first element is 0.
+
+        Notes:
+            The connector should only consider the largest prefix of prompt-
+            tokens for which KV cache is actually available at the time of the
+            call. If the cache cannot be loaded for some tokens (e.g., due to
+            connectivity issues or eviction), those tokens must not be taken
+            into account.
         """
         pass
 
     @abstractmethod
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         """
         Update KVConnector state after block allocation.
 
@@ -298,7 +424,8 @@ class KVConnectorBase_V1(ABC):
 
     @abstractmethod
     def build_connector_meta(
-            self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
         """
         Build the connector metadata for this step.
 
@@ -324,9 +451,13 @@ class KVConnectorBase_V1(ABC):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
-        Called when a request has finished, before its blocks are freed.
+        Called exactly once when a request has finished, before its blocks are
+        freed.
+
+        The connector may assumes responsibility for freeing the blocks
+        asynchronously by returning True.
 
         Returns:
             True if the request is being saved/sent asynchronously and blocks
@@ -347,8 +478,7 @@ class KVConnectorBase_V1(ABC):
         return ()
 
     @classmethod
-    def get_required_kvcache_layout(
-            cls, vllm_config: "VllmConfig") -> Optional[str]:
+    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None:
         """
         Get the required KV cache layout for this connector.
         Args:
@@ -360,14 +490,17 @@ class KVConnectorBase_V1(ABC):
         """
 
         if cls is KVConnectorBase_V1:
-            raise TypeError("get_required_kvcache_layout should not be called "
-                            "on the abstract base class")
+            raise TypeError(
+                "get_required_kvcache_layout should not be called "
+                "on the abstract base class"
+            )
         return None
 
-    def get_finished_count(self) -> Optional[int]:
+    def get_finished_count(self) -> int | None:
         """
         Get the count of requests expected to complete send/receive operations
-        via this connector.
+        via this connector. This method is used to initialize the
+        KVOutputAggregator, overwriting the default world_size.
 
         Returns:
             int: expected sending or receiving completion count.
@@ -377,12 +510,37 @@ class KVConnectorBase_V1(ABC):
 
     @classmethod
     def build_kv_connector_stats(
-            cls,
-            data: Optional[dict[str,
-                                Any]] = None) -> Optional["KVConnectorStats"]:
+        cls, data: dict[str, Any] | None = None
+    ) -> Optional["KVConnectorStats"]:
         """
-        KVConnectorStats resolution method. This method allows dynamically 
+        KVConnectorStats resolution method. This method allows dynamically
         registered connectors to return their own KVConnectorStats object,
         which can implement custom aggregation logic on the data dict.
         """
         return None
+
+    def set_xfer_handshake_metadata(
+        self, metadata: dict[int, KVConnectorHandshakeMetadata]
+    ) -> None:
+        """
+        Set the KV connector handshake metadata for this connector.
+
+        Args:
+            metadata (KVConnectorHandshakeMetadata): the handshake metadata to set.
+        """
+        return None
+
+    @classmethod
+    def build_prom_metrics(
+        cls,
+        vllm_config: "VllmConfig",
+        metric_types: dict[type["PromMetric"], type["PromMetricT"]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ) -> Optional["KVConnectorPromMetrics"]:
+        """
+        Create a KVConnectorPromMetrics subclass which should register
+        per-connector Prometheus metrics and implement observe() to
+        expose connector transfer stats via Prometheus.
+        """
+        return None
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd7d93c92fa3c4598c3d2fca817f49baba7f96c
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -0,0 +1,419 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+DecodeBenchConnector: A KV Connector for decode instance performance testing.
+
+This connector emulates a prefill-decode disaggregated setting by filling
+the KV cache with dummy values, allowing measurement of decoder performance
+under larger input sequence lengths (ISL) in resource-limited environments.
+
+Usage:
+    To use this connector for benchmarking, configure it in the kv_transfer_config:
+
+    Example:
+        vllm serve <model> --kv-transfer-config '{
+            "kv_connector": "DecodeBenchConnector",
+            "kv_role": "kv_both",
+            "kv_connector_extra_config": {
+                "fill_mean": 0.015,
+                "fill_std": 0.0
+            }
+        }'
+
+    Then run your benchmark with desired input/output lengths:
+        vllm bench serve --base-url http://127.0.0.1:8000 --model <model> \\
+            --dataset-name random --random-input-len 40000 \\
+            --random-output-len 100 --max-concurrency 10
+
+    Configuration options (via kv_connector_extra_config):
+        - fill_mean (float): Mean value for random normal fill (default: 0.015)
+        - fill_std (float): Standard deviation for random fill (default: 0.0)
+          Set to 0 for constant values, >0 for random sampling
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorBase_V1,
+    KVConnectorRole,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
+from vllm.logger import init_logger
+from vllm.utils.math_utils import cdiv
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.config import VllmConfig
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class DecodeBenchConnectorMetadata(KVConnectorMetadata):
+    """Metadata for DecodeBenchConnector.
+
+    Contains information about which requests need their KV cache filled
+    with dummy values for benchmarking purposes.
+    """
+
+    # request_id -> (block_ids_per_group, num_tokens_to_fill)
+    # block_ids_per_group is a tuple of lists, one per KV cache group
+    # For standard attention: single group, e.g., ([1, 2, 3],)
+    # For MLA: multiple groups, e.g., ([1, 2], [1, 2])
+    reqs_to_fill: dict[str, tuple[tuple[list[int], ...], int]]
+
+
+class DecodeBenchConnector(KVConnectorBase_V1):
+    """
+    A KV Connector for decode instance performance testing.
+
+    This connector fills the KV cache with dummy (non-zero) values to
+    emulate a prefill-decode disaggregated setting, enabling performance
+    testing of the decoder with larger input sequence lengths.
+    """
+
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
+
+        self.connector_scheduler: DecodeBenchConnectorScheduler | None = None
+        self.connector_worker: DecodeBenchConnectorWorker | None = None
+
+        if role == KVConnectorRole.SCHEDULER:
+            self.connector_scheduler = DecodeBenchConnectorScheduler(vllm_config)
+        elif role == KVConnectorRole.WORKER:
+            self.connector_worker = DecodeBenchConnectorWorker(vllm_config)
+
+    # ==============================
+    # Worker-side methods
+    # ==============================
+
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        assert self.connector_worker is not None
+        self.connector_worker.register_kv_caches(kv_caches)
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
+        assert self.connector_worker is not None
+        assert isinstance(self._connector_metadata, DecodeBenchConnectorMetadata)
+        self.connector_worker.start_fill_kv(self._connector_metadata)
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        # All operations are synchronous, so nothing to wait for
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs: Any,
+    ) -> None:
+        # This connector doesn't save KV cache (benchmarking only)
+        pass
+
+    def wait_for_save(self):
+        # This connector doesn't save KV cache (benchmarking only)
+        pass
+
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[int | None, bool]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.get_num_new_matched_tokens(
+            request, num_computed_tokens
+        )
+
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.update_state_after_alloc(
+            request, blocks, num_external_tokens
+        )
+
+    def build_connector_meta(
+        self, scheduler_output: "SchedulerOutput"
+    ) -> KVConnectorMetadata:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.build_connector_meta(scheduler_output)
+
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        assert self.connector_scheduler is not None
+        self.connector_scheduler.request_finished(request)
+        return False, None
+
+
+class DecodeBenchConnectorScheduler:
+    """Scheduler-side implementation for DecodeBenchConnector."""
+
+    def __init__(self, vllm_config: "VllmConfig"):
+        self.vllm_config = vllm_config
+        self.block_size = vllm_config.cache_config.block_size
+
+        # Track which requests have already been filled
+        self._filled_requests: set[str] = set()
+
+        # Track pending fills for the current scheduler step
+        # request_id -> (block_ids_per_group, num_tokens_to_fill)
+        # Note: _pending_fills doesn't need explicit cleanup - it's cleared
+        # after build_connector_meta() is called in the same scheduler step
+        self._pending_fills: dict[str, tuple[tuple[list[int], ...], int]] = {}
+
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[int, bool]:
+        """
+        For new requests, return the number of tokens that should be filled
+        with dummy KV cache values.
+
+        Returns:
+            (num_tokens_to_fill, is_async)
+            - num_tokens_to_fill: number of uncomputed tokens minus 1
+                (we fill everything except the last token for decode)
+            - is_async: False (synchronous filling)
+        """
+        req_id = request.request_id
+
+        # Only fill once per request on first scheduling
+        if req_id in self._filled_requests:
+            return 0, False
+
+        # Calculate how many tokens we need to fill
+        # Fill all uncomputed tokens except the last one (which will be decoded)
+        # This simulates having processed a long prefill
+        num_uncomputed_tokens = request.num_tokens - num_computed_tokens
+        num_tokens_to_fill = max(0, num_uncomputed_tokens - 1)
+
+        if num_tokens_to_fill == 0:
+            return 0, False
+
+        # Return False for synchronous operation - the fill is fast enough
+        # that async overhead isn't worth it
+        return num_tokens_to_fill, False
+
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        """
+        Called after blocks are allocated. Store the block IDs so we can
+        fill them with dummy values.
+
+        Supports both standard attention (single KV cache group) and MLA
+        (multiple KV cache groups).
+        """
+        req_id = request.request_id
+
+        if num_external_tokens == 0:
+            return
+
+        # Get the block IDs that were allocated
+        # block_groups is a tuple of lists, one per KV cache group
+        # For standard attention: 1 group
+        # For MLA: multiple groups (one per attention type)
+        block_groups = blocks.get_block_ids()
+
+        # Calculate how many blocks we need to fill
+        # num_external_tokens are the tokens we said we'd provide
+        num_blocks_to_fill = cdiv(num_external_tokens, self.block_size)
+
+        # Extract the first num_blocks_to_fill blocks from each group
+        # All groups should have the same block IDs for the same request
+        block_ids_per_group = tuple(
+            group_blocks[:num_blocks_to_fill] for group_blocks in block_groups
+        )
+
+        # Store the blocks to fill for all group. _pending_fills doesn't need cleanup
+        # as it's cleared after build_connector_meta
+        self._pending_fills[req_id] = (
+            block_ids_per_group,
+            num_external_tokens,
+        )
+        self._filled_requests.add(req_id)
+
+        logger.debug(
+            "DecodeBenchConnector: Allocated %d blocks across %d KV cache groups "
+            "for request %s",
+            num_blocks_to_fill,
+            len(block_groups),
+            req_id,
+        )
+
+    def build_connector_meta(
+        self, scheduler_output: "SchedulerOutput"
+    ) -> KVConnectorMetadata:
+        """
+        Build metadata containing information about which blocks to fill
+        with dummy KV values.
+        """
+        meta = DecodeBenchConnectorMetadata(reqs_to_fill=self._pending_fills.copy())
+
+        # Clear pending fills after building metadata
+        self._pending_fills.clear()
+
+        return meta
+
+    def request_finished(self, request: "Request"):
+        """
+        Called when a request has finished. Clean up any state.
+        """
+        self._filled_requests.discard(request.request_id)
+
+
+class DecodeBenchConnectorWorker:
+    """Worker-side implementation for DecodeBenchConnector."""
+
+    def __init__(self, vllm_config: "VllmConfig"):
+        self.vllm_config = vllm_config
+        self.block_size = vllm_config.cache_config.block_size
+
+        # Get fill parameters from extra config
+        kv_transfer_config = vllm_config.kv_transfer_config
+        assert kv_transfer_config is not None
+        self.fill_mean = kv_transfer_config.get_from_extra_config("fill_mean", 0.015)
+        self.fill_std = kv_transfer_config.get_from_extra_config("fill_std", 0.0)
+
+        # Will be populated via register_kv_caches
+        self.kv_caches: dict[str, torch.Tensor] | None = None
+
+        # Mapping from KV cache group index to list of layer names in that group
+        self.group_to_layers: dict[int, list[str]] | None = None
+
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        """Store references to the KV cache tensors and build group mapping."""
+        self.kv_caches = kv_caches
+
+        # For simplicity, assume all layers belong to group 0 (standard attention)
+        # For MLA models with multiple groups, the metadata will handle the mapping
+        # We just need to fill the blocks specified in the metadata
+        self.group_to_layers = {0: list(kv_caches.keys())}
+
+        logger.debug(
+            "DecodeBenchConnector: Registered %d KV cache layers",
+            len(kv_caches),
+        )
+
+    def start_fill_kv(self, metadata: DecodeBenchConnectorMetadata):
+        """
+        Fill the allocated KV cache blocks with dummy (non-zero) values.
+
+        This simulates having a populated KV cache from a prefill phase,
+        allowing decode performance testing with larger context sizes.
+
+        Supports both standard attention (single group) and MLA (multiple groups).
+        """
+        if not metadata.reqs_to_fill:
+            return
+
+        assert self.kv_caches is not None, "KV caches must be registered before filling"
+        assert self.group_to_layers is not None, "Group mapping must be initialized"
+
+        for req_id, (block_ids_per_group, num_tokens) in metadata.reqs_to_fill.items():
+            # Fill blocks for each KV cache group
+            for group_idx, block_ids in enumerate(block_ids_per_group):
+                self._fill_blocks(group_idx, block_ids, num_tokens)
+
+            logger.debug(
+                "DecodeBenchConnector: Filled %d blocks (%d tokens) across %d groups "
+                "for request %s",
+                len(block_ids_per_group[0]) if block_ids_per_group else 0,
+                num_tokens,
+                len(block_ids_per_group),
+                req_id,
+            )
+
+    def _fill_blocks(self, group_idx: int, block_ids: list[int], num_tokens: int):
+        """
+        Fill specified blocks with dummy non-zero values for a specific KV cache group.
+
+        Args:
+            group_idx: The KV cache group index to fill
+            block_ids: List of block IDs to fill in this group
+            num_tokens: Total number of tokens to fill across these blocks
+        """
+        if not block_ids:
+            return
+
+        assert self.kv_caches is not None
+        assert self.group_to_layers is not None
+
+        # Get the layers that belong to this group
+        layer_names = self.group_to_layers.get(group_idx, [])
+
+        # Fill only the layers in this group
+        for layer_name in layer_names:
+            if layer_name not in self.kv_caches:
+                logger.warning(
+                    "DecodeBenchConnector: Layer %s not found in KV caches", layer_name
+                )
+                continue
+
+            kv_cache = self.kv_caches[layer_name]
+
+            # Convert block_ids to tensor on device
+            block_ids_tensor = torch.tensor(
+                block_ids, dtype=torch.long, device=kv_cache.device
+            )
+
+            # Filter invalid block IDs
+            valid_mask = block_ids_tensor < kv_cache.shape[0]
+            valid_block_ids = block_ids_tensor[valid_mask]
+
+            if len(valid_block_ids) == 0:
+                continue
+
+            # Create fill values - either constant or random
+            block_shape = kv_cache.shape[1:]
+            if self.fill_std > 0:
+                # Random normal sampling
+                fill_values = torch.normal(
+                    mean=self.fill_mean,
+                    std=self.fill_std,
+                    size=(len(valid_block_ids),) + block_shape,
+                    dtype=kv_cache.dtype,
+                    device=kv_cache.device,
+                )
+            else:
+                # Constant fill value
+                fill_values = torch.full(
+                    (len(valid_block_ids),) + block_shape,
+                    self.fill_mean,
+                    dtype=kv_cache.dtype,
+                    device=kv_cache.device,
+                )
+
+            # Batch fill operation
+            kv_cache[valid_block_ids] = fill_values
+
+        logger.debug(
+            "DecodeBenchConnector: Filled %d blocks in group %d with %s values "
+            "(mean=%.3f, std=%.3f)",
+            len(block_ids),
+            group_idx,
+            "random" if self.fill_std > 0 else "constant",
+            self.fill_mean,
+            self.fill_std,
+        )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 2b0abe983fbb3416d2434834a94d5586fdf80c96..0c24a53fb754b538fb9e75a5bd93cbb1bc210291 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -1,13 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
-from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl
+from lmcache.integration.vllm.vllm_v1_adapter import (
+    LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
+)
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
 
@@ -15,22 +20,44 @@ if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 
 
 class LMCacheConnectorV1(KVConnectorBase_V1):
-
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
-        self._lmcache_engine = LMCacheConnectorV1Impl(vllm_config, role, self)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
+        assert vllm_config.kv_transfer_config is not None
+        use_native = vllm_config.kv_transfer_config.get_from_extra_config(
+            "use_native", False
+        )
+        if use_native:
+            logger.info("Initializing native LMCache connector")
+            # lazy import
+            from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration
+
+            _adapter = lmcache_integration.vllm_v1_adapter
+
+            cls = _adapter.LMCacheConnectorV1Impl
+        else:
+            logger.info("Initializing latest dev LMCache connector")
+            cls = LMCacheConnectorLatestImpl
+
+        self._lmcache_engine = cls(vllm_config, role, self)
 
     # ==============================
     # Worker-side methods
     # ==============================
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs: Any) -> None:
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
         """
         Start loading the KV cache from the connector to vLLM's paged
         KV buffer. This is called from the forward context before the
@@ -41,9 +68,9 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
             **kwargs: additional arguments for the load operation
 
         Note:
-            The number of elements in kv_caches and layer_names should be 
+            The number of elements in kv_caches and layer_names should be
             the same.
-            
+
         """
         self._lmcache_engine.start_load_kv(forward_context, **kwargs)
 
@@ -52,7 +79,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         Block until the KV for a specific layer is loaded into vLLM's
         paged buffer. This is called from within attention layer to ensure
         async copying from start_load_kv is complete.
-        
+
         This interface will be useful for layer-by-layer pipelining.
 
         Args:
@@ -60,23 +87,28 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         """
         self._lmcache_engine.wait_for_layer_load(layer_name)
 
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata",
-                      **kwargs: Any) -> None:
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs: Any,
+    ) -> None:
         """
-        Start saving the a layer of KV cache from vLLM's paged buffer 
+        Start saving the a layer of KV cache from vLLM's paged buffer
         to the connector. This is called from within attention layer to
         enable async copying during execution.
 
         Args:
             layer_name (str): the name of the layer.
-            kv_layer (torch.Tensor): the paged KV buffer of the current 
+            kv_layer (torch.Tensor): the paged KV buffer of the current
                 layer in vLLM.
             attn_metadata (AttentionMetadata): the attention metadata.
             **kwargs: additional arguments for the save operation.
         """
-        self._lmcache_engine.save_kv_layer(layer_name, kv_layer, attn_metadata,
-                                           **kwargs)
+        self._lmcache_engine.save_kv_layer(
+            layer_name, kv_layer, attn_metadata, **kwargs
+        )
 
     def wait_for_save(self):
         """
@@ -90,7 +122,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
@@ -104,6 +136,21 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         """
         return self._lmcache_engine.get_finished(finished_req_ids)
 
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        """
+        Get the set of block IDs that failed to load.
+
+        Returns:
+            Set of block IDs that encountered load errors.
+            Empty set if no load errors occurred.
+        """
+        method = getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)
+        if callable(method):
+            return method()
+
+        # Fallback for older versions that don't support this method
+        return set()
+
     # ==============================
     # Scheduler-side methods
     # ==============================
@@ -111,34 +158,35 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
-        
+
         Args:
             request (Request): the request object.
             num_computed_tokens (int): the number of locally
                 computed tokens for this request
 
         Returns:
-            the number of tokens that can be loaded from the 
+            the number of tokens that can be loaded from the
             external KV cache beyond what is already computed.
         """
         return self._lmcache_engine.get_num_new_matched_tokens(
-            request, num_computed_tokens), False
+            request, num_computed_tokens
+        ), False
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         """
         Update KVConnector state after block allocation.
         """
-        self._lmcache_engine.update_state_after_alloc(request,
-                                                      num_external_tokens)
+        self._lmcache_engine.update_state_after_alloc(request, num_external_tokens)
 
     def build_connector_meta(
-            self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
         """
         Build the connector metadata for this step.
 
@@ -154,7 +202,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..07e05cc8f8932399667c8c2e7690d62060f7d693
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from . import multi_process_adapter, vllm_v1_adapter
+from .multi_process_adapter import (
+    LMCacheMPSchedulerAdapter,
+    LMCacheMPWorkerAdapter,
+    LoadStoreOp,
+)
+
+__all__ = [
+    "vllm_v1_adapter",
+    "multi_process_adapter",
+    "LMCacheMPSchedulerAdapter",
+    "LMCacheMPWorkerAdapter",
+    "LoadStoreOp",
+]
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2eeed9f6b8a0fd836ae5b453ba70b814e8e94d
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py
@@ -0,0 +1,379 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from itertools import islice
+from typing import Any
+
+import torch
+import zmq
+from lmcache.utils import _lmcache_nvtx_annotate, init_logger
+from lmcache.v1.multiprocess.custom_types import (
+    CudaIPCWrapper,
+    IPCCacheEngineKey,
+    KVCache,
+)
+from lmcache.v1.multiprocess.mq import MessageQueueClient, MessagingFuture
+from lmcache.v1.multiprocess.protocol import RequestType, get_response_class
+
+logger = init_logger(__name__)
+
+
+def wrap_kv_caches(kv_caches: dict[str, KVCache]) -> KVCache:
+    logger.info("KV caches keys are %s", list(kv_caches.keys()))
+    return [CudaIPCWrapper(tensor) for tensor in kv_caches.values()]
+
+
+def send_lmcache_request(
+    mq_client: MessageQueueClient,
+    request_type: RequestType,
+    payloads: list[Any],
+) -> MessagingFuture[Any]:
+    future = mq_client.submit_request(
+        request_type, payloads, get_response_class(request_type)
+    )
+    return future
+
+
+def get_lmcache_chunk_size(
+    mq_client: MessageQueueClient,
+) -> int:
+    future = send_lmcache_request(mq_client, RequestType.GET_CHUNK_SIZE, [])
+    chunk_size = future.result()
+    return chunk_size
+
+
+def striding_block_hashes(
+    block_hashes: list[bytes],
+    blocks_in_chunk,
+) -> Iterable[bytes]:
+    """Striding the block hashes to get the block hashes for each chunk.
+    For example, if blocks_in_chunk is 16, then we will get the block hashes
+    for the 16th, 32nd, 48th, ... blocks.
+    """
+    return islice(block_hashes, blocks_in_chunk - 1, None, blocks_in_chunk)
+
+
+@dataclass
+class LoadStoreOp:
+    block_hashes: list[bytes]
+    block_ids: list[int]
+
+    def __len__(self) -> int:
+        return len(self.block_hashes)
+
+    def __post_init__(self):
+        assert len(self.block_hashes) == len(self.block_ids), (
+            "The number of block hashes should be equal to the number of block ids "
+            f"But got {len(self.block_hashes)} and {len(self.block_ids)}"
+        )
+
+
+StoreResult = bool
+RetrieveResult = list[bool]
+LookupResult = list[bool]
+
+
+class LMCacheMPSchedulerAdapter:
+    def __init__(
+        self,
+        server_url: str,
+        context: zmq.Context,
+        model_name: str,
+        world_size: int,
+        kv_rank: int,
+        vllm_block_size: int,
+    ):
+        """
+        Args:
+            server_url: The server URL for the LMCache message queue
+            context: The ZMQ context
+
+            model_name: The model name used for LMCache keys
+            world_size: The world size used for LMCache keys
+            kv_rank: The kv rank used for LMCache keys
+            vllm_block_size: The block size used in vLLM
+        """
+        self.mq_client = MessageQueueClient(server_url, context)
+
+        # Request futures
+        self.lookup_futures: dict[str, MessagingFuture[LookupResult]] = {}
+
+        self.model_name = model_name
+        self.world_size = world_size
+        self.worker_id = kv_rank
+
+        # Read chunk size from lmcache
+        self.chunk_size = get_lmcache_chunk_size(self.mq_client)
+        assert self.chunk_size % vllm_block_size == 0, (
+            "LMCache chunk size should be a multiple of vLLM block size"
+        )
+        self.blocks_in_chunk = self.chunk_size // vllm_block_size
+
+    @_lmcache_nvtx_annotate
+    def maybe_submit_lookup_request(self, request_id: str, block_hashes: list[bytes]):
+        if request_id in self.lookup_futures:
+            # Skip if there is already a lookup request
+            return
+
+        s = striding_block_hashes(block_hashes, self.blocks_in_chunk)
+        keys = [self._create_key(block_hash) for block_hash in s]
+        future = send_lmcache_request(
+            self.mq_client,
+            RequestType.LOOKUP,
+            [keys, True],
+        )
+        self.lookup_futures[request_id] = future
+
+    @_lmcache_nvtx_annotate
+    def check_lookup_result(self, request_id: str) -> int | None:
+        assert request_id in self.lookup_futures, (
+            f"Lookup request for request_id={request_id} has not been submitted"
+        )
+
+        future = self.lookup_futures[request_id]
+        if not future.query():
+            return None
+
+        result = future.result()
+        num_chunks = sum(result)
+        return num_chunks * self.chunk_size
+
+    def num_blocks_per_chunk(self) -> int:
+        """
+        Returns:
+            The number of vllm blocks in a LMCache data chunk
+        """
+        return self.blocks_in_chunk
+
+    # Helper functions
+    def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey:
+        """Convert a block hash to an IPC cache engine key"""
+        return IPCCacheEngineKey(
+            model_name=self.model_name,
+            world_size=self.world_size,
+            worker_id=self.worker_id,
+            chunk_hash=block_hash,
+        )
+
+
+class LMCacheMPWorkerAdapter:
+    def __init__(
+        self,
+        server_url: str,
+        context: zmq.Context,
+        model_name: str,
+        world_size: int,
+        kv_rank: int,
+        vllm_block_size: int,
+    ):
+        self.mq_client = MessageQueueClient(server_url, context)
+
+        # Instance id for GPU worker
+        self.instance_id = os.getpid()
+
+        # Registered kv caches from vLLM
+        self.kv_caches: dict[str, torch.Tensor] = {}
+
+        # Request futures
+        # request_id -> (future, other merged requests)
+        self.store_futures: dict[
+            str, tuple[MessagingFuture[StoreResult], list[str]]
+        ] = {}
+        self.retrieve_futures: dict[
+            str, tuple[MessagingFuture[RetrieveResult], list[str]]
+        ] = {}
+
+        self.finished_stores: set[str] = set()
+        self.previously_finished: set[str] = set()
+
+        self.model_name = model_name
+        self.world_size = world_size
+        self.worker_id = kv_rank
+
+        # Read chunk size from lmcache
+        chunk_size = get_lmcache_chunk_size(self.mq_client)
+        assert chunk_size % vllm_block_size == 0, (
+            "LMCache chunk size should be a multiple of vLLM block size"
+        )
+        self.blocks_in_chunk = chunk_size // vllm_block_size
+
+    def register_kv_caches(self, kv_caches: dict[str, KVCache]):
+        # Register kv cache and send the request
+        self.kv_caches = kv_caches
+        logger.info("Registering kv caches")
+        future = send_lmcache_request(
+            self.mq_client,
+            RequestType.REGISTER_KV_CACHE,
+            [self.instance_id, wrap_kv_caches(kv_caches)],
+        )
+        future.result()
+
+    @_lmcache_nvtx_annotate
+    def submit_store_request(
+        self, request_id: str, op: LoadStoreOp, event: torch.cuda.Event
+    ):
+        keys = self._block_hashes_to_keys(op.block_hashes)
+        future = send_lmcache_request(
+            self.mq_client,
+            RequestType.STORE,
+            [keys, self.instance_id, op.block_ids, event.ipc_handle()],
+        ).to_cuda_future()
+        self.store_futures[request_id] = (future, [])
+
+    @_lmcache_nvtx_annotate
+    def submit_retrieve_request(
+        self, request_id: str, op: LoadStoreOp, event: torch.cuda.Event
+    ):
+        keys = self._block_hashes_to_keys(op.block_hashes)
+        future = send_lmcache_request(
+            self.mq_client,
+            RequestType.RETRIEVE,
+            [keys, self.instance_id, op.block_ids, event.ipc_handle()],
+        ).to_cuda_future()
+        self.retrieve_futures[request_id] = (future, [])
+
+    @_lmcache_nvtx_annotate
+    def batched_submit_store_requests(
+        self,
+        request_ids: list[str],
+        ops: list[LoadStoreOp],
+        event: torch.cuda.Event,
+    ):
+        keys = []
+        block_ids = []
+        for op in ops:
+            keys.extend(self._block_hashes_to_keys(op.block_hashes))
+            block_ids.extend(op.block_ids)
+        future = send_lmcache_request(
+            self.mq_client,
+            RequestType.STORE,
+            [keys, self.instance_id, block_ids, event.ipc_handle()],
+        ).to_cuda_future()
+        self.store_futures[request_ids[0]] = (future, request_ids[1:])
+
+    @_lmcache_nvtx_annotate
+    def batched_submit_retrieve_requests(
+        self,
+        request_ids: list[str],
+        ops: list[LoadStoreOp],
+        event: torch.cuda.Event,
+    ):
+        keys = []
+        block_ids = []
+        for op in ops:
+            keys.extend(self._block_hashes_to_keys(op.block_hashes))
+            block_ids.extend(op.block_ids)
+        future = send_lmcache_request(
+            self.mq_client,
+            RequestType.RETRIEVE,
+            [keys, self.instance_id, block_ids, event.ipc_handle()],
+        ).to_cuda_future()
+        self.retrieve_futures[request_ids[0]] = (future, request_ids[1:])
+
+    @_lmcache_nvtx_annotate
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        finished_stores = set()
+        finished_retrieves = set()
+        for request_id, (future, other_reqs) in self.store_futures.items():
+            if not future.query():
+                continue
+
+            result = future.result()
+            finished_stores.add(request_id)
+            finished_stores.update(other_reqs)
+
+            if not result:
+                # TODO: add error handling here
+                logger.error(
+                    "Something went wrong when processing the "
+                    "store request for request_id=%s",
+                    request_id,
+                )
+
+        for request_id, (future, other_reqs) in self.retrieve_futures.items():
+            if not future.query():
+                continue
+
+            result = future.result()
+            finished_retrieves.add(request_id)
+            finished_retrieves.update(other_reqs)
+
+            if not all(result):
+                # TODO: add error handing here
+                logger.error(
+                    "Something went wrong when processing the "
+                    "retrieve request for request_id=%s, result=%s",
+                    request_id,
+                    result,
+                )
+            logger.info("Retrieve request for request_id=%s finished", request_id)
+
+        # Remove the finished requests from the tracking dicts
+        for request_id in finished_stores:
+            self.store_futures.pop(request_id, None)
+        for request_id in finished_retrieves:
+            self.retrieve_futures.pop(request_id, None)
+
+        # Update the internal states
+        self.finished_stores.update(finished_stores)
+
+        ret_stores = set()
+        for req_id in finished_req_ids:
+            if req_id in self.finished_stores or req_id in self.store_futures:
+                self.previously_finished.add(req_id)
+            else:
+                ret_stores.add(req_id)
+
+        # Calculate the final finished stores
+        ret_stores.update(self._update_and_get_finished_store())
+
+        return ret_stores, finished_retrieves
+
+    def num_blocks_per_chunk(self) -> int:
+        """
+        Returns:
+            The number of vllm blocks in a LMCache data chunk
+        """
+        return self.blocks_in_chunk
+
+    def shutdown(self):
+        # Unregister kv cache
+        logger.info("Unregistering kv caches")
+        send_lmcache_request(
+            self.mq_client, RequestType.UNREGISTER_KV_CACHE, [self.instance_id]
+        ).result()
+
+        self.mq_client.close()
+
+    # Helper functions
+    def _update_and_get_finished_store(
+        self,
+    ) -> set[str]:
+        """Converge the internal states about finished stores
+        and returns the 'safe finished store request ids' back
+        """
+        safe_finished_s = self.finished_stores.intersection(self.previously_finished)
+        self.finished_stores.difference_update(self.previously_finished)
+        self.previously_finished.difference_update(safe_finished_s)
+
+        return safe_finished_s
+
+    def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey:
+        """Convert a block hash to an IPC cache engine key"""
+        return IPCCacheEngineKey(
+            model_name=self.model_name,
+            world_size=self.world_size,
+            worker_id=self.worker_id,
+            chunk_hash=block_hash,
+        )
+
+    def _block_hashes_to_keys(
+        self, block_hashes: list[bytes]
+    ) -> list[IPCCacheEngineKey]:
+        """Convert block hashes to IPC cache engine keys"""
+        s = striding_block_hashes(block_hashes, self.blocks_in_chunk)
+        return [self._create_key(block_hash) for block_hash in s]
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e87dea59d232e024a82ab122b941684546a6c26
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Standard
+import os
+import threading
+from typing import TYPE_CHECKING, Union
+
+import torch
+from lmcache.config import LMCacheEngineConfig as Config
+from lmcache.logging import init_logger
+from lmcache.v1.config import LMCacheEngineConfig as V1Config
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.multimodal.inputs import PlaceholderRange
+    from vllm.v1.core.sched.output import NewRequestData
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+ENGINE_NAME = "vllm-instance"
+
+# Thread-safe singleton storage
+_config_instance: Config | V1Config | None = None
+_config_lock = threading.Lock()
+
+
+def is_false(value: str) -> bool:
+    """Check if the given string value is equivalent to 'false'."""
+    return value.lower() in ("false", "0", "no", "n", "off")
+
+
+def lmcache_get_or_create_config() -> Config | V1Config:
+    """Get the LMCache configuration from the environment variable
+    `LMCACHE_CONFIG_FILE`. If the environment variable is not set, this
+    function will return the default configuration.
+
+    This function is thread-safe and implements singleton pattern,
+    ensuring the configuration is loaded only once.
+    """
+    global _config_instance
+
+    # Double-checked locking for thread-safe singleton
+    if _config_instance is None:
+        with _config_lock:
+            if _config_instance is None:  # Check again within lock
+                if is_false(os.getenv("LMCACHE_USE_EXPERIMENTAL", "True")):
+                    logger.warning(
+                        "Detected LMCACHE_USE_EXPERIMENTAL is set to False. "
+                        "Using legacy configuration is deprecated and will "
+                        "be remove soon! Please set LMCACHE_USE_EXPERIMENTAL "
+                        "to True."
+                    )
+                    LMCacheEngineConfig = Config  # type: ignore[assignment]
+                else:
+                    LMCacheEngineConfig = V1Config  # type: ignore[assignment]
+
+                if "LMCACHE_CONFIG_FILE" not in os.environ:
+                    logger.warning(
+                        "No LMCache configuration file is set. Trying to read"
+                        " configurations from the environment variables."
+                    )
+                    logger.warning(
+                        "You can set the configuration file through "
+                        "the environment variable: LMCACHE_CONFIG_FILE"
+                    )
+                    _config_instance = LMCacheEngineConfig.from_env()
+                else:
+                    config_file = os.environ["LMCACHE_CONFIG_FILE"]
+                    logger.info("Loading LMCache config file %s", config_file)
+                    _config_instance = LMCacheEngineConfig.from_file(config_file)
+                    # Update config from environment variables
+                    _config_instance.update_config_from_env()
+    return _config_instance
+
+
+def hex_hash_to_int16(s: str) -> int:
+    """
+    Convert a hex hash string to a 16-bit integer.
+    """
+    return int(s, 16) & 0xFFFF
+
+
+def apply_mm_hashes_to_token_ids(
+    token_ids: torch.Tensor,
+    mm_hashes: list[str],
+    mm_positions: list["PlaceholderRange"],
+) -> torch.Tensor:
+    """
+    Overwrite token_ids in-place for multimodal placeholders using
+    efficient slice assignments.
+    """
+    n = token_ids.size(0)
+    for hash_str, placeholder in zip(mm_hashes, mm_positions):
+        start, length = placeholder.offset, placeholder.length
+        if start >= n:
+            continue
+        end = min(start + length, n)
+        token_ids[start:end] = hex_hash_to_int16(hash_str)
+    return token_ids
+
+
+def mla_enabled(model_config: "ModelConfig") -> bool:
+    return (
+        hasattr(model_config, "use_mla")
+        and isinstance(model_config.use_mla, bool)
+        and model_config.use_mla
+    )
+
+
+def create_lmcache_metadata(
+    vllm_config=None, model_config=None, parallel_config=None, cache_config=None
+):
+    """
+    Create LMCacheEngineMetadata from vLLM configuration.
+
+    This function extracts common metadata creation logic that was duplicated
+    across multiple files.
+
+    Args:
+        vllm_config (VllmConfig): vLLM configuration object containing model,
+                                  parallel, and cache configs (alternative to
+                                  individual config parameters)
+        model_config (ModelConfig): Model configuration (alternative to
+                                    vllm_config)
+        parallel_config (ParallelConfig): Parallel configuration (alternative
+                                          to vllm_config)
+        cache_config (CacheConfig): Cache configuration (alternative to
+                                    vllm_config)
+    """
+    # Third Party
+    # First Party
+    from lmcache.config import LMCacheEngineMetadata
+
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype
+
+    config = lmcache_get_or_create_config()
+    # Support both vllm_config object and individual config parameters
+    if vllm_config is not None:
+        model_cfg = vllm_config.model_config
+        parallel_cfg = vllm_config.parallel_config
+        cache_cfg = vllm_config.cache_config
+    else:
+        if model_config is None or parallel_config is None or cache_config is None:
+            raise ValueError(
+                "Either vllm_config must be provided, or all of "
+                "model_config, parallel_config, and cache_config must be provided."
+            )
+        model_cfg = model_config
+        parallel_cfg = parallel_config
+        cache_cfg = cache_config
+
+    # Get KV cache dtype
+    kv_dtype = get_kv_cache_torch_dtype(cache_cfg.cache_dtype, model_cfg.dtype)
+
+    # Check if MLA is enabled
+    use_mla = mla_enabled(model_cfg)
+
+    # Construct KV shape (for memory pool)
+    num_layer = model_cfg.get_num_layers(parallel_cfg)
+    chunk_size = config.chunk_size
+    num_kv_head = model_cfg.get_num_kv_heads(parallel_cfg)
+    head_size = model_cfg.get_head_size()
+    kv_shape = (num_layer, 1 if use_mla else 2, chunk_size, num_kv_head, head_size)
+
+    # Create metadata
+    metadata = LMCacheEngineMetadata(
+        model_cfg.model,
+        parallel_cfg.world_size,
+        parallel_cfg.rank,
+        "vllm",
+        kv_dtype,
+        kv_shape,
+        use_mla,
+    )
+
+    return metadata, config
+
+
+def extract_mm_features(
+    request: Union["Request", "NewRequestData"], modify: bool = False
+) -> tuple[list[str], list["PlaceholderRange"]]:
+    """
+    Normalize multimodal information from a Request into parallel lists.
+
+    This helper reads either:
+      1) `request.mm_features` (objects each exposing `.identifier` and
+      `.mm_position`), or
+      2) legacy fields `request.mm_hashes` and `request.mm_positions`.
+
+    It returns two equally sized lists: the multimodal hash identifiers and
+    their corresponding positions. If the request contains no multimodal info,
+    it returns `([], [])`.
+
+    Args:
+        request (Request): The source object.
+        modify (bool):
+            Controls copy semantics for the legacy-path return values.
+            - If True and legacy fields are used, shallow-copies are returned so
+              the caller can mutate the lists without affecting `request`.
+            - If False, the original legacy sequences are returned as-is
+              (zero-copy); treat them as read-only.
+
+    Returns:
+        tuple[list[str], list[PlaceholderRange]]: (`mm_hashes`, `mm_positions`).
+        May be `([], [])` when no multimodal data is present.
+    """
+    if getattr(request, "mm_features", None):
+        mm_hashes, mm_positions = zip(
+            *((f.identifier, f.mm_position) for f in request.mm_features)
+        )
+        return (list(mm_hashes), list(mm_positions))
+    elif getattr(request, "mm_hashes", None):
+        if modify:
+            return (
+                request.mm_hashes.copy(),  # type: ignore
+                request.mm_positions.copy(),  # type: ignore
+            )
+        else:
+            return (request.mm_hashes, request.mm_positions)  # type: ignore
+    else:
+        return ([], [])
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..94572b02fa8723693ec4c8353497705f813944a6
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -0,0 +1,1411 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Standard
+import os
+import uuid
+from collections.abc import Generator
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from lmcache import utils
+from lmcache.config import LMCacheEngineMetadata
+from lmcache.logging import init_logger
+from lmcache.observability import LMCStatsMonitor
+from lmcache.utils import _lmcache_nvtx_annotate
+from lmcache.v1.cache_engine import LMCacheEngine, LMCacheEngineBuilder
+from lmcache.v1.compute.blend import LMCBlenderBuilder
+from lmcache.v1.config import LMCacheEngineConfig, _validate_and_set_config_value
+from lmcache.v1.gpu_connector import (
+    VLLMBufferLayerwiseGPUConnector,
+    VLLMPagedMemGPUConnectorV2,
+    VLLMPagedMemLayerwiseGPUConnector,
+)
+from lmcache.v1.internal_api_server.api_server import InternalAPIServer
+from lmcache.v1.lookup_client import LookupClientFactory
+from lmcache.v1.lookup_client.lmcache_async_lookup_client import (
+    LMCacheAsyncLookupServer,
+)
+from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer
+from lmcache.v1.plugin.plugin_launcher import PluginLauncher
+
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+    ENGINE_NAME,
+    apply_mm_hashes_to_token_ids,
+    extract_mm_features,
+    lmcache_get_or_create_config,
+    mla_enabled,
+)
+from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
+from vllm.sampling_params import SamplingParams
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import get_kv_cache_torch_dtype
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.version import __version__ as VLLM_VERSION
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.forward_context import ForwardContext
+    from vllm.multimodal.inputs import PlaceholderRange
+    from vllm.v1.core.kv_cache_manager import KVCacheManager
+    from vllm.v1.core.sched.output import NewRequestData
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class LoadSpec:
+    # Number of tokens cached in vLLM
+    vllm_cached_tokens: int
+    # Number of tokens that are cached in LMCache
+    lmcache_cached_tokens: int
+    # Whether the scheduler allow us to load the tokens
+    can_load: bool
+
+
+@dataclass
+class SaveSpec:
+    # Skip already saved tokens
+    skip_leading_tokens: int
+    # Whether the scheduler allow us to save the tokens
+    can_save: bool
+
+
+@dataclass
+class DisaggSpec:
+    req_id: str
+    receiver_id: str
+    receiver_host: str
+    receiver_init_port: int
+    receiver_alloc_port: int
+    is_last_prefill: bool = False
+    num_transferred_tokens: int = 0
+
+
+tmp_disagg_tracker: dict[str, DisaggSpec] = {}
+
+
+def extract_request_configs(sampling_params: SamplingParams) -> dict | None:
+    request_configs = None
+    if (
+        sampling_params.extra_args is not None
+        and "kv_transfer_params" in sampling_params.extra_args
+    ):
+        kv_transfer_params = sampling_params.extra_args.get("kv_transfer_params")
+        if kv_transfer_params is None:
+            return None
+        assert isinstance(kv_transfer_params, dict)
+        for k, v in kv_transfer_params.items():
+            if k.startswith("lmcache."):
+                if request_configs is None:
+                    request_configs = {}
+                request_configs[k] = v
+    return request_configs
+
+
+@dataclass
+class RequestTracker:
+    # Request id
+    req_id: str
+
+    # Total prompt token length
+    prompt_len: int
+
+    # The token ids that has been scheduled so far
+    token_ids: list[int]
+
+    # The block ids that has been allocated so far
+    # NOTE: allocated blocks could be more than the number of tokens
+    allocated_block_ids: list[int]
+
+    # The number of tokens that has been saved
+    num_saved_tokens: int = 0
+
+    # Disagg spec for the request
+    disagg_spec: DisaggSpec | None = None
+
+    # Multimodal hashes and positions
+    mm_hashes: list[str] | None = None
+    mm_positions: list["PlaceholderRange"] | None = None
+
+    # The configs of the request, includes tags and other configs
+    request_configs: dict | None = None
+
+    # Whether the request is in decode phase
+    is_decode_phase = False
+
+    # Whether the request cache should be saved
+    skip_save: bool = False
+
+    @_lmcache_nvtx_annotate
+    @staticmethod
+    def from_new_request(
+        lmcache_config: LMCacheEngineConfig,
+        new_request: "NewRequestData",
+        num_tokens_to_compute: int,
+        lmcache_cached_tokens: int,
+        skip_save: bool,
+    ) -> "RequestTracker":
+        """Create the request tracker from a new request.
+
+        Args:
+            lmcache_config (LMCacheEngineConfig): the LMCache engine config.
+            new_request (NewRequestData): the new request data.
+            num_tokens_to_compute (int): the number of tokens that will
+                be 'computed', including the `num_computed_tokens` (vLLM's
+                local cache hit) and new tokens that will be scheduled.
+            lmcache_cached_tokens (int): the number of tokens that are
+                cached in LMCache.
+            skip_save (bool): whether the request cache should be saved
+        """
+        # vLLM 0.9.0 update: request.block_ids changed from list[int] to
+        # list[list[int]]
+        # Need to check the type of request.block_ids
+
+        unfolded_block_ids = []
+
+        if not isinstance(new_request.block_ids[0], list):
+            unfolded_block_ids = new_request.block_ids.copy()
+        else:
+            # According to the vLLM code
+            # (https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/
+            # sched/scheduler.py#L943),
+            # only one KVCacheGroup is supported in connector for now.
+            unfolded_block_ids = new_request.block_ids[0].copy()
+
+        # NOTE: Initialized in `update_state_after_alloc`
+        disagg_spec = tmp_disagg_tracker.pop(new_request.req_id, None)
+
+        if new_request.sampling_params:
+            request_configs = extract_request_configs(new_request.sampling_params)
+        else:
+            request_configs = None
+
+        mm_hashes, mm_positions = extract_mm_features(new_request, modify=True)
+
+        assert new_request.prompt_token_ids is not None
+        return RequestTracker(
+            req_id=new_request.req_id,
+            prompt_len=len(new_request.prompt_token_ids),
+            token_ids=new_request.prompt_token_ids[:num_tokens_to_compute].copy(),
+            allocated_block_ids=unfolded_block_ids,
+            num_saved_tokens=lmcache_cached_tokens,
+            disagg_spec=disagg_spec,
+            mm_hashes=mm_hashes,
+            mm_positions=mm_positions,
+            skip_save=skip_save,
+            request_configs=request_configs,
+        )
+
+    def update(
+        self,
+        new_token_ids: list[int],
+        new_block_ids: tuple[list[int], ...] | None | list[int],
+    ) -> None:
+        """Update the request tracker when a running request is
+        scheduled again
+        """
+
+        self.token_ids.extend(new_token_ids)
+
+        if new_block_ids is None:
+            # https://github.com/vllm-project/vllm/commit/
+            # b029de9902aa3ac58806c8c17776c7074175b6db
+            new_block_ids = []
+        elif len(new_block_ids) == 0:
+            new_block_ids = []
+        elif isinstance(new_block_ids, tuple):
+            new_block_ids = new_block_ids[0]
+        elif isinstance(new_block_ids, list):
+            pass
+        else:
+            raise ValueError(f"Unsupported new_block_ids type {type(new_block_ids)}")
+        self.allocated_block_ids.extend(new_block_ids)
+
+        # When a request is scheduled again, and the number of new tokens
+        # is 1 (excluding chunked prefill), the request is in decode phase.
+        if len(new_token_ids) == 1:
+            self.is_decode_phase = True
+
+
+@dataclass
+class ReqMeta:
+    # Request id
+    req_id: str
+    # Request tokens
+    token_ids: list[int]  # torch.Tensor
+    # Slot mapping
+    slot_mapping: torch.Tensor
+
+    # Whether is last prefill or not
+    is_last_prefill: bool = False
+
+    # Skip save or not
+    save_spec: SaveSpec | None = None
+    # load_spec
+    load_spec: LoadSpec | None = None
+    # disagg spec
+    disagg_spec: DisaggSpec | None = None
+    # the configs of the request
+    request_configs: dict | None = None
+
+    @staticmethod
+    def from_request_tracker(
+        tracker: RequestTracker,
+        block_size: int,
+        lmcache_chunk_size: int = 256,
+        load_spec: LoadSpec | None = None,
+        discard_partial_chunks: bool = True,
+        save_decode_cache: bool = False,
+    ) -> Optional["ReqMeta"]:
+        """Create the request metadata from a request tracker.
+
+        Args:
+            tracker (RequestTracker): the request tracker.
+            block_size (int): the block size in vLLM.
+            lmcache_chunk_size (int): the chunk size for LMCache.
+            load_spec (Optional[LoadSpec]): the load spec for KV cache loading.
+            discard_partial_chunks (bool): whether to discard partial chunks.
+            save_decode_cache (bool): whether to save the cache in decode phase.
+
+        Returns:
+            the request metadata if we need to perform load/save
+            operations, None otherwise.
+        """
+        input_token_ids = tracker.token_ids
+        input_token_len = len(input_token_ids)
+
+        is_last_prefill = False
+        if input_token_len == tracker.prompt_len:
+            is_last_prefill = True
+
+        # For save operation: do not save if the following condition is met
+        # 1. has already been saved before (num_saved_tokens > 0)
+        # 2. number of unsaved tokens is not reached the chunk boundary
+        # 3. if save_decode_cache is False and it is in decode phase
+
+        skip_leading_tokens = tracker.num_saved_tokens
+        chunk_boundary = (
+            cdiv(tracker.num_saved_tokens + 1, lmcache_chunk_size) * lmcache_chunk_size
+        )
+
+        # NOTE(vladnosiv): for disagg, you cannot skip saving, as saving is a
+        # trqansfer. Check if request_configs has lmcache.skip_save set to True
+        request_skip = (tracker.request_configs or {}).get("lmcache.skip_save", False)
+
+        skip_save = tracker.disagg_spec is None and (
+            tracker.skip_save
+            or (tracker.num_saved_tokens > 0 and input_token_len < chunk_boundary)
+            or (tracker.is_decode_phase and not save_decode_cache)
+            or request_skip
+        )
+
+        if skip_save and load_spec is None:
+            return None
+
+        # Calculate number of tokens to save based on discard_partial_chunks
+        # setting
+
+        # NOTE(vladnosiv): for the input_token_len chunk prefill,
+        # we are required to discard partial chunks,
+        # as new tokens will be added in the next iteration.
+        num_tokens_to_save = (
+            (input_token_len // lmcache_chunk_size * lmcache_chunk_size)
+            if not is_last_prefill or discard_partial_chunks
+            else input_token_len
+        )
+
+        # If we need to save, update the number of saved tokens
+        if not skip_save:
+            tracker.num_saved_tokens = num_tokens_to_save
+        save_spec = SaveSpec(skip_leading_tokens, not skip_save)
+
+        # Calculate the token ids and slot mappings for load and save
+        token_ids = input_token_ids[:num_tokens_to_save]
+
+        # If the request has multimodal hashes, apply them to the token ids
+        if tracker.mm_hashes:
+            token_ids_tensor = torch.tensor(token_ids)
+            assert tracker.mm_positions is not None, (
+                "tracker got mm_hashes but no mm_positions"
+            )
+            apply_mm_hashes_to_token_ids(
+                token_ids_tensor, tracker.mm_hashes, tracker.mm_positions
+            )
+            token_ids = token_ids_tensor.tolist()
+
+        num_blocks = len(tracker.allocated_block_ids)
+
+        if len(token_ids) > num_blocks * block_size:
+            logger.error(
+                "The number of tokens is more than the number of blocks."
+                "Something might be wrong in scheduling logic!"
+            )
+            logger.error(
+                "Num tokens: %d, num blocks: %d, block size: %d",
+                len(token_ids),
+                num_blocks,
+                block_size,
+            )
+
+        block_ids = torch.tensor(tracker.allocated_block_ids, dtype=torch.long)
+        block_offsets = torch.arange(0, block_size, dtype=torch.long)
+        slot_mapping = (
+            block_offsets.reshape((1, block_size))
+            + block_ids.reshape((num_blocks, 1)) * block_size
+        )
+
+        slot_mapping = slot_mapping.flatten()[: len(token_ids)]
+        assert slot_mapping.dtype == torch.long
+
+        # For load operation: check whether the request is scheduled to load
+        if load_spec is not None and load_spec.can_load:
+            logger.debug(
+                "Scheduled to load %d tokens for request %s",
+                load_spec.lmcache_cached_tokens,
+                tracker.req_id,
+            )
+        else:
+            # Do not load if not in `can_load` state
+            load_spec = None
+
+        return ReqMeta(
+            req_id=tracker.req_id,
+            token_ids=token_ids,
+            slot_mapping=slot_mapping,
+            is_last_prefill=is_last_prefill,
+            save_spec=save_spec,
+            load_spec=load_spec,
+            disagg_spec=tracker.disagg_spec,
+            request_configs=tracker.request_configs,
+        )
+
+
+def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig):
+    return not lmcache_config.enable_pd
+
+
+def _calculate_mtp_layers(vllm_config, model_config):
+    num_mtp_layers = 0
+    if vllm_config is not None and vllm_config.speculative_config is not None:
+        logger.info(
+            "vllm_config.speculative_config: %s", vllm_config.speculative_config
+        )
+        # TODO(baoloongmao): Support other MTP methods
+        if vllm_config.speculative_config.method == "deepseek_mtp":
+            num_mtp_layers = getattr(
+                model_config.hf_config, "num_nextn_predict_layers", 0
+            )
+
+        elif vllm_config.speculative_config.use_eagle():
+            try:
+                draft_model_config = vllm_config.speculative_config.draft_model_config
+                num_mtp_layers = draft_model_config.get_num_layers(
+                    vllm_config.parallel_config
+                )
+                logger.info("EAGLE detected %d extra layer(s)", num_mtp_layers)
+            except Exception:
+                logger.info(
+                    "EAGLE detected, but failed to get the number of extra layers"
+                    "falling back to 1"
+                )
+                num_mtp_layers = 1
+    return num_mtp_layers
+
+
+def _init_lmcache_engine(
+    lmcache_config: LMCacheEngineConfig,
+    vllm_config: "VllmConfig",
+) -> LMCacheEngine:
+    """Initialize the LMCache engine by the given model config and parallel
+    config. This function will check the environment variable
+    `LMCACHE_CONFIG_FILE` to load the configuration file. If that environment
+    variable is not set, this function will return None.
+
+    :param lmcache_config: The LMCache configuration.
+    :type lmcache_config: LMCacheEngineConfig
+    :param vllm_config: The vLLM configuration.
+    :type vllm_config: VllmConfig
+
+    :return: The initialized LMCache engine
+    :rtype: LMCacheEngine
+    """
+    if curr_engine := LMCacheEngineBuilder.get(ENGINE_NAME):
+        return curr_engine
+
+    model_config = vllm_config.model_config
+    parallel_config = vllm_config.parallel_config
+    cache_config = vllm_config.cache_config
+
+    assert isinstance(lmcache_config, LMCacheEngineConfig), (
+        "LMCache v1 configuration is should be passed."
+    )
+
+    kv_dtype = get_kv_cache_torch_dtype(cache_config.cache_dtype, model_config.dtype)
+
+    use_mla = mla_enabled(model_config)
+    if use_mla and (
+        lmcache_config.remote_serde != "naive"
+        and lmcache_config.remote_serde is not None
+    ):
+        raise ValueError("MLA only works with naive serde mode..")
+
+    # construct kv shape (for mem pool)
+    num_layer = model_config.get_num_layers(parallel_config)
+    num_mtp_layers = _calculate_mtp_layers(vllm_config, model_config)
+    num_layer += num_mtp_layers
+    chunk_size = lmcache_config.chunk_size
+    num_kv_head = model_config.get_num_kv_heads(parallel_config)
+    head_size = model_config.get_head_size()
+    kv_shape = (num_layer, 1 if use_mla else 2, chunk_size, num_kv_head, head_size)
+    logger.info(
+        "use mla: %s, kv shape: %s, num_mtp_layers: %s",
+        use_mla,
+        kv_shape,
+        num_mtp_layers,
+    )
+
+    # Change current device.
+    num_gpus = torch.cuda.device_count()
+    local_rank = parallel_config.rank % num_gpus
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    metadata = LMCacheEngineMetadata(
+        model_config.model,
+        parallel_config.world_size,
+        parallel_config.rank,
+        "vllm",
+        kv_dtype,
+        kv_shape,
+        use_mla,
+    )
+
+    use_gpu = need_gpu_interm_buffer(lmcache_config)
+    vllm_gpu_connector: (
+        VLLMBufferLayerwiseGPUConnector
+        | VLLMPagedMemGPUConnectorV2
+        | VLLMPagedMemLayerwiseGPUConnector
+    )
+
+    if use_mla and lmcache_config.use_layerwise:
+        raise ValueError("layerwise MLA connector is not supported yet")
+
+    # When use_mla is True, num_kv_head is 1
+    hidden_dim_size = num_kv_head * head_size
+    if lmcache_config.use_layerwise:
+        if lmcache_config.enable_blending:
+            # Use layerwise connector for blending
+            vllm_gpu_connector = VLLMBufferLayerwiseGPUConnector(
+                hidden_dim_size,
+                num_layer,
+                use_gpu=use_gpu,
+                chunk_size=chunk_size,
+                dtype=kv_dtype,
+                device=device,
+            )
+        else:
+            vllm_gpu_connector = VLLMPagedMemLayerwiseGPUConnector(
+                hidden_dim_size,
+                num_layer,
+                use_gpu=use_gpu,
+                chunk_size=chunk_size,
+                dtype=kv_dtype,
+                device=device,
+            )
+    else:
+        vllm_gpu_connector = VLLMPagedMemGPUConnectorV2(
+            hidden_dim_size,
+            num_layer,
+            use_gpu=use_gpu,
+            chunk_size=chunk_size,
+            dtype=kv_dtype,
+            device=device,
+            use_mla=use_mla,
+        )
+    tpg = get_tp_group()
+    engine = LMCacheEngineBuilder.get_or_create(
+        ENGINE_NAME,
+        lmcache_config,
+        metadata,
+        vllm_gpu_connector,
+        tpg.broadcast,
+        tpg.broadcast_object,
+    )
+
+    return engine
+
+
+@dataclass
+class LMCacheConnectorMetadata(KVConnectorMetadata):
+    requests: list[ReqMeta] = field(default_factory=list)
+    lookup_requests_in_step: list[str] = field(default_factory=list)
+
+    @_lmcache_nvtx_annotate
+    def add_request(self, req_meta: ReqMeta) -> None:
+        """Add a request to the metadata.
+
+        Args:
+            req_meta (ReqMeta): the request metadata.
+        """
+        self.requests.append(req_meta)
+
+
+class LMCacheConnectorV1Impl:
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        parent: KVConnectorBase_V1,
+    ):
+        assert vllm_config.kv_transfer_config is not None
+        self._parent = parent
+        self._vllm_config = vllm_config
+        self.kv_role = vllm_config.kv_transfer_config.kv_role
+        self.worker_count = vllm_config.parallel_config.tensor_parallel_size
+        config = lmcache_get_or_create_config()
+        assert isinstance(config, LMCacheEngineConfig), (
+            "LMCache v1 configuration is should be passed for vLLM v1."
+        )
+        # Put the leading with "lmcache." and matched configs from
+        # vllm extra_config to the config
+        kv_connector_extra_config = (
+            vllm_config.kv_transfer_config.kv_connector_extra_config
+        )
+        if kv_connector_extra_config:
+            for key, value in kv_connector_extra_config.items():
+                if key.startswith("lmcache."):
+                    config_key = key[8:]  # Remove "lmcache." prefix
+                    if _validate_and_set_config_value(config, config_key, value):
+                        logger.info(
+                            "Updated config %s from vLLM extra config: %s",
+                            config_key,
+                            value,
+                        )
+
+        self.config = config
+
+        self.async_loading = config.enable_async_loading
+        self.layerwise_retrievers: list[Generator[torch.Tensor | None, None, None]] = []
+        self._stats_monitor = LMCStatsMonitor.GetOrCreate()
+        if role == KVConnectorRole.SCHEDULER:
+            # Create lookup client using factory
+            self.lookup_client = LookupClientFactory.create_lookup_client(
+                vllm_config, config
+            )
+            self._unfinished_requests: dict[str, Request] = {}
+            self._lookup_requests_in_step: list[str] = []
+            self.lmcache_engine = None
+        else:
+            self.lmcache_engine = _init_lmcache_engine(
+                config,
+                vllm_config,
+            )
+
+            self.use_layerwise = config.use_layerwise
+            self.enable_blending = config.enable_blending
+
+            if self.enable_blending:
+                self.blender = LMCBlenderBuilder.get_or_create(
+                    ENGINE_NAME,
+                    self.lmcache_engine,
+                    self.lmcache_engine.gpu_connector,
+                    config,
+                )
+
+            # Create lookup server using factory
+            assert self.lmcache_engine is not None
+            self.lookup_server = LookupClientFactory.create_lookup_server(
+                self.lmcache_engine, vllm_config
+            )
+
+            self.offload_server = ZMQOffloadServer(
+                self.lmcache_engine,
+                vllm_config,
+                get_tensor_model_parallel_rank(),
+            )
+
+            # In case of MLA, the lookup server is only created on worker 0
+            if self.async_loading and self.lookup_server is not None:
+                assert isinstance(self.lookup_server, LMCacheAsyncLookupServer)
+                self.lmcache_engine.post_init(async_lookup_server=self.lookup_server)
+
+        self.kv_caches: dict[str, torch.Tensor] = {}
+
+        self._block_size = vllm_config.cache_config.block_size
+
+        # request_id -> (vllm cached tokens, lmcache cached tokens)
+        self.load_specs: dict[str, LoadSpec] = {}
+
+        self.kv_cache_manager: KVCacheManager | None = None
+
+        # request_id -> full_token_ids
+        self._request_trackers: dict[str, RequestTracker] = {}
+
+        # Whether to discard partial chunks
+        self._discard_partial_chunks = (
+            vllm_config.kv_transfer_config.get_from_extra_config(
+                "discard_partial_chunks", False
+            )
+            or not config.save_unfull_chunk
+        )
+
+        self._lmcache_chunk_size = config.chunk_size
+        self._save_decode_cache = config.save_decode_cache
+
+        self.skip_last_n_tokens = vllm_config.kv_transfer_config.get_from_extra_config(
+            "skip_last_n_tokens", 0
+        )
+
+        self.num_layers = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config
+        )
+        self.current_layer = 0
+
+        self.force_skip_save = bool(os.environ.get("LMCACHE_FORCE_SKIP_SAVE", False))
+
+        self._requests_priority: dict[str, int] = {}
+
+        # TODO(baoloongmao): Internal api server & plugin framework support
+        # dp > 1
+        if (
+            vllm_config.parallel_config.data_parallel_size_local == 1
+            or vllm_config.parallel_config.data_parallel_rank_local == 0
+        ):
+            # Start internal API server if enabled
+            # The enabled check is in the InternalAPIServer constructor
+            self.api_server = InternalAPIServer(self)
+            self.api_server.start()
+            # Launch plugins
+            self.plugin_launcher = PluginLauncher(
+                self.config,
+                role,
+                self.worker_count,
+                -1
+                if self.lmcache_engine is None  # scheduler side
+                else self.lmcache_engine.metadata.worker_id,
+            )
+            self.plugin_launcher.launch_plugins()
+        else:
+            self.api_server = None  # type: ignore[assignment]
+            self.plugin_launcher = None  # type: ignore[assignment]
+        logger.info(
+            "LMCache initialized for role %s with version %s, "
+            "vllm version %s, lmcache cache_engine metadata: %s",
+            role,
+            utils.get_version(),
+            VLLM_VERSION,
+            getattr(self.lmcache_engine, "metadata", None),
+        )
+
+    def get_inference_info(self) -> dict:
+        """Get inference information including vLLM config and related details.
+
+        Returns:
+            dict: Dictionary containing inference information
+        """
+        # Get vLLM config information
+        vllm_config = self._vllm_config
+
+        # Use vLLM config's string representation and add specific configs
+        inference_info = {
+            "vllm_version": VLLM_VERSION,
+            "lmcache_version": utils.get_version(),
+            "vllm_config": str(vllm_config),
+            "model_config": {
+                "model": getattr(vllm_config.model_config, "model", None),
+                "dtype": str(getattr(vllm_config.model_config, "dtype", None)),
+                "max_model_len": getattr(
+                    vllm_config.model_config, "max_model_len", None
+                ),
+                "vocab_size": vllm_config.model_config.get_vocab_size(),
+                "num_layers": getattr(
+                    vllm_config.model_config, "get_num_layers", lambda _: None
+                )(vllm_config.parallel_config),
+                "num_attention_heads": getattr(
+                    vllm_config.model_config, "get_num_attention_heads", lambda _: None
+                )(vllm_config.parallel_config),
+                "num_kv_heads": getattr(
+                    vllm_config.model_config, "get_num_kv_heads", lambda _: None
+                )(vllm_config.parallel_config),
+                "head_size": getattr(
+                    vllm_config.model_config, "get_head_size", lambda: None
+                )(),
+            },
+            "cache_config": {
+                "block_size": getattr(vllm_config.cache_config, "block_size", None),
+                "cache_dtype": str(
+                    getattr(vllm_config.cache_config, "cache_dtype", None)
+                ),
+                "gpu_memory_utilization": getattr(
+                    vllm_config.cache_config, "gpu_memory_utilization", None
+                ),
+            },
+        }
+
+        return inference_info
+
+    def get_inference_version(self) -> str:
+        """Get vLLM version information.
+
+        Returns:
+            str: vLLM version string
+        """
+        return VLLM_VERSION
+
+    @_lmcache_nvtx_annotate
+    def _init_kv_caches_from_forward_context(self, forward_context: "ForwardContext"):
+        for layer_name in forward_context.no_compile_layers:
+            attn_layer = forward_context.no_compile_layers[layer_name]
+            if not hasattr(attn_layer, "kv_cache"):
+                logger.debug("The layer %s does not have kv_cache, skip it", layer_name)
+                continue
+
+            if layer_name not in self.kv_caches:
+                self.kv_caches[layer_name] = attn_layer.kv_cache[
+                    forward_context.virtual_engine
+                ]
+
+    ####################
+    # Worker side APIs
+    ####################
+
+    @_lmcache_nvtx_annotate
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        """Start loading the KV cache from the connector buffer to vLLM's
+        paged KV buffer.
+
+        Args:
+            forward_context (ForwardContext): the forward context.
+
+        Note:
+            The number of elements in kv_caches and layer_names should be
+            the same.
+        """
+        self.current_layer = 0
+
+        if len(self.kv_caches) == 0:
+            self._init_kv_caches_from_forward_context(forward_context)
+
+        metadata = self._parent._get_connector_metadata()
+        assert isinstance(metadata, LMCacheConnectorMetadata)
+
+        assert len(self.kv_caches) > 0
+        kvcaches = list(self.kv_caches.values())
+
+        attn_metadata = forward_context.attn_metadata
+        if attn_metadata is None:
+            logger.debug("In connector.start_load_kv, but the attn_metadata is None")
+            return
+
+        assert self.lmcache_engine is not None
+
+        self.lmcache_engine.post_init(kvcaches=kvcaches)
+
+        self.layerwise_retrievers = []
+
+        for idx, request in enumerate(metadata.requests):
+            if request.load_spec is None:
+                continue
+            last_idx = idx
+
+        for idx, request in enumerate(metadata.requests):
+            if request.load_spec is None:
+                continue
+
+            tokens = request.token_ids
+            # TODO: have a pre-allocated buffer to hold the slot_mappings
+            slot_mapping = request.slot_mapping.cuda()
+            assert len(tokens) == len(slot_mapping)
+
+            self._stats_monitor.update_interval_vllm_hit_tokens(
+                request.load_spec.vllm_cached_tokens
+            )
+            token_mask = torch.ones(len(tokens), dtype=torch.bool)
+            masked_token_count = (
+                request.load_spec.vllm_cached_tokens
+                // self._lmcache_chunk_size
+                * self._lmcache_chunk_size
+            )
+            token_mask[:masked_token_count] = False
+
+            lmcache_cached_tokens = request.load_spec.lmcache_cached_tokens
+            if self.use_layerwise:
+                sync = idx == last_idx
+                # NOTE(Jiayi): Perform blending before layerwise prefix caching
+                if self.enable_blending:
+                    # TODO(Jiayi): Need to make prefix caching and blending
+                    # compatible
+                    self.blender.blend(
+                        tokens[:lmcache_cached_tokens],
+                        token_mask[:lmcache_cached_tokens],
+                        kvcaches=kvcaches,
+                        slot_mapping=slot_mapping[:lmcache_cached_tokens],
+                    )
+                else:
+                    layerwise_retriever = self.lmcache_engine.retrieve_layer(
+                        tokens[:lmcache_cached_tokens],
+                        token_mask[:lmcache_cached_tokens],
+                        kvcaches=kvcaches,
+                        slot_mapping=slot_mapping[:lmcache_cached_tokens],
+                        sync=sync,
+                    )
+                    # NOTE: retrieve for two layers at the first layer
+                    next(layerwise_retriever)
+                    next(layerwise_retriever)
+                    self.layerwise_retrievers.append(layerwise_retriever)
+            else:
+                ret_token_mask = self.lmcache_engine.retrieve(
+                    tokens[:lmcache_cached_tokens],
+                    token_mask[:lmcache_cached_tokens],
+                    kvcaches=kvcaches,
+                    slot_mapping=slot_mapping[:lmcache_cached_tokens],
+                    request_configs=request.request_configs,
+                    req_id=request.req_id,
+                )
+
+                # Check the result
+                num_retrieved_tokens = ret_token_mask.sum().item()
+                num_expected_tokens = (
+                    lmcache_cached_tokens - request.load_spec.vllm_cached_tokens
+                )
+                if num_retrieved_tokens < num_expected_tokens:
+                    logger.error(
+                        "The number of retrieved tokens is less than the "
+                        "expected number of tokens! This should not happen!"
+                    )
+                    logger.error(
+                        "Num retrieved tokens: %d, num expected tokens: %d",
+                        num_retrieved_tokens,
+                        num_expected_tokens,
+                    )
+
+    @_lmcache_nvtx_annotate
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        """Blocking until the KV for a specific layer is loaded into vLLM's
+        paged buffer.
+
+        This interface will be useful for layer-by-layer pipelining.
+
+        Args:
+            layer_name: the name of that layer
+        """
+        if self.layerwise_retrievers:
+            logger.debug("Waiting for layer %s to be loaded", self.current_layer)
+
+        # Wait for the layer to be loaded
+        for layerwise_retriever in self.layerwise_retrievers:
+            ret_token_mask = next(layerwise_retriever)
+
+            if self.current_layer == self.num_layers - 1:
+                assert ret_token_mask is not None
+                num_retrieved_tokens = ret_token_mask.sum().item()
+                logger.info("Retrieved %s tokens", num_retrieved_tokens)
+
+        return
+
+    @_lmcache_nvtx_annotate
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
+        """Start saving the a layer of KV cache from vLLM's paged buffer
+        to the connector.
+
+        Args:
+            layer_name (str): the name of the layer.
+            kv_layer (torch.Tensor): the paged KV buffer of the current
+                layer in vLLM.
+            attn_metadata (AttentionMetadata): the attention metadata.
+        """
+        assert self.lmcache_engine is not None
+
+        if not self.use_layerwise:
+            return
+
+        if self.kv_role == "kv_consumer":
+            # Don't do save if the role is kv_consumer
+            return
+        if self._parent._connector_metadata is None:
+            logger.warning(
+                "In connector.save_kv_layer, but the connector metadata is None"
+            )
+            return
+        connector_metadata = self._parent._get_connector_metadata()
+        assert isinstance(connector_metadata, LMCacheConnectorMetadata)
+
+        assert len(self.kv_caches) > 0
+
+        kvcaches = list(self.kv_caches.values())
+        if self.current_layer == 0:
+            self.layerwise_storers = []
+
+            is_first = True
+
+            for idx, request in enumerate(connector_metadata.requests):
+                save_spec = request.save_spec
+                if save_spec is None or not save_spec.can_save:
+                    continue
+
+                token_ids = request.token_ids
+                assert isinstance(token_ids, list)
+
+                slot_mapping = request.slot_mapping
+                assert isinstance(slot_mapping, torch.Tensor)
+                assert len(slot_mapping) == len(token_ids)
+
+                # TODO: have a pre-allocated buffer to hold the slot_mappings
+                slot_mapping = slot_mapping.cuda()
+
+                if self.kv_role == "kv_producer":
+                    skip_leading_tokens = 0
+                else:
+                    skip_leading_tokens = save_spec.skip_leading_tokens
+
+                    if skip_leading_tokens == len(token_ids):
+                        continue  # skip this request
+                    # Align to lmcache chunk size
+                    skip_leading_tokens = (
+                        skip_leading_tokens
+                        // self._lmcache_chunk_size
+                        * self._lmcache_chunk_size
+                    )
+
+                store_mask = torch.ones(len(token_ids), dtype=torch.bool)
+                store_mask[:skip_leading_tokens] = False
+
+                logger.info(
+                    "Storing KV cache for %d out of %d tokens "
+                    "(skip_leading_tokens=%d) for request %s",
+                    len(token_ids) - skip_leading_tokens,
+                    len(token_ids),
+                    skip_leading_tokens,
+                    request.req_id,
+                )
+
+                # TODO (Jiayi): need to make layerwise storing
+                # compatible with disagg spec
+                layerwise_storer = self.lmcache_engine.store_layer(
+                    token_ids,
+                    mask=store_mask,
+                    kvcaches=kvcaches,
+                    slot_mapping=slot_mapping,
+                    offset=skip_leading_tokens,
+                    sync=is_first,
+                )
+                self.layerwise_storers.append(layerwise_storer)
+                if is_first:
+                    is_first = False
+
+        for layerwise_storer in self.layerwise_storers:
+            next(layerwise_storer)
+
+        self.current_layer += 1
+
+    @_lmcache_nvtx_annotate
+    def wait_for_save(self):
+        """Blocking until the KV cache is saved to the connector buffer."""
+
+        connector_metadata = self._parent._get_connector_metadata()
+        assert isinstance(connector_metadata, LMCacheConnectorMetadata)
+
+        self.lmcache_engine.lookup_unpin(  # type: ignore
+            connector_metadata.lookup_requests_in_step
+        )
+
+        if self.kv_role == "kv_consumer":
+            # Don't do save if the role is kv_consumer
+            return
+
+        if self.use_layerwise:
+            for layerwise_storer in self.layerwise_storers:
+                next(layerwise_storer)
+            return
+
+        assert len(self.kv_caches) > 0
+        kvcaches = list(self.kv_caches.values())
+
+        assert self.lmcache_engine is not None
+
+        for request in connector_metadata.requests:
+            save_spec = request.save_spec
+            if (
+                save_spec is None or not save_spec.can_save
+            ) and self.kv_role != "kv_producer":
+                continue
+
+            token_ids = request.token_ids
+
+            slot_mapping = request.slot_mapping
+            assert isinstance(slot_mapping, torch.Tensor)
+            assert len(slot_mapping) == len(token_ids)
+            assert save_spec is not None
+
+            # TODO: have a pre-allocated buffer to hold the slot_mappings
+            slot_mapping = slot_mapping.cuda()
+
+            skip_leading_tokens = save_spec.skip_leading_tokens
+            if self.kv_role == "kv_producer":
+                assert request.disagg_spec is not None
+                skip_leading_tokens = min(
+                    skip_leading_tokens, request.disagg_spec.num_transferred_tokens
+                )
+
+            if skip_leading_tokens == len(token_ids):
+                continue  # skip this request
+            # Align to lmcache chunk size
+            skip_leading_tokens = (
+                skip_leading_tokens
+                // self._lmcache_chunk_size
+                * self._lmcache_chunk_size
+            )
+
+            store_mask = torch.ones(len(token_ids), dtype=torch.bool)
+            store_mask[:skip_leading_tokens] = False
+
+            logger.info(
+                "Storing KV cache for %d out of %d tokens "
+                "(skip_leading_tokens=%d) for request %s",
+                len(token_ids) - skip_leading_tokens,
+                len(token_ids),
+                skip_leading_tokens,
+                request.req_id,
+            )
+
+            is_last_prefill = request.is_last_prefill
+            if is_last_prefill:
+                if request.disagg_spec:
+                    request.disagg_spec.is_last_prefill = True
+            else:
+                token_len = len(token_ids)
+                aligned_token_len = (
+                    token_len // self._lmcache_chunk_size * self._lmcache_chunk_size
+                )
+                token_ids = token_ids[:aligned_token_len]
+                store_mask = store_mask[:aligned_token_len]
+                slot_mapping = slot_mapping[:aligned_token_len]
+
+            self.lmcache_engine.store(
+                token_ids,
+                mask=store_mask,
+                kvcaches=kvcaches,
+                slot_mapping=slot_mapping,
+                offset=skip_leading_tokens,
+                transfer_spec=request.disagg_spec,
+                request_configs=request.request_configs,
+            )
+
+            # NOTE(Jiayi): We assume all tokens are saved
+            save_spec.skip_leading_tokens = len(token_ids)
+            if request.disagg_spec:
+                request.disagg_spec.num_transferred_tokens = len(token_ids)
+
+    @_lmcache_nvtx_annotate
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        return None, None
+
+    ###################
+    # Scheduler side APIs
+    ####################
+
+    @_lmcache_nvtx_annotate
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> int | None:
+        """
+        Check for external KV cache hit.
+
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+
+        Returns:
+            the number of tokens that can be loaded from the
+            external KV cache beyond what is already computed.
+        """
+        if self.kv_role == "kv_producer" and not hasattr(
+            self.lookup_client, "supports_producer_reuse"
+        ):
+            return 0
+
+        self._requests_priority[request.request_id] = request.priority
+
+        token_ids = request.prompt_token_ids
+
+        # If the request has multimodal hashes, apply them to the token ids
+        mm_hashes, mm_positions = extract_mm_features(request)
+        if mm_hashes and mm_positions:
+            # TODO(Jiayi): Optimize this
+            token_ids_tensor = torch.tensor(request.prompt_token_ids)
+            apply_mm_hashes_to_token_ids(token_ids_tensor, mm_hashes, mm_positions)
+            token_ids = token_ids_tensor.tolist()
+
+        if request.sampling_params:
+            request_configs = extract_request_configs(request.sampling_params)
+        else:
+            request_configs = None
+
+        if self.skip_last_n_tokens > 0:
+            assert token_ids is not None
+            token_ids = token_ids[: -self.skip_last_n_tokens]
+        lookup_id = request.request_id if self.async_loading else str(uuid.uuid4())
+
+        self._lookup_requests_in_step.append(lookup_id)
+
+        num_external_hit_tokens = self.lookup_client.lookup(
+            token_ids,
+            lookup_id=lookup_id,
+            request_configs=request_configs,
+        )
+
+        if num_external_hit_tokens is None:
+            logger.info(
+                "Reqid: %s, Total tokens %d, LMCache hit tokens: None.",
+                request.request_id,
+                request.num_tokens,
+            )
+            return None
+
+        # When prompt length is divisible by the block size and all
+        # blocks are cached, we need to recompute the last token.
+        # This will be removed in the future if vLLM's scheduler provides
+        # a better support for this case.
+        need_to_allocate = num_external_hit_tokens - num_computed_tokens
+
+        # In, full-prompt-hit case, we need to recompute the last token
+        if num_external_hit_tokens == request.num_tokens:
+            need_to_allocate -= 1
+
+        logger.info(
+            "Reqid: %s, Total tokens %d, LMCache hit tokens: %d, need to load: %d",
+            request.request_id,
+            request.num_tokens,
+            num_external_hit_tokens,
+            need_to_allocate,
+        )
+
+        self.load_specs[request.request_id] = LoadSpec(
+            vllm_cached_tokens=num_computed_tokens,
+            lmcache_cached_tokens=num_external_hit_tokens,
+            can_load=False,
+        )
+
+        if need_to_allocate <= 0:
+            return 0
+
+        return need_to_allocate
+
+    @_lmcache_nvtx_annotate
+    def update_state_after_alloc(self, request: "Request", num_external_tokens: int):
+        """
+        Update KVConnector state after temporary buffer alloc.
+
+        For SharedStorageConnector, update _request_needs_load
+        if the CacheManager this allocated blocks for us.
+        """
+
+        # Clear local status in lookup client when a new request is
+        # successfully scheduled.
+        self.lookup_client.clear_lookup_status(request.request_id)
+
+        kv_transfer_params = (
+            request.kv_transfer_params
+            if hasattr(request, "kv_transfer_params")
+            else None
+        )
+
+        if kv_transfer_params is not None and "disagg_spec" in kv_transfer_params:
+            req_disagg_spec = kv_transfer_params["disagg_spec"]
+
+            receiver_id = req_disagg_spec["receiver_host"] + str(
+                req_disagg_spec["receiver_init_port"]
+            )
+
+            disagg_spec = DisaggSpec(
+                req_id=req_disagg_spec["req_id"],
+                receiver_id=receiver_id,
+                receiver_host=req_disagg_spec["receiver_host"],
+                receiver_init_port=req_disagg_spec["receiver_init_port"],
+                receiver_alloc_port=req_disagg_spec["receiver_alloc_port"],
+            )
+
+            tmp_disagg_tracker[request.request_id] = disagg_spec
+        self._unfinished_requests[request.request_id] = request
+
+        if request.request_id not in self.load_specs:
+            # No KV tokens from external KV cache, return
+            return
+
+        if num_external_tokens == 0:
+            # No need to load anything
+            self.load_specs[request.request_id].can_load = False
+            return
+
+        # Only check for non-prompt-hit case
+        if (
+            self.load_specs[request.request_id].lmcache_cached_tokens
+            != request.num_tokens
+        ):
+            assert (
+                num_external_tokens > 0
+                and num_external_tokens
+                == self.load_specs[request.request_id].lmcache_cached_tokens
+                - self.load_specs[request.request_id].vllm_cached_tokens
+            ), (
+                f"Mismatch in number of tokens: {num_external_tokens} vs "
+                f"{self.load_specs[request.request_id].lmcache_cached_tokens} -"
+                f" {self.load_specs[request.request_id].vllm_cached_tokens}"
+                f" for request {request.request_id}"
+            )
+
+        self.load_specs[request.request_id].can_load = True
+
+    @_lmcache_nvtx_annotate
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
+        """Attach the connector metadata to the request object.
+
+        This function should NOT modify other fields in the scheduler_output
+        except the `kv_connector_metadata` field.
+        Also, calling this function will reset the state of the connector.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+
+        force_skip_save = self.kv_role == "kv_consumer" or self.force_skip_save
+
+        meta = LMCacheConnectorMetadata()
+
+        # set and update lookup requests for unpin
+        meta.lookup_requests_in_step = self._lookup_requests_in_step
+        self._lookup_requests_in_step = []
+
+        for finished_req_id in scheduler_output.finished_req_ids:
+            self._request_trackers.pop(finished_req_id, None)
+            self._unfinished_requests.pop(finished_req_id, None)
+
+        for request in scheduler_output.scheduled_new_reqs:
+            # Right now, we only load KV for new requests
+            load_spec = self.load_specs.pop(request.req_id, None)
+            num_tokens_to_compute = (
+                request.num_computed_tokens
+                + scheduler_output.num_scheduled_tokens[request.req_id]
+            )
+            lmcache_cached_tokens = 0
+            if load_spec is not None:
+                lmcache_cached_tokens = load_spec.lmcache_cached_tokens
+            request_priority = self._requests_priority.pop(request.req_id, 0)
+
+            skip_save = force_skip_save or (
+                self.config.priority_limit is not None
+                and request_priority > self.config.priority_limit
+            )
+
+            request_tracker = RequestTracker.from_new_request(
+                self.config,
+                request,
+                num_tokens_to_compute,
+                lmcache_cached_tokens,
+                skip_save,
+            )
+            self._request_trackers[request.req_id] = request_tracker
+
+            req_meta = ReqMeta.from_request_tracker(
+                request_tracker,
+                self._block_size,
+                self._lmcache_chunk_size,
+                load_spec=load_spec,
+                discard_partial_chunks=self._discard_partial_chunks,
+                save_decode_cache=self._save_decode_cache,
+            )
+            if req_meta is not None:
+                meta.add_request(req_meta)
+
+        cached_reqs = scheduler_output.scheduled_cached_reqs
+
+        # NOTE: For backward compatibility with vllm version < 0.9.2,
+        # In the latest vllm version, the type of scheduled_cached_reqs has
+        # changed from list to object `CachedRequestData`
+        if isinstance(cached_reqs, list):
+            for i, req in enumerate(cached_reqs):
+                request_tracker = self._request_trackers[req.req_id]
+                request_tracker.update(req.new_token_ids, req.new_block_ids)
+
+                req_meta = ReqMeta.from_request_tracker(
+                    request_tracker,
+                    self._block_size,
+                    self._lmcache_chunk_size,
+                    load_spec=None,
+                    discard_partial_chunks=self._discard_partial_chunks,
+                )
+                if req_meta is not None:
+                    meta.add_request(req_meta)
+            return meta
+
+        for i, req_id in enumerate(cached_reqs.req_ids):
+            request_tracker = self._request_trackers[req_id]
+            num_new_tokens = scheduler_output.num_scheduled_tokens[req_id]
+            if cached_request := self._unfinished_requests.get(req_id):
+                num_current_tokens = len(request_tracker.token_ids)
+                new_token_ids = cached_request.all_token_ids[
+                    num_current_tokens : num_current_tokens + num_new_tokens
+                ]
+            else:
+                raise ValueError(
+                    f"Request {req_id} is not in _unfinished_requests, "
+                    f"but it is scheduled to be cached"
+                )
+            new_block_ids = cached_reqs.new_block_ids[i]
+
+            request_tracker.update(new_token_ids, new_block_ids)
+
+            req_meta = ReqMeta.from_request_tracker(
+                request_tracker,
+                self._block_size,
+                self._lmcache_chunk_size,
+                load_spec=None,
+                discard_partial_chunks=self._discard_partial_chunks,
+                save_decode_cache=self._save_decode_cache,
+            )
+            if req_meta is not None:
+                meta.add_request(req_meta)
+
+        return meta
+
+    @_lmcache_nvtx_annotate
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        params = (
+            request.kv_transfer_params
+            if hasattr(request, "kv_transfer_params")
+            else None
+        )
+        return_params = None
+
+        # NOTE: Used to stream back the first token
+        # for disagg prefill
+        if params is not None and "ret_first_tok" in params:
+            return_params = {
+                "first_tok": request._output_token_ids[0],
+            }
+
+        return False, return_params
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..55831dc56c803e56181c91973df3b7e602857b36
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -0,0 +1,867 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import enum
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Literal, Optional, cast
+
+import torch
+import zmq
+from lmcache.utils import init_logger as lmcache_init_logger
+
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration import (
+    LMCacheMPSchedulerAdapter,
+    LMCacheMPWorkerAdapter,
+    LoadStoreOp,
+)
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import KVConnectorOutput
+from vllm.v1.utils import ConstantList
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.config import VllmConfig
+    from vllm.distributed.kv_events import KVCacheEvent
+    from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
+        KVConnectorPromMetrics,
+        KVConnectorStats,
+        PromMetric,
+        PromMetricT,
+    )
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.core.kv_cache_utils import BlockHash
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.request import Request
+
+logger = lmcache_init_logger(__name__)
+
+
+# Helper functions
+def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]:
+    if block_ids is None:
+        return []
+    assert isinstance(block_ids, tuple), (
+        f"Expected block_ids to be a tuple of lists, but got {type(block_ids)}"
+    )
+
+    if len(block_ids) > 1:
+        raise RuntimeError(
+            "LMCacheMPConnector only works without hybrid kv cache manager. "
+            "Please pass --disable-hybrid-kv-cache-manager when starting vllm"
+        )
+
+    return block_ids[0]
+
+
+def create_scheduler_adapter(
+    server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig
+) -> LMCacheMPSchedulerAdapter:
+    # TODO: have a helper function to calculate the correct rank and
+    # world size for the MLA and other models
+    return LMCacheMPSchedulerAdapter(
+        server_url,
+        zmq_context,
+        vllm_config.model_config.model,
+        vllm_config.parallel_config.world_size,
+        vllm_config.parallel_config.rank,
+        vllm_config.cache_config.block_size,
+    )
+
+
+def create_worker_adapter(
+    server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig
+) -> LMCacheMPWorkerAdapter:
+    # TODO: have a helper function to calculate the correct rank and
+    # world size for the MLA and other models
+    return LMCacheMPWorkerAdapter(
+        server_url,
+        zmq_context,
+        vllm_config.model_config.model,
+        vllm_config.parallel_config.world_size,
+        vllm_config.parallel_config.rank,
+        vllm_config.cache_config.block_size,
+    )
+
+
+def convert_block_hashes_to_bytes(
+    block_hashes: list["BlockHash"],
+) -> list[bytes]:
+    return cast(list[bytes], block_hashes)
+
+
+class LMCacheMPRequestState(enum.Enum):
+    """
+    State machine:
+    PREFETCHING -- update_state_after_alloc --> WAITING_FOR_LOAD
+    WAITING_FOR_LOAD -- process_loading_requests --> READY
+    """
+
+    PREFETCHING = enum.auto()
+    WAITING_FOR_LOAD = enum.auto()
+    READY = enum.auto()
+
+
+@dataclass
+class LMCacheMPRequestTracker:
+    # NOTE: this class used vLLM data structures, should be part of
+    # vLLM integration code
+
+    request_id: str
+
+    # Read-only lists to track the token ids and block hashes
+    all_token_ids: ConstantList[int]
+    block_hashes: ConstantList["BlockHash"]
+
+    # Block ids and hashes will be updated at update_states_after_alloc and
+    # during the generation
+    allocated_block_ids: list[int] = field(default_factory=list)
+
+    # Number of scheduled tokens in this request. We keep tracking this to
+    # avoid saving half-full blocks.
+    num_scheduled_tokens: int = 0
+
+    # Number of blocks stored will be initialized when lookup the external
+    # hit tokens and will be updated when processing new requests and cached
+    # requests.
+    num_stored_blocks: int = 0
+
+    # Staging load operation -- save vllm and lmcache hit tokens during lookup
+    num_vllm_hit_blocks: int = 0
+    num_lmcache_hit_blocks: int = 0
+
+    # Main state
+    state: LMCacheMPRequestState = LMCacheMPRequestState.PREFETCHING
+
+    def __init__(self, request: "Request"):
+        self.request_id = request.request_id
+        self.all_token_ids = request.all_token_ids
+        self.block_hashes = ConstantList(request.block_hashes)
+        self.allocated_block_ids = []
+        self.num_stored_blocks = 0
+        self.num_vllm_hit_blocks = 0
+        self.num_lmcache_hit_blocks = 0
+        self.state = LMCacheMPRequestState.PREFETCHING
+
+    ####
+    # Check the state of the request
+    ####
+    def needs_retrieve(self) -> bool:
+        """Check whether the current request needs retrieve, will be used
+        update_stage_after_alloc"""
+        return (
+            self.num_lmcache_hit_blocks > self.num_vllm_hit_blocks
+            and self.state != LMCacheMPRequestState.READY
+        )
+
+    def is_ready_for_retrieving(self) -> bool:
+        """Check whether the current request is ready for retrieving,
+        will be used in process_loading_requests"""
+        return (
+            self.state == LMCacheMPRequestState.WAITING_FOR_LOAD
+            and self.needs_retrieve()
+        )
+
+    ####
+    # Update internal states
+    ####
+    def increase_num_scheduled_tokens(self, num_new_tokens: int):
+        self.num_scheduled_tokens += num_new_tokens
+
+    def increase_num_stored_blocks(self, num_new_blocks: int):
+        """Increase the number of stored blocks for the current request
+        This function will be called when processing the cached requests.
+        """
+        self.num_stored_blocks += num_new_blocks
+
+    def update_block_ids(
+        self,
+        new_block_ids: list[int],
+    ):
+        """Update the block ids for the current request
+        This function will be called when processing the cached requests.
+        """
+        self.allocated_block_ids.extend(new_block_ids)
+
+    ####
+    # For debugging
+    ####
+    def __repr__(self) -> str:
+        return (
+            f"LMCacheMPRequestTracker(request_id={self.request_id}, "
+            f"num_tokens={len(self.all_token_ids)}, "
+            f"num_block_hashes={len(self.block_hashes)}, "
+            f"num_allocated_blocks={len(self.allocated_block_ids)}, "
+            f"num_stored_blocks={self.num_stored_blocks}, "
+            f"vllm_hit_blocks={self.num_vllm_hit_blocks}, "
+            f"lmcache_hit_blocks={self.num_lmcache_hit_blocks}, "
+            f"state={self.state})"
+        )
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+
+@dataclass
+class LMCacheMPRequestMetadata:
+    request_id: str
+    direction: Literal["STORE", "RETRIEVE"]
+    op: LoadStoreOp
+
+    @staticmethod
+    def GetStoreMetadata(
+        tracker: LMCacheMPRequestTracker,
+        blocks_in_chunk: int,
+        vllm_block_size: int,
+    ) -> "LMCacheMPRequestMetadata | None":
+        """
+        Generate the store metadata for the current request tracker.
+
+        Args:
+            tracker: The request tracker to generate the metadata from.
+            blocks_in_chunk: the number of blocks in a LMCache data chunk
+        """
+        # Store the blocks that has block hashes
+        # NOTE: the invariant here is that `num_stored_blocks` should
+        # always be a multiple of `blocks_in_chunk`
+        # TODO: This should be checked everytime we update the num_stored_blocks
+        min_available_blocks = min(
+            len(tracker.block_hashes),
+            len(tracker.allocated_block_ids),
+            tracker.num_scheduled_tokens // vllm_block_size,
+        )
+        num_staging_blocks = min_available_blocks - tracker.num_stored_blocks
+        num_chunks = num_staging_blocks // blocks_in_chunk
+
+        if num_chunks >= 1:
+            start = tracker.num_stored_blocks
+            end = start + num_chunks * blocks_in_chunk
+            block_hashes = convert_block_hashes_to_bytes(
+                tracker.block_hashes[start:end]
+            )
+            block_ids = tracker.allocated_block_ids[start:end]
+
+            ret = LMCacheMPRequestMetadata(
+                request_id=tracker.request_id,
+                direction="STORE",
+                op=LoadStoreOp(block_hashes=block_hashes, block_ids=block_ids),
+            )
+
+            # Update the request tracker
+            tracker.increase_num_stored_blocks(end - start)
+            return ret
+
+        return None
+
+    @staticmethod
+    def GetRetrieveMetadata(
+        tracker: LMCacheMPRequestTracker,
+        blocks_in_chunk: int,
+    ) -> "LMCacheMPRequestMetadata | None":
+        """
+        Generate the retrieve metadata for the current request tracker.
+
+        Args:
+            tracker: The request tracker to generate the metadata from.
+            blocks_in_chunk: the number of blocks in a LMCache data chunk
+        """
+        if not tracker.is_ready_for_retrieving():
+            return None
+
+        # |---------------------|-----------------|----------------|
+        # | num_vllm_hit_blocks |
+        # | lmcache chunk 1   | lmcache chunk 2   |
+        #                     |  need to retrieve |
+
+        start = tracker.num_vllm_hit_blocks // blocks_in_chunk * blocks_in_chunk
+        end = tracker.num_lmcache_hit_blocks
+        assert end % blocks_in_chunk == 0, (
+            "The number of LMCache hit blocks should be a multiple of the "
+            "number of blocks in a lmcache chunk. "
+        )
+        assert len(tracker.block_hashes) >= end, (
+            "The number of block hashes should be greater than or equal to the "
+            "number of LMCache hit blocks. "
+        )
+        if end > start:
+            block_hashes = convert_block_hashes_to_bytes(
+                tracker.block_hashes[start:end]
+            )
+            block_ids = tracker.allocated_block_ids[start:end]
+
+            ret = LMCacheMPRequestMetadata(
+                request_id=tracker.request_id,
+                direction="RETRIEVE",
+                op=LoadStoreOp(block_hashes=block_hashes, block_ids=block_ids),
+            )
+            return ret
+
+        return None
+
+
+class LMCacheMPConnectorMetadata(KVConnectorMetadata):
+    def __init__(self):
+        super().__init__()
+        self.requests: list[LMCacheMPRequestMetadata] = []
+
+    def add_request_metadata(self, request_metadata: LMCacheMPRequestMetadata):
+        self.requests.append(request_metadata)
+
+    def __len__(self):
+        return len(self.requests)
+
+    # For debugging
+    def __str__(self):
+        request_strs = []
+        for req_meta in self.requests:
+            request_strs.append(
+                f"RequestMetadata(request_id={req_meta.request_id}, "
+                f"direction={req_meta.direction}, "
+                f"num_blocks={len(req_meta.op)}, "
+                f"block_ids={req_meta.op.block_ids})"
+            )
+        return "[" + "\n".join(request_strs) + "]"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class LMCacheMPConnector(KVConnectorBase_V1):
+    """
+    The connector for LMCache multi-process mode.
+
+    Extra configs (kv_transfer_config.extra_config):
+    - lmcache.mp.host: the host of the LMCache server.
+    - lmcache.mp.port: the port of the LMCache server.
+    """
+
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
+
+        assert vllm_config.kv_transfer_config is not None
+        server_host = vllm_config.kv_transfer_config.get_from_extra_config(
+            "lmcache.mp.host", "tcp://localhost"
+        )
+        server_port = vllm_config.kv_transfer_config.get_from_extra_config(
+            "lmcache.mp.port", 5555
+        )
+
+        server_url = f"{server_host}:{server_port}"
+        zmq_context = zmq.Context.instance()
+        if self.role == KVConnectorRole.SCHEDULER:
+            self.scheduler_adapter = create_scheduler_adapter(
+                server_url, zmq_context, vllm_config
+            )
+            self.request_trackers: dict[str, LMCacheMPRequestTracker] = {}
+        elif self.role == KVConnectorRole.WORKER:
+            self.worker_adapter = create_worker_adapter(
+                server_url, zmq_context, vllm_config
+            )
+        else:
+            raise ValueError(f"Unknown KVConnectorRole: {self.role}")
+
+        self.vllm_block_size = vllm_config.cache_config.block_size
+
+    @property
+    def role(self) -> KVConnectorRole:
+        return self._role
+
+    # ==============================
+    # Worker-side methods
+    # ==============================
+
+    def _get_connector_metadata(self) -> KVConnectorMetadata:
+        """Get the connector metadata.
+
+        This function should only be called inside the connector.
+
+        Returns:
+            ConnectorMetadata: the connector metadata.
+        """
+
+        # Should only be called while set to valid metadata.
+        assert self._connector_metadata is not None
+        return self._connector_metadata
+
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        """
+        Initialize with the KV caches. Useful for pre-registering the
+        KV Caches in the KVConnector (e.g. for NIXL).
+
+        Args:
+            kv_caches: dictionary of layer names, kv cache
+        """
+        logger.info("Registering kv caches!")
+        self.worker_adapter.register_kv_caches(kv_caches)
+        return
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
+        """
+        Start loading the KV cache from the connector to vLLM's paged
+        KV buffer. This is called from the forward context before the
+        forward pass to enable async loading during model execution.
+
+        Args:
+            forward_context (ForwardContext): the forward context.
+            **kwargs: additional arguments for the load operation
+
+        Note:
+            The number of elements in kv_caches and layer_names should be
+            the same.
+
+        """
+        metadata = self._get_connector_metadata()
+        assert isinstance(metadata, LMCacheMPConnectorMetadata)
+
+        with torch.cuda.stream(torch.cuda.current_stream()):
+            event = torch.cuda.Event(interprocess=True)
+            event.record()
+
+        request_ids = []
+        ops = []
+
+        for meta in metadata.requests:
+            if meta.direction != "RETRIEVE":
+                continue
+            request_ids.append(meta.request_id)
+            ops.append(meta.op)
+
+        if len(request_ids) > 0:
+            logger.info(
+                "HERE! SUBMITTING THE BATCHED RETRIEVE REQUESTS %s", request_ids
+            )
+            self.worker_adapter.batched_submit_retrieve_requests(
+                request_ids, ops, event
+            )
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        """
+        Block until the KV for a specific layer is loaded into vLLM's
+        paged buffer. This is called from within attention layer to ensure
+        async copying from start_load_kv is complete.
+
+        This interface will be useful for layer-by-layer pipelining.
+
+        Args:
+            layer_name: the name of that layer
+        """
+        return
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs: Any,
+    ) -> None:
+        """
+        Start saving a layer of KV cache from vLLM's paged buffer
+        to the connector. This is called from within attention layer to
+        enable async copying during execution.
+
+        Args:
+            layer_name (str): the name of the layer.
+            kv_layer (torch.Tensor): the paged KV buffer of the current
+                layer in vLLM.
+            attn_metadata (AttentionMetadata): the attention metadata.
+            **kwargs: additional arguments for the save operation.
+        """
+        return
+
+    def wait_for_save(self):
+        """
+        Block until all the save operations is done. This is called
+        as the forward context exits to ensure that the async saving
+        from save_kv_layer is complete before finishing the forward.
+
+        This prevents overwrites of paged KV buffer before saving done.
+        """
+        metadata = self._get_connector_metadata()
+        assert isinstance(metadata, LMCacheMPConnectorMetadata)
+
+        with torch.cuda.stream(torch.cuda.current_stream()):
+            event = torch.cuda.Event(interprocess=True)
+            event.record()
+
+        request_ids = []
+        ops = []
+        for meta in metadata.requests:
+            if meta.direction != "STORE":
+                continue
+            request_ids.append(meta.request_id)
+            ops.append(meta.op)
+
+        if len(request_ids) > 0:
+            self.worker_adapter.batched_submit_store_requests(request_ids, ops, event)
+
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        """
+        Notifies worker-side connector ids of requests that have
+        finished generating tokens on the worker.
+        The scheduler process (via the Executors) will use this output
+        to track which workers are done.
+
+        Returns:
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
+            tuple of (sending/saving ids, recving/loading ids).
+            The finished saves/sends req ids must belong to a set provided in a
+            call to this method (this call or a prior one).
+        """
+        val = self.worker_adapter.get_finished(finished_req_ids)
+        # logger.error("Finished req ids: %s, %s", val[0], val[1])
+        return val
+
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        """
+        Get the set of block IDs that failed to load.
+
+        Returns:
+            Set of block IDs that encountered load errors.
+            Empty set if no load errors occurred.
+
+        Notes:
+            - Applies to both sync- and async-loading requests.
+            - Async loading: failed blocks may be reported in any forward pass
+              up to and including the pass where the request ID is returned by
+              `get_finished()`. Even if failures occur, the request must still
+              be reported via `get_finished()`, and the failed block IDs must
+              appear here no later than that same pass.
+            - Sync loading: failed blocks should be reported in the forward
+              pass in which they are detected.
+        """
+        # TODO: add error tracking
+        return set()
+
+    def shutdown(self):
+        """
+        Shutdown the connector. This is called when the worker process
+        is shutting down to ensure that all the async operations are
+        completed and the connector is cleaned up properly.
+        """
+        if hasattr(self, "worker_adapter"):
+            self.worker_adapter.shutdown()
+        return None
+
+    def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]:
+        """
+        Get the KV connector stats collected during the last interval.
+        """
+        return None
+
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[int | None, bool]:
+        """
+        Get number of new tokens that can be loaded from the
+        external KV cache beyond the num_computed_tokens.
+
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+
+        Returns:
+            A tuple with the following elements:
+                - An optional number of tokens that can be loaded from the
+                  external KV cache beyond what is already computed.
+                  If None, it means that the connector needs more time to
+                  determine the number of matched tokens, and the scheduler
+                  should query for this request again later.
+                - `True` if external KV cache tokens will be loaded
+                  asynchronously (between scheduler steps). Must be
+                  'False' if the first element is 0.
+
+        Notes:
+            The connector should only consider the largest prefix of prompt-
+            tokens for which KV cache is actually available at the time of the
+            call. If the cache cannot be loaded for some tokens (e.g., due to
+            connectivity issues or eviction), those tokens must not be taken
+            into account.
+        """
+        tracker = self._get_or_create_request_tracker(request)
+
+        self.scheduler_adapter.maybe_submit_lookup_request(
+            request.request_id, convert_block_hashes_to_bytes(request.block_hashes)
+        )
+
+        ret = self.scheduler_adapter.check_lookup_result(request.request_id)
+        if ret is None:
+            return None, True
+
+        if ret == 0:
+            return 0, False
+
+        assert (
+            ret % (self.scheduler_adapter.num_blocks_per_chunk() * self.vllm_block_size)
+            == 0
+        )
+
+        # Update num stored blocks for the tracker
+        num_vllm_blocks = num_computed_tokens // self.vllm_block_size
+        num_lmcache_blocks = ret // self.vllm_block_size
+        tracker.increase_num_stored_blocks(num_lmcache_blocks)
+
+        # Save the vllm and lmcache hit tokens
+        tracker.num_vllm_hit_blocks = num_vllm_blocks
+        tracker.num_lmcache_hit_blocks = num_lmcache_blocks
+
+        need_to_load = max(0, ret - num_computed_tokens)
+        logger.debug(
+            "vLLM hit is: %d, Need to load is %d", num_computed_tokens, need_to_load
+        )
+        return need_to_load, need_to_load > 0
+
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        """
+        Update KVConnector state after block allocation.
+
+        If get_num_new_matched_tokens previously returned True for a
+        request, this function may be called twice for that same request -
+        first when blocks are allocated for the connector tokens to be
+        asynchronously loaded into, and second when any additional blocks
+        are allocated, after the load/transfer is complete.
+
+        Args:
+            request (Request): the request object.
+            blocks (KVCacheBlocks): the blocks allocated for the request.
+            num_external_tokens (int): the number of tokens that will be
+                loaded from the external KV cache.
+        """
+        # NOTE: the `blocks` are NEW BLOCKS allocated for this request.
+        tracker = self._get_request_tracker(request.request_id)
+        block_ids = reformat_block_ids(blocks.get_block_ids())
+
+        # No matter we need to retrieve or not, we need to update
+        # the block ids into the tracker
+        tracker.update_block_ids(block_ids)
+
+        # Update the state of the tracker
+        condition = tracker.needs_retrieve()
+        if tracker.state == LMCacheMPRequestState.PREFETCHING:
+            # If need to retrieve, change to WAITING_FOR_LOAD
+            # Otherwise, change to READY
+            tracker.state = (
+                LMCacheMPRequestState.WAITING_FOR_LOAD
+                if condition
+                else LMCacheMPRequestState.READY
+            )
+
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
+        """
+        Build the connector metadata for this step.
+
+        This function should NOT modify fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        metadata = LMCacheMPConnectorMetadata()
+
+        self._process_retrieve_requests(metadata)
+        self._process_new_requests(scheduler_output, metadata)
+        self._process_cached_requests(scheduler_output, metadata)
+
+        if len(metadata) > 0:
+            logger.debug("Final connector metadata: %s", metadata)
+
+        return metadata
+
+    def update_connector_output(self, connector_output: KVConnectorOutput):
+        """
+        Update KVConnector state from worker-side connectors output.
+
+        Args:
+            connector_output (KVConnectorOutput): the worker-side
+                connectors output.
+        """
+        return
+
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Called exactly once when a request has finished, before its blocks are
+        freed.
+
+        The connector may assumes responsibility for freeing the blocks
+        asynchronously by returning True.
+
+        Returns:
+            True if the request is being saved/sent asynchronously and blocks
+            should not be freed until the request_id is returned from
+            get_finished().
+            Optional KVTransferParams to be included in the request outputs
+            returned by the engine.
+        """
+        return True, None
+
+    def take_events(self) -> Iterable["KVCacheEvent"]:
+        """
+        Take the KV cache events from the connector.
+
+        Yields:
+            New KV cache events since the last call.
+        """
+        return ()
+
+    @classmethod
+    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None:
+        """
+        Get the required KV cache layout for this connector.
+        Args:
+            vllm_config (VllmConfig): the vllm config.
+
+        Returns:
+            str: the required KV cache layout. e.g. HND, or NHD.
+            None if the connector does not require a specific layout.
+        """
+
+        if cls is KVConnectorBase_V1:
+            raise TypeError(
+                "get_required_kvcache_layout should not be called "
+                "on the abstract base class"
+            )
+        return None
+
+    def get_finished_count(self) -> int | None:
+        """
+        Get the count of requests expected to complete send/receive operations
+        via this connector. This method is used to initialize the
+        KVOutputAggregator, overwriting the default world_size.
+
+        Returns:
+            int: expected sending or receiving completion count.
+        """
+        return None
+
+    @classmethod
+    def build_kv_connector_stats(
+        cls, data: dict[str, Any] | None = None
+    ) -> Optional["KVConnectorStats"]:
+        """
+        KVConnectorStats resolution method. This method allows dynamically
+        registered connectors to return their own KVConnectorStats object,
+        which can implement custom aggregation logic on the data dict.
+        """
+        return None
+
+    @classmethod
+    def build_prom_metrics(
+        cls,
+        vllm_config: "VllmConfig",
+        metric_types: dict[type["PromMetric"], type["PromMetricT"]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ) -> Optional["KVConnectorPromMetrics"]:
+        """
+        Create a KVConnectorPromMetrics subclass which should register
+        per-connector Prometheus metrics and implement observe() to
+        expose connector transfer stats via Prometheus.
+        """
+        return None
+
+    ##############################
+    # Helper functions
+    ##############################
+    def _process_retrieve_requests(
+        self,
+        metadata: LMCacheMPConnectorMetadata,
+    ) -> None:
+        blocks_per_chunk = self.scheduler_adapter.num_blocks_per_chunk()
+
+        for request_tracker in self.request_trackers.values():
+            if request_tracker.state != LMCacheMPRequestState.WAITING_FOR_LOAD:
+                continue
+            r_metadata = LMCacheMPRequestMetadata.GetRetrieveMetadata(
+                request_tracker, blocks_per_chunk
+            )
+            if r_metadata is not None:
+                metadata.add_request_metadata(r_metadata)
+            request_tracker.state = LMCacheMPRequestState.READY
+
+    def _process_new_requests(
+        self,
+        scheduler_output: SchedulerOutput,
+        metadata: LMCacheMPConnectorMetadata,
+    ) -> None:
+        blocks_per_chunk = self.scheduler_adapter.num_blocks_per_chunk()
+
+        for new_request in scheduler_output.scheduled_new_reqs:
+            request_tracker = self._get_request_tracker(new_request.req_id)
+
+            num_new_tokens = scheduler_output.num_scheduled_tokens[new_request.req_id]
+            request_tracker.increase_num_scheduled_tokens(num_new_tokens)
+
+            r_meta = LMCacheMPRequestMetadata.GetStoreMetadata(
+                request_tracker, blocks_per_chunk, self.vllm_block_size
+            )
+            if r_meta is not None:
+                metadata.add_request_metadata(r_meta)
+
+    def _process_cached_requests(
+        self,
+        scheduler_output: SchedulerOutput,
+        metadata: LMCacheMPConnectorMetadata,
+    ) -> None:
+        blocks_per_chunk = self.scheduler_adapter.num_blocks_per_chunk()
+
+        cached_reqs = scheduler_output.scheduled_cached_reqs
+        for idx, request_id in enumerate(cached_reqs.req_ids):
+            request_tracker = self._get_request_tracker(request_id)
+
+            # Update block ids
+            new_block_ids = reformat_block_ids(cached_reqs.new_block_ids[idx])
+            request_tracker.update_block_ids(new_block_ids)
+
+            # Update new scheduled tokens
+            num_new_tokens = cached_reqs.num_computed_tokens[idx]
+            request_tracker.increase_num_scheduled_tokens(num_new_tokens)
+
+            r_meta = LMCacheMPRequestMetadata.GetStoreMetadata(
+                request_tracker, blocks_per_chunk, self.vllm_block_size
+            )
+
+            if r_meta is not None:
+                metadata.add_request_metadata(r_meta)
+
+    def _get_request_tracker(self, request_id: str) -> LMCacheMPRequestTracker:
+        assert request_id in self.request_trackers, (
+            f"Request tracker for request_id {request_id} not found. "
+        )
+        return self.request_trackers[request_id]
+
+    def _get_or_create_request_tracker(
+        self, request: "Request"
+    ) -> LMCacheMPRequestTracker:
+        request_id = request.request_id
+        if request_id not in self.request_trackers:
+            new_tracker = LMCacheMPRequestTracker(request)
+            self.request_trackers[request_id] = new_tracker
+        return self.request_trackers[request_id]
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
index e40007230ba45c3f3d4346053f3b7a1b415db9ef..d6ea4f1ab4cfc28a909fde66e3b4b4fbcba2a1be 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
@@ -1,26 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
-from typing import Any, Optional, Union
+from typing import Any, TypeAlias, TypeVar
 
-from vllm.config.kv_transfer import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
-from vllm.distributed.kv_transfer.kv_transfer_state import (
-    has_kv_transfer_group)
+from prometheus_client import Counter, Gauge, Histogram
+
+from vllm.config import KVTransferConfig, VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_transfer_state import has_kv_transfer_group
 from vllm.logger import init_logger
 
+PromMetric: TypeAlias = Gauge | Counter | Histogram
+PromMetricT = TypeVar("PromMetricT", bound=PromMetric)
+
 logger = init_logger(__name__)
 
 
 @dataclass
 class KVConnectorStats:
     """
-    Base class for KV Connector Stats, a container for transfer performance 
-    metrics or otherwise important telemetry from the connector. 
+    Base class for KV Connector Stats, a container for transfer performance
+    metrics or otherwise important telemetry from the connector.
     All sub-classes need to be serializable as stats are sent from worker to
     logger process.
     """
+
     data: dict[str, Any] = field(default_factory=dict)
 
     def reset(self):
@@ -33,10 +37,10 @@ class KVConnectorStats:
         """
         raise NotImplementedError
 
-    def reduce(self) -> dict[str, Union[int, float]]:
+    def reduce(self) -> dict[str, int | float]:
         """
-        Reduce the observations collected during a time interval to one or 
-        more representative values (eg avg/median/sum of the series). 
+        Reduce the observations collected during a time interval to one or
+        more representative values (eg avg/median/sum of the series).
         This is meant to be called by the logger to produce a summary of the
         stats for the last time interval.
         """
@@ -48,18 +52,18 @@ class KVConnectorStats:
 
 
 class KVConnectorLogging:
-
     def __init__(self, kv_tranfer_config: KVTransferConfig):
         # This should be called on frontend process.
         assert not has_kv_transfer_group()
         # Instantiate the connector's stats class.
         if kv_tranfer_config and kv_tranfer_config.kv_connector:
             self.connector_cls = KVConnectorFactory.get_connector_class(
-                kv_tranfer_config)
+                kv_tranfer_config
+            )
         self.reset()
 
     def reset(self):
-        self.transfer_stats_accumulator: Optional[KVConnectorStats] = None
+        self.transfer_stats_accumulator: KVConnectorStats | None = None
 
     def observe(self, transfer_stats_data: dict[str, Any]):
         # Should not be called when a KVConnector is not configured.
@@ -69,32 +73,117 @@ class KVConnectorLogging:
         # We expect transfer_stats_data to be aggregated across all workers and
         # consist of observations from a single connector or a MultiConnector.
         transfer_stats = self.connector_cls.build_kv_connector_stats(
-            transfer_stats_data)
+            transfer_stats_data
+        )
         if transfer_stats is None:
             logger.warning_once(
                 "The connector %s is collecting stats but "
                 "does not implement the "
                 "`build_kv_connector_stats` method. "
-                "Stats will not be logged.", self.connector_cls)
+                "Stats will not be logged.",
+                self.connector_cls,
+            )
             return
 
         if self.transfer_stats_accumulator is None:
             self.transfer_stats_accumulator = transfer_stats
         else:
             # Accumulate last interval stats.
-            self.transfer_stats_accumulator = \
-                self.transfer_stats_accumulator.aggregate(transfer_stats)
+            self.transfer_stats_accumulator = self.transfer_stats_accumulator.aggregate(
+                transfer_stats
+            )
 
     def log(self, log_fn=logger.info):
         """Log transfer metrics periodically, similar to throughput logging"""
-        if (self.transfer_stats_accumulator
-                and not self.transfer_stats_accumulator.is_empty()):
+        if (
+            self.transfer_stats_accumulator
+            and not self.transfer_stats_accumulator.is_empty()
+        ):
             # Produce a single cumulative stats object for the last time
             # interval from the recorded observations.
             xfer_metrics = self.transfer_stats_accumulator.reduce()
-            xfer_metrics_str = ", ".join(f"{k}={v}"
-                                         for k, v in xfer_metrics.items())
+            xfer_metrics_str = ", ".join(f"{k}={v}" for k, v in xfer_metrics.items())
             log_fn("KV Transfer metrics: %s", xfer_metrics_str)
 
             # Reset metrics for next interval
-            self.reset()
\ No newline at end of file
+            self.reset()
+
+
+class KVConnectorPromMetrics:
+    """
+    A base class for per-connector Prometheus metric registration
+    and recording.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        metric_types: dict[type[PromMetric], type[PromMetricT]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ):
+        self._kv_transfer_config = vllm_config.kv_transfer_config
+        self._gauge_cls = metric_types[Gauge]
+        self._counter_cls = metric_types[Counter]
+        self._histogram_cls = metric_types[Histogram]
+        self._labelnames = labelnames
+        self._per_engine_labelvalues = per_engine_labelvalues
+
+    def make_per_engine(self, metric: PromMetric) -> PromMetric:
+        """
+        Create a per-engine child of a prometheus_client.Metric with
+        the appropriate labels set. The parent metric must be created
+        using the labelnames list.
+        """
+        return {
+            idx: metric.labels(*labelvalues)
+            for idx, labelvalues in self._per_engine_labelvalues.items()
+        }
+
+    def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
+        """
+        Record the supplied transfer statistics to Prometheus metrics. These
+        statistics are engine-specific, and should be recorded to a metric
+        with the appropriate 'engine' label. These metric instances can be
+        created using the make_per_engine() helper method.
+        """
+        raise NotImplementedError
+
+
+class KVConnectorPrometheus:
+    """
+    Support for registering per-connector Prometheus metrics, and
+    recording transfer statistics to those metrics. Uses
+    KVConnectorBase.build_prom_metrics().
+    """
+
+    _gauge_cls = Gauge
+    _counter_cls = Counter
+    _histogram_cls = Histogram
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ):
+        self.prom_metrics: KVConnectorPromMetrics | None = None
+        kv_transfer_config = vllm_config.kv_transfer_config
+        if kv_transfer_config and kv_transfer_config.kv_connector:
+            connector_cls = KVConnectorFactory.get_connector_class(kv_transfer_config)
+            metric_types = {
+                Gauge: self._gauge_cls,
+                Counter: self._counter_cls,
+                Histogram: self._histogram_cls,
+            }
+            self.prom_metrics = connector_cls.build_prom_metrics(
+                vllm_config,
+                metric_types,
+                labelnames,
+                per_engine_labelvalues,
+            )
+
+    def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
+        if self.prom_metrics is None:
+            return
+        self.prom_metrics.observe(transfer_stats_data, engine_idx)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 6836a71e58d62809184ca1f94251d64b19df7fcf..c9d08e9b78ed0d21bfb2102cdfbe71eaa70aaf5d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -3,18 +3,25 @@
 import copy
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 
 from vllm.config import VllmConfig
 from vllm.config.kv_transfer import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
+from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-    KVConnectorStats)
+    KVConnectorPromMetrics,
+    KVConnectorStats,
+    PromMetric,
+    PromMetricT,
+)
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
@@ -24,6 +31,7 @@ if TYPE_CHECKING:
     from vllm.distributed.kv_events import KVCacheEvent
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -32,7 +40,7 @@ logger = init_logger(__name__)
 @dataclass
 class MultiKVConnectorMetadata(KVConnectorMetadata):
     metadata: tuple[KVConnectorMetadata, ...]
-    extra_async_saves: Optional[dict[str, int]] = None
+    extra_async_saves: dict[str, int] | None = None
 
 
 @dataclass
@@ -58,8 +66,7 @@ class MultiKVConnectorStats(KVConnectorStats):
     def reduce(self) -> dict[str, Any]:
         # TODO (NickLucche) Adjust for logging on separate lines
         return {
-            connector_id: stats.reduce()
-            for connector_id, stats in self.data.items()
+            connector_id: stats.reduce() for connector_id, stats in self.data.items()
         }
 
     def is_empty(self) -> bool:
@@ -72,6 +79,27 @@ class MultiKVConnectorStats(KVConnectorStats):
         self.data[connector_id] = stats
 
 
+class MultiKVConnectorPromMetrics(KVConnectorPromMetrics):
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        metric_types: dict[type[PromMetric], type[PromMetricT]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+        prom_metrics: dict[str, KVConnectorPromMetrics],
+    ):
+        super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
+        self._prom_metrics = prom_metrics
+
+    def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
+        for connector_id, stats_data in transfer_stats_data.items():
+            assert connector_id in self._prom_metrics, (
+                f"{connector_id} is not contained in the list of registered connectors "
+                f"with Prometheus metrics support: {self._prom_metrics.keys()}"
+            )
+            self._prom_metrics[connector_id].observe(stats_data["data"], engine_idx)
+
+
 class MultiConnector(KVConnectorBase_V1):
     """
     A wrapper for using multiple KVConnectors at the same time.
@@ -82,21 +110,22 @@ class MultiConnector(KVConnectorBase_V1):
     - Save to all connectors.
     """
 
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
+
         self._connectors: list[KVConnectorBase_V1] = []
         self._ktc_kv_transfer_config = []
-        ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
-            "connectors")
-        assert ktcs is not None
-        for ktc in ktcs:
-            temp_config = copy.copy(vllm_config)
-            engine_id = ktc.get("engine_id",
-                                vllm_config.kv_transfer_config.engine_id)
-            temp_config.kv_transfer_config = KVTransferConfig(
-                **ktc, engine_id=engine_id)
-            self._connectors.append(
-                KVConnectorFactory.create_connector(temp_config, role))
+        for connector_cls, temp_config in self._get_connector_classes_and_configs(
+            vllm_config
+        ):
+            self._connectors.append(connector_cls(temp_config, role, kv_cache_config))
             self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
 
         # A mapping from request id to the index of the connector chosen to
@@ -109,6 +138,32 @@ class MultiConnector(KVConnectorBase_V1):
         # Propagated from scheduler to worker side via the connector metadata.
         self._extra_async_saves: dict[str, int] = {}
 
+    @classmethod
+    def _get_connector_classes_and_configs(
+        cls, vllm_config: "VllmConfig"
+    ) -> list[tuple[type[KVConnectorBaseType], "VllmConfig"]]:
+        assert vllm_config.kv_transfer_config is not None
+        ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
+            "connectors"
+        )
+        assert ktcs is not None
+        ret: list[tuple[type[KVConnectorBaseType], VllmConfig]] = []
+        for ktc in ktcs:
+            temp_config = copy.copy(vllm_config)
+            engine_id = ktc.get("engine_id", vllm_config.kv_transfer_config.engine_id)
+            temp_config.kv_transfer_config = KVTransferConfig(
+                **ktc, engine_id=engine_id
+            )
+            ret.append(
+                (
+                    KVConnectorFactory.get_connector_class(
+                        temp_config.kv_transfer_config
+                    ),
+                    temp_config,
+                )
+            )
+        return ret
+
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         for c in self._connectors:
             c.register_kv_caches(kv_caches)
@@ -116,27 +171,32 @@ class MultiConnector(KVConnectorBase_V1):
     # We must override the base class method here because we need to bind
     # the metadata to each connector in the order of the connectors in the
     # MultiKVConnectorMetadata.
-    def bind_connector_metadata(
-            self, connector_metadata: KVConnectorMetadata) -> None:
+    #
+    # Note: Call the base class method to ensure metadata is also set on the
+    # MultiConnector instance itself; otherwise, `has_connector_metadata()` will
+    # always return False.
+    def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
         assert isinstance(connector_metadata, MultiKVConnectorMetadata)
         if connector_metadata.extra_async_saves:
-            self._extra_async_saves.update(
-                connector_metadata.extra_async_saves)
+            self._extra_async_saves.update(connector_metadata.extra_async_saves)
         for c, cm in zip(self._connectors, connector_metadata.metadata):
             c.bind_connector_metadata(cm)
+        super().bind_connector_metadata(connector_metadata)
 
     def clear_connector_metadata(self) -> None:
         for c in self._connectors:
             c.clear_connector_metadata()
+        super().clear_connector_metadata()
 
     def shutdown(self):
-        exception: Optional[Exception] = None
+        exception: Exception | None = None
         for c in self._connectors:
             try:
                 c.shutdown()
             except Exception as e:
-                logger.exception("Exception during connector %s shutdown.",
-                                 c.__class__.__name__)
+                logger.exception(
+                    "Exception during connector %s shutdown.", c.__class__.__name__
+                )
                 exception = e
         if exception:
             raise exception
@@ -144,8 +204,7 @@ class MultiConnector(KVConnectorBase_V1):
     # ==============================
     # Worker-side methods
     # ==============================
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
         for c in self._connectors:
             c.start_load_kv(forward_context, **kwargs)
 
@@ -153,8 +212,13 @@ class MultiConnector(KVConnectorBase_V1):
         for c in self._connectors:
             c.wait_for_layer_load(layer_name)
 
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
         for c in self._connectors:
             c.save_kv_layer(layer_name, kv_layer, attn_metadata, **kwargs)
 
@@ -164,7 +228,7 @@ class MultiConnector(KVConnectorBase_V1):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         finished_sending: set[str] = set()
         finished_recving: set[str] = set()
         for c in self._connectors:
@@ -189,6 +253,12 @@ class MultiConnector(KVConnectorBase_V1):
 
         return finished_sending or None, finished_recving or None
 
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        agg_block_ids: set[int] = set()
+        for c in self._connectors:
+            agg_block_ids |= c.get_block_ids_with_load_errors()
+        return agg_block_ids
+
     # ==============================
     # Scheduler-side methods
     # ==============================
@@ -196,11 +266,12 @@ class MultiConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         to_return = (0, False)
         for i, c in enumerate(self._connectors):
             toks, load_async = c.get_num_new_matched_tokens(
-                request, num_computed_tokens)
+                request, num_computed_tokens
+            )
             # If there is a connector still looking up the matches,
             # we return None to indicate that we are not done yet.
             if toks is None:
@@ -212,27 +283,27 @@ class MultiConnector(KVConnectorBase_V1):
                 to_return = (toks, load_async)
         return to_return
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
-        chosen_connector = self._requests_to_connector.get(
-            request.request_id, -1)
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        chosen_connector = self._requests_to_connector.get(request.request_id, -1)
         empty_blocks = blocks.new_empty()
         for i, c in enumerate(self._connectors):
             if i == chosen_connector:
                 # Forward call to the chosen connector (if any).
-                c.update_state_after_alloc(request, blocks,
-                                           num_external_tokens)
+                c.update_state_after_alloc(request, blocks, num_external_tokens)
             else:
                 # Call with empty blocks for other connectors.
                 c.update_state_after_alloc(request, empty_blocks, 0)
 
     def build_connector_meta(
-            self,
-            scheduler_output: SchedulerOutput) -> MultiKVConnectorMetadata:
-        metadata = MultiKVConnectorMetadata(metadata=tuple(
-            c.build_connector_meta(scheduler_output)
-            for c in self._connectors))
+        self, scheduler_output: SchedulerOutput
+    ) -> MultiKVConnectorMetadata:
+        metadata = MultiKVConnectorMetadata(
+            metadata=tuple(
+                c.build_connector_meta(scheduler_output) for c in self._connectors
+            )
+        )
         if self._extra_async_saves:
             metadata.extra_async_saves = self._extra_async_saves
             self._extra_async_saves = {}
@@ -246,7 +317,7 @@ class MultiConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         blocks: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         async_saves = 0
         kv_txfer_params = None
         for c in self._connectors:
@@ -258,7 +329,8 @@ class MultiConnector(KVConnectorBase_V1):
                     # TODO we can probably change this to merge the dicts here,
                     # checking for key clashes.
                     raise RuntimeError(
-                        "Only one connector can produce KV transfer params")
+                        "Only one connector can produce KV transfer params"
+                    )
                 kv_txfer_params = txfer_params
         if async_saves > 1:
             self._extra_async_saves[request.request_id] = async_saves - 1
@@ -273,8 +345,7 @@ class MultiConnector(KVConnectorBase_V1):
             yield from c.take_events()
 
     @classmethod
-    def get_required_kvcache_layout(
-            cls, vllm_config: "VllmConfig") -> Optional[str]:
+    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None:
         """
         Get the required KV cache layout for this connector.
         Args:
@@ -284,39 +355,69 @@ class MultiConnector(KVConnectorBase_V1):
             str: the required KV cache layout. e.g. HND, or NHD.
             None if the connector does not require a specific layout.
         """
-        ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
-            "connectors")
-        assert ktcs is not None
+        assert vllm_config.kv_transfer_config is not None
         layouts: set[str] = set()
-        temp_vllm_config = copy.copy(vllm_config)
-        for ktc in ktcs:
-            kv_transfer_config = KVTransferConfig(**ktc)
-            temp_vllm_config.kv_transfer_config = kv_transfer_config
-            connector_cls = KVConnectorFactory.get_connector_class(
-                kv_transfer_config)
-            required_kvcache_layout = (
-                connector_cls.get_required_kvcache_layout(temp_vllm_config))
+        for connector_cls, temp_config in cls._get_connector_classes_and_configs(
+            vllm_config
+        ):
+            required_kvcache_layout = connector_cls.get_required_kvcache_layout(
+                temp_config
+            )
             if required_kvcache_layout is not None:
                 layouts.add(required_kvcache_layout)
 
         if len(layouts) > 1:
-            raise ValueError(f"KV cache layout mismatch: "
-                             f"found {len(layouts)} different layouts "
-                             f"({', '.join(layouts) })."
-                             f"All connectors must use the same layout.")
+            raise ValueError(
+                f"KV cache layout mismatch: "
+                f"found {len(layouts)} different layouts "
+                f"({', '.join(layouts)})."
+                f"All connectors must use the same layout."
+            )
         return next(iter(layouts), None)
 
     @classmethod
     def build_kv_connector_stats(
-            cls,
-            data: Optional[dict[str,
-                                Any]] = None) -> Optional[KVConnectorStats]:
-        return MultiKVConnectorStats(data=data) if data is not None \
-            else MultiKVConnectorStats()
+        cls, data: dict[str, Any] | None = None
+    ) -> KVConnectorStats | None:
+        if data is None:
+            return MultiKVConnectorStats()
+
+        # data is a dict mapping connector name to their stats data.
+        # The stats data can be either:
+        # 1. Already-instantiated KVConnectorStats objects (same process)
+        # 2. Serialized dicts (cross-process after serialization)
+        # We need to reconstruct proper KVConnectorStats objects from dicts
+        reconstructed_data = {}
+        for connector_name, stats_value in data.items():
+            # If already a KVConnectorStats object, use it directly
+            if isinstance(stats_value, KVConnectorStats):
+                reconstructed_data[connector_name] = stats_value
+                continue
+
+            # Otherwise, reconstruct from serialized dict
+            # Get the connector class to reconstruct its stats
+            connector_cls = KVConnectorFactory.get_connector_class_by_name(
+                connector_name
+            )
+
+            # stats_value is the serialized dataclass which contains {'data': {...}}
+            # We need to extract the inner 'data' field to avoid double-nesting
+            assert isinstance(stats_value, dict) and "data" in stats_value, (
+                f"Expected a dict with a 'data' field, got {stats_value}"
+            )
+            inner_data = stats_value["data"]
+
+            # Use the connector's build_kv_connector_stats to reconstruct
+            if reconstructed_stats := connector_cls.build_kv_connector_stats(
+                data=inner_data
+            ):
+                reconstructed_data[connector_name] = reconstructed_stats
 
-    def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]:
+        return MultiKVConnectorStats(data=reconstructed_data)
+
+    def get_kv_connector_stats(self) -> MultiKVConnectorStats | None:
         # Group connector stats by connector type.
-        stats_by_connector: Optional[MultiKVConnectorStats] = None
+        stats_by_connector: MultiKVConnectorStats | None = None
         for c in self._connectors:
             stats = c.get_kv_connector_stats()
             if stats is None:
@@ -326,3 +427,28 @@ class MultiConnector(KVConnectorBase_V1):
                 stats_by_connector = MultiKVConnectorStats()
             stats_by_connector[c.__class__.__name__] = stats
         return stats_by_connector
+
+    @classmethod
+    def build_prom_metrics(
+        cls,
+        vllm_config: "VllmConfig",
+        metric_types: dict[type["PromMetric"], type["PromMetricT"]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ) -> KVConnectorPromMetrics:
+        prom_metrics: dict[str, KVConnectorPromMetrics] = {}
+        for connector_cls, temp_config in cls._get_connector_classes_and_configs(
+            vllm_config
+        ):
+            connector_prom = connector_cls.build_prom_metrics(
+                temp_config, metric_types, labelnames, per_engine_labelvalues
+            )
+            if connector_prom is not None:
+                prom_metrics[connector_cls.__name__] = connector_prom
+        return MultiKVConnectorPromMetrics(
+            vllm_config,
+            metric_types,
+            labelnames,
+            per_engine_labelvalues,
+            prom_metrics,
+        )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 9e657703aa952c1b11d469cf9bbaf321fb94220a..a70c98b6371315f21b9e37b0ff91fd8507eb6dd3 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -4,6 +4,7 @@ import contextlib
 import copy
 import logging
 import math
+import os
 import queue
 import threading
 import time
@@ -12,7 +13,7 @@ from collections import defaultdict
 from collections.abc import Iterator
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 import msgspec
 import numpy as np
@@ -20,26 +21,40 @@ import torch
 import zmq
 
 from vllm import envs
-from vllm.attention.selector import backend_name_to_enum, get_attn_backend
+from vllm.attention import AttentionBackend
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.selector import get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+    CopyBlocksOp,
+    KVConnectorBase_V1,
+    KVConnectorHandshakeMetadata,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-    KVConnectorStats)
+    KVConnectorPromMetrics,
+    KVConnectorStats,
+    PromMetric,
+    PromMetricT,
+)
 from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
-    get_tp_group)
-from vllm.distributed.utils import divide
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+)
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
-from vllm.platforms import _Backend, current_platform
-from vllm.utils import make_zmq_path, make_zmq_socket
+from vllm.platforms import current_platform
+from vllm.utils.network_utils import make_zmq_path, make_zmq_socket
 from vllm.v1.attention.backends.utils import get_kv_cache_layout
 from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.worker.block_table import BlockTable
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 Transfer = tuple[int, float]  # (xfer_handle, start_time)
@@ -53,10 +68,14 @@ logger = init_logger(__name__)
 # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
 try:
     from nixl._api import nixl_agent as NixlWrapper
+    from nixl._bindings import nixlXferTelemetry
+
     logger.info("NIXL is available")
 except ImportError:
     logger.warning("NIXL is not available")
     NixlWrapper = None
+    nixlXferTelemetry = None
+
 
 try:
     from nixl._api import nixl_agent_config
@@ -67,31 +86,36 @@ except ImportError:
 # Supported platforms and types of kv transfer buffer.
 # {device: tuple of supported kv buffer types}
 _NIXL_SUPPORTED_DEVICE = {
-    "cuda": ("cuda", ),
-    "tpu": ("cpu", ),
-    "xpu": ("cpu", ),
+    "cuda": (
+        "cuda",
+        "cpu",
+    ),
+    "tpu": ("cpu",),
+    "xpu": ("cpu",),
+    "cpu": ("cpu",),
 }
 # support for oot platform by providing mapping in current_platform
 _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices())
 
 
-class NixlAgentMetadata(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        # required for @cached_property.
-        dict=True):
+@dataclass
+class NixlAgentMetadata(KVConnectorHandshakeMetadata):
     engine_id: str
     agent_metadata: bytes
     kv_caches_base_addr: list[int]
+    device_id: int
     num_blocks: int
     block_lens: list[int]
     attn_backend_name: str
     kv_cache_layout: str
+    block_size: int
 
 
 @dataclass
 class ReqMeta:
     local_block_ids: list[int]
+    # To be used when logical block size does not match the kernel block size
+    local_physical_block_ids: list[int]
     remote_block_ids: list[int]
     remote_host: str
     remote_port: int
@@ -100,12 +124,12 @@ class ReqMeta:
 
 
 class NixlConnectorMetadata(KVConnectorMetadata):
-
     def __init__(self):
         self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
         self.reqs_to_save: dict[ReqId, ReqMeta] = {}
         self.reqs_to_send: dict[ReqId, float] = {}
         self.reqs_in_batch: set[ReqId] = set()
+        self.reqs_not_processed: set[ReqId] = set()
 
     def add_new_req(
         self,
@@ -119,6 +143,7 @@ class NixlConnectorMetadata(KVConnectorMetadata):
         assert load_remote_cache ^ save_to_host
         _req = ReqMeta(
             local_block_ids=local_block_ids,
+            local_physical_block_ids=local_block_ids,
             remote_block_ids=kv_transfer_params["remote_block_ids"],
             remote_engine_id=kv_transfer_params["remote_engine_id"],
             remote_host=kv_transfer_params["remote_host"],
@@ -133,20 +158,26 @@ class NixlConnectorMetadata(KVConnectorMetadata):
 
 
 class NixlConnector(KVConnectorBase_V1):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
 
-    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
         assert vllm_config.kv_transfer_config is not None
         assert vllm_config.kv_transfer_config.engine_id is not None
         self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
 
         if role == KVConnectorRole.SCHEDULER:
-            self.connector_scheduler: Optional[NixlConnectorScheduler] = \
+            self.connector_scheduler: NixlConnectorScheduler | None = (
                 NixlConnectorScheduler(vllm_config, self.engine_id)
-            self.connector_worker: Optional[NixlConnectorWorker] = None
+            )
+            self.connector_worker: NixlConnectorWorker | None = None
         elif role == KVConnectorRole.WORKER:
             self.connector_scheduler = None
-            self.connector_worker = NixlConnectorWorker(
-                vllm_config, self.engine_id)
+            self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id)
 
     ############################################################
     # Class Methods
@@ -154,8 +185,10 @@ class NixlConnector(KVConnectorBase_V1):
     @classmethod
     def get_required_kvcache_layout(cls, vllm_config: VllmConfig):
         if vllm_config.model_config is None:
-            logger.warning_once("Unable to detect current VLLM config. "
-                                "Fallback to default kv cache layout.")
+            logger.warning_once(
+                "Unable to detect current VLLM config. "
+                "Fallback to default kv cache layout."
+            )
             return None
         use_mla = vllm_config.model_config.use_mla
         if use_mla:
@@ -163,8 +196,9 @@ class NixlConnector(KVConnectorBase_V1):
             # as the layout should not matter in that case,
             # which fallback to the default behavior.
             return None
-        logger.info_once("NixlConnector setting KV cache "
-                         "layout to HND for better xfer performance.")
+        logger.info_once(
+            "NixlConnector setting KV cache layout to HND for better xfer performance."
+        )
         return "HND"
 
     ############################################################
@@ -172,18 +206,20 @@ class NixlConnector(KVConnectorBase_V1):
     ############################################################
 
     def get_num_new_matched_tokens(
-            self, request: "Request",
-            num_computed_tokens: int) -> tuple[Optional[int], bool]:
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int | None, bool]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.get_num_new_matched_tokens(
-            request, num_computed_tokens)
+            request, num_computed_tokens
+        )
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         assert self.connector_scheduler is not None
         return self.connector_scheduler.update_state_after_alloc(
-            request, blocks, num_external_tokens)
+            request, blocks, num_external_tokens
+        )
 
     def build_connector_meta(
         self,
@@ -196,10 +232,22 @@ class NixlConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.request_finished(request, block_ids)
 
+    def set_xfer_handshake_metadata(
+        self, metadata: dict[int, KVConnectorHandshakeMetadata]
+    ) -> None:
+        """
+        Set the KV connector handshake metadata for this connector.
+
+        Args:
+            metadata (dict): the handshake metadata to set.
+        """
+        assert self.connector_scheduler is not None
+        self.connector_scheduler.set_xfer_handshake_metadata(metadata)
+
     ############################################################
     # Worker Side Methods
     ############################################################
@@ -211,26 +259,44 @@ class NixlConnector(KVConnectorBase_V1):
         assert self.connector_worker is not None
         self.connector_worker.set_host_xfer_buffer_ops(copy_operation)
 
-    def get_finished(self,
-                     finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
+    def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         """Get the finished recving and sending requests."""
         assert self.connector_worker is not None
         return self.connector_worker.get_finished()
 
-    def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        """Get block IDs that failed to load via NIXL."""
         assert self.connector_worker is not None
+        return self.connector_worker.get_block_ids_with_load_errors()
+
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
+        if self.connector_worker is None:
+            return None
         return self.connector_worker.get_kv_connector_stats()
 
     @classmethod
     def build_kv_connector_stats(
-            cls,
-            data: Optional[dict[str,
-                                Any]] = None) -> Optional[KVConnectorStats]:
-        return NixlKVConnectorStats(data=data) if data is not None \
+        cls, data: dict[str, Any] | None = None
+    ) -> KVConnectorStats | None:
+        return (
+            NixlKVConnectorStats(data=data)
+            if data is not None
             else NixlKVConnectorStats()
+        )
 
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+    @classmethod
+    def build_prom_metrics(
+        cls,
+        vllm_config: VllmConfig,
+        metric_types: dict[type[PromMetric], type[PromMetricT]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ) -> KVConnectorPromMetrics:
+        return NixlPromMetrics(
+            vllm_config, metric_types, labelnames, per_engine_labelvalues
+        )
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
         assert self.connector_worker is not None
         assert isinstance(self._connector_metadata, NixlConnectorMetadata)
         self.connector_worker.start_load_kv(self._connector_metadata)
@@ -239,21 +305,40 @@ class NixlConnector(KVConnectorBase_V1):
         """NixlConnector does not do layerwise saving."""
         pass
 
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
         """NixlConnector does not save explicitly."""
         pass
 
     def wait_for_save(self):
         assert self.connector_worker is not None
         assert isinstance(self._connector_metadata, NixlConnectorMetadata)
-        if self.connector_worker.use_host_buffer and \
-           self.connector_worker.copy_blocks:
+        if self.connector_worker.use_host_buffer and self.connector_worker.copy_blocks:
             self.connector_worker.save_kv_to_host(self._connector_metadata)
 
     def shutdown(self):
         if self.connector_worker is not None:
             self.connector_worker.shutdown()
+        if self.connector_scheduler is not None:
+            self.connector_scheduler.shutdown()
+
+    def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
+        """
+        Get the KVConnector handshake metadata for this connector.
+        This metadata is used for out-of-band connector handshake
+        between P/D workers.
+
+        Returns:
+            KVConnectorHandshakeMetadata: the handshake metadata.
+            None if no handshake metadata is available.
+        """
+        assert self.connector_worker is not None
+        return self.connector_worker.xfer_handshake_metadata
 
 
 class NixlConnectorScheduler:
@@ -265,13 +350,24 @@ class NixlConnectorScheduler:
         self.engine_id: EngineId = engine_id
         self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
         self.side_channel_port = (
-            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
-            vllm_config.parallel_config.data_parallel_rank *
-            vllm_config.parallel_config.tensor_parallel_size)
-        self.use_host_buffer = \
-            vllm_config.kv_transfer_config.kv_buffer_device == "cpu"
+            envs.VLLM_NIXL_SIDE_CHANNEL_PORT
+            + vllm_config.parallel_config.data_parallel_rank
+        )
+        assert vllm_config.kv_transfer_config is not None
+        if current_platform.device_type == "cpu":
+            self.use_host_buffer = False
+        else:
+            self.use_host_buffer = (
+                vllm_config.kv_transfer_config.kv_buffer_device == "cpu"
+            )
+
         logger.info("Initializing NIXL Scheduler %s", engine_id)
 
+        # Background thread for handling new handshake requests.
+        self._nixl_handshake_listener_t: threading.Thread | None = None
+        self._encoded_xfer_handshake_metadata: dict[int, Any] = {}
+        self._stop_event = threading.Event()
+
         # Requests that need to start recv/send.
         # New requests are added by update_state_after_alloc in
         # the scheduler. Used to make metadata passed to Worker.
@@ -280,10 +376,96 @@ class NixlConnectorScheduler:
         # Reqs to send and their expiration time
         self._reqs_need_send: dict[ReqId, float] = {}
         self._reqs_in_batch: set[ReqId] = set()
+        # Reqs to remove from processed set because they're not to send after
+        # remote prefill or aborted.
+        self._reqs_not_processed: set[ReqId] = set()
+
+    def shutdown(self):
+        self._stop_event.set()
+        if self._nixl_handshake_listener_t is not None:
+            self._nixl_handshake_listener_t.join()
+            self._nixl_handshake_listener_t = None
+
+    def set_xfer_handshake_metadata(
+        self, metadata: dict[int, KVConnectorHandshakeMetadata]
+    ) -> None:
+        """
+        Set the KV connector handshake metadata for this connector.
+
+        Args:
+            metadata (dict): the handshake metadata to set.
+        """
+        encoded_data: dict[int, bytes] = {}
+        encoder = msgspec.msgpack.Encoder()
+        for tp_rank, rank_metadata in metadata.items():
+            if not isinstance(rank_metadata, NixlAgentMetadata):
+                raise ValueError(
+                    "NixlConnectorScheduler expects NixlAgentMetadata for "
+                    "handshake metadata."
+                )
+            encoded_data[tp_rank] = encoder.encode(rank_metadata)
+            logger.debug(
+                "Tp rank %d: encoded NixlAgentMetadata size: %s bytes",
+                tp_rank,
+                str(len(encoded_data[tp_rank])),
+            )
+        self._encoded_xfer_handshake_metadata = encoded_data
+
+        # Only start the listener when we have metadata to serve.
+        if self._nixl_handshake_listener_t is None:
+            ready_event = threading.Event()
+            self._nixl_handshake_listener_t = threading.Thread(
+                target=self._nixl_handshake_listener,
+                args=(
+                    encoded_data,
+                    ready_event,
+                    self._stop_event,
+                    self.side_channel_port,
+                ),
+                daemon=True,
+                name="nixl_handshake_listener",
+            )
+            self._nixl_handshake_listener_t.start()
+            ready_event.wait()  # Wait for listener ZMQ socket to be ready.
+
+    @staticmethod
+    def _nixl_handshake_listener(
+        encoded_data: dict[int, Any],
+        ready_event: threading.Event,
+        stop_event: threading.Event,
+        port: int,
+    ):
+        """Background thread for getting new NIXL handshakes."""
+        # NOTE(rob): this is a simple implementation. We will move
+        # to a better approach via HTTP endpoint soon.
+
+        # Listen for new requests for metadata.
+        host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+        path = make_zmq_path("tcp", host, port)
+        logger.debug("Starting listening on path: %s", path)
+        with zmq_ctx(zmq.ROUTER, path) as sock:
+            sock.setsockopt(zmq.RCVTIMEO, 1000)
+            ready_event.set()
+            while True:
+                try:
+                    identity, _, msg = sock.recv_multipart()
+                except zmq.Again:
+                    if stop_event.is_set():
+                        break
+                    continue
+                # Decode the message which contains (GET_META_MSG, rank)
+                msg, target_tp_rank = msgspec.msgpack.decode(msg)
+                logger.debug(
+                    "Received message for tp rank %s",
+                    target_tp_rank,
+                )
+                if msg != GET_META_MSG:
+                    logger.warning("Connection listener got unexpected message %s", msg)
+                sock.send_multipart((identity, b"", encoded_data[target_tp_rank]))
 
     def get_num_new_matched_tokens(
-            self, request: "Request",
-            num_computed_tokens: int) -> tuple[int, bool]:
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int, bool]:
         """
         For remote prefill, pull all prompt blocks from remote
         asynchronously relative to engine execution.
@@ -303,26 +485,30 @@ class NixlConnectorScheduler:
         logger.debug(
             "NIXLConnector get_num_new_matched_tokens: "
             "num_computed_tokens=%s, kv_transfer_params=%s",
-            num_computed_tokens, params)
+            num_computed_tokens,
+            params,
+        )
 
         if params is not None and params.get("do_remote_prefill"):
             # Remote prefill: get all prompt blocks from remote.
-            count = len(request.prompt_token_ids) - num_computed_tokens
+            token_ids = request.prompt_token_ids or []
+            count = len(token_ids) - num_computed_tokens
             if count > 0:
                 return count, True
 
         # No remote prefill for this request.
         return 0, False
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
-
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         params = request.kv_transfer_params
         logger.debug(
             "NIXLConnector update_state_after_alloc: "
             "num_external_tokens=%s, kv_transfer_params=%s",
-            num_external_tokens, params)
+            num_external_tokens,
+            params,
+        )
 
         if not params:
             return
@@ -341,25 +527,33 @@ class NixlConnectorScheduler:
             # block is not overwritten; and it will be safe to skip saving them
             # to host xfer buffer.
             if block_ids:
-                self._reqs_need_save[request.request_id] = \
-                    (request, block_ids)
+                self._reqs_need_save[request.request_id] = (request, block_ids)
         elif params.get("do_remote_prefill"):
             if params.get("remote_block_ids"):
-                if all(p in params for p in ("remote_engine_id", "remote_host",
-                                             "remote_port")):
+                if all(
+                    p in params
+                    for p in ("remote_engine_id", "remote_host", "remote_port")
+                ):
                     # If remote_blocks and num_external_tokens = 0, we have
                     # a full prefix cache hit on the D worker. We need to call
                     # send_notif in _read_blocks to free the memory on the P.
-                    local_block_ids = (blocks.get_unhashed_block_ids()
-                                       if num_external_tokens > 0 else [])
+                    local_block_ids = (
+                        blocks.get_unhashed_block_ids()
+                        if num_external_tokens > 0
+                        else []
+                    )
                     # Get unhashed blocks to pull from remote.
                     self._reqs_need_recv[request.request_id] = (
-                        request, local_block_ids)
+                        request,
+                        local_block_ids,
+                    )
 
                 else:
                     logger.warning(
                         "Got invalid KVTransferParams: %s. This "
-                        "request will not utilize KVTransfer", params)
+                        "request will not utilize KVTransfer",
+                        params,
+                    )
             else:
                 assert num_external_tokens == 0
             # Only trigger 1 KV transfer per request.
@@ -394,11 +588,13 @@ class NixlConnectorScheduler:
 
         meta.reqs_to_send = self._reqs_need_send
         meta.reqs_in_batch = self._reqs_in_batch
+        meta.reqs_not_processed = self._reqs_not_processed
 
         # Clear the list once workers start the transfers
         self._reqs_need_recv.clear()
         self._reqs_need_save.clear()
         self._reqs_in_batch = set()
+        self._reqs_not_processed = set()
         self._reqs_need_send = {}
 
         return meta
@@ -407,7 +603,7 @@ class NixlConnectorScheduler:
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Once a request is finished, determine whether request blocks
         should be freed now or will be sent asynchronously and freed later.
@@ -416,8 +612,12 @@ class NixlConnectorScheduler:
 
         params = request.kv_transfer_params
         logger.debug(
-            "NIXLConnector request_finished, request_status=%s, "
-            "kv_transfer_params=%s", request.status, params)
+            "NIXLConnector request_finished(%s), request_status=%s, "
+            "kv_transfer_params=%s",
+            request.request_id,
+            request.status,
+            params,
+        )
         if not params:
             return False, None
 
@@ -432,8 +632,12 @@ class NixlConnectorScheduler:
             params["do_remote_prefill"] = False
             return False, None
 
-        if (not params.get("do_remote_decode")
-                or request.status != RequestStatus.FINISHED_LENGTH_CAPPED):
+        if not params.get("do_remote_decode"):
+            return False, None
+        if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
+            # Also include the case of a P/D Prefill request with immediate
+            # block free (eg abort). Stop tracking this request.
+            self._reqs_not_processed.add(request.request_id)
             return False, None
 
         # TODO: check whether block_ids actually ever be 0. If not we could
@@ -442,8 +646,15 @@ class NixlConnectorScheduler:
 
         if delay_free_blocks:
             # Prefill request on remote. It will be read from D upon completion
-            self._reqs_need_send[request.request_id] = time.perf_counter(
-            ) + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
+            logger.debug(
+                "NIXLConnector request_finished(%s) waiting for %d seconds "
+                "for remote decode to fetch blocks",
+                request.request_id,
+                envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
+            )
+            self._reqs_need_send[request.request_id] = (
+                time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
+            )
 
         return delay_free_blocks, dict(
             do_remote_prefill=True,
@@ -452,12 +663,129 @@ class NixlConnectorScheduler:
             remote_engine_id=self.engine_id,
             remote_host=self.side_channel_host,
             remote_port=self.side_channel_port,
-            tp_size=self.vllm_config.parallel_config.tensor_parallel_size)
+            tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
+        )
 
 
 class NixlConnectorWorker:
     """Implementation of Worker side methods"""
 
+    @dataclass
+    class TpKVTopology:
+        """
+        Helper class for tensor parallel and KV topology information for
+        mapping between local and remote TP workers.
+        """
+
+        tp_size: int
+        tp_rank: int
+        remote_tp_size: dict[EngineId, int]
+        is_mla: bool
+        total_num_kv_heads: int
+        attn_backend: type[AttentionBackend]
+
+        def __post_init__(self):
+            # Figure out whether the first dimension of the cache is K/V
+            # or num_blocks. This is used to register the memory regions correctly.
+            kv_cache_shape = self.attn_backend.get_kv_cache_shape(
+                num_blocks=1, block_size=16, num_kv_heads=1, head_size=1
+            )
+            # Non-MLA backends caches have 5 dims [2, num_blocks, H,N,D],
+            # we just mock num_blocks to 1 for the dimension check below.
+            self._is_kv_layout_blocks_first = (
+                len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
+            )
+
+            attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
+            self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
+
+        @property
+        def is_kv_layout_blocks_first(self) -> bool:
+            return self._is_kv_layout_blocks_first
+
+        @property
+        def split_k_and_v(self) -> bool:
+            # Whether to register regions for K and V separately (when present).
+            return not (
+                self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first
+            )
+
+        block_size: int
+        remote_block_size: dict[EngineId, int]
+
+        def tp_ratio(
+            self,
+            remote_tp_size: int,
+        ) -> int:
+            """
+            Calculate the tensor parallel ratio between local and remote TP.
+            We can think of it as the number of local TP workers-per-remote TP
+            workers. Local workers will read from the same remote TP worker in
+            groups of size `tp_ratio`.
+            """
+            assert self.tp_size % remote_tp_size == 0, (
+                f"Local tensor parallel size {self.tp_size} is not divisible "
+                f"by remote tensor parallel size {remote_tp_size}."
+            )
+            return self.tp_size // remote_tp_size
+
+        def block_size_ratio(
+            self,
+            remote_block_size: int,
+        ) -> float:
+            """
+            Calculate the block size ratio between local and remote TP.
+            """
+            assert self.block_size % remote_block_size == 0, (
+                f"Local block size {self.block_size} is not divisible "
+                f"by remote block size {remote_block_size} or vice versa."
+            )
+            return self.block_size // remote_block_size
+
+        def tp_ratio_from_engine_id(
+            self,
+            remote_engine_id: EngineId,
+        ) -> int:
+            remote_tp_size = self.remote_tp_size[remote_engine_id]
+            return self.tp_ratio(remote_tp_size)
+
+        def block_size_ratio_from_engine_id(
+            self,
+            remote_engine_id: EngineId,
+        ) -> float:
+            remote_block_size = self.remote_block_size[remote_engine_id]
+            return self.block_size_ratio(remote_block_size)
+
+        def is_kv_replicated(self, engine_id: EngineId) -> bool:
+            """
+            Whether the KV cache is replicated across TP workers due to the
+            number of TP workers being greater than the number of KV heads.
+            """
+            tp_size = self.remote_tp_size[engine_id]
+            return tp_size // self.total_num_kv_heads >= 1
+
+        def replicates_kv_cache(self, remote_engine_id: EngineId) -> bool:
+            # MLA is always replicated as the hidden dim can't be split.
+            return self.is_mla or self.is_kv_replicated(remote_engine_id)
+
+        def get_target_remote_rank(
+            self,
+            remote_tp_size: int,
+        ) -> int:
+            """
+            Get the remote TP rank (on P) that the current local TP rank
+            (on D) will read from.
+            """
+            tp_ratio = self.tp_ratio(remote_tp_size)
+            return self.tp_rank // tp_ratio
+
+        def get_target_remote_rank_from_engine_id(
+            self,
+            remote_engine_id: EngineId,
+        ) -> int:
+            remote_tp_size = self.remote_tp_size[remote_engine_id]
+            return self.get_target_remote_rank(remote_tp_size)
+
     def __init__(self, vllm_config: VllmConfig, engine_id: str):
         if NixlWrapper is None:
             logger.error("NIXL is not available")
@@ -469,73 +797,91 @@ class NixlConnectorWorker:
         self.vllm_config = vllm_config
         self.block_size = vllm_config.cache_config.block_size
 
-        self.nixl_backends = \
-            vllm_config.kv_transfer_config.get_from_extra_config(
-                "backends", ["UCX"])
+        if vllm_config.kv_transfer_config is None:
+            raise ValueError("kv_transfer_config must be set for NixlConnector")
+        self.kv_transfer_config = vllm_config.kv_transfer_config
+
+        self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
+            "backends", ["UCX"]
+        )
+        # TODO temporary, once nixl allows for telemetry flag in config
+        # (next release), we can remove this env var.
+        os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
+
         # Agent.
         non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
+        # Configure NIXL num_threads to avoid UAR exhaustion on Mellanox NICs.
+        # Each UCX thread allocates UARs (doorbell pages) via DevX, and
+        # excessive NIXL UAR usage can exhaust NIC UAR space. This can cause
+        # components like NVSHMEM (used by DeepEP kernels) to fail during RDMA
+        # initialization with "mlx5dv_devx_alloc_uar" errors.
+        # Ref: https://network.nvidia.com/files/doc-2020/ethernet-adapters-programming-manual.pdf#page=63
+        num_threads = vllm_config.kv_transfer_config.get_from_extra_config(
+            "num_threads", 4
+        )
         if nixl_agent_config is None:
             config = None
         else:
-            config = nixl_agent_config(backends=self.nixl_backends) if len(
-                non_ucx_backends) > 0 else nixl_agent_config(num_threads=8)
+            config = (
+                nixl_agent_config(backends=self.nixl_backends)
+                if len(non_ucx_backends) > 0
+                else nixl_agent_config(num_threads=num_threads)
+            )
 
         self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
         # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
         self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
 
-        # NIXL handshake port.
-        # NOTE(rob): Within a DP group, each DP rank gets its own
-        # base port (which is sent in the KVTransferParams).
-        # Each TP rank listens/queries on the base_port + tp_rank.
-        self.side_channel_port: int = (
-            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
-            vllm_config.parallel_config.data_parallel_rank *
-            vllm_config.parallel_config.tensor_parallel_size)
-
         # Metadata.
         self.engine_id: EngineId = engine_id
         self.tp_rank = get_tensor_model_parallel_rank()
         self.world_size = get_tensor_model_parallel_world_size()
         self.tp_group = get_tp_group()
         self.num_blocks = 0
+        self.enable_permute_local_kv = False
 
         # KV Caches and nixl tracking data.
         self.device_type = current_platform.device_type
-        self.kv_buffer_device: str = \
-            vllm_config.kv_transfer_config.kv_buffer_device
+        self.kv_buffer_device: str = vllm_config.kv_transfer_config.kv_buffer_device
         if self.device_type not in _NIXL_SUPPORTED_DEVICE:
             raise RuntimeError(f"{self.device_type} is not supported.")
-        elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[
-                self.device_type]:
+        elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[self.device_type]:
             raise RuntimeError(
                 f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
-                "is not supported.")
+                "is not supported."
+            )
         self.device_kv_caches: dict[str, torch.Tensor] = {}
 
         # cpu kv buffer for xfer
         # used when device memory can not be registered under nixl
         self.host_xfer_buffers: dict[str, torch.Tensor] = {}
-        self.use_host_buffer = self.kv_buffer_device == "cpu"
+        if self.device_type == "cpu":
+            self.use_host_buffer = False
+        else:
+            self.use_host_buffer = self.kv_buffer_device == "cpu"
+
         # support for oot platform which can't register nixl memory
         # type based on kv_buffer_device
-        self.nixl_memory_type = current_platform.get_nixl_memory_type()
-        if self.nixl_memory_type is None:
+        nixl_memory_type = current_platform.get_nixl_memory_type()
+        if nixl_memory_type is None:
             if self.kv_buffer_device == "cuda":
-                self.nixl_memory_type = "VRAM"
+                nixl_memory_type = "VRAM"
             elif self.kv_buffer_device == "cpu":
-                self.nixl_memory_type = "DRAM"
-        if self.nixl_memory_type is None:
+                nixl_memory_type = "DRAM"
+        if nixl_memory_type is None:
             raise RuntimeError(
                 f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
-                "is not supported.")
+                "is not supported."
+            )
+        self.nixl_memory_type = nixl_memory_type
 
         # Note: host xfer buffer ops when use_host_buffer is True
-        self.copy_blocks: Optional[CopyBlocksOp] = None
+        self.copy_blocks: CopyBlocksOp | None = None
 
         # Map of engine_id -> kv_caches_base_addr. For TP case, each local
         # rank will still only pull from a single remote TP worker.
         self.kv_caches_base_addr: dict[EngineId, list[int]] = {}
+        self.device_id: int = 0
 
         # Number of NIXL regions. Currently one region per cache
         # (so 1 per layer for MLA, otherwise 2 per layer)
@@ -544,6 +890,7 @@ class NixlConnectorWorker:
 
         # nixl_prepped_dlist_handle.
         self.src_xfer_side_handle: int = 0
+        self.src_xfer_side_handles: dict[int, int] = {}
         # Map of engine_id -> nixl_prepped_dlist_handle (int)].
         self.dst_xfer_side_handles: dict[EngineId, int] = {}
 
@@ -561,19 +908,24 @@ class NixlConnectorWorker:
         # Set of requests that have been part of a batch, regardless of status.
         self._reqs_to_process: set[ReqId] = set()
 
-        # Background thread for handling new handshake requests.
-        self._nixl_handshake_listener_t: Optional[threading.Thread] = None
+        # invalid blocks from failed NIXL operations
+        self._invalid_block_ids: set[int] = set()
+        # requests that skipped transfer (handshake or transfer failures)
+        self._failed_recv_reqs: set[ReqId] = set()
+
+        # Handshake metadata of this worker for NIXL transfers.
+        self.xfer_handshake_metadata: NixlAgentMetadata | None = None
         # Background thread for initializing new NIXL handshakes.
         self._handshake_initiation_executor = ThreadPoolExecutor(
             # NIXL is not guaranteed to be thread-safe, limit 1 worker.
             max_workers=1,
-            thread_name_prefix="vllm-nixl-handshake-initiator")
+            thread_name_prefix="vllm-nixl-handshake-initiator",
+        )
         self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
         self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {}
         # Protects _handshake_futures and _remote_agents.
         self._handshake_lock = threading.RLock()
 
-        self.vllm_config = vllm_config
         self.block_size = vllm_config.cache_config.block_size
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
@@ -581,54 +933,41 @@ class NixlConnectorWorker:
         # TODO(mgoin): remove this once we have hybrid memory allocator
         # Optimization for models with local attention (Llama 4)
         # List of block window sizes for each layer for local attention
-        self.block_window_per_layer: list[Optional[int]] = []
+        self.block_window_per_layer: list[int | None] = []
         self.use_mla = self.model_config.use_mla
 
-        backend = get_attn_backend(self.model_config.get_head_size(),
-                                   self.model_config.dtype,
-                                   self.cache_config.cache_dtype,
-                                   self.block_size,
-                                   use_mla=self.use_mla)
+        backend = get_attn_backend(
+            self.model_config.get_head_size(),
+            self.model_config.dtype,
+            self.cache_config.cache_dtype,
+            self.block_size,
+            use_mla=self.use_mla,
+        )
         self.backend_name = backend.get_name()
-        attn_backend = backend_name_to_enum(self.backend_name)
-        self._use_flashinfer = attn_backend == _Backend.FLASHINFER
-        self._use_pallas = attn_backend == _Backend.PALLAS
         self.kv_cache_layout = get_kv_cache_layout()
+        self.host_buffer_kv_cache_layout = self.kv_cache_layout
         logger.debug("Detected attention backend %s", self.backend_name)
         logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
 
         self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
+        self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size}
         # With heterogeneous TP, P must wait for all assigned D TP workers to
         # finish reading before safely freeing the blocks.
         self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
         self.xfer_stats = NixlKVConnectorStats()
 
-    @staticmethod
-    def _nixl_handshake_listener(metadata: NixlAgentMetadata,
-                                 ready_event: threading.Event, base_port: int,
-                                 tp_rank: int):
-        """Background thread for getting new NIXL handshakes."""
-        # NOTE(rob): this is a simple implementation. We will move
-        # to a better approach via HTTP endpoint soon.
-
-        encoder = msgspec.msgpack.Encoder()
-        encoded_data = encoder.encode(metadata)
-        size_in_bytes = len(encoded_data)
-        logger.debug("Size of encoded NixlAgentMetadata: %s bytes",
-                     str(size_in_bytes))
-
-        # Listen for new requests for metadata.
-        host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
-        path = make_zmq_path("tcp", host, base_port + tp_rank)
-        logger.debug("Starting listening on path: %s", path)
-        with zmq_ctx(zmq.ROUTER, path) as sock:
-            ready_event.set()
-            while True:
-                identity, _, msg = sock.recv_multipart()
-                if msg != GET_META_MSG:
-                    logger.warning(
-                        "Connection listener got unexpected message %s", msg)
-                sock.send_multipart((identity, b"", encoded_data))
+        self.kv_topo = self.TpKVTopology(
+            tp_size=self.world_size,
+            tp_rank=self.tp_rank,
+            remote_tp_size=self._tp_size,  # shared state
+            is_mla=self.use_mla,
+            total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
+            block_size=self.block_size,
+            remote_block_size=self._block_size,
+            attn_backend=backend,
+        )
+        self._use_pallas = self.kv_topo._use_pallas
+        self._physical_blocks_per_logical_kv_block = 1
 
     def _nixl_handshake(
         self,
@@ -647,40 +986,52 @@ class NixlConnectorWorker:
 
         # Handshake only with the remote TP rank that current local rank will
         # pull from. With homogeneous TP it happens to be the same rank_i.
-        tp_ratio = self._tp_size[self.engine_id] // remote_tp_size
-        p_remote_rank = self.tp_rank // tp_ratio
-        path = make_zmq_path("tcp", host, port + p_remote_rank)
-        logger.debug("Querying metadata on path: %s at remote rank %s", path,
-                     p_remote_rank)
+        p_remote_rank = self.kv_topo.get_target_remote_rank(remote_tp_size)
+        path = make_zmq_path("tcp", host, port)
+        logger.debug(
+            "Querying metadata on path: %s at remote tp rank %s", path, p_remote_rank
+        )
 
         # Send query for the request.
         with zmq_ctx(zmq.REQ, path) as sock:
-            sock.send(GET_META_MSG)
+            msg = msgspec.msgpack.encode((GET_META_MSG, p_remote_rank))
+            # Set receive timeout to 5 seconds to avoid hanging on dead server
+            sock.setsockopt(zmq.RCVTIMEO, 5000)  # milliseconds
+            sock.send(msg)
             metadata_bytes = sock.recv()
             decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
             metadata = decoder.decode(metadata_bytes)
             got_metadata_time = time.perf_counter()
-            logger.debug("NIXL handshake: get metadata took: %s",
-                         got_metadata_time - start_time)
+            logger.debug(
+                "NIXL handshake: get metadata took: %s", got_metadata_time - start_time
+            )
 
             # Ensure engine id matches.
             if metadata.engine_id != expected_engine_id:
-                raise RuntimeError(f"Remote NIXL agent engine ID mismatch. "
-                                   f"Expected {expected_engine_id},"
-                                   f"received {metadata.engine_id}.")
+                raise RuntimeError(
+                    f"Remote NIXL agent engine ID mismatch. "
+                    f"Expected {expected_engine_id},"
+                    f"received {metadata.engine_id}."
+                )
 
             # Register Remote agent.
-            remote_agent_name = self.add_remote_agent(metadata, p_remote_rank,
-                                                      remote_tp_size)
+            assert metadata.block_size <= self.block_size, (
+                "nP > nD is not supported yet."
+            )
+            remote_agent_name = self.add_remote_agent(
+                metadata, p_remote_rank, remote_tp_size
+            )
+
             setup_agent_time = time.perf_counter()
-            logger.debug("NIXL handshake: add agent took: %s",
-                         setup_agent_time - got_metadata_time)
+            logger.debug(
+                "NIXL handshake: add agent took: %s",
+                setup_agent_time - got_metadata_time,
+            )
 
         # Remote rank -> agent name.
         return {p_remote_rank: remote_agent_name}
 
-    def initialize_host_xfer_buffer(
-            self, kv_caches: dict[str, torch.Tensor]) -> None:
+    def initialize_host_xfer_buffer(self, kv_caches: dict[str, torch.Tensor]) -> None:
         """
         Initialize transfer buffer in CPU mem for accelerators
         NOT directly supported by NIXL (e.g., tpu)
@@ -690,9 +1041,23 @@ class NixlConnectorWorker:
             for layer_name, kv_cache in kv_caches.items():
                 kv_shape = kv_cache.shape
                 kv_dtype = kv_cache.dtype
-                xfer_buffers[layer_name] = torch.empty(kv_shape,
-                                                       dtype=kv_dtype,
-                                                       device="cpu")
+                if (
+                    self.kv_cache_layout == "NHD"
+                    and self.vllm_config.kv_transfer_config is not None
+                    and self.vllm_config.kv_transfer_config.enable_permute_local_kv
+                ):
+                    logger.info_once(
+                        "'enable_permute_local_kv' flag is enabled while "
+                        "device KV Layout is NHD. Init host buffer with"
+                        " HND to better support Decode/Prefill TP_ratio > 1."
+                    )
+                    # Since NHD will not support Decode/Prefill TP_ratio > 1,
+                    # we can leverage host_buffer for permute
+                    self.host_buffer_kv_cache_layout = "HND"
+                    kv_shape = tuple(kv_shape[i] for i in [0, 1, 3, 2, 4])
+                xfer_buffers[layer_name] = torch.empty(
+                    kv_shape, dtype=kv_dtype, device="cpu"
+                )
         except MemoryError as e:
             logger.error("NIXLConnectorWorker gets %s.", e)
             raise
@@ -701,17 +1066,28 @@ class NixlConnectorWorker:
 
     def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
         """Assign copy (d2h, h2d) operations when host buffer is used."""
+        # Set a no-op if the host buffer is not cpu.
+        if self.kv_buffer_device != "cpu":
+            return
+        # Set a no-op if self.device_type is 'cpu'.
+        if self.device_type == "cpu":
+            return
         assert self.use_host_buffer
         self.copy_blocks = copy_operation
 
-    def _background_nixl_handshake(self, req_id: str,
-                                   remote_engine_id: EngineId, meta: ReqMeta):
+    def _background_nixl_handshake(
+        self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
+    ):
         # Do NIXL handshake in background and add to _ready_requests when done.
         fut = self._handshake_futures.get(remote_engine_id)
         if fut is None:
             fut = self._handshake_initiation_executor.submit(
-                self._nixl_handshake, meta.remote_host, meta.remote_port,
-                meta.tp_size, remote_engine_id)
+                self._nixl_handshake,
+                meta.remote_host,
+                meta.remote_port,
+                meta.tp_size,
+                remote_engine_id,
+            )
             self._handshake_futures[remote_engine_id] = fut
 
             def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
@@ -724,10 +1100,20 @@ class NixlConnectorWorker:
 
             fut.add_done_callback(done_callback)
 
-        # TODO: handle failure state of future in the
-        # callback, we want to fail the request in this case.
-        def request_ready(_f: Future[Any], entry=(req_id, meta)):
-            self._ready_requests.put(entry)
+        # check handshake success before proceeding with request
+        def request_ready(f: Future[Any], entry=(req_id, meta)):
+            try:
+                # check if handshake succeeded
+                f.result()
+                self._ready_requests.put(entry)
+            except Exception:
+                # handshake failed - mark blocks as invalid
+                logger.exception(
+                    "Handshake failed for request %s, marking blocks as invalid", req_id
+                )
+                if req_meta := self._recving_metadata.get(req_id):
+                    self._invalid_block_ids.update(req_meta.local_block_ids)
+                self._failed_recv_reqs.add(req_id)
 
         fut.add_done_callback(request_ready)
 
@@ -738,18 +1124,23 @@ class NixlConnectorWorker:
             self.initialize_host_xfer_buffer(kv_caches=kv_caches)
             assert len(self.host_xfer_buffers) == len(kv_caches), (
                 f"host_buffer: {len(self.host_xfer_buffers)}, "
-                f"kv_caches: {len(kv_caches)}")
+                f"kv_caches: {len(kv_caches)}"
+            )
             xfer_buffers = self.host_xfer_buffers
         else:
             xfer_buffers = kv_caches
             assert not self.host_xfer_buffers, (
                 "host_xfer_buffer should not be initialized when "
-                f"kv_buffer_device is {self.kv_buffer_device}")
+                f"kv_buffer_device is {self.kv_buffer_device}"
+            )
 
         logger.info(
             "Registering KV_Caches. use_mla: %s, kv_buffer_device: %s, "
-            "use_host_buffer: %s", self.use_mla, self.kv_buffer_device,
-            self.use_host_buffer)
+            "use_host_buffer: %s",
+            self.use_mla,
+            self.kv_buffer_device,
+            self.use_host_buffer,
+        )
 
         caches_data = []
         # With hybrid allocator, layers can share a kv cache tensor
@@ -763,22 +1154,38 @@ class NixlConnectorWorker:
         # (roughly 8KB vs 5KB).
         # Conversely for FlashInfer, K and V are registered in the same region
         # to better exploit the memory layout (ie num_blocks is the first dim).
-        split_k_and_v = not (self.use_mla or self._use_pallas
-                             or self._use_flashinfer)
+        split_k_and_v = self.kv_topo.split_k_and_v
         tensor_size_bytes = None
         # Enable different block lengths for different layers when MLA is used.
         self.block_len_per_layer = list[int]()
         self.slot_size_per_layer = list[int]()  # HD bytes in kv terms
+        self.device_id = self.tp_rank
         for layer_name, cache_or_caches in xfer_buffers.items():
-            cache_list = cache_or_caches if split_k_and_v else [
-                cache_or_caches
-            ]
+            cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
 
             for cache in cache_list:
                 base_addr = cache.data_ptr()
+                if not self.use_host_buffer and current_platform.is_cuda_alike():
+                    self.device_id = cache.device.index
                 if base_addr in seen_base_addresses:
                     continue
 
+                # TODO (NickLucche): Get kernel_block_size in a cleaner way
+                # NHD default "view" for non-MLA cache
+                kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3]
+
+                if self.block_size != kernel_block_size:
+                    logger.info_once(
+                        "User-specified logical block size (%s) does not match"
+                        " physical kernel block size (%s). Using the latter. ",
+                        self.block_size,
+                        kernel_block_size,
+                    )
+                    self._physical_blocks_per_logical_kv_block = (
+                        self.block_size // kernel_block_size
+                    )
+                    self.block_size = kernel_block_size
+
                 seen_base_addresses.append(base_addr)
                 curr_tensor_size_bytes = cache.numel() * cache.element_size()
 
@@ -786,23 +1193,33 @@ class NixlConnectorWorker:
                     tensor_size_bytes = curr_tensor_size_bytes
                     self.num_blocks = cache.shape[0]
 
-                assert cache.shape[0] == self.num_blocks, \
+                assert cache.shape[0] == self.num_blocks, (
                     "All kv cache tensors must have the same number of blocks"
+                )
 
-                self.block_len_per_layer.append(curr_tensor_size_bytes //
-                                                self.num_blocks)
-                self.slot_size_per_layer.append(self.block_len_per_layer[-1] //
-                                                self.block_size)
+                self.block_len_per_layer.append(
+                    curr_tensor_size_bytes // self.num_blocks
+                )
+                self.slot_size_per_layer.append(
+                    self.block_len_per_layer[-1] // self.block_size
+                )
 
                 if not self.use_mla:
                     # Different kv cache shape is not supported by HeteroTP
-                    assert tensor_size_bytes == curr_tensor_size_bytes, \
+                    assert tensor_size_bytes == curr_tensor_size_bytes, (
                         "All kv cache tensors must have the same size"
+                    )
+                # Need to make sure the device ID is non-negative for NIXL,
+                # Torch uses -1 to indicate CPU tensors while NIXL uses explicit
+                # memory type.
+                self.device_id = max(cache.get_device(), 0)
                 caches_data.append(
-                    (base_addr, curr_tensor_size_bytes, self.tp_rank, ""))
+                    (base_addr, curr_tensor_size_bytes, self.device_id, "")
+                )
 
-        logger.debug("Different block lengths collected: %s",
-                     set(self.block_len_per_layer))
+        logger.debug(
+            "Different block lengths collected: %s", set(self.block_len_per_layer)
+        )
         assert len(self.block_len_per_layer) == len(seen_base_addresses)
         assert self.num_blocks != 0
 
@@ -810,8 +1227,7 @@ class NixlConnectorWorker:
         self.num_regions = len(caches_data)
         self.num_layers = len(xfer_buffers.keys())
 
-        descs = self.nixl_wrapper.get_reg_descs(caches_data,
-                                                self.nixl_memory_type)
+        descs = self.nixl_wrapper.get_reg_descs(caches_data, self.nixl_memory_type)
         logger.debug("Registering descs: %s", caches_data)
         self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends)
         logger.debug("Done registering descs")
@@ -819,7 +1235,7 @@ class NixlConnectorWorker:
 
         self.device_kv_caches = kv_caches
         self.dst_num_blocks[self.engine_id] = self.num_blocks
-        if self._use_flashinfer:
+        if self.kv_topo.is_kv_layout_blocks_first:
             for i in range(len(self.slot_size_per_layer)):
                 assert self.slot_size_per_layer[i] % 2 == 0
                 self.slot_size_per_layer[i] //= 2
@@ -833,46 +1249,18 @@ class NixlConnectorWorker:
             self.num_regions *= 2
 
         # Register local/src descr for NIXL xfer.
-        blocks_data = []
-        for i, base_addr in enumerate(seen_base_addresses):
-            kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
-            # NOTE With heter-TP, more blocks are prepared than what are
-            # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We
-            # could create fewer, but then _get_block_descs_ids needs to
-            # select agent_meta.num_blocks instead of self.num_blocks for
-            # local descr, and that makes handling regular flow less clean.
-            for block_id in range(self.num_blocks):
-                block_offset = block_id * self.block_len_per_layer[i]
-                addr = base_addr + block_offset
-                # (addr, len, device id)
-                blocks_data.append((addr, kv_block_len, self.tp_rank))
-
-            if self._use_flashinfer:
-                # Separate and interleave K/V regions to maintain the same
-                # descs ordering. This is needed for selecting contiguous heads
-                # when split across TP ranks.
-                for block_id in range(self.num_blocks):
-                    block_offset = block_id * self.block_len_per_layer[i]
-                    addr = base_addr + block_offset
-                    # Register addresses for V cache (K registered first).
-                    v_addr = addr + kv_block_len
-                    blocks_data.append((v_addr, kv_block_len, self.tp_rank))
-        logger.debug("Created %s blocks for src engine %s and rank %s",
-                     len(blocks_data), self.engine_id, self.tp_rank)
+        self.seen_base_addresses = seen_base_addresses
+        self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size)
 
-        descs = self.nixl_wrapper.get_xfer_descs(blocks_data,
-                                                 self.nixl_memory_type)
-        # NIXL_INIT_AGENT to be used for preparations of local descs.
-        self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
-            "NIXL_INIT_AGENT", descs)
+        self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle
 
         # TODO(mgoin): Hybrid memory allocator is currently disabled for
         # models with local attention (Llama 4). Can remove this once enabled.
-        if self.vllm_config.model_config.hf_config.model_type == "llama4":
+        if self.model_config.hf_config.model_type == "llama4":
             from transformers import Llama4TextConfig
-            assert isinstance(self.vllm_config.model_config.hf_text_config,
-                              Llama4TextConfig)
-            llama4_config = self.vllm_config.model_config.hf_text_config
+
+            assert isinstance(self.model_config.hf_text_config, Llama4TextConfig)
+            llama4_config = self.model_config.hf_text_config
             no_rope_layers = llama4_config.no_rope_layers
             chunk_size = llama4_config.attention_chunk_size
             chunk_block_size = math.ceil(chunk_size / self.block_size)
@@ -882,45 +1270,99 @@ class NixlConnectorWorker:
                 is_local_attention = no_rope_layers[layer_idx] != 0
                 block_window = chunk_block_size if is_local_attention else None
                 self.block_window_per_layer.append(block_window)
-            logger.debug("Llama 4 block window per layer mapping: %s",
-                         self.block_window_per_layer)
+            logger.debug(
+                "Llama 4 block window per layer mapping: %s",
+                self.block_window_per_layer,
+            )
             assert len(self.block_window_per_layer) == self.num_layers
 
         # After KV Caches registered, listen for new connections.
-        metadata = NixlAgentMetadata(
+        self.xfer_handshake_metadata = NixlAgentMetadata(
             engine_id=self.engine_id,
             agent_metadata=self.nixl_wrapper.get_agent_metadata(),
             kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
+            device_id=self.device_id,
             num_blocks=self.num_blocks,
             block_lens=self.block_len_per_layer,
             attn_backend_name=self.backend_name,
-            kv_cache_layout=self.kv_cache_layout)
-        ready_event = threading.Event()
-        self._nixl_handshake_listener_t = threading.Thread(
-            target=self._nixl_handshake_listener,
-            args=(metadata, ready_event, self.side_channel_port, self.tp_rank),
-            daemon=True,
-            name="nixl_handshake_listener")
-        self._nixl_handshake_listener_t.start()
-        ready_event.wait()  # Wait for listener ZMQ socket to be ready.
-
-    def add_remote_agent(self,
-                         nixl_agent_meta: NixlAgentMetadata,
-                         remote_tp_rank: int = 0,
-                         remote_tp_size: int = 1) -> str:
+            kv_cache_layout=self.kv_cache_layout
+            if not self.use_host_buffer
+            else self.host_buffer_kv_cache_layout,
+            block_size=self.block_size,
+        )
+
+    def register_local_xfer_handler(
+        self,
+        block_size: int,
+    ) -> int:
+        """
+        Function used for register local xfer handler with local block_size or
+        Remote block_size.
+
+        When local block_size is same as remote block_size, we use local block_size
+        to register local_xfer_handler during init.
+
+        When remote block size is less than local block size, we need to use
+        register another local_xfer_handler using remote block len to ensure
+        data copy correctness.
+        """
+        block_size_ratio = self.block_size // block_size
+        blocks_data = []
+        for i, base_addr in enumerate(self.seen_base_addresses):
+            # The new block_len is using prefill block_len;
+            # and num_blocks is multiple with N
+            kv_block_len = (
+                self.get_backend_aware_kv_block_len(layer_idx=i) // block_size_ratio
+            )
+            block_len_per_layer = self.block_len_per_layer[i] // block_size_ratio
+            num_blocks = self.num_blocks * block_size_ratio
+            for block_id in range(num_blocks):
+                block_offset = block_id * block_len_per_layer
+                addr = base_addr + block_offset
+                # (addr, len, device id)
+                blocks_data.append((addr, kv_block_len, self.device_id))
+
+            if self.kv_topo.is_kv_layout_blocks_first:
+                # Separate and interleave K/V regions to maintain the same
+                # descs ordering. This is needed for selecting contiguous heads
+                # when split across TP ranks.
+                for block_id in range(num_blocks):
+                    block_offset = block_id * block_len_per_layer
+                    addr = base_addr + block_offset
+                    # Register addresses for V cache (K registered first).
+                    v_addr = addr + kv_block_len
+                    blocks_data.append((v_addr, kv_block_len, self.device_id))
+        logger.debug(
+            "Created %s blocks for src engine %s and rank %s on device id %s",
+            len(blocks_data),
+            self.engine_id,
+            self.tp_rank,
+            self.device_id,
+        )
+
+        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
+        # NIXL_INIT_AGENT to be used for preparations of local descs.
+        return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
+
+    def add_remote_agent(
+        self,
+        nixl_agent_meta: NixlAgentMetadata,
+        remote_tp_rank: int = 0,
+        remote_tp_size: int = 1,
+    ) -> str:
         """
         Add the remote NIXL agent and prepare the descriptors for reading cache
         blocks from remote.
 
         In particular, handle both homogeneous and heterogeneous TP. The former
-        requires local rank_i to read from remote rank_i. 
-        The latter, assuming D.world_size > P.world_size, requires that two or 
+        requires local rank_i to read from remote rank_i.
+        The latter, assuming D.world_size > P.world_size, requires that two or
         more local TP worker share the xfer from a single TP worker.
 
         Here's an example (non-MLA case):
 
         rank_offset     p_remote_tp_rank
-        (kv split no)    
+        (kv split no)
         --------------------------------
             0                 0      Worker0  ---- 1st half of KV ----> Worker0  [ KV Cache ]
                                                                         /
@@ -933,100 +1375,83 @@ class NixlConnectorWorker:
 
                                 Decoder TP workers                     Prefix TP workers
                                   (world_size=4)                         (world_size=2)
-                                                 tp_ratio = 4 // 2 = 2                  
-                                
-        Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]  
+                                                 tp_ratio = 4 // 2 = 2
+
+        Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]
         then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
-        Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio 
+        Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio
         first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
-        along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.   
-        
+        along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.
+
         Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.
 
         Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
         so that the whole cache is shared by "tp_ratio" D TP workers.
-        """ # noqa: E501
+        """  # noqa: E501
         engine_id = nixl_agent_meta.engine_id
         # TODO re-evaluate refreshing for scaling/recovery
         if remote_tp_rank in self._remote_agents.get(engine_id, {}):
+            logger.debug(
+                "Remote agent with engine_id %s and rank"
+                "%s already exchanged metadata, skip handshake.",
+                engine_id,
+                remote_tp_rank,
+            )
             return self._remote_agents[engine_id][remote_tp_rank]
 
+        ### Register remote agent metadata
         if engine_id not in self._tp_size:
             self._tp_size[engine_id] = remote_tp_size
-        else:
-            assert self._tp_size[engine_id] == remote_tp_size
-        # TODO We may eventually want to skip enforcing the same attn backend.
-        assert nixl_agent_meta.attn_backend_name == self.backend_name
+        if engine_id not in self._block_size:
+            self._block_size[engine_id] = nixl_agent_meta.block_size
 
         remote_agent_name = self.nixl_wrapper.add_remote_agent(
-            nixl_agent_meta.agent_metadata)
-
-        # Number of D TP workers reading from a single P TP worker. This is
-        # 1 when P and D `--tensor-parallel-size` match.
-        tp_ratio = divide(self._tp_size[self.engine_id],
-                          self._tp_size[engine_id])
-        assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
-        assert not self._use_pallas or tp_ratio == 1, \
-               "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
+            nixl_agent_meta.agent_metadata
+        )
 
         # Handle tp_size>num_kv_heads: replicate KV cache.
-        total_num_kv_heads = self.model_config.get_total_num_kv_heads()
-        is_kv_replicated = self._tp_size[engine_id] // total_num_kv_heads >= 1
+        replicates_kv_cache = self.kv_topo.replicates_kv_cache(engine_id)
+
+        # Create dst descs and xfer side handles. TP workers have same #blocks
+        # so we only register once per engine_id.
+        # Example:
+        # block_size_ratio > 1:
+        # remote:               | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|
+        # local origin:|          0|          1|          8|         12|
+        # local mapped:| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|
+        block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(engine_id)
+
+        if engine_id not in self.dst_num_blocks:
+            self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
 
-        remote_block_len = nixl_agent_meta.block_lens[0]
-        if self.use_mla or is_kv_replicated:
-            # With replicated KV cache, only the number of blocks can differ.
-            assert self.block_len_per_layer == nixl_agent_meta.block_lens, \
-                "KV cache sizes must match between P and D when replicated"
-            remote_block_size = remote_block_len // (
-                self.slot_size_per_layer[0])
-        else:
-            # When MLA is not used, this is a list of the same block length
-            for block_len in nixl_agent_meta.block_lens:
-                assert block_len == remote_block_len, \
-                    "All remote layers must have the same block size"
-            remote_block_size = remote_block_len // (
-                self.slot_size_per_layer[0] * tp_ratio)
-            if self._use_flashinfer:
-                # With flashinfer, KV are sent in the same message.
-                remote_block_size //= 2
-            if tp_ratio > 1:
-                # Heterogeneous TP expects same kv_cache_layout.
-                assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout
-                if self.device_type == "xpu":
-                    raise ValueError(
-                        "Heterogeneous TP is not supported on XPU")
-
-            assert remote_block_len == self.block_len_per_layer[0] * tp_ratio, (
-                "Remote P worker KV layer cache must be of shape [2, N, "
-                "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
-            )
+        # Keep track of remote agent kv caches base addresses.
+        self.kv_caches_base_addr[engine_id] = nixl_agent_meta.kv_caches_base_addr
 
-        assert self.block_size == remote_block_size, (
-            "Remote P worker with different page/block size is not supported "
-            f"{self.block_size=}, {remote_block_size=}")
+        self._validate_remote_agent_handshake(nixl_agent_meta, remote_tp_size)
 
-        # Create dst descs and xfer side handles. TP workers have same #blocks.
-        if engine_id in self.dst_num_blocks:
-            assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks
-        else:
-            self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
+        # Number of D TP workers reading from a single P TP worker. This is
+        # 1 when P and D `--tensor-parallel-size` match.
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(engine_id)
 
+        ### Register remote agent memory regions
         blocks_data = []
         # With homogeneous TP, D pulls the whole kv cache from corresponding
         # rank. With heterogeneous TP, prepare the descriptors by splitting the
         # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
         # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
-        self.kv_caches_base_addr[
-            engine_id] = nixl_agent_meta.kv_caches_base_addr
 
-        assert len(nixl_agent_meta.kv_caches_base_addr) == len(
-            self.block_len_per_layer)
         # Register all remote blocks, but only the corresponding kv heads.
         for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
             kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
-            rank_offset = self.tp_rank % tp_ratio * kv_block_len \
-                if not (self.use_mla or is_kv_replicated) else 0
+            remote_kv_block_len = kv_block_len // block_size_ratio
+            if block_size_ratio > 1:
+                # using remote kv_block_len as transfer unit
+                kv_block_len = remote_kv_block_len
+            rank_offset = (
+                self.tp_rank % tp_ratio * remote_kv_block_len
+                if not replicates_kv_cache
+                else 0
+            )
             for block_id in range(nixl_agent_meta.num_blocks):
                 block_offset = block_id * nixl_agent_meta.block_lens[i]
                 # For each block, grab the heads chunk belonging to rank_i
@@ -1034,43 +1459,133 @@ class NixlConnectorWorker:
                 # self.block_len == remote_block_len//tp_ratio bytes.
                 addr = base_addr + block_offset + rank_offset
                 # (addr, len, device id)
-                blocks_data.append((addr, kv_block_len, remote_tp_rank))
+                blocks_data.append((addr, kv_block_len, nixl_agent_meta.device_id))
 
-            if self._use_flashinfer:
+            if self.kv_topo.is_kv_layout_blocks_first:
                 # With FlashInfer index V separately to allow head splitting.
                 for block_id in range(nixl_agent_meta.num_blocks):
                     block_offset = block_id * nixl_agent_meta.block_lens[i]
                     addr = base_addr + block_offset + rank_offset
                     v_addr = addr + nixl_agent_meta.block_lens[i] // 2
-                    blocks_data.append((v_addr, kv_block_len, remote_tp_rank))
+                    blocks_data.append(
+                        (v_addr, kv_block_len, nixl_agent_meta.device_id)
+                    )
 
         logger.debug(
-            "Created %s blocks for dst engine %s with remote rank %s and "
-            "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
-            self.tp_rank)
+            "Created %s blocks for dst engine %s with remote rank %s and local rank %s",
+            len(blocks_data),
+            engine_id,
+            remote_tp_rank,
+            self.tp_rank,
+        )
 
         # Register with NIXL.
-        descs = self.nixl_wrapper.get_xfer_descs(blocks_data,
-                                                 self.nixl_memory_type)
-        self.dst_xfer_side_handles[
-            engine_id] = self.nixl_wrapper.prep_xfer_dlist(
-                remote_agent_name, descs)
+        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
+        self.dst_xfer_side_handles[engine_id] = self.nixl_wrapper.prep_xfer_dlist(
+            remote_agent_name, descs
+        )
+
+        if block_size_ratio > 1:
+            # when prefill with smaller block_size, we need to init a
+            # new handler with same block_len to match
+            self.src_xfer_side_handles[nixl_agent_meta.block_size] = (
+                self.register_local_xfer_handler(nixl_agent_meta.block_size)
+            )
 
         return remote_agent_name
 
+    def _validate_remote_agent_handshake(
+        self, nixl_agent_meta: NixlAgentMetadata, remote_tp_size: int
+    ):
+        """
+        Validate the remote agent handshake metadata ensuring the
+        invariants hold true.
+        """
+        remote_engine_id = nixl_agent_meta.engine_id
+
+        assert self._tp_size[remote_engine_id] == remote_tp_size
+        # TODO We may eventually want to skip enforcing the same attn backend.
+        assert nixl_agent_meta.attn_backend_name == self.backend_name
+
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id)
+        block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
+            remote_engine_id
+        )
+        assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
+        assert not self._use_pallas or tp_ratio == 1, (
+            "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
+        )
+        kv_cache_layout = (
+            self.kv_cache_layout
+            if not self.use_host_buffer
+            else self.host_buffer_kv_cache_layout
+        )
+        if not self.use_mla and nixl_agent_meta.kv_cache_layout != kv_cache_layout:
+            if (
+                self.kv_transfer_config.enable_permute_local_kv
+                and nixl_agent_meta.kv_cache_layout == "HND"
+            ):
+                logger.info(
+                    "Remote is HND and local is NHD, enabled additional permute "
+                    "on local device KV."
+                )
+                self.enable_permute_local_kv = True
+            else:
+                raise RuntimeError(
+                    "Heterogeneous TP expects same kv_cache_layout. "
+                    "Or enable experimental feature to use HND to NHD support by "
+                    "setting 'enable_permute_local_kv'=True in --kv-transfer-config."
+                )
+
+        # Block len can only vary across layers when using MLA.
+        remote_block_len = nixl_agent_meta.block_lens[0]
+        if self.use_mla or self.kv_topo.is_kv_replicated(remote_engine_id):
+            # With replicated KV cache, only the number of blocks can differ.
+            for i in range(len(self.block_len_per_layer)):
+                assert (
+                    self.block_len_per_layer[i] // block_size_ratio
+                    == nixl_agent_meta.block_lens[i]
+                ), "KV cache sizes must match between P and D when replicated"
+        else:
+            # When MLA is not used, this is a list of the same block length
+            for block_len in nixl_agent_meta.block_lens:
+                assert block_len == remote_block_len, (
+                    "All remote layers must have the same block size"
+                )
+
+            assert (
+                remote_block_len
+                == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio
+            ), (
+                "Remote P worker KV layer cache must be of shape [2, N, "
+                "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
+            )
+
+        # TP workers have same #blocks.
+        assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks
+
+        assert len(nixl_agent_meta.kv_caches_base_addr) == len(self.block_len_per_layer)
+
     def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta):
         """copy recved kv from host buffer to device."""
         assert self.use_host_buffer
         assert self.copy_blocks is not None
 
-        local_block_ids = meta.local_block_ids
-        self.copy_blocks(self.host_xfer_buffers, self.device_kv_caches,
-                         local_block_ids, local_block_ids, "h2d")
+        local_block_ids = meta.local_physical_block_ids
+        self.copy_blocks(
+            self.host_xfer_buffers,
+            self.device_kv_caches,
+            local_block_ids,
+            local_block_ids,
+            "h2d",
+        )
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(
                 "synced recved kv of request[%s] to device kv buffer,"
-                "local_block_ids: %s. ", req_id,
-                ",".join(map(str, meta.local_block_ids)))
+                "local_block_ids: %s. ",
+                req_id,
+                ",".join(map(str, local_block_ids)),
+            )
 
     def save_kv_to_host(self, metadata: NixlConnectorMetadata):
         """copy kv from device to host buffer."""
@@ -1078,14 +1593,109 @@ class NixlConnectorWorker:
         assert self.copy_blocks is not None
 
         for req_id, meta in metadata.reqs_to_save.items():
+            meta.local_physical_block_ids = self._logical_to_kernel_block_ids(
+                meta.local_block_ids
+            )
             if logger.isEnabledFor(logging.DEBUG):
                 logger.debug(
                     "save_load_kv for request[%s] to host xfer buffer."
-                    "local_block_ids: %s. ", req_id,
-                    ",".join(map(str, meta.local_block_ids)))
+                    "local_block_ids: %s. ",
+                    req_id,
+                    ",".join(map(str, meta.local_physical_block_ids)),
+                )
             # blocking
-            self.copy_blocks(self.device_kv_caches, self.host_xfer_buffers,
-                             meta.local_block_ids, meta.local_block_ids, "d2h")
+            self.copy_blocks(
+                self.device_kv_caches,
+                self.host_xfer_buffers,
+                meta.local_physical_block_ids,
+                meta.local_physical_block_ids,
+                "d2h",
+            )
+
+    def permute_device_kv(self, block_ids: list[int]):
+        """Transforms the layout of received KV cache blocks to the local format.
+
+        This method corrects layout mismatches from direct memory copies by
+        permuting the tensor dimensions.
+
+        - **Source Layout:** `[num_blocks, n_kv_head, block_size, head_dim]`
+        - **Target Layout:** `[num_blocks, block_size, n_kv_head, head_dim]`
+
+        Args:
+            block_ids: A list of block IDs to update and permute.
+
+        Implementation:
+        - x = blocks_to_update.reshape(src_shape) # view local kv with sender layout
+        - permuted_blocks = x.permute(*inv_order) # transpose n_kv_heads, block_size
+        - cache.index_copy_(0, indices, permuted_blocks) # copy permuted kv back
+
+        """
+        split_k_and_v = self.kv_topo.split_k_and_v
+        inv_order = [0, 2, 1, 3]
+        sample_cache = list(self.device_kv_caches.values())[0][0]
+        target_shape = list(sample_cache.shape)
+        target_shape[0] = -1
+        src_shape = tuple(target_shape[i] for i in inv_order)
+        indices = torch.tensor(block_ids, device=sample_cache.device)
+
+        for _, cache_or_caches in self.device_kv_caches.items():
+            cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
+            for cache in cache_list:
+                blocks_to_update = cache.index_select(0, indices)
+                permuted_blocks = blocks_to_update.reshape(src_shape).permute(
+                    *inv_order
+                )
+                cache.index_copy_(0, indices, permuted_blocks)
+
+    def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]):
+        def _process_local_gt_remote(blocks_to_update, block_size_ratio):
+            n_kv_heads, block_size, head_size = blocks_to_update.shape[1:]
+            remote_block_size = block_size // block_size_ratio
+            n_blocks = block_size_ratio
+            # actual permute is to convert
+            # for local blocksize > remote blocksize
+            # ex: local blocksize = 16 tokens, remote blocksize = 4 tokens
+            # local block[0] = remote block[0, 1, 2, 3]
+            # remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|...
+            # local is  |h0-b0..................|h1-b0..................|...
+            # permute is to:
+            # 1. view => view remote as n_blocks * remote_shape(H,remoteN,D)
+            # 2. permute => (H, nblocks, remoteN, D)
+            # 3. flatten => (H, localN, D)
+            permuted_blocks = (
+                blocks_to_update.reshape(
+                    -1, n_blocks, n_kv_heads, remote_block_size, head_size
+                )
+                .permute(0, 2, 1, 3, 4)
+                .flatten(2, 3)
+            )
+            return permuted_blocks
+
+        if len(self.device_kv_caches) == 0:
+            return
+        split_k_and_v = not (
+            self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
+        )
+        sample_cache = list(self.device_kv_caches.values())[0][0]
+        for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
+            assert block_size_ratio > 1, "Only nP < nD supported currently."
+            block_ids_list = [[item for sublist in block_ids_list for item in sublist]]
+
+            for block_ids in block_ids_list:
+                indices = torch.tensor(block_ids, device=sample_cache.device)
+
+                for _, cache_or_caches in self.device_kv_caches.items():
+                    cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
+                    for cache in cache_list:
+                        blocks_to_update = cache.index_select(0, indices)
+                        # because kv_cache is always using original layout NHD as
+                        # virtual shape while stride can be either HND / NHD at
+                        # initialization.
+                        # we need to firstly get physical view of the tensor
+                        permuted_blocks = _process_local_gt_remote(
+                            blocks_to_update.permute(0, 2, 1, 3), block_size_ratio
+                        ).permute(0, 2, 1, 3)
+                        cache.index_copy_(0, indices, permuted_blocks)
 
     def get_finished(self) -> tuple[set[str], set[str]]:
         """
@@ -1095,17 +1705,46 @@ class NixlConnectorWorker:
         """
         done_sending = self._get_new_notifs()
         done_recving = self._pop_done_transfers(self._recving_transfers)
+
+        # add requests that skipped transfer to done_recving
+        done_recving.update(self._failed_recv_reqs)
+        self._failed_recv_reqs.clear()
+
         if len(done_sending) > 0 or len(done_recving) > 0:
             logger.debug(
                 "Rank %s, get_finished: %s requests done sending "
-                "and %s requests done recving", self.tp_rank,
-                len(done_sending), len(done_recving))
+                "and %s requests done recving",
+                self.tp_rank,
+                len(done_sending),
+                len(done_recving),
+            )
 
-        if self.use_host_buffer:
-            for req_id in done_recving:
-                meta = self._recving_metadata.pop(req_id)
-                assert meta, f"{req_id} not found in recving_metadata list"
+        block_ids_to_permute = []
+        block_ids_for_blocksize_post_process = defaultdict(list)
+        for req_id in done_recving:
+            # clean up metadata for completed requests
+            meta = self._recving_metadata.pop(req_id, None)
+            assert meta is not None, f"{req_id} not found in recving_metadata list"
+            if self.use_host_buffer:
                 self.sync_recved_kv_to_device(req_id, meta)
+            if self.enable_permute_local_kv:
+                block_ids_to_permute += meta.local_physical_block_ids
+
+            # post processing for heteroblocksize
+            block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
+                meta.remote_engine_id
+            )
+            if (
+                not self.use_mla
+                and block_size_ratio > 1
+                and self.kv_cache_layout == "HND"
+            ):
+                block_ids_for_blocksize_post_process[block_size_ratio].append(
+                    meta.local_block_ids
+                )
+        self.blocksize_post_process(block_ids_for_blocksize_post_process)
+        if len(block_ids_to_permute) > 0:
+            self.permute_device_kv(block_ids_to_permute)
 
         # Handle timeout to avoid stranding blocks on remote.
         now = time.perf_counter()
@@ -1117,8 +1756,11 @@ class NixlConnectorWorker:
             count = self.consumer_notification_counts_by_req.pop(req_id, 0)
             logger.warning(
                 "Releasing expired KV blocks for request %s which were "
-                "retrieved by %d decode worker(s) within %d seconds.", req_id,
-                count, envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT)
+                "retrieved by %d decode worker(s) within %d seconds.",
+                req_id,
+                count,
+                envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
+            )
             self._reqs_to_process.remove(req_id)
             del self._reqs_to_send[req_id]
             done_sending.add(req_id)
@@ -1135,18 +1777,21 @@ class NixlConnectorWorker:
         for notifs in self.nixl_wrapper.get_new_notifs().values():
             for notif in notifs:
                 req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
-                if (req_id not in self._reqs_to_send
-                        and req_id not in self._reqs_to_process):
+                if (
+                    req_id not in self._reqs_to_send
+                    and req_id not in self._reqs_to_process
+                ):
                     logger.error(
                         "Potentially invalid KV blocks for "
                         "unrecognized request %s were retrieved by "
-                        "a decode worker. They may have expired.", req_id)
+                        "a decode worker. They may have expired.",
+                        req_id,
+                    )
                     continue
 
                 self.consumer_notification_counts_by_req[req_id] += 1
                 # Wait all consumers (D) to be done reading before freeing.
-                if self.consumer_notification_counts_by_req[req_id] == int(
-                        tp_ratio):
+                if self.consumer_notification_counts_by_req[req_id] == int(tp_ratio):
                     notified_req_ids.add(req_id)
                     del self.consumer_notification_counts_by_req[req_id]
                     self._reqs_to_process.remove(req_id)
@@ -1154,7 +1799,8 @@ class NixlConnectorWorker:
         return notified_req_ids
 
     def _pop_done_transfers(
-            self, transfers: dict[str, list[tuple[int, float]]]) -> set[str]:
+        self, transfers: dict[str, list[tuple[int, float]]]
+    ) -> set[str]:
         """
         Pop completed xfers by checking for DONE state.
         Args:
@@ -1168,15 +1814,27 @@ class NixlConnectorWorker:
             for handle, _xfer_stime in handles:
                 xfer_state = self.nixl_wrapper.check_xfer_state(handle)
                 if xfer_state == "DONE":
+                    # Get telemetry from NIXL
+                    res = self.nixl_wrapper.get_xfer_telemetry(handle)
+                    self.xfer_stats.record_transfer(res)
                     self.nixl_wrapper.release_xfer_handle(handle)
-                    # TODO (NickLucche) Get from NIXL telemetry once integrated
-                    self.xfer_stats.record_transfer()
                 elif xfer_state == "PROC":
                     in_progress = True
                     continue
                 else:
-                    raise RuntimeError("Transfer failed with state %s",
-                                       xfer_state)
+                    # transfer failed - mark blocks as invalid
+                    logger.error(
+                        "NIXL transfer failed for request %s with state %s. "
+                        "Marking blocks as invalid.",
+                        req_id,
+                        xfer_state,
+                    )
+                    # mark all (logical)blocks for this request as invalid
+                    if meta := self._recving_metadata.pop(req_id, None):
+                        self._invalid_block_ids.update(meta.local_block_ids)
+                    self._recving_metadata.pop(req_id, None)
+                    self.nixl_wrapper.release_xfer_handle(handle)
+                    self.xfer_stats.record_failed_transfer()
             if not in_progress:
                 done_req_ids.add(req_id)
                 del transfers[req_id]
@@ -1188,20 +1846,28 @@ class NixlConnectorWorker:
         We check for these trnxs to complete in each step().
         """
         for req_id, meta in metadata.reqs_to_recv.items():
+            meta.local_physical_block_ids = self._logical_to_kernel_block_ids(
+                meta.local_block_ids
+            )
+            meta.remote_block_ids = self._logical_to_kernel_block_ids(
+                meta.remote_block_ids
+            )
             remote_engine_id = meta.remote_engine_id
             logger.debug(
                 "start_load_kv for request %s from remote engine %s. "
-                "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id,
-                remote_engine_id, len(meta.local_block_ids),
-                len(meta.remote_block_ids))
-            if self.use_host_buffer:
-                self._recving_metadata[req_id] = meta
+                "Num local_block_ids: %s. Num remote_block_ids: %s. ",
+                req_id,
+                remote_engine_id,
+                len(meta.local_physical_block_ids),
+                len(meta.remote_block_ids),
+            )
+            # always store metadata for failure recovery
+            self._recving_metadata[req_id] = meta
             if remote_engine_id not in self._remote_agents:
                 # Initiate handshake with remote engine to exchange metadata.
                 with self._handshake_lock:
                     if remote_engine_id not in self._remote_agents:
-                        self._background_nixl_handshake(
-                            req_id, remote_engine_id, meta)
+                        self._background_nixl_handshake(req_id, remote_engine_id, meta)
                         continue
 
             # Handshake already completed, start async read xfer.
@@ -1220,6 +1886,12 @@ class NixlConnectorWorker:
         for req_id in metadata.reqs_in_batch:
             self._reqs_to_process.add(req_id)
 
+        # Remove all requests that are not to be processed (eg aborted).
+        for req_id in metadata.reqs_not_processed:
+            self._reqs_to_process.discard(req_id)
+            # We should never get an abort after setting an expiry timer
+            assert req_id not in self._reqs_to_send
+
         # Add to requests that are waiting to be read and track expiration.
         for req_id, expiration_time in metadata.reqs_to_send.items():
             if req_id in self._reqs_to_process:
@@ -1228,17 +1900,41 @@ class NixlConnectorWorker:
     def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
         logger.debug(
             "Remote agent %s available, calling _read_blocks for req %s",
-            meta.remote_engine_id, req_id)
+            meta.remote_engine_id,
+            req_id,
+        )
         self._read_blocks(
             request_id=req_id,
             dst_engine_id=meta.remote_engine_id,
-            local_block_ids=meta.local_block_ids,
+            local_block_ids=meta.local_physical_block_ids,
             remote_block_ids=meta.remote_block_ids,
         )
 
-    def _read_blocks(self, local_block_ids: list[int],
-                     remote_block_ids: list[int], dst_engine_id: str,
-                     request_id: str):
+    def _read_blocks(
+        self,
+        local_block_ids: list[int],
+        remote_block_ids: list[int],
+        dst_engine_id: str,
+        request_id: str,
+    ):
+        block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id)
+        if block_size_ratio > 1:
+            local_block_ids = self.get_mapped_blocks(
+                np.asarray(local_block_ids), block_size_ratio
+            )
+            if len(local_block_ids) > len(remote_block_ids):
+                # NOTE:
+                # get_mapped_blocks will always expand block_ids for n times.
+                # ex:
+                # prefill block_ids with block_size as 4:
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+                # Local decode block_ids with block_size as 16: [1, 2, 3]
+                # expland ecode block_ids with get_mapped_blocks from [1, 2, 3] to
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+                # Then we clip local to align with prefill
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+                local_block_ids = local_block_ids[: len(remote_block_ids)]
         # NOTE(rob): having the staging blocks be on the READER side is
         # not going to work well (since we will have to call rearrange tensors).
         # after we detect the txn is complete (which means we cannot make the
@@ -1251,17 +1947,27 @@ class NixlConnectorWorker:
 
         # Number of D TP workers that will read from dst P. Propagate tp_ratio
         # on notification so that dst worker can wait before freeing blocks.
-        tp_ratio = self._tp_size[
-            self.engine_id] // self._tp_size[dst_engine_id]
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id)
         notif_id = f"{request_id}:{tp_ratio}".encode()
 
         # Full prefix cache hit: do not need to read remote blocks,
         # just notify P worker that we have the blocks we need.
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
-            remote_rank = self.tp_rank // tp_ratio
+            remote_rank = self.kv_topo.get_target_remote_rank_from_engine_id(
+                dst_engine_id
+            )
             agent_name = self._remote_agents[dst_engine_id][remote_rank]
-            self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
+            try:
+                self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
+            except Exception:
+                logger.exception(
+                    "NIXL send_notif failed for request %s: "
+                    "P worker blocks will be freed after timeout. "
+                    "This may indicate network issues.",
+                    request_id,
+                )
+                self.xfer_stats.record_failed_notification()
             return
 
         # Partial prefix cache hit: just read uncomputed blocks.
@@ -1271,7 +1977,10 @@ class NixlConnectorWorker:
             remote_block_ids = remote_block_ids[-num_local_blocks:]
 
         # Get side handles.
-        local_xfer_side_handle = self.src_xfer_side_handle
+        remote_block_size = self.kv_topo.remote_block_size[dst_engine_id]
+        local_xfer_side_handle = self.src_xfer_side_handles.get(
+            remote_block_size, self.src_xfer_side_handle
+        )
         remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]
 
         # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
@@ -1281,19 +1990,24 @@ class NixlConnectorWorker:
         # Get descs ids.
         local_block_descs_ids: np.ndarray
         remote_block_descs_ids: np.ndarray
+
         if not self.block_window_per_layer:
             # Default case: assume global attention
             remote_block_descs_ids = self._get_block_descs_ids(
-                dst_engine_id, remote_block_ids)
+                dst_engine_id,
+                remote_block_ids,
+            )
             local_block_descs_ids = self._get_block_descs_ids(
-                self.engine_id, local_block_ids)
+                self.engine_id,
+                local_block_ids,
+                block_size_ratio=block_size_ratio,
+            )
         else:
             # TODO(mgoin): remove this once we have hybrid memory allocator
             # Optimization for models with local attention (Llama 4)
             local_descs_list = []
             remote_descs_list = []
-            for layer_idx, block_window in enumerate(
-                    self.block_window_per_layer):
+            for layer_idx, block_window in enumerate(self.block_window_per_layer):
                 # For each layer:
                 if block_window is None:
                     # If not chunked, we just use the
@@ -1307,9 +2021,16 @@ class NixlConnectorWorker:
 
                 # Get descs ids for the layer.
                 layer_local_desc_ids = self._get_block_descs_ids(
-                    self.engine_id, layer_local_block_ids, layer_idx)
+                    dst_engine_id,
+                    layer_local_block_ids,
+                    layer_idx,
+                )
                 layer_remote_desc_ids = self._get_block_descs_ids(
-                    dst_engine_id, layer_remote_block_ids, layer_idx)
+                    self.engine_id,
+                    layer_remote_block_ids,
+                    layer_idx,
+                    block_size_ratio=block_size_ratio,
+                )
 
                 local_descs_list.append(layer_local_desc_ids)
                 remote_descs_list.append(layer_remote_desc_ids)
@@ -1320,26 +2041,62 @@ class NixlConnectorWorker:
         assert len(local_block_descs_ids) == len(remote_block_descs_ids)
 
         # Prepare transfer with Nixl.
-        handle = self.nixl_wrapper.make_prepped_xfer(
-            "READ",
-            local_xfer_side_handle,
-            local_block_descs_ids,
-            remote_xfer_side_handle,
-            remote_block_descs_ids,
-            notif_msg=notif_id,
-        )
+        handle = None
+        try:
+            handle = self.nixl_wrapper.make_prepped_xfer(
+                "READ",
+                local_xfer_side_handle,
+                local_block_descs_ids,
+                remote_xfer_side_handle,
+                remote_block_descs_ids,
+                notif_msg=notif_id,
+            )
+
+            # Begin async xfer.
+            self.nixl_wrapper.transfer(handle)
 
-        # Begin async xfer.
-        self.nixl_wrapper.transfer(handle)
+            # Use handle to check completion in future step().
+            self._recving_transfers[request_id].append((handle, time.perf_counter()))
+        except Exception:
+            logger.exception(
+                "NIXL transfer setup/initiation failed for request %s. "
+                "Marking blocks as invalid.",
+                request_id,
+            )
+            # mark all (logical) blocks for this request as invalid
+            if meta := self._recving_metadata.get(request_id):
+                self._invalid_block_ids.update(meta.local_block_ids)
+            self.xfer_stats.record_failed_transfer()
+            if handle is not None:
+                self.nixl_wrapper.release_xfer_handle(handle)
+            self._failed_recv_reqs.add(request_id)
+
+    def get_mapped_blocks(self, block_ids, block_size_ratio):
+        """
+          Calculates the new set of block IDs by mapping every element
+          in the (potentially sparse) input array.
+          Example: block_ids=[0, 2], block_size_ratio=2
+        get_mapped_blocks    0     1     [2     3]     4     5
+              # remote is |h0-b0|h1-b0||h0-b1|h1-b1||h0-b1|h1-b1||
+              # local is  |h0-b0......||h1-b0......||h2-b0........
+        local_block_ids         0           [1]           2
+        """
+        if block_ids.size == 0:
+            return np.array([], dtype=np.int64)
 
-        # Use handle to check completion in future step().
-        self._recving_transfers[request_id].append(
-            (handle, time.perf_counter()))
+        start_ids = block_ids * block_size_ratio
+        offsets = np.arange(block_size_ratio)
+        mapped_2d = start_ids[:, None] + offsets[None, :]
 
-    def _get_block_descs_ids(self,
-                             engine_id: str,
-                             block_ids: list[int],
-                             layer_idx: Optional[int] = None) -> np.ndarray:
+        return mapped_2d.flatten().astype(np.int64)
+
+    def _get_block_descs_ids(
+        self,
+        engine_id: str,
+        block_ids: list[int],
+        layer_idx: int | None = None,
+        block_size_ratio: float | None = None,
+    ) -> np.ndarray:
         """
         Get the descs ids for a set of block ids.
         If layer_idx is provided, we use the region_ids for the given layer.
@@ -1361,6 +2118,8 @@ class NixlConnectorWorker:
                 region_ids = np.arange(layer_idx, layer_idx + 1)
 
         num_blocks = self.dst_num_blocks[engine_id]
+        if block_size_ratio is not None:
+            num_blocks = int(num_blocks * block_size_ratio)
 
         # Compute the desc ids for each block.
         region_ids = region_ids[:, None]
@@ -1368,23 +2127,40 @@ class NixlConnectorWorker:
         descs_ids = region_ids * num_blocks + block_ids
         return descs_ids.flatten()
 
+    def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]:
+        """
+        Convert logical block ids to kernel physical block ids.
+        This is required when the logical block size (the one set by the user)
+        does not match the one required by the attn backend.
+        """
+        if self._physical_blocks_per_logical_kv_block == 1:
+            # Noop when physical and logical block sizes are the same
+            return block_ids
+        block_ids_np = np.array(block_ids)
+        block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape(
+            1, -1
+        )
+        return BlockTable.map_to_kernel_blocks(
+            block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange
+        ).tolist()
+
     def get_backend_aware_kv_block_len(self, layer_idx: int):
         """
         Get the block length for one K/V element (K and V have the same size).
 
-        For FA and other backends, this is equal to the length of the whole 
+        For FA and other backends, this is equal to the length of the whole
         block, as K and V are in separate regions.
         For FlashInfer, this is half the length of the whole block, as K and V
         share the same region.
         """
-        if self._use_flashinfer:
+        if self.kv_topo.is_kv_layout_blocks_first:
             # For indexing only half (either just the K or V part).
             block_len = self.block_len_per_layer[layer_idx] // 2
         else:
             block_len = self.block_len_per_layer[layer_idx]
         return block_len
 
-    def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
         """
         Get the KV transfer stats for the connector.
         """
@@ -1393,12 +2169,23 @@ class NixlConnectorWorker:
             return self.xfer_stats.clone_and_reset()
         return None
 
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        """
+        Return and clear the set of block IDs that failed to load.
+
+        This is called by the scheduler to identify blocks that need
+        to be retried after a NIXL transfer failure.
+        """
+        result = self._invalid_block_ids
+        self._invalid_block_ids = set()
+        return result
+
+    def __del__(self):
+        self.shutdown()
+
     def shutdown(self):
         """Shutdown the connector worker."""
         self._handshake_initiation_executor.shutdown(wait=False)
-        if self._nixl_handshake_listener_t is not None:
-            self._nixl_handshake_listener_t.join(timeout=0)
-            self._nixl_handshake_listener_t = None
         for handles in self._recving_transfers.values():
             for handle, _ in handles:
                 self.nixl_wrapper.release_xfer_handle(handle)
@@ -1425,13 +2212,12 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
     if socket_type not in (zmq.ROUTER, zmq.REQ):
         raise ValueError(f"Unexpected socket type: {socket_type}")
 
-    ctx: Optional[zmq.Context] = None
+    ctx: zmq.Context | None = None
     try:
         ctx = zmq.Context()  # type: ignore[attr-defined]
-        yield make_zmq_socket(ctx=ctx,
-                              path=addr,
-                              socket_type=socket_type,
-                              bind=socket_type == zmq.ROUTER)
+        yield make_zmq_socket(
+            ctx=ctx, path=addr, socket_type=socket_type, bind=socket_type == zmq.ROUTER
+        )
     finally:
         if ctx is not None:
             ctx.destroy(linger=0)
@@ -1442,15 +2228,35 @@ class NixlKVConnectorStats(KVConnectorStats):
     """Container for transfer performance metrics"""
 
     def __post_init__(self):
-        if "num_successful_transfers" not in self.data:
-            self.data["num_successful_transfers"] = 0
+        if not self.data:
+            # Empty container init, no data is passed in.
+            self.reset()
 
     def reset(self):
-        self.data = {"num_successful_transfers": 0}
+        # Must be serializable
+        self.data: dict[str, list[float]] = {
+            "transfer_duration": [],
+            "post_duration": [],
+            "bytes_transferred": [],
+            "num_descriptors": [],
+            "num_failed_transfers": [],
+            "num_failed_notifications": [],
+        }
+
+    def record_transfer(self, res: nixlXferTelemetry):
+        # Keep metrics units consistent with rest of the code: time us->s
+        self.data["transfer_duration"].append(res.xferDuration / 1e6)
+        self.data["post_duration"].append(res.postDuration / 1e6)
+        self.data["bytes_transferred"].append(res.totalBytes)
+        self.data["num_descriptors"].append(res.descCount)
+
+    def record_failed_transfer(self):
+        """Record a failed NIXL transfer operation."""
+        self.data["num_failed_transfers"].append(1.0)
 
-    def record_transfer(self):
-        # TODO: record actual transfer stats when available
-        self.data["num_successful_transfers"] += 1
+    def record_failed_notification(self):
+        """Record a failed NIXL notification (send_notif)."""
+        self.data["num_failed_notifications"].append(1.0)
 
     def clone_and_reset(self) -> "NixlKVConnectorStats":
         old = copy.copy(self)
@@ -1458,16 +2264,177 @@ class NixlKVConnectorStats(KVConnectorStats):
         return old
 
     def is_empty(self) -> bool:
-        return self.data["num_successful_transfers"] == 0
+        return self.num_successful_transfers == 0
 
     def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
         if not other.is_empty():
-            self.data["num_successful_transfers"] += other.data[
-                "num_successful_transfers"]
+            for k, v in other.data.items():
+                accumulator = self.data[k]
+                assert isinstance(accumulator, list)
+                accumulator.extend(v)
         return self
 
-    def reduce(self) -> dict[str, Union[int, float]]:
-        # TODO: reduce stats to a single value, calculate latency/throughput
+    def reduce(self) -> dict[str, int | float]:
+        # Compute compact representative stats suitable for CLI logging
+        if self.is_empty():
+            return {
+                "Num successful transfers": 0,
+                "Avg xfer time (ms)": 0,
+                "P90 xfer time (ms)": 0,
+                "Avg post time (ms)": 0,
+                "P90 post time (ms)": 0,
+                "Avg MB per transfer": 0,
+                "Throughput (MB/s)": 0,
+                "Avg number of descriptors": 0,
+            }
+
+        xfer_time = np.asarray(self.data["transfer_duration"])
+        post_time = np.asarray(self.data["post_duration"])
+        # Convert to MB for CLI logging.
+        mb = np.asarray(self.data["bytes_transferred"]) / 2**20
+        descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
+        n = len(descs)
+        assert n == self.num_successful_transfers
+
+        total_mb = mb.sum()
+        avg_mb = total_mb / n
+
+        total_time_seconds = xfer_time.sum()
+        throughput_mb_s = total_mb / total_time_seconds
+
         return {
-            "num_successful_transfers": self.data["num_successful_transfers"]
+            "Num successful transfers": n,
+            "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
+            "P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
+            "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
+            "P90 post time (ms)": round(np.percentile(post_time, 90) * 1e3, 3),
+            "Avg MB per transfer": round(avg_mb, 3),
+            "Throughput (MB/s)": round(throughput_mb_s, 3),
+            "Avg number of descriptors": round(descs.mean(), 1),
         }
+
+    @property
+    def num_successful_transfers(self) -> int:
+        return len(self.data["transfer_duration"])
+
+
+class NixlPromMetrics(KVConnectorPromMetrics):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        metric_types: dict[type[PromMetric], type[PromMetricT]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[str]],
+    ):
+        super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
+
+        buckets = [
+            0.001,
+            0.005,
+            0.01,
+            0.025,
+            0.05,
+            0.075,
+            0.1,
+            0.2,
+            0.3,
+            0.5,
+            0.75,
+            1.0,
+            5.0,
+        ]
+        nixl_histogram_xfer_time = self._histogram_cls(
+            name="vllm:nixl_xfer_time_seconds",
+            documentation="Histogram of transfer duration for NIXL KV Cache transfers.",
+            buckets=buckets[1:],
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_xfer_time = self.make_per_engine(nixl_histogram_xfer_time)
+        nixl_histogram_post_time = self._histogram_cls(
+            name="vllm:nixl_post_time_seconds",
+            documentation="Histogram of transfer post time for NIXL KV"
+            " Cache transfers.",
+            buckets=buckets,
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_post_time = self.make_per_engine(nixl_histogram_post_time)
+        # uniform 2kb to 16gb range
+        buckets = [2 ** (10 + i) for i in range(1, 25, 2)]
+        nixl_histogram_bytes_transferred = self._histogram_cls(
+            name="vllm:nixl_bytes_transferred",
+            documentation="Histogram of bytes transferred per NIXL KV Cache transfers.",
+            buckets=buckets,
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_bytes_transferred = self.make_per_engine(
+            nixl_histogram_bytes_transferred
+        )
+        buckets = [
+            10,
+            20,
+            30,
+            50,
+            75,
+            100,
+            200,
+            400,
+            1000,
+            2000,
+            4000,
+            10000,
+            20000,
+            50000,
+        ]
+        nixl_histogram_num_descriptors = self._histogram_cls(
+            name="vllm:nixl_num_descriptors",
+            documentation="Histogram of number of descriptors per NIXL"
+            "  KV Cache transfers.",
+            buckets=buckets,
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_num_descriptors = self.make_per_engine(
+            nixl_histogram_num_descriptors
+        )
+        counter_nixl_num_failed_transfers = self._counter_cls(
+            name="vllm:nixl_num_failed_transfers",
+            documentation="Number of failed NIXL KV Cache transfers.",
+            labelnames=labelnames,
+        )
+        self.counter_nixl_num_failed_transfers = self.make_per_engine(
+            counter_nixl_num_failed_transfers
+        )
+        counter_nixl_num_failed_notifications = self._counter_cls(
+            name="vllm:nixl_num_failed_notifications",
+            documentation="Number of failed NIXL KV Cache notifications.",
+            labelnames=labelnames,
+        )
+        self.counter_nixl_num_failed_notifications = self.make_per_engine(
+            counter_nixl_num_failed_notifications
+        )
+
+    def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
+        for prom_obj, list_item_key in zip(
+            [
+                self.nixl_histogram_xfer_time,
+                self.nixl_histogram_post_time,
+                self.nixl_histogram_bytes_transferred,
+                self.nixl_histogram_num_descriptors,
+            ],
+            [
+                "transfer_duration",
+                "post_duration",
+                "bytes_transferred",
+                "num_descriptors",
+            ],
+        ):
+            for list_item in transfer_stats_data[list_item_key]:
+                prom_obj[engine_idx].observe(list_item)
+        for counter_obj, counter_item_key in zip(
+            [
+                self.counter_nixl_num_failed_transfers,
+                self.counter_nixl_num_failed_notifications,
+            ],
+            ["num_failed_transfers", "num_failed_notifications"],
+        ):
+            for list_item in transfer_stats_data[counter_item_key]:
+                counter_obj[engine_idx].inc(list_item)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index c23efa604544da968817244eae3bf69e3ee869f9..582e42cc466ae2f7fe70b7e5dc6ffa399d376d20 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -4,22 +4,24 @@ from collections import defaultdict
 from collections.abc import Iterable, Iterator
 from dataclasses import dataclass
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
-from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
-                                                          KVConnectorRole)
-from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorMetadata)
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorBase_V1,
+    KVConnectorRole,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash
 from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.kv_offload.abstract import OffloadingManager
 from vllm.v1.kv_offload.factory import OffloadingSpecFactory
 from vllm.v1.kv_offload.mediums import GPULoadStoreSpec
@@ -40,14 +42,18 @@ class OffloadingConnectorMetadata(KVConnectorMetadata):
 
 
 class OffloadingConnector(KVConnectorBase_V1):
-
-    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
-        super().__init__(vllm_config, role)
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: KVCacheConfig | None = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
 
         spec = OffloadingSpecFactory.create_spec(vllm_config)
 
-        self.connector_scheduler: Optional[OffloadingConnectorScheduler] = None
-        self.connector_worker: Optional[OffloadingConnectorWorker] = None
+        self.connector_scheduler: OffloadingConnectorScheduler | None = None
+        self.connector_worker: OffloadingConnectorWorker | None = None
         if role == KVConnectorRole.SCHEDULER:
             self.connector_scheduler = OffloadingConnectorScheduler(spec)
         elif role == KVConnectorRole.WORKER:
@@ -57,47 +63,51 @@ class OffloadingConnector(KVConnectorBase_V1):
         assert self.connector_worker is not None
         self.connector_worker.register_kv_caches(kv_caches)
 
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
         assert self.connector_worker is not None
-        assert isinstance(self._connector_metadata,
-                          OffloadingConnectorMetadata)
+        assert isinstance(self._connector_metadata, OffloadingConnectorMetadata)
         self.connector_worker.start_load_kv(self._connector_metadata)
 
     def wait_for_layer_load(self, layer_name: str) -> None:
         pass
 
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
         pass
 
     def wait_for_save(self):
         assert self.connector_worker is not None
-        assert isinstance(self._connector_metadata,
-                          OffloadingConnectorMetadata)
+        assert isinstance(self._connector_metadata, OffloadingConnectorMetadata)
         self.connector_worker.start_store_kv(self._connector_metadata)
 
-    def get_finished(self,
-                     finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
+    def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         assert self.connector_worker is not None
         return self.connector_worker.get_finished(finished_req_ids)
 
     def get_num_new_matched_tokens(
-            self, request: "Request",
-            num_computed_tokens: int) -> tuple[int, bool]:
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int, bool]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.get_num_new_matched_tokens(
-            request, num_computed_tokens)
+            request, num_computed_tokens
+        )
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         assert self.connector_scheduler is not None
         return self.connector_scheduler.update_state_after_alloc(
-            request, blocks, num_external_tokens)
+            request, blocks, num_external_tokens
+        )
 
     def build_connector_meta(
-            self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.build_connector_meta(scheduler_output)
 
@@ -109,7 +119,7 @@ class OffloadingConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.request_finished(request, block_ids)
 
@@ -124,8 +134,7 @@ class OffloadingConnectorScheduler:
     def __init__(self, spec: OffloadingSpec):
         self.gpu_block_size = spec.gpu_block_size
         self.offloaded_block_size = spec.offloaded_block_size
-        self.block_size_factor = (self.offloaded_block_size //
-                                  self.gpu_block_size)
+        self.block_size_factor = self.offloaded_block_size // self.gpu_block_size
         self.manager: OffloadingManager = spec.get_manager()
 
         self._requests: dict[ReqId, Request] = {}
@@ -145,17 +154,18 @@ class OffloadingConnectorScheduler:
         self,
         req: Request,
         start_idx: int = 0,
-        end_idx: Optional[int] = None,
+        end_idx: int | None = None,
     ) -> Iterable[BlockHash]:
         return islice(
             req.block_hashes,
             self.block_size_factor * start_idx + self.block_size_factor - 1,
             self.block_size_factor * end_idx if end_idx else None,
-            self.block_size_factor)
+            self.block_size_factor,
+        )
 
     def get_num_new_matched_tokens(
-            self, request: Request,
-            num_computed_tokens: int) -> tuple[int, bool]:
+        self, request: Request, num_computed_tokens: int
+    ) -> tuple[int, bool]:
         """
         Get number of new tokens that can be loaded beyond the
         num_computed_tokens.
@@ -174,8 +184,7 @@ class OffloadingConnectorScheduler:
         """
         num_blocks = request.num_tokens // self.offloaded_block_size
 
-        assert (len(request.block_hashes) //
-                self.block_size_factor == num_blocks)
+        assert len(request.block_hashes) // self.block_size_factor == num_blocks
         block_hashes = self._get_block_hashes(request)
 
         self.manager.touch(block_hashes)
@@ -187,12 +196,14 @@ class OffloadingConnectorScheduler:
 
         start_block_idx = num_computed_tokens // self.offloaded_block_size
         hits = self.manager.lookup(
-            self._get_block_hashes(request, start_idx=start_block_idx))
+            self._get_block_hashes(request, start_idx=start_block_idx)
+        )
         if hits == 0:
             return 0, False
 
-        num_hit_tokens = (self.offloaded_block_size *
-                          (start_block_idx + hits) - num_computed_tokens)
+        num_hit_tokens = (
+            self.offloaded_block_size * (start_block_idx + hits) - num_computed_tokens
+        )
         logger.debug(
             "Request %s hit %s offloaded tokens after %s GPU hit tokens",
             request.request_id,
@@ -204,8 +215,9 @@ class OffloadingConnectorScheduler:
 
         return num_hit_tokens, True
 
-    def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks,
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int
+    ):
         self._requests[request.request_id] = request
         # the block ids are updated in _get_reqs_to_store
         self._request_block_ids[request.request_id] = []
@@ -216,31 +228,30 @@ class OffloadingConnectorScheduler:
         block_groups = blocks.get_block_ids()
         block_ids = block_groups[0]
 
-        num_computed_gpu_blocks = sum(block.block_hash is not None
-                                      for block in blocks.blocks[0])
+        num_computed_gpu_blocks = sum(
+            block.block_hash is not None for block in blocks.blocks[0]
+        )
         num_computed_tokens = num_computed_gpu_blocks * self.gpu_block_size
         full_block_tokens = num_computed_tokens + num_external_tokens
         assert full_block_tokens % self.offloaded_block_size == 0
 
         num_pending_gpu_blocks = len(block_ids) - num_computed_gpu_blocks
-        assert (num_external_tokens == num_pending_gpu_blocks *
-                self.gpu_block_size)
+        assert num_external_tokens == num_pending_gpu_blocks * self.gpu_block_size
 
         start_block_idx = num_computed_tokens // self.offloaded_block_size
         num_blocks = full_block_tokens // self.offloaded_block_size
 
-        assert (len(request.block_hashes) // self.block_size_factor
-                >= num_blocks)
-        block_hashes = self._get_block_hashes(request,
-                                              start_idx=start_block_idx,
-                                              end_idx=num_blocks)
+        assert len(request.block_hashes) // self.block_size_factor >= num_blocks
+        block_hashes = self._get_block_hashes(
+            request, start_idx=start_block_idx, end_idx=num_blocks
+        )
 
         src_spec = self.manager.prepare_load(block_hashes)
         dst_spec = GPULoadStoreSpec(block_ids[num_computed_gpu_blocks:])
 
-        block_hashes = self._get_block_hashes(request,
-                                              start_idx=start_block_idx,
-                                              end_idx=num_blocks)
+        block_hashes = self._get_block_hashes(
+            request, start_idx=start_block_idx, end_idx=num_blocks
+        )
 
         self._reqs_to_load[request.request_id] = (src_spec, dst_spec)
         self._reqs_being_loaded[request.request_id].update(block_hashes)
@@ -249,9 +260,7 @@ class OffloadingConnectorScheduler:
     def _get_reqs_to_store(self, scheduler_output: SchedulerOutput):
         reqs_to_store: dict[ReqId, TransferSpec] = {}
         # iterate over both new and cached requests
-        for req_id, new_block_id_groups, preempted in yield_req_data(
-                scheduler_output):
-
+        for req_id, new_block_id_groups, preempted in yield_req_data(scheduler_output):
             if preempted:
                 self._request_block_ids[req_id] = []
 
@@ -271,15 +280,18 @@ class OffloadingConnectorScheduler:
             if num_new_blocks <= 0:
                 continue
 
-            num_gpu_blocks = num_blocks * self.block_size_factor
-            assert len(req.block_hashes) >= num_gpu_blocks
+            # NOTE: In async scheduling, placeholders may temporarily make
+            # len(req.block_hashes) < num_blocks * self.block_size_factor.
 
             new_block_hashes = self._get_block_hashes(
-                req, start_idx=start_block_idx, end_idx=num_blocks)
+                req, start_idx=start_block_idx, end_idx=num_blocks
+            )
             store_output = self.manager.prepare_store(new_block_hashes)
             if store_output is None:
-                logger.warning("Cannot store %s blocks", num_new_blocks)
-                break
+                logger.warning(
+                    "Request %s: cannot store %s blocks", req_id, num_new_blocks
+                )
+                continue
 
             self._next_stored_block_idx[req_id] = num_blocks
 
@@ -291,7 +303,8 @@ class OffloadingConnectorScheduler:
             self.manager.touch(block_hashes)
 
             new_block_hashes = self._get_block_hashes(
-                req, start_idx=start_block_idx, end_idx=num_blocks)
+                req, start_idx=start_block_idx, end_idx=num_blocks
+            )
             dst_spec = store_output.store_spec
             src_block_ids: list[int] = []
             for idx, blk_hash in enumerate(new_block_hashes):
@@ -316,10 +329,12 @@ class OffloadingConnectorScheduler:
         return reqs_to_store
 
     def build_connector_meta(
-            self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
         meta = OffloadingConnectorMetadata(
             reqs_to_load=self._reqs_to_load,
-            reqs_to_store=self._get_reqs_to_store(scheduler_output))
+            reqs_to_store=self._get_reqs_to_store(scheduler_output),
+        )
         self._reqs_to_load = {}
         return meta
 
@@ -345,7 +360,7 @@ class OffloadingConnectorScheduler:
         self,
         request: Request,
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
 
@@ -372,15 +387,16 @@ class OffloadingConnectorScheduler:
         """
         for event in self.manager.take_events():
             if event.removed:
-                yield BlockRemoved(block_hashes=event.block_hashes,
-                                   medium=event.medium)
+                yield BlockRemoved(block_hashes=event.block_hashes, medium=event.medium)
             else:
-                yield BlockStored(block_hashes=event.block_hashes,
-                                  parent_block_hash=None,
-                                  token_ids=[],
-                                  lora_id=None,
-                                  block_size=event.block_size,
-                                  medium=event.medium)
+                yield BlockStored(
+                    block_hashes=event.block_hashes,
+                    parent_block_hash=None,
+                    token_ids=[],
+                    lora_id=None,
+                    block_size=event.block_size,
+                    medium=event.medium,
+                )
 
 
 class OffloadingConnectorWorker:
@@ -407,7 +423,7 @@ class OffloadingConnectorWorker:
         return job_id
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
-        for src_cls, dst_cls, handler in (self.spec.get_handlers(kv_caches)):
+        for src_cls, dst_cls, handler in self.spec.get_handlers(kv_caches):
             self.worker.register_handler(src_cls, dst_cls, handler)
 
     def start_load_kv(self, metadata: OffloadingConnectorMetadata):
@@ -425,8 +441,7 @@ class OffloadingConnectorWorker:
             self._store_jobs[req_id].add(job_id)
             assert self.worker.transfer_async(job_id, transfer_spec)
 
-    def get_finished(self,
-                     finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
+    def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
@@ -470,7 +485,8 @@ class OffloadingConnectorWorker:
 
 
 def yield_req_data(
-        scheduler_output) -> Iterator[tuple[str, tuple[list[int], ...], bool]]:
+    scheduler_output,
+) -> Iterator[tuple[str, tuple[list[int], ...], bool]]:
     """
     Yields:
         (req_id, new_block_id_groups, preempted)
@@ -481,5 +497,8 @@ def yield_req_data(
 
     # cached requests
     cached_reqs = scheduler_output.scheduled_cached_reqs
-    yield from zip(cached_reqs.req_ids, cached_reqs.new_block_ids,
-                   cached_reqs.resumed_from_preemption)
+    yield from zip(
+        cached_reqs.req_ids,
+        cached_reqs.new_block_ids,
+        (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids),
+    )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index 3dadfa595ef1ef0e74ecf44a0d2dda0ae0d3802d..a124a0d519db8b179c15819bdb8d6786eef41df1 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -9,9 +9,13 @@ import torch
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import (
-    P2pNcclEngine)
+    P2pNcclEngine,
+)
 from vllm.distributed.parallel_state import get_world_group
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import MLACommonMetadata
@@ -21,6 +25,7 @@ if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -36,8 +41,9 @@ class ReqMeta:
     num_tokens: int
 
     @staticmethod
-    def make_meta(request_id: str, token_ids: list[int], block_ids: list[int],
-                  block_size: int) -> "ReqMeta":
+    def make_meta(
+        request_id: str, token_ids: list[int], block_ids: list[int], block_size: int
+    ) -> "ReqMeta":
         block_ids_tensor = torch.tensor(block_ids)
         return ReqMeta(
             request_id=request_id,
@@ -61,37 +67,48 @@ class P2pNcclConnectorMetadata(KVConnectorMetadata):
         block_size: int,
     ) -> None:
         self.requests.append(
-            ReqMeta.make_meta(request_id, token_ids, block_ids, block_size))
+            ReqMeta.make_meta(request_id, token_ids, block_ids, block_size)
+        )
 
 
 class P2pNcclConnector(KVConnectorBase_V1):
-
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(
+            vllm_config=vllm_config,
+            role=role,
+            kv_cache_config=kv_cache_config,
+        )
         self._block_size = vllm_config.cache_config.block_size
         self._requests_need_load: dict[str, Any] = {}
-        self.config = vllm_config.kv_transfer_config
-        self.is_producer = self.config.is_kv_producer
-        self.chunked_prefill: dict[str, Any] = {}
-
-        self._rank = get_world_group().rank \
-            if role == KVConnectorRole.WORKER else 0
-        self._local_rank = get_world_group().local_rank \
-            if role == KVConnectorRole.WORKER else 0
-
-        self.p2p_nccl_engine = P2pNcclEngine(
-            local_rank=self._local_rank,
-            config=self.config,
-            hostname="",
-            port_offset=self._rank,
-        ) if role == KVConnectorRole.WORKER else None
+        self.is_producer = self._kv_transfer_config.is_kv_producer
+        self.chunked_prefill: dict[str, tuple[list[int], list[int] | None]] = {}
+
+        self._rank = get_world_group().rank if role == KVConnectorRole.WORKER else 0
+        self._local_rank = (
+            get_world_group().local_rank if role == KVConnectorRole.WORKER else 0
+        )
+
+        self.p2p_nccl_engine = (
+            P2pNcclEngine(
+                local_rank=self._local_rank,
+                config=self._kv_transfer_config,
+                hostname="",
+                port_offset=self._rank,
+            )
+            if role == KVConnectorRole.WORKER
+            else None
+        )
 
     # ==============================
     # Worker-side methods
     # ==============================
 
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs: Any) -> None:
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
         """Start loading the KV cache from the connector buffer to vLLM's
         paged KV buffer.
 
@@ -143,8 +160,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
             Returns:
                 None. The function modifies `layer` in-place.
             """
-            if (isinstance(attn_metadata, MLACommonMetadata)
-                    or layer.shape[1] == 2):  # MLA or FlashInfer
+            if (
+                isinstance(attn_metadata, MLACommonMetadata) or layer.shape[1] == 2
+            ):  # MLA or FlashInfer
                 num_block = kv_cache.shape[0]
                 self.check_tensors_except_dim(layer, kv_cache, 0)
                 if len(block_ids) == num_block:
@@ -153,8 +171,11 @@ class P2pNcclConnector(KVConnectorBase_V1):
                     layer[block_ids[:num_block], ...] = kv_cache
                     logger.warning(
                         "🚧kv_cache does not match, block_ids:%d, "
-                        "num_block:%d, request_id:%s", len(block_ids),
-                        num_block, request_id)
+                        "num_block:%d, request_id:%s",
+                        len(block_ids),
+                        num_block,
+                        request_id,
+                    )
 
             elif layer.shape[0] == 2:  # FlashAttention
                 num_block = kv_cache.shape[1]
@@ -165,12 +186,14 @@ class P2pNcclConnector(KVConnectorBase_V1):
                     layer[:, block_ids[:num_block], ...] = kv_cache
                     logger.warning(
                         "🚧kv_cache does not match, block_ids:%d, "
-                        "num_block:%d, request_id:%s", len(block_ids),
-                        num_block, request_id)
+                        "num_block:%d, request_id:%s",
+                        len(block_ids),
+                        num_block,
+                        request_id,
+                    )
 
         # Get the metadata
-        metadata: KVConnectorMetadata = \
-            self._get_connector_metadata()
+        metadata: KVConnectorMetadata = self._get_connector_metadata()
         assert isinstance(metadata, P2pNcclConnectorMetadata)
 
         if metadata is None:
@@ -187,21 +210,23 @@ class P2pNcclConnector(KVConnectorBase_V1):
                 # Only process layers that have kv_cache
                 # attribute (attention layers) Skip non-attention
                 # layers like FusedMoE
-                kv_cache = getattr(layer, 'kv_cache', None)
+                kv_cache = getattr(layer, "kv_cache", None)
                 if kv_cache is None:
                     continue
 
                 layer = kv_cache[forward_context.virtual_engine]
 
                 kv_cache = self.p2p_nccl_engine.recv_tensor(
-                    request.request_id + "#" + layer_name, remote_address)
+                    request.request_id + "#" + layer_name, remote_address
+                )
 
                 if kv_cache is None:
                     logger.warning("🚧kv_cache is None, %s", request.request_id)
                     continue
 
-                inject_kv_into_layer(layer, kv_cache, request.block_ids,
-                                     request.request_id)
+                inject_kv_into_layer(
+                    layer, kv_cache, request.block_ids, request.request_id
+                )
 
     def wait_for_layer_load(self, layer_name: str) -> None:
         """Blocking until the KV for a specific layer is loaded into vLLM's
@@ -214,9 +239,13 @@ class P2pNcclConnector(KVConnectorBase_V1):
         """
         return
 
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata",
-                      **kwargs: Any) -> None:
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs: Any,
+    ) -> None:
         """Start saving the KV cache of the layer from vLLM's paged buffer
         to the connector.
 
@@ -255,8 +284,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
                 torch.Tensor: A tensor containing the extracted KV slices.
                 Returns None if the layout is unsupported.
             """
-            if (isinstance(attn_metadata, MLACommonMetadata)
-                    or layer.shape[1] == 2):  # MLA or FlashInfer
+            if (
+                isinstance(attn_metadata, MLACommonMetadata) or layer.shape[1] == 2
+            ):  # MLA or FlashInfer
                 return layer[block_ids, ...]
 
             if layer.shape[0] == 2:  # FlashAttention
@@ -272,8 +302,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
             remote_address = ip + ":" + str(port + self._rank)
 
             kv_cache = extract_kv_from_layer(kv_layer, request.block_ids)
-            self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
-                                             kv_cache, remote_address)
+            self.p2p_nccl_engine.send_tensor(
+                request_id + "#" + layer_name, kv_cache, remote_address
+            )
 
     def wait_for_save(self):
         if self.is_producer:
@@ -281,8 +312,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
             self.p2p_nccl_engine.wait_for_sent()
 
     def get_finished(
-            self, finished_req_ids: set[str],
-            **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]:
+        self, finished_req_ids: set[str], **kwargs: Any
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
@@ -296,10 +327,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
 
         assert self.p2p_nccl_engine is not None
 
-        no_compile_layers = (
-            self._vllm_config.compilation_config.static_forward_context)
-        return self.p2p_nccl_engine.get_finished(finished_req_ids,
-                                                 no_compile_layers)
+        no_compile_layers = self._vllm_config.compilation_config.static_forward_context
+        return self.p2p_nccl_engine.get_finished(finished_req_ids, no_compile_layers)
 
     # ==============================
     # Scheduler-side methods
@@ -326,23 +355,25 @@ class P2pNcclConnector(KVConnectorBase_V1):
         if self.is_producer:
             return 0, False
 
-        num_external_tokens = (len(request.prompt_token_ids) - 1 -
-                               num_computed_tokens)
+        prompt_token_ids = request.prompt_token_ids or []
+        num_external_tokens = len(prompt_token_ids) - 1 - num_computed_tokens
 
         if num_external_tokens < 0:
             num_external_tokens = 0
 
         return num_external_tokens, False
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         """
         Update KVConnector state after block allocation.
         """
         if not self.is_producer and num_external_tokens > 0:
             self._requests_need_load[request.request_id] = (
-                request, blocks.get_block_ids()[0])
+                request,
+                blocks.get_block_ids()[0],
+            )
 
     def build_connector_meta(
         self,
@@ -361,53 +392,62 @@ class P2pNcclConnector(KVConnectorBase_V1):
 
         for new_req in scheduler_output.scheduled_new_reqs:
             if self.is_producer:
-                num_scheduled_tokens = (
-                    scheduler_output.num_scheduled_tokens)[new_req.req_id]
+                num_scheduled_tokens = (scheduler_output.num_scheduled_tokens)[
+                    new_req.req_id
+                ]
                 num_tokens = num_scheduled_tokens + new_req.num_computed_tokens
                 # the request's prompt is chunked prefill
-                if num_tokens < len(new_req.prompt_token_ids):
+                if num_tokens < len(new_req.prompt_token_ids or []):
                     # 'CachedRequestData' has no attribute 'prompt_token_ids'
                     self.chunked_prefill[new_req.req_id] = (
-                        new_req.block_ids[0], new_req.prompt_token_ids)
+                        new_req.block_ids[0],
+                        new_req.prompt_token_ids,
+                    )
                     continue
                 # the request's prompt is not chunked prefill
-                meta.add_request(request_id=new_req.req_id,
-                                 token_ids=new_req.prompt_token_ids,
-                                 block_ids=new_req.block_ids[0],
-                                 block_size=self._block_size)
+                meta.add_request(
+                    request_id=new_req.req_id,
+                    token_ids=new_req.prompt_token_ids or [],
+                    block_ids=new_req.block_ids[0],
+                    block_size=self._block_size,
+                )
                 continue
             if new_req.req_id in self._requests_need_load:
-                meta.add_request(request_id=new_req.req_id,
-                                 token_ids=new_req.prompt_token_ids,
-                                 block_ids=new_req.block_ids[0],
-                                 block_size=self._block_size)
+                meta.add_request(
+                    request_id=new_req.req_id,
+                    token_ids=new_req.prompt_token_ids or [],
+                    block_ids=new_req.block_ids[0],
+                    block_size=self._block_size,
+                )
                 self._requests_need_load.pop(new_req.req_id)
 
         cached_reqs = scheduler_output.scheduled_cached_reqs
         for i, req_id in enumerate(cached_reqs.req_ids):
             num_computed_tokens = cached_reqs.num_computed_tokens[i]
             new_block_ids = cached_reqs.new_block_ids[i]
-            resumed_from_preemption = cached_reqs.resumed_from_preemption[i]
+            resumed_from_preemption = req_id in cached_reqs.resumed_req_ids
 
             if self.is_producer:
-                num_scheduled_tokens = (
-                    scheduler_output.num_scheduled_tokens)[req_id]
-                num_tokens = (num_scheduled_tokens + num_computed_tokens)
+                num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
+                num_tokens = num_scheduled_tokens + num_computed_tokens
                 assert req_id in self.chunked_prefill
+                assert new_block_ids is not None
                 block_ids = new_block_ids[0]
                 if not resumed_from_preemption:
-                    block_ids = (self.chunked_prefill[req_id][0] + block_ids)
+                    block_ids = self.chunked_prefill[req_id][0] + block_ids
                 prompt_token_ids = self.chunked_prefill[req_id][1]
+                assert prompt_token_ids is not None
                 # the request's prompt is chunked prefill again
                 if num_tokens < len(prompt_token_ids):
-                    self.chunked_prefill[req_id] = (block_ids,
-                                                    prompt_token_ids)
+                    self.chunked_prefill[req_id] = (block_ids, prompt_token_ids)
                     continue
                 # the request's prompt is all prefilled finally
-                meta.add_request(request_id=req_id,
-                                 token_ids=prompt_token_ids,
-                                 block_ids=block_ids,
-                                 block_size=self._block_size)
+                meta.add_request(
+                    request_id=req_id,
+                    token_ids=prompt_token_ids,
+                    block_ids=block_ids,
+                    block_size=self._block_size,
+                )
                 self.chunked_prefill.pop(req_id, None)
                 continue
 
@@ -422,12 +462,15 @@ class P2pNcclConnector(KVConnectorBase_V1):
 
                 # NOTE(rob): For resumed req, new_block_ids is all
                 # of the block_ids for the request.
+                assert new_block_ids is not None
                 block_ids = new_block_ids[0]
 
-                meta.add_request(request_id=req_id,
-                                 token_ids=token_ids,
-                                 block_ids=block_ids,
-                                 block_size=self._block_size)
+                meta.add_request(
+                    request_id=req_id,
+                    token_ids=token_ids,
+                    block_ids=block_ids,
+                    block_size=self._block_size,
+                )
 
         self._requests_need_load.clear()
         return meta
@@ -436,7 +479,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
 
@@ -472,8 +515,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
             port = int(match.group(2))
 
             return ip, port
-        raise ValueError(
-            f"Request id {request_id} does not contain hostname and port")
+        raise ValueError(f"Request id {request_id} does not contain hostname and port")
 
     @staticmethod
     def check_tensors_except_dim(tensor1, tensor2, dim):
@@ -481,8 +523,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
         shape2 = tensor2.size()
 
         if len(shape1) != len(shape2) or not all(
-                s1 == s2
-                for i, (s1, s2) in enumerate(zip(shape1, shape2)) if i != dim):
+            s1 == s2 for i, (s1, s2) in enumerate(zip(shape1, shape2)) if i != dim
+        ):
             raise NotImplementedError(
                 "Currently, only symmetric TP is supported. Asymmetric TP, PP,"
-                "and others will be supported in future PRs.")
+                "and others will be supported in future PRs."
+            )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
index 959bf0277a3f5429096194d44dbaeb10c476539c..0e748db666e6472110569248ab9823411d3f546f 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
@@ -1,15 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import json
 import logging
 import os
 import threading
 import time
-import typing
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import msgpack
 import torch
@@ -17,10 +17,17 @@ import zmq
 
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.device_communicators.pynccl_wrapper import (
-    NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum)
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import (  # noqa: E501
-    TensorMemoryPool)
-from vllm.utils import current_stream, get_ip
+    TensorMemoryPool,
+)
+from vllm.utils.network_utils import get_ip
+from vllm.utils.torch_utils import current_stream
 
 logger = logging.getLogger(__name__)
 
@@ -31,12 +38,12 @@ DEFAULT_MEM_POOL_SIZE_GB = 32
 def set_p2p_nccl_context(num_channels: str):
     original_values: dict[str, Any] = {}
     env_vars = [
-        'NCCL_MAX_NCHANNELS',
-        'NCCL_MIN_NCHANNELS',
-        'NCCL_CUMEM_ENABLE',
-        'NCCL_BUFFSIZE',
-        'NCCL_PROTO',  # LL,LL128,SIMPLE
-        'NCCL_ALGO',  # RING,TREE
+        "NCCL_MAX_NCHANNELS",
+        "NCCL_MIN_NCHANNELS",
+        "NCCL_CUMEM_ENABLE",
+        "NCCL_BUFFSIZE",
+        "NCCL_PROTO",  # LL,LL128,SIMPLE
+        "NCCL_ALGO",  # RING,TREE
     ]
 
     for var in env_vars:
@@ -45,9 +52,9 @@ def set_p2p_nccl_context(num_channels: str):
     logger.info("set_p2p_nccl_context, original_values: %s", original_values)
 
     try:
-        os.environ['NCCL_MAX_NCHANNELS'] = num_channels
-        os.environ['NCCL_MIN_NCHANNELS'] = num_channels
-        os.environ['NCCL_CUMEM_ENABLE'] = '1'
+        os.environ["NCCL_MAX_NCHANNELS"] = num_channels
+        os.environ["NCCL_MIN_NCHANNELS"] = num_channels
+        os.environ["NCCL_CUMEM_ENABLE"] = "1"
         yield
     finally:
         for var in env_vars:
@@ -65,13 +72,14 @@ class SendQueueItem:
 
 
 class P2pNcclEngine:
-
-    def __init__(self,
-                 local_rank: int,
-                 config: KVTransferConfig,
-                 hostname: str = "",
-                 port_offset: int = 0,
-                 library_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        local_rank: int,
+        config: KVTransferConfig,
+        hostname: str = "",
+        port_offset: int = 0,
+        library_path: str | None = None,
+    ) -> None:
         self.config = config
         self.rank = port_offset
         self.local_rank = local_rank
@@ -89,19 +97,30 @@ class P2pNcclEngine:
         # Each card corresponds to a ZMQ address.
         self.zmq_address = f"{self._hostname}:{self._port}"
 
-        # The `http_port` must be consistent with the port of OpenAI.
-        self.http_address = (
-            f"{self._hostname}:"
-            f"{self.config.kv_connector_extra_config['http_port']}")
-
         # If `proxy_ip` or `proxy_port` is `""`,
         # then the ping thread will not be enabled.
         proxy_ip = self.config.get_from_extra_config("proxy_ip", "")
         proxy_port = self.config.get_from_extra_config("proxy_port", "")
         if proxy_ip == "" or proxy_port == "":
             self.proxy_address = ""
+            self.http_address = ""
         else:
             self.proxy_address = proxy_ip + ":" + proxy_port
+            # the `http_port` must be consistent with the port of OpenAI.
+            http_port = self.config.get_from_extra_config("http_port", None)
+            if http_port is None:
+                example_cfg = {
+                    "kv_connector": "P2pNcclConnector",
+                    "kv_connector_extra_config": {"http_port": 8000},
+                }
+                example = (
+                    f"--port=8000 --kv-transfer-config='{json.dumps(example_cfg)}'"
+                )
+                raise ValueError(
+                    "kv_connector_extra_config.http_port is required. "
+                    f"Example: {example}"
+                )
+            self.http_address = f"{self._hostname}:{http_port}"
 
         self.context = zmq.Context()
         self.router_socket = self.context.socket(zmq.ROUTER)
@@ -118,15 +137,17 @@ class P2pNcclEngine:
         self.recv_stream = torch.cuda.Stream()
 
         mem_pool_size_gb = float(
-            self.config.get_from_extra_config("mem_pool_size_gb",
-                                              DEFAULT_MEM_POOL_SIZE_GB))
-        self.pool = TensorMemoryPool(max_block_size=int(mem_pool_size_gb *
-                                                        1024**3))  # GB
+            self.config.get_from_extra_config(
+                "mem_pool_size_gb", DEFAULT_MEM_POOL_SIZE_GB
+            )
+        )
+        self.pool = TensorMemoryPool(
+            max_block_size=int(mem_pool_size_gb * 1024**3)
+        )  # GB
 
         # The sending type includes tree mutually exclusive options:
         # PUT, GET, PUT_ASYNC.
-        self.send_type = self.config.get_from_extra_config(
-            "send_type", "PUT_ASYNC")
+        self.send_type = self.config.get_from_extra_config("send_type", "PUT_ASYNC")
         if self.send_type == "GET":
             # tensor_id: torch.Tensor
             self.send_store: dict[str, torch.Tensor] = {}
@@ -135,8 +156,9 @@ class P2pNcclEngine:
             # tensor_id: torch.Tensor
             self.send_queue: deque[SendQueueItem] = deque()
             if self.send_type == "PUT_ASYNC":
-                self._send_thread = threading.Thread(target=self.send_async,
-                                                     daemon=True)
+                self._send_thread = threading.Thread(
+                    target=self.send_async, daemon=True
+                )
                 self._send_thread.start()
 
         # tensor_id: torch.Tensor/(addr, dtype, shape)
@@ -150,10 +172,12 @@ class P2pNcclEngine:
         self.buffer_size_threshold = float(self.config.kv_buffer_size)
 
         self.nccl_num_channels = self.config.get_from_extra_config(
-            "nccl_num_channels", "8")
+            "nccl_num_channels", "8"
+        )
 
         self._listener_thread = threading.Thread(
-            target=self.listen_for_requests, daemon=True)
+            target=self.listen_for_requests, daemon=True
+        )
         self._listener_thread.start()
 
         self._ping_thread = None
@@ -164,11 +188,18 @@ class P2pNcclEngine:
         logger.info(
             "💯P2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, "
             "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_"
-            "threshold:%.2f, nccl_num_channels:%s", self.rank, self.local_rank,
-            self.http_address, self.zmq_address, self.proxy_address,
-            self.send_type, self.buffer_size_threshold, self.nccl_num_channels)
-
-    def create_connect(self, remote_address: typing.Optional[str] = None):
+            "threshold:%.2f, nccl_num_channels:%s",
+            self.rank,
+            self.local_rank,
+            self.http_address,
+            self.zmq_address,
+            self.proxy_address,
+            self.send_type,
+            self.buffer_size_threshold,
+            self.nccl_num_channels,
+        )
+
+    def create_connect(self, remote_address: str | None = None):
         assert remote_address is not None
         if remote_address not in self.socks:
             sock = self.context.socket(zmq.DEALER)
@@ -176,8 +207,11 @@ class P2pNcclEngine:
             sock.connect(f"tcp://{remote_address}")
             self.socks[remote_address] = sock
             if remote_address in self.comms:
-                logger.info("👋comm exists, remote_address:%s, comms:%s",
-                            remote_address, self.comms)
+                logger.info(
+                    "👋comm exists, remote_address:%s, comms:%s",
+                    remote_address,
+                    self.comms,
+                )
                 return sock, self.comms[remote_address]
 
             unique_id = self.nccl.ncclGetUniqueId()
@@ -187,11 +221,14 @@ class P2pNcclEngine:
             with torch.cuda.device(self.device):
                 rank = 0
                 with set_p2p_nccl_context(self.nccl_num_channels):
-                    comm: ncclComm_t = self.nccl.ncclCommInitRank(
-                        2, unique_id, rank)
+                    comm: ncclComm_t = self.nccl.ncclCommInitRank(2, unique_id, rank)
                 self.comms[remote_address] = (comm, rank)
-                logger.info("🤝ncclCommInitRank Success, %s👉%s, MyRank:%s",
-                            self.zmq_address, remote_address, rank)
+                logger.info(
+                    "🤝ncclCommInitRank Success, %s👉%s, MyRank:%s",
+                    self.zmq_address,
+                    remote_address,
+                    rank,
+                )
 
         return self.socks[remote_address], self.comms[remote_address]
 
@@ -199,7 +236,7 @@ class P2pNcclEngine:
         self,
         tensor_id: str,
         tensor: torch.Tensor,
-        remote_address: typing.Optional[str] = None,
+        remote_address: str | None = None,
     ) -> bool:
         if remote_address is None:
             with self.recv_store_cv:
@@ -207,9 +244,9 @@ class P2pNcclEngine:
                 self.recv_store_cv.notify()
             return True
 
-        item = SendQueueItem(tensor_id=tensor_id,
-                             remote_address=remote_address,
-                             tensor=tensor)
+        item = SendQueueItem(
+            tensor_id=tensor_id, remote_address=remote_address, tensor=tensor
+        )
 
         if self.send_type == "PUT":
             return self.send_sync(item)
@@ -227,37 +264,51 @@ class P2pNcclEngine:
                 logger.warning(
                     "❗[GET]tensor_id:%s, tensor_size:%d, is greater than"
                     "buffer size threshold :%d, skip send to %s, rank:%d",
-                    tensor_id, tensor_size, self.buffer_size_threshold,
-                    remote_address, self.rank)
+                    tensor_id,
+                    tensor_size,
+                    self.buffer_size_threshold,
+                    remote_address,
+                    self.rank,
+                )
                 return False
-            while (self.buffer_size + tensor_size
-                   > self.buffer_size_threshold):
+            while self.buffer_size + tensor_size > self.buffer_size_threshold:
                 assert len(self.send_store) > 0
                 oldest_tensor_id = next(iter(self.send_store))
                 oldest_tensor = self.send_store.pop(oldest_tensor_id)
-                oldest_tensor_size = oldest_tensor.element_size(
-                ) * oldest_tensor.numel()
+                oldest_tensor_size = (
+                    oldest_tensor.element_size() * oldest_tensor.numel()
+                )
                 self.buffer_size -= oldest_tensor_size
                 logger.debug(
                     "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d,"
                     " buffer_size:%d, oldest_tensor_size:%d, rank:%d",
-                    remote_address, tensor_id, tensor_size, self.buffer_size,
-                    oldest_tensor_size, self.rank)
+                    remote_address,
+                    tensor_id,
+                    tensor_size,
+                    self.buffer_size,
+                    oldest_tensor_size,
+                    self.rank,
+                )
 
             self.send_store[tensor_id] = tensor
             self.buffer_size += tensor_size
             logger.debug(
                 "🔵[GET]Send to %s, tensor_id:%s, tensor_size:%d, "
-                "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", remote_address,
-                tensor_id, tensor_size, tensor.shape, self.rank,
+                "shape:%s, rank:%d, buffer_size:%d(%.2f%%)",
+                remote_address,
+                tensor_id,
+                tensor_size,
+                tensor.shape,
+                self.rank,
                 self.buffer_size,
-                self.buffer_size / self.buffer_size_threshold * 100)
+                self.buffer_size / self.buffer_size_threshold * 100,
+            )
         return True
 
     def recv_tensor(
         self,
         tensor_id: str,
-        remote_address: typing.Optional[str] = None,
+        remote_address: str | None = None,
     ) -> torch.Tensor:
         if self.send_type == "PUT" or self.send_type == "PUT_ASYNC":
             start_time = time.time()
@@ -269,17 +320,18 @@ class P2pNcclEngine:
             if tensor is not None:
                 if isinstance(tensor, tuple):
                     addr, dtype, shape = tensor
-                    tensor = self.pool.load_tensor(addr, dtype, shape,
-                                                   self.device)
+                    tensor = self.pool.load_tensor(addr, dtype, shape, self.device)
                 else:
-                    self.buffer_size -= (tensor.element_size() *
-                                         tensor.numel())
+                    self.buffer_size -= tensor.element_size() * tensor.numel()
             else:
                 duration = time.time() - start_time
                 logger.warning(
-                    "🔴[PUT]Recv From %s, tensor_id:%s, duration:%.3fms, "
-                    "rank:%d", remote_address, tensor_id, duration * 1000,
-                    self.rank)
+                    "🔴[PUT]Recv From %s, tensor_id:%s, duration:%.3fms, rank:%d",
+                    remote_address,
+                    tensor_id,
+                    duration * 1000,
+                    self.rank,
+                )
             return tensor
 
         # GET
@@ -298,14 +350,18 @@ class P2pNcclEngine:
         message = sock.recv()
         data = msgpack.loads(message)
         if data["ret"] != 0:
-            logger.warning("🔴[GET]Recv From %s, tensor_id: %s, ret: %d",
-                           remote_address, tensor_id, data["ret"])
+            logger.warning(
+                "🔴[GET]Recv From %s, tensor_id: %s, ret: %d",
+                remote_address,
+                tensor_id,
+                data["ret"],
+            )
             return None
 
         with torch.cuda.stream(self.recv_stream):
-            tensor = torch.empty(data["shape"],
-                                 dtype=getattr(torch, data["dtype"]),
-                                 device=self.device)
+            tensor = torch.empty(
+                data["shape"], dtype=getattr(torch, data["dtype"]), device=self.device
+            )
 
         self.recv(comm, tensor, rank ^ 1, self.recv_stream)
 
@@ -320,38 +376,45 @@ class P2pNcclEngine:
             remote_address, message = self.router_socket.recv_multipart()
             data = msgpack.loads(message)
             if data["cmd"] == "NEW":
-                unique_id = self.nccl.unique_id_from_bytes(
-                    bytes(data["unique_id"]))
+                unique_id = self.nccl.unique_id_from_bytes(bytes(data["unique_id"]))
                 with torch.cuda.device(self.device):
                     rank = 1
                     with set_p2p_nccl_context(self.nccl_num_channels):
                         comm: ncclComm_t = self.nccl.ncclCommInitRank(
-                            2, unique_id, rank)
+                            2, unique_id, rank
+                        )
                     self.comms[remote_address.decode()] = (comm, rank)
-                    logger.info("🤝ncclCommInitRank Success, %s👈%s, MyRank:%s",
-                                self.zmq_address, remote_address.decode(),
-                                rank)
+                    logger.info(
+                        "🤝ncclCommInitRank Success, %s👈%s, MyRank:%s",
+                        self.zmq_address,
+                        remote_address.decode(),
+                        rank,
+                    )
             elif data["cmd"] == "PUT":
                 tensor_id = data["tensor_id"]
                 try:
                     with torch.cuda.stream(self.recv_stream):
-                        tensor = torch.empty(data["shape"],
-                                             dtype=getattr(
-                                                 torch, data["dtype"]),
-                                             device=self.device)
+                        tensor = torch.empty(
+                            data["shape"],
+                            dtype=getattr(torch, data["dtype"]),
+                            device=self.device,
+                        )
                     self.router_socket.send_multipart([remote_address, b"0"])
                     comm, rank = self.comms[remote_address.decode()]
                     self.recv(comm, tensor, rank ^ 1, self.recv_stream)
                     tensor_size = tensor.element_size() * tensor.numel()
-                    if (self.buffer_size + tensor_size
-                            > self.buffer_size_threshold):
+                    if self.buffer_size + tensor_size > self.buffer_size_threshold:
                         # Store Tensor in memory pool
                         addr = self.pool.store_tensor(tensor)
                         tensor = (addr, tensor.dtype, tensor.shape)
                         logger.warning(
                             "🔴[PUT]Recv Tensor, Out Of Threshold, "
-                            "%s👈%s, data:%s, addr:%d", self.zmq_address,
-                            remote_address.decode(), data, addr)
+                            "%s👈%s, data:%s, addr:%d",
+                            self.zmq_address,
+                            remote_address.decode(),
+                            data,
+                            addr,
+                        )
                     else:
                         self.buffer_size += tensor_size
 
@@ -359,9 +422,11 @@ class P2pNcclEngine:
                     self.router_socket.send_multipart([remote_address, b"1"])
                     tensor = None
                     logger.warning(
-                        "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, "
-                        "data:%s", self.zmq_address, remote_address.decode(),
-                        data)
+                        "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, data:%s",
+                        self.zmq_address,
+                        remote_address.decode(),
+                        data,
+                    )
 
                 with self.recv_store_cv:
                     self.recv_store[tensor_id] = tensor
@@ -376,7 +441,7 @@ class P2pNcclEngine:
                         data = {
                             "ret": 0,
                             "shape": tensor.shape,
-                            "dtype": str(tensor.dtype).replace("torch.", "")
+                            "dtype": str(tensor.dtype).replace("torch.", ""),
                         }
                         # LRU
                         self.send_store[tensor_id] = tensor
@@ -384,26 +449,26 @@ class P2pNcclEngine:
                     else:
                         data = {"ret": 1}
 
-                self.router_socket.send_multipart(
-                    [remote_address, msgpack.dumps(data)])
+                self.router_socket.send_multipart([remote_address, msgpack.dumps(data)])
 
                 if data["ret"] == 0:
                     comm, rank = self.comms[remote_address.decode()]
-                    self.send(comm, tensor.to(self.device), rank ^ 1,
-                              self.send_stream)
+                    self.send(comm, tensor.to(self.device), rank ^ 1, self.send_stream)
             else:
                 logger.warning(
                     "🚧Unexpected, Received message from %s, data:%s",
-                    remote_address, data)
+                    remote_address,
+                    data,
+                )
 
     def have_sent_tensor_id(self, tensor_id: str):
-        request_id = tensor_id.split('#')[0]
+        request_id = tensor_id.split("#")[0]
         if request_id not in self.send_request_id_to_tensor_ids:
             self.send_request_id_to_tensor_ids[request_id] = set()
         self.send_request_id_to_tensor_ids[request_id].add(tensor_id)
 
     def have_received_tensor_id(self, tensor_id: str):
-        request_id = tensor_id.split('#')[0]
+        request_id = tensor_id.split("#")[0]
         if request_id not in self.recv_request_id_to_tensor_ids:
             self.recv_request_id_to_tensor_ids[request_id] = set()
         self.recv_request_id_to_tensor_ids[request_id].add(tensor_id)
@@ -427,7 +492,10 @@ class P2pNcclEngine:
             duration = time.time() - start_time
             logger.debug(
                 "🚧[PUT_ASYNC]It took %.3fms to wait for the send_queue"
-                " to be empty, rank:%d", duration * 1000, self.rank)
+                " to be empty, rank:%d",
+                duration * 1000,
+                self.rank,
+            )
 
     def send_sync(self, item: SendQueueItem) -> bool:
         if item.remote_address is None:
@@ -443,7 +511,7 @@ class P2pNcclEngine:
             "cmd": "PUT",
             "tensor_id": item.tensor_id,
             "shape": tensor.shape,
-            "dtype": str(tensor.dtype).replace("torch.", "")
+            "dtype": str(tensor.dtype).replace("torch.", ""),
         }
         sock.send(msgpack.dumps(data))
 
@@ -452,10 +520,14 @@ class P2pNcclEngine:
             logger.error(
                 "🔴Send Tensor, Peer Out Of Memory/Threshold, %s 👉 %s, "
                 "MyRank:%s, data:%s, tensor:%s, size:%fGB, response:%s",
-                self.zmq_address, item.remote_address, rank, data,
+                self.zmq_address,
+                item.remote_address,
+                rank,
+                data,
                 tensor.shape,
                 tensor.element_size() * tensor.numel() / 1024**3,
-                response.decode())
+                response.decode(),
+            )
             return False
 
         self.send(comm, tensor.to(self.device), rank ^ 1, self.send_stream)
@@ -466,8 +538,8 @@ class P2pNcclEngine:
         return True
 
     def get_finished(
-            self, finished_req_ids: set[str], no_compile_layers
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+        self, finished_req_ids: set[str], no_compile_layers
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
@@ -486,10 +558,8 @@ class P2pNcclEngine:
                 if tensor_id in self.recv_store:
                     with self.recv_store_cv:
                         tensor = self.recv_store.pop(tensor_id, None)
-                        self.send_request_id_to_tensor_ids.pop(
-                            request_id, None)
-                        self.recv_request_id_to_tensor_ids.pop(
-                            request_id, None)
+                        self.send_request_id_to_tensor_ids.pop(request_id, None)
+                        self.recv_request_id_to_tensor_ids.pop(request_id, None)
                     if isinstance(tensor, tuple):
                         addr, _, _ = tensor
                         self.pool.free(addr)
@@ -510,7 +580,7 @@ class P2pNcclEngine:
         data = {
             "type": "P" if self.config.is_kv_producer else "D",
             "http_address": self.http_address,
-            "zmq_address": self.zmq_address
+            "zmq_address": self.zmq_address,
         }
         while True:
             sock.send(msgpack.dumps(data))
@@ -519,27 +589,39 @@ class P2pNcclEngine:
     def send(self, comm, tensor: torch.Tensor, dst: int, stream=None):
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
 
         with torch.cuda.stream(stream):
-            self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
-                               ncclDataTypeEnum.from_torch(tensor.dtype), dst,
-                               comm, cudaStream_t(stream.cuda_stream))
+            self.nccl.ncclSend(
+                buffer_type(tensor.data_ptr()),
+                tensor.numel(),
+                ncclDataTypeEnum.from_torch(tensor.dtype),
+                dst,
+                comm,
+                cudaStream_t(stream.cuda_stream),
+            )
         stream.synchronize()
 
     def recv(self, comm, tensor: torch.Tensor, src: int, stream=None):
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
 
         with torch.cuda.stream(stream):
-            self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
-                               ncclDataTypeEnum.from_torch(tensor.dtype), src,
-                               comm, cudaStream_t(stream.cuda_stream))
+            self.nccl.ncclRecv(
+                buffer_type(tensor.data_ptr()),
+                tensor.numel(),
+                ncclDataTypeEnum.from_torch(tensor.dtype),
+                src,
+                comm,
+                cudaStream_t(stream.cuda_stream),
+            )
         stream.synchronize()
 
     def close(self) -> None:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
index 26070488bad898bee7abb7a18665aa094d181912..899f1eae86d277103f5c9cd9b8e16a8059ce4f53 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
@@ -67,8 +67,7 @@ class TensorMemoryPool:
         if max_block_size <= 0 or min_block_size <= 0:
             raise ValueError("Block sizes must be positive")
         if max_block_size < min_block_size:
-            raise ValueError(
-                "Max block size must be greater than min block size")
+            raise ValueError("Max block size must be greater than min block size")
 
         self.max_block_size = self._round_to_power_of_two(max_block_size)
         self.min_block_size = self._round_to_power_of_two(min_block_size)
@@ -91,17 +90,18 @@ class TensorMemoryPool:
             size //= 2
 
     def _allocate_pinned_memory(self):
-        self.base_tensor = torch.empty(self.max_block_size // 4,
-                                       dtype=torch.float32,
-                                       pin_memory=True)
+        self.base_tensor = torch.empty(
+            self.max_block_size // 4, dtype=torch.float32, pin_memory=True
+        )
         self.base_address = self.base_tensor.data_ptr()
-        initial_block = MemoryBlock(size=self.max_block_size,
-                                    addr=self.base_address)
-        self.free_lists[self.max_block_size][
-            initial_block.addr] = initial_block
+        initial_block = MemoryBlock(size=self.max_block_size, addr=self.base_address)
+        self.free_lists[self.max_block_size][initial_block.addr] = initial_block
 
-        logger.debug("TensorMemoryPool, base_address:%d, max_block_size:%d",
-                     self.base_address, self.max_block_size)
+        logger.debug(
+            "TensorMemoryPool, base_address:%d, max_block_size:%d",
+            self.base_address,
+            self.max_block_size,
+        )
 
     def allocate(self, size: int) -> int:
         """Allocates a memory block of at least the requested size.
@@ -118,8 +118,7 @@ class TensorMemoryPool:
         if size <= 0:
             raise ValueError("Allocation size must be positive")
 
-        required_size = self._round_to_power_of_two(
-            max(size, self.min_block_size))
+        required_size = self._round_to_power_of_two(max(size, self.min_block_size))
         if required_size > self.max_block_size:
             raise ValueError("Requested size exceeds maximum block size")
 
@@ -135,8 +134,7 @@ class TensorMemoryPool:
         raise ValueError("Insufficient memory")
 
     def _split_block(self, block: MemoryBlock, required_size: int):
-        while (block.size > required_size
-               and block.size // 2 >= self.min_block_size):
+        while block.size > required_size and block.size // 2 >= self.min_block_size:
             buddy_size = block.size // 2
             buddy_addr = block.addr + buddy_size
 
@@ -165,8 +163,11 @@ class TensorMemoryPool:
         depth = 0
 
         while depth < MAX_MERGE_DEPTH:
-            buddy_offset = block.size if (block.addr - self.base_address) % (
-                2 * block.size) == 0 else -block.size
+            buddy_offset = (
+                block.size
+                if (block.addr - self.base_address) % (2 * block.size) == 0
+                else -block.size
+            )
             buddy_addr = block.addr + buddy_offset
             buddy = self.free_lists[block.size].get(buddy_addr)
             if buddy:
@@ -202,14 +203,14 @@ class TensorMemoryPool:
             self.free(addr)
             raise ValueError(
                 f"Allocated block size {block.size} is smaller than "
-                f"required size {size}")
+                f"required size {size}"
+            )
 
         try:
             buffer = (ctypes.c_byte * block.size).from_address(block.addr)
-            cpu_tensor = torch.frombuffer(buffer,
-                                          dtype=tensor.dtype,
-                                          count=tensor.numel()).reshape(
-                                              tensor.shape)
+            cpu_tensor = torch.frombuffer(
+                buffer, dtype=tensor.dtype, count=tensor.numel()
+            ).reshape(tensor.shape)
         except ValueError as err:
             self.free(addr)
             raise ValueError(f"Failed to create tensor view: {err}") from err
@@ -218,9 +219,13 @@ class TensorMemoryPool:
 
         return addr
 
-    def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int,
-                                                                      ...],
-                    device: torch.device) -> torch.Tensor:
+    def load_tensor(
+        self,
+        addr: int,
+        dtype: torch.dtype,
+        shape: tuple[int, ...],
+        device: torch.device,
+    ) -> torch.Tensor:
         """Loads a tensor from pinned host memory to the specified device.
 
         Args:
@@ -247,8 +252,9 @@ class TensorMemoryPool:
             raise ValueError("Requested tensor size exceeds block size")
 
         buffer = (ctypes.c_byte * block.size).from_address(block.addr)
-        cpu_tensor = torch.frombuffer(buffer, dtype=dtype,
-                                      count=num_elements).reshape(shape)
+        cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements).reshape(
+            shape
+        )
 
         cuda_tensor = torch.empty(shape, dtype=dtype, device=device)
 
@@ -260,7 +266,7 @@ class TensorMemoryPool:
         """Cleans up all memory resources and resets the pool state."""
         self.free_lists.clear()
         self.allocated_blocks.clear()
-        if hasattr(self, 'base_tensor'):
+        if hasattr(self, "base_tensor"):
             del self.base_tensor
 
     def __del__(self):
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index 48fa1a82c67759e757727ffa9cfeb42e417c1714..016d1d45b3593b88bdbb179eb1a5c6cbbf52e4d6 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 import safetensors
@@ -10,7 +10,10 @@ import torch
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import MLACommonMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -19,6 +22,7 @@ if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -35,15 +39,22 @@ class ReqMeta:
     mm_hashes: list[str]
 
     @staticmethod
-    def make_meta(token_ids: list[int], block_ids: list[int], block_size: int,
-                  is_store: bool, mm_hashes: list[str]) -> "ReqMeta":
+    def make_meta(
+        token_ids: list[int],
+        block_ids: list[int],
+        block_size: int,
+        is_store: bool,
+        mm_hashes: list[str],
+    ) -> "ReqMeta":
         valid_num_tokens = align_to_block_size(len(token_ids), block_size)
         token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens]
         block_ids_tensor = torch.tensor(block_ids)
         num_blocks = block_ids_tensor.shape[0]
         block_offsets = torch.arange(0, block_size)
-        slot_mapping = block_offsets.reshape((1, block_size)) + \
-                block_ids_tensor.reshape((num_blocks, 1)) * block_size
+        slot_mapping = (
+            block_offsets.reshape((1, block_size))
+            + block_ids_tensor.reshape((num_blocks, 1)) * block_size
+        )
         slot_mapping = slot_mapping.flatten()[:valid_num_tokens]
         return ReqMeta(
             token_ids=token_ids_tensor,
@@ -55,10 +66,7 @@ class ReqMeta:
 
 @dataclass
 class SharedStorageConnectorMetadata(KVConnectorMetadata):
-    requests: list[ReqMeta]
-
-    def __init__(self):
-        self.requests = []
+    requests: list[ReqMeta] = field(default_factory=list)
 
     def add_request(
         self,
@@ -69,8 +77,8 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata):
         mm_hashes: list[str],
     ) -> None:
         self.requests.append(
-            ReqMeta.make_meta(token_ids, block_ids, block_size, is_store,
-                              mm_hashes))
+            ReqMeta.make_meta(token_ids, block_ids, block_size, is_store, mm_hashes)
+        )
 
 
 class SharedStorageConnector(KVConnectorBase_V1):
@@ -79,19 +87,27 @@ class SharedStorageConnector(KVConnectorBase_V1):
     # It does extra work which will overwrite the existing prefix-cache in GPU
     # - to remove the overhead, need to add some "mask" in the ReqMeta class
 
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(
+            vllm_config=vllm_config,
+            role=role,
+            kv_cache_config=kv_cache_config,
+        )
         self._block_size = vllm_config.cache_config.block_size
         self._requests_need_load: dict[str, Request] = {}
-        transfer_config = vllm_config.kv_transfer_config
-        self._storage_path = transfer_config.get_from_extra_config(
-            "shared_storage_path", "/tmp")
-        logger.info(vllm_config.kv_transfer_config)
+        self._storage_path = self._kv_transfer_config.get_from_extra_config(
+            "shared_storage_path", "/tmp"
+        )
+        logger.info(self._kv_transfer_config)
         logger.info("Shared storage path is %s", self._storage_path)
 
-    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs: Any) -> None:
-        """Start loading the KV cache from the connector buffer to vLLM's 
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
+        """Start loading the KV cache from the connector buffer to vLLM's
         paged KV buffer.
 
         Args:
@@ -99,7 +115,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
             **kwargs: additional arguments for the load operation
 
         Note:
-            The number of elements in kv_caches and layer_names should be 
+            The number of elements in kv_caches and layer_names should be
             the same.
         """
         attn_metadata = forward_context.attn_metadata
@@ -112,13 +128,13 @@ class SharedStorageConnector(KVConnectorBase_V1):
             """Inject the KV cache into the layer.
 
             Args:
-                dst_kv_cache_layer (torch.Tensor): the destination KV cache 
-                    layer. In shape [2, num_pages, page_size, xxx] if not 
+                dst_kv_cache_layer (torch.Tensor): the destination KV cache
+                    layer. In shape [2, num_pages, page_size, xxx] if not
                     using MLA, [num_pages, page_size, xxx] otherwise.
                 src_kv_cache (torch.Tensor): the source KV cache. In shape
-                    [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx] 
+                    [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx]
                     otherwise.
-                slot_mapping (torch.Tensor): the slot mapping. In shape 
+                slot_mapping (torch.Tensor): the slot mapping. In shape
                     [num_tokens].
             """
             dst_kv_cache_layer_shape = dst_kv_cache_layer.shape
@@ -126,14 +142,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
                 num_pages = dst_kv_cache_layer_shape[0]
                 page_size = dst_kv_cache_layer_shape[1]
                 dst_kv_cache_layer = dst_kv_cache_layer.reshape(
-                    num_pages * page_size, -1)
+                    num_pages * page_size, -1
+                )
                 dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache
                 dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
             else:
                 num_pages = dst_kv_cache_layer_shape[1]
                 page_size = dst_kv_cache_layer_shape[2]
                 dst_kv_cache_layer = dst_kv_cache_layer.reshape(
-                    2, num_pages * page_size, -1)
+                    2, num_pages * page_size, -1
+                )
                 dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache
                 dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
 
@@ -149,40 +167,39 @@ class SharedStorageConnector(KVConnectorBase_V1):
 
         attn_metadata = forward_context.attn_metadata
         if attn_metadata is None:
-            logger.warning(
-                "In connector.start_load_kv, but the attn_metadata is None")
+            logger.warning("In connector.start_load_kv, but the attn_metadata is None")
             return
 
         # Load the KV for each request each layer
         for request in metadata.requests:
             if request.is_store:
                 continue
-            logger.info("Inject KV cache of %d tokens to the paged memory",
-                        len(request.slot_mapping))
+            logger.info(
+                "Inject KV cache of %d tokens to the paged memory",
+                len(request.slot_mapping),
+            )
             for layer_name in forward_context.no_compile_layers:
                 layer = forward_context.no_compile_layers[layer_name]
 
                 # Only process layers that have kv_cache
                 # attribute (attention layers) Skip non-attention
                 # layers like FusedMoE/MLP etc.
-                kv_cache_attr = getattr(layer, 'kv_cache', None)
+                kv_cache_attr = getattr(layer, "kv_cache", None)
                 if kv_cache_attr is None:
                     continue
 
-                kv_cache_layer = kv_cache_attr[ \
-                        forward_context.virtual_engine]
+                kv_cache_layer = kv_cache_attr[forward_context.virtual_engine]
 
                 filename = self._generate_filename_debug(
-                    layer_name, request.token_ids, request.mm_hashes)
-                kv_cache = safetensors.torch.load_file(
-                    filename)["kv_cache"].cuda()
-                inject_kv_into_layer(kv_cache_layer, kv_cache,
-                                     request.slot_mapping)
+                    layer_name, request.token_ids, request.mm_hashes
+                )
+                kv_cache = safetensors.torch.load_file(filename)["kv_cache"].cuda()
+                inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping)
 
     def wait_for_layer_load(self, layer_name: str) -> None:
         """Blocking until the KV for a specific layer is loaded into vLLM's
-        paged buffer. 
-        
+        paged buffer.
+
         This interface will be useful for layer-by-layer pipelining.
 
         Args:
@@ -190,15 +207,19 @@ class SharedStorageConnector(KVConnectorBase_V1):
         """
         return
 
-    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata",
-                      **kwargs: Any) -> None:
-        """Start saving the KV cache of the layer from vLLM's paged buffer 
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: "AttentionMetadata",
+        **kwargs: Any,
+    ) -> None:
+        """Start saving the KV cache of the layer from vLLM's paged buffer
         to the connector.
 
         Args:
             layer_name (str): the name of the layer.
-            kv_layer (torch.Tensor): the paged KV buffer of the current 
+            kv_layer (torch.Tensor): the paged KV buffer of the current
                 layer in vLLM.
             attn_metadata (AttentionMetadata): the attention metadata.
             **kwargs: additional arguments for the save operation.
@@ -215,20 +236,18 @@ class SharedStorageConnector(KVConnectorBase_V1):
             """
             if isinstance(attn_metadata, MLACommonMetadata):
                 num_pages, page_size = layer.shape[0], layer.shape[1]
-                return layer.reshape(num_pages * page_size, -1)[slot_mapping,
-                                                                ...]
+                return layer.reshape(num_pages * page_size, -1)[slot_mapping, ...]
             num_pages, page_size = layer.shape[1], layer.shape[2]
-            return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping,
-                                                               ...]
+            return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...]
 
         connector_metadata = self._get_connector_metadata()
         assert isinstance(connector_metadata, SharedStorageConnectorMetadata)
         for request in connector_metadata.requests:
             if request.is_store:
                 filename = self._generate_filename_debug(
-                    layer_name, request.token_ids, request.mm_hashes)
-                kv_cache = extract_kv_from_layer(kv_layer,
-                                                 request.slot_mapping)
+                    layer_name, request.token_ids, request.mm_hashes
+                )
+                kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping)
                 tensors = {"kv_cache": kv_cache.detach().cpu()}
                 safetensors.torch.save_file(tensors, filename)
 
@@ -239,18 +258,18 @@ class SharedStorageConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
-        
+
         Args:
             request (Request): the request object.
             num_computed_tokens (int): the number of locally
                 computed tokens for this request
 
         Returns:
-            the number of tokens that can be loaded from the 
+            the number of tokens that can be loaded from the
             external KV cache beyond what is already computed.
         """
         # NOTE: in this debug implementation, we assume that the prompt is
@@ -267,14 +286,14 @@ class SharedStorageConnector(KVConnectorBase_V1):
 
         # Now, first num_tokens_to_check tokens are hit, we need to prepare
         # the metadata for the worker connector to correctly load the KV
-        num_tokens_to_check = align_to_block_size(
-            len(request.prompt_token_ids) - 1, self._block_size)
+        token_ids = request.prompt_token_ids or []
+        num_tokens_to_check = align_to_block_size(len(token_ids) - 1, self._block_size)
 
         return num_tokens_to_check - num_computed_tokens, False
 
-    def update_state_after_alloc(self, request: "Request",
-                                 blocks: "KVCacheBlocks",
-                                 num_external_tokens: int):
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
         """
         Update KVConnector state after block allocation.
 
@@ -300,57 +319,61 @@ class SharedStorageConnector(KVConnectorBase_V1):
 
         total_need_load = 0
         for new_req in scheduler_output.scheduled_new_reqs:
+            token_ids = new_req.prompt_token_ids or []
+            mm_hashes = [f.identifier for f in new_req.mm_features]
             if new_req.req_id in self._requests_need_load:
                 meta.add_request(
-                    token_ids=new_req.prompt_token_ids,
+                    token_ids=token_ids,
                     block_ids=new_req.block_ids[0],
                     block_size=self._block_size,
                     is_store=False,
-                    mm_hashes=[f.identifier for f in new_req.mm_features])
+                    mm_hashes=mm_hashes,
+                )
                 total_need_load += 1
             else:
                 # NOTE: here, we set the store and load being exclusive,
                 # but a single request can have both store and load.
                 # NOTE(rob): for this debug implementation, we only cache
                 # the original prompt tokens.
-                if not self._found_match_for_request(new_req):
+                if not self._found_match_for_prompt(token_ids, mm_hashes):
                     meta.add_request(
-                        token_ids=new_req.prompt_token_ids,
+                        token_ids=token_ids,
                         block_ids=new_req.block_ids[0],
                         block_size=self._block_size,
                         is_store=True,
-                        mm_hashes=[f.identifier for f in new_req.mm_features])
+                        mm_hashes=mm_hashes,
+                    )
 
         cached_reqs = scheduler_output.scheduled_cached_reqs
         for i, req_id in enumerate(cached_reqs.req_ids):
+            resumed_from_preemption = req_id in cached_reqs.resumed_req_ids
+            if not resumed_from_preemption or req_id not in self._requests_need_load:
+                continue
+
             num_computed_tokens = cached_reqs.num_computed_tokens[i]
             num_new_tokens = scheduler_output.num_scheduled_tokens[req_id]
             new_block_ids = cached_reqs.new_block_ids[i]
-            resumed_from_preemption = cached_reqs.resumed_from_preemption[i]
-
-            # NOTE(rob): here we rely on the resumed requests being
-            # the first N requests in the list scheduled_cache_reqs.
-            if not resumed_from_preemption:
-                break
-            if req_id in self._requests_need_load:
-                # NOTE(rob): cached_req_data does not have the full
-                # list of token ids (only new tokens). So we look it
-                # up in the actual request object.
-                request = self._requests_need_load[req_id]
-                total_tokens = num_computed_tokens + num_new_tokens
-                token_ids = request.all_token_ids[:total_tokens]
-
-                # NOTE(rob): For resumed req, new_block_ids is all
-                # of the block_ids for the request.
-                block_ids = new_block_ids[0]
 
-                meta.add_request(
-                    token_ids=token_ids,
-                    block_ids=block_ids,
-                    block_size=self._block_size,
-                    is_store=False,
-                    mm_hashes=[f.identifier for f in request.mm_features])
-                total_need_load += 1
+            # NOTE(rob): cached_req_data does not have the full
+            # list of token ids (only new tokens). So we look it
+            # up in the actual request object.
+            request = self._requests_need_load[req_id]
+            total_tokens = num_computed_tokens + num_new_tokens
+            token_ids = request.all_token_ids[:total_tokens]
+
+            # NOTE(rob): For resumed req, new_block_ids is all
+            # of the block_ids for the request.
+            assert new_block_ids is not None
+            block_ids = new_block_ids[0]
+
+            meta.add_request(
+                token_ids=token_ids,
+                block_ids=block_ids,
+                block_size=self._block_size,
+                is_store=False,
+                mm_hashes=[f.identifier for f in request.mm_features],
+            )
+            total_need_load += 1
 
         assert total_need_load == len(self._requests_need_load)
         self._requests_need_load.clear()
@@ -364,14 +387,25 @@ class SharedStorageConnector(KVConnectorBase_V1):
         self,
         request: "Request",
     ) -> bool:
-        """Check if the cache is hit for the request.
-        """
+        """Check if the cache is hit for the request."""
+        return self._found_match_for_prompt(
+            list(request.prompt_token_ids or []),
+            [f.identifier for f in request.mm_features],
+        )
+
+    def _found_match_for_prompt(
+        self,
+        prompt_token_ids: list[int],
+        mm_hashes: list[str],
+    ) -> bool:
         num_tokens_to_check = align_to_block_size(
-            len(request.prompt_token_ids) - 1, self._block_size)
+            len(prompt_token_ids) - 1, self._block_size
+        )
         foldername = self._generate_foldername_debug(
-            torch.tensor(request.prompt_token_ids)[:num_tokens_to_check],
-            [f.identifier for f in request.mm_features],
-            create_folder=False)
+            torch.tensor(prompt_token_ids)[:num_tokens_to_check],
+            mm_hashes,
+            create_folder=False,
+        )
         return os.path.exists(foldername)
 
     def _generate_foldername_debug(
@@ -380,7 +414,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
         mm_hashes: list[str],
         create_folder=False,
     ) -> str:
-        """Generate a folder name based on the hash of the bytes of the input 
+        """Generate a folder name based on the hash of the bytes of the input
         ids.
         """
         token_bytes = token_ids.numpy().tobytes()
@@ -388,9 +422,8 @@ class SharedStorageConnector(KVConnectorBase_V1):
         # to create a canonical key.
         if mm_hashes:
             mm_str = "-".join(mm_hashes)
-            token_bytes += mm_str.encode('utf-8')
-        input_ids_hash = hashlib.md5(token_bytes,
-                                     usedforsecurity=False).hexdigest()
+            token_bytes += mm_str.encode("utf-8")
+        input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest()
 
         foldername = os.path.join(self._storage_path, input_ids_hash)
         if create_folder:
@@ -403,16 +436,15 @@ class SharedStorageConnector(KVConnectorBase_V1):
         token_ids: torch.Tensor,
         mm_hashes: list[str],
     ) -> str:
-        """Generate a file name based on the layer name and the hash 
+        """Generate a file name based on the layer name and the hash
         of the bytes of the input ids.
         """
-        foldername = self._generate_foldername_debug(token_ids,
-                                                     mm_hashes=mm_hashes,
-                                                     create_folder=True)
+        foldername = self._generate_foldername_debug(
+            token_ids, mm_hashes=mm_hashes, create_folder=True
+        )
         return os.path.join(foldername, f"{layer_name}.safetensors")
 
 
 def align_to_block_size(num_tokens: int, block_size) -> int:
-    """Align the number of tokens to the block size.
-    """
+    """Align the number of tokens to the block size."""
     return (num_tokens - 1) // block_size * block_size
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
index eef14269f1961b94d4ec8532c3a928c2657a539c..f48d03d0b0cd5f705a8242f30f16da8d1ce18656 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
@@ -14,7 +14,6 @@ These classes above are abstracted behind class `KVCacheBufferBase`.
 """
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -42,39 +41,44 @@ class KVLookupBufferBase(KVCacheBufferBase):
     Abstract base class for a KVCache lookup buffer.
 
     This class provides an abstraction for a key-value (KV) cache lookup buffer.
-    
+
     The key of the lookup buffer:
     - input_tokens: token IDs of the request
     - roi: a binary mask on top of input_tokens.
-      - Purpose of roi: Since KV cache may only be available for a subset of 
-        tokens in the input (for example, when vLLM is connected to an external 
-        KV cache service), roi specifies the subset of tokens that the KV cache 
+      - Purpose of roi: Since KV cache may only be available for a subset of
+        tokens in the input (for example, when vLLM is connected to an external
+        KV cache service), roi specifies the subset of tokens that the KV cache
         is associated with.
-      - NOTE: roi can be further extended to describe which part of KV the 
-        current process is holding (each process may only hold a part of KV 
+      - NOTE: roi can be further extended to describe which part of KV the
+        current process is holding (each process may only hold a part of KV
         due to TP and PP). This is not implemented for now.
-        
+
     The value of the lookup buffer:
     - key: the key tensor in the KV cache
     - value: the value tensor in the KV cache
-    - hidden: the final hidden state generated by model forwarding. This allows 
+    - hidden: the final hidden state generated by model forwarding. This allows
       vLLM to bypass further model forwarding by transmitting the hidden state.
     """
 
     @abstractmethod
-    def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor,
-               key: torch.Tensor, value: torch.Tensor,
-               hidden: torch.Tensor) -> None:
+    def insert(
+        self,
+        input_tokens: torch.Tensor,
+        roi: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        hidden: torch.Tensor,
+    ) -> None:
         """Insert into the lookup buffer.
-        
+
         The functionality is similar to the following python statement
         ```
         buffer[input_tokens, roi] = [key, value, hidden]
         ```
-        
+
         FIXME: in the future, we should only have two arguments, key and value,
         where key is a tensor dict and value is a tensor dict.
-        
+
         FIXME: we should transmit both sampler outputs and the hidden states.
 
         Args:
@@ -82,8 +86,8 @@ class KVLookupBufferBase(KVCacheBufferBase):
             roi (torch.Tensor): A binary mask on top of the input tokens
             key (torch.Tensor): The key tensor in the KV cache.
             value (torch.Tensor): The value tensor in the KV cache.
-            hidden (torch.Tensor): The final hidden state tensor generated 
-                                   during model forwarding to bypass model 
+            hidden (torch.Tensor): The final hidden state tensor generated
+                                   during model forwarding to bypass model
                                    forwarding.
 
         Raises:
@@ -93,16 +97,16 @@ class KVLookupBufferBase(KVCacheBufferBase):
 
     @abstractmethod
     def drop_select(
-            self, input_tokens: Optional[torch.Tensor],
-            roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]:
+        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
+    ) -> list[torch.Tensor | None]:
         """Select and *drop* KV cache entries from the lookup buffer.
-        
+
         The functionality is similar to the following python statements
         ```
         ret = buffer.pop(input_tokens, roi)
         return ret
         ```
-        
+
         If `input_tokens` and `roi` is `None`, it means selecting any of the
         KV caches in the buffer, return, and remove it from the buffer, useful
         when offloading KV cache to KV cache storage service.
@@ -138,7 +142,7 @@ class KVStoreBufferBase(KVCacheBufferBase):
     def put(
         self,
         key: str,
-        value: Optional[torch.Tensor],
+        value: torch.Tensor | None,
     ) -> None:
         """Store a key-value pair in the buffer.
 
@@ -158,7 +162,7 @@ class KVStoreBufferBase(KVCacheBufferBase):
     def get(
         self,
         key: str,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """Retrieve a value from the buffer by key.
 
         Args:
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
index 4381aad1e9956823f1f1f4a03a9a4784cd55adcb..7861bea1f9c5402e934a14471a86361cf90665d0 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
@@ -6,18 +6,17 @@ think of KV cache transfer operations as putting new KV cache entries
 into a remote KVStore-based lookup buffer and getting existing KV caches
 from this remote lookup buffer.
 """
+
 import json
 import os
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 from safetensors.torch import load as safetensors_load
 from safetensors.torch import save as safetensors_save
 
 from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_lookup_buffer.base import (
-    KVStoreBufferBase)
+from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase
 from vllm.logger import init_logger
 
 DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200  # 3.125 GiB
@@ -37,65 +36,69 @@ class MooncakeStoreConfig:
     master_server_address: str
 
     @staticmethod
-    def from_file(file_path: str) -> 'MooncakeStoreConfig':
+    def from_file(file_path: str) -> "MooncakeStoreConfig":
         """Load the config from a JSON file."""
         with open(file_path) as fin:
             config = json.load(fin)
         return MooncakeStoreConfig(
             local_hostname=config.get("local_hostname"),
             metadata_server=config.get("metadata_server"),
-            global_segment_size=config.get("global_segment_size",
-                                           DEFAULT_GLOBAL_SEGMENT_SIZE),
-            local_buffer_size=config.get("local_buffer_size",
-                                         DEFAULT_LOCAL_BUFFER_SIZE),
+            global_segment_size=config.get(
+                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
+            ),
+            local_buffer_size=config.get(
+                "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
+            ),
             protocol=config.get("protocol", "tcp"),
             device_name=config.get("device_name", ""),
             master_server_address=config.get("master_server_address"),
         )
 
     @staticmethod
-    def load_from_env() -> 'MooncakeStoreConfig':
+    def load_from_env() -> "MooncakeStoreConfig":
         """Load config from a file specified in the environment variable."""
-        config_file_path = os.getenv('MOONCAKE_CONFIG_PATH')
+        config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
         if config_file_path is None:
             raise ValueError(
-                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set.")
+                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
+            )
         return MooncakeStoreConfig.from_file(config_file_path)
 
 
 class MooncakeStore(KVStoreBufferBase):
-
     def __init__(
         self,
         config: VllmConfig,
     ):
-
         try:
             from mooncake.store import MooncakeDistributedStore
         except ImportError as e:
             raise ImportError(
                 "Please install mooncake by following the instructions at "
                 "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
-                "to run vLLM with MooncakeConnector.") from e
+                "to run vLLM with MooncakeConnector."
+            ) from e
 
         try:
             self.store = MooncakeDistributedStore()
             self.config = MooncakeStoreConfig.load_from_env()
             logger.info("Mooncake Configuration loaded successfully.")
 
-            self.store.setup(self.config.local_hostname,
-                             self.config.metadata_server,
-                             self.config.global_segment_size,
-                             self.config.local_buffer_size,
-                             self.config.protocol, self.config.device_name,
-                             self.config.master_server_address)
+            self.store.setup(
+                self.config.local_hostname,
+                self.config.metadata_server,
+                self.config.global_segment_size,
+                self.config.local_buffer_size,
+                self.config.protocol,
+                self.config.device_name,
+                self.config.master_server_address,
+            )
 
         except ValueError as e:
             logger.error("Configuration loading failed: %s", e)
             raise
         except Exception as exc:
-            logger.error(
-                "An error occurred while loading the configuration: %s", exc)
+            logger.error("An error occurred while loading the configuration: %s", exc)
             raise
 
     def close(self):
@@ -106,7 +109,7 @@ class MooncakeStore(KVStoreBufferBase):
     def put(
         self,
         key: str,
-        value: Optional[torch.Tensor],
+        value: torch.Tensor | None,
     ) -> None:
         # A message queue needs to be introduced before making it asynchronous.
         if value is not None:
@@ -115,7 +118,7 @@ class MooncakeStore(KVStoreBufferBase):
     def get(
         self,
         key: str,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         # A message queue needs to be introduced before making it asynchronous.
         value = self._get_impl(key)
         return value
@@ -126,12 +129,9 @@ class MooncakeStore(KVStoreBufferBase):
         value: torch.Tensor,
     ) -> None:
         """Put KVCache to Mooncake Store"""
-        device_id = value.device.index if value.device.type == 'cuda' else -1
+        device_id = value.device.index if value.device.type == "cuda" else -1
         device_tensor = torch.tensor(device_id, dtype=torch.int32)
-        value_bytes = safetensors_save({
-            "tensor": value,
-            "device_id": device_tensor
-        })
+        value_bytes = safetensors_save({"tensor": value, "device_id": device_tensor})
         try:
             self.store.put(key, value_bytes)
         except TypeError as err:
@@ -141,7 +141,7 @@ class MooncakeStore(KVStoreBufferBase):
     def _get_impl(
         self,
         key: str,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """Get KVCache from Mooncake Store"""
         try:
             data = self.store.get(key)
@@ -154,8 +154,11 @@ class MooncakeStore(KVStoreBufferBase):
             tensor = loaded_tensors["tensor"]
             device_id_tensor = loaded_tensors["device_id"]
             device_id = int(device_id_tensor.item())
-            device = torch.device(
-                'cuda', device_id) if device_id >= 0 else torch.device('cpu')
+            device = (
+                torch.device("cuda", device_id)
+                if device_id >= 0
+                else torch.device("cpu")
+            )
             return tensor.to(device)
 
         return None
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index a0ff7c320f61e6a07649ff0885323ed16d8a85dc..f046a349874e634fb9b40049dccbb79d77eb0bfd 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -1,23 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-    Implements a distributed key-value (KV) cache transfer mechanism.
-
-    Key Features:
-    - Distributed KV cache transmission using PyNccl pipes.
-    - Non-blocking `insert`, blocking `drop_select`.
-    - Use CPU signal pipe to avoid racing condition
-    - Handles buffer size constraints and provide backpressure mechanism to
-      stop the prefill instance when the decode instance is slow.
+Implements a distributed key-value (KV) cache transfer mechanism.
+
+Key Features:
+- Distributed KV cache transmission using PyNccl pipes.
+- Non-blocking `insert`, blocking `drop_select`.
+- Use CPU signal pipe to avoid racing condition
+- Handles buffer size constraints and provide backpressure mechanism to
+  stop the prefill instance when the decode instance is slow.
 """
+
 import threading
 from collections import deque
-from typing import Optional, Union
 
 import torch
 
-from vllm.distributed.kv_transfer.kv_lookup_buffer.base import (
-    KVLookupBufferBase)
+from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase
 from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
 from vllm.logger import init_logger
 
@@ -25,9 +24,9 @@ logger = init_logger(__name__)
 
 
 class SimpleBuffer(KVLookupBufferBase):
-
-    def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase,
-                 buffer_size_thresh: float):
+    def __init__(
+        self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float
+    ):
         """
         signal_pipe: on CPU
 
@@ -46,14 +45,16 @@ class SimpleBuffer(KVLookupBufferBase):
         self.buffer_cv = threading.Condition()
         self.signal_pipe = signal_pipe
         self.data_pipe = data_pipe
-        self.request_handling_thread: Optional[threading.Thread] = None
+        self.request_handling_thread: threading.Thread | None = None
 
         self.normal_signal = torch.tensor([0], device="cpu")
         self.end_signal = None
 
-    def _matches(self, tokens_roi_sender: list[torch.Tensor],
-                 tokens_roi_recver: list[torch.Tensor]):
-
+    def _matches(
+        self,
+        tokens_roi_sender: list[torch.Tensor],
+        tokens_roi_recver: list[torch.Tensor],
+    ):
         # tokens_roi_sender: tokens and roi of the producer (in the buffer)
         # tokens_roi_recver: tokens and roi of the consumer (query)
 
@@ -74,23 +75,19 @@ class SimpleBuffer(KVLookupBufferBase):
 
         # simple common prefix matching
         min_length = min(len(tokens_sender), len(tokens_recver))
-        if torch.allclose(tokens_sender[:min_length],
-                          tokens_recver[:min_length]):
+        if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]):
             return min_length
 
         return 0
 
-    def _send_tensor_and_dec_size(self,
-                                  tensor: Optional[torch.Tensor]) -> None:
-
+    def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None:
         assert tensor is not None, "Use self.data_pipe.send(None) instead"
         self.buffer_size -= tensor.element_size() * tensor.numel()
         if tensor.dtype == torch.bool:
             tensor = tensor.float()
         self.data_pipe.send_tensor(tensor)
 
-    def _get_element_size(self, data: Optional[Union[list, torch.Tensor]]):
-
+    def _get_element_size(self, data: list | torch.Tensor | None):
         if isinstance(data, torch.Tensor):
             return data.element_size() * data.numel()
         if not data:
@@ -100,10 +97,14 @@ class SimpleBuffer(KVLookupBufferBase):
 
         raise AssertionError(f"Unknown data type {type(data)}")
 
-    def _add_to_buffer(self, input_tokens: torch.Tensor, roi: torch.Tensor,
-                       key: torch.Tensor, value: torch.Tensor,
-                       hidden: torch.Tensor):
-
+    def _add_to_buffer(
+        self,
+        input_tokens: torch.Tensor,
+        roi: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        hidden: torch.Tensor,
+    ):
         if isinstance(input_tokens, torch.Tensor):
             input_tokens = input_tokens.clone()
         if isinstance(roi, torch.Tensor):
@@ -134,9 +135,7 @@ class SimpleBuffer(KVLookupBufferBase):
         return signal is None
 
     def drop_select_handler(self):
-
         try:
-
             while True:
                 signal = self.signal_pipe.recv_tensor()
                 if self._is_end_signal(signal):
@@ -146,20 +145,21 @@ class SimpleBuffer(KVLookupBufferBase):
                 input_tokens = self.data_pipe.recv_tensor()
 
                 roi = self.data_pipe.recv_tensor()
-                assert roi is not None, "Please provide the roi when sending "\
-                    "drop-select request"
-                roi = (roi > 0.5)
+                assert roi is not None, (
+                    "Please provide the roi when sending drop-select request"
+                )
+                roi = roi > 0.5
                 tokens_roi_recver = [input_tokens, roi]
 
                 def is_buffer_available(
-                    tokens_roi_recver: list[torch.Tensor], ) -> bool:
+                    tokens_roi_recver: list[torch.Tensor],
+                ) -> bool:
                     # perform input tokens and roi matching
                     # FIXME: this matching is O(n), ideally it should be O(1)
                     # but this buffer size won't (and shouldn't) be too large so
                     # the fix is not urgent.
                     for _ in range(len(self.buffer)):
-                        if self._matches(self.buffer[0],
-                                         tokens_roi_recver) > 0:
+                        if self._matches(self.buffer[0], tokens_roi_recver) > 0:
                             return True
                         # rotate the element we just accessed to the end
                         self.buffer.rotate(-1)
@@ -167,8 +167,7 @@ class SimpleBuffer(KVLookupBufferBase):
 
                 with self.buffer_cv:
                     while not is_buffer_available(tokens_roi_recver):
-                        logger.debug(
-                            "KV transfer buffer is not available. Waiting...")
+                        logger.debug("KV transfer buffer is not available. Waiting...")
                         self.buffer_cv.wait()
                     # need to clone the tensor
                     # in case the tensor is freed before sending finishes
@@ -178,18 +177,18 @@ class SimpleBuffer(KVLookupBufferBase):
                     self.buffer_cv.notify()
 
         except RuntimeError as e:
-            if 'Connection closed by peer' not in str(e):
+            if "Connection closed by peer" not in str(e):
                 raise e
 
         logger.debug("Closing drop_select_handler")
 
     def drop_select(
-            self, input_tokens: Optional[torch.Tensor],
-            roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]:
-
-        assert self.request_handling_thread is None, \
-            "drop_select should be called by the KV cache consumer "\
+        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
+    ) -> list[torch.Tensor | None]:
+        assert self.request_handling_thread is None, (
+            "drop_select should be called by the KV cache consumer "
             "(e.g. the decode vLLM instance)"
+        )
 
         if isinstance(input_tokens, torch.Tensor):
             input_tokens = input_tokens.clone()
@@ -205,30 +204,36 @@ class SimpleBuffer(KVLookupBufferBase):
         if roi is not None:
             # convert from float tensor to bool tensor
             # as PyNccl does not support sending bool tensor
-            roi = (roi > 0.5)
+            roi = roi > 0.5
         key = self.data_pipe.recv_tensor()
         value = self.data_pipe.recv_tensor()
         hidden = self.data_pipe.recv_tensor()
 
         return [input_tokens, roi, key, value, hidden]
 
-    def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor,
-               key: torch.Tensor, value: torch.Tensor,
-               hidden: torch.Tensor) -> None:
-
+    def insert(
+        self,
+        input_tokens: torch.Tensor,
+        roi: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        hidden: torch.Tensor,
+    ) -> None:
         self._add_to_buffer(input_tokens, roi, key, value, hidden)
 
         # when calling the insert, the current process is a sender
         # need to launch the request handler and start listening to request.
         if self.request_handling_thread is None:
             self.request_handling_thread = threading.Thread(
-                target=self.drop_select_handler)
+                target=self.drop_select_handler
+            )
             self.request_handling_thread.start()
 
     def close(self):
-
-        if hasattr(self, "request_handling_thread"
-                   ) and self.request_handling_thread is not None:
+        if (
+            hasattr(self, "request_handling_thread")
+            and self.request_handling_thread is not None
+        ):
             self.request_handling_thread.join()
 
         else:
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index 1423fd032477eae7255806546c79d1e080ec4cef..1fe7a90e9a712fac9981eacb9e011ed501cc0aa9 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -12,7 +12,6 @@ you can bypass this interface and directly start from `kv_lookup_buffer`.
 """
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -24,13 +23,13 @@ class KVPipeBase(ABC):
     """
 
     @abstractmethod
-    def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
+    def send_tensor(self, tensor: torch.Tensor | None) -> None:
         """Send a tensor, or None, via the pipe.
-        
+
         Need to support sending None -- important for error handling.
-        
-        TODO: add a `key` argument so that we can use traditional 
-        key-value database as the distributed communication mechanism behind 
+
+        TODO: add a `key` argument so that we can use traditional
+        key-value database as the distributed communication mechanism behind
         the pipe.
 
         Args:
@@ -42,11 +41,11 @@ class KVPipeBase(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def recv_tensor(self) -> Optional[torch.Tensor]:
+    def recv_tensor(self) -> torch.Tensor | None:
         """Receive a tensor (can be None) from the pipeline.
 
         Returns:
-            Optional[torch.Tensor]: The tensor received from the pipeline. Can 
+            Optional[torch.Tensor]: The tensor received from the pipeline. Can
                                     be None.
 
         Raises:
@@ -58,7 +57,7 @@ class KVPipeBase(ABC):
     def close(self) -> None:
         """Close the pipeline and release resources.
 
-        This method is responsible for closing the communication pipeline 
+        This method is responsible for closing the communication pipeline
         and releasing any resources associated with it.
 
         Raises:
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 2a434e280179ea1f4e571d52036d91098156f259..542dde09abad4a32cfcb7ac198f473c0c1c03324 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -6,7 +6,6 @@ import os
 import struct
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 import zmq
@@ -16,7 +15,7 @@ from safetensors.torch import save as safetensors_save
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
 from vllm.logger import init_logger
-from vllm.utils import join_host_port, make_zmq_path, split_host_port
+from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port
 
 logger = init_logger(__name__)
 NONE_INT = -150886311
@@ -26,13 +25,13 @@ NONE_INT = -150886311
 class MooncakeTransferEngineConfig:
     prefill_url: str
     decode_url: str
-    metadata_backend: Union[str, None]
+    metadata_backend: str | None
     metadata_server: str
     protocol: str
     device_name: str
 
     @staticmethod
-    def from_file(file_path: str) -> 'MooncakeTransferEngineConfig':
+    def from_file(file_path: str) -> "MooncakeTransferEngineConfig":
         """Load the config from a JSON file."""
         with open(file_path) as fin:
             config = json.load(fin)
@@ -46,12 +45,13 @@ class MooncakeTransferEngineConfig:
         )
 
     @staticmethod
-    def load_from_env() -> 'MooncakeTransferEngineConfig':
+    def load_from_env() -> "MooncakeTransferEngineConfig":
         """Load config from a file specified in the environment variable."""
-        config_file_path = os.getenv('MOONCAKE_CONFIG_PATH')
+        config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
         if config_file_path is None:
             raise ValueError(
-                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set.")
+                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
+            )
         return MooncakeTransferEngineConfig.from_file(config_file_path)
 
 
@@ -65,7 +65,8 @@ class MooncakeTransferEngine:
             raise ImportError(
                 "Please install mooncake by following the instructions at "
                 "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
-                "to run vLLM with MooncakeConnector.") from e
+                "to run vLLM with MooncakeConnector."
+            ) from e
 
         self.engine = TransferEngine()
         self.local_rank = local_rank
@@ -77,16 +78,13 @@ class MooncakeTransferEngine:
             logger.error(e)
             raise
         except Exception as exc:
-            logger.error(
-                "An error occurred while loading the configuration: %s", exc)
+            logger.error("An error occurred while loading the configuration: %s", exc)
             raise
-        prefill_host, base_prefill_port = split_host_port(
-            self.config.prefill_url)
+        prefill_host, base_prefill_port = split_host_port(self.config.prefill_url)
         decode_host, base_decode_port = split_host_port(self.config.decode_url)
 
         # Avoid ports conflict when running prefill and decode on the same node
-        if prefill_host == decode_host and \
-                base_prefill_port == base_decode_port:
+        if prefill_host == decode_host and base_prefill_port == base_decode_port:
             base_decode_port = base_decode_port + 100
 
         prefill_port = base_prefill_port + self.local_rank
@@ -94,12 +92,15 @@ class MooncakeTransferEngine:
         self.prefill_url = join_host_port(prefill_host, prefill_port)
         self.decode_url = join_host_port(decode_host, decode_port)
 
-        self.initialize(self.prefill_url if kv_rank == 0 else self.decode_url,
-                        self.config.metadata_server, self.config.protocol,
-                        self.config.device_name, self.config.metadata_backend)
+        self.initialize(
+            self.prefill_url if kv_rank == 0 else self.decode_url,
+            self.config.metadata_server,
+            self.config.protocol,
+            self.config.device_name,
+            self.config.metadata_backend,
+        )
 
-        self.remote_url = (self.decode_url
-                           if kv_rank == 0 else self.prefill_url)
+        self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url
 
         # Initialize ZeroMQ context and sockets
         self.context = zmq.Context()  # type: ignore[attr-defined]
@@ -109,51 +110,57 @@ class MooncakeTransferEngine:
         self.receiver_ack = self.context.socket(zmq.constants.PUSH)
 
         self.buffer_cleaner = ThreadPoolExecutor(max_workers=1)
-        self._setup_metadata_sockets(kv_rank, prefill_host, base_prefill_port,
-                                     decode_host, base_decode_port)
+        self._setup_metadata_sockets(
+            kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port
+        )
 
-    def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: int,
-                                d_host: str, d_port: int) -> None:
+    def _setup_metadata_sockets(
+        self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int
+    ) -> None:
         """Set up ZeroMQ sockets for sending and receiving data."""
         # Offsets < 8 are left for initialization in case tp and pp are enabled
         p_rank_offset = p_port + 8 + self.local_rank * 2
         d_rank_offset = d_port + 8 + self.local_rank * 2
         if kv_rank == 0:
-            self.sender_socket.bind(
-                make_zmq_path("tcp", p_host, p_rank_offset + 1))
+            self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1))
             self.receiver_socket.connect(
-                make_zmq_path("tcp", d_host, d_rank_offset + 1))
-            self.sender_ack.connect(
-                make_zmq_path("tcp", d_host, d_rank_offset + 2))
-            self.receiver_ack.bind(
-                make_zmq_path("tcp", p_host, p_rank_offset + 2))
+                make_zmq_path("tcp", d_host, d_rank_offset + 1)
+            )
+            self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2))
+            self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2))
         else:
             self.receiver_socket.connect(
-                make_zmq_path("tcp", p_host, p_rank_offset + 1))
-            self.sender_socket.bind(
-                make_zmq_path("tcp", d_host, d_rank_offset + 1))
-            self.receiver_ack.bind(
-                make_zmq_path("tcp", d_host, d_rank_offset + 2))
-            self.sender_ack.connect(
-                make_zmq_path("tcp", p_host, p_rank_offset + 2))
-
-    def initialize(self, local_hostname: str, metadata_server: str,
-                   protocol: str, device_name: str,
-                   metadata_backend: Union[str, None]) -> None:
+                make_zmq_path("tcp", p_host, p_rank_offset + 1)
+            )
+            self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1))
+            self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2))
+            self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2))
+
+    def initialize(
+        self,
+        local_hostname: str,
+        metadata_server: str,
+        protocol: str,
+        device_name: str,
+        metadata_backend: str | None,
+    ) -> None:
         """Initialize the mooncake instance."""
         if metadata_backend is None:
-            self.engine.initialize(local_hostname, metadata_server, protocol,
-                                   device_name)
+            self.engine.initialize(
+                local_hostname, metadata_server, protocol, device_name
+            )
         else:
             supported_backend = ["etcd", "redis"]
             metadata_backend = metadata_backend.lower()
             if metadata_backend not in supported_backend:
                 raise ValueError(
                     "Mooncake Configuration error. `metadata_backend`"
-                    f" should be one of {supported_backend}.")
+                    f" should be one of {supported_backend}."
+                )
 
-            self.engine.initialize_ext(local_hostname, metadata_server,
-                                       protocol, device_name, metadata_backend)
+            self.engine.initialize_ext(
+                local_hostname, metadata_server, protocol, device_name, metadata_backend
+            )
 
     def allocate_managed_buffer(self, length: int) -> int:
         """Allocate a managed buffer of the specified length."""
@@ -167,18 +174,17 @@ class MooncakeTransferEngine:
         """Free a previously allocated managed buffer."""
         return self.engine.free_managed_buffer(buffer, length)
 
-    def transfer_sync(self, buffer: int, peer_buffer_address: int,
-                      length: int) -> int:
+    def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int:
         """Synchronously transfer data to the specified address."""
-        ret = self.engine.transfer_sync_read(self.remote_url, buffer,
-                                             peer_buffer_address, length)
+        ret = self.engine.transfer_sync_read(
+            self.remote_url, buffer, peer_buffer_address, length
+        )
         if ret < 0:
             logger.error("Transfer Return Error")
             raise Exception("Transfer Return Error")
         return ret
 
-    def write_bytes_to_buffer(self, buffer: int, user_data: bytes,
-                              length: int) -> int:
+    def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int:
         """Write bytes to the allocated buffer."""
         return self.engine.write_bytes_to_buffer(buffer, user_data, length)
 
@@ -189,7 +195,7 @@ class MooncakeTransferEngine:
     def wait_for_ack(self, src_ptr: int, length: int) -> None:
         """Asynchronously wait for ACK from the receiver."""
         ack = self.sender_ack.recv()
-        if ack != b'ACK':
+        if ack != b"ACK":
             logger.error("Failed to receive ACK from the receiver")
 
         self.free_managed_buffer(src_ptr, length)
@@ -200,8 +206,8 @@ class MooncakeTransferEngine:
         src_ptr = self.allocate_managed_buffer(length)
         self.write_bytes_to_buffer(src_ptr, user_data, length)
         self.sender_socket.send_multipart(
-            [struct.pack("!Q", src_ptr),
-             struct.pack("!Q", length)])
+            [struct.pack("!Q", src_ptr), struct.pack("!Q", length)]
+        )
         self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length)
 
     def recv_bytes(self) -> bytes:
@@ -214,7 +220,7 @@ class MooncakeTransferEngine:
         ret = self.read_bytes_from_buffer(dst_ptr, length)
 
         # Buffer cleanup
-        self.receiver_ack.send(b'ACK')
+        self.receiver_ack.send(b"ACK")
         self.free_managed_buffer(dst_ptr, length)
 
         return ret
@@ -223,22 +229,21 @@ class MooncakeTransferEngine:
 class MooncakePipe(KVPipeBase):
     """MooncakeTransferEngine based Pipe implementation."""
 
-    def __init__(self,
-                 local_rank: int,
-                 config: KVTransferConfig,
-                 device: Optional[str] = None):
+    def __init__(
+        self, local_rank: int, config: KVTransferConfig, device: str | None = None
+    ):
         """Initialize the mooncake pipe and set related parameters."""
         self.config = config
         self.local_rank = local_rank
         self.kv_rank = self.config.kv_rank
+        assert self.kv_rank is not None
         if device is None:
             self.device = self._select_device(self.config.kv_buffer_device)
         else:
             self.device = self._select_device(device)
 
-        self.transfer_engine = MooncakeTransferEngine(self.kv_rank,
-                                                      self.local_rank)
-        self.transport_thread: Optional[ThreadPoolExecutor] = None
+        self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
+        self.transport_thread: ThreadPoolExecutor | None = None
         self.none_tensor = torch.tensor([NONE_INT], device=self.device)
 
     def _select_device(self, device: str) -> torch.device:
@@ -262,15 +267,15 @@ class MooncakePipe(KVPipeBase):
         data = self.transfer_engine.recv_bytes()
         return safetensors_load(data)["tensor"].to(self.device)
 
-    def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
+    def send_tensor(self, tensor: torch.Tensor | None) -> None:
         """Send tensor to the target process."""
         if self.transport_thread is None:
             self.transport_thread = ThreadPoolExecutor(max_workers=1)
         tensor = tensor if tensor is not None else self.none_tensor
-        assert (len(tensor.shape) > 0)
+        assert len(tensor.shape) > 0
         self.transport_thread.submit(self._send_impl, tensor)
 
-    def recv_tensor(self) -> Optional[torch.Tensor]:
+    def recv_tensor(self) -> torch.Tensor | None:
         """Receive tensor from other processes."""
         if self.transport_thread is None:
             self.transport_thread = ThreadPoolExecutor(max_workers=1)
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 7a79a8cc0c9328df45277abfe1df606f6cea3031..526c5cd1d5278d0b6cdff051ee14ccf15fe4227e 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -1,22 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-    This module implements a PyNccl pipe for sending and receiving
-    Optional[torch.Tensor] between distributed ranks with advanced
-    communication features.
-
-    Key Features:
-    - Supports sending and receiving tensors with metadata
-    - Handles both CUDA and CPU device communications
-    - Implements a non-blocking tensor transfer mechanism
-    - Manages buffer size and provides backpressure control
-    - Supports distributed process groups with configurable parameters
+This module implements a PyNccl pipe for sending and receiving
+Optional[torch.Tensor] between distributed ranks with advanced
+communication features.
+
+Key Features:
+- Supports sending and receiving tensors with metadata
+- Handles both CUDA and CPU device communications
+- Implements a non-blocking tensor transfer mechanism
+- Manages buffer size and provides backpressure control
+- Supports distributed process groups with configurable parameters
 """
 
 import threading
 import time
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
-from typing import Callable, Optional
 
 import torch
 
@@ -30,29 +30,30 @@ logger = init_logger(__name__)
 
 
 class BrokenPipeException(Exception):
-
     def __init__(self, message):
         self.message = message
         super().__init__(self.message)
 
 
-Metadata = dict[str, Optional[torch.Tensor]]
+Metadata = dict[str, torch.Tensor | None]
 
 
 class PyNcclPipe(KVPipeBase):
-
     METADATA_LENGTH = 16
     MAX_TENSOR_DIMENSIONS = 14
     METADATA_DTYPE = torch.int64
 
-    def __init__(self,
-                 local_rank: int,
-                 config: KVTransferConfig,
-                 device: Optional[str] = None,
-                 port_offset: int = 0):
+    def __init__(
+        self,
+        local_rank: int,
+        config: KVTransferConfig,
+        device: str | None = None,
+        port_offset: int = 0,
+    ):
         self.config = config
         self.local_rank = local_rank
         self.kv_rank = self.config.kv_rank
+        assert self.kv_rank is not None
         self.kv_parallel_size = self.config.kv_parallel_size
         if device is None:
             self.device = self._select_device(self.config.kv_buffer_device)
@@ -77,16 +78,16 @@ class PyNcclPipe(KVPipeBase):
         self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size
 
         # transportation-related variables
-        self.transport_thread: Optional[ThreadPoolExecutor] = None
+        self.transport_thread: ThreadPoolExecutor | None = None
         self.buffer_size = 0
         self.buffer_size_lock = threading.Lock()
         self.buffer_size_thresh = self.config.kv_buffer_size
 
     def _get_device_send_recv_impl(
         self, group: StatelessProcessGroup
-    ) -> tuple[Callable[[torch.Tensor, int], None], Callable[
-        [torch.Tensor, int], None]]:
-
+    ) -> tuple[
+        Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None]
+    ]:
         send: Callable[[torch.Tensor, int], None]
         recv: Callable[[torch.Tensor, int], None]
         if self.device.type == "cuda":
@@ -115,7 +116,7 @@ class PyNcclPipe(KVPipeBase):
         else:
             return torch.device("cpu")
 
-    def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata:
+    def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
         """
         Create the metadata as a dictionary based on the input tensor.
 
@@ -144,9 +145,9 @@ class PyNcclPipe(KVPipeBase):
             buffer: A tensor of the specified type and shape,
                 allocated on `self.device`.
         """
-        return torch.empty(metadata["shape"],
-                           dtype=metadata["dtype"],
-                           device=self.device)
+        return torch.empty(
+            metadata["shape"], dtype=metadata["dtype"], device=self.device
+        )
 
     def _send_metadata(self, metadata: Metadata):
         """
@@ -167,7 +168,7 @@ class PyNcclPipe(KVPipeBase):
         """
         return self.group.recv_obj(self.target_rank_for_recv)
 
-    def _send_impl(self, tensor: Optional[torch.Tensor]) -> None:
+    def _send_impl(self, tensor: torch.Tensor | None) -> None:
         """
         The actual implementation of sending the tensor and its metadata to the
         target rank.
@@ -179,10 +180,9 @@ class PyNcclPipe(KVPipeBase):
         metadata = self._make_metadata(tensor)
         self._send_metadata(metadata)
         if tensor is not None:
-            self.device_send_func(tensor.to(self.device),
-                                  self.target_rank_for_send)
+            self.device_send_func(tensor.to(self.device), self.target_rank_for_send)
 
-    def _recv_impl(self) -> Optional[torch.Tensor]:
+    def _recv_impl(self) -> torch.Tensor | None:
         """
         The actual implementation of receiving a tensor and its metadata from
         the target rank.
@@ -198,8 +198,9 @@ class PyNcclPipe(KVPipeBase):
 
         return buffer
 
-    def send_tensor_wrapper(self, tensor: Optional[torch.Tensor],
-                            tensor_size: int) -> None:
+    def send_tensor_wrapper(
+        self, tensor: torch.Tensor | None, tensor_size: int
+    ) -> None:
         """
         Wrapper for _send_impl to handle exceptions and update buffer size.
         """
@@ -209,9 +210,14 @@ class PyNcclPipe(KVPipeBase):
             with self.buffer_size_lock:
                 self.buffer_size -= tensor_size
         except Exception as e:
-            logger.error("[rank%d]: Exception when trying to send %s, msg: %s",
-                         torch.distributed.get_rank(), str(tensor), str(e))
+            logger.error(
+                "[rank%d]: Exception when trying to send %s, msg: %s",
+                torch.distributed.get_rank(),
+                str(tensor),
+                str(e),
+            )
             import traceback
+
             traceback.print_exc()
 
     def block_if_full(self):
@@ -223,7 +229,7 @@ class PyNcclPipe(KVPipeBase):
             logger.debug("KV cache transfer pipe is full. Waiting...")
             time.sleep(0.05)
 
-    def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
+    def send_tensor(self, tensor: torch.Tensor | None) -> None:
         """
         Sends a tensor and its metadata to the destination rank in a
         non-blocking way.
@@ -244,10 +250,9 @@ class PyNcclPipe(KVPipeBase):
         with self.buffer_size_lock:
             self.buffer_size += tensor_size
 
-        self.transport_thread.submit(self.send_tensor_wrapper, tensor,
-                                     tensor_size)
+        self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)
 
-    def recv_tensor(self) -> Optional[torch.Tensor]:
+    def recv_tensor(self) -> torch.Tensor | None:
         """
         Receives a tensor and its metadata from the source rank. Blocking call.
 
@@ -266,6 +271,7 @@ class PyNcclPipe(KVPipeBase):
             logger.error("%s", e)
             logger.error("My device: %s", self.device)
             import traceback
+
             traceback.print_exc()
             raise e
 
@@ -275,6 +281,5 @@ class PyNcclPipe(KVPipeBase):
         """
         Close the pipe and release associated resources.
         """
-        if hasattr(self,
-                   "transport_thread") and self.transport_thread is not None:
+        if hasattr(self, "transport_thread") and self.transport_thread is not None:
             self.transport_thread.shutdown()
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index d5747bed92771d0487bf31c5f949e2beae03a0b2..54b46d98870a594ac9e65588ffb6ea963a14e1a5 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -2,22 +2,24 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import TYPE_CHECKING, Optional
 
-from vllm import envs
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
-from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
-                                                          KVConnectorRole)
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorBase_V1,
+    KVConnectorRole,
+)
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
-_KV_CONNECTOR_AGENT: Optional[KVConnectorBaseType] = None
+_KV_CONNECTOR_AGENT: KVConnectorBaseType | None = None
 
 
 def get_kv_transfer_group() -> KVConnectorBaseType:
     assert _KV_CONNECTOR_AGENT is not None, (
-        "disaggregated KV cache transfer parallel group is not initialized")
+        "disaggregated KV cache transfer parallel group is not initialized"
+    )
     return _KV_CONNECTOR_AGENT
 
 
@@ -25,8 +27,7 @@ def has_kv_transfer_group() -> bool:
     return _KV_CONNECTOR_AGENT is not None
 
 
-def is_v1_kv_transfer_group(
-        connector: Optional[KVConnectorBaseType] = None) -> bool:
+def is_v1_kv_transfer_group(connector: KVConnectorBaseType | None = None) -> bool:
     """Check if the KV connector is the v1 connector.
     If the argument is None, it will check the global KV connector
 
@@ -47,7 +48,9 @@ def is_v1_kv_transfer_group(
     return isinstance(connector, KVConnectorBase_V1)
 
 
-def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
+def ensure_kv_transfer_initialized(
+    vllm_config: "VllmConfig", kv_cache_config: Optional["KVCacheConfig"] = None
+) -> None:
     """
     Initialize KV cache transfer parallel group.
     """
@@ -57,13 +60,15 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
     if vllm_config.kv_transfer_config is None:
         return
 
-    if (vllm_config.kv_transfer_config.is_kv_transfer_instance
-            and _KV_CONNECTOR_AGENT is None):
-        if envs.VLLM_USE_V1:
-            _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
-                config=vllm_config, role=KVConnectorRole.WORKER)
-        else:
-            raise ValueError("V0 is no longer supported")
+    if (
+        vllm_config.kv_transfer_config.is_kv_transfer_instance
+        and _KV_CONNECTOR_AGENT is None
+    ):
+        _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
+            config=vllm_config,
+            role=KVConnectorRole.WORKER,
+            kv_cache_config=kv_cache_config,
+        )
 
 
 def ensure_kv_transfer_shutdown() -> None:
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 638170963e2b1233a462702c8d55041b47e690b1..852c4c644433f02112ac511bb026e03516a10075 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -22,30 +22,39 @@ If you only need to use the distributed environment without model/pipeline
  parallelism, you can skip the model parallel initialization and destruction
  steps.
 """
+
 import contextlib
 import gc
 import pickle
 import weakref
 from collections import namedtuple
+from collections.abc import Callable
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from datetime import timedelta
 from multiprocessing import shared_memory
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional
 from unittest.mock import patch
 
 import torch
 import torch.distributed
+import torch.distributed._functional_collectives as funcol
+import torch.distributed._symmetric_memory
 from torch.distributed import Backend, ProcessGroup
 from typing_extensions import deprecated
 
 import vllm.envs as envs
 from vllm.distributed.device_communicators.base_device_communicator import (
-    DeviceCommunicatorBase)
+    DeviceCommunicatorBase,
+)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.utils import (direct_register_custom_op, get_distributed_init_method,
-                        resolve_obj_by_qualname, supports_custom_op)
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.network_utils import get_distributed_init_method
+from vllm.utils.torch_utils import (
+    direct_register_custom_op,
+    supports_custom_op,
+)
 
 
 @dataclass
@@ -57,7 +66,7 @@ TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
 
 
 def _split_tensor_dict(
-    tensor_dict: dict[str, Union[torch.Tensor, Any]]
+    tensor_dict: dict[str, torch.Tensor | Any],
 ) -> tuple[list[tuple[str, Any]], list[torch.Tensor]]:
     """Split the tensor dictionary into two parts:
     1. A list of (key, value) pairs. If the value is a tensor, it is replaced
@@ -74,7 +83,8 @@ def _split_tensor_dict(
             # receiving side will set the device index.
             device = value.device.type
             metadata_list.append(
-                (key, TensorMetadata(device, value.dtype, value.size())))
+                (key, TensorMetadata(device, value.dtype, value.size()))
+            )
             tensor_list.append(value)
         else:
             metadata_list.append((key, value))
@@ -116,8 +126,9 @@ def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
     return torch.empty_like(tensor)
 
 
-def reduce_scatter(tensor: torch.Tensor, dim: int, world_size: int,
-                   group_name: str) -> torch.Tensor:
+def reduce_scatter(
+    tensor: torch.Tensor, dim: int, world_size: int, group_name: str
+) -> torch.Tensor:
     assert group_name in _groups, f"Group {group_name} is not found."
     group = _groups[group_name]()
     if group is None:
@@ -125,15 +136,17 @@ def reduce_scatter(tensor: torch.Tensor, dim: int, world_size: int,
     return group._reduce_scatter_out_place(tensor, dim)
 
 
-def reduce_scatter_fake(tensor: torch.Tensor, dim: int, world_size: int,
-                        group_name: str) -> torch.Tensor:
+def reduce_scatter_fake(
+    tensor: torch.Tensor, dim: int, world_size: int, group_name: str
+) -> torch.Tensor:
     new_shape = list(tensor.shape)
     new_shape[dim] = tensor.shape[dim] // world_size
     return torch.empty(new_shape, dtype=tensor.dtype, device=tensor.device)
 
 
-def all_gather(tensor: torch.Tensor, dim: int, world_size: int,
-               group_name: str) -> torch.Tensor:
+def all_gather(
+    tensor: torch.Tensor, dim: int, world_size: int, group_name: str
+) -> torch.Tensor:
     assert group_name in _groups, f"Group {group_name} is not found."
     group = _groups[group_name]()
     if group is None:
@@ -141,13 +154,98 @@ def all_gather(tensor: torch.Tensor, dim: int, world_size: int,
     return group._all_gather_out_place(tensor, dim)
 
 
-def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int,
-                    group_name: str) -> torch.Tensor:
+def all_gather_fake(
+    tensor: torch.Tensor, dim: int, world_size: int, group_name: str
+) -> torch.Tensor:
     new_shape = list(tensor.shape)
     new_shape[dim] = tensor.shape[dim] * world_size
     return torch.empty(new_shape, dtype=tensor.dtype, device=tensor.device)
 
 
+def patched_fused_scaled_matmul_reduce_scatter_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    bias: torch.Tensor | None = None,
+    result_scale: torch.Tensor | None = None,
+    out_dtype: torch.dtype | None = None,
+    use_fast_accum: bool = False,
+) -> torch.Tensor:
+    # Copied from
+    # https://github.com/pytorch/pytorch/blob/50c338c2da905062449e4d9ac807832d1b5cd90e/torch/distributed/_symmetric_memory/__init__.py#L1189
+    if A_scale.numel() > 1:
+        if A_scale.shape[:-1] != A.shape[:-1]:
+            raise ValueError(
+                "For row-wise scaling, the leading dims of A_scale "
+                "must match the leading dims of A "
+                f"(A shape: {A.shape}, A_scale shape: {A_scale.shape})"
+            )
+        A_scale = A_scale.flatten(0, -2).contiguous()
+    elif A_scale.numel() != 1:
+        raise ValueError(
+            "Invalid A_scale shape "
+            f"(A shape: {A.shape}, A_scale shape: {A_scale.shape})"
+        )
+
+    C = torch._scaled_mm(
+        A.flatten(0, -2).contiguous(),
+        B,
+        A_scale,
+        B_scale,
+        bias,
+        result_scale,
+        out_dtype,
+        use_fast_accum,
+    )
+    C = C.view(*output_shape[:-1], B.shape[1])
+    res = funcol.reduce_scatter_tensor(
+        C,
+        reduce_op,
+        orig_scatter_dim,  # need original scatter dim for 3D+ output tensor here
+        group_name,
+    )
+    res = funcol.wait_tensor(res)
+    return res
+
+
+def patched_fused_scaled_matmul_reduce_scatter(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    bias: torch.Tensor | None = None,
+    result_scale: torch.Tensor | None = None,
+    out_dtype: torch.dtype | None = None,
+    use_fast_accum: bool = False,
+) -> torch.Tensor:
+    return torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
+        A,
+        B,
+        A_scale,
+        B_scale,
+        reduce_op,
+        orig_scatter_dim,
+        scatter_dim_after_maybe_reshape,
+        group_name,
+        output_shape,
+        bias,
+        result_scale,
+        out_dtype,
+        use_fast_accum,
+    )
+
+
 if supports_custom_op():
     direct_register_custom_op(
         op_name="all_reduce",
@@ -167,6 +265,15 @@ if supports_custom_op():
         fake_impl=all_gather_fake,
     )
 
+    # TODO: Remove this once the pytorch fix
+    # (https://github.com/pytorch/pytorch/pull/165086) gets released,
+    # in either 2.9.1 or 2.10
+    direct_register_custom_op(
+        op_name="patched_fused_scaled_matmul_reduce_scatter",
+        op_func=patched_fused_scaled_matmul_reduce_scatter,
+        fake_impl=patched_fused_scaled_matmul_reduce_scatter_fake,
+    )
+
 
 class GroupCoordinator:
     """
@@ -194,17 +301,17 @@ class GroupCoordinator:
     cpu_group: ProcessGroup  # group for CPU communication
     device_group: ProcessGroup  # group for device communication
     # device communicator (if use_device_communicator=True)
-    device_communicator: Optional[DeviceCommunicatorBase]
-    mq_broadcaster: Optional[Any]  # shared memory broadcaster
+    device_communicator: DeviceCommunicatorBase | None
+    mq_broadcaster: Any | None  # shared memory broadcaster
 
     def __init__(
         self,
         group_ranks: list[list[int]],
         local_rank: int,
-        torch_distributed_backend: Union[str, Backend],
+        torch_distributed_backend: str | Backend,
         use_device_communicator: bool,  # whether to use device communicator
         use_message_queue_broadcaster: bool = False,
-        group_name: Optional[str] = None,
+        group_name: str | None = None,
     ):
         group_name = group_name or "anonymous"
         self.unique_name = _get_unique_name(group_name)
@@ -218,7 +325,8 @@ class GroupCoordinator:
 
         for ranks in group_ranks:
             device_group = torch.distributed.new_group(
-                ranks, backend=torch_distributed_backend)
+                ranks, backend=torch_distributed_backend
+            )
             # a group with `gloo` backend, to allow direct coordination between
             # processes through the CPU.
             cpu_group = torch.distributed.new_group(ranks, backend="gloo")
@@ -242,8 +350,7 @@ class GroupCoordinator:
         elif current_platform.is_xpu():
             self.device = torch.device(f"xpu:{local_rank}")
         elif current_platform.is_out_of_tree():
-            self.device = torch.device(
-                f"{current_platform.device_name}:{local_rank}")
+            self.device = torch.device(f"{current_platform.device_name}:{local_rank}")
         else:
             self.device = torch.device("cpu")
 
@@ -251,7 +358,8 @@ class GroupCoordinator:
         self.device_communicator = None
         if use_device_communicator and self.world_size > 1:
             device_comm_cls = resolve_obj_by_qualname(
-                current_platform.get_device_communicator_cls())
+                current_platform.get_device_communicator_cls()
+            )
             self.device_communicator = device_comm_cls(
                 cpu_group=self.cpu_group,
                 device=self.device,
@@ -259,19 +367,50 @@ class GroupCoordinator:
                 unique_name=self.unique_name,
             )
 
-        from vllm.distributed.device_communicators.shm_broadcast import (
-            MessageQueue)
-        self.mq_broadcaster: Optional[MessageQueue] = None
+        from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+
+        self.mq_broadcaster: MessageQueue | None = None
         if use_message_queue_broadcaster and self.world_size > 1:
             self.mq_broadcaster = MessageQueue.create_from_process_group(
-                self.cpu_group, 1 << 22, 6)
+                self.cpu_group, 1 << 22, 6
+            )
 
         from vllm.platforms import current_platform
-        self.use_custom_op_call = (current_platform.is_cuda_alike()
-                                   or current_platform.is_tpu())
 
-        self.use_cpu_custom_send_recv = (current_platform.is_cpu() and hasattr(
-            torch.ops._C, "init_shm_manager"))
+        self.use_custom_op_call = (
+            current_platform.is_cuda_alike() or current_platform.is_tpu()
+        )
+
+        self.use_cpu_custom_send_recv = current_platform.is_cpu() and hasattr(
+            torch.ops._C, "init_shm_manager"
+        )
+
+    def create_mq_broadcaster(
+        self, writer_rank=0, external_writer_handle=None, blocking=True
+    ):
+        from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+
+        return MessageQueue.create_from_process_group(
+            self.cpu_group,
+            1 << 22,
+            6,
+            writer_rank=writer_rank,
+            external_writer_handle=external_writer_handle,
+            blocking=blocking,
+        )
+
+    def create_single_reader_mq_broadcasters(
+        self, reader_rank_in_group=0, blocking=False
+    ):
+        from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+
+        return MessageQueue.create_from_process_group_single_reader(
+            self.cpu_group,
+            1 << 22,
+            6,
+            reader_rank=self.ranks[reader_rank_in_group],
+            blocking=blocking,
+        )
 
     @property
     def first_rank(self):
@@ -308,8 +447,7 @@ class GroupCoordinator:
         return self.ranks[(rank_in_group - 1) % world_size]
 
     @contextmanager
-    def graph_capture(
-            self, graph_capture_context: Optional[GraphCaptureContext] = None):
+    def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None):
         if graph_capture_context is None:
             stream = torch.cuda.Stream()
             graph_capture_context = GraphCaptureContext(stream)
@@ -320,7 +458,9 @@ class GroupCoordinator:
         # so we don't abstract it into the base class
         maybe_ca_context = nullcontext()
         from vllm.distributed.device_communicators.cuda_communicator import (
-            CudaCommunicator)
+            CudaCommunicator,
+        )
+
         if self.device_communicator is not None:
             assert isinstance(self.device_communicator, CudaCommunicator)
             ca_comm = self.device_communicator.ca_comm
@@ -356,8 +496,7 @@ class GroupCoordinator:
             return input_
 
         if self.use_custom_op_call:
-            return torch.ops.vllm.all_reduce(input_,
-                                             group_name=self.unique_name)
+            return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name)
         else:
             return self._all_reduce_out_place(input_)
 
@@ -372,66 +511,62 @@ class GroupCoordinator:
         if world_size == 1:
             return input_
         assert -input_.dim() <= dim < input_.dim(), (
-            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
 
         if self.use_custom_op_call:
-            return torch.ops.vllm.all_gather(input_,
-                                             dim,
-                                             world_size,
-                                             group_name=self.unique_name)
+            return torch.ops.vllm.all_gather(
+                input_, dim, world_size, group_name=self.unique_name
+            )
         else:
             return self._all_gather_out_place(input_, dim)
 
-    def _all_gather_out_place(self, input_: torch.Tensor,
-                              dim: int) -> torch.Tensor:
+    def _all_gather_out_place(self, input_: torch.Tensor, dim: int) -> torch.Tensor:
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
         return self.device_communicator.all_gather(input_, dim)
 
-    def all_gatherv(self,
-                    input_: Union[torch.Tensor, list[torch.Tensor]],
-                    dim: int = 0,
-                    sizes: Optional[list[int]] = None):
+    def all_gatherv(
+        self,
+        input_: torch.Tensor | list[torch.Tensor],
+        dim: int = 0,
+        sizes: list[int] | None = None,
+    ):
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
         return self.device_communicator.all_gatherv(input_, dim, sizes)
 
-    def reduce_scatter(self,
-                       input_: torch.Tensor,
-                       dim: int = -1) -> torch.Tensor:
+    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
             return input_
         assert -input_.dim() <= dim < input_.dim(), (
-            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
 
         if self.use_custom_op_call:
-            return torch.ops.vllm.reduce_scatter(input_,
-                                                 dim,
-                                                 world_size,
-                                                 group_name=self.unique_name)
+            return torch.ops.vllm.reduce_scatter(
+                input_, dim, world_size, group_name=self.unique_name
+            )
         else:
             return self._reduce_scatter_out_place(input_, dim)
 
-    def reduce_scatterv(self,
-                        input_: torch.Tensor,
-                        dim: int = -1,
-                        sizes: Optional[list[int]] = None) -> torch.Tensor:
+    def reduce_scatterv(
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
+    ) -> torch.Tensor:
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
         return self.device_communicator.reduce_scatterv(input_, dim, sizes)
 
-    def _reduce_scatter_out_place(self, input_: torch.Tensor,
-                                  dim: int) -> torch.Tensor:
+    def _reduce_scatter_out_place(self, input_: torch.Tensor, dim: int) -> torch.Tensor:
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
         return self.device_communicator.reduce_scatter(input_, dim)
 
-    def gather(self,
-               input_: torch.Tensor,
-               dst: int = 0,
-               dim: int = -1) -> Optional[torch.Tensor]:
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
         """
         NOTE: We assume that the input tensor is on the same device across
         all the ranks.
@@ -455,12 +590,12 @@ class GroupCoordinator:
         if self.world_size == 1:
             return input_
         # Broadcast.
-        torch.distributed.broadcast(input_,
-                                    src=self.ranks[src],
-                                    group=self.device_group)
+        torch.distributed.broadcast(
+            input_, src=self.ranks[src], group=self.device_group
+        )
         return input_
 
-    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
+    def broadcast_object(self, obj: Any | None = None, src: int = 0):
         """Broadcast the input object.
         NOTE: `src` is the local rank of the source rank.
         """
@@ -473,21 +608,20 @@ class GroupCoordinator:
             assert src == 0, "Message queue broadcaster only supports src=0"
             return self.mq_broadcaster.broadcast_object(obj)
         if self.rank_in_group == src:
-            torch.distributed.broadcast_object_list([obj],
-                                                    src=self.ranks[src],
-                                                    group=self.cpu_group)
+            torch.distributed.broadcast_object_list(
+                [obj], src=self.ranks[src], group=self.cpu_group
+            )
             return obj
         else:
             recv = [None]
-            torch.distributed.broadcast_object_list(recv,
-                                                    src=self.ranks[src],
-                                                    group=self.cpu_group)
+            torch.distributed.broadcast_object_list(
+                recv, src=self.ranks[src], group=self.cpu_group
+            )
             return recv[0]
 
-    def broadcast_object_list(self,
-                              obj_list: list[Any],
-                              src: int = 0,
-                              group: Optional[ProcessGroup] = None):
+    def broadcast_object_list(
+        self, obj_list: list[Any], src: int = 0, group: ProcessGroup | None = None
+    ):
         """Broadcast the input object list.
         NOTE: `src` is the local rank of the source rank.
         """
@@ -497,9 +631,9 @@ class GroupCoordinator:
         if self.world_size == 1:
             return obj_list
         # Broadcast.
-        torch.distributed.broadcast_object_list(obj_list,
-                                                src=self.ranks[src],
-                                                group=self.device_group)
+        torch.distributed.broadcast_object_list(
+            obj_list, src=self.ranks[src], group=self.device_group
+        )
         return obj_list
 
     def send_object(self, obj: Any, dst: int) -> None:
@@ -510,25 +644,22 @@ class GroupCoordinator:
 
         assert dst != self.rank_in_group, (
             "Invalid destination rank. Destination rank is the same "
-            "as the current rank.")
+            "as the current rank."
+        )
 
         # Serialize object to tensor and get the size as well
         object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
 
-        size_tensor = torch.tensor([object_tensor.numel()],
-                                   dtype=torch.long,
-                                   device="cpu")
+        size_tensor = torch.tensor(
+            [object_tensor.numel()], dtype=torch.long, device="cpu"
+        )
 
         # Send object size
 
-        torch.distributed.send(size_tensor,
-                               dst=self.ranks[dst],
-                               group=self.cpu_group)
+        torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)
 
         # Send object
-        torch.distributed.send(object_tensor,
-                               dst=self.ranks[dst],
-                               group=self.cpu_group)
+        torch.distributed.send(object_tensor, dst=self.ranks[dst], group=self.cpu_group)
 
         return None
 
@@ -545,22 +676,24 @@ class GroupCoordinator:
         size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
 
         # Receive object size
-        rank_size = torch.distributed.recv(size_tensor,
-                                           src=self.ranks[src],
-                                           group=self.cpu_group)
+        rank_size = torch.distributed.recv(
+            size_tensor, src=self.ranks[src], group=self.cpu_group
+        )
 
         # Tensor to receive serialized objects into.
         object_tensor = torch.empty(  # type: ignore[call-overload]
             size_tensor.item(),  # type: ignore[arg-type]
             dtype=torch.uint8,
-            device="cpu")
+            device="cpu",
+        )
 
-        rank_object = torch.distributed.recv(object_tensor,
-                                             src=self.ranks[src],
-                                             group=self.cpu_group)
+        rank_object = torch.distributed.recv(
+            object_tensor, src=self.ranks[src], group=self.cpu_group
+        )
 
         assert rank_object == rank_size, (
-            "Received object sender rank does not match the size sender rank.")
+            "Received object sender rank does not match the size sender rank."
+        )
 
         obj = pickle.loads(object_tensor.numpy().tobytes())
 
@@ -568,16 +701,16 @@ class GroupCoordinator:
 
     def broadcast_tensor_dict(
         self,
-        tensor_dict: Optional[dict[str, Union[torch.Tensor, Any]]] = None,
+        tensor_dict: dict[str, torch.Tensor | Any] | None = None,
         src: int = 0,
-        group: Optional[ProcessGroup] = None,
-        metadata_group: Optional[ProcessGroup] = None
-    ) -> Optional[dict[str, Union[torch.Tensor, Any]]]:
+        group: ProcessGroup | None = None,
+        metadata_group: ProcessGroup | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
         """Broadcast the input tensor dictionary.
         NOTE: `src` is the local rank of the source rank.
         """
         # Bypass the function if we are using only 1 GPU.
-        if (not torch.distributed.is_initialized() or self.world_size == 1):
+        if not torch.distributed.is_initialized() or self.world_size == 1:
             return tensor_dict
 
         group = self.device_group
@@ -587,9 +720,9 @@ class GroupCoordinator:
         rank_in_group = self.rank_in_group
         if rank_in_group == src:
             metadata_list: list[tuple[Any, Any]] = []
-            assert isinstance(
-                tensor_dict,
-                dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
+            assert isinstance(tensor_dict, dict), (
+                f"Expecting a dictionary, got {type(tensor_dict)}"
+            )
             metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
             # `metadata_list` lives in CPU memory.
             # `broadcast_object_list` has serialization & deserialization,
@@ -602,16 +735,14 @@ class GroupCoordinator:
                     continue
                 if tensor.is_cpu:
                     # use metadata_group for CPU tensors
-                    handle = torch.distributed.broadcast(tensor,
-                                                         src=self.ranks[src],
-                                                         group=metadata_group,
-                                                         async_op=True)
+                    handle = torch.distributed.broadcast(
+                        tensor, src=self.ranks[src], group=metadata_group, async_op=True
+                    )
                 else:
                     # use group for GPU tensors
-                    handle = torch.distributed.broadcast(tensor,
-                                                         src=self.ranks[src],
-                                                         group=group,
-                                                         async_op=True)
+                    handle = torch.distributed.broadcast(
+                        tensor, src=self.ranks[src], group=group, async_op=True
+                    )
                 async_handles.append(handle)
             for async_handle in async_handles:
                 async_handle.wait()
@@ -622,9 +753,9 @@ class GroupCoordinator:
             async_handles = []
             for key, value in metadata_list:
                 if isinstance(value, TensorMetadata):
-                    tensor = torch.empty(value.size,
-                                         dtype=value.dtype,
-                                         device=value.device)
+                    tensor = torch.empty(
+                        value.size, dtype=value.dtype, device=value.device
+                    )
                     if tensor.numel() == 0:
                         # Skip broadcasting empty tensors.
                         tensor_dict[key] = tensor
@@ -635,14 +766,13 @@ class GroupCoordinator:
                             tensor,
                             src=self.ranks[src],
                             group=metadata_group,
-                            async_op=True)
+                            async_op=True,
+                        )
                     else:
                         # use group for GPU tensors
                         handle = torch.distributed.broadcast(
-                            tensor,
-                            src=self.ranks[src],
-                            group=group,
-                            async_op=True)
+                            tensor, src=self.ranks[src], group=group, async_op=True
+                        )
                     async_handles.append(handle)
                     tensor_dict[key] = tensor
                 else:
@@ -653,11 +783,11 @@ class GroupCoordinator:
 
     def send_tensor_dict(
         self,
-        tensor_dict: dict[str, Union[torch.Tensor, Any]],
-        dst: Optional[int] = None,
+        tensor_dict: dict[str, torch.Tensor | Any],
+        dst: int | None = None,
         all_gather_group: Optional["GroupCoordinator"] = None,
-        all_gather_tensors: Optional[dict[str, bool]] = None,
-    ) -> Optional[dict[str, Union[torch.Tensor, Any]]]:
+        all_gather_tensors: dict[str, bool] | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
         """Send the input tensor dictionary.
         NOTE: `dst` is the local rank of the source rank.
 
@@ -679,10 +809,10 @@ class GroupCoordinator:
         # Bypass the function if we are using only 1 GPU.
         if not torch.distributed.is_initialized() or self.world_size == 1:
             return tensor_dict
-        all_gather_size = (1 if all_gather_group is None else
-                           all_gather_group.world_size)
-        all_gather_rank = (0 if all_gather_group is None else
-                           all_gather_group.rank_in_group)
+        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
+        all_gather_rank = (
+            0 if all_gather_group is None else all_gather_group.rank_in_group
+        )
 
         group = self.device_group
         metadata_group = self.cpu_group
@@ -695,22 +825,21 @@ class GroupCoordinator:
             if self.device_communicator is None:
                 raise ValueError("No device communicator found")
             self.device_communicator.send_tensor_dict(  # type: ignore
-                tensor_dict, dst)
+                tensor_dict, dst
+            )
             return None
 
         metadata_list: list[tuple[Any, Any]] = []
-        assert isinstance(
-            tensor_dict,
-            dict), f"Expecting a dictionary, got {type(tensor_dict)}"
+        assert isinstance(tensor_dict, dict), (
+            f"Expecting a dictionary, got {type(tensor_dict)}"
+        )
         metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
         # `metadata_list` lives in CPU memory.
         # `send_object_list` has serialization & deserialization,
         # all happening on CPU. Therefore, we can use the CPU group.
         self.send_object(metadata_list, dst=dst)
 
-        tensor_keys = [
-            k for k, v in tensor_dict.items() if isinstance(v, torch.Tensor)
-        ]
+        tensor_keys = [k for k, v in tensor_dict.items() if isinstance(v, torch.Tensor)]
         assert len(tensor_keys) == len(tensor_list)
 
         for key, tensor in zip(tensor_keys, tensor_list):
@@ -719,31 +848,33 @@ class GroupCoordinator:
                 continue
 
             # send-allgather: send only a slice, then do allgather.
-            use_all_gather = (all_gather_group is not None
-                              and tensor.numel() % all_gather_size == 0)
-            use_all_gather = all_gather_tensors.get(key, use_all_gather) \
-                if all_gather_tensors else use_all_gather
+            use_all_gather = (
+                all_gather_group is not None and tensor.numel() % all_gather_size == 0
+            )
+            use_all_gather = (
+                all_gather_tensors.get(key, use_all_gather)
+                if all_gather_tensors
+                else use_all_gather
+            )
             if use_all_gather:
                 tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
 
             if tensor.is_cpu:
                 # use metadata_group for CPU tensors
-                torch.distributed.send(tensor,
-                                       dst=self.ranks[dst],
-                                       group=metadata_group)
+                torch.distributed.send(
+                    tensor, dst=self.ranks[dst], group=metadata_group
+                )
             else:
                 # use group for GPU tensors
-                torch.distributed.send(tensor,
-                                       dst=self.ranks[dst],
-                                       group=group)
+                torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
         return None
 
     def recv_tensor_dict(
         self,
-        src: Optional[int] = None,
+        src: int | None = None,
         all_gather_group: Optional["GroupCoordinator"] = None,
-        all_gather_tensors: Optional[dict[str, bool]] = None,
-    ) -> Optional[dict[str, Union[torch.Tensor, Any]]]:
+        all_gather_tensors: dict[str, bool] | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
         """Recv the input tensor dictionary.
         NOTE: `src` is the local rank of the source rank.
 
@@ -765,10 +896,10 @@ class GroupCoordinator:
         # Bypass the function if we are using only 1 GPU.
         if not torch.distributed.is_initialized() or self.world_size == 1:
             return None
-        all_gather_size = (1 if all_gather_group is None else
-                           all_gather_group.world_size)
-        all_gather_rank = (0 if all_gather_group is None else
-                           all_gather_group.rank_in_group)
+        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
+        all_gather_rank = (
+            0 if all_gather_group is None else all_gather_group.rank_in_group
+        )
 
         group = self.device_group
         metadata_group = self.cpu_group
@@ -781,45 +912,47 @@ class GroupCoordinator:
             if self.device_communicator is None:
                 raise ValueError("No device communicator found")
             return self.device_communicator.recv_tensor_dict(  # type: ignore
-                src)
+                src
+            )
 
         recv_metadata_list = self.recv_object(src=src)
         tensor_dict: dict[str, Any] = {}
         for key, value in recv_metadata_list:
             if isinstance(value, TensorMetadata):
-                tensor = torch.empty(value.size,
-                                     dtype=value.dtype,
-                                     device=value.device)
+                tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
                 if tensor.numel() == 0:
                     # Skip broadcasting empty tensors.
                     tensor_dict[key] = tensor
                     continue
 
                 # send-allgather: send only a slice, then do allgather.
-                use_all_gather = (all_gather_group is not None
-                                  and tensor.numel() % all_gather_size == 0)
-                use_all_gather = all_gather_tensors.get(key, use_all_gather) \
-                    if all_gather_tensors else use_all_gather
+                use_all_gather = (
+                    all_gather_group is not None
+                    and tensor.numel() % all_gather_size == 0
+                )
+                use_all_gather = (
+                    all_gather_tensors.get(key, use_all_gather)
+                    if all_gather_tensors
+                    else use_all_gather
+                )
 
                 if use_all_gather:
                     orig_shape = tensor.shape
-                    tensor = tensor.reshape(all_gather_size,
-                                            -1)[all_gather_rank]
+                    tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
 
                 if tensor.is_cpu:
                     # use metadata_group for CPU tensors
-                    torch.distributed.recv(tensor,
-                                           src=self.ranks[src],
-                                           group=metadata_group)
+                    torch.distributed.recv(
+                        tensor, src=self.ranks[src], group=metadata_group
+                    )
                 else:
                     # use group for GPU tensors
-                    torch.distributed.recv(tensor,
-                                           src=self.ranks[src],
-                                           group=group)
+                    torch.distributed.recv(tensor, src=self.ranks[src], group=group)
                 if use_all_gather:
                     # do the allgather
                     tensor = all_gather_group.all_gather(  # type: ignore
-                        tensor, dim=0)
+                        tensor, dim=0
+                    )
                     tensor = tensor.reshape(orig_shape)
 
                 tensor_dict[key] = tensor
@@ -836,17 +969,16 @@ class GroupCoordinator:
         """
         torch.distributed.barrier(group=self.cpu_group)
 
-    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
         """Sends a tensor to the destination rank in a blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
         self.device_communicator.send(tensor, dst)
 
-    def recv(self,
-             size: torch.Size,
-             dtype: torch.dtype,
-             src: Optional[int] = None) -> torch.Tensor:
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
+    ) -> torch.Tensor:
         """Receives a tensor from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
         if self.device_communicator is None:
@@ -867,43 +999,48 @@ class GroupCoordinator:
 
     def prepare_communication_buffer_for_model(self, model: torch.nn.Module):
         if self.device_communicator is not None:
-            self.device_communicator.prepare_communication_buffer_for_model(
-                model)
+            self.device_communicator.prepare_communication_buffer_for_model(model)
 
     def dispatch(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False
+        is_sequence_parallel: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.device_communicator is not None:
-            return self.device_communicator.dispatch(hidden_states,
-                                                     router_logits,
-                                                     is_sequence_parallel)
+            return self.device_communicator.dispatch(
+                hidden_states, router_logits, is_sequence_parallel
+            )
         else:
             return hidden_states, router_logits
 
-    def combine(self,
-                hidden_states,
-                is_sequence_parallel: bool = False) -> torch.Tensor:
+    def combine(
+        self, hidden_states, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
         if self.device_communicator is not None:
-            return self.device_communicator.combine(hidden_states,
-                                                    is_sequence_parallel)
+            return self.device_communicator.combine(hidden_states, is_sequence_parallel)
         else:
             return hidden_states
 
 
-_WORLD: Optional[GroupCoordinator] = None
-_NODE_COUNT: Optional[int] = None
+_WORLD: GroupCoordinator | None = None
+_INNER_DP_WORLD: GroupCoordinator | None = None
+_NODE_COUNT: int | None = None
 
 
 def get_world_group() -> GroupCoordinator:
-    assert _WORLD is not None, ("world group is not initialized")
+    assert _WORLD is not None, "world group is not initialized"
     return _WORLD
 
 
-def init_world_group(ranks: list[int], local_rank: int,
-                     backend: str) -> GroupCoordinator:
+def get_inner_dp_world_group() -> GroupCoordinator:
+    assert _INNER_DP_WORLD is not None, "inner dp world group is not initialized"
+    return _INNER_DP_WORLD
+
+
+def init_world_group(
+    ranks: list[int], local_rank: int, backend: str
+) -> GroupCoordinator:
     return GroupCoordinator(
         group_ranks=[ranks],
         local_rank=local_rank,
@@ -918,73 +1055,75 @@ def init_model_parallel_group(
     local_rank: int,
     backend: str,
     use_message_queue_broadcaster: bool = False,
-    group_name: Optional[str] = None,
+    group_name: str | None = None,
+    use_device_communicator: bool = True,
 ) -> GroupCoordinator:
-
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_device_communicator=True,
+        use_device_communicator=use_device_communicator,
         use_message_queue_broadcaster=use_message_queue_broadcaster,
         group_name=group_name,
     )
 
 
-_TP: Optional[GroupCoordinator] = None
+_TP: GroupCoordinator | None = None
 
 
 def get_tp_group() -> GroupCoordinator:
-    assert _TP is not None, ("tensor model parallel group is not initialized")
+    assert _TP is not None, "tensor model parallel group is not initialized"
     return _TP
 
 
-@deprecated("`get_tensor_model_parallel_group` has been replaced with "
-            "`get_tp_group` and may be removed after v0.12. Please use "
-            "`get_tp_group` instead.")
+@deprecated(
+    "`get_tensor_model_parallel_group` has been replaced with "
+    "`get_tp_group` and may be removed after v0.12. Please use "
+    "`get_tp_group` instead."
+)
 def get_tensor_model_parallel_group():
     return get_tp_group()
 
 
-_DCP: Optional[GroupCoordinator] = None
+_DCP: GroupCoordinator | None = None
 
 
 def get_dcp_group() -> GroupCoordinator:
-    assert _DCP is not None, (
-        "decode context model parallel group is not initialized")
+    assert _DCP is not None, "decode context model parallel group is not initialized"
     return _DCP
 
 
 # kept for backward compatibility
 get_context_model_parallel_group = get_dcp_group
 
-_PP: Optional[GroupCoordinator] = None
+_PP: GroupCoordinator | None = None
 
-_DP: Optional[GroupCoordinator] = None
+_DP: GroupCoordinator | None = None
 
 
 def get_dp_group() -> GroupCoordinator:
-    assert _DP is not None, ("data parallel group is not initialized")
+    assert _DP is not None, "data parallel group is not initialized"
     return _DP
 
 
-_EP: Optional[GroupCoordinator] = None
+_EP: GroupCoordinator | None = None
 
 
 def get_ep_group() -> GroupCoordinator:
-    assert _EP is not None, ("expert parallel group is not initialized")
+    assert _EP is not None, "expert parallel group is not initialized"
     return _EP
 
 
 def get_pp_group() -> GroupCoordinator:
-    assert _PP is not None, (
-        "pipeline model parallel group is not initialized")
+    assert _PP is not None, "pipeline model parallel group is not initialized"
     return _PP
 
 
-@deprecated("`get_pipeline_model_parallel_group` has been replaced with "
-            "`get_pp_group` and may be removed in v0.12. Please use "
-            "`get_pp_group` instead.")
+@deprecated(
+    "`get_pipeline_model_parallel_group` has been replaced with "
+    "`get_pp_group` and may be removed in v0.12. Please use "
+    "`get_pp_group` instead."
+)
 def get_pipeline_model_parallel_group():
     return get_pp_group()
 
@@ -1005,8 +1144,7 @@ def graph_capture(device: torch.device):
     from other kernels possibly launched on background in the default stream.
     """
     context = GraphCaptureContext(torch.cuda.Stream(device=device))
-    with get_tp_group().graph_capture(context), get_pp_group().graph_capture(
-            context):
+    with get_tp_group().graph_capture(context), get_pp_group().graph_capture(context):
         yield context
 
 
@@ -1020,21 +1158,37 @@ def set_custom_all_reduce(enable: bool):
     _ENABLE_CUSTOM_ALL_REDUCE = enable
 
 
-def init_distributed_environment(world_size: int = -1,
-                                 rank: int = -1,
-                                 distributed_init_method: str = "env://",
-                                 local_rank: int = -1,
-                                 backend: str = "nccl",
-                                 timeout: Optional[timedelta] = None):
+def init_distributed_environment(
+    world_size: int = -1,
+    rank: int = -1,
+    distributed_init_method: str = "env://",
+    local_rank: int = -1,
+    backend: str = "nccl",
+    timeout: timedelta | None = None,
+):
     logger.debug(
-        "world_size=%d rank=%d local_rank=%d "
-        "distributed_init_method=%s backend=%s", world_size, rank, local_rank,
-        distributed_init_method, backend)
+        "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
+        world_size,
+        rank,
+        local_rank,
+        distributed_init_method,
+        backend,
+    )
     from vllm.config import get_current_vllm_config
+
     config = get_current_vllm_config()
-    if config is not None and config.parallel_config.data_parallel_size > 1 \
-        and config.parallel_config.distributed_executor_backend \
-        != "external_launcher":
+    if config is not None and config.parallel_config.nnodes > 1:
+        parallel_config = config.parallel_config
+        ip = parallel_config.master_addr
+        rank = parallel_config.data_parallel_rank * world_size + rank
+        world_size = parallel_config.world_size_across_dp
+        port = parallel_config.master_port
+        distributed_init_method = get_distributed_init_method(ip, port)
+    elif (
+        config is not None
+        and config.parallel_config.data_parallel_size > 1
+        and config.parallel_config.distributed_executor_backend != "external_launcher"
+    ):
         parallel_config = config.parallel_config
         # adjust to take into account data parallelism
         # offset the rank by the data parallel rank
@@ -1044,19 +1198,33 @@ def init_distributed_environment(world_size: int = -1,
         ip = parallel_config.data_parallel_master_ip
         port = parallel_config.get_next_dp_init_port()
         distributed_init_method = get_distributed_init_method(ip, port)
-        logger.info(
+        logger.debug(
             "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
-            world_size, rank, distributed_init_method)
+            world_size,
+            rank,
+            distributed_init_method,
+        )
     if not torch.distributed.is_initialized():
+        logger.info(
+            "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
+            world_size,
+            rank,
+            local_rank,
+            distributed_init_method,
+            backend,
+        )
         assert distributed_init_method is not None, (
             "distributed_init_method must be provided when initializing "
-            "distributed environment")
+            "distributed environment"
+        )
         if not torch.distributed.is_backend_available(backend):
             logger.warning(
-                "Distributed backend %s is not available; "
-                "falling back to gloo.", backend)
+                "Distributed backend %s is not available; falling back to gloo.",
+                backend,
+            )
             assert torch.distributed.is_gloo_available(), (
-                "Fallback Gloo backend is not available.")
+                "Fallback Gloo backend is not available."
+            )
             backend = "gloo"
         # this backend is used for WORLD
         torch.distributed.init_process_group(
@@ -1064,34 +1232,52 @@ def init_distributed_environment(world_size: int = -1,
             init_method=distributed_init_method,
             world_size=world_size,
             rank=rank,
-            timeout=timeout)
+            timeout=timeout,
+        )
     # set the local rank
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
     if local_rank == -1:
         # local rank not set, this usually happens in single-node
         # setting, where we can use rank as local rank
-        if distributed_init_method == "env://":
-            local_rank = envs.LOCAL_RANK
-        else:
-            local_rank = rank
-    global _WORLD, _NODE_COUNT
+        local_rank = envs.LOCAL_RANK if distributed_init_method == "env://" else rank
+    global _WORLD, _NODE_COUNT, _INNER_DP_WORLD
     if _WORLD is None:
         ranks = list(range(torch.distributed.get_world_size()))
         _WORLD = init_world_group(ranks, local_rank, backend)
-        _NODE_COUNT = _node_count(_WORLD.cpu_group)
-        logger.debug("Detected %d nodes in the distributed environment",
-                     _NODE_COUNT)
+        if config.parallel_config.nnodes > 1:
+            _NODE_COUNT = config.parallel_config.nnodes
+        else:
+            _NODE_COUNT = _node_count(_WORLD.cpu_group)
+        logger.debug("Detected %d nodes in the distributed environment", _NODE_COUNT)
     else:
         assert _WORLD.world_size == torch.distributed.get_world_size(), (
-            "world group already initialized with a different world size")
+            "world group already initialized with a different world size"
+        )
+    if config.parallel_config.nnodes_within_dp > 1:
+        if parallel_config.data_parallel_size > 1:
+            world_size_inner_dp = parallel_config.world_size
+            group_ranks = [
+                [dp_rank * world_size_inner_dp + i for i in range(world_size_inner_dp)]
+                for dp_rank in range(parallel_config.data_parallel_size)
+            ]
+            _INNER_DP_WORLD = init_model_parallel_group(
+                group_ranks,
+                get_world_group().local_rank,
+                backend,
+                use_message_queue_broadcaster=True,
+                group_name="inner_dp_world",
+                use_device_communicator=False,
+            )
+        else:
+            _INNER_DP_WORLD = _WORLD
 
 
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
-    decode_context_model_parallel_size: Optional[int] = 1,
-    backend: Optional[str] = None,
+    decode_context_model_parallel_size: int | None = 1,
+    backend: str | None = None,
 ) -> None:
     """
     Initialize model parallel groups.
@@ -1120,11 +1306,11 @@ def initialize_model_parallel(
     assert torch.distributed.is_initialized()
     world_size: int = torch.distributed.get_world_size()
     rank = torch.distributed.get_rank()
-    backend = backend or torch.distributed.get_backend(
-        get_world_group().device_group)
+    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
 
     data_parallel_size = 1
     from vllm.config import get_current_vllm_config
+
     config = get_current_vllm_config()
     if config is not None:
         data_parallel_size = config.parallel_config.data_parallel_size
@@ -1139,107 +1325,115 @@ def initialize_model_parallel(
     # to get group_ranks for each dimension, transpose that dimension to the
     # last dimension, then reshape to 2D, then unbind the last dimension
     all_ranks = torch.arange(world_size).reshape(
-        -1, data_parallel_size, pipeline_model_parallel_size,
-        tensor_model_parallel_size)  # noqa
+        -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size
+    )  # noqa
 
     # Build the tensor model-parallel groups.
     global _TP
-    assert _TP is None, ("tensor model parallel group is already initialized")
+    assert _TP is None, "tensor model parallel group is already initialized"
     group_ranks = all_ranks.view(-1, tensor_model_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
 
     # message queue broadcaster is only used in tensor model parallel group
-    _TP = init_model_parallel_group(group_ranks,
-                                    get_world_group().local_rank,
-                                    backend,
-                                    use_message_queue_broadcaster=True,
-                                    group_name="tp")
+    _TP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_message_queue_broadcaster=True,
+        group_name="tp",
+    )
 
     # Build the DCP model-parallel groups.
     global _DCP
-    assert _DCP is None, (
-        "decode context model parallel group is already initialized")
+    assert _DCP is None, "decode context model parallel group is already initialized"
     # Note(hc): In the current implementation of decode context parallel,
     # dcp_size must not exceed tp_size, because the world size does not
     # change by DCP, it simply reuses the GPUs of TP group, and split one
     # TP group into tp_size//dcp_size DCP groups.
-    group_ranks = all_ranks.reshape(
-        -1, decode_context_model_parallel_size).unbind(0)
+    group_ranks = all_ranks.reshape(-1, decode_context_model_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
-    _DCP = init_model_parallel_group(group_ranks,
-                                     get_world_group().local_rank,
-                                     backend,
-                                     use_message_queue_broadcaster=True,
-                                     group_name="dcp")
+    _DCP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_message_queue_broadcaster=True,
+        group_name="dcp",
+    )
 
     # Build the pipeline model-parallel groups.
     global _PP
-    assert _PP is None, (
-        "pipeline model parallel group is already initialized")
-    group_ranks = all_ranks.transpose(2, 3).reshape(
-        -1, pipeline_model_parallel_size).unbind(0)
+    assert _PP is None, "pipeline model parallel group is already initialized"
+    group_ranks = (
+        all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0)
+    )
     group_ranks = [x.tolist() for x in group_ranks]
-    _PP = init_model_parallel_group(group_ranks,
-                                    get_world_group().local_rank,
-                                    backend,
-                                    group_name="pp")
+    _PP = init_model_parallel_group(
+        group_ranks, get_world_group().local_rank, backend, group_name="pp"
+    )
 
     global _DP
-    assert _DP is None, ("data parallel group is already initialized")
-    group_ranks = all_ranks.transpose(1,
-                                      3).reshape(-1,
-                                                 data_parallel_size).unbind(0)
+    assert _DP is None, "data parallel group is already initialized"
+    group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
-    _DP = init_model_parallel_group(group_ranks,
-                                    get_world_group().local_rank,
-                                    backend,
-                                    group_name="dp")
+    _DP = init_model_parallel_group(
+        group_ranks, get_world_group().local_rank, backend, group_name="dp"
+    )
 
     global _EP
-    assert _EP is None, ("expert parallel group is already initialized")
-    group_ranks = all_ranks.transpose(1, 2).reshape(
-        -1, data_parallel_size * tensor_model_parallel_size).unbind(0)
+    assert _EP is None, "expert parallel group is already initialized"
+    group_ranks = (
+        all_ranks.transpose(1, 2)
+        .reshape(-1, data_parallel_size * tensor_model_parallel_size)
+        .unbind(0)
+    )
     group_ranks = [x.tolist() for x in group_ranks]
-    _EP = init_model_parallel_group(group_ranks,
-                                    get_world_group().local_rank,
-                                    backend,
-                                    group_name="ep")
+    _EP = init_model_parallel_group(
+        group_ranks, get_world_group().local_rank, backend, group_name="ep"
+    )
 
-    logger.info(
+    logger.info_once(
         "rank %s in world size %s is assigned as "
-        "DP rank %s, PP rank %s, TP rank %s, EP rank %s", rank, world_size,
-        _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group,
-        _EP.rank_in_group)
+        "DP rank %s, PP rank %s, TP rank %s, EP rank %s",
+        rank,
+        world_size,
+        _DP.rank_in_group,
+        _PP.rank_in_group,
+        _TP.rank_in_group,
+        _EP.rank_in_group,
+    )
 
 
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
     pipeline_model_parallel_size: int,
-    decode_context_model_parallel_size: Optional[int] = 1,
-    backend: Optional[str] = None,
+    decode_context_model_parallel_size: int | None = 1,
+    backend: str | None = None,
 ) -> None:
     """Helper to initialize model parallel groups if they are not initialized,
     or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
     values if the model parallel groups are initialized.
     """
-    backend = backend or torch.distributed.get_backend(
-        get_world_group().device_group)
+    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
     if not model_parallel_is_initialized():
-        initialize_model_parallel(tensor_model_parallel_size,
-                                  pipeline_model_parallel_size,
-                                  decode_context_model_parallel_size, backend)
+        initialize_model_parallel(
+            tensor_model_parallel_size,
+            pipeline_model_parallel_size,
+            decode_context_model_parallel_size,
+            backend,
+        )
         return
 
-    assert (
-        get_tensor_model_parallel_world_size() == tensor_model_parallel_size
-    ), ("tensor parallel group already initialized, but of unexpected size. "
+    assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
+        "tensor parallel group already initialized, but of unexpected size. "
         f"got: {get_tensor_model_parallel_world_size()=} vs. "
-        f"wanted: {tensor_model_parallel_size=}")
+        f"wanted: {tensor_model_parallel_size=}"
+    )
     pp_world_size = get_pp_group().world_size
-    assert (pp_world_size == pipeline_model_parallel_size), (
+    assert pp_world_size == pipeline_model_parallel_size, (
         "pipeline parallel group already initialized, but of unexpected size. "
         f"got: {pp_world_size=} vs. "
-        f"wanted: {pipeline_model_parallel_size=}")
+        f"wanted: {pipeline_model_parallel_size=}"
+    )
 
 
 def prepare_communication_buffer_for_model(model: torch.nn.Module):
@@ -1261,7 +1455,7 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
 
 def model_parallel_is_initialized():
     """Check if tensor and pipeline parallel groups are initialized."""
-    return (_TP is not None and _PP is not None)
+    return _TP is not None and _PP is not None
 
 
 _TP_STATE_PATCHED = False
@@ -1313,9 +1507,8 @@ def get_decode_context_model_parallel_rank():
 
 
 def get_node_count() -> int:
-    """Return the total number of nodes in the distributed environment. """
-    assert _NODE_COUNT is not None, (
-        "distributed environment is not initialized")
+    """Return the total number of nodes in the distributed environment."""
+    assert _NODE_COUNT is not None, "distributed environment is not initialized"
     return _NODE_COUNT
 
 
@@ -1359,13 +1552,18 @@ def destroy_distributed_environment():
 
 
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
+    # Ensure all objects are not freezed before cleanup
+    gc.unfreeze()
+
     destroy_model_parallel()
     destroy_distributed_environment()
     if shutdown_ray:
         import ray  # Lazy import Ray
+
         ray.shutdown()
     gc.collect()
     from vllm.platforms import current_platform
+
     empty_cache = current_platform.empty_cache
     if empty_cache is not None:
         empty_cache()
@@ -1373,21 +1571,21 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         if not current_platform.is_cpu():
             torch._C._host_emptyCache()
     except AttributeError:
-        logger.warning(
-            "torch._C._host_emptyCache() only available in Pytorch >=2.5")
+        logger.warning("torch._C._host_emptyCache() only available in Pytorch >=2.5")
 
 
-def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],
-                        source_rank: int = 0) -> list[bool]:
+def in_the_same_node_as(
+    pg: ProcessGroup | StatelessProcessGroup, source_rank: int = 0
+) -> list[bool]:
     """
     This is a collective operation that returns if each rank is in the same node
     as the source rank. It tests if processes are attached to the same
     memory system (shared access to shared memory).
     """
     if isinstance(pg, ProcessGroup):
-        assert torch.distributed.get_backend(
-            pg) != torch.distributed.Backend.NCCL, (
-                "in_the_same_node_as should be tested with a non-NCCL group.")
+        assert torch.distributed.get_backend(pg) != torch.distributed.Backend.NCCL, (
+            "in_the_same_node_as should be tested with a non-NCCL group."
+        )
         # local rank inside the group
         rank = torch.distributed.get_rank(group=pg)
         world_size = torch.distributed.get_world_size(group=pg)
@@ -1400,7 +1598,9 @@ def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],
         ranks = list(range(world_size))
 
     # local tensor in each process to store the result
-    is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)
+    is_in_the_same_node = torch.tensor(
+        [0] * world_size, dtype=torch.int32, device="cpu"
+    )
 
     magic_message = b"magic_message"
     shm = None
@@ -1410,10 +1610,11 @@ def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],
             if rank == source_rank:
                 # create a shared memory segment
                 shm = shared_memory.SharedMemory(create=True, size=128)
-                shm.buf[:len(magic_message)] = magic_message
+                shm.buf[: len(magic_message)] = magic_message
                 if isinstance(pg, ProcessGroup):
                     torch.distributed.broadcast_object_list(
-                        [shm.name], src=ranks[source_rank], group=pg)
+                        [shm.name], src=ranks[source_rank], group=pg
+                    )
                 else:
                     pg.broadcast_obj(shm.name, src=source_rank)
                 is_in_the_same_node[rank] = 1
@@ -1422,17 +1623,20 @@ def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],
                 if isinstance(pg, ProcessGroup):
                     recv = [None]
                     torch.distributed.broadcast_object_list(
-                        recv, src=ranks[source_rank], group=pg)
+                        recv, src=ranks[source_rank], group=pg
+                    )
                     name = recv[0]
                 else:
                     name = pg.broadcast_obj(None, src=source_rank)
                 # fix to https://stackoverflow.com/q/62748654/9191338
                 # Python incorrectly tracks shared memory even if it is not
                 # created by the process. The following patch is a workaround.
-                with patch("multiprocessing.resource_tracker.register",
-                           lambda *args, **kwargs: None):
+                with patch(
+                    "multiprocessing.resource_tracker.register",
+                    lambda *args, **kwargs: None,
+                ):
                     shm = shared_memory.SharedMemory(name=name)
-                if shm.buf[:len(magic_message)] == magic_message:
+                if shm.buf[: len(magic_message)] == magic_message:
                     is_in_the_same_node[rank] = 1
     except Exception as e:
         logger.error("Error ignored in is_in_the_same_node: %s", e)
@@ -1493,7 +1697,30 @@ def is_global_first_rank() -> bool:
         return True
 
 
-def _node_count(pg: Union[ProcessGroup, StatelessProcessGroup]) -> int:
+def is_local_first_rank() -> bool:
+    """
+    Check if the current process is the first local rank (rank 0 on its node).
+    """
+    try:
+        # prefer the initialized world group if available
+        global _WORLD
+        if _WORLD is not None:
+            return _WORLD.local_rank == 0
+
+        if not torch.distributed.is_initialized():
+            return True
+
+        # fallback to environment-provided local rank if available
+        # note: envs.LOCAL_RANK is set when using env:// launchers (e.g., torchrun)
+        try:
+            return int(envs.LOCAL_RANK) == 0  # type: ignore[arg-type]
+        except Exception:
+            return torch.distributed.get_rank() == 0
+    except Exception:
+        return True
+
+
+def _node_count(pg: ProcessGroup | StatelessProcessGroup) -> int:
     """
     Returns the total number of nodes in the process group.
 
diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py
index 0a786b4a1708f7c329d6332903180255d16a5287..4ff1f0ce4410a6dd0a1e9540cd6d83c955461c5f 100644
--- a/vllm/distributed/tpu_distributed_utils.py
+++ b/vllm/distributed/tpu_distributed_utils.py
@@ -10,18 +10,17 @@ import torch_xla.distributed.spmd as xs
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 
 logger = init_logger(__name__)
 
 
 class XlaQKVParallelLinear(nn.Module):
-
-    def __init__(self,
-                 qkv_linear: nn.Module,
-                 mesh: Optional["xs.Mesh"] = None):
+    def __init__(self, qkv_linear: nn.Module, mesh: Optional["xs.Mesh"] = None):
         super().__init__()
         assert isinstance(qkv_linear, QKVParallelLinear)
         self.skip_bias_add = qkv_linear.skip_bias_add
@@ -31,29 +30,30 @@ class XlaQKVParallelLinear(nn.Module):
         self.q_weight: Parameter
         self.k_weight: Parameter
         self.v_weight: Parameter
-        self.q_bias: Optional[Parameter]
-        self.k_bias: Optional[Parameter]
-        self.v_bias: Optional[Parameter]
+        self.q_bias: Parameter | None
+        self.k_bias: Parameter | None
+        self.v_bias: Parameter | None
         self._load_weights_from_qkv_linear(qkv_linear)
         if mesh is not None:
             self._shard_weight(mesh)
 
     def _shard_weight(self, mesh: "xs.Mesh"):
-        self.q_weight = Parameter(self.q_weight.to('xla'), requires_grad=False)
-        self.k_weight = Parameter(self.k_weight.to('xla'), requires_grad=False)
-        self.v_weight = Parameter(self.v_weight.to('xla'), requires_grad=False)
-        xs.mark_sharding(self.q_weight, mesh, ('x', None))
-        xs.mark_sharding(self.k_weight, mesh, ('x', None))
-        xs.mark_sharding(self.v_weight, mesh, ('x', None))
+        self.q_weight = Parameter(self.q_weight.to("xla"), requires_grad=False)
+        self.k_weight = Parameter(self.k_weight.to("xla"), requires_grad=False)
+        self.v_weight = Parameter(self.v_weight.to("xla"), requires_grad=False)
+        xs.mark_sharding(self.q_weight, mesh, ("x", None))
+        xs.mark_sharding(self.k_weight, mesh, ("x", None))
+        xs.mark_sharding(self.v_weight, mesh, ("x", None))
         if self.q_bias is not None:
-            assert self.k_bias is not None and self.v_bias is not None, \
+            assert self.k_bias is not None and self.v_bias is not None, (
                 "QKVParallelLinear should have q, k, and v biases together."
-            self.q_bias = Parameter(self.q_bias.to('xla'), requires_grad=False)
-            xs.mark_sharding(self.q_bias, mesh, ('x', ))
-            self.k_bias = Parameter(self.k_bias.to('xla'), requires_grad=False)
-            xs.mark_sharding(self.k_bias, mesh, ('x', ))
-            self.v_bias = Parameter(self.v_bias.to('xla'), requires_grad=False)
-            xs.mark_sharding(self.v_bias, mesh, ('x', ))
+            )
+            self.q_bias = Parameter(self.q_bias.to("xla"), requires_grad=False)
+            xs.mark_sharding(self.q_bias, mesh, ("x",))
+            self.k_bias = Parameter(self.k_bias.to("xla"), requires_grad=False)
+            xs.mark_sharding(self.k_bias, mesh, ("x",))
+            self.v_bias = Parameter(self.v_bias.to("xla"), requires_grad=False)
+            xs.mark_sharding(self.v_bias, mesh, ("x",))
 
     def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module):
         q_proj_size, k_proj_size, _ = qkv_linear.output_sizes
@@ -61,22 +61,25 @@ class XlaQKVParallelLinear(nn.Module):
         # along the output dimension.
         qkv_weight = qkv_linear.weight.data.cpu()
         q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False)
-        k_weight = Parameter(qkv_weight[q_proj_size:q_proj_size + k_proj_size],
-                             requires_grad=False)
-        v_weight = Parameter(qkv_weight[q_proj_size + k_proj_size:],
-                             requires_grad=False)
+        k_weight = Parameter(
+            qkv_weight[q_proj_size : q_proj_size + k_proj_size], requires_grad=False
+        )
+        v_weight = Parameter(
+            qkv_weight[q_proj_size + k_proj_size :], requires_grad=False
+        )
         self.register_parameter("q_weight", q_weight)
         self.register_parameter("k_weight", k_weight)
         self.register_parameter("v_weight", v_weight)
 
         if qkv_linear.bias is not None:
-            q_bias = Parameter(qkv_linear.bias[:q_proj_size],
-                               requires_grad=False)
-            k_bias = Parameter(qkv_linear.bias[q_proj_size:q_proj_size +
-                                               k_proj_size],
-                               requires_grad=False)
-            v_bias = Parameter(qkv_linear.bias[q_proj_size + k_proj_size:],
-                               requires_grad=False)
+            q_bias = Parameter(qkv_linear.bias[:q_proj_size], requires_grad=False)
+            k_bias = Parameter(
+                qkv_linear.bias[q_proj_size : q_proj_size + k_proj_size],
+                requires_grad=False,
+            )
+            v_bias = Parameter(
+                qkv_linear.bias[q_proj_size + k_proj_size :], requires_grad=False
+            )
             self.register_parameter("q_bias", q_bias)
             self.register_parameter("k_bias", k_bias)
             self.register_parameter("v_bias", v_bias)
@@ -102,42 +105,48 @@ class XlaQKVParallelLinear(nn.Module):
         # The concat and the following split will be noop, and should be
         # optimized away by the compiler.
         qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1)
-        output_bias = torch.cat([q_bias, k_bias, v_bias], dim=-1) if \
-                            self.skip_bias_add else None
+        output_bias = (
+            torch.cat([q_bias, k_bias, v_bias], dim=-1) if self.skip_bias_add else None
+        )
         if not self.return_bias:
             return qkv_proj
         return qkv_proj, output_bias
 
 
-def partition_column_parallel_linear(layer: torch.nn.Module,
-                                     mesh: xs.Mesh) -> torch.nn.Module:
+def partition_column_parallel_linear(
+    layer: torch.nn.Module, mesh: xs.Mesh
+) -> torch.nn.Module:
     assert isinstance(layer, ColumnParallelLinear)
-    xs.mark_sharding(layer.weight, mesh, ('x', None))
+    xs.mark_sharding(layer.weight, mesh, ("x", None))
     logger.debug("Applied column-parallel sharding to %s", layer)
     return layer
 
 
-def partition_row_parallel_linear(layer: torch.nn.Module,
-                                  mesh: xs.Mesh) -> torch.nn.Module:
+def partition_row_parallel_linear(
+    layer: torch.nn.Module, mesh: xs.Mesh
+) -> torch.nn.Module:
     assert isinstance(layer, RowParallelLinear)
-    xs.mark_sharding(layer.weight, mesh, (None, 'x'))
+    xs.mark_sharding(layer.weight, mesh, (None, "x"))
     logger.debug("Applied row-parallel sharding to %s", layer)
     return layer
 
 
-def partition_qkv_parallel_linear(layer: torch.nn.Module,
-                                  mesh: xs.Mesh) -> torch.nn.Module:
+def partition_qkv_parallel_linear(
+    layer: torch.nn.Module, mesh: xs.Mesh
+) -> torch.nn.Module:
     assert isinstance(layer, QKVParallelLinear)
     xla_layer = XlaQKVParallelLinear(layer, mesh)
     logger.debug("Applied qkv parallel sharding to %s", layer)
     return xla_layer
 
 
-MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict([
-    ("QKVParallelLinear", partition_qkv_parallel_linear),
-    ("ColumnParallelLinear", partition_column_parallel_linear),
-    ("RowParallelLinear", partition_row_parallel_linear),
-])
+MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict(
+    [
+        ("QKVParallelLinear", partition_qkv_parallel_linear),
+        ("ColumnParallelLinear", partition_column_parallel_linear),
+        ("RowParallelLinear", partition_row_parallel_linear),
+    ]
+)
 
 
 def get_fqn(module):
@@ -147,9 +156,9 @@ def get_fqn(module):
 
 def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None:
     """
-    Recursively check a PyTorch model and apply appropriate sharding based on 
+    Recursively check a PyTorch model and apply appropriate sharding based on
     the MODULE_TYPE_TO_WRAPPING_FUNC mapping.
-    
+
     Args:
         model: torch.nn.Module to process
         mesh: An XLA SPMD mesh object used for sharding
@@ -161,7 +170,8 @@ def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None:
                 wrapped_module = wrapping_func(module, mesh)
 
                 assert parent is not None and name is not None, (
-                    "Top Level module is not expected to be wrapped.")
+                    "Top Level module is not expected to be wrapped."
+                )
                 if wrapped_module is not module:
                     # Wrapped module and module are different py object.
                     # The original module should be replaced by the
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 67f71643d039c3f6b05e93172017f7c3baa77723..debf69c49b7d9c20d5c79d872124735deb95e09b 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -15,27 +15,31 @@ import uuid
 from collections import deque
 from collections.abc import Sequence
 from datetime import timedelta
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.distributed import ProcessGroup, TCPStore
-from torch.distributed.distributed_c10d import (Backend, PrefixStore,
-                                                _get_default_timeout,
-                                                _unregister_process_group)
+from torch.distributed.distributed_c10d import (
+    Backend,
+    PrefixStore,
+    _get_default_timeout,
+    _unregister_process_group,
+)
 from torch.distributed.rendezvous import rendezvous
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import get_tcp_uri, is_torch_equal_or_newer
+from vllm.utils.network_utils import get_tcp_uri
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 logger = init_logger(__name__)
 
 # We prefer to use os.sched_yield as it results in tighter polling loops,
 # measured to be around 3e-7 seconds. However on earlier versions of Python
 # os.sched_yield() does not release the GIL, so we fall back to time.sleep(0)
-USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1))
-                   or (sys.version_info[:2] == (3, 10)
-                       and sys.version_info[2] >= 8))
+USE_SCHED_YIELD = (sys.version_info[:3] >= (3, 11, 1)) or (
+    sys.version_info[:2] == (3, 10) and sys.version_info[2] >= 8
+)
 
 
 def sched_yield():
@@ -48,7 +52,8 @@ def sched_yield():
 def ensure_divisibility(numerator, denominator):
     """Ensure that numerator is divisible by the denominator."""
     assert numerator % denominator == 0, "{} is not divisible by {}".format(
-        numerator, denominator)
+        numerator, denominator
+    )
 
 
 def divide(numerator, denominator):
@@ -63,16 +68,16 @@ def split_tensor_along_last_dim(
     num_partitions: int,
     contiguous_split_chunks: bool = False,
 ) -> Sequence[torch.Tensor]:
-    """ Split a tensor along its last dimension.
+    """Split a tensor along its last dimension.
 
-        Arguments:
-            tensor: input tensor.
-            num_partitions: number of partitions to split the tensor
-            contiguous_split_chunks: If True, make each chunk contiguous
-                                     in memory.
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
 
-        Returns:
-            A list of Tensors
+    Returns:
+        A list of Tensors
     """
     # Get the size and dimension.
     last_dim = tensor.dim() - 1
@@ -86,8 +91,9 @@ def split_tensor_along_last_dim(
     return tensor_list
 
 
-def get_pp_indices(num_hidden_layers: int, pp_rank: int,
-                   pp_size: int) -> tuple[int, int]:
+def get_pp_indices(
+    num_hidden_layers: int, pp_rank: int, pp_size: int
+) -> tuple[int, int]:
     """Try to evenly distribute layers across partitions.
 
     If the number of layers is not divisible by the number of partitions,
@@ -104,17 +110,15 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int,
     partition_list_str = envs.VLLM_PP_LAYER_PARTITION
     if partition_list_str is not None:
         try:
-            partitions = [
-                int(layer) for layer in partition_list_str.split(",")
-            ]
+            partitions = [int(layer) for layer in partition_list_str.split(",")]
         except ValueError as err:
-            raise ValueError("Invalid partition string: {}".format(
-                partition_list_str)) from err
+            raise ValueError(
+                "Invalid partition string: {}".format(partition_list_str)
+            ) from err
         if len(partitions) != pp_size:
             raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
         if sum(partitions) != num_hidden_layers:
-            raise ValueError(
-                f"{sum(partitions)=} does not match {num_hidden_layers=}.")
+            raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.")
     else:
         layers_per_partition = num_hidden_layers // pp_size
         partitions = [layers_per_partition for _ in range(pp_size)]
@@ -126,7 +130,8 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int,
                 "Hidden layers were unevenly partitioned: [%s]. "
                 "This can be manually overridden using the "
                 "VLLM_PP_LAYER_PARTITION environment variable",
-                ",".join(str(p) for p in partitions))
+                ",".join(str(p) for p in partitions),
+            )
 
     start_layer = sum(partitions[:pp_rank])
     end_layer = start_layer + partitions[pp_rank]
@@ -140,12 +145,13 @@ class StatelessProcessGroup:
     group. Only use it to communicate metadata between processes.
     For data-plane communication, create NCCL-related objects.
     """
+
     rank: int
     world_size: int
     store: torch._C._distributed_c10d.Store
 
     # stores a reference to the socket so that the file descriptor stays alive
-    socket: Optional[socket.socket]
+    socket: socket.socket | None
 
     data_expiration_seconds: int = 3600  # 1 hour
 
@@ -154,21 +160,16 @@ class StatelessProcessGroup:
     # src rank -> counter
     recv_src_counter: dict[int, int] = dataclasses.field(default_factory=dict)
     broadcast_send_counter: int = 0
-    broadcast_recv_src_counter: dict[int, int] = dataclasses.field(
-        default_factory=dict)
+    broadcast_recv_src_counter: dict[int, int] = dataclasses.field(default_factory=dict)
 
     # A deque to store the data entries, with key and timestamp.
-    entries: deque[tuple[str,
-                         float]] = dataclasses.field(default_factory=deque)
+    entries: deque[tuple[str, float]] = dataclasses.field(default_factory=deque)
 
     def __post_init__(self):
         assert self.rank < self.world_size
         self.send_dst_counter = {i: 0 for i in range(self.world_size)}
         self.recv_src_counter = {i: 0 for i in range(self.world_size)}
-        self.broadcast_recv_src_counter = {
-            i: 0
-            for i in range(self.world_size)
-        }
+        self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)}
 
     def send_obj(self, obj: Any, dst: int):
         """Send an object to a destination rank."""
@@ -192,27 +193,25 @@ class StatelessProcessGroup:
     def recv_obj(self, src: int) -> Any:
         """Receive an object from a source rank."""
         obj = pickle.loads(
-            self.store.get(
-                f"send_to/{self.rank}/{self.recv_src_counter[src]}"))
+            self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}")
+        )
         self.recv_src_counter[src] += 1
         return obj
 
-    def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
+    def broadcast_obj(self, obj: Any | None, src: int) -> Any:
         """Broadcast an object from a source rank to all other ranks.
         It does not clean up after all ranks have received the object.
         Use it for limited times, e.g., for initialization.
         """
         if self.rank == src:
             self.expire_data()
-            key = (f"broadcast_from/{src}/"
-                   f"{self.broadcast_send_counter}")
+            key = f"broadcast_from/{src}/{self.broadcast_send_counter}"
             self.store.set(key, pickle.dumps(obj))
             self.broadcast_send_counter += 1
             self.entries.append((key, time.time()))
             return obj
         else:
-            key = (f"broadcast_from/{src}/"
-                   f"{self.broadcast_recv_src_counter[src]}")
+            key = f"broadcast_from/{src}/{self.broadcast_recv_src_counter[src]}"
             recv_obj = pickle.loads(self.store.get(key))
             self.broadcast_recv_src_counter[src] += 1
             return recv_obj
@@ -278,8 +277,7 @@ class StatelessProcessGroup:
             # Check for timeout
             cur_time = time.time()
             if cur_time - start_time > timeout:
-                raise RuntimeError("Barrier timed out after %f seconds",
-                                   timeout)
+                raise RuntimeError(f"Barrier timed out after {timeout:.2f} seconds")
 
             # Check for each process
             for i in range(self.world_size):
@@ -326,8 +324,9 @@ class StatelessProcessGroup:
         while len(processes_departed) < self.world_size:
             # Check for timeout
             if time.time() - start_time > timeout:
-                raise RuntimeError("Barrier departure timed out after %f s",
-                                   timeout)
+                raise RuntimeError(
+                    f"Barrier departure timed out after {timeout:.2f} seconds"
+                )
 
             # Check for each process
             for i in range(self.world_size):
@@ -356,14 +355,12 @@ class StatelessProcessGroup:
             try:
                 self.store.delete_key(f"arrival_{barrier_id}_{i}")
             except Exception:
-                logger.debug("Error deleting key: %s",
-                             f'arrival_{barrier_id}_{i}')
+                logger.debug("Error deleting key: %s", f"arrival_{barrier_id}_{i}")
 
             try:
                 self.store.delete_key(f"departure_{barrier_id}_{i}")
             except Exception:
-                logger.debug("Error deleting key: %s",
-                             f'departure_{barrier_id}_{i}')
+                logger.debug("Error deleting key: %s", f"departure_{barrier_id}_{i}")
 
     @staticmethod
     def create(
@@ -388,7 +385,7 @@ class StatelessProcessGroup:
         used for exchanging metadata. With this function, process A and process B
         can call `StatelessProcessGroup.create` to form a group, and then process A, B,
         C, and D can call `StatelessProcessGroup.create` to form another group.
-        """ # noqa
+        """  # noqa
         launch_server = rank == 0
         if launch_server:
             # listen on the specified interface (instead of 0.0.0.0)
@@ -416,14 +413,18 @@ class StatelessProcessGroup:
             world_size=world_size,
             store=store,
             socket=listen_socket,
-            data_expiration_seconds=data_expiration_seconds)
+            data_expiration_seconds=data_expiration_seconds,
+        )
 
 
-def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
-                            group_rank: int, group_size: int,
-                            timeout: timedelta) -> ProcessGroup:
+def init_gloo_process_group(
+    prefix_store: PrefixStore,
+    group_rank: int,
+    group_size: int,
+    timeout: timedelta,
+) -> ProcessGroup:
     """
-    Stateless init ProcessGroup with gloo backend compatible with 
+    Stateless init ProcessGroup with gloo backend compatible with
     different torch versions.
     """
     if is_torch_equal_or_newer("2.6"):
@@ -433,7 +434,7 @@ def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
             group_size,
         )
     else:
-        options = ProcessGroup.Options(backend=backend)
+        options = ProcessGroup.Options(backend="gloo")
         pg = ProcessGroup(
             prefix_store,
             group_rank,
@@ -441,10 +442,10 @@ def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
             options,
         )
     from torch.distributed.distributed_c10d import ProcessGroupGloo
-    backend_class = ProcessGroupGloo(prefix_store,
-                                     group_rank,
-                                     group_size,
-                                     timeout=timeout)
+
+    backend_class = ProcessGroupGloo(
+        prefix_store, group_rank, group_size, timeout=timeout
+    )
     backend_type = ProcessGroup.BackendType.GLOO
     device = torch.device("cpu")
     if is_torch_equal_or_newer("2.6"):
@@ -457,8 +458,8 @@ def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
 
 
 def stateless_init_torch_distributed_process_group(
-        host: str, port: int, rank: int, world_size: int,
-        backend: str) -> ProcessGroup:
+    host: str, port: int, rank: int, world_size: int, backend: str
+) -> ProcessGroup:
     """
     A replacement for `torch.distributed.init_process_group` that does not
     pollute the global state. The created ProcessGroup object can be used for
@@ -495,7 +496,8 @@ def stateless_init_torch_distributed_process_group(
     timeout = _get_default_timeout(backend)
 
     store, rank, world_size = next(
-        rendezvous(init_method, rank, world_size, timeout=timeout))
+        rendezvous(init_method, rank, world_size, timeout=timeout)
+    )
     store.set_timeout(timeout)
 
     group_rank = rank
@@ -504,24 +506,28 @@ def stateless_init_torch_distributed_process_group(
     # Use a PrefixStore to avoid accidental overrides of keys used by
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
+    try:
+        from vllm.platforms import current_platform
+
+        return current_platform.stateless_init_device_torch_dist_pg(
+            backend=backend,
+            prefix_store=prefix_store,
+            group_rank=group_rank,
+            group_size=group_size,
+            timeout=timeout,
+        )
+    except NotImplementedError:
+        # If platform doesn't implement stateless_init_device_torch_dist_pg, it
+        # will raise a NotImplementedError. In this case, we fall back to gloo.
+        return init_gloo_process_group(
+            prefix_store=prefix_store,
+            group_rank=group_rank,
+            group_size=group_size,
+            timeout=timeout,
+        )
 
-    if backend == "gloo":
-        return init_gloo_process_group(backend=backend,
-                                       prefix_store=prefix_store,
-                                       group_rank=group_rank,
-                                       group_size=group_size,
-                                       timeout=timeout)
-    from vllm.platforms import current_platform
-    return current_platform.stateless_init_device_torch_dist_pg(
-        backend=backend,
-        prefix_store=prefix_store,
-        group_rank=group_rank,
-        group_size=group_size,
-        timeout=timeout)
-
-
-def stateless_destroy_torch_distributed_process_group(
-        pg: ProcessGroup) -> None:
+
+def stateless_destroy_torch_distributed_process_group(pg: ProcessGroup) -> None:
     """
     Destroy ProcessGroup returned by
         stateless_init_torch_distributed_process_group().
@@ -531,6 +537,7 @@ def stateless_destroy_torch_distributed_process_group(
     else:
         # Lazy import for non-CUDA backends.
         from torch.distributed.distributed_c10d import _shutdown_backend
+
         _shutdown_backend(pg)
 
     _unregister_process_group(pg.group_name)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7fbf0879308d86a2bd6b6efcea4cd5dc6a122dab..ab6e5e594c239c801f6dd3cc205cec53c312ef89 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,62 +1,104 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# yapf: disable
 import argparse
 import copy
 import dataclasses
 import functools
 import json
 import sys
+from collections.abc import Callable
 from dataclasses import MISSING, dataclass, fields, is_dataclass
 from itertools import permutations
-from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
-                    Literal, Optional, Type, TypeVar, Union, cast, get_args,
-                    get_origin)
+from types import UnionType
+from typing import (
+    TYPE_CHECKING,
+    Annotated,
+    Any,
+    Literal,
+    TypeAlias,
+    TypeVar,
+    Union,
+    cast,
+    get_args,
+    get_origin,
+)
 
 import huggingface_hub
 import regex as re
 import torch
 from pydantic import TypeAdapter, ValidationError
+from pydantic.fields import FieldInfo
 from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
-from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
-                         ConfigType, ConvertOption, DetailedTraceModules,
-                         Device, DeviceConfig, DistributedExecutorBackend,
-                         EPLBConfig, HfOverrides, KVEventsConfig,
-                         KVTransferConfig, LoadConfig, LogprobsMode,
-                         LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
-                         ModelDType, ObservabilityConfig, ParallelConfig,
-                         PoolerConfig, PrefixCachingHashAlgo, RunnerOption,
-                         SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
-                         StructuredOutputsConfig, TaskOption, TokenizerMode,
-                         VllmConfig, get_attr_docs)
-from vllm.config.multimodal import MMCacheType, MultiModalConfig
-from vllm.config.parallel import ExpertPlacementStrategy
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    ConfigType,
+    DeviceConfig,
+    ECTransferConfig,
+    EPLBConfig,
+    KVEventsConfig,
+    KVTransferConfig,
+    LoadConfig,
+    LoRAConfig,
+    ModelConfig,
+    MultiModalConfig,
+    ObservabilityConfig,
+    ParallelConfig,
+    PoolerConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    StructuredOutputsConfig,
+    VllmConfig,
+    get_attr_docs,
+)
+from vllm.config.cache import (
+    BlockSize,
+    CacheDType,
+    KVOffloadingBackend,
+    MambaDType,
+    PrefixCachingHashAlgo,
+)
+from vllm.config.device import Device
+from vllm.config.model import (
+    ConvertOption,
+    HfOverrides,
+    LogprobsMode,
+    ModelDType,
+    RunnerOption,
+    TaskOption,
+    TokenizerMode,
+)
+from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
+from vllm.config.observability import DetailedTraceModules
+from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
+from vllm.config.scheduler import SchedulerPolicy
 from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
-from vllm.ray.lazy_utils import is_ray_initialized
-from vllm.reasoning import ReasoningParserManager
-from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
-from vllm.transformers_utils.config import (get_model_path, is_interleaved,
-                                            maybe_override_with_speculators)
-from vllm.transformers_utils.utils import check_gguf_file
-from vllm.utils import (FlexibleArgumentParser, GiB_bytes, get_ip,
-                        is_in_ray_actor)
+from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
+from vllm.transformers_utils.config import (
+    get_model_path,
+    is_interleaved,
+    maybe_override_with_speculators,
+)
+from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.network_utils import get_ip
 from vllm.v1.sample.logits_processor import LogitsProcessor
 
-# yapf: enable
-
 if TYPE_CHECKING:
-    from vllm.executor.executor_base import ExecutorBase
     from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.model_executor.model_loader import LoadFormats
     from vllm.usage.usage_lib import UsageContext
+    from vllm.v1.executor import Executor
 else:
-    ExecutorBase = Any
+    Executor = Any
     QuantizationMethods = Any
     LoadFormats = Any
     UsageContext = Any
@@ -65,26 +107,24 @@ logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
 T = TypeVar("T")
-TypeHint = Union[type[Any], object]
-TypeHintT = Union[type[T], object]
+TypeHint: TypeAlias = type[Any] | object
+TypeHintT: TypeAlias = type[T] | object
 
 
 def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
-
     def _parse_type(val: str) -> T:
         try:
             return return_type(val)
         except ValueError as e:
             raise argparse.ArgumentTypeError(
-                f"Value {val} cannot be converted to {return_type}.") from e
+                f"Value {val} cannot be converted to {return_type}."
+            ) from e
 
     return _parse_type
 
 
-def optional_type(
-        return_type: Callable[[str], T]) -> Callable[[str], Optional[T]]:
-
-    def _optional_type(val: str) -> Optional[T]:
+def optional_type(return_type: Callable[[str], T]) -> Callable[[str], T | None]:
+    def _optional_type(val: str) -> T | None:
         if val == "" or val == "None":
             return None
         return parse_type(return_type)(val)
@@ -92,7 +132,7 @@ def optional_type(
     return _optional_type
 
 
-def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
+def union_dict_and_str(val: str) -> str | dict[str, str] | None:
     if not re.match(r"(?s)^\s*{.*}\s*$", val):
         return str(val)
     return optional_type(json.loads)(val)
@@ -124,11 +164,37 @@ def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]:
     if not all(isinstance(option, option_type) for option in options):
         raise ValueError(
             "All options must be of the same type. "
-            f"Got {options} with types {[type(c) for c in options]}")
+            f"Got {options} with types {[type(c) for c in options]}"
+        )
     kwarg = "metavar" if contains_type(type_hints, str) else "choices"
     return {"type": option_type, kwarg: sorted(options)}
 
 
+def collection_to_kwargs(type_hints: set[TypeHint], type: TypeHint) -> dict[str, Any]:
+    type_hint = get_type(type_hints, type)
+    types = get_args(type_hint)
+    elem_type = types[0]
+
+    # Handle Ellipsis
+    assert all(t is elem_type for t in types if t is not Ellipsis), (
+        f"All non-Ellipsis elements must be of the same type. Got {types}."
+    )
+
+    # Handle Union types
+    if get_origin(elem_type) in {Union, UnionType}:
+        # Union for Union[X, Y] and UnionType for X | Y
+        assert str in get_args(elem_type), (
+            "If element can have multiple types, one must be 'str' "
+            f"(i.e. 'list[int | str]'). Got {elem_type}."
+        )
+        elem_type = str
+
+    return {
+        "type": elem_type,
+        "nargs": "+" if type is not tuple or Ellipsis in types else len(types),
+    }
+
+
 def is_not_builtin(type_hint: TypeHint) -> bool:
     """Check if the class is not a built-in type."""
     return type_hint.__module__ != "builtins"
@@ -142,7 +208,8 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
 
     if origin is Annotated:
         type_hints.update(get_type_hints(args[0]))
-    elif origin is Union:
+    elif origin in {Union, UnionType}:
+        # Union for Union[X, Y] and UnionType for X | Y
         for arg in args:
             type_hints.update(get_type_hints(arg))
     else:
@@ -163,7 +230,7 @@ NEEDS_HELP = (
 
 
 @functools.lru_cache(maxsize=30)
-def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
+def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
     # Save time only getting attr docs if we're generating help text
     cls_docs = get_attr_docs(cls) if NEEDS_HELP else {}
     kwargs = {}
@@ -178,6 +245,13 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         # Get the default value of the field
         if field.default is not MISSING:
             default = field.default
+            # Handle pydantic.Field defaults
+            if isinstance(default, FieldInfo):
+                default = (
+                    default.default
+                    if default.default_factory is None
+                    else default.default_factory()
+                )
         elif field.default_factory is not MISSING:
             default = field.default_factory()
 
@@ -191,8 +265,9 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         kwargs[name] = {"default": default, "help": help}
 
         # Set other kwargs based on the type hints
-        json_tip = ("Should either be a valid JSON string or JSON keys passed "
-                    "individually.")
+        json_tip = (
+            "Should either be a valid JSON string or JSON keys passed individually."
+        )
         if dataclass_cls is not None:
 
             def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
@@ -209,24 +284,11 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         elif contains_type(type_hints, Literal):
             kwargs[name].update(literal_to_kwargs(type_hints))
         elif contains_type(type_hints, tuple):
-            type_hint = get_type(type_hints, tuple)
-            types = get_args(type_hint)
-            tuple_type = types[0]
-            assert all(t is tuple_type for t in types if t is not Ellipsis), (
-                "All non-Ellipsis tuple elements must be of the same "
-                f"type. Got {types}.")
-            kwargs[name]["type"] = tuple_type
-            kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types)
+            kwargs[name].update(collection_to_kwargs(type_hints, tuple))
         elif contains_type(type_hints, list):
-            type_hint = get_type(type_hints, list)
-            types = get_args(type_hint)
-            list_type = types[0]
-            if get_origin(list_type) is Union:
-                msg = "List type must contain str if it is a Union."
-                assert str in get_args(list_type), msg
-                list_type = str
-            kwargs[name]["type"] = list_type
-            kwargs[name]["nargs"] = "+"
+            kwargs[name].update(collection_to_kwargs(type_hints, list))
+        elif contains_type(type_hints, set):
+            kwargs[name].update(collection_to_kwargs(type_hints, set))
         elif contains_type(type_hints, int):
             kwargs[name]["type"] = int
             # Special case for large integers
@@ -240,19 +302,20 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
                 kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
-        elif (contains_type(type_hints, dict)
-              and (contains_type(type_hints, str)
-                   or any(is_not_builtin(th) for th in type_hints))):
+        elif contains_type(type_hints, dict) and (
+            contains_type(type_hints, str)
+            or any(is_not_builtin(th) for th in type_hints)
+        ):
             kwargs[name]["type"] = union_dict_and_str
         elif contains_type(type_hints, dict):
             kwargs[name]["type"] = parse_type(json.loads)
             kwargs[name]["help"] += f"\n\n{json_tip}"
-        elif (contains_type(type_hints, str)
-              or any(is_not_builtin(th) for th in type_hints)):
+        elif contains_type(type_hints, str) or any(
+            is_not_builtin(th) for th in type_hints
+        ):
             kwargs[name]["type"] = str
         else:
-            raise ValueError(
-                f"Unsupported type {type_hints} for argument {name}.")
+            raise ValueError(f"Unsupported type {type_hints} for argument {name}.")
 
         # If the type hint was a sequence of literals, use the helper function
         # to update the type and choices
@@ -268,7 +331,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
     return kwargs
 
 
-def get_kwargs(cls: ConfigType) -> dict[str, Any]:
+def get_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
     """Return argparse kwargs for the given Config dataclass.
 
     If `--help` or `mkdocs` are not present in the command line command, the
@@ -284,191 +347,208 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
 @dataclass
 class EngineArgs:
     """Arguments for vLLM engine."""
+
     model: str = ModelConfig.model
-    served_model_name: Optional[Union[
-        str, List[str]]] = ModelConfig.served_model_name
-    tokenizer: Optional[str] = ModelConfig.tokenizer
-    hf_config_path: Optional[str] = ModelConfig.hf_config_path
+    served_model_name: str | list[str] | None = ModelConfig.served_model_name
+    tokenizer: str | None = ModelConfig.tokenizer
+    hf_config_path: str | None = ModelConfig.hf_config_path
     runner: RunnerOption = ModelConfig.runner
     convert: ConvertOption = ModelConfig.convert
-    task: Optional[TaskOption] = ModelConfig.task
+    task: TaskOption | None = ModelConfig.task
     skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
     tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
-    allowed_media_domains: Optional[
-        list[str]] = ModelConfig.allowed_media_domains
-    download_dir: Optional[str] = LoadConfig.download_dir
+    allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
+    download_dir: str | None = LoadConfig.download_dir
     safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
-    load_format: Union[str, LoadFormats] = LoadConfig.load_format
+    load_format: str | LoadFormats = LoadConfig.load_format
     config_format: str = ModelConfig.config_format
     dtype: ModelDType = ModelConfig.dtype
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
-    seed: Optional[int] = ModelConfig.seed
-    max_model_len: Optional[int] = ModelConfig.max_model_len
-    cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
-                                            "cuda_graph_sizes")
+    seed: int | None = ModelConfig.seed
+    max_model_len: int | None = ModelConfig.max_model_len
+    cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
+    cudagraph_capture_sizes: list[int] | None = (
+        CompilationConfig.cudagraph_capture_sizes
+    )
+    max_cudagraph_capture_size: int | None = get_field(
+        CompilationConfig, "max_cudagraph_capture_size"
+    )
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
-    distributed_executor_backend: Optional[Union[
-        str, DistributedExecutorBackend,
-        Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
+    distributed_executor_backend: (
+        str | DistributedExecutorBackend | type[Executor] | None
+    ) = ParallelConfig.distributed_executor_backend
     # number of P/D disaggregation (or other disaggregation) workers
     pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
+    master_addr: str = ParallelConfig.master_addr
+    master_port: int = ParallelConfig.master_port
+    nnodes: int = ParallelConfig.nnodes
+    node_rank: int = ParallelConfig.node_rank
     tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
-    decode_context_parallel_size: int = \
-        ParallelConfig.decode_context_parallel_size
+    decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
+    dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size
     data_parallel_size: int = ParallelConfig.data_parallel_size
-    data_parallel_rank: Optional[int] = None
-    data_parallel_start_rank: Optional[int] = None
-    data_parallel_size_local: Optional[int] = None
-    data_parallel_address: Optional[str] = None
-    data_parallel_rpc_port: Optional[int] = None
+    data_parallel_rank: int | None = None
+    data_parallel_start_rank: int | None = None
+    data_parallel_size_local: int | None = None
+    data_parallel_address: str | None = None
+    data_parallel_rpc_port: int | None = None
     data_parallel_hybrid_lb: bool = False
+    data_parallel_external_lb: bool = False
     data_parallel_backend: str = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
+    all2all_backend: str | None = ParallelConfig.all2all_backend
     enable_dbo: bool = ParallelConfig.enable_dbo
-    dbo_decode_token_threshold: int = \
-        ParallelConfig.dbo_decode_token_threshold
-    dbo_prefill_token_threshold: int = \
-        ParallelConfig.dbo_prefill_token_threshold
+    dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
+    dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
+    disable_nccl_for_dp_synchronization: bool = (
+        ParallelConfig.disable_nccl_for_dp_synchronization
+    )
     eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
     enable_eplb: bool = ParallelConfig.enable_eplb
-    expert_placement_strategy: ExpertPlacementStrategy = \
+    expert_placement_strategy: ExpertPlacementStrategy = (
         ParallelConfig.expert_placement_strategy
+    )
     _api_process_count: int = ParallelConfig._api_process_count
     _api_process_rank: int = ParallelConfig._api_process_rank
     num_redundant_experts: int = EPLBConfig.num_redundant_experts
     eplb_window_size: int = EPLBConfig.window_size
     eplb_step_interval: int = EPLBConfig.step_interval
     eplb_log_balancedness: bool = EPLBConfig.log_balancedness
-    max_parallel_loading_workers: Optional[
-        int] = ParallelConfig.max_parallel_loading_workers
-    block_size: Optional[BlockSize] = CacheConfig.block_size
-    enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
-    prefix_caching_hash_algo: PrefixCachingHashAlgo = \
+    max_parallel_loading_workers: int | None = (
+        ParallelConfig.max_parallel_loading_workers
+    )
+    block_size: BlockSize | None = CacheConfig.block_size
+    enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching
+    prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
+    )
     disable_sliding_window: bool = ModelConfig.disable_sliding_window
     disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
     swap_space: float = CacheConfig.swap_space
     cpu_offload_gb: float = CacheConfig.cpu_offload_gb
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
-    kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
-    max_num_batched_tokens: Optional[
-        int] = SchedulerConfig.max_num_batched_tokens
+    kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
+    max_num_batched_tokens: int | None = None
     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
     max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
-    long_prefill_token_threshold: int = \
-        SchedulerConfig.long_prefill_token_threshold
-    max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
+    long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
+    max_num_seqs: int | None = None
     max_logprobs: int = ModelConfig.max_logprobs
     logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
     disable_log_stats: bool = False
-    revision: Optional[str] = ModelConfig.revision
-    code_revision: Optional[str] = ModelConfig.code_revision
-    rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
-    rope_theta: Optional[float] = ModelConfig.rope_theta
-    hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
+    aggregate_engine_logging: bool = False
+    revision: str | None = ModelConfig.revision
+    code_revision: str | None = ModelConfig.code_revision
+    hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
-    tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
-    quantization: Optional[QuantizationMethods] = ModelConfig.quantization
+    tokenizer_revision: str | None = ModelConfig.tokenizer_revision
+    quantization: QuantizationMethods | None = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
-    limit_mm_per_prompt: dict[str, int] = \
-        get_field(MultiModalConfig, "limit_per_prompt")
+    limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
+        MultiModalConfig, "limit_per_prompt"
+    )
+    enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
     interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
-    media_io_kwargs: dict[str, dict[str,
-                                    Any]] = get_field(MultiModalConfig,
-                                                      "media_io_kwargs")
-    mm_processor_kwargs: Optional[Dict[str, Any]] = \
-        MultiModalConfig.mm_processor_kwargs
+    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
+        MultiModalConfig, "media_io_kwargs"
+    )
+    mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = False  # DEPRECATED
     mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
-    mm_processor_cache_type: Optional[MMCacheType] = \
+    mm_processor_cache_type: MMCacheType | None = (
         MultiModalConfig.mm_processor_cache_type
-    mm_shm_cache_max_object_size_mb: int = \
+    )
+    mm_shm_cache_max_object_size_mb: int = (
         MultiModalConfig.mm_shm_cache_max_object_size_mb
+    )
     mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
-    io_processor_plugin: Optional[str] = None
+    mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
+        MultiModalConfig.mm_encoder_attn_backend
+    )
+    io_processor_plugin: str | None = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     video_pruning_rate: float = MultiModalConfig.video_pruning_rate
     # LoRA fields
     enable_lora: bool = False
-    enable_lora_bias: bool = LoRAConfig.bias_enabled
     max_loras: int = LoRAConfig.max_loras
     max_lora_rank: int = LoRAConfig.max_lora_rank
-    default_mm_loras: Optional[Dict[str, str]] = \
-        LoRAConfig.default_mm_loras
+    default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
-    max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
-    lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
+    max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
+    lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
     lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
 
     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
-    num_gpu_blocks_override: Optional[
-        int] = CacheConfig.num_gpu_blocks_override
+    num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
     num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
-    model_loader_extra_config: dict = \
-        get_field(LoadConfig, "model_loader_extra_config")
-    ignore_patterns: Optional[Union[str,
-                                    List[str]]] = LoadConfig.ignore_patterns
+    model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
+    ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
 
-    enable_chunked_prefill: Optional[
-        bool] = SchedulerConfig.enable_chunked_prefill
+    enable_chunked_prefill: bool | None = None
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
 
     disable_hybrid_kv_cache_manager: bool = (
-        SchedulerConfig.disable_hybrid_kv_cache_manager)
+        SchedulerConfig.disable_hybrid_kv_cache_manager
+    )
 
     structured_outputs_config: StructuredOutputsConfig = get_field(
-        VllmConfig, "structured_outputs_config")
+        VllmConfig, "structured_outputs_config"
+    )
     reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
+    reasoning_parser_plugin: str | None = None
     # Deprecated guided decoding fields
-    guided_decoding_backend: Optional[str] = None
-    guided_decoding_disable_fallback: Optional[bool] = None
-    guided_decoding_disable_any_whitespace: Optional[bool] = None
-    guided_decoding_disable_additional_properties: Optional[bool] = None
+    guided_decoding_backend: str | None = None
+    guided_decoding_disable_fallback: bool | None = None
+    guided_decoding_disable_any_whitespace: bool | None = None
+    guided_decoding_disable_additional_properties: bool | None = None
 
-    logits_processor_pattern: Optional[
-        str] = ModelConfig.logits_processor_pattern
+    logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
 
-    speculative_config: Optional[Dict[str, Any]] = None
+    speculative_config: dict[str, Any] | None = None
 
-    show_hidden_metrics_for_version: Optional[str] = \
+    show_hidden_metrics_for_version: str | None = (
         ObservabilityConfig.show_hidden_metrics_for_version
-    otlp_traces_endpoint: Optional[str] = \
-        ObservabilityConfig.otlp_traces_endpoint
-    collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
+    )
+    otlp_traces_endpoint: str | None = ObservabilityConfig.otlp_traces_endpoint
+    collect_detailed_traces: list[DetailedTraceModules] | None = (
         ObservabilityConfig.collect_detailed_traces
+    )
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
-    scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
+    scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
 
-    pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config
-    override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
+    pooler_config: PoolerConfig | None = ModelConfig.pooler_config
+    override_pooler_config: dict | PoolerConfig | None = (
         ModelConfig.override_pooler_config
-    compilation_config: CompilationConfig = \
-        get_field(VllmConfig, "compilation_config")
+    )
+    compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
     worker_cls: str = ParallelConfig.worker_cls
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
-    kv_transfer_config: Optional[KVTransferConfig] = None
-    kv_events_config: Optional[KVEventsConfig] = None
+    kv_transfer_config: KVTransferConfig | None = None
+    kv_events_config: KVEventsConfig | None = None
+
+    ec_transfer_config: ECTransferConfig | None = None
 
     generation_config: str = ModelConfig.generation_config
     enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
-    override_generation_config: dict[str, Any] = \
-        get_field(ModelConfig, "override_generation_config")
+    override_generation_config: dict[str, Any] = get_field(
+        ModelConfig, "override_generation_config"
+    )
     model_impl: str = ModelConfig.model_impl
     override_attention_dtype: str = ModelConfig.override_attention_dtype
 
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
     mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
     mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype
+    mamba_block_size: int | None = get_field(CacheConfig, "mamba_block_size")
 
-    additional_config: dict[str, Any] = \
-        get_field(VllmConfig, "additional_config")
+    additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
 
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
     pt_load_map_location: str = LoadConfig.pt_load_map_location
@@ -476,34 +556,44 @@ class EngineArgs:
     # DEPRECATED
     enable_multimodal_encoder_data_parallel: bool = False
 
-    logits_processors: Optional[list[Union[
-        str, type[LogitsProcessor]]]] = ModelConfig.logits_processors
+    logits_processors: list[str | type[LogitsProcessor]] | None = (
+        ModelConfig.logits_processors
+    )
     """Custom logitproc types"""
 
-    async_scheduling: bool = SchedulerConfig.async_scheduling
+    async_scheduling: bool | None = SchedulerConfig.async_scheduling
+
+    stream_interval: int = SchedulerConfig.stream_interval
+
+    kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
 
-    kv_sharing_fast_prefill: bool = \
-        CacheConfig.kv_sharing_fast_prefill
+    kv_offloading_size: float | None = CacheConfig.kv_offloading_size
+    kv_offloading_backend: KVOffloadingBackend | None = (
+        CacheConfig.kv_offloading_backend
+    )
+    tokens_only: bool = False
 
     def __post_init__(self):
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
         # CompilationConfig object
         if isinstance(self.compilation_config, dict):
-            self.compilation_config = CompilationConfig(
-                **self.compilation_config)
+            self.compilation_config = CompilationConfig(**self.compilation_config)
         if isinstance(self.eplb_config, dict):
             self.eplb_config = EPLBConfig(**self.eplb_config)
         # Setup plugins
         from vllm.plugins import load_general_plugins
+
         load_general_plugins()
         # when use hf offline,replace model id to local model path
         if huggingface_hub.constants.HF_HUB_OFFLINE:
             model_id = self.model
             self.model = get_model_path(self.model, self.revision)
             logger.info(
-                "HF_HUB_OFFLINE is True, replace model_id [%s] " \
-                "to model_path [%s]",model_id, self.model)
+                "HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]",
+                model_id,
+                self.model,
+            )
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -515,86 +605,90 @@ class EngineArgs:
             title="ModelConfig",
             description=ModelConfig.__doc__,
         )
-        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
+        if not ("serve" in sys.argv[1:] and "--help" in sys.argv[1:]):
             model_group.add_argument("--model", **model_kwargs["model"])
         model_group.add_argument("--runner", **model_kwargs["runner"])
         model_group.add_argument("--convert", **model_kwargs["convert"])
-        model_group.add_argument("--task",
-                                 **model_kwargs["task"],
-                                 deprecated=True)
+        model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
         model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
-        model_group.add_argument("--tokenizer-mode",
-                                 **model_kwargs["tokenizer_mode"])
-        model_group.add_argument("--trust-remote-code",
-                                 **model_kwargs["trust_remote_code"])
+        model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
+        model_group.add_argument(
+            "--trust-remote-code", **model_kwargs["trust_remote_code"]
+        )
         model_group.add_argument("--dtype", **model_kwargs["dtype"])
         model_group.add_argument("--seed", **model_kwargs["seed"])
-        model_group.add_argument("--hf-config-path",
-                                 **model_kwargs["hf_config_path"])
-        model_group.add_argument("--allowed-local-media-path",
-                                 **model_kwargs["allowed_local_media_path"])
-        model_group.add_argument("--allowed-media-domains",
-                                 **model_kwargs["allowed_media_domains"])
+        model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
+        model_group.add_argument(
+            "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
+        )
+        model_group.add_argument(
+            "--allowed-media-domains", **model_kwargs["allowed_media_domains"]
+        )
         model_group.add_argument("--revision", **model_kwargs["revision"])
-        model_group.add_argument("--code-revision",
-                                 **model_kwargs["code_revision"])
-        model_group.add_argument("--rope-scaling",
-                                 **model_kwargs["rope_scaling"])
-        model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
-        model_group.add_argument("--tokenizer-revision",
-                                 **model_kwargs["tokenizer_revision"])
-        model_group.add_argument("--max-model-len",
-                                 **model_kwargs["max_model_len"])
-        model_group.add_argument("--quantization", "-q",
-                                 **model_kwargs["quantization"])
-        model_group.add_argument("--enforce-eager",
-                                 **model_kwargs["enforce_eager"])
-        model_group.add_argument("--max-logprobs",
-                                 **model_kwargs["max_logprobs"])
-        model_group.add_argument("--logprobs-mode",
-                                 **model_kwargs["logprobs_mode"])
-        model_group.add_argument("--disable-sliding-window",
-                                 **model_kwargs["disable_sliding_window"])
-        model_group.add_argument("--disable-cascade-attn",
-                                 **model_kwargs["disable_cascade_attn"])
-        model_group.add_argument("--skip-tokenizer-init",
-                                 **model_kwargs["skip_tokenizer_init"])
-        model_group.add_argument("--enable-prompt-embeds",
-                                 **model_kwargs["enable_prompt_embeds"])
-        model_group.add_argument("--served-model-name",
-                                 **model_kwargs["served_model_name"])
-        model_group.add_argument("--config-format",
-                                 **model_kwargs["config_format"])
+        model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
+        model_group.add_argument(
+            "--tokenizer-revision", **model_kwargs["tokenizer_revision"]
+        )
+        model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
+        model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
+        model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
+        model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
+        model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
+        model_group.add_argument(
+            "--disable-sliding-window", **model_kwargs["disable_sliding_window"]
+        )
+        model_group.add_argument(
+            "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
+        )
+        model_group.add_argument(
+            "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
+        )
+        model_group.add_argument(
+            "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
+        )
+        model_group.add_argument(
+            "--served-model-name", **model_kwargs["served_model_name"]
+        )
+        model_group.add_argument("--config-format", **model_kwargs["config_format"])
         # This one is a special case because it can bool
         # or str. TODO: Handle this in get_kwargs
-        model_group.add_argument("--hf-token",
-                                 type=str,
-                                 nargs="?",
-                                 const=True,
-                                 default=model_kwargs["hf_token"]["default"],
-                                 help=model_kwargs["hf_token"]["help"])
-        model_group.add_argument("--hf-overrides",
-                                 **model_kwargs["hf_overrides"])
-        model_group.add_argument("--pooler-config",
-                                 **model_kwargs["pooler_config"])
-        model_group.add_argument("--override-pooler-config",
-                                 **model_kwargs["override_pooler_config"],
-                                 deprecated=True)
-        model_group.add_argument("--logits-processor-pattern",
-                                 **model_kwargs["logits_processor_pattern"])
-        model_group.add_argument("--generation-config",
-                                 **model_kwargs["generation_config"])
-        model_group.add_argument("--override-generation-config",
-                                 **model_kwargs["override_generation_config"])
-        model_group.add_argument("--enable-sleep-mode",
-                                 **model_kwargs["enable_sleep_mode"])
+        model_group.add_argument(
+            "--hf-token",
+            type=str,
+            nargs="?",
+            const=True,
+            default=model_kwargs["hf_token"]["default"],
+            help=model_kwargs["hf_token"]["help"],
+        )
+        model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
+        model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
+        model_group.add_argument(
+            "--override-pooler-config",
+            **model_kwargs["override_pooler_config"],
+            deprecated=True,
+        )
+        model_group.add_argument(
+            "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
+        )
+        model_group.add_argument(
+            "--generation-config", **model_kwargs["generation_config"]
+        )
+        model_group.add_argument(
+            "--override-generation-config", **model_kwargs["override_generation_config"]
+        )
+        model_group.add_argument(
+            "--enable-sleep-mode", **model_kwargs["enable_sleep_mode"]
+        )
         model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
-        model_group.add_argument("--override-attention-dtype",
-                                 **model_kwargs["override_attention_dtype"])
-        model_group.add_argument("--logits-processors",
-                                 **model_kwargs["logits_processors"])
-        model_group.add_argument("--io-processor-plugin",
-                                 **model_kwargs["io_processor_plugin"])
+        model_group.add_argument(
+            "--override-attention-dtype", **model_kwargs["override_attention_dtype"]
+        )
+        model_group.add_argument(
+            "--logits-processors", **model_kwargs["logits_processors"]
+        )
+        model_group.add_argument(
+            "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
+        )
 
         # Model loading arguments
         load_kwargs = get_kwargs(LoadConfig)
@@ -603,18 +697,18 @@ class EngineArgs:
             description=LoadConfig.__doc__,
         )
         load_group.add_argument("--load-format", **load_kwargs["load_format"])
-        load_group.add_argument("--download-dir",
-                                **load_kwargs["download_dir"])
-        load_group.add_argument("--safetensors-load-strategy",
-                                **load_kwargs["safetensors_load_strategy"])
-        load_group.add_argument("--model-loader-extra-config",
-                                **load_kwargs["model_loader_extra_config"])
-        load_group.add_argument("--ignore-patterns",
-                                **load_kwargs["ignore_patterns"])
-        load_group.add_argument("--use-tqdm-on-load",
-                                **load_kwargs["use_tqdm_on_load"])
-        load_group.add_argument('--pt-load-map-location',
-                                **load_kwargs["pt_load_map_location"])
+        load_group.add_argument("--download-dir", **load_kwargs["download_dir"])
+        load_group.add_argument(
+            "--safetensors-load-strategy", **load_kwargs["safetensors_load_strategy"]
+        )
+        load_group.add_argument(
+            "--model-loader-extra-config", **load_kwargs["model_loader_extra_config"]
+        )
+        load_group.add_argument("--ignore-patterns", **load_kwargs["ignore_patterns"])
+        load_group.add_argument("--use-tqdm-on-load", **load_kwargs["use_tqdm_on_load"])
+        load_group.add_argument(
+            "--pt-load-map-location", **load_kwargs["pt_load_map_location"]
+        )
 
         # Structured outputs arguments
         structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
@@ -624,9 +718,13 @@ class EngineArgs:
         )
         structured_outputs_group.add_argument(
             "--reasoning-parser",
-            # This choice is a special case because it's not static
-            choices=list(ReasoningParserManager.reasoning_parsers),
-            **structured_outputs_kwargs["reasoning_parser"])
+            # Choices need to be validated after parsing to include plugins
+            **structured_outputs_kwargs["reasoning_parser"],
+        )
+        structured_outputs_group.add_argument(
+            "--reasoning-parser-plugin",
+            **structured_outputs_kwargs["reasoning_parser_plugin"],
+        )
         # Deprecated guided decoding arguments
         for arg, type in [
             ("--guided-decoding-backend", str),
@@ -638,7 +736,8 @@ class EngineArgs:
                 arg,
                 type=type,
                 help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
-                deprecated=True)
+                deprecated=True,
+            )
 
         # Parallel arguments
         parallel_kwargs = get_kwargs(ParallelConfig)
@@ -648,111 +747,150 @@ class EngineArgs:
         )
         parallel_group.add_argument(
             "--distributed-executor-backend",
-            **parallel_kwargs["distributed_executor_backend"])
+            **parallel_kwargs["distributed_executor_backend"],
+        )
         parallel_group.add_argument(
-            "--pipeline-parallel-size", "-pp",
-            **parallel_kwargs["pipeline_parallel_size"])
-        parallel_group.add_argument("--tensor-parallel-size", "-tp",
-                                    **parallel_kwargs["tensor_parallel_size"])
+            "--pipeline-parallel-size",
+            "-pp",
+            **parallel_kwargs["pipeline_parallel_size"],
+        )
+        parallel_group.add_argument("--master-addr", **parallel_kwargs["master_addr"])
+        parallel_group.add_argument("--master-port", **parallel_kwargs["master_port"])
+        parallel_group.add_argument("--nnodes", "-n", **parallel_kwargs["nnodes"])
+        parallel_group.add_argument("--node-rank", "-r", **parallel_kwargs["node_rank"])
         parallel_group.add_argument(
-            "--decode-context-parallel-size", "-dcp",
-            **parallel_kwargs["decode_context_parallel_size"])
-        parallel_group.add_argument("--data-parallel-size", "-dp",
-                                    **parallel_kwargs["data_parallel_size"])
+            "--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"]
+        )
+        parallel_group.add_argument(
+            "--decode-context-parallel-size",
+            "-dcp",
+            **parallel_kwargs["decode_context_parallel_size"],
+        )
         parallel_group.add_argument(
-            '--data-parallel-rank',
-            '-dpn',
+            "--dcp-kv-cache-interleave-size",
+            **parallel_kwargs["dcp_kv_cache_interleave_size"],
+        )
+        parallel_group.add_argument(
+            "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]
+        )
+        parallel_group.add_argument(
+            "--data-parallel-rank",
+            "-dpn",
+            type=int,
+            help="Data parallel rank of this instance. "
+            "When set, enables external load balancer mode.",
+        )
+        parallel_group.add_argument(
+            "--data-parallel-start-rank",
+            "-dpr",
+            type=int,
+            help="Starting data parallel rank for secondary nodes.",
+        )
+        parallel_group.add_argument(
+            "--data-parallel-size-local",
+            "-dpl",
             type=int,
-            help='Data parallel rank of this instance. '
-            'When set, enables external load balancer mode.')
-        parallel_group.add_argument('--data-parallel-start-rank',
-                                    '-dpr',
-                                    type=int,
-                                    help='Starting data parallel rank '
-                                    'for secondary nodes.')
-        parallel_group.add_argument('--data-parallel-size-local',
-                                    '-dpl',
-                                    type=int,
-                                    help='Number of data parallel replicas '
-                                    'to run on this node.')
-        parallel_group.add_argument('--data-parallel-address',
-                                    '-dpa',
-                                    type=str,
-                                    help='Address of data parallel cluster '
-                                    'head-node.')
-        parallel_group.add_argument('--data-parallel-rpc-port',
-                                    '-dpp',
-                                    type=int,
-                                    help='Port for data parallel RPC '
-                                    'communication.')
-        parallel_group.add_argument('--data-parallel-backend',
-                                    '-dpb',
-                                    type=str,
-                                    default='mp',
-                                    help='Backend for data parallel, either '
-                                    '"mp" or "ray".')
+            help="Number of data parallel replicas to run on this node.",
+        )
+        parallel_group.add_argument(
+            "--data-parallel-address",
+            "-dpa",
+            type=str,
+            help="Address of data parallel cluster head-node.",
+        )
+        parallel_group.add_argument(
+            "--data-parallel-rpc-port",
+            "-dpp",
+            type=int,
+            help="Port for data parallel RPC communication.",
+        )
+        parallel_group.add_argument(
+            "--data-parallel-backend",
+            "-dpb",
+            type=str,
+            default="mp",
+            help='Backend for data parallel, either "mp" or "ray".',
+        )
         parallel_group.add_argument(
             "--data-parallel-hybrid-lb",
-            **parallel_kwargs["data_parallel_hybrid_lb"])
+            "-dph",
+            **parallel_kwargs["data_parallel_hybrid_lb"],
+        )
         parallel_group.add_argument(
-            "--enable-expert-parallel",
-            **parallel_kwargs["enable_expert_parallel"])
-        parallel_group.add_argument("--enable-dbo",
-                                    **parallel_kwargs["enable_dbo"])
+            "--data-parallel-external-lb",
+            "-dpe",
+            **parallel_kwargs["data_parallel_external_lb"],
+        )
+        parallel_group.add_argument(
+            "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]
+        )
+        parallel_group.add_argument(
+            "--all2all-backend", **parallel_kwargs["all2all_backend"]
+        )
+        parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"])
         parallel_group.add_argument(
             "--dbo-decode-token-threshold",
-            **parallel_kwargs["dbo_decode_token_threshold"])
+            **parallel_kwargs["dbo_decode_token_threshold"],
+        )
         parallel_group.add_argument(
             "--dbo-prefill-token-threshold",
-            **parallel_kwargs["dbo_prefill_token_threshold"])
-        parallel_group.add_argument("--enable-eplb",
-                                    **parallel_kwargs["enable_eplb"])
-        parallel_group.add_argument("--eplb-config",
-                                    **parallel_kwargs["eplb_config"])
+            **parallel_kwargs["dbo_prefill_token_threshold"],
+        )
+        parallel_group.add_argument(
+            "--disable-nccl-for-dp-synchronization",
+            **parallel_kwargs["disable_nccl_for_dp_synchronization"],
+        )
+        parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"])
+        parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"])
         parallel_group.add_argument(
             "--expert-placement-strategy",
-            **parallel_kwargs["expert_placement_strategy"])
+            **parallel_kwargs["expert_placement_strategy"],
+        )
         parallel_group.add_argument(
             "--num-redundant-experts",
             type=int,
-            help=
-            "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
-            deprecated=True)
+            help="[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
+            deprecated=True,
+        )
         parallel_group.add_argument(
             "--eplb-window-size",
             type=int,
             help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
-            deprecated=True)
+            deprecated=True,
+        )
         parallel_group.add_argument(
             "--eplb-step-interval",
             type=int,
-            help=
-            "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
-            deprecated=True)
+            help="[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
+            deprecated=True,
+        )
         parallel_group.add_argument(
             "--eplb-log-balancedness",
             action=argparse.BooleanOptionalAction,
-            help=
-            "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
-            deprecated=True)
+            help="[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
+            deprecated=True,
+        )
 
         parallel_group.add_argument(
             "--max-parallel-loading-workers",
-            **parallel_kwargs["max_parallel_loading_workers"])
+            **parallel_kwargs["max_parallel_loading_workers"],
+        )
         parallel_group.add_argument(
-            "--ray-workers-use-nsight",
-            **parallel_kwargs["ray_workers_use_nsight"])
+            "--ray-workers-use-nsight", **parallel_kwargs["ray_workers_use_nsight"]
+        )
         parallel_group.add_argument(
             "--disable-custom-all-reduce",
-            **parallel_kwargs["disable_custom_all_reduce"])
-        parallel_group.add_argument("--worker-cls",
-                                    **parallel_kwargs["worker_cls"])
-        parallel_group.add_argument("--worker-extension-cls",
-                                    **parallel_kwargs["worker_extension_cls"])
+            **parallel_kwargs["disable_custom_all_reduce"],
+        )
+        parallel_group.add_argument("--worker-cls", **parallel_kwargs["worker_cls"])
+        parallel_group.add_argument(
+            "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"]
+        )
         parallel_group.add_argument(
             "--enable-multimodal-encoder-data-parallel",
             action="store_true",
-            deprecated=True)
+            deprecated=True,
+        )
 
         # KV cache arguments
         cache_kwargs = get_kwargs(CacheConfig)
@@ -761,29 +899,45 @@ class EngineArgs:
             description=CacheConfig.__doc__,
         )
         cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
-        cache_group.add_argument("--gpu-memory-utilization",
-                                 **cache_kwargs["gpu_memory_utilization"])
-        cache_group.add_argument("--kv-cache-memory-bytes",
-                                 **cache_kwargs["kv_cache_memory_bytes"])
+        cache_group.add_argument(
+            "--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
+        )
+        cache_group.add_argument(
+            "--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
+        )
         cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
-        cache_group.add_argument("--kv-cache-dtype",
-                                 **cache_kwargs["cache_dtype"])
-        cache_group.add_argument("--num-gpu-blocks-override",
-                                 **cache_kwargs["num_gpu_blocks_override"])
-        cache_group.add_argument("--enable-prefix-caching",
-                                 **cache_kwargs["enable_prefix_caching"])
-        cache_group.add_argument("--prefix-caching-hash-algo",
-                                 **cache_kwargs["prefix_caching_hash_algo"])
-        cache_group.add_argument("--cpu-offload-gb",
-                                 **cache_kwargs["cpu_offload_gb"])
-        cache_group.add_argument("--calculate-kv-scales",
-                                 **cache_kwargs["calculate_kv_scales"])
-        cache_group.add_argument("--kv-sharing-fast-prefill",
-                                 **cache_kwargs["kv_sharing_fast_prefill"])
-        cache_group.add_argument("--mamba-cache-dtype",
-                                 **cache_kwargs["mamba_cache_dtype"])
-        cache_group.add_argument("--mamba-ssm-cache-dtype",
-                                 **cache_kwargs["mamba_ssm_cache_dtype"])
+        cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
+        cache_group.add_argument(
+            "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
+        )
+        cache_group.add_argument(
+            "--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"]
+        )
+        cache_group.add_argument(
+            "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
+        )
+        cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
+        cache_group.add_argument(
+            "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
+        )
+        cache_group.add_argument(
+            "--kv-sharing-fast-prefill", **cache_kwargs["kv_sharing_fast_prefill"]
+        )
+        cache_group.add_argument(
+            "--mamba-cache-dtype", **cache_kwargs["mamba_cache_dtype"]
+        )
+        cache_group.add_argument(
+            "--mamba-ssm-cache-dtype", **cache_kwargs["mamba_ssm_cache_dtype"]
+        )
+        cache_group.add_argument(
+            "--mamba-block-size", **cache_kwargs["mamba_block_size"]
+        )
+        cache_group.add_argument(
+            "--kv-offloading-size", **cache_kwargs["kv_offloading_size"]
+        )
+        cache_group.add_argument(
+            "--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"]
+        )
 
         # Multimodal related configs
         multimodal_kwargs = get_kwargs(MultiModalConfig)
@@ -791,35 +945,48 @@ class EngineArgs:
             title="MultiModalConfig",
             description=MultiModalConfig.__doc__,
         )
-        multimodal_group.add_argument("--limit-mm-per-prompt",
-                                      **multimodal_kwargs["limit_per_prompt"])
-        multimodal_group.add_argument("--media-io-kwargs",
-                                      **multimodal_kwargs["media_io_kwargs"])
         multimodal_group.add_argument(
-            "--mm-processor-kwargs",
-            **multimodal_kwargs["mm_processor_kwargs"])
+            "--limit-mm-per-prompt", **multimodal_kwargs["limit_per_prompt"]
+        )
+        multimodal_group.add_argument(
+            "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
+        )
         multimodal_group.add_argument(
-            "--mm-processor-cache-gb",
-            **multimodal_kwargs["mm_processor_cache_gb"])
-        multimodal_group.add_argument("--disable-mm-preprocessor-cache",
-                                      action="store_true",
-                                      deprecated=True)
+            "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
+        )
         multimodal_group.add_argument(
-            "--mm-processor-cache-type",
-            **multimodal_kwargs["mm_processor_cache_type"])
+            "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
+        )
+        multimodal_group.add_argument(
+            "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]
+        )
+        multimodal_group.add_argument(
+            "--disable-mm-preprocessor-cache", action="store_true", deprecated=True
+        )
+        multimodal_group.add_argument(
+            "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"]
+        )
         multimodal_group.add_argument(
             "--mm-shm-cache-max-object-size-mb",
-            **multimodal_kwargs["mm_shm_cache_max_object_size_mb"])
+            **multimodal_kwargs["mm_shm_cache_max_object_size_mb"],
+        )
         multimodal_group.add_argument(
-            "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"])
+            "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]
+        )
         multimodal_group.add_argument(
-            "--interleave-mm-strings",
-            **multimodal_kwargs["interleave_mm_strings"])
-        multimodal_group.add_argument("--skip-mm-profiling",
-                                      **multimodal_kwargs["skip_mm_profiling"])
+            "--mm-encoder-attn-backend",
+            **multimodal_kwargs["mm_encoder_attn_backend"],
+        )
+        multimodal_group.add_argument(
+            "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]
+        )
+        multimodal_group.add_argument(
+            "--skip-mm-profiling", **multimodal_kwargs["skip_mm_profiling"]
+        )
 
         multimodal_group.add_argument(
-            "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"])
+            "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"]
+        )
 
         # LoRA related configs
         lora_kwargs = get_kwargs(LoRAConfig)
@@ -830,24 +997,22 @@ class EngineArgs:
         lora_group.add_argument(
             "--enable-lora",
             action=argparse.BooleanOptionalAction,
-            help="If True, enable handling of LoRA adapters.")
-        lora_group.add_argument("--enable-lora-bias",
-                                **lora_kwargs["bias_enabled"])
+            help="If True, enable handling of LoRA adapters.",
+        )
         lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
-        lora_group.add_argument("--max-lora-rank",
-                                **lora_kwargs["max_lora_rank"])
-        lora_group.add_argument("--lora-extra-vocab-size",
-                                **lora_kwargs["lora_extra_vocab_size"])
+        lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"])
+        lora_group.add_argument(
+            "--lora-extra-vocab-size", **lora_kwargs["lora_extra_vocab_size"]
+        )
         lora_group.add_argument(
             "--lora-dtype",
             **lora_kwargs["lora_dtype"],
         )
-        lora_group.add_argument("--max-cpu-loras",
-                                **lora_kwargs["max_cpu_loras"])
-        lora_group.add_argument("--fully-sharded-loras",
-                                **lora_kwargs["fully_sharded_loras"])
-        lora_group.add_argument("--default-mm-loras",
-                                **lora_kwargs["default_mm_loras"])
+        lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
+        lora_group.add_argument(
+            "--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
+        )
+        lora_group.add_argument("--default-mm-loras", **lora_kwargs["default_mm_loras"])
 
         # Observability arguments
         observability_kwargs = get_kwargs(ObservabilityConfig)
@@ -857,21 +1022,22 @@ class EngineArgs:
         )
         observability_group.add_argument(
             "--show-hidden-metrics-for-version",
-            **observability_kwargs["show_hidden_metrics_for_version"])
+            **observability_kwargs["show_hidden_metrics_for_version"],
+        )
         observability_group.add_argument(
-            "--otlp-traces-endpoint",
-            **observability_kwargs["otlp_traces_endpoint"])
+            "--otlp-traces-endpoint", **observability_kwargs["otlp_traces_endpoint"]
+        )
         # TODO: generalise this special case
         choices = observability_kwargs["collect_detailed_traces"]["choices"]
         metavar = f"{{{','.join(choices)}}}"
         observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
         observability_kwargs["collect_detailed_traces"]["choices"] += [
-            ",".join(p)
-            for p in permutations(get_args(DetailedTraceModules), r=2)
+            ",".join(p) for p in permutations(get_args(DetailedTraceModules), r=2)
         ]
         observability_group.add_argument(
             "--collect-detailed-traces",
-            **observability_kwargs["collect_detailed_traces"])
+            **observability_kwargs["collect_detailed_traces"],
+        )
 
         # Scheduler arguments
         scheduler_kwargs = get_kwargs(SchedulerConfig)
@@ -881,39 +1047,83 @@ class EngineArgs:
         )
         scheduler_group.add_argument(
             "--max-num-batched-tokens",
-            **scheduler_kwargs["max_num_batched_tokens"])
-        scheduler_group.add_argument("--max-num-seqs",
-                                     **scheduler_kwargs["max_num_seqs"])
+            **{
+                **scheduler_kwargs["max_num_batched_tokens"],
+                "default": None,
+            },
+        )
         scheduler_group.add_argument(
-            "--max-num-partial-prefills",
-            **scheduler_kwargs["max_num_partial_prefills"])
+            "--max-num-seqs",
+            **{
+                **scheduler_kwargs["max_num_seqs"],
+                "default": None,
+            },
+        )
+        scheduler_group.add_argument(
+            "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"]
+        )
         scheduler_group.add_argument(
             "--max-long-partial-prefills",
-            **scheduler_kwargs["max_long_partial_prefills"])
-        scheduler_group.add_argument('--cuda-graph-sizes',
-                                     **scheduler_kwargs["cuda_graph_sizes"])
+            **scheduler_kwargs["max_long_partial_prefills"],
+        )
         scheduler_group.add_argument(
             "--long-prefill-token-threshold",
-            **scheduler_kwargs["long_prefill_token_threshold"])
-        scheduler_group.add_argument("--num-lookahead-slots",
-                                     **scheduler_kwargs["num_lookahead_slots"])
+            **scheduler_kwargs["long_prefill_token_threshold"],
+        )
+        scheduler_group.add_argument(
+            "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
+        )
         # multi-step scheduling has been removed; corresponding arguments
         # are no longer supported.
-        scheduler_group.add_argument("--scheduling-policy",
-                                     **scheduler_kwargs["policy"])
+        scheduler_group.add_argument(
+            "--scheduling-policy", **scheduler_kwargs["policy"]
+        )
         scheduler_group.add_argument(
             "--enable-chunked-prefill",
-            **scheduler_kwargs["enable_chunked_prefill"])
+            **{
+                **scheduler_kwargs["enable_chunked_prefill"],
+                "default": None,
+            },
+        )
         scheduler_group.add_argument(
-            "--disable-chunked-mm-input",
-            **scheduler_kwargs["disable_chunked_mm_input"])
-        scheduler_group.add_argument("--scheduler-cls",
-                                     **scheduler_kwargs["scheduler_cls"])
+            "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"]
+        )
+        scheduler_group.add_argument(
+            "--scheduler-cls", **scheduler_kwargs["scheduler_cls"]
+        )
         scheduler_group.add_argument(
             "--disable-hybrid-kv-cache-manager",
-            **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
-        scheduler_group.add_argument("--async-scheduling",
-                                     **scheduler_kwargs["async_scheduling"])
+            **scheduler_kwargs["disable_hybrid_kv_cache_manager"],
+        )
+        scheduler_group.add_argument(
+            "--async-scheduling", **scheduler_kwargs["async_scheduling"]
+        )
+        scheduler_group.add_argument(
+            "--stream-interval", **scheduler_kwargs["stream_interval"]
+        )
+
+        # Compilation arguments
+        compilation_kwargs = get_kwargs(CompilationConfig)
+        compilation_group = parser.add_argument_group(
+            title="CompilationConfig",
+            description=CompilationConfig.__doc__,
+        )
+        compilation_group.add_argument(
+            "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
+        )
+        compilation_kwargs["cudagraph_capture_sizes"]["help"] = (
+            "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0,"
+            " whichever is soonest. Please use --cudagraph-capture-sizes instead."
+        )
+        compilation_group.add_argument(
+            "--cuda-graph-sizes",
+            **compilation_kwargs["cudagraph_capture_sizes"],
+            deprecated=True,
+        )
+        compilation_group.add_argument(
+            "--max-cudagraph-capture-size",
+            **compilation_kwargs["max_cudagraph_capture_size"],
+        )
 
         # vLLM arguments
         vllm_kwargs = get_kwargs(VllmConfig)
@@ -925,24 +1135,39 @@ class EngineArgs:
         # create_engine_config. So we set the type to a JSON string here to
         # delay the Pydantic validation that comes with SpeculativeConfig.
         vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
-        vllm_group.add_argument("--speculative-config",
-                                **vllm_kwargs["speculative_config"])
-        vllm_group.add_argument("--kv-transfer-config",
-                                **vllm_kwargs["kv_transfer_config"])
-        vllm_group.add_argument('--kv-events-config',
-                                **vllm_kwargs["kv_events_config"])
-        vllm_group.add_argument("--compilation-config", "-O",
-                                **vllm_kwargs["compilation_config"])
-        vllm_group.add_argument("--additional-config",
-                                **vllm_kwargs["additional_config"])
-        vllm_group.add_argument('--structured-outputs-config',
-                                **vllm_kwargs["structured_outputs_config"])
+        vllm_group.add_argument(
+            "--speculative-config", **vllm_kwargs["speculative_config"]
+        )
+        vllm_group.add_argument(
+            "--kv-transfer-config", **vllm_kwargs["kv_transfer_config"]
+        )
+        vllm_group.add_argument("--kv-events-config", **vllm_kwargs["kv_events_config"])
+        vllm_group.add_argument(
+            "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"]
+        )
+        vllm_group.add_argument(
+            "--compilation-config", "-O", **vllm_kwargs["compilation_config"]
+        )
+        vllm_group.add_argument(
+            "--additional-config", **vllm_kwargs["additional_config"]
+        )
+        vllm_group.add_argument(
+            "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
+        )
 
         # Other arguments
-        parser.add_argument('--disable-log-stats',
-                            action='store_true',
-                            help='Disable logging statistics.')
+        parser.add_argument(
+            "--disable-log-stats",
+            action="store_true",
+            help="Disable logging statistics.",
+        )
 
+        parser.add_argument(
+            "--aggregate-engine-logging",
+            action="store_true",
+            help="Log aggregate rather than per-engine statistics "
+            "when using data parallelism.",
+        )
         return parser
 
     @classmethod
@@ -950,10 +1175,9 @@ class EngineArgs:
         # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         # Set the attributes from the parsed arguments.
-        engine_args = cls(**{
-            attr: getattr(args, attr)
-            for attr in attrs if hasattr(args, attr)
-        })
+        engine_args = cls(
+            **{attr: getattr(args, attr) for attr in attrs if hasattr(args, attr)}
+        )
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
@@ -961,16 +1185,12 @@ class EngineArgs:
         if check_gguf_file(self.model):
             self.quantization = self.load_format = "gguf"
 
-        # NOTE: This is to allow model loading from S3 in CI
-        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
-                and self.model in MODELS_ON_S3 and self.load_format == "auto"):
-            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
-
         if self.disable_mm_preprocessor_cache:
             logger.warning(
                 "`--disable-mm-preprocessor-cache` is deprecated "
                 "and will be removed in v0.13. "
-                "Please use `--mm-processor-cache-gb 0` instead.", )
+                "Please use `--mm-processor-cache-gb 0` instead.",
+            )
 
             self.mm_processor_cache_gb = 0
         elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
@@ -987,7 +1207,8 @@ class EngineArgs:
             logger.warning(
                 "--enable-multimodal-encoder-data-parallel` is deprecated "
                 "and will be removed in v0.13. "
-                "Please use `--mm-encoder-tp-mode data` instead.")
+                "Please use `--mm-encoder-tp-mode data` instead."
+            )
 
             self.mm_encoder_tp_mode = "data"
 
@@ -1006,8 +1227,6 @@ class EngineArgs:
             seed=self.seed,
             revision=self.revision,
             code_revision=self.code_revision,
-            rope_scaling=self.rope_scaling,
-            rope_theta=self.rope_theta,
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
@@ -1022,6 +1241,7 @@ class EngineArgs:
             enable_prompt_embeds=self.enable_prompt_embeds,
             served_model_name=self.served_model_name,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
+            enable_mm_embeds=self.enable_mm_embeds,
             interleave_mm_strings=self.interleave_mm_strings,
             media_io_kwargs=self.media_io_kwargs,
             skip_mm_profiling=self.skip_mm_profiling,
@@ -1029,9 +1249,9 @@ class EngineArgs:
             mm_processor_kwargs=self.mm_processor_kwargs,
             mm_processor_cache_gb=self.mm_processor_cache_gb,
             mm_processor_cache_type=self.mm_processor_cache_type,
-            mm_shm_cache_max_object_size_mb=self.
-            mm_shm_cache_max_object_size_mb,
+            mm_shm_cache_max_object_size_mb=self.mm_shm_cache_max_object_size_mb,
             mm_encoder_tp_mode=self.mm_encoder_tp_mode,
+            mm_encoder_attn_backend=self.mm_encoder_attn_backend,
             pooler_config=self.pooler_config,
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
@@ -1046,33 +1266,34 @@ class EngineArgs:
         )
 
     def validate_tensorizer_args(self):
-        from vllm.model_executor.model_loader.tensorizer import (
-            TensorizerConfig)
+        from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+
         for key in self.model_loader_extra_config:
             if key in TensorizerConfig._fields:
-                self.model_loader_extra_config["tensorizer_config"][
-                    key] = self.model_loader_extra_config[key]
+                self.model_loader_extra_config["tensorizer_config"][key] = (
+                    self.model_loader_extra_config[key]
+                )
 
     def create_load_config(self) -> LoadConfig:
-
         if self.quantization == "bitsandbytes":
             self.load_format = "bitsandbytes"
 
         if self.load_format == "tensorizer":
             if hasattr(self.model_loader_extra_config, "to_serializable"):
                 self.model_loader_extra_config = (
-                    self.model_loader_extra_config.to_serializable())
+                    self.model_loader_extra_config.to_serializable()
+                )
             self.model_loader_extra_config["tensorizer_config"] = {}
-            self.model_loader_extra_config["tensorizer_config"][
-                "tensorizer_dir"] = self.model
+            self.model_loader_extra_config["tensorizer_config"]["tensorizer_dir"] = (
+                self.model
+            )
             self.validate_tensorizer_args()
 
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
             safetensors_load_strategy=self.safetensors_load_strategy,
-            device="cpu"
-            if is_online_quantization(self.quantization) else None,
+            device="cpu" if is_online_quantization(self.quantization) else None,
             model_loader_extra_config=self.model_loader_extra_config,
             ignore_patterns=self.ignore_patterns,
             use_tqdm_on_load=self.use_tqdm_on_load,
@@ -1083,9 +1304,7 @@ class EngineArgs:
         self,
         target_model_config: ModelConfig,
         target_parallel_config: ParallelConfig,
-        enable_chunked_prefill: bool,
-        disable_log_stats: bool,
-    ) -> Optional["SpeculativeConfig"]:
+    ) -> SpeculativeConfig | None:
         """Initializes and returns a SpeculativeConfig object based on
         `speculative_config`.
 
@@ -1100,76 +1319,76 @@ class EngineArgs:
         # Note(Shangming): These parameters are not obtained from the cli arg
         # '--speculative-config' and must be passed in when creating the engine
         # config.
-        self.speculative_config.update({
-            "target_model_config": target_model_config,
-            "target_parallel_config": target_parallel_config,
-            "enable_chunked_prefill": enable_chunked_prefill,
-            "disable_log_stats": disable_log_stats,
-        })
+        self.speculative_config.update(
+            {
+                "target_model_config": target_model_config,
+                "target_parallel_config": target_parallel_config,
+            }
+        )
         return SpeculativeConfig(**self.speculative_config)
 
     def create_engine_config(
         self,
-        usage_context: Optional[UsageContext] = None,
+        usage_context: UsageContext | None = None,
         headless: bool = False,
     ) -> VllmConfig:
         """
         Create the VllmConfig.
 
-        NOTE: for autoselection of V0 vs V1 engine, we need to
-        create the ModelConfig first, since ModelConfig's attrs
-        (e.g. the model arch) are needed to make the decision.
-
-        This function set VLLM_USE_V1=X if VLLM_USE_V1 is
-        unspecified by the user.
-
-        If VLLM_USE_V1 is specified by the user but the VllmConfig
-        is incompatible, we raise an error.
+        NOTE: If VllmConfig is incompatible, we raise an error.
         """
         current_platform.pre_register_and_update()
 
-        device_config = DeviceConfig(
-            device=cast(Device, current_platform.device_type))
-
-        (self.model, self.tokenizer,
-         self.speculative_config) = maybe_override_with_speculators(
-             model=self.model,
-             tokenizer=self.tokenizer,
-             revision=self.revision,
-             trust_remote_code=self.trust_remote_code,
-             vllm_speculative_config=self.speculative_config,
-         )
+        device_config = DeviceConfig(device=cast(Device, current_platform.device_type))
+
+        # Check if the model is a speculator and override model/tokenizer/config
+        # BEFORE creating ModelConfig, so the config is created with the target model
+        # Skip speculator detection for cloud storage models (eg: S3, GCS) since
+        # HuggingFace cannot load configs directly from S3 URLs. S3 models can still
+        # use speculators with explicit --speculative-config.
+        if not is_cloud_storage(self.model):
+            (self.model, self.tokenizer, self.speculative_config) = (
+                maybe_override_with_speculators(
+                    model=self.model,
+                    tokenizer=self.tokenizer,
+                    revision=self.revision,
+                    trust_remote_code=self.trust_remote_code,
+                    vllm_speculative_config=self.speculative_config,
+                )
+            )
+
         model_config = self.create_model_config()
+        self.model = model_config.model
+        self.tokenizer = model_config.tokenizer
 
-        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
-        #   and fall back to V0 for experimental or unsupported features.
-        # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
-        #   features and raise error for unsupported features.
-        # * If VLLM_USE_V1=0, we disable V1.
-        use_v1 = False
-        try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
-        if try_v1 and self._is_v1_supported_oracle(model_config):
-            use_v1 = True
-
-        # If user explicitly set VLLM_USE_V1, sanity check we respect it.
-        if envs.is_set("VLLM_USE_V1"):
-            assert use_v1 == envs.VLLM_USE_V1
-        # Otherwise, set the VLLM_USE_V1 variable globally.
-        else:
-            envs.set_vllm_use_v1(use_v1)
+        self._check_feature_supported(model_config)
 
         # Set default arguments for V1 Engine.
         self._set_default_args(usage_context, model_config)
-        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
-        if current_platform.is_cpu() and current_platform.get_cpu_architecture(
-        ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
-            logger.info("Chunked prefill is not supported for ARM and POWER "
-                        "and S390X CPUs; "
-                        "disabling it for V1 backend.")
+        # Disable chunked prefill and prefix caching for:
+        # POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
+        if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
+            CpuArchEnum.POWERPC,
+            CpuArchEnum.S390X,
+            CpuArchEnum.ARM,
+            CpuArchEnum.RISCV,
+        ):
+            logger.info(
+                "Chunked prefill is not supported for ARM and POWER, "
+                "S390X and RISC-V CPUs; "
+                "disabling it for V1 backend."
+            )
             self.enable_chunked_prefill = False
+            logger.info(
+                "Prefix caching is not supported for ARM and POWER, "
+                "S390X and RISC-V CPUs; "
+                "disabling it for V1 backend."
+            )
+            self.enable_prefix_caching = False
+
         assert self.enable_chunked_prefill is not None
 
-        sliding_window: Optional[int] = None
+        sliding_window: int | None = None
         if not is_interleaved(model_config.hf_text_config):
             # Only set CacheConfig.sliding_window if the model is all sliding
             # window. Otherwise CacheConfig.sliding_window will override the
@@ -1181,8 +1400,7 @@ class EngineArgs:
         # because the world size does not change by dcp, it simply
         # reuses the GPUs of TP group, and split one TP group into
         # tp_size//dcp_size DCP groups.
-        assert self.tensor_parallel_size % self.decode_context_parallel_size \
-            == 0, (
+        assert self.tensor_parallel_size % self.decode_context_parallel_size == 0, (
             f"tp_size={self.tensor_parallel_size} must be divisible by"
             f"dcp_size={self.decode_context_parallel_size}."
         )
@@ -1203,6 +1421,9 @@ class EngineArgs:
             kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
             mamba_cache_dtype=self.mamba_cache_dtype,
             mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
+            mamba_block_size=self.mamba_block_size,
+            kv_offloading_size=self.kv_offloading_size,
+            kv_offloading_backend=self.kv_offloading_backend,
         )
 
         ray_runtime_env = None
@@ -1211,8 +1432,15 @@ class EngineArgs:
             # of a Ray task, therefore we check is_ray_initialized()
             # as opposed to is_in_ray_actor().
             import ray
+
             ray_runtime_env = ray.get_runtime_context().runtime_env
-            logger.info("Using ray runtime env: %s", ray_runtime_env)
+            # Avoid logging sensitive environment variables
+            sanitized_env = ray_runtime_env.to_dict() if ray_runtime_env else {}
+            if "env_vars" in sanitized_env:
+                sanitized_env["env_vars"] = {
+                    k: "***" for k in sanitized_env["env_vars"]
+                }
+            logger.info("Using ray runtime env (env vars redacted): %s", sanitized_env)
 
         # Get the current placement group if Ray is initialized and
         # we are in a Ray actor. If so, then the placement group will be
@@ -1226,15 +1454,59 @@ class EngineArgs:
             placement_group = ray.util.get_current_placement_group()
 
         assert not headless or not self.data_parallel_hybrid_lb, (
-            "data_parallel_hybrid_lb is not applicable in "
-            "headless mode")
-
-        data_parallel_external_lb = self.data_parallel_rank is not None
+            "data_parallel_hybrid_lb is not applicable in headless mode"
+        )
+        assert not (self.data_parallel_hybrid_lb and self.data_parallel_external_lb), (
+            "data_parallel_hybrid_lb and data_parallel_external_lb cannot both be True."
+        )
+        assert self.data_parallel_backend == "mp" or self.nnodes == 1, (
+            "nnodes > 1 is only supported with data_parallel_backend=mp"
+        )
+        inferred_data_parallel_rank = 0
+        if self.nnodes > 1:
+            world_size = (
+                self.data_parallel_size
+                * self.pipeline_parallel_size
+                * self.tensor_parallel_size
+            )
+            world_size_within_dp = (
+                self.pipeline_parallel_size * self.tensor_parallel_size
+            )
+            local_world_size = world_size // self.nnodes
+            assert world_size % self.nnodes == 0, (
+                f"world_size={world_size} must be divisible by nnodes={self.nnodes}."
+            )
+            assert self.node_rank < self.nnodes, (
+                f"node_rank={self.node_rank} must be less than nnodes={self.nnodes}."
+            )
+            inferred_data_parallel_rank = (
+                self.node_rank * local_world_size
+            ) // world_size_within_dp
+            if self.data_parallel_size > 1 and self.data_parallel_external_lb:
+                self.data_parallel_rank = inferred_data_parallel_rank
+                logger.info(
+                    "Inferred data_parallel_rank %d from node_rank %d for external lb",
+                    self.data_parallel_rank,
+                    self.node_rank,
+                )
+            elif self.data_parallel_size_local is None:
+                # Infer data parallel size local for internal dplb:
+                self.data_parallel_size_local = max(
+                    local_world_size // world_size_within_dp, 1
+                )
+        data_parallel_external_lb = (
+            self.data_parallel_external_lb or self.data_parallel_rank is not None
+        )
         # Local DP rank = 1, use pure-external LB.
         if data_parallel_external_lb:
+            assert self.data_parallel_rank is not None, (
+                "data_parallel_rank or node_rank must be spefified if "
+                "data_parallel_external_lb is enable."
+            )
             assert self.data_parallel_size_local in (1, None), (
-                "data_parallel_size_local must be 1 when data_parallel_rank "
-                "is set")
+                "data_parallel_size_local must be 1 or None when data_parallel_rank "
+                "is set"
+            )
             data_parallel_size_local = 1
             # Use full external lb if we have local_size of 1.
             self.data_parallel_hybrid_lb = False
@@ -1247,6 +1519,11 @@ class EngineArgs:
 
             if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
                 # Use full external lb if we have local_size of 1.
+                logger.warning(
+                    "data_parallel_hybrid_lb is not eligible when "
+                    "data_parallel_size_local = 1, autoswitch to "
+                    "data_parallel_external_lb."
+                )
                 data_parallel_external_lb = True
                 self.data_parallel_hybrid_lb = False
 
@@ -1254,14 +1531,29 @@ class EngineArgs:
                 # Disable hybrid LB mode if set for a single node
                 self.data_parallel_hybrid_lb = False
 
-            self.data_parallel_rank = self.data_parallel_start_rank or 0
+            self.data_parallel_rank = (
+                self.data_parallel_start_rank or inferred_data_parallel_rank
+            )
+            if self.nnodes > 1:
+                logger.info(
+                    "Inferred data_parallel_rank %d from node_rank %d",
+                    self.data_parallel_rank,
+                    self.node_rank,
+                )
         else:
             assert not self.data_parallel_hybrid_lb, (
-                "data_parallel_size_local must be set to use "
-                "data_parallel_hybrid_lb.")
+                "data_parallel_size_local must be set to use data_parallel_hybrid_lb."
+            )
 
-            # Local DP size defaults to global DP size if not set.
-            data_parallel_size_local = self.data_parallel_size
+            if self.data_parallel_backend == "ray" and (
+                envs.VLLM_RAY_DP_PACK_STRATEGY == "span"
+            ):
+                # Data parallel size defaults to 1 if DP ranks are spanning
+                # multiple nodes
+                data_parallel_size_local = 1
+            else:
+                # Otherwise local DP size defaults to global DP size if not set
+                data_parallel_size_local = self.data_parallel_size
 
         # DP address, used in multi-node case for torch distributed group
         # and ZMQ sockets.
@@ -1269,39 +1561,31 @@ class EngineArgs:
             if self.data_parallel_backend == "ray":
                 host_ip = get_ip()
                 logger.info(
-                    "Using host IP %s as ray-based data parallel address",
-                    host_ip)
+                    "Using host IP %s as ray-based data parallel address", host_ip
+                )
                 data_parallel_address = host_ip
             else:
                 assert self.data_parallel_backend == "mp", (
                     "data_parallel_backend can only be ray or mp, got %s",
-                    self.data_parallel_backend)
-                data_parallel_address = ParallelConfig.data_parallel_master_ip
+                    self.data_parallel_backend,
+                )
+                data_parallel_address = (
+                    self.master_addr or ParallelConfig.data_parallel_master_ip
+                )
         else:
             data_parallel_address = self.data_parallel_address
 
         # This port is only used when there are remote data parallel engines,
         # otherwise the local IPC transport is used.
-        data_parallel_rpc_port = self.data_parallel_rpc_port if (
+        data_parallel_rpc_port = (
             self.data_parallel_rpc_port
-            is not None) else ParallelConfig.data_parallel_rpc_port
-
-        if self.async_scheduling:
-            # Async scheduling does not work with the uniprocess backend.
-            if self.distributed_executor_backend is None:
-                self.distributed_executor_backend = "mp"
-                logger.info("Defaulting to mp-based distributed executor "
-                            "backend for async scheduling.")
-            if self.pipeline_parallel_size > 1:
-                raise ValueError("Async scheduling is not supported with "
-                                 "pipeline-parallel-size > 1.")
-
-            # Currently, async scheduling does not support speculative decoding.
-            # TODO(woosuk): Support it.
-            if self.speculative_config is not None:
-                raise ValueError(
-                    "Currently, speculative decoding is not supported with "
-                    "async scheduling.")
+            if (self.data_parallel_rpc_port is not None)
+            else ParallelConfig.data_parallel_rpc_port
+        )
+
+        if self.tokens_only and not model_config.skip_tokenizer_init:
+            model_config.skip_tokenizer_init = True
+            logger.info("Skipping tokenizer initialization for tokens-only mode.")
 
         # Forward the deprecated CLI args to the EPLB config.
         if self.num_redundant_experts is not None:
@@ -1320,14 +1604,20 @@ class EngineArgs:
             data_parallel_rank=self.data_parallel_rank or 0,
             data_parallel_external_lb=data_parallel_external_lb,
             data_parallel_size_local=data_parallel_size_local,
+            master_addr=self.master_addr,
+            master_port=self.master_port,
+            nnodes=self.nnodes,
+            node_rank=self.node_rank,
             data_parallel_master_ip=data_parallel_address,
             data_parallel_rpc_port=data_parallel_rpc_port,
             data_parallel_backend=self.data_parallel_backend,
             data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
             enable_expert_parallel=self.enable_expert_parallel,
+            all2all_backend=self.all2all_backend,
             enable_dbo=self.enable_dbo,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
             dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
+            disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
             expert_placement_strategy=self.expert_placement_strategy,
@@ -1340,6 +1630,7 @@ class EngineArgs:
             worker_cls=self.worker_cls,
             worker_extension_cls=self.worker_extension_cls,
             decode_context_parallel_size=self.decode_context_parallel_size,
+            dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size,
             _api_process_count=self._api_process_count,
             _api_process_rank=self._api_process_rank,
         )
@@ -1347,8 +1638,6 @@ class EngineArgs:
         speculative_config = self.create_speculative_config(
             target_model_config=model_config,
             target_parallel_config=parallel_config,
-            enable_chunked_prefill=self.enable_chunked_prefill,
-            disable_log_stats=self.disable_log_stats,
         )
 
         # make sure num_lookahead_slots is set appropriately depending on
@@ -1362,38 +1651,56 @@ class EngineArgs:
             max_num_batched_tokens=self.max_num_batched_tokens,
             max_num_seqs=self.max_num_seqs,
             max_model_len=model_config.max_model_len,
-            cuda_graph_sizes=self.cuda_graph_sizes,
             num_lookahead_slots=num_lookahead_slots,
             enable_chunked_prefill=self.enable_chunked_prefill,
             disable_chunked_mm_input=self.disable_chunked_mm_input,
             is_multimodal_model=model_config.is_multimodal_model,
-            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
-                             and parallel_config.use_ray),
+            is_encoder_decoder=model_config.is_encoder_decoder,
             policy=self.scheduling_policy,
             scheduler_cls=self.scheduler_cls,
             max_num_partial_prefills=self.max_num_partial_prefills,
             max_long_partial_prefills=self.max_long_partial_prefills,
             long_prefill_token_threshold=self.long_prefill_token_threshold,
-            disable_hybrid_kv_cache_manager=self.
-            disable_hybrid_kv_cache_manager,
+            disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
             async_scheduling=self.async_scheduling,
+            stream_interval=self.stream_interval,
         )
 
         if not model_config.is_multimodal_model and self.default_mm_loras:
             raise ValueError(
                 "Default modality-specific LoRA(s) were provided for a "
-                "non multimodal model")
-
-        lora_config = LoRAConfig(
-            bias_enabled=self.enable_lora_bias,
-            max_lora_rank=self.max_lora_rank,
-            max_loras=self.max_loras,
-            default_mm_loras=self.default_mm_loras,
-            fully_sharded_loras=self.fully_sharded_loras,
-            lora_extra_vocab_size=self.lora_extra_vocab_size,
-            lora_dtype=self.lora_dtype,
-            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
-            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
+                "non multimodal model"
+            )
+
+        lora_config = (
+            LoRAConfig(
+                max_lora_rank=self.max_lora_rank,
+                max_loras=self.max_loras,
+                default_mm_loras=self.default_mm_loras,
+                fully_sharded_loras=self.fully_sharded_loras,
+                lora_extra_vocab_size=self.lora_extra_vocab_size,
+                lora_dtype=self.lora_dtype,
+                max_cpu_loras=self.max_cpu_loras
+                if self.max_cpu_loras and self.max_cpu_loras > 0
+                else None,
+            )
+            if self.enable_lora
+            else None
+        )
+
+        if (
+            lora_config is not None
+            and speculative_config is not None
+            and scheduler_config.max_num_batched_tokens
+            < (
+                scheduler_config.max_num_seqs
+                * (speculative_config.num_speculative_tokens + 1)
+            )
+        ):
+            raise ValueError(
+                "Consider increasing max_num_batched_tokens or "
+                "decreasing num_speculative_tokens"
+            )
 
         # bitsandbytes pre-quantized model need a specific model loader
         if model_config.quantization == "bitsandbytes":
@@ -1403,31 +1710,65 @@ class EngineArgs:
 
         # Pass reasoning_parser into StructuredOutputsConfig
         if self.reasoning_parser:
-            self.structured_outputs_config.reasoning_parser = \
-                self.reasoning_parser
+            self.structured_outputs_config.reasoning_parser = self.reasoning_parser
+
+        if self.reasoning_parser_plugin:
+            self.structured_outputs_config.reasoning_parser_plugin = (
+                self.reasoning_parser_plugin
+            )
 
         # Forward the deprecated CLI args to the StructuredOutputsConfig
         so_config = self.structured_outputs_config
         if self.guided_decoding_backend is not None:
-            so_config.guided_decoding_backend = \
-            self.guided_decoding_backend
+            so_config.guided_decoding_backend = self.guided_decoding_backend
         if self.guided_decoding_disable_fallback is not None:
-            so_config.guided_decoding_disable_fallback = \
-            self.guided_decoding_disable_fallback
+            so_config.disable_fallback = self.guided_decoding_disable_fallback
         if self.guided_decoding_disable_any_whitespace is not None:
-            so_config.guided_decoding_disable_any_whitespace = \
-            self.guided_decoding_disable_any_whitespace
+            so_config.disable_any_whitespace = (
+                self.guided_decoding_disable_any_whitespace
+            )
         if self.guided_decoding_disable_additional_properties is not None:
-            so_config.guided_decoding_disable_additional_properties = \
-            self.guided_decoding_disable_additional_properties
+            so_config.disable_additional_properties = (
+                self.guided_decoding_disable_additional_properties
+            )
 
         observability_config = ObservabilityConfig(
-            show_hidden_metrics_for_version=(
-                self.show_hidden_metrics_for_version),
+            show_hidden_metrics_for_version=self.show_hidden_metrics_for_version,
             otlp_traces_endpoint=self.otlp_traces_endpoint,
             collect_detailed_traces=self.collect_detailed_traces,
         )
 
+        # Compilation config overrides
+        compilation_config = copy.deepcopy(self.compilation_config)
+        if self.cuda_graph_sizes is not None:
+            logger.warning(
+                "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or "
+                "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes "
+                "instead."
+            )
+            if compilation_config.cudagraph_capture_sizes is not None:
+                raise ValueError(
+                    "cuda_graph_sizes and compilation_config."
+                    "cudagraph_capture_sizes are mutually exclusive"
+                )
+            compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes
+        if self.cudagraph_capture_sizes is not None:
+            if compilation_config.cudagraph_capture_sizes is not None:
+                raise ValueError(
+                    "cudagraph_capture_sizes and compilation_config."
+                    "cudagraph_capture_sizes are mutually exclusive"
+                )
+            compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes
+        if self.max_cudagraph_capture_size is not None:
+            if compilation_config.max_cudagraph_capture_size is not None:
+                raise ValueError(
+                    "max_cudagraph_capture_size and compilation_config."
+                    "max_cudagraph_capture_size are mutually exclusive"
+                )
+            compilation_config.max_cudagraph_capture_size = (
+                self.max_cudagraph_capture_size
+            )
+
         config = VllmConfig(
             model_config=model_config,
             cache_config=cache_config,
@@ -1439,42 +1780,29 @@ class EngineArgs:
             load_config=load_config,
             structured_outputs_config=self.structured_outputs_config,
             observability_config=observability_config,
-            compilation_config=self.compilation_config,
+            compilation_config=compilation_config,
             kv_transfer_config=self.kv_transfer_config,
             kv_events_config=self.kv_events_config,
+            ec_transfer_config=self.ec_transfer_config,
             additional_config=self.additional_config,
         )
 
         return config
 
-    def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
-        """Oracle for whether to use V0 or V1 Engine by default."""
-
-        #############################################################
-        # Unsupported Feature Flags on V1.
-
-        if (self.logits_processor_pattern
-                != EngineArgs.logits_processor_pattern):
-            _raise_or_fallback(feature_name="--logits-processor-pattern",
-                               recommend_to_remove=False)
-            return False
-
-        # No Mamba or Encoder-Decoder so far.
-        if not model_config.is_v1_compatible:
-            _raise_or_fallback(feature_name=model_config.architectures,
-                               recommend_to_remove=False)
-            return False
+    def _check_feature_supported(self, model_config: ModelConfig):
+        """Raise an error if the feature is not supported."""
+        if self.logits_processor_pattern != EngineArgs.logits_processor_pattern:
+            _raise_unsupported_error(feature_name="--logits-processor-pattern")
 
         # No Concurrent Partial Prefills so far.
-        if (self.max_num_partial_prefills
-                != SchedulerConfig.max_num_partial_prefills
-                or self.max_long_partial_prefills
-                != SchedulerConfig.max_long_partial_prefills):
-            _raise_or_fallback(feature_name="Concurrent Partial Prefill",
-                               recommend_to_remove=False)
-            return False
-
-        # V1 supports N-gram, Medusa, and Eagle speculative decoding.
+        if (
+            self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills
+            or self.max_long_partial_prefills
+            != SchedulerConfig.max_long_partial_prefills
+        ):
+            _raise_unsupported_error(feature_name="Concurrent Partial Prefill")
+
+        # N-gram, Medusa, and Eagle are supported for speculative decoding.
         if self.speculative_config is not None:
             # speculative_config could still be a dict at this point
             if isinstance(self.speculative_config, dict):
@@ -1486,109 +1814,61 @@ class EngineArgs:
                 raise NotImplementedError(
                     "Draft model speculative decoding is not supported yet. "
                     "Please consider using other speculative decoding methods "
-                    "such as ngram, medusa, eagle, or mtp.")
-
-        V1_BACKENDS = [
-            "FLASH_ATTN",
-            "PALLAS",
-            "TRITON_ATTN",
-            "TRITON_MLA",
-            "CUTLASS_MLA",
-            "FLASHMLA",
-            "FLASH_ATTN_MLA",
-            "FLASHINFER",
-            "FLASHINFER_MLA",
-            "ROCM_AITER_MLA",
-            "TORCH_SDPA",
-            "FLEX_ATTENTION",
-            "TREE_ATTN",
-            "XFORMERS",
-            "ROCM_ATTN",
-        ]
-        if (envs.is_set("VLLM_ATTENTION_BACKEND")
-                and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
-            name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
-            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
-            return False
-
-        #############################################################
-        # Experimental Features - allow users to opt in.
+                    "such as ngram, medusa, eagle, or mtp."
+                )
 
         if self.pipeline_parallel_size > 1:
-            supports_pp = getattr(self.distributed_executor_backend,
-                                  'supports_pp', False)
+            supports_pp = getattr(
+                self.distributed_executor_backend, "supports_pp", False
+            )
             if not supports_pp and self.distributed_executor_backend not in (
-                    ParallelConfig.distributed_executor_backend, "ray", "mp",
-                    "external_launcher"):
-                name = "Pipeline Parallelism without Ray distributed " \
-                        "executor or multiprocessing executor or external " \
-                        "launcher"
-                _raise_or_fallback(feature_name=name,
-                                   recommend_to_remove=False)
-                return False
-
-        if (current_platform.is_cpu()
-                and model_config.get_sliding_window() is not None):
-            _raise_or_fallback(feature_name="sliding window (CPU backend)",
-                               recommend_to_remove=False)
-            return False
-
-        #############################################################
-
-        return True
-
-    def _set_default_args(self, usage_context: UsageContext,
-                          model_config: ModelConfig) -> None:
-        """Set Default Arguments for V1 Engine."""
+                ParallelConfig.distributed_executor_backend,
+                "ray",
+                "mp",
+                "external_launcher",
+            ):
+                name = (
+                    "Pipeline Parallelism without Ray distributed "
+                    "executor or multiprocessing executor or external "
+                    "launcher"
+                )
+                _raise_unsupported_error(feature_name=name)
 
-        # V1 always uses chunked prefills and prefix caching
-        # for non-pooling tasks.
-        # For pooling tasks the default is False
+    @classmethod
+    def get_chunked_prefill_prefix_caching_defaults(
+        cls,
+        model_config: ModelConfig,
+    ) -> tuple[bool, bool]:
         if model_config.runner_type != "pooling":
-            self.enable_chunked_prefill = True
+            default_chunked_prefill = True
 
-            if model_config.enable_chunked_prefill is not None and \
-                model_config.enable_chunked_prefill is False:
-                self.enable_chunked_prefill = False
-
-            # TODO: When prefix caching supports prompt embeds inputs, this
-            # check can be removed.
-            if (self.enable_prompt_embeds
-                    and self.enable_prefix_caching is not False):
-                logger.warning(
-                    "--enable-prompt-embeds and --enable-prefix-caching "
-                    "are not supported together in V1. Prefix caching has "
-                    "been disabled.")
-                self.enable_prefix_caching = False
-
-            if self.enable_prefix_caching is None:
-                self.enable_prefix_caching = True
+            # Disable prefix caching default for hybrid models
+            # since the feature is still experimental.
+            default_prefix_caching = not model_config.is_hybrid
         else:
+            assert model_config.pooler_config is not None
 
             pooling_type = model_config.pooler_config.pooling_type
-            is_causal = getattr(model_config.hf_config, "is_causal", True)
-            incremental_prefill_supported = (pooling_type is not None
-                                             and pooling_type.lower() == "last"
-                                             and is_causal)
-
-            action = "Enabling" if \
-                incremental_prefill_supported else "Disabling"
-                
-            if model_config.enable_chunked_prefill is not None and \
-                model_config.enable_chunked_prefill is False:
-                self.enable_chunked_prefill = False
-
-            if self.enable_chunked_prefill is None:
-                self.enable_chunked_prefill = incremental_prefill_supported
-                logger.info("(%s) chunked prefill by default", action)
-            if self.enable_prefix_caching is None:
-                self.enable_prefix_caching = incremental_prefill_supported
-                logger.info("(%s) prefix caching by default", action)
-
-        # V1 should use the new scheduler by default.
-        # Swap it only if this arg is set to the original V0 default
-        if self.scheduler_cls == EngineArgs.scheduler_cls:
-            self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"
+            incremental_prefill_supported = (
+                pooling_type is not None
+                and pooling_type.lower() == "last"
+                and getattr(model_config.hf_config, "is_causal", True)
+            )
+
+            default_chunked_prefill = incremental_prefill_supported
+            default_prefix_caching = incremental_prefill_supported
+
+        return default_chunked_prefill, default_prefix_caching
+
+    @classmethod
+    def get_batch_defaults(
+        cls,
+        world_size: int,
+    ) -> tuple[dict[UsageContext | None, int], dict[UsageContext | None, int]]:
+        from vllm.usage.usage_lib import UsageContext
+
+        default_max_num_batched_tokens: dict[UsageContext | None, int]
+        default_max_num_seqs: dict[UsageContext | None, int]
 
         # When no user override, set the default values based on the usage
         # context.
@@ -1609,7 +1889,6 @@ class EngineArgs:
         # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
         # throughput, see PR #17885 for more details.
         # So here we do an extra device name check to prevent such regression.
-        from vllm.usage.usage_lib import UsageContext
         if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
             # For GPUs like H100 and MI300x, use larger default values.
             default_max_num_batched_tokens = {
@@ -1633,22 +1912,26 @@ class EngineArgs:
 
         # tpu specific default values.
         if current_platform.is_tpu():
-            default_max_num_batched_tokens_tpu = {
-                UsageContext.LLM_CLASS: {
-                    'V6E': 2048,
-                    'V5E': 1024,
-                    'V5P': 512,
-                },
-                UsageContext.OPENAI_API_SERVER: {
-                    'V6E': 1024,
-                    'V5E': 512,
-                    'V5P': 256,
+            chip_name = current_platform.get_device_name()
+
+            if chip_name == "V6E":
+                default_max_num_batched_tokens = {
+                    UsageContext.LLM_CLASS: 2048,
+                    UsageContext.OPENAI_API_SERVER: 1024,
+                }
+            elif chip_name == "V5E":
+                default_max_num_batched_tokens = {
+                    UsageContext.LLM_CLASS: 1024,
+                    UsageContext.OPENAI_API_SERVER: 512,
+                }
+            elif chip_name == "V5P":
+                default_max_num_batched_tokens = {
+                    UsageContext.LLM_CLASS: 512,
+                    UsageContext.OPENAI_API_SERVER: 256,
                 }
-            }
 
         # cpu specific default values.
         if current_platform.is_cpu():
-            world_size = self.pipeline_parallel_size * self.tensor_parallel_size
             default_max_num_batched_tokens = {
                 UsageContext.LLM_CLASS: 4096 * world_size,
                 UsageContext.OPENAI_API_SERVER: 2048 * world_size,
@@ -1658,48 +1941,119 @@ class EngineArgs:
                 UsageContext.OPENAI_API_SERVER: 128 * world_size,
             }
 
-        use_context_value = usage_context.value if usage_context else None
-        if (self.max_num_batched_tokens is None
-                and usage_context in default_max_num_batched_tokens):
-            if current_platform.is_tpu():
-                chip_name = current_platform.get_device_name()
-                if chip_name in default_max_num_batched_tokens_tpu[
-                        usage_context]:
-                    self.max_num_batched_tokens = \
-                        default_max_num_batched_tokens_tpu[
-                            usage_context][chip_name]
-                else:
-                    self.max_num_batched_tokens = \
-                        default_max_num_batched_tokens[usage_context]
-            else:
-                if not self.enable_chunked_prefill:
-                    self.max_num_batched_tokens = model_config.max_model_len
-                else:
-                    self.max_num_batched_tokens = \
-                        default_max_num_batched_tokens[usage_context]
+        return default_max_num_batched_tokens, default_max_num_seqs
+
+    def _set_default_args(
+        self, usage_context: UsageContext, model_config: ModelConfig
+    ) -> None:
+        """Set Default Arguments for V1 Engine."""
+        (
+            default_chunked_prefill,
+            default_prefix_caching,
+        ) = self.get_chunked_prefill_prefix_caching_defaults(model_config)
+
+        if self.enable_chunked_prefill is None:
+            self.enable_chunked_prefill = default_chunked_prefill
+
+            logger.debug(
+                "%s chunked prefill by default",
+                "Enabling" if default_chunked_prefill else "Disabling",
+            )
+        elif (
+            model_config.runner_type == "pooling"
+            and self.enable_chunked_prefill
+            and not default_chunked_prefill
+        ):
+            logger.warning(
+                "This model does not officially support chunked prefill. "
+                "Enabling this manually may cause the engine to crash "
+                "or produce incorrect outputs.",
+            )
+
+        if self.enable_prefix_caching is None:
+            self.enable_prefix_caching = default_prefix_caching
+
             logger.debug(
-                "Setting max_num_batched_tokens to %d for %s usage context.",
-                self.max_num_batched_tokens, use_context_value)
+                "%s prefix caching by default",
+                "Enabling" if default_prefix_caching else "Disabling",
+            )
+        elif (
+            model_config.runner_type == "pooling"
+            and self.enable_prefix_caching
+            and not default_prefix_caching
+        ):
+            logger.warning(
+                "This model does not officially support prefix caching. "
+                "Enabling this manually may cause the engine to crash "
+                "or produce incorrect outputs.",
+            )
 
-        if (self.max_num_seqs is None
-                and usage_context in default_max_num_seqs):
-            self.max_num_seqs = min(default_max_num_seqs[usage_context],
-                                    self.max_num_batched_tokens or sys.maxsize)
+        world_size = self.pipeline_parallel_size * self.tensor_parallel_size
+        (
+            default_max_num_batched_tokens,
+            default_max_num_seqs,
+        ) = self.get_batch_defaults(world_size)
 
-            logger.debug("Setting max_num_seqs to %d for %s usage context.",
-                         self.max_num_seqs, use_context_value)
+        orig_max_num_batched_tokens = self.max_num_batched_tokens
+        orig_max_num_seqs = self.max_num_seqs
+
+        if self.max_num_batched_tokens is None:
+            self.max_num_batched_tokens = default_max_num_batched_tokens.get(
+                usage_context,
+                SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
+            )
+
+        if self.max_num_seqs is None:
+            self.max_num_seqs = default_max_num_seqs.get(
+                usage_context,
+                SchedulerConfig.DEFAULT_MAX_NUM_SEQS,
+            )
+
+        if orig_max_num_batched_tokens is None:
+            if not self.enable_chunked_prefill:
+                # If max_model_len is too short, use the default for higher throughput.
+                self.max_num_batched_tokens = max(
+                    model_config.max_model_len,
+                    self.max_num_batched_tokens,
+                )
+
+            # When using default settings,
+            # Ensure max_num_batched_tokens does not exceed model limit.
+            # Some models (e.g., Whisper) have embeddings tied to max length.
+            self.max_num_batched_tokens = min(
+                self.max_num_seqs * model_config.max_model_len,
+                self.max_num_batched_tokens,
+            )
+
+            logger.debug(
+                "Defaulting max_num_batched_tokens to %d for %s usage context.",
+                self.max_num_batched_tokens,
+                usage_context.value if usage_context else None,
+            )
+
+        if orig_max_num_seqs is None:
+            assert self.max_num_batched_tokens is not None  # For type checking
+            self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)
+
+            logger.debug(
+                "Defaulting max_num_seqs to %d for %s usage context.",
+                self.max_num_seqs,
+                usage_context.value if usage_context else None,
+            )
 
 
 @dataclass
 class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous vLLM engine."""
+
     enable_log_requests: bool = False
 
     @property
     @deprecated(
         "`disable_log_requests` is deprecated and has been replaced with "
         "`enable_log_requests`. This will be removed in v0.12.0. Please use "
-        "`enable_log_requests` instead.")
+        "`enable_log_requests` instead."
+    )
     def disable_log_requests(self) -> bool:
         return not self.enable_log_requests
 
@@ -1707,42 +2061,44 @@ class AsyncEngineArgs(EngineArgs):
     @deprecated(
         "`disable_log_requests` is deprecated and has been replaced with "
         "`enable_log_requests`. This will be removed in v0.12.0. Please use "
-        "`enable_log_requests` instead.")
+        "`enable_log_requests` instead."
+    )
     def disable_log_requests(self, value: bool):
         self.enable_log_requests = not value
 
     @staticmethod
-    def add_cli_args(parser: FlexibleArgumentParser,
-                     async_args_only: bool = False) -> FlexibleArgumentParser:
+    def add_cli_args(
+        parser: FlexibleArgumentParser, async_args_only: bool = False
+    ) -> FlexibleArgumentParser:
         # Initialize plugin to update the parser, for example, The plugin may
         # add a new kind of quantization method to --quantization argument or
         # a new device to --device argument.
         load_general_plugins()
         if not async_args_only:
             parser = EngineArgs.add_cli_args(parser)
-        parser.add_argument('--enable-log-requests',
-                            action=argparse.BooleanOptionalAction,
-                            default=AsyncEngineArgs.enable_log_requests,
-                            help='Enable logging requests.')
-        parser.add_argument('--disable-log-requests',
-                            action=argparse.BooleanOptionalAction,
-                            default=not AsyncEngineArgs.enable_log_requests,
-                            help='[DEPRECATED] Disable logging requests.',
-                            deprecated=True)
+        parser.add_argument(
+            "--enable-log-requests",
+            action=argparse.BooleanOptionalAction,
+            default=AsyncEngineArgs.enable_log_requests,
+            help="Enable logging requests.",
+        )
+        parser.add_argument(
+            "--disable-log-requests",
+            action=argparse.BooleanOptionalAction,
+            default=not AsyncEngineArgs.enable_log_requests,
+            help="[DEPRECATED] Disable logging requests.",
+            deprecated=True,
+        )
         current_platform.pre_register_and_update(parser)
         return parser
 
 
-def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
-    if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
-        raise NotImplementedError(
-            f"VLLM_USE_V1=1 is not supported with {feature_name}.")
-    msg = f"{feature_name} is not supported by the V1 Engine. "
-    msg += "Falling back to V0. "
-    if recommend_to_remove:
-        msg += f"We recommend to remove {feature_name} from your config "
-        msg += "in favor of the V1 Engine."
-    logger.warning(msg)
+def _raise_unsupported_error(feature_name: str):
+    msg = (
+        f"{feature_name} is not supported. We recommend to "
+        f"remove {feature_name} from your config."
+    )
+    raise NotImplementedError(msg)
 
 
 def human_readable_int(value):
@@ -1755,17 +2111,17 @@ def human_readable_int(value):
     - '25.6k' -> 25,600
     """
     value = value.strip()
-    match = re.fullmatch(r'(\d+(?:\.\d+)?)([kKmMgGtT])', value)
+    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
     if match:
         decimal_multiplier = {
-            'k': 10**3,
-            'm': 10**6,
-            'g': 10**9,
+            "k": 10**3,
+            "m": 10**6,
+            "g": 10**9,
         }
         binary_multiplier = {
-            'K': 2**10,
-            'M': 2**20,
-            'G': 2**30,
+            "K": 2**10,
+            "M": 2**20,
+            "G": 2**30,
         }
 
         number, suffix = match.groups()
@@ -1778,9 +2134,11 @@ def human_readable_int(value):
             try:
                 return int(number) * mult
             except ValueError as e:
-                raise argparse.ArgumentTypeError("Decimals are not allowed " \
-                f"with binary suffixes like {suffix}. Did you mean to use " \
-                f"{number}{suffix.lower()} instead?") from e
+                raise argparse.ArgumentTypeError(
+                    "Decimals are not allowed "
+                    f"with binary suffixes like {suffix}. Did you mean to use "
+                    f"{number}{suffix.lower()} instead?"
+                ) from e
 
     # Regular plain number.
     return int(value)
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
deleted file mode 100644
index 2762175c430fbfaa143c4379874d91a71d469710..0000000000000000000000000000000000000000
--- a/vllm/engine/metrics.py
+++ /dev/null
@@ -1,577 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import time
-from typing import Counter as CollectionsCounter
-from typing import Dict, List, Optional, Type, Union, cast
-
-import numpy as np
-import prometheus_client
-
-from vllm.config import SupportsMetricsInfo, VllmConfig
-from vllm.engine.metrics_types import StatLoggerBase, Stats
-from vllm.executor.ray_utils import ray
-from vllm.logger import init_logger
-
-if ray is not None:
-    from ray.util import metrics as ray_metrics
-else:
-    ray_metrics = None
-
-logger = init_logger(__name__)
-
-prometheus_client.disable_created_metrics()
-
-# The begin-* and end* here are used by the documentation generator
-# to extract the metrics definitions.
-
-
-# --8<-- [start:metrics-definitions]
-class Metrics:
-    """
-    vLLM uses a multiprocessing-based frontend for the OpenAI server.
-    This means that we need to run prometheus_client in multiprocessing mode
-    See https://prometheus.github.io/client_python/multiprocess/ for more
-    details on limitations.
-    """
-
-    labelname_finish_reason = "finished_reason"
-    labelname_waiting_lora_adapters = "waiting_lora_adapters"
-    labelname_running_lora_adapters = "running_lora_adapters"
-    labelname_max_lora = "max_lora"
-    _gauge_cls = prometheus_client.Gauge
-    _counter_cls = prometheus_client.Counter
-    _histogram_cls = prometheus_client.Histogram
-
-    def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
-        # Unregister any existing vLLM collectors (for CI/CD)
-        self._unregister_vllm_metrics()
-
-        max_model_len = vllm_config.model_config.max_model_len
-
-        # Use this flag to hide metrics that were deprecated in
-        # a previous release and which will be removed future
-        self.show_hidden_metrics = \
-            vllm_config.observability_config.show_hidden_metrics
-
-        # System stats
-        #   Scheduler State
-        self.gauge_scheduler_running = self._gauge_cls(
-            name="vllm:num_requests_running",
-            documentation="Number of requests currently running on GPU.",
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-        self.gauge_scheduler_waiting = self._gauge_cls(
-            name="vllm:num_requests_waiting",
-            documentation="Number of requests waiting to be processed.",
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-        self.gauge_lora_info = self._gauge_cls(
-            name="vllm:lora_requests_info",
-            documentation="Running stats on lora requests.",
-            labelnames=[
-                self.labelname_running_lora_adapters,
-                self.labelname_max_lora,
-                self.labelname_waiting_lora_adapters,
-            ],
-            multiprocess_mode="livemostrecent",
-        )
-
-        #   KV Cache Usage in %
-        self.gauge_gpu_cache_usage = self._gauge_cls(
-            name="vllm:gpu_cache_usage_perc",
-            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-
-        # Iteration stats
-        self.counter_num_preemption = self._counter_cls(
-            name="vllm:num_preemptions_total",
-            documentation="Cumulative number of preemption from the engine.",
-            labelnames=labelnames)
-        self.counter_prompt_tokens = self._counter_cls(
-            name="vllm:prompt_tokens_total",
-            documentation="Number of prefill tokens processed.",
-            labelnames=labelnames)
-        self.counter_generation_tokens = self._counter_cls(
-            name="vllm:generation_tokens_total",
-            documentation="Number of generation tokens processed.",
-            labelnames=labelnames)
-        self.histogram_iteration_tokens = self._histogram_cls(
-            name="vllm:iteration_tokens_total",
-            documentation="Histogram of number of tokens per engine_step.",
-            labelnames=labelnames,
-            buckets=[
-                1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384
-            ])
-        self.histogram_time_to_first_token = self._histogram_cls(
-            name="vllm:time_to_first_token_seconds",
-            documentation="Histogram of time to first token in seconds.",
-            labelnames=labelnames,
-            buckets=[
-                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
-                2560.0
-            ])
-        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
-        # TODO: in 0.12, only enable if show_hidden_metrics=True
-        self.histogram_time_per_output_token = self._histogram_cls(
-            name="vllm:time_per_output_token_seconds",
-            documentation=(
-                "Histogram of time per output token in seconds."
-                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
-            labelnames=labelnames,
-            buckets=[
-                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
-            ])
-        self.histogram_inter_token_latency = self._histogram_cls(
-            name="vllm:inter_token_latency_seconds",
-            documentation="Histogram of inter token latency in seconds.",
-            labelnames=labelnames,
-            buckets=[
-                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
-            ])
-
-        # Request stats
-        #   Latency
-        request_latency_buckets = [
-            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
-            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
-        ]
-        self.histogram_e2e_time_request = self._histogram_cls(
-            name="vllm:e2e_request_latency_seconds",
-            documentation="Histogram of end to end request latency in seconds.",
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
-        self.histogram_queue_time_request = self._histogram_cls(
-            name="vllm:request_queue_time_seconds",
-            documentation=
-            "Histogram of time spent in WAITING phase for request.",
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
-        self.histogram_inference_time_request = self._histogram_cls(
-            name="vllm:request_inference_time_seconds",
-            documentation=
-            "Histogram of time spent in RUNNING phase for request.",
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
-        self.histogram_prefill_time_request = self._histogram_cls(
-            name="vllm:request_prefill_time_seconds",
-            documentation=
-            "Histogram of time spent in PREFILL phase for request.",
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
-        self.histogram_decode_time_request = self._histogram_cls(
-            name="vllm:request_decode_time_seconds",
-            documentation=
-            "Histogram of time spent in DECODE phase for request.",
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
-
-        #   Metadata
-        self.histogram_num_prompt_tokens_request = self._histogram_cls(
-            name="vllm:request_prompt_tokens",
-            documentation="Number of prefill tokens processed.",
-            labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len),
-        )
-        self.histogram_num_generation_tokens_request = \
-            self._histogram_cls(
-                name="vllm:request_generation_tokens",
-                documentation="Number of generation tokens processed.",
-                labelnames=labelnames,
-                buckets=build_1_2_5_buckets(max_model_len),
-            )
-        self.histogram_max_num_generation_tokens_request = self._histogram_cls(
-            name="vllm:request_max_num_generation_tokens",
-            documentation=
-            "Histogram of maximum number of requested generation tokens.",
-            labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len))
-        self.histogram_n_request = self._histogram_cls(
-            name="vllm:request_params_n",
-            documentation="Histogram of the n request parameter.",
-            labelnames=labelnames,
-            buckets=[1, 2, 5, 10, 20],
-        )
-        self.histogram_max_tokens_request = self._histogram_cls(
-            name="vllm:request_params_max_tokens",
-            documentation="Histogram of the max_tokens request parameter.",
-            labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len),
-        )
-        self.counter_request_success = self._counter_cls(
-            name="vllm:request_success_total",
-            documentation="Count of successfully processed requests.",
-            labelnames=labelnames + [Metrics.labelname_finish_reason])
-
-
-# --8<-- [end:metrics-definitions]
-
-    def _unregister_vllm_metrics(self) -> None:
-        for collector in list(prometheus_client.REGISTRY._collector_to_names):
-            if hasattr(collector, "_name") and "vllm" in collector._name:
-                prometheus_client.REGISTRY.unregister(collector)
-
-
-class _RayGaugeWrapper:
-    """Wraps around ray.util.metrics.Gauge to provide same API as
-    prometheus_client.Gauge"""
-
-    def __init__(self,
-                 name: str,
-                 documentation: str = "",
-                 labelnames: Optional[List[str]] = None,
-                 multiprocess_mode: str = ""):
-        del multiprocess_mode
-        labelnames_tuple = tuple(labelnames) if labelnames else None
-        self._gauge = ray_metrics.Gauge(name=name,
-                                        description=documentation,
-                                        tag_keys=labelnames_tuple)
-
-    def labels(self, **labels):
-        self._gauge.set_default_tags(labels)
-        return self
-
-    def set(self, value: Union[int, float]):
-        return self._gauge.set(value)
-
-    def set_to_current_time(self):
-        # ray metrics doesn't have set_to_current time, https://docs.ray.io/en/latest/_modules/ray/util/metrics.html
-        return self._gauge.set(time.time())
-
-
-class _RayCounterWrapper:
-    """Wraps around ray.util.metrics.Counter to provide same API as
-    prometheus_client.Counter"""
-
-    def __init__(self,
-                 name: str,
-                 documentation: str = "",
-                 labelnames: Optional[List[str]] = None):
-        labelnames_tuple = tuple(labelnames) if labelnames else None
-        self._counter = ray_metrics.Counter(name=name,
-                                            description=documentation,
-                                            tag_keys=labelnames_tuple)
-
-    def labels(self, **labels):
-        self._counter.set_default_tags(labels)
-        return self
-
-    def inc(self, value: Union[int, float] = 1.0):
-        if value == 0:
-            return
-        return self._counter.inc(value)
-
-
-class _RayHistogramWrapper:
-    """Wraps around ray.util.metrics.Histogram to provide same API as
-    prometheus_client.Histogram"""
-
-    def __init__(self,
-                 name: str,
-                 documentation: str = "",
-                 labelnames: Optional[List[str]] = None,
-                 buckets: Optional[List[float]] = None):
-        labelnames_tuple = tuple(labelnames) if labelnames else None
-        boundaries = buckets if buckets else []
-        self._histogram = ray_metrics.Histogram(name=name,
-                                                description=documentation,
-                                                tag_keys=labelnames_tuple,
-                                                boundaries=boundaries)
-
-    def labels(self, **labels):
-        self._histogram.set_default_tags(labels)
-        return self
-
-    def observe(self, value: Union[int, float]):
-        return self._histogram.observe(value)
-
-
-class RayMetrics(Metrics):
-    """
-    RayMetrics is used by RayPrometheusStatLogger to log to Ray metrics.
-    Provides the same metrics as Metrics but uses Ray's util.metrics library.
-    """
-    _gauge_cls: Type[prometheus_client.Gauge] = cast(
-        Type[prometheus_client.Gauge], _RayGaugeWrapper)
-    _counter_cls: Type[prometheus_client.Counter] = cast(
-        Type[prometheus_client.Counter], _RayCounterWrapper)
-    _histogram_cls: Type[prometheus_client.Histogram] = cast(
-        Type[prometheus_client.Histogram], _RayHistogramWrapper)
-
-    def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
-        if ray_metrics is None:
-            raise ImportError("RayMetrics requires Ray to be installed.")
-        super().__init__(labelnames, vllm_config)
-
-    def _unregister_vllm_metrics(self) -> None:
-        # No-op on purpose
-        pass
-
-
-def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
-    """
-    Builds a list of buckets with increasing powers of 10 multiplied by
-    mantissa values until the value exceeds the specified maximum.
-
-    """
-    exponent = 0
-    buckets: List[int] = []
-    while True:
-        for m in mantissa_lst:
-            value = m * 10**exponent
-            if value <= max_value:
-                buckets.append(value)
-            else:
-                return buckets
-        exponent += 1
-
-
-def build_1_2_5_buckets(max_value: int) -> List[int]:
-    """
-    Example:
-    >>> build_1_2_5_buckets(100)
-    [1, 2, 5, 10, 20, 50, 100]
-    """
-    return build_buckets([1, 2, 5], max_value)
-
-
-def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
-    """
-    Example:
-    >>> build_1_2_3_5_8_buckets(100)
-    [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
-    """
-    return build_buckets([1, 2, 3, 5, 8], max_value)
-
-
-def local_interval_elapsed(now: float, last_log: float,
-                           local_interval: float) -> bool:
-    elapsed_time = now - last_log
-    return elapsed_time > local_interval
-
-
-def get_throughput(tracked_stats: List[int], now: float,
-                   last_log: float) -> float:
-    return float(np.sum(tracked_stats) / (now - last_log))
-
-
-class LoggingStatLogger(StatLoggerBase):
-    """LoggingStatLogger is used in LLMEngine to log to Stdout."""
-
-    def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
-        super().__init__(local_interval, vllm_config)
-        self.last_prompt_throughput: Optional[float] = None
-        self.last_generation_throughput: Optional[float] = None
-
-    def log(self, stats: Stats) -> None:
-        """Called by LLMEngine.
-           Logs to Stdout every self.local_interval seconds."""
-
-        # Save tracked stats for token counters.
-        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
-        self.num_generation_tokens.append(stats.num_generation_tokens_iter)
-
-        # Log locally every local_interval seconds.
-        if local_interval_elapsed(stats.now, self.last_local_log,
-                                  self.local_interval):
-            # Compute summary metrics for tracked stats (and log them
-            # to prometheus if applicable).
-            prompt_throughput = get_throughput(self.num_prompt_tokens,
-                                               now=stats.now,
-                                               last_log=self.last_local_log)
-            generation_throughput = get_throughput(
-                self.num_generation_tokens,
-                now=stats.now,
-                last_log=self.last_local_log)
-
-            log_fn = logger.info
-            if not any((prompt_throughput, generation_throughput,
-                        self.last_prompt_throughput,
-                        self.last_generation_throughput)):
-                # Avoid log noise on an idle production system
-                log_fn = logger.debug
-
-            log_fn(
-                "Avg prompt throughput: %.1f tokens/s, "
-                "Avg generation throughput: %.1f tokens/s, "
-                "Running: %d reqs, Swapped: %d reqs, "
-                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
-                "CPU KV cache usage: %.1f%%.",
-                prompt_throughput,
-                generation_throughput,
-                stats.num_running_sys,
-                stats.num_swapped_sys,
-                stats.num_waiting_sys,
-                stats.gpu_cache_usage_sys * 100,
-                stats.cpu_cache_usage_sys * 100,
-            )
-            if (stats.cpu_prefix_cache_hit_rate >= 0
-                    or stats.gpu_prefix_cache_hit_rate >= 0):
-                log_fn(
-                    "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%",
-                    stats.gpu_prefix_cache_hit_rate * 100,
-                    stats.cpu_prefix_cache_hit_rate * 100,
-                )
-
-            self._reset(stats, prompt_throughput, generation_throughput)
-
-    def _reset(self, stats, prompt_throughput, generation_throughput) -> None:
-        # Reset tracked stats for next interval.
-        self.num_prompt_tokens = []
-        self.num_generation_tokens = []
-        self.last_local_log = stats.now
-        self.last_prompt_throughput = prompt_throughput
-        self.last_generation_throughput = generation_throughput
-
-    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
-        raise NotImplementedError
-
-
-class PrometheusStatLogger(StatLoggerBase):
-    """PrometheusStatLogger is used LLMEngine to log to Prometheus."""
-    _metrics_cls = Metrics
-    _gauge_cls = prometheus_client.Gauge
-
-    def __init__(self, local_interval: float, labels: Dict[str, str],
-                 vllm_config: VllmConfig) -> None:
-        super().__init__(local_interval, vllm_config)
-        # Prometheus metrics
-        self.labels = labels
-        self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
-                                         vllm_config=vllm_config)
-
-    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
-        # Convenience function for logging to gauge.
-        gauge.labels(**self.labels).set(data)
-
-    def _log_counter(self, counter, data: Union[int, float]) -> None:
-        # Convenience function for logging to counter.
-        # Prevent ValueError from negative increment
-        if data < 0:
-            logger.warning("Skipping negative increment of %g to %s", data,
-                           counter)
-            return
-        counter.labels(**self.labels).inc(data)
-
-    def _log_counter_labels(self, counter, data: CollectionsCounter,
-                            label_key: str) -> None:
-        # Convenience function for collection counter of labels.
-        for label, count in data.items():
-            counter.labels(**{**self.labels, label_key: label}).inc(count)
-
-    def _log_histogram(self, histogram, data: Union[List[int],
-                                                    List[float]]) -> None:
-        # Convenience function for logging list to histogram.
-        for datum in data:
-            histogram.labels(**self.labels).observe(datum)
-
-    def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
-        gauge.labels(**data).set_to_current_time()
-
-    def _log_prometheus(self, stats: Stats) -> None:
-        # System state data
-        self._log_gauge(self.metrics.gauge_scheduler_running,
-                        stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_waiting,
-                        stats.num_waiting_sys)
-        self._log_gauge(self.metrics.gauge_gpu_cache_usage,
-                        stats.gpu_cache_usage_sys)
-        # Including max-lora in metric, in future this property of lora
-        # config maybe extended to be dynamic.
-        lora_info = {
-            self.metrics.labelname_running_lora_adapters:
-            ",".join(stats.running_lora_adapters),
-            self.metrics.labelname_waiting_lora_adapters:
-            ",".join(stats.waiting_lora_adapters),
-            self.metrics.labelname_max_lora:
-            stats.max_lora,
-        }
-        self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
-        # Iteration level data
-        self._log_counter(self.metrics.counter_num_preemption,
-                          stats.num_preemption_iter)
-        self._log_counter(self.metrics.counter_prompt_tokens,
-                          stats.num_prompt_tokens_iter)
-        self._log_counter(self.metrics.counter_generation_tokens,
-                          stats.num_generation_tokens_iter)
-        self._log_histogram(self.metrics.histogram_iteration_tokens,
-                            [stats.num_tokens_iter])
-        self._log_histogram(self.metrics.histogram_time_to_first_token,
-                            stats.time_to_first_tokens_iter)
-        self._log_histogram(self.metrics.histogram_time_per_output_token,
-                            stats.inter_token_latencies_iter)
-        self._log_histogram(self.metrics.histogram_inter_token_latency,
-                            stats.inter_token_latencies_iter)
-
-        # Request level data
-        # Latency
-        self._log_histogram(self.metrics.histogram_e2e_time_request,
-                            stats.time_e2e_requests)
-        self._log_histogram(self.metrics.histogram_queue_time_request,
-                            stats.time_queue_requests)
-        self._log_histogram(self.metrics.histogram_inference_time_request,
-                            stats.time_inference_requests)
-        self._log_histogram(self.metrics.histogram_prefill_time_request,
-                            stats.time_prefill_requests)
-        self._log_histogram(self.metrics.histogram_decode_time_request,
-                            stats.time_decode_requests)
-        # Metadata
-        finished_reason_counter = CollectionsCounter(
-            stats.finished_reason_requests)
-        self._log_counter_labels(self.metrics.counter_request_success,
-                                 finished_reason_counter,
-                                 Metrics.labelname_finish_reason)
-        self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
-                            stats.num_prompt_tokens_requests)
-        self._log_histogram(
-            self.metrics.histogram_num_generation_tokens_request,
-            stats.num_generation_tokens_requests)
-        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
-        self._log_histogram(
-            self.metrics.histogram_max_num_generation_tokens_request,
-            stats.max_num_generation_tokens_requests)
-        self._log_histogram(self.metrics.histogram_max_tokens_request,
-                            stats.max_tokens_requests)
-
-    def log(self, stats: Stats):
-        """Logs to prometheus and tracked stats every iteration."""
-        # Log to prometheus.
-        self._log_prometheus(stats)
-
-        # Save tracked stats for token counters.
-        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
-        self.num_generation_tokens.append(stats.num_generation_tokens_iter)
-
-        # Log locally every local_interval seconds.
-        if local_interval_elapsed(stats.now, self.last_local_log,
-                                  self.local_interval):
-
-            # Reset tracked stats for next interval.
-            self.num_prompt_tokens = []
-            self.num_generation_tokens = []
-            self.last_local_log = stats.now
-
-    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
-        # Info type metrics are syntactic sugar for a gauge permanently set to 1
-        # Since prometheus multiprocessing mode does not support Info, emulate
-        # info here with a gauge.
-        if type == "cache_config":
-            metrics_info = obj.metrics_info()
-            info_gauge = self._gauge_cls(
-                name="vllm:cache_config_info",
-                documentation="Information of the LLMEngine CacheConfig",
-                labelnames=metrics_info.keys(),
-                multiprocess_mode="mostrecent")
-            info_gauge.labels(**metrics_info).set(1)
-
-
-class RayPrometheusStatLogger(PrometheusStatLogger):
-    """RayPrometheusStatLogger uses Ray metrics instead."""
-    _metrics_cls = RayMetrics
-
-    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
-        return None
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
deleted file mode 100644
index 9778ab5a8c99b266c481ef17174466f34a4c446c..0000000000000000000000000000000000000000
--- a/vllm/engine/metrics_types.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-These types are defined in this file to avoid importing vllm.engine.metrics
-and therefore importing prometheus_client.
-
-This is required due to usage of Prometheus multiprocess mode to enable 
-metrics after splitting out the uvicorn process from the engine process.
-
-Prometheus multiprocess mode requires setting PROMETHEUS_MULTIPROC_DIR
-before prometheus_client is imported. Typically, this is done by setting
-the env variable before launch, but since we are a library, we need to
-do this in Python code and lazily import prometheus_client.
-"""
-
-import time
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import List
-
-from vllm.config import SupportsMetricsInfo, VllmConfig
-
-
-@dataclass
-class Stats:
-    """Created by LLMEngine for use by StatLogger."""
-    now: float
-
-    # System stats (should have _sys suffix)
-    #   Scheduler State
-    num_running_sys: int
-    num_waiting_sys: int
-    num_swapped_sys: int
-    #   KV Cache Usage in %
-    gpu_cache_usage_sys: float
-    cpu_cache_usage_sys: float
-    #   Prefix caching block hit rate
-    cpu_prefix_cache_hit_rate: float
-    gpu_prefix_cache_hit_rate: float
-
-    # Iteration stats (should have _iter suffix)
-    num_prompt_tokens_iter: int
-    num_generation_tokens_iter: int
-    num_tokens_iter: int
-    time_to_first_tokens_iter: List[float]
-    inter_token_latencies_iter: List[float]
-    num_preemption_iter: int
-
-    # Request stats (should have _requests suffix)
-    #   Latency
-    time_e2e_requests: List[float]
-    time_queue_requests: List[float]
-    time_inference_requests: List[float]
-    time_prefill_requests: List[float]
-    time_decode_requests: List[float]
-    #   Metadata
-    num_prompt_tokens_requests: List[int]
-    num_generation_tokens_requests: List[int]
-    n_requests: List[int]
-    max_num_generation_tokens_requests: List[int]
-    max_tokens_requests: List[int]
-    finished_reason_requests: List[str]
-    waiting_lora_adapters: List[str]
-    running_lora_adapters: List[str]
-    max_lora: str
-
-
-class StatLoggerBase(ABC):
-    """Base class for StatLogger."""
-
-    def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
-        # Tracked stats over current local logging interval.
-        self.num_prompt_tokens: List[int] = []
-        self.num_generation_tokens: List[int] = []
-        self.last_local_log = time.time()
-        self.local_interval = local_interval
-
-    @abstractmethod
-    def log(self, stats: Stats) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
-    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
-        raise NotImplementedError
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 9aea74d0c8f3c9ddd7295c1cbd37558827a08aea..462d2c4e50e737b4b027c3f00e3da0053d7e4862 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -1,234 +1,90 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import asyncio
+import enum
 from abc import ABC, abstractmethod
-from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union
+from collections.abc import AsyncGenerator, Iterable, Mapping
+from typing import Any
 
-from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.config import ModelConfig, VllmConfig
-from vllm.inputs.data import PromptType, TokensPrompt
-from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
-from vllm.inputs.preprocess import InputPreprocessor
+from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
-from vllm.plugins.io_processors.interface import IOProcessor
+from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.plugins.io_processors import IOProcessor
 from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import BeamSearchParams, SamplingParams
+from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import Device, collect_from_async_generator, random_uuid
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.processor import Processor
 
 logger = init_logger(__name__)
 
 
+class Device(enum.Enum):
+    GPU = enum.auto()
+    CPU = enum.auto()
+
+
 class EngineClient(ABC):
     """Protocol class for Clients to Engine"""
 
+    vllm_config: VllmConfig
+    model_config: ModelConfig
+    processor: Processor
+    io_processor: IOProcessor | None
+
     @property
     @abstractmethod
-    def is_running(self) -> bool:
-        ...
+    def is_running(self) -> bool: ...
 
     @property
     @abstractmethod
-    def is_stopped(self) -> bool:
-        ...
+    def is_stopped(self) -> bool: ...
 
     @property
     @abstractmethod
-    def errored(self) -> bool:
-        ...
+    def errored(self) -> bool: ...
 
     @property
     @abstractmethod
-    def dead_error(self) -> BaseException:
-        ...
+    def dead_error(self) -> BaseException: ...
 
     @abstractmethod
     def generate(
         self,
-        prompt: PromptType,
+        prompt: EngineCoreRequest | PromptType,
         sampling_params: SamplingParams,
         request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        *,
+        prompt_text: str | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
+        data_parallel_rank: int | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request."""
         ...
 
-    async def beam_search(
-        self,
-        prompt: PromptType,
-        request_id: str,
-        params: BeamSearchParams,
-        lora_request: Optional[LoRARequest] = None,
-    ) -> AsyncGenerator[RequestOutput, None]:
-
-        beam_width = params.beam_width
-        max_tokens = params.max_tokens
-        ignore_eos = params.ignore_eos
-        temperature = params.temperature
-        length_penalty = params.length_penalty
-        include_stop_str_in_output = params.include_stop_str_in_output
-
-        preprocessor = await self.get_input_preprocessor()
-        tokenizer = preprocessor.get_tokenizer()
-        eos_token_id = tokenizer.eos_token_id
-
-        if is_explicit_encoder_decoder_prompt(prompt):
-            raise NotImplementedError
-        else:
-            processed_inputs = preprocessor._prompt_to_llm_inputs(prompt)
-
-        if processed_inputs["type"] == "embeds":
-            raise NotImplementedError
-
-        # This is a workaround to fix multimodal beam search; this is a
-        # bandaid fix for 2 small problems:
-        # 1. Multi_modal_data on the processed_inputs currently resolves to
-        #    `None`.
-        # 2. preprocessing above expands the multimodal placeholders. However,
-        #    this happens again in generation, so the double expansion causes
-        #    a mismatch.
-        # TODO - would be ideal to handle this more gracefully.
-        prompt_token_ids = prompt.get("prompt_token_ids")
-        multi_modal_data = prompt.get("multi_modal_data")
-
-        prompt_text = processed_inputs.get("prompt")
-        mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
-
-        tokenized_length = len(prompt_token_ids)
-
-        sort_beams_key = create_sort_beams_key_function(
-            eos_token_id, length_penalty)
-
-        beam_search_params = SamplingParams(
-            logprobs=2 * beam_width,
-            max_tokens=1,
-            temperature=temperature,
-        )
-        all_beams = [
-            BeamSearchSequence(tokens=prompt_token_ids,
-                               cum_logprob=0,
-                               logprobs=[],
-                               multi_modal_data=multi_modal_data,
-                               mm_processor_kwargs=mm_processor_kwargs,
-                               lora_request=lora_request)
-        ]
-        completed = []
-
-        for _ in range(max_tokens):
-            prompts_batch, lora_req_batch = zip(*[(
-                TokensPrompt(prompt_token_ids=beam.tokens,
-                             multi_modal_data=beam.multi_modal_data,
-                             mm_processor_kwargs=beam.mm_processor_kwargs),
-                beam.lora_request,
-            ) for beam in all_beams])
-
-            tasks = []
-
-            request_id = f"beam_search-{random_uuid()}"
-            for i, (individual_prompt,
-                    lora_req) in enumerate(zip(prompts_batch, lora_req_batch)):
-                request_id_item = f"{request_id}-{i}"
-                task = asyncio.create_task(
-                    collect_from_async_generator(
-                        self.generate(individual_prompt,
-                                      beam_search_params,
-                                      request_id_item,
-                                      lora_request=lora_req)))
-                tasks.append(task)
-
-            output = await asyncio.gather(*tasks)
-
-            output = [x[0] for x in output]
-
-            new_beams = []
-            for i, current_beam in enumerate(all_beams):
-                result = output[i]
-
-                if result.outputs[0].logprobs is not None:
-                    logprobs = result.outputs[0].logprobs[0]
-                    for token_id, logprob_obj in logprobs.items():
-                        if token_id == eos_token_id and \
-                            not ignore_eos:
-                            completed.append(
-                                BeamSearchSequence(
-                                    tokens=current_beam.tokens +
-                                    [token_id] if include_stop_str_in_output
-                                    else current_beam.tokens,
-                                    logprobs=current_beam.logprobs +
-                                    [logprobs],
-                                    cum_logprob=current_beam.cum_logprob +
-                                    logprob_obj.logprob,
-                                    finish_reason="stop",
-                                    stop_reason=eos_token_id))
-                        else:
-                            new_beams.append(
-                                BeamSearchSequence(
-                                    tokens=current_beam.tokens + [token_id],
-                                    logprobs=current_beam.logprobs +
-                                    [logprobs],
-                                    lora_request=current_beam.lora_request,
-                                    cum_logprob=current_beam.cum_logprob +
-                                    logprob_obj.logprob,
-                                    multi_modal_data=current_beam.
-                                    multi_modal_data,
-                                    mm_processor_kwargs=current_beam.
-                                    mm_processor_kwargs))
-
-            sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True)
-            all_beams = sorted_beams[:beam_width]
-
-        completed.extend(all_beams)
-        sorted_completed = sorted(completed, key=sort_beams_key, reverse=True)
-        best_beams = sorted_completed[:beam_width]
-
-        for beam in best_beams:
-            if (beam.tokens[-1] == eos_token_id and not ignore_eos):
-                # Skip the eos token in the text.
-                tokens = beam.tokens[tokenized_length:-1]
-            else:
-                tokens = beam.tokens[tokenized_length:]
-            beam.text = tokenizer.decode(tokens)
-
-        yield RequestOutput(
-            request_id=request_id,
-            prompt=prompt_text,
-            outputs=[
-                CompletionOutput(text=beam.text,
-                                 cumulative_logprob=beam.cum_logprob,
-                                 token_ids=beam.tokens[tokenized_length:],
-                                 index=i,
-                                 logprobs=beam.logprobs,
-                                 finish_reason=beam.finish_reason if
-                                 beam.finish_reason is not None else "length",
-                                 stop_reason=beam.stop_reason)
-                for (i, beam) in enumerate(best_beams)
-            ],
-            finished=True,
-            prompt_token_ids=prompt_token_ids,
-            prompt_logprobs=None)
-
     @abstractmethod
     def encode(
         self,
         prompt: PromptType,
         pooling_params: PoolingParams,
         request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        lora_request: LoRARequest | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        truncate_prompt_tokens: int | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model."""
         ...
 
     @abstractmethod
-    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
+    async def abort(self, request_id: str | Iterable[str]) -> None:
         """Abort a request.
 
         Args:
@@ -237,36 +93,16 @@ class EngineClient(ABC):
         """
         ...
 
-    @abstractmethod
-    async def get_vllm_config(self) -> VllmConfig:
-        """Get the vllm configuration of the vLLM engine."""
-        ...
-
-    @abstractmethod
-    async def get_model_config(self) -> ModelConfig:
-        """Get the model configuration of the vLLM engine."""
-        ...
-
-    @abstractmethod
-    async def get_input_preprocessor(self) -> InputPreprocessor:
-        """Get the input processor of the vLLM engine."""
-        ...
-
     @abstractmethod
     async def get_tokenizer(self) -> AnyTokenizer:
         """Get the tokenizer"""
         ...
 
-    async def get_io_processor(self) -> IOProcessor:
-        raise NotImplementedError
-
     @abstractmethod
-    async def is_tracing_enabled(self) -> bool:
-        ...
+    async def is_tracing_enabled(self) -> bool: ...
 
     @abstractmethod
-    async def do_log_stats(self) -> None:
-        ...
+    async def do_log_stats(self) -> None: ...
 
     @abstractmethod
     async def check_health(self) -> None:
@@ -280,7 +116,7 @@ class EngineClient(ABC):
 
     @abstractmethod
     async def stop_profile(self) -> None:
-        """Start profiling the engine"""
+        """Stop profiling the engine"""
         ...
 
     @abstractmethod
@@ -289,8 +125,7 @@ class EngineClient(ABC):
         ...
 
     @abstractmethod
-    async def reset_prefix_cache(self,
-                                 device: Optional[Device] = None) -> None:
+    async def reset_prefix_cache(self) -> None:
         """Reset the prefix cache"""
         ...
 
@@ -300,7 +135,7 @@ class EngineClient(ABC):
         ...
 
     @abstractmethod
-    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up(self, tags: list[str] | None = None) -> None:
         """Wake up the engine"""
         ...
 
@@ -314,17 +149,19 @@ class EngineClient(ABC):
         """Load a new LoRA adapter into the engine for future requests."""
         ...
 
-    async def scale_elastic_ep(self,
-                               new_data_parallel_size: int,
-                               drain_timeout: int = 300) -> None:
+    async def scale_elastic_ep(
+        self, new_data_parallel_size: int, drain_timeout: int = 300
+    ) -> None:
         """Scale the engine"""
         raise NotImplementedError
 
-    async def collective_rpc(self,
-                             method: str,
-                             timeout: Optional[float] = None,
-                             args: tuple = (),
-                             kwargs: Optional[dict] = None):
+    async def collective_rpc(
+        self,
+        method: str,
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+    ):
         """Perform a collective RPC call to the given path."""
         raise NotImplementedError
 
diff --git a/vllm/entrypoints/anthropic/__init__.py b/vllm/entrypoints/anthropic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vllm/entrypoints/anthropic/protocol.py b/vllm/entrypoints/anthropic/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..626ca7472ae64b011d50f4b052e7ec51bb678f70
--- /dev/null
+++ b/vllm/entrypoints/anthropic/protocol.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pydantic models for Anthropic API protocol"""
+
+import time
+from typing import Any, Literal, Optional
+
+from pydantic import BaseModel, field_validator
+
+
+class AnthropicError(BaseModel):
+    """Error structure for Anthropic API"""
+
+    type: str
+    message: str
+
+
+class AnthropicErrorResponse(BaseModel):
+    """Error response structure for Anthropic API"""
+
+    type: Literal["error"] = "error"
+    error: AnthropicError
+
+
+class AnthropicUsage(BaseModel):
+    """Token usage information"""
+
+    input_tokens: int
+    output_tokens: int
+    cache_creation_input_tokens: int | None = None
+    cache_read_input_tokens: int | None = None
+
+
+class AnthropicContentBlock(BaseModel):
+    """Content block in message"""
+
+    type: Literal["text", "image", "tool_use", "tool_result"]
+    text: str | None = None
+    # For image content
+    source: dict[str, Any] | None = None
+    # For tool use/result
+    id: str | None = None
+    name: str | None = None
+    input: dict[str, Any] | None = None
+    content: str | list[dict[str, Any]] | None = None
+    is_error: bool | None = None
+
+
+class AnthropicMessage(BaseModel):
+    """Message structure"""
+
+    role: Literal["user", "assistant"]
+    content: str | list[AnthropicContentBlock]
+
+
+class AnthropicTool(BaseModel):
+    """Tool definition"""
+
+    name: str
+    description: str | None = None
+    input_schema: dict[str, Any]
+
+    @field_validator("input_schema")
+    @classmethod
+    def validate_input_schema(cls, v):
+        if not isinstance(v, dict):
+            raise ValueError("input_schema must be a dictionary")
+        if "type" not in v:
+            v["type"] = "object"  # Default to object type
+        return v
+
+
+class AnthropicToolChoice(BaseModel):
+    """Tool Choice definition"""
+
+    type: Literal["auto", "any", "tool"]
+    name: str | None = None
+
+
+class AnthropicMessagesRequest(BaseModel):
+    """Anthropic Messages API request"""
+
+    model: str
+    messages: list[AnthropicMessage]
+    max_tokens: int
+    metadata: dict[str, Any] | None = None
+    stop_sequences: list[str] | None = None
+    stream: bool | None = False
+    system: str | list[AnthropicContentBlock] | None = None
+    temperature: float | None = None
+    tool_choice: AnthropicToolChoice | None = None
+    tools: list[AnthropicTool] | None = None
+    top_k: int | None = None
+    top_p: float | None = None
+
+    @field_validator("model")
+    @classmethod
+    def validate_model(cls, v):
+        if not v:
+            raise ValueError("Model is required")
+        return v
+
+    @field_validator("max_tokens")
+    @classmethod
+    def validate_max_tokens(cls, v):
+        if v <= 0:
+            raise ValueError("max_tokens must be positive")
+        return v
+
+
+class AnthropicDelta(BaseModel):
+    """Delta for streaming responses"""
+
+    type: Literal["text_delta", "input_json_delta"] | None = None
+    text: str | None = None
+    partial_json: str | None = None
+
+    # Message delta
+    stop_reason: (
+        Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None
+    ) = None
+    stop_sequence: str | None = None
+
+
+class AnthropicStreamEvent(BaseModel):
+    """Streaming event"""
+
+    type: Literal[
+        "message_start",
+        "message_delta",
+        "message_stop",
+        "content_block_start",
+        "content_block_delta",
+        "content_block_stop",
+        "ping",
+        "error",
+    ]
+    message: Optional["AnthropicMessagesResponse"] = None
+    delta: AnthropicDelta | None = None
+    content_block: AnthropicContentBlock | None = None
+    index: int | None = None
+    error: AnthropicError | None = None
+    usage: AnthropicUsage | None = None
+
+
+class AnthropicMessagesResponse(BaseModel):
+    """Anthropic Messages API response"""
+
+    id: str
+    type: Literal["message"] = "message"
+    role: Literal["assistant"] = "assistant"
+    content: list[AnthropicContentBlock]
+    model: str
+    stop_reason: (
+        Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None
+    ) = None
+    stop_sequence: str | None = None
+    usage: AnthropicUsage | None = None
+
+    def model_post_init(self, __context):
+        if not self.id:
+            self.id = f"msg_{int(time.time() * 1000)}"
diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py
new file mode 100644
index 0000000000000000000000000000000000000000..340dabf0e7117d372d29feb3b58a253de165ae8f
--- /dev/null
+++ b/vllm/entrypoints/anthropic/serving_messages.py
@@ -0,0 +1,460 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from
+# https://github.com/vllm/vllm/entrypoints/openai/serving_chat.py
+
+"""Anthropic Messages API serving handler"""
+
+import json
+import logging
+import time
+from collections.abc import AsyncGenerator
+from typing import Any
+
+from fastapi import Request
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicContentBlock,
+    AnthropicDelta,
+    AnthropicError,
+    AnthropicMessagesRequest,
+    AnthropicMessagesResponse,
+    AnthropicStreamEvent,
+    AnthropicUsage,
+)
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionStreamResponse,
+    ChatCompletionToolsParam,
+    ErrorResponse,
+    StreamOptions,
+)
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+
+logger = logging.getLogger(__name__)
+
+
+def wrap_data_with_event(data: str, event: str):
+    return f"event: {event}\ndata: {data}\n\n"
+
+
+class AnthropicServingMessages(OpenAIServingChat):
+    """Handler for Anthropic Messages API requests"""
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        response_role: str,
+        *,
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+        return_tokens_as_token_ids: bool = False,
+        reasoning_parser: str = "",
+        enable_auto_tools: bool = False,
+        tool_parser: str | None = None,
+        enable_prompt_tokens_details: bool = False,
+        enable_force_include_usage: bool = False,
+    ):
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            response_role=response_role,
+            request_logger=request_logger,
+            chat_template=chat_template,
+            chat_template_content_format=chat_template_content_format,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            reasoning_parser=reasoning_parser,
+            enable_auto_tools=enable_auto_tools,
+            tool_parser=tool_parser,
+            enable_prompt_tokens_details=enable_prompt_tokens_details,
+            enable_force_include_usage=enable_force_include_usage,
+        )
+        self.stop_reason_map = {
+            "stop": "end_turn",
+            "length": "max_tokens",
+            "tool_calls": "tool_use",
+        }
+
+    def _convert_anthropic_to_openai_request(
+        self, anthropic_request: AnthropicMessagesRequest
+    ) -> ChatCompletionRequest:
+        """Convert Anthropic message format to OpenAI format"""
+        openai_messages = []
+
+        # Add system message if provided
+        if anthropic_request.system:
+            if isinstance(anthropic_request.system, str):
+                openai_messages.append(
+                    {"role": "system", "content": anthropic_request.system}
+                )
+            else:
+                system_prompt = ""
+                for block in anthropic_request.system:
+                    if block.type == "text" and block.text:
+                        system_prompt += block.text
+                openai_messages.append({"role": "system", "content": system_prompt})
+
+        for msg in anthropic_request.messages:
+            openai_msg: dict[str, Any] = {"role": msg.role}  # type: ignore
+            if isinstance(msg.content, str):
+                openai_msg["content"] = msg.content
+            else:
+                # Handle complex content blocks
+                content_parts: list[dict[str, Any]] = []
+                tool_calls: list[dict[str, Any]] = []
+
+                for block in msg.content:
+                    if block.type == "text" and block.text:
+                        content_parts.append({"type": "text", "text": block.text})
+                    elif block.type == "image" and block.source:
+                        content_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": block.source.get("data", "")},
+                            }
+                        )
+                    elif block.type == "tool_use":
+                        # Convert tool use to function call format
+                        tool_call = {
+                            "id": block.id or f"call_{int(time.time())}",
+                            "type": "function",
+                            "function": {
+                                "name": block.name or "",
+                                "arguments": json.dumps(block.input or {}),
+                            },
+                        }
+                        tool_calls.append(tool_call)
+                    elif block.type == "tool_result":
+                        if msg.role == "user":
+                            openai_messages.append(
+                                {
+                                    "role": "tool",
+                                    "tool_call_id": block.id or "",
+                                    "content": str(block.content)
+                                    if block.content
+                                    else "",
+                                }
+                            )
+                        else:
+                            # Assistant tool result becomes regular text
+                            tool_result_text = (
+                                str(block.content) if block.content else ""
+                            )
+                            content_parts.append(
+                                {
+                                    "type": "text",
+                                    "text": f"Tool result: {tool_result_text}",
+                                }
+                            )
+
+                # Add tool calls to the message if any
+                if tool_calls:
+                    openai_msg["tool_calls"] = tool_calls  # type: ignore
+
+                # Add content parts if any
+                if content_parts:
+                    if len(content_parts) == 1 and content_parts[0]["type"] == "text":
+                        openai_msg["content"] = content_parts[0]["text"]
+                    else:
+                        openai_msg["content"] = content_parts  # type: ignore
+                elif not tool_calls:
+                    continue
+
+            openai_messages.append(openai_msg)
+
+        req = ChatCompletionRequest(
+            model=anthropic_request.model,
+            messages=openai_messages,
+            max_tokens=anthropic_request.max_tokens,
+            max_completion_tokens=anthropic_request.max_tokens,
+            stop=anthropic_request.stop_sequences,
+            temperature=anthropic_request.temperature,
+            top_p=anthropic_request.top_p,
+            top_k=anthropic_request.top_k,
+        )
+
+        if anthropic_request.stream:
+            req.stream = anthropic_request.stream
+            req.stream_options = StreamOptions.validate({"include_usage": True})
+
+        if anthropic_request.tool_choice is None:
+            req.tool_choice = None
+        elif anthropic_request.tool_choice.type == "auto":
+            req.tool_choice = "auto"
+        elif anthropic_request.tool_choice.type == "any":
+            req.tool_choice = "required"
+        elif anthropic_request.tool_choice.type == "tool":
+            req.tool_choice = ChatCompletionNamedToolChoiceParam.model_validate(
+                {
+                    "type": "function",
+                    "function": {"name": anthropic_request.tool_choice.name},
+                }
+            )
+
+        tools = []
+        if anthropic_request.tools is None:
+            return req
+        for tool in anthropic_request.tools:
+            tools.append(
+                ChatCompletionToolsParam.model_validate(
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": tool.name,
+                            "description": tool.description,
+                            "parameters": tool.input_schema,
+                        },
+                    }
+                )
+            )
+        if req.tool_choice is None:
+            req.tool_choice = "auto"
+        req.tools = tools
+        return req
+
+    async def create_messages(
+        self,
+        request: AnthropicMessagesRequest,
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | AnthropicMessagesResponse | ErrorResponse:
+        """
+        Messages API similar to Anthropic's API.
+
+        See https://docs.anthropic.com/en/api/messages
+        for the API specification. This API mimics the Anthropic messages API.
+        """
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("Received messages request %s", request.model_dump_json())
+        chat_req = self._convert_anthropic_to_openai_request(request)
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("Convert to OpenAI request %s", chat_req.model_dump_json())
+        generator = await self.create_chat_completion(chat_req, raw_request)
+
+        if isinstance(generator, ErrorResponse):
+            return generator
+
+        elif isinstance(generator, ChatCompletionResponse):
+            return self.messages_full_converter(generator)
+
+        return self.message_stream_converter(generator)
+
+    def messages_full_converter(
+        self,
+        generator: ChatCompletionResponse,
+    ) -> AnthropicMessagesResponse:
+        result = AnthropicMessagesResponse(
+            id=generator.id,
+            content=[],
+            model=generator.model,
+            usage=AnthropicUsage(
+                input_tokens=generator.usage.prompt_tokens,
+                output_tokens=generator.usage.completion_tokens,
+            ),
+        )
+        if generator.choices[0].finish_reason == "stop":
+            result.stop_reason = "end_turn"
+        elif generator.choices[0].finish_reason == "length":
+            result.stop_reason = "max_tokens"
+        elif generator.choices[0].finish_reason == "tool_calls":
+            result.stop_reason = "tool_use"
+
+        content: list[AnthropicContentBlock] = [
+            AnthropicContentBlock(
+                type="text",
+                text=generator.choices[0].message.content
+                if generator.choices[0].message.content
+                else "",
+            )
+        ]
+
+        for tool_call in generator.choices[0].message.tool_calls:
+            anthropic_tool_call = AnthropicContentBlock(
+                type="tool_use",
+                id=tool_call.id,
+                name=tool_call.function.name,
+                input=json.loads(tool_call.function.arguments),
+            )
+            content += [anthropic_tool_call]
+
+        result.content = content
+
+        return result
+
+    async def message_stream_converter(
+        self,
+        generator: AsyncGenerator[str, None],
+    ) -> AsyncGenerator[str, None]:
+        try:
+            first_item = True
+            finish_reason = None
+            content_block_index = 0
+            content_block_started = False
+
+            async for item in generator:
+                if item.startswith("data:"):
+                    data_str = item[5:].strip().rstrip("\n")
+                    if data_str == "[DONE]":
+                        stop_message = AnthropicStreamEvent(
+                            type="message_stop",
+                        )
+                        data = stop_message.model_dump_json(
+                            exclude_unset=True, exclude_none=True
+                        )
+                        yield wrap_data_with_event(data, "message_stop")
+                        yield "data: [DONE]\n\n"
+                    else:
+                        origin_chunk = ChatCompletionStreamResponse.model_validate_json(
+                            data_str
+                        )
+
+                        if first_item:
+                            chunk = AnthropicStreamEvent(
+                                type="message_start",
+                                message=AnthropicMessagesResponse(
+                                    id=origin_chunk.id,
+                                    content=[],
+                                    model=origin_chunk.model,
+                                ),
+                            )
+                            first_item = False
+                            data = chunk.model_dump_json(exclude_unset=True)
+                            yield wrap_data_with_event(data, "message_start")
+                            continue
+
+                        # last chunk including usage info
+                        if len(origin_chunk.choices) == 0:
+                            if content_block_started:
+                                stop_chunk = AnthropicStreamEvent(
+                                    index=content_block_index,
+                                    type="content_block_stop",
+                                )
+                                data = stop_chunk.model_dump_json(exclude_unset=True)
+                                yield wrap_data_with_event(data, "content_block_stop")
+                            stop_reason = self.stop_reason_map.get(
+                                finish_reason or "stop"
+                            )
+                            chunk = AnthropicStreamEvent(
+                                type="message_delta",
+                                delta=AnthropicDelta(stop_reason=stop_reason),
+                                usage=AnthropicUsage(
+                                    input_tokens=origin_chunk.usage.prompt_tokens
+                                    if origin_chunk.usage
+                                    else 0,
+                                    output_tokens=origin_chunk.usage.completion_tokens
+                                    if origin_chunk.usage
+                                    else 0,
+                                ),
+                            )
+                            data = chunk.model_dump_json(exclude_unset=True)
+                            yield wrap_data_with_event(data, "message_delta")
+                            continue
+
+                        if origin_chunk.choices[0].finish_reason is not None:
+                            finish_reason = origin_chunk.choices[0].finish_reason
+                            continue
+
+                        # content
+                        if origin_chunk.choices[0].delta.content is not None:
+                            if not content_block_started:
+                                chunk = AnthropicStreamEvent(
+                                    index=content_block_index,
+                                    type="content_block_start",
+                                    content_block=AnthropicContentBlock(
+                                        type="text", text=""
+                                    ),
+                                )
+                                data = chunk.model_dump_json(exclude_unset=True)
+                                yield wrap_data_with_event(data, "content_block_start")
+                                content_block_started = True
+
+                            if origin_chunk.choices[0].delta.content == "":
+                                continue
+                            chunk = AnthropicStreamEvent(
+                                index=content_block_index,
+                                type="content_block_delta",
+                                delta=AnthropicDelta(
+                                    type="text_delta",
+                                    text=origin_chunk.choices[0].delta.content,
+                                ),
+                            )
+                            data = chunk.model_dump_json(exclude_unset=True)
+                            yield wrap_data_with_event(data, "content_block_delta")
+                            continue
+
+                        # tool calls
+                        elif len(origin_chunk.choices[0].delta.tool_calls) > 0:
+                            tool_call = origin_chunk.choices[0].delta.tool_calls[0]
+                            if tool_call.id is not None:
+                                if content_block_started:
+                                    stop_chunk = AnthropicStreamEvent(
+                                        index=content_block_index,
+                                        type="content_block_stop",
+                                    )
+                                    data = stop_chunk.model_dump_json(
+                                        exclude_unset=True
+                                    )
+                                    yield wrap_data_with_event(
+                                        data, "content_block_stop"
+                                    )
+                                    content_block_started = False
+                                    content_block_index += 1
+
+                                chunk = AnthropicStreamEvent(
+                                    index=content_block_index,
+                                    type="content_block_start",
+                                    content_block=AnthropicContentBlock(
+                                        type="tool_use",
+                                        id=tool_call.id,
+                                        name=tool_call.function.name
+                                        if tool_call.function
+                                        else None,
+                                        input={},
+                                    ),
+                                )
+                                data = chunk.model_dump_json(exclude_unset=True)
+                                yield wrap_data_with_event(data, "content_block_start")
+                                content_block_started = True
+
+                            else:
+                                chunk = AnthropicStreamEvent(
+                                    index=content_block_index,
+                                    type="content_block_delta",
+                                    delta=AnthropicDelta(
+                                        type="input_json_delta",
+                                        partial_json=tool_call.function.arguments
+                                        if tool_call.function
+                                        else None,
+                                    ),
+                                )
+                                data = chunk.model_dump_json(exclude_unset=True)
+                                yield wrap_data_with_event(data, "content_block_delta")
+                            continue
+                else:
+                    error_response = AnthropicStreamEvent(
+                        type="error",
+                        error=AnthropicError(
+                            type="internal_error",
+                            message="Invalid data format received",
+                        ),
+                    )
+                    data = error_response.model_dump_json(exclude_unset=True)
+                    yield wrap_data_with_event(data, "error")
+                    yield "data: [DONE]\n\n"
+
+        except Exception as e:
+            logger.exception("Error in message stream converter.")
+            error_response = AnthropicStreamEvent(
+                type="error",
+                error=AnthropicError(type="internal_error", message=str(e)),
+            )
+            data = error_response.model_dump_json(exclude_unset=True)
+            yield wrap_data_with_event(data, "error")
+            yield "data: [DONE]\n\n"
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 3d1e5dc14d2f3d5c00a6b1b9c849dfc8a5e25ceb..154cdeb42a3ea03ff5a637ebfa24f8a57560a766 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -7,12 +7,13 @@ For production use, we recommend using our OpenAI compatible server.
 We are also not going to accept PRs modifying this file, please
 change `vllm/entrypoints/openai/api_server.py` instead.
 """
+
 import asyncio
 import json
 import ssl
 from argparse import Namespace
 from collections.abc import AsyncGenerator
-from typing import Any, Optional
+from typing import Any
 
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -25,7 +26,9 @@ from vllm.entrypoints.utils import with_cancellation
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit
+from vllm.utils import random_uuid
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.system_utils import set_ulimit
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger("vllm.entrypoints.api_server")
@@ -68,9 +71,7 @@ async def _generate(request_dict: dict, raw_request: Request) -> Response:
         async for request_output in results_generator:
             prompt = request_output.prompt
             assert prompt is not None
-            text_outputs = [
-                prompt + output.text for output in request_output.outputs
-            ]
+            text_outputs = [prompt + output.text for output in request_output.outputs]
             ret = {"text": text_outputs}
             yield (json.dumps(ret) + "\n").encode("utf-8")
 
@@ -102,23 +103,27 @@ def build_app(args: Namespace) -> FastAPI:
 
 async def init_app(
     args: Namespace,
-    llm_engine: Optional[AsyncLLMEngine] = None,
+    llm_engine: AsyncLLMEngine | None = None,
 ) -> FastAPI:
     app = build_app(args)
 
     global engine
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = (llm_engine
-              if llm_engine is not None else AsyncLLMEngine.from_engine_args(
-                  engine_args, usage_context=UsageContext.API_SERVER))
+    engine = (
+        llm_engine
+        if llm_engine is not None
+        else AsyncLLMEngine.from_engine_args(
+            engine_args, usage_context=UsageContext.API_SERVER
+        )
+    )
     app.state.engine_client = engine
     return app
 
 
-async def run_server(args: Namespace,
-                     llm_engine: Optional[AsyncLLMEngine] = None,
-                     **uvicorn_kwargs: Any) -> None:
+async def run_server(
+    args: Namespace, llm_engine: AsyncLLMEngine | None = None, **uvicorn_kwargs: Any
+) -> None:
     logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
@@ -151,26 +156,27 @@ if __name__ == "__main__":
     parser.add_argument("--port", type=parser.check_port, default=8000)
     parser.add_argument("--ssl-keyfile", type=str, default=None)
     parser.add_argument("--ssl-certfile", type=str, default=None)
-    parser.add_argument("--ssl-ca-certs",
-                        type=str,
-                        default=None,
-                        help="The CA certificates file")
+    parser.add_argument(
+        "--ssl-ca-certs", type=str, default=None, help="The CA certificates file"
+    )
     parser.add_argument(
         "--enable-ssl-refresh",
         action="store_true",
         default=False,
-        help="Refresh SSL Context when SSL certificate files change")
+        help="Refresh SSL Context when SSL certificate files change",
+    )
     parser.add_argument(
         "--ssl-cert-reqs",
         type=int,
         default=int(ssl.CERT_NONE),
-        help="Whether client certificate is required (see stdlib ssl module's)"
+        help="Whether client certificate is required (see stdlib ssl module's)",
     )
     parser.add_argument(
         "--root-path",
         type=str,
         default=None,
-        help="FastAPI root_path when app is behind a path based routing proxy")
+        help="FastAPI root_path when app is behind a path based routing proxy",
+    )
     parser.add_argument("--log-level", type=str, default="debug")
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 6b0ed23277d36b6d3a3ae67fb5b89c843308cb8e..4c704e4e2f0149a79583b11a9aad4808b9aadaea 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -2,14 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import inspect
 import json
 from abc import ABC, abstractmethod
 from collections import Counter, defaultdict, deque
-from collections.abc import Awaitable, Iterable
+from collections.abc import Awaitable, Callable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
-from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
-                    cast)
+from typing import Any, Generic, Literal, TypeAlias, TypeVar, cast
 
 import jinja2
 import jinja2.ext
@@ -18,44 +18,42 @@ import jinja2.nodes
 import jinja2.parser
 import jinja2.sandbox
 import transformers.utils.chat_template_utils as hf_chat_utils
-# yapf conflicts with isort for this block
-# yapf: disable
-from openai.types.chat import (ChatCompletionAssistantMessageParam,
-                               ChatCompletionContentPartImageParam,
-                               ChatCompletionContentPartInputAudioParam)
 from openai.types.chat import (
-    ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam)
-from openai.types.chat import (ChatCompletionContentPartRefusalParam,
-                               ChatCompletionContentPartTextParam)
+    ChatCompletionAssistantMessageParam,
+    ChatCompletionContentPartImageParam,
+    ChatCompletionContentPartInputAudioParam,
+    ChatCompletionContentPartRefusalParam,
+    ChatCompletionContentPartTextParam,
+    ChatCompletionMessageToolCallParam,
+    ChatCompletionToolMessageParam,
+)
 from openai.types.chat import (
-    ChatCompletionMessageParam as OpenAIChatCompletionMessageParam)
-from openai.types.chat import (ChatCompletionMessageToolCallParam,
-                               ChatCompletionToolMessageParam)
-from openai.types.chat.chat_completion_content_part_input_audio_param import (
-    InputAudio)
+    ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam,
+)
+from openai.types.chat import (
+    ChatCompletionMessageParam as OpenAIChatCompletionMessageParam,
+)
+from openai.types.chat.chat_completion_content_part_input_audio_param import InputAudio
 from openai.types.responses import ResponseInputImageParam
 from openai_harmony import Message as OpenAIHarmonyMessage
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
-# yapf: enable
-from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
-                          ProcessorMixin)
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
+
 # pydantic needs the TypedDict from typing_extensions
-from typing_extensions import Required, TypeAlias, TypedDict
+from typing_extensions import Required, TypedDict
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
-                             MultiModalUUIDDict)
-from vllm.multimodal.utils import MediaConnector
-# yapf: disable
-from vllm.transformers_utils.chat_templates import (
-    get_chat_template_fallback_path)
-# yapf: enable
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
+from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
+from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import random_uuid, supports_kw
+from vllm.utils import random_uuid
+from vllm.utils.func_utils import supports_kw
 
 logger = init_logger(__name__)
 
@@ -81,7 +79,7 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
 
 
 class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
-    image_embeds: Optional[Union[str, dict[str, str]]]
+    image_embeds: str | dict[str, str] | None
     """
     The image embeddings. It can be either:
     - A single base64 string.
@@ -89,7 +87,7 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
     """
     type: Required[Literal["image_embeds"]]
     """The type of the content part."""
-    uuid: Optional[str]
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -128,8 +126,8 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
     }
     """
 
-    image_pil: Optional[PILImage]
-    uuid: Optional[str]
+    image_pil: PILImage | None
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -146,8 +144,8 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     }
     """
 
-    image_url: Optional[str]
-    uuid: Optional[str]
+    image_url: str | None
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -163,7 +161,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     }
     """
 
-    audio_url: Optional[str]
+    audio_url: str | None
 
 
 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
@@ -175,8 +173,8 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     }
     """
 
-    video_url: Optional[str]
-    uuid: Optional[str]
+    video_url: str | None
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -204,20 +202,20 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     """The thinking type."""
 
 
-ChatCompletionContentPartParam: TypeAlias = Union[
-    OpenAIChatCompletionContentPartParam,
-    ChatCompletionContentPartAudioParam,
-    ChatCompletionContentPartInputAudioParam,
-    ChatCompletionContentPartVideoParam,
-    ChatCompletionContentPartRefusalParam,
-    CustomChatCompletionContentPILImageParam,
-    CustomChatCompletionContentSimpleImageParam,
-    ChatCompletionContentPartImageEmbedsParam,
-    CustomChatCompletionContentSimpleAudioParam,
-    CustomChatCompletionContentSimpleVideoParam,
-    str,
-    CustomThinkCompletionContentParam,
-]
+ChatCompletionContentPartParam: TypeAlias = (
+    OpenAIChatCompletionContentPartParam
+    | ChatCompletionContentPartAudioParam
+    | ChatCompletionContentPartInputAudioParam
+    | ChatCompletionContentPartVideoParam
+    | ChatCompletionContentPartRefusalParam
+    | CustomChatCompletionContentPILImageParam
+    | CustomChatCompletionContentSimpleImageParam
+    | ChatCompletionContentPartImageEmbedsParam
+    | CustomChatCompletionContentSimpleAudioParam
+    | CustomChatCompletionContentSimpleVideoParam
+    | str
+    | CustomThinkCompletionContentParam
+)
 
 
 class CustomChatCompletionMessageParam(TypedDict, total=False):
@@ -226,7 +224,7 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
     role: Required[str]
     """The role of the message's author."""
 
-    content: Union[str, list[ChatCompletionContentPartParam]]
+    content: str | list[ChatCompletionContentPartParam]
     """The contents of the message."""
 
     name: str
@@ -236,18 +234,21 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
     same role.
     """
 
-    tool_call_id: Optional[str]
+    tool_call_id: str | None
     """Tool call that this message is responding to."""
 
-    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
+    tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
     """The tool calls generated by the model, such as function calls."""
 
+    reasoning: str | None
+    """The reasoning content for interleaved thinking."""
+
 
-ChatCompletionMessageParam = Union[
-    OpenAIChatCompletionMessageParam,
-    CustomChatCompletionMessageParam,
-    OpenAIHarmonyMessage,
-]
+ChatCompletionMessageParam: TypeAlias = (
+    OpenAIChatCompletionMessageParam
+    | CustomChatCompletionMessageParam
+    | OpenAIHarmonyMessage
+)
 
 
 # TODO: Make fields ReadOnly once mypy supports it
@@ -255,18 +256,24 @@ class ConversationMessage(TypedDict, total=False):
     role: Required[str]
     """The role of the message's author."""
 
-    content: Union[Optional[str], list[dict[str, str]]]
+    content: str | None | list[dict[str, str]]
     """The contents of the message"""
 
-    tool_call_id: Optional[str]
+    tool_call_id: str | None
     """Tool call that this message is responding to."""
 
-    name: Optional[str]
+    name: str | None
     """The name of the function to call"""
 
-    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
+    tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
     """The tool calls generated by the model, such as function calls."""
 
+    reasoning: str | None
+    """The reasoning content for interleaved thinking."""
+
+    reasoning_content: str | None
+    """Deprecated: The reasoning content for interleaved thinking."""
+
 
 # Passed in by user
 ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
@@ -284,9 +291,11 @@ def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
 
 def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
     if isinstance(node, jinja2.nodes.Getitem):
-        return (_is_var_access(node.node, varname)
-                and isinstance(node.arg, jinja2.nodes.Const)
-                and node.arg.value == key)
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
 
     if isinstance(node, jinja2.nodes.Getattr):
         return _is_var_access(node.node, varname) and node.attr == key
@@ -297,23 +306,21 @@ def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
 def _is_var_or_elems_access(
     node: jinja2.nodes.Node,
     varname: str,
-    key: Optional[str] = None,
+    key: str | None = None,
 ) -> bool:
     if isinstance(node, jinja2.nodes.Filter):
         return node.node is not None and _is_var_or_elems_access(
-            node.node, varname, key)
+            node.node, varname, key
+        )
     if isinstance(node, jinja2.nodes.Test):
         return _is_var_or_elems_access(node.node, varname, key)
 
     if isinstance(node, jinja2.nodes.Getitem) and isinstance(
-            node.arg, jinja2.nodes.Slice):
+        node.arg, jinja2.nodes.Slice
+    ):
         return _is_var_or_elems_access(node.node, varname, key)
 
-    # yapf: disable
-    return (
-        _is_attr_access(node, varname, key) if key
-        else _is_var_access(node, varname)
-    ) # yapf: enable
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
 
 
 def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
@@ -342,8 +349,7 @@ def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
 # the scope in which each variable is defined, but that is too complicated
 def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node):
     messages_varnames = [
-        varname
-        for _, varname in _iter_nodes_assign_var_or_elems(root, "messages")
+        varname for _, varname in _iter_nodes_assign_var_or_elems(root, "messages")
     ]
 
     # Search for {%- for message in messages -%} loops
@@ -375,7 +381,7 @@ def _iter_nodes_assign_content_item(root: jinja2.nodes.Node):
                 break
 
 
-def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]:
+def _try_extract_ast(chat_template: str) -> jinja2.nodes.Template | None:
     try:
         jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
         return jinja_compiled.environment.parse(chat_template)
@@ -406,27 +412,19 @@ def _detect_content_format(
 
 
 def resolve_mistral_chat_template(
-    chat_template: Optional[str],
+    chat_template: str | None,
     **kwargs: Any,
-) -> Optional[str]:
-    if chat_template is not None:
-        logger.warning_once(
-            "'chat_template' cannot be overridden for mistral tokenizer."
-        )
-    if "add_generation_prompt" in kwargs:
-        logger.warning_once(
-            "'add_generation_prompt' is not supported for mistral tokenizer, "
-            "so it will be ignored."
-        )
-    if "continue_final_message" in kwargs:
-        logger.warning_once(
-            "'continue_final_message' is not supported for mistral tokenizer, "
-            "so it will be ignored."
+) -> str | None:
+    if chat_template is not None or kwargs.get("chat_template_kwargs") is not None:
+        raise ValueError(
+            "'chat_template' or 'chat_template_kwargs' cannot be overridden "
+            "for mistral tokenizer."
         )
+
     return None
 
 
-_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]()
+_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], str | None]()
 """
 Used in `_try_get_processor_chat_template` to avoid calling
 `cached_get_processor` again if the processor fails to be loaded.
@@ -436,9 +434,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
 
 
 def _try_get_processor_chat_template(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     model_config: ModelConfig,
-) -> Optional[str]:
+) -> str | None:
     cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
     if cache_key in _PROCESSOR_CHAT_TEMPLATES:
         return _PROCESSOR_CHAT_TEMPLATES[cache_key]
@@ -472,20 +470,19 @@ def _try_get_processor_chat_template(
 
 
 def resolve_hf_chat_template(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     *,
     model_config: ModelConfig,
-) -> Optional[str]:
+) -> str | None:
     # 1st priority: The given chat template
     if chat_template is not None:
         return chat_template
 
     # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
     if tools is None:
-        chat_template = _try_get_processor_chat_template(tokenizer,
-                                                         model_config)
+        chat_template = _try_get_processor_chat_template(tokenizer, model_config)
         if chat_template is not None:
             return chat_template
 
@@ -505,14 +502,14 @@ def resolve_hf_chat_template(
         tokenizer_name_or_path=model_config.tokenizer,
     )
     if path is not None:
-        logger.info(
+        logger.info_once(
             "Loading chat template fallback for %s as there isn't one "
             "defined on HF Hub.",
             tokenizer.name_or_path,
         )
         chat_template = load_chat_template(path)
     else:
-        logger.debug(
+        logger.debug_once(
             "There is no chat template fallback for %s", tokenizer.name_or_path
         )
 
@@ -520,8 +517,8 @@ def resolve_hf_chat_template(
 
 
 def _resolve_chat_template_content_format(
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     tokenizer: AnyTokenizer,
     *,
     model_config: ModelConfig,
@@ -553,7 +550,7 @@ def _resolve_chat_template_content_format(
 
 @lru_cache
 def _log_chat_template_content_format(
-    chat_template: Optional[str],
+    chat_template: str | None,
     given_format: ChatTemplateContentFormatOption,
     detected_format: ChatTemplateContentFormatOption,
 ):
@@ -576,8 +573,8 @@ def _log_chat_template_content_format(
 
 
 def resolve_chat_template_content_format(
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     given_format: ChatTemplateContentFormatOption,
     tokenizer: AnyTokenizer,
     *,
@@ -619,8 +616,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         self._model_config = model_config
         self._tokenizer = tokenizer
 
-        self._items_by_modality = defaultdict[str, list[Optional[_T]]](list)
-        self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
+        self._items_by_modality = defaultdict[str, list[_T | None]](list)
+        self._uuids_by_modality = defaultdict[str, list[str | None]](list)
 
     @property
     def model_config(self) -> ModelConfig:
@@ -652,9 +649,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     def add(
         self,
         modality: ModalityStr,
-        item: Optional[_T],
-        uuid: Optional[str] = None,
-    ) -> Optional[str]:
+        item: _T | None,
+        uuid: str | None = None,
+    ) -> str | None:
         """
         Add a multi-modal item to the current prompt and returns the
         placeholder string to use, if any.
@@ -672,22 +669,18 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
         return self.model_cls.get_placeholder_str(modality, num_items)
 
-    def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]:
+    def all_mm_uuids(self) -> MultiModalUUIDDict | None:
         if not self._items_by_modality:
             return None
         mm_uuids = {}
         uuids_by_modality = dict(self._uuids_by_modality)
         if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality:
-            raise ValueError(
-                "Mixing raw image and embedding inputs is not allowed"
-            )
+            raise ValueError("Mixing raw image and embedding inputs is not allowed")
 
         if "image_embeds" in uuids_by_modality:
             image_embeds_uuids = uuids_by_modality["image_embeds"]
             if len(image_embeds_uuids) > 1:
-                raise ValueError(
-                    "Only one message can have {'type': 'image_embeds'}"
-                )
+                raise ValueError("Only one message can have {'type': 'image_embeds'}")
             mm_uuids["image"] = uuids_by_modality["image_embeds"]
         if "image" in uuids_by_modality:
             mm_uuids["image"] = uuids_by_modality["image"]  # UUIDs of images
@@ -703,22 +696,18 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
 
 class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
-    def all_mm_data(self) -> Optional[MultiModalDataDict]:
+    def all_mm_data(self) -> MultiModalDataDict | None:
         if not self._items_by_modality:
             return None
         mm_inputs = {}
         items_by_modality = dict(self._items_by_modality)
         if "image" in items_by_modality and "image_embeds" in items_by_modality:
-            raise ValueError(
-                "Mixing raw image and embedding inputs is not allowed"
-            )
+            raise ValueError("Mixing raw image and embedding inputs is not allowed")
 
         if "image_embeds" in items_by_modality:
             image_embeds_lst = items_by_modality["image_embeds"]
             if len(image_embeds_lst) > 1:
-                raise ValueError(
-                    "Only one message can have {'type': 'image_embeds'}"
-                )
+                raise ValueError("Only one message can have {'type': 'image_embeds'}")
             mm_inputs["image"] = image_embeds_lst[0]
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
@@ -733,7 +722,7 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
 
 
 class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
-    async def all_mm_data(self) -> Optional[MultiModalDataDict]:
+    async def all_mm_data(self) -> MultiModalDataDict | None:
         if not self._items_by_modality:
             return None
         mm_inputs = {}
@@ -748,16 +737,12 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
             items_by_modality[modality] = await asyncio.gather(*coros)
 
         if "image" in items_by_modality and "image_embeds" in items_by_modality:
-            raise ValueError(
-                "Mixing raw image and embedding inputs is not allowed"
-            )
+            raise ValueError("Mixing raw image and embedding inputs is not allowed")
 
         if "image_embeds" in items_by_modality:
             image_embeds_lst = items_by_modality["image_embeds"]
             if len(image_embeds_lst) > 1:
-                raise ValueError(
-                    "Only one message can have {'type': 'image_embeds'}"
-                )
+                raise ValueError("Only one message can have {'type': 'image_embeds'}")
             mm_inputs["image"] = image_embeds_lst[0]
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
@@ -783,9 +768,7 @@ class BaseMultiModalContentParser(ABC):
         # }
         self._placeholder_storage: dict[str, list] = defaultdict(list)
 
-    def _add_placeholder(
-        self, modality: ModalityStr, placeholder: Optional[str]
-    ):
+    def _add_placeholder(self, modality: ModalityStr, placeholder: str | None):
         mod_placeholder = MODALITY_PLACEHOLDERS_MAP[modality]
         if placeholder:
             self._placeholder_storage[mod_placeholder].append(placeholder)
@@ -794,40 +777,35 @@ class BaseMultiModalContentParser(ABC):
         return dict(self._placeholder_storage)
 
     @abstractmethod
-    def parse_image(
-        self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
 
     @abstractmethod
     def parse_image_embeds(
         self,
-        image_embeds: Union[str, dict[str, str], None],
-        uuid: Optional[str] = None,
+        image_embeds: str | dict[str, str] | None,
+        uuid: str | None = None,
     ) -> None:
         raise NotImplementedError
 
     @abstractmethod
     def parse_image_pil(
-        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
+        self, image_pil: Image.Image | None, uuid: str | None = None
     ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_audio(
-        self, audio_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
+    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
 
     @abstractmethod
     def parse_input_audio(
-        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
+        self, input_audio: InputAudio | None, uuid: str | None = None
     ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_video(
-        self, video_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
+    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
 
 
@@ -838,15 +816,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         self._tracker = tracker
         multimodal_config = self._tracker.model_config.multimodal_config
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-        self._connector = MediaConnector(
+
+        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
+            envs.VLLM_MEDIA_CONNECTOR,
             media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
-    def parse_image(
-        self, image_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
+    @property
+    def model_config(self) -> ModelConfig:
+        return self._tracker.model_config
+
+    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image = self._connector.fetch_image(image_url) if image_url else None
 
         placeholder = self._tracker.add("image", image, uuid)
@@ -854,9 +836,15 @@ class MultiModalContentParser(BaseMultiModalContentParser):
 
     def parse_image_embeds(
         self,
-        image_embeds: Union[str, dict[str, str], None],
-        uuid: Optional[str] = None,
+        image_embeds: str | dict[str, str] | None,
+        uuid: str | None = None,
     ) -> None:
+        mm_config = self.model_config.get_multimodal_config()
+        if not mm_config.enable_mm_embeds:
+            raise ValueError(
+                "You must set `--enable-mm-embeds` to input `image_embeds`"
+            )
+
         if isinstance(image_embeds, dict):
             embeds = {
                 k: self._connector.fetch_image_embedding(v)
@@ -874,21 +862,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder("image", placeholder)
 
     def parse_image_pil(
-        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
+        self, image_pil: Image.Image | None, uuid: str | None = None
     ) -> None:
         placeholder = self._tracker.add("image", image_pil, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_audio(
-        self, audio_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
+    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
         audio = self._connector.fetch_audio(audio_url) if audio_url else None
 
         placeholder = self._tracker.add("audio", audio, uuid)
         self._add_placeholder("audio", placeholder)
 
     def parse_input_audio(
-        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
+        self, input_audio: InputAudio | None, uuid: str | None = None
     ) -> None:
         if input_audio:
             audio_data = input_audio.get("data", "")
@@ -903,14 +889,8 @@ class MultiModalContentParser(BaseMultiModalContentParser):
 
         return self.parse_audio(audio_url, uuid)
 
-    def parse_video(
-        self, video_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
-        video = (
-            self._connector.fetch_video(video_url=video_url)
-            if video_url
-            else None
-        )
+    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
+        video = self._connector.fetch_video(video_url=video_url) if video_url else None
 
         placeholder = self._tracker.add("video", video, uuid)
         self._add_placeholder("video", placeholder)
@@ -923,30 +903,35 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._tracker = tracker
         multimodal_config = self._tracker.model_config.multimodal_config
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-        self._connector = MediaConnector(
+        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
+            envs.VLLM_MEDIA_CONNECTOR,
             media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
-    def parse_image(
-        self, image_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
-        image_coro = (
-            self._connector.fetch_image_async(image_url) if image_url else None
-        )
+    @property
+    def model_config(self) -> ModelConfig:
+        return self._tracker.model_config
+
+    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
+        image_coro = self._connector.fetch_image_async(image_url) if image_url else None
 
         placeholder = self._tracker.add("image", image_coro, uuid)
         self._add_placeholder("image", placeholder)
 
     def parse_image_embeds(
         self,
-        image_embeds: Union[str, dict[str, str], None],
-        uuid: Optional[str] = None,
+        image_embeds: str | dict[str, str] | None,
+        uuid: str | None = None,
     ) -> None:
-        future: asyncio.Future[Union[str, dict[str, str], None]] = (
-            asyncio.Future()
-        )
+        mm_config = self.model_config.get_multimodal_config()
+        if not mm_config.enable_mm_embeds:
+            raise ValueError(
+                "You must set `--enable-mm-embeds` to input `image_embeds`"
+            )
+
+        future: asyncio.Future[str | dict[str, str] | None] = asyncio.Future()
 
         if isinstance(image_embeds, dict):
             embeds = {
@@ -966,9 +951,9 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder("image", placeholder)
 
     def parse_image_pil(
-        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
+        self, image_pil: Image.Image | None, uuid: str | None = None
     ) -> None:
-        future: asyncio.Future[Optional[Image.Image]] = asyncio.Future()
+        future: asyncio.Future[Image.Image | None] = asyncio.Future()
         if image_pil:
             future.set_result(image_pil)
         else:
@@ -977,18 +962,14 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         placeholder = self._tracker.add("image", future, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_audio(
-        self, audio_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
-        audio_coro = (
-            self._connector.fetch_audio_async(audio_url) if audio_url else None
-        )
+    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
+        audio_coro = self._connector.fetch_audio_async(audio_url) if audio_url else None
 
         placeholder = self._tracker.add("audio", audio_coro, uuid)
         self._add_placeholder("audio", placeholder)
 
     def parse_input_audio(
-        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
+        self, input_audio: InputAudio | None, uuid: str | None = None
     ) -> None:
         if input_audio:
             audio_data = input_audio.get("data", "")
@@ -1003,9 +984,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
 
         return self.parse_audio(audio_url, uuid)
 
-    def parse_video(
-        self, video_url: Optional[str], uuid: Optional[str] = None
-    ) -> None:
+    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         video = (
             self._connector.fetch_video_async(video_url=video_url)
             if video_url
@@ -1016,7 +995,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder("video", placeholder)
 
 
-def validate_chat_template(chat_template: Optional[Union[Path, str]]):
+def validate_chat_template(chat_template: Path | str | None):
     """Raises if the provided chat template appears invalid."""
     if chat_template is None:
         return
@@ -1036,16 +1015,14 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]):
             )
 
     else:
-        raise TypeError(
-            f"{type(chat_template)} is not a valid chat template type"
-        )
+        raise TypeError(f"{type(chat_template)} is not a valid chat template type")
 
 
 def _load_chat_template(
-    chat_template: Optional[Union[Path, str]],
+    chat_template: Path | str | None,
     *,
     is_literal: bool = False,
-) -> Optional[str]:
+) -> str | None:
     if chat_template is None:
         return None
 
@@ -1082,10 +1059,10 @@ _cached_load_chat_template = lru_cache(_load_chat_template)
 
 
 def load_chat_template(
-    chat_template: Optional[Union[Path, str]],
+    chat_template: Path | str | None,
     *,
     is_literal: bool = False,
-) -> Optional[str]:
+) -> str | None:
     return _cached_load_chat_template(chat_template, is_literal=is_literal)
 
 
@@ -1145,9 +1122,7 @@ def _get_full_multimodal_text_prompt(
                 "actual multimodal data items."
             )
 
-        missing_placeholders.extend(
-            [placeholder] * placeholder_counts[placeholder]
-        )
+        missing_placeholders.extend([placeholder] * placeholder_counts[placeholder])
 
     # NOTE: Default behaviour: we always add missing placeholders
     # at the front of the prompt, if interleave_strings=False
@@ -1166,10 +1141,8 @@ _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
-_ResponsesInputImageParser = TypeAdapter(
-    ResponseInputImageParam
-).validate_python
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
+_ResponsesInputImageParser = TypeAdapter(ResponseInputImageParam).validate_python
+_ContentPart: TypeAlias = str | dict[str, str] | InputAudio | PILImage
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
@@ -1179,26 +1152,14 @@ MM_PARSER_MAP: dict[
     "text": lambda part: _TextParser(part).get("text", None),
     "thinking": lambda part: _ThinkParser(part).get("thinking", None),
     "input_text": lambda part: _TextParser(part).get("text", None),
-    "input_image": lambda part: _ResponsesInputImageParser(part).get(
-        "image_url", None
-    ),
-    "image_url": lambda part: _ImageParser(part)
-    .get("image_url", {})
-    .get("url", None),
-    "image_embeds": lambda part: _ImageEmbedsParser(part).get(
-        "image_embeds", None
-    ),
+    "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None),
+    "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
+    "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
     "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
-    "audio_url": lambda part: _AudioParser(part)
-    .get("audio_url", {})
-    .get("url", None),
-    "input_audio": lambda part: _InputAudioParser(part).get(
-        "input_audio", None
-    ),
+    "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
+    "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None),
     "refusal": lambda part: _RefusalParser(part).get("refusal", None),
-    "video_url": lambda part: _VideoParser(part)
-    .get("video_url", {})
-    .get("url", None),
+    "video_url": lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
 }
 
 
@@ -1225,15 +1186,14 @@ def _parse_chat_message_content_mm_part(
     part_type = part.get("type", None)
     uuid = part.get("uuid", None)
 
-    if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501
+    if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None:  # noqa: E501
         content = MM_PARSER_MAP[part_type](part)
 
         # Special case for 'image_url.detail'
         # We only support 'auto', which is the default
         if part_type == "image_url" and part.get("detail", "auto") != "auto":
             logger.warning(
-                "'image_url.detail' is currently not supported "
-                "and will be ignored."
+                "'image_url.detail' is currently not supported and will be ignored."
             )
 
         return part_type, content
@@ -1242,9 +1202,7 @@ def _parse_chat_message_content_mm_part(
     # 'type' is required field by pydantic
     if part_type is None or uuid is not None:
         if "image_url" in part:
-            image_params = cast(
-                CustomChatCompletionContentSimpleImageParam, part
-            )
+            image_params = cast(CustomChatCompletionContentSimpleImageParam, part)
             image_url = image_params.get("image_url", None)
             if isinstance(image_url, dict):
                 # Can potentially happen if user provides a uuid
@@ -1253,22 +1211,20 @@ def _parse_chat_message_content_mm_part(
             return "image_url", image_url
         if "image_pil" in part:
             # "image_pil" could be None if UUID is provided.
-            image_params = cast( # type: ignore 
+            image_params = cast(  # type: ignore
                 CustomChatCompletionContentPILImageParam, part
             )
             image_pil = image_params.get("image_pil", None)
             return "image_pil", image_pil
         if "image_embeds" in part:
             # "image_embeds" could be None if UUID is provided.
-            image_params = cast( # type: ignore 
+            image_params = cast(  # type: ignore
                 ChatCompletionContentPartImageEmbedsParam, part
             )
             image_embeds = image_params.get("image_embeds", None)
             return "image_embeds", image_embeds
         if "audio_url" in part:
-            audio_params = cast(
-                CustomChatCompletionContentSimpleAudioParam, part
-            )
+            audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part)
             audio_url = audio_params.get("audio_url", None)
             if isinstance(audio_url, dict):
                 # Can potentially happen if user provides a uuid
@@ -1279,9 +1235,7 @@ def _parse_chat_message_content_mm_part(
             input_audio_params = cast(dict[str, str], part)
             return "input_audio", input_audio_params
         if "video_url" in part:
-            video_params = cast(
-                CustomChatCompletionContentSimpleVideoParam, part
-            )
+            video_params = cast(CustomChatCompletionContentSimpleVideoParam, part)
             video_url = video_params.get("video_url", None)
             if isinstance(video_url, dict):
                 # Can potentially happen if user provides a uuid
@@ -1345,7 +1299,7 @@ def _parse_chat_message_content_part(
     *,
     wrap_dicts: bool,
     interleave_strings: bool,
-) -> Optional[_ContentPart]:
+) -> _ContentPart | None:
     """Parses a single part of a conversation. If wrap_dicts is True,
     structured dictionary pieces for texts and images will be
     wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
@@ -1383,10 +1337,7 @@ def _parse_chat_message_content_part(
 
     modality = None
     if part_type == "image_pil":
-        if content is not None:
-            image_content = cast(Image.Image, content)
-        else:
-            image_content = None
+        image_content = cast(Image.Image, content) if content is not None else None
         mm_parser.parse_image_pil(image_content, uuid)
         modality = "image"
     elif part_type in ("image_url", "input_image"):
@@ -1394,10 +1345,7 @@ def _parse_chat_message_content_part(
         mm_parser.parse_image(str_content, uuid)
         modality = "image"
     elif part_type == "image_embeds":
-        if content is not None:
-            content = cast(Union[str, dict[str, str]], content)
-        else:
-            content = None
+        content = cast(str | dict[str, str], content) if content is not None else None
         mm_parser.parse_image_embeds(content, uuid)
         modality = "image"
     elif part_type == "audio_url":
@@ -1418,9 +1366,7 @@ def _parse_chat_message_content_part(
     return (
         {"type": modality}
         if wrap_dicts
-        else (
-            MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
-        )
+        else (MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None)
     )
 
 
@@ -1437,13 +1383,11 @@ def _parse_chat_message_content(
 ) -> list[ConversationMessage]:
     role = message["role"]
     content = message.get("content")
-
+    reasoning = message.get("reasoning") or message.get("reasoning_content")
     if content is None:
         content = []
     elif isinstance(content, str):
-        content = [
-            ChatCompletionContentPartTextParam(type="text", text=content)
-        ]
+        content = [ChatCompletionContentPartTextParam(type="text", text=content)]
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore
@@ -1459,11 +1403,14 @@ def _parse_chat_message_content(
             # The 'tool_calls' is not None check ensures compatibility.
             # It's needed only if downstream code doesn't strictly
             # follow the OpenAI spec.
-            if (
-                "tool_calls" in parsed_msg
-                and parsed_msg["tool_calls"] is not None
-            ):
+            if "tool_calls" in parsed_msg and parsed_msg["tool_calls"] is not None:
                 result_msg["tool_calls"] = list(parsed_msg["tool_calls"])
+            # Include reasoning if present for interleaved thinking.
+            if reasoning is not None:
+                result_msg["reasoning"] = cast(str, reasoning)
+                result_msg["reasoning_content"] = cast(
+                    str, reasoning
+                )  # keep compatibility
         elif role == "tool":
             parsed_msg = _ToolParser(message)
             if "tool_call_id" in parsed_msg:
@@ -1502,8 +1449,8 @@ def parse_chat_messages(
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
-    Optional[MultiModalDataDict],
-    Optional[MultiModalUUIDDict],
+    MultiModalDataDict | None,
+    MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
@@ -1534,8 +1481,8 @@ def parse_chat_messages_futures(
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
-    Awaitable[Optional[MultiModalDataDict]],
-    Optional[MultiModalUUIDDict],
+    Awaitable[MultiModalDataDict | None],
+    MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
     mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
@@ -1572,16 +1519,9 @@ class AssistantTracker(jinja2.ext.Extension):
         return call_block.set_lineno(lineno)
 
 
-def resolve_chat_template_kwargs(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+def _resolve_chat_template_kwargs(
     chat_template: str,
-    chat_template_kwargs: dict[str, Any],
-) -> dict[str, Any]:
-    fn_kw = {
-        k for k in chat_template_kwargs
-        if supports_kw(tokenizer.apply_chat_template, k, allow_var_kwargs=False)
-    }
-
+):
     env = jinja2.sandbox.ImmutableSandboxedEnvironment(
         trim_blocks=True,
         lstrip_blocks=True,
@@ -1589,24 +1529,68 @@ def resolve_chat_template_kwargs(
     )
     parsed_content = env.parse(chat_template)
     template_vars = jinja2.meta.find_undeclared_variables(parsed_content)
+    return template_vars
+
+
+_cached_resolve_chat_template_kwargs = lru_cache(_resolve_chat_template_kwargs)
+
 
+@lru_cache
+def _get_hf_base_chat_template_params() -> frozenset[str]:
+    # Get standard parameters from HuggingFace's base tokenizer class.
+    # This dynamically extracts parameters from PreTrainedTokenizer's
+    # apply_chat_template method, ensuring compatibility with tokenizers
+    # that use **kwargs to receive standard parameters.
+
+    # Read signature from HF's base class - the single source of truth
+    base_sig = inspect.signature(PreTrainedTokenizer.apply_chat_template)
+    # Exclude VAR_KEYWORD (**kwargs) and VAR_POSITIONAL (*args) placeholders
+    return frozenset(
+        p.name
+        for p in base_sig.parameters.values()
+        if p.kind
+        not in (inspect.Parameter.VAR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
+    )
+
+
+def resolve_chat_template_kwargs(
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+    chat_template: str,
+    chat_template_kwargs: dict[str, Any],
+    raise_on_unexpected: bool = True,
+) -> dict[str, Any]:
     # We exclude chat_template from kwargs here, because
     # chat template has been already resolved at this stage
-    unexpected_vars = {"chat_template"}
-    accept_vars = (fn_kw | template_vars) - unexpected_vars
-    return {
-        k: v for k, v in chat_template_kwargs.items() if k in accept_vars
+    unexpected_vars = {"chat_template", "tokenize"}
+    if raise_on_unexpected and (
+        unexpected_in_kwargs := unexpected_vars & chat_template_kwargs.keys()
+    ):
+        raise ValueError(
+            "Found unexpected chat template kwargs from request: "
+            f"{unexpected_in_kwargs}"
+        )
+
+    fn_kw = {
+        k
+        for k in chat_template_kwargs
+        if supports_kw(tokenizer.apply_chat_template, k, allow_var_kwargs=False)
     }
+    template_vars = _cached_resolve_chat_template_kwargs(chat_template)
+
+    # Allow standard HF parameters even if tokenizer uses **kwargs to receive them
+    hf_base_params = _get_hf_base_chat_template_params()
+
+    accept_vars = (fn_kw | template_vars | hf_base_params) - unexpected_vars
+    return {k: v for k, v in chat_template_kwargs.items() if k in accept_vars}
 
 
 def apply_hf_chat_template(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     conversation: list[ConversationMessage],
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     *,
     model_config: ModelConfig,
-    tokenize: bool = False,  # Different from HF's default
     **kwargs: Any,
 ) -> str:
     hf_chat_template = resolve_hf_chat_template(
@@ -1623,6 +1607,12 @@ def apply_hf_chat_template(
             "does not define one."
         )
 
+    resolved_kwargs = resolve_chat_template_kwargs(
+        tokenizer=tokenizer,
+        chat_template=hf_chat_template,
+        chat_template_kwargs=kwargs,
+    )
+
     try:
         resolved_kwargs = resolve_chat_template_kwargs(
             tokenizer=tokenizer,
@@ -1633,7 +1623,7 @@ def apply_hf_chat_template(
             conversation=conversation,  # type: ignore[arg-type]
             tools=tools,  # type: ignore[arg-type]
             chat_template=hf_chat_template,
-            tokenize=tokenize,
+            tokenize=False,
             **resolved_kwargs,
         )
 
@@ -1651,8 +1641,8 @@ def apply_hf_chat_template(
 def apply_mistral_chat_template(
     tokenizer: MistralTokenizer,
     messages: list[ChatCompletionMessageParam],
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     **kwargs: Any,
 ) -> list[int]:
     from mistral_common.exceptions import MistralCommonException
diff --git a/vllm/entrypoints/cli/__init__.py b/vllm/entrypoints/cli/__init__.py
index 41671b5b98abb83269a078db5ad6c4473ae21645..9dff68236fe94db90871813a776d850ad6a8f0e4 100644
--- a/vllm/entrypoints/cli/__init__.py
+++ b/vllm/entrypoints/cli/__init__.py
@@ -2,11 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
 from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
-from vllm.entrypoints.cli.benchmark.throughput import (
-    BenchmarkThroughputSubcommand)
+from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
+from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
 
 __all__: list[str] = [
     "BenchmarkLatencySubcommand",
     "BenchmarkServingSubcommand",
+    "BenchmarkSweepSubcommand",
     "BenchmarkThroughputSubcommand",
-]
\ No newline at end of file
+]
diff --git a/vllm/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py
index 0c22bc75105e6c3a3cd2b9dfb500924d563ba564..d8543822cf6e14c8c38e4a729f9617a140cd9607 100644
--- a/vllm/entrypoints/cli/benchmark/base.py
+++ b/vllm/entrypoints/cli/benchmark/base.py
@@ -6,7 +6,7 @@ from vllm.entrypoints.cli.types import CLISubcommand
 
 
 class BenchmarkSubcommandBase(CLISubcommand):
-    """ The base class of subcommands for vllm bench. """
+    """The base class of subcommands for `vllm bench`."""
 
     help: str
 
diff --git a/vllm/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py
index 3e68963cfd44e104cf259c3212b256634dc57aee..60f2b03341b1c63da4501a0be43047ad714f0d83 100644
--- a/vllm/entrypoints/cli/benchmark/latency.py
+++ b/vllm/entrypoints/cli/benchmark/latency.py
@@ -7,7 +7,7 @@ from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
 
 
 class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
-    """ The `latency` subcommand for vllm bench. """
+    """The `latency` subcommand for `vllm bench`."""
 
     name = "latency"
     help = "Benchmark the latency of a single batch of requests."
diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py
index 0c65fd97fc04c81b35eaa108e1ecd17bce28f790..2ff98577c3634887f8e4c2c752de437a4a972a30 100644
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import typing
 
@@ -11,11 +9,13 @@ from vllm.entrypoints.cli.types import CLISubcommand
 from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 
 if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 class BenchmarkSubcommand(CLISubcommand):
-    """ The `bench` subcommand for the vLLM CLI. """
+    """The `bench` subcommand for the vLLM CLI."""
 
     name = "bench"
     help = "vLLM bench subcommand."
@@ -28,14 +28,14 @@ class BenchmarkSubcommand(CLISubcommand):
         pass
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         bench_parser = subparsers.add_parser(
             self.name,
             description=self.help,
-            usage=f"vllm {self.name} <bench_type> [options]")
-        bench_subparsers = bench_parser.add_subparsers(required=True,
-                                                       dest="bench_type")
+            usage=f"vllm {self.name} <bench_type> [options]",
+        )
+        bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type")
 
         for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
             cmd_subparser = bench_subparsers.add_parser(
@@ -47,7 +47,8 @@ class BenchmarkSubcommand(CLISubcommand):
             cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
             cmd_cls.add_cli_args(cmd_subparser)
             cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
-                subcmd=f"{self.name} {cmd_cls.name}")
+                subcmd=f"{self.name} {cmd_cls.name}"
+            )
         return bench_parser
 
 
diff --git a/vllm/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py
index 3dd7a46d6284b99fb59e5141f39d1009b3b74bf5..6616305c7472fa65c7e2a526a9ca0a832267ce0d 100644
--- a/vllm/entrypoints/cli/benchmark/serve.py
+++ b/vllm/entrypoints/cli/benchmark/serve.py
@@ -7,7 +7,7 @@ from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
 
 
 class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
-    """ The `serve` subcommand for vllm bench. """
+    """The `serve` subcommand for `vllm bench`."""
 
     name = "serve"
     help = "Benchmark the online serving throughput."
diff --git a/vllm/entrypoints/cli/benchmark/sweep.py b/vllm/entrypoints/cli/benchmark/sweep.py
new file mode 100644
index 0000000000000000000000000000000000000000..c385207690a153a56f736da38f3af5bbc1b14f62
--- /dev/null
+++ b/vllm/entrypoints/cli/benchmark/sweep.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+
+from vllm.benchmarks.sweep.cli import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+
+
+class BenchmarkSweepSubcommand(BenchmarkSubcommandBase):
+    """The `sweep` subcommand for `vllm bench`."""
+
+    name = "sweep"
+    help = "Benchmark for a parameter sweep."
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)
diff --git a/vllm/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py
index d5d43ad4a35911cbbd1a8a5a81baa5c48099f959..2097f9ea0781a935168708aa61e79ff948857d15 100644
--- a/vllm/entrypoints/cli/benchmark/throughput.py
+++ b/vllm/entrypoints/cli/benchmark/throughput.py
@@ -7,7 +7,7 @@ from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
 
 
 class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
-    """ The `throughput` subcommand for vllm bench. """
+    """The `throughput` subcommand for `vllm bench`."""
 
     name = "throughput"
     help = "Benchmark offline inference throughput."
diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py
index 785c18812adb7986b98f869e45ede2bc4b8530d3..ad943a63de9d54ac6cb3bf503e0d89a1374b5c5f 100644
--- a/vllm/entrypoints/cli/collect_env.py
+++ b/vllm/entrypoints/cli/collect_env.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import typing
 
@@ -10,11 +8,14 @@ from vllm.collect_env import main as collect_env_main
 from vllm.entrypoints.cli.types import CLISubcommand
 
 if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 class CollectEnvSubcommand(CLISubcommand):
-    """The `collect-env` subcommand for the vLLM CLI. """
+    """The `collect-env` subcommand for the vLLM CLI."""
+
     name = "collect-env"
 
     @staticmethod
@@ -23,13 +24,14 @@ class CollectEnvSubcommand(CLISubcommand):
         collect_env_main()
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         return subparsers.add_parser(
             "collect-env",
             help="Start collecting environment information.",
             description="Start collecting environment information.",
-            usage="vllm collect-env")
+            usage="vllm collect-env",
+        )
 
 
 def cmd_init() -> list[CLISubcommand]:
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index f1bcbc8262bdb8f8e91ff3a74736da27573b337c..a3e73eb7a4c9d15c9980aba567b619de0695c4b2 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-'''The CLI entrypoints of vLLM
+"""The CLI entrypoints of vLLM
 
 Note that all future modules must be lazily loaded within main
-to avoid certain eager import breakage.'''
-from __future__ import annotations
+to avoid certain eager import breakage."""
 
 import importlib.metadata
+import sys
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 def main():
@@ -16,7 +20,7 @@ def main():
     import vllm.entrypoints.cli.run_batch
     import vllm.entrypoints.cli.serve
     from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 
     CMD_MODULES = [
         vllm.entrypoints.cli.openai,
@@ -28,23 +32,38 @@ def main():
 
     cli_env_setup()
 
+    # For 'vllm bench *': use CPU instead of UnspecifiedPlatform by default
+    if len(sys.argv) > 1 and sys.argv[1] == "bench":
+        logger.debug(
+            "Bench command detected, must ensure current platform is not "
+            "UnspecifiedPlatform to avoid device type inference error"
+        )
+        from vllm import platforms
+
+        if platforms.current_platform.is_unspecified():
+            from vllm.platforms.cpu import CpuPlatform
+
+            platforms.current_platform = CpuPlatform()
+            logger.info(
+                "Unspecified platform detected, switching to CPU Platform instead."
+            )
+
     parser = FlexibleArgumentParser(
         description="vLLM CLI",
         epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"),
     )
     parser.add_argument(
-        '-v',
-        '--version',
-        action='version',
-        version=importlib.metadata.version('vllm'),
+        "-v",
+        "--version",
+        action="version",
+        version=importlib.metadata.version("vllm"),
     )
     subparsers = parser.add_subparsers(required=False, dest="subparser")
     cmds = {}
     for cmd_module in CMD_MODULES:
         new_cmds = cmd_module.cmd_init()
         for cmd in new_cmds:
-            cmd.subparser_init(subparsers).set_defaults(
-                dispatch_function=cmd.cmd)
+            cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)
             cmds[cmd.name] = cmd
     args = parser.parse_args()
     if args.subparser in cmds:
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 1929d6a7f77af08d146846813b4c4ba86a72a56f..fb49be370203e92fc5df30763d83308fe297fb81 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import os
 import signal
@@ -15,11 +13,12 @@ from openai.types.chat import ChatCompletionMessageParam
 from vllm.entrypoints.cli.types import CLISubcommand
 
 if TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 def _register_signal_handlers():
-
     def signal_handler(sig, frame):
         sys.exit(0)
 
@@ -80,26 +79,29 @@ def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
             break
         conversation.append({"role": "user", "content": input_message})
 
-        stream = client.chat.completions.create(model=model_name,
-                                                messages=conversation,
-                                                stream=True)
+        stream = client.chat.completions.create(
+            model=model_name, messages=conversation, stream=True
+        )
         output = _print_chat_stream(stream)
         conversation.append({"role": "assistant", "content": output})
 
 
-def _add_query_options(
-        parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     parser.add_argument(
         "--url",
         type=str,
         default="http://localhost:8000/v1",
-        help="url of the running OpenAI-Compatible RESTful API server")
+        help="url of the running OpenAI-Compatible RESTful API server",
+    )
     parser.add_argument(
         "--model-name",
         type=str,
         default=None,
-        help=("The model name used in prompt completion, default to "
-              "the first model in list models API call."))
+        help=(
+            "The model name used in prompt completion, default to "
+            "the first model in list models API call."
+        ),
+    )
     parser.add_argument(
         "--api-key",
         type=str,
@@ -107,12 +109,14 @@ def _add_query_options(
         help=(
             "API key for OpenAI services. If provided, this api key "
             "will overwrite the api key obtained through environment variables."
-        ))
+        ),
+    )
     return parser
 
 
 class ChatCommand(CLISubcommand):
-    """The `chat` subcommand for the vLLM CLI. """
+    """The `chat` subcommand for the vLLM CLI."""
+
     name = "chat"
 
     @staticmethod
@@ -127,9 +131,9 @@ class ChatCommand(CLISubcommand):
         if args.quick:
             conversation.append({"role": "user", "content": args.quick})
 
-            stream = client.chat.completions.create(model=model_name,
-                                                    messages=conversation,
-                                                    stream=True)
+            stream = client.chat.completions.create(
+                model=model_name, messages=conversation, stream=True
+            )
             output = _print_chat_stream(stream)
             conversation.append({"role": "assistant", "content": output})
             return
@@ -142,9 +146,9 @@ class ChatCommand(CLISubcommand):
                 break
             conversation.append({"role": "user", "content": input_message})
 
-            stream = client.chat.completions.create(model=model_name,
-                                                    messages=conversation,
-                                                    stream=True)
+            stream = client.chat.completions.create(
+                model=model_name, messages=conversation, stream=True
+            )
             output = _print_chat_stream(stream)
             conversation.append({"role": "assistant", "content": output})
 
@@ -156,39 +160,50 @@ class ChatCommand(CLISubcommand):
             "--system-prompt",
             type=str,
             default=None,
-            help=("The system prompt to be added to the chat template, "
-                  "used for models that support system prompts."))
-        parser.add_argument("-q",
-                            "--quick",
-                            type=str,
-                            metavar="MESSAGE",
-                            help=("Send a single prompt as MESSAGE "
-                                  "and print the response, then exit."))
+            help=(
+                "The system prompt to be added to the chat template, "
+                "used for models that support system prompts."
+            ),
+        )
+        parser.add_argument(
+            "-q",
+            "--quick",
+            type=str,
+            metavar="MESSAGE",
+            help=("Send a single prompt as MESSAGE and print the response, then exit."),
+        )
         return parser
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         parser = subparsers.add_parser(
             "chat",
             help="Generate chat completions via the running API server.",
             description="Generate chat completions via the running API server.",
-            usage="vllm chat [options]")
+            usage="vllm chat [options]",
+        )
         return ChatCommand.add_cli_args(parser)
 
 
 class CompleteCommand(CLISubcommand):
-    """The `complete` subcommand for the vLLM CLI. """
-    name = 'complete'
+    """The `complete` subcommand for the vLLM CLI."""
+
+    name = "complete"
 
     @staticmethod
     def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
         if args.quick:
-            stream = client.completions.create(model=model_name,
-                                               prompt=args.quick,
-                                               stream=True)
+            stream = client.completions.create(prompt=args.quick, **kwargs)
             _print_completion_stream(stream)
             return
 
@@ -198,34 +213,42 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            stream = client.completions.create(model=model_name,
-                                               prompt=input_prompt,
-                                               stream=True)
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
             _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Add CLI arguments for the complete command."""
         _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
         parser.add_argument(
             "-q",
             "--quick",
             type=str,
             metavar="PROMPT",
-            help=
-            "Send a single prompt and print the completion output, then exit.")
+            help="Send a single prompt and print the completion output, then exit.",
+        )
         return parser
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         parser = subparsers.add_parser(
             "complete",
-            help=("Generate text completions based on the given prompt "
-                  "via the running API server."),
-            description=("Generate text completions based on the given prompt "
-                         "via the running API server."),
-            usage="vllm complete [options]")
+            help=(
+                "Generate text completions based on the given prompt "
+                "via the running API server."
+            ),
+            description=(
+                "Generate text completions based on the given prompt "
+                "via the running API server."
+            ),
+            usage="vllm complete [options]",
+        )
         return CompleteCommand.add_cli_args(parser)
 
 
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
index e669464bff83014ba31a3dc950c849ec4d8f5058..64d1bec1f1ff1c3d9ff10988bc8ab0dab8ba3c62 100644
--- a/vllm/entrypoints/cli/run_batch.py
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import asyncio
 import importlib.metadata
@@ -13,21 +11,25 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 from vllm.logger import init_logger
 
 if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 logger = init_logger(__name__)
 
 
 class RunBatchSubcommand(CLISubcommand):
     """The `run-batch` subcommand for vLLM CLI."""
+
     name = "run-batch"
 
     @staticmethod
     def cmd(args: argparse.Namespace) -> None:
         from vllm.entrypoints.openai.run_batch import main as run_batch_main
 
-        logger.info("vLLM batch processing API version %s",
-                    importlib.metadata.version("vllm"))
+        logger.info(
+            "vLLM batch processing API version %s", importlib.metadata.version("vllm")
+        )
         logger.info("args: %s", args)
 
         # Start the Prometheus metrics server.
@@ -44,8 +46,8 @@ class RunBatchSubcommand(CLISubcommand):
         asyncio.run(run_batch_main(args))
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         from vllm.entrypoints.openai.run_batch import make_arg_parser
 
         run_batch_parser = subparsers.add_parser(
@@ -53,13 +55,12 @@ class RunBatchSubcommand(CLISubcommand):
             help="Run batch prompts and write results to file.",
             description=(
                 "Run batch prompts using vLLM's OpenAI-compatible API.\n"
-                "Supports local or HTTP input/output files."),
-            usage=
-            "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
+                "Supports local or HTTP input/output files."
+            ),
+            usage="vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
         )
         run_batch_parser = make_arg_parser(run_batch_parser)
-        run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
-            subcmd=self.name)
+        run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name)
         return run_batch_parser
 
 
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 0a55471448008b7e1c99bc84c627b71242736ed6..96608f360e17b3e108ad9e973425b4e25325b7d3 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -3,28 +3,30 @@
 
 import argparse
 import signal
-from typing import Optional
 
 import uvloop
 
 import vllm
 import vllm.envs as envs
 from vllm.entrypoints.cli.types import CLISubcommand
-from vllm.entrypoints.openai.api_server import (run_server, run_server_worker,
-                                                setup_server)
-from vllm.entrypoints.openai.cli_args import (make_arg_parser,
-                                              validate_parsed_serve_args)
+from vllm.entrypoints.openai.api_server import (
+    run_server,
+    run_server_worker,
+    setup_server,
+)
+from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
 from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri,
-                        set_process_title)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.network_utils import get_tcp_uri
+from vllm.utils.system_utils import decorate_logs, set_process_title
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor import Executor
+from vllm.v1.executor.multiproc_executor import MultiprocExecutor
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
-from vllm.v1.utils import (APIServerProcessManager,
-                           wait_for_completion_or_failure)
+from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
 
 logger = init_logger(__name__)
 
@@ -38,13 +40,14 @@ Search by using: `--help=<ConfigGroup>` to explore options by section (e.g.,
 
 
 class ServeSubcommand(CLISubcommand):
-    """The `serve` subcommand for the vLLM CLI. """
+    """The `serve` subcommand for the vLLM CLI."""
+
     name = "serve"
 
     @staticmethod
     def cmd(args: argparse.Namespace) -> None:
         # If model is specified in CLI (as positional arg), it takes precedence
-        if hasattr(args, 'model_tag') and args.model_tag is not None:
+        if hasattr(args, "model_tag") and args.model_tag is not None:
             args.model = args.model_tag
 
         if args.headless or args.api_server_count < 1:
@@ -60,16 +63,14 @@ class ServeSubcommand(CLISubcommand):
         validate_parsed_serve_args(args)
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         serve_parser = subparsers.add_parser(
-            self.name,
-            description=DESCRIPTION,
-            usage="vllm serve [model_tag] [options]")
+            self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]"
+        )
 
         serve_parser = make_arg_parser(serve_parser)
-        serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
-            subcmd=self.name)
+        serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name)
         return serve_parser
 
 
@@ -78,45 +79,65 @@ def cmd_init() -> list[CLISubcommand]:
 
 
 def run_headless(args: argparse.Namespace):
-
     if args.api_server_count > 1:
         raise ValueError("api_server_count can't be set in headless mode")
 
     # Create the EngineConfig.
     engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
     usage_context = UsageContext.OPENAI_API_SERVER
-    vllm_config = engine_args.create_engine_config(usage_context=usage_context,
-                                                   headless=True)
-
-    if not envs.VLLM_USE_V1:
-        raise ValueError("Headless mode is only supported for V1")
+    vllm_config = engine_args.create_engine_config(
+        usage_context=usage_context, headless=True
+    )
 
     if engine_args.data_parallel_hybrid_lb:
-        raise ValueError("data_parallel_hybrid_lb is not applicable in "
-                         "headless mode")
+        raise ValueError("data_parallel_hybrid_lb is not applicable in headless mode")
 
     parallel_config = vllm_config.parallel_config
     local_engine_count = parallel_config.data_parallel_size_local
 
     if local_engine_count <= 0:
-        raise ValueError("data_parallel_size_local must be > 0 in "
-                         "headless mode")
+        raise ValueError("data_parallel_size_local must be > 0 in headless mode")
 
-    host = parallel_config.data_parallel_master_ip
-    port = engine_args.data_parallel_rpc_port  # add to config too
-    handshake_address = get_tcp_uri(host, port)
+    shutdown_requested = False
 
     # Catch SIGTERM and SIGINT to allow graceful shutdown.
     def signal_handler(signum, frame):
+        nonlocal shutdown_requested
         logger.debug("Received %d signal.", signum)
-        raise SystemExit
+        if not shutdown_requested:
+            shutdown_requested = True
+            raise SystemExit
 
     signal.signal(signal.SIGTERM, signal_handler)
     signal.signal(signal.SIGINT, signal_handler)
 
+    if parallel_config.node_rank_within_dp > 0:
+        from vllm.version import __version__ as VLLM_VERSION
+
+        # Run headless workers (for multi-node PP/TP).
+        host = parallel_config.master_addr
+        head_node_address = f"{host}:{parallel_config.master_port}"
+        logger.info(
+            "Launching vLLM (v%s) headless multiproc executor, "
+            "with head node address %s for torch.distributed process group.",
+            VLLM_VERSION,
+            head_node_address,
+        )
+
+        executor = MultiprocExecutor(vllm_config, monitor_workers=False)
+        executor.start_worker_monitor(inline=True)
+        return
+
+    host = parallel_config.data_parallel_master_ip
+    port = parallel_config.data_parallel_rpc_port
+    handshake_address = get_tcp_uri(host, port)
+
     logger.info(
         "Launching %d data parallel engine(s) in headless mode, "
-        "with head node address %s.", local_engine_count, handshake_address)
+        "with head node address %s.",
+        local_engine_count,
+        handshake_address,
+    )
 
     # Create the engines.
     engine_manager = CoreEngineProcManager(
@@ -139,7 +160,6 @@ def run_headless(args: argparse.Namespace):
 
 
 def run_multi_api_server(args: argparse.Namespace):
-
     assert not args.headless
     num_api_servers: int = args.api_server_count
     assert num_api_servers > 0
@@ -156,13 +176,10 @@ def run_multi_api_server(args: argparse.Namespace):
     usage_context = UsageContext.OPENAI_API_SERVER
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
-    if num_api_servers > 1:
-        if not envs.VLLM_USE_V1:
-            raise ValueError("api_server_count > 1 is only supported for V1")
-
-        if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
-            raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
-                             "with api_server_count > 1")
+    if num_api_servers > 1 and envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+        raise ValueError(
+            "VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used with api_server_count > 1"
+        )
 
     executor_class = Executor.get_class(vllm_config)
     log_stats = not engine_args.disable_log_stats
@@ -173,12 +190,11 @@ def run_multi_api_server(args: argparse.Namespace):
     hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb
     assert external_dp_lb or hybrid_dp_lb or dp_rank == 0
 
-    api_server_manager: Optional[APIServerProcessManager] = None
-
-    with launch_core_engines(vllm_config, executor_class, log_stats,
-                             num_api_servers) as (local_engine_manager,
-                                                  coordinator, addresses):
+    api_server_manager: APIServerProcessManager | None = None
 
+    with launch_core_engines(
+        vllm_config, executor_class, log_stats, num_api_servers
+    ) as (local_engine_manager, coordinator, addresses):
         # Construct common args for the APIServerProcessManager up-front.
         api_server_manager_kwargs = dict(
             target_server_fn=run_api_server_worker_proc,
@@ -189,7 +205,9 @@ def run_multi_api_server(args: argparse.Namespace):
             input_addresses=addresses.inputs,
             output_addresses=addresses.outputs,
             stats_update_address=coordinator.get_stats_publish_address()
-            if coordinator else None)
+            if coordinator
+            else None,
+        )
 
         # For dp ranks > 0 in external/hybrid DP LB modes, we must delay the
         # start of the API servers until the local engine is started
@@ -198,27 +216,26 @@ def run_multi_api_server(args: argparse.Namespace):
         # via the handshake with the local engine.
         if dp_rank == 0 or not (external_dp_lb or hybrid_dp_lb):
             # Start API servers using the manager.
-            api_server_manager = APIServerProcessManager(
-                **api_server_manager_kwargs)
+            api_server_manager = APIServerProcessManager(**api_server_manager_kwargs)
 
     # Start API servers now if they weren't already started.
     if api_server_manager is None:
         api_server_manager_kwargs["stats_update_address"] = (
-            addresses.frontend_stats_publish_address)
-        api_server_manager = APIServerProcessManager(
-            **api_server_manager_kwargs)
+            addresses.frontend_stats_publish_address
+        )
+        api_server_manager = APIServerProcessManager(**api_server_manager_kwargs)
 
     # Wait for API servers
-    wait_for_completion_or_failure(api_server_manager=api_server_manager,
-                                   engine_manager=local_engine_manager,
-                                   coordinator=coordinator)
+    wait_for_completion_or_failure(
+        api_server_manager=api_server_manager,
+        engine_manager=local_engine_manager,
+        coordinator=coordinator,
+    )
 
 
-def run_api_server_worker_proc(listen_address,
-                               sock,
-                               args,
-                               client_config=None,
-                               **uvicorn_kwargs) -> None:
+def run_api_server_worker_proc(
+    listen_address, sock, args, client_config=None, **uvicorn_kwargs
+) -> None:
     """Entrypoint for individual API server worker processes."""
     client_config = client_config or {}
     server_index = client_config.get("client_index", 0)
@@ -228,5 +245,5 @@ def run_api_server_worker_proc(listen_address,
     decorate_logs()
 
     uvloop.run(
-        run_server_worker(listen_address, sock, args, client_config,
-                          **uvicorn_kwargs))
+        run_server_worker(listen_address, sock, args, client_config, **uvicorn_kwargs)
+    )
diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py
index b88f094b302ad05ab3e0ba228415fae0ac8f912a..f22b844b4ddf5ef516da065414ff07c00a03a54d 100644
--- a/vllm/entrypoints/cli/types.py
+++ b/vllm/entrypoints/cli/types.py
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import typing
 
 if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 class CLISubcommand:
@@ -24,6 +24,6 @@ class CLISubcommand:
         pass
 
     def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        self, subparsers: argparse._SubParsersAction
+    ) -> FlexibleArgumentParser:
         raise NotImplementedError("Subclasses should implement this method")
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index ea81fdbcd825e4197ab777738cbee5b23529806d..7a41c668d7645a8bf15950ab2375152df9fdc622 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -6,13 +6,17 @@ import json
 import logging
 from abc import ABC, abstractmethod
 from contextlib import AsyncExitStack
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Union
 
 from openai.types.responses.tool import Mcp
 from openai_harmony import Author, Message, Role, StreamState, TextContent
 
+from vllm import envs
 from vllm.entrypoints.harmony_utils import (
-    get_encoding, get_streamable_parser_for_assistant, render_for_completion)
+    get_encoding,
+    get_streamable_parser_for_assistant,
+    render_for_completion,
+)
 from vllm.entrypoints.tool import Tool
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.outputs import RequestOutput
@@ -34,34 +38,53 @@ _TOOL_NAME_TO_TYPE_MAP = {
 
 def _map_tool_name_to_tool_type(tool_name: str) -> str:
     if tool_name not in _TOOL_NAME_TO_TYPE_MAP:
-        available_tools = ', '.join(_TOOL_NAME_TO_TYPE_MAP.keys())
+        available_tools = ", ".join(_TOOL_NAME_TO_TYPE_MAP.keys())
         raise ValueError(
             f"Built-in tool name '{tool_name}' not defined in mapping. "
-            f"Available tools: {available_tools}")
+            f"Available tools: {available_tools}"
+        )
     return _TOOL_NAME_TO_TYPE_MAP[tool_name]
 
 
-class TurnTokens:
-    """Tracks token counts for a single conversation turn."""
+class TurnMetrics:
+    """Tracks token and toolcall details for a single conversation turn."""
 
-    def __init__(self, input_tokens=0, output_tokens=0):
+    def __init__(
+        self,
+        input_tokens=0,
+        output_tokens=0,
+        cached_input_tokens=0,
+        tool_output_tokens=0,
+    ):
         self.input_tokens = input_tokens
         self.output_tokens = output_tokens
+        self.cached_input_tokens = cached_input_tokens
+        self.tool_output_tokens = tool_output_tokens
 
     def reset(self):
         """Reset counters for a new turn."""
         self.input_tokens = 0
         self.output_tokens = 0
+        self.cached_input_tokens = 0
+        self.tool_output_tokens = 0
 
     def copy(self):
         """Create a copy of this turn's token counts."""
-        return TurnTokens(self.input_tokens, self.output_tokens)
+        return TurnMetrics(
+            self.input_tokens,
+            self.output_tokens,
+            self.cached_input_tokens,
+            self.tool_output_tokens,
+        )
 
 
 class ConversationContext(ABC):
+    @abstractmethod
+    def append_output(self, output: RequestOutput) -> None:
+        pass
 
     @abstractmethod
-    def append_output(self, output) -> None:
+    def append_tool_output(self, output) -> None:
         pass
 
     @abstractmethod
@@ -77,9 +100,13 @@ class ConversationContext(ABC):
         pass
 
     @abstractmethod
-    async def init_tool_sessions(self, tool_server: Optional[ToolServer],
-                                 exit_stack: AsyncExitStack, request_id: str,
-                                 mcp_tools: dict[str, Mcp]) -> None:
+    async def init_tool_sessions(
+        self,
+        tool_server: ToolServer | None,
+        exit_stack: AsyncExitStack,
+        request_id: str,
+        mcp_tools: dict[str, Mcp],
+    ) -> None:
         pass
 
     @abstractmethod
@@ -87,8 +114,29 @@ class ConversationContext(ABC):
         raise NotImplementedError("Should not be called.")
 
 
-class SimpleContext(ConversationContext):
+def _create_json_parse_error_messages(
+    last_msg: Message, e: json.JSONDecodeError
+) -> list[Message]:
+    """
+    Creates an error message when json parse failed.
+    """
+    error_msg = (
+        f"Error parsing tool arguments as JSON: {str(e)}. "
+        "Please ensure the tool call arguments are valid JSON and try again."
+    )
+    content = TextContent(text=error_msg)
+    author = Author(role=Role.TOOL, name=last_msg.recipient)
+    return [
+        Message(
+            author=author,
+            content=[content],
+            recipient=Role.ASSISTANT,
+            channel=last_msg.channel,
+        )
+    ]
+
 
+class SimpleContext(ConversationContext):
     def __init__(self):
         self.last_output = None
         self.num_prompt_tokens = 0
@@ -96,6 +144,8 @@ class SimpleContext(ConversationContext):
         self.num_cached_tokens = 0
         # todo num_reasoning_tokens is not implemented yet.
         self.num_reasoning_tokens = 0
+        # not implemented yet for SimpleContext
+        self.all_turn_metrics = []
 
     def append_output(self, output) -> None:
         self.last_output = output
@@ -105,6 +155,9 @@ class SimpleContext(ConversationContext):
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
 
+    def append_tool_output(self, output) -> None:
+        raise NotImplementedError("Should not be called.")
+
     def need_builtin_tool_call(self) -> bool:
         return False
 
@@ -114,9 +167,13 @@ class SimpleContext(ConversationContext):
     def render_for_completion(self) -> list[int]:
         raise NotImplementedError("Should not be called.")
 
-    async def init_tool_sessions(self, tool_server: Optional[ToolServer],
-                                 exit_stack: AsyncExitStack, request_id: str,
-                                 mcp_tools: dict[str, Mcp]) -> None:
+    async def init_tool_sessions(
+        self,
+        tool_server: ToolServer | None,
+        exit_stack: AsyncExitStack,
+        request_id: str,
+        mcp_tools: dict[str, Mcp],
+    ) -> None:
         pass
 
     async def cleanup_session(self) -> None:
@@ -124,16 +181,15 @@ class SimpleContext(ConversationContext):
 
 
 class HarmonyContext(ConversationContext):
-
     def __init__(
         self,
         messages: list,
         available_tools: list[str],
     ):
         self._messages = messages
-        self.finish_reason: Optional[str] = None
+        self.finish_reason: str | None = None
         self.available_tools = available_tools
-        self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
+        self._tool_sessions: dict[str, ClientSession | Tool] = {}
         self.called_tools: set[str] = set()
 
         self.parser = get_streamable_parser_for_assistant()
@@ -145,8 +201,9 @@ class HarmonyContext(ConversationContext):
         self.num_tool_output_tokens = 0
 
         # Turn tracking - replaces multiple individual tracking variables
-        self.current_turn = TurnTokens()
-        self.previous_turn = TurnTokens()
+        self.current_turn_metrics = TurnMetrics()
+        # Track metrics for all turns
+        self.all_turn_metrics: list[TurnMetrics] = []
         self.is_first_turn = True
         self.first_tok_of_message = True  # For streaming support
 
@@ -155,30 +212,28 @@ class HarmonyContext(ConversationContext):
         if self.parser.current_channel in {"analysis", "commentary"}:
             self.num_reasoning_tokens += 1
 
-    def append_output(self, output: Union[RequestOutput,
-                                          list[Message]]) -> None:
-        if isinstance(output, RequestOutput):
-            output_token_ids = output.outputs[0].token_ids
-            self.parser = get_streamable_parser_for_assistant()
-            for token_id in output_token_ids:
-                self.parser.process(token_id)
-                # Check if the current token is part of reasoning content
-                self._update_num_reasoning_tokens()
-            self._update_prefill_token_usage(output)
-            # Reset current turn output tokens for this turn
-            self.current_turn.output_tokens = 0
-            self._update_decode_token_usage(output)
-            # Move current turn to previous turn for next turn's calculations
-            self.previous_turn = self.current_turn.copy()
-            # append_output is called only once before tool calling
-            # in non-streaming case
-            # so we can append all the parser messages to _messages
-            output_msgs = self.parser.messages
-            # The responses finish reason is set in the last message
-            self.finish_reason = output.outputs[0].finish_reason
-        else:
-            # Tool output.
-            output_msgs = output
+    def append_output(self, output: RequestOutput) -> None:
+        output_token_ids = output.outputs[0].token_ids
+        self.parser = get_streamable_parser_for_assistant()
+        for token_id in output_token_ids:
+            self.parser.process(token_id)
+            # Check if the current token is part of reasoning content
+            self._update_num_reasoning_tokens()
+        self._update_prefill_token_usage(output)
+        self._update_decode_token_usage(output)
+        # Append current turn to all turn list for next turn's calculations
+        self.all_turn_metrics.append(self.current_turn_metrics.copy())
+        self.current_turn_metrics.reset()
+        # append_output is called only once before tool calling
+        # in non-streaming case
+        # so we can append all the parser messages to _messages
+        output_msgs = self.parser.messages
+        # The responses finish reason is set in the last message
+        self.finish_reason = output.outputs[0].finish_reason
+        self._messages.extend(output_msgs)
+
+    def append_tool_output(self, output: list[Message]) -> None:
+        output_msgs = output
         self._messages.extend(output_msgs)
 
     def _update_prefill_token_usage(self, output: RequestOutput) -> None:
@@ -202,23 +257,25 @@ class HarmonyContext(ConversationContext):
             this_turn_input_tokens = len(output.prompt_token_ids)
         else:
             this_turn_input_tokens = 0
-            logger.error(
-                "RequestOutput appended contains no prompt_token_ids.")
+            logger.error("RequestOutput appended contains no prompt_token_ids.")
 
         # Update current turn input tokens
-        self.current_turn.input_tokens = this_turn_input_tokens
+        self.current_turn_metrics.input_tokens = this_turn_input_tokens
         self.num_prompt_tokens += this_turn_input_tokens
 
         # Calculate tool tokens (except on first turn)
         if self.is_first_turn:
             self.is_first_turn = False
         else:
+            previous_turn = self.all_turn_metrics[-1]
             # start counting tool after first turn
             # tool tokens = this turn prefill - last turn prefill -
             # last turn decode
-            this_turn_tool_tokens = (self.current_turn.input_tokens -
-                                     self.previous_turn.input_tokens -
-                                     self.previous_turn.output_tokens)
+            this_turn_tool_tokens = (
+                self.current_turn_metrics.input_tokens
+                - previous_turn.input_tokens
+                - previous_turn.output_tokens
+            )
 
             # Handle negative tool token counts (shouldn't happen in normal
             # cases)
@@ -227,16 +284,21 @@ class HarmonyContext(ConversationContext):
                     "Negative tool output tokens calculated: %d "
                     "(current_input=%d, previous_input=%d, "
                     "previous_output=%d). Setting to 0.",
-                    this_turn_tool_tokens, self.current_turn.input_tokens,
-                    self.previous_turn.input_tokens,
-                    self.previous_turn.output_tokens)
+                    this_turn_tool_tokens,
+                    self.current_turn_metrics.input_tokens,
+                    previous_turn.input_tokens,
+                    previous_turn.output_tokens,
+                )
                 this_turn_tool_tokens = 0
 
             self.num_tool_output_tokens += this_turn_tool_tokens
+            self.current_turn_metrics.tool_output_tokens = this_turn_tool_tokens
 
         # Update cached tokens
-        if output.num_cached_tokens is not None:
-            self.num_cached_tokens += output.num_cached_tokens
+        num_cached_token = output.num_cached_tokens
+        if num_cached_token is not None:
+            self.num_cached_tokens += num_cached_token
+            self.current_turn_metrics.cached_input_tokens = num_cached_token
 
     def _update_decode_token_usage(self, output: RequestOutput) -> int:
         """Update token usage statistics for the decode phase of generation.
@@ -261,7 +323,7 @@ class HarmonyContext(ConversationContext):
                 # only keep last round
                 updated_output_token_count += len(completion_output.token_ids)
             self.num_output_tokens += updated_output_token_count
-            self.current_turn.output_tokens += updated_output_token_count
+            self.current_turn_metrics.output_tokens += updated_output_token_count
         return updated_output_token_count
 
     @property
@@ -271,9 +333,11 @@ class HarmonyContext(ConversationContext):
     def need_builtin_tool_call(self) -> bool:
         last_msg = self.messages[-1]
         recipient = last_msg.recipient
-        return recipient is not None and (recipient.startswith("browser.")
-                                          or recipient.startswith("python") or
-                                          recipient.startswith("container."))
+        return recipient is not None and (
+            recipient.startswith("browser.")
+            or recipient.startswith("python")
+            or recipient.startswith("container.")
+        )
 
     async def call_tool(self) -> list[Message]:
         if not self.messages:
@@ -283,40 +347,51 @@ class HarmonyContext(ConversationContext):
         if recipient is not None:
             if recipient.startswith("browser."):
                 return await self.call_search_tool(
-                    self._tool_sessions["browser"], last_msg)
+                    self._tool_sessions["browser"], last_msg
+                )
             elif recipient.startswith("python"):
                 return await self.call_python_tool(
-                    self._tool_sessions["python"], last_msg)
+                    self._tool_sessions["python"], last_msg
+                )
             elif recipient.startswith("container."):
                 return await self.call_container_tool(
-                    self._tool_sessions["container"], last_msg)
+                    self._tool_sessions["container"], last_msg
+                )
         raise ValueError("No tool call found")
 
     def render_for_completion(self) -> list[int]:
         return render_for_completion(self.messages)
 
-    async def call_search_tool(self, tool_session: Union["ClientSession",
-                                                         Tool],
-                               last_msg: Message) -> list[Message]:
+    async def call_search_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
         self.called_tools.add("browser")
         if isinstance(tool_session, Tool):
             return await tool_session.get_result(self)
         tool_name = last_msg.recipient.split(".")[1]
-        args = json.loads(last_msg.content[0].text)
+        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
+            try:
+                args = json.loads(last_msg.content[0].text)
+            except json.JSONDecodeError as e:
+                return _create_json_parse_error_messages(last_msg, e)
+        else:
+            args = json.loads(last_msg.content[0].text)
         result = await tool_session.call_tool(tool_name, args)
         result_str = result.content[0].text
         content = TextContent(text=result_str)
         author = Author(role=Role.TOOL, name=last_msg.recipient)
         return [
-            Message(author=author,
-                    content=[content],
-                    recipient=Role.ASSISTANT,
-                    channel=last_msg.channel)
+            Message(
+                author=author,
+                content=[content],
+                recipient=Role.ASSISTANT,
+                channel=last_msg.channel,
+            )
         ]
 
-    async def call_python_tool(self, tool_session: Union["ClientSession",
-                                                         Tool],
-                               last_msg: Message) -> list[Message]:
+    async def call_python_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
         self.called_tools.add("python")
         if isinstance(tool_session, Tool):
             return await tool_session.get_result(self)
@@ -330,60 +405,75 @@ class HarmonyContext(ConversationContext):
         author = Author(role=Role.TOOL, name="python")
 
         return [
-            Message(author=author,
-                    content=[content],
-                    channel=last_msg.channel,
-                    recipient=Role.ASSISTANT)
+            Message(
+                author=author,
+                content=[content],
+                channel=last_msg.channel,
+                recipient=Role.ASSISTANT,
+            )
         ]
 
-    async def init_tool_sessions(self, tool_server: Optional[ToolServer],
-                                 exit_stack: AsyncExitStack, request_id: str,
-                                 mcp_tools: dict[str, Mcp]):
+    async def init_tool_sessions(
+        self,
+        tool_server: ToolServer | None,
+        exit_stack: AsyncExitStack,
+        request_id: str,
+        mcp_tools: dict[str, Mcp],
+    ):
         if tool_server:
             for tool_name in self.available_tools:
                 if tool_name not in self._tool_sessions:
                     tool_type = _map_tool_name_to_tool_type(tool_name)
-                    headers = mcp_tools[
-                        tool_type].headers if tool_type in mcp_tools else None
+                    headers = (
+                        mcp_tools[tool_type].headers if tool_type in mcp_tools else None
+                    )
                     tool_session = await exit_stack.enter_async_context(
-                        tool_server.new_session(tool_name, request_id,
-                                                headers))
+                        tool_server.new_session(tool_name, request_id, headers)
+                    )
                     self._tool_sessions[tool_name] = tool_session
                     exit_stack.push_async_exit(self.cleanup_session)
 
-    async def call_container_tool(self, tool_session: Union["ClientSession",
-                                                            Tool],
-                                  last_msg: Message) -> list[Message]:
+    async def call_container_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
         """
-            Call container tool. Expect this to be run in a stateful docker
-            with command line terminal.
-            The official container tool would at least
-            expect the following format:
-            - for tool name: exec
-                - args:
-                    {
-                        "cmd":List[str] "command to execute",
-                        "workdir":optional[str] "current working directory",
-                        "env":optional[object/dict] "environment variables",
-                        "session_name":optional[str] "session name",
-                        "timeout":optional[int] "timeout in seconds",
-                        "user":optional[str] "user name",
-                    }
+        Call container tool. Expect this to be run in a stateful docker
+        with command line terminal.
+        The official container tool would at least
+        expect the following format:
+        - for tool name: exec
+            - args:
+                {
+                    "cmd":List[str] "command to execute",
+                    "workdir":optional[str] "current working directory",
+                    "env":optional[object/dict] "environment variables",
+                    "session_name":optional[str] "session name",
+                    "timeout":optional[int] "timeout in seconds",
+                    "user":optional[str] "user name",
+                }
         """
         self.called_tools.add("container")
         if isinstance(tool_session, Tool):
             return await tool_session.get_result(self)
         tool_name = last_msg.recipient.split(".")[1].split(" ")[0]
-        args = json.loads(last_msg.content[0].text)
+        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
+            try:
+                args = json.loads(last_msg.content[0].text)
+            except json.JSONDecodeError as e:
+                return _create_json_parse_error_messages(last_msg, e)
+        else:
+            args = json.loads(last_msg.content[0].text)
         result = await tool_session.call_tool(tool_name, args)
         result_str = result.content[0].text
         content = TextContent(text=result_str)
         author = Author(role=Role.TOOL, name=last_msg.recipient)
         return [
-            Message(author=author,
-                    content=[content],
-                    recipient=Role.ASSISTANT,
-                    channel=last_msg.channel)
+            Message(
+                author=author,
+                content=[content],
+                recipient=Role.ASSISTANT,
+                channel=last_msg.channel,
+            )
         ]
 
     async def cleanup_session(self, *args, **kwargs) -> None:
@@ -391,17 +481,21 @@ class HarmonyContext(ConversationContext):
 
         async def cleanup_tool_session(tool_session):
             if not isinstance(tool_session, Tool):
-                logger.info("Cleaning up tool session for %s",
-                            tool_session._client_info)
+                logger.info(
+                    "Cleaning up tool session for %s", tool_session._client_info
+                )
                 with contextlib.suppress(Exception):
                     await tool_session.call_tool("cleanup_session", {})
 
-        await asyncio.gather(*(cleanup_tool_session(self._tool_sessions[tool])
-                               for tool in self.called_tools))
+        await asyncio.gather(
+            *(
+                cleanup_tool_session(self._tool_sessions[tool])
+                for tool in self.called_tools
+            )
+        )
 
 
 class StreamingHarmonyContext(HarmonyContext):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.last_output = None
@@ -415,58 +509,55 @@ class StreamingHarmonyContext(HarmonyContext):
     def messages(self) -> list:
         return self._messages
 
-    def append_output(self, output: Union[RequestOutput,
-                                          list[Message]]) -> None:
-        if isinstance(output, RequestOutput):
-            # append_output is called for each output token in streaming case,
-            # so we only want to add the prompt tokens once for each message.
-            if self.first_tok_of_message:
-                self._update_prefill_token_usage(output)
-                self.current_turn.output_tokens = 0
-            # Reset self.first_tok_of_message if needed:
-            # if the current token is the last one of the current message
-            # (finished=True), then the next token processed will mark the
-            # beginning of a new message
-            self.first_tok_of_message = output.finished
-            for tok in output.outputs[0].token_ids:
-                self.parser.process(tok)
-            self._update_decode_token_usage(output)
-
-            # For streaming, update previous turn when message is complete
-            if output.finished:
-                self.previous_turn = self.current_turn.copy()
-            # Check if the current token is part of reasoning content
-            self._update_num_reasoning_tokens()
-            self.last_tok = tok
-            if len(self._messages) - self.num_init_messages < len(
-                    self.parser.messages):
-                self._messages.extend(
-                    self.parser.messages[len(self._messages) -
-                                         self.num_init_messages:])
-        else:
-            # Handle the case of tool output in direct message format
-            assert len(output) == 1, "Tool output should be a single message"
-            msg = output[0]
-            # Sometimes the recipient is not set for tool messages,
-            # so we set it to "assistant"
-            if msg.author.role == Role.TOOL and msg.recipient is None:
-                msg.recipient = "assistant"
-            toks = self.encoding.render(msg)
-            for tok in toks:
-                self.parser.process(tok)
-            self.last_tok = toks[-1]
-            # TODO: add tool_output messages to self._messages
+    def append_output(self, output: RequestOutput) -> None:
+        # append_output is called for each output token in streaming case,
+        # so we only want to add the prompt tokens once for each message.
+        if self.first_tok_of_message:
+            self._update_prefill_token_usage(output)
+        # Reset self.first_tok_of_message if needed:
+        # if the current token is the last one of the current message
+        # (finished=True), then the next token processed will mark the
+        # beginning of a new message
+        self.first_tok_of_message = output.finished
+        for tok in output.outputs[0].token_ids:
+            self.parser.process(tok)
+        self._update_decode_token_usage(output)
+
+        # For streaming, update previous turn when message is complete
+        if output.finished:
+            self.all_turn_metrics.append(self.current_turn_metrics.copy())
+            self.current_turn_metrics.reset()
+        # Check if the current token is part of reasoning content
+        self._update_num_reasoning_tokens()
+        self.last_tok = tok
+        if len(self._messages) - self.num_init_messages < len(self.parser.messages):
+            self._messages.extend(
+                self.parser.messages[len(self._messages) - self.num_init_messages :]
+            )
+
+    def append_tool_output(self, output: list[Message]) -> None:
+        # Handle the case of tool output in direct message format
+        assert len(output) == 1, "Tool output should be a single message"
+        msg = output[0]
+        # Sometimes the recipient is not set for tool messages,
+        # so we set it to "assistant"
+        if msg.author.role == Role.TOOL and msg.recipient is None:
+            msg.recipient = "assistant"
+        toks = self.encoding.render(msg)
+        for tok in toks:
+            self.parser.process(tok)
+        self.last_tok = toks[-1]
+        # TODO: add tool_output messages to self._messages
 
     def is_expecting_start(self) -> bool:
         return self.parser.state == StreamState.EXPECT_START
 
     def is_assistant_action_turn(self) -> bool:
-        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions(
-        )
+        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
 
     def render_for_completion(self) -> list[int]:
         # now this list of tokens as next turn's starting tokens
-        # `<|start|>assistant``,
+        # `<|start|>assistant`,
         # we need to process them in parser.
         rendered_tokens = super().render_for_completion()
 
diff --git a/vllm/entrypoints/dynamic_lora.py b/vllm/entrypoints/dynamic_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc0f437e5c77fbbe94721b91d813e06d5e8fdca3
--- /dev/null
+++ b/vllm/entrypoints/dynamic_lora.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import APIRouter, Depends, Request
+from fastapi.responses import JSONResponse, Response
+
+from vllm.entrypoints.openai.api_server import models, validate_json_request
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    LoadLoRAAdapterRequest,
+    UnloadLoRAAdapterRequest,
+)
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def register_dynamic_lora_routes(router: APIRouter):
+    @sagemaker_standards.register_load_adapter_handler(
+        request_shape={
+            "lora_name": "body.name",
+            "lora_path": "body.src",
+        },
+    )
+    @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
+    async def load_lora_adapter(request: LoadLoRAAdapterRequest, raw_request: Request):
+        handler: OpenAIServingModels = models(raw_request)
+        response = await handler.load_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(
+                content=response.model_dump(), status_code=response.error.code
+            )
+
+        return Response(status_code=200, content=response)
+
+    @sagemaker_standards.register_unload_adapter_handler(
+        request_shape={
+            "lora_name": "path_params.adapter_name",
+        }
+    )
+    @router.post(
+        "/v1/unload_lora_adapter", dependencies=[Depends(validate_json_request)]
+    )
+    async def unload_lora_adapter(
+        request: UnloadLoRAAdapterRequest, raw_request: Request
+    ):
+        handler: OpenAIServingModels = models(raw_request)
+        response = await handler.unload_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(
+                content=response.model_dump(), status_code=response.error.code
+            )
+
+        return Response(status_code=200, content=response)
+
+    return router
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 0c1c9c3192fc00fcb40d5f4a7c87128fa574bc05..47a252348c1020b8bfccddd28c5fe0be2b9ac112 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -1,30 +1,52 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import datetime
 import json
 from collections.abc import Iterable, Sequence
-from typing import Literal, Optional, Union
-
-from openai.types.responses import (ResponseFunctionToolCall,
-                                    ResponseOutputItem, ResponseOutputMessage,
-                                    ResponseOutputText, ResponseReasoningItem)
+from typing import Literal
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
 from openai.types.responses.response_function_web_search import (
-    ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
+    ActionFind,
+    ActionOpenPage,
+    ActionSearch,
+    ResponseFunctionWebSearch,
+)
 from openai.types.responses.response_reasoning_item import (
-    Content as ResponseReasoningTextContent)
+    Content as ResponseReasoningTextContent,
+)
 from openai.types.responses.tool import Tool
-from openai_harmony import (Author, ChannelConfig, Conversation,
-                            DeveloperContent, HarmonyEncodingName, Message,
-                            ReasoningEffort, Role, StreamableParser,
-                            SystemContent, TextContent, ToolDescription,
-                            load_harmony_encoding)
+from openai_harmony import (
+    Author,
+    ChannelConfig,
+    Conversation,
+    DeveloperContent,
+    HarmonyEncodingName,
+    Message,
+    ReasoningEffort,
+    Role,
+    StreamableParser,
+    SystemContent,
+    TextContent,
+    ToolDescription,
+    load_harmony_encoding,
+)
+from openai_harmony import Message as OpenAIHarmonyMessage
+from openai_harmony import Role as OpenAIHarmonyRole
 
 from vllm import envs
-from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam,
-                                              ResponseInputOutputItem)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionToolsParam,
+    ResponseInputOutputItem,
+    ResponsesRequest,
+)
 from vllm.utils import random_uuid
 
 REASONING_EFFORT = {
@@ -39,47 +61,51 @@ _harmony_encoding = None
 # they are available and requested by the user.
 # Tool args are provided by MCP tool descriptions. Output
 # of the tools are stringified.
-BUILTIN_TOOLS = {
+MCP_BUILTIN_TOOLS: set[str] = {
     "web_search_preview",
     "code_interpreter",
     "container",
 }
 
 
-def has_custom_tools(tool_types: list[str]) -> bool:
-    return not set(tool_types).issubset(BUILTIN_TOOLS)
+def has_custom_tools(tool_types: set[str]) -> bool:
+    """
+    Checks if the given tool types are custom tools
+    (i.e. any tool other than MCP buildin tools)
+    """
+    return not tool_types.issubset(MCP_BUILTIN_TOOLS)
 
 
 def get_encoding():
     global _harmony_encoding
     if _harmony_encoding is None:
-        _harmony_encoding = load_harmony_encoding(
-            HarmonyEncodingName.HARMONY_GPT_OSS)
+        _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
     return _harmony_encoding
 
 
 def get_system_message(
-    model_identity: Optional[str] = None,
-    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
-    start_date: Optional[str] = None,
-    browser_description: Optional[str] = None,
-    python_description: Optional[str] = None,
-    container_description: Optional[str] = None,
-    instructions: Optional[str] = None,
+    model_identity: str | None = None,
+    reasoning_effort: Literal["high", "medium", "low"] | None = None,
+    start_date: str | None = None,
+    browser_description: str | None = None,
+    python_description: str | None = None,
+    container_description: str | None = None,
+    instructions: str | None = None,
     with_custom_tools: bool = False,
 ) -> Message:
     sys_msg_content = SystemContent.new()
     if model_identity is not None:
         sys_msg_content = sys_msg_content.with_model_identity(model_identity)
-    if (instructions is not None
-            and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS):
+    if instructions is not None and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
         current_identity = sys_msg_content.model_identity
-        new_identity = (f'{current_identity}\n{instructions}'
-                        if current_identity else instructions)
+        new_identity = (
+            f"{current_identity}\n{instructions}" if current_identity else instructions
+        )
         sys_msg_content = sys_msg_content.with_model_identity(new_identity)
     if reasoning_effort is not None:
         sys_msg_content = sys_msg_content.with_reasoning_effort(
-            REASONING_EFFORT[reasoning_effort])
+            REASONING_EFFORT[reasoning_effort]
+        )
     if start_date is None:
         # NOTE(woosuk): This brings non-determinism in vLLM. Be careful.
         start_date = datetime.datetime.now().strftime("%Y-%m-%d")
@@ -94,13 +120,14 @@ def get_system_message(
         channel_config = sys_msg_content.channel_config
         invalid_channel = "commentary"
         new_config = ChannelConfig.require_channels(
-            [c for c in channel_config.valid_channels if c != invalid_channel])
+            [c for c in channel_config.valid_channels if c != invalid_channel]
+        )
         sys_msg_content = sys_msg_content.with_channel_config(new_config)
     sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
     return sys_msg
 
 
-def create_tool_definition(tool: Union[ChatCompletionToolsParam, Tool]):
+def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
     if isinstance(tool, ChatCompletionToolsParam):
         return ToolDescription.new(
             name=tool.function.name,
@@ -115,18 +142,21 @@ def create_tool_definition(tool: Union[ChatCompletionToolsParam, Tool]):
 
 
 def get_developer_message(
-    instructions: Optional[str] = None,
-    tools: Optional[list[Union[Tool, ChatCompletionToolsParam]]] = None,
+    instructions: str | None = None,
+    tools: list[Tool | ChatCompletionToolsParam] | None = None,
 ) -> Message:
     dev_msg_content = DeveloperContent.new()
-    if (instructions is not None
-            and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS):
+    if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
         dev_msg_content = dev_msg_content.with_instructions(instructions)
     if tools is not None:
-        function_tools: list[Union[Tool, ChatCompletionToolsParam]] = []
+        function_tools: list[Tool | ChatCompletionToolsParam] = []
         for tool in tools:
-            if tool.type in ("web_search_preview", "code_interpreter",
-                             "container", "mcp"):
+            if tool.type in (
+                "web_search_preview",
+                "code_interpreter",
+                "container",
+                "mcp",
+            ):
                 # These are built-in tools that are added to the system message.
                 # Adding in MCP for now until we support MCP tools executed
                 # server side
@@ -141,7 +171,8 @@ def get_developer_message(
                 create_tool_definition(tool) for tool in function_tools
             ]
             dev_msg_content = dev_msg_content.with_function_tools(
-                function_tool_descriptions)
+                function_tool_descriptions
+            )
     dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
     return dev_msg
 
@@ -152,7 +183,7 @@ def get_user_message(content: str) -> Message:
 
 def parse_response_input(
     response_msg: ResponseInputOutputItem,
-    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]]
+    prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
 ) -> Message:
     if not isinstance(response_msg, dict):
         response_msg = response_msg.model_dump()
@@ -170,32 +201,32 @@ def parse_response_input(
         if isinstance(content, str):
             msg = Message.from_role_and_content(role, text_prefix + content)
         else:
-            contents = [
-                TextContent(text=text_prefix + c["text"]) for c in content
-            ]
+            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
             msg = Message.from_role_and_contents(role, contents)
         if role == "assistant":
             msg = msg.with_channel("final")
     elif response_msg["type"] == "function_call_output":
         call_id = response_msg["call_id"]
-        call_response: Optional[ResponseFunctionToolCall] = None
+        call_response: ResponseFunctionToolCall | None = None
         for prev_response in reversed(prev_responses):
-            if isinstance(prev_response, ResponseFunctionToolCall
-                          ) and prev_response.call_id == call_id:
+            if (
+                isinstance(prev_response, ResponseFunctionToolCall)
+                and prev_response.call_id == call_id
+            ):
                 call_response = prev_response
                 break
         if call_response is None:
             raise ValueError(f"No call message found for {call_id}")
         msg = Message.from_author_and_content(
             Author.new(Role.TOOL, f"functions.{call_response.name}"),
-            response_msg["output"])
+            response_msg["output"],
+        )
     elif response_msg["type"] == "reasoning":
         content = response_msg["content"]
         assert len(content) == 1
         msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
     elif response_msg["type"] == "function_call":
-        msg = Message.from_role_and_content(Role.ASSISTANT,
-                                            response_msg["arguments"])
+        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
         msg = msg.with_channel("commentary")
         msg = msg.with_recipient(f"functions.{response_msg['name']}")
         msg = msg.with_content_type("json")
@@ -204,7 +235,7 @@ def parse_response_input(
     return msg
 
 
-def parse_chat_input(chat_msg) -> list[Message]:
+def parse_input_to_harmony_message(chat_msg) -> list[Message]:
     if not isinstance(chat_msg, dict):
         # Handle Pydantic models
         chat_msg = chat_msg.model_dump(exclude_none=True)
@@ -230,9 +261,18 @@ def parse_chat_input(chat_msg) -> list[Message]:
     if role == "tool":
         name = chat_msg.get("name", "")
         content = chat_msg.get("content", "") or ""
+        if isinstance(content, list):
+            # Handle array format for tool message content
+            # by concatenating all text parts.
+            content = "".join(
+                item.get("text", "")
+                for item in content
+                if isinstance(item, dict) and item.get("type") == "text"
+            )
+
         msg = Message.from_author_and_content(
-            Author.new(Role.TOOL, f"functions.{name}"),
-            content).with_channel("commentary")
+            Author.new(Role.TOOL, f"functions.{name}"), content
+        ).with_channel("commentary")
         return [msg]
 
     # Default: user/assistant/system messages with content
@@ -246,10 +286,45 @@ def parse_chat_input(chat_msg) -> list[Message]:
     return [msg]
 
 
+def construct_harmony_previous_input_messages(
+    request: ResponsesRequest,
+) -> list[OpenAIHarmonyMessage]:
+    messages: list[OpenAIHarmonyMessage] = []
+    if request.previous_input_messages:
+        for message in request.previous_input_messages:
+            # Handle both OpenAIHarmonyMessage objects and dictionary inputs
+            if isinstance(message, OpenAIHarmonyMessage):
+                message_role = message.author.role
+                # To match OpenAI, instructions, reasoning and tools are
+                # always taken from the most recent Responses API request
+                # not carried over from previous requests
+                if (
+                    message_role == OpenAIHarmonyRole.SYSTEM
+                    or message_role == OpenAIHarmonyRole.DEVELOPER
+                ):
+                    continue
+                messages.append(message)
+            else:
+                harmony_messages = parse_input_to_harmony_message(message)
+                for harmony_msg in harmony_messages:
+                    message_role = harmony_msg.author.role
+                    # To match OpenAI, instructions, reasoning and tools are
+                    # always taken from the most recent Responses API request
+                    # not carried over from previous requests
+                    if (
+                        message_role == OpenAIHarmonyRole.SYSTEM
+                        or message_role == OpenAIHarmonyRole.DEVELOPER
+                    ):
+                        continue
+                    messages.append(harmony_msg)
+    return messages
+
+
 def render_for_completion(messages: list[Message]) -> list[int]:
     conversation = Conversation.from_messages(messages)
     token_ids = get_encoding().render_conversation_for_completion(
-        conversation, Role.ASSISTANT)
+        conversation, Role.ASSISTANT
+    )
     return token_ids
 
 
@@ -269,18 +344,39 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
         if len(message.content) != 1:
             raise ValueError("Invalid number of contents in browser message")
         content = message.content[0]
-        browser_call = json.loads(content.text)
+        # We do not need to check the VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY
+        # env variable since if it is not set, we are certain the json is valid
+        # The use of Actions for web search will be removed entirely in
+        # the future, so this is only necessary temporarily
+        try:
+            browser_call = json.loads(content.text)
+        except json.JSONDecodeError:
+            # If the content is not valid JSON, then it was
+            # caught and retried by vLLM, which means we
+            # need to make note of that so the user is aware
+            json_retry_output_message = (
+                f"Invalid JSON args, caught and retried: {content.text}"
+            )
+            browser_call = {
+                "query": json_retry_output_message,
+                "url": json_retry_output_message,
+                "pattern": json_retry_output_message,
+            }
         # TODO: translate to url properly!
         if recipient == "browser.search":
             action = ActionSearch(
-                query=f"cursor:{browser_call.get('query', '')}", type="search")
+                query=f"cursor:{browser_call.get('query', '')}", type="search"
+            )
         elif recipient == "browser.open":
             action = ActionOpenPage(
-                url=f"cursor:{browser_call.get('url', '')}", type="open_page")
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+            )
         elif recipient == "browser.find":
-            action = ActionFind(pattern=browser_call["pattern"],
-                                url=f"cursor:{browser_call.get('url', '')}",
-                                type="find")
+            action = ActionFind(
+                pattern=browser_call["pattern"],
+                url=f"cursor:{browser_call.get('url', '')}",
+                type="find",
+            )
         else:
             raise ValueError(f"Unknown browser action: {recipient}")
         web_search_item = ResponseFunctionWebSearch(
@@ -297,8 +393,9 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
                 summary=[],
                 type="reasoning",
                 content=[
-                    ResponseReasoningTextContent(text=content.text,
-                                                 type="reasoning_text")
+                    ResponseReasoningTextContent(
+                        text=content.text, type="reasoning_text"
+                    )
                 ],
                 status=None,
             )
@@ -316,17 +413,20 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
                     id=f"fc_{random_id}",
                 )
                 output_items.append(response_item)
-        elif recipient is not None and (recipient.startswith("python")
-                                        or recipient.startswith("browser")
-                                        or recipient.startswith("container")):
+        elif recipient is not None and (
+            recipient.startswith("python")
+            or recipient.startswith("browser")
+            or recipient.startswith("container")
+        ):
             for content in message.content:
                 reasoning_item = ResponseReasoningItem(
                     id=f"rs_{random_uuid()}",
                     summary=[],
                     type="reasoning",
                     content=[
-                        ResponseReasoningTextContent(text=content.text,
-                                                     type="reasoning_text")
+                        ResponseReasoningTextContent(
+                            text=content.text, type="reasoning_text"
+                        )
                     ],
                     status=None,
                 )
@@ -356,15 +456,13 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
     return output_items
 
 
-def parse_remaining_state(
-        parser: StreamableParser) -> list[ResponseOutputItem]:
+def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
     if not parser.current_content:
         return []
     if parser.current_role != Role.ASSISTANT:
         return []
     current_recipient = parser.current_recipient
-    if (current_recipient is not None
-            and current_recipient.startswith("browser.")):
+    if current_recipient is not None and current_recipient.startswith("browser."):
         return []
 
     if parser.current_channel == "analysis":
@@ -373,8 +471,9 @@ def parse_remaining_state(
             summary=[],
             type="reasoning",
             content=[
-                ResponseReasoningTextContent(text=parser.current_content,
-                                             type="reasoning_text")
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
             ],
             status=None,
         )
@@ -415,22 +514,22 @@ def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser:
 
 
 def parse_chat_output(
-        token_ids: Sequence[int]) -> tuple[Optional[str], Optional[str], bool]:
+    token_ids: Sequence[int],
+) -> tuple[str | None, str | None, bool]:
     parser = parse_output_into_messages(token_ids)
     output_msgs = parser.messages
     is_tool_call = False  # TODO: update this when tool call is supported
     if len(output_msgs) == 0:
         # The generation has stopped during reasoning.
-        reasoning_content = parser.current_content
+        reasoning = parser.current_content
         final_content = None
     elif len(output_msgs) == 1:
         # The generation has stopped during final message.
-        reasoning_content = output_msgs[0].content[0].text
+        reasoning = output_msgs[0].content[0].text
         final_content = parser.current_content
     else:
         reasoning_msg = output_msgs[:-1]
         final_msg = output_msgs[-1]
-        reasoning_content = "\n".join(
-            [msg.content[0].text for msg in reasoning_msg])
+        reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg])
         final_content = final_msg.content[0].text
-    return reasoning_content, final_content, is_tool_call
+    return reasoning, final_content, is_tool_call
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index 8b2acedf805c1f3248b918be0be742d37c8bc16f..cabf95e8d21462144f92c2f89dd19d4f4762ac88 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -5,27 +5,31 @@ import asyncio
 import signal
 import socket
 from http import HTTPStatus
-from typing import Any, Optional
+from typing import Any
 
 import uvicorn
 from fastapi import FastAPI, Request, Response
 
 from vllm import envs
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
-                                        H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
+from vllm.entrypoints.constants import (
+    H11_MAX_HEADER_COUNT_DEFAULT,
+    H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
+)
 from vllm.entrypoints.ssl import SSLCertRefresher
 from vllm.logger import init_logger
-from vllm.utils import find_process_using_port
+from vllm.utils.network_utils import find_process_using_port
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 
 logger = init_logger(__name__)
 
 
-async def serve_http(app: FastAPI,
-                     sock: Optional[socket.socket],
-                     enable_ssl_refresh: bool = False,
-                     **uvicorn_kwargs: Any):
+async def serve_http(
+    app: FastAPI,
+    sock: socket.socket | None,
+    enable_ssl_refresh: bool = False,
+    **uvicorn_kwargs: Any,
+):
     """
     Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
     options.  Supports http header limits via h11_max_incomplete_event_size and
@@ -39,11 +43,12 @@ async def serve_http(app: FastAPI,
         if methods is None or path is None:
             continue
 
-        logger.info("Route: %s, Methods: %s", path, ', '.join(methods))
+        logger.info("Route: %s, Methods: %s", path, ", ".join(methods))
 
     # Extract header limit options if present
     h11_max_incomplete_event_size = uvicorn_kwargs.pop(
-        "h11_max_incomplete_event_size", None)
+        "h11_max_incomplete_event_size", None
+    )
     h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)
 
     # Set safe defaults if not provided
@@ -62,16 +67,19 @@ async def serve_http(app: FastAPI,
 
     loop = asyncio.get_running_loop()
 
-    watchdog_task = loop.create_task(
-        watchdog_loop(server, app.state.engine_client))
-    server_task = loop.create_task(
-        server.serve(sockets=[sock] if sock else None))
-
-    ssl_cert_refresher = None if not enable_ssl_refresh else SSLCertRefresher(
-        ssl_context=config.ssl,
-        key_path=config.ssl_keyfile,
-        cert_path=config.ssl_certfile,
-        ca_path=config.ssl_ca_certs)
+    watchdog_task = loop.create_task(watchdog_loop(server, app.state.engine_client))
+    server_task = loop.create_task(server.serve(sockets=[sock] if sock else None))
+
+    ssl_cert_refresher = (
+        None
+        if not enable_ssl_refresh
+        else SSLCertRefresher(
+            ssl_context=config.ssl,
+            key_path=config.ssl_keyfile,
+            cert_path=config.ssl_certfile,
+            ca_path=config.ssl_ca_certs,
+        )
+    )
 
     def signal_handler() -> None:
         # prevents the uvicorn signal handler to exit early
@@ -95,7 +103,10 @@ async def serve_http(app: FastAPI,
         if process is not None:
             logger.warning(
                 "port %s is used by process %s launched with command:\n%s",
-                port, process, " ".join(process.cmdline()))
+                port,
+                process,
+                " ".join(process.cmdline()),
+            )
         logger.info("Shutting down FastAPI HTTP server.")
         return server.shutdown()
     finally:
@@ -131,14 +142,14 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
     """
     VLLM V1 AsyncLLM catches exceptions and returns
     only two types: EngineGenerateError and EngineDeadError.
-    
+
     EngineGenerateError is raised by the per request generate()
     method. This error could be request specific (and therefore
     recoverable - e.g. if there is an error in input processing).
-    
+
     EngineDeadError is raised by the background output_handler
     method. This error is global and therefore not recoverable.
-    
+
     We register these @app.exception_handlers to return nice
     responses to the end user if they occur and shut down if needed.
     See https://fastapi.tiangolo.com/tutorial/handling-errors/
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 862f383e4ecb22951ca93940b9e65f91ef2f0da5..b0786bd355aa6d27892a47141717e44d93d65b8c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -2,56 +2,84 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING, Any, cast
 
 import cloudpickle
 import torch.nn as nn
 from pydantic import ValidationError
 from tqdm.auto import tqdm
-from typing_extensions import TypeVar
-
-from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
-                              BeamSearchSequence,
-                              create_sort_beams_key_function)
-from vllm.config import (CompilationConfig, ModelDType,
-                         StructuredOutputsConfig, TokenizerMode, is_init_field)
-from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
-                                   PoolerConfig, RunnerOption)
-from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         ChatTemplateContentFormatOption,
-                                         apply_hf_chat_template,
-                                         apply_mistral_chat_template,
-                                         parse_chat_messages,
-                                         resolve_chat_template_content_format)
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.score_utils import (ScoreContentPartParam,
-                                          ScoreMultiModalParam,
-                                          _cosine_similarity,
-                                          _validate_score_input_lens,
-                                          compress_token_type_ids,
-                                          get_score_prompt)
-# yapf: enable
-from vllm.entrypoints.utils import (_validate_truncation_size,
-                                    log_non_default_args)
-from vllm.inputs import (DataPrompt, PromptType, SingletonPrompt, TextPrompt,
-                         TokensPrompt)
+from typing_extensions import TypeVar, deprecated
+
+from vllm.beam_search import (
+    BeamSearchInstance,
+    BeamSearchOutput,
+    BeamSearchSequence,
+    create_sort_beams_key_function,
+)
+from vllm.config import (
+    CompilationConfig,
+    PoolerConfig,
+    StructuredOutputsConfig,
+    is_init_field,
+)
+from vllm.config.compilation import CompilationMode
+from vllm.config.model import (
+    ConvertOption,
+    HfOverrides,
+    ModelDType,
+    RunnerOption,
+    TokenizerMode,
+)
+from vllm.engine.arg_utils import EngineArgs
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionMessageParam,
+    ChatTemplateContentFormatOption,
+    apply_hf_chat_template,
+    apply_mistral_chat_template,
+    parse_chat_messages,
+    resolve_chat_template_content_format,
+)
+from vllm.entrypoints.score_utils import (
+    ScoreContentPartParam,
+    ScoreMultiModalParam,
+    _cosine_similarity,
+    _validate_score_input_lens,
+    compress_token_type_ids,
+    get_score_prompt,
+)
+from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
+from vllm.inputs import (
+    DataPrompt,
+    PromptType,
+    SingletonPrompt,
+    TextPrompt,
+    TokensPrompt,
+)
+from vllm.inputs.parse import get_prompt_components
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
-                          PoolingRequestOutput, RequestOutput,
-                          ScoringRequestOutput)
-from vllm.plugins.io_processors import get_io_processor
+from vllm.outputs import (
+    ClassificationRequestOutput,
+    EmbeddingRequestOutput,
+    PoolingRequestOutput,
+    RequestOutput,
+    ScoringRequestOutput,
+)
+from vllm.platforms import current_platform
 from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
-                                  SamplingParams)
+from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
 from vllm.tasks import PoolingTask
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
-                                               get_cached_tokenizer)
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    MistralTokenizer,
+    get_cached_tokenizer,
+)
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Counter, Device, as_iter, is_list_of
+from vllm.utils.collection_utils import as_iter, is_list_of
+from vllm.utils.counter import Counter
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.llm_engine import LLMEngine
 from vllm.v1.sample.logits_processor import LogitsProcessor
 
@@ -86,15 +114,14 @@ class LLM:
             or videos from directories specified by the server file system.
             This is a security risk. Should only be enabled in trusted
             environments.
-        allowed_media_domains: If set, only media URLs that belong to this 
+        allowed_media_domains: If set, only media URLs that belong to this
             domain can be used for multi-modal inputs.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
             we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
-            the `torch_dtype` attribute specified in the model config file.
-            However, if the `torch_dtype` in the config is `float32`, we will
-            use `float16` instead.
+            the `dtype` attribute of the Transformers model's config. However,
+            if the `dtype` in the config is `float32`, we will use `float16` instead.
         quantization: The method used to quantize the model weights. Currently,
             we support "awq", "gptq", and "fp8" (experimental).
             If None, we first check the `quantization_config` attribute in the
@@ -116,7 +143,7 @@ class LLM:
             size based on gpu_memory_utilization. However, users may want to
             manually specify the kv cache memory size. kv_cache_memory_bytes
             allows more fine-grain control of how much memory gets used when
-            compared with using gpu_memory_memory_utilization. Note that
+            compared with using gpu_memory_utilization. Note that
             kv_cache_memory_bytes (when not-None) ignores
             gpu_memory_utilization
         swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
@@ -151,7 +178,7 @@ class LLM:
             argument is deprecated and will be removed in v0.12.0 or v1.0.0,
             whichever is sooner.
         compilation_config: Either an integer or a dictionary. If it is an
-            integer, it is used as the level of compilation optimization. If it
+            integer, it is used as the mode of compilation optimization. If it
             is a dictionary, it can specify the full compilation configuration.
         **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
 
@@ -166,35 +193,34 @@ class LLM:
         *,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer: Optional[str] = None,
+        tokenizer: str | None = None,
         tokenizer_mode: TokenizerMode = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
-        allowed_media_domains: Optional[list[str]] = None,
+        allowed_media_domains: list[str] | None = None,
         tensor_parallel_size: int = 1,
         dtype: ModelDType = "auto",
-        quantization: Optional[QuantizationMethods] = None,
-        revision: Optional[str] = None,
-        tokenizer_revision: Optional[str] = None,
-        seed: Optional[int] = None,
+        quantization: QuantizationMethods | None = None,
+        revision: str | None = None,
+        tokenizer_revision: str | None = None,
+        seed: int | None = None,
         gpu_memory_utilization: float = 0.9,
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: bool = False,
         disable_custom_all_reduce: bool = False,
-        hf_token: Optional[Union[bool, str]] = None,
-        hf_overrides: Optional[HfOverrides] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
-        pooler_config: Optional[PoolerConfig] = None,
-        override_pooler_config: Optional[PoolerConfig] = None,
-        structured_outputs_config: Optional[Union[dict[
-            str, Any], StructuredOutputsConfig]] = None,
-        kv_cache_memory_bytes: Optional[int] = None,
-        compilation_config: Optional[Union[int, dict[str, Any],
-                                           CompilationConfig]] = None,
-        logits_processors: Optional[list[Union[str,
-                                               type[LogitsProcessor]]]] = None,
+        hf_token: bool | str | None = None,
+        hf_overrides: HfOverrides | None = None,
+        mm_processor_kwargs: dict[str, Any] | None = None,
+        pooler_config: PoolerConfig | None = None,
+        override_pooler_config: PoolerConfig | None = None,
+        structured_outputs_config: dict[str, Any]
+        | StructuredOutputsConfig
+        | None = None,
+        kv_cache_memory_bytes: int | None = None,
+        compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
+        logits_processors: list[str | type[LogitsProcessor]] | None = None,
         **kwargs: Any,
     ) -> None:
         """LLM constructor."""
@@ -210,21 +236,23 @@ class LLM:
                 kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
 
         if "kv_transfer_config" in kwargs and isinstance(
-                kwargs["kv_transfer_config"], dict):
+            kwargs["kv_transfer_config"], dict
+        ):
             from vllm.config.kv_transfer import KVTransferConfig
+
             raw_config_dict = kwargs["kv_transfer_config"]
             try:
-                kwargs["kv_transfer_config"] = KVTransferConfig(
-                    **raw_config_dict)
+                kwargs["kv_transfer_config"] = KVTransferConfig(**raw_config_dict)
             except ValidationError as e:
                 logger.error(
                     "Failed to convert 'kv_transfer_config' dict to "
                     "KVTransferConfig object. Dict: %s. Error: %s",
-                    raw_config_dict, e)
+                    raw_config_dict,
+                    e,
+                )
                 # Consider re-raising a more specific vLLM error or ValueError
                 # to provide better context to the user.
-                raise ValueError(
-                    f"Invalid 'kv_transfer_config' provided: {e}") from e
+                raise ValueError(f"Invalid 'kv_transfer_config' provided: {e}") from e
 
         if hf_overrides is None:
             hf_overrides = {}
@@ -232,14 +260,16 @@ class LLM:
         if compilation_config is not None:
             if isinstance(compilation_config, int):
                 compilation_config_instance = CompilationConfig(
-                    level=compilation_config)
+                    mode=CompilationMode(compilation_config)
+                )
             elif isinstance(compilation_config, dict):
                 compilation_config_instance = CompilationConfig(
                     **{
                         k: v
                         for k, v in compilation_config.items()
                         if is_init_field(CompilationConfig, k)
-                    })
+                    }
+                )
             else:
                 compilation_config_instance = compilation_config
         else:
@@ -252,12 +282,28 @@ class LLM:
                         k: v
                         for k, v in structured_outputs_config.items()
                         if is_init_field(StructuredOutputsConfig, k)
-                    })
+                    }
+                )
             else:
                 structured_outputs_instance = structured_outputs_config
         else:
             structured_outputs_instance = StructuredOutputsConfig()
 
+        # warn about single-process data parallel usage.
+        _dp_size = int(kwargs.get("data_parallel_size", 1))
+        _distributed_executor_backend = kwargs.get("distributed_executor_backend")
+        if (
+            _dp_size > 1
+            and not _distributed_executor_backend == "external_launcher"
+            and not current_platform.is_tpu()
+        ):
+            raise ValueError(
+                f"LLM(data_parallel_size={_dp_size}) is not supported for single-"
+                "process usage and may hang. Please use "
+                "the explicit multi-process data-parallel example at "
+                "'examples/offline_inference/data_parallel.py'."
+            )
+
         engine_args = EngineArgs(
             model=model,
             runner=runner,
@@ -295,26 +341,25 @@ class LLM:
 
         # Create the Engine (autoselects V0 vs V1)
         self.llm_engine = LLMEngine.from_engine_args(
-            engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
+            engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
+        )
         self.engine_class = type(self.llm_engine)
 
         self.request_counter = Counter()
-        self.default_sampling_params: Union[dict[str, Any], None] = None
-
-        supported_tasks = self.llm_engine.get_supported_tasks()  # type: ignore
-
-        logger.info("Supported_tasks: %s", supported_tasks)
+        self.default_sampling_params: dict[str, Any] | None = None
 
+        supported_tasks = self.llm_engine.get_supported_tasks()
+        logger.info("Supported tasks: %s", supported_tasks)
         self.supported_tasks = supported_tasks
 
-        # Load the Input/Output processor plugin if any
-        io_processor_plugin = self.llm_engine.model_config.io_processor_plugin
-        self.io_processor = get_io_processor(self.llm_engine.vllm_config,
-                                             io_processor_plugin)
+        self.model_config = self.llm_engine.model_config
+        self.processor = self.llm_engine.processor
+        self.io_processor = self.llm_engine.io_processor
 
     def get_tokenizer(self) -> AnyTokenizer:
         return self.llm_engine.get_tokenizer()
 
+    @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.")
     def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         # While CachedTokenizer is dynamic, have no choice but
         # compare class name. Misjudgment will arise from
@@ -324,23 +369,25 @@ class LLM:
         else:
             self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
 
+    def reset_mm_cache(self) -> None:
+        self.processor.clear_mm_cache()
+        self.llm_engine.reset_mm_cache()
+
     def get_default_sampling_params(self) -> SamplingParams:
         if self.default_sampling_params is None:
-            self.default_sampling_params = (
-                self.llm_engine.model_config.get_diff_sampling_param())
+            self.default_sampling_params = self.model_config.get_diff_sampling_param()
         if self.default_sampling_params:
             return SamplingParams.from_optional(**self.default_sampling_params)
         return SamplingParams()
 
     def generate(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
-        sampling_params: Optional[Union[SamplingParams,
-                                        Sequence[SamplingParams]]] = None,
+        prompts: PromptType | Sequence[PromptType],
+        sampling_params: SamplingParams | Sequence[SamplingParams] | None = None,
         *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
-        priority: Optional[list[int]] = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
+        priority: list[int] | None = None,
     ) -> list[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -374,21 +421,21 @@ class LLM:
             considered legacy and may be deprecated in the future. You should
             instead pass them via the `inputs` parameter.
         """
-        model_config = self.llm_engine.model_config
+        model_config = self.model_config
         runner_type = model_config.runner_type
         if runner_type != "generate":
             raise ValueError(
                 "LLM.generate() is only supported for generative models. "
                 "Try passing `--runner generate` to use the model as a "
-                "generative model.")
+                "generative model."
+            )
 
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = self.get_default_sampling_params()
 
         # Add any modality specific loras to the corresponding prompts
-        lora_request = self._get_modality_specific_lora_reqs(
-            prompts, lora_request)
+        lora_request = self._get_modality_specific_lora_reqs(prompts, lora_request)
 
         self._validate_and_add_requests(
             prompts=prompts,
@@ -402,46 +449,57 @@ class LLM:
         return self.engine_class.validate_outputs(outputs, RequestOutput)
 
     def _get_modality_specific_lora_reqs(
-            self, prompts: Union[PromptType, Sequence[PromptType]],
-            lora_request: Optional[Union[list[LoRARequest], LoRARequest]]):
+        self,
+        prompts: PromptType | Sequence[PromptType],
+        lora_request: list[LoRARequest] | LoRARequest | None,
+    ):
         # Grab the lora config off the vllm config on the engine,
         # since this is the same for both v0 & v1.
         lora_config = self.llm_engine.vllm_config.lora_config
 
         # If there's no lora config / default_mm_loras, or the model
         # isn't multimodal, leave the lora as is.
-        if (lora_config is None
-                or not self.llm_engine.model_config.is_multimodal_model
-                or (lora_config and lora_config.default_mm_loras is None)):
+        if (
+            lora_config is None
+            or not self.model_config.is_multimodal_model
+            or (lora_config and lora_config.default_mm_loras is None)
+        ):
             return lora_request
 
         if not isinstance(prompts, Sequence):
             prompts = [prompts]
 
-        optional_loras = ([lora_request] * len(prompts)
-                          if not isinstance(lora_request, Sequence) else
-                          lora_request)
+        optional_loras = (
+            [lora_request] * len(prompts)
+            if not isinstance(lora_request, Sequence)
+            else lora_request
+        )
 
         return [
             self._resolve_single_prompt_mm_lora(
                 prompt,
                 opt_lora_req,
                 lora_config.default_mm_loras,
-            ) for prompt, opt_lora_req in zip(prompts, optional_loras)
+            )
+            for prompt, opt_lora_req in zip(prompts, optional_loras)
         ]
 
-    def _resolve_single_prompt_mm_lora(self, prompt: PromptType,
-                                       lora_request: Optional[LoRARequest],
-                                       default_mm_loras: Optional[dict[str,
-                                                                       str]]):
-        if (not default_mm_loras or not isinstance(prompt, dict)
-                or "multi_modal_data" not in prompt):
+    def _resolve_single_prompt_mm_lora(
+        self,
+        prompt: PromptType,
+        lora_request: LoRARequest | None,
+        default_mm_loras: dict[str, str] | None,
+    ):
+        if (
+            not default_mm_loras
+            or not isinstance(prompt, dict)
+            or not (mm_data := prompt.get("multi_modal_data") or {})
+        ):
             return lora_request
 
-        prompt = cast(Union[TextPrompt, TokensPrompt], prompt)
-
-        intersection = set(prompt["multi_modal_data"].keys()) \
-            .intersection(default_mm_loras.keys())
+        intersection = set(
+            mm_data.keys()  # type: ignore
+        ).intersection(default_mm_loras.keys())
         if not intersection:
             return lora_request
         if len(intersection) > 1:
@@ -451,7 +509,9 @@ class LLM:
                 " used by a single prompt consuming several modalities; "
                 " currently we only support one lora per request; as such,"
                 " lora(s) registered with modalities: %s"
-                " will be skipped", intersection)
+                " will be skipped",
+                intersection,
+            )
             return lora_request
 
         # Build the LoRA request; the ID of the default mm lora is the
@@ -467,7 +527,8 @@ class LLM:
                 logger.warning(
                     "A modality with a registered lora and a lora_request "
                     "with a different ID were provided; falling back to the "
-                    "lora_request as we only apply one LoRARequest per prompt")
+                    "lora_request as we only apply one LoRARequest per prompt"
+                )
             return lora_request
 
         return LoRARequest(
@@ -476,11 +537,13 @@ class LLM:
             modality_lora_path,
         )
 
-    def collective_rpc(self,
-                       method: Union[str, Callable[..., _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+    def collective_rpc(
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
         """
         Execute an RPC call on all workers.
 
@@ -521,14 +584,14 @@ class LLM:
 
     def _get_beam_search_lora_requests(
         self,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
-        prompts: list[Union[TokensPrompt, TextPrompt]],
-    ) -> list[Optional[LoRARequest]]:
+        lora_request: list[LoRARequest] | LoRARequest | None,
+        prompts: list[TokensPrompt | TextPrompt],
+    ) -> list[LoRARequest | None]:
         """Get the optional lora request corresponding to each prompt."""
-        if isinstance(lora_request,
-                      Sequence) and len(lora_request) != len(prompts):
+        if isinstance(lora_request, Sequence) and len(lora_request) != len(prompts):
             raise ValueError(
-                "Lora request list should be the same length as the prompts")
+                "Lora request list should be the same length as the prompts"
+            )
 
         if lora_request is None or isinstance(lora_request, LoRARequest):
             return [lora_request] * len(prompts)
@@ -537,11 +600,11 @@ class LLM:
 
     def beam_search(
         self,
-        prompts: list[Union[TokensPrompt, TextPrompt]],
+        prompts: list[TokensPrompt | TextPrompt],
         params: BeamSearchParams,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
         use_tqdm: bool = False,
-        concurrency_limit: Optional[int] = None,
+        concurrency_limit: int | None = None,
     ) -> list[BeamSearchOutput]:
         """
         Generate sequences using beam search.
@@ -563,8 +626,7 @@ class LLM:
         ignore_eos = params.ignore_eos
         length_penalty = params.length_penalty
 
-        lora_requests = self._get_beam_search_lora_requests(
-            lora_request, prompts)
+        lora_requests = self._get_beam_search_lora_requests(lora_request, prompts)
 
         tokenizer = self.get_tokenizer()
         sort_beams_key = create_sort_beams_key_function(
@@ -575,31 +637,28 @@ class LLM:
         if use_tqdm and concurrency_limit is not None:
             logger.warning(
                 "Progress bar is not supported when using concurrency_limit. "
-                "Disabling progress bar.")
+                "Disabling progress bar."
+            )
             use_tqdm = False
 
         if concurrency_limit is None:
             concurrency_limit = len(prompts)
 
-        def create_tokens_prompt_from_beam(
-                beam: BeamSearchSequence) -> TokensPrompt:
-            token_prompt_kwargs: TokensPrompt = {
-                "prompt_token_ids": beam.tokens
-            }
+        def create_tokens_prompt_from_beam(beam: BeamSearchSequence) -> TokensPrompt:
+            token_prompt_kwargs: TokensPrompt = {"prompt_token_ids": beam.tokens}
             if beam.multi_modal_data is not None:
                 token_prompt_kwargs["multi_modal_data"] = beam.multi_modal_data
 
             if beam.mm_processor_kwargs is not None:
-                token_prompt_kwargs[
-                    "mm_processor_kwargs"] = beam.mm_processor_kwargs
+                token_prompt_kwargs["mm_processor_kwargs"] = beam.mm_processor_kwargs
             return TokensPrompt(**token_prompt_kwargs)
 
         # generate 2 * beam_width candidates at each step
         # following the huggingface transformers implementation
         # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
-        beam_search_params = SamplingParams(logprobs=2 * beam_width,
-                                            max_tokens=1,
-                                            temperature=temperature)
+        beam_search_params = SamplingParams(
+            logprobs=2 * beam_width, max_tokens=1, temperature=temperature
+        )
         instances: list[BeamSearchInstance] = []
 
         for lora_req, prompt in zip(lora_requests, prompts):
@@ -608,8 +667,7 @@ class LLM:
             if "multi_modal_data" in prompt:
                 mm_kwargs["multi_modal_data"] = prompt["multi_modal_data"]
             if "mm_processor_kwargs" in prompt:
-                mm_kwargs["mm_processor_kwargs"] = prompt[
-                    "mm_processor_kwargs"]
+                mm_kwargs["mm_processor_kwargs"] = prompt["mm_processor_kwargs"]
 
             if "prompt_token_ids" in prompt:
                 prompt = cast(TokensPrompt, prompt)  # Needed for mypy
@@ -623,48 +681,58 @@ class LLM:
                     lora_request=lora_req,
                     logprobs=None,
                     **mm_kwargs,
-                ), )
+                ),
+            )
 
         for prompt_start in range(0, len(prompts), concurrency_limit):
-            instances_batch = instances[prompt_start:prompt_start +
-                                        concurrency_limit]
+            instances_batch = instances[prompt_start : prompt_start + concurrency_limit]
 
             token_iter = range(max_tokens)
             if use_tqdm:
-                token_iter = tqdm(token_iter,
-                                  desc="Beam search",
-                                  unit="token",
-                                  unit_scale=False)
+                token_iter = tqdm(
+                    token_iter, desc="Beam search", unit="token", unit_scale=False
+                )
                 logger.warning(
                     "The progress bar shows the upper bound on token steps and "
                     "may finish early due to stopping conditions. It does not "
-                    "reflect instance-level progress.")
+                    "reflect instance-level progress."
+                )
             for _ in token_iter:
                 all_beams: list[BeamSearchSequence] = list(
-                    sum((instance.beams for instance in instances_batch), []))
+                    sum((instance.beams for instance in instances_batch), [])
+                )
                 pos = [0] + list(
                     itertools.accumulate(
-                        len(instance.beams) for instance in instances_batch))
+                        len(instance.beams) for instance in instances_batch
+                    )
+                )
                 instance_start_and_end: list[tuple[int, int]] = list(
-                    zip(pos[:-1], pos[1:]))
+                    zip(pos[:-1], pos[1:])
+                )
 
                 if len(all_beams) == 0:
                     break
 
                 # create corresponding batch entries for prompt & optional lora
                 prompts_batch, lora_req_batch = zip(
-                    *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
-                      for beam in all_beams])
+                    *[
+                        (create_tokens_prompt_from_beam(beam), beam.lora_request)
+                        for beam in all_beams
+                    ]
+                )
 
                 # only runs for one step
                 # we don't need to use tqdm here
-                output = self.generate(prompts_batch,
-                                       sampling_params=beam_search_params,
-                                       use_tqdm=False,
-                                       lora_request=lora_req_batch)
+                output = self.generate(
+                    prompts_batch,
+                    sampling_params=beam_search_params,
+                    use_tqdm=False,
+                    lora_request=lora_req_batch,
+                )
 
-                for (start, end), instance in zip(instance_start_and_end,
-                                                  instances_batch):
+                for (start, end), instance in zip(
+                    instance_start_and_end, instances_batch
+                ):
                     instance_new_beams = []
                     for i in range(start, end):
                         current_beam = all_beams[i]
@@ -679,32 +747,32 @@ class LLM:
                             for token_id, logprob_obj in logprobs.items():
                                 new_beam = BeamSearchSequence(
                                     tokens=current_beam.tokens + [token_id],
-                                    logprobs=current_beam.logprobs +
-                                    [logprobs],
+                                    logprobs=current_beam.logprobs + [logprobs],
                                     lora_request=current_beam.lora_request,
-                                    cum_logprob=current_beam.cum_logprob +
-                                    logprob_obj.logprob,
-                                    multi_modal_data=current_beam.
-                                    multi_modal_data,
-                                    mm_processor_kwargs=current_beam.
-                                    mm_processor_kwargs)
-
-                                if token_id == tokenizer.eos_token_id and \
-                                    not ignore_eos:
+                                    cum_logprob=current_beam.cum_logprob
+                                    + logprob_obj.logprob,
+                                    multi_modal_data=current_beam.multi_modal_data,
+                                    mm_processor_kwargs=current_beam.mm_processor_kwargs,
+                                )
+
+                                if (
+                                    token_id == tokenizer.eos_token_id
+                                    and not ignore_eos
+                                ):
                                     instance.completed.append(new_beam)
                                 else:
                                     instance_new_beams.append(new_beam)
-                    sorted_beams = sorted(instance_new_beams,
-                                          key=sort_beams_key,
-                                          reverse=True)
+                    sorted_beams = sorted(
+                        instance_new_beams, key=sort_beams_key, reverse=True
+                    )
                     instance.beams = sorted_beams[:beam_width]
 
         outputs = []
         for instance in instances:
             instance.completed.extend(instance.beams)
-            sorted_completed = sorted(instance.completed,
-                                      key=sort_beams_key,
-                                      reverse=True)
+            sorted_completed = sorted(
+                instance.completed, key=sort_beams_key, reverse=True
+            )
             best_beams = sorted_completed[:beam_width]
 
             for beam in best_beams:
@@ -715,15 +783,15 @@ class LLM:
 
     def preprocess_chat(
         self,
-        messages: Union[list[ChatCompletionMessageParam],
-                        list[list[ChatCompletionMessageParam]]],
-        chat_template: Optional[str] = None,
+        messages: list[ChatCompletionMessageParam]
+        | list[list[ChatCompletionMessageParam]],
+        chat_template: str | None = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
-        tools: Optional[list[dict[str, Any]]] = None,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+        tools: list[dict[str, Any]] | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        mm_processor_kwargs: dict[str, Any] | None = None,
     ) -> list[TokensPrompt]:
         """
         Generate prompt for a chat conversation. The pre-processed
@@ -740,16 +808,13 @@ class LLM:
         # Handle multi and single conversations
         if is_list_of(messages, list):
             # messages is list[list[...]]
-            list_of_messages = cast(list[list[ChatCompletionMessageParam]],
-                                    messages)
+            list_of_messages = cast(list[list[ChatCompletionMessageParam]], messages)
         else:
             # messages is list[...]
-            list_of_messages = [
-                cast(list[ChatCompletionMessageParam], messages)
-            ]
+            list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
 
         tokenizer = self.get_tokenizer()
-        model_config = self.llm_engine.get_model_config()
+        model_config = self.model_config
         resolved_content_format = resolve_chat_template_content_format(
             chat_template,
             tools,
@@ -794,8 +859,9 @@ class LLM:
                 )
                 # Special tokens are already included in chat templates so
                 # should not be added by the tokenizer in this case.
-                prompt_token_ids = tokenizer.encode(prompt_str,
-                                                    add_special_tokens=False)
+                prompt_token_ids = tokenizer.encode(
+                    prompt_str, add_special_tokens=False
+                )
 
             prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
 
@@ -814,19 +880,18 @@ class LLM:
 
     def chat(
         self,
-        messages: Union[list[ChatCompletionMessageParam],
-                        list[list[ChatCompletionMessageParam]]],
-        sampling_params: Optional[Union[SamplingParams,
-                                        list[SamplingParams]]] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[LoRARequest] = None,
-        chat_template: Optional[str] = None,
+        messages: list[ChatCompletionMessageParam]
+        | list[list[ChatCompletionMessageParam]],
+        sampling_params: SamplingParams | list[SamplingParams] | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: LoRARequest | None = None,
+        chat_template: str | None = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
-        tools: Optional[list[dict[str, Any]]] = None,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+        tools: list[dict[str, Any]] | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        mm_processor_kwargs: dict[str, Any] | None = None,
     ) -> list[RequestOutput]:
         """
         Generate responses for a chat conversation.
@@ -899,15 +964,14 @@ class LLM:
 
     def encode(
         self,
-        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
-        pooling_params: Optional[Union[PoolingParams,
-                                       Sequence[PoolingParams]]] = None,
+        prompts: PromptType | Sequence[PromptType] | DataPrompt,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
-        pooling_task: PoolingTask = "encode",
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
+        pooling_task: PoolingTask | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[PoolingRequestOutput]:
         """Apply pooling to the hidden states corresponding to the input
         prompts.
@@ -941,49 +1005,33 @@ class LLM:
             instead pass them via the `inputs` parameter.
         """
 
-        if self.supported_tasks == ["encode"] and pooling_task is None:
-            pooling_task = "encode"
+        error_str = (
+            "pooling_task required for `LLM.encode`\n"
+            "Please use one of the more specific methods or set the "
+            "pooling_task when using `LLM.encode`:\n"
+            "  - For embeddings, use `LLM.embed(...)` "
+            'or `pooling_task="embed"`.\n'
+            "  - For classification logits, use `LLM.classify(...)` "
+            'or `pooling_task="classify"`.\n'
+            "  - For similarity scores, use `LLM.score(...)`.\n"
+            "  - For rewards, use `LLM.reward(...)` "
+            'or `pooling_task="token_classify"`\n'
+            "  - For token classification, "
+            'use `pooling_task="token_classify"`\n'
+            '  - For multi-vector retrieval, use `pooling_task="token_embed"`'
+        )
 
         if pooling_task is None:
-            if "embed" in self.supported_tasks:
-                pooling_task = "embed"
-            else:
-                pooling_task = "encode"
-
-            logger.warning_once(
-                "`LLM.encode` is currently using `pooling_task = %s`.\n"
-                "Please use one of the more specific methods or set the "
-                "task directly when using `LLM.encode`:\n"
-                "  - For embeddings, use `LLM.embed(...)` "
-                "or `pooling_task=\"embed\"`.\n"
-                "  - For classification logits, use `LLM.classify(...)` "
-                "or `pooling_task=\"classify\"`.\n"
-                "  - For rewards, use `LLM.reward(...)` "
-                "or `pooling_task=\"reward\"`\n"
-                "  - For similarity scores, use `LLM.score(...)`.",
-                pooling_task)
-
-        model_config = self.llm_engine.model_config
+            raise ValueError(error_str)
+
+        model_config = self.model_config
         runner_type = model_config.runner_type
         if runner_type != "pooling":
             raise ValueError(
                 "LLM.encode() is only supported for pooling models. "
                 "Try passing `--runner pooling` to use the model as a "
-                "pooling model.")
-
-        if pooling_task not in self.supported_tasks:
-            raise ValueError(
-                f"pooling_task must be one of {self.supported_tasks}.")
-
-        if pooling_params is None:
-            # Use default pooling params.
-            pooling_params = PoolingParams()
-
-        for param in as_iter(pooling_params):
-            param.verify(pooling_task, model_config)
-            # for backwards compatibility
-            if truncate_prompt_tokens is not None:
-                param.truncate_prompt_tokens = truncate_prompt_tokens
+                "pooling model."
+            )
 
         io_processor_prompt = False
         if isinstance(prompts, dict) and "data" in prompts:
@@ -993,7 +1041,8 @@ class LLM:
                     "No IOProcessor plugin installed. Please refer "
                     "to the documentation and to the "
                     "'prithvi_geospatial_mae_io_processor' "
-                    "offline inference example for more details.")
+                    "offline inference example for more details."
+                )
 
             # Validate the request data is valid for the loaded plugin
             validated_prompt = self.io_processor.parse_request(prompts)
@@ -1001,6 +1050,34 @@ class LLM:
             # obtain the actual model prompts from the pre-processor
             prompts = self.io_processor.pre_process(prompt=validated_prompt)
 
+        if io_processor_prompt:
+            assert self.io_processor is not None
+            if is_list_of(pooling_params, PoolingParams):
+                validated_pooling_params: list[PoolingParams] = []
+                for param in as_iter(pooling_params):
+                    validated_pooling_params.append(
+                        self.io_processor.validate_or_generate_params(param)
+                    )
+                pooling_params = validated_pooling_params
+            else:
+                assert not isinstance(pooling_params, Sequence)
+                pooling_params = self.io_processor.validate_or_generate_params(
+                    pooling_params
+                )
+        else:
+            if pooling_params is None:
+                # Use default pooling params.
+                pooling_params = PoolingParams()
+
+        if pooling_task not in self.supported_tasks:
+            raise ValueError(f"pooling_task must be one of {self.supported_tasks}.")
+
+        for param in as_iter(pooling_params):
+            param.verify(pooling_task, model_config)
+            # for backwards compatibility
+            if truncate_prompt_tokens is not None:
+                param.truncate_prompt_tokens = truncate_prompt_tokens
+
         self._validate_and_add_requests(
             prompts=prompts,
             params=pooling_params,
@@ -1011,32 +1088,38 @@ class LLM:
         outputs = self._run_engine(use_tqdm=use_tqdm)
 
         model_outputs = self.engine_class.validate_outputs(
-            outputs, PoolingRequestOutput)
+            outputs, PoolingRequestOutput
+        )
 
         if io_processor_prompt:
             # get the post-processed model outputs
             assert self.io_processor is not None
             processed_outputs = self.io_processor.post_process(
-                model_output=model_outputs)
+                model_output=model_outputs
+            )
 
             return [
-                PoolingRequestOutput[Any](request_id="",
-                                          outputs=processed_outputs,
-                                          prompt_token_ids=[],
-                                          finished=True)
+                PoolingRequestOutput[Any](
+                    request_id="",
+                    outputs=processed_outputs,
+                    num_cached_tokens=getattr(
+                        processed_outputs, "num_cached_tokens", 0
+                    ),
+                    prompt_token_ids=[],
+                    finished=True,
+                )
             ]
         else:
             return model_outputs
 
     def embed(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
+        prompts: PromptType | Sequence[PromptType],
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[Union[PoolingParams,
-                                       Sequence[PoolingParams]]] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[EmbeddingRequestOutput]:
         """
         Generate an embedding vector for each prompt.
@@ -1064,7 +1147,8 @@ class LLM:
         if "embed" not in self.supported_tasks:
             raise ValueError(
                 "Embedding API is not supported by this model. "
-                "Try converting the model using `--convert embed`.")
+                "Try converting the model using `--convert embed`."
+            )
 
         items = self.encode(
             prompts,
@@ -1079,12 +1163,11 @@ class LLM:
 
     def classify(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
+        prompts: PromptType | Sequence[PromptType],
         *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[Union[PoolingParams,
-                                       Sequence[PoolingParams]]] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ClassificationRequestOutput]:
         """
         Generate class logits for each prompt.
@@ -1111,7 +1194,8 @@ class LLM:
         if "classify" not in self.supported_tasks:
             raise ValueError(
                 "Classification API is not supported by this model. "
-                "Try converting the model using `--convert classify`.")
+                "Try converting the model using `--convert classify`."
+            )
 
         items = self.encode(
             prompts,
@@ -1125,14 +1209,13 @@ class LLM:
 
     def reward(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
+        prompts: PromptType | Sequence[PromptType],
         /,
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[Union[PoolingParams,
-                                       Sequence[PoolingParams]]] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[PoolingRequestOutput]:
         """
         Generate rewards for each prompt.
@@ -1159,20 +1242,19 @@ class LLM:
             lora_request=lora_request,
             pooling_params=pooling_params,
             truncate_prompt_tokens=truncate_prompt_tokens,
-            pooling_task="encode",
+            pooling_task="token_classify",
         )
 
     def _embedding_score(
         self,
         tokenizer: AnyTokenizer,
-        text_1: list[Union[str, TextPrompt, TokensPrompt]],
-        text_2: list[Union[str, TextPrompt, TokensPrompt]],
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[PoolingParams] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        text_1: list[str | TextPrompt | TokensPrompt],
+        text_2: list[str | TextPrompt | TokensPrompt],
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ScoringRequestOutput]:
-
         encoded_output: list[PoolingRequestOutput] = self.encode(
             text_1 + text_2,
             truncate_prompt_tokens=truncate_prompt_tokens,
@@ -1182,37 +1264,33 @@ class LLM:
             pooling_task="embed",
         )
 
-        encoded_output_1: list[PoolingRequestOutput] = encoded_output[
-            0:len(text_1)]
-        encoded_output_2: list[PoolingRequestOutput] = encoded_output[
-            len(text_1):]
+        encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)]
+        encoded_output_2: list[PoolingRequestOutput] = encoded_output[len(text_1) :]
 
         if len(encoded_output_1) == 1:
             encoded_output_1 = encoded_output_1 * len(encoded_output_2)
 
-        scores = _cosine_similarity(tokenizer=tokenizer,
-                                    embed_1=encoded_output_1,
-                                    embed_2=encoded_output_2)
+        scores = _cosine_similarity(
+            tokenizer=tokenizer, embed_1=encoded_output_1, embed_2=encoded_output_2
+        )
 
-        items = self.engine_class.validate_outputs(scores,
-                                                   PoolingRequestOutput)
+        items = self.engine_class.validate_outputs(scores, PoolingRequestOutput)
         return [ScoringRequestOutput.from_base(item) for item in items]
 
     def _cross_encoding_score(
         self,
         tokenizer: AnyTokenizer,
-        data_1: Union[list[str], list[ScoreContentPartParam]],
-        data_2: Union[list[str], list[ScoreContentPartParam]],
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[PoolingParams] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        data_1: list[str] | list[ScoreContentPartParam],
+        data_2: list[str] | list[ScoreContentPartParam],
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ScoringRequestOutput]:
-        model_config = self.llm_engine.model_config
+        model_config = self.model_config
 
         if isinstance(tokenizer, MistralTokenizer):
-            raise ValueError(
-                "Score API is not supported for Mistral tokenizer")
+            raise ValueError("Score API is not supported for Mistral tokenizer")
 
         if len(data_1) == 1:
             data_1 = data_1 * len(data_2)
@@ -1220,21 +1298,19 @@ class LLM:
         if pooling_params is None:
             pooling_params = PoolingParams(task="score")
 
-        model_config = self.llm_engine.model_config
         pooling_params.verify("score", model_config)
         pooling_params_list = list[PoolingParams]()
 
         tokenization_kwargs: dict[str, Any] = {}
 
-        _validate_truncation_size(model_config.max_model_len,
-                                  truncate_prompt_tokens, tokenization_kwargs)
+        _validate_truncation_size(
+            model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs
+        )
 
         prompts = list[PromptType]()
 
         input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
 
-        model_config = self.llm_engine.model_config
-
         for q, d in input_pairs:
             _, engine_prompt = get_score_prompt(
                 model_config=model_config,
@@ -1244,7 +1320,7 @@ class LLM:
                 tokenization_kwargs=tokenization_kwargs,
             )
 
-            if (token_type_ids := engine_prompt.pop("token_type_ids", None)):
+            if token_type_ids := engine_prompt.pop("token_type_ids", None):
                 params = pooling_params.clone()
                 compressed = compress_token_type_ids(token_type_ids)
                 params.extra_kwargs = {"compressed_token_type_ids": compressed}
@@ -1262,23 +1338,20 @@ class LLM:
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
-        items = self.engine_class.validate_outputs(outputs,
-                                                   PoolingRequestOutput)
+        items = self.engine_class.validate_outputs(outputs, PoolingRequestOutput)
 
         return [ScoringRequestOutput.from_base(item) for item in items]
 
     def score(
         self,
-        data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
-                      ScoreMultiModalParam],
-        data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
-                      ScoreMultiModalParam],
+        data_1: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
+        data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
         /,
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[PoolingParams] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ScoringRequestOutput]:
         """Generate similarity scores for all pairs `<text,text_pair>` or
           `<multi-modal data, multi-modal data pair>`.
@@ -1315,22 +1388,27 @@ class LLM:
             A list of `ScoringRequestOutput` objects containing the
             generated scores in the same order as the input prompts.
         """
-        model_config = self.llm_engine.model_config
+        model_config = self.model_config
         runner_type = model_config.runner_type
         if runner_type != "pooling":
             raise ValueError(
                 "LLM.score() is only supported for pooling models. "
                 "Try passing `--runner pooling` to use the model as a "
-                "pooling model.")
+                "pooling model."
+            )
 
         supported_tasks = self.supported_tasks
         if all(t not in supported_tasks for t in ("embed", "classify")):
-            raise ValueError("Score API is not supported by this model. "
-                             "Try converting the model using "
-                             "`--convert embed` or `--convert classify`.")
+            raise ValueError(
+                "Score API is not supported by this model. "
+                "Try converting the model using "
+                "`--convert embed` or `--convert classify`."
+            )
 
-        if (model_config.is_cross_encoder
-                and getattr(model_config.hf_config, "num_labels", 0) != 1):
+        if (
+            model_config.is_cross_encoder
+            and getattr(model_config.hf_config, "num_labels", 0) != 1
+        ):
             raise ValueError("Score API is only enabled for num_labels == 1.")
 
         # the tokenizer for models such as
@@ -1340,12 +1418,16 @@ class LLM:
 
         if not model_config.is_multimodal_model:
 
-            def check_data_type(data: Union[SingletonPrompt,
-                                            Sequence[SingletonPrompt],
-                                            ScoreMultiModalParam]):
+            def check_data_type(
+                data: SingletonPrompt
+                | Sequence[SingletonPrompt]
+                | ScoreMultiModalParam,
+            ):
                 if isinstance(data, dict) and "content" in data:
-                    raise ValueError("ScoreMultiModalParam is not supported "
-                                     f"for {model_config.architecture}")
+                    raise ValueError(
+                        "ScoreMultiModalParam is not supported "
+                        f"for {model_config.architecture}"
+                    )
 
             check_data_type(data_1)
             check_data_type(data_2)
@@ -1353,11 +1435,13 @@ class LLM:
             def ensure_str(prompt: SingletonPrompt):
                 if isinstance(prompt, dict):
                     if "multi_modal_data" in prompt:
-                        raise ValueError("Multi-modal prompt is not "
-                                         "supported for scoring")
+                        raise ValueError(
+                            "Multi-modal prompt is not supported for scoring"
+                        )
                     elif "prompt_token_ids" in prompt:
                         prompt = tokenizer.decode(
-                            cast(TokensPrompt, prompt)["prompt_token_ids"])
+                            cast(TokensPrompt, prompt)["prompt_token_ids"]
+                        )
                     elif "prompt" in prompt:
                         prompt = cast(TextPrompt, prompt)["prompt"]
                 assert type(prompt) is str
@@ -1395,7 +1479,8 @@ class LLM:
                 truncate_prompt_tokens,
                 use_tqdm,
                 pooling_params,
-                lora_request)
+                lora_request,
+            )
         else:
             return self._embedding_score(
                 tokenizer,
@@ -1404,7 +1489,8 @@ class LLM:
                 truncate_prompt_tokens,
                 use_tqdm,
                 pooling_params,
-                lora_request)
+                lora_request,
+            )
 
     def start_profile(self) -> None:
         self.llm_engine.start_profile()
@@ -1412,8 +1498,8 @@ class LLM:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
 
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        return self.llm_engine.reset_prefix_cache(device)
+    def reset_prefix_cache(self) -> None:
+        self.llm_engine.reset_prefix_cache()
 
     def sleep(self, level: int = 1):
         """
@@ -1437,7 +1523,7 @@ class LLM:
         self.reset_prefix_cache()
         self.llm_engine.sleep(level=level)
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+    def wake_up(self, tags: list[str] | None = None):
         """
         Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
         method for more details.
@@ -1455,7 +1541,7 @@ class LLM:
         """Return a snapshot of aggregated metrics from Prometheus.
 
         Returns:
-            A ``MetricSnapshot`` instance capturing the current state
+            A `MetricSnapshot` instance capturing the current state
             of all aggregated metrics from Prometheus.
 
         Note:
@@ -1465,13 +1551,15 @@ class LLM:
 
     def _validate_and_add_requests(
         self,
-        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
-        params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
-                      Sequence[PoolingParams]],
+        prompts: PromptType | Sequence[PromptType] | DataPrompt,
+        params: SamplingParams
+        | Sequence[SamplingParams]
+        | PoolingParams
+        | Sequence[PoolingParams],
         *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
-        priority: Optional[list[int]] = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: Sequence[LoRARequest] | LoRARequest | None,
+        priority: list[int] | None = None,
     ) -> None:
         if isinstance(prompts, (str, dict)):
             # Convert a single prompt to a list.
@@ -1479,14 +1567,19 @@ class LLM:
 
         num_requests = len(prompts)
         if isinstance(params, Sequence) and len(params) != num_requests:
-            raise ValueError("The lengths of prompts and params "
-                             "must be the same.")
-        if isinstance(lora_request,
-                      Sequence) and len(lora_request) != num_requests:
-            raise ValueError("The lengths of prompts and lora_request "
-                             "must be the same.")
-
-        for sp in params if isinstance(params, Sequence) else (params, ):
+            raise ValueError("The lengths of prompts and params must be the same.")
+        if isinstance(lora_request, Sequence) and len(lora_request) != num_requests:
+            raise ValueError(
+                "The lengths of prompts and lora_request must be the same."
+            )
+        if priority is not None and len(priority) != num_requests:
+            raise ValueError(
+                "The lengths of prompts "
+                f"({num_requests}) and priority ({len(priority)}) "
+                "must be the same."
+            )
+
+        for sp in params if isinstance(params, Sequence) else (params,):
             if isinstance(sp, SamplingParams):
                 # We only care about the final output
                 sp.output_kind = RequestOutputKind.FINAL_ONLY
@@ -1497,35 +1590,32 @@ class LLM:
             tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
             it = tqdm_func(it, desc="Adding requests")
 
-        model_config = self.llm_engine.model_config
-
-        for i, prompt in enumerate(it):
-
-            if isinstance(prompt, dict):
-                self._validate_mm_data_and_uuids(
-                    prompt.get("multi_modal_data"),
-                    prompt.get("multi_modal_uuids"))
-
-            param = params[i] if isinstance(params, Sequence) else params
+        added_request_ids: list[str] = []
 
-            tokenization_kwargs: dict[str, Any] = {}
-            _validate_truncation_size(model_config.max_model_len,
-                                      param.truncate_prompt_tokens,
-                                      tokenization_kwargs)
-
-            self._add_request(
-                prompt,
-                params[i] if isinstance(params, Sequence) else params,
-                tokenization_kwargs=tokenization_kwargs,
-                lora_request=lora_request[i] if isinstance(
-                    lora_request, Sequence) else lora_request,
-                priority=priority[i] if priority else 0,
-            )
+        try:
+            for i, prompt in enumerate(it):
+                if isinstance(prompt, dict):
+                    self._validate_mm_data_and_uuids(
+                        prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids")
+                    )
+                request_id = self._add_request(
+                    prompt,
+                    params[i] if isinstance(params, Sequence) else params,
+                    lora_request=lora_request[i]
+                    if isinstance(lora_request, Sequence)
+                    else lora_request,
+                    priority=priority[i] if priority else 0,
+                )
+                added_request_ids.append(request_id)
+        except Exception as e:
+            if added_request_ids:
+                self.llm_engine.abort_request(added_request_ids)
+            raise e
 
     def _validate_mm_data_and_uuids(
-            self,
-            multi_modal_data: Optional[Any],  # MultiModalDataDict
-            multi_modal_uuids: Optional[Any],  # MultiModalUUIDDict
+        self,
+        multi_modal_data: Any | None,  # MultiModalDataDict
+        multi_modal_uuids: Any | None,  # MultiModalUUIDDict
     ):
         """
         Validate that if any multi-modal data is skipped (i.e. None),
@@ -1538,48 +1628,97 @@ class LLM:
             if isinstance(data, list):
                 for i, d in enumerate(data):
                     if d is None:
-                        if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[  # noqa: E501
-                                modality] is None:
+                        if (
+                            multi_modal_uuids is None
+                            or modality not in multi_modal_uuids
+                            or multi_modal_uuids[  # noqa: E501
+                                modality
+                            ]
+                            is None
+                        ):
                             raise ValueError(
                                 f"Multi-modal data for {modality} is None "
-                                f"but UUID is not provided")
+                                f"but UUID is not provided"
+                            )
                         else:
-                            if len(
-                                    multi_modal_uuids[modality]
-                            ) <= i or multi_modal_uuids[modality][i] is None:
+                            if (
+                                len(multi_modal_uuids[modality]) <= i
+                                or multi_modal_uuids[modality][i] is None
+                            ):
                                 raise ValueError(
                                     f"Multi-modal data for {modality} is None "
-                                    f"but UUID is not provided")
+                                    f"but UUID is not provided"
+                                )
             else:
-                if data is None and (multi_modal_uuids is None
-                                     or modality not in multi_modal_uuids
-                                     or multi_modal_uuids[modality] is None):
-                    raise ValueError(f"Multi-modal data for {modality} is None"
-                                     f" but UUID is not provided")
+                if data is None and (
+                    multi_modal_uuids is None
+                    or modality not in multi_modal_uuids
+                    or multi_modal_uuids[modality] is None
+                ):
+                    raise ValueError(
+                        f"Multi-modal data for {modality} is None"
+                        f" but UUID is not provided"
+                    )
+
+    def _process_inputs(
+        self,
+        request_id: str,
+        engine_prompt: PromptType,
+        params: SamplingParams | PoolingParams,
+        *,
+        lora_request: LoRARequest | None,
+        priority: int,
+    ) -> tuple[EngineCoreRequest, dict[str, Any]]:
+        """Use the Processor to process inputs for LLMEngine."""
+        tokenization_kwargs: dict[str, Any] = {}
+        _validate_truncation_size(
+            self.model_config.max_model_len,
+            params.truncate_prompt_tokens,
+            tokenization_kwargs,
+        )
+
+        engine_request = self.processor.process_inputs(
+            request_id,
+            engine_prompt,
+            params,
+            lora_request=lora_request,
+            tokenization_kwargs=tokenization_kwargs,
+            priority=priority,
+        )
+        return engine_request, tokenization_kwargs
 
     def _add_request(
         self,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        lora_request: Optional[LoRARequest] = None,
+        params: SamplingParams | PoolingParams,
+        lora_request: LoRARequest | None = None,
         priority: int = 0,
-    ) -> None:
+    ) -> str:
+        prompt_text, _, _ = get_prompt_components(prompt)
         request_id = str(next(self.request_counter))
-        self.llm_engine.add_request(
+
+        engine_request, tokenization_kwargs = self._process_inputs(
             request_id,
             prompt,
             params,
             lora_request=lora_request,
+            priority=priority,
+        )
+
+        self.llm_engine.add_request(
+            request_id,
+            engine_request,
+            params,
+            lora_request=lora_request,
             tokenization_kwargs=tokenization_kwargs,
             priority=priority,
+            prompt_text=prompt_text,
         )
+        return request_id
 
     def _run_engine(
-        self,
-        *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True
-    ) -> list[Union[RequestOutput, PoolingRequestOutput]]:
+        self, *, use_tqdm: bool | Callable[..., tqdm] = True
+    ) -> list[RequestOutput | PoolingRequestOutput]:
         # Initialize tqdm.
         if use_tqdm:
             num_requests = self.llm_engine.get_num_unfinished_requests()
@@ -1588,12 +1727,11 @@ class LLM:
                 total=num_requests,
                 desc="Processed prompts",
                 dynamic_ncols=True,
-                postfix=(f"est. speed input: {0:.2f} toks/s, "
-                         f"output: {0:.2f} toks/s"),
+                postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"),
             )
 
         # Run the engine.
-        outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
+        outputs: list[RequestOutput | PoolingRequestOutput] = []
         total_in_toks = 0
         total_out_toks = 0
         while self.llm_engine.has_unfinished_requests():
@@ -1609,12 +1747,13 @@ class LLM:
                             total_in_toks += len(output.prompt_token_ids) * n
                             in_spd = total_in_toks / pbar.format_dict["elapsed"]
                             total_out_toks += sum(
-                                len(stp.token_ids) for stp in output.outputs)
-                            out_spd = (total_out_toks /
-                                       pbar.format_dict["elapsed"])
+                                len(stp.token_ids) for stp in output.outputs
+                            )
+                            out_spd = total_out_toks / pbar.format_dict["elapsed"]
                             pbar.postfix = (
                                 f"est. speed input: {in_spd:.2f} toks/s, "
-                                f"output: {out_spd:.2f} toks/s")
+                                f"output: {out_spd:.2f} toks/s"
+                            )
                             pbar.update(n)
                         else:
                             pbar.update(1)
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index 152d11c84ea0295c634726557808532271aea015..678a7b3a60b558c032793cf33367d7842cba98f0 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import torch
 
@@ -15,19 +14,17 @@ logger = init_logger(__name__)
 
 
 class RequestLogger:
-
-    def __init__(self, *, max_log_len: Optional[int]) -> None:
+    def __init__(self, *, max_log_len: int | None) -> None:
         self.max_log_len = max_log_len
 
     def log_inputs(
         self,
         request_id: str,
-        prompt: Optional[str],
-        prompt_token_ids: Optional[list[int]],
-        prompt_embeds: Optional[torch.Tensor],
-        params: Optional[Union[SamplingParams, PoolingParams,
-                               BeamSearchParams]],
-        lora_request: Optional[LoRARequest],
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_embeds: torch.Tensor | None,
+        params: SamplingParams | PoolingParams | BeamSearchParams | None,
+        lora_request: LoRARequest | None,
     ) -> None:
         max_log_len = self.max_log_len
         if max_log_len is not None:
@@ -37,20 +34,29 @@ class RequestLogger:
             if prompt_token_ids is not None:
                 prompt_token_ids = prompt_token_ids[:max_log_len]
 
-        logger.info(
-            "Received request %s: prompt: %r, "
-            "params: %s, prompt_token_ids: %s, "
-            "prompt_embeds shape: %s, "
-            "lora_request: %s.", request_id, prompt, params, prompt_token_ids,
+        logger.debug(
+            "Request %s details: prompt: %r, "
+            "prompt_token_ids: %s, "
+            "prompt_embeds shape: %s.",
+            request_id,
+            prompt,
+            prompt_token_ids,
             prompt_embeds.shape if prompt_embeds is not None else None,
-            lora_request)
+        )
+
+        logger.info(
+            "Received request %s: params: %s, lora_request: %s.",
+            request_id,
+            params,
+            lora_request,
+        )
 
     def log_outputs(
         self,
         request_id: str,
         outputs: str,
-        output_token_ids: Optional[Sequence[int]],
-        finish_reason: Optional[str] = None,
+        output_token_ids: Sequence[int] | None,
+        finish_reason: str | None = None,
         is_streaming: bool = False,
         delta: bool = False,
     ) -> None:
@@ -65,8 +71,7 @@ class RequestLogger:
 
         stream_info = ""
         if is_streaming:
-            stream_info = (" (streaming delta)"
-                           if delta else " (streaming complete)")
+            stream_info = " (streaming delta)" if delta else " (streaming complete)"
 
         logger.info(
             "Generated response %s%s: output: %r, "
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 15844d3162fe91b893ba88338c8be27b3d5d08c9..3cf66fcd27e2a2909eb6d9aeec8b3004b1724f6e 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import asyncio
-import gc
 import hashlib
 import importlib
 import inspect
@@ -16,17 +14,17 @@ import socket
 import tempfile
 import uuid
 from argparse import Namespace
-from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
+from collections.abc import AsyncGenerator, AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Annotated, Any, Callable, Literal, Optional
+from typing import Annotated, Any, Literal
 
+import model_hosting_container_standards.sagemaker as sagemaker_standards
 import prometheus_client
 import pydantic
 import regex as re
 import uvloop
-from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query,
-                     Request)
+from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -42,69 +40,87 @@ import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (load_chat_template,
-                                         resolve_hf_chat_template,
-                                         resolve_mistral_chat_template)
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicError,
+    AnthropicErrorResponse,
+    AnthropicMessagesRequest,
+    AnthropicMessagesResponse,
+)
+from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.cli_args import (make_arg_parser,
-                                              validate_parsed_serve_args)
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionResponse,
-                                              ClassificationRequest,
-                                              ClassificationResponse,
-                                              CompletionRequest,
-                                              CompletionResponse,
-                                              DetokenizeRequest,
-                                              DetokenizeResponse,
-                                              EmbeddingRequest,
-                                              EmbeddingResponse, ErrorInfo,
-                                              ErrorResponse,
-                                              IOProcessorResponse,
-                                              LoadLoRAAdapterRequest,
-                                              PoolingRequest, PoolingResponse,
-                                              RerankRequest, RerankResponse,
-                                              ResponsesRequest,
-                                              ResponsesResponse, ScoreRequest,
-                                              ScoreResponse,
-                                              StreamingResponsesResponse,
-                                              TokenizeRequest,
-                                              TokenizeResponse,
-                                              TranscriptionRequest,
-                                              TranscriptionResponse,
-                                              TranslationRequest,
-                                              TranslationResponse,
-                                              UnloadLoRAAdapterRequest)
-# yapf: enable
+from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
+from vllm.entrypoints.openai.orca_metrics import metrics_header
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ClassificationRequest,
+    ClassificationResponse,
+    CompletionRequest,
+    CompletionResponse,
+    DetokenizeRequest,
+    DetokenizeResponse,
+    EmbeddingBytesResponse,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    ErrorInfo,
+    ErrorResponse,
+    GenerateRequest,
+    GenerateResponse,
+    IOProcessorResponse,
+    PoolingBytesResponse,
+    PoolingRequest,
+    PoolingResponse,
+    RerankRequest,
+    RerankResponse,
+    ResponsesRequest,
+    ResponsesResponse,
+    ScoreRequest,
+    ScoreResponse,
+    StreamingResponsesResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranslationRequest,
+    TranslationResponse,
+)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_classification import (
-    ServingClassification)
+from vllm.entrypoints.openai.serving_classification import ServingClassification
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
-from vllm.entrypoints.openai.serving_models import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServingModels)
+from vllm.entrypoints.openai.serving_models import (
+    BaseModelPath,
+    OpenAIServingModels,
+)
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
 from vllm.entrypoints.openai.serving_score import ServingScores
-from vllm.entrypoints.openai.serving_tokenization import (
-    OpenAIServingTokenization)
+from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization
+from vllm.entrypoints.openai.serving_tokens import ServingTokens
 from vllm.entrypoints.openai.serving_transcription import (
-    OpenAIServingTranscription, OpenAIServingTranslation)
+    OpenAIServingTranscription,
+    OpenAIServingTranslation,
+)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
-from vllm.entrypoints.tool_server import (DemoToolServer, MCPToolServer,
-                                          ToolServer)
-from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
-                                    log_non_default_args, with_cancellation)
+from vllm.entrypoints.tool_server import DemoToolServer, MCPToolServer, ToolServer
+from vllm.entrypoints.utils import (
+    cli_env_setup,
+    load_aware_call,
+    log_non_default_args,
+    process_chat_template,
+    process_lora_modules,
+    with_cancellation,
+)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
-from vllm.transformers_utils.tokenizer import MistralTokenizer
+from vllm.tasks import POOLING_TASKS
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
-                        is_valid_ipv6_address, set_ulimit)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.gc_utils import freeze_gc_heap
+from vllm.utils.network_utils import is_valid_ipv6_address
+from vllm.utils.system_utils import decorate_logs, set_ulimit
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
@@ -112,7 +128,9 @@ from vllm.version import __version__ as VLLM_VERSION
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
-logger = init_logger('vllm.entrypoints.openai.api_server')
+logger = init_logger("vllm.entrypoints.openai.api_server")
+
+ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format"
 
 _running_tasks: set[asyncio.Task] = set()
 
@@ -136,8 +154,7 @@ async def lifespan(app: FastAPI):
 
         # Mark the startup heap as static so that it's ignored by GC.
         # Reduces pause times of oldest generation collections.
-        gc.collect()
-        gc.freeze()
+        freeze_gc_heap()
         try:
             yield
         finally:
@@ -153,15 +170,14 @@ async def build_async_engine_client(
     args: Namespace,
     *,
     usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
-    disable_frontend_multiprocessing: Optional[bool] = None,
-    client_config: Optional[dict[str, Any]] = None,
+    disable_frontend_multiprocessing: bool | None = None,
+    client_config: dict[str, Any] | None = None,
 ) -> AsyncIterator[EngineClient]:
-
     if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver":
         # The executor is expected to be mp.
         # Pre-import heavy modules in the forkserver process
         logger.debug("Setup forkserver with pre-imports")
-        multiprocessing.set_start_method('forkserver')
+        multiprocessing.set_start_method("forkserver")
         multiprocessing.set_forkserver_preload(["vllm.v1.engine.async_llm"])
         forkserver.ensure_running()
         logger.debug("Forkserver setup complete!")
@@ -174,14 +190,13 @@ async def build_async_engine_client(
         engine_args._api_process_rank = client_config.get("client_index", 0)
 
     if disable_frontend_multiprocessing is None:
-        disable_frontend_multiprocessing = bool(
-            args.disable_frontend_multiprocessing)
+        disable_frontend_multiprocessing = bool(args.disable_frontend_multiprocessing)
 
     async with build_async_engine_client_from_engine_args(
-            engine_args,
-            usage_context=usage_context,
-            disable_frontend_multiprocessing=disable_frontend_multiprocessing,
-            client_config=client_config,
+        engine_args,
+        usage_context=usage_context,
+        disable_frontend_multiprocessing=disable_frontend_multiprocessing,
+        client_config=client_config,
     ) as engine:
         yield engine
 
@@ -192,7 +207,7 @@ async def build_async_engine_client_from_engine_args(
     *,
     usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
     disable_frontend_multiprocessing: bool = False,
-    client_config: Optional[dict[str, Any]] = None,
+    client_config: dict[str, Any] | None = None,
 ) -> AsyncIterator[EngineClient]:
     """
     Create EngineClient, either:
@@ -205,16 +220,12 @@ async def build_async_engine_client_from_engine_args(
     # Create the EngineConfig (determines if we can use V1).
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
-    # V1 AsyncLLM.
-    assert envs.VLLM_USE_V1
-
     if disable_frontend_multiprocessing:
-        logger.warning(
-            "V1 is enabled, but got --disable-frontend-multiprocessing. "
-            "To disable frontend multiprocessing, set VLLM_USE_V1=0.")
+        logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.")
 
     from vllm.v1.engine.async_llm import AsyncLLM
-    async_llm: Optional[AsyncLLM] = None
+
+    async_llm: AsyncLLM | None = None
 
     # Don't mutate the input client_config
     client_config = dict(client_config) if client_config else {}
@@ -226,12 +237,15 @@ async def build_async_engine_client_from_engine_args(
             vllm_config=vllm_config,
             usage_context=usage_context,
             enable_log_requests=engine_args.enable_log_requests,
+            aggregate_engine_logging=engine_args.aggregate_engine_logging,
             disable_log_stats=engine_args.disable_log_stats,
             client_addresses=client_config,
             client_count=client_count,
-            client_index=client_index)
+            client_index=client_index,
+        )
 
         # Don't keep the dummy data in memory
+        assert async_llm is not None
         await async_llm.reset_mm_cache()
 
         yield async_llm
@@ -244,9 +258,9 @@ async def validate_json_request(raw_request: Request):
     content_type = raw_request.headers.get("content-type", "").lower()
     media_type = content_type.split(";", maxsplit=1)[0]
     if media_type != "application/json":
-        raise RequestValidationError(errors=[
-            "Unsupported Media Type: Only 'application/json' is allowed"
-        ])
+        raise RequestValidationError(
+            errors=["Unsupported Media Type: Only 'application/json' is allowed"]
+        )
 
 
 router = APIRouter()
@@ -294,35 +308,39 @@ def models(request: Request) -> OpenAIServingModels:
     return request.app.state.openai_serving_models
 
 
-def responses(request: Request) -> Optional[OpenAIServingResponses]:
+def responses(request: Request) -> OpenAIServingResponses | None:
     return request.app.state.openai_serving_responses
 
 
-def chat(request: Request) -> Optional[OpenAIServingChat]:
+def messages(request: Request) -> AnthropicServingMessages:
+    return request.app.state.anthropic_serving_messages
+
+
+def chat(request: Request) -> OpenAIServingChat | None:
     return request.app.state.openai_serving_chat
 
 
-def completion(request: Request) -> Optional[OpenAIServingCompletion]:
+def completion(request: Request) -> OpenAIServingCompletion | None:
     return request.app.state.openai_serving_completion
 
 
-def pooling(request: Request) -> Optional[OpenAIServingPooling]:
+def pooling(request: Request) -> OpenAIServingPooling | None:
     return request.app.state.openai_serving_pooling
 
 
-def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
+def embedding(request: Request) -> OpenAIServingEmbedding | None:
     return request.app.state.openai_serving_embedding
 
 
-def score(request: Request) -> Optional[ServingScores]:
+def score(request: Request) -> ServingScores | None:
     return request.app.state.openai_serving_scores
 
 
-def classify(request: Request) -> Optional[ServingClassification]:
+def classify(request: Request) -> ServingClassification | None:
     return request.app.state.openai_serving_classification
 
 
-def rerank(request: Request) -> Optional[ServingScores]:
+def rerank(request: Request) -> ServingScores | None:
     return request.app.state.openai_serving_scores
 
 
@@ -342,6 +360,10 @@ def engine_client(request: Request) -> EngineClient:
     return request.app.state.engine_client
 
 
+def generate_tokens(request: Request) -> ServingTokens | None:
+    return request.app.state.serving_tokens
+
+
 @router.get("/health", response_class=Response)
 async def health(raw_request: Request) -> Response:
     """Health check."""
@@ -368,33 +390,19 @@ async def get_server_load_metrics(request: Request):
     # - /rerank
     # - /v1/rerank
     # - /v2/rerank
-    return JSONResponse(
-        content={'server_load': request.app.state.server_load_metrics})
-
-
-@router.get("/ping", response_class=Response)
-@router.post("/ping", response_class=Response)
-async def ping(raw_request: Request) -> Response:
-    """Ping check. Endpoint required for SageMaker"""
-    return await health(raw_request)
-
-
-@router.post("/tokenize",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.NOT_FOUND.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.NOT_IMPLEMENTED.value: {
-                     "model": ErrorResponse
-                 },
-             })
+    return JSONResponse(content={"server_load": request.app.state.server_load_metrics})
+
+
+@router.post(
+    "/tokenize",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
     handler = tokenization(raw_request)
@@ -402,34 +410,33 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
     try:
         generator = await handler.create_tokenize(request, raw_request)
     except NotImplementedError as e:
-        raise HTTPException(status_code=HTTPStatus.NOT_IMPLEMENTED.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
+        ) from e
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, TokenizeResponse):
         return JSONResponse(content=generator.model_dump())
 
     assert_never(generator)
 
 
-@router.post("/detokenize",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.NOT_FOUND.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/detokenize",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 async def detokenize(request: DetokenizeRequest, raw_request: Request):
     handler = tokenization(raw_request)
@@ -439,12 +446,14 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
     except OverflowError as e:
         raise RequestValidationError(errors=[str(e)]) from e
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, DetokenizeResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -453,15 +462,18 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
 def maybe_register_tokenizer_info_endpoint(args):
     """Conditionally register the tokenizer info endpoint if enabled."""
-    if getattr(args, 'enable_tokenizer_info_endpoint', False):
+    if getattr(args, "enable_tokenizer_info_endpoint", False):
 
         @router.get("/tokenizer_info")
         async def get_tokenizer_info(raw_request: Request):
             """Get comprehensive tokenizer information."""
             result = await tokenization(raw_request).get_tokenizer_info()
-            return JSONResponse(content=result.model_dump(),
-                                status_code=result.error.code if isinstance(
-                                    result, ErrorResponse) else 200)
+            return JSONResponse(
+                content=result.model_dump(),
+                status_code=result.error.code
+                if isinstance(result, ErrorResponse)
+                else 200,
+            )
 
 
 @router.get("/v1/models")
@@ -479,68 +491,66 @@ async def show_version():
 
 
 async def _convert_stream_to_sse_events(
-    generator: AsyncGenerator[StreamingResponsesResponse, None]
+    generator: AsyncGenerator[StreamingResponsesResponse, None],
 ) -> AsyncGenerator[str, None]:
     """Convert the generator to a stream of events in SSE format"""
     async for event in generator:
-        event_type = getattr(event, 'type', 'unknown')
+        event_type = getattr(event, "type", "unknown")
         # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format
-        event_data = (f"event: {event_type}\n"
-                      f"data: {event.model_dump_json(indent=None)}\n\n")
+        event_data = (
+            f"event: {event_type}\ndata: {event.model_dump_json(indent=None)}\n\n"
+        )
         yield event_data
 
 
-@router.post("/v1/responses",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.OK.value: {
-                     "content": {
-                         "text/event-stream": {}
-                     }
-                 },
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.NOT_FOUND.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/responses",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 async def create_responses(request: ResponsesRequest, raw_request: Request):
     handler = responses(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Responses API")
+            message="The model does not support Responses API"
+        )
     try:
         generator = await handler.create_responses(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, ResponsesResponse):
         return JSONResponse(content=generator.model_dump())
 
-    return StreamingResponse(content=_convert_stream_to_sse_events(generator),
-                             media_type="text/event-stream")
+    return StreamingResponse(
+        content=_convert_stream_to_sse_events(generator), media_type="text/event-stream"
+    )
 
 
 @router.get("/v1/responses/{response_id}")
 async def retrieve_responses(
     response_id: str,
     raw_request: Request,
-    starting_after: Optional[int] = None,
-    stream: Optional[bool] = False,
+    starting_after: int | None = None,
+    stream: bool | None = False,
 ):
     handler = responses(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Responses API")
+            message="The model does not support Responses API"
+        )
 
     try:
         response = await handler.retrieve_responses(
@@ -549,16 +559,19 @@ async def retrieve_responses(
             stream=stream,
         )
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(response, ErrorResponse):
-        return JSONResponse(content=response.model_dump(),
-                            status_code=response.error.code)
+        return JSONResponse(
+            content=response.model_dump(), status_code=response.error.code
+        )
     elif isinstance(response, ResponsesResponse):
         return JSONResponse(content=response.model_dump())
-    return StreamingResponse(content=_convert_stream_to_sse_events(response),
-                             media_type="text/event-stream")
+    return StreamingResponse(
+        content=_convert_stream_to_sse_events(response), media_type="text/event-stream"
+    )
 
 
 @router.post("/v1/responses/{response_id}/cancel")
@@ -566,165 +579,243 @@ async def cancel_responses(response_id: str, raw_request: Request):
     handler = responses(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Responses API")
+            message="The model does not support Responses API"
+        )
 
     try:
         response = await handler.cancel_responses(response_id)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(response, ErrorResponse):
-        return JSONResponse(content=response.model_dump(),
-                            status_code=response.error.code)
+        return JSONResponse(
+            content=response.model_dump(), status_code=response.error.code
+        )
     return JSONResponse(content=response.model_dump())
 
 
-@router.post("/v1/chat/completions",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.OK.value: {
-                     "content": {
-                         "text/event-stream": {}
-                     }
-                 },
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.NOT_FOUND.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 }
-             })
+@router.post(
+    "/v1/messages",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
-async def create_chat_completion(request: ChatCompletionRequest,
-                                 raw_request: Request):
+async def create_messages(request: AnthropicMessagesRequest, raw_request: Request):
+    def translate_error_response(response: ErrorResponse) -> JSONResponse:
+        anthropic_error = AnthropicErrorResponse(
+            error=AnthropicError(
+                type=response.error.type,
+                message=response.error.message,
+            )
+        )
+        return JSONResponse(
+            status_code=response.error.code, content=anthropic_error.model_dump()
+        )
+
+    handler = messages(raw_request)
+    if handler is None:
+        error = base(raw_request).create_error_response(
+            message="The model does not support Messages API"
+        )
+        return translate_error_response(error)
+
+    try:
+        generator = await handler.create_messages(request, raw_request)
+    except Exception as e:
+        logger.exception("Error in create_messages: %s", e)
+        return JSONResponse(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+            content=AnthropicErrorResponse(
+                error=AnthropicError(
+                    type="internal_error",
+                    message=str(e),
+                )
+            ).model_dump(),
+        )
+
+    if isinstance(generator, ErrorResponse):
+        return translate_error_response(generator)
+
+    elif isinstance(generator, AnthropicMessagesResponse):
+        resp = generator.model_dump(exclude_none=True)
+        logger.debug("Anthropic Messages Response: %s", resp)
+        return JSONResponse(content=resp)
+
+    return StreamingResponse(content=generator, media_type="text/event-stream")
+
+
+@router.post(
+    "/v1/chat/completions",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
+    metrics_header_format = raw_request.headers.get(
+        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
+    )
     handler = chat(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Chat Completions API")
+            message="The model does not support Chat Completions API"
+        )
     try:
         generator = await handler.create_chat_completion(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
 
     elif isinstance(generator, ChatCompletionResponse):
-        return JSONResponse(content=generator.model_dump())
+        return JSONResponse(
+            content=generator.model_dump(),
+            headers=metrics_header(metrics_header_format),
+        )
 
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post("/v1/completions",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.OK.value: {
-                     "content": {
-                         "text/event-stream": {}
-                     }
-                 },
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.NOT_FOUND.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/completions",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
 async def create_completion(request: CompletionRequest, raw_request: Request):
+    metrics_header_format = raw_request.headers.get(
+        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
+    )
     handler = completion(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Completions API")
+            message="The model does not support Completions API"
+        )
 
     try:
         generator = await handler.create_completion(request, raw_request)
     except OverflowError as e:
-        raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
+        ) from e
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, CompletionResponse):
-        return JSONResponse(content=generator.model_dump())
+        return JSONResponse(
+            content=generator.model_dump(),
+            headers=metrics_header(metrics_header_format),
+        )
 
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post("/v1/embeddings",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/embeddings",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
-async def create_embedding(request: EmbeddingRequest, raw_request: Request):
+async def create_embedding(
+    request: EmbeddingRequest,
+    raw_request: Request,
+):
     handler = embedding(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Embeddings API")
+            message="The model does not support Embeddings API"
+        )
 
     try:
         generator = await handler.create_embedding(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, EmbeddingResponse):
         return JSONResponse(content=generator.model_dump())
+    elif isinstance(generator, EmbeddingBytesResponse):
+        return StreamingResponse(
+            content=generator.body,
+            headers={"metadata": generator.metadata},
+            media_type=generator.media_type,
+        )
 
     assert_never(generator)
 
 
-@router.post("/pooling",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/pooling",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
 async def create_pooling(request: PoolingRequest, raw_request: Request):
     handler = pooling(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Pooling API")
+            message="The model does not support Pooling API"
+        )
     try:
         generator = await handler.create_pooling(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, (PoolingResponse, IOProcessorResponse)):
         return JSONResponse(content=generator.model_dump())
+    elif isinstance(generator, PoolingBytesResponse):
+        return StreamingResponse(
+            content=generator.body,
+            headers={"metadata": generator.metadata},
+            media_type=generator.media_type,
+        )
 
     assert_never(generator)
 
@@ -732,21 +823,23 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
 @router.post("/classify", dependencies=[Depends(validate_json_request)])
 @with_cancellation
 @load_aware_call
-async def create_classify(request: ClassificationRequest,
-                          raw_request: Request):
+async def create_classify(request: ClassificationRequest, raw_request: Request):
     handler = classify(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Classification API")
+            message="The model does not support Classification API"
+        )
 
     try:
         generator = await handler.create_classify(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
 
     elif isinstance(generator, ClassificationResponse):
         return JSONResponse(content=generator.model_dump())
@@ -754,96 +847,90 @@ async def create_classify(request: ClassificationRequest,
     assert_never(generator)
 
 
-@router.post("/score",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/score",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
 async def create_score(request: ScoreRequest, raw_request: Request):
     handler = score(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Score API")
+            message="The model does not support Score API"
+        )
 
     try:
         generator = await handler.create_score(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, ScoreResponse):
         return JSONResponse(content=generator.model_dump())
 
     assert_never(generator)
 
 
-@router.post("/v1/score",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/score",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
 async def create_score_v1(request: ScoreRequest, raw_request: Request):
     logger.warning(
         "To indicate that Score API is not part of standard OpenAI API, we "
-        "have moved it to `/score`. Please update your client accordingly.")
+        "have moved it to `/score`. Please update your client accordingly."
+    )
 
     return await create_score(request, raw_request)
 
 
-@router.post("/v1/audio/transcriptions",
-             responses={
-                 HTTPStatus.OK.value: {
-                     "content": {
-                         "text/event-stream": {}
-                     }
-                 },
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.UNPROCESSABLE_ENTITY.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/audio/transcriptions",
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
-async def create_transcriptions(raw_request: Request,
-                                request: Annotated[TranscriptionRequest,
-                                                   Form()]):
+async def create_transcriptions(
+    raw_request: Request, request: Annotated[TranscriptionRequest, Form()]
+):
     handler = transcription(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Transcriptions API")
+            message="The model does not support Transcriptions API"
+        )
 
     audio_data = await request.file.read()
     try:
-        generator = await handler.create_transcription(audio_data, request,
-                                                       raw_request)
+        generator = await handler.create_transcription(audio_data, request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
 
     elif isinstance(generator, TranscriptionResponse):
         return JSONResponse(content=generator.model_dump())
@@ -851,44 +938,38 @@ async def create_transcriptions(raw_request: Request,
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post("/v1/audio/translations",
-             responses={
-                 HTTPStatus.OK.value: {
-                     "content": {
-                         "text/event-stream": {}
-                     }
-                 },
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.UNPROCESSABLE_ENTITY.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/audio/translations",
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
-async def create_translations(request: Annotated[TranslationRequest,
-                                                 Form()],
-                              raw_request: Request):
+async def create_translations(
+    request: Annotated[TranslationRequest, Form()], raw_request: Request
+):
     handler = translation(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Translations API")
+            message="The model does not support Translations API"
+        )
 
     audio_data = await request.file.read()
     try:
-        generator = await handler.create_translation(audio_data, request,
-                                                     raw_request)
+        generator = await handler.create_translation(audio_data, request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
 
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
 
     elif isinstance(generator, TranslationResponse):
         return JSONResponse(content=generator.model_dump())
@@ -896,90 +977,88 @@ async def create_translations(request: Annotated[TranslationRequest,
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post("/rerank",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/rerank",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 @load_aware_call
 async def do_rerank(request: RerankRequest, raw_request: Request):
     handler = rerank(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
-            message="The model does not support Rerank (Score) API")
+            message="The model does not support Rerank (Score) API"
+        )
     try:
         generator = await handler.do_rerank(request, raw_request)
     except Exception as e:
-        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                            detail=str(e)) from e
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
     if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.error.code)
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
     elif isinstance(generator, RerankResponse):
         return JSONResponse(content=generator.model_dump())
 
     assert_never(generator)
 
 
-@router.post("/v1/rerank",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v1/rerank",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 async def do_rerank_v1(request: RerankRequest, raw_request: Request):
     logger.warning_once(
         "To indicate that the rerank API is not part of the standard OpenAI"
         " API, we have located it at `/rerank`. Please update your client "
-        "accordingly. (Note: Conforms to JinaAI rerank API)")
+        "accordingly. (Note: Conforms to JinaAI rerank API)"
+    )
 
     return await do_rerank(request, raw_request)
 
 
-@router.post("/v2/rerank",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/v2/rerank",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 @with_cancellation
 async def do_rerank_v2(request: RerankRequest, raw_request: Request):
     return await do_rerank(request, raw_request)
 
 
 if envs.VLLM_SERVER_DEV_MODE:
-    logger.warning("SECURITY WARNING: Development endpoints are enabled! "
-                   "This should NOT be used in production!")
+    logger.warning(
+        "SECURITY WARNING: Development endpoints are enabled! "
+        "This should NOT be used in production!"
+    )
 
     PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
 
     @router.get("/server_info")
     async def show_server_info(
         raw_request: Request,
-        config_format: Annotated[Literal["text", "json"],
-                                 Query()] = "text",
+        config_format: Annotated[Literal["text", "json"], Query()] = "text",
     ):
         vllm_config: VllmConfig = raw_request.app.state.vllm_config
         server_info = {
-            "vllm_config":
-            str(vllm_config)
-            if config_format == "text" else PydanticVllmConfig.dump_python(
-                vllm_config, mode="json", fallback=str)
+            "vllm_config": str(vllm_config)
+            if config_format == "text"
+            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
             # fallback=str is needed to handle e.g. torch.dtype
         }
         return JSONResponse(content=server_info)
@@ -990,12 +1069,18 @@ if envs.VLLM_SERVER_DEV_MODE:
         Reset the prefix cache. Note that we currently do not check if the
         prefix cache is successfully reset in the API server.
         """
-        device = None
-        device_str = raw_request.query_params.get("device")
-        if device_str is not None:
-            device = Device[device_str.upper()]
-        logger.info("Resetting prefix cache with specific %s...", str(device))
-        await engine_client(raw_request).reset_prefix_cache(device)
+        logger.info("Resetting prefix cache...")
+        await engine_client(raw_request).reset_prefix_cache()
+        return Response(status_code=200)
+
+    @router.post("/reset_mm_cache")
+    async def reset_mm_cache(raw_request: Request):
+        """
+        Reset the multi-modal cache. Note that we currently do not check if the
+        multi-modal cache is successfully reset in the API server.
+        """
+        logger.info("Resetting multi-modal cache...")
+        await engine_client(raw_request).reset_mm_cache()
         return Response(status_code=200)
 
     @router.post("/sleep")
@@ -1030,19 +1115,24 @@ if envs.VLLM_SERVER_DEV_MODE:
         try:
             body = await raw_request.json()
         except json.JSONDecodeError as e:
-            raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
-                                detail=f"JSON decode error: {e}") from e
+            raise HTTPException(
+                status_code=HTTPStatus.BAD_REQUEST.value,
+                detail=f"JSON decode error: {e}",
+            ) from e
         method = body.get("method")
         if method is None:
-            raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
-                                detail="Missing 'method' in request body")
+            raise HTTPException(
+                status_code=HTTPStatus.BAD_REQUEST.value,
+                detail="Missing 'method' in request body",
+            )
         # For security reason, only serialized string args/kwargs are passed.
         # User-defined `method` is responsible for deserialization if needed.
         args: list[str] = body.get("args", [])
         kwargs: dict[str, str] = body.get("kwargs", {})
-        timeout: Optional[float] = body.get("timeout")
+        timeout: float | None = body.get("timeout")
         results = await engine_client(raw_request).collective_rpc(
-            method=method, timeout=timeout, args=tuple(args), kwargs=kwargs)
+            method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
+        )
         if results is None:
             return Response(status_code=200)
         response: list[Any] = []
@@ -1054,45 +1144,39 @@ if envs.VLLM_SERVER_DEV_MODE:
         return JSONResponse(content={"results": response})
 
 
-@router.post("/scale_elastic_ep",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.OK.value: {
-                     "model": dict
-                 },
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.REQUEST_TIMEOUT.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
+@router.post(
+    "/scale_elastic_ep",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"model": dict},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.REQUEST_TIMEOUT.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
 async def scale_elastic_ep(raw_request: Request):
     try:
         body = await raw_request.json()
     except json.JSONDecodeError as e:
-        raise HTTPException(status_code=400,
-                            detail="Invalid JSON format") from e  # noqa: B904
+        raise HTTPException(status_code=400, detail="Invalid JSON format") from e  # noqa: B904
 
     new_data_parallel_size = body.get("new_data_parallel_size")
     drain_timeout = body.get("drain_timeout", 120)  # Default 2 minutes
 
     if new_data_parallel_size is None:
-        raise HTTPException(status_code=400,
-                            detail="new_data_parallel_size is required")
+        raise HTTPException(
+            status_code=400, detail="new_data_parallel_size is required"
+        )
 
-    if not isinstance(new_data_parallel_size,
-                      int) or new_data_parallel_size <= 0:
+    if not isinstance(new_data_parallel_size, int) or new_data_parallel_size <= 0:
         raise HTTPException(
-            status_code=400,
-            detail="new_data_parallel_size must be a positive integer")
+            status_code=400, detail="new_data_parallel_size must be a positive integer"
+        )
 
     if not isinstance(drain_timeout, int) or drain_timeout <= 0:
-        raise HTTPException(status_code=400,
-                            detail="drain_timeout must be a positive integer")
+        raise HTTPException(
+            status_code=400, detail="drain_timeout must be a positive integer"
+        )
 
     # Set scaling flag to prevent new requests
     global _scaling_elastic_ep
@@ -1100,15 +1184,17 @@ async def scale_elastic_ep(raw_request: Request):
     client = engine_client(raw_request)
     try:
         await client.scale_elastic_ep(new_data_parallel_size, drain_timeout)
-        return JSONResponse({
-            "message":
-            f"Scaled to {new_data_parallel_size} "
-            "data parallel engines",
-        })
+        return JSONResponse(
+            {
+                "message": f"Scaled to {new_data_parallel_size} data parallel engines",
+            }
+        )
     except TimeoutError as e:
-        raise HTTPException(status_code=408,
-                            detail="Scale failed due to request drain timeout "
-                            f"after {drain_timeout} seconds") from e
+        raise HTTPException(
+            status_code=408,
+            detail="Scale failed due to request drain timeout "
+            f"after {drain_timeout} seconds",
+        ) from e
     except Exception as e:
         logger.error("Scale failed: %s", e)
         raise HTTPException(status_code=500, detail="Scale failed") from e
@@ -1124,7 +1210,7 @@ async def is_scaling_elastic_ep(raw_request: Request):
 # TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
 # (requires typing_extensions >= 4.13)
 RequestType = Any
-GetHandlerFn = Callable[[Request], Optional[OpenAIServing]]
+GetHandlerFn = Callable[[Request], OpenAIServing | None]
 EndpointFn = Callable[[RequestType, Request], Awaitable[Any]]
 
 # NOTE: Items defined earlier take higher priority
@@ -1145,54 +1231,52 @@ INVOCATION_VALIDATORS = [
 ]
 
 
-@router.post("/invocations",
-             dependencies=[Depends(validate_json_request)],
-             responses={
-                 HTTPStatus.BAD_REQUEST.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: {
-                     "model": ErrorResponse
-                 },
-                 HTTPStatus.INTERNAL_SERVER_ERROR.value: {
-                     "model": ErrorResponse
-                 },
-             })
-async def invocations(raw_request: Request):
-    """For SageMaker, routes requests based on the request type."""
+@router.post(
+    "/inference/v1/generate",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def generate(request: GenerateRequest, raw_request: Request):
+    handler = generate_tokens(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support generate tokens API"
+        )
     try:
-        body = await raw_request.json()
-    except json.JSONDecodeError as e:
-        raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
-                            detail=f"JSON decode error: {e}") from e
-
-    valid_endpoints = [(validator, endpoint)
-                       for validator, (get_handler,
-                                       endpoint) in INVOCATION_VALIDATORS
-                       if get_handler(raw_request) is not None]
-
-    for request_validator, endpoint in valid_endpoints:
-        try:
-            request = request_validator.validate_python(body)
-        except pydantic.ValidationError:
-            continue
+        generator = await handler.serve_tokens(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
 
-        return await endpoint(request, raw_request)
+    elif isinstance(generator, GenerateResponse):
+        return JSONResponse(content=generator.model_dump())
 
-    type_names = [
-        t.__name__ if isinstance(t := validator._type, type) else str(t)
-        for validator, _ in valid_endpoints
-    ]
-    msg = ("Cannot find suitable handler for request. "
-           f"Expected one of: {type_names}")
-    res = base(raw_request).create_error_response(message=msg)
-    return JSONResponse(content=res.model_dump(), status_code=res.error.code)
+    return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
 if envs.VLLM_TORCH_PROFILER_DIR:
-    logger.warning(
+    logger.warning_once(
         "Torch Profiler is enabled in the API server. This should ONLY be "
-        "used for local development!")
+        "used for local development!"
+    )
+elif envs.VLLM_TORCH_CUDA_PROFILE:
+    logger.warning_once(
+        "CUDA Profiler is enabled in the API server. This should ONLY be "
+        "used for local development!"
+    )
+if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
 
     @router.post("/start_profile")
     async def start_profile(raw_request: Request):
@@ -1209,45 +1293,16 @@ if envs.VLLM_TORCH_PROFILER_DIR:
         return Response(status_code=200)
 
 
-if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
-    logger.warning(
-        "LoRA dynamic loading & unloading is enabled in the API server. "
-        "This should ONLY be used for local development!")
-
-    @router.post("/v1/load_lora_adapter",
-                 dependencies=[Depends(validate_json_request)])
-    async def load_lora_adapter(request: LoadLoRAAdapterRequest,
-                                raw_request: Request):
-        handler = models(raw_request)
-        response = await handler.load_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.error.code)
-
-        return Response(status_code=200, content=response)
-
-    @router.post("/v1/unload_lora_adapter",
-                 dependencies=[Depends(validate_json_request)])
-    async def unload_lora_adapter(request: UnloadLoRAAdapterRequest,
-                                  raw_request: Request):
-        handler = models(raw_request)
-        response = await handler.unload_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.error.code)
-
-        return Response(status_code=200, content=response)
-
-
-def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
+def load_log_config(log_config_file: str | None) -> dict | None:
     if not log_config_file:
         return None
     try:
         with open(log_config_file) as f:
             return json.load(f)
     except Exception as e:
-        logger.warning("Failed to load log config from file %s: error %s",
-                       log_config_file, e)
+        logger.warning(
+            "Failed to load log config from file %s: error %s", log_config_file, e
+        )
         return None
 
 
@@ -1265,9 +1320,7 @@ class AuthenticationMiddleware:
 
     def __init__(self, app: ASGIApp, tokens: list[str]) -> None:
         self.app = app
-        self.api_tokens = [
-            hashlib.sha256(t.encode("utf-8")).digest() for t in tokens
-        ]
+        self.api_tokens = [hashlib.sha256(t.encode("utf-8")).digest() for t in tokens]
 
     def verify_token(self, headers: Headers) -> bool:
         authorization_header_value = headers.get("Authorization")
@@ -1286,10 +1339,8 @@ class AuthenticationMiddleware:
 
         return token_match
 
-    def __call__(self, scope: Scope, receive: Receive,
-                 send: Send) -> Awaitable[None]:
-        if scope["type"] not in ("http",
-                                 "websocket") or scope["method"] == "OPTIONS":
+    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
+        if scope["type"] not in ("http", "websocket") or scope["method"] == "OPTIONS":
             # scope["type"] can be "lifespan" or "startup" for example,
             # in which case we don't need to do anything
             return self.app(scope, receive, send)
@@ -1298,8 +1349,7 @@ class AuthenticationMiddleware:
         headers = Headers(scope=scope)
         # Type narrow to satisfy mypy.
         if url_path.startswith("/v1") and not self.verify_token(headers):
-            response = JSONResponse(content={"error": "Unauthorized"},
-                                    status_code=401)
+            response = JSONResponse(content={"error": "Unauthorized"}, status_code=401)
             return response(scope, receive, send)
         return self.app(scope, receive, send)
 
@@ -1314,8 +1364,7 @@ class XRequestIdMiddleware:
     def __init__(self, app: ASGIApp) -> None:
         self.app = app
 
-    def __call__(self, scope: Scope, receive: Receive,
-                 send: Send) -> Awaitable[None]:
+    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
         if scope["type"] not in ("http", "websocket"):
             return self.app(scope, receive, send)
 
@@ -1329,8 +1378,7 @@ class XRequestIdMiddleware:
             """
             if message["type"] == "http.response.start":
                 response_headers = MutableHeaders(raw=message["headers"])
-                request_id = request_headers.get("X-Request-Id",
-                                                 uuid.uuid4().hex)
+                request_id = request_headers.get("X-Request-Id", uuid.uuid4().hex)
                 response_headers.append("X-Request-Id", request_id)
             await send(message)
 
@@ -1353,8 +1401,7 @@ class ScalingMiddleware:
     def __init__(self, app: ASGIApp) -> None:
         self.app = app
 
-    def __call__(self, scope: Scope, receive: Receive,
-                 send: Send) -> Awaitable[None]:
+    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
         if scope["type"] != "http":
             return self.app(scope, receive, send)
 
@@ -1362,11 +1409,12 @@ class ScalingMiddleware:
         global _scaling_elastic_ep
         if _scaling_elastic_ep:
             # Return 503 Service Unavailable response
-            response = JSONResponse(content={
-                "error":
-                "The model is currently scaling. Please try again later."
-            },
-                                    status_code=503)
+            response = JSONResponse(
+                content={
+                    "error": "The model is currently scaling. Please try again later."
+                },
+                status_code=503,
+            )
             return response(scope, receive, send)
 
         return self.app(scope, receive, send)
@@ -1376,28 +1424,27 @@ def _extract_content_from_chunk(chunk_data: dict) -> str:
     """Extract content from a streaming response chunk."""
     try:
         from vllm.entrypoints.openai.protocol import (
-            ChatCompletionStreamResponse, CompletionStreamResponse)
+            ChatCompletionStreamResponse,
+            CompletionStreamResponse,
+        )
 
         # Try using Completion types for type-safe parsing
-        if chunk_data.get('object') == 'chat.completion.chunk':
-            chat_response = ChatCompletionStreamResponse.model_validate(
-                chunk_data)
+        if chunk_data.get("object") == "chat.completion.chunk":
+            chat_response = ChatCompletionStreamResponse.model_validate(chunk_data)
             if chat_response.choices and chat_response.choices[0].delta.content:
                 return chat_response.choices[0].delta.content
-        elif chunk_data.get('object') == 'text_completion':
-            completion_response = CompletionStreamResponse.model_validate(
-                chunk_data)
-            if completion_response.choices and completion_response.choices[
-                    0].text:
+        elif chunk_data.get("object") == "text_completion":
+            completion_response = CompletionStreamResponse.model_validate(chunk_data)
+            if completion_response.choices and completion_response.choices[0].text:
                 return completion_response.choices[0].text
     except pydantic.ValidationError:
         # Fallback to manual parsing
-        if 'choices' in chunk_data and chunk_data['choices']:
-            choice = chunk_data['choices'][0]
-            if 'delta' in choice and choice['delta'].get('content'):
-                return choice['delta']['content']
-            elif choice.get('text'):
-                return choice['text']
+        if "choices" in chunk_data and chunk_data["choices"]:
+            choice = chunk_data["choices"][0]
+            if "delta" in choice and choice["delta"].get("content"):
+                return choice["delta"]["content"]
+            elif choice.get("text"):
+                return choice["text"]
     return ""
 
 
@@ -1413,7 +1460,7 @@ class SSEDecoder:
         import json
 
         try:
-            chunk_str = chunk.decode('utf-8')
+            chunk_str = chunk.decode("utf-8")
         except UnicodeDecodeError:
             # Skip malformed chunks
             return []
@@ -1422,18 +1469,18 @@ class SSEDecoder:
         events = []
 
         # Process complete lines
-        while '\n' in self.buffer:
-            line, self.buffer = self.buffer.split('\n', 1)
-            line = line.rstrip('\r')  # Handle CRLF
+        while "\n" in self.buffer:
+            line, self.buffer = self.buffer.split("\n", 1)
+            line = line.rstrip("\r")  # Handle CRLF
 
-            if line.startswith('data: '):
+            if line.startswith("data: "):
                 data_str = line[6:].strip()
-                if data_str == '[DONE]':
-                    events.append({'type': 'done'})
+                if data_str == "[DONE]":
+                    events.append({"type": "done"})
                 elif data_str:
                     try:
                         event_data = json.loads(data_str)
-                        events.append({'type': 'data', 'data': event_data})
+                        events.append({"type": "data", "data": event_data})
                     except json.JSONDecodeError:
                         # Skip malformed JSON
                         continue
@@ -1451,7 +1498,7 @@ class SSEDecoder:
 
     def get_complete_content(self) -> str:
         """Get the complete buffered content."""
-        return ''.join(self.content_buffer)
+        return "".join(self.content_buffer)
 
 
 def _log_streaming_response(response, response_body: list) -> None:
@@ -1472,10 +1519,10 @@ def _log_streaming_response(response, response_body: list) -> None:
             events = sse_decoder.decode_chunk(chunk)
 
             for event in events:
-                if event['type'] == 'data':
-                    content = sse_decoder.extract_content(event['data'])
+                if event["type"] == "data":
+                    content = sse_decoder.extract_content(event["data"])
                     sse_decoder.add_content(content)
-                elif event['type'] == 'done':
+                elif event["type"] == "done":
                     # Log complete content when done
                     full_content = sse_decoder.get_complete_content()
                     if full_content:
@@ -1484,19 +1531,19 @@ def _log_streaming_response(response, response_body: list) -> None:
                             full_content = full_content[:2048] + ""
                             "...[truncated]"
                         logger.info(
-                            "response_body={streaming_complete: " \
-                            "content='%s', chunks=%d}",
-                            full_content, chunk_count)
+                            "response_body={streaming_complete: content=%r, chunks=%d}",
+                            full_content,
+                            chunk_count,
+                        )
                     else:
                         logger.info(
-                            "response_body={streaming_complete: " \
-                            "no_content, chunks=%d}",
-                            chunk_count)
+                            "response_body={streaming_complete: no_content, chunks=%d}",
+                            chunk_count,
+                        )
                     return
 
     response.body_iterator = iterate_in_threadpool(buffered_iterator())
-    logger.info("response_body={streaming_started: chunks=%d}",
-                len(response_body))
+    logger.info("response_body={streaming_started: chunks=%d}", len(response_body))
 
 
 def _log_non_streaming_response(response_body: list) -> None:
@@ -1510,12 +1557,25 @@ def _log_non_streaming_response(response_body: list) -> None:
 
 def build_app(args: Namespace) -> FastAPI:
     if args.disable_fastapi_docs:
-        app = FastAPI(openapi_url=None,
-                      docs_url=None,
-                      redoc_url=None,
-                      lifespan=lifespan)
+        app = FastAPI(
+            openapi_url=None, docs_url=None, redoc_url=None, lifespan=lifespan
+        )
     else:
         app = FastAPI(lifespan=lifespan)
+
+    if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+        logger.warning(
+            "LoRA dynamic loading & unloading is enabled in the API server. "
+            "This should ONLY be used for local development!"
+        )
+        from vllm.entrypoints.dynamic_lora import register_dynamic_lora_routes
+
+        register_dynamic_lora_routes(router)
+
+    from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
+
+    register_sagemaker_routes(router)
+
     app.include_router(router)
     app.root_path = args.root_path
 
@@ -1532,14 +1592,16 @@ def build_app(args: Namespace) -> FastAPI:
     @app.exception_handler(HTTPException)
     async def http_exception_handler(_: Request, exc: HTTPException):
         err = ErrorResponse(
-            error=ErrorInfo(message=exc.detail,
-                            type=HTTPStatus(exc.status_code).phrase,
-                            code=exc.status_code))
+            error=ErrorInfo(
+                message=exc.detail,
+                type=HTTPStatus(exc.status_code).phrase,
+                code=exc.status_code,
+            )
+        )
         return JSONResponse(err.model_dump(), status_code=exc.status_code)
 
     @app.exception_handler(RequestValidationError)
-    async def validation_exception_handler(_: Request,
-                                           exc: RequestValidationError):
+    async def validation_exception_handler(_: Request, exc: RequestValidationError):
         exc_str = str(exc)
         errors_str = str(exc.errors())
 
@@ -1548,11 +1610,14 @@ def build_app(args: Namespace) -> FastAPI:
         else:
             message = exc_str
 
-        err = ErrorResponse(error=ErrorInfo(message=message,
-                                            type=HTTPStatus.BAD_REQUEST.phrase,
-                                            code=HTTPStatus.BAD_REQUEST))
-        return JSONResponse(err.model_dump(),
-                            status_code=HTTPStatus.BAD_REQUEST)
+        err = ErrorResponse(
+            error=ErrorInfo(
+                message=message,
+                type=HTTPStatus.BAD_REQUEST.phrase,
+                code=HTTPStatus.BAD_REQUEST,
+            )
+        )
+        return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
 
     # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
     if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
@@ -1565,16 +1630,16 @@ def build_app(args: Namespace) -> FastAPI:
     app.add_middleware(ScalingMiddleware)
 
     if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE:
-        logger.warning("CAUTION: Enabling log response in the API Server. "
-                       "This can include sensitive information and should be "
-                       "avoided in production.")
+        logger.warning(
+            "CAUTION: Enabling log response in the API Server. "
+            "This can include sensitive information and should be "
+            "avoided in production."
+        )
 
         @app.middleware("http")
         async def log_response(request: Request, call_next):
             response = await call_next(request)
-            response_body = [
-                section async for section in response.body_iterator
-            ]
+            response_body = [section async for section in response.body_iterator]
             response.body_iterator = iterate_in_threadpool(iter(response_body))
             # Check if this is a streaming response by looking at content-type
             content_type = response.headers.get("content-type", "")
@@ -1597,18 +1662,47 @@ def build_app(args: Namespace) -> FastAPI:
         elif inspect.iscoroutinefunction(imported):
             app.middleware("http")(imported)
         else:
-            raise ValueError(f"Invalid middleware {middleware}. "
-                             f"Must be a function or a class.")
+            raise ValueError(
+                f"Invalid middleware {middleware}. Must be a function or a class."
+            )
+
+    app = sagemaker_standards.bootstrap(app)
+    # Optional endpoints
+    if args.tokens_only:
+
+        @app.post("/abort_requests")
+        async def abort_requests(raw_request: Request):
+            """
+            Abort one or more requests. To be used in a
+            Disaggregated Everything setup.
+            """
+            try:
+                body = await raw_request.json()
+            except json.JSONDecodeError as e:
+                raise HTTPException(
+                    status_code=HTTPStatus.BAD_REQUEST.value,
+                    detail=f"JSON decode error: {e}",
+                ) from e
+            request_ids = body.get("request_ids")
+            if request_ids is None:
+                raise HTTPException(
+                    status_code=HTTPStatus.BAD_REQUEST.value,
+                    detail="Missing 'request_ids' in request body",
+                )
+            # Abort requests in background
+            asyncio.create_task(engine_client(raw_request).abort(request_ids))
+            return Response(status_code=200)
 
     return app
 
 
 async def init_app_state(
     engine_client: EngineClient,
-    vllm_config: VllmConfig,
     state: State,
     args: Namespace,
 ) -> None:
+    vllm_config = engine_client.vllm_config
+
     if args.served_model_name is not None:
         served_model_names = args.served_model_name
     else:
@@ -1620,45 +1714,22 @@ async def init_app_state(
         request_logger = None
 
     base_model_paths = [
-        BaseModelPath(name=name, model_path=args.model)
-        for name in served_model_names
+        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
     ]
 
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
     state.vllm_config = vllm_config
-    model_config = vllm_config.model_config
 
     supported_tasks = await engine_client.get_supported_tasks()
+    logger.info("Supported tasks: %s", supported_tasks)
 
-    logger.info("Supported_tasks: %s", supported_tasks)
-
-    resolved_chat_template = load_chat_template(args.chat_template)
-    if resolved_chat_template is not None:
-        # Get the tokenizer to check official template
-        tokenizer = await engine_client.get_tokenizer()
-
-        if isinstance(tokenizer, MistralTokenizer):
-            # The warning is logged in resolve_mistral_chat_template.
-            resolved_chat_template = resolve_mistral_chat_template(
-                chat_template=resolved_chat_template)
-        else:
-            hf_chat_template = resolve_hf_chat_template(
-                tokenizer=tokenizer,
-                chat_template=None,
-                tools=None,
-                model_config=vllm_config.model_config,
-            )
-
-            if hf_chat_template != resolved_chat_template:
-                logger.warning(
-                    "Using supplied chat template: %s\n"
-                    "It is different from official chat template '%s'. "
-                    "This discrepancy may lead to performance degradation.",
-                    resolved_chat_template, args.model)
+    resolved_chat_template = await process_chat_template(
+        args.chat_template, engine_client, vllm_config.model_config
+    )
 
     if args.tool_server == "demo":
-        tool_server: Optional[ToolServer] = DemoToolServer()
+        tool_server: ToolServer | None = DemoToolServer()
         assert isinstance(tool_server, DemoToolServer)
         await tool_server.init_and_validate()
     elif args.tool_server:
@@ -1668,131 +1739,195 @@ async def init_app_state(
         tool_server = None
 
     # Merge default_mm_loras into the static lora_modules
-    default_mm_loras = (vllm_config.lora_config.default_mm_loras
-                        if vllm_config.lora_config is not None else {})
-
-    lora_modules = args.lora_modules
-    if default_mm_loras:
-        default_mm_lora_paths = [
-            LoRAModulePath(
-                name=modality,
-                path=lora_path,
-            ) for modality, lora_path in default_mm_loras.items()
-        ]
-        if args.lora_modules is None:
-            lora_modules = default_mm_lora_paths
-        else:
-            lora_modules += default_mm_lora_paths
+    default_mm_loras = (
+        vllm_config.lora_config.default_mm_loras
+        if vllm_config.lora_config is not None
+        else {}
+    )
+
+    default_mm_loras = (
+        vllm_config.lora_config.default_mm_loras
+        if vllm_config.lora_config is not None
+        else {}
+    )
+    lora_modules = process_lora_modules(args.lora_modules, default_mm_loras)
 
     state.openai_serving_models = OpenAIServingModels(
         engine_client=engine_client,
-        model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=lora_modules,
     )
     await state.openai_serving_models.init_static_loras()
-    state.openai_serving_responses = OpenAIServingResponses(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-        enable_auto_tools=args.enable_auto_tool_choice,
-        tool_parser=args.tool_call_parser,
-        tool_server=tool_server,
-        reasoning_parser=args.structured_outputs_config.reasoning_parser,
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-        enable_force_include_usage=args.enable_force_include_usage,
-        enable_log_outputs=args.enable_log_outputs,
-        log_error_stack=args.log_error_stack,
-    ) if "generate" in supported_tasks else None
-    state.openai_serving_chat = OpenAIServingChat(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        args.response_role,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        trust_request_chat_template=args.trust_request_chat_template,
-        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-        enable_auto_tools=args.enable_auto_tool_choice,
-        exclude_tools_when_tool_choice_none=args.
-        exclude_tools_when_tool_choice_none,
-        tool_parser=args.tool_call_parser,
-        reasoning_parser=args.structured_outputs_config.reasoning_parser,
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-        enable_force_include_usage=args.enable_force_include_usage,
-        enable_log_outputs=args.enable_log_outputs,
-        log_error_stack=args.log_error_stack,
-    ) if "generate" in supported_tasks else None
-    state.openai_serving_completion = OpenAIServingCompletion(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-        enable_force_include_usage=args.enable_force_include_usage,
-        log_error_stack=args.log_error_stack,
-    ) if "generate" in supported_tasks else None
-    state.openai_serving_pooling = OpenAIServingPooling(
-        engine_client,
-        vllm_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        log_error_stack=args.log_error_stack,
-    ) if "encode" in supported_tasks else None
-    state.openai_serving_embedding = OpenAIServingEmbedding(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        log_error_stack=args.log_error_stack,
-    ) if "embed" in supported_tasks else None
-    state.openai_serving_classification = ServingClassification(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        log_error_stack=args.log_error_stack,
-    ) if "classify" in supported_tasks else None
-    state.openai_serving_scores = ServingScores(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        log_error_stack=args.log_error_stack,
-    ) if ("embed" in supported_tasks or "score" in supported_tasks) else None
+    state.openai_serving_responses = (
+        OpenAIServingResponses(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            enable_auto_tools=args.enable_auto_tool_choice,
+            tool_parser=args.tool_call_parser,
+            tool_server=tool_server,
+            reasoning_parser=args.structured_outputs_config.reasoning_parser,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+            enable_log_outputs=args.enable_log_outputs,
+            log_error_stack=args.log_error_stack,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+    state.openai_serving_chat = (
+        OpenAIServingChat(
+            engine_client,
+            state.openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            trust_request_chat_template=args.trust_request_chat_template,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            enable_auto_tools=args.enable_auto_tool_choice,
+            exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+            tool_parser=args.tool_call_parser,
+            reasoning_parser=args.structured_outputs_config.reasoning_parser,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+            enable_log_outputs=args.enable_log_outputs,
+            log_error_stack=args.log_error_stack,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+    state.openai_serving_completion = (
+        OpenAIServingCompletion(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+            log_error_stack=args.log_error_stack,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+    state.openai_serving_pooling = (
+        (
+            OpenAIServingPooling(
+                engine_client,
+                state.openai_serving_models,
+                supported_tasks=supported_tasks,
+                request_logger=request_logger,
+                chat_template=resolved_chat_template,
+                chat_template_content_format=args.chat_template_content_format,
+                trust_request_chat_template=args.trust_request_chat_template,
+                log_error_stack=args.log_error_stack,
+            )
+        )
+        if any(task in POOLING_TASKS for task in supported_tasks)
+        else None
+    )
+    state.openai_serving_embedding = (
+        OpenAIServingEmbedding(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            trust_request_chat_template=args.trust_request_chat_template,
+            log_error_stack=args.log_error_stack,
+        )
+        if "embed" in supported_tasks
+        else None
+    )
+    state.openai_serving_classification = (
+        ServingClassification(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            trust_request_chat_template=args.trust_request_chat_template,
+            log_error_stack=args.log_error_stack,
+        )
+        if "classify" in supported_tasks
+        else None
+    )
+    state.openai_serving_scores = (
+        ServingScores(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            log_error_stack=args.log_error_stack,
+        )
+        if ("embed" in supported_tasks or "score" in supported_tasks)
+        else None
+    )
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
-        model_config,
         state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
         log_error_stack=args.log_error_stack,
     )
-    state.openai_serving_transcription = OpenAIServingTranscription(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        log_error_stack=args.log_error_stack,
-    ) if "transcription" in supported_tasks else None
-    state.openai_serving_translation = OpenAIServingTranslation(
-        engine_client,
-        model_config,
-        state.openai_serving_models,
-        request_logger=request_logger,
-        log_error_stack=args.log_error_stack,
-    ) if "transcription" in supported_tasks else None
+    state.openai_serving_transcription = (
+        OpenAIServingTranscription(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            log_error_stack=args.log_error_stack,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "transcription" in supported_tasks
+        else None
+    )
+    state.openai_serving_translation = (
+        OpenAIServingTranslation(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            log_error_stack=args.log_error_stack,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "transcription" in supported_tasks
+        else None
+    )
+    state.anthropic_serving_messages = (
+        AnthropicServingMessages(
+            engine_client,
+            state.openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            enable_auto_tools=args.enable_auto_tool_choice,
+            tool_parser=args.tool_call_parser,
+            reasoning_parser=args.structured_outputs_config.reasoning_parser,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+    state.serving_tokens = (
+        ServingTokens(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            log_error_stack=args.log_error_stack,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_log_outputs=args.enable_log_outputs,
+            force_no_detokenize=args.tokens_only,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
 
     state.enable_server_load_tracking = args.enable_server_load_tracking
     state.server_load_metrics = 0
@@ -1818,18 +1953,21 @@ def create_server_unix_socket(path: str) -> socket.socket:
 
 
 def validate_api_server_args(args):
-    valid_tool_parses = ToolParserManager.tool_parsers.keys()
-    if args.enable_auto_tool_choice \
-            and args.tool_call_parser not in valid_tool_parses:
-        raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
-                       f"(chose from {{ {','.join(valid_tool_parses)} }})")
-
-    valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
-    if ((reasoning_parser := args.structured_outputs_config.reasoning_parser)
-            and reasoning_parser not in valid_reasoning_parses):
+    valid_tool_parses = ToolParserManager.list_registered()
+    if args.enable_auto_tool_choice and args.tool_call_parser not in valid_tool_parses:
+        raise KeyError(
+            f"invalid tool call parser: {args.tool_call_parser} "
+            f"(chose from {{ {','.join(valid_tool_parses)} }})"
+        )
+
+    valid_reasoning_parsers = ReasoningParserManager.list_registered()
+    if (
+        reasoning_parser := args.structured_outputs_config.reasoning_parser
+    ) and reasoning_parser not in valid_reasoning_parsers:
         raise KeyError(
             f"invalid reasoning parser: {reasoning_parser} "
-            f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
+            f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
+        )
 
 
 def setup_server(args):
@@ -1842,6 +1980,9 @@ def setup_server(args):
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
+    if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
+        ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
+
     validate_api_server_args(args)
 
     # workaround to make sure that we bind the port before the engine is set up.
@@ -1868,8 +2009,7 @@ def setup_server(args):
     else:
         addr, port = sock_addr
         is_ssl = args.ssl_keyfile and args.ssl_certfile
-        host_part = f"[{addr}]" if is_valid_ipv6_address(
-            addr) else addr or "0.0.0.0"
+        host_part = f"[{addr}]" if is_valid_ipv6_address(addr) else addr or "0.0.0.0"
         listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}"
     return listen_address, sock
 
@@ -1884,34 +2024,36 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
 
 
-async def run_server_worker(listen_address,
-                            sock,
-                            args,
-                            client_config=None,
-                            **uvicorn_kwargs) -> None:
+async def run_server_worker(
+    listen_address, sock, args, client_config=None, **uvicorn_kwargs
+) -> None:
     """Run a single API server worker."""
 
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
+    if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
+        ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
+
     # Load logging config for uvicorn if specified
     log_config = load_log_config(args.log_config_file)
     if log_config is not None:
-        uvicorn_kwargs['log_config'] = log_config
+        uvicorn_kwargs["log_config"] = log_config
 
     async with build_async_engine_client(
-            args,
-            client_config=client_config,
+        args,
+        client_config=client_config,
     ) as engine_client:
         maybe_register_tokenizer_info_endpoint(args)
         app = build_app(args)
 
-        vllm_config = await engine_client.get_vllm_config()
-        await init_app_state(engine_client, vllm_config, app.state, args)
+        await init_app_state(engine_client, app.state, args)
 
-        logger.info("Starting vLLM API server %d on %s",
-                    vllm_config.parallel_config._api_process_rank,
-                    listen_address)
+        logger.info(
+            "Starting vLLM API server %d on %s",
+            engine_client.vllm_config.parallel_config._api_process_rank,
+            listen_address,
+        )
         shutdown_task = await serve_http(
             app,
             sock=sock,
@@ -1945,7 +2087,8 @@ if __name__ == "__main__":
     # entrypoints.
     cli_env_setup()
     parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
+        description="vLLM OpenAI-Compatible RESTful API server."
+    )
     parser = make_arg_parser(parser)
     args = parser.parse_args()
     validate_parsed_serve_args(args)
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index a306c2bb7cb5b42e3f2d41526b7a354e5d6ba405..946362ce2ef0a616c3736f3474b8fdac2001233f 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -11,33 +11,36 @@ import json
 import ssl
 from collections.abc import Sequence
 from dataclasses import field
-from typing import Literal, Optional, Union
+from typing import Literal
 
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
 from vllm.config import config
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
-from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
-                                         validate_chat_template)
-from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
-                                        H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
+from vllm.entrypoints.chat_utils import (
+    ChatTemplateContentFormatOption,
+    validate_chat_template,
+)
+from vllm.entrypoints.constants import (
+    H11_MAX_HEADER_COUNT_DEFAULT,
+    H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
+)
 from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 logger = init_logger(__name__)
 
 
 class LoRAParserAction(argparse.Action):
-
     def __call__(
         self,
         parser: argparse.ArgumentParser,
         namespace: argparse.Namespace,
-        values: Optional[Union[str, Sequence[str]]],
-        option_string: Optional[str] = None,
+        values: str | Sequence[str] | None,
+        option_string: str | None = None,
     ):
         if values is None:
             values = []
@@ -57,8 +60,7 @@ class LoRAParserAction(argparse.Action):
                     lora = LoRAModulePath(**lora_dict)
                     lora_list.append(lora)
                 except json.JSONDecodeError:
-                    parser.error(
-                        f"Invalid JSON format for --lora-modules: {item}")
+                    parser.error(f"Invalid JSON format for --lora-modules: {item}")
                 except TypeError as e:
                     parser.error(
                         f"Invalid fields for --lora-modules: {item} - {str(e)}"
@@ -70,14 +72,16 @@ class LoRAParserAction(argparse.Action):
 @dataclass
 class FrontendArgs:
     """Arguments for the OpenAI-compatible frontend server."""
-    host: Optional[str] = None
+
+    host: str | None = None
     """Host name."""
     port: int = 8000
     """Port number."""
-    uds: Optional[str] = None
+    uds: str | None = None
     """Unix domain socket path. If set, host and port arguments are ignored."""
-    uvicorn_log_level: Literal["debug", "info", "warning", "error", "critical",
-                               "trace"] = "info"
+    uvicorn_log_level: Literal[
+        "debug", "info", "warning", "error", "critical", "trace"
+    ] = "info"
     """Log level for uvicorn."""
     disable_uvicorn_access_log: bool = False
     """Disable uvicorn access log."""
@@ -89,15 +93,15 @@ class FrontendArgs:
     """Allowed methods."""
     allowed_headers: list[str] = field(default_factory=lambda: ["*"])
     """Allowed headers."""
-    api_key: Optional[list[str]] = None
+    api_key: list[str] | None = None
     """If provided, the server will require one of these keys to be presented in
     the header."""
-    lora_modules: Optional[list[LoRAModulePath]] = None
+    lora_modules: list[LoRAModulePath] | None = None
     """LoRA modules configurations in either 'name=path' format or JSON format
     or JSON list format. Example (old format): `'name=path'` Example (new
     format): `{\"name\": \"name\", \"path\": \"lora_path\",
     \"base_model_name\": \"id\"}`"""
-    chat_template: Optional[str] = None
+    chat_template: str | None = None
     """The file path to the chat template, or the template in single-line form
     for the specified model."""
     chat_template_content_format: ChatTemplateContentFormatOption = "auto"
@@ -112,17 +116,17 @@ class FrontendArgs:
     or the ones from tokenizer."""
     response_role: str = "assistant"
     """The role name to return if `request.add_generation_prompt=true`."""
-    ssl_keyfile: Optional[str] = None
+    ssl_keyfile: str | None = None
     """The file path to the SSL key file."""
-    ssl_certfile: Optional[str] = None
+    ssl_certfile: str | None = None
     """The file path to the SSL cert file."""
-    ssl_ca_certs: Optional[str] = None
+    ssl_ca_certs: str | None = None
     """The CA certificates file."""
     enable_ssl_refresh: bool = False
     """Refresh SSL Context when SSL certificate files change"""
     ssl_cert_reqs: int = int(ssl.CERT_NONE)
     """Whether client certificate is required (see stdlib ssl module's)."""
-    root_path: Optional[str] = None
+    root_path: str | None = None
     """FastAPI root_path when app is behind a path based routing proxy."""
     middleware: list[str] = field(default_factory=lambda: [])
     """Additional ASGI middleware to apply to the app. We accept multiple
@@ -145,7 +149,7 @@ class FrontendArgs:
     exclude_tools_when_tool_choice_none: bool = False
     """If specified, exclude tool definitions in prompts when
     tool_choice='none'."""
-    tool_call_parser: Optional[str] = None
+    tool_call_parser: str | None = None
     """Select the tool call parser depending on the model that you're using.
     This is used to parse the model-generated tool call into OpenAI API format.
     Required for `--enable-auto-tool-choice`. You can choose any option from
@@ -154,13 +158,13 @@ class FrontendArgs:
     """Special the tool parser plugin write to parse the model-generated tool
     into OpenAI API format, the name register in this plugin can be used in
     `--tool-call-parser`."""
-    tool_server: Optional[str] = None
+    tool_server: str | None = None
     """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
     Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
     purpose."""
-    log_config_file: Optional[str] = envs.VLLM_LOGGING_CONFIG_PATH
+    log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
     """Path to logging config JSON file for both vllm and uvicorn"""
-    max_log_len: Optional[int] = None
+    max_log_len: int | None = None
     """Max number of prompt characters or prompt ID numbers being printed in
     log. The default of None means unlimited."""
     disable_fastapi_docs: bool = False
@@ -185,6 +189,11 @@ class FrontendArgs:
     Helps mitigate header abuse. Default: 256."""
     log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
     """If set to True, log the stack trace of error responses"""
+    tokens_only: bool = False
+    """
+    If set to True, only enable the Tokens In<>Out endpoint. 
+    This is intended for use in a Disaggregated Everything setup.
+    """
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -215,10 +224,11 @@ class FrontendArgs:
         frontend_kwargs["middleware"]["default"] = []
 
         # Special case: Tool call parser shows built-in options.
-        valid_tool_parsers = list(ToolParserManager.tool_parsers.keys())
+        valid_tool_parsers = list(ToolParserManager.list_registered())
         parsers_str = ",".join(valid_tool_parsers)
         frontend_kwargs["tool_call_parser"]["metavar"] = (
-            f"{{{parsers_str}}} or name registered in --tool-parser-plugin")
+            f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
+        )
 
         frontend_group = parser.add_argument_group(
             title="Frontend",
@@ -238,27 +248,32 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     register all arguments instead of manually enumerating them here. This
     avoids code duplication and keeps the argument definitions in one place.
     """
-    parser.add_argument("model_tag",
-                        type=str,
-                        nargs="?",
-                        help="The model tag to serve "
-                        "(optional if specified in config)")
+    parser.add_argument(
+        "model_tag",
+        type=str,
+        nargs="?",
+        help="The model tag to serve (optional if specified in config)",
+    )
     parser.add_argument(
         "--headless",
         action="store_true",
         default=False,
         help="Run in headless mode. See multi-node data parallel "
-        "documentation for more details.")
-    parser.add_argument("--api-server-count",
-                        "-asc",
-                        type=int,
-                        default=1,
-                        help="How many API server processes to run.")
+        "documentation for more details.",
+    )
+    parser.add_argument(
+        "--api-server-count",
+        "-asc",
+        type=int,
+        default=1,
+        help="How many API server processes to run.",
+    )
     parser.add_argument(
         "--config",
         help="Read CLI options from a config file. "
         "Must be a YAML with the following options: "
-        "https://docs.vllm.ai/en/latest/configuration/serve_args.html")
+        "https://docs.vllm.ai/en/latest/configuration/serve_args.html",
+    )
     parser = FrontendArgs.add_cli_args(parser)
     parser = AsyncEngineArgs.add_cli_args(parser)
 
@@ -275,14 +290,13 @@ def validate_parsed_serve_args(args: argparse.Namespace):
 
     # Enable auto tool needs a tool call parser to be valid
     if args.enable_auto_tool_choice and not args.tool_call_parser:
-        raise TypeError("Error: --enable-auto-tool-choice requires "
-                        "--tool-call-parser")
+        raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser")
     if args.enable_log_outputs and not args.enable_log_requests:
-        raise TypeError("Error: --enable-log-outputs requires "
-                        "--enable-log-requests")
+        raise TypeError("Error: --enable-log-outputs requires --enable-log-requests")
 
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
     parser_for_docs = FlexibleArgumentParser(
-        prog="-m vllm.entrypoints.openai.api_server")
+        prog="-m vllm.entrypoints.openai.api_server"
+    )
     return make_arg_parser(parser_for_docs)
diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py
deleted file mode 100644
index 29d72256cf70bba33b1a49530c97fb34760bf83f..0000000000000000000000000000000000000000
--- a/vllm/entrypoints/openai/logits_processors.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from collections.abc import Iterable
-from functools import lru_cache, partial
-from typing import Optional, Union
-
-import torch
-
-from vllm.sampling_params import LogitsProcessor
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-
-
-class AllowedTokenIdsLogitsProcessor:
-    """Logits processor for constraining generated tokens to a
-    specific set of token ids."""
-
-    def __init__(self, allowed_ids: Iterable[int]):
-        self.allowed_ids: Optional[list[int]] = list(allowed_ids)
-        self.mask: Optional[torch.Tensor] = None
-
-    def __call__(self, token_ids: list[int],
-                 logits: torch.Tensor) -> torch.Tensor:
-        if self.mask is None:
-            self.mask = torch.ones((logits.shape[-1], ),
-                                   dtype=torch.bool,
-                                   device=logits.device)
-            self.mask[self.allowed_ids] = False
-            self.allowed_ids = None
-        logits.masked_fill_(self.mask, float("-inf"))
-        return logits
-
-
-@lru_cache(maxsize=32)
-def _get_allowed_token_ids_logits_processor(
-    allowed_token_ids: frozenset[int],
-    vocab_size: int,
-) -> LogitsProcessor:
-    if not allowed_token_ids:
-        raise ValueError("Empty allowed_token_ids provided")
-    if not all(0 <= tid < vocab_size for tid in allowed_token_ids):
-        raise ValueError("allowed_token_ids contains "
-                         "out-of-vocab token id")
-    return AllowedTokenIdsLogitsProcessor(allowed_token_ids)
-
-
-def logit_bias_logits_processor(
-    logit_bias: dict[int, float],
-    token_ids: list[int],
-    logits: torch.Tensor,
-) -> torch.Tensor:
-    for token_id, bias in logit_bias.items():
-        logits[token_id] += bias
-    return logits
-
-
-def get_logits_processors(
-    logit_bias: Optional[Union[dict[int, float], dict[str, float]]],
-    allowed_token_ids: Optional[list[int]],
-    tokenizer: AnyTokenizer,
-) -> list[LogitsProcessor]:
-    logits_processors: list[LogitsProcessor] = []
-    if logit_bias:
-        try:
-            # Convert token_id to integer
-            # Clamp the bias between -100 and 100 per OpenAI API spec
-            clamped_logit_bias: dict[int, float] = {
-                int(token_id): min(100.0, max(-100.0, bias))
-                for token_id, bias in logit_bias.items()
-            }
-        except ValueError as exc:
-            raise ValueError(
-                "Found token_id in logit_bias that is not "
-                "an integer or string representing an integer") from exc
-
-        # Check if token_id is within the vocab size
-        for token_id, bias in clamped_logit_bias.items():
-            if token_id < 0 or token_id >= len(tokenizer):
-                raise ValueError(f"token_id {token_id} in logit_bias contains "
-                                 "out-of-vocab token id")
-
-        logits_processors.append(
-            partial(logit_bias_logits_processor, clamped_logit_bias))
-
-    if allowed_token_ids is not None:
-        logits_processors.append(
-            _get_allowed_token_ids_logits_processor(
-                frozenset(allowed_token_ids), len(tokenizer)))
-
-    return logits_processors
diff --git a/vllm/entrypoints/openai/orca_metrics.py b/vllm/entrypoints/openai/orca_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..3808262bf31f265ec7ab502d13812c5944cfa5f4
--- /dev/null
+++ b/vllm/entrypoints/openai/orca_metrics.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Utility functions that create ORCA endpoint load report response headers.
+"""
+
+import json
+from collections.abc import Mapping
+
+from vllm.logger import init_logger
+from vllm.v1.metrics.reader import Gauge, get_metrics_snapshot
+
+logger = init_logger(__name__)
+
+
+def create_orca_header(
+    metrics_format: str, named_metrics: list[tuple[str, float]]
+) -> Mapping[str, str] | None:
+    """
+    Creates ORCA headers named 'endpoint-load-metrics' in the specified format
+    and adds custom metrics to named_metrics.
+    ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
+    ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto
+
+    Parameters:
+    - metrics_format (str): The format of the header ('TEXT', 'JSON').
+    - named_metrics (List[Tuple[str, float]]): List of tuples with metric names
+    and their corresponding double values.
+
+    Returns:
+    - Optional[Mapping[str,str]]: A dictionary with header key as
+    'endpoint-load-metrics' and values as the ORCA header strings with
+    format prefix and data in  with named_metrics in.
+    """
+
+    if metrics_format.lower() not in ["text", "json"]:
+        logger.warning(
+            "Warning: `%s` format is not supported in the ORCA response header",
+            format,
+        )
+        return None
+
+    header = {}
+    orca_report = {
+        "named_metrics": {
+            metric_name: value
+            for metric_name, value in named_metrics
+            if isinstance(metric_name, str) and isinstance(value, float)
+        }
+    }
+    # output example:
+    # endpoint-load-metrics: TEXT named_metrics.kv_cache_utilization=0.4
+    if metrics_format.lower() == "text":
+        native_http_header = ", ".join(
+            [
+                f"named_metrics.{metric_name}={value}"
+                for metric_name, value in named_metrics
+                if isinstance(metric_name, str) and isinstance(value, float)
+            ]
+        )
+        header["endpoint-load-metrics"] = f"TEXT {native_http_header}"
+
+    # output example:
+    # endpoint-load-metrics: JSON “named_metrics”: {“custom-metric-util”: 0.4}
+    elif metrics_format.lower() == "json":
+        header["endpoint-load-metrics"] = f"JSON {json.dumps(orca_report)}"
+
+    logger.info("Created ORCA header %s", header)
+
+    return header
+
+
+def get_named_metrics_from_prometheus() -> list[tuple[str, float]]:
+    """
+    Collects current metrics from Prometheus and returns some of them
+    in the form of the `named_metrics` list for `create_orca_header()`.
+
+    Parameters:
+    - None
+
+    Returns:
+    - list[tuple[str, float]]: List of tuples of metric names and their values.
+    """
+    named_metrics: list[tuple[str, float]] = []
+    # Map from prometheus metric names to ORCA named metrics.
+    prometheus_to_orca_metrics = {
+        "vllm:kv_cache_usage_perc": "kv_cache_usage_perc",
+        "vllm:num_requests_waiting": "num_requests_waiting",
+    }
+    metrics = get_metrics_snapshot()
+    for metric in metrics:
+        orca_name = prometheus_to_orca_metrics.get(metric.name)
+        # If this metric is mapped into ORCA, then add it to the report.
+        # Note: Only Gauge metrics are currently supported.
+        if orca_name is not None and isinstance(metric, Gauge):
+            named_metrics.append((str(orca_name), float(metric.value)))
+    return named_metrics
+
+
+def metrics_header(metrics_format: str) -> Mapping[str, str] | None:
+    """
+    Creates ORCA headers named 'endpoint-load-metrics' in the specified format.
+    Metrics are collected from Prometheus using `get_named_metrics_from_prometheus()`.
+
+    ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
+    ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto
+
+    Parameters:
+    - metrics_format (str): The format of the header ('TEXT', 'JSON').
+
+    Returns:
+    - Optional[Mapping[str,str]]: A dictionary with header key as
+    'endpoint-load-metrics' and values as the ORCA header strings with
+    format prefix and data in  with named_metrics in.
+    """
+    if not metrics_format:
+        return None
+    # Get named metrics from prometheus.
+    named_metrics = get_named_metrics_from_prometheus()
+    return create_orca_header(metrics_format, named_metrics)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 8829fa4886f6286b15e3cc38f9f31565f9d62021..65bd15ba387b9c1b786b338526ff27dcd12d2ede 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -6,59 +6,92 @@
 import json
 import time
 from http import HTTPStatus
-from typing import (Annotated, Any, ClassVar, Generic, Literal, Optional,
-                    TypeVar, Union)
+from typing import Annotated, Any, ClassVar, Generic, Literal, TypeAlias, TypeVar
 
 import regex as re
 import torch
 from fastapi import HTTPException, UploadFile
-# yapf: disable
 from openai.types.chat.chat_completion_audio import (
-    ChatCompletionAudio as OpenAIChatCompletionAudio)
-from openai.types.chat.chat_completion_message import (
-    Annotation as OpenAIAnnotation)
-# yapf: enable
+    ChatCompletionAudio as OpenAIChatCompletionAudio,
+)
+from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent,
-    ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem,
-    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
-    ResponseStatus, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+    ResponseCodeInterpreterCallInterpretingEvent,
+    ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent,
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseOutputItemAddedEvent,
+    ResponseOutputItemDoneEvent,
+    ResponsePrompt,
+    ResponseReasoningItem,
+    ResponseReasoningTextDeltaEvent,
+    ResponseReasoningTextDoneEvent,
+    ResponseStatus,
+    ResponseWebSearchCallCompletedEvent,
+    ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent,
+)
+from openai.types.responses import (
+    ResponseCompletedEvent as OpenAIResponseCompletedEvent,
+)
+from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent
+from openai.types.responses import (
+    ResponseInProgressEvent as OpenAIResponseInProgressEvent,
+)
 from openai.types.responses.response_reasoning_item import (
-    Content as ResponseReasoningTextContent)
+    Content as ResponseReasoningTextContent,
+)
+from openai_harmony import Message as OpenAIHarmonyMessage
+
+from vllm.config.pooler import get_use_activation
+from vllm.tasks import PoolingTask
+from vllm.utils.serial_utils import (
+    EmbedDType,
+    EncodingFormat,
+    Endianness,
+)
 
 # Backward compatibility for OpenAI client versions
 try:  # For older openai versions (< 1.100.0)
     from openai.types.responses import ResponseTextConfig
 except ImportError:  # For newer openai versions (>= 1.100.0)
-    from openai.types.responses import (ResponseFormatTextConfig as
-                                        ResponseTextConfig)
+    from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig
+
 
 from openai.types.responses.response import IncompleteDetails, ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
-from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
-                      ValidationInfo, field_validator, model_validator)
-from typing_extensions import TypeAlias
-
-from vllm import envs
-from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         make_tool_call_id)
-from vllm.entrypoints.score_utils import (ScoreContentPartParam,
-                                          ScoreMultiModalParam)
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    TypeAdapter,
+    ValidationError,
+    ValidationInfo,
+    field_serializer,
+    field_validator,
+    model_validator,
+)
+
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
+from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
-                                  SamplingParams, StructuredOutputsParams)
-from vllm.utils import random_uuid, resolve_obj_by_qualname
+from vllm.sampling_params import (
+    BeamSearchParams,
+    RequestOutputKind,
+    SamplingParams,
+    StructuredOutputsParams,
+)
+from vllm.utils import random_uuid
+from vllm.utils.import_utils import resolve_obj_by_qualname
 
 logger = init_logger(__name__)
 
@@ -70,7 +103,7 @@ class OpenAIBaseModel(BaseModel):
     model_config = ConfigDict(extra="allow")
 
     # Cache class field names
-    field_names: ClassVar[Optional[set[str]]] = None
+    field_names: ClassVar[set[str] | None] = None
 
     @model_validator(mode="wrap")
     @classmethod
@@ -91,8 +124,7 @@ class OpenAIBaseModel(BaseModel):
         # Compare against both field names and aliases
         if any(k not in field_names for k in data):
             logger.warning(
-                "The following fields were present in the request "
-                "but ignored: %s",
+                "The following fields were present in the request but ignored: %s",
                 data.keys() - field_names,
             )
         return result
@@ -101,7 +133,7 @@ class OpenAIBaseModel(BaseModel):
 class ErrorInfo(OpenAIBaseModel):
     message: str
     type: str
-    param: Optional[str] = None
+    param: str | None = None
     code: int
 
 
@@ -120,7 +152,7 @@ class ModelPermission(OpenAIBaseModel):
     allow_view: bool = True
     allow_fine_tuning: bool = False
     organization: str = "*"
-    group: Optional[str] = None
+    group: str | None = None
     is_blocking: bool = False
 
 
@@ -129,9 +161,9 @@ class ModelCard(OpenAIBaseModel):
     object: str = "model"
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "vllm"
-    root: Optional[str] = None
-    parent: Optional[str] = None
-    max_model_len: Optional[int] = None
+    root: str | None = None
+    parent: str | None = None
+    max_model_len: int | None = None
     permission: list[ModelPermission] = Field(default_factory=list)
 
 
@@ -141,63 +173,74 @@ class ModelList(OpenAIBaseModel):
 
 
 class PromptTokenUsageInfo(OpenAIBaseModel):
-    cached_tokens: Optional[int] = None
+    cached_tokens: int | None = None
 
 
 class UsageInfo(OpenAIBaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
-    completion_tokens: Optional[int] = 0
-    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+    completion_tokens: int | None = 0
+    prompt_tokens_details: PromptTokenUsageInfo | None = None
 
 
 class RequestResponseMetadata(BaseModel):
     request_id: str
-    final_usage_info: Optional[UsageInfo] = None
+    final_usage_info: UsageInfo | None = None
 
 
 class JsonSchemaResponseFormat(OpenAIBaseModel):
     name: str
-    description: Optional[str] = None
+    description: str | None = None
     # schema is the field in openai but that causes conflicts with pydantic so
     # instead use json_schema with an alias
-    json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
-    strict: Optional[bool] = None
+    json_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+    strict: bool | None = None
 
 
-class StructuralTag(OpenAIBaseModel):
+class LegacyStructuralTag(OpenAIBaseModel):
     begin: str
     # schema is the field, but that causes conflicts with pydantic so
     # instead use structural_tag_schema with an alias
-    structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
-                                                            alias="schema")
+    structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema")
     end: str
 
 
-class StructuralTagResponseFormat(OpenAIBaseModel):
+class LegacyStructuralTagResponseFormat(OpenAIBaseModel):
     type: Literal["structural_tag"]
-    structures: list[StructuralTag]
+    structures: list[LegacyStructuralTag]
     triggers: list[str]
 
 
+class StructuralTagResponseFormat(OpenAIBaseModel):
+    type: Literal["structural_tag"]
+    format: Any
+
+
+AnyStructuralTagResponseFormat: TypeAlias = (
+    LegacyStructuralTagResponseFormat | StructuralTagResponseFormat
+)
+
+
 class ResponseFormat(OpenAIBaseModel):
     # type must be "json_schema", "json_object", or "text"
     type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
+    json_schema: JsonSchemaResponseFormat | None = None
 
 
-AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+AnyResponseFormat: TypeAlias = (
+    ResponseFormat | StructuralTagResponseFormat | LegacyStructuralTagResponseFormat
+)
 
 
 class StreamOptions(OpenAIBaseModel):
-    include_usage: Optional[bool] = True
-    continuous_usage_stats: Optional[bool] = False
+    include_usage: bool | None = True
+    continuous_usage_stats: bool | None = False
 
 
 class FunctionDefinition(OpenAIBaseModel):
     name: str
-    description: Optional[str] = None
-    parameters: Optional[dict[str, Any]] = None
+    description: str | None = None
+    parameters: dict[str, Any] | None = None
 
 
 class ChatCompletionToolsParam(OpenAIBaseModel):
@@ -218,27 +261,28 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
 # see https://github.com/pydantic/pydantic/issues/3125
 class LogitsProcessorConstructor(BaseModel):
     qualname: str
-    args: Optional[list[Any]] = None
-    kwargs: Optional[dict[str, Any]] = None
+    args: list[Any] | None = None
+    kwargs: dict[str, Any] | None = None
 
     model_config = ConfigDict(extra="forbid")
 
 
-LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+LogitsProcessors = list[str | LogitsProcessorConstructor]
 
 
-def get_logits_processors(processors: Optional[LogitsProcessors],
-                          pattern: Optional[str]) -> Optional[list[Any]]:
+def get_logits_processors(
+    processors: LogitsProcessors | None, pattern: str | None
+) -> list[Any] | None:
     if processors and pattern:
         logits_processors = []
         for processor in processors:
-            qualname = processor if isinstance(processor,
-                                               str) else processor.qualname
+            qualname = processor if isinstance(processor, str) else processor.qualname
             if not re.match(pattern, qualname):
                 raise ValueError(
                     f"Logits processor '{qualname}' is not allowed by this "
                     "server. See --logits-processor-pattern engine argument "
-                    "for more information.")
+                    "for more information."
+                )
             try:
                 logits_processor = resolve_obj_by_qualname(qualname)
             except Exception as e:
@@ -246,59 +290,63 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
                     f"Logits processor '{qualname}' could not be resolved: {e}"
                 ) from e
             if isinstance(processor, LogitsProcessorConstructor):
-                logits_processor = logits_processor(*processor.args or [],
-                                                    **processor.kwargs or {})
+                logits_processor = logits_processor(
+                    *processor.args or [], **processor.kwargs or {}
+                )
             logits_processors.append(logits_processor)
         return logits_processors
     elif processors:
         raise ValueError(
             "The `logits_processors` argument is not supported by this "
             "server. See --logits-processor-pattern engine argument "
-            "for more information.")
+            "for more information."
+        )
     return None
 
 
-ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
-                                           ResponseReasoningItem,
-                                           ResponseFunctionToolCall]
+ResponseInputOutputItem: TypeAlias = (
+    ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall
+)
 
 
 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/responses/create
-    background: Optional[bool] = False
-    include: Optional[list[
-        Literal[
-            "code_interpreter_call.outputs",
-            "computer_call_output.output.image_url",
-            "file_search_call.results",
-            "message.input_image.image_url",
-            "message.output_text.logprobs",
-            "reasoning.encrypted_content",
-        ],
-    ]] = None
-    input: Union[str, list[ResponseInputOutputItem]]
-    instructions: Optional[str] = None
-    max_output_tokens: Optional[int] = None
-    max_tool_calls: Optional[int] = None
-    metadata: Optional[Metadata] = None
-    model: Optional[str] = None
-    parallel_tool_calls: Optional[bool] = True
-    previous_response_id: Optional[str] = None
-    prompt: Optional[ResponsePrompt] = None
-    reasoning: Optional[Reasoning] = None
-    service_tier: Literal["auto", "default", "flex", "scale",
-                          "priority"] = "auto"
-    store: Optional[bool] = True
-    stream: Optional[bool] = False
-    temperature: Optional[float] = None
-    text: Optional[ResponseTextConfig] = None
+    background: bool | None = False
+    include: (
+        list[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ],
+        ]
+        | None
+    ) = None
+    input: str | list[ResponseInputOutputItem]
+    instructions: str | None = None
+    max_output_tokens: int | None = None
+    max_tool_calls: int | None = None
+    metadata: Metadata | None = None
+    model: str | None = None
+    parallel_tool_calls: bool | None = True
+    previous_response_id: str | None = None
+    prompt: ResponsePrompt | None = None
+    reasoning: Reasoning | None = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: bool | None = True
+    stream: bool | None = False
+    temperature: float | None = None
+    text: ResponseTextConfig | None = None
     tool_choice: ToolChoice = "auto"
     tools: list[Tool] = Field(default_factory=list)
-    top_logprobs: Optional[int] = 0
-    top_p: Optional[float] = None
-    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
-    user: Optional[str] = None
+    top_logprobs: int | None = 0
+    top_p: float | None = None
+    truncation: Literal["auto", "disabled"] | None = "disabled"
+    user: str | None = None
 
     # --8<-- [start:responses-extra-params]
     request_id: str = Field(
@@ -306,9 +354,10 @@ class ResponsesRequest(OpenAIBaseModel):
         description=(
             "The request_id related to this request. If the caller does "
             "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."),
+            "through out the inference process and return in response."
+        ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -317,9 +366,10 @@ class ResponsesRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
-    cache_salt: Optional[str] = Field(
+    cache_salt: str | None = Field(
         default=None,
         description=(
             "If specified, the prefix cache will be salted with the provided "
@@ -327,14 +377,23 @@ class ResponsesRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit). Not supported by vLLM engine V0."))
+            "to 256 bit). Not supported by vLLM engine V0."
+        ),
+    )
 
     enable_response_messages: bool = Field(
         default=False,
         description=(
             "Dictates whether or not to return messages as part of the "
-            "response object. Currently only supported for non-streaming "
-            "non-background and gpt-oss only. "))
+            "response object. Currently only supported for"
+            "non-background and gpt-oss only. "
+        ),
+    )
+    # similar to input_messages / output_messages in ResponsesResponse
+    # we take in previous_input_messages (ie in harmony format)
+    # this cannot be used in conjunction with previous_response_id
+    # TODO: consider supporting non harmony messages as well
+    previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None
     # --8<-- [end:responses-extra-params]
 
     _DEFAULT_SAMPLING_PARAMS = {
@@ -345,7 +404,7 @@ class ResponsesRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
         default_max_tokens: int,
-        default_sampling_params: Optional[dict] = None,
+        default_sampling_params: dict | None = None,
     ) -> SamplingParams:
         if self.max_output_tokens is None:
             max_tokens = default_max_tokens
@@ -355,20 +414,25 @@ class ResponsesRequest(OpenAIBaseModel):
         default_sampling_params = default_sampling_params or {}
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
         if (top_p := self.top_p) is None:
             top_p = default_sampling_params.get(
-                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
         stop_token_ids = default_sampling_params.get("stop_token_ids")
 
         # Structured output
         structured_outputs = None
         if self.text is not None and self.text.format is not None:
             response_format = self.text.format
-            if (response_format.type == "json_schema"
-                    and response_format.schema_ is not None):
+            if (
+                response_format.type == "json_schema"
+                and response_format.schema_ is not None
+            ):
                 structured_outputs = StructuredOutputsParams(
-                    json=response_format.schema_)
+                    json=response_format.schema_
+                )
             elif response_format.type == "json_object":
                 raise NotImplementedError("json_object is not supported")
 
@@ -377,11 +441,11 @@ class ResponsesRequest(OpenAIBaseModel):
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
-            logprobs=self.top_logprobs
-            if self.is_include_output_logprobs() else None,
+            logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
             stop_token_ids=stop_token_ids,
-            output_kind=(RequestOutputKind.DELTA
-                         if self.stream else RequestOutputKind.FINAL_ONLY),
+            output_kind=(
+                RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
+            ),
             structured_outputs=structured_outputs,
         )
 
@@ -389,17 +453,17 @@ class ResponsesRequest(OpenAIBaseModel):
         """Check if the request includes output logprobs."""
         if self.include is None:
             return False
-        return isinstance(
-            self.include,
-            list) and "message.output_text.logprobs" in self.include
+        return (
+            isinstance(self.include, list)
+            and "message.output_text.logprobs" in self.include
+        )
 
     @model_validator(mode="before")
     def validate_background(cls, data):
         if not data.get("background"):
             return data
         if not data.get("store", True):
-            raise ValueError(
-                "background can only be used when `store` is true")
+            raise ValueError("background can only be used when `store` is true")
         return data
 
     @model_validator(mode="before")
@@ -410,15 +474,54 @@ class ResponsesRequest(OpenAIBaseModel):
 
     @model_validator(mode="before")
     def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None:
-            if not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Parameter 'cache_salt' is not supported with "
-                    "this instance of vLLM, which uses engine V0.")
-            if not isinstance(data["cache_salt"],
-                              str) or not data["cache_salt"]:
-                raise ValueError("Parameter 'cache_salt' must be a "
-                                 "non-empty string if provided.")
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
+        return data
+
+    @model_validator(mode="before")
+    def function_call_parsing(cls, data):
+        """Parse function_call dictionaries into ResponseFunctionToolCall objects.
+        This ensures Pydantic can properly resolve union types in the input field.
+        Function calls provided as dicts are converted to ResponseFunctionToolCall
+        objects before validation, while invalid structures are left for Pydantic
+        to reject with appropriate error messages.
+        """
+
+        input_data = data.get("input")
+
+        # Early return for None, strings, or bytes
+        # (strings are iterable but shouldn't be processed)
+        if input_data is None or isinstance(input_data, (str, bytes)):
+            return data
+
+        # Convert iterators (like ValidatorIterator) to list
+        if not isinstance(input_data, list):
+            try:
+                input_data = list(input_data)
+            except TypeError:
+                # Not iterable, leave as-is for Pydantic to handle
+                return data
+
+        processed_input = []
+        for item in input_data:
+            if isinstance(item, dict) and item.get("type") == "function_call":
+                try:
+                    processed_input.append(ResponseFunctionToolCall(**item))
+                except ValidationError:
+                    # Let Pydantic handle validation for malformed function calls
+                    logger.debug(
+                        "Failed to parse function_call to ResponseFunctionToolCall, "
+                        "leaving for Pydantic validation"
+                    )
+                    processed_input.append(item)
+            else:
+                processed_input.append(item)
+
+        data["input"] = processed_input
         return data
 
 
@@ -426,55 +529,57 @@ class ChatCompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: list[ChatCompletionMessageParam]
-    model: Optional[str] = None
-    frequency_penalty: Optional[float] = 0.0
-    logit_bias: Optional[dict[str, float]] = None
-    logprobs: Optional[bool] = False
-    top_logprobs: Optional[int] = 0
-    max_tokens: Optional[int] = Field(
-        default=None,
-        deprecated=
-        'max_tokens is deprecated in favor of the max_completion_tokens field')
-    max_completion_tokens: Optional[int] = None
-    n: Optional[int] = 1
-    presence_penalty: Optional[float] = 0.0
-    response_format: Optional[AnyResponseFormat] = None
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = []
-    stream: Optional[bool] = False
-    stream_options: Optional[StreamOptions] = None
-    temperature: Optional[float] = None
-    top_p: Optional[float] = None
-    tools: Optional[list[ChatCompletionToolsParam]] = None
-    tool_choice: Optional[Union[
-        Literal["none"],
-        Literal["auto"],
-        Literal["required"],
-        ChatCompletionNamedToolChoiceParam,
-    ]] = "none"
-    reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
+    model: str | None = None
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: bool | None = False
+    top_logprobs: int | None = 0
+    max_tokens: int | None = Field(
+        default=None,
+        deprecated="max_tokens is deprecated in favor of "
+        "the max_completion_tokens field",
+    )
+    max_completion_tokens: int | None = None
+    n: int | None = 1
+    presence_penalty: float | None = 0.0
+    response_format: AnyResponseFormat | None = None
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    tools: list[ChatCompletionToolsParam] | None = None
+    tool_choice: (
+        Literal["none"]
+        | Literal["auto"]
+        | Literal["required"]
+        | ChatCompletionNamedToolChoiceParam
+        | None
+    ) = "none"
+    reasoning_effort: Literal["low", "medium", "high"] | None = None
     include_reasoning: bool = True
 
     # NOTE this will be ignored by vLLM -- the model determines the behavior
-    parallel_tool_calls: Optional[bool] = False
-    user: Optional[str] = None
+    parallel_tool_calls: bool | None = False
+    user: str | None = None
 
     # --8<-- [start:chat-completion-sampling-params]
-    best_of: Optional[int] = None
+    best_of: int | None = None
     use_beam_search: bool = False
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    repetition_penalty: Optional[float] = None
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = []
+    stop_token_ids: list[int] | None = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
-    prompt_logprobs: Optional[int] = None
-    allowed_token_ids: Optional[list[int]] = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    prompt_logprobs: int | None = None
+    allowed_token_ids: list[int] | None = None
     bad_words: list[str] = Field(default_factory=list)
     # --8<-- [end:chat-completion-sampling-params]
 
@@ -483,23 +588,26 @@ class ChatCompletionRequest(OpenAIBaseModel):
         default=False,
         description=(
             "If true, the new message will be prepended with the last message "
-            "if they belong to the same role."),
+            "if they belong to the same role."
+        ),
     )
     add_generation_prompt: bool = Field(
         default=True,
-        description=
-        ("If true, the generation prompt will be added to the chat template. "
-         "This is a parameter used by chat template in tokenizer config of the "
-         "model."),
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
     )
     continue_final_message: bool = Field(
         default=False,
-        description=
-        ("If this is set, the chat will be formatted so that the final "
-         "message in the chat is open-ended, without any EOS tokens. The "
-         "model will continue this message rather than starting a new one. "
-         "This allows you to \"prefill\" part of the model's response for it. "
-         "Cannot be used at the same time as `add_generation_prompt`."),
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
     )
     add_special_tokens: bool = Field(
         default=False,
@@ -508,82 +616,92 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "on top of what is added by the chat template. "
             "For most models, the chat template takes care of adding the "
             "special tokens so this should be set to false (as is the "
-            "default)."),
+            "default)."
+        ),
     )
-    documents: Optional[list[dict[str, str]]] = Field(
+    documents: list[dict[str, str]] | None = Field(
         default=None,
-        description=
-        ("A list of dicts representing documents that will be accessible to "
-         "the model if it is performing RAG (retrieval-augmented generation)."
-         " If the template does not support RAG, this argument will have no "
-         "effect. We recommend that each document should be a dict containing "
-         "\"title\" and \"text\" keys."),
+        description=(
+            "A list of dicts representing documents that will be accessible to "
+            "the model if it is performing RAG (retrieval-augmented generation)."
+            " If the template does not support RAG, this argument will have no "
+            "effect. We recommend that each document should be a dict containing "
+            '"title" and "text" keys.'
+        ),
     )
-    chat_template: Optional[str] = Field(
+    chat_template: str | None = Field(
         default=None,
         description=(
             "A Jinja template to use for this conversion. "
             "As of transformers v4.44, default chat template is no longer "
             "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."),
+            "does not define one."
+        ),
     )
-    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+    chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=(
             "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."),
+            "Will be accessible by the chat template."
+        ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
-    structured_outputs: Optional[StructuredOutputsParams] = Field(
+    structured_outputs: StructuredOutputsParams | None = Field(
         default=None,
         description="Additional kwargs for structured outputs",
     )
-    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+    guided_json: str | dict | BaseModel | None = Field(
         default=None,
         description=(
             "`guided_json` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `json` to `structured_outputs` instead."),
+            "Please pass `json` to `structured_outputs` instead."
+        ),
     )
-    guided_regex: Optional[str] = Field(
+    guided_regex: str | None = Field(
         default=None,
         description=(
             "`guided_regex` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `regex` to `structured_outputs` instead."),
+            "Please pass `regex` to `structured_outputs` instead."
+        ),
     )
-    guided_choice: Optional[list[str]] = Field(
+    guided_choice: list[str] | None = Field(
         default=None,
         description=(
             "`guided_choice` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `choice` to `structured_outputs` instead."),
+            "Please pass `choice` to `structured_outputs` instead."
+        ),
     )
-    guided_grammar: Optional[str] = Field(
+    guided_grammar: str | None = Field(
         default=None,
         description=(
             "`guided_grammar` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `grammar` to `structured_outputs` instead."),
+            "Please pass `grammar` to `structured_outputs` instead."
+        ),
     )
-    structural_tag: Optional[str] = Field(
+    structural_tag: str | None = Field(
         default=None,
         description=(
             "`structural_tag` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `structural_tag` to `structured_outputs` instead."),
+            "Please pass `structural_tag` to `structured_outputs` instead."
+        ),
     )
-    guided_decoding_backend: Optional[str] = Field(
+    guided_decoding_backend: str | None = Field(
         default=None,
         description=(
             "`guided_decoding_backend` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please remove it from your request."),
+            "Please remove it from your request."
+        ),
     )
-    guided_whitespace_pattern: Optional[str] = Field(
+    guided_whitespace_pattern: str | None = Field(
         default=None,
         description=(
             "`guided_whitespace_pattern` is deprecated. "
@@ -596,16 +714,18 @@ class ChatCompletionRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
     request_id: str = Field(
         default_factory=lambda: f"{random_uuid()}",
         description=(
             "The request_id related to this request. If the caller does "
             "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."),
+            "through out the inference process and return in response."
+        ),
     )
-    logits_processors: Optional[LogitsProcessors] = Field(
+    logits_processors: LogitsProcessors | None = Field(
         default=None,
         description=(
             "A list of either qualified names of logits processors, or "
@@ -615,22 +735,28 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "'args' and 'kwargs' fields containing positional and keyword "
             "arguments. For example: {'qualname': "
             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
-            "{'param': 'value'}}."))
-    return_tokens_as_token_ids: Optional[bool] = Field(
+            "{'param': 'value'}}."
+        ),
+    )
+    return_tokens_as_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified with 'logprobs', tokens are represented "
             " as strings of the form 'token_id:{token_id}' so that tokens "
-            "that are not JSON-encodable can be identified."))
-    return_token_ids: Optional[bool] = Field(
+            "that are not JSON-encodable can be identified."
+        ),
+    )
+    return_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified, the result will include token IDs alongside the "
             "generated text. In streaming mode, prompt_token_ids is included "
             "only in the first chunk, and token_ids contains the delta tokens "
             "for each chunk. This is useful for debugging or when you "
-            "need to map generated text back to input tokens."))
-    cache_salt: Optional[str] = Field(
+            "need to map generated text back to input tokens."
+        ),
+    )
+    cache_salt: str | None = Field(
         default=None,
         description=(
             "If specified, the prefix cache will be salted with the provided "
@@ -638,15 +764,20 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit). Not supported by vLLM engine V0."))
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
+            "to 256 bit). Not supported by vLLM engine V0."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
         default=None,
-        description="KVTransfer parameters used for disaggregated serving.")
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
 
-    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
         default=None,
-        description=("Additional request parameters with string or "
-                     "numeric values, used by custom extensions."),
+        description=(
+            "Additional request parameters with (list of) string or "
+            "numeric values, used by custom extensions."
+        ),
     )
 
     # --8<-- [end:chat-completion-extra-params]
@@ -661,13 +792,13 @@ class ChatCompletionRequest(OpenAIBaseModel):
     }
 
     def to_beam_search_params(
-            self, max_tokens: int,
-            default_sampling_params: dict) -> BeamSearchParams:
-
+        self, max_tokens: int, default_sampling_params: dict
+    ) -> BeamSearchParams:
         n = self.n if self.n is not None else 1
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
 
         return BeamSearchParams(
             beam_width=n,
@@ -681,10 +812,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
         max_tokens: int,
-        logits_processor_pattern: Optional[str],
+        logits_processor_pattern: str | None,
         default_sampling_params: dict,
     ) -> SamplingParams:
-
         # Default parameters
         if (repetition_penalty := self.repetition_penalty) is None:
             repetition_penalty = default_sampling_params.get(
@@ -693,16 +823,20 @@ class ChatCompletionRequest(OpenAIBaseModel):
             )
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
         if (top_p := self.top_p) is None:
             top_p = default_sampling_params.get(
-                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
         if (top_k := self.top_k) is None:
             top_k = default_sampling_params.get(
-                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
         if (min_p := self.min_p) is None:
             min_p = default_sampling_params.get(
-                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
 
         prompt_logprobs = self.prompt_logprobs
         if prompt_logprobs is None and self.echo:
@@ -723,8 +857,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
                 self.structured_outputs = StructuredOutputsParams(**kwargs)
 
         response_format = self.response_format
-        json_schema_from_tool = self._get_json_schema_from_tool()
-        if response_format is not None or json_schema_from_tool is not None:
+        if response_format is not None:
             # If structured outputs wasn't already enabled,
             # we must enable it for these features to work
             if self.structured_outputs is None:
@@ -741,14 +874,14 @@ class ChatCompletionRequest(OpenAIBaseModel):
                 elif response_format.type == "structural_tag":
                     structural_tag = response_format
                     assert structural_tag is not None and isinstance(
-                        structural_tag, StructuralTagResponseFormat)
+                        structural_tag,
+                        (
+                            LegacyStructuralTagResponseFormat,
+                            StructuralTagResponseFormat,
+                        ),
+                    )
                     s_tag_obj = structural_tag.model_dump(by_alias=True)
-                    self.structured_outputs.structural_tag = json.dumps(
-                        s_tag_obj)
-
-            # Set structured output params for tool calling
-            if json_schema_from_tool is not None:
-                self.structured_outputs.json = json_schema_from_tool
+                    self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
 
         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
         if self.kv_transfer_params:
@@ -774,12 +907,14 @@ class ChatCompletionRequest(OpenAIBaseModel):
             min_tokens=self.min_tokens,
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
-            logits_processors=get_logits_processors(self.logits_processors,
-                                                    logits_processor_pattern),
+            logits_processors=get_logits_processors(
+                self.logits_processors, logits_processor_pattern
+            ),
             include_stop_str_in_output=self.include_stop_str_in_output,
             truncate_prompt_tokens=self.truncate_prompt_tokens,
-            output_kind=RequestOutputKind.DELTA if self.stream \
-                else RequestOutputKind.FINAL_ONLY,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
             structured_outputs=self.structured_outputs,
             logit_bias=self.logit_bias,
             bad_words=self.bad_words,
@@ -787,85 +922,11 @@ class ChatCompletionRequest(OpenAIBaseModel):
             extra_args=extra_args or None,
         )
 
-    def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
-        # user has chosen to not use any tool
-        if self.tool_choice == "none" or self.tools is None:
-            return None
-
-        # user has chosen to use a named tool
-        if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
-            tool_name = self.tool_choice.function.name
-            tools = {tool.function.name: tool.function for tool in self.tools}
-            if tool_name not in tools:
-                raise ValueError(
-                    f"Tool '{tool_name}' has not been passed in `tools`.")
-            tool = tools[tool_name]
-            return tool.parameters
-
-        if self.tool_choice == "required":
-            # Pydantic schema generation cannot be used since the JSON schema
-            # has to be constructed for a specific instantiation of a tool list
-            # so that parameters of a function are correctly generated
-            # based on the chosen function name
-            def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
-                return {
-                    "properties": {
-                        "name": {
-                            "type": "string",
-                            "enum": [tool.function.name]
-                        },
-                        # parameters are always generated as '{}' in the final
-                        # output if they are missing from the request
-                        # (i.e. are None or '{}') so the schema is
-                        # updated to produce an empty object in that case
-                        "parameters": tool.function.parameters
-                        if tool.function.parameters else {
-                            "type": "object",
-                            "properties": {}
-                        }
-                    },
-                    "required": ["name", "parameters"]
-                }
-
-            def get_tool_schema_defs(
-                    tools: list[ChatCompletionToolsParam]) -> dict:
-                all_defs = dict[str, dict[str, Any]]()
-                for tool in tools:
-                    if tool.function.parameters is None:
-                        continue
-                    defs = tool.function.parameters.pop("$defs", {})
-                    for def_name, def_schema in defs.items():
-                        if def_name in all_defs and all_defs[
-                                def_name] != def_schema:
-                            raise ValueError(
-                                f"Tool definition '{def_name}' has "
-                                "multiple schemas, which is not "
-                                "supported.")
-                        else:
-                            all_defs[def_name] = def_schema
-                return all_defs
-
-            json_schema = {
-                "type": "array",
-                "minItems": 1,
-                "items": {
-                    "type": "object",
-                    "anyOf": [get_tool_schema(tool) for tool in self.tools]
-                }
-            }
-            json_schema_defs = get_tool_schema_defs(self.tools)
-            if json_schema_defs:
-                json_schema["$defs"] = json_schema_defs
-            return json_schema
-
-        return None
-
     @model_validator(mode="before")
     @classmethod
     def validate_stream_options(cls, data):
         if data.get("stream_options") and not data.get("stream"):
-            raise ValueError(
-                "Stream options can only be defined when `stream=True`.")
+            raise ValueError("Stream options can only be defined when `stream=True`.")
 
         return data
 
@@ -873,24 +934,18 @@ class ChatCompletionRequest(OpenAIBaseModel):
     @classmethod
     def check_logprobs(cls, data):
         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
-            if data.get("stream") and (prompt_logprobs > 0
-                                       or prompt_logprobs == -1):
+            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
                 raise ValueError(
-                    "`prompt_logprobs` are not available when `stream=True`.")
+                    "`prompt_logprobs` are not available when `stream=True`."
+                )
 
             if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise ValueError(
-                    "`prompt_logprobs` must be a positive value or -1.")
-            if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
-                raise ValueError("`prompt_logprobs=-1` is only supported with "
-                                 "vLLM engine V1.")
+                raise ValueError("`prompt_logprobs` must be a positive value or -1.")
         if (top_logprobs := data.get("top_logprobs")) is not None:
             if top_logprobs < 0 and top_logprobs != -1:
-                raise ValueError(
-                    "`top_logprobs` must be a positive value or -1.")
+                raise ValueError("`top_logprobs` must be a positive value or -1.")
 
-            if (top_logprobs == -1
-                    or top_logprobs > 0) and not data.get("logprobs"):
+            if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
                 raise ValueError(
                     "when using `top_logprobs`, `logprobs` must be set to true."
                 )
@@ -906,30 +961,32 @@ class ChatCompletionRequest(OpenAIBaseModel):
         if data.get("structured_outputs", None) is None:
             return data
 
-        structured_outputs_kwargs = data['structured_outputs']
+        structured_outputs_kwargs = data["structured_outputs"]
         count = sum(
             structured_outputs_kwargs.get(k) is not None
-            for k in ("json", "regex", "choice"))
+            for k in ("json", "regex", "choice")
+        )
         # you can only use one kind of constraints for structured outputs
         if count > 1:
             raise ValueError(
                 "You can only use one kind of constraints for structured "
-                "outputs ('json', 'regex' or 'choice').")
+                "outputs ('json', 'regex' or 'choice')."
+            )
         # you can only either use structured outputs or tools, not both
         if count > 1 and data.get("tool_choice", "none") not in (
-                "none",
-                "auto",
-                "required",
+            "none",
+            "auto",
+            "required",
         ):
             raise ValueError(
                 "You can only either use constraints for structured outputs "
-                "or tools, not both.")
+                "or tools, not both."
+            )
         return data
 
     @model_validator(mode="before")
     @classmethod
     def check_tool_usage(cls, data):
-
         # if "tool_choice" is not specified but tools are provided,
         # default to "auto" tool_choice
         if "tool_choice" not in data and data.get("tools"):
@@ -941,52 +998,58 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
         # if "tool_choice" is specified -- validation
         if "tool_choice" in data and data["tool_choice"] is not None:
-
             # ensure that if "tool choice" is specified, tools are present
             if "tools" not in data or data["tools"] is None:
-                raise ValueError(
-                    "When using `tool_choice`, `tools` must be set.")
+                raise ValueError("When using `tool_choice`, `tools` must be set.")
 
             # make sure that tool choice is either a named tool
             # OR that it's set to "auto" or "required"
-            if data["tool_choice"] not in [
-                    "auto", "required"
-            ] and not isinstance(data["tool_choice"], dict):
+            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+                data["tool_choice"], dict
+            ):
                 raise ValueError(
-                    f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
-                    'Only named tools, "none", "auto" or "required" '\
-                    'are supported.'
+                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                    'Only named tools, "none", "auto" or "required" '
+                    "are supported."
                 )
 
             # if tool_choice is "required" but the "tools" list is empty,
             # override the data to behave like "none" to align with
             # OpenAI’s behavior.
-            if data["tool_choice"] == "required" and isinstance(
-                    data["tools"], list) and len(data["tools"]) == 0:
+            if (
+                data["tool_choice"] == "required"
+                and isinstance(data["tools"], list)
+                and len(data["tools"]) == 0
+            ):
                 data["tool_choice"] = "none"
                 del data["tools"]
                 return data
 
             # ensure that if "tool_choice" is specified as an object,
             # it matches a valid tool
-            correct_usage_message = 'Correct usage: `{"type": "function",' \
+            correct_usage_message = (
+                'Correct usage: `{"type": "function",'
                 ' "function": {"name": "my_function"}}`'
+            )
             if isinstance(data["tool_choice"], dict):
                 valid_tool = False
                 function = data["tool_choice"].get("function")
                 if not isinstance(function, dict):
                     raise ValueError(
                         f"Invalid value for `function`: `{function}` in "
-                        f"`tool_choice`! {correct_usage_message}")
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
                 if "name" not in function:
-                    raise ValueError(f"Expected field `name` in `function` in "
-                                     f"`tool_choice`! {correct_usage_message}")
+                    raise ValueError(
+                        f"Expected field `name` in `function` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
                 function_name = function["name"]
-                if not isinstance(function_name,
-                                  str) or len(function_name) == 0:
+                if not isinstance(function_name, str) or len(function_name) == 0:
                     raise ValueError(
                         f"Invalid `name` in `function`: `{function_name}`"
-                        f" in `tool_choice`! {correct_usage_message}")
+                        f" in `tool_choice`! {correct_usage_message}"
+                    )
                 for tool in data["tools"]:
                     if tool["function"]["name"] == function_name:
                         valid_tool = True
@@ -994,81 +1057,81 @@ class ChatCompletionRequest(OpenAIBaseModel):
                 if not valid_tool:
                     raise ValueError(
                         "The tool specified in `tool_choice` does not match any"
-                        " of the specified `tools`")
+                        " of the specified `tools`"
+                    )
         return data
 
     @model_validator(mode="before")
     @classmethod
     def check_generation_prompt(cls, data):
-        if data.get("continue_final_message") and data.get(
-                "add_generation_prompt"):
-            raise ValueError("Cannot set both `continue_final_message` and "
-                             "`add_generation_prompt` to True.")
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
         return data
 
     @model_validator(mode="before")
     @classmethod
     def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None:
-            if not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Parameter 'cache_salt' is not supported with "
-                    "this instance of vLLM, which uses engine V0.")
-            if not isinstance(data["cache_salt"],
-                              str) or not data["cache_salt"]:
-                raise ValueError("Parameter 'cache_salt' must be a "
-                                 "non-empty string if provided.")
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
         return data
 
 
 class CompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: Optional[str] = None
-    prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
-    best_of: Optional[int] = None
-    echo: Optional[bool] = False
-    frequency_penalty: Optional[float] = 0.0
-    logit_bias: Optional[dict[str, float]] = None
-    logprobs: Optional[int] = None
-    max_tokens: Optional[int] = 16
+    model: str | None = None
+    prompt: list[int] | list[list[int]] | str | list[str] | None = None
+    best_of: int | None = None
+    echo: bool | None = False
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: int | None = None
+    max_tokens: int | None = 16
     n: int = 1
-    presence_penalty: Optional[float] = 0.0
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = []
-    stream: Optional[bool] = False
-    stream_options: Optional[StreamOptions] = None
-    suffix: Optional[str] = None
-    temperature: Optional[float] = None
-    top_p: Optional[float] = None
-    user: Optional[str] = None
+    presence_penalty: float | None = 0.0
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    suffix: str | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    user: str | None = None
 
     # --8<-- [start:completion-sampling-params]
     use_beam_search: bool = False
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    repetition_penalty: Optional[float] = None
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = []
+    stop_token_ids: list[int] | None = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
-    allowed_token_ids: Optional[list[int]] = None
-    prompt_logprobs: Optional[int] = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    allowed_token_ids: list[int] | None = None
+    prompt_logprobs: int | None = None
     # --8<-- [end:completion-sampling-params]
 
     # --8<-- [start:completion-extra-params]
-    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+    prompt_embeds: bytes | list[bytes] | None = None
     add_special_tokens: bool = Field(
         default=True,
         description=(
             "If true (the default), special tokens (e.g. BOS) will be added to "
-            "the prompt."),
+            "the prompt."
+        ),
     )
-    response_format: Optional[AnyResponseFormat] = Field(
+    response_format: AnyResponseFormat | None = Field(
         default=None,
         description=(
             "Similar to chat completion, this parameter specifies the format "
@@ -1076,46 +1139,55 @@ class CompletionRequest(OpenAIBaseModel):
             ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
         ),
     )
-    structured_outputs: Optional[StructuredOutputsParams] = Field(
+    structured_outputs: StructuredOutputsParams | None = Field(
         default=None,
         description="Additional kwargs for structured outputs",
     )
-    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+    guided_json: str | dict | BaseModel | None = Field(
         default=None,
         description=(
             "`guided_json` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `json` to `structured_outputs` instead."),
+            "Please pass `json` to `structured_outputs` instead."
+        ),
     )
-    guided_regex: Optional[str] = Field(
+    guided_regex: str | None = Field(
         default=None,
         description=(
             "`guided_regex` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `regex` to `structured_outputs` instead."),
+            "Please pass `regex` to `structured_outputs` instead."
+        ),
     )
-    guided_choice: Optional[list[str]] = Field(
+    guided_choice: list[str] | None = Field(
         default=None,
         description=(
             "`guided_choice` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `choice` to `structured_outputs` instead."),
+            "Please pass `choice` to `structured_outputs` instead."
+        ),
     )
-    guided_grammar: Optional[str] = Field(
+    guided_grammar: str | None = Field(
         default=None,
         description=(
             "`guided_grammar` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please pass `grammar` to `structured_outputs` instead."),
+            "Please pass `grammar` to `structured_outputs` instead."
+        ),
+    )
+    structural_tag: str | None = Field(
+        default=None,
+        description=("If specified, the output will follow the structural tag schema."),
     )
-    guided_decoding_backend: Optional[str] = Field(
+    guided_decoding_backend: str | None = Field(
         default=None,
         description=(
             "`guided_decoding_backend` is deprecated. "
             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
-            "Please remove it from your request."),
+            "Please remove it from your request."
+        ),
     )
-    guided_whitespace_pattern: Optional[str] = Field(
+    guided_whitespace_pattern: str | None = Field(
         default=None,
         description=(
             "`guided_whitespace_pattern` is deprecated. "
@@ -1128,16 +1200,18 @@ class CompletionRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
     request_id: str = Field(
         default_factory=lambda: f"{random_uuid()}",
         description=(
             "The request_id related to this request. If the caller does "
             "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."),
+            "through out the inference process and return in response."
+        ),
     )
-    logits_processors: Optional[LogitsProcessors] = Field(
+    logits_processors: LogitsProcessors | None = Field(
         default=None,
         description=(
             "A list of either qualified names of logits processors, or "
@@ -1147,24 +1221,30 @@ class CompletionRequest(OpenAIBaseModel):
             "'args' and 'kwargs' fields containing positional and keyword "
             "arguments. For example: {'qualname': "
             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
-            "{'param': 'value'}}."))
+            "{'param': 'value'}}."
+        ),
+    )
 
-    return_tokens_as_token_ids: Optional[bool] = Field(
+    return_tokens_as_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified with 'logprobs', tokens are represented "
             " as strings of the form 'token_id:{token_id}' so that tokens "
-            "that are not JSON-encodable can be identified."))
-    return_token_ids: Optional[bool] = Field(
+            "that are not JSON-encodable can be identified."
+        ),
+    )
+    return_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified, the result will include token IDs alongside the "
             "generated text. In streaming mode, prompt_token_ids is included "
             "only in the first chunk, and token_ids contains the delta tokens "
             "for each chunk. This is useful for debugging or when you "
-            "need to map generated text back to input tokens."))
+            "need to map generated text back to input tokens."
+        ),
+    )
 
-    cache_salt: Optional[str] = Field(
+    cache_salt: str | None = Field(
         default=None,
         description=(
             "If specified, the prefix cache will be salted with the provided "
@@ -1172,16 +1252,21 @@ class CompletionRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit). Not supported by vLLM engine V0."))
+            "to 256 bit). Not supported by vLLM engine V0."
+        ),
+    )
 
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
+    kv_transfer_params: dict[str, Any] | None = Field(
         default=None,
-        description="KVTransfer parameters used for disaggregated serving.")
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
 
-    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+    vllm_xargs: dict[str, str | int | float] | None = Field(
         default=None,
-        description=("Additional request parameters with string or "
-                     "numeric values, used by custom extensions."),
+        description=(
+            "Additional request parameters with string or "
+            "numeric values, used by custom extensions."
+        ),
     )
 
     # --8<-- [end:completion-extra-params]
@@ -1198,9 +1283,8 @@ class CompletionRequest(OpenAIBaseModel):
     def to_beam_search_params(
         self,
         max_tokens: int,
-        default_sampling_params: Optional[dict] = None,
+        default_sampling_params: dict | None = None,
     ) -> BeamSearchParams:
-
         if default_sampling_params is None:
             default_sampling_params = {}
         n = self.n if self.n is not None else 1
@@ -1220,10 +1304,9 @@ class CompletionRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
         max_tokens: int,
-        logits_processor_pattern: Optional[str],
-        default_sampling_params: Optional[dict] = None,
+        logits_processor_pattern: str | None,
+        default_sampling_params: dict | None = None,
     ) -> SamplingParams:
-
         if default_sampling_params is None:
             default_sampling_params = {}
 
@@ -1235,16 +1318,20 @@ class CompletionRequest(OpenAIBaseModel):
             )
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
         if (top_p := self.top_p) is None:
             top_p = default_sampling_params.get(
-                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
         if (top_k := self.top_k) is None:
             top_k = default_sampling_params.get(
-                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
         if (min_p := self.min_p) is None:
             min_p = default_sampling_params.get(
-                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
 
         prompt_logprobs = self.prompt_logprobs
         if prompt_logprobs is None and self.echo:
@@ -1252,10 +1339,27 @@ class CompletionRequest(OpenAIBaseModel):
 
         echo_without_generation = self.echo and self.max_tokens == 0
 
+        guided_json_object = None
+        if self.response_format is not None:
+            if self.response_format.type == "json_object":
+                guided_json_object = True
+            elif self.response_format.type == "json_schema":
+                json_schema = self.response_format.json_schema
+                assert json_schema is not None
+                self.guided_json = json_schema.json_schema
+            elif self.response_format.type == "structural_tag":
+                structural_tag = self.response_format
+                assert structural_tag is not None and isinstance(
+                    structural_tag, StructuralTagResponseFormat
+                )
+                s_tag_obj = structural_tag.model_dump(by_alias=True)
+                self.structural_tag = json.dumps(s_tag_obj)
+
         # Forward deprecated guided_* parameters to structured_outputs
         if self.structured_outputs is None:
             kwargs = dict[str, Any](
                 json=self.guided_json,
+                json_object=guided_json_object,
                 regex=self.guided_regex,
                 choice=self.guided_choice,
                 grammar=self.guided_grammar,
@@ -1265,11 +1369,6 @@ class CompletionRequest(OpenAIBaseModel):
             if len(kwargs) > 0:
                 self.structured_outputs = StructuredOutputsParams(**kwargs)
 
-        if (self.structured_outputs is not None
-                and self.response_format is not None
-                and self.response_format.type == "json_object"):
-            self.structured_outputs.json_object = True
-
         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
         if self.kv_transfer_params:
             # Pass in kv_transfer_params via extra_args
@@ -1295,16 +1394,18 @@ class CompletionRequest(OpenAIBaseModel):
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
             include_stop_str_in_output=self.include_stop_str_in_output,
-            logits_processors=get_logits_processors(self.logits_processors,
-                                                    logits_processor_pattern),
+            logits_processors=get_logits_processors(
+                self.logits_processors, logits_processor_pattern
+            ),
             truncate_prompt_tokens=self.truncate_prompt_tokens,
-            output_kind=RequestOutputKind.DELTA if self.stream \
-                else RequestOutputKind.FINAL_ONLY,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
             structured_outputs=self.structured_outputs,
             logit_bias=self.logit_bias,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
-            )
+        )
 
     @model_validator(mode="before")
     @classmethod
@@ -1312,31 +1413,29 @@ class CompletionRequest(OpenAIBaseModel):
         if data.get("structured_outputs", None) is None:
             return data
 
-        structured_outputs_kwargs = data['structured_outputs']
+        structured_outputs_kwargs = data["structured_outputs"]
         count = sum(
             structured_outputs_kwargs.get(k) is not None
-            for k in ("json", "regex", "choice"))
+            for k in ("json", "regex", "choice")
+        )
         if count > 1:
             raise ValueError(
                 "You can only use one kind of constraints for structured "
-                "outputs ('json', 'regex' or 'choice').")
+                "outputs ('json', 'regex' or 'choice')."
+            )
         return data
 
     @model_validator(mode="before")
     @classmethod
     def check_logprobs(cls, data):
         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
-            if data.get("stream") and (prompt_logprobs > 0
-                                       or prompt_logprobs == -1):
+            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
                 raise ValueError(
-                    "`prompt_logprobs` are not available when `stream=True`.")
+                    "`prompt_logprobs` are not available when `stream=True`."
+                )
 
             if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise ValueError(
-                    "`prompt_logprobs` must be a positive value or -1.")
-            if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
-                raise ValueError("`prompt_logprobs=-1` is only supported with "
-                                 "vLLM engine V1.")
+                raise ValueError("`prompt_logprobs` must be a positive value or -1.")
         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
             raise ValueError("`logprobs` must be a positive value.")
 
@@ -1346,8 +1445,7 @@ class CompletionRequest(OpenAIBaseModel):
     @classmethod
     def validate_stream_options(cls, data):
         if data.get("stream_options") and not data.get("stream"):
-            raise ValueError(
-                "Stream options can only be defined when `stream=True`.")
+            raise ValueError("Stream options can only be defined when `stream=True`.")
 
         return data
 
@@ -1357,11 +1455,10 @@ class CompletionRequest(OpenAIBaseModel):
         prompt = data.get("prompt")
         prompt_embeds = data.get("prompt_embeds")
 
-        prompt_is_empty = (prompt is None
-                           or (isinstance(prompt, str) and prompt == ""))
-        embeds_is_empty = (prompt_embeds is None
-                           or (isinstance(prompt_embeds, list)
-                               and len(prompt_embeds) == 0))
+        prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
+        embeds_is_empty = prompt_embeds is None or (
+            isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
+        )
 
         if prompt_is_empty and embeds_is_empty:
             raise ValueError(
@@ -1373,76 +1470,96 @@ class CompletionRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     @classmethod
     def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None:
-            if not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Parameter 'cache_salt' is not supported with "
-                    "this instance of vLLM, which uses engine V0.")
-            if not isinstance(data["cache_salt"],
-                              str) or not data["cache_salt"]:
-                raise ValueError("Parameter 'cache_salt' must be a "
-                                 "non-empty string if provided.")
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
         return data
 
 
 class EmbeddingCompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings
-    model: Optional[str] = None
-    input: Union[list[int], list[list[int]], str, list[str]]
-    encoding_format: Literal["float", "base64"] = "float"
-    dimensions: Optional[int] = None
-    user: Optional[str] = None
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    model: str | None = None
+    input: list[int] | list[list[int]] | str | list[str]
+    encoding_format: EncodingFormat = "float"
+    dimensions: int | None = None
+    user: str | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:embedding-extra-params]
     add_special_tokens: bool = Field(
         default=True,
         description=(
             "If true (the default), special tokens (e.g. BOS) will be added to "
-            "the prompt."),
+            "the prompt."
+        ),
     )
     priority: int = Field(
         default=0,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
     request_id: str = Field(
         default_factory=lambda: f"{random_uuid()}",
         description=(
             "The request_id related to this request. If the caller does "
             "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."),
+            "through out the inference process and return in response."
+        ),
+    )
+    normalize: bool | None = Field(
+        default=None,
+        description="Whether to normalize the embeddings outputs. Default is True.",
+    )
+    embed_dtype: EmbedDType = Field(
+        default="float32",
+        description=(
+            "What dtype to use for encoding. Default to using float32 for base64 "
+            "encoding to match the OpenAI python client behavior. "
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    endianness: Endianness = Field(
+        default="native",
+        description=(
+            "What endianness to use for encoding. Default to using native for "
+            "base64 encoding to match the OpenAI python client behavior."
+            "This parameter will affect base64 and binary_response."
+        ),
     )
-    normalize: Optional[bool] = None
-
     # --8<-- [end:embedding-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize)
+            normalize=self.normalize,
+        )
 
 
 class EmbeddingChatRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     messages: list[ChatCompletionMessageParam]
 
-    encoding_format: Literal["float", "base64"] = "float"
-    dimensions: Optional[int] = None
-    user: Optional[str] = None
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    encoding_format: EncodingFormat = "float"
+    dimensions: int | None = None
+    user: str | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:chat-embedding-extra-params]
     add_generation_prompt: bool = Field(
         default=False,
-        description=
-        ("If true, the generation prompt will be added to the chat template. "
-         "This is a parameter used by chat template in tokenizer config of the "
-         "model."),
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
     )
 
     add_special_tokens: bool = Field(
@@ -1452,23 +1569,26 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "on top of what is added by the chat template. "
             "For most models, the chat template takes care of adding the "
             "special tokens so this should be set to false (as is the "
-            "default)."),
+            "default)."
+        ),
     )
-    chat_template: Optional[str] = Field(
+    chat_template: str | None = Field(
         default=None,
         description=(
             "A Jinja template to use for this conversion. "
             "As of transformers v4.44, default chat template is no longer "
             "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."),
+            "does not define one."
+        ),
     )
-    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+    chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=(
             "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."),
+            "Will be accessible by the chat template."
+        ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -1477,44 +1597,117 @@ class EmbeddingChatRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
     request_id: str = Field(
         default_factory=lambda: f"{random_uuid()}",
         description=(
             "The request_id related to this request. If the caller does "
             "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."),
+            "through out the inference process and return in response."
+        ),
+    )
+    normalize: bool | None = Field(
+        default=None,
+        description="Whether to normalize the embeddings outputs. Default is True.",
+    )
+    embed_dtype: EmbedDType = Field(
+        default="float32",
+        description=(
+            "What dtype to use for encoding. Default to using float32 for base64 "
+            "encoding to match the OpenAI python client behavior. "
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    endianness: Endianness = Field(
+        default="native",
+        description=(
+            "What endianness to use for encoding. Default to using native for "
+            "base64 encoding to match the OpenAI python client behavior."
+            "This parameter will affect base64 and binary_response."
+        ),
     )
-    normalize: Optional[bool] = None
     # --8<-- [end:chat-embedding-extra-params]
 
     @model_validator(mode="before")
     @classmethod
     def check_generation_prompt(cls, data):
-        if data.get("continue_final_message") and data.get(
-                "add_generation_prompt"):
-            raise ValueError("Cannot set both `continue_final_message` and "
-                             "`add_generation_prompt` to True.")
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
         return data
 
     def to_pooling_params(self):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize)
+            normalize=self.normalize,
+        )
+
 
+EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest
 
-EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
 
-PoolingCompletionRequest = EmbeddingCompletionRequest
-PoolingChatRequest = EmbeddingChatRequest
+class PoolingCompletionRequest(EmbeddingCompletionRequest):
+    task: PoolingTask | None = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "If it is a classify or token_classify task, the default is True; "
+        "for other tasks, this value should be None.",
+    )
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            dimensions=self.dimensions,
+            normalize=self.normalize,
+            use_activation=get_use_activation(self),
+        )
+
+
+class PoolingChatRequest(EmbeddingChatRequest):
+    task: PoolingTask | None = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "If it is a classify or token_classify task, the default is True; "
+        "for other tasks, this value should be None.",
+    )
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            dimensions=self.dimensions,
+            normalize=self.normalize,
+            use_activation=get_use_activation(self),
+        )
+
 
 T = TypeVar("T")
 
 
 class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
-    model: Optional[str] = None
+    model: str | None = None
 
     priority: int = Field(default=0)
     """
@@ -1523,19 +1716,32 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
     if the served model does not use priority scheduling.
     """
     data: T
-    """
-    When using plugins IOProcessor plugins, the actual input is processed
-    by the plugin itself. Hence, we use a generic type for the request data
-    """
-    softmax: bool = True
+
+    task: PoolingTask = "plugin"
+    encoding_format: EncodingFormat = "float"
+    embed_dtype: EmbedDType = Field(
+        default="float32",
+        description=(
+            "What dtype to use for encoding. Default to using float32 for base64 "
+            "encoding to match the OpenAI python client behavior. "
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
+    endianness: Endianness = Field(
+        default="native",
+        description=(
+            "What endianness to use for encoding. Default to using native for "
+            "base64 encoding to match the OpenAI python client behavior."
+            "This parameter will affect base64 and binary_response."
+        ),
+    )
 
     def to_pooling_params(self):
-        return PoolingParams(task="encode", softmax=self.softmax)
+        return PoolingParams()
 
 
 class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
-
-    request_id: Optional[str] = None
+    request_id: str | None = None
     """
     The request_id associated with this response
     """
@@ -1548,19 +1754,20 @@ class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
     """
 
 
-PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest,
-                       IOProcessorRequest]
+PoolingRequest: TypeAlias = (
+    PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest
+)
 
 
 class ScoreRequest(OpenAIBaseModel):
-    model: Optional[str] = None
-    text_1: Union[list[str], str, ScoreMultiModalParam]
-    text_2: Union[list[str], str, ScoreMultiModalParam]
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    model: str | None = None
+    text_1: list[str] | str | ScoreMultiModalParam
+    text_2: list[str] | str | ScoreMultiModalParam
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:score-extra-params]
 
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -1570,29 +1777,44 @@ class ScoreRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
 
-    activation: Optional[bool] = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
 
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
     # --8<-- [end:score-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
-            activation=self.activation)
+            use_activation=get_use_activation(self),
+        )
 
 
 class RerankRequest(OpenAIBaseModel):
-    model: Optional[str] = None
-    query: Union[str, ScoreMultiModalParam]
-    documents: Union[list[str], ScoreMultiModalParam]
+    model: str | None = None
+    query: str | ScoreMultiModalParam
+    documents: list[str] | ScoreMultiModalParam
     top_n: int = Field(default_factory=lambda: 0)
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:rerank-extra-params]
 
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -1602,22 +1824,37 @@ class RerankRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
     )
 
-    activation: Optional[bool] = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
 
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
     # --8<-- [end:rerank-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
-            activation=self.activation)
+            use_activation=get_use_activation(self),
+        )
 
 
 class RerankDocument(BaseModel):
-    text: Optional[str] = None
-    multi_modal: Optional[ScoreContentPartParam] = None
+    text: str | None = None
+    multi_modal: ScoreContentPartParam | None = None
 
 
 class RerankResult(BaseModel):
@@ -1639,27 +1876,27 @@ class RerankResponse(OpenAIBaseModel):
 
 class CompletionLogProbs(OpenAIBaseModel):
     text_offset: list[int] = Field(default_factory=list)
-    token_logprobs: list[Optional[float]] = Field(default_factory=list)
+    token_logprobs: list[float | None] = Field(default_factory=list)
     tokens: list[str] = Field(default_factory=list)
-    top_logprobs: list[Optional[dict[str,
-                                     float]]] = Field(default_factory=list)
+    top_logprobs: list[dict[str, float] | None] = Field(default_factory=list)
 
 
 class CompletionResponseChoice(OpenAIBaseModel):
     index: int
     text: str
-    logprobs: Optional[CompletionLogProbs] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = Field(
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
         default=None,
         description=(
             "The stop string or token id that caused the completion "
             "to stop, None if the completion finished for some other reason "
-            "including encountering the EOS token"),
+            "including encountering the EOS token"
+        ),
     )
-    token_ids: Optional[list[int]] = None  # For response
-    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
-    prompt_token_ids: Optional[list[int]] = None  # For prompt
+    token_ids: list[int] | None = None  # For response
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None  # For prompt
 
 
 class CompletionResponse(OpenAIBaseModel):
@@ -1668,32 +1905,33 @@ class CompletionResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[CompletionResponseChoice]
-    service_tier: Optional[Literal["auto", "default", "flex", "scale",
-                                   "priority"]] = None
-    system_fingerprint: Optional[str] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
     usage: UsageInfo
 
     # vLLM-specific fields that are not in OpenAI spec
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
-        default=None, description="KVTransfer parameters.")
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
 
 
 class CompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
     text: str
-    logprobs: Optional[CompletionLogProbs] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = Field(
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
         default=None,
         description=(
             "The stop string or token id that caused the completion "
             "to stop, None if the completion finished for some other reason "
-            "including encountering the EOS token"),
+            "including encountering the EOS token"
+        ),
     )
     # not part of the OpenAI spec but for tracing the tokens
     # prompt tokens is put into choice to align with CompletionResponseChoice
-    prompt_token_ids: Optional[list[int]] = None
-    token_ids: Optional[list[int]] = None
+    prompt_token_ids: list[int] | None = None
+    token_ids: list[int] | None = None
 
 
 class CompletionStreamResponse(OpenAIBaseModel):
@@ -1702,13 +1940,13 @@ class CompletionStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[CompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
 
 
 class EmbeddingResponseData(OpenAIBaseModel):
     index: int
     object: str = "embedding"
-    embedding: Union[list[float], str]
+    embedding: list[float] | str
 
 
 class EmbeddingResponse(OpenAIBaseModel):
@@ -1720,10 +1958,16 @@ class EmbeddingResponse(OpenAIBaseModel):
     usage: UsageInfo
 
 
+class EmbeddingBytesResponse(OpenAIBaseModel):
+    body: list[bytes]
+    metadata: str
+    media_type: str = "application/octet-stream"
+
+
 class PoolingResponseData(OpenAIBaseModel):
     index: int
     object: str = "pooling"
-    data: Union[list[list[float]], list[float], str]
+    data: list[list[float]] | list[float] | str
 
 
 class PoolingResponse(OpenAIBaseModel):
@@ -1735,6 +1979,12 @@ class PoolingResponse(OpenAIBaseModel):
     usage: UsageInfo
 
 
+class PoolingBytesResponse(OpenAIBaseModel):
+    body: list[bytes]
+    metadata: str
+    media_type: str = "application/octet-stream"
+
+
 class ScoreResponseData(OpenAIBaseModel):
     index: int
     object: str = "score"
@@ -1750,11 +2000,11 @@ class ScoreResponse(OpenAIBaseModel):
     usage: UsageInfo
 
 
-class ClassificationRequest(OpenAIBaseModel):
-    model: Optional[str] = None
-    input: Union[list[str], str]
-    truncate_prompt_tokens: Optional[int] = None
-    user: Optional[str] = None
+class ClassificationCompletionRequest(OpenAIBaseModel):
+    model: str | None = None
+    input: list[str] | str
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    user: str | None = None
 
     # --8<-- [start:classification-extra-params]
     priority: int = Field(
@@ -1762,22 +2012,147 @@ class ClassificationRequest(OpenAIBaseModel):
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."),
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
     )
 
-    activation: Optional[bool] = None
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
 
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
     # --8<-- [end:classification-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
-            activation=self.activation)
+            use_activation=get_use_activation(self),
+        )
+
+
+class ClassificationChatRequest(OpenAIBaseModel):
+    model: str | None = None
+    messages: list[ChatCompletionMessageParam]
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    user: str | None = None
+
+    # --8<-- [start:chat-classification-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
+    # --8<-- [end:chat-classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            use_activation=get_use_activation(self),
+        )
+
+
+ClassificationRequest: TypeAlias = (
+    ClassificationCompletionRequest | ClassificationChatRequest
+)
 
 
 class ClassificationData(OpenAIBaseModel):
     index: int
-    label: Optional[str]
+    label: str | None
     probs: list[float]
     num_classes: int
 
@@ -1803,16 +2178,16 @@ class ToolCall(OpenAIBaseModel):
 
 
 class DeltaFunctionCall(BaseModel):
-    name: Optional[str] = None
-    arguments: Optional[str] = None
+    name: str | None = None
+    arguments: str | None = None
 
 
 # a tool call delta where everything is optional
 class DeltaToolCall(OpenAIBaseModel):
-    id: Optional[str] = None
-    type: Optional[Literal["function"]] = None
+    id: str | None = None
+    type: Literal["function"] | None = None
     index: int
-    function: Optional[DeltaFunctionCall] = None
+    function: DeltaFunctionCall | None = None
 
 
 class ExtractedToolCallInformation(BaseModel):
@@ -1824,50 +2199,58 @@ class ExtractedToolCallInformation(BaseModel):
 
     # content - per OpenAI spec, content AND tool calls can be returned rarely
     # But some models will do this intentionally
-    content: Optional[str] = None
+    content: str | None = None
 
 
 class ChatMessage(OpenAIBaseModel):
     role: str
-    content: Optional[str] = None
-    refusal: Optional[str] = None
-    annotations: Optional[OpenAIAnnotation] = None
-    audio: Optional[OpenAIChatCompletionAudio] = None
-    function_call: Optional[FunctionCall] = None
+    content: str | None = None
+    refusal: str | None = None
+    annotations: OpenAIAnnotation | None = None
+    audio: OpenAIChatCompletionAudio | None = None
+    function_call: FunctionCall | None = None
     tool_calls: list[ToolCall] = Field(default_factory=list)
 
     # vLLM-specific fields that are not in OpenAI spec
-    reasoning_content: Optional[str] = None
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
 
 
 class ChatCompletionLogProb(OpenAIBaseModel):
     token: str
     logprob: float = -9999.0
-    bytes: Optional[list[int]] = None
+    bytes: list[int] | None = None
 
 
 class ChatCompletionLogProbsContent(ChatCompletionLogProb):
     # Workaround: redefine fields name cache so that it's not
     # shared with the super class.
-    field_names: ClassVar[Optional[set[str]]] = None
+    field_names: ClassVar[set[str] | None] = None
     top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
 
 
 class ChatCompletionLogProbs(OpenAIBaseModel):
-    content: Optional[list[ChatCompletionLogProbsContent]] = None
+    content: list[ChatCompletionLogProbsContent] | None = None
 
 
 class ChatCompletionResponseChoice(OpenAIBaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[ChatCompletionLogProbs] = None
+    logprobs: ChatCompletionLogProbs | None = None
     # per OpenAI spec this is the default
-    finish_reason: Optional[str] = "stop"
+    finish_reason: str | None = "stop"
     # not part of the OpenAI spec but included in vLLM for legacy reasons
-    stop_reason: Optional[Union[int, str]] = None
+    stop_reason: int | str | None = None
     # not part of the OpenAI spec but is useful for tracing the tokens
     # in agent scenarios
-    token_ids: Optional[list[int]] = None
+    token_ids: list[int] | None = None
 
 
 class ChatCompletionResponse(OpenAIBaseModel):
@@ -1876,33 +2259,41 @@ class ChatCompletionResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[ChatCompletionResponseChoice]
-    service_tier: Optional[Literal["auto", "default", "flex", "scale",
-                                   "priority"]] = None
-    system_fingerprint: Optional[str] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
     usage: UsageInfo
 
     # vLLM-specific fields that are not in OpenAI spec
-    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
-    prompt_token_ids: Optional[list[int]] = None
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
-        default=None, description="KVTransfer parameters.")
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
 
 
 class DeltaMessage(OpenAIBaseModel):
-    role: Optional[str] = None
-    content: Optional[str] = None
-    reasoning_content: Optional[str] = None
+    role: str | None = None
+    content: str | None = None
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
 
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
     delta: DeltaMessage
-    logprobs: Optional[ChatCompletionLogProbs] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = None
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
     # not part of the OpenAI spec but for tracing the tokens
-    token_ids: Optional[list[int]] = None
+    token_ids: list[int] | None = None
 
 
 class ChatCompletionStreamResponse(OpenAIBaseModel):
@@ -1911,15 +2302,15 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[ChatCompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
     # not part of the OpenAI spec but for tracing the tokens
-    prompt_token_ids: Optional[list[int]] = None
+    prompt_token_ids: list[int] | None = None
 
 
 class TranscriptionResponseStreamChoice(OpenAIBaseModel):
     delta: DeltaMessage
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
 
 
 class TranscriptionStreamResponse(OpenAIBaseModel):
@@ -1928,16 +2319,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[TranscriptionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
 
 
 class InputTokensDetails(OpenAIBaseModel):
     cached_tokens: int
+    input_tokens_per_turn: list[int] = Field(default_factory=list)
+    cached_tokens_per_turn: list[int] = Field(default_factory=list)
 
 
 class OutputTokensDetails(OpenAIBaseModel):
     reasoning_tokens: int = 0
     tool_output_tokens: int = 0
+    output_tokens_per_turn: list[int] = Field(default_factory=list)
+    tool_output_tokens_per_turn: list[int] = Field(default_factory=list)
 
 
 class ResponseUsage(OpenAIBaseModel):
@@ -1948,21 +2343,36 @@ class ResponseUsage(OpenAIBaseModel):
     total_tokens: int
 
 
+def serialize_message(msg):
+    """
+    Serializes a single message
+    """
+    if isinstance(msg, dict):
+        return msg
+    elif hasattr(msg, "to_dict"):
+        return msg.to_dict()
+    else:
+        # fallback to pyandic dump
+        return msg.model_dump_json()
+
+
+def serialize_messages(msgs):
+    """
+    Serializes multiple messages
+    """
+    return [serialize_message(msg) for msg in msgs] if msgs else None
+
+
 class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
     # error: Optional[ResponseError] = None
-    incomplete_details: Optional[IncompleteDetails] = None
-    instructions: Optional[str] = None
-    metadata: Optional[Metadata] = None
+    incomplete_details: IncompleteDetails | None = None
+    instructions: str | None = None
+    metadata: Metadata | None = None
     model: str
     object: Literal["response"] = "response"
     output: list[ResponseOutputItem]
-    # These are populated when enable_response_messages is set to True
-    # TODO: Currently an issue where content of harmony messages
-    # is not available when these are serialized. Metadata is available
-    input_messages: Optional[list[ChatCompletionMessageParam]] = None
-    output_messages: Optional[list[ChatCompletionMessageParam]] = None
     parallel_tool_calls: bool
     temperature: float
     tool_choice: ToolChoice
@@ -1970,17 +2380,38 @@ class ResponsesResponse(OpenAIBaseModel):
     top_p: float
     background: bool
     max_output_tokens: int
-    max_tool_calls: Optional[int] = None
-    previous_response_id: Optional[str] = None
-    prompt: Optional[ResponsePrompt] = None
-    reasoning: Optional[Reasoning] = None
+    max_tool_calls: int | None = None
+    previous_response_id: str | None = None
+    prompt: ResponsePrompt | None = None
+    reasoning: Reasoning | None = None
     service_tier: Literal["auto", "default", "flex", "scale", "priority"]
     status: ResponseStatus
-    text: Optional[ResponseTextConfig] = None
-    top_logprobs: Optional[int] = None
+    text: ResponseTextConfig | None = None
+    top_logprobs: int | None = None
     truncation: Literal["auto", "disabled"]
-    usage: Optional[ResponseUsage] = None
-    user: Optional[str] = None
+    usage: ResponseUsage | None = None
+    user: str | None = None
+
+    # --8<-- [start:responses-extra-params]
+    # These are populated when enable_response_messages is set to True
+    # NOTE: custom serialization is needed
+    # see serialize_input_messages and serialize_output_messages
+    input_messages: list[ChatCompletionMessageParam] | None = None
+    output_messages: list[ChatCompletionMessageParam] | None = None
+    # --8<-- [end:responses-extra-params]
+
+    # NOTE: openAI harmony doesn't serialize TextContent properly,
+    # TODO: this fixes for TextContent, but need to verify for tools etc
+    # https://github.com/openai/harmony/issues/78
+    @field_serializer("output_messages", when_used="json")
+    def serialize_output_messages(self, msgs, _info):
+        return serialize_messages(msgs)
+
+    # NOTE: openAI harmony doesn't serialize TextContent properly, this fixes it
+    # https://github.com/openai/harmony/issues/78
+    @field_serializer("input_messages", when_used="json")
+    def serialize_input_messages(self, msgs, _info):
+        return serialize_messages(msgs)
 
     @classmethod
     def from_request(
@@ -1991,14 +2422,13 @@ class ResponsesResponse(OpenAIBaseModel):
         created_time: int,
         output: list[ResponseOutputItem],
         status: ResponseStatus,
-        usage: Optional[ResponseUsage] = None,
-        input_messages: Optional[list[ChatCompletionMessageParam]] = None,
-        output_messages: Optional[list[ChatCompletionMessageParam]] = None,
+        usage: ResponseUsage | None = None,
+        input_messages: list[ChatCompletionMessageParam] | None = None,
+        output_messages: list[ChatCompletionMessageParam] | None = None,
     ) -> "ResponsesResponse":
-
-        incomplete_details: Optional[IncompleteDetails] = None
-        if status == 'incomplete':
-            incomplete_details = IncompleteDetails(reason='max_output_tokens')
+        incomplete_details: IncompleteDetails | None = None
+        if status == "incomplete":
+            incomplete_details = IncompleteDetails(reason="max_output_tokens")
         # TODO: implement the other reason for incomplete_details,
         # which is content_filter
         # incomplete_details = IncompleteDetails(reason='content_filter')
@@ -2077,30 +2507,45 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
     """The type of the event. Always `response.reasoning_part.added`."""
 
 
-StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
-    ResponseInProgressEvent,
-    ResponseCompletedEvent,
-    ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent,
-    ResponseContentPartAddedEvent,
-    ResponseContentPartDoneEvent,
-    ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent,
-    ResponseReasoningPartAddedEvent,
-    ResponseReasoningPartDoneEvent,
-    ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallCodeDeltaEvent,
-    ResponseWebSearchCallInProgressEvent,
-    ResponseWebSearchCallSearchingEvent,
-    ResponseWebSearchCallCompletedEvent,
-    ResponseCodeInterpreterCallCodeDoneEvent,
-    ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterCallCompletedEvent,
-]
+# vLLM Streaming Events
+# Note: we override the response type with the vLLM ResponsesResponse type
+class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+    response: ResponsesResponse  # type: ignore[override]
 
-BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
-                              ScoreRequest, RerankRequest]
+
+class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+StreamingResponsesResponse: TypeAlias = (
+    ResponseCreatedEvent
+    | ResponseInProgressEvent
+    | ResponseCompletedEvent
+    | ResponseOutputItemAddedEvent
+    | ResponseOutputItemDoneEvent
+    | ResponseContentPartAddedEvent
+    | ResponseContentPartDoneEvent
+    | ResponseReasoningTextDeltaEvent
+    | ResponseReasoningTextDoneEvent
+    | ResponseReasoningPartAddedEvent
+    | ResponseReasoningPartDoneEvent
+    | ResponseCodeInterpreterCallInProgressEvent
+    | ResponseCodeInterpreterCallCodeDeltaEvent
+    | ResponseWebSearchCallInProgressEvent
+    | ResponseWebSearchCallSearchingEvent
+    | ResponseWebSearchCallCompletedEvent
+    | ResponseCodeInterpreterCallCodeDoneEvent
+    | ResponseCodeInterpreterCallInterpretingEvent
+    | ResponseCodeInterpreterCallCompletedEvent
+)
+
+BatchRequestInputBody: TypeAlias = (
+    ChatCompletionRequest | EmbeddingRequest | ScoreRequest | RerankRequest
+)
 
 
 class BatchRequestInput(OpenAIBaseModel):
@@ -2125,7 +2570,7 @@ class BatchRequestInput(OpenAIBaseModel):
     # The parameters of the request.
     body: BatchRequestInputBody
 
-    @field_validator('body', mode='plain')
+    @field_validator("body", mode="plain")
     @classmethod
     def check_type_for_url(cls, value: Any, info: ValidationInfo):
         # Use url to disambiguate models
@@ -2149,8 +2594,13 @@ class BatchResponseData(OpenAIBaseModel):
     request_id: str
 
     # The body of the response.
-    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
-                         ScoreResponse, RerankResponse]] = None
+    body: (
+        ChatCompletionResponse
+        | EmbeddingResponse
+        | ScoreResponse
+        | RerankResponse
+        | None
+    ) = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
@@ -2164,54 +2614,59 @@ class BatchRequestOutput(OpenAIBaseModel):
     # inputs.
     custom_id: str
 
-    response: Optional[BatchResponseData]
+    response: BatchResponseData | None
 
     # For requests that failed with a non-HTTP error, this will contain more
     # information on the cause of the failure.
-    error: Optional[Any]
+    error: Any | None
 
 
 class TokenizeCompletionRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     prompt: str
 
     add_special_tokens: bool = Field(
         default=True,
         description=(
             "If true (the default), special tokens (e.g. BOS) will be added to "
-            "the prompt."),
+            "the prompt."
+        ),
     )
-    return_token_strs: Optional[bool] = Field(
+    return_token_strs: bool | None = Field(
         default=False,
-        description=("If true, also return the token strings "
-                     "corresponding to the token ids."),
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
     )
 
 
 class TokenizeChatRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     messages: list[ChatCompletionMessageParam]
 
     add_generation_prompt: bool = Field(
         default=True,
-        description=
-        ("If true, the generation prompt will be added to the chat template. "
-         "This is a parameter used by chat template in tokenizer config of the "
-         "model."),
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
     )
-    return_token_strs: Optional[bool] = Field(
+    return_token_strs: bool | None = Field(
         default=False,
-        description=("If true, also return the token strings "
-                     "corresponding to the token ids."),
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
     )
     continue_final_message: bool = Field(
         default=False,
-        description=
-        ("If this is set, the chat will be formatted so that the final "
-         "message in the chat is open-ended, without any EOS tokens. The "
-         "model will continue this message rather than starting a new one. "
-         "This allows you to \"prefill\" part of the model's response for it. "
-         "Cannot be used at the same time as `add_generation_prompt`."),
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
     )
     add_special_tokens: bool = Field(
         default=False,
@@ -2220,27 +2675,30 @@ class TokenizeChatRequest(OpenAIBaseModel):
             "on top of what is added by the chat template. "
             "For most models, the chat template takes care of adding the "
             "special tokens so this should be set to false (as is the "
-            "default)."),
+            "default)."
+        ),
     )
-    chat_template: Optional[str] = Field(
+    chat_template: str | None = Field(
         default=None,
         description=(
             "A Jinja template to use for this conversion. "
             "As of transformers v4.44, default chat template is no longer "
             "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."),
+            "does not define one."
+        ),
     )
-    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+    chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=(
             "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."),
+            "Will be accessible by the chat template."
+        ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
-    tools: Optional[list[ChatCompletionToolsParam]] = Field(
+    tools: list[ChatCompletionToolsParam] | None = Field(
         default=None,
         description=("A list of tools the model may call."),
     )
@@ -2248,25 +2706,26 @@ class TokenizeChatRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     @classmethod
     def check_generation_prompt(cls, data):
-        if data.get("continue_final_message") and data.get(
-                "add_generation_prompt"):
-            raise ValueError("Cannot set both `continue_final_message` and "
-                             "`add_generation_prompt` to True.")
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
         return data
 
 
-TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
 
 
 class TokenizeResponse(OpenAIBaseModel):
     count: int
     max_model_len: int
     tokens: list[int]
-    token_strs: Optional[list[str]] = None
+    token_strs: list[str] | None = None
 
 
 class DetokenizeRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     tokens: list[int]
 
 
@@ -2291,12 +2750,11 @@ class LoadLoRAAdapterRequest(BaseModel):
 
 class UnloadLoRAAdapterRequest(BaseModel):
     lora_name: str
-    lora_int_id: Optional[int] = Field(default=None)
+    lora_int_id: int | None = Field(default=None)
 
 
 ## Protocols for Audio
-AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
-                                         "vtt"]
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
 
 
 class TranscriptionRequest(OpenAIBaseModel):
@@ -2309,11 +2767,11 @@ class TranscriptionRequest(OpenAIBaseModel):
     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
     """
 
-    model: Optional[str] = None
+    model: str | None = None
     """ID of the model to use.
     """
 
-    language: Optional[str] = None
+    language: str | None = None
     """The language of the input audio.
 
     Supplying the input language in
@@ -2338,7 +2796,8 @@ class TranscriptionRequest(OpenAIBaseModel):
     ## TODO (varun) : Support if set to 0, certain thresholds are met !!
 
     timestamp_granularities: list[Literal["word", "segment"]] = Field(
-        alias="timestamp_granularities[]", default=[])
+        alias="timestamp_granularities[]", default=[]
+    )
     """The timestamp granularities to populate for this transcription.
 
     `response_format` must be set `verbose_json` to use timestamp granularities.
@@ -2347,23 +2806,25 @@ class TranscriptionRequest(OpenAIBaseModel):
     timestamps incurs additional latency.
     """
 
-    stream: Optional[bool] = False
+    stream: bool | None = False
     """When set, it will enable output to be streamed in a similar fashion
     as the Chat Completion endpoint.
     """
     # --8<-- [start:transcription-extra-params]
     # Flattened stream option to simplify form data.
-    stream_include_usage: Optional[bool] = False
-    stream_continuous_usage_stats: Optional[bool] = False
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
 
-    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+    vllm_xargs: dict[str, str | int | float] | None = Field(
         default=None,
-        description=("Additional request parameters with string or "
-                     "numeric values, used by custom extensions."),
+        description=(
+            "Additional request parameters with string or "
+            "numeric values, used by custom extensions."
+        ),
     )
     # --8<-- [end:transcription-extra-params]
 
-    to_language: Optional[str] = None
+    to_language: str | None = None
     """The language of the output audio we transcribe to.
 
     Please note that this is not currently used by supported models at this
@@ -2380,29 +2841,29 @@ class TranscriptionRequest(OpenAIBaseModel):
     to automatically increase the temperature until certain thresholds are hit.
     """
 
-    top_p: Optional[float] = None
+    top_p: float | None = None
     """Enables nucleus (top-p) sampling, where tokens are selected from the
     smallest possible set whose cumulative probability exceeds `p`.
     """
 
-    top_k: Optional[int] = None
+    top_k: int | None = None
     """Limits sampling to the `k` most probable tokens at each step."""
 
-    min_p: Optional[float] = None
+    min_p: float | None = None
     """Filters out tokens with a probability lower than `min_p`, ensuring a
     minimum likelihood threshold during sampling.
     """
 
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
     """The seed to use for sampling."""
 
-    frequency_penalty: Optional[float] = 0.0
+    frequency_penalty: float | None = 0.0
     """The frequency penalty to use for sampling."""
 
-    repetition_penalty: Optional[float] = None
+    repetition_penalty: float | None = None
     """The repetition penalty to use for sampling."""
 
-    presence_penalty: Optional[float] = 0.0
+    presence_penalty: float | None = 0.0
     """The presence penalty to use for sampling."""
     # --8<-- [end:transcription-sampling-params]
 
@@ -2416,10 +2877,8 @@ class TranscriptionRequest(OpenAIBaseModel):
     }
 
     def to_sampling_params(
-            self,
-            default_max_tokens: int,
-            default_sampling_params: Optional[dict] = None) -> SamplingParams:
-
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
         max_tokens = default_max_tokens
 
         if default_sampling_params is None:
@@ -2428,35 +2887,42 @@ class TranscriptionRequest(OpenAIBaseModel):
         # Default parameters
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
         if (top_p := self.top_p) is None:
             top_p = default_sampling_params.get(
-                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
         if (top_k := self.top_k) is None:
             top_k = default_sampling_params.get(
-                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
         if (min_p := self.min_p) is None:
             min_p = default_sampling_params.get(
-                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
 
         if (repetition_penalty := self.repetition_penalty) is None:
             repetition_penalty = default_sampling_params.get(
                 "repetition_penalty",
-                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])
-
-        return SamplingParams.from_optional(temperature=temperature,
-                                            max_tokens=max_tokens,
-                                            seed=self.seed,
-                                            top_p=top_p,
-                                            top_k=top_k,
-                                            min_p=min_p,
-                                            frequency_penalty=self.frequency_penalty,
-                                            repetition_penalty=repetition_penalty,
-                                            presence_penalty=self.presence_penalty,
-                                            output_kind=RequestOutputKind.DELTA
-                                            if self.stream \
-                                            else RequestOutputKind.FINAL_ONLY,
-                                            extra_args=self.vllm_xargs)
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            extra_args=self.vllm_xargs,
+        )
 
     @model_validator(mode="before")
     @classmethod
@@ -2470,8 +2936,7 @@ class TranscriptionRequest(OpenAIBaseModel):
         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
         stream = data.get("stream", False)
         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            raise ValueError(
-                "Stream options can only be defined when `stream=True`.")
+            raise ValueError("Stream options can only be defined when `stream=True`.")
 
         return data
 
@@ -2551,17 +3016,17 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
     text: str
     """The transcribed text."""
 
-    segments: Optional[list[TranscriptionSegment]] = None
+    segments: list[TranscriptionSegment] | None = None
     """Segments of the transcribed text and their corresponding details."""
 
-    words: Optional[list[TranscriptionWord]] = None
+    words: list[TranscriptionWord] | None = None
     """Extracted words and their corresponding timestamps."""
 
 
 class TranslationResponseStreamChoice(OpenAIBaseModel):
     delta: DeltaMessage
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
 
 
 class TranslationStreamResponse(OpenAIBaseModel):
@@ -2570,7 +3035,7 @@ class TranslationStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[TranslationResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
 
 
 class TranslationRequest(OpenAIBaseModel):
@@ -2583,7 +3048,7 @@ class TranslationRequest(OpenAIBaseModel):
     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
     """
 
-    model: Optional[str] = None
+    model: str | None = None
     """ID of the model to use.
     """
 
@@ -2603,7 +3068,7 @@ class TranslationRequest(OpenAIBaseModel):
 
     # TODO support additional sampling parameters
     # --8<-- [start:translation-sampling-params]
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
     """The seed to use for sampling."""
 
     temperature: float = Field(default=0.0)
@@ -2617,7 +3082,7 @@ class TranslationRequest(OpenAIBaseModel):
     # --8<-- [end:translation-sampling-params]
 
     # --8<-- [start:translation-extra-params]
-    language: Optional[str] = None
+    language: str | None = None
     """The language of the input audio we translate from.
 
     Supplying the input language in
@@ -2625,7 +3090,7 @@ class TranslationRequest(OpenAIBaseModel):
     will improve accuracy.
     """
 
-    to_language: Optional[str] = None
+    to_language: str | None = None
     """The language of the input audio we translate to.
 
     Please note that this is not supported by all models, refer to the specific
@@ -2633,14 +3098,14 @@ class TranslationRequest(OpenAIBaseModel):
     For instance, Whisper only supports `to_language=en`.
     """
 
-    stream: Optional[bool] = False
+    stream: bool | None = False
     """Custom field not present in the original OpenAI definition. When set,
     it will enable output to be streamed in a similar fashion as the Chat
     Completion endpoint.
     """
     # Flattened stream option to simplify form data.
-    stream_include_usage: Optional[bool] = False
-    stream_continuous_usage_stats: Optional[bool] = False
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
     # --8<-- [end:translation-extra-params]
 
     # Default sampling parameters for translation requests.
@@ -2649,10 +3114,8 @@ class TranslationRequest(OpenAIBaseModel):
     }
 
     def to_sampling_params(
-            self,
-            default_max_tokens: int,
-            default_sampling_params: Optional[dict] = None) -> SamplingParams:
-
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
         max_tokens = default_max_tokens
 
         if default_sampling_params is None:
@@ -2660,14 +3123,17 @@ class TranslationRequest(OpenAIBaseModel):
         # Default parameters
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
 
-        return SamplingParams.from_optional(temperature=temperature,
-                                            max_tokens=max_tokens,
-                                            seed=self.seed,
-                                            output_kind=RequestOutputKind.DELTA
-                                            if self.stream \
-                                            else RequestOutputKind.FINAL_ONLY)
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+        )
 
     @model_validator(mode="before")
     @classmethod
@@ -2675,8 +3141,7 @@ class TranslationRequest(OpenAIBaseModel):
         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
         stream = data.get("stream", False)
         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            raise ValueError(
-                "Stream options can only be defined when `stream=True`.")
+            raise ValueError("Stream options can only be defined when `stream=True`.")
 
         return data
 
@@ -2750,8 +3215,85 @@ class TranslationResponseVerbose(OpenAIBaseModel):
     text: str
     """The translated text."""
 
-    segments: Optional[list[TranslationSegment]] = None
+    segments: list[TranslationSegment] | None = None
     """Segments of the translated text and their corresponding details."""
 
-    words: Optional[list[TranslationWord]] = None
+    words: list[TranslationWord] | None = None
     """Extracted words and their corresponding timestamps."""
+
+
+####### Tokens IN <> Tokens OUT #######
+class GenerateRequest(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    token_ids: list[int]
+    """The token ids to generate text from."""
+
+    # features: MultiModalFeatureSpec
+    # TODO (NickLucche): implement once Renderer work is completed
+    features: str | None = None
+    """The processed MM inputs for the model."""
+
+    sampling_params: SamplingParams
+    """The sampling parameters for the model."""
+
+    model: str | None = None
+
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker to guess prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bit)."
+        ),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
+
+class GenerateResponseChoice(BaseModel):
+    index: int
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    token_ids: list[int] | None = None
+
+
+class GenerateResponse(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    choices: list[GenerateResponseChoice]
+
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 2568c21c4abe958b7be60f40b996107c4d12ac13..4b9dba085a8e90458de111740367039e69a76b48 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -4,35 +4,36 @@
 import asyncio
 import tempfile
 from argparse import Namespace
-from collections.abc import Awaitable
+from collections.abc import Awaitable, Callable
 from http import HTTPStatus
 from io import StringIO
-from typing import Callable, Optional
 
 import aiohttp
 import torch
 from prometheus_client import start_http_server
 from tqdm import tqdm
 
-from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (BatchRequestInput,
-                                              BatchRequestOutput,
-                                              BatchResponseData,
-                                              ChatCompletionResponse,
-                                              EmbeddingResponse, ErrorResponse,
-                                              RerankResponse, ScoreResponse)
-# yapf: enable
+from vllm.entrypoints.openai.protocol import (
+    BatchRequestInput,
+    BatchRequestOutput,
+    BatchResponseData,
+    ChatCompletionResponse,
+    EmbeddingResponse,
+    ErrorResponse,
+    RerankResponse,
+    ScoreResponse,
+)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_models import (BaseModelPath,
-                                                    OpenAIServingModels)
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.entrypoints.openai.serving_score import ServingScores
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser, random_uuid
+from vllm.reasoning import ReasoningParserManager
+from vllm.utils import random_uuid
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -44,10 +45,10 @@ def make_arg_parser(parser: FlexibleArgumentParser):
         "--input-file",
         required=True,
         type=str,
-        help=
-        "The path or url to a single input file. Currently supports local file "
+        help="The path or url to a single input file. Currently supports local file "
         "paths, or the http protocol (http or https). If a URL is specified, "
-        "the file should be available via HTTP GET.")
+        "the file should be available via HTTP GET.",
+    )
     parser.add_argument(
         "-o",
         "--output-file",
@@ -55,7 +56,8 @@ def make_arg_parser(parser: FlexibleArgumentParser):
         type=str,
         help="The path or url to a single output file. Currently supports "
         "local file paths, or web (http or https) urls. If a URL is specified,"
-        " the file should be available via HTTP PUT.")
+        " the file should be available via HTTP PUT.",
+    )
     parser.add_argument(
         "--output-tmp-dir",
         type=str,
@@ -63,24 +65,27 @@ def make_arg_parser(parser: FlexibleArgumentParser):
         help="The directory to store the output file before uploading it "
         "to the output URL.",
     )
-    parser.add_argument("--response-role",
-                        type=optional_type(str),
-                        default="assistant",
-                        help="The role name to return if "
-                        "`request.add_generation_prompt=True`.")
+    parser.add_argument(
+        "--response-role",
+        type=optional_type(str),
+        default="assistant",
+        help="The role name to return if `request.add_generation_prompt=True`.",
+    )
 
     parser = AsyncEngineArgs.add_cli_args(parser)
 
-    parser.add_argument('--max-log-len',
-                        type=int,
-                        default=None,
-                        help='Max number of prompt characters or prompt '
-                        'ID numbers being printed in log.'
-                        '\n\nDefault: Unlimited')
+    parser.add_argument(
+        "--max-log-len",
+        type=int,
+        default=None,
+        help="Max number of prompt characters or prompt "
+        "ID numbers being printed in log."
+        "\n\nDefault: Unlimited",
+    )
 
-    parser.add_argument("--enable-metrics",
-                        action="store_true",
-                        help="Enable Prometheus metrics")
+    parser.add_argument(
+        "--enable-metrics", action="store_true", help="Enable Prometheus metrics"
+    )
     parser.add_argument(
         "--url",
         type=str,
@@ -97,16 +102,23 @@ def make_arg_parser(parser: FlexibleArgumentParser):
     )
     parser.add_argument(
         "--enable-prompt-tokens-details",
-        action='store_true',
+        action="store_true",
+        default=False,
+        help="If set to True, enable prompt_tokens_details in usage.",
+    )
+    parser.add_argument(
+        "--enable-force-include-usage",
+        action="store_true",
         default=False,
-        help="If set to True, enable prompt_tokens_details in usage.")
+        help="If set to True, include usage on every request "
+        "(even when stream_options is not specified)",
+    )
 
     return parser
 
 
 def parse_args():
-    parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible batch runner.")
+    parser = FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner.")
     return make_arg_parser(parser).parse_args()
 
 
@@ -118,10 +130,9 @@ _BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elap
 
 
 class BatchProgressTracker:
-
     def __init__(self):
         self._total = 0
-        self._pbar: Optional[tqdm] = None
+        self._pbar: tqdm | None = None
 
     def submitted(self):
         self._total += 1
@@ -131,29 +142,32 @@ class BatchProgressTracker:
             self._pbar.update()
 
     def pbar(self) -> tqdm:
-        enable_tqdm = not torch.distributed.is_initialized(
-        ) or torch.distributed.get_rank() == 0
-        self._pbar = tqdm(total=self._total,
-                          unit="req",
-                          desc="Running batch",
-                          mininterval=5,
-                          disable=not enable_tqdm,
-                          bar_format=_BAR_FORMAT)
+        enable_tqdm = (
+            not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+        )
+        self._pbar = tqdm(
+            total=self._total,
+            unit="req",
+            desc="Running batch",
+            mininterval=5,
+            disable=not enable_tqdm,
+            bar_format=_BAR_FORMAT,
+        )
         return self._pbar
 
 
 async def read_file(path_or_url: str) -> str:
     if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
-        async with aiohttp.ClientSession() as session, \
-                   session.get(path_or_url) as resp:
+        async with aiohttp.ClientSession() as session, session.get(path_or_url) as resp:
             return await resp.text()
     else:
         with open(path_or_url, encoding="utf-8") as f:
             return f.read()
 
 
-async def write_local_file(output_path: str,
-                           batch_outputs: list[BatchRequestOutput]) -> None:
+async def write_local_file(
+    output_path: str, batch_outputs: list[BatchRequestOutput]
+) -> None:
     """
     Write the responses to a local file.
     output_path: The path to write the responses to.
@@ -166,8 +180,7 @@ async def write_local_file(output_path: str,
             print(o.model_dump_json(), file=f)
 
 
-async def upload_data(output_url: str, data_or_file: str,
-                      from_file: bool) -> None:
+async def upload_data(output_url: str, data_or_file: str, from_file: bool) -> None:
     """
     Upload a local file to a URL.
     output_url: The URL to upload the file to.
@@ -184,23 +197,26 @@ async def upload_data(output_url: str, data_or_file: str,
         try:
             # We increase the timeout to 1000 seconds to allow
             # for large files (default is 300).
-            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
-                    total=1000)) as session:
+            async with aiohttp.ClientSession(
+                timeout=aiohttp.ClientTimeout(total=1000)
+            ) as session:
                 if from_file:
                     with open(data_or_file, "rb") as file:
-                        async with session.put(output_url,
-                                               data=file) as response:
+                        async with session.put(output_url, data=file) as response:
                             if response.status != 200:
-                                raise Exception(f"Failed to upload file.\n"
-                                                f"Status: {response.status}\n"
-                                                f"Response: {response.text()}")
+                                raise Exception(
+                                    f"Failed to upload file.\n"
+                                    f"Status: {response.status}\n"
+                                    f"Response: {response.text()}"
+                                )
                 else:
-                    async with session.put(output_url,
-                                           data=data_or_file) as response:
+                    async with session.put(output_url, data=data_or_file) as response:
                         if response.status != 200:
-                            raise Exception(f"Failed to upload data.\n"
-                                            f"Status: {response.status}\n"
-                                            f"Response: {response.text()}")
+                            raise Exception(
+                                f"Failed to upload data.\n"
+                                f"Status: {response.status}\n"
+                                f"Response: {response.text()}"
+                            )
 
         except Exception as e:
             if attempt < max_retries:
@@ -217,8 +233,9 @@ async def upload_data(output_url: str, data_or_file: str,
                 ) from e
 
 
-async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput],
-                     output_tmp_dir: str) -> None:
+async def write_file(
+    path_or_url: str, batch_outputs: list[BatchRequestOutput], output_tmp_dir: str
+) -> None:
     """
     Write batch_outputs to a file or upload to a URL.
     path_or_url: The path or URL to write batch_outputs to.
@@ -242,14 +259,13 @@ async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput],
         else:
             # Write responses to a temporary file and then upload it to the URL.
             with tempfile.NamedTemporaryFile(
-                    mode="w",
-                    encoding="utf-8",
-                    dir=output_tmp_dir,
-                    prefix="tmp_batch_output_",
-                    suffix=".jsonl",
+                mode="w",
+                encoding="utf-8",
+                dir=output_tmp_dir,
+                prefix="tmp_batch_output_",
+                suffix=".jsonl",
             ) as f:
-                logger.info("Writing outputs to temporary local file %s",
-                            f.name)
+                logger.info("Writing outputs to temporary local file %s", f.name)
                 await write_local_file(f.name, batch_outputs)
                 logger.info("Uploading outputs to %s", path_or_url)
                 await upload_data(path_or_url, f.name, from_file=True)
@@ -258,8 +274,9 @@ async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput],
         await write_local_file(path_or_url, batch_outputs)
 
 
-def make_error_request_output(request: BatchRequestInput,
-                              error_msg: str) -> BatchRequestOutput:
+def make_error_request_output(
+    request: BatchRequestInput, error_msg: str
+) -> BatchRequestOutput:
     batch_output = BatchRequestOutput(
         id=f"vllm-{random_uuid()}",
         custom_id=request.custom_id,
@@ -273,25 +290,28 @@ def make_error_request_output(request: BatchRequestInput,
 
 
 async def make_async_error_request_output(
-        request: BatchRequestInput, error_msg: str) -> BatchRequestOutput:
+    request: BatchRequestInput, error_msg: str
+) -> BatchRequestOutput:
     return make_error_request_output(request, error_msg)
 
 
-async def run_request(serving_engine_func: Callable,
-                      request: BatchRequestInput,
-                      tracker: BatchProgressTracker) -> BatchRequestOutput:
+async def run_request(
+    serving_engine_func: Callable,
+    request: BatchRequestInput,
+    tracker: BatchProgressTracker,
+) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
     if isinstance(
-            response,
-        (ChatCompletionResponse, EmbeddingResponse, ScoreResponse,
-         RerankResponse),
+        response,
+        (ChatCompletionResponse, EmbeddingResponse, ScoreResponse, RerankResponse),
     ):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
             response=BatchResponseData(
-                body=response, request_id=f"vllm-batch-{random_uuid()}"),
+                body=response, request_id=f"vllm-batch-{random_uuid()}"
+            ),
             error=None,
         )
     elif isinstance(response, ErrorResponse):
@@ -300,20 +320,32 @@ async def run_request(serving_engine_func: Callable,
             custom_id=request.custom_id,
             response=BatchResponseData(
                 status_code=response.error.code,
-                request_id=f"vllm-batch-{random_uuid()}"),
+                request_id=f"vllm-batch-{random_uuid()}",
+            ),
             error=response,
         )
     else:
         batch_output = make_error_request_output(
-            request, error_msg="Request must not be sent in stream mode")
+            request, error_msg="Request must not be sent in stream mode"
+        )
 
     tracker.completed()
     return batch_output
 
 
+def validate_run_batch_args(args):
+    valid_reasoning_parsers = ReasoningParserManager.list_registered()
+    if (
+        reasoning_parser := args.structured_outputs_config.reasoning_parser
+    ) and reasoning_parser not in valid_reasoning_parsers:
+        raise KeyError(
+            f"invalid reasoning parser: {reasoning_parser} "
+            f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
+        )
+
+
 async def run_batch(
     engine_client: EngineClient,
-    vllm_config: VllmConfig,
     args: Namespace,
 ) -> None:
     if args.served_model_name is not None:
@@ -327,50 +359,62 @@ async def run_batch(
         request_logger = None
 
     base_model_paths = [
-        BaseModelPath(name=name, model_path=args.model)
-        for name in served_model_names
+        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
     ]
 
-    model_config = vllm_config.model_config
-
+    model_config = engine_client.model_config
     supported_tasks = await engine_client.get_supported_tasks()
-    logger.info("Supported_tasks: %s", supported_tasks)
+    logger.info("Supported tasks: %s", supported_tasks)
 
     # Create the openai serving objects.
     openai_serving_models = OpenAIServingModels(
         engine_client=engine_client,
-        model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=None,
     )
-    openai_serving_chat = OpenAIServingChat(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        args.response_role,
-        request_logger=request_logger,
-        chat_template=None,
-        chat_template_content_format="auto",
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-    ) if "generate" in supported_tasks else None
-    openai_serving_embedding = OpenAIServingEmbedding(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        request_logger=request_logger,
-        chat_template=None,
-        chat_template_content_format="auto",
-    ) if "embed" in supported_tasks else None
-
-    enable_serving_reranking = ("classify" in supported_tasks and getattr(
-        model_config.hf_config, "num_labels", 0) == 1)
-
-    openai_serving_scores = ServingScores(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        request_logger=request_logger,
-    ) if ("embed" in supported_tasks or enable_serving_reranking) else None
+
+    openai_serving_chat = (
+        OpenAIServingChat(
+            engine_client,
+            openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=None,
+            chat_template_content_format="auto",
+            reasoning_parser=args.structured_outputs_config.reasoning_parser,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+
+    openai_serving_embedding = (
+        OpenAIServingEmbedding(
+            engine_client,
+            openai_serving_models,
+            request_logger=request_logger,
+            chat_template=None,
+            chat_template_content_format="auto",
+        )
+        if "embed" in supported_tasks
+        else None
+    )
+
+    enable_serving_reranking = (
+        "classify" in supported_tasks
+        and getattr(model_config.hf_config, "num_labels", 0) == 1
+    )
+
+    openai_serving_scores = (
+        ServingScores(
+            engine_client,
+            openai_serving_models,
+            request_logger=request_logger,
+        )
+        if ("embed" in supported_tasks or enable_serving_reranking)
+        else None
+    )
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
@@ -387,61 +431,72 @@ async def run_batch(
 
         # Determine the type of request and run it.
         if request.url == "/v1/chat/completions":
-            chat_handler_fn = openai_serving_chat.create_chat_completion if \
-                openai_serving_chat is not None else None
+            chat_handler_fn = (
+                openai_serving_chat.create_chat_completion
+                if openai_serving_chat is not None
+                else None
+            )
             if chat_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
                         request,
-                        error_msg=
-                        "The model does not support Chat Completions API",
-                    ))
+                        error_msg="The model does not support Chat Completions API",
+                    )
+                )
                 continue
 
-            response_futures.append(
-                run_request(chat_handler_fn, request, tracker))
+            response_futures.append(run_request(chat_handler_fn, request, tracker))
             tracker.submitted()
         elif request.url == "/v1/embeddings":
-            embed_handler_fn = openai_serving_embedding.create_embedding if \
-                openai_serving_embedding is not None else None
+            embed_handler_fn = (
+                openai_serving_embedding.create_embedding
+                if openai_serving_embedding is not None
+                else None
+            )
             if embed_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
                         request,
                         error_msg="The model does not support Embeddings API",
-                    ))
+                    )
+                )
                 continue
 
-            response_futures.append(
-                run_request(embed_handler_fn, request, tracker))
+            response_futures.append(run_request(embed_handler_fn, request, tracker))
             tracker.submitted()
         elif request.url.endswith("/score"):
-            score_handler_fn = openai_serving_scores.create_score if \
-                openai_serving_scores is not None else None
+            score_handler_fn = (
+                openai_serving_scores.create_score
+                if openai_serving_scores is not None
+                else None
+            )
             if score_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
                         request,
                         error_msg="The model does not support Scores API",
-                    ))
+                    )
+                )
                 continue
 
-            response_futures.append(
-                run_request(score_handler_fn, request, tracker))
+            response_futures.append(run_request(score_handler_fn, request, tracker))
             tracker.submitted()
         elif request.url.endswith("/rerank"):
-            rerank_handler_fn = openai_serving_scores.do_rerank if \
-                openai_serving_scores is not None else None
+            rerank_handler_fn = (
+                openai_serving_scores.do_rerank
+                if openai_serving_scores is not None
+                else None
+            )
             if rerank_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
                         request,
                         error_msg="The model does not support Rerank API",
-                    ))
+                    )
+                )
                 continue
 
-            response_futures.append(
-                run_request(rerank_handler_fn, request, tracker))
+            response_futures.append(run_request(rerank_handler_fn, request, tracker))
             tracker.submitted()
         else:
             response_futures.append(
@@ -452,7 +507,8 @@ async def run_batch(
                     " /score, /rerank ."
                     "See vllm/entrypoints/openai/api_server.py for supported "
                     "score/rerank versions.",
-                ))
+                )
+            )
 
     with tracker.pbar():
         responses = await asyncio.gather(*response_futures)
@@ -464,14 +520,14 @@ async def main(args: Namespace):
     from vllm.entrypoints.openai.api_server import build_async_engine_client
     from vllm.usage.usage_lib import UsageContext
 
+    validate_run_batch_args(args)
+
     async with build_async_engine_client(
-            args,
-            usage_context=UsageContext.OPENAI_BATCH_RUNNER,
-            disable_frontend_multiprocessing=False,
+        args,
+        usage_context=UsageContext.OPENAI_BATCH_RUNNER,
+        disable_frontend_multiprocessing=False,
     ) as engine_client:
-        vllm_config = await engine_client.get_vllm_config()
-
-        await run_batch(engine_client, vllm_config, args)
+        await run_batch(engine_client, args)
 
 
 if __name__ == "__main__":
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index ab4bf75102f434830f2076ef4c2ce5f9cad2423b..59e1c8d53179378ff602cd3bc6b1f01b19efee48 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -6,86 +6,101 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Callable, Final, Optional, Union
+from typing import Final
 
 import jinja2
 import partial_json_parser
 import regex as re
 from fastapi import Request
 from openai_harmony import Message as OpenAIMessage
-from pydantic import TypeAdapter
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
-                                         ConversationMessage,
-                                         get_history_tool_calls_cnt,
-                                         make_tool_call_id)
+from vllm.entrypoints.chat_utils import (
+    ChatTemplateContentFormatOption,
+    ConversationMessage,
+    get_history_tool_calls_cnt,
+    make_tool_call_id,
+)
 from vllm.entrypoints.harmony_utils import (
-    get_developer_message, get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant, get_system_message, parse_chat_input,
-    parse_chat_output, render_for_completion)
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_output,
+    parse_input_to_harmony_message,
+    render_for_completion,
+)
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
-    ChatCompletionLogProb, ChatCompletionLogProbs,
-    ChatCompletionLogProbsContent, ChatCompletionNamedToolChoiceParam,
-    ChatCompletionRequest, ChatCompletionResponse,
-    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
-    ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
-    DeltaToolCall, ErrorResponse, FunctionCall, FunctionDefinition,
-    PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
-                                                    clamp_prompt_logprobs)
+    ChatCompletionLogProb,
+    ChatCompletionLogProbs,
+    ChatCompletionLogProbsContent,
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatMessage,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ErrorResponse,
+    PromptTokenUsageInfo,
+    RequestResponseMetadata,
+    ToolCall,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
-    MistralToolCall)
-from vllm.entrypoints.utils import get_max_tokens
+from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
+from vllm.entrypoints.utils import get_max_tokens, should_include_usage
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
-                                                truncate_tool_call_ids,
-                                                validate_request_params)
-from vllm.utils import as_list
+from vllm.transformers_utils.tokenizers import (
+    maybe_serialize_tool_calls,
+    truncate_tool_call_ids,
+    validate_request_params,
+)
+from vllm.utils.collection_utils import as_list
+from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
 logger = init_logger(__name__)
 
 
 class OpenAIServingChat(OpenAIServing):
-
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         response_role: str,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         trust_request_chat_template: bool = False,
         return_tokens_as_token_ids: bool = False,
         reasoning_parser: str = "",
         enable_auto_tools: bool = False,
         exclude_tools_when_tool_choice_none: bool = False,
-        tool_parser: Optional[str] = None,
+        tool_parser: str | None = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
         log_error_stack: bool = False,
     ) -> None:
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         enable_force_include_usage=enable_force_include_usage,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            log_error_stack=log_error_stack,
+        )
 
         self.response_role = response_role
         self.chat_template = chat_template
@@ -93,62 +108,43 @@ class OpenAIServingChat(OpenAIServing):
         self.trust_request_chat_template = trust_request_chat_template
         self.enable_log_outputs = enable_log_outputs
 
+        # set up logits processors
+        self.logits_processors = self.model_config.logits_processors
+
+        # set up reasoning parser
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
+        )
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                "\"auto\" tool choice has been enabled please note that while"
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored.")
-
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer],
-                                                 ReasoningParser]] = None
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = (
-                    ReasoningParserManager.get_reasoning_parser(
-                        reasoning_parser))
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(
-                    f"{reasoning_parser=} has not been registered") from e
-        self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
-        if self.enable_auto_tools:
-            try:
-                if (tool_parser == "pythonic" and
-                        model_config.model.startswith("meta-llama/Llama-3.2")):
-                    logger.warning(
-                        "Llama3.2 models may struggle to emit valid pythonic"
-                        " tool calls")
-                self.tool_parser = ToolParserManager.get_tool_parser(
-                    tool_parser)
-            except Exception as e:
-                raise TypeError("Error: --enable-auto-tool-choice requires "
-                                f"tool_parser:'{tool_parser}' which has not "
-                                "been registered") from e
-        self.exclude_tools_when_tool_choice_none = (
-            exclude_tools_when_tool_choice_none)
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
+        )
+        self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
 
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
-        self.default_sampling_params = (
-            self.model_config.get_diff_sampling_param())
+        self.default_sampling_params = self.model_config.get_diff_sampling_param()
         if self.default_sampling_params:
             source = self.model_config.generation_config
             source = "model" if source == "auto" else source
-            logger.info("Using default chat sampling params from %s: %s",
-                        source, self.default_sampling_params)
-        if self.model_config.hf_config.model_type == 'kimi_k2':
-            self.tool_call_id_type = 'kimi_k2'
+            logger.info(
+                "Using default chat sampling params from %s: %s",
+                source,
+                self.default_sampling_params,
+            )
+        if self.model_config.hf_config.model_type == "kimi_k2":
+            self.tool_call_id_type = "kimi_k2"
         else:
-            self.tool_call_id_type = 'random'
+            self.tool_call_id_type = "random"
 
-        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        self.use_harmony = self.model_config.hf_config.model_type == "gpt_oss"
         if self.use_harmony:
             if "stop_token_ids" not in self.default_sampling_params:
                 self.default_sampling_params["stop_token_ids"] = []
             self.default_sampling_params["stop_token_ids"].extend(
-                get_stop_tokens_for_assistant_actions())
+                get_stop_tokens_for_assistant_actions()
+            )
 
         # NOTE(woosuk): While OpenAI's chat completion API supports browsing
         # for some models, currently vLLM doesn't support it. Please use the
@@ -163,9 +159,8 @@ class OpenAIServingChat(OpenAIServing):
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[AsyncGenerator[str, None], ChatCompletionResponse,
-               ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | ChatCompletionResponse | ErrorResponse:
         """
         Chat Completion API similar to OpenAI's API.
 
@@ -186,7 +181,8 @@ class OpenAIServingChat(OpenAIServing):
 
         try:
             lora_request = self._maybe_get_adapters(
-                request, supports_default_mm_loras=True)
+                request, supports_default_mm_loras=True
+            )
 
             model_name = self.models.model_name(lora_request)
 
@@ -202,36 +198,36 @@ class OpenAIServingChat(OpenAIServing):
                 truncate_tool_call_ids(request)
                 validate_request_params(request)
 
-            if (request.tool_choice == "auto" and
-                    not (self.enable_auto_tools and tool_parser is not None)
-                    and not isinstance(tokenizer, MistralTokenizer)
-                    and not self.use_harmony):
+            if (
+                request.tool_choice == "auto"
+                and not (self.enable_auto_tools and tool_parser is not None)
+                and not isinstance(tokenizer, MistralTokenizer)
+                and not self.use_harmony
+            ):
                 # for hf tokenizers, "auto" tools requires
                 # --enable-auto-tool-choice and --tool-call-parser
                 return self.create_error_response(
-                    "\"auto\" tool choice requires "
+                    '"auto" tool choice requires '
                     "--enable-auto-tool-choice and --tool-call-parser to be set"
                 )
 
-            if (request.tools is None
-                    or (request.tool_choice == "none"
-                        and self.exclude_tools_when_tool_choice_none)):
+            if request.tools is None or (
+                request.tool_choice == "none"
+                and self.exclude_tools_when_tool_choice_none
+            ):
                 tool_dicts = None
             else:
                 tool_dicts = [tool.model_dump() for tool in request.tools]
 
             if not self.use_harmony:
                 # Common case.
-                request_chat_template = request.chat_template
-                chat_template_kwargs = request.chat_template_kwargs
-                if not self.trust_request_chat_template and (
-                        request_chat_template is not None or
-                    (chat_template_kwargs and
-                     chat_template_kwargs.get("chat_template") is not None)):
-                    return self.create_error_response(
-                        "Chat template is passed with request, but "
-                        "--trust-request-chat-template is not set. "
-                        "Refused request with untrusted chat template.")
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=request.chat_template,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
+                )
+                if error_check_ret is not None:
+                    return error_check_ret
                 (
                     conversation,
                     request_prompts,
@@ -240,9 +236,8 @@ class OpenAIServingChat(OpenAIServing):
                     request,
                     tokenizer,
                     request.messages,
-                    chat_template=request_chat_template or self.chat_template,
-                    chat_template_content_format=self.
-                    chat_template_content_format,
+                    chat_template=request.chat_template or self.chat_template,
+                    chat_template_content_format=self.chat_template_content_format,
                     add_generation_prompt=request.add_generation_prompt,
                     continue_final_message=request.continue_final_message,
                     tool_dicts=tool_dicts,
@@ -258,23 +253,26 @@ class OpenAIServingChat(OpenAIServing):
                     request_prompts,
                     engine_prompts,
                 ) = self._make_request_with_harmony(request)
-        except (ValueError, TypeError, RuntimeError,
-                jinja2.TemplateError) as e:
+        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
 
-        request_id = "chatcmpl-" \
-                     f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = (
+            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
+        )
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
+        # Extract data_parallel_rank from header (router can inject it)
+        data_parallel_rank = self._get_data_parallel_rank(raw_request)
+
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[RequestOutput, None]] = []
         try:
             for i, engine_prompt in enumerate(engine_prompts):
-                sampling_params: Union[SamplingParams, BeamSearchParams]
+                prompt_text, _, _ = self._get_prompt_components(request_prompts[i])
 
                 if self.default_sampling_params is None:
                     self.default_sampling_params = {}
@@ -283,39 +281,65 @@ class OpenAIServingChat(OpenAIServing):
                     max_model_len=self.max_model_len,
                     request=request,
                     input_length=len(engine_prompt["prompt_token_ids"]),
-                    default_sampling_params=self.default_sampling_params)
+                    default_sampling_params=self.default_sampling_params,
+                )
 
+                sampling_params: SamplingParams | BeamSearchParams
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        max_tokens, self.default_sampling_params)
+                        max_tokens, self.default_sampling_params
+                    )
                 else:
                     sampling_params = request.to_sampling_params(
-                        max_tokens, self.model_config.logits_processor_pattern,
-                        self.default_sampling_params)
+                        max_tokens,
+                        self.model_config.logits_processor_pattern,
+                        self.default_sampling_params,
+                    )
+                    validate_logits_processors_parameters(
+                        self.logits_processors,
+                        sampling_params,
+                    )
 
-                self._log_inputs(request_id,
-                                 request_prompts[i],
-                                 params=sampling_params,
-                                 lora_request=lora_request)
+                self._log_inputs(
+                    request_id,
+                    request_prompts[i],
+                    params=sampling_params,
+                    lora_request=lora_request,
+                )
 
-                trace_headers = (None if raw_request is None else await
-                                 self._get_trace_headers(raw_request.headers))
+                trace_headers = (
+                    None
+                    if raw_request is None
+                    else await self._get_trace_headers(raw_request.headers)
+                )
 
                 if isinstance(sampling_params, BeamSearchParams):
-                    generator = self.engine_client.beam_search(
+                    generator = self.beam_search(
                         prompt=engine_prompt,
                         request_id=request_id,
                         params=sampling_params,
                         lora_request=lora_request,
                     )
                 else:
-                    generator = self.engine_client.generate(
+                    engine_request, tokenization_kwargs = await self._process_inputs(
+                        request_id,
                         engine_prompt,
                         sampling_params,
+                        lora_request=lora_request,
+                        trace_headers=trace_headers,
+                        priority=request.priority,
+                    )
+
+                    generator = self.engine_client.generate(
+                        engine_request,
+                        sampling_params,
                         request_id,
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                         priority=request.priority,
+                        prompt_text=prompt_text,
+                        tokenization_kwargs=tokenization_kwargs,
+                        data_parallel_rank=data_parallel_rank,
                     )
 
                 generators.append(generator)
@@ -324,7 +348,7 @@ class OpenAIServingChat(OpenAIServing):
             return self.create_error_response(str(e))
 
         assert len(generators) == 1
-        result_generator, = generators
+        (result_generator,) = generators
 
         # Streaming response
         if request.stream:
@@ -336,12 +360,18 @@ class OpenAIServingChat(OpenAIServing):
                 conversation,
                 tokenizer,
                 request_metadata,
-                enable_force_include_usage=self.enable_force_include_usage)
+            )
 
         try:
             return await self.chat_completion_full_generator(
-                request, result_generator, request_id, model_name,
-                conversation, tokenizer, request_metadata)
+                request,
+                result_generator,
+                request_id,
+                model_name,
+                conversation,
+                tokenizer,
+                request_metadata,
+            )
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
@@ -352,7 +382,7 @@ class OpenAIServingChat(OpenAIServing):
         return request.messages[-1]["role"]
 
     @staticmethod
-    def _bracket_level(s: str, opening='{', closing='}') -> int:
+    def _bracket_level(s: str, opening="{", closing="}") -> int:
         """
         Calculate the current level of nested brackets in a given string.
         """
@@ -365,8 +395,7 @@ class OpenAIServingChat(OpenAIServing):
         return level
 
     @staticmethod
-    def _filter_delta_text(delta_text: str,
-                           previous_text: str) -> tuple[str, bool]:
+    def _filter_delta_text(delta_text: str, previous_text: str) -> tuple[str, bool]:
         # remove last '},' of the tool definition stemming from the
         # "name"/"parameters" outer object or closing ']' of the tool list
         # count occurrences of opening and closing curly braces and
@@ -376,10 +405,10 @@ class OpenAIServingChat(OpenAIServing):
         bracket_level = OpenAIServingChat._bracket_level(previous_text)
         updated_delta, passed_zero = "", False
         for c in delta_text:
-            if c == '{':
+            if c == "{":
                 bracket_level += 1
                 passed_zero = bracket_level == 0
-            elif c == '}':
+            elif c == "}":
                 bracket_level -= 1
                 passed_zero = bracket_level == 0
 
@@ -387,25 +416,25 @@ class OpenAIServingChat(OpenAIServing):
                 updated_delta += c
             else:
                 # if a comma is reached at level 0 we can stop
-                if c == ',':
+                if c == ",":
                     break
         return updated_delta, passed_zero
 
     def extract_tool_call_required_streaming(
         self,
         previous_text: str,
-        current_text: Optional[str],
+        current_text: str | None,
         delta_text: str,
         function_name_returned: bool,
-        tool_call_idx: Optional[int] = None
-    ) -> tuple[Optional[DeltaMessage], bool]:
+        tool_call_idx: int | None = None,
+    ) -> tuple[DeltaMessage | None, bool]:
         if current_text is None or current_text == "":
             # if the current text is empty, we cannot parse it
             return None, function_name_returned
         try:
             obj = partial_json_parser.loads(current_text)
         except partial_json_parser.core.exceptions.MalformedJSON:
-            logger.debug('not enough tokens to parse into JSON yet')
+            logger.debug("not enough tokens to parse into JSON yet")
             obj = None
 
         # check if the current text is a valid array
@@ -416,60 +445,72 @@ class OpenAIServingChat(OpenAIServing):
             delta_message = None
         else:
             _, finishes_previous_tool = OpenAIServingChat._filter_delta_text(
-                delta_text, previous_text)
+                delta_text, previous_text
+            )
             # take the last tool call from the generated list
             current_tool_call = obj[-1]
 
             # once parameters have been generated the name is complete as well
-            if not finishes_previous_tool and ("name" not in current_tool_call
-                                               or "parameters"
-                                               not in current_tool_call):
+            if not finishes_previous_tool and (
+                "name" not in current_tool_call or "parameters" not in current_tool_call
+            ):
                 function_name_returned = False
                 delta_message = None
             else:
                 if not function_name_returned:
                     # get partly generated arguments from the latest tool call
-                    param_match = re.search(r'.*"parameters":\s*(.*)',
-                                            current_text, re.DOTALL)
+                    param_match = re.search(
+                        r'.*"parameters":\s*(.*)', current_text, re.DOTALL
+                    )
                     arguments = param_match.group(1) if param_match else ""
                     arguments, _ = OpenAIServingChat._filter_delta_text(
-                        arguments, previous_text)
+                        arguments, previous_text
+                    )
 
                     # if this iteration finishes a previous tool call but a
                     # new incomplete tool is already generated, take the
                     # previous from the list
-                    if (finishes_previous_tool
-                            and "parameters" not in current_tool_call):
+                    if finishes_previous_tool and "parameters" not in current_tool_call:
                         current_tool_call = obj[-2]
 
                     function_name_returned = True
                     tool_call_id = make_tool_call_id(
                         id_type=self.tool_call_id_type,
                         func_name=current_tool_call["name"],
-                        idx=tool_call_idx)
-                    delta_message = DeltaMessage(tool_calls=[
-                        DeltaToolCall(id=tool_call_id,
-                                      function=DeltaFunctionCall(
-                                          name=current_tool_call["name"],
-                                          arguments=arguments),
-                                      index=len(obj) - 1,
-                                      type="function")
-                    ])
+                        idx=tool_call_idx,
+                    )
+                    delta_message = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                id=tool_call_id,
+                                function=DeltaFunctionCall(
+                                    name=current_tool_call["name"], arguments=arguments
+                                ),
+                                index=len(obj) - 1,
+                                type="function",
+                            )
+                        ]
+                    )
 
                 else:
                     delta_text, _ = OpenAIServingChat._filter_delta_text(
-                        delta_text, previous_text)
+                        delta_text, previous_text
+                    )
 
                     if delta_text != "":
-                        delta_message = DeltaMessage(tool_calls=[
-                            DeltaToolCall(
-                                function=DeltaFunctionCall(
-                                    # OpenAI API returns None
-                                    # instead of name every time
-                                    name=None,
-                                    arguments=delta_text),
-                                index=len(obj) - 1)
-                        ])
+                        delta_message = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    function=DeltaFunctionCall(
+                                        # OpenAI API returns None
+                                        # instead of name every time
+                                        name=None,
+                                        arguments=delta_text,
+                                    ),
+                                    index=len(obj) - 1,
+                                )
+                            ]
+                        )
                     else:
                         delta_message = None
 
@@ -484,7 +525,6 @@ class OpenAIServingChat(OpenAIServing):
         conversation: list[ConversationMessage],
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-        enable_force_include_usage: bool,
     ) -> AsyncGenerator[str, None]:
         created_time = int(time.time())
         chunk_object_type: Final = "chat.completion.chunk"
@@ -498,8 +538,7 @@ class OpenAIServingChat(OpenAIServing):
         num_cached_tokens = None
         if self.use_harmony:
             harmony_parsers = [
-                get_streamable_parser_for_assistant()
-                for _ in range(num_choices)
+                get_streamable_parser_for_assistant() for _ in range(num_choices)
             ]
             harmony_tools_streamed = [False] * num_choices
         tools_streamed = [False] * num_choices
@@ -512,11 +551,12 @@ class OpenAIServingChat(OpenAIServing):
         # Determine whether tools are in use with "auto" tool choice
         tool_choice_auto = (
             not tool_choice_function_name
-            and self._should_stream_with_auto_tool_parsing(request))
+            and self._should_stream_with_auto_tool_parsing(request)
+        )
 
-        all_previous_token_ids: Optional[list[list[int]]]
+        all_previous_token_ids: list[list[int]] | None
         function_name_returned = [False] * num_choices
-        if self.tool_call_id_type == 'kimi_k2':
+        if self.tool_call_id_type == "kimi_k2":
             history_tool_call_cnt = get_history_tool_calls_cnt(conversation)
         else:
             history_tool_call_cnt = 0
@@ -532,14 +572,15 @@ class OpenAIServingChat(OpenAIServing):
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
-        elif request.tool_choice == "required":
-            all_previous_token_ids = None
         else:
             all_previous_token_ids = None
 
         try:
             if self.reasoning_parser:
-                reasoning_parser = self.reasoning_parser(tokenizer)
+                reasoning_parser = self.reasoning_parser(
+                    tokenizer,
+                    chat_template_kwargs=request.chat_template_kwargs,  # type: ignore
+                )
         except RuntimeError as e:
             logger.exception("Error in reasoning parser creation.")
             data = self.create_streaming_error_response(str(e))
@@ -549,7 +590,7 @@ class OpenAIServingChat(OpenAIServing):
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
-                tool_parsers: list[Optional[ToolParser]] = [
+                tool_parsers: list[ToolParser | None] = [
                     self.tool_parser(tokenizer)
                 ] * num_choices
             else:
@@ -562,13 +603,9 @@ class OpenAIServingChat(OpenAIServing):
             return
 
         stream_options = request.stream_options
-        if stream_options:
-            include_usage = stream_options.include_usage \
-                            or enable_force_include_usage
-            include_continuous_usage = include_usage and \
-                                       stream_options.continuous_usage_stats
-        else:
-            include_usage, include_continuous_usage = False, False
+        include_usage, include_continuous_usage = should_include_usage(
+            stream_options, self.enable_force_include_usage
+        )
 
         try:
             async for res in result_generator:
@@ -596,7 +633,8 @@ class OpenAIServingChat(OpenAIServing):
                                 content="",
                             ),
                             logprobs=None,
-                            finish_reason=None)
+                            finish_reason=None,
+                        )
 
                         # return prompt_token_ids at the first chunk ever
                         chunk = ChatCompletionStreamResponse(
@@ -605,16 +643,20 @@ class OpenAIServingChat(OpenAIServing):
                             created=created_time,
                             choices=[choice_data],
                             model=model_name,
-                            prompt_token_ids=(res.prompt_token_ids
-                                              if request.return_token_ids else
-                                              None))
+                            prompt_token_ids=(
+                                res.prompt_token_ids
+                                if request.return_token_ids
+                                else None
+                            ),
+                        )
 
                         # if continuous usage stats are requested, add it
                         if include_continuous_usage:
                             chunk.usage = UsageInfo(
                                 prompt_tokens=num_prompt_tokens,
                                 completion_tokens=0,
-                                total_tokens=num_prompt_tokens)
+                                total_tokens=num_prompt_tokens,
+                            )
 
                         data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
@@ -622,34 +664,37 @@ class OpenAIServingChat(OpenAIServing):
                     # Send response to echo the input portion of the
                     # last message
                     if request.echo:
-                        last_msg_content: Union[str, list[dict[str, str]]] = ""
-                        if conversation and "content" in conversation[
-                                -1] and conversation[-1].get("role") == role:
+                        last_msg_content: str | list[dict[str, str]] = ""
+                        if (
+                            conversation
+                            and "content" in conversation[-1]
+                            and conversation[-1].get("role") == role
+                        ):
                             last_msg_content = conversation[-1]["content"] or ""
 
                         if last_msg_content:
                             for i in range(num_choices):
-                                choice_data = (
-                                    ChatCompletionResponseStreamChoice(
-                                        index=i,
-                                        delta=DeltaMessage(
-                                            content=last_msg_content),
-                                        logprobs=None,
-                                        finish_reason=None))
+                                choice_data = ChatCompletionResponseStreamChoice(
+                                    index=i,
+                                    delta=DeltaMessage(content=last_msg_content),
+                                    logprobs=None,
+                                    finish_reason=None,
+                                )
                                 chunk = ChatCompletionStreamResponse(
                                     id=request_id,
                                     object=chunk_object_type,
                                     created=created_time,
                                     choices=[choice_data],
-                                    model=model_name)
+                                    model=model_name,
+                                )
                                 if include_continuous_usage:
                                     chunk.usage = UsageInfo(
                                         prompt_tokens=num_prompt_tokens,
                                         completion_tokens=0,
-                                        total_tokens=num_prompt_tokens)
+                                        total_tokens=num_prompt_tokens,
+                                    )
 
-                                data = chunk.model_dump_json(
-                                    exclude_unset=True)
+                                data = chunk.model_dump_json(exclude_unset=True)
                                 yield f"data: {data}\n\n"
                     first_iteration = False
 
@@ -661,15 +706,13 @@ class OpenAIServingChat(OpenAIServing):
                         continue
 
                     if request.logprobs and request.top_logprobs is not None:
-                        assert output.logprobs is not None, (
-                            "Did not output logprobs")
+                        assert output.logprobs is not None, "Did not output logprobs"
                         logprobs = self._create_chat_logprobs(
                             token_ids=output.token_ids,
                             top_logprobs=output.logprobs,
                             tokenizer=tokenizer,
                             num_output_top_logprobs=request.top_logprobs,
-                            return_as_token_id=request.
-                            return_tokens_as_token_ids,
+                            return_as_token_id=request.return_tokens_as_token_ids,
                         )
                     else:
                         logprobs = None
@@ -677,20 +720,24 @@ class OpenAIServingChat(OpenAIServing):
                     if self.use_harmony:
                         harmony_parser = harmony_parsers[i]
                         prev_recipient = harmony_parser.current_recipient
+                        delta_text = ""
                         for token_id in output.token_ids:
                             harmony_parser.process(token_id)
+                            delta_text += harmony_parser.last_content_delta or ""
                         cur_channel = harmony_parser.current_channel
                         cur_recipient = harmony_parser.current_recipient
-                        delta_text = harmony_parser.last_content_delta or ""
                     else:
                         delta_text = output.text
 
-                    if not delta_text and not output.token_ids and \
-                        not previous_num_tokens[i]:
+                    if (
+                        not delta_text
+                        and not output.token_ids
+                        and not previous_num_tokens[i]
+                    ):
                         # Chunked prefill case, don't return empty chunks
                         continue
 
-                    delta_message: Optional[DeltaMessage]
+                    delta_message: DeltaMessage | None
 
                     # just update previous_texts and previous_token_ids
                     if tool_choice_auto or self.reasoning_parser:
@@ -702,7 +749,8 @@ class OpenAIServingChat(OpenAIServing):
                         # avoid the None + list error.
                         if previous_token_ids:
                             current_token_ids = previous_token_ids + as_list(
-                                output.token_ids)
+                                output.token_ids
+                            )
                         else:
                             current_token_ids = as_list(output.token_ids)
 
@@ -711,43 +759,50 @@ class OpenAIServingChat(OpenAIServing):
                             delta_message = DeltaMessage(content=delta_text)
                         elif cur_channel == "analysis":
                             if request.include_reasoning:
-                                delta_message = DeltaMessage(
-                                    reasoning_content=delta_text)
+                                delta_message = DeltaMessage(reasoning=delta_text)
                             else:
                                 delta_message = None
-                        elif (cur_channel == "commentary" and cur_recipient
-                              and cur_recipient.startswith("functions.")):
+                        elif (
+                            cur_channel == "commentary"
+                            and cur_recipient
+                            and cur_recipient.startswith("functions.")
+                        ):
                             # Count completed tool calls to determine index
                             base_index = 0
                             for msg in harmony_parser.messages:
-                                if (msg.channel == "commentary"
-                                        and msg.recipient
-                                        and msg.recipient.startswith(
-                                            "functions.")):
+                                if (
+                                    msg.channel == "commentary"
+                                    and msg.recipient
+                                    and msg.recipient.startswith("functions.")
+                                ):
                                     base_index += 1
 
                             if prev_recipient != cur_recipient:
-                                tool_name = cur_recipient.split(
-                                    "functions.", 1)[1]
-                                delta_message = DeltaMessage(tool_calls=[
-                                    DeltaToolCall(
-                                        id=make_tool_call_id(),
-                                        type="function",
-                                        function=DeltaFunctionCall(
-                                            name=tool_name,
-                                            arguments="",
-                                        ),
-                                        index=base_index,
-                                    )
-                                ])
+                                tool_name = cur_recipient.split("functions.", 1)[1]
+                                delta_message = DeltaMessage(
+                                    tool_calls=[
+                                        DeltaToolCall(
+                                            id=make_tool_call_id(),
+                                            type="function",
+                                            function=DeltaFunctionCall(
+                                                name=tool_name,
+                                                arguments="",
+                                            ),
+                                            index=base_index,
+                                        )
+                                    ]
+                                )
                             elif delta_text:
-                                delta_message = DeltaMessage(tool_calls=[
-                                    DeltaToolCall(
-                                        index=base_index,
-                                        function=DeltaFunctionCall(
-                                            arguments=delta_text),
-                                    )
-                                ])
+                                delta_message = DeltaMessage(
+                                    tool_calls=[
+                                        DeltaToolCall(
+                                            index=base_index,
+                                            function=DeltaFunctionCall(
+                                                arguments=delta_text
+                                            ),
+                                        )
+                                    ]
+                                )
                             else:
                                 delta_message = None
 
@@ -757,30 +812,37 @@ class OpenAIServingChat(OpenAIServing):
                             delta_message = None
                     # handle streaming deltas for tools with named tool_choice
                     elif tool_choice_function_name:
-                        if (self.reasoning_parser and not reasoning_end_arr[i]
-                                and not reasoning_parser.is_reasoning_end(
-                                    previous_token_ids)):
+                        if (
+                            self.reasoning_parser
+                            and not reasoning_end_arr[i]
+                            and not reasoning_parser.is_reasoning_end(
+                                previous_token_ids
+                            )
+                        ):
                             assert reasoning_parser is not None
                             delta_message = (
-                                reasoning_parser.
-                                extract_reasoning_content_streaming(
+                                reasoning_parser.extract_reasoning_streaming(
                                     previous_text,
                                     current_text,
                                     delta_text,
                                     previous_token_ids,
                                     current_token_ids,
                                     output.token_ids,
-                                ))
+                                )
+                            )
                             # When encountering think end id in delta_token_ids
                             # or think end id in prompt_token_ids
                             # i.e {"enable_thinking": False},
                             # set reasoning status to end.
-                            # Only keep 'content', remove 'reasoning_content'.
+                            # Only keep 'content', remove 'reasoning'.
                             if reasoning_parser.is_reasoning_end(
-                                    as_list(output.token_ids)) or (
-                                        res.prompt_token_ids
-                                        and reasoning_parser.is_reasoning_end(
-                                            res.prompt_token_ids)):
+                                as_list(output.token_ids)
+                            ) or (
+                                res.prompt_token_ids
+                                and reasoning_parser.is_reasoning_end(
+                                    res.prompt_token_ids
+                                )
+                            ):
                                 reasoning_end_arr[i] = True
                                 if delta_message and delta_message.content:
                                     # This need to be added to next `delta_text`
@@ -796,22 +858,26 @@ class OpenAIServingChat(OpenAIServing):
 
                             if function_name_returned[i]:
                                 delta_tool_call = DeltaToolCall(
-                                    function=DeltaFunctionCall(
-                                        arguments=delta_text),
-                                    index=i)
+                                    function=DeltaFunctionCall(arguments=delta_text),
+                                    index=i,
+                                )
                             else:
                                 delta_tool_call = DeltaToolCall(
                                     id=make_tool_call_id(),
                                     type="function",
                                     function=DeltaFunctionCall(
                                         name=tool_choice_function_name,
-                                        arguments=delta_text),
-                                    index=i)
+                                        arguments=delta_text,
+                                    ),
+                                    index=i,
+                                )
                                 function_name_returned[i] = True
 
-                            delta_message = DeltaMessage(tool_calls=[
-                                delta_tool_call,
-                            ])
+                            delta_message = DeltaMessage(
+                                tool_calls=[
+                                    delta_tool_call,
+                                ]
+                            )
                             tools_streamed[i] = True
 
                     elif request.tool_choice == "required":
@@ -819,26 +885,56 @@ class OpenAIServingChat(OpenAIServing):
                         previous_text = previous_texts[i]
                         current_text = previous_text + delta_text
                         fn_name_returned = function_name_returned[i]
+                        output_token_ids = as_list(output.token_ids)
+
+                        if (
+                            self.reasoning_parser is not None
+                            and not reasoning_end_arr[i]
+                            and res.prompt_token_ids
+                            and reasoning_parser.is_reasoning_end(res.prompt_token_ids)
+                        ):
+                            reasoning_end_arr[i] = True
 
-                        if self.reasoning_parser:
-                            _, content = \
-                                reasoning_parser.extract_reasoning_content(
+                        if self.reasoning_parser and not reasoning_end_arr[i]:
+                            delta_message = (
+                                reasoning_parser.extract_reasoning_streaming(
+                                    previous_text,
                                     current_text,
-                                    request
+                                    delta_text,
+                                    previous_token_ids,
+                                    current_token_ids,
+                                    output_token_ids,
                                 )
+                            )
+                            if reasoning_parser.is_reasoning_end(output_token_ids):
+                                reasoning_end_arr[i] = True
+                                if delta_message and delta_message.content:
+                                    current_text = delta_message.content
+                                    delta_message.content = None
+                                else:
+                                    # reasoning ended
+                                    current_text = ""
+
                         else:
+                            # either finished reasoning or no reasoning at all
                             content = current_text
-                        delta_message, function_name_returned[i] = (
-                            self.extract_tool_call_required_streaming(
-                                previous_text=previous_text,
-                                current_text=content,
-                                delta_text=delta_text,
-                                function_name_returned=fn_name_returned,
-                                tool_call_idx=history_tool_call_cnt))
-                        if (delta_message and delta_message.tool_calls and
-                                delta_message.tool_calls[0].id is not None):
-                            history_tool_call_cnt += 1
-                            tools_streamed[i] = True
+
+                            delta_message, function_name_returned[i] = (
+                                self.extract_tool_call_required_streaming(
+                                    previous_text=previous_text,
+                                    current_text=content,
+                                    delta_text=delta_text,
+                                    function_name_returned=fn_name_returned,
+                                    tool_call_idx=history_tool_call_cnt,
+                                )
+                            )
+                            if (
+                                delta_message
+                                and delta_message.tool_calls
+                                and delta_message.tool_calls[0].id is not None
+                            ):
+                                history_tool_call_cnt += 1
+                                tools_streamed[i] = True
 
                     # handle streaming deltas for tools with "auto" tool choice
                     # and reasoning parser
@@ -850,23 +946,26 @@ class OpenAIServingChat(OpenAIServing):
                         output_token_ids = as_list(output.token_ids)
                         if not reasoning_end_arr[i]:
                             delta_message = (
-                                reasoning_parser.
-                                extract_reasoning_content_streaming(
+                                reasoning_parser.extract_reasoning_streaming(
                                     previous_text,
                                     current_text,
                                     delta_text,
                                     previous_token_ids,
                                     current_token_ids,
                                     output_token_ids,
-                                ))
+                                )
+                            )
                             # When encountering think end id in prompt_token_ids
                             # i.e {"enable_thinking": False},
                             # set reasoning status to end.
                             # Remove the text and token ids related
-                            # to 'reasoning_content'.
-                            if res.prompt_token_ids and \
-                                reasoning_parser.is_reasoning_end(
-                                    res.prompt_token_ids):
+                            # to 'reasoning'.
+                            if (
+                                res.prompt_token_ids
+                                and reasoning_parser.is_reasoning_end(
+                                    res.prompt_token_ids
+                                )
+                            ):
                                 reasoning_end_arr[i] = True
                                 current_token_ids = output_token_ids
                                 if delta_message and delta_message.content:
@@ -877,13 +976,14 @@ class OpenAIServingChat(OpenAIServing):
                             # When encountering think end id in delta_token_ids,
                             # set reasoning status to end.
                             # Remove the text and token ids related
-                            # to 'reasoning_content'.
-                            if reasoning_parser.is_reasoning_end(
-                                    output_token_ids):
+                            # to 'reasoning'.
+                            if reasoning_parser.is_reasoning_end(output_token_ids):
                                 reasoning_end_arr[i] = True
-                                current_token_ids =  \
+                                current_token_ids = (
                                     reasoning_parser.extract_content_ids(
-                                        output_token_ids)
+                                        output_token_ids
+                                    )
+                                )
                                 if delta_message and delta_message.content:
                                     current_text = delta_message.content
                                     delta_message.content = None
@@ -903,50 +1003,50 @@ class OpenAIServingChat(OpenAIServing):
                                 delta_text = current_text
                                 delta_token_ids = current_token_ids
 
-                            delta_message = (
-                                tool_parser.extract_tool_calls_streaming(
-                                    previous_text=previous_text,
-                                    current_text=current_text,
-                                    delta_text=delta_text,
-                                    previous_token_ids=previous_token_ids,
-                                    current_token_ids=current_token_ids,
-                                    delta_token_ids=delta_token_ids,
-                                    request=request))
-                            if delta_message and delta_message.tool_calls:
-                                tools_streamed[i] = True
-                    # when only tool calls
-                    elif tool_choice_auto:
-                        assert tool_parser is not None
-                        delta_message = (
-                            tool_parser.extract_tool_calls_streaming(
+                            delta_message = tool_parser.extract_tool_calls_streaming(
                                 previous_text=previous_text,
                                 current_text=current_text,
                                 delta_text=delta_text,
                                 previous_token_ids=previous_token_ids,
                                 current_token_ids=current_token_ids,
-                                delta_token_ids=output.token_ids,
-                                request=request))
+                                delta_token_ids=delta_token_ids,
+                                request=request,
+                            )
+                            if delta_message and delta_message.tool_calls:
+                                tools_streamed[i] = True
+                    # when only tool calls
+                    elif tool_choice_auto:
+                        assert tool_parser is not None
+                        delta_message = tool_parser.extract_tool_calls_streaming(
+                            previous_text=previous_text,
+                            current_text=current_text,
+                            delta_text=delta_text,
+                            previous_token_ids=previous_token_ids,
+                            current_token_ids=current_token_ids,
+                            delta_token_ids=output.token_ids,
+                            request=request,
+                        )
                         if delta_message and delta_message.tool_calls:
                             tools_streamed[i] = True
 
                     # when only reasoning
                     elif self.reasoning_parser:
-                        delta_message = (reasoning_parser.
-                                         extract_reasoning_content_streaming(
-                                             previous_text,
-                                             current_text,
-                                             delta_text,
-                                             previous_token_ids,
-                                             current_token_ids,
-                                             output.token_ids,
-                                         ))
+                        delta_message = reasoning_parser.extract_reasoning_streaming(
+                            previous_text,
+                            current_text,
+                            delta_text,
+                            previous_token_ids,
+                            current_token_ids,
+                            output.token_ids,
+                        )
                     # handle streaming just a content delta
                     else:
                         delta_message = DeltaMessage(content=delta_text)
 
                     # update the previous values for the next iteration
-                    if ((tool_choice_auto or self.reasoning_parser)
-                            and not self.use_harmony):
+                    if (
+                        tool_choice_auto or self.reasoning_parser
+                    ) and not self.use_harmony:
                         assert previous_texts is not None
                         assert all_previous_token_ids is not None
                         previous_texts[i] = current_text
@@ -978,7 +1078,8 @@ class OpenAIServingChat(OpenAIServing):
                             delta_content = "".join(
                                 tc.function.arguments
                                 for tc in delta_message.tool_calls
-                                if tc.function and tc.function.arguments)
+                                if tc.function and tc.function.arguments
+                            )
 
                         if delta_content:
                             self.request_logger.log_outputs(
@@ -997,8 +1098,12 @@ class OpenAIServingChat(OpenAIServing):
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=None,
-                            token_ids=(as_list(output.token_ids)
-                                       if request.return_token_ids else None))
+                            token_ids=(
+                                as_list(output.token_ids)
+                                if request.return_token_ids
+                                else None
+                            ),
+                        )
 
                     # if the model is finished generating
                     else:
@@ -1008,66 +1113,90 @@ class OpenAIServingChat(OpenAIServing):
                         # only happens if we are NOT using structured outputs
                         auto_tools_called = False
                         if tool_parser:
-                            auto_tools_called = len(
-                                tool_parser.prev_tool_call_arr) > 0
-                            index = len(tool_parser.prev_tool_call_arr
-                                        ) - 1 if auto_tools_called else 0
+                            auto_tools_called = len(tool_parser.prev_tool_call_arr) > 0
+                            index = (
+                                len(tool_parser.prev_tool_call_arr) - 1
+                                if auto_tools_called
+                                else 0
+                            )
                         else:
                             index = 0
 
-                        if self._should_check_for_unstreamed_tool_arg_tokens(
-                                delta_message, output) and tool_parser:
+                        if (
+                            self._should_check_for_unstreamed_tool_arg_tokens(
+                                delta_message, output
+                            )
+                            and tool_parser
+                        ):
                             latest_delta_len = 0
-                            if ((isinstance(
+                            if (
+                                isinstance(
                                     delta_message.tool_calls[0].function,
-                                    DeltaFunctionCall)) and isinstance(
-                                        delta_message.tool_calls[0].function.
-                                        arguments, str)):
+                                    DeltaFunctionCall,
+                                )
+                            ) and isinstance(
+                                delta_message.tool_calls[0].function.arguments, str
+                            ):
                                 latest_delta_len = len(
-                                    delta_message.tool_calls[0].function.
-                                    arguments)
+                                    delta_message.tool_calls[0].function.arguments
+                                )
 
                             # get the expected call based on partial JSON
                             # parsing which "autocompletes" the JSON
                             expected_call = json.dumps(
                                 tool_parser.prev_tool_call_arr[index].get(
-                                    "arguments", {}),
-                                ensure_ascii=False)
+                                    "arguments", {}
+                                ),
+                                ensure_ascii=False,
+                            )
 
                             # get what we've streamed so far for arguments
                             # for the current tool
-                            actual_call = tool_parser.streamed_args_for_tool[
-                                index]
-                            if (latest_delta_len > 0):
+                            actual_call = tool_parser.streamed_args_for_tool[index]
+                            if latest_delta_len > 0:
                                 actual_call = actual_call[:-latest_delta_len]
 
                             # check to see if there's anything left to stream
-                            remaining_call = expected_call.replace(
-                                actual_call, "", 1)
+                            remaining_call = expected_call.replace(actual_call, "", 1)
                             # set that as a delta message
-                            delta_message = DeltaMessage(tool_calls=[
-                                DeltaToolCall(index=index,
-                                              function=DeltaFunctionCall(
-                                                  arguments=remaining_call).
-                                              model_dump(exclude_none=True))
-                            ])
+                            delta_message = DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=index,
+                                        function=DeltaFunctionCall(
+                                            arguments=remaining_call
+                                        ).model_dump(exclude_none=True),
+                                    )
+                                ]
+                            )
 
                         # Send the finish response for each request.n only once
-                        if auto_tools_called or tools_streamed[i] or (
-                                self.use_harmony
-                                and harmony_tools_streamed[i]):
+                        # In OpenAI's API, when a tool is called, the
+                        # finish_reason is:
+                        # "tool_calls" for "auto" or "required" tool calls,
+                        # and "stop" for named tool calls.
+                        if (
+                            auto_tools_called
+                            or (tools_streamed[i] and not tool_choice_function_name)
+                            or (self.use_harmony and harmony_tools_streamed[i])
+                        ):
                             finish_reason_ = "tool_calls"
                         else:
-                            finish_reason_ = output.finish_reason \
-                                if output.finish_reason else "stop"
+                            finish_reason_ = (
+                                output.finish_reason if output.finish_reason else "stop"
+                            )
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=finish_reason_,
                             stop_reason=output.stop_reason,
-                            token_ids=(as_list(output.token_ids)
-                                       if request.return_token_ids else None))
+                            token_ids=(
+                                as_list(output.token_ids)
+                                if request.return_token_ids
+                                else None
+                            ),
+                        )
 
                         finish_reason_sent[i] = True
 
@@ -1076,7 +1205,8 @@ class OpenAIServingChat(OpenAIServing):
                         object=chunk_object_type,
                         created=created_time,
                         choices=[choice_data],
-                        model=model_name)
+                        model=model_name,
+                    )
 
                     # handle usage stats if requested & if continuous
                     if include_continuous_usage:
@@ -1094,13 +1224,15 @@ class OpenAIServingChat(OpenAIServing):
             # is sent, send the usage
             if include_usage:
                 completion_tokens = sum(previous_num_tokens)
-                final_usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                                        completion_tokens=completion_tokens,
-                                        total_tokens=num_prompt_tokens +
-                                        completion_tokens)
+                final_usage = UsageInfo(
+                    prompt_tokens=num_prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=num_prompt_tokens + completion_tokens,
+                )
                 if self.enable_prompt_tokens_details and num_cached_tokens:
                     final_usage.prompt_tokens_details = PromptTokenUsageInfo(
-                        cached_tokens=num_cached_tokens)
+                        cached_tokens=num_cached_tokens
+                    )
 
                 final_usage_chunk = ChatCompletionStreamResponse(
                     id=request_id,
@@ -1108,9 +1240,11 @@ class OpenAIServingChat(OpenAIServing):
                     created=created_time,
                     choices=[],
                     model=model_name,
-                    usage=final_usage)
-                final_usage_data = (final_usage_chunk.model_dump_json(
-                    exclude_unset=True, exclude_none=True))
+                    usage=final_usage,
+                )
+                final_usage_data = final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True
+                )
                 yield f"data: {final_usage_data}\n\n"
 
             # report to FastAPI middleware aggregate usage across all choices
@@ -1127,14 +1261,13 @@ class OpenAIServingChat(OpenAIServing):
                 for i in range(num_choices):
                     full_text = (
                         previous_texts[i]
-                        if previous_texts and i < len(previous_texts) else
-                        f"<streaming_complete: {previous_num_tokens[i]} tokens>"
+                        if previous_texts and i < len(previous_texts)
+                        else f"<streaming_complete: {previous_num_tokens[i]} tokens>"
                     )
                     self.request_logger.log_outputs(
                         request_id=request_id,
                         outputs=full_text,
-                        output_token_ids=
-                        None,  # Consider also logging all token IDs
+                        output_token_ids=None,  # Consider also logging all token IDs
                         finish_reason="streaming_complete",
                         is_streaming=True,
                         delta=False,
@@ -1157,10 +1290,9 @@ class OpenAIServingChat(OpenAIServing):
         conversation: list[ConversationMessage],
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-    ) -> Union[ErrorResponse, ChatCompletionResponse]:
-
+    ) -> ErrorResponse | ChatCompletionResponse:
         created_time = int(time.time())
-        final_res: Optional[RequestOutput] = None
+        final_res: RequestOutput | None = None
 
         try:
             async for res in result_generator:
@@ -1174,7 +1306,7 @@ class OpenAIServingChat(OpenAIServing):
         assert final_res is not None
 
         choices: list[ChatCompletionResponseChoice] = []
-        if self.tool_call_id_type == 'kimi_k2':
+        if self.tool_call_id_type == "kimi_k2":
             history_tool_call_cnt = get_history_tool_calls_cnt(conversation)
         else:
             history_tool_call_cnt = 0
@@ -1198,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing):
                 logprobs = None
 
             if self.use_harmony:
-                reasoning_content, content, _ = parse_chat_output(token_ids)
+                reasoning, content, _ = parse_chat_output(token_ids)
                 if not request.include_reasoning:
-                    reasoning_content = None
+                    reasoning = None
 
                 if self.tool_parser is not None:
                     tool_parser = self.tool_parser(tokenizer)
@@ -1213,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing):
                     content = tool_call_info.content
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=content,
                         tool_calls=tool_call_info.tool_calls,
                     )
                 else:
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=content,
                     )
 
@@ -1228,122 +1360,124 @@ class OpenAIServingChat(OpenAIServing):
                     index=output.index,
                     message=message,
                     logprobs=logprobs,
-                    finish_reason="tool_calls" if
-                    (tool_call_info is not None
-                     and tool_call_info.tools_called) else
-                    output.finish_reason if output.finish_reason else "stop",
+                    finish_reason=(
+                        "tool_calls"
+                        if (tool_call_info is not None and tool_call_info.tools_called)
+                        else output.finish_reason
+                        if output.finish_reason
+                        else "stop"
+                    ),
                     stop_reason=output.stop_reason,
+                    token_ids=(
+                        as_list(output.token_ids) if request.return_token_ids else None
+                    ),
                 )
                 choices.append(choice_data)
                 continue
 
             if self.reasoning_parser:
                 try:
-                    reasoning_parser = self.reasoning_parser(tokenizer)
+                    reasoning_parser = self.reasoning_parser(
+                        tokenizer,
+                        chat_template_kwargs=request.chat_template_kwargs,  # type: ignore
+                    )
                 except RuntimeError as e:
                     logger.exception("Error in reasoning parser creation.")
                     return self.create_error_response(str(e))
                 # If the reasoning parser is enabled,
                 # tool calls are extracted exclusively from the content.
-                reasoning_content, content = (
-                    reasoning_parser.extract_reasoning_content(
-                        output.text, request=request))
+                reasoning, content = reasoning_parser.extract_reasoning(
+                    output.text, request=request
+                )
                 if not request.include_reasoning:
-                    reasoning_content = None
+                    reasoning = None
             else:
-                reasoning_content = None
+                reasoning = None
                 content = output.text
 
             auto_tools_called = False
             # if auto tools are not enabled, and a named tool choice using
             #   outlines is not being used
-            if (not self.enable_auto_tools or not self.tool_parser) and \
-                (not isinstance(request.tool_choice,
-                                ChatCompletionNamedToolChoiceParam
-                                ) and request.tool_choice != "required"):
-                message = ChatMessage(role=role,
-                                      reasoning_content=reasoning_content,
-                                      content=content)
+            tool_calls, content = self._parse_tool_calls_from_content(
+                request=request,
+                tokenizer=tokenizer,
+                content=content,
+                enable_auto_tools=self.enable_auto_tools,
+                tool_parser_cls=self.tool_parser,
+            )
+            tool_call_class = (
+                MistralToolCall if isinstance(tokenizer, MistralTokenizer) else ToolCall
+            )
+            if (not self.enable_auto_tools or not self.tool_parser) and (
+                not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
+                and request.tool_choice != "required"
+            ):
+                message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
             # if the request uses tools and specified a tool choice
-            elif request.tool_choice and type(
-                    request.tool_choice) is ChatCompletionNamedToolChoiceParam:
-
-                tool_call_class = MistralToolCall if isinstance(
-                    tokenizer, MistralTokenizer) else ToolCall
+            elif (
+                request.tool_choice
+                and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam
+            ):
+                assert tool_calls is not None and len(tool_calls) > 0
                 message = ChatMessage(
                     role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content="",
-                    tool_calls=[
-                        tool_call_class(function=FunctionCall(
-                            name=request.tool_choice.function.name,
-                            arguments=content,
-                        ))
-                    ],
+                    tool_calls=[tool_call_class(function=tc) for tc in tool_calls],
                 )
 
             elif request.tool_choice and request.tool_choice == "required":
-                tool_call_class = MistralToolCall if isinstance(
-                    tokenizer, MistralTokenizer) else ToolCall
-
-                # the fields of FunctionDefinition are a superset of the
-                # tool call outputs and can be used for parsing
-                assert content is not None
-                tool_calls = TypeAdapter(
-                    list[FunctionDefinition]).validate_json(content)
-                tool_call_ids = []
+                tool_call_class_items = []
+                assert tool_calls is not None and len(tool_calls) > 0
                 for tool_call in tool_calls:
-                    tool_call_ids.append(
-                        make_tool_call_id(id_type=self.tool_call_id_type,
-                                          func_name=tool_call.name,
-                                          idx=history_tool_call_cnt))
+                    tool_call_class_items.append(
+                        tool_call_class(
+                            id=make_tool_call_id(
+                                id_type=self.tool_call_id_type,
+                                func_name=tool_call.name,
+                                idx=history_tool_call_cnt,
+                            ),
+                            function=tool_call,
+                        )
+                    )
                     history_tool_call_cnt += 1
                 message = ChatMessage(
                     role=role,
                     content="",
-                    tool_calls=[
-                        tool_call_class(id=tool_call_ids[i],
-                                        function=FunctionCall(
-                                            name=tool_call.name,
-                                            arguments=json.dumps(
-                                                tool_call.parameters,
-                                                ensure_ascii=False)))
-                        for i, tool_call in enumerate(tool_calls)
-                    ],
-                    reasoning_content=reasoning_content)
+                    tool_calls=tool_call_class_items,
+                    reasoning=reasoning,
+                )
 
             # if the request doesn't use tool choice
             # OR specifies to not use a tool
             elif not request.tool_choice or request.tool_choice == "none":
-
-                message = ChatMessage(role=role,
-                                      reasoning_content=reasoning_content,
-                                      content=content)
+                message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
             # handle when there are tools and tool choice is auto
-            elif request.tools and (
-                    request.tool_choice == "auto"
-                    or request.tool_choice is None) and self.enable_auto_tools \
-                    and self.tool_parser:
-
-                try:
-                    tool_parser = self.tool_parser(tokenizer)
-                except RuntimeError as e:
-                    logger.exception("Error in tool parser creation.")
-                    return self.create_error_response(str(e))
-
-                tool_call_info = tool_parser.extract_tool_calls(
-                    content if content is not None else "", request=request)
+            elif (
+                request.tools
+                and (request.tool_choice == "auto" or request.tool_choice is None)
+                and self.enable_auto_tools
+                and self.tool_parser
+            ):
                 # In the OpenAI API the finish_reason is "tools_called"
                 # if the tool choice is auto and the model produced a tool
                 # call. The same is not true for named function calls
-                auto_tools_called = tool_call_info.tools_called
-                if tool_call_info.tools_called:
-                    message = ChatMessage(role=role,
-                                          reasoning_content=reasoning_content,
-                                          content=tool_call_info.content,
-                                          tool_calls=tool_call_info.tool_calls)
+                auto_tools_called = tool_calls is not None and len(tool_calls) > 0
+                if tool_calls:
+                    message = ChatMessage(
+                        role=role,
+                        reasoning=reasoning,
+                        content=content,
+                        tool_calls=[
+                            ToolCall(
+                                function=tc,
+                                type="function",
+                            )
+                            for tc in tool_calls
+                        ],
+                    )
 
                 else:
                     # FOR NOW make it a chat message; we will have to detect
@@ -1352,48 +1486,61 @@ class OpenAIServingChat(OpenAIServing):
 
                     # try to use content return from tool parser first,
                     # tool parser may do some modify for the content.
-                    if (tool_call_info.content
-                            and len(tool_call_info.content) > 0):
-                        ret_content = tool_call_info.content
-                    message = ChatMessage(role=role,
-                                          reasoning_content=reasoning_content,
-                                          content=ret_content)
+                    if content and len(content) > 0:
+                        ret_content = content
+                    message = ChatMessage(
+                        role=role,
+                        reasoning=reasoning,
+                        content=ret_content,
+                    )
 
             # undetermined case that is still important to handle
             else:
                 logger.error(
                     "Error in chat_completion_full_generator - cannot determine"
                     " if tools should be extracted. Returning a standard chat "
-                    "completion.")
-                message = ChatMessage(role=role,
-                                      reasoning_content=reasoning_content,
-                                      content=content)
+                    "completion."
+                )
+                message = ChatMessage(role=role, reasoning=reasoning, content=content)
+            # In OpenAI's API, when a tool is called, the finish_reason is:
+            # "tool_calls" for "auto" or "required" tool calls,
+            # and "stop" for named tool calls.
+            is_finish_reason_tool_calls = auto_tools_called or (
+                request.tool_choice
+                and request.tool_choice == "required"
+                and output.finish_reason == "stop"
+            )
 
             choice_data = ChatCompletionResponseChoice(
                 index=output.index,
                 message=message,
                 logprobs=logprobs,
-                finish_reason="tool_calls" if auto_tools_called else
-                output.finish_reason if output.finish_reason else "stop",
+                finish_reason="tool_calls"
+                if is_finish_reason_tool_calls
+                else output.finish_reason
+                if output.finish_reason
+                else "stop",
                 stop_reason=output.stop_reason,
-                token_ids=(as_list(output.token_ids)
-                           if request.return_token_ids else None),
+                token_ids=(
+                    as_list(output.token_ids) if request.return_token_ids else None
+                ),
             )
 
             choices.append(choice_data)
 
         if request.echo:
-            last_msg_content: Union[str, list[dict[str, str]]] = ""
-            if (conversation and "content" in conversation[-1]
-                    and conversation[-1].get("role") == role):
+            last_msg_content: str | list[dict[str, str]] = ""
+            if (
+                conversation
+                and "content" in conversation[-1]
+                and conversation[-1].get("role") == role
+            ):
                 last_msg_content = conversation[-1]["content"] or ""
             if isinstance(last_msg_content, list):
-                last_msg_content = "\n".join(msg['text']
-                                             for msg in last_msg_content)
+                last_msg_content = "\n".join(msg["text"] for msg in last_msg_content)
 
             for choice in choices:
-                full_message = last_msg_content + (choice.message.content
-                                                   or "")
+                full_message = last_msg_content + (choice.message.content or "")
                 choice.message.content = full_message
 
         assert final_res.prompt_token_ids is not None
@@ -1401,14 +1548,17 @@ class OpenAIServingChat(OpenAIServing):
         if final_res.encoder_prompt_token_ids is not None:
             num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
         num_generated_tokens = sum(
-            len(output.token_ids) for output in final_res.outputs)
-        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                          completion_tokens=num_generated_tokens,
-                          total_tokens=num_prompt_tokens +
-                          num_generated_tokens)
+            len(output.token_ids) for output in final_res.outputs
+        )
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
         if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
             usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
+                cached_tokens=final_res.num_cached_tokens
+            )
 
         request_metadata.final_usage_info = usage
 
@@ -1419,8 +1569,9 @@ class OpenAIServingChat(OpenAIServing):
             choices=choices,
             usage=usage,
             prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),
-            prompt_token_ids=(final_res.prompt_token_ids
-                              if request.return_token_ids else None),
+            prompt_token_ids=(
+                final_res.prompt_token_ids if request.return_token_ids else None
+            ),
             kv_transfer_params=final_res.kv_transfer_params,
         )
 
@@ -1435,9 +1586,11 @@ class OpenAIServingChat(OpenAIServing):
                     tool_call_descriptions = []
                     for tc in choice.message.tool_calls:
                         if hasattr(tc.function, "name") and hasattr(
-                                tc.function, "arguments"):
+                            tc.function, "arguments"
+                        ):
                             tool_call_descriptions.append(
-                                f"{tc.function.name}({tc.function.arguments})")
+                                f"{tc.function.name}({tc.function.arguments})"
+                            )
                     tool_calls_str = ", ".join(tool_call_descriptions)
                     output_text = f"[tool_calls: {tool_calls_str}]"
 
@@ -1445,8 +1598,7 @@ class OpenAIServingChat(OpenAIServing):
                     # Get the corresponding output token IDs
                     output_token_ids = None
                     if choice.index < len(final_res.outputs):
-                        output_token_ids = final_res.outputs[
-                            choice.index].token_ids
+                        output_token_ids = final_res.outputs[choice.index].token_ids
 
                     self.request_logger.log_outputs(
                         request_id=request_id,
@@ -1460,40 +1612,48 @@ class OpenAIServingChat(OpenAIServing):
         return response
 
     def _get_top_logprobs(
-            self, logprobs: dict[int, Logprob], top_logprobs: Optional[int],
-            tokenizer: AnyTokenizer,
-            should_return_as_token_id: bool) -> list[ChatCompletionLogProb]:
+        self,
+        logprobs: dict[int, Logprob],
+        top_logprobs: int | None,
+        tokenizer: AnyTokenizer,
+        should_return_as_token_id: bool,
+    ) -> list[ChatCompletionLogProb]:
         return [
             ChatCompletionLogProb(
-                token=(token := self._get_decoded_token(
-                    p[1],
-                    p[0],
-                    tokenizer,
-                    return_as_token_id=should_return_as_token_id,
-                )),
+                token=(
+                    token := self._get_decoded_token(
+                        p[1],
+                        p[0],
+                        tokenizer,
+                        return_as_token_id=should_return_as_token_id,
+                    )
+                ),
                 logprob=max(p[1].logprob, -9999.0),
                 bytes=list(token.encode("utf-8", errors="replace")),
-            ) for i, p in enumerate(logprobs.items())
-            if top_logprobs and i < top_logprobs
+            )
+            for i, p in enumerate(logprobs.items())
+            if (top_logprobs and i < top_logprobs or top_logprobs == -1)
         ]
 
     def _create_chat_logprobs(
         self,
         token_ids: GenericSequence[int],
-        top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
+        top_logprobs: GenericSequence[dict[int, Logprob] | None],
         tokenizer: AnyTokenizer,
-        num_output_top_logprobs: Optional[int] = None,
-        return_as_token_id: Optional[bool] = None,
+        num_output_top_logprobs: int | None = None,
+        return_as_token_id: bool | None = None,
     ) -> ChatCompletionLogProbs:
         """Create OpenAI-style logprobs."""
         logprobs_content: list[ChatCompletionLogProbsContent] = []
 
-        should_return_as_token_id = return_as_token_id if \
-            return_as_token_id is not None else self.return_tokens_as_token_ids
+        should_return_as_token_id = (
+            return_as_token_id
+            if return_as_token_id is not None
+            else self.return_tokens_as_token_ids
+        )
         for i, token_id in enumerate(token_ids):
             step_top_logprobs = top_logprobs[i]
-            if step_top_logprobs is None or step_top_logprobs.get(
-                    token_id) is None:
+            if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
                 if should_return_as_token_id:
                     token = f"token_id:{token_id}"
                 else:
@@ -1503,7 +1663,8 @@ class OpenAIServingChat(OpenAIServing):
                     ChatCompletionLogProbsContent(
                         token=token,
                         bytes=list(token.encode("utf-8", errors="replace")),
-                    ))
+                    )
+                )
             else:
                 step_token = step_top_logprobs[token_id]
                 step_decoded = step_token.decoded_token
@@ -1517,17 +1678,23 @@ class OpenAIServingChat(OpenAIServing):
                             should_return_as_token_id,
                         ),
                         logprob=max(step_token.logprob, -9999.0),
-                        bytes=None if step_decoded is None else list(
-                            step_decoded.encode("utf-8", errors="replace")),
+                        bytes=(
+                            None
+                            if step_decoded is None
+                            else list(step_decoded.encode("utf-8", errors="replace"))
+                        ),
                         top_logprobs=self._get_top_logprobs(
-                            step_top_logprobs, num_output_top_logprobs,
-                            tokenizer, should_return_as_token_id),
-                    ))
+                            step_top_logprobs,
+                            num_output_top_logprobs,
+                            tokenizer,
+                            should_return_as_token_id,
+                        ),
+                    )
+                )
 
         return ChatCompletionLogProbs(content=logprobs_content)
 
-    def _should_stream_with_auto_tool_parsing(self,
-                                              request: ChatCompletionRequest):
+    def _should_stream_with_auto_tool_parsing(self, request: ChatCompletionRequest):
         """
         Utility function to check if streamed tokens should go through the tool
         call parser that was configured.
@@ -1536,12 +1703,16 @@ class OpenAIServingChat(OpenAIServing):
         is configured, "auto" tool choice is enabled, and the request's tool
         choice field indicates that "auto" tool choice should be used.
         """
-        return (request.tools and self.tool_parser and self.enable_auto_tools
-                and request.tool_choice in ['auto', None])
+        return (
+            request.tools
+            and self.tool_parser
+            and self.enable_auto_tools
+            and request.tool_choice in ["auto", None]
+        )
 
     def _should_check_for_unstreamed_tool_arg_tokens(
         self,
-        delta_message: Optional[DeltaMessage],
+        delta_message: DeltaMessage | None,
         output: CompletionOutput,
     ) -> bool:
         """
@@ -1550,13 +1721,15 @@ class OpenAIServingChat(OpenAIServing):
         is a tool call with arguments.
         """
 
-        # yapf: disable
         return bool(
             # if there is a delta message that includes tool calls which
             # include a function that has arguments
             output.finish_reason is not None
-            and self.enable_auto_tools and self.tool_parser and delta_message
-            and delta_message.tool_calls and delta_message.tool_calls[0]
+            and self.enable_auto_tools
+            and self.tool_parser
+            and delta_message
+            and delta_message.tool_calls
+            and delta_message.tool_calls[0]
             and delta_message.tool_calls[0].function
             and delta_message.tool_calls[0].function.arguments is not None
         )
@@ -1575,7 +1748,9 @@ class OpenAIServingChat(OpenAIServing):
         sys_msg = get_system_message(
             reasoning_effort=request.reasoning_effort,
             browser_description=None,
-            python_description=None)
+            python_description=None,
+            with_custom_tools=request.tools is not None,
+        )
         messages.append(sys_msg)
 
         # Add developer message.
@@ -1584,7 +1759,7 @@ class OpenAIServingChat(OpenAIServing):
 
         # Add user message.
         for chat_msg in request.messages:
-            messages.extend(parse_chat_input(chat_msg))
+            messages.extend(parse_input_to_harmony_message(chat_msg))
 
         # Render prompt token ids.
         prompt_token_ids = render_for_completion(messages)
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index fc56668aeb1b665469db05d28abca03fee58d062..167ee152fece382d51a99983ea72f2c8a3047280 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -2,23 +2,30 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from http import HTTPStatus
-from typing import Optional, Union, cast
+from typing import cast
 
+import jinja2
 import numpy as np
 from fastapi import Request
-from typing_extensions import override
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (ClassificationData,
-                                              ClassificationRequest,
-                                              ClassificationResponse,
-                                              ErrorResponse, UsageInfo)
-# yapf: enable
-from vllm.entrypoints.openai.serving_engine import (ClassificationServeContext,
-                                                    OpenAIServing,
-                                                    ServeContext)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ClassificationChatRequest,
+    ClassificationCompletionRequest,
+    ClassificationData,
+    ClassificationRequest,
+    ClassificationResponse,
+    ErrorResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import (
+    ClassificationServeContext,
+    OpenAIServing,
+    ServeContext,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
 from vllm.logger import init_logger
@@ -29,45 +36,96 @@ logger = init_logger(__name__)
 
 
 class ClassificationMixin(OpenAIServing):
+    chat_template: str | None
+    chat_template_content_format: ChatTemplateContentFormatOption
+    trust_request_chat_template: bool
 
-    @override
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """
         Process classification inputs: tokenize text, resolve adapters,
         and prepare model-specific inputs.
         """
         ctx = cast(ClassificationServeContext, ctx)
-        if isinstance(ctx.request.input, str) and not ctx.request.input:
-            return self.create_error_response(
-                "Input cannot be empty for classification",
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
-
-        if isinstance(ctx.request.input, list) and len(ctx.request.input) == 0:
-            return None
-
         try:
             ctx.tokenizer = await self.engine_client.get_tokenizer()
 
-            renderer = self._get_renderer(ctx.tokenizer)
-            ctx.engine_prompts = await renderer.render_prompt(
-                prompt_or_prompts=ctx.request.input,
-                config=self._build_render_config(ctx.request))
+            request_obj = ctx.request
+
+            if isinstance(request_obj, ClassificationChatRequest):
+                chat_request = request_obj
+                messages = chat_request.messages
+                trust_request_chat_template = getattr(
+                    self,
+                    "trust_request_chat_template",
+                    False,
+                )
+                ret = self._validate_chat_template(
+                    request_chat_template=chat_request.chat_template,
+                    chat_template_kwargs=chat_request.chat_template_kwargs,
+                    trust_request_chat_template=trust_request_chat_template,
+                )
+                if ret:
+                    return ret
+
+                (
+                    _,
+                    _,
+                    engine_prompts,
+                ) = await self._preprocess_chat(
+                    cast(ChatCompletionRequest, chat_request),
+                    ctx.tokenizer,
+                    messages,
+                    chat_template=(
+                        chat_request.chat_template
+                        or getattr(self, "chat_template", None)
+                    ),
+                    chat_template_content_format=cast(
+                        ChatTemplateContentFormatOption,
+                        getattr(self, "chat_template_content_format", "auto"),
+                    ),
+                    add_generation_prompt=False,
+                    continue_final_message=False,
+                    add_special_tokens=chat_request.add_special_tokens,
+                )
+                ctx.engine_prompts = engine_prompts
+
+            elif isinstance(request_obj, ClassificationCompletionRequest):
+                completion_request = request_obj
+                input_data = completion_request.input
+                if input_data in (None, ""):
+                    return self.create_error_response(
+                        "Input or messages must be provided",
+                        status_code=HTTPStatus.BAD_REQUEST,
+                    )
+                if isinstance(input_data, list) and not input_data:
+                    ctx.engine_prompts = []
+                    return None
+
+                renderer = self._get_renderer(ctx.tokenizer)
+                prompt_input = cast(str | list[str], input_data)
+                ctx.engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=prompt_input,
+                    config=self._build_render_config(completion_request),
+                )
+            else:
+                return self.create_error_response(
+                    "Invalid classification request type",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
 
             return None
 
-        except (ValueError, TypeError) as e:
+        except (ValueError, TypeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
-    @override
     def _build_response(
         self,
         ctx: ServeContext,
-    ) -> Union[ClassificationResponse, ErrorResponse]:
+    ) -> ClassificationResponse | ErrorResponse:
         """
         Convert model outputs to a formatted classification response
         with probabilities and labels.
@@ -76,16 +134,16 @@ class ClassificationMixin(OpenAIServing):
         items: list[ClassificationData] = []
         num_prompt_tokens = 0
 
-        final_res_batch_checked = cast(list[PoolingRequestOutput],
-                                       ctx.final_res_batch)
+        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)
 
         for idx, final_res in enumerate(final_res_batch_checked):
             classify_res = ClassificationOutput.from_base(final_res.outputs)
 
             probs = classify_res.probs
             predicted_index = int(np.argmax(probs))
-            label = getattr(self.model_config.hf_config, "id2label",
-                            {}).get(predicted_index)
+            label = getattr(self.model_config.hf_config, "id2label", {}).get(
+                predicted_index
+            )
 
             item = ClassificationData(
                 index=idx,
@@ -111,11 +169,12 @@ class ClassificationMixin(OpenAIServing):
             usage=usage,
         )
 
-    def _build_render_config(self,
-                             request: ClassificationRequest) -> RenderConfig:
+    def _build_render_config(self, request: ClassificationRequest) -> RenderConfig:
         return RenderConfig(
             max_length=self.max_model_len,
-            truncate_prompt_tokens=request.truncate_prompt_tokens)
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens,
+        )
 
 
 class ServingClassification(ClassificationMixin):
@@ -124,28 +183,32 @@ class ServingClassification(ClassificationMixin):
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
+        chat_template: str | None = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=request_logger,
             log_error_stack=log_error_stack,
         )
 
+        self.chat_template = chat_template
+        self.chat_template_content_format = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
+
     async def create_classify(
         self,
         request: ClassificationRequest,
         raw_request: Request,
-    ) -> Union[ClassificationResponse, ErrorResponse]:
+    ) -> ClassificationResponse | ErrorResponse:
         model_name = self.models.model_name()
-        request_id = (f"{self.request_id_prefix}-"
-                      f"{self._base_request_id(raw_request)}")
+        request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
 
         ctx = ClassificationServeContext(
             request=request,
@@ -156,11 +219,10 @@ class ServingClassification(ClassificationMixin):
 
         return await super().handle(ctx)  # type: ignore
 
-    @override
     def _create_pooling_params(
         self,
         ctx: ClassificationServeContext,
-    ) -> Union[PoolingParams, ErrorResponse]:
+    ) -> PoolingParams | ErrorResponse:
         pooling_params = super()._create_pooling_params(ctx)
         if isinstance(pooling_params, ErrorResponse):
             return pooling_params
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 0c61c48da0bc81411bd2245df0a21182a8f86411..a114b77ebc16ba926f0e672b3165ac8556e8e220 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -5,54 +5,49 @@ import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Optional, Union, cast
+from typing import cast
 
 import jinja2
 from fastapi import Request
-from typing_extensions import assert_never
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
-                                              CompletionRequest,
-                                              CompletionResponse,
-                                              CompletionResponseChoice,
-                                              CompletionResponseStreamChoice,
-                                              CompletionStreamResponse,
-                                              ErrorResponse,
-                                              PromptTokenUsageInfo,
-                                              RequestResponseMetadata,
-                                              UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
-                                                    clamp_prompt_logprobs)
-# yapf: enable
+from vllm.entrypoints.openai.protocol import (
+    CompletionLogProbs,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    ErrorResponse,
+    PromptTokenUsageInfo,
+    RequestResponseMetadata,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
-from vllm.entrypoints.utils import get_max_tokens
-from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
-                              is_tokens_prompt)
+from vllm.entrypoints.utils import get_max_tokens, should_include_usage
+from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import as_list, merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import as_list
+from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
 logger = init_logger(__name__)
 
 
 class OpenAIServingCompletion(OpenAIServing):
-
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
@@ -60,16 +55,18 @@ class OpenAIServingCompletion(OpenAIServing):
     ):
         super().__init__(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=request_logger,
             return_tokens_as_token_ids=return_tokens_as_token_ids,
-            enable_force_include_usage=enable_force_include_usage,
             log_error_stack=log_error_stack,
         )
+
+        # set up logits processors
+        self.logits_processors = self.model_config.logits_processors
+
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
-        self.default_sampling_params = (
-            self.model_config.get_diff_sampling_param())
+        self.default_sampling_params = self.model_config.get_diff_sampling_param()
+        self.enable_force_include_usage = enable_force_include_usage
         if self.default_sampling_params:
             source = self.model_config.generation_config
             source = "model" if source == "auto" else source
@@ -82,8 +79,8 @@ class OpenAIServingCompletion(OpenAIServing):
     async def create_completion(
         self,
         request: CompletionRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
         """Completion API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/completions/create
@@ -105,21 +102,17 @@ class OpenAIServingCompletion(OpenAIServing):
 
         # Return error for unsupported features.
         if request.suffix is not None:
-            return self.create_error_response(
-                "suffix is not currently supported")
+            return self.create_error_response("suffix is not currently supported")
 
         if request.echo and request.prompt_embeds is not None:
-            return self.create_error_response(
-                "Echo is unsupported with prompt embeds.")
+            return self.create_error_response("Echo is unsupported with prompt embeds.")
 
-        if (request.prompt_logprobs is not None
-                and request.prompt_embeds is not None):
+        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
             return self.create_error_response(
-                "prompt_logprobs is not compatible with prompt embeds.")
+                "prompt_logprobs is not compatible with prompt embeds."
+            )
 
-        request_id = (
-            f"cmpl-"
-            f"{self._base_request_id(raw_request, request.request_id)}")
+        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
         created_time = int(time.time())
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
@@ -153,27 +146,24 @@ class OpenAIServingCompletion(OpenAIServing):
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
+        # Extract data_parallel_rank from header (router can inject it)
+        data_parallel_rank = self._get_data_parallel_rank(raw_request)
+
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[RequestOutput, None]] = []
         try:
             for i, engine_prompt in enumerate(engine_prompts):
-                sampling_params: Union[SamplingParams, BeamSearchParams]
-                # Mypy does not infer that engine_prompt will have only one of
-                # "prompt_token_ids" or "prompt_embeds" defined, and both of
-                # these as Union[object, the expected type], where it infers
-                # object if engine_prompt is a subclass of one of the
-                # typeddicts that defines both keys. Worse, because of
-                # https://github.com/python/mypy/issues/8586, mypy does not
-                # infer the type of engine_prompt correctly because of the
-                # enumerate. So we need an unnecessary cast here.
-                engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt],
-                                     engine_prompt)
-                if is_embeds_prompt(engine_prompt):
-                    input_length = len(engine_prompt["prompt_embeds"])
-                elif is_tokens_prompt(engine_prompt):
-                    input_length = len(engine_prompt["prompt_token_ids"])
+                prompt_text, prompt_token_ids, prompt_embeds = (
+                    self._get_prompt_components(engine_prompt)
+                )
+
+                input_length = None
+                if prompt_token_ids is not None:
+                    input_length = len(prompt_token_ids)
+                elif prompt_embeds is not None:
+                    input_length = len(prompt_embeds)
                 else:
-                    assert_never(engine_prompt)
+                    raise NotImplementedError
 
                 if self.default_sampling_params is None:
                     self.default_sampling_params = {}
@@ -185,15 +175,21 @@ class OpenAIServingCompletion(OpenAIServing):
                     default_sampling_params=self.default_sampling_params,
                 )
 
+                sampling_params: SamplingParams | BeamSearchParams
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        max_tokens, self.default_sampling_params)
+                        max_tokens, self.default_sampling_params
+                    )
                 else:
                     sampling_params = request.to_sampling_params(
                         max_tokens,
                         self.model_config.logits_processor_pattern,
                         self.default_sampling_params,
                     )
+                    validate_logits_processors_parameters(
+                        self.logits_processors,
+                        sampling_params,
+                    )
 
                 request_id_item = f"{request_id}-{i}"
 
@@ -204,29 +200,43 @@ class OpenAIServingCompletion(OpenAIServing):
                     lora_request=lora_request,
                 )
 
-                trace_headers = (None if raw_request is None else await
-                                 self._get_trace_headers(raw_request.headers))
+                trace_headers = (
+                    None
+                    if raw_request is None
+                    else await self._get_trace_headers(raw_request.headers)
+                )
 
                 # Mypy inconsistently requires this second cast in different
                 # environments. It shouldn't be necessary (redundant from above)
                 # but pre-commit in CI fails without it.
-                engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt],
-                                     engine_prompt)
+                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                 if isinstance(sampling_params, BeamSearchParams):
-                    generator = self.engine_client.beam_search(
+                    generator = self.beam_search(
                         prompt=engine_prompt,
                         request_id=request_id,
                         params=sampling_params,
                         lora_request=lora_request,
                     )
                 else:
-                    generator = self.engine_client.generate(
+                    engine_request, tokenization_kwargs = await self._process_inputs(
+                        request_id_item,
                         engine_prompt,
                         sampling_params,
+                        lora_request=lora_request,
+                        trace_headers=trace_headers,
+                        priority=request.priority,
+                    )
+
+                    generator = self.engine_client.generate(
+                        engine_request,
+                        sampling_params,
                         request_id_item,
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                         priority=request.priority,
+                        prompt_text=prompt_text,
+                        tokenization_kwargs=tokenization_kwargs,
+                        data_parallel_rank=data_parallel_rank,
                     )
 
                 generators.append(generator)
@@ -242,9 +252,11 @@ class OpenAIServingCompletion(OpenAIServing):
         # Similar to the OpenAI API, when n != best_of, we do not stream the
         # results. Noting that best_of is only supported in V0. In addition,
         # we do not stream the results when use beam search.
-        stream = (request.stream
-                  and (request.best_of is None or request.n == request.best_of)
-                  and not request.use_beam_search)
+        stream = (
+            request.stream
+            and (request.best_of is None or request.n == request.best_of)
+            and not request.use_beam_search
+        )
 
         # Streaming response
         if stream:
@@ -258,11 +270,10 @@ class OpenAIServingCompletion(OpenAIServing):
                 num_prompts=num_prompts,
                 tokenizer=tokenizer,
                 request_metadata=request_metadata,
-                enable_force_include_usage=self.enable_force_include_usage,
             )
 
         # Non-streaming response
-        final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts
+        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
@@ -275,11 +286,13 @@ class OpenAIServingCompletion(OpenAIServing):
                 # with the inputs token IDs
                 if final_res.prompt is None:
                     engine_prompt = engine_prompts[i]
-                    final_res.prompt = None if is_embeds_prompt(
-                        engine_prompt) else engine_prompt.get("prompt")
+                    final_res.prompt = (
+                        None
+                        if is_embeds_prompt(engine_prompt)
+                        else engine_prompt.get("prompt")
+                    )
 
-            final_res_batch_checked = cast(list[RequestOutput],
-                                           final_res_batch)
+            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)
 
             response = self.request_output_to_completion_response(
                 final_res_batch_checked,
@@ -312,7 +325,7 @@ class OpenAIServingCompletion(OpenAIServing):
     async def completion_stream_generator(
         self,
         request: CompletionRequest,
-        engine_prompts: list[Union[TokensPrompt, EmbedsPrompt]],
+        engine_prompts: list[TokensPrompt | EmbedsPrompt],
         result_generator: AsyncIterator[tuple[int, RequestOutput]],
         request_id: str,
         created_time: int,
@@ -320,7 +333,6 @@ class OpenAIServingCompletion(OpenAIServing):
         num_prompts: int,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-        enable_force_include_usage: bool,
     ) -> AsyncGenerator[str, None]:
         num_choices = 1 if request.n is None else request.n
         previous_text_lens = [0] * num_choices * num_prompts
@@ -331,13 +343,9 @@ class OpenAIServingCompletion(OpenAIServing):
         first_iteration = True
 
         stream_options = request.stream_options
-        if stream_options:
-            include_usage = (stream_options.include_usage
-                             or enable_force_include_usage)
-            include_continuous_usage = (include_usage and
-                                        stream_options.continuous_usage_stats)
-        else:
-            include_usage, include_continuous_usage = False, False
+        include_usage, include_continuous_usage = should_include_usage(
+            stream_options, self.enable_force_include_usage
+        )
 
         try:
             async for prompt_idx, res in result_generator:
@@ -351,16 +359,18 @@ class OpenAIServingCompletion(OpenAIServing):
                 prompt_text = res.prompt
                 if prompt_text is None:
                     engine_prompt = engine_prompts[prompt_idx]
-                    prompt_text = None if is_embeds_prompt(
-                        engine_prompt) else engine_prompt.get("prompt")
+                    prompt_text = (
+                        None
+                        if is_embeds_prompt(engine_prompt)
+                        else engine_prompt.get("prompt")
+                    )
 
                 # Prompt details are excluded from later streamed outputs
                 if prompt_token_ids is not None:
                     num_prompt_tokens[prompt_idx] = len(prompt_token_ids)
 
                 delta_token_ids: GenericSequence[int]
-                out_logprobs: Optional[GenericSequence[Optional[dict[
-                    int, Logprob]]]]
+                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
 
                 for output in res.outputs:
                     i = output.index + prompt_idx * num_choices
@@ -368,7 +378,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     # Useful when request.return_token_ids is True
                     # Returning prompt token IDs shares the same logic
                     # with the echo implementation.
-                    prompt_token_ids_to_return: Optional[list[int]] = None
+                    prompt_token_ids_to_return: list[int] | None = None
 
                     assert request.max_tokens is not None
                     if request.echo and not has_echoed[i]:
@@ -402,26 +412,27 @@ class OpenAIServingCompletion(OpenAIServing):
 
                         # has_echoed[i] is reused here to indicate whether
                         # we have already returned the prompt token IDs.
-                        if not has_echoed[i]:
+                        if not has_echoed[i] and request.return_token_ids:
                             prompt_token_ids_to_return = prompt_token_ids
                             has_echoed[i] = True
 
-                        if (not delta_text and not delta_token_ids
-                                and not previous_num_tokens[i]):
+                        if (
+                            not delta_text
+                            and not delta_token_ids
+                            and not previous_num_tokens[i]
+                        ):
                             # Chunked prefill case, don't return empty chunks
                             continue
 
                     if request.logprobs is not None:
-                        assert out_logprobs is not None, (
-                            "Did not output logprobs")
+                        assert out_logprobs is not None, "Did not output logprobs"
                         logprobs = self._create_completion_logprobs(
                             token_ids=delta_token_ids,
                             top_logprobs=out_logprobs,
                             num_output_top_logprobs=request.logprobs,
                             tokenizer=tokenizer,
                             initial_text_offset=previous_text_lens[i],
-                            return_as_token_id=request.
-                            return_tokens_as_token_ids,
+                            return_as_token_id=request.return_tokens_as_token_ids,
                         )
                     else:
                         logprobs = None
@@ -443,8 +454,11 @@ class OpenAIServingCompletion(OpenAIServing):
                                 finish_reason=finish_reason,
                                 stop_reason=stop_reason,
                                 prompt_token_ids=prompt_token_ids_to_return,
-                                token_ids=(as_list(output.token_ids) if
-                                           request.return_token_ids else None),
+                                token_ids=(
+                                    as_list(output.token_ids)
+                                    if request.return_token_ids
+                                    else None
+                                ),
                             )
                         ],
                     )
@@ -470,7 +484,8 @@ class OpenAIServingCompletion(OpenAIServing):
 
             if self.enable_prompt_tokens_details and num_cached_tokens:
                 final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
-                    cached_tokens=num_cached_tokens)
+                    cached_tokens=num_cached_tokens
+                )
 
             if include_usage:
                 final_usage_chunk = CompletionStreamResponse(
@@ -481,7 +496,8 @@ class OpenAIServingCompletion(OpenAIServing):
                     usage=final_usage_info,
                 )
                 final_usage_data = final_usage_chunk.model_dump_json(
-                    exclude_unset=False, exclude_none=True)
+                    exclude_unset=False, exclude_none=True
+                )
                 yield f"data: {final_usage_data}\n\n"
 
             # report to FastAPI middleware aggregate usage across all choices
@@ -516,8 +532,7 @@ class OpenAIServingCompletion(OpenAIServing):
             prompt_text = final_res.prompt
 
             token_ids: GenericSequence[int]
-            out_logprobs: Optional[GenericSequence[Optional[dict[int,
-                                                                 Logprob]]]]
+            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
 
             for output in final_res.outputs:
                 assert request.max_tokens is not None
@@ -567,10 +582,12 @@ class OpenAIServingCompletion(OpenAIServing):
                     finish_reason=output.finish_reason,
                     stop_reason=output.stop_reason,
                     prompt_logprobs=final_res.prompt_logprobs,
-                    prompt_token_ids=(prompt_token_ids
-                                      if request.return_token_ids else None),
-                    token_ids=(as_list(output.token_ids)
-                               if request.return_token_ids else None),
+                    prompt_token_ids=(
+                        prompt_token_ids if request.return_token_ids else None
+                    ),
+                    token_ids=(
+                        as_list(output.token_ids) if request.return_token_ids else None
+                    ),
                 )
                 choices.append(choice_data)
 
@@ -584,10 +601,14 @@ class OpenAIServingCompletion(OpenAIServing):
             total_tokens=num_prompt_tokens + num_generated_tokens,
         )
 
-        if (self.enable_prompt_tokens_details and last_final_res
-                and last_final_res.num_cached_tokens):
+        if (
+            self.enable_prompt_tokens_details
+            and last_final_res
+            and last_final_res.num_cached_tokens
+        ):
             usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=last_final_res.num_cached_tokens)
+                cached_tokens=last_final_res.num_cached_tokens
+            )
 
         request_metadata.final_usage_info = usage
         if final_res_batch:
@@ -604,23 +625,25 @@ class OpenAIServingCompletion(OpenAIServing):
     def _create_completion_logprobs(
         self,
         token_ids: GenericSequence[int],
-        top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
+        top_logprobs: GenericSequence[dict[int, Logprob] | None],
         num_output_top_logprobs: int,
         tokenizer: AnyTokenizer,
         initial_text_offset: int = 0,
-        return_as_token_id: Optional[bool] = None,
+        return_as_token_id: bool | None = None,
     ) -> CompletionLogProbs:
         """Create logprobs for OpenAI Completion API."""
         out_text_offset: list[int] = []
-        out_token_logprobs: list[Optional[float]] = []
+        out_token_logprobs: list[float | None] = []
         out_tokens: list[str] = []
-        out_top_logprobs: list[Optional[dict[str, float]]] = []
+        out_top_logprobs: list[dict[str, float] | None] = []
 
         last_token_len = 0
 
-        should_return_as_token_id = (return_as_token_id
-                                     if return_as_token_id is not None else
-                                     self.return_tokens_as_token_ids)
+        should_return_as_token_id = (
+            return_as_token_id
+            if return_as_token_id is not None
+            else self.return_tokens_as_token_ids
+        )
         for i, token_id in enumerate(token_ids):
             step_top_logprobs = top_logprobs[i]
             if step_top_logprobs is None:
@@ -649,19 +672,20 @@ class OpenAIServingCompletion(OpenAIServing):
                 # logprobs, as defined in the openai API
                 # (cf. https://github.com/openai/openai-openapi/blob/
                 # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
-                out_top_logprobs.append({
-                    # Convert float("-inf") to the
-                    # JSON-serializable float that OpenAI uses
-                    self._get_decoded_token(
-                        top_lp[1],
-                        top_lp[0],
-                        tokenizer,
-                        return_as_token_id=should_return_as_token_id,
-                    ):
-                    max(top_lp[1].logprob, -9999.0)
-                    for i, top_lp in enumerate(step_top_logprobs.items())
-                    if num_output_top_logprobs >= i
-                })
+                out_top_logprobs.append(
+                    {
+                        # Convert float("-inf") to the
+                        # JSON-serializable float that OpenAI uses
+                        self._get_decoded_token(
+                            top_lp[1],
+                            top_lp[0],
+                            tokenizer,
+                            return_as_token_id=should_return_as_token_id,
+                        ): max(top_lp[1].logprob, -9999.0)
+                        for i, top_lp in enumerate(step_top_logprobs.items())
+                        if num_output_top_logprobs >= i
+                    }
+                )
 
             if len(out_text_offset) == 0:
                 out_text_offset.append(initial_text_offset)
@@ -679,7 +703,7 @@ class OpenAIServingCompletion(OpenAIServing):
     def _build_render_config(
         self,
         request: CompletionRequest,
-        max_input_length: Optional[int] = None,
+        max_input_length: int | None = None,
     ) -> RenderConfig:
         max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
         return RenderConfig(
@@ -687,6 +711,5 @@ class OpenAIServingCompletion(OpenAIServing):
             truncate_prompt_tokens=request.truncate_prompt_tokens,
             add_special_tokens=request.add_special_tokens,
             cache_salt=request.cache_salt,
-            needs_detokenization=bool(request.echo
-                                      and not request.return_token_ids),
+            needs_detokenization=bool(request.echo and not request.return_token_ids),
         )
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 647e7daed6598513be6e1418ece951c4450fa241..51f6106acec3d46cc9a0a629f06efbdec9afaed6 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -1,61 +1,58 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import base64
+import json
 from collections.abc import AsyncGenerator, Mapping
-from typing import Any, Final, Literal, Optional, Union, cast
+from typing import Any, Final, cast
 
-import numpy as np
 import torch
 from fastapi import Request
+from fastapi.responses import Response
 from typing_extensions import assert_never, override
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-# yapf conflicts with isort for this docstring
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
-                                              EmbeddingCompletionRequest,
-                                              EmbeddingRequest,
-                                              EmbeddingResponse,
-                                              EmbeddingResponseData,
-                                              ErrorResponse, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext,
-                                                    OpenAIServing,
-                                                    ServeContext,
-                                                    TextTokensPrompt)
-# yapf: enable
+from vllm.entrypoints.openai.protocol import (
+    EmbeddingBytesResponse,
+    EmbeddingChatRequest,
+    EmbeddingCompletionRequest,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    EmbeddingResponseData,
+    ErrorResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import (
+    EmbeddingServeContext,
+    OpenAIServing,
+    ServeContext,
+    TextTokensPrompt,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
-from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput,
-                          PoolingOutput, PoolingRequestOutput, RequestOutput)
+from vllm.outputs import (
+    EmbeddingRequestOutput,
+    PoolingOutput,
+    PoolingRequestOutput,
+    RequestOutput,
+)
 from vllm.pooling_params import PoolingParams
-from vllm.utils import chunk_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import chunk_list
+from vllm.utils.serial_utils import (
+    EmbedDType,
+    EncodingFormat,
+    Endianness,
+    encode_pooling_bytes,
+    encode_pooling_output,
+)
 
 logger = init_logger(__name__)
 
 
-def _get_embedding(
-    output: EmbeddingOutput,
-    encoding_format: Literal["float", "base64"],
-) -> Union[list[float], str]:
-    if encoding_format == "float":
-        return output.embedding
-    elif encoding_format == "base64":
-        # Force to use float32 for base64 encoding
-        # to match the OpenAI python client behavior
-        embedding_bytes = np.array(output.embedding, dtype="float32").tobytes()
-        return base64.b64encode(embedding_bytes).decode("utf-8")
-
-    assert_never(encoding_format)
-
-
 class EmbeddingMixin(OpenAIServing):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -63,15 +60,19 @@ class EmbeddingMixin(OpenAIServing):
 
         # Avoid repeated attribute lookups
         self.supports_chunked_processing = bool(
-            pooler_config and pooler_config.enable_chunked_processing)
-        self.max_embed_len = (pooler_config.max_embed_len if pooler_config
-                              and pooler_config.max_embed_len else None)
+            pooler_config and pooler_config.enable_chunked_processing
+        )
+        self.max_embed_len = (
+            pooler_config.max_embed_len
+            if pooler_config and pooler_config.max_embed_len
+            else None
+        )
 
     @override
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         ctx = cast(EmbeddingServeContext, ctx)
         try:
             ctx.lora_request = self._maybe_get_adapters(ctx.request)
@@ -88,10 +89,8 @@ class EmbeddingMixin(OpenAIServing):
                     ctx.request,
                     tokenizer,
                     ctx.request.messages,
-                    chat_template=ctx.request.chat_template
-                    or ctx.chat_template,
-                    chat_template_content_format=ctx.
-                    chat_template_content_format,
+                    chat_template=ctx.request.chat_template or ctx.chat_template,
+                    chat_template_content_format=ctx.chat_template_content_format,
                     add_generation_prompt=ctx.request.add_generation_prompt,
                     continue_final_message=False,
                     add_special_tokens=ctx.request.add_special_tokens,
@@ -106,8 +105,7 @@ class EmbeddingMixin(OpenAIServing):
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
-    def _build_render_config(
-            self, request: EmbeddingCompletionRequest) -> RenderConfig:
+    def _build_render_config(self, request: EmbeddingCompletionRequest) -> RenderConfig:
         # Set max_length based on chunked processing capability
         if self._should_use_chunked_processing(request):
             max_length = None
@@ -117,44 +115,77 @@ class EmbeddingMixin(OpenAIServing):
         return RenderConfig(
             max_length=max_length,
             truncate_prompt_tokens=request.truncate_prompt_tokens,
-            add_special_tokens=request.add_special_tokens)
+            add_special_tokens=request.add_special_tokens,
+        )
 
     @override
     def _build_response(
         self,
         ctx: ServeContext,
-    ) -> Union[EmbeddingResponse, ErrorResponse]:
-        items: list[EmbeddingResponseData] = []
-        num_prompt_tokens = 0
+    ) -> EmbeddingResponse | Response | ErrorResponse:
+        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)
+
+        encoding_format: EncodingFormat = ctx.request.encoding_format
+        embed_dtype: EmbedDType = ctx.request.embed_dtype
+        endianness: Endianness = ctx.request.endianness
+
+        def encode_float_base64():
+            items: list[EmbeddingResponseData] = []
+            num_prompt_tokens = 0
+
+            for idx, final_res in enumerate(final_res_batch_checked):
+                item = EmbeddingResponseData(
+                    index=idx,
+                    embedding=encode_pooling_output(
+                        final_res,
+                        encoding_format=encoding_format,
+                        embed_dtype=embed_dtype,
+                        endianness=endianness,
+                    ),
+                )
+                prompt_token_ids = final_res.prompt_token_ids
 
-        final_res_batch_checked = cast(list[PoolingRequestOutput],
-                                       ctx.final_res_batch)
+                items.append(item)
+                num_prompt_tokens += len(prompt_token_ids)
 
-        for idx, final_res in enumerate(final_res_batch_checked):
-            embedding_res = EmbeddingRequestOutput.from_base(final_res)
+            usage = UsageInfo(
+                prompt_tokens=num_prompt_tokens,
+                total_tokens=num_prompt_tokens,
+            )
 
-            item = EmbeddingResponseData(
-                index=idx,
-                embedding=_get_embedding(embedding_res.outputs,
-                                         ctx.request.encoding_format),
+            return EmbeddingResponse(
+                id=ctx.request_id,
+                created=ctx.created_time,
+                model=ctx.model_name,
+                data=items,
+                usage=usage,
             )
-            prompt_token_ids = final_res.prompt_token_ids
 
-            items.append(item)
-            num_prompt_tokens += len(prompt_token_ids)
+        def encode_bytes():
+            body, items, usage = encode_pooling_bytes(
+                pooling_outputs=final_res_batch_checked,
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+            )
 
-        usage = UsageInfo(
-            prompt_tokens=num_prompt_tokens,
-            total_tokens=num_prompt_tokens,
-        )
+            metadata = {
+                "id": ctx.request_id,
+                "created": ctx.created_time,
+                "model": ctx.model_name,
+                "data": items,
+                "usage": usage,
+            }
+            return EmbeddingBytesResponse(
+                body=body,
+                metadata=json.dumps(metadata),
+            )
 
-        return EmbeddingResponse(
-            id=ctx.request_id,
-            created=ctx.created_time,
-            model=ctx.model_name,
-            data=items,
-            usage=usage,
-        )
+        if encoding_format == "float" or encoding_format == "base64":
+            return encode_float_base64()
+        elif encoding_format == "bytes":
+            return encode_bytes()
+        else:
+            assert_never(encoding_format)
 
     def _get_max_position_embeddings(self) -> int:
         """Get the model's effective maximum sequence length for chunking."""
@@ -162,10 +193,10 @@ class EmbeddingMixin(OpenAIServing):
 
     def _should_use_chunked_processing(self, request) -> bool:
         """Check if chunked processing should be used for this request."""
-        return isinstance(
-            request,
-            (EmbeddingCompletionRequest,
-             EmbeddingChatRequest)) and self.supports_chunked_processing
+        return (
+            isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest))
+            and self.supports_chunked_processing
+        )
 
     async def _process_chunked_request(
         self,
@@ -183,25 +214,27 @@ class EmbeddingMixin(OpenAIServing):
         max_pos_embeddings = self._get_max_position_embeddings()
         # Process all chunks for MEAN aggregation
         for chunk_idx, chunk_tokens in enumerate(
-                chunk_list(token_ids, max_pos_embeddings)):
+            chunk_list(token_ids, max_pos_embeddings)
+        ):
             # Create a request ID for this chunk
-            chunk_request_id = (f"{ctx.request_id}-prompt-{prompt_idx}-"
-                                f"chunk-{chunk_idx}")
+            chunk_request_id = f"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
 
             # Create engine prompt for this chunk
-            chunk_engine_prompt = EngineTokensPrompt(
-                prompt_token_ids=chunk_tokens)
+            chunk_engine_prompt = EngineTokensPrompt(prompt_token_ids=chunk_tokens)
 
             # Create chunk request prompt for logging
             chunk_text = ""
             chunk_request_prompt = TextTokensPrompt(
-                prompt=chunk_text, prompt_token_ids=chunk_tokens)
+                prompt=chunk_text, prompt_token_ids=chunk_tokens
+            )
 
             # Log the chunk
-            self._log_inputs(chunk_request_id,
-                             chunk_request_prompt,
-                             params=pooling_params,
-                             lora_request=ctx.lora_request)
+            self._log_inputs(
+                chunk_request_id,
+                chunk_request_prompt,
+                params=pooling_params,
+                lora_request=ctx.lora_request,
+            )
 
             # Create generator for this chunk and wrap it to return indices
             original_generator = self.engine_client.encode(
@@ -227,8 +260,7 @@ class EmbeddingMixin(OpenAIServing):
         token_num = len(input_ids)
 
         # Note: EmbeddingRequest doesn't have max_tokens
-        if isinstance(request,
-                      (EmbeddingCompletionRequest, EmbeddingChatRequest)):
+        if isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest)):
             # Check if chunked processing is enabled for pooling models
             enable_chunked = self._should_use_chunked_processing(request)
 
@@ -248,13 +280,15 @@ class EmbeddingMixin(OpenAIServing):
             validation_error_msg = (
                 "This model's {length_type} is {max_length_value} tokens. "
                 "However, you requested {token_num} tokens in the input for "
-                "embedding generation. Please reduce the length of the input.")
+                "embedding generation. Please reduce the length of the input."
+            )
 
             chunked_processing_error_msg = (
                 "This model's {length_type} is {max_length_value} tokens. "
                 "However, you requested {token_num} tokens in the input for "
                 "embedding generation. Please reduce the length of the input "
-                "or enable chunked processing.")
+                "or enable chunked processing."
+            )
 
             # Check if input exceeds max length
             if token_num > max_length_value:
@@ -262,7 +296,9 @@ class EmbeddingMixin(OpenAIServing):
                     validation_error_msg.format(
                         length_type=length_type,
                         max_length_value=max_length_value,
-                        token_num=token_num))
+                        token_num=token_num,
+                    )
+                )
 
             # Check for chunked processing
             # when exceeding max_position_embeddings
@@ -271,41 +307,49 @@ class EmbeddingMixin(OpenAIServing):
                     # Allow long inputs when chunked processing is enabled
                     logger.info(
                         "Input length %s exceeds max_position_embeddings "
-                        "%s, will use chunked processing", token_num,
-                        max_pos_embeddings)
+                        "%s, will use chunked processing",
+                        token_num,
+                        max_pos_embeddings,
+                    )
                 else:
                     raise ValueError(
                         chunked_processing_error_msg.format(
                             length_type="maximum position embeddings length",
                             max_length_value=max_pos_embeddings,
-                            token_num=token_num))
+                            token_num=token_num,
+                        )
+                    )
 
-            return TextTokensPrompt(prompt=input_text,
-                                    prompt_token_ids=input_ids)
+            return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
 
         # For other request types, use the parent's implementation
         return super()._validate_input(request, input_ids, input_text)
 
     def _is_text_tokens_prompt(self, prompt) -> bool:
         """Check if a prompt is a TextTokensPrompt (has prompt_token_ids)."""
-        return (isinstance(prompt, dict) and "prompt_token_ids" in prompt
-                and "prompt_embeds" not in prompt)
+        return (
+            isinstance(prompt, dict)
+            and "prompt_token_ids" in prompt
+            and "prompt_embeds" not in prompt
+        )
 
     async def _create_single_prompt_generator(
         self,
         ctx: EmbeddingServeContext,
         engine_prompt: EngineTokensPrompt,
         pooling_params: PoolingParams,
-        trace_headers: Optional[Mapping[str, str]],
+        trace_headers: Mapping[str, str] | None,
         prompt_index: int,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> AsyncGenerator[RequestOutput | PoolingRequestOutput, None]:
         """Create a generator for a single prompt using standard processing."""
         request_id_item = f"{ctx.request_id}-{prompt_index}"
 
-        self._log_inputs(request_id_item,
-                         engine_prompt,
-                         params=pooling_params,
-                         lora_request=ctx.lora_request)
+        self._log_inputs(
+            request_id_item,
+            engine_prompt,
+            params=pooling_params,
+            lora_request=ctx.lora_request,
+        )
 
         # Return the original generator without wrapping
         return self.engine_client.encode(
@@ -321,7 +365,7 @@ class EmbeddingMixin(OpenAIServing):
     async def _prepare_generators(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Override to support chunked processing."""
         ctx = cast(EmbeddingServeContext, ctx)
 
@@ -333,13 +377,16 @@ class EmbeddingMixin(OpenAIServing):
             return await super()._prepare_generators(ctx)
 
         # Custom logic for chunked processing
-        generators: list[AsyncGenerator[Union[RequestOutput,
-                                              PoolingRequestOutput],
-                                        None]] = []
+        generators: list[
+            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
+        ] = []
 
         try:
-            trace_headers = (None if ctx.raw_request is None else await
-                             self._get_trace_headers(ctx.raw_request.headers))
+            trace_headers = (
+                None
+                if ctx.raw_request is None
+                else await self._get_trace_headers(ctx.raw_request.headers)
+            )
 
             pooling_params = self._create_pooling_params(ctx)
             if isinstance(pooling_params, ErrorResponse):
@@ -352,8 +399,7 @@ class EmbeddingMixin(OpenAIServing):
                 return self.create_error_response(str(e))
 
             if ctx.engine_prompts is None:
-                return self.create_error_response(
-                    "Engine prompts not available")
+                return self.create_error_response("Engine prompts not available")
 
             max_pos_embeddings = self._get_max_position_embeddings()
 
@@ -363,21 +409,20 @@ class EmbeddingMixin(OpenAIServing):
                     # Cast to TextTokensPrompt since we've verified
                     # prompt_token_ids
                     text_tokens_prompt = cast(TextTokensPrompt, engine_prompt)
-                    if (len(text_tokens_prompt["prompt_token_ids"])
-                            > max_pos_embeddings):
+                    if len(text_tokens_prompt["prompt_token_ids"]) > max_pos_embeddings:
                         # Use chunked processing for this prompt
                         chunk_generators = await self._process_chunked_request(
-                            ctx, text_tokens_prompt, pooling_params,
-                            trace_headers, i)
+                            ctx, text_tokens_prompt, pooling_params, trace_headers, i
+                        )
                         generators.extend(chunk_generators)
                         continue
 
                 # Normal processing for short prompts or non-token prompts
                 generator = await self._create_single_prompt_generator(
-                    ctx, engine_prompt, pooling_params, trace_headers, i)
+                    ctx, engine_prompt, pooling_params, trace_headers, i
+                )
                 generators.append(generator)
 
-            from vllm.utils import merge_async_iterators
             ctx.result_generator = merge_async_iterators(*generators)
 
             return None
@@ -390,7 +435,7 @@ class EmbeddingMixin(OpenAIServing):
     async def _collect_batch(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Collect and aggregate batch results
         with support for chunked processing.
 
@@ -401,8 +446,7 @@ class EmbeddingMixin(OpenAIServing):
         ctx = cast(EmbeddingServeContext, ctx)
         try:
             if ctx.engine_prompts is None:
-                return self.create_error_response(
-                    "Engine prompts not available")
+                return self.create_error_response("Engine prompts not available")
 
             # Check if we used chunked processing
             use_chunked = self._should_use_chunked_processing(ctx.request)
@@ -411,8 +455,7 @@ class EmbeddingMixin(OpenAIServing):
                 return await super()._collect_batch(ctx=ctx)
 
             if ctx.result_generator is None:
-                return self.create_error_response(
-                    "Result generator not available")
+                return self.create_error_response("Result generator not available")
 
             # Online aggregation for chunked requests to
             # minimize memory usage
@@ -433,10 +476,10 @@ class EmbeddingMixin(OpenAIServing):
                     # Initialize aggregator for this prompt if needed
                     if prompt_idx not in prompt_aggregators:
                         prompt_aggregators[prompt_idx] = {
-                            'weighted_sum': None,
-                            'total_weight': 0,
-                            'chunk_count': 0,
-                            'request_id': result.request_id.split("-chunk-")[0]
+                            "weighted_sum": None,
+                            "total_weight": 0,
+                            "chunk_count": 0,
+                            "request_id": result.request_id.split("-chunk-")[0],
                         }
 
                     aggregator = prompt_aggregators[prompt_idx]
@@ -448,44 +491,45 @@ class EmbeddingMixin(OpenAIServing):
                         return self.create_error_response(
                             f"Expected PoolingRequestOutput for "
                             f"chunked embedding, got "
-                            f"{type(result).__name__}")
+                            f"{type(result).__name__}"
+                        )
 
                     # Handle both PoolingOutput and
                     # EmbeddingOutput types
-                    if hasattr(result.outputs, 'data'):
+                    if hasattr(result.outputs, "data"):
                         # PoolingOutput case
                         embedding_data = result.outputs.data
-                    elif hasattr(result.outputs, 'embedding'):
+                    elif hasattr(result.outputs, "embedding"):
                         # EmbeddingOutput case -
                         # convert embedding list to tensor
                         embedding_data = result.outputs.embedding
                     else:
                         return self.create_error_response(
-                            f"Unsupported output type: "
-                            f"{type(result.outputs).__name__}")
+                            f"Unsupported output type: {type(result.outputs).__name__}"
+                        )
 
                     if not isinstance(embedding_data, torch.Tensor):
-                        embedding_data = torch.tensor(embedding_data,
-                                                      dtype=torch.float32)
+                        embedding_data = torch.tensor(
+                            embedding_data, dtype=torch.float32
+                        )
 
                     if result.prompt_token_ids is None:
                         return self.create_error_response(
-                            "prompt_token_ids cannot be None for "
-                            "chunked processing")
+                            "prompt_token_ids cannot be None for chunked processing"
+                        )
                     weight = len(result.prompt_token_ids)
 
-                    weighted_embedding = embedding_data.to(
-                        dtype=torch.float32) * weight
+                    weighted_embedding = embedding_data.to(dtype=torch.float32) * weight
 
-                    if aggregator['weighted_sum'] is None:
+                    if aggregator["weighted_sum"] is None:
                         # First chunk
-                        aggregator['weighted_sum'] = weighted_embedding
+                        aggregator["weighted_sum"] = weighted_embedding
                     else:
                         # Accumulate
-                        aggregator['weighted_sum'] += weighted_embedding
+                        aggregator["weighted_sum"] += weighted_embedding
 
-                    aggregator['total_weight'] += weight
-                    aggregator['chunk_count'] += 1
+                    aggregator["total_weight"] += weight
+                    aggregator["chunk_count"] += 1
                 else:
                     # Non-chunked result - extract prompt_idx from request_id
                     parts = result.request_id.split("-")
@@ -496,11 +540,11 @@ class EmbeddingMixin(OpenAIServing):
                         prompt_idx = result_idx  # Fallback to result_idx
 
                     short_prompts_results[prompt_idx] = cast(
-                        PoolingRequestOutput, result)
+                        PoolingRequestOutput, result
+                    )
 
             # Finalize aggregated results
-            final_res_batch: list[Union[PoolingRequestOutput,
-                                        EmbeddingRequestOutput]] = []
+            final_res_batch: list[PoolingRequestOutput | EmbeddingRequestOutput] = []
             num_prompts = len(ctx.engine_prompts)
 
             for prompt_idx in range(num_prompts):
@@ -508,55 +552,58 @@ class EmbeddingMixin(OpenAIServing):
                     # Finalize MEAN aggregation for this chunked prompt
                     aggregator = prompt_aggregators[prompt_idx]
 
-                    weighted_sum = aggregator['weighted_sum']
-                    total_weight = aggregator['total_weight']
-
-                    if (weighted_sum is not None
-                            and isinstance(weighted_sum, torch.Tensor)
-                            and isinstance(total_weight,
-                                           (int, float)) and total_weight > 0):
+                    weighted_sum = aggregator["weighted_sum"]
+                    total_weight = aggregator["total_weight"]
 
+                    if (
+                        weighted_sum is not None
+                        and isinstance(weighted_sum, torch.Tensor)
+                        and isinstance(total_weight, (int, float))
+                        and total_weight > 0
+                    ):
                         # Compute final mean embedding
                         final_embedding = weighted_sum / total_weight
 
                         # Create a PoolingRequestOutput
                         # for the aggregated result
-                        pooling_output_data = PoolingOutput(
-                            data=final_embedding)
+                        pooling_output_data = PoolingOutput(data=final_embedding)
 
                         # Get original prompt token IDs for this prompt
                         original_prompt = ctx.engine_prompts[prompt_idx]
                         if not self._is_text_tokens_prompt(original_prompt):
                             return self.create_error_response(
-                                f"Chunked prompt {prompt_idx} is not a "
-                                f"TextTokensPrompt")
+                                f"Chunked prompt {prompt_idx} is not a TextTokensPrompt"
+                            )
 
-                        original_token_ids = cast(
-                            TextTokensPrompt,
-                            original_prompt)["prompt_token_ids"]
+                        original_token_ids = cast(TextTokensPrompt, original_prompt)[
+                            "prompt_token_ids"
+                        ]
 
                         pooling_request_output = PoolingRequestOutput(
-                            request_id=aggregator['request_id'],
+                            request_id=aggregator["request_id"],
                             prompt_token_ids=original_token_ids,
                             outputs=pooling_output_data,
-                            finished=True)
+                            num_cached_tokens=0,
+                            finished=True,
+                        )
 
                         final_res_batch.append(pooling_request_output)
                     else:
                         return self.create_error_response(
-                            f"Failed to aggregate chunks "
-                            f"for prompt {prompt_idx}")
+                            f"Failed to aggregate chunks for prompt {prompt_idx}"
+                        )
                 elif prompt_idx in short_prompts_results:
                     final_res_batch.append(
-                        cast(PoolingRequestOutput,
-                             short_prompts_results[prompt_idx]))
+                        cast(PoolingRequestOutput, short_prompts_results[prompt_idx])
+                    )
                 else:
                     return self.create_error_response(
-                        f"Result not found for prompt {prompt_idx}")
+                        f"Result not found for prompt {prompt_idx}"
+                    )
 
             ctx.final_res_batch = cast(
-                list[Union[RequestOutput, PoolingRequestOutput]],
-                final_res_batch)
+                list[RequestOutput | PoolingRequestOutput], final_res_batch
+            )
 
             return None
 
@@ -570,28 +617,30 @@ class OpenAIServingEmbedding(EmbeddingMixin):
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
 
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
 
     async def create_embedding(
         self,
         request: EmbeddingRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[EmbeddingResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> EmbeddingResponse | ErrorResponse:
         """
         Embedding API similar to OpenAI's API.
 
@@ -601,7 +650,8 @@ class OpenAIServingEmbedding(EmbeddingMixin):
         model_name = self.models.model_name()
         request_id = (
             f"{self.request_id_prefix}-"
-            f"{self._base_request_id(raw_request, request.request_id)}")
+            f"{self._base_request_id(raw_request, request.request_id)}"
+        )
 
         ctx = EmbeddingServeContext(
             request=request,
@@ -618,7 +668,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
     def _create_pooling_params(
         self,
         ctx: ServeContext[EmbeddingRequest],
-    ) -> Union[PoolingParams, ErrorResponse]:
+    ) -> PoolingParams | ErrorResponse:
         pooling_params = super()._create_pooling_params(ctx)
         if isinstance(pooling_params, ErrorResponse):
             return pooling_params
@@ -629,3 +679,17 @@ class OpenAIServingEmbedding(EmbeddingMixin):
             return self.create_error_response(str(e))
 
         return pooling_params
+
+    async def _preprocess(
+        self,
+        ctx: ServeContext,
+    ) -> ErrorResponse | None:
+        if isinstance(ctx.request, EmbeddingChatRequest):
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=ctx.request.chat_template,
+                chat_template_kwargs=ctx.request.chat_template_kwargs,
+                trust_request_chat_template=self.trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
+        return await super()._preprocess(ctx)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 4eb1f8b89d64fbb902174b3f58d18a1f864b9b0e..c50b0c4a23e17c398bb26e93c7aee5cd58300abf 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,17 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
 import json
 import sys
 import time
 import traceback
-from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence
+from collections.abc import AsyncGenerator, Callable, Iterable, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from http import HTTPStatus
-from typing import Any, Callable, ClassVar, Generic, Optional, TypeVar, Union
+from typing import Any, ClassVar, Generic, TypeAlias, TypeVar
 
 import torch
 from fastapi import Request
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, TypeAdapter
 from starlette.datastructures import Headers
 from typing_extensions import TypeIs
 
@@ -20,96 +21,135 @@ if sys.version_info >= (3, 12):
 else:
     from typing_extensions import TypedDict
 
+from openai.types.responses import (
+    ToolChoiceFunction,
+)
+
 import vllm.envs as envs
-from vllm.config import ModelConfig
+from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.engine.protocol import EngineClient
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         ChatTemplateContentFormatOption,
-                                         ConversationMessage,
-                                         apply_hf_chat_template,
-                                         apply_mistral_chat_template,
-                                         parse_chat_messages_futures,
-                                         resolve_chat_template_content_format)
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionMessageParam,
+    ChatTemplateContentFormatOption,
+    ConversationMessage,
+    apply_hf_chat_template,
+    apply_mistral_chat_template,
+    parse_chat_messages_futures,
+    resolve_chat_template_content_format,
+)
 from vllm.entrypoints.context import ConversationContext
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionResponse,
-                                              ClassificationRequest,
-                                              ClassificationResponse,
-                                              CompletionRequest,
-                                              CompletionResponse,
-                                              DetokenizeRequest,
-                                              EmbeddingChatRequest,
-                                              EmbeddingCompletionRequest,
-                                              EmbeddingRequest,
-                                              EmbeddingResponse, ErrorInfo,
-                                              ErrorResponse,
-                                              IOProcessorRequest,
-                                              PoolingResponse, RerankRequest,
-                                              ResponsesRequest, ScoreRequest,
-                                              ScoreResponse,
-                                              TokenizeChatRequest,
-                                              TokenizeCompletionRequest,
-                                              TokenizeResponse,
-                                              TranscriptionRequest,
-                                              TranscriptionResponse,
-                                              TranslationRequest)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ClassificationChatRequest,
+    ClassificationCompletionRequest,
+    ClassificationRequest,
+    ClassificationResponse,
+    CompletionRequest,
+    CompletionResponse,
+    DetokenizeRequest,
+    EmbeddingChatRequest,
+    EmbeddingCompletionRequest,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    ErrorInfo,
+    ErrorResponse,
+    FunctionCall,
+    FunctionDefinition,
+    GenerateRequest,
+    GenerateResponse,
+    IOProcessorRequest,
+    PoolingResponse,
+    RerankRequest,
+    ResponsesRequest,
+    ScoreRequest,
+    ScoreResponse,
+    TokenizeChatRequest,
+    TokenizeCompletionRequest,
+    TokenizeResponse,
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranslationRequest,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser
-from vllm.entrypoints.renderer import (BaseRenderer, CompletionRenderer,
-                                       RenderConfig)
-# yapf: enable
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
+from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
+from vllm.inputs.parse import (
+    PromptComponents,
+    get_prompt_components,
+    is_explicit_encoder_decoder_prompt,
+)
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob, PromptLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import (  # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
-    MultiModalDataDict, MultiModalUUIDDict)
-from vllm.outputs import PoolingRequestOutput, RequestOutput
+    MultiModalDataDict,
+    MultiModalUUIDDict,
+)
+from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.tracing import (contains_trace_headers, extract_trace_headers,
-                          log_tracing_disabled_warning)
+from vllm.tracing import (
+    contains_trace_headers,
+    extract_trace_headers,
+    log_tracing_disabled_warning,
+)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
-                        merge_async_iterators, random_uuid)
+from vllm.utils import random_uuid
+from vllm.utils.async_utils import (
+    AsyncMicrobatchTokenizer,
+    collect_from_async_generator,
+    make_async,
+    merge_async_iterators,
+)
+from vllm.utils.collection_utils import is_list_of
+from vllm.v1.engine import EngineCoreRequest
 
 logger = init_logger(__name__)
 
-CompletionLikeRequest = Union[
-    CompletionRequest,
-    DetokenizeRequest,
-    EmbeddingCompletionRequest,
-    RerankRequest,
-    ClassificationRequest,
-    ScoreRequest,
-    TokenizeCompletionRequest,
-]
-
-ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest,
-                        TokenizeChatRequest]
-SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest]
-AnyRequest = Union[
-    CompletionLikeRequest,
-    ChatLikeRequest,
-    SpeechToTextRequest,
-    ResponsesRequest,
-    IOProcessorRequest,
-]
-
-AnyResponse = Union[
-    CompletionResponse,
-    ChatCompletionResponse,
-    EmbeddingResponse,
-    TranscriptionResponse,
-    TokenizeResponse,
-    PoolingResponse,
-    ClassificationResponse,
-    ScoreResponse,
-]
+CompletionLikeRequest: TypeAlias = (
+    CompletionRequest
+    | DetokenizeRequest
+    | EmbeddingCompletionRequest
+    | RerankRequest
+    | ClassificationCompletionRequest
+    | ScoreRequest
+    | TokenizeCompletionRequest
+)
+
+ChatLikeRequest: TypeAlias = (
+    ChatCompletionRequest
+    | EmbeddingChatRequest
+    | TokenizeChatRequest
+    | ClassificationChatRequest
+)
+SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest
+AnyRequest: TypeAlias = (
+    CompletionLikeRequest
+    | ChatLikeRequest
+    | SpeechToTextRequest
+    | ResponsesRequest
+    | IOProcessorRequest
+    | GenerateRequest
+)
+
+AnyResponse: TypeAlias = (
+    CompletionResponse
+    | ChatCompletionResponse
+    | EmbeddingResponse
+    | TranscriptionResponse
+    | TokenizeResponse
+    | PoolingResponse
+    | ClassificationResponse
+    | ScoreResponse
+    | GenerateResponse
+)
 
 
 class TextTokensPrompt(TypedDict):
@@ -121,17 +161,23 @@ class EmbedsPrompt(TypedDict):
     prompt_embeds: torch.Tensor
 
 
-RequestPrompt = Union[list[int], str, TextTokensPrompt, EmbedsPrompt]
+RequestPrompt: TypeAlias = list[int] | str | TextTokensPrompt | EmbedsPrompt
 
 
 def is_text_tokens_prompt(prompt: RequestPrompt) -> TypeIs[TextTokensPrompt]:
-    return (isinstance(prompt, dict) and "prompt_token_ids" in prompt
-            and "prompt_embeds" not in prompt)
+    return (
+        isinstance(prompt, dict)
+        and "prompt_token_ids" in prompt
+        and "prompt_embeds" not in prompt
+    )
 
 
 def is_embeds_prompt(prompt: RequestPrompt) -> TypeIs[EmbedsPrompt]:
-    return (isinstance(prompt, dict) and "prompt_token_ids" not in prompt
-            and "prompt_embeds" in prompt)
+    return (
+        isinstance(prompt, dict)
+        and "prompt_token_ids" not in prompt
+        and "prompt_embeds" in prompt
+    )
 
 
 RequestT = TypeVar("RequestT", bound=AnyRequest)
@@ -143,8 +189,8 @@ class RequestProcessingMixin(BaseModel):
     handling prompt preparation and engine input.
     """
 
-    request_prompts: Optional[Sequence[RequestPrompt]] = []
-    engine_prompts: Optional[list[EngineTokensPrompt]] = []
+    request_prompts: Sequence[RequestPrompt] | None = []
+    engine_prompts: list[EngineTokensPrompt] | None = []
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -155,30 +201,32 @@ class ResponseGenerationMixin(BaseModel):
     managing result generators and final batch results.
     """
 
-    result_generator: Optional[AsyncGenerator[tuple[int, Union[
-        RequestOutput, PoolingRequestOutput]], None]] = None
-    final_res_batch: list[Union[RequestOutput, PoolingRequestOutput]] = Field(
-        default_factory=list)
+    result_generator: (
+        AsyncGenerator[tuple[int, RequestOutput | PoolingRequestOutput], None] | None
+    ) = None
+    final_res_batch: list[RequestOutput | PoolingRequestOutput] = Field(
+        default_factory=list
+    )
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
 
 class ServeContext(
-        RequestProcessingMixin,
-        ResponseGenerationMixin,
-        BaseModel,
-        Generic[RequestT],
+    RequestProcessingMixin,
+    ResponseGenerationMixin,
+    BaseModel,
+    Generic[RequestT],
 ):
     # Shared across all requests
     request: RequestT
-    raw_request: Optional[Request] = None
+    raw_request: Request | None = None
     model_name: str
     request_id: str
     created_time: int = Field(default_factory=lambda: int(time.time()))
-    lora_request: Optional[LoRARequest] = None
+    lora_request: LoRARequest | None = None
 
     # Shared across most requests
-    tokenizer: Optional[AnyTokenizer] = None
+    tokenizer: AnyTokenizer | None = None
 
     # `protected_namespaces` resolves Pydantic v2's warning
     # on conflict with protected namespace "model_"
@@ -192,7 +240,7 @@ ClassificationServeContext = ServeContext[ClassificationRequest]
 
 
 class EmbeddingServeContext(ServeContext[EmbeddingRequest]):
-    chat_template: Optional[str] = None
+    chat_template: str | None = None
     chat_template_content_format: ChatTemplateContentFormatOption
 
 
@@ -213,33 +261,258 @@ class OpenAIServing:
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
-        enable_force_include_usage: bool = False,
         log_error_stack: bool = False,
     ):
         super().__init__()
 
         self.engine_client = engine_client
-        self.model_config = model_config
-        self.max_model_len = model_config.max_model_len
 
         self.models = models
 
         self.request_logger = request_logger
         self.return_tokens_as_token_ids = return_tokens_as_token_ids
-        self.enable_force_include_usage = enable_force_include_usage
-
         self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
+        self._apply_mistral_chat_template_async = make_async(
+            apply_mistral_chat_template, executor=self._tokenizer_executor
+        )
 
-        self._async_tokenizer_pool: dict[AnyTokenizer,
-                                         AsyncMicrobatchTokenizer] = {}
+        self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {}
         self.log_error_stack = log_error_stack
 
-    def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer:
+        self.processor = self.models.processor
+        self.io_processor = self.models.io_processor
+        self.model_config = self.models.model_config
+        self.max_model_len = self.model_config.max_model_len
+
+    def _get_tool_parser(
+        self, tool_parser_name: str | None = None, enable_auto_tools: bool = False
+    ) -> Callable[[AnyTokenizer], ToolParser] | None:
+        """Get the tool parser based on the name."""
+        parser = None
+        if not enable_auto_tools or tool_parser_name is None:
+            return parser
+        logger.info(
+            '"auto" tool choice has been enabled please note that while'
+            " the parallel_tool_calls client option is preset for "
+            "compatibility reasons, it will be ignored."
+        )
+
+        try:
+            if tool_parser_name == "pythonic" and self.model_config.model.startswith(
+                "meta-llama/Llama-3.2"
+            ):
+                logger.warning(
+                    "Llama3.2 models may struggle to emit valid pythonic tool calls"
+                )
+            parser = ToolParserManager.get_tool_parser(tool_parser_name)
+        except Exception as e:
+            raise TypeError(
+                "Error: --enable-auto-tool-choice requires "
+                f"tool_parser:'{tool_parser_name}' which has not "
+                "been registered"
+            ) from e
+        return parser
+
+    def _get_reasoning_parser(
+        self,
+        reasoning_parser_name: str,
+    ) -> Callable[[AnyTokenizer], ReasoningParser] | None:
+        """Get the reasoning parser based on the name."""
+        parser = None
+        if not reasoning_parser_name:
+            return None
+        try:
+            parser = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name)
+            assert parser is not None
+        except Exception as e:
+            raise TypeError(f"{reasoning_parser_name=} has not been registered") from e
+        return parser
+
+    async def reset_mm_cache(self) -> None:
+        self.processor.clear_mm_cache()
+        await self.engine_client.reset_mm_cache()
+
+    async def beam_search(
+        self,
+        prompt: PromptType,
+        request_id: str,
+        params: BeamSearchParams,
+        lora_request: LoRARequest | None = None,
+    ) -> AsyncGenerator[RequestOutput, None]:
+        beam_width = params.beam_width
+        max_tokens = params.max_tokens
+        ignore_eos = params.ignore_eos
+        temperature = params.temperature
+        length_penalty = params.length_penalty
+        include_stop_str_in_output = params.include_stop_str_in_output
+
+        processor = self.processor
+        tokenizer = processor.tokenizer
+        if tokenizer is None:
+            raise ValueError(
+                "You cannot use beam search when `skip_tokenizer_init` is True"
+            )
+
+        eos_token_id: int = tokenizer.eos_token_id  # type: ignore
+
+        if is_explicit_encoder_decoder_prompt(prompt):
+            raise NotImplementedError
+
+        prompt_text: str | None
+        prompt_token_ids: list[int]
+        multi_modal_data: MultiModalDataDict | None
+        if isinstance(prompt, str):
+            prompt_text = prompt
+            prompt_token_ids = []
+            multi_modal_data = None
+        else:
+            prompt_text = prompt.get("prompt")  # type: ignore
+            prompt_token_ids = prompt.get("prompt_token_ids", [])  # type: ignore
+            multi_modal_data = prompt.get("multi_modal_data")  # type: ignore
+
+        mm_processor_kwargs: dict[str, Any] | None = None
+
+        # This is a workaround to fix multimodal beam search; this is a
+        # bandaid fix for 2 small problems:
+        # 1. Multi_modal_data on the processed_inputs currently resolves to
+        #    `None`.
+        # 2. preprocessing above expands the multimodal placeholders. However,
+        #    this happens again in generation, so the double expansion causes
+        #    a mismatch.
+        # TODO - would be ideal to handle this more gracefully.
+
+        tokenized_length = len(prompt_token_ids)
+
+        sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty)
+
+        beam_search_params = SamplingParams(
+            logprobs=2 * beam_width,
+            max_tokens=1,
+            temperature=temperature,
+        )
+        all_beams = [
+            BeamSearchSequence(
+                tokens=prompt_token_ids,
+                cum_logprob=0,
+                logprobs=[],
+                multi_modal_data=multi_modal_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+                lora_request=lora_request,
+            )
+        ]
+        completed = []
+
+        for _ in range(max_tokens):
+            prompts_batch, lora_req_batch = zip(
+                *[
+                    (
+                        EngineTokensPrompt(
+                            prompt_token_ids=beam.tokens,
+                            multi_modal_data=beam.multi_modal_data,
+                            mm_processor_kwargs=beam.mm_processor_kwargs,
+                        ),
+                        beam.lora_request,
+                    )
+                    for beam in all_beams
+                ]
+            )
+
+            tasks = []
+            request_id_batch = f"{request_id}-{random_uuid()}"
+
+            for i, (individual_prompt, lora_req) in enumerate(
+                zip(prompts_batch, lora_req_batch)
+            ):
+                request_id_item = f"{request_id_batch}-beam-{i}"
+                task = asyncio.create_task(
+                    collect_from_async_generator(
+                        self.engine_client.generate(
+                            individual_prompt,
+                            beam_search_params,
+                            request_id_item,
+                            lora_request=lora_req,
+                        )
+                    )
+                )
+                tasks.append(task)
+
+            output = [x[0] for x in await asyncio.gather(*tasks)]
+
+            new_beams = []
+            for i, current_beam in enumerate(all_beams):
+                result = output[i]
+
+                if result.outputs[0].logprobs is not None:
+                    logprobs = result.outputs[0].logprobs[0]
+                    for token_id, logprob_obj in logprobs.items():
+                        if token_id == eos_token_id and not ignore_eos:
+                            completed.append(
+                                BeamSearchSequence(
+                                    tokens=current_beam.tokens + [token_id]
+                                    if include_stop_str_in_output
+                                    else current_beam.tokens,
+                                    logprobs=current_beam.logprobs + [logprobs],
+                                    cum_logprob=current_beam.cum_logprob
+                                    + logprob_obj.logprob,
+                                    finish_reason="stop",
+                                    stop_reason=eos_token_id,
+                                )
+                            )
+                        else:
+                            new_beams.append(
+                                BeamSearchSequence(
+                                    tokens=current_beam.tokens + [token_id],
+                                    logprobs=current_beam.logprobs + [logprobs],
+                                    lora_request=current_beam.lora_request,
+                                    cum_logprob=current_beam.cum_logprob
+                                    + logprob_obj.logprob,
+                                    multi_modal_data=current_beam.multi_modal_data,
+                                    mm_processor_kwargs=current_beam.mm_processor_kwargs,
+                                )
+                            )
+
+            sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True)
+            all_beams = sorted_beams[:beam_width]
+
+        completed.extend(all_beams)
+        sorted_completed = sorted(completed, key=sort_beams_key, reverse=True)
+        best_beams = sorted_completed[:beam_width]
+
+        for beam in best_beams:
+            if beam.tokens[-1] == eos_token_id and not ignore_eos:
+                # Skip the eos token in the text.
+                tokens = beam.tokens[tokenized_length:-1]
+            else:
+                tokens = beam.tokens[tokenized_length:]
+            beam.text = tokenizer.decode(tokens)
+
+        yield RequestOutput(
+            request_id=request_id,
+            prompt=prompt_text,
+            outputs=[
+                CompletionOutput(
+                    text=beam.text,  # type: ignore
+                    cumulative_logprob=beam.cum_logprob,
+                    token_ids=beam.tokens[tokenized_length:],
+                    index=i,
+                    logprobs=beam.logprobs,
+                    finish_reason=beam.finish_reason
+                    if beam.finish_reason is not None
+                    else "length",
+                    stop_reason=beam.stop_reason,
+                )
+                for (i, beam) in enumerate(best_beams)
+            ],
+            finished=True,
+            prompt_token_ids=prompt_token_ids,
+            prompt_logprobs=None,
+        )
+
+    def _get_renderer(self, tokenizer: AnyTokenizer | None) -> BaseRenderer:
         """
         Get a Renderer instance with the provided tokenizer.
         Uses shared async tokenizer pool for efficiency.
@@ -247,7 +520,8 @@ class OpenAIServing:
         return CompletionRenderer(
             model_config=self.model_config,
             tokenizer=tokenizer,
-            async_tokenizer_pool=self._async_tokenizer_pool)
+            async_tokenizer_pool=self._async_tokenizer_pool,
+        )
 
     def _build_render_config(
         self,
@@ -276,7 +550,7 @@ class OpenAIServing:
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """
         Default preprocessing hook. Subclasses may override
         to prepare `ctx` (classification, embedding, etc.).
@@ -286,7 +560,7 @@ class OpenAIServing:
     def _build_response(
         self,
         ctx: ServeContext,
-    ) -> Union[AnyResponse, ErrorResponse]:
+    ) -> AnyResponse | ErrorResponse:
         """
         Default response builder. Subclass may override this method
         to return the appropriate response object.
@@ -296,8 +570,8 @@ class OpenAIServing:
     async def handle(
         self,
         ctx: ServeContext,
-    ) -> Union[AnyResponse, ErrorResponse]:
-        generation: AsyncGenerator[Union[AnyResponse, ErrorResponse], None]
+    ) -> AnyResponse | ErrorResponse:
+        generation: AsyncGenerator[AnyResponse | ErrorResponse, None]
         generation = self._pipeline(ctx)
 
         async for response in generation:
@@ -308,7 +582,7 @@ class OpenAIServing:
     async def _pipeline(
         self,
         ctx: ServeContext,
-    ) -> AsyncGenerator[Union[AnyResponse, ErrorResponse], None]:
+    ) -> AsyncGenerator[AnyResponse | ErrorResponse, None]:
         """Execute the request processing pipeline yielding responses."""
         if error := await self._check_model(ctx.request):
             yield error
@@ -329,48 +603,53 @@ class OpenAIServing:
 
         yield self._build_response(ctx)
 
-    def _validate_request(self, ctx: ServeContext) -> Optional[ErrorResponse]:
-        truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens",
-                                         None)
+    def _validate_request(self, ctx: ServeContext) -> ErrorResponse | None:
+        truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens", None)
 
-        if (truncate_prompt_tokens is not None
-                and truncate_prompt_tokens > self.max_model_len):
+        if (
+            truncate_prompt_tokens is not None
+            and truncate_prompt_tokens > self.max_model_len
+        ):
             return self.create_error_response(
                 "truncate_prompt_tokens value is "
                 "greater than max_model_len."
-                " Please, select a smaller truncation size.")
+                " Please, select a smaller truncation size."
+            )
         return None
 
     def _create_pooling_params(
         self,
         ctx: ServeContext,
-    ) -> Union[PoolingParams, ErrorResponse]:
+    ) -> PoolingParams | ErrorResponse:
         if not hasattr(ctx.request, "to_pooling_params"):
             return self.create_error_response(
-                "Request type does not support pooling parameters")
+                "Request type does not support pooling parameters"
+            )
 
         return ctx.request.to_pooling_params()
 
     async def _prepare_generators(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Schedule the request and get the result generator."""
-        generators: list[AsyncGenerator[Union[RequestOutput,
-                                              PoolingRequestOutput],
-                                        None]] = []
+        generators: list[
+            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
+        ] = []
 
         try:
-            trace_headers = (None if ctx.raw_request is None else await
-                             self._get_trace_headers(ctx.raw_request.headers))
+            trace_headers = (
+                None
+                if ctx.raw_request is None
+                else await self._get_trace_headers(ctx.raw_request.headers)
+            )
 
             pooling_params = self._create_pooling_params(ctx)
             if isinstance(pooling_params, ErrorResponse):
                 return pooling_params
 
             if ctx.engine_prompts is None:
-                return self.create_error_response(
-                    "Engine prompts not available")
+                return self.create_error_response("Engine prompts not available")
 
             for i, engine_prompt in enumerate(ctx.engine_prompts):
                 request_id_item = f"{ctx.request_id}-{i}"
@@ -404,32 +683,28 @@ class OpenAIServing:
     async def _collect_batch(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Collect batch results from the result generator."""
         try:
             if ctx.engine_prompts is None:
-                return self.create_error_response(
-                    "Engine prompts not available")
+                return self.create_error_response("Engine prompts not available")
 
             num_prompts = len(ctx.engine_prompts)
-            final_res_batch: list[Optional[Union[RequestOutput,
-                                                 PoolingRequestOutput]]]
+            final_res_batch: list[RequestOutput | PoolingRequestOutput | None]
             final_res_batch = [None] * num_prompts
 
             if ctx.result_generator is None:
-                return self.create_error_response(
-                    "Result generator not available")
+                return self.create_error_response("Result generator not available")
 
             async for i, res in ctx.result_generator:
                 final_res_batch[i] = res
 
             if None in final_res_batch:
                 return self.create_error_response(
-                    "Failed to generate results for all prompts")
+                    "Failed to generate results for all prompts"
+                )
 
-            ctx.final_res_batch = [
-                res for res in final_res_batch if res is not None
-            ]
+            ctx.final_res_batch = [res for res in final_res_batch if res is not None]
 
             return None
 
@@ -448,8 +723,9 @@ class OpenAIServing:
                 traceback.print_exc()
             else:
                 traceback.print_stack()
-        return ErrorResponse(error=ErrorInfo(
-            message=message, type=err_type, code=status_code.value))
+        return ErrorResponse(
+            error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+        )
 
     def create_streaming_error_response(
         self,
@@ -458,27 +734,33 @@ class OpenAIServing:
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
     ) -> str:
         json_str = json.dumps(
-            self.create_error_response(message=message,
-                                       err_type=err_type,
-                                       status_code=status_code).model_dump())
+            self.create_error_response(
+                message=message, err_type=err_type, status_code=status_code
+            ).model_dump()
+        )
         return json_str
 
     async def _check_model(
         self,
         request: AnyRequest,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         error_response = None
 
         if self._is_model_supported(request.model):
             return None
         if request.model in self.models.lora_requests:
             return None
-        if (envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and
-            (load_result := await self.models.resolve_lora(request.model))):
+        if (
+            envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING
+            and request.model
+            and (load_result := await self.models.resolve_lora(request.model))
+        ):
             if isinstance(load_result, LoRARequest):
                 return None
-            if (isinstance(load_result, ErrorResponse) and
-                    load_result.error.code == HTTPStatus.BAD_REQUEST.value):
+            if (
+                isinstance(load_result, ErrorResponse)
+                and load_result.error.code == HTTPStatus.BAD_REQUEST.value
+            ):
                 error_response = load_result
 
         return error_response or self.create_error_response(
@@ -487,8 +769,7 @@ class OpenAIServing:
             status_code=HTTPStatus.NOT_FOUND,
         )
 
-    def _get_active_default_mm_loras(
-            self, request: AnyRequest) -> Optional[LoRARequest]:
+    def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
         """Determine if there are any active default multimodal loras."""
         # TODO: Currently this is only enabled for chat completions
         # to be better aligned with only being enabled for .generate
@@ -515,7 +796,7 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         supports_default_mm_loras: bool = False,
-    ) -> Optional[LoRARequest]:
+    ) -> LoRARequest | None:
         if request.model in self.models.lora_requests:
             return self.models.lora_requests[request.model]
 
@@ -542,9 +823,16 @@ class OpenAIServing:
         if not hasattr(request, "messages"):
             return message_types
 
-        for message in request.messages:
-            if (isinstance(message, dict) and "content" in message
-                    and isinstance(message["content"], list)):
+        messages = request.messages
+        if messages is None or isinstance(messages, (str, bytes)):
+            return message_types
+
+        for message in messages:
+            if (
+                isinstance(message, dict)
+                and "content" in message
+                and isinstance(message["content"], list)
+            ):
                 for content_dict in message["content"]:
                     if "type" in content_dict:
                         message_types.add(content_dict["type"].split("_")[0])
@@ -559,17 +847,18 @@ class OpenAIServing:
     ) -> TextTokensPrompt:
         async_tokenizer = self._get_async_tokenizer(tokenizer)
 
-        if (self.model_config.encoder_config is not None
-                and self.model_config.encoder_config.get(
-                    "do_lower_case", False)):
+        if (
+            self.model_config.encoder_config is not None
+            and self.model_config.encoder_config.get("do_lower_case", False)
+        ):
             prompt = prompt.lower()
 
-        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
-                                         None)
+        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
 
         if truncate_prompt_tokens is None:
             encoded = await async_tokenizer(
-                prompt, add_special_tokens=add_special_tokens)
+                prompt, add_special_tokens=add_special_tokens
+            )
         elif truncate_prompt_tokens < 0:
             # Negative means we cap at the model's max length
             encoded = await async_tokenizer(
@@ -595,15 +884,14 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         prompt_ids: list[int],
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
     ) -> TextTokensPrompt:
-        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
-                                         None)
+        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
 
         if truncate_prompt_tokens is None:
             input_ids = prompt_ids
         elif truncate_prompt_tokens < 0:
-            input_ids = prompt_ids[-self.max_model_len:]
+            input_ids = prompt_ids[-self.max_model_len :]
         else:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
 
@@ -626,13 +914,14 @@ class OpenAIServing:
         # Note: EmbeddingRequest, ClassificationRequest,
         # and ScoreRequest doesn't have max_tokens
         if isinstance(
-                request,
+            request,
             (
                 EmbeddingChatRequest,
                 EmbeddingCompletionRequest,
                 ScoreRequest,
                 RerankRequest,
-                ClassificationRequest,
+                ClassificationCompletionRequest,
+                ClassificationChatRequest,
             ),
         ):
             # Note: input length can be up to the entire model context length
@@ -640,27 +929,25 @@ class OpenAIServing:
             if token_num > self.max_model_len:
                 operations: dict[type[AnyRequest], str] = {
                     ScoreRequest: "score",
-                    ClassificationRequest: "classification",
+                    ClassificationCompletionRequest: "classification",
+                    ClassificationChatRequest: "classification",
                 }
-                operation = operations.get(type(request),
-                                           "embedding generation")
+                operation = operations.get(type(request), "embedding generation")
                 raise ValueError(
                     f"This model's maximum context length is "
                     f"{self.max_model_len} tokens. However, you requested "
                     f"{token_num} tokens in the input for {operation}. "
-                    f"Please reduce the length of the input.")
-            return TextTokensPrompt(prompt=input_text,
-                                    prompt_token_ids=input_ids)
+                    f"Please reduce the length of the input."
+                )
+            return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
 
         # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
         # and does not require model context length validation
         if isinstance(
-                request,
-            (TokenizeCompletionRequest, TokenizeChatRequest,
-             DetokenizeRequest),
+            request,
+            (TokenizeCompletionRequest, TokenizeChatRequest, DetokenizeRequest),
         ):
-            return TextTokensPrompt(prompt=input_text,
-                                    prompt_token_ids=input_ids)
+            return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
 
         # chat completion endpoint supports max_completion_tokens
         if isinstance(request, ChatCompletionRequest):
@@ -676,16 +963,17 @@ class OpenAIServing:
                 f"This model's maximum context length is "
                 f"{self.max_model_len} tokens. However, your request has "
                 f"{token_num} input tokens. Please reduce the length of "
-                "the input messages.")
+                "the input messages."
+            )
 
-        if (max_tokens is not None
-                and token_num + max_tokens > self.max_model_len):
+        if max_tokens is not None and token_num + max_tokens > self.max_model_len:
             raise ValueError(
                 "'max_tokens' or 'max_completion_tokens' is too large: "
                 f"{max_tokens}. This model's maximum context length is "
                 f"{self.max_model_len} tokens and your request has "
                 f"{token_num} input tokens ({max_tokens} > {self.max_model_len}"
-                f" - {token_num}).")
+                f" - {token_num})."
+            )
 
         return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
 
@@ -693,17 +981,17 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         tokenizer: AnyTokenizer,
-        prompt_input: Union[str, list[int]],
+        prompt_input: str | list[int],
         add_special_tokens: bool = True,
     ) -> TextTokensPrompt:
         """
         A simpler implementation that tokenizes a single prompt input.
         """
         async for result in self._tokenize_prompt_inputs_async(
-                request,
-                tokenizer,
+            request,
+            tokenizer,
             [prompt_input],
-                add_special_tokens=add_special_tokens,
+            add_special_tokens=add_special_tokens,
         ):
             return result
         raise ValueError("No results yielded from tokenization")
@@ -712,7 +1000,7 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         tokenizer: AnyTokenizer,
-        prompt_inputs: Iterable[Union[str, list[int]]],
+        prompt_inputs: Iterable[str | list[int]],
         add_special_tokens: bool = True,
     ) -> AsyncGenerator[TextTokensPrompt, None]:
         """
@@ -733,24 +1021,44 @@ class OpenAIServing:
                     tokenizer=tokenizer,
                 )
 
+    def _validate_chat_template(
+        self,
+        request_chat_template: str | None,
+        chat_template_kwargs: dict[str, Any] | None,
+        trust_request_chat_template: bool,
+    ) -> ErrorResponse | None:
+        if not trust_request_chat_template and (
+            request_chat_template is not None
+            or (
+                chat_template_kwargs
+                and chat_template_kwargs.get("chat_template") is not None
+            )
+        ):
+            return self.create_error_response(
+                "Chat template is passed with request, but "
+                "--trust-request-chat-template is not set. "
+                "Refused request with untrusted chat template."
+            )
+        return None
+
     async def _preprocess_chat(
         self,
-        request: Union[ChatLikeRequest, ResponsesRequest],
+        request: ChatLikeRequest | ResponsesRequest,
         tokenizer: AnyTokenizer,
         messages: list[ChatCompletionMessageParam],
-        chat_template: Optional[str],
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
-        tool_dicts: Optional[list[dict[str, Any]]] = None,
-        documents: Optional[list[dict[str, str]]] = None,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
-        tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
+        tool_dicts: list[dict[str, Any]] | None = None,
+        documents: list[dict[str, str]] | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        tool_parser: Callable[[AnyTokenizer], ToolParser] | None = None,
         add_special_tokens: bool = False,
     ) -> tuple[
-            list[ConversationMessage],
-            Sequence[RequestPrompt],
-            list[EngineTokensPrompt],
+        list[ConversationMessage],
+        Sequence[RequestPrompt],
+        list[EngineTokensPrompt],
     ]:
         model_config = self.model_config
 
@@ -777,12 +1085,12 @@ class OpenAIServing:
         )
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
-        request_prompt: Union[str, list[int]]
+        request_prompt: str | list[int]
 
         if tokenizer is None:
             request_prompt = "placeholder"
         elif isinstance(tokenizer, MistralTokenizer):
-            request_prompt = apply_mistral_chat_template(
+            request_prompt = await self._apply_mistral_chat_template_async(
                 tokenizer,
                 messages=messages,
                 **_chat_template_kwargs,
@@ -800,24 +1108,27 @@ class OpenAIServing:
         # tool parsing is done only if a tool_parser has been set and if
         # tool_choice is not "none" (if tool_choice is "none" but a tool_parser
         # is set, we want to prevent parsing a tool_call hallucinated by the LLM
-        should_parse_tools = tool_parser is not None and (hasattr(
-            request, "tool_choice") and request.tool_choice != "none")
+        should_parse_tools = tool_parser is not None and (
+            hasattr(request, "tool_choice") and request.tool_choice != "none"
+        )
 
         if should_parse_tools:
-            if not isinstance(request, ChatCompletionRequest):
-                msg = "Tool usage is only supported for Chat Completions API"
+            if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
+                msg = (
+                    "Tool usage is only supported for Chat Completions API "
+                    "or Responses API requests."
+                )
                 raise NotImplementedError(msg)
-
-            request = tool_parser(tokenizer).adjust_request(  # type: ignore
-                request=request)
+            request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore
 
         if tokenizer is None:
             assert isinstance(request_prompt, str), (
                 "Prompt has to be a string",
                 "when the tokenizer is not initialised",
             )
-            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
-                                             prompt_token_ids=[1])
+            prompt_inputs = TextTokensPrompt(
+                prompt=request_prompt, prompt_token_ids=[1]
+            )
         elif isinstance(request_prompt, str):
             prompt_inputs = await self._tokenize_prompt_input_async(
                 request,
@@ -828,14 +1139,16 @@ class OpenAIServing:
         else:
             # For MistralTokenizer
             assert is_list_of(request_prompt, int), (
-                "Prompt has to be either a string or a list of token ids")
+                "Prompt has to be either a string or a list of token ids"
+            )
             prompt_inputs = TextTokensPrompt(
                 prompt=tokenizer.decode(request_prompt),
                 prompt_token_ids=request_prompt,
             )
 
         engine_prompt = EngineTokensPrompt(
-            prompt_token_ids=prompt_inputs["prompt_token_ids"])
+            prompt_token_ids=prompt_inputs["prompt_token_ids"]
+        )
         if mm_data is not None:
             engine_prompt["multi_modal_data"] = mm_data
 
@@ -850,6 +1163,33 @@ class OpenAIServing:
 
         return conversation, [request_prompt], [engine_prompt]
 
+    async def _process_inputs(
+        self,
+        request_id: str,
+        engine_prompt: PromptType,
+        params: SamplingParams | PoolingParams,
+        *,
+        lora_request: LoRARequest | None,
+        trace_headers: Mapping[str, str] | None,
+        priority: int,
+    ) -> tuple[EngineCoreRequest, dict[str, Any]]:
+        """Use the Processor to process inputs for AsyncLLM."""
+        tokenization_kwargs: dict[str, Any] = {}
+        _validate_truncation_size(
+            self.max_model_len, params.truncate_prompt_tokens, tokenization_kwargs
+        )
+
+        engine_request = self.processor.process_inputs(
+            request_id,
+            engine_prompt,
+            params,
+            lora_request=lora_request,
+            tokenization_kwargs=tokenization_kwargs,
+            trace_headers=trace_headers,
+            priority=priority,
+        )
+        return engine_request, tokenization_kwargs
+
     async def _generate_with_builtin_tools(
         self,
         request_id: str,
@@ -857,10 +1197,11 @@ class OpenAIServing:
         engine_prompt: EngineTokensPrompt,
         sampling_params: SamplingParams,
         context: ConversationContext,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: LoRARequest | None = None,
         priority: int = 0,
         **kwargs,
     ):
+        prompt_text, _, _ = self._get_prompt_components(request_prompt)
         orig_priority = priority
         while True:
             self._log_inputs(
@@ -869,14 +1210,27 @@ class OpenAIServing:
                 params=sampling_params,
                 lora_request=lora_request,
             )
-            generator = self.engine_client.generate(
+            trace_headers = kwargs.get("trace_headers")
+            engine_request, tokenization_kwargs = await self._process_inputs(
+                request_id,
                 engine_prompt,
                 sampling_params,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+                priority=priority,
+            )
+
+            generator = self.engine_client.generate(
+                engine_request,
+                sampling_params,
                 request_id,
                 lora_request=lora_request,
                 priority=priority,
+                prompt_text=prompt_text,
+                tokenization_kwargs=tokenization_kwargs,
                 **kwargs,
             )
+
             async for res in generator:
                 context.append_output(res)
                 # NOTE(woosuk): The stop condition is handled by the engine.
@@ -888,7 +1242,7 @@ class OpenAIServing:
 
             # Call the tool and update the context with the result.
             tool_output = await context.call_tool()
-            context.append_output(tool_output)
+            context.append_tool_output(tool_output)
 
             # TODO: uncomment this and enable tool output streaming
             # yield context
@@ -896,33 +1250,33 @@ class OpenAIServing:
             # Create inputs for the next turn.
             # Render the next prompt token ids.
             prompt_token_ids = context.render_for_completion()
-            engine_prompt = EngineTokensPrompt(
-                prompt_token_ids=prompt_token_ids)
+            engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
             request_prompt = prompt_token_ids
             # Update the sampling params.
-            sampling_params.max_tokens = self.max_model_len - len(
-                prompt_token_ids)
+            sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
             # OPTIMIZATION
             priority = orig_priority - 1
 
+    def _get_prompt_components(
+        self,
+        prompt: RequestPrompt | PromptType,
+    ) -> PromptComponents:
+        if isinstance(prompt, list):
+            return PromptComponents(token_ids=prompt)
+
+        return get_prompt_components(prompt)  # type: ignore[arg-type]
+
     def _log_inputs(
         self,
         request_id: str,
-        inputs: Union[RequestPrompt, PromptType],
-        params: Optional[Union[SamplingParams, PoolingParams,
-                               BeamSearchParams]],
-        lora_request: Optional[LoRARequest],
+        inputs: RequestPrompt | PromptType,
+        params: SamplingParams | PoolingParams | BeamSearchParams | None,
+        lora_request: LoRARequest | None,
     ) -> None:
         if self.request_logger is None:
             return
-        prompt, prompt_token_ids, prompt_embeds = None, None, None
-        if isinstance(inputs, str):
-            prompt = inputs
-        elif isinstance(inputs, list):
-            prompt_token_ids = inputs
-        else:
-            prompt = getattr(inputs, 'prompt', None)
-            prompt_token_ids = getattr(inputs, 'prompt_token_ids', None)
+
+        prompt, prompt_token_ids, prompt_embeds = self._get_prompt_components(inputs)
 
         self.request_logger.log_inputs(
             request_id,
@@ -936,7 +1290,7 @@ class OpenAIServing:
     async def _get_trace_headers(
         self,
         headers: Headers,
-    ) -> Optional[Mapping[str, str]]:
+    ) -> Mapping[str, str] | None:
         is_tracing_enabled = await self.engine_client.is_tracing_enabled()
 
         if is_tracing_enabled:
@@ -948,8 +1302,9 @@ class OpenAIServing:
         return None
 
     @staticmethod
-    def _base_request_id(raw_request: Optional[Request],
-                         default: Optional[str] = None) -> Optional[str]:
+    def _base_request_id(
+        raw_request: Request | None, default: str | None = None
+    ) -> str | None:
         """Pulls the request id to use from a header, if provided"""
         default = default or random_uuid()
         if raw_request is None:
@@ -957,6 +1312,92 @@ class OpenAIServing:
 
         return raw_request.headers.get("X-Request-Id", default)
 
+    @staticmethod
+    def _get_data_parallel_rank(raw_request: Request | None) -> int | None:
+        """Pulls the data parallel rank from a header, if provided"""
+        if raw_request is None:
+            return None
+
+        rank_str = raw_request.headers.get("X-data-parallel-rank")
+        if rank_str is None:
+            return None
+
+        try:
+            return int(rank_str)
+        except ValueError:
+            return None
+
+    @staticmethod
+    def _parse_tool_calls_from_content(
+        request: ResponsesRequest | ChatCompletionRequest,
+        tokenizer: AnyTokenizer,
+        enable_auto_tools: bool,
+        tool_parser_cls: Callable[[AnyTokenizer], ToolParser] | None,
+        content: str | None = None,
+    ) -> tuple[list[FunctionCall] | None, str | None]:
+        function_calls = list[FunctionCall]()
+        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
+            assert content is not None
+            # Forced Function Call
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.name, arguments=content)
+            )
+            content = None  # Clear content since tool is called.
+        elif request.tool_choice and isinstance(
+            request.tool_choice, ChatCompletionNamedToolChoiceParam
+        ):
+            assert content is not None
+            # Forced Function Call
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.function.name, arguments=content)
+            )
+            content = None  # Clear content since tool is called.
+        elif request.tool_choice == "required":
+            assert content is not None
+            tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
+            function_calls.extend(
+                [
+                    FunctionCall(
+                        name=tool_call.name,
+                        arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
+                    )
+                    for tool_call in tool_calls
+                ]
+            )
+            content = None  # Clear content since tool is called.
+        elif (
+            tool_parser_cls
+            and enable_auto_tools
+            and (request.tool_choice == "auto" or request.tool_choice is None)
+        ):
+            # Automatic Tool Call Parsing
+            try:
+                tool_parser = tool_parser_cls(tokenizer)
+            except RuntimeError as e:
+                logger.exception("Error in tool parser creation.")
+                raise e
+            tool_call_info = tool_parser.extract_tool_calls(
+                content if content is not None else "",
+                request=request,  # type: ignore
+            )
+            if tool_call_info is not None and tool_call_info.tools_called:
+                # extract_tool_calls() returns a list of tool calls.
+                function_calls.extend(
+                    FunctionCall(
+                        name=tool_call.function.name,
+                        arguments=tool_call.function.arguments,
+                    )
+                    for tool_call in tool_call_info.tool_calls
+                )
+                content = tool_call_info.content
+                if content and content.strip() == "":
+                    content = None
+            else:
+                # No tool calls.
+                return None, content
+
+        return function_calls, content
+
     @staticmethod
     def _get_decoded_token(
         logprob: Logprob,
@@ -971,15 +1412,15 @@ class OpenAIServing:
             return logprob.decoded_token
         return tokenizer.decode(token_id)
 
-    def _is_model_supported(self, model_name: Optional[str]) -> bool:
+    def _is_model_supported(self, model_name: str | None) -> bool:
         if not model_name:
             return True
         return self.models.is_base_model(model_name)
 
 
 def clamp_prompt_logprobs(
-    prompt_logprobs: Union[PromptLogprobs,
-                           None], ) -> Union[PromptLogprobs, None]:
+    prompt_logprobs: PromptLogprobs | None,
+) -> PromptLogprobs | None:
     if prompt_logprobs is None:
         return prompt_logprobs
 
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index a4efa0815b4e7f4d2a991a0b19f2cf1d02857338..24b9587010cad98148b33c65e3f0fdbc1624dd4b 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -5,19 +5,21 @@ from asyncio import Lock
 from collections import defaultdict
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Optional, Union
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (ErrorInfo, ErrorResponse,
-                                              LoadLoRAAdapterRequest,
-                                              ModelCard, ModelList,
-                                              ModelPermission,
-                                              UnloadLoRAAdapterRequest)
+from vllm.entrypoints.openai.protocol import (
+    ErrorInfo,
+    ErrorResponse,
+    LoadLoRAAdapterRequest,
+    ModelCard,
+    ModelList,
+    ModelPermission,
+    UnloadLoRAAdapterRequest,
+)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
-from vllm.utils import AtomicCounter
+from vllm.utils.counter import AtomicCounter
 
 logger = init_logger(__name__)
 
@@ -32,7 +34,7 @@ class BaseModelPath:
 class LoRAModulePath:
     name: str
     path: str
-    base_model_name: Optional[str] = None
+    base_model_name: str | None = None
 
 
 class OpenAIServingModels:
@@ -47,47 +49,50 @@ class OpenAIServingModels:
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         base_model_paths: list[BaseModelPath],
         *,
-        lora_modules: Optional[list[LoRAModulePath]] = None,
+        lora_modules: list[LoRAModulePath] | None = None,
     ):
         super().__init__()
 
-        self.base_model_paths = base_model_paths
-
-        self.max_model_len = model_config.max_model_len
         self.engine_client = engine_client
-        self.model_config = model_config
+        self.base_model_paths = base_model_paths
 
         self.static_lora_modules = lora_modules
         self.lora_requests: dict[str, LoRARequest] = {}
         self.lora_id_counter = AtomicCounter(0)
 
         self.lora_resolvers: list[LoRAResolver] = []
-        for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers(
-        ):
+        for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers():
             self.lora_resolvers.append(
-                LoRAResolverRegistry.get_resolver(lora_resolver_name))
+                LoRAResolverRegistry.get_resolver(lora_resolver_name)
+            )
         self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)
 
+        self.processor = self.engine_client.processor
+        self.io_processor = self.engine_client.io_processor
+        self.model_config = self.engine_client.model_config
+        self.max_model_len = self.model_config.max_model_len
+
     async def init_static_loras(self):
         """Loads all static LoRA modules.
         Raises if any fail to load"""
         if self.static_lora_modules is None:
             return
         for lora in self.static_lora_modules:
-            load_request = LoadLoRAAdapterRequest(lora_path=lora.path,
-                                                  lora_name=lora.name)
+            load_request = LoadLoRAAdapterRequest(
+                lora_path=lora.path, lora_name=lora.name
+            )
             load_result = await self.load_lora_adapter(
-                request=load_request, base_model_name=lora.base_model_name)
+                request=load_request, base_model_name=lora.base_model_name
+            )
             if isinstance(load_result, ErrorResponse):
                 raise ValueError(load_result.error.message)
 
     def is_base_model(self, model_name) -> bool:
         return any(model.name == model_name for model in self.base_model_paths)
 
-    def model_name(self, lora_request: Optional[LoRARequest] = None) -> str:
+    def model_name(self, lora_request: LoRARequest | None = None) -> str:
         """Returns the appropriate model name depending on the availability
         and support of the LoRA or base model.
         Parameters:
@@ -100,47 +105,48 @@ class OpenAIServingModels:
         return self.base_model_paths[0].name
 
     async def show_available_models(self) -> ModelList:
-        """Show available models. This includes the base model and all 
+        """Show available models. This includes the base model and all
         adapters"""
         model_cards = [
-            ModelCard(id=base_model.name,
-                      max_model_len=self.max_model_len,
-                      root=base_model.model_path,
-                      permission=[ModelPermission()])
+            ModelCard(
+                id=base_model.name,
+                max_model_len=self.max_model_len,
+                root=base_model.model_path,
+                permission=[ModelPermission()],
+            )
             for base_model in self.base_model_paths
         ]
         lora_cards = [
-            ModelCard(id=lora.lora_name,
-                      root=lora.local_path,
-                      parent=lora.base_model_name if lora.base_model_name else
-                      self.base_model_paths[0].name,
-                      permission=[ModelPermission()])
+            ModelCard(
+                id=lora.lora_name,
+                root=lora.local_path,
+                parent=lora.base_model_name
+                if lora.base_model_name
+                else self.base_model_paths[0].name,
+                permission=[ModelPermission()],
+            )
             for lora in self.lora_requests.values()
         ]
         model_cards.extend(lora_cards)
         return ModelList(data=model_cards)
 
     async def load_lora_adapter(
-            self,
-            request: LoadLoRAAdapterRequest,
-            base_model_name: Optional[str] = None
-    ) -> Union[ErrorResponse, str]:
+        self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
+    ) -> ErrorResponse | str:
         lora_name = request.lora_name
 
         # Ensure atomicity based on the lora name
         async with self.lora_resolver_lock[lora_name]:
-            error_check_ret = await self._check_load_lora_adapter_request(
-                request)
+            error_check_ret = await self._check_load_lora_adapter_request(request)
             if error_check_ret is not None:
                 return error_check_ret
 
             lora_path = request.lora_path
             unique_id = self.lora_id_counter.inc(1)
-            lora_request = LoRARequest(lora_name=lora_name,
-                                       lora_int_id=unique_id,
-                                       lora_path=lora_path)
-            if base_model_name is not None and self.is_base_model(
-                    base_model_name):
+            lora_request = LoRARequest(
+                lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
+            )
+            if base_model_name is not None and self.is_base_model(base_model_name):
                 lora_request.base_model_name = base_model_name
 
             # Validate that the adapter can be loaded into the engine
@@ -154,24 +160,24 @@ class OpenAIServingModels:
                     error_type = "NotFoundError"
                     status_code = HTTPStatus.NOT_FOUND
 
-                return create_error_response(message=str(e),
-                                             err_type=error_type,
-                                             status_code=status_code)
+                return create_error_response(
+                    message=str(e), err_type=error_type, status_code=status_code
+                )
 
             self.lora_requests[lora_name] = lora_request
-            logger.info("Loaded new LoRA adapter: name '%s', path '%s'",
-                        lora_name, lora_path)
+            logger.info(
+                "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
+            )
             return f"Success: LoRA adapter '{lora_name}' added successfully."
 
     async def unload_lora_adapter(
-            self,
-            request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]:
+        self, request: UnloadLoRAAdapterRequest
+    ) -> ErrorResponse | str:
         lora_name = request.lora_name
 
         # Ensure atomicity based on the lora name
         async with self.lora_resolver_lock[lora_name]:
-            error_check_ret = await self._check_unload_lora_adapter_request(
-                request)
+            error_check_ret = await self._check_unload_lora_adapter_request(request)
             if error_check_ret is not None:
                 return error_check_ret
 
@@ -181,48 +187,49 @@ class OpenAIServingModels:
             return f"Success: LoRA adapter '{lora_name}' removed successfully."
 
     async def _check_load_lora_adapter_request(
-            self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]:
+        self, request: LoadLoRAAdapterRequest
+    ) -> ErrorResponse | None:
         # Check if both 'lora_name' and 'lora_path' are provided
         if not request.lora_name or not request.lora_path:
             return create_error_response(
                 message="Both 'lora_name' and 'lora_path' must be provided.",
                 err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
 
         # Check if the lora adapter with the given name already exists
         if request.lora_name in self.lora_requests:
             return create_error_response(
-                message=
-                f"The lora adapter '{request.lora_name}' has already been "
+                message=f"The lora adapter '{request.lora_name}' has already been "
                 "loaded.",
                 err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
 
         return None
 
     async def _check_unload_lora_adapter_request(
-            self,
-            request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]:
+        self, request: UnloadLoRAAdapterRequest
+    ) -> ErrorResponse | None:
         # Check if 'lora_name' is not provided return an error
         if not request.lora_name:
             return create_error_response(
-                message=
-                "'lora_name' needs to be provided to unload a LoRA adapter.",
+                message="'lora_name' needs to be provided to unload a LoRA adapter.",
                 err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
 
         # Check if the lora adapter with the given name exists
         if request.lora_name not in self.lora_requests:
             return create_error_response(
-                message=
-                f"The lora adapter '{request.lora_name}' cannot be found.",
+                message=f"The lora adapter '{request.lora_name}' cannot be found.",
                 err_type="NotFoundError",
-                status_code=HTTPStatus.NOT_FOUND)
+                status_code=HTTPStatus.NOT_FOUND,
+            )
 
         return None
 
-    async def resolve_lora(
-            self, lora_name: str) -> Union[LoRARequest, ErrorResponse]:
+    async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
         """Attempt to resolve a LoRA adapter using available resolvers.
 
         Args:
@@ -244,8 +251,7 @@ class OpenAIServingModels:
 
             # Try to resolve using available resolvers
             for resolver in self.lora_resolvers:
-                lora_request = await resolver.resolve_lora(
-                    base_model_name, lora_name)
+                lora_request = await resolver.resolve_lora(base_model_name, lora_name)
 
                 if lora_request is not None:
                     found_adapter = True
@@ -256,33 +262,43 @@ class OpenAIServingModels:
                         self.lora_requests[lora_name] = lora_request
                         logger.info(
                             "Resolved and loaded LoRA adapter '%s' using %s",
-                            lora_name, resolver.__class__.__name__)
+                            lora_name,
+                            resolver.__class__.__name__,
+                        )
                         return lora_request
                     except BaseException as e:
                         logger.warning(
                             "Failed to load LoRA '%s' resolved by %s: %s. "
-                            "Trying next resolver.", lora_name,
-                            resolver.__class__.__name__, e)
+                            "Trying next resolver.",
+                            lora_name,
+                            resolver.__class__.__name__,
+                            e,
+                        )
                         continue
 
             if found_adapter:
                 # An adapter was found, but all attempts to load it failed.
                 return create_error_response(
-                    message=(f"LoRA adapter '{lora_name}' was found "
-                             "but could not be loaded."),
+                    message=(
+                        f"LoRA adapter '{lora_name}' was found but could not be loaded."
+                    ),
                     err_type="BadRequestError",
-                    status_code=HTTPStatus.BAD_REQUEST)
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
             else:
                 # No adapter was found
                 return create_error_response(
                     message=f"LoRA adapter {lora_name} does not exist",
                     err_type="NotFoundError",
-                    status_code=HTTPStatus.NOT_FOUND)
+                    status_code=HTTPStatus.NOT_FOUND,
+                )
 
 
 def create_error_response(
-        message: str,
-        err_type: str = "BadRequestError",
-        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
-    return ErrorResponse(error=ErrorInfo(
-        message=message, type=err_type, code=status_code.value))
+    message: str,
+    err_type: str = "BadRequestError",
+    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+) -> ErrorResponse:
+    return ErrorResponse(
+        error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+    )
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 0750c7ec3e9f15f971bf652d9f74485686eed193..ee4c5c8bacaae5a0415f6f3674bd367caf886c40 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -2,87 +2,79 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
-import base64
+import json
 import time
-from collections.abc import AsyncGenerator
-from typing import Final, Literal, Optional, Union, cast
+from collections.abc import AsyncGenerator, Sequence
+from typing import Final, cast
 
 import jinja2
-import numpy as np
-import torch
 from fastapi import Request
 from typing_extensions import assert_never
 
-from vllm.config import VllmConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              IOProcessorRequest,
-                                              IOProcessorResponse,
-                                              PoolingChatRequest,
-                                              PoolingCompletionRequest,
-                                              PoolingRequest, PoolingResponse,
-                                              PoolingResponseData, UsageInfo)
-# yapf: enable
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    IOProcessorRequest,
+    IOProcessorResponse,
+    PoolingBytesResponse,
+    PoolingChatRequest,
+    PoolingCompletionRequest,
+    PoolingRequest,
+    PoolingResponse,
+    PoolingResponseData,
+    UsageInfo,
+)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
-from vllm.outputs import PoolingOutput, PoolingRequestOutput
-from vllm.plugins.io_processors import get_io_processor
-from vllm.utils import merge_async_iterators
+from vllm.outputs import PoolingRequestOutput
+from vllm.tasks import PoolingTask, SupportedTask
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.serial_utils import (
+    EmbedDType,
+    EncodingFormat,
+    Endianness,
+    encode_pooling_bytes,
+    encode_pooling_output,
+)
 
 logger = init_logger(__name__)
 
 
-def _get_data(
-    output: PoolingOutput,
-    encoding_format: Literal["float", "base64"],
-) -> Union[list[float], str]:
-    if encoding_format == "float":
-        return output.data.tolist()
-    elif encoding_format == "base64":
-        # Force to use float32 for base64 encoding
-        # to match the OpenAI python client behavior
-        pt_float32 = output.data.to(dtype=torch.float32)
-        pooling_bytes = np.array(pt_float32, dtype="float32").tobytes()
-        return base64.b64encode(pooling_bytes).decode("utf-8")
-
-    assert_never(encoding_format)
-
-
 class OpenAIServingPooling(OpenAIServing):
-
     def __init__(
         self,
         engine_client: EngineClient,
-        vllm_config: VllmConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        supported_tasks: tuple[SupportedTask, ...],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
-        super().__init__(engine_client=engine_client,
-                         model_config=vllm_config.model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
 
+        self.supported_tasks = supported_tasks
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
-        io_processor_plugin = self.model_config.io_processor_plugin
-        self.io_processor = get_io_processor(vllm_config, io_processor_plugin)
+        self.trust_request_chat_template = trust_request_chat_template
 
     async def create_pooling(
         self,
         request: PoolingRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[PoolingResponse, IOProcessorResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> PoolingResponse | IOProcessorResponse | PoolingBytesResponse | ErrorResponse:
         """
         See https://platform.openai.com/docs/api-reference/embeddings/create
         for the API specification. This API mimics the OpenAI Embedding API.
@@ -108,12 +100,13 @@ class OpenAIServingPooling(OpenAIServing):
 
             if getattr(request, "dimensions", None) is not None:
                 return self.create_error_response(
-                    "dimensions is currently not supported")
+                    "dimensions is currently not supported"
+                )
 
-            truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
-                                             None)
+            truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
             truncate_prompt_tokens = _validate_truncation_size(
-                self.max_model_len, truncate_prompt_tokens)
+                self.max_model_len, truncate_prompt_tokens
+            )
 
             if is_io_processor_request:
                 if self.io_processor is None:
@@ -121,14 +114,27 @@ class OpenAIServingPooling(OpenAIServing):
                         "No IOProcessor plugin installed. Please refer "
                         "to the documentation and to the "
                         "'prithvi_geospatial_mae_io_processor' "
-                        "offline inference example for more details.")
+                        "offline inference example for more details."
+                    )
 
                 validated_prompt = self.io_processor.parse_request(request)
 
                 engine_prompts = await self.io_processor.pre_process_async(
-                    prompt=validated_prompt, request_id=request_id)
+                    prompt=validated_prompt, request_id=request_id
+                )
+                if not isinstance(engine_prompts, Sequence) or isinstance(
+                    engine_prompts, (str, bytes, bytearray)
+                ):
+                    engine_prompts = [engine_prompts]
 
             elif isinstance(request, PoolingChatRequest):
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=request.chat_template,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
+                )
+                if error_check_ret is not None:
+                    return error_check_ret
                 (
                     _,
                     _,
@@ -138,8 +144,7 @@ class OpenAIServingPooling(OpenAIServing):
                     tokenizer,
                     request.messages,
                     chat_template=request.chat_template or self.chat_template,
-                    chat_template_content_format=self.
-                    chat_template_content_format,
+                    chat_template_content_format=self.chat_template_content_format,
                     # In pooling requests, we are not generating tokens,
                     # so there is no need to append extra tokens to the input
                     add_generation_prompt=False,
@@ -152,8 +157,7 @@ class OpenAIServingPooling(OpenAIServing):
                     config=self._build_render_config(request),
                 )
             else:
-                raise ValueError(
-                    f"Unsupported request of type {type(request)}")
+                raise ValueError(f"Unsupported request of type {type(request)}")
         except (ValueError, TypeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
@@ -161,23 +165,55 @@ class OpenAIServingPooling(OpenAIServing):
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
         try:
-            pooling_params = request.to_pooling_params()
+            if is_io_processor_request:
+                assert self.io_processor is not None and isinstance(
+                    request, IOProcessorRequest
+                )
+                pooling_params = self.io_processor.validate_or_generate_params()
+            else:
+                pooling_params = request.to_pooling_params()
+
+            pooling_task: PoolingTask
+            if request.task is None:
+                if "token_embed" in self.supported_tasks:
+                    pooling_task = "token_embed"
+                elif "token_classify" in self.supported_tasks:
+                    pooling_task = "token_classify"
+                elif "plugin" in self.supported_tasks:
+                    pooling_task = "plugin"
+                else:
+                    return self.create_error_response(
+                        f"pooling_task must be one of {self.supported_tasks}."
+                    )
+            else:
+                pooling_task = request.task
+
+            if pooling_task not in self.supported_tasks:
+                return self.create_error_response(
+                    f"Task {pooling_task} is not supported, it"
+                    f" must be one of {self.supported_tasks}."
+                )
 
             try:
-                pooling_params.verify("encode", self.model_config)
+                pooling_params.verify(pooling_task, self.model_config)
             except ValueError as e:
                 return self.create_error_response(str(e))
 
             for i, engine_prompt in enumerate(engine_prompts):
                 request_id_item = f"{request_id}-{i}"
 
-                self._log_inputs(request_id_item,
-                                 engine_prompt,
-                                 params=pooling_params,
-                                 lora_request=lora_request)
+                self._log_inputs(
+                    request_id_item,
+                    engine_prompt,
+                    params=pooling_params,
+                    lora_request=lora_request,
+                )
 
-                trace_headers = (None if raw_request is None else await
-                                 self._get_trace_headers(raw_request.headers))
+                trace_headers = (
+                    None
+                    if raw_request is None
+                    else await self._get_trace_headers(raw_request.headers)
+                )
 
                 generator = self.engine_client.encode(
                     engine_prompt,
@@ -203,12 +239,11 @@ class OpenAIServingPooling(OpenAIServing):
             )
             return self.io_processor.output_to_response(output)
 
-        assert isinstance(request,
-                          (PoolingCompletionRequest, PoolingChatRequest))
+        assert isinstance(request, (PoolingCompletionRequest, PoolingChatRequest))
         num_prompts = len(engine_prompts)
 
         # Non-streaming response
-        final_res_batch: list[Optional[PoolingRequestOutput]]
+        final_res_batch: list[PoolingRequestOutput | None]
         final_res_batch = [None] * num_prompts
         try:
             async for i, res in result_generator:
@@ -216,8 +251,7 @@ class OpenAIServingPooling(OpenAIServing):
 
             assert all(final_res is not None for final_res in final_res_batch)
 
-            final_res_batch_checked = cast(list[PoolingRequestOutput],
-                                           final_res_batch)
+            final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch)
 
             response = self.request_output_to_pooling_response(
                 final_res_batch_checked,
@@ -225,6 +259,8 @@ class OpenAIServingPooling(OpenAIServing):
                 created_time,
                 model_name,
                 request.encoding_format,
+                request.embed_dtype,
+                request.endianness,
             )
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
@@ -240,37 +276,71 @@ class OpenAIServingPooling(OpenAIServing):
         request_id: str,
         created_time: int,
         model_name: str,
-        encoding_format: Literal["float", "base64"],
-    ) -> PoolingResponse:
-        items: list[PoolingResponseData] = []
-        num_prompt_tokens = 0
-
-        for idx, final_res in enumerate(final_res_batch):
-            item = PoolingResponseData(
-                index=idx,
-                data=_get_data(final_res.outputs, encoding_format),
+        encoding_format: EncodingFormat,
+        embed_dtype: EmbedDType,
+        endianness: Endianness,
+    ) -> PoolingResponse | PoolingBytesResponse:
+        def encode_float_base64():
+            items: list[PoolingResponseData] = []
+            num_prompt_tokens = 0
+
+            for idx, final_res in enumerate(final_res_batch):
+                item = PoolingResponseData(
+                    index=idx,
+                    data=encode_pooling_output(
+                        final_res,
+                        encoding_format=encoding_format,
+                        embed_dtype=embed_dtype,
+                        endianness=endianness,
+                    ),
+                )
+                prompt_token_ids = final_res.prompt_token_ids
+
+                items.append(item)
+                num_prompt_tokens += len(prompt_token_ids)
+
+            usage = UsageInfo(
+                prompt_tokens=num_prompt_tokens,
+                total_tokens=num_prompt_tokens,
             )
-            prompt_token_ids = final_res.prompt_token_ids
 
-            items.append(item)
-            num_prompt_tokens += len(prompt_token_ids)
+            return PoolingResponse(
+                id=request_id,
+                created=created_time,
+                model=model_name,
+                data=items,
+                usage=usage,
+            )
 
-        usage = UsageInfo(
-            prompt_tokens=num_prompt_tokens,
-            total_tokens=num_prompt_tokens,
-        )
+        def encode_bytes():
+            body, items, usage = encode_pooling_bytes(
+                pooling_outputs=final_res_batch,
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+            )
 
-        return PoolingResponse(
-            id=request_id,
-            created=created_time,
-            model=model_name,
-            data=items,
-            usage=usage,
-        )
+            metadata = {
+                "id": request_id,
+                "created": created_time,
+                "model": model_name,
+                "data": items,
+                "usage": usage,
+            }
+            return PoolingBytesResponse(
+                body=body,
+                metadata=json.dumps(metadata),
+            )
+
+        if encoding_format == "float" or encoding_format == "base64":
+            return encode_float_base64()
+        elif encoding_format == "bytes":
+            return encode_bytes()
+        else:
+            assert_never(encoding_format)
 
-    def _build_render_config(
-            self, request: PoolingCompletionRequest) -> RenderConfig:
+    def _build_render_config(self, request: PoolingCompletionRequest) -> RenderConfig:
         return RenderConfig(
             max_length=self.max_model_len,
             truncate_prompt_tokens=request.truncate_prompt_tokens,
-            add_special_tokens=request.add_special_tokens)
+            add_special_tokens=request.add_special_tokens,
+        )
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index c70baba88d433201e015b83ead4f34ff8398bd0a..06efb43ecb7b8495d92c6e1faaf092206e58ab46 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -6,74 +6,105 @@ import json
 import time
 import uuid
 from collections import deque
-from collections.abc import AsyncGenerator, AsyncIterator, Sequence
+from collections.abc import AsyncGenerator, AsyncIterator, Callable, Sequence
 from contextlib import AsyncExitStack
 from copy import copy
 from http import HTTPStatus
-from typing import Callable, Final, Optional, Union
+from typing import Final
 
 import jinja2
 from fastapi import Request
-# yapf conflicts with isort for this block
-# yapf: disable
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
     ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch,
-    ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText,
-    ResponseReasoningItem, ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent,
-    ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent,
-    response_function_web_search, response_text_delta_event)
-from openai.types.responses.response_output_text import (Logprob,
-                                                         LogprobTopLogprob)
-# yapf: enable
+    ResponseCodeInterpreterToolCallParam,
+    ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent,
+    ResponseFunctionCallArgumentsDeltaEvent,
+    ResponseFunctionCallArgumentsDoneEvent,
+    ResponseFunctionToolCall,
+    ResponseFunctionWebSearch,
+    ResponseOutputItem,
+    ResponseOutputItemAddedEvent,
+    ResponseOutputItemDoneEvent,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+    ResponseReasoningTextDeltaEvent,
+    ResponseReasoningTextDoneEvent,
+    ResponseStatus,
+    ResponseTextDeltaEvent,
+    ResponseTextDoneEvent,
+    ResponseWebSearchCallCompletedEvent,
+    ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent,
+    response_function_web_search,
+    response_text_delta_event,
+)
+from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
 from openai.types.responses.response_reasoning_item import (
-    Content as ResponseReasoningTextContent)
+    Content as ResponseReasoningTextContent,
+)
 from openai_harmony import Message as OpenAIHarmonyMessage
 
 from vllm import envs
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         ChatTemplateContentFormatOption)
-from vllm.entrypoints.context import (ConversationContext, HarmonyContext,
-                                      SimpleContext, StreamingHarmonyContext)
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionMessageParam,
+    ChatTemplateContentFormatOption,
+)
+from vllm.entrypoints.context import (
+    ConversationContext,
+    HarmonyContext,
+    SimpleContext,
+    StreamingHarmonyContext,
+)
 from vllm.entrypoints.harmony_utils import (
-    get_developer_message, get_stop_tokens_for_assistant_actions,
-    get_system_message, get_user_message, has_custom_tools,
-    parse_output_message, parse_remaining_state, parse_response_input,
-    render_for_completion)
+    construct_harmony_previous_input_messages,
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_system_message,
+    get_user_message,
+    has_custom_tools,
+    parse_output_message,
+    parse_remaining_state,
+    parse_response_input,
+    render_for_completion,
+)
 from vllm.entrypoints.logger import RequestLogger
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
-                                              InputTokensDetails,
-                                              OutputTokensDetails,
-                                              RequestResponseMetadata,
-                                              ResponseReasoningPartAddedEvent,
-                                              ResponseReasoningPartDoneEvent,
-                                              ResponsesRequest,
-                                              ResponsesResponse, ResponseUsage,
-                                              StreamingResponsesResponse)
-# yapf: enable
+from vllm.entrypoints.openai.protocol import (
+    DeltaMessage,
+    ErrorResponse,
+    InputTokensDetails,
+    OutputTokensDetails,
+    RequestResponseMetadata,
+    ResponseCompletedEvent,
+    ResponseCreatedEvent,
+    ResponseInProgressEvent,
+    ResponseReasoningPartAddedEvent,
+    ResponseReasoningPartDoneEvent,
+    ResponsesRequest,
+    ResponsesResponse,
+    ResponseUsage,
+    StreamingResponsesResponse,
+)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.responses_utils import (
+    construct_chat_message_with_tool_call,
+    convert_tool_responses_to_completions_format,
+    extract_tool_types,
+)
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob as SampleLogprob
 from vllm.logprobs import SampleLogprobs
 from vllm.outputs import CompletionOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
 
@@ -81,21 +112,19 @@ logger = init_logger(__name__)
 
 
 class OpenAIServingResponses(OpenAIServing):
-
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
         reasoning_parser: str = "",
         enable_auto_tools: bool = False,
-        tool_parser: Optional[str] = None,
-        tool_server: Optional[ToolServer] = None,
+        tool_parser: str | None = None,
+        tool_server: ToolServer | None = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
@@ -103,11 +132,9 @@ class OpenAIServingResponses(OpenAIServing):
     ) -> None:
         super().__init__(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=request_logger,
             return_tokens_as_token_ids=return_tokens_as_token_ids,
-            enable_force_include_usage=enable_force_include_usage,
             log_error_stack=log_error_stack,
         )
 
@@ -115,27 +142,20 @@ class OpenAIServingResponses(OpenAIServing):
         self.chat_template_content_format: Final = chat_template_content_format
         self.enable_log_outputs = enable_log_outputs
 
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer],
-                                                 ReasoningParser]] = None
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = (
-                    ReasoningParserManager.get_reasoning_parser(
-                        reasoning_parser))
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(
-                    f"{reasoning_parser=} has not been registered") from e
-
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
+        )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
-        self.default_sampling_params = (
-            self.model_config.get_diff_sampling_param())
+        self.default_sampling_params = self.model_config.get_diff_sampling_param()
         if self.default_sampling_params:
             source = self.model_config.generation_config
             source = "model" if source == "auto" else source
-            logger.info("Using default chat sampling params from %s: %s",
-                        source, self.default_sampling_params)
+            logger.info(
+                "Using default chat sampling params from %s: %s",
+                source,
+                self.default_sampling_params,
+            )
 
         # If False (default), the "store" option is (silently) ignored and the
         # response is not stored. If True, the response is stored in memory.
@@ -147,27 +167,28 @@ class OpenAIServingResponses(OpenAIServing):
             logger.warning_once(
                 "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
                 "cause a memory leak since we never remove responses from "
-                "the store.")
+                "the store."
+            )
 
-        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        self.use_harmony = self.model_config.hf_config.model_type == "gpt_oss"
         if self.use_harmony:
-            logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice "
-                           "and always enable tool use.")
+            logger.warning(
+                "For gpt-oss, we ignore --enable-auto-tool-choice "
+                "and always enable tool use."
+            )
             # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
             # We need to add them to the stop token ids.
             if "stop_token_ids" not in self.default_sampling_params:
                 self.default_sampling_params["stop_token_ids"] = []
             self.default_sampling_params["stop_token_ids"].extend(
-                get_stop_tokens_for_assistant_actions())
-
+                get_stop_tokens_for_assistant_actions()
+            )
+        self.enable_auto_tools = enable_auto_tools
         # set up tool use
-        self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                "\"auto\" tool choice has been enabled please note that while"
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored.")
-
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
+        )
+        self.exclude_tools_when_tool_choice_none = False
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
@@ -182,23 +203,78 @@ class OpenAIServingResponses(OpenAIServing):
         # HACK(wuhang): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove events from the store.
-        self.event_store: dict[str, tuple[deque[StreamingResponsesResponse],
-                                          asyncio.Event]] = {}
+        self.event_store: dict[
+            str, tuple[deque[StreamingResponsesResponse], asyncio.Event]
+        ] = {}
 
         self.background_tasks: dict[str, asyncio.Task] = {}
 
         self.tool_server = tool_server
 
+    def _validate_generator_input(
+        self, engine_prompt: EngineTokensPrompt
+    ) -> ErrorResponse | None:
+        """Add validations to the input to the generator here."""
+        if self.max_model_len <= len(engine_prompt["prompt_token_ids"]):
+            error_message = (
+                "The engine prompt length"
+                f" {len(engine_prompt['prompt_token_ids'])} "
+                f"exceeds the max_model_len {self.max_model_len}. "
+                "Please reduce prompt."
+            )
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message=error_message,
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        return None
+
+    def _validate_create_responses_input(
+        self, request: ResponsesRequest
+    ) -> ErrorResponse | None:
+        if self.use_harmony and request.is_include_output_logprobs():
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message="logprobs are not supported with gpt-oss models",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        if request.store and not self.enable_store and request.background:
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message=(
+                    "This vLLM engine does not support `store=True` and "
+                    "therefore does not support the background mode. To "
+                    "enable these features, set the environment variable "
+                    "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                    "the vLLM server."
+                ),
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        if request.previous_input_messages and request.previous_response_id:
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message="Only one of `previous_input_messages` and "
+                "`previous_response_id` can be set.",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        return None
+
     async def create_responses(
         self,
         request: ResponsesRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[AsyncGenerator[StreamingResponsesResponse, None],
-               ResponsesResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> (
+        AsyncGenerator[StreamingResponsesResponse, None]
+        | ResponsesResponse
+        | ErrorResponse
+    ):
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             logger.error("Error with model %s", error_check_ret)
             return error_check_ret
+        maybe_validation_error = self._validate_create_responses_input(request)
+        if maybe_validation_error is not None:
+            return maybe_validation_error
 
         # If the engine is dead, raise the engine's DEAD_ERROR.
         # This is required for the streaming case, where we return a
@@ -207,17 +283,6 @@ class OpenAIServingResponses(OpenAIServing):
             raise self.engine_client.dead_error
 
         if request.store and not self.enable_store:
-            if request.background:
-                return self.create_error_response(
-                    err_type="invalid_request_error",
-                    message=(
-                        "This vLLM engine does not support `store=True` and "
-                        "therefore does not support the background mode. To "
-                        "enable these features, set the environment variable "
-                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
-                        "the vLLM server."),
-                    status_code=HTTPStatus.BAD_REQUEST,
-                )
             # Disable the store option.
             # NOTE(woosuk): Although returning an error is possible, we opted
             # to implicitly disable store and process the request anyway, as
@@ -225,12 +290,6 @@ class OpenAIServingResponses(OpenAIServing):
             # (i.e., their request's `store=True` just because it's the default
             # value).
             request.store = False
-        if self.use_harmony and request.is_include_output_logprobs():
-            return self.create_error_response(
-                err_type="invalid_request_error",
-                message="logprobs are not supported with gpt-oss models",
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
@@ -249,19 +308,24 @@ class OpenAIServingResponses(OpenAIServing):
 
             if self.use_harmony:
                 messages, request_prompts, engine_prompts = (
-                    self._make_request_with_harmony(request, prev_response))
+                    self._make_request_with_harmony(request, prev_response)
+                )
             else:
-                messages, request_prompts, engine_prompts = (
-                    await self._make_request(request, prev_response,
-                                             tokenizer))
+                messages, request_prompts, engine_prompts = await self._make_request(
+                    request, prev_response, tokenizer
+                )
 
-        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError,
-                NotImplementedError) as e:
+        except (
+            ValueError,
+            TypeError,
+            RuntimeError,
+            jinja2.TemplateError,
+            NotImplementedError,
+        ) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
 
-        request_metadata = RequestResponseMetadata(
-            request_id=request.request_id)
+        request_metadata = RequestResponseMetadata(request_id=request.request_id)
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
@@ -284,23 +348,45 @@ class OpenAIServingResponses(OpenAIServing):
             available_tools = []
         try:
             for i, engine_prompt in enumerate(engine_prompts):
+                maybe_error = self._validate_generator_input(engine_prompt)
+                if maybe_error is not None:
+                    return maybe_error
+
                 default_max_tokens = self.max_model_len - len(
-                    engine_prompt["prompt_token_ids"])
+                    engine_prompt["prompt_token_ids"]
+                )
+
                 sampling_params = request.to_sampling_params(
-                    default_max_tokens, self.default_sampling_params)
+                    default_max_tokens, self.default_sampling_params
+                )
 
-                trace_headers = (None if raw_request is None else await
-                                 self._get_trace_headers(raw_request.headers))
+                trace_headers = (
+                    None
+                    if raw_request is None
+                    else await self._get_trace_headers(raw_request.headers)
+                )
 
                 context: ConversationContext
                 if self.use_harmony:
                     if request.stream:
-                        context = StreamingHarmonyContext(
-                            messages, available_tools)
+                        context = StreamingHarmonyContext(messages, available_tools)
                     else:
                         context = HarmonyContext(messages, available_tools)
                 else:
                     context = SimpleContext()
+
+                if self.reasoning_parser is not None:
+                    reasoning_parser = self.reasoning_parser(tokenizer)
+                    if sampling_params.structured_outputs is None:
+                        sampling_params.structured_outputs = StructuredOutputsParams()
+                    struct_out = sampling_params.structured_outputs
+                    if struct_out.all_non_structural_tag_constraints_none():
+                        sampling_params.structured_outputs.structural_tag = (
+                            reasoning_parser.prepare_structured_tag(
+                                sampling_params.structured_outputs.structural_tag,
+                                self.tool_server,
+                            )
+                        )
                 generator = self._generate_with_builtin_tools(
                     request_id=request.request_id,
                     request_prompt=request_prompts[i],
@@ -317,7 +403,7 @@ class OpenAIServingResponses(OpenAIServing):
             return self.create_error_response(str(e))
 
         assert len(generators) == 1
-        result_generator, = generators
+        (result_generator,) = generators
 
         # Store the input messages.
         if request.store:
@@ -371,11 +457,11 @@ class OpenAIServingResponses(OpenAIServing):
             response_id = response.id
             self.background_tasks[response_id] = task
             task.add_done_callback(
-                lambda _: self.background_tasks.pop(response_id, None))
+                lambda _: self.background_tasks.pop(response_id, None)
+            )
 
             if request.stream:
-                return self.responses_background_stream_generator(
-                    request.request_id)
+                return self.responses_background_stream_generator(request.request_id)
             return response
 
         if request.stream:
@@ -405,18 +491,26 @@ class OpenAIServingResponses(OpenAIServing):
     async def _make_request(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse],
+        prev_response: ResponsesResponse | None,
         tokenizer: AnyTokenizer,
     ):
-        if len(request.tools) > 0:
-            raise NotImplementedError(
-                "Tool use is not supported in Responses API without Harmony")
+        if request.tools is None or (
+            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
+        ):
+            tool_dicts = None
+        else:
+            tool_dicts = [
+                convert_tool_responses_to_completions_format(tool.model_dump())
+                for tool in request.tools
+            ]
         # Construct the input messages.
         messages = self._construct_input_messages(request, prev_response)
         _, request_prompts, engine_prompts = await self._preprocess_chat(
             request,
             tokenizer,
             messages,
+            tool_dicts=tool_dicts,
+            tool_parser=self.tool_parser,
             chat_template=self.chat_template,
             chat_template_content_format=self.chat_template_content_format,
         )
@@ -425,14 +519,13 @@ class OpenAIServingResponses(OpenAIServing):
     def _make_request_with_harmony(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse],
+        prev_response: ResponsesResponse | None,
     ):
         if request.tool_choice != "auto":
             raise NotImplementedError(
-                "Only 'auto' tool_choice is supported in "
-                "response API with Harmony")
-        messages = self._construct_input_messages_with_harmony(
-            request, prev_response)
+                "Only 'auto' tool_choice is supported in response API with Harmony"
+            )
+        messages = self._construct_input_messages_with_harmony(request, prev_response)
         prompt_token_ids = render_for_completion(messages)
         engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
 
@@ -442,6 +535,22 @@ class OpenAIServingResponses(OpenAIServing):
 
         return messages, [prompt_token_ids], [engine_prompt]
 
+    async def _initialize_tool_sessions(
+        self,
+        request: ResponsesRequest,
+        context: ConversationContext,
+        exit_stack: AsyncExitStack,
+    ):
+        # we should only initialize the tool session if the request needs tools
+        if len(request.tools) == 0:
+            return
+        mcp_tools = {
+            tool.server_label: tool for tool in request.tools if tool.type == "mcp"
+        }
+        await context.init_tool_sessions(
+            self.tool_server, exit_stack, request.request_id, mcp_tools
+        )
+
     async def responses_full_generator(
         self,
         request: ResponsesRequest,
@@ -451,19 +560,14 @@ class OpenAIServingResponses(OpenAIServing):
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-        created_time: Optional[int] = None,
-    ) -> Union[ErrorResponse, ResponsesResponse]:
+        created_time: int | None = None,
+    ) -> ErrorResponse | ResponsesResponse:
         if created_time is None:
             created_time = int(time.time())
 
         async with AsyncExitStack() as exit_stack:
             try:
-                mcp_tools = {
-                    tool.server_label: tool
-                    for tool in request.tools if tool.type == "mcp"
-                }
-                await context.init_tool_sessions(self.tool_server, exit_stack,
-                                                 request.request_id, mcp_tools)
+                await self._initialize_tool_sessions(request, context, exit_stack)
                 async for _ in result_generator:
                     pass
             except asyncio.CancelledError:
@@ -483,8 +587,8 @@ class OpenAIServingResponses(OpenAIServing):
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
             if request.enable_response_messages:
-                input_messages = context.messages[:context.num_init_messages]
-                output_messages = context.messages[context.num_init_messages:]
+                input_messages = context.messages[: context.num_init_messages]
+                output_messages = context.messages[context.num_init_messages :]
             num_tool_output_tokens = context.num_tool_output_tokens
             if len(output) > 0:
                 if context.finish_reason == "length":
@@ -500,15 +604,14 @@ class OpenAIServingResponses(OpenAIServing):
             assert len(final_res.outputs) == 1
             final_output = final_res.outputs[0]
 
-            output = self._make_response_output_items(request, final_output,
-                                                      tokenizer)
+            output = self._make_response_output_items(request, final_output, tokenizer)
 
             # TODO: context for non-gptoss models doesn't use messages
             # so we can't get them out yet
             if request.enable_response_messages:
                 raise NotImplementedError(
-                    "enable_response_messages is currently"
-                    " only supported for gpt-oss")
+                    "enable_response_messages is currently only supported for gpt-oss"
+                )
             # Calculate usage.
             assert final_res.prompt_token_ids is not None
             num_tool_output_tokens = 0
@@ -524,10 +627,24 @@ class OpenAIServingResponses(OpenAIServing):
             output_tokens=num_generated_tokens,
             total_tokens=num_prompt_tokens + num_generated_tokens,
             input_tokens_details=InputTokensDetails(
-                cached_tokens=num_cached_tokens),
+                cached_tokens=num_cached_tokens,
+                input_tokens_per_turn=[
+                    turn.input_tokens for turn in context.all_turn_metrics
+                ],
+                cached_tokens_per_turn=[
+                    turn.cached_input_tokens for turn in context.all_turn_metrics
+                ],
+            ),
             output_tokens_details=OutputTokensDetails(
                 reasoning_tokens=num_reasoning_tokens,
-                tool_output_tokens=num_tool_output_tokens),
+                tool_output_tokens=num_tool_output_tokens,
+                output_tokens_per_turn=[
+                    turn.output_tokens for turn in context.all_turn_metrics
+                ],
+                tool_output_tokens_per_turn=[
+                    turn.tool_output_tokens for turn in context.all_turn_metrics
+                ],
+            ),
         )
         response = ResponsesResponse.from_request(
             request,
@@ -545,76 +662,96 @@ class OpenAIServingResponses(OpenAIServing):
             async with self.response_store_lock:
                 stored_response = self.response_store.get(response.id)
                 # If the response is already cancelled, don't update it.
-                if (stored_response is None
-                        or stored_response.status != "cancelled"):
+                if stored_response is None or stored_response.status != "cancelled":
                     self.response_store[response.id] = response
         return response
 
-    def _topk_logprobs(self, logprobs: dict[int,
-                                            SampleLogprob], top_logprobs: int,
-                       tokenizer: AnyTokenizer) -> list[LogprobTopLogprob]:
+    def _topk_logprobs(
+        self,
+        logprobs: dict[int, SampleLogprob],
+        top_logprobs: int,
+        tokenizer: AnyTokenizer,
+    ) -> list[LogprobTopLogprob]:
         """Returns the top-k logprobs from the logprobs dictionary."""
         out = []
         for i, (token_id, _logprob) in enumerate(logprobs.items()):
             if i >= top_logprobs:
                 break
-            text = _logprob.decoded_token if _logprob.decoded_token \
-                is not None else tokenizer.decode([token_id])
+            text = (
+                _logprob.decoded_token
+                if _logprob.decoded_token is not None
+                else tokenizer.decode([token_id])
+            )
             out.append(
                 LogprobTopLogprob(
                     token=text,
                     logprob=max(_logprob.logprob, -9999.0),
                     bytes=list(text.encode("utf-8", errors="replace")),
-                ))
+                )
+            )
         return out
 
     def _create_response_logprobs(
-            self,
-            token_ids: Sequence[int],
-            logprobs: Optional[SampleLogprobs],
-            tokenizer: AnyTokenizer,
-            top_logprobs: Optional[int] = None) -> list[Logprob]:
+        self,
+        token_ids: Sequence[int],
+        logprobs: SampleLogprobs | None,
+        tokenizer: AnyTokenizer,
+        top_logprobs: int | None = None,
+    ) -> list[Logprob]:
         assert logprobs is not None, "logprobs must be provided"
         assert len(token_ids) == len(logprobs), (
-            "token_ids and logprobs.token_ids must have the same length")
+            "token_ids and logprobs.token_ids must have the same length"
+        )
         out = []
         for i, token_id in enumerate(token_ids):
             logprob = logprobs[i]
             token_logprob = logprob[token_id]
-            text = token_logprob.decoded_token if token_logprob.decoded_token \
-                is not None else tokenizer.decode([token_id])
+            text = (
+                token_logprob.decoded_token
+                if token_logprob.decoded_token is not None
+                else tokenizer.decode([token_id])
+            )
             out.append(
                 Logprob(
                     token=text,
                     logprob=max(token_logprob.logprob, -9999.0),
                     bytes=list(text.encode("utf-8", errors="replace")),
-                    top_logprobs=self._topk_logprobs(logprob,
-                                                     top_logprobs=top_logprobs,
-                                                     tokenizer=tokenizer)
-                    if top_logprobs else [],
-                ))
+                    top_logprobs=(
+                        self._topk_logprobs(
+                            logprob, top_logprobs=top_logprobs, tokenizer=tokenizer
+                        )
+                        if top_logprobs
+                        else []
+                    ),
+                )
+            )
         return out
 
     def _create_stream_response_logprobs(
         self,
         token_ids: Sequence[int],
-        logprobs: Optional[SampleLogprobs],
+        logprobs: SampleLogprobs | None,
         tokenizer: AnyTokenizer,
-        top_logprobs: Optional[int] = None
+        top_logprobs: int | None = None,
     ) -> list[response_text_delta_event.Logprob]:
-        lgs = self._create_response_logprobs(token_ids=token_ids,
-                                             logprobs=logprobs,
-                                             tokenizer=tokenizer,
-                                             top_logprobs=top_logprobs)
+        lgs = self._create_response_logprobs(
+            token_ids=token_ids,
+            logprobs=logprobs,
+            tokenizer=tokenizer,
+            top_logprobs=top_logprobs,
+        )
         return [
             response_text_delta_event.Logprob(
                 token=lg.token,
                 logprob=lg.logprob,
                 top_logprobs=[
                     response_text_delta_event.LogprobTopLogprob(
-                        token=tl.token, logprob=tl.logprob)
+                        token=tl.token, logprob=tl.logprob
+                    )
                     for tl in lg.top_logprobs
-                ]) for lg in lgs
+                ],
+            )
+            for lg in lgs
         ]
 
     def _make_response_output_items(
@@ -630,11 +767,11 @@ class OpenAIServingResponses(OpenAIServing):
                 logger.exception("Error in reasoning parser creation.")
                 raise e
 
-            reasoning_content, content = (
-                reasoning_parser.extract_reasoning_content(final_output.text,
-                                                           request=request))
+            reasoning, content = reasoning_parser.extract_reasoning(
+                final_output.text, request=request
+            )
         else:
-            reasoning_content = None
+            reasoning = None
             content = final_output.text
 
         # Log complete response if output logging is enabled
@@ -642,8 +779,8 @@ class OpenAIServingResponses(OpenAIServing):
             output_text = ""
             if content:
                 output_text = content
-            elif reasoning_content:
-                output_text = f"[reasoning: {reasoning_content}]"
+            elif reasoning:
+                output_text = f"[reasoning: {reasoning}]"
 
             if output_text:
                 self.request_logger.log_outputs(
@@ -655,40 +792,68 @@ class OpenAIServingResponses(OpenAIServing):
                     delta=False,
                 )
 
-        output = []
-        if reasoning_content:
+        reasoning_item = None
+        message_item = None
+        if reasoning:
             reasoning_item = ResponseReasoningItem(
                 id=f"rs_{random_uuid()}",
                 summary=[],
                 type="reasoning",
                 content=[
-                    ResponseReasoningTextContent(text=reasoning_content,
-                                                 type="reasoning_text")
+                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
                 ],
                 status=None,  # NOTE: Only the last output item has status.
             )
-            output.append(reasoning_item)
+        tool_calls, content = self._parse_tool_calls_from_content(
+            request=request,
+            tokenizer=tokenizer,
+            content=content,
+            enable_auto_tools=self.enable_auto_tools,
+            tool_parser_cls=self.tool_parser,
+        )
         if content:
             output_text = ResponseOutputText(
                 text=content,
                 annotations=[],  # TODO
                 type="output_text",
-                logprobs=self._create_response_logprobs(
-                    token_ids=final_output.token_ids,
-                    logprobs=final_output.logprobs,
-                    tokenizer=tokenizer,
-                    top_logprobs=request.top_logprobs,
-                ) if request.is_include_output_logprobs() else None,
+                logprobs=(
+                    self._create_response_logprobs(
+                        token_ids=final_output.token_ids,
+                        logprobs=final_output.logprobs,
+                        tokenizer=tokenizer,
+                        top_logprobs=request.top_logprobs,
+                    )
+                    if request.is_include_output_logprobs()
+                    else None
+                ),
             )
-            message = ResponseOutputMessage(
+            message_item = ResponseOutputMessage(
                 id=f"msg_{random_uuid()}",
                 content=[output_text],
                 role="assistant",
                 status="completed",
                 type="message",
             )
-            output.append(message)
-        return output
+        outputs = []
+
+        if reasoning_item:
+            outputs.append(reasoning_item)
+        if message_item:
+            outputs.append(message_item)
+        if tool_calls:
+            tool_call_items = [
+                ResponseFunctionToolCall(
+                    id=f"fc_{random_uuid()}",
+                    call_id=f"call_{random_uuid()}",
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                )
+                for tool_call in tool_calls
+            ]
+            outputs.extend(tool_call_items)
+        return outputs
 
     def _make_response_output_items_with_harmony(
         self,
@@ -707,14 +872,16 @@ class OpenAIServingResponses(OpenAIServing):
     def _construct_input_messages(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse] = None,
+        prev_response: ResponsesResponse | None = None,
     ) -> list[ChatCompletionMessageParam]:
         messages: list[ChatCompletionMessageParam] = []
         if request.instructions:
-            messages.append({
-                "role": "system",
-                "content": request.instructions,
-            })
+            messages.append(
+                {
+                    "role": "system",
+                    "content": request.instructions,
+                }
+            )
 
         # Prepend the conversation history.
         if prev_response is not None:
@@ -727,68 +894,85 @@ class OpenAIServingResponses(OpenAIServing):
                 # NOTE: We skip the reasoning output.
                 if isinstance(output_item, ResponseOutputMessage):
                     for content in output_item.content:
-                        messages.append({
-                            "role": "assistant",
-                            "content": content.text,
-                        })
+                        messages.append(
+                            {
+                                "role": "assistant",
+                                "content": content.text,
+                            }
+                        )
 
         # Append the new input.
         # Responses API supports simple text inputs without chat format.
         if isinstance(request.input, str):
             messages.append({"role": "user", "content": request.input})
         else:
-            messages.extend(request.input)  # type: ignore
+            for item in request.input:
+                messages.append(construct_chat_message_with_tool_call(item))
         return messages
 
+    def _construct_harmony_system_input_message(
+        self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str]
+    ) -> OpenAIHarmonyMessage:
+        reasoning_effort = request.reasoning.effort if request.reasoning else None
+        enable_browser = (
+            "web_search_preview" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("browser")
+        )
+        enable_code_interpreter = (
+            "code_interpreter" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("python")
+        )
+        enable_container = (
+            "container" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("container")
+        )
+        sys_msg = get_system_message(
+            reasoning_effort=reasoning_effort,
+            browser_description=(
+                self.tool_server.get_tool_description("browser")
+                if enable_browser and self.tool_server is not None
+                else None
+            ),
+            python_description=(
+                self.tool_server.get_tool_description("python")
+                if enable_code_interpreter and self.tool_server is not None
+                else None
+            ),
+            container_description=(
+                self.tool_server.get_tool_description("container")
+                if enable_container and self.tool_server is not None
+                else None
+            ),
+            instructions=request.instructions,
+            with_custom_tools=with_custom_tools,
+        )
+        return sys_msg
+
     def _construct_input_messages_with_harmony(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse],
+        prev_response: ResponsesResponse | None,
     ) -> list[OpenAIHarmonyMessage]:
         messages: list[OpenAIHarmonyMessage] = []
         if prev_response is None:
             # New conversation.
-            reasoning_effort = (request.reasoning.effort
-                                if request.reasoning else None)
-            tool_types = [tool.type for tool in request.tools]
-
-            # Allow the MCP Tool type to enable built in tools if the
-            # server_label is allowlisted in
-            # envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
-            if envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS:
-                for tool in request.tools:
-                    if (tool.type == "mcp" and tool.server_label
-                            in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS):
-                        tool_types.append(tool.server_label)
-            enable_browser = ("web_search_preview" in tool_types
-                              and self.tool_server is not None
-                              and self.tool_server.has_tool("browser"))
-            enable_code_interpreter = ("code_interpreter" in tool_types
-                                       and self.tool_server is not None
-                                       and self.tool_server.has_tool("python"))
-            enable_container = ("container" in tool_types
-                                and self.tool_server is not None
-                                and self.tool_server.has_tool("container"))
+            tool_types = extract_tool_types(request.tools)
             with_custom_tools = has_custom_tools(tool_types)
-            sys_msg = get_system_message(
-                reasoning_effort=reasoning_effort,
-                browser_description=self.tool_server.get_tool_description(
-                    "browser")
-                if enable_browser and self.tool_server is not None else None,
-                python_description=self.tool_server.get_tool_description(
-                    "python") if enable_code_interpreter
-                and self.tool_server is not None else None,
-                container_description=self.tool_server.get_tool_description(
-                    "container")
-                if enable_container and self.tool_server is not None else None,
-                instructions=request.instructions,
-                with_custom_tools=with_custom_tools,
+
+            sys_msg = self._construct_harmony_system_input_message(
+                request, with_custom_tools, tool_types
             )
             messages.append(sys_msg)
             if with_custom_tools:
                 dev_msg = get_developer_message(
-                    instructions=request.instructions, tools=request.tools)
+                    instructions=request.instructions, tools=request.tools
+                )
                 messages.append(dev_msg)
+            messages += construct_harmony_previous_input_messages(request)
+
         else:
             # Continue the previous conversation.
             # FIXME(woosuk): Currently, request params like reasoning and
@@ -808,8 +992,8 @@ class OpenAIServingResponses(OpenAIServing):
                         if prev_msg_i.channel == "final":
                             prev_final_msg_idx = i
                             break
-                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:]
-                    del prev_msgs[prev_final_msg_idx + 1:]
+                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
+                    del prev_msgs[prev_final_msg_idx + 1 :]
                     for msg in recent_turn_msgs:
                         assert isinstance(msg, OpenAIHarmonyMessage)
                         if msg.channel != "analysis":
@@ -825,8 +1009,7 @@ class OpenAIServingResponses(OpenAIServing):
             else:
                 prev_outputs = []
             for response_msg in request.input:
-                messages.append(
-                    parse_response_input(response_msg, prev_outputs))
+                messages.append(parse_response_input(response_msg, prev_outputs))
                 # User passes in a tool call request and its output. We need
                 # to add the tool call request to prev_outputs so that the
                 # parse_response_input can find the tool call request when
@@ -846,14 +1029,12 @@ class OpenAIServingResponses(OpenAIServing):
         self.event_store[request.request_id] = (event_deque, new_event_signal)
         response = None
         try:
-            generator = self.responses_stream_generator(
-                request, *args, **kwargs)
+            generator = self.responses_stream_generator(request, *args, **kwargs)
             async for event in generator:
                 event_deque.append(event)
                 new_event_signal.set()  # Signal new event available
         except Exception as e:
-            logger.exception("Background request failed for %s",
-                             request.request_id)
+            logger.exception("Background request failed for %s", request.request_id)
             response = self.create_error_response(str(e))
         finally:
             new_event_signal.set()
@@ -874,11 +1055,9 @@ class OpenAIServingResponses(OpenAIServing):
         **kwargs,
     ):
         try:
-            response = await self.responses_full_generator(
-                request, *args, **kwargs)
+            response = await self.responses_full_generator(request, *args, **kwargs)
         except Exception as e:
-            logger.exception("Background request failed for %s",
-                             request.request_id)
+            logger.exception("Background request failed for %s", request.request_id)
             response = self.create_error_response(str(e))
 
         if isinstance(response, ErrorResponse):
@@ -893,7 +1072,7 @@ class OpenAIServingResponses(OpenAIServing):
     async def responses_background_stream_generator(
         self,
         response_id: str,
-        starting_after: Optional[int] = None,
+        starting_after: int | None = None,
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         if response_id not in self.event_store:
             raise ValueError(f"Unknown response_id: {response_id}")
@@ -909,7 +1088,7 @@ class OpenAIServingResponses(OpenAIServing):
             while current_index < len(event_deque):
                 event = event_deque[current_index]
                 yield event
-                if getattr(event, 'type', 'unknown') == "response.completed":
+                if getattr(event, "type", "unknown") == "response.completed":
                     return
                 current_index += 1
 
@@ -918,10 +1097,13 @@ class OpenAIServingResponses(OpenAIServing):
     async def retrieve_responses(
         self,
         response_id: str,
-        starting_after: Optional[int],
-        stream: Optional[bool],
-    ) -> Union[ErrorResponse, ResponsesResponse, AsyncGenerator[
-            StreamingResponsesResponse, None]]:
+        starting_after: int | None,
+        stream: bool | None,
+    ) -> (
+        ErrorResponse
+        | ResponsesResponse
+        | AsyncGenerator[StreamingResponsesResponse, None]
+    ):
         async with self.response_store_lock:
             response = self.response_store.get(response_id)
 
@@ -938,7 +1120,7 @@ class OpenAIServingResponses(OpenAIServing):
     async def cancel_responses(
         self,
         response_id: str,
-    ) -> Union[ErrorResponse, ResponsesResponse]:
+    ) -> ErrorResponse | ResponsesResponse:
         async with self.response_store_lock:
             response = self.response_store.get(response_id)
             if response is None:
@@ -955,13 +1137,12 @@ class OpenAIServingResponses(OpenAIServing):
             response.status = "cancelled"
 
         # Abort the request.
-        if (task := self.background_tasks.get(response_id)):
+        if task := self.background_tasks.get(response_id):
             task.cancel()
             try:
                 await task
             except asyncio.CancelledError:
-                logger.exception("Background task for %s was cancelled",
-                                 response_id)
+                logger.exception("Background task for %s was cancelled", response_id)
         return response
 
     def _make_not_found_error(self, response_id: str) -> ErrorResponse:
@@ -974,10 +1155,12 @@ class OpenAIServingResponses(OpenAIServing):
     def _make_store_not_supported_error(self) -> ErrorResponse:
         return self.create_error_response(
             err_type="invalid_request_error",
-            message=("`store=True` (default) is not supported. Please set "
-                     "`store=False` in Responses API or set "
-                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
-                     "starting the vLLM server."),
+            message=(
+                "`store=True` (default) is not supported. Please set "
+                "`store=False` in Responses API or set "
+                "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                "starting the vLLM server."
+            ),
             status_code=HTTPStatus.BAD_REQUEST,
         )
 
@@ -985,14 +1168,15 @@ class OpenAIServingResponses(OpenAIServing):
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
-        result_generator: AsyncIterator[Optional[ConversationContext]],
+        result_generator: AsyncIterator[ConversationContext | None],
         context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
         created_time: int,
         _increment_sequence_number_and_return: Callable[
-            [StreamingResponsesResponse], StreamingResponsesResponse],
+            [StreamingResponsesResponse], StreamingResponsesResponse
+        ],
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         current_content_index = 0
         current_output_index = 0
@@ -1011,25 +1195,25 @@ class OpenAIServingResponses(OpenAIServing):
             if ctx.last_output.outputs:
                 output = ctx.last_output.outputs[0]
                 if reasoning_parser:
-                    delta_message = \
-                        reasoning_parser.extract_reasoning_content_streaming(
+                    delta_message = reasoning_parser.extract_reasoning_streaming(
                         previous_text=previous_text,
                         current_text=previous_text + output.text,
                         delta_text=output.text,
                         previous_token_ids=previous_token_ids,
-                        current_token_ids=previous_token_ids +
-                        output.token_ids,
+                        current_token_ids=previous_token_ids + output.token_ids,
                         delta_token_ids=output.token_ids,
                     )
                 else:
-                    delta_message = DeltaMessage(content=output.text, )
+                    delta_message = DeltaMessage(
+                        content=output.text,
+                    )
                 previous_text += output.text
                 previous_token_ids += output.token_ids
                 if not delta_message:
                     continue
                 if not first_delta_sent:
                     current_item_id = str(uuid.uuid4())
-                    if delta_message.reasoning_content:
+                    if delta_message.reasoning:
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemAddedEvent(
                                 type="response.output_item.added",
@@ -1041,7 +1225,8 @@ class OpenAIServingResponses(OpenAIServing):
                                     summary=[],
                                     status="in_progress",
                                 ),
-                            ))
+                            )
+                        )
                     else:
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemAddedEvent(
@@ -1055,7 +1240,8 @@ class OpenAIServingResponses(OpenAIServing):
                                     content=[],
                                     status="in_progress",
                                 ),
-                            ))
+                            )
+                        )
                     yield _increment_sequence_number_and_return(
                         ResponseContentPartAddedEvent(
                             type="response.content_part.added",
@@ -1069,21 +1255,26 @@ class OpenAIServingResponses(OpenAIServing):
                                 annotations=[],
                                 logprobs=[],
                             ),
-                        ))
+                        )
+                    )
                     current_content_index += 1
                     first_delta_sent = True
                 # todo(kebe7jun) tool call support
 
                 # check delta message and previous delta message are
                 # same as content or reasoning content
-                if (previous_delta_messages
-                        and previous_delta_messages[-1].reasoning_content
-                        is not None and delta_message.content is not None):
+                if (
+                    previous_delta_messages
+                    and previous_delta_messages[-1].reasoning is not None
+                    and delta_message.content is not None
+                ):
                     # from reasoning to normal content, send done
                     # event for reasoning
-                    reason_content = ''.join(
-                        pm.reasoning_content for pm in previous_delta_messages
-                        if pm.reasoning_content is not None)
+                    reason_content = "".join(
+                        pm.reasoning
+                        for pm in previous_delta_messages
+                        if pm.reasoning is not None
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseReasoningTextDoneEvent(
                             type="response.reasoning_text.done",
@@ -1092,7 +1283,8 @@ class OpenAIServingResponses(OpenAIServing):
                             output_index=current_output_index,
                             content_index=current_content_index,
                             text=reason_content,
-                        ))
+                        )
+                    )
                     current_content_index = 0
                     reasoning_item = ResponseReasoningItem(
                         type="reasoning",
@@ -1112,7 +1304,8 @@ class OpenAIServingResponses(OpenAIServing):
                             sequence_number=-1,
                             output_index=current_output_index,
                             item=reasoning_item,
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseOutputItemAddedEvent(
                             type="response.output_item.added",
@@ -1125,7 +1318,8 @@ class OpenAIServingResponses(OpenAIServing):
                                 content=[],
                                 status="in_progress",
                             ),
-                        ))
+                        )
+                    )
                     current_output_index += 1
                     current_item_id = str(uuid.uuid4())
                     yield _increment_sequence_number_and_return(
@@ -1141,12 +1335,13 @@ class OpenAIServingResponses(OpenAIServing):
                                 annotations=[],
                                 logprobs=[],
                             ),
-                        ))
+                        )
+                    )
                     current_content_index += 1
                     # reset previous delta messages
                     previous_delta_messages = []
 
-                if delta_message.reasoning_content is not None:
+                if delta_message.reasoning is not None:
                     yield _increment_sequence_number_and_return(
                         ResponseReasoningTextDeltaEvent(
                             type="response.reasoning_text.delta",
@@ -1154,8 +1349,9 @@ class OpenAIServingResponses(OpenAIServing):
                             content_index=current_content_index,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                            delta=delta_message.reasoning_content,
-                        ))
+                            delta=delta_message.reasoning,
+                        )
+                    )
                 elif delta_message.content is not None:
                     yield _increment_sequence_number_and_return(
                         ResponseTextDeltaEvent(
@@ -1165,21 +1361,28 @@ class OpenAIServingResponses(OpenAIServing):
                             output_index=current_output_index,
                             item_id=current_item_id,
                             delta=delta_message.content,
-                            logprobs=self._create_stream_response_logprobs(
-                                token_ids=output.token_ids,
-                                logprobs=output.logprobs,
-                                tokenizer=tokenizer,
-                                top_logprobs=request.top_logprobs,
-                            ) if request.is_include_output_logprobs() else [],
-                        ))
+                            logprobs=(
+                                self._create_stream_response_logprobs(
+                                    token_ids=output.token_ids,
+                                    logprobs=output.logprobs,
+                                    tokenizer=tokenizer,
+                                    top_logprobs=request.top_logprobs,
+                                )
+                                if request.is_include_output_logprobs()
+                                else []
+                            ),
+                        )
+                    )
                 current_content_index += 1
 
                 previous_delta_messages.append(delta_message)
         if previous_delta_messages:
-            if previous_delta_messages[-1].reasoning_content is not None:
-                reason_content = ''.join(pm.reasoning_content
-                                         for pm in previous_delta_messages
-                                         if pm.reasoning_content is not None)
+            if previous_delta_messages[-1].reasoning is not None:
+                reason_content = "".join(
+                    pm.reasoning
+                    for pm in previous_delta_messages
+                    if pm.reasoning is not None
+                )
                 yield _increment_sequence_number_and_return(
                     ResponseReasoningTextDoneEvent(
                         type="response.reasoning_text.done",
@@ -1188,7 +1391,8 @@ class OpenAIServingResponses(OpenAIServing):
                         output_index=current_output_index,
                         content_index=current_content_index,
                         text=reason_content,
-                    ))
+                    )
+                )
                 current_content_index += 1
                 reasoning_item = ResponseReasoningItem(
                     type="reasoning",
@@ -1208,11 +1412,14 @@ class OpenAIServingResponses(OpenAIServing):
                         sequence_number=-1,
                         output_index=current_output_index,
                         item=reasoning_item,
-                    ))
+                    )
+                )
             elif previous_delta_messages[-1].content is not None:
-                final_content = ''.join(pm.content
-                                        for pm in previous_delta_messages
-                                        if pm.content is not None)
+                final_content = "".join(
+                    pm.content
+                    for pm in previous_delta_messages
+                    if pm.content is not None
+                )
                 yield _increment_sequence_number_and_return(
                     ResponseTextDoneEvent(
                         type="response.output_text.done",
@@ -1222,7 +1429,8 @@ class OpenAIServingResponses(OpenAIServing):
                         text=final_content,
                         logprobs=[],
                         item_id=current_item_id,
-                    ))
+                    )
+                )
                 current_content_index += 1
                 part = ResponseOutputText(
                     text=final_content,
@@ -1237,7 +1445,8 @@ class OpenAIServingResponses(OpenAIServing):
                         output_index=current_output_index,
                         content_index=current_content_index,
                         part=part,
-                    ))
+                    )
+                )
                 current_content_index += 1
                 item = ResponseOutputMessage(
                     type="message",
@@ -1255,39 +1464,69 @@ class OpenAIServingResponses(OpenAIServing):
                         sequence_number=-1,
                         output_index=current_output_index,
                         item=item,
-                    ))
+                    )
+                )
 
     async def _process_harmony_streaming_events(
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
-        result_generator: AsyncIterator[Optional[ConversationContext]],
+        result_generator: AsyncIterator[ConversationContext | None],
         context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
         created_time: int,
         _increment_sequence_number_and_return: Callable[
-            [StreamingResponsesResponse], StreamingResponsesResponse],
+            [StreamingResponsesResponse], StreamingResponsesResponse
+        ],
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         current_content_index = -1
         current_output_index = 0
         current_item_id: str = ""
         sent_output_item_added = False
-
+        is_first_function_call_delta = False
         async for ctx in result_generator:
-
             assert isinstance(ctx, StreamingHarmonyContext)
 
             if ctx.is_expecting_start():
                 current_output_index += 1
                 sent_output_item_added = False
-
+                is_first_function_call_delta = False
                 if len(ctx.parser.messages) > 0:
                     previous_item = ctx.parser.messages[-1]
                     if previous_item.recipient is not None:
-                        # Deal with tool call here
-                        pass
+                        # Deal with tool call
+                        if previous_item.recipient.startswith("functions."):
+                            function_name = previous_item.recipient[len("functions.") :]
+                            yield _increment_sequence_number_and_return(
+                                ResponseFunctionCallArgumentsDoneEvent(
+                                    type="response.function_call_arguments.done",
+                                    arguments=previous_item.content[0].text,
+                                    name=function_name,
+                                    item_id=current_item_id,
+                                    output_index=current_output_index,
+                                    sequence_number=-1,
+                                )
+                            )
+                            function_call_item = ResponseFunctionToolCall(
+                                type="function_call",
+                                arguments=previous_item.content[0].text,
+                                name=function_name,
+                                item_id=current_item_id,
+                                output_index=current_output_index,
+                                sequence_number=-1,
+                                call_id=f"fc_{random_uuid()}",
+                                status="completed",
+                            )
+                            yield _increment_sequence_number_and_return(
+                                ResponseOutputItemDoneEvent(
+                                    type="response.output_item.done",
+                                    sequence_number=-1,
+                                    output_index=current_output_index,
+                                    item=function_call_item,
+                                )
+                            )
                     elif previous_item.channel == "analysis":
                         content = ResponseReasoningTextContent(
                             text=previous_item.content[0].text,
@@ -1308,7 +1547,8 @@ class OpenAIServingResponses(OpenAIServing):
                                 output_index=current_output_index,
                                 content_index=current_content_index,
                                 text=previous_item.content[0].text,
-                            ))
+                            )
+                        )
                         yield _increment_sequence_number_and_return(
                             ResponseReasoningPartDoneEvent(
                                 type="response.reasoning_part.done",
@@ -1317,14 +1557,16 @@ class OpenAIServingResponses(OpenAIServing):
                                 output_index=current_output_index,
                                 content_index=current_content_index,
                                 part=content,
-                            ))
+                            )
+                        )
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemDoneEvent(
                                 type="response.output_item.done",
                                 sequence_number=-1,
                                 output_index=current_output_index,
                                 item=reasoning_item,
-                            ))
+                            )
+                        )
                     elif previous_item.channel == "final":
                         text_content = ResponseOutputText(
                             type="output_text",
@@ -1340,7 +1582,8 @@ class OpenAIServingResponses(OpenAIServing):
                                 text=previous_item.content[0].text,
                                 logprobs=[],
                                 item_id=current_item_id,
-                            ))
+                            )
+                        )
                         yield _increment_sequence_number_and_return(
                             ResponseContentPartDoneEvent(
                                 type="response.content_part.done",
@@ -1349,7 +1592,8 @@ class OpenAIServingResponses(OpenAIServing):
                                 output_index=current_output_index,
                                 content_index=current_content_index,
                                 part=text_content,
-                            ))
+                            )
+                        )
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemDoneEvent(
                                 type="response.output_item.done",
@@ -1362,12 +1606,15 @@ class OpenAIServingResponses(OpenAIServing):
                                     content=[text_content],
                                     status="completed",
                                 ),
-                            ))
+                            )
+                        )
 
             # stream the output of a harmony message
             if ctx.parser.last_content_delta:
-                if (ctx.parser.current_channel == "final"
-                        and ctx.parser.current_recipient is None):
+                if (
+                    ctx.parser.current_channel == "final"
+                    and ctx.parser.current_recipient is None
+                ):
                     if not sent_output_item_added:
                         sent_output_item_added = True
                         current_item_id = f"msg_{random_uuid()}"
@@ -1383,7 +1630,8 @@ class OpenAIServingResponses(OpenAIServing):
                                     content=[],
                                     status="in_progress",
                                 ),
-                            ))
+                            )
+                        )
                         current_content_index += 1
                         yield _increment_sequence_number_and_return(
                             ResponseContentPartAddedEvent(
@@ -1398,7 +1646,8 @@ class OpenAIServingResponses(OpenAIServing):
                                     annotations=[],
                                     logprobs=[],
                                 ),
-                            ))
+                            )
+                        )
                     yield _increment_sequence_number_and_return(
                         ResponseTextDeltaEvent(
                             type="response.output_text.delta",
@@ -1409,9 +1658,12 @@ class OpenAIServingResponses(OpenAIServing):
                             delta=ctx.parser.last_content_delta,
                             # TODO, use logprobs from ctx.last_request_output
                             logprobs=[],
-                        ))
-                elif (ctx.parser.current_channel == "analysis"
-                      and ctx.parser.current_recipient is None):
+                        )
+                    )
+                elif (
+                    ctx.parser.current_channel == "analysis"
+                    and ctx.parser.current_recipient is None
+                ):
                     if not sent_output_item_added:
                         sent_output_item_added = True
                         current_item_id = f"msg_{random_uuid()}"
@@ -1426,7 +1678,8 @@ class OpenAIServingResponses(OpenAIServing):
                                     summary=[],
                                     status="in_progress",
                                 ),
-                            ))
+                            )
+                        )
                         current_content_index += 1
                         yield _increment_sequence_number_and_return(
                             ResponseReasoningPartAddedEvent(
@@ -1439,7 +1692,8 @@ class OpenAIServingResponses(OpenAIServing):
                                     text="",
                                     type="reasoning_text",
                                 ),
-                            ))
+                            )
+                        )
                     yield _increment_sequence_number_and_return(
                         ResponseReasoningTextDeltaEvent(
                             type="response.reasoning_text.delta",
@@ -1448,13 +1702,15 @@ class OpenAIServingResponses(OpenAIServing):
                             content_index=current_content_index,
                             delta=ctx.parser.last_content_delta,
                             sequence_number=-1,
-                        ))
+                        )
+                    )
                 # built-in tools will be triggered on the analysis channel
                 # However, occasionally built-in tools will
                 # still be output to commentary.
-                elif (ctx.parser.current_channel == "commentary"
-                      or ctx.parser.current_channel == "analysis"
-                      ) and ctx.parser.current_recipient == "python":
+                elif (
+                    ctx.parser.current_channel == "commentary"
+                    or ctx.parser.current_channel == "analysis"
+                ) and ctx.parser.current_recipient == "python":
                     if not sent_output_item_added:
                         sent_output_item_added = True
                         current_item_id = f"tool_{random_uuid()}"
@@ -1471,15 +1727,16 @@ class OpenAIServingResponses(OpenAIServing):
                                     outputs=None,
                                     status="in_progress",
                                 ),
-                            ))
+                            )
+                        )
                         yield _increment_sequence_number_and_return(
                             ResponseCodeInterpreterCallInProgressEvent(
-                                type=
-                                "response.code_interpreter_call.in_progress",
+                                type="response.code_interpreter_call.in_progress",
                                 sequence_number=-1,
                                 output_index=current_output_index,
                                 item_id=current_item_id,
-                            ))
+                            )
+                        )
                     yield _increment_sequence_number_and_return(
                         ResponseCodeInterpreterCallCodeDeltaEvent(
                             type="response.code_interpreter_call_code.delta",
@@ -1487,41 +1744,41 @@ class OpenAIServingResponses(OpenAIServing):
                             output_index=current_output_index,
                             item_id=current_item_id,
                             delta=ctx.parser.last_content_delta,
-                        ))
+                        )
+                    )
 
             # stream tool call outputs
             if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0:
                 previous_item = ctx.parser.messages[-1]
-                if (self.tool_server is not None
-                        and self.tool_server.has_tool("browser")
-                        and previous_item.recipient is not None
-                        and previous_item.recipient.startswith("browser.")):
-                    function_name = previous_item.recipient[len("browser."):]
+                if (
+                    self.tool_server is not None
+                    and self.tool_server.has_tool("browser")
+                    and previous_item.recipient is not None
+                    and previous_item.recipient.startswith("browser.")
+                ):
+                    function_name = previous_item.recipient[len("browser.") :]
                     action = None
                     parsed_args = json.loads(previous_item.content[0].text)
                     if function_name == "search":
-                        action = (response_function_web_search.ActionSearch(
+                        action = response_function_web_search.ActionSearch(
                             type="search",
                             query=parsed_args["query"],
-                        ))
+                        )
                     elif function_name == "open":
-                        action = (
-                            response_function_web_search.ActionOpenPage(
-                                type="open_page",
-                                # TODO: translate to url
-                                url=f"cursor:{parsed_args.get('cursor', '')}",
-                            ))
+                        action = response_function_web_search.ActionOpenPage(
+                            type="open_page",
+                            # TODO: translate to url
+                            url=f"cursor:{parsed_args.get('cursor', '')}",
+                        )
                     elif function_name == "find":
-                        action = (
-                            response_function_web_search.ActionFind(
-                                type="find",
-                                pattern=parsed_args["pattern"],
-                                # TODO: translate to url
-                                url=f"cursor:{parsed_args.get('cursor', '')}",
-                            ))
+                        action = response_function_web_search.ActionFind(
+                            type="find",
+                            pattern=parsed_args["pattern"],
+                            # TODO: translate to url
+                            url=f"cursor:{parsed_args.get('cursor', '')}",
+                        )
                     else:
-                        raise ValueError(
-                            f"Unknown function name: {function_name}")
+                        raise ValueError(f"Unknown function name: {function_name}")
 
                     current_item_id = f"tool_{random_uuid()}"
                     yield _increment_sequence_number_and_return(
@@ -1529,29 +1786,31 @@ class OpenAIServingResponses(OpenAIServing):
                             type="response.output_item.added",
                             sequence_number=-1,
                             output_index=current_output_index,
-                            item=response_function_web_search.
-                            ResponseFunctionWebSearch(
+                            item=response_function_web_search.ResponseFunctionWebSearch(
                                 # TODO: generate a unique id for web search call
                                 type="web_search_call",
                                 id=current_item_id,
                                 action=action,
                                 status="in_progress",
                             ),
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseWebSearchCallInProgressEvent(
                             type="response.web_search_call.in_progress",
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseWebSearchCallSearchingEvent(
                             type="response.web_search_call.searching",
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                        ))
+                        )
+                    )
 
                     # enqueue
                     yield _increment_sequence_number_and_return(
@@ -1560,7 +1819,8 @@ class OpenAIServingResponses(OpenAIServing):
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseOutputItemDoneEvent(
                             type="response.output_item.done",
@@ -1572,12 +1832,15 @@ class OpenAIServingResponses(OpenAIServing):
                                 action=action,
                                 status="completed",
                             ),
-                        ))
+                        )
+                    )
 
-                if (self.tool_server is not None
-                        and self.tool_server.has_tool("python")
-                        and previous_item.recipient is not None
-                        and previous_item.recipient.startswith("python")):
+                if (
+                    self.tool_server is not None
+                    and self.tool_server.has_tool("python")
+                    and previous_item.recipient is not None
+                    and previous_item.recipient.startswith("python")
+                ):
                     yield _increment_sequence_number_and_return(
                         ResponseCodeInterpreterCallCodeDoneEvent(
                             type="response.code_interpreter_call_code.done",
@@ -1585,21 +1848,24 @@ class OpenAIServingResponses(OpenAIServing):
                             output_index=current_output_index,
                             item_id=current_item_id,
                             code=previous_item.content[0].text,
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseCodeInterpreterCallInterpretingEvent(
                             type="response.code_interpreter_call.interpreting",
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseCodeInterpreterCallCompletedEvent(
                             type="response.code_interpreter_call.completed",
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                        ))
+                        )
+                    )
                     yield _increment_sequence_number_and_return(
                         ResponseOutputItemDoneEvent(
                             type="response.output_item.done",
@@ -1614,18 +1880,56 @@ class OpenAIServingResponses(OpenAIServing):
                                 outputs=[],
                                 status="completed",
                             ),
-                        ))
+                        )
+                    )
+            # developer tools will be triggered on the commentary channel
+            # and recipient starts with "functions.TOOL_NAME"
+            if (
+                ctx.parser.current_channel == "commentary"
+                and ctx.parser.current_recipient
+                and ctx.parser.current_recipient.startswith("functions.")
+            ):
+                if is_first_function_call_delta is False:
+                    is_first_function_call_delta = True
+                    fc_name = ctx.parser.current_recipient[len("functions.") :]
+                    tool_call_item = ResponseFunctionToolCall(
+                        name=fc_name,
+                        type="function_call",
+                        id=current_item_id,
+                        call_id=f"call_{random_uuid()}",
+                        arguments="",
+                        status="in_progress",
+                    )
+                    current_item_id = f"fc_{random_uuid()}"
+                    yield _increment_sequence_number_and_return(
+                        ResponseOutputItemAddedEvent(
+                            type="response.output_item.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=tool_call_item,
+                        )
+                    )
+                else:
+                    yield _increment_sequence_number_and_return(
+                        ResponseFunctionCallArgumentsDeltaEvent(
+                            item_id=current_item_id,
+                            delta=ctx.parser.last_content_delta,
+                            output_index=current_output_index,
+                            sequence_number=-1,
+                            type="response.function_call_arguments.delta",
+                        )
+                    )
 
     async def responses_stream_generator(
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
-        result_generator: AsyncIterator[Optional[ConversationContext]],
+        result_generator: AsyncIterator[ConversationContext | None],
         context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-        created_time: Optional[int] = None,
+        created_time: int | None = None,
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         # TODO:
         # 1. Handle disconnect
@@ -1635,11 +1939,11 @@ class OpenAIServingResponses(OpenAIServing):
         sequence_number = 0
 
         def _increment_sequence_number_and_return(
-                event: StreamingResponsesResponse
+            event: StreamingResponsesResponse,
         ) -> StreamingResponsesResponse:
             nonlocal sequence_number
             # Set sequence_number if the event has this attribute
-            if hasattr(event, 'sequence_number'):
+            if hasattr(event, "sequence_number"):
                 event.sequence_number = sequence_number
             sequence_number += 1
             return event
@@ -1647,15 +1951,13 @@ class OpenAIServingResponses(OpenAIServing):
         async with AsyncExitStack() as exit_stack:
             processer = None
             if self.use_harmony:
-                mcp_tools = {
-                    tool.server_label: tool
-                    for tool in request.tools if tool.type == "mcp"
-                }
-                await context.init_tool_sessions(self.tool_server, exit_stack,
-                                                 request.request_id, mcp_tools)
+                # TODO: in streaming, we noticed this bug:
+                # https://github.com/vllm-project/vllm/issues/25697
+                await self._initialize_tool_sessions(request, context, exit_stack)
                 processer = self._process_harmony_streaming_events
             else:
                 processer = self._process_simple_streaming_events
+            # TODO Hanchen make sampling params to include the structural tag
 
             initial_response = ResponsesResponse.from_request(
                 request,
@@ -1671,18 +1973,27 @@ class OpenAIServingResponses(OpenAIServing):
                     type="response.created",
                     sequence_number=-1,
                     response=initial_response,
-                ))
+                )
+            )
             yield _increment_sequence_number_and_return(
                 ResponseInProgressEvent(
                     type="response.in_progress",
                     sequence_number=-1,
                     response=initial_response,
-                ))
+                )
+            )
 
             async for event_data in processer(
-                    request, sampling_params, result_generator, context,
-                    model_name, tokenizer, request_metadata, created_time,
-                    _increment_sequence_number_and_return):
+                request,
+                sampling_params,
+                result_generator,
+                context,
+                model_name,
+                tokenizer,
+                request_metadata,
+                created_time,
+                _increment_sequence_number_and_return,
+            ):
                 yield event_data
 
             async def empty_async_generator():
@@ -1705,5 +2016,6 @@ class OpenAIServingResponses(OpenAIServing):
                 ResponseCompletedEvent(
                     type="response.completed",
                     sequence_number=-1,
-                    response=final_response.model_dump(),
-                ))
+                    response=final_response,
+                )
+            )
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 623b1c863f779aca9baa54f593671c2ba3d34b20..9cbfc9791819ec17632257710030dea7774ce045 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -3,89 +3,92 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, Mapping
-from typing import Any, Optional, Union
+from typing import Any
 
 from fastapi import Request
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument,
-                                              RerankRequest, RerankResponse,
-                                              RerankResult, RerankUsage,
-                                              ScoreRequest, ScoreResponse,
-                                              ScoreResponseData, UsageInfo)
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    RerankDocument,
+    RerankRequest,
+    RerankResponse,
+    RerankResult,
+    RerankUsage,
+    ScoreRequest,
+    ScoreResponse,
+    ScoreResponseData,
+    UsageInfo,
+)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.score_utils import (ScoreContentPartParam,
-                                          ScoreMultiModalParam,
-                                          _cosine_similarity,
-                                          _validate_score_input_lens,
-                                          compress_token_type_ids,
-                                          get_score_prompt)
-# yapf: enable
+from vllm.entrypoints.score_utils import (
+    ScoreContentPartParam,
+    ScoreMultiModalParam,
+    _cosine_similarity,
+    _validate_score_input_lens,
+    compress_token_type_ids,
+    get_score_prompt,
+)
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import TokensPrompt
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import make_async, merge_async_iterators
+from vllm.utils.async_utils import make_async, merge_async_iterators
 
 logger = init_logger(__name__)
 
 
 class ServingScores(OpenAIServing):
-
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         log_error_stack: bool = False,
     ) -> None:
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
 
     async def _embedding_score(
         self,
         tokenizer: AnyTokenizer,
         texts_1: list[str],
         texts_2: list[str],
-        request: Union[RerankRequest, ScoreRequest],
+        request: RerankRequest | ScoreRequest,
         request_id: str,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        lora_request: Optional[Union[LoRARequest, None]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
+        tokenization_kwargs: dict[str, Any] | None = None,
+        lora_request: LoRARequest | None | None = None,
+        trace_headers: Mapping[str, str] | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
         input_texts = texts_1 + texts_2
 
         engine_prompts: list[TokensPrompt] = []
-        tokenize_async = make_async(tokenizer.__call__,
-                                    executor=self._tokenizer_executor)
+        tokenize_async = make_async(
+            tokenizer.__call__, executor=self._tokenizer_executor
+        )
 
         tokenization_kwargs = tokenization_kwargs or {}
         tokenized_prompts = await asyncio.gather(
-            *(tokenize_async(t, **tokenization_kwargs) for t in input_texts))
+            *(tokenize_async(t, **tokenization_kwargs) for t in input_texts)
+        )
 
         for tok_result, input_text in zip(tokenized_prompts, input_texts):
-
-            text_token_prompt = \
-                self._validate_input(
-                    request,
-                    tok_result["input_ids"],
-                    input_text)
+            text_token_prompt = self._validate_input(
+                request, tok_result["input_ids"], input_text
+            )
 
             engine_prompts.append(
-                TokensPrompt(
-                    prompt_token_ids=text_token_prompt["prompt_token_ids"]))
+                TokensPrompt(prompt_token_ids=text_token_prompt["prompt_token_ids"])
+            )
 
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
@@ -97,13 +100,14 @@ class ServingScores(OpenAIServing):
             return self.create_error_response(str(e))
 
         for i, engine_prompt in enumerate(engine_prompts):
-
             request_id_item = f"{request_id}-{i}"
 
-            self._log_inputs(request_id_item,
-                             input_texts[i],
-                             params=pooling_params,
-                             lora_request=lora_request)
+            self._log_inputs(
+                request_id_item,
+                input_texts[i],
+                params=pooling_params,
+                lora_request=lora_request,
+            )
 
             generators.append(
                 self.engine_client.encode(
@@ -113,15 +117,15 @@ class ServingScores(OpenAIServing):
                     lora_request=lora_request,
                     trace_headers=trace_headers,
                     priority=request.priority,
-                ))
+                )
+            )
 
         result_generator = merge_async_iterators(*generators)
 
         # Non-streaming response
         final_res_batch: list[PoolingRequestOutput] = []
 
-        embeddings: list[Optional[PoolingRequestOutput]] =\
-              [None] * len(engine_prompts)
+        embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts)
 
         async for i, res in result_generator:
             embeddings[i] = res
@@ -140,21 +144,20 @@ class ServingScores(OpenAIServing):
         if len(emb_texts_1) == 1:
             emb_texts_1 = emb_texts_1 * len(emb_texts_2)
 
-        final_res_batch = _cosine_similarity(tokenizer=tokenizer,
-                                             embed_1=emb_texts_1,
-                                             embed_2=emb_texts_2)
+        final_res_batch = _cosine_similarity(
+            tokenizer=tokenizer, embed_1=emb_texts_1, embed_2=emb_texts_2
+        )
 
         return final_res_batch
 
     def _preprocess_score(
         self,
-        request: Union[RerankRequest, ScoreRequest],
+        request: RerankRequest | ScoreRequest,
         tokenizer: AnyTokenizer,
         tokenization_kwargs: dict[str, Any],
-        data_1: Union[str, ScoreContentPartParam],
-        data_2: Union[str, ScoreContentPartParam],
+        data_1: str | ScoreContentPartParam,
+        data_2: str | ScoreContentPartParam,
     ) -> tuple[str, TokensPrompt]:
-
         model_config = self.model_config
 
         full_prompt, engine_prompt = get_score_prompt(
@@ -164,8 +167,7 @@ class ServingScores(OpenAIServing):
             tokenizer=tokenizer,
             tokenization_kwargs=tokenization_kwargs,
         )
-        self._validate_input(request, engine_prompt["prompt_token_ids"],
-                             full_prompt)
+        self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
         if request.mm_processor_kwargs is not None:
             engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
 
@@ -174,14 +176,14 @@ class ServingScores(OpenAIServing):
     async def _cross_encoding_score(
         self,
         tokenizer: AnyTokenizer,
-        data_1: Union[list[str], list[ScoreContentPartParam]],
-        data_2: Union[list[str], list[ScoreContentPartParam]],
-        request: Union[RerankRequest, ScoreRequest],
+        data_1: list[str] | list[ScoreContentPartParam],
+        data_2: list[str] | list[ScoreContentPartParam],
+        request: RerankRequest | ScoreRequest,
         request_id: str,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        lora_request: Optional[Union[LoRARequest, None]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
+        tokenization_kwargs: dict[str, Any] | None = None,
+        lora_request: LoRARequest | None | None = None,
+        trace_headers: Mapping[str, str] | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
         request_prompts: list[str] = []
         engine_prompts: list[TokensPrompt] = []
 
@@ -189,22 +191,28 @@ class ServingScores(OpenAIServing):
             data_1 = data_1 * len(data_2)
 
         if isinstance(tokenizer, MistralTokenizer):
-            raise ValueError(
-                "MistralTokenizer not supported for cross-encoding")
+            raise ValueError("MistralTokenizer not supported for cross-encoding")
 
         tokenization_kwargs = tokenization_kwargs or {}
 
         input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
 
-        preprocess_async = make_async(self._preprocess_score,
-                                      executor=self._tokenizer_executor)
+        preprocess_async = make_async(
+            self._preprocess_score, executor=self._tokenizer_executor
+        )
 
         preprocessed_prompts = await asyncio.gather(
-            *(preprocess_async(request=request,
-                               tokenizer=tokenizer,
-                               tokenization_kwargs=tokenization_kwargs,
-                               data_1=t1,
-                               data_2=t2) for t1, t2 in input_pairs))
+            *(
+                preprocess_async(
+                    request=request,
+                    tokenizer=tokenizer,
+                    tokenization_kwargs=tokenization_kwargs,
+                    data_1=t1,
+                    data_2=t2,
+                )
+                for t1, t2 in input_pairs
+            )
+        )
 
         for full_prompt, engine_prompt in preprocessed_prompts:
             request_prompts.append(full_prompt)
@@ -223,19 +231,19 @@ class ServingScores(OpenAIServing):
         for i, engine_prompt in enumerate(engine_prompts):
             request_id_item = f"{request_id}-{i}"
 
-            self._log_inputs(request_id_item,
-                             request_prompts[i],
-                             params=default_pooling_params,
-                             lora_request=lora_request)
+            self._log_inputs(
+                request_id_item,
+                request_prompts[i],
+                params=default_pooling_params,
+                lora_request=lora_request,
+            )
 
-            if (token_type_ids := engine_prompt.pop("token_type_ids", None)):
+            if token_type_ids := engine_prompt.pop("token_type_ids", None):
                 pooling_params = default_pooling_params.clone()
                 compressed = compress_token_type_ids(token_type_ids)
-                pooling_params.extra_kwargs = {
-                    "compressed_token_type_ids": compressed
-                }
+                pooling_params.extra_kwargs = {"compressed_token_type_ids": compressed}
             else:
-                pooling_params = (default_pooling_params)
+                pooling_params = default_pooling_params
 
             generator = self.engine_client.encode(
                 engine_prompt,
@@ -251,8 +259,9 @@ class ServingScores(OpenAIServing):
         result_generator = merge_async_iterators(*generators)
 
         # Non-streaming response
-        final_res_batch: list[
-            Optional[PoolingRequestOutput]] = [None] * len(engine_prompts)
+        final_res_batch: list[PoolingRequestOutput | None] = [None] * len(
+            engine_prompts
+        )
 
         async for i, res in result_generator:
             final_res_batch[i] = res
@@ -261,28 +270,32 @@ class ServingScores(OpenAIServing):
 
     async def _run_scoring(
         self,
-        data_1: Union[list[str], str, ScoreMultiModalParam],
-        data_2: Union[list[str], str, ScoreMultiModalParam],
-        request: Union[ScoreRequest, RerankRequest],
+        data_1: list[str] | str | ScoreMultiModalParam,
+        data_2: list[str] | str | ScoreMultiModalParam,
+        request: ScoreRequest | RerankRequest,
         request_id: str,
-        raw_request: Optional[Request] = None,
-    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
         lora_request = self._maybe_get_adapters(request)
 
         tokenizer = await self.engine_client.get_tokenizer()
 
-        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
-                                         None)
+        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
 
         tokenization_kwargs: dict[str, Any] = {}
-        _validate_truncation_size(self.max_model_len, truncate_prompt_tokens,
-                                  tokenization_kwargs)
+        _validate_truncation_size(
+            self.max_model_len, truncate_prompt_tokens, tokenization_kwargs
+        )
 
-        trace_headers = (None if raw_request is None else await
-                         self._get_trace_headers(raw_request.headers))
+        trace_headers = (
+            None
+            if raw_request is None
+            else await self._get_trace_headers(raw_request.headers)
+        )
 
-        if not self.model_config.is_multimodal_model and (isinstance(
-                data_1, dict) or isinstance(data_2, dict)):
+        if not self.model_config.is_multimodal_model and (
+            isinstance(data_1, dict) or isinstance(data_2, dict)
+        ):
             raise ValueError(
                 f"MultiModalParam is not supported for {self.model_config.architecture}"  # noqa: E501
             )
@@ -308,7 +321,8 @@ class ServingScores(OpenAIServing):
                 request_id=request_id,
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                trace_headers=trace_headers)
+                trace_headers=trace_headers,
+            )
 
         else:
             return await self._embedding_score(
@@ -319,13 +333,14 @@ class ServingScores(OpenAIServing):
                 request_id=request_id,
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                trace_headers=trace_headers)
+                trace_headers=trace_headers,
+            )
 
     async def create_score(
         self,
         request: ScoreRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[ScoreResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> ScoreResponse | ErrorResponse:
         """
         Score API similar to Sentence Transformers cross encoder
 
@@ -362,10 +377,8 @@ class ServingScores(OpenAIServing):
             return self.create_error_response(str(e))
 
     async def do_rerank(
-        self,
-        request: RerankRequest,
-        raw_request: Optional[Request] = None
-    ) -> Union[RerankResponse, ErrorResponse]:
+        self, request: RerankRequest, raw_request: Request | None = None
+    ) -> RerankResponse | ErrorResponse:
         """
         Rerank API based on JinaAI's rerank API; implements the same
         API interface. Designed for compatibility with off-the-shelf
@@ -381,9 +394,15 @@ class ServingScores(OpenAIServing):
 
         request_id = f"rerank-{self._base_request_id(raw_request)}"
         documents = request.documents
-        top_n = request.top_n if request.top_n > 0 else (
-            len(documents)
-            if isinstance(documents, list) else len(documents["content"]))
+        top_n = (
+            request.top_n
+            if request.top_n > 0
+            else (
+                len(documents)
+                if isinstance(documents, list)
+                else len(documents["content"])
+            )
+        )
 
         try:
             final_res_batch = await self._run_scoring(
@@ -445,9 +464,13 @@ class ServingScores(OpenAIServing):
         )
 
     def request_output_to_rerank_response(
-            self, final_res_batch: list[PoolingRequestOutput], request_id: str,
-            model_name: str, documents: Union[list[str], ScoreMultiModalParam],
-            top_n: int) -> RerankResponse:
+        self,
+        final_res_batch: list[PoolingRequestOutput],
+        request_id: str,
+        model_name: str,
+        documents: list[str] | ScoreMultiModalParam,
+        top_n: int,
+    ) -> RerankResponse:
         """
         Convert the output of do_rank to a RerankResponse
         """
@@ -458,9 +481,9 @@ class ServingScores(OpenAIServing):
 
             result = RerankResult(
                 index=idx,
-                document=RerankDocument(text=documents[idx]) if isinstance(
-                    documents, list) else RerankDocument(
-                        multi_modal=documents["content"][idx]),
+                document=RerankDocument(text=documents[idx])
+                if isinstance(documents, list)
+                else RerankDocument(multi_modal=documents["content"][idx]),
                 relevance_score=classify_res.outputs.score,
             )
             results.append(result)
@@ -476,4 +499,5 @@ class ServingScores(OpenAIServing):
             id=request_id,
             model=model_name,
             results=results,
-            usage=RerankUsage(total_tokens=num_prompt_tokens))
+            usage=RerankUsage(total_tokens=num_prompt_tokens),
+        )
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 3918d08ebf81d134b70f8b5395470e778dfdda2e..39aae0cd04956364d304ff93c20bf18232bb6443 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -1,25 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Any, Final, Optional, Union
+from typing import Any, Final
 
 import jinja2
 from fastapi import Request
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
-                                              DetokenizeResponse,
-                                              ErrorResponse,
-                                              TokenizeChatRequest,
-                                              TokenizeRequest,
-                                              TokenizeResponse,
-                                              TokenizerInfoResponse)
-# yapf: enable
+from vllm.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeChatRequest,
+    TokenizeRequest,
+    TokenizeResponse,
+    TokenizerInfoResponse,
+)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
@@ -30,32 +28,33 @@ logger = init_logger(__name__)
 
 
 class OpenAIServingTokenization(OpenAIServing):
-
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
 
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
 
     async def create_tokenize(
         self,
         request: TokenizeRequest,
         raw_request: Request,
-    ) -> Union[TokenizeResponse, ErrorResponse]:
+    ) -> TokenizeResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret
@@ -69,8 +68,18 @@ class OpenAIServingTokenization(OpenAIServing):
             renderer = self._get_renderer(tokenizer)
 
             if isinstance(request, TokenizeChatRequest):
-                tool_dicts = (None if request.tools is None else
-                              [tool.model_dump() for tool in request.tools])
+                tool_dicts = (
+                    None
+                    if request.tools is None
+                    else [tool.model_dump() for tool in request.tools]
+                )
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=request.chat_template,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
+                )
+                if error_check_ret is not None:
+                    return error_check_ret
                 (
                     _,
                     _,
@@ -81,8 +90,7 @@ class OpenAIServingTokenization(OpenAIServing):
                     request.messages,
                     tool_dicts=tool_dicts,
                     chat_template=request.chat_template or self.chat_template,
-                    chat_template_content_format=self.
-                    chat_template_content_format,
+                    chat_template_content_format=self.chat_template_content_format,
                     add_generation_prompt=request.add_generation_prompt,
                     continue_final_message=request.continue_final_message,
                     chat_template_kwargs=request.chat_template_kwargs,
@@ -99,29 +107,29 @@ class OpenAIServingTokenization(OpenAIServing):
 
         input_ids: list[int] = []
         for engine_prompt in engine_prompts:
-            self._log_inputs(request_id,
-                             engine_prompt,
-                             params=None,
-                             lora_request=lora_request)
+            self._log_inputs(
+                request_id, engine_prompt, params=None, lora_request=lora_request
+            )
 
-            if isinstance(engine_prompt,
-                          dict) and "prompt_token_ids" in engine_prompt:
+            if isinstance(engine_prompt, dict) and "prompt_token_ids" in engine_prompt:
                 input_ids.extend(engine_prompt["prompt_token_ids"])
 
         token_strs = None
         if request.return_token_strs:
             token_strs = tokenizer.convert_ids_to_tokens(input_ids)
 
-        return TokenizeResponse(tokens=input_ids,
-                                token_strs=token_strs,
-                                count=len(input_ids),
-                                max_model_len=self.max_model_len)
+        return TokenizeResponse(
+            tokens=input_ids,
+            token_strs=token_strs,
+            count=len(input_ids),
+            max_model_len=self.max_model_len,
+        )
 
     async def create_detokenize(
         self,
         request: DetokenizeRequest,
         raw_request: Request,
-    ) -> Union[DetokenizeResponse, ErrorResponse]:
+    ) -> DetokenizeResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret
@@ -132,10 +140,9 @@ class OpenAIServingTokenization(OpenAIServing):
 
         tokenizer = await self.engine_client.get_tokenizer()
 
-        self._log_inputs(request_id,
-                         request.tokens,
-                         params=None,
-                         lora_request=lora_request)
+        self._log_inputs(
+            request_id, request.tokens, params=None, lora_request=lora_request
+        )
 
         prompt_input = await self._tokenize_prompt_input_async(
             request,
@@ -147,15 +154,15 @@ class OpenAIServingTokenization(OpenAIServing):
         return DetokenizeResponse(prompt=input_text)
 
     async def get_tokenizer_info(
-        self, ) -> Union[TokenizerInfoResponse, ErrorResponse]:
+        self,
+    ) -> TokenizerInfoResponse | ErrorResponse:
         """Get comprehensive tokenizer information."""
         try:
             tokenizer = await self.engine_client.get_tokenizer()
             info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
             return TokenizerInfoResponse(**info)
         except Exception as e:
-            return self.create_error_response(
-                f"Failed to get tokenizer info: {str(e)}")
+            return self.create_error_response(f"Failed to get tokenizer info: {str(e)}")
 
     def _build_render_config(self, request: TokenizeRequest) -> RenderConfig:
         return RenderConfig(add_special_tokens=request.add_special_tokens)
@@ -164,7 +171,7 @@ class OpenAIServingTokenization(OpenAIServing):
 @dataclass
 class TokenizerInfo:
     tokenizer: AnyTokenizer
-    chat_template: Optional[str]
+    chat_template: str | None
 
     def to_dict(self) -> dict[str, Any]:
         """Return the tokenizer configuration."""
diff --git a/vllm/entrypoints/openai/serving_tokens.py b/vllm/entrypoints/openai/serving_tokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a526b9b70d276ca261846ec8aea6ced9b62e09
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_tokens.py
@@ -0,0 +1,269 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import time
+from collections.abc import AsyncGenerator
+from collections.abc import Sequence as GenericSequence
+
+from fastapi import Request
+
+# yapf: disable
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionLogProb,
+    ChatCompletionLogProbs,
+    ChatCompletionLogProbsContent,
+    ErrorResponse,
+    GenerateRequest,
+    GenerateResponse,
+    GenerateResponseChoice,
+    PromptTokenUsageInfo,
+    RequestResponseMetadata,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
+from vllm.logger import init_logger
+from vllm.logprobs import Logprob
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.utils.collection_utils import as_list
+
+logger = init_logger(__name__)
+
+
+class ServingTokens(OpenAIServing):
+    """Provides Tokens IN <> Tokens OUT functionality to vLLM API."""
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        force_no_detokenize: bool = False,
+        return_tokens_as_token_ids: bool = False,
+        log_error_stack: bool = False,
+        enable_prompt_tokens_details: bool = False,
+        enable_log_outputs: bool = False,
+    ):
+        super().__init__(engine_client=engine_client,
+                         models=models,
+                         request_logger=request_logger,
+                         return_tokens_as_token_ids=return_tokens_as_token_ids,
+                         log_error_stack=log_error_stack)
+        self.enable_prompt_tokens_details = enable_prompt_tokens_details
+        self.enable_log_outputs = enable_log_outputs
+        self.force_no_detokenize = force_no_detokenize
+        if force_no_detokenize:
+            logger.info("Tokens-only mode is enabled, skipping detokenization "
+            "step for incoming requests.")
+
+    async def serve_tokens(
+        self,
+        request: GenerateRequest,
+        raw_request: Request | None = None
+    ) -> GenerateResponse | ErrorResponse:
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            logger.error("Error with model %s", error_check_ret)
+            return error_check_ret
+
+        # If the engine is dead, raise the engine's DEAD_ERROR.
+        # This is required for the streaming case, where we return a
+        # success status before we actually start generating text :).
+        if self.engine_client.errored:
+            raise self.engine_client.dead_error
+
+        lora_request = None
+        lora_request = self._maybe_get_adapters(request,
+            supports_default_mm_loras=True)
+
+        model_name = self.models.model_name(lora_request)
+
+        request_id = "generate-tokens-" \
+                     f"{self._base_request_id(raw_request, request.request_id)}"
+
+        request_metadata = RequestResponseMetadata(request_id=request_id)
+        if raw_request:
+            raw_request.state.request_metadata = request_metadata
+
+        # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is
+        # completed
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=request.token_ids)
+        if request.features is not None:
+            engine_prompt["multi_modal_data"] = None
+
+        if hasattr(request, "cache_salt") and request.cache_salt is not None:
+            engine_prompt["cache_salt"] = request.cache_salt
+
+        # Schedule the request and get the result generator.
+        result_generator: AsyncGenerator[RequestOutput, None] | None = None
+        try:
+            sampling_params = request.sampling_params
+            if self.force_no_detokenize:
+                sampling_params.detokenize = False
+
+            self._log_inputs(request_id,
+                             request.token_ids,
+                             params=sampling_params,
+                             lora_request=lora_request)
+
+            trace_headers = (None if raw_request is None else await
+                             self._get_trace_headers(raw_request.headers))
+
+            result_generator = self.engine_client.generate(
+                engine_prompt,
+                sampling_params,
+                request_id,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+                priority=request.priority,
+            )
+
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        # TODO(NickLucche): Implement streaming response
+
+        try:
+            assert result_generator is not None
+            return await self.serve_tokens_full_generator(
+                request, result_generator, request_id, model_name,
+                request_metadata)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+    async def serve_tokens_full_generator(
+        self,
+        request: GenerateRequest,
+        result_generator: AsyncGenerator[RequestOutput, None],
+        request_id: str,
+        model_name: str,
+        request_metadata: RequestResponseMetadata,
+    ) -> ErrorResponse | GenerateResponse:
+
+        created_time = int(time.time())
+        final_res: RequestOutput | None = None
+        sampling_params: SamplingParams = request.sampling_params
+
+        try:
+            async for res in result_generator:
+                final_res = res
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        assert final_res is not None
+
+        choices: list[GenerateResponseChoice] = []
+        num_generated_tokens = 0
+        for output in final_res.outputs:
+            token_ids = output.token_ids
+            out_logprobs = output.logprobs
+
+            # This is top_logprobs in completions API
+            if sampling_params.logprobs:
+                assert out_logprobs is not None, "Did not output logprobs"
+                logprobs = self._create_tokens_logprobs(
+                    token_ids=token_ids,
+                    top_logprobs=out_logprobs,
+                    num_output_top_logprobs=sampling_params.logprobs,
+                )
+            else:
+                logprobs = None
+
+            choice_data = GenerateResponseChoice(
+                index=output.index,
+                logprobs=logprobs,
+                finish_reason=output.finish_reason
+                if output.finish_reason else "stop",
+                token_ids=as_list(output.token_ids))
+
+            choices.append(choice_data)
+            num_generated_tokens += len(output.token_ids)
+
+        assert final_res.prompt_token_ids is not None
+        num_prompt_tokens = len(final_res.prompt_token_ids)
+        if final_res.encoder_prompt_token_ids is not None:
+            num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
+
+        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
+                          completion_tokens=num_generated_tokens,
+                          total_tokens=num_prompt_tokens +
+                          num_generated_tokens)
+        if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
+            # This info is not available at the /coordinator level
+            usage.prompt_tokens_details = PromptTokenUsageInfo(
+                cached_tokens=final_res.num_cached_tokens)
+
+        request_metadata.final_usage_info = usage
+
+        response = GenerateResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=choices,
+            usage=usage,
+            prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),
+            kv_transfer_params=final_res.kv_transfer_params,
+        )
+
+        # Log complete response if output logging is enabled
+        if self.enable_log_outputs and self.request_logger:
+            for choice in choices:
+                # Get the corresponding output token IDs
+                output_token_ids = None
+                if choice.index < len(final_res.outputs):
+                    output_token_ids = final_res.outputs[
+                        choice.index].token_ids
+
+                if output_token_ids:
+                    # Log token_ids only.
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs="", 
+                        output_token_ids=output_token_ids,
+                        finish_reason=choice.finish_reason,
+                        is_streaming=False,
+                        delta=False,
+                    )
+
+        return response
+
+    def _create_tokens_logprobs(
+        self,
+        token_ids: GenericSequence[int],
+        top_logprobs: GenericSequence[dict[int, Logprob] | None],
+        num_output_top_logprobs: int | None = None,
+    ) -> ChatCompletionLogProbs:
+        """Create OpenAI-style logprobs."""
+        logprobs_content: list[ChatCompletionLogProbsContent] = []
+
+        for i, token_id in enumerate(token_ids):
+            token = f"token_id:{token_id}"
+            step_top_logprobs = top_logprobs[i]
+            if step_top_logprobs is None or step_top_logprobs.get(
+                    token_id) is None:
+                logprobs_content.append(
+                    ChatCompletionLogProbsContent(token=token, ))
+            else:
+                step_token = step_top_logprobs[token_id]
+
+                logprobs_content.append(
+                    ChatCompletionLogProbsContent(
+                        token=token,
+                        logprob=max(step_token.logprob, -9999.0),
+                        top_logprobs=[
+                            ChatCompletionLogProb(
+                                token=token,
+                                logprob=max(p[1].logprob, -9999.0),
+                            ) for i, p in enumerate(step_top_logprobs.items())
+                            if num_output_top_logprobs
+                            and i < num_output_top_logprobs
+                        ]))
+
+        return ChatCompletionLogProbs(content=logprobs_content)
diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py
index 9ba58d4425221979315af73b1c1f16368d3b453f..33da7034afabcc0d2a267e79d55fcf6178ab9674 100644
--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -1,18 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import AsyncGenerator
-from typing import Optional, Union
 
 from fastapi import Request
 
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
-    ErrorResponse, RequestResponseMetadata, TranscriptionRequest,
-    TranscriptionResponse, TranscriptionResponseStreamChoice,
-    TranscriptionStreamResponse, TranslationRequest, TranslationResponse,
-    TranslationResponseStreamChoice, TranslationStreamResponse)
+    ErrorResponse,
+    RequestResponseMetadata,
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranscriptionResponseStreamChoice,
+    TranscriptionStreamResponse,
+    TranslationRequest,
+    TranslationResponse,
+    TranslationResponseStreamChoice,
+    TranslationStreamResponse,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.speech_to_text import OpenAISpeechToText
 from vllm.logger import init_logger
@@ -27,26 +32,26 @@ class OpenAIServingTranscription(OpenAISpeechToText):
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         log_error_stack: bool = False,
+        enable_force_include_usage: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         task_type="transcribe",
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            task_type="transcribe",
+            log_error_stack=log_error_stack,
+            enable_force_include_usage=enable_force_include_usage,
+        )
 
     async def create_transcription(
-        self, audio_data: bytes, request: TranscriptionRequest,
-        raw_request: Request
-    ) -> Union[TranscriptionResponse, AsyncGenerator[str, None],
-               ErrorResponse]:
+        self, audio_data: bytes, request: TranscriptionRequest, raw_request: Request
+    ) -> TranscriptionResponse | AsyncGenerator[str, None] | ErrorResponse:
         """Transcription API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -61,10 +66,13 @@ class OpenAIServingTranscription(OpenAISpeechToText):
         )
 
     async def transcription_stream_generator(
-            self, request: TranscriptionRequest,
-            result_generator: list[AsyncGenerator[RequestOutput, None]],
-            request_id: str, request_metadata: RequestResponseMetadata,
-            audio_duration_s: float) -> AsyncGenerator[str, None]:
+        self,
+        request: TranscriptionRequest,
+        result_generator: list[AsyncGenerator[RequestOutput, None]],
+        request_id: str,
+        request_metadata: RequestResponseMetadata,
+        audio_duration_s: float,
+    ) -> AsyncGenerator[str, None]:
         generator = self._speech_to_text_stream_generator(
             request=request,
             list_result_generator=result_generator,
@@ -85,25 +93,26 @@ class OpenAIServingTranslation(OpenAISpeechToText):
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         log_error_stack: bool = False,
+        enable_force_include_usage: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         task_type="translate",
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            task_type="translate",
+            log_error_stack=log_error_stack,
+            enable_force_include_usage=enable_force_include_usage,
+        )
 
     async def create_translation(
-        self, audio_data: bytes, request: TranslationRequest,
-        raw_request: Request
-    ) -> Union[TranslationResponse, AsyncGenerator[str, None], ErrorResponse]:
+        self, audio_data: bytes, request: TranslationRequest, raw_request: Request
+    ) -> TranslationResponse | AsyncGenerator[str, None] | ErrorResponse:
         """Translation API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/audio/createTranslation
@@ -118,10 +127,13 @@ class OpenAIServingTranslation(OpenAISpeechToText):
         )
 
     async def translation_stream_generator(
-            self, request: TranslationRequest,
-            result_generator: list[AsyncGenerator[RequestOutput, None]],
-            request_id: str, request_metadata: RequestResponseMetadata,
-            audio_duration_s: float) -> AsyncGenerator[str, None]:
+        self,
+        request: TranslationRequest,
+        result_generator: list[AsyncGenerator[RequestOutput, None]],
+        request_id: str,
+        request_metadata: RequestResponseMetadata,
+        audio_duration_s: float,
+    ) -> AsyncGenerator[str, None]:
         generator = self._speech_to_text_stream_generator(
             request=request,
             list_result_generator=result_generator,
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 965bdac3ac5ad2685f6b2181a6db5a4682018d09..b9b9b1ab30ad883f5346569c9b9d54e4ea3e294d 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -4,81 +4,91 @@ import asyncio
 import io
 import math
 import time
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from functools import cached_property
-from typing import Callable, Literal, Optional, TypeVar, Union, cast
+from typing import Literal, TypeAlias, TypeVar, cast
 
 import numpy as np
 from fastapi import Request
 
 import vllm.envs as envs
-from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
-    DeltaMessage, ErrorResponse, RequestResponseMetadata,
-    TranscriptionResponse, TranscriptionResponseStreamChoice,
-    TranscriptionStreamResponse, TranslationResponse,
-    TranslationResponseStreamChoice, TranslationStreamResponse, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
-                                                    SpeechToTextRequest)
+    DeltaMessage,
+    ErrorResponse,
+    RequestResponseMetadata,
+    TranscriptionResponse,
+    TranscriptionResponseStreamChoice,
+    TranscriptionStreamResponse,
+    TranslationResponse,
+    TranslationResponseStreamChoice,
+    TranslationStreamResponse,
+    UsageInfo,
+)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsTranscription
 from vllm.outputs import RequestOutput
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 try:
     import librosa
 except ImportError:
     librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 
-SpeechToTextResponse = Union[TranscriptionResponse, TranslationResponse]
+SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 T = TypeVar("T", bound=SpeechToTextResponse)
 
 logger = init_logger(__name__)
 
 
 class OpenAISpeechToText(OpenAIServing):
-    """Base class for speech-to-text operations like transcription and 
+    """Base class for speech-to-text operations like transcription and
     translation."""
 
     def __init__(
         self,
         engine_client: EngineClient,
-        model_config: ModelConfig,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         task_type: Literal["transcribe", "translate"] = "transcribe",
         log_error_stack: bool = False,
+        enable_force_include_usage: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         model_config=model_config,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         log_error_stack=log_error_stack)
-
-        self.default_sampling_params = (
-            self.model_config.get_diff_sampling_param())
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            log_error_stack=log_error_stack,
+        )
+
+        self.default_sampling_params = self.model_config.get_diff_sampling_param()
         self.task_type = task_type
 
         self.asr_config = self.model_cls.get_speech_to_text_config(
-            model_config, task_type)
+            self.model_config, task_type
+        )
+
+        self.enable_force_include_usage = enable_force_include_usage
 
         self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
 
         if self.default_sampling_params:
             logger.info(
                 "Overwriting default completion sampling param with: %s",
-                self.default_sampling_params)
+                self.default_sampling_params,
+            )
 
     @cached_property
     def model_cls(self) -> type[SupportsTranscription]:
         from vllm.model_executor.model_loader import get_model_cls
+
         model_cls = get_model_cls(self.model_config)
         return cast(type[SupportsTranscription], model_cls)
 
@@ -90,8 +100,11 @@ class OpenAISpeechToText(OpenAIServing):
         # Validate request
         language = self.model_cls.validate_language(request.language)
         # Skip to_language validation to avoid extra logging for Whisper.
-        to_language = self.model_cls.validate_language(request.to_language) \
-            if request.to_language else None
+        to_language = (
+            self.model_cls.validate_language(request.to_language)
+            if request.to_language
+            else None
+        )
 
         if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
             raise ValueError("Maximum file size exceeded.")
@@ -102,8 +115,10 @@ class OpenAISpeechToText(OpenAIServing):
             y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
 
         duration = librosa.get_duration(y=y, sr=sr)
-        do_split_audio = (self.asr_config.allow_audio_chunking
-                          and duration > self.asr_config.max_audio_clip_s)
+        do_split_audio = (
+            self.asr_config.allow_audio_chunking
+            and duration > self.asr_config.max_audio_clip_s
+        )
         chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
         prompts = []
         for chunk in chunks:
@@ -128,8 +143,8 @@ class OpenAISpeechToText(OpenAIServing):
         raw_request: Request,
         response_class: type[T],
         stream_generator_method: Callable[..., AsyncGenerator[str, None]],
-    ) -> Union[T, AsyncGenerator[str, None], ErrorResponse]:
-        """Base method for speech-to-text operations like transcription and 
+    ) -> T | AsyncGenerator[str, None] | ErrorResponse:
+        """Base method for speech-to-text operations like transcription and
         translation."""
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
@@ -141,9 +156,10 @@ class OpenAISpeechToText(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        if request.response_format not in ['text', 'json']:
+        if request.response_format not in ["text", "json"]:
             return self.create_error_response(
-                "Currently only support response_format `text` or `json`")
+                "Currently only support response_format `text` or `json`"
+            )
 
         request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"
 
@@ -154,11 +170,6 @@ class OpenAISpeechToText(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            if lora_request:
-                return self.create_error_response(
-                    "Currently do not support LoRA for "
-                    f"{self.task_type.title()}.")
-
             prompts, duration_s = await self._preprocess_speech_to_text(
                 request=request,
                 audio_data=audio_data,
@@ -168,38 +179,41 @@ class OpenAISpeechToText(OpenAIServing):
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
-        list_result_generator: Optional[list[AsyncGenerator[RequestOutput,
-                                                            None]]] = None
+        list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
         try:
             # Unlike most decoder-only models, whisper generation length is not
             # constrained by the size of the input audio, which is mapped to a
             # fixed-size log-mel-spectogram.
             default_max_tokens = self.model_config.max_model_len
             sampling_params = request.to_sampling_params(
-                default_max_tokens, self.default_sampling_params)
+                default_max_tokens, self.default_sampling_params
+            )
 
             self._log_inputs(
                 request_id,
                 # It will not display special tokens like <|startoftranscript|>
                 request.prompt,
                 params=sampling_params,
-                lora_request=None)
+                lora_request=lora_request,
+            )
 
             list_result_generator = [
                 self.engine_client.generate(
                     prompt,
                     sampling_params,
                     request_id,
-                ) for prompt in prompts
+                    lora_request=lora_request,
+                )
+                for prompt in prompts
             ]
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
         if request.stream:
-            return stream_generator_method(request, list_result_generator,
-                                           request_id, request_metadata,
-                                           duration_s)
+            return stream_generator_method(
+                request, list_result_generator, request_id, request_metadata, duration_s
+            )
         # Non-streaming response.
         try:
             assert list_result_generator is not None
@@ -215,12 +229,10 @@ class OpenAISpeechToText(OpenAIServing):
                     # rounded up as per openAI specs
                     "seconds": int(math.ceil(duration_s)),
                 }
-                final_response = cast(T, response_class(text=text,
-                                                        usage=usage))
+                final_response = cast(T, response_class(text=text, usage=usage))
             else:
                 # no usage in response for translation task
-                final_response = cast(
-                    T, response_class(text=text))  # type: ignore[call-arg]
+                final_response = cast(T, response_class(text=text))  # type: ignore[call-arg]
 
             return final_response
         except asyncio.CancelledError:
@@ -237,11 +249,10 @@ class OpenAISpeechToText(OpenAIServing):
         request_metadata: RequestResponseMetadata,
         audio_duration_s: float,
         chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
-        response_stream_choice_class: Union[
-            type[TranscriptionResponseStreamChoice],
-            type[TranslationResponseStreamChoice]],
-        stream_response_class: Union[type[TranscriptionStreamResponse],
-                                     type[TranslationStreamResponse]],
+        response_stream_choice_class: type[TranscriptionResponseStreamChoice]
+        | type[TranslationResponseStreamChoice],
+        stream_response_class: type[TranscriptionStreamResponse]
+        | type[TranslationStreamResponse],
     ) -> AsyncGenerator[str, None]:
         created_time = int(time.time())
         model_name = request.model
@@ -249,11 +260,12 @@ class OpenAISpeechToText(OpenAIServing):
         completion_tokens = 0
         num_prompt_tokens = 0
 
-        include_usage = request.stream_include_usage \
-            if request.stream_include_usage else False
-        include_continuous_usage = request.stream_continuous_usage_stats\
-            if include_usage and request.stream_continuous_usage_stats\
+        include_usage = self.enable_force_include_usage or request.stream_include_usage
+        include_continuous_usage = (
+            request.stream_continuous_usage_stats
+            if include_usage and request.stream_continuous_usage_stats
             else False
+        )
 
         try:
             for result_generator in list_result_generator:
@@ -262,8 +274,8 @@ class OpenAISpeechToText(OpenAIServing):
                     if res.prompt_token_ids is not None:
                         num_prompt_tokens = len(res.prompt_token_ids)
                         if audio_tokens := self.model_cls.get_num_audio_tokens(
-                                audio_duration_s, self.asr_config,
-                                self.model_config):
+                            audio_duration_s, self.asr_config, self.model_config
+                        ):
                             num_prompt_tokens += audio_tokens
 
                     # We need to do it here, because if there are exceptions in
@@ -279,20 +291,22 @@ class OpenAISpeechToText(OpenAIServing):
 
                     if output.finish_reason is None:
                         # Still generating, send delta update.
-                        choice_data = response_stream_choice_class(
-                            delta=delta_message)
+                        choice_data = response_stream_choice_class(delta=delta_message)
                     else:
                         # Model is finished generating.
                         choice_data = response_stream_choice_class(
                             delta=delta_message,
                             finish_reason=output.finish_reason,
-                            stop_reason=output.stop_reason)
+                            stop_reason=output.stop_reason,
+                        )
 
-                    chunk = stream_response_class(id=request_id,
-                                                  object=chunk_object_type,
-                                                  created=created_time,
-                                                  choices=[choice_data],
-                                                  model=model_name)
+                    chunk = stream_response_class(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name,
+                    )
 
                     # handle usage stats if requested & if continuous
                     if include_continuous_usage:
@@ -308,10 +322,11 @@ class OpenAISpeechToText(OpenAIServing):
             # Once the final token is handled, if stream_options.include_usage
             # is sent, send the usage.
             if include_usage:
-                final_usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                                        completion_tokens=completion_tokens,
-                                        total_tokens=num_prompt_tokens +
-                                        completion_tokens)
+                final_usage = UsageInfo(
+                    prompt_tokens=num_prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=num_prompt_tokens + completion_tokens,
+                )
 
                 final_usage_chunk = stream_response_class(
                     id=request_id,
@@ -319,16 +334,19 @@ class OpenAISpeechToText(OpenAIServing):
                     created=created_time,
                     choices=[],
                     model=model_name,
-                    usage=final_usage)
-                final_usage_data = (final_usage_chunk.model_dump_json(
-                    exclude_unset=True, exclude_none=True))
+                    usage=final_usage,
+                )
+                final_usage_data = final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True
+                )
                 yield f"data: {final_usage_data}\n\n"
 
             # report to FastAPI middleware aggregate usage across all choices
             request_metadata.final_usage_info = UsageInfo(
                 prompt_tokens=num_prompt_tokens,
                 completion_tokens=completion_tokens,
-                total_tokens=num_prompt_tokens + completion_tokens)
+                total_tokens=num_prompt_tokens + completion_tokens,
+            )
 
         except Exception as e:
             # TODO: Use a vllm-specific Validation Error
@@ -338,8 +356,9 @@ class OpenAISpeechToText(OpenAIServing):
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
 
-    def _split_audio(self, audio_data: np.ndarray,
-                     sample_rate: int) -> list[np.ndarray]:
+    def _split_audio(
+        self, audio_data: np.ndarray, sample_rate: int
+    ) -> list[np.ndarray]:
         chunk_size = sample_rate * self.asr_config.max_audio_clip_s
         overlap_size = sample_rate * self.asr_config.overlap_chunk_second
         chunks = []
@@ -353,17 +372,15 @@ class OpenAISpeechToText(OpenAIServing):
             # Find the best split point in the overlap region
             search_start = i + chunk_size - overlap_size
             search_end = min(i + chunk_size, audio_data.shape[-1])
-            split_point = self._find_split_point(audio_data, search_start,
-                                                 search_end)
+            split_point = self._find_split_point(audio_data, search_start, search_end)
 
             # Extract chunk up to the split point
             chunks.append(audio_data[..., i:split_point])
             i = split_point
         return chunks
 
-    def _find_split_point(self, wav: np.ndarray, start_idx: int,
-                          end_idx: int) -> int:
-        """Find the best point to split audio by 
+    def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
+        """Find the best point to split audio by
         looking for silence or low amplitude.
         Args:
             wav: Audio tensor [1, T]
@@ -380,8 +397,8 @@ class OpenAISpeechToText(OpenAIServing):
         min_energy_window = self.asr_config.min_energy_split_window_size
         assert min_energy_window is not None
         for i in range(0, len(segment) - min_energy_window, min_energy_window):
-            window = segment[i:i + min_energy_window]
-            energy = (window**2).mean()**0.5
+            window = segment[i : i + min_energy_window]
+            energy = (window**2).mean() ** 0.5
             if energy < min_energy:
                 quietest_idx = i + start_idx
                 min_energy = energy
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 2c5a0a6af23f06d8d38311a1e3b719ee927efb3e..89e439dd53f5fe5e506ea47e254eeb833f503586 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -1,55 +1,142 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .abstract_tool_parser import ToolParser, ToolParserManager
-from .deepseekv3_tool_parser import DeepSeekV3ToolParser
-from .deepseekv31_tool_parser import DeepSeekV31ToolParser
-from .glm4_moe_tool_parser import Glm4MoeModelToolParser
-from .granite_20b_fc_tool_parser import Granite20bFCToolParser
-from .granite_tool_parser import GraniteToolParser
-from .hermes_tool_parser import Hermes2ProToolParser
-from .hunyuan_a13b_tool_parser import HunyuanA13BToolParser
-from .internlm2_tool_parser import Internlm2ToolParser
-from .jamba_tool_parser import JambaToolParser
-from .kimi_k2_tool_parser import KimiK2ToolParser
-from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
-from .llama_tool_parser import Llama3JsonToolParser
-from .longcat_tool_parser import LongcatFlashToolParser
-from .minimax_tool_parser import MinimaxToolParser
-from .mistral_tool_parser import MistralToolParser
-from .openai_tool_parser import OpenAIToolParser
-from .phi4mini_tool_parser import Phi4MiniJsonToolParser
-from .pythonic_tool_parser import PythonicToolParser
-from .qwen3coder_tool_parser import Qwen3CoderToolParser
-from .qwen3xml_tool_parser import Qwen3XMLToolParser
-from .seed_oss_tool_parser import SeedOssToolParser
-from .step3_tool_parser import Step3ToolParser
-from .xlam_tool_parser import xLAMToolParser
-
-__all__ = [
-    "ToolParser",
-    "ToolParserManager",
-    "Granite20bFCToolParser",
-    "GraniteToolParser",
-    "Hermes2ProToolParser",
-    "MistralToolParser",
-    "Internlm2ToolParser",
-    "Llama3JsonToolParser",
-    "JambaToolParser",
-    "Llama4PythonicToolParser",
-    "LongcatFlashToolParser",
-    "PythonicToolParser",
-    "Phi4MiniJsonToolParser",
-    "DeepSeekV3ToolParser",
-    "DeepSeekV31ToolParser",
-    "xLAMToolParser",
-    "MinimaxToolParser",
-    "KimiK2ToolParser",
-    "HunyuanA13BToolParser",
-    "Glm4MoeModelToolParser",
-    "Qwen3CoderToolParser",
-    "Qwen3XMLToolParser",
-    "SeedOssToolParser",
-    "Step3ToolParser",
-    "OpenAIToolParser",
-]
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+    ToolParserManager,
+)
+
+__all__ = ["ToolParser", "ToolParserManager"]
+
+
+"""
+Register a lazy module mapping.
+
+Example:
+    ToolParserManager.register_lazy_module(
+        name="kimi_k2",
+        module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser",
+        class_name="KimiK2ToolParser",
+    )
+"""
+
+
+_TOOL_PARSERS_TO_REGISTER = {
+    "deepseek_v3": (  # name
+        "deepseekv3_tool_parser",  # filename
+        "DeepSeekV3ToolParser",  # class_name
+    ),
+    "deepseek_v31": (
+        "deepseekv31_tool_parser",
+        "DeepSeekV31ToolParser",
+    ),
+    "ernie45": (
+        "ernie45_tool_parser",
+        "Ernie45ToolParser",
+    ),
+    "glm45": (
+        "glm4_moe_tool_parser",
+        "Glm4MoeModelToolParser",
+    ),
+    "granite-20b-fc": (
+        "granite_20b_fc_tool_parser",
+        "Granite20bFCToolParser",
+    ),
+    "granite": (
+        "granite_tool_parser",
+        "GraniteToolParser",
+    ),
+    "hermes": (
+        "hermes_tool_parser",
+        "Hermes2ProToolParser",
+    ),
+    "hunyuan_a13b": (
+        "hunyuan_a13b_tool_parser",
+        "HunyuanA13BToolParser",
+    ),
+    "internlm": (
+        "internlm2_tool_parser",
+        "Internlm2ToolParser",
+    ),
+    "jamba": (
+        "jamba_tool_parser",
+        "JambaToolParser",
+    ),
+    "kimi_k2": (
+        "kimi_k2_tool_parser",
+        "KimiK2ToolParser",
+    ),
+    "llama3_json": (
+        "llama_tool_parser",
+        "Llama3JsonToolParser",
+    ),
+    "llama4_json": (
+        "llama_tool_parser",
+        "Llama3JsonToolParser",
+    ),
+    "llama4_pythonic": (
+        "llama4_pythonic_tool_parser",
+        "Llama4PythonicToolParser",
+    ),
+    "longcat": (
+        "longcat_tool_parser",
+        "LongcatFlashToolParser",
+    ),
+    "minimax_m2": (
+        "minimax_m2_tool_parser",
+        "MinimaxM2ToolParser",
+    ),
+    "minimax": (
+        "minimax_tool_parser",
+        "MinimaxToolParser",
+    ),
+    "mistral": (
+        "mistral_tool_parser",
+        "MistralToolParser",
+    ),
+    "olmo3": (
+        "olmo3_tool_parser",
+        "Olmo3PythonicToolParser",
+    ),
+    "openai": (
+        "openai_tool_parser",
+        "OpenAIToolParser",
+    ),
+    "phi4_mini_json": (
+        "phi4mini_tool_parser",
+        "Phi4MiniJsonToolParser",
+    ),
+    "pythonic": (
+        "pythonic_tool_parser",
+        "PythonicToolParser",
+    ),
+    "qwen3_coder": (
+        "qwen3coder_tool_parser",
+        "Qwen3CoderToolParser",
+    ),
+    "qwen3_xml": (
+        "qwen3xml_tool_parser",
+        "Qwen3XMLToolParser",
+    ),
+    "seed_oss": (
+        "seed_oss_tool_parser",
+        "SeedOssToolParser",
+    ),
+    "step3": (
+        "step3_tool_parser",
+        "Step3ToolParser",
+    ),
+    "xlam": (
+        "xlam_tool_parser",
+        "xLAMToolParser",
+    ),
+}
+
+
+def register_lazy_tool_parsers():
+    for name, (file_name, class_name) in _TOOL_PARSERS_TO_REGISTER.items():
+        module_path = f"vllm.entrypoints.openai.tool_parsers.{file_name}"
+        ToolParserManager.register_lazy_module(name, module_path, class_name)
+
+
+register_lazy_tool_parsers()
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 02aeab6136316b1a5888e3e617213c0c55d15df0..e99e405f5de65b2bef3bc59a27cdea671609a8ab 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -1,17 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
 import os
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from functools import cached_property
-from typing import Callable, Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage,
-                                              ExtractedToolCallInformation)
+from openai.types.responses.response_format_text_json_schema_config import (
+    ResponseFormatTextJSONSchemaConfig,
+)
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ExtractedToolCallInformation,
+    ResponsesRequest,
+    ResponseTextConfig,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
 from vllm.logger import init_logger
+from vllm.sampling_params import (
+    StructuredOutputsParams,
+)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import import_from_path, is_list_of
+from vllm.utils.collection_utils import is_list_of
+from vllm.utils.import_utils import import_from_path
 
 logger = init_logger(__name__)
 
@@ -38,16 +51,37 @@ class ToolParser:
         # whereas all tokenizers have .get_vocab()
         return self.model_tokenizer.get_vocab()
 
-    def adjust_request(
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
         """
         Static method that used to adjust the request parameters.
         """
+        if not request.tools:
+            return request
+        json_schema_from_tool = get_json_schema_from_tools(
+            tool_choice=request.tool_choice, tools=request.tools
+        )
+        # Set structured output params for tool calling
+        if json_schema_from_tool is not None:
+            if isinstance(request, ChatCompletionRequest):
+                request.structured_outputs = StructuredOutputsParams()
+                # tool_choice: "Forced Function" or "required" will override
+                # structured output json settings to make tool calling work correctly
+                request.structured_outputs.json = json_schema_from_tool
+            if isinstance(request, ResponsesRequest):
+                request.text = ResponseTextConfig()
+                request.text.format = ResponseFormatTextJSONSchemaConfig(
+                    name="tool_calling_response",
+                    schema=json_schema_from_tool,
+                    type="json_schema",
+                    description="Response format for tool calling",
+                    strict=True,
+                )
+
         return request
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Static method that should be implemented for extracting tool calls from
         a complete model-generated string.
@@ -56,7 +90,8 @@ class ToolParser:
         Static because it's stateless.
         """
         raise NotImplementedError(
-            "AbstractToolParser.extract_tool_calls has not been implemented!")
+            "AbstractToolParser.extract_tool_calls has not been implemented!"
+        )
 
     def extract_tool_calls_streaming(
         self,
@@ -67,7 +102,7 @@ class ToolParser:
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Instance method that should be implemented for extracting tool calls
         from an incomplete response; for use when handling tool calls and
@@ -76,89 +111,163 @@ class ToolParser:
         previously been parsed and extracted (see constructor)
         """
         raise NotImplementedError(
-            "AbstractToolParser.extract_tool_calls_streaming has not been "
-            "implemented!")
+            "AbstractToolParser.extract_tool_calls_streaming has not been implemented!"
+        )
 
 
 class ToolParserManager:
-    tool_parsers: dict[str, type] = {}
+    """
+    Central registry for ToolParser implementations.
+
+    Supports two modes:
+      - Eager (immediate) registration via `register_module`
+      - Lazy registration via `register_lazy_module`
+    """
+
+    tool_parsers: dict[str, type[ToolParser]] = {}
+    lazy_parsers: dict[str, tuple[str, str]] = {}  # name -> (module_path, class_name)
 
     @classmethod
-    def get_tool_parser(cls, name) -> type:
+    def get_tool_parser(cls, name: str) -> type[ToolParser]:
         """
-        Get tool parser by name which is registered by `register_module`.
+        Retrieve a registered or lazily registered ToolParser class.
 
-        Raise a KeyError exception if the name is not registered.
+        If the parser is lazily registered,
+        it will be imported and cached on first access.
+        Raises KeyError if not found.
         """
         if name in cls.tool_parsers:
             return cls.tool_parsers[name]
 
-        raise KeyError(f"tool helper: '{name}' not found in tool_parsers")
+        if name in cls.lazy_parsers:
+            return cls._load_lazy_parser(name)
+
+        raise KeyError(f"Tool parser '{name}' not found.")
+
+    @classmethod
+    def _load_lazy_parser(cls, name: str) -> type[ToolParser]:
+        """Import and register a lazily loaded parser."""
+        module_path, class_name = cls.lazy_parsers[name]
+        try:
+            mod = importlib.import_module(module_path)
+            parser_cls = getattr(mod, class_name)
+            if not issubclass(parser_cls, ToolParser):
+                raise TypeError(
+                    f"{class_name} in {module_path} is not a ToolParser subclass."
+                )
+            cls.tool_parsers[name] = parser_cls  # cache
+            return parser_cls
+        except Exception as e:
+            logger.exception(
+                "Failed to import lazy tool parser '%s' from %s: %s",
+                name,
+                module_path,
+                e,
+            )
+            raise
 
     @classmethod
-    def _register_module(cls,
-                         module: type,
-                         module_name: Optional[Union[str, list[str]]] = None,
-                         force: bool = True) -> None:
+    def _register_module(
+        cls,
+        module: type[ToolParser],
+        module_name: str | list[str] | None = None,
+        force: bool = True,
+    ) -> None:
+        """Register a ToolParser class immediately."""
         if not issubclass(module, ToolParser):
             raise TypeError(
-                f'module must be subclass of ToolParser, but got {type(module)}'
+                f"module must be subclass of ToolParser, but got {type(module)}"
             )
+
         if module_name is None:
             module_name = module.__name__
+
         if isinstance(module_name, str):
-            module_name = [module_name]
-        for name in module_name:
+            module_names = [module_name]
+        elif is_list_of(module_name, str):
+            module_names = module_name
+        else:
+            raise TypeError("module_name must be str, list[str], or None.")
+
+        for name in module_names:
             if not force and name in cls.tool_parsers:
-                existed_module = cls.tool_parsers[name]
-                raise KeyError(f'{name} is already registered '
-                               f'at {existed_module.__module__}')
+                existed = cls.tool_parsers[name]
+                raise KeyError(f"{name} is already registered at {existed.__module__}")
             cls.tool_parsers[name] = module
 
+    @classmethod
+    def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None:
+        """
+        Register a lazy module mapping.
+
+        Example:
+            ToolParserManager.register_lazy_module(
+                name="kimi_k2",
+                module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser",
+                class_name="KimiK2ToolParser",
+            )
+        """
+        cls.lazy_parsers[name] = (module_path, class_name)
+
     @classmethod
     def register_module(
-            cls,
-            name: Optional[Union[str, list[str]]] = None,
-            force: bool = True,
-            module: Union[type, None] = None) -> Union[type, Callable]:
+        cls,
+        name: str | list[str] | None = None,
+        force: bool = True,
+        module: type[ToolParser] | None = None,
+    ) -> type[ToolParser] | Callable[[type[ToolParser]], type[ToolParser]]:
         """
-        Register module with the given name or name list. it can be used as a
-        decoder(with module as None) or normal function(with module as not 
-        None).
+        Register module immediately or lazily (as a decorator).
+
+        Usage:
+            @ToolParserManager.register_module("kimi_k2")
+            class KimiK2ToolParser(ToolParser):
+                ...
+
+        Or:
+            ToolParserManager.register_module(module=SomeToolParser)
         """
         if not isinstance(force, bool):
-            raise TypeError(f'force must be a boolean, but got {type(force)}')
+            raise TypeError(f"force must be a boolean, but got {type(force)}")
 
-        # raise the error ahead of time
-        if not (name is None or isinstance(name, str)
-                or is_list_of(name, str)):
-            raise TypeError(
-                'name must be None, an instance of str, or a sequence of str, '
-                f'but got {type(name)}')
-
-        # use it as a normal method: x.register_module(module=SomeClass)
+        # Immediate registration
         if module is not None:
             cls._register_module(module=module, module_name=name, force=force)
             return module
 
-        # use it as a decorator: @x.register_module()
-        def _register(module):
-            cls._register_module(module=module, module_name=name, force=force)
-            return module
+        # Decorator usage
+        def _decorator(obj: type[ToolParser]) -> type[ToolParser]:
+            module_path = obj.__module__
+            class_name = obj.__name__
+
+            if isinstance(name, str):
+                names = [name]
+            elif name is not None and is_list_of(name, str):
+                names = name
+            else:
+                names = [class_name]
 
-        return _register
+            for n in names:
+                # Lazy mapping only: do not import now
+                cls.lazy_parsers[n] = (module_path, class_name)
+
+            return obj
+
+        return _decorator
+
+    @classmethod
+    def list_registered(cls) -> list[str]:
+        """Return names of all eagerly and lazily registered tool parsers."""
+        return sorted(set(cls.tool_parsers.keys()) | set(cls.lazy_parsers.keys()))
 
     @classmethod
     def import_tool_parser(cls, plugin_path: str) -> None:
-        """
-        Import a user-defined tool parser by the path of the tool parser define
-        file.
-        """
-        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+        """Import a user-defined parser file from arbitrary path."""
 
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
         try:
             import_from_path(module_name, plugin_path)
         except Exception:
-            logger.exception("Failed to load module '%s' from %s.",
-                             module_name, plugin_path)
-            return
+            logger.exception(
+                "Failed to load module '%s' from %s.", module_name, plugin_path
+            )
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
index 09095f8991773f1f4134c36e400ad80e4b2cd7eb..cbeb879969ece9d16d583909777cfacfde6d6449 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
@@ -2,35 +2,38 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Union
 
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("deepseek_v31")
 class DeepSeekV31ToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[str] = (
-            [])  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
 
         self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
         self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"
@@ -43,41 +46,43 @@ class DeepSeekV31ToolParser(ToolParser):
         )
 
         self.stream_tool_call_portion_regex = re.compile(
-            r"(?P<function_name>.*)<｜tool▁sep｜>(?P<function_arguments>.*)")
+            r"(?P<function_name>.*)<｜tool▁sep｜>(?P<function_arguments>.*)"
+        )
 
         self.stream_tool_call_name_regex = re.compile(
-            r"(?P<function_name>.*)<｜tool▁sep｜>")
+            r"(?P<function_name>.*)<｜tool▁sep｜>"
+        )
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
-        self.tool_calls_start_token_id = self.vocab.get(
-            self.tool_calls_start_token)
-        self.tool_calls_end_token_id = self.vocab.get(
-            self.tool_calls_end_token)
-
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+                "constructor during construction."
+            )
+        self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
+
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
 
-        if (self.tool_calls_start_token_id is None
-                or self.tool_calls_end_token_id is None):
+        if (
+            self.tool_calls_start_token_id is None
+            or self.tool_calls_end_token_id is None
+        ):
             raise RuntimeError(
                 "DeepSeek-V3.1 Tool parser could not locate tool call "
-                "start/end tokens in the tokenizer!")
+                "start/end tokens in the tokenizer!"
+            )
 
     def extract_tool_calls(
         self,
         model_output: str,
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
-
         # sanity check; avoid unnecessary processing
         if self.tool_calls_start_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         else:
             try:
@@ -85,8 +90,7 @@ class DeepSeekV31ToolParser(ToolParser):
                 # tag and end-of-string so the result of
                 # findall is an array of tuples where one is a function call and
                 # the other is None
-                function_call_tuples = self.tool_call_regex.findall(
-                    model_output)
+                function_call_tuples = self.tool_call_regex.findall(model_output)
 
                 tool_calls = []
                 for match in function_call_tuples:
@@ -94,12 +98,13 @@ class DeepSeekV31ToolParser(ToolParser):
                     tool_calls.append(
                         ToolCall(
                             type="function",
-                            function=FunctionCall(name=function_name,
-                                                  arguments=function_args),
-                        ))
+                            function=FunctionCall(
+                                name=function_name, arguments=function_args
+                            ),
+                        )
+                    )
 
-                content = model_output[:model_output.
-                                       find(self.tool_calls_start_token)]
+                content = model_output[: model_output.find(self.tool_calls_start_token)]
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=tool_calls,
@@ -107,11 +112,10 @@ class DeepSeekV31ToolParser(ToolParser):
                 )
 
             except Exception:
-                logger.exception(
-                    "Error in extracting tool call from response.")
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                logger.exception("Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
     def extract_tool_calls_streaming(
         self,
@@ -122,56 +126,59 @@ class DeepSeekV31ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
         # check to see if we should be streaming a tool call - is there a
         if self.tool_calls_start_token_id not in current_token_ids:
             logger.debug("No tool call tokens found!")
             return DeltaMessage(content=delta_text)
-        delta_text = delta_text.replace(self.tool_calls_start_token,
-                                        "").replace(self.tool_calls_end_token,
-                                                    "")
+        delta_text = delta_text.replace(self.tool_calls_start_token, "").replace(
+            self.tool_calls_end_token, ""
+        )
         try:
-
             # figure out where we are in the parsing by counting tool call
             # start & end tags
             prev_tool_start_count = previous_token_ids.count(
-                self.tool_call_start_token_id)
-            prev_tool_end_count = previous_token_ids.count(
-                self.tool_call_end_token_id)
+                self.tool_call_start_token_id
+            )
+            prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id)
             cur_tool_start_count = current_token_ids.count(
-                self.tool_call_start_token_id)
-            cur_tool_end_count = current_token_ids.count(
-                self.tool_call_end_token_id)
+                self.tool_call_start_token_id
+            )
+            cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id)
             tool_call_portion = None
             text_portion = None
 
             # case: if we're generating text, OR rounding out a tool call
-            if (cur_tool_start_count == cur_tool_end_count
-                    and prev_tool_end_count == cur_tool_end_count
-                    and self.tool_call_end_token not in delta_text):
+            if (
+                cur_tool_start_count == cur_tool_end_count
+                and prev_tool_end_count == cur_tool_end_count
+                and self.tool_call_end_token not in delta_text
+            ):
                 logger.debug("Generating text content! skipping tool parsing.")
                 return DeltaMessage(content=delta_text)
 
             if self.tool_call_end_token in delta_text:
                 logger.debug("tool_call_end_token in delta_text")
                 full_text = current_text + delta_text
-                tool_call_portion = full_text.split(
-                    self.tool_call_start_token)[-1].split(
-                        self.tool_call_end_token)[0].rstrip()
-                delta_text = delta_text.split(
-                    self.tool_call_end_token)[0].rstrip()
-                text_portion = delta_text.split(
-                    self.tool_call_end_token)[-1].lstrip()
+                tool_call_portion = (
+                    full_text.split(self.tool_call_start_token)[-1]
+                    .split(self.tool_call_end_token)[0]
+                    .rstrip()
+                )
+                delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip()
 
             # case -- we're starting a new tool call
-            if (cur_tool_start_count > cur_tool_end_count
-                    and cur_tool_start_count > prev_tool_start_count):
+            if (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count > prev_tool_start_count
+            ):
                 if len(delta_token_ids) > 1:
-                    tool_call_portion = current_text.split(
-                        self.tool_call_start_token)[-1]
+                    tool_call_portion = current_text.split(self.tool_call_start_token)[
+                        -1
+                    ]
                 else:
                     tool_call_portion = None
                     delta = None
@@ -185,27 +192,29 @@ class DeepSeekV31ToolParser(ToolParser):
                 logger.debug("Starting on a new tool %s", self.current_tool_id)
 
             # case -- we're updating an existing tool call
-            elif (cur_tool_start_count > cur_tool_end_count
-                  and cur_tool_start_count == prev_tool_start_count):
-
+            elif (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count == prev_tool_start_count
+            ):
                 # get the portion of the text that's the tool call
-                tool_call_portion = current_text.split(
-                    self.tool_call_start_token)[-1]
+                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
                 text_portion = None
 
             # case -- the current tool call is being closed.
-            elif (cur_tool_start_count == cur_tool_end_count
-                  and cur_tool_end_count >= prev_tool_end_count):
-                if self.prev_tool_call_arr is None or len(
-                        self.prev_tool_call_arr) == 0:
-                    logger.debug(
-                        "attempting to close tool call, but no tool call")
+            elif (
+                cur_tool_start_count == cur_tool_end_count
+                and cur_tool_end_count >= prev_tool_end_count
+            ):
+                if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0:
+                    logger.debug("attempting to close tool call, but no tool call")
                     return None
-                diff = self.prev_tool_call_arr[self.current_tool_id].get(
-                    "arguments")
+                diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments")
                 if diff:
-                    diff = (diff.encode("utf-8").decode("unicode_escape")
-                            if diff is str else diff)
+                    diff = (
+                        diff.encode("utf-8").decode("unicode_escape")
+                        if diff is str
+                        else diff
+                    )
                     if '"}' not in delta_text:
                         return None
                     end_loc = delta_text.rindex('"}')
@@ -216,13 +225,16 @@ class DeepSeekV31ToolParser(ToolParser):
                         diff,
                     )
                     self.streamed_args_for_tool[self.current_tool_id] += diff
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=diff).model_dump(exclude_none=True),
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(arguments=diff).model_dump(
+                                    exclude_none=True
+                                ),
+                            )
+                        ]
+                    )
 
             # case -- otherwise we're just generating text
             else:
@@ -233,17 +245,17 @@ class DeepSeekV31ToolParser(ToolParser):
 
             current_tool_call = dict()
             if tool_call_portion:
-                current_tool_call_matches = (
-                    self.stream_tool_call_portion_regex.match(
-                        tool_call_portion))
+                current_tool_call_matches = self.stream_tool_call_portion_regex.match(
+                    tool_call_portion
+                )
                 if current_tool_call_matches:
                     tool_name, tool_args = current_tool_call_matches.groups()
                     current_tool_call["name"] = tool_name
                     current_tool_call["arguments"] = tool_args
                 else:
                     current_tool_call_name_matches = (
-                        self.stream_tool_call_name_regex.match(
-                            tool_call_portion))
+                        self.stream_tool_call_name_regex.match(tool_call_portion)
+                    )
                     if current_tool_call_name_matches:
                         tool_name = current_tool_call_name_matches.groups()
                         current_tool_call["name"] = tool_name
@@ -257,19 +269,21 @@ class DeepSeekV31ToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 if function_name:
                     self.current_tool_name_sent = True
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            type="function",
-                            id=make_tool_call_id(),
-                            function=DeltaFunctionCall(
-                                name=function_name).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                 else:
                     return None
 
@@ -279,15 +293,19 @@ class DeepSeekV31ToolParser(ToolParser):
             if tool_call_portion is None:
                 # if there's text but not tool calls, send that -
                 # otherwise None to skip chunk
-                delta = (DeltaMessage(
-                    content=delta_text) if text_portion is not None else None)
+                delta = (
+                    DeltaMessage(content=delta_text)
+                    if text_portion is not None
+                    else None
+                )
                 return delta
 
             # now, the nitty-gritty of tool calls
             # now we have the portion to parse as tool call.
 
-            logger.debug("Trying to parse current tool call with ID %s",
-                         self.current_tool_id)
+            logger.debug(
+                "Trying to parse current tool call with ID %s", self.current_tool_id
+            )
 
             # if we're starting a new tool call, push an empty object in as
             #   a placeholder for the arguments
@@ -297,7 +315,8 @@ class DeepSeekV31ToolParser(ToolParser):
             # main logic for tool parsing here - compare prev. partially-parsed
             #   JSON to the current partially-parsed JSON
             prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
-                "arguments")
+                "arguments"
+            )
             cur_arguments = current_tool_call.get("arguments")
 
             logger.debug("diffing old arguments: %s", prev_arguments)
@@ -311,52 +330,56 @@ class DeepSeekV31ToolParser(ToolParser):
             # case -- prev arguments are defined, but non are now.
             #   probably impossible, but not a fatal error - just keep going
             elif not cur_arguments and prev_arguments:
-                logger.error("should be impossible to have arguments reset "
-                             "mid-call. skipping streaming anything.")
+                logger.error(
+                    "should be impossible to have arguments reset "
+                    "mid-call. skipping streaming anything."
+                )
                 delta = None
 
             # case -- we now have the first info about arguments available from
             #   autocompleting the JSON
             elif cur_arguments and not prev_arguments:
-
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_id,
-                        function=DeltaFunctionCall(
-                            arguments=cur_arguments).model_dump(
-                                exclude_none=True),
-                    )
-                ])
-                self.streamed_args_for_tool[
-                    self.current_tool_id] = cur_arguments
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=cur_arguments
+                            ).model_dump(exclude_none=True),
+                        )
+                    ]
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
 
             # last case -- we have an update to existing arguments.
             elif cur_arguments and prev_arguments:
-                if (isinstance(delta_text, str)
-                        and cur_arguments != prev_arguments
-                        and len(cur_arguments) > len(prev_arguments)
-                        and cur_arguments.startswith(prev_arguments)):
-                    delta_arguments = cur_arguments[len(prev_arguments):]
+                if (
+                    isinstance(delta_text, str)
+                    and cur_arguments != prev_arguments
+                    and len(cur_arguments) > len(prev_arguments)
+                    and cur_arguments.startswith(prev_arguments)
+                ):
+                    delta_arguments = cur_arguments[len(prev_arguments) :]
                     logger.debug("got diff %s", delta_text)
 
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=delta_arguments).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] = cur_arguments
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=delta_arguments
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
                 else:
                     delta = None
 
             # handle saving the state for the current tool into
             # the "prev" list for use in diffing for the next iteration
             if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
-                self.prev_tool_call_arr[
-                    self.current_tool_id] = current_tool_call
+                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
             else:
                 self.prev_tool_call_arr.append(current_tool_call)
 
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index ac272b0c3b2054aea0a89c35c5d332de9829b952..bf7f6fa61ab90f2f428aa204aa2edf2ca7bb61fa 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -2,35 +2,38 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Union
 
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("deepseek_v3")
 class DeepSeekV3ToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[str] = (
-            [])  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
 
         self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
         self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"
@@ -47,38 +50,39 @@ class DeepSeekV3ToolParser(ToolParser):
         )
 
         self.stream_tool_call_name_regex = re.compile(
-            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n")
+            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n"
+        )
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
-        self.tool_calls_start_token_id = self.vocab.get(
-            self.tool_calls_start_token)
-        self.tool_calls_end_token_id = self.vocab.get(
-            self.tool_calls_end_token)
-
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+                "constructor during construction."
+            )
+        self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
+
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
 
-        if (self.tool_calls_start_token_id is None
-                or self.tool_calls_end_token_id is None):
+        if (
+            self.tool_calls_start_token_id is None
+            or self.tool_calls_end_token_id is None
+        ):
             raise RuntimeError(
                 "DeepSeek-V3 Tool parser could not locate tool call start/end "
-                "tokens in the tokenizer!")
+                "tokens in the tokenizer!"
+            )
 
     def extract_tool_calls(
         self,
         model_output: str,
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
-
         # sanity check; avoid unnecessary processing
         if self.tool_calls_start_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         else:
             try:
@@ -86,8 +90,7 @@ class DeepSeekV3ToolParser(ToolParser):
                 # tag and end-of-string so the result of
                 # findall is an array of tuples where one is a function call and
                 # the other is None
-                function_call_tuples = self.tool_call_regex.findall(
-                    model_output)
+                function_call_tuples = self.tool_call_regex.findall(model_output)
 
                 tool_calls = []
                 for match in function_call_tuples:
@@ -95,12 +98,13 @@ class DeepSeekV3ToolParser(ToolParser):
                     tool_calls.append(
                         ToolCall(
                             type=tool_type,
-                            function=FunctionCall(name=function_name,
-                                                  arguments=function_args),
-                        ))
+                            function=FunctionCall(
+                                name=function_name, arguments=function_args
+                            ),
+                        )
+                    )
 
-                content = model_output[:model_output.
-                                       find(self.tool_calls_start_token)]
+                content = model_output[: model_output.find(self.tool_calls_start_token)]
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=tool_calls,
@@ -108,11 +112,10 @@ class DeepSeekV3ToolParser(ToolParser):
                 )
 
             except Exception:
-                logger.exception(
-                    "Error in extracting tool call from response.")
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                logger.exception("Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
     def extract_tool_calls_streaming(
         self,
@@ -123,56 +126,59 @@ class DeepSeekV3ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
         # check to see if we should be streaming a tool call - is there a
         if self.tool_calls_start_token_id not in current_token_ids:
             logger.debug("No tool call tokens found!")
             return DeltaMessage(content=delta_text)
-        delta_text = delta_text.replace(self.tool_calls_start_token,
-                                        "").replace(self.tool_calls_end_token,
-                                                    "")
+        delta_text = delta_text.replace(self.tool_calls_start_token, "").replace(
+            self.tool_calls_end_token, ""
+        )
         try:
-
             # figure out where we are in the parsing by counting tool call
             # start & end tags
             prev_tool_start_count = previous_token_ids.count(
-                self.tool_call_start_token_id)
-            prev_tool_end_count = previous_token_ids.count(
-                self.tool_call_end_token_id)
+                self.tool_call_start_token_id
+            )
+            prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id)
             cur_tool_start_count = current_token_ids.count(
-                self.tool_call_start_token_id)
-            cur_tool_end_count = current_token_ids.count(
-                self.tool_call_end_token_id)
+                self.tool_call_start_token_id
+            )
+            cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id)
             tool_call_portion = None
             text_portion = None
 
             # case: if we're generating text, OR rounding out a tool call
-            if (cur_tool_start_count == cur_tool_end_count
-                    and prev_tool_end_count == cur_tool_end_count
-                    and self.tool_call_end_token not in delta_text):
+            if (
+                cur_tool_start_count == cur_tool_end_count
+                and prev_tool_end_count == cur_tool_end_count
+                and self.tool_call_end_token not in delta_text
+            ):
                 logger.debug("Generating text content! skipping tool parsing.")
                 return DeltaMessage(content=delta_text)
 
             if self.tool_call_end_token in delta_text:
                 logger.debug("tool_call_end_token in delta_text")
                 full_text = current_text + delta_text
-                tool_call_portion = full_text.split(
-                    self.tool_call_start_token)[-1].split(
-                        self.tool_call_end_token)[0].rstrip()
-                delta_text = delta_text.split(
-                    self.tool_call_end_token)[0].rstrip()
-                text_portion = delta_text.split(
-                    self.tool_call_end_token)[-1].lstrip()
+                tool_call_portion = (
+                    full_text.split(self.tool_call_start_token)[-1]
+                    .split(self.tool_call_end_token)[0]
+                    .rstrip()
+                )
+                delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip()
 
             # case -- we're starting a new tool call
-            if (cur_tool_start_count > cur_tool_end_count
-                    and cur_tool_start_count > prev_tool_start_count):
+            if (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count > prev_tool_start_count
+            ):
                 if len(delta_token_ids) > 1:
-                    tool_call_portion = current_text.split(
-                        self.tool_call_start_token)[-1]
+                    tool_call_portion = current_text.split(self.tool_call_start_token)[
+                        -1
+                    ]
                 else:
                     tool_call_portion = None
                     delta = None
@@ -186,27 +192,29 @@ class DeepSeekV3ToolParser(ToolParser):
                 logger.debug("Starting on a new tool %s", self.current_tool_id)
 
             # case -- we're updating an existing tool call
-            elif (cur_tool_start_count > cur_tool_end_count
-                  and cur_tool_start_count == prev_tool_start_count):
-
+            elif (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count == prev_tool_start_count
+            ):
                 # get the portion of the text that's the tool call
-                tool_call_portion = current_text.split(
-                    self.tool_call_start_token)[-1]
+                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
                 text_portion = None
 
             # case -- the current tool call is being closed.
-            elif (cur_tool_start_count == cur_tool_end_count
-                  and cur_tool_end_count >= prev_tool_end_count):
-                if self.prev_tool_call_arr is None or len(
-                        self.prev_tool_call_arr) == 0:
-                    logger.debug(
-                        "attempting to close tool call, but no tool call")
+            elif (
+                cur_tool_start_count == cur_tool_end_count
+                and cur_tool_end_count >= prev_tool_end_count
+            ):
+                if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0:
+                    logger.debug("attempting to close tool call, but no tool call")
                     return None
-                diff = self.prev_tool_call_arr[self.current_tool_id].get(
-                    "arguments")
+                diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments")
                 if diff:
-                    diff = (diff.encode("utf-8").decode("unicode_escape")
-                            if diff is str else diff)
+                    diff = (
+                        diff.encode("utf-8").decode("unicode_escape")
+                        if diff is str
+                        else diff
+                    )
                     if '"}' not in delta_text:
                         return None
                     end_loc = delta_text.rindex('"}')
@@ -217,13 +225,16 @@ class DeepSeekV3ToolParser(ToolParser):
                         diff,
                     )
                     self.streamed_args_for_tool[self.current_tool_id] += diff
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=diff).model_dump(exclude_none=True),
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(arguments=diff).model_dump(
+                                    exclude_none=True
+                                ),
+                            )
+                        ]
+                    )
 
             # case -- otherwise we're just generating text
             else:
@@ -234,21 +245,19 @@ class DeepSeekV3ToolParser(ToolParser):
 
             current_tool_call = dict()
             if tool_call_portion:
-                current_tool_call_matches = (
-                    self.stream_tool_call_portion_regex.match(
-                        tool_call_portion))
+                current_tool_call_matches = self.stream_tool_call_portion_regex.match(
+                    tool_call_portion
+                )
                 if current_tool_call_matches:
-                    tool_type, tool_name, tool_args = (
-                        current_tool_call_matches.groups())
+                    tool_type, tool_name, tool_args = current_tool_call_matches.groups()
                     current_tool_call["name"] = tool_name
                     current_tool_call["arguments"] = tool_args
                 else:
                     current_tool_call_name_matches = (
-                        self.stream_tool_call_name_regex.match(
-                            tool_call_portion))
+                        self.stream_tool_call_name_regex.match(tool_call_portion)
+                    )
                     if current_tool_call_name_matches:
-                        tool_type, tool_name = (
-                            current_tool_call_name_matches.groups())
+                        tool_type, tool_name = current_tool_call_name_matches.groups()
                         current_tool_call["name"] = tool_name
                         current_tool_call["arguments"] = ""
                     else:
@@ -260,19 +269,21 @@ class DeepSeekV3ToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 if function_name:
                     self.current_tool_name_sent = True
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            type="function",
-                            id=make_tool_call_id(),
-                            function=DeltaFunctionCall(
-                                name=function_name).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                 else:
                     return None
 
@@ -282,15 +293,19 @@ class DeepSeekV3ToolParser(ToolParser):
             if tool_call_portion is None:
                 # if there's text but not tool calls, send that -
                 # otherwise None to skip chunk
-                delta = (DeltaMessage(
-                    content=delta_text) if text_portion is not None else None)
+                delta = (
+                    DeltaMessage(content=delta_text)
+                    if text_portion is not None
+                    else None
+                )
                 return delta
 
             # now, the nitty-gritty of tool calls
             # now we have the portion to parse as tool call.
 
-            logger.debug("Trying to parse current tool call with ID %s",
-                         self.current_tool_id)
+            logger.debug(
+                "Trying to parse current tool call with ID %s", self.current_tool_id
+            )
 
             # if we're starting a new tool call, push an empty object in as
             #   a placeholder for the arguments
@@ -300,7 +315,8 @@ class DeepSeekV3ToolParser(ToolParser):
             # main logic for tool parsing here - compare prev. partially-parsed
             #   JSON to the current partially-parsed JSON
             prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
-                "arguments")
+                "arguments"
+            )
             cur_arguments = current_tool_call.get("arguments")
 
             logger.debug("diffing old arguments: %s", prev_arguments)
@@ -314,52 +330,56 @@ class DeepSeekV3ToolParser(ToolParser):
             # case -- prev arguments are defined, but non are now.
             #   probably impossible, but not a fatal error - just keep going
             elif not cur_arguments and prev_arguments:
-                logger.error("should be impossible to have arguments reset "
-                             "mid-call. skipping streaming anything.")
+                logger.error(
+                    "should be impossible to have arguments reset "
+                    "mid-call. skipping streaming anything."
+                )
                 delta = None
 
             # case -- we now have the first info about arguments available from
             #   autocompleting the JSON
             elif cur_arguments and not prev_arguments:
-
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_id,
-                        function=DeltaFunctionCall(
-                            arguments=cur_arguments).model_dump(
-                                exclude_none=True),
-                    )
-                ])
-                self.streamed_args_for_tool[
-                    self.current_tool_id] = cur_arguments
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=cur_arguments
+                            ).model_dump(exclude_none=True),
+                        )
+                    ]
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
 
             # last case -- we have an update to existing arguments.
             elif cur_arguments and prev_arguments:
-                if (isinstance(delta_text, str)
-                        and cur_arguments != prev_arguments
-                        and len(cur_arguments) > len(prev_arguments)
-                        and cur_arguments.startswith(prev_arguments)):
-                    delta_arguments = cur_arguments[len(prev_arguments):]
+                if (
+                    isinstance(delta_text, str)
+                    and cur_arguments != prev_arguments
+                    and len(cur_arguments) > len(prev_arguments)
+                    and cur_arguments.startswith(prev_arguments)
+                ):
+                    delta_arguments = cur_arguments[len(prev_arguments) :]
                     logger.debug("got diff %s", delta_text)
 
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=delta_arguments).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] = cur_arguments
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=delta_arguments
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
                 else:
                     delta = None
 
             # handle saving the state for the current tool into
             # the "prev" list for use in diffing for the next iteration
             if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
-                self.prev_tool_call_arr[
-                    self.current_tool_id] = current_tool_call
+                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
             else:
                 self.prev_tool_call_arr.append(current_tool_call)
 
diff --git a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..82370323cb00d65ec5735af01d1f3c429e14c15b
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+class Ernie45ToolParser(ToolParser):
+    def __init__(self, tokenizer: AnyTokenizer):
+        """
+        Ernie thinking model format:
+        abc\n</think>\n\n\n<tool_call>\ndef\n</tool_call>\n
+        """
+        super().__init__(tokenizer)
+        self.current_tool_name_sent = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id = -1
+        self.streamed_args_for_tool: list[str] = []
+        self.think_end_token = "</think>"
+        self.response_start_token: str = "<response>"
+        self.response_end_token: str = "</response>"
+        self.tool_call_start_token = "<tool_call>"
+        self.tool_call_end_token = "</tool_call>"
+        self.tool_calls_start_token = self.tool_call_start_token
+        self.newline_token: str = "<0x0A>"
+
+        self.tool_call_regex = re.compile(
+            r"<tool_call>\s*(?P<json>\{.*?\})\s*</tool_call>", re.DOTALL
+        )
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction."
+            )
+
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+        self.response_start_token_id = self.vocab.get(self.response_start_token)
+        self.response_end_token_id = self.vocab.get(self.response_end_token)
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+        self.newline_token_id = self.vocab.get(self.newline_token)
+        self.parser_token_ids = [
+            self.think_end_token_id,
+            self.response_start_token_id,
+            self.response_end_token_id,
+        ]
+
+        self._buffer = ""
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        # sanity check; avoid unnecessary processing
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+        else:
+            try:
+                tool_call_json_list = self.tool_call_regex.findall(model_output)
+
+                tool_calls = []
+                for tool_call_json in tool_call_json_list:
+                    tool_call_dict = json.loads(tool_call_json)
+                    args_str = json.dumps(
+                        tool_call_dict.get("arguments", {}), ensure_ascii=False
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            type="function",
+                            function=FunctionCall(
+                                name=tool_call_dict.get("name", ""),
+                                arguments=args_str,
+                            ),
+                        )
+                    )
+
+                content = model_output[
+                    : model_output.find(self.tool_calls_start_token)
+                ].rstrip("\n")
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if content else None,
+                )
+
+            except Exception:
+                logger.exception("Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        self._buffer += delta_text
+        cur_text = self._buffer
+        start_idx = cur_text.find(self.tool_call_start_token)
+        if start_idx == -1:
+            self._buffer = ""
+            # At least one toolcall has been completed
+            if self.current_tool_id > 0:
+                cur_text = ""
+            if self.current_tool_id == -1 and all(
+                token_id == self.newline_token_id for token_id in previous_token_ids
+            ):
+                cur_text = cur_text.strip("\n")
+
+            # handle <response> </response> when tool_call is not triggered
+            # cur_text === delta_text
+            content = cur_text
+            if self.response_start_token_id in delta_token_ids:
+                content = content.lstrip("\n")
+                response_start_idx = content.find(self.response_start_token)
+                content = content[response_start_idx + len(self.response_start_token) :]
+                # if have </response>, remove it
+                response_end_idx = content.rfind(self.response_end_token)
+                if response_end_idx != -1:
+                    content = content[:response_end_idx]
+            elif self.response_end_token_id in delta_token_ids:
+                response_end_idx = content.rfind(self.response_end_token)
+                content = content[:response_end_idx]
+            # remove \n after </think> or <response> or </response>
+            if (
+                len(previous_token_ids) > 0
+                and previous_token_ids[-1] in self.parser_token_ids
+            ) and (
+                len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
+            ):
+                content = content.lstrip("\n")
+
+            return DeltaMessage(content=content if content else None)
+        logger.debug("cur_text = %s", cur_text)
+        end_idx = cur_text.find(self.tool_call_end_token)
+        if end_idx != -1:
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = []
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+
+            extracted_tool_calls = self.extract_tool_calls(
+                cur_text[: end_idx + len(self.tool_call_end_token)], request
+            )
+
+            if len(extracted_tool_calls.tool_calls) == 0:
+                logger.warning("Failed to extract any tool calls.")
+                return None
+            tool_call = extracted_tool_calls.tool_calls[0]
+            self.prev_tool_call_arr[self.current_tool_id] = {
+                "name": tool_call.function.name,
+                "arguments": json.loads(tool_call.function.arguments),
+            }
+            self.streamed_args_for_tool[self.current_tool_id] = (
+                tool_call.function.arguments
+            )
+            delta = DeltaMessage(
+                content=extracted_tool_calls.content,
+                tool_calls=[
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        id=tool_call.id,
+                        type=tool_call.type,
+                        function=DeltaFunctionCall(
+                            name=tool_call.function.name,
+                            arguments=tool_call.function.arguments,
+                        ),
+                    )
+                ],
+            )
+            self.current_tool_id += 1
+            self._buffer = cur_text[end_idx + len(self.tool_call_end_token) :]
+            return delta
+
+        self._buffer = cur_text[start_idx:]
+        content = cur_text[:start_idx].rstrip("\n")
+        return DeltaMessage(content=content if content else None)
diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
index 8fd14f171d0afdf40039bc4dd035e1e2175f7059..120e63b929b16f993bb927b02dca12f541264dda 100644
--- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -4,27 +4,30 @@
 import ast
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("glm45")
 class Glm4MoeModelToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
         self.current_tool_name_sent = False
@@ -36,20 +39,20 @@ class Glm4MoeModelToolParser(ToolParser):
 
         self.tool_calls_start_token = self.tool_call_start_token
 
-        self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>",
-                                          re.DOTALL)
+        self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
         self.func_detail_regex = re.compile(
-            r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
+            r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL
+        )
         self.func_arg_regex = re.compile(
-            r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
-            re.DOTALL)
+            r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
+        )
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
         self._buffer = ""
 
@@ -58,18 +61,22 @@ class Glm4MoeModelToolParser(ToolParser):
         model_output: str,
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
-
         def _is_string_type(
-                tool_name: str, arg_name: str,
-                tools: Optional[list[ChatCompletionToolsParam]]) -> bool:
+            tool_name: str,
+            arg_name: str,
+            tools: list[ChatCompletionToolsParam] | None,
+        ) -> bool:
             if tools is None:
                 return False
             for tool in tools:
                 if tool.function.name == tool_name:
                     if tool.function.parameters is None:
                         return False
-                    arg_type = tool.function.parameters.get(
-                        "properties", {}).get(arg_name, {}).get("type", None)
+                    arg_type = (
+                        tool.function.parameters.get("properties", {})
+                        .get(arg_name, {})
+                        .get("type", None)
+                    )
                     return arg_type == "string"
             logger.warning("No tool named '%s'.", tool_name)
             return False
@@ -101,28 +108,30 @@ class Glm4MoeModelToolParser(ToolParser):
                     arg_val = value.strip()
                     if not _is_string_type(tc_name, arg_key, request.tools):
                         arg_val = _deserialize(arg_val)
-                    logger.debug("arg_key = %s, arg_val = %s", arg_key,
-                                 arg_val)
+                    logger.debug("arg_key = %s, arg_val = %s", arg_key, arg_val)
                     arg_dct[arg_key] = arg_val
                 tool_calls.append(
-                    ToolCall(type="function",
-                             function=FunctionCall(
-                                 name=tc_name, arguments=json.dumps(arg_dct))))
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=tc_name, arguments=json.dumps(arg_dct)
+                        ),
+                    )
+                )
         except Exception:
             logger.exception("Failed to extract tool call spec")
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
         else:
             if len(tool_calls) > 0:
-                content = model_output[:model_output.
-                                       find(self.tool_calls_start_token)]
-                return ExtractedToolCallInformation(tools_called=True,
-                                                    tool_calls=tool_calls,
-                                                    content=content)
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+                content = model_output[: model_output.find(self.tool_calls_start_token)]
+                return ExtractedToolCallInformation(
+                    tools_called=True, tool_calls=tool_calls, content=content
+                )
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -133,7 +142,7 @@ class Glm4MoeModelToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         self._buffer += delta_text
         cur_text = self._buffer
         start_idx = cur_text.find(self.tool_call_start_token)
@@ -155,7 +164,8 @@ class Glm4MoeModelToolParser(ToolParser):
                 self.streamed_args_for_tool.append("")
 
             extracted_tool_calls = self.extract_tool_calls(
-                cur_text[:end_idx + len(self.tool_call_end_token)], request)
+                cur_text[: end_idx + len(self.tool_call_end_token)], request
+            )
 
             if len(extracted_tool_calls.tool_calls) == 0:
                 logger.warning("Failed to extract any tool calls.")
@@ -163,22 +173,27 @@ class Glm4MoeModelToolParser(ToolParser):
             tool_call = extracted_tool_calls.tool_calls[0]
             self.prev_tool_call_arr[self.current_tool_id] = {
                 "name": tool_call.function.name,
-                "arguments": json.loads(tool_call.function.arguments)
+                "arguments": json.loads(tool_call.function.arguments),
             }
-            self.streamed_args_for_tool[
-                self.current_tool_id] = tool_call.function.arguments
+            self.streamed_args_for_tool[self.current_tool_id] = (
+                tool_call.function.arguments
+            )
             delta = DeltaMessage(
                 content=extracted_tool_calls.content,
                 tool_calls=[
-                    DeltaToolCall(index=self.current_tool_id,
-                                  id=tool_call.id,
-                                  type=tool_call.type,
-                                  function=DeltaFunctionCall(
-                                      name=tool_call.function.name,
-                                      arguments=tool_call.function.arguments))
-                ])
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        id=tool_call.id,
+                        type=tool_call.type,
+                        function=DeltaFunctionCall(
+                            name=tool_call.function.name,
+                            arguments=tool_call.function.arguments,
+                        ),
+                    )
+                ],
+            )
             self.current_tool_id += 1
-            self._buffer = cur_text[end_idx + len(self.tool_call_end_token):]
+            self._buffer = cur_text[end_idx + len(self.tool_call_end_token) :]
             return delta
 
         self._buffer = cur_text[start_idx:]
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index 824b100f357b5bd616db11ddf3b508d4a8c9fd46..ae9217426fb5122289062cb2719f7b7021049d6c 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -4,31 +4,36 @@
 import json
 from collections.abc import Sequence
 from json import JSONDecoder
-from typing import Union
 
 import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.utils import (consume_space,
-                                                        find_common_prefix,
-                                                        is_complete_json,
-                                                        partial_json_loads)
+    ToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    consume_space,
+    find_common_prefix,
+    is_complete_json,
+    partial_json_loads,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("granite-20b-fc")
 class Granite20bFCToolParser(ToolParser):
     """
     Tool call parser for the granite-20b-functioncalling model intended
@@ -47,12 +52,12 @@ class Granite20bFCToolParser(ToolParser):
         self.tool_call_regex = re.compile(r"<function_call>\s*")
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         if self.tool_start_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         dec = JSONDecoder()
         try:
@@ -66,13 +71,15 @@ class Granite20bFCToolParser(ToolParser):
                 start_of_json = match.end()
                 # end_index == the start of the next function call
                 # (if exists)
-                next_function_call_start = (matches[i + 1].start() if i +
-                                            1 < len(matches) else None)
+                next_function_call_start = (
+                    matches[i + 1].start() if i + 1 < len(matches) else None
+                )
 
                 raw_function_calls.append(
                     dec.raw_decode(
-                        model_output[start_of_json:next_function_call_start])
-                    [0])
+                        model_output[start_of_json:next_function_call_start]
+                    )[0]
+                )
 
             logger.debug("Extracted %d tool calls", len(raw_function_calls))
             tool_calls = [
@@ -81,13 +88,15 @@ class Granite20bFCToolParser(ToolParser):
                     function=FunctionCall(
                         name=function_call["name"],
                         # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"],
-                                             ensure_ascii=False),
+                        arguments=json.dumps(
+                            function_call["arguments"], ensure_ascii=False
+                        ),
                     ),
-                ) for function_call in raw_function_calls
+                )
+                for function_call in raw_function_calls
             ]
 
-            content = model_output[:model_output.find(self.bot_token)]
+            content = model_output[: model_output.find(self.bot_token)]
             return ExtractedToolCallInformation(
                 tools_called=True,
                 tool_calls=tool_calls,
@@ -96,9 +105,9 @@ class Granite20bFCToolParser(ToolParser):
 
         except Exception as e:
             logger.error("Error in extracting tool call from response %s", e)
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -109,10 +118,10 @@ class Granite20bFCToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
-        if len(current_text) < len(
-                self.bot_token) and self.bot_token.startswith(current_text):
+    ) -> DeltaMessage | None:
+        if len(current_text) < len(self.bot_token) and self.bot_token.startswith(
+            current_text
+        ):
             return None
 
         if not current_text.startswith(self.bot_token):
@@ -122,8 +131,7 @@ class Granite20bFCToolParser(ToolParser):
         # sent yet, don't allow sending
         # an incomplete string since OpenAI only ever (as far as I have
         # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent \
-            else Allow.ALL & ~Allow.STR
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
             tool_call_arr = []
             is_complete = []
@@ -132,24 +140,23 @@ class Granite20bFCToolParser(ToolParser):
                 start_idx = consume_space(start_idx, current_text)
 
                 while start_idx < len(current_text):
-                    (obj,
-                     end_idx) = partial_json_loads(current_text[start_idx:],
-                                                   flags)
+                    (obj, end_idx) = partial_json_loads(current_text[start_idx:], flags)
                     is_complete.append(
-                        is_complete_json(current_text[start_idx:start_idx +
-                                                      end_idx]))
+                        is_complete_json(current_text[start_idx : start_idx + end_idx])
+                    )
                     start_idx += end_idx
                     start_idx = consume_space(start_idx, current_text)
                     start_idx += len(self.bot_token)
                     start_idx = consume_space(start_idx, current_text)
                     tool_call_arr.append(obj)
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
 
             # select as the current tool call the one we're on the state at
-            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
-                if len(tool_call_arr) > 0 else {}
+            current_tool_call: dict = (
+                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
+            )
 
             # case -- if no tokens have been streamed for the tool, e.g.
             #   only the array brackets, stream nothing
@@ -158,9 +165,9 @@ class Granite20bFCToolParser(ToolParser):
 
             # case: we are starting a new tool in the array
             #   -> array has > 0 length AND length has moved past cursor
-            elif (len(tool_call_arr) > 0
-                  and len(tool_call_arr) > self.current_tool_id + 1):
-
+            elif (
+                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
+            ):
                 # if we're moving on to a new call, first make sure we
                 # haven't missed anything in the previous one that was
                 # auto-generated due to JSON completions, but wasn't
@@ -168,21 +175,24 @@ class Granite20bFCToolParser(ToolParser):
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments,
-                                                   ensure_ascii=False)
-                        sent = len(
-                            self.streamed_args_for_tool[self.current_tool_id])
+                        cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                         argument_diff = cur_args_json[sent:]
 
                         logger.debug("got arguments diff: %s", argument_diff)
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=argument_diff).
-                                          model_dump(exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += argument_diff
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=argument_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            argument_diff
+                        )
                     else:
                         delta = None
                 else:
@@ -199,15 +209,18 @@ class Granite20bFCToolParser(ToolParser):
             elif not self.current_tool_name_sent:
                 function_name = current_tool_call.get("name")
                 if function_name:
-
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=make_tool_call_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tool_name_sent = True
                 else:
                     delta = None
@@ -219,34 +232,35 @@ class Granite20bFCToolParser(ToolParser):
                 delta = None
 
                 if cur_arguments:
-                    sent = len(
-                        self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments,
-                                               ensure_ascii=False)
-                    prev_arguments = self.prev_tool_call_arr[
-                        self.current_tool_id].get("arguments")
+                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                        "arguments"
+                    )
 
                     argument_diff = None
                     if is_complete[self.current_tool_id]:
                         argument_diff = cur_args_json[sent:]
                     elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments,
-                                                    ensure_ascii=False)
+                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
                         if cur_args_json != prev_args_json:
-
-                            prefix = find_common_prefix(
-                                prev_args_json, cur_args_json)
+                            prefix = find_common_prefix(prev_args_json, cur_args_json)
                             argument_diff = prefix[sent:]
 
                     if argument_diff is not None:
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=argument_diff).
-                                          model_dump(exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += argument_diff
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=argument_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            argument_diff
+                        )
 
             self.prev_tool_call_arr = tool_call_arr
             return delta
@@ -254,6 +268,6 @@ class Granite20bFCToolParser(ToolParser):
         except Exception as e:
             logger.error("Error trying to handle streaming tool call: %s", e)
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index ac517616a95b4852c1a75b29464df134bad466ca..d29c427694dc9fd752d4e599734a65f424455567 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -3,30 +3,35 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.utils import (consume_space,
-                                                        find_common_prefix,
-                                                        is_complete_json,
-                                                        partial_json_loads)
+    ToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    consume_space,
+    find_common_prefix,
+    is_complete_json,
+    partial_json_loads,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("granite")
 class GraniteToolParser(ToolParser):
     """
     Tool call parser for the granite 3.0 models. Intended
@@ -45,21 +50,24 @@ class GraniteToolParser(ToolParser):
         self.bot_string = "<tool_call>"
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
-        stripped = model_output.strip()\
-                    .removeprefix(self.bot_token)\
-                    .removeprefix(self.bot_string)\
-                    .lstrip()
-        if not stripped or stripped[0] != '[':
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
+        stripped = (
+            model_output.strip()
+            .removeprefix(self.bot_token)
+            .removeprefix(self.bot_string)
+            .lstrip()
+        )
+        if not stripped or stripped[0] != "[":
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
         try:
             raw_function_calls = json.loads(stripped)
             if not isinstance(raw_function_calls, list):
                 raise Exception(
-                    f"Expected dict or list, got {type(raw_function_calls)}")
+                    f"Expected dict or list, got {type(raw_function_calls)}"
+                )
 
             logger.debug("Extracted %d tool calls", len(raw_function_calls))
             tool_calls = [
@@ -68,10 +76,12 @@ class GraniteToolParser(ToolParser):
                     function=FunctionCall(
                         name=function_call["name"],
                         # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"],
-                                             ensure_ascii=False),
+                        arguments=json.dumps(
+                            function_call["arguments"], ensure_ascii=False
+                        ),
                     ),
-                ) for function_call in raw_function_calls
+                )
+                for function_call in raw_function_calls
             ]
 
             return ExtractedToolCallInformation(
@@ -82,9 +92,9 @@ class GraniteToolParser(ToolParser):
 
         except Exception as e:
             logger.error("Error in extracting tool call from response %s", e)
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -95,42 +105,41 @@ class GraniteToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         start_idx = consume_space(0, current_text)
         if current_text[start_idx:].startswith(self.bot_token):
-            start_idx = consume_space(start_idx + len(self.bot_token),
-                                      current_text)
+            start_idx = consume_space(start_idx + len(self.bot_token), current_text)
         if current_text[start_idx:].startswith(self.bot_string):
-            start_idx = consume_space(start_idx + len(self.bot_string),
-                                      current_text)
-        if not current_text or start_idx >= len(current_text)\
-            or current_text[start_idx] != '[':
+            start_idx = consume_space(start_idx + len(self.bot_string), current_text)
+        if (
+            not current_text
+            or start_idx >= len(current_text)
+            or current_text[start_idx] != "["
+        ):
             return DeltaMessage(content=delta_text)
 
         # bit mask flags for partial JSON parsing. If the name hasn't been
         # sent yet, don't allow sending
         # an incomplete string since OpenAI only ever (as far as I have
         # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent \
-            else Allow.ALL & ~Allow.STR
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
             tool_call_arr = None
             is_complete = None
             try:
                 tool_calls, end_idx = partial_json_loads(
-                    current_text[start_idx:], flags)
+                    current_text[start_idx:], flags
+                )
                 if type(tool_calls) is list:
                     tool_call_arr = tool_calls
                 else:
                     return DeltaMessage(content=delta_text)
 
                 is_complete = [True] * len(tool_calls)
-                if not is_complete_json(
-                        current_text[start_idx:start_idx + end_idx]):
+                if not is_complete_json(current_text[start_idx : start_idx + end_idx]):
                     is_complete[-1] = False
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
 
             # case -- if no tokens have been streamed for the tool, e.g.
@@ -145,7 +154,6 @@ class GraniteToolParser(ToolParser):
             # case: we are starting a new tool in the array
             #   -> array has > 0 length AND length has moved past cursor
             if len(tool_call_arr) > self.current_tool_id + 1:
-
                 # if we're moving on to a new call, first make sure we
                 # haven't missed anything in the previous one that was
                 # auto-generated due to JSON completions, but wasn't
@@ -153,21 +161,24 @@ class GraniteToolParser(ToolParser):
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments,
-                                                   ensure_ascii=False)
-                        sent = len(
-                            self.streamed_args_for_tool[self.current_tool_id])
+                        cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                         argument_diff = cur_args_json[sent:]
 
                         logger.debug("got arguments diff: %s", argument_diff)
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=argument_diff).
-                                          model_dump(exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += argument_diff
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=argument_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            argument_diff
+                        )
 
                 # re-set stuff pertaining to progress in the current tool
                 self.current_tool_id = len(tool_call_arr) - 1
@@ -181,15 +192,18 @@ class GraniteToolParser(ToolParser):
             elif not self.current_tool_name_sent:
                 function_name = current_tool_call.get("name")
                 if function_name:
-
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=make_tool_call_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tool_name_sent = True
 
             # now we know we're on the same tool call and we're streaming
@@ -198,33 +212,35 @@ class GraniteToolParser(ToolParser):
                 cur_arguments = current_tool_call.get("arguments")
 
                 if cur_arguments:
-                    sent = len(
-                        self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments,
-                                               ensure_ascii=False)
-                    prev_arguments = self.prev_tool_call_arr[
-                        self.current_tool_id].get("arguments")
+                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                        "arguments"
+                    )
 
                     argument_diff = None
                     if is_complete[self.current_tool_id]:
                         argument_diff = cur_args_json[sent:]
                     elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments,
-                                                    ensure_ascii=False)
+                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
                         if cur_args_json != prev_args_json:
-                            prefix = find_common_prefix(
-                                prev_args_json, cur_args_json)
+                            prefix = find_common_prefix(prev_args_json, cur_args_json)
                             argument_diff = prefix[sent:]
 
                     if argument_diff is not None:
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=argument_diff).
-                                          model_dump(exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += argument_diff
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=argument_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            argument_diff
+                        )
 
             self.prev_tool_call_arr = tool_call_arr
             return delta
@@ -232,6 +248,6 @@ class GraniteToolParser(ToolParser):
         except Exception as e:
             logger.error("Error trying to handle streaming tool call: %s", e)
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 87595953da067d7c6cccf4b8ea2658bb95e86211..4336a5438109fde5e3fed9cd82b78f407590db18 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -3,59 +3,66 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("hermes")
 class Hermes2ProToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
         if isinstance(self.model_tokenizer, MistralTokenizer):
-            logger.error(
-                "Detected Mistral tokenizer when using a Hermes model")
+            logger.error("Detected Mistral tokenizer when using a Hermes model")
             self.model_tokenizer = self.model_tokenizer.tokenizer
 
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[str] = [
-        ]  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
 
         self.tool_call_start_token: str = "<tool_call>"
         self.tool_call_end_token: str = "</tool_call>"
 
         self.tool_call_regex = re.compile(
-            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL)
+            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL
+        )
         self.scratch_pad_regex = re.compile(
-            r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL)
+            r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL
+        )
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
         self.tool_call_start_token_ids = self.model_tokenizer.encode(
-            self.tool_call_start_token, add_special_tokens=False)
+            self.tool_call_start_token, add_special_tokens=False
+        )
         self.tool_call_end_token_ids = self.model_tokenizer.encode(
-            self.tool_call_end_token, add_special_tokens=False)
+            self.tool_call_end_token, add_special_tokens=False
+        )
 
         self.tool_call_start_token_array = [
             self.model_tokenizer.decode([token_id])
@@ -77,13 +84,17 @@ class Hermes2ProToolParser(ToolParser):
     def tool_call_delta_buffer(self, delta_text: str):
         # If the sequence of tool_call_start or tool_call_end tokens is not yet
         # complete, fill the buffer with the token and return "".
-        if (delta_text in self.tool_call_start_token_array
-                or delta_text in self.tool_call_end_token_array):
+        if (
+            delta_text in self.tool_call_start_token_array
+            or delta_text in self.tool_call_end_token_array
+        ):
             # If delta_text is the last token of tool_call_start_token or
             # tool_call_end_token, empty the buffer and return
             # the buffered text + delta_text.
-            if (delta_text == self.tool_call_start_token_array[-1]
-                    or delta_text == self.tool_call_end_token_array[-1]):
+            if (
+                delta_text == self.tool_call_start_token_array[-1]
+                or delta_text == self.tool_call_end_token_array[-1]
+            ):
                 buffered_text = self.buffered_delta_text
                 self.buffered_delta_text = ""
                 return buffered_text + delta_text
@@ -98,9 +109,9 @@ class Hermes2ProToolParser(ToolParser):
             else:
                 return delta_text
 
-    def adjust_request(
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
-        if request.tools and request.tool_choice != 'none':
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
             # do not skip special tokens because the tool_call tokens are
             # marked "special" in some models. Since they are skipped
             # prior to the call to the tool parser, it breaks tool calling.
@@ -112,22 +123,19 @@ class Hermes2ProToolParser(ToolParser):
         model_output: str,
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
-
         # sanity check; avoid unnecessary processing
         if self.tool_call_start_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         else:
-
             try:
                 # there are two possible captures - between tags, or between a
                 # tag and end-of-string so the result of
                 # findall is an array of tuples where one is a function call and
                 # the other is None
-                function_call_tuples = (
-                    self.tool_call_regex.findall(model_output))
+                function_call_tuples = self.tool_call_regex.findall(model_output)
 
                 # load the JSON, and then use it to build the Function and
                 # Tool Call
@@ -141,24 +149,26 @@ class Hermes2ProToolParser(ToolParser):
                         function=FunctionCall(
                             name=function_call["name"],
                             # function call args are JSON but as a string
-                            arguments=json.dumps(function_call["arguments"],
-                                                 ensure_ascii=False)))
+                            arguments=json.dumps(
+                                function_call["arguments"], ensure_ascii=False
+                            ),
+                        ),
+                    )
                     for function_call in raw_function_calls
                 ]
 
-                content = model_output[:model_output.
-                                       find(self.tool_call_start_token)]
+                content = model_output[: model_output.find(self.tool_call_start_token)]
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=tool_calls,
-                    content=content if content else None)
+                    content=content if content else None,
+                )
 
             except Exception:
-                logger.exception(
-                    "Error in extracting tool call from response.")
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                logger.exception("Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
     def extract_tool_calls_streaming(
         self,
@@ -169,7 +179,7 @@ class Hermes2ProToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # 1. All tokens are parsed based on _text, not token_ids.
         # 2. All incoming text data is processed by the tool_call_delta_buffer
         #    function for buffering before being used for parsing.
@@ -177,10 +187,12 @@ class Hermes2ProToolParser(ToolParser):
         delta_text = self.tool_call_delta_buffer(delta_text)
         # If the last characters of previous_text
         # match self.buffered_delta_text, remove only the matching part.
-        if (len(previous_text) >= len(self.buffered_delta_text)
-                and previous_text[-len(self.buffered_delta_text):]
-                == self.buffered_delta_text):
-            previous_text = previous_text[:-len(self.buffered_delta_text)]
+        if (
+            len(previous_text) >= len(self.buffered_delta_text)
+            and previous_text[-len(self.buffered_delta_text) :]
+            == self.buffered_delta_text
+        ):
+            previous_text = previous_text[: -len(self.buffered_delta_text)]
             current_text = previous_text + delta_text
 
         logger.debug("delta_text: %s", delta_text)
@@ -191,50 +203,51 @@ class Hermes2ProToolParser(ToolParser):
             return DeltaMessage(content=delta_text)
 
         try:
-
             # figure out where we are in the parsing by counting tool call
             # start & end tags
-            prev_tool_start_count = previous_text.count(
-                self.tool_call_start_token)
+            prev_tool_start_count = previous_text.count(self.tool_call_start_token)
             prev_tool_end_count = previous_text.count(self.tool_call_end_token)
-            cur_tool_start_count = current_text.count(
-                self.tool_call_start_token)
+            cur_tool_start_count = current_text.count(self.tool_call_start_token)
             cur_tool_end_count = current_text.count(self.tool_call_end_token)
             tool_call_portion = None
             text_portion = None
 
             # case: if we're generating text, OR rounding out a tool call
-            if (cur_tool_start_count == cur_tool_end_count
-                    and prev_tool_end_count == cur_tool_end_count
-                    and self.tool_call_end_token not in delta_text):
+            if (
+                cur_tool_start_count == cur_tool_end_count
+                and prev_tool_end_count == cur_tool_end_count
+                and self.tool_call_end_token not in delta_text
+            ):
                 logger.debug("Generating text content! skipping tool parsing.")
                 return DeltaMessage(content=delta_text)
 
             if self.tool_call_end_token in delta_text:
                 logger.debug("tool_call_end_token in delta_text")
                 full_text = current_text + delta_text
-                tool_call_portion = full_text.split(
-                    self.tool_call_start_token)[-1].split(
-                        self.tool_call_end_token)[0].rstrip()
-                delta_text = delta_text.split(
-                    self.tool_call_end_token)[0].rstrip()
-                text_portion = delta_text.split(
-                    self.tool_call_end_token)[-1].lstrip()
+                tool_call_portion = (
+                    full_text.split(self.tool_call_start_token)[-1]
+                    .split(self.tool_call_end_token)[0]
+                    .rstrip()
+                )
+                delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip()
 
             # case: if tool open & close tag counts don't match, we're doing
             # imaginary "else" block here
             # something with tools with this diff.
             # flags for partial JSON parting. exported constants from
             # "Allow" are handled via BIT MASK
-            flags = Allow.ALL if self.current_tool_name_sent \
-                else Allow.ALL & ~Allow.STR
+            flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
 
             # case -- we're starting a new tool call
-            if (cur_tool_start_count > cur_tool_end_count
-                    and cur_tool_start_count > prev_tool_start_count):
+            if (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count > prev_tool_start_count
+            ):
                 if len(delta_token_ids) > 1:
-                    tool_call_portion = current_text.split(
-                        self.tool_call_start_token)[-1]
+                    tool_call_portion = current_text.split(self.tool_call_start_token)[
+                        -1
+                    ]
                 else:
                     tool_call_portion = None
                     delta = None
@@ -248,42 +261,49 @@ class Hermes2ProToolParser(ToolParser):
                 logger.debug("Starting on a new tool %s", self.current_tool_id)
 
             # case -- we're updating an existing tool call
-            elif (cur_tool_start_count > cur_tool_end_count
-                  and cur_tool_start_count == prev_tool_start_count):
-
+            elif (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count == prev_tool_start_count
+            ):
                 # get the portion of the text that's the tool call
-                tool_call_portion = current_text.split(
-                    self.tool_call_start_token)[-1]
+                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
                 text_portion = None
 
             # case -- the current tool call is being closed.
-            elif (cur_tool_start_count == cur_tool_end_count
-                  and cur_tool_end_count >= prev_tool_end_count):
-                if (self.prev_tool_call_arr is None
-                        or len(self.prev_tool_call_arr) == 0):
-                    logger.debug(
-                        "attempting to close tool call, but no tool call")
+            elif (
+                cur_tool_start_count == cur_tool_end_count
+                and cur_tool_end_count >= prev_tool_end_count
+            ):
+                if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0:
+                    logger.debug("attempting to close tool call, but no tool call")
                     return None
-                diff = self.prev_tool_call_arr[self.current_tool_id].get(
-                    "arguments")
+                diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments")
                 if diff:
-                    diff = diff.encode('utf-8').decode(
-                        'unicode_escape') if diff is str else diff
-                    if ('"}' not in delta_text):
+                    diff = (
+                        diff.encode("utf-8").decode("unicode_escape")
+                        if diff is str
+                        else diff
+                    )
+                    if '"}' not in delta_text:
                         return None
                     end_loc = delta_text.rindex('"}')
                     diff = delta_text[:end_loc] + '"}'
                     logger.debug(
                         "Finishing tool and found diff that had not "
-                        "been streamed yet: %s", diff)
-                    self.streamed_args_for_tool[self.current_tool_id] \
-                        += diff
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=diff).model_dump(
-                                              exclude_none=True))
-                    ])
+                        "been streamed yet: %s",
+                        diff,
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(arguments=diff).model_dump(
+                                    exclude_none=True
+                                ),
+                            )
+                        ]
+                    )
 
             # case -- otherwise we're just generating text
             else:
@@ -293,13 +313,14 @@ class Hermes2ProToolParser(ToolParser):
                 return delta
 
             try:
-
-                current_tool_call = partial_json_parser.loads(
-                    tool_call_portion or "{}",
-                    flags) if tool_call_portion else None
+                current_tool_call = (
+                    partial_json_parser.loads(tool_call_portion or "{}", flags)
+                    if tool_call_portion
+                    else None
+                )
                 logger.debug("Parsed tool call %s", current_tool_call)
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
             except json.decoder.JSONDecodeError:
                 logger.debug("unable to parse JSON")
@@ -308,19 +329,23 @@ class Hermes2ProToolParser(ToolParser):
             # case - we haven't sent the tool name yet. If it's available, send
             #   it. otherwise, wait until it's available.
             if not self.current_tool_name_sent:
-                if (current_tool_call is None):
+                if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 if function_name:
                     self.current_tool_name_sent = True
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=make_tool_call_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                 else:
                     return None
             # case -- otherwise, send the tool call delta
@@ -329,15 +354,19 @@ class Hermes2ProToolParser(ToolParser):
             if tool_call_portion is None:
                 # if there's text but not tool calls, send that -
                 # otherwise None to skip chunk
-                delta = DeltaMessage(content=delta_text) \
-                    if text_portion is not None else None
+                delta = (
+                    DeltaMessage(content=delta_text)
+                    if text_portion is not None
+                    else None
+                )
                 return delta
 
             # now, the nitty-gritty of tool calls
             # now we have the portion to parse as tool call.
 
-            logger.debug("Trying to parse current tool call with ID %s",
-                         self.current_tool_id)
+            logger.debug(
+                "Trying to parse current tool call with ID %s", self.current_tool_id
+            )
 
             # if we're starting a new tool call, push an empty object in as
             #   a placeholder for the arguments
@@ -346,8 +375,9 @@ class Hermes2ProToolParser(ToolParser):
 
             # main logic for tool parsing here - compare prev. partially-parsed
             #   JSON to the current partially-parsed JSON
-            prev_arguments = (
-                self.prev_tool_call_arr[self.current_tool_id].get("arguments"))
+            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                "arguments"
+            )
             cur_arguments = current_tool_call.get("arguments")
 
             logger.debug("diffing old arguments: %s", prev_arguments)
@@ -361,8 +391,10 @@ class Hermes2ProToolParser(ToolParser):
             # case -- prev arguments are defined, but non are now.
             #   probably impossible, but not a fatal error - just keep going
             elif not cur_arguments and prev_arguments:
-                logger.error("should be impossible to have arguments reset "
-                             "mid-call. skipping streaming anything.")
+                logger.error(
+                    "should be impossible to have arguments reset "
+                    "mid-call. skipping streaming anything."
+                )
                 delta = None
 
             # case -- we now have the first info about arguments available from
@@ -378,38 +410,41 @@ class Hermes2ProToolParser(ToolParser):
                 #     {"search_request": {}}
                 function_name = current_tool_call.get("name")
                 match = re.search(
-                    r'\{"name":\s*"' +
-                    re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)',
-                    tool_call_portion.strip(), re.DOTALL)
+                    r'\{"name":\s*"'
+                    + re.escape(function_name)
+                    + r'"\s*,\s*"arguments":\s*(.*)',
+                    tool_call_portion.strip(),
+                    re.DOTALL,
+                )
                 if match:
                     cur_arguments_json = match.group(1)
                 else:
-                    cur_arguments_json = json.dumps(cur_arguments,
-                                                    ensure_ascii=False)
+                    cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False)
 
-                logger.debug("finding %s in %s", delta_text,
-                             cur_arguments_json)
+                logger.debug("finding %s in %s", delta_text, cur_arguments_json)
 
                 # get the location where previous args differ from current.
-                if (delta_text not in cur_arguments_json):
+                if delta_text not in cur_arguments_json:
                     return None
-                args_delta_start_loc = cur_arguments_json. \
-                                           rindex(delta_text) + \
-                                           len(delta_text)
+                args_delta_start_loc = cur_arguments_json.rindex(delta_text) + len(
+                    delta_text
+                )
 
                 # use that to find the actual delta
                 arguments_delta = cur_arguments_json[:args_delta_start_loc]
-                logger.debug("First tokens in arguments received: %s",
-                             arguments_delta)
-
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.current_tool_id,
-                                  function=DeltaFunctionCall(
-                                      arguments=arguments_delta).model_dump(
-                                          exclude_none=True))
-                ])
-                self.streamed_args_for_tool[self.current_tool_id] \
-                    += arguments_delta
+                logger.debug("First tokens in arguments received: %s", arguments_delta)
+
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=arguments_delta
+                            ).model_dump(exclude_none=True),
+                        )
+                    ]
+                )
+                self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
 
             # last case -- we have an update to existing arguments.
             elif cur_arguments and prev_arguments:
@@ -423,28 +458,32 @@ class Hermes2ProToolParser(ToolParser):
                 # if the delta_text ends with a '}' and tool_call_portion is a
                 #   complete JSON, then the last '}' does not belong to the
                 #   arguments, so we should trim it off
-                if isinstance(delta_text, str) \
-                    and len(delta_text.rstrip()) >= 1 \
-                    and delta_text.rstrip()[-1] == '}' \
-                    and is_complete_json:
+                if (
+                    isinstance(delta_text, str)
+                    and len(delta_text.rstrip()) >= 1
+                    and delta_text.rstrip()[-1] == "}"
+                    and is_complete_json
+                ):
                     delta_text = delta_text.rstrip()[:-1]
 
                 logger.debug("got diff %s", delta_text)
 
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.current_tool_id,
-                                  function=DeltaFunctionCall(
-                                      arguments=delta_text).model_dump(
-                                          exclude_none=True))
-                ])
-                self.streamed_args_for_tool[self.current_tool_id] \
-                    += delta_text
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(arguments=delta_text).model_dump(
+                                exclude_none=True
+                            ),
+                        )
+                    ]
+                )
+                self.streamed_args_for_tool[self.current_tool_id] += delta_text
 
             # handle saving the state for the current tool into
             # the "prev" list for use in diffing for the next iteration
             if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
-                self.prev_tool_call_arr[self.current_tool_id] = \
-                    current_tool_call
+                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
             else:
                 self.prev_tool_call_arr.append(current_tool_call)
 
diff --git a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
index 2b65f2579fb43a6c8b6fdbfc3a6b51476726ab63..920675c8389b8aa0aff3c223dae3073130e4a74c 100644
--- a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
@@ -4,17 +4,22 @@
 
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.entrypoints.openai.tool_parsers.utils import consume_space
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -23,9 +28,7 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("hunyuan_a13b")
 class HunyuanA13BToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
@@ -33,8 +36,7 @@ class HunyuanA13BToolParser(ToolParser):
         self.prev_tool_calls: list[dict] = []
         self.current_tool_id = -1
         self.current_tool_name_sent = False
-        self.streamed_args: list[str] = [
-        ]  # Track arguments sent for each tool
+        self.streamed_args: list[str] = []  # Track arguments sent for each tool
 
         # For backward compatibility with tests
         self.current_tools_sent: list[bool] = []
@@ -44,12 +46,14 @@ class HunyuanA13BToolParser(ToolParser):
 
         # Regex patterns for preprocessing
         self.answer_tool_calls_pattern = re.compile(
-            r"<tool_calls>([\s\S]*?)</tool_calls>", re.DOTALL)
+            r"<tool_calls>([\s\S]*?)</tool_calls>", re.DOTALL
+        )
 
         self.tool_name_reg = re.compile(r'"name"\s*:\s*"([^"]+)"')
 
         self.tool_empty_arg_reg = re.compile(
-            r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
+            r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}'
+        )
 
         # TODO: not support nested json object in fc arguments.
         self.tool_non_empty_arg_reg = re.compile(
@@ -66,15 +70,21 @@ class HunyuanA13BToolParser(ToolParser):
         }
 
     def preprocess_model_output(
-            self, model_output: str) -> tuple[Optional[str], Optional[str]]:
+        self, model_output: str
+    ) -> tuple[str | None, str | None]:
         # find the location tool call
         for match in self.answer_tool_calls_pattern.finditer(model_output):
             start, end = match.span()
             # check tool_calls whether in side of <think>
-            think_regions = [(m.start(), m.end()) for m in re.finditer(
-                r"<think>(.*?)</think>", model_output, flags=re.DOTALL)]
-            in_think = any(start > t_start and end < t_end
-                           for t_start, t_end in think_regions)
+            think_regions = [
+                (m.start(), m.end())
+                for m in re.finditer(
+                    r"<think>(.*?)</think>", model_output, flags=re.DOTALL
+                )
+            ]
+            in_think = any(
+                start > t_start and end < t_end for t_start, t_end in think_regions
+            )
             if not in_think:
                 content = model_output[:start]
                 tool_calls_content = match.group(1).strip()
@@ -86,24 +96,23 @@ class HunyuanA13BToolParser(ToolParser):
         return model_output, None
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Extract tool calls from a complete model output.
         """
         try:
             # Preprocess the model output
-            content, potential_tool_calls = self.preprocess_model_output(
-                model_output)
+            content, potential_tool_calls = self.preprocess_model_output(model_output)
 
             if not potential_tool_calls:
                 # some text should be filtered out for no function call
                 # this text is in a13b's chat template.
                 if content:
                     content = content.replace("助手：", "", 1)
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=content)
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=content
+                )
 
             # Parse the potential tool calls as JSON
             tool_calls_data = json.loads(potential_tool_calls)
@@ -120,8 +129,11 @@ class HunyuanA13BToolParser(ToolParser):
             tool_calls: list[ToolCall] = []
 
             for idx, call in enumerate(tool_calls_data):
-                if (not isinstance(call, dict) or "name" not in call
-                        or "arguments" not in call):
+                if (
+                    not isinstance(call, dict)
+                    or "name" not in call
+                    or "arguments" not in call
+                ):
                     continue
 
                 tool_call = ToolCall(
@@ -129,8 +141,11 @@ class HunyuanA13BToolParser(ToolParser):
                     type="function",
                     function=FunctionCall(
                         name=call["name"],
-                        arguments=(json.dumps(call["arguments"]) if isinstance(
-                            call["arguments"], dict) else call["arguments"]),
+                        arguments=(
+                            json.dumps(call["arguments"])
+                            if isinstance(call["arguments"], dict)
+                            else call["arguments"]
+                        ),
                     ),
                 )
                 tool_calls.append(tool_call)
@@ -146,9 +161,9 @@ class HunyuanA13BToolParser(ToolParser):
             )
 
         except Exception:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -159,17 +174,19 @@ class HunyuanA13BToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract tool calls for streaming mode.
         """
 
         start_idx = consume_space(0, current_text)
         if current_text[start_idx:].startswith(self.bot_string):
-            start_idx = consume_space(start_idx + len(self.bot_string),
-                                      current_text)
-        if not current_text or start_idx >= len(
-                current_text) or current_text[start_idx] != '[':
+            start_idx = consume_space(start_idx + len(self.bot_string), current_text)
+        if (
+            not current_text
+            or start_idx >= len(current_text)
+            or current_text[start_idx] != "["
+        ):
             return DeltaMessage(content=delta_text)
 
         self._try_parse_json_tools(current_text[start_idx:])
@@ -185,13 +202,15 @@ class HunyuanA13BToolParser(ToolParser):
         self._ensure_state_arrays(tool_count)
         current_idx = self.streaming_state["current_tool_index"]
 
-        name_delta = self._handle_tool_name_streaming(current_idx, tool_count,
-                                                      name_matches)
+        name_delta = self._handle_tool_name_streaming(
+            current_idx, tool_count, name_matches
+        )
         if name_delta:
             return name_delta
 
-        args_delta = self._handle_tool_args_streaming(current_text,
-                                                      current_idx, tool_count)
+        args_delta = self._handle_tool_args_streaming(
+            current_text, current_idx, tool_count
+        )
         if args_delta:
             return args_delta
 
@@ -207,166 +226,195 @@ class HunyuanA13BToolParser(ToolParser):
 
     def _handle_test_compatibility(self, current_text: str):
         if len(self.current_tools_sent) > 0:
-            if (len(self.current_tools_sent) == 1
-                    and self.current_tools_sent[0] is False):
+            if (
+                len(self.current_tools_sent) == 1
+                and self.current_tools_sent[0] is False
+            ):
                 name_match = self.tool_name_reg.search(current_text)
                 if name_match:
                     function_name = name_match.group(1)
                     tool_id = f"chatcmpl-tool-{random_uuid()}"
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=0,
-                            type="function",
-                            id=tool_id,
-                            function=DeltaFunctionCall(
-                                name=function_name).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=0,
+                                type="function",
+                                id=tool_id,
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tools_sent = [True]
                     self.current_tool_id = 0
                     self.streaming_state["current_tool_index"] = 0
                     if len(self.streaming_state["sent_tools"]) == 0:
-                        self.streaming_state["sent_tools"].append({
-                            "sent_name":
-                            True,
-                            "sent_arguments_prefix":
-                            False,
-                            "sent_arguments":
-                            "",
-                        })
+                        self.streaming_state["sent_tools"].append(
+                            {
+                                "sent_name": True,
+                                "sent_arguments_prefix": False,
+                                "sent_arguments": "",
+                            }
+                        )
                     else:
-                        self.streaming_state["sent_tools"][0][
-                            "sent_name"] = True
+                        self.streaming_state["sent_tools"][0]["sent_name"] = True
                     self.current_tool_name_sent = True
                     return delta
         return None
 
     def _ensure_state_arrays(self, tool_count: int):
         while len(self.streaming_state["sent_tools"]) < tool_count:
-            self.streaming_state["sent_tools"].append({
-                "sent_name": False,
-                "sent_arguments_prefix": False,
-                "sent_arguments": "",
-            })
+            self.streaming_state["sent_tools"].append(
+                {
+                    "sent_name": False,
+                    "sent_arguments_prefix": False,
+                    "sent_arguments": "",
+                }
+            )
         while len(self.streaming_state["tool_ids"]) < tool_count:
             self.streaming_state["tool_ids"].append(None)
 
-    def _handle_tool_name_streaming(self, current_idx: int, tool_count: int,
-                                    name_matches):
+    def _handle_tool_name_streaming(
+        self, current_idx: int, tool_count: int, name_matches
+    ):
         if current_idx == -1 or current_idx < tool_count - 1:
             next_idx = current_idx + 1
-            if (next_idx < tool_count
-                    and not self.streaming_state["sent_tools"][next_idx]
-                ["sent_name"]):
+            if (
+                next_idx < tool_count
+                and not self.streaming_state["sent_tools"][next_idx]["sent_name"]
+            ):
                 self.streaming_state["current_tool_index"] = next_idx
                 self.current_tool_id = next_idx
                 current_idx = next_idx
                 tool_name = name_matches[current_idx].group(1)
                 tool_id = f"call_{current_idx}_{random_uuid()}"
                 self.streaming_state["tool_ids"][current_idx] = tool_id
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=current_idx,
-                        type="function",
-                        id=tool_id,
-                        function=DeltaFunctionCall(name=tool_name).model_dump(
-                            exclude_none=True),
-                    )
-                ])
-                self.streaming_state["sent_tools"][current_idx][
-                    "sent_name"] = True
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=current_idx,
+                            type="function",
+                            id=tool_id,
+                            function=DeltaFunctionCall(name=tool_name).model_dump(
+                                exclude_none=True
+                            ),
+                        )
+                    ]
+                )
+                self.streaming_state["sent_tools"][current_idx]["sent_name"] = True
                 self.current_tool_name_sent = True
                 while len(self.streamed_args) <= current_idx:
                     self.streamed_args.append("")
                 return delta
         return None
 
-    def _handle_tool_args_streaming(self, current_text: str, current_idx: int,
-                                    tool_count: int):
-
+    def _handle_tool_args_streaming(
+        self, current_text: str, current_idx: int, tool_count: int
+    ):
         if current_idx >= 0 and current_idx < tool_count:
             empty_args_match = self.tool_empty_arg_reg.search(current_text)
             if empty_args_match and empty_args_match.start() > 0:
                 for i in range(tool_count):
                     if i == current_idx:
                         if not self.streaming_state["sent_tools"][current_idx][
-                                "sent_arguments_prefix"]:
+                            "sent_arguments_prefix"
+                        ]:
                             self.streaming_state["sent_tools"][current_idx][
-                                "sent_arguments_prefix"] = True
+                                "sent_arguments_prefix"
+                            ] = True
                             self.streaming_state["sent_tools"][current_idx][
-                                "sent_arguments"] = "{}"
+                                "sent_arguments"
+                            ] = "{}"
                             while len(self.streamed_args) <= current_idx:
                                 self.streamed_args.append("")
                             self.streamed_args[current_idx] += "{}"
-                            delta = DeltaMessage(tool_calls=[
-                                DeltaToolCall(
-                                    index=current_idx,
-                                    function=DeltaFunctionCall(
-                                        arguments="{}").model_dump(
-                                            exclude_none=True),
-                                )
-                            ])
+                            delta = DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=current_idx,
+                                        function=DeltaFunctionCall(
+                                            arguments="{}"
+                                        ).model_dump(exclude_none=True),
+                                    )
+                                ]
+                            )
                             if current_idx < tool_count - 1:
                                 self.streaming_state["current_tool_index"] += 1
                                 self.current_tool_id = self.streaming_state[
-                                    "current_tool_index"]
+                                    "current_tool_index"
+                                ]
                             return delta
 
-            args_matches = list(
-                self.tool_non_empty_arg_reg.finditer(current_text))
+            args_matches = list(self.tool_non_empty_arg_reg.finditer(current_text))
             if current_idx < len(args_matches):
                 args_text = args_matches[current_idx].group(1)
                 is_last_tool = current_idx == tool_count - 1
                 if not is_last_tool:
                     next_tool_pos = current_text.find(
-                        "},{", args_matches[current_idx].start())
+                        "},{", args_matches[current_idx].start()
+                    )
                     if next_tool_pos != -1:
-                        args_end_pos = (next_tool_pos + 1)
+                        args_end_pos = next_tool_pos + 1
                         args_text = (
-                            current_text[args_matches[current_idx].start(
-                            ):args_end_pos].split('"arguments":')[1].strip())
+                            current_text[
+                                args_matches[current_idx].start() : args_end_pos
+                            ]
+                            .split('"arguments":')[1]
+                            .strip()
+                        )
                 sent_args = self.streaming_state["sent_tools"][current_idx][
-                    "sent_arguments"]
+                    "sent_arguments"
+                ]
                 if not self.streaming_state["sent_tools"][current_idx][
-                        "sent_arguments_prefix"] and args_text.startswith("{"):
+                    "sent_arguments_prefix"
+                ] and args_text.startswith("{"):
                     self.streaming_state["sent_tools"][current_idx][
-                        "sent_arguments_prefix"] = True
+                        "sent_arguments_prefix"
+                    ] = True
                     self.streaming_state["sent_tools"][current_idx][
-                        "sent_arguments"] = "{"
+                        "sent_arguments"
+                    ] = "{"
                     while len(self.streamed_args) <= current_idx:
                         self.streamed_args.append("")
                     self.streamed_args[current_idx] += "{"
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=current_idx,
-                            function=DeltaFunctionCall(
-                                arguments="{").model_dump(exclude_none=True),
-                        )
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=current_idx,
+                                function=DeltaFunctionCall(arguments="{").model_dump(
+                                    exclude_none=True
+                                ),
+                            )
+                        ]
+                    )
                     return delta
 
                 if args_text.startswith(sent_args):
-                    args_diff = args_text[len(sent_args):]
+                    args_diff = args_text[len(sent_args) :]
                     if args_diff:
                         self.streaming_state["sent_tools"][current_idx][
-                            "sent_arguments"] = args_text
+                            "sent_arguments"
+                        ] = args_text
                         while len(self.streamed_args) <= current_idx:
                             self.streamed_args.append("")
                         self.streamed_args[current_idx] += args_diff
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(
-                                index=current_idx,
-                                function=DeltaFunctionCall(
-                                    arguments=args_diff).model_dump(
-                                        exclude_none=True),
-                            )
-                        ])
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=current_idx,
+                                    function=DeltaFunctionCall(
+                                        arguments=args_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
                         return delta
 
                 if args_text.endswith("}") and args_text == sent_args:
                     if current_idx < tool_count - 1:
                         self.streaming_state["current_tool_index"] += 1
                         self.current_tool_id = self.streaming_state[
-                            "current_tool_index"]
+                            "current_tool_index"
+                        ]
         return None
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 37c360145b04a69287030a66bdc18fe06a2e0bef..1dd327f645b3a8fe0b4d95ccfb7942255f5d8488 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -3,37 +3,38 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.utils import (
-    extract_intermediate_diff)
+    ToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module(["internlm"])
 class Internlm2ToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
         self.position = 0
 
-    def adjust_request(
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
-        if request.tools and request.tool_choice != 'none':
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
             # do not skip special tokens because internlm use the special
             # tokens to indicate the start and end of the tool calls
             # information.
@@ -56,35 +57,34 @@ class Internlm2ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-        if '<|action_start|>' not in current_text:
+    ) -> DeltaMessage | None:
+        if "<|action_start|>" not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
         # if the tool call is sent, return an empty delta message
         # to make sure the finish_reason will be sent correctly.
         if self.current_tool_id > 0:
-            return DeltaMessage(content='')
+            return DeltaMessage(content="")
 
         last_pos = self.position
-        if '<|action_start|><|plugin|>' not in current_text[last_pos:]:
+        if "<|action_start|><|plugin|>" not in current_text[last_pos:]:
             return None
 
         new_delta = current_text[last_pos:]
-        text, action = new_delta.split('<|action_start|><|plugin|>')
+        text, action = new_delta.split("<|action_start|><|plugin|>")
 
         if len(text) > 0:
             self.position = self.position + len(text)
             return DeltaMessage(content=text)
 
         action = action.strip()
-        action = action.split('<|action_end|>'.strip())[0]
+        action = action.split("<|action_end|>".strip())[0]
 
         # bit mask flags for partial JSON parsing. If the name hasn't been
         # sent yet, don't allow sending
         # an incomplete string since OpenAI only ever (as far as I have
         # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent \
-            else Allow.ALL & ~Allow.STR
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
 
         try:
             parsable_arr = action
@@ -92,10 +92,9 @@ class Internlm2ToolParser(ToolParser):
             # tool calls are generated in an object in internlm2
             # it's not support parallel tool calls
             try:
-                tool_call_arr: dict = partial_json_parser.loads(
-                    parsable_arr, flags)
+                tool_call_arr: dict = partial_json_parser.loads(parsable_arr, flags)
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
 
             # if the current tool name hasn't been sent, send if available
@@ -104,14 +103,18 @@ class Internlm2ToolParser(ToolParser):
                 function_name = tool_call_arr.get("name")
                 if function_name:
                     self.current_tool_id = self.current_tool_id + 1
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=make_tool_call_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tool_name_sent = True
                     self.streamed_args_for_tool.append("")
                 else:
@@ -120,7 +123,8 @@ class Internlm2ToolParser(ToolParser):
             # arguments
             else:
                 prev_arguments = self.get_arguments(
-                    self.prev_tool_call_arr[self.current_tool_id])
+                    self.prev_tool_call_arr[self.current_tool_id]
+                )
                 cur_arguments = self.get_arguments(tool_call_arr)
 
                 # not arguments generated
@@ -129,43 +133,47 @@ class Internlm2ToolParser(ToolParser):
                 # will never happen
                 elif not cur_arguments and prev_arguments:
                     logger.error(
-                        "INVARIANT - impossible to have arguments reset "
-                        "mid-arguments")
+                        "INVARIANT - impossible to have arguments reset mid-arguments"
+                    )
                     delta = None
                 # first time to get parameters
                 elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments,
-                                                    ensure_ascii=False)
-
-                    arguments_delta = cur_arguments_json[:cur_arguments_json.
-                                                         index(delta_text) +
-                                                         len(delta_text)]
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=arguments_delta).
-                                      model_dump(exclude_none=True))
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] += arguments_delta
+                    cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False)
+
+                    arguments_delta = cur_arguments_json[
+                        : cur_arguments_json.index(delta_text) + len(delta_text)
+                    ]
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=arguments_delta
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
                 # both prev and cur parameters, send the increase parameters
                 elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments,
-                                               ensure_ascii=False)
-                    prev_args_json = json.dumps(prev_arguments,
-                                                ensure_ascii=False)
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
 
                     argument_diff = extract_intermediate_diff(
-                        cur_args_json, prev_args_json)
-
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=argument_diff).model_dump(
-                                              exclude_none=True))
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] += argument_diff
+                        cur_args_json, prev_args_json
+                    )
+
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=argument_diff
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += argument_diff
 
             # check to see if the name is defined and has been sent. if so,
             # stream the name - otherwise keep waiting
@@ -176,8 +184,8 @@ class Internlm2ToolParser(ToolParser):
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
 
     def extract_tool_calls(
@@ -187,30 +195,33 @@ class Internlm2ToolParser(ToolParser):
     ) -> ExtractedToolCallInformation:
         text = model_output
         tools = request.tools
-        if '<|action_start|><|plugin|>' in text:
-            text, action = text.split('<|action_start|><|plugin|>')
-            action = action.split('<|action_end|>'.strip())[0]
-            action = action[action.find('{'):]
+        if "<|action_start|><|plugin|>" in text:
+            text, action = text.split("<|action_start|><|plugin|>")
+            action = action.split("<|action_end|>".strip())[0]
+            action = action[action.find("{") :]
             action_dict = json.loads(action)
-            name, parameters = action_dict['name'], json.dumps(
-                action_dict.get('parameters', action_dict.get('arguments',
-                                                              {})),
-                ensure_ascii=False)
+            name, parameters = (
+                action_dict["name"],
+                json.dumps(
+                    action_dict.get("parameters", action_dict.get("arguments", {})),
+                    ensure_ascii=False,
+                ),
+            )
 
             if not tools or name not in [t.function.name for t in tools]:
-                ExtractedToolCallInformation(tools_called=False,
-                                             tool_calls=[],
-                                             content=text)
+                ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=text
+                )
 
             tool_calls = [
-                ToolCall(
-                    function=FunctionCall(name=name, arguments=parameters))
+                ToolCall(function=FunctionCall(name=name, arguments=parameters))
             ]
             return ExtractedToolCallInformation(
                 tools_called=True,
                 tool_calls=tool_calls,
-                content=text if len(text) > 0 else None)
+                content=text if len(text) > 0 else None,
+            )
 
-        return ExtractedToolCallInformation(tools_called=False,
-                                            tool_calls=[],
-                                            content=text)
+        return ExtractedToolCallInformation(
+            tools_called=False, tool_calls=[], content=text
+        )
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 3b41f6034704cfb15eafba6cfcf5c2f7f97e2444..6f53ddea4f0efd754ae694ea938a77ea0f80445c 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -3,21 +3,23 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.entrypoints.openai.tool_parsers.utils import (
-    extract_intermediate_diff)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers import MistralTokenizer
@@ -25,9 +27,7 @@ from vllm.transformers_utils.tokenizers import MistralTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("jamba")
 class JambaToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
@@ -39,33 +39,36 @@ class JambaToolParser(ToolParser):
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[str] = [
-        ]  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
 
         self.tool_calls_start_token: str = "<tool_calls>"
         self.tool_calls_end_token: str = "</tool_calls>"
 
         self.tool_calls_regex = re.compile(
-            rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}",
-            re.DOTALL)
+            rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}", re.DOTALL
+        )
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
-        self.tool_calls_start_token_id = self.vocab.get(
-            self.tool_calls_start_token)
-        self.tool_calls_end_token_id = self.vocab.get(
-            self.tool_calls_end_token)
-        if (self.tool_calls_start_token_id is None
-                or self.tool_calls_end_token_id is None):
+                "constructor during construction."
+            )
+        self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
+        if (
+            self.tool_calls_start_token_id is None
+            or self.tool_calls_end_token_id is None
+        ):
             raise RuntimeError(
                 "Jamba Tool parser could not locate tool calls start/end "
-                "tokens in the tokenizer!")
+                "tokens in the tokenizer!"
+            )
 
-    def adjust_request(
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
-        if request.tools and request.tool_choice != 'none':
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
             # do not skip special tokens because jamba use the special
             # tokens to indicate the start and end of the tool calls
             # information.
@@ -73,17 +76,15 @@ class JambaToolParser(ToolParser):
         return request
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
-
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         # sanity check; avoid unnecessary processing
         if self.tool_calls_start_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         else:
-
             try:
                 # use a regex to find the tool call between the tags
                 function_calls = self.tool_calls_regex.findall(model_output)[0]
@@ -97,25 +98,26 @@ class JambaToolParser(ToolParser):
                         function=FunctionCall(
                             name=function_call["name"],
                             # function call args are JSON but as a string
-                            arguments=json.dumps(function_call["arguments"],
-                                                 ensure_ascii=False),
-                        )) for function_call in raw_function_calls
+                            arguments=json.dumps(
+                                function_call["arguments"], ensure_ascii=False
+                            ),
+                        ),
+                    )
+                    for function_call in raw_function_calls
                 ]
 
-                content = model_output[:model_output.
-                                       find(self.tool_calls_start_token)]
+                content = model_output[: model_output.find(self.tool_calls_start_token)]
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=tool_calls,
-                    content=content if
-                    (len(content) > 0 and content != " ") else None)
+                    content=content if (len(content) > 0 and content != " ") else None,
+                )
 
             except Exception:
-                logger.exception(
-                    "Error in extracting tool call from response.")
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                logger.exception("Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
     def extract_tool_calls_streaming(
         self,
@@ -126,8 +128,7 @@ class JambaToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         # if the tool call token is not in the tokens generated so far, append
         # output to contents since it's not a tool
         if self.tool_calls_start_token not in current_text:
@@ -138,8 +139,10 @@ class JambaToolParser(ToolParser):
 
         # handle if we detected the start of tool calls token which means
         # the start of tool calling
-        if (self.tool_calls_start_token_id in delta_token_ids
-                and len(delta_token_ids) == 1):
+        if (
+            self.tool_calls_start_token_id in delta_token_ids
+            and len(delta_token_ids) == 1
+        ):
             # if it's the only token, return None, so we don't send a chat
             # completion and don't send a control token
             return None
@@ -148,28 +151,28 @@ class JambaToolParser(ToolParser):
         # sent yet, don't allow sending
         # an incomplete string since OpenAI only ever (as far as I have
         # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent \
-            else Allow.ALL & ~Allow.STR
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
-
             # Extract the tool calls between the special tool call tokens
-            parsable_arr = current_text.split(
-                self.tool_calls_start_token)[-1].split(
-                    self.tool_calls_end_token)[0]
+            parsable_arr = current_text.split(self.tool_calls_start_token)[-1].split(
+                self.tool_calls_end_token
+            )[0]
 
             # tool calls are generated in an array, so do partial JSON
             # parsing on the entire array
             try:
                 tool_call_arr: list[dict] = partial_json_parser.loads(
-                    parsable_arr, flags)
+                    parsable_arr, flags
+                )
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
 
             # select as the current tool call the one we're on the state at
 
-            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
-                if len(tool_call_arr) > 0 else {}
+            current_tool_call: dict = (
+                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
+            )
 
             # case -- if no tokens have been streamed for the tool, e.g.
             #   only the array brackets, stream nothing
@@ -178,28 +181,31 @@ class JambaToolParser(ToolParser):
 
             # case: we are starting a new tool in the array
             #   -> array has > 0 length AND length has moved past cursor
-            elif (len(tool_call_arr) > 0
-                  and len(tool_call_arr) > self.current_tool_id + 1):
-
+            elif (
+                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
+            ):
                 # if we're moving on to a new call, first make sure we
                 # haven't missed anything in the previous one that was
                 # auto-generated due to JSON completions, but wasn't
                 # streamed to the client yet.
                 if self.current_tool_id >= 0:
-                    diff: Union[str, None] = current_tool_call.get("arguments")
+                    diff: str | None = current_tool_call.get("arguments")
 
                     if diff:
                         diff = json.dumps(diff, ensure_ascii=False).replace(
-                            self.streamed_args_for_tool[self.current_tool_id],
-                            "")
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=diff).model_dump(
-                                                  exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += diff
+                            self.streamed_args_for_tool[self.current_tool_id], ""
+                        )
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += diff
                     else:
                         delta = None
                 else:
@@ -218,15 +224,18 @@ class JambaToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 function_name = current_tool_call.get("name")
                 if function_name:
-
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=make_tool_call_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tool_name_sent = True
                 else:
                     delta = None
@@ -234,60 +243,66 @@ class JambaToolParser(ToolParser):
             # now we know we're on the same tool call and we're streaming
             # arguments
             else:
-
-                prev_arguments = self.prev_tool_call_arr[
-                    self.current_tool_id].get("arguments")
+                prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                    "arguments"
+                )
                 cur_arguments = current_tool_call.get("arguments")
 
-                new_text = delta_text.replace("\'", "\"")
+                new_text = delta_text.replace("'", '"')
 
                 if not cur_arguments and not prev_arguments:
-
                     delta = None
                 elif not cur_arguments and prev_arguments:
                     logger.error(
-                        "INVARIANT - impossible to have arguments reset "
-                        "mid-arguments")
+                        "INVARIANT - impossible to have arguments reset mid-arguments"
+                    )
                     delta = None
                 elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments,
-                                                    ensure_ascii=False)
-                    logger.debug("finding %s in %s", new_text,
-                                 cur_arguments_json)
-
-                    arguments_delta = cur_arguments_json[:cur_arguments_json.
-                                                         index(new_text) +
-                                                         len(new_text)]
-                    logger.debug("First tokens in arguments received: %s",
-                                 arguments_delta)
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=arguments_delta).
-                                      model_dump(exclude_none=True))
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] += arguments_delta
+                    cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    logger.debug("finding %s in %s", new_text, cur_arguments_json)
+
+                    arguments_delta = cur_arguments_json[
+                        : cur_arguments_json.index(new_text) + len(new_text)
+                    ]
+                    logger.debug(
+                        "First tokens in arguments received: %s", arguments_delta
+                    )
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=arguments_delta
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
 
                 elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments,
-                                               ensure_ascii=False)
-                    prev_args_json = json.dumps(prev_arguments,
-                                                ensure_ascii=False)
-                    logger.debug("Searching for diff between \n%s\n%s",
-                                 cur_args_json, prev_args_json)
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
+                    logger.debug(
+                        "Searching for diff between \n%s\n%s",
+                        cur_args_json,
+                        prev_args_json,
+                    )
 
                     argument_diff = extract_intermediate_diff(
-                        cur_args_json, prev_args_json)
+                        cur_args_json, prev_args_json
+                    )
                     logger.debug("got arguments diff: %s", argument_diff)
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=argument_diff).model_dump(
-                                              exclude_none=True))
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] += argument_diff
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=argument_diff
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += argument_diff
                 else:
                     # try parsing it with regular JSON - if it works we're
                     # at the end, and we need to send the difference between
@@ -303,6 +318,6 @@ class JambaToolParser(ToolParser):
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
index 834b33052b45d038edbbc9b1d093a33a54f586bd..2b84c60a3b841a615767141387b3b65471d4a65b 100644
--- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
@@ -3,81 +3,160 @@
 # code modified from deepseekv3_tool_parser.py
 
 from collections.abc import Sequence
-from typing import Union
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module(["kimi_k2"])
 class KimiK2ToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[str] = (
-            [])  # map what has been streamed for each tool so far to a list
-
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
+
+        # Section-level state management to prevent token leakage
+        self.in_tool_section: bool = False
+        self.token_buffer: str = ""
+        # Buffer size: empirical worst-case for longest marker (~30 chars) * 2
+        # + safety margin for unicode + partial overlap. Prevents unbounded growth.
+        self.buffer_max_size: int = 1024
+        self.section_char_count: int = 0  # Track characters processed in tool section
+        self.max_section_chars: int = 8192  # Force exit if section exceeds this
+        self._buffer_overflow_logged: bool = False  # Log overflow once per session
+
+        # Support both singular and plural variants
         self.tool_calls_start_token: str = "<|tool_calls_section_begin|>"
         self.tool_calls_end_token: str = "<|tool_calls_section_end|>"
+        self.tool_calls_start_token_variants: list[str] = [
+            "<|tool_calls_section_begin|>",
+            "<|tool_call_section_begin|>",  # singular variant
+        ]
+        self.tool_calls_end_token_variants: list[str] = [
+            "<|tool_calls_section_end|>",
+            "<|tool_call_section_end|>",  # singular variant
+        ]
 
         self.tool_call_start_token: str = "<|tool_call_begin|>"
         self.tool_call_end_token: str = "<|tool_call_end|>"
 
         self.tool_call_regex = re.compile(
-            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*?)\s*<\|tool_call_end\|>"
+            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*<\|tool_call_end\|>",
+            re.DOTALL,
         )
 
         self.stream_tool_call_portion_regex = re.compile(
             r"(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*)"
         )
 
-        self.stream_tool_call_name_regex = re.compile(
-            r"(?P<tool_call_id>.+:\d+)\s*")
+        self.stream_tool_call_name_regex = re.compile(r"(?P<tool_call_id>.+:\d+)\s*")
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
-        self.tool_calls_start_token_id = self.vocab.get(
-            self.tool_calls_start_token)
-        self.tool_calls_end_token_id = self.vocab.get(
-            self.tool_calls_end_token)
-
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+                "constructor during construction."
+            )
+        self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
+
+        # Get token IDs for all variants
+        self.tool_calls_start_token_ids: list[int] = [
+            tid
+            for variant in self.tool_calls_start_token_variants
+            if (tid := self.vocab.get(variant)) is not None
+        ]
+        self.tool_calls_end_token_ids: list[int] = [
+            tid
+            for variant in self.tool_calls_end_token_variants
+            if (tid := self.vocab.get(variant)) is not None
+        ]
+
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
 
-        if (self.tool_calls_start_token_id is None
-                or self.tool_calls_end_token_id is None):
+        if (
+            self.tool_calls_start_token_id is None
+            or self.tool_calls_end_token_id is None
+        ):
             raise RuntimeError(
                 "Kimi-K2 Tool parser could not locate tool call start/end "
-                "tokens in the tokenizer!")
+                "tokens in the tokenizer!"
+            )
+
+    def _check_and_strip_markers(self, text: str) -> tuple[str, bool, bool]:
+        """
+        Check for section begin/end markers in text and strip them.
+        Returns: (cleaned_text, found_section_begin, found_section_end)
+        """
+        found_begin = False
+        found_end = False
+        cleaned = text
+
+        # Check for section begin markers (any variant)
+        for variant in self.tool_calls_start_token_variants:
+            if variant in cleaned:
+                cleaned = cleaned.replace(variant, "")
+                found_begin = True
+
+        # Check for section end markers (any variant)
+        for variant in self.tool_calls_end_token_variants:
+            if variant in cleaned:
+                cleaned = cleaned.replace(variant, "")
+                found_end = True
+
+        return cleaned, found_begin, found_end
+
+    def _reset_section_state(self) -> None:
+        """Reset state when exiting tool section."""
+        self.in_tool_section = False
+        self.token_buffer = ""
+        self.section_char_count = 0
+
+    def reset_streaming_state(self) -> None:
+        """
+        Reset all streaming state. Call this between requests to prevent
+        state leakage when parser instance is reused.
+        """
+        # Reset section state
+        self._reset_section_state()
+
+        # Reset parent class state
+        self.current_tool_name_sent = False
+        self.prev_tool_call_arr = []
+        self.current_tool_id = -1
+        self.streamed_args_for_tool = []
+
+        logger.debug("Streaming state reset")
 
     def extract_tool_calls(
         self,
         model_output: str,
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
-
         # sanity check; avoid unnecessary processing
         if self.tool_calls_start_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         else:
             try:
@@ -85,26 +164,26 @@ class KimiK2ToolParser(ToolParser):
                 # tag and end-of-string so the result of
                 # findall is an array of tuples where one is a function call and
                 # the other is None
-                function_call_tuples = self.tool_call_regex.findall(
-                    model_output)
+                function_call_tuples = self.tool_call_regex.findall(model_output)
 
                 logger.debug("function_call_tuples: %s", function_call_tuples)
 
                 tool_calls = []
                 for match in function_call_tuples:
                     function_id, function_args = match
-                    # function_id: functions.get_weather:0
-                    function_name = function_id.split('.')[1].split(':')[0]
+                    # function_id: functions.get_weather:0 or get_weather:0
+                    function_name = function_id.split(":")[0].split(".")[-1]
                     tool_calls.append(
                         ToolCall(
                             id=function_id,
-                            type='function',
-                            function=FunctionCall(name=function_name,
-                                                  arguments=function_args),
-                        ))
+                            type="function",
+                            function=FunctionCall(
+                                name=function_name, arguments=function_args
+                            ),
+                        )
+                    )
 
-                content = model_output[:model_output.
-                                       find(self.tool_calls_start_token)]
+                content = model_output[: model_output.find(self.tool_calls_start_token)]
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=tool_calls,
@@ -112,11 +191,10 @@ class KimiK2ToolParser(ToolParser):
                 )
 
             except Exception:
-                logger.exception(
-                    "Error in extracting tool call from response.")
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                logger.exception("Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
     def extract_tool_calls_streaming(
         self,
@@ -127,56 +205,150 @@ class KimiK2ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
-        # check to see if we should be streaming a tool call - is there a
-        if self.tool_calls_start_token_id not in current_token_ids:
+
+        # Flag to defer section exit until after tool parsing completes
+        deferred_section_exit = False
+
+        # Add delta to buffer for split marker detection
+        self.token_buffer += delta_text
+
+        # Enforce buffer size limit to prevent memory issues
+        if len(self.token_buffer) > self.buffer_max_size:
+            if not self._buffer_overflow_logged:
+                logger.warning(
+                    "Token buffer exceeded max size (%d bytes), flushing excess. "
+                    "This may indicate very long markers or unusual tokenization.",
+                    self.buffer_max_size,
+                )
+                self._buffer_overflow_logged = True
+            # Keep only the most recent content that might contain partial markers
+            self.token_buffer = self.token_buffer[-self.buffer_max_size // 2 :]
+
+        # Check buffer for section markers (handles split tokens)
+        buffered_text, found_section_begin, found_section_end = (
+            self._check_and_strip_markers(self.token_buffer)
+        )
+
+        # Track section state transitions
+        if found_section_begin and not self.in_tool_section:
+            logger.debug("Entering tool section")
+            self.in_tool_section = True
+            self.token_buffer = buffered_text  # Use cleaned buffer
+            self.section_char_count = 0  # Reset counter for new section
+        if found_section_end and self.in_tool_section:
+            logger.debug("Detected section end marker")
+            # CRITICAL: Don't exit early if tool_call_end is in this chunk.
+            # Tool parser must emit final arguments/close first to avoid dropping
+            # the final tool update and leaking tokens into reasoning channel.
+            has_tool_end = self.tool_call_end_token_id in delta_token_ids
+            if has_tool_end:
+                # Defer exit until after tool parsing completes
+                deferred_section_exit = True
+                logger.debug("Deferring section exit: tool_call_end in same chunk")
+                self.token_buffer = buffered_text
+            else:
+                # No tool call ending, safe to exit immediately
+                logger.debug("Exiting tool section")
+                remaining = buffered_text
+                self._reset_section_state()
+                # Return remaining text as reasoning content if non-empty
+                if remaining.strip():
+                    return DeltaMessage(content=remaining)
+                # Return empty delta to maintain function contract
+                # (always returns DeltaMessage)
+                return DeltaMessage(content="")
+        else:
+            self.token_buffer = buffered_text
+
+        # Check if any variant of section start token is in current_token_ids
+        has_section_token = any(
+            tid in current_token_ids for tid in self.tool_calls_start_token_ids
+        )
+
+        # Early return: if no section token detected yet, return as reasoning content
+        if not has_section_token and not self.in_tool_section:
             logger.debug("No tool call tokens found!")
+            # Don't clear buffer - it needs to accumulate partial markers across deltas
+            # Buffer overflow is already protected by lines 215-224
             return DeltaMessage(content=delta_text)
-        delta_text = delta_text.replace(self.tool_calls_start_token,
-                                        "").replace(self.tool_calls_end_token,
-                                                    "")
-        try:
 
+        # Strip section markers from delta_text for subsequent processing
+        # NOTE: This preprocessing happens BEFORE the regex-based tool call
+        # parsing (from PR #24847) to ensure markers are removed cleanly
+        # before pattern matching. No double-stripping occurs because
+        # section markers and tool call markers are distinct.
+        delta_text, _, _ = self._check_and_strip_markers(delta_text)
+
+        # Error recovery: If in tool section for too long, force exit
+        if self.in_tool_section:
+            self.section_char_count += len(delta_text)
+            if self.section_char_count > self.max_section_chars:
+                logger.warning(
+                    "Tool section exceeded max length (%d chars), forcing exit. "
+                    "This may indicate malformed model output.",
+                    self.max_section_chars,
+                )
+                self._reset_section_state()
+                # Deferred exit already handled by forced exit above
+                # Return remaining content as reasoning (or empty delta if no content)
+                return DeltaMessage(content=delta_text if delta_text.strip() else "")
+
+        try:
             # figure out where we are in the parsing by counting tool call
             # start & end tags
             prev_tool_start_count = previous_token_ids.count(
-                self.tool_call_start_token_id)
-            prev_tool_end_count = previous_token_ids.count(
-                self.tool_call_end_token_id)
+                self.tool_call_start_token_id
+            )
+            prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id)
             cur_tool_start_count = current_token_ids.count(
-                self.tool_call_start_token_id)
-            cur_tool_end_count = current_token_ids.count(
-                self.tool_call_end_token_id)
+                self.tool_call_start_token_id
+            )
+            cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id)
             tool_call_portion = None
             text_portion = None
 
             # case: if we're generating text, OR rounding out a tool call
-            if (cur_tool_start_count == cur_tool_end_count
-                    and prev_tool_end_count == cur_tool_end_count
-                    and self.tool_call_end_token not in delta_text):
+            if (
+                cur_tool_start_count == cur_tool_end_count
+                and prev_tool_end_count == cur_tool_end_count
+                and self.tool_call_end_token not in delta_text
+            ):
+                # CRITICAL FIX: Suppress content if in tool section but
+                # no tool calls started
+                if self.in_tool_section and cur_tool_start_count == 0:
+                    logger.debug(
+                        "In tool section but no tool calls started yet. "
+                        "Suppressing: %s",
+                        delta_text,
+                    )
+                    # Return empty delta to maintain iterator contract
+                    return DeltaMessage(content="")
                 logger.debug("Generating text content! skipping tool parsing.")
                 return DeltaMessage(content=delta_text)
 
             if self.tool_call_end_token in delta_text:
                 logger.debug("tool_call_end_token in delta_text")
                 full_text = current_text + delta_text
-                tool_call_portion = full_text.split(
-                    self.tool_call_start_token)[-1].split(
-                        self.tool_call_end_token)[0].rstrip()
-                delta_text = delta_text.split(
-                    self.tool_call_end_token)[0].rstrip()
-                text_portion = delta_text.split(
-                    self.tool_call_end_token)[-1].lstrip()
+                tool_call_portion = (
+                    full_text.split(self.tool_call_start_token)[-1]
+                    .split(self.tool_call_end_token)[0]
+                    .rstrip()
+                )
+                delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip()
 
             # case -- we're starting a new tool call
-            if (cur_tool_start_count > cur_tool_end_count
-                    and cur_tool_start_count > prev_tool_start_count):
+            if (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count > prev_tool_start_count
+            ):
                 if len(delta_token_ids) > 1:
-                    tool_call_portion = current_text.split(
-                        self.tool_call_start_token)[-1]
+                    tool_call_portion = current_text.split(self.tool_call_start_token)[
+                        -1
+                    ]
                 else:
                     tool_call_portion = None
                     delta = None
@@ -190,28 +362,36 @@ class KimiK2ToolParser(ToolParser):
                 logger.debug("Starting on a new tool %s", self.current_tool_id)
 
             # case -- we're updating an existing tool call
-            elif (cur_tool_start_count > cur_tool_end_count
-                  and cur_tool_start_count == prev_tool_start_count):
-
+            elif (
+                cur_tool_start_count > cur_tool_end_count
+                and cur_tool_start_count == prev_tool_start_count
+            ):
                 # get the portion of the text that's the tool call
-                tool_call_portion = current_text.split(
-                    self.tool_call_start_token)[-1]
+                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
                 text_portion = None
 
             # case -- the current tool call is being closed.
-            elif (cur_tool_start_count == cur_tool_end_count
-                  and cur_tool_end_count >= prev_tool_end_count):
-                if self.prev_tool_call_arr is None or len(
-                        self.prev_tool_call_arr) == 0:
-                    logger.debug(
-                        "attempting to close tool call, but no tool call")
+            elif (
+                cur_tool_start_count == cur_tool_end_count
+                and cur_tool_end_count >= prev_tool_end_count
+            ):
+                if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0:
+                    logger.debug("attempting to close tool call, but no tool call")
+                    # Handle deferred section exit before returning
+                    if deferred_section_exit and self.in_tool_section:
+                        self._reset_section_state()
                     return None
-                diff = self.prev_tool_call_arr[self.current_tool_id].get(
-                    "arguments")
+                diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments")
                 if diff:
-                    diff = (diff.encode("utf-8").decode("unicode_escape")
-                            if diff is str else diff)
+                    diff = (
+                        diff.encode("utf-8").decode("unicode_escape")
+                        if diff is str
+                        else diff
+                    )
                     if '"}' not in delta_text:
+                        # Handle deferred section exit before returning
+                        if deferred_section_exit and self.in_tool_section:
+                            self._reset_section_state()
                         return None
                     end_loc = delta_text.rindex('"}')
                     diff = delta_text[:end_loc] + '"}'
@@ -221,40 +401,57 @@ class KimiK2ToolParser(ToolParser):
                         diff,
                     )
                     self.streamed_args_for_tool[self.current_tool_id] += diff
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=diff).model_dump(exclude_none=True),
-                        )
-                    ])
+                    # Handle deferred section exit before returning
+                    if deferred_section_exit and self.in_tool_section:
+                        logger.debug("Completing deferred section exit")
+                        self._reset_section_state()
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(arguments=diff).model_dump(
+                                    exclude_none=True
+                                ),
+                            )
+                        ]
+                    )
 
             # case -- otherwise we're just generating text
             else:
+                # Check if we're in tool section - if so, suppress
+                if self.in_tool_section:
+                    logger.debug("In tool section, suppressing text generation")
+                    # Handle deferred section exit before returning
+                    if deferred_section_exit:
+                        self._reset_section_state()
+                    return DeltaMessage(content="")
                 text = delta_text.replace(self.tool_call_start_token, "")
                 text = text.replace(self.tool_call_end_token, "")
                 delta = DeltaMessage(tool_calls=[], content=text)
+                # Handle deferred section exit before returning
+                if deferred_section_exit and self.in_tool_section:
+                    self._reset_section_state()
                 return delta
 
             current_tool_call = dict()
             if tool_call_portion:
-                current_tool_call_matches = (
-                    self.stream_tool_call_portion_regex.match(
-                        tool_call_portion))
+                current_tool_call_matches = self.stream_tool_call_portion_regex.match(
+                    tool_call_portion
+                )
                 if current_tool_call_matches:
-                    tool_id, tool_args = (current_tool_call_matches.groups())
-                    tool_name = tool_id.split('.')[1].split(':')[0]
-                    current_tool_call['id'] = tool_id
+                    tool_id, tool_args = current_tool_call_matches.groups()
+                    tool_name = tool_id.split(":")[0].split(".")[-1]
+                    current_tool_call["id"] = tool_id
                     current_tool_call["name"] = tool_name
                     current_tool_call["arguments"] = tool_args
                 else:
                     current_tool_call_name_matches = (
-                        self.stream_tool_call_name_regex.match(
-                            tool_call_portion))
+                        self.stream_tool_call_name_regex.match(tool_call_portion)
+                    )
                     if current_tool_call_name_matches:
-                        tool_id_str, = current_tool_call_name_matches.groups()
-                        tool_name = tool_id_str.split('.')[1].split(':')[0]
-                        current_tool_call['id'] = tool_id_str
+                        (tool_id_str,) = current_tool_call_name_matches.groups()
+                        tool_name = tool_id_str.split(":")[0].split(".")[-1]
+                        current_tool_call["id"] = tool_id_str
                         current_tool_call["name"] = tool_name
                         current_tool_call["arguments"] = ""
                     else:
@@ -266,20 +463,22 @@ class KimiK2ToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 tool_id = current_tool_call.get("id")
                 if function_name:
                     self.current_tool_name_sent = True
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            type="function",
-                            id=tool_id,
-                            function=DeltaFunctionCall(
-                                name=function_name).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=tool_id,
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                 else:
                     return None
 
@@ -289,15 +488,19 @@ class KimiK2ToolParser(ToolParser):
             if tool_call_portion is None:
                 # if there's text but not tool calls, send that -
                 # otherwise None to skip chunk
-                delta = (DeltaMessage(
-                    content=delta_text) if text_portion is not None else None)
+                delta = (
+                    DeltaMessage(content=delta_text)
+                    if text_portion is not None
+                    else None
+                )
                 return delta
 
             # now, the nitty-gritty of tool calls
             # now we have the portion to parse as tool call.
 
-            logger.debug("Trying to parse current tool call with ID %s",
-                         self.current_tool_id)
+            logger.debug(
+                "Trying to parse current tool call with ID %s", self.current_tool_id
+            )
 
             # if we're starting a new tool call, push an empty object in as
             #   a placeholder for the arguments
@@ -307,7 +510,8 @@ class KimiK2ToolParser(ToolParser):
             # main logic for tool parsing here - compare prev. partially-parsed
             #   JSON to the current partially-parsed JSON
             prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
-                "arguments")
+                "arguments"
+            )
             cur_arguments = current_tool_call.get("arguments")
 
             logger.debug("diffing old arguments: %s", prev_arguments)
@@ -321,55 +525,64 @@ class KimiK2ToolParser(ToolParser):
             # case -- prev arguments are defined, but non are now.
             #   probably impossible, but not a fatal error - just keep going
             elif not cur_arguments and prev_arguments:
-                logger.error("should be impossible to have arguments reset "
-                             "mid-call. skipping streaming anything.")
+                logger.error(
+                    "should be impossible to have arguments reset "
+                    "mid-call. skipping streaming anything."
+                )
                 delta = None
 
             # case -- we now have the first info about arguments available from
             #   autocompleting the JSON
             elif cur_arguments and not prev_arguments:
-
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_id,
-                        function=DeltaFunctionCall(
-                            arguments=cur_arguments).model_dump(
-                                exclude_none=True),
-                    )
-                ])
-                self.streamed_args_for_tool[
-                    self.current_tool_id] = cur_arguments
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=cur_arguments
+                            ).model_dump(exclude_none=True),
+                        )
+                    ]
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
 
             # last case -- we have an update to existing arguments.
             elif cur_arguments and prev_arguments:
-                if (isinstance(delta_text, str)
-                        and cur_arguments != prev_arguments
-                        and len(cur_arguments) > len(prev_arguments)
-                        and cur_arguments.startswith(prev_arguments)):
-                    delta_arguments = cur_arguments[len(prev_arguments):]
+                if (
+                    isinstance(delta_text, str)
+                    and cur_arguments != prev_arguments
+                    and len(cur_arguments) > len(prev_arguments)
+                    and cur_arguments.startswith(prev_arguments)
+                ):
+                    delta_arguments = cur_arguments[len(prev_arguments) :]
                     logger.debug("got diff %s", delta_text)
 
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=delta_arguments).model_dump(
-                                    exclude_none=True),
-                        )
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] = cur_arguments
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=delta_arguments
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
                 else:
                     delta = None
 
             # handle saving the state for the current tool into
             # the "prev" list for use in diffing for the next iteration
             if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
-                self.prev_tool_call_arr[
-                    self.current_tool_id] = current_tool_call
+                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
             else:
                 self.prev_tool_call_arr.append(current_tool_call)
 
+            # Handle deferred section exit after tool parsing completes
+            if deferred_section_exit and self.in_tool_section:
+                logger.debug("Completing deferred section exit")
+                self._reset_section_state()
+
             return delta
 
         except Exception:
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 9a9a19ce2188eaa93e01ad6496074a66068f80b2..1d6de9244066e8a8e89f1a4443eab02f3360904e 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -3,19 +3,24 @@
 import ast
 import json
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
 import vllm.envs as envs
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -25,12 +30,12 @@ class _UnexpectedAstError(Exception):
     pass
 
 
-@ToolParserManager.register_module("llama4_pythonic")
 class Llama4PythonicToolParser(ToolParser):
     """
     Toolcall parser for Llama4 that produce tool calls in a pythonic style
     Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
     """
+
     # TODO(mdepinet): Possible future improvements:
     #   1. Support text + tools separated by either <|python_tag|> or \n\n
     #   2. Support tools outside of a list (or separated by a semicolon).
@@ -40,7 +45,8 @@ class Llama4PythonicToolParser(ToolParser):
 
     TOOL_CALL_REGEX = re.compile(
         r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
-        re.DOTALL)
+        re.DOTALL,
+    )
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase):
         super().__init__(tokenizer)
@@ -55,8 +61,8 @@ class Llama4PythonicToolParser(ToolParser):
         self.current_tool_id = value
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Extract the tool calls from a complete model response.
         """
@@ -64,46 +70,52 @@ class Llama4PythonicToolParser(ToolParser):
         # remove <|python_start|> and <|python_end|>
         # as Llama 4 model sometime will output those tokens
         if model_output.startswith("<|python_start|>"):
-            model_output = model_output[len("<|python_start|>"):]
+            model_output = model_output[len("<|python_start|>") :]
             model_output = model_output.replace("<|python_end|>", "")
 
         is_tool_call_pattern = False
         try:
-            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
-                model_output,
-                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+            is_tool_call_pattern = (
+                self.TOOL_CALL_REGEX.match(
+                    model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
+                )
+                is not None
+            )
         except TimeoutError:
-            logger.warning(
-                "Regex timeout occurred when matching tool call pattern.")
-            logger.debug("Regex timeout occurred when matching user input: %s",
-                         model_output)
+            logger.warning("Regex timeout occurred when matching tool call pattern.")
+            logger.debug(
+                "Regex timeout occurred when matching user input: %s", model_output
+            )
 
         if not is_tool_call_pattern:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         try:
             module = ast.parse(model_output)
             parsed = getattr(module.body[0], "value", None)
             if isinstance(parsed, ast.List) and all(
-                    isinstance(e, ast.Call) for e in parsed.elts):
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=[
                         _handle_single_tool(e)  # type: ignore
                         for e in parsed.elts
                     ],
-                    content=None)
+                    content=None,
+                )
             else:
                 raise _UnexpectedAstError(
-                    "Tool output must be a list of function calls")
+                    "Tool output must be a list of function calls"
+                )
         except Exception:
             logger.exception("Error in extracting tool call from response.")
             # Treat as regular text
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -114,19 +126,18 @@ class Llama4PythonicToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         if not current_text.startswith("[") and not current_text.startswith(
-                "<|python_start|>"):
+            "<|python_start|>"
+        ):
             return DeltaMessage(content=delta_text)
 
         try:
             # remove <|python_start|> and <|python_end|>
             if current_text.startswith("<|python_start|>"):
-                current_text = current_text[len("<|python_start|>"):]
+                current_text = current_text[len("<|python_start|>") :]
             if current_text.endswith("<|python_end|>"):
-                current_text = current_text[:current_text.
-                                            rfind("<|python_end|>")]
+                current_text = current_text[: current_text.rfind("<|python_end|>")]
             valid_and_added_text = _make_valid_python(current_text)
             if valid_and_added_text is None:
                 return None
@@ -135,9 +146,11 @@ class Llama4PythonicToolParser(ToolParser):
             module = ast.parse(valid_text)
             parsed = getattr(module.body[0], "value", None)
             if not isinstance(parsed, ast.List) or not all(
-                    isinstance(e, ast.Call) for e in parsed.elts):
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
                 raise _UnexpectedAstError(
-                    "Tool output must be a list of function calls")
+                    "Tool output must be a list of function calls"
+                )
             tool_calls = [
                 _handle_single_tool(e)  # type: ignore
                 for e in parsed.elts
@@ -152,34 +165,36 @@ class Llama4PythonicToolParser(ToolParser):
                 if len(self.streamed_args_for_tool) == index:
                     self.streamed_args_for_tool.append("")
 
-                new_call_complete = index < len(
-                    tool_calls) - 1 or ")]" not in added_text
+                new_call_complete = (
+                    index < len(tool_calls) - 1 or ")]" not in added_text
+                )
                 if new_call_complete:
                     self.current_tool_index += 1
 
-                withheld_suffix = (added_text[:-2]
-                                   if not new_call_complete else "")
+                withheld_suffix = added_text[:-2] if not new_call_complete else ""
                 if not new_call_complete and added_text[-2] == ")":
                     # Function call is incomplete. Withhold the closing bracket.
                     withheld_suffix = withheld_suffix + "}"
                 # Strings get single quotes in the model-produced string.
                 # JSON requires double quotes.
                 withheld_suffix = withheld_suffix.replace("'", '"')
-                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
-                                            new_call, index, withheld_suffix)
+                delta = _compute_tool_delta(
+                    self.streamed_args_for_tool[index], new_call, index, withheld_suffix
+                )
 
                 if delta is not None:
                     tool_deltas.append(delta)
-                    if (delta.function is not None
-                            and delta.function.arguments is not None):
-                        self.streamed_args_for_tool[
-                            index] += delta.function.arguments
-
-        # HACK: serving_chat.py inspects the internal state of tool parsers
-        # when determining its final streaming delta, automatically
-        # adding autocompleted JSON.
-        # These two lines avoid that nonsense while ensuring finish_reason
-        # is set to tool_calls when at least one tool is called.
+                    if (
+                        delta.function is not None
+                        and delta.function.arguments is not None
+                    ):
+                        self.streamed_args_for_tool[index] += delta.function.arguments
+
+            # HACK: serving_chat.py inspects the internal state of tool parsers
+            # when determining its final streaming delta, automatically
+            # adding autocompleted JSON.
+            # These two lines avoid that nonsense while ensuring finish_reason
+            # is set to tool_calls when at least one tool is called.
             if tool_deltas and not self.prev_tool_call_arr:
                 self.prev_tool_call_arr = [{"arguments": {}}]
 
@@ -188,14 +203,14 @@ class Llama4PythonicToolParser(ToolParser):
             elif not added_text and self.current_tool_id > 0:
                 # Return an empty DeltaMessage once the tool calls are all done
                 # so that finish_reason gets set.
-                return DeltaMessage(content='')
+                return DeltaMessage(content="")
             else:
                 return None
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
 
 
@@ -204,8 +219,7 @@ def _get_parameter_value(val: ast.expr) -> Any:
         return val.value
     elif isinstance(val, ast.Dict):
         if not all(isinstance(k, ast.Constant) for k in val.keys):
-            raise _UnexpectedAstError(
-                "Dict tool call arguments must have literal keys")
+            raise _UnexpectedAstError("Dict tool call arguments must have literal keys")
         return {
             k.value: _get_parameter_value(v)  # type: ignore
             for k, v in zip(val.keys, val.values)
@@ -223,12 +237,13 @@ def _handle_single_tool(call: ast.Call) -> ToolCall:
     arguments = {}
     for keyword in call.keywords:
         arguments[keyword.arg] = _get_parameter_value(keyword.value)
-    return ToolCall(type="function",
-                    function=FunctionCall(name=function_name,
-                                          arguments=json.dumps(arguments)))
+    return ToolCall(
+        type="function",
+        function=FunctionCall(name=function_name, arguments=json.dumps(arguments)),
+    )
 
 
-def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+def _make_valid_python(text: str) -> tuple[str, str] | None:
     bracket_stack = []
     for index, char in enumerate(text):
         if char in {"[", "(", "{"}:
@@ -261,21 +276,25 @@ def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
         # we can't fill in a valid value.
         return None
     if bracket_stack and bracket_stack[-1] == "{":
-        trailing_dict_text = text[:text.rfind("{")]
+        trailing_dict_text = text[: text.rfind("{")]
         num_keys = trailing_dict_text.count(":")
         num_values = trailing_dict_text.count(",")
         if num_keys <= num_values:
             return None  # Incomplete property name within parameter value
     if bracket_stack and bracket_stack[-1] == "(":
-        trailing_params_text = text[:text.rfind("(")]
+        trailing_params_text = text[: text.rfind("(")]
         num_full_param_names = trailing_params_text.count("=")
         num_full_param_values = trailing_params_text.count(",")
         if num_full_param_names <= num_full_param_values:
             return None  # Incomplete parameter name
     if text.endswith(","):
         text = text[:-1]
-    if bracket_stack and bracket_stack[-1] == "[" and not text.endswith(
-            "[") and not text.endswith(")"):
+    if (
+        bracket_stack
+        and bracket_stack[-1] == "["
+        and not text.endswith("[")
+        and not text.endswith(")")
+    ):
         return None  # Incomplete function name
 
     added_text = ""
@@ -294,23 +313,29 @@ def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
     return text + added_text, added_text
 
 
-def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall,
-                        index: int,
-                        withheld_suffix: str) -> Union[DeltaToolCall, None]:
+def _compute_tool_delta(
+    previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
+) -> DeltaToolCall | None:
     new_call_args = new_call.function.arguments
     if withheld_suffix:
         assert new_call_args.endswith(withheld_suffix)
-        new_call_args = new_call_args[:-len(withheld_suffix)]
+        new_call_args = new_call_args[: -len(withheld_suffix)]
     if not previously_sent_args:
-        return DeltaToolCall(id=new_call.id,
-                             type="function",
-                             index=index,
-                             function=DeltaFunctionCall(
-                                 name=new_call.function.name,
-                                 arguments=new_call_args,
-                             ))
-
-    arg_diff = new_call_args[len(previously_sent_args):]
-    return DeltaToolCall(
-        id=None, index=index, function=DeltaFunctionCall(
-            arguments=arg_diff)) if arg_diff else None
+        return DeltaToolCall(
+            id=new_call.id,
+            type="function",
+            index=index,
+            function=DeltaFunctionCall(
+                name=new_call.function.name,
+                arguments=new_call_args,
+            ),
+        )
+
+    arg_diff = new_call_args[len(previously_sent_args) :]
+    return (
+        DeltaToolCall(
+            id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff)
+        )
+        if arg_diff
+        else None
+    )
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 31b19c8db4163410dc363dda70b98d5da84ce471..02fc9b8a4d34e51515ae9142d30e12d001d9f261 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 import regex as re
@@ -11,29 +10,34 @@ from partial_json_parser.core.options import Allow
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix,
-                                                        is_complete_json,
-                                                        partial_json_loads)
+    ToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    find_common_prefix,
+    is_complete_json,
+    partial_json_loads,
+)
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("llama3_json")
-@ToolParserManager.register_module("llama4_json")
 class Llama3JsonToolParser(ToolParser):
     """
     Tool call parser for Llama 3.x and 4 models intended for use with the
     examples/tool_chat_template_llama.jinja template.
 
-    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or 
+    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or
     llama4_json are set.
     """
 
@@ -45,42 +49,45 @@ class Llama3JsonToolParser(ToolParser):
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: list[str] = [
-        ]  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
         self.bot_token = "<|python_tag|>"
-        self.bot_token_id = tokenizer.encode(self.bot_token,
-                                             add_special_tokens=False)[0]
+        self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[
+            0
+        ]
         # Updated regex to match multiple JSONs separated by semicolons
         # This pattern is more robust and can handle nested JSON objects
         self.tool_call_regex = re.compile(
-            r'{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*',
-            re.DOTALL)
+            r"{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*",
+            re.DOTALL,
+        )
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Extract the tool calls from a complete model response.
         Only extracts JSON content and ignores any surrounding plain text.
         Supports both single JSON and multiple JSONs separated by semicolons.
         """
         # Quick check before running regex
-        if not (self.bot_token in model_output or '{' in model_output):
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+        if not (self.bot_token in model_output or "{" in model_output):
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         # Find JSON object(s) in the text using regex
         match = self.tool_call_regex.search(model_output)
         if not match:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         try:
             json_str = match.group(0)
             # Split by semicolon and strip whitespace
-            json_objects = [obj.strip() for obj in json_str.split(';')]
+            json_objects = [obj.strip() for obj in json_str.split(";")]
 
             tool_calls: list[ToolCall] = []
             for json_obj in json_objects:
@@ -95,19 +102,24 @@ class Llama3JsonToolParser(ToolParser):
                             # function call args are JSON but as a string
                             arguments=json.dumps(
                                 obj["arguments"]
-                                if "arguments" in obj else obj["parameters"],
-                                ensure_ascii=False))))
-
-            return ExtractedToolCallInformation(tools_called=True,
-                                                tool_calls=tool_calls,
-                                                content=None)
+                                if "arguments" in obj
+                                else obj["parameters"],
+                                ensure_ascii=False,
+                            ),
+                        ),
+                    )
+                )
+
+            return ExtractedToolCallInformation(
+                tools_called=True, tool_calls=tool_calls, content=None
+            )
 
         except Exception:
             logger.exception("Error in extracting tool call from response.")
             # return information to just treat the tool call as regular JSON
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -118,48 +130,50 @@ class Llama3JsonToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
-        if not (current_text.startswith(self.bot_token)
-                or current_text.startswith('{')):
+    ) -> DeltaMessage | None:
+        if not (
+            current_text.startswith(self.bot_token) or current_text.startswith("{")
+        ):
             return DeltaMessage(content=delta_text)
 
         # bit mask flags for partial JSON parsing. If the name hasn't been
         # sent yet, don't allow sending
         # an incomplete string since OpenAI only ever (as far as I have
         # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent \
-            else Allow.ALL & ~Allow.STR
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
             tool_call_arr = []
             is_complete = []
             try:
                 # depending on the prompt format the Llama model may or may not
                 # prefix the output with the <|python_tag|> token
-                start_idx = len(self.bot_token) if current_text.startswith(
-                    self.bot_token) else 0
+                start_idx = (
+                    len(self.bot_token)
+                    if current_text.startswith(self.bot_token)
+                    else 0
+                )
                 while start_idx < len(current_text):
-                    (obj,
-                     end_idx) = partial_json_loads(current_text[start_idx:],
-                                                   flags)
+                    (obj, end_idx) = partial_json_loads(current_text[start_idx:], flags)
                     is_complete.append(
-                        is_complete_json(current_text[start_idx:start_idx +
-                                                      end_idx]))
-                    start_idx += end_idx + len('; ')
+                        is_complete_json(current_text[start_idx : start_idx + end_idx])
+                    )
+                    start_idx += end_idx + len("; ")
                     # depending on the prompt Llama can use
                     # either arguments or parameters
                     if "parameters" in obj:
-                        assert "arguments" not in obj, \
+                        assert "arguments" not in obj, (
                             "model generated both parameters and arguments"
+                        )
                         obj["arguments"] = obj["parameters"]
                     tool_call_arr.append(obj)
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
 
             # select as the current tool call the one we're on the state at
-            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
-                if len(tool_call_arr) > 0 else {}
+            current_tool_call: dict = (
+                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
+            )
 
             # case -- if no tokens have been streamed for the tool, e.g.
             #   only the array brackets, stream nothing
@@ -168,9 +182,9 @@ class Llama3JsonToolParser(ToolParser):
 
             # case: we are starting a new tool in the array
             #   -> array has > 0 length AND length has moved past cursor
-            elif (len(tool_call_arr) > 0
-                  and len(tool_call_arr) > self.current_tool_id + 1):
-
+            elif (
+                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
+            ):
                 # if we're moving on to a new call, first make sure we
                 # haven't missed anything in the previous one that was
                 # auto-generated due to JSON completions, but wasn't
@@ -178,21 +192,24 @@ class Llama3JsonToolParser(ToolParser):
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments,
-                                                   ensure_ascii=False)
-                        sent = len(
-                            self.streamed_args_for_tool[self.current_tool_id])
+                        cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                         argument_diff = cur_args_json[sent:]
 
                         logger.debug("got arguments diff: %s", argument_diff)
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=argument_diff).
-                                          model_dump(exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += argument_diff
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=argument_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            argument_diff
+                        )
                     else:
                         delta = None
                 else:
@@ -209,15 +226,18 @@ class Llama3JsonToolParser(ToolParser):
             elif not self.current_tool_name_sent:
                 function_name = current_tool_call.get("name")
                 if function_name:
-
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=make_tool_call_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=make_tool_call_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tool_name_sent = True
                 else:
                     delta = None
@@ -229,34 +249,35 @@ class Llama3JsonToolParser(ToolParser):
                 delta = None
 
                 if cur_arguments:
-                    sent = len(
-                        self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments,
-                                               ensure_ascii=False)
-                    prev_arguments = self.prev_tool_call_arr[
-                        self.current_tool_id].get("arguments")
+                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                        "arguments"
+                    )
 
                     argument_diff = None
                     if is_complete[self.current_tool_id]:
                         argument_diff = cur_args_json[sent:]
                     elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments,
-                                                    ensure_ascii=False)
+                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
                         if cur_args_json != prev_args_json:
-
-                            prefix = find_common_prefix(
-                                prev_args_json, cur_args_json)
+                            prefix = find_common_prefix(prev_args_json, cur_args_json)
                             argument_diff = prefix[sent:]
 
                     if argument_diff is not None:
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=argument_diff).
-                                          model_dump(exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += argument_diff
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=argument_diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            argument_diff
+                        )
 
             self.prev_tool_call_arr = tool_call_arr
             return delta
@@ -264,6 +285,6 @@ class Llama3JsonToolParser(ToolParser):
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
index 87a3fdc44397e8cd328ab7f81fd10545ac668028..c6c8ae8ae95f15710102cec08462ec50e204c5de 100644
--- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
@@ -3,16 +3,11 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
-    Hermes2ProToolParser)
+from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 
-@ToolParserManager.register_module("longcat")
 class LongcatFlashToolParser(Hermes2ProToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
@@ -21,12 +16,15 @@ class LongcatFlashToolParser(Hermes2ProToolParser):
 
         self.tool_call_regex = re.compile(
             r"<longcat_tool_call>(.*?)</longcat_tool_call>|<longcat_tool_call>(.*)",
-            re.DOTALL)
+            re.DOTALL,
+        )
 
         self.tool_call_start_token_ids = self.model_tokenizer.encode(
-            self.tool_call_start_token, add_special_tokens=False)
+            self.tool_call_start_token, add_special_tokens=False
+        )
         self.tool_call_end_token_ids = self.model_tokenizer.encode(
-            self.tool_call_end_token, add_special_tokens=False)
+            self.tool_call_end_token, add_special_tokens=False
+        )
 
         self.tool_call_start_token_array = [
             self.model_tokenizer.decode([token_id])
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c2258ba62b296b6c2dac90e24782a56cc3e3c8e
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
@@ -0,0 +1,643 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+import uuid
+from collections.abc import Sequence
+from typing import Any
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+class MinimaxM2ToolParser(ToolParser):
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        self.prev_tool_call_arr: list[dict] = []
+
+        # Sentinel tokens
+        self.tool_call_start_token: str = "<minimax:tool_call>"
+        self.tool_call_end_token: str = "</minimax:tool_call>"
+        self.invoke_start_prefix: str = "<invoke name="
+        self.invoke_end_token: str = "</invoke>"
+        self.parameter_prefix: str = "<parameter name="
+        self.parameter_end_token: str = "</parameter>"
+
+        # Streaming state variables
+        self.current_tool_name_sent: bool = False
+        # Override base class type - we use string IDs for tool calls
+        self.current_tool_id: str | None = None  # type: ignore
+        self.streamed_args_for_tool: list[str] = []
+        self.is_tool_call_started: bool = False
+        self.failed_count: int = 0
+
+        # Initialize streaming state variables
+        self.current_tool_index: int = 0
+        self.invoke_index: int = 0
+        self.header_sent: bool = False
+        self.current_function_name: str | None = None
+        self.current_param_name: str | None = None
+        self.current_param_value: str = ""
+        self.param_count: int = 0
+        self.in_param: bool = False
+        self.in_function: bool = False
+        self.accumulated_text: str = ""
+        self.json_started: bool = False
+        self.json_closed: bool = False
+        self.accumulated_params: dict = {}
+        self.streaming_request: ChatCompletionRequest | None = None
+
+        # Enhanced streaming state - reset for each new message
+        self._reset_streaming_state()
+
+        # Regex patterns for complete parsing
+        self.tool_call_complete_regex = re.compile(
+            r"<minimax:tool_call>(.*?)</minimax:tool_call>", re.DOTALL
+        )
+        self.invoke_complete_regex = re.compile(
+            r"<invoke name=(.*?)</invoke>", re.DOTALL
+        )
+        self.parameter_complete_regex = re.compile(
+            r"<parameter name=(.*?)</parameter>", re.DOTALL
+        )
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction."
+            )
+
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+
+        if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
+            raise RuntimeError(
+                "MiniMax M2 Tool parser could not locate tool call start/end "
+                "tokens in the tokenizer!"
+            )
+
+        logger.debug(
+            "vLLM Successfully import tool parser %s !", self.__class__.__name__
+        )
+
+    def _generate_tool_call_id(self) -> str:
+        """Generate a unique tool call ID."""
+        return f"call_{uuid.uuid4().hex[:24]}"
+
+    def _reset_streaming_state(self):
+        """Reset all streaming state."""
+        self.current_tool_index = 0
+        self.invoke_index = 0
+        self.is_tool_call_started = False
+        self.header_sent = False
+        self.current_tool_id = None
+        self.current_function_name = None
+        self.current_param_name = None
+        self.current_param_value = ""
+        self.param_count = 0
+        self.in_param = False
+        self.in_function = False
+        self.accumulated_text = ""
+        self.json_started = False
+        self.json_closed = False
+        # Store accumulated parameters for type conversion
+        self.accumulated_params = {}
+        self.streaming_request = None
+        # Clear previous tool call history to avoid state pollution
+        self.prev_tool_call_arr.clear()
+
+    def _extract_name(self, name_str: str) -> str:
+        """Extract name from quoted string."""
+        name_str = name_str.strip()
+        if (
+            name_str.startswith('"')
+            and name_str.endswith('"')
+            or name_str.startswith("'")
+            and name_str.endswith("'")
+        ):
+            return name_str[1:-1]
+        return name_str
+
+    def _convert_param_value(self, value: str, param_type: str) -> Any:
+        """Convert parameter value to the correct type."""
+        if value.lower() == "null":
+            return None
+
+        param_type = param_type.lower()
+        if param_type in ["string", "str", "text"]:
+            return value
+        elif param_type in ["integer", "int"]:
+            try:
+                return int(value)
+            except (ValueError, TypeError):
+                return value
+        elif param_type in ["number", "float"]:
+            try:
+                val = float(value)
+                return val if val != int(val) else int(val)
+            except (ValueError, TypeError):
+                return value
+        elif param_type in ["boolean", "bool"]:
+            return value.lower() in ["true", "1"]
+        elif param_type in ["object", "array"]:
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError:
+                return value
+        else:
+            # Try JSON parse first, fallback to string
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError:
+                return value
+
+    def _parse_single_invoke(
+        self, invoke_str: str, tools: list | None
+    ) -> ToolCall | None:
+        """Parse a single <invoke> block."""
+        # Extract function name
+        name_match = re.search(r"^([^>]+)", invoke_str)
+        if not name_match:
+            return None
+
+        function_name = self._extract_name(name_match.group(1))
+
+        # Get parameter configuration
+        param_config = {}
+        if tools:
+            for tool in tools:
+                if (
+                    hasattr(tool, "function")
+                    and tool.function.name == function_name
+                    and hasattr(tool.function, "parameters")
+                ):
+                    params = tool.function.parameters
+                    if isinstance(params, dict) and "properties" in params:
+                        param_config = params["properties"]
+                    break
+
+        # Extract parameters
+        param_dict = {}
+        for match in self.parameter_complete_regex.findall(invoke_str):
+            param_match = re.search(r"^([^>]+)>(.*)", match, re.DOTALL)
+            if param_match:
+                param_name = self._extract_name(param_match.group(1))
+                param_value = param_match.group(2).strip()
+                if param_value.startswith("\n"):
+                    param_value = param_value[1:]
+                if param_value.endswith("\n"):
+                    param_value = param_value[:-1]
+
+                # Get parameter type
+                param_type = "string"
+                if (
+                    param_name in param_config
+                    and isinstance(param_config[param_name], dict)
+                    and "type" in param_config[param_name]
+                ):
+                    param_type = param_config[param_name]["type"]
+
+                # Convert value
+                param_dict[param_name] = self._convert_param_value(
+                    param_value, param_type
+                )
+
+        return ToolCall(
+            type="function",
+            function=FunctionCall(
+                name=function_name,
+                arguments=json.dumps(param_dict, ensure_ascii=False),
+            ),
+        )
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """Extract tool calls from complete model output (non-streaming)."""
+        # Quick check
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+        try:
+            tool_calls = []
+
+            # Find all complete tool_call blocks
+            for tool_call_match in self.tool_call_complete_regex.findall(model_output):
+                # Find all invokes within this tool_call
+                for invoke_match in self.invoke_complete_regex.findall(tool_call_match):
+                    tool_call = self._parse_single_invoke(
+                        invoke_match, request.tools if request else None
+                    )
+                    if tool_call:
+                        tool_calls.append(tool_call)
+
+            if not tool_calls:
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
+
+            # Update prev_tool_call_arr
+            self.prev_tool_call_arr.clear()
+            for tool_call in tool_calls:
+                self.prev_tool_call_arr.append(
+                    {
+                        "name": tool_call.function.name,
+                        "arguments": tool_call.function.arguments,
+                    }
+                )
+
+            # Extract content before first tool call
+            first_tool_idx = model_output.find(self.tool_call_start_token)
+            content = model_output[:first_tool_idx] if first_tool_idx > 0 else None
+
+            return ExtractedToolCallInformation(
+                tools_called=True, tool_calls=tool_calls, content=content
+            )
+
+        except Exception:
+            logger.exception("Error extracting tool calls")
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],  # pylint: disable=unused-argument
+        current_token_ids: Sequence[int],  # pylint: disable=unused-argument
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        """Extract tool calls from streaming model output."""
+
+        # Store request for type conversion
+        if not previous_text or self.tool_call_start_token in delta_text:
+            self._reset_streaming_state()
+            self.streaming_request = request
+
+        # If no delta text, return None unless it's an EOS token after tools
+        if not delta_text:
+            # Check if this is an EOS token after all tool calls are complete
+            if delta_token_ids and self.tool_call_end_token_id not in delta_token_ids:
+                # Count complete tool calls
+                complete_calls = len(
+                    self.tool_call_complete_regex.findall(current_text)
+                )
+
+                # If we have completed tool calls and populated prev_tool_call_arr
+                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
+                    # Check if all tool calls are closed
+                    open_calls = current_text.count(
+                        self.tool_call_start_token
+                    ) - current_text.count(self.tool_call_end_token)
+                    if open_calls == 0:
+                        # Return empty delta for finish_reason processing
+                        return DeltaMessage(content="")
+                elif not self.is_tool_call_started and current_text:
+                    # This is a regular content response that's now complete
+                    return DeltaMessage(content="")
+            return None
+
+        # Update accumulated text
+        self.accumulated_text = current_text
+
+        # Check if we need to advance to next tool
+        if self.json_closed and not self.in_function:
+            # Check if this tool call has ended
+            invoke_ends = current_text.count(self.invoke_end_token)
+            if invoke_ends > self.current_tool_index:
+                # This tool has ended, advance to next
+                self.current_tool_index += 1
+                self.header_sent = False
+                self.param_count = 0
+                self.json_started = False
+                self.json_closed = False
+                self.in_function = False  # Now we can safely set this to False
+                self.accumulated_params = {}
+                # Continue processing next tool
+                return None
+
+        # Handle normal content before tool calls
+        if not self.is_tool_call_started:
+            # Check if tool call is starting
+            if (
+                self.tool_call_start_token_id in delta_token_ids
+                or self.tool_call_start_token in delta_text
+            ):
+                self.is_tool_call_started = True
+                # Return any content before the tool call
+                if self.tool_call_start_token in delta_text:
+                    content_before = delta_text[
+                        : delta_text.index(self.tool_call_start_token)
+                    ]
+                    if content_before:
+                        return DeltaMessage(content=content_before)
+                return None
+            else:
+                # Check if we're between tool calls - skip whitespace
+                if (
+                    current_text.rstrip().endswith(self.tool_call_end_token)
+                    and delta_text.strip() == ""
+                ):
+                    # We just ended a tool call, skip whitespace
+                    return None
+                # Normal content, no tool call
+                return DeltaMessage(content=delta_text)
+
+        # Check if we're between tool calls (waiting for next one)
+        invoke_starts_count = current_text.count(self.invoke_start_prefix)
+        if self.current_tool_index >= invoke_starts_count:
+            # We're past all tool calls, shouldn't be here
+            return None
+
+        # Find the current tool call portion
+        invoke_start_positions: list[int] = []
+        idx = 0
+        while True:
+            idx = current_text.find(self.invoke_start_prefix, idx)
+            if idx == -1:
+                break
+            invoke_start_positions.append(idx)
+            idx += len(self.invoke_start_prefix)
+
+        if self.current_tool_index >= len(invoke_start_positions):
+            # No more tool calls to process yet
+            return None
+
+        invoke_start_idx = invoke_start_positions[self.current_tool_index]
+        # Find where this tool call ends (or current position if not ended yet)
+        invoke_end_idx = current_text.find(self.invoke_end_token, invoke_start_idx)
+        if invoke_end_idx == -1:
+            tool_text = current_text[invoke_start_idx:]
+        else:
+            tool_text = current_text[
+                invoke_start_idx : invoke_end_idx + len(self.invoke_end_token)
+            ]
+
+        # Looking for function header
+        if not self.header_sent:
+            if self.invoke_start_prefix in tool_text:
+                func_start = tool_text.find(self.invoke_start_prefix) + len(
+                    self.invoke_start_prefix
+                )
+                # Find the end quote for the function name
+                func_end = tool_text.find(">", func_start)
+
+                if func_end != -1:
+                    # Found complete function name
+                    function_name_raw = tool_text[func_start:func_end]
+                    self.current_function_name = self._extract_name(function_name_raw)
+                    self.current_tool_id = self._generate_tool_call_id()
+                    self.header_sent = True
+                    self.in_function = True
+
+                    # Add to prev_tool_call_arr immediately when we detect a tool call
+                    # Each tool call should be recorded regardless of function name
+                    # Ensure we don't add the same tool call index multiple times
+                    if len(self.prev_tool_call_arr) <= self.current_tool_index:
+                        self.prev_tool_call_arr.append(
+                            {
+                                "name": self.current_function_name,
+                                "arguments": "{}",  # Placeholder, will be updated later
+                            }
+                        )
+
+                    # Send header with function info
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                id=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    name=self.current_function_name, arguments=""
+                                ),
+                                type="function",
+                            )
+                        ]
+                    )
+            return None
+
+        # We've sent header, now handle function body
+        if self.in_function:
+            # Send opening brace if not sent yet
+            if self.in_function and not self.json_started:
+                self.json_started = True
+                return DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            function=DeltaFunctionCall(arguments="{"),
+                        )
+                    ]
+                )
+
+            # Make sure json_started is set if we're processing parameters
+            if not self.json_started:
+                self.json_started = True
+
+            # Check for function end in accumulated text
+            if not self.json_closed and self.invoke_end_token in tool_text:
+                # Count total parameters in the tool text
+                total_param_count = tool_text.count(self.parameter_prefix)
+
+                # Only close JSON if all parameters have been processed
+                if self.param_count >= total_param_count:
+                    # Close JSON
+                    self.json_closed = True
+
+                    # Extract complete tool call
+                    # Find the invoke content
+                    invoke_start = tool_text.find(self.invoke_start_prefix) + len(
+                        self.invoke_start_prefix
+                    )
+                    invoke_content_end = tool_text.find(
+                        self.invoke_end_token, invoke_start
+                    )
+                    if invoke_content_end != -1:
+                        invoke_content = tool_text[invoke_start:invoke_content_end]
+                        # Parse to get the complete arguments
+                        try:
+                            parsed_tool = self._parse_single_invoke(
+                                invoke_content,
+                                self.streaming_request.tools
+                                if self.streaming_request
+                                else None,
+                            )
+                            if parsed_tool and self.current_tool_index < len(
+                                self.prev_tool_call_arr
+                            ):
+                                # Update existing entry in prev_tool_call_arr
+                                args = parsed_tool.function.arguments
+                                self.prev_tool_call_arr[self.current_tool_index][
+                                    "arguments"
+                                ] = args
+                        except Exception:
+                            pass  # Ignore parsing errors during streaming
+
+                    result = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                function=DeltaFunctionCall(arguments="}"),
+                            )
+                        ]
+                    )
+
+                    # Reset state for next tool
+                    self.json_closed = True
+                    self.in_function = False
+                    self.accumulated_params = {}
+
+                    logger.debug("[M2_STREAMING] Tool call completed")
+
+                    return result
+                else:
+                    # Don't close JSON yet, continue processing parameters
+                    return None
+
+            # Look for parameters
+            # Find all parameter starts
+            param_starts = []
+            idx = 0
+            while True:
+                idx = tool_text.find(self.parameter_prefix, idx)
+                if idx == -1:
+                    break
+                param_starts.append(idx)
+                idx += len(self.parameter_prefix)
+
+            # Check if we should start a new parameter
+            if (
+                not self.in_param
+                and self.param_count < len(param_starts)
+                and len(param_starts) > self.param_count
+            ):
+                # Process the next parameter
+                param_idx = param_starts[self.param_count]
+                param_start = param_idx + len(self.parameter_prefix)
+                remaining = tool_text[param_start:]
+
+                if ">" in remaining:
+                    # We have the complete parameter name
+                    name_end = remaining.find(">")
+                    param_name_raw = remaining[:name_end]
+                    self.current_param_name = self._extract_name(param_name_raw)
+
+                    # Find the parameter value
+                    value_start = param_start + name_end + 1
+                    value_text = tool_text[value_start:]
+                    if value_text.startswith("\n"):
+                        value_text = value_text[1:]
+
+                    # Find where this parameter ends
+                    param_end_idx = value_text.find(self.parameter_end_token)
+                    if param_end_idx == -1:
+                        # No closing tag, look for next parameter or function end
+                        next_param_idx = value_text.find(self.parameter_prefix)
+                        func_end_idx = value_text.find(self.invoke_end_token)
+
+                        if next_param_idx != -1 and (
+                            func_end_idx == -1 or next_param_idx < func_end_idx
+                        ):
+                            param_end_idx = next_param_idx
+                        elif func_end_idx != -1:
+                            param_end_idx = func_end_idx
+                        else:
+                            # Neither found, check if tool call is complete
+                            if self.invoke_end_token in tool_text:
+                                # Tool call and parameter is complete
+                                param_end_idx = len(value_text)
+                            else:
+                                # Still streaming, wait for more content
+                                return None
+
+                    if param_end_idx != -1:
+                        # Complete parameter found
+                        param_value = value_text[:param_end_idx]
+                        if param_value.endswith("\n"):
+                            param_value = param_value[:-1]
+
+                        # Store raw value for later processing
+                        self.accumulated_params[self.current_param_name] = param_value
+
+                        # Get parameter configuration for type conversion
+                        param_config = {}
+                        if self.streaming_request and self.streaming_request.tools:
+                            for tool in self.streaming_request.tools:
+                                if (
+                                    hasattr(tool, "function")
+                                    and tool.function.name == self.current_function_name
+                                    and hasattr(tool.function, "parameters")
+                                ):
+                                    params = tool.function.parameters
+                                    if (
+                                        isinstance(params, dict)
+                                        and "properties" in params
+                                    ):
+                                        param_config = params["properties"]
+                                    break
+
+                        # Get parameter type
+                        param_type = "string"
+                        if (
+                            self.current_param_name in param_config
+                            and isinstance(param_config[self.current_param_name], dict)
+                            and "type" in param_config[self.current_param_name]
+                        ):
+                            param_type = param_config[self.current_param_name]["type"]
+
+                        # Convert param value to appropriate type
+                        converted_value = self._convert_param_value(
+                            param_value, param_type
+                        )
+
+                        # Build JSON fragment based on the converted type
+                        # Use json.dumps to properly serialize the value
+                        serialized_value = json.dumps(
+                            converted_value, ensure_ascii=False
+                        )
+
+                        if self.param_count == 0:
+                            json_fragment = (
+                                f'"{self.current_param_name}": {serialized_value}'
+                            )
+                        else:
+                            json_fragment = (
+                                f', "{self.current_param_name}": {serialized_value}'
+                            )
+
+                        self.param_count += 1
+
+                        return DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_index,
+                                    function=DeltaFunctionCall(arguments=json_fragment),
+                                )
+                            ]
+                        )
+
+        return None
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
index 0fd62f0b6a7f134dec3e431f8d6a266bcaf1c42d..982518a52e3dab9c17db52daa010e99d1ec7d2a2 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -3,29 +3,31 @@
 
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.utils import (
-    extract_intermediate_diff)
+    ToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("minimax")
 class MinimaxToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
@@ -40,7 +42,8 @@ class MinimaxToolParser(ToolParser):
         self.tool_call_start_token = "<tool_calls>"
         self.tool_call_end_token = "</tool_calls>"
         self.tool_call_regex = re.compile(
-            r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL)
+            r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL
+        )
         self.thinking_tag_pattern = r"<think>(.*?)</think>"
         self.tool_name_pattern = re.compile(r'"name":\s*"([^"]+)"')
         self.tool_args_pattern = re.compile(r'"arguments":\s*')
@@ -52,50 +55,51 @@ class MinimaxToolParser(ToolParser):
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
         # Get token IDs for tool call start/end tokens
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
 
-        if (self.tool_call_start_token_id is None
-                or self.tool_call_end_token_id is None):
+        if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
             logger.warning(
                 "Minimax Tool parser could not locate tool call start/end "
-                "tokens in the tokenizer. Falling back to string matching.")
+                "tokens in the tokenizer. Falling back to string matching."
+            )
 
     def preprocess_model_output(self, model_output: str) -> str:
         """
         Preprocess model output by removing tool calls from thinking tags.
-        
+
         Args:
             model_output: Raw model output string
-            
+
         Returns:
             Preprocessed model output with tool calls removed from thinking tags
         """
 
         def remove_tool_calls_from_think(match):
             think_content = match.group(1)
-            cleaned_content = re.sub(r"<tool_calls>.*?</tool_calls>",
-                                     "",
-                                     think_content,
-                                     flags=re.DOTALL)
+            cleaned_content = re.sub(
+                r"<tool_calls>.*?</tool_calls>", "", think_content, flags=re.DOTALL
+            )
             return f"<think>{cleaned_content}</think>"
 
-        return re.sub(self.thinking_tag_pattern,
-                      remove_tool_calls_from_think,
-                      model_output,
-                      flags=re.DOTALL)
+        return re.sub(
+            self.thinking_tag_pattern,
+            remove_tool_calls_from_think,
+            model_output,
+            flags=re.DOTALL,
+        )
 
     def _clean_duplicate_braces(self, args_text: str) -> str:
         """
         Clean duplicate closing braces from arguments text.
-        
+
         Args:
             args_text: Raw arguments text
-            
+
         Returns:
             Cleaned arguments text with proper JSON formatting
         """
@@ -109,7 +113,7 @@ class MinimaxToolParser(ToolParser):
         except json.JSONDecodeError:
             pass
 
-        while args_text.endswith('}}'):
+        while args_text.endswith("}}"):
             candidate = args_text[:-1]
             try:
                 json.loads(candidate)
@@ -122,10 +126,10 @@ class MinimaxToolParser(ToolParser):
     def _clean_delta_braces(self, delta_text: str) -> str:
         """
         Clean delta text by removing excessive closing braces.
-        
+
         Args:
             delta_text: Delta text to clean
-            
+
         Returns:
             Cleaned delta text
         """
@@ -134,10 +138,10 @@ class MinimaxToolParser(ToolParser):
 
         delta_stripped = delta_text.strip()
 
-        if delta_stripped and all(c in '}\n\r\t ' for c in delta_stripped):
-            brace_count = delta_stripped.count('}')
+        if delta_stripped and all(c in "}\n\r\t " for c in delta_stripped):
+            brace_count = delta_stripped.count("}")
             if brace_count > 1:
-                return '}\n' if delta_text.endswith('\n') else '}'
+                return "}\n" if delta_text.endswith("\n") else "}"
 
         return delta_text
 
@@ -148,34 +152,32 @@ class MinimaxToolParser(ToolParser):
     ) -> ExtractedToolCallInformation:
         """
         Extract tool calls from model output for non-streaming mode.
-        
+
         Args:
             model_output: Complete model output
             request: Chat completion request
-            
+
         Returns:
             ExtractedToolCallInformation containing tool calls and content
         """
         processed_output = self.preprocess_model_output(model_output)
 
         if self.tool_call_start_token not in processed_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         try:
-            function_call_tuples = self.tool_call_regex.findall(
-                processed_output)
+            function_call_tuples = self.tool_call_regex.findall(processed_output)
 
             raw_function_calls = []
             for match in function_call_tuples:
                 tool_call_content = match[0] if match[0] else match[1]
                 if tool_call_content.strip():
-                    lines = tool_call_content.strip().split('\n')
+                    lines = tool_call_content.strip().split("\n")
                     for line in lines:
                         line = line.strip()
-                        if line and line.startswith('{') and line.endswith(
-                                '}'):
+                        if line and line.startswith("{") and line.endswith("}"):
                             try:
                                 parsed_call = json.loads(line)
                                 raw_function_calls.append(parsed_call)
@@ -186,25 +188,29 @@ class MinimaxToolParser(ToolParser):
             for function_call in raw_function_calls:
                 if "name" in function_call and "arguments" in function_call:
                     tool_calls.append(
-                        ToolCall(type="function",
-                                 function=FunctionCall(
-                                     name=function_call["name"],
-                                     arguments=json.dumps(
-                                         function_call["arguments"],
-                                         ensure_ascii=False))))
+                        ToolCall(
+                            type="function",
+                            function=FunctionCall(
+                                name=function_call["name"],
+                                arguments=json.dumps(
+                                    function_call["arguments"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
 
             processed_pos = processed_output.find(self.tool_call_start_token)
             if processed_pos != -1:
                 processed_content = processed_output[:processed_pos].strip()
 
                 if processed_content:
-                    lines = processed_content.split('\n')
+                    lines = processed_content.split("\n")
                     for line in reversed(lines):
                         line = line.strip()
                         if line:
                             pos = model_output.find(line)
                             if pos != -1:
-                                content = model_output[:pos + len(line)]
+                                content = model_output[: pos + len(line)]
                                 break
                     else:
                         content = ""
@@ -216,68 +222,74 @@ class MinimaxToolParser(ToolParser):
             return ExtractedToolCallInformation(
                 tools_called=len(tool_calls) > 0,
                 tool_calls=tool_calls,
-                content=content.strip() if content.strip() else None)
+                content=content.strip() if content.strip() else None,
+            )
 
         except Exception:
             logger.exception(
-                "An unexpected error occurred during tool call extraction.")
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+                "An unexpected error occurred during tool call extraction."
+            )
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def _update_thinking_state(self, text: str) -> None:
         """
         Update the thinking tag state based on text content.
-        
+
         Args:
             text: Text to analyze for thinking tags
         """
         open_count = text.count("<think>")
         close_count = text.count("</think>")
         self.in_thinking_tag = open_count > close_count or (
-            open_count == close_count and text.endswith("</think>"))
+            open_count == close_count and text.endswith("</think>")
+        )
 
     def _is_potential_tag_start(self, text: str) -> bool:
         """
         Check if text might be the start of a tool call tag.
-        
+
         Args:
             text: Text to check
-            
+
         Returns:
             True if text could be the start of a tool call tag
         """
         for tag in [self.tool_call_start_token, self.tool_call_end_token]:
             if any(
-                    tag.startswith(text[-i:])
-                    for i in range(1, min(len(text) + 1, len(tag)))):
+                tag.startswith(text[-i:])
+                for i in range(1, min(len(text) + 1, len(tag)))
+            ):
                 return True
         return False
 
     def _should_buffer_content(self, delta_text: str) -> bool:
         """
         Determine if content should be buffered for later processing.
-        
+
         Args:
             delta_text: Delta text to check
-            
+
         Returns:
             True if content should be buffered
         """
         if self.in_thinking_tag:
             return False
-        return bool(self.pending_buffer
-                    or self.tool_call_start_token in delta_text
-                    or self.tool_call_end_token in delta_text
-                    or delta_text.startswith('<'))
+        return bool(
+            self.pending_buffer
+            or self.tool_call_start_token in delta_text
+            or self.tool_call_end_token in delta_text
+            or delta_text.startswith("<")
+        )
 
     def _split_content_for_buffering(self, delta_text: str) -> tuple[str, str]:
         """
         Split delta text into safe content and potential tag content.
-        
+
         Args:
             delta_text: Delta text to split
-            
+
         Returns:
             Tuple of (safe_content, potential_tag_content)
         """
@@ -295,10 +307,10 @@ class MinimaxToolParser(ToolParser):
     def _process_buffer(self, new_content: str) -> str:
         """
         Process buffered content and return output content.
-        
+
         Args:
             new_content: New content to add to buffer
-            
+
         Returns:
             Processed output content
         """
@@ -326,7 +338,7 @@ class MinimaxToolParser(ToolParser):
                 break
 
             output_content += self.pending_buffer[:tag_pos]
-            self.pending_buffer = self.pending_buffer[tag_pos + tag_len:]
+            self.pending_buffer = self.pending_buffer[tag_pos + tag_len :]
 
         return output_content
 
@@ -340,13 +352,14 @@ class MinimaxToolParser(ToolParser):
 
     def _advance_to_next_tool(self) -> None:
         """Advance to the next tool in the streaming sequence."""
-        self.streaming_state["current_tool_index"] = int(
-            self.streaming_state["current_tool_index"]) + 1
+        self.streaming_state["current_tool_index"] = (
+            int(self.streaming_state["current_tool_index"]) + 1
+        )
 
     def _set_current_tool_index(self, index: int) -> None:
         """
         Set the current tool index.
-        
+
         Args:
             index: Tool index to set
         """
@@ -355,7 +368,7 @@ class MinimaxToolParser(ToolParser):
     def _get_current_tool_index(self) -> int:
         """
         Get the current tool index.
-        
+
         Returns:
             Current tool index
         """
@@ -364,10 +377,10 @@ class MinimaxToolParser(ToolParser):
     def _get_next_unsent_tool_index(self, tool_count: int) -> int:
         """
         Get the index of the next unsent tool.
-        
+
         Args:
             tool_count: Total number of tools
-            
+
         Returns:
             Index of next unsent tool, or -1 if all tools sent
         """
@@ -383,7 +396,7 @@ class MinimaxToolParser(ToolParser):
     def _ensure_state_arrays(self, tool_count: int) -> None:
         """
         Ensure state arrays have sufficient capacity for tool_count tools.
-        
+
         Args:
             tool_count: Number of tools to prepare for
         """
@@ -391,11 +404,13 @@ class MinimaxToolParser(ToolParser):
         tool_ids = list(self.streaming_state["tool_ids"])
 
         while len(sent_tools) < tool_count:
-            sent_tools.append({
-                "sent_name": False,
-                "sent_arguments": "",
-                "id": make_tool_call_id(),
-            })
+            sent_tools.append(
+                {
+                    "sent_name": False,
+                    "sent_arguments": "",
+                    "id": make_tool_call_id(),
+                }
+            )
 
         while len(tool_ids) < tool_count:
             tool_ids.append(None)
@@ -406,10 +421,10 @@ class MinimaxToolParser(ToolParser):
     def _detect_tools_in_text(self, text: str) -> int:
         """
         Detect the number of tools in text by counting name patterns.
-        
+
         Args:
             text: Text to analyze
-            
+
         Returns:
             Number of tools detected
         """
@@ -419,26 +434,26 @@ class MinimaxToolParser(ToolParser):
     def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
         """
         Find the boundaries of tool calls in text.
-        
+
         Args:
             text: Text to analyze
-            
+
         Returns:
             List of (start, end) positions for tool calls
         """
         boundaries = []
         i = 0
         while i < len(text):
-            if text[i] == '{':
+            if text[i] == "{":
                 start = i
                 depth = 0
                 has_name = False
                 has_arguments = False
 
                 while i < len(text):
-                    if text[i] == '{':
+                    if text[i] == "{":
                         depth += 1
-                    elif text[i] == '}':
+                    elif text[i] == "}":
                         depth -= 1
                         if depth == 0:
                             end = i + 1
@@ -447,10 +462,9 @@ class MinimaxToolParser(ToolParser):
                                 boundaries.append((start, end))
                             break
 
-                    if not has_name and '"name"' in text[start:i + 1]:
+                    if not has_name and '"name"' in text[start : i + 1]:
                         has_name = True
-                    if not has_arguments and '"arguments"' in text[start:i +
-                                                                   1]:
+                    if not has_arguments and '"arguments"' in text[start : i + 1]:
                         has_arguments = True
 
                     i += 1
@@ -461,47 +475,46 @@ class MinimaxToolParser(ToolParser):
                 i += 1
         return boundaries
 
-    def _extract_tool_args(self, tool_content: str,
-                           args_match: re.Match[str]) -> str:
+    def _extract_tool_args(self, tool_content: str, args_match: re.Match[str]) -> str:
         """
         Extract tool arguments from tool content.
-        
+
         Args:
             tool_content: Tool call content
             args_match: Regex match for arguments pattern
-            
+
         Returns:
             Extracted arguments as string
         """
         args_start_pos = args_match.end()
         remaining_content = tool_content[args_start_pos:]
 
-        if remaining_content.strip().startswith('{'):
+        if remaining_content.strip().startswith("{"):
             depth = 0
             for i, char in enumerate(remaining_content):
-                if char == '{':
+                if char == "{":
                     depth += 1
-                elif char == '}':
+                elif char == "}":
                     depth -= 1
                     if depth == 0:
-                        return remaining_content[:i + 1]
+                        return remaining_content[: i + 1]
         else:
-            args_end = remaining_content.find('}')
+            args_end = remaining_content.find("}")
             if args_end > 0:
                 return remaining_content[:args_end].strip()
 
-        return remaining_content.rstrip('}').strip()
+        return remaining_content.rstrip("}").strip()
 
     def _get_current_tool_content(
-            self, text: str,
-            tool_index: int) -> tuple[Optional[str], Optional[str]]:
+        self, text: str, tool_index: int
+    ) -> tuple[str | None, str | None]:
         """
         Get the content of a specific tool by index.
-        
+
         Args:
             text: Text containing tool calls
             tool_index: Index of tool to extract
-            
+
         Returns:
             Tuple of (tool_name, tool_arguments) or (None, None) if not found
         """
@@ -522,22 +535,22 @@ class MinimaxToolParser(ToolParser):
                 args_text = self._extract_tool_args(tool_content, args_match)
                 return name, args_text
             except Exception:
-                remaining_content = tool_content[args_match.end():]
-                args_text = remaining_content.rstrip('}').strip()
+                remaining_content = tool_content[args_match.end() :]
+                args_text = remaining_content.rstrip("}").strip()
                 return name, args_text
 
         return name, None
 
     def _handle_tool_name_streaming(
-            self, tool_content: str,
-            tool_count: int) -> Union[DeltaMessage, None]:
+        self, tool_content: str, tool_count: int
+    ) -> DeltaMessage | None:
         """
         Handle streaming of tool names.
-        
+
         Args:
             tool_content: Content containing tool calls
             tool_count: Total number of tools
-            
+
         Returns:
             DeltaMessage with tool name or None if no tool to stream
         """
@@ -565,24 +578,29 @@ class MinimaxToolParser(ToolParser):
         self.streaming_state["sent_tools"] = sent_tools
         self.streaming_state["tool_ids"] = tool_ids
 
-        return DeltaMessage(tool_calls=[
-            DeltaToolCall(index=next_idx,
-                          type="function",
-                          id=tool_id,
-                          function=DeltaFunctionCall(
-                              name=tool_name).model_dump(exclude_none=True))
-        ])
+        return DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=next_idx,
+                    type="function",
+                    id=tool_id,
+                    function=DeltaFunctionCall(name=tool_name).model_dump(
+                        exclude_none=True
+                    ),
+                )
+            ]
+        )
 
     def _handle_tool_args_streaming(
-            self, tool_content: str,
-            tool_count: int) -> Union[DeltaMessage, None]:
+        self, tool_content: str, tool_count: int
+    ) -> DeltaMessage | None:
         """
         Handle streaming of tool arguments.
-        
+
         Args:
             tool_content: Content containing tool calls
             tool_count: Total number of tools
-            
+
         Returns:
             DeltaMessage with tool arguments or None if no arguments to stream
         """
@@ -591,8 +609,7 @@ class MinimaxToolParser(ToolParser):
         if current_idx < 0 or current_idx >= tool_count:
             return None
 
-        tool_name, tool_args = self._get_current_tool_content(
-            tool_content, current_idx)
+        tool_name, tool_args = self._get_current_tool_content(tool_content, current_idx)
         if not tool_name or tool_args is None:
             return None
 
@@ -612,29 +629,37 @@ class MinimaxToolParser(ToolParser):
                     sent_tools[current_idx]["sent_arguments"] = clean_args
                     self.streaming_state["sent_tools"] = sent_tools
 
-                    if clean_args.endswith('}'):
+                    if clean_args.endswith("}"):
                         self._advance_to_next_tool()
 
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=current_idx,
-                                      function=DeltaFunctionCall(
-                                          arguments=args_delta).model_dump(
-                                              exclude_none=True))
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=current_idx,
+                                function=DeltaFunctionCall(
+                                    arguments=args_delta
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
             elif not sent_args and clean_args:
                 clean_args_delta = self._clean_delta_braces(clean_args)
                 sent_tools[current_idx]["sent_arguments"] = clean_args
                 self.streaming_state["sent_tools"] = sent_tools
 
-                if clean_args.endswith('}'):
+                if clean_args.endswith("}"):
                     self._advance_to_next_tool()
 
-                return DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=current_idx,
-                                  function=DeltaFunctionCall(
-                                      arguments=clean_args_delta).model_dump(
-                                          exclude_none=True))
-                ])
+                return DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=current_idx,
+                            function=DeltaFunctionCall(
+                                arguments=clean_args_delta
+                            ).model_dump(exclude_none=True),
+                        )
+                    ]
+                )
 
         return None
 
@@ -652,14 +677,15 @@ class MinimaxToolParser(ToolParser):
             search_start = pos + 1
 
         think_regions = []
-        for match in re.finditer(self.thinking_tag_pattern,
-                                 current_text,
-                                 flags=re.DOTALL):
+        for match in re.finditer(
+            self.thinking_tag_pattern, current_text, flags=re.DOTALL
+        ):
             think_regions.append((match.start(), match.end()))
 
         for pos in end_token_positions:
-            in_think = any(pos >= t_start and pos < t_end
-                           for t_start, t_end in think_regions)
+            in_think = any(
+                pos >= t_start and pos < t_end for t_start, t_end in think_regions
+            )
             if not in_think:
                 return True
 
@@ -674,7 +700,7 @@ class MinimaxToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         self._update_thinking_state(current_text)
 
         if self.in_thinking_tag:
@@ -682,14 +708,12 @@ class MinimaxToolParser(ToolParser):
 
         if self._should_buffer_content(delta_text):
             buffered_output = self._process_buffer(delta_text)
-            return DeltaMessage(
-                content=buffered_output) if buffered_output else None
+            return DeltaMessage(content=buffered_output) if buffered_output else None
 
         if self._is_end_tool_calls(current_text):
             return DeltaMessage(content=delta_text)
 
-        safe_content, potential_tag = self._split_content_for_buffering(
-            delta_text)
+        safe_content, potential_tag = self._split_content_for_buffering(delta_text)
         if potential_tag:
             self.pending_buffer += potential_tag
             return DeltaMessage(content=safe_content) if safe_content else None
@@ -697,35 +721,39 @@ class MinimaxToolParser(ToolParser):
         processed_current_text = self.preprocess_model_output(current_text)
 
         if self.tool_call_start_token not in processed_current_text:
-            if (self.tool_call_end_token in delta_text
-                    and self.tool_call_start_token in current_text):
+            if (
+                self.tool_call_end_token in delta_text
+                and self.tool_call_start_token in current_text
+            ):
                 return None
-            if delta_text.strip(
-            ) == '' and self.tool_call_start_token in current_text:
+            if delta_text.strip() == "" and self.tool_call_start_token in current_text:
                 return None
-            if (self._get_current_tool_index() != -1
-                    and self.tool_call_end_token in current_text):
+            if (
+                self._get_current_tool_index() != -1
+                and self.tool_call_end_token in current_text
+            ):
                 self._reset_streaming_state()
             return DeltaMessage(content=delta_text)
 
-        if (self.tool_call_start_token_id is not None
-                and self.tool_call_start_token_id in delta_token_ids
-                and len(delta_token_ids) == 1):
+        if (
+            self.tool_call_start_token_id is not None
+            and self.tool_call_start_token_id in delta_token_ids
+            and len(delta_token_ids) == 1
+        ):
             return None
 
-        original_tool_start = self._find_tool_start_outside_thinking(
-            current_text)
+        original_tool_start = self._find_tool_start_outside_thinking(current_text)
         if original_tool_start is None:
             return None
 
         content_before_tools = self._extract_content_before_tools(
-            current_text, delta_text, original_tool_start)
+            current_text, delta_text, original_tool_start
+        )
         if content_before_tools:
             return DeltaMessage(content=content_before_tools)
 
         try:
-            tool_content = self._extract_tool_content(current_text,
-                                                      original_tool_start)
+            tool_content = self._extract_tool_content(current_text, original_tool_start)
             current_tools_count = self._detect_tools_in_text(tool_content)
 
             if current_tools_count == 0:
@@ -736,24 +764,23 @@ class MinimaxToolParser(ToolParser):
 
             self._ensure_state_arrays(current_tools_count)
 
-            return (self._handle_tool_name_streaming(tool_content,
-                                                     current_tools_count)
-                    or self._handle_tool_args_streaming(
-                        tool_content, current_tools_count))
+            return self._handle_tool_name_streaming(
+                tool_content, current_tools_count
+            ) or self._handle_tool_args_streaming(tool_content, current_tools_count)
 
         except Exception:
-            logger.exception("An unexpected error occurred ",
-                             "during streaming tool call handling.")
+            logger.exception(
+                "An unexpected error occurred ", "during streaming tool call handling."
+            )
             return None
 
-    def _find_tool_start_outside_thinking(self,
-                                          current_text: str) -> Optional[int]:
+    def _find_tool_start_outside_thinking(self, current_text: str) -> int | None:
         """
         Find the start position of tool calls outside of thinking tags.
-        
+
         Args:
             current_text: Current text to search
-            
+
         Returns:
             Position of tool call start or None if not found
         """
@@ -763,26 +790,32 @@ class MinimaxToolParser(ToolParser):
             if pos == -1:
                 return None
 
-            think_regions = [(m.start(), m.end()) for m in re.finditer(
-                r"<think>(.*?)</think>", current_text, flags=re.DOTALL)]
-            in_think = any(pos >= t_start and pos < t_end
-                           for t_start, t_end in think_regions)
+            think_regions = [
+                (m.start(), m.end())
+                for m in re.finditer(
+                    r"<think>(.*?)</think>", current_text, flags=re.DOTALL
+                )
+            ]
+            in_think = any(
+                pos >= t_start and pos < t_end for t_start, t_end in think_regions
+            )
 
             if not in_think:
                 return pos
 
             search_start = pos + 1
 
-    def _extract_content_before_tools(self, current_text: str, delta_text: str,
-                                      tool_start: int) -> Optional[str]:
+    def _extract_content_before_tools(
+        self, current_text: str, delta_text: str, tool_start: int
+    ) -> str | None:
         """
         Extract content that appears before tool calls.
-        
+
         Args:
             current_text: Current text
             delta_text: Delta text
             tool_start: Start position of tools
-            
+
         Returns:
             Content before tools or None
         """
@@ -791,18 +824,18 @@ class MinimaxToolParser(ToolParser):
             if delta_start_pos < tool_start:
                 content_part = delta_text
                 if delta_start_pos + len(delta_text) > tool_start:
-                    content_part = delta_text[:tool_start - delta_start_pos]
+                    content_part = delta_text[: tool_start - delta_start_pos]
                 return content_part if content_part else None
         return None
 
     def _extract_tool_content(self, current_text: str, tool_start: int) -> str:
         """
         Extract tool content from current text starting at tool_start.
-        
+
         Args:
             current_text: Current text
             tool_start: Start position of tool calls
-            
+
         Returns:
             Extracted tool content
         """
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index e6b300fd84e94cafcbadf352111e205b9acab5ec..85671271522d3b616dc1326771fd431f0a615f87 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -5,22 +5,25 @@ import json
 from collections.abc import Sequence
 from random import choices
 from string import ascii_letters, digits
-from typing import Union
 
 import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow
 from pydantic import Field
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
-from vllm.entrypoints.openai.tool_parsers.utils import (
-    extract_intermediate_diff)
+    ToolParser,
+)
+from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 
@@ -30,8 +33,7 @@ ALPHANUMERIC = ascii_letters + digits
 
 
 class MistralToolCall(ToolCall):
-    id: str = Field(
-        default_factory=lambda: MistralToolCall.generate_random_id())
+    id: str = Field(default_factory=lambda: MistralToolCall.generate_random_id())
 
     @staticmethod
     def generate_random_id():
@@ -45,11 +47,11 @@ class MistralToolCall(ToolCall):
 
 
 def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
-    return isinstance(model_tokenizer, MistralTokenizer) \
-        and model_tokenizer.version >= 11
+    return (
+        isinstance(model_tokenizer, MistralTokenizer) and model_tokenizer.version >= 11
+    )
 
 
-@ToolParserManager.register_module("mistral")
 class MistralToolParser(ToolParser):
     """
     Tool call parser for Mistral 7B Instruct v0.3, intended for use with
@@ -63,35 +65,39 @@ class MistralToolParser(ToolParser):
         super().__init__(tokenizer)
 
         if not isinstance(self.model_tokenizer, MistralTokenizer):
-            logger.info("Non-Mistral tokenizer detected when using a Mistral "
-                        "model...")
+            logger.info("Non-Mistral tokenizer detected when using a Mistral model...")
 
         # initialize properties used for state when parsing tool calls in
         # streaming mode
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: list[str] = [
-        ]  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
         self.bot_token = "[TOOL_CALLS]"
         self.bot_token_id = self.vocab.get(self.bot_token)
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
         if _is_fn_name_regex_support(self.model_tokenizer):
             self.fn_name_regex = re.compile(
-                r'([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)', re.DOTALL)
+                r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)", re.DOTALL
+            )
         else:
             self.fn_name_regex = None
 
         if self.bot_token_id is None:
             raise RuntimeError(
                 "Mistral Tool Parser could not locate the tool call token in "
-                "the tokenizer!")
-
-    def adjust_request(
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
-        if not isinstance(
-                self.model_tokenizer, MistralTokenizer
-        ) and request.tools and request.tool_choice != 'none':
+                "the tokenizer!"
+            )
+
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        request = super().adjust_request(request)
+        if (
+            not isinstance(self.model_tokenizer, MistralTokenizer)
+            and request.tools
+            and request.tool_choice != "none"
+        ):
             # Do not skip special tokens when using chat template
             # with Mistral parser as TOOL_CALL token is needed
             # for tool detection.
@@ -113,9 +119,9 @@ class MistralToolParser(ToolParser):
 
         # case -- if a tool call token is not present, return a text response
         if self.bot_token not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         # first remove the BOT token
         tool_content = model_output.replace(self.bot_token, "").strip()
@@ -134,10 +140,9 @@ class MistralToolParser(ToolParser):
 
                         # fn_name is encoded outside serialized json dump
                         # only arguments are serialized
-                        function_call_arr.append({
-                            "name": fn_name,
-                            "arguments": json.loads(args)
-                        })
+                        function_call_arr.append(
+                            {"name": fn_name, "arguments": json.loads(args)}
+                        )
                 else:
                     function_call_arr = json.loads(tool_content)
             except json.JSONDecodeError:
@@ -155,8 +160,11 @@ class MistralToolParser(ToolParser):
                     function=FunctionCall(
                         name=raw_function_call["name"],
                         # function call args are JSON but as a string
-                        arguments=json.dumps(raw_function_call["arguments"],
-                                             ensure_ascii=False)))
+                        arguments=json.dumps(
+                            raw_function_call["arguments"], ensure_ascii=False
+                        ),
+                    ),
+                )
                 for raw_function_call in function_call_arr
             ]
 
@@ -165,14 +173,15 @@ class MistralToolParser(ToolParser):
             return ExtractedToolCallInformation(
                 tools_called=True,
                 tool_calls=tool_calls,
-                content=content if len(content) > 0 else None)
+                content=content if len(content) > 0 else None,
+            )
 
         except Exception:
             logger.exception("Error in extracting tool call from response.")
             # return information to just treat the tool call as regular JSON
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=tool_content)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=tool_content
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -183,8 +192,7 @@ class MistralToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         # if the tool call token is not in the tokens generated so far, append
         # output to contents since it's not a tool
         if self.bot_token not in current_text:
@@ -195,8 +203,7 @@ class MistralToolParser(ToolParser):
 
         # handle if we detected the BOT token which means the start of tool
         # calling
-        if (self.bot_token_id in delta_token_ids
-                and len(delta_token_ids) == 1):
+        if self.bot_token_id in delta_token_ids and len(delta_token_ids) == 1:
             # if it's the only token, return None, so we don't send a chat
             # completion any don't send a control token
             return None
@@ -205,10 +212,8 @@ class MistralToolParser(ToolParser):
         # sent yet, don't allow sending
         # an incomplete string since OpenAI only ever (as far as I have
         # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent \
-            else Allow.ALL & ~Allow.STR
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
-
             # replace BOT token with empty string, and convert single quotes
             # to double to allow parsing as JSON since mistral uses single
             # quotes instead of double for tool calls
@@ -218,15 +223,17 @@ class MistralToolParser(ToolParser):
             # parsing on the entire array
             try:
                 tool_call_arr: list[dict] = partial_json_parser.loads(
-                    parsable_arr, flags)
+                    parsable_arr, flags
+                )
             except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug('not enough tokens to parse into JSON yet')
+                logger.debug("not enough tokens to parse into JSON yet")
                 return None
 
             # select as the current tool call the one we're on the state at
 
-            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
-                if len(tool_call_arr) > 0 else {}
+            current_tool_call: dict = (
+                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
+            )
 
             # case -- if no tokens have been streamed for the tool, e.g.
             #   only the array brackets, stream nothing
@@ -235,28 +242,31 @@ class MistralToolParser(ToolParser):
 
             # case: we are starting a new tool in the array
             #   -> array has > 0 length AND length has moved past cursor
-            elif (len(tool_call_arr) > 0
-                  and len(tool_call_arr) > self.current_tool_id + 1):
-
+            elif (
+                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
+            ):
                 # if we're moving on to a new call, first make sure we
                 # haven't missed anything in the previous one that was
                 # auto-generated due to JSON completions, but wasn't
                 # streamed to the client yet.
                 if self.current_tool_id >= 0:
-                    diff: Union[str, None] = current_tool_call.get("arguments")
+                    diff: str | None = current_tool_call.get("arguments")
 
                     if diff:
                         diff = json.dumps(diff, ensure_ascii=False).replace(
-                            self.streamed_args_for_tool[self.current_tool_id],
-                            "")
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=diff).model_dump(
-                                                  exclude_none=True))
-                        ])
-                        self.streamed_args_for_tool[
-                            self.current_tool_id] += diff
+                            self.streamed_args_for_tool[self.current_tool_id], ""
+                        )
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=diff
+                                    ).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        self.streamed_args_for_tool[self.current_tool_id] += diff
                     else:
                         delta = None
                 else:
@@ -275,15 +285,18 @@ class MistralToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 function_name = current_tool_call.get("name")
                 if function_name:
-
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=MistralToolCall.generate_random_id(),
-                                      function=DeltaFunctionCall(
-                                          name=function_name).model_dump(
-                                              exclude_none=True))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=MistralToolCall.generate_random_id(),
+                                function=DeltaFunctionCall(
+                                    name=function_name
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
                     self.current_tool_name_sent = True
                 else:
                     delta = None
@@ -291,64 +304,72 @@ class MistralToolParser(ToolParser):
             # now we know we're on the same tool call and we're streaming
             # arguments
             else:
-
-                prev_arguments = self.prev_tool_call_arr[
-                    self.current_tool_id].get("arguments")
+                prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                    "arguments"
+                )
                 cur_arguments = current_tool_call.get("arguments")
 
-                new_text = delta_text.replace("\'", "\"")
-                if ('"}' in new_text):
-                    new_text = new_text[:new_text.rindex('"}')]
+                new_text = delta_text.replace("'", '"')
+                if '"}' in new_text:
+                    new_text = new_text[: new_text.rindex('"}')]
 
                 if not cur_arguments and not prev_arguments:
-
                     delta = None
                 elif not cur_arguments and prev_arguments:
                     logger.error(
-                        "INVARIANT - impossible to have arguments reset "
-                        "mid-arguments")
+                        "INVARIANT - impossible to have arguments reset mid-arguments"
+                    )
                     delta = None
                 elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments,
-                                                    ensure_ascii=False)[:-2]
-                    logger.debug("finding %s in %s", new_text,
-                                 cur_arguments_json)
+                    cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False)[
+                        :-2
+                    ]
+                    logger.debug("finding %s in %s", new_text, cur_arguments_json)
 
-                    if (new_text not in cur_arguments_json):
+                    if new_text not in cur_arguments_json:
                         return None
-                    arguments_delta = cur_arguments_json[:cur_arguments_json.
-                                                         rindex(new_text) +
-                                                         len(new_text)]
-                    logger.debug("First tokens in arguments received: %s",
-                                 arguments_delta)
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=arguments_delta).
-                                      model_dump(exclude_none=True))
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] += arguments_delta
+                    arguments_delta = cur_arguments_json[
+                        : cur_arguments_json.rindex(new_text) + len(new_text)
+                    ]
+                    logger.debug(
+                        "First tokens in arguments received: %s", arguments_delta
+                    )
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=arguments_delta
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
 
                 elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments,
-                                               ensure_ascii=False)
-                    prev_args_json = json.dumps(prev_arguments,
-                                                ensure_ascii=False)
-                    logger.debug("Searching for diff between \n%s\n%s",
-                                 cur_args_json, prev_args_json)
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
+                    logger.debug(
+                        "Searching for diff between \n%s\n%s",
+                        cur_args_json,
+                        prev_args_json,
+                    )
 
                     argument_diff = extract_intermediate_diff(
-                        cur_args_json, prev_args_json)
+                        cur_args_json, prev_args_json
+                    )
                     logger.debug("got arguments diff: %s", argument_diff)
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      function=DeltaFunctionCall(
-                                          arguments=argument_diff).model_dump(
-                                              exclude_none=True))
-                    ])
-                    self.streamed_args_for_tool[
-                        self.current_tool_id] += argument_diff
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    arguments=argument_diff
+                                ).model_dump(exclude_none=True),
+                            )
+                        ]
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += argument_diff
                 else:
                     # try parsing it with regular JSON - if it works we're
                     # at the end, and we need to send the difference between
@@ -364,6 +385,6 @@ class MistralToolParser(ToolParser):
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
diff --git a/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..baff33bd7e8ac17a4bdeb723a988d6c8f6674004
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
@@ -0,0 +1,366 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any
+
+import regex as re
+from transformers import PreTrainedTokenizerBase
+
+import vllm.envs as envs
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class _UnexpectedAstError(Exception):
+    pass
+
+
+class Olmo3PythonicToolParser(ToolParser):
+    """
+    Tool call parser for Olmo 3 models that produce tool calls as
+    newline-separated pythonic strings.
+    Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set
+    Code copied from pythonic_tool_parser.py and updated to handle
+    - newline separated pythonic tool calls.
+    - argument values being null/true/false instead of Pythonic literals.
+    """
+
+    # TODO(mdepinet): Possible future improvements:
+    #   1. Support text + tools separated by either <|python_tag|> or \n\n
+    #   2. Support tools outside of a list (or separated by a semicolon).
+    #      This depends on item 1 for consistent streaming.
+    # Neither of these are necessary for e.g. ToolACE, but both would help make
+    # Llama3.2 models more reliable.
+
+    TOOL_CALL_REGEX = re.compile(
+        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+        re.DOTALL,
+    )
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+    # Rename for readability. This is NOT a tool id.
+    @property
+    def current_tool_index(self) -> int:
+        return self.current_tool_id
+
+    @current_tool_index.setter
+    def current_tool_index(self, value: int) -> None:
+        self.current_tool_id = value
+
+    def extract_tool_calls(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+        original_model_output = model_output
+        # Remove xml tags.
+        match = re.search(
+            r"<function_calls>(.*?)</function_calls>", model_output, re.DOTALL
+        )
+        if match:
+            model_output = match.group(1).strip()
+        # Make the newline separated function calls into a list.
+        model_output = ", ".join(
+            [line.strip() for line in model_output.splitlines() if line.strip()]
+        )
+        model_output = f"[{model_output}]"
+
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = (
+                self.TOOL_CALL_REGEX.match(
+                    model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
+                )
+                is not None
+            )
+        except TimeoutError:
+            logger.warning("Regex timeout occurred when matching tool call pattern.")
+            logger.debug(
+                "Regex timeout occurred when matching user input: %s", model_output
+            )
+
+        if not is_tool_call_pattern:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=original_model_output
+            )
+
+        try:
+            module = ast.parse(model_output)
+            parsed = getattr(module.body[0], "value", None)
+            if isinstance(parsed, ast.List) and all(
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=[
+                        _handle_single_tool(e)  # type: ignore
+                        for e in parsed.elts
+                    ],
+                    content=None,
+                )
+            else:
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls"
+                )
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # Treat as regular text
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=original_model_output
+            )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        # All function calls start with the <function_calls> tag.
+        # But since this is streaming, we may have seen only part of the tag.
+        if not current_text.startswith("<"):
+            return DeltaMessage(content=delta_text)
+
+        try:
+            # Remove xml tags.
+            if current_text.startswith("<function_calls>"):
+                current_text = current_text[len("<function_calls>") :]
+            if current_text.endswith("</function_calls>"):
+                current_text = current_text[: -len("</function_calls>")]
+
+            valid_and_added_text = _make_valid_python(current_text)
+            if valid_and_added_text is None:
+                return None
+            valid_text, added_text = valid_and_added_text
+
+            # Make the newline separated function calls into a list.
+            valid_text = ", ".join(
+                [line.strip() for line in valid_text.splitlines() if line.strip()]
+            )
+            valid_text = f"[{valid_text}]"
+            module = ast.parse(valid_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not isinstance(parsed, ast.List) or not all(
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
+                raise _UnexpectedAstError(
+                    "Tool output must be a sequence of newline-separated calls"
+                )
+            tool_calls = [
+                _handle_single_tool(e)  # type: ignore
+                for e in parsed.elts
+            ]
+
+            tool_deltas = []
+            for index, new_call in enumerate(tool_calls):
+                if index < self.current_tool_index:
+                    continue
+
+                self.current_tool_index = index
+                if len(self.streamed_args_for_tool) == index:
+                    self.streamed_args_for_tool.append("")
+
+                new_call_complete = index < len(tool_calls) - 1 or ")" not in added_text
+                if new_call_complete:
+                    self.current_tool_index += 1
+
+                withheld_suffix = added_text[:-1] if not new_call_complete else ""
+                if not new_call_complete and added_text[-1] == ")":
+                    # Function call is incomplete. Withhold the closing bracket.
+                    withheld_suffix = withheld_suffix + "}"
+                # Strings get single quotes in the model-produced string.
+                # JSON requires double quotes.
+                withheld_suffix = withheld_suffix.replace("'", '"')
+                delta = _compute_tool_delta(
+                    self.streamed_args_for_tool[index], new_call, index, withheld_suffix
+                )
+
+                if delta is not None:
+                    tool_deltas.append(delta)
+                    if (
+                        delta.function is not None
+                        and delta.function.arguments is not None
+                    ):
+                        self.streamed_args_for_tool[index] += delta.function.arguments
+
+            # HACK: serving_chat.py inspects the internal state of tool parsers
+            # when determining its final streaming delta, automatically
+            # adding autocompleted JSON.
+            # These two lines avoid that nonsense while ensuring finish_reason
+            # is set to tool_calls when at least one tool is called.
+            if tool_deltas and not self.prev_tool_call_arr:
+                self.prev_tool_call_arr = [{"arguments": {}}]
+
+            if tool_deltas:
+                return DeltaMessage(tool_calls=tool_deltas)
+            elif not added_text and self.current_tool_id > 0:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage(content="")
+            else:
+                return None
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
+            return None
+
+
+def _get_parameter_value(val: ast.expr) -> Any:
+    if isinstance(val, ast.Constant):
+        return val.value
+    elif isinstance(val, ast.Dict):
+        if not all(isinstance(k, ast.Constant) for k in val.keys):
+            raise _UnexpectedAstError("Dict tool call arguments must have literal keys")
+        return {
+            k.value: _get_parameter_value(v)  # type: ignore
+            for k, v in zip(val.keys, val.values)
+        }
+    elif isinstance(val, ast.List):
+        return [_get_parameter_value(v) for v in val.elts]
+    # The model may return function calls where the values are null/true/false
+    # because the system prompt has API description in json.
+    elif isinstance(val, ast.Name) and val.id in ["null", "true", "false"]:
+        if val.id == "null":
+            return None
+        elif val.id == "true":
+            return True
+        elif val.id == "false":
+            return False
+    else:
+        raise _UnexpectedAstError("Tool call arguments must be literals")
+
+
+def _handle_single_tool(call: ast.Call) -> ToolCall:
+    if not isinstance(call.func, ast.Name):
+        raise _UnexpectedAstError("Invalid tool call name")
+    function_name = call.func.id
+    arguments = {}
+    for keyword in call.keywords:
+        arguments[keyword.arg] = _get_parameter_value(keyword.value)
+    return ToolCall(
+        type="function",
+        function=FunctionCall(
+            name=function_name, arguments=json.dumps(arguments, ensure_ascii=False)
+        ),
+    )
+
+
+def _make_valid_python(text: str) -> tuple[str, str] | None:
+    bracket_stack = []
+    for index, char in enumerate(text):
+        if char in {"[", "(", "{"}:
+            bracket_stack.append(char)
+        elif char == "]":
+            if not bracket_stack or bracket_stack.pop() != "[":
+                raise _UnexpectedAstError("Mismatched square brackets")
+        elif char == ")":
+            if not bracket_stack or bracket_stack.pop() != "(":
+                raise _UnexpectedAstError("Mismatched parentheses")
+        elif char == "}":
+            if not bracket_stack or bracket_stack.pop() != "{":
+                raise _UnexpectedAstError("Mismatched curly braces")
+        elif char in {"'", '"'}:
+            if bracket_stack and bracket_stack[-1] == char:
+                if index > 0 and text[index - 1] == "\\":
+                    # Treat an escaped quote as a regular character
+                    pass
+                else:
+                    bracket_stack.pop()
+            elif bracket_stack and bracket_stack[-1] in {"'", '"'}:
+                # Double quote within a single quote string or vice versa.
+                pass
+            else:
+                bracket_stack.append(char)
+
+    text = text.rstrip()
+    if text.endswith("=") or text.endswith(":"):
+        # Since we have no type information for this property/parameter value,
+        # we can't fill in a valid value.
+        return None
+    if bracket_stack and bracket_stack[-1] == "{":
+        trailing_dict_text = text[: text.rfind("{")]
+        num_keys = trailing_dict_text.count(":")
+        num_values = trailing_dict_text.count(",")
+        if num_keys <= num_values:
+            return None  # Incomplete property name within parameter value
+    if bracket_stack and bracket_stack[-1] == "(":
+        trailing_params_text = text[: text.rfind("(")]
+        num_full_param_names = trailing_params_text.count("=")
+        num_full_param_values = trailing_params_text.count(",")
+        if num_full_param_names <= num_full_param_values:
+            return None  # Incomplete parameter name
+    if text.endswith(","):
+        text = text[:-1]
+    if (
+        bracket_stack
+        and bracket_stack[-1] == "["
+        and not text.endswith("[")
+        and not text.endswith(")")
+    ):
+        return None  # Incomplete function name
+
+    added_text = ""
+    for char in reversed(bracket_stack):
+        if char == "[":
+            added_text += "]"
+        elif char == "(":
+            added_text += ")"
+        elif char == "{":
+            added_text += "}"
+        elif char == "'":
+            added_text += "'"
+        elif char == '"':
+            added_text += '"'
+
+    return text + added_text, added_text
+
+
+def _compute_tool_delta(
+    previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
+) -> DeltaToolCall | None:
+    new_call_args = new_call.function.arguments
+    if withheld_suffix:
+        assert new_call_args.endswith(withheld_suffix)
+        new_call_args = new_call_args[: -len(withheld_suffix)]
+    if not previously_sent_args:
+        return DeltaToolCall(
+            id=new_call.id,
+            type="function",
+            index=index,
+            function=DeltaFunctionCall(
+                name=new_call.function.name,
+                arguments=new_call_args,
+            ),
+        )
+
+    arg_diff = new_call_args[len(previously_sent_args) :]
+    return (
+        DeltaToolCall(
+            id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff)
+        )
+        if arg_diff
+        else None
+    )
diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
index 1729fdbc9971062cf701e78d0c2641d750a49414..d1b36a297e0b12d16fee132144fb8fb8e7c6d768 100644
--- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
@@ -1,30 +1,32 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import json
 from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from vllm.entrypoints.harmony_utils import parse_output_into_messages
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
     from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    AnyTokenizer = object
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("openai")
 class OpenAIToolParser(ToolParser):
-
-    def __init__(self, tokenizer: AnyTokenizer):
+    def __init__(self, tokenizer: "AnyTokenizer"):
         super().__init__(tokenizer)
 
     def extract_tool_calls(
@@ -57,7 +59,8 @@ class OpenAIToolParser(ToolParser):
                             tool_args = json.dumps(json.loads(msg_text))
                         except json.JSONDecodeError:
                             logger.exception(
-                                "Error decoding JSON tool call from response.")
+                                "Error decoding JSON tool call from response."
+                            )
                             tool_args = msg_text
                     else:
                         tool_args = msg_text
@@ -68,7 +71,8 @@ class OpenAIToolParser(ToolParser):
                                 name=msg.recipient.split("functions.")[1],
                                 arguments=tool_args,
                             ),
-                        ))
+                        )
+                    )
                 elif msg.channel == "final":
                     final_content = msg_text
 
diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index 85dd56213c6ac505a6c6004b7dc1d4ee0e9f0e64..acb25ea2768e19ef31b02a6a07447ac9e9752890 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -3,30 +3,33 @@
 
 import json
 from collections.abc import Sequence
-from typing import Any, Optional
+from typing import Any
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("phi4_mini_json")
 class Phi4MiniJsonToolParser(ToolParser):
     """
     Tool call parser for phi-4-mini models intended for use with the
     examples/tool_chat_template_llama.jinja template.
 
-    Used when --enable-auto-tool-choice --tool-call-parser phi4_mini_json  
+    Used when --enable-auto-tool-choice --tool-call-parser phi4_mini_json
     are all set
     """
 
@@ -38,39 +41,42 @@ class Phi4MiniJsonToolParser(ToolParser):
         self.prev_tool_call_arr: list[dict[str, Any]] = []
         self.current_tool_id: int = -1
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: list[str] = [
-        ]  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[
+            str
+        ] = []  # map what has been streamed for each tool so far to a list
         self.bot_token: str = "functools"
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Extract the tool calls from a complete model response.
         """
         logger.debug("Model output: %s", model_output)
 
-        pattern = r'functools\[(.*?)\]'
+        pattern = r"functools\[(.*?)\]"
         matches = re.search(pattern, model_output, re.DOTALL)
 
         if not matches:
             logger.debug("No function calls found")
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         try:
             function_call_arr: list[dict[str, Any]] = []
             try:
-                json_content = '[' + matches.group(1) + ']'
+                json_content = "[" + matches.group(1) + "]"
 
                 function_call_arr = json.loads(json_content)
-                logger.debug("Successfully extracted %d function calls",
-                             len(function_call_arr))
+                logger.debug(
+                    "Successfully extracted %d function calls", len(function_call_arr)
+                )
             except json.JSONDecodeError as e:
                 logger.error(
-                    "Failed to parse function calls from model output. "
-                    "Error: %s", str(e))
+                    "Failed to parse function calls from model output. Error: %s",
+                    str(e),
+                )
 
             tool_calls: list[ToolCall] = [
                 ToolCall(
@@ -81,22 +87,25 @@ class Phi4MiniJsonToolParser(ToolParser):
                         # function call args are JSON but as a string
                         arguments=json.dumps(
                             raw_function_call["arguments"]
-                            if "arguments" in raw_function_call else
-                            raw_function_call["parameters"],
-                            ensure_ascii=False),
-                    )) for raw_function_call in function_call_arr
+                            if "arguments" in raw_function_call
+                            else raw_function_call["parameters"],
+                            ensure_ascii=False,
+                        ),
+                    ),
+                )
+                for raw_function_call in function_call_arr
             ]
 
             # get any content before the tool call
-            ret = ExtractedToolCallInformation(tools_called=True,
-                                               tool_calls=tool_calls,
-                                               content=None)
+            ret = ExtractedToolCallInformation(
+                tools_called=True, tool_calls=tool_calls, content=None
+            )
             return ret
 
         except Exception:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -107,6 +116,5 @@ class Phi4MiniJsonToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Optional[DeltaMessage]:
-
+    ) -> DeltaMessage | None:
         return None
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 992f141bef0f2b905ebf12655ef968c0a109dafb..abeb923b932274f107eb1272beb966d0179eb05b 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -4,19 +4,24 @@
 import ast
 import json
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
 import vllm.envs as envs
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -26,7 +31,6 @@ class _UnexpectedAstError(Exception):
     pass
 
 
-@ToolParserManager.register_module("pythonic")
 class PythonicToolParser(ToolParser):
     """
     Tool call parser for models that produce tool calls in a pythonic style,
@@ -34,6 +38,7 @@ class PythonicToolParser(ToolParser):
 
     Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set
     """
+
     # TODO(mdepinet): Possible future improvements:
     #   1. Support text + tools separated by either <|python_tag|> or \n\n
     #   2. Support tools outside of a list (or separated by a semicolon).
@@ -43,7 +48,8 @@ class PythonicToolParser(ToolParser):
 
     TOOL_CALL_REGEX = re.compile(
         r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
-        re.DOTALL)
+        re.DOTALL,
+    )
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase):
         super().__init__(tokenizer)
@@ -58,48 +64,54 @@ class PythonicToolParser(ToolParser):
         self.current_tool_id = value
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Extract the tool calls from a complete model response.
         """
         is_tool_call_pattern = False
         try:
-            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
-                model_output,
-                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+            is_tool_call_pattern = (
+                self.TOOL_CALL_REGEX.match(
+                    model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
+                )
+                is not None
+            )
         except TimeoutError:
-            logger.warning(
-                "Regex timeout occurred when matching tool call pattern.")
-            logger.debug("Regex timeout occurred when matching user input: %s",
-                         model_output)
+            logger.warning("Regex timeout occurred when matching tool call pattern.")
+            logger.debug(
+                "Regex timeout occurred when matching user input: %s", model_output
+            )
 
         if not is_tool_call_pattern:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         try:
             module = ast.parse(model_output)
             parsed = getattr(module.body[0], "value", None)
             if isinstance(parsed, ast.List) and all(
-                    isinstance(e, ast.Call) for e in parsed.elts):
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
                 return ExtractedToolCallInformation(
                     tools_called=True,
                     tool_calls=[
                         _handle_single_tool(e)  # type: ignore
                         for e in parsed.elts
                     ],
-                    content=None)
+                    content=None,
+                )
             else:
                 raise _UnexpectedAstError(
-                    "Tool output must be a list of function calls")
+                    "Tool output must be a list of function calls"
+                )
         except Exception:
             logger.exception("Error in extracting tool call from response.")
             # Treat as regular text
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -110,8 +122,7 @@ class PythonicToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         if not current_text.startswith("["):
             return DeltaMessage(content=delta_text)
 
@@ -124,9 +135,11 @@ class PythonicToolParser(ToolParser):
             module = ast.parse(valid_text)
             parsed = getattr(module.body[0], "value", None)
             if not isinstance(parsed, ast.List) or not all(
-                    isinstance(e, ast.Call) for e in parsed.elts):
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
                 raise _UnexpectedAstError(
-                    "Tool output must be a list of function calls")
+                    "Tool output must be a list of function calls"
+                )
             tool_calls = [
                 _handle_single_tool(e)  # type: ignore
                 for e in parsed.elts
@@ -141,28 +154,30 @@ class PythonicToolParser(ToolParser):
                 if len(self.streamed_args_for_tool) == index:
                     self.streamed_args_for_tool.append("")
 
-                new_call_complete = index < len(
-                    tool_calls) - 1 or ")]" not in added_text
+                new_call_complete = (
+                    index < len(tool_calls) - 1 or ")]" not in added_text
+                )
                 if new_call_complete:
                     self.current_tool_index += 1
 
-                withheld_suffix = (added_text[:-2]
-                                   if not new_call_complete else "")
+                withheld_suffix = added_text[:-2] if not new_call_complete else ""
                 if not new_call_complete and added_text[-2] == ")":
                     # Function call is incomplete. Withhold the closing bracket.
                     withheld_suffix = withheld_suffix + "}"
                 # Strings get single quotes in the model-produced string.
                 # JSON requires double quotes.
                 withheld_suffix = withheld_suffix.replace("'", '"')
-                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
-                                            new_call, index, withheld_suffix)
+                delta = _compute_tool_delta(
+                    self.streamed_args_for_tool[index], new_call, index, withheld_suffix
+                )
 
                 if delta is not None:
                     tool_deltas.append(delta)
-                    if (delta.function is not None
-                            and delta.function.arguments is not None):
-                        self.streamed_args_for_tool[
-                            index] += delta.function.arguments
+                    if (
+                        delta.function is not None
+                        and delta.function.arguments is not None
+                    ):
+                        self.streamed_args_for_tool[index] += delta.function.arguments
 
             # HACK: serving_chat.py inspects the internal state of tool parsers
             # when determining its final streaming delta, automatically
@@ -177,14 +192,14 @@ class PythonicToolParser(ToolParser):
             elif not added_text and self.current_tool_id > 0:
                 # Return an empty DeltaMessage once the tool calls are all done
                 # so that finish_reason gets set.
-                return DeltaMessage(content='')
+                return DeltaMessage(content="")
             else:
                 return None
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
             logger.debug(
-                "Skipping chunk as a result of tool streaming extraction "
-                "error")
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
             return None
 
 
@@ -193,8 +208,7 @@ def _get_parameter_value(val: ast.expr) -> Any:
         return val.value
     elif isinstance(val, ast.Dict):
         if not all(isinstance(k, ast.Constant) for k in val.keys):
-            raise _UnexpectedAstError(
-                "Dict tool call arguments must have literal keys")
+            raise _UnexpectedAstError("Dict tool call arguments must have literal keys")
         return {
             k.value: _get_parameter_value(v)  # type: ignore
             for k, v in zip(val.keys, val.values)
@@ -214,13 +228,13 @@ def _handle_single_tool(call: ast.Call) -> ToolCall:
         arguments[keyword.arg] = _get_parameter_value(keyword.value)
     return ToolCall(
         type="function",
-        function=FunctionCall(name=function_name,
-                              arguments=json.dumps(arguments,
-                                                   ensure_ascii=False)),
+        function=FunctionCall(
+            name=function_name, arguments=json.dumps(arguments, ensure_ascii=False)
+        ),
     )
 
 
-def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+def _make_valid_python(text: str) -> tuple[str, str] | None:
     bracket_stack = []
     for index, char in enumerate(text):
         if char in {"[", "(", "{"}:
@@ -253,21 +267,25 @@ def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
         # we can't fill in a valid value.
         return None
     if bracket_stack and bracket_stack[-1] == "{":
-        trailing_dict_text = text[:text.rfind("{")]
+        trailing_dict_text = text[: text.rfind("{")]
         num_keys = trailing_dict_text.count(":")
         num_values = trailing_dict_text.count(",")
         if num_keys <= num_values:
             return None  # Incomplete property name within parameter value
     if bracket_stack and bracket_stack[-1] == "(":
-        trailing_params_text = text[:text.rfind("(")]
+        trailing_params_text = text[: text.rfind("(")]
         num_full_param_names = trailing_params_text.count("=")
         num_full_param_values = trailing_params_text.count(",")
         if num_full_param_names <= num_full_param_values:
             return None  # Incomplete parameter name
     if text.endswith(","):
         text = text[:-1]
-    if bracket_stack and bracket_stack[-1] == "[" and not text.endswith(
-            "[") and not text.endswith(")"):
+    if (
+        bracket_stack
+        and bracket_stack[-1] == "["
+        and not text.endswith("[")
+        and not text.endswith(")")
+    ):
         return None  # Incomplete function name
 
     added_text = ""
@@ -286,23 +304,29 @@ def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
     return text + added_text, added_text
 
 
-def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall,
-                        index: int,
-                        withheld_suffix: str) -> Union[DeltaToolCall, None]:
+def _compute_tool_delta(
+    previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
+) -> DeltaToolCall | None:
     new_call_args = new_call.function.arguments
     if withheld_suffix:
         assert new_call_args.endswith(withheld_suffix)
-        new_call_args = new_call_args[:-len(withheld_suffix)]
+        new_call_args = new_call_args[: -len(withheld_suffix)]
     if not previously_sent_args:
-        return DeltaToolCall(id=new_call.id,
-                             type="function",
-                             index=index,
-                             function=DeltaFunctionCall(
-                                 name=new_call.function.name,
-                                 arguments=new_call_args,
-                             ))
-
-    arg_diff = new_call_args[len(previously_sent_args):]
-    return DeltaToolCall(
-        id=None, index=index, function=DeltaFunctionCall(
-            arguments=arg_diff)) if arg_diff else None
+        return DeltaToolCall(
+            id=new_call.id,
+            type="function",
+            index=index,
+            function=DeltaFunctionCall(
+                name=new_call.function.name,
+                arguments=new_call_args,
+            ),
+        )
+
+    arg_diff = new_call_args[len(previously_sent_args) :]
+    return (
+        DeltaToolCall(
+            id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff)
+        )
+        if arg_diff
+        else None
+    )
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
index 955813ddd340861884a20306ce34df69fe838c7f..26261c0065ead4a569cbe897c1df6d6c858453c5 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
@@ -4,34 +4,37 @@ import ast
 import json
 import uuid
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("qwen3_coder")
 class Qwen3CoderToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         # Override base class type - we use string IDs for tool calls
-        self.current_tool_id: Optional[str] = None  # type: ignore
+        self.current_tool_id: str | None = None  # type: ignore
         self.streamed_args_for_tool: list[str] = []
 
         # Sentinel tokens for streaming mode
@@ -49,32 +52,37 @@ class Qwen3CoderToolParser(ToolParser):
 
         # Regex patterns
         self.tool_call_complete_regex = re.compile(
-            r"<tool_call>(.*?)</tool_call>", re.DOTALL)
+            r"<tool_call>(.*?)</tool_call>", re.DOTALL
+        )
         self.tool_call_regex = re.compile(
-            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL)
+            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
+        )
         self.tool_call_function_regex = re.compile(
-            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
+            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
+        )
         self.tool_call_parameter_regex = re.compile(
             r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
-            re.DOTALL)
+            re.DOTALL,
+        )
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
 
-        if (self.tool_call_start_token_id is None
-                or self.tool_call_end_token_id is None):
+        if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
             raise RuntimeError(
                 "Qwen3 XML Tool parser could not locate tool call start/end "
-                "tokens in the tokenizer!")
+                "tokens in the tokenizer!"
+            )
 
-        logger.info("vLLM Successfully import tool parser %s !",
-                    self.__class__.__name__)
+        logger.info(
+            "vLLM Successfully import tool parser %s !", self.__class__.__name__
+        )
 
     def _generate_tool_call_id(self) -> str:
         """Generate a unique tool call ID."""
@@ -100,14 +108,15 @@ class Qwen3CoderToolParser(ToolParser):
         self.streaming_request = None
 
     def _get_arguments_config(
-            self, func_name: str,
-            tools: Optional[list[ChatCompletionToolsParam]]) -> dict:
+        self, func_name: str, tools: list[ChatCompletionToolsParam] | None
+    ) -> dict:
         """Extract argument configuration for a function."""
         if tools is None:
             return {}
         for config in tools:
-            if not hasattr(config, "type") or not (hasattr(
-                    config, "function") and hasattr(config.function, "name")):
+            if not hasattr(config, "type") or not (
+                hasattr(config, "function") and hasattr(config.function, "name")
+            ):
                 continue
             if config.type == "function" and config.function.name == func_name:
                 if not hasattr(config.function, "parameters"):
@@ -119,12 +128,12 @@ class Qwen3CoderToolParser(ToolParser):
                     return params
                 else:
                     return {}
-        logger.warning("Tool '%s' is not defined in the tools list.",
-                       func_name)
+        logger.warning("Tool '%s' is not defined in the tools list.", func_name)
         return {}
 
-    def _convert_param_value(self, param_value: str, param_name: str,
-                             param_config: dict, func_name: str) -> Any:
+    def _convert_param_value(
+        self, param_value: str, param_name: str, param_config: dict, func_name: str
+    ) -> Any:
         """Convert parameter value based on its type in the schema."""
         # Handle null value for any type
         if param_value.lower() == "null":
@@ -135,38 +144,55 @@ class Qwen3CoderToolParser(ToolParser):
                 logger.warning(
                     "Parsed parameter '%s' is not defined in the tool "
                     "parameters for tool '%s', directly returning the "
-                    "string value.", param_name, func_name)
+                    "string value.",
+                    param_name,
+                    func_name,
+                )
             return param_value
 
-        if isinstance(param_config[param_name],
-                      dict) and "type" in param_config[param_name]:
+        if (
+            isinstance(param_config[param_name], dict)
+            and "type" in param_config[param_name]
+        ):
             param_type = str(param_config[param_name]["type"]).strip().lower()
         else:
             param_type = "string"
         if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
             return param_value
-        elif param_type.startswith("int") or param_type.startswith(
-                "uint") or param_type.startswith(
-                    "long") or param_type.startswith(
-                        "short") or param_type.startswith("unsigned"):
+        elif (
+            param_type.startswith("int")
+            or param_type.startswith("uint")
+            or param_type.startswith("long")
+            or param_type.startswith("short")
+            or param_type.startswith("unsigned")
+        ):
             try:
                 return int(param_value)
             except (ValueError, TypeError):
                 logger.warning(
                     "Parsed value '%s' of parameter '%s' is not an "
                     "integer in tool '%s', degenerating to string.",
-                    param_value, param_name, func_name)
+                    param_value,
+                    param_name,
+                    func_name,
+                )
                 return param_value
         elif param_type.startswith("num") or param_type.startswith("float"):
             try:
                 float_param_value = float(param_value)
-                return float_param_value if float_param_value - int(
-                    float_param_value) != 0 else int(float_param_value)
+                return (
+                    float_param_value
+                    if float_param_value - int(float_param_value) != 0
+                    else int(float_param_value)
+                )
             except (ValueError, TypeError):
                 logger.warning(
                     "Parsed value '%s' of parameter '%s' is not a float "
-                    "in tool '%s', degenerating to string.", param_value,
-                    param_name, func_name)
+                    "in tool '%s', degenerating to string.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
                 return param_value
         elif param_type in ["boolean", "bool", "binary"]:
             param_value = param_value.lower()
@@ -174,12 +200,18 @@ class Qwen3CoderToolParser(ToolParser):
                 logger.warning(
                     "Parsed value '%s' of parameter '%s' is not a boolean "
                     "(`true` or `false`) in tool '%s', degenerating to "
-                    "false.", param_value, param_name, func_name)
+                    "false.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
             return param_value == "true"
         else:
-            if param_type in ["object", "array", "arr"
-                              ] or param_type.startswith(
-                                  "dict") or param_type.startswith("list"):
+            if (
+                param_type in ["object", "array", "arr"]
+                or param_type.startswith("dict")
+                or param_type.startswith("list")
+            ):
                 try:
                     param_value = json.loads(param_value)
                     return param_value
@@ -187,33 +219,37 @@ class Qwen3CoderToolParser(ToolParser):
                     logger.warning(
                         "Parsed value '%s' of parameter '%s' cannot be "
                         "parsed with json.loads in tool '%s', will try "
-                        "other methods to parse it.", param_value, param_name,
-                        func_name)
+                        "other methods to parse it.",
+                        param_value,
+                        param_name,
+                        func_name,
+                    )
             try:
                 param_value = ast.literal_eval(param_value)  # safer
             except (ValueError, SyntaxError, TypeError):
                 logger.warning(
                     "Parsed value '%s' of parameter '%s' cannot be "
                     "converted via Python `ast.literal_eval()` in tool "
-                    "'%s', degenerating to string.", param_value, param_name,
-                    func_name)
+                    "'%s', degenerating to string.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
             return param_value
 
     def _parse_xml_function_call(
-            self, function_call_str: str,
-            tools: Optional[list[ChatCompletionToolsParam]]
-    ) -> Optional[ToolCall]:
-
+        self, function_call_str: str, tools: list[ChatCompletionToolsParam] | None
+    ) -> ToolCall | None:
         # Extract function name
         end_index = function_call_str.index(">")
         function_name = function_call_str[:end_index]
         param_config = self._get_arguments_config(function_name, tools)
-        parameters = function_call_str[end_index + 1:]
+        parameters = function_call_str[end_index + 1 :]
         param_dict = {}
         for match_text in self.tool_call_parameter_regex.findall(parameters):
             idx = match_text.index(">")
             param_name = match_text[:idx]
-            param_value = str(match_text[idx + 1:])
+            param_value = str(match_text[idx + 1 :])
             # Remove prefix and trailing \n
             if param_value.startswith("\n"):
                 param_value = param_value[1:]
@@ -221,12 +257,13 @@ class Qwen3CoderToolParser(ToolParser):
                 param_value = param_value[:-1]
 
             param_dict[param_name] = self._convert_param_value(
-                param_value, param_name, param_config, function_name)
+                param_value, param_name, param_config, function_name
+            )
         return ToolCall(
             type="function",
-            function=FunctionCall(name=function_name,
-                                  arguments=json.dumps(param_dict,
-                                                       ensure_ascii=False)),
+            function=FunctionCall(
+                name=function_name, arguments=json.dumps(param_dict, ensure_ascii=False)
+            ),
         )
 
     def _get_function_calls(self, model_output: str) -> list[str]:
@@ -242,8 +279,7 @@ class Qwen3CoderToolParser(ToolParser):
 
         raw_function_calls = []
         for tool_call in raw_tool_calls:
-            raw_function_calls.extend(
-                self.tool_call_function_regex.findall(tool_call))
+            raw_function_calls.extend(self.tool_call_function_regex.findall(tool_call))
 
         function_calls = [
             match[0] if match[0] else match[1] for match in raw_function_calls
@@ -257,16 +293,16 @@ class Qwen3CoderToolParser(ToolParser):
     ) -> ExtractedToolCallInformation:
         # Quick check to avoid unnecessary processing
         if self.tool_call_prefix not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         try:
             function_calls = self._get_function_calls(model_output)
             if len(function_calls) == 0:
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
             tool_calls = [
                 self._parse_xml_function_call(function_call_str, request.tools)
@@ -277,12 +313,12 @@ class Qwen3CoderToolParser(ToolParser):
             self.prev_tool_call_arr.clear()  # Clear previous calls
             for tool_call in tool_calls:
                 if tool_call:
-                    self.prev_tool_call_arr.append({
-                        "name":
-                        tool_call.function.name,
-                        "arguments":
-                        tool_call.function.arguments,
-                    })
+                    self.prev_tool_call_arr.append(
+                        {
+                            "name": tool_call.function.name,
+                            "arguments": tool_call.function.arguments,
+                        }
+                    )
 
             # Extract content before tool calls
             content_index = model_output.find(self.tool_call_start_token)
@@ -298,9 +334,9 @@ class Qwen3CoderToolParser(ToolParser):
 
         except Exception:
             logger.exception("Error in extracting tool call from response.")
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -311,7 +347,7 @@ class Qwen3CoderToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # Store request for type conversion
         if not previous_text:
             self._reset_streaming_state()
@@ -322,19 +358,19 @@ class Qwen3CoderToolParser(ToolParser):
             # Check if this is an EOS token after all tool calls are complete
             # Check for tool calls in text even if is_tool_call_started
             # is False (might have been reset after processing all tools)
-            if (delta_token_ids
-                    and self.tool_call_end_token_id not in delta_token_ids):
+            if delta_token_ids and self.tool_call_end_token_id not in delta_token_ids:
                 # Count complete tool calls
                 complete_calls = len(
-                    self.tool_call_complete_regex.findall(current_text))
+                    self.tool_call_complete_regex.findall(current_text)
+                )
 
                 # If we have completed tool calls and populated
                 # prev_tool_call_arr
                 if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                     # Check if all tool calls are closed
                     open_calls = current_text.count(
-                        self.tool_call_start_token) - current_text.count(
-                            self.tool_call_end_token)
+                        self.tool_call_start_token
+                    ) - current_text.count(self.tool_call_end_token)
                     if open_calls == 0:
                         # Return empty delta for finish_reason processing
                         return DeltaMessage(content="")
@@ -370,20 +406,25 @@ class Qwen3CoderToolParser(ToolParser):
         # Handle normal content before tool calls
         if not self.is_tool_call_started:
             # Check if tool call is starting
-            if (self.tool_call_start_token_id in delta_token_ids
-                    or self.tool_call_start_token in delta_text):
+            if (
+                self.tool_call_start_token_id in delta_token_ids
+                or self.tool_call_start_token in delta_text
+            ):
                 self.is_tool_call_started = True
                 # Return any content before the tool call
                 if self.tool_call_start_token in delta_text:
-                    content_before = delta_text[:delta_text.index(
-                        self.tool_call_start_token)]
+                    content_before = delta_text[
+                        : delta_text.index(self.tool_call_start_token)
+                    ]
                     if content_before:
                         return DeltaMessage(content=content_before)
                 return None
             else:
                 # Check if we're between tool calls - skip whitespace
-                if (current_text.rstrip().endswith(self.tool_call_end_token)
-                        and delta_text.strip() == ""):
+                if (
+                    current_text.rstrip().endswith(self.tool_call_end_token)
+                    and delta_text.strip() == ""
+                ):
                     # We just ended a tool call, skip whitespace
                     return None
                 # Normal content, no tool call
@@ -413,19 +454,20 @@ class Qwen3CoderToolParser(ToolParser):
 
         tool_start_idx = tool_start_positions[self.current_tool_index]
         # Find where this tool call ends (or current position if not ended yet)
-        tool_end_idx = current_text.find(self.tool_call_end_token,
-                                         tool_start_idx)
+        tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx)
         if tool_end_idx == -1:
             tool_text = current_text[tool_start_idx:]
         else:
-            tool_text = current_text[tool_start_idx:tool_end_idx +
-                                     len(self.tool_call_end_token)]
+            tool_text = current_text[
+                tool_start_idx : tool_end_idx + len(self.tool_call_end_token)
+            ]
 
         # Looking for function header
         if not self.header_sent:
             if self.tool_call_prefix in tool_text:
                 func_start = tool_text.find(self.tool_call_prefix) + len(
-                    self.tool_call_prefix)
+                    self.tool_call_prefix
+                )
                 func_end = tool_text.find(">", func_start)
 
                 if func_end != -1:
@@ -440,38 +482,44 @@ class Qwen3CoderToolParser(ToolParser):
                     # finish_reason="tool_calls" even if parsing isn't complete
                     already_added = any(
                         tool.get("name") == self.current_function_name
-                        for tool in self.prev_tool_call_arr)
+                        for tool in self.prev_tool_call_arr
+                    )
                     if not already_added:
-                        self.prev_tool_call_arr.append({
-                            "name": self.current_function_name,
-                            "arguments":
-                            "{}",  # Placeholder, will be updated later
-                        })
+                        self.prev_tool_call_arr.append(
+                            {
+                                "name": self.current_function_name,
+                                "arguments": "{}",  # Placeholder, will be updated later
+                            }
+                        )
 
                     # Send header with function info
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_index,
-                            id=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                name=self.current_function_name, arguments=""),
-                            type="function",
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                id=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    name=self.current_function_name, arguments=""
+                                ),
+                                type="function",
+                            )
+                        ]
+                    )
             return None
 
         # We've sent header, now handle function body
         if self.in_function:
             # Send opening brace if not sent yet
-            if (not self.json_started
-                    and self.parameter_prefix not in delta_text):
+            if not self.json_started and self.parameter_prefix not in delta_text:
                 self.json_started = True
-                return DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_index,
-                        function=DeltaFunctionCall(arguments="{"),
-                    )
-                ])
+                return DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            function=DeltaFunctionCall(arguments="{"),
+                        )
+                    ]
+                )
 
             # Make sure json_started is set if we're processing parameters
             if not self.json_started:
@@ -486,35 +534,38 @@ class Qwen3CoderToolParser(ToolParser):
                 # prev_tool_call_arr with final arguments
                 # Find the function content
                 func_start = tool_text.find(self.tool_call_prefix) + len(
-                    self.tool_call_prefix)
-                func_content_end = tool_text.find(self.function_end_token,
-                                                  func_start)
+                    self.tool_call_prefix
+                )
+                func_content_end = tool_text.find(self.function_end_token, func_start)
                 if func_content_end != -1:
                     func_content = tool_text[func_start:func_content_end]
                     # Parse to get the complete arguments
                     try:
                         parsed_tool = self._parse_xml_function_call(
-                            func_content, self.streaming_request.tools
-                            if self.streaming_request else None)
+                            func_content,
+                            self.streaming_request.tools
+                            if self.streaming_request
+                            else None,
+                        )
                         if parsed_tool:
                             # Update existing entry in
                             # prev_tool_call_arr with complete args
                             for i, tool in enumerate(self.prev_tool_call_arr):
-                                if tool.get(
-                                        "name") == parsed_tool.function.name:
+                                if tool.get("name") == parsed_tool.function.name:
                                     args = parsed_tool.function.arguments
-                                    self.prev_tool_call_arr[i][
-                                        "arguments"] = args
+                                    self.prev_tool_call_arr[i]["arguments"] = args
                                     break
                     except Exception:
                         pass  # Ignore parsing errors during streaming
 
-                result = DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_index,
-                        function=DeltaFunctionCall(arguments="}"),
-                    )
-                ])
+                result = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            function=DeltaFunctionCall(arguments="}"),
+                        )
+                    ]
+                )
 
                 # Reset state for next tool
                 self.in_function = False
@@ -535,8 +586,11 @@ class Qwen3CoderToolParser(ToolParser):
                 idx += len(self.parameter_prefix)
 
             # Check if we should start a new parameter
-            if (not self.in_param and self.param_count < len(param_starts)
-                    and len(param_starts) > self.param_count):
+            if (
+                not self.in_param
+                and self.param_count < len(param_starts)
+                and len(param_starts) > self.param_count
+            ):
                 # Process the next parameter
                 param_idx = param_starts[self.param_count]
                 param_start = param_idx + len(self.parameter_prefix)
@@ -561,9 +615,9 @@ class Qwen3CoderToolParser(ToolParser):
                         next_param_idx = value_text.find(self.parameter_prefix)
                         func_end_idx = value_text.find(self.function_end_token)
 
-                        if next_param_idx != -1 and (func_end_idx == -1
-                                                     or next_param_idx
-                                                     < func_end_idx):
+                        if next_param_idx != -1 and (
+                            func_end_idx == -1 or next_param_idx < func_end_idx
+                        ):
                             param_end_idx = next_param_idx
                         elif func_end_idx != -1:
                             param_end_idx = func_end_idx
@@ -585,41 +639,49 @@ class Qwen3CoderToolParser(ToolParser):
                             param_value = param_value[:-1]
 
                         # Store raw value for later processing
-                        self.accumulated_params[
-                            self.current_param_name] = param_value
+                        self.accumulated_params[self.current_param_name] = param_value
 
                         # Get parameter configuration for type conversion
                         param_config = self._get_arguments_config(
                             self.current_function_name or "",
                             self.streaming_request.tools
-                            if self.streaming_request else None)
+                            if self.streaming_request
+                            else None,
+                        )
 
                         # Convert param value to appropriate type
                         converted_value = self._convert_param_value(
-                            param_value, self.current_param_name, param_config,
-                            self.current_function_name or "")
+                            param_value,
+                            self.current_param_name,
+                            param_config,
+                            self.current_function_name or "",
+                        )
 
                         # Build JSON fragment based on the converted type
                         # Use json.dumps to properly serialize the value
-                        serialized_value = json.dumps(converted_value,
-                                                      ensure_ascii=False)
+                        serialized_value = json.dumps(
+                            converted_value, ensure_ascii=False
+                        )
 
                         if self.param_count == 0:
-                            json_fragment = (f'"{self.current_param_name}": '
-                                             f'{serialized_value}')
+                            json_fragment = (
+                                f'"{self.current_param_name}": {serialized_value}'
+                            )
                         else:
-                            json_fragment = (f', "{self.current_param_name}": '
-                                             f'{serialized_value}')
+                            json_fragment = (
+                                f', "{self.current_param_name}": {serialized_value}'
+                            )
 
                         self.param_count += 1
 
-                        return DeltaMessage(tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_index,
-                                function=DeltaFunctionCall(
-                                    arguments=json_fragment),
-                            )
-                        ])
+                        return DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_index,
+                                    function=DeltaFunctionCall(arguments=json_fragment),
+                                )
+                            ]
+                        )
 
             # Continue parameter value - Not used in the current implementation
             # since we process complete parameters above
@@ -632,31 +694,33 @@ class Qwen3CoderToolParser(ToolParser):
                     # Skip past > if at start
                     if not self.current_param_value and ">" in value_chunk:
                         gt_idx = value_chunk.find(">")
-                        value_chunk = value_chunk[gt_idx + 1:]
+                        value_chunk = value_chunk[gt_idx + 1 :]
 
-                    if not self.current_param_value and value_chunk.startswith(
-                            "\n"):
+                    if not self.current_param_value and value_chunk.startswith("\n"):
                         value_chunk = value_chunk[1:]
 
                     # Store complete value
                     full_value = self.current_param_value + value_chunk
-                    self.accumulated_params[
-                        self.current_param_name] = full_value
+                    self.accumulated_params[self.current_param_name] = full_value
 
                     # Get parameter configuration for type conversion
                     param_config = self._get_arguments_config(
                         self.current_function_name or "",
                         self.streaming_request.tools
-                        if self.streaming_request else None)
+                        if self.streaming_request
+                        else None,
+                    )
 
                     # Convert the parameter value to the appropriate type
                     converted_value = self._convert_param_value(
-                        full_value, self.current_param_name or "",
-                        param_config, self.current_function_name or "")
+                        full_value,
+                        self.current_param_name or "",
+                        param_config,
+                        self.current_function_name or "",
+                    )
 
                     # Serialize the converted value
-                    serialized_value = json.dumps(converted_value,
-                                                  ensure_ascii=False)
+                    serialized_value = json.dumps(converted_value, ensure_ascii=False)
 
                     # Since we've been streaming the quoted version,
                     # we need to close it properly
@@ -665,13 +729,16 @@ class Qwen3CoderToolParser(ToolParser):
                     self.current_param_value = ""
 
                     # Just close the current parameter string
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_index,
-                            function=DeltaFunctionCall(
-                                arguments='"'),  # Close the string quote
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                function=DeltaFunctionCall(
+                                    arguments='"'
+                                ),  # Close the string quote
+                            )
+                        ]
+                    )
                 else:
                     # Continue accumulating value
                     value_chunk = delta_text
@@ -679,29 +746,36 @@ class Qwen3CoderToolParser(ToolParser):
                     # Handle first chunk after param name
                     if not self.current_param_value and ">" in value_chunk:
                         gt_idx = value_chunk.find(">")
-                        value_chunk = value_chunk[gt_idx + 1:]
+                        value_chunk = value_chunk[gt_idx + 1 :]
 
-                    if not self.current_param_value and value_chunk.startswith(
-                            "\n"):
+                    if not self.current_param_value and value_chunk.startswith("\n"):
                         value_chunk = value_chunk[1:]
 
                     if value_chunk:
                         # Stream the escaped delta
-                        prev_escaped = json.dumps(
-                            self.current_param_value, ensure_ascii=False
-                        )[1:-1] if self.current_param_value else ""
+                        prev_escaped = (
+                            json.dumps(self.current_param_value, ensure_ascii=False)[
+                                1:-1
+                            ]
+                            if self.current_param_value
+                            else ""
+                        )
                         self.current_param_value += value_chunk
-                        full_escaped = json.dumps(self.current_param_value,
-                                                  ensure_ascii=False)[1:-1]
-                        delta_escaped = full_escaped[len(prev_escaped):]
+                        full_escaped = json.dumps(
+                            self.current_param_value, ensure_ascii=False
+                        )[1:-1]
+                        delta_escaped = full_escaped[len(prev_escaped) :]
 
                         if delta_escaped:
-                            return DeltaMessage(tool_calls=[
-                                DeltaToolCall(
-                                    index=self.current_tool_index,
-                                    function=DeltaFunctionCall(
-                                        arguments=delta_escaped),
-                                )
-                            ])
+                            return DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=self.current_tool_index,
+                                        function=DeltaFunctionCall(
+                                            arguments=delta_escaped
+                                        ),
+                                    )
+                                ]
+                            )
 
-        return None
\ No newline at end of file
+        return None
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
index 4ab67dfea104c6d3b3841d8a664661853d9d99c3..432c419db189a8a7ea5e7b6508e732bdb0a8af5b 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
@@ -2,21 +2,26 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
 import json
-import uuid
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 from xml.parsers.expat import ParserCreate
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
@@ -33,13 +38,13 @@ class StreamingXMLToolCallParser:
         self.reset_streaming_state()
 
         # Tool configuration information
-        self.tools: Union[list[ChatCompletionToolsParam], None] = None
-        self.tool_call_start_token: str = '<tool_call>'
-        self.tool_call_end_token: str = '</tool_call>'
-        self.function_start_token: str = '<function='
-        self.function_end_token: str = '</function>'
-        self.parameter_start_token: str = '<parameter='
-        self.parameter_end_token: str = '</parameter>'
+        self.tools: list[ChatCompletionToolsParam] | None = None
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.function_start_token: str = "<function="
+        self.function_end_token: str = "</function>"
+        self.parameter_start_token: str = "<parameter="
+        self.parameter_end_token: str = "</parameter>"
 
     def reset_streaming_state(self):
         """Reset streaming parsing state"""
@@ -53,16 +58,16 @@ class StreamingXMLToolCallParser:
         self.current_function_open = False
         self.parameters = {}
         self.current_param_name = None
-        self.current_param_value = ''
-        self.current_param_value_converted = ''
+        self.current_param_value = ""
+        self.current_param_value_converted = ""
         self.current_param_is_first = False
         self.should_emit_end_newline = False
         self.start_quote_emitted = False
 
-        self.streaming_buffer = ''
+        self.streaming_buffer = ""
         self.last_processed_pos = 0
 
-        self.text_content_buffer = ''
+        self.text_content_buffer = ""
 
         # state for preprocessing and deferred parsing
         self._pre_inside_parameter = False
@@ -78,13 +83,13 @@ class StreamingXMLToolCallParser:
     def parse_single_streaming_chunks(self, xml_chunk: str) -> DeltaMessage:
         """
         Parse single streaming XML chunk and return Delta response
-        This is the actual streaming interface that receives chunks 
+        This is the actual streaming interface that receives chunks
         one by one and maintains internal state
 
         Args:
             xml_chunk: Single XML chunk string
         Returns:
-            DeltaMessage: Contains delta information generated by this chunk, 
+            DeltaMessage: Contains delta information generated by this chunk,
             returns empty response if no complete elements
         """
         # Record delta count before processing
@@ -101,42 +106,67 @@ class StreamingXMLToolCallParser:
                 new_deltas = self.deltas[initial_delta_count:]
                 # If this chunk contains </function>
                 # but didn't generate '}', then complete it
-                if (self.current_call_id is not None
-                        and self.function_end_token in xml_chunk):
-
+                if (
+                    self.current_call_id is not None
+                    and self.function_end_token in xml_chunk
+                ):
                     # - Added '}' (non-empty parameter ending)
                     # - Added '{}' (empty parameter function)
-                    has_function_close = any((td.tool_calls and any(
-                        (tc.function and tc.id == self.current_call_id
-                         and isinstance(tc.function.arguments, str) and
-                         (tc.function.arguments in ('}', '{}')))
-                        for tc in td.tool_calls)) for td in new_deltas)
+                    has_function_close = any(
+                        (
+                            td.tool_calls
+                            and any(
+                                (
+                                    tc.function
+                                    and tc.id == self.current_call_id
+                                    and isinstance(tc.function.arguments, str)
+                                    and (tc.function.arguments in ("}", "{}"))
+                                )
+                                for tc in td.tool_calls
+                            )
+                        )
+                        for td in new_deltas
+                    )
                     if not has_function_close:
                         # Close potentially unclosed element
                         if self.current_param_name:
-                            self._end_element('parameter')
+                            self._end_element("parameter")
                         if self.current_function_name:
-                            self._end_element('function')
+                            self._end_element("function")
                 # If this chunk contains </tool_call>
                 # but didn't generate final empty delta, then complete it
-                if (self.current_call_id is not None
-                        and self.tool_call_end_token in xml_chunk):
-                    has_toolcall_close = any((td.tool_calls and any(
-                        (tc.type == 'function' and tc.function and tc.function.
-                         arguments == '' and tc.id == self.current_call_id)
-                        for tc in td.tool_calls)) for td in new_deltas)
+                if (
+                    self.current_call_id is not None
+                    and self.tool_call_end_token in xml_chunk
+                ):
+                    has_toolcall_close = any(
+                        (
+                            td.tool_calls
+                            and any(
+                                (
+                                    tc.type == "function"
+                                    and tc.function
+                                    and tc.function.arguments == ""
+                                    and tc.id == self.current_call_id
+                                )
+                                for tc in td.tool_calls
+                            )
+                        )
+                        for td in new_deltas
+                    )
                     if not has_toolcall_close:
                         # Close potentially unclosed element
                         if self.current_param_name:
-                            self._end_element('parameter')
+                            self._end_element("parameter")
                         if self.current_function_name:
-                            self._end_element('function')
-                        self._end_element('tool_call')
+                            self._end_element("function")
+                        self._end_element("tool_call")
             except Exception as e:
                 logger.warning("Error with fallback parsing: %s", e)
             # Merge newly generated deltas into single response
             result_delta = self._merge_new_deltas_to_single_response(
-                initial_delta_count)
+                initial_delta_count
+            )
             return result_delta
         else:
             # No complete elements, check if there's unoutput text content
@@ -145,7 +175,7 @@ class StreamingXMLToolCallParser:
                 text_delta = DeltaMessage(content=self.text_content_buffer)
                 self._emit_delta(text_delta)
                 # Clear buffer to avoid duplicate output
-                self.text_content_buffer = ''
+                self.text_content_buffer = ""
                 return text_delta
 
             # If this chunk contains end tags but wasn't triggered by parser,
@@ -153,20 +183,21 @@ class StreamingXMLToolCallParser:
             # Only execute when still on the same call as when entered,
             # to prevent accidentally closing new calls
             # in multi <tool_call> scenarios
-            if (self.current_call_id is not None
-                    and (self.function_end_token in xml_chunk
-                         or self.tool_call_end_token in xml_chunk)):
+            if self.current_call_id is not None and (
+                self.function_end_token in xml_chunk
+                or self.tool_call_end_token in xml_chunk
+            ):
                 # Close potentially unclosed element
                 if self.current_param_name:
-                    self._end_element('parameter')
-                if self.function_end_token in xml_chunk and \
-                    self.current_function_name:
-                    self._end_element('function')
+                    self._end_element("parameter")
+                if self.function_end_token in xml_chunk and self.current_function_name:
+                    self._end_element("function")
                 if self.tool_call_end_token in xml_chunk:
-                    self._end_element('tool_call')
+                    self._end_element("tool_call")
                 # Return the merged delta result generated by this fallback
                 result_delta = self._merge_new_deltas_to_single_response(
-                    initial_delta_count)
+                    initial_delta_count
+                )
                 return result_delta
 
             # No complete elements, return empty response
@@ -181,11 +212,11 @@ class StreamingXMLToolCallParser:
             Escaped text
         """
         xml_escapes = {
-            '&': '&amp;',
-            '<': '&lt;',
-            '>': '&gt;',
-            '"': '&quot;',
-            "'": '&apos;'
+            "&": "&amp;",
+            "<": "&lt;",
+            ">": "&gt;",
+            '"': "&quot;",
+            "'": "&apos;",
         }
 
         for char, escape in xml_escapes.items():
@@ -204,8 +235,7 @@ class StreamingXMLToolCallParser:
 
         while self.last_processed_pos < len(self.streaming_buffer):
             # Find next complete xml element
-            element, end_pos = self._find_next_complete_element(
-                self.last_processed_pos)
+            element, end_pos = self._find_next_complete_element(self.last_processed_pos)
             if element is None:
                 # No complete element found, wait for more data
                 break
@@ -219,38 +249,46 @@ class StreamingXMLToolCallParser:
             try:
                 preprocessed_element = self._preprocess_xml_chunk(element)
                 # Check if this is the first tool_call start
-                if ((preprocessed_element.strip().startswith('<tool_call>') or
-                     preprocessed_element.strip().startswith('<function name=')
-                     ) and self.tool_call_index
-                        == 0) and self.text_content_buffer:
+                if (
+                    (
+                        preprocessed_element.strip().startswith("<tool_call>")
+                        or preprocessed_element.strip().startswith("<function name=")
+                    )
+                    and self.tool_call_index == 0
+                ) and self.text_content_buffer:
                     # First tool_call starts,
                     # output previously collected text content first
                     text_delta = DeltaMessage(content=self.text_content_buffer)
                     self._emit_delta(text_delta)
                     # Clear buffer for potential subsequent text content
-                    self.text_content_buffer = ''
+                    self.text_content_buffer = ""
 
                 # If a new tool_call starts and
                 # there are already completed tool_calls
-                if (preprocessed_element.strip().startswith('<tool_call>')
-                        and self.tool_call_index > 0 and self.current_call_id):
+                if (
+                    preprocessed_element.strip().startswith("<tool_call>")
+                    and self.tool_call_index > 0
+                    and self.current_call_id
+                ):
                     # Reset parser state but preserve generated deltas
                     if self.current_param_name:
-                        self._end_element('parameter')
+                        self._end_element("parameter")
                     if self.current_function_open or self.current_function_name:
-                        self._end_element('function')
+                        self._end_element("function")
                     # Output final tool_call tail delta
                     final_delta = DeltaMessage(
                         role=None,
                         content=None,
-                        reasoning_content=None,
+                        reasoning=None,
                         tool_calls=[
-                            DeltaToolCall(index=self.tool_call_index - 1,
-                                          id=self.current_call_id,
-                                          type='function',
-                                          function=DeltaFunctionCall(
-                                              name=None, arguments=''))
-                        ])
+                            DeltaToolCall(
+                                index=self.tool_call_index - 1,
+                                id=self.current_call_id,
+                                type="function",
+                                function=DeltaFunctionCall(name=None, arguments=""),
+                            )
+                        ],
+                    )
                     self._emit_delta(final_delta)
                     # Reset XML parser and current call state
                     self._reset_xml_parser_after_tool_call()
@@ -278,10 +316,11 @@ class StreamingXMLToolCallParser:
         """
 
         # If it's a tool_call XML tag, don't skip
-        if element.startswith(
-                self.tool_call_start_token) or element.startswith(
-                    self.function_start_token) or element.startswith(
-                        self.parameter_start_token):
+        if (
+            element.startswith(self.tool_call_start_token)
+            or element.startswith(self.function_start_token)
+            or element.startswith(self.parameter_start_token)
+        ):
             return False
 
         # If currently not parsing tool calls and not blank,
@@ -301,8 +340,7 @@ class StreamingXMLToolCallParser:
         # Skip blank content
         return not element
 
-    def _find_next_complete_element(
-            self, start_pos: int) -> tuple[Optional[str], int]:
+    def _find_next_complete_element(self, start_pos: int) -> tuple[str | None, int]:
         """
         Find next complete XML element from specified position
 
@@ -310,7 +348,7 @@ class StreamingXMLToolCallParser:
             start_pos: Position to start searching
 
         Returns:
-            (Complete element string, element end position), 
+            (Complete element string, element end position),
             returns (None, start_pos) if no complete element found
         """
         buffer = self.streaming_buffer[start_pos:]
@@ -318,32 +356,39 @@ class StreamingXMLToolCallParser:
         if not buffer:
             return None, start_pos
 
-        if buffer.startswith('<'):
+        if buffer.startswith("<"):
             # Need to ensure no new < appears,
             # find the nearest one between < and >
-            tag_end = buffer.find('<', 1)
-            tag_end2 = buffer.find('>', 1)
+            tag_end = buffer.find("<", 1)
+            tag_end2 = buffer.find(">", 1)
             if tag_end != -1 and tag_end2 != -1:
                 # Next nearest is <
                 if tag_end < tag_end2:
                     return buffer[:tag_end], start_pos + tag_end
                 # Next nearest is >, means found XML element
                 else:
-                    return buffer[:tag_end2 + 1], start_pos + tag_end2 + 1
+                    return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1
             elif tag_end != -1:
                 return buffer[:tag_end], start_pos + tag_end
             elif tag_end2 != -1:
-                return buffer[:tag_end2 + 1], start_pos + tag_end2 + 1
+                return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1
             else:
                 # If currently not parsing tool calls (entering a tool_call),
-                # check if starts with <tool_call>
+                # check if starts with <tool_call> or <function=
                 if self.current_call_id is None:
                     # Check if might be start of <tool_call>
-                    if buffer == '<tool_call>'[:len(buffer)]:
+                    if buffer == "<tool_call>"[: len(buffer)]:
                         # Might be start of <tool_call>, wait for more data
                         return None, start_pos
+                    elif (
+                        buffer.startswith("<function=")
+                        or buffer == "<function="[: len(buffer)]
+                    ):
+                        # Might be start of <function=, wait for more data
+                        # to get the complete function tag
+                        return None, start_pos
                     else:
-                        # Not start of <tool_call>, treat as text
+                        # Not start of <tool_call> or <function=, treat as text
                         return buffer, start_pos + len(buffer)
                 else:
                     # When parsing tool calls,
@@ -351,7 +396,7 @@ class StreamingXMLToolCallParser:
                     return None, start_pos
         else:
             # Find text content (until next < or buffer end)
-            next_tag_pos = buffer.find('<')
+            next_tag_pos = buffer.find("<")
             if next_tag_pos != -1:
                 # Found text content
                 text_content = buffer[:next_tag_pos]
@@ -362,8 +407,7 @@ class StreamingXMLToolCallParser:
                 remaining = buffer
                 return remaining, start_pos + len(remaining)
 
-    def _merge_new_deltas_to_single_response(
-            self, initial_count: int) -> DeltaMessage:
+    def _merge_new_deltas_to_single_response(self, initial_count: int) -> DeltaMessage:
         """
         Merge newly generated deltas from this processing
         into a single DeltaMessage
@@ -386,7 +430,7 @@ class StreamingXMLToolCallParser:
 
         # Merge multiple new deltas
         merged_tool_calls: list[DeltaToolCall] = []
-        merged_content: str = ''
+        merged_content: str = ""
 
         for delta in new_deltas:
             if delta.content:
@@ -404,12 +448,13 @@ class StreamingXMLToolCallParser:
                     if existing_call and existing_call.function:
                         # Merge to existing tool_call
                         if tool_call.function and tool_call.function.name:
-                            existing_call.function.name = \
-                                tool_call.function.name
-                        if tool_call.function \
-                            and tool_call.function.arguments is not None:
+                            existing_call.function.name = tool_call.function.name
+                        if (
+                            tool_call.function
+                            and tool_call.function.arguments is not None
+                        ):
                             if existing_call.function.arguments is None:
-                                existing_call.function.arguments = ''
+                                existing_call.function.arguments = ""
 
                             # For streaming JSON parameters,
                             # simply concatenate in order
@@ -421,12 +466,14 @@ class StreamingXMLToolCallParser:
                         # Add new tool_call
                         merged_tool_calls.append(tool_call)
 
-        return DeltaMessage(content=merged_content if merged_content else None,
-                            tool_calls=merged_tool_calls)
+        return DeltaMessage(
+            content=merged_content if merged_content else None,
+            tool_calls=merged_tool_calls,
+        )
 
     def _preprocess_xml_chunk(self, chunk: str) -> str:
         """
-        Preprocess XML chunk, handle non-standard formats, 
+        Preprocess XML chunk, handle non-standard formats,
         and escape special characters
 
         Args:
@@ -439,27 +486,28 @@ class StreamingXMLToolCallParser:
         # Check if this is a tool_call related element
         is_tool_call = False
         if chunk.startswith(self.tool_call_start_token) or chunk.startswith(
-                self.tool_call_end_token):
+            self.tool_call_end_token
+        ):
             is_tool_call = True
         if chunk.startswith(self.function_start_token) or chunk.startswith(
-                self.function_end_token):
+            self.function_end_token
+        ):
             is_tool_call = True
         if chunk.startswith(self.parameter_start_token) or chunk.startswith(
-                self.parameter_end_token):
+            self.parameter_end_token
+        ):
             is_tool_call = True
         # Handle <function=name> format -> <function name="name">
-        processed = re.sub(r'<function=([^>]+)>', r'<function name="\1">',
-                           chunk)
+        processed = re.sub(r"<function=([^>]+)>", r'<function name="\1">', chunk)
         # Handle <parameter=name> format -> <parameter name="name">
-        processed = re.sub(r'<parameter=([^>]+)>', r'<parameter name="\1">',
-                           processed)
+        processed = re.sub(r"<parameter=([^>]+)>", r'<parameter name="\1">', processed)
 
         original_chunk = chunk
         # If in parameter value accumulation mode
         if self._pre_inside_parameter:
             # Parameter end: output accumulated raw text
             # safely then return </parameter>
-            if processed.startswith('</parameter>'):
+            if processed.startswith("</parameter>"):
                 body_text = self._pre_param_buffer
                 # Trigger deferred parsing mode
                 # literal_eval+json output in end_element
@@ -478,29 +526,38 @@ class StreamingXMLToolCallParser:
                 # and pass through directly
                 if self._pre_param_buffer == "":
                     # Get current parameter type
-                    param_type = self._get_param_type(
-                        self._pre_current_param_name
-                    ) if self._pre_current_param_name else 'string'
+                    param_type = (
+                        self._get_param_type(self._pre_current_param_name)
+                        if self._pre_current_param_name
+                        else "string"
+                    )
                     # Only these types need deferred parsing to
                     # handle Python literals containing single quotes
                     is_object_type = param_type in ["object"]
-                    is_complex_type = (param_type
-                                       in ["array", "arr", "sequence"]
-                                       or param_type.startswith("dict")
-                                       or param_type.startswith("list"))
+                    is_complex_type = (
+                        param_type in ["array", "arr", "sequence"]
+                        or param_type.startswith("dict")
+                        or param_type.startswith("list")
+                    )
 
                     # Only delay when contains container symbols
                     # and has single quotes and is complex type
-                    has_container_hint = ('[' in original_chunk) or (
-                        '{' in original_chunk) or ('(' in original_chunk)
+                    has_container_hint = (
+                        ("[" in original_chunk)
+                        or ("{" in original_chunk)
+                        or ("(" in original_chunk)
+                    )
 
                     # Determine if deferred parsing is needed
                     need_defer = False
                     if is_complex_type:
                         # Complex type, always need deferred parsing
                         need_defer = True
-                    elif is_object_type and has_container_hint and (
-                            "'" in original_chunk):
+                    elif (
+                        is_object_type
+                        and has_container_hint
+                        and ("'" in original_chunk)
+                    ):
                         # Object type with container symbols
                         # and single quotes, need deferred parsing
                         need_defer = True
@@ -514,7 +571,7 @@ class StreamingXMLToolCallParser:
                 return ""
 
         # Parameter start: enable accumulation
-        if processed.startswith('<parameter name='):
+        if processed.startswith("<parameter name="):
             m = re.match(r'<parameter name="([^"]+)">', processed)
             if m:
                 self._pre_current_param_name = m.group(1)
@@ -533,76 +590,78 @@ class StreamingXMLToolCallParser:
         """Emit Delta response (streaming output)"""
         self.deltas.append(delta)
 
-    def _auto_close_open_parameter_if_needed(self,
-                                             incoming_tag: Optional[str] = None
-                                             ):
-        """Before starting to process new elements, 
-        if there are unclosed tags from before, 
+    def _auto_close_open_parameter_if_needed(self, incoming_tag: str | None = None):
+        """Before starting to process new elements,
+        if there are unclosed tags from before,
         automatically complete their endings to the parser.
-        - If there are unclosed parameters, 
+        - If there are unclosed parameters,
         it's equivalent to feeding `</parameter>`
-        - When about to start a new function or tool_call, 
+        - When about to start a new function or tool_call,
         if there are unclosed functions, complete `</function>`.
-        - When about to start a new tool_call, 
+        - When about to start a new tool_call,
         if there are unclosed tool_calls, complete `</tool_call>`.
         """
         # First close unclosed parameters
         if self.current_param_name:
-            self._end_element('parameter')
+            self._end_element("parameter")
 
         # If about to start new function or tool_call,
         # and there are unclosed functions, close function first
-        if incoming_tag in ('function',
-                            'tool_call') and self.current_function_name:
-            self._end_element('function')
+        if incoming_tag in ("function", "tool_call") and self.current_function_name:
+            self._end_element("function")
 
         # If about to start new tool_call,
         # and there are unclosed tool_calls, close tool_call first
-        if incoming_tag == 'tool_call' and self.current_call_id:
-            self._end_element('tool_call')
+        if incoming_tag == "tool_call" and self.current_call_id:
+            self._end_element("tool_call")
 
     def _start_element(self, name: str, attrs: dict[str, str]):
         """Handle XML start element events"""
 
-        if name == 'root':
+        if name == "root":
             return
 
-        if name == 'tool_call':
+        if name == "tool_call":
             # Before opening new tool_call,
             # automatically complete previous unclosed tags
-            self._auto_close_open_parameter_if_needed('tool_call')
+            self._auto_close_open_parameter_if_needed("tool_call")
 
             self.parameters = {}
-            self.current_call_id = self._get_next_call_id()
+            self.current_call_id = make_tool_call_id()
             self.current_param_is_first = True
             self.tool_call_index += 1
-        elif name.startswith('function') or (name == 'function'):
+        elif name.startswith("function") or (name == "function"):
             # If missing tool_call, manually complete
             if not self.current_call_id:
-                self._start_element('tool_call', {})
+                self._start_element("tool_call", {})
             # Before opening new function,
             # automatically complete previous unclosed tags (parameter/function)
-            self._auto_close_open_parameter_if_needed('function')
+            self._auto_close_open_parameter_if_needed("function")
             function_name = self._extract_function_name(name, attrs)
             self.current_function_name = function_name
             self.current_function_open = True
             if function_name:
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.tool_call_index - 1,
-                                  id=self.current_call_id,
-                                  type='function',
-                                  function=DeltaFunctionCall(
-                                      name=function_name, arguments=''))
-                ])
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.tool_call_index - 1,
+                            id=self.current_call_id,
+                            type="function",
+                            function=DeltaFunctionCall(
+                                name=function_name, arguments=""
+                            ),
+                        )
+                    ]
+                )
                 self._emit_delta(delta)
-        elif name.startswith('parameter') or (name == 'parameter'):
+        elif name.startswith("parameter") or (name == "parameter"):
             # If previous parameter hasn't ended normally,
             # complete its end first, then start new parameter
-            self._auto_close_open_parameter_if_needed('parameter')
+            self._auto_close_open_parameter_if_needed("parameter")
             param_name = self._extract_parameter_name(name, attrs)
             self.current_param_name = param_name
-            self.current_param_value = ''
-            self.current_param_value_converted = ''
+            self.current_param_value = ""
+            self.current_param_value_converted = ""
             self.start_quote_emitted = False  # Reset start quote flag
 
             # Only output parameter name and colon,
@@ -613,26 +672,36 @@ class StreamingXMLToolCallParser:
                     # First parameter
                     # start JSON, only output parameter name and colon
                     json_start = f'{{"{param_name}": '
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.tool_call_index - 1,
-                                      id=self.current_call_id,
-                                      type='function',
-                                      function=DeltaFunctionCall(
-                                          name=None, arguments=json_start))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.tool_call_index - 1,
+                                id=self.current_call_id,
+                                type="function",
+                                function=DeltaFunctionCall(
+                                    name=None, arguments=json_start
+                                ),
+                            )
+                        ]
+                    )
                     self._emit_delta(delta)
                     self.current_param_is_first = True
                 else:
                     # Subsequent parameters
                     # add comma and parameter name, no quotes
                     json_continue = f', "{param_name}": '
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.tool_call_index - 1,
-                                      id=self.current_call_id,
-                                      type='function',
-                                      function=DeltaFunctionCall(
-                                          name=None, arguments=json_continue))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.tool_call_index - 1,
+                                id=self.current_call_id,
+                                type="function",
+                                function=DeltaFunctionCall(
+                                    name=None, arguments=json_continue
+                                ),
+                            )
+                        ]
+                    )
                     self._emit_delta(delta)
                     self.current_param_is_first = False
 
@@ -644,9 +713,9 @@ class StreamingXMLToolCallParser:
             if self.defer_current_parameter:
                 original_data = data
                 if self.should_emit_end_newline:
-                    original_data = '\n' + original_data
+                    original_data = "\n" + original_data
                     self.should_emit_end_newline = False
-                if original_data.endswith('\n'):
+                if original_data.endswith("\n"):
                     self.should_emit_end_newline = True
                     original_data = original_data[:-1]
                 self.current_param_value += original_data
@@ -656,20 +725,24 @@ class StreamingXMLToolCallParser:
 
             # Check if this is the first time receiving data for this parameter
             # If this is the first packet of data and starts with \n, remove \n
-            if not self.current_param_value and data.startswith('\n'):
+            if not self.current_param_value and data.startswith("\n"):
                 data = data[1:]
 
             # Output start quote for string type (if not already output)
-            if (param_type
-                    in ['string', 'str', 'text', 'varchar', 'char', 'enum']
-                    and not self.start_quote_emitted):
-                quote_delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.tool_call_index - 1,
-                                  id=self.current_call_id,
-                                  type='function',
-                                  function=DeltaFunctionCall(name=None,
-                                                             arguments='"'))
-                ])
+            if (
+                param_type in ["string", "str", "text", "varchar", "char", "enum"]
+                and not self.start_quote_emitted
+            ):
+                quote_delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.tool_call_index - 1,
+                            id=self.current_call_id,
+                            type="function",
+                            function=DeltaFunctionCall(name=None, arguments='"'),
+                        )
+                    ]
+                )
                 self._emit_delta(quote_delta)
                 self.start_quote_emitted = True
 
@@ -679,45 +752,50 @@ class StreamingXMLToolCallParser:
             original_data = data
             # Delay output of trailing newline
             if self.should_emit_end_newline:
-                original_data = '\n' + original_data
+                original_data = "\n" + original_data
                 self.should_emit_end_newline = False
-            if original_data.endswith('\n'):
+            if original_data.endswith("\n"):
                 self.should_emit_end_newline = True
                 original_data = original_data[:-1]
             self.current_param_value += original_data
 
             # convert parameter value by param_type
             converted_value = self._convert_param_value(
-                self.current_param_value, param_type)
-            output_data = self._convert_for_json_streaming(
-                converted_value, param_type)
+                self.current_param_value, param_type
+            )
+            output_data = self._convert_for_json_streaming(converted_value, param_type)
 
-            delta_data = output_data[len(self.current_param_value_converted):]
+            delta_data = output_data[len(self.current_param_value_converted) :]
             self.current_param_value_converted = output_data
 
-            delta = DeltaMessage(tool_calls=[
-                DeltaToolCall(index=self.tool_call_index - 1,
-                              id=self.current_call_id,
-                              type='function',
-                              function=DeltaFunctionCall(name=None,
-                                                         arguments=delta_data))
-            ])
+            delta = DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        index=self.tool_call_index - 1,
+                        id=self.current_call_id,
+                        type="function",
+                        function=DeltaFunctionCall(name=None, arguments=delta_data),
+                    )
+                ]
+            )
             self._emit_delta(delta)
 
     def _end_element(self, name: str):
         """Handle XML end element events"""
 
-        if name == 'root':
+        if name == "root":
             return
 
         # If function or tool_call ends and there are still unclosed parameters,
         # complete parameter end first
-        if (name.startswith('function') or name == 'function'
-                or name == 'tool_call') and self.current_param_name:
+        if (
+            name.startswith("function") or name == "function" or name == "tool_call"
+        ) and self.current_param_name:
             self._auto_close_open_parameter_if_needed()
 
-        if (name.startswith('parameter')
-                or name == 'parameter') and self.current_param_name:
+        if (
+            name.startswith("parameter") or name == "parameter"
+        ) and self.current_param_name:
             # End current parameter
             param_name = self.current_param_name
             param_value = self.current_param_value
@@ -726,32 +804,39 @@ class StreamingXMLToolCallParser:
             # perform overall parsing on raw content
             # accumulated in preprocessing stage and output once
             if self.defer_current_parameter:
-                raw_text = self.deferred_param_raw_value \
-                if self.deferred_param_raw_value else param_value
+                raw_text = (
+                    self.deferred_param_raw_value
+                    if self.deferred_param_raw_value
+                    else param_value
+                )
                 parsed_value = None
                 output_arguments = None
                 try:
                     # If previously delayed trailing newline,
                     # add it back before parsing
                     if self.should_emit_end_newline:
-                        raw_for_parse = raw_text + '\n'
+                        raw_for_parse = raw_text + "\n"
                     else:
                         raw_for_parse = raw_text
                     parsed_value = ast.literal_eval(raw_for_parse)
-                    output_arguments = json.dumps(parsed_value,
-                                                  ensure_ascii=False)
+                    output_arguments = json.dumps(parsed_value, ensure_ascii=False)
                 except Exception:
                     # Fallback: output as string as-is
                     output_arguments = json.dumps(raw_text, ensure_ascii=False)
                     parsed_value = raw_text
 
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.tool_call_index - 1,
-                                  id=self.current_call_id,
-                                  type='function',
-                                  function=DeltaFunctionCall(
-                                      name=None, arguments=output_arguments))
-                ])
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.tool_call_index - 1,
+                            id=self.current_call_id,
+                            type="function",
+                            function=DeltaFunctionCall(
+                                name=None, arguments=output_arguments
+                            ),
+                        )
+                    ]
+                )
                 self._emit_delta(delta)
 
                 # Clean up and store
@@ -768,84 +853,96 @@ class StreamingXMLToolCallParser:
             param_type = self._get_param_type(param_name)
 
             # convert complete parameter value by param_type
-            converted_value = self._convert_param_value(
-                param_value, param_type)
+            converted_value = self._convert_param_value(param_value, param_type)
 
             # Decide whether to add end quote based on parameter type
-            if param_type in [
-                    'string', 'str', 'text', 'varchar', 'char', 'enum'
-            ]:
+            if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
                 # For empty string parameters, need special handling
                 if not param_value and not self.start_quote_emitted:
                     # No start quote output,
                     # directly output complete empty string
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.tool_call_index - 1,
-                                      id=self.current_call_id,
-                                      type='function',
-                                      function=DeltaFunctionCall(
-                                          name=None, arguments='""'))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.tool_call_index - 1,
+                                id=self.current_call_id,
+                                type="function",
+                                function=DeltaFunctionCall(name=None, arguments='""'),
+                            )
+                        ]
+                    )
                     self._emit_delta(delta)
                 else:
                     # Non-empty parameter value, output end quote
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.tool_call_index - 1,
-                                      id=self.current_call_id,
-                                      type='function',
-                                      function=DeltaFunctionCall(
-                                          name=None, arguments='"'))
-                    ])
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.tool_call_index - 1,
+                                id=self.current_call_id,
+                                type="function",
+                                function=DeltaFunctionCall(name=None, arguments='"'),
+                            )
+                        ]
+                    )
                     self._emit_delta(delta)
 
             self.should_emit_end_newline = False
             # Store converted value
             self.parameters[param_name] = converted_value
             self.current_param_name = None
-            self.current_param_value = ''
-            self.current_param_value_converted = ''
+            self.current_param_value = ""
+            self.current_param_value_converted = ""
             self.start_quote_emitted = False
 
-        elif name.startswith('function') or name == 'function':
+        elif name.startswith("function") or name == "function":
             # if there are parameters, close JSON object
             if self.parameters:
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.tool_call_index - 1,
-                                  id=self.current_call_id,
-                                  type='function',
-                                  function=DeltaFunctionCall(name=None,
-                                                             arguments='}'))
-                ])
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.tool_call_index - 1,
+                            id=self.current_call_id,
+                            type="function",
+                            function=DeltaFunctionCall(name=None, arguments="}"),
+                        )
+                    ]
+                )
                 self._emit_delta(delta)
             # return empty object
             else:
-                delta = DeltaMessage(tool_calls=[
-                    DeltaToolCall(index=self.tool_call_index - 1,
-                                  id=self.current_call_id,
-                                  type='function',
-                                  function=DeltaFunctionCall(name=None,
-                                                             arguments='{}'))
-                ])
+                delta = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.tool_call_index - 1,
+                            id=self.current_call_id,
+                            type="function",
+                            function=DeltaFunctionCall(name=None, arguments="{}"),
+                        )
+                    ]
+                )
                 self._emit_delta(delta)
             self.current_function_open = False
 
-        elif name == 'tool_call':
+        elif name == "tool_call":
             # Before ending tool_call,
             # ensure function is closed to complete missing right brace
             if self.current_function_open:
                 # If there are still unclosed parameters, close them first
                 if self.current_param_name:
-                    self._end_element('parameter')
+                    self._end_element("parameter")
                 # Close function, ensure output '}' or '{}'
-                self._end_element('function')
+                self._end_element("function")
             # Final Delta
-            delta = DeltaMessage(tool_calls=[
-                DeltaToolCall(index=self.tool_call_index - 1,
-                              id=self.current_call_id,
-                              type='function',
-                              function=DeltaFunctionCall(name=None,
-                                                         arguments=''))
-            ])
+            delta = DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        index=self.tool_call_index - 1,
+                        id=self.current_call_id,
+                        type="function",
+                        function=DeltaFunctionCall(name=None, arguments=""),
+                    )
+                ]
+            )
             self._emit_delta(delta)
 
             # Check if there's text content to output (between tool_calls)
@@ -862,36 +959,30 @@ class StreamingXMLToolCallParser:
         self.parser.EndElementHandler = self._end_element
         self.parser.CharacterDataHandler = self._char_data
 
-    def set_tools(self, tools: Union[list[ChatCompletionToolsParam], None]):
+    def set_tools(self, tools: list[ChatCompletionToolsParam] | None):
         """Set tool configuration information"""
         self.tools = tools
 
-    def _get_next_call_id(self):
-        """Generate unique call ID"""
-        return f'call_{uuid.uuid4().hex[:24]}'
-
-    def _extract_function_name(self, name: str,
-                               attrs: dict[str, str]) -> Optional[str]:
+    def _extract_function_name(self, name: str, attrs: dict[str, str]) -> str | None:
         """Extract function name from various formats"""
-        if attrs and 'name' in attrs:
-            return attrs['name']
+        if attrs and "name" in attrs:
+            return attrs["name"]
 
-        if '=' in name:
-            parts = name.split('=', 1)
-            if len(parts) == 2 and parts[0] == 'function':
+        if "=" in name:
+            parts = name.split("=", 1)
+            if len(parts) == 2 and parts[0] == "function":
                 return parts[1]
 
         return None
 
-    def _extract_parameter_name(self, name: str,
-                                attrs: dict[str, str]) -> Optional[str]:
+    def _extract_parameter_name(self, name: str, attrs: dict[str, str]) -> str | None:
         """Extract parameter name from various formats"""
-        if attrs and 'name' in attrs:
-            return attrs['name']
+        if attrs and "name" in attrs:
+            return attrs["name"]
 
-        if '=' in name:
-            parts = name.split('=', 1)
-            if len(parts) == 2 and parts[0] == 'parameter':
+        if "=" in name:
+            parts = name.split("=", 1)
+            if len(parts) == 2 and parts[0] == "parameter":
                 return parts[1]
 
         return None
@@ -905,30 +996,36 @@ class StreamingXMLToolCallParser:
             Parameter type
         """
         if not self.tools or not self.current_function_name:
-            return 'string'
+            return "string"
 
         for tool in self.tools:
-            if not hasattr(tool, 'type') or not (hasattr(
-                    tool, 'function') and hasattr(tool.function, 'name')):
+            if not hasattr(tool, "type") or not (
+                hasattr(tool, "function") and hasattr(tool.function, "name")
+            ):
                 continue
-            if tool.type == 'function' and \
-                tool.function.name == self.current_function_name:
-                if not hasattr(tool.function, 'parameters'):
-                    return 'string'
+            if (
+                tool.type == "function"
+                and tool.function.name == self.current_function_name
+            ):
+                if not hasattr(tool.function, "parameters"):
+                    return "string"
                 params = tool.function.parameters
-                if isinstance(params, dict) and 'properties' in params:
-                    properties = params['properties']
+                if isinstance(params, dict) and "properties" in params:
+                    properties = params["properties"]
                     if param_name in properties and isinstance(
-                            properties[param_name], dict):
+                        properties[param_name], dict
+                    ):
                         return self.repair_param_type(
-                            str(properties[param_name].get('type', 'string')))
+                            str(properties[param_name].get("type", "string"))
+                        )
                 elif isinstance(params, dict) and param_name in params:
                     param_config = params[param_name]
                     if isinstance(param_config, dict):
                         return self.repair_param_type(
-                            str(param_config.get('type', 'string')))
+                            str(param_config.get("type", "string"))
+                        )
                 break
-        return 'string'
+        return "string"
 
     def repair_param_type(self, param_type: str) -> str:
         """Repair unknown parameter types by treating them as string
@@ -938,21 +1035,25 @@ class StreamingXMLToolCallParser:
         Returns:
             Repaired parameter type
         """
-        if param_type in [
-                'string', 'str', 'text', 'varchar', 'char', 'enum'
-        ] or param_type.startswith('int') or param_type.startswith(
-                'uint'
-        ) or param_type.startswith('long') or param_type.startswith(
-                'short'
-        ) or param_type.startswith('unsigned') or param_type.startswith(
-                'num') or param_type.startswith('float') or param_type in [
-                    'boolean', 'bool', 'binary'
-                ] or (param_type in ["object", "array", "arr", "sequence"]
-                      or param_type.startswith("dict")
-                      or param_type.startswith("list")):
+        if (
+            param_type in ["string", "str", "text", "varchar", "char", "enum"]
+            or param_type.startswith("int")
+            or param_type.startswith("uint")
+            or param_type.startswith("long")
+            or param_type.startswith("short")
+            or param_type.startswith("unsigned")
+            or param_type.startswith("num")
+            or param_type.startswith("float")
+            or param_type in ["boolean", "bool", "binary"]
+            or (
+                param_type in ["object", "array", "arr", "sequence"]
+                or param_type.startswith("dict")
+                or param_type.startswith("list")
+            )
+        ):
             return param_type
         else:
-            return 'string'
+            return "string"
 
     def _convert_param_value(self, param_value: str, param_type: str) -> Any:
         """Convert value based on parameter type
@@ -963,42 +1064,51 @@ class StreamingXMLToolCallParser:
         Returns:
             Converted value
         """
-        if param_value.lower() == 'null':
+        if param_value.lower() == "null":
             return None
 
         param_type = param_type.strip().lower()
-        if param_type in ['string', 'str', 'text', 'varchar', 'char', 'enum']:
+        if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
             return param_value
-        elif (param_type.startswith('int') or param_type.startswith('uint')
-              or param_type.startswith('long')
-              or param_type.startswith('short')
-              or param_type.startswith('unsigned')):
+        elif (
+            param_type.startswith("int")
+            or param_type.startswith("uint")
+            or param_type.startswith("long")
+            or param_type.startswith("short")
+            or param_type.startswith("unsigned")
+        ):
             try:
                 return int(param_value)
             except (ValueError, TypeError):
                 logger.warning(
                     "Parsed value '%s' of parameter '%s' is not an integer "
-                    "in tool '%s', degenerating to string.", param_value)
+                    "in tool '%s', degenerating to string.",
+                    param_value,
+                )
             return param_value
-        elif param_type.startswith('num') or param_type.startswith('float'):
+        elif param_type.startswith("num") or param_type.startswith("float"):
             try:
                 float_param_value: float = float(param_value)
-                return float_param_value if float_param_value - int(
-                    float_param_value) != 0 else int(float_param_value)
+                return (
+                    float_param_value
+                    if float_param_value - int(float_param_value) != 0
+                    else int(float_param_value)
+                )
             except (ValueError, TypeError):
                 logger.warning(
                     "Parsed value '%s' of parameter '%s' is not a float "
-                    "in tool '%s', degenerating to string.", param_value)
+                    "in tool '%s', degenerating to string.",
+                    param_value,
+                )
             return param_value
-        elif param_type in ['boolean', 'bool', 'binary']:
+        elif param_type in ["boolean", "bool", "binary"]:
             param_value = param_value.lower()
-            return param_value == 'true'
+            return param_value == "true"
         else:
             return param_value
 
-    def _convert_for_json_streaming(self, converted_value: Any,
-                                    param_type: str) -> str:
-        """Convert converted_value based on 
+    def _convert_for_json_streaming(self, converted_value: Any, param_type: str) -> str:
+        """Convert converted_value based on
         whether it's empty and if type is string
         Args:
             converted_value: Converted value
@@ -1008,10 +1118,10 @@ class StreamingXMLToolCallParser:
             Converted string for streaming output
         """
         # Check if value is empty, but exclude numeric 0
-        if converted_value is None or converted_value == '':
-            return ''
+        if converted_value is None or converted_value == "":
+            return ""
 
-        if param_type in ['string', 'str', 'text', 'varchar', 'char', 'enum']:
+        if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
             # String type, remove double quotes
             return json.dumps(converted_value, ensure_ascii=False)[1:-1]
         else:
@@ -1023,7 +1133,7 @@ class StreamingXMLToolCallParser:
 
     def _reset_xml_parser_after_tool_call(self):
         """
-        Each tool_call is treated as a separate XML document, 
+        Each tool_call is treated as a separate XML document,
         so we need to reset the parser after each tool_call.
         """
 
@@ -1039,12 +1149,12 @@ class StreamingXMLToolCallParser:
         self.current_function_open = False
         self.parameters = {}
         self.current_param_name = None
-        self.current_param_value = ''
-        self.current_param_value_converted = ''
+        self.current_param_value = ""
+        self.current_param_value_converted = ""
         self.current_param_is_first = False
         self.should_emit_end_newline = False
         self.start_quote_emitted = False
-        self.text_content_buffer = ''
+        self.text_content_buffer = ""
 
         # Reset preprocessing and deferred parsing state
         self._pre_inside_parameter = False
@@ -1054,15 +1164,18 @@ class StreamingXMLToolCallParser:
         self.deferred_param_raw_value = ""
 
 
-@ToolParserManager.register_module("qwen3_xml")
 class Qwen3XMLToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
         self.parser = StreamingXMLToolCallParser()
 
-        logger.info("vLLM Successfully import tool parser %s !",
-                    self.__class__.__name__)
+        # Add missing attributes for compatibility with serving_chat.py
+        self.prev_tool_call_arr: list[dict] = []
+        self.streamed_args_for_tool: list[str] = []
+
+        logger.info(
+            "vLLM Successfully import tool parser %s !", self.__class__.__name__
+        )
 
     def extract_tool_calls(
         self,
@@ -1070,6 +1183,9 @@ class Qwen3XMLToolParser(ToolParser):
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
         self.parser.reset_streaming_state()
+        # Reset tool call tracking arrays for new extraction
+        self.prev_tool_call_arr = []
+        self.streamed_args_for_tool = []
         if request:
             self.parser.set_tools(request.tools)
         result = self.parser.parse_single_streaming_chunks(model_output)
@@ -1091,7 +1207,36 @@ class Qwen3XMLToolParser(ToolParser):
                                 name=tool_call.function.name,
                                 arguments=tool_call.function.arguments,
                             ),
-                        ))
+                        )
+                    )
+
+                    # Update tool call tracking arrays for compatibility
+                    tool_index = (
+                        tool_call.index
+                        if tool_call.index is not None
+                        else len(self.prev_tool_call_arr) - 1
+                    )
+
+                    # Ensure we have enough entries in our tracking arrays
+                    while len(self.prev_tool_call_arr) <= tool_index:
+                        self.prev_tool_call_arr.append({"name": "", "arguments": ""})
+                    while len(self.streamed_args_for_tool) <= tool_index:
+                        self.streamed_args_for_tool.append("")
+
+                    # Update tool call information
+                    self.prev_tool_call_arr[tool_index]["name"] = (
+                        tool_call.function.name
+                    )
+                    self.prev_tool_call_arr[tool_index]["arguments"] = (
+                        tool_call.function.arguments
+                    )
+
+                    # Update streamed arguments
+                    if tool_call.function.arguments:
+                        self.streamed_args_for_tool[tool_index] = (
+                            tool_call.function.arguments
+                        )
+
             return ExtractedToolCallInformation(
                 tool_calls=tool_calls,
                 tools_called=len(tool_calls) > 0,
@@ -1107,9 +1252,12 @@ class Qwen3XMLToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if not previous_text:
             self.parser.reset_streaming_state()
+            # Reset tool call tracking arrays for new streaming session
+            self.prev_tool_call_arr = []
+            self.streamed_args_for_tool = []
             if request:
                 self.parser.set_tools(request.tools)
 
@@ -1119,19 +1267,50 @@ class Qwen3XMLToolParser(ToolParser):
         # to correctly output tool_call field
         if not delta_text and delta_token_ids:
             open_calls = current_text.count(
-                self.parser.tool_call_start_token) - current_text.count(
-                    self.parser.tool_call_end_token)
-            if open_calls == 0 and self.parser.tool_call_index > 0:
-                # If current_call_id is None, use last_completed_call_id
-                call_id = self.parser.current_call_id or \
-                    self.parser.last_completed_call_id
-                return DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.parser.tool_call_index - 1,
-                        id=call_id,
-                        function=DeltaFunctionCall(arguments=''),
-                        type='function',
+                self.parser.tool_call_start_token
+            ) - current_text.count(self.parser.tool_call_end_token)
+            if (
+                open_calls == 0
+                and self.parser.tool_call_index > 0
+                or not self.parser.tool_call_index
+                and current_text
+            ):
+                return DeltaMessage(content="")
+            return None
+
+        # Parse the delta text and get the result
+        result = self.parser.parse_single_streaming_chunks(delta_text)
+
+        # Update tool call tracking arrays based on incremental parsing results
+        if result and result.tool_calls:
+            for tool_call in result.tool_calls:
+                if tool_call.function:
+                    tool_index = (
+                        tool_call.index
+                        if tool_call.index is not None
+                        else len(self.prev_tool_call_arr) - 1
                     )
-                ])
 
-        return self.parser.parse_single_streaming_chunks(delta_text)
+                    # Ensure we have enough entries in our tracking arrays
+                    while len(self.prev_tool_call_arr) <= tool_index:
+                        self.prev_tool_call_arr.append({"name": "", "arguments": ""})
+                    while len(self.streamed_args_for_tool) <= tool_index:
+                        self.streamed_args_for_tool.append("")
+
+                    # Update tool name if provided
+                    if tool_call.function.name:
+                        self.prev_tool_call_arr[tool_index]["name"] = (
+                            tool_call.function.name
+                        )
+
+                    # Update arguments incrementally
+                    if tool_call.function.arguments is not None:
+                        # Concatenate the incremental arguments
+                        # to the existing streamed arguments
+                        self.prev_tool_call_arr[tool_index]["arguments"] += (
+                            tool_call.function.arguments
+                        )
+                        self.streamed_args_for_tool[tool_index] += (
+                            tool_call.function.arguments
+                        )
+        return result
diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
index 95458f07ff2a2b99a1e22bbf404b550f1e2d6ccf..8aed7f0e9fc9687571855fc8fd5e44a063882f18 100644
--- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
@@ -7,25 +7,29 @@ import ast
 import json
 import uuid
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ChatCompletionToolsParam,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("seed_oss")
 class SeedOssToolParser(ToolParser):
     TOOL_CALL_START = "<seed:tool_call>"
     TOOL_CALL_END = "</seed:tool_call>"
@@ -51,33 +55,36 @@ class SeedOssToolParser(ToolParser):
         self.failed_count: int = 0
         self._reset_streaming_state()
 
-        self.tool_call_start_token_id = self.vocab.get(
-            self.tool_call_start_token)
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
         self.think_end_token_id = self.vocab.get(self.think_end_token)
 
-        if (self.tool_call_start_token_id is None
-                or self.tool_call_end_token_id is None):
+        if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
             raise RuntimeError(
                 "Seed_Oss XML parser: tokenizer did not include "
-                "<seed:tool_call> or its closing tag.")
+                "<seed:tool_call> or its closing tag."
+            )
 
         tool_start_re = re.escape(self.tool_call_start_token)
         tool_end_re = re.escape(self.tool_call_end_token)
 
         self.tool_call_complete_regex = re.compile(
-            rf"{tool_start_re}(.*?){tool_end_re}", re.DOTALL)
+            rf"{tool_start_re}(.*?){tool_end_re}", re.DOTALL
+        )
         self.tool_call_regex = re.compile(
-            rf"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$",
-            re.DOTALL)
+            rf"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$", re.DOTALL
+        )
 
         self.tool_call_function_regex = re.compile(
-            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
+            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
+        )
         self.tool_call_parameter_regex = re.compile(
-            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)
+            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL
+        )
 
-        logger.info("vLLM Seed-Oss XML tool parser loaded (%s).",
-                    self.__class__.__name__)
+        logger.info(
+            "vLLM Seed-Oss XML tool parser loaded (%s).", self.__class__.__name__
+        )
 
     def _generate_tool_call_id(self) -> str:
         """Generate a unique tool call ID."""
@@ -100,20 +107,17 @@ class SeedOssToolParser(ToolParser):
         self.json_closed = False
 
     def _parse_xml_function_call(
-            self, function_call_str: str,
-            tools: Optional[list[ChatCompletionToolsParam]]
-    ) -> Optional[ToolCall]:
-
+        self, function_call_str: str, tools: list[ChatCompletionToolsParam] | None
+    ) -> ToolCall | None:
         def get_arguments_config(func_name: str) -> dict:
             if tools is None:
                 return {}
             for config in tools:
                 if not hasattr(config, "type") or not (
-                        hasattr(config, "function")
-                        and hasattr(config.function, "name")):
+                    hasattr(config, "function") and hasattr(config.function, "name")
+                ):
                     continue
-                if (config.type == "function"
-                        and config.function.name == func_name):
+                if config.type == "function" and config.function.name == func_name:
                     if not hasattr(config.function, "parameters"):
                         return {}
                     params = config.function.parameters
@@ -123,12 +127,12 @@ class SeedOssToolParser(ToolParser):
                         return params
                     else:
                         return {}
-            logger.warning("Tool '%s' is not defined in the tools list.",
-                           func_name)
+            logger.warning("Tool '%s' is not defined in the tools list.", func_name)
             return {}
 
-        def convert_param_value(param_value: str, param_name: str,
-                                param_config: dict, func_name: str) -> Any:
+        def convert_param_value(
+            param_value: str, param_name: str, param_config: dict, func_name: str
+        ) -> Any:
             # Handle null value for any type
             if param_value.lower() == "null":
                 return None
@@ -138,44 +142,55 @@ class SeedOssToolParser(ToolParser):
                     logger.warning(
                         "Parsed parameter '%s' is not defined in "
                         "the tool parameters for tool '%s', "
-                        "directly returning the string value.", param_name,
-                        func_name)
+                        "directly returning the string value.",
+                        param_name,
+                        func_name,
+                    )
                 return param_value
 
-            if (isinstance(param_config[param_name], dict)
-                    and "type" in param_config[param_name]):
-                param_type = str(
-                    param_config[param_name]["type"]).strip().lower()
+            if (
+                isinstance(param_config[param_name], dict)
+                and "type" in param_config[param_name]
+            ):
+                param_type = str(param_config[param_name]["type"]).strip().lower()
             else:
                 param_type = "string"
-            if param_type in [
-                    "string", "str", "text", "varchar", "char", "enum"
-            ]:
+            if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
                 return param_value
-            elif (param_type.startswith("int") or param_type.startswith("uint")
-                  or param_type.startswith("long")
-                  or param_type.startswith("short")
-                  or param_type.startswith("unsigned")):
+            elif (
+                param_type.startswith("int")
+                or param_type.startswith("uint")
+                or param_type.startswith("long")
+                or param_type.startswith("short")
+                or param_type.startswith("unsigned")
+            ):
                 try:
                     param_value = int(param_value)  # type: ignore
                 except (ValueError, TypeError):
                     logger.warning(
                         "Parsed value '%s' of parameter '%s' is not an integer in tool "
-                        "'%s', degenerating to string.", param_value,
-                        param_name, func_name)
+                        "'%s', degenerating to string.",
+                        param_value,
+                        param_name,
+                        func_name,
+                    )
                 return param_value
-            elif param_type.startswith("num") or param_type.startswith(
-                    "float"):
+            elif param_type.startswith("num") or param_type.startswith("float"):
                 try:
                     float_param_value = float(param_value)
-                    param_value = float_param_value if float_param_value - int(
-                        float_param_value) != 0 else int(
-                            float_param_value)  # type: ignore
+                    param_value = (
+                        float_param_value  # type: ignore
+                        if float_param_value - int(float_param_value) != 0
+                        else int(float_param_value)  # type: ignore
+                    )
                 except (ValueError, TypeError):
                     logger.warning(
                         "Parsed value '%s' of parameter '%s' is not a float in tool "
-                        "'%s', degenerating to string.", param_value,
-                        param_name, func_name)
+                        "'%s', degenerating to string.",
+                        param_value,
+                        param_name,
+                        func_name,
+                    )
                 return param_value
             elif param_type in ["boolean", "bool", "binary"]:
                 param_value = param_value.lower()
@@ -183,7 +198,10 @@ class SeedOssToolParser(ToolParser):
                     logger.warning(
                         "Parsed value '%s' of parameter '%s' is not a boolean "
                         "(`true` of `false`) in tool '%s', degenerating to false.",
-                        param_value, param_name, func_name)
+                        param_value,
+                        param_name,
+                        func_name,
+                    )
                 return param_value == "true"
             else:
                 if param_type == "object" or param_type.startswith("dict"):
@@ -194,27 +212,33 @@ class SeedOssToolParser(ToolParser):
                         logger.warning(
                             "Parsed value '%s' of parameter '%s' is not a valid JSON "
                             "object in tool '%s', will try other methods to parse it.",
-                            param_value, param_name, func_name)
+                            param_value,
+                            param_name,
+                            func_name,
+                        )
                 try:
                     param_value = ast.literal_eval(param_value)
                 except (ValueError, SyntaxError):
                     logger.warning(
                         "Parsed value '%s' of parameter '%s' cannot be converted via "
                         "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
-                        param_value, param_name, func_name)
+                        param_value,
+                        param_name,
+                        func_name,
+                    )
                 return param_value
 
         # Extract function name
         end_index = function_call_str.index(">")
         function_name = function_call_str[:end_index]
         param_config = get_arguments_config(function_name)
-        parameters = function_call_str[end_index + 1:]
+        parameters = function_call_str[end_index + 1 :]
         param_dict = {}
         for match in self.tool_call_parameter_regex.findall(parameters):
             match_text = match[0] if match[0] else match[1]
             idx = match_text.index(">")
             param_name = match_text[:idx]
-            param_value = str(match_text[idx + 1:])
+            param_value = str(match_text[idx + 1 :])
             # Remove prefix and trailing \n
             if param_value.startswith("\n"):
                 param_value = param_value[1:]
@@ -222,12 +246,13 @@ class SeedOssToolParser(ToolParser):
                 param_value = param_value[:-1]
 
             param_dict[param_name] = convert_param_value(
-                param_value, param_name, param_config, function_name)
+                param_value, param_name, param_config, function_name
+            )
         return ToolCall(
             type="function",
-            function=FunctionCall(name=function_name,
-                                  arguments=json.dumps(param_dict,
-                                                       ensure_ascii=False)),
+            function=FunctionCall(
+                name=function_name, arguments=json.dumps(param_dict, ensure_ascii=False)
+            ),
         )
 
     def _get_function_calls(self, model_output: str) -> list[str]:
@@ -243,8 +268,7 @@ class SeedOssToolParser(ToolParser):
 
         raw_function_calls = []
         for tool_call in raw_tool_calls:
-            raw_function_calls.extend(
-                self.tool_call_function_regex.findall(tool_call))
+            raw_function_calls.extend(self.tool_call_function_regex.findall(tool_call))
 
         function_calls = [
             match[0] if match[0] else match[1] for match in raw_function_calls
@@ -258,16 +282,19 @@ class SeedOssToolParser(ToolParser):
     ) -> ExtractedToolCallInformation:
         # Quick check to avoid unnecessary processing
         if self.tool_call_prefix not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         # Check if both think start and end tokens are present
-        if (self.think_start_token in model_output
-                and self.think_end_token in model_output):
+        if (
+            self.think_start_token in model_output
+            and self.think_end_token in model_output
+        ):
             # Find the position of think end token
             think_end_index = model_output.find(self.think_end_token) + len(
-                self.think_end_token)
+                self.think_end_token
+            )
             # Extract content after think end token
             result_content = model_output[think_end_index:]
             thinking_content = model_output[:think_end_index]
@@ -278,9 +305,9 @@ class SeedOssToolParser(ToolParser):
         try:
             function_calls = self._get_function_calls(result_content)
             if len(function_calls) == 0:
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=model_output)
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
 
             tool_calls = [
                 self._parse_xml_function_call(function_call_str, request.tools)
@@ -291,19 +318,20 @@ class SeedOssToolParser(ToolParser):
             self.prev_tool_call_arr.clear()  # Clear previous calls
             for tool_call in tool_calls:
                 if tool_call:
-                    self.prev_tool_call_arr.append({
-                        "name":
-                        tool_call.function.name,
-                        "arguments":
-                        tool_call.function.arguments,
-                    })
+                    self.prev_tool_call_arr.append(
+                        {
+                            "name": tool_call.function.name,
+                            "arguments": tool_call.function.arguments,
+                        }
+                    )
 
             # Extract content before tool calls
-            tool_call_start_index = result_content.find(
-                self.tool_call_start_token)
+            tool_call_start_index = result_content.find(self.tool_call_start_token)
             tool_call_start_index = (
-                tool_call_start_index if tool_call_start_index >= 0 else
-                result_content.find(self.tool_call_prefix))
+                tool_call_start_index
+                if tool_call_start_index >= 0
+                else result_content.find(self.tool_call_prefix)
+            )
             content = thinking_content + result_content[:tool_call_start_index]
 
             return ExtractedToolCallInformation(
@@ -314,9 +342,9 @@ class SeedOssToolParser(ToolParser):
 
         except Exception:
             logger.exception("Error in extracting tool call from response.")
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -327,25 +355,25 @@ class SeedOssToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # If no delta text, return None unless
         # it's an EOS token after tool calls
         if not delta_text:
             # Check if this is an EOS token after all tool calls are complete
             # We check for tool calls in the text even if is_tool_call_started
             # is False because it might have been reset after processing all tools
-            if (delta_token_ids
-                    and self.tool_call_end_token_id not in delta_token_ids):
+            if delta_token_ids and self.tool_call_end_token_id not in delta_token_ids:
                 # Count complete tool calls
                 complete_calls = len(
-                    self.tool_call_complete_regex.findall(current_text))
+                    self.tool_call_complete_regex.findall(current_text)
+                )
 
                 # If we have completed tool calls and populated prev_tool_call_arr
                 if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                     # Check if all tool calls are closed
                     open_calls = current_text.count(
-                        self.tool_call_start_token) - current_text.count(
-                            self.tool_call_end_token)
+                        self.tool_call_start_token
+                    ) - current_text.count(self.tool_call_end_token)
                     if open_calls == 0:
                         # Return empty delta message to allow finish_reason processing
                         return DeltaMessage(content="")
@@ -375,16 +403,18 @@ class SeedOssToolParser(ToolParser):
 
                 # Check if there are more tool calls
                 if self.current_tool_index >= current_text.count(
-                        self.tool_call_start_token):
+                    self.tool_call_start_token
+                ):
                     # No more tool calls
                     self.is_tool_call_started = False
                 # Continue processing next tool
                 return None
 
         # Check if end thinking
-        if (not self.is_thinking_end
-                and (self.think_end_token_id in delta_token_ids
-                     or self.think_end_token in delta_text)):
+        if not self.is_thinking_end and (
+            self.think_end_token_id in delta_token_ids
+            or self.think_end_token in delta_text
+        ):
             self.is_thinking_end = True
 
         # If thinking hasn't ended yet, don't process any tool calls
@@ -394,20 +424,25 @@ class SeedOssToolParser(ToolParser):
         # Handle normal content before tool calls
         if not self.is_tool_call_started:
             # Check if tool call is starting
-            if (self.tool_call_start_token_id in delta_token_ids
-                    or self.tool_call_start_token in delta_text):
+            if (
+                self.tool_call_start_token_id in delta_token_ids
+                or self.tool_call_start_token in delta_text
+            ):
                 self.is_tool_call_started = True
                 # Return any content before the tool call
                 if self.tool_call_start_token in delta_text:
-                    content_before = delta_text[:delta_text.index(
-                        self.tool_call_start_token)]
+                    content_before = delta_text[
+                        : delta_text.index(self.tool_call_start_token)
+                    ]
                     if content_before:
                         return DeltaMessage(content=content_before)
                 return None
             else:
                 # Check if we're between tool calls - skip whitespace
-                if (current_text.rstrip().endswith(self.tool_call_end_token)
-                        and delta_text.strip() == ""):
+                if (
+                    current_text.rstrip().endswith(self.tool_call_end_token)
+                    and delta_text.strip() == ""
+                ):
                     # We just ended a tool call, skip whitespace
                     return None
                 # Normal content, no tool call
@@ -423,9 +458,11 @@ class SeedOssToolParser(ToolParser):
         # We're in a tool call, find the current tool call portion
         # Need to find the correct tool call based on current_tool_index
         # Only process tool calls after think_end_token
-        think_end_index = current_text.find(self.think_end_token) + len(
-            self.think_end_token
-        ) if self.think_end_token in current_text else 0
+        think_end_index = (
+            current_text.find(self.think_end_token) + len(self.think_end_token)
+            if self.think_end_token in current_text
+            else 0
+        )
         tool_starts: list[int] = []
         idx = think_end_index
         while True:
@@ -441,26 +478,26 @@ class SeedOssToolParser(ToolParser):
 
         tool_start_idx = tool_starts[self.current_tool_index]
         # Find where this tool call ends (or current position if not ended yet)
-        tool_end_idx = current_text.find(self.tool_call_end_token,
-                                         tool_start_idx)
+        tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx)
         if tool_end_idx == -1:
             tool_text = current_text[tool_start_idx:]
         else:
-            tool_text = current_text[tool_start_idx:tool_end_idx +
-                                     len(self.tool_call_end_token)]
+            tool_text = current_text[
+                tool_start_idx : tool_end_idx + len(self.tool_call_end_token)
+            ]
 
         # Looking for function header
         if not self.header_sent:
             if self.tool_call_prefix in tool_text:
                 func_start = tool_text.find(self.tool_call_prefix) + len(
-                    self.tool_call_prefix)
+                    self.tool_call_prefix
+                )
                 func_end = tool_text.find(">", func_start)
 
                 if func_end != -1:
                     # Found complete function name
                     self.current_function_name = tool_text[func_start:func_end]
-                    self.current_tool_id = self._generate_tool_call_id(
-                    )  # type: ignore
+                    self.current_tool_id = self._generate_tool_call_id()  # type: ignore
                     self.header_sent = True
                     self.in_function = True
 
@@ -468,38 +505,44 @@ class SeedOssToolParser(ToolParser):
                     # This ensures finish_reason="tool_calls" even if parsing isn't complete
                     already_added = any(
                         tool.get("name") == self.current_function_name
-                        for tool in self.prev_tool_call_arr)
+                        for tool in self.prev_tool_call_arr
+                    )
                     if not already_added:
-                        self.prev_tool_call_arr.append({
-                            "name": self.current_function_name,
-                            "arguments":
-                            "{}",  # Placeholder, will be updated later
-                        })
+                        self.prev_tool_call_arr.append(
+                            {
+                                "name": self.current_function_name,
+                                "arguments": "{}",  # Placeholder, will be updated later
+                            }
+                        )
 
                     # Send header with function info
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_index,
-                            id=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                name=self.current_function_name, arguments=""),
-                            type="function",
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                id=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    name=self.current_function_name, arguments=""
+                                ),
+                                type="function",
+                            )
+                        ]
+                    )
             return None
 
         # We've sent header, now handle function body
         if self.in_function:
             # Send opening brace if not sent yet
-            if (not self.json_started
-                    and self.parameter_prefix not in delta_text):
+            if not self.json_started and self.parameter_prefix not in delta_text:
                 self.json_started = True
-                return DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_index,
-                        function=DeltaFunctionCall(arguments="{"),
-                    )
-                ])
+                return DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            function=DeltaFunctionCall(arguments="{"),
+                        )
+                    ]
+                )
 
             # Make sure json_started is set if we're processing parameters
             if not self.json_started:
@@ -513,34 +556,38 @@ class SeedOssToolParser(ToolParser):
                 # Extract the complete tool call to update prev_tool_call_arr with final arguments
                 # Find the function content
                 func_start = tool_text.find(self.tool_call_prefix) + len(
-                    self.tool_call_prefix)
-                func_content_end = tool_text.find(self.function_end_token,
-                                                  func_start)
+                    self.tool_call_prefix
+                )
+                func_content_end = tool_text.find(self.function_end_token, func_start)
                 if func_content_end != -1:
                     func_content = tool_text[func_start:func_content_end]
                     # Parse to get the complete arguments
                     try:
                         parsed_tool = self._parse_xml_function_call(
-                            func_content, request.tools if request else None)
+                            func_content, request.tools if request else None
+                        )
                         if parsed_tool:
                             # Update existing entry in prev_tool_call_arr with complete arguments
                             for i, tool in enumerate(self.prev_tool_call_arr):
-                                if tool.get(
-                                        "name") == parsed_tool.function.name:
+                                if tool.get("name") == parsed_tool.function.name:
                                     self.prev_tool_call_arr[i]["arguments"] = (
-                                        parsed_tool.function.arguments)
+                                        parsed_tool.function.arguments
+                                    )
                                     break
                     except Exception:
                         logger.warning(
                             "Failed to parse tool arguments during streaming.",
-                            exc_info=True)
+                            exc_info=True,
+                        )
 
-                result = DeltaMessage(tool_calls=[
-                    DeltaToolCall(
-                        index=self.current_tool_index,
-                        function=DeltaFunctionCall(arguments="}"),
-                    )
-                ])
+                result = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            function=DeltaFunctionCall(arguments="}"),
+                        )
+                    ]
+                )
 
                 # Reset state for next tool
                 self.in_function = False
@@ -583,8 +630,7 @@ class SeedOssToolParser(ToolParser):
                             value_text = value_text[1:]
 
                         # Find where this parameter ends
-                        param_end_idx = value_text.find(
-                            self.parameter_end_token)
+                        param_end_idx = value_text.find(self.parameter_end_token)
                         if param_end_idx != -1:
                             # Complete parameter found
                             param_value = value_text[:param_end_idx]
@@ -594,22 +640,33 @@ class SeedOssToolParser(ToolParser):
                             # Build complete JSON fragment for this parameter
                             if self.param_count == 0:
                                 json_fragment = (
-                                    '"' + self.current_param_name + '": "' +
-                                    json.dumps(param_value)[1:-1] + '"')
+                                    '"'
+                                    + self.current_param_name
+                                    + '": "'
+                                    + json.dumps(param_value)[1:-1]
+                                    + '"'
+                                )
                             else:
                                 json_fragment = (
-                                    ', "' + self.current_param_name + '": "' +
-                                    json.dumps(param_value)[1:-1] + '"')
+                                    ', "'
+                                    + self.current_param_name
+                                    + '": "'
+                                    + json.dumps(param_value)[1:-1]
+                                    + '"'
+                                )
 
                             self.param_count += 1
 
-                            return DeltaMessage(tool_calls=[
-                                DeltaToolCall(
-                                    index=self.current_tool_index,
-                                    function=DeltaFunctionCall(
-                                        arguments=json_fragment),
-                                )
-                            ])
+                            return DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=self.current_tool_index,
+                                        function=DeltaFunctionCall(
+                                            arguments=json_fragment
+                                        ),
+                                    )
+                                ]
+                            )
 
             # Continue parameter value
             if self.in_param:
@@ -621,29 +678,34 @@ class SeedOssToolParser(ToolParser):
                     # Skip past > if at start
                     if not self.current_param_value and ">" in value_chunk:
                         gt_idx = value_chunk.find(">")
-                        value_chunk = value_chunk[gt_idx + 1:]
+                        value_chunk = value_chunk[gt_idx + 1 :]
 
-                    if not self.current_param_value and value_chunk.startswith(
-                            "\n"):
+                    if not self.current_param_value and value_chunk.startswith("\n"):
                         value_chunk = value_chunk[1:]
 
                     # Calculate incremental JSON
                     full_value = self.current_param_value + value_chunk
-                    prev_escaped = (json.dumps(self.current_param_value)[1:-1]
-                                    if self.current_param_value else "")
+                    prev_escaped = (
+                        json.dumps(self.current_param_value)[1:-1]
+                        if self.current_param_value
+                        else ""
+                    )
                     full_escaped = json.dumps(full_value)[1:-1]
-                    delta_escaped = full_escaped[len(prev_escaped):]
+                    delta_escaped = full_escaped[len(prev_escaped) :]
 
                     self.in_param = False
                     self.current_param_value = ""
 
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_index,
-                            function=DeltaFunctionCall(
-                                arguments=delta_escaped + '"'),
-                        )
-                    ])
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                function=DeltaFunctionCall(
+                                    arguments=delta_escaped + '"'
+                                ),
+                            )
+                        ]
+                    )
                 else:
                     # Continue accumulating value
                     value_chunk = delta_text
@@ -651,29 +713,32 @@ class SeedOssToolParser(ToolParser):
                     # Handle first chunk after param name
                     if not self.current_param_value and ">" in value_chunk:
                         gt_idx = value_chunk.find(">")
-                        value_chunk = value_chunk[gt_idx + 1:]
+                        value_chunk = value_chunk[gt_idx + 1 :]
 
-                    if not self.current_param_value and value_chunk.startswith(
-                            "\n"):
+                    if not self.current_param_value and value_chunk.startswith("\n"):
                         value_chunk = value_chunk[1:]
 
                     if value_chunk:
                         # Stream the escaped delta
-                        prev_escaped = (json.dumps(
-                            self.current_param_value)[1:-1]
-                                        if self.current_param_value else "")
+                        prev_escaped = (
+                            json.dumps(self.current_param_value)[1:-1]
+                            if self.current_param_value
+                            else ""
+                        )
                         self.current_param_value += value_chunk
-                        full_escaped = json.dumps(
-                            self.current_param_value)[1:-1]
-                        delta_escaped = full_escaped[len(prev_escaped):]
+                        full_escaped = json.dumps(self.current_param_value)[1:-1]
+                        delta_escaped = full_escaped[len(prev_escaped) :]
 
                         if delta_escaped:
-                            return DeltaMessage(tool_calls=[
-                                DeltaToolCall(
-                                    index=self.current_tool_index,
-                                    function=DeltaFunctionCall(
-                                        arguments=delta_escaped),
-                                )
-                            ])
+                            return DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=self.current_tool_index,
+                                        function=DeltaFunctionCall(
+                                            arguments=delta_escaped
+                                        ),
+                                    )
+                                ]
+                            )
 
         return None
diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
index a20d18eb5254406c1f1606d0e69c95b4f407fdff..adcb9f47654736cb9e37b051ebbd702be934ddc9 100644
--- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
@@ -4,17 +4,22 @@
 import contextlib
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
@@ -22,7 +27,6 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module(["step3"])
 class Step3ToolParser(ToolParser):
     """
     Tool parser for a model that uses a specific XML-like format for tool calls.
@@ -35,9 +39,7 @@ class Step3ToolParser(ToolParser):
     TOOL_CALL_BEGIN = "<｜tool_call_begin｜>"
     TOOL_CALL_END = "<｜tool_call_end｜>"
     TOOL_SEP = "<｜tool_sep｜>"
-    SPECIAL_TOKENS = [
-        TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END
-    ]
+    SPECIAL_TOKENS = [TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END]
 
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
@@ -46,18 +48,17 @@ class Step3ToolParser(ToolParser):
         self.tool_block_started = False
         self.tool_block_finished = False
 
-    def adjust_request(
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
-        if request.tools and request.tool_choice != 'none':
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
             request.skip_special_tokens = False
         return request
 
     @staticmethod
     def _parse_steptml_invoke(
-            action_text: str
-    ) -> tuple[Optional[str], Optional[dict[str, str]]]:
-        func_name_match = re.search(r'<steptml:invoke name="([^"]+)">',
-                                    action_text)
+        action_text: str,
+    ) -> tuple[str | None, dict[str, str] | None]:
+        func_name_match = re.search(r'<steptml:invoke name="([^"]+)">', action_text)
         if not func_name_match:
             return None, None
         func_name = func_name_match.group(1)
@@ -65,7 +66,8 @@ class Step3ToolParser(ToolParser):
         params: dict[str, str] = {}
         param_matches = re.findall(
             r'<steptml:parameter name="([^"]+)">([^<]*)</steptml:parameter>',
-            action_text)
+            action_text,
+        )
         for name, value in param_matches:
             params[name] = value.strip()
         return func_name, params
@@ -95,11 +97,13 @@ class Step3ToolParser(ToolParser):
                             params[key] = float(value)
                     elif typ == "boolean":
                         lower_val = value.lower()
-                        params[key] = lower_val == "true" if lower_val in (
-                            "true", "false") else value
+                        params[key] = (
+                            lower_val == "true"
+                            if lower_val in ("true", "false")
+                            else value
+                        )
                     elif typ == "null":
-                        params[key] = None if value.lower(
-                        ) == "null" else value
+                        params[key] = None if value.lower() == "null" else value
                 break
         return params
 
@@ -112,14 +116,13 @@ class Step3ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
-
+    ) -> DeltaMessage | None:
         # The main loop processes the stream from the last known position.
         while True:
             if self.position >= len(current_text):
                 return None  # We've processed the entire stream.
 
-            unprocessed_text = current_text[self.position:]
+            unprocessed_text = current_text[self.position :]
 
             # STATE: After all tools are done, all subsequent text is content.
             if self.tool_block_finished:
@@ -135,8 +138,10 @@ class Step3ToolParser(ToolParser):
 
                 start_pos = unprocessed_text.find(self.TOOL_CALLS_BEGIN)
                 if start_pos == -1:
-                    if self.TOOL_CALLS_BEGIN.startswith(
-                            unprocessed_text.strip()) and unprocessed_text:
+                    if (
+                        self.TOOL_CALLS_BEGIN.startswith(unprocessed_text.strip())
+                        and unprocessed_text
+                    ):
                         return None  # It's a prefix, wait.
                     self.position = len(current_text)
                     return DeltaMessage(content=unprocessed_text)
@@ -157,9 +162,9 @@ class Step3ToolParser(ToolParser):
                 continue
 
             # Check if we are between tool calls.
-            tool_finished = (
-                self.current_tool_id != -1 and
-                self.prev_tool_call_arr[self.current_tool_id].get("finished"))
+            tool_finished = self.current_tool_id != -1 and self.prev_tool_call_arr[
+                self.current_tool_id
+            ].get("finished")
             if self.current_tool_id == -1 or tool_finished:
                 if unprocessed_text.startswith(self.TOOL_CALL_BEGIN):
                     self.position += len(self.TOOL_CALL_BEGIN)
@@ -170,8 +175,7 @@ class Step3ToolParser(ToolParser):
                     self.current_tool_name_sent = False
                     while len(self.prev_tool_call_arr) <= self.current_tool_id:
                         self.prev_tool_call_arr.append({})
-                    self.prev_tool_call_arr[
-                        self.current_tool_id]["finished"] = False
+                    self.prev_tool_call_arr[self.current_tool_id]["finished"] = False
                     continue
 
                 if self.TOOL_CALL_BEGIN.startswith(unprocessed_text):
@@ -179,63 +183,65 @@ class Step3ToolParser(ToolParser):
 
             # STATE: Parsing an active tool call.
             if self.current_tool_id != -1 and not self.prev_tool_call_arr[
-                    self.current_tool_id].get("finished", False):
+                self.current_tool_id
+            ].get("finished", False):
                 end_tool_pos = unprocessed_text.find(self.TOOL_CALL_END)
                 if end_tool_pos == -1:
                     tool_body = unprocessed_text
                 else:
                     tool_body = unprocessed_text[:end_tool_pos]
 
-                if end_tool_pos == -1 and self.TOOL_CALL_END.startswith(
-                        tool_body):
+                if end_tool_pos == -1 and self.TOOL_CALL_END.startswith(tool_body):
                     return None
 
-                function_name, arguments = self._parse_steptml_invoke(
-                    tool_body)
+                function_name, arguments = self._parse_steptml_invoke(tool_body)
                 if not function_name:
                     return None
 
-                tool_call_arr = {
-                    "name": function_name,
-                    "parameters": arguments or {}
-                }
+                tool_call_arr = {"name": function_name, "parameters": arguments or {}}
 
                 # Send the function name as soon as it's parsed.
                 if not self.current_tool_name_sent:
                     self.current_tool_name_sent = True
-                    self.prev_tool_call_arr[self.current_tool_id].update(
-                        tool_call_arr)
-                    return DeltaMessage(tool_calls=[
-                        DeltaToolCall(index=self.current_tool_id,
-                                      type="function",
-                                      id=f"chatcmpl-tool-{random_uuid()}",
-                                      function=DeltaFunctionCall(
-                                          name=function_name))
-                    ])
+                    self.prev_tool_call_arr[self.current_tool_id].update(tool_call_arr)
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                type="function",
+                                id=f"chatcmpl-tool-{random_uuid()}",
+                                function=DeltaFunctionCall(name=function_name),
+                            )
+                        ]
+                    )
 
                 # Update our internal state with the latest parsed arguments.
-                self.prev_tool_call_arr[
-                    self.current_tool_id].update(  # noqa: E501
-                        tool_call_arr)
+                self.prev_tool_call_arr[self.current_tool_id].update(  # noqa: E501
+                    tool_call_arr
+                )
 
                 # Only send arguments when the tool call is complete.
                 if end_tool_pos != -1:
                     self.position += end_tool_pos + len(self.TOOL_CALL_END)
-                    self.prev_tool_call_arr[
-                        self.current_tool_id]["finished"] = True
+                    self.prev_tool_call_arr[self.current_tool_id]["finished"] = True
 
                     final_args = self._cast_arguments(
                         function_name,
                         tool_call_arr.get("parameters", {}),  # type: ignore
-                        request)
+                        request,
+                    )
                     if final_args:
-                        final_args_json = json.dumps(final_args,
-                                                     ensure_ascii=False)
-                        return DeltaMessage(tool_calls=[
-                            DeltaToolCall(index=self.current_tool_id,
-                                          function=DeltaFunctionCall(
-                                              arguments=final_args_json))
-                        ])
+                        final_args_json = json.dumps(final_args, ensure_ascii=False)
+                        return DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    function=DeltaFunctionCall(
+                                        arguments=final_args_json
+                                    ),
+                                )
+                            ]
+                        )
 
                 # If tool is not finished, return None to wait for more tokens.
                 return None
@@ -248,15 +254,15 @@ class Step3ToolParser(ToolParser):
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
         if self.TOOL_CALLS_BEGIN not in model_output:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         pre_text, rest = model_output.split(self.TOOL_CALLS_BEGIN, 1)
         if self.TOOL_CALLS_END not in rest:
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
         tool_block, post_text = rest.split(self.TOOL_CALLS_END, 1)
         content = (pre_text + post_text).strip()
@@ -276,21 +282,22 @@ class Step3ToolParser(ToolParser):
             if type_part.strip() != "function":
                 continue
 
-            function_name, params_dict = self._parse_steptml_invoke(
-                invoke_part)
+            function_name, params_dict = self._parse_steptml_invoke(invoke_part)
 
             if function_name and params_dict is not None:
-                params_dict = self._cast_arguments(function_name, params_dict,
-                                                   request)
+                params_dict = self._cast_arguments(function_name, params_dict, request)
                 params_str = json.dumps(params_dict, ensure_ascii=False)
                 tool_calls.append(
-                    ToolCall(function=FunctionCall(name=function_name,
-                                                   arguments=params_str)))
+                    ToolCall(
+                        function=FunctionCall(name=function_name, arguments=params_str)
+                    )
+                )
         if tool_calls:
             return ExtractedToolCallInformation(
                 tools_called=True,
                 tool_calls=tool_calls,
-                content=content if content else None)
-        return ExtractedToolCallInformation(tools_called=False,
-                                            tool_calls=[],
-                                            content=model_output)
+                content=content if content else None,
+            )
+        return ExtractedToolCallInformation(
+            tools_called=False, tool_calls=[], content=model_output
+        )
diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index aa41cd6dc53ed1692487f7aded2205e3e82a6623..570eb447a46781359ddaa0601b41be64d2be3775 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -6,8 +6,18 @@ from json import JSONDecodeError, JSONDecoder
 from typing import Any
 
 import partial_json_parser
+from openai.types.responses import (
+    FunctionTool,
+    ToolChoiceFunction,
+)
+from openai.types.responses.tool import Tool
 from partial_json_parser.core.options import Allow
 
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionToolsParam,
+)
+
 
 def find_common_prefix(s1: str, s2: str) -> str:
     """
@@ -22,7 +32,7 @@ def find_common_prefix(s1: str, s2: str) -> str:
     e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') ->
     '{"fruit": "ap'
     """
-    prefix = ''
+    prefix = ""
     min_length = min(len(s1), len(s2))
     for i in range(0, min_length):
         if s1[i] == s2[i]:
@@ -40,7 +50,7 @@ def find_common_suffix(s1: str, s2: str) -> str:
 
     e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}'
     """
-    suffix = ''
+    suffix = ""
     min_length = min(len(s1), len(s2))
     for i in range(1, min_length + 1):
         if s1[-i] == s2[-i] and not s1[-i].isalnum():
@@ -70,15 +80,15 @@ def extract_intermediate_diff(curr: str, old: str) -> str:
     """
     suffix = find_common_suffix(curr, old)
 
-    old = old[::-1].replace(suffix[::-1], '', 1)[::-1]
+    old = old[::-1].replace(suffix[::-1], "", 1)[::-1]
     prefix = find_common_prefix(curr, old)
     diff = curr
     if len(suffix):
-        diff = diff[::-1].replace(suffix[::-1], '', 1)[::-1]
+        diff = diff[::-1].replace(suffix[::-1], "", 1)[::-1]
 
     if len(prefix):
         # replace the prefix only once in case it's mirrored
-        diff = diff.replace(prefix, '', 1)
+        diff = diff.replace(prefix, "", 1)
 
     return diff
 
@@ -122,3 +132,98 @@ def consume_space(i: int, s: str) -> int:
     while i < len(s) and s[i].isspace():
         i += 1
     return i
+
+
+def _extract_tool_info(
+    tool: Tool | ChatCompletionToolsParam,
+) -> tuple[str, dict[str, Any] | None]:
+    if isinstance(tool, FunctionTool):
+        return tool.name, tool.parameters
+    elif isinstance(tool, ChatCompletionToolsParam):
+        return tool.function.name, tool.function.parameters
+    else:
+        raise TypeError(f"Unsupported tool type: {type(tool)}")
+
+
+def _get_tool_schema_from_tool(tool: Tool | ChatCompletionToolsParam) -> dict:
+    name, params = _extract_tool_info(tool)
+    params = params if params else {"type": "object", "properties": {}}
+    return {
+        "properties": {
+            "name": {"type": "string", "enum": [name]},
+            "parameters": params,
+        },
+        "required": ["name", "parameters"],
+    }
+
+
+def _get_tool_schema_defs(
+    tools: list[Tool | ChatCompletionToolsParam],
+) -> dict:
+    all_defs: dict[str, dict[str, Any]] = {}
+    for tool in tools:
+        _, params = _extract_tool_info(tool)
+        if params is None:
+            continue
+        defs = params.pop("$defs", {})
+        for def_name, def_schema in defs.items():
+            if def_name in all_defs and all_defs[def_name] != def_schema:
+                raise ValueError(
+                    f"Tool definition '{def_name}' has multiple schemas, "
+                    "which is not supported."
+                )
+            all_defs[def_name] = def_schema
+    return all_defs
+
+
+def _get_json_schema_from_tools(
+    tools: list[Tool | ChatCompletionToolsParam],
+) -> dict:
+    json_schema = {
+        "type": "array",
+        "minItems": 1,
+        "items": {
+            "type": "object",
+            "anyOf": [_get_tool_schema_from_tool(tool) for tool in tools],
+        },
+    }
+    json_schema_defs = _get_tool_schema_defs(tools)
+    if json_schema_defs:
+        json_schema["$defs"] = json_schema_defs
+    return json_schema
+
+
+def get_json_schema_from_tools(
+    tool_choice: str | ToolChoiceFunction | ChatCompletionNamedToolChoiceParam,
+    tools: list[FunctionTool | ChatCompletionToolsParam] | None,
+) -> str | dict | None:
+    # tool_choice: "none"
+    if tool_choice in ("none", None) or tools is None:
+        return None
+    # tool_choice: Forced Function (Responses)
+    if (not isinstance(tool_choice, str)) and isinstance(
+        tool_choice, ToolChoiceFunction
+    ):
+        tool_name = tool_choice.name
+        tool_map = {tool.name: tool for tool in tools if isinstance(tool, FunctionTool)}
+        if tool_name not in tool_map:
+            raise ValueError(f"Tool '{tool_name}' has not been passed in `tools`.")
+        return tool_map[tool_name].parameters
+    # tool_choice: Forced Function (ChatCompletion)
+    if (not isinstance(tool_choice, str)) and isinstance(
+        tool_choice, ChatCompletionNamedToolChoiceParam
+    ):
+        tool_name = tool_choice.function.name
+        tool_map = {
+            tool.function.name: tool
+            for tool in tools
+            if isinstance(tool, ChatCompletionToolsParam)
+        }
+        if tool_name not in tool_map:
+            raise ValueError(f"Tool '{tool_name}' has not been passed in `tools`.")
+        return tool_map[tool_name].function.parameters
+    # tool_choice: "required"
+    if tool_choice == "required":
+        return _get_json_schema_from_tools(tools)
+    # tool_choice: "auto"
+    return None
diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
index 484e904cd8c3687f85e4f698662bbdc3c82a43b8..9d308af4de601062405b1f531521ee99dec38c37 100644
--- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
@@ -8,13 +8,18 @@ from typing import Any, Optional, Union
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaFunctionCall, DeltaMessage,
-                                              DeltaToolCall,
-                                              ExtractedToolCallInformation,
-                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
-    ToolParser, ToolParserManager)
+    ToolParser,
+)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
@@ -22,9 +27,7 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("xlam")
 class xLAMToolParser(ToolParser):
-
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
 
@@ -32,8 +35,7 @@ class xLAMToolParser(ToolParser):
         self.prev_tool_calls: list[dict] = []
         self.current_tool_id = -1
         self.current_tool_name_sent = False
-        self.streamed_args: list[str] = [
-        ]  # Track arguments sent for each tool
+        self.streamed_args: list[str] = []  # Track arguments sent for each tool
 
         # For backward compatibility with tests
         self.current_tools_sent: list[bool] = []
@@ -57,7 +59,8 @@ class xLAMToolParser(ToolParser):
         }
 
     def preprocess_model_output(
-            self, model_output: str) -> tuple[Optional[str], Optional[str]]:
+        self, model_output: str
+    ) -> tuple[Optional[str], Optional[str]]:
         """
         Preprocess the model output to extract content and potential tool calls.
         Returns:
@@ -66,8 +69,7 @@ class xLAMToolParser(ToolParser):
         # Check for thinking tag
         thinking_match = re.search(self.thinking_tag_pattern, model_output)
         if thinking_match:
-            content = model_output[:thinking_match.start() +
-                                   len("</think>")].strip()
+            content = model_output[: thinking_match.start() + len("</think>")].strip()
             thinking_content = thinking_match.group(1).strip()
 
             # Try to parse the thinking content as JSON
@@ -94,8 +96,7 @@ class xLAMToolParser(ToolParser):
                     try:
                         json.loads(json_str)
                         # Extract content by removing the JSON code block
-                        content = re.sub(json_pattern, "",
-                                         model_output).strip()
+                        content = re.sub(json_pattern, "", model_output).strip()
                         return content, json_str
                     except json.JSONDecodeError:
                         continue
@@ -107,28 +108,30 @@ class xLAMToolParser(ToolParser):
                 return None, model_output
             except json.JSONDecodeError:
                 # Even if it's not valid JSON yet, it might be a tool call in progress
-                if ("{" in model_output and "name" in model_output
-                        and "arguments" in model_output):
+                if (
+                    "{" in model_output
+                    and "name" in model_output
+                    and "arguments" in model_output
+                ):
                     return None, model_output
 
         # If no tool calls found, return the original output as content
         return model_output, None
 
     def extract_tool_calls(
-            self, model_output: str,
-            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
         """
         Extract tool calls from a complete model output.
         """
         try:
             # Preprocess the model output
-            content, potential_tool_calls = self.preprocess_model_output(
-                model_output)
+            content, potential_tool_calls = self.preprocess_model_output(model_output)
 
             if not potential_tool_calls:
-                return ExtractedToolCallInformation(tools_called=False,
-                                                    tool_calls=[],
-                                                    content=content)
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=content
+                )
 
             # Parse the potential tool calls as JSON
             tool_calls_data = json.loads(potential_tool_calls)
@@ -145,8 +148,11 @@ class xLAMToolParser(ToolParser):
             tool_calls: list[ToolCall] = []
 
             for idx, call in enumerate(tool_calls_data):
-                if (not isinstance(call, dict) or "name" not in call
-                        or "arguments" not in call):
+                if (
+                    not isinstance(call, dict)
+                    or "name" not in call
+                    or "arguments" not in call
+                ):
                     logger.debug("Invalid tool call format at index %d", idx)
                     continue
 
@@ -155,8 +161,11 @@ class xLAMToolParser(ToolParser):
                     type="function",
                     function=FunctionCall(
                         name=call["name"],
-                        arguments=(json.dumps(call["arguments"]) if isinstance(
-                            call["arguments"], dict) else call["arguments"]),
+                        arguments=(
+                            json.dumps(call["arguments"])
+                            if isinstance(call["arguments"], dict)
+                            else call["arguments"]
+                        ),
                     ),
                 )
                 tool_calls.append(tool_call)
@@ -169,9 +178,9 @@ class xLAMToolParser(ToolParser):
 
         except Exception as e:
             logger.exception("Error extracting tool calls: %s", str(e))
-            return ExtractedToolCallInformation(tools_called=False,
-                                                tool_calls=[],
-                                                content=model_output)
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -189,26 +198,36 @@ class xLAMToolParser(ToolParser):
         # First, check for a definitive start of a tool call block.
         # This prevents premature parsing of incomplete output.
         stripped_text = current_text.strip()
-        preprocessed_content, preprocessed_tool_calls = (
-            self.preprocess_model_output(current_text))
+        preprocessed_content, preprocessed_tool_calls = self.preprocess_model_output(
+            current_text
+        )
 
         # For JSON code blocks, we need to detect them earlier, even if incomplete
-        has_potential_json_block = ("```json" in current_text
-                                    or "```\n[" in current_text
-                                    or "[TOOL_CALLS]" in current_text
-                                    or "<tool_call>" in current_text)
+        has_potential_json_block = (
+            "```json" in current_text
+            or "```\n[" in current_text
+            or "[TOOL_CALLS]" in current_text
+            or "<tool_call>" in current_text
+        )
 
         is_tool_call_block = (
             stripped_text.startswith("[")
             or stripped_text.startswith("<tool_call>")
-            or stripped_text.startswith("[TOOL_CALLS]") or
+            or stripped_text.startswith("[TOOL_CALLS]")
+            or
             # Check if we have thinking tags with JSON-like content following
-            ("</think>[" in current_text) or
+            ("</think>[" in current_text)
+            or
             # Check if the text contains a JSON array after preprocessing
-            preprocessed_tool_calls is not None or
+            preprocessed_tool_calls is not None
+            or
             # For JSON code blocks, detect early if we see enough structure
-            (has_potential_json_block and '"name"' in current_text
-             and '"arguments"' in current_text))
+            (
+                has_potential_json_block
+                and '"name"' in current_text
+                and '"arguments"' in current_text
+            )
+        )
 
         if not is_tool_call_block:
             return DeltaMessage(content=delta_text)
@@ -225,8 +244,9 @@ class xLAMToolParser(ToolParser):
             # Try parsing as JSON to check for complete tool calls
             try:
                 # Use preprocessed tool calls if available
-                tool_calls_text = (preprocessed_tool_calls if
-                                   preprocessed_tool_calls else current_text)
+                tool_calls_text = (
+                    preprocessed_tool_calls if preprocessed_tool_calls else current_text
+                )
                 parsed_tools = json.loads(tool_calls_text)
                 if isinstance(parsed_tools, list):
                     # Update our tool array for next time
@@ -237,11 +257,15 @@ class xLAMToolParser(ToolParser):
 
             # Check for test-specific state setup (current_tools_sent)
             # This handles the case where tests manually set current_tools_sent
-            if (hasattr(self, "current_tools_sent")  # type: ignore
-                    and len(self.current_tools_sent) > 0):
+            if (
+                hasattr(self, "current_tools_sent")  # type: ignore
+                and len(self.current_tools_sent) > 0
+            ):
                 # If current_tools_sent is set to [False], it means the test wants us to send the name
-                if (len(self.current_tools_sent) == 1
-                        and self.current_tools_sent[0] is False):
+                if (
+                    len(self.current_tools_sent) == 1
+                    and self.current_tools_sent[0] is False
+                ):
                     # Extract the function name using regex
                     name_pattern = r'"name"\s*:\s*"([^"]+)"'
                     name_match = re.search(name_pattern, current_text)
@@ -250,51 +274,53 @@ class xLAMToolParser(ToolParser):
 
                         # The test expects us to send just the name first
                         tool_id = make_tool_call_id()
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(
-                                index=0,
-                                type="function",
-                                id=tool_id,
-                                function=DeltaFunctionCall(
-                                    name=function_name).model_dump(
-                                        exclude_none=True),  # type: ignore
-                            )
-                        ])
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=0,
+                                    type="function",
+                                    id=tool_id,
+                                    function=DeltaFunctionCall(
+                                        name=function_name
+                                    ).model_dump(exclude_none=True),  # type: ignore
+                                )
+                            ]
+                        )
                         # Update state to reflect that we've sent the name
                         self.current_tools_sent = [True]
                         self.current_tool_id = 0
                         self.streaming_state["current_tool_index"] = 0
                         if len(self.streaming_state["sent_tools"]) == 0:
-                            self.streaming_state["sent_tools"].append({
-                                "sent_name":
-                                True,
-                                "sent_arguments_prefix":
-                                False,
-                                "sent_arguments":
-                                "",
-                            })
+                            self.streaming_state["sent_tools"].append(
+                                {
+                                    "sent_name": True,
+                                    "sent_arguments_prefix": False,
+                                    "sent_arguments": "",
+                                }
+                            )
                         else:
-                            self.streaming_state["sent_tools"][0][
-                                "sent_name"] = True
+                            self.streaming_state["sent_tools"][0]["sent_name"] = True
                         self.current_tool_name_sent = True
                         return delta
 
             # Use regex to identify tool calls in the output
             # Use preprocessed tool calls text for better parsing, but also try to extract from incomplete JSON blocks
-            search_text = (preprocessed_tool_calls
-                           if preprocessed_tool_calls else current_text)
+            search_text = (
+                preprocessed_tool_calls if preprocessed_tool_calls else current_text
+            )
 
             # For JSON code blocks that aren't complete yet, try to extract the JSON content
             if not preprocessed_tool_calls and has_potential_json_block:
                 # Try to extract the JSON array from within the code block
-                json_match = re.search(r"```(?:json)?\s*([\s\S]*?)(?:```|$)",
-                                       current_text)
+                json_match = re.search(
+                    r"```(?:json)?\s*([\s\S]*?)(?:```|$)", current_text
+                )
                 if json_match:
                     potential_json = json_match.group(1).strip()
                     # Use this as search text even if it's incomplete
                     if potential_json.startswith("[") and (
-                            '"name"' in potential_json
-                            and '"arguments"' in potential_json):
+                        '"name"' in potential_json and '"arguments"' in potential_json
+                    ):
                         search_text = potential_json
 
             # Try to find complete tool names first
@@ -306,8 +332,7 @@ class xLAMToolParser(ToolParser):
             if tool_count == 0:
                 # Check if we're in the middle of parsing a tool name
                 partial_name_pattern = r'"name"\s*:\s*"([^"]*)'
-                partial_matches = list(
-                    re.finditer(partial_name_pattern, search_text))
+                partial_matches = list(re.finditer(partial_name_pattern, search_text))
                 if partial_matches:
                     # We have a partial tool name - not ready to emit yet
                     return None
@@ -317,14 +342,13 @@ class xLAMToolParser(ToolParser):
 
             # Ensure our state arrays are large enough
             while len(self.streaming_state["sent_tools"]) < tool_count:
-                self.streaming_state["sent_tools"].append({
-                    "sent_name":
-                    False,
-                    "sent_arguments_prefix":
-                    False,
-                    "sent_arguments":
-                    "",
-                })
+                self.streaming_state["sent_tools"].append(
+                    {
+                        "sent_name": False,
+                        "sent_arguments_prefix": False,
+                        "sent_arguments": "",
+                    }
+                )
 
             while len(self.streaming_state["tool_ids"]) < tool_count:
                 self.streaming_state["tool_ids"].append(None)
@@ -337,14 +361,13 @@ class xLAMToolParser(ToolParser):
                 next_idx = current_idx + 1
 
                 # If tool at next_idx has not been sent yet
-                if (next_idx < tool_count
-                        and not self.streaming_state["sent_tools"][next_idx]
-                    ["sent_name"]):
+                if (
+                    next_idx < tool_count
+                    and not self.streaming_state["sent_tools"][next_idx]["sent_name"]
+                ):
                     # Update indexes
                     self.streaming_state["current_tool_index"] = next_idx
-                    self.current_tool_id = (
-                        next_idx  # For backward compatibility
-                    )
+                    self.current_tool_id = next_idx  # For backward compatibility
                     current_idx = next_idx
 
                     # Extract the tool name
@@ -354,21 +377,20 @@ class xLAMToolParser(ToolParser):
                     tool_id = f"call_{current_idx}_{random_uuid()}"
                     self.streaming_state["tool_ids"][current_idx] = tool_id
 
-                    delta = DeltaMessage(tool_calls=[
-                        DeltaToolCall(
-                            index=current_idx,
-                            type="function",
-                            id=tool_id,
-                            function=DeltaFunctionCall(
-                                name=tool_name).model_dump(
-                                    exclude_none=True),  # type: ignore
-                        )
-                    ])
-                    self.streaming_state["sent_tools"][current_idx][
-                        "sent_name"] = True
-                    self.current_tool_name_sent = (
-                        True  # For backward compatibility
+                    delta = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=current_idx,
+                                type="function",
+                                id=tool_id,
+                                function=DeltaFunctionCall(name=tool_name).model_dump(
+                                    exclude_none=True
+                                ),  # type: ignore
+                            )
+                        ]
                     )
+                    self.streaming_state["sent_tools"][current_idx]["sent_name"] = True
+                    self.current_tool_name_sent = True  # For backward compatibility
 
                     # Keep track of streamed args for backward compatibility
                     while len(self.streamed_args) <= current_idx:
@@ -381,7 +403,8 @@ class xLAMToolParser(ToolParser):
                 # Support both regular and empty argument objects
                 # First, check for the empty arguments case: "arguments": {}
                 empty_args_pattern = (
-                    r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
+                    r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}'
+                )
                 empty_args_match = re.search(empty_args_pattern, search_text)
 
                 # Check if this tool has empty arguments
@@ -391,36 +414,39 @@ class xLAMToolParser(ToolParser):
                     for i in range(tool_count):
                         if i == current_idx:
                             # If this is our current tool and it has empty arguments
-                            if not self.streaming_state["sent_tools"][
-                                    current_idx]["sent_arguments_prefix"]:
+                            if not self.streaming_state["sent_tools"][current_idx][
+                                "sent_arguments_prefix"
+                            ]:
                                 # Send empty object
-                                self.streaming_state["sent_tools"][
-                                    current_idx][
-                                        "sent_arguments_prefix"] = True
-                                self.streaming_state["sent_tools"][
-                                    current_idx]["sent_arguments"] = "{}"
+                                self.streaming_state["sent_tools"][current_idx][
+                                    "sent_arguments_prefix"
+                                ] = True
+                                self.streaming_state["sent_tools"][current_idx][
+                                    "sent_arguments"
+                                ] = "{}"
 
                                 # Update streamed_args for backward compatibility
                                 while len(self.streamed_args) <= current_idx:
                                     self.streamed_args.append("")
                                 self.streamed_args[current_idx] += "{}"
 
-                                delta = DeltaMessage(tool_calls=[
-                                    DeltaToolCall(
-                                        index=current_idx,
-                                        function=DeltaFunctionCall(
-                                            arguments="{}").
-                                        model_dump(
-                                            exclude_none=True),  # type: ignore
-                                    )
-                                ])
+                                delta = DeltaMessage(
+                                    tool_calls=[
+                                        DeltaToolCall(
+                                            index=current_idx,
+                                            function=DeltaFunctionCall(
+                                                arguments="{}"
+                                            ).model_dump(exclude_none=True),  # type: ignore
+                                        )
+                                    ]
+                                )
 
                                 # Move to next tool if available
                                 if current_idx < tool_count - 1:
-                                    self.streaming_state[
-                                        "current_tool_index"] += 1
+                                    self.streaming_state["current_tool_index"] += 1
                                     self.current_tool_id = self.streaming_state[
-                                        "current_tool_index"]
+                                        "current_tool_index"
+                                    ]
 
                                 return delta
 
@@ -439,72 +465,77 @@ class xLAMToolParser(ToolParser):
                         # Parse the entire JSON structure to properly extract arguments for each tool
                         try:
                             parsed_tools = json.loads(search_text)
-                            if isinstance(
-                                    parsed_tools,
-                                    list) and current_idx < len(parsed_tools):
+                            if isinstance(parsed_tools, list) and current_idx < len(
+                                parsed_tools
+                            ):
                                 current_tool = parsed_tools[current_idx]
-                                if isinstance(current_tool.get("arguments"),
-                                              dict):
-                                    args_text = json.dumps(
-                                        current_tool["arguments"])
+                                if isinstance(current_tool.get("arguments"), dict):
+                                    args_text = json.dumps(current_tool["arguments"])
                                 else:
-                                    args_text = str(
-                                        current_tool.get("arguments", "{}"))
+                                    args_text = str(current_tool.get("arguments", "{}"))
                         except (json.JSONDecodeError, KeyError, IndexError):
                             # Fallback to regex-based extraction
                             pass
 
                     # If arguments haven't been sent yet
-                    sent_args = self.streaming_state["sent_tools"][
-                        current_idx]["sent_arguments"]
+                    sent_args = self.streaming_state["sent_tools"][current_idx][
+                        "sent_arguments"
+                    ]
 
                     # If we haven't sent the opening bracket yet
                     if not self.streaming_state["sent_tools"][current_idx][
-                            "sent_arguments_prefix"] and args_text.startswith(
-                                "{"):
+                        "sent_arguments_prefix"
+                    ] and args_text.startswith("{"):
                         self.streaming_state["sent_tools"][current_idx][
-                            "sent_arguments_prefix"] = True
+                            "sent_arguments_prefix"
+                        ] = True
                         self.streaming_state["sent_tools"][current_idx][
-                            "sent_arguments"] = "{"
+                            "sent_arguments"
+                        ] = "{"
 
                         # Update streamed_args for backward compatibility
                         while len(self.streamed_args) <= current_idx:
                             self.streamed_args.append("")
                         self.streamed_args[current_idx] += "{"
 
-                        delta = DeltaMessage(tool_calls=[
-                            DeltaToolCall(
-                                index=current_idx,
-                                function=DeltaFunctionCall(
-                                    arguments="{").model_dump(
-                                        exclude_none=True),  # type: ignore
-                            )
-                        ])
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=current_idx,
+                                    function=DeltaFunctionCall(
+                                        arguments="{"
+                                    ).model_dump(exclude_none=True),  # type: ignore
+                                )
+                            ]
+                        )
                         return delta
 
                     # If we need to send more arguments
                     if args_text.startswith(sent_args):
                         # Calculate what part of arguments we need to send
-                        args_diff = args_text[len(sent_args):]
+                        args_diff = args_text[len(sent_args) :]
 
                         if args_diff:
                             # Update our state
                             self.streaming_state["sent_tools"][current_idx][
-                                "sent_arguments"] = args_text
+                                "sent_arguments"
+                            ] = args_text
 
                             # Update streamed_args for backward compatibility
                             while len(self.streamed_args) <= current_idx:
                                 self.streamed_args.append("")
                             self.streamed_args[current_idx] += args_diff
 
-                            delta = DeltaMessage(tool_calls=[
-                                DeltaToolCall(
-                                    index=current_idx,
-                                    function=DeltaFunctionCall(
-                                        arguments=args_diff).model_dump(
-                                            exclude_none=True),  # type: ignore
-                                )
-                            ])
+                            delta = DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=current_idx,
+                                        function=DeltaFunctionCall(
+                                            arguments=args_diff
+                                        ).model_dump(exclude_none=True),  # type: ignore
+                                    )
+                                ]
+                            )
                             return delta
 
                     # If the tool's arguments are complete, check if we need to move to the next tool
@@ -513,7 +544,8 @@ class xLAMToolParser(ToolParser):
                         if current_idx < tool_count - 1:
                             self.streaming_state["current_tool_index"] += 1
                             self.current_tool_id = self.streaming_state[
-                                "current_tool_index"]  # For compatibility
+                                "current_tool_index"
+                            ]  # For compatibility
 
             # If we got here, we couldn't determine what to stream next
             return None
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index d7ce57c728ba690559ee53833357d34db3099b25..3c5a396a99f935b37a630c7d5e3831b94530cc24 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -5,7 +5,7 @@ import asyncio
 import io
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Annotated, Optional, Union
+from typing import Annotated
 
 import pybase64
 import torch
@@ -13,45 +13,67 @@ from pydantic import Field
 
 from vllm.config import ModelConfig
 from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
+from vllm.inputs.data import TextPrompt as EngineTextPrompt
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
-from vllm.inputs.parse import parse_and_batch_prompt
+from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import AsyncMicrobatchTokenizer
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer
 
 
 @dataclass(frozen=True)
 class RenderConfig:
     """Configuration to control how prompts are prepared."""
 
-    max_length: Optional[int] = None
+    max_length: int | None = None
     """Maximum allowable total input token length. If provided,
-    token inputs longer than this raise ``ValueError``."""
+    token inputs longer than this raise `ValueError`."""
 
-    truncate_prompt_tokens: Optional[int] = None
-    """Number of tokens to keep. ``None`` means no truncation.
-    ``0`` yields an empty list (and skips embeds).
-    ``-1`` maps to ``model_config.max_model_len``."""
+    truncate_prompt_tokens: int | None = None
+    """Number of tokens to keep. `None` means no truncation.
+    `0` yields an empty list (and skips embeds).
+    `-1` maps to `model_config.max_model_len`."""
 
-    add_special_tokens: Optional[bool] = True
+    add_special_tokens: bool | None = True
     """Whether to add model-specific special tokens during tokenization."""
 
-    cache_salt: Optional[str] = None
+    cache_salt: str | None = None
     """String to disambiguate prefix cache entries."""
 
-    needs_detokenization: Optional[bool] = False
+    needs_detokenization: bool | None = False
     """If True, detokenize IDs back to text for inclusion in outputs."""
 
+    def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
+        """Validate and normalize `truncate_prompt_tokens` parameter."""
+        truncate_prompt_tokens = self.truncate_prompt_tokens
+        if truncate_prompt_tokens is None:
+            return None
+
+        if truncate_prompt_tokens == 0:
+            return 0
+
+        if truncate_prompt_tokens < 0:
+            truncate_prompt_tokens = model_config.max_model_len
+
+        max_length = self.max_length
+        if max_length is not None and truncate_prompt_tokens > max_length:  # type: ignore[operator]
+            raise ValueError(
+                f"{truncate_prompt_tokens=} cannot be greater than "
+                f"{max_length=}. Please select a smaller truncation size."
+            )
+
+        return truncate_prompt_tokens
+
 
 class BaseRenderer(ABC):
     """
     Base class for unified input processing and rendering.
-    
+
     The Renderer serves as a unified input processor that consolidates
     tokenization, chat template formatting, and multimodal input handling
     into a single component.
     It converts high-level API requests (OpenAI-style JSON) into token IDs and
     multimodal features ready for engine consumption.
-    
+
     Key responsibilities:
     - Convert text prompts to token sequences with proper special tokens
     - Apply chat templates and format conversations
@@ -63,7 +85,7 @@ class BaseRenderer(ABC):
     def __init__(
         self,
         model_config: ModelConfig,
-        tokenizer: Optional[AnyTokenizer] = None,
+        tokenizer: AnyTokenizer | None = None,
     ):
         super().__init__()
         self.model_config = model_config
@@ -73,8 +95,8 @@ class BaseRenderer(ABC):
     async def render_prompt(
         self,
         *,
-        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
-        config: "RenderConfig",
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
+        config: RenderConfig,
     ) -> list[EngineTokensPrompt]:
         """
         Convert text or token inputs into engine-ready TokensPrompt objects.
@@ -85,12 +107,12 @@ class BaseRenderer(ABC):
 
         Args:
             prompt_or_prompts: One of:
-                - ``str``: Single text prompt.
-                - ``list[str]``: Batch of text prompts.
-                - ``list[int]``: Single pre-tokenized sequence.
-                - ``list[list[int]]``: Batch of pre-tokenized sequences.
+                - `str`: Single text prompt.
+                - `list[str]`: Batch of text prompts.
+                - `list[int]`: Single pre-tokenized sequence.
+                - `list[list[int]]`: Batch of pre-tokenized sequences.
             config: Render configuration controlling how prompts are prepared
-                (e.g., tokenization and length handling). 
+                (e.g., tokenization and length handling).
 
         Returns:
             list[EngineTokensPrompt]: Engine-ready token prompts.
@@ -104,45 +126,47 @@ class BaseRenderer(ABC):
     async def render_prompt_and_embeds(
         self,
         *,
-        prompt_or_prompts: Optional[Union[str, list[str], list[int],
-                                          list[list[int]]]] = None,
-        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
-        config: "RenderConfig",
-    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
+        prompt_embeds: bytes | list[bytes] | None = None,
+        config: RenderConfig,
+    ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
         """
         Convert text/token and/or base64-encoded embeddings inputs into
         engine-ready prompt objects using a unified RenderConfig.
 
-        At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be
+        At least one of `prompt_or_prompts` or `prompt_embeds` must be
         provided and non-empty. If both are omitted or empty (e.g., empty
-        string and empty list), a ``ValueError`` is raised.
+        string and empty list), a `ValueError` is raised.
 
         Args:
             prompt_or_prompts: Text or token inputs to include.
             prompt_embeds: Base64-encoded bytes (or list thereof) containing a
                 torch-saved tensor to be used as prompt embeddings.
             config: Render configuration controlling how prompts are prepared
-                (e.g., tokenization and length handling). 
+                (e.g., tokenization and length handling).
 
         Returns:
             list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
                 Engine-ready prompt objects.
 
         Raises:
-            ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds``
+            ValueError: If both `prompt_or_prompts` and `prompt_embeds`
                 are omitted or empty (decoder prompt cannot be empty), or if
                 length limits are exceeded.
         """
         raise NotImplementedError
 
-    @classmethod
     def load_prompt_embeds(
-        cls,
-        prompt_embeds: Union[bytes, list[bytes]],
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None,
-        cache_salt: Optional[str] = None,
+        self,
+        prompt_embeds: bytes | list[bytes],
+        truncate_prompt_tokens: Annotated[int, Field(ge=0)] | None = None,
+        cache_salt: str | None = None,
     ) -> list[EngineEmbedsPrompt]:
         """Load and validate base64-encoded embeddings into prompt objects."""
+        if not self.model_config.enable_prompt_embeds:
+            raise ValueError(
+                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
+            )
 
         def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt:
             tensor = torch.load(
@@ -173,87 +197,66 @@ class BaseRenderer(ABC):
 
 
 class CompletionRenderer(BaseRenderer):
-
     def __init__(
         self,
         model_config: ModelConfig,
-        tokenizer: Optional[AnyTokenizer] = None,
-        async_tokenizer_pool: Optional[dict[AnyTokenizer,
-                                            AsyncMicrobatchTokenizer]] = None,
+        tokenizer: AnyTokenizer | None = None,
+        async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer]
+        | None = None,
     ):
         super().__init__(model_config, tokenizer)
         self.async_tokenizer_pool = async_tokenizer_pool
-        self.async_tokenizer: Optional[AsyncMicrobatchTokenizer] = None
+        self.async_tokenizer: AsyncMicrobatchTokenizer | None = None
 
     async def render_prompt(
         self,
         *,
-        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
-        config: "RenderConfig",
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
+        config: RenderConfig,
     ) -> list[EngineTokensPrompt]:
         """Implementation of prompt rendering for completion-style requests.
-        
+
         Uses async tokenizer pooling for improved performance. See base class
         for detailed parameter documentation.
         """
-        truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens(
-            config.truncate_prompt_tokens, config.max_length)
+        truncate_prompt_tokens = config.verify_truncate_prompt_tokens(self.model_config)
         if truncate_prompt_tokens == 0:
             return []
 
-        # Parse and batch the input prompts
-        batch_inputs = parse_and_batch_prompt(prompt_or_prompts)
-
-        tasks = []
-        for prompt_input in batch_inputs:
-            if prompt_input["is_tokens"] is True:
-                # Token input
-                # Note: detokenization is needed when echo is enabled,
-                # where the input token IDs are decoded back to text.
-                task = self._maybe_detokenize(prompt_input["content"],
-                                              config.max_length,
-                                              truncate_prompt_tokens,
-                                              config.cache_salt,
-                                              config.needs_detokenization)
-            else:
-                # Text input
-                task = self._tokenize(prompt_input["content"],
-                                      config.max_length,
-                                      truncate_prompt_tokens,
-                                      config.add_special_tokens,
-                                      config.cache_salt)
-            tasks.append(task)
-
-        # Wait for all text tokenization to finish
-        if tasks:
-            tokenized_text_prompts = await asyncio.gather(*tasks)
-            return tokenized_text_prompts
-
-        return []
+        tasks = (
+            self._create_prompt(
+                prompt_input,
+                config=config,
+                truncate_prompt_tokens=truncate_prompt_tokens,
+            )
+            for prompt_input in parse_raw_prompts(prompt_or_prompts)
+        )
+
+        return await asyncio.gather(*tasks)
 
     async def render_prompt_and_embeds(
         self,
         *,
-        prompt_or_prompts: Optional[Union[str, list[str], list[int],
-                                          list[list[int]]]] = None,
-        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
-        config: "RenderConfig",
-    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
+        prompt_embeds: bytes | list[bytes] | None = None,
+        config: RenderConfig,
+    ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
         """
         Render text/token prompts and/or precomputed embedding prompts. At
         least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
         """
-        truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens(
-            config.truncate_prompt_tokens, config.max_length)
+        truncate_prompt_tokens = config.verify_truncate_prompt_tokens(self.model_config)
         if truncate_prompt_tokens == 0:
             return []
 
-        rendered: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]] = []
+        rendered: list[EngineTokensPrompt | EngineEmbedsPrompt] = []
 
         if prompt_embeds is not None:
             rendered.extend(
-                self.load_prompt_embeds(prompt_embeds, truncate_prompt_tokens,
-                                        config.cache_salt))
+                self.load_prompt_embeds(
+                    prompt_embeds, truncate_prompt_tokens, config.cache_salt
+                )
+            )
         if prompt_or_prompts is None or prompt_or_prompts == "":
             return rendered
 
@@ -265,32 +268,9 @@ class CompletionRenderer(BaseRenderer):
 
         return rendered
 
-    def _validate_and_normalize_truncate_tokens(
-        self,
-        truncate_prompt_tokens: Optional[int],
-        max_length: Optional[int],
-    ) -> Optional[int]:
-        """Validate and normalize truncate_prompt_tokens parameter."""
-        if truncate_prompt_tokens is None:
-            return None
-
-        if truncate_prompt_tokens == 0:
-            return 0
-
-        if truncate_prompt_tokens < 0:
-            truncate_prompt_tokens = self.model_config.max_model_len
-
-        if max_length is not None and truncate_prompt_tokens > max_length:  # type: ignore[operator]
-            raise ValueError(
-                f"truncate_prompt_tokens ({truncate_prompt_tokens}) "
-                f"cannot be greater than max_length ({max_length}). "
-                f"Please select a smaller truncation size.")
-
-        return truncate_prompt_tokens
-
     def _maybe_apply_truncation(
-            self, token_ids: list[int],
-            truncate_prompt_tokens: Optional[int]) -> list[int]:
+        self, token_ids: list[int], truncate_prompt_tokens: int | None
+    ) -> list[int]:
         """Apply truncation to token sequence."""
         if truncate_prompt_tokens is None:
             return token_ids
@@ -299,58 +279,92 @@ class CompletionRenderer(BaseRenderer):
 
         return token_ids[-truncate_prompt_tokens:]
 
-    async def _tokenize(
+    async def _create_prompt(
+        self,
+        prompt_input: EngineTextPrompt | EngineTokensPrompt,
+        config: RenderConfig,
+        truncate_prompt_tokens: int | None,
+    ) -> EngineTokensPrompt:
+        prompt, prompt_token_ids, _ = get_prompt_components(prompt_input)
+
+        if prompt_token_ids is not None:
+            # NOTE: detokenization is needed when echo is enabled,
+            # where the input token IDs are decoded back to text.
+            return await self._create_prompt_from_token_ids(
+                prompt_token_ids,
+                config.max_length,
+                truncate_prompt_tokens,
+                config.cache_salt,
+                config.needs_detokenization,
+            )
+
+        if prompt is not None:
+            return await self._create_prompt_from_text(
+                prompt,
+                config.max_length,
+                truncate_prompt_tokens,
+                config.add_special_tokens,
+                config.cache_salt,
+            )
+
+        # TODO: Also handle embeds prompt using this method
+        raise NotImplementedError
+
+    async def _create_prompt_from_text(
         self,
         text: str,
-        max_length: Optional[int],
-        truncate_prompt_tokens: Optional[int],
-        add_special_tokens: Optional[bool],
-        cache_salt: Optional[str],
+        max_length: int | None,
+        truncate_prompt_tokens: int | None,
+        add_special_tokens: bool | None,
+        cache_salt: str | None,
     ) -> EngineTokensPrompt:
         """Tokenize text input asynchronously."""
         async_tokenizer = self._get_async_tokenizer()
 
         # Handle encoder-specific preprocessing
-        if (self.model_config.encoder_config is not None
-                and self.model_config.encoder_config.get(
-                    "do_lower_case", False)):
+        if (
+            self.model_config.encoder_config is not None
+            and self.model_config.encoder_config.get("do_lower_case", False)
+        ):
             text = text.lower()
 
         # Tokenize texts
         if truncate_prompt_tokens is None:
-            encoded = await async_tokenizer(
-                text, add_special_tokens=add_special_tokens)
+            encoded = await async_tokenizer(text, add_special_tokens=add_special_tokens)
         else:
             encoded = await async_tokenizer(
                 text,
                 add_special_tokens=add_special_tokens,
                 truncation=True,
-                max_length=truncate_prompt_tokens)
+                max_length=truncate_prompt_tokens,
+            )
 
-        return self._create_tokens_prompt(encoded.input_ids, max_length,
-                                          cache_salt, text)
+        return self._create_tokens_prompt(
+            encoded.input_ids, max_length, cache_salt, text
+        )
 
-    async def _maybe_detokenize(
+    async def _create_prompt_from_token_ids(
         self,
         token_ids: list[int],
-        max_length: Optional[int],
-        truncate_prompt_tokens: Optional[int],
-        cache_salt: Optional[str],
-        needs_detokenization: Optional[bool] = False,
+        max_length: int | None,
+        truncate_prompt_tokens: int | None,
+        cache_salt: str | None,
+        needs_detokenization: bool | None = False,
     ) -> EngineTokensPrompt:
         """Optionally detokenize token IDs and build a tokens prompt."""
-        token_ids = self._maybe_apply_truncation(token_ids,
-                                                 truncate_prompt_tokens)
+        token_ids = self._maybe_apply_truncation(token_ids, truncate_prompt_tokens)
 
         prompt = None
-        if needs_detokenization is True:
+        if needs_detokenization:
             async_tokenizer = self._get_async_tokenizer()
             prompt = await async_tokenizer.decode(token_ids)
 
-        return self._create_tokens_prompt(token_ids=token_ids,
-                                          max_length=max_length,
-                                          cache_salt=cache_salt,
-                                          prompt=prompt)
+        return self._create_tokens_prompt(
+            token_ids=token_ids,
+            max_length=max_length,
+            cache_salt=cache_salt,
+            prompt=prompt,
+        )
 
     def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
         """Get or create async tokenizer using shared pool."""
@@ -360,8 +374,7 @@ class CompletionRenderer(BaseRenderer):
 
         tokenizer = self.tokenizer
         if self.tokenizer is None:
-            raise ValueError(
-                "No tokenizer available for text input processing")
+            raise ValueError("No tokenizer available for text input processing")
 
         if self.async_tokenizer_pool is None:
             async_tokenizer = AsyncMicrobatchTokenizer(tokenizer)
@@ -376,16 +389,17 @@ class CompletionRenderer(BaseRenderer):
     def _create_tokens_prompt(
         self,
         token_ids: list[int],
-        max_length: Optional[int] = None,
-        cache_salt: Optional[str] = None,
-        prompt: Optional[str] = None,
+        max_length: int | None = None,
+        cache_salt: str | None = None,
+        prompt: str | None = None,
     ) -> EngineTokensPrompt:
         """Create validated EngineTokensPrompt."""
         if max_length is not None and len(token_ids) > max_length:
             raise ValueError(
                 f"This model's maximum context length is {max_length} tokens. "
                 f"However, your request has {len(token_ids)} input tokens. "
-                "Please reduce the length of the input messages.")
+                "Please reduce the length of the input messages."
+            )
 
         tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
         if cache_salt is not None:
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d966f58804b670709f4d9eb8d5a613e4a6da4b99
--- /dev/null
+++ b/vllm/entrypoints/responses_utils.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from openai.types.chat import (
+    ChatCompletionAssistantMessageParam,
+    ChatCompletionMessageToolCallParam,
+    ChatCompletionToolMessageParam,
+)
+from openai.types.chat.chat_completion_message_tool_call_param import (
+    Function as FunctionCallTool,
+)
+from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses.tool import Tool
+
+from vllm import envs
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionMessageParam,
+    ResponseInputOutputItem,
+)
+
+
+def construct_chat_message_with_tool_call(
+    item: ResponseInputOutputItem,
+) -> ChatCompletionMessageParam:
+    if isinstance(item, ResponseFunctionToolCall):
+        # Append the function call as a tool call.
+        return ChatCompletionAssistantMessageParam(
+            role="assistant",
+            tool_calls=[
+                ChatCompletionMessageToolCallParam(
+                    id=item.call_id,
+                    function=FunctionCallTool(
+                        name=item.name,
+                        arguments=item.arguments,
+                    ),
+                    type="function",
+                )
+            ],
+        )
+    elif item.get("type") == "function_call_output":
+        # Append the function call output as a tool message.
+        return ChatCompletionToolMessageParam(
+            role="tool",
+            content=item.get("output"),
+            tool_call_id=item.get("call_id"),
+        )
+    return item  # type: ignore
+
+
+def extract_tool_types(tools: list[Tool]) -> set[str]:
+    """
+    Extracts the tool types from the given tools.
+    """
+    tool_types: set[str] = set()
+    for tool in tools:
+        if tool.type == "mcp":
+            # Allow the MCP Tool type to enable built in tools if the
+            # server_label is allowlisted in
+            # envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS
+            if tool.server_label in envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS:
+                tool_types.add(tool.server_label)
+        else:
+            tool_types.add(tool.type)
+    return tool_types
+
+
+def convert_tool_responses_to_completions_format(tool: dict) -> dict:
+    """
+    Convert a flat tool schema:
+        {"type": "function", "name": "...", "description": "...", "parameters": {...}}
+    into:
+        {"type": "function", "function": {...}}
+    """
+    return {
+        "type": "function",
+        "function": tool,
+    }
diff --git a/vllm/entrypoints/sagemaker/__init__.py b/vllm/entrypoints/sagemaker/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1767137e4ea11bda3eadd12e405c650facffef0
--- /dev/null
+++ b/vllm/entrypoints/sagemaker/__init__.py
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""SageMaker-specific integration for vLLM."""
diff --git a/vllm/entrypoints/sagemaker/routes.py b/vllm/entrypoints/sagemaker/routes.py
new file mode 100644
index 0000000000000000000000000000000000000000..498b7294f0d8c2d859cade9f81dac62908ba606c
--- /dev/null
+++ b/vllm/entrypoints/sagemaker/routes.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from http import HTTPStatus
+
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+import pydantic
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse, Response
+
+from vllm.entrypoints.openai.api_server import (
+    INVOCATION_VALIDATORS,
+    base,
+    health,
+    validate_json_request,
+)
+from vllm.entrypoints.openai.protocol import ErrorResponse
+
+
+def register_sagemaker_routes(router: APIRouter):
+    @router.post("/ping", response_class=Response)
+    @router.get("/ping", response_class=Response)
+    @sagemaker_standards.register_ping_handler
+    async def ping(raw_request: Request) -> Response:
+        """Ping check. Endpoint required for SageMaker"""
+        return await health(raw_request)
+
+    @router.post(
+        "/invocations",
+        dependencies=[Depends(validate_json_request)],
+        responses={
+            HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+            HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: {"model": ErrorResponse},
+            HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        },
+    )
+    @sagemaker_standards.register_invocation_handler
+    @sagemaker_standards.stateful_session_manager()
+    @sagemaker_standards.inject_adapter_id(adapter_path="model")
+    async def invocations(raw_request: Request):
+        """For SageMaker, routes requests based on the request type."""
+        try:
+            body = await raw_request.json()
+        except json.JSONDecodeError as e:
+            raise HTTPException(
+                status_code=HTTPStatus.BAD_REQUEST.value,
+                detail=f"JSON decode error: {e}",
+            ) from e
+
+        valid_endpoints = [
+            (validator, endpoint)
+            for validator, (get_handler, endpoint) in INVOCATION_VALIDATORS
+            if get_handler(raw_request) is not None
+        ]
+
+        for request_validator, endpoint in valid_endpoints:
+            try:
+                request = request_validator.validate_python(body)
+            except pydantic.ValidationError:
+                continue
+
+            return await endpoint(request, raw_request)
+
+        type_names = [
+            t.__name__ if isinstance(t := validator._type, type) else str(t)
+            for validator, _ in valid_endpoints
+        ]
+        msg = f"Cannot find suitable handler for request. Expected one of: {type_names}"
+        res = base(raw_request).create_error_response(message=msg)
+        return JSONResponse(content=res.model_dump(), status_code=res.error.code)
+
+    return router
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 642d6389539bc7e3de630ab2c37736824b8c0a15..309a4c996392dde1d357bc9f1e85154c0c782066 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -1,56 +1,62 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union, cast
+from typing import Any, TypeAlias, cast
 
 from torch.nn import CosineSimilarity
-from typing_extensions import Required, TypeAlias, TypedDict
+from typing_extensions import Required, TypedDict
 
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (
-    BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam,
-    ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam,
-    MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part)
+    BaseMultiModalItemTracker,
+    ChatCompletionContentPartImageEmbedsParam,
+    ChatCompletionContentPartImageParam,
+    ChatCompletionContentPartTextParam,
+    MultiModalItemTracker,
+    _ContentPart,
+    _parse_chat_message_content_part,
+)
 from vllm.inputs import TokensPrompt
 from vllm.model_executor.models.interfaces import supports_score_template
 from vllm.multimodal.inputs import MultiModalDataDict
 from vllm.outputs import PoolingRequestOutput
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
-                                               PreTrainedTokenizer,
-                                               PreTrainedTokenizerFast)
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
 
-ScoreContentPartParam: TypeAlias = Union[
-    ChatCompletionContentPartImageParam,
-    ChatCompletionContentPartImageEmbedsParam]
+ScoreContentPartParam: TypeAlias = (
+    ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
+)
 
 
 class ScoreMultiModalParam(TypedDict, total=False):
     """
     A specialized parameter type for scoring multimodal content
-    
+
     The reasons why don't reuse `CustomChatCompletionMessageParam` directly:
     1. Score tasks don't need the 'role' field (user/assistant/system) that's required in chat completions
     2. Including chat-specific fields would confuse users about their purpose in scoring
     3. This is a more focused interface that only exposes what's needed for scoring
-    """ # noqa: E501
+    """  # noqa: E501
+
     content: Required[list[ScoreContentPartParam]]
     """The multimodal contents"""
 
 
 def _cosine_similarity(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     embed_1: list[PoolingRequestOutput],
     embed_2: list[PoolingRequestOutput],
 ) -> list[PoolingRequestOutput]:
-
     scorer = CosineSimilarity(0)
-    scores: Union[list[PoolingRequestOutput]] = []
+    scores: list[PoolingRequestOutput] = []
 
     for emb_1, emb_2 in zip(embed_1, embed_2):
         pair_score = scorer(emb_1.outputs.data, emb_2.outputs.data)
 
         padding = []
-        if (pad_token_id := getattr(tokenizer, "pad_token_id",
-                                    None)) is not None:
+        if (pad_token_id := getattr(tokenizer, "pad_token_id", None)) is not None:
             padding = [pad_token_id]
 
         tokens = emb_1.prompt_token_ids + padding + emb_2.prompt_token_ids
@@ -60,14 +66,17 @@ def _cosine_similarity(
                 request_id=f"{emb_1.request_id}_{emb_2.request_id}",
                 outputs=pair_score,
                 prompt_token_ids=tokens,
-                finished=True))
+                num_cached_tokens=emb_1.num_cached_tokens + emb_2.num_cached_tokens,
+                finished=True,
+            )
+        )
 
     return scores
 
 
 def _validate_score_input_lens(
-    data_1: Union[list[str], list[ScoreContentPartParam]],
-    data_2: Union[list[str], list[ScoreContentPartParam]],
+    data_1: list[str] | list[ScoreContentPartParam],
+    data_2: list[str] | list[ScoreContentPartParam],
 ):
     len_1 = len(data_1)
     len_2 = len(data_2)
@@ -81,23 +90,22 @@ def _validate_score_input_lens(
 
 
 def parse_score_data(
-    data_1: Union[str, ScoreContentPartParam],
-    data_2: Union[str, ScoreContentPartParam],
+    data_1: str | ScoreContentPartParam,
+    data_2: str | ScoreContentPartParam,
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
-) -> tuple[str, str, Optional[MultiModalDataDict]]:
+) -> tuple[str, str, MultiModalDataDict | None]:
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
 
     content_1 = _parse_score_content(data_1, mm_tracker)
 
     content_2 = _parse_score_content(data_2, mm_tracker)
 
-    def ensure_str(content: Optional[_ContentPart]) -> str:
+    def ensure_str(content: _ContentPart | None) -> str:
         if content is not None and isinstance(content, str):
             return cast(str, content)
         else:
-            raise ValueError(
-                f"Only string content is supported, but got {content}.")
+            raise ValueError(f"Only string content is supported, but got {content}.")
 
     prompt_1 = ensure_str(content_1)
     prompt_2 = ensure_str(content_2)
@@ -106,10 +114,9 @@ def parse_score_data(
 
 
 def _parse_score_content(
-    data: Union[str, ScoreContentPartParam],
+    data: str | ScoreContentPartParam,
     mm_tracker: BaseMultiModalItemTracker,
-) -> Optional[_ContentPart]:
-
+) -> _ContentPart | None:
     if isinstance(data, str):
         data = ChatCompletionContentPartTextParam(type="text", text=data)
 
@@ -127,8 +134,10 @@ def _parse_score_content(
 
     mm_placeholder_storage = mm_parser.mm_placeholder_storage()
 
-    if len(mm_placeholder_storage) != 1 or len(
-            next(iter(mm_placeholder_storage.values()))) != 1:
+    if (
+        len(mm_placeholder_storage) != 1
+        or len(next(iter(mm_placeholder_storage.values()))) != 1
+    ):
         raise ValueError("Only one multi-modal item is supported")
 
     return next(iter(mm_placeholder_storage.values()))[0]
@@ -149,8 +158,7 @@ def apply_score_template(
             raise ValueError("Get empty score template from model")
         return full_prompt
 
-    raise ValueError(
-        f"Unsupported model architecture: {model_config.architecture}")
+    raise ValueError(f"Unsupported model architecture: {model_config.architecture}")
 
 
 def post_process_tokens(
@@ -159,7 +167,7 @@ def post_process_tokens(
 ) -> None:
     """
     Perform architecture-specific manipulations on the input tokens.
-    
+
     Note:
         This is an in-place operation.
     """
@@ -175,8 +183,8 @@ def get_score_prompt(
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
     tokenization_kwargs: dict[str, Any],
-    data_1: Union[str, ScoreContentPartParam],
-    data_2: Union[str, ScoreContentPartParam],
+    data_1: str | ScoreContentPartParam,
+    data_2: str | ScoreContentPartParam,
 ) -> tuple[str, TokensPrompt]:
     prompt_1, prompt_2, mm_data = parse_score_data(
         data_1,
@@ -192,9 +200,9 @@ def get_score_prompt(
         prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
     elif model_config.use_pad_token:
         # cross_encoder models defaults to using pad_token.
-        prompt_inputs = tokenizer(text=prompt_1,
-                                  text_pair=prompt_2,
-                                  **tokenization_kwargs)
+        prompt_inputs = tokenizer(
+            text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
+        )
         full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
     else:
         # `llm as reranker` models defaults to not using pad_token.
@@ -219,8 +227,10 @@ def compress_token_type_ids(token_type_ids: list[int]) -> int:
     if not found.
     """
     first_one = len(token_type_ids)
-    err_msg = "Token type ids are expected to be a sequence"\
-              " of zeros followed by a sequence of ones"
+    err_msg = (
+        "Token type ids are expected to be a sequence"
+        " of zeros followed by a sequence of ones"
+    )
     for i, type_id in enumerate(token_type_ids):
         if type_id == 0 and first_one < i:
             raise ValueError(err_msg)
diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/ssl.py
index e3646a60a7cc1bf5a6cc0c72a48104e8dc4c6d9c..4d947bc620cf18d1261cf12f9a7f16a208855d09 100644
--- a/vllm/entrypoints/ssl.py
+++ b/vllm/entrypoints/ssl.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+from collections.abc import Callable
 from ssl import SSLContext
-from typing import Callable, Optional
 
 from watchfiles import Change, awatch
 
@@ -17,11 +17,13 @@ class SSLCertRefresher:
     reloads them when they change.
     """
 
-    def __init__(self,
-                 ssl_context: SSLContext,
-                 key_path: Optional[str] = None,
-                 cert_path: Optional[str] = None,
-                 ca_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        ssl_context: SSLContext,
+        key_path: str | None = None,
+        cert_path: str | None = None,
+        ca_path: str | None = None,
+    ) -> None:
         self.ssl = ssl_context
         self.key_path = key_path
         self.cert_path = cert_path
@@ -36,8 +38,10 @@ class SSLCertRefresher:
         self.watch_ssl_cert_task = None
         if self.key_path and self.cert_path:
             self.watch_ssl_cert_task = asyncio.create_task(
-                self._watch_files([self.key_path, self.cert_path],
-                                  update_ssl_cert_chain))
+                self._watch_files(
+                    [self.key_path, self.cert_path], update_ssl_cert_chain
+                )
+            )
 
         # Setup CA files watcher
         def update_ssl_ca(change: Change, file_path: str) -> None:
@@ -48,22 +52,21 @@ class SSLCertRefresher:
         self.watch_ssl_ca_task = None
         if self.ca_path:
             self.watch_ssl_ca_task = asyncio.create_task(
-                self._watch_files([self.ca_path], update_ssl_ca))
+                self._watch_files([self.ca_path], update_ssl_ca)
+            )
 
-    async def _watch_files(self, paths, fun: Callable[[Change, str],
-                                                      None]) -> None:
+    async def _watch_files(self, paths, fun: Callable[[Change, str], None]) -> None:
         """Watch multiple file paths asynchronously."""
         logger.info("SSLCertRefresher monitors files: %s", paths)
         async for changes in awatch(*paths):
             try:
                 for change, file_path in changes:
-                    logger.info("File change detected: %s - %s", change.name,
-                                file_path)
+                    logger.info("File change detected: %s - %s", change.name, file_path)
                     fun(change, file_path)
             except Exception as e:
                 logger.error(
-                    "SSLCertRefresher failed taking action on file change. "
-                    "Error: %s", e)
+                    "SSLCertRefresher failed taking action on file change. Error: %s", e
+                )
 
     def stop(self) -> None:
         """Stop watching files."""
diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py
index f5f4d7d3b5565bed913df609fbfb5563dc4a3a7e..c74ce1ee16de1f42a5c5f0ccb415a8e2b9f17950 100644
--- a/vllm/entrypoints/tool.py
+++ b/vllm/entrypoints/tool.py
@@ -14,10 +14,12 @@ if TYPE_CHECKING:
 
 logger = init_logger(__name__)
 
+MIN_GPT_OSS_VERSION = "0.0.7"
+
 
 def validate_gpt_oss_install():
     """
-    Check if the gpt-oss is installed and its version is at least 0.0.3.
+    Check if the gpt-oss is installed and its version is at least 0.0.7.
     If not, raise an ImportError.
     """
     from importlib.metadata import PackageNotFoundError, version
@@ -25,29 +27,27 @@ def validate_gpt_oss_install():
     from packaging.version import InvalidVersion, Version
 
     try:
-        pkg_version_str = version("gpt_oss")  # e.g., "0.0.5"
+        pkg_version_str = version("gpt_oss")
         pkg_version = Version(pkg_version_str)
     except PackageNotFoundError:
         raise ImportError("Package 'gpt_oss' is not installed.") from None
     except InvalidVersion as e:
-        raise ImportError(
-            f"Invalid version string for 'gpt_oss': {e}") from None
+        raise ImportError(f"Invalid version string for 'gpt_oss': {e}") from None
 
-    if pkg_version < Version("0.0.3"):
+    if pkg_version < Version(MIN_GPT_OSS_VERSION):
         raise ImportError(
-            f"gpt_oss >= 0.0.3 is required, but {pkg_version} is installed."
+            f"gpt_oss >= {MIN_GPT_OSS_VERSION} is required, "
+            f"but {pkg_version} is installed."
         ) from None
 
 
 class Tool(ABC):
-
     @abstractmethod
     async def get_result(self, context: "ConversationContext") -> Any:
         pass
 
 
 class HarmonyBrowserTool(Tool):
-
     def __init__(self):
         self.enabled = True
         exa_api_key = os.getenv("EXA_API_KEY")
@@ -63,8 +63,8 @@ class HarmonyBrowserTool(Tool):
         except ImportError as e:
             self.enabled = False
             logger.warning_once(
-                "gpt_oss is not installed properly (%s), browsing is disabled",
-                e)
+                "gpt_oss is not installed properly (%s), browsing is disabled", e
+            )
             return
 
         browser_backend = ExaBackend(source="web", api_key=exa_api_key)
@@ -73,6 +73,7 @@ class HarmonyBrowserTool(Tool):
 
     async def get_result(self, context: "ConversationContext") -> Any:
         from vllm.entrypoints.context import HarmonyContext
+
         assert isinstance(context, HarmonyContext)
         last_msg = context.messages[-1]
         tool_output_msgs = []
@@ -86,7 +87,6 @@ class HarmonyBrowserTool(Tool):
 
 
 class HarmonyPythonTool(Tool):
-
     def __init__(self):
         self.enabled = True
 
@@ -96,8 +96,9 @@ class HarmonyPythonTool(Tool):
         except ImportError as e:
             self.enabled = False
             logger.warning_once(
-                "gpt_oss is not installed properly (%s), code interpreter is "
-                "disabled", e)
+                "gpt_oss is not installed properly (%s), code interpreter is disabled",
+                e,
+            )
             return
 
         self.python_tool = PythonTool()
@@ -121,12 +122,15 @@ class HarmonyPythonTool(Tool):
             self.enabled = False
             logger.warning_once(
                 "Code interpreter tool failed to initialize (%s), code "
-                "interpreter is disabled", e)
+                "interpreter is disabled",
+                e,
+            )
             return
         logger.info_once("Code interpreter tool initialized")
 
     async def get_result(self, context: "ConversationContext") -> Any:
         from vllm.entrypoints.context import HarmonyContext
+
         assert isinstance(context, HarmonyContext)
         last_msg = context.messages[-1]
         tool_output_msgs = []
diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py
index 4c627b865ef92225df431b97b9493b18da4d0615..0d83031ef69fe25f220e086fbe34062e91e4348d 100644
--- a/vllm/entrypoints/tool_server.py
+++ b/vllm/entrypoints/tool_server.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 from openai_harmony import ToolDescription, ToolNamespaceConfig
 
@@ -18,8 +18,11 @@ if TYPE_CHECKING:
 async def list_server_and_tools(server_url: str):
     from mcp import ClientSession
     from mcp.client.sse import sse_client
-    async with sse_client(url=server_url) as streams, ClientSession(
-            *streams) as session:
+
+    async with (
+        sse_client(url=server_url) as streams,
+        ClientSession(*streams) as session,
+    ):
         initialize_response = await session.initialize()
         list_tools_response = await session.list_tools()
         return initialize_response, list_tools_response
@@ -37,21 +40,22 @@ def trim_schema(schema: dict) -> dict:
         # if there's more than 1 types, also remove "null" type as Harmony will
         # just ignore it
         types = [
-            type_dict["type"] for type_dict in schema["anyOf"]
-            if type_dict["type"] != 'null'
+            type_dict["type"]
+            for type_dict in schema["anyOf"]
+            if type_dict["type"] != "null"
         ]
         schema["type"] = types
         del schema["anyOf"]
     if "properties" in schema:
         schema["properties"] = {
-            k: trim_schema(v)
-            for k, v in schema["properties"].items()
+            k: trim_schema(v) for k, v in schema["properties"].items()
         }
     return schema
 
 
 def post_process_tools_description(
-        list_tools_result: "ListToolsResult") -> "ListToolsResult":
+    list_tools_result: "ListToolsResult",
+) -> "ListToolsResult":
     # Adapt the MCP tool result for Harmony
     for tool in list_tools_result.tools:
         tool.inputSchema = trim_schema(tool.inputSchema)
@@ -59,7 +63,8 @@ def post_process_tools_description(
     # Some tools schema don't need to be part of the prompt (e.g. simple text
     # in text out for Python)
     list_tools_result.tools = [
-        tool for tool in list_tools_result.tools
+        tool
+        for tool in list_tools_result.tools
         if getattr(tool.annotations, "include_in_prompt", True)
     ]
 
@@ -67,7 +72,6 @@ def post_process_tools_description(
 
 
 class ToolServer(ABC):
-
     @abstractmethod
     def has_tool(self, tool_name: str) -> bool:
         """
@@ -76,8 +80,7 @@ class ToolServer(ABC):
         pass
 
     @abstractmethod
-    def get_tool_description(self,
-                             tool_name: str) -> Optional[ToolNamespaceConfig]:
+    def get_tool_description(self, tool_name: str) -> ToolNamespaceConfig | None:
         """
         Return the tool description for the given tool name.
         If the tool is not supported, return None.
@@ -86,10 +89,7 @@ class ToolServer(ABC):
 
     @abstractmethod
     def new_session(
-        self,
-        tool_name: str,
-        session_id: str,
-        headers: Optional[dict[str, str]] = None
+        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
     ) -> AbstractAsyncContextManager[Any]:
         """
         Create a session for the tool.
@@ -98,14 +98,14 @@ class ToolServer(ABC):
 
 
 class MCPToolServer(ToolServer):
-
     def __init__(self):
         try:
             import mcp  # noqa: F401
         except ImportError:
             raise ImportError(
                 "mcp is not installed. Please run `pip install mcp` to use "
-                "MCPToolServer.") from None
+                "MCPToolServer."
+            ) from None
         self.harmony_tool_descriptions = {}
 
     async def add_tool_server(self, server_url: str):
@@ -114,19 +114,19 @@ class MCPToolServer(ToolServer):
         self.urls: dict[str, str] = {}
         for url in tool_urls:
             url = f"http://{url}/sse"
-            initialize_response, list_tools_response = (
-                await list_server_and_tools(url))
+            initialize_response, list_tools_response = await list_server_and_tools(url)
 
-            list_tools_response = post_process_tools_description(
-                list_tools_response)
+            list_tools_response = post_process_tools_description(list_tools_response)
 
             tool_from_mcp = ToolNamespaceConfig(
                 name=initialize_response.serverInfo.name,
                 description=initialize_response.instructions,
                 tools=[
-                    ToolDescription.new(name=tool.name,
-                                        description=tool.description,
-                                        parameters=tool.inputSchema)
+                    ToolDescription.new(
+                        name=tool.name,
+                        description=tool.description,
+                        parameters=tool.inputSchema,
+                    )
                     for tool in list_tools_response.tools
                 ],
             )
@@ -136,9 +136,13 @@ class MCPToolServer(ToolServer):
             else:
                 logger.warning(
                     "Tool %s already exists. Ignoring duplicate tool server %s",
-                    tool_from_mcp.name, url)
-        logger.info("MCPToolServer initialized with tools: %s",
-                    list(self.harmony_tool_descriptions.keys()))
+                    tool_from_mcp.name,
+                    url,
+                )
+        logger.info(
+            "MCPToolServer initialized with tools: %s",
+            list(self.harmony_tool_descriptions.keys()),
+        )
 
     def has_tool(self, tool_name: str):
         return tool_name in self.harmony_tool_descriptions
@@ -147,27 +151,27 @@ class MCPToolServer(ToolServer):
         return self.harmony_tool_descriptions.get(tool_name)
 
     @asynccontextmanager
-    async def new_session(self,
-                          tool_name: str,
-                          session_id: str,
-                          headers: Optional[dict[str, str]] = None):
+    async def new_session(
+        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
+    ):
         from mcp import ClientSession
         from mcp.client.sse import sse_client
+
         url = self.urls.get(tool_name)
         request_headers = {"x-session-id": session_id}
         if headers is not None:
             request_headers.update(headers)
         if not url:
             raise KeyError(f"Tool '{tool_name}' is not supported")
-        async with sse_client(
-                url=url, headers=request_headers) as streams, ClientSession(
-                    *streams) as session:
+        async with (
+            sse_client(url=url, headers=request_headers) as streams,
+            ClientSession(*streams) as session,
+        ):
             await session.initialize()
             yield session
 
 
 class DemoToolServer(ToolServer):
-
     def __init__(self):
         self.tools: dict[str, Tool] = {}
 
@@ -179,14 +183,14 @@ class DemoToolServer(ToolServer):
             self.tools["browser"] = browser_tool
         if python_tool.enabled:
             self.tools["python"] = python_tool
-        logger.info("DemoToolServer initialized with tools: %s",
-                    list(self.tools.keys()))
+        logger.info(
+            "DemoToolServer initialized with tools: %s", list(self.tools.keys())
+        )
 
     def has_tool(self, tool_name: str) -> bool:
         return tool_name in self.tools
 
-    def get_tool_description(self,
-                             tool_name: str) -> Optional[ToolNamespaceConfig]:
+    def get_tool_description(self, tool_name: str) -> ToolNamespaceConfig | None:
         if tool_name not in self.tools:
             return None
         if tool_name == "browser":
@@ -197,10 +201,9 @@ class DemoToolServer(ToolServer):
             raise ValueError(f"Unknown tool {tool_name}")
 
     @asynccontextmanager
-    async def new_session(self,
-                          tool_name: str,
-                          session_id: str,
-                          headers: Optional[dict[str, str]] = None):
+    async def new_session(
+        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
+    ):
         if tool_name not in self.tools:
             raise KeyError(f"Tool '{tool_name}' is not supported")
         yield self.tools[tool_name]
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 4a90fe094ae218ab30a48eaa2c2e886336f59daa..088bb679fef401736bad9e594db3c98cc854bf7f 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -6,19 +6,32 @@ import dataclasses
 import functools
 import os
 from argparse import Namespace
-from typing import Any, Optional, Union
+from pathlib import Path
+from typing import Any
 
 from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks
 
+from vllm.config import ModelConfig
 from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import (
+    load_chat_template,
+    resolve_hf_chat_template,
+    resolve_mistral_chat_template,
+)
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              CompletionRequest)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    StreamOptions,
+)
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.transformers_utils.tokenizers import MistralTokenizer
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 logger = init_logger(__name__)
 
@@ -26,7 +39,8 @@ VLLM_SUBCMD_PARSER_EPILOG = (
     "For full list:            vllm {subcmd} --help=all\n"
     "For a section:            vllm {subcmd} --help=ModelConfig    (case-insensitive)\n"  # noqa: E501
     "For a flag:               vllm {subcmd} --help=max-model-len  (_ or - accepted)\n"  # noqa: E501
-    "Documentation:            https://docs.vllm.ai\n")
+    "Documentation:            https://docs.vllm.ai\n"
+)
 
 
 async def listen_for_disconnect(request: Request) -> None:
@@ -37,9 +51,9 @@ async def listen_for_disconnect(request: Request) -> None:
             # If load tracking is enabled *and* the counter exists, decrement
             # it. Combines the previous nested checks into a single condition
             # to satisfy the linter rule.
-            if (getattr(request.app.state, "enable_server_load_tracking",
-                        False)
-                    and hasattr(request.app.state, "server_load_metrics")):
+            if getattr(
+                request.app.state, "enable_server_load_tracking", False
+            ) and hasattr(request.app.state, "server_load_metrics"):
                 request.app.state.server_load_metrics -= 1
             break
 
@@ -70,15 +84,15 @@ def with_cancellation(handler_func):
     # normal route handler, with the correct request type hinting.
     @functools.wraps(handler_func)
     async def wrapper(*args, **kwargs):
-
         # The request is either the second positional arg or `raw_request`
         request = args[1] if len(args) > 1 else kwargs["raw_request"]
 
         handler_task = asyncio.create_task(handler_func(*args, **kwargs))
         cancellation_task = asyncio.create_task(listen_for_disconnect(request))
 
-        done, pending = await asyncio.wait([handler_task, cancellation_task],
-                                           return_when=asyncio.FIRST_COMPLETED)
+        done, pending = await asyncio.wait(
+            [handler_task, cancellation_task], return_when=asyncio.FIRST_COMPLETED
+        )
         for task in pending:
             task.cancel()
 
@@ -94,18 +108,16 @@ def decrement_server_load(request: Request):
 
 
 def load_aware_call(func):
-
     @functools.wraps(func)
     async def wrapper(*args, **kwargs):
-        raw_request = kwargs.get("raw_request",
-                                 args[1] if len(args) > 1 else None)
+        raw_request = kwargs.get("raw_request", args[1] if len(args) > 1 else None)
 
         if raw_request is None:
             raise ValueError(
-                "raw_request required when server load tracking is enabled")
+                "raw_request required when server load tracking is enabled"
+            )
 
-        if not getattr(raw_request.app.state, "enable_server_load_tracking",
-                       False):
+        if not getattr(raw_request.app.state, "enable_server_load_tracking", False):
             return await func(*args, **kwargs)
 
         # ensure the counter exists
@@ -121,18 +133,18 @@ def load_aware_call(func):
 
         if isinstance(response, (JSONResponse, StreamingResponse)):
             if response.background is None:
-                response.background = BackgroundTask(decrement_server_load,
-                                                     raw_request)
+                response.background = BackgroundTask(decrement_server_load, raw_request)
             elif isinstance(response.background, BackgroundTasks):
-                response.background.add_task(decrement_server_load,
-                                             raw_request)
+                response.background.add_task(decrement_server_load, raw_request)
             elif isinstance(response.background, BackgroundTask):
                 # Convert the single BackgroundTask to BackgroundTasks
                 # and chain the decrement_server_load task to it
                 tasks = BackgroundTasks()
-                tasks.add_task(response.background.func,
-                               *response.background.args,
-                               **response.background.kwargs)
+                tasks.add_task(
+                    response.background.func,
+                    *response.background.args,
+                    **response.background.kwargs,
+                )
                 tasks.add_task(decrement_server_load, raw_request)
                 response.background = tasks
         else:
@@ -166,10 +178,9 @@ def cli_env_setup():
 
 def _validate_truncation_size(
     max_model_len: int,
-    truncate_prompt_tokens: Optional[int],
-    tokenization_kwargs: Optional[dict[str, Any]] = None,
-) -> Optional[int]:
-
+    truncate_prompt_tokens: int | None,
+    tokenization_kwargs: dict[str, Any] | None = None,
+) -> int | None:
     if truncate_prompt_tokens is not None:
         if truncate_prompt_tokens <= -1:
             truncate_prompt_tokens = max_model_len
@@ -178,7 +189,8 @@ def _validate_truncation_size(
             raise ValueError(
                 f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
                 f"is greater than max_model_len ({max_model_len})."
-                f" Please, select a smaller truncation size.")
+                f" Please, select a smaller truncation size."
+            )
 
         if tokenization_kwargs is not None:
             tokenization_kwargs["truncation"] = True
@@ -191,22 +203,29 @@ def _validate_truncation_size(
     return truncate_prompt_tokens
 
 
-def get_max_tokens(max_model_len: int, request: Union[ChatCompletionRequest,
-                                                      CompletionRequest],
-                   input_length: int, default_sampling_params: dict) -> int:
-
-    max_tokens = getattr(request, "max_completion_tokens",
-                         None) or request.max_tokens
+def get_max_tokens(
+    max_model_len: int,
+    request: ChatCompletionRequest | CompletionRequest,
+    input_length: int,
+    default_sampling_params: dict,
+) -> int:
+    max_tokens = getattr(request, "max_completion_tokens", None) or request.max_tokens
     default_max_tokens = max_model_len - input_length
     max_output_tokens = current_platform.get_max_output_tokens(input_length)
 
-    return min(val
-               for val in (default_max_tokens, max_tokens, max_output_tokens,
-                           default_sampling_params.get("max_tokens"))
-               if val is not None)
+    return min(
+        val
+        for val in (
+            default_max_tokens,
+            max_tokens,
+            max_output_tokens,
+            default_sampling_params.get("max_tokens"),
+        )
+        if val is not None
+    )
 
 
-def log_non_default_args(args: Union[Namespace, EngineArgs]):
+def log_non_default_args(args: Namespace | EngineArgs):
     non_default_args = {}
 
     # Handle Namespace
@@ -227,7 +246,74 @@ def log_non_default_args(args: Union[Namespace, EngineArgs]):
         if default_args.model != EngineArgs.model:
             non_default_args["model"] = default_args.model
     else:
-        raise TypeError("Unsupported argument type. " \
-        "Must be Namespace or EngineArgs instance.")
+        raise TypeError(
+            "Unsupported argument type. Must be Namespace or EngineArgs instance."
+        )
 
     logger.info("non-default args: %s", non_default_args)
+
+
+def should_include_usage(
+    stream_options: StreamOptions | None, enable_force_include_usage: bool
+) -> tuple[bool, bool]:
+    if stream_options:
+        include_usage = stream_options.include_usage or enable_force_include_usage
+        include_continuous_usage = include_usage and bool(
+            stream_options.continuous_usage_stats
+        )
+    else:
+        include_usage, include_continuous_usage = enable_force_include_usage, False
+    return include_usage, include_continuous_usage
+
+
+def process_lora_modules(
+    args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
+) -> list[LoRAModulePath]:
+    lora_modules = args_lora_modules
+    if default_mm_loras:
+        default_mm_lora_paths = [
+            LoRAModulePath(
+                name=modality,
+                path=lora_path,
+            )
+            for modality, lora_path in default_mm_loras.items()
+        ]
+        if args_lora_modules is None:
+            lora_modules = default_mm_lora_paths
+        else:
+            lora_modules += default_mm_lora_paths
+    return lora_modules
+
+
+async def process_chat_template(
+    args_chat_template: Path | str | None,
+    engine_client: EngineClient,
+    model_config: ModelConfig,
+) -> str | None:
+    resolved_chat_template = load_chat_template(args_chat_template)
+    if resolved_chat_template is not None:
+        # Get the tokenizer to check official template
+        tokenizer = await engine_client.get_tokenizer()
+
+        if isinstance(tokenizer, MistralTokenizer):
+            # The warning is logged in resolve_mistral_chat_template.
+            resolved_chat_template = resolve_mistral_chat_template(
+                chat_template=resolved_chat_template
+            )
+        else:
+            hf_chat_template = resolve_hf_chat_template(
+                tokenizer=tokenizer,
+                chat_template=None,
+                tools=None,
+                model_config=model_config,
+            )
+
+            if hf_chat_template != resolved_chat_template:
+                logger.warning(
+                    "Using supplied chat template: %s\n"
+                    "It is different from official chat template '%s'. "
+                    "This discrepancy may lead to performance degradation.",
+                    resolved_chat_template,
+                    model_config.model,
+                )
+    return resolved_chat_template
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 6661b4b134e18cd79ed15a547cd18532226b700d..c2b732ab74aaafbd777007e85352087013fed846 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -5,6 +5,7 @@ import os
 import torch
 
 from vllm.logger import init_logger
+from vllm.utils.torch_utils import is_torch_equal
 
 logger = init_logger(__name__)
 
@@ -15,9 +16,364 @@ logger = init_logger(__name__)
 
 # see https://github.com/vllm-project/vllm/pull/15951
 # it avoids unintentional cuda initialization from torch.cuda.is_available()
-os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
+os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
 
 # see https://github.com/vllm-project/vllm/issues/10480
-os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 # see https://github.com/vllm-project/vllm/issues/10619
+
 # torch._inductor.config.compile_threads = 1
+
+# ===================================================
+# torch 2.9 Inductor PythonWrapperCodegen monkeypatch
+# ===================================================
+# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around
+# a test failure for test_multi_graph_piecewise_compile_outputs_equal.
+# For more context, see https://github.com/pytorch/pytorch/pull/165514.
+
+
+def memory_plan_reuse_patched(self):
+    import torch._inductor.ir as ir
+    from torch._inductor.codegen.wrapper import (
+        EnterSubgraphLine,
+        ExitSubgraphLine,
+        MemoryPlanningLine,
+        MemoryPlanningState,
+        SubgraphPythonWrapperCodegen,
+    )
+    from torch._inductor.virtualized import V
+
+    def get_output_names(graph_outputs) -> list[str]:
+        import itertools
+
+        names = []
+        shape_counter = itertools.count(0)
+        none_counter = itertools.count(0)
+        for node in graph_outputs:
+            if isinstance(node, ir.NoneAsConstantBuffer):
+                names.append(f"{V.graph.name}_none{next(none_counter)}")
+            elif isinstance(node, ir.ShapeAsConstantBuffer):
+                names.append(f"{V.graph.name}_shape{next(shape_counter)}")
+            else:
+                names.append(node.get_name())
+        return names
+
+    if (
+        isinstance(V.graph.wrapper_code, SubgraphPythonWrapperCodegen)
+        and V.graph.wrapper_code.partition_signatures is not None
+    ):
+        out_names = get_output_names(
+            V.graph.wrapper_code.partition_signatures.output_nodes
+        )
+    else:
+        out_names = V.graph.get_output_names()
+
+    while (
+        self.lines
+        and isinstance(self.lines[-1], MemoryPlanningLine)
+        and self.lines[-1].node.name not in out_names  # type: ignore[attr-defined]
+    ):
+        # these lines will be pointless
+        self.lines.pop()
+
+    # codegen allocations in two passes
+    planning_states = [MemoryPlanningState()]
+    past_planning_states = []
+    for i in range(len(self.lines)):
+        line = self.lines[i]
+        if isinstance(line, MemoryPlanningLine):
+            self.lines[i] = line.plan(planning_states[-1])
+        elif isinstance(line, EnterSubgraphLine):
+            planning_states.append(MemoryPlanningState())
+        elif isinstance(line, ExitSubgraphLine):
+            past_planning_states.append(planning_states.pop())
+    past_planning_states.append(planning_states.pop())
+    assert len(planning_states) == 0
+
+
+# ===================================================
+# torch 2.9 Inductor get_graph_partition_signature monkeypatch
+# ===================================================
+# This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to
+# fix inductor partition + attention-nvfp4 quant fusion, tested in
+# `tests/compile/test_fusions_e2e.py::test_attn_quant`.
+# For more context, see https://github.com/pytorch/pytorch/pull/165815.
+
+
+def get_graph_partition_signature_patched(
+    self, partitions, skip_cudagraphs: list[bool]
+):
+    """
+    Gets signature for each graph partition, including input nodes, output nodes, and
+    whether deallocating an input within graph partition.
+    """
+    from torch._inductor import dependencies
+    from torch._inductor.ir import GraphPartitionSignature, MutationOutput, NoneLayout
+    from torch._inductor.virtualized import V
+    from torch.utils._ordered_set import OrderedSet
+
+    signatures = []
+
+    unmet_output_names = OrderedSet(V.graph.get_output_names())
+    name_to_node = self.get_name_to_nodes()
+
+    def is_none_layout(buf_name: str) -> bool:
+        """
+        Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
+        so graph partition should not take it as inputs or outputs.
+        """
+        buf = self.name_to_buf.get(buf_name, None)
+
+        if buf is None:
+            return False
+
+        if isinstance(buf.node.layout, NoneLayout):
+            if isinstance(buf.node, MutationOutput) and (
+                real_name := self.mutation_real_name.get(buf_name, None)
+            ):
+                return is_none_layout(real_name)
+
+            return True
+
+        return False
+
+    for partition, skip_cudagraph in zip(
+        reversed(partitions), reversed(skip_cudagraphs)
+    ):
+        output_names: OrderedSet[str] = OrderedSet()
+
+        for node in partition:
+            output_names.update(node.outputs_by_name.keys())
+
+        returned_output_names = output_names.intersection(unmet_output_names)
+
+        # all reads/writes are partition inputs except those generated
+        # within the partition and tensor constants
+        read_writes = dependencies.ReadWrites.merge_list(
+            [node.read_writes for node in partition]
+        )
+
+        # WeakDep is fake dependency on unused buffer. It should not appear
+        # in partition_input_names for inputs that are actually read or written.
+        partition_input_names = (
+            OrderedSet(
+                [
+                    x.name
+                    for x in read_writes.reads | read_writes.writes
+                    if not is_none_layout(x.name)
+                ]
+            )
+            - output_names
+        )
+
+        partition_input_names = OrderedSet(
+            self.mutation_real_name.get(name, name) for name in partition_input_names
+        )
+
+        buffer_names_to_free: OrderedSet[str] = OrderedSet()
+        for node in partition:
+            buffer_names_to_free.update(node.last_usage)
+
+        # buffer_names_to_free may contain buffers allocated in previous
+        # graph partitions. These buffers should also be a partition
+        # input.
+        extra_input_names = [
+            name
+            for name in (buffer_names_to_free - output_names)
+            if name in name_to_node
+        ]
+        partition_input_names.update(extra_input_names)
+
+        input_nodes = {
+            name: name_to_node[name]
+            for name in partition_input_names
+            if name in name_to_node
+        }
+        input_deallocation = {
+            name: name in buffer_names_to_free
+            for name in partition_input_names
+            if name in name_to_node
+        }
+
+        # if an input tensor is not freed in the partition function, it should
+        # also be returned as an output. This brings benefits to cudagraph
+        # since the returned output tensor is a cudagraph managed tensor with
+        # a static tensor address.
+        extra_output_names = [
+            name
+            for name in partition_input_names
+            if name in name_to_node and name not in buffer_names_to_free
+        ]
+
+        returned_output_names.update(extra_output_names)
+
+        returned_output_names = OrderedSet(
+            self.mutation_real_name.get(name, name) for name in returned_output_names
+        )
+
+        output_nodes = [
+            name_to_node[name]
+            for name in returned_output_names
+            if not is_none_layout(name)
+        ]
+
+        constant_names = [
+            name for name in partition_input_names if name in V.graph.constants
+        ]
+
+        symbol_inputs = self.get_graph_partition_symbol_inputs(partition, input_nodes)
+
+        partition_signature = GraphPartitionSignature(
+            symbol_inputs,
+            input_nodes,
+            output_nodes,
+            input_deallocation,
+            skip_cudagraph,
+            constant_names,
+        )
+
+        signatures.append(partition_signature)
+
+        unmet_output_names = partition_input_names.union(
+            unmet_output_names - returned_output_names
+        )
+
+    return signatures[::-1]
+
+
+# ========================================
+# torch 2.9 Inductor Scheduler monkeypatch
+# ========================================
+# This change monkeypatches a function in Inductor to work around the following
+# bug: https://github.com/vllm-project/vllm/issues/26678
+#
+# The bug occurs when `use_inductor_graph_partition` is turned on and there
+# exists operators inside of `splitting_ops` that have an in-place mutation. In
+# vllm, this specifically occurs on the operator
+# vllm.unified_attention_with_output. In this case, inductor does not populate
+# the inductor IR's `origin_node` field, causing an assertion error when trying
+# to access the node's `origin_node` field.
+#
+# So, we will monkeypatch torch._inductor.scheduler.Scheduler.should_partition
+# so that it does not access the inductor IR node's `origin_node` field and just
+# returns True if a node is registered as having a custom partition function.
+# This is ok for now since vllm's implementation of the custom partition
+# functions just return True.
+# ========================================
+
+
+def should_partition_patched(self, node, should_log: bool = False) -> bool:
+    # This is a patched version of
+    # torch._inductor.scheduler.Scheduler.should_partition that modifies
+    # the following piece of code so that we always return True:
+    # https://github.com/pytorch/pytorch/blob/ecb53078faf86ca1b33277df33b82985675bb011/torch/_inductor/scheduler.py#L4712-L4724
+    """Return True if we should partition the inductor graph on this node"""
+
+    import torch._inductor.ir as ir
+    from torch._inductor.scheduler import (
+        BaseSchedulerNode,
+        FusedSchedulerNode,
+    )
+    from torch._inductor.utils import (
+        _unstable_customized_partition_wrapper,
+        is_cudagraph_unsafe_op,
+        maybe_log_cudagraph_partition,
+    )
+
+    # Allow users to manually specify if a node should be partitioned
+    # Can only do this for FallbackKernels
+    ir_node = node.node
+    if isinstance(ir_node, torch._inductor.ir.FallbackKernel) and (
+        op := ir_node.op_overload
+    ):
+        op_overload_packet_name = op.name()
+        op_overload_name = (
+            f"{op_overload_packet_name}.{op._overloadname}"
+            if isinstance(op, torch._ops.OpOverload)
+            else op_overload_packet_name
+        )
+        if (
+            op_overload_packet_name
+            in torch._inductor.config.custom_should_partition_ops
+            or op_overload_name in torch._inductor.config.custom_should_partition_ops
+        ):
+            assert isinstance(op, torch._ops.OpOverload)
+            return True
+
+    # When not using cudagraphs, keep all kernels in the `call` function
+    # instead of graph partition functions, since graph partition only brings
+    # benefit to cudagraph
+    if (
+        not torch._inductor.config.triton.cudagraphs
+        and _unstable_customized_partition_wrapper.wrapper is None
+    ):
+        return True
+
+    # avoid duplicating logs when should_partition is called multiple times
+    # on the same node
+    def noop_log(msg: str, node: BaseSchedulerNode | None) -> None:
+        return
+
+    log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log
+
+    if isinstance(node, FusedSchedulerNode):
+        return any(self.should_partition(snode) for snode in node.snodes)
+
+    assert node.node is not None
+
+    if not node.is_gpu():
+        log_partition_reason("non gpu ops", node=node)
+
+        return True
+
+    if isinstance(node.node, ir.DeviceCopy):
+        log_partition_reason("DeviceCopy ops", node=node)
+        return True
+
+    if isinstance(node.node, ir.Conditional):
+        log_partition_reason("Conditional ops", node=node)
+        return True
+
+    if getattr(node.node, "unbacked_bindings", None):
+        log_partition_reason("unbacked binding ops", node=node)
+        return True
+
+    if is_cudagraph_unsafe_op(node.node):
+        log_partition_reason("CUDAGraph-unsafe custom ops", node=node)
+        return True
+
+    return False
+
+
+def _update_scheduler_patched(self) -> None:
+    # Copied from torch._inductor.graph.GrahLowering._update_scheduler. Patches
+    # this method so that we can patch Scheduler.should_partition with the
+    # function above
+    """
+    (Re)initializes the scheduler member.  When initializing the scheduler, no CUBIN
+    files should be generated (to avoid biasing any benchmarks and pessimizing
+    fusion decisions).
+    """
+    import torch._inductor.config as config
+    from torch._inductor.scheduler import Scheduler
+
+    Scheduler.should_partition = should_partition_patched
+    Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched
+
+    with config.patch("triton.store_cubin", False):
+        self.scheduler = Scheduler(self.operations)
+
+
+if is_torch_equal("2.9.0"):
+    from torch._inductor.codegen.wrapper import PythonWrapperCodegen
+    from torch._inductor.graph import GraphLowering
+    from torch.utils._config_module import _Config, _ConfigEntry
+
+    # `custom_should_partition_ops` is a new config after 2.9.0. So this would
+    # not overwrite any user configs.
+    torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry(
+        _Config(default=[])
+    )
+
+    PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
+    GraphLowering._update_scheduler = _update_scheduler_patched
diff --git a/vllm/envs.py b/vllm/envs.py
index a04f14692018f85ffcc45e475d68def634b34955..3790d190f04a7e97d2d83df422600724d8775b48 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1,33 +1,35 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import functools
 import hashlib
 import json
 import os
 import sys
 import tempfile
-from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Literal
 
 if TYPE_CHECKING:
     VLLM_HOST_IP: str = ""
-    VLLM_PORT: Optional[int] = None
+    VLLM_PORT: int | None = None
     VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
     VLLM_USE_MODELSCOPE: bool = False
     VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
-    VLLM_NCCL_SO_PATH: Optional[str] = None
-    LD_LIBRARY_PATH: Optional[str] = None
-    VLLM_USE_TRITON_FLASH_ATTN: bool = True
+    VLLM_NCCL_SO_PATH: str | None = None
+    LD_LIBRARY_PATH: str | None = None
+    VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE: int = 256
     VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
-    VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False
-    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
+    VLLM_FLASH_ATTN_VERSION: int | None = None
     LOCAL_RANK: int = 0
-    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    CUDA_VISIBLE_DEVICES: str | None = None
     VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
-    VLLM_API_KEY: Optional[str] = None
-    S3_ACCESS_KEY_ID: Optional[str] = None
-    S3_SECRET_ACCESS_KEY: Optional[str] = None
-    S3_ENDPOINT_URL: Optional[str] = None
-    VLLM_MODEL_REDIRECT_PATH: Optional[str] = None
+    VLLM_API_KEY: str | None = None
+    VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
+    S3_ACCESS_KEY_ID: str | None = None
+    S3_SECRET_ACCESS_KEY: str | None = None
+    S3_ENDPOINT_URL: str | None = None
+    VLLM_MODEL_REDIRECT_PATH: str | None = None
     VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
     VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
@@ -39,26 +41,22 @@ if TYPE_CHECKING:
     VLLM_LOGGING_LEVEL: str = "INFO"
     VLLM_LOGGING_PREFIX: str = ""
     VLLM_LOGGING_STREAM: str = "ext://sys.stdout"
-    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
-    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
-    VLLM_LOG_STATS_INTERVAL: float = 10.
+    VLLM_LOGGING_CONFIG_PATH: str | None = None
+    VLLM_LOG_STATS_INTERVAL: float = 10.0
     VLLM_TRACE_FUNCTION: int = 0
-    VLLM_ATTENTION_BACKEND: Optional[str] = None
-    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
-    VLLM_PP_LAYER_PARTITION: Optional[str] = None
-    VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0
+    VLLM_ATTENTION_BACKEND: str | None = None
+    VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
+    VLLM_PP_LAYER_PARTITION: str | None = None
+    VLLM_CPU_KVCACHE_SPACE: int | None = 0
     VLLM_CPU_OMP_THREADS_BIND: str = ""
-    VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
+    VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None
     VLLM_CPU_MOE_PREPACK: bool = True
     VLLM_CPU_SGL_KERNEL: bool = False
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_XLA_CHECK_RECOMPILATION: bool = False
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
     VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
-    VLLM_USE_RAY_SPMD_WORKER: bool = False
-    VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl",
-                                                    "shm"] = "auto"
+    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
     VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
     VLLM_XLA_USE_SPMD: bool = False
@@ -72,34 +70,37 @@ if TYPE_CHECKING:
     VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
     VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
+    VLLM_MEDIA_CONNECTOR: str = "http"
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.8"
-    MAX_JOBS: Optional[str] = None
-    NVCC_THREADS: Optional[str] = None
+    MAX_JOBS: str | None = None
+    NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
-    CMAKE_BUILD_TYPE: Optional[Literal["Debug", "Release",
-                                       "RelWithDebInfo"]] = None
+    CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
     VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
-    VLLM_PLUGINS: Optional[list[str]] = None
-    VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
-    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
+    VLLM_PLUGINS: list[str] | None = None
+    VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
+    VLLM_TORCH_CUDA_PROFILE: bool = False
+    VLLM_TORCH_PROFILER_DIR: str | None = None
     VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
     VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
+    VLLM_USE_AOT_COMPILE: bool = False
+    VLLM_USE_BYTECODE_HOOK: bool = False
+    VLLM_FORCE_AOT_LOAD: bool = False
     VLLM_TORCH_PROFILER_WITH_STACK: bool = True
     VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
-    VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False
-    VLLM_USE_V1: bool = True
+    VLLM_DISABLE_PYNCCL: bool = False
     VLLM_ROCM_USE_AITER: bool = False
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
@@ -108,8 +109,11 @@ if TYPE_CHECKING:
     VLLM_ROCM_USE_AITER_MLA: bool = True
     VLLM_ROCM_USE_AITER_MHA: bool = True
     VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False
-    VLLM_ROCM_USE_TRITON_ROPE: bool = False
+    VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = False
     VLLM_ROCM_USE_AITER_FP8BMM: bool = True
+    VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
+    VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: bool = True
+    VLLM_ROCM_USE_AITER_TRITON_GEMM: bool = True
     VLLM_ROCM_USE_SKINNY_GEMM: bool = True
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True
@@ -126,62 +130,71 @@ if TYPE_CHECKING:
     VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
     VLLM_RAY_PER_WORKER_GPUS: float = 1.0
     VLLM_RAY_BUNDLE_INDICES: str = ""
-    VLLM_CUDART_SO_PATH: Optional[str] = None
+    VLLM_CUDART_SO_PATH: str | None = None
     VLLM_DP_RANK: int = 0
     VLLM_DP_RANK_LOCAL: int = -1
     VLLM_DP_SIZE: int = 1
-    VLLM_USE_STANDALONE_COMPILE: bool = False
+    VLLM_USE_STANDALONE_COMPILE: bool = True
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
     VLLM_MOE_DP_CHUNK_SIZE: int = 256
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
+    VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
-    VLLM_MXFP4_USE_MARLIN: Optional[bool] = None
-    VLLM_V0_USE_OUTLINES_CACHE: bool = False
+    VLLM_MXFP4_USE_MARLIN: bool | None = None
     VLLM_V1_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
-    VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
+    VLLM_TPU_MOST_MODEL_LEN: int | None = None
     VLLM_TPU_USING_PATHWAYS: bool = False
     VLLM_USE_DEEP_GEMM: bool = True
+    VLLM_MOE_USE_DEEP_GEMM: bool = True
     VLLM_USE_DEEP_GEMM_E8M0: bool = True
-    VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
-    VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
+    VLLM_DEEP_GEMM_WARMUP: Literal[
+        "skip",
+        "full",
+        "relax",
+    ] = "relax"
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput",
-                                         "latency"] = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency"
+    VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
-    VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
-    VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx",
-                                  "deepep_high_throughput",
-                                  "deepep_low_latency",
-                                  "allgather_reducescatter",
-                                  "flashinfer_all2allv"] = \
-                                  "allgather_reducescatter"
+    VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600
+    VLLM_ALL2ALL_BACKEND: Literal[
+        "naive",
+        "pplx",
+        "deepep_high_throughput",
+        "deepep_low_latency",
+        "allgather_reducescatter",
+        "flashinfer_all2allv",
+    ] = "allgather_reducescatter"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
-    VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None
+    VLLM_KV_CACHE_LAYOUT: Literal["NHD", "HND"] | None = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
     VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
-    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal["FP", "INT8", "INT6", "INT4",
-                                                 "NONE"] = "NONE"
+    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal[
+        "FP", "INT8", "INT6", "INT4", "NONE"
+    ] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
-    VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
-    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
+    VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None
+    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
+    VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
-    VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
+    VLLM_USE_TRTLLM_ATTENTION: str | None = None
+    VLLM_NVFP4_GEMM_BACKEND: str | None = None
     VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
     VLLM_HAS_FLASHINFER_CUBIN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
@@ -189,22 +202,30 @@ if TYPE_CHECKING:
     VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
-    VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
-    VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
+    VLLM_TUNED_CONFIG_FOLDER: str | None = None
+    VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set()
     VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
+    VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False
     VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
     VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False
     VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
     VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER"
     VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024
+    VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False
+    VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False
     VLLM_DBO_COMM_SMS: int = 20
-    GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = []
-    VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None
+    VLLM_PATTERN_MATCH_DEBUG: str | None = None
+    VLLM_DEBUG_DUMP_PATH: str | None = None
     VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True
     VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True
     VLLM_USE_NCCL_SYMM_MEM: bool = False
-    VLLM_NCCL_INCLUDE_PATH: Optional[str] = None
+    VLLM_NCCL_INCLUDE_PATH: str | None = None
     VLLM_USE_FBGEMM: bool = False
+    VLLM_GC_DEBUG: str = ""
+    VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
+    VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
+    VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
+    VLLM_FLAT_LOGPROBS: bool = False
     VLLM_USE_FLASH_MLA: bool = False
 
 
@@ -222,37 +243,60 @@ def get_default_config_root():
     )
 
 
-def maybe_convert_int(value: Optional[str]) -> Optional[int]:
+def maybe_convert_int(value: str | None) -> int | None:
     if value is None:
         return None
     return int(value)
 
 
-def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
+def maybe_convert_bool(value: str | None) -> bool | None:
     if value is None:
         return None
     return bool(int(value))
 
 
+def disable_compile_cache() -> bool:
+    return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0")))
+
+
+def use_aot_compile() -> bool:
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
+    from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+    default_value = (
+        "1"
+        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        else "0"
+    )
+
+    return (
+        not vllm_is_batch_invariant()
+        and os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1"
+    )
+
+
 def env_with_choices(
-        env_name: str,
-        default: Optional[str],
-        choices: Union[list[str], Callable[[], list[str]]],
-        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
+    env_name: str,
+    default: str | None,
+    choices: list[str] | Callable[[], list[str]],
+    case_sensitive: bool = True,
+) -> Callable[[], str | None]:
     """
     Create a lambda that validates environment variable against allowed choices
-    
+
     Args:
         env_name: Name of the environment variable
         default: Default value if not set (can be None)
         choices: List of valid string options or callable that returns list
         case_sensitive: Whether validation should be case sensitive
-        
+
     Returns:
         Lambda function for environment_variables dict
     """
 
-    def _get_validated_env() -> Optional[str]:
+    def _get_validated_env() -> str | None:
         value = os.getenv(env_name)
         if value is None:
             return default
@@ -268,8 +312,10 @@ def env_with_choices(
             check_choices = actual_choices
 
         if check_value not in check_choices:
-            raise ValueError(f"Invalid value '{value}' for {env_name}. "
-                             f"Valid options: {actual_choices}.")
+            raise ValueError(
+                f"Invalid value '{value}' for {env_name}. "
+                f"Valid options: {actual_choices}."
+            )
 
         return value
 
@@ -277,20 +323,21 @@ def env_with_choices(
 
 
 def env_list_with_choices(
-        env_name: str,
-        default: list[str],
-        choices: Union[list[str], Callable[[], list[str]]],
-        case_sensitive: bool = True) -> Callable[[], list[str]]:
+    env_name: str,
+    default: list[str],
+    choices: list[str] | Callable[[], list[str]],
+    case_sensitive: bool = True,
+) -> Callable[[], list[str]]:
     """
-    Create a lambda that validates environment variable 
+    Create a lambda that validates environment variable
     containing comma-separated values against allowed choices
-    
+
     Args:
         env_name: Name of the environment variable
         default: Default list of values if not set
         choices: List of valid string options or callable that returns list
         case_sensitive: Whether validation should be case sensitive
-        
+
     Returns:
         Lambda function for environment_variables
         dict that returns list of strings
@@ -320,15 +367,35 @@ def env_list_with_choices(
                 check_choices = actual_choices
 
             if check_value not in check_choices:
-                raise ValueError(f"Invalid value '{val}' in {env_name}. "
-                                 f"Valid options: {actual_choices}.")
+                raise ValueError(
+                    f"Invalid value '{val}' in {env_name}. "
+                    f"Valid options: {actual_choices}."
+                )
 
         return values
 
     return _get_validated_env_list
 
 
-def get_vllm_port() -> Optional[int]:
+def env_set_with_choices(
+    env_name: str,
+    default: list[str],
+    choices: list[str] | Callable[[], list[str]],
+    case_sensitive: bool = True,
+) -> Callable[[], set[str]]:
+    """
+    Creates a lambda which that validates environment variable
+    containing comma-separated values against allowed choices which
+    returns choices as a set.
+    """
+
+    def _get_validated_env_set() -> set[str]:
+        return set(env_list_with_choices(env_name, default, choices, case_sensitive)())
+
+    return _get_validated_env_set
+
+
+def get_vllm_port() -> int | None:
     """Get the port from VLLM_PORT environment variable.
 
     Returns:
@@ -337,15 +404,16 @@ def get_vllm_port() -> Optional[int]:
     Raises:
         ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
     """
-    if 'VLLM_PORT' not in os.environ:
+    if "VLLM_PORT" not in os.environ:
         return None
 
-    port = os.getenv('VLLM_PORT', '0')
+    port = os.getenv("VLLM_PORT", "0")
 
     try:
         return int(port)
     except ValueError as err:
         from urllib.parse import urlparse
+
         parsed = urlparse(port)
         if parsed.scheme:
             raise ValueError(
@@ -353,252 +421,209 @@ def get_vllm_port() -> Optional[int]:
                 "This may be caused by a Kubernetes service discovery issue,"
                 "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
             ) from None
-        raise ValueError(
-            f"VLLM_PORT '{port}' must be a valid integer") from err
+        raise ValueError(f"VLLM_PORT '{port}' must be a valid integer") from err
 
 
-# The begin-* and end* here are used by the documentation generator
+# The start-* and end* here are used by the documentation generator
 # to extract the used env vars.
 
 # --8<-- [start:env-vars-definition]
 
 environment_variables: dict[str, Callable[[], Any]] = {
-
     # ================== Installation Time Env Vars ==================
-
     # Target device of vLLM, supporting [cuda (by default),
     # rocm, cpu]
-    "VLLM_TARGET_DEVICE":
-    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
-
+    "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
     # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9],
     # 12.8 is the default. This follows PyTorch but can be overridden.
-    "VLLM_MAIN_CUDA_VERSION":
-    lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8",
-
+    "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
+    or "12.8",
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
-    "MAX_JOBS":
-    lambda: os.getenv("MAX_JOBS", None),
-
+    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
     # Number of threads to use for nvcc
     # By default this is 1.
     # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
-    "NVCC_THREADS":
-    lambda: os.getenv("NVCC_THREADS", None),
-
+    "NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None),
     # If set, vllm will use precompiled binaries (*.so)
-    "VLLM_USE_PRECOMPILED":
-    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
-    ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
-
+    "VLLM_USE_PRECOMPILED": lambda: os.environ.get("VLLM_USE_PRECOMPILED", "")
+    .strip()
+    .lower()
+    in ("1", "true")
+    or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
     # Used to mark that setup.py is running in a Docker build context,
     # in order to force the use of precompiled binaries.
-    "VLLM_DOCKER_BUILD_CONTEXT":
-    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
-    ("1", "true"),
-
+    "VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")
+    .strip()
+    .lower()
+    in ("1", "true"),
     # Whether to force using nightly wheel in python build.
     # This is used for testing the nightly wheel in python build.
-    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL":
-    lambda: bool(int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0"))
-                 ),
-
+    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
+        int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0"))
+    ),
     # CMake build type
     # If not set, defaults to "Debug" or "RelWithDebInfo"
     # Available options: "Debug", "Release", "RelWithDebInfo"
-    "CMAKE_BUILD_TYPE":
-    env_with_choices("CMAKE_BUILD_TYPE", None,
-        ["Debug", "Release", "RelWithDebInfo"]),
-
+    "CMAKE_BUILD_TYPE": env_with_choices(
+        "CMAKE_BUILD_TYPE", None, ["Debug", "Release", "RelWithDebInfo"]
+    ),
     # If set, vllm will print verbose logs during installation
-    "VERBOSE":
-    lambda: bool(int(os.getenv('VERBOSE', '0'))),
-
+    "VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))),
     # Root directory for vLLM configuration files
     # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
     # files during **installation**.
-    "VLLM_CONFIG_ROOT":
-    lambda: os.path.expanduser(
+    "VLLM_CONFIG_ROOT": lambda: os.path.expanduser(
         os.getenv(
             "VLLM_CONFIG_ROOT",
             os.path.join(get_default_config_root(), "vllm"),
-        )),
-
+        )
+    ),
     # ================== Runtime Env Vars ==================
-
     # Root directory for vLLM cache files
     # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
-    "VLLM_CACHE_ROOT":
-    lambda: os.path.expanduser(
+    "VLLM_CACHE_ROOT": lambda: os.path.expanduser(
         os.getenv(
             "VLLM_CACHE_ROOT",
             os.path.join(get_default_cache_root(), "vllm"),
-        )),
-
+        )
+    ),
     # used in distributed environment to determine the ip address
     # of the current node, when the node has multiple network interfaces.
     # If you are using multi-node inference, you should set this differently
     # on each node.
-    'VLLM_HOST_IP':
-    lambda: os.getenv('VLLM_HOST_IP', ""),
-
+    "VLLM_HOST_IP": lambda: os.getenv("VLLM_HOST_IP", ""),
     # used in distributed environment to manually set the communication port
     # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
     # VLLM_PORT will be used as the first port, and the rest will be generated
     # by incrementing the VLLM_PORT value.
-    'VLLM_PORT':
-    get_vllm_port,
-
+    "VLLM_PORT": get_vllm_port,
     # path used for ipc when the frontend api server is running in
     # multi-processing mode to communicate with the backend engine process.
-    'VLLM_RPC_BASE_PATH':
-    lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),
-
+    "VLLM_RPC_BASE_PATH": lambda: os.getenv(
+        "VLLM_RPC_BASE_PATH", tempfile.gettempdir()
+    ),
     # If true, will load models from ModelScope instead of Hugging Face Hub.
     # note that the value is true or false, not numbers
-    "VLLM_USE_MODELSCOPE":
-    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
-
+    "VLLM_USE_MODELSCOPE": lambda: os.environ.get(
+        "VLLM_USE_MODELSCOPE", "False"
+    ).lower()
+    == "true",
     # Interval in seconds to log a warning message when the ring buffer is full
-    "VLLM_RINGBUFFER_WARNING_INTERVAL":
-    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),
-
+    "VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
+        os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
+    ),
     # path to cudatoolkit home directory, under which should be bin, include,
     # and lib directories.
-    "CUDA_HOME":
-    lambda: os.environ.get("CUDA_HOME", None),
-
+    "CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None),
     # Path to the NCCL library file. It is needed because nccl>=2.19 brought
     # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
-    "VLLM_NCCL_SO_PATH":
-    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
-
+    "VLLM_NCCL_SO_PATH": lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
     # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
     # library file in the locations specified by `LD_LIBRARY_PATH`
-    "LD_LIBRARY_PATH":
-    lambda: os.environ.get("LD_LIBRARY_PATH", None),
-
-    # flag to control if vllm should use triton flash attention
-    "VLLM_USE_TRITON_FLASH_ATTN":
-    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in
-             ("true", "1")),
-
+    "LD_LIBRARY_PATH": lambda: os.environ.get("LD_LIBRARY_PATH", None),
+    # flag to control the chunk size (in MB) for sleeping memory allocations under ROCm
+    "VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE": lambda: int(
+        os.environ.get("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE", "256")
+    ),
     # Use separate prefill and decode kernels for V1 attention instead of
     # the unified triton kernel.
-    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
-    lambda:
-    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
-     ("true", "1")),
-
-    # Use AITER triton unified attention for V1 attention
-    "VLLM_USE_AITER_UNIFIED_ATTENTION":
-    lambda:
-    (os.getenv("VLLM_USE_AITER_UNIFIED_ATTENTION", "False").lower() in
-     ("true", "1")),
-
+    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: (
+        os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower()
+        in ("true", "1")
+    ),
     # Force vllm to use a specific flash-attention version (2 or 3), only valid
     # when using the flash-attention backend.
-    "VLLM_FLASH_ATTN_VERSION":
-    lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
-
+    "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
+        os.environ.get("VLLM_FLASH_ATTN_VERSION", None)
+    ),
     # Feature flag to enable/disable Inductor standalone compile.
-    # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
-    # disabled by default.
-    "VLLM_USE_STANDALONE_COMPILE":
-    lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "0") == "1",
-
+    # In torch <= 2.7 we ignore this flag; in torch >= 2.9 this is
+    # enabled by default.
+    "VLLM_USE_STANDALONE_COMPILE": lambda: os.environ.get(
+        "VLLM_USE_STANDALONE_COMPILE", "1"
+    )
+    == "1",
     # Debug pattern matching inside custom passes.
     # Should be set to the fx.Node name (e.g. 'getitem_34' or 'scaled_mm_3').
-    "VLLM_PATTERN_MATCH_DEBUG":
-    lambda: os.environ.get("VLLM_PATTERN_MATCH_DEBUG", None),
-
+    "VLLM_PATTERN_MATCH_DEBUG": lambda: os.environ.get(
+        "VLLM_PATTERN_MATCH_DEBUG", None
+    ),
+    # Dump fx graphs to the given directory.
+    # It will override CompilationConfig.debug_dump_path if set.
+    "VLLM_DEBUG_DUMP_PATH": lambda: os.environ.get("VLLM_DEBUG_DUMP_PATH", None),
+    # Feature flag to enable/disable AOT compilation. This will ensure
+    # compilation is done in warmup phase and the compilation will be
+    # reused in subsequent calls.
+    "VLLM_USE_AOT_COMPILE": use_aot_compile,
+    # Feature flag to enable/disable bytecode in
+    # TorchCompileWithNoGuardsWrapper.
+    "VLLM_USE_BYTECODE_HOOK": lambda: bool(
+        int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1"))
+    ),
+    # Force vllm to always load AOT compiled models from disk. Failure
+    # to load will result in a hard error when this is enabled.
+    # Will be ignored when VLLM_USE_AOT_COMPILE is disabled.
+    "VLLM_FORCE_AOT_LOAD": lambda: os.environ.get("VLLM_FORCE_AOT_LOAD", "0") == "1",
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
-    "LOCAL_RANK":
-    lambda: int(os.environ.get("LOCAL_RANK", "0")),
-
+    "LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")),
     # used to control the visible devices in the distributed setting
-    "CUDA_VISIBLE_DEVICES":
-    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
-
+    "CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
     # timeout for each iteration in the engine
-    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
-    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
-
+    "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
+        os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
+    ),
     # API key for vLLM API server
-    "VLLM_API_KEY":
-    lambda: os.environ.get("VLLM_API_KEY", None),
-
+    "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None),
     # Whether to log responses from API Server for debugging
-    "VLLM_DEBUG_LOG_API_SERVER_RESPONSE":
-    lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
-                           ).lower() == "true",
-
+    "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: os.environ.get(
+        "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
+    ).lower()
+    == "true",
     # S3 access information, used for tensorizer to load model from S3
-    "S3_ACCESS_KEY_ID":
-    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
-    "S3_SECRET_ACCESS_KEY":
-    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
-    "S3_ENDPOINT_URL":
-    lambda: os.environ.get("S3_ENDPOINT_URL", None),
-
+    "S3_ACCESS_KEY_ID": lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY": lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL": lambda: os.environ.get("S3_ENDPOINT_URL", None),
     # Usage stats collection
-    "VLLM_USAGE_STATS_SERVER":
-    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
-    "VLLM_NO_USAGE_STATS":
-    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
-    "VLLM_DISABLE_FLASHINFER_PREFILL":
-    lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1",
-    "VLLM_DO_NOT_TRACK":
-    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
-        "DO_NOT_TRACK", None) or "0") == "1",
-    "VLLM_USAGE_SOURCE":
-    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
-
+    "VLLM_USAGE_STATS_SERVER": lambda: os.environ.get(
+        "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
+    ),
+    "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
+    "VLLM_DISABLE_FLASHINFER_PREFILL": lambda: os.environ.get(
+        "VLLM_DISABLE_FLASHINFER_PREFILL", "0"
+    )
+    == "1",
+    "VLLM_DO_NOT_TRACK": lambda: (
+        os.environ.get("VLLM_DO_NOT_TRACK", None)
+        or os.environ.get("DO_NOT_TRACK", None)
+        or "0"
+    )
+    == "1",
+    "VLLM_USAGE_SOURCE": lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
     # Logging configuration
     # If set to 0, vllm will not configure logging
     # If set to 1, vllm will configure logging using the default configuration
     #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
-    "VLLM_CONFIGURE_LOGGING":
-    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
-    "VLLM_LOGGING_CONFIG_PATH":
-    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
-
+    "VLLM_CONFIGURE_LOGGING": lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+    "VLLM_LOGGING_CONFIG_PATH": lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
     # this is used for configuring the default logging level
-    "VLLM_LOGGING_LEVEL":
-    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(),
-
+    "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(),
     # this is used for configuring the default logging stream
-    "VLLM_LOGGING_STREAM":
-    lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"),
-
+    "VLLM_LOGGING_STREAM": lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"),
     # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages
-    "VLLM_LOGGING_PREFIX":
-    lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
-
-    # if set, vllm will call logits processors in a thread pool with this many
-    # threads. This is useful when using custom logits processors that either
-    # (a) launch additional CUDA kernels or (b) do significant CPU-bound work
-    # while not holding the python GIL, or both.
-    "VLLM_LOGITS_PROCESSOR_THREADS":
-    lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
-    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
-
+    "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
     # If set, vllm will log stats at this interval in seconds
     # If not set, vllm will log stats every 10 seconds.
-    "VLLM_LOG_STATS_INTERVAL":
-    lambda: val if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10.")))
-        > 0. else 10.,
-
+    "VLLM_LOG_STATS_INTERVAL": lambda: val
+    if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
+    else 10.0,
     # Trace function calls
     # If set to 1, vllm will trace function calls
     # Useful for debugging
-    "VLLM_TRACE_FUNCTION":
-    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
-
+    "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
     # Backend for attention computation
     # Example options:
     # - "TORCH_SDPA": use torch.nn.MultiheadAttention
@@ -609,64 +634,45 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "FLASH_ATTN_MLA": use FlashAttention for MLA
     # - "FLASHINFER_MLA": use FlashInfer for MLA
     # - "CUTLASS_MLA": use CUTLASS for MLA
-    # All possible options loaded dynamically from _Backend enum
-    "VLLM_ATTENTION_BACKEND":
-    env_with_choices("VLLM_ATTENTION_BACKEND", None,
-                     lambda: list(__import__('vllm.platforms.interface', \
-                        fromlist=['_Backend'])._Backend.__members__.keys())),
-
+    # All possible options loaded dynamically from AttentionBackendEnum
+    "VLLM_ATTENTION_BACKEND": env_with_choices(
+        "VLLM_ATTENTION_BACKEND",
+        None,
+        lambda: list(
+            __import__(
+                "vllm.attention.backends.registry", fromlist=["AttentionBackendEnum"]
+            ).AttentionBackendEnum.__members__.keys()
+        ),
+    ),
     # If set, vllm will use flashinfer sampler
-    "VLLM_USE_FLASHINFER_SAMPLER":
-    lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
-    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None,
-
+    "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
+        int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
+    )
+    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
+    else None,
     # Pipeline stage partition strategy
-    "VLLM_PP_LAYER_PARTITION":
-    lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
-
+    "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
     # (CPU backend only) CPU key-value cache space.
     # default is None and will be set as 4 GB
-    "VLLM_CPU_KVCACHE_SPACE":
-    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
-    if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None,
-
+    "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
+    if "VLLM_CPU_KVCACHE_SPACE" in os.environ
+    else None,
     # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
     # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
-    "VLLM_CPU_OMP_THREADS_BIND":
-    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
-
+    "VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
     # (CPU backend only) CPU cores not used by OMP threads .
     # Those CPU cores will not be used by OMP threads of a rank.
-    "VLLM_CPU_NUM_OF_RESERVED_CPU":
-    lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
-    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None,
-
+    "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
+        os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
+    )
+    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
+    else None,
     # (CPU backend only) whether to use prepack for MoE layer. This will be
     # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
     # need to set this to "0" (False).
-    "VLLM_CPU_MOE_PREPACK":
-    lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
-
+    "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
     # (CPU backend only) whether to use SGL kernels, optimized for small batch.
-    "VLLM_CPU_SGL_KERNEL":
-    lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
-
-    # If the env var is set, then all workers will execute as separate
-    # processes from the engine, and we use the same mechanism to trigger
-    # execution on all workers.
-    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
-    "VLLM_USE_RAY_SPMD_WORKER":
-    lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),
-
-    # If the env var is set, it uses the Ray's Compiled Graph
-    # (previously known as ADAG) API which optimizes the
-    # control plane overhead.
-    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
-    # Note that this variable is set to 1 in V1 by default
-    # when ray distributed executor is used.
-    "VLLM_USE_RAY_COMPILED_DAG":
-    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),
-
+    "VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
     # If the env var is set, Ray Compiled Graph uses the specified
     # channel type to communicate between workers belonging to
     # different pipeline-parallel stages.
@@ -674,76 +680,67 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "auto": use the default channel type
     # - "nccl": use NCCL for communication
     # - "shm": use shared memory and gRPC for communication
-    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
-    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-    env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto",
-        ["auto", "nccl", "shm"]),
-
+    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": env_with_choices(
+        "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto", ["auto", "nccl", "shm"]
+    ),
     # If the env var is set, it enables GPU communication overlap
-    # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
-    # VLLM_USE_RAY_COMPILED_DAG is not set.
-    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
-    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
-                 ),
-
+    # (experimental feature) in Ray's Compiled Graph.
+    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
+        int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
+    ),
     # If the env var is set, it uses a Ray Communicator wrapping
     # vLLM's pipeline parallelism communicator to interact with Ray's
     # Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
-    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
-    "VLLM_USE_RAY_WRAPPED_PP_COMM":
-    lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))),
-
+    "VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
+        int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
+    ),
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
-    "VLLM_WORKER_MULTIPROC_METHOD":
-    env_with_choices("VLLM_WORKER_MULTIPROC_METHOD", "spawn",
-       ["spawn", "fork"]),
-
+    "VLLM_WORKER_MULTIPROC_METHOD": env_with_choices(
+        "VLLM_WORKER_MULTIPROC_METHOD", "fork", ["spawn", "fork"]
+    ),
     # Path to the cache for storing downloaded assets
-    "VLLM_ASSETS_CACHE":
-    lambda: os.path.expanduser(
+    "VLLM_ASSETS_CACHE": lambda: os.path.expanduser(
         os.getenv(
             "VLLM_ASSETS_CACHE",
             os.path.join(get_default_cache_root(), "vllm", "assets"),
-        )),
-
+        )
+    ),
     # If the env var is set, we will clean model file in
     # this path $VLLM_ASSETS_CACHE/model_streamer/$model_name
-    "VLLM_ASSETS_CACHE_MODEL_CLEAN":
-    lambda: bool(int(os.getenv("VLLM_ASSETS_CACHE_MODEL_CLEAN", "0"))),
-
+    "VLLM_ASSETS_CACHE_MODEL_CLEAN": lambda: bool(
+        int(os.getenv("VLLM_ASSETS_CACHE_MODEL_CLEAN", "0"))
+    ),
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
-    "VLLM_IMAGE_FETCH_TIMEOUT":
-    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
-
+    "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
     # Timeout for fetching videos when serving multimodal models
     # Default is 30 seconds
-    "VLLM_VIDEO_FETCH_TIMEOUT":
-    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")),
-
+    "VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
+        os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
+    ),
     # Timeout for fetching audio when serving multimodal models
     # Default is 10 seconds
-    "VLLM_AUDIO_FETCH_TIMEOUT":
-    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
-
+    "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
+        os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
+    ),
     # Whether to allow HTTP redirects when fetching from media URLs.
     # Default to True
-    "VLLM_MEDIA_URL_ALLOW_REDIRECTS":
-    lambda: bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))),
-
+    "VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool(
+        int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))
+    ),
     # Max number of workers for the thread pool handling
     # media bytes loading. Set to 1 to disable parallel processing.
     # Default is 8
-    "VLLM_MEDIA_LOADING_THREAD_COUNT":
-    lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")),
-
+    "VLLM_MEDIA_LOADING_THREAD_COUNT": lambda: int(
+        os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")
+    ),
     # Maximum filesize in MB for a single audio file when processing
     # speech-to-text requests. Files larger than this will be rejected.
     # Default is 25 MB
-    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
-    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
-
+    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
+        os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
+    ),
     # Backend for Video IO
     # - "opencv": Default backend that uses OpenCV stream buffered backend.
     #
@@ -751,284 +748,272 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
     # imported at runtime.
     # If a non-existing backend is used, an AssertionError will be thrown.
-    "VLLM_VIDEO_LOADER_BACKEND":
-    lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"),
-
+    "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv(
+        "VLLM_VIDEO_LOADER_BACKEND", "opencv"
+    ),
+    # Media connector implementation.
+    # - "http": Default connector that supports fetching media via HTTP.
+    #
+    # Custom implementations can be registered
+    # via `@MEDIA_CONNECTOR_REGISTRY.register("my_custom_media_connector")` and
+    # imported at runtime.
+    # If a non-existing backend is used, an AssertionError will be thrown.
+    "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"),
     # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache
     # Default is 4 GiB per API process + 4 GiB per engine core process
-    "VLLM_MM_INPUT_CACHE_GIB":
-    lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),
-
+    "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),
     # Path to the XLA persistent cache directory.
     # Only used for XLA devices such as TPUs.
-    "VLLM_XLA_CACHE_PATH":
-    lambda: os.path.expanduser(
+    "VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser(
         os.getenv(
             "VLLM_XLA_CACHE_PATH",
             os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
-        )),
-
+        )
+    ),
     # If set, assert on XLA recompilation after each execution step.
-    "VLLM_XLA_CHECK_RECOMPILATION":
-    lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),
-
+    "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
+        int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
+    ),
     # Enable SPMD mode for TPU backend.
-    "VLLM_XLA_USE_SPMD":
-    lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
-    "VLLM_FUSED_MOE_CHUNK_SIZE":
-    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
+    "VLLM_XLA_USE_SPMD": lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
+    "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
+        os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
+    ),
     # Control whether to use fused MoE activation chunking. Current chunking
     # logic is incompatible with torch.compile and causes IMA. See issue
     # https://github.com/vllm-project/vllm/issues/19631.
-    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING":
-    lambda: bool(
-        int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))),
-
+    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))
+    ),
     # If set, the OpenAI API server will stay alive even after the underlying
     # AsyncLLMEngine errors and stops serving requests
-    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
-    lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
-
+    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
+        int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))
+    ),
     # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
     # the user to specify a max sequence length greater than
     # the max length derived from the model's config.json.
     # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
-    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
-    lambda:
-    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
-     ("1", "true")),
-
+    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: (
+        os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower()
+        in ("1", "true")
+    ),
     # If set, forces FP8 Marlin to be used for FP8 quantization regardless
     # of the hardware support for FP8 compute.
-    "VLLM_TEST_FORCE_FP8_MARLIN":
-    lambda:
-    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
-     ("1", "true")),
-    "VLLM_TEST_FORCE_LOAD_FORMAT":
-    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),
-
+    "VLLM_TEST_FORCE_FP8_MARLIN": lambda: (
+        os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower()
+        in ("1", "true")
+    ),
+    "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv(
+        "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
+    ),
     # Time in ms for the zmq client to wait for a response from the backend
     # server for simple data operations
-    "VLLM_RPC_TIMEOUT":
-    lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
-
+    "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
     # Timeout in seconds for keeping HTTP connections alive in API server
-    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
-    lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
-
+    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
+        os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
+    ),
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
-    "VLLM_PLUGINS":
-    lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[
-        "VLLM_PLUGINS"].split(","),
-
+    "VLLM_PLUGINS": lambda: None
+    if "VLLM_PLUGINS" not in os.environ
+    else os.environ["VLLM_PLUGINS"].split(","),
     # a local directory to look in for unrecognized LoRA adapters.
     # only works if plugins are enabled and
     # VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
-    "VLLM_LORA_RESOLVER_CACHE_DIR":
-    lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None),
-
+    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
+        "VLLM_LORA_RESOLVER_CACHE_DIR", None
+    ),
+    # Enables torch CUDA profiling if set.
+    # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered.
+    "VLLM_TORCH_CUDA_PROFILE": lambda: bool(
+        os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
+    ),
     # Enables torch profiler if set.
     # Both AsyncLLM's CPU traces as well as workers'
     # traces (CPU & GPU) will be saved under this directory.
     # Note that it must be an absolute path.
-    "VLLM_TORCH_PROFILER_DIR":
-    lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
-             .path.abspath(os.path.expanduser(os.getenv(
-        "VLLM_TORCH_PROFILER_DIR", ".")))),
-
+    "VLLM_TORCH_PROFILER_DIR": lambda: (
+        None
+        if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None
+        else (
+            val
+            if val.startswith("gs://") and val[5:] and val[5] != "/"
+            else os.path.abspath(os.path.expanduser(val))
+        )
+    ),
     # Enable torch profiler to record shapes if set
     # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will
     # not record shapes.
-    "VLLM_TORCH_PROFILER_RECORD_SHAPES":
-    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"),
-
+    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"
+    ),
     # Enable torch profiler to profile memory if set
     # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler
     # will not profile memory.
-    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY":
-    lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"),
-
+    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"
+    ),
     # Enable torch profiler to profile stack if set
     # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL
     # profile stack by default.
-    "VLLM_TORCH_PROFILER_WITH_STACK":
-    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"),
-
+    "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
+    ),
     # Enable torch profiler to profile flops if set
     # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will
     # not profile flops.
-    "VLLM_TORCH_PROFILER_WITH_FLOPS":
-    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"),
-
+    "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
+    ),
     # If set, vLLM will use Triton implementations of AWQ.
-    "VLLM_USE_TRITON_AWQ":
-    lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
-
+    "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
     # If set, allow loading or unloading lora adapters in runtime,
-    "VLLM_ALLOW_RUNTIME_LORA_UPDATING":
-    lambda:
-    (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in
-     ("1", "true")),
-
+    "VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: (
+        os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower()
+        in ("1", "true")
+    ),
     # We assume drivers can report p2p status correctly.
     # If the program hangs when using custom allreduce,
     # potantially caused by a bug in the driver (535 series),
     # if might be helpful to set VLLM_SKIP_P2P_CHECK=0
     # so that vLLM can verify if p2p is actually working.
     # See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
-    "VLLM_SKIP_P2P_CHECK":
-    lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1",
-
+    "VLLM_SKIP_P2P_CHECK": lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1",
     # List of quantization kernels that should be disabled, used for testing
     # and performance comparisons. Currently only affects MPLinearKernel
     # selection
     # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
-    "VLLM_DISABLED_KERNELS":
-    lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
-        "VLLM_DISABLED_KERNELS"].split(","),
-
-    # Swaps the all reduce backend that we use to coordinate the DP padding
-    # information from NCCL to gloo.
-    "VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION":
-    lambda:
-    (os.getenv("VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION", "False").lower() in
-             ("true", "1")),
-
-    # If set, use the V1 code path.
-    "VLLM_USE_V1":
-    lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
-
+    "VLLM_DISABLED_KERNELS": lambda: []
+    if "VLLM_DISABLED_KERNELS" not in os.environ
+    else os.environ["VLLM_DISABLED_KERNELS"].split(","),
+    # Disable pynccl (using torch.distributed instead)
+    "VLLM_DISABLE_PYNCCL": lambda: (
+        os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
+    ),
     # Disable aiter ops unless specifically enabled.
     # Acts as a parent switch to enable the rest of the other operations.
-    "VLLM_ROCM_USE_AITER":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")
+    ),
     # Whether to use aiter paged attention.
     # By default is disabled.
-    "VLLM_ROCM_USE_AITER_PAGED_ATTN":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in ("true", "1")
+    ),
     # use aiter linear op if aiter ops are enabled
     # The following list of related ops
     # - scaled_mm (per-tensor / rowwise)
-    "VLLM_ROCM_USE_AITER_LINEAR":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_LINEAR": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in ("true", "1")
+    ),
     # Whether to use aiter moe ops.
     # By default is enabled.
-    "VLLM_ROCM_USE_AITER_MOE":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_MOE": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in ("true", "1")
+    ),
     # use aiter rms norm op if aiter ops are enabled.
-    "VLLM_ROCM_USE_AITER_RMSNORM":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_RMSNORM": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in ("true", "1")
+    ),
     # Whether to use aiter mla ops.
     # By default is enabled.
-    "VLLM_ROCM_USE_AITER_MLA":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_MLA", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_MLA": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_MLA", "True").lower() in ("true", "1")
+    ),
     # Whether to use aiter mha ops.
     # By default is enabled.
-    "VLLM_ROCM_USE_AITER_MHA":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_MHA": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1")
+    ),
     # Whether to use aiter fp4 gemm asm.
     # By default is disabled.
-    "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", "False").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", "False").lower() in ("true", "1")
+    ),
     # Whether to use aiter rope.
     # By default is disabled.
-    "VLLM_ROCM_USE_TRITON_ROPE":
-    lambda: (os.getenv("VLLM_ROCM_USE_TRITON_ROPE", "False").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_TRITON_ROPE": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "False").lower() in ("true", "1")
+    ),
     # Whether to use aiter triton fp8 bmm kernel
     # By default is enabled.
-    "VLLM_ROCM_USE_AITER_FP8BMM":
-    lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP8BMM", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_AITER_FP8BMM": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_FP8BMM", "True").lower() in ("true", "1")
+    ),
+    # Use AITER triton unified attention for V1 attention
+    "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", "False").lower()
+        in ("true", "1")
+    ),
+    # Whether to use aiter fusion shared experts ops.
+    # By default is enabled.
+    "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS", "True").lower()
+        in ("true", "1")
+    ),
+    # Whether to use aiter triton kernels for gemm ops.
+    # By default is enabled.
+    "VLLM_ROCM_USE_AITER_TRITON_GEMM": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower() in ("true", "1")
+    ),
     # use rocm skinny gemms
-    "VLLM_ROCM_USE_SKINNY_GEMM":
-    lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (
+        os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in ("true", "1")
+    ),
     # Pad the fp8 weights to 256 bytes for ROCm
-    "VLLM_ROCM_FP8_PADDING":
-    lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))),
-
+    "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))),
     # Pad the weights for the moe kernel
-    "VLLM_ROCM_MOE_PADDING":
-    lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))),
-
+    "VLLM_ROCM_MOE_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))),
     # custom paged attention kernel for MI3* cards
-    "VLLM_ROCM_CUSTOM_PAGED_ATTN":
-    lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in
-             ("true", "1")),
-
+    "VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: (
+        os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in ("true", "1")
+    ),
     # Custom quick allreduce kernel for MI3* cards
     # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
     # Recommended for large models to get allreduce
-    "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":
-    env_with_choices("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE",
-                            ["FP", "INT8", "INT6", "INT4", "NONE"]),
-
+    "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": env_with_choices(
+        "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
+        "NONE",
+        ["FP", "INT8", "INT6", "INT4", "NONE"],
+    ),
     # Custom quick allreduce kernel for MI3* cards
     # Due to the lack of the bfloat16 asm instruction, bfloat16
     # kernels are slower than fp16,
     # If environment variable is set to 1, the input is converted to fp16
-    "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16":
-    lambda:
-    (os.getenv("VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "True").lower() in
-     ("true", "1")),
-
+    "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: (
+        os.getenv("VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "True").lower()
+        in ("true", "1")
+    ),
     # Custom quick allreduce kernel for MI3* cards.
     # Controls the maximum allowed number of data bytes(MB) for custom quick
     # allreduce communication.
     # Default: 2048 MB.
     # Data exceeding this size will use either custom allreduce or RCCL
     # communication.
-    "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB":
-    lambda: maybe_convert_int(
-        os.environ.get("VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None)),
-
+    "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int(
+        os.environ.get("VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None)
+    ),
     # Divisor for dynamic query scale factor calculation for FP8 KV Cache
-    "Q_SCALE_CONSTANT":
-    lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),
+    "Q_SCALE_CONSTANT": lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),
     # Divisor for dynamic key scale factor calculation for FP8 KV Cache
-    "K_SCALE_CONSTANT":
-    lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
+    "K_SCALE_CONSTANT": lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
     # Divisor for dynamic value scale factor calculation for FP8 KV Cache
-    "V_SCALE_CONSTANT":
-    lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
-
+    "V_SCALE_CONSTANT": lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
     # If set, enable multiprocessing in LLM for the V1 code path.
-    "VLLM_ENABLE_V1_MULTIPROCESSING":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
-    "VLLM_LOG_BATCHSIZE_INTERVAL":
-    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
-    "VLLM_DISABLE_COMPILE_CACHE":
-    lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))),
-
+    "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
+    ),
+    "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
+        os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
+    ),
+    "VLLM_DISABLE_COMPILE_CACHE": disable_compile_cache,
     # If set, vllm will run in development mode, which will enable
     # some additional endpoints for developing and debugging,
     # e.g. `/reset_prefix_cache`
-    "VLLM_SERVER_DEV_MODE":
-    lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
-
+    "VLLM_SERVER_DEV_MODE": lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
     # Controls the maximum number of requests to handle in a
     # single asyncio task when processing per-token outputs in the
     # V1 AsyncLLM interface. It is applicable when handling a high
@@ -1036,175 +1021,175 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Setting this too high can result in a higher variance of
     # inter-message latencies. Setting it too low can negatively impact
     # TTFT and overall throughput.
-    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
-    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
-
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
+        os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
+    ),
     # If set, vLLM will disable the MLA attention optimizations.
-    "VLLM_MLA_DISABLE":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
-
+    "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
     # If set, vLLM will pick up the provided Flash Attention MLA
     # max number splits for cuda graph decode
-    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
-    lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
-                          "32")),
-
+    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(
+        os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "32")
+    ),
     # Number of GPUs per worker in Ray, if it is set to be a fraction,
     # it allows ray to schedule multiple actors on a single GPU,
     # so that users can colocate other actors on the same GPUs as vLLM.
-    "VLLM_RAY_PER_WORKER_GPUS":
-    lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
-
+    "VLLM_RAY_PER_WORKER_GPUS": lambda: float(
+        os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
+    ),
     # Bundle indices for Ray, if it is set, it can control precisely
     # which indices are used for the Ray bundle, for every worker.
     # Format: comma-separated list of integers, e.g. "0,1,2,3"
-    "VLLM_RAY_BUNDLE_INDICES":
-    lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
-
+    "VLLM_RAY_BUNDLE_INDICES": lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
     # In some system, find_loaded_library() may not work. So we allow users to
     # specify the path through environment variable VLLM_CUDART_SO_PATH.
-    "VLLM_CUDART_SO_PATH":
-    lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
-
+    "VLLM_CUDART_SO_PATH": lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
     # Rank of the process in the data parallel setting
-    "VLLM_DP_RANK":
-    lambda: int(os.getenv("VLLM_DP_RANK", "0")),
-
+    "VLLM_DP_RANK": lambda: int(os.getenv("VLLM_DP_RANK", "0")),
     # Rank of the process in the data parallel setting.
     # Defaults to VLLM_DP_RANK when not set.
-    "VLLM_DP_RANK_LOCAL":
-    lambda: int(
-        os.getenv("VLLM_DP_RANK_LOCAL", sys.modules[__name__].VLLM_DP_RANK)),
-
+    "VLLM_DP_RANK_LOCAL": lambda: int(
+        os.getenv("VLLM_DP_RANK_LOCAL", sys.modules[__name__].VLLM_DP_RANK)
+    ),
     # World size of the data parallel setting
-    "VLLM_DP_SIZE":
-    lambda: int(os.getenv("VLLM_DP_SIZE", "1")),
-
+    "VLLM_DP_SIZE": lambda: int(os.getenv("VLLM_DP_SIZE", "1")),
     # IP address of the master node in the data parallel setting
-    "VLLM_DP_MASTER_IP":
-    lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),
-
+    "VLLM_DP_MASTER_IP": lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),
     # Port of the master node in the data parallel setting
-    "VLLM_DP_MASTER_PORT":
-    lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
-
+    "VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
     # In the context of executing MoE models with Data-Parallel, Expert-Parallel
     # and Batched All-to-All dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE
     # dictates the quantum of tokens that can be dispatched from a DP
     # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE
     # units.
-    "VLLM_MOE_DP_CHUNK_SIZE":
-    lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
-
+    "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
     # Randomize inputs during dummy runs when using Data Parallel
-    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
-    lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
-
+    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get(
+        "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
+    )
+    == "1",
+    # Strategy to pack the data parallel ranks for Ray.
+    # Available options:
+    # - "fill":
+    #   for DP master node, allocate exactly data-parallel-size-local DP ranks,
+    #   for non-master nodes, allocate as many DP ranks as can fit;
+    # - "strict":
+    #   allocate exactly data-parallel-size-local DP ranks to each picked node;
+    # - "span":
+    #   Should be used only when a single DP rank requires multiple nodes.
+    #   allocate one DP rank over as many nodes as required for set world_size;
+    # This environment variable is ignored if data-parallel-backend is not Ray.
+    "VLLM_RAY_DP_PACK_STRATEGY": lambda: os.getenv(
+        "VLLM_RAY_DP_PACK_STRATEGY", "strict"
+    ),
     # Whether to use S3 path for model loading in CI via RunAI Streamer
-    "VLLM_CI_USE_S3":
-    lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
-
+    "VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
     # Use model_redirect to redirect the model name to a local folder.
     # `model_redirect` can be a json file mapping the model between
     # repo_id and local folder:
     # {"meta-llama/Llama-3.2-1B": "/tmp/Llama-3.2-1B"}
     # or a space separated values table file:
     # meta-llama/Llama-3.2-1B   /tmp/Llama-3.2-1B
-    "VLLM_MODEL_REDIRECT_PATH":
-    lambda: os.environ.get("VLLM_MODEL_REDIRECT_PATH", None),
-
+    "VLLM_MODEL_REDIRECT_PATH": lambda: os.environ.get(
+        "VLLM_MODEL_REDIRECT_PATH", None
+    ),
     # Whether to use atomicAdd reduce in gptq/awq marlin kernel.
-    "VLLM_MARLIN_USE_ATOMIC_ADD":
-    lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
-
+    "VLLM_MARLIN_USE_ATOMIC_ADD": lambda: os.environ.get(
+        "VLLM_MARLIN_USE_ATOMIC_ADD", "0"
+    )
+    == "1",
     # Whether to use marlin kernel in mxfp4 quantization method
-    "VLLM_MXFP4_USE_MARLIN":
-    lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)),
-
-    # Whether to turn on the outlines cache for V0
-    # This cache is unbounded and on disk, so it's not safe to use in
-    # an environment with potentially malicious users.
-    "VLLM_V0_USE_OUTLINES_CACHE":
-    lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
-
+    "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
+        os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
+    ),
     # Whether to turn on the outlines cache for V1
     # This cache is unbounded and on disk, so it's not safe to use in
     # an environment with potentially malicious users.
-    "VLLM_V1_USE_OUTLINES_CACHE":
-    lambda: os.environ.get("VLLM_V1_USE_OUTLINES_CACHE", "0") == "1",
-
+    "VLLM_V1_USE_OUTLINES_CACHE": lambda: os.environ.get(
+        "VLLM_V1_USE_OUTLINES_CACHE", "0"
+    )
+    == "1",
     # Gap between padding buckets for the forward pass. So we have
     # 8, we will run forward pass with [16, 24, 32, ...].
-    "VLLM_TPU_BUCKET_PADDING_GAP":
-    lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
-    if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 0,
-    "VLLM_TPU_MOST_MODEL_LEN":
-    lambda: maybe_convert_int(os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)),
-
+    "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
+        os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]
+    )
+    if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
+    else 0,
+    "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
+        os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)
+    ),
     # Whether using Pathways
-    "VLLM_TPU_USING_PATHWAYS":
-    lambda: bool("proxy" in os.getenv("JAX_PLATFORMS", "").lower()),
-
+    "VLLM_TPU_USING_PATHWAYS": lambda: bool(
+        "proxy" in os.getenv("JAX_PLATFORMS", "").lower()
+    ),
     # Allow use of DeepGemm kernels for fused moe ops.
-    "VLLM_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "1"))),
-
+    "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "1"))),
+    # Allow use of DeepGemm specifically for MoE fused ops (overrides only MoE).
+    "VLLM_MOE_USE_DEEP_GEMM": lambda: bool(
+        int(os.getenv("VLLM_MOE_USE_DEEP_GEMM", "1"))
+    ),
     # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
-    "VLLM_USE_DEEP_GEMM_E8M0":
-    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))),
-    # TODO(wentao): unify the two E8M0 flags after verifying the correctness.
-    # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
-    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER":
-    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))),
+    "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
+        int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
+    ),
     # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
     # JIT all the required kernels before model execution so there is no
     # JIT'ing in the hot-path. However, this warmup increases the engine
     # startup time by a couple of minutes.
-    # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup.
-    "VLLM_SKIP_DEEP_GEMM_WARMUP":
-    lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))),
-
+    # Available options:
+    #  - "skip"  : Skip warmup.
+    #  - "full"  : Warmup deepgemm by running all possible gemm shapes the
+    #   engine could encounter.
+    #  - "relax" : Select gemm shapes to run based on some heuristics. The
+    #   heuristic aims to have the same effect as running all possible gemm
+    #   shapes, but provides no guarantees.
+    "VLLM_DEEP_GEMM_WARMUP": env_with_choices(
+        "VLLM_DEEP_GEMM_WARMUP",
+        "relax",
+        [
+            "skip",
+            "full",
+            "relax",
+        ],
+    ),
     # Whether to use fused grouped_topk used for MoE expert selection.
-    "VLLM_USE_FUSED_MOE_GROUPED_TOPK":
-    lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))),
-
+    "VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool(
+        int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))
+    ),
     # Allow use of FlashInfer MoE kernels for fused moe ops.
-    "VLLM_USE_FLASHINFER_MOE_FP16":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))),
-
+    "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
+        int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
+    ),
     # Allow use of FlashInfer MoE kernels for fused moe ops.
-    "VLLM_USE_FLASHINFER_MOE_FP8":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),
-
+    "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
+        int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
+    ),
     # Allow use of FlashInfer CUTLASS kernels for fused moe ops.
-    "VLLM_USE_FLASHINFER_MOE_FP4":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),
-
+    "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
+        int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
+    ),
     # If set to 1, use the FlashInfer
     # MXFP8 (activation) x MXFP4 (weight) MoE backend.
-    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
-
+    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
+        int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))
+    ),
     # If set to 1, use the FlashInfer CUTLASS backend for
     # MXFP8 (activation) x MXFP4 (weight) MoE.
     # This is separate from the TRTLLMGEN path controlled by
     # VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8.
-    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS":
-    lambda: bool(int(
-        os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0")
-        )),
-
+    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
+        int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0"))
+    ),
     # If set to 1, use the FlashInfer
     # BF16 (activation) x MXFP4 (weight) MoE backend.
-    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
-
+    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
+        int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))
+    ),
     # Control the cache sized used by the xgrammar compiler. The default
     # of 512 MB should be enough for roughly 1000 JSON schemas.
     # It can be changed with this variable if needed for some reason.
-    "VLLM_XGRAMMAR_CACHE_MB":
-    lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
-
+    "VLLM_XGRAMMAR_CACHE_MB": lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
     # Control the threshold for msgspec to use 'zero copy' for
     # serialization/deserialization of tensors. Tensors below
     # this limit will be encoded into the msgpack buffer, and
@@ -1212,23 +1197,23 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # While the sending side still actually copies the tensor
     # in all cases, on the receiving side, tensors above this
     # limit will actually be zero-copy decoded.
-    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD":
-    lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")),
-
+    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
+        os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
+    ),
     # If set, allow insecure serialization using pickle.
     # This is useful for environments where it is deemed safe to use the
     # insecure method and it is needed for some reason.
-    "VLLM_ALLOW_INSECURE_SERIALIZATION":
-    lambda: bool(int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0"))),
-
+    "VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
+        int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0"))
+    ),
     # IP address used for NIXL handshake between remote agents.
-    "VLLM_NIXL_SIDE_CHANNEL_HOST":
-    lambda: os.getenv("VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"),
-
+    "VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: os.getenv(
+        "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
+    ),
     # Port used for NIXL handshake between remote agents.
-    "VLLM_NIXL_SIDE_CHANNEL_PORT":
-    lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")),
-
+    "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
+        os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5600")
+    ),
     # all2all backend for vllm's expert parallel communication
     # Available options:
     # - "naive": naive all2all implementation using broadcasts
@@ -1238,14 +1223,18 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "deepep_high_throughput", use deepep high-throughput kernels
     # - "deepep_low_latency", use deepep low-latency kernels
     # - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
-    "VLLM_ALL2ALL_BACKEND":
-    env_with_choices("VLLM_ALL2ALL_BACKEND", "allgather_reducescatter",
-                     ["naive", "pplx",
-                     "deepep_high_throughput",
-                     "deepep_low_latency",
-                     "allgather_reducescatter",
-                     "flashinfer_all2allv"]),
-
+    "VLLM_ALL2ALL_BACKEND": env_with_choices(
+        "VLLM_ALL2ALL_BACKEND",
+        "allgather_reducescatter",
+        [
+            "naive",
+            "pplx",
+            "deepep_high_throughput",
+            "deepep_low_latency",
+            "allgather_reducescatter",
+            "flashinfer_all2allv",
+        ],
+    ),
     # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
     # Both require compute capability 10.0 or above.
     # Available options:
@@ -1253,56 +1242,56 @@ environment_variables: dict[str, Callable[[], Any]] = {
     #     Uses CUTLASS kernels optimized for high-throughput batch inference.
     # - "latency":
     #     Uses TensorRT-LLM kernels optimized for low-latency inference.
-    "VLLM_FLASHINFER_MOE_BACKEND":
-    env_with_choices("VLLM_FLASHINFER_MOE_BACKEND", "throughput",
-    ["throughput", "latency"]),
-
+    "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
+        "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"]
+    ),
+    # Control the workspace buffer size for the FlashInfer backend.
+    "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int(
+        os.getenv("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", str(394 * 1024 * 1024))
+    ),
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
     # the blockscale tensor of activations NVFP4 Quantization.
     # This is used to prevent the kernel from running out of memory.
-    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
-    lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
-
+    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
+        os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")
+    ),
     # Specifies the thresholds of the communicated tensor sizes under which
     # vllm should use flashinfer fused allreduce. The variable should be a
     # JSON with the following format:
     #     { <world size>: <max size in mb> }
     # Unspecified world sizes will fall back to
     #     { 2: 64, 4: 1, <everything else>: 0.5 }
-    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
-    lambda: json.loads(os.getenv(
-        "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")),
-
+    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: json.loads(
+        os.getenv("VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")
+    ),
     # MoE routing strategy selector.
     # See `RoutingSimulator.get_available_strategies()` # for available
     # strategies.
     # Cutstom routing strategies can be registered by
     # RoutingSimulator.register_strategy()
     # Note: custom strategies may not produce correct model outputs
-    "VLLM_MOE_ROUTING_SIMULATION_STRATEGY":
-    lambda: os.environ.get("VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "").lower(),
-
+    "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: os.environ.get(
+        "VLLM_MOE_ROUTING_SIMULATION_STRATEGY", ""
+    ).lower(),
     # Regex timeout for use by the vLLM tool parsing plugins.
-    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
-    lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
-
+    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
+        os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
+    ),
     # Reduce CPU usage when vLLM is idle. Enabling this will incur small
     # latency penalty when a request eventually comes.
-    "VLLM_SLEEP_WHEN_IDLE":
-    lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
-
+    "VLLM_SLEEP_WHEN_IDLE": lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
     # Control the max chunk bytes (in MB) for the rpc message queue.
     # Object larger than this threshold will be broadcast to worker
     # processes via zmq.
-    "VLLM_MQ_MAX_CHUNK_BYTES_MB":
-    lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
-
+    "VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
+        os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
+    ),
     # Timeout in seconds for execute_model RPC calls in multiprocessing
     # executor (only applies when TP > 1).
-    "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS":
-    lambda: int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")),
-
+    "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int(
+        os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")
+    ),
     # KV Cache layout used throughout vllm.
     # Some common values are:
     # - NHD
@@ -1310,76 +1299,75 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Where N=num_blocks, H=num_heads and D=head_size. The default value will
     # leave the layout choice to the backend. Mind that backends may only
     # implement and support a subset of all possible layouts.
-    "VLLM_KV_CACHE_LAYOUT":
-    env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]),
-
+    "VLLM_KV_CACHE_LAYOUT": env_with_choices(
+        "VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]
+    ),
     # Enable checking whether the generated logits contain NaNs,
     # indicating corrupted output. Useful for debugging low level bugs
     # or bad hardware but it may add compute overhead.
-    "VLLM_COMPUTE_NANS_IN_LOGITS":
-    lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
-
+    "VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
+        int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
+    ),
     # Controls whether or not emulations are used for NVFP4
     # generations on machines < 100 for compressed-tensors
     # models
-    "VLLM_USE_NVFP4_CT_EMULATIONS":
-    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
-
+    "VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
+        int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
+    ),
     # Time (in seconds) after which the KV cache on the producer side is
     # automatically cleared if no READ notification is received from the
     # consumer. This is only applicable when using NixlConnector in a
     # disaggregated decode-prefill setup.
-    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
-    lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),
-
+    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
+        os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
+    ),
     # Controls whether or not to use cudnn prefill
-    "VLLM_USE_CUDNN_PREFILL":
-    lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))),
-
+    "VLLM_USE_CUDNN_PREFILL": lambda: bool(
+        int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))
+    ),
+    # Controls whether to use TRT-LLM ragged DeepSeek prefill
+    "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool(
+        int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0"))
+    ),
     # If set to 1/True, use the TRTLLM attention backend in flashinfer.
     # If set to 0/False, use the default attention backend in flashinfer.
     # If not set, auto-detect the attention backend in flashinfer.
-    "VLLM_USE_TRTLLM_ATTENTION":
-    lambda: (None if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ else
-             os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true")),
-
+    "VLLM_USE_TRTLLM_ATTENTION": lambda: (
+        None
+        if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ
+        else os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true")
+    ),
     # If set to 1, when we use fp8 kv, we do not quantize Q to fp8
-    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0"))),
-
+    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
+        int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0"))
+    ),
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
-    "VLLM_HAS_FLASHINFER_CUBIN":
-    lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
-
-    # If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
-    # Otherwise, uses the first available of: flashinfer cutlass GEMM,
-    # vllm cutlass GEMM, marlin GEMM.
-    "VLLM_USE_TRTLLM_FP4_GEMM":
-    lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))),
-
+    "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(
+        int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0"))
+    ),
+    # Supported options:
+    # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
+    # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
+    # - "flashinfer-cutlass": use flashinfer cutlass GEMM backend
+    # - <none>: automatically pick an available backend
+    "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
+        "VLLM_NVFP4_GEMM_BACKEND",
+        None,
+        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass", "cutlass"],
+    ),
     # Controls garbage collection during CUDA graph capture.
     # If set to 0 (default), enables GC freezing to speed up capture time.
     # If set to 1, allows GC to run during capture.
-    "VLLM_ENABLE_CUDAGRAPH_GC":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))),
-
-    # Disable padding to CUDA graph capture batch sizes.
-    # TODO(wentao): https://github.com/vllm-project/vllm/issues/23378
-    # After the issue is fixed, we can remove this flag.
-    "VLLM_DISABLE_PAD_FOR_CUDAGRAPH":
-    lambda: bool(int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))),
-
+    "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
+    ),
     # Used to force set up loopback IP
-    "VLLM_LOOPBACK_IP":
-    lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
-
+    "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
     # Used to set the process name prefix for vLLM processes.
     # This is useful for debugging and monitoring purposes.
     # The default value is "VLLM".
-    "VLLM_PROCESS_NAME_PREFIX":
-    lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
-
+    "VLLM_PROCESS_NAME_PREFIX": lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
     # Allow chunked local attention with hybrid kv cache manager.
     # Currently using the Hybrid KV cache manager with chunked local attention
     # in the Llama4 models (the only models currently using chunked local attn)
@@ -1387,10 +1375,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # This flag is used to allow users to enable it if they want to (to save on
     # kv-cache memory usage and enable longer contexts)
     # TODO(lucas): Remove this flag once latency regression is resolved.
-    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
-    lambda: bool(int(os.getenv(\
-            "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
-
+    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
+        int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))
+    ),
     # Enables support for the "store" option in the OpenAI Responses API.
     # When set to 1, vLLM's OpenAI server will retain the input and output
     # messages for those requests in memory. By default, this is disabled (0),
@@ -1400,99 +1387,163 @@ environment_variables: dict[str, Callable[[], Any]] = {
     #    lost when the vLLM server shuts down.
     # 2. Enabling this option will cause a memory leak, as stored messages are
     #    never removed from memory until the server terminates.
-    "VLLM_ENABLE_RESPONSES_API_STORE":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
-
+    "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))
+    ),
     # If set, use the fp8 mfma in rocm paged attention.
-    "VLLM_ROCM_FP8_MFMA_PAGE_ATTN":
-    lambda: bool(int(os.getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))),
-
+    "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool(
+        int(os.getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))
+    ),
     # Whether to use pytorch symmetric memory for allreduce
-    "VLLM_ALLREDUCE_USE_SYMM_MEM":
-    lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))),
-
+    "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
+        int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
+    ),
     # Allows vllm to find tuned config under customized folder
-    "VLLM_TUNED_CONFIG_FOLDER":
-    lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
-
+    "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
+    # Valid values are container,code_interpreter,web_search_preview
+    # ex VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter
+    # If the server_label of your mcp tool is not in this list it will
+    # be completely ignored.
+    "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_set_with_choices(
+        "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS",
+        default=[],
+        choices=["container", "code_interpreter", "web_search_preview"],
+    ),
     # Allows harmony instructions to be injected on system messages
-    "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS":
-    lambda: bool(
-        int(os.getenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "0"))),
-
+    "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool(
+        int(os.getenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "0"))
+    ),
+    # Enable automatic retry when tool call JSON parsing fails
+    # If enabled, returns an error message to the model to retry
+    # If disabled (default), raises an exception and fails the request
+    "VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY": lambda: bool(
+        int(os.getenv("VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY", "0"))
+    ),
     # Add optional custom scopes for profiling, disable to avoid overheads
-    "VLLM_CUSTOM_SCOPES_FOR_PROFILING":
-    lambda: bool(int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))),
-
+    "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
+        int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
+    ),
     # Add optional nvtx scopes for profiling, disable to avoid overheads
-    "VLLM_NVTX_SCOPES_FOR_PROFILING":
-    lambda: bool(int(os.getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0"))),
-
+    "VLLM_NVTX_SCOPES_FOR_PROFILING": lambda: bool(
+        int(os.getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0"))
+    ),
     # Represent block hashes in KV cache events as 64-bit integers instead of
     # raw bytes. Defaults to True for backward compatibility.
-    "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES":
-    lambda: bool(int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"))),
-
+    "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool(
+        int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"))
+    ),
     # Name of the shared memory buffer used for object storage.
     # Only effective when mm_config.mm_processor_cache_type == "shm".
-    "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME":
-    lambda: os.getenv("VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME",
-                      "VLLM_OBJECT_STORAGE_SHM_BUFFER"),
-
+    "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: os.getenv(
+        "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME", "VLLM_OBJECT_STORAGE_SHM_BUFFER"
+    ),
     # The size in MB of the buffers (NVL and RDMA) used by DeepEP
-    "VLLM_DEEPEP_BUFFER_SIZE_MB":
-    lambda: int(os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")),
-
+    "VLLM_DEEPEP_BUFFER_SIZE_MB": lambda: int(
+        os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
+    ),
+    # Force DeepEP to use intranode kernel for inter-node communication in
+    # high throughput mode. This is useful archive higher prefill throuhgput
+    # on system supports multi-node nvlink (e.g GB200).
+    "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool(
+        int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0"))
+    ),
+    # Allow DeepEP to use MNNVL (multi-node nvlink) for internode_ll kernel,
+    # turn this for better latency on GB200 like system
+    "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
+        int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", "0"))
+    ),
     # The number of SMs to allocate for communication kernels when running DBO
     # the rest of the SMs on the device will be allocated to compute
-    "VLLM_DBO_COMM_SMS":
-    lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")),
-
-    # Valid values are container,code_interpreter,web_search_preview
-    # ex GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter
-    "GPT_OSS_SYSTEM_TOOL_MCP_LABELS":
-    env_list_with_choices("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", [],
-                            ["container",
-                            "code_interpreter",
-                            "web_search_preview"]),
-
+    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)
-    "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1"))),
+    "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1"))
+    ),
     # If set to 1, enable coordinate_descent_tuning;
     # By default, this is enabled (1)
-    "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
-        "1"))),
-
+    "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", "1"))
+    ),
     # Flag to enable NCCL symmetric memory allocation and registration
-    "VLLM_USE_NCCL_SYMM_MEM":
-    lambda: bool(int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))),
-
+    "VLLM_USE_NCCL_SYMM_MEM": lambda: bool(
+        int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))
+    ),
     # NCCL header path
-    "VLLM_NCCL_INCLUDE_PATH":
-    lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None),
+    "VLLM_NCCL_INCLUDE_PATH": lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None),
     # Flag to enable FBGemm kernels on model execution
     "VLLM_USE_FBGEMM": lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))),
-    
+    # GC debug config
+    # - VLLM_GC_DEBUG=0: disable GC debugger
+    # - VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
+    # - VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger with
+    #                                      top 5 collected objects
+    "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""),
+    # Disables parallel execution of shared_experts via separate cuda stream
+    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
+        int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
+    ),
+    # Limits when we run shared_experts in a separate stream.
+    # We found out that for large batch sizes, the separate stream
+    # execution is not beneficial (most likely because of the input clone)
+    # TODO(alexm-redhat): Tune to be more dynamic based on GPU type
+    "VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD": lambda: int(
+        int(os.getenv("VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD", 256))
+    ),
+    # Format for saving torch.compile cache artifacts
+    # - "binary": saves as binary file
+    #     Safe for multiple vllm serve processes accessing the same torch compile cache.
+    # - "unpacked": saves as directory structure (for inspection/debugging)
+    #     NOT multiprocess safe - race conditions may occur with multiple processes.
+    #     Allows viewing and setting breakpoints in Inductor's code output files.
+    "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices(
+        "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"]
+    ),
+    # Flag to enable FlatLogprobs whose GC overhead is significantly smaller than
+    # the original list[dict[int, Logprob]] approach.
+    # After enabled, PromptLogprobs and SampleLogprobs would populated as
+    # FlatLogprobs.
+    "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
     # If set, vLLM will use FLASH MLA attention optimizations.
     "VLLM_USE_FLASH_MLA":
     lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "0"))),
-
 }
 
 # --8<-- [end:env-vars-definition]
 
 
 def __getattr__(name: str):
-    # lazy evaluation of environment variables
+    """
+    Gets environment variables lazily.
+
+    NOTE: After enable_envs_cache() invocation (which triggered after service
+    initialization), all environment variables will be cached.
+    """
     if name in environment_variables:
         return environment_variables[name]()
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
+def enable_envs_cache() -> None:
+    """
+    Enables caching of environment variables. This is useful for performance
+    reasons, as it avoids the need to re-evaluate environment variables on
+    every call.
+
+    NOTE: Currently, it's invoked after service initialization to reduce
+    runtime overhead. This also means that environment variables should NOT
+    be updated after the service is initialized.
+    """
+    # Tag __getattr__ with functools.cache
+    global __getattr__
+    __getattr__ = functools.cache(__getattr__)
+
+    # Cache all environment variables
+    for key in environment_variables:
+        __getattr__(key)
+
+
 def __dir__():
     return list(environment_variables.keys())
 
@@ -1504,15 +1555,6 @@ def is_set(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
-def set_vllm_use_v1(use_v1: bool):
-    if is_set("VLLM_USE_V1"):
-        raise ValueError(
-            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
-            "explicitly by the user. Please raise this as a Github "
-            "Issue and explicitly set VLLM_USE_V1=0 or 1.")
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-
-
 def compute_hash() -> str:
     """
     WARNING: Whenever a new key is added to this environment
@@ -1532,7 +1574,6 @@ def compute_hash() -> str:
         "VLLM_PP_LAYER_PARTITION",
         "VLLM_MLA_DISABLE",
         "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
-        "VLLM_USE_TRITON_FLASH_ATTN",
         "VLLM_USE_TRITON_AWQ",
         "VLLM_DP_RANK",
         "VLLM_DP_SIZE",
@@ -1540,14 +1581,12 @@ def compute_hash() -> str:
         "VLLM_FUSED_MOE_CHUNK_SIZE",
         "VLLM_FLASHINFER_MOE_BACKEND",
         "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
-        "VLLM_USE_AITER_UNIFIED_ATTENTION",
         "VLLM_ATTENTION_BACKEND",
         "VLLM_USE_FLASHINFER_SAMPLER",
         "VLLM_DISABLED_KERNELS",
         "VLLM_USE_DEEP_GEMM",
+        "VLLM_MOE_USE_DEEP_GEMM",
         "VLLM_USE_DEEP_GEMM_E8M0",
-        "VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
-        "VLLM_USE_TRTLLM_FP4_GEMM",
         "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
         "VLLM_USE_FLASHINFER_MOE_FP16",
         "VLLM_USE_FLASHINFER_MOE_FP8",
@@ -1555,7 +1594,9 @@ def compute_hash() -> str:
         "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
         "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
         "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
+        "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE",
         "VLLM_USE_CUDNN_PREFILL",
+        "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
         "VLLM_USE_TRTLLM_ATTENTION",
         "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
         "VLLM_ROCM_USE_AITER",
@@ -1566,8 +1607,10 @@ def compute_hash() -> str:
         "VLLM_ROCM_USE_AITER_MLA",
         "VLLM_ROCM_USE_AITER_MHA",
         "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM",
-        "VLLM_ROCM_USE_TRITON_ROPE",
+        "VLLM_ROCM_USE_AITER_TRITON_ROPE",
         "VLLM_ROCM_USE_AITER_FP8BMM",
+        "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION",
+        "VLLM_ROCM_USE_AITER_TRITON_GEMM",
         "VLLM_ROCM_USE_SKINNY_GEMM",
         "VLLM_ROCM_FP8_PADDING",
         "VLLM_ROCM_MOE_PADDING",
@@ -1578,19 +1621,43 @@ def compute_hash() -> str:
         "VLLM_ROCM_FP8_MFMA_PAGE_ATTN",
         "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE",
         "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
+        "VLLM_NVFP4_GEMM_BACKEND",
         "VLLM_USE_FBGEMM",
+        "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE",
+        "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL",
     ]
     for key in environment_variables_to_hash:
         # if this goes out of sync with environment_variables,
         # it's not a user error, it's a bug
-        assert key in environment_variables, \
+        assert key in environment_variables, (
             "Please update environment_variables_to_hash in envs.py"
-
-    factors = [
-        environment_variables[key]() for key in environment_variables_to_hash
+        )
+
+    factors = [environment_variables[key]() for key in environment_variables_to_hash]
+
+    ray_noset_env_vars = [
+        # Refer to
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/nvidia_gpu.py#L11
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/amd_gpu.py#L11
+        # https://github.com/ray-project/ray/blob/b97d21dab233c2bd8ed7db749a82a1e594222b5c/python/ray/_private/accelerators/amd_gpu.py#L10
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/npu.py#L12
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/hpu.py#L12
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/neuron.py#L14
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/tpu.py#L38
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/intel_gpu.py#L10
+        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/rbln.py#L10
+        "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
+        "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
+        "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES",
+        "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES",
+        "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES",
+        "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES",
+        "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS",
+        "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR",
+        "RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES",
     ]
+    factors.extend([os.getenv(var) for var in ray_noset_env_vars])
 
-    hash_str = hashlib.md5(str(factors).encode(),
-                           usedforsecurity=False).hexdigest()
+    hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
 
     return hash_str
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
deleted file mode 100644
index e3063ec2b8ab33c50bd0a8d97ef467c6b48462e7..0000000000000000000000000000000000000000
--- a/vllm/executor/executor_base.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-from abc import ABC, abstractmethod
-from functools import cached_property
-from typing import Any, Awaitable, Callable, List, Optional, Set, Union
-
-import torch.nn as nn
-from typing_extensions import TypeVar, deprecated
-
-import vllm.platforms
-from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.tasks import SupportedTask
-from vllm.utils import make_async
-from vllm.v1.outputs import PoolerOutput, SamplerOutput
-from vllm.worker.worker_base import WorkerBase
-
-logger = init_logger(__name__)
-
-_R = TypeVar("_R", default=Any)
-
-
-class ExecutorBase(ABC):
-    """Base class for all executors.
-
-    An executor is responsible for executing the model on one device,
-    or it can be a distributed executor 
-    that can execute the model on multiple devices.
-    """
-
-    uses_ray: bool  # whether the executor uses Ray for orchestration.
-    supports_pp: bool = False  # whether the executor supports PP
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ) -> None:
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config
-        self.observability_config = vllm_config.observability_config
-        self._init_executor()
-        self.is_sleeping = False
-        self.sleeping_tags: set[str] = set()
-        self.kv_output_aggregator = None
-
-    @abstractmethod
-    def _init_executor(self) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
-    def collective_rpc(self,
-                       method: Union[str, Callable[[WorkerBase], _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
-        """
-        Execute an RPC call on all workers.
-
-        Args:
-            method: Name of the worker method to execute, or a callable that
-                is serialized and sent to all workers to execute.
-
-                If the method is a callable, it should accept an additional
-                `self` argument, in addition to the arguments passed in `args`
-                and `kwargs`. The `self` argument will be the worker object.
-            timeout: Maximum time in seconds to wait for execution. Raises a
-                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
-            args: Positional arguments to pass to the worker method.
-            kwargs: Keyword arguments to pass to the worker method.
-
-        Returns:
-            A list containing the results from each worker.
-        
-        Note:
-            It is recommended to use this API to only pass control messages,
-            and set up data-plane communication to pass data.
-        """
-        raise NotImplementedError
-
-    def determine_num_available_blocks(self) -> tuple[int, int]:
-        """Determine the number of available blocks for the GPU KV cache and
-        swappable CPU KV cache.
-
-        Normally, this should simply delegate to the underlying Worker. Some
-        ExecutorBase may require modification of the result, e.g. to ensure the
-        selected cache sizes are compatible with all workers.
-
-        Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where
-        `num_gpu_blocks` are blocks that are "active" on the device and can be
-        appended to. 
-        `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be
-        appended to.
-        """
-        results = self.collective_rpc("determine_num_available_blocks")
-        a = min([r[0] for r in results])
-        b = min([r[1] for r in results])
-        return a, b
-
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        # NOTE: This is logged in the executor because there can be >1 workers.
-        logger.info("# %s blocks: %d, # CPU blocks: %d",
-                    vllm.platforms.current_platform.device_name,
-                    num_gpu_blocks, num_cpu_blocks)
-        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
-                           self.model_config.max_model_len)
-        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
-                    self.model_config.max_model_len, max_concurrency)
-
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-        self.collective_rpc("initialize_cache",
-                            args=(num_gpu_blocks, num_cpu_blocks))
-
-    @deprecated("`llm_engine.model_executor.apply_model` will no longer work "
-                "in V1 Engine. Please replace with `llm_engine.apply_model` "
-                "and set `VLLM_ALLOW_INSECURE_SERIALIZATION=1`.")
-    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        """
-        Run a function directly on the model inside each worker,
-        returning the result for each of them.
-        """
-        return self.collective_rpc("apply_model", args=(func, ))
-
-    @cached_property  # Avoid unnecessary RPC calls
-    def supported_tasks(self) -> tuple[SupportedTask, ...]:
-        output = self.collective_rpc("get_supported_tasks")
-        return output[0]
-
-    def execute_model(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
-        output = self.collective_rpc("execute_model",
-                                     args=(execute_model_req, ))
-        return output[0]
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        """Releases parallel workers from model loop."""
-        return
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
-        return all(self.collective_rpc("add_lora", args=(lora_request, )))
-
-    def remove_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return all(self.collective_rpc("remove_lora", args=(lora_id, )))
-
-    def pin_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return all(self.collective_rpc("pin_lora", args=(lora_id, )))
-
-    def list_loras(self) -> Set[int]:
-        sets = self.collective_rpc("list_loras")
-        for s in sets:
-            assert s == sets[0], "All workers should have the same LORAs."
-        return sets[0]
-
-    def start_profile(self) -> None:
-        self.collective_rpc("start_profile")
-
-    def stop_profile(self) -> None:
-        self.collective_rpc("stop_profile")
-
-    def sleep(self, level: int = 1):
-        if self.is_sleeping:
-            logger.warning("Executor is already sleeping.")
-            return
-        time_before_sleep = time.perf_counter()
-        self.collective_rpc("sleep", kwargs=dict(level=level))
-        time_after_sleep = time.perf_counter()
-        self.sleeping_tags = {"weights", "kv_cache"}
-        self.is_sleeping = True
-        logger.info("It took %.6f seconds to fall asleep.",
-                    time_after_sleep - time_before_sleep)
-
-    def wake_up(self, tags: Optional[list[str]] = None):
-        if not self.is_sleeping:
-            logger.warning("Executor is not sleeping.")
-            return
-        if tags:
-            for tag in tags:
-                if tag not in self.sleeping_tags:
-                    logger.warning("Tag %s is not in sleeping tags %s", tag,
-                                   self.sleeping_tags)
-                    return
-        time_before_wakeup = time.perf_counter()
-        self.collective_rpc("wake_up", kwargs=dict(tags=tags))
-        time_after_wakeup = time.perf_counter()
-        logger.info("It took %.6f seconds to wake up tags %s.",
-                    time_after_wakeup - time_before_wakeup,
-                    tags if tags is not None else self.sleeping_tags)
-        if tags:
-            for tag in tags:
-                self.sleeping_tags.remove(tag)
-        else:
-            self.sleeping_tags.clear()
-        if not self.sleeping_tags:
-            self.is_sleeping = False
-
-    def save_sharded_state(
-        self,
-        path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ) -> None:
-        self.collective_rpc("save_sharded_state",
-                            kwargs=dict(path=path,
-                                        pattern=pattern,
-                                        max_size=max_size))
-
-    @abstractmethod
-    def check_health(self) -> None:
-        """Checks if the executor is healthy. If not, it should raise an
-        exception."""
-        raise NotImplementedError
-
-    def shutdown(self) -> None:
-        """Shutdown the executor."""
-        self.collective_rpc("shutdown")
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        """Executes one model step on the given sequences."""
-        output = await make_async(self.execute_model)(execute_model_req)
-        return output
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        """Releases parallel workers from model loop."""
-        return
-
-    async def check_health_async(self) -> None:
-        """Checks if the executor is healthy. If not, it should raise an
-        exception."""
-        self.check_health()
-
-    def init_kv_output_aggregator(self, finished_count: Optional[int]) -> None:
-        """Init KVOutputAggregator"""
-        self.kv_output_aggregator = KVOutputAggregator(
-            finished_count or self.parallel_config.world_size)
-
-
-class DistributedExecutorBase(ExecutorBase):
-    """Abstract superclass of distributed executor implementations."""
-
-    def __init__(self, *args, **kwargs):
-        # This is non-None when the execute model loop is running
-        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
-        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
-
-        super().__init__(*args, **kwargs)
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        # TODO: unify into collective_rpc
-        if self.parallel_worker_tasks is None:
-            self.parallel_worker_tasks = self._run_workers(
-                "start_worker_execution_loop",
-                async_run_tensor_parallel_workers_only=True)
-
-        # Only the driver worker returns the sampling results.
-        driver_outputs = self._driver_execute_model(execute_model_req)
-        assert driver_outputs is not None
-        return driver_outputs
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        self._driver_execute_model(execute_model_req=None)
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        self._wait_for_tasks_completion(parallel_worker_tasks)
-
-    @abstractmethod
-    def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[List[SamplerOutput]]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution loop
-        running in each of the remote workers. In this case, this method
-        returns None. Otherwise, this method returns the model output.
-        """
-        raise NotImplementedError
-
-    def collective_rpc(self,
-                       method: Union[str, Callable],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[Any]:
-        return self._run_workers(method, *args, **(kwargs or {}))
-
-    @abstractmethod
-    def _run_workers(
-        self,
-        method: Union[str, Callable],
-        *args,
-        async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers.
-
-        Args:
-            async_run_tensor_parallel_workers_only: If True the method will be
-                run only in the remote TP workers, not the driver worker.
-                It will also be run asynchronously and return a list of futures
-                rather than blocking on the results.
-        
-        # TODO: simplify and merge with collective_rpc
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        raise NotImplementedError
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            # Start model execution loop running in the parallel workers
-            self.parallel_worker_tasks = asyncio.create_task(
-                self._start_worker_execution_loop())
-
-        # Only the driver worker returns the sampling results.
-        return await self._driver_execute_model_async(execute_model_req)
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        await self._driver_execute_model_async()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        await parallel_worker_tasks
-
-    @abstractmethod
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None,
-    ) -> List[SamplerOutput]:
-        """Execute the model asynchronously in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    async def _start_worker_execution_loop(self):
-        """Run execution loop on all workers. It guarantees all workers run
-        the loop or None of them is running the loop. Loop can be stopped by
-        `stop_remote_worker_execution_loop`.
-        The API is idempotent (guarantee only 1 loop run at any moment)."""
-        raise NotImplementedError
diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py
deleted file mode 100644
index 4ce6d8dfad2cc4d2277202c9f23e48d69a5c9513..0000000000000000000000000000000000000000
--- a/vllm/executor/msgspec_utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from array import array
-from typing import Any, Type
-
-from vllm.multimodal.inputs import MultiModalKwargs
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
-
-
-def encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types and MultiModalKwargs.
-
-    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
-    """
-    if isinstance(obj, array):
-        assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
-            f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
-            f"Given array has a type code of {obj.typecode}.")
-        return obj.tobytes()
-    if isinstance(obj, MultiModalKwargs):
-        return dict(obj)
-
-
-def decode_hook(type: Type, obj: Any) -> Any:
-    """Custom msgspec dec hook that supports array types and MultiModalKwargs.
-
-    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
-    """
-    if type is array:
-        deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
-        deserialized.frombytes(obj)
-        return deserialized
-    if type is MultiModalKwargs:
-        return MultiModalKwargs(obj)
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
deleted file mode 100644
index 84747575b4960b9f7b161a197aa1dbd6b5b121c1..0000000000000000000000000000000000000000
--- a/vllm/executor/ray_distributed_executor.py
+++ /dev/null
@@ -1,699 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import os
-from collections import defaultdict
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
-
-import cloudpickle
-import msgspec
-
-import vllm.envs as envs
-from vllm.executor.executor_base import (
-    DistributedExecutorBase)  # yapf: disable
-from vllm.executor.msgspec_utils import encode_hook
-from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster,
-                                     ray)
-from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.ray.ray_env import get_env_vars_to_copy
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
-                        get_ip, get_open_port, make_async)
-from vllm.v1.outputs import SamplerOutput
-
-if ray is not None:
-    from ray.actor import ActorHandle
-    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-else:
-    ActorHandle = None
-
-if TYPE_CHECKING:
-    from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-@dataclass
-class RayWorkerMetaData:
-    """
-    Metadata for a Ray worker.
-    The order of ray worker creation can be random,
-    and we need to reset the rank after creating all workers.
-    """
-    worker: ActorHandle
-    created_rank: int
-    adjusted_rank: int = -1
-    ip: str = ""
-
-
-class RayDistributedExecutor(DistributedExecutorBase):
-    """Ray-based distributed executor"""
-
-    # These env vars are worker-specific, therefore are NOT copied
-    # from the driver to the workers
-    WORKER_SPECIFIC_ENV_VARS = {
-        "VLLM_HOST_IP", "VLLM_HOST_PORT", "LOCAL_RANK", "CUDA_VISIBLE_DEVICES"
-    }
-
-    # These non-vLLM env vars are copied from the driver to workers
-    ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}
-
-    uses_ray: bool = True
-
-    def _init_executor(self) -> None:
-        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
-        if envs.VLLM_USE_V1:
-            # V1 uses SPMD worker and compiled DAG
-            os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
-            os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"
-
-            # For TPU or XPU, avoid compiling NVIDIA's NCCL
-            if current_platform.is_tpu() or current_platform.is_xpu():
-                os.environ["VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"] = "shm"
-
-        # If the env var is set, it uses the Ray's compiled DAG API
-        # which optimizes the control plane overhead.
-        # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
-        # Currently, this requires USE_RAY_SPMD_WORKER=True.
-        self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
-        # If the env var is set, then we do not distinguish between the
-        # "driver worker" vs other workers. Also, the rank 0 worker will
-        # be executed in a remote Ray worker. Currently this requires
-        # USE_RAY_COMPILED_DAG=True.
-        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
-        if self.use_ray_compiled_dag:
-            assert self.use_ray_spmd_worker, (
-                "VLLM_USE_RAY_COMPILED_DAG=1 requires "
-                "VLLM_USE_RAY_SPMD_WORKER=1")
-        if self.use_ray_spmd_worker:
-            # TODO: Support SPMD worker for non-DAG Ray executor.
-            assert self.use_ray_compiled_dag, (
-                "VLLM_USE_RAY_SPMD_WORKER=1 requires "
-                "VLLM_USE_RAY_COMPILED_DAG=1")
-
-        assert self.uses_ray
-        initialize_ray_cluster(self.parallel_config)
-        placement_group = self.parallel_config.placement_group
-
-        # Disable Ray usage stats collection.
-        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
-        if ray_usage != "1":
-            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
-        # Create the parallel GPU workers.
-        self._init_workers_ray(placement_group)
-
-        self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
-        self.output_decoder = msgspec.msgpack.Decoder(
-            Optional[List[SamplerOutput]])
-        self.use_v1 = envs.VLLM_USE_V1
-
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
-        if not self.use_ray_compiled_dag:
-            self.driver_exec_method = make_async(
-                self.driver_worker.execute_method)
-
-    def shutdown(self) -> None:
-        if logger:
-            # Somehow logger can be None here.
-            logger.info(
-                "Shutting down Ray distributed executor. If you see error log "
-                "from logging.cc regarding SIGTERM received, please ignore "
-                "because this is the expected termination process in Ray.")
-        if hasattr(self, "forward_dag") and self.forward_dag is not None:
-            self.forward_dag.teardown()
-            import ray
-            for worker in self.workers:
-                ray.kill(worker)
-            self.forward_dag = None
-
-    def _configure_ray_workers_use_nsight(self,
-                                          ray_remote_kwargs) -> Dict[str, Any]:
-        # If nsight profiling is enabled, we need to set the profiling
-        # configuration for the ray workers as runtime env.
-        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
-        runtime_env.update({
-            "nsight": {
-                "t": "cuda,cudnn,cublas",
-                "o": "'worker_process_%p'",
-                "cuda-graph-trace": "node",
-            }
-        })
-
-        return ray_remote_kwargs
-
-    # child class could overwrite this to return actual env vars.
-    def _get_env_vars_to_be_updated(self):
-        return self._env_vars_for_all_workers
-
-    def _init_workers_ray(self, placement_group: "PlacementGroup",
-                          **ray_remote_kwargs):
-        num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS
-
-        # The driver dummy worker does not actually use any resources.
-        # It holds the resource for the driver worker.
-        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
-        # The remaining workers are the actual ray actors.
-        self.workers: List[RayWorkerWrapper] = []
-
-        # Used in ray compiled DAG: indexed first by PP rank,
-        # and then TP rank. In other words, the inner list is
-        # the TP group of workers for a PP rank.
-        self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
-
-        if self.parallel_config.ray_workers_use_nsight:
-            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
-                ray_remote_kwargs)
-
-        logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
-
-        # Create the workers.
-        bundle_indices: List[int]
-        if envs.VLLM_RAY_BUNDLE_INDICES:
-            # Use the bundle indices specified by the user.
-            bundle_indices = list(
-                map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(",")))
-            assert len(bundle_indices) == self.parallel_config.world_size, \
-            ("VLLM_RAY_BUNDLE_INDICES must have the same size"
-            f" as the world size, but got {bundle_indices=} "
-            f"and {self.parallel_config.world_size=}")
-            assert len(set(bundle_indices)) == len(bundle_indices), \
-            ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values,"
-            f" but got {bundle_indices=}")
-        else:
-            # use the first N bundles that have GPU resources.
-            bundle_indices = []
-            for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-                if bundle.get(current_platform.ray_device_key, 0):
-                    bundle_indices.append(bundle_id)
-            bundle_indices = bundle_indices[:self.parallel_config.world_size]
-
-        worker_metadata: List[RayWorkerMetaData] = []
-        driver_ip = get_ip()
-        for rank, bundle_id in enumerate(bundle_indices):
-            scheduling_strategy = PlacementGroupSchedulingStrategy(
-                placement_group=placement_group,
-                placement_group_capture_child_tasks=True,
-                placement_group_bundle_index=bundle_id,
-            )
-
-            if current_platform.ray_device_key == "GPU":
-                # NV+AMD GPUs, and Intel XPUs
-                worker = ray.remote(
-                    num_cpus=0,
-                    num_gpus=num_gpus,
-                    scheduling_strategy=scheduling_strategy,
-                    **ray_remote_kwargs,
-                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
-                                           rpc_rank=rank)
-            else:
-                worker = ray.remote(
-                    num_cpus=0,
-                    num_gpus=0,
-                    resources={current_platform.ray_device_key: num_gpus},
-                    scheduling_strategy=scheduling_strategy,
-                    **ray_remote_kwargs,
-                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
-                                           rpc_rank=rank)
-            worker_metadata.append(
-                RayWorkerMetaData(worker=worker, created_rank=rank))
-
-        worker_ips = ray.get([
-            each.worker.get_node_ip.remote()  # type: ignore[attr-defined]
-            for each in worker_metadata
-        ])
-
-        for each, ip in zip(worker_metadata, worker_ips):
-            each.ip = ip
-
-        if not self.use_ray_spmd_worker:
-            for i, each in enumerate(worker_metadata):
-                # find and remove the dummy worker from the list
-                worker = each.worker
-                worker_ip = each.ip
-                if self.driver_dummy_worker is None and worker_ip == driver_ip:
-                    # If the worker is on the same node as the driver, we use it
-                    # as the resource holder for the driver process.
-                    self.driver_dummy_worker = worker
-                    self.driver_worker = RayWorkerWrapper(
-                        vllm_config=self.vllm_config, rpc_rank=0)
-                    worker_metadata.pop(i)
-                    break
-
-        logger.debug("workers: %s", worker_metadata)
-        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
-        if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
-            raise ValueError(
-                "Ray does not allocate any GPUs on the driver node."
-                f"Driver IP: {driver_ip}, worker IPs: {worker_ips}."
-                "Consider adjusting the Ray placement group or running "
-                "the driver on a GPU node.")
-
-        ip_counts: Dict[str, int] = {}
-        for ip in worker_ips:
-            ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
-        def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
-            """
-            Sort the workers based on 3 properties:
-            1. If the worker is on the same node as the driver (vllm engine),
-                it should be placed first.
-            2. Then, if the worker is on a node with fewer workers, it should
-                be placed first.
-            3. Finally, if the work is on a node with smaller IP address, it
-                should be placed first.
-            """
-            ip = item.ip
-            return (0 if ip == driver_ip else 1, ip_counts[ip], ip)
-
-        # After sorting, the workers on the same node will be
-        # close to each other, and the workers on the driver
-        # node will be placed first.
-        sorted_worker_metadata = sorted(worker_metadata,
-                                        key=sort_by_driver_then_worker_ip)
-        start_rank = 0 if self.use_ray_spmd_worker else 1
-        for i, item in enumerate(sorted_worker_metadata):
-            item.adjusted_rank = i + start_rank
-        self.workers = [item.worker for item in sorted_worker_metadata]
-        rerank_mapping = {
-            item.created_rank: item.adjusted_rank
-            for item in sorted_worker_metadata
-        }
-        self._run_workers("adjust_rank", rerank_mapping)
-
-        # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = []
-        for worker in [self.driver_dummy_worker] + self.workers:
-            if worker is None:
-                # driver_dummy_worker can be None when using ray spmd worker.
-                continue
-            worker_node_and_gpu_ids.append(
-                ray.get(worker.get_node_and_gpu_ids.remote()) \
-            ) # type: ignore
-
-        node_workers = defaultdict(list)  # node id -> list of worker ranks
-        node_gpus = defaultdict(list)  # node id -> list of gpu ids
-
-        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
-            node_workers[node_id].append(i)
-            # `gpu_ids` can be a list of strings or integers.
-            # convert them to integers for consistency.
-            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
-            # string sorting is not sufficient.
-            # see https://github.com/vllm-project/vllm/issues/5590
-            gpu_ids = [int(x) for x in gpu_ids]
-            node_gpus[node_id].extend(gpu_ids)
-        for node_id, gpu_ids in node_gpus.items():
-            node_gpus[node_id] = sorted(gpu_ids)
-
-        all_ips = set(worker_ips + [driver_ip])
-        n_ips = len(all_ips)
-        n_nodes = len(node_workers)
-
-        if n_nodes != n_ips:
-            raise RuntimeError(
-                f"Every node should have a unique IP address. Got {n_nodes}"
-                f" nodes with node ids {list(node_workers.keys())} and "
-                f"{n_ips} unique IP addresses {all_ips}. Please check your"
-                " network configuration. If you set `VLLM_HOST_IP`"
-                " environment variable, make sure it is unique for"
-                " each node.")
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [{
-            current_platform.device_control_env_var:
-            ",".join(map(str, node_gpus[node_id])),
-        } for (node_id, _) in worker_node_and_gpu_ids]
-
-        # Environment variables to copy from driver to workers
-        env_vars_to_copy = get_env_vars_to_copy(
-            exclude_vars=self.WORKER_SPECIFIC_ENV_VARS,
-            additional_vars=set(current_platform.additional_env_vars).union(
-                self.ADDITIONAL_ENV_VARS),
-            destination="workers")
-
-        # Copy existing env vars to each worker's args
-        for args in all_args_to_update_environment_variables:
-            # TODO: refactor platform-specific env vars
-            for name in env_vars_to_copy:
-                if name in os.environ:
-                    args[name] = os.environ[name]
-
-        self._env_vars_for_all_workers = (
-            all_args_to_update_environment_variables)
-
-        self._run_workers("update_environment_variables",
-                          self._get_env_vars_to_be_updated())
-
-        if len(node_gpus) == 1:
-            # in single node case, we don't need to get the IP address.
-            # the loopback address is sufficient
-            # NOTE: a node may have several IP addresses, one for each
-            # network interface. `get_ip()` might return any of them,
-            # while they might not work for communication inside the node
-            # if the network setup is complicated. Using the loopback address
-            # solves this issue, as it always works for communication inside
-            # the node.
-            driver_ip = "127.0.0.1"
-        distributed_init_method = get_distributed_init_method(
-            driver_ip, get_open_port())
-
-        # Initialize the actual workers inside worker wrapper.
-        all_kwargs = []
-        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
-            local_rank = node_workers[node_id].index(rank)
-            kwargs = dict(
-                vllm_config=self.vllm_config,
-                local_rank=local_rank,
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-                is_driver_worker=(not self.parallel_config)
-                or (rank % self.parallel_config.tensor_parallel_size == 0),
-            )
-            all_kwargs.append(kwargs)
-        self._run_workers("init_worker", all_kwargs)
-
-        self._run_workers("init_device")
-        self._run_workers("load_model",
-                          max_concurrent_workers=self.parallel_config.
-                          max_parallel_loading_workers)
-
-        if self.use_ray_spmd_worker:
-            for pp_rank in range(self.parallel_config.pipeline_parallel_size):
-                self.pp_tp_workers.append([])
-                for tp_rank in range(
-                        self.parallel_config.tensor_parallel_size):
-                    # PP=2, TP=4
-                    # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
-                    rank = (pp_rank * self.parallel_config.tensor_parallel_size
-                            ) + tp_rank
-                    assert len(self.pp_tp_workers[pp_rank]) == tp_rank
-                    assert pp_rank < len(self.pp_tp_workers)
-                    self.pp_tp_workers[pp_rank].append(self.workers[rank])
-
-        # This is the list of workers that are rank 0 of each TP group EXCEPT
-        # global rank 0. These are the workers that will broadcast to the
-        # rest of the workers.
-        self.tp_driver_workers: List[RayWorkerWrapper] = []
-        # This is the list of workers that are not drivers and not the first
-        # worker in a TP group. These are the workers that will be
-        # broadcasted to.
-        self.non_driver_workers: List[RayWorkerWrapper] = []
-
-        # Enforce rank order for correct rank to return final output.
-        for index, worker in enumerate(self.workers):
-            # The driver worker is rank 0 and not in self.workers.
-            rank = index + 1
-            if rank % self.parallel_config.tensor_parallel_size == 0:
-                self.tp_driver_workers.append(worker)
-            else:
-                self.non_driver_workers.append(worker)
-
-    def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[List[SamplerOutput]]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        assert not self.use_ray_spmd_worker, (
-            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
-        return self.driver_worker.execute_method("execute_model",
-                                                 execute_model_req)
-
-    def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if not self.use_ray_spmd_worker:
-            return super().execute_model(execute_model_req)
-
-        if self.forward_dag is None:
-            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
-
-        if self.use_v1:
-            serialized_data = execute_model_req
-        else:
-            serialized_data = self.input_encoder.encode(execute_model_req)
-        outputs = ray.get(self.forward_dag.execute(serialized_data))
-        if self.use_v1:
-            output = outputs[0]
-        else:
-            output = self.output_decoder.decode(outputs[0])
-        return output
-
-    def _run_workers(
-        self,
-        method: Union[str, Callable],
-        *args,
-        async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers. Can be used in the following
-        ways:
-
-        Args:
-        - async_run_tensor_parallel_workers_only: If True the method will be
-          run only in the remote TP workers, not the driver worker.
-          It will also be run asynchronously and return a list of futures
-          rather than blocking on the results.
-        - args/kwargs: All workers share the same args/kwargs
-        """
-        if isinstance(method, str):
-            sent_method = method
-        else:
-            sent_method = cloudpickle.dumps(method)
-        del method
-        if self.use_ray_spmd_worker:
-            assert not async_run_tensor_parallel_workers_only, (
-                "async_run_tensor_parallel_workers_only is not supported for "
-                "spmd mode.")
-
-        if max_concurrent_workers:
-            raise NotImplementedError(
-                "max_concurrent_workers is not supported yet.")
-
-        # Start the ray workers first.
-        ray_workers = self.workers
-        if async_run_tensor_parallel_workers_only:
-            ray_workers = self.non_driver_workers
-        ray_worker_outputs = [
-            worker.execute_method.remote(sent_method, *args, **kwargs)
-            for worker in ray_workers
-        ]
-
-        if async_run_tensor_parallel_workers_only:
-            # Just return futures
-            return ray_worker_outputs
-
-        driver_worker_output = []
-        # In SPMD mode, the driver worker is the same as any other worker,
-        # so we only explicitly execute on the driver worker if using a
-        # non-SPMD worker class.
-        if not self.use_ray_spmd_worker:
-            # Start the driver worker after all the ray workers.
-            driver_worker_output = [
-                self.driver_worker.execute_method(sent_method, *args, **kwargs)
-            ]
-
-        # Get the results of the ray workers.
-        if self.workers:
-            ray_worker_outputs = ray.get(ray_worker_outputs)
-
-        return driver_worker_output + ray_worker_outputs
-
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        ray.get(parallel_worker_tasks)
-
-    def _check_ray_cgraph_installation(self):
-        import importlib.metadata
-
-        from packaging import version
-
-        required_version = version.parse("2.43.0")
-        current_version = version.parse(importlib.metadata.version("ray"))
-        if current_version < required_version:
-            raise ValueError(f"Ray version {required_version} is "
-                             f"required, but found {current_version}")
-
-        import importlib.util
-        cgraph_spec = importlib.util.find_spec(
-            "ray.experimental.compiled_dag_ref")
-        if cgraph_spec is None:
-            raise ValueError("Ray Compiled Graph is not installed. "
-                             "Run `pip install ray[cgraph]` to install it.")
-
-        cupy_spec = importlib.util.find_spec("cupy")
-        if (cupy_spec is None
-                and envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "nccl"):
-            raise ValueError(
-                "cupy is not installed but required since "
-                "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE is set to 'nccl'. "
-                "Run `pip install ray[cgraph]` and check cupy installation.")
-
-    def _compiled_ray_dag(self, enable_asyncio: bool):
-        assert self.parallel_config.use_ray
-        self._check_ray_cgraph_installation()
-        # Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
-        # (it is 10 seconds by default). This is a Ray environment variable to
-        # control the timeout of getting result from a compiled graph execution,
-        # i.e., the distributed execution that includes model forward runs and
-        # intermediate tensor communications, in the case of vllm.
-        # Note: we should set this env var before importing
-        # ray.dag, otherwise it will not take effect.
-        os.environ.setdefault("RAY_CGRAPH_get_timeout", "300")  # noqa: SIM112
-        from ray.dag import InputNode, MultiOutputNode
-        logger.info("RAY_CGRAPH_get_timeout is set to %s",
-                    os.environ["RAY_CGRAPH_get_timeout"])  # noqa: SIM112
-        logger.info("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE = %s",
-                    envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE)
-        logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
-                    envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
-
-        channel_type = envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
-        if channel_type not in ("auto", "nccl", "shm"):
-            raise ValueError(
-                "Invalid value for VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: "
-                f"{channel_type}. Valid values are: 'auto', 'nccl', or 'shm'.")
-
-        with InputNode() as input_data:
-            # Example DAG: PP=2, TP=4
-            #
-            # For V0:
-            # ExecuteModelRequest -> 0 -> (ExecuteModelReq, IntermediateTensors) -> 4 -> SamplerOutput   # noqa: E501
-            # ExecuteModelRequest -> 1 -> (ExecuteModelReq, IntermediateTensors) -> 5 -> SamplerOutput   # noqa: E501
-            # ExecuteModelRequest -> 2 -> (ExecuteModelReq, IntermediateTensors) -> 6 -> SamplerOutput   # noqa: E501
-            # ExecuteModelRequest -> 3 -> (ExecuteModelReq, IntermediateTensors) -> 7 -> SamplerOutput   # noqa: E501
-            #
-            # For V1:
-            # SchedulerOutput -> 0 -> (SchedulerOutput, IntermediateTensors) -> 4 -> ModelRunnerOutput   # noqa: E501
-            # SchedulerOutput -> 1 -> (SchedulerOutput, IntermediateTensors) -> 5 -> ModelRunnerOutput   # noqa: E501
-            # SchedulerOutput -> 2 -> (SchedulerOutput, IntermediateTensors) -> 6 -> ModelRunnerOutput   # noqa: E501
-            # SchedulerOutput -> 3 -> (SchedulerOutput, IntermediateTensors) -> 7 -> ModelRunnerOutput   # noqa: E501
-
-            # All workers in the first TP group will take in the
-            # ExecuteModelRequest as input.
-            outputs = [input_data for _ in self.pp_tp_workers[0]]
-            for pp_rank, tp_group in enumerate(self.pp_tp_workers):
-                # Each PP worker takes in the output of the previous PP worker,
-                # and the TP group executes in SPMD fashion.
-                if self.use_v1:
-                    outputs = [
-                        worker.execute_model_ray.
-                        bind(  # type: ignore[attr-defined]
-                            outputs[i]) for i, worker in enumerate(tp_group)
-                    ]
-                else:
-                    outputs = [
-                        worker.execute_model_spmd.
-                        bind(  # type: ignore[attr-defined]
-                            outputs[i]) for i, worker in enumerate(tp_group)
-                    ]
-
-                last_pp_rank = len(self.pp_tp_workers) - 1
-                if (pp_rank < last_pp_rank and
-                        envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE != "shm"):
-                    # Specify how intermediate tensors should be passed
-                    # between pp stages, no need to specify for the last
-                    # pp stage or when using shared memory (the default).
-                    transport = envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
-                    outputs = [
-                        output.with_tensor_transport(transport=transport)
-                        for output in outputs
-                    ]
-
-            forward_dag = MultiOutputNode(outputs)
-
-        if envs.VLLM_USE_RAY_WRAPPED_PP_COMM:
-            from ray.experimental.channel.accelerator_context import (
-                register_accelerator_context)
-
-            from vllm.distributed.device_communicators.ray_communicator import (
-                RayPPCommunicator)
-            register_accelerator_context(torch_module_name="cuda",
-                                         communicator_cls=RayPPCommunicator)
-            logger.info("Using RayPPCommunicator "
-                        "(which wraps vLLM _PP GroupCoordinator) "
-                        "for Ray Compiled Graph communication.")
-        else:
-            logger.info("Using Ray's NCCL communicator for "
-                        "Ray Compiled Graph communication.")
-
-        return forward_dag.experimental_compile(
-            enable_asyncio=enable_asyncio,
-            _overlap_gpu_communication=envs.
-            VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
-
-    def __del__(self):
-        self.shutdown()
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if not self.use_ray_spmd_worker:
-            return await super().execute_model_async(execute_model_req)
-
-        if self.forward_dag is None:
-            self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
-
-        serialized_data = self.input_encoder.encode(execute_model_req)
-        dag_future = await self.forward_dag.execute_async(serialized_data)
-        output = await dag_future[0]
-        return self.output_decoder.decode(output)
-
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        assert not self.use_ray_spmd_worker, (
-            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
-        if not self.tp_driver_workers:
-            return await self.driver_exec_method("execute_model",
-                                                 execute_model_req)
-        if self.pp_locks is None:
-            # This locks each pipeline parallel stage so multiple virtual
-            # engines can't execute on the same stage at the same time
-            # We create the locks here to avoid creating them in the constructor
-            # which uses a different asyncio loop.
-            self.pp_locks = [
-                asyncio.Lock()
-                for _ in range(self.parallel_config.pipeline_parallel_size)
-            ]
-
-        tasks = [
-            asyncio.create_task(
-                _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
-                                    "execute_model", execute_model_req))
-        ]
-        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
-                                                start=1):
-            tasks.append(
-                asyncio.create_task(
-                    _run_task_with_lock(driver_worker.execute_method.remote,
-                                        self.pp_locks[pp_rank],
-                                        "execute_model", execute_model_req)))
-
-        results = await asyncio.gather(*tasks)
-
-        # Only the last PP stage has the final results.
-        return results[-1]
-
-    async def _start_worker_execution_loop(self):
-        assert not self.use_ray_spmd_worker, (
-            "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
-        coros = [
-            worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.non_driver_workers
-        ]
-        return await asyncio.gather(*coros)
-
-    def check_health(self) -> None:
-        # Assume that the Ray workers are healthy.
-        # TODO: check the health of the Ray workers
-        return
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 344628e888c4094be8dc148485ab2c7bf4d81747..25fb7181a8f29cf954e2330f976d19a96b34e551 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -5,16 +5,15 @@ import time
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, NamedTuple
 
 import torch
-import torch.distributed as dist
 
 import vllm.envs as envs
 from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.v1.worker.ubatch_utils import UBatchSlices, is_second_ubatch_empty
+from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
+from vllm.v1.worker.ubatch_utils import UBatchSlices
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -34,44 +33,52 @@ class BatchDescriptor(NamedTuple):
     items as minimal as possible to properly and uniquely describe the padded
     batch for cudagraph.
     """
+
     num_tokens: int
     uniform_decode: bool = False
     """
     False can also be used for an uniform decode batch to dispatch to the 
     cudagraph supporting non-uniform batches.
     """
+    has_lora: bool = False
+    """
+    Whether this batch has active LoRA adapters.
+    """
 
     @property
     def non_uniform(self) -> "BatchDescriptor":
         """
         Return a non-uniform version of current batch descriptor.
         """
-        return BatchDescriptor(self.num_tokens, uniform_decode=False)
+        return BatchDescriptor(
+            self.num_tokens, uniform_decode=False, has_lora=self.has_lora
+        )
 
 
-def _compute_sp_num_tokens(num_tokens_across_dp_cpu: torch.Tensor,
-                           sequence_parallel_size: int) -> list[int]:
-    sp_tokens = ((num_tokens_across_dp_cpu + sequence_parallel_size - 1) //
-                 sequence_parallel_size)
+def _compute_sp_num_tokens(
+    num_tokens_across_dp_cpu: torch.Tensor, sequence_parallel_size: int
+) -> list[int]:
+    sp_tokens = (
+        num_tokens_across_dp_cpu + sequence_parallel_size - 1
+    ) // sequence_parallel_size
 
     sp_tokens = sp_tokens.repeat_interleave(sequence_parallel_size)
     return sp_tokens.tolist()
 
 
-def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: torch.Tensor,
-                                      sequence_parallel_size: int,
-                                      max_num_tokens: int,
-                                      chunk_idx: int) -> list[int]:
-
-    sp_tokens = _compute_sp_num_tokens(num_tokens_across_dp_cpu,
-                                       sequence_parallel_size)
+def _compute_chunked_local_num_tokens(
+    num_tokens_across_dp_cpu: torch.Tensor,
+    sequence_parallel_size: int,
+    max_num_tokens: int,
+    chunk_idx: int,
+) -> list[int]:
+    sp_tokens = _compute_sp_num_tokens(num_tokens_across_dp_cpu, sequence_parallel_size)
     sp_size = len(sp_tokens)
 
     local_size = [-1] * sp_size
     for i in range(sp_size):
         # Take into account sharding if MoE activation is sequence parallel.
-        local_size[i] = min(max_num_tokens,
-                            sp_tokens[i] - (max_num_tokens * chunk_idx))
+        local_size[i] = min(max_num_tokens, sp_tokens[i] - (max_num_tokens * chunk_idx))
         if local_size[i] <= 0:
             local_size[i] = 1  # ensure lockstep even if done
     return local_size
@@ -83,128 +90,31 @@ class DPMetadata:
     num_tokens_across_dp_cpu: torch.Tensor
 
     # NOTE: local_sizes should only be set by the chunked_sizes context manager
-    local_sizes: Optional[list[int]] = None
-
-    @staticmethod
-    def num_tokens_across_dp(num_tokens: int, dp_size: int,
-                             dp_rank: int) -> torch.Tensor:
-        """
-        Gather the num_tokens across all DP ranks and return results in a
-        CPU tensor of size dp_size.
-        """
-        from vllm.distributed.parallel_state import get_dp_group
-        device = current_platform.device_type
-        group = get_dp_group().device_group
-
-        # Transfering this tensor from GPU to CPU will introduce a GPU sync
-        # point that could adversely affect performance of vllm with asynch
-        # scheduling. This environment variable exists to quickly disable
-        # this optimization if we run into this case.
-        if envs.VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION:
-            logger.info_once(
-                "Using CPU all reduce to syncronize DP padding between ranks.")
-            device = "cpu"
-            group = get_dp_group().cpu_group
-        num_tokens_across_dp = [0] * dp_size
-        num_tokens_across_dp[dp_rank] = num_tokens
-        num_tokens_tensor = torch.tensor(num_tokens_across_dp,
-                                         device=device,
-                                         dtype=torch.int32)
-        dist.all_reduce(num_tokens_tensor, group=group)
-        return num_tokens_tensor.cpu()
-
-    # Get the cumulative tokens across sequence parallel ranks.
-    # In this case the input to the MoEs will be distributed w.r.t both
-    # DP and TP rank.
-    # When sp_size==1, this is just the cummulative num tokens across DP.
-    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
-        num_tokens_across_sp_cpu = (
-            (self.num_tokens_across_dp_cpu - 1 + sp_size) // sp_size)
-        num_tokens_across_sp_cpu = (
-            num_tokens_across_sp_cpu.repeat_interleave(sp_size))
-        return torch.cumsum(num_tokens_across_sp_cpu, dim=0)
-
-    @staticmethod
-    def should_ubatch_across_dp(
-            should_ubatch: bool, orig_num_tokens_per_ubatch: int,
-            padded_num_tokens_per_ubatch: int, dp_size: int,
-            dp_rank: int) -> tuple[bool, Optional[torch.Tensor]]:
-        """
-        1. Decides if each DP rank is going to microbatch. Either all ranks
-        run with microbatching or none of them do. If this function decides
-        not to run with microbatching. It will "abort" meaning that no padding
-        information will be returned to the caller. It will return (False, None)
-
-        2. Determines the total number of tokens that each rank will run.
-        All ranks will be padded out so that the run with the same number
-        of tokens
-
-        Returns: tuple[
-            should_ubatch: Are all DP ranks going to microbatch
-            num_tokens_after_padding: A tensor containing the total number of
-            tokens per-microbatch for each DP rank including padding. Will be
-            None if should_ubatch if False
-        ]
-        """
-
-        device = current_platform.device_type
-        tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
-        tensor[0][dp_rank] = orig_num_tokens_per_ubatch
-        tensor[1][dp_rank] = padded_num_tokens_per_ubatch
-        tensor[2][dp_rank] = 1 if should_ubatch else 0
-
-        from vllm.distributed.parallel_state import get_dp_group
-        dist.all_reduce(tensor, group=get_dp_group().device_group)
-
-        result: bool = bool(torch.all(tensor[2] == 1).item())
-        if not result:
-            return result, None
-
-        orig_num_tokens_tensor = tensor[0, :]
-        padded_num_tokens_tensor = tensor[1, :]
-
-        orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
-        padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
-        if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
-            logger.debug("Aborting ubatching %s %s", orig_min_num_tokens,
-                         padded_max_num_tokens)
-            return False, None
-        return result, padded_num_tokens_tensor.cpu()
+    local_sizes: list[int] | None = None
 
     @staticmethod
     def make(
         parallel_config: ParallelConfig,
-        attn_metadata: Any,
         num_tokens: int,
-        num_tokens_across_dp_cpu: Optional[torch.Tensor] = None
+        num_tokens_across_dp_cpu: torch.Tensor,
     ) -> "DPMetadata":
-
+        assert num_tokens_across_dp_cpu is not None
         assert parallel_config.data_parallel_size > 1
-        dp_size = parallel_config.data_parallel_size
         dp_rank = parallel_config.data_parallel_rank
-        if attn_metadata is not None and hasattr(attn_metadata,
-                                                 "num_prefill_tokens"):
-            # for v0 attention backends
-            batchsize = attn_metadata.num_prefill_tokens + \
-                attn_metadata.num_decode_tokens
-        else:
-            # for v1 attention backends or no attn_metadata
-            batchsize = num_tokens
+        batchsize = num_tokens
 
         # If num_tokens_across_dp is None, it will be computed by all_reduce
         # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
-        assert (num_tokens_across_dp_cpu is None
-                or num_tokens_across_dp_cpu[dp_rank] == batchsize
-                ), f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
-        if num_tokens_across_dp_cpu is None:
-            num_tokens_across_dp_cpu = DPMetadata.num_tokens_across_dp(
-                batchsize, dp_size, dp_rank)
+        assert num_tokens_across_dp_cpu[dp_rank] == batchsize, (
+            f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
+        )
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp_cpu)
         return DPMetadata(max_tokens_across_dp_cpu, num_tokens_across_dp_cpu)
 
     @contextmanager
-    def chunked_sizes(self, sequence_parallel_size: int,
-                      max_chunk_size_per_rank: int, chunk_idx: int):
+    def chunked_sizes(
+        self, sequence_parallel_size: int, max_chunk_size_per_rank: int, chunk_idx: int
+    ):
         """
         Context manager to compute and temporarily set the per-rank local token
         sizes for a specific chunk during chunked forward execution.
@@ -225,13 +135,16 @@ class DPMetadata:
                                     we use SP between the layers to avoid
                                     redundant ops. We need this value to
                                     compute the chunked sizes.
-            max_chunk_size_per_rank: The max number of tokens each rank is 
+            max_chunk_size_per_rank: The max number of tokens each rank is
                                      allowed to process in this chunk.
             chunk_idx: The index of the chunk to compute sizes for.
         """
         self.local_sizes = _compute_chunked_local_num_tokens(
-            self.num_tokens_across_dp_cpu, sequence_parallel_size,
-            max_chunk_size_per_rank, chunk_idx)
+            self.num_tokens_across_dp_cpu,
+            sequence_parallel_size,
+            max_chunk_size_per_rank,
+            chunk_idx,
+        )
         try:
             yield self.local_sizes
         finally:
@@ -244,79 +157,96 @@ class DPMetadata:
         but without any chunking.
         """
         self.local_sizes = _compute_sp_num_tokens(
-            self.num_tokens_across_dp_cpu, sequence_parallel_size)
+            self.num_tokens_across_dp_cpu, sequence_parallel_size
+        )
         try:
             yield self.local_sizes
         finally:
             self.local_sizes = None
 
-    def get_chunk_sizes_across_dp_rank(self) -> Optional[list[int]]:
+    def get_chunk_sizes_across_dp_rank(self) -> list[int] | None:
         assert self.local_sizes is not None
         return self.local_sizes
 
+    # Get the cumulative tokens across sequence parallel ranks.
+    # In this case the input to the MoEs will be distributed w.r.t both
+    # DP and TP rank.
+    # When sp_size==1, this is just the cummulative num tokens across DP.
+    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
+        num_tokens_across_sp_cpu = (
+            self.num_tokens_across_dp_cpu - 1 + sp_size
+        ) // sp_size
+        num_tokens_across_sp_cpu = num_tokens_across_sp_cpu.repeat_interleave(sp_size)
+        return torch.cumsum(num_tokens_across_sp_cpu, dim=0)
+
 
 @dataclass
 class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
     """
-    Type AttentionMetadata for v0, 
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each 
     attention layer to its attention metadata
     Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"],
-                         list[dict[str, "AttentionMetadata"]]]
+    attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
-    dp_metadata: Optional[DPMetadata] = None
+    dp_metadata: DPMetadata | None = None
     # determine the cudagraph style at runtime to be FULL, PIECEWISE, or NONE.
     # by default NONE, no cudagraph is used.
     cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE
-    batch_descriptor: Optional[BatchDescriptor] = None
+    batch_descriptor: BatchDescriptor | None = None
 
-    ubatch_slices: Optional[UBatchSlices] = None
+    ubatch_slices: UBatchSlices | None = None
 
     def __post_init__(self):
-        assert self.cudagraph_runtime_mode in [
-            CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
+        assert self.cudagraph_runtime_mode.valid_runtime_modes(), (
             f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}"
+        )
 
 
-_forward_context: Optional[ForwardContext] = None
+_forward_context: ForwardContext | None = None
 
 
 def get_forward_context() -> ForwardContext:
     """Get the current forward context."""
     assert _forward_context is not None, (
         "Forward context is not set. "
-        "Please use `set_forward_context` to set the forward context.")
+        "Please use `set_forward_context` to set the forward context."
+    )
     return _forward_context
 
 
+def is_forward_context_available() -> bool:
+    return _forward_context is not None
+
+
 def create_forward_context(
-        attn_metadata: Any,
-        vllm_config: VllmConfig,
-        virtual_engine: int = 0,
-        dp_metadata: Optional[DPMetadata] = None,
-        cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-        batch_descriptor: Optional[BatchDescriptor] = None,
-        ubatch_slices: Optional[UBatchSlices] = None):
-    return ForwardContext(no_compile_layers=vllm_config.compilation_config.
-                          static_forward_context,
-                          virtual_engine=virtual_engine,
-                          attn_metadata=attn_metadata,
-                          dp_metadata=dp_metadata,
-                          cudagraph_runtime_mode=cudagraph_runtime_mode,
-                          batch_descriptor=batch_descriptor,
-                          ubatch_slices=ubatch_slices)
+    attn_metadata: Any,
+    vllm_config: VllmConfig,
+    virtual_engine: int = 0,
+    dp_metadata: DPMetadata | None = None,
+    cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
+    batch_descriptor: BatchDescriptor | None = None,
+    ubatch_slices: UBatchSlices | None = None,
+):
+    return ForwardContext(
+        no_compile_layers=vllm_config.compilation_config.static_forward_context,
+        virtual_engine=virtual_engine,
+        attn_metadata=attn_metadata,
+        dp_metadata=dp_metadata,
+        cudagraph_runtime_mode=cudagraph_runtime_mode,
+        batch_descriptor=batch_descriptor,
+        ubatch_slices=ubatch_slices,
+    )
 
 
 @contextmanager
-def override_forward_context(forward_context: Optional[ForwardContext]):
+def override_forward_context(forward_context: ForwardContext | None):
     """A context manager that overrides the current forward context.
     This is used to override the forward context for a specific
     forward pass.
@@ -332,14 +262,15 @@ def override_forward_context(forward_context: Optional[ForwardContext]):
 
 @contextmanager
 def set_forward_context(
-        attn_metadata: Any,
-        vllm_config: VllmConfig,
-        virtual_engine: int = 0,
-        num_tokens: Optional[int] = None,
-        num_tokens_across_dp: Optional[torch.Tensor] = None,
-        cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-        batch_descriptor: Optional[BatchDescriptor] = None,
-        ubatch_slices: Optional[UBatchSlices] = None):
+    attn_metadata: Any,
+    vllm_config: VllmConfig,
+    virtual_engine: int = 0,
+    num_tokens: int | None = None,
+    num_tokens_across_dp: torch.Tensor | None = None,
+    cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
+    batch_descriptor: BatchDescriptor | None = None,
+    ubatch_slices: UBatchSlices | None = None,
+):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
@@ -349,17 +280,42 @@ def set_forward_context(
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
 
-    dp_metadata: Optional[DPMetadata] = None
+    dp_metadata: DPMetadata | None = None
     if vllm_config.parallel_config.data_parallel_size > 1 and (
-            attn_metadata is not None or num_tokens is not None):
-        dp_metadata = DPMetadata.make(vllm_config.parallel_config,
-                                      attn_metadata, num_tokens or 0,
-                                      num_tokens_across_dp)
-
-    forward_context = create_forward_context(attn_metadata, vllm_config,
-                                             virtual_engine, dp_metadata,
-                                             cudagraph_runtime_mode,
-                                             batch_descriptor, ubatch_slices)
+        attn_metadata is not None or num_tokens is not None
+    ):
+        # If num_tokens_across_dp hasn't already been initialized, then
+        # initialize it here. Both DP padding and Microbatching will be
+        # disabled.
+        if num_tokens_across_dp is None:
+            assert ubatch_slices is None
+            assert num_tokens is not None
+            _, num_tokens_across_dp = coordinate_batch_across_dp(
+                num_tokens_unpadded=num_tokens,
+                parallel_config=vllm_config.parallel_config,
+                allow_microbatching=False,
+                allow_dp_padding=False,
+            )
+            assert num_tokens_across_dp is not None
+        dp_metadata = DPMetadata.make(
+            vllm_config.parallel_config, num_tokens or 0, num_tokens_across_dp
+        )
+
+    # Convenience: if cudagraph is used and num_tokens is given, we can just
+    # create a batch descriptor here if not given (there's no harm since if it
+    # doesn't match in the wrapper it'll fall through).
+    if cudagraph_runtime_mode != CUDAGraphMode.NONE and num_tokens is not None:
+        batch_descriptor = batch_descriptor or BatchDescriptor(num_tokens=num_tokens)
+
+    forward_context = create_forward_context(
+        attn_metadata,
+        vllm_config,
+        virtual_engine,
+        dp_metadata,
+        cudagraph_runtime_mode,
+        batch_descriptor,
+        ubatch_slices,
+    )
 
     try:
         with override_forward_context(forward_context):
@@ -367,24 +323,18 @@ def set_forward_context(
     finally:
         global last_logging_time, batchsize_logging_interval
         if need_to_track_batchsize:
-            if hasattr(attn_metadata, "num_prefill_tokens"):
-                # for v0 attention backends
-                batchsize = attn_metadata.num_prefill_tokens + \
-                    attn_metadata.num_decode_tokens
-            else:
-                # for v1 attention backends
-                batchsize = num_tokens
+            batchsize = num_tokens
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch
             from vllm.platforms import current_platform
+
             synchronize = current_platform.synchronize
             if synchronize is not None:
                 synchronize()
             now = time.perf_counter()
             # time measurement is in milliseconds
-            batchsize_forward_time[batchsize].append(
-                (now - forward_start_time) * 1000)
+            batchsize_forward_time[batchsize].append((now - forward_start_time) * 1000)
             if now - last_logging_time > batchsize_logging_interval:
                 last_logging_time = now
                 forward_stats = []
@@ -397,6 +347,10 @@ def set_forward_context(
                     forward_stats.append((bs, len(times), medium))
                 forward_stats.sort(key=lambda x: x[1], reverse=True)
                 if forward_stats:
-                    logger.info(("Batchsize forward time stats "
-                                 "(batchsize, count, median_time(ms)): %s"),
-                                forward_stats)
+                    logger.info(
+                        (
+                            "Batchsize forward time stats "
+                            "(batchsize, count, median_time(ms)): %s"
+                        ),
+                        forward_stats,
+                    )
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 3f1cac531f450cade873f419c44669d5365f334f..d9aed70c9b97988e55ba7ff42a574fbe56a31a95 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,12 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .data import (DataPrompt, DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
-                   EncoderDecoderInputs, ExplicitEncoderDecoderPrompt,
-                   ProcessorInputs, PromptType, SingletonInputs,
-                   SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
-                   build_explicit_enc_dec_prompt, embeds_inputs,
-                   to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts)
+from .data import (
+    DataPrompt,
+    DecoderOnlyInputs,
+    EmbedsInputs,
+    EmbedsPrompt,
+    EncoderDecoderInputs,
+    ExplicitEncoderDecoderPrompt,
+    ProcessorInputs,
+    PromptType,
+    SingletonInputs,
+    SingletonPrompt,
+    TextPrompt,
+    TokenInputs,
+    TokensPrompt,
+    build_explicit_enc_dec_prompt,
+    embeds_inputs,
+    to_enc_dec_tuple_list,
+    token_inputs,
+    zip_enc_dec_prompts,
+)
 
 __all__ = [
     "DataPrompt",
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 1718c0767ab687810e4965bf2ef1e7023fc248ba..1f138a72d08429cfa05bd8e8bf11b12d54c9ded0 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,14 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, cast
 
 import torch
 from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar
 
 if TYPE_CHECKING:
-    from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalInputs,
-                                        MultiModalUUIDDict)
+    from vllm.multimodal.inputs import (
+        MultiModalDataDict,
+        MultiModalInputs,
+        MultiModalUUIDDict,
+    )
+else:
+    MultiModalDataDict = object
+    MultiModalInputs = object
+    MultiModalUUIDDict = object
 
 
 class TextPrompt(TypedDict):
@@ -17,13 +24,13 @@ class TextPrompt(TypedDict):
     prompt: str
     """The input text to be tokenized before passing to the model."""
 
-    multi_modal_data: NotRequired["MultiModalDataDict"]
+    multi_modal_data: NotRequired[MultiModalDataDict | None]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
-    mm_processor_kwargs: NotRequired[dict[str, Any]]
+    mm_processor_kwargs: NotRequired[dict[str, Any] | None]
     """
     Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
@@ -31,7 +38,7 @@ class TextPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
-    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
+    multi_modal_uuids: NotRequired[MultiModalUUIDDict]
     """
     Optional user-specified UUIDs for multimodal items, mapped by modality.
     Lists must match the number of items per modality and may contain `None`.
@@ -58,13 +65,13 @@ class TokensPrompt(TypedDict):
     token_type_ids: NotRequired[list[int]]
     """A list of token type IDs to pass to the cross encoder model."""
 
-    multi_modal_data: NotRequired["MultiModalDataDict"]
+    multi_modal_data: NotRequired[MultiModalDataDict | None]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
-    mm_processor_kwargs: NotRequired[dict[str, Any]]
+    mm_processor_kwargs: NotRequired[dict[str, Any] | None]
     """
     Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
@@ -72,7 +79,7 @@ class TokensPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
-    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
+    multi_modal_uuids: NotRequired[MultiModalUUIDDict]
     """
     Optional user-specified UUIDs for multimodal items, mapped by modality.
     Lists must match the number of items per modality and may contain `None`.
@@ -108,7 +115,7 @@ class DataPrompt(TypedDict):
     """The input data format"""
 
 
-SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
+SingletonPrompt: TypeAlias = str | TextPrompt | TokensPrompt | EmbedsPrompt
 """
 Set of possible schemas for a single prompt:
 
@@ -134,23 +141,27 @@ more than one prompt, i.e.
 
 
 def is_tokens_prompt(prompt: SingletonPrompt) -> TypeIs[TokensPrompt]:
-    return (isinstance(prompt, dict) and "prompt_token_ids" in prompt
-            and "prompt_embeds" not in prompt)
+    return (
+        isinstance(prompt, dict)
+        and "prompt_token_ids" in prompt
+        and "prompt_embeds" not in prompt
+    )
 
 
 def is_embeds_prompt(prompt: SingletonPrompt) -> TypeIs[EmbedsPrompt]:
-    return (isinstance(prompt, dict) and "prompt_token_ids" not in prompt
-            and "prompt_embeds" in prompt)
+    return (
+        isinstance(prompt, dict)
+        and "prompt_token_ids" not in prompt
+        and "prompt_embeds" in prompt
+    )
 
 
-_T1_co = TypeVar("_T1_co",
-                 bound=SingletonPrompt,
-                 default=SingletonPrompt,
-                 covariant=True)
-_T2_co = TypeVar("_T2_co",
-                 bound=SingletonPrompt,
-                 default=SingletonPrompt,
-                 covariant=True)
+_T1_co = TypeVar(
+    "_T1_co", bound=SingletonPrompt, default=SingletonPrompt, covariant=True
+)
+_T2_co = TypeVar(
+    "_T2_co", bound=SingletonPrompt, default=SingletonPrompt, covariant=True
+)
 
 
 # TODO: Make fields ReadOnly once mypy supports it
@@ -178,12 +189,12 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
 
     encoder_prompt: _T1_co
 
-    decoder_prompt: Optional[_T2_co]
+    decoder_prompt: _T2_co | None
 
     mm_processor_kwargs: NotRequired[dict[str, Any]]
 
 
-PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
+PromptType: TypeAlias = SingletonPrompt | ExplicitEncoderDecoderPrompt
 """
 Set of possible schemas for an LLM input, including
 both decoder-only and encoder/decoder input types:
@@ -205,11 +216,6 @@ class TokenInputs(TypedDict):
     prompt_token_ids: list[int]
     """The token IDs of the prompt."""
 
-    prompt: NotRequired[str]
-    """
-    The original prompt text corresponding to the token IDs, if available.
-    """
-
     cache_salt: NotRequired[str]
     """
     Optional cache salt to be used for prefix caching.
@@ -218,15 +224,12 @@ class TokenInputs(TypedDict):
 
 def token_inputs(
     prompt_token_ids: list[int],
-    prompt: Optional[str] = None,
-    cache_salt: Optional[str] = None,
+    cache_salt: str | None = None,
 ) -> TokenInputs:
     """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
     values."""
     inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
 
-    if prompt is not None:
-        inputs["prompt"] = prompt
     if cache_salt is not None:
         inputs["cache_salt"] = cache_salt
 
@@ -250,7 +253,7 @@ class EmbedsInputs(TypedDict):
 
 def embeds_inputs(
     prompt_embeds: torch.Tensor,
-    cache_salt: Optional[str] = None,
+    cache_salt: str | None = None,
 ) -> EmbedsInputs:
     """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
     values."""
@@ -262,7 +265,7 @@ def embeds_inputs(
     return inputs
 
 
-DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
+DecoderOnlyInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
 """
 The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are
 passed to the model executor.
@@ -278,20 +281,20 @@ class EncoderDecoderInputs(TypedDict):
     This specifies the required data for encoder-decoder models.
     """
 
-    encoder: Union[TokenInputs, "MultiModalInputs"]
+    encoder: TokenInputs | MultiModalInputs
     """The inputs for the encoder portion."""
 
-    decoder: Union[TokenInputs, "MultiModalInputs"]
+    decoder: TokenInputs | MultiModalInputs
     """The inputs for the decoder portion."""
 
 
-SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
+SingletonInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
 """
 A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
 passed to [`Sequence`][collections.abc.Sequence].
 """
 
-ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
+ProcessorInputs: TypeAlias = DecoderOnlyInputs | EncoderDecoderInputs
 """
 The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][].
 """
@@ -302,8 +305,8 @@ _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt)
 
 def build_explicit_enc_dec_prompt(
     encoder_prompt: _T1,
-    decoder_prompt: Optional[_T2],
-    mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    decoder_prompt: _T2 | None,
+    mm_processor_kwargs: dict[str, Any] | None = None,
 ) -> ExplicitEncoderDecoderPrompt[_T1, _T2]:
     if mm_processor_kwargs is None:
         mm_processor_kwargs = {}
@@ -316,16 +319,15 @@ def build_explicit_enc_dec_prompt(
 
 def zip_enc_dec_prompts(
     enc_prompts: Iterable[_T1],
-    dec_prompts: Iterable[Optional[_T2]],
-    mm_processor_kwargs: Optional[Union[Iterable[dict[str, Any]],
-                                        dict[str, Any]]] = None,
+    dec_prompts: Iterable[_T2 | None],
+    mm_processor_kwargs: Iterable[dict[str, Any]] | dict[str, Any] | None = None,
 ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
     """
     Zip encoder and decoder prompts together into a list of
     [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
     instances.
 
-    ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
+    `mm_processor_kwargs` may also be provided; if a dict is passed, the same
     dictionary will be used for every encoder/decoder prompt. If an iterable is
     provided, it will be zipped with the encoder/decoder prompts.
     """
@@ -337,20 +339,21 @@ def zip_enc_dec_prompts(
                 encoder_prompt,
                 decoder_prompt,
                 cast(dict[str, Any], mm_processor_kwargs),
-            ) for (encoder_prompt,
-                   decoder_prompt) in zip(enc_prompts, dec_prompts)
+            )
+            for (encoder_prompt, decoder_prompt) in zip(enc_prompts, dec_prompts)
         ]
     return [
-        build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,
-                                      mm_proc_kwargs)
-        for (encoder_prompt, decoder_prompt, mm_proc_kwargs
-             ) in zip(enc_prompts, dec_prompts, mm_processor_kwargs)
+        build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt, mm_proc_kwargs)
+        for (encoder_prompt, decoder_prompt, mm_proc_kwargs) in zip(
+            enc_prompts, dec_prompts, mm_processor_kwargs
+        )
     ]
 
 
 def to_enc_dec_tuple_list(
     enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]],
-) -> list[tuple[_T1, Optional[_T2]]]:
-    return [(enc_dec_prompt["encoder_prompt"],
-             enc_dec_prompt["decoder_prompt"])
-            for enc_dec_prompt in enc_dec_prompts]
+) -> list[tuple[_T1, _T2 | None]]:
+    return [
+        (enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"])
+        for enc_dec_prompt in enc_dec_prompts
+    ]
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index 8c3700799e4ab9160057bfee164acff4b0b57e41..211551be8e60b4b8d7dd41b4965d4196cfa41fa8 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -1,45 +1,33 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
-from typing import Literal, Optional, TypedDict, Union, cast, overload
+from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cast
 
 from typing_extensions import TypeIs
 
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
 
-from .data import (EmbedsPrompt, ExplicitEncoderDecoderPrompt, ProcessorInputs,
-                   PromptType, SingletonInputs, SingletonPrompt, TextPrompt,
-                   TokensPrompt)
+from .data import (
+    EmbedsPrompt,
+    ExplicitEncoderDecoderPrompt,
+    ProcessorInputs,
+    PromptType,
+    SingletonInputs,
+    SingletonPrompt,
+    TextPrompt,
+    TokensPrompt,
+)
 
-
-class ParsedText(TypedDict):
-    content: str
-    is_tokens: Literal[False]
-
-
-class ParsedTokens(TypedDict):
-    content: list[int]
-    is_tokens: Literal[True]
-
-
-@overload
-def parse_and_batch_prompt(
-    prompt: Union[str, list[str]], ) -> Sequence[ParsedText]:
-    ...
+if TYPE_CHECKING:
+    import torch
 
 
-@overload
-def parse_and_batch_prompt(
-    prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]:
-    ...
-
-
-def parse_and_batch_prompt(
-    prompt: Union[str, list[str], list[int], list[list[int]]],
-) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]:
+def parse_raw_prompts(
+    prompt: str | list[str] | list[int] | list[list[int]],
+) -> Sequence[TextPrompt] | Sequence[TokensPrompt]:
     if isinstance(prompt, str):
         # case 1: a string
-        return [ParsedText(content=prompt, is_tokens=False)]
+        return [TextPrompt(prompt=prompt)]
 
     if isinstance(prompt, list):
         if len(prompt) == 0:
@@ -48,13 +36,11 @@ def parse_and_batch_prompt(
         if is_list_of(prompt, str):
             # case 2: array of strings
             prompt = cast(list[str], prompt)
-            return [
-                ParsedText(content=elem, is_tokens=False) for elem in prompt
-            ]
+            return [TextPrompt(prompt=elem) for elem in prompt]
         if is_list_of(prompt, int):
             # case 3: array of tokens
             prompt = cast(list[int], prompt)
-            return [ParsedTokens(content=prompt, is_tokens=True)]
+            return [TokensPrompt(prompt_token_ids=prompt)]
         if is_list_of(prompt, list):
             prompt = cast(list[list[int]], prompt)
             if len(prompt[0]) == 0:
@@ -62,13 +48,12 @@ def parse_and_batch_prompt(
 
             if is_list_of(prompt[0], int):
                 # case 4: array of token arrays
-                return [
-                    ParsedTokens(content=elem, is_tokens=True)
-                    for elem in prompt
-                ]
+                return [TokensPrompt(prompt_token_ids=elem) for elem in prompt]
 
-    raise TypeError("prompt must be a string, array of strings, "
-                    "array of tokens, or array of token arrays")
+    raise TypeError(
+        "prompt must be a string, array of strings, "
+        "array of tokens, or array of token arrays"
+    )
 
 
 class ParsedStrPrompt(TypedDict):
@@ -91,28 +76,9 @@ class ParsedEmbedsPrompt(TypedDict):
     content: EmbedsPrompt
 
 
-ParsedSingletonPrompt = Union[ParsedStrPrompt, ParsedTextPrompt,
-                              ParsedTokensPrompt, ParsedEmbedsPrompt]
-
-
-@overload
-def parse_singleton_prompt(prompt: str) -> ParsedStrPrompt:
-    ...
-
-
-@overload
-def parse_singleton_prompt(prompt: TextPrompt) -> ParsedTextPrompt:
-    ...
-
-
-@overload
-def parse_singleton_prompt(prompt: TokensPrompt) -> ParsedTokensPrompt:
-    ...
-
-
-@overload
-def parse_singleton_prompt(prompt: EmbedsPrompt) -> ParsedEmbedsPrompt:
-    ...
+ParsedSingletonPrompt: TypeAlias = (
+    ParsedStrPrompt | ParsedTextPrompt | ParsedTokensPrompt | ParsedEmbedsPrompt
+)
 
 
 def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
@@ -122,25 +88,25 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
         # Type ignores are because mypy does not correctly infer the TypedDicts
         # Pyright does succeed.
         if "prompt_embeds" in prompt:
-            return ParsedEmbedsPrompt(
-                type="embeds", content=prompt)  # type: ignore[typeddict-item]
+            return ParsedEmbedsPrompt(type="embeds", content=prompt)  # type: ignore[typeddict-item]
         elif "prompt_token_ids" in prompt:
-            return ParsedTokensPrompt(
-                type="tokens", content=prompt)  # type: ignore[typeddict-item]
+            return ParsedTokensPrompt(type="tokens", content=prompt)  # type: ignore[typeddict-item]
         elif "prompt" in prompt:
             return ParsedTextPrompt(type="text", content=prompt)
     raise TypeError(
-        "inputs must be a string, TextPrompt, TokensPrompt, or EmbedsPrompt")
+        "inputs must be a string, TextPrompt, TokensPrompt, or EmbedsPrompt"
+    )
 
 
 def is_explicit_encoder_decoder_prompt(
-    prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]:
+    prompt: PromptType,
+) -> TypeIs[ExplicitEncoderDecoderPrompt]:
     return isinstance(prompt, dict) and "encoder_prompt" in prompt
 
 
 def split_enc_dec_inputs(
     inputs: ProcessorInputs,
-) -> tuple[Optional[SingletonInputs], SingletonInputs]:
+) -> tuple[SingletonInputs | None, SingletonInputs]:
     if "encoder" in inputs and "decoder" in inputs:
         # NOTE: This passes pyright but not mypy
         return (
@@ -149,3 +115,23 @@ def split_enc_dec_inputs(
         )
 
     return None, inputs
+
+
+class PromptComponents(NamedTuple):
+    text: str | None = None
+    token_ids: list[int] | None = None
+    embeds: "torch.Tensor | None" = None
+
+
+def get_prompt_components(prompt: PromptType) -> PromptComponents:
+    if isinstance(prompt, str):
+        return PromptComponents(text=prompt)
+
+    if encoder_prompt := prompt.get("encoder_prompt"):
+        return get_prompt_components(encoder_prompt)  # type: ignore[arg-type]
+
+    return PromptComponents(
+        text=prompt.get("prompt"),  # type: ignore[arg-type]
+        token_ids=prompt.get("prompt_token_ids"),  # type: ignore[arg-type]
+        embeds=prompt.get("prompt_embeds"),
+    )
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 7518cd8fc897ff5025015400061c8eadf061a73d..839c13868a16c60ec98ed92cc9f06c3d6ed6519b 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping
-from typing import Any, Optional, Union, cast
+from typing import Any, cast
 
 from typing_extensions import assert_never
 
@@ -10,28 +10,45 @@ from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
-                                    MultiModalInputs, MultiModalUUIDDict)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalEncDecInputs,
+    MultiModalInputs,
+    MultiModalUUIDDict,
+)
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-
-from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
-                   EncoderDecoderInputs, ProcessorInputs, PromptType,
-                   SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
-                   TokensPrompt, embeds_inputs, token_inputs)
+from vllm.utils.jsontree import json_iter_leaves
+from vllm.v1.metrics.stats import MultiModalCacheStats
+
+from .data import (
+    DecoderOnlyInputs,
+    EmbedsInputs,
+    EmbedsPrompt,
+    EncoderDecoderInputs,
+    ExplicitEncoderDecoderPrompt,
+    ProcessorInputs,
+    PromptType,
+    SingletonInputs,
+    SingletonPrompt,
+    TextPrompt,
+    TokenInputs,
+    TokensPrompt,
+    embeds_inputs,
+    token_inputs,
+)
 from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
 
 logger = init_logger(__name__)
 
 
 class InputPreprocessor:
-
     def __init__(
         self,
         model_config: ModelConfig,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
-        mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None,
+        mm_processor_cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
@@ -40,30 +57,35 @@ class InputPreprocessor:
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_processor_cache
 
+        self.mm_cache_stats = MultiModalCacheStats() if mm_processor_cache else None
+
     def get_tokenizer(self) -> AnyTokenizer:
         if self.tokenizer is None:
-            raise ValueError("You cannot pass text prompts when "
-                             "`skip_tokenizer_init` is True")
+            raise ValueError(
+                "You cannot pass text prompts when `skip_tokenizer_init` is True"
+            )
 
         return self.tokenizer
 
-    def get_bos_token_id(self) -> Optional[int]:
+    def get_bos_token_id(self) -> int | None:
         if self.tokenizer is None:
-            logger.warning("Using None for BOS token id because tokenizer "
-                           "is not initialized")
+            logger.warning_once(
+                "Using None for BOS token id because tokenizer is not initialized"
+            )
             return None
 
         return self.tokenizer.bos_token_id
 
-    def get_eos_token_id(self) -> Optional[int]:
+    def get_eos_token_id(self) -> int | None:
         if self.tokenizer is None:
-            logger.warning("Using None for EOS token id because tokenizer "
-                           "is not initialized")
+            logger.warning_once(
+                "Using None for EOS token id because tokenizer is not initialized"
+            )
             return None
 
         return self.tokenizer.eos_token_id
 
-    def get_decoder_start_token_id(self) -> Optional[int]:
+    def get_decoder_start_token_id(self) -> int | None:
         """
         Obtain the decoder start token id employed by an encoder/decoder
         model. Returns None for non-encoder/decoder models or if the
@@ -73,22 +95,26 @@ class InputPreprocessor:
         if not self.model_config.is_encoder_decoder:
             logger.warning_once(
                 "Using None for decoder start token id because "
-                "this is not an encoder/decoder model.")
+                "this is not an encoder/decoder model."
+            )
             return None
 
         if self.model_config is None or self.model_config.hf_config is None:
             logger.warning_once(
                 "Using None for decoder start token id because "
-                "model config is not available.")
+                "model config is not available."
+            )
             return None
 
-        dec_start_token_id = getattr(self.model_config.hf_config,
-                                     "decoder_start_token_id", None)
+        dec_start_token_id = getattr(
+            self.model_config.hf_config, "decoder_start_token_id", None
+        )
         if dec_start_token_id is None:
             logger.warning_once(
                 "Falling back on <BOS> for decoder start token "
                 "id because decoder start token id is not "
-                "available.")
+                "available."
+            )
             dec_start_token_id = self.get_bos_token_id()
 
         return dec_start_token_id
@@ -131,7 +157,7 @@ class InputPreprocessor:
 
     def _prepare_decoder_input_ids_for_generation(
         self,
-        decoder_input_ids: Optional[list[int]],
+        decoder_input_ids: list[int] | None,
     ) -> list[int]:
         """
         Prepares `decoder_input_ids` for generation with encoder-decoder models.
@@ -158,15 +184,17 @@ class InputPreprocessor:
             # use decoder_start_token_id as decoder_input_ids
             decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
 
-        if (len(decoder_input_ids) == 0
-                or decoder_input_ids[0] != decoder_start_token_id):
+        if (
+            len(decoder_input_ids) == 0
+            or decoder_input_ids[0] != decoder_start_token_id
+        ):
             decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
 
         return decoder_input_ids
 
     def _get_tokenization_kw(
         self,
-        overrides: Optional[dict[str, Any]] = None,
+        overrides: dict[str, Any] | None = None,
     ) -> dict[str, Any]:
         kwargs = dict[str, Any]()
 
@@ -184,7 +212,7 @@ class InputPreprocessor:
     def _tokenize_prompt(
         self,
         prompt: str,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[int]:
         """
         Apply the model's tokenizer to a text prompt, returning the
@@ -223,12 +251,12 @@ class InputPreprocessor:
 
     def _process_multimodal(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Optional[Mapping[str, object]],
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        mm_processor_kwargs: Mapping[str, object] | None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -249,11 +277,15 @@ class InputPreprocessor:
         mm_hashes = mm_input["mm_hashes"]
 
         # Validate that all mm items have a string as their hash
-        if not contains_only_strings(mm_hashes):
+        contains_only_strings = all(
+            isinstance(leaf, str) for leaf in json_iter_leaves(mm_hashes)
+        )
+        if not contains_only_strings:
             raise ValueError(
                 f"mm_hashes must contain only strings, got: {mm_hashes}. "
                 "This is likely due to an incorrect custom implementation of "
-                "MultiModalProcessor.apply method.")
+                "MultiModalProcessor.apply method."
+            )
 
         return mm_input
 
@@ -262,8 +294,9 @@ class InputPreprocessor:
         parsed_content: EmbedsPrompt,
     ) -> EmbedsInputs:
         if not self.model_config.enable_prompt_embeds:
-            raise ValueError("You must set `--enable-prompt-embeds` to input "
-                             "`prompt_embeds`.")
+            raise ValueError(
+                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
+            )
 
         prompt_embeds = parsed_content["prompt_embeds"]
 
@@ -275,24 +308,25 @@ class InputPreprocessor:
             prompt_embeds = prompt_embeds.squeeze(dim=0)
 
         if prompt_embeds.ndim != 2:
-            raise ValueError(
-                "prompt_embeds must be of shape (seq_len, hidden_size).")
+            raise ValueError("prompt_embeds must be of shape (seq_len, hidden_size).")
 
         # Tensors must be on CPU for serialization between processes
         # in the MsgpackEncoder. Casting to CPU here ensures that there is no
         # hidden device transfer in the critical path of generation.
         prompt_embeds = prompt_embeds.cpu()
 
-        return embeds_inputs(prompt_embeds=prompt_embeds,
-                             cache_salt=parsed_content.get("cache_salt"))
+        return embeds_inputs(
+            prompt_embeds=prompt_embeds, cache_salt=parsed_content.get("cache_salt")
+        )
 
     def _truncate_inputs(
-            self,
-            inputs: list[int],
-            tokenization_kwargs: Optional[dict[str, Any]] = None) -> list[int]:
-
-        if not tokenization_kwargs or "truncation" not in \
-                tokenization_kwargs or self.tokenizer is None:
+        self, inputs: list[int], tokenization_kwargs: dict[str, Any] | None = None
+    ) -> list[int]:
+        if (
+            not tokenization_kwargs
+            or "truncation" not in tokenization_kwargs
+            or self.tokenizer is None
+        ):
             return inputs
 
         max_length = tokenization_kwargs["max_length"]
@@ -305,24 +339,25 @@ class InputPreprocessor:
     def _process_tokens(
         self,
         parsed_content: TokensPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
-    ) -> Union[TokenInputs, MultiModalInputs]:
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> TokenInputs | MultiModalInputs:
         prompt_token_ids = self._truncate_inputs(
-            parsed_content["prompt_token_ids"], tokenization_kwargs)
+            parsed_content["prompt_token_ids"], tokenization_kwargs
+        )
 
-        inputs: Union[TokenInputs, MultiModalInputs]
+        inputs: TokenInputs | MultiModalInputs
         if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_token_ids,
                 multi_modal_data,
-                parsed_content.get("mm_processor_kwargs"),
+                parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            inputs = token_inputs(prompt_token_ids=prompt_token_ids)
+            inputs = token_inputs(prompt_token_ids)
 
         if cache_salt := parsed_content.get("cache_salt"):
             inputs["cache_salt"] = cache_salt
@@ -332,18 +367,18 @@ class InputPreprocessor:
     def _process_text(
         self,
         parsed_content: TextPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
-    ) -> Union[TokenInputs, MultiModalInputs]:
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> TokenInputs | MultiModalInputs:
         prompt_text = parsed_content["prompt"]
 
-        inputs: Union[TokenInputs, MultiModalInputs]
+        inputs: TokenInputs | MultiModalInputs
         if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_text,
                 multi_modal_data,
-                parsed_content.get("mm_processor_kwargs"),
+                parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
@@ -352,10 +387,7 @@ class InputPreprocessor:
                 prompt_text,
                 tokenization_kwargs=tokenization_kwargs,
             )
-            inputs = token_inputs(
-                prompt=prompt_text,
-                prompt_token_ids=prompt_token_ids,
-            )
+            inputs = token_inputs(prompt_token_ids)
 
         if cache_salt := parsed_content.get("cache_salt"):
             inputs["cache_salt"] = cache_salt
@@ -365,9 +397,9 @@ class InputPreprocessor:
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -407,18 +439,20 @@ class InputPreprocessor:
     def _build_enc_dec_llm_inputs(
         self,
         encoder_inputs: SingletonInputs,
-        decoder_inputs: Optional[SingletonInputs],
+        decoder_inputs: SingletonInputs | None,
     ) -> EncoderDecoderInputs:
-        if (encoder_inputs["type"] == "embeds"
-                or decoder_inputs and decoder_inputs["type"] == "embeds"):
-            raise ValueError("Embedding inputs are not supported for encoder-"
-                             "decoder models")
+        if (
+            encoder_inputs["type"] == "embeds"
+            or decoder_inputs
+            and decoder_inputs["type"] == "embeds"
+        ):
+            raise ValueError(
+                "Embedding inputs are not supported for encoder-decoder models"
+            )
 
         # Needed for mypy
-        encoder_inputs = cast(Union[TokenInputs, MultiModalInputs],
-                              encoder_inputs)
-        decoder_inputs = cast(Optional[Union[TokenInputs, MultiModalInputs]],
-                              decoder_inputs)
+        encoder_inputs = cast(TokenInputs | MultiModalInputs, encoder_inputs)
+        decoder_inputs = cast(TokenInputs | MultiModalInputs | None, decoder_inputs)
 
         if decoder_inputs is None:
             if self.model_config.hf_config.model_type == "whisper":
@@ -428,16 +462,18 @@ class InputPreprocessor:
                 # overridden by the audio features.
                 dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
             else:
-                dec_token_ids = self._prepare_decoder_input_ids_for_generation(
-                    None)
+                dec_token_ids = self._prepare_decoder_input_ids_for_generation(None)
             decoder_inputs = token_inputs(dec_token_ids)
         else:
             if "multi_modal_data" in decoder_inputs:
-                raise ValueError("Multi-modal decoder inputs of encoder-"
-                                 "decoder models are not supported yet")
+                raise ValueError(
+                    "Multi-modal decoder inputs of encoder-"
+                    "decoder models are not supported yet"
+                )
 
             dec_token_ids = self._prepare_decoder_input_ids_for_generation(
-                decoder_inputs["prompt_token_ids"])
+                decoder_inputs["prompt_token_ids"]
+            )
             decoder_inputs["prompt_token_ids"] = dec_token_ids
 
         return EncoderDecoderInputs(
@@ -447,25 +483,29 @@ class InputPreprocessor:
 
     def _split_enc_dec_mm_inputs(
         self,
-        inputs: Union[SingletonInputs, MultiModalEncDecInputs],
-        decoder_inputs_to_override: Optional[SingletonInputs] = None,
+        inputs: SingletonInputs | MultiModalEncDecInputs,
+        decoder_inputs_to_override: SingletonInputs | None = None,
     ) -> tuple[SingletonInputs, SingletonInputs]:
         """
         For encoder/decoder models only:
         Separate Encoder/Decoder inputs from a MultiModalEncDecInputs
         """
-        if (inputs["type"] == "embeds" or decoder_inputs_to_override
-                and decoder_inputs_to_override["type"] == "embeds"):
-            raise ValueError("Embedding inputs are not supported for encoder-"
-                             "decoder models")
+        if (
+            inputs["type"] == "embeds"
+            or decoder_inputs_to_override
+            and decoder_inputs_to_override["type"] == "embeds"
+        ):
+            raise ValueError(
+                "Embedding inputs are not supported for encoder-decoder models"
+            )
 
         # Needed for mypy
         inputs = cast(
-            Union[TokenInputs, MultiModalInputs, MultiModalEncDecInputs],
+            TokenInputs | MultiModalInputs | MultiModalEncDecInputs,
             inputs,
         )
         decoder_inputs_to_override = cast(
-            Optional[Union[TokenInputs, MultiModalInputs]],
+            TokenInputs | MultiModalInputs | None,
             decoder_inputs_to_override,
         )
 
@@ -473,22 +513,19 @@ class InputPreprocessor:
         decoder_inputs: SingletonInputs
 
         if inputs["type"] == "multimodal":  # Multimodal data inputs
-            if not ("encoder_prompt" in inputs
-                    and "encoder_prompt_token_ids" in inputs):
-                raise RuntimeError("You should register an encoder-decoder "
-                                   "multi-modal processor for encoder-decoder "
-                                   "models.")
+            if "encoder_prompt_token_ids" not in inputs:
+                raise RuntimeError(
+                    "You should register an encoder-decoder "
+                    "multi-modal processor for encoder-decoder "
+                    "models."
+                )
             inputs = cast(MultiModalEncDecInputs, inputs)
 
-            encoder_inputs = token_inputs(
-                prompt=inputs["encoder_prompt"],
-                prompt_token_ids=inputs["encoder_prompt_token_ids"],
-            )
+            encoder_inputs = token_inputs(inputs["encoder_prompt_token_ids"])
 
             decoder_prompt_inputs = decoder_inputs_to_override or inputs
             decoder_inputs = MultiModalInputs(
                 type="multimodal",
-                prompt=decoder_prompt_inputs.get("prompt", ""),
                 prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"],
                 mm_kwargs=inputs["mm_kwargs"],
                 mm_hashes=inputs["mm_hashes"],
@@ -498,7 +535,7 @@ class InputPreprocessor:
                 decoder_inputs["cache_salt"] = cache_salt
 
         elif inputs["type"] == "token":  # Text-only inputs
-            encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
+            encoder_inputs = token_inputs(prompt_token_ids=[])
             decoder_inputs = decoder_inputs_to_override or inputs
         else:
             assert_never(inputs)  # type: ignore[arg-type]
@@ -508,9 +545,9 @@ class InputPreprocessor:
     def _process_encoder_decoder_prompt(
         self,
         prompt: PromptType,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -546,34 +583,36 @@ class InputPreprocessor:
           instance
         """
         encoder_inputs: SingletonInputs
-        decoder_inputs: Optional[SingletonInputs]
+        decoder_inputs: SingletonInputs | None
 
         if is_explicit_encoder_decoder_prompt(prompt):
+            # `cast` is needed for mypy, but not pyright
+            prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt)
             encoder_inputs = self._prompt_to_llm_inputs(
-                prompt["encoder_prompt"],
+                prompt_["encoder_prompt"],
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
-            if (decoder_input := prompt["decoder_prompt"]) is None:
+            if (decoder_input := prompt_["decoder_prompt"]) is None:
                 decoder_inputs = None
             else:
                 decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
             # For multimodal model, override decoder prompt from processor
             # with explicit decoder prompt.
             if self.model_config.is_multimodal_model:
-                encoder_inputs, decoder_inputs = (
-                    self._split_enc_dec_mm_inputs(encoder_inputs,
-                                                  decoder_inputs))
+                encoder_inputs, decoder_inputs = self._split_enc_dec_mm_inputs(
+                    encoder_inputs, decoder_inputs
+                )
         else:
+            # `cast` is needed for mypy, but not pyright
             inputs = self._prompt_to_llm_inputs(
-                prompt,
+                cast(SingletonPrompt, prompt),
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
             if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
-                encoder_inputs, decoder_inputs = (
-                    self._split_enc_dec_mm_inputs(inputs))
+                encoder_inputs, decoder_inputs = self._split_enc_dec_mm_inputs(inputs)
             else:
                 encoder_inputs = inputs
                 decoder_inputs = None
@@ -585,17 +624,18 @@ class InputPreprocessor:
         prompt_inputs: DecoderOnlyInputs,
     ) -> DecoderOnlyInputs:
         if "prompt_token_ids" in prompt_inputs:
-            prompt_inputs = cast(Union[TokenInputs, MultiModalInputs],
-                                 prompt_inputs)  # Needed for mypy
+            prompt_inputs = cast(
+                TokenInputs | MultiModalInputs, prompt_inputs
+            )  # Needed for mypy
 
         return prompt_inputs
 
     def _process_decoder_only_prompt(
         self,
         prompt: SingletonPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -619,14 +659,13 @@ class InputPreprocessor:
 
         return self._build_decoder_only_llm_inputs(prompt_comps)
 
-    def preprocess(
+    def _preprocess(
         self,
         prompt: PromptType,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> ProcessorInputs:
-        """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder.
@@ -637,28 +676,52 @@ class InputPreprocessor:
             )
 
         if is_explicit_encoder_decoder_prompt(prompt):
-            raise ValueError("Cannot pass encoder-decoder prompt "
-                             "to decoder-only models")
+            raise ValueError(
+                "Cannot pass encoder-decoder prompt to decoder-only models"
+            )
 
         # Decoder-only operation
+        # `cast` is needed for mypy, but not pyright
         return self._process_decoder_only_prompt(
-            prompt,
+            cast(SingletonPrompt, prompt),
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=mm_uuids,
         )
 
-    def clear_cache(self) -> None:
+    def preprocess(
+        self,
+        prompt: PromptType,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        *,
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> ProcessorInputs:
+        """Preprocess the input prompt."""
+        res = self._preprocess(
+            prompt,
+            tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
+
+        if self.mm_processor_cache and self.mm_cache_stats is not None:
+            delta = self.mm_processor_cache.make_stats(delta=True)
+            self.mm_cache_stats.requests += 1
+            self.mm_cache_stats.queries += delta.total
+            self.mm_cache_stats.hits += delta.hits
+
+        return res
+
+    def stat_mm_cache(self) -> MultiModalCacheStats | None:
+        mm_cache_stats = self.mm_cache_stats
+        if mm_cache_stats is None:
+            return None
+
+        self.mm_cache_stats = MultiModalCacheStats()
+
+        return mm_cache_stats
+
+    def clear_mm_cache(self) -> None:
         if self.mm_processor_cache is not None:
             self.mm_processor_cache.clear_cache()
 
-
-# Helper function to validate that a nested dictionary contains
-# only strings or list of strings as the leaf values.
-def contains_only_strings(obj: object):
-    if isinstance(obj, str):
-        return True
-    if isinstance(obj, list):
-        return all(isinstance(x, str) for x in obj)
-    if isinstance(obj, dict):
-        return all(contains_only_strings(v) for v in obj.values())
-    return False
+        if self.mm_cache_stats is not None:
+            self.mm_cache_stats.reset = True
diff --git a/vllm/logger.py b/vllm/logger.py
index 2861e0f1686c4ecbe830c2b5152e31fa550207e2..9341008296843ecdc2b1d090df272295b24316c6 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Logging configuration for vLLM."""
+
 import datetime
 import json
 import logging
@@ -12,7 +13,7 @@ from logging import Logger
 from logging.config import dictConfig
 from os import path
 from types import MethodType
-from typing import Any, Optional, cast
+from typing import Any, Literal, cast
 
 import vllm.envs as envs
 
@@ -22,8 +23,10 @@ VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
 VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
 VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM
 
-_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
-           "[%(fileinfo)s:%(lineno)d] %(message)s")
+_FORMAT = (
+    f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
+    "[%(fileinfo)s:%(lineno)d] %(message)s"
+)
 _DATE_FORMAT = "%m-%d %H:%M:%S"
 
 DEFAULT_LOGGING_CONFIG = {
@@ -50,26 +53,43 @@ DEFAULT_LOGGING_CONFIG = {
         },
     },
     "version": 1,
-    "disable_existing_loggers": False
+    "disable_existing_loggers": False,
 }
 
 
 @lru_cache
 def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
-    # Set the stacklevel to 2 to print the original caller's line info
-    logger.debug(msg, *args, stacklevel=2)
+    # Set the stacklevel to 3 to print the original caller's line info
+    logger.debug(msg, *args, stacklevel=3)
 
 
 @lru_cache
 def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
-    # Set the stacklevel to 2 to print the original caller's line info
-    logger.info(msg, *args, stacklevel=2)
+    # Set the stacklevel to 3 to print the original caller's line info
+    logger.info(msg, *args, stacklevel=3)
 
 
 @lru_cache
 def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
-    # Set the stacklevel to 2 to print the original caller's line info
-    logger.warning(msg, *args, stacklevel=2)
+    # Set the stacklevel to 3 to print the original caller's line info
+    logger.warning(msg, *args, stacklevel=3)
+
+
+LogScope = Literal["process", "global", "local"]
+
+
+def _should_log_with_scope(scope: LogScope) -> bool:
+    """Decide whether to log based on scope"""
+    if scope == "global":
+        from vllm.distributed.parallel_state import is_global_first_rank
+
+        return is_global_first_rank()
+    if scope == "local":
+        from vllm.distributed.parallel_state import is_local_first_rank
+
+        return is_local_first_rank()
+    # default "process" scope: always log
+    return True
 
 
 class _VllmLogger(Logger):
@@ -81,33 +101,43 @@ class _VllmLogger(Logger):
         `intel_extension_for_pytorch.utils._logger`.
     """
 
-    def debug_once(self, msg: str, *args: Hashable) -> None:
+    def debug_once(
+        self, msg: str, *args: Hashable, scope: LogScope = "process"
+    ) -> None:
         """
         As [`debug`][logging.Logger.debug], but subsequent calls with
         the same message are silently dropped.
         """
+        if not _should_log_with_scope(scope):
+            return
         _print_debug_once(self, msg, *args)
 
-    def info_once(self, msg: str, *args: Hashable) -> None:
+    def info_once(self, msg: str, *args: Hashable, scope: LogScope = "process") -> None:
         """
         As [`info`][logging.Logger.info], but subsequent calls with
         the same message are silently dropped.
         """
+        if not _should_log_with_scope(scope):
+            return
         _print_info_once(self, msg, *args)
 
-    def warning_once(self, msg: str, *args: Hashable) -> None:
+    def warning_once(
+        self, msg: str, *args: Hashable, scope: LogScope = "process"
+    ) -> None:
         """
         As [`warning`][logging.Logger.warning], but subsequent calls with
         the same message are silently dropped.
         """
+        if not _should_log_with_scope(scope):
+            return
         _print_warning_once(self, msg, *args)
 
 
 # Pre-defined methods mapping to avoid repeated dictionary creation
 _METHODS_TO_PATCH = {
-    "debug_once": _print_debug_once,
-    "info_once": _print_info_once,
-    "warning_once": _print_warning_once,
+    "debug_once": _VllmLogger.debug_once,
+    "info_once": _VllmLogger.info_once,
+    "warning_once": _VllmLogger.warning_once,
 }
 
 
@@ -119,7 +149,8 @@ def _configure_vllm_root_logger() -> None:
             "VLLM_CONFIGURE_LOGGING evaluated to false, but "
             "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
             "implies VLLM_CONFIGURE_LOGGING. Please enable "
-            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH."
+        )
 
     if VLLM_CONFIGURE_LOGGING:
         logging_config = DEFAULT_LOGGING_CONFIG
@@ -128,13 +159,16 @@ def _configure_vllm_root_logger() -> None:
         if not path.exists(VLLM_LOGGING_CONFIG_PATH):
             raise RuntimeError(
                 "Could not load logging config. File does not exist: %s",
-                VLLM_LOGGING_CONFIG_PATH)
+                VLLM_LOGGING_CONFIG_PATH,
+            )
         with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
             custom_config = json.loads(file.read())
 
         if not isinstance(custom_config, dict):
-            raise ValueError("Invalid logging config. Expected dict, got %s.",
-                             type(custom_config).__name__)
+            raise ValueError(
+                "Invalid logging config. Expected dict, got %s.",
+                type(custom_config).__name__,
+            )
         logging_config = custom_config
 
     for formatter in logging_config.get("formatters", {}).values():
@@ -168,7 +202,7 @@ logger = init_logger(__name__)
 
 
 def _trace_calls(log_path, root_dir, frame, event, arg=None):
-    if event in ['call', 'return']:
+    if event in ["call", "return"]:
         # Extract the filename, line number, function name, and the code object
         filename = frame.f_code.co_filename
         lineno = frame.f_lineno
@@ -188,26 +222,29 @@ def _trace_calls(log_path, root_dir, frame, event, arg=None):
                 last_filename = ""
                 last_lineno = 0
                 last_func_name = ""
-            with open(log_path, 'a') as f:
+            with open(log_path, "a") as f:
                 ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
-                if event == 'call':
-                    f.write(f"{ts} Call to"
-                            f" {func_name} in {filename}:{lineno}"
-                            f" from {last_func_name} in {last_filename}:"
-                            f"{last_lineno}\n")
+                if event == "call":
+                    f.write(
+                        f"{ts} Call to"
+                        f" {func_name} in {filename}:{lineno}"
+                        f" from {last_func_name} in {last_filename}:"
+                        f"{last_lineno}\n"
+                    )
                 else:
-                    f.write(f"{ts} Return from"
-                            f" {func_name} in {filename}:{lineno}"
-                            f" to {last_func_name} in {last_filename}:"
-                            f"{last_lineno}\n")
+                    f.write(
+                        f"{ts} Return from"
+                        f" {func_name} in {filename}:{lineno}"
+                        f" to {last_func_name} in {last_filename}:"
+                        f"{last_lineno}\n"
+                    )
         except NameError:
             # modules are deleted during shutdown
             pass
     return partial(_trace_calls, log_path, root_dir)
 
 
-def enable_trace_function_call(log_file_path: str,
-                               root_dir: Optional[str] = None):
+def enable_trace_function_call(log_file_path: str, root_dir: str | None = None):
     """
     Enable tracing of every function call in code under `root_dir`.
     This is useful for debugging hangs or crashes.
@@ -221,7 +258,8 @@ def enable_trace_function_call(log_file_path: str,
     logger.warning(
         "VLLM_TRACE_FUNCTION is enabled. It will record every"
         " function executed by Python. This will slow down the code. It "
-        "is suggested to be used for debugging hang or crashes only.")
+        "is suggested to be used for debugging hang or crashes only."
+    )
     logger.info("Trace frame log is saved to %s", log_file_path)
     if root_dir is None:
         # by default, this is the vllm root directory
diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py
index ad89638e106141933123eb57549795020c3e408a..cb289d04e3f40d9fb37437024d65972a13f9233b 100644
--- a/vllm/logging_utils/dump_input.py
+++ b/vllm/logging_utils/dump_input.py
@@ -4,7 +4,6 @@
 import contextlib
 import enum
 import json
-from typing import Optional
 
 import torch
 
@@ -21,9 +20,10 @@ def prepare_object_to_dump(obj) -> str:
     if isinstance(obj, str):
         return f"'{obj}'"  # Double quotes
     elif isinstance(obj, dict):
-        dict_str = ', '.join({f'{str(k)}: {prepare_object_to_dump(v)}' \
-            for k, v in obj.items()})
-        return f'{{{dict_str}}}'
+        dict_str = ", ".join(
+            {f"{str(k)}: {prepare_object_to_dump(v)}" for k, v in obj.items()}
+        )
+        return f"{{{dict_str}}}"
     elif isinstance(obj, list):
         return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]"
     elif isinstance(obj, set):
@@ -36,15 +36,14 @@ def prepare_object_to_dump(obj) -> str:
     elif isinstance(obj, torch.Tensor):
         # We only print the 'draft' of the tensor to not expose sensitive data
         # and to get some metadata in case of CUDA runtime crashed
-        return (f"Tensor(shape={obj.shape}, "
-                f"device={obj.device},"
-                f"dtype={obj.dtype})")
-    elif hasattr(obj, 'anon_repr'):
+        return f"Tensor(shape={obj.shape}, device={obj.device},dtype={obj.dtype})"
+    elif hasattr(obj, "anon_repr"):
         return obj.anon_repr()
-    elif hasattr(obj, '__dict__'):
+    elif hasattr(obj, "__dict__"):
         items = obj.__dict__.items()
-        dict_str = ', '.join([f'{str(k)}={prepare_object_to_dump(v)}' \
-            for k, v in items])
+        dict_str = ", ".join(
+            [f"{str(k)}={prepare_object_to_dump(v)}" for k, v in items]
+        )
         return f"{type(obj).__name__}({dict_str})"
     else:
         # Hacky way to make sure we can serialize the object in JSON format
@@ -54,18 +53,22 @@ def prepare_object_to_dump(obj) -> str:
             return repr(obj)
 
 
-def dump_engine_exception(config: VllmConfig,
-                          scheduler_output: SchedulerOutput,
-                          scheduler_stats: Optional[SchedulerStats]):
+def dump_engine_exception(
+    config: VllmConfig,
+    scheduler_output: SchedulerOutput,
+    scheduler_stats: SchedulerStats | None,
+):
     # NOTE: ensure we can log extra info without risking raises
     # unexpected errors during logging
     with contextlib.suppress(Exception):
         _dump_engine_exception(config, scheduler_output, scheduler_stats)
 
 
-def _dump_engine_exception(config: VllmConfig,
-                           scheduler_output: SchedulerOutput,
-                           scheduler_stats: Optional[SchedulerStats]):
+def _dump_engine_exception(
+    config: VllmConfig,
+    scheduler_output: SchedulerOutput,
+    scheduler_stats: SchedulerStats | None,
+):
     logger.error(
         "Dumping input data for V1 LLM engine (v%s) with config: %s, ",
         VLLM_VERSION,
@@ -73,8 +76,7 @@ def _dump_engine_exception(config: VllmConfig,
     )
     try:
         dump_obj = prepare_object_to_dump(scheduler_output)
-        logger.error("Dumping scheduler output for model execution: %s",
-                     dump_obj)
+        logger.error("Dumping scheduler output for model execution: %s", dump_obj)
         if scheduler_stats:
             logger.error("Dumping scheduler stats: %s", scheduler_stats)
     except Exception:
diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py
index 004b79f3ea6e274431daa619b4a25471751e83bb..02ba308e187963c3ffe11f0305713de0eb301bb9 100644
--- a/vllm/logging_utils/formatter.py
+++ b/vllm/logging_utils/formatter.py
@@ -18,7 +18,6 @@ class NewLineFormatter(logging.Formatter):
             self.root_dir = Path(__file__).resolve().parent.parent.parent
 
     def format(self, record):
-
         def shrink_path(relpath: Path) -> str:
             """
             Shortens a file path for logging display:
@@ -62,8 +61,7 @@ class NewLineFormatter(logging.Formatter):
             abs_path = getattr(record, "pathname", None)
             if abs_path:
                 try:
-                    relpath = Path(abs_path).resolve().relative_to(
-                        self.root_dir)
+                    relpath = Path(abs_path).resolve().relative_to(self.root_dir)
                 except Exception:
                     relpath = Path(record.filename)
             else:
diff --git a/vllm/logging_utils/log_time.py b/vllm/logging_utils/log_time.py
index 013dd144beaf84ddd6cbf4cfe219ee153ce2d6bf..9e94f463711d3a16f2f35d2ff8a2f1b4ecd7f9e5 100644
--- a/vllm/logging_utils/log_time.py
+++ b/vllm/logging_utils/log_time.py
@@ -15,15 +15,17 @@ def logtime(logger, msg=None):
     """
 
     def _inner(func):
-
         @functools.wraps(func)
         def _wrapper(*args, **kwargs):
             start = time.perf_counter()
             result = func(*args, **kwargs)
             elapsed = time.perf_counter() - start
 
-            prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
-                if msg is None else msg
+            prefix = (
+                f"Function '{func.__module__}.{func.__qualname__}'"
+                if msg is None
+                else msg
+            )
             logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
             return result
 
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index 48f7e7495b17ddeacae8083db43d0b2465cca194..7b6a6528e20e86ad9f2852588feeab529a163779 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-from typing import Callable, Union
+from collections.abc import Callable, Sequence
+from typing import TypeAlias
 
 import torch
 
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-LogitsProcessor = Union[
-    Callable[[list[int], torch.Tensor], torch.Tensor],
-    Callable[[list[int], list[int], torch.Tensor], torch.Tensor],
-]
+LogitsProcessor: TypeAlias = (
+    Callable[[list[int], torch.Tensor], torch.Tensor]
+    | Callable[[list[int], list[int], torch.Tensor], torch.Tensor]
+)
 """LogitsProcessor is a function that takes a list
 of previously generated tokens, the logits tensor
 for the next token and, optionally, prompt tokens as a
@@ -19,8 +19,8 @@ to sample from."""
 
 
 def get_bad_words_logits_processors(
-        bad_words: list[str],
-        tokenizer: AnyTokenizer) -> list[LogitsProcessor]:
+    bad_words: list[str], tokenizer: AnyTokenizer
+) -> list[LogitsProcessor]:
     bad_words_ids: list[list[int]] = list()
 
     for bad_word in bad_words:
@@ -31,15 +31,15 @@ def get_bad_words_logits_processors(
             prefix = " " if add_prefix_space else ""
             prompt = prefix + bad_word.lstrip()
 
-            prompt_token_ids = tokenizer.encode(text=prompt,
-                                                add_special_tokens=False)
+            prompt_token_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
 
             # If no space at the beginning
             # or if prefix space produces a new word token
             if (not add_prefix_space) or (
-                    add_prefix_space
-                    and prompt_token_ids[0] != bad_words_ids[-1][0]
-                    and len(prompt_token_ids) == len(bad_words_ids[-1])):
+                add_prefix_space
+                and prompt_token_ids[0] != bad_words_ids[-1][0]
+                and len(prompt_token_ids) == len(bad_words_ids[-1])
+            ):
                 bad_words_ids.append(prompt_token_ids)
 
     return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)]
@@ -78,8 +78,9 @@ class NoBadWordsLogitsProcessor:
             assert len(actual_prefix) == len(expected_prefix)
 
             is_match = tuple(actual_prefix) == tuple(expected_prefix)
-            last_token_bias[last_token_id] += (self._SMALLEST_LOGIT if is_match
-                                               else self._NEUTRAL_LOGIT)
+            last_token_bias[last_token_id] += (
+                self._SMALLEST_LOGIT if is_match else self._NEUTRAL_LOGIT
+            )
 
         logits = logits + self.word_bias + last_token_bias
 
@@ -93,9 +94,9 @@ class NoBadWordsLogitsProcessor:
 
         self._check_token_ids_bounds(vocab_size=vocab_size)
 
-        self.word_bias = torch.zeros((vocab_size, ),
-                                     dtype=torch.float,
-                                     device=logits.device)
+        self.word_bias = torch.zeros(
+            (vocab_size,), dtype=torch.float, device=logits.device
+        )
 
         for bad_word_ids in self.bad_words_ids:
             if len(bad_word_ids) == 1:
@@ -116,4 +117,5 @@ class NoBadWordsLogitsProcessor:
                 f" but the following tokens"
                 f" were specified as bad: {invalid_token_ids}."
                 f" All token id values should be integers satisfying:"
-                f" 0 <= token_id < {vocab_size}.")
+                f" 0 <= token_id < {vocab_size}."
+            )
diff --git a/vllm/logprobs.py b/vllm/logprobs.py
index e58ca142c00a4dd07f479c3f2f9e34786a0df36d..a34398db2c960d11ef7d555c12fe2f21e3b9c68a 100644
--- a/vllm/logprobs.py
+++ b/vllm/logprobs.py
@@ -1,7 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
-from typing import Optional
+import itertools
+from collections.abc import Iterable, Iterator, MutableSequence
+from dataclasses import dataclass, field
+from typing import overload
+
+import vllm.envs as envs
 
 
 # We use dataclass for now because it is used for
@@ -16,13 +20,189 @@ class Logprob:
         rank: The vocab rank of chosen token (>=1)
         decoded_token: The decoded chosen token index
     """
+
     logprob: float
-    rank: Optional[int] = None
-    decoded_token: Optional[str] = None
+    rank: int | None = None
+    decoded_token: str | None = None
+
+
+LogprobsOnePosition = dict[int, Logprob]
+
+
+@dataclass
+class FlatLogprobs(MutableSequence[LogprobsOnePosition]):
+    """
+    Flat logprobs of a request into multiple primitive type lists.
+
+    Compared to list[dict[int, Logprob]], this data structure reduced GC
+    overhead significantly. As it flattened logprob information for
+    all positions and ranks in to multiple primitive type lists (i.e.
+    logprobs, token_ids, ranks per token_ids, decoded_tokens).
+    So regardless of the sequence length and top_logprobs setup,
+    FlatLogprobs would only introduce a constant amount of objects.
+
+    As each position might contains different amount of ranks,
+    start_indices_per_position would be used to access the logprob ranges
+    for different positions.
+
+    NOTE: To reduce the migration overhead and improve backward compatibility,
+    we support the key Sequence APIs of list, so it could act as
+    list[LogprobsOnePosition]
+    """
+
+    # Start / end indices to indicate the range of logprobs for each position.
+    start_indices: list[int] = field(default_factory=list)
+    end_indices: list[int] = field(default_factory=list)
+
+    # Flatten Logprob information for (each position, rank).
+    # For position <i>, the logprobs are ranged
+    # from self.start_indices[i] to self.end_indices[i] (exclusive).
+    token_ids: list[int] = field(default_factory=list)
+    logprobs: list[float] = field(default_factory=list)
+    ranks: list[int | None] = field(default_factory=list)
+    decoded_tokens: list[str | None] = field(default_factory=list)
+
+    def append(self, logprobs_one_position: LogprobsOnePosition | None) -> None:
+        """Appends the container with logprobs for the next position"""
+        self.start_indices.append(len(self.logprobs))
+        if logprobs_one_position:
+            for token_id, logprob in logprobs_one_position.items():
+                self.token_ids.append(token_id)
+                self.logprobs.append(logprob.logprob)
+                self.ranks.append(logprob.rank)
+                self.decoded_tokens.append(logprob.decoded_token)
+        self.end_indices.append(len(self.logprobs))
+
+    def append_fast(
+        self,
+        token_ids: list[int],
+        logprobs: list[float],
+        ranks: itertools.chain[int],
+        decoded_tokens: Iterable[str | None],
+    ) -> None:
+        """
+        Appends logprobs for the next position without creating
+        the intermediate logprob dictionary.
+        """
+        self.start_indices.append(len(self.logprobs))
+        for token_id, logprob, rank, decoded_token in zip(
+            token_ids, logprobs, ranks, decoded_tokens
+        ):
+            self.token_ids.append(token_id)
+            self.logprobs.append(logprob)
+            self.ranks.append(rank)
+            self.decoded_tokens.append(decoded_token)
+        self.end_indices.append(len(self.logprobs))
+
+    def extend(self, logprobs_multi_positions) -> None:
+        """Extends the container with logprobs for the next multiple positions"""
+        for logprobs_one_position in logprobs_multi_positions:
+            self.append(logprobs_one_position)
+
+    def __len__(self) -> int:
+        """Gets number of positions stored in the container"""
+        return len(self.start_indices)
+
+    @overload
+    def __getitem__(self, position: int) -> LogprobsOnePosition: ...
+
+    @overload
+    def __getitem__(self, s: slice, /) -> "FlatLogprobs": ...
+
+    def __getitem__(self, index: int | slice):
+        """Extracts logprobs of a given position or slice"""
+        if isinstance(index, int):
+            return {
+                self.token_ids[i]: Logprob(
+                    logprob=self.logprobs[i],
+                    rank=self.ranks[i],
+                    decoded_token=self.decoded_tokens[i],
+                )
+                for i in range(self.start_indices[index], self.end_indices[index])
+            }
+        elif isinstance(index, slice):
+            min_index = self.start_indices[index][0]
+            max_index = self.end_indices[index][-1]
+            return FlatLogprobs(
+                # Shift updated start_indices and end_indices to
+                # be 0-indexed
+                start_indices=[i - min_index for i in self.start_indices[index]],
+                end_indices=[i - min_index for i in self.end_indices[index]],
+                token_ids=self.token_ids[min_index:max_index],
+                logprobs=self.logprobs[min_index:max_index],
+                ranks=self.ranks[min_index:max_index],
+                decoded_tokens=self.decoded_tokens[min_index:max_index],
+            )
+        else:
+            raise TypeError(f"Invalid index type: {type(index)}")
+
+    def __setitem__(self, item, value) -> None:
+        raise TypeError("Cannot set logprobs in FlatLogprobs")
+
+    def __delitem__(self, item) -> None:
+        raise TypeError("Cannot delete logprobs from FlatLogprobs")
+
+    def insert(self, item) -> None:
+        raise TypeError("Cannot insert logprobs to FlatLogprobs")
+
+    def __iter__(self) -> Iterator[LogprobsOnePosition]:
+        """
+        Iterates the container and yields LogprobsOnePosition for
+        each position.
+        """
+        for i in range(0, len(self.start_indices)):
+            yield self.__getitem__(i)
 
 
 # {token_id -> logprob} per each sequence group. None if the corresponding
 # sequence group doesn't require prompt logprob.
-PromptLogprobs = list[Optional[dict[int, Logprob]]]
+PromptLogprobs = FlatLogprobs | list[LogprobsOnePosition | None]
 # {token_id -> logprob} for each sequence group.
-SampleLogprobs = list[dict[int, Logprob]]
+SampleLogprobs = FlatLogprobs | list[LogprobsOnePosition]
+
+
+def create_prompt_logprobs() -> PromptLogprobs:
+    """Creates a container to store prompt logprobs for a request"""
+    logprobs = FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else []
+    # NOTE: logprob of first prompt token is None.
+    logprobs.append(None)
+    return logprobs
+
+
+def create_sample_logprobs() -> SampleLogprobs:
+    """Creates a container to store decode logprobs for a request"""
+    return FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else []
+
+
+def append_logprobs_for_next_position(
+    request_logprobs: PromptLogprobs | SampleLogprobs,
+    token_ids: list[int],
+    logprobs: list[float],
+    decoded_tokens: Iterable[str | None],
+    rank: int,
+    num_logprobs: int,
+) -> None:
+    """Appends logprobs for the next position"""
+    if num_logprobs == -1:
+        num_logprobs = len(logprobs)
+    # We do not need a special case for the sampled token
+    # being in the topk, since inserting duplicated data
+    # into a dictionary twice is the same as doing it once.
+    topk_ranks = range(1, num_logprobs + 1)
+    ranks = itertools.chain((rank,), topk_ranks)
+
+    if isinstance(request_logprobs, FlatLogprobs):
+        request_logprobs.append_fast(token_ids, logprobs, ranks, decoded_tokens)
+    else:
+        request_logprobs.append(
+            {
+                token_id: Logprob(
+                    logprob=logprob,
+                    rank=rank,
+                    decoded_token=token,
+                )
+                for token_id, logprob, rank, token in zip(
+                    token_ids, logprobs, ranks, decoded_tokens
+                )
+            }
+        )
diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py
index d3bb145dc7bf83dd9ca196bc209fccd64b75d96b..8a4f5ff175d4fdae1cc349fd529698643694f2f5 100644
--- a/vllm/lora/layers/__init__.py
+++ b/vllm/lora/layers/__init__.py
@@ -2,18 +2,24 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.lora.layers.base import BaseLayerWithLoRA
 from vllm.lora.layers.column_parallel_linear import (
-    ColumnParallelLinearWithLoRA, ColumnParallelLinearWithShardedLoRA,
+    ColumnParallelLinearWithLoRA,
+    ColumnParallelLinearWithShardedLoRA,
     MergedColumnParallelLinearWithLoRA,
-    MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithLoRA,
-    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithLoRA,
-    QKVParallelLinearWithShardedLoRA)
+    MergedColumnParallelLinearWithShardedLoRA,
+    MergedQKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithShardedLoRA,
+    QKVParallelLinearWithLoRA,
+    QKVParallelLinearWithShardedLoRA,
+)
+from vllm.lora.layers.fused_moe import FusedMoEWithLoRA
 from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA
 from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
 from vllm.lora.layers.row_parallel_linear import (
-    RowParallelLinearWithLoRA, RowParallelLinearWithShardedLoRA)
+    RowParallelLinearWithLoRA,
+    RowParallelLinearWithShardedLoRA,
+)
 from vllm.lora.layers.utils import LoRAMapping
-from vllm.lora.layers.vocal_parallel_embedding import (
-    VocabParallelEmbeddingWithLoRA)
+from vllm.lora.layers.vocal_parallel_embedding import VocabParallelEmbeddingWithLoRA
 
 __all__ = [
     "BaseLayerWithLoRA",
@@ -31,4 +37,5 @@ __all__ = [
     "RowParallelLinearWithShardedLoRA",
     "ReplicatedLinearWithLoRA",
     "LoRAMapping",
+    "FusedMoEWithLoRA",
 ]
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py
index a80a033e39b4044121ba2158ff4d6014f2382c36..0c7e806848892f10f7811ab1e20a47571b13eb09 100644
--- a/vllm/lora/layers/base.py
+++ b/vllm/lora/layers/base.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 import torch.nn as nn
@@ -14,16 +14,15 @@ if TYPE_CHECKING:
 
 
 class BaseLayerWithLoRA(nn.Module):
-
     def slice_lora_a(
-        self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
-    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        self, lora_a: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
         """Slice lora a if splitting for tensor parallelism."""
         ...
 
     def slice_lora_b(
-        self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
-    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        self, lora_b: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
         """Slice lora b if splitting with tensor parallelism."""
         ...
 
@@ -31,7 +30,7 @@ class BaseLayerWithLoRA(nn.Module):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         """Initializes lora matrices."""
         ...
@@ -45,8 +44,7 @@ class BaseLayerWithLoRA(nn.Module):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
     ):
         """Overwrites lora tensors at index."""
         ...
@@ -63,7 +61,7 @@ class BaseLayerWithLoRA(nn.Module):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         """Returns True if the layer can be replaced by this LoRA layer."""
         raise NotImplementedError
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index ed294b0aedaf485c9dec2034439bbc04b5b42155..3db4165e2017614d84a7e1215b87a3e90a98df4c 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -1,17 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, cast
 
 import torch
 from transformers import PretrainedConfig
 
 from vllm.config.lora import LoRAConfig
 from vllm.distributed.utils import divide
-# yapf: disable
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearBase, ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    LinearBase,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.platforms import current_platform
 
 from .base import BaseLayerWithLoRA
@@ -19,7 +20,6 @@ from .utils import _get_lora_device
 
 
 class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
-
     def __init__(self, base_layer: LinearBase):
         super().__init__()
         self.base_layer = base_layer
@@ -28,7 +28,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         self.tp_size = self.base_layer.tp_size
         self.tp_rank = self.base_layer.tp_rank
         self.device = _get_lora_device(self.base_layer)
-        self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None
         self.output_slices: tuple[int, ...]
         self.output_size: int
         self.n_slices: int
@@ -37,7 +36,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         self.lora_config = lora_config
         #
@@ -46,16 +45,20 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
             lora_b_out_size = self.output_size
 
         elif isinstance(self.base_layer, ColumnParallelLinear):
-            lora_a_out_size = (lora_config.max_lora_rank if
-                               not lora_config.fully_sharded_loras else divide(
-                                   lora_config.max_lora_rank, self.tp_size))
+            lora_a_out_size = (
+                lora_config.max_lora_rank
+                if not lora_config.fully_sharded_loras
+                else divide(lora_config.max_lora_rank, self.tp_size)
+            )
             lora_b_out_size = self.output_size
 
         elif isinstance(self.base_layer, RowParallelLinear):
             lora_a_out_size = lora_config.max_lora_rank
-            lora_b_out_size = (self.output_size if
-                               not lora_config.fully_sharded_loras else divide(
-                                   self.output_size, self.tp_size))
+            lora_b_out_size = (
+                self.output_size
+                if not lora_config.fully_sharded_loras
+                else divide(self.output_size, self.tp_size)
+            )
         else:
             raise NotImplementedError
 
@@ -67,7 +70,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
                 self.input_size,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            ) for _ in range(self.n_slices))
+            )
+            for _ in range(self.n_slices)
+        )
         self.lora_b_stacked = tuple(
             torch.zeros(
                 max_loras,
@@ -76,81 +81,56 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
                 lora_config.max_lora_rank,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            ) for _ in range(self.n_slices))
-        if lora_config.bias_enabled:
-            lora_bias_out_size = lora_b_out_size
-            self.lora_bias_stacked = tuple(
-                torch.zeros(
-                    max_loras,
-                    1,
-                    lora_bias_out_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ) for _ in range(self.n_slices))
-        self.output_slices = (self.lora_b_stacked[0].shape[2], )
+            )
+            for _ in range(self.n_slices)
+        )
+        self.output_slices = (self.lora_b_stacked[0].shape[2],)
 
     def reset_lora(self, index: int):
         for s_index in range(self.n_slices):
             self.lora_a_stacked[s_index][index] = 0
             self.lora_b_stacked[s_index][index] = 0
-            if self.lora_config.bias_enabled:
-                # Make mypy happy
-                self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                              self.lora_bias_stacked)
-                self.lora_bias_stacked[s_index][index] = 0
 
     def set_lora(
         self,
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        lora_bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
     ):
         # Except for QKVParallelLinearWithLoRA and
         # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
         # store weights in a tuple of size 1. These two layers will
         # override this function.
-        assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
-                self.n_slices == 1)
+        assert (
+            len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
+        )
 
         self.reset_lora(index)
         if self.tp_size > 1:
             lora_a = self.slice_lora_a(lora_a)
             lora_b = self.slice_lora_b(lora_b)
-            if lora_bias is not None:
-                lora_bias = self.slice_bias(lora_bias)
-
-        self.lora_a_stacked[0][index,
-                               0, :lora_a.shape[0], :lora_a.shape[1]].copy_(
-                                   lora_a, non_blocking=True)
-        self.lora_b_stacked[0][index,
-                               0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
-                                   lora_b, non_blocking=True)
-        if lora_bias is not None:
-
-            self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                          self.lora_bias_stacked)
-            assert len(self.lora_bias_stacked)
-            self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
-                lora_bias, non_blocking=True)
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
+            lora_a, non_blocking=True
+        )
+        self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
+            lora_b, non_blocking=True
+        )
+
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
-        # In transformers backend, x and output have extra batch dimension like
+        # In Transformers modeling backend, x and output have extra batch dimension like
         # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
         # therefore we need to flatten the batch dimensions.
         if x.ndim == 3 and output.ndim == 3:
             output = output.flatten(0, 1)
             x = x.flatten(0, 1)
 
-        lora_output: Optional[
-            torch.Tensor] = self.punica_wrapper.add_lora_linear(
-                output, x, self.lora_a_stacked, self.lora_b_stacked,
-                self.lora_bias_stacked, 1.0, self.output_slices)
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_linear(
+            output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, self.output_slices
+        )
         if not current_platform.can_update_inplace():
             output = lora_output
 
@@ -158,7 +138,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
 
     @property
     def weight(self) -> torch.Tensor:
-
         # unquantizedLinear
         if hasattr(self.base_layer, "weight"):
             return self.base_layer.weight
@@ -178,7 +157,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
             raise ValueError(f"Unsupported base layer: {self.base_layer}")
 
     @property
-    def bias(self) -> Optional[torch.Tensor]:
+    def bias(self) -> torch.Tensor | None:
         if hasattr(self.base_layer, "bias"):
             return self.base_layer.bias
         else:
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index 6284576446c8f4ccfd13c6c49aebe156d180d3a9..637ded9b2a0f0c1a0765bc025083e3b09050a164 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union, cast
 
 import torch
 import torch.nn as nn
@@ -10,9 +9,11 @@ from transformers import PretrainedConfig
 from vllm.config.lora import LoRAConfig
 from vllm.distributed import tensor_model_parallel_all_gather
 from vllm.distributed.utils import divide
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+)
 from vllm.platforms import current_platform
 
 from .base_linear import BaseLinearLayerWithLoRA
@@ -20,14 +21,16 @@ from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
 
 
 def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
-    """ 
-    For `ColumnParallelLinearWithLoRA` or classes that inherit from 
+    """
+    For `ColumnParallelLinearWithLoRA` or classes that inherit from
     `ColumnParallelLinearWithLoRA`, they share the same `apply` logic.
     """
-    assert (layer.n_slices == len(layer.lora_a_stacked) == len(
-        layer.lora_b_stacked) == len(layer.output_slices))
-    if layer.lora_bias_stacked is not None:
-        assert layer.n_slices == len(layer.lora_bias_stacked)
+    assert (
+        layer.n_slices
+        == len(layer.lora_a_stacked)
+        == len(layer.lora_b_stacked)
+        == len(layer.output_slices)
+    )
 
     output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)
 
@@ -42,22 +45,23 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
         device=x.device,
     )
 
-    shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink(
-        buffers, x, layer.lora_a_stacked, 1.0)
+    shrunk_buffers: torch.Tensor | None = layer.punica_wrapper.add_shrink(
+        buffers, x, layer.lora_a_stacked, 1.0
+    )
 
     if not current_platform.can_update_inplace():
         buffers = shrunk_buffers
 
     buffers = tensor_model_parallel_all_gather(buffers)
 
-    lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand(
+    lora_output: torch.Tensor | None = layer.punica_wrapper.add_expand(
         output,
         buffers,
         layer.lora_b_stacked,
-        layer.lora_bias_stacked,
         layer.output_slices,
         offset_start=0,
-        add_input=True)
+        add_input=True,
+    )
 
     if not current_platform.can_update_inplace():
         output = lora_output
@@ -81,8 +85,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         # The base_layer type is ColumnParallelLinear or
         # MergedColumnParallelLinear, their weight sharding logic is
         # inconsistent when TP is greater than 1.
-        self.is_merged_col_linear = type(
-            base_layer) is MergedColumnParallelLinear
+        self.is_merged_col_linear = type(base_layer) is MergedColumnParallelLinear
         self.output_size = self.base_layer.output_size_per_partition
         # There is only one LoRA layer
         self.n_slices = 1
@@ -97,10 +100,14 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
             shard_size = self.output_size // 2
             offset = lora_b.shape[0] // 2
 
-            left_weight = lora_b[self.tp_rank * shard_size:(self.tp_rank + 1) *
-                                 shard_size, :]
-            right_weight = lora_b[offset + self.tp_rank * shard_size:offset +
-                                  (self.tp_rank + 1) * shard_size, :]
+            left_weight = lora_b[
+                self.tp_rank * shard_size : (self.tp_rank + 1) * shard_size, :
+            ]
+            right_weight = lora_b[
+                offset + self.tp_rank * shard_size : offset
+                + (self.tp_rank + 1) * shard_size,
+                :,
+            ]
             lora_b = torch.cat([left_weight, right_weight], dim=0)
         # Applicable to cases where the base_layer is
         # ColumnParallelLinear.
@@ -111,19 +118,9 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
             lora_b = lora_b[start_idx:end_idx, :]
         return lora_b
 
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        # TODO: Fix the slicing logic of bias.
-        if bias is None:
-            return bias
-        shard_size = self.output_size
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        bias = bias[start_idx:end_idx]
-        return bias
-
     def forward(
         self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
         """Forward of ColumnParallelLinear
 
         Args:
@@ -133,8 +130,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
             - output
             - bias
         """
-        bias = (self.base_layer.bias
-                if not self.base_layer.skip_bias_add else None)
+        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None
 
         # Matrix multiply.
         output_parallel = self.apply(input_, bias)
@@ -147,8 +143,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         if not self.base_layer.return_bias:
             return output
 
-        output_bias = (self.base_layer.bias
-                       if self.base_layer.skip_bias_add else None)
+        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
         return output, output_bias
 
     @classmethod
@@ -158,11 +153,12 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is ColumnParallelLinear or (
             type(source_layer) is MergedColumnParallelLinear
-            and len(packed_modules_list) == 1)
+            and len(packed_modules_list) == 1
+        )
 
 
 class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
@@ -175,33 +171,36 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     """
 
     def __init__(
-        self, base_layer: Union[MergedColumnParallelLinear,
-                                QKVParallelLinear]) -> None:
+        self, base_layer: MergedColumnParallelLinear | QKVParallelLinear
+    ) -> None:
         super().__init__(base_layer)
         # There are two LoRA layers
         # the output_sizes in MergedColumnParallelLinear is not sharded by tp
         # we need to divide it by the tp_size to get correct slices size
         output_sizes = self.base_layer.output_sizes
         self.output_slices = tuple(
-            divide(output_size, self.tp_size) for output_size in output_sizes)
+            divide(output_size, self.tp_size) for output_size in output_sizes
+        )
         self.n_slices = len(self.output_slices)
-        self.output_ids = (self.tp_rank, ) * self.n_slices
+        self.output_ids = (self.tp_rank,) * self.n_slices
 
     def create_lora_weights(
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         """
-        The main reason for overriding this function is to enhance  code 
+        The main reason for overriding this function is to enhance  code
         maintainability.
         """
         self.lora_config = lora_config
 
         lora_a_output_size_per_partition = (
-            lora_config.max_lora_rank if not lora_config.fully_sharded_loras
-            else divide(lora_config.max_lora_rank, self.tp_size))
+            lora_config.max_lora_rank
+            if not lora_config.fully_sharded_loras
+            else divide(lora_config.max_lora_rank, self.tp_size)
+        )
 
         self.lora_a_stacked = tuple(
             torch.zeros(
@@ -211,7 +210,9 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
                 self.input_size,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            ) for _ in range(self.n_slices))
+            )
+            for _ in range(self.n_slices)
+        )
         self.lora_b_stacked = tuple(
             torch.zeros(
                 max_loras,
@@ -220,78 +221,50 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
                 lora_config.max_lora_rank,
                 dtype=lora_config.lora_dtype,
                 device=self.device,
-            ) for output_size in self.output_slices)
-        if lora_config.bias_enabled:
-            self.lora_bias_stacked = tuple(
-                torch.zeros(
-                    max_loras,
-                    1,
-                    output_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ) for output_size in self.output_slices)
+            )
+            for output_size in self.output_slices
+        )
 
     def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_a: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         return lora_a
 
     def slice_lora_b(
-        self, lora_b: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_b: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         sliced_lora_b = [None] * self.n_slices
         for i, (shard_id, shard_size) in enumerate(
-                zip(self.output_ids, self.output_slices)):
+            zip(self.output_ids, self.output_slices)
+        ):
             if (lora_b_i := lora_b[i]) is not None:
-                sliced_lora_b[i] = lora_b_i[shard_size * shard_id:shard_size *
-                                            (shard_id + 1), :]
+                sliced_lora_b[i] = lora_b_i[
+                    shard_size * shard_id : shard_size * (shard_id + 1), :
+                ]
         return sliced_lora_b
 
-    def slice_bias(
-        self, bias: list[Union[torch.Tensor,
-                               None]]) -> list[Union[torch.Tensor, None]]:
-        for i, (shard_id, shard_size) in enumerate(
-                zip(self.output_ids, self.output_slices)):
-            if (bias_i := bias[i]) is not None:
-                bias[i] = bias_i[shard_size * shard_id:shard_size *
-                                 (shard_id + 1)]
-        return bias
-
     def set_lora(
         self,
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        lora_bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
     ):
         self.reset_lora(index)
 
         if self.tp_size > 1:
             lora_a = self.slice_lora_a(lora_a)
             lora_b = self.slice_lora_b(lora_b)
-            if lora_bias is not None:
-                lora_bias = self.slice_bias(lora_bias)
 
         for i in range(self.n_slices):
             if (lora_a_i := lora_a[i]) is not None:
                 self.lora_a_stacked[i][
-                    index, 0, :lora_a_i.shape[0], :lora_a_i.shape[1]].copy_(
-                        lora_a_i, non_blocking=True)
+                    index, 0, : lora_a_i.shape[0], : lora_a_i.shape[1]
+                ].copy_(lora_a_i, non_blocking=True)
             if (lora_b_i := lora_b[i]) is not None:
                 self.lora_b_stacked[i][
-                    index, 0, :lora_b_i.shape[0], :lora_b_i.shape[1]].copy_(
-                        lora_b_i, non_blocking=True)
-
-        if lora_bias is not None:
-            self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                          self.lora_bias_stacked)
-            for i in range(self.n_slices):
-                if (lora_bias_i := lora_bias[i]) is not None:
-                    self.lora_bias_stacked[i][index,
-                                              0, :lora_bias_i.shape[0]].copy_(
-                                                  lora_bias_i,
-                                                  non_blocking=True)
+                    index, 0, : lora_b_i.shape[0], : lora_b_i.shape[1]
+                ].copy_(lora_b_i, non_blocking=True)
 
     @classmethod
     @_not_fully_sharded_can_replace
@@ -300,10 +273,12 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
-        return (type(source_layer) is MergedColumnParallelLinear
-                and len(packed_modules_list) == 2)
+        return (
+            type(source_layer) is MergedColumnParallelLinear
+            and len(packed_modules_list) == 2
+        )
 
 
 class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
@@ -321,57 +296,52 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
 
     def __init__(self, base_layer: QKVParallelLinear) -> None:
         super().__init__(base_layer)
-        self.q_proj_total_size = (self.base_layer.total_num_heads *
-                                  self.base_layer.head_size)
-        self.q_proj_shard_size = (self.base_layer.num_heads *
-                                  self.base_layer.head_size)
-        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
-                                   self.base_layer.head_size)
-        self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
-                                   self.base_layer.head_size)
+        self.q_proj_total_size = (
+            self.base_layer.total_num_heads * self.base_layer.head_size
+        )
+        self.q_proj_shard_size = self.base_layer.num_heads * self.base_layer.head_size
+        self.kv_proj_shard_size = (
+            self.base_layer.num_kv_heads * self.base_layer.head_size
+        )
+        self.kv_proj_total_size = (
+            self.base_layer.total_num_kv_heads * self.base_layer.head_size
+        )
         # There is only one LoRA layer
         self.n_slices = 1
 
     def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
-
         self.q_shard_id = self.tp_rank
         self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
-        lora_b_q = lora_b[self.q_proj_shard_size *
-                          self.q_shard_id:self.q_proj_shard_size *
-                          (self.q_shard_id + 1), :]
+        lora_b_q = lora_b[
+            self.q_proj_shard_size * self.q_shard_id : self.q_proj_shard_size
+            * (self.q_shard_id + 1),
+            :,
+        ]
         k_offset = self.q_proj_total_size
-        lora_b_k = lora_b[k_offset +
-                          self.kv_proj_shard_size * self.kv_shard_id:k_offset +
-                          self.kv_proj_shard_size * (self.kv_shard_id + 1), :]
+        lora_b_k = lora_b[
+            k_offset + self.kv_proj_shard_size * self.kv_shard_id : k_offset
+            + self.kv_proj_shard_size * (self.kv_shard_id + 1),
+            :,
+        ]
         v_offset = k_offset + self.kv_proj_total_size
-        lora_b_v = lora_b[v_offset +
-                          self.kv_proj_shard_size * self.kv_shard_id:v_offset +
-                          self.kv_proj_shard_size * (self.kv_shard_id + 1), :]
+        lora_b_v = lora_b[
+            v_offset + self.kv_proj_shard_size * self.kv_shard_id : v_offset
+            + self.kv_proj_shard_size * (self.kv_shard_id + 1),
+            :,
+        ]
         lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=0)
         return lora_b
 
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        bias_q = bias[self.q_proj_shard_size *
-                      self.q_shard_id:self.q_proj_shard_size *
-                      (self.q_shard_id + 1)]
-        k_offset = self.q_proj_total_size
-        bias_k = bias[k_offset +
-                      self.kv_proj_shard_size * self.kv_shard_id:k_offset +
-                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-        v_offset = k_offset + self.kv_proj_total_size
-        bias_v = bias[v_offset +
-                      self.kv_proj_shard_size * self.kv_shard_id:v_offset +
-                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-        bias = torch.cat([bias_q, bias_k, bias_v], dim=1)
-        return bias
-
     @classmethod
     @_not_fully_sharded_can_replace
-    def can_replace_layer(cls, source_layer: nn.Module,
-                          lora_config: LoRAConfig, packed_modules_list: list,
-                          model_config: Optional[PretrainedConfig]) -> bool:
-        return type(source_layer) is QKVParallelLinear and len(
-            packed_modules_list) == 1
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1
 
 
 class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
@@ -390,10 +360,10 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
         # There are three LoRA layer.
         self.n_slices = len(self.base_layer.output_sizes)
 
-        self.q_proj_shard_size = (self.base_layer.num_heads *
-                                  self.base_layer.head_size)
-        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
-                                   self.base_layer.head_size)
+        self.q_proj_shard_size = self.base_layer.num_heads * self.base_layer.head_size
+        self.kv_proj_shard_size = (
+            self.base_layer.num_kv_heads * self.base_layer.head_size
+        )
         self.q_shard_id = self.tp_rank
         self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
 
@@ -412,10 +382,10 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         """
-        The main reason for overloading this function is to handle inconsistent 
+        The main reason for overloading this function is to handle inconsistent
         weight dimensions in qkv lora.
         """
         super().create_lora_weights(max_loras, lora_config, model_config)
@@ -427,10 +397,9 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
-        return (type(source_layer) is QKVParallelLinear
-                and len(packed_modules_list) == 3)
+        return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3
 
 
 # These following layers are based on the tensor parallelism strategy given in
@@ -453,12 +422,10 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
         shard_size = self.lora_a_stacked[0].shape[2]
         start_idx = self.tp_rank * shard_size
-        lora_a = lora_a[start_idx:start_idx + shard_size, :]
+        lora_a = lora_a[start_idx : start_idx + shard_size, :]
         return lora_a
 
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -468,7 +435,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
@@ -480,8 +447,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
         )
 
 
-class MergedColumnParallelLinearWithShardedLoRA(
-        MergedColumnParallelLinearWithLoRA):
+class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLoRA):
     """
     Differs from MergedColumnParallelLinearWithLoRA by slicing the
     LoRA A's also.
@@ -490,22 +456,22 @@ class MergedColumnParallelLinearWithShardedLoRA(
     """
 
     def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
-        #NOTE: lora_a contains 2 subloras, and each sublora could be None.
+        self, lora_a: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
+        # NOTE: lora_a contains 2 subloras, and each sublora could be None.
         output_shard_size = self.lora_a_stacked[0].shape[2]
         output_start_idx = self.tp_rank * output_shard_size
         lora_a = [
-            lora_a[0][output_start_idx:output_start_idx +
-                      output_shard_size, :] if lora_a[0] is not None else None,
-            lora_a[1][output_start_idx:output_start_idx +
-                      output_shard_size, :] if lora_a[1] is not None else None,
+            lora_a[0][output_start_idx : output_start_idx + output_shard_size, :]
+            if lora_a[0] is not None
+            else None,
+            lora_a[1][output_start_idx : output_start_idx + output_shard_size, :]
+            if lora_a[1] is not None
+            else None,
         ]
         return lora_a
 
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -515,7 +481,7 @@ class MergedColumnParallelLinearWithShardedLoRA(
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
@@ -538,19 +504,21 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
         shard_size = self.lora_a_stacked[0].shape[2]
         start_idx = self.tp_rank * shard_size
-        lora_a = lora_a[start_idx:start_idx + shard_size, :]
+        lora_a = lora_a[start_idx : start_idx + shard_size, :]
         return lora_a
 
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
     @_fully_sharded_can_replace
-    def can_replace_layer(cls, source_layer: nn.Module,
-                          lora_config: LoRAConfig, packed_modules_list: list,
-                          model_config: Optional[PretrainedConfig]) -> bool:
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
             source_layer=source_layer,
@@ -563,31 +531,32 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
 
 class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
     """
-    Differs from MergedQKVParallelLinearWithLoRA by slicing the 
+    Differs from MergedQKVParallelLinearWithLoRA by slicing the
     LoRA A's also.
 
     Based on S-LoRA, slicing happens along the rank dim.
     """
 
     def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_a: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         # NOTE: lora_a contains 3 subloras, and each sublora could be None.
         shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)]
         start_idx = [self.tp_rank * shard_size[i] for i in range(3)]
         lora_a = [
-            lora_a[0][start_idx[0]:start_idx[0] +
-                      shard_size[0], :] if lora_a[0] is not None else None,
-            lora_a[1][start_idx[1]:start_idx[1] +
-                      shard_size[1], :] if lora_a[1] is not None else None,
-            lora_a[2][start_idx[2]:start_idx[2] +
-                      shard_size[2], :] if lora_a[2] is not None else None,
+            lora_a[0][start_idx[0] : start_idx[0] + shard_size[0], :]
+            if lora_a[0] is not None
+            else None,
+            lora_a[1][start_idx[1] : start_idx[1] + shard_size[1], :]
+            if lora_a[1] is not None
+            else None,
+            lora_a[2][start_idx[2] : start_idx[2] + shard_size[2], :]
+            if lora_a[2] is not None
+            else None,
         ]
         return lora_a
 
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -597,7 +566,7 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fb3efa220f6d4eadaf6dd32b831e239912eea32
--- /dev/null
+++ b/vllm/lora/layers/fused_moe.py
@@ -0,0 +1,472 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm import envs
+from vllm.config.lora import LoRAConfig
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.lora.layers.base import BaseLayerWithLoRA
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe.config import (
+    _get_config_dtype_str,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    modular_marlin_fused_moe,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    modular_triton_fused_moe,
+    try_get_optimal_moe_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
+    FusedMoEModularMethod,
+)
+
+
+class FusedMoEWithLoRA(BaseLayerWithLoRA):
+    def __init__(self, base_layer: FusedMoE) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+
+        assert not self.base_layer.use_ep, (
+            "EP support for Fused MoE LoRA is not implemented yet."
+        )
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.device = base_layer.w2_weight.device
+        self._inject_lora_into_fused_moe()
+
+    def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]:
+        normalized_config = {}
+        for key, value in config.items():
+            if key.islower():
+                if key.startswith("block_"):
+                    normalized_key = "BLOCK_SIZE_" + key.split("_")[-1].upper()
+                else:
+                    normalized_key = key.upper()
+            else:
+                normalized_key = key
+            normalized_config[normalized_key] = value
+        return normalized_config
+
+    def _get_lora_moe_configs(
+        self,
+        op_prefix: str,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        num_slices: int,
+        M: int,
+        layer: FusedMoE,
+        top_k: int,
+        config_dtype: str,
+    ):
+        if envs.VLLM_TUNED_CONFIG_FOLDER:
+            shrink_config = get_lora_op_configs(
+                op_type=f"fused_moe_lora_{op_prefix}_shrink",
+                max_loras=lora_a_stacked.shape[0],
+                batch=M,
+                hidden_size=lora_a_stacked.shape[-1],
+                rank=lora_a_stacked.shape[-2],
+                num_slices=num_slices,
+                moe_intermediate_size=lora_b_stacked.shape[-2],
+            )
+            expand_config = get_lora_op_configs(
+                op_type=f"fused_moe_lora_{op_prefix}_expand",
+                max_loras=lora_a_stacked.shape[0],
+                batch=M,
+                hidden_size=lora_a_stacked.shape[-1],
+                rank=lora_a_stacked.shape[-2],
+                num_slices=num_slices,
+                moe_intermediate_size=lora_b_stacked.shape[-2],
+            )
+        else:  # fall back to the default config
+            get_config_func = functools.partial(
+                try_get_optimal_moe_config,
+                layer.w13_weight.size(),
+                layer.w2_weight.size(),
+                top_k,
+                config_dtype,
+                block_shape=layer.quant_method.moe_quant_config.block_shape,
+            )
+            shrink_config = get_config_func(M)
+            expand_config = get_config_func(M)
+        shrink_config = self._normalize_keys(shrink_config)
+        expand_config = self._normalize_keys(expand_config)
+        return shrink_config, expand_config
+
+    def _inject_lora_into_fused_moe(self):
+        moe_state_dict = {}
+        top_k = self.base_layer.top_k
+
+        self.base_layer.ensure_moe_quant_config_init()
+        quant_config = self.base_layer.quant_method.moe_quant_config
+
+        m_fused_moe_fn = (
+            modular_triton_fused_moe(
+                quant_config, shared_experts=self.base_layer.shared_experts
+            )
+            if not quant_config.use_mxfp4_w4a16
+            else modular_marlin_fused_moe(
+                quant_config, shared_experts=self.base_layer.shared_experts
+            )
+        )
+
+        def fwd_decorator(layer, func):
+            def wrapper(*args, **kwargs):
+                moe_state_dict["hidden_states"] = kwargs["hidden_states"]
+                moe_state_dict["topk_ids"] = kwargs["topk_ids"]
+                moe_state_dict["topk_weights"] = kwargs["topk_weights"]
+                moe_state_dict["expert_map"] = kwargs["expert_map"]
+                moe_state_dict["apply_router_weight_on_input"] = kwargs[
+                    "apply_router_weight_on_input"
+                ]
+                result = func(*args, **kwargs)
+                return result
+
+            return wrapper
+
+        def act_decorator(layer, func):
+            def wrapper(*args, **kwargs):
+                _, output, input = args
+
+                hidden_states = moe_state_dict["hidden_states"]
+                topk_weights = moe_state_dict["topk_weights"]
+                curr_topk_ids = moe_state_dict["topk_ids"]
+
+                expert_map = moe_state_dict["expert_map"]
+
+                config_dtype = _get_config_dtype_str(
+                    dtype=hidden_states.dtype,
+                    use_fp8_w8a8=False,
+                    use_int8_w8a16=False,
+                    use_int4_w4a16=False,
+                )
+                CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
+                num_tokens = hidden_states.size(0)
+                M = min(num_tokens, CHUNK_SIZE)
+
+                shrink_config, expand_config = self._get_lora_moe_configs(
+                    op_prefix="w13",
+                    lora_a_stacked=self.w1_lora_a_stacked,
+                    lora_b_stacked=self.w1_lora_b_stacked,
+                    num_slices=2,
+                    M=M,
+                    layer=layer,
+                    top_k=top_k,
+                    config_dtype=config_dtype,
+                )
+
+                # get the block size of m from customized config or default config
+                max_loras = self.w1_lora_a_stacked.shape[0]
+                (
+                    sorted_token_ids_lora,
+                    expert_ids_lora,
+                    num_tokens_post_padded_lora,
+                ) = self.punica_wrapper.moe_lora_align_block_size(
+                    curr_topk_ids,
+                    num_tokens,
+                    shrink_config["BLOCK_SIZE_M"],
+                    self.base_layer.local_num_experts,
+                    max_loras,
+                    self.adapter_enabled,
+                    expert_map,
+                )
+
+                moe_state_dict["sorted_token_ids_lora"] = sorted_token_ids_lora
+                moe_state_dict["expert_ids_lora"] = expert_ids_lora
+                moe_state_dict["num_tokens_post_padded_lora"] = (
+                    num_tokens_post_padded_lora
+                )
+
+                w13_lora_a_stacked = [self.w1_lora_a_stacked, self.w3_lora_a_stacked]
+                w13_lora_b_stacked = [self.w1_lora_b_stacked, self.w3_lora_b_stacked]
+                max_lora_rank = self.w1_lora_a_stacked.shape[-2]
+                expert_ids_lora = expert_ids_lora.view(max_loras, -1)
+                sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1)
+
+                self.punica_wrapper.add_lora_fused_moe(
+                    input.view(-1, top_k, input.shape[-1]),
+                    hidden_states,
+                    w13_lora_a_stacked,
+                    w13_lora_b_stacked,
+                    topk_weights,
+                    sorted_token_ids_lora,
+                    expert_ids_lora,
+                    num_tokens_post_padded_lora,
+                    max_lora_rank,
+                    top_k,
+                    shrink_config,  ## pass the shrink config
+                    expand_config,  ## pass the expand config
+                    self.adapter_enabled,
+                )
+
+                result = func(*args, **kwargs)
+
+                moe_state_dict["intermediate_cache2"] = output
+                return result
+
+            return wrapper
+
+        def moe_sum_decorator(layer, func):
+            def wrapper(*args, **kwargs):
+                hidden_states = moe_state_dict["hidden_states"]
+                topk_weights = moe_state_dict["topk_weights"]
+
+                config_dtype = _get_config_dtype_str(
+                    dtype=hidden_states.dtype,
+                    use_fp8_w8a8=False,
+                    use_int8_w8a16=False,
+                    use_int4_w4a16=False,
+                )
+                CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
+                num_tokens = hidden_states.size(0)
+                M = min(num_tokens, CHUNK_SIZE)
+
+                shrink_config, expand_config = self._get_lora_moe_configs(
+                    op_prefix="w2",
+                    lora_a_stacked=self.w2_lora_a_stacked,
+                    lora_b_stacked=self.w2_lora_b_stacked,
+                    num_slices=1,
+                    M=M,
+                    layer=layer,
+                    top_k=top_k,
+                    config_dtype=config_dtype,
+                )
+
+                sorted_token_ids_lora = moe_state_dict["sorted_token_ids_lora"]
+                expert_ids_lora = moe_state_dict["expert_ids_lora"]
+                num_tokens_post_padded_lora = moe_state_dict[
+                    "num_tokens_post_padded_lora"
+                ]
+                max_loras = self.w1_lora_a_stacked.shape[0]
+                expert_ids_lora = expert_ids_lora.view(max_loras, -1)
+                sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1)
+                intermediate_cache2 = moe_state_dict["intermediate_cache2"]
+                intermediate_cache3 = args[0]
+                max_lora_rank = self.w1_lora_a_stacked.shape[-2]
+                self.punica_wrapper.add_lora_fused_moe(
+                    intermediate_cache3,
+                    intermediate_cache2,
+                    [self.w2_lora_a_stacked],
+                    [self.w2_lora_b_stacked],
+                    topk_weights,
+                    sorted_token_ids_lora,
+                    expert_ids_lora,
+                    num_tokens_post_padded_lora,
+                    max_lora_rank,
+                    top_k,
+                    shrink_config,  ## pass the shrink config
+                    expand_config,  ## pass the expand config
+                    self.adapter_enabled,
+                    True,
+                )
+
+                result = func(*args, **kwargs)
+                return result
+
+            return wrapper
+
+        fused_experts = m_fused_moe_fn.fused_experts
+
+        m_fused_moe_fn.forward = fwd_decorator(self.base_layer, m_fused_moe_fn.forward)
+        fused_experts.activation = act_decorator(
+            self.base_layer, fused_experts.activation
+        )
+        fused_experts.moe_sum = moe_sum_decorator(
+            self.base_layer, fused_experts.moe_sum
+        )
+
+        self.base_layer.quant_method = FusedMoEModularMethod(
+            self.base_layer.quant_method, m_fused_moe_fn
+        )
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: PretrainedConfig | None = None,
+    ) -> None:
+        """Initializes lora matrices."""
+
+        self.adapter_enabled = torch.tensor(
+            [0] * (max_loras + 1), dtype=torch.int, device=self.device
+        )
+
+        self.w1_lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.local_num_experts,
+                lora_config.max_lora_rank,
+                self.base_layer.hidden_size,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.w1_lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.local_num_experts,
+                self.base_layer.intermediate_size_per_partition,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+
+        self.w2_lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.local_num_experts,
+                lora_config.max_lora_rank,
+                self.base_layer.intermediate_size_per_partition,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.w2_lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.local_num_experts,
+                self.base_layer.hidden_size,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+
+        self.w3_lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.local_num_experts,
+                lora_config.max_lora_rank,
+                self.base_layer.hidden_size,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.w3_lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.local_num_experts,
+                self.base_layer.intermediate_size_per_partition,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+
+        # They will be used by 'LoRALayerWeights.create_dummy_lora_weights'
+        # to create a dummy LoRA weights.
+        self.lora_a_stacked = []
+        self.lora_b_stacked = []
+        for lora_id in range(max_loras):
+            for experts_id in range(self.base_layer.local_num_experts):
+                # gate_proj,down_proj,up_proj
+                self.lora_a_stacked.append(self.w1_lora_a_stacked[lora_id][experts_id])
+                self.lora_a_stacked.append(self.w2_lora_a_stacked[lora_id][experts_id])
+                self.lora_a_stacked.append(self.w3_lora_a_stacked[lora_id][experts_id])
+
+                self.lora_b_stacked.append(self.w1_lora_b_stacked[lora_id][experts_id])
+                self.lora_b_stacked.append(self.w2_lora_b_stacked[lora_id][experts_id])
+                self.lora_b_stacked.append(self.w3_lora_b_stacked[lora_id][experts_id])
+
+    def reset_lora(self, index: int):
+        """Resets the lora weights at index back to 0."""
+        self.w1_lora_a_stacked[index] = 0
+        self.w1_lora_b_stacked[index] = 0
+        self.w3_lora_a_stacked[index] = 0
+        self.w3_lora_b_stacked[index] = 0
+        self.w2_lora_a_stacked[index] = 0
+        self.w2_lora_b_stacked[index] = 0
+        self.adapter_enabled[index] = 0
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: torch.Tensor | None,
+        bias: torch.Tensor | None = None,
+    ):
+        """Overwrites lora tensors at index."""
+        self.reset_lora(index)
+        self.adapter_enabled[index] = 1
+        for eid in range(len(lora_a) // 3):
+            w1_lora_a = lora_a[eid * 3]
+            w2_lora_a = lora_a[eid * 3 + 1]
+            w3_lora_a = lora_a[eid * 3 + 2]
+            w1_lora_b = lora_b[eid * 3]
+            w2_lora_b = lora_b[eid * 3 + 1]
+            w3_lora_b = lora_b[eid * 3 + 2]
+
+            # Handle the case of adding LoRA to only a subset of experts
+            if w1_lora_a is None or w2_lora_a is None or w3_lora_a is None:
+                continue
+
+            if self.tp_size > 1:
+                shard_size = self.base_layer.intermediate_size_per_partition
+                start_idx = self.tp_rank * shard_size
+                end_idx = (self.tp_rank + 1) * shard_size
+
+                w1_lora_b = w1_lora_b[start_idx:end_idx, :]
+                w3_lora_b = w3_lora_b[start_idx:end_idx, :]
+                w2_lora_a = w2_lora_a[:, start_idx:end_idx]
+
+            self.w1_lora_a_stacked[
+                index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1]
+            ].copy_(w1_lora_a, non_blocking=True)
+
+            self.w3_lora_a_stacked[
+                index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1]
+            ].copy_(w3_lora_a, non_blocking=True)
+
+            self.w2_lora_b_stacked[
+                index, eid, : w2_lora_b.shape[0], : w2_lora_b.shape[1]
+            ].copy_(w2_lora_b, non_blocking=True)
+
+            self.w1_lora_b_stacked[
+                index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1]
+            ].copy_(w1_lora_b, non_blocking=True)
+            self.w3_lora_b_stacked[
+                index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1]
+            ].copy_(w3_lora_b, non_blocking=True)
+            self.w2_lora_a_stacked[
+                index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1]
+            ].copy_(w2_lora_a, non_blocking=True)
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        """Returns True if the layer can be replaced by this LoRA layer."""
+        # return type(source_layer) is FusedMoE
+        return isinstance(source_layer, FusedMoE)
+
+    def forward(self, *args, **kwargs):
+        return self.base_layer.forward(*args, **kwargs)
+
+    def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs):
+        return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs)
+
+    @property
+    def _shared_experts(self):
+        return self.base_layer._shared_experts
+
+    @property
+    def quant_method(self):
+        return self.base_layer.quant_method
+
+    @property
+    def is_internal_router(self) -> bool:
+        return self.base_layer.is_internal_router
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
index b8fbad3a4af01f53dbdd8cf69d2b40e85f0960d0..adc5e861f57fbd06aea8f16622d3738dc5453118 100644
--- a/vllm/lora/layers/logits_processor.py
+++ b/vllm/lora/layers/logits_processor.py
@@ -2,18 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
 from vllm.config.lora import LoRAConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.platforms import current_platform
 
 from .base import BaseLayerWithLoRA
@@ -34,9 +34,14 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
             no reindexing will be done.
     """
 
-    def __init__(self, base_layer: LogitsProcessor, hidden_size: int,
-                 dtype: torch.dtype, device: torch.device,
-                 sharded_to_full_mapping: Optional[list[int]]) -> None:
+    def __init__(
+        self,
+        base_layer: LogitsProcessor,
+        hidden_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        sharded_to_full_mapping: list[int] | None,
+    ) -> None:
         super().__init__()
         self.base_layer = base_layer
         self.hidden_size = hidden_size
@@ -82,12 +87,13 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         # TODO: Verify if this condition can be further relaxed
         if 32000 < self.base_layer.vocab_size > 257024:
-            raise ValueError("When using LoRA, vocab size must be "
-                             "32000 >= vocab_size <= 257024")
+            raise ValueError(
+                "When using LoRA, vocab size must be 32000 >= vocab_size <= 257024"
+            )
         self.lora_a_stacked = torch.zeros(
             (
                 max_loras,
@@ -103,9 +109,10 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
                 max_loras,
                 1,
                 # Pad for kernel compatibility
-                math.ceil(self.base_layer.vocab_size /
-                          lora_config.lora_vocab_padding_size) *
-                lora_config.lora_vocab_padding_size,
+                math.ceil(
+                    self.base_layer.vocab_size / lora_config.lora_vocab_padding_size
+                )
+                * lora_config.lora_vocab_padding_size,
                 lora_config.max_lora_rank,
             ),
             dtype=lora_config.lora_dtype,
@@ -119,9 +126,8 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         )
         if self.sharded_to_full_mapping is not None:
             self.sharded_to_full_mapping_gpu = torch.tensor(
-                self.sharded_to_full_mapping,
-                device=self.device,
-                dtype=torch.long)
+                self.sharded_to_full_mapping, device=self.device, dtype=torch.long
+            )
         else:
             self.sharded_to_full_mapping_gpu = None
 
@@ -135,29 +141,28 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
     ):
         self.reset_lora(index)
-        self.lora_a_stacked[index,
-                            0, :lora_a.shape[0], :lora_a.shape[1]].copy_(
-                                lora_a, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
-                                lora_b, non_blocking=True)
+        self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
+            lora_a, non_blocking=True
+        )
+        self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
+            lora_b, non_blocking=True
+        )
         if embeddings_tensor is not None:
             self.embeddings_tensors[
                 index,
-                :embeddings_tensor.shape[0],
-                :embeddings_tensor.shape[1],
+                : embeddings_tensor.shape[0],
+                : embeddings_tensor.shape[1],
             ] = embeddings_tensor
 
     def _get_logits(
         self,
         hidden_states: torch.Tensor,
         lm_head: VocabParallelEmbedding,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> Optional[torch.Tensor]:
+        embedding_bias: torch.Tensor | None = None,
+    ) -> torch.Tensor | None:
         # Get the logits for the next tokens.
         logits = lm_head.quant_method.apply(lm_head, hidden_states)
         if embedding_bias is not None:
@@ -195,41 +200,41 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
             dtype=self.embeddings_tensors.dtype,
             device=self.embeddings_tensors.device,
         )
-        torch.matmul(self.embeddings_tensors,
-                     hidden_states.T,
-                     out=lora_logits[:-1])
+        torch.matmul(self.embeddings_tensors, hidden_states.T, out=lora_logits[:-1])
 
-        neg_inf, pos_inf = current_platform.get_infinity_values(
-            lora_logits.dtype)
+        neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype)
 
         lora_logits[-1] = neg_inf
         lora_logits = lora_logits.mT
         indices_padded = self.punica_wrapper.sampler_indices_padded
 
         if current_platform.is_tpu() or current_platform.is_xpu():
-            indices_padded = indices_padded[:logits.size(0)]
-
-        lora_logits = (lora_logits.reshape(
-            lora_logits.shape[0] * lora_logits.shape[1],
-            lora_logits.shape[2],
-        ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf,
-                                                      posinf=pos_inf,
-                                                      neginf=neg_inf))
+            indices_padded = indices_padded[: logits.size(0)]
+
+        lora_logits = (
+            lora_logits.reshape(
+                lora_logits.shape[0] * lora_logits.shape[1],
+                lora_logits.shape[2],
+            )
+            .index_select(0, indices_padded)
+            .nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf)
+        )
 
-        logits[:,
-               self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
-               lora_logits.shape[1]] = lora_logits
+        logits[
+            :,
+            self.base_layer.org_vocab_size : self.base_layer.org_vocab_size
+            + lora_logits.shape[1],
+        ] = lora_logits
 
-        lora_output: Optional[
-            torch.Tensor] = self.punica_wrapper.add_lora_logits(
-                logits, hidden_states, self.lora_a_stacked,
-                self.lora_b_stacked, 1.0)
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits(
+            logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0
+        )
 
         if not current_platform.can_update_inplace():
             logits = lora_output
 
         # Remove paddings in vocab (if any).
-        logits = logits[:, :self.base_layer.vocab_size]
+        logits = logits[:, : self.base_layer.vocab_size]
         return logits
 
     def forward(self, *args, **kwargs):
@@ -241,7 +246,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # Special handling for the LogitsProcessor.
         return False
diff --git a/vllm/lora/layers/qkv_x_parallel_linear.py b/vllm/lora/layers/qkv_x_parallel_linear.py
deleted file mode 100644
index 367482d0ee0785f82c34a60bd3805b68b343c4ef..0000000000000000000000000000000000000000
--- a/vllm/lora/layers/qkv_x_parallel_linear.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from .base import BaseLayerWithLoRA
-
-
-#TODO: Implement this
-class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA):
-    pass
diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py
index 18a8f13ed94275bb25ab8fab17e5146cde1b92e9..243736c4ebc654b665622134473b8209597b6b35 100644
--- a/vllm/lora/layers/replicated_linear.py
+++ b/vllm/lora/layers/replicated_linear.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -14,16 +13,17 @@ from .base_linear import BaseLinearLayerWithLoRA
 
 
 class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
-
     def __init__(self, base_layer: ReplicatedLinear) -> None:
-        super().__init__(base_layer, )
+        super().__init__(
+            base_layer,
+        )
         # To ensure interface compatibility, set to 1 always.
         self.output_size = self.base_layer.output_size
         self.n_slices = 1
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
         """Forward of ReplicatedLinearWithLoRA
 
         Args:
@@ -33,14 +33,12 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
             - output
             - bias
         """
-        bias = (self.base_layer.bias
-                if not self.base_layer.skip_bias_add else None)
+        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None
 
         # Matrix multiply.
         output = self.apply(input_, bias)
 
-        output_bias = (self.base_layer.bias
-                       if self.base_layer.skip_bias_add else None)
+        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
 
         if not self.base_layer.return_bias:
             return output
@@ -55,6 +53,18 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is ReplicatedLinear
+
+    def slice_lora_a(
+        self, lora_a: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
+        """Slice lora a if splitting for tensor parallelism."""
+        return lora_a
+
+    def slice_lora_b(
+        self, lora_b: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
+        """Slice lora b if splitting with tensor parallelism."""
+        return lora_b
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index d468655e629ae8ffbd30408b2c9c4ae2fb608dd1..2ef1bd98fc61255cf046f94f07de8ebce085e03b 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union, cast
 
 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
 from vllm.config.lora import LoRAConfig
-from vllm.distributed import (split_tensor_along_last_dim,
-                              tensor_model_parallel_all_reduce)
-# yapf: disable
+from vllm.distributed import (
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.linear import RowParallelLinear
 from vllm.platforms import current_platform
 
@@ -19,7 +19,6 @@ from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
 
 
 class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
-
     def __init__(self, base_layer: RowParallelLinear) -> None:
         super().__init__(base_layer)
 
@@ -30,22 +29,18 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         self.n_slices = 1
 
     def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
-
         shard_size = self.input_size
         start_idx = self.tp_rank * shard_size
         end_idx = (self.tp_rank + 1) * shard_size
-        lora_a = lora_a[:,start_idx:end_idx]
+        lora_a = lora_a[:, start_idx:end_idx]
         return lora_a
 
     def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
         return lora_b
 
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        return bias
-
     def forward(
         self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
         """Forward of RowParallelLinear
 
         Args:
@@ -63,7 +58,8 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         else:
             # TODO: simplify code below
             splitted_input = split_tensor_along_last_dim(
-                input_, num_partitions=self.tp_size)
+                input_, num_partitions=self.tp_size
+            )
             input_parallel = splitted_input[self.tp_rank].contiguous()
 
         # Matrix multiply.
@@ -74,8 +70,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
             output_ = output_parallel
 
         if not self.base_layer.skip_bias_add:
-            output = (output_ + self.base_layer.bias
-                      if self.base_layer.bias is not None else output_)
+            output = (
+                output_ + self.base_layer.bias
+                if self.base_layer.bias is not None
+                else output_
+            )
             output_bias = None
         else:
             output = output_
@@ -93,16 +92,16 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is RowParallelLinear
 
 
-
 # The following layer is based on the tensor parallelism strategy given in
 # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
 # https://arxiv.org/abs/2311.03285.
 
+
 class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
     """
     Differs from RowParallelLinearWithLoRA by slicing the
@@ -117,39 +116,26 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         shard_size = self.lora_b_stacked[0].shape[2]
         start_idx = self.tp_rank * shard_size
         end_idx = (self.tp_rank + 1) * shard_size
-        lora_b = lora_b[ start_idx:end_idx,:]
+        lora_b = lora_b[start_idx:end_idx, :]
         return lora_b
 
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        if bias is None:
-            return bias
-        self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                      self.lora_bias_stacked)
-        shard_size = self.lora_bias_stacked[0].shape[2]
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        bias = bias[start_idx:end_idx]
-        return bias
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x)
 
         x = x.view(-1, x.shape[-1])
-        output, out_orig_shape = output.view(-1,
-                                             output.shape[-1]), output.shape
+        output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
         buffer = torch.zeros(
             (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
             dtype=torch.float32,
             device=x.device,
         )
 
-        shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
-            buffer, x, self.lora_a_stacked, 1.0)
+        shrunk_buffer: torch.Tensor | None = self.punica_wrapper.add_shrink(
+            buffer, x, self.lora_a_stacked, 1.0
+        )
         if not current_platform.can_update_inplace():
             buffer = shrunk_buffer
-        if self.tp_size>1:
+        if self.tp_size > 1:
             buffer = tensor_model_parallel_all_reduce(buffer)
 
         # following S-LoRA, allows the fusing of all_gather and all_reduce
@@ -161,11 +147,10 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         # NOTE offset are based on the rank.
         shard_size = self.lora_b_stacked[0].shape[2]
         offset_start = self.tp_rank * shard_size
-        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand(
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_expand(
             output,
             buffer,
             self.lora_b_stacked,
-            self.lora_bias_stacked,
             self.output_slices,
             offset_start=offset_start,
             add_input=True,
@@ -184,7 +169,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py
index 772d32a44c22d1eca76e5f1aeb90b5453b70f47b..2da90f180ee74bac260166155155545da3cefe73 100644
--- a/vllm/lora/layers/utils.py
+++ b/vllm/lora/layers/utils.py
@@ -45,8 +45,7 @@ def _not_fully_sharded_can_replace(can_replace):
 
     def dec(*args, **kwargs):
         decorate = kwargs.pop("decorate") if "decorate" in kwargs else True
-        condition = (not kwargs["lora_config"].fully_sharded_loras
-                     if decorate else True)
+        condition = not kwargs["lora_config"].fully_sharded_loras if decorate else True
         return can_replace(*args, **kwargs) and condition
 
     return dec
@@ -59,7 +58,8 @@ def _fully_sharded_can_replace(can_replace):
     """
 
     def dec(*args, **kwargs):
-        return (can_replace(*args, **kwargs)
-                and kwargs["lora_config"].fully_sharded_loras)
+        return (
+            can_replace(*args, **kwargs) and kwargs["lora_config"].fully_sharded_loras
+        )
 
     return dec
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
index ca01c7e17fff423789cb101131c15010d9e95590..ca4ad8012e9c35414aa6c4a4083844ac229069f3 100644
--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -9,40 +8,40 @@ import torch.nn.functional as F
 from transformers import PretrainedConfig
 
 from vllm.config.lora import LoRAConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.platforms import current_platform
 
 from .base import BaseLayerWithLoRA
 
 
 class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
-
     def __init__(self, base_layer: VocabParallelEmbedding) -> None:
         super().__init__()
         self.base_layer = base_layer
-        self.embeddings_slice: Optional[tuple[int, int]]
-        self.embeddings_weights: Optional[torch.Tensor]
+        self.embeddings_slice: tuple[int, int] | None
+        self.embeddings_weights: torch.Tensor | None
 
     def create_lora_weights(
-            self,
-            max_loras: int,
-            lora_config: LoRAConfig,
-            model_config: Optional[PretrainedConfig] = None) -> None:
-
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: PretrainedConfig | None = None,
+    ) -> None:
         if self.base_layer.num_added_embeddings_per_partition > 0:
             # We can start adding lora weights
             self.embeddings_weights = self.base_layer.weight.data[
-                self.base_layer.num_org_embeddings_per_partition:self.
-                base_layer.num_org_embeddings_per_partition +
-                self.base_layer.num_added_embeddings_per_partition]
+                self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition  # noqa: E501
+                + self.base_layer.num_added_embeddings_per_partition
+            ]
             self.embeddings_slice = (
-                self.base_layer.shard_indices.added_vocab_start_index -
-                self.base_layer.org_vocab_size,
-                self.base_layer.shard_indices.added_vocab_end_index -
-                self.base_layer.org_vocab_size)
+                self.base_layer.shard_indices.added_vocab_start_index
+                - self.base_layer.org_vocab_size,
+                self.base_layer.shard_indices.added_vocab_end_index
+                - self.base_layer.org_vocab_size,
+            )
             self.base_layer.weight.data[
-                self.base_layer.num_org_embeddings_per_partition:].fill_(0)
+                self.base_layer.num_org_embeddings_per_partition :
+            ].fill_(0)
         else:
             self.embeddings_slice = None
             self.embeddings_weights = None
@@ -59,8 +58,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
         self.lora_a_stacked = torch.zeros(
             (
                 max_loras,
-                self.base_layer.org_vocab_size +
-                lora_config.lora_extra_vocab_size,
+                self.base_layer.org_vocab_size + lora_config.lora_extra_vocab_size,
                 lora_config.max_lora_rank,
             ),
             dtype=lora_config.lora_dtype,
@@ -91,37 +89,35 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
     ):
         self.reset_lora(index)
         # NOTE self.lora_a_stacked is row-major, and lora_a is col-major,
         # so we need transpose here
-        self.lora_a_stacked[index, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-            lora_a.T, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
-                                lora_b, non_blocking=True)
+        self.lora_a_stacked[index, : lora_a.shape[1], : lora_a.shape[0]].copy_(
+            lora_a.T, non_blocking=True
+        )
+        self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
+            lora_b, non_blocking=True
+        )
         if embeddings_tensor is not None:
             self.embeddings_tensors[
                 index,
-                :embeddings_tensor.shape[0],
-                :embeddings_tensor.shape[1],
+                : embeddings_tensor.shape[0],
+                : embeddings_tensor.shape[1],
             ].copy_(embeddings_tensor, non_blocking=True)
             if self.embeddings_slice is not None:
                 # TODO(yard1): Optimize this copy, we don't need to copy
                 # everything, just the modified part
                 embeddings = self.embeddings_tensors.view(
-                    self.embeddings_tensors.shape[0] *
-                    self.embeddings_tensors.shape[1],
+                    self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1],
                     self.embeddings_tensors.shape[2],
-                )[self.embeddings_slice[0]:self.embeddings_slice[1]]
+                )[self.embeddings_slice[0] : self.embeddings_slice[1]]
                 assert self.embeddings_weights is not None
-                self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)
+                self.embeddings_weights[: embeddings.shape[0]].copy_(embeddings)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1,
-                                        1, 0)
+        added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, 1, 0)
 
         # NB: Don't use torch.narrow here. torch.narrow triggers some
         # Dynamic Shape specialization in torch.compile
@@ -133,26 +129,22 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
             x + indices_1,
             self.lora_a_stacked_2d,
         )
-        full_output = self.base_layer.forward(x +
-                                              (indices_0 * added_tokens_mask))
+        full_output = self.base_layer.forward(x + (indices_0 * added_tokens_mask))
 
         full_output_org = full_output
         if full_output.ndim == 3:
             full_output = full_output.view(
-                full_output.shape[0] * full_output.shape[1], -1)
+                full_output.shape[0] * full_output.shape[1], -1
+            )
         if full_lora_a_embeddings.ndim == 3:
             full_lora_a_embeddings = full_lora_a_embeddings.view(
-                full_lora_a_embeddings.shape[0] *
-                full_lora_a_embeddings.shape[1],
+                full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1],
                 -1,
             )
 
-        lora_output: Optional[
-            torch.Tensor] = self.punica_wrapper.add_lora_embedding(
-                full_output,
-                full_lora_a_embeddings,
-                self.lora_b_stacked,
-                add_input=True)
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_embedding(
+            full_output, full_lora_a_embeddings, self.lora_b_stacked, add_input=True
+        )
 
         if not current_platform.can_update_inplace():
             full_output = lora_output
@@ -165,7 +157,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is VocabParallelEmbedding
 
diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py
index 90e18217d28be7a14285fcb36493bc2e35c3908d..7691481d5039ecfbf2d1e3ce4fbbdc61f3dd3b32 100644
--- a/vllm/lora/lora_weights.py
+++ b/vllm/lora/lora_weights.py
@@ -8,7 +8,7 @@ import torch
 import torch.types
 
 from vllm.lora.peft_helper import PEFTHelper
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 
 class LoRALayerWeights:
@@ -21,16 +21,14 @@ class LoRALayerWeights:
         lora_alpha: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-        embeddings_tensor: Optional[torch.Tensor] = None,
-        scaling: Optional[float] = None,
+        embeddings_tensor: torch.Tensor | None = None,
+        scaling: float | None = None,
     ) -> None:
         self.module_name = module_name
         self.rank = rank
         self.lora_alpha = lora_alpha
         self.lora_a = lora_a
         self.lora_b = lora_b
-        self.bias = bias
         self.embeddings_tensor = embeddings_tensor
 
         if scaling is None:
@@ -60,61 +58,64 @@ class LoRALayerWeights:
 
     @property
     def extra_vocab_size(self) -> int:
-        return self.embeddings_tensor.shape[
-            0] if self.embeddings_tensor is not None else 0
+        return (
+            self.embeddings_tensor.shape[0] if self.embeddings_tensor is not None else 0
+        )
 
     @classmethod
     def from_config(
         cls,
         module_name: str,
         peft_helper: PEFTHelper,
-        embeddings_tensor: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None = None,
     ) -> "LoRALayerWeights":
-        return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None,
-                   None, None, embeddings_tensor,
-                   peft_helper.vllm_lora_scaling_factor)
+        # lora_a and lora_b are set to None for config-based construction
+        return cls(
+            module_name,
+            peft_helper.r,
+            peft_helper.lora_alpha,
+            None,
+            None,
+            embeddings_tensor,
+            peft_helper.vllm_lora_scaling_factor,
+        )
 
     @classmethod
     def create_dummy_lora_weights(
-            cls,
-            module_name: str,
-            input_dim: int,
-            output_dim: int,
-            rank: int,
-            dtype: torch.dtype,
-            device: torch.types.Device,
-            embeddings_tensor_dim: Optional[int] = None,
-            bias_enabled: Optional[bool] = False) -> "LoRALayerWeights":
+        cls,
+        module_name: str,
+        input_dim: int,
+        output_dim: int,
+        rank: int,
+        dtype: torch.dtype,
+        device: torch.types.Device,
+        embeddings_tensor_dim: int | None = None,
+    ) -> "LoRALayerWeights":
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
-        lora_a = torch.zeros([rank, input_dim],
-                             dtype=dtype,
-                             device=device,
-                             pin_memory=pin_memory)
-        lora_b = torch.zeros([output_dim, rank],
-                             dtype=dtype,
-                             device=device,
-                             pin_memory=pin_memory)
-        if bias_enabled:
-            bias = torch.zeros([output_dim],
-                               dtype=dtype,
-                               device=device,
-                               pin_memory=pin_memory)
-        else:
-            bias = None
-
-        embeddings_tensor = torch.rand(
-            10,
-            embeddings_tensor_dim,
-            dtype=dtype,
-            device=device,
-            pin_memory=pin_memory) if embeddings_tensor_dim else None
+        lora_a = torch.zeros(
+            [rank, input_dim], dtype=dtype, device=device, pin_memory=pin_memory
+        )
+        lora_b = torch.zeros(
+            [output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory
+        )
+
+        embeddings_tensor = (
+            torch.rand(
+                10,
+                embeddings_tensor_dim,
+                dtype=dtype,
+                device=device,
+                pin_memory=pin_memory,
+            )
+            if embeddings_tensor_dim
+            else None
+        )
         return cls(
             module_name,
             rank=rank,
             lora_alpha=1,
             lora_a=lora_a,
             lora_b=lora_b,
-            bias=bias,
             embeddings_tensor=embeddings_tensor,
         )
 
@@ -126,11 +127,10 @@ class PackedLoRALayerWeights(LoRALayerWeights):
         self,
         module_name: str,
         rank: int,
-        lora_alphas: list[Optional[int]],
-        lora_a: list[Optional[torch.Tensor]],
-        lora_b: list[Optional[torch.Tensor]],
-        bias: Optional[list[Optional[torch.Tensor]]] = None,
-        scaling: Optional[list[float]] = None,
+        lora_alphas: list[int | None],
+        lora_a: list[torch.Tensor | None],
+        lora_b: list[torch.Tensor | None],
+        scaling: list[float] | None = None,
     ) -> None:
         super().__init__(
             module_name=module_name,
@@ -138,7 +138,6 @@ class PackedLoRALayerWeights(LoRALayerWeights):
             lora_alpha=0,
             lora_a=lora_a,
             lora_b=lora_b,
-            bias=bias,
             scaling=scaling,  # type: ignore
             embeddings_tensor=None,
         )
@@ -170,11 +169,11 @@ class PackedLoRALayerWeights(LoRALayerWeights):
             [lora.lora_alpha if lora is not None else None for lora in loras],
             [lora.lora_a if lora is not None else None for lora in loras],
             [lora.lora_b if lora is not None else None for lora in loras],
-            [lora.bias if lora is not None else None for lora in loras],
             scaling=[
                 1 if lora is not None else None  # type: ignore
                 for lora in loras
-            ])
+            ],
+        )
         return obj
 
     def optimize(self) -> "PackedLoRALayerWeights":
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index cc64cc78affa77957f35c360ec72d4e24ab1945d..02c252f15bfabf3295267eb85ea8c4985b7e5ac1 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -3,8 +3,8 @@
 
 import math
 import os
-from collections.abc import Sequence
-from typing import Callable, Optional, TypeVar, Union
+from collections.abc import Callable
+from typing import TypeVar
 
 import regex as re
 import safetensors.torch
@@ -13,22 +13,26 @@ from torch import nn
 
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
-from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
+from vllm.lora.layers import BaseLayerWithLoRA, FusedMoEWithLoRA, LoRAMapping
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.punica_wrapper import get_punica_wrapper
-from vllm.lora.utils import (from_layer, from_layer_logits_processor,
-                             get_supported_lora_modules,
-                             is_regex_target_modules,
-                             parse_fine_tuned_lora_name, replace_submodule)
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.lora.utils import (
+    from_layer,
+    from_layer_logits_processor,
+    get_supported_lora_modules,
+    is_regex_target_modules,
+    parse_fine_tuned_lora_name,
+    process_packed_modules_mapping,
+    replace_submodule,
+)
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
-from vllm.model_executor.utils import get_packed_modules_mapping
-from vllm.utils import LRUCache, is_pin_memory_available
+from vllm.utils.cache import LRUCache
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
@@ -36,12 +40,11 @@ T = TypeVar("T")
 
 
 class AdapterLRUCache(LRUCache[int, T]):
-
     def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
         super().__init__(capacity)
         self.deactivate_fn = deactivate_fn
 
-    def _on_remove(self, key: int, value: Optional[T]):
+    def _on_remove(self, key: int, value: T | None):
         logger.debug("Removing adapter int id: %d", key)
         self.deactivate_fn(key)
         return super()._on_remove(key, value)
@@ -56,17 +59,6 @@ def get_lora_id():
     return _GLOBAL_LORA_ID
 
 
-def is_moe_model(model: nn.Module) -> bool:
-    """Checks if the model contains FusedMoE layers and warns the user."""
-    if any(isinstance(module, FusedMoE) for module in model.modules()):
-        logger.warning_once(
-            "For MoE models, vLLM currently does not support fused MoE LoRA "
-            "inference. Please ensure that the loaded LoRA model does not "
-            "contain expert weights.")
-        return True
-    return False
-
-
 class LoRAModel:
     """A LoRA fine-tuned model."""
 
@@ -85,9 +77,9 @@ class LoRAModel:
         """
         self.id = lora_model_id
 
-        assert (
-            lora_model_id
-            > 0), f"a valid lora id should be greater than 0, got {self.id}"
+        assert lora_model_id > 0, (
+            f"a valid lora id should be greater than 0, got {self.id}"
+        )
         self.rank = rank
         self.loras: dict[str, LoRALayerWeights] = loras
 
@@ -103,10 +95,13 @@ class LoRAModel:
 
     @property
     def extra_vocab_size(self) -> int:
-        return max(lora.extra_vocab_size
-                   for lora in self.loras.values()) if self.loras else 0
+        return (
+            max(lora.extra_vocab_size for lora in self.loras.values())
+            if self.loras
+            else 0
+        )
 
-    def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
+    def get_lora(self, module_name: str) -> LoRALayerWeights | None:
         """Get LoRA for a given module by name"""
         return self.loras.get(module_name, None)
 
@@ -121,63 +116,56 @@ class LoRAModel:
         tensors: dict[str, torch.Tensor],
         peft_helper: PEFTHelper,
         device: str = "cuda",
-        dtype: Optional[torch.dtype] = None,
-        embeddings: Optional[dict[str, torch.Tensor]] = None,
-        target_embedding_padding: Optional[int] = None,
-        embedding_modules: Optional[dict[str, str]] = None,
-        embedding_padding_modules: Optional[list[str]] = None,
-        weights_mapper: Optional[WeightsMapper] = None,
+        dtype: torch.dtype | None = None,
+        embeddings: dict[str, torch.Tensor] | None = None,
+        target_embedding_padding: int | None = None,
+        embedding_modules: dict[str, str] | None = None,
+        embedding_padding_modules: list[str] | None = None,
+        weights_mapper: WeightsMapper | None = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a dictionary of tensors."""
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
         loras: dict[str, LoRALayerWeights] = {}
         for tensor_name, tensor in tensors.items():
-            module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name(
-                tensor_name, weights_mapper)
+            module_name, is_lora_a = parse_fine_tuned_lora_name(
+                tensor_name, weights_mapper
+            )
             if module_name not in loras:
                 lora_embeddings_tensor = None
                 if embeddings:
                     assert embedding_modules is not None
                     embeddings_module = next(
-                        (k for k in embedding_modules if k in module_name),
-                        None)
+                        (k for k in embedding_modules if k in module_name), None
+                    )
                     if embeddings_module:
                         lora_embeddings_tensor = embeddings[
-                            embedding_modules[embeddings_module]].to(
-                                device=device, dtype=dtype)
+                            embedding_modules[embeddings_module]
+                        ].to(device=device, dtype=dtype)
                         if pin_memory:
-                            lora_embeddings_tensor = (
-                                lora_embeddings_tensor.pin_memory())
+                            lora_embeddings_tensor = lora_embeddings_tensor.pin_memory()
                 loras[module_name] = LoRALayerWeights.from_config(
-                    module_name, peft_helper, lora_embeddings_tensor)
+                    module_name, peft_helper, lora_embeddings_tensor
+                )
 
-            if is_bias:
-                loras[module_name].bias = tensor.to(device=device, dtype=dtype)
-                bias = tensor.to(device=device, dtype=dtype)
-                if pin_memory:
-                    bias = bias.pin_memory()
-                loras[module_name].bias = bias
-            elif is_lora_a:
-                loras[module_name].lora_a = tensor.to(device=device,
-                                                      dtype=dtype)
+            if is_lora_a:
+                loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
                 if pin_memory:
-                    loras[module_name].lora_a = loras[
-                        module_name].lora_a.pin_memory()
+                    loras[module_name].lora_a = loras[module_name].lora_a.pin_memory()
             else:
-                loras[module_name].lora_b = tensor.to(device=device,
-                                                      dtype=dtype)
+                loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
                 assert embedding_padding_modules is not None
-                if any(name in module_name
-                       for name in embedding_padding_modules
-                       ) and target_embedding_padding is not None:
+                if (
+                    any(name in module_name for name in embedding_padding_modules)
+                    and target_embedding_padding is not None
+                ):
                     lora_b = loras[module_name].lora_b
                     assert target_embedding_padding >= lora_b.shape[0]
                     addition = target_embedding_padding - lora_b.shape[0]
                     loras[module_name].lora_b = torch.nn.functional.pad(
-                        lora_b, (0, 0, 0, addition))
+                        lora_b, (0, 0, 0, addition)
+                    )
                 if pin_memory:
-                    loras[module_name].lora_b = loras[
-                        module_name].lora_b.pin_memory()
+                    loras[module_name].lora_b = loras[module_name].lora_b.pin_memory()
 
         for lora in loras.values():
             lora.optimize()
@@ -186,19 +174,20 @@ class LoRAModel:
 
     @classmethod
     def from_local_checkpoint(
-            cls,
-            lora_dir: str,
-            expected_lora_modules: list[str],
-            peft_helper: PEFTHelper,
-            *,
-            lora_model_id: Optional[int] = None,
-            device: str = "cuda",
-            dtype: Optional[torch.dtype] = None,
-            target_embedding_padding: Optional[int] = None,
-            embedding_modules: Optional[dict[str, str]] = None,
-            embedding_padding_modules: Optional[list[str]] = None,
-            weights_mapper: Optional[WeightsMapper] = None,
-            tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
+        cls,
+        lora_dir: str,
+        expected_lora_modules: list[str],
+        peft_helper: PEFTHelper,
+        *,
+        lora_model_id: int | None = None,
+        device: str = "cuda",
+        dtype: torch.dtype | None = None,
+        target_embedding_padding: int | None = None,
+        embedding_modules: dict[str, str] | None = None,
+        embedding_padding_modules: list[str] | None = None,
+        weights_mapper: WeightsMapper | None = None,
+        tensorizer_config_dict: dict | None = None,
+    ) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
 
         Args:
@@ -218,37 +207,49 @@ class LoRAModel:
         lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
         lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt")
         new_embeddings_tensor_path = os.path.join(
-            lora_dir, "new_embeddings.safetensors")
-        new_embeddings_bin_file_path = os.path.join(lora_dir,
-                                                    "new_embeddings.bin")
+            lora_dir, "new_embeddings.safetensors"
+        )
+        new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin")
         tensors: dict[str, torch.Tensor] = {}
-        unexpected_modules: list[Union[list[str], str]] = []
+        unexpected_modules: list[list[str] | str] = []
 
         def check_unexpected_modules(modules: dict):
             for lora_module in modules.keys():  # noqa
-                module_name, _, _ = parse_fine_tuned_lora_name(
-                    lora_module, weights_mapper)
-                part_name = module_name.split(".")[-1]
-                if part_name not in expected_lora_modules:
+                module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper)
+                # Handle FSDP file format where experts.base_layer is the
+                # gate_up_proj and experts is the down_proj
+                if "base_layer" in lora_module:
+                    continue
+                # Case for expert lora weights
+                if ".experts" in module_name:
+                    if not any(
+                        module_name.endswith(ele) for ele in expected_lora_modules
+                    ):
+                        unexpected_modules.append(module_name)
+                elif module_name.split(".")[-1] not in expected_lora_modules:
                     unexpected_modules.append(module_name)
+
             if unexpected_modules:
                 raise ValueError(
                     f"While loading {lora_dir}, expected"
                     f" target modules in {expected_lora_modules}"
                     f" but received {unexpected_modules}."
-                    f" Please verify that the loaded LoRA module is correct")
+                    f" Please verify that the loaded LoRA module is correct"
+                )
 
         if tensorizer_config_dict:
             from tensorizer import TensorDeserializer
 
             tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
-            lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir,
-                                            "adapter_model.tensors")
+            lora_tensor_path = os.path.join(
+                tensorizer_config.tensorizer_dir, "adapter_model.tensors"
+            )
             tensorizer_args = tensorizer_config._construct_tensorizer_args()
             tensors = TensorDeserializer(
                 lora_tensor_path,
                 dtype=tensorizer_config.dtype,
-                **tensorizer_args.deserialization_kwargs)
+                **tensorizer_args.deserialization_kwargs,
+            )
             check_unexpected_modules(tensors)
 
         elif os.path.isfile(lora_tensor_path):
@@ -259,14 +260,12 @@ class LoRAModel:
             # loraified. C won’t exist in the safetensor but it will exist in
             # the target_modules of the adapter_config.json.
             unexpected_modules = []
-            with safetensors.safe_open(lora_tensor_path,
-                                       framework="pt") as f:  # type: ignore
+            with safetensors.safe_open(lora_tensor_path, framework="pt") as f:  # type: ignore
                 # Load tensors if there are only expected modules.
                 check_unexpected_modules(f)
                 for module in f.keys():  # noqa
                     tensors[module] = f.get_tensor(module)
-        elif os.path.isfile(lora_bin_file_path) or os.path.isfile(
-                lora_pt_file_path):
+        elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
             # When a bin/pt file is provided, we rely on config to find
             # unexpected modules.
             unexpected_modules = []
@@ -284,33 +283,33 @@ class LoRAModel:
             # https://github.com/vllm-project/vllm/pull/5909. But there's no
             # other better mechanism.
             if unexpected_modules and not is_regex_target_modules(
-                    peft_helper.target_modules, expected_lora_modules):
+                peft_helper.target_modules, expected_lora_modules
+            ):
                 raise ValueError(
                     f"While loading {lora_dir}, expected"
                     f" target modules in {expected_lora_modules}"
                     f" but received {unexpected_modules}."
-                    f" Please verify that the loaded LoRA module is correct")
-            lora_file_path = (lora_bin_file_path
-                              if os.path.isfile(lora_bin_file_path) else
-                              lora_pt_file_path)
-            tensors = torch.load(lora_file_path,
-                                 map_location=device,
-                                 weights_only=True)
+                    f" Please verify that the loaded LoRA module is correct"
+                )
+            lora_file_path = (
+                lora_bin_file_path
+                if os.path.isfile(lora_bin_file_path)
+                else lora_pt_file_path
+            )
+            tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
         else:
             raise ValueError(f"{lora_dir} doesn't contain tensors")
 
         embeddings = None
         if os.path.isfile(new_embeddings_tensor_path):
-            embeddings = safetensors.torch.load_file(
-                new_embeddings_tensor_path)
+            embeddings = safetensors.torch.load_file(new_embeddings_tensor_path)
         elif os.path.isfile(new_embeddings_bin_file_path):
-            embeddings = torch.load(new_embeddings_bin_file_path,
-                                    map_location=device,
-                                    weights_only=True)
+            embeddings = torch.load(
+                new_embeddings_bin_file_path, map_location=device, weights_only=True
+            )
 
         return cls.from_lora_tensors(
-            lora_model_id=get_lora_id()
-            if lora_model_id is None else lora_model_id,
+            lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id,
             tensors=tensors,
             peft_helper=peft_helper,
             device=device,
@@ -319,7 +318,8 @@ class LoRAModel:
             target_embedding_padding=target_embedding_padding,
             embedding_modules=embedding_modules,
             embedding_padding_modules=embedding_padding_modules,
-            weights_mapper=weights_mapper)
+            weights_mapper=weights_mapper,
+        )
 
 
 class LoRAModelManager:
@@ -355,7 +355,7 @@ class LoRAModelManager:
         self.max_num_seqs = max_num_seqs
         assert self.capacity >= self.lora_slots
         self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
-        self.lora_index_to_id: list[Optional[int]] = [None] * self.lora_slots
+        self.lora_index_to_id: list[int | None] = [None] * self.lora_slots
         self.vocab_size = vocab_size
         self.punica_wrapper = get_punica_wrapper(
             max_num_batched_tokens,
@@ -368,19 +368,19 @@ class LoRAModelManager:
         assert self.supported_lora_modules, "No supported LoRA modules found in"
         f" {self.model.__class__.__name__}."
 
-        self.packed_modules_mapping = get_packed_modules_mapping(self.model)
+        self.packed_modules_mapping = process_packed_modules_mapping(self.model)
         # Used to indicate whether the model is a multimodal model
         self.supports_mm: bool = (
             supports_multimodal(self.model)
             # In case the model only supports LoRA for
             # text modules (e.g. ChatGLM)
-            and hasattr(self.model, "get_mm_mapping"))
+            and hasattr(self.model, "get_mm_mapping")
+        )
         self.is_pooling_model = is_pooling_model(self.model)
-        self.is_moe_model = is_moe_model(self.model)
         self.packed_modules: dict[str, list[str]] = {}
         self.modules: dict[str, BaseLayerWithLoRA] = {}
         # Dict instead of a set for compatibility with LRUCache.
-        self._last_mapping: Optional[LoRAMapping] = None
+        self._last_mapping: LoRAMapping | None = None
         self._create_lora_modules()
         self.model.lora_manager = self
 
@@ -407,33 +407,75 @@ class LoRAModelManager:
         if lora_id in self._active_adapters:
             return False
         first_free_slot = next(
-            ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id)
-             if lora_id is None), None)
+            (
+                (i, lora_id)
+                for i, lora_id in enumerate(self.lora_index_to_id)
+                if lora_id is None
+            ),
+            None,
+        )
         if first_free_slot is None:
             raise ValueError("No free lora slots")
         index, _ = first_free_slot
         self._active_adapters[lora_id] = None
         lora_model = self._registered_adapters[lora_id]
-        logger.debug("Activating LoRA. int id: %d, slot index: %d",
-                     lora_model.id, index)
+        logger.debug(
+            "Activating LoRA. int id: %d, slot index: %d", lora_model.id, index
+        )
         self.lora_index_to_id[index] = lora_model.id
         for module_name, module in self.modules.items():
             module_lora = self._get_lora_layer_weights(lora_model, module_name)
             if module_lora:
-                module_lora.optimize()
-                # Bias is not explicitly enabled with the flag enable_lora_bias.
-                bias = module_lora.bias
-                if ((torch.is_tensor(bias) or
-                     (isinstance(bias, Sequence) and any(b is not None
-                                                         for b in bias)))
-                        and not self.lora_config.bias_enabled):
-                    module_lora.bias = None
-                    raise ValueError(
-                        f"Adapter bias cannot be used for {module_name}"
-                        " without --enable-lora-bias.")
-                module.set_lora(index, module_lora.lora_a, module_lora.lora_b,
-                                module_lora.embeddings_tensor,
-                                module_lora.bias)
+                # Note (gnovack) - If MOE lora weights are not split into
+                # num_experts chunks, we split them here
+                if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor(
+                    module_lora.lora_a
+                ):
+                    # Handle FSDP file format where experts.base_layer is the
+                    # gate_up_proj and experts is the down_proj
+                    gate_up_proj_lora = self._get_lora_layer_weights(
+                        lora_model, module_name + ".base_layer"
+                    )
+
+                    assert gate_up_proj_lora is not None
+                    assert module_lora is not None
+
+                    down_proj_lora = module_lora
+                    num_experts = module_lora.lora_a.shape[0] // module_lora.rank
+
+                    gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
+                    up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
+
+                    gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk(
+                        num_experts, dim=-1
+                    )
+                    up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk(
+                        num_experts, dim=-1
+                    )
+
+                    down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0)
+                    down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1)
+
+                    lora_a = []
+                    lora_b = []
+                    for i in range(num_experts):
+                        lora_a.append(gate_proj_a[i])
+                        lora_a.append(down_proj_a[i])
+                        lora_a.append(up_proj_a[i])
+
+                        lora_b.append(gate_proj_b[i])
+                        lora_b.append(down_proj_b[i])
+                        lora_b.append(up_proj_b[i])
+
+                    module_lora.lora_a = lora_a
+                    module_lora.lora_b = lora_b
+
+                module.set_lora(
+                    index,
+                    module_lora.lora_a,
+                    module_lora.lora_b,
+                    module_lora.embeddings_tensor,
+                )
             else:
                 module.reset_lora(index)
         return True
@@ -453,7 +495,8 @@ class LoRAModelManager:
         """Pin a LoRAModel in the manager cache."""
         raise NotImplementedError(
             "Pinning is not supported in LoRAModelManager. "
-            "Use LRUCacheLoRAModelManager for pinning")  # type: ignore
+            "Use LRUCacheLoRAModelManager for pinning"
+        )  # type: ignore
 
     def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
         # update lora states
@@ -472,18 +515,17 @@ class LoRAModelManager:
         self._active_adapters.clear()
 
     def _create_lora_modules(self):
-
         def _parent_module(module_name: str) -> str:
             # module name is a dot separated name.
             # for example:
             #  - given an input 'x.y.z' return 'x.y'
             #  - given an input 'x' return ''
-            return module_name.rpartition('.')[0]
+            return module_name.rpartition(".")[0]
 
-        for module_name, module in self.model.named_modules(
-                remove_duplicate=False):
+        for module_name, module in self.model.named_modules(remove_duplicate=False):
             if isinstance(module, PPMissingLayer):
                 continue
+
             if not self._match_target_modules(module_name):
                 continue
             # A temporary approach for multimodal models to support LoRA
@@ -498,35 +540,48 @@ class LoRAModelManager:
             parts = module_name.split(".")[-1]
             packed_moduled_lst = self.packed_modules_mapping.get(parts, [])
             new_module = replace_submodule(
-                self.model, module_name,
-                from_layer(module, self.lora_slots, self.lora_config,
-                           packed_moduled_lst, self.model.config))
+                self.model,
+                module_name,
+                from_layer(
+                    module,
+                    self.lora_slots,
+                    self.lora_config,
+                    packed_moduled_lst,
+                    self.model.config,
+                ),
+            )
 
             # (yard1): TODO make this more robust
             if "lm_head" in module_name:
-                logits_processor_module_name = 'logits_processor'
+                logits_processor_module_name = "logits_processor"
                 parent_module = _parent_module(module_name)
                 if parent_module:
                     logits_processor_module_name = (
-                        f"{parent_module}.{logits_processor_module_name}")
+                        f"{parent_module}.{logits_processor_module_name}"
+                    )
 
                 logits_processor_module = self.model.get_submodule(
-                    logits_processor_module_name)
+                    logits_processor_module_name
+                )
 
                 new_module = replace_submodule(
-                    self.model, logits_processor_module_name,
-                    from_layer_logits_processor(logits_processor_module,
-                                                module, self.lora_slots,
-                                                self.lora_config,
-                                                self.model.config))
+                    self.model,
+                    logits_processor_module_name,
+                    from_layer_logits_processor(
+                        logits_processor_module,
+                        module,
+                        self.lora_slots,
+                        self.lora_config,
+                        self.model.config,
+                    ),
+                )
 
             # In some models, especially multimodal ones, layers with the same
             # name may have different types, such as nn.Linear and
             # ReplicatedLinear. The nn.Linear layers cannot be replaced with
             # LoRA layers, leading to assertion error. The following check
             # aims to prevent this error
-            if self.supports_mm and not isinstance(new_module,
-                                                   BaseLayerWithLoRA):
+            if self.supports_mm and not isinstance(new_module, BaseLayerWithLoRA):
                 continue
             self.register_module(module_name, new_module)
             self._register_packed_modules(module_name)
@@ -534,37 +589,47 @@ class LoRAModelManager:
             new_module.set_mapping(self.punica_wrapper)
 
     def register_module(self, module_name: str, module: "BaseLayerWithLoRA"):
-        assert isinstance(module, BaseLayerWithLoRA)
+        assert isinstance(module, BaseLayerWithLoRA), (
+            f"Module {module_name} must be a BaseLayerWithLoRA instance,"
+        )
+        f" got {type(module)}"
         self.modules[module_name] = module
 
     def create_dummy_lora(
-            self,
-            lora_id: int,
-            rank: int,
-            embedding_modules: Optional[dict[str, str]] = None) -> LoRAModel:
+        self,
+        lora_id: int,
+        rank: int,
+        embedding_modules: dict[str, str] | None = None,
+    ) -> LoRAModel:
         """Create zero-initialized LoRAModel for warmup."""
         model = LoRAModel(lora_id, rank, {})
         for module_name, module in self.model.named_modules():
-            bias_enabled = self.lora_config.bias_enabled
-            if (not self._match_target_modules(module_name)
-                    or not isinstance(module, BaseLayerWithLoRA)
-                    or self._filter_unsupported_mm_module(module_name)):
+            if (
+                not self._match_target_modules(module_name)
+                or not isinstance(module, BaseLayerWithLoRA)
+                or self._filter_unsupported_mm_module(module_name)
+            ):
                 continue
             parts = module_name.split(".")
             if module_name not in self.packed_modules:
                 assert embedding_modules is not None
                 if parts[-1] in embedding_modules:
-                    input_dim = (module.base_layer.org_vocab_size +
-                                 self.lora_config.lora_extra_vocab_size if
-                                 hasattr(module.base_layer, "org_vocab_size")
-                                 else module.base_layer.weight.shape[1])
-                    output_dim = module.base_layer.embedding_dim if hasattr(
-                        module.base_layer,
-                        "embedding_dim") else module.base_layer.weight.shape[0]
-                    embeddings_tensor_dim = (module.base_layer.embedding_dim if
-                                             hasattr(module.base_layer,
-                                                     "embedding_dim") else
-                                             module.base_layer.weight.shape[1])
+                    input_dim = (
+                        module.base_layer.org_vocab_size
+                        + self.lora_config.lora_extra_vocab_size
+                        if hasattr(module.base_layer, "org_vocab_size")
+                        else module.base_layer.weight.shape[1]
+                    )
+                    output_dim = (
+                        module.base_layer.embedding_dim
+                        if hasattr(module.base_layer, "embedding_dim")
+                        else module.base_layer.weight.shape[0]
+                    )
+                    embeddings_tensor_dim = (
+                        module.base_layer.embedding_dim
+                        if hasattr(module.base_layer, "embedding_dim")
+                        else module.base_layer.weight.shape[1]
+                    )
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name,
                         input_dim,
@@ -573,7 +638,7 @@ class LoRAModelManager:
                         module.lora_a_stacked[0].dtype,
                         "cpu",
                         embeddings_tensor_dim=embeddings_tensor_dim,
-                        bias_enabled=bias_enabled)
+                    )
                 else:
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name,
@@ -582,12 +647,11 @@ class LoRAModelManager:
                         rank,
                         module.lora_a_stacked[0].dtype,
                         "cpu",
-                        bias_enabled=bias_enabled,
                     )
             else:
                 parts = module_name.split(".")
                 replacements = self.packed_modules_mapping[parts[-1]]
-                subloras: list[Optional[LoRALayerWeights]] = []
+                subloras: list[LoRALayerWeights | None] = []
                 for i, r in enumerate(replacements):
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name + "." + r,
@@ -596,7 +660,6 @@ class LoRAModelManager:
                         rank,
                         module.lora_a_stacked[i].dtype,
                         "cpu",
-                        bias_enabled=bias_enabled,
                     )
                     subloras.append(lora)
                 lora = PackedLoRALayerWeights.pack(subloras)
@@ -606,9 +669,11 @@ class LoRAModelManager:
     def _match_target_modules(self, module_name: str):
         return any(
             re.match(
-                r".*\.{target_module}$".format(target_module=target_module),
-                module_name) or target_module == module_name
-            for target_module in self.supported_lora_modules)
+                r".*\.{target_module}$".format(target_module=target_module), module_name
+            )
+            or target_module == module_name
+            for target_module in self.supported_lora_modules
+        )
 
     def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         """
@@ -619,8 +684,7 @@ class LoRAModelManager:
         if self.supports_mm:
             module_mapping: MultiModelKeys = self.model.get_mm_mapping()
             prefix_lst = module_mapping.connector + module_mapping.tower_model
-            return any(
-                [module_name.startswith(prefix) for prefix in prefix_lst])
+            return any([module_name.startswith(prefix) for prefix in prefix_lst])
         return False
 
     def _register_packed_modules(self, module_full_name: str) -> None:
@@ -638,7 +702,7 @@ class LoRAModelManager:
 
     def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
         for module_name, new_module_names in self.packed_modules.items():
-            replacement_loras: list[Optional[LoRALayerWeights]] = []
+            replacement_loras: list[LoRALayerWeights | None] = []
             replaced_module: set[str] = set()
             has_replacement = False
             for r in new_module_names:
@@ -654,23 +718,22 @@ class LoRAModelManager:
                     continue
                 replacement_loras[i] = None
             # HACK Temporary solution for the pool model.
-            if self.is_pooling_model and not lora_model.check_lora_name(
-                    module_name):
+            if self.is_pooling_model and not lora_model.check_lora_name(module_name):
                 replaced_module_name = module_name.replace("model.", "")
                 if lora_model.check_lora_name(module_name):
                     module_name = replaced_module_name
             lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
-                replacement_loras)
+                replacement_loras
+            )
             # Remove the modules that have been replaced.
             for module in replaced_module:
                 lora_model.loras.pop(module, None)
 
     def _get_lora_layer_weights(
-            self, lora_model: LoRAModel,
-            module_name: str) -> Optional[LoRALayerWeights]:
+        self, lora_model: LoRAModel, module_name: str
+    ) -> LoRALayerWeights | None:
         org_module_name = module_name
-        if self.is_pooling_model and not lora_model.check_lora_name(
-                module_name):
+        if self.is_pooling_model and not lora_model.check_lora_name(module_name):
             # If it's a pool model, and the layer name is not found,
             # remove the prefix 'model.' and search again.
             module_name = module_name.replace("model.", "")
@@ -678,7 +741,8 @@ class LoRAModelManager:
                 org_module_name = module_name
                 logger.info_once(
                     "For the pool model, successfully loaded the LoRA weights "
-                    "after removing the prefix 'model.'.")
+                    "after removing the prefix 'model.'."
+                )
         return lora_model.get_lora(org_module_name)
 
     def deactivate_adapter(self, adapter_id: int) -> bool:
@@ -689,8 +753,7 @@ class LoRAModelManager:
         return True
 
     def add_adapter(self, adapter: LoRAModel) -> bool:
-        logger.debug("Adding lora. Model id: %d, "
-                     "int id: %d", adapter.id, adapter.id)
+        logger.debug("Adding lora. Model id: %d, int id: %d", adapter.id, adapter.id)
         if adapter.id in self._registered_adapters:
             return False
         if len(self._registered_adapters) >= self.capacity:
@@ -713,29 +776,36 @@ class LoRAModelManager:
     def list_adapters(self) -> dict[int, LoRAModel]:
         return dict(self._registered_adapters)
 
-    def get_adapter(self, adapter_id: int) -> Optional[LoRAModel]:
+    def get_adapter(self, adapter_id: int) -> LoRAModel | None:
         return self._registered_adapters.get(adapter_id)
 
 
 class LoRALRUCache(AdapterLRUCache[LoRAModel]):
-
-    def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int],
-                                                                   bool]):
+    def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], bool]):
         super().__init__(capacity, deactivate_lora_fn)
 
 
 class LRUCacheLoRAModelManager(LoRAModelManager):
     """A model manager that manages multiple LoRAs with LRU cache."""
 
-    def __init__(self, model: nn.Module, max_num_seqs: int,
-                 max_num_batched_tokens: int, vocab_size: int,
-                 lora_config: LoRAConfig, device: torch.device):
-        super().__init__(model, max_num_seqs, max_num_batched_tokens,
-                         vocab_size, lora_config, device)
+    def __init__(
+        self,
+        model: nn.Module,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        vocab_size: int,
+        lora_config: LoRAConfig,
+        device: torch.device,
+    ):
+        super().__init__(
+            model, max_num_seqs, max_num_batched_tokens, vocab_size, lora_config, device
+        )
         self._registered_adapters: LoRALRUCache = LoRALRUCache(
-            self.capacity, self.deactivate_adapter)
+            self.capacity, self.deactivate_adapter
+        )
         self._active_adapters: LoRALRUCache = LoRALRUCache(
-            self.lora_slots, self._deactivate_adapter)
+            self.lora_slots, self._deactivate_adapter
+        )
 
     def list_adapters(self) -> dict[int, LoRAModel]:
         """List all registered LoRAModels."""
@@ -743,8 +813,7 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
 
     def add_adapter(self, lora: LoRAModel) -> bool:
         """Add a LoRAModel to the manager."""
-        logger.debug("Adding lora. Model id: %d, "
-                     "int id: %d", lora.id, lora.id)
+        logger.debug("Adding lora. Model id: %d, int id: %d", lora.id, lora.id)
         if lora.id not in self._registered_adapters:
             self._add_adapter(lora)
             was_added = True
@@ -758,8 +827,10 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
         self,
         lora_id: int,
     ) -> bool:
-        if lora_id not in self._active_adapters and len(
-                self._active_adapters) >= self.lora_slots:
+        if (
+            lora_id not in self._active_adapters
+            and len(self._active_adapters) >= self.lora_slots
+        ):
             self._active_adapters.remove_oldest()
         result = super().activate_adapter(lora_id)
         # We always touch to update the LRU cache order
@@ -782,8 +853,9 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
         try:
             self._registered_adapters.pin(lora_id)
         except ValueError as err:
-            raise ValueError("Pinning failed. "
-                             f"LoRA {lora_id} is not registered.") from err
+            raise ValueError(
+                f"Pinning failed. LoRA {lora_id} is not registered."
+            ) from err
 
     def _pin_lora_in_gpu_cache(self, lora_id: int):
         if lora_id not in self._active_adapters:
@@ -794,14 +866,15 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
 
 
 def create_lora_manager(
-        model: nn.Module,
-        max_num_seqs: int,
-        max_num_batched_tokens: int,
-        vocab_size: int,
-        lora_config: LoRAConfig,
-        device: torch.device,
-        lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
-        **kwargs) -> LoRAModelManager:
+    model: nn.Module,
+    max_num_seqs: int,
+    max_num_batched_tokens: int,
+    vocab_size: int,
+    lora_config: LoRAConfig,
+    device: torch.device,
+    lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
+    **kwargs,
+) -> LoRAModelManager:
     """Create a LoRA adapter for a given model."""
     if not isinstance(model, SupportsLoRA):
         raise ValueError(f"Model {type(model)} is not supported for LoRA.")
@@ -812,5 +885,6 @@ def create_lora_manager(
         vocab_size=vocab_size,
         lora_config=lora_config,
         device=device,
-        **kwargs)
+        **kwargs,
+    )
     return lora_manager
diff --git a/vllm/lora/ops/ipex_ops/__init__.py b/vllm/lora/ops/ipex_ops/__init__.py
index 5daa432493b1993f0658125f5c0ab722d08d8c1d..f5a5e0e6f951f3140764552dafb64a44a55609f3 100644
--- a/vllm/lora/ops/ipex_ops/__init__.py
+++ b/vllm/lora/ops/ipex_ops/__init__.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.lora.ops.ipex_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
-                                             bgmv_shrink)
+from vllm.lora.ops.ipex_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
 
 __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
diff --git a/vllm/lora/ops/ipex_ops/lora_ops.py b/vllm/lora/ops/ipex_ops/lora_ops.py
index 7590c868ecb67829c58fe32b94829a7daecb945e..0767f90b2f9e7ed49e444a2d6d172f29ad81accb 100644
--- a/vllm/lora/ops/ipex_ops/lora_ops.py
+++ b/vllm/lora/ops/ipex_ops/lora_ops.py
@@ -13,32 +13,45 @@ except ImportError as e:
     raise e
 
 
-def bgmv_shrink(inputs: torch.Tensor,
-                lora_a_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                scaling: float = 1.0) -> None:
-
-    ipex.llm.functional.bgmv_shrink(inputs, lora_a_weights, output_tensor,
-                                    lora_indices_tensor, scaling)
-
-
-def bgmv_expand(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                add_inputs: bool = True) -> None:
-    ipex.llm.functional.bgmv_expand(inputs, lora_b_weights, output_tensor,
-                                    lora_indices_tensor, add_inputs)
-
-
-def bgmv_expand_slice(inputs: torch.Tensor,
-                      lora_b_weights: torch.Tensor,
-                      output_tensor: torch.Tensor,
-                      lora_indices_tensor: torch.Tensor,
-                      slice_offset: int,
-                      slice_size: int,
-                      add_inputs: bool = True) -> None:
-    ipex.llm.functional.bgmv_expand_slice(inputs, lora_b_weights,
-                                          output_tensor, lora_indices_tensor,
-                                          slice_offset, slice_size, add_inputs)
+def bgmv_shrink(
+    inputs: torch.Tensor,
+    lora_a_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    scaling: float = 1.0,
+) -> None:
+    ipex.llm.functional.bgmv_shrink(
+        inputs, lora_a_weights, output_tensor, lora_indices_tensor, scaling
+    )
+
+
+def bgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    add_inputs: bool = True,
+) -> None:
+    ipex.llm.functional.bgmv_expand(
+        inputs, lora_b_weights, output_tensor, lora_indices_tensor, add_inputs
+    )
+
+
+def bgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = True,
+) -> None:
+    ipex.llm.functional.bgmv_expand_slice(
+        inputs,
+        lora_b_weights,
+        output_tensor,
+        lora_indices_tensor,
+        slice_offset,
+        slice_size,
+        add_inputs,
+    )
diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py
index 22aa3c63dce193522a73d63de8857916fad86d41..89865af4e9b892beaef85a01af98ff0848d03197 100644
--- a/vllm/lora/ops/torch_ops/__init__.py
+++ b/vllm/lora/ops/torch_ops/__init__.py
@@ -1,10 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand  # noqa: F401
-from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
-                                              sgmv_expand, sgmv_expand_slice,
-                                              sgmv_shrink)
+from vllm.lora.ops.torch_ops.lora_ops import (
+    bgmv_expand,  # noqa: F401
+    bgmv_expand_slice,
+    bgmv_shrink,
+    sgmv_expand,
+    sgmv_expand_slice,
+    sgmv_shrink,
+)
 
 __all__ = [
     "bgmv_expand",
diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
index cba5baad8668613df85be14af0b26e39d834dc08..4fc6248d5448e093f294acb36fca8628ba0444ff 100644
--- a/vllm/lora/ops/torch_ops/lora_ops.py
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -4,30 +4,31 @@
 import torch
 
 
-def sgmv_expand(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                b_seq_start_loc: torch.Tensor,
-                seq_len_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                batches: int,
-                max_seq_length: int,
-                token_nums: int,
-                add_inputs: bool = False):
-    exploded_indices = torch.repeat_interleave(lora_indices_tensor,
-                                               seq_len_tensor)
-
-    bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices,
-                add_inputs)
-
-
-def bgmv_expand(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                add_inputs: bool = True):
-    selected_loras = lora_b_weights[lora_indices_tensor].to(
-        dtype=output_tensor.dtype)
+def sgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    token_nums: int,
+    add_inputs: bool = False,
+):
+    exploded_indices = torch.repeat_interleave(lora_indices_tensor, seq_len_tensor)
+
+    bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices, add_inputs)
+
+
+def bgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    add_inputs: bool = True,
+):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype)
     if len(selected_loras.shape) == 4:
         selected_loras = selected_loras.squeeze(dim=1)
     inputs = inputs.to(dtype=output_tensor.dtype)
@@ -58,62 +59,70 @@ def sgmv_shrink(
     token_nums: int,
     scaling: float,
 ):
-    exploded_indices = torch.repeat_interleave(lora_indices_tensor,
-                                               seq_len_tensor)
+    exploded_indices = torch.repeat_interleave(lora_indices_tensor, seq_len_tensor)
 
-    bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices,
-                scaling)
+    bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices, scaling)
 
 
-def bgmv_shrink(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                scaling: float = 1.0):
-    selected_loras = lora_b_weights[lora_indices_tensor].to(
-        dtype=output_tensor.dtype)
+def bgmv_shrink(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    scaling: float = 1.0,
+):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype)
     if len(selected_loras.shape) == 4:
         selected_loras = selected_loras.squeeze(dim=1)
     inputs = inputs.to(dtype=output_tensor.dtype)
     outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
 
-    output_tensor[:, :outputs.shape[1]] = scaling * outputs[:]
-
-
-def sgmv_expand_slice(inputs: torch.Tensor,
-                      lora_b_weights: torch.Tensor,
-                      output_tensor: torch.Tensor,
-                      b_seq_start_loc: torch.Tensor,
-                      seq_len_tensor: torch.Tensor,
-                      lora_indices_tensor: torch.Tensor,
-                      batches: int,
-                      max_seq_length: int,
-                      token_nums: int,
-                      slice_offset: int,
-                      slice_size: int,
-                      add_inputs: bool = False):
-    exploded_indices = torch.repeat_interleave(lora_indices_tensor,
-                                               seq_len_tensor)
-
-    bgmv_expand_slice(inputs, lora_b_weights, output_tensor, exploded_indices,
-                      slice_offset, slice_size, add_inputs)
-
-
-def bgmv_expand_slice(inputs: torch.Tensor,
-                      lora_b_weights: torch.Tensor,
-                      output_tensor: torch.Tensor,
-                      lora_indices_tensor: torch.Tensor,
-                      slice_offset: int,
-                      slice_size: int,
-                      add_inputs: bool = True):
-    selected_loras = lora_b_weights[lora_indices_tensor].to(
-        dtype=output_tensor.dtype)
+    output_tensor[:, : outputs.shape[1]] = scaling * outputs[:]
+
+
+def sgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    token_nums: int,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = False,
+):
+    exploded_indices = torch.repeat_interleave(lora_indices_tensor, seq_len_tensor)
+
+    bgmv_expand_slice(
+        inputs,
+        lora_b_weights,
+        output_tensor,
+        exploded_indices,
+        slice_offset,
+        slice_size,
+        add_inputs,
+    )
+
+
+def bgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = True,
+):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype)
     inputs = inputs.to(dtype=output_tensor.dtype)
     if len(selected_loras.shape) == 4:
         selected_loras = selected_loras.squeeze(dim=1)
     outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
 
     if add_inputs:
-        output_tensor[:, slice_offset:slice_offset + slice_size] += outputs[:]
+        output_tensor[:, slice_offset : slice_offset + slice_size] += outputs[:]
     else:
-        output_tensor[:, slice_offset:slice_offset + slice_size] = outputs[:]
+        output_tensor[:, slice_offset : slice_offset + slice_size] = outputs[:]
diff --git a/vllm/lora/ops/triton_ops/README_TUNING.md b/vllm/lora/ops/triton_ops/README_TUNING.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ebe1fd7c3700c2680743f964b7a1f9a70fdfc0d
--- /dev/null
+++ b/vllm/lora/ops/triton_ops/README_TUNING.md
@@ -0,0 +1,60 @@
+# Multi-LoRA Tuning
+
+**Note**: The LoRA configuration folder should be specified by exporting `VLLM_TUNED_CONFIG_FOLDER=/path/to/configs`.
+Without this, the shrink/expand kernels will use default configurations.
+
+## Tuning Process
+
+Multi-lora shrink/expand Triton kernel tuning follows a similar methodology from
+[Triton MoE tuning](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py).
+
+1. Define the searching space. Here is an example of searching space:
+
+   ```python
+   block_m_range = [16, 32, 64, 128, 256]
+   block_n_range = [32, 64, 128, 256]
+   block_k_range = [32, 64, 128, 256]
+   num_warps_range = [4, 8]
+   num_stage_range = [2, 3, 4, 5]
+   num_ctas_range = [1]
+   split_k_range = [4, 8, 16, 32, 64]
+   ```
+
+2. Get all hidden_state sizes and num_slices that the target model uses for a specific TP size.
+
+   For example, you can acquire the info by simply checking
+   [add_lora_linear](https://github.com/vllm-project/vllm/blob/main/vllm/lora/punica_wrapper/punica_gpu.py#L181):
+
+   ```python
+   print(f"x_shape: {x.view(-1, x.shape[-1]).shape}")
+   print(f"num_slices: {len(output_slices)}")
+   for i in range(len(output_slices)):
+       print(f"a{i} shape: {lora_a_stacked[i].shape}")
+       print(f"b{i} shape: {lora_b_stacked[i].shape}")
+   print("y_shape", y.shape)
+   ```
+
+3. Benchmark the shrink/expand kernel runtime with different kernel configurations generated from the pre-defined search space
+   by performing a grid search to find the optimal kernel configuration.
+   vLLM's [benchmark_lora.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_lora.py)
+   can be used to search for configurations for different shapes.
+
+## Config Files
+
+### File Naming
+
+| Kernel Type               | File Name Template                          | Example                                     |
+|---------------------------|--------------------------------------------|---------------------------------------------|
+| shrink                    | `{gpu_name}_SHRINK.json`                   | `NVIDIA_H200_SHRINK.json`                  |
+| expand                    | `{gpu_name}_EXPAND_{add_input}.json`       | `NVIDIA_H200_EXPAND_TRUE.json`             |
+| fused_moe_lora_w13_shrink | `{gpu_name}_FUSED_MOE_LORA_W13_SHRINK.json` | `NVIDIA_H200_FUSED_MOE_LORA_W13_SHRINK.json` |
+| fused_moe_lora_w13_expand | `{gpu_name}_FUSED_MOE_LORA_W13_EXPAND.json` | `NVIDIA_H200_FUSED_MOE_LORA_W13_EXPAND.json` |
+| fused_moe_lora_w2_shrink  | `{gpu_name}_FUSED_MOE_LORA_W2_SHRINK.json`  | `NVIDIA_H200_FUSED_MOE_LORA_W2_SHRINK.json` |
+| fused_moe_lora_w2_expand  | `{gpu_name}_FUSED_MOE_LORA_W2_EXPAND.json`  | `NVIDIA_H200_FUSED_MOE_LORA_W2_EXPAND.json` |
+
+The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()`.
+
+### JSON Structure
+
+Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n][i]`,
+where `i` is an optional dimension in the `fused_moe_lora` configuration, representing the intermediate size of the MoE layer.
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
index 805de4b6f657034a9029d5c052e27023b387b00f..7e8b9a79add39c0a18ffbe1be6e806b5d4a56f04 100644
--- a/vllm/lora/ops/triton_ops/__init__.py
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -1,6 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+
+from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+    fused_moe_lora,
+    fused_moe_lora_expand,
+    fused_moe_lora_shrink,
+)
 from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
 from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
@@ -9,4 +15,7 @@ __all__ = [
     "lora_expand",
     "lora_shrink",
     "LoRAKernelMeta",
+    "fused_moe_lora",
+    "fused_moe_lora_shrink",
+    "fused_moe_lora_expand",
 ]
diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2dd47dbb4e6491c68d681ad14c7cd7da6335824
--- /dev/null
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -0,0 +1,641 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.triton_utils import tl, triton
+from vllm.utils.torch_utils import direct_register_custom_op
+
+from .utils import supports_pdl
+
+_LORA_PTR_DICT: dict[tuple[int, ...], torch.tensor] = {}
+
+
+def _get_ptr(lora_weights: list[torch.Tensor], device: torch.device):
+    """
+    `_LORA_PTR_DICT` collects the required information during `profile_run`,
+    After this, it remains constant and subsequent usage is through LUT.
+    Refer to:
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+    """
+    key = tuple(lora_weight.data_ptr() for lora_weight in lora_weights)
+
+    if (ptr_tensor := _LORA_PTR_DICT.get(key)) is not None:
+        return ptr_tensor
+
+    tensor_ptrs = []
+    for lora_weight in lora_weights:
+        tensor_ptrs.append(lora_weight.data_ptr())
+    ptr_tensor = torch.tensor(tensor_ptrs, device=device, dtype=torch.uint64)
+
+    _LORA_PTR_DICT[key] = ptr_tensor
+    return _LORA_PTR_DICT.get(key)
+
+
+@triton.jit(
+    do_not_specialize=[
+        "num_valid_tokens",
+        "EM",
+        "stride_tl",
+        "stride_el",
+        "slice_a_size",
+        "slice_c_size",
+    ]
+)
+def _fused_moe_lora_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    topk_weights_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    num_tokens_post_padded_ptr,
+    # Matrix dimensions
+    N,
+    K,
+    EM,
+    num_valid_tokens,
+    num_experts,
+    lora_ids,
+    adapter_enabled,
+    # The stride variables represent how much to increase the ptr by when
+    # moving by 1 element in a particular dimension. E.g. `stride_am` is
+    # how much to increase `a_ptr` by to get the element one row down
+    # (A has M rows).
+    stride_am,
+    stride_ak,
+    stride_bl,
+    stride_be,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_tl,
+    stride_el,
+    slice_a_size,
+    slice_c_size,
+    # Meta-parameters
+    num_slice_a: tl.constexpr,
+    num_slice_c: tl.constexpr,
+    top_k: tl.constexpr,
+    MUL_ROUTED_WEIGHT: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+    USE_GDC: tl.constexpr,
+    launch_pdl: tl.constexpr,
+    IS_PRIMARY: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    slice_id = tl.program_id(axis=1)
+    lora_idx = tl.program_id(axis=2)
+    lora_id = tl.load(lora_ids + lora_idx)
+    moe_enabled = tl.load(adapter_enabled + lora_id)
+    if lora_id == -1 or moe_enabled == 0:
+        # Early exit for the no-lora case.
+        return
+    max_loras = tl.num_programs(axis=2)
+    grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
+
+    # calculate pid_m,pid_n
+    pid_sk = pid % SPLIT_K
+    pid_m_n = pid // SPLIT_K
+    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid_m_n // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m)
+    pid_n = (pid_m_n % num_pid_in_group) // group_size_m
+
+    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr + lora_id)
+    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
+        return
+    # get the expert_id to process curr shard
+    ind = lora_id * stride_el + pid_m
+    expert_id = tl.load(expert_ids_ptr + ind, ind < max_loras * stride_el, -1)
+    if expert_id == -1:
+        return
+    # get a_ptr,b_ptr,c_ptr
+    cur_a_ptr = a_ptr + (slice_id % num_slice_a) * slice_a_size
+    cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
+    cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size
+
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    token_ind = stride_tl * lora_id + offs_token_id
+    offs_token = tl.load(
+        sorted_token_ids_ptr + token_ind, token_ind < max_loras * stride_tl, 0
+    )
+    token_mask = offs_token < num_valid_tokens
+
+    # get a_ptrs,b_ptrs
+    a_ptrs = cur_a_ptr + (
+        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+    )
+
+    b_ptrs = (
+        cur_b_ptr
+        + lora_id * stride_bl
+        + expert_id * stride_be
+        + offs_k[:, None] * stride_bk
+        + offs_bn[None, :] * stride_bn
+    )
+
+    # accumulator
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, grid_k):
+        k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K)
+        # pre-fetch lora weight
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
+        # GDC wait waits for ALL programs in the prior kernel to complete
+        # before continuing.
+        if USE_GDC and not IS_PRIMARY:
+            tl.extra.cuda.gdc_wait()
+        a = tl.load(
+            a_ptrs,
+            mask=token_mask[:, None] & (offs_k[None, :] < k_remaining),
+            other=0.0,
+        )
+        accumulator += tl.dot(a, b)
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk
+
+    if MUL_ROUTED_WEIGHT:
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
+        accumulator = accumulator * moe_weight[:, None]
+    if USE_GDC and IS_PRIMARY:
+        # GDC launch dependents hints the runtime system to launch dependent kernels.
+        tl.extra.cuda.gdc_launch_dependents()
+    accumulator = accumulator.to(c_ptr.dtype.element_ty)
+    # Write back the block of the output
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = cur_c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+
+    if SPLIT_K == 1:
+        tl.store(c_ptrs, accumulator, mask=c_mask)
+    else:
+        tl.atomic_add(c_ptrs, accumulator, mask=c_mask, sem="relaxed")
+
+
+@torch.inference_mode()
+def _fused_moe_lora_shrink(
+    a_intermediate_cache1: torch.Tensor,
+    # (num_slices, num_tokens, top_k_num, max_lora_rank)
+    qcurr_hidden_states: torch.Tensor,  # (num_tokens, K,)
+    lora_a_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
+    topk_weights: torch.Tensor,  # (num_tokens, top_k_num)
+    sorted_token_ids: torch.Tensor,  # (max_loras, _)
+    expert_ids: torch.Tensor,  # (max_loras, _ ,)
+    num_tokens_post_padded: torch.Tensor,  # (max_loras, )
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    ## adding for kernel
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
+    block_size_m: int,
+    block_size_n: int,
+    block_size_k: int,
+    group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    w1_lora_a_stacked = lora_a_stacked[0]
+    use_gdc = supports_pdl(qcurr_hidden_states.device)
+    shrink_config = {
+        "BLOCK_SIZE_M": block_size_m,
+        "BLOCK_SIZE_N": block_size_n,
+        "BLOCK_SIZE_K": block_size_k,
+        "GROUP_SIZE_M": group_size_m,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+        "SPLIT_K": split_k,
+        "USE_GDC": use_gdc,
+        "launch_pdl": use_gdc,  # triton kernel metadata
+    }
+
+    b_ptr = _get_ptr(lora_a_stacked, device)
+
+    grid = lambda META: (
+        split_k
+        * triton.cdiv(EM, META["BLOCK_SIZE_M"])
+        * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        len(lora_a_stacked),
+        lora_a_stacked[0].shape[0],
+    )
+    _fused_moe_lora_kernel[grid](
+        qcurr_hidden_states,
+        b_ptr,
+        a_intermediate_cache1,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        N,
+        K,
+        EM,
+        num_tokens,
+        num_experts,
+        lora_ids,
+        adapter_enabled,
+        qcurr_hidden_states.stride(0),
+        qcurr_hidden_states.stride(1),
+        w1_lora_a_stacked.stride(0),
+        w1_lora_a_stacked.stride(1),
+        w1_lora_a_stacked.stride(3),
+        w1_lora_a_stacked.stride(2),
+        a_intermediate_cache1.stride(2),
+        a_intermediate_cache1.stride(3),
+        sorted_token_ids.stride(0),
+        expert_ids.stride(0),
+        slice_a_size=qcurr_hidden_states.numel(),
+        slice_c_size=a_intermediate_cache1.numel() // num_slices,
+        num_slice_a=1,
+        num_slice_c=num_slices,
+        top_k=1 if mul_routed_weight else top_k_num,
+        MUL_ROUTED_WEIGHT=False,
+        IS_PRIMARY=True,
+        **shrink_config,
+    )
+
+
+@torch.inference_mode()
+def _fused_moe_lora_expand(
+    output: torch.Tensor,  # (num_tokens, top_k_num, N*len(lora_a_stacked),)
+    a_intermediate_cache1: torch.Tensor,  # (num_slices, M, top_k_num, max_lora_rank)
+    lora_b_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
+    topk_weights: torch.Tensor,  # (num_tokens, top_k_num)
+    sorted_token_ids: torch.Tensor,  # (max_loras, _)
+    expert_ids: torch.Tensor,  # (max_loras, _ ,)
+    num_tokens_post_padded: torch.Tensor,  # (max_loras, )
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    ## adding for kernel
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
+    max_lora_rank: int,
+    w1_output_dim_size: int,
+    block_size_m: int,
+    block_size_n: int,
+    block_size_k: int,
+    group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    b_ptr = _get_ptr(lora_b_stacked, device)
+    K = max_lora_rank
+    N = w1_output_dim_size
+
+    w1_lora_b_stacked = lora_b_stacked[0]
+
+    a_intermediate_cache1 = a_intermediate_cache1.view(
+        -1, a_intermediate_cache1.shape[3]
+    )
+
+    b_intermediate_cache1 = torch.zeros(
+        (num_slices, M, top_k_num, w1_output_dim_size),
+        dtype=output.dtype,
+        device=device,
+    )
+    use_gdc = supports_pdl(a_intermediate_cache1.device)
+    expand_config = {
+        "BLOCK_SIZE_M": block_size_m,
+        "BLOCK_SIZE_N": block_size_n,
+        "BLOCK_SIZE_K": block_size_k,
+        "GROUP_SIZE_M": group_size_m,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+        "SPLIT_K": split_k,  # Set split_k = 1 for expand calls
+        "USE_GDC": use_gdc,
+        "launch_pdl": use_gdc,  # triton kernel metadata
+    }
+
+    grid = lambda META: (
+        triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        len(lora_b_stacked),
+        lora_b_stacked[0].shape[0],
+    )
+    _fused_moe_lora_kernel[grid](
+        a_intermediate_cache1,
+        b_ptr,
+        b_intermediate_cache1,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        N,
+        K,
+        EM,
+        num_tokens,
+        num_experts,
+        lora_ids,
+        adapter_enabled,
+        a_intermediate_cache1.stride(0),
+        a_intermediate_cache1.stride(1),
+        w1_lora_b_stacked.stride(0),
+        w1_lora_b_stacked.stride(1),
+        w1_lora_b_stacked.stride(3),
+        w1_lora_b_stacked.stride(2),
+        b_intermediate_cache1.stride(2),
+        b_intermediate_cache1.stride(3),
+        sorted_token_ids.stride(0),
+        expert_ids.stride(0),
+        slice_a_size=a_intermediate_cache1.numel() // num_slices,
+        slice_c_size=b_intermediate_cache1.numel() // num_slices,
+        num_slice_a=num_slices,
+        num_slice_c=num_slices,
+        top_k=1,
+        MUL_ROUTED_WEIGHT=mul_routed_weight,
+        IS_PRIMARY=False,
+        **expand_config,
+    )
+    for i in range(num_slices):
+        output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i]
+
+
+@torch.inference_mode()
+def _fused_moe_lora(
+    output: torch.Tensor,  # (num_tokens, top_k_num, N*len(lora_a_stacked),)
+    qcurr_hidden_states: torch.Tensor,  # (num_tokens, K,)
+    lora_a_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
+    lora_b_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, N, max_lora_rank,),...]
+    topk_weights: torch.Tensor,  # (num_tokens, top_k_num)
+    sorted_token_ids: torch.Tensor,  # (max_loras, _)
+    expert_ids: torch.Tensor,  # (max_loras, _ ,)
+    num_tokens_post_padded: torch.Tensor,  # (max_loras, )
+    max_lora_rank: int,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    shrink_block_size_m: int,
+    shrink_block_size_n: int,
+    shrink_block_size_k: int,
+    shrink_group_size_m: int,
+    shrink_num_warps: int,
+    shrink_num_stages: int,
+    shrink_split_k: int,
+    expand_block_size_m: int,
+    expand_block_size_n: int,
+    expand_block_size_k: int,
+    expand_group_size_m: int,
+    expand_num_warps: int,
+    expand_num_stages: int,
+    expand_split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    assert len(lora_a_stacked) == len(lora_b_stacked) > 0
+    assert (
+        sorted_token_ids.dim()
+        == expert_ids.dim()
+        == topk_weights.dim()
+        == qcurr_hidden_states.dim()
+        == 2
+    )
+    assert (
+        sorted_token_ids.shape[0]
+        == expert_ids.shape[0]
+        == num_tokens_post_padded.shape[0]
+    )
+    assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1]
+    assert output.shape[0] == topk_weights.shape[0]
+    assert top_k_num == topk_weights.shape[1]
+    device = qcurr_hidden_states.device
+    num_slices = len(lora_a_stacked)
+    w1_lora_b_stacked = lora_b_stacked[0]
+    num_experts = lora_a_stacked[0].shape[1]
+    N = max_lora_rank
+    M = topk_weights.shape[0]
+    EM = sorted_token_ids.shape[1]
+    K = qcurr_hidden_states.shape[1]
+    num_tokens = M * top_k_num
+    w1_output_dim_size = w1_lora_b_stacked.shape[2]
+
+    a_intermediate_cache1 = torch.zeros(
+        (num_slices, M, top_k_num, max_lora_rank),
+        dtype=output.dtype,
+        device=device,
+    )
+
+    _fused_moe_lora_shrink(
+        a_intermediate_cache1,
+        qcurr_hidden_states,
+        lora_a_stacked,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        top_k_num,
+        lora_ids,
+        adapter_enabled,
+        ## adding for kernel
+        device,
+        N,
+        M,
+        EM,
+        K,
+        num_tokens,
+        num_experts,
+        num_slices,
+        shrink_block_size_m,
+        shrink_block_size_n,
+        shrink_block_size_k,
+        shrink_group_size_m,
+        shrink_num_warps,
+        shrink_num_stages,
+        shrink_split_k,
+        mul_routed_weight,
+    )
+
+    _fused_moe_lora_expand(
+        output,
+        a_intermediate_cache1,
+        lora_b_stacked,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        top_k_num,
+        lora_ids,
+        adapter_enabled,
+        ## adding for kernel
+        device,
+        N,
+        M,
+        EM,
+        K,
+        num_tokens,
+        num_experts,
+        num_slices,
+        max_lora_rank,
+        w1_output_dim_size,
+        expand_block_size_m,
+        expand_block_size_n,
+        expand_block_size_k,
+        expand_group_size_m,
+        expand_num_warps,
+        expand_num_stages,
+        expand_split_k,
+        mul_routed_weight,
+    )
+
+
+def _fused_moe_lora_fake(
+    output: torch.Tensor,
+    qcurr_hidden_states: torch.Tensor,
+    lora_a_stacked: list[torch.Tensor],
+    lora_b_stacked: list[torch.Tensor],
+    topk_weights: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    max_lora_rank: int,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    shrink_block_size_m: int,
+    shrink_block_size_n: int,
+    shrink_block_size_k: int,
+    shrink_group_size_m: int,
+    shrink_num_warps: int,
+    shrink_num_stages: int,
+    shrink_split_k: int,
+    expand_block_size_m: int,
+    expand_block_size_n: int,
+    expand_block_size_k: int,
+    expand_group_size_m: int,
+    expand_num_warps: int,
+    expand_num_stages: int,
+    expand_split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    return
+
+
+def _fused_moe_lora_shrink_fake(
+    a_intermediate_cache1: torch.Tensor,
+    qcurr_hidden_states: torch.Tensor,
+    lora_a_stacked: list[torch.Tensor],
+    topk_weights: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
+    block_size_m: int,
+    block_size_n: int,
+    block_size_k: int,
+    group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    return
+
+
+def _fused_moe_lora_expand_fake(
+    output: torch.Tensor,
+    a_intermediate_cache1: torch.Tensor,
+    lora_b_stacked: list[torch.Tensor],
+    topk_weights: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
+    max_lora_rank: int,
+    w1_output_dim_size: int,
+    block_size_m: int,
+    block_size_n: int,
+    block_size_k: int,
+    group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    return
+
+
+try:
+    direct_register_custom_op(
+        op_name="fused_moe_lora",
+        op_func=_fused_moe_lora,
+        mutates_args=["output"],
+        fake_impl=_fused_moe_lora_fake,
+    )
+
+    direct_register_custom_op(
+        op_name="fused_moe_lora_shrink",
+        op_func=_fused_moe_lora_shrink,
+        mutates_args=["a_intermediate_cache1"],
+        fake_impl=_fused_moe_lora_shrink_fake,
+    )
+
+    direct_register_custom_op(
+        op_name="fused_moe_lora_expand",
+        op_func=_fused_moe_lora_expand,
+        mutates_args=["output"],
+        fake_impl=_fused_moe_lora_expand_fake,
+    )
+
+    fused_moe_lora = torch.ops.vllm.fused_moe_lora
+    fused_moe_lora_shrink = torch.ops.vllm.fused_moe_lora_shrink
+    fused_moe_lora_expand = torch.ops.vllm.fused_moe_lora_expand
+
+except AttributeError:
+    fused_moe_lora = _fused_moe_lora
+    fused_moe_lora_shrink = _fused_moe_lora_shrink
+    fused_moe_lora_expand = _fused_moe_lora_expand
diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py
index e93064d0c83ad646403f60859a6de3e61cf19cb3..c6c2a02fdeb53a9d3e703276c03704e95ca22e01 100644
--- a/vllm/lora/ops/triton_ops/kernel_utils.py
+++ b/vllm/lora/ops/triton_ops/kernel_utils.py
@@ -3,23 +3,37 @@
 """
 Utilities for Punica kernel construction.
 """
+
 from vllm.triton_utils import tl, triton
 
 
 @triton.jit
-def mm_k(a_ptr, b_ptr, ak_stride, bk_stride, offset_k, K: tl.constexpr,
-         BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-         EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, CAST_TYPE: tl.constexpr,
-         b_dtype: tl.constexpr):
+def mm_k(
+    a_ptr,
+    b_ptr,
+    ak_stride,
+    bk_stride,
+    offset_k,
+    K: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    EVEN_K: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+    CAST_TYPE: tl.constexpr,
+    b_dtype: tl.constexpr,
+    USE_GDC: tl.constexpr,
+    base_k,
+):
     """
     Given a_ptr and b_ptr, that identify the rows of A (m x k) and columns of
     B (k x n), iterate, through the K dimension to compute the partial/complete
     matrix block product.
     If SPLIT_K == 1, the output m x n product is complete.
     If SPLIT_K > 1, the thread block computes partial outputs. The partial
-    outputs are then atomically summed in the caller code. 
+    outputs are then atomically summed in the caller code.
     Args:
-        a_ptr: Array of pointers, identifying rows of A 
+        a_ptr: Array of pointers, identifying rows of A
         b_ptr: Array of pointers, identifying columns of B
         ak_stride: K dimension stride of the A matrix
         bk_stride: K dimension stride of the B matrix
@@ -29,33 +43,67 @@ def mm_k(a_ptr, b_ptr, ak_stride, bk_stride, offset_k, K: tl.constexpr,
         BLOCK_K: K dimension atom
         EVEN_K: True if the blocks of A and B can be loaded without any
           masking.
-        SPLIT_K: Parameter signifying parallelism in the K dimension. 
+        SPLIT_K: Parameter signifying parallelism in the K dimension.
         CAST_TYPE: if True, cast the values from the A matrix to the B
           matrix dtype.
         b_dtype: datatype of the B matrix
+        USE_GDC: Whether to use PDL. True indicates use.
+        base_k: Base offset along K dimension for current SPLIT_K group
     """
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(tl.cdiv(K, BLOCK_K * SPLIT_K)):
+
+    # Step size along K for each iteration
+    STEP_K = BLOCK_K * SPLIT_K
+
+    # Total number of iterations (compile-time constant)
+    num_iters = tl.cdiv(K, STEP_K)
+
+    for k in range(num_iters):
+        # Current iteration's global K offset
+        iter_k = k * STEP_K + base_k
+
+        # Check if this iteration is completely valid (no masking needed)
+        block_end = iter_k + BLOCK_K
+
         if EVEN_K:
-            tiled_a = tl.load(a_ptr)
+            # K is divisible by BLOCK_K, no masking ever needed
+            # pre-fetch lora weight
             tiled_b = tl.load(b_ptr)
+            if USE_GDC:
+                tl.extra.cuda.gdc_wait()
+            tiled_a = tl.load(a_ptr)
+            if CAST_TYPE:
+                tiled_a = tiled_a.to(b_dtype)
+            accumulator += tl.dot(tiled_a, tiled_b)
         else:
-            tiled_a = tl.load(a_ptr,
-                              mask=offset_k[None, :]
-                              < K - k * (BLOCK_K * SPLIT_K),
-                              other=0)
-            tiled_b = tl.load(b_ptr,
-                              mask=offset_k[:, None]
-                              < K - k * (BLOCK_K * SPLIT_K),
-                              other=0)
-        if CAST_TYPE:
-            tiled_a = tiled_a.to(b_dtype)
-        accumulator += tl.dot(
-            tiled_a,
-            tiled_b,
-        )
-        a_ptr += BLOCK_K * SPLIT_K * ak_stride
-        b_ptr += BLOCK_K * SPLIT_K * bk_stride
+            # Check if we need element-wise masking
+            if iter_k >= K:
+                # Entire block out of range, skip
+                pass
+            elif block_end <= K:
+                # Entire block in range, no masking needed (fast path)
+                tiled_b = tl.load(b_ptr)
+                if USE_GDC:
+                    tl.extra.cuda.gdc_wait()
+                tiled_a = tl.load(a_ptr)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
+            else:
+                # Partial block, need masking (only last iteration)
+                k_offsets = tl.arange(0, BLOCK_K)
+                mask = iter_k + k_offsets < K
+                tiled_b = tl.load(b_ptr, mask=mask[:, None], other=0.0)
+                if USE_GDC:
+                    tl.extra.cuda.gdc_wait()
+                tiled_a = tl.load(a_ptr, mask=mask[None, :], other=0.0)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
+
+        a_ptr += STEP_K * ak_stride
+        b_ptr += STEP_K * bk_stride
+
     return accumulator
 
 
@@ -92,6 +140,7 @@ def do_expand_kernel(
     EVEN_K: tl.constexpr,
     CAST_TYPE: tl.constexpr,
     ADD_INPUTS: tl.constexpr,
+    USE_GDC: tl.constexpr,
 ):
     """
     Given an array of integers that identifies the rows of A, ram,
@@ -121,7 +170,8 @@ def do_expand_kernel(
     else:
         cur_input_ptr = input_ptr + slice_id * input_d0_stride
         cur_lora_ptr = tl.load(lora_ptr + slice_id).to(
-            tl.pointer_type(out_ptr.dtype.element_ty))
+            tl.pointer_type(out_ptr.dtype.element_ty)
+        )
 
     # Identify the column indices of B to process.
     offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
@@ -129,17 +179,38 @@ def do_expand_kernel(
 
     # Identify A and B block pointers
     offset_k = tl.arange(0, BLOCK_K)
-    a_ptr = (cur_input_ptr + ram[:, None] * input_d1_stride +
-             offset_k[None, :] * input_d2_stride)
-    b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index +
-             offset_k[:, None] * cur_lora_d2_stride +
-             rbn[None, :] * cur_lora_d1_stride)
+    a_ptr = (
+        cur_input_ptr
+        + ram[:, None] * input_d1_stride
+        + offset_k[None, :] * input_d2_stride
+    )
+    b_ptr = (
+        cur_lora_ptr
+        + cur_lora_d0_stride * lora_index
+        + offset_k[:, None] * cur_lora_d2_stride
+        + rbn[None, :] * cur_lora_d1_stride
+    )
 
     # Compute the block matrix product.
     SPLIT_K = 1
-    accumulator = mm_k(a_ptr, b_ptr, input_d2_stride, cur_lora_d2_stride,
-                       offset_k, K, BLOCK_M, BLOCK_N, BLOCK_K, EVEN_K, SPLIT_K,
-                       CAST_TYPE, cur_lora_ptr.dtype.element_ty)
+
+    accumulator = mm_k(
+        a_ptr,
+        b_ptr,
+        input_d2_stride,
+        cur_lora_d2_stride,
+        offset_k,
+        K,
+        BLOCK_M,
+        BLOCK_N,
+        BLOCK_K,
+        EVEN_K,
+        SPLIT_K,
+        CAST_TYPE,
+        cur_lora_ptr.dtype.element_ty,
+        USE_GDC,
+        base_k=0,
+    )
 
     tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty)
     if SLICE_NUM == 1:
@@ -150,10 +221,12 @@ def do_expand_kernel(
     # Identify the C output pointers to store the results of the accumulator.
     offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + cur_slice_start
     offset_cm = tl.arange(0, BLOCK_M)
-    c_ptr = (out_ptr + ram[:, None] * output_d0_stride +
-             offset_cn[None, :] * output_d1_stride)
-    c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :]
-                                             < (cur_slice_start + N))
+    c_ptr = (
+        out_ptr
+        + ram[:, None] * output_d0_stride
+        + offset_cn[None, :] * output_d1_stride
+    )
+    c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] < (cur_slice_start + N))
 
     if ADD_INPUTS:
         tiled_out = tl.load(c_ptr, mask=c_mask)
@@ -192,6 +265,7 @@ def do_shrink_kernel(
     EVEN_K: tl.constexpr,
     SPLIT_K: tl.constexpr,
     SLICE_NUM: tl.constexpr,
+    USE_GDC: tl.constexpr,
 ):
     """
     Given an array of integers that identifies the rows of A, ram,
@@ -207,7 +281,8 @@ def do_shrink_kernel(
     else:
         # current lora ptr
         cur_lora_ptr = tl.load(lora_ptr + slice_id).to(
-            tl.pointer_type(input_ptr.dtype.element_ty))
+            tl.pointer_type(input_ptr.dtype.element_ty)
+        )
 
     # Identify the column indices of B to process.
     offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
@@ -215,29 +290,51 @@ def do_shrink_kernel(
 
     # Identify A and B block pointers
     offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K)
-    a_ptr = (input_ptr + ram[:, None] * input_d0_stride +
-             offset_k[None, :] * input_d1_stride)
-    b_ptr = (cur_lora_ptr + lora_d0_stride * lora_index +
-             rbn[None, :] * lora_d1_stride +
-             offset_k[:, None] * lora_d2_stride)
+    a_ptr = (
+        input_ptr + ram[:, None] * input_d0_stride + offset_k[None, :] * input_d1_stride
+    )
+    b_ptr = (
+        cur_lora_ptr
+        + lora_d0_stride * lora_index
+        + rbn[None, :] * lora_d1_stride
+        + offset_k[:, None] * lora_d2_stride
+    )
 
     # Compute partial/complete block matrix product.
-    accumulator = mm_k(a_ptr, b_ptr, input_d1_stride, lora_d2_stride, offset_k,
-                       K, BLOCK_M, BLOCK_N, BLOCK_K, EVEN_K, SPLIT_K, False,
-                       cur_lora_ptr.dtype.element_ty)
-
+    accumulator = mm_k(
+        a_ptr,
+        b_ptr,
+        input_d1_stride,
+        lora_d2_stride,
+        offset_k,
+        K,
+        BLOCK_M,
+        BLOCK_N,
+        BLOCK_K,
+        EVEN_K,
+        SPLIT_K,
+        False,
+        cur_lora_ptr.dtype.element_ty,
+        False,  # USE_GDC is always False in shrink kernel
+        base_k=pid_sk * BLOCK_K,
+    )
+    # GDC launch dependents hints the runtime system to launch dependent kernels.
+    if USE_GDC:
+        tl.extra.cuda.gdc_launch_dependents()
     # Identify the C output pointers to store the results of the accumulator.
     offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
     offset_cm = tl.arange(0, BLOCK_M)
-    cur_out_ptr = (out_ptr if SLICE_NUM == 1 else out_ptr +
-                   slice_id * output_d0_stride)
-    c_ptr = cur_out_ptr + ram[:, None] * output_d1_stride + offset_cn[
-        None, :] * output_d2_stride
+    cur_out_ptr = out_ptr if SLICE_NUM == 1 else out_ptr + slice_id * output_d0_stride
+    c_ptr = (
+        cur_out_ptr
+        + ram[:, None] * output_d1_stride
+        + offset_cn[None, :] * output_d2_stride
+    )
     c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] < N)
-
     accumulator *= scaling
+
     # handles write-back with reduction-splitting
     if SPLIT_K == 1:
         tl.store(c_ptr, accumulator, mask=c_mask)
     else:
-        tl.atomic_add(c_ptr, accumulator, mask=c_mask)
+        tl.atomic_add(c_ptr, accumulator, mask=c_mask, sem="relaxed")
diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py
index 467cbaa8af48f0ed29699034f08cb38c20ea457b..311c4b191859741b03edc30445aaef671f650bc1 100644
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -10,42 +10,46 @@ https://arxiv.org/abs/2310.18547
 import torch
 
 from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel
-from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr
+from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
+
+from .utils import supports_pdl
 
 
 @triton.jit
 def _lora_expand_kernel(
-        input_ptr,
-        lora_ptr,
-        out_ptr,
-        M,
-        N,
-        K,
-        token_indices_sorted_by_lora_ids,
-        num_tokens_per_lora,
-        lora_token_start_loc,
-        lora_ids,
-        slice_start_loc,
-        input_d0_stride,
-        input_d1_stride,
-        input_d2_stride,  # 1
-        ls_d0_ptr,
-        ls_d1_ptr,
-        ls_d2_ptr,  # 1
-        output_d0_stride,
-        output_d1_stride,  # 1
-        output_hs_ptr,
-        BLOCK_M: tl.constexpr,
-        BLOCK_N: tl.constexpr,
-        BLOCK_K: tl.constexpr,
-        EVEN_K: tl.constexpr,
-        ADD_INPUTS: tl.constexpr,
-        CAST_TYPE: tl.constexpr,
-        SLICE_NUM: tl.constexpr,
-        SAME_STRIDE: tl.constexpr):
-
+    input_ptr,
+    lora_ptr,
+    out_ptr,
+    M,
+    N,
+    K,
+    token_indices_sorted_by_lora_ids,
+    num_tokens_per_lora,
+    lora_token_start_loc,
+    lora_ids,
+    slice_start_loc,
+    input_d0_stride,
+    input_d1_stride,
+    input_d2_stride,  # 1
+    ls_d0_ptr,
+    ls_d1_ptr,
+    ls_d2_ptr,  # 1
+    output_d0_stride,
+    output_d1_stride,  # 1
+    output_hs_ptr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    EVEN_K: tl.constexpr,
+    ADD_INPUTS: tl.constexpr,
+    CAST_TYPE: tl.constexpr,
+    SLICE_NUM: tl.constexpr,
+    SAME_STRIDE: tl.constexpr,
+    USE_GDC: tl.constexpr,
+    launch_pdl: tl.constexpr,
+):
     cta_n_num = tl.cdiv(N, BLOCK_N)
     cta_m_num = tl.cdiv(M, BLOCK_M)
 
@@ -81,8 +85,9 @@ def _lora_expand_kernel(
 
     # Identify all rows that this CTA should process.
     lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx)
-    cta_lora_seq_indices = (token_indices_sorted_by_lora_ids +
-                            lora_m_indices_start + cta_m_offset)
+    cta_lora_seq_indices = (
+        token_indices_sorted_by_lora_ids + lora_m_indices_start + cta_m_offset
+    )
 
     # Load all relevant row indices.
     offset_m = tl.arange(0, BLOCK_M) % cta_m_len
@@ -119,22 +124,22 @@ def _lora_expand_kernel(
         SLICE_NUM,
         EVEN_K,
         CAST_TYPE,
-        ADD_INPUTS)
+        ADD_INPUTS,
+        USE_GDC,
+    )
 
 
 @torch.inference_mode()
 def _lora_expand(
     inputs: torch.Tensor,  # shape [num_slices, num_tokens, lora_rank]
-    lora_b_weights: list[
-        torch.Tensor],  # shape [num_lora, hidden_size, lora_rank]
-    output_tensor: torch.
-    Tensor,  # shape [num_tokens, hidden_size * num_slices]
+    lora_b_weights: list[torch.Tensor],  # shape [num_lora, hidden_size, lora_rank]
+    output_tensor: torch.Tensor,  # shape [num_tokens, hidden_size * num_slices]
     token_lora_mapping: torch.Tensor,  # shape [num_tokens]
     token_indices_sorted_by_lora_ids: torch.Tensor,  # shape [num_tokens]
     num_tokens_per_lora: torch.Tensor,  # shape [max-loras + 1]
     lora_token_start_loc: torch.Tensor,  # shape [max-loras + 2]
     lora_ids: torch.Tensor,  # shape [max-loras + 1]
-    no_lora_flag_cpu: torch.Tensor,  # shape [1] 
+    no_lora_flag_cpu: torch.Tensor,  # shape [1]
     offset_start: int = 0,
     add_inputs: bool = False,
 ) -> None:
@@ -149,7 +154,7 @@ def _lora_expand(
         token_indices_sorted_by_lora_ids (torch.Tensor): Row/Token indices from
             the A matrix grouped by LoRA IDs.
         num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the number
-            of tokens that are to be processed by LoRA ID lora_ids[i] 
+            of tokens that are to be processed by LoRA ID lora_ids[i]
         lora_token_start_loc (torch.Tensor): A cumulative sum of
             num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
             lora_token_start_loc[i], along with num_tokens_per_lora[i]
@@ -158,9 +163,9 @@ def _lora_expand(
         lora_ids (torch.Tensor): LoRA ids to process.
         no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
             if there are any requests that require LoRA.
-        offset_start (int, optional): Offset start for output_tensor. 
+        offset_start (int, optional): Offset start for output_tensor.
             Defaults to 0.
-        add_inputs (bool, optional): Whether to add the input tensor to the 
+        add_inputs (bool, optional): Whether to add the input tensor to the
             output tensor. Defaults to False.
     """
 
@@ -179,15 +184,20 @@ def _lora_expand(
     # metadata sanity check.
     M = inputs.size(1)
     assert token_lora_mapping.size(0) == M
-    assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(
-        0)
+    assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(0)
     assert lora_ids.size(0) == num_tokens_per_lora.size(0)
     assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1
 
-    (slice_start_tensor, lora_ptr_tensor, lora_strides_d0_tensor,
-     lora_strides_d1_tensor, lora_strides_d2_tensor, hidden_sizes_tensor,
-     same_stride, MAX_N) = _get_lora_b_ptr(lora_b_weights, offset_start,
-                                           inputs.device)
+    (
+        slice_start_tensor,
+        lora_ptr_tensor,
+        lora_strides_d0_tensor,
+        lora_strides_d1_tensor,
+        lora_strides_d2_tensor,
+        hidden_sizes_tensor,
+        same_stride,
+        MAX_N,
+    ) = _get_lora_b_ptr(lora_b_weights, offset_start, inputs.device)
 
     K = lora_b_weights[0].shape[-1]  # K= rank
     ADD_INPUTS = add_inputs
@@ -196,18 +206,27 @@ def _lora_expand(
     NUM_SLICES = len(lora_b_weights)
 
     # Triton kernel configs.
-    BLOCK_M = 64
-    BLOCK_N = 128
-    BLOCK_K = 16
-    NUM_WARPS = 4
-    NUM_CTAS = 1
-    NUM_STAGES = 2
+    kernel_config = get_lora_op_configs(
+        op_type="expand",
+        max_loras=MAX_LORAS,
+        batch=M,
+        hidden_size=MAX_N,
+        rank=K,
+        num_slices=NUM_SLICES,
+        add_inputs=add_inputs,
+    )
+    BLOCK_M = kernel_config["block_m"]
+    BLOCK_N = kernel_config["block_n"]
+    BLOCK_K = kernel_config["block_k"]
+    NUM_WARPS = kernel_config["num_warps"]
+    NUM_CTAS = kernel_config["num_ctas"]
+    NUM_STAGES = kernel_config["num_stages"]
 
     EVEN_K = K % BLOCK_K == 0  # type: ignore
 
     if inputs.dtype == torch.float32 and lora_b_weights[0].dtype in [
-            torch.float16,
-            torch.bfloat16,
+        torch.float16,
+        torch.bfloat16,
     ]:
         CAST_TYPE = True
 
@@ -222,7 +241,7 @@ def _lora_expand(
         # thread blocks simply exit.
         MAX_LORAS,
     )
-
+    use_gdc = supports_pdl(inputs.device)
     _lora_expand_kernel[grid](
         inputs,
         lora_ptr_tensor,
@@ -252,9 +271,11 @@ def _lora_expand(
         CAST_TYPE,
         NUM_SLICES,
         same_stride,
+        use_gdc,
         num_warps=NUM_WARPS,
         num_ctas=NUM_CTAS,
         num_stages=NUM_STAGES,
+        launch_pdl=use_gdc,
     )
 
     return
diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
index e27604728ed06c6f34dd31a1765a50d107163d37..c3bef7680dd0dc62f6cc116819201ace64d0d118 100644
--- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
+++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
@@ -5,7 +5,6 @@ LoRA kernels metadata preparation utilities.
 """
 
 from dataclasses import dataclass
-from typing import Union
 
 import torch
 
@@ -30,39 +29,35 @@ class LoRAKernelMeta:
     no_lora_flag_cpu: torch.Tensor
 
     @staticmethod
-    def make(max_loras: int, max_num_tokens: int,
-             device: Union[torch.device, str]) -> "LoRAKernelMeta":
-
-        token_lora_mapping = torch.empty(max_num_tokens,
-                                         dtype=torch.int32,
-                                         device=device)
+    def make(
+        max_loras: int, max_num_tokens: int, device: torch.device | str
+    ) -> "LoRAKernelMeta":
+        token_lora_mapping = torch.empty(
+            max_num_tokens, dtype=torch.int32, device=device
+        )
 
-        token_indices_sorted_by_lora_ids = torch.empty(max_num_tokens,
-                                                       dtype=torch.int32,
-                                                       device=device)
+        token_indices_sorted_by_lora_ids = torch.empty(
+            max_num_tokens, dtype=torch.int32, device=device
+        )
 
         # +1 because "no-lora" is also a possibility
         # example: let max_loras be 3, active_lora_ids of [-1, 0, 2, 1]
         # is a possibility.
-        active_lora_ids = torch.empty(max_loras + 1,
-                                      dtype=torch.int32,
-                                      device=device)
+        active_lora_ids = torch.empty(max_loras + 1, dtype=torch.int32, device=device)
 
         # using running example, [3, 10, 5, 2] is a possibility.
-        num_tokens_per_lora = torch.zeros(max_loras + 1,
-                                          dtype=torch.int32,
-                                          device=device)
+        num_tokens_per_lora = torch.zeros(
+            max_loras + 1, dtype=torch.int32, device=device
+        )
 
         # +2 for this because, the first index is always 0.
         # using running example, lora_token_start_loc
         # is [0, 3, 13, 18, 20].
-        lora_token_start_loc = torch.zeros(max_loras + 2,
-                                           dtype=torch.int32,
-                                           device=device)
+        lora_token_start_loc = torch.zeros(
+            max_loras + 2, dtype=torch.int32, device=device
+        )
 
-        no_lora_flag_cpu = torch.tensor([False],
-                                        dtype=torch.bool,
-                                        device='cpu')
+        no_lora_flag_cpu = torch.tensor([False], dtype=torch.bool, device="cpu")
 
         return LoRAKernelMeta(
             token_lora_mapping=token_lora_mapping,
@@ -70,7 +65,8 @@ class LoRAKernelMeta:
             active_lora_ids=active_lora_ids,
             num_tokens_per_lora=num_tokens_per_lora,
             lora_token_start_loc=lora_token_start_loc,
-            no_lora_flag_cpu=no_lora_flag_cpu)
+            no_lora_flag_cpu=no_lora_flag_cpu,
+        )
 
     def _reset(self):
         self.active_lora_ids.fill_(-1)
@@ -100,34 +96,44 @@ class LoRAKernelMeta:
         num_tokens = token_lora_mapping.size(0)
 
         # copy token lora mapping
-        self.token_lora_mapping[:num_tokens].copy_(token_lora_mapping,
-                                                   non_blocking=True)
+        self.token_lora_mapping[:num_tokens].copy_(
+            token_lora_mapping, non_blocking=True
+        )
 
         # token_indices_sorted_by_lora_ids
-        _, token_indices_sorted_by_lora_ids = torch.sort(token_lora_mapping,
-                                                         stable=True)
+        _, token_indices_sorted_by_lora_ids = torch.sort(
+            token_lora_mapping, stable=True
+        )
         # start gpu transfer
         self.token_indices_sorted_by_lora_ids[:num_tokens].copy_(
-            token_indices_sorted_by_lora_ids, non_blocking=True)
+            token_indices_sorted_by_lora_ids, non_blocking=True
+        )
 
         # active_lora_ids, num_tokens_per_lora
-        lora_ids, num_tokens_per_lora = torch.unique(token_lora_mapping,
-                                                     sorted=True,
-                                                     return_counts=True)
-        self.active_lora_ids[:lora_ids.size(0)].copy_(lora_ids,
-                                                      non_blocking=True)
-        self.num_tokens_per_lora[:num_tokens_per_lora.size(0)].copy_(
-            num_tokens_per_lora, non_blocking=True)
+        lora_ids, num_tokens_per_lora = torch.unique(
+            token_lora_mapping, sorted=True, return_counts=True
+        )
+        self.active_lora_ids[: lora_ids.size(0)].copy_(lora_ids, non_blocking=True)
+        self.num_tokens_per_lora[: num_tokens_per_lora.size(0)].copy_(
+            num_tokens_per_lora, non_blocking=True
+        )
 
         # lora_token_start_loc
         lora_token_start_loc = torch.cumsum(num_tokens_per_lora, dim=0)
-        self.lora_token_start_loc[1:1 + lora_token_start_loc.size(0)].copy_(
-            lora_token_start_loc, non_blocking=True)
+        self.lora_token_start_loc[1 : 1 + lora_token_start_loc.size(0)].copy_(
+            lora_token_start_loc, non_blocking=True
+        )
 
     def meta_args(
         self, token_nums: int
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
-               torch.Tensor, torch.Tensor]:
+    ) -> tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+    ]:
         """
         This function returns the kernel metadata required for the current
         forward pass execution of the kernel. The function returns all the
diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py
index 57da93c226d25bca04c7080372019e0791110ca6..71bd5e3614667b626c7a9ea42c790e3f3c727ecc 100644
--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -2,37 +2,67 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
 import torch
 
 from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel
-from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr
+from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 
+from .utils import supports_pdl
 
-@triton.jit
-def _lora_shrink_kernel(input_ptr, lora_ptr, out_ptr, M, N, K,
-                        token_indices_sorted_by_lora_ids, num_tokens_per_lora,
-                        lora_token_start_loc, lora_ids, scaling,
-                        input_d0_stride, input_d1_stride, lora_d0_stride,
-                        lora_d1_stride, lora_d2_stride, output_d0_stride,
-                        output_d1_stride, output_d2_stride,
-                        BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
-                        BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr,
-                        SPLIT_K: tl.constexpr, SLICE_NUM: tl.constexpr):
 
+@triton.jit
+def _lora_shrink_kernel(
+    input_ptr,
+    lora_ptr,
+    out_ptr,
+    M,
+    N,
+    K,
+    token_indices_sorted_by_lora_ids,
+    num_tokens_per_lora,
+    lora_token_start_loc,
+    lora_ids,
+    scaling,
+    input_d0_stride,
+    input_d1_stride,
+    lora_d0_stride,
+    lora_d1_stride,
+    lora_d2_stride,
+    output_d0_stride,
+    output_d1_stride,
+    output_d2_stride,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    EVEN_K: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    SLICE_NUM: tl.constexpr,
+    USE_GDC: tl.constexpr,
+    launch_pdl: tl.constexpr,
+):
     cta_n_num = tl.cdiv(N, BLOCK_N)
     cta_m_num = tl.cdiv(M, BLOCK_M)
 
     pid_sk_m_n = tl.program_id(axis=0)
     pid_sk = pid_sk_m_n % SPLIT_K
-    pid_m = (pid_sk_m_n // SPLIT_K) % cta_m_num
-    pid_n = pid_sk_m_n // (SPLIT_K * cta_m_num) % cta_n_num
+
+    pid_m_n = pid_sk_m_n // SPLIT_K
+    num_pid_in_group = GROUP_SIZE_M * cta_n_num
+    group_id = pid_m_n // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(cta_m_num - first_pid_m, GROUP_SIZE_M)
+
+    # Column-major ordering within groups for better cache reuse
+    pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m)
+    pid_n = (pid_m_n % num_pid_in_group) // group_size_m
 
     slice_id = tl.program_id(axis=1)
     lora_idx = tl.program_id(axis=2)
@@ -54,9 +84,9 @@ def _lora_shrink_kernel(input_ptr, lora_ptr, out_ptr, M, N, K,
 
     # Identify all rows that this CTA should process.
     lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx)
-    cta_lora_seq_indices = (token_indices_sorted_by_lora_ids +
-                            lora_m_indices_start + cta_m_offset)
-
+    cta_lora_seq_indices = (
+        token_indices_sorted_by_lora_ids + lora_m_indices_start + cta_m_offset
+    )
     # Load all relevant row indices.
     offset_m = tl.arange(0, BLOCK_M) % cta_m_len
     ram = tl.load(cta_lora_seq_indices + offset_m)
@@ -90,17 +120,18 @@ def _lora_shrink_kernel(input_ptr, lora_ptr, out_ptr, M, N, K,
         BLOCK_K,
         EVEN_K,
         SPLIT_K,
-        SLICE_NUM)
+        SLICE_NUM,
+        USE_GDC,
+    )
 
 
 @torch.inference_mode()
 def _lora_shrink(
     inputs: torch.Tensor,  #  shape [num_tokens, hidden_size]
-    lora_a_weights: list[
-        torch.Tensor],  # shape [num_loras, lora_rank, hidden_size]
+    lora_a_weights: list[torch.Tensor],  # shape [num_loras, lora_rank, hidden_size]
     output_tensor: torch.Tensor,  # shape [num_slices, num_tokens, lora_rank]
     token_lora_mapping: torch.Tensor,  # shape [num_tokens]
-    token_indices_sorted_by_lora_ids: torch.Tensor,  # shape [num_tokens] 
+    token_indices_sorted_by_lora_ids: torch.Tensor,  # shape [num_tokens]
     num_tokens_per_lora: torch.Tensor,  # shape [max-loras + 1]
     lora_token_start_loc: torch.Tensor,  # shape [max-loras + 2]
     lora_ids: torch.Tensor,  # shape [max-loras + 1]
@@ -118,7 +149,7 @@ def _lora_shrink(
         token_indices_sorted_by_lora_ids (torch.Tensor): Row/Token indices from
             the A matrix grouped by LoRA IDs.
         num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the number
-            of tokens that are to be processed by LoRA ID lora_ids[i] 
+            of tokens that are to be processed by LoRA ID lora_ids[i]
         lora_token_start_loc (torch.Tensor): A cumulative sum of
             num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
             lora_token_start_loc[i], along with num_tokens_per_lora[i]
@@ -147,26 +178,36 @@ def _lora_shrink(
     # metadata sanity check
     M = inputs.size(0)
     assert token_lora_mapping.size(0) == M
-    assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(
-        0)
+    assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(0)
     assert lora_ids.size(0) == num_tokens_per_lora.size(0)
     assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1
 
-    (lora_ptr_tensor, lora_strides_d0, lora_strides_d1,
-     lora_strides_d2) = _get_lora_a_ptr(lora_a_weights, inputs.device)
+    output_tensor.zero_()
+
+    (lora_ptr_tensor, lora_strides_d0, lora_strides_d1, lora_strides_d2) = (
+        _get_lora_a_ptr(lora_a_weights, inputs.device)
+    )
     N, K = lora_a_weights[0].shape[-2:]  # K=hidden_size,N=rank
     NUM_SLICES = len(lora_a_weights)
     MAX_LORAS = lora_ids.size(0)
 
     # Triton kernel configs
-    BLOCK_M = 32
-    BLOCK_N = 16
-    BLOCK_K = 256 if M < 128 else 32
-    SPLIT_K = 64 if M < 128 else 8
-    NUM_WARPS = 4
-    NUM_CTAS = 1
-    NUM_STAGES = 2
-
+    kernel_config = get_lora_op_configs(
+        "shrink",
+        max_loras=MAX_LORAS,
+        batch=M,
+        hidden_size=K,
+        rank=N,
+        num_slices=NUM_SLICES,
+    )
+    BLOCK_M = kernel_config["block_m"]
+    BLOCK_N = kernel_config["block_n"]
+    BLOCK_K = kernel_config["block_k"]
+    SPLIT_K = kernel_config["split_k"]
+    NUM_WARPS = kernel_config["num_warps"]
+    NUM_STAGES = kernel_config["num_stages"]
+    NUM_CTAS = kernel_config["num_ctas"]
+    GROUP_SIZE_M = kernel_config.get("group_size_m", 8)
     EVEN_K = K % (BLOCK_K * SPLIT_K) == 0  # type: ignore
 
     # TODO (varun): This grid formulation maximizes parallelization at the
@@ -180,7 +221,7 @@ def _lora_shrink(
         # thread blocks exit early.
         MAX_LORAS,
     )
-
+    use_gdc = supports_pdl(inputs.device)
     _lora_shrink_kernel[grid](
         inputs,
         lora_ptr_tensor,
@@ -206,10 +247,13 @@ def _lora_shrink(
         BLOCK_K,
         EVEN_K,
         SPLIT_K,
+        GROUP_SIZE_M,
         NUM_SLICES,
+        use_gdc,
         num_warps=NUM_WARPS,
         num_ctas=NUM_CTAS,
         num_stages=NUM_STAGES,
+        launch_pdl=use_gdc,
     )
 
     return
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 4c50fbd270516ddc4de84842a7f27e9394d774e3..8ed42382e3a86a5ac6196f8361777c32527c5696 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -1,17 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import functools
+import json
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
 import torch
 
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
 _LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
 _LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
 
 
 def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: torch.device):
     """
-    `_LORA_A_PTR_DICT` collects the required information during `profile_run`, 
+    `_LORA_A_PTR_DICT` collects the required information during `profile_run`,
     After this, it remains constant and subsequent usage is through LUT.
-    Refer to: 
+    Refer to:
     https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
     """
     key = tuple(lora_weight.data_ptr() for lora_weight in lora_a_weights)
@@ -35,14 +47,15 @@ def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: torch.device):
         lora_strides_d1.append(lora_a_weight.stride(1))
         lora_strides_d2.append(lora_a_weight.stride(2))
     if len(lora_a_weights) > 1:
-        lora_ptr_tensor = torch.tensor(tensor_ptrs,
-                                       device=device,
-                                       dtype=torch.uint64)
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device, dtype=torch.uint64)
     else:
         lora_ptr_tensor = lora_a_weights[0]
 
-    if (len(set(lora_strides_d0)) > 1 or len(set(lora_strides_d1)) > 1
-            or len(set(lora_strides_d2)) > 1):
+    if (
+        len(set(lora_strides_d0)) > 1
+        or len(set(lora_strides_d1)) > 1
+        or len(set(lora_strides_d2)) > 1
+    ):
         raise ValueError("All LoRA weights must have the same stride.")
 
     _LORA_A_PTR_DICT[key] = (
@@ -54,12 +67,13 @@ def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: torch.device):
     return _LORA_A_PTR_DICT.get(key)
 
 
-def _get_lora_b_ptr(lora_weights: list[torch.Tensor], offset_start: int,
-                    device: torch.device):
-    """ 
-     `_LORA_B_PTR_DICT` collects the required information during `profile_run`, 
+def _get_lora_b_ptr(
+    lora_weights: list[torch.Tensor], offset_start: int, device: torch.device
+):
+    """
+     `_LORA_B_PTR_DICT` collects the required information during `profile_run`,
     After this, it remains constant and subsequent usage is through LUT.
-    Refer to: 
+    Refer to:
     https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
 
     """
@@ -91,20 +105,21 @@ def _get_lora_b_ptr(lora_weights: list[torch.Tensor], offset_start: int,
 
     if len(lora_weights) > 1:
         # note these are device tensors
-        lora_ptr_tensor = torch.tensor(tensor_ptrs,
-                                       device=device,
-                                       dtype=torch.uint64)
-        slice_start_tensor = torch.tensor(slice_offset_lst,
-                                          device=device,
-                                          dtype=torch.uint64)
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device, dtype=torch.uint64)
+        slice_start_tensor = torch.tensor(
+            slice_offset_lst, device=device, dtype=torch.uint64
+        )
     else:
         slice_start_tensor = slice_offset_lst[0]
         lora_ptr_tensor = lora_b_weight[0]
 
     # If each lora has the same stride, there's no need to use a
     # tensor for storage.
-    if (len(set(lora_strides_d0)) == 1 and len(set(lora_strides_d1)) == 1 and
-            len(set(lora_strides_d2)) == 1) and len(set(hidden_sizes)) == 1:
+    if (
+        len(set(lora_strides_d0)) == 1
+        and len(set(lora_strides_d1)) == 1
+        and len(set(lora_strides_d2)) == 1
+    ) and len(set(hidden_sizes)) == 1:
         lora_strides_d0_tensor = lora_strides_d0[0]
         lora_strides_d1_tensor = lora_strides_d1[0]
         lora_strides_d2_tensor = lora_strides_d2[0]
@@ -119,8 +134,162 @@ def _get_lora_b_ptr(lora_weights: list[torch.Tensor], offset_start: int,
         same_stride = False
     # MAX_N is the maximum hidden size among all the lora_b weights
     MAX_N = max(hidden_sizes)
-    _LORA_B_PTR_DICT[key] = (slice_start_tensor, lora_ptr_tensor,
-                             lora_strides_d0_tensor, lora_strides_d1_tensor,
-                             lora_strides_d2_tensor, hidden_sizes_tensor,
-                             same_stride, MAX_N)
+    _LORA_B_PTR_DICT[key] = (
+        slice_start_tensor,
+        lora_ptr_tensor,
+        lora_strides_d0_tensor,
+        lora_strides_d1_tensor,
+        lora_strides_d2_tensor,
+        hidden_sizes_tensor,
+        same_stride,
+        MAX_N,
+    )
     return _LORA_B_PTR_DICT.get(key)
+
+
+@functools.lru_cache
+def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
+    user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER
+    if user_defined_config_folder is not None:
+        gpu_name = torch.cuda.get_device_name()
+        gpu_name = gpu_name.replace(" ", "_")
+        gpu_name = gpu_name.replace("-", "_")
+
+        config_fname = None
+        # only expand op needs to consider add_inputs
+        if op_type == "expand":
+            config_fname = (
+                f"{gpu_name}_{op_type.upper()}_{str(add_inputs).upper()}.json"
+            )
+        else:
+            config_fname = f"{gpu_name}_{op_type.upper()}.json"
+
+        config_path = Path(f"{user_defined_config_folder}/{config_fname}")
+        if not config_path.exists():
+            logger.warning_once(f"No LoRA kernel configs founded in {config_path}")
+            return None
+
+        # Load json
+        logger.info_once(f"Using tuned LoRA kernel configs from {config_path}.")
+        with open(str(config_path)) as f:
+            config_data = json.load(f)
+    else:
+        config_data = None
+
+    return config_data
+
+
+@functools.lru_cache
+def get_lora_op_configs(
+    op_type: str,
+    max_loras: int,
+    batch: int,
+    hidden_size: int,
+    rank: int,
+    num_slices: int,
+    add_inputs: bool | None = None,
+    moe_intermediate_size: int | None = None,
+) -> dict[str, int | None]:
+    # Add support for fused_moe_lora ops
+    assert op_type in [
+        "shrink",
+        "expand",
+        "fused_moe_lora_w13_shrink",
+        "fused_moe_lora_w13_expand",
+        "fused_moe_lora_w2_shrink",
+        "fused_moe_lora_w2_expand",
+    ]
+
+    # default config
+    default = {}
+    if op_type == "shrink":
+        default = {
+            "block_m": 32,
+            "block_n": 16,
+            "block_k": 256 if batch < 128 else 32,
+            "split_k": 64 if batch < 128 else 8,
+            "num_warps": 4,
+            "num_ctas": 1,
+            "group_size_m": 8,
+            "num_stages": 2,
+            "max_nreg": None,
+        }
+    # The default config for fused_moe_lora ops
+    elif op_type in [
+        "fused_moe_lora_w13_shrink",
+        "fused_moe_lora_w13_expand",
+        "fused_moe_lora_w2_shrink",
+        "fused_moe_lora_w2_expand",
+    ]:
+        default = {
+            "block_m": 64,
+            "block_n": 64,
+            "block_k": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+            "group_size_m": 8,
+            "split_k": 1,
+        }
+    else:
+        default = {
+            "block_m": 64,
+            "block_n": 128,
+            "block_k": 16,
+            "num_warps": 4,
+            "num_ctas": 1,
+            "num_stages": 2,
+            "max_nreg": None,
+        }
+    m = batch
+
+    k, n = (hidden_size, rank) if op_type == "shrink" else (rank, hidden_size)
+
+    config_data: Any
+    config_data = load_lora_op_config(op_type, add_inputs)
+    if not config_data:
+        logger.warning_once("Using default LoRA kernel configs")
+        return default
+
+    # config is structured as config_data[max_loras][num_slices][m][k][n] = {}
+    # slice by max_loras
+    config_data = (
+        config_data.get(str(max_loras))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - max_loras))]
+    )
+    # slice by num_slices
+    config_data = config_data[str(num_slices)]
+    # slice by m
+    config_data = (
+        config_data.get(str(m))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - m))]
+    )
+    # slice by k
+    config_data = (
+        config_data.get(str(k))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - k))]
+    )
+    # slice by n
+    config_data = (
+        config_data.get(str(n))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - n))]
+    )
+
+    # slice by moe-intermediate-size if applicable
+    if moe_intermediate_size is not None:
+        i = moe_intermediate_size
+        config_data = (
+            config_data.get(str(i))
+            or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - i))]
+        )
+
+    assert config_data is not None
+    return config_data
+
+
+@lru_cache
+def supports_pdl(device: torch.device | None = None) -> bool:
+    """
+    Refer to: https://github.com/triton-lang/triton/blob/v3.5.0/python/tutorials/11-programmatic-dependent-launch.py
+    """
+    # PDL requires compute capability SM90 or above
+    return current_platform.is_cuda() and current_platform.has_device_capability(90)
diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py
index 7e7c3c892457a9811e1dd1f3bd04e2e9c19f8682..b5570ceca68cab53130dad43cfc9c39e89fdd936 100644
--- a/vllm/lora/ops/xla_ops/__init__.py
+++ b/vllm/lora/ops/xla_ops/__init__.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
-                                            bgmv_shrink)
+from vllm.lora.ops.xla_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
 
 __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py
index 29bfd5753a588a8c1218ee091b7dfd02f0acba05..4924890b388cbd35510b724139764f6483503b62 100644
--- a/vllm/lora/ops/xla_ops/lora_ops.py
+++ b/vllm/lora/ops/xla_ops/lora_ops.py
@@ -33,8 +33,7 @@ def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
 
 
 @impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
-def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor,
-                 idxs: torch.IntTensor):
+def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
     T, _ = inputs.shape
     if len(loras.shape) == 4:
         loras = loras.squeeze(axis=1)
@@ -73,13 +72,12 @@ def bgmv_expand(
         limit = 1
 
     if output_tensor.shape[1] > outputs.shape[1]:
-        outputs = F.pad(outputs,
-                        (0, output_tensor.shape[1] - outputs.shape[1], 0, 0))
+        outputs = F.pad(outputs, (0, output_tensor.shape[1] - outputs.shape[1], 0, 0))
 
     if add_inputs:
-        return output_tensor + outputs[:limit, :output_tensor.shape[1]]
+        return output_tensor + outputs[:limit, : output_tensor.shape[1]]
     else:
-        return outputs[:limit, :output_tensor.shape[1]]
+        return outputs[:limit, : output_tensor.shape[1]]
 
 
 def bgmv_shrink(
@@ -98,8 +96,7 @@ def bgmv_shrink(
         scaling (float, optional): Scalar multiplier applied to the output.
     """
 
-    return scaling * torch.ops.xla.bgmv(inputs, lora_b_weights,
-                                        lora_indices_tensor)
+    return scaling * torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
 
 
 def bgmv_expand_slice(
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index dc7249c38602154437ba781ee057c2e8712bec1f..975c3d8fc0a7faa8cc571459a765f8ed39e9aae0 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -7,7 +7,7 @@ import json
 import math
 import os
 from dataclasses import MISSING, dataclass, field, fields
-from typing import Literal, Optional, Union
+from typing import Literal
 
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
@@ -18,26 +18,26 @@ logger = init_logger(__name__)
 
 @dataclass
 class PEFTHelper:
-    """ 
+    """
     A helper class for PEFT configurations, specifically designed for LoRA.
-    This class handles configuration validation, compatibility checks for 
+    This class handles configuration validation, compatibility checks for
     various LoRA implementations.
     """
 
     # Required fields
     r: int
     lora_alpha: int
-    target_modules: Union[list[str], str]
+    target_modules: list[str] | str
 
-    bias: Literal["none", "all", "lora_only"] = field(default="none")
-    modules_to_save: Optional[list[str]] = field(default=None)
+    bias: Literal["none"] = field(default="none")
+    modules_to_save: list[str] | None = field(default=None)
     # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732)
     use_rslora: bool = field(default=False)
     # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353)
     use_dora: bool = field(default=False)
     # Extra vllm field, start with 'vllm_' to avoid conflict
     vllm_lora_scaling_factor: float = field(default=1.0)
-    vllm_max_position_embeddings: Optional[int] = field(default=False)
+    vllm_max_position_embeddings: int | None = field(default=False)
 
     def _validate_features(self) -> list[str]:
         """
@@ -71,37 +71,38 @@ class PEFTHelper:
         # Identify any missing required fields
         missing_fields = required_fields - set(config_dict.keys())
         if missing_fields:
-            raise ValueError(
-                f"Missing required configuration fields: {missing_fields}")
+            raise ValueError(f"Missing required configuration fields: {missing_fields}")
 
         # Filter out fields that aren't defined in the class
-        filtered_dict = {
-            k: v
-            for k, v in config_dict.items() if k in class_fields
-        }
+        filtered_dict = {k: v for k, v in config_dict.items() if k in class_fields}
         return cls(**filtered_dict)
 
     @classmethod
     def from_local_dir(
-            cls,
-            lora_path: str,
-            max_position_embeddings: Optional[int],
-            tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper":
+        cls,
+        lora_path: str,
+        max_position_embeddings: int | None,
+        tensorizer_config_dict: dict | None = None,
+    ) -> "PEFTHelper":
         lora_config_path = os.path.join(lora_path, "adapter_config.json")
 
         if tensorizer_config_dict:
             tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
             tensorizer_args = tensorizer_config._construct_tensorizer_args()
             from tensorizer.stream_io import open_stream
-            lora_config_path = os.path.join(tensorizer_config.tensorizer_dir,
-                                            "adapter_config.json")
-            with open_stream(lora_config_path,
-                             mode="rb",
-                             **tensorizer_args.stream_kwargs) as f:
+
+            lora_config_path = os.path.join(
+                tensorizer_config.tensorizer_dir, "adapter_config.json"
+            )
+            with open_stream(
+                lora_config_path, mode="rb", **tensorizer_args.stream_kwargs
+            ) as f:
                 config = json.load(f)
 
-            logger.info("Successfully deserialized LoRA config from %s",
-                        tensorizer_config.tensorizer_dir)
+            logger.info(
+                "Successfully deserialized LoRA config from %s",
+                tensorizer_config.tensorizer_dir,
+            )
 
         else:
             with open(lora_config_path) as f:
@@ -112,16 +113,16 @@ class PEFTHelper:
 
     def validate_legal(self, lora_config: LoRAConfig) -> None:
         """
-        Validates the LoRA configuration settings against application 
+        Validates the LoRA configuration settings against application
         constraints and requirements.
         """
         error_msg = self._validate_features()
         if self.r > lora_config.max_lora_rank:
             error_msg.append(
                 f"LoRA rank {self.r} is greater than max_lora_rank"
-                f" {lora_config.max_lora_rank}.")
-        if self.bias != "none" and not lora_config.bias_enabled:
-            error_msg.append(
-                "Adapter bias cannot be used without bias_enabled.")
+                f" {lora_config.max_lora_rank}."
+            )
+        if self.bias != "none":
+            error_msg.append("Adapter bias is not supported.")
         if error_msg:
             raise ValueError(f"{' '.join(error_msg)}")
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index b3413de1c8163c10b755ec6466368378c454a596..b6186e8561529efb984946263b8a77d03357a921 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -2,13 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -28,7 +28,7 @@ class PunicaWrapperABC(ABC):
     def update_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -42,12 +42,12 @@ class PunicaWrapperABC(ABC):
     @abstractmethod
     def add_shrink(
         self,
-        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
         x: torch.Tensor,
         lora_a_stacked: tuple[torch.Tensor, ...],
         scale: float,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Performs GEMM  for multiple slices of lora_a.
         """
@@ -58,16 +58,15 @@ class PunicaWrapperABC(ABC):
     def add_expand(
         self,
         y: torch.Tensor,
-        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
         lora_b_stacked: tuple[torch.Tensor, ...],
-        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
         output_slices: tuple[int, ...],
         offset_start: int = 0,
         add_inputs=True,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
-        Performs GEMM and bias addition for multiple slices of lora_b.
+        Performs GEMM for multiple slices of lora_b.
         """
         raise NotImplementedError
 
@@ -79,41 +78,44 @@ class PunicaWrapperABC(ABC):
         lora_b_stacked: torch.Tensor,
         add_inputs: bool = True,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
-        Applies lora  specifically for VocabParallelEmbeddingWithLoRA, 
+        Applies lora  specifically for VocabParallelEmbeddingWithLoRA,
         and this layer only requires the expand operation.
         """
         raise NotImplementedError
 
     @abstractmethod
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
-                        **kwargs) -> Optional[torch.Tensor]:
+    def add_lora_linear(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        output_slices: tuple[int, ...],
+        *,
+        buffer: tuple[torch.Tensor, ...] | None = None,
+        **kwargs,
+    ) -> torch.Tensor | None:
         """
-        Applicable to linear-related lora. 
+        Applicable to linear-related lora.
         """
 
         raise NotImplementedError
 
     @abstractmethod
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> Optional[torch.Tensor]:
+    def add_lora_logits(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        scale,
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
         """
@@ -122,41 +124,41 @@ class PunicaWrapperABC(ABC):
 
 class PunicaWrapperBase(PunicaWrapperABC):
     """
-    PunicaWrapperBase is designed to manage and provide metadata for the punica 
-    kernel. The main function is to maintain the state information for 
+    PunicaWrapperBase is designed to manage and provide metadata for the punica
+    kernel. The main function is to maintain the state information for
     Multi-LoRA, and to provide the interface for the punica.
     """
 
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str], **kwargs):
-        self._token_lora_indices = torch.empty(max_num_batched_tokens,
-                                               dtype=torch.long,
-                                               device=device)
-        self._sampler_indices = torch.empty(max_num_batched_tokens,
-                                            dtype=torch.long,
-                                            device=device)
-        self._sampler_indices_padded = torch.empty(max_num_batched_tokens,
-                                                   dtype=torch.long,
-                                                   device=device)
-        self._embeddings_indices = torch.empty(2,
-                                               max_num_batched_tokens,
-                                               dtype=torch.long,
-                                               device=device)
+    def __init__(
+        self,
+        max_num_batched_tokens: int,
+        max_batches: int,
+        device: torch.device | str,
+        **kwargs,
+    ):
+        self._token_lora_indices = torch.empty(
+            max_num_batched_tokens, dtype=torch.long, device=device
+        )
+        self._sampler_indices = torch.empty(
+            max_num_batched_tokens, dtype=torch.long, device=device
+        )
+        self._sampler_indices_padded = torch.empty(
+            max_num_batched_tokens, dtype=torch.long, device=device
+        )
+        self._embeddings_indices = torch.empty(
+            2, max_num_batched_tokens, dtype=torch.long, device=device
+        )
 
         # 4 is the number of indices tensors.
         # base_indices, sampler_indices, sampler_indices_padded,
         # embeddings_indices
-        self.indices_len: list[Optional[int]] = [None] * 4
+        self.indices_len: list[int | None] = [None] * 4
         # these attributes are the information required for sgmv kernel
-        self._seq_start_locs = torch.empty(max_batches,
-                                           dtype=torch.long,
-                                           device=device)
-        self._seq_lengths = torch.empty(max_batches,
-                                        dtype=torch.long,
-                                        device=device)
-        self._lora_indices_per_batch = torch.empty(max_batches,
-                                                   dtype=torch.long,
-                                                   device=device)
+        self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, device=device)
+        self._seq_lengths = torch.empty(max_batches, dtype=torch.long, device=device)
+        self._lora_indices_per_batch = torch.empty(
+            max_batches, dtype=torch.long, device=device
+        )
         self.device: torch.device = device
         self.max_length: int = 0
         self.token_nums: int = 0
@@ -167,7 +169,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
     def _update_base_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -186,89 +188,66 @@ class PunicaWrapperBase(PunicaWrapperABC):
             extra_vocab_size,
             self.device,
         )
-        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
-        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
-        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
-            sampler_indices_padded)
-        self._embeddings_indices[:embeddings_indices.
-                                 shape[0], :embeddings_indices.shape[1]].copy_(
-                                     embeddings_indices)
+        self._token_lora_indices[: base_indices.shape[0]].copy_(base_indices)
+        self._sampler_indices[: sampler_indices.shape[0]].copy_(sampler_indices)
+        self._sampler_indices_padded[: sampler_indices_padded.shape[0]].copy_(
+            sampler_indices_padded
+        )
+        self._embeddings_indices[
+            : embeddings_indices.shape[0], : embeddings_indices.shape[1]
+        ].copy_(embeddings_indices)
 
         self.indices_len[:] = indices_len
 
-    def _update_prefill_metadata(self,
-                                 token_lora_tensor: torch.Tensor) -> None:
-
-        (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
-         batch_size, max_length, token_nums,
-         no_lora) = compute_meta(token_lora_tensor)
-
-        self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_(
-            b_seq_start_tensor)
-        self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor)
-        self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_(
-            lora_indices_tensor)
+    def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None:
+        (
+            b_seq_start_tensor,
+            seq_length_tensor,
+            lora_indices_tensor,
+            batch_size,
+            max_length,
+            token_nums,
+            no_lora,
+        ) = compute_meta(token_lora_tensor)
+
+        self._seq_start_locs[: b_seq_start_tensor.shape[0]].copy_(b_seq_start_tensor)
+        self._seq_lengths[: seq_length_tensor.shape[0]].copy_(seq_length_tensor)
+        self._lora_indices_per_batch[: lora_indices_tensor.shape[0]].copy_(
+            lora_indices_tensor
+        )
         self.batch_size = batch_size
         self.max_length = max_length
         self.token_nums = token_nums
         self.no_lora = no_lora
 
-    def _apply_bias(
-        self,
-        indices: torch.Tensor,
-        output: torch.Tensor,
-        output_slices: tuple[int, ...],
-        lora_bias_stacked: tuple[Optional[torch.Tensor], ...],
-    ):
-        """Applies bias to output
-
-        Input shapes:
-            lora_bias_stacked:      3 element tuple of (num_loras, output_dim)
-            indices:           (batch_size)
-            output:            (batch_size, q_slice_size + 2*kv_slice_size)
-            output_slices:     n-1 element tuple of (slice_size...),
-                            where n is number of slices
-        """
-        org_output = output
-        output = output.view(-1, output.shape[-1])
-        indices = indices.view(-1)
-
-        offset_left = 0
-        for slice_idx, slice in enumerate(output_slices):
-            bias = lora_bias_stacked[slice_idx]
-            if bias is not None:
-                bias = bias.view(-1, bias.shape[-1])
-                bias = bias[indices]
-                bias[indices == -1] = 0
-                output[:, offset_left:offset_left + slice] += bias
-            offset_left += slice
-
-        return output.view_as(org_output)
-
     @property
     def prefill_metadata(
-        self
+        self,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]:
         """
-        This property provides a convenient way to access the necessary 
+        This property provides a convenient way to access the necessary
         metadata for prefill-related  kernel computations.
             1. seq_start_locs: Tensor of sequence start positions.
             2. seq_lengths: Tensor of sequence lengths.
-            3. lora_indices_per_batch: Tensor of lora indices, and an index of 
+            3. lora_indices_per_batch: Tensor of lora indices, and an index of
                 -1 means no lora should be applied.
             4. batch_size: Batch size after clustering identical lora indices.
             5. max_length: The maximum sequence length in the batch.
             6. token_nums: The token numbers in the batch.
         """
-        return (self._seq_start_locs[:self.batch_size],
-                self._seq_lengths[:self.batch_size],
-                self._lora_indices_per_batch[:self.batch_size],
-                self.batch_size, self.max_length, self.token_nums)
+        return (
+            self._seq_start_locs[: self.batch_size],
+            self._seq_lengths[: self.batch_size],
+            self._lora_indices_per_batch[: self.batch_size],
+            self.batch_size,
+            self.max_length,
+            self.token_nums,
+        )
 
     @property
     def token_lora_indices(self) -> torch.Tensor:
         """
-        This property provides the lora indices corresponding to each token 
+        This property provides the lora indices corresponding to each token
         in the batch. An index of -1 means no lora should be applied.
         """
         token_lora_len = self.indices_len[0]
@@ -276,8 +255,8 @@ class PunicaWrapperBase(PunicaWrapperABC):
 
     @property
     def sampler_indices(self) -> torch.Tensor:
-        """ 
-        This property is used to access the lora indices specifically for 
+        """
+        This property is used to access the lora indices specifically for
         LogitsProcessorWithLoRA.
         """
         sampler_indices_len = self.indices_len[1]
@@ -294,18 +273,24 @@ class PunicaWrapperBase(PunicaWrapperABC):
     @property
     def embeddings_indices(self) -> torch.Tensor:
         """
-        This property provides access to the indices used for lora embeddings, 
+        This property provides access to the indices used for lora embeddings,
         specifically for VocabParallelEmbeddingWithLoRA.
         """
         embeddings_indices_len = self.indices_len[3]
         return self._embeddings_indices[:, :embeddings_indices_len]
 
-    def update_metadata(self, mapping: "LoRAMapping",
-                        lora_index_to_id: list[Optional[int]], max_loras: int,
-                        vocab_size: int, extra_vocab_size: int, **kwargs):
-
-        self._update_base_metadata(mapping, lora_index_to_id, max_loras,
-                                   vocab_size, extra_vocab_size)
+    def update_metadata(
+        self,
+        mapping: "LoRAMapping",
+        lora_index_to_id: list[int | None],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        **kwargs,
+    ):
+        self._update_base_metadata(
+            mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size
+        )
 
         if mapping.is_prefill:
             # Update metadata required for prefill-related operators.
@@ -315,16 +300,21 @@ class PunicaWrapperBase(PunicaWrapperABC):
             self.is_prefill = False
 
     @abstractmethod
-    def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor],
-                   x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...],
-                   scale: float, **kwargs) -> Optional[torch.Tensor]:
+    def add_shrink(
+        self,
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ) -> torch.Tensor | None:
         """
         Performs GEMM  for multiple slices of lora_a.
 
         Semantics:
         for i in range(len(lora_a_stacked)):
             y[i] += (x @ lora_a_stacked[i]) * scale
-        
+
         Args:
             y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
             x (torch.Tensor): Input tensor
@@ -336,32 +326,30 @@ class PunicaWrapperBase(PunicaWrapperABC):
         raise NotImplementedError
 
     @abstractmethod
-    def add_expand(self,
-                   y: torch.Tensor,
-                   x: Union[tuple[torch.Tensor, ...], torch.Tensor],
-                   lora_b_stacked: tuple[torch.Tensor, ...],
-                   lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                   output_slices: tuple[int, ...],
-                   offset_start: int = 0,
-                   add_inputs=True,
-                   **kwargs) -> Optional[torch.Tensor]:
-        """
-        Performs GEMM and bias addition for multiple slices of lora_b.
-      
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        output_slices: tuple[int, ...],
+        offset_start: int = 0,
+        add_inputs=True,
+        **kwargs,
+    ) -> torch.Tensor | None:
+        """
+        Performs GEMM for multiple slices of lora_b.
+
         Semantics:
             offset = offset_start
             for i in range(len(lora_b_stacked)):
                 slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
-                    lora_bias_stacked[i] 
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                 offset += slice
-            
+
         Args:
             y (torch.Tensor): Output tensor.
             x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): 
-                bias's weight
             output_slices (tuple[int, ...]): Every slice's size
             offset_start (int): The starting position of y, defaults to 0
             add_inputs (bool):  Defaults to True.
@@ -371,12 +359,14 @@ class PunicaWrapperBase(PunicaWrapperABC):
         raise NotImplementedError
 
     @abstractmethod
-    def add_lora_embedding(self,
-                           y: torch.Tensor,
-                           x: torch.Tensor,
-                           lora_b_stacked: torch.Tensor,
-                           add_inputs: bool = True,
-                           **kwargs) -> Optional[torch.Tensor]:
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        add_inputs: bool = True,
+        **kwargs,
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
         and this layer only requires the expand operation.
@@ -393,19 +383,20 @@ class PunicaWrapperBase(PunicaWrapperABC):
         raise NotImplementedError
 
     @abstractmethod
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
-                        **kwargs) -> Optional[torch.Tensor]:
-        """
-        Applicable to linear-related lora. 
+    def add_lora_linear(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        output_slices: tuple[int, ...],
+        *,
+        buffer: tuple[torch.Tensor, ...] | None = None,
+        **kwargs,
+    ) -> torch.Tensor | None:
+        """
+        Applicable to linear-related lora.
 
         Semantics:
             for i in range(len(lora_a_stacked)):
@@ -414,14 +405,13 @@ class PunicaWrapperBase(PunicaWrapperABC):
                     @ lora_a_stacked[indices[i], layer_idx, :, :]
                     @ lora_b_stacked[indices[i], layer_idx, :, :]
                     * scale
-                    ).squeeze(0)+lora_bias_stacked[i]
+                    ).squeeze(0)
 
         Args:
             y (torch.Tensor): Output tensor. Will be changed in-place.
             x (torch.Tensor): Input tensor
             lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias.
             scale (float): Scaling factor.
             output_slices (tuple[int, ...]): Every slice's size.
             buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
@@ -430,18 +420,20 @@ class PunicaWrapperBase(PunicaWrapperABC):
         raise NotImplementedError
 
     @abstractmethod
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> Optional[torch.Tensor]:
+    def add_lora_logits(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        scale,
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
-        
+
         Semantics:
             buffer = (x @ lora_a_stacked) * scale
             y += buffer @ lora_b_stacked
@@ -456,3 +448,45 @@ class PunicaWrapperBase(PunicaWrapperABC):
         """
         # TODO: implement it based on torch ops
         raise NotImplementedError
+
+    def moe_lora_align_block_size(
+        self,
+        topk_ids: torch.Tensor,
+        num_tokens: int,
+        block_size: int,
+        num_experts: int,
+        max_loras: int,
+        adapter_enabled: torch.Tensor,
+        expert_map: torch.Tensor | None = None,
+        pad_sorted_ids: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Aligns tokens and experts into block-sized chunks for LoRA-based
+        mixture-of-experts (MoE) execution.
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
+
+    def add_lora_fused_moe(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: list[torch.Tensor],
+        lora_b_stacked: list[torch.Tensor],
+        topk_weights: torch.Tensor,
+        sorted_token_ids: torch.Tensor,
+        expert_ids: torch.Tensor,
+        num_tokens_post_padded: torch.Tensor,
+        max_lora_rank: int,
+        top_k_num: int,
+        shrink_config,
+        expand_config,
+        adapter_enabled: torch.Tensor,
+        mul_routed_weight=False,
+    ):
+        """
+        Performs a fused forward computation for LoRA of
+        Mixture-of-Experts (MoE) layer.
+        """
+        # TODO: implement it based on torch ops
+        raise NotImplementedError
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
index 59049cccc8cbeb46bf1fa272bf1ce9bd0471d611..1a700d9bf1f06e2419b584d09a9a95469a9db052 100644
--- a/vllm/lora/punica_wrapper/punica_cpu.py
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -1,13 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 
-from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-                                     bgmv_shrink, sgmv_expand,
-                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.torch_ops import (
+    bgmv_expand,
+    bgmv_expand_slice,
+    bgmv_shrink,
+    sgmv_expand,
+    sgmv_expand_slice,
+    sgmv_shrink,
+)
 
 from .punica_base import PunicaWrapperBase
 
@@ -16,15 +21,19 @@ from .punica_base import PunicaWrapperBase
 # inherit this class
 class PunicaWrapperCPU(PunicaWrapperBase):
     """
-    PunicaWrapperCPU is designed to manage and provide metadata for the punica 
-    kernel. The main function is to maintain the state information for 
+    PunicaWrapperCPU is designed to manage and provide metadata for the punica
+    kernel. The main function is to maintain the state information for
     Multi-LoRA, and to provide the interface for the pytorch punica ops.
     """
 
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str], **kwargs):
-        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
-                                   device)
+    def __init__(
+        self,
+        max_num_batched_tokens: int,
+        max_batches: int,
+        device: torch.device | str,
+        **kwargs,
+    ):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
 
     def _shrink_prefill(
         self,
@@ -33,7 +42,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         w_t_all: torch.Tensor,
         scale: float,
     ):
-        #No LoRA request, so return directly
+        # No LoRA request, so return directly
         if self.no_lora:
             return
         sgmv_shrink(
@@ -60,7 +69,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         w_t_all: torch.Tensor,
         add_inputs: bool,
     ):
-        #No LoRA request, so return directly
+        # No LoRA request, so return directly
         if self.no_lora:
             return
         sgmv_expand(
@@ -89,7 +98,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         y_slice_size: int,
         add_inputs: bool,
     ):
-        #No LoRA request, so return directly
+        # No LoRA request, so return directly
         if self.no_lora:
             return
         sgmv_expand_slice(
@@ -111,8 +120,9 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         y_slice_size: int,
         add_inputs: bool,
     ):
-        bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
-                          y_slice_size, add_inputs)
+        bgmv_expand_slice(
+            x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_inputs
+        )
 
     def _apply_expand(
         self,
@@ -124,18 +134,19 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         add_inputs: bool = True,
     ):
         """
-        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` 
+        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all`
         computation, which is suitable for the
         GEMM of lora'b.
         """
 
-        expand_slice_fun: Callable = (self._expand_slice_prefill
-                                      if self.is_prefill else
-                                      self._expand_slice_decode)
+        expand_slice_fun: Callable = (
+            self._expand_slice_prefill if self.is_prefill else self._expand_slice_decode
+        )
         expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs)
 
-    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
-                      w_t_all: torch.Tensor, scale: float):
+    def _apply_shrink(
+        self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, scale: float
+    ):
         """
         Perform the ` y+=x@w_t_all` computation, which is suitable for the
         GEMM of lora'a.
@@ -146,25 +157,31 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
-        shrink_fun: Callable = (self._shrink_prefill
-                                if self.is_prefill else self._shrink_decode)
+        shrink_fun: Callable = (
+            self._shrink_prefill if self.is_prefill else self._shrink_decode
+        )
         shrink_fun(y, x, w_t_all, scale)
         y = y.view_as(y_org)
 
-    def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor],
-                   x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...],
-                   scale: float, **kwargs):
+    def add_shrink(
+        self,
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ):
         """
         Performs GEMM  for multiple slices of lora_a.
         When `is_prefill is` true, it indicates that it is currently the
         prefill stage, and the `_shrink_prefill` function should be called.
         Otherwise, it is the decode stage, and the _shrink_decode function
         should be called.
-            
+
         Semantics:
         for i in range(len(lora_a_stacked)):
             y[i] += (x @ lora_a_stacked[i]) * scale
-        
+
         Args:
             y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
             x (torch.Tensor): Input tensor
@@ -175,43 +192,37 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         x = x.view(-1, x.shape[-1])
         # TODO fuse these kernels
         for slice_idx in range(len(lora_a_stacked)):
-            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
-                               scale)
-
-    def add_expand(self,
-                   y: torch.Tensor,
-                   x: Union[tuple[torch.Tensor, ...], torch.Tensor],
-                   lora_b_stacked: tuple[torch.Tensor, ...],
-                   lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                   output_slices: tuple[int, ...],
-                   offset_start: int = 0,
-                   add_inputs=True,
-                   **kwargs) -> None:
+            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale)
+
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        output_slices: tuple[int, ...],
+        offset_start: int = 0,
+        add_inputs=True,
+        **kwargs,
+    ) -> None:
         """
-        Performs GEMM and bias addition for multiple slices of lora_b.
-      
+        Performs GEMM for multiple slices of lora_b.
+
         Semantics:
             for i in range(len(lora_b_stacked)):
                 slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
-                    lora_bias_stacked[i] 
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                 offset += slice
-            
+
         Args:
             y (torch.Tensor): Output tensor.
             x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): 
-                bias's weight
             output_slices (tuple[int, ...]): Every slice's size
             add_inputs (bool):  Defaults to True.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
         offset_left = offset_start
-        if lora_bias_stacked is not None:
-            self._apply_bias(self.token_lora_indices, y, output_slices,
-                             lora_bias_stacked)
         for slice_idx in range(len(lora_b_stacked)):
             self._apply_expand(
                 y,
@@ -224,12 +235,14 @@ class PunicaWrapperCPU(PunicaWrapperBase):
             offset_left += output_slices[slice_idx]
         y = y.view_as(y_org)
 
-    def add_lora_embedding(self,
-                           y: torch.Tensor,
-                           x: torch.Tensor,
-                           lora_b_stacked: torch.Tensor,
-                           add_inputs: bool = True,
-                           **kwargs) -> None:
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        add_inputs: bool = True,
+        **kwargs,
+    ) -> None:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
 
@@ -244,23 +257,25 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         """
 
         # Embedding layer only need expand op
-        expand_fun: Callable = (self._expand_prefill
-                                if self.is_prefill else self._expand_decode)
+        expand_fun: Callable = (
+            self._expand_prefill if self.is_prefill else self._expand_decode
+        )
         expand_fun(y, x, lora_b_stacked, add_inputs)
 
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
-                        **kwargs) -> None:
+    def add_lora_linear(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        output_slices: tuple[int, ...],
+        *,
+        buffer: tuple[torch.Tensor, ...] | None = None,
+        **kwargs,
+    ) -> None:
         """
-        Applicable to linear-related lora. 
+        Applicable to linear-related lora.
 
         Semantics:
             for i in range(len(lora_a_stacked)):
@@ -269,54 +284,47 @@ class PunicaWrapperCPU(PunicaWrapperBase):
                     @ lora_a_stacked[indices[i], layer_idx, :, :]
                     @ lora_b_stacked[indices[i], layer_idx, :, :]
                     * scale
-                    ).squeeze(0)+lora_bias_stacked[i]
+                    ).squeeze(0)
 
         Args:
             y (torch.Tensor): Output tensor. Will be changed in-place.
             x (torch.Tensor): Input tensor
             lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias.
             scale (float): Scaling factor.
             output_slices (tuple[int, ...]): Every slice's size.
             buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
         """
 
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
-        if lora_bias_stacked is not None:
-            assert len(lora_bias_stacked) == len(output_slices)
-            y = self._apply_bias(self.token_lora_indices, y, output_slices,
-                                 lora_bias_stacked)
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
             # We set the buffer to be float32 by default, consistent with the
             # triton op
             buffer = tuple(
-                torch.zeros(
-                    (x.size(0), r), dtype=torch.float32, device=x.device)
-                for _ in range(len(output_slices)))
+                torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device)
+                for _ in range(len(output_slices))
+            )
         self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
-        self.add_expand(y,
-                        buffer,
-                        lora_b_stacked,
-                        None,
-                        output_slices,
-                        add_inputs=True,
-                        **kwargs)
-
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> None:
+        self.add_expand(
+            y, buffer, lora_b_stacked, output_slices, add_inputs=True, **kwargs
+        )
+
+    def add_lora_logits(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        scale,
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
-        
+
         Semantics:
             buffer = (x @ lora_a_stacked) * scale
             y += buffer @ lora_b_stacked
@@ -336,14 +344,8 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         if buffer is None:
             # We set the buffer to be float32 by default, consistent with the
             # triton op
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
+            buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device)
         # LogitsProcessorWithLoRA always using bgmv.
         bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale)
-        bgmv_expand(buffer,
-                    lora_b_stacked,
-                    y,
-                    self.sampler_indices,
-                    add_inputs=True)
+        bgmv_expand(buffer, lora_b_stacked, y, self.sampler_indices, add_inputs=True)
         y = y.view_as(y_org)
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 467f50050eb2143544f05a2bf561f2648c968230..ede50a48af9858b6b21b8aa9b15e94166f4eb6a9 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -2,21 +2,28 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
-from typing import Optional, Union, final
+from typing import final
 
 import torch
 
 from vllm.lora.layers import LoRAMapping
-from vllm.triton_utils import HAS_TRITON
+from vllm.triton_utils import HAS_TRITON, triton
+from vllm.utils.math_utils import round_up
 
 if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
-                                          lora_shrink)
+    from vllm.lora.ops.triton_ops import (
+        LoRAKernelMeta,
+        fused_moe_lora,
+        lora_expand,
+        lora_shrink,
+    )
+
+from vllm import _custom_ops as ops
 
 from .punica_base import PunicaWrapperBase
 
@@ -24,48 +31,67 @@ from .punica_base import PunicaWrapperBase
 @final
 class PunicaWrapperGPU(PunicaWrapperBase):
     """
-    PunicaWrapperGPU is designed to manage and provide metadata for the punica 
-    kernel. The main function is to maintain the state information for 
+    PunicaWrapperGPU is designed to manage and provide metadata for the punica
+    kernel. The main function is to maintain the state information for
     Multi-LoRA, and to provide the interface for the punica triton kernel.
     """
 
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str], **kwargs):
-        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
-                                   device)
-
-        self.max_loras = kwargs['max_loras']
+    def __init__(
+        self,
+        max_num_batched_tokens: int,
+        max_batches: int,
+        device: torch.device | str,
+        **kwargs,
+    ):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
 
-        self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                      max_num_batched_tokens,
-                                                      device=device)
+        self.max_loras = kwargs["max_loras"]
 
-        self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_batches,
-                                                       device=device)
+        self.token_mapping_meta = LoRAKernelMeta.make(
+            self.max_loras, max_num_batched_tokens, device=device
+        )
 
-    def update_metadata(self, mapping: LoRAMapping,
-                        lora_index_to_id: list[Optional[int]], max_loras: int,
-                        vocab_size: int, extra_vocab_size: int, **kwargs):
+        # When speculative decoding is enabled, max_num_samples is
+        # max_batches * (num_speculative_decoding_tokens + 1).
+        # This line can be optimized by replacing max_num_batched_tokens
+        # to  max_batches * (num_speculative_decoding_tokens + 1).
+        self.prompt_mapping_meta = LoRAKernelMeta.make(
+            self.max_loras, max_num_batched_tokens, device=device
+        )
 
+    def update_metadata(
+        self,
+        mapping: LoRAMapping,
+        lora_index_to_id: list[int | None],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        **kwargs,
+    ):
         self.is_prefill = mapping.is_prefill
-        self._update_base_metadata(mapping, lora_index_to_id, max_loras,
-                                   vocab_size, extra_vocab_size)
+        self._update_base_metadata(
+            mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size
+        )
 
         # Prepare cuda kernel metadata tensors
         self.token_mapping_meta.prepare_tensors(self.token_lora_indices)
         self.prompt_mapping_meta.prepare_tensors(self.sampler_indices)
 
-    def add_shrink(self, y: torch.Tensor, x: torch.Tensor,
-                   lora_a_stacked: tuple[torch.Tensor,
-                                         ...], scale: float, **kwargs):
+    def add_shrink(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ):
         """
         Performs GEMM  for multiple slices of lora_a.
-            
+
         Semantics:
         for i in range(len(lora_a_stacked)):
             y[i] += (x @ lora_a_stacked[i]) * scale
-        
+
         Args:
             y (torch.Tensor): Output tensors
             x (torch.Tensor): Input tensor
@@ -82,41 +108,34 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             scale,
         )
 
-    def add_expand(self,
-                   y: torch.Tensor,
-                   x: torch.Tensor,
-                   lora_b_stacked: tuple[torch.Tensor, ...],
-                   lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                   output_slices: tuple[int, ...],
-                   offset_start: int = 0,
-                   add_inputs=True,
-                   **kwargs) -> None:
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        output_slices: tuple[int, ...],
+        offset_start: int = 0,
+        add_inputs=True,
+        **kwargs,
+    ) -> None:
         """
-        Performs GEMM and bias addition for multiple slices of lora_b.
-      
+        Performs GEMM for multiple slices of lora_b.
+
         Semantics:
             for i in range(len(lora_b_stacked)):
                 slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
-                    lora_bias_stacked[i] 
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                 offset += slice
-            
+
         Args:
             y (torch.Tensor): Output tensor.
             x (torch.Tensor): Input tensors
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): 
-                bias's weight
             output_slices (tuple[int, ...]): Every slice's size
             add_inputs (bool): Defaults to True.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
-        if lora_bias_stacked is not None:
-            token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0,
-                                              y.size(0))
-            self._apply_bias(token_lora_indices, y, output_slices,
-                             lora_bias_stacked)
 
         assert x.ndim == 3
         assert x.size(0) == len(output_slices)
@@ -133,12 +152,14 @@ class PunicaWrapperGPU(PunicaWrapperBase):
 
         y = y.view_as(y_org)
 
-    def add_lora_embedding(self,
-                           y: torch.Tensor,
-                           x: torch.Tensor,
-                           lora_b_stacked: torch.Tensor,
-                           add_inputs: bool = True,
-                           **kwargs) -> None:
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        add_inputs: bool = True,
+        **kwargs,
+    ) -> None:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
 
@@ -154,26 +175,27 @@ class PunicaWrapperGPU(PunicaWrapperBase):
 
         lora_expand(
             x.unsqueeze(dim=0),
-            (lora_b_stacked, ),
+            (lora_b_stacked,),
             y,
             *self.token_mapping_meta.meta_args(x.size(0)),
             offset_start=0,
             add_inputs=add_inputs,
         )
 
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> None:
+    def add_lora_linear(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        output_slices: tuple[int, ...],
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> None:
         """
-        Applicable to linear-related lora. 
+        Applicable to linear-related lora.
 
         Semantics:
             for i in range(len(lora_a_stacked)):
@@ -182,63 +204,61 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                     @ lora_a_stacked[indices[i], layer_idx, :, :]
                     @ lora_b_stacked[indices[i], layer_idx, :, :]
                     * scale
-                    ).squeeze(0)+lora_bias_stacked[i]
-
+                    ).squeeze(0)
         Args:
             y (torch.Tensor): Output tensor. Will be changed in-place.
             x (torch.Tensor): Input tensor
             lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias.
             scale (float): Scaling factor.
             output_slices (tuple[int, ...]): Every slice's size.
             buffer (Optional[torch.Tensor]): Defaults to None.
         """
 
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
-        if lora_bias_stacked is not None:
-            assert len(lora_bias_stacked) == len(output_slices)
-            token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0,
-                                              y.size(0))
-            y = self._apply_bias(token_lora_indices, y, output_slices,
-                                 lora_bias_stacked)
-
-        if buffer is None:
-            r = lora_b_stacked[0].size(-1)
-            # We set the buffer to be float32 by default, refer to:
-            # https://github.com/triton-lang/triton/issues/1387
-            buffer = torch.zeros(  # type: ignore
-                (len(output_slices), x.size(0), r),
-                dtype=torch.float32,
-                device=x.device,
-            )
+
+        assert buffer is None, (
+            "To minimize overhead, the buffer should be created by "
+            ".add_lora_linear() instead of being passed in."
+        )
+        r = lora_b_stacked[0].size(-1)
+        # We set the buffer to be float32 by default, refer to:
+        # https://github.com/triton-lang/triton/issues/1387
+        # Note: buffer is zeroed inside the shrink op
+        buffer = torch.empty(
+            (len(output_slices), x.size(0), r), dtype=torch.float32, device=x.device
+        )
+
         self.add_shrink(
             buffer,  # type: ignore
             x,
             lora_a_stacked,
             scale,
-            **kwargs)
+            **kwargs,
+        )
         self.add_expand(
             y,
             buffer,  # type: ignore
             lora_b_stacked,
-            None,
             output_slices,
             add_inputs=True,
-            **kwargs)
-
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> None:
+            **kwargs,
+        )
+
+    def add_lora_logits(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        scale,
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
-        
+
         Semantics:
             buffer = (x @ lora_a_stacked) * scale
             y += buffer @ lora_b_stacked
@@ -255,18 +275,137 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         y = y.view(-1, y.shape[-1])
         x = x.view(-1, x.shape[-1])
         r = lora_b_stacked.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default, refer to:
-            # https://github.com/triton-lang/triton/issues/1387
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-
-        lora_shrink(x, [lora_a_stacked], buffer.unsqueeze(dim=0),
-                    *self.prompt_mapping_meta.meta_args(x.size(0)), scale)
-
-        lora_expand(buffer.unsqueeze(dim=0), [lora_b_stacked],
-                    y,
-                    *self.prompt_mapping_meta.meta_args(buffer.size(0)),
-                    add_inputs=True)
+
+        assert buffer is None, (
+            "To minimize overhead, the buffer should be created by "
+            ".add_lora_linear() instead of being passed in."
+        )
+        # We set the buffer to be float32 by default, refer to:
+        # https://github.com/triton-lang/triton/issues/1387
+        # Note: buffer is zeroed inside the shrink op
+        buffer = torch.empty((x.size(0), r), dtype=torch.float32, device=x.device)
+
+        lora_shrink(
+            x,
+            [lora_a_stacked],
+            buffer.unsqueeze(dim=0),
+            *self.prompt_mapping_meta.meta_args(x.size(0)),
+            scale,
+        )
+
+        lora_expand(
+            buffer.unsqueeze(dim=0),
+            [lora_b_stacked],
+            y,
+            *self.prompt_mapping_meta.meta_args(buffer.size(0)),
+            add_inputs=True,
+        )
         y = y.view_as(y_org)
+
+    def moe_lora_align_block_size(
+        self,
+        topk_ids: torch.Tensor,
+        num_tokens: int,
+        block_size: int,
+        num_experts: int,
+        max_loras: int,
+        adapter_enabled: torch.Tensor,
+        expert_map: torch.Tensor | None = None,
+        pad_sorted_ids: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Aligns tokens and experts into block-sized chunks for LoRA-based
+        mixture-of-experts (MoE) execution.
+        """
+        max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+        if pad_sorted_ids:
+            max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+        sorted_ids = torch.empty(
+            (max_loras * max_num_tokens_padded,),
+            dtype=torch.int32,
+            device=topk_ids.device,
+        )
+        max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+        # Expert ids must be set default to -1 to prevent a blank block
+        expert_ids = torch.empty(
+            (max_loras * max_num_m_blocks,),
+            dtype=torch.int32,
+            device=topk_ids.device,
+        )
+        num_tokens_post_pad = torch.empty(
+            (max_loras), dtype=torch.int32, device=topk_ids.device
+        )
+
+        (token_lora_mapping, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args(
+            num_tokens
+        )
+
+        ops.moe_lora_align_block_size(
+            topk_ids,
+            token_lora_mapping,
+            num_experts,
+            block_size,
+            max_loras,
+            max_num_tokens_padded,
+            max_num_m_blocks,
+            sorted_ids,
+            expert_ids,
+            num_tokens_post_pad,
+            adapter_enabled,
+            lora_ids,
+        )
+        if expert_map is not None:
+            expert_ids = expert_map[expert_ids]
+
+        return sorted_ids, expert_ids, num_tokens_post_pad
+
+    def add_lora_fused_moe(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: list[torch.Tensor],
+        lora_b_stacked: list[torch.Tensor],
+        topk_weights: torch.Tensor,
+        sorted_token_ids: torch.Tensor,
+        expert_ids: torch.Tensor,
+        num_tokens_post_padded: torch.Tensor,
+        max_lora_rank: int,
+        top_k_num: int,
+        shrink_config,
+        expand_config,
+        adapter_enabled: torch.Tensor,
+        mul_routed_weight=False,
+    ):
+        """
+        Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer.
+        """
+        (_, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args(x.size(0))
+        fused_moe_lora(
+            y,
+            x,
+            lora_a_stacked,
+            lora_b_stacked,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            max_lora_rank,
+            top_k_num,
+            lora_ids,
+            adapter_enabled,
+            shrink_config.get("BLOCK_SIZE_M", 64),
+            shrink_config.get("BLOCK_SIZE_N", 64),
+            shrink_config.get("BLOCK_SIZE_K", 32),
+            shrink_config.get("GROUP_SIZE_M", 8),
+            shrink_config.get("NUM_WARPS", 4),
+            shrink_config.get("NUM_STAGES", 3),
+            shrink_config.get("SPLIT_K", 1),
+            expand_config.get("BLOCK_SIZE_M", 64),
+            expand_config.get("BLOCK_SIZE_N", 64),
+            expand_config.get("BLOCK_SIZE_K", 32),
+            expand_config.get("GROUP_SIZE_M", 8),
+            expand_config.get("NUM_WARPS", 4),
+            expand_config.get("NUM_STAGES", 3),
+            expand_config.get("SPLIT_K", 1),
+            mul_routed_weight,
+        )
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index c684ac77cc9ca30ee7b4eab93d9abd19a74b6a51..d8763e913e3a5f2481296b510ee125a01694a873 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -3,7 +3,7 @@
 
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import resolve_obj_by_qualname
+from vllm.utils.import_utils import resolve_obj_by_qualname
 
 from .punica_base import PunicaWrapperBase
 
@@ -14,7 +14,8 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
     punica_wrapper_qualname = current_platform.get_punica_wrapper()
     punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname)
     punica_wrapper = punica_wrapper_cls(*args, **kwargs)
-    assert punica_wrapper is not None, \
+    assert punica_wrapper is not None, (
         "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
+    )
     logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1])
     return punica_wrapper
diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py
index 5896da516540bc47850539c50e2e2a3bafbc3652..090878dcd25469f1a34c798422c9efbeea31e577 100644
--- a/vllm/lora/punica_wrapper/punica_tpu.py
+++ b/vllm/lora/punica_wrapper/punica_tpu.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 import torch.nn.functional as F
@@ -25,27 +25,29 @@ class PunicaWrapperTPU(PunicaWrapperBase):
     Multi-LoRA, and to provide the interface for the pytorch punica ops.
     """
 
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str], **kwargs):
-        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
-                                   device)
+    def __init__(
+        self,
+        max_num_batched_tokens: int,
+        max_batches: int,
+        device: torch.device | str,
+        **kwargs,
+    ):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
 
         # PunicaWrapperBase defines some tensors with dtype=torch.int64, which
         # isn't supported by the TPU. So convert those tensors to int32.
         # Not all of them are used by the TPU so only convert the useful ones.
-        self._token_lora_indices = self._token_lora_indices.to(
-            dtype=torch.int32)
+        self._token_lora_indices = self._token_lora_indices.to(dtype=torch.int32)
         self._sampler_indices = self._sampler_indices.to(dtype=torch.int32)
         self._sampler_indices_padded = self._sampler_indices_padded.to(
-            dtype=torch.int32)
+            dtype=torch.int32
+        )
 
         torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True)
         torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded,
-                                               True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded, True)
         torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch,
-                                               True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch, True)
 
         torch._dynamo.mark_dynamic(self._token_lora_indices, 0)
         torch._dynamo.mark_dynamic(self._embeddings_indices, 1)
@@ -77,21 +79,38 @@ class PunicaWrapperTPU(PunicaWrapperBase):
     ):
         return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale)
 
-    def expand(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor,
-               add_inputs: bool):
-        return bgmv_expand(x, w_t_all, y, self._get_token_lora_indices(x),
-                           add_inputs)
-
-    def expand_slice(self, y: torch.Tensor, x: torch.Tensor,
-                     w_t_all: torch.Tensor, y_offset: int, y_slice_size: int,
-                     add_inputs: bool) -> torch.Tensor:
-        return bgmv_expand_slice(x, w_t_all, y,
-                                 self._get_token_lora_indices(x), y_offset,
-                                 y_slice_size, add_inputs)
-
-    def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor],
-                   x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...],
-                   scale: float, **kwargs) -> Optional[torch.Tensor]:
+    def expand(
+        self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, add_inputs: bool
+    ):
+        return bgmv_expand(x, w_t_all, y, self._get_token_lora_indices(x), add_inputs)
+
+    def expand_slice(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ) -> torch.Tensor:
+        return bgmv_expand_slice(
+            x,
+            w_t_all,
+            y,
+            self._get_token_lora_indices(x),
+            y_offset,
+            y_slice_size,
+            add_inputs,
+        )
+
+    def add_shrink(
+        self,
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ) -> torch.Tensor | None:
         """
         Performs GEMM for multiple slices of lora_a.
 
@@ -115,31 +134,29 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             y[slice_idx, :, :] = y_s  # type: ignore[index]
         return y
 
-    def add_expand(self,
-                   y: torch.Tensor,
-                   x: Union[tuple[torch.Tensor, ...], torch.Tensor],
-                   lora_b_stacked: tuple[torch.Tensor, ...],
-                   lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                   output_slices: tuple[int, ...],
-                   offset_start: int = 0,
-                   add_inputs=True,
-                   **kwargs) -> torch.Tensor:
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        output_slices: tuple[int, ...],
+        offset_start: int = 0,
+        add_inputs=True,
+        **kwargs,
+    ) -> torch.Tensor:
         """
-        Performs GEMM and bias addition for multiple slices of lora_b.
+        Performs GEMM for multiple slices of lora_b.
 
         Semantics:
             for i in range(len(lora_b_stacked)):
                 slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] +
-                    lora_bias_stacked[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                 offset += slice
 
         Args:
             y (torch.Tensor): Output tensor.
             x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]):
-                bias's weight
             output_slices (tuple[int, ...]): Every slice's size
             add_inputs (bool):  Defaults to True.
         """
@@ -147,25 +164,26 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         y = y.view(-1, y.shape[-1])
         offset_left = 0
 
-        if lora_bias_stacked is not None:
-            y = self._apply_bias(self._get_token_lora_indices(y), y,
-                                 output_slices, lora_bias_stacked)
         for slice_idx in range(len(lora_b_stacked)):
-            y = self.expand_slice(y,
-                                  x[slice_idx],
-                                  lora_b_stacked[slice_idx],
-                                  offset_left,
-                                  output_slices[slice_idx],
-                                  add_inputs=add_inputs)
+            y = self.expand_slice(
+                y,
+                x[slice_idx],
+                lora_b_stacked[slice_idx],
+                offset_left,
+                output_slices[slice_idx],
+                add_inputs=add_inputs,
+            )
             offset_left += output_slices[slice_idx]
         return y.view_as(y_org)
 
-    def add_lora_embedding(self,
-                           y: torch.Tensor,
-                           x: torch.Tensor,
-                           lora_b_stacked: torch.Tensor,
-                           add_inputs: bool = True,
-                           **kwargs) -> torch.Tensor:
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        add_inputs: bool = True,
+        **kwargs,
+    ) -> torch.Tensor:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
 
@@ -182,17 +200,18 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         # Embedding layer only needs the expand op
         return self.expand(y, x, lora_b_stacked, add_inputs)
 
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
-                        **kwargs) -> torch.Tensor:
+    def add_lora_linear(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        output_slices: tuple[int, ...],
+        *,
+        buffer: tuple[torch.Tensor, ...] | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
         """
         Applicable to linear-related lora.
 
@@ -203,24 +222,19 @@ class PunicaWrapperTPU(PunicaWrapperBase):
                     @ lora_a_stacked[indices[i], layer_idx, :, :]
                     @ lora_b_stacked[indices[i], layer_idx, :, :]
                     * scale
-                    ).squeeze(0)+lora_bias_stacked[i]
+                    ).squeeze(0)
 
         Args:
             y (torch.Tensor): Output tensor. Will not be changed in-place.
             x (torch.Tensor): Input tensor (T, E)
             lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias.
             scale (float): Scaling factor.
             output_slices (tuple[int, ...]): Every slice's size.
             buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
         """
 
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
-        if lora_bias_stacked is not None:
-            assert len(lora_bias_stacked) == len(output_slices)
-            y = self._apply_bias(self._get_token_lora_indices(y), y,
-                                 output_slices, lora_bias_stacked)
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
@@ -231,23 +245,21 @@ class PunicaWrapperTPU(PunicaWrapperBase):
                 device=x.device,
             )
         buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
-        return self.add_expand(y,
-                               buffer,
-                               lora_b_stacked,
-                               None,
-                               output_slices,
-                               add_inputs=True,
-                               **kwargs)
-
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> torch.Tensor:
+        return self.add_expand(
+            y, buffer, lora_b_stacked, output_slices, add_inputs=True, **kwargs
+        )
+
+    def add_lora_logits(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        scale,
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
         """
         Applies lora specifically for LogitsProcessorWithLoRA.
 
@@ -269,55 +281,15 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
         sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
         buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale)
-        y = bgmv_expand(buffer,
-                        lora_b_stacked,
-                        y,
-                        sampler_indices,
-                        add_inputs=True)
+        y = bgmv_expand(buffer, lora_b_stacked, y, sampler_indices, add_inputs=True)
         return y.view_as(y_org)
 
-    def _apply_bias(
-        self,
-        indices: torch.Tensor,
-        output: torch.Tensor,
-        output_slices: tuple[int, ...],
-        lora_bias_stacked: tuple[Optional[torch.Tensor], ...],
-    ):
-        """Applies bias to output
-
-        Input shapes:
-            lora_bias_stacked:      3 element tuple of (num_loras, output_dim)
-            indices:           (batch_size)
-            output:            (batch_size, q_slice_size + 2*kv_slice_size)
-            output_slices:     n-1 element tuple of (slice_size...),
-                            where n is number of slices
-        """
-        org_output = output
-        output = output.view(-1, output.shape[-1])
-        indices = indices.view(-1)
-
-        offset_left = 0
-        for slice_idx, slice in enumerate(output_slices):
-            bias = lora_bias_stacked[slice_idx]
-            if bias is not None:
-                bias = bias.view(-1, bias.shape[-1])
-                bias = bias[indices]
-                bias = torch.where(indices[:, None] == -1, 0, bias)
-
-                bias = F.pad(bias, (offset_left, output.shape[1] -
-                                    (offset_left + slice), 0, 0))
-
-                output += bias
-            offset_left += slice
-
-        return output.view_as(org_output)
-
     # This performs the same tensor ops as the base method, except it does them
     # on the CPU then transfers the results to the TPU
     def _update_base_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -328,8 +300,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         # Pad the prompt mapping to avoid running into recompiles on the TPU
         # TODO: Should this happen inside mapping internally? If so how can we
         # avoid having backend specific LoRAMapping classes?
-        mapping.prompt_mapping = self._pad_prompt_mapping(
-            mapping.prompt_mapping)
+        mapping.prompt_mapping = self._pad_prompt_mapping(mapping.prompt_mapping)
 
         (
             base_indices,
@@ -346,35 +317,33 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             "cpu",
         )
         self._token_lora_indices = self._pad_to_shape(
-            base_indices, self._token_lora_indices.shape,
-            dims=1).to(self.device)
-        self._sampler_indices = self._pad_to_shape(sampler_indices,
-                                                   self._sampler_indices.shape,
-                                                   dims=1).to(self.device)
+            base_indices, self._token_lora_indices.shape, dims=1
+        ).to(self.device)
+        self._sampler_indices = self._pad_to_shape(
+            sampler_indices, self._sampler_indices.shape, dims=1
+        ).to(self.device)
         self._sampler_indices_padded = self._pad_to_shape(
-            sampler_indices_padded, self._sampler_indices_padded.shape,
-            dims=1).to(self.device)
+            sampler_indices_padded, self._sampler_indices_padded.shape, dims=1
+        ).to(self.device)
         self._embeddings_indices = self._pad_to_shape(
-            embeddings_indices, self._embeddings_indices.shape,
-            dims=2).to(self.device)
+            embeddings_indices, self._embeddings_indices.shape, dims=2
+        ).to(self.device)
         self.indices_len[:] = indices_len
 
-    def _update_prefill_metadata(self,
-                                 token_lora_tensor: torch.Tensor) -> None:
+    def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None:
         self.batch_size = 1
-        self._lora_indices_per_batch[:self.
-                                     batch_size] = token_lora_tensor[:self.
-                                                                     batch_size]
+        self._lora_indices_per_batch[: self.batch_size] = token_lora_tensor[
+            : self.batch_size
+        ]
 
-    def _pad_prompt_mapping(
-            self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]:
+    def _pad_prompt_mapping(self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]:
         num_reqs = len(prompt_mapping)
 
         # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular
         # import
         MIN_NUM_SEQS = 8
 
-        padded_num_reqs = max(2**math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS)
+        padded_num_reqs = max(2 ** math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS)
         pad_len = padded_num_reqs - num_reqs
 
         padding = [-1] * pad_len
@@ -387,5 +356,4 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         else:
             pad_rows = target_shape[0] - src.shape[0]
             pad_cols = target_shape[1] - src.shape[1]
-            return F.pad(src, (0, pad_cols, 0, pad_rows),
-                         value=0).to(torch.int32)
+            return F.pad(src, (0, pad_cols, 0, pad_rows), value=0).to(torch.int32)
diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py
index 163bb412235ce28616347722546648a5f35d4e02..b95087d0ff8341079a9af0477a77c325a0926e57 100644
--- a/vllm/lora/punica_wrapper/punica_xpu.py
+++ b/vllm/lora/punica_wrapper/punica_xpu.py
@@ -2,12 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
-from typing import Optional, Union, final
+from typing import final
 
 import torch
 
@@ -21,25 +21,35 @@ from .punica_base import PunicaWrapperBase
 class PunicaWrapperXPU(PunicaWrapperBase):
     """
     PunicaWrapperXPU is designed to manage and provide metadata for the punica
-    kernel. The main function is to maintain the state information for 
+    kernel. The main function is to maintain the state information for
     Multi-LoRA, and to provide the interface for the punica ipex kernel.
     """
 
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str], **kwargs):
-        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
-                                   device)
+    def __init__(
+        self,
+        max_num_batched_tokens: int,
+        max_batches: int,
+        device: torch.device | str,
+        **kwargs,
+    ):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
         torch._dynamo.mark_dynamic(self._token_lora_indices, 0)
         torch._dynamo.mark_dynamic(self._embeddings_indices, 1)
         torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0)
 
-    def update_metadata(self, mapping: LoRAMapping,
-                        lora_index_to_id: list[Optional[int]], max_loras: int,
-                        vocab_size: int, extra_vocab_size: int, **kwargs):
-
+    def update_metadata(
+        self,
+        mapping: LoRAMapping,
+        lora_index_to_id: list[int | None],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        **kwargs,
+    ):
         self.is_prefill = mapping.is_prefill
-        self._update_base_metadata(mapping, lora_index_to_id, max_loras,
-                                   vocab_size, extra_vocab_size)
+        self._update_base_metadata(
+            mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size
+        )
 
     def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor:
         return torch.narrow(self._token_lora_indices, 0, 0, x.size(0))
@@ -63,19 +73,25 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         add_inputs: bool,
     ):
         token_lora_indices = self._get_token_lora_indices(x)
-        bgmv_expand_slice(x, w_t_all, y, token_lora_indices, y_offset,
-                          y_slice_size, add_inputs)
+        bgmv_expand_slice(
+            x, w_t_all, y, token_lora_indices, y_offset, y_slice_size, add_inputs
+        )
 
-    def add_shrink(self, y: torch.Tensor, x: torch.Tensor,
-                   lora_a_stacked: tuple[torch.Tensor,
-                                         ...], scale: float, **kwargs):
+    def add_shrink(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ):
         """
         Performs GEMM  for multiple slices of lora_a.
-            
+
         Semantics:
         for i in range(len(lora_a_stacked)):
             y[i] += (x @ lora_a_stacked[i]) * scale
-        
+
         Args:
             y (torch.Tensor): Output tensors
             x (torch.Tensor): Input tensor
@@ -85,43 +101,36 @@ class PunicaWrapperXPU(PunicaWrapperBase):
 
         x = x.view(-1, x.shape[-1])
         for slice_idx in range(len(lora_a_stacked)):
-            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
-                               scale)
-
-    def add_expand(self,
-                   y: torch.Tensor,
-                   x: torch.Tensor,
-                   lora_b_stacked: tuple[torch.Tensor, ...],
-                   lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                   output_slices: tuple[int, ...],
-                   offset_start: int = 0,
-                   add_inputs=True,
-                   **kwargs) -> None:
+            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale)
+
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        output_slices: tuple[int, ...],
+        offset_start: int = 0,
+        add_inputs=True,
+        **kwargs,
+    ) -> None:
         """
-        Performs GEMM and bias addition for multiple slices of lora_b.
-      
+        Performs GEMM for multiple slices of lora_b.
+
         Semantics:
             for i in range(len(lora_b_stacked)):
                 slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
-                    lora_bias_stacked[i] 
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                 offset += slice
-            
+
         Args:
             y (torch.Tensor): Output tensor.
             x (torch.Tensor): Input tensors
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): 
-                bias's weight
             output_slices (tuple[int, ...]): Every slice's size
             add_inputs (bool): Defaults to True.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
-        if lora_bias_stacked is not None:
-            token_lora_indices = self._get_token_lora_indices(y)
-            self._apply_bias(token_lora_indices, y, output_slices,
-                             lora_bias_stacked)
 
         assert x.ndim == 3
         assert x.size(0) == len(output_slices)
@@ -139,12 +148,14 @@ class PunicaWrapperXPU(PunicaWrapperBase):
             offset_start += output_slices[slice_idx]
         y.view_as(y_org)
 
-    def add_lora_embedding(self,
-                           y: torch.Tensor,
-                           x: torch.Tensor,
-                           lora_b_stacked: torch.Tensor,
-                           add_inputs: bool = True,
-                           **kwargs) -> None:
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        add_inputs: bool = True,
+        **kwargs,
+    ) -> None:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
 
@@ -160,17 +171,18 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         token_lora_indices = self._get_token_lora_indices(x)
         bgmv_expand(x, lora_b_stacked, y, token_lora_indices, add_inputs)
 
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> None:
+    def add_lora_linear(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        scale: float,
+        output_slices: tuple[int, ...],
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> None:
         """
         Applicable to linear-related lora.
 
@@ -181,25 +193,19 @@ class PunicaWrapperXPU(PunicaWrapperBase):
                     @ lora_a_stacked[indices[i], layer_idx, :, :]
                     @ lora_b_stacked[indices[i], layer_idx, :, :]
                     * scale
-                    ).squeeze(0)+lora_bias_stacked[i]
+                    ).squeeze(0)
 
         Args:
             y (torch.Tensor): Output tensor. Will be changed in-place.
             x (torch.Tensor): Input tensor
             lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
             lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
-            lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias.
             scale (float): Scaling factor.
             output_slices (tuple[int, ...]): Every slice's size.
             buffer (Optional[torch.Tensor]): Defaults to None.
         """
 
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
-        if lora_bias_stacked is not None:
-            assert len(lora_bias_stacked) == len(output_slices)
-            token_lora_indices = self._get_token_lora_indices(y)
-            y = self._apply_bias(token_lora_indices, y, output_slices,
-                                 lora_bias_stacked)
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
@@ -215,15 +221,16 @@ class PunicaWrapperXPU(PunicaWrapperBase):
             x,
             lora_a_stacked,
             scale,
-            **kwargs)
+            **kwargs,
+        )
         self.add_expand(
             y,
             buffer,  # type: ignore
             lora_b_stacked,
-            None,
             output_slices,
             add_inputs=True,
-            **kwargs)
+            **kwargs,
+        )
 
     @property
     def sampler_indices_padded(self) -> torch.Tensor:
@@ -232,18 +239,20 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         """
         return self._sampler_indices_padded[:]
 
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> None:
+    def add_lora_logits(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        scale,
+        *,
+        buffer: torch.Tensor | None = None,
+        **kwargs,
+    ) -> None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
-        
+
         Semantics:
             buffer = (x @ lora_a_stacked) * scale
             y += buffer @ lora_b_stacked
@@ -263,14 +272,8 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         if buffer is None:
             # We set the buffer to be float32 by default, refer to:
             # https://github.com/triton-lang/triton/issues/1387
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
+            buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device)
         sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
         bgmv_shrink(x, lora_a_stacked, buffer, sampler_indices, scale)
-        bgmv_expand(buffer,
-                    lora_b_stacked,
-                    y,
-                    sampler_indices,
-                    add_inputs=True)
+        bgmv_expand(buffer, lora_b_stacked, y, sampler_indices, add_inputs=True)
         return y.view_as(y_org)
diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py
index d22c29da1c615e879730f3a966ac5ca1274296f5..584745f86b1a82e262762585f32ac17f2a79216e 100644
--- a/vllm/lora/punica_wrapper/utils.py
+++ b/vllm/lora/punica_wrapper/utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
 
 
 def compute_meta(
-    token_lora_tensor: torch.Tensor
+    token_lora_tensor: torch.Tensor,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]:
     """
     Get the information required for the sgmv kernel. With the  features:
@@ -23,7 +23,8 @@ def compute_meta(
     """
 
     lora_indices_tensor, seq_length_tensor = torch.unique_consecutive(
-        token_lora_tensor, return_counts=True)
+        token_lora_tensor, return_counts=True
+    )
     cum_result = torch.cumsum(seq_length_tensor, dim=0)
     b_seq_start_tensor = torch.zeros_like(seq_length_tensor)
     b_seq_start_tensor[1:].copy_(cum_result[:-1])
@@ -36,14 +37,21 @@ def compute_meta(
     # does not need to launch the triton kernel, which can improve performance
     if batch_size == 1 and lora_indices_tensor == -1:
         no_lora = True
-    return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
-            batch_size, max_length, token_nums, no_lora)
+    return (
+        b_seq_start_tensor,
+        seq_length_tensor,
+        lora_indices_tensor,
+        batch_size,
+        max_length,
+        token_nums,
+        no_lora,
+    )
 
 
 # TODO see if this can be vectorized
 def convert_mapping(
     mapping: "LoRAMapping",
-    lora_index_to_id: list[Optional[int]],
+    lora_index_to_id: list[int | None],
     max_loras: int,
     vocab_size: int,
     extra_vocab_size: int,
@@ -83,41 +91,47 @@ def convert_mapping(
     lora_indices = index_mapping_indices.copy()
 
     prompt_mapping: list[int] = [
-        lora_index_to_id.index(x) if x > 0 else -1
-        for x in mapping.prompt_mapping
+        lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping
     ]
     lora_idx = None
     for i in range(len(index_mapping_indices)):
         # TODO index can be slow. optimize
-        lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
-                    if index_mapping_indices[i] > 0 else -1)
+        lora_idx = (
+            lora_index_to_id.index(index_mapping_indices[i])
+            if index_mapping_indices[i] > 0
+            else -1
+        )
         embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
         lora_indices[i] = lora_idx
 
-    indices_list: list[Union[list[int], torch.Tensor]] = [
+    indices_list: list[list[int] | torch.Tensor] = [
         index_mapping_indices,
         lora_indices,
         embedding_indices,
     ]
 
     indices = torch.tensor(indices_list, dtype=torch.long, device=device)
-    prompt_mapping_tensor = torch.tensor(prompt_mapping,
-                                         dtype=torch.long,
-                                         device=device)
-    embeddings_indices = torch.stack([
-        indices[2] * extra_vocab_size,
-        indices[2] * (vocab_size + extra_vocab_size),
-    ])
-    embeddings_indices = torch.where(embeddings_indices == -1, max_loras - 1,
-                                     embeddings_indices)
+    prompt_mapping_tensor = torch.tensor(
+        prompt_mapping, dtype=torch.long, device=device
+    )
+    embeddings_indices = torch.stack(
+        [
+            indices[2] * extra_vocab_size,
+            indices[2] * (vocab_size + extra_vocab_size),
+        ]
+    )
+    embeddings_indices = torch.where(
+        embeddings_indices == -1, max_loras - 1, embeddings_indices
+    )
     base_indices = indices[1]
     sampler_indices = prompt_mapping_tensor
     sampler_indices_padded = sampler_indices.clone()
-    sampler_indices_padded = torch.where(sampler_indices_padded == -1,
-                                         max_loras - 1, sampler_indices_padded)
+    sampler_indices_padded = torch.where(
+        sampler_indices_padded == -1, max_loras - 1, sampler_indices_padded
+    )
     sampler_indices_padded = torch.arange(
-        0, len(sampler_indices_padded), device=device, dtype=torch.long) + (
-            sampler_indices_padded * len(sampler_indices_padded))
+        0, len(sampler_indices_padded), device=device, dtype=torch.long
+    ) + (sampler_indices_padded * len(sampler_indices_padded))
 
     # Contain length of indices tensors. Used to index into each tensor.
     indices_len = [
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 523525d46f0b3f1d68dfaf310c54c9efa2bc9777..c97e435e32165f2d0f13ca580f549a84a2f11768 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -2,15 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from typing import Optional
 
 import msgspec
 
 
 class LoRARequest(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        array_like=True):  # type: ignore[call-arg]
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    array_like=True,
+):  # type: ignore[call-arg]
     """
     Request for a LoRA adapter.
 
@@ -22,13 +22,14 @@ class LoRARequest(
     lora_int_id must be globally unique for a given adapter.
     This is currently not enforced in vLLM.
     """
+
     lora_name: str
     lora_int_id: int
     lora_path: str = ""
-    lora_local_path: Optional[str] = msgspec.field(default=None)
-    long_lora_max_len: Optional[int] = None
-    base_model_name: Optional[str] = msgspec.field(default=None)
-    tensorizer_config_dict: Optional[dict] = None
+    lora_local_path: str | None = msgspec.field(default=None)
+    long_lora_max_len: int | None = None
+    base_model_name: str | None = msgspec.field(default=None)
+    tensorizer_config_dict: dict | None = None
 
     def __post_init__(self):
         if self.lora_int_id < 1:
@@ -39,7 +40,8 @@ class LoRARequest(
                 "and will be removed in a future version. "
                 "Please use 'lora_path' instead.",
                 DeprecationWarning,
-                stacklevel=2)
+                stacklevel=2,
+            )
             if not self.lora_path:
                 self.lora_path = self.lora_local_path or ""
 
@@ -65,7 +67,8 @@ class LoRARequest(
             "and will be removed in a future version. "
             "Please use 'path' instead.",
             DeprecationWarning,
-            stacklevel=2)
+            stacklevel=2,
+        )
         return self.lora_path
 
     @local_path.setter
@@ -75,7 +78,8 @@ class LoRARequest(
             "and will be removed in a future version. "
             "Please use 'path' instead.",
             DeprecationWarning,
-            stacklevel=2)
+            stacklevel=2,
+        )
         self.lora_path = value
 
     def __eq__(self, value: object) -> bool:
@@ -84,8 +88,7 @@ class LoRARequest(
         instances based on lora_name. This allows for identification
         and comparison lora adapter across engines.
         """
-        return isinstance(value,
-                          self.__class__) and self.lora_name == value.lora_name
+        return isinstance(value, self.__class__) and self.lora_name == value.lora_name
 
     def __hash__(self) -> int:
         """
diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py
index 5808ae105e86459eaf7c327f200d0ec2153998e9..bcfe26467cfb4cd5473f0cbe8ad5b347dd8afd41 100644
--- a/vllm/lora/resolver.py
+++ b/vllm/lora/resolver.py
@@ -4,7 +4,6 @@
 from abc import ABC, abstractmethod
 from collections.abc import Set
 from dataclasses import dataclass, field
-from typing import Optional
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -22,8 +21,9 @@ class LoRAResolver(ABC):
     """
 
     @abstractmethod
-    async def resolve_lora(self, base_model_name: str,
-                           lora_name: str) -> Optional[LoRARequest]:
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> LoRARequest | None:
         """Abstract method to resolve and fetch a LoRA model adapter.
 
         Implements logic to locate and download LoRA adapter based on the name.
@@ -61,8 +61,10 @@ class _LoRAResolverRegistry:
         if resolver_name in self.resolvers:
             logger.warning(
                 "LoRA resolver %s is already registered, and will be "
-                "overwritten by the new resolver instance %s.", resolver_name,
-                resolver)
+                "overwritten by the new resolver instance %s.",
+                resolver_name,
+                resolver,
+            )
 
         self.resolvers[resolver_name] = resolver
 
@@ -78,7 +80,8 @@ class _LoRAResolverRegistry:
         if resolver_name not in self.resolvers:
             raise KeyError(
                 f"LoRA resolver '{resolver_name}' not found. "
-                f"Available resolvers: {list(self.resolvers.keys())}")
+                f"Available resolvers: {list(self.resolvers.keys())}"
+            )
         return self.resolvers[resolver_name]
 
 
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 10ba390bffd9e2a60e3506ee32af751544e13fd1..0f43ff06d8f2b2e961c0c4642fffd5b9e3a1430d 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -2,41 +2,47 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional
 
 import huggingface_hub
 import regex as re
-from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
-                                   HFValidationError, RepositoryNotFoundError)
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    HfHubHTTPError,
+    HFValidationError,
+    RepositoryNotFoundError,
+)
 from torch import nn
 from transformers import PretrainedConfig
 
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
+
 # being imported for _all_lora_classes below
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              ColumnParallelLinearWithShardedLoRA,
-                              LogitsProcessorWithLoRA,
-                              MergedColumnParallelLinearWithLoRA,
-                              MergedColumnParallelLinearWithShardedLoRA,
-                              MergedQKVParallelLinearWithLoRA,
-                              MergedQKVParallelLinearWithShardedLoRA,
-                              QKVParallelLinearWithLoRA,
-                              QKVParallelLinearWithShardedLoRA,
-                              ReplicatedLinearWithLoRA,
-                              RowParallelLinearWithLoRA,
-                              RowParallelLinearWithShardedLoRA,
-                              VocabParallelEmbeddingWithLoRA)
+from vllm.lora.layers import (
+    BaseLayerWithLoRA,
+    ColumnParallelLinearWithLoRA,
+    ColumnParallelLinearWithShardedLoRA,
+    FusedMoEWithLoRA,
+    LogitsProcessorWithLoRA,
+    MergedColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithShardedLoRA,
+    MergedQKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithShardedLoRA,
+    QKVParallelLinearWithLoRA,
+    QKVParallelLinearWithShardedLoRA,
+    ReplicatedLinearWithLoRA,
+    RowParallelLinearWithLoRA,
+    RowParallelLinearWithShardedLoRA,
+    VocabParallelEmbeddingWithLoRA,
+)
+from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase
-
-# yapf: enable
+from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping
 
 if TYPE_CHECKING:
     from vllm.model_executor.layers.logits_processor import LogitsProcessor
-    from vllm.model_executor.layers.vocab_parallel_embedding import (
-        ParallelLMHead)
+    from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.models.utils import WeightsMapper
 
 logger = init_logger(__name__)
@@ -55,23 +61,35 @@ _all_lora_classes: set[type[BaseLayerWithLoRA]] = {
     MergedColumnParallelLinearWithShardedLoRA,
     MergedQKVParallelLinearWithShardedLoRA,
     RowParallelLinearWithShardedLoRA,
+    FusedMoEWithLoRA,
 }
 
 
-def from_layer(layer: nn.Module,
-               max_loras: int,
-               lora_config: LoRAConfig,
-               packed_modules_list: list,
-               model_config: Optional[PretrainedConfig] = None) -> nn.Module:
+def is_moe_model(model: nn.Module) -> bool:
+    """Checks if the model contains FusedMoE layers and warns the user."""
+    if any(isinstance(module, FusedMoE) for module in model.modules()):
+        logger.info_once("MoE model detected. Using fused MoE LoRA implementation.")
+        return True
+    return False
+
+
+def from_layer(
+    layer: nn.Module,
+    max_loras: int,
+    lora_config: LoRAConfig,
+    packed_modules_list: list,
+    model_config: PretrainedConfig | None = None,
+) -> nn.Module:
     for lora_cls in _all_lora_classes:
         # specifying kwargs so they can be easily accessed in decorator
-        if lora_cls.can_replace_layer(source_layer=layer,
-                                      lora_config=lora_config,
-                                      packed_modules_list=packed_modules_list,
-                                      model_config=model_config):
+        if lora_cls.can_replace_layer(
+            source_layer=layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+        ):
             instance_layer = lora_cls(layer)
-            instance_layer.create_lora_weights(max_loras, lora_config,
-                                               model_config)
+            instance_layer.create_lora_weights(max_loras, lora_config, model_config)
             return instance_layer
     return layer
 
@@ -81,17 +99,22 @@ def from_layer_logits_processor(
     lm_head: "ParallelLMHead",
     max_loras: int,
     lora_config: LoRAConfig,
-    model_config: Optional[PretrainedConfig] = None,
+    model_config: PretrainedConfig | None = None,
 ) -> LogitsProcessorWithLoRA:
-    ret = LogitsProcessorWithLoRA(layer, lm_head.embedding_dim,
-                                  lm_head.weight.dtype, lm_head.weight.device,
-                                  lm_head.get_sharded_to_full_mapping())
+    ret = LogitsProcessorWithLoRA(
+        layer,
+        lm_head.embedding_dim,
+        lm_head.weight.dtype,
+        lm_head.weight.device,
+        lm_head.get_sharded_to_full_mapping(),
+    )
     ret.create_lora_weights(max_loras, lora_config, model_config)
     return ret
 
 
-def replace_submodule(model: nn.Module, module_name: str,
-                      new_module: nn.Module) -> nn.Module:
+def replace_submodule(
+    model: nn.Module, module_name: str, new_module: nn.Module
+) -> nn.Module:
     """Replace a submodule in a model with a new module."""
     parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
     target_name = module_name.split(".")[-1]
@@ -100,9 +123,8 @@ def replace_submodule(model: nn.Module, module_name: str,
 
 
 def parse_fine_tuned_lora_name(
-    name: str,
-    weights_mapper: Optional["WeightsMapper"] = None
-) -> tuple[str, bool, bool]:
+    name: str, weights_mapper: Optional["WeightsMapper"] = None
+) -> tuple[str, bool]:
     """Parse the name of lora weights.
 
     args:
@@ -114,7 +136,6 @@ def parse_fine_tuned_lora_name(
         tuple(module_name, is_lora_a):
             module_name: the name of the module, e.g. model.dense1,
             is_lora_a whether the tensor is lora_a or lora_b.
-            is_bias whether the tensor is lora bias.
     """
 
     # LoRA weight qualified name usually starts with `base_model.model.`,
@@ -134,28 +155,24 @@ def parse_fine_tuned_lora_name(
     start_index = 2 if name.startswith("base_model.model.") else 0
 
     parts = name.split(".")
-    if parts[-1] == "weight" and (parts[-2] == "lora_A"
-                                  or parts[-2] == "lora_B"):
+    if parts[-1] == "weight" and (parts[-2] == "lora_A" or parts[-2] == "lora_B"):
         new_name = ".".join(parts[start_index:-2])
-        return new_name, parts[-2] == "lora_A", False
+        return new_name, parts[-2] == "lora_A"
 
     if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
         new_name = ".".join(parts[start_index:-1])
-        return new_name, parts[-1] == "lora_embedding_A", False
-
-    if parts[-1] == "bias":
-        new_name = ".".join(parts[start_index:-2])
-        return new_name, False, True
+        return new_name, parts[-1] == "lora_embedding_A"
 
     raise ValueError(f"{name} is unsupported LoRA weight")
 
 
-def is_regex_target_modules(load_modules: Union[str, list[str]],
-                            expected_lora_modules: list[str]) -> bool:
+def is_regex_target_modules(
+    load_modules: str | list[str], expected_lora_modules: list[str]
+) -> bool:
     """
-    PEFT supports passing `target_modules` in the form of regular expressions, 
-    such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to 
-    determine whether the suffix in the regular expression is present in the 
+    PEFT supports passing `target_modules` in the form of regular expressions,
+    such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to
+    determine whether the suffix in the regular expression is present in the
     `expected_lora_modules`.
     """
 
@@ -197,7 +214,10 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]:
                 supported_lora_modules.add(name)
 
         # get all the linear subfixes.
-        if isinstance(module, (LinearBase, )):
+        if isinstance(module, (LinearBase,)):
+            supported_lora_modules.add(name.split(".")[-1])
+
+        if isinstance(module, (FusedMoE,)):
             supported_lora_modules.add(name.split(".")[-1])
 
     return list(supported_lora_modules)
@@ -225,7 +245,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
         return lora_path
 
     # If the path starts with ~, expand the user home directory.
-    if lora_path.startswith('~'):
+    if lora_path.startswith("~"):
         return os.path.expanduser(lora_path)
 
     # Check if the expanded relative path exists locally.
@@ -234,13 +254,40 @@ def get_adapter_absolute_path(lora_path: str) -> str:
 
     # If the path does not exist locally, assume it's a Hugging Face repo.
     try:
-        local_snapshot_path = huggingface_hub.snapshot_download(
-            repo_id=lora_path)
-    except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
-            HFValidationError):
+        local_snapshot_path = huggingface_hub.snapshot_download(repo_id=lora_path)
+    except (
+        HfHubHTTPError,
+        RepositoryNotFoundError,
+        EntryNotFoundError,
+        HFValidationError,
+    ):
         # Handle errors that may occur during the download
         # Return original path instead of throwing error here
         logger.exception("Error downloading the HuggingFace model")
         return lora_path
 
     return local_snapshot_path
+
+
+def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]:
+    if is_moe_model(model):
+        if moe_packed_mapping := get_moe_expert_mapping(model):
+            # This method generates and returns a dictionary mapping packed module
+            # names to lists of their corresponding submodule names. It includes
+            # both static mappings and dynamic mappings for expert layers, where
+            # the expert indices are expanded based on the configured number
+            # of routed experts.
+            packed_modules_mapping = get_packed_modules_mapping(model)
+
+            packed_modules_mapping["experts"] = [
+                weight_name.rstrip(".") for _, weight_name, _, _ in moe_packed_mapping
+            ]
+
+            return packed_modules_mapping
+        else:
+            raise AttributeError(
+                "To support LoRA for MoE model, "
+                "'get_expert_mapping' must be implemented"
+            )
+    else:
+        return get_packed_modules_mapping(model)
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index cdb2f86611d81f29046f50fcb6faec06761e962a..b85151f2c7592a6ee5bf41a97a04adcfc4b694df 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -2,14 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 import torch
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.lora.models import (LoRAModel, LoRAModelManager,
-                              LRUCacheLoRAModelManager, create_lora_manager)
+from vllm.lora.models import (
+    LoRAModel,
+    LoRAModelManager,
+    LRUCacheLoRAModelManager,
+    create_lora_manager,
+)
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
@@ -36,10 +40,11 @@ class WorkerLoRAManager:
         self._lora_model_cls = lora_model_cls
         self.embedding_modules = embedding_modules
         self.embedding_padding_modules = embedding_padding_modules
-        self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
+        self._cached_dummy_lora: None | Literal[False] | LoRAModel = False
         self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
         self.max_num_batched_tokens = (
-            vllm_config.scheduler_config.max_num_batched_tokens)
+            vllm_config.scheduler_config.max_num_batched_tokens
+        )
         self.vocab_size = vllm_config.model_config.get_vocab_size()
         self.lora_config = vllm_config.lora_config
 
@@ -81,24 +86,24 @@ class WorkerLoRAManager:
 
     def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
         try:
-            supported_lora_modules = (
-                self._adapter_manager.supported_lora_modules)
-            packed_modules_mapping = (
-                self._adapter_manager.packed_modules_mapping)
+            supported_lora_modules = self._adapter_manager.supported_lora_modules
+            packed_modules_mapping = self._adapter_manager.packed_modules_mapping
             expected_lora_modules: list[str] = []
             for module in supported_lora_modules:
                 if module in packed_modules_mapping:
-                    expected_lora_modules.extend(
-                        packed_modules_mapping[module])
+                    expected_lora_modules.extend(packed_modules_mapping[module])
                 else:
                     expected_lora_modules.append(module)
-
+                if module == "experts":
+                    expected_lora_modules.append(module)
             expected_lora_modules = list(set(expected_lora_modules))
             lora_path = get_adapter_absolute_path(lora_request.lora_path)
 
             peft_helper = PEFTHelper.from_local_dir(
-                lora_path, self.max_position_embeddings,
-                lora_request.tensorizer_config_dict)
+                lora_path,
+                self.max_position_embeddings,
+                lora_request.tensorizer_config_dict,
+            )
 
             # Validates the LoRA configuration against requirements before
             # loading weights, throwing an exception if validation fails.
@@ -116,12 +121,13 @@ class WorkerLoRAManager:
                 lora_model_id=lora_request.lora_int_id,
                 device="cpu",
                 dtype=self.lora_config.lora_dtype,
-                target_embedding_padding=self.vocab_size +
-                self.lora_config.lora_extra_vocab_size,
+                target_embedding_padding=self.vocab_size
+                + self.lora_config.lora_extra_vocab_size,
                 embedding_modules=self.embedding_modules,
                 embedding_padding_modules=self.embedding_padding_modules,
                 tensorizer_config_dict=lora_request.tensorizer_config_dict,
-                weights_mapper=hf_to_vllm_mapper)
+                weights_mapper=hf_to_vllm_mapper,
+            )
 
         except FileNotFoundError as e:
             # FileNotFoundError should be raised if both
@@ -131,26 +137,29 @@ class WorkerLoRAManager:
             # For NotFoundError
             raise ValueError(
                 f"Loading lora {lora_request.lora_name} failed: No adapter "
-                f"found for {lora_request.lora_path}") from e
+                f"found for {lora_request.lora_path}"
+            ) from e
         except Exception as e:
             # For BadRequestError
             raise e
 
         if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
-            raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} "
-                             f"is greater than lora_extra_vocab_size "
-                             f"{self.lora_config.lora_extra_vocab_size}.")
+            raise ValueError(
+                f"LoRA added vocab size {lora.extra_vocab_size} "
+                f"is greater than lora_extra_vocab_size "
+                f"{self.lora_config.lora_extra_vocab_size}."
+            )
         return lora
 
     def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
         if lora_request.lora_int_id in self.list_adapters():
             return False
         if isinstance(self._cached_dummy_lora, LoRAModel):
-            dummy_lora = self._cached_dummy_lora.clone(
-                lora_request.lora_int_id)
+            dummy_lora = self._cached_dummy_lora.clone(lora_request.lora_int_id)
         else:
             dummy_lora = self._adapter_manager.create_dummy_lora(
-                lora_request.lora_int_id, rank, self.embedding_modules)
+                lora_request.lora_int_id, rank, self.embedding_modules
+            )
             if self._cached_dummy_lora is None:
                 self._cached_dummy_lora = dummy_lora
         return self._adapter_manager.add_adapter(dummy_lora)
@@ -158,8 +167,7 @@ class WorkerLoRAManager:
     def pin_adapter(self, adapter_id: int) -> bool:
         return self._adapter_manager.pin_adapter(adapter_id)
 
-    def set_active_adapters(self, requests: set[Any],
-                            mapping: Optional[Any]) -> None:
+    def set_active_adapters(self, requests: set[Any], mapping: Any | None) -> None:
         self._apply_adapters(requests)
         if mapping is not None:
             self._adapter_manager.set_adapter_mapping(mapping)
@@ -168,13 +176,15 @@ class WorkerLoRAManager:
         existing_adapters = self.list_adapters()
         models_map = {
             adapter_request.adapter_id: adapter_request
-            for adapter_request in adapter_requests if adapter_request
+            for adapter_request in adapter_requests
+            if adapter_request
         }
         if len(models_map) > self._adapter_manager.adapter_slots:
             raise RuntimeError(
                 f"Number of requested models ({len(models_map)}) is greater "
                 "than the number of GPU model slots "
-                f"({self._adapter_manager.adapter_slots}).")
+                f"({self._adapter_manager.adapter_slots})."
+            )
         requested_ids = set(models_map)
         for adapter_id in existing_adapters - requested_ids:
             self.remove_adapter(adapter_id)
@@ -227,13 +237,15 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
     def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None:
         loras_map = {
             lora_request.lora_int_id: lora_request
-            for lora_request in lora_requests if lora_request
+            for lora_request in lora_requests
+            if lora_request
         }
         if len(loras_map) > self._adapter_manager.lora_slots:
             raise RuntimeError(
                 f"Number of requested LoRAs ({len(loras_map)}) is greater "
                 "than the number of GPU LoRA slots "
-                f"({self._adapter_manager.lora_slots}).")
+                f"({self._adapter_manager.lora_slots})."
+            )
         for lora in loras_map.values():
             self.add_adapter(lora)
 
@@ -253,15 +265,15 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
             # Loading succeeded, now check if we will exceed cache capacity and
             # evict if the oldest adapter if so
             if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
-                assert isinstance(self._adapter_manager,
-                                  LRUCacheLoRAModelManager)
+                assert isinstance(self._adapter_manager, LRUCacheLoRAModelManager)
                 self._adapter_manager.remove_oldest_adapter()
             # Then add the new adapter to the cache
             loaded = self._adapter_manager.add_adapter(lora)
         else:
             # If the lora is already loaded, just touch it to
             # update its position in the caches
-            loaded = self._adapter_manager.get_adapter(
-                lora_request.lora_int_id) is not None
+            loaded = (
+                self._adapter_manager.get_adapter(lora_request.lora_int_id) is not None
+            )
         self._adapter_manager.activate_adapter(lora_request.lora_int_id)
         return loaded
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
index 3c094cfdb553fbcadc3b900c64dfb76949783538..b50f0cb3a61a2f38ffed1a3b8bfa4e9f2af7bd6d 100644
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           PackedvLLMParameter)
+from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter
 from vllm.model_executor.utils import set_random_seed
 
 __all__ = [
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index e7eb8247d5efd663914a852febe86f9dbdbe8c8f..9ef696d80712c7cd45bf2f8ac93042fc0b8e60e9 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch.nn as nn
 
@@ -32,8 +31,11 @@ class CustomOp(nn.Module):
             op_cls_to_instantiate = cls
         else:
             op_cls_to_instantiate = cls.op_registry_oot[op_name]
-            logger.debug("Instantiating custom op: %s using %s", op_name,
-                         str(op_cls_to_instantiate))
+            logger.debug(
+                "Instantiating custom op: %s using %s",
+                op_name,
+                str(op_cls_to_instantiate),
+            )
         return super().__new__(op_cls_to_instantiate)
 
     def __init__(self):
@@ -86,8 +88,7 @@ class CustomOp(nn.Module):
         if enabled:
             compilation_config.enabled_custom_ops.update([self.__class__.name])
         else:
-            compilation_config.disabled_custom_ops.update(
-                [self.__class__.name])
+            compilation_config.disabled_custom_ops.update([self.__class__.name])
 
         if not enabled:
             return self.forward_native
@@ -112,44 +113,45 @@ class CustomOp(nn.Module):
         custom_ops = compilation_config.custom_ops
         if not hasattr(cls, "name"):
             logger.warning_once(
-                "Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.",  # noqa: E501
+                "Custom op %s was not registered, which means it won't appear "
+                "in the op registry. It will be enabled/disabled based on the "
+                "global settings.",
                 cls.__name__,
             )
             return CustomOp.default_on()
 
         enabled = f"+{cls.name}" in custom_ops
         disabled = f"-{cls.name}" in custom_ops
-        assert not (enabled
-                    and disabled), f"Cannot enable and disable {cls.name}"
+        assert not (enabled and disabled), f"Cannot enable and disable {cls.name}"
 
         return (CustomOp.default_on() or enabled) and not disabled
 
     @staticmethod
     def default_on() -> bool:
         """
-        On by default if PyTorch Inductor is not used.
-        Specifying 'all' or 'none' in custom_op takes precedence.
+        Behavior controlled by `CompilationConfig.custom_ops`: On by default if
+        'all', off by default if 'none'.
+        When PyTorch Inductor is used, 'none' is the default value,
+        otherwise 'all'.
         """
-        from vllm.config import CompilationLevel
         compilation_config = get_cached_compilation_config()
-        default_on = (compilation_config.level < CompilationLevel.PIECEWISE
-                      or not compilation_config.use_inductor)
         count_none = compilation_config.custom_ops.count("none")
         count_all = compilation_config.custom_ops.count("all")
-        return default_on and not count_none > 0 or count_all > 0
+        assert count_none + count_all == 1
+
+        return not count_none > 0 or count_all > 0
 
     # Dictionary of all custom ops (classes, indexed by registered name).
     # To check if an op with a name is enabled, call .enabled() on the class.
     # Examples:
     # - MyOp.enabled()
     # - op_registry["my_op"].enabled()
-    op_registry: dict[str, type['CustomOp']] = {}
-    op_registry_oot: dict[str, type['CustomOp']] = {}
+    op_registry: dict[str, type["CustomOp"]] = {}
+    op_registry_oot: dict[str, type["CustomOp"]] = {}
 
     # Decorator to register custom ops.
     @classmethod
     def register(cls, name: str):
-
         def decorator(op_cls):
             assert name not in cls.op_registry, f"Duplicate op name: {name}"
             op_cls.name = name
@@ -168,12 +170,10 @@ class CustomOp(nn.Module):
     # or
     # - @CustomOP.register_oot(name="UnquantizedFusedMoEMethod")
     @classmethod
-    def register_oot(cls, _decorated_op_cls=None, name: Optional[str] = None):
-
+    def register_oot(cls, _decorated_op_cls=None, name: str | None = None):
         def decorator(op_cls):
             reg_name = name if name is not None else cls.__name__
-            assert reg_name not in cls.op_registry_oot, \
-                f"Duplicate op name: {reg_name}"
+            assert reg_name not in cls.op_registry_oot, f"Duplicate op name: {reg_name}"
             op_cls.name = reg_name
             cls.op_registry_oot[reg_name] = op_cls
             return op_cls
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 235df1a77c5cec11a85d58cd5fd60ed0a44502da..3471ee327cf8cbf8e5f66e3514b10c2f696c2004 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -1,20 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom activation functions."""
+
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import LazyDict
+from vllm.utils.collection_utils import LazyDict
 
 logger = init_logger(__name__)
 
@@ -32,7 +35,7 @@ class FatreluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
-    def __init__(self, threshold: float = 0.):
+    def __init__(self, threshold: float = 0.0):
         super().__init__()
         self.threshold = threshold
         if current_platform.is_cuda_alike():
@@ -49,7 +52,7 @@ class FatreluAndMul(CustomOp):
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         self.op(out, x, self.threshold)
         return out
@@ -72,25 +75,27 @@ class SiluAndMul(CustomOp):
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
+
             self.op = ipex_ops.silu_and_mul
         elif current_platform.is_cpu():
             self._forward_method = self.forward_native
 
-    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+    @staticmethod
+    def forward_native(x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         d = x.shape[-1] // 2
         return F.silu(x[..., :d]) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         self.op(out, x)
         return out
@@ -113,6 +118,7 @@ class MulAndSilu(CustomOp):
             self.op = torch.ops._C.mul_and_silu
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
+
             self.op = ipex_ops.silu_and_mul
         elif current_platform.is_cpu():
             self._forward_method = self.forward_native
@@ -124,7 +130,7 @@ class MulAndSilu(CustomOp):
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         self.op(out, x)
         return out
@@ -156,10 +162,8 @@ class GeluAndMulSparse(CustomOp):
 
         # Sparsity.
         if activation_sparsity == 0.0:
-            raise ValueError(
-                "activation_sparsity is 0.0. Please use GeluAndMul.")
-        target_sparsity_tensor = torch.tensor(activation_sparsity,
-                                              dtype=torch.float32)
+            raise ValueError("activation_sparsity is 0.0. Please use GeluAndMul.")
+        target_sparsity_tensor = torch.tensor(activation_sparsity, dtype=torch.float32)
         normal_dist = torch.distributions.normal.Normal(0, 1)
         self.std_multiplier = normal_dist.icdf(target_sparsity_tensor)
 
@@ -207,6 +211,7 @@ class GeluAndMul(CustomOp):
                 self.op = torch.ops._C.gelu_tanh_and_mul
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
+
             if approximate == "none":
                 self.op = ipex_ops.gelu_and_mul
             else:
@@ -219,20 +224,20 @@ class GeluAndMul(CustomOp):
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         self.op(out, x)
         return out
 
     def extra_repr(self) -> str:
-        return f'approximate={repr(self.approximate)}'
+        return f"approximate={repr(self.approximate)}"
 
 
 @CustomOp.register("swigluoai_and_mul")
@@ -255,7 +260,7 @@ class SwigluOAIAndMul(CustomOp):
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit)
         return out
@@ -266,20 +271,19 @@ class SwigluOAIAndMul(CustomOp):
 
 @CustomOp.register("gelu_new")
 class NewGELU(CustomOp):
-
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
             self.op = torch.ops._C.gelu_new
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
+
             self.op = ipex_ops.gelu_new
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         c = math.sqrt(2.0 / math.pi)
-        return 0.5 * x * (1.0 + torch.tanh(c *
-                                           (x + 0.044715 * torch.pow(x, 3.0))))
+        return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0))))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         out = torch.empty_like(x)
@@ -292,19 +296,18 @@ class NewGELU(CustomOp):
 
 @CustomOp.register("gelu_fast")
 class FastGELU(CustomOp):
-
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
             self.op = torch.ops._C.gelu_fast
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
+
             self.op = ipex_ops.gelu_fast
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
-        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
-                                           (1.0 + 0.044715 * x * x)))
+        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         out = torch.empty_like(x)
@@ -324,6 +327,7 @@ class QuickGELU(CustomOp):
             self.op = torch.ops._C.gelu_quick
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
+
             self.op = ipex_ops.gelu_quick
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -355,7 +359,7 @@ class ReLUSquaredActivation(CustomOp):
         return torch.square(F.relu(x))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        #TODO : implement cuda kernels
+        # TODO : implement cuda kernels
         return self.forward_native(x)
 
 
@@ -378,12 +382,15 @@ class XIELU(CustomOp):
     ):
         super().__init__()
         self.alpha_p = nn.Parameter(
-            torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) -
-                      1).unsqueeze(0))
+            torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(
+                0
+            )
+        )
         self.alpha_n = nn.Parameter(
             torch.log(
-                torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) -
-                1).unsqueeze(0))
+                torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1
+            ).unsqueeze(0)
+        )
         self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
         self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
         self.with_vector_loads = with_vector_loads
@@ -403,8 +410,10 @@ class XIELU(CustomOp):
                 self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                 msg += " Enabled torch._dynamo for xIELU CUDA."
             except Exception as err:
-                msg += (f" Could not enable torch._dynamo for xIELU ({err}) - "
-                        "this may result in slower performance.")
+                msg += (
+                    f" Could not enable torch._dynamo for xIELU ({err}) - "
+                    "this may result in slower performance."
+                )
                 self._xielu_cuda_fn = self._xielu_cuda
             logger.warning_once(msg)
         except Exception as err:
@@ -421,14 +430,12 @@ class XIELU(CustomOp):
         return torch.where(
             x > 0,
             alpha_p * x * x + self.beta * x,
-            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n +
-            self.beta * x,
+            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
         )
 
     def _xielu_cuda(self, x: torch.Tensor) -> torch.Tensor:
         """Firewall function to prevent torch.compile from seeing .item()"""
-        assert self._xielu_cuda_obj is not None, (
-            "XIELU CUDA object must not be None")
+        assert self._xielu_cuda_obj is not None, "XIELU CUDA object must not be None"
         original_shape = x.shape
         # CUDA kernel expects 3D tensors, reshape if needed
         while x.dim() < 3:
@@ -479,21 +486,21 @@ class ScaledActivation(nn.Module):
         act_module: nn.Module,
         intermediate_size: int,
         input_is_parallel: bool = True,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
     ):
         super().__init__()
         self.act = act_module
         self.input_is_parallel = input_is_parallel
         if input_is_parallel:
             tp_size = get_tensor_model_parallel_world_size()
-            intermediate_size_per_partition = divide(intermediate_size,
-                                                     tp_size)
+            intermediate_size_per_partition = divide(intermediate_size, tp_size)
         else:
             intermediate_size_per_partition = intermediate_size
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         self.scales = nn.Parameter(
-            torch.empty(intermediate_size_per_partition, dtype=params_dtype))
+            torch.empty(intermediate_size_per_partition, dtype=params_dtype)
+        )
         set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -510,30 +517,21 @@ class ScaledActivation(nn.Module):
         param_data.copy_(loaded_weight)
 
 
-_ACTIVATION_REGISTRY = LazyDict({
-    "gelu":
-    lambda: nn.GELU(),
-    "gelu_fast":
-    lambda: FastGELU(),
-    "gelu_new":
-    lambda: NewGELU(),
-    "gelu_pytorch_tanh":
-    lambda: nn.GELU(approximate="tanh"),
-    "relu":
-    lambda: nn.ReLU(),
-    "relu2":
-    lambda: ReLUSquaredActivation(),
-    "silu":
-    lambda: nn.SiLU(),
-    "quick_gelu":
-    lambda: QuickGELU(),
-    "tanh":
-    lambda: nn.Tanh(),
-    "sigmoid":
-    lambda: nn.Sigmoid(),
-    "xielu":
-    lambda: XIELU(),
-})
+_ACTIVATION_REGISTRY = LazyDict(
+    {
+        "gelu": lambda: nn.GELU(),
+        "gelu_fast": lambda: FastGELU(),
+        "gelu_new": lambda: NewGELU(),
+        "gelu_pytorch_tanh": lambda: nn.GELU(approximate="tanh"),
+        "relu": lambda: nn.ReLU(),
+        "relu2": lambda: ReLUSquaredActivation(),
+        "silu": lambda: nn.SiLU(),
+        "quick_gelu": lambda: QuickGELU(),
+        "tanh": lambda: nn.Tanh(),
+        "sigmoid": lambda: nn.Sigmoid(),
+        "xielu": lambda: XIELU(),
+    }
+)
 
 
 def get_act_fn(act_fn_name: str) -> nn.Module:
@@ -547,29 +545,25 @@ def get_act_fn(act_fn_name: str) -> nn.Module:
         act_fn_name = activation_name
 
     if act_fn_name not in _ACTIVATION_REGISTRY:
-        raise ValueError(
-            f"Activation function {act_fn_name!r} is not supported.")
+        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")
 
     return _ACTIVATION_REGISTRY[act_fn_name]
 
 
-_ACTIVATION_AND_MUL_REGISTRY = LazyDict({
-    "gelu":
-    lambda: GeluAndMul(),
-    "silu":
-    lambda: SiluAndMul(),
-    "geglu":
-    lambda: GeluAndMul(),
-    "swigluoai":
-    lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs),
-})
+_ACTIVATION_AND_MUL_REGISTRY = LazyDict(
+    {
+        "gelu": lambda: GeluAndMul(),
+        "silu": lambda: SiluAndMul(),
+        "geglu": lambda: GeluAndMul(),
+        "swigluoai": lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs),
+    }
+)
 
 
 def get_act_and_mul_fn(act_fn_name: str) -> nn.Module:
     """Get an activation-and-mul (i.e. SiluAndMul) function by name."""
     act_fn_name = act_fn_name.lower()
     if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY:
-        raise ValueError(
-            f"Activation function {act_fn_name!r} is not supported.")
+        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")
 
     return _ACTIVATION_AND_MUL_REGISTRY[act_fn_name]
diff --git a/vllm/model_executor/layers/attention_layer_base.py b/vllm/model_executor/layers/attention_layer_base.py
index 782818f55fbc2d783f1bfea36c2df3212c6aa38c..ffbef470b18680a8785a87174c183e8b63a353ce 100644
--- a/vllm/model_executor/layers/attention_layer_base.py
+++ b/vllm/model_executor/layers/attention_layer_base.py
@@ -1,19 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Base class for attention-like layers."""
+
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
+from vllm.config import VllmConfig
+from vllm.v1.kv_cache_interface import KVCacheSpec
+
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
 
 class AttentionLayerBase(ABC):
     """
-    Base class for attention-like layers (Attention, Mamba, etc.) 
+    Base class for attention-like layers (Attention, Mamba, etc.)
     that support the v1 engine.
-    
-    This provides a common interface for getting attention backends 
+
+    This provides a common interface for getting attention backends
     from different layer types.
     """
 
@@ -21,3 +25,11 @@ class AttentionLayerBase(ABC):
     def get_attn_backend(self) -> type["AttentionBackend"]:
         """Get the attention backend class for this layer."""
         pass
+
+    @abstractmethod
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None:
+        """
+        Get the KV cache spec for this layer.
+        May be None if the layer does not need KV cache.
+        """
+        pass
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
new file mode 100644
index 0000000000000000000000000000000000000000..7920d117de5e01409b3a23652aae3f2c9195cd35
--- /dev/null
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -0,0 +1,854 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from collections.abc import Callable
+from functools import cache
+from typing import Any
+
+import torch
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+logger = init_logger(__name__)
+
+
+def _matmul_launch_metadata(
+    grid: Callable[..., Any], kernel: Any, args: dict[str, Any]
+) -> dict[str, Any]:
+    ret = {}
+    m, n, k = args["M"], args["N"], args["K"]
+    ret["name"] = f"{kernel.name} [M={m}, N={n}, K={k}]"
+    if "tiles_per_update" in args:
+        ret["name"] = (
+            f"{kernel.name} [M={m}, N={n}, K={k}, "
+            f"tiles_per_update={args['tiles_per_update']:02}]"
+        )
+    if "c_ptr" in args:
+        bytes_per_elem = args["c_ptr"].element_size()
+    else:
+        bytes_per_elem = 1 if args["FP8_OUTPUT"] else 2
+    ret[f"flops{bytes_per_elem * 8}"] = 2.0 * m * n * k
+    ret["bytes"] = bytes_per_elem * (m * k + n * k + m * n)
+    return ret
+
+
+@triton.jit
+def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton.jit(launch_metadata=_matmul_launch_metadata)
+def matmul_kernel_persistent(
+    a_ptr,
+    b_ptr,
+    c_ptr,  #
+    bias_ptr,
+    M,
+    N,
+    K,  #
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    BLOCK_SIZE_M: tl.constexpr,  #
+    BLOCK_SIZE_N: tl.constexpr,  #
+    BLOCK_SIZE_K: tl.constexpr,  #
+    GROUP_SIZE_M: tl.constexpr,  #
+    NUM_SMS: tl.constexpr,  #
+    A_LARGE: tl.constexpr,
+    B_LARGE: tl.constexpr,
+    C_LARGE: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    start_pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
+    num_tiles = num_pid_m * num_pid_n
+
+    tile_id_c = start_pid - NUM_SMS
+
+    offs_k_for_mask = tl.arange(0, BLOCK_SIZE_K)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
+        pid_m, pid_n = _compute_pid(
+            tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
+        )
+        start_m = pid_m * BLOCK_SIZE_M
+        start_n = pid_n * BLOCK_SIZE_N
+        offs_am = start_m + tl.arange(0, BLOCK_SIZE_M)
+        offs_bn = start_n + tl.arange(0, BLOCK_SIZE_N)
+        if A_LARGE:
+            offs_am = offs_am.to(tl.int64)
+        if B_LARGE:
+            offs_bn = offs_bn.to(tl.int64)
+        offs_am = tl.where(offs_am < M, offs_am, 0)
+        offs_bn = tl.where(offs_bn < N, offs_bn, 0)
+        offs_am = tl.max_contiguous(tl.multiple_of(offs_am, BLOCK_SIZE_M), BLOCK_SIZE_M)
+        offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn, BLOCK_SIZE_N), BLOCK_SIZE_N)
+
+        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+        for ki in range(k_tiles):
+            if A_LARGE or B_LARGE:
+                offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K).to(tl.int64)
+            else:
+                offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+            a_ptrs = a_ptr + (
+                offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
+            )
+            b_ptrs = b_ptr + (
+                offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+            )
+
+            a = tl.load(
+                a_ptrs, mask=offs_k_for_mask[None, :] < K - ki * BLOCK_SIZE_K, other=0.0
+            )
+            b = tl.load(
+                b_ptrs, mask=offs_k_for_mask[:, None] < K - ki * BLOCK_SIZE_K, other=0.0
+            )
+            accumulator = tl.dot(a, b, accumulator)
+
+        tile_id_c += NUM_SMS
+        pid_m, pid_n = _compute_pid(
+            tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
+        )
+        offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+        if C_LARGE:
+            offs_cm = offs_cm.to(tl.int64)
+            offs_cn = offs_cn.to(tl.int64)
+        c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+        c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+        if HAS_BIAS:
+            bias_ptrs = bias_ptr + offs_cn
+            bias = tl.load(bias_ptrs, mask=offs_cn < N, other=0.0).to(tl.float32)
+            accumulator += bias
+        c = accumulator.to(c_ptr.dtype.element_ty)
+        tl.store(c_ptrs, c, mask=c_mask)
+
+
+def matmul_persistent(
+    a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
+):
+    # Check constraints.
+    assert a.shape[1] == b.shape[0], "Incompatible dimensions"
+    assert a.dtype == b.dtype, "Incompatible dtypes"
+    assert bias is None or bias.dim() == 1, (
+        "Currently assuming bias is 1D, let Horace know if you run into this"
+    )
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+    M, K = a.shape
+    K, N = b.shape
+    dtype = a.dtype
+    # Allocates output.
+    c = torch.empty((M, N), device=a.device, dtype=dtype)
+
+    # 1D launch kernel where each block gets its own program.
+    def grid(META):
+        return (
+            min(
+                NUM_SMS,
+                triton.cdiv(M, META["BLOCK_SIZE_M"])
+                * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+            ),
+        )
+
+    configs = {
+        torch.bfloat16: {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "num_stages": 3,
+            "num_warps": 8,
+        },
+        torch.float16: {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 256,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "num_stages": 3,
+            "num_warps": 8,
+        },
+        torch.float32: {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "num_stages": 3,
+            "num_warps": 8,
+        },
+    }
+    # print(a.device, b.device, c.device)
+    matmul_kernel_persistent[grid](
+        a,
+        b,
+        c,  #
+        bias,
+        M,
+        N,
+        K,  #
+        a.stride(0),
+        a.stride(1),  #
+        b.stride(0),
+        b.stride(1),  #
+        c.stride(0),
+        c.stride(1),  #
+        NUM_SMS=NUM_SMS,  #
+        A_LARGE=a.numel() > 2**31,
+        B_LARGE=b.numel() > 2**31,
+        C_LARGE=c.numel() > 2**31,
+        HAS_BIAS=bias is not None,
+        **configs[dtype],
+    )
+    return c
+
+
+@triton.jit
+def _log_softmax_kernel(
+    input_ptr,
+    output_ptr,
+    input_row_stride,
+    output_row_stride,
+    n_cols,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Compute log_softmax along the last dimension of a 2D tensor.
+    Each block handles one row of the input tensor.
+    """
+    # Get the row index for this block
+    row_idx = tl.program_id(0).to(tl.int64)
+
+    # Compute base pointers for input and output rows
+    row_start_ptr = input_ptr + row_idx * input_row_stride
+    output_row_start_ptr = output_ptr + row_idx * output_row_stride
+
+    # Step 1: Find maximum value in the row for numerical stability
+    max_val = -float("inf")
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        # Load values
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=-float("inf"))
+
+        # Update maximum
+        max_val = tl.max(tl.maximum(vals, max_val))
+
+    # Step 2: Compute sum of exp(x - max_val)
+    sum_exp = 0.0
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        # Load values
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)
+
+        # Compute exp(x - max_val) and accumulate
+        exp_vals = tl.exp(vals - max_val)
+        sum_exp += tl.sum(tl.where(mask, exp_vals, 0.0))
+
+    # Compute log(sum_exp)
+    log_sum_exp = tl.log(sum_exp)
+
+    # Step 3: Compute final log_softmax values: x - max_val - log_sum_exp
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        # Load values
+        vals = tl.load(row_start_ptr + col_idx, mask=mask)
+
+        # Compute log_softmax
+        output = vals - max_val - log_sum_exp
+
+        # Store results
+        tl.store(output_row_start_ptr + col_idx, output, mask=mask)
+
+
+def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    """
+    Compute log_softmax using Triton kernel.
+
+    Args:
+        input: Input tensor
+        dim: Dimension along which to compute log_softmax
+             (only -1 or last dim supported)
+    >> Stashed changes
+    Returns:
+        Tensor with log_softmax applied along the specified dimension
+    """
+    if dim != -1 and dim != input.ndim - 1:
+        raise ValueError(
+            "This implementation only supports log_softmax along the last dimension"
+        )
+
+    # Flatten all dimensions except the last one
+    original_shape = input.shape
+    input_2d = input.reshape(-1, input.shape[-1])
+    input_2d = input_2d.contiguous()
+
+    n_rows, n_cols = input_2d.shape
+
+    # Allocate output tensor
+    output = torch.empty_like(input_2d)
+
+    # Choose block size based on the number of columns
+    BLOCK_SIZE = 1024
+
+    # Launch kernel with one block per row
+    grid = (n_rows,)
+    _log_softmax_kernel[grid](
+        input_2d,
+        output,
+        input_2d.stride(0),
+        output.stride(0),
+        n_cols,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    # Reshape output back to original shape
+    return output.reshape(original_shape)
+
+
+@triton.jit
+def mean_kernel(
+    input_ptr,
+    output_ptr,
+    input_stride0,
+    input_stride1,
+    input_stride2,
+    output_stride0,
+    output_stride1,
+    M,  # size before reduction dim
+    N,  # size of reduction dim
+    K,  # size after reduction dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Kernel for computing mean along a single dimension.
+    Input is viewed as (M, N, K) where N is the dimension being reduced.
+    """
+    # Program ID gives us which output element we're computing
+    pid = tl.program_id(0)
+
+    # Compute output indices
+    m_idx = pid // K
+    k_idx = pid % K
+
+    # Bounds check
+    if m_idx >= M or k_idx >= K:
+        return
+
+    # Accumulate sum across reduction dimension
+    acc = 0.0
+    for n_start in range(0, N, BLOCK_SIZE):
+        n_offsets = n_start + tl.arange(0, BLOCK_SIZE)
+        mask = n_offsets < N
+
+        # Calculate input indices
+        input_idx = (
+            m_idx * input_stride0 + n_offsets * input_stride1 + k_idx * input_stride2
+        )
+
+        # Load and accumulate
+        vals = tl.load(input_ptr + input_idx, mask=mask, other=0.0)
+        acc += tl.sum(vals)
+
+    # Compute mean and store
+    mean_val = acc / N
+    output_idx = m_idx * output_stride0 + k_idx * output_stride1
+    tl.store(output_ptr + output_idx, mean_val)
+
+
+def mean_dim(
+    input: torch.Tensor,
+    dim: int,
+    keepdim: bool = False,
+    dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    """
+    Triton implementation of torch.mean with single dimension reduction.
+
+    Args:
+        input: Input tensor
+        dim: Single dimension along which to compute mean
+        keepdim: Whether to keep the reduced dimension
+        dtype: Output dtype. If None, uses input dtype
+               (or float32 for integer inputs)
+
+    Returns:
+        Tensor with mean values along specified dimension
+    """
+    # Validate inputs
+    assert -input.ndim <= dim < input.ndim, (
+        f"Invalid dimension {dim} for tensor with {input.ndim} dimensions"
+    )
+
+    # Handle negative dim
+    if dim < 0:
+        dim = dim + input.ndim
+
+    # Handle dtype
+    if dtype is None:
+        if input.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]:
+            dtype = torch.float32
+        else:
+            dtype = input.dtype
+
+    # Convert input to appropriate dtype if needed
+    if input.dtype != dtype:
+        input = input.to(dtype)
+
+    # Get input shape and strides
+    shape = list(input.shape)
+
+    # Calculate dimensions for kernel
+    M = 1
+    for i in range(dim):
+        M *= shape[i]
+
+    N = shape[dim]
+
+    K = 1
+    for i in range(dim + 1, len(shape)):
+        K *= shape[i]
+
+    # Reshape input to 3D view (M, N, K)
+    input_3d = input.reshape(M, N, K)
+
+    # Create output shape
+    if keepdim:
+        output_shape = shape.copy()
+        output_shape[dim] = 1
+    else:
+        output_shape = shape[:dim] + shape[dim + 1 :]
+
+    # Create output tensor
+    output = torch.empty(output_shape, dtype=dtype, device=input.device)
+
+    # Reshape output for kernel
+    output_2d = output.reshape(M, 1, K).squeeze(1) if keepdim else output.reshape(M, K)
+
+    # Launch kernel
+    grid = (M * K,)
+    BLOCK_SIZE = 1024
+
+    mean_kernel[grid](
+        input_3d,
+        output_2d,
+        input_3d.stride(0),
+        input_3d.stride(1),
+        input_3d.stride(2),
+        output_2d.stride(0),
+        output_2d.stride(1) if output_2d.ndim > 1 else 0,
+        M,
+        N,
+        K,
+        BLOCK_SIZE,
+    )
+
+    return output
+
+
+def mm_batch_invariant(a, b):
+    return matmul_persistent(a, b)
+
+
+def matmul_batch_invariant(a, b, *, out=None):
+    # torch.matmul can handle various dimensions
+    # For 2D x 2D, it's the same as mm
+    if a.ndim == 2 and b.ndim == 2:
+        result = matmul_persistent(a, b)
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+    elif a.ndim == 3 and b.ndim == 3:
+        # Handle batched case like bmm
+        return bmm_batch_invariant(a, b, out=out)
+    elif a.ndim == 3 and b.ndim == 2:
+        # Handle 3D x 2D: common for linear layers
+        # (batch, seq, hidden) @ (hidden, out) -> (batch, seq, out)
+        # Reshape to 2D, do mm, reshape back
+        batch, seq, hidden = a.shape
+        a_2d = a.reshape(-1, hidden)
+        result_2d = matmul_persistent(a_2d, b)
+        result = result_2d.reshape(batch, seq, -1)
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+    elif a.ndim == 2 and b.ndim == 3:
+        # Handle 2D x 3D: (M, K) @ (B, K, N) -> (B, M, N)
+        # By broadcasting `a` to 3D, we can reuse the batched matrix
+        # multiplication logic.
+        a_expanded = a.unsqueeze(0).expand(b.shape[0], -1, -1)
+        return bmm_batch_invariant(a_expanded, b, out=out)
+    elif a.ndim == 4 and b.ndim == 4:
+        # Handle 4D attention tensors: [batch, heads, seq, dim]
+        # Reshape to 3D, process, reshape back
+        batch, heads, seq_a, dim_a = a.shape
+        _, _, dim_b, seq_b = b.shape
+
+        # Reshape to [batch*heads, seq_a, dim_a]
+        a_3d = a.reshape(batch * heads, seq_a, dim_a)
+        b_3d = b.reshape(batch * heads, dim_b, seq_b)
+
+        # Do batched matmul
+        result_3d = bmm_batch_invariant(a_3d, b_3d)
+
+        # Reshape back to [batch, heads, seq_a, seq_b]
+        result = result_3d.reshape(batch, heads, seq_a, seq_b)
+
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+    else:
+        raise ValueError(
+            f"matmul_batch_invariant currently only supports 2D x 2D, 3D x 3D, "
+            f"3D x 2D, 2D x 3D, and 4D x 4D, "
+            f"got shapes {a.shape} and {b.shape}"
+        )
+
+
+def bmm_batch_invariant(a, b, *, out=None):
+    # Batched matrix multiply: (B, M, K) x (B, K, N) -> (B, M, N)
+    # Process each batch separately with our persistent kernel
+    if a.ndim == 3 and b.ndim == 3:
+        results = []
+        for i in range(a.shape[0]):
+            results.append(matmul_persistent(a[i], b[i]))
+        result = torch.stack(results, dim=0)
+
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+    else:
+        raise ValueError(
+            f"bmm_batch_invariant expects 3D tensors, "
+            f"got shapes {a.shape} and {b.shape}"
+        )
+
+
+def addmm_batch_invariant(bias, a, b):
+    return matmul_persistent(a, b, bias=bias)
+
+
+def _log_softmax_batch_invariant(input, dim, _half_to_float):
+    assert not _half_to_float, "not implemented"
+    return log_softmax(input, dim=dim)
+
+
+def softmax_batch_invariant(input, dim, dtype=None):
+    # Compute softmax in a deterministic way
+    # First subtract max for numerical stability (standard practice)
+    input_max = torch.amax(input, dim=dim, keepdim=True)
+    input = input - input_max
+    exp_x = torch.exp(input)
+    sum_exp_x = torch.sum(exp_x, dim=dim, keepdim=True)
+    return exp_x / sum_exp_x
+
+
+def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype | None = None):
+    assert dtype is None or dtype == torch.float32, f"unsupported dtype: {dtype}"
+
+    result = input.to(torch.float32)
+
+    if len(dim) == 0:
+        dim = [i for i in range(len(input.shape))]
+
+    # Sort dimensions to reduce from largest to smallest to handle shifting dims
+    # during iterative reduction.
+    sorted_dims = sorted([d % input.ndim for d in dim], reverse=True)
+
+    # Iteratively apply a deterministic mean.
+    for d in sorted_dims:
+        result = mean_dim(result, dim=d, keepdim=True)
+
+    if not keepdim:
+        # Squeeze the reduced dimensions.
+        for d in sorted_dims:
+            result = result.squeeze(d)
+
+    return result
+
+
+@triton.jit
+def _rms_norm_kernel(
+    input_ptr,
+    weight_ptr,
+    output_ptr,
+    input_row_stride,
+    output_row_stride,
+    n_cols,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Compute RMS normalization along the last dimension of a 2D tensor.
+    RMS Norm: y = x / sqrt(mean(x^2) + eps) * weight
+    Each block handles one row of the input tensor.
+    """
+    row_idx = tl.program_id(0).to(tl.int64)
+    row_start_ptr = input_ptr + row_idx * input_row_stride
+    output_row_start_ptr = output_ptr + row_idx * output_row_stride
+
+    # Step 1: Compute sum of squares in float32 to avoid overflow
+    sum_sq = tl.zeros([1], dtype=tl.float32)
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)
+        # Convert to float32 for accumulation to prevent overflow
+        vals_f32 = vals.to(tl.float32)
+        sq_vals = vals_f32 * vals_f32
+        sum_sq += tl.sum(tl.where(mask, sq_vals, 0.0))
+
+    # Step 2: Compute RMS (root mean square) in float32
+    mean_sq = sum_sq / n_cols
+    rms = tl.sqrt(mean_sq + eps)
+    inv_rms = 1.0 / rms
+
+    # Step 3: Normalize and apply weight
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)
+        weight = tl.load(weight_ptr + col_idx, mask=mask, other=1.0)
+        # Compute in float32 then convert back to input dtype
+        vals_f32 = vals.to(tl.float32)
+        weight_f32 = weight.to(tl.float32)
+        output_f32 = vals_f32 * inv_rms * weight_f32
+        output = output_f32.to(vals.dtype)
+        tl.store(output_row_start_ptr + col_idx, output, mask=mask)
+
+
+def rms_norm(
+    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """
+    Compute RMS normalization using Triton kernel.
+
+    RMS Norm normalizes the input by the root mean square and scales by weight:
+    output = input / sqrt(mean(input^2) + eps) * weight
+
+    Args:
+        input: Input tensor of shape (..., hidden_size)
+        weight: Weight tensor of shape (hidden_size,)
+        eps: Small constant for numerical stability
+
+    Returns:
+        Tensor with RMS normalization applied along the last dimension
+    """
+    assert weight.dim() == 1, "Weight must be 1-dimensional"
+    assert input.shape[-1] == weight.shape[0], (
+        f"Input last dimension ({input.shape[-1]}) must match "
+        f"weight dimension ({weight.shape[0]})"
+    )
+
+    # Flatten all dimensions except the last one
+    original_shape = input.shape
+    input_2d = input.reshape(-1, input.shape[-1])
+    input_2d = input_2d.contiguous()
+    weight = weight.contiguous()
+
+    n_rows, n_cols = input_2d.shape
+
+    output = torch.empty_like(input_2d)
+    BLOCK_SIZE = 1024
+    grid = (n_rows,)
+    _rms_norm_kernel[grid](
+        input_2d,
+        weight,
+        output,
+        input_2d.stride(0),
+        output.stride(0),
+        n_cols,
+        eps,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return output.reshape(original_shape)
+
+
+def rms_norm_batch_invariant(
+    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """
+    Batch-invariant wrapper for RMS normalization.
+
+    This function provides a deterministic, batch-invariant implementation
+    of RMS normalization for use with the batch_invariant mode.
+
+    Args:
+        input: Input tensor of shape (..., hidden_size)
+        weight: Weight tensor of shape (hidden_size,)
+        eps: Small constant for numerical stability
+
+    Returns:
+        RMS normalized tensor
+    """
+    return rms_norm(input, weight, eps=eps)
+
+
+def linear_batch_invariant(input, weight, bias=None):
+    output = matmul_batch_invariant(input, weight.t())
+
+    if bias is not None:
+        output = output + bias
+    return output
+
+
+_batch_invariant_MODE = False
+_batch_invariant_LIB = None
+_original_torch_bmm = None
+_original_fp16_reduction_precision = None
+_original_bf16_reduction_precision = None
+_original_cublas_workspace_cfg = None
+_original_cublaslt_workspace_size = None
+
+
+def enable_batch_invariant_mode():
+    global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
+    global _original_fp16_reduction_precision, _original_bf16_reduction_precision
+    global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size
+    if _batch_invariant_MODE:
+        return
+
+    _batch_invariant_MODE = True
+    _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
+
+    # Batch invariant matmuls are no longer needed after cublas overrides
+    if not is_torch_equal_or_newer("2.10.0.dev"):
+        if current_platform.is_device_capability(100):
+            # For PyTorch 2.9, B200 uses GEMV for bs=1
+            # Requires https://github.com/pytorch/pytorch/pull/166735
+            _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
+            _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
+            _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
+            _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
+        else:
+            # Only source of batch invariance for Hopper is split-k, can disable through
+            # cuBLAS workspace config
+            _original_cublas_workspace_cfg = os.environ.get(
+                "CUBLAS_WORKSPACE_CONFIG", None
+            )
+            _original_cublaslt_workspace_size = os.environ.get(
+                "CUBLASLT_WORKSPACE_SIZE", None
+            )
+            os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+            os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
+
+    _batch_invariant_LIB.impl(
+        "aten::_log_softmax", _log_softmax_batch_invariant, "CUDA"
+    )
+    _batch_invariant_LIB.impl("aten::softmax", softmax_batch_invariant, "CUDA")
+    _batch_invariant_LIB.impl("aten::_softmax", softmax_batch_invariant, "CUDA")
+    _batch_invariant_LIB.impl("aten::mean.dim", mean_batch_invariant, "CUDA")
+
+    # Also monkeypatch torch.bmm directly as a fallback
+    _batch_invariant_LIB.impl("aten::bmm", bmm_batch_invariant, "CUDA")
+    _original_torch_bmm = torch.bmm
+    torch.bmm = bmm_batch_invariant
+
+    _original_bf16_reduction_precision = (
+        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
+    )
+    _original_fp16_reduction_precision = (
+        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction
+    )
+
+    reduced_precision_val = (
+        (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
+    )
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
+        reduced_precision_val
+    )
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = (
+        reduced_precision_val
+    )
+    torch.backends.cuda.preferred_blas_library(backend="cublaslt")
+
+
+@cache
+def vllm_is_batch_invariant():
+    env_key = "VLLM_BATCH_INVARIANT"
+    is_overridden = False
+    val = os.getenv(env_key, "0")
+    try:
+        is_overridden = int(val) != 0
+    except ValueError:
+        is_overridden = False
+    return is_overridden
+
+
+def override_envs_for_invariance():
+    curr_attn_backend = envs.VLLM_ATTENTION_BACKEND
+    supported_backends = [
+        "FLASH_ATTN",  # best supported backend
+        "FLASHINFER",
+        "FLASH_ATTN_MLA",
+        "FLASHINFER_MLA",
+        "TRITON_MLA",
+        # Not yet supported MLA backends
+        # "FLASHMLA",
+        # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance
+    ]
+    if curr_attn_backend not in supported_backends:
+        warning = (
+            "Forcibly updating attention backend to"
+            f" {supported_backends[0]} for batch_invariant. "
+            f" Supported backends: {supported_backends}."
+        )
+        logger.warning_once(warning)
+        os.environ["VLLM_ATTENTION_BACKEND"] = supported_backends[0]
+    if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]:
+        warning = (
+            "You are using a decode-invariant form of batch invariance. "
+            "This will not be invariant between prefill and decode."
+        )
+        logger.warning_once(warning)
+    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+
+    # NCCL determinism settings
+    os.environ["NCCL_LAUNCH_MODE"] = "GROUP"
+    os.environ["NCCL_COLLNET_ENABLE"] = "0"
+    os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["NCCL_P2P_NET_DISABLE"] = "1"
+    os.environ["NCCL_MIN_NCHANNELS"] = "1"
+    os.environ["NCCL_MAX_NCHANNELS"] = "1"
+    os.environ["NCCL_PROTO"] = "Simple"
+    os.environ["NCCL_ALGO"] = "allreduce:tree"
+    os.environ["NCCL_NTHREADS"] = "1"
+    os.environ["NCCL_SOCKET_NTHREADS"] = "1"
+
+    # torch.compile settings
+    os.environ["VLLM_USE_AOT_COMPILE"] = "0"
+
+
+def init_batch_invariance():
+    # this will hit all the csrc overrides as well
+    if vllm_is_batch_invariant():
+        override_envs_for_invariance()
+        enable_batch_invariant_mode()
+
+        # Disable TF32 for batch invariance - it causes non-deterministic rounding
+        torch.backends.cuda.matmul.allow_tf32 = False
+        torch.backends.cudnn.allow_tf32 = False
diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6f2d2990c241a00d3181fdb186a333b04d32dad
--- /dev/null
+++ b/vllm/model_executor/layers/conv.py
@@ -0,0 +1,236 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Conv Layer Class."""
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from vllm.model_executor.custom_op import CustomOp
+from vllm.utils.torch_utils import is_torch_equal
+
+
+class ConvLayerBase(CustomOp):
+    """Conv layer base class."""
+
+    num_dim: int
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int | tuple[int, ...],
+        stride: int | tuple[int, ...] = 1,
+        padding: int | tuple[int, ...] = 0,
+        dilation: int | tuple[int, ...] = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        *,
+        params_dtype: torch.dtype | None = None,
+    ) -> None:
+        super().__init__()
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        kernel_size = (
+            (kernel_size,) * self.num_dim
+            if isinstance(kernel_size, int)
+            else kernel_size
+        )
+        stride = (stride,) * self.num_dim if isinstance(stride, int) else stride
+        padding = (padding,) * self.num_dim if isinstance(padding, int) else padding
+        dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.padding_mode = padding_mode
+
+        self.enable_linear = (
+            (self.kernel_size == self.stride)
+            and not any(self.padding)
+            and self.groups == 1
+        )
+        self.input_size = in_channels * math.prod(self.kernel_size)
+
+        self.weight = nn.Parameter(
+            torch.empty(
+                out_channels,
+                in_channels // groups,
+                *kernel_size,
+                dtype=params_dtype,
+            ),
+        )
+
+        if bias:
+            self.bias = nn.Parameter(torch.empty(self.out_channels, dtype=params_dtype))
+        else:
+            self.register_parameter("bias", None)
+
+    def extra_repr(self) -> str:
+        s = f"in_channels={self.in_channels}, "
+        s += f"out_channels={self.out_channels}, "
+        s += f"kernel_size={self.kernel_size}, "
+        s += f"stride={self.stride}, "
+        s += f"padding={self.padding}, "
+        s += f"bias={self.bias is not None}"
+        return s
+
+
+@CustomOp.register("conv2d")
+class Conv2dLayer(ConvLayerBase):
+    """Conv layer with Conv2d."""
+
+    num_dim = 2
+
+    def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.dim() == 4
+        B, C, H, W = x.shape
+        K1, K2 = self.kernel_size
+        H, W = H // K1, W // K2
+        x = x.unfold(2, K1, K1).unfold(3, K2, K2)
+        x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size)
+        x = F.linear(
+            x,
+            self.weight.view(self.out_channels, self.input_size),
+            self.bias,
+        )
+        x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2)
+        return x
+
+    def _forward_conv(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.dim() == 4
+        x = F.conv2d(
+            x,
+            self.weight,
+            self.bias,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            groups=self.groups,
+        )
+        return x
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        """Expected input shape: (batch_size, in_channels, height, width)"""
+        assert x.dim() == 4
+        if self.enable_linear:
+            return self._forward_mulmat(x)
+        else:
+            return self._forward_conv(x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        # By default, we use CUDNN's convolution ops with optimization.
+        return self._forward_conv(x)
+
+
+class CausalConv2dLayer(Conv2dLayer):
+    """
+    A causal version of nn.Conv2d where each location in the 2D matrix would
+    have no access to locations on its right or down
+    All arguments are the same as nn.Conv2d except padding which should be
+    set as None
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        *,
+        params_dtype: torch.dtype | None = None,
+    ) -> None:
+        if padding is not None:
+            raise ValueError(
+                "Argument padding should be set to None for CausalConv2dLayer."
+            )
+        self._left_padding: int = kernel_size - 1
+        self._right_padding: int = stride - 1
+        padding = 0
+
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            params_dtype=params_dtype,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        x = F.pad(x, pad=(self._left_padding, self._right_padding, 0, 0))
+        x = super().forward(x)
+        return x
+
+
+@CustomOp.register("conv3d")
+class Conv3dLayer(ConvLayerBase):
+    """Conv layer with Conv3d."""
+
+    num_dim = 3
+
+    def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.dim() == 5
+        B, C, T, H, W = x.shape
+        K1, K2, K3 = self.kernel_size
+        T, H, W = T // K1, H // K2, W // K3
+        x = x.unfold(2, K1, K1).unfold(3, K2, K2).unfold(4, K3, K3)
+        x = x.permute(0, 2, 3, 4, 1, 5, 6, 7).reshape(-1, self.input_size)
+        x = F.linear(
+            x,
+            self.weight.view(self.out_channels, self.input_size),
+            self.bias,
+        )
+        x = x.view(B, T, H, W, self.out_channels).permute(0, 4, 1, 2, 3)
+        return x
+
+    def _forward_conv(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.dim() == 5
+        x = F.conv3d(
+            x,
+            self.weight,
+            self.bias,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            groups=self.groups,
+        )
+        return x
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        """Expected input shape: (batch_size, in_channels, time, height, width)"""
+        if self.enable_linear:
+            return self._forward_mulmat(x)
+        else:
+            return self._forward_conv(x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        # PyTorch2.9.0 disabled CUDNN's Conv3D, which caused a
+        # significant performance regression.
+        # See: https://github.com/vllm-project/vllm/issues/27406
+        # and https://github.com/pytorch/pytorch/issues/166122
+        # By default, we use CUDNN's convolution ops with optimization.
+        if self.enable_linear and is_torch_equal("2.9.0"):
+            return self._forward_mulmat(x)
+        return self._forward_conv(x)
diff --git a/vllm/model_executor/layers/fla/ops/chunk.py b/vllm/model_executor/layers/fla/ops/chunk.py
index e7d295aff2392911e67418f1330465bc70d83610..4c8bf9f4399727aea7d24cb3ed00f7a95308abde 100644
--- a/vllm/model_executor/layers/fla/ops/chunk.py
+++ b/vllm/model_executor/layers/fla/ops/chunk.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
 import warnings
-from typing import Optional
 
 import torch
 from einops import rearrange
@@ -23,22 +22,22 @@ from .utils import SUPPRESS_LEVEL, input_guard
 from .wy_fast import recompute_w_u_fwd
 
 
-def chunk_gated_delta_rule_fwd(q: torch.Tensor,
-                               k: torch.Tensor,
-                               v: torch.Tensor,
-                               g: torch.Tensor,
-                               beta: torch.Tensor,
-                               scale: float,
-                               initial_state: torch.Tensor,
-                               output_final_state: bool,
-                               cu_seqlens: Optional[torch.LongTensor] = None):
+def chunk_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    cu_seqlens: torch.LongTensor | None = None,
+):
     g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
     # obtain WY representation. u is actually the new v.
-    A = chunk_scaled_dot_kkt_fwd(k=k,
-                                 beta=beta,
-                                 g_cumsum=g,
-                                 cu_seqlens=cu_seqlens,
-                                 output_dtype=torch.float32)
+    A = chunk_scaled_dot_kkt_fwd(
+        k=k, beta=beta, g=g, cu_seqlens=cu_seqlens, output_dtype=torch.float32
+    )
     A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
     w, u = recompute_w_u_fwd(
         k=k,
@@ -73,21 +72,22 @@ def chunk_gated_delta_rule_fwd(q: torch.Tensor,
 
 
 class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
-
     @staticmethod
     @input_guard
-    @torch.amp.custom_fwd(device_type='cuda')
-    def forward(ctx,
-                q: torch.Tensor,
-                k: torch.Tensor,
-                v: torch.Tensor,
-                g: torch.Tensor,
-                beta: torch.Tensor,
-                scale: float,
-                initial_state: torch.Tensor,
-                output_final_state: bool,
-                cu_seqlens: Optional[torch.LongTensor] = None,
-                use_qk_l2norm_in_kernel: bool = False):
+    @torch.amp.custom_fwd(device_type="cuda")
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        output_final_state: bool,
+        cu_seqlens: torch.LongTensor | None = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
         if use_qk_l2norm_in_kernel:
             q = l2norm_fwd(q)
             k = l2norm_fwd(k)
@@ -109,17 +109,19 @@ class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
 
 
 @torch.compiler.disable
-def chunk_gated_delta_rule(q: torch.Tensor,
-                           k: torch.Tensor,
-                           v: torch.Tensor,
-                           g: torch.Tensor,
-                           beta: torch.Tensor,
-                           scale: float = None,
-                           initial_state: torch.Tensor = None,
-                           output_final_state: bool = False,
-                           cu_seqlens: Optional[torch.LongTensor] = None,
-                           head_first: bool = False,
-                           use_qk_l2norm_in_kernel: bool = False):
+def chunk_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    cu_seqlens: torch.LongTensor | None = None,
+    head_first: bool = False,
+    use_qk_l2norm_in_kernel: bool = False,
+):
     r"""
     Args:
         q (torch.Tensor):
@@ -184,42 +186,55 @@ def chunk_gated_delta_rule(q: torch.Tensor,
         )
     """
     assert q.dtype == k.dtype == v.dtype
-    assert q.dtype != torch.float32, "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
-    assert len(
-        beta.shape
-    ) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+    assert q.dtype != torch.float32, (
+        "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+    )
+    assert len(beta.shape) == 3, (
+        "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+    )
 
     if head_first:
         raise DeprecationWarning(
             "head_first is deprecated and will be removed in a future version. "
             "Please use head_first=False for now instead.",
-            stacklevel=2)
+            stacklevel=2,
+        )
         q, k, v, beta, g = map(
-            lambda x: rearrange(x, 'b h t ... -> b t h ...'),
-            (q, k, v, beta, g))
+            lambda x: rearrange(x, "b h t ... -> b t h ..."), (q, k, v, beta, g)
+        )
     if not head_first and q.shape[1] < q.shape[2]:
         warnings.warn(
             f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
             "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
             "when head_first=False was specified. "
             "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
-            stacklevel=2)
+            stacklevel=2,
+        )
     if cu_seqlens is not None:
         if q.shape[0] != 1:
             raise ValueError(
                 f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
-                f"Please flatten variable-length inputs before processing.")
-        if initial_state is not None and initial_state.shape[0] != len(
-                cu_seqlens) - 1:
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
             raise ValueError(
                 f"The number of initial states is expected to be equal to the number of input sequences, "
                 f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
             )
     if scale is None:
-        scale = k.shape[-1]**-0.5
+        scale = k.shape[-1] ** -0.5
     o, final_state = ChunkGatedDeltaRuleFunction.apply(
-        q, k, v, g, beta, scale, initial_state, output_final_state, cu_seqlens,
-        use_qk_l2norm_in_kernel)
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        output_final_state,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+    )
     if head_first:
-        o = rearrange(o, 'b t h ... -> b h t ...')
+        o = rearrange(o, "b t h ... -> b h t ...")
     return o, final_state
diff --git a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
index 34006f87f457bb7f4ca12279f4659e687b448380..f0b78b65c4a3287dc83dd6e9cdde8d1f77273cbb 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -15,33 +14,39 @@ from vllm.triton_utils import tl, triton
 
 from .index import prepare_chunk_indices, prepare_chunk_offsets
 from .op import exp
-from .utils import is_nvidia_hopper, use_cuda_graph
+from .utils import use_cuda_graph
 
-NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+NUM_WARPS = [2, 4, 8, 16]
 
 
-@triton.heuristics({
-    'USE_G': lambda args: args['g'] is not None,
-    'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
-    'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
-    'SAVE_NEW_VALUE': lambda args: args['v_new'] is not None,
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
-})
+@triton.heuristics(
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "USE_GK": lambda args: args["gk"] is not None,
+        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+        "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+        "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
 @triton.autotune(
     configs=[
-        triton.Config({'BV': BV}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [2, 4] for num_stages in [2, 3, 4] for BV in [32, 64]
+        triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4]
+        for num_stages in [2, 3, 4]
+        for BV in [32, 64]
     ],
-    key=['H', 'K', 'V', 'BT', 'USE_G'],
+    key=["H", "K", "V", "BT"],
     use_cuda_graph=use_cuda_graph,
 )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=["T"])
 def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
     k,
     v,
     w,
     v_new,
     g,
+    gk,
     h,
     h0,
     ht,
@@ -55,6 +60,7 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
     BT: tl.constexpr,
     BV: tl.constexpr,
     USE_G: tl.constexpr,
+    USE_GK: tl.constexpr,
     USE_INITIAL_STATE: tl.constexpr,
     STORE_FINAL_STATE: tl.constexpr,
     SAVE_NEW_VALUE: tl.constexpr,
@@ -63,8 +69,10 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
     i_v, i_nh = tl.program_id(0), tl.program_id(1)
     i_n, i_h = i_nh // H, i_nh % H
     if IS_VARLEN:
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
         NT = tl.cdiv(T, BT)
         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
@@ -83,12 +91,12 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
         b_h4 = tl.zeros([64, BV], dtype=tl.float32)
 
     # calculate offset
-    h += (boh * H + i_h) * K * V
-    v += (bos * H + i_h) * V
-    k += (bos * Hg + i_h // (H // Hg)) * K
-    w += (bos * H + i_h) * K
+    h += ((boh * H + i_h) * K * V).to(tl.int64)
+    v += ((bos * H + i_h) * V).to(tl.int64)
+    k += ((bos * Hg + i_h // (H // Hg)) * K).to(tl.int64)
+    w += ((bos * H + i_h) * K).to(tl.int64)
     if SAVE_NEW_VALUE:
-        v_new += (bos * H + i_h) * V
+        v_new += ((bos * H + i_h) * V).to(tl.int64)
     stride_v = H * V
     stride_h = H * K * V
     stride_k = Hg * K
@@ -100,175 +108,219 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
 
     # load initial state
     if USE_INITIAL_STATE:
-        p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV),
-                                   (1, 0))
+        p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
         b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
         if K > 64:
-            p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV),
-                                       (64, BV), (1, 0))
+            p_h0_2 = tl.make_block_ptr(
+                h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+            )
             b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
         if K > 128:
-            p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV),
-                                       (64, BV), (1, 0))
+            p_h0_3 = tl.make_block_ptr(
+                h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+            )
             b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
         if K > 192:
-            p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV),
-                                       (64, BV), (1, 0))
+            p_h0_4 = tl.make_block_ptr(
+                h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+            )
             b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
 
     # main recurrence
     for i_t in range(NT):
-        p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
-                                 (0, i_v * BV), (64, BV), (1, 0))
+        p_h1 = tl.make_block_ptr(
+            h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)
+        )
         tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
         if K > 64:
-            p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
-                                     (64, i_v * BV), (64, BV), (1, 0))
-            tl.store(p_h2,
-                     b_h2.to(p_h2.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_h2 = tl.make_block_ptr(
+                h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
         if K > 128:
-            p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
-                                     (128, i_v * BV), (64, BV), (1, 0))
-            tl.store(p_h3,
-                     b_h3.to(p_h3.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_h3 = tl.make_block_ptr(
+                h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
         if K > 192:
-            p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
-                                     (192, i_v * BV), (64, BV), (1, 0))
-            tl.store(p_h4,
-                     b_h4.to(p_h4.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_h4 = tl.make_block_ptr(
+                h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
 
-        p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV),
-                                (BT, BV), (1, 0))
-        p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1),
-                                    (i_t * BT, i_v * BV), (BT, BV),
-                                    (1, 0)) if SAVE_NEW_VALUE else None
-        b_v_new = tl.zeros([BT, BV], dtype=tl.float32)
-        p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 0),
-                                (BT, 64), (1, 0))
+        p_w = tl.make_block_ptr(
+            w, (T, K), (stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0)
+        )
         b_w = tl.load(p_w, boundary_check=(0, 1))
-        b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype))
+        b_v = tl.dot(b_w, b_h1.to(b_w.dtype))
         if K > 64:
-            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 64),
-                                    (BT, 64), (1, 0))
+            p_w = tl.make_block_ptr(
+                w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0)
+            )
             b_w = tl.load(p_w, boundary_check=(0, 1))
-            b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype))
+            b_v += tl.dot(b_w, b_h2.to(b_w.dtype))
         if K > 128:
-            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 128),
-                                    (BT, 64), (1, 0))
+            p_w = tl.make_block_ptr(
+                w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0)
+            )
             b_w = tl.load(p_w, boundary_check=(0, 1))
-            b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype))
+            b_v += tl.dot(b_w, b_h3.to(b_w.dtype))
         if K > 192:
-            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 192),
-                                    (BT, 64), (1, 0))
+            p_w = tl.make_block_ptr(
+                w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0)
+            )
             b_w = tl.load(p_w, boundary_check=(0, 1))
-            b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype))
-        b_v_new = -b_v_new + tl.load(p_v, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h4.to(b_w.dtype))
+        p_v = tl.make_block_ptr(
+            v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1)) - b_v
 
         if SAVE_NEW_VALUE:
-            p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1),
-                                        (i_t * BT, i_v * BV), (BT, BV), (1, 0))
-            tl.store(p_v_new,
-                     b_v_new.to(p_v_new.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_v = tl.make_block_ptr(
+                v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+            )
+            tl.store(p_v, b_v.to(p_v.dtype.element_ty), boundary_check=(0, 1))
 
+        last_idx = min((i_t + 1) * BT, T) - 1
         if USE_G:
             m_t = (i_t * BT + tl.arange(0, BT)) < T
-            last_idx = min((i_t + 1) * BT, T) - 1
             b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
-            p_g = tl.make_block_ptr(g + bos * H + i_h, (T, ), (H, ),
-                                    (i_t * BT, ), (BT, ), (0, ))
-            b_g = tl.load(p_g, boundary_check=(0, ))
-            b_v_new = b_v_new * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None]
+            p_g = tl.make_block_ptr(
+                g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+            )
+            b_g = tl.load(p_g, boundary_check=(0,))
+            b_v = b_v * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None]
             b_g_last = exp(b_g_last)
-            b_h1 = b_h1 * b_g_last
+            b_h1 *= b_g_last
             if K > 64:
-                b_h2 = b_h2 * b_g_last
+                b_h2 *= b_g_last
             if K > 128:
-                b_h3 = b_h3 * b_g_last
+                b_h3 *= b_g_last
             if K > 192:
-                b_h4 = b_h4 * b_g_last
-        b_v_new = b_v_new.to(k.dtype.element_ty)
-        p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT),
-                                (64, BT), (0, 1))
+                b_h4 *= b_g_last
+
+        if USE_GK:
+            o_k1 = tl.arange(0, 64)
+            b_gk_last1 = tl.load(
+                gk + (bos + last_idx) * H * K + i_h * K + o_k1,
+                mask=(o_k1 < K),
+                other=0.0,
+            )
+            b_h1 *= exp(b_gk_last1)[:, None]
+            if K > 64:
+                o_k2 = 64 + o_k1
+                b_gk_last2 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k2,
+                    mask=(o_k2 < K),
+                    other=0.0,
+                )
+                b_h2 *= exp(b_gk_last2)[:, None]
+            if K > 128:
+                o_k3 = 128 + o_k1
+                b_gk_last3 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k3,
+                    mask=(o_k3 < K),
+                    other=0.0,
+                )
+                b_h3 *= exp(b_gk_last3)[:, None]
+            if K > 192:
+                o_k4 = 192 + o_k1
+                b_gk_last4 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k4,
+                    mask=(o_k4 < K),
+                    other=0.0,
+                )
+                b_h4 *= exp(b_gk_last4)[:, None]
+        b_v = b_v.to(k.dtype.element_ty)
+
+        p_k = tl.make_block_ptr(
+            k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)
+        )
         b_k = tl.load(p_k, boundary_check=(0, 1))
-        b_h1 += tl.dot(b_k, b_v_new)
+        b_h1 += tl.dot(b_k, b_v)
         if K > 64:
-            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT),
-                                    (64, BT), (0, 1))
+            p_k = tl.make_block_ptr(
+                k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)
+            )
             b_k = tl.load(p_k, boundary_check=(0, 1))
-            b_h2 += tl.dot(b_k, b_v_new)
+            b_h2 += tl.dot(b_k, b_v)
         if K > 128:
-            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT),
-                                    (64, BT), (0, 1))
+            p_k = tl.make_block_ptr(
+                k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)
+            )
             b_k = tl.load(p_k, boundary_check=(0, 1))
-            b_h3 += tl.dot(b_k, b_v_new)
+            b_h3 += tl.dot(b_k, b_v)
         if K > 192:
-            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT),
-                                    (64, BT), (0, 1))
+            p_k = tl.make_block_ptr(
+                k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)
+            )
             b_k = tl.load(p_k, boundary_check=(0, 1))
-            b_h4 += tl.dot(b_k, b_v_new)
-
+            b_h4 += tl.dot(b_k, b_v)
     # epilogue
     if STORE_FINAL_STATE:
-        p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV),
-                                 (1, 0))
+        p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
         tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
         if K > 64:
-            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV),
-                                     (64, BV), (1, 0))
-            tl.store(p_ht,
-                     b_h2.to(p_ht.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_ht = tl.make_block_ptr(
+                ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
         if K > 128:
-            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV),
-                                     (64, BV), (1, 0))
-            tl.store(p_ht,
-                     b_h3.to(p_ht.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_ht = tl.make_block_ptr(
+                ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
         if K > 192:
-            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV),
-                                     (64, BV), (1, 0))
-            tl.store(p_ht,
-                     b_h4.to(p_ht.dtype.element_ty),
-                     boundary_check=(0, 1))
+            p_ht = tl.make_block_ptr(
+                ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
 
 
 def chunk_gated_delta_rule_fwd_h(
     k: torch.Tensor,
     w: torch.Tensor,
     u: torch.Tensor,
-    g: Optional[torch.Tensor] = None,
-    initial_state: Optional[torch.Tensor] = None,
+    g: torch.Tensor | None = None,
+    gk: torch.Tensor | None = None,
+    initial_state: torch.Tensor | None = None,
     output_final_state: bool = False,
     chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
     save_new_value: bool = True,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
+    # This kernel is slightly different from fla to support Q/K with different head numbers.
+    # In fla, Q/K always have the same head number, so Hg is always equal to H.
     B, T, Hg, K, V = *k.shape, u.shape[-1]
     H = u.shape[-2]
     BT = chunk_size
 
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, chunk_size) if cu_seqlens is not None else None
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
     # N: the actual number of sequences in the batch with either equal or variable lengths
     if cu_seqlens is None:
         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
     else:
-        N, NT, chunk_offsets = len(cu_seqlens) - 1, len(
-            chunk_indices), prepare_chunk_offsets(cu_seqlens, BT)
+        N, NT, chunk_offsets = (
+            len(cu_seqlens) - 1,
+            len(chunk_indices),
+            prepare_chunk_offsets(cu_seqlens, BT),
+        )
     assert K <= 256, "current kernel does not support head dimension larger than 256."
 
     h = k.new_empty(B, NT, H, K, V)
-    final_state = k.new_empty(
-        N, H, K, V, dtype=torch.float32) if output_final_state else None
+    final_state = (
+        k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+    )
 
     v_new = torch.empty_like(u) if save_new_value else None
 
     def grid(meta):
-        return (triton.cdiv(V, meta['BV']), N * H)
+        return (triton.cdiv(V, meta["BV"]), N * H)
 
     chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
         k=k,
@@ -276,6 +328,7 @@ def chunk_gated_delta_rule_fwd_h(
         w=w,
         v_new=v_new,
         g=g,
+        gk=gk,
         h=h,
         h0=initial_state,
         ht=final_state,
@@ -286,5 +339,6 @@ def chunk_gated_delta_rule_fwd_h(
         Hg=Hg,
         K=K,
         V=V,
-        BT=BT)
+        BT=BT,
+    )
     return h, v_new, final_state
diff --git a/vllm/model_executor/layers/fla/ops/chunk_o.py b/vllm/model_executor/layers/fla/ops/chunk_o.py
index 332751a1860a900ad4a3a679f30146cce1e4d0f0..4e8e04c1d48ca4c271ed340939d394cacc7c4278 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_o.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_o.py
@@ -9,7 +9,6 @@
 
 # ruff: noqa: E501
 
-from typing import Optional
 
 import torch
 
@@ -23,24 +22,23 @@ BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
 NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
 
 
-@triton.heuristics({
-    'USE_G': lambda args: args['g'] is not None,
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
-})
+@triton.heuristics(
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
 @triton.autotune(
     configs=[
-        triton.Config({
-            'BK': BK,
-            'BV': BV
-        },
-                      num_warps=num_warps,
-                      num_stages=num_stages) for BK in BKV_LIST
-        for BV in BKV_LIST for num_warps in NUM_WARPS
+        triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages)
+        for BK in BKV_LIST
+        for BV in BKV_LIST
+        for num_warps in NUM_WARPS
         for num_stages in [2, 3, 4]
     ],
-    key=['H', 'K', 'V', 'BT'],
+    key=["H", "K", "V", "BT"],
 )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=["T"])
 def chunk_fwd_kernel_o(
     q,
     k,
@@ -67,10 +65,14 @@ def chunk_fwd_kernel_o(
 
     if IS_VARLEN:
         i_tg = i_t
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
         NT = tl.cdiv(T, BT)
     else:
@@ -89,12 +91,15 @@ def chunk_fwd_kernel_o(
     b_A = tl.zeros([BT, BT], dtype=tl.float32)
 
     for i_k in range(tl.cdiv(K, BK)):
-        p_q = tl.make_block_ptr(q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK),
-                                (BT, BK), (1, 0))
-        p_k = tl.make_block_ptr(k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT),
-                                (BK, BT), (0, 1))
-        p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV),
-                                (BK, BV), (1, 0))
+        p_q = tl.make_block_ptr(
+            q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+        )
+        p_k = tl.make_block_ptr(
+            k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)
+        )
+        p_h = tl.make_block_ptr(
+            h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)
+        )
         # [BT, BK]
         b_q = tl.load(p_q, boundary_check=(0, 1))
         # [BK, BT]
@@ -109,8 +114,8 @@ def chunk_fwd_kernel_o(
 
     if USE_G:
         g += bos * H + i_h
-        p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, ))
-        b_g = tl.load(p_g, boundary_check=(0, ))
+        p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
         b_o = b_o * exp(b_g)[:, None]
         b_A = b_A * exp(b_g[:, None] - b_g[None, :])
 
@@ -119,10 +124,12 @@ def chunk_fwd_kernel_o(
     m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t)
     b_A = tl.where(m_A, b_A, 0)
 
-    p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
-                            (BT, BV), (1, 0))
-    p_o = tl.make_block_ptr(o, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
-                            (BT, BV), (1, 0))
+    p_v = tl.make_block_ptr(
+        v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
+    p_o = tl.make_block_ptr(
+        o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
     b_v = tl.load(p_v, boundary_check=(0, 1))
 
     # to fix mma -> mma layout conversion
@@ -132,30 +139,29 @@ def chunk_fwd_kernel_o(
 
 
 def chunk_fwd_o(
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        h: torch.Tensor,
-        g: Optional[torch.Tensor] = None,  # cumsum of log decay
-        scale: Optional[float] = None,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        chunk_size: int = 64) -> torch.Tensor:
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    h: torch.Tensor,
+    g: torch.Tensor | None = None,  # cumsum of log decay
+    scale: float | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+) -> torch.Tensor:
     B, T, Hg, K, V = *q.shape, v.shape[-1]
     H = v.shape[-2]
-    if FLA_GDN_FIX_BT:
-        BT = 64
-    else:
-        BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
+    BT = 64 if FLA_GDN_FIX_BT else min(chunk_size, max(16, triton.next_power_of_2(T)))
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
     if scale is None:
-        scale = k.shape[-1]**-0.5
+        scale = k.shape[-1] ** -0.5
 
     o = torch.empty_like(v)
 
     def grid(meta):
-        return (triton.cdiv(V, meta['BV']), NT, B * H)
+        return (triton.cdiv(V, meta["BV"]), NT, B * H)
 
     chunk_fwd_kernel_o[grid](
         q,
diff --git a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
index d1adc6978f245960e770b20308fd601e047eb63d..7724fa513d92e50725f4e1eb2d802269f27cdecd 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -17,23 +16,26 @@ from .index import prepare_chunk_indices
 from .op import exp
 
 
-@triton.heuristics({
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
-    'USE_G': lambda args: args['g_cumsum'] is not None
-})
+@triton.heuristics(
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
 @triton.autotune(
     configs=[
-        triton.Config({'BK': BK}, num_warps=num_warps, num_stages=num_stages)
-        for BK in [32, 64, 128] for num_warps in [2, 4, 8]
+        triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages)
+        for BK in [32, 64, 128]
+        for num_warps in [2, 4, 8]
         for num_stages in [2, 3, 4]
     ],
-    key=['H', 'K', 'BT', 'IS_VARLEN'],
+    key=["H", "K", "BT", "IS_VARLEN"],
 )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=["T"])
 def chunk_scaled_dot_kkt_fwd_kernel(
     k,
     beta,
-    g_cumsum,
+    g,
     A,
     cu_seqlens,
     chunk_indices,
@@ -49,50 +51,61 @@ def chunk_scaled_dot_kkt_fwd_kernel(
     i_t, i_bh = tl.program_id(0), tl.program_id(1)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
     o_t = i_t * BT + tl.arange(0, BT)
     m_t = o_t < T
 
-    p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ),
-                               (i_t * BT, ), (BT, ), (0, ))
-    b_beta = tl.load(p_beta, boundary_check=(0, ))
+    p_beta = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+    )
+    b_beta = tl.load(p_beta, boundary_check=(0,))
 
     b_A = tl.zeros([BT, BT], dtype=tl.float32)
     for i_k in range(tl.cdiv(K, BK)):
-        p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K),
-                                (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK),
-                                (1, 0))
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
         b_k = tl.load(p_k, boundary_check=(0, 1))
         b_kb = b_k * b_beta[:, None]
         b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k))
 
     if USE_G:
-        p_g = tl.make_block_ptr(g_cumsum + bos * H + i_h, (T, ), (H, ),
-                                (i_t * BT, ), (BT, ), (0, ))
-        b_g = tl.load(p_g, boundary_check=(0, ))
+        p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
         b_g_diff = b_g[:, None] - b_g[None, :]
         b_A = b_A * exp(b_g_diff)
 
     m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t)
     b_A = tl.where(m_A, b_A, 0)
-    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1),
-                            (i_t * BT, 0), (BT, BT), (1, 0))
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
     tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
 
 
 def chunk_scaled_dot_kkt_fwd(
-        k: torch.Tensor,
-        beta: torch.Tensor,
-        g_cumsum: Optional[torch.Tensor] = None,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        chunk_size: int = 64,
-        output_dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    k: torch.Tensor,
+    g: torch.Tensor | None = None,
+    beta: torch.Tensor | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
     r"""
     Compute beta * K * K^T.
 
@@ -101,9 +114,8 @@ def chunk_scaled_dot_kkt_fwd(
             The key tensor of shape `[B, T, H, K]`.
         beta (torch.Tensor):
             The beta tensor of shape `[B, T, H]`.
-        g_cumsum (torch.Tensor):
-            The cumulative sum of the gate tensor of shape `[B, T, H]`.
-            Default: None
+        g (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H]`. Default: `None`.
         cu_seqlens (torch.LongTensor):
             The cumulative sequence lengths of the input tensor.
             Default: None
@@ -115,19 +127,21 @@ def chunk_scaled_dot_kkt_fwd(
     Returns:
         beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
     """
-
+    # This kernel is slightly different from fla to support Q/K with different head numbers.
+    # In fla, Q/K always have the same head number, so Hg is always equal to H.
     B, T, Hg, K = k.shape
-
     H = beta.shape[-1]
     BT = chunk_size
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
     A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
     chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)](
         k=k,
+        g=g,
         beta=beta,
-        g_cumsum=g_cumsum,
         A=A,
         cu_seqlens=cu_seqlens,
         chunk_indices=chunk_indices,
diff --git a/vllm/model_executor/layers/fla/ops/cumsum.py b/vllm/model_executor/layers/fla/ops/cumsum.py
index 370a45fe163582e6db2a5046360927902dcedd9f..99b41794796d8f9e2f10ef856f59ee17a86b8697 100644
--- a/vllm/model_executor/layers/fla/ops/cumsum.py
+++ b/vllm/model_executor/layers/fla/ops/cumsum.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
 import warnings
-from typing import Optional
 
 import torch
 
@@ -20,12 +19,12 @@ from .utils import check_shared_mem, input_guard
 BS_LIST = [32, 64] if check_shared_mem() else [16, 32]
 
 
-@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
-@triton.autotune(configs=[
-    triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]
-],
-                 key=['B', 'H', 'BT', 'IS_VARLEN', 'REVERSE'])
-@triton.jit(do_not_specialize=['T'])
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]],
+    key=["B", "H", "BT", "IS_VARLEN", "REVERSE"],
+)
+@triton.jit(do_not_specialize=["T"])
 def chunk_local_cumsum_scalar_kernel(
     s,
     o,
@@ -42,40 +41,47 @@ def chunk_local_cumsum_scalar_kernel(
     i_t, i_bh = tl.program_id(0), tl.program_id(1)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
 
     if HEAD_FIRST:
-        p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T, ), (1, ),
-                                (i_t * BT, ), (BT, ), (0, ))
-        p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T, ), (1, ),
-                                (i_t * BT, ), (BT, ), (0, ))
+        p_s = tl.make_block_ptr(
+            s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
+        )
+        p_o = tl.make_block_ptr(
+            o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
+        )
     else:
-        p_s = tl.make_block_ptr(s + bos * H + i_h, (T, ), (H, ), (i_t * BT, ),
-                                (BT, ), (0, ))
-        p_o = tl.make_block_ptr(o + bos * H + i_h, (T, ), (H, ), (i_t * BT, ),
-                                (BT, ), (0, ))
+        p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
     # [BT]
-    b_s = tl.load(p_s, boundary_check=(0, )).to(tl.float32)
+    b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32)
     b_o = tl.cumsum(b_s, axis=0)
     if REVERSE:
         b_z = tl.sum(b_s, axis=0)
         b_o = -b_o + b_z[None] + b_s
-    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, ))
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,))
 
 
-@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
-@triton.autotune(configs=[
-    triton.Config({'BS': BS}, num_warps=num_warps) for BS in BS_LIST
-    for num_warps in [2, 4, 8]
-],
-                 key=['B', 'H', 'S', 'BT', 'IS_VARLEN', 'REVERSE'])
-@triton.jit(do_not_specialize=['T'])
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({"BS": BS}, num_warps=num_warps)
+        for BS in BS_LIST
+        for num_warps in [2, 4, 8]
+    ],
+    key=["B", "H", "S", "BT", "IS_VARLEN", "REVERSE"],
+)
+@triton.jit(do_not_specialize=["T"])
 def chunk_local_cumsum_vector_kernel(
     s,
     o,
@@ -94,30 +100,58 @@ def chunk_local_cumsum_vector_kernel(
     i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
 
     o_i = tl.arange(0, BT)
     if REVERSE:
-        m_s = tl.where(o_i[:, None] <= o_i[None, :], 1., 0.)
+        m_s = tl.where(o_i[:, None] <= o_i[None, :], 1.0, 0.0)
     else:
-        m_s = tl.where(o_i[:, None] >= o_i[None, :], 1., 0.)
+        m_s = tl.where(o_i[:, None] >= o_i[None, :], 1.0, 0.0)
 
     if HEAD_FIRST:
-        p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1),
-                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
-        p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1),
-                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
     else:
-        p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1),
-                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
-        p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1),
-                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
     # [BT, BS]
     b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32)
     b_o = tl.dot(m_s, b_s, allow_tf32=False)
@@ -125,102 +159,122 @@ def chunk_local_cumsum_vector_kernel(
 
 
 def chunk_local_cumsum_scalar(
-        g: torch.Tensor,
-        chunk_size: int,
-        reverse: bool = False,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        head_first: bool = False,
-        output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor:
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    cu_seqlens: torch.Tensor | None = None,
+    head_first: bool = False,
+    output_dtype: torch.dtype | None = torch.float,
+) -> torch.Tensor:
     if head_first:
         B, H, T = g.shape
     else:
         B, T, H = g.shape
-    assert chunk_size == 2**(chunk_size.bit_length() -
-                             1), "chunk_size must be a power of 2"
+    assert chunk_size == 2 ** (chunk_size.bit_length() - 1), (
+        "chunk_size must be a power of 2"
+    )
     BT = chunk_size
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
     g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
     grid = (NT, B * H)
-    chunk_local_cumsum_scalar_kernel[grid](g_org,
-                                           g,
-                                           cu_seqlens,
-                                           chunk_indices,
-                                           T=T,
-                                           B=B,
-                                           H=H,
-                                           BT=BT,
-                                           HEAD_FIRST=head_first,
-                                           REVERSE=reverse)
+    chunk_local_cumsum_scalar_kernel[grid](
+        g_org,
+        g,
+        cu_seqlens,
+        chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+    )
     return g
 
 
 def chunk_local_cumsum_vector(
-        g: torch.Tensor,
-        chunk_size: int,
-        reverse: bool = False,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        head_first: bool = False,
-        output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor:
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    cu_seqlens: torch.Tensor | None = None,
+    head_first: bool = False,
+    output_dtype: torch.dtype | None = torch.float,
+) -> torch.Tensor:
     if head_first:
         B, H, T, S = g.shape
     else:
         B, T, H, S = g.shape
     BT = chunk_size
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, chunk_size) if cu_seqlens is not None else None
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
-    assert chunk_size == 2**(chunk_size.bit_length() -
-                             1), "chunk_size must be a power of 2"
+    assert chunk_size == 2 ** (chunk_size.bit_length() - 1), (
+        "chunk_size must be a power of 2"
+    )
 
     g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
 
     def grid(meta):
-        return (triton.cdiv(meta['S'], meta['BS']), NT, B * H)
+        return (triton.cdiv(meta["S"], meta["BS"]), NT, B * H)
 
     # keep cumulative normalizer in fp32
     # this kernel is equivalent to
     # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
-    chunk_local_cumsum_vector_kernel[grid](g_org,
-                                           g,
-                                           cu_seqlens,
-                                           chunk_indices,
-                                           T=T,
-                                           B=B,
-                                           H=H,
-                                           S=S,
-                                           BT=BT,
-                                           HEAD_FIRST=head_first,
-                                           REVERSE=reverse)
+    chunk_local_cumsum_vector_kernel[grid](
+        g_org,
+        g,
+        cu_seqlens,
+        chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        S=S,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+    )
     return g
 
 
 @input_guard
-def chunk_local_cumsum(g: torch.Tensor,
-                       chunk_size: int,
-                       reverse: bool = False,
-                       cu_seqlens: Optional[torch.Tensor] = None,
-                       head_first: bool = False,
-                       output_dtype: Optional[torch.dtype] = torch.float,
-                       **kwargs) -> torch.Tensor:
+def chunk_local_cumsum(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    cu_seqlens: torch.Tensor | None = None,
+    head_first: bool = False,
+    output_dtype: torch.dtype | None = torch.float,
+    **kwargs,
+) -> torch.Tensor:
     if not head_first and g.shape[1] < g.shape[2]:
         warnings.warn(
             f"Input tensor shape suggests potential format mismatch: seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). "
             "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
             "when head_first=False was specified. "
             "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
-            stacklevel=2)
+            stacklevel=2,
+        )
     if cu_seqlens is not None:
-        assert g.shape[
-            0] == 1, "Only batch size 1 is supported when cu_seqlens are provided"
+        assert g.shape[0] == 1, (
+            "Only batch size 1 is supported when cu_seqlens are provided"
+        )
     if len(g.shape) == 3:
-        return chunk_local_cumsum_scalar(g, chunk_size, reverse, cu_seqlens,
-                                         head_first, output_dtype)
+        return chunk_local_cumsum_scalar(
+            g, chunk_size, reverse, cu_seqlens, head_first, output_dtype
+        )
     elif len(g.shape) == 4:
-        return chunk_local_cumsum_vector(g, chunk_size, reverse, cu_seqlens,
-                                         head_first, output_dtype)
+        return chunk_local_cumsum_vector(
+            g, chunk_size, reverse, cu_seqlens, head_first, output_dtype
+        )
     else:
-        raise ValueError(f"Unsupported input shape {g.shape}. "
-                         f"which should be (B, T, H, D) if `head_first=False` "
-                         f"or (B, H, T, D) otherwise")
+        raise ValueError(
+            f"Unsupported input shape {g.shape}. "
+            f"which should be (B, T, H, D) if `head_first=False` "
+            f"or (B, H, T, D) otherwise"
+        )
diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
index 98437340fd242c05c7ba7eb2ffafebcc66986f62..0f27504780ac4c29f65b8eaacfd05b76fcfdc88a 100644
--- a/vllm/model_executor/layers/fla/ops/fused_recurrent.py
+++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -16,17 +15,15 @@ from vllm.triton_utils import tl, triton
 from .op import exp
 
 
-@triton.heuristics({
-    'USE_INITIAL_STATE':
-    lambda args: args['h0'] is not None,
-    'IS_VARLEN':
-    lambda args: args['cu_seqlens'] is not None,
-    "IS_CONTINUOUS_BATCHING":
-    lambda args: args['ssm_state_indices'] is not None,
-    "IS_SPEC_DECODING":
-    lambda args: args['num_accepted_tokens'] is not None,
-})
-@triton.jit(do_not_specialize=['N', 'T'])
+@triton.heuristics(
+    {
+        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+        "IS_CONTINUOUS_BATCHING": lambda args: args["ssm_state_indices"] is not None,
+        "IS_SPEC_DECODING": lambda args: args["num_accepted_tokens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["N", "T"])
 def fused_recurrent_gated_delta_rule_fwd_kernel(
     q,
     k,
@@ -55,19 +52,21 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
     stride_indices_tok: tl.constexpr,
     USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
     INPLACE_FINAL_STATE: tl.constexpr,  # whether to store final state inplace
-    IS_BETA_HEADWISE: tl.
-    constexpr,  # whether beta is headwise vector or scalar,
+    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
     USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
     IS_VARLEN: tl.constexpr,
     IS_CONTINUOUS_BATCHING: tl.constexpr,
     IS_SPEC_DECODING: tl.constexpr,
+    IS_KDA: tl.constexpr,
 ):
     i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
     i_n, i_hv = i_nh // HV, i_nh % HV
     i_h = i_hv // (HV // H)
     if IS_VARLEN:
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64)
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int64),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int64),
+        )
         all = T
         T = eos - bos
     else:
@@ -88,7 +87,12 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
         p_beta = beta + (bos * HV + i_hv) * V + o_v
     else:
         p_beta = beta + bos * HV + i_hv
-    p_g = g + bos * HV + i_hv
+
+    if not IS_KDA:
+        p_g = g + bos * HV + i_hv
+    else:
+        p_gk = g + (bos * HV + i_hv) * K + o_k
+
     p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
 
     mask_k = o_k < K
@@ -102,8 +106,13 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
                 i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1
             else:
                 i_t = 0
-            p_h0 = h0 + tl.load(ssm_state_indices + i_n * stride_indices_seq +
-                                i_t).to(tl.int64) * stride_init_state_token
+            p_h0 = (
+                h0
+                + tl.load(ssm_state_indices + i_n * stride_indices_seq + i_t).to(
+                    tl.int64
+                )
+                * stride_init_state_token
+            )
         else:
             p_h0 = h0 + bos * HV * K * V
         p_h0 = p_h0 + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
@@ -113,14 +122,18 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
         b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
         b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
         b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
-        b_g = tl.load(p_g).to(tl.float32)
 
         if USE_QK_L2NORM_IN_KERNEL:
             b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6)
             b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6)
         b_q = b_q * scale
         # [BK, BV]
-        b_h *= exp(b_g)
+        if not IS_KDA:
+            b_g = tl.load(p_g).to(tl.float32)
+            b_h *= exp(b_g)
+        else:
+            b_gk = tl.load(p_gk).to(tl.float32)
+            b_h *= exp(b_gk[:, None])
         # [BV]
         b_v -= tl.sum(b_h * b_k[:, None], 0)
         if IS_BETA_HEADWISE:
@@ -136,8 +149,13 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
 
         # keep the states for multi-query tokens
         if INPLACE_FINAL_STATE:
-            p_ht = ht + tl.load(ssm_state_indices + i_n * stride_indices_seq +
-                                i_t).to(tl.int64) * stride_final_state_token
+            p_ht = (
+                ht
+                + tl.load(ssm_state_indices + i_n * stride_indices_seq + i_t).to(
+                    tl.int64
+                )
+                * stride_final_state_token
+            )
         else:
             p_ht = ht + (bos + i_t) * stride_final_state_token
         p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
@@ -147,7 +165,10 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
         p_k += H * K
         p_o += HV * V
         p_v += HV * V
-        p_g += HV
+        if not IS_KDA:
+            p_g += HV
+        else:
+            p_gk += HV * K
         p_beta += HV * (V if IS_BETA_HEADWISE else 1)
 
 
@@ -160,9 +181,9 @@ def fused_recurrent_gated_delta_rule_fwd(
     scale: float,
     initial_state: torch.Tensor,
     inplace_final_state: bool = True,
-    cu_seqlens: Optional[torch.LongTensor] = None,
-    ssm_state_indices: Optional[torch.Tensor] = None,
-    num_accepted_tokens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
     use_qk_l2norm_in_kernel: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *k.shape, v.shape[-1]
@@ -220,6 +241,7 @@ def fused_recurrent_gated_delta_rule_fwd(
         IS_BETA_HEADWISE=beta.ndim == v.ndim,
         USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
         INPLACE_FINAL_STATE=inplace_final_state,
+        IS_KDA=False,
         num_warps=num_warps,
         num_stages=num_stages,
     )
@@ -228,21 +250,22 @@ def fused_recurrent_gated_delta_rule_fwd(
 
 
 class FusedRecurrentFunction(torch.autograd.Function):
-
     @staticmethod
-    def forward(ctx,
-                q: torch.Tensor,
-                k: torch.Tensor,
-                v: torch.Tensor,
-                g: torch.Tensor,
-                beta: torch.Tensor,
-                scale: float,
-                initial_state: torch.Tensor,
-                inplace_final_state: bool = True,
-                cu_seqlens: Optional[torch.LongTensor] = None,
-                ssm_state_indices: Optional[torch.Tensor] = None,
-                num_accepted_tokens: Optional[torch.Tensor] = None,
-                use_qk_l2norm_in_kernel: bool = False):
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        inplace_final_state: bool = True,
+        cu_seqlens: torch.LongTensor | None = None,
+        ssm_state_indices: torch.Tensor | None = None,
+        num_accepted_tokens: torch.Tensor | None = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
         o, final_state = fused_recurrent_gated_delta_rule_fwd(
             q=q.contiguous(),
             k=k.contiguous(),
@@ -270,9 +293,9 @@ def fused_recurrent_gated_delta_rule(
     scale: float = None,
     initial_state: torch.Tensor = None,
     inplace_final_state: bool = True,
-    cu_seqlens: Optional[torch.LongTensor] = None,
-    ssm_state_indices: Optional[torch.Tensor] = None,
-    num_accepted_tokens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
     use_qk_l2norm_in_kernel: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     r"""
@@ -342,9 +365,10 @@ def fused_recurrent_gated_delta_rule(
     if cu_seqlens is not None and q.shape[0] != 1:
         raise ValueError(
             f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
-            f"Please flatten variable-length inputs before processing.")
+            f"Please flatten variable-length inputs before processing."
+        )
     if scale is None:
-        scale = k.shape[-1]**-0.5
+        scale = k.shape[-1] ** -0.5
     else:
         assert scale > 0, "scale must be positive"
     if beta is None:
diff --git a/vllm/model_executor/layers/fla/ops/index.py b/vllm/model_executor/layers/fla/ops/index.py
index 9eca32bc31a044b2ff9e36bc2a086b81e5ca5704..f023e1378bb88117aabb493f7bc3a0f2aeea3b7c 100644
--- a/vllm/model_executor/layers/fla/ops/index.py
+++ b/vllm/model_executor/layers/fla/ops/index.py
@@ -20,20 +20,22 @@ def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
 
 
 @tensor_cache
-def prepare_chunk_indices(cu_seqlens: torch.LongTensor,
-                          chunk_size: int) -> torch.LongTensor:
-    indices = torch.cat([
-        torch.arange(n)
-        for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
-    ])
-    return torch.stack([indices.eq(0).cumsum(0) - 1, indices],
-                       1).to(cu_seqlens)
+def prepare_chunk_indices(
+    cu_seqlens: torch.LongTensor, chunk_size: int
+) -> torch.LongTensor:
+    indices = torch.cat(
+        [
+            torch.arange(n)
+            for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
+        ]
+    )
+    return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)
 
 
 @tensor_cache
-def prepare_chunk_offsets(cu_seqlens: torch.LongTensor,
-                          chunk_size: int) -> torch.LongTensor:
-    return torch.cat([
-        cu_seqlens.new_tensor([0]),
-        triton.cdiv(prepare_lens(cu_seqlens), chunk_size)
-    ]).cumsum(-1)
+def prepare_chunk_offsets(
+    cu_seqlens: torch.LongTensor, chunk_size: int
+) -> torch.LongTensor:
+    return torch.cat(
+        [cu_seqlens.new_tensor([0]), triton.cdiv(prepare_lens(cu_seqlens), chunk_size)]
+    ).cumsum(-1)
diff --git a/vllm/model_executor/layers/fla/ops/kda.py b/vllm/model_executor/layers/fla/ops/kda.py
new file mode 100644
index 0000000000000000000000000000000000000000..700f287ca4569a7a459753e36da962ef84438482
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/kda.py
@@ -0,0 +1,1351 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+
+
+import torch
+import torch.nn as nn
+
+from vllm.triton_utils import tl, triton
+from vllm.utils.math_utils import cdiv, next_power_of_2
+
+from .chunk_delta_h import chunk_gated_delta_rule_fwd_h
+from .cumsum import chunk_local_cumsum
+from .fused_recurrent import fused_recurrent_gated_delta_rule_fwd_kernel
+from .index import prepare_chunk_indices
+from .l2norm import l2norm_fwd
+from .op import exp, log
+from .solve_tril import solve_tril
+from .utils import is_amd
+
+BT_LIST_AUTOTUNE = [32, 64, 128]
+NUM_WARPS_AUTOTUNE = [2, 4, 8, 16] if is_amd else [4, 8, 16, 32]
+
+
+def fused_recurrent_kda_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    inplace_final_state: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = next_power_of_2(K), min(next_power_of_2(V), 8)
+    NK, NV = cdiv(K, BK), cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    o = torch.empty_like(k)
+    if inplace_final_state:
+        final_state = initial_state
+    else:
+        final_state = q.new_empty(T, HV, K, V, dtype=initial_state.dtype)
+
+    stride_init_state_token = initial_state.stride(0)
+    stride_final_state_token = final_state.stride(0)
+
+    if ssm_state_indices is None:
+        stride_indices_seq, stride_indices_tok = 1, 1
+    elif ssm_state_indices.ndim == 1:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1
+    else:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride()
+
+    grid = (NK, NV, N * HV)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        ssm_state_indices=ssm_state_indices,
+        num_accepted_tokens=num_accepted_tokens,
+        scale=scale,
+        N=N,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        stride_init_state_token=stride_init_state_token,
+        stride_final_state_token=stride_final_state_token,
+        stride_indices_seq=stride_indices_seq,
+        stride_indices_tok=stride_indices_tok,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        INPLACE_FINAL_STATE=inplace_final_state,
+        IS_KDA=True,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    return o, final_state
+
+
+def fused_recurrent_kda(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    inplace_final_state: bool = True,
+    use_qk_l2norm_in_kernel: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.LongTensor | None = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if cu_seqlens is not None and q.shape[0] != 1:
+        raise ValueError(
+            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+            f"Please flatten variable-length inputs before processing."
+        )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+
+    o, final_state = fused_recurrent_kda_fwd(
+        q=q.contiguous(),
+        k=k.contiguous(),
+        v=v.contiguous(),
+        g=g.contiguous(),
+        beta=beta.contiguous(),
+        scale=scale,
+        initial_state=initial_state,
+        inplace_final_state=inplace_final_state,
+        cu_seqlens=cu_seqlens,
+        ssm_state_indices=ssm_state_indices,
+        num_accepted_tokens=None,
+        use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
+
+
+@triton.heuristics(
+    {
+        "STORE_RESIDUAL_OUT": lambda args: args["residual_out"] is not None,
+        "HAS_RESIDUAL": lambda args: args["residual"] is not None,
+        "HAS_WEIGHT": lambda args: args["w"] is not None,
+        "HAS_BIAS": lambda args: args["b"] is not None,
+    }
+)
+@triton.jit
+def layer_norm_gated_fwd_kernel(
+    x,  # pointer to the input
+    g,  # pointer to the gate
+    y,  # pointer to the output
+    w,  # pointer to the weights
+    b,  # pointer to the biases
+    residual,  # pointer to the residual
+    residual_out,  # pointer to the residual
+    mean,  # pointer to the mean
+    rstd,  # pointer to the 1/std
+    eps,  # epsilon to avoid division by zero
+    T,  # number of rows in x
+    D: tl.constexpr,  # number of columns in x
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+    ACTIVATION: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+    STORE_RESIDUAL_OUT: tl.constexpr,
+    HAS_RESIDUAL: tl.constexpr,
+    HAS_WEIGHT: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+
+    o_d = tl.arange(0, BD)
+    m_d = o_d < D
+
+    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)
+    if HAS_RESIDUAL:
+        p_res = tl.make_block_ptr(
+            residual, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)
+        )
+        b_x += tl.load(p_res, boundary_check=(0, 1)).to(tl.float32)
+    if STORE_RESIDUAL_OUT:
+        p_res_out = tl.make_block_ptr(
+            residual_out, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)
+        )
+        tl.store(p_res_out, b_x.to(p_res_out.dtype.element_ty), boundary_check=(0, 1))
+    if not IS_RMS_NORM:
+        b_mean = tl.sum(b_x, axis=1) / D
+        p_mean = tl.make_block_ptr(mean, (T,), (1,), (i_t * BT,), (BT,), (0,))
+        tl.store(p_mean, b_mean.to(p_mean.dtype.element_ty), boundary_check=(0,))
+        b_xbar = tl.where(m_d[None, :], b_x - b_mean[:, None], 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=1) / D
+    else:
+        b_xbar = tl.where(m_d[None, :], b_x, 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=1) / D
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+
+    p_rstd = tl.make_block_ptr(rstd, (T,), (1,), (i_t * BT,), (BT,), (0,))
+    tl.store(p_rstd, b_rstd.to(p_rstd.dtype.element_ty), boundary_check=(0,))
+
+    if HAS_WEIGHT:
+        b_w = tl.load(w + o_d, mask=m_d).to(tl.float32)
+    if HAS_BIAS:
+        b_b = tl.load(b + o_d, mask=m_d).to(tl.float32)
+    b_x_hat = (
+        (b_x - b_mean[:, None]) * b_rstd[:, None]
+        if not IS_RMS_NORM
+        else b_x * b_rstd[:, None]
+    )
+    b_y = b_x_hat * b_w[None, :] if HAS_WEIGHT else b_x_hat
+    if HAS_BIAS:
+        b_y = b_y + b_b[None, :]
+
+    # swish/sigmoid output gate
+    p_g = tl.make_block_ptr(g, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32)
+    if ACTIVATION == "swish" or ACTIVATION == "silu":
+        b_y = b_y * b_g * tl.sigmoid(b_g)
+    elif ACTIVATION == "sigmoid":
+        b_y = b_y * tl.sigmoid(b_g)
+
+    # Write output
+    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.heuristics(
+    {
+        "STORE_RESIDUAL_OUT": lambda args: args["residual_out"] is not None,
+        "HAS_RESIDUAL": lambda args: args["residual"] is not None,
+        "HAS_WEIGHT": lambda args: args["w"] is not None,
+        "HAS_BIAS": lambda args: args["b"] is not None,
+    }
+)
+@triton.jit
+def layer_norm_gated_fwd_kernel1(
+    x,  # pointer to the input
+    g,  # pointer to the gate
+    y,  # pointer to the output
+    w,  # pointer to the weights
+    b,  # pointer to the biases
+    residual,  # pointer to the residual
+    residual_out,  # pointer to the residual
+    mean,  # pointer to the mean
+    rstd,  # pointer to the 1/std
+    eps,  # epsilon to avoid division by zero
+    D: tl.constexpr,  # number of columns in x
+    BD: tl.constexpr,
+    ACTIVATION: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+    STORE_RESIDUAL_OUT: tl.constexpr,
+    HAS_RESIDUAL: tl.constexpr,
+    HAS_WEIGHT: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+    x += i_t * D
+    y += i_t * D
+    g += i_t * D
+    if HAS_RESIDUAL:
+        residual += i_t * D
+    if STORE_RESIDUAL_OUT:
+        residual_out += i_t * D
+
+    o_d = tl.arange(0, BD)
+    m_d = o_d < D
+    b_x = tl.load(x + o_d, mask=m_d, other=0.0).to(tl.float32)
+    if HAS_RESIDUAL:
+        b_x += tl.load(residual + o_d, mask=m_d, other=0.0).to(tl.float32)
+    if STORE_RESIDUAL_OUT:
+        tl.store(residual_out + o_d, b_x, mask=m_d)
+    if not IS_RMS_NORM:
+        b_mean = tl.sum(b_x, axis=0) / D
+        tl.store(mean + i_t, b_mean)
+        b_xbar = tl.where(m_d, b_x - b_mean, 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=0) / D
+    else:
+        b_xbar = tl.where(m_d, b_x, 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=0) / D
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+    tl.store(rstd + i_t, b_rstd)
+
+    if HAS_WEIGHT:
+        b_w = tl.load(w + o_d, mask=m_d).to(tl.float32)
+    if HAS_BIAS:
+        b_b = tl.load(b + o_d, mask=m_d).to(tl.float32)
+    b_x_hat = (b_x - b_mean) * b_rstd if not IS_RMS_NORM else b_x * b_rstd
+    b_y = b_x_hat * b_w if HAS_WEIGHT else b_x_hat
+    if HAS_BIAS:
+        b_y = b_y + b_b
+
+    # swish/sigmoid output gate
+    b_g = tl.load(g + o_d, mask=m_d, other=0.0).to(tl.float32)
+    if ACTIVATION == "swish" or ACTIVATION == "silu":
+        b_y = b_y * b_g * tl.sigmoid(b_g)
+    elif ACTIVATION == "sigmoid":
+        b_y = b_y * tl.sigmoid(b_g)
+
+    # Write output
+    tl.store(y + o_d, b_y, mask=m_d)
+
+
+def layer_norm_gated_fwd(
+    x: torch.Tensor,
+    g: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    activation: str = "swish",
+    eps: float = 1e-5,
+    residual: torch.Tensor = None,
+    out_dtype: torch.dtype = None,
+    residual_dtype: torch.dtype = None,
+    is_rms_norm: bool = False,
+):
+    if residual is not None:
+        residual_dtype = residual.dtype
+    T, D = x.shape
+    if residual is not None:
+        assert residual.shape == (T, D)
+    if weight is not None:
+        assert weight.shape == (D,)
+    if bias is not None:
+        assert bias.shape == (D,)
+    # allocate output
+    y = x if out_dtype is None else torch.empty_like(x, dtype=out_dtype)
+    if residual is not None or (
+        residual_dtype is not None and residual_dtype != x.dtype
+    ):
+        residual_out = torch.empty(T, D, device=x.device, dtype=residual_dtype)
+    else:
+        residual_out = None
+    mean = (
+        torch.empty((T,), dtype=torch.float, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((T,), dtype=torch.float, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BD = min(MAX_FUSED_SIZE, next_power_of_2(D))
+    if D > BD:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+
+    if D <= 512:
+        BT = 32
+        layer_norm_gated_fwd_kernel[(cdiv(T, BT),)](
+            x=x,
+            g=g,
+            y=y,
+            w=weight,
+            b=bias,
+            residual=residual,
+            residual_out=residual_out,
+            mean=mean,
+            rstd=rstd,
+            eps=eps,
+            T=T,
+            D=D,
+            BD=BD,
+            BT=BT,
+            ACTIVATION=activation,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=4,
+        )
+    else:
+        layer_norm_gated_fwd_kernel1[(T,)](
+            x=x,
+            g=g,
+            y=y,
+            w=weight,
+            b=bias,
+            residual=residual,
+            residual_out=residual_out,
+            mean=mean,
+            rstd=rstd,
+            eps=eps,
+            D=D,
+            BD=BD,
+            ACTIVATION=activation,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=4,
+        )
+    # residual_out is None if residual is None and residual_dtype == input_dtype
+    return y, mean, rstd, residual_out if residual_out is not None else x
+
+
+def rms_norm_gated(
+    x: torch.Tensor,
+    g: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    activation: str = "swish",
+    residual: torch.Tensor | None = None,
+    prenorm: bool = False,
+    residual_in_fp32: bool = False,
+    eps: float = 1e-6,
+):
+    x_shape_og = x.shape
+    # reshape input data into 2D tensor
+    x = x.contiguous().reshape(-1, x.shape[-1])
+    g = g.contiguous().reshape(-1, g.shape[-1])
+    if residual is not None:
+        assert residual.shape == x_shape_og
+        residual = residual.contiguous().reshape(-1, residual.shape[-1])
+    residual_dtype = (
+        residual.dtype
+        if residual is not None
+        else (torch.float if residual_in_fp32 else None)
+    )
+    y, _, _, residual_out = layer_norm_gated_fwd(
+        x=x,
+        g=g,
+        weight=weight,
+        bias=bias,
+        activation=activation,
+        eps=eps,
+        residual=residual,
+        residual_dtype=residual_dtype,
+        is_rms_norm=True,
+    )
+    y = y.reshape(x_shape_og)
+    return y if not prenorm else (y, residual_out.reshape(x_shape_og))
+
+
+class FusedRMSNormGated(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        activation: str = "swish",
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.elementwise_affine = elementwise_affine
+        self.eps = eps
+        self.activation = activation
+
+        if self.activation not in ["swish", "silu", "sigmoid"]:
+            raise ValueError(f"Unsupported activation: {self.activation}")
+
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        else:
+            self.register_parameter("weight", None)
+        self.register_parameter("bias", None)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        g: torch.Tensor,
+        residual: torch.Tensor | None = None,
+        prenorm: bool = False,
+        residual_in_fp32: bool = False,
+    ) -> torch.Tensor:
+        return rms_norm_gated(
+            x,
+            g,
+            self.weight,
+            self.bias,
+            self.activation,
+            residual=residual,
+            eps=self.eps,
+            prenorm=prenorm,
+            residual_in_fp32=residual_in_fp32,
+        )
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages)
+        for BK in [32, 64]
+        for num_warps in [1, 2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BC"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_inter(
+    q,
+    k,
+    g,
+    beta,
+    A,
+    Aqk,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    NC: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    i_i, i_j = i_c // NC, i_c % NC
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if i_t * BT + i_i * BC >= T:
+        return
+    if i_i <= i_j:
+        return
+
+    q += (bos * H + i_h) * K
+    k += (bos * H + i_h) * K
+    g += (bos * H + i_h) * K
+    A += (bos * H + i_h) * BT
+    Aqk += (bos * H + i_h) * BT
+
+    p_b = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT + i_i * BC,), (BC,), (0,)
+    )
+    b_b = tl.load(p_b, boundary_check=(0,))
+
+    b_A = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Aqk = tl.zeros([BC, BC], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(
+            q, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)
+        )
+        p_k = tl.make_block_ptr(
+            k, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)
+        )
+        p_g = tl.make_block_ptr(
+            g, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)
+        )
+        b_kt = tl.make_block_ptr(
+            k, (K, T), (1, H * K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1)
+        )
+        p_gk = tl.make_block_ptr(
+            g, (K, T), (1, H * K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1)
+        )
+
+        o_k = i_k * BK + tl.arange(0, BK)
+        m_k = o_k < K
+        # [BK,]
+        b_gn = tl.load(g + (i_t * BT + i_i * BC) * H * K + o_k, mask=m_k, other=0)
+        # [BC, BK]
+        b_g = tl.load(p_g, boundary_check=(0, 1))
+        b_k = tl.load(p_k, boundary_check=(0, 1)) * exp(b_g - b_gn[None, :])
+        # [BK, BC]
+        b_gk = tl.load(p_gk, boundary_check=(0, 1))
+        b_kt = tl.load(b_kt, boundary_check=(0, 1))
+        # [BC, BC]
+        b_ktg = b_kt * exp(b_gn[:, None] - b_gk)
+        b_A += tl.dot(b_k, b_ktg)
+
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        b_qg = b_q * exp(b_g - b_gn[None, :]) * scale
+        b_Aqk += tl.dot(b_qg, b_ktg)
+
+    b_A *= b_b[:, None]
+
+    p_A = tl.make_block_ptr(
+        A, (T, BT), (H * BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)
+    )
+    tl.store(p_A, b_A.to(A.dtype.element_ty), boundary_check=(0, 1))
+    p_Aqk = tl.make_block_ptr(
+        Aqk, (T, BT), (H * BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)
+    )
+    tl.store(p_Aqk, b_Aqk.to(Aqk.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]],
+    key=["BK", "BT"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_intra(
+    q,
+    k,
+    g,
+    beta,
+    A,
+    Aqk,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_i, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if i_t * BT + i_i * BC >= T:
+        return
+
+    o_i = tl.arange(0, BC)
+    o_k = tl.arange(0, BK)
+    m_k = o_k < K
+    m_A = (i_t * BT + i_i * BC + o_i) < T
+    o_A = (bos + i_t * BT + i_i * BC + o_i) * H * BT + i_h * BT + i_i * BC
+
+    p_q = tl.make_block_ptr(
+        q + (bos * H + i_h) * K,
+        (T, K),
+        (H * K, 1),
+        (i_t * BT + i_i * BC, 0),
+        (BC, BK),
+        (1, 0),
+    )
+    p_k = tl.make_block_ptr(
+        k + (bos * H + i_h) * K,
+        (T, K),
+        (H * K, 1),
+        (i_t * BT + i_i * BC, 0),
+        (BC, BK),
+        (1, 0),
+    )
+    p_g = tl.make_block_ptr(
+        g + (bos * H + i_h) * K,
+        (T, K),
+        (H * K, 1),
+        (i_t * BT + i_i * BC, 0),
+        (BC, BK),
+        (1, 0),
+    )
+    b_q = tl.load(p_q, boundary_check=(0, 1))
+    b_k = tl.load(p_k, boundary_check=(0, 1))
+    b_g = tl.load(p_g, boundary_check=(0, 1))
+
+    p_b = beta + (bos + i_t * BT + i_i * BC + o_i) * H + i_h
+    b_k = b_k * tl.load(p_b, mask=m_A, other=0)[:, None]
+
+    p_kt = k + (bos + i_t * BT + i_i * BC) * H * K + i_h * K + o_k
+    p_gk = g + (bos + i_t * BT + i_i * BC) * H * K + i_h * K + o_k
+
+    for j in range(0, min(BC, T - i_t * BT - i_i * BC)):
+        b_kt = tl.load(p_kt, mask=m_k, other=0).to(tl.float32)
+        b_gk = tl.load(p_gk, mask=m_k, other=0).to(tl.float32)
+        b_ktg = b_kt[None, :] * exp(b_g - b_gk[None, :])
+        b_A = tl.sum(b_k * b_ktg, 1)
+        b_A = tl.where(o_i > j, b_A, 0.0)
+        b_Aqk = tl.sum(b_q * b_ktg, 1)
+        b_Aqk = tl.where(o_i >= j, b_Aqk * scale, 0.0)
+        tl.store(A + o_A + j, b_A, mask=m_A)
+        tl.store(Aqk + o_A + j, b_Aqk, mask=m_A)
+        p_kt += H * K
+        p_gk += H * K
+
+
+def chunk_kda_scaled_dot_kkt_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    gk: torch.Tensor | None = None,
+    beta: torch.Tensor | None = None,
+    scale: float | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+    output_dtype: torch.dtype = torch.float32,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    Compute beta * K * K^T.
+
+    Args:
+        k (torch.Tensor):
+            The key tensor of shape `[B, T, H, K]`.
+        beta (torch.Tensor):
+            The beta tensor of shape `[B, T, H]`.
+        gk (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H, K]` applied to the key tensor. Default: `None`.
+        cu_seqlens (torch.LongTensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None
+        chunk_size (int):
+            The chunk size. Default: 64.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float32`
+
+    Returns:
+        beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
+    """
+    B, T, H, K = k.shape
+    assert K <= 256
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    BC = min(16, BT)
+    NC = cdiv(BT, BC)
+    BK = max(next_power_of_2(K), 16)
+    A = torch.zeros(B, T, H, BT, device=k.device, dtype=output_dtype)
+    Aqk = torch.zeros(B, T, H, BT, device=k.device, dtype=output_dtype)
+    grid = (NT, NC * NC, B * H)
+    chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_inter[grid](
+        q=q,
+        k=k,
+        g=gk,
+        beta=beta,
+        A=A,
+        Aqk=Aqk,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        BT=BT,
+        BC=BC,
+        NC=NC,
+    )
+
+    grid = (NT, NC, B * H)
+    chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_intra[grid](
+        q=q,
+        k=k,
+        g=gk,
+        beta=beta,
+        A=A,
+        Aqk=Aqk,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        BT=BT,
+        BC=BC,
+        BK=BK,
+    )
+    return A, Aqk
+
+
+@triton.heuristics(
+    {
+        "STORE_QG": lambda args: args["qg"] is not None,
+        "STORE_KG": lambda args: args["kg"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
+)
+@triton.jit(do_not_specialize=["T"])
+def recompute_w_u_fwd_kernel(
+    q,
+    k,
+    qg,
+    kg,
+    v,
+    beta,
+    w,
+    u,
+    A,
+    gk,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    STORE_QG: tl.constexpr,
+    STORE_KG: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    b_b = tl.load(p_b, boundary_check=(0,))
+
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_b[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, input_precision=DOT_PRECISION)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_k = tl.make_block_ptr(
+            k + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = b_k * b_b[:, None]
+
+        p_gk = tl.make_block_ptr(
+            gk + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_gk = tl.load(p_gk, boundary_check=(0, 1))
+        b_kb *= exp(b_gk)
+        if STORE_QG:
+            p_q = tl.make_block_ptr(
+                q + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT, i_k * BK),
+                (BT, BK),
+                (1, 0),
+            )
+            p_qg = tl.make_block_ptr(
+                qg + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT, i_k * BK),
+                (BT, BK),
+                (1, 0),
+            )
+            b_q = tl.load(p_q, boundary_check=(0, 1))
+            b_qg = b_q * exp(b_gk)
+            tl.store(p_qg, b_qg.to(p_qg.dtype.element_ty), boundary_check=(0, 1))
+        if STORE_KG:
+            last_idx = min(i_t * BT + BT, T) - 1
+
+            o_k = i_k * BK + tl.arange(0, BK)
+            m_k = o_k < K
+            b_gn = tl.load(
+                gk + ((bos + last_idx) * H + i_h) * K + o_k, mask=m_k, other=0.0
+            )
+            b_kg = b_k * exp(b_gn - b_gk)
+
+            p_kg = tl.make_block_ptr(
+                kg + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT, i_k * BK),
+                (BT, BK),
+                (1, 0),
+            )
+            tl.store(p_kg, b_kg.to(p_kg.dtype.element_ty), boundary_check=(0, 1))
+
+        b_w = tl.dot(b_A, b_kb.to(b_k.dtype))
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+def recompute_w_u_fwd(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    A: torch.Tensor,
+    q: torch.Tensor | None = None,
+    gk: torch.Tensor | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    BT = A.shape[-1]
+    BK = 64
+    BV = 64
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    w = torch.empty_like(k)
+    u = torch.empty_like(v)
+    kg = torch.empty_like(k) if gk is not None else None
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        q=q,
+        k=k,
+        qg=None,
+        kg=kg,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        gk=gk,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+        DOT_PRECISION="ieee",
+    )
+    return w, u, None, kg
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages)
+        for BK in [32, 64]
+        for BV in [64, 128]
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BT"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_gla_fwd_kernel_o(
+    q,
+    v,
+    g,
+    h,
+    o,
+    A,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_tg = i_t
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = i_b * NT + i_t
+        bos, eos = i_b * T, i_b * T + T
+
+    m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :]
+
+    b_o = tl.zeros([BT, BV], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(
+            q + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_g = tl.make_block_ptr(
+            g + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_h = tl.make_block_ptr(
+            h + (i_tg * H + i_h) * K * V,
+            (K, V),
+            (V, 1),
+            (i_k * BK, i_v * BV),
+            (BK, BV),
+            (1, 0),
+        )
+
+        # [BT, BK]
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        b_q = (b_q * scale).to(b_q.dtype)
+        # [BT, BK]
+        b_g = tl.load(p_g, boundary_check=(0, 1))
+        # [BT, BK]
+        b_qg = (b_q * exp(b_g)).to(b_q.dtype)
+        # [BK, BV]
+        b_h = tl.load(p_h, boundary_check=(0, 1))
+        # works but dkw, owing to divine benevolence
+        # [BT, BV]
+        if i_k >= 0:
+            b_o += tl.dot(b_qg, b_h.to(b_qg.dtype))
+    p_v = tl.make_block_ptr(
+        v + (bos * H + i_h) * V,
+        (T, V),
+        (H * V, 1),
+        (i_t * BT, i_v * BV),
+        (BT, BV),
+        (1, 0),
+    )
+    p_o = tl.make_block_ptr(
+        o + (bos * H + i_h) * V,
+        (T, V),
+        (H * V, 1),
+        (i_t * BT, i_v * BV),
+        (BT, BV),
+        (1, 0),
+    )
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    # [BT, BV]
+    b_v = tl.load(p_v, boundary_check=(0, 1))
+    # [BT, BT]
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+    b_A = tl.where(m_s, b_A, 0.0).to(b_v.dtype)
+    b_o += tl.dot(b_A, b_v, allow_tf32=False)
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_gla_fwd_o_gk(
+    q: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    A: torch.Tensor,
+    h: torch.Tensor,
+    o: torch.Tensor,
+    scale: float,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+):
+    B, T, H, K, V = *q.shape, v.shape[-1]
+    BT = chunk_size
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
+    NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    def grid(meta):
+        return (cdiv(V, meta["BV"]), NT, B * H)
+
+    chunk_gla_fwd_kernel_o[grid](
+        q=q,
+        v=v,
+        g=g,
+        h=h,
+        o=o,
+        A=A,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        scale=scale,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+    )
+    return o
+
+
+def chunk_kda_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    cu_seqlens: torch.LongTensor | None = None,
+):
+    chunk_size = 64
+    g = chunk_local_cumsum(g, chunk_size=chunk_size, cu_seqlens=cu_seqlens)
+    # the intra Aqk is kept in fp32
+    # the computation has very marginal effect on the entire throughput
+    A, Aqk = chunk_kda_scaled_dot_kkt_fwd(
+        q=q,
+        k=k,
+        gk=g,
+        beta=beta,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        output_dtype=torch.float32,
+    )
+    A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    w, u, _, kg = recompute_w_u_fwd(
+        k=k,
+        v=v,
+        beta=beta,
+        A=A,
+        gk=g,
+        cu_seqlens=cu_seqlens,
+    )
+    del A
+    h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+        k=kg,
+        w=w,
+        u=u,
+        gk=g,
+        initial_state=initial_state,
+        output_final_state=output_final_state,
+        cu_seqlens=cu_seqlens,
+    )
+    del w, u, kg
+    o = chunk_gla_fwd_o_gk(
+        q=q,
+        v=v_new,
+        g=g,
+        A=Aqk,
+        h=h,
+        o=v,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_size=chunk_size,
+    )
+    del Aqk, v_new, h
+    return o, final_state
+
+
+def chunk_kda(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: torch.LongTensor | None = None,
+    **kwargs,
+):
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+
+    if use_qk_l2norm_in_kernel:
+        q = l2norm_fwd(q.contiguous())
+        k = l2norm_fwd(k.contiguous())
+
+    o, final_state = chunk_kda_fwd(
+        q=q,
+        k=k,
+        v=v.contiguous(),
+        g=g.contiguous(),
+        beta=beta.contiguous(),
+        scale=scale,
+        initial_state=initial_state.contiguous(),
+        output_final_state=output_final_state,
+        cu_seqlens=cu_seqlens,
+    )
+    return o, final_state
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BT": bt}, num_warps=nw, num_stages=ns)
+        for bt in BT_LIST_AUTOTUNE
+        for nw in NUM_WARPS_AUTOTUNE
+        for ns in [2, 3]
+    ],
+    key=["H", "D"],
+)
+@triton.jit
+def kda_gate_fwd_kernel(
+    g,
+    A,
+    y,
+    g_bias,
+    beta: tl.constexpr,
+    threshold: tl.constexpr,
+    T,
+    H,
+    D: tl.constexpr,
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    i_t, i_h = tl.program_id(0), tl.program_id(1)
+    n_t = i_t * BT
+
+    b_a = tl.load(A + i_h).to(tl.float32)
+    b_a = -tl.exp(b_a)
+
+    stride_row = H * D
+    stride_col = 1
+
+    g_ptr = tl.make_block_ptr(
+        base=g + i_h * D,
+        shape=(T, D),
+        strides=(stride_row, stride_col),
+        offsets=(n_t, 0),
+        block_shape=(BT, BD),
+        order=(1, 0),
+    )
+
+    y_ptr = tl.make_block_ptr(
+        base=y + i_h * D,
+        shape=(T, D),
+        strides=(stride_row, stride_col),
+        offsets=(n_t, 0),
+        block_shape=(BT, BD),
+        order=(1, 0),
+    )
+
+    b_g = tl.load(g_ptr, boundary_check=(0, 1)).to(tl.float32)
+
+    if HAS_BIAS:
+        n_d = tl.arange(0, BD)
+        bias_mask = n_d < D
+        b_bias = tl.load(g_bias + i_h * D + n_d, mask=bias_mask, other=0.0).to(
+            tl.float32
+        )
+        b_g = b_g + b_bias[None, :]
+
+    # softplus(x, beta) = (1/beta) * log(1 + exp(beta * x))
+    # When beta * x > threshold, use linear approximation x
+    # Use threshold to switch to linear when beta*x > threshold
+    g_scaled = b_g * beta
+    use_linear = g_scaled > threshold
+    sp = tl.where(use_linear, b_g, (1.0 / beta) * log(1.0 + tl.exp(g_scaled)))
+    b_y = b_a * sp
+
+    tl.store(y_ptr, b_y.to(y.dtype.element_ty), boundary_check=(0, 1))
+
+
+def fused_kda_gate(
+    g: torch.Tensor,
+    A: torch.Tensor,
+    head_k_dim: int,
+    g_bias: torch.Tensor | None = None,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+) -> torch.Tensor:
+    """
+    Forward pass for KDA gate:
+      input g: [..., H*D]
+      param A: [H] or [1, 1, H, 1]
+      beta: softplus beta parameter
+      threshold: softplus threshold parameter
+      return  : [..., H, D]
+    """
+    orig_shape = g.shape[:-1]
+
+    g = g.view(-1, g.shape[-1])
+    T = g.shape[0]
+    HD = g.shape[1]
+    H = A.numel()
+    assert H * head_k_dim == HD
+
+    y = torch.empty_like(g, dtype=torch.float32)
+
+    def grid(meta):
+        return (cdiv(T, meta["BT"]), H)
+
+    kda_gate_fwd_kernel[grid](
+        g,
+        A,
+        y,
+        g_bias,
+        beta,
+        threshold,
+        T,
+        H,
+        head_k_dim,
+        BD=next_power_of_2(head_k_dim),
+        HAS_BIAS=g_bias is not None,
+    )
+
+    y = y.view(*orig_shape, H, head_k_dim)
+    return y
diff --git a/vllm/model_executor/layers/fla/ops/l2norm.py b/vllm/model_executor/layers/fla/ops/l2norm.py
index ef9788ceaf20e2968169a104c043f3242db29ad9..4d7dbb510068167c2191c485bd234a48571c34a7 100644
--- a/vllm/model_executor/layers/fla/ops/l2norm.py
+++ b/vllm/model_executor/layers/fla/ops/l2norm.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 import os
-from typing import Optional
 
 import torch
 
@@ -19,11 +18,12 @@ BT_LIST = [8, 16, 32, 64, 128]
 USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0"))
 
 
-@triton.autotune(configs=[
-    triton.Config({}, num_warps=num_warps)
-    for num_warps in [1, 2, 4, 8, 16, 32]
-],
-                 key=['D'])
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8, 16, 32]
+    ],
+    key=["D"],
+)
 @triton.jit
 def l2norm_fwd_kernel1(
     x,
@@ -47,11 +47,14 @@ def l2norm_fwd_kernel1(
     tl.store(y + cols, b_y, mask=mask)
 
 
-@triton.autotune(configs=[
-    triton.Config({'BT': BT}, num_warps=num_warps)
-    for num_warps in [1, 2, 4, 8, 16] for BT in BT_LIST
-],
-                 key=['D'])
+@triton.autotune(
+    configs=[
+        triton.Config({"BT": BT}, num_warps=num_warps)
+        for num_warps in [1, 2, 4, 8, 16]
+        for BT in BT_LIST
+    ],
+    key=["D"],
+)
 @triton.jit(do_not_specialize=["NB"])
 def l2norm_fwd_kernel(
     x,
@@ -85,9 +88,9 @@ def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr):
     tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask)
 
 
-def l2norm_fwd(x: torch.Tensor,
-               eps: float = 1e-6,
-               output_dtype: Optional[torch.dtype] = None):
+def l2norm_fwd(
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: torch.dtype | None = None
+):
     x_shape_og = x.shape
     x = x.view(-1, x.shape[-1])
     # allocate output
@@ -107,7 +110,7 @@ def l2norm_fwd(x: torch.Tensor,
     if not USE_DEFAULT_FLA_NORM:
         MBLOCK = 32
         # M, N = x.shape
-        l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK), )](
+        l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK),)](
             x,
             y,
             eps,
@@ -120,7 +123,7 @@ def l2norm_fwd(x: torch.Tensor,
             NB = triton.cdiv(T, 2048)
 
             def grid(meta):
-                return (triton.cdiv(T, meta['BT']), )
+                return (triton.cdiv(T, meta["BT"]),)
 
             l2norm_fwd_kernel[grid](
                 x,
@@ -132,7 +135,7 @@ def l2norm_fwd(x: torch.Tensor,
                 BD=BD,
             )
         else:
-            l2norm_fwd_kernel1[(T, )](
+            l2norm_fwd_kernel1[(T,)](
                 x,
                 y,
                 eps=eps,
diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
index a733c6c81e369a2b562edfa405a053c320391497..89352d12beefb7e3a16058a718802507433c1b1e 100644
--- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py
+++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
@@ -13,7 +13,7 @@
 # This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
 # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
 
-from typing import Optional
+from functools import lru_cache
 
 import torch
 import torch.nn as nn
@@ -21,18 +21,21 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from vllm.triton_utils import tl, triton
+from vllm.utils.math_utils import cdiv, next_power_of_2
 
 from .utils import input_guard
 
 
-def rms_norm_ref(x,
-                 weight,
-                 bias,
-                 z=None,
-                 eps=1e-6,
-                 group_size=None,
-                 norm_before_gate=True,
-                 upcast=True):
+def rms_norm_ref(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    upcast=True,
+):
     dtype = x.dtype
     weight = weight.float()
     bias = bias.float() if bias is not None else None
@@ -43,12 +46,10 @@ def rms_norm_ref(x,
         x = x * F.silu(z)
     if group_size is None:
         rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
-        out = (x * rstd * weight) + bias if bias is not None else (x * rstd *
-                                                                   weight)
+        out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight)
     else:
         x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
-        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) +
-                              eps)
+        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps)
         out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight
         if bias is not None:
             out = out + bias
@@ -57,10 +58,12 @@ def rms_norm_ref(x,
     return out.to(dtype)
 
 
-@triton.heuristics({
-    "HAS_BIAS": lambda args: args["B"] is not None,
-    "HAS_Z": lambda args: args["Z"] is not None,
-})
+@triton.heuristics(
+    {
+        "HAS_BIAS": lambda args: args["B"] is not None,
+        "HAS_Z": lambda args: args["Z"] is not None,
+    }
+)
 @triton.jit
 def layer_norm_fwd_kernel(
     X,  # pointer to the input
@@ -74,55 +77,103 @@ def layer_norm_fwd_kernel(
     stride_y_row,
     stride_z_row,
     M,  # number of rows in X
-    N,  # number of columns in X
+    N: tl.constexpr,  # number of columns in X
     eps,  # epsilon to avoid division by zero
     BLOCK_N: tl.constexpr,
+    ROWS_PER_BLOCK: tl.constexpr,
     HAS_BIAS: tl.constexpr,
     HAS_Z: tl.constexpr,
     NORM_BEFORE_GATE: tl.constexpr,
     IS_RMS_NORM: tl.constexpr,
 ):
-    # Map the program id to the row of X and Y it should compute.
-    row = tl.program_id(0)
+    # Map the program id to the starting row of X and Y it should compute.
+    row_start = tl.program_id(0) * ROWS_PER_BLOCK
     group = tl.program_id(1)
-    X += row * stride_x_row + group * N
-    Y += row * stride_y_row + group * N
-    if HAS_Z:
-        Z += row * stride_z_row + group * N
-    if not IS_RMS_NORM:
-        Mean += group * M
-    Rstd += group * M
-    W += group * N
-    if HAS_BIAS:
-        B += group * N
-    # Compute mean and variance
+
+    # Create 2D tile: [ROWS_PER_BLOCK, BLOCK_N]
+    rows = row_start + tl.arange(0, ROWS_PER_BLOCK)
     cols = tl.arange(0, BLOCK_N)
-    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
+
+    # Compute offsets for 2D tile
+    row_offsets = rows[:, None] * stride_x_row
+    col_offsets = cols[None, :] + group * N
+
+    # Base pointers
+    X_base = X + row_offsets + col_offsets
+    Y_base = Y + rows[:, None] * stride_y_row + col_offsets
+
+    # Create mask for valid rows and columns
+    row_mask = rows[:, None] < M
+    col_mask = cols[None, :] < N
+    mask = row_mask & col_mask
+
+    # Load input data with 2D tile
+    x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32)
+
     if HAS_Z and not NORM_BEFORE_GATE:
-        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        Z_base = Z + rows[:, None] * stride_z_row + col_offsets
+        z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
         x *= z * tl.sigmoid(z)
+
+    # Compute mean and variance per row (reduce along axis 1)
     if not IS_RMS_NORM:
-        mean = tl.sum(x, axis=0) / N
-        tl.store(Mean + row, mean)
-        xbar = tl.where(cols < N, x - mean, 0.)
-        var = tl.sum(xbar * xbar, axis=0) / N
+        mean = tl.sum(x, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+        # Store mean for each row
+        mean_offsets = group * M + rows
+        mean_mask = rows < M
+        tl.store(Mean + mean_offsets, mean, mask=mean_mask)
+        # Broadcast mean back to 2D for subtraction
+        xbar = tl.where(mask, x - mean[:, None], 0.0)
+        var = tl.sum(xbar * xbar, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
     else:
-        xbar = tl.where(cols < N, x, 0.)
-        var = tl.sum(xbar * xbar, axis=0) / N
-    rstd = 1 / tl.sqrt(var + eps)
-    tl.store(Rstd + row, rstd)
-    # Normalize and apply linear transformation
-    mask = cols < N
-    w = tl.load(W + cols, mask=mask).to(tl.float32)
+        xbar = tl.where(mask, x, 0.0)
+        var = tl.sum(xbar * xbar, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+        mean = 0.0  # Placeholder for RMS norm
+
+    rstd = tl.rsqrt(var + eps)  # Shape: [ROWS_PER_BLOCK]
+
+    # Store rstd for each row
+    rstd_offsets = group * M + rows
+    rstd_mask = rows < M
+    tl.store(Rstd + rstd_offsets, rstd, mask=rstd_mask)
+
+    # Load weights and biases (broadcast across rows)
+    w_offsets = cols + group * N
+    w_mask = cols < N
+    w = tl.load(W + w_offsets, mask=w_mask, other=0.0).to(tl.float32)
+
     if HAS_BIAS:
-        b = tl.load(B + cols, mask=mask).to(tl.float32)
-    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
-    y = x_hat * w + b if HAS_BIAS else x_hat * w
+        b = tl.load(B + w_offsets, mask=w_mask, other=0.0).to(tl.float32)
+
+    # Normalize and apply linear transformation
+    if not IS_RMS_NORM:
+        x_hat = (x - mean[:, None]) * rstd[:, None]
+    else:
+        x_hat = x * rstd[:, None]
+
+    y = x_hat * w[None, :] + b[None, :] if HAS_BIAS else x_hat * w[None, :]
+
     if HAS_Z and NORM_BEFORE_GATE:
-        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        Z_base = Z + rows[:, None] * stride_z_row + col_offsets
+        z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
         y *= z * tl.sigmoid(z)
+
     # Write output
-    tl.store(Y + cols, y, mask=mask)
+    tl.store(Y_base, y, mask=mask)
+
+
+@lru_cache
+def _get_sm_count(device: torch.device) -> int:
+    """Get and cache the SM count for a given device."""
+    props = torch.cuda.get_device_properties(device)
+    return props.multi_processor_count
+
+
+def calc_rows_per_block(M: int, device: torch.device) -> int:
+    sm_count = _get_sm_count(device)
+    rows_per_block = next_power_of_2(cdiv(M, 2 * sm_count))
+    rows_per_block = min(rows_per_block, 4)
+    return rows_per_block
 
 
 def layer_norm_fwd(
@@ -145,64 +196,72 @@ def layer_norm_fwd(
     if z is not None:
         assert z.stride(-1) == 1
         assert z.shape == (M, N)
-    assert weight.shape == (N, )
+    assert weight.shape == (N,)
     assert weight.stride(-1) == 1
     if bias is not None:
         assert bias.stride(-1) == 1
-        assert bias.shape == (N, )
+        assert bias.shape == (N,)
     # allocate output
     if out is not None:
         assert out.shape == x.shape
     else:
         out = torch.empty_like(x)
     assert out.stride(-1) == 1
-    mean = torch.empty((ngroups * M, ), dtype=torch.float32,
-                       device=x.device) if not is_rms_norm else None
-    rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+    mean = (
+        torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
     # Less than 64KB per feature: enqueue fused kernel
     MAX_FUSED_SIZE = 65536 // x.element_size()
     BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
     if group_size > BLOCK_N:
-        raise RuntimeError(
-            "This layer norm doesn't support feature dim >= 64KB.")
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     # heuristics for number of warps
     num_warps = min(max(BLOCK_N // 256, 1), 8)
-    grid = (M, ngroups)
-    layer_norm_fwd_kernel[grid](x,
-                                out,
-                                weight,
-                                bias,
-                                z,
-                                mean,
-                                rstd,
-                                x.stride(0),
-                                out.stride(0),
-                                z.stride(0) if z is not None else 0,
-                                M,
-                                group_size,
-                                eps,
-                                BLOCK_N=BLOCK_N,
-                                NORM_BEFORE_GATE=norm_before_gate,
-                                IS_RMS_NORM=is_rms_norm,
-                                num_warps=num_warps)
+    # Calculate rows per block based on SM count
+    rows_per_block = calc_rows_per_block(M, x.device)
+    # Update grid to use rows_per_block
+    grid = (cdiv(M, rows_per_block), ngroups)
+    layer_norm_fwd_kernel[grid](
+        x,
+        out,
+        weight,
+        bias,
+        z,
+        mean,
+        rstd,
+        x.stride(0),
+        out.stride(0),
+        z.stride(0) if z is not None else 0,
+        M,
+        group_size,
+        eps,
+        BLOCK_N=BLOCK_N,
+        ROWS_PER_BLOCK=rows_per_block,
+        NORM_BEFORE_GATE=norm_before_gate,
+        IS_RMS_NORM=is_rms_norm,
+        num_warps=num_warps,
+    )
     return out, mean, rstd
 
 
 class LayerNormFn(torch.autograd.Function):
-
     @input_guard
     @staticmethod
-    def forward(ctx,
-                x,
-                weight,
-                bias,
-                z=None,
-                eps=1e-6,
-                group_size=None,
-                norm_before_gate=True,
-                is_rms_norm=False):
-        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
-        """
+    def forward(
+        ctx,
+        x,
+        weight,
+        bias,
+        z=None,
+        eps=1e-6,
+        group_size=None,
+        norm_before_gate=True,
+        is_rms_norm=False,
+    ):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
 
         x_shape_og = x.shape
         # reshape input data into 2D tensor
@@ -236,39 +295,38 @@ class LayerNormFn(torch.autograd.Function):
         return y.reshape(x_shape_og)
 
 
-def layernorm_fn(x,
-                 weight,
-                 bias,
-                 z=None,
-                 eps=1e-6,
-                 group_size=None,
-                 norm_before_gate=True,
-                 is_rms_norm=False):
-    return LayerNormFn.apply(x, weight, bias, z, eps, group_size,
-                             norm_before_gate, is_rms_norm)
+def layernorm_fn(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm
+    )
 
 
-def rmsnorm_fn(x,
-               weight,
-               bias,
-               z=None,
-               eps=1e-6,
-               group_size=None,
-               norm_before_gate=True):
-    return LayerNormFn.apply(x, weight, bias, z, eps, group_size,
-                             norm_before_gate, True)
+def rmsnorm_fn(
+    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, True
+    )
 
 
 class LayerNormGated(nn.Module):
-
     def __init__(
         self,
         hidden_size,
         eps: float = 1e-5,
-        group_size: Optional[int] = None,
+        group_size: int | None = None,
         norm_before_gate: bool = True,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
     ):
         """If group_size is not None, we do GroupNorm with each group having group_size elements.
         group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
@@ -288,27 +346,27 @@ class LayerNormGated(nn.Module):
         torch.nn.init.zeros_(self.bias)
 
     def forward(self, x, z=None):
-        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
-        """
-        return layernorm_fn(x,
-                            self.weight,
-                            self.bias,
-                            z=z,
-                            group_size=self.group_size,
-                            eps=self.eps,
-                            norm_before_gate=self.norm_before_gate)
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return layernorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            group_size=self.group_size,
+            eps=self.eps,
+            norm_before_gate=self.norm_before_gate,
+        )
 
 
 class RMSNormGated(nn.Module):
-
     def __init__(
         self,
         hidden_size,
         eps: float = 1e-5,
-        group_size: Optional[int] = None,
+        group_size: int | None = None,
         norm_before_gate: bool = False,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
     ):
         """If group_size is not None, we do GroupNorm with each group having group_size elements.
         group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
@@ -326,12 +384,13 @@ class RMSNormGated(nn.Module):
         torch.nn.init.ones_(self.weight)
 
     def forward(self, x, z=None):
-        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
-        """
-        return rmsnorm_fn(x,
-                          self.weight,
-                          self.bias,
-                          z=z,
-                          eps=self.eps,
-                          group_size=self.group_size,
-                          norm_before_gate=self.norm_before_gate)
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return rmsnorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            eps=self.eps,
+            group_size=self.group_size,
+            norm_before_gate=self.norm_before_gate,
+        )
diff --git a/vllm/model_executor/layers/fla/ops/op.py b/vllm/model_executor/layers/fla/ops/op.py
index 8c29434ca106ad7d8e66399eef533cb64c9bb50d..a91975c8e567ae3e6413fab9cdddf5aa6bec8494 100644
--- a/vllm/model_executor/layers/fla/ops/op.py
+++ b/vllm/model_executor/layers/fla/ops/op.py
@@ -11,29 +11,50 @@ import os
 
 from vllm.triton_utils import tl, tldevice, triton
 
-if os.environ.get('FLA_USE_FAST_OPS', '0') == '1':
-    div = tldevice.fast_dividef
+from .utils import is_gather_supported
+
+if os.environ.get("FLA_USE_FAST_OPS", "0") == "1":
     exp = tldevice.fast_expf
     log = tldevice.fast_logf
     log2 = tldevice.fast_log2f
 else:
-
-    @triton.jit
-    def div_normal(x, y):
-        return x / y
-
-    div = div_normal
     exp = tl.exp
     log = tl.log
     log2 = tl.log2
 
 
-if not hasattr(tl, 'gather'):
+if not is_gather_supported:
 
     @triton.jit
     def gather(src, index, axis, _builder=None):
-        # This is a fallback implementation when tl.gather is not supported
-        # In order to pass triton compiler, there is no actual gather operation
-        return src
+        """
+        Gather operation that works when tl.gather is not supported.
+        This is a fallback implementation that returns None.
+        Just to make triton compiler happy.
+        """
+        return None
 else:
     gather = tl.gather
+
+if hasattr(triton.language, "_experimental_make_tensor_descriptor"):
+    # For Triton 3.3.x
+    make_tensor_descriptor = triton.language._experimental_make_tensor_descriptor
+elif hasattr(triton.language, "make_tensor_descriptor"):
+    # For Triton 3.4.x and later
+    make_tensor_descriptor = triton.language.make_tensor_descriptor
+else:
+    """
+    Fallback implementation when TMA is not supported.
+    Returns None to indicate TMA descriptors are unavailable.
+    Just make triton compiler happy.
+    """
+
+    @triton.jit
+    def make_tensor_descriptor(
+        base,
+        shape,
+        strides,
+        block_shape,
+        _builder=None,
+    ):
+        return None
diff --git a/vllm/model_executor/layers/fla/ops/solve_tril.py b/vllm/model_executor/layers/fla/ops/solve_tril.py
index 97cb0d800d4110dd44bc343ea5b32caff638317c..da85aab19207db65b8eee6908f2552806f331088 100644
--- a/vllm/model_executor/layers/fla/ops/solve_tril.py
+++ b/vllm/model_executor/layers/fla/ops/solve_tril.py
@@ -7,359 +7,550 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
+
+import os
 
 import torch
 
 from vllm.triton_utils import tl, triton
 
 from .index import prepare_chunk_indices
-from .utils import input_guard
+from .op import make_tensor_descriptor
+from .utils import input_guard, is_amd, is_tma_supported
+
+FLA_TRIL_PRECISION = os.environ.get("FLA_TRIL_PRECISION", "ieee")
+ALLOWED_TRIL_PRECISIONS = ["ieee", "tf32"] if is_amd else ["ieee", "tf32", "tf32x3"]
+assert FLA_TRIL_PRECISION in ALLOWED_TRIL_PRECISIONS, (
+    f"FLA_TRIL_PRECISION must be one of {ALLOWED_TRIL_PRECISIONS}, but got {FLA_TRIL_PRECISION}"
+)
 
 
-@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4, 5]
+        for num_warps in [1, 2, 4, 8]
+        for num_stages in [2, 3, 4, 5]
     ],
-    key=['BT'],
+    key=["BT"],
 )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=["T"])
 def solve_tril_16x16_kernel(
     A,
-    Ad,
+    Ai,
     cu_seqlens,
     chunk_indices,
     T,
     H: tl.constexpr,
     BT: tl.constexpr,
+    USE_TMA: tl.constexpr,
     IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,
 ):
     i_t, i_bh = tl.program_id(0), tl.program_id(1)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
+    o_i = tl.arange(0, 16)
+    m_A = o_i[:, None] > o_i[None, :]
+    m_I = o_i[:, None] == o_i[None, :]
 
     A = A + (bos * H + i_h) * BT
-    Ad = Ad + (bos * H + i_h) * 16
+    Ai = Ai + (bos * H + i_h) * 16
 
     offset = (i_t * 16) % BT
-    p_A = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * 16, offset),
-                            (16, 16), (1, 0))
-    p_Ai = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16),
-                             (1, 0))
-    b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
-    b_A = -tl.where(
-        tl.arange(0, 16)[:, None] > tl.arange(0, 16)[None, :], b_A, 0)
+    if not USE_TMA:
+        p_A = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * 16, offset), (16, 16), (1, 0)
+        )
+        # [16, 16]
+        b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16])
+        desc_o = make_tensor_descriptor(Ai, [T, 16], [H * 16, 1], [16, 16])
+        b_A = desc.load([i_t * 16, offset]).to(tl.float32)
+    b_A = -tl.where(m_A, b_A, 0)
 
-    o_i = tl.arange(0, 16)
-    for i in range(1, min(16, T - i_t * 16)):
+    for i in range(2, min(16, T - i_t * 16)):
+        # [16]
         b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset)
         b_a = b_a + tl.sum(b_a[:, None] * b_A, 0)
-        mask = o_i == i
-        b_A = tl.where(mask[:, None], b_a, b_A)
-    b_A += o_i[:, None] == o_i[None, :]
-    tl.store(p_Ai,
-             b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
+        b_A = tl.where((o_i == i)[:, None], b_a, b_A)
+    b_A += m_I
+    if not USE_TMA:
+        p_Ai = tl.make_block_ptr(
+            Ai, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), (1, 0)
+        )
+        tl.store(
+            p_Ai,
+            b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    else:
+        desc_o.store([i_t * 16, 0], b_A.to(desc_o.dtype, fp_downcast_rounding="rtne"))
 
 
-@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4, 5]
+        for num_warps in [1, 2, 4, 8]
+        for num_stages in [2, 3, 4, 5]
     ],
-    key=['H', 'BT', 'IS_VARLEN'],
+    key=["H", "BT", "IS_VARLEN"],
 )
-@triton.jit(do_not_specialize=['T'])
-def merge_16x16_to_32x32_inverse_kernel(A, Ad, Ai, cu_seqlens, chunk_indices,
-                                        T, H: tl.constexpr, BT: tl.constexpr,
-                                        IS_VARLEN: tl.constexpr):
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_32x32_inverse_kernel(
+    A,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    USE_TMA: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,
+):
     i_t, i_bh = tl.program_id(0), tl.program_id(1)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
 
-    A += (bos * H + i_h) * 32
-    Ad += (bos * H + i_h) * 16
-    Ai += (bos * H + i_h) * 32
-
-    p_A_21 = tl.make_block_ptr(A, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0),
-                               (16, 16), (1, 0))
-    p_Ad_11 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 32, 0),
-                                (16, 16), (1, 0))
-    p_Ad_22 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 32 + 16, 0),
-                                (16, 16), (1, 0))
-    p_Ai_11 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32, 0),
-                                (16, 16), (1, 0))
-    p_Ai_22 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 16),
-                                (16, 16), (1, 0))
-    p_Ai_21 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0),
-                                (16, 16), (1, 0))
-
-    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
-    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
-    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
-    Ai_21 = -tl.dot(tl.dot(Ai_22, A_21, input_precision='ieee'),
-                    Ai_11,
-                    input_precision='ieee')
-    tl.store(p_Ai_11,
-             Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_22,
-             Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_21,
-             Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-
-
-@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+    o_i = tl.arange(0, 16)
+    m_A = o_i[:, None] > o_i[None, :]
+    m_I = o_i[:, None] == o_i[None, :]
+    A += (bos * H + i_h) * BT
+    Ai += (bos * H + i_h) * BT
+
+    if not USE_TMA:
+        p_A_11 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0)
+        )
+        p_A_22 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0)
+        )
+        b_Ai_11 = tl.load(p_A_11, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_22 = tl.load(p_A_22, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16])
+        desc_o = make_tensor_descriptor(Ai, [T, BT], [H * BT, 1], [16, 16])
+        b_Ai_11 = desc.load([i_t * BT + 0, 0]).to(tl.float32)
+        b_Ai_22 = desc.load([i_t * BT + 16, 16]).to(tl.float32)
+
+    # [16, 16]
+    b_Ai_11 = -tl.where(m_A, b_Ai_11, 0)
+    b_Ai_22 = -tl.where(m_A, b_Ai_22, 0)
+
+    for i in range(2, min(16, T - i_t * BT)):
+        b_a_11 = -tl.load(A + (i_t * BT + i) * H * BT + o_i)
+        b_a_11 += tl.sum(b_a_11[:, None] * b_Ai_11, 0)
+        b_Ai_11 = tl.where((o_i == i)[:, None], b_a_11, b_Ai_11)
+    for i in range(16 + 2, min(32, T - i_t * BT)):
+        b_a_22 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 16)
+        b_a_22 += tl.sum(b_a_22[:, None] * b_Ai_22, 0)
+        b_Ai_22 = tl.where((o_i == i - 16)[:, None], b_a_22, b_Ai_22)
+
+    b_Ai_11 += m_I
+    b_Ai_22 += m_I
+
+    if not USE_TMA:
+        p_A_21 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0)
+        )
+        b_A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        b_A_21 = desc.load([i_t * BT + 16, 0]).to(tl.float32)
+
+    b_Ai_21 = -tl.dot(
+        tl.dot(b_Ai_22, b_A_21, input_precision=DOT_PRECISION),
+        b_Ai_11,
+        input_precision=DOT_PRECISION,
+    )
+
+    if not USE_TMA:
+        p_Ai_11 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0)
+        )
+        p_Ai_21 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0)
+        )
+        p_Ai_22 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0)
+        )
+        tl.store(
+            p_Ai_11,
+            b_Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_22,
+            b_Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_21,
+            b_Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    else:
+        desc_o.store(
+            [i_t * BT + 0, 0], b_Ai_11.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 16, 0], b_Ai_21.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 16, 16], b_Ai_22.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [2, 4, 8] for num_stages in [2, 3, 4, 5]
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4, 5]
     ],
-    key=['H', 'BT', 'IS_VARLEN'],
+    key=["H", "BT", "IS_VARLEN"],
 )
-@triton.jit(do_not_specialize=['T'])
-def merge_16x16_to_64x64_inverse_kernel(A, Ad, Ai, cu_seqlens, chunk_indices,
-                                        T, H: tl.constexpr, BT: tl.constexpr,
-                                        IS_VARLEN: tl.constexpr):
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_64x64_inverse_kernel(
+    A,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    USE_TMA: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,
+):
     i_t, i_bh = tl.program_id(0), tl.program_id(1)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
 
-    A += (bos * H + i_h) * 64
-    Ad += (bos * H + i_h) * 16
-    Ai += (bos * H + i_h) * 64
-
-    p_A_21 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0),
-                               (16, 16), (1, 0))
-    p_A_32 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16),
-                               (16, 16), (1, 0))
-    p_A_31 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0),
-                               (16, 16), (1, 0))
-    p_A_43 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32),
-                               (16, 16), (1, 0))
-    p_A_42 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16),
-                               (16, 16), (1, 0))
-    p_A_41 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0),
-                               (16, 16), (1, 0))
-    p_Ad_11 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64, 0),
-                                (16, 16), (1, 0))
-    p_Ad_22 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 16, 0),
-                                (16, 16), (1, 0))
-    p_Ad_33 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 32, 0),
-                                (16, 16), (1, 0))
-    p_Ad_44 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 48, 0),
-                                (16, 16), (1, 0))
-
-    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
-    A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32)
-    A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32)
-    A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32)
-    A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32)
-    A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32)
-
-    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
-    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
-    Ai_33 = tl.load(p_Ad_33, boundary_check=(0, 1)).to(tl.float32)
-    Ai_44 = tl.load(p_Ad_44, boundary_check=(0, 1)).to(tl.float32)
-
-    Ai_21 = -tl.dot(tl.dot(Ai_22, A_21, input_precision='ieee'),
-                    Ai_11,
-                    input_precision='ieee')
-    Ai_32 = -tl.dot(tl.dot(Ai_33, A_32, input_precision='ieee'),
-                    Ai_22,
-                    input_precision='ieee')
-    Ai_43 = -tl.dot(tl.dot(Ai_44, A_43, input_precision='ieee'),
-                    Ai_33,
-                    input_precision='ieee')
-
-    Ai_31 = -tl.dot(Ai_33,
-                    tl.dot(A_31, Ai_11, input_precision='ieee') +
-                    tl.dot(A_32, Ai_21, input_precision='ieee'),
-                    input_precision='ieee')
-    Ai_42 = -tl.dot(Ai_44,
-                    tl.dot(A_42, Ai_22, input_precision='ieee') +
-                    tl.dot(A_43, Ai_32, input_precision='ieee'),
-                    input_precision='ieee')
-    Ai_41 = -tl.dot(Ai_44,
-                    tl.dot(A_41, Ai_11, input_precision='ieee') +
-                    tl.dot(A_42, Ai_21, input_precision='ieee') +
-                    tl.dot(A_43, Ai_31, input_precision='ieee'),
-                    input_precision='ieee')
-
-    p_Ai_11 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 0),
-                                (16, 16), (1, 0))
-    p_Ai_22 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 16),
-                                (16, 16), (1, 0))
-    p_Ai_33 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 32),
-                                (16, 16), (1, 0))
-    p_Ai_44 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 48),
-                                (16, 16), (1, 0))
-    p_Ai_21 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0),
-                                (16, 16), (1, 0))
-    p_Ai_31 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0),
-                                (16, 16), (1, 0))
-    p_Ai_32 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16),
-                                (16, 16), (1, 0))
-    p_Ai_41 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0),
-                                (16, 16), (1, 0))
-    p_Ai_42 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16),
-                                (16, 16), (1, 0))
-    p_Ai_43 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32),
-                                (16, 16), (1, 0))
-    tl.store(p_Ai_11,
-             Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_22,
-             Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_33,
-             Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_44,
-             Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_21,
-             Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_31,
-             Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_32,
-             Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_41,
-             Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_42,
-             Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_43,
-             Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-
-    fill_zeros = tl.zeros((16, 16), dtype=tl.float32)
-    p_Ai_12 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 16),
-                                (16, 16), (1, 0))
-    p_Ai_13 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 32),
-                                (16, 16), (1, 0))
-    p_Ai_14 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 48),
-                                (16, 16), (1, 0))
-    p_Ai_23 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 32),
-                                (16, 16), (1, 0))
-    p_Ai_24 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 48),
-                                (16, 16), (1, 0))
-    p_Ai_34 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 48),
-                                (16, 16), (1, 0))
-    tl.store(p_Ai_12,
-             fill_zeros.to(p_Ai_12.dtype.element_ty,
-                           fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_13,
-             fill_zeros.to(p_Ai_13.dtype.element_ty,
-                           fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_14,
-             fill_zeros.to(p_Ai_14.dtype.element_ty,
-                           fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_23,
-             fill_zeros.to(p_Ai_23.dtype.element_ty,
-                           fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_24,
-             fill_zeros.to(p_Ai_24.dtype.element_ty,
-                           fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
-    tl.store(p_Ai_34,
-             fill_zeros.to(p_Ai_34.dtype.element_ty,
-                           fp_downcast_rounding="rtne"),
-             boundary_check=(0, 1))
+    o_i = tl.arange(0, 16)
+    m_A = o_i[:, None] > o_i[None, :]
+    m_I = o_i[:, None] == o_i[None, :]
+    A += (bos * H + i_h) * BT
+    Ai += (bos * H + i_h) * BT
+
+    if not USE_TMA:
+        p_A_11 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0)
+        )
+        p_A_22 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0)
+        )
+        p_A_33 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 32, 32), (16, 16), (1, 0)
+        )
+        p_A_44 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 48, 48), (16, 16), (1, 0)
+        )
+        b_Ai_11 = tl.load(p_A_11, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_22 = tl.load(p_A_22, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_33 = tl.load(p_A_33, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_44 = tl.load(p_A_44, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16])
+        desc_o = make_tensor_descriptor(Ai, [T, BT], [H * BT, 1], [16, 16])
+        b_Ai_11 = desc.load([i_t * BT + 0, 0]).to(tl.float32)
+        b_Ai_22 = desc.load([i_t * BT + 16, 16]).to(tl.float32)
+        b_Ai_33 = desc.load([i_t * BT + 32, 32]).to(tl.float32)
+        b_Ai_44 = desc.load([i_t * BT + 48, 48]).to(tl.float32)
+
+    # [16, 16]
+    b_Ai_11 = -tl.where(m_A, b_Ai_11, 0)
+    b_Ai_22 = -tl.where(m_A, b_Ai_22, 0)
+    b_Ai_33 = -tl.where(m_A, b_Ai_33, 0)
+    b_Ai_44 = -tl.where(m_A, b_Ai_44, 0)
+
+    for i in range(2, min(16, T - i_t * BT)):
+        b_a_11 = -tl.load(A + (i_t * BT + i) * H * BT + o_i)
+        b_a_11 += tl.sum(b_a_11[:, None] * b_Ai_11, 0)
+        b_Ai_11 = tl.where((o_i == i)[:, None], b_a_11, b_Ai_11)
+    for i in range(16 + 2, min(32, T - i_t * BT)):
+        b_a_22 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 16)
+        b_a_22 += tl.sum(b_a_22[:, None] * b_Ai_22, 0)
+        b_Ai_22 = tl.where((o_i == i - 16)[:, None], b_a_22, b_Ai_22)
+    for i in range(32 + 2, min(48, T - i_t * BT)):
+        b_a_33 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 32)
+        b_a_33 += tl.sum(b_a_33[:, None] * b_Ai_33, 0)
+        b_Ai_33 = tl.where((o_i == i - 32)[:, None], b_a_33, b_Ai_33)
+    for i in range(48 + 2, min(64, T - i_t * BT)):
+        b_a_44 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 48)
+        b_a_44 += tl.sum(b_a_44[:, None] * b_Ai_44, 0)
+        b_Ai_44 = tl.where((o_i == i - 48)[:, None], b_a_44, b_Ai_44)
+    b_Ai_11 += m_I
+    b_Ai_22 += m_I
+    b_Ai_33 += m_I
+    b_Ai_44 += m_I
+
+    if not USE_TMA:
+        p_A_21 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0)
+        )
+        p_A_31 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 32, 0), (16, 16), (1, 0)
+        )
+        p_A_32 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 32, 16), (16, 16), (1, 0)
+        )
+        p_A_41 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 48, 0), (16, 16), (1, 0)
+        )
+        p_A_42 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 48, 16), (16, 16), (1, 0)
+        )
+        p_A_43 = tl.make_block_ptr(
+            A, (T, BT), (H * BT, 1), (i_t * BT + 48, 32), (16, 16), (1, 0)
+        )
+        b_A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+        b_A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32)
+        b_A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32)
+        b_A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32)
+        b_A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32)
+        b_A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        b_A_21 = desc.load([i_t * BT + 16, 0]).to(tl.float32)
+        b_A_31 = desc.load([i_t * BT + 32, 0]).to(tl.float32)
+        b_A_32 = desc.load([i_t * BT + 32, 16]).to(tl.float32)
+        b_A_41 = desc.load([i_t * BT + 48, 0]).to(tl.float32)
+        b_A_42 = desc.load([i_t * BT + 48, 16]).to(tl.float32)
+        b_A_43 = desc.load([i_t * BT + 48, 32]).to(tl.float32)
+
+    b_Ai_21 = -tl.dot(
+        tl.dot(b_Ai_22, b_A_21, input_precision=DOT_PRECISION),
+        b_Ai_11,
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_32 = -tl.dot(
+        tl.dot(b_Ai_33, b_A_32, input_precision=DOT_PRECISION),
+        b_Ai_22,
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_43 = -tl.dot(
+        tl.dot(b_Ai_44, b_A_43, input_precision=DOT_PRECISION),
+        b_Ai_33,
+        input_precision=DOT_PRECISION,
+    )
+
+    b_Ai_31 = -tl.dot(
+        b_Ai_33,
+        tl.dot(b_A_31, b_Ai_11, input_precision=DOT_PRECISION)
+        + tl.dot(b_A_32, b_Ai_21, input_precision=DOT_PRECISION),
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_42 = -tl.dot(
+        b_Ai_44,
+        tl.dot(b_A_42, b_Ai_22, input_precision=DOT_PRECISION)
+        + tl.dot(b_A_43, b_Ai_32, input_precision=DOT_PRECISION),
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_41 = -tl.dot(
+        b_Ai_44,
+        tl.dot(b_A_41, b_Ai_11, input_precision=DOT_PRECISION)
+        + tl.dot(b_A_42, b_Ai_21, input_precision=DOT_PRECISION)
+        + tl.dot(b_A_43, b_Ai_31, input_precision=DOT_PRECISION),
+        input_precision=DOT_PRECISION,
+    )
+
+    if not USE_TMA:
+        p_Ai_11 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0)
+        )
+        p_Ai_22 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0)
+        )
+        p_Ai_33 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 32), (16, 16), (1, 0)
+        )
+        p_Ai_44 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 48), (16, 16), (1, 0)
+        )
+        p_Ai_21 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0)
+        )
+        p_Ai_31 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 0), (16, 16), (1, 0)
+        )
+        p_Ai_32 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 16), (16, 16), (1, 0)
+        )
+        p_Ai_41 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 0), (16, 16), (1, 0)
+        )
+        p_Ai_42 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 16), (16, 16), (1, 0)
+        )
+        p_Ai_43 = tl.make_block_ptr(
+            Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 32), (16, 16), (1, 0)
+        )
+        tl.store(
+            p_Ai_11,
+            b_Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_22,
+            b_Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_33,
+            b_Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_44,
+            b_Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_21,
+            b_Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_31,
+            b_Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_32,
+            b_Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_41,
+            b_Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_42,
+            b_Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_43,
+            b_Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    else:
+        desc_o.store(
+            [i_t * BT + 0, 0], b_Ai_11.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 16, 16], b_Ai_22.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 32, 32], b_Ai_33.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 48, 48], b_Ai_44.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 16, 0], b_Ai_21.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 32, 0], b_Ai_31.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 32, 16], b_Ai_32.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 48, 0], b_Ai_41.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 48, 16], b_Ai_42.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
+        desc_o.store(
+            [i_t * BT + 48, 32], b_Ai_43.to(desc_o.dtype, fp_downcast_rounding="rtne")
+        )
 
 
 @input_guard
-def solve_tril(A: torch.Tensor,
-               cu_seqlens: Optional[torch.Tensor] = None,
-               output_dtype: torch.dtype = torch.float) -> torch.Tensor:
+def solve_tril(
+    A: torch.Tensor,
+    cu_seqlens: torch.Tensor | None = None,
+    output_dtype: torch.dtype = torch.float,
+) -> torch.Tensor:
     """
-    Compute the inverse of the lower triangular matrix
+    Compute the inverse of the matrix I + A
     A should be strictly lower triangular, i.e., A.triu() == 0.
 
     Args:
         A (torch.Tensor):
-            [B, T, H, K]
+            [B, T, H, BT], where BT should only be 16, 32, or 64.
         cu_seqlens (torch.Tensor):
-            The cumulative sequence lengths of the input tensor.
-            Default: None.
+            The cumulative sequence lengths of the input tensor. Default: `None`.
         output_dtype (torch.dtype):
-            The dtype of the output tensor. Default: `torch.float`
+            The dtype of the output tensor. Default: `torch.float`.
+            If `None`, the output dtype will be the same as the input dtype.
 
     Returns:
         (I + A)^-1 with the same shape as A
     """
     assert A.shape[-1] in [16, 32, 64]
+    output_dtype = A.dtype if output_dtype is None else output_dtype
 
     B, T, H, BT = A.shape
-    Ad = torch.empty(B,
-                     T,
-                     H,
-                     16,
-                     device=A.device,
-                     dtype=torch.float if BT != 16 else output_dtype)
-
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, 16) if cu_seqlens is not None else None
-    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, 16)
-    solve_tril_16x16_kernel[NT, B * H](
-        A=A,
-        Ad=Ad,
-        cu_seqlens=cu_seqlens,
-        chunk_indices=chunk_indices,
-        T=T,
-        H=H,
-        BT=BT,
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     )
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)
+
+    Ai = torch.zeros_like(A, dtype=output_dtype)
     if BT == 16:
-        return Ad
+        merge_fn = solve_tril_16x16_kernel
+    elif BT == 32:
+        merge_fn = merge_16x16_to_32x32_inverse_kernel
+    elif BT == 64:
+        merge_fn = merge_16x16_to_64x64_inverse_kernel
 
-    Ai = torch.empty(B, T, H, BT, device=A.device, dtype=output_dtype)
-    merge_fn = merge_16x16_to_32x32_inverse_kernel if BT == 32 else merge_16x16_to_64x64_inverse_kernel
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
-    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)
     merge_fn[NT, B * H](
         A=A,
-        Ad=Ad,
         Ai=Ai,
         cu_seqlens=cu_seqlens,
         chunk_indices=chunk_indices,
         T=T,
         H=H,
         BT=BT,
+        USE_TMA=is_tma_supported,
+        DOT_PRECISION=FLA_TRIL_PRECISION,
     )
     return Ai
diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py
index 7fd90cee45d0e71b3c5a67456da6823efcbdb728..5a48e56a5fbbfa1fb21ebf18e581af0e5e696feb 100644
--- a/vllm/model_executor/layers/fla/ops/utils.py
+++ b/vllm/model_executor/layers/fla/ops/utils.py
@@ -11,11 +11,13 @@ import contextlib
 import functools
 import logging
 import os
+from collections.abc import Callable
 from enum import Enum
-from typing import Any, Callable, Literal, Optional
+from typing import Any, Literal
 
 import torch
 
+from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 
 logger = logging.getLogger(__name__)
@@ -27,8 +29,7 @@ FLA_GDN_FIX_BT = os.getenv("FLA_GDN_FIX_BT", "0") == "1"
 SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0"))
 
 
-def tensor_cache(
-        fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
     """
     A decorator that caches the most recent results of a function with tensor inputs.
 
@@ -44,20 +45,27 @@ def tensor_cache(
             A wrapped version of the input function with single-entry caching.
     """
 
-    cache_entries: tuple[Optional[tuple], Optional[dict], Any] = []
-    cache_size = 4
+    cache_entries: tuple[tuple | None, dict | None, Any] = []
+    cache_size = 8
 
     @functools.wraps(fn)
     def wrapper(*args: Any, **kwargs: Any) -> Any:
         nonlocal cache_entries, cache_size
         for i, entry in enumerate(cache_entries):
             last_args, last_kwargs, last_result = entry
-            if len(args) == len(last_args) and len(kwargs) == len(last_kwargs) \
-                and all(a is b for a, b in zip(args, last_args)) \
-                and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()):
-                cache_entries = cache_entries[:i] + cache_entries[i + 1:] + [
-                    (args, kwargs, last_result)
-                ]
+            if (
+                len(args) == len(last_args)
+                and len(kwargs) == len(last_kwargs)
+                and all(a is b for a, b in zip(args, last_args))
+                and all(
+                    k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()
+                )
+            ):
+                cache_entries = (
+                    cache_entries[:i]
+                    + cache_entries[i + 1 :]
+                    + [(args, kwargs, last_result)]
+                )
                 return last_result
 
         result = fn(*args, **kwargs)
@@ -70,16 +78,16 @@ def tensor_cache(
     return wrapper
 
 
-def input_guard(
-        fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+def input_guard(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
     """
     A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
     """
 
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
-        contiguous_args = (i if not isinstance(i, torch.Tensor) else
-                           i.contiguous() for i in args)
+        contiguous_args = (
+            i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args
+        )
         contiguous_kwargs = {
             k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
             for k, v in kwargs.items()
@@ -112,11 +120,11 @@ def get_available_device() -> str:
     try:
         return triton.runtime.driver.active.get_current_target().backend
     except BaseException:
-        return 'cpu'
+        return "cpu"
 
 
 @functools.cache
-def _check_platform() -> Literal['nvidia', 'amd', 'intel', 'musa']:
+def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]:
     device = get_available_device()
     mapping = {
         "cuda": "nvidia",
@@ -130,27 +138,33 @@ def _check_platform() -> Literal['nvidia', 'amd', 'intel', 'musa']:
 # For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
 # However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
 # Therefore, we need to check the triton backend to determine the actual GPU vendor.
-device = get_available_device() if get_available_device() != 'hip' else 'cuda'
-device_torch_lib = getattr(torch, device)
+device = "cuda" if current_platform.is_cuda_alike() else get_available_device()
+device_torch_lib = getattr(torch, device, None)
 device_platform = _check_platform()
 
-is_amd = (device_platform == 'amd')
-is_intel = (device_platform == 'intel')
-is_nvidia = (device_platform == 'nvidia')
-is_intel_alchemist = (is_intel
-                      and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0))
-is_nvidia_hopper = (is_nvidia
-                    and ('NVIDIA H' in torch.cuda.get_device_name(0)
-                         or torch.cuda.get_device_capability()[0] >= 9))
-use_cuda_graph = (is_nvidia
-                  and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1')
+is_amd = device_platform == "amd"
+is_intel = device_platform == "intel"
+is_nvidia = device_platform == "nvidia"
+is_intel_alchemist = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0)
+is_nvidia_hopper = is_nvidia and (
+    "NVIDIA H" in torch.cuda.get_device_name(0)
+    or torch.cuda.get_device_capability()[0] >= 9
+)
+use_cuda_graph = is_nvidia and os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1"
+is_gather_supported = hasattr(triton.language, "gather")
+is_tma_supported = (is_nvidia and torch.cuda.get_device_capability(0)[0] >= 9) and (
+    hasattr(triton.language, "_experimental_make_tensor_descriptor")
+    or hasattr(triton.language, "make_tensor_descriptor")
+)
 
 
 def get_all_max_shared_mem():
     try:
         return [
-            triton.runtime.driver.active.utils.get_device_properties(i)
-            ['max_shared_mem'] for i in range(device_torch_lib.device_count())
+            triton.runtime.driver.active.utils.get_device_properties(i)[
+                "max_shared_mem"
+            ]
+            for i in range(device_torch_lib.device_count())
         ]
     except BaseException:
         return [-1]
diff --git a/vllm/model_executor/layers/fla/ops/wy_fast.py b/vllm/model_executor/layers/fla/ops/wy_fast.py
index 70374eb6506425199e44a9ab0d96f20f2eb295e1..a66ec1d60d6687c52c91b24195c0c3183e63dd9c 100644
--- a/vllm/model_executor/layers/fla/ops/wy_fast.py
+++ b/vllm/model_executor/layers/fla/ops/wy_fast.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -17,56 +16,100 @@ from vllm.triton_utils import tl, triton
 from .index import prepare_chunk_indices
 
 
-@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [2, 4, 8] for num_stages in [2, 3, 4]
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
     ],
-    key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'IS_VARLEN'],
+    key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
 )
-@triton.jit(do_not_specialize=['T'])
-def recompute_w_u_fwd_kernel(k, v, beta, w, u, A, g, cu_seqlens, chunk_indices,
-                             T, H: tl.constexpr, Hg: tl.constexpr,
-                             K: tl.constexpr, V: tl.constexpr,
-                             BT: tl.constexpr, BK: tl.constexpr,
-                             BV: tl.constexpr, IS_VARLEN: tl.constexpr):
+@triton.jit(do_not_specialize=["T"])
+def recompute_w_u_fwd_kernel(
+    k,
+    v,
+    beta,
+    w,
+    u,
+    A,
+    g,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
     i_t, i_bh = tl.program_id(0), tl.program_id(1)
     i_b, i_h = i_bh // H, i_bh % H
     if IS_VARLEN:
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
-    p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ),
-                               (i_t * BT, ), (BT, ), (0, ))
-    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T, ), (H, ), (i_t * BT, ),
-                            (BT, ), (0, ))
-    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1),
-                            (i_t * BT, 0), (BT, BT), (1, 0))
-    b_beta = tl.load(p_beta, boundary_check=(0, ))
+    p_beta = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+    )
+    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,))
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    b_beta = tl.load(p_beta, boundary_check=(0,))
     b_A = tl.load(p_A, boundary_check=(0, 1))
-    b_g = tl.exp(tl.load(p_g, boundary_check=(0, )))
+    b_g = tl.exp(tl.load(p_g, boundary_check=(0,)))
 
     for i_v in range(tl.cdiv(V, BV)):
-        p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1),
-                                (i_t * BT, i_v * BV), (BT, BV), (1, 0))
-        p_u = tl.make_block_ptr(u + (bos * H + i_h) * V, (T, V), (H * V, 1),
-                                (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
         b_v = tl.load(p_v, boundary_check=(0, 1))
         b_vb = (b_v * b_beta[:, None]).to(b_v.dtype)
         b_u = tl.dot(b_A, b_vb, allow_tf32=False)
         tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
 
     for i_k in range(tl.cdiv(K, BK)):
-        p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K),
-                                (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK),
-                                (1, 0))
-        p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H * K, 1),
-                                (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
         b_k = tl.load(p_k, boundary_check=(0, 1))
         b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype)
         b_w = tl.dot(b_A, b_kb)
@@ -79,14 +122,15 @@ def recompute_w_u_fwd(
     beta: torch.Tensor,
     g_cumsum: torch.Tensor,
     A: torch.Tensor,
-    cu_seqlens: Optional[torch.LongTensor],
+    cu_seqlens: torch.LongTensor | None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     B, T, Hg, K, V = *k.shape, v.shape[-1]
     H = v.shape[-2]
     BT = A.shape[-1]
 
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
     BK = 64
     BV = 64
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 75f56cd01a4ea8822e42bcedef1dcf42f4a970bb..53d98d0650b437d258eb654312186f343b29855b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -2,18 +2,26 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
-from typing import Any, Optional
+from typing import Any
 
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
+from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
+    FusedMoEMethodBase,
+)
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+    FusedMoE,
+    FusedMoeWeightScaleSupported,
+)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEActivationFormat, FusedMoEPermuteExpertsUnpermute,
-    FusedMoEPrepareAndFinalize)
+    FusedMoEActivationFormat,
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize,
+)
+from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.triton_utils import HAS_TRITON
 
-_config: Optional[dict[str, Any]] = None
+_config: dict[str, Any] | None = None
 
 
 @contextmanager
@@ -25,7 +33,7 @@ def override_config(config):
     _config = old_config
 
 
-def get_config() -> Optional[dict[str, Any]]:
+def get_config() -> dict[str, Any] | None:
     return _config
 
 
@@ -37,6 +45,7 @@ __all__ = [
     "FusedMoEPermuteExpertsUnpermute",
     "FusedMoEActivationFormat",
     "FusedMoEPrepareAndFinalize",
+    "SharedFusedMoE",
     "activation_without_mul",
     "override_config",
     "get_config",
@@ -44,23 +53,32 @@ __all__ = [
 
 if HAS_TRITON:
     # import to register the custom ops
-    import vllm.model_executor.layers.fused_moe.fused_marlin_moe  # noqa
     from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-        BatchedDeepGemmExperts)
+        BatchedDeepGemmExperts,
+    )
     from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-        BatchedTritonOrDeepGemmExperts)
+        BatchedTritonOrDeepGemmExperts,
+    )
     from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-        CutlassBatchedExpertsFp8, CutlassExpertsFp8, cutlass_moe_fp4,
-        cutlass_moe_fp8)
-    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-        DeepGemmExperts)
+        CutlassBatchedExpertsFp8,
+        CutlassExpertsFp8,
+        cutlass_moe_fp4,
+        cutlass_moe_fp8,
+    )
+    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
     from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-        BatchedTritonExperts)
+        BatchedTritonExperts,
+    )
     from vllm.model_executor.layers.fused_moe.fused_moe import (
-        TritonExperts, fused_experts, fused_topk, get_config_file_name,
-        grouped_topk)
+        TritonExperts,
+        fused_experts,
+        fused_topk,
+        get_config_file_name,
+        grouped_topk,
+    )
     from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-        TritonOrDeepGemmExperts)
+        TritonOrDeepGemmExperts,
+    )
 
     __all__ += [
         "fused_topk",
@@ -82,8 +100,7 @@ else:
     # Some model classes directly use the custom ops. Add placeholders
     # to avoid import errors.
     def _raise_exception(method: str):
-        raise NotImplementedError(
-            f"{method} is not implemented as lack of triton.")
+        raise NotImplementedError(f"{method} is not implemented as lack of triton.")
 
     fused_topk = lambda *args, **kwargs: _raise_exception("fused_topk")
     fused_experts = lambda *args, **kwargs: _raise_exception("fused_experts")
diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dd625054339cb47a1b2224370e12e036f5e3a45
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+
+from vllm.distributed import (
+    get_ep_group,
+)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEPrepareAndFinalize,
+)
+from vllm.platforms import current_platform
+from vllm.utils.import_utils import has_deep_ep, has_pplx
+
+if current_platform.is_cuda_alike():
+    if has_pplx():
+        from .pplx_prepare_finalize import (
+            PplxPrepareAndFinalize,
+            pplx_hidden_dim_scale_bytes,
+        )
+    if has_deep_ep():
+        from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
+        from .deepep_ll_prepare_finalize import (
+            DEEPEP_QUANT_BLOCK_SHAPE,
+            DeepEPLLPrepareAndFinalize,
+        )
+
+
+def maybe_roundup_layer_hidden_size(
+    hidden_size: int,
+    act_dtype: torch.dtype,
+    moe_parallel_config: FusedMoEParallelConfig,
+) -> int:
+    """
+    Given layer hidden size and MoE configurations, round up hidden_size
+    if necessary.
+
+    Args:
+        hidden_size: Layer hidden-size
+        act_dtype: Data type of the layer activations.
+        moe_parallel_config: Fused MoE parallelization strategy configuration.
+
+    Return:
+        Rounded up hidden_size if rounding up is required based on the configs
+        and all2all backend.
+        Original hidden size otherwise.
+    """
+    if moe_parallel_config.use_deepep_ht_kernels:
+        hidden_size = DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size(
+            hidden_size, act_dtype
+        )
+
+    if moe_parallel_config.use_deepep_ll_kernels:
+        hidden_size = DeepEPLLPrepareAndFinalize.maybe_roundup_layer_hidden_size(
+            hidden_size
+        )
+
+    return hidden_size
+
+
+def maybe_make_prepare_finalize(
+    moe: FusedMoEConfig,
+    quant_config: FusedMoEQuantConfig | None,
+) -> FusedMoEPrepareAndFinalize | None:
+    if not moe.moe_parallel_config.use_all2all_kernels:
+        return None
+
+    all2all_manager = get_ep_group().device_communicator.all2all_manager
+    assert all2all_manager is not None
+
+    prepare_finalize: FusedMoEPrepareAndFinalize | None = None
+
+    # TODO: could allow this now
+    assert not moe.use_flashinfer_cutlass_kernels, "Must be created in modelopt.py"
+
+    if moe.use_pplx_kernels:
+        assert quant_config is not None
+
+        hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
+            moe.max_num_tokens,
+            moe.hidden_dim,
+            moe.in_dtype,
+            quant_config.quant_dtype,
+            per_act_token_quant=quant_config.per_act_token_quant,
+            block_shape=quant_config.block_shape,
+        )
+
+        all_to_all_args = dict(
+            max_num_tokens=moe.max_num_tokens,
+            num_experts=moe.num_experts,
+            experts_per_token=moe.experts_per_token,  # topk
+            rank=all2all_manager.rank,
+            world_size=all2all_manager.world_size,
+            # dp_size actually means tp_size, bug in pplx kernels
+            dp_size=all2all_manager.tp_group.world_size,
+            hidden_dim=moe.hidden_dim,
+            hidden_dim_bytes=hidden_dim_bytes,
+            hidden_dim_scale_bytes=hidden_scale_bytes,
+        )
+
+        num_dispatchers = (
+            all2all_manager.world_size // all2all_manager.tp_group.world_size
+        )
+
+        # Intranode pplx a2a takes a group name while internode does not.
+        if not all2all_manager.internode:
+            all_to_all_args["group_name"] = all2all_manager.cpu_group.group_name
+
+        handle = all2all_manager.get_handle(all_to_all_args)
+
+        prepare_finalize = PplxPrepareAndFinalize(
+            handle,
+            max_num_tokens=moe.max_num_tokens,
+            num_local_experts=moe.num_local_experts,
+            num_dispatchers=num_dispatchers,
+        )
+    elif moe.use_deepep_ht_kernels:
+        assert moe.dp_size == all2all_manager.dp_world_size
+
+        all_to_all_args = dict()
+        handle = all2all_manager.get_handle(all_to_all_args)
+        prepare_finalize = DeepEPHTPrepareAndFinalize(
+            handle,
+            num_dispatchers=all2all_manager.world_size,
+            dp_size=all2all_manager.dp_world_size,
+            rank_expert_offset=all2all_manager.rank * moe.num_local_experts,
+        )
+
+    elif moe.use_deepep_ll_kernels:
+        assert quant_config is not None
+        all_to_all_args = dict(
+            max_num_tokens_per_dp_rank=moe.max_num_tokens,
+            token_hidden_size=moe.hidden_dim,
+            num_ep_ranks=all2all_manager.world_size,
+            num_global_experts=moe.num_experts,
+            num_local_experts=moe.num_experts // all2all_manager.world_size,
+        )
+        handle = all2all_manager.get_handle(all_to_all_args)
+
+        # Note: We may want to use FP8 dispatch just to reduce
+        # data movement.
+        use_fp8_dispatch = (
+            quant_config.quant_dtype == current_platform.fp8_dtype()
+            and quant_config.block_shape == DEEPEP_QUANT_BLOCK_SHAPE
+        )
+
+        prepare_finalize = DeepEPLLPrepareAndFinalize(
+            handle,
+            max_tokens_per_rank=moe.max_num_tokens,
+            num_dispatchers=all2all_manager.world_size,
+            use_fp8_dispatch=use_fp8_dispatch,
+        )
+
+    return prepare_finalize
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index cf0b965cc8c517b1835a06a09d1e9b1104aa3e48..53362277dae8afdcfeb4d71ecb77d6651a371c50 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -1,26 +1,47 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from math import log2
-from typing import Optional
+
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.deep_gemm_utils import (
-    deep_gemm_block_shape)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate)
+    TopKWeightAndReduceDelegate,
+)
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked,
-                                  is_deep_gemm_e8m0_used)
+from vllm.utils.deep_gemm import (
+    DeepGemmQuantScaleFMT,
+    fp8_m_grouped_gemm_nt_masked,
+    get_mk_alignment_for_contiguous_layout,
+    is_deep_gemm_e8m0_used,
+)
+from vllm.utils.math_utils import cdiv, round_up
 
 logger = init_logger(__name__)
 
 
+def scales_shape_stride_dtype(
+    E: int, T: int, G: int, quant_scale_fmt: DeepGemmQuantScaleFMT
+) -> tuple[tuple[int, ...], tuple[int, ...], torch.dtype]:
+    shape = (E, T, G)
+    strides = (T * G, 1, T)
+    if quant_scale_fmt in [
+        DeepGemmQuantScaleFMT.FLOAT32,
+        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+    ]:
+        return shape, strides, torch.float32
+
+    assert quant_scale_fmt == DeepGemmQuantScaleFMT.UE8M0
+    shape = (E, T, cdiv(G, 4))
+    strides = (T * cdiv(G, 4), 1, T)
+    return shape, strides, torch.int32
+
+
 @triton.jit
 def _silu_mul_fp8_quant_deep_gemm(
     # Pointers ------------------------------------------------------------
@@ -49,7 +70,7 @@ def _silu_mul_fp8_quant_deep_gemm(
     eps: tl.constexpr,
     fp8_min: tl.constexpr,
     fp8_max: tl.constexpr,
-    use_ue8m0: tl.constexpr,
+    ceil_ue8m0: tl.constexpr,
     # Meta ---------------------------------------------------------------
     BLOCK: tl.constexpr,
     NUM_STAGES: tl.constexpr,
@@ -73,23 +94,20 @@ def _silu_mul_fp8_quant_deep_gemm(
     base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
     base_gate_offset = base_input_offset + cols * stride_i_h
     base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
-    base_yq_offset = (e * stride_yq_e + g * GROUP_SIZE * stride_yq_h +
-                      cols * stride_yq_h)
+    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
     base_ys_offset = e * stride_ys_e + g * stride_ys_g
 
     for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
-        gate = tl.load(input_ptr + base_gate_offset + t * stride_i_t,
-                       mask=mask,
-                       other=0.0).to(tl.float32)
-        up = tl.load(input_ptr + base_up_offset + t * stride_i_t,
-                     mask=mask,
-                     other=0.0)
+        gate = tl.load(
+            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+        ).to(tl.float32)
+        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
 
         gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
         y = gate * up
 
         y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
-        if use_ue8m0:
+        if ceil_ue8m0:
             y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
 
         y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
@@ -98,18 +116,57 @@ def _silu_mul_fp8_quant_deep_gemm(
         tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
 
 
-def silu_mul_fp8_quant_deep_gemm_cuda(
+def persistent_masked_m_silu_mul_quant(
     y: torch.Tensor,  # (E, T, 2*H)
     tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
     num_parallel_tokens=16,
     group_size: int = 128,
+    quant_scale_fmt: DeepGemmQuantScaleFMT = DeepGemmQuantScaleFMT.FLOAT32,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
     y has shape (E, T, 2*H). The first half of the last dimension is
     silu-activated, multiplied by the second half, then quantized into FP8.
+    We launch a fixed grid of threads to accommodate CUDA graphs. Let `P2`
+    be a parallelization factor for persistent_masked_m_silu_mul_quant over the
+    hidden dimension.
+
+    Let `expert_offsets = [0] + [num_tokens.cumsum()]` and
+    `total_tokens = expert_offsets[-1]`.
+    persistent_masked_m_silu_mul_quant launches `total_tokens x P2` number of
+    thread blocks. Each thread block contains `NUM_WARPS` warps.
+
+    Every thread block needs to find it's corresponding expert by warp-parallel scanning
+    over the `expert_offsets` array.
+
+    The i-th warp in the first thread block processes
+    `[i * warp_chunk_size, (i + 1) * warp_chunk_size]` groups
+    sequentially, where `warp_chunk_size = ((H / GROUP_SIZE) / P2) / NUM_WARPS`,
+    pipelining loads and computes.
+
+    The shared memory layout for 4 warps with a 2-stage pipeline for SiLU V2
+    can is visualized like so:
+
+                         stage0                              stage1
+    ┌─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┐
+    │gate0│up0│gate1│up1│gate2│up2│gate3│up3│gate0│up0│gate1│up1│gate2│up2│gate3│up3│
+    └─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┘
+
+    with the main difference between V1 and V2 being the global load
+    stride between warps, and between half-warps. Regarding the latter stride,
+    we assign the first half warp of every warp for `gate` loads and the second
+    half-warp to `up` loads.
+
     Returns `(y_q, y_s)` where
     * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
-    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+    * `y_s` depends on quant_scale_fmt,
+      - quant_scale_fmt == FLOAT32,
+         `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+      - quant_scale_fmt == E8M0,
+         `y_s`: Int32 tensor, shape (E, T, H // group_size // 4), strides (T*G, 1, T)
+      - quant_scale_fmt == E8M0_FLOAT32_SPARSE
+         `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+    Let NUM_WARPS be the number of warps in a single thread block and
+    `GROUP_SIZE = 128` be the size of the quantization group.
     """
     assert y.ndim == 3, "y must be (E, T, 2*H)"
     E, T, H2 = y.shape
@@ -120,70 +177,50 @@ def silu_mul_fp8_quant_deep_gemm_cuda(
     assert group_size == 128, "H must be divisible by 8"
     assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E
 
-    tokens_per_expert = tokens_per_expert.to(device=y.device,
-                                             dtype=torch.int32)
+    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
 
     fp8_dtype = torch.float8_e4m3fn
     y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
 
-    stride_ys_e = T * G
-    stride_ys_t = 1
-    stride_ys_g = T
-    y_s = torch.empty_strided((E, T, G),
-                              (stride_ys_e, stride_ys_t, stride_ys_g),
-                              dtype=torch.float32,
-                              device=y.device)
+    ys_shape, ys_strides, ys_dtype = scales_shape_stride_dtype(E, T, G, quant_scale_fmt)
+    y_s = torch.empty_strided(
+        ys_shape,
+        ys_strides,
+        dtype=ys_dtype,
+        device=y.device,
+    )
 
-    use_ue8m0 = is_deep_gemm_e8m0_used()
+    ceil_ue8m0 = quant_scale_fmt in [
+        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+        DeepGemmQuantScaleFMT.UE8M0,
+    ]
 
-    if E <= 16:
-        max_empirical_parallelism = 64
-    elif E <= 32:
-        max_empirical_parallelism = 16
-    else:
-        max_empirical_parallelism = 4
-
-    # We never want to launch more than Tx number of threads
-    # This computes the clip.
-    num_parallel_tokens = max(
-        1,
-        min(max_empirical_parallelism, 2**int(log2(min(num_parallel_tokens,
-                                                       T)))))
     cuda_arch = current_platform.get_device_capability(
-        device_id=y.device.index).to_int()
+        device_id=y.device.index
+    ).to_int()
 
     if cuda_arch >= 80:
-        torch.ops._C.silu_mul_fp8_quant_deep_gemm_cuda(y, tokens_per_expert,
-                                                       y_q, y_s, group_size,
-                                                       use_ue8m0,
-                                                       num_parallel_tokens)
+        torch.ops._C.persistent_masked_m_silu_mul_quant(
+            y, tokens_per_expert, y_q, y_s, ceil_ue8m0
+        )
     else:
-        # Default to triton if not on cuda or if arch is too old
-        y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
-
         stride_cnt_e = tokens_per_expert.stride()[0]
 
         # Static grid over experts and H-groups.
         # A loop inside the kernel handles the token dim
-        grid = (E * G, )
+        grid = (E * G,)
         # strides (elements)
         stride_i_e, stride_i_t, stride_i_h = y.stride()
         stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
 
-        # desired scale strides (elements): (T*G, 1, T)
-        stride_ys_e = T * G
-        stride_ys_t = 1
-        stride_ys_g = T
-        y_s = torch.empty_strided(
-            (E, T, G),
-            (stride_ys_e, stride_ys_t, stride_ys_g),
-            dtype=torch.float32,
-            device=y.device,
-        )
         f_info = torch.finfo(fp8_dtype)
         fp8_max = f_info.max
         fp8_min = f_info.min
         eps: float = 1e-10
+        assert y_s.dtype == torch.float32, (
+            "_silu_mul_fp8_quant_deep_gemm does"
+            "not support {y_s.dtype} scales. Only torch.float32 supported."
+        )
         _silu_mul_fp8_quant_deep_gemm[grid](
             y,
             y_q,
@@ -197,14 +234,14 @@ def silu_mul_fp8_quant_deep_gemm_cuda(
             stride_yq_e,
             stride_yq_t,
             stride_yq_h,
-            stride_ys_e,
-            stride_ys_t,
-            stride_ys_g,
+            ys_strides[0],
+            ys_strides[1],
+            ys_strides[2],
             stride_cnt_e,
             eps,
             fp8_min,
             fp8_max,
-            is_deep_gemm_e8m0_used(),
+            ceil_ue8m0,
             BLOCK=group_size,
             NUM_STAGES=4,
             num_warps=1,
@@ -214,7 +251,6 @@ def silu_mul_fp8_quant_deep_gemm_cuda(
 
 
 class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         max_num_tokens: int,
@@ -227,16 +263,19 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         quant_config: Quantization configuration
         """
         super().__init__(quant_config)
-        assert self.block_shape == deep_gemm_block_shape()
+        assert self.block_shape == get_mk_alignment_for_contiguous_layout()
+        assert self.quant_config.use_fp8_w8a8
         self.max_num_tokens = max_num_tokens
         self.num_dispatchers = num_dispatchers
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.BatchedExperts,
-                mk.FusedMoEActivationFormat.BatchedExperts)
+        return (
+            mk.FusedMoEActivationFormat.BatchedExperts,
+            mk.FusedMoEActivationFormat.BatchedExperts,
+        )
 
     def supports_chunking(self) -> bool:
         return False
@@ -244,35 +283,63 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def supports_expert_map(self) -> bool:
         return False
 
+    def supports_packed_ue8m0_act_scales(self) -> bool:
+        """
+        DeepGemm supports packed ue8m0 activation scales format in devices == sm100
+        """
+        return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100)
+
     def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
         # Let PrepareAndFinalize::finalize() decide the impl.
         return TopKWeightAndReduceDelegate()
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_metadata: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
-        assert a.dim() == 2
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # FIXME (varun): We should be able to dispatch only from the leader
         # DP ranks in the case of TP > 1. At the moment, all the Ranks
         # end up sending their tokens. This needs to be fixed.
         num_dispatchers = self.num_dispatchers
         num_experts = local_num_experts
-        max_num_tokens = a.size(
-            0) if self.max_num_tokens is None else self.max_num_tokens
-        workspace13 = (num_experts, max_num_tokens * num_dispatchers,
-                       max(K, N))
+        max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
+        workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
         workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
         output = (num_experts, max_num_tokens * num_dispatchers, K)
-        return (workspace13, workspace2, output, a.dtype)
+        return (workspace13, workspace2, output)
+
+    def estimate_expected_m(
+        self, global_num_experts: int, max_tokens_per_expert: int, topk: int
+    ) -> int:
+        dp_meta = (
+            get_forward_context().dp_metadata
+            if is_forward_context_available()
+            else None
+        )
+        if dp_meta is None:
+            logger.warning_once(
+                "DPMetadata unavailable. Defaulting expected_m to "
+                f"{max_tokens_per_expert}.",
+                scope="local",
+            )
+            return max_tokens_per_expert
+
+        total_num_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item()
+        total_num_tokens_replicated = total_num_tokens * topk
+
+        # Assume even load balancing
+        assert global_num_experts != 0
+        estimate = round_up(int(total_num_tokens_replicated // global_num_experts), 16)
+        # clamp estimate
+        estimate = max(estimate, 16)
+        estimate = min(max_tokens_per_expert, estimate)
+        return estimate
 
     def apply(
         self,
@@ -284,12 +351,12 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert expert_tokens_meta is not None
@@ -303,20 +370,37 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
         assert w2.size(1) == K
 
-        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
-            hidden_states, w1, w2, topk_ids)
+        E, max_num_tokens, N, K, _ = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
 
         workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))
 
-        # (from deepgemm docs) : A value hint (which is a value on CPU)
-        # for the M expectation of each batch, correctly setting this value
-        # may lead to better performance.
-        expected_m = max_num_tokens
-        fp8_m_grouped_gemm_nt_masked((a1q, a1q_scale), (w1, self.w1_scale),
-                                     workspace1, expert_num_tokens, expected_m)
+        expected_m = self.estimate_expected_m(
+            global_num_experts=global_num_experts,
+            max_tokens_per_expert=max_num_tokens,
+            topk=topk_ids.size(-1),
+        )
 
-        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm_cuda(
-            workspace1, expert_num_tokens)
+        fp8_m_grouped_gemm_nt_masked(
+            (a1q, a1q_scale),
+            (w1, self.w1_scale),
+            workspace1,
+            expert_num_tokens,
+            expected_m,
+        )
 
-        fp8_m_grouped_gemm_nt_masked((a2q, a2q_scale), (w2, self.w2_scale),
-                                     output, expert_num_tokens, expected_m)
+        quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
+        a2q, a2q_scale = persistent_masked_m_silu_mul_quant(
+            workspace1,
+            expert_num_tokens,
+            quant_scale_fmt=quant_scale_fmt,
+        )
+
+        fp8_m_grouped_gemm_nt_masked(
+            (a2q, a2q_scale),
+            (w2, self.w2_scale),
+            output,
+            expert_num_tokens,
+            expected_m,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
index c3c4f4a5d190ecc60b09bf2f881eab45b865e42e..e69e9fd307aebf143fd64f3fb2f9da81146d0c52 100644
--- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
@@ -1,21 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    BatchedDeepGemmExperts)
+    BatchedDeepGemmExperts,
+)
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.deep_gemm_utils import (
-    deep_gemm_block_shape)
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
+from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout
 
 
 class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         max_num_tokens: int,
@@ -31,27 +28,37 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             quant_config=self.quant_config,
         )
 
-        self.allow_deep_gemm = (allow_deep_gemm
-                                and self.quant_config.use_fp8_w8a8 and
-                                self.block_shape == deep_gemm_block_shape())
+        self.allow_deep_gemm = (
+            allow_deep_gemm
+            and self.quant_config.use_fp8_w8a8
+            and self.block_shape == get_mk_alignment_for_contiguous_layout()
+        )
 
-        self.batched_deep_gemm_experts = BatchedDeepGemmExperts(
-            max_num_tokens=max_num_tokens,
-            num_dispatchers=num_dispatchers,
-            quant_config=self.quant_config,
-        ) if self.allow_deep_gemm else None
+        self.batched_deep_gemm_experts = (
+            BatchedDeepGemmExperts(
+                max_num_tokens=max_num_tokens,
+                num_dispatchers=num_dispatchers,
+                quant_config=self.quant_config,
+            )
+            if self.allow_deep_gemm
+            else None
+        )
 
-        assert (self.batched_deep_gemm_experts is not None
-                or self.batched_triton_experts is not None)
+        assert (
+            self.batched_deep_gemm_experts is not None
+            or self.batched_triton_experts is not None
+        )
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
         if self.batched_triton_experts is not None:
-            assert (self.batched_deep_gemm_experts is None
-                    or self.batched_deep_gemm_experts.activation_formats
-                    == self.batched_triton_experts.activation_formats)
+            assert (
+                self.batched_deep_gemm_experts is None
+                or self.batched_deep_gemm_experts.activation_formats
+                == self.batched_triton_experts.activation_formats
+            )
             return self.batched_triton_experts.activation_formats
         else:
             assert self.batched_deep_gemm_experts is not None
@@ -60,14 +67,16 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def supports_chunking(self) -> bool:
         bdge = self.batched_deep_gemm_experts
         bte = self.batched_triton_experts
-        return ((bdge is None or bdge.supports_chunking())
-                and (bte is None or bte.supports_chunking()))
+        return (bdge is None or bdge.supports_chunking()) and (
+            bte is None or bte.supports_chunking()
+        )
 
     def supports_expert_map(self) -> bool:
         bdge = self.batched_deep_gemm_experts
         bte = self.batched_triton_experts
-        return ((bdge is None or bdge.supports_expert_map())
-                and (bte is None or bte.supports_expert_map()))
+        return (bdge is None or bdge.supports_expert_map()) and (
+            bte is None or bte.supports_expert_map()
+        )
 
     def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
         bdge = self.batched_deep_gemm_experts
@@ -80,7 +89,8 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         if is_bdge_war and is_bte_war:
             assert bdge_war == bte_war, (
                 "Both implementations should agree on WeightAndReduce impls. "
-                f"Got bdge_war: {bdge_war}, and bte_war: {bte_war}")
+                f"Got bdge_war: {bdge_war}, and bte_war: {bte_war}"
+            )
 
         if bdge_war is not None:
             return bdge_war
@@ -88,31 +98,44 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         assert bte_war is not None
         return bte_war
 
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        return act_dtype
+
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_metadata: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_metadata: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm:
             assert self.batched_deep_gemm_experts is not None
             return self.batched_deep_gemm_experts.workspace_shapes(
-                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
-                expert_tokens_metadata)
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_metadata,
+            )
         else:
             assert self.batched_triton_experts is not None
             return self.batched_triton_experts.workspace_shapes(
-                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
-                expert_tokens_metadata)
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_metadata,
+            )
 
     def apply(
         self,
@@ -124,18 +147,34 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
-        experts = (self.batched_deep_gemm_experts
-                   if self.allow_deep_gemm else self.batched_triton_experts)
+        experts = (
+            self.batched_deep_gemm_experts
+            if self.allow_deep_gemm
+            else self.batched_triton_experts
+        )
         assert experts is not None
-        experts.apply(output, hidden_states, w1, w2, topk_weights, topk_ids,
-                      activation, global_num_experts, expert_map, a1q_scale,
-                      a2_scale, workspace13, workspace2, expert_tokens_meta,
-                      apply_router_weight_on_input)
+        experts.apply(
+            output,
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            activation,
+            global_num_experts,
+            expert_map,
+            a1q_scale,
+            a2_scale,
+            workspace13,
+            workspace2,
+            expert_tokens_meta,
+            apply_router_weight_on_input,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 34bfe1c16aac7596ad55b633f65eb736cf231905..a7bd64b1c65e9faeddc7f8ac50e83c235b42f12a 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
+from enum import IntEnum
 from typing import Optional, Union
 
 import torch
@@ -9,10 +10,14 @@ import vllm.envs as envs
 from vllm.config import ParallelConfig
 from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
-from vllm.utils import cdiv, has_triton_kernels
+from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
+    OCP_MX_DTYPES,
+    OCP_MX_Scheme,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import cdiv
 
 logger = init_logger(__name__)
 
@@ -22,7 +27,8 @@ if has_triton_kernels():
     except ImportError:
         logger.error(
             "Failed to import Triton kernels. Please make sure your triton "
-            "version is compatible.")
+            "version is compatible."
+        )
 
 
 def _get_config_dtype_str(
@@ -30,8 +36,8 @@ def _get_config_dtype_str(
     use_fp8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    use_mxfp4_w4a4: bool = False,
-) -> Optional[str]:
+    ocp_mx_scheme: str | None = None,
+) -> str | None:
     """
     Return a string used to construct the filename that contains the
     tuning info for a particular quantization scheme.  See
@@ -43,8 +49,11 @@ def _get_config_dtype_str(
         return "int8_w8a16"
     elif use_int4_w4a16:
         return "int4_w4a16"
-    elif use_mxfp4_w4a4:
-        return "mxfp4_w4a4"
+    elif ocp_mx_scheme is not None:
+        # The output of this function is passed to `try_get_optimal_moe_config`,
+        # and as we only simulate OCP MX execution in fused_moe for now,
+        # we will NOT look for `*,dtype=w_mxfp4_a_mxfp4.json` for now.
+        return None
     elif dtype == torch.float:
         # avoiding cases where kernel fails when float32 MoE
         # use fp16/bfloat16 configs
@@ -53,16 +62,16 @@ def _get_config_dtype_str(
 
 
 def _quant_flags_to_group_shape(
-    quant_dtype: Union[torch.dtype, str, None],
+    quant_dtype: torch.dtype | str | None,
     per_act_token_quant: bool,
     per_out_ch_quant: bool,
-    block_shape: Optional[list[int]],
-) -> tuple[Optional[GroupShape], Optional[GroupShape]]:
+    block_shape: list[int] | None,
+) -> tuple[GroupShape | None, GroupShape | None]:
     """
     Convert MoE quantization flags into more generic GroupShapes.
     """
-    a_shape: Optional[GroupShape]
-    w_shape: Optional[GroupShape]
+    a_shape: GroupShape | None
+    w_shape: GroupShape | None
     if block_shape is not None:
         assert not per_act_token_quant
         assert not per_out_ch_quant
@@ -83,6 +92,26 @@ def _quant_flags_to_group_shape(
     return a_shape, w_shape
 
 
+# The type of method in top-K routing
+# Please keep this in sync with the counterpart defined in https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/trtllm/fused_moe/runner.h
+class RoutingMethodType(IntEnum):
+    # Default: Softmax -> TopK
+    Default = (0,)
+    # Renormalize: TopK -> Softmax
+    Renormalize = (1,)
+    # DeepSeekV3: Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups
+    # -> Top8 experts from the Top4 groups
+    DeepSeekV3 = (2,)
+    # Llama4: Top1 -> Sigmoid
+    Llama4 = (3,)
+    # RenormalizeNaive: Softmax -> TopK -> Renormalize
+    RenormalizeNaive = (4,)
+    # TopK: TopK (no softmax)
+    TopK = (5,)
+    # Unspecified
+    Unspecified = 6.0
+
+
 @dataclass
 class FusedMoEQuantDesc:
     """
@@ -93,7 +122,7 @@ class FusedMoEQuantDesc:
     # The quantized type of this parameters.  None means unquantized or
     # already quantized.
     # TODO (bnell): use scalar_type instead of Union.
-    dtype: Union[torch.dtype, str, None] = None
+    dtype: torch.dtype | str | None = None
 
     # A field that describes the quantization group shape, from quant_utils.py.
     #  * (-1, -1)   for per-tensor quantization
@@ -102,7 +131,7 @@ class FusedMoEQuantDesc:
     #  * (128, 128) for 128x128 deepseek style block quantization
     #  * (1, 128)   for deepseek style activation quantization
     #               (i.e. per-token-per-group)
-    shape: Optional[GroupShape] = None
+    shape: GroupShape | None = None
 
     # Quantization scales.
     # TODO(bnell): maybe put PrecisionConfigs in subclass of QuantDesc?
@@ -110,13 +139,13 @@ class FusedMoEQuantDesc:
 
     # Quantization alphas or gscales, used for nvfp4 types.
     # TODO(bnell): put some of these in subclasses
-    alpha_or_gscale: Optional[torch.Tensor] = None
+    alpha_or_gscale: torch.Tensor | None = None
 
     # Zero points for int4/int8 types
-    zp: Optional[torch.Tensor] = None
+    zp: torch.Tensor | None = None
 
     # Biases for GPT triton MoE
-    bias: Optional[torch.Tensor] = None
+    bias: torch.Tensor | None = None
 
 
 # TODO(bnell): have subclasses for specific moe methods?
@@ -163,15 +192,16 @@ class FusedMoEQuantConfig:
     _w2: FusedMoEQuantDesc
 
     def __post_init__(self):
-        assert (not self.per_act_token_quant
-                or self.block_shape is None), "illegal quantization"
+        assert not self.per_act_token_quant or self.block_shape is None, (
+            "illegal quantization"
+        )
 
     #
     # Convenience accessors for various properties.
     #
 
     @property
-    def quant_dtype(self) -> Union[torch.dtype, str, None]:
+    def quant_dtype(self) -> torch.dtype | str | None:
         return self._a1.dtype
 
     @property
@@ -195,10 +225,12 @@ class FusedMoEQuantConfig:
         return self._a1.shape == GroupShape.PER_TENSOR
 
     @property
-    def block_shape(self) -> Optional[list[int]]:
-        if (self._a1.shape is not None
-                and self._a1.shape != GroupShape.PER_TENSOR
-                and self._a1.shape != GroupShape.PER_TOKEN):
+    def block_shape(self) -> list[int] | None:
+        if (
+            self._a1.shape is not None
+            and self._a1.shape != GroupShape.PER_TENSOR
+            and self._a1.shape != GroupShape.PER_TOKEN
+        ):
             return [self._a1.shape.row, self._a1.shape.col]
         else:
             return None
@@ -208,71 +240,65 @@ class FusedMoEQuantConfig:
         return self.block_shape is not None
 
     @property
-    def a1_scale(self) -> Optional[torch.Tensor]:
-        assert self._a1.scale is None or isinstance(self._a1.scale,
-                                                    torch.Tensor)
+    def a1_scale(self) -> torch.Tensor | None:
+        assert self._a1.scale is None or isinstance(self._a1.scale, torch.Tensor)
         return self._a1.scale
 
     @property
-    def a1_gscale(self) -> Optional[torch.Tensor]:
+    def a1_gscale(self) -> torch.Tensor | None:
         return self._a1.alpha_or_gscale
 
     @property
-    def a2_scale(self) -> Optional[torch.Tensor]:
-        assert self._a2.scale is None or isinstance(self._a2.scale,
-                                                    torch.Tensor)
+    def a2_scale(self) -> torch.Tensor | None:
+        assert self._a2.scale is None or isinstance(self._a2.scale, torch.Tensor)
         return self._a2.scale
 
     @property
-    def a2_gscale(self) -> Optional[torch.Tensor]:
+    def a2_gscale(self) -> torch.Tensor | None:
         return self._a2.alpha_or_gscale
 
     @property
-    def w1_scale(self) -> Optional[torch.Tensor]:
-        assert self._w1.scale is None or isinstance(self._w1.scale,
-                                                    torch.Tensor)
+    def w1_scale(self) -> torch.Tensor | None:
+        assert self._w1.scale is None or isinstance(self._w1.scale, torch.Tensor)
         return self._w1.scale
 
     @property
-    def w1_zp(self) -> Optional[torch.Tensor]:
+    def w1_zp(self) -> torch.Tensor | None:
         return self._w1.zp
 
     @property
-    def w1_bias(self) -> Optional[torch.Tensor]:
+    def w1_bias(self) -> torch.Tensor | None:
         return self._w1.bias
 
     @property
     def w1_precision(self) -> Optional["PrecisionConfig"]:
-        assert self._w1.scale is None or isinstance(self._w1.scale,
-                                                    PrecisionConfig)
+        assert self._w1.scale is None or isinstance(self._w1.scale, PrecisionConfig)
         return self._w1.scale
 
     @property
-    def g1_alphas(self) -> Optional[torch.Tensor]:
+    def g1_alphas(self) -> torch.Tensor | None:
         return self._w1.alpha_or_gscale
 
     @property
-    def w2_scale(self) -> Optional[torch.Tensor]:
-        assert self._w2.scale is None or isinstance(self._w2.scale,
-                                                    torch.Tensor)
+    def w2_scale(self) -> torch.Tensor | None:
+        assert self._w2.scale is None or isinstance(self._w2.scale, torch.Tensor)
         return self._w2.scale
 
     @property
-    def w2_zp(self) -> Optional[torch.Tensor]:
+    def w2_zp(self) -> torch.Tensor | None:
         return self._w2.zp
 
     @property
-    def w2_bias(self) -> Optional[torch.Tensor]:
+    def w2_bias(self) -> torch.Tensor | None:
         return self._w2.bias
 
     @property
     def w2_precision(self) -> Optional["PrecisionConfig"]:
-        assert self._w2.scale is None or isinstance(self._w2.scale,
-                                                    PrecisionConfig)
+        assert self._w2.scale is None or isinstance(self._w2.scale, PrecisionConfig)
         return self._w2.scale
 
     @property
-    def g2_alphas(self) -> Optional[torch.Tensor]:
+    def g2_alphas(self) -> torch.Tensor | None:
         return self._w2.alpha_or_gscale
 
     @property
@@ -285,25 +311,40 @@ class FusedMoEQuantConfig:
 
     @property
     def use_int8_w8a16(self) -> bool:
-        return (self._a1.dtype is None and self._w1.dtype == torch.int8)
+        return self._a1.dtype is None and self._w1.dtype == torch.int8
 
     @property
     def use_int4_w4a16(self) -> bool:
-        return (self._a1.dtype is None and self._w1.dtype == "int4")
+        return self._a1.dtype is None and self._w1.dtype == "int4"
 
     @property
-    def use_mxfp4_w4a4(self) -> bool:
-        return (self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4")
+    def ocp_mx_scheme(self) -> str | None:
+        if not hasattr(self, "_ocp_mx_scheme"):
+            if (self._a1.dtype is not None and not isinstance(self._a1.dtype, str)) or (
+                self._w1.dtype is not None and not isinstance(self._w1.dtype, str)
+            ):
+                self._ocp_mx_scheme = None
+            else:
+                ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype(
+                    self._a1.dtype, self._w1.dtype
+                )
+
+                if ocp_mx_scheme is not None:
+                    ocp_mx_scheme = ocp_mx_scheme.value
+
+                self._ocp_mx_scheme = ocp_mx_scheme
+
+        return self._ocp_mx_scheme
 
     @property
     def use_mxfp4_w4a16(self) -> bool:
-        return (self._a1.dtype is None and self._w1.dtype == "mxfp4")
+        return self._a1.dtype is None and self._w1.dtype == "mxfp4"
 
     @property
     def use_nvfp4_w4a4(self) -> bool:
         return self.quant_dtype == "nvfp4"
 
-    def config_name(self, dtype: torch.dtype) -> Optional[str]:
+    def config_name(self, dtype: torch.dtype) -> str | None:
         """
         Return a string used to construct the filename that contains the
         tuning info for a particular quantization scheme.  See
@@ -313,7 +354,7 @@ class FusedMoEQuantConfig:
             use_fp8_w8a8=self.use_fp8_w8a8,
             use_int8_w8a16=self.use_int8_w8a16,
             use_int4_w4a16=self.use_int4_w4a16,
-            use_mxfp4_w4a4=self.use_mxfp4_w4a4,
+            ocp_mx_scheme=self.ocp_mx_scheme,
             dtype=dtype,
         )
 
@@ -321,7 +362,7 @@ class FusedMoEQuantConfig:
         self,
         max_tokens: int,
         hidden_dim: int,
-    ) -> Optional[tuple[int, int]]:
+    ) -> tuple[int, int] | None:
         """
         Construct the proper activation scale shape for this
         config.
@@ -344,7 +385,7 @@ class FusedMoEQuantConfig:
         num_experts: int,
         max_tokens: int,
         hidden_dim: int,
-    ) -> Optional[tuple[int, int, int]]:
+    ) -> tuple[int, int, int] | None:
         """
         Construct the proper activation batched scale shape for this
         config, e.g. (num experts, *scale_shape).
@@ -358,28 +399,30 @@ class FusedMoEQuantConfig:
 
     @staticmethod
     def make(
-        quant_dtype: Union[torch.dtype, str, None] = None,
+        quant_dtype: torch.dtype | str | None = None,
         per_act_token_quant: bool = False,
         per_out_ch_quant: bool = False,
-        block_shape: Optional[list[int]] = None,
+        block_shape: list[int] | None = None,
         w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
         w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
-        a1_scale: Optional[torch.Tensor] = None,
-        a2_scale: Optional[torch.Tensor] = None,
-        g1_alphas: Optional[torch.Tensor] = None,
-        g2_alphas: Optional[torch.Tensor] = None,
-        a1_gscale: Optional[torch.Tensor] = None,
-        a2_gscale: Optional[torch.Tensor] = None,
-        w1_bias: Optional[torch.Tensor] = None,
-        w2_bias: Optional[torch.Tensor] = None,
-        w1_zp: Optional[torch.Tensor] = None,
-        w2_zp: Optional[torch.Tensor] = None,
+        a1_scale: torch.Tensor | None = None,
+        a2_scale: torch.Tensor | None = None,
+        g1_alphas: torch.Tensor | None = None,
+        g2_alphas: torch.Tensor | None = None,
+        a1_gscale: torch.Tensor | None = None,
+        a2_gscale: torch.Tensor | None = None,
+        w1_bias: torch.Tensor | None = None,
+        w2_bias: torch.Tensor | None = None,
+        w1_zp: torch.Tensor | None = None,
+        w2_zp: torch.Tensor | None = None,
+        weight_dtype: torch.dtype | str | None = None,
     ) -> "FusedMoEQuantConfig":
         """
         General builder function for a FusedMoEQuantConfig.
         - quant_dtype: Optional quantization type. None if activations are
-          unquantized or quantized prior to calling.  Note: "nvfp4" and
-          "mxfp4" are the only valid string values for quant_dtype.
+          unquantized or quantized prior to calling.  Note: "nvfp4", "mxfp4",
+          "mxfp6_e3m2", "mxfp6_e2m3" are the only valid string values
+          for quant_dtype.
         - per_act_token_quant: Activations have per token quantization.
         - per_out_ch_quant: Outputs have per channel quantization. (only
           for cutlass).
@@ -398,19 +441,34 @@ class FusedMoEQuantConfig:
         - w1_zp: Optional w1 zero points for int4/int8 quantization.
         - w2_zp: Optional w2 zero points for int4/int8 quantization.
         """
-        assert (not isinstance(quant_dtype, str) or quant_dtype == "nvfp4"
-                or quant_dtype == "mxfp4")
-        a_shape, w_shape = _quant_flags_to_group_shape(quant_dtype,
-                                                       per_act_token_quant,
-                                                       per_out_ch_quant,
-                                                       block_shape)
+        assert not isinstance(quant_dtype, str) or quant_dtype in {
+            "nvfp4",
+            "mxfp4",
+            "mxfp6_e3m2",
+            "mxfp6_e2m3",
+        }
+        assert not isinstance(weight_dtype, str) or weight_dtype in {
+            "nvfp4",
+            "mxfp4",
+            "mxfp6_e3m2",
+            "mxfp6_e2m3",
+        }
+
+        if weight_dtype is None:
+            weight_dtype = quant_dtype
+
+        a_shape, w_shape = _quant_flags_to_group_shape(
+            quant_dtype, per_act_token_quant, per_out_ch_quant, block_shape
+        )
         quant_config = FusedMoEQuantConfig(
             _a1=FusedMoEQuantDesc(quant_dtype, a_shape, a1_scale, a1_gscale),
             _a2=FusedMoEQuantDesc(quant_dtype, a_shape, a2_scale, a2_gscale),
-            _w1=FusedMoEQuantDesc(quant_dtype, w_shape, w1_scale, g1_alphas,
-                                  w1_zp, w1_bias),
-            _w2=FusedMoEQuantDesc(quant_dtype, w_shape, w2_scale, g2_alphas,
-                                  w2_zp, w2_bias),
+            _w1=FusedMoEQuantDesc(
+                weight_dtype, w_shape, w1_scale, g1_alphas, w1_zp, w1_bias
+            ),
+            _w2=FusedMoEQuantDesc(
+                weight_dtype, w_shape, w2_scale, g2_alphas, w2_zp, w2_bias
+            ),
         )
         assert quant_config.per_act_token_quant == per_act_token_quant
         assert quant_config.per_out_ch_quant == per_out_ch_quant
@@ -421,30 +479,40 @@ class FusedMoEQuantConfig:
 def fp8_w8a8_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
     per_act_token_quant: bool = False,
     per_out_ch_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
+    a1_gscale: torch.Tensor | None = None,
+    a2_gscale: torch.Tensor | None = None,
+    g1_alphas: torch.Tensor | None = None,
+    g2_alphas: torch.Tensor | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for fp8 activations and fp8 weights.
     """
-    return FusedMoEQuantConfig.make(torch.float8_e4m3fn,
-                                    w1_scale=w1_scale,
-                                    w2_scale=w2_scale,
-                                    a1_scale=a1_scale,
-                                    a2_scale=a2_scale,
-                                    per_act_token_quant=per_act_token_quant,
-                                    per_out_ch_quant=per_out_ch_quant,
-                                    block_shape=block_shape)
+    return FusedMoEQuantConfig.make(
+        torch.float8_e4m3fn,
+        w1_scale=w1_scale,
+        g1_alphas=g1_alphas,
+        w2_scale=w2_scale,
+        g2_alphas=g2_alphas,
+        a1_scale=a1_scale,
+        a1_gscale=a1_gscale,
+        a2_scale=a2_scale,
+        a2_gscale=a2_gscale,
+        per_act_token_quant=per_act_token_quant,
+        per_out_ch_quant=per_out_ch_quant,
+        block_shape=block_shape,
+    )
 
 
 def int8_w8a8_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    a1_scale: Optional[torch.Tensor],
-    a2_scale: Optional[torch.Tensor],
+    a1_scale: torch.Tensor | None,
+    a2_scale: torch.Tensor | None,
     per_act_token_quant: bool = False,
 ) -> FusedMoEQuantConfig:
     """
@@ -463,10 +531,11 @@ def int8_w8a8_moe_quant_config(
 
 
 def mxfp4_w4a16_moe_quant_config(
-        w1_scale: Union[torch.Tensor, "PrecisionConfig"],
-        w2_scale: Union[torch.Tensor, "PrecisionConfig"],
-        w1_bias: Optional[torch.Tensor] = None,
-        w2_bias: Optional[torch.Tensor] = None) -> FusedMoEQuantConfig:
+    w1_scale: Union[torch.Tensor, "PrecisionConfig"],
+    w2_scale: Union[torch.Tensor, "PrecisionConfig"],
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+) -> FusedMoEQuantConfig:
     """
     Construct a quant config for unquantized activations and mxfp4 weights.
     """
@@ -478,20 +547,44 @@ def mxfp4_w4a16_moe_quant_config(
     )
 
 
-def mxfp4_w4a4_moe_quant_config(
+def mxfp4_mxfp8_moe_quant_config(
     w1_scale: Union[torch.Tensor, "PrecisionConfig"],
     w2_scale: Union[torch.Tensor, "PrecisionConfig"],
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for mxfp4 activations and mxfp4 weights.
     """
+    return FusedMoEQuantConfig(
+        _a1=FusedMoEQuantDesc("mxfp8"),
+        _a2=FusedMoEQuantDesc("mxfp8"),
+        _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias),
+        _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias),
+    )
+
+
+def ocp_mx_moe_quant_config(
+    quant_dtype: str,
+    w1_scale: Union[torch.Tensor, "PrecisionConfig"],
+    w2_scale: Union[torch.Tensor, "PrecisionConfig"],
+    weight_dtype: str | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+) -> FusedMoEQuantConfig:
+    """
+    Construct a quant config for mxfp4 activations and mxfp4 weights.
+    """
+    assert quant_dtype in OCP_MX_DTYPES
     return FusedMoEQuantConfig.make(
-        "mxfp4",
+        quant_dtype=quant_dtype,
+        weight_dtype=weight_dtype,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
         a1_scale=a1_scale,
@@ -532,9 +625,9 @@ def nvfp4_moe_quant_config(
 def int4_w4a16_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    w1_zp: Optional[torch.Tensor],
-    w2_zp: Optional[torch.Tensor],
-    block_shape: Optional[list[int]] = None,
+    w1_zp: torch.Tensor | None,
+    w2_zp: torch.Tensor | None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for 16-bit float activations and int4 weights.
@@ -552,9 +645,9 @@ def int4_w4a16_moe_quant_config(
 def int8_w8a16_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    w1_zp: Optional[torch.Tensor],
-    w2_zp: Optional[torch.Tensor],
-    block_shape: Optional[list[int]] = None,
+    w1_zp: torch.Tensor | None,
+    w2_zp: torch.Tensor | None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for 16-bit float activations and int8 weights.
@@ -570,8 +663,8 @@ def int8_w8a16_moe_quant_config(
 
 
 def biased_moe_quant_config(
-    w1_bias: Optional[torch.Tensor],
-    w2_bias: Optional[torch.Tensor],
+    w1_bias: torch.Tensor | None,
+    w2_bias: torch.Tensor | None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for unquantized activations with biases.
@@ -598,6 +691,7 @@ class FusedMoEParallelConfig:
     ep_rank: int
 
     use_ep: bool  # whether to use EP or not
+    all2all_backend: str  # all2all backend for MoE communication
 
     @property
     def use_all2all_kernels(self):
@@ -605,22 +699,34 @@ class FusedMoEParallelConfig:
 
     @property
     def use_pplx_kernels(self):
-        return (self.use_all2all_kernels
-                and envs.VLLM_ALL2ALL_BACKEND == "pplx")
+        return self.use_all2all_kernels and self.all2all_backend == "pplx"
 
     @property
     def use_deepep_ht_kernels(self):
-        return (self.use_all2all_kernels
-                and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput")
+        return (
+            self.use_all2all_kernels
+            and self.all2all_backend == "deepep_high_throughput"
+        )
 
     @property
     def use_deepep_ll_kernels(self):
-        return (self.use_all2all_kernels
-                and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
+        return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency"
+
+    @staticmethod
+    def flatten_tp_across_dp(
+        tp_size: int, dp_size: int, dp_rank: int
+    ) -> tuple[int, int]:
+        tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank()
+        # There are actually dp_size * tp_size devices. Update tp_size
+        # and tp_rank so we shard across all devices.
+        flatten_tp_size = dp_size * tp_size
+        flatten_tp_rank = dp_rank * tp_size + tp_rank
+        return flatten_tp_size, flatten_tp_rank
 
     @staticmethod
-    def make(tp_size_: int, dp_size_: int,
-             vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
+    def make(
+        tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig
+    ) -> "FusedMoEParallelConfig":
         """
         Determine MoE parallel configuration. Based on the input `tp_size_`,
         `dp_size_` and vllm's parallel config, determine what
@@ -692,42 +798,41 @@ class FusedMoEParallelConfig:
                 between the 4 devices.
         """
 
-        def flatten_tp_across_dp(dp_rank: int):
-            tp_rank = 0 if tp_size_ == 1 else get_tensor_model_parallel_rank()
-            # There are actually dp_size_ * tp_size_ devices. Update tp_size
-            # and tp_rank so we shard across all devices.
-            tp_size = dp_size_ * tp_size_
-            tp_rank = dp_rank * tp_size_ + tp_rank
-            return tp_size, tp_rank
-
-        use_ep = (dp_size_ * tp_size_ > 1
-                  and vllm_parallel_config.enable_expert_parallel)
+        use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel
 
         dp_size = dp_size_
         dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
-        tp_size, tp_rank = flatten_tp_across_dp(dp_rank)
+        tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp(
+            tp_size_, dp_size_, dp_rank
+        )
 
         if not use_ep:
-            return FusedMoEParallelConfig(tp_size=tp_size,
-                                          tp_rank=tp_rank,
-                                          dp_size=dp_size,
-                                          dp_rank=dp_rank,
-                                          ep_size=1,
-                                          ep_rank=0,
-                                          use_ep=False)
+            return FusedMoEParallelConfig(
+                tp_size=tp_size,
+                tp_rank=tp_rank,
+                dp_size=dp_size,
+                dp_rank=dp_rank,
+                ep_size=1,
+                ep_rank=0,
+                use_ep=False,
+                all2all_backend=vllm_parallel_config.all2all_backend,
+            )
         # DP + EP / TP + EP / DP + TP + EP
         assert use_ep
         # In EP, each device owns a set of experts fully. There is no tensor
         # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that.
         ep_size = tp_size
         ep_rank = tp_rank
-        return FusedMoEParallelConfig(tp_size=1,
-                                      tp_rank=0,
-                                      dp_size=dp_size,
-                                      dp_rank=dp_rank,
-                                      ep_size=ep_size,
-                                      ep_rank=ep_rank,
-                                      use_ep=True)
+        return FusedMoEParallelConfig(
+            tp_size=1,
+            tp_rank=0,
+            dp_size=dp_size,
+            dp_rank=dp_rank,
+            ep_size=ep_size,
+            ep_rank=ep_rank,
+            use_ep=True,
+            all2all_backend=vllm_parallel_config.all2all_backend,
+        )
 
 
 # Adapted from pplx-kernels tests/all_to_all_utils.py
@@ -747,10 +852,15 @@ class FusedMoEConfig:
 
     has_bias: bool = False
 
+    is_act_and_mul: bool = True
+
+    is_lora_enabled: bool = False
+
     def __post_init__(self):
         if self.dp_size > 1:
-            logger.debug_once("Using FusedMoEConfig::max_num_tokens=%d",
-                              self.max_num_tokens)
+            logger.debug_once(
+                "Using FusedMoEConfig::max_num_tokens=%d", self.max_num_tokens
+            )
 
         assert self.max_num_tokens > 0
 
@@ -799,6 +909,8 @@ class FusedMoEConfig:
         """
         Whether to use FlashInfer cutlass kernels for NVFP4 MoE.
         """
-        return (envs.VLLM_USE_FLASHINFER_MOE_FP4
-                and has_flashinfer_cutlass_fused_moe()
-                and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")
+        return (
+            envs.VLLM_USE_FLASHINFER_MOE_FP4
+            and has_flashinfer_cutlass_fused_moe()
+            and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput"
+        )
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..99501df6f17642c5ea9bb2a39d816fd28cadca23
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+  }
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
index 2c78bfaba7890772bf266721f5577202ea443882..2e0dd7a4b950755b46a72d36bf67bf8c3e62485b 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -1,218 +1,146 @@
 {
   "1": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 32,
+    "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 5
   },
   "2": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "num_warps": 4,
     "num_stages": 5
   },
   "4": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "num_warps": 4,
     "num_stages": 5
   },
   "8": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 4
+    "num_warps": 4,
+    "num_stages": 5
   },
   "16": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 5
+    "num_warps": 4,
+    "num_stages": 3
   },
   "24": {
-    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_M": 64,
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
     "num_stages": 3
   },
   "32": {
-    "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 4,
-    "num_stages": 5
+    "num_warps": 8,
+    "num_stages": 3
   },
   "48": {
-    "BLOCK_SIZE_M": 64,
-    "BLOCK_SIZE_N": 32,
-    "BLOCK_SIZE_K": 256,
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 4
   },
   "64": {
-    "BLOCK_SIZE_M": 64,
-    "BLOCK_SIZE_N": 32,
-    "BLOCK_SIZE_K": 256,
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 4
   },
   "96": {
     "BLOCK_SIZE_M": 64,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 256,
+    "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 2
+    "num_stages": 5
   },
   "128": {
-    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 1,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
     "num_warps": 4,
-    "num_stages": 2
+    "num_stages": 4
   },
   "256": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
     "num_stages": 3
   },
   "512": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
+    "num_warps": 4,
+    "num_stages": 4
   },
   "1024": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
     "num_stages": 3
   },
   "1536": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
     "num_stages": 3
   },
   "2048": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
     "num_stages": 3
   },
   "3072": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
     "num_stages": 3
   },
   "4096": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "5120": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "9216": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "13312": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "17408": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "25600": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "33792": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "41984": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "50176": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
-  },
-  "58368": {
-    "BLOCK_SIZE_M": 256,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
     "num_stages": 3
   }
 }
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
index 4da841e74a79f9589fecac1fa557ea132d34805f..4ea86340c3243400a0052322eac547611c82a548 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -5,7 +5,7 @@
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 4
+    "num_stages": 5
   },
   "2": {
     "BLOCK_SIZE_M": 16,
@@ -13,7 +13,7 @@
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 5
   },
   "4": {
     "BLOCK_SIZE_M": 16,
@@ -21,7 +21,7 @@
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 5
   },
   "8": {
     "BLOCK_SIZE_M": 16,
@@ -29,7 +29,7 @@
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 5
   },
   "16": {
     "BLOCK_SIZE_M": 16,
@@ -37,52 +37,52 @@
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 5
+    "num_stages": 3
   },
   "24": {
-    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_M": 64,
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 4
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "32": {
     "BLOCK_SIZE_M": 64,
     "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 4,
-    "num_stages": 5
+    "num_warps": 8,
+    "num_stages": 3
   },
   "48": {
-    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 128,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 5
+    "num_stages": 4
   },
   "64": {
-    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 128,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 5
+    "num_stages": 4
   },
   "96": {
     "BLOCK_SIZE_M": 64,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
     "num_warps": 4,
-    "num_stages": 4
+    "num_stages": 5
   },
   "128": {
-    "BLOCK_SIZE_M": 64,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 16,
     "num_warps": 4,
     "num_stages": 4
@@ -91,57 +91,57 @@
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
-    "num_stages": 5
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "512": {
     "BLOCK_SIZE_M": 128,
-    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 16,
-    "num_warps": 8,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
     "num_stages": 4
   },
   "1024": {
     "BLOCK_SIZE_M": 128,
-    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 32,
-    "num_warps": 8,
-    "num_stages": 4
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "1536": {
     "BLOCK_SIZE_M": 128,
-    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 32,
-    "num_warps": 8,
-    "num_stages": 4
+    "num_warps": 4,
+    "num_stages": 3
   },
   "2048": {
     "BLOCK_SIZE_M": 128,
-    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 16,
-    "num_warps": 8,
+    "num_warps": 4,
     "num_stages": 3
   },
   "3072": {
     "BLOCK_SIZE_M": 128,
-    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
-    "num_stages": 4
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "4096": {
     "BLOCK_SIZE_M": 128,
-    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 32,
-    "num_warps": 8,
-    "num_stages": 4
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "5120": {
     "BLOCK_SIZE_M": 128,
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3f1a562710b094e2c0dbdae87391c634c4d500f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..19046fcf1d6a21dfd74160f241ed4a1a94bc20c9
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f9422fe6f7c4cb1b1448fd1bf51a27d425c429a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..555d173644522713f502ee8ee21efa5f04916184
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..86b49127f9bf2fadc7cbc99053430832ccc27376
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea1ce9ad2cdc4ecd4b93c2b9007379e1b5ecebd2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "256": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee8a28b833d5a6174f52bcd451efd1c76f7ef620
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000000000000000000000000000000..09d3fa584edd89c14f971d441a08be2a4c3e51e7
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json
new file mode 100755
index 0000000000000000000000000000000000000000..f5990fc1dd89497cde0fd3c31b8b44cbe4e73251
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json
@@ -0,0 +1,213 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "768": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json
new file mode 100644
index 0000000000000000000000000000000000000000..d613de3a754f98d334ae7da92737d061dca0cf62
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json
@@ -0,0 +1,82 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..592b60c5aceadc8b4b0afab7cae88d601e589ff6
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,82 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc6454ebfb2fe13a6aa164d1497d41d3272b30da
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000000000000000000000000000000..48997646d99b683bb31c304d25f01f25f33e7b58
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a626ac47b8d174a98c137100827c3107800faa6
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..371e87f946829db5a939a1f3b6810918e8eec875
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d0cdfd274293433be628efbcdfab05ce1864b97
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..de8eec366eca3907c0ef6de248beadc48ba49057
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+  }
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..80fce79fb64c9820a640c1e3436961210943a64d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 64,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 32,
+      "num_warps": 4,
+      "num_stages": 5
+    },
+    "2": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 128,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 64,
+      "num_warps": 4,
+      "num_stages": 5
+    },
+    "4": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 64,
+      "num_warps": 8,
+      "num_stages": 3
+    },
+    "8": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 32,
+      "BLOCK_SIZE_K": 256,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 5
+    },
+    "16": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 128,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 2
+    },
+    "24": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 64,
+      "BLOCK_SIZE_K": 256,
+      "GROUP_SIZE_M": 16,
+      "num_warps": 4,
+      "num_stages": 2
+    },
+    "32": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 32,
+      "num_warps": 8,
+      "num_stages": 2
+    },
+    "48": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 128,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 4
+    },
+    "64": {
+      "BLOCK_SIZE_M": 16,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 2
+    },
+    "96": {
+      "BLOCK_SIZE_M": 32,
+      "BLOCK_SIZE_N": 128,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 3
+    },
+    "128": {
+      "BLOCK_SIZE_M": 32,
+      "BLOCK_SIZE_N": 128,
+      "BLOCK_SIZE_K": 128,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 4
+    },
+    "256": {
+      "BLOCK_SIZE_M": 64,
+      "BLOCK_SIZE_N": 64,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 4,
+      "num_stages": 3
+    },
+    "512": {
+      "BLOCK_SIZE_M": 128,
+      "BLOCK_SIZE_N": 128,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 8,
+      "num_stages": 3
+    },
+    "1024": {
+      "BLOCK_SIZE_M": 128,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 8,
+      "num_stages": 4
+    },
+    "1536": {
+      "BLOCK_SIZE_M": 128,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 16,
+      "num_warps": 8,
+      "num_stages": 4
+    },
+    "2048": {
+      "BLOCK_SIZE_M": 128,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 8,
+      "num_stages": 4
+    },
+    "3072": {
+      "BLOCK_SIZE_M": 128,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 16,
+      "num_warps": 8,
+      "num_stages": 4
+    },
+    "4096": {
+      "BLOCK_SIZE_M": 128,
+      "BLOCK_SIZE_N": 256,
+      "BLOCK_SIZE_K": 64,
+      "GROUP_SIZE_M": 1,
+      "num_warps": 8,
+      "num_stages": 4
+    }
+  }
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b94452197b0f7081790a09aadd65182534cbf8e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..48f19df24cc9e7bc1d8d79bcb69a6e9dcad06fb4
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
new file mode 100644
index 0000000000000000000000000000000000000000..54d3bf190ebec617abdb479c64ad09aef8d5d4c0
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
index 26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec..6a4018195603aca71a33224d5bf77c5811cd9051 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -2,73 +2,73 @@
   "1": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 32,
-    "BLOCK_SIZE_K": 256,
+    "BLOCK_SIZE_K": 128,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
     "num_stages": 5
   },
   "2": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 1,
+    "GROUP_SIZE_M": 16,
     "num_warps": 4,
-    "num_stages": 4
+    "num_stages": 3
   },
   "4": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
-    "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 1,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
     "num_warps": 8,
-    "num_stages": 4
+    "num_stages": 3
   },
   "8": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 32,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
     "num_warps": 4,
     "num_stages": 5
   },
   "16": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 32,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 64,
+    "GROUP_SIZE_M": 32,
     "num_warps": 4,
-    "num_stages": 4
+    "num_stages": 3
   },
   "24": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 32,
-    "num_warps": 4,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
     "num_stages": 3
   },
   "32": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 32,
+    "GROUP_SIZE_M": 16,
     "num_warps": 4,
-    "num_stages": 4
+    "num_stages": 3
   },
   "48": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 4,
-    "num_stages": 4
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
   },
   "64": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 32,
-    "num_warps": 4,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
     "num_stages": 3
   },
   "96": {
@@ -77,22 +77,22 @@
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 32,
     "num_warps": 4,
-    "num_stages": 4
+    "num_stages": 3
   },
   "128": {
     "BLOCK_SIZE_M": 32,
-    "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 4,
-    "num_stages": 4
+    "num_warps": 8,
+    "num_stages": 3
   },
   "256": {
-    "BLOCK_SIZE_M": 32,
-    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
     "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 16,
-    "num_warps": 4,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
     "num_stages": 4
   },
   "512": {
@@ -100,47 +100,47 @@
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "num_warps": 4,
     "num_stages": 4
   },
   "1024": {
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
-    "num_stages": 5
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "1536": {
     "BLOCK_SIZE_M": 256,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
     "num_warps": 8,
-    "num_stages": 3
+    "num_stages": 4
   },
   "2048": {
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
-    "num_stages": 5
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "3072": {
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
-    "num_stages": 5
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
   },
   "4096": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
     "num_stages": 3
   }
 }
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f500d487c56dcb9b7525a318f3dc7698550a1c2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed8afa6b6db88f22c6240403180cc87bb8027696
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
new file mode 100644
index 0000000000000000000000000000000000000000..5fea55a8000ff5a4f7d29a5c620e82abc43d7c7d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
index bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a..1e3f46e0ba84a418e138515812cab7dc2a5f4ab4 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -2,7 +2,7 @@
   "1": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 32,
-    "BLOCK_SIZE_K": 256,
+    "BLOCK_SIZE_K": 128,
     "GROUP_SIZE_M": 1,
     "num_warps": 4,
     "num_stages": 4
@@ -20,78 +20,78 @@
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 4,
-    "num_stages": 4
+    "num_warps": 8,
+    "num_stages": 3
   },
   "8": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 1,
+    "GROUP_SIZE_M": 16,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 5
   },
   "16": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 32,
+    "GROUP_SIZE_M": 64,
     "num_warps": 4,
-    "num_stages": 3
+    "num_stages": 5
   },
   "24": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_N": 256,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 4,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
     "num_stages": 3
   },
   "32": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 256,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 4,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
     "num_stages": 3
   },
   "48": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 4,
+    "num_warps": 8,
     "num_stages": 3
   },
   "64": {
     "BLOCK_SIZE_M": 16,
-    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 16,
-    "num_warps": 4,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
     "num_stages": 3
   },
   "96": {
     "BLOCK_SIZE_M": 16,
     "BLOCK_SIZE_N": 64,
     "BLOCK_SIZE_K": 256,
-    "GROUP_SIZE_M": 32,
+    "GROUP_SIZE_M": 64,
     "num_warps": 4,
     "num_stages": 4
   },
   "128": {
     "BLOCK_SIZE_M": 32,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 256,
     "GROUP_SIZE_M": 1,
-    "num_warps": 4,
+    "num_warps": 8,
     "num_stages": 3
   },
   "256": {
-    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_M": 64,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 64,
+    "GROUP_SIZE_M": 1,
     "num_warps": 4,
     "num_stages": 3
   },
@@ -100,47 +100,47 @@
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 3
+    "num_warps": 4,
+    "num_stages": 4
   },
   "1024": {
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
-    "num_stages": 5
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "1536": {
     "BLOCK_SIZE_M": 256,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_K": 64,
     "GROUP_SIZE_M": 1,
     "num_warps": 8,
-    "num_stages": 3
+    "num_stages": 4
   },
   "2048": {
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 64,
-    "num_warps": 8,
-    "num_stages": 3
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
   },
   "3072": {
     "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
     "BLOCK_SIZE_K": 64,
-    "GROUP_SIZE_M": 32,
-    "num_warps": 8,
-    "num_stages": 4
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
   },
   "4096": {
-    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_M": 128,
     "BLOCK_SIZE_N": 128,
-    "BLOCK_SIZE_K": 128,
-    "GROUP_SIZE_M": 1,
-    "num_warps": 8,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
     "num_stages": 3
   }
 }
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000000000000000000000000000000..38034fe2ddae79bfd37b2f0f402ddec98782f739
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,201 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb4d11c6be2b458e55830bf75a5b17be004d7ee8
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..c2f79b966abb75e31022143fa7dd1e77b5f5c5b7
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1ca1006318905037e04e06219b1c6a17bd7ce51
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..381eb5d826a5bf9f67a67d9eee48ffe8446a305a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ed3ad3527170a6c405e3ba46d211aa581a7dfe0
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..bf97f671477b309f8d7eecb063a116a60fba6382
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..24f13cdeff4f805396e2288372793b116c863013
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ffa2ac894871e2cf205c575b053113be776d271
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7df36e8b174083ad7aee7179a409ee08be5d389
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b0bf1bf51785b913f1c0b2a60e18253007ba4c72
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..6825378d3798938cc6a773c6f215354ef316fac4
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..9952f80834794570ca8480f8883080611813bd05
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..2c897dbce17e482fb82514b8882963aa7e71511d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd675df5d564a3073dd603bac4b1feffbb5d732b
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json
@@ -0,0 +1,82 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..e410671b6fd4363ed36adaef2c5dda8c9b51ba7e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,82 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
index 114f349538fbe672e7a6d4670b94449dcec1ac46..23ace3408562a2172e0ca34510a85692fb673d0f 100644
--- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn import functional as F
 
+from vllm import _custom_ops as ops
 from vllm import envs
 
 
@@ -13,6 +14,17 @@ def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
     return F.silu(x[..., :d]) * x[..., d:]
 
 
+def swigluoai_and_mul(
+    x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0
+) -> torch.Tensor:
+    d = x.shape[-1] // 2
+    gate, up = x[..., :d], x[..., d:]
+    gate = gate.clamp(max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    glu = gate * torch.sigmoid(alpha * gate)
+    return (up + 1) * glu
+
+
 def grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -22,10 +34,9 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    assert hidden_states.shape[0] == gating_output.shape[0], (
-        "Number of tokens mismatch")
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
     gating_output = gating_output.float()
     if scoring_func == "softmax":
@@ -39,29 +50,30 @@ def grouped_topk(
     if e_score_correction_bias is not None:
         original_scores = scores
         scores = scores + e_score_correction_bias.unsqueeze(0)
-        group_scores = (scores.view(num_token, num_expert_group,
-                                    -1).topk(2, dim=-1)[0].sum(dim=-1))
+        group_scores = (
+            scores.view(num_token, num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
+        )
     else:
-        group_scores = scores.view(num_token, num_expert_group,
-                                   -1).max(dim=-1).values  # [n, n_group]
-    group_idx = torch.topk(group_scores, k=topk_group, dim=-1,
-                           sorted=False)[1]  # [n, top_k_group]
+        group_scores = (
+            scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+        )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
     group_mask = torch.zeros_like(group_scores)  # [n, n_group]
     group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
-    score_mask = group_mask.unsqueeze(-1).expand(
-        num_token, num_expert_group,
-        scores.shape[-1] // num_expert_group).reshape(num_token, -1)  # [n, e]
-    tmp_scores = scores.masked_fill(~score_mask.bool(),
-                                    float("-inf"))  # [n, e]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
 
     if e_score_correction_bias is not None:
         topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1]
         topk_weights = original_scores.gather(1, topk_ids)
     else:
-        topk_weights, topk_ids = torch.topk(tmp_scores,
-                                            k=topk,
-                                            dim=-1,
-                                            sorted=False)
+        topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
@@ -77,31 +89,32 @@ def select_experts(
     top_k: int,
     use_grouped_topk: bool,
     renormalize: bool,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
+    topk_group: int | None = None,
+    num_expert_group: int | None = None,
+    custom_routing_function: Callable | None = None,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
-        return grouped_topk(hidden_states=hidden_states,
-                            gating_output=router_logits,
-                            topk=top_k,
-                            renormalize=renormalize,
-                            num_expert_group=num_expert_group,
-                            topk_group=topk_group,
-                            scoring_func=scoring_func,
-                            routed_scaling_factor=routed_scaling_factor,
-                            e_score_correction_bias=e_score_correction_bias)
+        return grouped_topk(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+            num_expert_group=num_expert_group,
+            topk_group=topk_group,
+            scoring_func=scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
+            e_score_correction_bias=e_score_correction_bias,
+        )
     elif custom_routing_function is None:
         assert scoring_func == "softmax"
-        topk_logit_vals, topk_idx = torch.topk(router_logits,
-                                               k=top_k,
-                                               dim=-1,
-                                               sorted=False)
+        topk_logit_vals, topk_idx = torch.topk(
+            router_logits, k=top_k, dim=-1, sorted=False
+        )
         if renormalize:
             topk_vals = torch.softmax(topk_logit_vals, dim=-1)
         else:
@@ -109,16 +122,18 @@ def select_experts(
             topk_vals = (topk_logit_vals - logZ).exp()
         return topk_vals.to(torch.float32), topk_idx.to(torch.int32)
     else:
-        return custom_routing_function(hidden_states=hidden_states,
-                                       gating_output=router_logits,
-                                       topk=top_k,
-                                       renormalize=renormalize)
+        return custom_routing_function(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+        )
 
 
 class IPEXFusedMOE:
-
     def __init__(self, layer: torch.nn.Module) -> None:
         import intel_extension_for_pytorch as ipex
+
         layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
             layer.w13_weight,
             layer.w2_weight,
@@ -133,21 +148,22 @@ class IPEXFusedMOE:
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
         assert activation == "silu", f"{activation} is not supported."
         assert not apply_router_weight_on_input
-        assert routed_scaling_factor == 1.0, \
+        assert routed_scaling_factor == 1.0, (
             f"routed_scaling_factor {routed_scaling_factor} is not supported."
+        )
         return layer.ipex_fusion(
             x,
             use_grouped_topk,
@@ -163,7 +179,6 @@ class IPEXFusedMOE:
 
 
 class SGLFusedMOE:
-
     def __init__(self, layer: torch.nn.Module) -> None:
         pass
 
@@ -175,14 +190,14 @@ class SGLFusedMOE:
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
@@ -222,9 +237,44 @@ class SGLFusedMOE:
 
 
 class CPUFusedMOE:
-
     def __init__(self, layer: torch.nn.Module) -> None:
-        pass
+        use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported()
+
+        num_experts = layer.w13_weight.size(0)
+        has_w13_bias = hasattr(layer, "w13_bias")
+        has_w2_bias = hasattr(layer, "w2_bias")
+
+        layer.gate_up_linear = []
+        layer.down_linear = []
+
+        for i in range(num_experts):
+            layer_w13_weight = layer.w13_weight[i]
+            layer_w13_bias = layer.w13_bias[i] if has_w13_bias else None
+            layer_w2_weight = layer.w2_weight[i]
+            layer_w2_bias = layer.w2_bias[i] if has_w2_bias else None
+            if use_onednn_mm:
+                gate_up_handle = ops.create_onednn_mm(layer_w13_weight.t(), 32)
+                layer.gate_up_linear.append(
+                    lambda x, handle=gate_up_handle, bias=layer_w13_bias: ops.onednn_mm(
+                        handle, x, bias
+                    )
+                )
+                down_handle = ops.create_onednn_mm(layer_w2_weight.t(), 32)
+                layer.down_linear.append(
+                    lambda x, handle=down_handle, bias=layer_w2_bias: ops.onednn_mm(
+                        handle, x, bias
+                    )
+                )
+            else:
+                layer.gate_up_linear.append(
+                    lambda x, w=layer_w13_weight, b=layer_w13_bias: F.linear(x, w, b)
+                )
+                layer.down_linear.append(
+                    lambda x, w=layer_w2_weight, b=layer_w2_bias: F.linear(x, w, b)
+                )
+        if use_onednn_mm:  # remove weight
+            layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
 
     def __call__(
         self,
@@ -234,18 +284,18 @@ class CPUFusedMOE:
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
-        assert activation == "silu", f"{activation} is not supported."
+        assert activation in {"silu", "swigluoai"}, f"{activation} is not supported."
         assert not apply_router_weight_on_input
         topk_weights, topk_ids = select_experts(
             hidden_states=x,
@@ -274,27 +324,31 @@ class CPUFusedMOE:
 
         outputs = []
         start_idx = 0
+
         for i, num_tokens in enumerate(tokens_per_expert):
             end_idx = start_idx + num_tokens
             if num_tokens == 0:
                 continue
             tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
 
-            layer_w13_weight = layer.w13_weight[i]
-            layer_w2_weight = layer.w2_weight[i]
-
-            gate_up = F.linear(tokens_for_this_expert, layer_w13_weight)
-            gate_up = silu_and_mul(gate_up)
-            expert_out = F.linear(gate_up, layer_w2_weight)
+            gate_up = layer.gate_up_linear[i](tokens_for_this_expert)
+            if activation == "swigluoai":
+                gate_up = swigluoai_and_mul(gate_up)
+            else:
+                gate_up = silu_and_mul(gate_up)
+            expert_out = layer.down_linear[i](gate_up)
             outputs.append(expert_out)
             start_idx = end_idx
 
-        outs = torch.cat(outputs,
-                         dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
         new_x = torch.empty_like(outs)
 
         new_x[idxs] = outs
-        final_out = (new_x.view(
-            *topk_ids.shape, -1).type(topk_weights.dtype).mul_(
-                topk_weights.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype))
+        final_out = (
+            new_x.view(*topk_ids.shape, -1)
+            .type(topk_weights.dtype)
+            .mul_(topk_weights.unsqueeze(dim=-1))
+            .sum(dim=1)
+            .type(new_x.dtype)
+        )
         return final_out
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 8c2ff580575f56a227c7c52d409d9d1be0a4e4d5..6753a19250b3bad6a20ed257fb0152df4556c220 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-""" CUTLASS based Fused MoE kernels."""
-from typing import Callable, Optional
+"""CUTLASS based Fused MoE kernels."""
+
+from collections.abc import Callable
 
 import torch
 
@@ -10,13 +11,17 @@ from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    moe_permute, moe_unpermute)
+    moe_permute,
+    moe_unpermute,
+)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP)
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP)
-from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize,
-                                                        _resize_cache)
+    TopKWeightAndReduceDelegate,
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -30,23 +35,23 @@ def run_cutlass_moe_fp8(
     topk_ids: torch.Tensor,
     activation_callable: Callable,
     global_num_experts: int,
-    expert_map: Optional[torch.Tensor],
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
-    a1q_scale: Optional[torch.Tensor],
-    a2_scale: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
+    a1q_scale: torch.Tensor | None,
+    a2_scale: torch.Tensor | None,
     ab_strides1: torch.Tensor,
     ab_strides2: torch.Tensor,
     c_strides1: torch.Tensor,
     c_strides2: torch.Tensor,
     workspace13: torch.Tensor,
     workspace2: torch.Tensor,
-    expert_num_tokens: Optional[torch.Tensor],
+    expert_num_tokens: torch.Tensor | None,
     out_dtype: torch.dtype,
     per_act_token: bool,
     per_out_ch: bool,
     use_batched_format: bool,
-    topk_weights: Optional[torch.Tensor],
+    topk_weights: torch.Tensor | None,
 ):
     a1q = hidden_states
 
@@ -56,20 +61,28 @@ def run_cutlass_moe_fp8(
     assert w2.dtype == torch.float8_e4m3fn
     assert a1q.size(-1) == w1.size(2), "Hidden size mismatch w1"
     assert w1.size(1) == w2.size(2) * 2, "Hidden size mismatch w2"
-    assert w1_scale.dim() == 1 or w1_scale.size(
-        1) == 1 or w1_scale.shape[1] == w1.size(1), "W1 scale shape mismatch"
-    assert w2_scale.dim() == 1 or w2_scale.size(
-        1) == 1 or w2_scale.shape[1] == w2.size(1), "W2 scale shape mismatch"
+    assert (
+        w1_scale.dim() == 1 or w1_scale.size(1) == 1 or w1_scale.shape[1] == w1.size(1)
+    ), "W1 scale shape mismatch"
+    assert (
+        w2_scale.dim() == 1 or w2_scale.size(1) == 1 or w2_scale.shape[1] == w2.size(1)
+    ), "W2 scale shape mismatch"
     assert w1.size(0) == w2.size(0), "Expert number mismatch"
-    assert a1q_scale is None or a1q_scale.dim() == 0 or a1q_scale.size(
-        0) == 1 or a1q_scale.size(
-            0) == a1q.shape[0], "Input scale shape mismatch"
+    assert (
+        a1q_scale is None
+        or a1q_scale.dim() == 0
+        or a1q_scale.size(0) == 1
+        or a1q_scale.size(0) == a1q.shape[0]
+    ), "Input scale shape mismatch"
     assert w1.size(0) == w2.size(0), "Weights expert number mismatch"
     assert w1.size(0) == w1_scale.size(0), "w1 scales expert number mismatch"
     assert w1.size(0) == w2_scale.size(0), "w2 scales expert number mismatch"
-    assert a2_scale is None or a2_scale.dim() == 0 or a2_scale.size(
-        0) == 1 or a2_scale.size(
-            0) == a1q.shape[0], "Intermediate scale shape mismatch"
+    assert (
+        a2_scale is None
+        or a2_scale.dim() == 0
+        or a2_scale.size(0) == 1
+        or a2_scale.size(0) == a1q.shape[0]
+    ), "Intermediate scale shape mismatch"
     assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
     if expert_map is not None:
         assert expert_num_tokens is None
@@ -97,8 +110,9 @@ def run_cutlass_moe_fp8(
 
     if expert_map is not None:
         "Translate info from expert_map to topk_ids"
-        local_topk_ids = torch.where(expert_map[topk_ids] != -1,
-                                     expert_map[topk_ids], -1)
+        local_topk_ids = torch.where(
+            expert_map[topk_ids] != -1, expert_map[topk_ids], -1
+        )
     else:
         local_topk_ids = topk_ids
 
@@ -108,35 +122,39 @@ def run_cutlass_moe_fp8(
     if use_batched_format:
         mm1_out = _resize_cache(workspace13, (local_E * padded_M, N * 2))
         act_out = _resize_cache(workspace2, (local_E * padded_M, N))
-        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
-                                  (local_E * padded_M, N))
+        quant_out = _resize_cache(
+            workspace13.view(dtype=torch.float8_e4m3fn), (local_E * padded_M, N)
+        )
         mm2_out = _resize_cache(workspace2, (local_E * padded_M, K))
     else:
-        a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn),
-                                 (M * topk, K))
+        a1q_perm = _resize_cache(
+            workspace2.view(dtype=torch.float8_e4m3fn), (M * topk, K)
+        )
         mm1_out = _resize_cache(workspace13, (M * topk, N * 2))
         act_out = _resize_cache(workspace2, (M * topk, N))
         # original workspace are based on input hidden_states dtype (bf16)
-        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
-                                  (M * topk, N))
+        quant_out = _resize_cache(
+            workspace13.view(dtype=torch.float8_e4m3fn), (M * topk, N)
+        )
         mm2_out = _resize_cache(workspace2, (M * topk, K))
 
     if use_batched_format:
         assert expert_num_tokens is not None
 
-        expert_offsets = torch.empty((local_E),
-                                     dtype=torch.int32,
-                                     device=device)
-        problem_sizes1 = torch.empty((local_E, 3),
-                                     dtype=torch.int32,
-                                     device=device)
-        problem_sizes2 = torch.empty((local_E, 3),
-                                     dtype=torch.int32,
-                                     device=device)
+        expert_offsets = torch.empty((local_E), dtype=torch.int32, device=device)
+        problem_sizes1 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
+        problem_sizes2 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
 
-        ops.get_cutlass_pplx_moe_mm_data(expert_offsets, problem_sizes1,
-                                         problem_sizes2, expert_num_tokens,
-                                         local_E, padded_M, N, K)
+        ops.get_cutlass_pplx_moe_mm_data(
+            expert_offsets,
+            problem_sizes1,
+            problem_sizes2,
+            expert_num_tokens,
+            local_E,
+            padded_M,
+            N,
+            K,
+        )
 
         w1_scale = w1_scale.reshape(w1_scale.size(0), -1)
         w2_scale = w2_scale.reshape(w2_scale.size(0), -1)
@@ -146,15 +164,14 @@ def run_cutlass_moe_fp8(
         # during offset calculations
         expert_offsets = expert_offsets.to(torch.int64)
     else:
-        problem_sizes1 = torch.empty((global_num_experts, 3),
-                                     dtype=torch.int32,
-                                     device=device)
-        problem_sizes2 = torch.empty((global_num_experts, 3),
-                                     dtype=torch.int32,
-                                     device=device)
-
-        num_expert = global_num_experts if expert_map is None \
-                     else expert_map.size(0)
+        problem_sizes1 = torch.empty(
+            (global_num_experts, 3), dtype=torch.int32, device=device
+        )
+        problem_sizes2 = torch.empty(
+            (global_num_experts, 3), dtype=torch.int32, device=device
+        )
+
+        num_expert = global_num_experts if expert_map is None else expert_map.size(0)
         # permuted a1q reuses workspace2
         a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute(
             a1q,
@@ -163,12 +180,13 @@ def run_cutlass_moe_fp8(
             num_expert,
             local_E,
             expert_map,
-            permuted_hidden_states=a1q_perm)
+            permuted_hidden_states=a1q_perm,
+        )
         expert_offsets = expert_offsets[:-1]
 
-        ops.get_cutlass_moe_mm_problem_sizes(local_topk_ids, problem_sizes1,
-                                             problem_sizes2,
-                                             global_num_experts, N, K)
+        ops.get_cutlass_moe_mm_problem_sizes(
+            local_topk_ids, problem_sizes1, problem_sizes2, global_num_experts, N, K
+        )
 
     if not per_act_token and (expert_map is not None or use_batched_format):
         # this is necessary to avoid imprecise scale calculation caused by
@@ -176,41 +194,62 @@ def run_cutlass_moe_fp8(
         # this rank handles only partial tokens, or when it is batched .
         mm1_out.fill_(0)
 
-    ops.cutlass_moe_mm(mm1_out, a1q, w1, a1q_scale, w1_scale, expert_offsets,
-                       problem_sizes1, ab_strides1, ab_strides1, c_strides1,
-                       per_act_token, per_out_ch)
+    ops.cutlass_moe_mm(
+        mm1_out,
+        a1q,
+        w1,
+        a1q_scale,
+        w1_scale,
+        expert_offsets,
+        problem_sizes1,
+        ab_strides1,
+        ab_strides1,
+        c_strides1,
+        per_act_token,
+        per_out_ch,
+    )
 
     activation_callable(act_out, mm1_out)
 
     a2q, a2q_scale = ops.scaled_fp8_quant(
-        act_out,
-        a2_scale,
-        use_per_token_if_dynamic=per_act_token,
-        output=quant_out)
+        act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out
+    )
 
     if expert_map is not None:
         mm2_out.fill_(0)
 
-    ops.cutlass_moe_mm(mm2_out, a2q, w2, a2q_scale, w2_scale, expert_offsets,
-                       problem_sizes2, ab_strides2, ab_strides2, c_strides2,
-                       per_act_token, per_out_ch)
+    ops.cutlass_moe_mm(
+        mm2_out,
+        a2q,
+        w2,
+        a2q_scale,
+        w2_scale,
+        expert_offsets,
+        problem_sizes2,
+        ab_strides2,
+        ab_strides2,
+        c_strides2,
+        per_act_token,
+        per_out_ch,
+    )
 
     if use_batched_format:
         output.copy_(mm2_out.reshape(local_E, padded_M, K), non_blocking=True)
     else:
         # for non-chunking mode the output is resized from workspace13
         # so we need to make sure mm2_out uses workspace2.
-        moe_unpermute(out=output,
-                      permuted_hidden_states=mm2_out,
-                      topk_weights=topk_weights,
-                      inv_permuted_idx=inv_perm)
+        moe_unpermute(
+            out=output,
+            permuted_hidden_states=mm2_out,
+            topk_weights=topk_weights,
+            inv_permuted_idx=inv_perm,
+        )
 
 
 class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
-        out_dtype: Optional[torch.dtype],
+        out_dtype: torch.dtype | None,
         ab_strides1: torch.Tensor,
         ab_strides2: torch.Tensor,
         c_strides1: torch.Tensor,
@@ -239,12 +278,12 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE"
@@ -256,26 +295,43 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
 
         activation_callable = lambda o, i: self.activation(activation, o, i)
 
-        use_batched_format = self.activation_formats[
-            0] == mk.FusedMoEActivationFormat.BatchedExperts
+        use_batched_format = (
+            self.activation_formats[0] == mk.FusedMoEActivationFormat.BatchedExperts
+        )
 
         in_dtype = hidden_states.dtype
         run_cutlass_moe_fp8(
-            output, hidden_states, w1, w2, topk_ids, activation_callable,
-            global_num_experts, expert_map, self.w1_scale, self.w2_scale,
-            a1q_scale, a2_scale, self.ab_strides1, self.ab_strides2,
-            self.c_strides1, self.c_strides2, workspace13, workspace2,
+            output,
+            hidden_states,
+            w1,
+            w2,
+            topk_ids,
+            activation_callable,
+            global_num_experts,
+            expert_map,
+            self.w1_scale,
+            self.w2_scale,
+            a1q_scale,
+            a2_scale,
+            self.ab_strides1,
+            self.ab_strides2,
+            self.c_strides1,
+            self.c_strides2,
+            workspace13,
+            workspace2,
             expert_num_tokens,
             self.out_dtype if self.out_dtype is not None else in_dtype,
-            self.per_act_token_quant, self.per_out_ch_quant,
-            use_batched_format, topk_weights)
+            self.per_act_token_quant,
+            self.per_out_ch_quant,
+            use_batched_format,
+            topk_weights,
+        )
 
 
 class CutlassExpertsFp8(CutlassExpertsFp8Base):
-
     def __init__(
         self,
-        out_dtype: Optional[torch.dtype],
+        out_dtype: torch.dtype | None,
         ab_strides1: torch.Tensor,
         ab_strides2: torch.Tensor,
         c_strides1: torch.Tensor,
@@ -293,10 +349,12 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
 
     def supports_chunking(self) -> bool:
         return True
@@ -308,32 +366,31 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
         # topk weights and reduction are fused in moe_unpermute cuda kernel
         return TopKWeightAndReduceNoOP()
 
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        return self.out_dtype if self.out_dtype is not None else act_dtype
+
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         workspace1 = (M * topk, max(N, K))
         workspace2 = (M * topk, max(N // 2, K))
         output = (M, K)
-        return (workspace1, workspace2, output,
-                self.out_dtype if self.out_dtype is not None else a.dtype)
+        return (workspace1, workspace2, output)
 
 
 class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
-
     def __init__(
         self,
         max_experts_per_worker: int,
         num_dispatchers: int,
-        out_dtype: Optional[torch.dtype],
+        out_dtype: torch.dtype | None,
         ab_strides1: torch.Tensor,
         ab_strides2: torch.Tensor,
         c_strides1: torch.Tensor,
@@ -354,10 +411,12 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.BatchedExperts,
-                mk.FusedMoEActivationFormat.BatchedExperts)
+        return (
+            mk.FusedMoEActivationFormat.BatchedExperts,
+            mk.FusedMoEActivationFormat.BatchedExperts,
+        )
 
     def supports_chunking(self) -> bool:
         return False
@@ -365,29 +424,25 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
     def supports_expert_map(self) -> bool:
         return False
 
-    # TODO(bnell): maybe remove need for passing aq to workspace_shapes
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        return self.out_dtype if self.out_dtype is not None else act_dtype
+
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
-        padded_M = aq.size(1)
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         assert num_dp is not None
-        workspace1 = (self.max_experts_per_worker, padded_M * num_dp,
-                      max(N, K))
-        workspace2 = (self.max_experts_per_worker, padded_M * num_dp,
-                      max(N // 2, K))
-        output = (self.max_experts_per_worker, padded_M, K)
-        return (workspace1, workspace2, output,
-                self.out_dtype if self.out_dtype is not None else a.dtype)
+        workspace1 = (self.max_experts_per_worker, M * num_dp, max(N, K))
+        workspace2 = (self.max_experts_per_worker, M * num_dp, max(N // 2, K))
+        output = (self.max_experts_per_worker, M, K)
+        return (workspace1, workspace2, output)
 
 
 def cutlass_moe_fp8(
@@ -402,7 +457,7 @@ def cutlass_moe_fp8(
     c_strides2: torch.Tensor,
     quant_config: FusedMoEQuantConfig,
     activation: str = "silu",
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
 ) -> torch.Tensor:
@@ -456,18 +511,21 @@ def cutlass_moe_fp8(
     assert quant_config is not None
 
     if quant_config.a1_scale is not None:
-        assert (quant_config.per_act_token_quant ==
-                quant_config.a1_scale.numel() != 1)
+        assert quant_config.per_act_token_quant == (quant_config.a1_scale.numel() != 1)
     if quant_config.a2_scale is not None:
-        assert (quant_config.per_act_token_quant ==
-                quant_config.a2_scale.numel() != 1)
+        assert quant_config.per_act_token_quant == (quant_config.a2_scale.numel() != 1)
 
-    assert (quant_config.w1_scale is None
-            or (quant_config.per_out_ch_quant == (quant_config.w1_scale.size(1)
-                                                  == w1_q.size(1))))
+    if quant_config.w1_scale is not None:
+        if quant_config.per_out_ch_quant:
+            assert quant_config.w1_scale.dim() > 1 and quant_config.w1_scale.size(
+                1
+            ) == w1_q.size(1)
+        else:
+            assert (
+                quant_config.w1_scale.dim() == 1 or quant_config.w1_scale.size(1) == 1
+            )
 
-    num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(
-        0)
+    num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0)
 
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
@@ -550,25 +608,30 @@ def run_cutlass_moe_fp4(
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
     assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
-    assert (w1_fp4.ndim == 3 and w2_fp4.ndim == 3 and w1_blockscale.ndim == 3
-            and w2_blockscale.ndim
-            == 3), ("All Weights must be of rank 3 for cutlass_moe_fp4")
+    assert (
+        w1_fp4.ndim == 3
+        and w2_fp4.ndim == 3
+        and w1_blockscale.ndim == 3
+        and w2_blockscale.ndim == 3
+    ), "All Weights must be of rank 3 for cutlass_moe_fp4"
     m_a, k_a = a.shape
     e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
     e_w2, k_w2, half_n_w2 = w2_fp4.shape
 
-    assert (e_w1 == e_w2
-            and e_w1 == e), ("Number of experts must match",
-                             f" between weights. {e_w1}, {e_w2}, {e}")
-    assert (k_a == half_k_w1 * 2
-            and k == k_w2), ("Hidden size mismatch between a, w1 and w2")
-    assert (nx2_w1 == n * 2 and half_n_w2 * 2 == n), ("mismatch in "
-                                                      "expected `n`")
-    assert (m == m_a), "input shape mismatch"
+    assert e_w1 == e_w2 and e_w1 == e, (
+        "Number of experts must match",
+        f" between weights. {e_w1}, {e_w2}, {e}",
+    )
+    assert k_a == half_k_w1 * 2 and k == k_w2, (
+        "Hidden size mismatch between a, w1 and w2"
+    )
+    assert nx2_w1 == n * 2 and half_n_w2 * 2 == n, "mismatch in expected `n`"
+    assert m == m_a, "input shape mismatch"
     assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
     assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
-    assert (topk_weights.size(0) == m and topk_ids.size(0)
-            == m), ("topk must be provided for each row of a")
+    assert topk_weights.size(0) == m and topk_ids.size(0) == m, (
+        "topk must be provided for each row of a"
+    )
     topk = topk_ids.size(1)
     out_dtype = a.dtype
     num_topk = topk_ids.size(1)
@@ -585,15 +648,25 @@ def run_cutlass_moe_fp4(
 
     if apply_router_weight_on_input:
         # TODO: this only works for topK=1, will need to update for topK>1
-        assert num_topk == 1, \
+        assert num_topk == 1, (
             "apply_router_weight_on_input is only implemented for topk=1"
+        )
         a.mul_(topk_weights.to(out_dtype))
 
     # problem shapes should have [m, n, k]
     # Note that problem sizes are based on logical number of elements.
-    ops.get_cutlass_moe_mm_data(topk_ids, expert_offsets, problem_sizes1,
-                                problem_sizes2, a_map, c_map, e, n, k,
-                                blockscale_offsets)
+    ops.get_cutlass_moe_mm_data(
+        topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        e,
+        n,
+        k,
+        blockscale_offsets,
+    )
 
     a = ops.shuffle_rows(a, a_map)
     rep_a_fp4, rep_a_blockscale = ops.scaled_fp4_experts_quant(
@@ -606,17 +679,34 @@ def run_cutlass_moe_fp4(
     c1 = _resize_cache(workspace13, (m * topk, n * 2))
     c2 = _resize_cache(workspace2, (m * topk, n))
     c3 = _resize_cache(workspace13, (m * topk, k))
-    ops.cutlass_fp4_moe_mm(c1, rep_a_fp4, w1_fp4, rep_a_blockscale,
-                           w1_blockscale, w1_alphas, problem_sizes1,
-                           expert_offsets[:-1], blockscale_offsets[:-1])
+    ops.cutlass_fp4_moe_mm(
+        c1,
+        rep_a_fp4,
+        w1_fp4,
+        rep_a_blockscale,
+        w1_blockscale,
+        w1_alphas,
+        problem_sizes1,
+        expert_offsets[:-1],
+        blockscale_offsets[:-1],
+    )
     del rep_a_fp4, rep_a_blockscale
     torch.ops._C.silu_and_mul(c2, c1)
     int_fp4, int_blockscale = ops.scaled_fp4_experts_quant(
-        c2, a2_gscale, expert_offsets, blockscale_offsets, num_topk)
+        c2, a2_gscale, expert_offsets, blockscale_offsets, num_topk
+    )
 
-    ops.cutlass_fp4_moe_mm(c3, int_fp4, w2_fp4, int_blockscale, w2_blockscale,
-                           w2_alphas, problem_sizes2, expert_offsets[:-1],
-                           blockscale_offsets[:-1])
+    ops.cutlass_fp4_moe_mm(
+        c3,
+        int_fp4,
+        w2_fp4,
+        int_blockscale,
+        w2_blockscale,
+        w2_alphas,
+        problem_sizes2,
+        expert_offsets[:-1],
+        blockscale_offsets[:-1],
+    )
     del int_fp4, int_blockscale
 
     c3 = ops.shuffle_rows(c3, c_map)
@@ -624,9 +714,12 @@ def run_cutlass_moe_fp4(
     assert output.dtype == out_dtype
     if not apply_router_weight_on_input:
         output.copy_(
-            (c3.view(m, num_topk, k) *
-             topk_weights.view(m, num_topk, 1).to(out_dtype)).sum(dim=1),
-            non_blocking=True)
+            (
+                c3.view(m, num_topk, k)
+                * topk_weights.view(m, num_topk, 1).to(out_dtype)
+            ).sum(dim=1),
+            non_blocking=True,
+        )
     else:
         output.copy_(c3.view(m, num_topk, k).sum(dim=1), non_blocking=True)
     return
@@ -634,7 +727,6 @@ def run_cutlass_moe_fp4(
 
 # Split into batched and non-batched
 class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         max_experts_per_worker: int,
@@ -649,14 +741,18 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
         if self.use_batched_format:
-            return (mk.FusedMoEActivationFormat.BatchedExperts,
-                    mk.FusedMoEActivationFormat.BatchedExperts)
+            return (
+                mk.FusedMoEActivationFormat.BatchedExperts,
+                mk.FusedMoEActivationFormat.BatchedExperts,
+            )
         else:
-            return (mk.FusedMoEActivationFormat.Standard,
-                    mk.FusedMoEActivationFormat.Standard)
+            return (
+                mk.FusedMoEActivationFormat.Standard,
+                mk.FusedMoEActivationFormat.Standard,
+            )
 
     def supports_expert_map(self) -> bool:
         return False
@@ -667,32 +763,31 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
     def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
         return TopKWeightAndReduceNoOP()
 
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        return self.out_dtype if self.out_dtype is not None else act_dtype
+
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         workspace1: tuple[int, ...] = ()
         workspace2: tuple[int, ...] = ()
         output: tuple[int, ...] = ()
         if self.use_batched_format:
-            padded_M = aq.size(1)
-            workspace1 = (self.max_experts_per_worker, padded_M, max(N, K))
-            workspace2 = (self.max_experts_per_worker, padded_M, (N // 2))
-            output = (self.max_experts_per_worker, padded_M, K)
+            workspace1 = (self.max_experts_per_worker, M, max(N, K))
+            workspace2 = (self.max_experts_per_worker, M, (N // 2))
+            output = (self.max_experts_per_worker, M, K)
         else:
             workspace1 = (M * topk, max(2 * N, K))
             workspace2 = (M * topk, N)
             output = (M, K)
-        return (workspace1, workspace2, output,
-                self.out_dtype if self.out_dtype is not None else a.dtype)
+        return (workspace1, workspace2, output)
 
     def apply(
         self,
@@ -704,15 +799,15 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],  # unused
-        a2_scale: Optional[torch.Tensor],  # unused
-        workspace13: Optional[torch.Tensor],
-        workspace2: Optional[torch.Tensor],
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,  # unused
+        a2_scale: torch.Tensor | None,  # unused
+        workspace13: torch.Tensor | None,
+        workspace2: torch.Tensor | None,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
-        e, m, n, k, _ = mk._moe_problem_size(hidden_states, w1, w2, topk_ids)
+        e, m, n, k, _ = self.moe_problem_size(hidden_states, w1, w2, topk_ids)
         n = w2.shape[2] * 2
 
         run_cutlass_moe_fp4(
@@ -740,21 +835,24 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
 
 
 def cutlass_moe_fp4(
-        a: torch.Tensor,
-        w1_fp4: torch.Tensor,
-        w2_fp4: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        quant_config: FusedMoEQuantConfig,
-        m: int,
-        n: int,
-        k: int,
-        e: int,
-        expert_map: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False) -> torch.Tensor:
-    assert expert_map is None, ("Expert Parallelism / expert_map "
-                                "is currently not supported for "
-                                "ModelOptNvFp4FusedMoE's cutlass_moe_fp4.")
+    a: torch.Tensor,
+    w1_fp4: torch.Tensor,
+    w2_fp4: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    quant_config: FusedMoEQuantConfig,
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    expert_map: torch.Tensor | None = None,
+    apply_router_weight_on_input: bool = False,
+) -> torch.Tensor:
+    assert expert_map is None, (
+        "Expert Parallelism / expert_map "
+        "is currently not supported for "
+        "ModelOptNvFp4FusedMoE's cutlass_moe_fp4."
+    )
 
     # TODO(bnell): this feels a bit hacky
     # NVFP4 requires two levels of quantization, which involves
@@ -799,10 +897,13 @@ def cutlass_moe_fp4(
 
 
 def _valid_cutlass_block_scaled_grouped_gemm(
-        w1: torch.Tensor, w2: torch.Tensor, inplace: bool, activation: str,
-        apply_router_weight_on_input: bool,
-        expert_map: Optional[torch.Tensor]) -> bool:
-
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    inplace: bool,
+    activation: str,
+    apply_router_weight_on_input: bool,
+    expert_map: torch.Tensor | None,
+) -> bool:
     def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
         return N % 128 == 0 and K % 128 == 0
 
@@ -816,7 +917,7 @@ def _valid_cutlass_block_scaled_grouped_gemm(
         )
         return False
 
-    if (w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn):
+    if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn:
         logger.debug_once(
             "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). "
             "w1.dtype: %s, w2.dtype: %s",
@@ -827,19 +928,21 @@ def _valid_cutlass_block_scaled_grouped_gemm(
 
     if expert_map is not None:
         logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: expert_parallel is"
-            " not supported.")
+            "CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported."
+        )
         return False
 
     if activation != "silu":
         logger.debug_once(
-            "CutlassBlockScaledGroupedGemm disabled: only activation silu is"
-            " supported.")
+            "CutlassBlockScaledGroupedGemm disabled: only activation silu is supported."
+        )
         return False
 
     if apply_router_weight_on_input:
-        logger.debug_once("CutlassBlockScaledGroupedGemm disabled:"
-                          " apply_router_weight_on_input is not supported.")
+        logger.debug_once(
+            "CutlassBlockScaledGroupedGemm disabled:"
+            " apply_router_weight_on_input is not supported."
+        )
         return False
 
     if inplace:
@@ -867,17 +970,16 @@ def run_cutlass_block_scaled_fused_experts(
     w2_scale = w2_scale.transpose(1, 2)
 
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
-    assert a.shape[0] == topk_ids.shape[
-        0], "a and topk_ids must have the same batch size"
+    assert a.shape[0] == topk_ids.shape[0], (
+        "a and topk_ids must have the same batch size"
+    )
     assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn"
     assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn"
     assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1"
     assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2"
     assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
-    assert w1_q.shape[0] == w1_scale.shape[
-        0], "w1_scale expert number mismatch"
-    assert w1_q.shape[0] == w2_scale.shape[
-        0], "w2_scale expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert number mismatch"
     assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
 
     out_dtype = a.dtype
@@ -888,21 +990,14 @@ def run_cutlass_block_scaled_fused_experts(
 
     topk = topk_ids.size(1)
 
-    a_q, a1_scale = _fp8_quantize(a,
-                                  A_scale=None,
-                                  per_act_token=False,
-                                  block_shape=[128, 128])
+    a_q, a1_scale = _fp8_quantize(
+        a, A_scale=None, per_act_token=False, block_shape=[128, 128]
+    )
     device = a_q.device
 
-    expert_offsets = torch.empty((num_experts + 1, ),
-                                 dtype=torch.int32,
-                                 device=device)
-    problem_sizes1 = torch.empty((num_experts, 3),
-                                 dtype=torch.int32,
-                                 device=device)
-    problem_sizes2 = torch.empty((num_experts, 3),
-                                 dtype=torch.int32,
-                                 device=device)
+    expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device)
+    problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
+    problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
 
     a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
     c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
@@ -938,10 +1033,9 @@ def run_cutlass_block_scaled_fused_experts(
     intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device)
     torch.ops._C.silu_and_mul(intermediate, c1)
 
-    intermediate_q, a2_scale = _fp8_quantize(intermediate,
-                                             A_scale=None,
-                                             per_act_token=False,
-                                             block_shape=[128, 128])
+    intermediate_q, a2_scale = _fp8_quantize(
+        intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128]
+    )
 
     ops.cutlass_blockwise_scaled_grouped_mm(
         c2,
@@ -953,5 +1047,6 @@ def run_cutlass_block_scaled_fused_experts(
         expert_offsets[:-1],
     )
 
-    return (c2[c_map].view(m, topk, k) *
-            topk_weights.view(m, topk, 1).to(out_dtype)).sum(dim=1)
+    return (
+        c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
+    ).sum(dim=1)
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 51a4f275e98cc5379d2f0d3af72052df2aafdc7a..86cdd25f2c873b0efc1308913016a7a26c8a7021 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 from tqdm import tqdm
@@ -9,30 +8,42 @@ import vllm.envs as env
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, fp8_w8a8_moe_quant_config)
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import (
-    compute_aligned_M, deep_gemm_block_shape, deepgemm_moe_permute,
-    deepgemm_unpermute_and_reduce)
+    compute_aligned_M,
+    deepgemm_moe_permute,
+    deepgemm_unpermute_and_reduce,
+)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP)
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP)
+    TopKWeightAndReduceNoOP,
+)
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
-from vllm.utils import has_deep_gemm, run_once
-from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous
+    per_token_group_quant_fp8,
+)
+from vllm.utils.deep_gemm import (
+    get_mk_alignment_for_contiguous_layout,
+    m_grouped_fp8_gemm_nt_contiguous,
+)
+from vllm.utils.func_utils import run_once
+from vllm.utils.import_utils import has_deep_gemm
 
 logger = init_logger(__name__)
 
 
 def _valid_deep_gemm_shape(M: int, N: int, K: int) -> bool:
-    align = deep_gemm_block_shape()[0]
+    align = get_mk_alignment_for_contiguous_layout()[0]
     return align <= M and N % align == 0 and K % align == 0
 
 
-def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
-                     w2: torch.Tensor) -> bool:
+def _valid_deep_gemm(
+    hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor
+) -> bool:
     """
     Check if the given problem size is supported by the DeepGemm grouped
     gemm kernel.  All of M, N, K and the quantization block_shape must be
@@ -45,7 +56,7 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
     M = hidden_states.size(0)
     _, K, N = w2.size()
 
-    align = deep_gemm_block_shape()[0]
+    align = get_mk_alignment_for_contiguous_layout()[0]
 
     if not _valid_deep_gemm_shape(M, N, K):
         logger.debug_once(
@@ -71,17 +82,19 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
         )
         return False
 
-    if (w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn):
+    if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn:
         logger.debug_once(
-            "DeepGemm disabled: invalid weight dtype(s). "
-            "w1.dtype: %s, w2.dtype: %s",
+            "DeepGemm disabled: invalid weight dtype(s). w1.dtype: %s, w2.dtype: %s",
             w1.dtype,
             w2.dtype,
         )
         return False
 
-    if (not hidden_states.is_contiguous() or not w1.is_contiguous()
-            or not w2.is_contiguous()):
+    if (
+        not hidden_states.is_contiguous()
+        or not w1.is_contiguous()
+        or not w2.is_contiguous()
+    ):
         logger.debug_once(
             "DeepGemm disabled: weights or activations not contiguous. "
             "hidden_states.is_contiguous(): %s, w1.is_contiguous(): %s, "
@@ -96,10 +109,13 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
 
 
 @run_once
-def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor,
-                                          w1_scale: torch.Tensor,
-                                          w2_scale: torch.Tensor,
-                                          num_topk: int):
+def warmup_deepgemm_gg_contiguous_kernels(
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    num_topk: int,
+):
     """
     DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the
     input tensor shapes. In this function, we construct all possible input
@@ -108,45 +124,47 @@ def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor,
     call and not during actual model inference.
     """
 
-    assert w1.size(0) == w2.size(0), (
-        "w1 and w2 must have the same number of experts")
+    assert w1.size(0) == w2.size(0), "w1 and w2 must have the same number of experts"
 
-    block_m = deep_gemm_block_shape()[0]
+    block_m = get_mk_alignment_for_contiguous_layout()[0]
     num_experts = w1.size(0)
     device = w1.device
 
     # This is the maximum GroupedGemm M size that we expect to run
     # the grouped_gemm with.
-    MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE,
-                              num_topk,
-                              num_experts,
-                              block_m,
-                              expert_tokens_meta=None)
+    MAX_M = compute_aligned_M(
+        env.VLLM_FUSED_MOE_CHUNK_SIZE,
+        num_topk,
+        num_experts,
+        block_m,
+        expert_tokens_meta=None,
+    )
     # Distribute expert-ids evenly.
     MAX_BLOCKS = MAX_M // block_m
-    expert_ids_block = torch.randint(low=0,
-                                     high=num_experts,
-                                     size=(MAX_BLOCKS, ),
-                                     device=device,
-                                     dtype=torch.int32)
+    expert_ids_block = torch.randint(
+        low=0, high=num_experts, size=(MAX_BLOCKS,), device=device, dtype=torch.int32
+    )
     expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
 
     def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
-
         _, n, k = w.size()
         a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn)
-        a1q_scales = torch.empty((MAX_M, k // block_m),
-                                 device=device,
-                                 dtype=torch.float32)
+        a1q_scales = torch.empty(
+            (MAX_M, k // block_m), device=device, dtype=torch.float32
+        )
         out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
 
-        pbar = tqdm(total=MAX_BLOCKS,
-                    desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})")
+        pbar = tqdm(
+            total=MAX_BLOCKS, desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})"
+        )
         num_tokens = MAX_M
         while num_tokens > 0:
             m_grouped_fp8_gemm_nt_contiguous(
-                (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale),
-                out[:num_tokens], expert_ids[:num_tokens])
+                (a1q[:num_tokens], a1q_scales[:num_tokens]),
+                (w, w_scale),
+                out[:num_tokens],
+                expert_ids[:num_tokens],
+            )
             pbar.update(1)
             num_tokens = num_tokens - block_m
 
@@ -155,20 +173,21 @@ def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor,
 
 
 class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(self, quant_config: FusedMoEQuantConfig):
         super().__init__(quant_config)
-        assert quant_config.block_shape == deep_gemm_block_shape()
+        assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout()
         assert quant_config.quant_dtype == torch.float8_e4m3fn
         assert not quant_config.per_act_token_quant
         assert not quant_config.per_out_ch_quant
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
 
     def supports_chunking(self) -> bool:
         return True
@@ -181,26 +200,25 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         assert self.block_shape is not None
         block_m = self.block_shape[0]
-        M_sum = compute_aligned_M(M, topk, local_num_experts, block_m,
-                                  expert_tokens_meta)
+        M_sum = compute_aligned_M(
+            M, topk, local_num_experts, block_m, expert_tokens_meta
+        )
         assert M_sum % block_m == 0
 
-        workspace1 = (M_sum, max(N, K))
+        workspace1 = (M_sum, N)
         workspace2 = (M_sum, max(N // 2, K))
         output = (M, K)
-        return (workspace1, workspace2, output, a.dtype)
+        return (workspace1, workspace2, output)
 
     def apply(
         self,
@@ -212,12 +230,12 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert a1q_scale is not None
@@ -235,18 +253,20 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
         assert w2.size(1) == K
 
-        M_sum = compute_aligned_M(M=topk_ids.size(0),
-                                  num_topk=topk_ids.size(1),
-                                  local_num_experts=local_num_experts,
-                                  alignment=deep_gemm_block_shape()[0],
-                                  expert_tokens_meta=expert_tokens_meta)
+        M_sum = compute_aligned_M(
+            M=topk_ids.size(0),
+            num_topk=topk_ids.size(1),
+            local_num_experts=local_num_experts,
+            alignment=get_mk_alignment_for_contiguous_layout()[0],
+            expert_tokens_meta=expert_tokens_meta,
+        )
 
-        a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn),
-                                 (M_sum, K))
+        a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M_sum, K))
         mm1_out = _resize_cache(workspace13, (M_sum, N))
         act_out = _resize_cache(workspace2, (M_sum, N // 2))
-        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
-                                  (M_sum, N // 2))
+        quant_out = _resize_cache(
+            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, N // 2)
+        )
         mm2_out = _resize_cache(workspace2, (M_sum, K))
 
         a1q, a1q_scale, expert_ids, inv_perm = deepgemm_moe_permute(
@@ -256,32 +276,36 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             local_num_experts=local_num_experts,
             expert_map=expert_map,
             expert_tokens_meta=expert_tokens_meta,
-            aq_out=a1q_perm)
+            aq_out=a1q_perm,
+        )
         assert a1q.size(0) == M_sum
 
-        m_grouped_fp8_gemm_nt_contiguous((a1q, a1q_scale), (w1, self.w1_scale),
-                                         mm1_out, expert_ids)
+        m_grouped_fp8_gemm_nt_contiguous(
+            (a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
+        )
 
         self.activation(activation, act_out, mm1_out.view(-1, N))
 
-        a2q_scale: Optional[torch.Tensor] = None
-        a2q, a2q_scale = per_token_group_quant_fp8(act_out,
-                                                   self.block_shape[1],
-                                                   column_major_scales=True,
-                                                   out_q=quant_out)
+        a2q_scale: torch.Tensor | None = None
+        a2q, a2q_scale = per_token_group_quant_fp8(
+            act_out, self.block_shape[1], column_major_scales=True, out_q=quant_out
+        )
 
-        m_grouped_fp8_gemm_nt_contiguous((a2q, a2q_scale), (w2, self.w2_scale),
-                                         mm2_out, expert_ids)
+        m_grouped_fp8_gemm_nt_contiguous(
+            (a2q, a2q_scale), (w2, self.w2_scale), mm2_out, expert_ids
+        )
 
         if apply_router_weight_on_input:
             topk_weights = torch.ones_like(topk_weights)
 
-        deepgemm_unpermute_and_reduce(a=mm2_out,
-                                      topk_ids=topk_ids,
-                                      topk_weights=topk_weights,
-                                      inv_perm=inv_perm,
-                                      expert_map=expert_map,
-                                      output=output)
+        deepgemm_unpermute_and_reduce(
+            a=mm2_out,
+            topk_ids=topk_ids,
+            topk_weights=topk_weights,
+            inv_perm=inv_perm,
+            expert_map=expert_map,
+            output=output,
+        )
 
 
 def deep_gemm_moe_fp8(
@@ -295,9 +319,9 @@ def deep_gemm_moe_fp8(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
     apply_router_weight_on_input=False,
 ) -> torch.Tensor:
     """
@@ -342,7 +366,8 @@ def deep_gemm_moe_fp8(
         w2_scale=w2_scale,
         a1_scale=a1_scale,
         a2_scale=a2_scale,
-        block_shape=deep_gemm_block_shape())
+        block_shape=get_mk_alignment_for_contiguous_layout(),
+    )
 
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
index c8469501af5db78cbf28ccac8462841b9f422ed8..6cca9541232744a150cd2c6f633db5d313be12d4 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
@@ -5,42 +5,37 @@ Taken from https://github.com/ModelTC/LightLLM/blob/8ed97c74c18f11505b048b1ba00b
 and updated to fit vllm needs and terminology.
 """
 
-import functools
-from typing import Optional
-
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
 from vllm.triton_utils import tl, triton
-from vllm.utils import round_up
-
-
-@functools.cache
-def deep_gemm_block_shape() -> list[int]:
-    # Lazy import to avoid CUDA initialization problems.
-    import deep_gemm as dg
-    block = dg.get_m_alignment_for_contiguous_layout()
-    return [block, block]
+from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout
+from vllm.utils.math_utils import round_up
 
 
-def expert_num_tokens_round_up_and_sum(expert_num_tokens: torch.Tensor,
-                                       alignment: int) -> int:
+def expert_num_tokens_round_up_and_sum(
+    expert_num_tokens: torch.Tensor, alignment: int
+) -> int:
     # Round up each element in expert_num_tokens to the nearest multiple of
     # alignment.
-    ent = (expert_num_tokens.to(torch.int64) +
-           (alignment - 1)) // alignment * alignment
+    ent = (expert_num_tokens.to(torch.int64) + (alignment - 1)) // alignment * alignment
     return torch.sum(ent).item()
 
 
-def compute_aligned_M(M: int, num_topk: int, local_num_experts: int,
-                      alignment: int,
-                      expert_tokens_meta: Optional[mk.ExpertTokensMetadata]):
-
-    if ((expert_tokens_meta is not None)
-            and (expert_tokens_meta.expert_num_tokens_cpu is not None)):
+def compute_aligned_M(
+    M: int,
+    num_topk: int,
+    local_num_experts: int,
+    alignment: int,
+    expert_tokens_meta: mk.ExpertTokensMetadata | None,
+):
+    if (expert_tokens_meta is not None) and (
+        expert_tokens_meta.expert_num_tokens_cpu is not None
+    ):
         return expert_num_tokens_round_up_and_sum(
-            expert_tokens_meta.expert_num_tokens_cpu, alignment=alignment)
+            expert_tokens_meta.expert_num_tokens_cpu, alignment=alignment
+        )
 
     # expert_num_tokens information is not available on the cpu.
     # compute the max required size.
@@ -74,14 +69,14 @@ def _fwd_kernel_ep_scatter_1(
     cur_expert = tl.program_id(0)
 
     offset_cumsum = tl.arange(0, BLOCK_EXPERT_NUM)
-    tokens_per_expert = tl.load(num_recv_tokens_per_expert + offset_cumsum,
-                                mask=offset_cumsum < num_experts,
-                                other=0)
+    tokens_per_expert = tl.load(
+        num_recv_tokens_per_expert + offset_cumsum,
+        mask=offset_cumsum < num_experts,
+        other=0,
+    )
     tokens_per_expert = round_up_128(tokens_per_expert)
     cumsum = tl.cumsum(tokens_per_expert) - tokens_per_expert
-    tl.store(expert_start_loc + offset_cumsum,
-             cumsum,
-             mask=offset_cumsum < num_experts)
+    tl.store(expert_start_loc + offset_cumsum, cumsum, mask=offset_cumsum < num_experts)
 
     cur_expert_start = tl.load(expert_start_loc + cur_expert)
     cur_expert_token_num = tl.load(num_recv_tokens_per_expert + cur_expert)
@@ -136,34 +131,31 @@ def _fwd_kernel_ep_scatter_2(
     mask_s = offset_in_s < SCALE_HIDDEN_SIZE
 
     for token_id in range(start_token_id, total_token_num, grid_num):
-        to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in,
-                          mask=mask)
-        to_copy_s = tl.load(recv_x_scale + token_id * recv_x_scale_stride0 +
-                            offset_in_s,
-                            mask=mask_s)
+        to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
+        to_copy_s = tl.load(
+            recv_x_scale + token_id * recv_x_scale_stride0 + offset_in_s, mask=mask_s
+        )
 
         for topk_index in tl.range(0, topk_num, 1, num_stages=4):
-            expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 +
-                                topk_index)
+            expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 + topk_index)
 
             if HAS_EXPERT_MAP:
                 expert_id = apply_expert_map(expert_id, expert_map)
 
             if expert_id >= 0:
-                dest_token_index = tl.atomic_add(expert_start_loc + expert_id,
-                                                 1)
+                dest_token_index = tl.atomic_add(expert_start_loc + expert_id, 1)
                 tl.store(
-                    output_index + token_id * output_index_stride0 +
-                    topk_index, dest_token_index)
-                output_tensor_ptr = (output_tensor +
-                                     dest_token_index * output_tensor_stride0)
+                    output_index + token_id * output_index_stride0 + topk_index,
+                    dest_token_index,
+                )
+                output_tensor_ptr = (
+                    output_tensor + dest_token_index * output_tensor_stride0
+                )
                 output_tensor_scale_ptr = (
-                    output_tensor_scale +
-                    dest_token_index * output_tensor_scale_stride0)
+                    output_tensor_scale + dest_token_index * output_tensor_scale_stride0
+                )
                 tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
-                tl.store(output_tensor_scale_ptr + offset_in_s,
-                         to_copy_s,
-                         mask=mask_s)
+                tl.store(output_tensor_scale_ptr + offset_in_s, to_copy_s, mask=mask_s)
 
 
 @torch.no_grad()
@@ -172,7 +164,7 @@ def ep_scatter(
     recv_x_scale: torch.Tensor,
     recv_topk: torch.Tensor,
     num_recv_tokens_per_expert: torch.Tensor,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     expert_start_loc: torch.Tensor,
     output_tensor: torch.Tensor,
     output_tensor_scale: torch.Tensor,
@@ -189,7 +181,7 @@ def ep_scatter(
 
     assert m_indices.shape[0] % BLOCK_E == 0
 
-    _fwd_kernel_ep_scatter_1[(grid, )](
+    _fwd_kernel_ep_scatter_1[(grid,)](
         num_recv_tokens_per_expert,
         expert_start_loc,
         m_indices,
@@ -201,7 +193,7 @@ def ep_scatter(
 
     grid = min(recv_topk.shape[0], 1024 * 8)
 
-    _fwd_kernel_ep_scatter_2[(grid, )](
+    _fwd_kernel_ep_scatter_2[(grid,)](
         recv_topk.shape[0],
         expert_start_loc,
         recv_x,
@@ -265,27 +257,33 @@ def _fwd_kernel_ep_gather(
         off_d = tl.arange(0, BLOCK_D)
         accumulator = tl.zeros([BLOCK_D], dtype=tl.float32)
         for topk_index in range(0, topk_num):
-            expert_id = tl.load(recv_topk_ids +
-                                cur_token * recv_topk_ids_stride0 + topk_index)
+            expert_id = tl.load(
+                recv_topk_ids + cur_token * recv_topk_ids_stride0 + topk_index
+            )
 
             if HAS_EXPERT_MAP:
                 expert_id = apply_expert_map(expert_id, expert_map)
 
             if expert_id >= 0:
-                source_token_index = tl.load(input_index +
-                                             cur_token * input_index_stride0 +
-                                             topk_index)
-                acc_weight = tl.load(recv_topk_weight +
-                                     cur_token * recv_topk_weight_stride0 +
-                                     topk_index)
-                tmp = tl.load(input_tensor +
-                              source_token_index * input_tensor_stride0 +
-                              cur_block * BLOCK_D + off_d)
+                source_token_index = tl.load(
+                    input_index + cur_token * input_index_stride0 + topk_index
+                )
+                acc_weight = tl.load(
+                    recv_topk_weight + cur_token * recv_topk_weight_stride0 + topk_index
+                )
+                tmp = tl.load(
+                    input_tensor
+                    + source_token_index * input_tensor_stride0
+                    + cur_block * BLOCK_D
+                    + off_d
+                )
                 accumulator += tmp.to(tl.float32) * acc_weight
 
         tl.store(
-            output_tensor + cur_token * output_tensor_stride0 +
-            cur_block * BLOCK_D + off_d,
+            output_tensor
+            + cur_token * output_tensor_stride0
+            + cur_block * BLOCK_D
+            + off_d,
             accumulator.to(output_tensor.dtype.element_ty),
         )
 
@@ -296,7 +294,7 @@ def ep_gather(
     recv_topk_ids: torch.Tensor,
     recv_topk_weight: torch.Tensor,
     input_index: torch.Tensor,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     output_tensor: torch.Tensor,
 ):
     num_warps = 2
@@ -332,44 +330,45 @@ def ep_gather(
     return
 
 
-def deepgemm_moe_permute(aq: torch.Tensor,
-                         aq_scale: torch.Tensor,
-                         topk_ids: torch.Tensor,
-                         local_num_experts: int,
-                         expert_map: Optional[torch.Tensor],
-                         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-                         aq_out: Optional[torch.Tensor] = None):
-
+def deepgemm_moe_permute(
+    aq: torch.Tensor,
+    aq_scale: torch.Tensor,
+    topk_ids: torch.Tensor,
+    local_num_experts: int,
+    expert_map: torch.Tensor | None,
+    expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    aq_out: torch.Tensor | None = None,
+):
     assert aq.ndim == 2
-    assert topk_ids.dtype.is_signed, (
-        "The kernel uses -1 to represent invalid topk_ids")
+    assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids"
     H = aq.size(1)
     device = aq.device
 
-    block_m = deep_gemm_block_shape()[0]
-    block_k = deep_gemm_block_shape()[1]
+    block_m, block_k = get_mk_alignment_for_contiguous_layout()
 
-    M_sum = compute_aligned_M(M=topk_ids.size(0),
-                              num_topk=topk_ids.size(1),
-                              local_num_experts=local_num_experts,
-                              alignment=block_m,
-                              expert_tokens_meta=expert_tokens_meta)
+    M_sum = compute_aligned_M(
+        M=topk_ids.size(0),
+        num_topk=topk_ids.size(1),
+        local_num_experts=local_num_experts,
+        alignment=block_m,
+        expert_tokens_meta=expert_tokens_meta,
+    )
 
-    expert_start_loc = torch.empty((local_num_experts),
-                                   device=device,
-                                   dtype=torch.int32)
+    expert_start_loc = torch.empty(
+        (local_num_experts), device=device, dtype=torch.int32
+    )
 
     assert aq_out is None or aq_out.shape == (M_sum, H)
     if aq_out is None:
         aq_out = torch.empty((M_sum, H), device=device, dtype=aq.dtype)
 
-    aq_scale_out = torch.empty((M_sum, H // block_k),
-                               device=device,
-                               dtype=torch.float32)
+    aq_scale_out = torch.empty(
+        (M_sum, H // block_k), device=device, dtype=torch.float32
+    )
 
-    maybe_has_empty_blocks = ((expert_tokens_meta is None)
-                              or (expert_tokens_meta.expert_num_tokens_cpu
-                                  is None))
+    maybe_has_empty_blocks = (expert_tokens_meta is None) or (
+        expert_tokens_meta.expert_num_tokens_cpu is None
+    )
     expert_ids_init = torch.zeros if maybe_has_empty_blocks else torch.empty
 
     expert_ids = expert_ids_init((M_sum), device=device, dtype=torch.int32)
@@ -379,35 +378,39 @@ def deepgemm_moe_permute(aq: torch.Tensor,
     if expert_tokens_meta is not None:
         expert_num_tokens = expert_tokens_meta.expert_num_tokens
     else:
-        expert_num_tokens = count_expert_num_tokens(topk_ids,
-                                                    local_num_experts,
-                                                    expert_map)
-
-    ep_scatter(recv_x=aq,
-               recv_x_scale=aq_scale,
-               recv_topk=topk_ids,
-               num_recv_tokens_per_expert=expert_num_tokens,
-               expert_start_loc=expert_start_loc,
-               expert_map=expert_map,
-               output_tensor=aq_out,
-               output_tensor_scale=aq_scale_out,
-               m_indices=expert_ids,
-               output_index=inv_perm)
+        expert_num_tokens = count_expert_num_tokens(
+            topk_ids, local_num_experts, expert_map
+        )
+
+    ep_scatter(
+        recv_x=aq,
+        recv_x_scale=aq_scale,
+        recv_topk=topk_ids,
+        num_recv_tokens_per_expert=expert_num_tokens,
+        expert_start_loc=expert_start_loc,
+        expert_map=expert_map,
+        output_tensor=aq_out,
+        output_tensor_scale=aq_scale_out,
+        m_indices=expert_ids,
+        output_index=inv_perm,
+    )
 
     return aq_out, aq_scale_out, expert_ids, inv_perm
 
 
 def deepgemm_unpermute_and_reduce(
-        a: torch.Tensor,  # Grouped gemm output
-        topk_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        inv_perm: torch.Tensor,
-        expert_map: Optional[torch.Tensor],
-        output: torch.Tensor):
-
-    return ep_gather(input_tensor=a,
-                     recv_topk_ids=topk_ids,
-                     recv_topk_weight=topk_weights,
-                     input_index=inv_perm,
-                     expert_map=expert_map,
-                     output_tensor=output)
+    a: torch.Tensor,  # Grouped gemm output
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    inv_perm: torch.Tensor,
+    expert_map: torch.Tensor | None,
+    output: torch.Tensor,
+):
+    return ep_gather(
+        input_tensor=a,
+        recv_topk_ids=topk_ids,
+        recv_topk_weight=topk_weights,
+        input_index=inv_perm,
+        expert_map=expert_map,
+        output_tensor=output,
+    )
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
index 9e9a9afc18a03e33370c131ac7a45ab45d42cc26..929cff79980c004a5a08466fde1af40ed74429d2 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import deep_ep
 import torch
@@ -8,15 +8,21 @@ import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceContiguous, TopKWeightAndReduceDelegate)
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
-from vllm.utils import round_up
+    TopKWeightAndReduceContiguous,
+    TopKWeightAndReduceDelegate,
+)
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
+from vllm.utils.math_utils import round_up
 from vllm.v1.worker.ubatching import (
-    dbo_current_ubatch_id, dbo_enabled, dbo_switch_to_comm,
-    dbo_switch_to_compute, dbo_switch_to_compute_sync,
+    dbo_current_ubatch_id,
+    dbo_enabled,
+    dbo_get_previous_event,
+    dbo_switch_to_comm,
+    dbo_switch_to_compute,
+    dbo_switch_to_compute_sync,
     dbo_yield_and_switch_from_comm_to_compute,
-    dbo_yield_and_switch_from_compute_to_comm)
+    dbo_yield_and_switch_from_compute_to_comm,
+)
 
 
 class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
@@ -25,8 +31,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     """
 
     @staticmethod
-    def maybe_roundup_layer_hidden_size(hidden_size: int,
-                                        dtype: torch.dtype) -> int:
+    def maybe_roundup_layer_hidden_size(hidden_size: int, dtype: torch.dtype) -> int:
         # Round up hidden size so it is compatible with DeepEP High Throughput
         # kernels.
         # DeepEP intranode kernels make copies in units of,
@@ -41,8 +46,13 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         hidden_size_bytes = round_up(hidden_size_bytes, xfer_atom_size)
         return hidden_size_bytes // dtype.itemsize
 
-    def __init__(self, buffer: deep_ep.Buffer, num_dispatchers: int,
-                 dp_size: int, rank_expert_offset: int):
+    def __init__(
+        self,
+        buffer: deep_ep.Buffer,
+        num_dispatchers: int,
+        dp_size: int,
+        rank_expert_offset: int,
+    ):
         super().__init__()
         self.buffer = buffer
         self.num_dispatchers_ = num_dispatchers
@@ -61,22 +71,25 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
+    def output_is_reduced(self) -> bool:
+        return True
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return None
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.int64
 
-    def _get_dispatch_config(self) -> Optional[deep_ep.Config]:
+    def _get_dispatch_config(self) -> deep_ep.Config | None:
         if self.num_dispatchers_ not in self.available_rank_configs:
             return None
         return deep_ep.Buffer.get_dispatch_config(self.num_dispatchers_)
 
-    def _get_combine_config(self) -> Optional[deep_ep.Config]:
+    def _get_combine_config(self) -> deep_ep.Config | None:
         if self.num_dispatchers_ not in self.available_rank_configs:
             return None
         return deep_ep.Buffer.get_combine_config(self.num_dispatchers_)
@@ -84,14 +97,13 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def _do_dispatch(
         self,
         tokens: torch.Tensor,
-        token_scales: Optional[torch.Tensor],
+        token_scales: torch.Tensor | None,
         rank_topk_ids: torch.Tensor,
         rank_topk_weights: torch.Tensor,
         num_experts: int,
-        a1_scale: Optional[torch.Tensor],
+        a1_scale: torch.Tensor | None,
         quant_config: FusedMoEQuantConfig,
     ) -> Callable:
-
         has_scales = token_scales is not None
 
         # We yield before launching the dispatch kernel since the dispatch
@@ -99,22 +111,35 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # for the other ubatch before the dispatch kernel starts.
         dbo_yield_and_switch_from_compute_to_comm()
 
-        (num_tokens_per_rank, num_tokens_per_rdma_rank,
-         dispatch_expert_num_tokens, is_token_in_rank,
-         event) = self.buffer.get_dispatch_layout(
-             topk_idx=rank_topk_ids,
-             num_experts=num_experts,
-             previous_event=None,
-             async_finish=False,
-             allocate_on_comm_stream=False)
+        # capture a DeepEP event and pass it as previous_event so
+        # DeepEP honors the dependency internally.
+        previous_event = dbo_get_previous_event(self.buffer.capture)
+
+        (
+            num_tokens_per_rank,
+            num_tokens_per_rdma_rank,
+            dispatch_expert_num_tokens,
+            is_token_in_rank,
+            event,
+        ) = self.buffer.get_dispatch_layout(
+            topk_idx=rank_topk_ids,
+            num_experts=num_experts,
+            previous_event=previous_event,
+            async_finish=False,
+            allocate_on_comm_stream=False,
+        )
 
         token_data = tokens
         if has_scales:
             token_data = (tokens, token_scales)
 
         (
-            token_data, expert_topk_ids, expert_topk_weights,
-            expert_num_tokens_per_expert_list, handle, event
+            token_data,
+            expert_topk_ids,
+            expert_topk_weights,
+            expert_num_tokens_per_expert_list,
+            handle,
+            event,
         ) = self.buffer.dispatch(
             x=token_data,
             handle=None,
@@ -128,9 +153,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             # to this value.
             expert_alignment=1,
             config=self._get_dispatch_config(),
-            previous_event=None,
+            previous_event=previous_event,
             async_finish=self.async_prepare and not dbo_enabled(),
-            allocate_on_comm_stream=False)
+            allocate_on_comm_stream=False,
+        )
 
         # record the handle for this ubatch
         a2a_idx = dbo_current_ubatch_id()
@@ -154,12 +180,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self,
         event: deep_ep.EventOverlap,
         has_scales: bool,
-        token_data: Union[tuple[torch.Tensor, torch.Tensor], torch.Tensor],
-        expert_topk_ids: Optional[torch.Tensor],
+        token_data: tuple[torch.Tensor, torch.Tensor] | torch.Tensor,
+        expert_topk_ids: torch.Tensor | None,
         num_experts: int,
         expert_num_tokens_per_expert_list: list[int],
-        expert_topk_weights: Optional[torch.Tensor],
-        a1_scale: Optional[torch.Tensor],
+        expert_topk_weights: torch.Tensor | None,
+        a1_scale: torch.Tensor | None,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
         if event.event is not None:
@@ -185,13 +211,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         expert_topk_ids = torch.where(
             expert_topk_ids == -1,
             num_experts - 1 if self.rank_expert_offset == 0 else 0,
-            expert_topk_ids + self.rank_expert_offset)
+            expert_topk_ids + self.rank_expert_offset,
+        )
 
         # Makes a GPU-CPU copy.
         # TODO (varun): Maybe it is better to re-compute the expert_num_tokens
         # on GPU.
         expert_tokens_meta = mk.ExpertTokensMetadata.make_from_list(
-            expert_num_tokens_per_expert_list, device=expert_x.device)
+            expert_num_tokens_per_expert_list, device=expert_x.device
+        )
 
         # Dispatch and Quant
         # DeepEP kernels only support dispatching block-quantized
@@ -206,10 +234,16 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                     a1_scale,
                     quant_dtype=quant_config.quant_dtype,
                     per_act_token_quant=False,
-                    block_shape=quant_config.block_shape)
+                    block_shape=quant_config.block_shape,
+                )
 
-        return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids,
-                expert_topk_weights)
+        return (
+            expert_x,
+            expert_x_scale,
+            expert_tokens_meta,
+            expert_topk_ids,
+            expert_topk_weights,
+        )
 
     def supports_async(self) -> bool:
         return True
@@ -220,16 +254,16 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.ReceiverType:
-
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
             assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1")
+                "apply_router_weight_on_input is only implemented for topk=1"
+            )
             a1 = a1 * topk_weights.to(a1.dtype)
 
         if quant_config.is_block_quantized:
@@ -249,13 +283,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             a1q_scale = None
             a1_post_scale = quant_config.a1_scale
 
-        return self._do_dispatch(tokens=a1q,
-                                 token_scales=a1q_scale,
-                                 rank_topk_ids=topk_ids,
-                                 rank_topk_weights=topk_weights,
-                                 num_experts=num_experts,
-                                 a1_scale=a1_post_scale,
-                                 quant_config=quant_config)
+        return self._do_dispatch(
+            tokens=a1q,
+            token_scales=a1q_scale,
+            rank_topk_ids=topk_ids,
+            rank_topk_weights=topk_weights,
+            num_experts=num_experts,
+            a1_scale=a1_post_scale,
+            quant_config=quant_config,
+        )
 
     def prepare(
         self,
@@ -263,13 +299,19 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
-        receiver = self.prepare_async(a1, topk_weights, topk_ids, num_experts,
-                                      expert_map, apply_router_weight_on_input,
-                                      quant_config)
+        receiver = self.prepare_async(
+            a1,
+            topk_weights,
+            topk_ids,
+            num_experts,
+            expert_map,
+            apply_router_weight_on_input,
+            quant_config,
+        )
         return receiver()
 
     def _finalize(
@@ -281,8 +323,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
         do_async: bool,
-    ) -> Optional[Callable]:
-
+    ) -> Callable | None:
         a2a_idx = dbo_current_ubatch_id()
         handle = self.handles[a2a_idx]
         assert handle is not None
@@ -300,14 +341,20 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
         dbo_yield_and_switch_from_compute_to_comm()
+        assert fused_expert_output.dtype == torch.bfloat16, (
+            f"Expected fused_expert_output bfloat16, got {fused_expert_output.dtype}"
+        )
+        previous_event = dbo_get_previous_event(self.buffer.capture)
         combined_x, _, event = self.buffer.combine(
+            # HT combine only supports BF16
             x=fused_expert_output,
             handle=handle,
             topk_weights=None,
             config=self._get_combine_config(),
-            previous_event=None,
+            previous_event=previous_event,
             async_finish=do_async and not dbo_enabled(),
-            allocate_on_comm_stream=False)
+            allocate_on_comm_stream=False,
+        )
 
         dbo_switch_to_compute()
 
@@ -341,9 +388,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
     ) -> Callable:
-        receiver = self._finalize(output, fused_expert_output, topk_weights,
-                                  topk_ids, apply_router_weight_on_input,
-                                  weight_and_reduce_impl, True)
+        receiver = self._finalize(
+            output,
+            fused_expert_output,
+            topk_weights,
+            topk_ids,
+            apply_router_weight_on_input,
+            weight_and_reduce_impl,
+            True,
+        )
         assert receiver is not None
         return receiver
 
@@ -356,6 +409,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
     ) -> None:
-        self._finalize(output, fused_expert_output, topk_weights, topk_ids,
-                       apply_router_weight_on_input, weight_and_reduce_impl,
-                       False)
+        self._finalize(
+            output,
+            fused_expert_output,
+            topk_weights,
+            topk_ids,
+            apply_router_weight_on_input,
+            weight_and_reduce_impl,
+            False,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index a9554291db69ce8c4eb38a561246ef5b46fecb8d..06c9df317f7c75683329476652b0c4d094db26ae 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -1,26 +1,36 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import deep_ep
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate)
+    TopKWeightAndReduceDelegate,
+)
 from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input, normalize_batched_scales_shape)
-from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled,
-                                      dbo_maybe_run_recv_hook)
+    moe_kernel_quantize_input,
+    normalize_batched_scales_shape,
+)
+from vllm.v1.worker.ubatching import (
+    dbo_current_ubatch_id,
+    dbo_enabled,
+    dbo_maybe_run_recv_hook,
+)
+
+logger = init_logger(__name__)
 
 # DeepEP kernels quantize dispatch inputs in 128 element chunks.
 DEEPEP_QUANT_BLOCK_SIZE = 128
 DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE]
 
 
-def dequant_fp8(expert_x_fp8: torch.Tensor,
-                expert_x_scales: torch.Tensor) -> torch.Tensor:
+def dequant_fp8(
+    expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor
+) -> torch.Tensor:
     """
     Return dequantized tensor in fp32
     """
@@ -30,7 +40,8 @@ def dequant_fp8(expert_x_fp8: torch.Tensor,
     num_experts = expert_x_fp8.size(0)
 
     expert_x_fp32 = expert_x_fp8.to(torch.float32).view(
-        num_experts, -1, DEEPEP_QUANT_BLOCK_SIZE)
+        num_experts, -1, DEEPEP_QUANT_BLOCK_SIZE
+    )
     expert_x_scales = expert_x_scales.view(num_experts, -1, 1)
     return (expert_x_fp32 * expert_x_scales).view(expert_x_fp8.size())
 
@@ -42,13 +53,39 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
     # DeepEP low-latency kernels are compiled only for certain
     # specific hidden sizes.
-    SUPPORTED_HIDDEN_SIZES = [2048, 2560, 4096, 5120, 6144, 7168]
+    # NOTE: Keep this list sorted, maybe_roundup_layer_hidden_size depends
+    # on it.
+    SUPPORTED_HIDDEN_SIZES = [2048, 2560, 3072, 4096, 5120, 6144, 7168, 8192]
+
+    @staticmethod
+    def maybe_roundup_layer_hidden_size(hidden_size: int) -> int:
+        # Round up hidden size to the closest supported hidden size.
+        _supported_hs = DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES
+        # Check sorted
+        num_supported_hs = len(_supported_hs)
+        assert all(
+            [
+                _supported_hs[i] < _supported_hs[i + 1]
+                for i in range(num_supported_hs - 1)
+            ]
+        )
+
+        for x in _supported_hs:
+            if x >= hidden_size:
+                return x
+
+        raise ValueError(
+            f"Hidden Size {hidden_size} is greater than the "
+            f"maximum supported hidden size {_supported_hs[-1]}"
+        )
 
-    def __init__(self,
-                 buffer: deep_ep.Buffer,
-                 max_tokens_per_rank: int,
-                 num_dispatchers: int,
-                 use_fp8_dispatch: bool = False):
+    def __init__(
+        self,
+        buffer: deep_ep.Buffer,
+        max_tokens_per_rank: int,
+        num_dispatchers: int,
+        use_fp8_dispatch: bool = False,
+    ):
         super().__init__()
 
         self.buffer = buffer
@@ -57,32 +94,60 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # The dispatch function returns a handle that the combine function
         # requires. We store the handle here so it is available to the
         # combine function.
-        self.handles: list[Optional[tuple]] = [None, None]
+        self.handles: list[tuple | None] = [None, None]
         self.num_dispatchers_ = num_dispatchers
 
+        # We don't have enough information to determine if we should dispatch
+        # activation scales in a packed ue8m0 format during object construction
+        # time. This setting is handled by post_init_setup.
+        self.use_ue8m0_dispatch = False
+
+    def post_init_setup(self, fused_experts: mk.FusedMoEPermuteExpertsUnpermute):
+        if not fused_experts.supports_packed_ue8m0_act_scales():
+            # Early exit.
+            return
+
+        if self.use_fp8_dispatch:
+            logger.debug_once(
+                "Update DeepEPLLPrepareFinalize to do packed ue8m0 scales dispatch."
+            )
+            self.use_ue8m0_dispatch = True
+        else:
+            logger.warning_once(
+                "DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized "
+                f"activations despite ({fused_experts.__class__.__name__}) being able "
+                "to support quantized activations.",
+                scope="local",
+            )
+
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
+    def output_is_reduced(self) -> bool:
+        return True
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return self.max_tokens_per_rank
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.int64
 
     def _do_quant(
         self,
-        x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         a1_dtype: torch.dtype,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if self.use_fp8_dispatch:
-            block_k = quant_config.block_shape[
-                1] if quant_config.block_shape is not None else None
+            block_k = (
+                quant_config.block_shape[1]
+                if quant_config.block_shape is not None
+                else None
+            )
             if block_k == DEEPEP_QUANT_BLOCK_SIZE:
                 # DeepEP kernels did the quantization for us.
                 x, x_scales = x
@@ -99,8 +164,12 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # TODO (varun): Optimization - Use a batched version of quant
         x = x.view((-1, hidden_dim))
         x, x_scales = moe_kernel_quantize_input(
-            x, quant_config.a1_scale, quant_config.quant_dtype,
-            quant_config.per_act_token_quant, quant_config.block_shape)
+            x,
+            quant_config.a1_scale,
+            quant_config.quant_dtype,
+            quant_config.per_act_token_quant,
+            quant_config.block_shape,
+        )
         x = x.view((num_experts, -1, hidden_dim))
 
         if quant_config.quant_dtype is not None:
@@ -118,65 +187,83 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> tuple[Callable, mk.ReceiverType]:
-
         hidden_size = a1.size(1)
-        assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \
-            (f"Hidden Size {hidden_size} not in supported list of hidden sizes"
-            f"{self.SUPPORTED_HIDDEN_SIZES}")
+        assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, (
+            f"Hidden Size {hidden_size} not in supported list of hidden sizes"
+            f"{self.SUPPORTED_HIDDEN_SIZES}"
+        )
 
         a2a_idx = dbo_current_ubatch_id()
 
         if self.use_fp8_dispatch:
-            assert hidden_size % 128 == 0, \
-            "DeepEP kernels quantize the inputs in blocks of shape 128"
-
-        has_per_token_scales = quant_config.a1_scale.numel(
-        ) != 1 if quant_config.a1_scale is not None else (
-            quant_config.a2_scale.numel() != 1
-            if quant_config.a2_scale is not None else False)
+            assert hidden_size % 128 == 0, (
+                "DeepEP kernels quantize the inputs in blocks of shape 128"
+            )
+
+        has_per_token_scales = (
+            quant_config.a1_scale.numel() != 1
+            if quant_config.a1_scale is not None
+            else (
+                quant_config.a2_scale.numel() != 1
+                if quant_config.a2_scale is not None
+                else False
+            )
+        )
         assert not has_per_token_scales, (
-            "low_latency kernels doesn't support dispatching per-token scales")
+            "low_latency kernels doesn't support dispatching per-token scales"
+        )
 
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
             assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1")
+                "apply_router_weight_on_input is only implemented for topk=1"
+            )
             a1 = a1 * topk_weights.to(a1.dtype)
 
         # Dispatch
-        expert_x, expert_num_tokens, handle, _, hook= \
-                self.buffer.low_latency_dispatch(a1,
-                                                topk_ids,
-                                                self.max_tokens_per_rank,
-                                                num_experts,
-                                                use_fp8=self.use_fp8_dispatch,
-                                                async_finish=False,
-                                                return_recv_hook=True)
+        expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch(
+            a1,
+            topk_ids,
+            self.max_tokens_per_rank,
+            num_experts,
+            use_fp8=self.use_fp8_dispatch,
+            # round_scale needs to be set to dispatch in ue8m0
+            round_scale=self.use_ue8m0_dispatch,
+            use_ue8m0=self.use_ue8m0_dispatch,
+            async_finish=False,
+            return_recv_hook=True,
+        )
         self.handles[a2a_idx] = handle
 
         return (
             hook,
-            lambda: self._receiver(expert_x, expert_num_tokens, quant_config.
-                                   a1_scale, a1.dtype, quant_config))
+            lambda: self._receiver(
+                expert_x,
+                expert_num_tokens,
+                quant_config.a1_scale,
+                a1.dtype,
+                quant_config,
+            ),
+        )
 
     def _receiver(
         self,
-        expert_x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        expert_x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         expert_num_tokens: torch.Tensor,
-        a1_scale: Optional[torch.Tensor],
+        a1_scale: torch.Tensor | None,
         a1_dtype: torch.dtype,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
-        expert_x, expert_x_scale = self._do_quant(expert_x, a1_dtype,
-                                                  quant_config)
+        expert_x, expert_x_scale = self._do_quant(expert_x, a1_dtype, quant_config)
 
         expert_tokens_meta = mk.ExpertTokensMetadata(
-            expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None)
+            expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None
+        )
 
         return expert_x, expert_x_scale, expert_tokens_meta, None, None
 
@@ -186,14 +273,19 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
-        hook, receiver = self.prepare_async(a1, topk_weights, topk_ids,
-                                            num_experts, expert_map,
-                                            apply_router_weight_on_input,
-                                            quant_config)
+        hook, receiver = self.prepare_async(
+            a1,
+            topk_weights,
+            topk_ids,
+            num_experts,
+            expert_map,
+            apply_router_weight_on_input,
+            quant_config,
+        )
         hook()
         return receiver()
 
@@ -207,9 +299,9 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
         do_async: bool,
     ) -> tuple[Callable, Callable]:
-        assert isinstance(
-            weight_and_reduce_impl, TopKWeightAndReduceDelegate
-        ), ("Weight application and reduction happens in the combine kernel.")
+        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate), (
+            "Weight application and reduction happens in the combine kernel."
+        )
 
         a2a_idx = dbo_current_ubatch_id()
         do_recv_hook = dbo_enabled() or do_async
@@ -231,7 +323,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             async_finish=False,
             zero_copy=False,
             return_recv_hook=do_recv_hook,
-            out=output)
+            out=output,
+        )
 
         return recv_hook, lambda: None
 
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index 3ea4ed39e956888e570d7b0c9296f2fa50f04388..f864634c661768deb3f4d939915461599b7296da 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -8,40 +7,47 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-    create_flashinfer_prepare_finalize)
+    create_flashinfer_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP)
-from vllm.utils.flashinfer import (flashinfer_cutlass_fused_moe,
-                                   has_flashinfer_cutlass_fused_moe)
+    TopKWeightAndReduceNoOP,
+)
+from vllm.utils.flashinfer import (
+    flashinfer_cutlass_fused_moe,
+    has_flashinfer_cutlass_fused_moe,
+)
 
 logger = init_logger(__name__)
 
 
-def is_valid_flashinfer_cutlass_fused_moe(hidden_states: torch.Tensor,
-                                          w1: torch.Tensor,
-                                          w2: torch.Tensor) -> bool:
+def is_valid_flashinfer_cutlass_fused_moe(
+    hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor
+) -> bool:
     """
     Check if the given problem size is supported by the FlashInfer CUTLASS MoE
     kernel.
     """
     if not has_flashinfer_cutlass_fused_moe():
-        logger.debug_once("FlashInferExperts disabled: "
-                          "flashinfer_cutlass_fused_moe not available.")
+        logger.debug_once(
+            "FlashInferExperts disabled: flashinfer_cutlass_fused_moe not available."
+        )
         return False
     # Data type checks
-    if (w1.dtype != torch.uint8 or w2.dtype != torch.uint8
-            or hidden_states.dtype
-            not in [torch.float32, torch.float16, torch.bfloat16]):
+    if (
+        w1.dtype != torch.uint8
+        or w2.dtype != torch.uint8
+        or hidden_states.dtype not in [torch.float32, torch.float16, torch.bfloat16]
+    ):
         logger.debug_once(
             "FlashInferExperts disabled: w1/w2 must be torch.uint8 "
             f"(got w1={w1.dtype}, w2={w2.dtype}), hidden_states must be "
-            f"float32, float16, or bfloat16 (got {hidden_states.dtype}).")
+            f"float32, float16, or bfloat16 (got {hidden_states.dtype})."
+        )
         return False
     return True
 
 
 class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         out_dtype: torch.dtype,
@@ -50,24 +56,33 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         ep_size: int = 1,
         tp_rank: int = 0,
         tp_size: int = 1,
+        use_dp: bool = False,
+        use_deepseek_fp8_block_scale: bool = False,
     ):
         super().__init__(quant_config)
-        assert quant_config.quant_dtype in (
-            "nvfp4", torch.float8_e4m3fn,
-            None), ("Only nvfp4, fp8, bfloat16 and"
-                    " float16 quantization are currently supported.")
+        assert quant_config.quant_dtype in ("nvfp4", torch.float8_e4m3fn, None), (
+            "Only nvfp4, fp8, bfloat16 and"
+            " float16 quantization are currently supported."
+        )
         self.ep_rank = ep_rank
         self.ep_size = ep_size
         self.tp_rank = tp_rank
         self.tp_size = tp_size
         self.out_dtype = out_dtype
+        self.use_dp = use_dp
+        # Enables DeepSeek-style FP8 block-scale path:
+        # - pass per-block weight scales to the kernel
+        # - skip input activation quantization (kernel applies scaling)
+        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
 
     def supports_expert_map(self) -> bool:
         return False
@@ -81,16 +96,14 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # We use global_num_experts due to how moe_align_block_size handles
         # expert_maps.
         """
@@ -109,16 +122,13 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         - Note: in order for activation chunking to work, the first dimension
           of each tuple must be the number of tokens.
         """
-        aq_m, aq_n = aq.shape
-        workspace2 = (0, )
-        output_shape = (aq_m,
-                        aq_n * 2) if self.quant_dtype == "nvfp4" else (aq_m,
-                                                                       aq_n)
-        workspace_dtype = a.dtype
-        workspace1 = output_shape
+        workspace1 = (M, K)
+        workspace2 = (0,)
+        # For TP, the quantization is fused with fused_moe call.
+        output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" and self.use_dp else K)
         # The workspace is determined by `aq`, since it comes after any
         # potential communication op and is involved in the expert computation.
-        return (workspace1, workspace2, output_shape, workspace_dtype)
+        return (workspace1, workspace2, output_shape)
 
     def apply(
         self,
@@ -130,21 +140,35 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
-        workspace13: Optional[torch.Tensor],
-        workspace2: Optional[torch.Tensor],
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-        apply_router_weight_on_input: Optional[bool],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor | None,
+        workspace2: torch.Tensor | None,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool | None,
     ):
+        from flashinfer.fused_moe.core import ActivationType
 
-        assert activation == "silu", ("Only activation silu is supported in "
-                                      "FlashInferExperts")
+        activation_str_to_value_map = {
+            "silu": ActivationType.Swiglu,  # This is the default
+            "relu2_no_mul": ActivationType.Relu2,
+        }
+        assert activation in activation_str_to_value_map, (
+            f"{activation=} missing from {activation_str_to_value_map.keys()=}"
+        )
 
-        if self.quant_dtype == torch.float8_e4m3fn:
+        # Select quantization metadata based on FP8 format/path
+        if (
+            self.quant_dtype == torch.float8_e4m3fn
+            and not self.use_deepseek_fp8_block_scale
+        ):
+            # FP8 per-tensor path: use global alphas/scales; do not pass input_sf
             quant_scales = [
-                self.g1_alphas, self.a2_gscale, self.g2_alphas, self.a1_gscale
+                self.g1_alphas,
+                self.a2_gscale,
+                self.g2_alphas,
+                self.a1_gscale,
             ]
 
             a1q_scale = None  # not passing input_sf in fp8
@@ -153,8 +177,8 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         elif self.quant_dtype == "nvfp4":
             # Ensure w1_scale and w2_scale are not None before calling view
             assert self.w1_scale is not None and self.w2_scale is not None, (
-                "w1_scale and w2_scale must not "
-                "be None for FlashInferExperts")
+                "w1_scale and w2_scale must not be None for FlashInferExperts"
+            )
             # Flashinfer CUTLASS kernel takes scalar global scales,
             # min because inv_scale.
             quant_scales = [
@@ -168,6 +192,15 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
             # FlashInfer API requires weight to be long for nvfp4
             fc1_expert_weights = w1.view(torch.long)
             fc2_expert_weights = w2.view(torch.long)
+        elif self.use_deepseek_fp8_block_scale:
+            # FP8 block-scale path: provide block-scale weights, omit a1q_scale
+            quant_scales = [
+                self.w1_scale,
+                self.w2_scale,
+            ]
+            a1q_scale = None
+            fc1_expert_weights = w1
+            fc2_expert_weights = w2
         else:
             quant_scales = None
             a1q_scale = None
@@ -188,6 +221,9 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
             ep_size=self.ep_size,
             ep_rank=self.ep_rank,
             output=output,
+            activation_type=activation_str_to_value_map[activation],
+            # Informs FlashInfer to use the block-scale decoding path when True
+            use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
         )
 
 
@@ -201,7 +237,7 @@ def flashinfer_cutlass_moe_fp4(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
     fused_experts = mk.FusedMoEModularKernel(
@@ -209,7 +245,9 @@ def flashinfer_cutlass_moe_fp4(
         FlashInferExperts(
             out_dtype=hidden_states.dtype,
             quant_config=quant_config,
-        ))
+            use_dp=False,
+        ),
+    )
 
     return fused_experts(
         hidden_states=hidden_states,
@@ -235,7 +273,7 @@ def flashinfer_cutlass_moe(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     tp_rank: int = 0,
     tp_size: int = 1,
@@ -252,7 +290,8 @@ def flashinfer_cutlass_moe(
             tp_size=tp_size,
             ep_rank=ep_rank,
             ep_size=ep_size,
-        ))
+        ),
+    )
 
     return fused_experts(
         hidden_states=hidden_states,
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
index ed364ac77b2862d3e77add02e35d5b9238daf800..762890867e6054f80b220848374fcb2a50e685bf 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
@@ -1,17 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.distributed import get_dp_group, get_ep_group
 from vllm.distributed.device_communicators.base_device_communicator import (
-    All2AllManagerBase)
+    All2AllManagerBase,
+)
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils.flashinfer import nvfp4_block_scale_interleave
 
 
@@ -26,25 +28,32 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self,
         use_dp: bool,
         num_dispatchers: int = 1,
+        use_deepseek_fp8_block_scale: bool = False,
     ):
         super().__init__()
         self.num_dispatchers_ = num_dispatchers
         self.use_dp = use_dp
         self.local_tokens = None
+        # Toggle for DeepSeek-style FP8 block-scale path where activations are
+        # not quantized here and weight block scales are consumed by the kernel.
+        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
 
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return None
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return None
 
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
+    def output_is_reduced(self) -> bool:
+        return False
+
     def _apply_router_weight_on_input(
         self,
         a1: torch.Tensor,
@@ -55,28 +64,28 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         """Apply router weight on input if needed."""
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
-            assert topk == 1, \
+            assert topk == 1, (
                 "apply_router_weight_on_input is only implemented for topk=1"
+            )
             a1.mul_(topk_weights.to(a1.dtype))
 
 
-class FlashInferAllToAllMoEPrepareAndFinalize(
-        FlashInferCutlassMoEPrepareAndFinalize):
+class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
     """FlashInfer implementation using AllToAll communication."""
 
     def __init__(
         self,
         use_dp: bool,
         num_dispatchers: int = 1,
+        use_deepseek_fp8_block_scale: bool = False,
     ):
-        super().__init__(use_dp, num_dispatchers)
+        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
         self.alltoall_info = None
 
         # Initialize all2all_manager only for DP case
         self.all2all_manager = None
         if self.use_dp:
-            self.all2all_manager = get_ep_group(
-            ).device_communicator.all2all_manager
+            self.all2all_manager = get_ep_group().device_communicator.all2all_manager
 
     def prepare(
         self,
@@ -84,41 +93,47 @@ class FlashInferAllToAllMoEPrepareAndFinalize(
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
-
-        self._apply_router_weight_on_input(a1, topk_weights, topk_ids,
-                                           apply_router_weight_on_input)
+        self._apply_router_weight_on_input(
+            a1, topk_weights, topk_ids, apply_router_weight_on_input
+        )
 
         if not self.use_dp:
-            # Non-DP case: standard quantization
-            a1q, a1q_scale = moe_kernel_quantize_input(
-                a1,
-                quant_config.a1_gscale,
-                quant_config.quant_dtype,
-                quant_config.per_act_token_quant,
-                quant_config.block_shape,
-                is_fp4_scale_swizzled=not self.use_dp,
-            )
+            # Non-DP case: quantize activations unless using block-scale path
+            if not self.use_deepseek_fp8_block_scale:
+                a1q, a1q_scale = moe_kernel_quantize_input(
+                    a1,
+                    quant_config.a1_gscale,
+                    quant_config.quant_dtype,
+                    quant_config.per_act_token_quant,
+                    quant_config.block_shape,
+                    is_fp4_scale_swizzled=not self.use_dp,
+                )
+            else:
+                a1q = a1
+                a1q_scale = None
         else:
             # DP case: use FlashInfer AllToAll
             global_num_tokens_cpu = get_local_sizes()
             top_k = topk_ids.size(1)
 
-            (self.alltoall_info, topk_ids, topk_weights, a1q,
-             a1q_scale) = flashinfer_alltoall_dispatch(
-                 self.all2all_manager,
-                 global_num_tokens_cpu,
-                 a1,
-                 quant_config.a1_gscale,
-                 topk_ids,
-                 topk_weights,
-                 top_k,
-                 num_experts,
-                 quant_config,
-             )
+            (self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale) = (
+                flashinfer_alltoall_dispatch(
+                    self.all2all_manager,
+                    global_num_tokens_cpu,
+                    a1,
+                    quant_config.a1_gscale,
+                    topk_ids,
+                    topk_weights,
+                    top_k,
+                    num_experts,
+                    quant_config,
+                    use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
+                )
+            )
 
         return a1q, a1q_scale, None, topk_ids, topk_weights
 
@@ -144,15 +159,14 @@ class FlashInferAllToAllMoEPrepareAndFinalize(
         output.copy_(fused_expert_output)
 
 
-class FlashInferAllGatherMoEPrepareAndFinalize(
-        FlashInferCutlassMoEPrepareAndFinalize):
-
+class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
     def __init__(
         self,
         use_dp: bool,
         num_dispatchers: int = 1,
+        use_deepseek_fp8_block_scale: bool = False,
     ):
-        super().__init__(use_dp, num_dispatchers)
+        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
 
     def prepare(
         self,
@@ -160,31 +174,53 @@ class FlashInferAllGatherMoEPrepareAndFinalize(
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
+        self._apply_router_weight_on_input(
+            a1, topk_weights, topk_ids, apply_router_weight_on_input
+        )
+        if not self.use_dp and quant_config.quant_dtype == "nvfp4":
+            return a1, None, None, topk_ids, topk_weights
 
-        self._apply_router_weight_on_input(a1, topk_weights, topk_ids,
-                                           apply_router_weight_on_input)
+        if not self.use_deepseek_fp8_block_scale:
+            a1q, a1q_scale = moe_kernel_quantize_input(
+                a1,
+                quant_config.a1_gscale,
+                quant_config.quant_dtype,
+                quant_config.per_act_token_quant,
+                quant_config.block_shape,
+                is_fp4_scale_swizzled=not self.use_dp,
+            )
+        else:
+            # Block-scale path: pass activations through, omit per-token scales
+            a1q = a1
+            a1q_scale = None
 
-        a1q, a1q_scale = moe_kernel_quantize_input(
-            a1,
-            quant_config.a1_gscale,
-            quant_config.quant_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-            is_fp4_scale_swizzled=not self.use_dp,
-        )
         if self.use_dp:
-            topk_weights, topk_ids, a1q, a1q_scale = \
-                get_dp_group().all_gatherv(
-                    [topk_weights, topk_ids, a1q, a1q_scale],
+            # Build gather list conditionally - omit a1q_scale if None
+            # (block-scale path)
+            gather_list = [topk_weights, topk_ids, a1q]
+            if a1q_scale is not None:
+                gather_list.append(a1q_scale)
+                gathered = get_dp_group().all_gatherv(
+                    gather_list,
+                    dim=0,
+                    sizes=get_local_sizes(),
+                )
+                topk_weights, topk_ids, a1q, a1q_scale = gathered
+            else:
+                gathered = get_dp_group().all_gatherv(
+                    gather_list,
                     dim=0,
                     sizes=get_local_sizes(),
                 )
-            if quant_config.quant_dtype == "nvfp4":
-                a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
+                topk_weights, topk_ids, a1q = gathered
+                a1q_scale = None
+
+        if quant_config.quant_dtype == "nvfp4" and a1q_scale is not None:
+            a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
 
         return a1q, a1q_scale, None, topk_ids, topk_weights
 
@@ -197,10 +233,12 @@ class FlashInferAllGatherMoEPrepareAndFinalize(
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
     ) -> None:
+        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)
 
         if self.use_dp:
             fused_expert_output = get_dp_group().reduce_scatterv(
-                fused_expert_output, dim=0, sizes=get_local_sizes())
+                fused_expert_output, dim=0, sizes=get_local_sizes()
+            )
         output.copy_(fused_expert_output)
 
 
@@ -214,53 +252,72 @@ def flashinfer_alltoall_dispatch(
     top_k: int,
     num_experts: int,
     quant_config: FusedMoEQuantConfig,
+    use_deepseek_fp8_block_scale: bool = False,
 ):
     from flashinfer.comm.trtllm_alltoall import MnnvlMoe
-    assert (all2all_manager.ensure_alltoall_workspace_initialized()
-            ), "FlashInfer AllToAll workspace not available"
+
+    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
+        "FlashInfer AllToAll workspace not available"
+    )
 
     ep_rank = all2all_manager.rank
     ep_size = all2all_manager.world_size
-    max_num_token = max(global_num_tokens_cpu
-                        ) if global_num_tokens_cpu is not None else x.shape[0]
+    max_num_token = (
+        max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
+    )
+    orig_topk_weights_dtype = topk_weights.dtype
     alltoall_info, topk_ids, topk_weights, _ = (
         MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
             topk_ids,
             topk_weights,
             None,
-            all2all_manager.prepare_workspace,
+            all2all_manager.prepare_workspace_tensor,
             max_num_token,
             ep_rank,
             ep_size,
             num_experts,
             num_experts,
             top_k,
-        ))
-
-    x, x_sf = moe_kernel_quantize_input(
-        x,
-        gs,
-        quant_config.quant_dtype,
-        quant_config.per_act_token_quant,
-        quant_config.block_shape,
-        is_fp4_scale_swizzled=False,  # delay swizzle to after comm
-    )
-    x = MnnvlMoe.mnnvl_moe_alltoallv(
-        x,
-        alltoall_info,
-        all2all_manager.workspace_tensor,
-        ep_rank,
-        ep_size,
+        )
     )
+    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)
 
-    x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
-        x_sf,
-        alltoall_info,
-        all2all_manager.workspace_tensor,
-        ep_rank,
-        ep_size,
-    )
-    x_sf = nvfp4_block_scale_interleave(x_sf)
+    if not use_deepseek_fp8_block_scale:
+        x, x_sf = moe_kernel_quantize_input(
+            x,
+            gs,
+            quant_config.quant_dtype,
+            quant_config.per_act_token_quant,
+            quant_config.block_shape,
+            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
+        )
+        x = MnnvlMoe.mnnvl_moe_alltoallv(
+            x,
+            alltoall_info,
+            all2all_manager.workspace_tensor,
+            ep_rank,
+            ep_size,
+        )
+
+        x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
+            x_sf,
+            alltoall_info,
+            all2all_manager.workspace_tensor,
+            ep_rank,
+            ep_size,
+        )
+        if quant_config.quant_dtype == "nvfp4":
+            x_sf = nvfp4_block_scale_interleave(x_sf)
+    else:
+        # Block-scale path: pass activations through without quantization
+        x_sf = None
+        x = MnnvlMoe.mnnvl_moe_alltoallv(
+            x,
+            alltoall_info,
+            all2all_manager.workspace_tensor,
+            ep_rank,
+            ep_size,
+        )
     return alltoall_info, topk_ids, topk_weights, x, x_sf
 
 
@@ -272,8 +329,10 @@ def flashinfer_alltoall_combine(
     alltoall_info,
 ):
     from flashinfer.comm.trtllm_alltoall import MnnvlMoe
-    assert (all2all_manager.ensure_alltoall_workspace_initialized()
-            ), "FlashInfer AllToAll workspace not available"
+
+    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
+        "FlashInfer AllToAll workspace not available"
+    )
     return MnnvlMoe.mnnvl_moe_alltoallv_combine(
         output,
         alltoall_info,
@@ -289,6 +348,7 @@ def create_flashinfer_prepare_finalize(
     use_dp: bool,
     use_nvfp4: bool = False,
     enable_alltoallv: bool = False,
+    use_deepseek_fp8_block_scale: bool = False,
 ) -> FlashInferCutlassMoEPrepareAndFinalize:
     """Factory function to create the appropriate FlashInfer implementation."""
     if use_nvfp4:
@@ -296,5 +356,7 @@ def create_flashinfer_prepare_finalize(
             return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
         else:
             return FlashInferAllGatherMoEPrepareAndFinalize(use_dp)
-    # Fp8 only supports AllGather
-    return FlashInferAllGatherMoEPrepareAndFinalize(use_dp)
+    # FP8 path currently supported via AllGather; optionally enable block-scale
+    return FlashInferAllGatherMoEPrepareAndFinalize(
+        use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
+    )
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
index fe586a22e2506ae1ef2f2b1bc7b7ed3a20bc1849..51e06ac54f497cc4ac2837297416f728b75c8c0e 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -1,45 +1,47 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import List  # noqa: UP035
-from typing import Optional
 
 import torch
 
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    calculate_tile_tokens_dim)
+    calculate_tile_tokens_dim,
+)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
-from vllm.utils import direct_register_custom_op
+    per_token_group_quant_fp8,
+)
+from vllm.utils.torch_utils import direct_register_custom_op
 
 
 def flashinfer_fused_moe_blockscale_fp8(
-        routing_logits: torch.Tensor,
-        routing_bias: torch.Tensor,
-        x: torch.Tensor,
-        w13_weight: torch.Tensor,
-        w13_weight_scale_inv: torch.Tensor,
-        w2_weight: torch.Tensor,
-        w2_weight_scale_inv: torch.Tensor,
-        global_num_experts: int,
-        top_k: int,
-        num_expert_group: int,
-        topk_group: int,
-        intermediate_size: int,
-        expert_offset: int,
-        local_num_experts: int,
-        block_shape: List[int],  #noqa: UP006
-        routed_scaling: float = 1.0) -> torch.Tensor:
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor,
+    x: torch.Tensor,
+    w13_weight: torch.Tensor,
+    w13_weight_scale_inv: torch.Tensor,
+    w2_weight: torch.Tensor,
+    w2_weight_scale_inv: torch.Tensor,
+    global_num_experts: int,
+    top_k: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
+    intermediate_size: int,
+    expert_offset: int,
+    local_num_experts: int,
+    block_shape: list[int],
+    routing_method_type: int = RoutingMethodType.DeepSeekV3,
+    routed_scaling: float | None = 1.0,
+) -> torch.Tensor:
     from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe
+
+    topk_group = topk_group if topk_group is not None else 0
     assert top_k <= global_num_experts
-    assert top_k <= 8
-    assert topk_group <= 4
-    assert global_num_experts > num_expert_group
-    assert global_num_experts % num_expert_group == 0
+    assert top_k <= 10
     assert global_num_experts % 4 == 0
-    assert top_k < (topk_group * global_num_experts / num_expert_group)
     assert block_shape == [128, 128]
+    # Routing kernel expects #experts <= #threads 512
+    assert global_num_experts <= 512
 
     a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1])
     # NOTE: scales of hidden states have to be transposed!
@@ -61,30 +63,31 @@ def flashinfer_fused_moe_blockscale_fp8(
         local_expert_offset=expert_offset,
         local_num_experts=local_num_experts,
         routed_scaling_factor=routed_scaling,
-        tile_tokens_dim=calculate_tile_tokens_dim(x.shape[0], top_k,
-                                                  global_num_experts),
-        routing_method_type=2,  # DeepSeek-styled routing method
+        tile_tokens_dim=None,
+        routing_method_type=routing_method_type,
         use_shuffled_weight=False,
     )
 
 
 def flashinfer_fused_moe_blockscale_fp8_fake(
-        routing_logits: torch.Tensor,
-        routing_bias: torch.Tensor,
-        x: torch.Tensor,
-        w13_weight: torch.Tensor,
-        w13_weight_scale_inv: torch.Tensor,
-        w2_weight: torch.Tensor,
-        w2_weight_scale_inv: torch.Tensor,
-        global_num_experts: int,
-        top_k: int,
-        num_expert_group: int,
-        topk_group: int,
-        intermediate_size: int,
-        expert_offset: int,
-        local_num_experts: int,
-        block_shape: list[int],
-        routed_scaling: float = 1.0) -> torch.Tensor:
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor,
+    x: torch.Tensor,
+    w13_weight: torch.Tensor,
+    w13_weight_scale_inv: torch.Tensor,
+    w2_weight: torch.Tensor,
+    w2_weight_scale_inv: torch.Tensor,
+    global_num_experts: int,
+    top_k: int,
+    num_expert_group: int,
+    topk_group: int,
+    intermediate_size: int,
+    expert_offset: int,
+    local_num_experts: int,
+    block_shape: list[int],
+    routing_method_type: int,
+    routed_scaling: float = 1.0,
+) -> torch.Tensor:
     return torch.empty_like(x)
 
 
@@ -93,30 +96,31 @@ direct_register_custom_op(
     op_name="flashinfer_fused_moe_blockscale_fp8",
     op_func=flashinfer_fused_moe_blockscale_fp8,
     fake_impl=flashinfer_fused_moe_blockscale_fp8_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(torch.Tag.needs_fixed_stride_order,),
 )
 
 
 def flashinfer_fused_moe_per_tensor_scale_fp8(
-        routing_logits: torch.Tensor,
-        routing_bias: Optional[torch.Tensor],
-        hidden_states: torch.Tensor,
-        input_scale: torch.Tensor,
-        gemm1_weights: torch.Tensor,
-        gemm2_weights: torch.Tensor,
-        output1_scales_scalar: torch.Tensor,
-        output1_scales_gate_scalar: torch.Tensor,
-        output2_scales_scalar: torch.Tensor,
-        num_experts: int,
-        top_k: int,
-        num_expert_group: Optional[int],
-        topk_group: Optional[int],
-        intermediate_size: int,
-        local_expert_offset: int,
-        local_num_experts: int,
-        use_routing_scales_on_input: bool,
-        routing_method_type: int,
-        routed_scaling_factor: float = 1.0) -> torch.Tensor:
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor | None,
+    hidden_states: torch.Tensor,
+    input_scale: torch.Tensor,
+    gemm1_weights: torch.Tensor,
+    gemm2_weights: torch.Tensor,
+    output1_scales_scalar: torch.Tensor,
+    output1_scales_gate_scalar: torch.Tensor,
+    output2_scales_scalar: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
+    intermediate_size: int,
+    local_expert_offset: int,
+    local_num_experts: int,
+    use_routing_scales_on_input: bool,
+    routing_method_type: int,
+    routed_scaling_factor: float = 1.0,
+) -> torch.Tensor:
     num_expert_group = num_expert_group if num_expert_group is not None else 0
     topk_group = topk_group if topk_group is not None else 0
 
@@ -124,10 +128,11 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
         hidden_states,
         input_scale,
         quant_dtype=torch.float8_e4m3fn,
-        per_act_token_quant=False)
+        per_act_token_quant=False,
+    )
+
+    from vllm.utils.flashinfer import flashinfer_trtllm_fp8_per_tensor_scale_moe
 
-    from vllm.utils.flashinfer import (
-        flashinfer_trtllm_fp8_per_tensor_scale_moe)
     return flashinfer_trtllm_fp8_per_tensor_scale_moe(
         routing_logits=routing_logits,
         routing_bias=routing_bias,
@@ -146,31 +151,34 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
         local_num_experts=local_num_experts,
         routed_scaling_factor=routed_scaling_factor,
         use_routing_scales_on_input=use_routing_scales_on_input,
-        tile_tokens_dim=calculate_tile_tokens_dim(hidden_states.shape[0],
-                                                  top_k, num_experts),
-        routing_method_type=routing_method_type)
+        tile_tokens_dim=calculate_tile_tokens_dim(
+            hidden_states.shape[0], top_k, num_experts
+        ),
+        routing_method_type=routing_method_type,
+    )
 
 
 def flashinfer_fused_moe_per_tensor_scale_fp8_fake(
-        routing_logits: torch.Tensor,
-        routing_bias: Optional[torch.Tensor],
-        hidden_states: torch.Tensor,
-        input_scale: torch.Tensor,
-        gemm1_weights: torch.Tensor,
-        gemm2_weights: torch.Tensor,
-        output1_scales_scalar: torch.Tensor,
-        output1_scales_gate_scalar: torch.Tensor,
-        output2_scales_scalar: torch.Tensor,
-        num_experts: int,
-        top_k: int,
-        num_expert_group: Optional[int],
-        topk_group: Optional[int],
-        intermediate_size: int,
-        local_expert_offset: int,
-        local_num_experts: int,
-        use_routing_scales_on_input: bool,
-        routing_method_type: int,
-        routed_scaling_factor: float = 1.0) -> torch.Tensor:
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor | None,
+    hidden_states: torch.Tensor,
+    input_scale: torch.Tensor,
+    gemm1_weights: torch.Tensor,
+    gemm2_weights: torch.Tensor,
+    output1_scales_scalar: torch.Tensor,
+    output1_scales_gate_scalar: torch.Tensor,
+    output2_scales_scalar: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
+    intermediate_size: int,
+    local_expert_offset: int,
+    local_num_experts: int,
+    use_routing_scales_on_input: bool,
+    routing_method_type: int,
+    routed_scaling_factor: float = 1.0,
+) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
 
@@ -180,5 +188,5 @@ direct_register_custom_op(
     op_func=flashinfer_fused_moe_per_tensor_scale_fp8,
     mutates_args=["hidden_states"],
     fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(torch.Tag.needs_fixed_stride_order,),
 )
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 660bae31460264311fbd8df07cd6d6bd822e1e76..7fd8511e297de3835a15ecd396d2be220da0b6bb 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -1,21 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused batched MoE kernel."""
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.fused_moe import (
-    try_get_optimal_moe_config)
+from vllm.model_executor.layers.fused_moe.fused_moe import try_get_optimal_moe_config
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate, TopKWeightAndReduceNaiveBatched)
+    TopKWeightAndReduceDelegate,
+    TopKWeightAndReduceNaiveBatched,
+)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache, moe_kernel_quantize_input, normalize_batched_scales_shape,
-    normalize_scales_shape)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    group_broadcast)
+    _resize_cache,
+    moe_kernel_quantize_input,
+    normalize_batched_scales_shape,
+    normalize_scales_shape,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
 from vllm.triton_utils import tl, triton
 
 
@@ -56,12 +58,12 @@ def moe_mmk(
     use_w8a16: tl.constexpr,
     per_act_token_quant: tl.constexpr,
 ):
-
     offs_k = tl.arange(0, BLOCK_K)
 
     if use_w8a16:
-        b_scale_ptrs = b_scale_ptr + expert_id * stride_bse + offs_n[
-            None, :] * stride_bsn
+        b_scale_ptrs = (
+            b_scale_ptr + expert_id * stride_bse + offs_n[None, :] * stride_bsn
+        )
         b_scale = tl.load(b_scale_ptrs)
 
     if use_w8a8:
@@ -94,9 +96,11 @@ def moe_mmk(
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         # Load the next block of A and B, generate a mask by checking the
         # K dimension.
-        a = tl.load(a_ptrs,
-                    mask=mask_m[:, None] & (offs_k[None, :] < K - k * BLOCK_K),
-                    other=0.0)
+        a = tl.load(
+            a_ptrs,
+            mask=mask_m[:, None] & (offs_k[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
         b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_K, other=0.0)
         # We accumulate along the K dimension.
         if use_w8a16:
@@ -105,13 +109,12 @@ def moe_mmk(
             if group_k > 0 and group_n > 0:
                 k_start = k * BLOCK_K
                 offs_ks = k_start // group_k
-                a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
-                                  mask=mask_m,
-                                  other=0.0)
+                a_scale = tl.load(
+                    a_scale_ptrs + offs_ks * stride_ask, mask=mask_m, other=0.0
+                )
                 b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
 
-                accumulator += tl.dot(a, b) * a_scale[:,
-                                                      None] * b_scale[None, :]
+                accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
             else:
                 # acc used to enable fp8_fast_accum
                 accumulator = tl.dot(a, b, acc=accumulator)
@@ -137,9 +140,9 @@ def moe_mmk(
 
 @triton.jit
 def expert_triton_kernel(
-    a_ptr,  #[max_tokens, K]
-    b_ptr,  #[K, N]
-    c_ptr,  #[max_tokens, N]
+    a_ptr,  # [max_tokens, K]
+    b_ptr,  # [K, N]
+    c_ptr,  # [max_tokens, N]
     expert_id,
     compute_type: tl.constexpr,
     # Dimensions
@@ -177,7 +180,6 @@ def expert_triton_kernel(
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
 ):
-
     offs_m = tl.arange(0, BLOCK_M)
     offs_n = tl.arange(0, BLOCK_N) % N
     offs_k = tl.arange(0, BLOCK_K)
@@ -221,7 +223,8 @@ def expert_triton_kernel(
         compute_type,
         use_fp8_w8a8,
         use_int8_w8a16,
-        per_act_token_quant)
+        per_act_token_quant,
+    )
 
     # store in C
     offs_cn = tl.arange(0, BLOCK_N)
@@ -284,7 +287,7 @@ def batched_triton_kernel(
 
     # axis 1 is M_blocks * N_blocks
     pid_mn = tl.program_id(axis=1)
-    #num_pid_m = tl.cdiv(max_num_tokens, BLOCK_M)
+    # num_pid_m = tl.cdiv(max_num_tokens, BLOCK_M)
     num_pid_n = tl.cdiv(N, BLOCK_N)
     pid_m = pid_mn // num_pid_n
     pid_n = pid_mn % num_pid_n
@@ -300,8 +303,12 @@ def batched_triton_kernel(
 
     a_ptr = a_ptr + expert_id * stride_ae + cta_m_start * stride_am
     b_ptr = b_ptr + expert_id * stride_be + cta_n_start * stride_bn
-    c_ptr = (c_ptr + expert_id * stride_ce + cta_m_start * stride_cm +
-             cta_n_start * stride_cn)
+    c_ptr = (
+        c_ptr
+        + expert_id * stride_ce
+        + cta_m_start * stride_cm
+        + cta_n_start * stride_cn
+    )
 
     offs_bn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N).to(tl.int64)) % N
 
@@ -350,50 +357,54 @@ def batched_triton_kernel(
         # Kernel config
         BLOCK_M,
         BLOCK_N,
-        BLOCK_K)
+        BLOCK_K,
+    )
 
 
 def invoke_moe_batched_triton_kernel(
-        A: torch.Tensor,  # [E, max_tokens, K]
-        B: torch.Tensor,  # [E, K, N]
-        C: torch.Tensor,  # [E, max_tokens, N]
-        expert_num_tokens: torch.Tensor,  # [E]
-        compute_type: tl.dtype,
-        # Quantization data
-        A_scale: Optional[torch.Tensor],
-        B_scale: Optional[torch.Tensor],
-        B_zp: torch.Tensor,
-        # Quantization schemes
-        use_fp8_w8a8: bool,
-        use_int8_w8a16: bool,
-        use_int4_w4a16: bool,
-        config: dict[str, int],
-        per_act_token_quant: bool,
-        block_shape: Optional[list[int]] = None):
-
+    A: torch.Tensor,  # [E, max_tokens, K]
+    B: torch.Tensor,  # [E, N, K]
+    C: torch.Tensor,  # [E, max_tokens, N]
+    expert_num_tokens: torch.Tensor,  # [E]
+    compute_type: tl.dtype,
+    # Quantization data
+    A_scale: torch.Tensor | None,
+    B_scale: torch.Tensor | None,
+    B_zp: torch.Tensor,
+    # Quantization schemes
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
+    config: dict[str, int],
+    per_act_token_quant: bool,
+    block_shape: list[int] | None = None,
+):
     assert not use_int4_w4a16
     max_num_tokens = A.size(1)
     K = A.size(2)
     N = C.size(2)
 
-    BLOCK_M = config['BLOCK_SIZE_M']
-    BLOCK_N = config['BLOCK_SIZE_N']
-    BLOCK_K = config['BLOCK_SIZE_K']
+    BLOCK_M = config["BLOCK_SIZE_M"]
+    BLOCK_N = config["BLOCK_SIZE_N"]
+    BLOCK_K = config["BLOCK_SIZE_K"]
 
-    grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) *
-            triton.cdiv(B.size(1), BLOCK_N))
+    grid = (
+        expert_num_tokens.size(0),
+        triton.cdiv(max_num_tokens, BLOCK_M) * triton.cdiv(B.size(1), BLOCK_N),
+    )
 
-    A_scale = normalize_batched_scales_shape(A_scale,
-                                             expert_num_tokens.shape[0])
+    A_scale = normalize_batched_scales_shape(A_scale, expert_num_tokens.shape[0])
 
     if B_scale is not None and B_scale.ndim == 1:
         assert B_scale.numel() == expert_num_tokens.shape[0]
         B_scale = B_scale.view(-1, 1, 1)
 
     assert A_scale is None or A_scale.ndim == 3, (
-        f"{0 if A_scale is None else A_scale.shape}")
+        f"{0 if A_scale is None else A_scale.shape}"
+    )
     assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, (
-        f"{0 if B_scale is None else B_scale.shape}")
+        f"{0 if B_scale is None else B_scale.shape}"
+    )
 
     if B_scale is not None:
         if B_scale.ndim == 1:
@@ -459,7 +470,8 @@ def invoke_moe_batched_triton_kernel(
         # Kernel config
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
-        BLOCK_K=BLOCK_K)
+        BLOCK_K=BLOCK_K,
+    )
 
 
 class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
@@ -486,22 +498,25 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return self.max_num_tokens
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return None
 
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
+    def output_is_reduced(self) -> bool:
+        return False
+
     def prepare(
         self,
         a1: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -512,16 +527,15 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
-            assert topk == 1, \
+            assert topk == 1, (
                 "apply_router_weight_on_input is only implemented for topk=1"
+            )
             a1.mul_(topk_weights.to(a1.dtype))
 
         num_tokens, hidden_dim = a1.size()
         topk = topk_ids.size(1)
 
-        tokens_per_expert = torch.zeros(num_experts,
-                                        dtype=torch.int,
-                                        device=a1.device)
+        tokens_per_expert = torch.zeros(num_experts, dtype=torch.int, device=a1.device)
 
         num_local_experts = self.num_local_experts
 
@@ -533,15 +547,15 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         b_a1 = torch.zeros(
             (num_local_experts, self.max_num_tokens, hidden_dim),
             dtype=b_type,
-            device=a1.device)
+            device=a1.device,
+        )
 
         if quant_config.is_quantized:
             scale_shape = quant_config.batched_scale_shape(
-                num_local_experts, self.max_num_tokens, hidden_dim)
+                num_local_experts, self.max_num_tokens, hidden_dim
+            )
 
-            b_a1_scale = torch.empty(scale_shape,
-                                     dtype=torch.float32,
-                                     device=a1.device)
+            b_a1_scale = torch.empty(scale_shape, dtype=torch.float32, device=a1.device)
         else:
             assert quant_config.a1_scale is None
             b_a1_scale = None
@@ -558,11 +572,11 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 continue
             idx = expert_id - first_expert
             tokens_per_expert[idx] = rows
-            rhs = a1[:topks.numel()][topks]
+            rhs = a1[: topks.numel()][topks]
             if quant_config.quant_dtype is not None:
                 if a1_scale is not None:
                     if quant_config.is_per_act_token:
-                        rhs_a1_scale = a1_scale[:topks.numel()][topks]
+                        rhs_a1_scale = a1_scale[: topks.numel()][topks]
                     else:
                         rhs_a1_scale = a1_scale
                 else:
@@ -578,14 +592,15 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 if quant_config.is_per_act_token:
                     b_a1_scale[idx, :rows] = b_s[:rows]
                 else:
-                    b_a1_scale[idx, :b_s.shape[0]] = b_s
+                    b_a1_scale[idx, : b_s.shape[0]] = b_s
             else:
                 b_a1[idx, :rows, :] = rhs
 
         assert b_a1_scale is None or b_a1_scale.ndim == 3
 
         expert_tokens_meta = mk.ExpertTokensMetadata(
-            expert_num_tokens=tokens_per_expert, expert_num_tokens_cpu=None)
+            expert_num_tokens=tokens_per_expert, expert_num_tokens_cpu=None
+        )
 
         return b_a1, b_a1_scale, expert_tokens_meta, None, None
 
@@ -626,16 +641,18 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         assert not self.quant_config.use_int8_w8a8, "NYI"
         assert not self.quant_config.use_int8_w8a16, "NYI"
         assert not self.quant_config.use_int4_w4a16, "NYI"
-        assert not self.quant_config.use_mxfp4_w4a4, "NYI"
+        assert self.quant_config.ocp_mx_scheme is None, "NYI"
         self.max_num_tokens = max_num_tokens
         self.num_dispatchers = num_dispatchers
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.BatchedExperts,
-                mk.FusedMoEActivationFormat.BatchedExperts)
+        return (
+            mk.FusedMoEActivationFormat.BatchedExperts,
+            mk.FusedMoEActivationFormat.BatchedExperts,
+        )
 
     def supports_chunking(self) -> bool:
         return False
@@ -649,29 +666,25 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
-        assert a.dim() == 2
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         num_experts = local_num_experts
         workspace13 = (num_experts, self.max_num_tokens * num_dp, K)
         workspace2 = (self.max_num_tokens * num_dp, N)
         output = workspace13
-        return (workspace13, workspace2, output, a.dtype)
+        return (workspace13, workspace2, output)
 
     def dequant(self, t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
         assert self.quant_config.is_quantized
         f32 = torch.float32
-        if (self.quant_config.is_per_act_token
-                or self.quant_config.is_per_tensor):
+        if self.quant_config.is_per_act_token or self.quant_config.is_per_tensor:
             return t.to(f32) * scale
         else:
             return t.to(f32) * group_broadcast(scale, t.shape)
@@ -686,12 +699,12 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert hidden_states.dim() == 3
@@ -699,15 +712,16 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         expert_num_tokens = expert_tokens_meta.expert_num_tokens
 
         num_local_experts = w1.size(0)
-        assert num_local_experts == w1.size(0), (
-            f"{num_local_experts} == {w1.size(0)}")
+        assert num_local_experts == w1.size(0), f"{num_local_experts} == {w1.size(0)}"
 
         N = w1.size(1) // 2
 
         for expert in range(num_local_experts):
             # Indexing expert_num_tokens doesn't work w/cudagraphs or inductor
-            if (torch.compiler.is_compiling()
-                    or torch.cuda.is_current_stream_capturing()):
+            if (
+                torch.compiler.is_compiling()
+                or torch.cuda.is_current_stream_capturing()
+            ):
                 num = hidden_states.shape[1]
             else:
                 num = int(expert_num_tokens[expert].item())
@@ -719,13 +733,11 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
             if self.quant_config.is_quantized:
                 assert a1q_scale is not None and self.w1_scale is not None
-                input = self.dequant(hidden_states[expert, :, :],
-                                     a1q_scale[expert])
+                input = self.dequant(hidden_states[expert, :, :], a1q_scale[expert])
                 w1_dq = self.dequant(w1[expert], self.w1_scale[expert])
                 input = input[:num] @ w1_dq.transpose(0, 1)
             else:
-                input = hidden_states[expert, :num, :] @ w1[expert].transpose(
-                    0, 1)
+                input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1)
 
             self.activation(activation, tmp, input.to(tmp.dtype))
 
@@ -740,26 +752,25 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
 def batched_moe_kernel_quantize_input(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     num_tokens: int,
     E: int,
     N: int,
     expert_num_tokens: torch.Tensor,
-    qtype: Optional[torch.dtype],
+    qtype: torch.dtype | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-    if (torch.compiler.is_compiling()
-            or torch.cuda.is_current_stream_capturing()):
+    block_shape: list[int] | None = None,
+) -> tuple[torch.Tensor, torch.Tensor | None]:
+    if torch.compiler.is_compiling() or torch.cuda.is_current_stream_capturing():
         # Note: this does a bunch of extra work because expert_num_tokens is
         # ignored but it does support torch.compile + cudagraphs.
         hidden_dim = A.size(-1)
         assert A_scale is None or A_scale.ndim <= 2, (
-            f"{A_scale.shape if A_scale is not None else None}")
-        A_q, A_q_scale = moe_kernel_quantize_input(A.view(-1,
-                                                          hidden_dim), A_scale,
-                                                   qtype, per_act_token_quant,
-                                                   block_shape)
+            f"{A_scale.shape if A_scale is not None else None}"
+        )
+        A_q, A_q_scale = moe_kernel_quantize_input(
+            A.view(-1, hidden_dim), A_scale, qtype, per_act_token_quant, block_shape
+        )
         A_q = A_q.view(E, -1, hidden_dim)
         A_q_scale = normalize_batched_scales_shape(A_q_scale, E)
 
@@ -779,9 +790,7 @@ def batched_moe_kernel_quantize_input(
         else:
             scale_shape = (E, 1, 1)
 
-        A_q_scale = torch.zeros(scale_shape,
-                                dtype=torch.float32,
-                                device=A.device)
+        A_q_scale = torch.zeros(scale_shape, dtype=torch.float32, device=A.device)
 
         num_experts = expert_num_tokens.numel()
 
@@ -791,7 +800,7 @@ def batched_moe_kernel_quantize_input(
             num_tokens = int(expert_num_tokens[e].item())
             if num_tokens > 0:
                 if A_scale is not None:
-                    scales = A_scale[e, :min(num_tokens, A_scale.shape[1])]
+                    scales = A_scale[e, : min(num_tokens, A_scale.shape[1])]
                 else:
                     scales = None
                 A_q[e, :num_tokens], tmp_scale = moe_kernel_quantize_input(
@@ -802,7 +811,7 @@ def batched_moe_kernel_quantize_input(
                     block_shape,
                 )
                 assert tmp_scale is not None
-                A_q_scale[e, :tmp_scale.shape[0]] = tmp_scale
+                A_q_scale[e, : tmp_scale.shape[0]] = tmp_scale
 
         return A_q, A_q_scale
 
@@ -824,7 +833,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         assert not self.quant_config.use_int8_w8a8, "NYI"
         assert not self.quant_config.use_int8_w8a16, "NYI"
         assert not self.quant_config.use_int4_w4a16, "NYI"
-        assert not self.quant_config.use_mxfp4_w4a4, "NYI"
+        assert self.quant_config.ocp_mx_scheme is None, "NYI"
         assert max_num_tokens > 0
         assert num_dispatchers > 0
         self.max_num_tokens = max_num_tokens
@@ -832,10 +841,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.BatchedExperts,
-                mk.FusedMoEActivationFormat.BatchedExperts)
+        return (
+            mk.FusedMoEActivationFormat.BatchedExperts,
+            mk.FusedMoEActivationFormat.BatchedExperts,
+        )
 
     def supports_chunking(self) -> bool:
         return False
@@ -849,24 +860,21 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
-        assert a.dim() == 2
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         num_experts = local_num_experts
         max_num_tokens = self.max_num_tokens
         workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
         workspace2 = (num_experts, max_num_tokens * num_dp, (N // 2))
         output = (num_experts, max_num_tokens * num_dp, K)
-        return (workspace13, workspace2, output, a.dtype)
+        return (workspace13, workspace2, output)
 
     def apply(
         self,
@@ -878,36 +886,38 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         # Check constraints.
         if self.quant_config.use_int4_w4a16:
-            assert hidden_states.size(-1) // 2 == w1.size(2), (
-                "Hidden size mismatch")
+            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
         else:
             assert hidden_states.size(-1) == w1.size(2), (
-                f"Hidden size mismatch {hidden_states.size(-1)} "
-                f"!= {w1.size(2)}")
+                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
+            )
 
-        assert hidden_states.is_contiguous(
-        ), "Hidden_states must be contiguous"
+        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
         assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
         assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
         assert hidden_states.dtype in [
-            torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
+            torch.float8_e4m3fn,
         ]
         assert expert_tokens_meta is not None
 
         expert_num_tokens = expert_tokens_meta.expert_num_tokens
 
-        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
-            hidden_states, w1, w2, topk_ids)
+        E, max_num_tokens, N, K, top_k_num = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
 
         assert w1.size(0) == E
         assert w2.size(0) == E
@@ -932,15 +942,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         elif hidden_states.dtype == torch.float8_e4m3fn:
             compute_type = tl.bfloat16
         else:
-            raise ValueError(
-                f"Unsupported compute_type: {hidden_states.dtype}")
+            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
 
         # We can reuse the memory between these because by the time we need
         # cache3, we're done with cache1
-        intermediate_cache1 = _resize_cache(workspace13,
-                                            (E, max_num_tokens, N))
-        intermediate_cache2 = _resize_cache(workspace2,
-                                            (E, max_num_tokens, N // 2))
+        intermediate_cache1 = _resize_cache(workspace13, (E, max_num_tokens, N))
+        intermediate_cache2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2))
 
         # TODO(bnell): should this be done for any quantized type?
         if self.quant_config.use_fp8_w8a8:
@@ -963,18 +970,29 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             use_int4_w4a16=self.quant_config.use_int4_w4a16,
             config=config,
             per_act_token_quant=self.per_act_token_quant,
-            block_shape=self.block_shape)
+            block_shape=self.block_shape,
+        )
 
         intermediate_cache2.fill_(0)
 
         # TODO (bnell): use triton utility from batched deep gemm.
-        self.activation(activation, intermediate_cache2.view(-1, N // 2),
-                        intermediate_cache1.view(-1, N))
+        self.activation(
+            activation,
+            intermediate_cache2.view(-1, N // 2),
+            intermediate_cache1.view(-1, N),
+        )
 
         qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
-            intermediate_cache2, a2_scale, max_num_tokens, E, N,
-            expert_num_tokens, self.quant_dtype, self.per_act_token_quant,
-            self.block_shape)
+            intermediate_cache2,
+            a2_scale,
+            max_num_tokens,
+            E,
+            N,
+            expert_num_tokens,
+            self.quant_dtype,
+            self.per_act_token_quant,
+            self.block_shape,
+        )
 
         invoke_moe_batched_triton_kernel(
             A=qintermediate_cache2,
@@ -990,4 +1008,5 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             use_int4_w4a16=self.quant_config.use_int4_w4a16,
             config=config,
             per_act_token_quant=self.per_act_token_quant,
-            block_shape=self.block_shape)
+            block_shape=self.block_shape,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index eb12a9b0a233fad425e5d429449dc089bcdf7bbb..0b0f59f673182926d21bf2c6f7ee19a4c2edb286 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,134 +1,116 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE utilities for GPTQ."""
-from typing import Optional
+
+from collections.abc import Callable
 
 import torch
 
 import vllm._custom_ops as ops
-from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    batched_moe_align_block_size,
+    moe_align_block_size,
+)
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceDelegate,
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.fused_moe.utils import _resize_cache, disable_inplace
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    marlin_make_workspace_new, maybe_warn_marlin_atomic_add)
+    marlin_make_workspace_new,
+    marlin_moe_intermediate_size,
+    maybe_warn_marlin_atomic_add,
+)
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import direct_register_custom_op
-
-
-def fused_marlin_moe(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w2: torch.Tensor,
-                     bias1: Optional[torch.Tensor],
-                     bias2: Optional[torch.Tensor],
-                     w1_scale: torch.Tensor,
-                     w2_scale: torch.Tensor,
-                     gating_output: torch.Tensor,
-                     topk_weights: torch.Tensor,
-                     topk_ids: torch.Tensor,
-                     quant_type_id: int,
-                     apply_router_weight_on_input: bool = False,
-                     global_num_experts: int = -1,
-                     activation: Optional[str] = "silu",
-                     expert_map: Optional[torch.Tensor] = None,
-                     global_scale1: Optional[torch.Tensor] = None,
-                     global_scale2: Optional[torch.Tensor] = None,
-                     g_idx1: Optional[torch.Tensor] = None,
-                     g_idx2: Optional[torch.Tensor] = None,
-                     sort_indices1: Optional[torch.Tensor] = None,
-                     sort_indices2: Optional[torch.Tensor] = None,
-                     w1_zeros: Optional[torch.Tensor] = None,
-                     w2_zeros: Optional[torch.Tensor] = None,
-                     workspace: Optional[torch.Tensor] = None,
-                     is_k_full: bool = True,
-                     inplace: bool = False) -> torch.Tensor:
-    """
-    This function computes a Mixture of Experts (MoE) layer using two sets of
-    weights, w1 and w2, and top-k gating mechanism.
 
-    Parameters:
-    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - w1 (torch.Tensor): The first set of expert weights.
-    - w2 (torch.Tensor): The second set of expert weights.
-    - w1_scale (torch.Tensor): Scale to be used for w1.
-    - w2_scale (torch.Tensor): Scale to be used for w2.
-    - gating_output (torch.Tensor): The output of the gating operation
-        (before softmax).
-    - g_idx1 (Optional[torch.Tensor]): The first set of act_order indices.
-    - g_idx2 (Optional[torch.Tensor]): The second set of act_order indices.
-    - sort_indices1 (Optional[torch.Tensor]): The first act_order input
-        permutation.
-    - sort_indices2 (Optional[torch.Tensor]): The second act_order input
-        permutation.
-    - topk_weights (torch.Tensor): Top-k weights.
-    - topk_ids (torch.Tensor): Indices of topk-k elements.
-    - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
-    - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
-    - num_bits (bool): The number of bits in expert weights quantization.
 
-    Returns:
-    - torch.Tensor: The output tensor after applying the MoE layer.
-    """
-    quant_type = ScalarType.from_id(quant_type_id)
-    assert quant_type in [
-        scalar_types.uint4, scalar_types.uint8b128, scalar_types.uint4b8,
-        scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f
-    ]
+def default_activation_func(
+    activation: str, output: torch.Tensor, input: torch.Tensor
+) -> None:
+    if activation == "silu":
+        torch.ops._C.silu_and_mul(output, input)
+    elif activation == "swigluoai":
+        # alpha = 1.702, limit = 7.0
+        torch.ops._C.swigluoai_and_mul(output, input)
+    else:
+        raise ValueError(
+            f"Unsupported activation: {activation}. "
+            "Only silu and swigluoai activations are supported."
+        )
 
-    bit4_scalar_types = [
-        scalar_types.uint4, scalar_types.uint4b8, scalar_types.float4_e2m1f
-    ]
-    num_bits = 4 if quant_type in bit4_scalar_types else 8
 
-    # Check constraints.
-    assert hidden_states.shape[0] == gating_output.shape[
-        0], "Number of tokens mismatch"
-    assert hidden_states.shape[
-        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
-    assert hidden_states.shape[1] == w2.shape[2] // (
-        num_bits // 2), "Hidden size mismatch w2"
-    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
-    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
-    assert num_bits in [4, 8]
-    assert topk_weights.dtype == torch.float32
+def _fused_marlin_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    bias1: torch.Tensor | None,
+    bias2: torch.Tensor | None,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    num_topk: int,
+    quant_type: ScalarType,
+    apply_router_weight_on_input: bool,
+    expert_map: torch.Tensor | None,
+    block_size_m: int,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    activation: str = "silu",
+    activation_func: Callable[
+        [str, torch.Tensor, torch.Tensor], None
+    ] = default_activation_func,
+    global_scale1: torch.Tensor | None = None,
+    global_scale2: torch.Tensor | None = None,
+    g_idx1: torch.Tensor | None = None,
+    g_idx2: torch.Tensor | None = None,
+    sort_indices1: torch.Tensor | None = None,
+    sort_indices2: torch.Tensor | None = None,
+    w1_zeros: torch.Tensor | None = None,
+    w2_zeros: torch.Tensor | None = None,
+    workspace: torch.Tensor | None = None,
+    intermediate_cache13: torch.Tensor | None = None,
+    intermediate_cache2: torch.Tensor | None = None,
+    output: torch.Tensor | None = None,
+    is_k_full: bool = True,
+) -> torch.Tensor:
+    assert hidden_states.ndim == 2
+    M, K = hidden_states.size()
+    N = marlin_moe_intermediate_size(w1, w2)
 
-    M, K = hidden_states.shape
-    E = w1.shape[0]
-    N = w2.shape[1] * 16
-    topk = topk_ids.shape[1]
+    if workspace is None:
+        workspace = marlin_make_workspace_new(hidden_states.device, 4)
 
-    # M block size selection logic
-    # TODO: tune this further for specific models
-    for block_size_m in [8, 16, 32, 48, 64]:
-        if M * topk / E / block_size_m < 0.9:
-            break
+    if intermediate_cache13 is None:
+        intermediate_cache13 = torch.empty(
+            (M * num_topk * max(2 * N, K),),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
 
-    if global_num_experts == -1:
-        global_num_experts = E
-    sorted_token_ids, expert_ids, num_tokens_post_padded = \
-        moe_align_block_size(topk_ids, block_size_m, global_num_experts,
-                             expert_map)
+    if intermediate_cache2 is None:
+        intermediate_cache2 = torch.empty(
+            (M * num_topk, N),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
 
-    if workspace is None:
-        workspace = marlin_make_workspace_new(hidden_states.device, 4)
+    intermediate_cache1 = _resize_cache(intermediate_cache13, (M * num_topk, 2 * N))
 
-    intermediate_cache2 = torch.empty(
-        (M * topk_ids.shape[1], N),
-        device=hidden_states.device,
-        dtype=hidden_states.dtype,
-    )
-    intermediate_cache13 = torch.empty(
-        (M * topk_ids.shape[1] * max(2 * N, K), ),
-        device=hidden_states.device,
-        dtype=hidden_states.dtype,
-    )
-    intermediate_cache1 = intermediate_cache13[:M * topk_ids.shape[1] * 2 * N]
-    intermediate_cache1 = intermediate_cache1.view(-1, 2 * N)
-    intermediate_cache3 = intermediate_cache13[:M * topk_ids.shape[1] * K]
-    intermediate_cache3 = intermediate_cache3.view(-1, K)
+    intermediate_cache3 = _resize_cache(intermediate_cache13, (M * num_topk, K))
+
+    intermediate_cache2 = _resize_cache(intermediate_cache2, (M * num_topk, N))
 
     maybe_warn_marlin_atomic_add(hidden_states.device, hidden_states.dtype)
-    use_atomic_add = hidden_states.dtype == torch.half or \
-        torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
+    use_atomic_add = (
+        hidden_states.dtype == torch.half
+        or torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
+    )
 
     intermediate_cache1 = ops.moe_wna16_marlin_gemm(
         hidden_states,
@@ -146,7 +128,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
         num_tokens_post_padded,
         topk_weights,
         moe_block_size=block_size_m,
-        top_k=topk,
+        top_k=num_topk,
         mul_topk_weights=apply_router_weight_on_input,
         is_ep=expert_map is not None,
         b_q_type=quant_type,
@@ -156,25 +138,22 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
         is_k_full=is_k_full,
         use_atomic_add=use_atomic_add,
         use_fp32_reduce=True,
-        is_zp_float=False)
+        is_zp_float=False,
+    )
 
-    if activation == "silu":
-        torch.ops._C.silu_and_mul(intermediate_cache2,
-                                  intermediate_cache1.view(-1, 2 * N))
-    elif activation == "swigluoai":
-        # alpha = 1.702, limit = 7.0
-        torch.ops._C.swigluoai_and_mul(intermediate_cache2,
-                                       intermediate_cache1.view(-1, 2 * N))
-    else:
-        raise ValueError(f"Unsupported activation: {activation}. "
-                         "Only silu and swigluoai activations are supported.")
+    activation_func(
+        activation, intermediate_cache2, intermediate_cache1.view(-1, 2 * N)
+    )
+
+    if output is None:
+        output = intermediate_cache3
 
     if expert_map is not None:
-        intermediate_cache3.zero_()
+        output.zero_()
 
-    intermediate_cache3 = ops.moe_wna16_marlin_gemm(
+    output = ops.moe_wna16_marlin_gemm(
         intermediate_cache2,
-        intermediate_cache3,
+        output,
         w2,
         bias2,
         w2_scale,
@@ -192,48 +171,622 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
         mul_topk_weights=not apply_router_weight_on_input,
         is_ep=expert_map is not None,
         b_q_type=quant_type,
-        size_m=M * topk,
+        size_m=M * num_topk,
         size_n=K,
         size_k=N,
         is_k_full=is_k_full,
         use_atomic_add=use_atomic_add,
         use_fp32_reduce=True,
-        is_zp_float=False).view(-1, topk, K)
-
-    output = hidden_states if inplace else torch.empty_like(hidden_states)
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                     dim=1,
-                     out=output)
-
-
-def fused_marlin_moe_fake(hidden_states: torch.Tensor,
-                          w1: torch.Tensor,
-                          w2: torch.Tensor,
-                          w1_scale: torch.Tensor,
-                          w2_scale: torch.Tensor,
-                          gating_output: torch.Tensor,
-                          topk_weights: torch.Tensor,
-                          topk_ids: torch.Tensor,
-                          quant_type_id: int,
-                          apply_router_weight_on_input: bool = False,
-                          global_num_experts: int = -1,
-                          global_scale1: Optional[torch.Tensor] = None,
-                          global_scale2: Optional[torch.Tensor] = None,
-                          expert_map: Optional[torch.Tensor] = None,
-                          g_idx1: Optional[torch.Tensor] = None,
-                          g_idx2: Optional[torch.Tensor] = None,
-                          sort_indices1: Optional[torch.Tensor] = None,
-                          sort_indices2: Optional[torch.Tensor] = None,
-                          w1_zeros: Optional[torch.Tensor] = None,
-                          w2_zeros: Optional[torch.Tensor] = None,
-                          workspace: Optional[torch.Tensor] = None,
-                          is_k_full: bool = True,
-                          inplace: bool = False) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-direct_register_custom_op(
-    op_name="fused_marlin_moe",
-    op_func=fused_marlin_moe,
-    fake_impl=fused_marlin_moe_fake,
-)
+        is_zp_float=False,
+    )
+
+    return output
+
+
+def fused_marlin_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    bias1: torch.Tensor | None,
+    bias2: torch.Tensor | None,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    gating_output: torch.Tensor | None,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    quant_type_id: int,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    activation: str = "silu",
+    activation_func: Callable[
+        [str, torch.Tensor, torch.Tensor], None
+    ] = default_activation_func,
+    moe_sum: Callable[[torch.Tensor, torch.Tensor], None] | None = None,
+    expert_map: torch.Tensor | None = None,
+    global_scale1: torch.Tensor | None = None,
+    global_scale2: torch.Tensor | None = None,
+    g_idx1: torch.Tensor | None = None,
+    g_idx2: torch.Tensor | None = None,
+    sort_indices1: torch.Tensor | None = None,
+    sort_indices2: torch.Tensor | None = None,
+    w1_zeros: torch.Tensor | None = None,
+    w2_zeros: torch.Tensor | None = None,
+    workspace: torch.Tensor | None = None,
+    intermediate_cache13: torch.Tensor | None = None,
+    intermediate_cache2: torch.Tensor | None = None,
+    is_k_full: bool = True,
+    output: torch.Tensor | None = None,
+    inplace: bool = False,
+) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    weights, w1 and w2, and top-k gating mechanism.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of expert weights.
+    - w2 (torch.Tensor): The second set of expert weights.
+    - w1_scale (torch.Tensor): Scale to be used for w1.
+    - w2_scale (torch.Tensor): Scale to be used for w2.
+    - gating_output (torch.Tensor|None): The output of the gating
+        operation (before softmax).
+    - g_idx1 (torch.Tensor|None): The first set of act_order indices.
+    - g_idx2 (torch.Tensor|None): The second set of act_order indices.
+    - sort_indices1 (torch.Tensor|None): The first act_order input
+        permutation.
+    - sort_indices2 (torch.Tensor|None): The second act_order input
+        permutation.
+    - topk_weights (torch.Tensor): Top-k weights.
+    - topk_ids (torch.Tensor): Indices of topk-k elements.
+    - w1_zeros (torch.Tensor|None): Optional zero points to be used for w1.
+    - w2_zeros (torch.Tensor|None): Optional zero points to be used for w2.
+    - num_bits (bool): The number of bits in expert weights quantization.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+
+    if inplace:
+        assert output is None, "Conflicting request"
+
+    quant_type = ScalarType.from_id(quant_type_id)
+    assert quant_type in [
+        scalar_types.uint4,
+        scalar_types.uint8b128,
+        scalar_types.uint4b8,
+        scalar_types.float8_e4m3fn,
+        scalar_types.float4_e2m1f,
+    ]
+
+    bit4_scalar_types = [
+        scalar_types.uint4,
+        scalar_types.uint4b8,
+        scalar_types.float4_e2m1f,
+    ]
+    num_bits = 4 if quant_type in bit4_scalar_types else 8
+
+    M, K = hidden_states.size()
+    E = w1.size(0)
+    topk = topk_ids.size(1)
+
+    # Check constraints.
+    if gating_output is not None:
+        assert gating_output.size(0) == M, "Number of tokens mismatch"
+    assert w1.size(1) * 16 == K, "Hidden size mismatch w1"
+    assert w2.size(2) // (num_bits // 2) == K, "Hidden size mismatch w2"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
+    assert num_bits in [4, 8]
+    assert topk_weights.dtype == torch.float32
+
+    # M block size selection logic
+    # TODO: tune this further for specific models
+    for block_size_m in [8, 16, 32, 48, 64]:
+        if M * topk / E / block_size_m < 0.9:
+            break
+
+    if global_num_experts == -1:
+        global_num_experts = E
+    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+        topk_ids, block_size_m, global_num_experts, expert_map
+    )
+
+    assert activation is not None
+    moe_output = _fused_marlin_moe(
+        hidden_states=hidden_states,
+        w1=w1,
+        w2=w2,
+        bias1=bias1,
+        bias2=bias2,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        topk_weights=topk_weights,
+        num_topk=topk,
+        quant_type=quant_type,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        expert_map=expert_map,
+        block_size_m=block_size_m,
+        sorted_token_ids=sorted_token_ids,
+        expert_ids=expert_ids,
+        num_tokens_post_padded=num_tokens_post_padded,
+        activation=activation,
+        activation_func=activation_func,
+        global_scale1=global_scale1,
+        global_scale2=global_scale2,
+        g_idx1=g_idx1,
+        g_idx2=g_idx2,
+        sort_indices1=sort_indices1,
+        sort_indices2=sort_indices2,
+        w1_zeros=w1_zeros,
+        w2_zeros=w2_zeros,
+        workspace=workspace,
+        intermediate_cache13=intermediate_cache13,
+        intermediate_cache2=intermediate_cache2,
+        output=None,
+        is_k_full=is_k_full,
+    ).view(-1, topk, K)
+
+    if output is None:
+        if inplace and not disable_inplace():
+            output = hidden_states
+        else:
+            output = torch.empty_like(hidden_states)
+
+    if moe_sum is None:
+        return torch.sum(moe_output.view(-1, topk, K), dim=1, out=output)
+    else:
+        return moe_sum(moe_output, output)
+
+
+def batched_fused_marlin_moe(
+    hidden_states: torch.Tensor,
+    expert_num_tokens: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    bias1: torch.Tensor | None,
+    bias2: torch.Tensor | None,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    gating_output: torch.Tensor | None,
+    quant_type_id: int,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    activation: str | None = "silu",
+    expert_map: torch.Tensor | None = None,
+    global_scale1: torch.Tensor | None = None,
+    global_scale2: torch.Tensor | None = None,
+    g_idx1: torch.Tensor | None = None,
+    g_idx2: torch.Tensor | None = None,
+    sort_indices1: torch.Tensor | None = None,
+    sort_indices2: torch.Tensor | None = None,
+    w1_zeros: torch.Tensor | None = None,
+    w2_zeros: torch.Tensor | None = None,
+    workspace: torch.Tensor | None = None,
+    intermediate_cache13: torch.Tensor | None = None,
+    intermediate_cache2: torch.Tensor | None = None,
+    is_k_full: bool = True,
+    output: torch.Tensor | None = None,
+    inplace: bool = False,
+) -> torch.Tensor:
+    """
+    This function massages the inputs so the batched hidden_states can be
+    presented as a 2D contiguous tensor that could be used with
+    _fused_marlin_moe.
+
+    Note that both batched_fused_marlin_moe and fused_marlin_moe ultimately
+    use `ops.moe_wna16_marlin_gemm` for the gemm operation and
+    `ops.moe_mna16_marlin_gemm` supports only 2D contiguous hidden_states.
+    Note that the moe_align_block_size function indicates,
+        - What rows of the A matrix (hidden_states) to access during the
+        matmul, via sorted_ids output.
+        - What expert_id to use for each block matmul, via expert_ids ouptut.
+
+    In the batched version, the tokens are already grouped/batched by experts
+    they subscribe to. Due to this, we can represent the batched hidden_states
+    tensor of shape [B, MAX_TOKENS_PER_BATCH, K] as a 2D tensor of shape,
+    [B * MAX_TOKENS_PER_BATCH, K]. We may treat this a 2D contiguous tensor
+    with topk=1 as each token (row in the tensor) subscribes to exactly one
+    expert_id (which is the batch_id). With the expert_num_tokens tensor, that
+    indicates how many tokens are actually valid in each batch, the
+    batched_moe_align_block_size function constructs the sorted_ids and
+    expert_ids tensors, so only relevant/valid rows of A (hidden_states)
+    are accessed and are processed with the correct expert_ids.
+    """
+
+    assert hidden_states.ndim == 3, (
+        f"hidden states must be batched. e.g. [B, MAX_TOKENS, K]."
+        f"But got {hidden_states.size()}"
+    )
+    if inplace:
+        assert output is None, "Conflicting request."
+
+    quant_type = ScalarType.from_id(quant_type_id)
+    assert quant_type in [
+        scalar_types.uint4,
+        scalar_types.uint8b128,
+        scalar_types.uint4b8,
+        scalar_types.float8_e4m3fn,
+        scalar_types.float4_e2m1f,
+    ]
+
+    bit4_scalar_types = [
+        scalar_types.uint4,
+        scalar_types.uint4b8,
+        scalar_types.float4_e2m1f,
+    ]
+    num_bits = 4 if quant_type in bit4_scalar_types else 8
+
+    B, BATCH_TOKENS_MAX, K = hidden_states.size()
+    M = hidden_states.view(-1, K).size(0)
+    E = w1.size(0)
+
+    # Check constraints.
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
+    assert expert_num_tokens.size(0) == E
+    assert B == E, (
+        "Batch must be as big as number of experts as the tokens"
+        "are sorted into the batch/expert they belong to"
+    )
+    assert w1.size(1) * 16 == K, "Hidden size mismatch w1"
+    assert w2.size(2) // (num_bits // 2) == K, "Hidden size mismatch w2"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert num_bits in [4, 8]
+
+    # Technically, the tokens are already separated by their expert ids.
+    # Hidden-States can just be squeezed to have just 2 dimensions,
+    # [B * MAX_TOKENS, K] and top_k can be interpreted as just 1.
+    topk = 1
+
+    # TODO(varun) : Choose a decent block size like in fused_marlin_moe
+    block_size_m = 64
+
+    sorted_token_ids, expert_ids, num_tokens_post_padded = batched_moe_align_block_size(
+        max_tokens_per_batch=BATCH_TOKENS_MAX,
+        block_size=block_size_m,
+        expert_num_tokens=expert_num_tokens,
+    )
+
+    if output is None and inplace:
+        output = hidden_states
+
+    # TODO (varun): This can be avoided by plumbing the marlin kernel to
+    # ignore topk_weights when topk_weights_ptr is a nullptr.
+    topk_weights = torch.ones(
+        (M, topk), device=hidden_states.device, dtype=torch.float32
+    )
+
+    assert activation is not None
+    output = _fused_marlin_moe(
+        hidden_states=hidden_states.view(-1, K),
+        w1=w1,
+        w2=w2,
+        bias1=bias1,
+        bias2=bias2,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        topk_weights=topk_weights,
+        num_topk=topk,
+        quant_type=quant_type,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        activation=activation,
+        expert_map=expert_map,
+        block_size_m=block_size_m,
+        sorted_token_ids=sorted_token_ids,
+        expert_ids=expert_ids,
+        num_tokens_post_padded=num_tokens_post_padded,
+        global_scale1=global_scale1,
+        global_scale2=global_scale2,
+        g_idx1=g_idx1,
+        g_idx2=g_idx2,
+        sort_indices1=sort_indices1,
+        sort_indices2=sort_indices2,
+        w1_zeros=w1_zeros,
+        w2_zeros=w2_zeros,
+        workspace=workspace,
+        intermediate_cache13=intermediate_cache13,
+        intermediate_cache2=intermediate_cache2,
+        output=output.view(-1, K) if output is not None else output,
+        is_k_full=is_k_full,
+    )
+
+    output = output.view(B, BATCH_TOKENS_MAX, K)
+
+    return output
+
+
+class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
+    def __init__(
+        self,
+        quant_config: FusedMoEQuantConfig,
+        w13_g_idx: torch.Tensor | None = None,
+        w2_g_idx: torch.Tensor | None = None,
+        w13_g_idx_sort_indices: torch.Tensor | None = None,
+        w2_g_idx_sort_indices: torch.Tensor | None = None,
+        is_k_full: bool = True,
+    ):
+        # TODO (varun) : Enable activation quantization
+        assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
+            "Supports only mxfp4_w4a16 or int4_w4a16"
+        )
+        self.w13_g_idx = w13_g_idx
+        self.w2_g_idx = w2_g_idx
+        self.w13_g_idx_sort_indices = w13_g_idx_sort_indices
+        self.w2_g_idx_sort_indices = w2_g_idx_sort_indices
+        self.is_k_full = is_k_full
+        super().__init__(quant_config)
+
+    @property
+    def quant_type_id(self) -> int:
+        # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4
+        return (
+            scalar_types.uint4b8.id
+            if self.quant_config.use_int4_w4a16
+            else scalar_types.float4_e2m1f.id
+        )
+
+    def moe_problem_size(
+        self,
+        a1: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+    ) -> tuple[int, int, int, int, int]:
+        assert w1.dim() == 3 and w2.dim() == 3
+
+        E = w1.size(0)
+        K = a1.size(-1)
+        N = marlin_moe_intermediate_size(w1, w2)
+
+        if a1.dim() == 2:
+            # Make sure we are using the correct a1 (pre-permute).
+            assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
+            M = a1.size(0)
+        else:
+            assert a1.dim() == 3
+            assert a1.size(0) == E, f"{a1.size(0)} == {E}"
+            M = a1.size(1)  # This is max_num_tokens
+
+        assert topk_ids.dim() == 2
+        topk = topk_ids.size(1)
+
+        return E, M, N, K, topk
+
+
+class MarlinExperts(MarlinExpertsBase):
+    def __init__(
+        self,
+        quant_config: FusedMoEQuantConfig,
+        w13_g_idx: torch.Tensor | None = None,
+        w2_g_idx: torch.Tensor | None = None,
+        w13_g_idx_sort_indices: torch.Tensor | None = None,
+        w2_g_idx_sort_indices: torch.Tensor | None = None,
+        is_k_full: bool = True,
+    ):
+        super().__init__(
+            quant_config,
+            w13_g_idx,
+            w2_g_idx,
+            w13_g_idx_sort_indices,
+            w2_g_idx_sort_indices,
+            is_k_full,
+        )
+
+    def supports_expert_map(self) -> bool:
+        return True
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        return TopKWeightAndReduceNoOP()
+
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
+
+    def supports_chunking(self) -> bool:
+        return True
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        # Modular Kernel provisions output buffer from workspace1. However in
+        # the fused_marlin_moe() function, the final torch.sum(), is defined
+        # essentially as,
+        # `torch.sum(workspace1, dim=1, out=output)`
+        # Having overlapping input and output tensors for torch.sum seems
+        # error prone and depends on how the torch.sum is implemented.
+        # For this reason we swap let the output buffer provision from
+        # workspace2.
+
+        # Workspace/IntermediateCache allocation matching fused_marlin_moe()
+        # workspace1 = (M * topk * max(2 * N, K),)
+        # workspace2 = (M * topk, N)
+
+        # Workspace/IntermediateCache allocation accounting for output buffer
+        # provisioning
+        workspace1 = (M * topk, max(N, K))
+        workspace2 = (M * topk * max(2 * N, K),)
+        output = (M, K)
+
+        return (workspace1, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        assert self.w1_scale is not None
+        assert self.w2_scale is not None
+        return fused_marlin_moe(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            bias1=self.w1_bias,
+            bias2=self.w2_bias,
+            w1_scale=self.w1_scale,
+            w2_scale=self.w2_scale,
+            gating_output=None,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            quant_type_id=self.quant_type_id,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            global_num_experts=global_num_experts,
+            activation=activation,
+            activation_func=self.activation,
+            moe_sum=self.moe_sum,
+            expert_map=expert_map,
+            output=output,
+            # Workspaces are swapped in workspace_shapes() to account for proper
+            # output buffer allocation. Please refer to workspace_shapes().
+            intermediate_cache13=workspace2,
+            intermediate_cache2=workspace13,
+            g_idx1=self.w13_g_idx,
+            g_idx2=self.w2_g_idx,
+            sort_indices1=self.w13_g_idx_sort_indices,
+            sort_indices2=self.w2_g_idx_sort_indices,
+            is_k_full=self.is_k_full,
+        )
+
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        ops.moe_sum(input, output)
+
+
+def modular_marlin_fused_moe(
+    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
+) -> mk.FusedMoEModularKernel:
+    return mk.FusedMoEModularKernel(
+        MoEPrepareAndFinalizeNoEP(),
+        MarlinExperts(quant_config),
+        shared_experts,
+    )
+
+
+class BatchedMarlinExperts(MarlinExpertsBase):
+    def __init__(
+        self,
+        max_num_tokens: int,
+        num_dispatchers: int,
+        quant_config: FusedMoEQuantConfig,
+        w13_g_idx: torch.Tensor | None = None,
+        w2_g_idx: torch.Tensor | None = None,
+        w13_g_idx_sort_indices: torch.Tensor | None = None,
+        w2_g_idx_sort_indices: torch.Tensor | None = None,
+        is_k_full: bool = True,
+    ):
+        super().__init__(
+            quant_config,
+            w13_g_idx,
+            w2_g_idx,
+            w13_g_idx_sort_indices,
+            w2_g_idx_sort_indices,
+            is_k_full,
+        )
+        self.max_num_tokens = max_num_tokens
+        self.num_dispatchers = num_dispatchers
+
+    def supports_expert_map(self) -> bool:
+        return True
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        return TopKWeightAndReduceDelegate()
+
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (
+            mk.FusedMoEActivationFormat.BatchedExperts,
+            mk.FusedMoEActivationFormat.BatchedExperts,
+        )
+
+    def supports_chunking(self) -> bool:
+        return False
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        num_dispatchers = self.num_dispatchers
+        num_experts = local_num_experts
+        max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
+        workspace13 = (num_experts * max_num_tokens * num_dispatchers, max(K, N * 2))
+        workspace2 = (num_experts * max_num_tokens * num_dispatchers, N)
+        output = (num_experts, max_num_tokens * num_dispatchers, K)
+        return (workspace13, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        assert expert_tokens_meta is not None, "Num valid tokens per batch is required"
+        return batched_fused_marlin_moe(
+            hidden_states=hidden_states,
+            expert_num_tokens=expert_tokens_meta.expert_num_tokens,
+            w1=w1,
+            w2=w2,
+            bias1=self.w1_bias,
+            bias2=self.w2_bias,
+            w1_scale=self.w1_scale,
+            w2_scale=self.w2_scale,
+            gating_output=None,
+            quant_type_id=self.quant_type_id,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            global_num_experts=global_num_experts,
+            activation=activation,
+            expert_map=expert_map,
+            output=output,
+            intermediate_cache13=workspace13,
+            intermediate_cache2=workspace2,
+            g_idx1=self.w13_g_idx,
+            g_idx2=self.w2_g_idx,
+            sort_indices1=self.w13_g_idx_sort_indices,
+            sort_indices2=self.w2_g_idx_sort_indices,
+            is_k_full=self.is_k_full,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 9e3e311c8a60bd06ba7a5ecba1ea23110f571e30..2ef013d1e6a42fe4fe9045f5eb56667002d3d5e6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE Triton kernels."""
+
 import functools
 import json
 import os
-# torch.compile needs typing.List. It will fail torch.library.infer_schema
-# otherwise
-from typing import List  # noqa: UP035
-from typing import Any, Callable, Optional, Union, List, Dict
-
+from collections.abc import Callable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -16,95 +14,120 @@ import torch.nn.functional as F
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
-# yapf: disable
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig, _get_config_dtype_str)
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEQuantConfig,
+    _get_config_dtype_str,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
     _valid_cutlass_block_scaled_grouped_gemm,
-    run_cutlass_block_scaled_fused_experts)
-# yapf: enable
+    run_cutlass_block_scaled_fused_experts,
+)
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-    _valid_deep_gemm, deep_gemm_moe_fp8)
+    _valid_deep_gemm,
+    deep_gemm_moe_fp8,
+)
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size)
+    moe_align_block_size,
+)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP)
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP)
+    TopKWeightAndReduceNoOP,
+)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache, activation_without_mul, moe_kernel_quantize_input)
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    dequant_mxfp4)
+    _resize_cache,
+    activation_without_mul,
+    disable_inplace,
+    moe_kernel_quantize_input,
+)
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import dequant_mxfp4
+from vllm.model_executor.layers.quantization.utils.mxfp6_utils import dequant_mxfp6
+from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_Scheme
+from vllm.model_executor.utils import maybe_disable_graph_partition
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
-
-# from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
+from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
 
 logger = init_logger(__name__)
 
 
 @triton.jit
-def write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, offs_token,
-                          token_mask, BLOCK_SIZE_M, BLOCK_SIZE_N,
-                          compute_type):
+def write_zeros_to_output(
+    c_ptr,
+    stride_cm,
+    stride_cn,
+    pid_n,
+    N,
+    offs_token,
+    token_mask,
+    BLOCK_SIZE_M,
+    BLOCK_SIZE_N,
+    compute_type,
+):
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type)
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
-        None, :]
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
 @triton.jit
 def fused_moe_kernel_gptq_awq(
-        # Pointers to matrices
-        a_ptr,
-        b_ptr,
-        c_ptr,
-        b_scale_ptr,
-        b_zp_ptr,
-        topk_weights_ptr,
-        sorted_token_ids_ptr,
-        expert_ids_ptr,
-        num_tokens_post_padded_ptr,
-        # Matrix dimensions
-        N: tl.constexpr,
-        K: tl.constexpr,
-        EM,
-        num_valid_tokens,
-        # The stride variables represent how much to increase the ptr by when
-        # moving by 1 element in a particular dimension. E.g. `stride_am` is
-        # how much to increase `a_ptr` by to get the element one row down
-        # (A has M rows).
-        stride_am,
-        stride_ak,
-        stride_be,
-        stride_bk,
-        stride_bn,
-        stride_cm,
-        stride_cn,
-        stride_bse,
-        stride_bsk,
-        stride_bsn,
-        stride_bze,
-        stride_bzk,
-        stride_bzn,
-        block_k_diviable: tl.constexpr,
-        group_size: tl.constexpr,
-        # Meta-parameters
-        BLOCK_SIZE_M: tl.constexpr,
-        BLOCK_SIZE_N: tl.constexpr,
-        BLOCK_SIZE_K: tl.constexpr,
-        GROUP_SIZE_M: tl.constexpr,
-        MUL_ROUTED_WEIGHT: tl.constexpr,
-        top_k: tl.constexpr,
-        compute_type: tl.constexpr,
-        has_zp: tl.constexpr,
-        use_int4_w4a16: tl.constexpr,
-        use_int8_w8a16: tl.constexpr):
+    # Pointers to matrices
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    b_scale_ptr,
+    b_zp_ptr,
+    topk_weights_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    num_tokens_post_padded_ptr,
+    # Matrix dimensions
+    N: tl.constexpr,
+    K: tl.constexpr,
+    EM,
+    num_valid_tokens,
+    # The stride variables represent how much to increase the ptr by when
+    # moving by 1 element in a particular dimension. E.g. `stride_am` is
+    # how much to increase `a_ptr` by to get the element one row down
+    # (A has M rows).
+    stride_am,
+    stride_ak,
+    stride_be,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_bse,
+    stride_bsk,
+    stride_bsn,
+    stride_bze,
+    stride_bzk,
+    stride_bzn,
+    block_k_diviable: tl.constexpr,
+    group_size: tl.constexpr,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+    MUL_ROUTED_WEIGHT: tl.constexpr,
+    top_k: tl.constexpr,
+    compute_type: tl.constexpr,
+    has_zp: tl.constexpr,
+    use_int4_w4a16: tl.constexpr,
+    use_int8_w8a16: tl.constexpr,
+):
     """
     Implements the fused computation for a Mixture of Experts (MOE) using
     token and expert matrices.
@@ -153,8 +176,7 @@ def fused_moe_kernel_gptq_awq(
     num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
     if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
         return
-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(
-        tl.int64)
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
     offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
     token_mask = offs_token < num_valid_tokens
 
@@ -163,25 +185,41 @@ def fused_moe_kernel_gptq_awq(
         # -----------------------------------------------------------
         # Write back zeros to the output when the expert is not
         # in the current expert parallel rank.
-        write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N,
-                              offs_token, token_mask, BLOCK_SIZE_M,
-                              BLOCK_SIZE_N, compute_type)
+        write_zeros_to_output(
+            c_ptr,
+            stride_cm,
+            stride_cn,
+            pid_n,
+            N,
+            offs_token,
+            token_mask,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+            compute_type,
+        )
         return
 
-    offs_bn = (pid_n * BLOCK_SIZE_N +
-               tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
-                      offs_k[None, :] * stride_ak)
+    a_ptrs = a_ptr + (
+        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+    )
 
     if use_int4_w4a16:
-        b_ptrs = b_ptr + off_experts * stride_be + \
-            (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * \
-                stride_bn
+        b_ptrs = (
+            b_ptr
+            + off_experts * stride_be
+            + (offs_k[:, None] // 2) * stride_bk
+            + offs_bn[None, :] * stride_bn
+        )
         b_shifter = (offs_k[:, None] % 2) * 4
     elif use_int8_w8a16:
-        b_ptrs = b_ptr + off_experts * stride_be + \
-            offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+        b_ptrs = (
+            b_ptr
+            + off_experts * stride_be
+            + offs_k[:, None] * stride_bk
+            + offs_bn[None, :] * stride_bn
+        )
 
     if not has_zp and use_int4_w4a16:
         b_zp_num = 8
@@ -207,34 +245,43 @@ def fused_moe_kernel_gptq_awq(
             k_mask = None
             k_other = None
 
-        a = tl.load(a_ptrs,
-                    mask=token_mask[:, None] &
-                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
-                    other=0.0)
+        a = tl.load(
+            a_ptrs,
+            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        )
         b = tl.load(b_ptrs)
         if use_int4_w4a16:
             b = (b >> b_shifter) & 0xF
 
-        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + \
-            offs_bn[None, :] * stride_bsn + \
-            ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * \
-                stride_bsk
+        b_scale_ptrs = (
+            b_scale_ptr
+            + off_experts * stride_bse
+            + offs_bn[None, :] * stride_bsn
+            + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
+        )
         b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
         b_scale = b_scale.to(tl.float32)
 
         if has_zp and use_int4_w4a16:
             offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
-            b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \
-                (offs_bn[None, :] // 2) * stride_bzn + \
-                offs_k_true * stride_bzk
+            b_zp_ptrs = (
+                b_zp_ptr
+                + off_experts * stride_bze
+                + (offs_bn[None, :] // 2) * stride_bzn
+                + offs_k_true * stride_bzk
+            )
             b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
-            b_zp = ((b_zp >> b_zp_shifter) & 0xF)
+            b_zp = (b_zp >> b_zp_shifter) & 0xF
             b_zp = b_zp.to(tl.float32)
         elif has_zp and use_int8_w8a16:
             offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
-            b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \
-                offs_bn[None, :] * stride_bzn + \
-                offs_k_true * stride_bzk
+            b_zp_ptrs = (
+                b_zp_ptr
+                + off_experts * stride_bze
+                + offs_bn[None, :] * stride_bzn
+                + offs_k_true * stride_bzk
+            )
             b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
             b_zp = b_zp.to(tl.float32)
 
@@ -253,17 +300,14 @@ def fused_moe_kernel_gptq_awq(
             b_ptrs += BLOCK_SIZE_K * stride_bk
 
     if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token,
-                             mask=token_mask,
-                             other=0)
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
         accumulator = accumulator * moe_weight[:, None]
 
     accumulator = accumulator.to(compute_type)
     # -----------------------------------------------------------
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
-        None, :]
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
@@ -312,6 +356,7 @@ def fused_moe_kernel(
     BLOCK_SIZE_N: tl.constexpr,
     BLOCK_SIZE_K: tl.constexpr,
     GROUP_SIZE_M: tl.constexpr,
+    SPLIT_K: tl.constexpr,
     MUL_ROUTED_WEIGHT: tl.constexpr,
     top_k: tl.constexpr,
     compute_type: tl.constexpr,
@@ -383,8 +428,7 @@ def fused_moe_kernel(
     num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
     if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
         return
-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(
-        tl.int64)
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
     offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
     token_mask = offs_token < num_valid_tokens
 
@@ -393,22 +437,35 @@ def fused_moe_kernel(
         # -----------------------------------------------------------
         # Write back zeros to the output when the expert is not
         # in the current expert parallel rank.
-        write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N,
-                              offs_token, token_mask, BLOCK_SIZE_M,
-                              BLOCK_SIZE_N, compute_type)
+        write_zeros_to_output(
+            c_ptr,
+            stride_cm,
+            stride_cn,
+            pid_n,
+            N,
+            offs_token,
+            token_mask,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+            compute_type,
+        )
         return
 
-    offs_bn = (pid_n * BLOCK_SIZE_N +
-               tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
-                      offs_k[None, :] * stride_ak)
+    a_ptrs = a_ptr + (
+        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+    )
 
-    b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
-                                                offs_bn[None, :] * stride_bn)
+    b_ptrs = (
+        b_ptr
+        + off_experts * stride_be
+        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+    )
     if use_int8_w8a16:
-        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
-            None, :] * stride_bsn
+        b_scale_ptrs = (
+            b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
+        )
         b_scale = tl.load(b_scale_ptrs)
 
     if use_fp8_w8a8 or use_int8_w8a8:
@@ -416,17 +473,18 @@ def fused_moe_kernel(
         if group_k > 0 and group_n > 0:
             a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
             offs_bsn = offs_bn // group_n
-            b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse +
-                            offs_bsn * stride_bsn)
+            b_scale_ptrs = (
+                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
+            )
         # channel-wise
         elif per_channel_quant:
-            b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
-                None, :] * stride_bsn
+            b_scale_ptrs = (
+                b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
+            )
             b_scale = tl.load(b_scale_ptrs)
             # Load per-token scale for activations
             a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
-            a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:,
-                                                                        None]
+            a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, None]
         # tensor-wise
         else:
             a_scale = tl.load(a_scale_ptr)
@@ -444,13 +502,12 @@ def fused_moe_kernel(
     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
         # Load the next block of A and B, generate a mask by checking the
         # K dimension.
-        a = tl.load(a_ptrs,
-                    mask=token_mask[:, None] &
-                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
+        a = tl.load(
+            a_ptrs,
+            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        )
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
         # We accumulate along the K dimension.
         if use_int8_w8a16:
             accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
@@ -458,13 +515,12 @@ def fused_moe_kernel(
             if group_k > 0 and group_n > 0:
                 k_start = k * BLOCK_SIZE_K
                 offs_ks = k_start // group_k
-                a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
-                                  mask=token_mask,
-                                  other=0.0)
+                a_scale = tl.load(
+                    a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
+                )
                 b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
 
-                accumulator += tl.dot(a, b) * a_scale[:,
-                                                      None] * b_scale[None, :]
+                accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
             else:
                 if use_fp8_w8a8:
                     # acc used to enable fp8_fast_accum
@@ -479,9 +535,7 @@ def fused_moe_kernel(
     if HAS_BIAS:
         accumulator = accumulator + bias[None, :]
     if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token,
-                             mask=token_mask,
-                             other=0)
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
         accumulator = accumulator * moe_weight[:, None]
     if use_int8_w8a16:
         accumulator = (accumulator * b_scale).to(compute_type)
@@ -496,44 +550,47 @@ def fused_moe_kernel(
     # -----------------------------------------------------------
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
-        None, :]
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
-def invoke_fused_moe_kernel(A: torch.Tensor,
-                            B: torch.Tensor,
-                            C: torch.Tensor,
-                            A_scale: Optional[torch.Tensor],
-                            B_scale: Optional[torch.Tensor],
-                            B_zp: Optional[torch.Tensor],
-                            topk_weights: Optional[torch.Tensor],
-                            sorted_token_ids: torch.Tensor,
-                            expert_ids: torch.Tensor,
-                            num_tokens_post_padded: torch.Tensor,
-                            mul_routed_weight: bool,
-                            top_k: int,
-                            config: dict[str, Any],
-                            compute_type: tl.dtype,
-                            use_fp8_w8a8: bool,
-                            use_int8_w8a8: bool,
-                            use_int8_w8a16: bool,
-                            use_int4_w4a16: bool,
-                            per_channel_quant: bool,
-                            block_shape: Optional[list[int]] = None,
-                            B_bias: Optional[torch.Tensor] = None,
-                            use_nn_moe: Optional[bool]=False) -> None:
+def invoke_fused_moe_kernel(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    A_scale: torch.Tensor | None,
+    B_scale: torch.Tensor | None,
+    B_zp: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    mul_routed_weight: bool,
+    top_k: int,
+    config: dict[str, Any],
+    compute_type: tl.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a8: bool,
+    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
+    per_channel_quant: bool,
+    block_shape: list[int] | None = None,
+    B_bias: torch.Tensor | None = None,
+    use_nn_moe: bool | None = False,
+) -> None:
     assert topk_weights is not None or not mul_routed_weight
     assert topk_weights is None or topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
 
     if use_fp8_w8a8 or use_int8_w8a8:
         assert B_scale is not None
-        assert (block_shape is None
-                or triton.cdiv(B.size(-2), block_shape[0]) == B_scale.size(-2))
-        assert (block_shape is None
-                or triton.cdiv(B.size(-1), block_shape[1]) == B_scale.size(-1))
+        assert block_shape is None or triton.cdiv(
+            B.size(-2), block_shape[0]
+        ) == B_scale.size(-2)
+        assert block_shape is None or triton.cdiv(
+            B.size(-1), block_shape[1]
+        ) == B_scale.size(-1)
 
     elif use_int8_w8a16 or use_int4_w4a16:
         assert B_scale is not None
@@ -551,13 +608,17 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
         # We assume that top_ids of each token is unique,
         # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
         # and we can skip some invalid blocks.
-        EM = min(sorted_token_ids.size(0),
-                 A.size(0) * top_k * config['BLOCK_SIZE_M'])
-    grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv(
-        B.size(1) if not use_nn_moe else B.size(2), META['BLOCK_SIZE_N']), )
+        EM = min(sorted_token_ids.size(0), A.size(0) * top_k * config["BLOCK_SIZE_M"])
+    grid = lambda META: (
+        triton.cdiv(EM, META["BLOCK_SIZE_M"])
+        * triton.cdiv(B.size(1) if not use_nn_moe else B.size(2), META["BLOCK_SIZE_N"]),
+    )
     HAS_BIAS = B_bias is not None
-    if (use_int8_w8a16 or use_int4_w4a16) and \
-            block_shape is not None and block_shape[1] > 0:
+    if (
+        (use_int8_w8a16 or use_int4_w4a16)
+        and block_shape is not None
+        and block_shape[1] > 0
+    ):
         assert B_scale is not None and B_scale.ndim == 3
         assert B_zp is None or B_zp.ndim == 3
 
@@ -565,29 +626,42 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
             num_valid_tokens=num_tokens,
             group_size=block_shape[1],
             num_experts=B.size(0),
-            bit=4 if use_int4_w4a16 else 8)
+            bit=4 if use_int4_w4a16 else 8,
+        )
         config = config.copy()
         config.update(
-            get_moe_wna16_block_config(config=config,
-                                       use_moe_wna16_cuda=use_moe_wna16_cuda,
-                                       num_valid_tokens=num_tokens,
-                                       size_k=A.size(1),
-                                       size_n=B.size(1),
-                                       num_experts=B.size(1),
-                                       group_size=block_shape[1],
-                                       real_top_k=top_k,
-                                       block_size_m=config["BLOCK_SIZE_M"]))
+            get_moe_wna16_block_config(
+                config=config,
+                use_moe_wna16_cuda=use_moe_wna16_cuda,
+                num_valid_tokens=num_tokens,
+                size_k=A.size(1),
+                size_n=B.size(1),
+                num_experts=B.size(1),
+                group_size=block_shape[1],
+                real_top_k=top_k,
+                block_size_m=config["BLOCK_SIZE_M"],
+            )
+        )
 
         if use_moe_wna16_cuda:
             bit = 4 if use_int4_w4a16 else 8
-            ops.moe_wna16_gemm(A, C, B, B_scale, B_zp,
-                               topk_weights if mul_routed_weight else None,
-                               sorted_token_ids, expert_ids,
-                               num_tokens_post_padded, top_k,
-                               config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
-                               config["BLOCK_SIZE_K"], bit)
+            ops.moe_wna16_gemm(
+                A,
+                C,
+                B,
+                B_scale,
+                B_zp,
+                topk_weights if mul_routed_weight else None,
+                sorted_token_ids,
+                expert_ids,
+                num_tokens_post_padded,
+                top_k,
+                config["BLOCK_SIZE_M"],
+                config["BLOCK_SIZE_N"],
+                config["BLOCK_SIZE_K"],
+                bit,
+            )
             return
-
         fused_moe_kernel_gptq_awq[grid](
             A,
             B,
@@ -627,10 +701,10 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
         )
     else:
         config = config.copy()
+        config["SPLIT_K"] = 1
         BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
         if block_shape is not None:
-            BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0],
-                                                 block_shape[1]))
+            BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1]))
         fused_moe_kernel[grid](
             A,
             B,
@@ -653,16 +727,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
             B.stride(1) if not use_nn_moe else B.stride(2),
             C.stride(1),
             C.stride(2),
-            A_scale.stride(0)
-            if A_scale is not None and A_scale.ndim == 2 else 0,
-            A_scale.stride(1)
-            if A_scale is not None and A_scale.ndim == 2 else 0,
-            B_scale.stride(0)
-            if B_scale is not None and B_scale.ndim >= 2 else 0,
-            B_scale.stride(2)
-            if B_scale is not None and B_scale.ndim == 3 else 0,
-            B_scale.stride(1)
-            if B_scale is not None and B_scale.ndim >= 2 else 0,
+            A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
+            A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
+            B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
+            B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
+            B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
             B_bias.stride(0) if B_bias is not None else 0,
             B_bias.stride(1) if B_bias is not None else 0,
             0 if block_shape is None else block_shape[0],
@@ -699,28 +768,36 @@ def compute_identity_kernel(
     if batch_id >= num_tokens or dim_offset >= hidden_dim:
         return
 
-    h = tl.load(hidden_states_ptr + batch_id * hidden_dim + dim_offset +
-                tl.arange(0, BLOCK_SIZE),
-                mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim)
+    h = tl.load(
+        hidden_states_ptr
+        + batch_id * hidden_dim
+        + dim_offset
+        + tl.arange(0, BLOCK_SIZE),
+        mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim,
+    )
 
     result = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
     for i in range(top_k):
         scale = tl.load(expert_scales_ptr + batch_id * scales_stride + i)
         result += h * scale
 
-    tl.store(output_ptr + batch_id * hidden_dim + dim_offset +
-             tl.arange(0, BLOCK_SIZE),
-             result,
-             mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim)
+    tl.store(
+        output_ptr + batch_id * hidden_dim + dim_offset + tl.arange(0, BLOCK_SIZE),
+        result,
+        mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim,
+    )
 
 
-def zero_experts_compute_triton(expert_indices: torch.Tensor,
-                                expert_scales: torch.Tensor, num_experts: int,
-                                zero_expert_type: str,
-                                hidden_states: torch.Tensor) -> torch.Tensor:
+def zero_experts_compute_triton(
+    expert_indices: torch.Tensor,
+    expert_scales: torch.Tensor,
+    num_experts: int,
+    zero_expert_type: str,
+    hidden_states: torch.Tensor,
+) -> torch.Tensor:
     N = expert_indices.numel()
     top_k = expert_indices.size(-1)
-    grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']), )
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_SIZE"]),)
 
     if zero_expert_type == "identity":
         zero_expert_mask = expert_indices < num_experts
@@ -735,7 +812,7 @@ def zero_experts_compute_triton(expert_indices: torch.Tensor,
     hidden_dim = hidden_states.size(-1)
     num_tokens = hidden_states.size(0)
 
-    grid = lambda meta: (num_tokens * (hidden_dim // meta['BLOCK_SIZE']), )
+    grid = lambda meta: (num_tokens * (hidden_dim // meta["BLOCK_SIZE"]),)
     compute_identity_kernel[grid](
         top_k,
         hidden_states,
@@ -751,14 +828,17 @@ def zero_experts_compute_triton(expert_indices: torch.Tensor,
 
 
 # Adapted from: https://github.com/sgl-project/sglang/pull/2628
-def get_config_file_name(E: int,
-                         N: int,
-                         dtype: Optional[str],
-                         block_shape: Optional[List[int]] = None, use_nn_moe: Optional[bool] = False) -> str:
+def get_config_file_name(
+    E: int, N: int, dtype: str | None, block_shape: list[int] | None = None, use_nn_moe: bool | None = False,
+) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
+    # Set device_name to H200 if a device from the H200 family is detected
+    if "H200" in device_name.split("_"):
+        device_name = "NVIDIA_H200"
     dtype_selector = "" if not dtype else f",dtype={dtype}"
-    block_shape_selector = ("" if not block_shape or not all(block_shape) else
-                            f",block_shape={block_shape}").replace(" ", "")
+    block_shape_selector = (
+        "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
+    ).replace(" ", "")
     if not use_nn_moe:
         return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"  # noqa: E501
     else:
@@ -769,11 +849,11 @@ def get_config_file_name(E: int,
 def get_moe_configs(
     E: int,
     N: int,
-    dtype: Optional[str],
-    block_n: Optional[int] = None,
-    block_k: Optional[int] = None,
-    use_nn_moe: Optional[bool] = False,
-) -> Optional[Dict[int, Any]]:
+    dtype: str | None,
+    block_n: int | None = None,
+    block_k: int | None = None,
+    use_nn_moe: bool | None = False,
+) -> dict[int, Any] | None:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -783,6 +863,10 @@ def get_moe_configs(
     be picked and the associated configuration chosen to invoke the kernel.
     """
 
+    # Avoid optimizing for the batch invariant case. Use default config
+    if vllm_is_batch_invariant():
+        return None
+
     # First look up if an optimized configuration is available in the configs
     # directory
     block_shape = [block_n, block_k] if block_n and block_k else None
@@ -794,18 +878,21 @@ def get_moe_configs(
     user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER
     if user_defined_config_folder is not None:
         user_defined_config_file_path = os.path.join(
-            user_defined_config_folder, json_file_name)
+            user_defined_config_folder, json_file_name
+        )
         config_file_paths.append(user_defined_config_file_path)
 
     default_config_file_path = os.path.join(
-        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
     config_file_paths.append(default_config_file_path)
 
     for config_file_path in config_file_paths:
         if os.path.exists(config_file_path):
             with open(config_file_path) as f:
-                logger.info("Using configuration from %s for MoE layer.",
-                            config_file_path)
+                logger.info(
+                    "Using configuration from %s for MoE layer.", config_file_path
+                )
                 # If a configuration has been found, return it
                 tuned_config = json.load(f)
                 # Delete triton_version from tuned_config
@@ -815,16 +902,26 @@ def get_moe_configs(
     # If no optimized configuration is available, we will use the default
     # configuration
     logger.warning(
-        ("Using default MoE config. Performance might be sub-optimal! "
-         "Config file not found at %s"), config_file_paths)
+        (
+            "Using default MoE config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_paths,
+    )
     return None
 
 
-def get_moe_wna16_block_config(config: dict[str,
-                                            int], use_moe_wna16_cuda: bool,
-                               num_valid_tokens: int, size_k: int, size_n: int,
-                               num_experts: int, group_size: int,
-                               real_top_k: int, block_size_m: int):
+def get_moe_wna16_block_config(
+    config: dict[str, int],
+    use_moe_wna16_cuda: bool,
+    num_valid_tokens: int,
+    size_k: int,
+    size_n: int,
+    num_experts: int,
+    group_size: int,
+    real_top_k: int,
+    block_size_m: int,
+):
     if "BLOCK_SIZE_N" in config and "BLOCK_SIZE_K" in config:
         # optimal block config is set
         return {}
@@ -846,20 +943,24 @@ def get_moe_wna16_block_config(config: dict[str,
 
         num_n_blocks = size_k // block_size_k
         num_k_blocks = size_n // block_size_k
-        num_m_blocks = (num_valid_tokens + block_size_m - 1) / block_size_m + \
-            num_experts
+        num_m_blocks = (
+            num_valid_tokens + block_size_m - 1
+        ) / block_size_m + num_experts
         if num_valid_tokens // real_top_k <= block_size_m:
             num_m_blocks = min(num_m_blocks, num_valid_tokens)
         num_blocks = num_m_blocks * num_n_blocks * num_k_blocks
 
-        if size_k % 256 == 0 and num_blocks >= 256 and \
-                block_size_k < 256:
+        if size_k % 256 == 0 and num_blocks >= 256 and block_size_k < 256:
             block_size_k = 256
             num_blocks = num_blocks // (256 // block_size_k)
 
-        if num_m_blocks <= 16 and size_k % (block_size_k * 2) == 0 and \
-                size_k % (block_size_k * 2) == 0 and block_size_k <= 512 and \
-                num_blocks >= 512:
+        if (
+            num_m_blocks <= 16
+            and size_k % (block_size_k * 2) == 0
+            and size_k % (block_size_k * 2) == 0
+            and block_size_k <= 512
+            and num_blocks >= 512
+        ):
             block_size_k = block_size_k * 2
             num_blocks = num_blocks // 2
 
@@ -878,10 +979,15 @@ def get_moe_wna16_block_config(config: dict[str,
         return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}
 
 
-def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
-                              num_experts: int, bit: int):
-    return current_platform.is_cuda() and bit == 4 and \
-        group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6
+def should_moe_wna16_use_cuda(
+    num_valid_tokens: int, group_size: int, num_experts: int, bit: int
+):
+    return (
+        current_platform.is_cuda()
+        and bit == 4
+        and group_size in [32, 64, 128]
+        and num_valid_tokens / num_experts <= 6
+    )
 
 
 def get_default_config(
@@ -890,10 +996,20 @@ def get_default_config(
     N: int,
     K: int,
     topk: int,
-    dtype: Optional[str],
-    block_shape: Optional[list[int]] = None,
-    use_nn_moe: Optional[bool]=False,
+    dtype: str | None,
+    block_shape: list[int] | None = None,
+    use_nn_moe: bool | None =False,
 ) -> dict[str, int]:
+    if vllm_is_batch_invariant():
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+        }
+        return config
+
     if dtype == "fp8_w8a8" and block_shape is not None:
         # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
         # BLOCK_SIZE_K must be divisible by block_shape[1]
@@ -904,6 +1020,7 @@ def get_default_config(
             "BLOCK_SIZE_N": block_shape[0],
             "BLOCK_SIZE_K": block_shape[1],
             "GROUP_SIZE_M": 32,
+            "SPLIT_K": 1,
             "num_warps": 4,
             "num_stages": 3 if not current_platform.is_rocm() else 2,
         }
@@ -912,22 +1029,22 @@ def get_default_config(
         # only set BLOCK_SIZE_M
         # BLOCK_SIZE_N and BLOCK_SIZE_K would be set later
         bit = 4 if dtype == "int4_w4a16" else 8
-        use_moe_wna16_cuda = should_moe_wna16_use_cuda(M * topk,
-                                                       block_shape[1], E, bit)
+        use_moe_wna16_cuda = should_moe_wna16_use_cuda(M * topk, block_shape[1], E, bit)
         if use_moe_wna16_cuda:
-            config = {"BLOCK_SIZE_M": min(16, M)}
+            config = {"BLOCK_SIZE_M": min(16, M), "SPLIT_K": 1}
         elif M <= 20:
-            config = {"BLOCK_SIZE_M": 16, "GROUP_SIZE_M": 1}
+            config = {"BLOCK_SIZE_M": 16, "GROUP_SIZE_M": 1, "SPLIT_K": 1}
         elif M <= 40:
-            config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1}
+            config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1, "SPLIT_K": 1}
         else:
-            config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1}
+            config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1, "SPLIT_K": 1}
     elif M <= E:
         config = {
             "BLOCK_SIZE_M": 16,
             "BLOCK_SIZE_N": 32,
             "BLOCK_SIZE_K": 64,
             "GROUP_SIZE_M": 1,
+            "SPLIT_K": 1,
         }
     else:
         config = {
@@ -935,6 +1052,7 @@ def get_default_config(
             "BLOCK_SIZE_N": 64,
             "BLOCK_SIZE_K": 32,
             "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
         }
 
     if use_nn_moe:
@@ -946,12 +1064,13 @@ def try_get_optimal_moe_config(
     w1_shape: tuple[int, ...],
     w2_shape: tuple[int, ...],
     top_k: int,
-    dtype: Optional[str],
+    dtype: str | None,
     M: int,
-    block_shape: Optional[list[int]] = None,
-    use_nn_moe: Optional[bool] = False,
+    block_shape: list[int] | None = None,
+    use_nn_moe: bool | None = False,
 ) -> dict[str, int]:
     from vllm.model_executor.layers.fused_moe import get_config
+
     override_config = get_config()
     if override_config:
         config = override_config
@@ -973,31 +1092,33 @@ def try_get_optimal_moe_config(
             config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
         else:
             # Else use the default config
-            config = get_default_config(M, E, N, w1_shape[2] if not use_nn_moe else w1_shape[1], top_k, dtype,
-                                        block_shape, use_nn_moe=use_nn_moe)
+            config = get_default_config(M, E, N, w1_shape[2] if not use_nn_moe else w1_shape[1], top_k, dtype, block_shape, use_nn_moe=use_nn_moe)
     return config
 
 
-def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
-                      token_expert_indices: torch.Tensor,
-                      gating_output: torch.Tensor,
-                      renormalize: bool) -> tuple[torch.Tensor, ...]:
+def vllm_topk_softmax(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool,
+) -> tuple[torch.Tensor, ...]:
     ops.topk_softmax(
         topk_weights,
         topk_indices,
         token_expert_indices,
         gating_output,
+        renormalize,
     )
-    if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
 
     return topk_weights, topk_indices
 
 
-def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]:
-    # if is_rocm_aiter_moe_enabled():
-    #     from .rocm_aiter_fused_moe import rocm_aiter_topk_softmax
-    #     return rocm_aiter_topk_softmax
+def dispatch_topk_func(
+    use_rocm_aiter: bool = False,
+) -> Callable[..., tuple[torch.Tensor, ...]]:
+    # if use_rocm_aiter:
+    #     return rocm_aiter_ops.topk_softmax
     return vllm_topk_softmax
 
 
@@ -1006,33 +1127,29 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
-    indices_type: Optional[torch.dtype] = None,
+    indices_type: torch.dtype | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    assert hidden_states.size(0) == gating_output.size(0), (
-        "Number of tokens mismatch")
+    assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch"
 
     M, _ = hidden_states.size()
 
-    topk_weights = torch.empty(M,
-                               topk,
-                               dtype=torch.float32,
-                               device=hidden_states.device)
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
     topk_ids = torch.empty(
         M,
         topk,
         dtype=torch.int32 if indices_type is None else indices_type,
-        device=hidden_states.device)
-    token_expert_indices = torch.empty(M,
-                                       topk,
-                                       dtype=torch.int32,
-                                       device=hidden_states.device)
-
-    gating_output_float = gating_output.float()  # TODO(woosuk): Optimize this.
+        device=hidden_states.device,
+    )
+    token_expert_indices = torch.empty(
+        M, topk, dtype=torch.int32, device=hidden_states.device
+    )
 
-    topk_func = dispatch_topk_func()
-    topk_weights, topk_ids = topk_func(topk_weights, topk_ids,
-                                       token_expert_indices,
-                                       gating_output_float, renormalize)
+    topk_func = dispatch_topk_func(use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled())
+    topk_weights, topk_ids = topk_func(
+        topk_weights, topk_ids, token_expert_indices, gating_output, renormalize
+    )
 
     return topk_weights, topk_ids, token_expert_indices
 
@@ -1047,9 +1164,12 @@ def fused_topk_bias(
     n_routed_experts = gating_output.shape[-1]
     scores = gating_output.softmax(dim=-1)
     scores_for_choice = scores.view(
-        -1, n_routed_experts) + e_score_correction_bias.unsqueeze(0)
-    topk_indices = torch.topk(scores_for_choice, k=topk, dim=-1,
-                              sorted=False)[1]
+        -1, n_routed_experts
+    ) + e_score_correction_bias.unsqueeze(0)
+
+    # For batch invariance, use sorted=True to ensure deterministic expert selection
+    use_sorted = vllm_is_batch_invariant()
+    topk_indices = torch.topk(scores_for_choice, k=topk, dim=-1, sorted=use_sorted)[1]
     topk_weights = scores.gather(1, topk_indices)
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
@@ -1057,7 +1177,11 @@ def fused_topk_bias(
 
 
 # This is used by the Deepseek-V2 and Deepseek-V3 model
-@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+@torch.compile(
+    dynamic=True,
+    backend=current_platform.simple_compile_backend,
+    options=maybe_disable_graph_partition(current_platform.simple_compile_backend),
+)
 def grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -1067,12 +1191,15 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    if envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK and \
-            current_platform.is_cuda() and \
-            num_expert_group <= 32 and topk <= 32 and \
-            e_score_correction_bias is not None:
+    if (
+        envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK
+        and current_platform.is_cuda()
+        and num_expert_group <= 32
+        and topk <= 32
+        and e_score_correction_bias is not None
+    ):
         return fused_grouped_topk(
             hidden_states=hidden_states,
             gating_output=gating_output,
@@ -1082,10 +1209,10 @@ def grouped_topk(
             num_expert_group=num_expert_group,
             topk_group=topk_group,
             scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor)
+            routed_scaling_factor=routed_scaling_factor,
+        )
 
-    assert hidden_states.size(0) == gating_output.size(0), (
-        "Number of tokens mismatch")
+    assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch"
 
     if scoring_func == "softmax":
         scores = torch.softmax(gating_output, dim=-1)
@@ -1100,30 +1227,36 @@ def grouped_topk(
         # scores for expert selection but original scores for routing weights
         original_scores = scores
         scores = scores + e_score_correction_bias.unsqueeze(0)
-        group_scores = (scores.view(num_token, num_expert_group,
-                                    -1).topk(2, dim=-1)[0].sum(dim=-1))
+        group_scores = (
+            scores.view(num_token, num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
+        )
     else:
-        group_scores = scores.view(num_token, num_expert_group,
-                                   -1).max(dim=-1).values  # [n, n_group]
-    group_idx = torch.topk(group_scores, k=topk_group, dim=-1,
-                           sorted=False)[1]  # [n, top_k_group]
+        group_scores = (
+            scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+        )  # [n, n_group]
+
+    # For batch invariance, use sorted=True to ensure deterministic expert selection
+    use_sorted = vllm_is_batch_invariant()
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=use_sorted)[
+        1
+    ]  # [n, top_k_group]
     group_mask = torch.zeros_like(group_scores)  # [n, n_group]
     group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
-    score_mask = group_mask.unsqueeze(-1).expand(
-        num_token, num_expert_group,
-        scores.size(-1) // num_expert_group).reshape(num_token, -1)  # [n, e]
-    tmp_scores = scores.masked_fill(~score_mask.bool(),
-                                    float("-inf"))  # [n, e]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.size(-1) // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
 
     if e_score_correction_bias is not None:
-        topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1]
+        topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=use_sorted)[1]
         # Use original unbiased scores for the routing weights
         topk_weights = original_scores.gather(1, topk_ids)
     else:
-        topk_weights, topk_ids = torch.topk(tmp_scores,
-                                            k=topk,
-                                            dim=-1,
-                                            sorted=False)
+        topk_weights, topk_ids = torch.topk(
+            tmp_scores, k=topk, dim=-1, sorted=use_sorted
+        )
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
@@ -1135,12 +1268,13 @@ def grouped_topk(
 
 @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def eplb_map_to_physical_and_record(
-        topk_ids: torch.Tensor,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-        indices_type: Optional[torch.dtype] = None) -> torch.Tensor:
-    '''
+    topk_ids: torch.Tensor,
+    expert_load_view: torch.Tensor,
+    logical_to_physical_map: torch.Tensor,
+    logical_replica_count: torch.Tensor,
+    indices_type: torch.dtype | None = None,
+) -> torch.Tensor:
+    """
     Map the logical expert ids to physical expert ids
     and record the expert load metrics.
 
@@ -1156,7 +1290,7 @@ def eplb_map_to_physical_and_record(
 
     Returns:
         The physical expert ids.
-    '''
+    """
 
     # 1. Convert the logical expert ids to physical expert ids
     # Directly select a random replica for each logical expert
@@ -1168,13 +1302,14 @@ def eplb_map_to_physical_and_record(
     # to deterministically choose a replica
     replica_count = logical_replica_count[topk_ids_long]
     # Flatten-position based index, reshaped back to `topk_ids` shape
-    pos_indices = torch.arange(topk_ids.numel(),
-                               device=topk_ids.device,
-                               dtype=torch.long).reshape_as(topk_ids)
+    pos_indices = torch.arange(
+        topk_ids.numel(), device=topk_ids.device, dtype=torch.long
+    ).reshape_as(topk_ids)
     # Compute pseudo-random indices by modulo
     replica_indices = (pos_indices % replica_count).unsqueeze(-1)
-    physical_ids = logical_to_physical_map[topk_ids_long].gather(
-        -1, replica_indices).squeeze(-1)
+    physical_ids = (
+        logical_to_physical_map[topk_ids_long].gather(-1, replica_indices).squeeze(-1)
+    )
 
     topk_ids = physical_ids
 
@@ -1199,7 +1334,8 @@ def eplb_map_to_physical_and_record(
     expert_load_view.scatter_add_(
         dim=0,
         index=topk_ids_flatten.long(),
-        src=torch.ones_like(topk_ids_flatten).to(expert_load_view))
+        src=torch.ones_like(topk_ids_flatten).to(expert_load_view),
+    )
 
     if indices_type is not None:
         topk_ids = topk_ids.to(dtype=indices_type)
@@ -1217,21 +1353,39 @@ def fused_grouped_topk(
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    assert hidden_states.size(0) == gating_output.size(0), (
-        "Number of tokens mismatch")
-
-    if scoring_func == "softmax":
+    assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch"
+
+    if scoring_func == "sigmoid":
+        # Fully fused kernel path for sigmoid
+        topk_values, topk_indices = ops.grouped_topk(
+            gating_output,  # raw logits
+            num_expert_group,
+            topk_group,
+            topk,
+            renormalize,
+            routed_scaling_factor,
+            e_score_correction_bias.to(gating_output.dtype),
+            1,  # scoring_func=1 for sigmoid
+        )
+    elif scoring_func == "softmax":
+        # Apply softmax in Python, then use fused kernel
+        # TODO: Add support for softmax in kernel
         scores = torch.softmax(gating_output, dim=-1)
-    elif scoring_func == "sigmoid":
-        scores = gating_output.sigmoid()
+        topk_values, topk_indices = ops.grouped_topk(
+            scores,  # pre-computed scores
+            num_expert_group,
+            topk_group,
+            topk,
+            renormalize,
+            routed_scaling_factor,
+            e_score_correction_bias.to(gating_output.dtype),
+            0,  # scoring_func=0 (no activation, scores already computed)
+        )
     else:
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
 
-    scores_with_bias = scores + e_score_correction_bias.unsqueeze(0)
-    topk_values, topk_indices = ops.grouped_topk(
-        scores, scores_with_bias.to(scores.dtype), num_expert_group,
-        topk_group, topk, renormalize, routed_scaling_factor)
-    return topk_values.to(torch.float32), topk_indices.to(torch.int32)
+    # Fused kernel outputs float32 values and int32 indices directly
+    return topk_values, topk_indices
 
 
 def inplace_fused_experts(
@@ -1246,27 +1400,48 @@ def inplace_fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    use_mxfp4_w4a4: bool = False,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,  #noqa: UP006
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
-    use_nn_moe: Optional[bool] = False,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    use_nn_moe: bool | None = False,
 ) -> None:
-    fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
-                       activation, apply_router_weight_on_input, use_fp8_w8a8,
-                       use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
-                       use_mxfp4_w4a4, per_channel_quant, global_num_experts,
-                       expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale,
-                       a2_scale, block_shape, w1_bias, w2_bias, use_nn_moe)
+    fused_experts_impl(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        True,
+        activation,
+        apply_router_weight_on_input,
+        use_fp8_w8a8,
+        use_int8_w8a8,
+        use_int8_w8a16,
+        use_int4_w4a16,
+        ocp_mx_scheme,
+        per_channel_quant,
+        global_num_experts,
+        expert_map,
+        w1_scale,
+        w2_scale,
+        w1_zp,
+        w2_zp,
+        a1_scale,
+        a2_scale,
+        block_shape,
+        w1_bias,
+        w2_bias,
+    )
 
 
 def inplace_fused_experts_fake(
@@ -1281,20 +1456,20 @@ def inplace_fused_experts_fake(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    use_mxfp4_w4a4: bool = False,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,  #noqa: UP006
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
-    use_nn_moe: Optional[bool] = False,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    use_nn_moe: bool | None = False,
 ) -> None:
     pass
 
@@ -1304,8 +1479,11 @@ direct_register_custom_op(
     op_func=inplace_fused_experts,
     mutates_args=["hidden_states"],
     fake_impl=inplace_fused_experts_fake,
-    tags=(() if is_torch_equal_or_newer("2.7.0") else
-          (torch.Tag.needs_fixed_stride_order, )),
+    tags=(
+        ()
+        if is_torch_equal_or_newer("2.7.0")
+        else (torch.Tag.needs_fixed_stride_order,)
+    ),
 )
 
 
@@ -1321,27 +1499,48 @@ def outplace_fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    use_mxfp4_w4a4: bool = False,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,  #noqa: UP006
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
-    use_nn_moe: Optional[bool] = False
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    use_nn_moe: bool | None = False,
 ) -> torch.Tensor:
     return fused_experts_impl(
-        hidden_states, w1, w2, topk_weights, topk_ids, False, activation,
-        apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8,
-        use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant,
-        global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp,
-        a1_scale, a2_scale, block_shape, w1_bias, w2_bias)
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        False,
+        activation,
+        apply_router_weight_on_input,
+        use_fp8_w8a8,
+        use_int8_w8a8,
+        use_int8_w8a16,
+        use_int4_w4a16,
+        ocp_mx_scheme,
+        per_channel_quant,
+        global_num_experts,
+        expert_map,
+        w1_scale,
+        w2_scale,
+        w1_zp,
+        w2_zp,
+        a1_scale,
+        a2_scale,
+        block_shape,
+        w1_bias,
+        w2_bias,
+    )
 
 
 def outplace_fused_experts_fake(
@@ -1355,19 +1554,19 @@ def outplace_fused_experts_fake(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    use_mxfp4_w4a4: bool = False,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
@@ -1376,14 +1575,17 @@ direct_register_custom_op(
     op_name="outplace_fused_experts",
     op_func=outplace_fused_experts,
     fake_impl=outplace_fused_experts_fake,
-    tags=(() if is_torch_equal_or_newer("2.7.0") else
-          (torch.Tag.needs_fixed_stride_order, )),
+    tags=(
+        ()
+        if is_torch_equal_or_newer("2.7.0")
+        else (torch.Tag.needs_fixed_stride_order,)
+    ),
 )
 
 
 def torch_vllm_inplace_fused_experts(**kwargs) -> torch.Tensor:
     torch.ops.vllm.inplace_fused_experts(**kwargs)
-    hidden_states = kwargs['hidden_states']
+    hidden_states = kwargs["hidden_states"]
     return hidden_states
 
 
@@ -1392,7 +1594,7 @@ def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor:
 
 
 def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]:
-    if inplace:
+    if inplace and not disable_inplace():
         return torch_vllm_inplace_fused_experts
     return torch_vllm_outplace_fused_experts
 
@@ -1408,12 +1610,11 @@ def fused_experts(
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    expert_map: torch.Tensor | None = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     allow_deep_gemm: bool = False,
     allow_cutlass_block_scaled_grouped_gemm: bool = False,
 ) -> torch.Tensor:
-
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
     use_fp8_w8a8 = quant_config.use_fp8_w8a8
@@ -1424,8 +1625,11 @@ def fused_experts(
     # E8M0 scale, which means we requantize the weight and input to the specific
     # scale. Fallen back to cutlass or triton for some cases would cause
     # accuracy issue.
-    if (allow_deep_gemm and quant_config.use_fp8_w8a8 and
-        (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))):
+    if (
+        allow_deep_gemm
+        and quant_config.use_fp8_w8a8
+        and (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))
+    ):
         assert quant_config is not None
         assert apply_router_weight_on_input is False
         return deep_gemm_moe_fp8(
@@ -1444,10 +1648,13 @@ def fused_experts(
             a2_scale=quant_config.a2_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
-    elif (allow_cutlass_block_scaled_grouped_gemm and use_fp8_w8a8
-          and _valid_cutlass_block_scaled_grouped_gemm(
-              w1, w2, inplace, activation, apply_router_weight_on_input,
-              expert_map)):
+    elif (
+        allow_cutlass_block_scaled_grouped_gemm
+        and use_fp8_w8a8
+        and _valid_cutlass_block_scaled_grouped_gemm(
+            w1, w2, inplace, activation, apply_router_weight_on_input, expert_map
+        )
+    ):
         assert quant_config is not None
         return run_cutlass_block_scaled_fused_experts(
             a=hidden_states,
@@ -1456,7 +1663,8 @@ def fused_experts(
             w1_scale=quant_config.w1_scale,
             w2_scale=quant_config.w2_scale,
             topk_weights=topk_weights,
-            topk_ids=topk_ids)
+            topk_ids=topk_ids,
+        )
     else:
         return dispatch_fused_experts_func(inplace)(
             hidden_states=hidden_states,
@@ -1470,7 +1678,7 @@ def fused_experts(
             use_int8_w8a8=quant_config.use_int8_w8a8,
             use_int8_w8a16=quant_config.use_int8_w8a16,
             use_int4_w4a16=quant_config.use_int4_w4a16,
-            use_mxfp4_w4a4=quant_config.use_mxfp4_w4a4,
+            ocp_mx_scheme=quant_config.ocp_mx_scheme,
             per_channel_quant=quant_config.per_act_token_quant,
             global_num_experts=global_num_experts,
             expert_map=expert_map,
@@ -1482,18 +1690,20 @@ def fused_experts(
             a2_scale=quant_config.a2_scale,
             block_shape=quant_config.block_shape,
             w1_bias=quant_config.w1_bias,
-            w2_bias=quant_config.w2_bias)
+            w2_bias=quant_config.w2_bias,
+        )
 
 
 SILU_NO_MUL: str = activation_without_mul("silu")
 GELU_NO_MUL: str = activation_without_mul("gelu")
+RELU2_NO_MUL: str = activation_without_mul("relu2")
 
 
 def _get_config_quant_dtype(
     use_fp8_w8a8: bool,
     use_int8_w8a8: bool,
-    use_mxfp4_w4a4: bool,
-) -> Union[None, torch.dtype, str]:
+    ocp_mx_scheme: str | None,
+) -> None | torch.dtype | str:
     """
     Get the quantization type based on the quantization strategy flags.
     We don't have a quant_config at this point so we need to work backwards.
@@ -1505,8 +1715,12 @@ def _get_config_quant_dtype(
         return torch.float8_e4m3fn
     elif use_int8_w8a8:
         return torch.int8
-    elif use_mxfp4_w4a4:
+    elif ocp_mx_scheme == "w_mxfp4_a_mxfp4":
         return "mxfp4"
+    elif ocp_mx_scheme in {"w_mxfp4_a_mxfp6_e3m2", "w_mxfp6_e3m2_a_mxfp6_e3m2"}:
+        return "mxfp6_e3m2"
+    elif ocp_mx_scheme in {"w_mxfp4_a_mxfp6_e2m3", "w_mxfp6_e2m3_a_mxfp6_e2m3"}:
+        return "mxfp6_e2m3"
     return None
 
 
@@ -1523,42 +1737,53 @@ def fused_experts_impl(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    use_mxfp4_w4a4: bool = False,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
-    use_nn_moe: Optional[bool] = False,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    use_nn_moe: bool | None= False,
 ) -> torch.Tensor:
     # Check constraints.
     if use_int4_w4a16:
-        assert hidden_states.size(1) // 2 == w1.size(2), (
-            "Hidden size mismatch")
+        assert hidden_states.size(1) // 2 == w1.size(2), "Hidden size mismatch"
     elif use_nn_moe:
         assert hidden_states.size(1) == w1.size(1), "Hidden size mismatch"
-
-    elif use_mxfp4_w4a4:
-        # 16bit activation and fp4x2 packed weight
-        assert hidden_states.size(1) // 2 == w1.size(2), "hidden size mismatch"
+    elif ocp_mx_scheme is not None:
+        if ocp_mx_scheme in {
+            "w_mxfp4_a_mxfp4",
+            "w_mxfp4_a_mxfp6_e3m2",
+            "w_mxfp4_a_mxfp6_e2m3",
+        }:
+            # 16bit activation and fp4x2 packed weight
+            assert hidden_states.size(1) == w1.size(2) * 2, "hidden size mismatch"
+        elif ocp_mx_scheme in {
+            "w_mxfp6_e3m2_a_mxfp6_e3m2",
+            "w_mxfp6_e2m3_a_mxfp6_e2m3",
+        }:
+            assert hidden_states.size(1) == (w1.size(2) * 4) // 3, (
+                "hidden size mismatch"
+            )
+        else:
+            raise NotImplementedError(f"Unsupported ocp_mx_scheme={ocp_mx_scheme}")
     else:
         assert hidden_states.size(1) == w1.size(2), (
-            f"Hidden size mismatch {hidden_states.size(1)} != {w1.size(2)}")
+            f"Hidden size mismatch {hidden_states.size(1)} != {w1.size(2)}"
+        )
 
     assert topk_weights.size() == topk_ids.size(), "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
     assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
+    assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]
 
     num_tokens = hidden_states.size(0)
     if use_nn_moe:
@@ -1574,17 +1799,21 @@ def fused_experts_impl(
     CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
     M = min(num_tokens, CHUNK_SIZE)
 
-    config_dtype = _get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8,
-                                         use_int8_w8a16=use_int8_w8a16,
-                                         use_int4_w4a16=use_int4_w4a16,
-                                         use_mxfp4_w4a4=use_mxfp4_w4a4,
-                                         dtype=hidden_states.dtype)
+    config_dtype = _get_config_dtype_str(
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int8_w8a16=use_int8_w8a16,
+        use_int4_w4a16=use_int4_w4a16,
+        ocp_mx_scheme=ocp_mx_scheme,
+        dtype=hidden_states.dtype,
+    )
 
     # Note: for use_int8_w8a16 or use_int4_w4a16, the activations are
     # quantized prior to calling fused_experts.
-    quant_dtype = _get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8,
-                                          use_int8_w8a8=use_int8_w8a8,
-                                          use_mxfp4_w4a4=use_mxfp4_w4a4)
+    quant_dtype = _get_config_quant_dtype(
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int8_w8a8=use_int8_w8a8,
+        ocp_mx_scheme=ocp_mx_scheme,
+    )
 
     get_config_func = functools.partial(
         try_get_optimal_moe_config,
@@ -1600,17 +1829,18 @@ def fused_experts_impl(
 
     # We can reuse the memory between these because by the time we need
     # cache3, we're done with cache1
-    cache13 = torch.empty(M * top_k_num * max(N, K if not use_nn_moe else w2.shape[2]),
-                          device=hidden_states.device,
-                          dtype=hidden_states.dtype)
-
-    intermediate_cache1 = cache13[:M * top_k_num * N].view(M, top_k_num, N)
-    intermediate_cache3 = cache13[:M * top_k_num * (K if not use_nn_moe else w2.shape[2])].view(M, top_k_num, K if not use_nn_moe else w2.shape[2])
+    cache13 = torch.empty(
+        M * top_k_num * max(N, K if not use_nn_moe else w2.shape[2]),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache1 = cache13[: M * top_k_num * N].view(M, top_k_num, N)
+    intermediate_cache3 = cache13[: M * top_k_num * (K if not use_nn_moe else w2.shape[2])].view(M, top_k_num, K if not use_nn_moe else w2.shape[2])
 
     # This needs separate memory since it's used concurrently with cache1
-    intermediate_cache2 = torch.empty((M * top_k_num, N // 2),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
+    intermediate_cache2 = torch.empty(
+        (M * top_k_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype
+    )
 
     if hidden_states.dtype == torch.bfloat16:
         compute_type = tl.bfloat16
@@ -1621,22 +1851,51 @@ def fused_experts_impl(
     else:
         raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
 
-    if inplace:
+    if inplace and not disable_inplace():
         out_hidden_states = hidden_states
     else:
         out_hidden_states = torch.empty_like(hidden_states)
 
-    if use_mxfp4_w4a4:
-        # Weight has to be dequantized for mxfp4 emulation.
-        w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype)
-        w1_scale = None
-        w2 = dequant_mxfp4(w2, w2_scale, hidden_states.dtype)
-        w2_scale = None
+    if ocp_mx_scheme is not None:
+        # TODO: On platforms for which `current_platform.supports_mx()` is True
+        # and for which we have a native OCP mx fused MOE kernel,
+        # this dequantization step should not be done.
+        if ocp_mx_scheme in {
+            OCP_MX_Scheme.w_mxfp4_a_mxfp4,
+            OCP_MX_Scheme.w_mxfp4_a_mxfp6_e3m2,
+            OCP_MX_Scheme.w_mxfp4_a_mxfp6_e2m3,
+        }:
+            # Weight has to be dequantized for mxfp4 emulation.
+            w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype)
+            w1_scale = None
+            w2 = dequant_mxfp4(w2, w2_scale, hidden_states.dtype)
+            w2_scale = None
+        elif ocp_mx_scheme == OCP_MX_Scheme.w_mxfp6_e3m2_a_mxfp6_e3m2:
+            w1 = dequant_mxfp6(
+                w1, w1_scale, quant_dtype="fp6_e3m2", float_dtype=hidden_states.dtype
+            )
+            w1_scale = None
+            w2 = dequant_mxfp6(
+                w2, w2_scale, quant_dtype="fp6_e3m2", float_dtype=hidden_states.dtype
+            )
+            w2_scale = None
+        elif ocp_mx_scheme == OCP_MX_Scheme.w_mxfp6_e2m3_a_mxfp6_e2m3:
+            w1 = dequant_mxfp6(
+                w1, w1_scale, quant_dtype="fp6_e2m3", float_dtype=hidden_states.dtype
+            )
+            w1_scale = None
+            w2 = dequant_mxfp6(
+                w2, w2_scale, quant_dtype="fp6_e2m3", float_dtype=hidden_states.dtype
+            )
+            w2_scale = None
+        else:
+            raise NotImplementedError(f"Unsupported ocp_mx_scheme={ocp_mx_scheme}")
 
     for chunk in range((num_tokens // CHUNK_SIZE) + 1):
-        begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE,
-                                          min((chunk + 1) * CHUNK_SIZE,
-                                              num_tokens))
+        begin_chunk_idx, end_chunk_idx = (
+            chunk * CHUNK_SIZE,
+            min((chunk + 1) * CHUNK_SIZE, num_tokens),
+        )
         curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
         tokens_in_chunk, _ = curr_hidden_states.size()
 
@@ -1649,8 +1908,9 @@ def fused_experts_impl(
             # so the cache size and config are already set correctly and
             # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
-            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk *
-                                                      topk_ids.size(1)]
+            intermediate_cache2 = intermediate_cache2[
+                : tokens_in_chunk * topk_ids.size(1)
+            ]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
             config = get_config_func(tokens_in_chunk)
 
@@ -1661,52 +1921,59 @@ def fused_experts_impl(
             A_scale=a1_scale,
             quant_dtype=quant_dtype,
             per_act_token_quant=per_channel_quant,
-            block_shape=block_shape)
-
-        sorted_token_ids, expert_ids, num_tokens_post_padded = (
-            moe_align_block_size(curr_topk_ids, config['BLOCK_SIZE_M'],
-                                 global_num_experts, expert_map))
-
-        invoke_fused_moe_kernel(qcurr_hidden_states,
-                                w1,
-                                intermediate_cache1,
-                                a1q_scale,
-                                w1_scale,
-                                w1_zp,
-                                curr_topk_weights,
-                                sorted_token_ids,
-                                expert_ids,
-                                num_tokens_post_padded,
-                                apply_router_weight_on_input,
-                                top_k_num,
-                                config,
-                                compute_type=compute_type,
-                                use_fp8_w8a8=use_fp8_w8a8,
-                                use_int8_w8a8=use_int8_w8a8,
-                                use_int8_w8a16=use_int8_w8a16,
-                                use_int4_w4a16=use_int4_w4a16,
-                                per_channel_quant=per_channel_quant,
-                                block_shape=block_shape,
-                                B_bias=w1_bias,
-                                use_nn_moe=use_nn_moe)
+            block_shape=block_shape,
+        )
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            curr_topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
+        )
+
+        invoke_fused_moe_kernel(
+            qcurr_hidden_states,
+            w1,
+            intermediate_cache1,
+            a1q_scale,
+            w1_scale,
+            w1_zp,
+            curr_topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            apply_router_weight_on_input,
+            top_k_num,
+            config,
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+            B_bias=w1_bias,
+            use_nn_moe=use_nn_moe,
+        )
 
         # Activation function with multiplication
         if activation == "silu":
-            torch.ops._C.silu_and_mul(intermediate_cache2,
-                                      intermediate_cache1.view(-1, N))
+            torch.ops._C.silu_and_mul(
+                intermediate_cache2, intermediate_cache1.view(-1, N)
+            )
         elif activation == "gelu":
-            torch.ops._C.gelu_and_mul(intermediate_cache2,
-                                      intermediate_cache1.view(-1, N))
+            torch.ops._C.gelu_and_mul(
+                intermediate_cache2, intermediate_cache1.view(-1, N)
+            )
         elif activation == "swigluoai":
             # alpha = 1.702, limit = 7.0
-            torch.ops._C.swigluoai_and_mul(intermediate_cache2,
-                                           intermediate_cache1.view(-1, N))
+            torch.ops._C.swigluoai_and_mul(
+                intermediate_cache2, intermediate_cache1.view(-1, N)
+            )
         # Activation function without multiplication
         elif activation == SILU_NO_MUL:
             intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N))
         elif activation == GELU_NO_MUL:
             intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N))
-
+        elif activation == RELU2_NO_MUL:
+            intermediate_cache2 = torch.square(F.relu(intermediate_cache1.view(-1, N)))
         else:
             raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
 
@@ -1715,39 +1982,43 @@ def fused_experts_impl(
             A_scale=a2_scale,
             quant_dtype=quant_dtype,
             per_act_token_quant=per_channel_quant,
-            block_shape=block_shape)
-
-        invoke_fused_moe_kernel(qintermediate_cache2,
-                                w2,
-                                intermediate_cache3,
-                                a2q_scale,
-                                w2_scale,
-                                w2_zp,
-                                curr_topk_weights,
-                                sorted_token_ids,
-                                expert_ids,
-                                num_tokens_post_padded,
-                                not apply_router_weight_on_input,
-                                1,
-                                config,
-                                compute_type=compute_type,
-                                use_fp8_w8a8=use_fp8_w8a8,
-                                use_int8_w8a8=use_int8_w8a8,
-                                use_int8_w8a16=use_int8_w8a16,
-                                use_int4_w4a16=use_int4_w4a16,
-                                per_channel_quant=per_channel_quant,
-                                block_shape=block_shape,
-                                B_bias=w2_bias,
-                                use_nn_moe=use_nn_moe)
-
-        ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
-                    out_hidden_states[begin_chunk_idx:end_chunk_idx])
+            block_shape=block_shape,
+        )
+
+        invoke_fused_moe_kernel(
+            qintermediate_cache2,
+            w2,
+            intermediate_cache3,
+            a2q_scale,
+            w2_scale,
+            w2_zp,
+            curr_topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            not apply_router_weight_on_input,
+            1,
+            config,
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+            B_bias=w2_bias,
+            use_nn_moe=use_nn_moe,
+        )
+
+        ops.moe_sum(
+            intermediate_cache3.view(*intermediate_cache3.size()),
+            out_hidden_states[begin_chunk_idx:end_chunk_idx],
+        )
 
     return out_hidden_states
 
 
 class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         quant_config: FusedMoEQuantConfig,
@@ -1756,10 +2027,12 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
 
     def supports_chunking(self) -> bool:
         return True
@@ -1772,20 +2045,18 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         workspace1 = (M, topk, max(N // 2, K))
         workspace2 = (M, topk, max(N, K))
         output = (M, K)
-        return (workspace1, workspace2, output, a.dtype)
+        return (workspace1, workspace2, output)
 
     def apply(
         self,
@@ -1797,34 +2068,36 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         # Check constraints.
         if self.quant_config.use_int4_w4a16:
-            assert hidden_states.size(-1) // 2 == w1.size(2), (
-                "Hidden size mismatch")
+            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
         else:
-            assert hidden_states.size(-1) == w1.size(2), \
-                (f"Hidden size mismatch {hidden_states.size(-1)} "
-                 f"!= {w1.size(2)}")
+            assert hidden_states.size(-1) == w1.size(2), (
+                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
+            )
 
-        assert hidden_states.is_contiguous(
-        ), "Hidden_states must be contiguous"
+        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
         assert hidden_states.dim() == 2
         assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
         assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
         assert hidden_states.dtype in [
-            torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
+            torch.float8_e4m3fn,
         ]
 
-        E, num_tokens, N, K, top_k_num = mk._moe_problem_size(
-            hidden_states, w1, w2, topk_ids)
+        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
 
         if global_num_experts == -1:
             global_num_experts = E
@@ -1847,20 +2120,18 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         elif hidden_states.dtype == torch.float8_e4m3fn:
             compute_type = tl.bfloat16
         else:
-            raise ValueError(
-                f"Unsupported compute_type: {hidden_states.dtype}")
+            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
 
         # Note that the output tensor might be in workspace1
-        intermediate_cache1 = _resize_cache(workspace2,
-                                            (num_tokens, top_k_num, N))
-        intermediate_cache2 = _resize_cache(workspace13,
-                                            (num_tokens * top_k_num, N // 2))
-        intermediate_cache3 = _resize_cache(workspace2,
-                                            (num_tokens, top_k_num, K))
+        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
+        intermediate_cache2 = _resize_cache(
+            workspace13, (num_tokens * top_k_num, N // 2)
+        )
+        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
 
-        sorted_token_ids, expert_ids, num_tokens_post_padded = (
-            moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'],
-                                 global_num_experts, expert_map))
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
+        )
 
         invoke_fused_moe_kernel(
             hidden_states,
@@ -1886,14 +2157,19 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             B_bias=self.w1_bias,
         )
 
-        self.activation(activation, intermediate_cache2,
-                        intermediate_cache1.view(-1, N))
+        self.activation(
+            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+        )
 
-        a2q_scale: Optional[torch.Tensor] = None
+        a2q_scale: torch.Tensor | None = None
 
         qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
-            intermediate_cache2, a2_scale, self.quant_dtype,
-            self.per_act_token_quant, self.block_shape)
+            intermediate_cache2,
+            a2_scale,
+            self.quant_dtype,
+            self.per_act_token_quant,
+            self.block_shape,
+        )
 
         invoke_fused_moe_kernel(
             qintermediate_cache2,
@@ -1919,12 +2195,18 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             B_bias=self.w2_bias,
         )
 
-        ops.moe_sum(intermediate_cache3, output)
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
+
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        ops.moe_sum(input, output)
 
 
 def modular_triton_fused_moe(
-        quant_config: FusedMoEQuantConfig) -> mk.FusedMoEModularKernel:
+    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
+) -> mk.FusedMoEModularKernel:
     return mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         TritonExperts(quant_config),
+        shared_experts,
     )
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..87f8c8d75a9b50e7f0d4a10ba34e6492359538b1
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from abc import abstractmethod
+from collections.abc import Callable
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize,
+)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizeMethodBase,
+)
+
+logger = init_logger(__name__)
+
+
+class FusedMoEMethodBase(QuantizeMethodBase):
+    def __init__(self, moe: FusedMoEConfig):
+        super().__init__()
+        self.moe: FusedMoEConfig = moe
+        self.moe_quant_config: FusedMoEQuantConfig | None = None
+
+    @abstractmethod
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        raise NotImplementedError
+
+    def uses_weight_scale_2_pattern(self) -> bool:
+        """
+        Returns True if this quantization method uses 'weight_scale_2' pattern
+        for per-tensor weight scales (e.g., FP4 variants), False otherwise.
+
+        This method should be overridden by subclasses that use the
+        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
+        """
+        return False
+
+    def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
+        from .all2all_utils import maybe_make_prepare_finalize
+
+        return maybe_make_prepare_finalize(self.moe, self.moe_quant_config)
+
+    def select_gemm_impl(
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> FusedMoEPermuteExpertsUnpermute:
+        # based on the all2all implementation, select the appropriate
+        # gemm implementation
+        raise NotImplementedError(
+            f"{self.__class__.__name__} must select appropriate gemm "
+            "implementation based on the prepare_finalize"
+        )
+
+    @abstractmethod
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        raise NotImplementedError
+
+    @property
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return None
+
+    @property
+    def supports_eplb(self) -> bool:
+        return False
+
+    @property
+    def allow_inplace(self) -> bool:
+        return False
+
+    @abstractmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..43974ba917e425ad3a721e48493aac990d78e1e1
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -0,0 +1,164 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
+    FusedMoEMethodBase,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel,
+    FusedMoEPrepareAndFinalize,
+)
+
+logger = init_logger(__name__)
+
+
+@CustomOp.register("modular_fused_moe")
+class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
+    def __init__(
+        self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel
+    ):
+        super().__init__(old_quant_method.moe)
+        self.moe_quant_config = old_quant_method.moe_quant_config
+        self.fused_experts = experts
+        self.disable_expert_map = getattr(
+            old_quant_method,
+            "disable_expert_map",
+            not self.fused_experts.supports_expert_map(),
+        )
+        self.old_quant_method = old_quant_method
+        logger.debug("Swapping out %s", self.old_quant_method.__class__.__name__)
+
+    @staticmethod
+    def make(
+        moe_layer: torch.nn.Module,
+        old_quant_method: FusedMoEMethodBase,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        shared_experts: torch.nn.Module | None,
+    ) -> "FusedMoEModularMethod":
+        return FusedMoEModularMethod(
+            old_quant_method,
+            FusedMoEModularKernel(
+                prepare_finalize,
+                old_quant_method.select_gemm_impl(prepare_finalize, moe_layer),
+                shared_experts,
+            ),
+        )
+
+    @property
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return self.fused_experts.prepare_finalize.topk_indices_dtype()
+
+    @property
+    def supports_eplb(self) -> bool:
+        return self.old_quant_method.supports_eplb
+
+    @property
+    def allow_inplace(self) -> bool:
+        return self.old_quant_method.allow_inplace
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        raise NotImplementedError
+
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return self.moe_quant_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        # Is getattr needed?
+        zero_expert_num = getattr(layer, "zero_expert_num", 0)
+        zero_expert_type = getattr(layer, "zero_expert_type", None)
+
+        if enable_eplb:
+            if self.supports_eplb:
+                assert expert_load_view is not None
+                assert logical_to_physical_map is not None
+                assert logical_replica_count is not None
+            else:
+                raise NotImplementedError(
+                    "EPLB is not supported for "
+                    f"{self.old_quant_method.__class__.__name__}."
+                )
+
+        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype,
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
+            global_num_experts=global_num_experts,
+            zero_expert_num=zero_expert_num,
+            zero_expert_type=zero_expert_type,
+        )
+
+        result = self.fused_experts(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=self.allow_inplace,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            expert_map=None if self.disable_expert_map else expert_map,
+        )
+
+        if zero_expert_num != 0 and zero_expert_type is not None:
+            assert not isinstance(result, tuple), (
+                "Shared + zero experts are mutually exclusive not yet supported"
+            )
+            return result, zero_expert_result
+        else:
+            return result
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 18de758519346ee9938a15c147dc2831bcf17af7..badedfc54c382a5ad8fe54e208a45feb46c0b47f 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -1,41 +1,43 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig)
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP)
+    TopKWeightAndReduceNoOP,
+)
 from vllm.triton_utils import tl, triton
-from vllm.utils import has_triton_kernels
+from vllm.utils.import_utils import has_triton_kernels
 
 logger = init_logger(__name__)
 
 if has_triton_kernels():
     try:
         import triton_kernels.swiglu
-        from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation,
-                                               matmul_ogs)
-        from triton_kernels.routing import (RoutingData, routing,
-                                            routing_from_bitmatrix)
+        from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
+        from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
         from triton_kernels.tensor import Bitmatrix
     except (AttributeError, ImportError) as e:
         logger.error(
             "Failed to import Triton kernels. Please make sure your triton "
-            "version is compatible. Error: %s", e)
+            "version is compatible. Error: %s",
+            e,
+        )
 
 
 @triton.jit
 def pack_bitmatrix(
     bitmatrix,
     topk_ids,
-    n_rows,  # n_rows in bitmatrix / topk_ids 
+    n_rows,  # n_rows in bitmatrix / topk_ids
     bm_cols: tl.constexpr,  # n int32_t bitpacks in bitmatrix
-    n_expts_act,  # num_topk 
+    n_expts_act,  # num_topk
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_K: tl.constexpr,
 ):
@@ -60,12 +62,12 @@ def pack_bitmatrix(
         offs = tl.arange(0, BLOCK_SIZE_K // 32) + i * (BLOCK_SIZE_K // 32)
         # All topks that need to go into this column has the correct bit set.
         # Other bits are 0. x is a 2D tensor.
-        x = tl.where(div[:, :, None] == offs[None, None, :],
-                     (one << rem)[:, :, None], 0)
+        x = tl.where(
+            div[:, :, None] == offs[None, None, :], (one << rem)[:, :, None], 0
+        )
         # Reduce x to get a single int32_t bitpack.
         y = tl.reduce_or(x, axis=1)
-        bitmatrix_ptrs = bitmatrix + offsets_m[:,
-                                               None] * bm_cols + offs[None, :]
+        bitmatrix_ptrs = bitmatrix + offsets_m[:, None] * bm_cols + offs[None, :]
         tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)
 
 
@@ -77,15 +79,14 @@ def triton_kernel_moe_forward(
     topk: int,
     renormalize: bool,
     activation: str = "silu",
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
-
-    routing_data, gather_idx, scatter_idx = routing(gating_output,
-                                                    topk,
-                                                    sm_first=not renormalize)
+    routing_data, gather_idx, scatter_idx = routing(
+        gating_output, topk, sm_first=not renormalize
+    )
 
     return triton_kernel_fused_experts(
         None,
@@ -99,7 +100,8 @@ def triton_kernel_moe_forward(
         quant_config=quant_config,
         apply_router_weight_on_input=apply_router_weight_on_input,
         global_num_experts=global_num_experts,
-        expert_map=expert_map)
+        expert_map=expert_map,
+    )
 
 
 # This is a triton implementation of the fused_experts function
@@ -112,23 +114,21 @@ def triton_kernel_fused_experts(
     gather_indx,  # GatherIndx
     scatter_indx,  # ScatterIndx
     activation: str = "silu",
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     swiglu_alpha: float = 1.702,
     swiglu_limit: float = 7.0,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    a1q_scale: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    a1q_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
 
     # type check, uint8 means mxfp4
     assert hidden_states.dtype == torch.bfloat16
-    assert (quant_config.w1_bias is None
-            or quant_config.w1_bias.dtype == torch.float32)
-    assert (quant_config.w2_bias is None
-            or quant_config.w2_bias.dtype == torch.float32)
+    assert quant_config.w1_bias is None or quant_config.w1_bias.dtype == torch.float32
+    assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
 
     # Shape check, only check non-mxfp4
     assert hidden_states.shape[-1] == w1.shape[-2]
@@ -141,7 +141,9 @@ def triton_kernel_fused_experts(
 
     act = FusedActivation(
         FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
-        (swiglu_alpha, swiglu_limit), 2)
+        (swiglu_alpha, swiglu_limit),
+        2,
+    )
     gammas = routing_data.gate_scal if routing_data else None
 
     intermediate_cache1 = matmul_ogs(
@@ -152,7 +154,8 @@ def triton_kernel_fused_experts(
         gather_indx=gather_indx,
         precision_config=quant_config.w1_precision,
         gammas=gammas if apply_router_weight_on_input else None,
-        fused_activation=act)
+        fused_activation=act,
+    )
 
     intermediate_cache3 = matmul_ogs(
         intermediate_cache1,
@@ -172,7 +175,6 @@ def make_routing_data(
     topk_weights: torch.Tensor,
     num_local_experts: int,
 ) -> tuple["RoutingData", torch.Tensor, torch.Tensor]:
-
     topk_ids = topk_ids.to(torch.int16)
     topk_weights = topk_weights.to(torch.bfloat16)
 
@@ -182,11 +184,11 @@ def make_routing_data(
     BLOCK_SIZE_K = 32
 
     bm_cols = triton.cdiv(num_local_experts, BLOCK_SIZE_K)  # n_bitpacks
-    bitmatrix = torch.zeros((n_rows, bm_cols),
-                            dtype=torch.uint32,
-                            device=topk_ids.device)
+    bitmatrix = torch.zeros(
+        (n_rows, bm_cols), dtype=torch.uint32, device=topk_ids.device
+    )
 
-    grid = (triton.cdiv(n_rows, BLOCK_SIZE_M), )
+    grid = (triton.cdiv(n_rows, BLOCK_SIZE_M),)
     pack_bitmatrix[grid](
         bitmatrix,
         topk_ids,
@@ -199,21 +201,20 @@ def make_routing_data(
 
     bitmatrix_shape = [n_rows, bm_cols * 32]
     bitmatrix_shape_max = [n_rows, None]
-    bitmatrix = Bitmatrix(bitmatrix,
-                          shape=bitmatrix_shape,
-                          shape_max=bitmatrix_shape_max,
-                          scratchpad=None)
+    bitmatrix = Bitmatrix(
+        bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
+    )
 
     # matmul_ogs expects invalid topk_weights to be -1s
     topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
     routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
-        bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk)
+        bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
+    )
 
     return routing_data, gather_indx, scatter_indx
 
 
 class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(self, quant_config: FusedMoEQuantConfig):
         super().__init__(quant_config)
 
@@ -234,7 +235,6 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
 
 class OAITritonExperts(BaseOAITritonExperts):
-
     def __init__(self, quant_config: FusedMoEQuantConfig):
         # TODO (varun) : Enable activation quantization
         assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16"
@@ -242,24 +242,31 @@ class OAITritonExperts(BaseOAITritonExperts):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
 
     def supports_chunking(self) -> bool:
         return True
 
     def workspace_shapes(
-        self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int,
-        topk: int, global_num_experts: int, local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata]
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # workspace are allocated inside the kernel
         workspace1 = (M, K)
         workspace2 = (0, 0)
         output = (M, K)
-        return (workspace1, workspace2, output, a.dtype)
+        return (workspace1, workspace2, output)
 
     def apply(
         self,
@@ -271,12 +278,12 @@ class OAITritonExperts(BaseOAITritonExperts):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         if expert_map is not None:
@@ -287,7 +294,8 @@ class OAITritonExperts(BaseOAITritonExperts):
             global_num_experts = local_num_experts
 
         routing_data, gather_indx, scatter_indx = self._make_routing_data(
-            topk_ids, topk_weights, local_num_experts)
+            topk_ids, topk_weights, local_num_experts
+        )
 
         experts_output = triton_kernel_fused_experts(
             None,
@@ -302,6 +310,7 @@ class OAITritonExperts(BaseOAITritonExperts):
             apply_router_weight_on_input=False,
             global_num_experts=local_num_experts,
             expert_map=None,  # applied already
-            a1q_scale=a1q_scale)
+            a1q_scale=a1q_scale,
+        )
 
         output.copy_(experts_output, non_blocking=True)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 19de67f342d6c3de7a1534543a7ec03f3748372d..97a269d0a68226e88b545262c4158dac9087dce9 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -2,88 +2,98 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-
-from abc import abstractmethod
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from contextlib import nullcontext
 from enum import Enum
-from typing import Callable, Literal, Optional, Union, get_args, overload
+from functools import partial
+from typing import Literal, get_args, overload
 
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import UninitializedParameter
 
 import vllm.envs as envs
-from vllm.config import get_current_vllm_config
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.parallel import ExpertPlacementStrategy
-from vllm.distributed import (get_dp_group, get_ep_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_dp_group,
+    get_ep_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.distributed.eplb.eplb_state import EplbState
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
-# yapf: disable
 from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEConfig, FusedMoEParallelConfig,
-    FusedMoEQuantConfig, biased_moe_quant_config)
-from vllm.model_executor.layers.fused_moe.fused_moe import (
-    zero_experts_compute_triton)
-# yapf: enable
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+    RoutingMethodType,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEActivationFormat, FusedMoEModularKernel,
-    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize,
+)
 # from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-#     is_rocm_aiter_moe_enabled)
-from vllm.model_executor.layers.fused_moe.routing_simulator import (
-    RoutingSimulator)
+#     init_aiter_topK_meta_data,
+# )
+from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.utils import set_weight_attrs
+    QuantizationConfig,
+)
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    is_flashinfer_supporting_global_sf,
+)
 from vllm.platforms import current_platform
-from vllm.platforms.interface import CpuArchEnum
-from vllm.utils import (cdiv, direct_register_custom_op, has_deep_ep, has_pplx,
-                        round_up)
-from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.torch_utils import (
+    aux_stream,
+    current_stream,
+    direct_register_custom_op,
+)
 from vllm.v1.worker.ubatching import dbo_current_ubatch_id
 
 if current_platform.is_cuda_alike():
-    from .fused_batched_moe import BatchedTritonExperts
-    from .fused_moe import (TritonExperts, eplb_map_to_physical_and_record,
-                            fused_experts)
-    if has_pplx():
-        from .pplx_prepare_finalize import (PplxPrepareAndFinalize,
-                                            pplx_hidden_dim_scale_bytes)
-    if has_deep_ep():
-        from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
-        from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE,
-                                                 DeepEPLLPrepareAndFinalize)
+    from .fused_moe import eplb_map_to_physical_and_record, fused_experts
 else:
     fused_experts = None  # type: ignore
-    FusedMoEPermuteExpertsUnpermute = None  # type: ignore
-    FusedMoEPrepareAndFinalize = None  # type: ignore
+    FusedMoEPermuteExpertsUnpermute = object  # type: ignore
+    FusedMoEPrepareAndFinalize = object  # type: ignore
 
     def _eplb_map_to_physical_and_record(
-            topk_ids: torch.Tensor, expert_load_view: torch.Tensor,
-            logical_to_physical_map: torch.Tensor,
-            logical_replica_count: torch.Tensor,
-            indices_type: Optional[torch.dtype]) -> torch.Tensor:
+        topk_ids: torch.Tensor,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+        indices_type: torch.dtype | None,
+    ) -> torch.Tensor:
         # CPU fallback: no EPLB so just return as is
         return topk_ids
 
     eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record
-
-# if is_rocm_aiter_moe_enabled():
-#     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-#         rocm_aiter_grouped_topk as grouped_topk)
-# else:
 from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
+from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
+    rocm_aiter_grouped_topk,
+)
 
 if current_platform.is_tpu():
     from .moe_pallas import fused_moe as fused_moe_pallas
 else:
     fused_moe_pallas = None  # type: ignore
 
+from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
+    FusedMoEMethodBase,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
+    FusedMoEModularMethod,
+)
+from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
+    UnquantizedFusedMoEMethod,
+)
+
 logger = init_logger(__name__)
 
 
@@ -94,839 +104,166 @@ class FusedMoeWeightScaleSupported(Enum):
     BLOCK = "block"
 
 
-class FusedMoEMethodBase(QuantizeMethodBase):
-
-    def __init__(self, moe: FusedMoEConfig):
-        super().__init__()
-        self.moe = moe
-        self.moe_quant_config: Optional[FusedMoEQuantConfig] = None
-        self.fused_experts: Optional[FusedMoEModularKernel] = None
-        self.topk_indices_dtype = None
-
-    @abstractmethod
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-        raise NotImplementedError
-
-    def uses_weight_scale_2_pattern(self) -> bool:
-        """
-        Returns True if this quantization method uses 'weight_scale_2' pattern
-        for per-tensor weight scales (e.g., FP4 variants), False otherwise.
-
-        This method should be overridden by subclasses that use the
-        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
-        """
-        return False
-
-    @staticmethod
-    def _maybe_make_prepare_finalize(
-        moe: FusedMoEConfig,
-        quant_config: Optional[FusedMoEQuantConfig],
-    ) -> Optional[FusedMoEPrepareAndFinalize]:
-        all2all_manager = get_ep_group().device_communicator.all2all_manager
-        assert all2all_manager is not None
-
-        prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None
-
-        # TODO: could allow this now
-        assert not moe.use_flashinfer_cutlass_kernels, \
-            "Must be created in modelopt.py"
-
-        if moe.use_pplx_kernels:
-            assert quant_config is not None
-
-            hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
-                moe.max_num_tokens,
-                moe.hidden_dim,
-                moe.in_dtype,
-                quant_config.quant_dtype,
-                per_act_token_quant=quant_config.per_act_token_quant,
-                block_shape=quant_config.block_shape,
-            )
-
-            all_to_all_args = dict(
-                max_num_tokens=moe.max_num_tokens,
-                num_experts=moe.num_experts,
-                experts_per_token=moe.experts_per_token,  # topk
-                rank=all2all_manager.rank,
-                world_size=all2all_manager.world_size,
-                # dp_size actually means tp_size, bug in pplx kernels
-                dp_size=all2all_manager.tp_group.world_size,
-                hidden_dim=moe.hidden_dim,
-                hidden_dim_bytes=hidden_dim_bytes,
-                hidden_dim_scale_bytes=hidden_scale_bytes,
-            )
-
-            num_dispatchers = (all2all_manager.world_size //
-                               all2all_manager.tp_group.world_size)
-
-            # Intranode pplx a2a takes a group name while internode does not.
-            if not all2all_manager.internode:
-                all_to_all_args[
-                    "group_name"] = all2all_manager.cpu_group.group_name
-
-            handle = all2all_manager.get_handle(all_to_all_args)
-
-            prepare_finalize = PplxPrepareAndFinalize(
-                handle,
-                max_num_tokens=moe.max_num_tokens,
-                num_local_experts=moe.num_local_experts,
-                num_dispatchers=num_dispatchers,
-            )
-        elif moe.use_deepep_ht_kernels:
-            assert moe.dp_size == all2all_manager.dp_world_size
-
-            all_to_all_args = dict()
-            handle = all2all_manager.get_handle(all_to_all_args)
-            prepare_finalize = DeepEPHTPrepareAndFinalize(
-                handle,
-                num_dispatchers=all2all_manager.world_size,
-                dp_size=all2all_manager.dp_world_size,
-                rank_expert_offset=all2all_manager.rank *
-                moe.num_local_experts,
-            )
-
-        elif moe.use_deepep_ll_kernels:
-            assert quant_config is not None
-            all_to_all_args = dict(
-                max_num_tokens_per_dp_rank=moe.max_num_tokens,
-                token_hidden_size=moe.hidden_dim,
-                num_ep_ranks=all2all_manager.world_size,
-                num_global_experts=moe.num_experts,
-                num_local_experts=moe.num_experts //
-                all2all_manager.world_size)
-            handle = all2all_manager.get_handle(all_to_all_args)
-
-            # Note: We may want to use FP8 dispatch just to reduce
-            # data movement.
-            use_fp8_dispatch = (
-                quant_config.quant_dtype == current_platform.fp8_dtype()
-                and quant_config.block_shape == DEEPEP_QUANT_BLOCK_SHAPE)
-
-            prepare_finalize = DeepEPLLPrepareAndFinalize(
-                handle,
-                max_tokens_per_rank=moe.max_num_tokens,
-                num_dispatchers=all2all_manager.world_size,
-                use_fp8_dispatch=use_fp8_dispatch,
-            )
-
-        return prepare_finalize
-
-    def maybe_make_prepare_finalize(
-            self) -> Optional[FusedMoEPrepareAndFinalize]:
-        if self.moe.moe_parallel_config.use_all2all_kernels:
-            return FusedMoEMethodBase._maybe_make_prepare_finalize(
-                self.moe, self.moe_quant_config)
-        else:
-            return None
-
-    # Note: init_prepare_finalize should only be called by
-    # prepare_communication_buffer_for_model.
-    def init_prepare_finalize(self, layer: torch.nn.Module):
-        assert self.moe is not None
-
-        # We must get the quant config here so that the layer is
-        # completely initialized, i.e. all weights loaded and post
-        # processed.
-        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
-
-        prepare_finalize = self.maybe_make_prepare_finalize()
-
-        if prepare_finalize is not None:
-            logger.debug("%s for %s(%s)", prepare_finalize.__class__.__name__,
-                         self, id(self))
-            assert self.topk_indices_dtype is None
-            assert self.fused_experts is None, \
-                f"Attempt to override experts for {id(self)}!"
-            self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
-            experts = self.select_gemm_impl(prepare_finalize, layer)
-            self.fused_experts = FusedMoEModularKernel(
-                prepare_finalize,
-                experts,
-                layer.shared_experts,
-            )
-
-    def select_gemm_impl(
-        self,
-        prepare_finalize: FusedMoEPrepareAndFinalize,
-        layer: torch.nn.Module,
-    ) -> FusedMoEPermuteExpertsUnpermute:
-        # based on the all2all implementation, select the appropriate
-        # gemm implementation
-        raise NotImplementedError(
-            f"{self.__class__.__name__} must select appropriate gemm "
-            "implementation based on the prepare_finalize")
-
-    @abstractmethod
-    def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        raise NotImplementedError
-
-    @abstractmethod
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        raise NotImplementedError
-
-
-@CustomOp.register("unquantized_fused_moe")
-class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
-    """MoE method without quantization."""
-
-    def __init__(self, moe: FusedMoEConfig):
-        super().__init__(moe)
-        # self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
-        self.rocm_aiter_moe_enabled = False
-        if self.rocm_aiter_moe_enabled:
-            from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
-            self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
-        else:
-            self.rocm_aiter_fused_experts = None  # type: ignore
-
-        # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
-        self.flashinfer_cutlass_moe_enabled = (
-            has_flashinfer_cutlass_fused_moe()
-            and envs.VLLM_USE_FLASHINFER_MOE_FP16
-            and self.moe.moe_parallel_config.use_ep
-            and self.moe.moe_parallel_config.dp_size == 1
-            and current_platform.get_device_capability()[0] >= 9)
-        if self.flashinfer_cutlass_moe_enabled:
-            logger.info_once(
-                "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
-            )
-            from functools import partial
-
-            from .flashinfer_cutlass_moe import flashinfer_cutlass_moe
-            self.flashinfer_cutlass_moe = partial(
-                flashinfer_cutlass_moe,
-                quant_config=FUSED_MOE_UNQUANTIZED_CONFIG,
-                tp_rank=self.moe.moe_parallel_config.tp_rank,
-                tp_size=self.moe.moe_parallel_config.tp_size,
-                ep_rank=self.moe.moe_parallel_config.ep_rank,
-                ep_size=self.moe.moe_parallel_config.ep_size)
-        else:
-            if (self.moe.moe_parallel_config.use_ep
-                    and self.moe.moe_parallel_config.dp_size == 1):
-                logger.info_once(
-                    "FlashInfer CUTLASS MoE is available for EP"
-                    " but not enabled, consider setting"
-                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.")
-            elif self.moe.moe_parallel_config.dp_size > 1:
-                logger.info_once(
-                    "FlashInfer CUTLASS MoE is currently not available for DP."
-                )
-            self.flashinfer_cutlass_moe = None  # type: ignore
-
-    def maybe_make_prepare_finalize(
-            self) -> Optional[FusedMoEPrepareAndFinalize]:
-        if self.rocm_aiter_moe_enabled:
-            return None
-        else:
-            return super().maybe_make_prepare_finalize()
-
-    def select_gemm_impl(
-        self,
-        prepare_finalize: FusedMoEPrepareAndFinalize,
-        layer: torch.nn.Module,
-    ) -> FusedMoEPermuteExpertsUnpermute:
-        assert self.moe_quant_config is not None
-        if (prepare_finalize.activation_format ==
-                FusedMoEActivationFormat.BatchedExperts):
-            logger.debug("BatchedTritonExperts %s", self.moe)
-            return BatchedTritonExperts(
-                max_num_tokens=self.moe.max_num_tokens,
-                num_dispatchers=prepare_finalize.num_dispatchers(),
-                quant_config=self.moe_quant_config,
-            )
-        else:
-            logger.debug("TritonExperts %s", self.moe)
-            return TritonExperts(self.moe_quant_config)
-
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, use_nn_moe: bool,
-                       **extra_weight_attrs):
-        # Fused gate_up_proj (column parallel)
-        if not use_nn_moe:
-            w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                        2 * intermediate_size_per_partition,
-                                                        hidden_size,
-                                                        dtype=params_dtype),
-                                            requires_grad=False)
-        else:
-            w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                        hidden_size,
-                                                        2 * intermediate_size_per_partition,
-                                                        dtype=params_dtype),
-                                            requires_grad=False)
-
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-        if self.moe.has_bias:
-            w13_bias = torch.nn.Parameter(torch.zeros(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                dtype=params_dtype),
-                                          requires_grad=False)
-            layer.register_parameter("w13_bias", w13_bias)
-            set_weight_attrs(w13_bias, extra_weight_attrs)
-        # down_proj (row parallel)
-        if not use_nn_moe:
-            w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                       hidden_size,
-                                                       intermediate_size_per_partition,
-                                                       dtype=params_dtype),
-                                                       requires_grad=False)
-        else:
-            w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                       intermediate_size_per_partition,
-                                                       hidden_size,
-                                                       dtype=params_dtype),
-                                                       requires_grad=False)
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-        if self.moe.has_bias:
-            w2_bias = torch.nn.Parameter(torch.zeros(num_experts,
-                                                     hidden_size,
-                                                     dtype=params_dtype),
-                                         requires_grad=False)
-            layer.register_parameter("w2_bias", w2_bias)
-            set_weight_attrs(w2_bias, extra_weight_attrs)
-
-    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
-        # Pad the weight tensor. This is an optimization on ROCm platform, which
-        # can benefit from tensors located far enough from one another in memory
-        if (envs.VLLM_ROCM_MOE_PADDING and current_platform.is_rocm()
-                and weight.stride(-1) == 1
-                and (weight.stride(-2) * weight.element_size()) % 512 == 0):
-            num_pad = 256 // weight.element_size()
-            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
-
-        return weight
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        super().process_weights_after_loading(layer)
-
-        # Padding the weight for better performance on ROCm
-        layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
-        layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
-        # Lazy import to avoid importing triton.
-        # from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-        #     shuffle_weights)
-
-        # if self.rocm_aiter_moe_enabled:
-        #     shuffled_w13, shuffled_w2 = shuffle_weights(
-        #         layer.w13_weight.data, layer.w2_weight.data)
-
-        #     layer.w13_weight.data = shuffled_w13
-        #     layer.w2_weight.data = shuffled_w2
-
-        if self.flashinfer_cutlass_moe_enabled:
-            # Swap halves to arrange as [w3; w1] (kernel expectation)
-            w1_w, w3_w = torch.chunk(layer.w13_weight.data, 2, dim=1)
-            w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
-            layer.w13_weight.data = w13_weight_swapped.contiguous()
-
-        if current_platform.is_xpu():
-            import intel_extension_for_pytorch as ipex
-            layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
-                layer.w13_weight,
-                layer.w2_weight,
-                use_prepack=True,
-            )
-        elif current_platform.is_cpu():
-            from vllm.model_executor.layers.fused_moe import cpu_fused_moe
-            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
-                from vllm.model_executor.layers.utils import (
-                    check_cpu_sgl_kernel)
-                dtype_w13 = layer.w13_weight.dtype
-                _, n_w13, k_w13 = layer.w13_weight.size()
-                dtype_w2 = layer.w2_weight.dtype
-                _, n_w2, k_w2 = layer.w2_weight.size()
-                if (envs.VLLM_CPU_SGL_KERNEL
-                        and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
-                        and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)):
-                    packed_w13_weight = torch.ops._C.convert_weight_packed(
-                        layer.w13_weight)
-                    assert packed_w13_weight.size() == layer.w13_weight.size()
-                    layer.w13_weight.copy_(packed_w13_weight)
-                    del packed_w13_weight
-                    packed_w2_weight = torch.ops._C.convert_weight_packed(
-                        layer.w2_weight)
-                    assert packed_w2_weight.size() == layer.w2_weight.size()
-                    layer.w2_weight.copy_(packed_w2_weight)
-                    layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer)
-                else:
-                    layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer)
-            else:
-                layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-        use_nn_moe: Optional[bool] = False,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        if enable_eplb:
-            assert expert_load_view is not None
-            assert logical_to_physical_map is not None
-            assert logical_replica_count is not None
-            assert isinstance(layer, FusedMoE)
-
-        return self.forward(
-            x=x,
-            layer=layer,
-            router_logits=router_logits,
-            top_k=top_k,
-            renormalize=renormalize,
-            use_grouped_topk=use_grouped_topk,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            enable_eplb=enable_eplb,
-            expert_load_view=expert_load_view,
-            logical_to_physical_map=logical_to_physical_map,
-            logical_replica_count=logical_replica_count,
-            use_nn_moe=use_nn_moe,
-        )
-
-    def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        if self.moe.has_bias:
-            return biased_moe_quant_config(
-                layer.w13_bias,
-                layer.w2_bias,
-            )
-        else:
-            return FUSED_MOE_UNQUANTIZED_CONFIG
-
-    def forward_cuda(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-        use_nn_moe: Optional[bool] = False,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-
-        zero_expert_num = getattr(layer, 'zero_expert_num', 0)
-        zero_expert_type = getattr(layer, 'zero_expert_type', None)
-
-        topk_weights, topk_ids, zero_expert_result = FusedMoE.select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype,
-            enable_eplb=enable_eplb,
-            expert_map=expert_map,
-            expert_load_view=expert_load_view,
-            logical_to_physical_map=logical_to_physical_map,
-            logical_replica_count=logical_replica_count,
-            global_num_experts=global_num_experts,
-            zero_expert_num=zero_expert_num,
-            zero_expert_type=zero_expert_type)
-
-        if self.rocm_aiter_moe_enabled:
-            assert self.fused_experts is None
-            result = self.rocm_aiter_fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                expert_map=expert_map,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input)
-        elif self.flashinfer_cutlass_moe_enabled:
-            return self.flashinfer_cutlass_moe(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input)
-        elif self.fused_experts is not None:
-            if self.moe.has_bias:
-                raise ValueError(
-                    "FusedMoEModularKernel does not support bias.")
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                use_nn_moe=use_nn_moe,
-            )
-        else:
-            assert fused_experts is not None
-            result = fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-            )
-
-        if zero_expert_num != 0 and zero_expert_type is not None:
-            assert not isinstance(result, tuple), \
-                "Shared + zero experts are mutually exclusive not yet supported"
-            return result, zero_expert_result
-        else:
-            return result
-
-    def forward_cpu(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-        use_nn_moe: Optional[bool] = False,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        if enable_eplb is not False or expert_load_view is not None or \
-                logical_to_physical_map is not None or \
-                logical_replica_count is not None:
-            raise NotImplementedError("Expert load balancing is not supported "
-                                      "for CPU.")
-        return layer.cpu_fused_moe(
-            layer,
-            x,
-            use_grouped_topk,
-            top_k,
-            router_logits,
-            renormalize,
-            topk_group,
-            num_expert_group,
-            global_num_experts,
-            expert_map,
-            custom_routing_function,
-            scoring_func,
-            routed_scaling_factor,
-            e_score_correction_bias,
-            apply_router_weight_on_input,
-            activation,
-        )
-
-    def forward_xpu(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        if enable_eplb is not False or expert_load_view is not None or \
-                logical_to_physical_map is not None or \
-                logical_replica_count is not None:
-            raise NotImplementedError("Expert load balancing is not supported "
-                                      "for XPU.")
-        assert custom_routing_function is None
-        return layer.ipex_fusion(
-            x,
-            use_grouped_topk,
-            top_k,
-            router_logits,
-            renormalize,
-            topk_group,
-            num_expert_group,
-        )
-
-    def forward_tpu(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        use_nn_moe: Optional[bool] = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert not use_grouped_topk
-        assert num_expert_group is None
-        assert topk_group is None
-        assert custom_routing_function is None
-        assert apply_router_weight_on_input is False
-        if scoring_func != "softmax":
-            raise NotImplementedError(
-                "Only softmax scoring function is supported for TPU.")
-        if e_score_correction_bias is not None:
-            raise NotImplementedError(
-                "Expert score correction bias is not supported for TPU.")
-        assert activation == "silu", f"{activation} is not supported for TPU."
-        assert routed_scaling_factor == 1.0, \
-            f"routed_scaling_factor {routed_scaling_factor} is not supported " \
-            f"for TPU."
-        if enable_eplb is not False or expert_load_view is not None or \
-                logical_to_physical_map is not None or \
-                logical_replica_count is not None:
-            raise NotImplementedError("Expert load balancing is not supported "
-                                      "for TPU.")
-        return fused_moe_pallas(hidden_states=x,
-                                w1=layer.w13_weight,
-                                w2=layer.w2_weight,
-                                topk=top_k,
-                                gating_output=router_logits,
-                                global_num_experts=global_num_experts,
-                                expert_map=expert_map,
-                                renormalize=renormalize)
-
-    if current_platform.is_tpu():
-        forward_native = forward_tpu
-    elif current_platform.is_cpu():
-        forward_native = forward_cpu
-    elif current_platform.is_xpu():
-        forward_native = forward_xpu
-    else:
-        forward_native = forward_cuda
-
-
 def determine_expert_map(
     ep_size: int,
     ep_rank: int,
     global_num_experts: int,
     expert_placement_strategy: ExpertPlacementStrategy = "linear",
-) -> tuple[int, Optional[torch.Tensor]]:
+    num_fused_shared_experts: int = 0,
+    return_expert_mask: bool = False,
+) -> tuple[int, torch.Tensor | None, torch.Tensor | None]:
     """
-        Calculates how many experts should be assigned to each rank for EP and
-        creates a mapping from global to local expert index. Experts are
-        distributed evenly across ranks. Any remaining are assigned to the
-        last rank.
-
-        Args:
-            ep_size: The size of the expert parallel group
-            ep_rank: The rank of the current process in the expert parallel
-                group
-            global_num_experts: The total number of experts in the model.
-            expert_placement_strategy: The expert placement strategy.
+    Calculates how many experts should be assigned to each rank for EP and
+    creates a mapping from global to local expert index. Experts are
+    distributed evenly across ranks. Any remaining are assigned to the
+    last rank.
 
-        Returns:
-            tuple[int, Optional[torch.Tensor]]: A tuple containing:
-                - local_num_experts (int): The number of experts assigned
-                    to the current rank.
-                - expert_map (Optional[torch.Tensor]): A tensor of shape
-                    (global_num_experts,) mapping from global to local index.
-                    Contains -1 for experts not assigned to the current rank.
-                    Returns None if ep_size is 1.
-        """
+    Args:
+        ep_size: The size of the expert parallel group
+        ep_rank: The rank of the current process in the expert parallel
+            group
+        global_num_experts: The total number of experts in the model.
+        expert_placement_strategy: The expert placement strategy.
+
+    Returns:
+        tuple[int, Optional[torch.Tensor]]: A tuple containing:
+            - local_num_experts (int): The number of experts assigned
+                to the current rank.
+            - expert_map (Optional[torch.Tensor]): A tensor of shape
+                (global_num_experts,) mapping from global to local index.
+                Contains -1 for experts not assigned to the current rank.
+                Returns None if ep_size is 1.
+            - expert_mask (Optional[torch.Tensor]): A tensor of shape
+                (global_num_experts + num_fused_shared_experts + 1,)
+                containing 1 for experts assigned to the current rank
+                and 0 for sentinel.
+                Returns None if ep_size is 1.
+                Used only when AITER MOE is enabled.
+    """
     assert ep_size > 0
     if ep_size == 1:
-        return (global_num_experts, None)
+        return (global_num_experts, None, None)
 
     # Distribute experts as evenly as possible to each rank.
     base_experts = global_num_experts // ep_size
     remainder = global_num_experts % ep_size
-    if ep_rank < remainder:
-        local_num_experts = base_experts + 1
-    else:
-        local_num_experts = base_experts
+    local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts
 
     # Create a tensor of size num_experts filled with -1
-    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
+    expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32)
     # Create an expert map for the local experts
     if expert_placement_strategy == "linear":
         start_idx = ep_rank * base_experts + min(ep_rank, remainder)
-        expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
-            0, local_num_experts, dtype=torch.int32)
+        expert_map[start_idx : start_idx + local_num_experts] = torch.arange(
+            0, local_num_experts, dtype=torch.int32
+        )
     elif expert_placement_strategy == "round_robin":
-        local_log_experts = torch.arange(ep_rank,
-                                         global_num_experts,
-                                         ep_size,
-                                         dtype=torch.int32)
-
-        expert_map[local_log_experts] = torch.arange(0,
-                                                     local_num_experts,
-                                                     dtype=torch.int32)
+        local_log_experts = torch.arange(
+            ep_rank, global_num_experts, ep_size, dtype=torch.int32
+        )
+
+        expert_map[local_log_experts] = torch.arange(
+            0, local_num_experts, dtype=torch.int32
+        )
     else:
-        raise ValueError("Unsupported expert placement strategy "
-                         f"'{expert_placement_strategy}', expected one of "
-                         f"{get_args(ExpertPlacementStrategy)}")
-    return (local_num_experts, expert_map)
+        raise ValueError(
+            "Unsupported expert placement strategy "
+            f"'{expert_placement_strategy}', expected one of "
+            f"{get_args(ExpertPlacementStrategy)}"
+        )
+
+    expert_mask = None
+    if return_expert_mask:
+        expert_mask = torch.ones(
+            (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32
+        )
+        expert_mask[-1] = 0
+        expert_mask[:global_num_experts] = expert_map > -1
+        expert_map = torch.cat(
+            (
+                expert_map,
+                torch.tensor(
+                    [local_num_experts + i for i in range(num_fused_shared_experts)],
+                    dtype=torch.int32,
+                ),
+            ),
+            dim=0,
+        )
+
+    return (local_num_experts, expert_map, expert_mask)
 
 
 def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
     """
-        Compresses the expert map by removing any -1 entries.
+    Compresses the expert map by removing any -1 entries.
 
-        Args:
-            expert_map (torch.Tensor): A tensor of shape (global_num_experts,)
-                mapping from global to local index. Contains -1 for experts not
-                assigned to the current rank.
+    Args:
+        expert_map (torch.Tensor): A tensor of shape (global_num_experts,)
+            mapping from global to local index. Contains -1 for experts not
+            assigned to the current rank.
 
-        Returns:
-            str: A string mapping from local to global index.
-                Using str to support hashing for logging once only.
-        """
+    Returns:
+        str: A string mapping from local to global index.
+            Using str to support hashing for logging once only.
+    """
     global_indices = torch.where(expert_map != -1)[0]
     local_indices = expert_map[global_indices]
     return ", ".join(
         f"{local_index.item()}->{global_index.item()}"
-        for local_index, global_index in zip(local_indices, global_indices))
+        for local_index, global_index in zip(local_indices, global_indices)
+    )
 
 
 def maybe_roundup_hidden_size(
-        hidden_size: int, act_dtype: torch.dtype,
-        quant_config: Optional[QuantizationConfig],
-        moe_parallel_config: FusedMoEParallelConfig) -> int:
+    hidden_size: int,
+    act_dtype: torch.dtype,
+    quant_config: QuantizationConfig | None,
+    moe_parallel_config: FusedMoEParallelConfig,
+    is_lora_enabled: bool,
+) -> int:
     """
     Given layer hidden size and MoE configurations, round up hidden_size
     if necessary.
-    
+
     Args:
         hidden_size: Layer hidden-size
         act_dtype: Data type of the layer activations.
         quant_config: Fused MoE quantization configuration.
         moe_parallel_config: Fused MoE parallelization strategy configuration.
+        is_lora_enabled: True if the engine is enabled with LoRA. This
+            is used in the case of mxfp4 quantization in selecting the
+            MxFP4Backend.
 
     Return:
         Rounded up hidden_size if rounding up is required based on the configs.
         Original hidden size otherwise.
     """
+    from vllm.model_executor.layers.fused_moe.all2all_utils import (
+        maybe_roundup_layer_hidden_size,
+    )
 
-    if (moe_parallel_config.use_deepep_ht_kernels):
-        hidden_size = (
-            DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size(
-                hidden_size, act_dtype))
+    hidden_size = maybe_roundup_layer_hidden_size(
+        hidden_size, act_dtype, moe_parallel_config
+    )
 
     # we are padding globally so EP buffer allocation works
     if quant_config and quant_config.get_name() == "mxfp4":
-
         from vllm.model_executor.layers.quantization.mxfp4 import (
-            Mxfp4Backend, get_mxfp4_backend)
-        current_mxfp4_backend = get_mxfp4_backend()
-        if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
-                or current_mxfp4_backend
-                == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS):
+            Mxfp4Backend,
+            get_mxfp4_backend,
+        )
+
+        current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled)
+        if (
+            current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
+            or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+        ):
             hidden_size = round_up(hidden_size, 128)
-        elif (current_platform.is_rocm() or current_mxfp4_backend
-              == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-              or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
+        elif (
+            current_platform.is_rocm()
+            or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+            or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
+        ):
             hidden_size = round_up(hidden_size, 256)
 
     return hidden_size
@@ -949,7 +286,7 @@ class FusedMoE(CustomOp):
         hidden_size: Input hidden state size of the transformer
         intermediate_size: Intermediate size of the experts
         params_dtype: Data type for the parameters.
-        reduce_results: Whether to all all_reduce on the output of the layer
+        reduce_results: Whether to all_reduce on the output of the layer
         renormalize: Whether to renormalize the logits in the fused_moe kernel
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.
@@ -961,36 +298,56 @@ class FusedMoE(CustomOp):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
         reduce_results: bool = False,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
-        topk_group: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
-        ep_size: Optional[int] = None,
-        dp_size: Optional[int] = None,
+        num_expert_group: int | None = None,
+        topk_group: int | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        ep_size: int | None = None,
+        dp_size: int | None = None,
         prefix: str = "",
-        custom_routing_function: Optional[Callable] = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        is_act_and_mul: bool = True,
         enable_eplb: bool = False,
         num_redundant_experts: int = 0,
         has_bias: bool = False,
         is_sequence_parallel=False,
-        zero_expert_num: Optional[int] = 0,
-        zero_expert_type: Optional[str] = None,
+        zero_expert_num: int | None = 0,
+        zero_expert_type: str | None = None,
+        expert_mapping: list[tuple[str, str, int, str]] | None = None,
+        n_shared_experts: int | None = None,
+        routing_method_type: int | None = None,
     ):
         super().__init__()
+
+        # Allow disabling of the separate shared experts stream for
+        # debug purposes.
+        # TODO: Remove this after more extensive testings with TP/DP
+        # and other execution modes
+        if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
+            logger.info_once("Disabling MoE shared_experts cuda stream")
+            self.shared_experts_stream = None
+        else:
+            # TODO(rob): enable shared expert overlap with non-cuda.
+            # aux_stream() returns None on non-cuda platforms.
+            self.shared_experts_stream = aux_stream()
+            if self.shared_experts_stream is not None:
+                logger.info_once("Enabled separate cuda stream for MoE shared_experts")
+
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
 
         vllm_config = get_current_vllm_config()
+        self.vllm_config = vllm_config
 
         # FIXME (varun): We should have a better way of inferring the activation
         # datatype. This works for now as the tensor datatype entering the MoE
@@ -1002,28 +359,36 @@ class FusedMoE(CustomOp):
             # since model_config is not set in the pytest test.
             moe_in_dtype = params_dtype
 
-        tp_size_ = (tp_size if tp_size is not None else
-                    get_tensor_model_parallel_world_size())
-        dp_size_ = (dp_size
-                    if dp_size is not None else get_dp_group().world_size)
+        tp_size_ = (
+            tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
+        )
+        dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size
 
         self.is_sequence_parallel = is_sequence_parallel
         self.sp_size = tp_size_ if is_sequence_parallel else 1
 
-        self.moe_parallel_config: FusedMoEParallelConfig = (
-            FusedMoEParallelConfig.make(
-                tp_size_=tp_size_,
-                dp_size_=dp_size_,
-                vllm_parallel_config=vllm_config.parallel_config))
+        self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
+            tp_size_=tp_size_,
+            dp_size_=dp_size_,
+            vllm_parallel_config=vllm_config.parallel_config,
+        )
 
         self.global_num_experts = num_experts + num_redundant_experts
+        self.logical_num_experts = num_experts
         self.zero_expert_num = zero_expert_num
         self.zero_expert_type = zero_expert_type
 
+        # Expert mapping used in self.load_weights
+        self.expert_mapping = expert_mapping
+
         # Round up hidden size if needed.
-        hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype,
-                                                quant_config,
-                                                self.moe_parallel_config)
+        hidden_size = maybe_roundup_hidden_size(
+            hidden_size,
+            moe_in_dtype,
+            quant_config,
+            self.moe_parallel_config,
+            is_lora_enabled=self.vllm_config.lora_config is not None,
+        )
 
         # For smuggling this layer into the fused moe custom op
         compilation_config = vllm_config.compilation_config
@@ -1033,59 +398,99 @@ class FusedMoE(CustomOp):
         self.layer_name = prefix
 
         self.enable_eplb = enable_eplb
-        self.expert_load_view: Optional[torch.Tensor] = None
-        self.logical_to_physical_map: Optional[torch.Tensor] = None
-        self.logical_replica_count: Optional[torch.Tensor] = None
+        self.expert_load_view: torch.Tensor | None = None
+        self.logical_to_physical_map: torch.Tensor | None = None
+        self.logical_replica_count: torch.Tensor | None = None
+
+        # ROCm aiter shared experts fusion
+        self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        self.aiter_fmoe_shared_expert_enabled = (
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        )
+
+        self.num_fused_shared_experts = (
+            n_shared_experts
+            if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled
+            else 0
+        )
+        if (
+            not self.aiter_fmoe_shared_expert_enabled
+            and self.num_fused_shared_experts != 0
+        ):
+            raise ValueError(
+                "n_shared_experts is only supported on ROCm aiter when "
+                "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled"
+            )
 
         # Determine expert maps
         if self.use_ep:
             if self.enable_eplb:
-                assert self.global_num_experts % self.ep_size == 0, \
-                    "EPLB currently only supports even distribution of " \
+                assert self.global_num_experts % self.ep_size == 0, (
+                    "EPLB currently only supports even distribution of "
                     "experts across ranks."
+                )
             else:
-                assert num_redundant_experts == 0, \
+                assert num_redundant_experts == 0, (
                     "Redundant experts are only supported with EPLB."
+                )
 
             expert_placement_strategy = (
-                vllm_config.parallel_config.expert_placement_strategy)
+                vllm_config.parallel_config.expert_placement_strategy
+            )
             if expert_placement_strategy == "round_robin":
                 # TODO(Bruce): will support round robin expert placement with
                 # EPLB enabled in the future.
-                round_robin_supported = ((num_expert_group is not None
-                                          and num_expert_group > 1)
-                                         and num_redundant_experts == 0
-                                         and not self.enable_eplb)
+                round_robin_supported = (
+                    (num_expert_group is not None and num_expert_group > 1)
+                    and num_redundant_experts == 0
+                    and not self.enable_eplb
+                )
 
                 if not round_robin_supported:
                     logger.warning(
                         "Round-robin expert placement is only supported for "
                         "models with multiple expert groups and no redundant "
-                        "experts. Falling back to linear expert placement.")
+                        "experts. Falling back to linear expert placement."
+                    )
                     expert_placement_strategy = "linear"
 
-            self.expert_map: Optional[torch.Tensor]
-            local_num_experts, expert_map = determine_expert_map(
+            self.expert_map: torch.Tensor | None
+            local_num_experts, expert_map, expert_mask = determine_expert_map(
                 ep_size=self.ep_size,
                 ep_rank=self.ep_rank,
                 global_num_experts=self.global_num_experts,
                 expert_placement_strategy=expert_placement_strategy,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+                return_expert_mask=self.rocm_aiter_fmoe_enabled,
             )
             self.local_num_experts = local_num_experts
             self.register_buffer("expert_map", expert_map)
+            self.register_buffer("expert_mask", expert_mask)
             logger.info_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Expert "
                 "placement strategy: %s. Local/global"
                 " number of experts: %s/%s. Experts local to global index map:"
-                " %s.", self.ep_rank, self.ep_size, expert_placement_strategy,
-                self.local_num_experts, self.global_num_experts,
-                get_compressed_expert_map(self.expert_map))
+                " %s.",
+                self.ep_rank,
+                self.ep_size,
+                expert_placement_strategy,
+                self.local_num_experts,
+                self.global_num_experts,
+                get_compressed_expert_map(self.expert_map),
+            )
         else:
-            self.local_num_experts, self.expert_map = (self.global_num_experts,
-                                                       None)
+            self.local_num_experts, self.expert_map, self.expert_mask = (
+                self.global_num_experts,
+                None,
+                None,
+            )
 
         self.top_k = top_k
 
+        self._init_aiter_shared_experts_topK_buffer(
+            vllm_config=vllm_config, dp_size=dp_size_
+        )
+
         assert intermediate_size % self.tp_size == 0
         self.hidden_size = hidden_size
         self.intermediate_size_per_partition = intermediate_size // self.tp_size
@@ -1104,10 +509,29 @@ class FusedMoE(CustomOp):
         self.activation = activation
 
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
-            raise ValueError("Only softmax scoring function is supported for "
-                             "non-grouped topk.")
+            raise ValueError(
+                "Only softmax scoring function is supported for non-grouped topk."
+            )
 
-        moe = FusedMoEConfig(
+        # ToDo: Better logic to determine the routing method type
+        if routing_method_type is not None:
+            self.routing_method_type = routing_method_type
+        else:
+            if scoring_func == "sigmoid":
+                if self.use_grouped_topk:
+                    self.routing_method_type = RoutingMethodType.DeepSeekV3
+                elif self.top_k == 1:
+                    self.routing_method_type = RoutingMethodType.Llama4
+            elif self.scoring_func == "softmax":
+                self.routing_method_type = (
+                    RoutingMethodType.Renormalize
+                    if not self.renormalize
+                    else RoutingMethodType.RenormalizeNaive
+                )
+            else:
+                self.routing_method_type = RoutingMethodType.TopK
+
+        self.moe_config: FusedMoEConfig = FusedMoEConfig(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,
             hidden_dim=hidden_size,
@@ -1116,20 +540,59 @@ class FusedMoE(CustomOp):
             in_dtype=moe_in_dtype,
             max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
             has_bias=has_bias,
+            is_act_and_mul=is_act_and_mul,
+            is_lora_enabled=vllm_config.lora_config is not None,
         )
-        self.moe_config = moe
-        self.moe_quant_config: Optional[FusedMoEQuantConfig] = None
+
         self.quant_config = quant_config
 
+        def _get_quant_method() -> FusedMoEMethodBase:
+            """
+            Helper method to ensure self.quant_method is never None and
+            of the proper type.
+            """
+            quant_method = None
+            if self.quant_config is not None:
+                quant_method = self.quant_config.get_quant_method(self, prefix)
+            if quant_method is None:
+                quant_method = UnquantizedFusedMoEMethod(self.moe_config)
+            assert isinstance(quant_method, FusedMoEMethodBase)
+            return quant_method
+
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
-        quant_method: Optional[QuantizeMethodBase] = None
-        quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
-                        else quant_config.get_quant_method(self, prefix))
+        self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
-        assert quant_method is not None
-        assert isinstance(quant_method, FusedMoEMethodBase)
-        self.quant_method = quant_method
+        if not self.moe_config.is_act_and_mul:
+            # Avoid circular import
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8MoEMethod,
+            )
+
+            if not isinstance(
+                self.quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
+            ):
+                raise NotImplementedError(
+                    "is_act_and_mul=False is supported only for unquantized "
+                    "and ModelOpt FP8 moe for now"
+                )
+            if not current_platform.is_cuda():
+                raise NotImplementedError(
+                    "is_act_and_mul=False is supported only for CUDA for now"
+                )
+
+        if self.enable_eplb and not self.quant_method.supports_eplb:
+            # TODO: Add support for additional quantization methods.
+            # The implementation for other quantization methods does not
+            # contain essential differences, but the current quant API
+            # design causes duplicated work when extending to new
+            # quantization methods, so I'm leaving it for now.
+            # If you plan to add support for more quantization methods,
+            # please refer to the implementation in `Fp8MoEMethod`.
+            raise NotImplementedError(
+                f"EPLB is not supported {self.quant_method.__class__.__name__}. "
+                "EPLB is only supported for FP8 quantization for now."
+            )
         
         if quant_config is None:
             # Not considering quant for now, temporarily
@@ -1138,73 +601,50 @@ class FusedMoE(CustomOp):
         else:
             self.use_nn_moe = False
 
-        if self.enable_eplb:
-            from vllm.model_executor.layers.quantization.fp8 import (
-                Fp8MoEMethod)
-            if not isinstance(quant_method,
-                              (Fp8MoEMethod, UnquantizedFusedMoEMethod)):
-                # TODO: Add support for additional quantization methods.
-                # The implementation for other quantization methods does not
-                # contain essential differences, but the current quant API
-                # design causes duplicated work when extending to new
-                # quantization methods, so I'm leaving it for now.
-                # If you plan to add support for more quantization methods,
-                # please refer to the implementation in `Fp8MoEMethod`.
-                raise NotImplementedError("EPLB is only supported for FP8 "
-                                          "quantization for now.")
-
         moe_quant_params = {
             "num_experts": self.local_num_experts,
             "hidden_size": hidden_size,
-            "intermediate_size_per_partition":
-            self.intermediate_size_per_partition,
+            "intermediate_size_per_partition": self.intermediate_size_per_partition,
             "params_dtype": params_dtype,
             "weight_loader": self.weight_loader,
+            "global_num_experts": self.global_num_experts,
             "use_nn_moe": self.use_nn_moe,
         }
         # need full intermediate size pre-sharding for WNA16 act order
-        if (self.quant_method.__class__.__name__
-                in ("GPTQMarlinMoEMethod",
-                    "CompressedTensorsWNA16MarlinMoEMethod",
-                    "CompressedTensorsWNA16MoEMethod")):
+        if self.quant_method.__class__.__name__ in (
+            "GPTQMarlinMoEMethod",
+            "CompressedTensorsWNA16MarlinMoEMethod",
+            "CompressedTensorsWNA16MoEMethod",
+        ):
             moe_quant_params["intermediate_size_full"] = intermediate_size
 
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
         # Chunked all2all staging tensor
-        self.batched_hidden_states: Optional[torch.Tensor] = None
-        self.batched_router_logits: Optional[torch.Tensor] = None
-
-        # TODO(bnell): flashinfer uses non-batched format.
-        # Does it really need a batched buffer?
-        if (self.moe_parallel_config.use_pplx_kernels
-                or self.moe_parallel_config.use_deepep_ll_kernels
-                or self.moe_config.use_flashinfer_cutlass_kernels):
-            if vllm_config.parallel_config.enable_dbo:
-                self.batched_hidden_states = torch.zeros(
-                    (2, moe.max_num_tokens, self.hidden_size),
-                    dtype=moe.in_dtype,
-                    device=torch.cuda.current_device())
-
-                # Note here we use `num_experts` which is logical expert count
-                self.batched_router_logits = torch.zeros(
-                    (2, moe.max_num_tokens, num_experts),
-                    dtype=moe.in_dtype,
-                    device=torch.cuda.current_device())
-            else:
-                self.batched_hidden_states = torch.zeros(
-                    (moe.max_num_tokens, self.hidden_size),
-                    dtype=moe.in_dtype,
-                    device=torch.cuda.current_device())
+        self.batched_hidden_states: torch.Tensor | None = None
+        self.batched_router_logits: torch.Tensor | None = None
+
+    # Note: maybe_init_modular_kernel should only be called by
+    # prepare_communication_buffer_for_model.
+    # This is called after all weight loading and post-processing, so it
+    # should be safe to swap out the quant_method.
+    def maybe_init_modular_kernel(self) -> None:
+        self.ensure_moe_quant_config_init()
+        prepare_finalize = self.quant_method.maybe_make_prepare_finalize()
+        if prepare_finalize is not None:
+            logger.debug(
+                "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
+            )
+            self.quant_method = FusedMoEModularMethod.make(
+                self, self.quant_method, prepare_finalize, self.shared_experts
+            )
 
-                # Note here we use `num_experts` which is logical expert count
-                self.batched_router_logits = torch.zeros(
-                    (moe.max_num_tokens, num_experts),
-                    dtype=moe.in_dtype,
-                    device=torch.cuda.current_device())
+    @property
+    def shared_experts(self) -> torch.nn.Module | None:
+        return None
 
     @property
-    def shared_experts(self) -> Optional[torch.nn.Module]:
+    def gate(self) -> torch.nn.Module | None:
         return None
 
     @property
@@ -1249,25 +689,56 @@ class FusedMoE(CustomOp):
 
     @property
     def use_flashinfer_cutlass_kernels(self):
-        return (self.moe_quant_config is not None
-                and self.moe_quant_config.quant_dtype == "nvfp4"
-                and self.moe_config.use_flashinfer_cutlass_kernels)
+        return (
+            self.moe_quant_config is not None
+            and self.moe_quant_config.quant_dtype == "nvfp4"
+            and self.moe_config.use_flashinfer_cutlass_kernels
+        )
+
+    @property
+    def use_marlin_kernels(self):
+        return getattr(self.quant_method, "use_marlin", False)
+
+    @property
+    def use_dp_chunking(self) -> bool:
+        return (
+            self.moe_parallel_config.use_pplx_kernels
+            or self.moe_parallel_config.use_deepep_ll_kernels
+            or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels)
+        )
+
+    @property
+    def is_internal_router(self) -> bool:
+        # By default, router/gate is called before FusedMoE forward pass
+        return False
 
     def update_expert_map(self):
         # ep_size and ep_rank should already be updated
         assert self.expert_map is not None
         with self.expert_map.device:
-            local_num_experts, expert_map = determine_expert_map(
+            local_num_experts, expert_map, expert_mask = determine_expert_map(
                 ep_size=self.ep_size,
                 ep_rank=self.ep_rank,
-                global_num_experts=self.global_num_experts)
+                global_num_experts=self.global_num_experts,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+                return_expert_mask=self.rocm_aiter_fmoe_enabled,
+            )
             self.local_num_experts = local_num_experts
             self.register_buffer("expert_map", expert_map)
+            self.register_buffer("expert_mask", expert_mask)
+            if self.aiter_fmoe_shared_expert_enabled:
+                self._init_aiter_shared_experts_topK_buffer(
+                    vllm_config=get_current_vllm_config(),
+                    dp_size=get_dp_group().world_size,
+                )
 
-    def _load_per_tensor_weight_scale(self, shard_id: str,
-                                      param: torch.nn.Parameter,
-                                      loaded_weight: torch.Tensor,
-                                      expert_id: int):
+    def _load_per_tensor_weight_scale(
+        self,
+        shard_id: str,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        expert_id: int,
+    ):
         param_data = param.data
         # for per tensor weight quantization
         if shard_id in ("w1", "w3"):
@@ -1279,25 +750,32 @@ class FusedMoE(CustomOp):
         elif shard_id == "w2":
             param_data[expert_id] = loaded_weight
 
-    def _load_combined_w13_weight_scale(self, shard_dim: int,
-                                        loaded_weight: torch.Tensor,
-                                        param: torch.Tensor, tp_rank: int):
+    def _load_combined_w13_weight_scale(
+        self,
+        shard_dim: int,
+        loaded_weight: torch.Tensor,
+        param: torch.Tensor,
+        tp_rank: int,
+    ):
         """
         Load w13 weight scales assuming that w1 weight scales and w3 weight
         scales are stored in the same loaded_weight tensor.
         """
         shard_size = param.shape[shard_dim]
-        loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
-                                             shard_size)
+        loaded_weight = loaded_weight.narrow(
+            shard_dim, shard_size * tp_rank, shard_size
+        )
         param.copy_(loaded_weight)
 
-    def _load_model_weight_or_group_weight_scale(self,
-                                                 shard_dim: int,
-                                                 expert_data: torch.Tensor,
-                                                 shard_id: str,
-                                                 loaded_weight: torch.Tensor,
-                                                 tp_rank: int,
-                                                 load_full_w2: bool = False):
+    def _load_model_weight_or_group_weight_scale(
+        self,
+        shard_dim: int,
+        expert_data: torch.Tensor,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        load_full_w2: bool = False,
+    ):
         """
         Load grouped weight scales for group quantization or model weights
             :param shard_dim: dimension to shard
@@ -1310,47 +788,61 @@ class FusedMoE(CustomOp):
         if shard_id == "w2":
             # In the case where we have actorder/g_idx, we do not partition the
             # w2 scales, as indicated by `load_full` argument, for all tp cases
-            self._load_w2(shard_dim=shard_dim,
-                          loaded_weight=loaded_weight,
-                          expert_data=expert_data,
-                          tp_rank=tp_rank,
-                          load_full=load_full_w2)
+            self._load_w2(
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+                load_full=load_full_w2,
+            )
         elif shard_id in ("w1", "w3"):
-            self._load_w13(shard_id=shard_id,
-                           shard_dim=shard_dim,
-                           loaded_weight=loaded_weight,
-                           expert_data=expert_data,
-                           tp_rank=tp_rank)
-
-    def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
-                                       shard_dim: int, shard_id: str,
-                                       loaded_weight: torch.Tensor,
-                                       tp_rank: int):
+            self._load_w13(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
+
+    def _load_per_channel_weight_scale(
+        self,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+    ):
         # for per channel weight quantization
         if shard_id == "w2":
             expert_data.copy_(loaded_weight)
         elif shard_id in ("w1", "w3"):
-            self._load_w13(shard_id=shard_id,
-                           shard_dim=shard_dim,
-                           loaded_weight=loaded_weight,
-                           expert_data=expert_data,
-                           tp_rank=tp_rank)
-
-    def _load_w13(self,
-                  expert_data: torch.Tensor,
-                  shard_dim: int,
-                  shard_id: str,
-                  loaded_weight: torch.Tensor,
-                  tp_rank: int,
-                  load_full: bool = False):
+            self._load_w13(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
 
+    def _load_w13(
+        self,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        load_full: bool = False,
+    ):
         # Index the loaded weight for tp sharding.
         # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
-        shard_size = expert_data.shape[shard_dim] // 2
+        if self.moe_config.is_act_and_mul:
+            shard_size = expert_data.shape[shard_dim] // 2
+        else:
+            shard_size = expert_data.shape[shard_dim]
         if not load_full:
-            loaded_weight = loaded_weight.narrow(shard_dim if not self.use_nn_moe else ~shard_dim,
-                                                 shard_size * tp_rank,
-                                                 shard_size)
+            loaded_weight = loaded_weight.narrow(
+                shard_dim if not self.use_nn_moe else ~shard_dim, shard_size * tp_rank, shard_size
+            )
         # Narrow parameter and load.
         # w1, gate_proj: Load into first logical weight of w13.
         if shard_id == "w1":
@@ -1364,29 +856,31 @@ class FusedMoE(CustomOp):
         else:
             expert_data.copy_(loaded_weight.T)
 
-    def _load_w2(self,
-                 expert_data: torch.Tensor,
-                 shard_dim: int,
-                 loaded_weight: torch.Tensor,
-                 tp_rank: int,
-                 load_full: bool = False):
-
+    def _load_w2(
+        self,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        load_full: bool = False,
+    ):
         # Index the loaded weight for tp sharding.
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
         if not load_full:
-            loaded_weight = loaded_weight.narrow(shard_dim if not self.use_nn_moe else ~shard_dim,
-                                                 shard_size * tp_rank,
-                                                 shard_size)
+            loaded_weight = loaded_weight.narrow(
+                shard_dim if not self.use_nn_moe else ~shard_dim, shard_size * tp_rank, shard_size
+            )
         # w2, down_proj: Load into only logical weight of w2.
         if not self.use_nn_moe:
             expert_data.copy_(loaded_weight)
         else:
             expert_data.copy_(loaded_weight.T)
 
-    def _load_single_value(self, param: torch.nn.Parameter,
-                           loaded_weight: torch.Tensor, expert_id: int):
+    def _load_single_value(
+        self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
+    ):
         param_data = param.data
 
         # Input scales can be loaded directly and should be equal.
@@ -1395,14 +889,21 @@ class FusedMoE(CustomOp):
         else:
             param_data[expert_id] = loaded_weight.T
 
-    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
-                    shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):
-
+    def _load_g_idx(
+        self,
+        shard_id: str,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+    ):
         if shard_id == "w2":
-            self._load_w2(shard_dim=shard_dim,
-                          loaded_weight=loaded_weight,
-                          expert_data=expert_data,
-                          tp_rank=tp_rank)
+            self._load_w2(
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
         else:
             assert shard_id in ("w1", "w3")
             if not self.use_nn_moe:
@@ -1415,28 +916,54 @@ class FusedMoE(CustomOp):
             return expert_id
         return self.expert_map[expert_id].item()
 
-    @overload
-    def weight_loader(self, param: torch.nn.Parameter,
-                      loaded_weight: torch.Tensor, weight_name: str,
-                      shard_id: str, expert_id: int,
-                      return_success: Literal[False]) -> None:
-        ...
+    def _init_aiter_shared_experts_topK_buffer(
+        self, vllm_config: VllmConfig, dp_size: int
+    ):
+        if self.num_fused_shared_experts > 0:
+            init_aiter_topK_meta_data(
+                n_routed_experts=self.global_num_experts,
+                n_shared_experts=self.num_fused_shared_experts,
+                top_k=self.top_k,
+                tp_rank=self.ep_rank if self.use_ep else self.tp_rank,
+                tp_size=self.ep_size if self.use_ep else self.tp_size,
+                shared_experts_score=1.0,
+                max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens
+                * dp_size,
+                is_EP=self.use_ep,
+            )
+        self.local_num_experts += self.num_fused_shared_experts
 
     @overload
-    def weight_loader(self, param: torch.nn.Parameter,
-                      loaded_weight: torch.Tensor, weight_name: str,
-                      shard_id: str, expert_id: int,
-                      return_success: Literal[True]) -> bool:
-        ...
-
-    def weight_loader(self,
-                      param: torch.nn.Parameter,
-                      loaded_weight: torch.Tensor,
-                      weight_name: str,
-                      shard_id: str,
-                      expert_id: int,
-                      return_success: bool = False) -> Optional[bool]:
+    def weight_loader(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+        return_success: Literal[False],
+    ) -> None: ...
 
+    @overload
+    def weight_loader(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+        return_success: Literal[True],
+    ) -> bool: ...
+
+    def weight_loader(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+        return_success: bool = False,
+    ) -> bool | None:
         if self.quant_config and self.quant_config.get_name() == "mxfp4":
             # (FIXME) for gpt-oss all experts are combined
             if "bias" in weight_name:
@@ -1448,24 +975,36 @@ class FusedMoE(CustomOp):
                 param.data[:, :dim1, :dim2].copy_(loaded_weight)
             return True if return_success else None
 
-        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
-        if expert_id == -1:
+        quant_method_name = self.quant_method.__class__.__name__
+        global_expert_id = expert_id
+        expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id)
+
+        allow_flashinfer = getattr(self.quant_method, "allow_flashinfer", False)
+        moe_backend = getattr(self.quant_method, "flashinfer_moe_backend", None)
+
+        use_global_sf = (
+            allow_flashinfer
+            and is_flashinfer_supporting_global_sf(moe_backend)
+            and "input_scale" in weight_name
+            and quant_method_name == "ModelOptNvFp4FusedMoE"
+        )
+
+        if expert_id == -1 and not use_global_sf:
             # Failed to load this param since it's not local to this rank
             return False if return_success else None
         # Hereafter, `expert_id` is local physical id
 
-        quant_method_name = self.quant_method.__class__.__name__
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
         # against known CompressionFormat enum values that have this quality
         if self.quant_method.__class__.__name__ in (
-                "CompressedTensorsWNA16MarlinMoEMethod",
-                "CompressedTensorsWNA16MoEMethod"):
+            "CompressedTensorsWNA16MarlinMoEMethod",
+            "CompressedTensorsWNA16MoEMethod",
+        ):
             loaded_weight = loaded_weight.t().contiguous()
 
         if shard_id not in ("w1", "w2", "w3"):
-            raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
-                             f"got {shard_id}.")
+            raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.")
 
         # Fetch the dim to shard the parameter/loaded weight
         # based on the shard id. This will be whatever
@@ -1527,43 +1066,51 @@ class FusedMoE(CustomOp):
             # this is needed for compressed-tensors only
             loaded_weight = loaded_weight.to(param.data.device)
 
-            if ("compressed" in quant_method_name.lower()
-                    and param.data[expert_id] != 1
-                    and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
+            if (
+                "compressed" in quant_method_name.lower()
+                and param.data[expert_id] != 1
+                and (param.data[expert_id] - loaded_weight).abs() > 1e-5
+            ):
                 raise ValueError(
                     "input_scales of w1 and w3 of a layer "
                     f"must be equal. But got {param.data[expert_id]} "
-                    f"vs. {loaded_weight}")
+                    f"vs. {loaded_weight}"
+                )
 
-            self._load_single_value(param=param,
-                                    loaded_weight=loaded_weight,
-                                    expert_id=expert_id)
+            self._load_single_value(
+                param=param,
+                loaded_weight=loaded_weight,
+                expert_id=global_expert_id if use_global_sf else expert_id,
+            )
             return True if return_success else None
 
         # Case g_idx
         if "g_idx" in weight_name:
-            self._load_g_idx(shard_dim=0,
-                             shard_id=shard_id,
-                             loaded_weight=loaded_weight,
-                             expert_data=expert_data,
-                             tp_rank=self.tp_rank)
+            self._load_g_idx(
+                shard_dim=0,
+                shard_id=shard_id,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=self.tp_rank,
+            )
             return True if return_success else None
 
         # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
         if "ModelOpt" in quant_method_name:
             # Determine per-tensor weight scale patterns based on variant
             # Use the dedicated method instead of brittle string matching
-            uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern(
-            )
+            uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern()
 
             # Call _load_per_tensor_weight_scale() to load per-tensor (scalar)
             # weights scales.
             # Input scales are always per-tensor.
             # Weight scales: FP4 uses "weight_scale_2" and FP8 uses
             # "weight_scale" for per-tensor scales.
-            is_per_tensor = ("weight_scale_2" in weight_name
-                             if uses_weight_scale_2 else "weight_scale"
-                             in weight_name) or "input_scale" in weight_name
+            is_per_tensor = (
+                "weight_scale_2" in weight_name
+                if uses_weight_scale_2
+                else "weight_scale" in weight_name
+            ) or "input_scale" in weight_name
             if is_per_tensor:
                 self._load_per_tensor_weight_scale(
                     shard_id=shard_id,
@@ -1598,12 +1145,12 @@ class FusedMoE(CustomOp):
                     shard_dim=shard_dim,
                     loaded_weight=loaded_weight,
                     expert_data=expert_data,
-                    tp_rank=self.tp_rank)
+                    tp_rank=self.tp_rank,
+                )
             return True if return_success else None
 
         # Case weight scales, zero_points and offset, weight/input global scales
-        if ("scale" in weight_name or "zero" in weight_name
-                or "offset" in weight_name):
+        if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name:
             # load the weight scales and zp based on the quantization scheme
             # supported weight scales/zp can be found in
             # FusedMoeWeightScaleSupported
@@ -1616,10 +1163,11 @@ class FusedMoE(CustomOp):
                     shard_dim=shard_dim,
                     loaded_weight=loaded_weight,
                     expert_data=expert_data,
-                    tp_rank=self.tp_rank)
+                    tp_rank=self.tp_rank,
+                )
             elif quant_method in [
-                    FusedMoeWeightScaleSupported.GROUP.value,
-                    FusedMoeWeightScaleSupported.BLOCK.value,
+                FusedMoeWeightScaleSupported.GROUP.value,
+                FusedMoeWeightScaleSupported.BLOCK.value,
             ]:
                 self._load_model_weight_or_group_weight_scale(
                     shard_id=shard_id,
@@ -1627,26 +1175,28 @@ class FusedMoE(CustomOp):
                     loaded_weight=loaded_weight,
                     expert_data=expert_data,
                     tp_rank=self.tp_rank,
-                    load_full_w2=getattr(param, "load_full_w2", False))
+                    load_full_w2=getattr(param, "load_full_w2", False),
+                )
             elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
-                self._load_per_tensor_weight_scale(shard_id=shard_id,
-                                                   param=param,
-                                                   loaded_weight=loaded_weight,
-                                                   expert_id=expert_id)
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
             else:
-                WEIGHT_SCALE_SUPPORTED = [
-                    e.value for e in FusedMoeWeightScaleSupported
-                ]
+                WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
                 raise ValueError(
-                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
+                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}"
+                )
             return True if return_success else None
 
         # Case weight_shape
         if "weight_shape" in weight_name:
             # only required by compressed-tensors
-            self._load_single_value(param=param,
-                                    loaded_weight=loaded_weight,
-                                    expert_id=expert_id)
+            self._load_single_value(
+                param=param, loaded_weight=loaded_weight, expert_id=expert_id
+            )
             return True if return_success else None
 
         # Case model weights
@@ -1656,14 +1206,52 @@ class FusedMoE(CustomOp):
                 shard_dim=shard_dim,
                 loaded_weight=loaded_weight,
                 expert_data=expert_data,
-                tp_rank=self.tp_rank)
+                tp_rank=self.tp_rank,
+            )
             return True if return_success else None
 
         return False if return_success else None
 
+    def load_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[str]:
+        if (expert_mapping := self.expert_mapping) is None:
+            raise ValueError(
+                "`self.expert_mapping` must be provided to "
+                "load weights using `self.load_weights`."
+            )
+        for expert_name, loaded_weight in weights:
+            qual_name = f"{self.layer_name}.{expert_name}"
+            for param_name, weight_name, expert_id, shard_id in expert_mapping:
+                if weight_name not in qual_name:
+                    continue
+                weight_name = qual_name.replace(weight_name, param_name)
+                param_name = weight_name.removeprefix(f"{self.layer_name}.")
+                param = getattr(self, param_name)
+                success = self.weight_loader(
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    weight_name=weight_name,
+                    shard_id=shard_id,
+                    expert_id=expert_id,
+                    return_success=True,
+                )
+                if success:
+                    logger.debug(
+                        "Loaded %s for expert %d into %s",
+                        param_name,
+                        expert_id,
+                        self.layer_name,
+                    )
+                    yield param_name
+
     def get_expert_weights(self) -> Iterable[torch.Tensor]:
         weights = list(self.named_parameters())
-        assert all(weight.is_contiguous() for _, weight in weights)
+        assert all(
+            weight.is_contiguous()
+            for name, weight in weights
+            if not name.startswith("_shared_experts.")
+        )
 
         # Filter out the non-expert weights.
         # `e_score_correction_bias` is a bias for each logical expert,
@@ -1673,9 +1261,13 @@ class FusedMoE(CustomOp):
         }
 
         return [
-            weight.view(self.local_num_experts, -1) for name, weight in weights
-            if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size(
-                []) and not name.startswith("_shared_experts.")
+            weight.view(self.local_num_experts, -1)
+            for name, weight in weights
+            if name not in NON_EXPERT_WEIGHTS
+            and weight.shape != torch.Size([])
+            and not name.startswith("_shared_experts.")
+            # exclude parameters from non-expert submodules (e.g. gate/shared)
+            and not name.startswith("_gate.")
         ]
 
     def set_eplb_state(
@@ -1695,10 +1287,42 @@ class FusedMoE(CustomOp):
         self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx]
         self.logical_replica_count = logical_replica_count[moe_layer_idx]
 
-    def ensure_moe_quant_config(self):
+    def ensure_moe_quant_config_init(self):
         if self.quant_method.moe_quant_config is None:
+            # Note: the moe_quant_config can't be constructed until after
+            # weight loading post processing.
             self.quant_method.moe_quant_config = (
-                self.quant_method.get_fused_moe_quant_config(self))
+                self.quant_method.get_fused_moe_quant_config(self)
+            )
+
+    @property
+    def moe_quant_config(self) -> FusedMoEQuantConfig | None:
+        self.ensure_moe_quant_config_init()
+        return self.quant_method.moe_quant_config
+
+    def ensure_dp_chunking_init(self):
+        if not self.use_dp_chunking or self.batched_hidden_states is not None:
+            return
+
+        states_shape: tuple[int, ...]
+        logits_shape: tuple[int, ...]
+
+        moe = self.moe_config
+
+        if self.vllm_config.parallel_config.enable_dbo:
+            states_shape = (2, moe.max_num_tokens, self.hidden_size)
+            logits_shape = (2, moe.max_num_tokens, self.logical_num_experts)
+        else:
+            states_shape = (moe.max_num_tokens, self.hidden_size)
+            logits_shape = (moe.max_num_tokens, self.logical_num_experts)
+
+        self.batched_hidden_states = torch.zeros(
+            states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
+        )
+
+        self.batched_router_logits = torch.zeros(
+            logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
+        )
 
     @staticmethod
     def select_experts(
@@ -1707,28 +1331,29 @@ class FusedMoE(CustomOp):
         top_k: int,
         use_grouped_topk: bool,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        indices_type: Optional[torch.dtype] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
+        indices_type: torch.dtype | None = None,
         enable_eplb: bool = False,
-        expert_map: Optional[torch.Tensor] = None,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-        global_num_experts: Optional[int] = None,
-        zero_expert_num: Optional[int] = None,
-        zero_expert_type: Optional[str] = None,
+        expert_map: torch.Tensor | None = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+        global_num_experts: int | None = None,
+        zero_expert_num: int | None = None,
+        zero_expert_type: str | None = None,
+        num_fused_shared_experts: int = 0,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Route the input hidden states to the top-k experts based on the
         router logits.
 
         Returns:
-                (topk_weights, topk_ids, zero_expert_result) 
+                (topk_weights, topk_ids, zero_expert_result)
                 (tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
                 The weights, expert ids, and zero expert computation result.
 
@@ -1737,7 +1362,9 @@ class FusedMoE(CustomOp):
             plain MoE implementations without redundant experts.
         """
         from vllm.model_executor.layers.fused_moe.fused_moe import (
-            fused_topk, fused_topk_bias)
+            fused_topk,
+            fused_topk_bias,
+        )
 
         # Check if we should use a routing simulation strategy
         routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
@@ -1747,13 +1374,24 @@ class FusedMoE(CustomOp):
                 router_logits=router_logits,
                 strategy_name=routing_strategy,
                 top_k=top_k,
-                indices_type=indices_type)
+                indices_type=indices_type,
+            )
 
         # DeepSeekv2 uses grouped_top_k
-        if use_grouped_topk:
+        elif use_grouped_topk:
             assert topk_group is not None
             assert num_expert_group is not None
-            topk_weights, topk_ids = grouped_topk(
+            if rocm_aiter_ops.is_fused_moe_enabled():
+                if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
+                    assert num_fused_shared_experts == 0
+                grouped_topk_impl = partial(
+                    rocm_aiter_grouped_topk,
+                    num_fused_shared_experts=num_fused_shared_experts,
+                )
+            else:
+                grouped_topk_impl = grouped_topk
+
+            topk_weights, topk_ids = grouped_topk_impl(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
                 topk=top_k,
@@ -1762,7 +1400,8 @@ class FusedMoE(CustomOp):
                 topk_group=topk_group,
                 scoring_func=scoring_func,
                 routed_scaling_factor=routed_scaling_factor,
-                e_score_correction_bias=e_score_correction_bias)
+                e_score_correction_bias=e_score_correction_bias,
+            )
             if indices_type is not None:
                 topk_ids = topk_ids.to(dtype=indices_type)
         elif e_score_correction_bias is not None:
@@ -1788,7 +1427,8 @@ class FusedMoE(CustomOp):
                 hidden_states=hidden_states,
                 gating_output=router_logits,
                 topk=top_k,
-                renormalize=renormalize)
+                renormalize=renormalize,
+            )
             if indices_type is not None:
                 topk_ids = topk_ids.to(dtype=indices_type)
 
@@ -1808,9 +1448,12 @@ class FusedMoE(CustomOp):
         assert topk_ids.dtype == indices_type or indices_type is None
 
         # Compute zero expert result if needed
-        if (zero_expert_num is not None and zero_expert_num > 0
-                and zero_expert_type is not None
-                and global_num_experts is not None):
+        if (
+            zero_expert_num is not None
+            and zero_expert_num > 0
+            and zero_expert_type is not None
+            and global_num_experts is not None
+        ):
             zero_expert_result = zero_experts_compute_triton(
                 expert_indices=topk_ids,
                 expert_scales=topk_weights,
@@ -1835,16 +1478,17 @@ class FusedMoE(CustomOp):
         Therefore it is required that we reduce the shared_experts output
         early.
         """
-        return (self.use_pplx_kernels or self.use_deepep_ht_kernels
-                or self.use_deepep_ll_kernels)
+        assert self.quant_method is not None
+        return (
+            isinstance(self.quant_method, FusedMoEModularMethod)
+            and self.quant_method.fused_experts.output_is_reduced()
+        )
 
-    def maybe_all_reduce_tensor_model_parallel(
-            self, final_hidden_states: torch.Tensor):
+    def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor):
         """
-        The pplx combine kernel reduces across GPU ranks by default.
+        Some combine kernels reduce across GPU ranks by default.
         """
-        if (self.use_pplx_kernels or self.use_deepep_ht_kernels
-                or self.use_deepep_ll_kernels):
+        if self.must_reduce_shared_expert_outputs():
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
@@ -1853,13 +1497,25 @@ class FusedMoE(CustomOp):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         og_hidden_states = hidden_states.shape[-1]
         if self.hidden_size != og_hidden_states:
-            hidden_states = F.pad(hidden_states,
-                                  (0, self.hidden_size - og_hidden_states),
-                                  mode='constant',
-                                  value=0.0)
+            hidden_states = F.pad(
+                hidden_states,
+                (0, self.hidden_size - og_hidden_states),
+                mode="constant",
+                value=0.0,
+            )
+
+        def reduce_output(states: torch.Tensor) -> torch.Tensor:
+            if (
+                not self.is_sequence_parallel
+                and not self.use_dp_chunking
+                and self.reduce_results
+                and (self.tp_size > 1 or self.ep_size > 1)
+            ):
+                states = self.maybe_all_reduce_tensor_model_parallel(states)
+            return states
 
         if self.shared_experts is None:
             if current_platform.is_tpu():
@@ -1869,48 +1525,56 @@ class FusedMoE(CustomOp):
                 assert not isinstance(fused_output, tuple)
             else:
                 fused_output = torch.ops.vllm.moe_forward(
-                    hidden_states, router_logits, self.layer_name)
-            return fused_output[..., :og_hidden_states]
+                    hidden_states, router_logits, self.layer_name
+                )
+            if self.zero_expert_num is not None and self.zero_expert_num > 0:
+                assert isinstance(fused_output, tuple)
+                fused_output, zero_expert_result = fused_output
+                return (reduce_output(fused_output) + zero_expert_result)[
+                    ..., :og_hidden_states
+                ]
+            else:
+                return reduce_output(fused_output)[..., :og_hidden_states]
         else:
             if current_platform.is_tpu():
                 # TODO: Once the OOM issue for the TPU backend is resolved, we
                 # will switch to using the moe_forward custom op.
                 shared_output, fused_output = self.forward_impl(
-                    hidden_states, router_logits)
+                    hidden_states, router_logits
+                )
             else:
                 shared_output, fused_output = torch.ops.vllm.moe_forward_shared(
-                    hidden_states, router_logits, self.layer_name)
-            return (shared_output[..., :og_hidden_states],
-                    fused_output[..., :og_hidden_states])
+                    hidden_states, router_logits, self.layer_name
+                )
+            return (
+                reduce_output(shared_output)[..., :og_hidden_states],
+                reduce_output(fused_output)[..., :og_hidden_states],
+            )
 
     def forward_cuda(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         return self.forward_native(hidden_states, router_logits)
 
     def forward_impl_chunked(
         self,
         full_hidden_states: torch.Tensor,
         full_router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        has_separate_shared_experts: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.batched_hidden_states is not None
         assert self.batched_router_logits is not None
         assert self.batched_hidden_states.dtype == full_hidden_states.dtype
         assert self.batched_router_logits.dtype == full_router_logits.dtype
         # Check size compatibility.
-        assert (
-            self.batched_hidden_states.size(-1) == full_hidden_states.size(-1))
-        assert (
-            self.batched_router_logits.size(-1) == full_router_logits.size(-1))
-
-        self.ensure_moe_quant_config()
+        assert self.batched_hidden_states.size(-1) == full_hidden_states.size(-1)
+        assert self.batched_router_logits.size(-1) == full_router_logits.size(-1)
 
         full_fused_final_hidden_states = torch.empty_like(full_hidden_states)
         if self.shared_experts is not None:
-            full_shared_final_hidden_states = torch.empty_like(
-                full_hidden_states)
+            full_shared_final_hidden_states = torch.empty_like(full_hidden_states)
 
         def process_chunk(chunk_start, chunk_end, skip_result_store=False):
             chunk_size = chunk_end - chunk_start
@@ -1924,22 +1588,22 @@ class FusedMoE(CustomOp):
             if self.batched_hidden_states.dim() == 3:
                 assert self.batched_router_logits.dim() == 3
                 batch_buffer_idx = dbo_current_ubatch_id()
-                batched_hidden_states = self.batched_hidden_states[
-                    batch_buffer_idx, :]
-                batched_router_logits = self.batched_router_logits[
-                    batch_buffer_idx, :]
+                batched_hidden_states = self.batched_hidden_states[batch_buffer_idx, :]
+                batched_router_logits = self.batched_router_logits[batch_buffer_idx, :]
             else:
                 batched_hidden_states = self.batched_hidden_states
                 batched_router_logits = self.batched_router_logits
 
-            assert (batched_hidden_states.size(0)  # type: ignore
-                    >= chunk_size)
-            assert (batched_router_logits.size(0)  # type: ignore 
-                    >= chunk_size)
-            staged_hidden_states = batched_hidden_states[:
-                                                         chunk_size, :]  # type: ignore
-            staged_router_logits = batched_router_logits[:
-                                                         chunk_size, :]  # type: ignore
+            assert (
+                batched_hidden_states.size(0)  # type: ignore
+                >= chunk_size
+            )
+            assert (
+                batched_router_logits.size(0)  # type: ignore
+                >= chunk_size
+            )
+            staged_hidden_states = batched_hidden_states[:chunk_size, :]  # type: ignore
+            staged_router_logits = batched_router_logits[:chunk_size, :]  # type: ignore
             staged_hidden_states.copy_(hidden_states, non_blocking=True)
             staged_router_logits.copy_(router_logits, non_blocking=True)
 
@@ -1952,7 +1616,9 @@ class FusedMoE(CustomOp):
                 renormalize=self.renormalize,
                 use_grouped_topk=self.use_grouped_topk,
                 global_num_experts=self.global_num_experts,
-                expert_map=self.expert_map,
+                expert_map=self.expert_map
+                if not self.rocm_aiter_fmoe_enabled
+                else self.expert_mask,
                 topk_group=self.topk_group,
                 num_expert_group=self.num_expert_group,
                 custom_routing_function=self.custom_routing_function,
@@ -1966,8 +1632,16 @@ class FusedMoE(CustomOp):
                 logical_replica_count=self.logical_replica_count,
             )
 
-            assert self.shared_experts is None or isinstance(
-                final_hidden_states, tuple)
+            if has_separate_shared_experts:
+                assert not isinstance(final_hidden_states, tuple)
+                assert self.shared_experts is not None
+
+                shared_output = self.shared_experts(staged_hidden_states)
+
+                final_hidden_states = (
+                    shared_output,
+                    final_hidden_states,
+                )
 
             if self.zero_expert_num is not None and self.zero_expert_num > 0:
                 assert isinstance(final_hidden_states, tuple)
@@ -1978,16 +1652,16 @@ class FusedMoE(CustomOp):
 
             if not skip_result_store:
                 if self.shared_experts is None:
-                    full_fused_final_hidden_states[
-                        chunk_start:chunk_end, :].copy_(final_hidden_states,
-                                                        non_blocking=True)
+                    full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                        final_hidden_states, non_blocking=True
+                    )
                 else:
-                    full_shared_final_hidden_states[
-                        chunk_start:chunk_end, :].copy_(final_hidden_states[0],
-                                                        non_blocking=True)
-                    full_fused_final_hidden_states[
-                        chunk_start:chunk_end, :].copy_(final_hidden_states[1],
-                                                        non_blocking=True)
+                    full_shared_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                        final_hidden_states[0], non_blocking=True
+                    )
+                    full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                        final_hidden_states[1], non_blocking=True
+                    )
 
         ctx = get_forward_context()
         # flashinfer_cutlass_kernels can handle: optional DP + TP/EP
@@ -1997,84 +1671,123 @@ class FusedMoE(CustomOp):
         # If the input to the MoE is sequence parallel then divide by sp_size
         # to find the maximum number of tokens for any individual dispatcher.
         if self.is_sequence_parallel:
-            max_tokens_across_dispatchers = cdiv(max_tokens_across_dispatchers,
-                                                 self.sp_size)
+            max_tokens_across_dispatchers = cdiv(
+                max_tokens_across_dispatchers, self.sp_size
+            )
 
         num_tokens = full_hidden_states.size(0)
         for chunk_idx, chunk_start_ in enumerate(
-                range(0, max_tokens_across_dispatchers,
-                      moe_dp_chunk_size_per_rank)):
+            range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank)
+        ):
             chunk_start = chunk_start_
-            chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
-                            max_tokens_across_dispatchers)
+            chunk_end = min(
+                chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers
+            )
             # clamp start and end
             chunk_start = min(chunk_start, num_tokens - 1)
             chunk_end = min(chunk_end, num_tokens)
-            with ctx.dp_metadata.chunked_sizes(self.sp_size,
-                                               moe_dp_chunk_size_per_rank,
-                                               chunk_idx):
-                process_chunk(chunk_start,
-                              chunk_end,
-                              skip_result_store=chunk_start_ >= num_tokens)
+            with ctx.dp_metadata.chunked_sizes(
+                self.sp_size, moe_dp_chunk_size_per_rank, chunk_idx
+            ):
+                process_chunk(
+                    chunk_start, chunk_end, skip_result_store=chunk_start_ >= num_tokens
+                )
 
         if self.shared_experts is None:
             return full_fused_final_hidden_states
         else:
-            return (full_shared_final_hidden_states,
-                    full_fused_final_hidden_states)
+            return (full_shared_final_hidden_states, full_fused_final_hidden_states)
 
     def forward_impl(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.quant_method is not None
 
-        self.ensure_moe_quant_config()
-
-        # Route to the chunked forward path using the FlashInfer Cutlass kernel
-        # only when data parallelism (DP) is enabled.
-        _use_flashinfer_cutlass_kernels = (self.dp_size > 1 and
-                                           self.use_flashinfer_cutlass_kernels)
-
-        if (self.moe_parallel_config.use_pplx_kernels
-                or self.moe_parallel_config.use_deepep_ll_kernels
-                or _use_flashinfer_cutlass_kernels):
-            return self.forward_impl_chunked(hidden_states, router_logits)
-
-        do_naive_dispatch_combine: bool = (
-            self.dp_size > 1
-            and not self.moe_parallel_config.use_deepep_ht_kernels
-            and not self.moe_config.use_flashinfer_cutlass_kernels)
-
-        # If there are shared experts but we are not using a modular kernel, the
-        # shared experts must be called here
-        if (not isinstance(self.quant_method.fused_experts,
-                           FusedMoEModularKernel)
-                and self.shared_experts is not None):
-            shared_output = self.shared_experts(hidden_states)
-        else:
-            shared_output = None
+        self.ensure_moe_quant_config_init()
+        self.ensure_dp_chunking_init()
+
+        has_separate_shared_experts = (
+            not isinstance(self.quant_method, FusedMoEModularMethod)
+            and self.shared_experts is not None
+        )
+
+        use_chunked_impl = self.use_dp_chunking
+
+        use_shared_experts_stream = (
+            has_separate_shared_experts
+            and not use_chunked_impl
+            and self.shared_experts_stream is not None
+            and (
+                hidden_states.shape[0]
+                <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD
+            )
+        )
+
+        if use_shared_experts_stream:
+            assert self.shared_experts_stream is not None
+
+            # Clone BEFORE switching streams to avoid race condition
+            # where routed_expert kernel may mutate hidden_states.
+            hidden_states_clone = hidden_states.clone()
+
+            # Record that the clone will be used by shared_experts_stream
+            # to avoid gc issue from deallocation of hidden_states_clone
+            # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
+            # NOTE: We dont need shared_output.record_stream(current_stream())
+            # because we synch the streams before using shared_output.
+            hidden_states_clone.record_stream(self.shared_experts_stream)
+
+            # Mark sync start point for the separate shared experts
+            # stream here since we want to run in parallel with the
+            # router/gate (next op below)
+            assert self.shared_experts_stream is not None
+            self.shared_experts_stream.wait_stream(current_stream())
+
+        # If router/gate provided, then apply it here.
+        # (Note: This code runs only when "overlapped mode" is on to allow
+        #        parallel execution of shared experts with the FusedMoE via
+        #        separate cuda stream)
+        if self.gate is not None:
+            router_logits, _ = self.gate(hidden_states)
+
+        if use_chunked_impl:
+            return self.forward_impl_chunked(
+                hidden_states, router_logits, has_separate_shared_experts
+            )
+
+        do_naive_dispatch_combine: bool = self.dp_size > 1 and not isinstance(
+            self.quant_method, FusedMoEModularMethod
+        )
 
         ctx = get_forward_context()
-        sp_ctx = ctx.dp_metadata.sp_local_sizes(
-            self.sp_size) if ctx.dp_metadata else nullcontext()
+        sp_ctx = (
+            ctx.dp_metadata.sp_local_sizes(self.sp_size)
+            if ctx.dp_metadata
+            else nullcontext()
+        )
 
         with sp_ctx:
             if do_naive_dispatch_combine:
-                hidden_states, router_logits = get_ep_group().dispatch(
-                    hidden_states, router_logits, self.is_sequence_parallel)
+                hidden_states_combined, router_logits = get_ep_group().dispatch(
+                    hidden_states, router_logits, self.is_sequence_parallel
+                )
 
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
-                x=hidden_states,
+                x=hidden_states_combined
+                if do_naive_dispatch_combine
+                else hidden_states,
                 router_logits=router_logits,
                 top_k=self.top_k,
                 renormalize=self.renormalize,
                 use_grouped_topk=self.use_grouped_topk,
                 global_num_experts=self.global_num_experts,
-                expert_map=self.expert_map,
+                expert_map=self.expert_map
+                if not self.rocm_aiter_fmoe_enabled
+                else self.expert_mask,
                 topk_group=self.topk_group,
                 num_expert_group=self.num_expert_group,
                 custom_routing_function=self.custom_routing_function,
@@ -2089,9 +1802,23 @@ class FusedMoE(CustomOp):
                 logical_replica_count=self.logical_replica_count,
             )
 
-            if shared_output is not None:
-                assert not isinstance(final_hidden_states, tuple)
+            if has_separate_shared_experts:
                 assert self.shared_experts is not None
+
+                if use_shared_experts_stream:
+                    # Run shared experts in parallel on a separate stream
+                    # NOTE: We start the separate stream here and mark the
+                    # sync end point immediately after it is done. This is
+                    # important to avoid excessive stream allocations by the cuda
+                    # graph replay later.
+                    with torch.cuda.stream(self.shared_experts_stream):
+                        # Note that hidden_states clone() is necessary here to avoid
+                        # conflict with the main stream
+                        shared_output = self.shared_experts(hidden_states_clone)
+                    current_stream().wait_stream(self.shared_experts_stream)
+                else:
+                    shared_output = self.shared_experts(hidden_states)
+
                 final_hidden_states = (
                     shared_output,
                     final_hidden_states,
@@ -2100,55 +1827,54 @@ class FusedMoE(CustomOp):
                 assert isinstance(final_hidden_states, tuple)
                 final_hidden_states, zero_expert_result = final_hidden_states
 
-            def reduce_output(states: torch.Tensor,
-                              do_combine: bool = True) -> torch.Tensor:
-                if do_naive_dispatch_combine and do_combine:
-                    states = get_ep_group().combine(states,
-                                                    self.is_sequence_parallel)
-
-                if (not self.is_sequence_parallel and self.reduce_results
-                        and (self.tp_size > 1 or self.ep_size > 1)):
-                    states = self.maybe_all_reduce_tensor_model_parallel(
-                        states)
-
+            def combine_output(states: torch.Tensor) -> torch.Tensor:
+                if do_naive_dispatch_combine:
+                    states = get_ep_group().combine(states, self.is_sequence_parallel)
                 return states
 
             if self.shared_experts is not None:
                 return (
-                    reduce_output(final_hidden_states[0], do_combine=False),
-                    reduce_output(final_hidden_states[1]),
+                    final_hidden_states[0],
+                    combine_output(final_hidden_states[1]),
                 )
             elif self.zero_expert_num is not None and self.zero_expert_num > 0:
                 assert isinstance(final_hidden_states, torch.Tensor)
-                return reduce_output(final_hidden_states) + zero_expert_result
+                return (combine_output(final_hidden_states), zero_expert_result)
             else:
-                return reduce_output(final_hidden_states)
+                return combine_output(final_hidden_states)
 
     @classmethod
     def make_expert_params_mapping(
-            cls,
-            ckpt_gate_proj_name: str,
-            ckpt_down_proj_name: str,
-            ckpt_up_proj_name: str,
-            num_experts: int,
-            num_redundant_experts: int = 0) -> list[tuple[str, str, int, str]]:
-
+        cls,
+        ckpt_gate_proj_name: str,
+        ckpt_down_proj_name: str,
+        ckpt_up_proj_name: str,
+        num_experts: int,
+        num_redundant_experts: int = 0,
+    ) -> list[tuple[str, str, int, str]]:
         num_physical_experts = num_experts + num_redundant_experts
 
         # In the returned mapping:
         # - `expert_id` is the physical expert id
         # - `weight_name` contains the weight name of the logical expert
         # So that we should map the expert id to logical in `weight_name`
-        physical_to_logical_map = \
+        physical_to_logical_map = (
             EplbState.build_initial_global_physical_to_logical_map(
-            num_experts, num_redundant_experts)
+                num_experts, num_redundant_experts
+            )
+        )
 
         return [
             # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_" if weight_name
-             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
-             f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
-             expert_id, shard_id) for expert_id in range(num_physical_experts)
+            (
+                "experts.w13_"
+                if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
+                else "experts.w2_",
+                f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_physical_experts)
             for shard_id, weight_name in [
                 ("w1", ckpt_gate_proj_name),
                 ("w2", ckpt_down_proj_name),
@@ -2157,7 +1883,6 @@ class FusedMoE(CustomOp):
         ]
 
     def extra_repr(self) -> str:
-
         s = (
             f"global_num_experts={self.global_num_experts}, "
             f"local_num_experts={self.local_num_experts}, "
@@ -2167,7 +1892,8 @@ class FusedMoE(CustomOp):
             f"ep_size={self.ep_size}, "
             f"reduce_results={self.reduce_results}, "
             f"renormalize={self.renormalize}, "
-            f"use_grouped_topk={self.use_grouped_topk}")
+            f"use_grouped_topk={self.use_grouped_topk}"
+        )
 
         if self.use_grouped_topk:
             s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
@@ -2201,7 +1927,7 @@ direct_register_custom_op(
     op_func=moe_forward,
     mutates_args=["hidden_states"],
     fake_impl=moe_forward_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(torch.Tag.needs_fixed_stride_order,),
 )
 
 
@@ -2231,7 +1957,7 @@ direct_register_custom_op(
     op_func=moe_forward_shared,
     mutates_args=["hidden_states"],
     fake_impl=moe_forward_shared_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(torch.Tag.needs_fixed_stride_order,),
 )
 
 # Mark the FusedMoE weight_loader as supporting MoE-specific parameters
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index b6afc8651e36d39481b6d179d6797cca8aadafbc..093affe51f503b6797d49a83d74dfcfe8ce98a0b 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -1,21 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum
 from math import prod
-from typing import Callable, Optional, Union, final
+from typing import final
 
 import torch
 
 import vllm.envs as envs
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.utils import (  # yapf: disable
-    _resize_cache, count_expert_num_tokens)
-from vllm.utils import cdiv
-from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled,
-                                      dbo_maybe_run_recv_hook,
-                                      dbo_register_recv_hook, dbo_yield)
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache,
+    count_expert_num_tokens,
+    disable_inplace,
+)
+from vllm.utils.math_utils import cdiv
+from vllm.v1.worker.ubatching import (
+    dbo_current_ubatch_id,
+    dbo_enabled,
+    dbo_maybe_run_recv_hook,
+    dbo_register_recv_hook,
+    dbo_yield,
+)
 
 #
 # This file defines a set of base classes used to make MoE kernels more modular.
@@ -55,75 +63,38 @@ from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled,
 #
 
 
-def _moe_problem_size(
-    a1: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_ids: torch.Tensor,
-) -> tuple[int, int, int, int, int]:
-    """
-    Extract the MoE problem size from the given tensor arguments:
-    - a: The hidden states, input to the MoE layer.
-    - w1: The first set of expert weights.
-    - w2: The second set of expert weights.
-    - topk_ids: The topk ids.
-
-    Note: extracting the problem shape from the weight and activation tensors is
-    not obvious.  It needs to be done this way specifically due to subtle issues
-    with particular kernels, e.g. the int4 kernels divide the trailing dimension
-    by two, so it's not "correct" to extract N or K from the trailing dimension
-    of w1 or w2.  Similarly, some kernels transpose the weights, so this needs
-    to be kept in mind.
-    """
-    assert w1.dim() == 3 and w2.dim() == 3
-    E, N, _ = w1.size()
-    K = a1.size(-1)
-
-    if a1.dim() == 2:
-        # Make sure we are using the correct a1 (pre-permute).
-        assert topk_ids.size(0) == a1.size(0), \
-            f"{topk_ids.size(0)} != {a1.size(0)}"
-        M = a1.size(0)
-    else:
-        assert a1.dim() == 3
-        assert a1.size(0) == E, f"{a1.size(0)} == {E}"
-        M = a1.size(1)  # This is max_num_tokens
-
-    assert topk_ids.dim() == 2
-    topk = topk_ids.size(1)
-
-    return E, M, N, K, topk
-
-
 class FusedMoEActivationFormat(Enum):
     """
     The standard activation format (num_tokens, hidden dim).
     """
-    Standard = "standard",
+
+    Standard = ("standard",)
     """
     The batched experts format (num experts, max tokens per expert, hidden dim)
     """
-    BatchedExperts = "batched_experts",
+    BatchedExperts = ("batched_experts",)
 
 
 @dataclass
 class ExpertTokensMetadata:
     """
-  Metadata regarding expert-token routing.
-  """
+    Metadata regarding expert-token routing.
+    """
+
     expert_num_tokens: torch.Tensor
-    expert_num_tokens_cpu: Optional[torch.Tensor]
+    expert_num_tokens_cpu: torch.Tensor | None
 
     @staticmethod
-    def make_from_list(expert_num_tokens_list: list[int],
-                       device: str) -> "ExpertTokensMetadata":
-        expert_num_tokens_cpu = torch.tensor(expert_num_tokens_list,
-                                             device="cpu",
-                                             dtype=torch.int32)
+    def make_from_list(
+        expert_num_tokens_list: list[int], device: str
+    ) -> "ExpertTokensMetadata":
+        expert_num_tokens_cpu = torch.tensor(
+            expert_num_tokens_list, device="cpu", dtype=torch.int32
+        )
         return ExpertTokensMetadata(
-            expert_num_tokens=expert_num_tokens_cpu.to(device,
-                                                       non_blocking=True),
-            expert_num_tokens_cpu=expert_num_tokens_cpu)
+            expert_num_tokens=expert_num_tokens_cpu.to(device, non_blocking=True),
+            expert_num_tokens_cpu=expert_num_tokens_cpu,
+        )
 
 
 class TopKWeightAndReduce(ABC):
@@ -132,10 +103,14 @@ class TopKWeightAndReduce(ABC):
     """
 
     @abstractmethod
-    def apply(self, output: Optional[torch.Tensor],
-              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
-              topk_ids: torch.Tensor,
-              apply_router_weight_on_input: bool) -> torch.Tensor:
+    def apply(
+        self,
+        output: torch.Tensor | None,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> torch.Tensor:
         """
         Apply topk_weights to the fused_experts_outputs and/or reduce.
         If an output tensor is not passed, it will be created in the
@@ -158,10 +133,10 @@ class TopKWeightAndReduce(ABC):
 #
 PrepareResultType = tuple[
     torch.Tensor,
-    Optional[torch.Tensor],
-    Optional[ExpertTokensMetadata],
-    Optional[torch.Tensor],
-    Optional[torch.Tensor],
+    torch.Tensor | None,
+    ExpertTokensMetadata | None,
+    torch.Tensor | None,
+    torch.Tensor | None,
 ]
 
 ReceiverType = Callable[[], PrepareResultType]
@@ -174,6 +149,15 @@ class FusedMoEPrepareAndFinalize(ABC):
     described above.
     """
 
+    def post_init_setup(self, fused_experts: "FusedMoEPermuteExpertsUnpermute"):
+        """
+        Initialize FusedMoEPrepareAndFinalize settings that depend on
+        FusedMoEPermuteExpertsUnpermute experts object.
+        The FusedMoEPrepareAndFinalize implementations that have such
+        dependencies may choose to override this function.
+        """
+        return
+
     @abstractmethod
     def prepare(
         self,
@@ -181,7 +165,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> PrepareResultType:
@@ -221,10 +205,10 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> Union[tuple[Callable, ReceiverType], ReceiverType]:
+    ) -> tuple[Callable, ReceiverType] | ReceiverType:
         """
         Perform any quantization (and/or) dispatching needed for this kernel
         but do not wait for results from other workers.
@@ -240,16 +224,16 @@ class FusedMoEPrepareAndFinalize(ABC):
         - apply_router_weight_on_input: When True, apply the weights to the
           activations, before quantization + dispatching.
 
-        Returns a callback or a hook callback pair that when invoked waits for 
-        results from other workers and has the same return signature as 
+        Returns a callback or a hook callback pair that when invoked waits for
+        results from other workers and has the same return signature as
         `prepare`, if a hook is returned this is more lightweight check that
-        the recv is complete without doing extra work (used by DBO, will be 
+        the recv is complete without doing extra work (used by DBO, will be
         refactored in the very near future)
-        
+
         e.g.
 
         ret = obj.prepare_async(...)
-        
+
         if isinstance(ret, tuple):
             hook, receiver = ret
             hook()
@@ -296,7 +280,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: TopKWeightAndReduce,
-    ) -> Union[tuple[Callable, Callable], Callable]:
+    ) -> tuple[Callable, Callable] | Callable:
         """
         Perform any combine plus apply weights and perform a reduction on the
         fused experts output but do not wait for results from other workers.
@@ -310,10 +294,10 @@ class FusedMoEPrepareAndFinalize(ABC):
         - weight_and_reduce_impl: An optional TopKWeightAndReduce
           implementation.
 
-        Returns a callback or a hook callback pair that when invoked waits for 
-        results from other workers and has the same return signature as 
+        Returns a callback or a hook callback pair that when invoked waits for
+        results from other workers and has the same return signature as
         `finalize`, if a hook is returned this is more lightweight check that
-        the recv is complete without doing extra work (used by DBO, will be 
+        the recv is complete without doing extra work (used by DBO, will be
         refactored in the very near future)
 
         ret = obj.finalize_async(output, ...)
@@ -340,7 +324,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         """
         The PrepareFinalize All2All implementations generally constrain the
         dtype of the topk_ids they support. This function returns the
@@ -350,7 +334,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         """
         Some PrepareFinalize All2All implementations are batched. Meaning,
         they can process only as set of tokens at a time. This
@@ -364,6 +348,14 @@ class FusedMoEPrepareAndFinalize(ABC):
     def num_dispatchers(self) -> int:
         raise NotImplementedError
 
+    @abstractmethod
+    def output_is_reduced(self) -> bool:
+        """
+        Indicates whether or not the output of finalize is reduced across all
+        ranks.
+        """
+        raise NotImplementedError
+
 
 # TODO: add supported activations method (return string)
 class FusedMoEPermuteExpertsUnpermute(ABC):
@@ -384,24 +376,68 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
     @property
     @abstractmethod
     def activation_formats(
-            self) -> tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]:
+        self,
+    ) -> tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]:
         """
         A property which is a tuple of the input and output activation formats
         for the 'apply' method.
         """
         raise NotImplementedError
 
+    def moe_problem_size(
+        self,
+        a1: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+    ) -> tuple[int, int, int, int, int]:
+        """
+        Extract the MoE problem size from the given tensor arguments:
+        - a: The hidden states, input to the MoE layer.
+        - w1: The first set of expert weights.
+        - w2: The second set of expert weights.
+        - topk_ids: The topk ids.
+
+        Note: extracting the problem shape from the weight and activation
+        tensors is not obvious.  It needs to be done this way specifically
+        due to subtle issues with particular kernels, e.g. the int4 kernels
+        divide the trailing dimension by two, so it's not "correct" to
+        extract N or K from the trailing dimension of w1 or w2.  Similarly,
+        some kernels transpose the weights, so this needs to be kept in mind.
+
+        Note: This implementation covers most cases. However, if experts
+        require a specialized implementation, like MarlinExperts, they are free
+        to override this function.
+        """
+        assert w1.dim() == 3 and w2.dim() == 3
+        E, N, _ = w1.size()
+        K = a1.size(-1)
+
+        if a1.dim() == 2:
+            # Make sure we are using the correct a1 (pre-permute).
+            assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
+            M = a1.size(0)
+        else:
+            assert a1.dim() == 3
+            assert a1.size(0) == E, f"{a1.size(0)} == {E}"
+            M = a1.size(1)  # This is max_num_tokens
+
+        assert topk_ids.dim() == 2
+        topk = topk_ids.size(1)
+
+        return E, M, N, K, topk
+
     #
     # Various helpers for accessing quantization parameters from the
     # quant_config.
     #
 
     @property
-    def quant_dtype(self) -> Optional[torch.dtype]:
+    def quant_dtype(self) -> torch.dtype | None:
         return self.quant_config.quant_dtype
 
     @property
-    def block_shape(self) -> Optional[list[int]]:
+    def block_shape(self) -> list[int] | None:
         return self.quant_config.block_shape
 
     @property
@@ -413,51 +449,51 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         return self.quant_config.per_out_ch_quant
 
     @property
-    def a1_scale(self) -> Optional[torch.Tensor]:
+    def a1_scale(self) -> torch.Tensor | None:
         return self.quant_config.a1_scale
 
     @property
-    def a2_scale(self) -> Optional[torch.Tensor]:
+    def a2_scale(self) -> torch.Tensor | None:
         return self.quant_config.a2_scale
 
     @property
-    def a1_gscale(self) -> Optional[torch.Tensor]:
+    def a1_gscale(self) -> torch.Tensor | None:
         return self.quant_config.a1_gscale
 
     @property
-    def a2_gscale(self) -> Optional[torch.Tensor]:
+    def a2_gscale(self) -> torch.Tensor | None:
         return self.quant_config.a2_gscale
 
     @property
-    def w1_scale(self) -> Optional[torch.Tensor]:
+    def w1_scale(self) -> torch.Tensor | None:
         return self.quant_config.w1_scale
 
     @property
-    def w2_scale(self) -> Optional[torch.Tensor]:
+    def w2_scale(self) -> torch.Tensor | None:
         return self.quant_config.w2_scale
 
     @property
-    def w1_zp(self) -> Optional[torch.Tensor]:
+    def w1_zp(self) -> torch.Tensor | None:
         return self.quant_config.w1_zp
 
     @property
-    def w2_zp(self) -> Optional[torch.Tensor]:
+    def w2_zp(self) -> torch.Tensor | None:
         return self.quant_config.w2_zp
 
     @property
-    def w1_bias(self) -> Optional[torch.Tensor]:
+    def w1_bias(self) -> torch.Tensor | None:
         return self.quant_config.w1_bias
 
     @property
-    def w2_bias(self) -> Optional[torch.Tensor]:
+    def w2_bias(self) -> torch.Tensor | None:
         return self.quant_config.w2_bias
 
     @property
-    def g1_alphas(self) -> Optional[torch.Tensor]:
+    def g1_alphas(self) -> torch.Tensor | None:
         return self.quant_config.g1_alphas
 
     @property
-    def g2_alphas(self) -> Optional[torch.Tensor]:
+    def g2_alphas(self) -> torch.Tensor | None:
         return self.quant_config.g2_alphas
 
     # TODO (bnell): make this return a CHUNK_SIZE or None instead?
@@ -476,50 +512,77 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         """
         raise NotImplementedError
 
+    def supports_packed_ue8m0_act_scales(self) -> bool:
+        """
+        A flag indicating whether or not this class can process packed ue8m0
+        activation scales.
+        """
+        return False
+
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        """
+        Workspace type: The dtype to use for the workspace tensors.
+        """
+        return act_dtype
+
     @abstractmethod
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Compute the shapes for the temporary and final outputs of the two gemms
         and activation in the fused expert function.  Since the gemms are
         independent, the workspace for the first gemm can be shared with the
         workspace for the last gemm.
 
+        Inputs:
+        - M: number of tokens.
+        - N: Row (or column) dimension of expert weights.
+        - K: hidden dimension
+        - topk: The number of top-k experts to select.
+        - global_num_experts: global number of experts.
+        - local_num_experts: local number of experts due to DP/EP.
+        - expert_tokens_meta: number of tokens per expert metadata for batched
+                              format.
+
         Returns a tuple of:
         - workspace13 shape tuple: must be large enough to hold the
           result of either expert gemm.
         - workspace2 shape tuple: must be large enough to hold the
           result of the activation function.
         - output shape tuple: must be exact size of the final gemm output.
-        - Workspace type: The dtype to use for the workspace tensors.
-        - Note: in order for activation chunking to work, the first dimension
-          of each tuple must be the number of tokens.
+        - Note: workspace shapes can be 0 if the workspace is not needed.
+          But in order for activation chunking to work, the first dimension
+          of each tuple must be the number of tokens when the shape is
+          not 0.
         """
         raise NotImplementedError
 
-    def activation(self, activation: str, output: torch.Tensor,
-                   input: torch.Tensor) -> None:
+    def activation(
+        self, activation: str, output: torch.Tensor, input: torch.Tensor
+    ) -> None:
         assert output.size(-1) * 2 == input.size(-1)
         if activation == "silu":
             torch.ops._C.silu_and_mul(output, input)
         elif activation == "gelu":
             torch.ops._C.gelu_and_mul(output, input)
+        elif activation == "swigluoai":
+            # alpha = 1.702, limit = 7.0
+            torch.ops._C.swigluoai_and_mul(output, input)
         else:
             raise ValueError(f"Unsupported FusedMoe activation: {activation}")
 
     def enable_chunking(self):
-        return envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and \
-          self.supports_chunking()
+        return (
+            envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and self.supports_chunking()
+        )
 
     def finalize_weight_and_reduce_impl(self) -> TopKWeightAndReduce:
         raise NotImplementedError
@@ -535,14 +598,14 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
+        expert_tokens_meta: ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
-    ):
+    ) -> None:
         """
         This function computes the intermediate result of a Mixture of Experts
         (MoE) layer using two sets of weights, w1 and w2.
@@ -581,8 +644,9 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         raise NotImplementedError
 
 
-def _chunk_scales(scales: Optional[torch.Tensor], start: int,
-                  end: int) -> Optional[torch.Tensor]:
+def _slice_scales(
+    scales: torch.Tensor | None, start: int, end: int
+) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
             return scales
@@ -592,17 +656,20 @@ def _chunk_scales(scales: Optional[torch.Tensor], start: int,
 
 
 class SharedResizableBuffer:
-
     def __init__(self):
         self.buffer = None
 
-    def get(self, shape: tuple[int, ...], device: torch.device,
-            dtype: torch.dtype):
-        if shape == () or shape is None:
-            return None
+    def get(
+        self, shape: tuple[int, ...], device: torch.device, dtype: torch.dtype
+    ) -> torch.Tensor:
+        assert shape != ()
         shape_numel = prod(shape)
-        if (self.buffer is None or self.buffer.numel() < shape_numel
-                or self.buffer.device != device or self.buffer.dtype != dtype):
+        if (
+            self.buffer is None
+            or self.buffer.numel() < shape_numel
+            or self.buffer.device != device
+            or self.buffer.dtype != dtype
+        ):
             self.buffer = torch.empty(shape_numel, device=device, dtype=dtype)
         return self.buffer[:shape_numel].view(*shape)
 
@@ -622,7 +689,6 @@ class FusedMoEModularKernel(torch.nn.Module):
     """
 
     class SharedBuffers:
-
         def __init__(self) -> None:
             self.fused_out = SharedResizableBuffer()
             self.workspace13 = SharedResizableBuffer()
@@ -642,290 +708,233 @@ class FusedMoEModularKernel(torch.nn.Module):
         self,
         prepare_finalize: FusedMoEPrepareAndFinalize,
         fused_experts: FusedMoEPermuteExpertsUnpermute,
-        shared_experts: Optional[torch.nn.Module] = None,
+        shared_experts: torch.nn.Module | None = None,
     ):
         super().__init__()
         self.prepare_finalize = prepare_finalize
         self.fused_experts = fused_experts
         self.shared_experts = shared_experts
-        assert prepare_finalize.activation_format == \
-            fused_experts.activation_formats[0], (
-                f"{prepare_finalize.__class__.__name__}."
-                f"{prepare_finalize.activation_format} == "
-                f"{fused_experts.__class__.__name__}."
-                f"{fused_experts.activation_formats[0]}")
-
-    def _do_fused_experts(
-        self,
-        fused_out: Optional[torch.Tensor],
-        a1: torch.Tensor,
-        a1q: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: str,
-        global_num_experts: int,
-        local_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
-        apply_router_weight_on_input: bool,
-    ) -> torch.Tensor:
 
-        _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
+        self._post_init_setup()
+        assert (
+            prepare_finalize.activation_format == fused_experts.activation_formats[0]
+        ), (
+            f"{prepare_finalize.__class__.__name__}."
+            f"{prepare_finalize.activation_format} == "
+            f"{fused_experts.__class__.__name__}."
+            f"{fused_experts.activation_formats[0]}"
+        )
 
-        (workspace13_shape, workspace2_shape, fused_out_shape,
-         workspace_dtype) = self.fused_experts.workspace_shapes(
-             a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts,
-             expert_tokens_meta)
+    def _post_init_setup(self):
+        """
+        Resolve any leftover setup dependencies between self.prepare_finalize
+        and self.fused_experts here.
+        """
+        self.prepare_finalize.post_init_setup(self.fused_experts)
 
-        # select per-ubatch buffers to avoid cross-ubatch reuse under DBO
-        ubatch_idx = dbo_current_ubatch_id()
-        buffers = self.shared_buffers[ubatch_idx]
+    def supports_expert_map(self) -> bool:
+        """
+        A flag indicating whether or not this class supports expert maps.
+        """
+        return self.fused_experts.supports_expert_map()
 
-        # We can reuse the memory between cache1 and cache3 because by the
-        # time we need cache3, we're done with cache1.
-        workspace13 = buffers.workspace13.get(workspace13_shape,
-                                              device=a1.device,
-                                              dtype=workspace_dtype)
-        workspace2 = buffers.workspace2.get(workspace2_shape,
-                                            device=a1.device,
-                                            dtype=workspace_dtype)
-
-        assert fused_out is None or fused_out.shape == fused_out_shape, (
-            f"fused_out {fused_out.shape} but expected {fused_out_shape}")
-        if fused_out is None:
-            # reuse workspace13 for the output
-            fused_out = _resize_cache(workspace13, fused_out_shape)
+    def output_is_reduced(self) -> bool:
+        """
+        Indicates whether or not the output of fused MoE kernel
+        is reduced across all ranks.
+        """
+        return self.prepare_finalize.output_is_reduced()
 
-        self.fused_experts.apply(
-            fused_out,
-            a1q,
-            w1,
-            w2,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            a1q_scale=a1q_scale,
-            a2_scale=a2_scale,
-            workspace13=workspace13,
-            workspace2=workspace2,
-            expert_tokens_meta=expert_tokens_meta,
-            apply_router_weight_on_input=apply_router_weight_on_input,
+    def _chunk_info(self, M: int) -> tuple[int, int]:
+        """
+        Compute number of chunks and chunk size for given M.
+        If chunking is not supported, set the CHUNK_SIZE to M so we
+        get num_chunks == 1. Take max(M, 1) to avoid divide by zero.
+        If there are no tokens to process, the number of chunks will be zero.
+        """
+        CHUNK_SIZE = max(
+            1,
+            (
+                M
+                if not self.fused_experts.supports_chunking()
+                else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+            ),
         )
+        num_chunks = cdiv(M, CHUNK_SIZE)
+        # If there are no tokens, then there should be no loop iterations.
+        assert M > 0 or num_chunks == 0
+        return num_chunks, CHUNK_SIZE
 
-        return fused_out
-
-    def _maybe_chunk_fused_experts(
+    def _allocate_buffers(
         self,
-        a1: torch.Tensor,
-        a1q: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: str,
+        out_dtype: torch.dtype,
+        device: torch.device,
+        M_chunk: int,
+        M_full: int,
+        N: int,
+        K: int,
+        top_k: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
-        apply_router_weight_on_input: bool,
-    ) -> torch.Tensor:
-
-        _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
+        expert_tokens_meta: ExpertTokensMetadata | None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Allocate temporary and output buffers for the fused experts op.
+        Inputs:
+        - out_dtype: output type of workspace and output tensors.
+        - device: the device of the workspace and output tensors.
+        See `workspace_shapes` for a description of the remainder of arguments.
+        Returns a tuple of (workspace13, workspace2, output) tensors.
+        """
+        assert M_full > 0 and M_chunk > 0
 
-        CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
-        num_chunks = cdiv(M, CHUNK_SIZE)
+        num_chunks, _ = self._chunk_info(M_full)
 
-        # TODO(bnell): get rid of one level here, update slice functions
-        # to nops on num_chunks==1
+        # select per-ubatch buffers to avoid cross-ubatch reuse under DBO
+        ubatch_idx = dbo_current_ubatch_id()
+        buffers = self.shared_buffers[ubatch_idx]
+        workspace_dtype = self.fused_experts.workspace_dtype(out_dtype)
+
+        # Get intermediate workspace shapes based off the chunked M size.
+        workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes(
+            M_chunk,
+            N,
+            K,
+            top_k,
+            global_num_experts,
+            local_num_experts,
+            expert_tokens_meta,
+        )
 
-        if not self.fused_experts.supports_chunking() or num_chunks == 1:
-            return self._do_fused_experts(
-                fused_out=None,
-                a1=a1,
-                a1q=a1q,
-                w1=w1,
-                w2=w2,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                local_num_experts=local_num_experts,
-                expert_map=expert_map,
-                a1q_scale=a1q_scale,
-                a2_scale=self.fused_experts.a2_scale,
-                expert_tokens_meta=expert_tokens_meta,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-            )
+        # Get final output shape based on the full M size.
+        _, _, fused_out_shape = self.fused_experts.workspace_shapes(
+            M_full,
+            N,
+            K,
+            top_k,
+            global_num_experts,
+            local_num_experts,
+            expert_tokens_meta,
+        )
 
-        # Chunking required case
-        assert num_chunks > 1
+        # We can reuse the memory between cache1 and cache3 because by the
+        # time we need cache3, we're done with cache1.
+        workspace13 = buffers.workspace13.get(
+            workspace13_shape, device=device, dtype=workspace_dtype
+        )
+        workspace2 = buffers.workspace2.get(
+            workspace2_shape, device=device, dtype=workspace_dtype
+        )
 
         # Construct the entire output that can then be processed in chunks.
-        (_, _, fused_out_shape, _) = self.fused_experts.workspace_shapes(
-            a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts,
-            expert_tokens_meta)
-        ubatch_idx = dbo_current_ubatch_id()
-        buffers = self.shared_buffers[ubatch_idx]
-        fused_out = buffers.fused_out.get(fused_out_shape,
-                                          device=a1q.device,
-                                          dtype=a1.dtype)
-
-        def slice_input_tensors(
-            chunk_idx: int
-        ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-                   Optional[torch.Tensor], torch.Tensor, torch.Tensor]:
-            s = chunk_idx * CHUNK_SIZE
-            e = min(s + CHUNK_SIZE, M)
-            return (
-                a1q[s:e],
-                _chunk_scales(a1q_scale, s, e),
-                _chunk_scales(self.fused_experts.a2_scale, s, e),
-                topk_ids[s:e],
-                topk_weights[s:e],
+        # Reuse workspace13 for the output in the non-chunked case as long
+        # as it is large enough. This will not always be the case for standard
+        # format experts and with experts that have empty workspaces.
+        if num_chunks == 1 and prod(workspace13_shape) >= prod(fused_out_shape):
+            fused_out = _resize_cache(workspace13, fused_out_shape)
+        else:
+            fused_out = buffers.fused_out.get(
+                fused_out_shape, device=device, dtype=out_dtype
             )
 
-        def slice_output_tensor(chunk_idx: int) -> torch.Tensor:
-            assert fused_out.size(0) % M == 0, (
-                f"fused_out shape {fused_out.shape} vs M {M}")
-            factor = fused_out.size(0) // M
-            out_chunk_size = CHUNK_SIZE * factor
-            s = chunk_idx * out_chunk_size
-            e = min(s + out_chunk_size, fused_out.size(0))
-            return fused_out[s:e]
-
-        def slice_expert_tokens_metadata(
-                full_expert_tokens_meta: ExpertTokensMetadata,
-                chunk_topk_ids: torch.Tensor, local_num_experts: int,
-                expert_map: Optional[torch.Tensor]) -> ExpertTokensMetadata:
-            # The existing expert_num_tokens is for the entire a1q
-            # input. Chunking forces recomputation of the number
-            # of tokens assigned to each expert.
-            c_expert_num_tokens = count_expert_num_tokens(
-                chunk_topk_ids, local_num_experts, expert_map)
-
-            c_expert_num_tokens_cpu = None
-            need_expert_num_tokens_cpu = (
-                full_expert_tokens_meta.expert_num_tokens_cpu is not None)
-            if need_expert_num_tokens_cpu:
-                # This is blocking as some implementations need the count
-                # on the CPU to determine appropriate input/out fused-moe
-                # buffers
-                c_expert_num_tokens_cpu = c_expert_num_tokens.to(
-                    "cpu", non_blocking=False)
-
-            return ExpertTokensMetadata(
-                expert_num_tokens=c_expert_num_tokens,
-                expert_num_tokens_cpu=c_expert_num_tokens_cpu)
+        return workspace13, workspace2, fused_out
 
-        for chunk_idx in range(num_chunks):
-            c_a1q, c_a1q_scale, c_a2_scale, c_topk_ids, c_topk_weights = (
-                slice_input_tensors(chunk_idx))
-
-            c_expert_tokens_meta = None
-            if expert_tokens_meta is not None:
-                c_expert_tokens_meta = slice_expert_tokens_metadata(
-                    expert_tokens_meta, c_topk_ids, local_num_experts,
-                    expert_map)
-
-            self._do_fused_experts(
-                fused_out=slice_output_tensor(chunk_idx),
-                a1=a1,
-                a1q=c_a1q,
-                w1=w1,
-                w2=w2,
-                topk_weights=c_topk_weights,
-                topk_ids=c_topk_ids,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                local_num_experts=local_num_experts,
-                expert_map=expert_map,
-                a1q_scale=c_a1q_scale,
-                a2_scale=c_a2_scale,
-                expert_tokens_meta=c_expert_tokens_meta,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-            )
+    @staticmethod
+    def _slice_output_tensor(
+        fused_out: torch.Tensor,
+        chunk_idx: int,
+        num_chunks: int,
+        CHUNK_SIZE: int,
+        M: int,
+    ) -> torch.Tensor:
+        if num_chunks == 1:
+            return fused_out
 
-        return fused_out
+        assert fused_out.size(0) % M == 0, f"fused_out shape {fused_out.shape} vs M {M}"
+        factor = fused_out.size(0) // M
+        out_chunk_size = CHUNK_SIZE * factor
+        s = chunk_idx * out_chunk_size
+        e = min(s + out_chunk_size, fused_out.size(0))
+        return fused_out[s:e]
 
-    def forward(
+    @staticmethod
+    def _slice_expert_tokens_metadata(
+        num_chunks: int,
+        full_expert_tokens_meta: ExpertTokensMetadata | None,
+        chunk_topk_ids: torch.Tensor,
+        local_num_experts: int,
+        expert_map: torch.Tensor | None,
+    ) -> ExpertTokensMetadata | None:
+        if num_chunks == 1 or full_expert_tokens_meta is None:
+            return full_expert_tokens_meta
+
+        # The existing expert_num_tokens is for the entire a1q
+        # input. Chunking forces recomputation of the number
+        # of tokens assigned to each expert.
+        c_expert_num_tokens = count_expert_num_tokens(
+            chunk_topk_ids, local_num_experts, expert_map
+        )
+
+        c_expert_num_tokens_cpu = None
+        need_expert_num_tokens_cpu = (
+            full_expert_tokens_meta.expert_num_tokens_cpu is not None
+        )
+        if need_expert_num_tokens_cpu:
+            # This is blocking as some implementations need the count
+            # on the CPU to determine appropriate input/out fused-moe
+            # buffers
+            c_expert_num_tokens_cpu = c_expert_num_tokens.to("cpu", non_blocking=False)
+
+        return ExpertTokensMetadata(
+            expert_num_tokens=c_expert_num_tokens,
+            expert_num_tokens_cpu=c_expert_num_tokens_cpu,
+        )
+
+    def _prepare(
         self,
         hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
-        inplace: bool = False,
-        activation: str = "silu",
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+    ) -> tuple[
+        torch.Tensor,
+        torch.Tensor | None,
+        ExpertTokensMetadata | None,
+        torch.Tensor,
+        torch.Tensor,
+    ]:
         """
-        This function computes a Mixture of Experts (MoE) layer using two sets
-        of weights, w1 and w2, and top-k gating mechanism.
-
-        Parameters:
-        - hidden_states: (torch.Tensor): The input tensor to the MoE layer.
-        - w1 (torch.Tensor): The first set of expert weights.
-        - w2 (torch.Tensor): The second set of expert weights.
-        - topk_weights (torch.Tensor): The topk weights applied at the end of
-          the layer.
-        - topk_ids (torch.Tensor): A map of row to expert id.
-        - inplace (bool): If True, perform the operation in-place.
-          Defaults to False.
-        - activation (str): The activation function to apply after the first
-          MoE layer.
-        - global_num_experts (int): The total number of experts in the global
-          expert space.
-        - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
-          from the global expert space to the local expert space of the expert
-          parallel shard.
-        - apply_router_weight_on_input (bool): When true, the topk weights are
-          applied directly on the inputs. This is only applicable when topk is
-          1.
-
-        Returns:
-        - torch.Tensor: The output tensor after applying the MoE layer.
+        The _prepare method is a wrapper around self.prepare_finalize.prepare
+        that handles DBO and async.
         """
-
-        a1 = hidden_states
-        if inplace and self.shared_experts is None:
-            output = a1
-        else:
-            output = torch.zeros_like(a1)
-
-        local_num_experts = w1.size(0)
-        if global_num_experts == -1:
-            global_num_experts = local_num_experts
-
         if not self.prepare_finalize.supports_async():
             # We shouldn't be running an a2a kernel that doesn't
             # support async prepare/finalize
             # TODO(lucas): enable in follow-up
             assert not dbo_enabled()
 
-            (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
-             _expert_topk_weights) = self.prepare_finalize.prepare(
-                 a1,
-                 topk_weights,
-                 topk_ids,
-                 global_num_experts,
-                 expert_map,
-                 apply_router_weight_on_input,
-                 self.fused_experts.quant_config,
-             )
+            (
+                a1q,
+                a1q_scale,
+                expert_tokens_meta,
+                _expert_topk_ids,
+                _expert_topk_weights,
+            ) = self.prepare_finalize.prepare(
+                hidden_states,
+                topk_weights,
+                topk_ids,
+                global_num_experts,
+                expert_map,
+                apply_router_weight_on_input,
+                self.fused_experts.quant_config,
+            )
         else:
             # Overlap shared expert compute with all2all dispatch.
             dbo_maybe_run_recv_hook()
             prepare_ret = self.prepare_finalize.prepare_async(
-                a1,
+                hidden_states,
                 topk_weights,
                 topk_ids,
                 global_num_experts,
@@ -937,8 +946,9 @@ class FusedMoEModularKernel(torch.nn.Module):
             # TODO(lucas): refactor this in the alternative schedules followup
             # currently unpack if we have hook + receiver pair or just
             # receiver (see finalize_async docstring)
-            hook, receiver = prepare_ret \
-                if isinstance(prepare_ret, tuple) else (None, prepare_ret)
+            hook, receiver = (
+                prepare_ret if isinstance(prepare_ret, tuple) else (None, prepare_ret)
+            )
 
             if hook is not None:
                 if dbo_enabled():
@@ -950,42 +960,129 @@ class FusedMoEModularKernel(torch.nn.Module):
                 else:
                     hook()
 
-            (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
-             _expert_topk_weights) = receiver()
+            (
+                a1q,
+                a1q_scale,
+                expert_tokens_meta,
+                _expert_topk_ids,
+                _expert_topk_weights,
+            ) = receiver()
 
         # Maybe prepare gathered topk_ids and topk_weights from other EP ranks.
         topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids
-        topk_weights = (topk_weights if _expert_topk_weights is None else
-                        _expert_topk_weights)
-
-        fused_out = None
-
-        if a1q.numel() == 0:
-            # This happens when none of the tokens from the all2all reach this
-            # EP rank. Also, note that this is only relevant for CUDAGraph
-            # incompatible all2all kernels like the DeepEP high-throughput
-            # kernels. CUDAGraph compatible all2all kernels like the pplx
-            # kernels and the DeepEP low-latency kernels are always batched
-            # and can never run into the tensor.numel() == 0 case.
-            fused_out = torch.empty_like(a1q).to(dtype=a1.dtype)
+        topk_weights = (
+            topk_weights if _expert_topk_weights is None else _expert_topk_weights
+        )
+
+        return a1q, a1q_scale, expert_tokens_meta, topk_ids, topk_weights
+
+    def _fused_experts(
+        self,
+        in_dtype: torch.dtype,
+        a1q: torch.Tensor,
+        a1q_scale: torch.Tensor | None,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_map: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+        expert_tokens_meta: ExpertTokensMetadata | None,
+    ) -> torch.Tensor:
+        _, M_full, N, K, top_k = self.fused_experts.moe_problem_size(
+            a1q, w1, w2, topk_ids
+        )
+
+        num_chunks, CHUNK_SIZE = self._chunk_info(M_full)
+
+        def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
+            if num_chunks == 1:
+                # Use a1q.size(0) here since batched format does not
+                # keep M in the first dimension.
+                return 0, a1q.size(0)
+            else:
+                s = chunk_idx * CHUNK_SIZE
+                e = min(s + CHUNK_SIZE, M_full)
+                return s, e
+
+        # This happens when none of the tokens from the all2all reach this
+        # EP rank. Also, note that this is only relevant for CUDAGraph
+        # incompatible all2all kernels like the DeepEP high-throughput
+        # kernels. CUDAGraph compatible all2all kernels like the pplx
+        # kernels and the DeepEP low-latency kernels are always batched
+        # and can never run into the tensor.numel() == 0 case.
+        if M_full == 0:
+            assert num_chunks == 0
+            workspace13 = None
+            workspace2 = None
+            fused_out = torch.empty_like(a1q, dtype=in_dtype)
         else:
-            fused_out = self._maybe_chunk_fused_experts(
-                a1=a1,
-                a1q=a1q,
+            assert num_chunks > 0
+            workspace13, workspace2, fused_out = self._allocate_buffers(
+                in_dtype,
+                a1q.device,
+                CHUNK_SIZE,
+                M_full,
+                N,
+                K,
+                top_k,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_meta,
+            )
+
+        for chunk_idx in range(num_chunks):
+            s, e = input_chunk_range(chunk_idx)
+
+            c_expert_tokens_meta = self._slice_expert_tokens_metadata(
+                num_chunks,
+                expert_tokens_meta,
+                topk_ids[s:e],
+                local_num_experts,
+                expert_map,
+            )
+
+            c_fused_out = self._slice_output_tensor(
+                fused_out, chunk_idx, num_chunks, CHUNK_SIZE, M_full
+            )
+
+            self.fused_experts.apply(
+                output=c_fused_out,
+                hidden_states=a1q[s:e],
                 w1=w1,
                 w2=w2,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
+                topk_weights=topk_weights[s:e],
+                topk_ids=topk_ids[s:e],
                 activation=activation,
                 global_num_experts=global_num_experts,
-                local_num_experts=local_num_experts,
                 expert_map=expert_map,
-                a1q_scale=a1q_scale,
-                expert_tokens_meta=expert_tokens_meta,
+                a1q_scale=_slice_scales(a1q_scale, s, e),
+                a2_scale=_slice_scales(self.fused_experts.a2_scale, s, e),
+                workspace13=workspace13,
+                workspace2=workspace2,
+                expert_tokens_meta=c_expert_tokens_meta,
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
 
-        shared_output: Optional[torch.Tensor] = None
+        return fused_out
+
+    def _finalize(
+        self,
+        output: torch.Tensor,
+        fused_out: torch.Tensor,
+        hidden_states: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        """
+        The _finalize method is a wrapper around self.prepare_finalize.finalize
+        that handles DBO, async and shared expert overlap.
+        """
+        shared_output: torch.Tensor | None = None
 
         if not self.prepare_finalize.supports_async():
             assert not dbo_enabled()
@@ -999,7 +1096,7 @@ class FusedMoEModularKernel(torch.nn.Module):
                 self.fused_experts.finalize_weight_and_reduce_impl(),
             )
             if self.shared_experts is not None:
-                shared_output = self.shared_experts(a1)
+                shared_output = self.shared_experts(hidden_states)
         else:
             finalize_ret = self.prepare_finalize.finalize_async(
                 output,
@@ -1011,13 +1108,16 @@ class FusedMoEModularKernel(torch.nn.Module):
             )
 
             if self.shared_experts is not None:
-                shared_output = self.shared_experts(a1)
+                shared_output = self.shared_experts(hidden_states)
 
             # TODO(lucas): refactor this in the alternative schedules followup
             # currently unpack if we have hook + receiver pair or just
             # receiver (see finalize_async docstring)
-            hook, receiver = finalize_ret \
-                if isinstance(finalize_ret, tuple) else (None, finalize_ret)
+            hook, receiver = (
+                finalize_ret
+                if isinstance(finalize_ret, tuple)
+                else (None, finalize_ret)
+            )
 
             if hook is not None:
                 if dbo_enabled():
@@ -1036,3 +1136,87 @@ class FusedMoEModularKernel(torch.nn.Module):
         else:
             assert shared_output is not None
             return shared_output, output
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        inplace: bool = False,
+        activation: str = "silu",
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        """
+        This function computes a Mixture of Experts (MoE) layer using two sets
+        of weights, w1 and w2, and top-k gating mechanism.
+
+        Parameters:
+        - hidden_states: (torch.Tensor): The input tensor to the MoE layer.
+        - w1 (torch.Tensor): The first set of expert weights.
+        - w2 (torch.Tensor): The second set of expert weights.
+        - topk_weights (torch.Tensor): The topk weights applied at the end of
+          the layer.
+        - topk_ids (torch.Tensor): A map of row to expert id.
+        - inplace (bool): If True, perform the operation in-place.
+          Defaults to False.
+        - activation (str): The activation function to apply after the first
+          MoE layer.
+        - global_num_experts (int): The total number of experts in the global
+          expert space.
+        - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
+          from the global expert space to the local expert space of the expert
+          parallel shard.
+        - apply_router_weight_on_input (bool): When true, the topk weights are
+          applied directly on the inputs. This is only applicable when topk is
+          1.
+
+        Returns:
+        - torch.Tensor: The output tensor after applying the MoE layer.
+        """
+
+        if inplace and self.shared_experts is None and not disable_inplace():
+            output = hidden_states
+        else:
+            output = torch.zeros_like(hidden_states)
+
+        local_num_experts = w1.size(0)
+        if global_num_experts == -1:
+            global_num_experts = local_num_experts
+
+        a1q, a1q_scale, expert_tokens_meta, topk_ids, topk_weights = self._prepare(
+            hidden_states,
+            topk_weights,
+            topk_ids,
+            global_num_experts,
+            expert_map,
+            apply_router_weight_on_input,
+        )
+
+        fused_out = self._fused_experts(
+            in_dtype=hidden_states.dtype,
+            a1q=a1q,
+            a1q_scale=a1q_scale,
+            w1=w1,
+            w2=w2,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            local_num_experts=local_num_experts,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            expert_tokens_meta=expert_tokens_meta,
+        )
+
+        return self._finalize(
+            output,
+            fused_out,
+            hidden_states,
+            topk_weights,
+            topk_ids,
+            apply_router_weight_on_input,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index c7d7126bab3adbda3526ea43685be54be2137086..7f6155997264d89cf46f502328d0696098401399 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -1,20 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.triton_utils import triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 
 def moe_align_block_size(
     topk_ids: torch.Tensor,
     block_size: int,
     num_experts: int,
-    expert_map: Optional[torch.Tensor] = None,
-    pad_sorted_ids: bool = False
+    expert_map: torch.Tensor | None = None,
+    pad_sorted_ids: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Aligns the token distribution across experts to be compatible with block
@@ -68,20 +67,108 @@ def moe_align_block_size(
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
     if pad_sorted_ids:
         max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
-    sorted_ids = torch.empty((max_num_tokens_padded, ),
-                             dtype=torch.int32,
-                             device=topk_ids.device)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
     max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
-    expert_ids = torch.empty((max_num_m_blocks, ),
-                             dtype=torch.int32,
-                             device=topk_ids.device)
-    num_tokens_post_pad = torch.empty((1),
-                                      dtype=torch.int32,
-                                      device=topk_ids.device)
-
-    ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                             expert_ids, num_tokens_post_pad)
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+    ops.moe_align_block_size(
+        topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad
+    )
     if expert_map is not None:
         expert_ids = expert_map[expert_ids]
 
     return sorted_ids, expert_ids, num_tokens_post_pad
+
+
+def batched_moe_align_block_size(
+    max_tokens_per_batch: int, block_size: int, expert_num_tokens: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Given num_batches, max_tokens_per_batch, block_size and the number of
+    valid-tokens in each batch, prepare sorted_token_ids, expert_ids and
+    num_tokens_post_pad. sorted_token_ids, expert_ids and num_tokens_post_pad
+    have the same semantics as in moe_align_block_size.
+
+    This function is intended to be a drop in replacement for
+    moe_align_batch_size for the batched case.
+
+    Parameters:
+    - max_tokens_per_batch (int): Number of tokens in each batch (both
+        valid and invalid).
+    - block_size (int): block_size to align the data to.
+    - expert_num_tokens (torch.Tensor): expert_num_tokens[i], indicates
+        the number of valid tokens in batch i.
+
+    Returns:
+    - sorted_token_ids (torch.Tensor): Torch tensor of size
+        (num_batches * max_tokens_per_batch) indicating the token indices for
+        that block.
+    - expert_ids (torch.Tensor): Torch tensor of size
+        ceil((num_batches * max_tokens_per_batch) / block_size) indicating
+        what expert to use for each block.
+    - num_tokens_post_pad (torch.Tensor): Torch tensor of size 1
+        indicating the number of valid blocks with actual data to
+        process. This is represented in terms of num tokens.
+    Example:
+    Let num_batches=5, max_tokens_per_batch=8, block_size=4, and
+    expert_num_tokens=[2, 3, 0, 6, 8]. This expert_num_tokens tensor
+    indicates that,
+     - The first 2 tokens in the 0th batch are valid and the rest 6 are
+     invalid (i.e. in the 2D hidden_states tensor of shape,
+     [num_batches * max_tokens_per_batch, K], indices 0, 1 are valid)
+     - The first 3 tokens in the 1st batch are valid. i.e. indices 8, 9, 10
+     - 0 tokens in the 2nd batch are valid
+     - first 6 tokens in the  3rd batch are valid. i.e. indices,
+     24, 25, 26, 27, 28, 29
+     - so on ...
+
+     In this case,
+      sorted_token_ids will be [0, 1, 40, 40,
+                                8, 9, 10, 40,
+                                24, 25, 26, 27,
+                                28, 29, 40, 40,
+                                32, 33, 34, 35,
+                                36, 37, 38, 39,
+                                40, 40, 40, 40,
+                                (rest all 40, 40, 40, 40)
+                                ...]
+      Here, 40 represents an invalid index. as there is no token index 40.
+      The gemm kernel using this sorted_token_ids is expected to skip the
+      gemm computation when it encounters this invalid index.
+
+      expert_ids will be [0, 1, 3, 3, 4, 5, 5, -1, -1, (rest all -1) ...]
+      Here, -1 represents an invalid expert. The gemm kernel using this
+      expert_ids is expected to skip the gemm computation when it encounters
+      an expert of id -1.
+
+      num_tokens_post_pad will be 24 as sorted_token_ids has valid entries
+      until 24.
+    """
+
+    B = expert_num_tokens.size(0)
+    device = expert_num_tokens.device
+
+    # Round up so each batch can be split to blocks evenly.
+    max_num_tokens_padded = B * round_up(max_tokens_per_batch, block_size)
+
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device=device)
+    assert max_num_tokens_padded % block_size == 0
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device=device)
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=device)
+
+    ops.batched_moe_align_block_size(
+        max_tokens_per_batch,
+        block_size,
+        expert_num_tokens,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+    )
+
+    return sorted_ids, expert_ids, num_tokens_post_pad
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 23f618b1a5fd76fe401ab31eb3fed6a5b2d86445..66c00cf89873a07837627cf5a57bb07e7c43ff8e 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -7,18 +7,20 @@ import torch.nn.functional as F
 
 def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
     """
-  Compute the histogram of an int32 tensor. The bin edges are defined by the
-  min and max values, with step = 1.
-  """
+    Compute the histogram of an int32 tensor. The bin edges are defined by the
+    min and max values, with step = 1.
+    """
     assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
     assert min <= max, "min must be less than or equal to max."
 
-    def searchsorted(sorted_sequence: torch.Tensor,
-                     values_to_search: torch.Tensor) -> torch.Tensor:
+    def searchsorted(
+        sorted_sequence: torch.Tensor, values_to_search: torch.Tensor
+    ) -> torch.Tensor:
         return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1)
 
-    bin_edges = torch.linspace(min, max, max - min + 1,
-                               dtype=input.dtype).to(input.device)
+    bin_edges = torch.linspace(min, max, max - min + 1, dtype=input.dtype).to(
+        input.device
+    )
     return searchsorted(bin_edges, input).to(torch.int32)
 
 
@@ -41,6 +43,7 @@ def fused_moe(
     """
     assert expert_map is None, "expert_map is not supported for pallas MoE."
     import torch_xla.experimental.custom_kernel  # noqa: F401
+
     orig_shape = hidden_states.shape
     hidden_size = hidden_states.shape[-1]
     num_tokens = hidden_states.shape[:-1].numel()
@@ -50,7 +53,8 @@ def fused_moe(
     dtype = hidden_states.dtype
     assert (num_tokens * topk) % 16 == 0, (
         "The Pallas GMM kernel requires num_tokens * topk to be a multiple of "
-        f"16 but got {num_tokens * topk}")
+        f"16 but got {num_tokens * topk}"
+    )
 
     hidden_states = hidden_states.view(num_tokens, hidden_size)
     gating_output = gating_output.view(num_tokens, num_experts)
@@ -63,8 +67,7 @@ def fused_moe(
     topk_indices = topk_indices.flatten()
     topk_argsort_indices = topk_indices.argsort()
     topk_argsort_revert_indices = topk_argsort_indices.argsort()
-    token_indices = torch.arange(num_tokens,
-                                 device=device).repeat_interleave(topk)
+    token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk)
     token_indices = token_indices[topk_argsort_indices]
     group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1)
 
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 16a155e7184788d43c3573842daf3544f1c228e9..9dcdcc3800363be4930e27cad57793820c5600c5 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -1,24 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size)
+    moe_align_block_size,
+)
 from vllm.model_executor.layers.fused_moe.utils import _fp8_perm
 
 
 def _moe_permute(
     curr_hidden_states: torch.Tensor,
-    a1q_scale: Optional[torch.Tensor],
+    a1q_scale: torch.Tensor | None,
     curr_topk_ids: torch.Tensor,
     global_num_experts: int,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     block_m: int,
-) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-           torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
     Permute the hidden states and scales according to `sorted_token_ids`.
@@ -27,14 +26,11 @@ def _moe_permute(
 
     tokens_in_chunk = curr_hidden_states.size(0)
 
-    sorted_token_ids, expert_ids, num_tokens_post_padded = (
-        moe_align_block_size(curr_topk_ids,
-                             block_m,
-                             global_num_experts,
-                             expert_map,
-                             pad_sorted_ids=True))
+    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+        curr_topk_ids, block_m, global_num_experts, expert_map, pad_sorted_ids=True
+    )
 
-    inv_perm: Optional[torch.Tensor] = None
+    inv_perm: torch.Tensor | None = None
 
     num_tokens = top_k_num * tokens_in_chunk
     expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
@@ -43,20 +39,18 @@ def _moe_permute(
     # Permute according to sorted token ids.
     sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
 
-    curr_hidden_states = _fp8_perm(curr_hidden_states,
-                                   sorted_token_ids // top_k_num)
+    curr_hidden_states = _fp8_perm(curr_hidden_states, sorted_token_ids // top_k_num)
 
     if a1q_scale is not None:
         a1q_scale = a1q_scale[sorted_token_ids // top_k_num]
 
-    return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
-            inv_perm)
+    return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, inv_perm)
 
 
 def _moe_unpermute_and_reduce(
     out: torch.Tensor,
     curr_hidden: torch.Tensor,
-    inv_perm: Optional[torch.Tensor],
+    inv_perm: torch.Tensor | None,
     topk_weight: torch.Tensor,
     apply_router_weight_on_input: bool,
 ) -> None:
@@ -76,16 +70,15 @@ def _moe_unpermute_and_reduce(
 
 def moe_permute(
     hidden_states: torch.Tensor,
-    a1q_scale: Optional[torch.Tensor],
+    a1q_scale: torch.Tensor | None,
     topk_ids: torch.Tensor,
     n_expert: int,
     n_local_expert: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    align_block_size: Optional[int] = None,
+    expert_map: torch.Tensor | None = None,
+    align_block_size: int | None = None,
     fill_invalid_expert: int = -1,
-    permuted_hidden_states: Optional[torch.Tensor] = None,
-) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-           torch.Tensor]:
+    permuted_hidden_states: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     This function expands and permutes activation to gather uncontinuous tokens
       for each expert.
@@ -117,13 +110,21 @@ def moe_permute(
     """
     n_token, n_hidden = hidden_states.size()
     topk = topk_ids.size(1)
-    assert (n_hidden * hidden_states.element_size()
-            ) % 16 == 0, "permue kernel need hidden dim align to 16B"
+    assert (n_hidden * hidden_states.element_size()) % 16 == 0, (
+        "permue kernel need hidden dim align to 16B"
+    )
     permuted_row_size = n_token * topk
     if align_block_size is not None:
-        permuted_row_size = (permuted_row_size + n_expert *
-                             (align_block_size - 1) + align_block_size -
-                             1) // align_block_size * align_block_size
+        permuted_row_size = (
+            (
+                permuted_row_size
+                + n_expert * (align_block_size - 1)
+                + align_block_size
+                - 1
+            )
+            // align_block_size
+            * align_block_size
+        )
     if n_local_expert == -1:
         n_local_expert = n_expert
     if permuted_hidden_states is None:
@@ -134,40 +135,57 @@ def moe_permute(
         )
     assert permuted_hidden_states.size() == (permuted_row_size, n_hidden), (
         f"Expected permuted hidden states to be {(permuted_row_size, n_hidden)}"
-        f" but got {permuted_hidden_states.size()}")
-
-    token_expert_indices = torch.arange(0,
-                                        n_token * topk,
-                                        dtype=torch.int32,
-                                        device=hidden_states.device).reshape(
-                                            (n_token, topk))
-
-    m_indices = torch.full((permuted_row_size, ),
-                           fill_invalid_expert,
-                           dtype=torch.int32,
-                           device=hidden_states.device)
-    expert_first_token_offset = torch.empty(n_local_expert + 1,
-                                            dtype=torch.int64,
-                                            device=hidden_states.device)
-    permuted_idx = torch.full((permuted_row_size, ),
-                              n_token * topk,
-                              dtype=torch.int32,
-                              device=hidden_states.device)
-    inv_permuted_idx = torch.empty((n_token, topk),
-                                   dtype=torch.int32,
-                                   device=hidden_states.device)
+        f" but got {permuted_hidden_states.size()}"
+    )
+
+    token_expert_indices = torch.arange(
+        0, n_token * topk, dtype=torch.int32, device=hidden_states.device
+    ).reshape((n_token, topk))
+
+    m_indices = torch.full(
+        (permuted_row_size,),
+        fill_invalid_expert,
+        dtype=torch.int32,
+        device=hidden_states.device,
+    )
+    expert_first_token_offset = torch.empty(
+        n_local_expert + 1, dtype=torch.int64, device=hidden_states.device
+    )
+    permuted_idx = torch.full(
+        (permuted_row_size,),
+        n_token * topk,
+        dtype=torch.int32,
+        device=hidden_states.device,
+    )
+    inv_permuted_idx = torch.empty(
+        (n_token, topk), dtype=torch.int32, device=hidden_states.device
+    )
     topk_ids = topk_ids.to(torch.int32)
-    torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
-                                 expert_map, n_expert, n_local_expert, topk,
-                                 align_block_size, permuted_hidden_states,
-                                 expert_first_token_offset, inv_permuted_idx,
-                                 permuted_idx, m_indices)
+    torch.ops._moe_C.moe_permute(
+        hidden_states,
+        topk_ids,
+        token_expert_indices,
+        expert_map,
+        n_expert,
+        n_local_expert,
+        topk,
+        align_block_size,
+        permuted_hidden_states,
+        expert_first_token_offset,
+        inv_permuted_idx,
+        permuted_idx,
+        m_indices,
+    )
 
     if a1q_scale is not None and a1q_scale.dim() > 1:
-        a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
-                              topk]
-    return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
-            inv_permuted_idx.flatten(), m_indices)
+        a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) // topk]
+    return (
+        permuted_hidden_states,
+        a1q_scale,
+        expert_first_token_offset,
+        inv_permuted_idx.flatten(),
+        m_indices,
+    )
 
 
 def moe_unpermute(
@@ -175,7 +193,7 @@ def moe_unpermute(
     permuted_hidden_states: torch.Tensor,
     topk_weights: torch.Tensor,
     inv_permuted_idx: torch.Tensor,
-    expert_first_token_offset: Optional[torch.Tensor] = None,
+    expert_first_token_offset: torch.Tensor | None = None,
 ) -> None:
     """
     This function expands and permutes activation to gathering uncontinuous
@@ -185,7 +203,7 @@ def moe_unpermute(
     - permuted_hidden_states (torch.Tensor): permuted activation.
     - topk_weights (torch.Tensor): topk expert route weight for each token.
     - inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
-    - expert_first_token_offset (Optional[torch.Tensor]): offset of the first 
+    - expert_first_token_offset (Optional[torch.Tensor]): offset of the first
       token of each expert for grouped gemm.
     Returns:
     - hidden_states (torch.Tensor): The reduced and unpermuted activation
@@ -193,12 +211,18 @@ def moe_unpermute(
     """
     topk = topk_weights.size(1)
     n_hidden = permuted_hidden_states.size(-1)
-    assert (n_hidden * permuted_hidden_states.element_size()
-            ) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
-
-    torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
-                                   inv_permuted_idx, expert_first_token_offset,
-                                   topk, out)
+    assert (n_hidden * permuted_hidden_states.element_size()) % 16 == 0, (
+        "unpermue kernel need hidden dim align to 16B"
+    )
+
+    torch.ops._moe_C.moe_unpermute(
+        permuted_hidden_states,
+        topk_weights,
+        inv_permuted_idx,
+        expert_first_token_offset,
+        topk,
+        out,
+    )
 
 
 def moe_permute_unpermute_supported():
diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
index 6160da7329518e7aac33cfc03a4efcce2be03ccd..f721d00d75ea7161b9be673ebc23b30ae3c97a85 100644
--- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
+++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
@@ -45,7 +45,7 @@ def fused_moe(
     for expert_idx in range(num_experts):
         expert_w1 = w1[expert_idx]
         expert_w2 = w2[expert_idx]
-        expert_mask = (selected_experts == expert_idx)
+        expert_mask = selected_experts == expert_idx
         expert_weights = (topk_weights * expert_mask).sum(dim=-1, keepdim=True)
         x = F.linear(hidden_states, expert_w1)
         gate = F.silu(x[:, :intermediate_size])
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index ddddd2a3b7a2e4adb2b8337865c38c692af84a38..2766a2c2249fb5353d102b351d008a26b5470b6d 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import pplx_kernels as pplx
 import torch
@@ -9,10 +9,13 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate)
+    TopKWeightAndReduceDelegate,
+)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _validate_scale_shape, moe_kernel_quantize_input)
-from vllm.utils import cdiv, round_up
+    _validate_scale_shape,
+    moe_kernel_quantize_input,
+)
+from vllm.utils.math_utils import cdiv, round_up
 
 logger = init_logger(__name__)
 
@@ -21,9 +24,9 @@ def pplx_hidden_dim_scale_bytes(
     max_num_tokens: int,
     hidden_dim: int,
     in_dtype: torch.dtype,
-    quant_dtype: Union[torch.dtype, str, None],
+    quant_dtype: torch.dtype | str | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
 ):
     # All pplx byte sizes must be 16-byte aligned.
     align = 16
@@ -60,7 +63,6 @@ def pplx_hidden_dim_scale_bytes(
 
 
 class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
-
     def __init__(
         self,
         a2a: pplx.AllToAll,
@@ -80,15 +82,18 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return self.max_num_tokens
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.uint32
 
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
+    def output_is_reduced(self) -> bool:
+        return True
+
     def supports_async(self) -> bool:
         return True
 
@@ -98,7 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> tuple[Callable, mk.ReceiverType]:
@@ -113,8 +118,9 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         if expert_map is not None:
             logger.warning_once(
                 "The PPLX backend does not support expert mapping. "
-                "The provided `expert_map` will be ignored.")
-        expert_map = None  #noqa: F841
+                "The provided `expert_map` will be ignored."
+            )
+        expert_map = None  # noqa: F841
 
         # Is this always going to be a1.device?
         device = a1.device
@@ -123,23 +129,26 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
             assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1")
+                "apply_router_weight_on_input is only implemented for topk=1"
+            )
             a1 = a1 * topk_weights.to(a1.dtype)
 
         repeat_cols = 4
         repeat_rows = 1 if quant_config.per_act_token_quant else a1.size(0)
         # TODO(bnell): always pass quant_config.a1_scale?
         a1q, a1q_scale = moe_kernel_quantize_input(
-            a1, (None if quant_config.per_act_token_quant else
-                 quant_config.a1_scale),
+            a1,
+            (None if quant_config.per_act_token_quant else quant_config.a1_scale),
             quant_dtype=quant_config.quant_dtype,
             per_act_token_quant=quant_config.per_act_token_quant,
-            block_shape=quant_config.block_shape)
+            block_shape=quant_config.block_shape,
+        )
 
-        _validate_scale_shape(a1q, a1q_scale, quant_config.per_act_token_quant,
-                              quant_config.block_shape)
+        _validate_scale_shape(
+            a1q, a1q_scale, quant_config.per_act_token_quant, quant_config.block_shape
+        )
 
-        orig_a_scale_block_shape: Optional[int] = None
+        orig_a_scale_block_shape: int | None = None
 
         if a1q_scale is not None:
             scalar_scales = a1q_scale.numel() == 1
@@ -155,8 +164,9 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 # TODO (bnell): use group_broadcast instead?
                 a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
 
-        assert a1q_scale is None or a1q_scale.ndim == 2, \
+        assert a1q_scale is None or a1q_scale.ndim == 2, (
             f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}"
+        )
 
         expert_num_tokens = torch.empty(
             self.num_local_experts,
@@ -165,13 +175,16 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         )
 
         expert_x = torch.empty(
-            (self.num_local_experts,
-             self.max_num_tokens * self.num_dispatchers(), hidden_dim),
+            (
+                self.num_local_experts,
+                self.max_num_tokens * self.num_dispatchers(),
+                hidden_dim,
+            ),
             dtype=a1q.dtype,
             device=device,
         )
 
-        expert_x_scale: Optional[torch.Tensor] = None
+        expert_x_scale: torch.Tensor | None = None
         if a1q.dtype.itemsize == 1:
             if quant_config.is_per_act_token:
                 # (M x 1) -> (E x M x K)
@@ -182,14 +195,13 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             else:
                 # (M x K_tiles) -> (E x M x K_tiles)
                 assert quant_config.block_shape is not None
-                num_blocks = cdiv(expert_x.size(2),
-                                  quant_config.block_shape[1])
+                num_blocks = cdiv(expert_x.size(2), quant_config.block_shape[1])
                 final_dim = num_blocks
 
             expert_x_scale_shape = (
                 self.num_local_experts,
                 expert_x.size(1),
-                round_up(final_dim, 4)  # round up for alignment
+                round_up(final_dim, 4),  # round up for alignment
             )
 
             expert_x_scale = torch.empty(
@@ -200,7 +212,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
-        bound_m: Optional[torch.Tensor] = None
+        bound_m: torch.Tensor | None = None
 
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
@@ -226,27 +238,30 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             do_recv=True,
         )
 
-        return (hook, lambda: self._receiver(
-            expert_num_tokens,
-            expert_x,
-            expert_x_scale,
-            orig_a_scale_block_shape,
-        ))
+        return (
+            hook,
+            lambda: self._receiver(
+                expert_num_tokens,
+                expert_x,
+                expert_x_scale,
+                orig_a_scale_block_shape,
+            ),
+        )
 
     def _receiver(
         self,
         expert_num_tokens: torch.Tensor,
         expert_x: torch.Tensor,
-        expert_x_scale: Optional[torch.Tensor],
-        orig_a_scale_block_shape: Optional[int],
+        expert_x_scale: torch.Tensor | None,
+        orig_a_scale_block_shape: int | None,
     ) -> mk.PrepareResultType:
-
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
             assert expert_x_scale.ndim == 3
 
         expert_tokens_meta = mk.ExpertTokensMetadata(
-            expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None)
+            expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None
+        )
 
         return expert_x, expert_x_scale, expert_tokens_meta, None, None
 
@@ -256,7 +271,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -281,22 +296,24 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
     ) -> Callable:
-        assert isinstance(
-            weight_and_reduce_impl, TopKWeightAndReduceDelegate
-        ), ("Weight application and reduction happens in the combine kernel.")
+        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate), (
+            "Weight application and reduction happens in the combine kernel."
+        )
 
         # This argument is optional
         # There's not much point setting this unless it is != topk_ids.size(0)
-        bound_m: Optional[torch.Tensor] = None
+        bound_m: torch.Tensor | None = None
 
         # TODO (bnell): fails in test_pplx_moe.py, figure out what's going on
-        #num_tokens = output.size(0)  # M
-        #assert topk_ids.size(0) == num_tokens, (
+        # num_tokens = output.size(0)  # M
+        # assert topk_ids.size(0) == num_tokens, (
         #    f"{topk_ids.size(0)} == {num_tokens}")
         assert topk_ids.size() == topk_weights.size(), (
-            f"{topk_ids.size()} == {topk_weights.size()}")
+            f"{topk_ids.size()} == {topk_weights.size()}"
+        )
         assert output.size(0) <= self.max_num_tokens, (
-            f"{output.size(0)} <= {self.max_num_tokens}")
+            f"{output.size(0)} <= {self.max_num_tokens}"
+        )
         assert output.size(1) == fused_expert_output.size(-1)
 
         # Set weights to 1 if we did them in dispatch. This is hacky.
@@ -305,21 +322,25 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
         topk_ids_u32 = topk_ids.view(dtype=torch.uint32)
 
-        self.a2a.combine(out_tokens=output,
-                         indices=topk_ids_u32,
-                         weights=topk_weights,
-                         expert_y=fused_expert_output,
-                         bound_m=bound_m,
-                         do_send=True,
-                         do_recv=False)
-
-        return lambda: self.a2a.combine(out_tokens=output,
-                                        indices=topk_ids_u32,
-                                        weights=topk_weights,
-                                        expert_y=fused_expert_output,
-                                        bound_m=bound_m,
-                                        do_send=False,
-                                        do_recv=True)
+        self.a2a.combine(
+            out_tokens=output,
+            indices=topk_ids_u32,
+            weights=topk_weights,
+            expert_y=fused_expert_output,
+            bound_m=bound_m,
+            do_send=True,
+            do_recv=False,
+        )
+
+        return lambda: self.a2a.combine(
+            out_tokens=output,
+            indices=topk_ids_u32,
+            weights=topk_weights,
+            expert_y=fused_expert_output,
+            bound_m=bound_m,
+            do_send=False,
+            do_recv=True,
+        )
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 588e5de865dd9eb28755f66f6f43fafc3d61f30e..9bb976fb9ec9300b174284a26ee037328af563da 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -1,53 +1,59 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceContiguous, TopKWeightAndReduceDelegate)
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
+    TopKWeightAndReduceContiguous,
+    TopKWeightAndReduceDelegate,
+)
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 
 
 class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
-
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return None
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return None
 
     def num_dispatchers(self) -> int:
         return 1
 
+    def output_is_reduced(self) -> bool:
+        return False
+
     def prepare(
         self,
         a1: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
-
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
-            assert topk == 1, \
+            assert topk == 1, (
                 "apply_router_weight_on_input is only implemented for topk=1"
+            )
             a1.mul_(topk_weights.to(a1.dtype))
 
         a1q, a1q_scale = moe_kernel_quantize_input(
-            a1, quant_config.a1_scale, quant_config.quant_dtype,
-            quant_config.per_act_token_quant, quant_config.block_shape)
+            a1,
+            quant_config.a1_scale,
+            quant_config.quant_dtype,
+            quant_config.per_act_token_quant,
+            quant_config.block_shape,
+        )
 
         return a1q, a1q_scale, None, None, None
 
@@ -67,4 +73,5 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
             fused_expert_output=fused_expert_output,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            apply_router_weight_on_input=apply_router_weight_on_input)
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 2764af5fc53232c14866075db739437d6fec9095..8f05828d74f5ff514eb8a8190db6256fb2a8cbbd 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -1,16 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
-from functools import cache
-from typing import Optional
+from functools import lru_cache
 
 import torch
 
-from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.layers.fused_moe.config import (
-    FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig)
-from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEQuantConfig,
+)
 
 
 class QuantMethod(IntEnum):
@@ -36,222 +35,60 @@ class ActivationMethod(IntEnum):
     GELU = 1
 
 
-@cache
-def is_rocm_aiter_moe_enabled() -> bool:
-    return current_platform.is_rocm() \
-        and envs.VLLM_ROCM_USE_AITER_MOE \
-        and envs.VLLM_ROCM_USE_AITER
-
-
-def rocm_aiter_asm_moe_tkw1_impl(
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        fc1_scale: Optional[torch.Tensor] = None,
-        fc2_scale: Optional[torch.Tensor] = None,
-        fc1_smooth_scale: Optional[torch.Tensor] = None,
-        fc2_smooth_scale: Optional[torch.Tensor] = None,
-        a16: bool = False,
-        per_tensor_quant_scale: Optional[torch.Tensor] = None,
-        expert_mask: Optional[torch.Tensor] = None,
-        activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor:
-
-    from aiter import ActivationType
-    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
-
-    activation = ActivationType(activation_method)
-
-    return asm_moe_tkw1(hidden_states,
-                        w1,
-                        w2,
-                        topk_weights,
-                        topk_ids,
-                        fc1_scale=fc1_scale,
-                        fc2_scale=fc2_scale,
-                        fc1_smooth_scale=fc1_smooth_scale,
-                        fc2_smooth_scale=fc2_smooth_scale,
-                        a16=a16,
-                        per_tensor_quant_scale=per_tensor_quant_scale,
-                        expert_mask=expert_mask,
-                        activation=activation)
-
-
-def rocm_aiter_asm_moe_tkw1_fake(
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        fc1_scale: Optional[torch.Tensor] = None,
-        fc2_scale: Optional[torch.Tensor] = None,
-        fc1_smooth_scale: Optional[torch.Tensor] = None,
-        fc2_smooth_scale: Optional[torch.Tensor] = None,
-        a16: bool = False,
-        per_tensor_quant_scale: Optional[torch.Tensor] = None,
-        expert_mask: Optional[torch.Tensor] = None,
-        activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor,
-                                 topk_indices: torch.Tensor,
-                                 token_expert_indices: torch.Tensor,
-                                 gating_output: torch.Tensor,
-                                 renormalize: bool) -> None:
-    from aiter import topk_softmax
-    topk_softmax(topk_weights, topk_indices, token_expert_indices,
-                 gating_output, renormalize)
-
-
-def rocm_aiter_topk_softmax_fake(topk_weights: torch.Tensor,
-                                 topk_indices: torch.Tensor,
-                                 token_expert_indices: torch.Tensor,
-                                 gating_output: torch.Tensor,
-                                 renormalize: bool) -> None:
-    pass
-
-
-def rocm_aiter_biased_grouped_topk_impl(
-        gating_output: torch.Tensor,
-        correction_bias: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_expert_group: int,
-        topk_group: int,
-        need_renorm: bool,
-        routed_scaling_factor: float = 1.0  # mul to topk_weights
-) -> None:
-
-    from aiter import biased_grouped_topk
-
-    biased_grouped_topk(gating_output, correction_bias, topk_weights, topk_ids,
-                        num_expert_group, topk_group, need_renorm,
-                        routed_scaling_factor)
-
-
-def rocm_aiter_biased_grouped_topk_fake(
-        gating_output: torch.Tensor,
-        correction_bias: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_expert_group: int,
-        topk_group: int,
-        need_renorm: bool,
-        routed_scaling_factor: float = 1.0  # mul to topk_weights
-) -> None:
-    pass
-
-
-def rocm_aiter_grouped_topk_impl(
-        gating_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_expert_group: int,
-        topk_group: int,
-        need_renorm: bool,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0  # mul to topk_weights
-) -> None:
-
-    from aiter import grouped_topk
-
-    grouped_topk(gating_output, topk_weights, topk_ids, num_expert_group,
-                 topk_group, need_renorm, scoring_func, routed_scaling_factor)
-
-
-def rocm_aiter_grouped_topk_fake(
-        gating_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_expert_group: int,
-        topk_group: int,
-        need_renorm: bool,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0  # mul to topk_weights
-) -> None:
-    pass
-
-
-def rocm_aiter_fused_moe_impl(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    expert_mask: Optional[torch.Tensor] = None,
-    activation_method: int = ActivationMethod.SILU.value,
-    quant_method: int = QuantMethod.NO.value,
-    doweight_stage1: bool = False,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    from aiter import ActivationType, QuantType
-    from aiter.fused_moe import fused_moe
-
-    activation = ActivationType(activation_method)
-    quant_type = QuantType(quant_method)
-
-    return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask,
-                     activation, quant_type, doweight_stage1, w1_scale,
-                     w2_scale, a1_scale, a2_scale)
-
-
-def rocm_aiter_fused_moe_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    expert_mask: Optional[torch.Tensor] = None,
-    activation_method: int = ActivationMethod.SILU.value,
-    quant_method: int = QuantMethod.NO.value,
-    doweight_stage1: bool = False,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-if current_platform.is_rocm():
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_asm_moe_tkw1",
-        op_func=rocm_aiter_asm_moe_tkw1_impl,
-        fake_impl=rocm_aiter_asm_moe_tkw1_fake,
+aiter_topK_meta_data = None
+
+
+@lru_cache(maxsize=1)
+def init_aiter_topK_meta_data(
+    n_routed_experts: int,
+    n_shared_experts: int,
+    top_k: int,
+    tp_rank: int,
+    tp_size: int,
+    shared_experts_score: float = 1.0,
+    max_num_tokens: int = 32768,
+    is_EP: bool = False,
+):
+    global aiter_topK_meta_data
+    fake_expertid = n_routed_experts + n_shared_experts
+
+    # all layers reuse same buffer
+    # This extra element when EP is enabled is used as a sentinel
+    # to mask out shared expert processing for tokens not owned by
+    # the current EP rank. This is necessary to avoid double-processing
+    # of shared experts.
+    total_topk_ids = torch.empty(
+        (max_num_tokens, top_k + n_shared_experts + is_EP),
+        dtype=torch.int32,
+        device="cuda",
     )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_fused_moe",
-        op_func=rocm_aiter_fused_moe_impl,
-        fake_impl=rocm_aiter_fused_moe_fake,
+    ns_topk_ids, s_topk_ids = total_topk_ids.split(
+        [top_k, n_shared_experts + is_EP], dim=1
     )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_topk_softmax",
-        op_func=rocm_aiter_topk_softmax_impl,
-        mutates_args=["topk_weights", "topk_indices", "token_expert_indices"],
-        fake_impl=rocm_aiter_topk_softmax_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_biased_grouped_topk",
-        op_func=rocm_aiter_biased_grouped_topk_impl,
-        mutates_args=["topk_weights", "topk_ids"],
-        fake_impl=rocm_aiter_biased_grouped_topk_fake,
+    shared_expert_ids = [n_routed_experts + i for i in range(n_shared_experts + is_EP)]
+    if is_EP:
+        s_topk_ids_list = [
+            [fake_expertid] * (n_shared_experts + is_EP)
+        ] * max_num_tokens
+        for i in range(tp_rank, max_num_tokens, tp_size):
+            s_topk_ids_list[i] = shared_expert_ids
+    else:
+        s_topk_ids_list = [
+            list(range(n_routed_experts, fake_expertid))
+        ] * max_num_tokens
+    s_topk_ids[:] = torch.tensor(s_topk_ids_list, dtype=torch.int32, device="cuda")
+
+    total_topk_weights = torch.empty(
+        (max_num_tokens, top_k + n_shared_experts + is_EP),
+        dtype=torch.float32,
+        device="cuda",
     )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_grouped_topk",
-        op_func=rocm_aiter_grouped_topk_impl,
-        mutates_args=["topk_weights", "topk_ids"],
-        fake_impl=rocm_aiter_grouped_topk_fake,
+    ns_topk_weights, s_topk_weights = total_topk_weights.split(
+        [top_k, n_shared_experts + is_EP], dim=1
     )
+    s_topk_weights.fill_(shared_experts_score)
+    assert aiter_topK_meta_data is None, "AITER topK meta data is already initialized"
+    aiter_topK_meta_data = (total_topk_weights, total_topk_ids)
 
 
 def rocm_aiter_grouped_topk(
@@ -263,17 +100,40 @@ def rocm_aiter_grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None
+    e_score_correction_bias: torch.Tensor | None = None,
+    num_fused_shared_experts: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     token = hidden_states.shape[0]
     device = hidden_states.device
-    topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
-    topk_weights = torch.empty((token, topk),
-                               dtype=torch.float32,
-                               device=device)
+    if (
+        rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        and num_fused_shared_experts > 0
+    ):
+        assert aiter_topK_meta_data is not None, (
+            "AITER topK meta data is not initialized. "
+            "Please ensure that init_aiter_topK_meta_data "
+            "is called before this function."
+        )
+        total_topk_weights, total_topk_ids = aiter_topK_meta_data
+        assert total_topk_weights.shape[0] >= token, (
+            f"AITER topK meta data support {total_topk_weights.shape[0]} "
+            f"tokens which is determined by max_num_batched_tokens, "
+            f"but got {token} tokens now."
+        )
+        total_topk_weights = total_topk_weights[:token]
+        total_topk_ids = total_topk_ids[:token]
+        topk_weights, _ = total_topk_weights.split(
+            [topk, total_topk_weights.shape[1] - topk], dim=1
+        )
+        topk_ids, _ = total_topk_ids.split(
+            [topk, total_topk_ids.shape[1] - topk], dim=1
+        )
+    else:
+        topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+        topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
 
     if e_score_correction_bias is not None:
-        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
+        rocm_aiter_ops.biased_grouped_topk(
             gating_output,
             e_score_correction_bias.to(gating_output.dtype),
             topk_weights,
@@ -281,10 +141,11 @@ def rocm_aiter_grouped_topk(
             num_expert_group,
             topk_group,
             renormalize,
+            routed_scaling_factor=routed_scaling_factor,
         )
     else:
-        assert (scoring_func == "softmax" or scoring_func == "sigmoid")
-        torch.ops.vllm.rocm_aiter_grouped_topk(
+        assert scoring_func == "softmax" or scoring_func == "sigmoid"
+        rocm_aiter_ops.grouped_topk(
             gating_output,
             topk_weights,
             topk_ids,
@@ -292,10 +153,14 @@ def rocm_aiter_grouped_topk(
             topk_group,
             renormalize,
             scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
-    if routed_scaling_factor != 1.0:
-        topk_weights = topk_weights * routed_scaling_factor
+    if (
+        rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        and num_fused_shared_experts > 0
+    ):
+        return total_topk_weights, total_topk_ids
     return topk_weights, topk_ids
 
 
@@ -307,36 +172,38 @@ def rocm_aiter_fused_experts(
     topk_ids: torch.Tensor,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
-    expert_map: Optional[torch.Tensor] = None,
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    expert_map: torch.Tensor | None = None,
+    quant_config: FusedMoEQuantConfig | None = None,
 ) -> torch.Tensor:
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
 
-    activation_method = (ActivationMethod.SILU
-                         if activation == "silu" else ActivationMethod.GELU)
+    activation_method = (
+        ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU
+    )
     # All AITER Fused MoE kernels are expecting the following datatypes
     topk_weights = topk_weights.to(torch.float32)
     topk_ids = topk_ids.to(torch.int32)
 
-    if expert_map is not None:
-        expert_mask = (expert_map > -1).to(torch.int32)
-    else:
-        expert_mask = None
+    expert_mask = expert_map if expert_map is not None else None
 
     # w8a8 per-channel quantization
-    if (quant_config.per_act_token_quant and apply_router_weight_on_input
-            and quant_config.use_fp8_w8a8):
+    if (
+        quant_config.per_act_token_quant
+        and apply_router_weight_on_input
+        and quant_config.use_fp8_w8a8
+    ):
         # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
         # This applies topk_weights on the GEMM output of the first FC layer
         #  rather than the second FC.
-        assert (topk_weights.dim() == 2
-                ), "`topk_weights` should be in shape (num_tokens, topk)"
+        assert topk_weights.dim() == 2, (
+            "`topk_weights` should be in shape (num_tokens, topk)"
+        )
         assert topk_weights.shape[-1] == 1, (
-            "Only support topk=1 when"
-            " `apply_router_weight_on_input` is True")
+            "Only support topk=1 when `apply_router_weight_on_input` is True"
+        )
 
-        return torch.ops.vllm.rocm_aiter_asm_moe_tkw1(
+        return rocm_aiter_ops.asm_moe_tkw1(
             hidden_states,
             w1,
             w2,
@@ -349,32 +216,39 @@ def rocm_aiter_fused_experts(
             a16=False,
             per_tensor_quant_scale=None,
             expert_mask=expert_mask,
-            activation_method=activation_method)
+            activation_method=activation_method,
+        )
 
     else:
         quant_method = QuantMethod.NO.value
-
+        # quark moe for mxfp4 w_dtype
+        if quant_config.use_mxfp4_w4a16:
+            quant_method = QuantMethod.BLOCK_1X32.value
         # w8a8 block-scaled
         if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
             assert not apply_router_weight_on_input, (
                 "apply_router_weight_on_input is\
-                not supported for block scaled moe")
+                not supported for block scaled moe"
+            )
             assert quant_config.w1_scale is not None
             assert quant_config.w2_scale is not None
             quant_method = QuantMethod.BLOCK_128x128.value
+        elif quant_config.use_fp8_w8a8 and quant_config.per_out_ch_quant:
+            quant_method = QuantMethod.PER_TOKEN.value
         elif quant_config.use_fp8_w8a8:
             # Currently only per tensor quantization method is enabled.
             quant_method = QuantMethod.PER_TENSOR.value
 
         if apply_router_weight_on_input:
-            assert (topk_weights.dim() == 2
-                    ), "`topk_weights` should be in shape (num_tokens, topk)"
+            assert topk_weights.dim() == 2, (
+                "`topk_weights` should be in shape (num_tokens, topk)"
+            )
             _, topk = topk_weights.shape
-            assert (
-                topk == 1
-            ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+            assert topk == 1, (
+                "Only support topk=1 when `apply_router_weight_on_input` is True"
+            )
 
-        return torch.ops.vllm.rocm_aiter_fused_moe(
+        return rocm_aiter_ops.fused_moe(
             hidden_states,
             w1,
             w2,
@@ -387,38 +261,5 @@ def rocm_aiter_fused_experts(
             w2_scale=quant_config.w2_scale,
             a1_scale=quant_config.a1_scale,
             a2_scale=quant_config.a2_scale,
-            doweight_stage1=apply_router_weight_on_input)
-
-
-def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
-                            topk_indices: torch.Tensor,
-                            token_expert_indices: torch.Tensor,
-                            gating_output: torch.Tensor,
-                            renormalize: bool) -> tuple[torch.Tensor, ...]:
-    torch.ops.vllm.rocm_aiter_topk_softmax(topk_weights, topk_indices,
-                                           token_expert_indices, gating_output,
-                                           renormalize)
-    return topk_weights, topk_indices
-
-
-def shuffle_weights(
-    *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
-) -> tuple[torch.Tensor, ...]:
-    """
-    Applies shuffle_weight function from AITER to each
-    input tensor and returns them.
-
-    Rearranges (shuffles) the input tensor/s
-    into a specified block layout for optimized computation.
-
-    Args:
-        *tensors: Variable number of torch.Tensor objects.
-        layout: A pair of integers specifying the block sizes used to divide 
-            the tensors during shuffling. Default is (16, 16).
-
-    Returns:
-    A Tuple of shuffled tensors.
-    """
-    from aiter.ops.shuffle import shuffle_weight
-
-    return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
+            doweight_stage1=apply_router_weight_on_input,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py
index 8758a570b3c63d9ec572f4397ff737b5a9506351..a01cdc4908b93e46c6f2ecd34649fe67f24ba196 100644
--- a/vllm/model_executor/layers/fused_moe/routing_simulator.py
+++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py
@@ -10,10 +10,14 @@ like uniform random routing.
 """
 
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 
 class RoutingStrategy(ABC):
     """Base class for token-to-expert routing strategies."""
@@ -24,7 +28,7 @@ class RoutingStrategy(ABC):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        indices_type: Optional[torch.dtype] = None,
+        indices_type: torch.dtype | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Route tokens to experts.
@@ -50,9 +54,7 @@ class DistributionBasedRouting(RoutingStrategy):
     distributions for testing different routing patterns.
     """
 
-    def __init__(self,
-                 distribution: str = "uniform",
-                 **distribution_params: Any):
+    def __init__(self, distribution: str = "uniform", **distribution_params: Any):
         """
         Initialize distribution-based routing.
 
@@ -76,8 +78,10 @@ class DistributionBasedRouting(RoutingStrategy):
         valid_distributions = ["uniform", "normal"]
 
         if self.distribution not in valid_distributions:
-            raise ValueError(f"Unsupported distribution: {self.distribution}. "
-                             f"Supported distributions: {valid_distributions}")
+            raise ValueError(
+                f"Unsupported distribution: {self.distribution}. "
+                f"Supported distributions: {valid_distributions}"
+            )
 
         # Set default parameters if not provided
         if self.distribution == "normal":
@@ -89,7 +93,7 @@ class DistributionBasedRouting(RoutingStrategy):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        indices_type: Optional[torch.dtype] = None,
+        indices_type: torch.dtype | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Randomly select experts for each token using the specified distribution.
@@ -112,12 +116,12 @@ class DistributionBasedRouting(RoutingStrategy):
             indices_type = torch.long
 
         # Generate expert IDs based on the specified distribution
-        topk_ids = self._sample_expert_ids(num_tokens, num_experts, top_k,
-                                           hidden_states.device, indices_type)
+        topk_ids = self._sample_expert_ids(
+            num_tokens, num_experts, top_k, hidden_states.device, indices_type
+        )
 
         # Generate weights based on the distribution
-        topk_weights = self._generate_weights(num_tokens, top_k,
-                                              hidden_states.device)
+        topk_weights = self._generate_weights(num_tokens, top_k, hidden_states.device)
 
         return topk_weights, topk_ids
 
@@ -145,7 +149,8 @@ class DistributionBasedRouting(RoutingStrategy):
             # For normal distribution, sample continuous values and map to
             # expert IDs
             continuous_samples = self._sample_continuous_distribution(
-                num_tokens, top_k, device)
+                num_tokens, top_k, device
+            )
 
             # Map continuous samples to expert indices
             # Normalize to [0, 1] range and scale to [0, num_experts)
@@ -158,8 +163,9 @@ class DistributionBasedRouting(RoutingStrategy):
         else:
             raise ValueError(f"Unsupported distribution: {self.distribution}")
 
-    def _sample_continuous_distribution(self, num_tokens: int, top_k: int,
-                                        device: torch.device) -> torch.Tensor:
+    def _sample_continuous_distribution(
+        self, num_tokens: int, top_k: int, device: torch.device
+    ) -> torch.Tensor:
         """Sample from continuous distributions."""
         shape = (num_tokens, top_k)
 
@@ -170,7 +176,8 @@ class DistributionBasedRouting(RoutingStrategy):
 
         else:
             raise ValueError(
-                f"Unsupported continuous distribution: {self.distribution}")
+                f"Unsupported continuous distribution: {self.distribution}"
+            )
 
     def _normalize_samples(self, samples: torch.Tensor) -> torch.Tensor:
         """Normalize samples to [0, 1] range."""
@@ -179,11 +186,13 @@ class DistributionBasedRouting(RoutingStrategy):
             return torch.sigmoid(samples)
 
         else:
-            raise ValueError(f"Unsupported distribution for normalization: "
-                             f"{self.distribution}")
+            raise ValueError(
+                f"Unsupported distribution for normalization: {self.distribution}"
+            )
 
-    def _generate_weights(self, num_tokens: int, top_k: int,
-                          device: torch.device) -> torch.Tensor:
+    def _generate_weights(
+        self, num_tokens: int, top_k: int, device: torch.device
+    ) -> torch.Tensor:
         """Generate weights based on the distribution."""
         if self.distribution == "uniform":
             # All-ones weights for uniform distribution
@@ -197,7 +206,8 @@ class DistributionBasedRouting(RoutingStrategy):
             # For normal distribution, generate weights from the same
             # distribution
             continuous_weights = self._sample_continuous_distribution(
-                num_tokens, top_k, device)
+                num_tokens, top_k, device
+            )
             # Normalize to positive values and sum to 1
             weights = torch.abs(continuous_weights)
             weights = weights / weights.sum(dim=-1, keepdim=True)
@@ -205,14 +215,14 @@ class DistributionBasedRouting(RoutingStrategy):
 
         else:
             raise ValueError(
-                f"Unsupported distribution for weight generation: "
-                f"{self.distribution}")
+                f"Unsupported distribution for weight generation: {self.distribution}"
+            )
 
     def get_distribution_info(self) -> dict:
         """Get information about the current distribution configuration."""
         return {
             "distribution": self.distribution,
-            "parameters": self.distribution_params.copy()
+            "parameters": self.distribution_params.copy(),
         }
 
 
@@ -228,10 +238,12 @@ class RoutingSimulator:
     # Class-level registry of routing strategies
     _routing_strategies: dict[str, RoutingStrategy] = {
         # Basic routing strategies
-        "uniform_random":
-        DistributionBasedRouting(distribution="uniform", mean=0.0, std=1.0),
-        "normal_routing":
-        DistributionBasedRouting(distribution="normal", mean=0.0, std=1.0),
+        "uniform_random": DistributionBasedRouting(
+            distribution="uniform", mean=0.0, std=1.0
+        ),
+        "normal_routing": DistributionBasedRouting(
+            distribution="normal", mean=0.0, std=1.0
+        ),
     }
 
     @classmethod
@@ -261,7 +273,7 @@ class RoutingSimulator:
         router_logits: torch.Tensor,
         strategy_name: str,
         top_k: int,
-        indices_type: Optional[torch.dtype] = None,
+        indices_type: torch.dtype | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Simulate token-to-expert routing using the specified strategy.
@@ -280,7 +292,14 @@ class RoutingSimulator:
             raise ValueError(
                 f"Unknown routing strategy: {strategy_name}. "
                 f"Available strategies: "
-                f"{list(RoutingSimulator._routing_strategies.keys())}")
+                f"{list(RoutingSimulator._routing_strategies.keys())}"
+            )
+        logger.warning_once(
+            "Simulating MoE routing using a %s strategy. "
+            "This should only be used for performance testing. "
+            "Model outputs will not be valid.",
+            strategy_name,
+        )
 
         strategy = RoutingSimulator._routing_strategies[strategy_name]
         return strategy.route_tokens(
diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ec8b33ed930977fa08498797c1d6140e1fa91bf
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+
+
+# TODO(bnell): Add shared + fused combo function? e.g. +
+class SharedFusedMoE(FusedMoE):
+    """
+    A FusedMoE operation that also computes the results of shared experts.
+    If an all2all communicator is being used the shared expert computation
+    can be interleaved with the fused all2all dispatch communication step.
+    """
+
+    def __init__(
+        self,
+        shared_experts: torch.nn.Module | None,
+        gate: torch.nn.Module | None = None,
+        use_overlapped: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self._shared_experts = shared_experts
+
+        # Disable shared expert overlap if:
+        #   - we are using eplb, because of correctness issues
+        #   - we are using flashinfer with DP, since there nothint to gain
+        #   - we are using marlin kjernels
+        self.use_overlapped = (
+            use_overlapped
+            and not (
+                # TODO(wentao): find the root cause and remove this condition
+                self.enable_eplb
+                or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
+                or self.use_marlin_kernels
+            )
+            and self._shared_experts is not None
+        )
+
+        self._gate = gate
+
+    @property
+    def shared_experts(self) -> torch.nn.Module | None:
+        return self._shared_experts if self.use_overlapped else None
+
+    @property
+    def gate(self) -> torch.nn.Module | None:
+        return self._gate if self.use_overlapped else None
+
+    @property
+    def is_internal_router(self) -> bool:
+        return self.gate is not None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if not self.use_overlapped:
+            if self._shared_experts is not None:
+                shared_out = self._shared_experts(hidden_states)
+
+                # Reduce shared expert outputs if necessary, since the MLP
+                # should have been created with reduce_results=False.
+                if (
+                    self.reduce_results
+                    and get_tensor_model_parallel_world_size() > 1
+                    and self.must_reduce_shared_expert_outputs()
+                ):
+                    shared_out = tensor_model_parallel_all_reduce(shared_out)
+            else:
+                shared_out = None
+
+            fused_out = super().forward(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+        else:
+            shared_out, fused_out = super().forward(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+            # ensure early TP reduction of shared expert outputs when required
+            if (
+                shared_out is not None
+                and self.reduce_results
+                and get_tensor_model_parallel_world_size() > 1
+                and self.must_reduce_shared_expert_outputs()
+            ):
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
+        return shared_out, fused_out
diff --git a/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py b/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
index fb398eec119fa2366ecf5b83259595a2cbc6d9b6..99d4038ec3813229015fb5fcd4883901fd575cfb 100644
--- a/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
+++ b/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -19,7 +18,7 @@ class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
     PplxPrepareAndFinalize and BatchedPrepareAndFinalize. PplxPrepareAndFinalize
     does the weight-application + reduction as part of the pplx combine kernel.
     But the BatchedPrepareAndFinalize needs an implementation. To facilitate
-    this case, the BatchedTritonExperts could use TopKWeightAndReduceDelegate 
+    this case, the BatchedTritonExperts could use TopKWeightAndReduceDelegate
     so the PrepareAndFinalize implementations could choose how to
     weight + reduce.
     """
@@ -27,12 +26,18 @@ class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
     def __eq__(self, other):
         return isinstance(other, TopKWeightAndReduceDelegate)
 
-    def apply(self, output: Optional[torch.Tensor],
-              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
-              topk_ids: torch.Tensor,
-              apply_router_weight_on_input: bool) -> torch.Tensor:
-        raise RuntimeError("The caller is expected to choose an appropriate "
-                           "TopKWeightAndReduce implementation.")
+    def apply(
+        self,
+        output: torch.Tensor | None,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> torch.Tensor:
+        raise RuntimeError(
+            "The caller is expected to choose an appropriate "
+            "TopKWeightAndReduce implementation."
+        )
 
 
 class TopKWeightAndReduceNoOP(mk.TopKWeightAndReduce):
@@ -44,10 +49,14 @@ class TopKWeightAndReduceNoOP(mk.TopKWeightAndReduce):
     def __eq__(self, other):
         return isinstance(other, TopKWeightAndReduceNoOP)
 
-    def apply(self, output: Optional[torch.Tensor],
-              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
-              topk_ids: torch.Tensor,
-              apply_router_weight_on_input: bool) -> torch.Tensor:
+    def apply(
+        self,
+        output: torch.Tensor | None,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> torch.Tensor:
         # Weight application and reduction operations are already done.
         if output is None:
             return fused_expert_output
@@ -57,7 +66,8 @@ class TopKWeightAndReduceNoOP(mk.TopKWeightAndReduce):
         assert output.size() == fused_expert_output.size(), (
             "output shape is expected to match the fused_expert_output shape. "
             f"But got output={output.size()}, "
-            f"used_expert_output={fused_expert_output.size()}")
+            f"used_expert_output={fused_expert_output.size()}"
+        )
         output.copy_(fused_expert_output, non_blocking=True)
         return output
 
@@ -71,11 +81,14 @@ class TopKWeightAndReduceContiguous(mk.TopKWeightAndReduce):
     def __eq__(self, other):
         return isinstance(other, TopKWeightAndReduceContiguous)
 
-    def apply(self, output: Optional[torch.Tensor],
-              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
-              topk_ids: torch.Tensor,
-              apply_router_weight_on_input: bool) -> torch.Tensor:
-
+    def apply(
+        self,
+        output: torch.Tensor | None,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> torch.Tensor:
         m, num_topk = topk_ids.size()
         k = fused_expert_output.size(-1)
         if fused_expert_output.ndim == 2:
@@ -83,17 +96,21 @@ class TopKWeightAndReduceContiguous(mk.TopKWeightAndReduce):
 
         assert fused_expert_output.size() == (m, num_topk, k), (
             f"Expected fused_expert_output size {(m, num_topk, k)}. But got "
-            f"{fused_expert_output.size()}")
+            f"{fused_expert_output.size()}"
+        )
 
         if not apply_router_weight_on_input:
             fused_expert_output.mul_(topk_weights.view(m, -1, 1))
 
         if output is None:
-            output = torch.empty((m, k),
-                                 device=fused_expert_output.device,
-                                 dtype=fused_expert_output.dtype)
+            output = torch.empty(
+                (m, k),
+                device=fused_expert_output.device,
+                dtype=fused_expert_output.dtype,
+            )
         assert output.size() == (m, k), (
-            f"Expected output size {(m, k)}. But got {output.size()}")
+            f"Expected output size {(m, k)}. But got {output.size()}"
+        )
 
         ops.moe_sum(fused_expert_output, output)
         return output
@@ -109,27 +126,35 @@ class TopKWeightAndReduceNaiveBatched(mk.TopKWeightAndReduce):
         self.rank = rank
 
     def __eq__(self, other):
-        return (isinstance(other, TopKWeightAndReduceNaiveBatched)
-                and (other.rank == self.rank))
-
-    def apply(self, output: Optional[torch.Tensor],
-              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
-              topk_ids: torch.Tensor,
-              apply_router_weight_on_input: bool) -> torch.Tensor:
+        return isinstance(other, TopKWeightAndReduceNaiveBatched) and (
+            other.rank == self.rank
+        )
+
+    def apply(
+        self,
+        output: torch.Tensor | None,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> torch.Tensor:
         assert fused_expert_output.ndim == 3
         num_tokens = topk_ids.size(0)
         num_local_experts = fused_expert_output.size(0)
         K = fused_expert_output.size(-1)
 
         if output is None:
-            output = torch.zeros((num_tokens, K),
-                                 device=fused_expert_output.device,
-                                 dtype=fused_expert_output.dtype)
+            output = torch.zeros(
+                (num_tokens, K),
+                device=fused_expert_output.device,
+                dtype=fused_expert_output.dtype,
+            )
         else:
             output.fill_(0)
 
         assert output.size() == (num_tokens, K), (
-            f"Expected output size {(num_tokens, K)}, but got {output.size()}")
+            f"Expected output size {(num_tokens, K)}, but got {output.size()}"
+        )
 
         first_expert = num_local_experts * self.rank
         last_expert = first_expert + num_local_experts
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index 3de80ff85747e4792f305870787a348a9056c97d..b8e0837162ef63ecdf5e81ea810cdc5c8a76de23 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -1,21 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-    DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape)
-from vllm.model_executor.layers.fused_moe.deep_gemm_utils import (
-    deep_gemm_block_shape)
+    DeepGemmExperts,
+    _valid_deep_gemm,
+    _valid_deep_gemm_shape,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
-from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.deep_gemm import (
+    get_mk_alignment_for_contiguous_layout,
+    is_deep_gemm_e8m0_used,
+)
 
 
 class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         quant_config: FusedMoEQuantConfig,
@@ -25,33 +27,40 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
         self.triton_expert = TritonExperts(quant_config)
 
-        self.allow_deep_gemm = (allow_deep_gemm
-                                and self.quant_config.use_fp8_w8a8 and
-                                self.block_shape == deep_gemm_block_shape())
+        self.allow_deep_gemm = (
+            allow_deep_gemm
+            and self.quant_config.use_fp8_w8a8
+            and self.block_shape == get_mk_alignment_for_contiguous_layout()
+        )
 
-        self.deep_gemm_expert = DeepGemmExperts(
-            self.quant_config) if self.allow_deep_gemm else None
+        self.deep_gemm_expert = (
+            DeepGemmExperts(self.quant_config) if self.allow_deep_gemm else None
+        )
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        assert (self.deep_gemm_expert is None
-                or self.triton_expert.activation_formats
-                == self.deep_gemm_expert.activation_formats)
+        assert (
+            self.deep_gemm_expert is None
+            or self.triton_expert.activation_formats
+            == self.deep_gemm_expert.activation_formats
+        )
         return self.triton_expert.activation_formats
 
     def supports_chunking(self) -> bool:
         dge = self.deep_gemm_expert
         te = self.triton_expert
-        return ((dge is None or dge.supports_chunking())
-                and (te is None or te.supports_chunking()))
+        return (dge is None or dge.supports_chunking()) and (
+            te is None or te.supports_chunking()
+        )
 
     def supports_expert_map(self) -> bool:
         dge = self.deep_gemm_expert
         te = self.triton_expert
-        return ((dge is None or dge.supports_expert_map())
-                and (te is None or te.supports_expert_map()))
+        return (dge is None or dge.supports_expert_map()) and (
+            te is None or te.supports_expert_map()
+        )
 
     def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
         dge = self.deep_gemm_expert
@@ -64,7 +73,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         if is_dge_war and is_te_war:
             assert dge_war == te_war, (
                 "Both implementations should agree on WeightAndReduce impls. "
-                f"Got dge_war: {dge_war}, and te_war: {te_war}")
+                f"Got dge_war: {dge_war}, and te_war: {te_war}"
+            )
 
         if dge_war is not None:
             return dge_war
@@ -74,30 +84,40 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
-        if self.allow_deep_gemm and (is_deep_gemm_e8m0_used()
-                                     or _valid_deep_gemm_shape(M, N, K)):
+        if self.allow_deep_gemm and (
+            is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)
+        ):
             assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.workspace_shapes(
-                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
-                expert_tokens_meta)
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_meta,
+            )
         else:
-            return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk,
-                                                       global_num_experts,
-                                                       local_num_experts,
-                                                       expert_tokens_meta)
+            return self.triton_expert.workspace_shapes(
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_meta,
+            )
 
     def apply(
         self,
@@ -109,17 +129,17 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
-        use_deep_gemm = (self.allow_deep_gemm
-                         and (_valid_deep_gemm(hidden_states, w1, w2)
-                              or is_deep_gemm_e8m0_used()))
+        use_deep_gemm = self.allow_deep_gemm and (
+            is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2)
+        )
 
         experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert
         assert experts is not None
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index 05ed93c942c8eb609cf2f70a606e79e5d4a9a2b6..132d35e65aba85f7c6c47c70fb314b41882eadd2 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -1,19 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP)
-from vllm.utils import next_power_of_2
+    TopKWeightAndReduceNoOP,
+)
 
 
 class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
     def __init__(
         self,
         moe: FusedMoEConfig,
@@ -32,10 +32,12 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     @property
     def activation_formats(
-        self
+        self,
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
 
     def supports_chunking(self) -> bool:
         return True
@@ -48,48 +50,19 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def workspace_shapes(
         self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # The workspaces for this implementation are managed by flashinfer.
-        # TODO(varun) : workspace1 is could be used as the output tensor. This
-        # is error-prone. Allow the `workspace_shapes` to return None workspaces
-        workspace1 = (M, K)
-        workspace2 = (0, 0)
+        workspace1 = (0,)
+        workspace2 = (0,)
         output = (M, K)
-        return (workspace1, workspace2, output, a.dtype)
-
-    def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int,
-                             local_num_experts: int):
-        # Number of tokens in the input tensor.
-        num_tokens = x.shape[0]
-        # Factor to account for the imbalance of the experts.
-        # factor equals to the
-        # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert
-        # 1.0 means perfect expert distribution.
-        # > 1.0 means some experts have more tokens than the perfect
-        # distribution.
-        # < 1.0 does not make sense.
-        imbalance_factor = 1.3
-        # Calculate the number of tokens per expert assuming perfect
-        # distribution.
-        num_tokens_per_expert = (num_tokens * top_k) // local_num_experts
-        # Apply the imbalance factor.
-        num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
-        # And pad the number to the next power of 2.
-        tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
-        # Cap to 8-64 tokens per CTA tile as it's the range supported by the
-        #  kernel.
-        tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-
-        return tile_tokens_dim
+        return (workspace1, workspace2, output)
 
     def apply(
         self,
@@ -101,12 +74,12 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         topk = topk_ids.size(-1)
@@ -117,75 +90,54 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
         x_quant = hidden_states
         x_scale = a1q_scale
         if x_scale is not None:
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
-                *x_quant.shape[:-1], -1)
+            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x_quant.shape[:-1], -1)
 
         packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to(
-            torch.bfloat16).view(torch.int16)
+            torch.bfloat16
+        ).view(torch.int16)
 
         assert self.w1_scale is not None
         assert self.w2_scale is not None
         kwargs = {
-            "topk_ids":
-            packed_tensor,
-            "routing_bias":
-            None,
-            "hidden_states":
-            x_quant,
-            "hidden_states_scale":
-            x_scale,
-            "gemm1_weights":
-            w1,
-            "gemm1_weights_scale":
-            self.w1_scale,
-            "gemm1_bias":
-            self.w1_bias,
-            "gemm1_alpha":
-            self.gemm1_alpha,
-            "gemm1_beta":
-            self.gemm1_beta,
-            "gemm1_clamp_limit":
-            self.gemm1_clamp_limit,
-            "gemm2_weights":
-            w2,
-            "gemm2_weights_scale":
-            self.w2_scale,
-            "gemm2_bias":
-            self.w2_bias,
-            "output1_scale_scalar":
-            None,
-            "output1_scale_gate_scalar":
-            None,
-            "output2_scale_scalar":
-            None,
-            "num_experts":
-            global_num_experts,
-            "top_k":
-            topk,
-            "n_group":
-            None,
-            "topk_group":
-            None,
-            "intermediate_size":
-            intermediate_size,
-            "local_expert_offset":
-            local_expert_offset,
-            "local_num_experts":
-            local_num_experts,
-            "routed_scaling_factor":
-            None,
-            "tile_tokens_dim":
-            self._get_tile_tokens_dim(x_quant, topk, local_num_experts),
-            "routing_method_type":
-            1,
-            "do_finalize":
-            True,
-            "output":
-            output,
-            "tune_max_num_tokens":
-            self.max_capture_size,
+            "topk_ids": packed_tensor,
+            "routing_bias": None,
+            "hidden_states": x_quant,
+            "hidden_states_scale": x_scale,
+            "gemm1_weights": w1,
+            "gemm1_weights_scale": self.w1_scale,
+            "gemm1_bias": self.w1_bias,
+            "gemm1_alpha": self.gemm1_alpha,
+            "gemm1_beta": self.gemm1_beta,
+            "gemm1_clamp_limit": self.gemm1_clamp_limit,
+            "gemm2_weights": w2,
+            "gemm2_weights_scale": self.w2_scale,
+            "gemm2_bias": self.w2_bias,
+            "output1_scale_scalar": None,
+            "output1_scale_gate_scalar": None,
+            "output2_scale_scalar": None,
+            "num_experts": global_num_experts,
+            "top_k": topk,
+            "n_group": None,
+            "topk_group": None,
+            "intermediate_size": intermediate_size,
+            "local_expert_offset": local_expert_offset,
+            "local_num_experts": local_num_experts,
+            "routed_scaling_factor": None,
+            "tile_tokens_dim": None,
+            "routing_method_type": 1,
+            "do_finalize": True,
+            "output": output,
+            "tune_max_num_tokens": max(self.max_capture_size, 1),
         }
 
         from flashinfer import trtllm_fp4_block_scale_routed_moe
-        trtllm_fp4_block_scale_routed_moe(**kwargs)
+
+        from vllm.utils.flashinfer import autotune
+
+        with autotune(False):
+            # Enable autotune when,
+            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
+            # resolved.
+            trtllm_fp4_block_scale_routed_moe(**kwargs)
+
         return output
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce56887f1c26d1ea918d2c58e3dec35f60e58345
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -0,0 +1,578 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+
+import torch
+import torch.nn.functional as F
+
+import vllm.envs as envs
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.logger import init_logger
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.fused_moe.config import (
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+    biased_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
+    FusedMoEMethodBase,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEActivationFormat,
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize,
+)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
+from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+
+if current_platform.is_cuda_alike():
+    from .fused_batched_moe import BatchedTritonExperts
+    from .fused_moe import TritonExperts, fused_experts
+else:
+    fused_experts = None  # type: ignore
+
+if current_platform.is_tpu():
+    from .moe_pallas import fused_moe as fused_moe_pallas
+else:
+    fused_moe_pallas = None  # type: ignore
+
+logger = init_logger(__name__)
+
+
+@CustomOp.register("unquantized_fused_moe")
+class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
+    """MoE method without quantization."""
+
+    def __init__(self, moe: FusedMoEConfig):
+        super().__init__(moe)
+
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        if self.rocm_aiter_moe_enabled:
+            from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
+
+            self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
+        else:
+            self.rocm_aiter_fused_experts = None  # type: ignore
+
+        # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
+        self.flashinfer_cutlass_moe_enabled = (
+            has_flashinfer_cutlass_fused_moe()
+            and envs.VLLM_USE_FLASHINFER_MOE_FP16
+            and self.moe.moe_parallel_config.use_ep
+            and self.moe.moe_parallel_config.dp_size == 1
+            and current_platform.get_device_capability()[0] >= 9
+        )
+        if self.flashinfer_cutlass_moe_enabled:
+            logger.info_once(
+                "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
+            )
+            from functools import partial
+
+            from .flashinfer_cutlass_moe import flashinfer_cutlass_moe
+
+            self.flashinfer_cutlass_moe = partial(
+                flashinfer_cutlass_moe,
+                quant_config=FUSED_MOE_UNQUANTIZED_CONFIG,
+                tp_rank=self.moe.moe_parallel_config.tp_rank,
+                tp_size=self.moe.moe_parallel_config.tp_size,
+                ep_rank=self.moe.moe_parallel_config.ep_rank,
+                ep_size=self.moe.moe_parallel_config.ep_size,
+            )
+        else:
+            if (
+                self.moe.moe_parallel_config.use_ep
+                and self.moe.moe_parallel_config.dp_size == 1
+            ):
+                logger.info_once(
+                    "FlashInfer CUTLASS MoE is available for EP"
+                    " but not enabled, consider setting"
+                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
+                    scope="local",
+                )
+            elif self.moe.moe_parallel_config.dp_size > 1:
+                logger.info_once(
+                    "FlashInfer CUTLASS MoE is currently not available for DP.",
+                    scope="local",
+                )
+            self.flashinfer_cutlass_moe = None  # type: ignore
+
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
+    def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
+        if self.rocm_aiter_moe_enabled:
+            return None
+        else:
+            return super().maybe_make_prepare_finalize()
+
+    def select_gemm_impl(
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> FusedMoEPermuteExpertsUnpermute:
+        assert self.moe_quant_config is not None
+        if (
+            prepare_finalize.activation_format
+            == FusedMoEActivationFormat.BatchedExperts
+        ):
+            logger.debug("BatchedTritonExperts %s", self.moe)
+            return BatchedTritonExperts(
+                max_num_tokens=self.moe.max_num_tokens,
+                num_dispatchers=prepare_finalize.num_dispatchers(),
+                quant_config=self.moe_quant_config,
+            )
+        else:
+            logger.debug("TritonExperts %s", self.moe)
+            return TritonExperts(self.moe_quant_config)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if self.moe.is_act_and_mul:
+            w13_up_dim = 2 * intermediate_size_per_partition
+        else:
+            w13_up_dim = intermediate_size_per_partition
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                w13_up_dim,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        if self.moe.has_bias:
+            w13_bias = torch.nn.Parameter(
+                torch.zeros(num_experts, w13_up_dim, dtype=params_dtype),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_bias", w13_bias)
+            set_weight_attrs(w13_bias, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+        if self.moe.has_bias:
+            w2_bias = torch.nn.Parameter(
+                torch.zeros(num_experts, hidden_size, dtype=params_dtype),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_bias", w2_bias)
+            set_weight_attrs(w2_bias, extra_weight_attrs)
+
+    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        # Pad the weight tensor. This is an optimization on ROCm platform, which
+        # can benefit from tensors located far enough from one another in memory
+        if (
+            envs.VLLM_ROCM_MOE_PADDING
+            and current_platform.is_rocm()
+            and weight.stride(-1) == 1
+            and (weight.stride(-2) * weight.element_size()) % 512 == 0
+        ):
+            num_pad = 256 // weight.element_size()
+            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
+            torch.cuda.empty_cache()
+
+        return weight
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        # Padding the weight for better performance on ROCm
+        layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
+        layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
+
+        if self.rocm_aiter_moe_enabled:
+            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                layer.w13_weight.data, layer.w2_weight.data
+            )
+
+            layer.w13_weight.data = shuffled_w13
+            layer.w2_weight.data = shuffled_w2
+
+        if self.flashinfer_cutlass_moe_enabled:
+            # Swap halves to arrange as [w3; w1] (kernel expectation)
+            w1_w, w3_w = torch.chunk(layer.w13_weight.data, 2, dim=1)
+            w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
+            layer.w13_weight.data = w13_weight_swapped.contiguous()
+
+        if current_platform.is_xpu():
+            import intel_extension_for_pytorch as ipex
+
+            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
+            layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                layer.w13_weight,
+                layer.w2_weight,
+                use_prepack=True,
+                experts_start_id=ep_rank_start,
+            )
+        elif current_platform.is_cpu():
+            from vllm.model_executor.layers.fused_moe import cpu_fused_moe
+
+            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                from vllm.model_executor.layers.utils import check_cpu_sgl_kernel
+
+                dtype_w13 = layer.w13_weight.dtype
+                _, n_w13, k_w13 = layer.w13_weight.size()
+                dtype_w2 = layer.w2_weight.dtype
+                _, n_w2, k_w2 = layer.w2_weight.size()
+                if (
+                    envs.VLLM_CPU_SGL_KERNEL
+                    and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
+                    and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)
+                ):
+                    packed_w13_weight = torch.ops._C.convert_weight_packed(
+                        layer.w13_weight
+                    )
+                    assert packed_w13_weight.size() == layer.w13_weight.size()
+                    layer.w13_weight.copy_(packed_w13_weight)
+                    del packed_w13_weight
+                    packed_w2_weight = torch.ops._C.convert_weight_packed(
+                        layer.w2_weight
+                    )
+                    assert packed_w2_weight.size() == layer.w2_weight.size()
+                    layer.w2_weight.copy_(packed_w2_weight)
+                    layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer)
+                else:
+                    layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer)
+            else:
+                layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        if enable_eplb:
+            assert expert_load_view is not None
+            assert logical_to_physical_map is not None
+            assert logical_replica_count is not None
+
+        return self.forward(
+            x=x,
+            layer=layer,
+            router_logits=router_logits,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
+            e_score_correction_bias=e_score_correction_bias,
+            activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            enable_eplb=enable_eplb,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
+        )
+
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        if self.moe.has_bias:
+            return biased_moe_quant_config(
+                layer.w13_bias,
+                layer.w2_bias,
+            )
+        else:
+            return FUSED_MOE_UNQUANTIZED_CONFIG
+
+    def forward_cuda(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        zero_expert_num = getattr(layer, "zero_expert_num", 0)
+        zero_expert_type = getattr(layer, "zero_expert_type", None)
+
+        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype,
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
+            global_num_experts=global_num_experts,
+            zero_expert_num=zero_expert_num,
+            zero_expert_type=zero_expert_type,
+            num_fused_shared_experts=layer.num_fused_shared_experts,
+        )
+
+        if self.rocm_aiter_moe_enabled:
+            result = self.rocm_aiter_fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                expert_map=expert_map,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+            )
+        elif self.flashinfer_cutlass_moe_enabled:
+            return self.flashinfer_cutlass_moe(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+            )
+        else:
+            result = fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                activation=activation,
+                quant_config=self.moe_quant_config,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+            )
+
+        if zero_expert_num != 0 and zero_expert_type is not None:
+            assert not isinstance(result, tuple), (
+                "Shared + zero experts are mutually exclusive not yet supported"
+            )
+            return result, zero_expert_result
+        else:
+            return result
+
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        if (
+            enable_eplb is not False
+            or expert_load_view is not None
+            or logical_to_physical_map is not None
+            or logical_replica_count is not None
+        ):
+            raise NotImplementedError("Expert load balancing is not supported for CPU.")
+        return layer.cpu_fused_moe(
+            layer,
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            global_num_experts,
+            expert_map,
+            custom_routing_function,
+            scoring_func,
+            routed_scaling_factor,
+            e_score_correction_bias,
+            apply_router_weight_on_input,
+            activation,
+        )
+
+    def forward_xpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        if (
+            enable_eplb is not False
+            or expert_load_view is not None
+            or logical_to_physical_map is not None
+            or logical_replica_count is not None
+        ):
+            raise NotImplementedError("Expert load balancing is not supported for XPU.")
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function=custom_routing_function,
+        )
+
+    def forward_tpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        assert not use_grouped_topk
+        assert num_expert_group is None
+        assert topk_group is None
+        assert custom_routing_function is None
+        assert apply_router_weight_on_input is False
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax scoring function is supported for TPU."
+            )
+        if e_score_correction_bias is not None:
+            raise NotImplementedError(
+                "Expert score correction bias is not supported for TPU."
+            )
+        assert activation == "silu", f"{activation} is not supported for TPU."
+        assert routed_scaling_factor == 1.0, (
+            f"routed_scaling_factor {routed_scaling_factor} is not supported for TPU."
+        )
+        if (
+            enable_eplb is not False
+            or expert_load_view is not None
+            or logical_to_physical_map is not None
+            or logical_replica_count is not None
+        ):
+            raise NotImplementedError("Expert load balancing is not supported for TPU.")
+        return fused_moe_pallas(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk=top_k,
+            gating_output=router_logits,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            renormalize=renormalize,
+        )
+
+    if current_platform.is_tpu():
+        forward_native = forward_tpu
+    elif current_platform.is_cpu():
+        forward_native = forward_cpu
+    elif current_platform.is_xpu():
+        forward_native = forward_xpu
+    else:
+        forward_native = forward_cuda
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index 678942e568d86f3f5920586f81e3aad93a3891d8..1f946d67a8f5d4c8ed8abae65d5ab5dac93b4398 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -1,46 +1,56 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
 from math import prod
-from typing import Optional, Union
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
+    per_token_group_quant_fp8,
+)
 from vllm.model_executor.layers.quantization.utils.int8_utils import (
-    per_token_group_quant_int8, per_token_quant_int8)
+    per_token_group_quant_int8,
+    per_token_quant_int8,
+)
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    quant_dequant_mxfp4)
+    quant_dequant_mxfp4,
+)
+from vllm.model_executor.layers.quantization.utils.mxfp6_utils import (
+    quant_dequant_mxfp6,
+)
 from vllm.model_executor.layers.quantization.utils.mxfp8_utils import (
-    mxfp8_quantize)
-from vllm.platforms import current_platform
+    mxfp8_e4m3_quantize,
+)
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv
-from vllm.utils.flashinfer import fp4_quantize
+from vllm.utils.flashinfer import flashinfer_fp4_quantize
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 
 @triton.jit
-def _count_expert_num_tokens(topk_ids_ptr, expert_num_tokens_ptr, num_experts,
-                             topk_numel, expert_map,
-                             HAS_EXPERT_MAP: tl.constexpr,
-                             BLOCK_SIZE: tl.constexpr):
-
+def _count_expert_num_tokens(
+    topk_ids_ptr,
+    expert_num_tokens_ptr,
+    num_experts,
+    topk_numel,
+    expert_map,
+    HAS_EXPERT_MAP: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
     curr_expert = tl.program_id(0)
 
     offsets = tl.arange(0, BLOCK_SIZE)
     topk_ids_ptrs = topk_ids_ptr + offsets
 
-    acc = tl.zeros((BLOCK_SIZE, ), dtype=tl.int32)
+    acc = tl.zeros((BLOCK_SIZE,), dtype=tl.int32)
     for x in range(tl.cdiv(topk_numel, BLOCK_SIZE)):
         mask = offsets < (topk_numel - x * BLOCK_SIZE)
         expert_ids = tl.load(topk_ids_ptrs, mask=mask, other=-1)
         if HAS_EXPERT_MAP:
             expert_map_ptrs = expert_map + expert_ids
             expert_map_mask = expert_ids >= 0
-            expert_ids = tl.load(expert_map_ptrs,
-                                 mask=expert_map_mask,
-                                 other=-1)
+            expert_ids = tl.load(expert_map_ptrs, mask=expert_map_mask, other=-1)
 
         has_curr_expert = tl.where(expert_ids == curr_expert, 1, 0)
         acc = acc + has_curr_expert
@@ -51,8 +61,8 @@ def _count_expert_num_tokens(topk_ids_ptr, expert_num_tokens_ptr, num_experts,
 
 
 def count_expert_num_tokens(
-        topk_ids: torch.Tensor, num_local_experts: int,
-        expert_map: Optional[torch.Tensor]) -> torch.Tensor:
+    topk_ids: torch.Tensor, num_local_experts: int, expert_map: torch.Tensor | None
+) -> torch.Tensor:
     """
     Count the number to tokens assigned to each expert.
 
@@ -68,17 +78,16 @@ def count_expert_num_tokens(
     A tensor of size num_local_experts, where tensor[i] holds the number
     of tokens assigned to the ith expert.
     """
-    assert topk_ids.dtype.is_signed, (
-        "The kernel uses -1 to represent invalid topk_ids")
-    expert_num_tokens = torch.empty((num_local_experts),
-                                    device=topk_ids.device,
-                                    dtype=torch.int32)
+    assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids"
+    expert_num_tokens = torch.empty(
+        (num_local_experts), device=topk_ids.device, dtype=torch.int32
+    )
 
     grid = num_local_experts
     BLOCK_SIZE = min(topk_ids.numel(), 1024)
     BLOCK_SIZE = triton.next_power_of_2(BLOCK_SIZE)
 
-    _count_expert_num_tokens[(grid, )](
+    _count_expert_num_tokens[(grid,)](
         topk_ids,
         expert_num_tokens,
         num_local_experts,
@@ -96,26 +105,27 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
     Shrink the given tensor and apply the given view to it.  This is
     used to resize the intermediate fused_moe caches.
     """
-    assert prod(v) <= x.numel(
-    ), f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})"  # CUDAGRAPH unfriendly?
-    return x.flatten()[:prod(v)].view(*v)
+    assert prod(v) <= x.numel(), (
+        f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})"
+    )  # CUDAGRAPH unfriendly?
+    return x.flatten()[: prod(v)].view(*v)
 
 
-def _fp4_quantize(
+def _nvfp4_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     is_sf_swizzled_layout: bool,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    return fp4_quantize(A,
-                        A_scale,
-                        is_sf_swizzled_layout=is_sf_swizzled_layout)
+    return flashinfer_fp4_quantize(
+        A, A_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
+    )
 
 
 def _fp8_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform fp8 quantization on the inputs.  If a block_shape
@@ -125,7 +135,8 @@ def _fp8_quantize(
         # TODO(luka): use QuantFP8 custom op
         #  https://github.com/vllm-project/vllm/issues/20711
         A, A_scale = ops.scaled_fp8_quant(
-            A, A_scale, use_per_token_if_dynamic=per_act_token)
+            A, A_scale, use_per_token_if_dynamic=per_act_token
+        )
     else:
         assert not per_act_token
         assert len(block_shape) == 2
@@ -138,9 +149,9 @@ def _fp8_quantize(
 
 def _int8_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform int8 quantization on the inputs.  If a block_shape
@@ -151,8 +162,7 @@ def _int8_quantize(
     # activations apply per-token quantization. Otherwise, assume
     # activation tensor-wise fp8/int8 quantization, dynamic or static
     if block_shape is None:
-        assert per_act_token, \
-            "int8 quantization only supports block or channel-wise"
+        assert per_act_token, "int8 quantization only supports block or channel-wise"
         A, A_scale = per_token_quant_int8(A)
     else:
         assert not per_act_token
@@ -166,51 +176,90 @@ def _int8_quantize(
 
 def _mxfp4_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, None]:
     assert block_shape is None
-    if not current_platform.supports_mx():
-        A = quant_dequant_mxfp4(A)
-    else:
-        raise NotImplementedError()
+    # TODO: native mxfp4 is currently not integrated in vllm,
+    # so simulating even on devices supporting this data type natively.
+    # Once integrated, `current_platform.supports_mx()` should be used to
+    # control quantize+dequantize, or simply quantize here down to mxfp4.
+    A = quant_dequant_mxfp4(A)
 
     return A, None
 
 
-def _mxfp8_quantize(
+def _mxfp8_e4m3_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert A_scale is None
     assert not per_act_token_quant
     assert block_shape is None
-    return mxfp8_quantize(A)
+    return mxfp8_e4m3_quantize(A)
+
+
+def _mxfp6_e3m2_quantize(
+    A: torch.Tensor,
+    A_scale: torch.Tensor | None,
+    per_act_token_quant: bool,
+    block_shape: list[int] | None = None,
+) -> tuple[torch.Tensor, None]:
+    assert block_shape is None
+
+    # TODO: native mxfp6 is currently not integrated in vllm,
+    # so simulating even on devices supporting this data type natively.
+    # Eventually, there should be a check based on
+    # `current_platform.supports_mx()` here.
+    A = quant_dequant_mxfp6(A, quant_dtype="fp6_e3m2")
+
+    return A, None
+
+
+def _mxfp6_e2m3_quantize(
+    A: torch.Tensor,
+    A_scale: torch.Tensor | None,
+    per_act_token_quant: bool,
+    block_shape: list[int] | None = None,
+) -> tuple[torch.Tensor, None]:
+    assert block_shape is None
+
+    # TODO: native mxfp6 is currently not integrated in vllm,
+    # so simulating even on devices supporting this data type natively.
+    # Eventually, there should be a check based on
+    # `current_platform.supports_mx()` here.
+    A = quant_dequant_mxfp6(A, quant_dtype="fp6_e2m3")
+
+    return A, None
 
 
 def moe_kernel_quantize_input(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
-    quant_dtype: Union[None, torch.dtype, str],
+    A_scale: torch.Tensor | None,
+    quant_dtype: None | torch.dtype | str,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     is_fp4_scale_swizzled: bool = True,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor | None]:
     if quant_dtype == torch.float8_e4m3fn:
         return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape)
     elif quant_dtype == torch.int8:
         return _int8_quantize(A, A_scale, per_act_token_quant, block_shape)
     elif quant_dtype == "nvfp4":
-        return _fp4_quantize(A,
-                             A_scale,
-                             is_sf_swizzled_layout=is_fp4_scale_swizzled)
+        return _nvfp4_quantize(A, A_scale, is_sf_swizzled_layout=is_fp4_scale_swizzled)
     elif quant_dtype == "mxfp4":
         return _mxfp4_quantize(A, A_scale, per_act_token_quant, block_shape)
     elif quant_dtype == "mxfp8":
-        return _mxfp8_quantize(A, A_scale, per_act_token_quant, block_shape)
+        # TODO: `quant_dtype == "mxfp8"` is ambiguous,
+        # should be fp8_e4m3. OCP MX also defines `fp8_e5m2`.
+        return _mxfp8_e4m3_quantize(A, A_scale, per_act_token_quant, block_shape)
+    elif quant_dtype == "mxfp6_e3m2":
+        return _mxfp6_e3m2_quantize(A, A_scale, per_act_token_quant, block_shape)
+    elif quant_dtype == "mxfp6_e2m3":
+        return _mxfp6_e2m3_quantize(A, A_scale, per_act_token_quant, block_shape)
     else:
         return A, A_scale
 
@@ -225,8 +274,7 @@ def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
         return m[idx, ...]
 
 
-def normalize_scales_shape(
-        scales: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+def normalize_scales_shape(scales: torch.Tensor | None) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
             scales = scales.view(1, 1)
@@ -236,14 +284,15 @@ def normalize_scales_shape(
 
 
 def normalize_batched_scales_shape(
-    scales: Optional[torch.Tensor],
+    scales: torch.Tensor | None,
     num_experts: int,
-) -> Optional[torch.Tensor]:
+) -> torch.Tensor | None:
     if scales is not None and scales.ndim < 3:
         if scales.numel() == 1:
             scales = scales.view(1)
-            scales = torch.repeat_interleave(scales, num_experts,
-                                             dim=0).view(num_experts, 1, 1)
+            scales = torch.repeat_interleave(scales, num_experts, dim=0).view(
+                num_experts, 1, 1
+            )
         else:
             scales = scales.view(num_experts, -1, scales.size(-1))
 
@@ -252,9 +301,9 @@ def normalize_batched_scales_shape(
 
 def _validate_scale_shape(
     a: torch.Tensor,
-    a_scale: Optional[torch.Tensor],
+    a_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
 ) -> None:
     if a_scale is None:
         return
@@ -263,7 +312,8 @@ def _validate_scale_shape(
         assert a_scale.numel() == 1, f"{a_scale.shape}"
     elif per_act_token_quant:
         assert a_scale.shape[0] == a.shape[0] and a_scale.shape[1] == 1, (
-            f"{a_scale.shape[0]} == {a.shape[0]} and {a_scale.shape[1]} == 1")
+            f"{a_scale.shape[0]} == {a.shape[0]} and {a_scale.shape[1]} == 1"
+        )
     else:
         assert block_shape is not None
         expected = (a.shape[0], cdiv(a.shape[1], block_shape[1]))
@@ -272,3 +322,11 @@ def _validate_scale_shape(
 
 def activation_without_mul(activation: str) -> str:
     return activation + "_no_mul"
+
+
+# Torch custom ops can't deal with outputs aliasing inputs so we need to
+# disable inplace for torch >= 2.9.
+# See https://github.com/vllm-project/vllm/issues/26378
+@functools.cache
+def disable_inplace() -> bool:
+    return is_torch_equal_or_newer("2.9")
diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e7500bac71881feef246375174358cb4a64c5ab
--- /dev/null
+++ b/vllm/model_executor/layers/kda.py
@@ -0,0 +1,448 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+from einops import rearrange
+from torch import nn
+
+from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
+
+from .fla.ops.kda import (
+    FusedRMSNormGated,
+    chunk_kda,
+    fused_kda_gate,
+    fused_recurrent_kda,
+)
+from .linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from .mamba.abstract import MambaBase
+from .mamba.mamba_utils import MambaStateDtypeCalculator, MambaStateShapeCalculator
+from .mamba.ops.causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+from .quantization.base_config import QuantizationConfig
+
+logger = init_logger(__name__)
+
+
+def kda_attention(
+    q_proj_states: torch.Tensor,
+    k_proj_states: torch.Tensor,
+    v_proj_states: torch.Tensor,
+    g1: torch.Tensor,
+    beta: torch.Tensor,
+    core_attn_out: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    self._forward(
+        q_proj_states=q_proj_states,
+        k_proj_states=k_proj_states,
+        v_proj_states=v_proj_states,
+        g1=g1,
+        beta=beta,
+        core_attn_out=core_attn_out,
+    )
+
+
+def kda_attention_fake(
+    q_proj_states: torch.Tensor,
+    k_proj_states: torch.Tensor,
+    v_proj_states: torch.Tensor,
+    g1: torch.Tensor,
+    beta: torch.Tensor,
+    core_attn_out: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="kda_attention",
+    op_func=kda_attention,
+    mutates_args=["core_attn_out"],
+    fake_impl=kda_attention_fake,
+)
+
+
+class KimiDeltaAttention(nn.Module, MambaBase):
+    @property
+    def mamba_type(self) -> str:
+        return "linear_attention"
+
+    def get_attn_backend(self) -> type["AttentionBackend"]:
+        from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend
+
+        return GDNAttentionBackend
+
+    def get_state_dtype(
+        self,
+    ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]:
+        if self.model_config is None or self.cache_config is None:
+            raise ValueError("model_config and cache_config must be set")
+        return MambaStateDtypeCalculator.kda_state_dtype(
+            self.model_config.dtype, self.cache_config.mamba_cache_dtype
+        )
+
+    def get_state_shape(
+        self,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        return MambaStateShapeCalculator.kda_state_shape(
+            self.tp_size, self.num_heads, self.head_dim, conv_kernel_size=self.conv_size
+        )
+
+    def __init__(
+        self,
+        layer_idx: int,
+        hidden_size: int,
+        quant_config: QuantizationConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        model_config: ModelConfig | None = None,
+        rms_norm_eps: float = 1e-5,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.hidden_size = hidden_size
+        self.model_config = model_config
+        self.cache_config = cache_config
+        if model_config is None:
+            raise ValueError("model_config must be provided")
+        kda_config = model_config.linear_attn_config
+        self.head_dim = kda_config["head_dim"]
+        self.num_heads = kda_config["num_heads"]
+        self.layer_idx = layer_idx
+        self.prefix = prefix
+        assert self.num_heads % self.tp_size == 0
+        self.local_num_heads = divide(self.num_heads, self.tp_size)
+
+        projection_size = self.head_dim * self.num_heads
+        self.conv_size = kda_config["short_conv_kernel_size"]
+
+        self.q_proj = ColumnParallelLinear(
+            self.hidden_size,
+            projection_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.q_proj",
+        )
+        self.k_proj = ColumnParallelLinear(
+            self.hidden_size,
+            projection_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.k_proj",
+        )
+        self.v_proj = ColumnParallelLinear(
+            self.hidden_size,
+            projection_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.v_proj",
+        )
+
+        self.f_a_proj = ReplicatedLinear(
+            self.hidden_size,
+            self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.f_a_proj",
+        )
+
+        self.f_b_proj = ColumnParallelLinear(
+            self.head_dim,
+            projection_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.f_b_proj",
+        )
+        self.dt_bias = nn.Parameter(
+            torch.empty(divide(projection_size, self.tp_size), dtype=torch.float32)
+        )
+
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+
+        self.b_proj = ColumnParallelLinear(
+            self.hidden_size,
+            self.num_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.b_proj",
+        )
+
+        self.q_conv1d = ColumnParallelLinear(
+            input_size=self.conv_size,
+            output_size=projection_size,
+            bias=False,
+            params_dtype=torch.float32,
+            prefix=f"{prefix}.q_conv1d",
+        )
+        self.k_conv1d = ColumnParallelLinear(
+            input_size=self.conv_size,
+            output_size=projection_size,
+            bias=False,
+            params_dtype=torch.float32,
+            prefix=f"{prefix}.k_conv1d",
+        )
+        self.v_conv1d = ColumnParallelLinear(
+            input_size=self.conv_size,
+            output_size=projection_size,
+            bias=False,
+            params_dtype=torch.float32,
+            prefix=f"{prefix}.v_conv1d",
+        )
+        # unsqueeze to fit conv1d weights shape into the linear weights shape.
+        # Can't do this in `weight_loader` since it already exists in
+        # `ColumnParallelLinear` and `set_weight_attrs`
+        # doesn't allow to override it
+        self.q_conv1d.weight.data = self.q_conv1d.weight.data.unsqueeze(1)
+        self.k_conv1d.weight.data = self.k_conv1d.weight.data.unsqueeze(1)
+        self.v_conv1d.weight.data = self.v_conv1d.weight.data.unsqueeze(1)
+
+        self.A_log = nn.Parameter(
+            torch.empty(1, 1, self.local_num_heads, 1, dtype=torch.float32)
+        )
+        set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(2)})
+
+        self.g_a_proj = ReplicatedLinear(
+            self.hidden_size,
+            self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.g_a_proj",
+        )
+        self.g_b_proj = ColumnParallelLinear(
+            self.head_dim,
+            projection_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.g_b_proj",
+        )
+        self.o_norm = FusedRMSNormGated(
+            self.head_dim, eps=rms_norm_eps, activation="sigmoid"
+        )
+        self.o_proj = RowParallelLinear(
+            projection_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        output: torch.Tensor,
+    ) -> None:
+        num_tokens = hidden_states.size(0)
+        q = self.q_proj(hidden_states)[0]
+        k = self.k_proj(hidden_states)[0]
+        v = self.v_proj(hidden_states)[0]
+
+        beta = self.b_proj(hidden_states)[0].float().sigmoid()
+        g1 = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0]
+        g1 = fused_kda_gate(g1, self.A_log, self.head_dim, g_bias=self.dt_bias)
+        beta = beta.unsqueeze(0)
+        g1 = g1.unsqueeze(0)
+
+        g_proj_states = self.g_b_proj(self.g_a_proj(hidden_states)[0])[0]
+        g2 = rearrange(g_proj_states, "... (h d) -> ... h d", d=self.head_dim)
+
+        core_attn_out = torch.zeros(
+            (1, num_tokens, self.local_num_heads, self.head_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        torch.ops.vllm.kda_attention(
+            q,
+            k,
+            v,
+            g1,
+            beta,
+            core_attn_out,
+            self.prefix,
+        )
+        core_attn_out = self.o_norm(core_attn_out, g2)
+        core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)")
+        output[:] = self.o_proj(core_attn_out)[0]
+
+    def _forward(
+        self,
+        q_proj_states: torch.Tensor,
+        k_proj_states: torch.Tensor,
+        v_proj_states: torch.Tensor,
+        g1: torch.Tensor,
+        beta: torch.Tensor,
+        core_attn_out: torch.Tensor,
+    ) -> None:
+        forward_context = get_forward_context()
+        attn_metadata: AttentionMetadata = forward_context.attn_metadata
+
+        if attn_metadata is None:
+            #     # V1 profile run
+            return
+
+        assert isinstance(attn_metadata, dict)
+        attn_metadata = attn_metadata[self.prefix]
+        assert isinstance(attn_metadata, GDNAttentionMetadata)
+        has_initial_state = attn_metadata.has_initial_state
+        non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
+        non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
+        num_actual_tokens = attn_metadata.num_actual_tokens
+        constant_caches = self.kv_cache[forward_context.virtual_engine]
+
+        q_proj_states = q_proj_states[:num_actual_tokens]
+        k_proj_states = k_proj_states[:num_actual_tokens]
+        v_proj_states = v_proj_states[:num_actual_tokens]
+        g1 = g1[:num_actual_tokens]
+        beta = beta[:num_actual_tokens]
+
+        (conv_state_q, conv_state_k, conv_state_v, recurrent_state) = constant_caches
+        # deal with strides
+        conv_state_q = conv_state_q.transpose(-1, -2)
+        conv_state_k = conv_state_k.transpose(-1, -2)
+        conv_state_v = conv_state_v.transpose(-1, -2)
+
+        q_conv_weights = self.q_conv1d.weight.view(
+            self.q_conv1d.weight.size(0), self.q_conv1d.weight.size(2)
+        )
+        k_conv_weights = self.k_conv1d.weight.view(
+            self.k_conv1d.weight.size(0), self.k_conv1d.weight.size(2)
+        )
+        v_conv_weights = self.v_conv1d.weight.view(
+            self.v_conv1d.weight.size(0), self.v_conv1d.weight.size(2)
+        )
+        if attn_metadata.num_prefills > 0:
+            q_proj_states = q_proj_states.transpose(0, 1)
+            k_proj_states = k_proj_states.transpose(0, 1)
+            v_proj_states = v_proj_states.transpose(0, 1)
+            q = causal_conv1d_fn(
+                q_proj_states,
+                q_conv_weights,
+                self.q_conv1d.bias,
+                activation="silu",
+                conv_states=conv_state_q,
+                has_initial_state=has_initial_state,
+                cache_indices=non_spec_state_indices_tensor,
+                query_start_loc=non_spec_query_start_loc,
+                metadata=attn_metadata,
+            ).transpose(0, 1)
+            k = causal_conv1d_fn(
+                k_proj_states,
+                k_conv_weights,
+                self.k_conv1d.bias,
+                activation="silu",
+                conv_states=conv_state_k,
+                has_initial_state=has_initial_state,
+                cache_indices=non_spec_state_indices_tensor,
+                query_start_loc=non_spec_query_start_loc,
+                metadata=attn_metadata,
+            ).transpose(0, 1)
+            v = causal_conv1d_fn(
+                v_proj_states,
+                v_conv_weights,
+                self.v_conv1d.bias,
+                activation="silu",
+                conv_states=conv_state_v,
+                has_initial_state=has_initial_state,
+                cache_indices=non_spec_state_indices_tensor,
+                query_start_loc=non_spec_query_start_loc,
+                metadata=attn_metadata,
+            ).transpose(0, 1)
+        else:
+            decode_conv_indices = non_spec_state_indices_tensor[
+                : attn_metadata.num_actual_tokens
+            ]
+            q = causal_conv1d_update(
+                q_proj_states,
+                conv_state_q,
+                q_conv_weights,
+                self.q_conv1d.bias,
+                activation="silu",
+                conv_state_indices=decode_conv_indices,
+                validate_data=True,
+            )
+            k = causal_conv1d_update(
+                k_proj_states,
+                conv_state_k,
+                k_conv_weights,
+                self.k_conv1d.bias,
+                activation="silu",
+                conv_state_indices=decode_conv_indices,
+                validate_data=True,
+            )
+            v = causal_conv1d_update(
+                v_proj_states,
+                conv_state_v,
+                v_conv_weights,
+                self.v_conv1d.bias,
+                activation="silu",
+                conv_state_indices=decode_conv_indices,
+                validate_data=True,
+            )
+
+        q, k, v = map(
+            lambda x: rearrange(x, "n (h d) -> 1 n h d", d=self.head_dim), (q, k, v)
+        )
+
+        if attn_metadata.num_prefills > 0:
+            zero_idx = non_spec_state_indices_tensor[~has_initial_state]
+            recurrent_state[zero_idx] = 0
+            initial_state = recurrent_state[non_spec_state_indices_tensor].contiguous()
+            (
+                core_attn_out_non_spec,
+                last_recurrent_state,
+            ) = chunk_kda(
+                q=q,
+                k=k,
+                v=v,
+                g=g1,
+                beta=beta,
+                initial_state=initial_state,
+                output_final_state=True,
+                use_qk_l2norm_in_kernel=True,
+                cu_seqlens=non_spec_query_start_loc,
+            )
+            # Init cache
+            recurrent_state[non_spec_state_indices_tensor] = last_recurrent_state
+        else:
+            (
+                core_attn_out_non_spec,
+                last_recurrent_state,
+            ) = fused_recurrent_kda(
+                q=q,
+                k=k,
+                v=v,
+                g=g1,
+                beta=beta,
+                initial_state=recurrent_state,
+                use_qk_l2norm_in_kernel=True,
+                cu_seqlens=non_spec_query_start_loc[: attn_metadata.num_decodes + 1],
+                ssm_state_indices=non_spec_state_indices_tensor,
+            )
+        core_attn_out[0, :num_actual_tokens] = core_attn_out_non_spec[
+            0, :num_actual_tokens
+        ]
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 363245daa89d19f82c5f01b02f91ef4c9f85da96..8cc374ac9155d5e3a13a7556663856dd0e634f78 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -1,26 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom normalization layers."""
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-import vllm.envs as envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.batch_invariant import (
+    rms_norm_batch_invariant,
+    vllm_is_batch_invariant,
+)
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
 
 
-def is_rocm_aiter_rmsnorm_enabled() -> bool:
-    return envs.VLLM_ROCM_USE_AITER_RMSNORM \
-        and envs.VLLM_ROCM_USE_AITER
-
-
-def rms_norm(x: torch.Tensor, weight: torch.Tensor,
-             variance_epsilon: float) -> torch.Tensor:
+def rms_norm(
+    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+) -> torch.Tensor:
     from vllm import _custom_ops as ops
+
+    if vllm_is_batch_invariant():
+        return rms_norm_batch_invariant(x, weight, variance_epsilon)
     out = torch.empty_like(x)
     ops.rms_norm(
         out,
@@ -32,9 +33,17 @@ def rms_norm(x: torch.Tensor, weight: torch.Tensor,
 
 
 def fused_add_rms_norm(
-        x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
-        variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]:
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
     from vllm import _custom_ops as ops
+
+    if vllm_is_batch_invariant():
+        return rms_norm_batch_invariant(
+            x + residual, weight, variance_epsilon
+        ), x + residual
     ops.fused_add_rms_norm(
         x,
         residual,
@@ -44,9 +53,11 @@ def fused_add_rms_norm(
     return x, residual
 
 
-def poly_norm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor,
-              variance_epsilon: float) -> torch.Tensor:
+def poly_norm(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, variance_epsilon: float
+) -> torch.Tensor:
     from vllm import _custom_ops as ops
+
     out = torch.empty_like(x)
     ops.poly_norm(
         out,
@@ -58,71 +69,18 @@ def poly_norm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor,
     return out
 
 
-def rocm_aiter_rms_norm_impl(x: torch.Tensor, weight: torch.Tensor,
-                             variance_epsilon: float) -> torch.Tensor:
-    import aiter as rocm_aiter
-    if x.dim() > 2:
-        x_original_shape = x.shape
-        x = x.reshape(-1, x_original_shape[-1])
-        x = rocm_aiter.rms_norm(x, weight, variance_epsilon)
-        return x.reshape(x_original_shape)
-
-    return rocm_aiter.rms_norm(x, weight, variance_epsilon)
-
-
-def rocm_aiter_rmsnorm2d_fwd_with_add_impl(
-        x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
-        variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]:
-
-    import aiter as rocm_aiter
-
-    residual_out = torch.empty_like(residual)
-    output = torch.empty_like(x)
-    rocm_aiter.rmsnorm2d_fwd_with_add(
-        output,  # output
-        x,  # input
-        residual,  # residual input
-        residual_out,  # residual output
-        weight,
-        variance_epsilon,
-    )
-    return output, residual_out
-
-
-def rocm_aiter_rms_norm_fake(x: torch.Tensor, weight: torch.Tensor,
-                             variance_epsilon: float) -> torch.Tensor:
-    return torch.empty_like(x)
-
-
-def rocm_aiter_rmsnorm2d_fwd_with_add_fake(
-        x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
-        variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]:
-    return torch.empty_like(x), torch.empty_like(residual)
-
-
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_rms_norm",
-        op_func=rocm_aiter_rms_norm_impl,
-        fake_impl=rocm_aiter_rms_norm_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
-        op_func=rocm_aiter_rmsnorm2d_fwd_with_add_impl,
-        fake_impl=rocm_aiter_rmsnorm2d_fwd_with_add_fake,
-    )
-
-
-def dispatch_rocm_rmsnorm_func(with_fused_add: bool, dtype: torch.dtype):
-    use_aiter = is_rocm_aiter_rmsnorm_enabled() and dtype in [
-        torch.float16, torch.bfloat16
+def dispatch_rocm_rmsnorm_func(
+    with_fused_add: bool, dtype: torch.dtype, use_aiter: bool = False
+):
+    use_aiter = use_aiter and dtype in [
+        torch.float16,
+        torch.bfloat16,
     ]
 
     if use_aiter and with_fused_add:
-        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+        return rocm_aiter_ops.rms_norm2d_with_add
     if use_aiter:
-        return torch.ops.vllm.rocm_aiter_rms_norm
+        return rocm_aiter_ops.rms_norm
 
     # fall back to CUDA implementation
     if with_fused_add:
@@ -142,105 +100,134 @@ class RMSNorm(CustomOp):
         self,
         hidden_size: int,
         eps: float = 1e-6,
-        var_hidden_size: Optional[int] = None,
+        var_hidden_size: int | None = None,
         has_weight: bool = True,
-        dtype: Optional[torch.dtype] = None,
+        dtype: torch.dtype | None = None,
     ) -> None:
         super().__init__()
 
         self.hidden_size = hidden_size
         self.variance_epsilon = eps
-        self.variance_size_override = (None if var_hidden_size == hidden_size
-                                       else var_hidden_size)
+        self.variance_size_override = (
+            None if var_hidden_size == hidden_size else var_hidden_size
+        )
+        weight_dtype = dtype or torch.get_default_dtype()
         self.has_weight = has_weight
-        if dtype is not None:
-            self.weight = torch.ones(hidden_size, dtype=dtype)
-        else:
-            self.weight = torch.ones(hidden_size)
+        self.weight = torch.ones(hidden_size, dtype=weight_dtype)
         if self.has_weight:
             self.weight = nn.Parameter(self.weight)
-        weight_dtype = self.weight.data.dtype
 
         if current_platform.is_rocm():
+            aiter_rmsnorm_enabled = rocm_aiter_ops.is_rmsnorm_enabled()
             self.rocm_norm_func = dispatch_rocm_rmsnorm_func(
-                with_fused_add=False, dtype=weight_dtype)
+                with_fused_add=False,
+                dtype=weight_dtype,
+                use_aiter=aiter_rmsnorm_enabled,
+            )
             self.rocm_norm_func_with_add = dispatch_rocm_rmsnorm_func(
-                with_fused_add=True, dtype=weight_dtype)
+                with_fused_add=True, dtype=weight_dtype, use_aiter=aiter_rmsnorm_enabled
+            )
 
-    def forward_native(
-        self,
+    @staticmethod
+    def forward_static(
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        variance_epsilon: float,
+        hidden_size: int,
+        orig_dtype: torch.dtype,
+        weight: torch.Tensor | None = None,
+        residual: torch.Tensor | None = None,
+        variance_size_override: int | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
-        orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:
-            x = x + residual.to(torch.float32)
+            # residual promoted f16->f32 automatically,
+            # otherwise Inductor eliminates the casts to and from f16,
+            # increasing memory usage (and complicating pattern matching)
+            x = x + residual
             residual = x.to(orig_dtype)
 
-        hidden_size = x.shape[-1]
-        if hidden_size != self.hidden_size:
-            raise ValueError("Expected hidden_size to be "
-                             f"{self.hidden_size}, but found: {hidden_size}")
+        if x.shape[-1] != hidden_size:
+            raise ValueError(
+                f"Expected hidden_size to be {hidden_size}, but found: {x.shape[-1]}"
+            )
 
-        if self.variance_size_override is None:
+        if variance_size_override is None:
             x_var = x
         else:
-            if hidden_size < self.variance_size_override:
+            if hidden_size < variance_size_override:
                 raise ValueError(
                     "Expected hidden_size to be at least "
-                    f"{self.variance_size_override}, but found: {hidden_size}")
+                    f"{variance_size_override}, but found: {hidden_size}"
+                )
 
-            x_var = x[:, :, :self.variance_size_override]
+            x_var = x[:, :, :variance_size_override]
 
         variance = x_var.pow(2).mean(dim=-1, keepdim=True)
 
-        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * torch.rsqrt(variance + variance_epsilon)
         x = x.to(orig_dtype)
-        if self.has_weight:
-            x = x * self.weight
+        if weight is not None:
+            x = x * weight
         if residual is None:
             return x
         else:
             return x, residual
 
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward()."""
+
+        return self.forward_static(
+            x,
+            self.variance_epsilon,
+            self.hidden_size,
+            x.dtype,
+            self.weight.data if self.has_weight else None,
+            residual,
+            self.variance_size_override,
+        )
+
     def forward_cuda(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.variance_size_override is not None:
             return self.forward_native(x, residual)
 
         add_residual = residual is not None
         if add_residual:
-            return fused_add_rms_norm(x, residual, self.weight.data,
-                                      self.variance_epsilon)
+            return fused_add_rms_norm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
         else:
             return rms_norm(x, self.weight.data, self.variance_epsilon)
 
     def forward_hip(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.variance_size_override is not None:
             return self.forward_native(x, residual)
 
         add_residual = residual is not None
         if add_residual:
-            return self.rocm_norm_func_with_add(x, residual, self.weight.data,
-                                                self.variance_epsilon)
+            return self.rocm_norm_func_with_add(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
         else:
-            return self.rocm_norm_func(x, self.weight.data,
-                                       self.variance_epsilon)
+            return self.rocm_norm_func(x, self.weight.data, self.variance_epsilon)
 
     def forward_xpu(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.variance_size_override is not None:
             return self.forward_native(x, residual)
 
@@ -289,15 +276,16 @@ class GemmaRMSNorm(CustomOp):
         weight: torch.Tensor,
         variance_epsilon: float,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         orig_dtype = x.dtype
         if residual is not None:
-            if orig_dtype == torch.float16:
-                x = x + residual.float()
-            else:
-                x = x + residual
+            x = (
+                x.float() + residual.float()
+                if orig_dtype == torch.float16
+                else x + residual
+            )
             residual = x
 
         x = x.float()
@@ -312,70 +300,128 @@ class GemmaRMSNorm(CustomOp):
     def forward_native(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
-        return self.forward_static(self.weight.data, self.variance_epsilon, x,
-                                   residual)
+        return self.forward_static(self.weight.data, self.variance_epsilon, x, residual)
 
     def forward_cuda(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if torch.compiler.is_compiling():
             return self.forward_native(x, residual)
 
         if not getattr(self, "_is_compiled", False):
             self.forward_static = torch.compile(  # type: ignore
-                self.forward_static)
+                self.forward_static
+            )
             self._is_compiled = True
         return self.forward_native(x, residual)
 
 
-@CustomOp.register("poly_norm")
-class PolyNorm(CustomOp):
-    """Polynomial normalization.
+@CustomOp.register("rms_norm_gated")
+class RMSNormGated(CustomOp):
+    """RMS Normalization with optional gating.
 
-    Computes x -> w_0 * RMSNorm(x^3) + w_1 * RMSNorm(x^2) + w_2 * RMSNorm(x) + b
-    where w_n is the learned weight and b is the bias.
-    Refer to https://arxiv.org/html/2411.03884v1
+    This is a native PyTorch implementation that supports:
+    - Standard RMS normalization
+    - Group RMS normalization
+    - Optional gating with SiLU activation
     """
 
     def __init__(
         self,
-        eps: float = 1e-6,
-    ) -> None:
+        hidden_size: int,
+        eps: float = 1e-5,
+        group_size: int | None = None,
+        norm_before_gate: bool = False,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        """Initialize RMSNormGated.
+
+        Args:
+            hidden_size: Size of the hidden dimension
+            eps: Epsilon for numerical stability
+            group_size: If not None, do GroupNorm with each group
+                        having group_size elements.
+                        group_size=None is equivalent to group_size=hidden_size
+                        (i.e. there's only 1 group).
+            norm_before_gate: If True and z is provided: out = norm(x) * silu(z)
+                              If False and z is provided: out = norm(x * silu(z))
+            device: Device to create parameters on
+            dtype: Data type for parameters
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
-        self.weight = torch.nn.Parameter(torch.ones(3) / 3)
-        self.bias = torch.nn.Parameter(torch.zeros(1))
-        self.variance_epsilon = eps
+        self.eps = eps
+        self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.register_parameter("bias", None)
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
 
-    def _norm(self, x):
-        return x / torch.sqrt(
-            x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
 
     def forward_native(
-        self,
-        x: torch.Tensor,
+        self, x: torch.Tensor, z: torch.Tensor | None = None
     ) -> torch.Tensor:
-        """PyTorch-native implementation equivalent to forward().
+        """
+        Native PyTorch implementation of RMS normalization with gating.
+
+        Args:
+            x: Input tensor
+            z: Optional gating tensor
 
-        Refer to https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md
+        Returns:
+            Normalized (and optionally gated) tensor
+
+        If z is not None:
+            - norm_before_gate=True: out = norm(x) * silu(z)
+            - norm_before_gate=False: out = norm(x * silu(z))
         """
+        # Apply gating before normalization if needed
+        if z is not None and not self.norm_before_gate:
+            x = x * F.silu(z)
+
+        # RMS Normalization
+        if self.group_size is None:
+            # Standard RMS norm across the last dimension
+            variance = x.pow(2).mean(dim=-1, keepdim=True)
+            x_normed = x * torch.rsqrt(variance + self.eps)
+            out = x_normed * self.weight
+        else:
+            # Group RMS norm
+            from einops import rearrange
 
-        orig_dtype = x.dtype
-        x_float = x.to(torch.float32)
-        output = (self.weight[0] * self._norm(x_float**3) +
-                  self.weight[1] * self._norm(x_float**2) +
-                  self.weight[2] * self._norm(x_float) + self.bias)
-        return output.to(orig_dtype)
+            x_group = rearrange(x, "... (g d) -> ... g d", d=self.group_size)
+            variance = x_group.pow(2).mean(dim=-1, keepdim=True)
+            x_normed = x_group * torch.rsqrt(variance + self.eps)
+            out = rearrange(x_normed, "... g d -> ... (g d)") * self.weight
+
+        # Apply gating after normalization if needed
+        if z is not None and self.norm_before_gate:
+            out = out * F.silu(z)
+
+        return out
 
     def forward_cuda(
-        self,
-        x: torch.Tensor,
+        self, x: torch.Tensor, z: torch.Tensor | None = None
     ) -> torch.Tensor:
-        return poly_norm(x, self.weight, self.bias, self.variance_epsilon)
+        from vllm.model_executor.layers.fla.ops.layernorm_guard import rmsnorm_fn
+
+        return rmsnorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            eps=self.eps,
+            group_size=self.group_size,
+            norm_before_gate=self.norm_before_gate,
+        )
 
 
 class LayerNorm(nn.Module):
@@ -391,5 +437,6 @@ class LayerNorm(nn.Module):
         self.bias = nn.Parameter(torch.zeros(dim, dtype=torch.float32))
 
     def forward(self, x: torch.Tensor):
-        return F.layer_norm(x.float(), (self.dim, ), self.weight, self.bias,
-                            self.eps).type_as(x)
+        return F.layer_norm(
+            x.float(), (self.dim,), self.weight, self.bias, self.eps
+        ).type_as(x)
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py
index 0b87acc8512080bd05a32ec892e6c0bc838b4237..99853680eac6c0c13f85f74f98bfa521c3ae21ab 100644
--- a/vllm/model_executor/layers/lightning_attn.py
+++ b/vllm/model_executor/layers/lightning_attn.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 from einops import rearrange
@@ -9,9 +8,21 @@ from vllm.triton_utils import tl, triton
 
 
 @triton.jit
-def _fwd_diag_kernel(Q, K, V, Out, S, b: tl.constexpr, h: tl.constexpr, n,
-                     d: tl.constexpr, e: tl.constexpr, BLOCK: tl.constexpr,
-                     NUM_BLOCK, CBLOCK: tl.constexpr):
+def _fwd_diag_kernel(
+    Q,
+    K,
+    V,
+    Out,
+    S,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK,
+    CBLOCK: tl.constexpr,
+):
     # This kernel computes the diagonal blocks of the attention matrix
     # Each diagonal block represents attention
     # where queries attend to keys in the same block
@@ -39,18 +50,36 @@ def _fwd_diag_kernel(Q, K, V, Out, S, b: tl.constexpr, h: tl.constexpr, n,
     o_cblock_offset = cblock_offset * e
 
     # Calculate pointers to the query, key, value, and output tensors
-    Q_block_ptr = (Q + qk_offset + qk_block_offset + q_cblock_offset +
-                   tl.arange(0, CBLOCK)[:, None] * d +
-                   tl.arange(0, d)[None, :])
-    K_trans_block_ptr = (K + qk_offset + qk_block_offset +
-                         tl.arange(0, CBLOCK)[None, :] * d +
-                         tl.arange(0, d)[:, None])
-    V_block_ptr = (V + v_offset + v_block_offset +
-                   tl.arange(0, CBLOCK)[:, None] * e +
-                   tl.arange(0, e)[None, :])
-    O_block_ptr = (Out + o_offset + o_block_offset + o_cblock_offset +
-                   tl.arange(0, CBLOCK)[:, None] * e +
-                   tl.arange(0, e)[None, :])
+    Q_block_ptr = (
+        Q
+        + qk_offset
+        + qk_block_offset
+        + q_cblock_offset
+        + tl.arange(0, CBLOCK)[:, None] * d
+        + tl.arange(0, d)[None, :]
+    )
+    K_trans_block_ptr = (
+        K
+        + qk_offset
+        + qk_block_offset
+        + tl.arange(0, CBLOCK)[None, :] * d
+        + tl.arange(0, d)[:, None]
+    )
+    V_block_ptr = (
+        V
+        + v_offset
+        + v_block_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, e)[None, :]
+    )
+    O_block_ptr = (
+        Out
+        + o_offset
+        + o_block_offset
+        + o_cblock_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, e)[None, :]
+    )
 
     # Load the decay rate for the current head
     S_block_ptr = S + off_h
@@ -60,9 +89,9 @@ def _fwd_diag_kernel(Q, K, V, Out, S, b: tl.constexpr, h: tl.constexpr, n,
     q_index = tl.arange(0, CBLOCK) + i * CBLOCK
 
     # Load query values
-    q = tl.load(Q_block_ptr,
-                mask=block_offset + q_index[:, None] < n,
-                other=0.0).to(tl.float32)
+    q = tl.load(Q_block_ptr, mask=block_offset + q_index[:, None] < n, other=0.0).to(
+        tl.float32
+    )
 
     # Initialize output accumulator
     qkv = tl.zeros([CBLOCK, e], dtype=tl.float32)
@@ -146,18 +175,30 @@ def _fwd_kv_parallel(
     kv_offset = off_bh * NUM_BLOCK * d * e
 
     # Calculate pointers to the key, value, and key-value tensors
-    K_trans_block_ptr = (K + k_offset + k_block_offset +
-                         tl.arange(0, CBLOCK)[None, :] * d +
-                         tl.arange(0, D_FBLOCK)[:, None])
-    V_block_ptr = (V + v_offset + v_block_offset +
-                   tl.arange(0, CBLOCK)[:, None] * e +
-                   tl.arange(0, E_FBLOCK)[None, :])
-    KV_block_ptr = (KV + kv_offset + kv_block_offset +
-                    tl.arange(0, D_FBLOCK)[:, None] * e +
-                    tl.arange(0, E_FBLOCK)[None, :])
+    K_trans_block_ptr = (
+        K
+        + k_offset
+        + k_block_offset
+        + tl.arange(0, CBLOCK)[None, :] * d
+        + tl.arange(0, D_FBLOCK)[:, None]
+    )
+    V_block_ptr = (
+        V
+        + v_offset
+        + v_block_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+    KV_block_ptr = (
+        KV
+        + kv_offset
+        + kv_block_offset
+        + tl.arange(0, D_FBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
 
     # Load the decay factors for the current head and block
-    k_decay_ptr = (K_decay + off_h * BLOCK + tl.arange(0, CBLOCK)[None, :])
+    k_decay_ptr = K_decay + off_h * BLOCK + tl.arange(0, CBLOCK)[None, :]
 
     kv_index = tl.arange(0, CBLOCK)
 
@@ -165,10 +206,7 @@ def _fwd_kv_parallel(
     kv = tl.zeros([D_FBLOCK, E_FBLOCK], dtype=tl.float32)
 
     # Handle the last block which might be smaller than BLOCK
-    if off_block == NUM_BLOCK - 1:
-        split_n = n - (NUM_BLOCK - 1) * BLOCK
-    else:
-        split_n = BLOCK
+    split_n = n - (NUM_BLOCK - 1) * BLOCK if off_block == NUM_BLOCK - 1 else BLOCK
     left_shift = tl.cdiv(split_n, CBLOCK) * CBLOCK - split_n
     num_blocks = min(tl.cdiv(split_n, CBLOCK), NUM_CBLOCK)
     k_decay_ptr += (NUM_CBLOCK - num_blocks) * CBLOCK
@@ -177,12 +215,16 @@ def _fwd_kv_parallel(
     for j in range(num_blocks):
         left_bound = (1 - j) * left_shift
         # Load key and value, handling boundary conditions
-        k_trans = tl.load(K_trans_block_ptr - left_shift * d,
-                          mask=kv_index[None, :] >= left_bound,
-                          other=0.0)
-        v = tl.load(V_block_ptr - left_shift * e,
-                    mask=kv_index[:, None] >= left_bound,
-                    other=0.0)
+        k_trans = tl.load(
+            K_trans_block_ptr - left_shift * d,
+            mask=kv_index[None, :] >= left_bound,
+            other=0.0,
+        )
+        v = tl.load(
+            V_block_ptr - left_shift * e,
+            mask=kv_index[:, None] >= left_bound,
+            other=0.0,
+        )
 
         # Load decay factor and compute weighted key-value outer product
         k_decay = tl.load(k_decay_ptr)
@@ -198,9 +240,20 @@ def _fwd_kv_parallel(
 
 
 @triton.jit
-def _fwd_kv_reduce(S, KV, KV_HISTORY, b: tl.constexpr, h: tl.constexpr, n,
-                   d: tl.constexpr, e: tl.constexpr, BLOCK: tl.constexpr,
-                   NUM_BLOCK, D_FBLOCK: tl.constexpr, E_FBLOCK: tl.constexpr):
+def _fwd_kv_reduce(
+    S,
+    KV,
+    KV_HISTORY,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK,
+    D_FBLOCK: tl.constexpr,
+    E_FBLOCK: tl.constexpr,
+):
     # This kernel reduces the key-value outer products
     # across blocks and updates the KV history
     off_bh = tl.program_id(0)  # batch-head index
@@ -209,8 +262,12 @@ def _fwd_kv_reduce(S, KV, KV_HISTORY, b: tl.constexpr, h: tl.constexpr, n,
     kv_offset = off_bh * NUM_BLOCK * d * e
 
     # Calculate pointer to the key-value tensor
-    KV_block_ptr = (KV + kv_offset + tl.arange(0, D_FBLOCK)[:, None] * e +
-                    tl.arange(0, E_FBLOCK)[None, :])
+    KV_block_ptr = (
+        KV
+        + kv_offset
+        + tl.arange(0, D_FBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
 
     # Load the decay rate for the current head
     s_ptrs = S + off_h
@@ -218,9 +275,12 @@ def _fwd_kv_reduce(S, KV, KV_HISTORY, b: tl.constexpr, h: tl.constexpr, n,
 
     # Calculate pointer to the key-value history tensor
     kv_history_offset = off_bh * d * e
-    KV_HISTORY_block_ptr = (KV_HISTORY + kv_history_offset +
-                            tl.arange(0, D_FBLOCK)[:, None] * e +
-                            tl.arange(0, E_FBLOCK)[None, :])
+    KV_HISTORY_block_ptr = (
+        KV_HISTORY
+        + kv_history_offset
+        + tl.arange(0, D_FBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
 
     # Load the previous key-value history
     kv_pre = tl.load(KV_HISTORY_block_ptr).to(tl.float32)
@@ -283,12 +343,18 @@ def _fwd_none_diag_kernel(
     kv_offset = off_bh * NUM_BLOCK * d * e + off_n * d * e + e_offset
 
     # Calculate pointers to the query, output, and key-value tensors
-    Q_block_ptr = (Q + q_offset + tl.arange(0, CBLOCK)[:, None] * d +
-                   tl.arange(0, d)[None, :])
-    O_block_ptr = (Out + o_offset + tl.arange(0, CBLOCK)[:, None] * e +
-                   tl.arange(0, E_FBLOCK)[None, :])
-    KV_block_ptr = (KV + kv_offset + tl.arange(0, d)[:, None] * e +
-                    tl.arange(0, E_FBLOCK)[None, :])
+    Q_block_ptr = (
+        Q + q_offset + tl.arange(0, CBLOCK)[:, None] * d + tl.arange(0, d)[None, :]
+    )
+    O_block_ptr = (
+        Out
+        + o_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+    KV_block_ptr = (
+        KV + kv_offset + tl.arange(0, d)[:, None] * e + tl.arange(0, E_FBLOCK)[None, :]
+    )
 
     # Load the decay rate for the current head
     S_block_ptr = S + off_h
@@ -301,8 +367,7 @@ def _fwd_none_diag_kernel(
     q_index = block_offset + tl.arange(0, CBLOCK)
 
     # Load query values
-    q = tl.load(Q_block_ptr, mask=q_index[:, None] < n,
-                other=0.).to(tl.float32)
+    q = tl.load(Q_block_ptr, mask=q_index[:, None] < n, other=0.0).to(tl.float32)
 
     # Compute decay factors for the current sub-block
     q_decay = tl.exp(-s.to(tl.float32) * (off_c * CBLOCK + c_array[:, None]))
@@ -311,20 +376,18 @@ def _fwd_none_diag_kernel(
     qkv_none_diag = tl.dot(q, kv) * q_decay
 
     # Load diagonal attention output (computed by _fwd_diag_kernel)
-    qkv_diag = tl.load(O_block_ptr, mask=q_index[:, None] < n,
-                       other=0.).to(tl.float32)
+    qkv_diag = tl.load(O_block_ptr, mask=q_index[:, None] < n, other=0.0).to(tl.float32)
 
     # Combine diagonal and non-diagonal attention outputs
     qkv = qkv_diag + qkv_none_diag
 
     # Store the result
-    tl.store(O_block_ptr,
-             qkv.to(O_block_ptr.dtype.element_ty),
-             mask=q_index[:, None] < n)
+    tl.store(
+        O_block_ptr, qkv.to(O_block_ptr.dtype.element_ty), mask=q_index[:, None] < n
+    )
 
 
 class _attention(torch.autograd.Function):
-
     @staticmethod
     def forward(ctx, q, k, v, s, kv_history):
         # Forward pass of the lightning attention algorithm
@@ -336,8 +399,10 @@ class _attention(torch.autograd.Function):
         # Check CUDA compute capability
         capability = torch.cuda.get_device_capability()
         if capability[0] < 8:
-            raise RuntimeError("Flash attention currently only supported",
-                               "for compute capability >= 80")
+            raise RuntimeError(
+                "Flash attention currently only supported",
+                "for compute capability >= 80",
+            )
 
         # Get input dimensions
         b, h, n, d = q.shape
@@ -360,19 +425,21 @@ class _attention(torch.autograd.Function):
 
         # Step 1: Compute diagonal blocks of attention
         grid = (b * h * NUM_BLOCK, NUM_CBLOCK)
-        _fwd_diag_kernel[grid](q,
-                               k,
-                               v,
-                               o,
-                               s,
-                               b,
-                               h,
-                               n,
-                               d,
-                               e,
-                               BLOCK=BLOCK,
-                               NUM_BLOCK=NUM_BLOCK,
-                               CBLOCK=CBLOCK)
+        _fwd_diag_kernel[grid](
+            q,
+            k,
+            v,
+            o,
+            s,
+            b,
+            h,
+            n,
+            d,
+            e,
+            BLOCK=BLOCK,
+            NUM_BLOCK=NUM_BLOCK,
+            CBLOCK=CBLOCK,
+        )
 
         # Set feature block sizes
         NUM_FBLOCK = 1
@@ -386,9 +453,7 @@ class _attention(torch.autograd.Function):
         assert BLOCK % CBLOCK == 0, "BLOCK must be a multiple of CBLOCK"
 
         # Step 2: Compute key-value outer products for each block in parallel
-        kv = torch.empty((b, h, NUM_BLOCK, d, e),
-                         dtype=torch.float32,
-                         device=q.device)
+        kv = torch.empty((b, h, NUM_BLOCK, d, e), dtype=torch.float32, device=q.device)
         grid = (b * h, NUM_BLOCK)
         _fwd_kv_parallel[grid](
             k,
@@ -412,18 +477,20 @@ class _attention(torch.autograd.Function):
         # Step 3: Reduce key-value outer products
         # across blocks and update KV history
         grid = (b * h, NUM_FBLOCK)
-        _fwd_kv_reduce[grid](s,
-                             kv,
-                             kv_history,
-                             b,
-                             h,
-                             n,
-                             d,
-                             e,
-                             BLOCK=BLOCK,
-                             NUM_BLOCK=NUM_BLOCK,
-                             D_FBLOCK=D_FBLOCK,
-                             E_FBLOCK=E_FBLOCK)
+        _fwd_kv_reduce[grid](
+            s,
+            kv,
+            kv_history,
+            b,
+            h,
+            n,
+            d,
+            e,
+            BLOCK=BLOCK,
+            NUM_BLOCK=NUM_BLOCK,
+            D_FBLOCK=D_FBLOCK,
+            E_FBLOCK=E_FBLOCK,
+        )
 
         # Step 4: Compute non-diagonal blocks of attention
         grid = (b * h, NUM_BLOCK * NUM_CBLOCK)
@@ -461,12 +528,12 @@ def lightning_attention(
     v: torch.Tensor,
     ed: torch.Tensor,
     block_size: int = 256,
-    kv_history: Optional[torch.Tensor] = None
+    kv_history: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
-    Apply lightning attention algorithm 
+    Apply lightning attention algorithm
     to compute attention efficiently.
-    
+
     Args:
         q: Query tensor of shape [batch, heads, seq_len, dim]
         k: Key tensor of shape [batch, heads, seq_len, dim]
@@ -474,7 +541,7 @@ def lightning_attention(
         ed: Decay rate tensor of shape [heads]
         block_size: Size of blocks for block-sparse attention
         kv_history: Optional key-value history from previous computations
-        
+
     Returns:
         output: Attention output
         kv: Updated key-value history
@@ -496,9 +563,9 @@ def lightning_attention(
 
     # Initialize or clone key-value history
     if kv_history is None:
-        kv_history = torch.zeros((q.shape[0], q.shape[1], d, e),
-                                 dtype=torch.float32,
-                                 device=q.device)
+        kv_history = torch.zeros(
+            (q.shape[0], q.shape[1], d, e), dtype=torch.float32, device=q.device
+        )
     else:
         kv_history = kv_history.clone().contiguous()
 
@@ -533,7 +600,7 @@ def _linear_attn_decode_kernel(
 ):
     """
     Kernel for linear attention decoding with KV cache.
-    
+
     This kernel computes attention for a single token using the KV cache.
     """
     pid_b = tl.program_id(0)  # batch index
@@ -556,8 +623,9 @@ def _linear_attn_decode_kernel(
     # Calculate offsets for dimensions
     qk_d_offsets = tl.arange(0, D)
     v_d_offsets = tl.arange(0, BLOCK_SIZE) + pid_d * BLOCK_SIZE
-    cache_d_offsets = qk_d_offsets[:, None] * cache_d0_stride + v_d_offsets[
-        None, :] * cache_d1_stride
+    cache_d_offsets = (
+        qk_d_offsets[:, None] * cache_d0_stride + v_d_offsets[None, :] * cache_d1_stride
+    )
 
     # Calculate offsets for the current batch and head
     q_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
@@ -605,7 +673,7 @@ def linear_decode_forward_triton(
 ) -> torch.Tensor:
     """
     Perform linear attention decoding using Triton kernels.
-    
+
     Args:
         q: Query tensor of shape [B, H, 1, D]
         k: Key tensor of shape [B, H, 1, D]
@@ -614,7 +682,7 @@ def linear_decode_forward_triton(
         slope_rate: Decay rate tensor
         slot_idx: Slot indices for batches
         BLOCK_SIZE: Size of blocks for processing
-        
+
     Returns:
         output: Attention output tensor
     """
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 04a5db07e95ceca297eb2177842d68139a12584b..dfcc601a1c5308cfde4410750dbec7c265845cf7 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,34 +3,37 @@
 
 import itertools
 from abc import abstractmethod
-from typing import Any, Literal, Optional, Union
+from typing import Any
 
 import torch
-import torch.nn as nn
 from torch.nn.parameter import Parameter, UninitializedParameter
 
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
-# yapf: disable
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           BlockQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PackedColumnParameter,
-                                           PackedvLLMParameter,
-                                           PerTensorScaleParameter,
-                                           RowvLLMParameter)
-# yapf: enable
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    BlockQuantScaleParameter,
+    ModelWeightParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    PerTensorScaleParameter,
+    RowvLLMParameter,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes
 
 logger = init_logger(__name__)
 
@@ -62,8 +65,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
 def adjust_bitblas_shard(param, shard_size, shard_offset):
     bitblas_tile_size = getattr(param, "bitblas_tile_size", None)
     if bitblas_tile_size is not None:
-        return (shard_size // bitblas_tile_size,
-                shard_offset // bitblas_tile_size)
+        return (shard_size // bitblas_tile_size, shard_offset // bitblas_tile_size)
 
     return shard_size, shard_offset
 
@@ -76,9 +78,9 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
 
 
-def adjust_bitsandbytes_4bit_shard(param: Parameter,
-                                   shard_offsets: dict[str, tuple[int, int]],
-                                   loaded_shard_id: str) -> tuple[int, int]:
+def adjust_bitsandbytes_4bit_shard(
+    param: Parameter, shard_offsets: dict[str, tuple[int, int]], loaded_shard_id: str
+) -> tuple[int, int]:
     """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
 
     total, _ = shard_offsets["total"]
@@ -94,8 +96,8 @@ def adjust_bitsandbytes_4bit_shard(param: Parameter,
 def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
     """For fused modules (QKV and MLP) we have an array of length
     N that holds 1 scale for each "logical" matrix. So the param
-    is an array of length N. The loaded_weight corresponds to 
-    one of the shards on disk. Here, we slice the param based on 
+    is an array of length N. The loaded_weight corresponds to
+    one of the shards on disk. Here, we slice the param based on
     the shard_id for loading.
     """
     qkv_idxs = {"q": 0, "k": 1, "v": 2}
@@ -122,13 +124,13 @@ def left_shift_bitsandbytes_4bit_shard(bnb_weight_attrs: dict[str, Any]):
 
     For example, given bnb weight attributes as below:
     {
-        'bnb_shard_offsets': array([0, 4, 8, 16]), 
+        'bnb_shard_offsets': array([0, 4, 8, 16]),
         'bnb_quant_state': {0: ..., 1: ..., 2: ...},
     }
 
     The function will return:
     {
-        'bnb_shard_offsets': array([0, 4]), 
+        'bnb_shard_offsets': array([0, 4]),
         'bnb_quant_state': {0: ...},
     }
     and
@@ -143,8 +145,7 @@ def left_shift_bitsandbytes_4bit_shard(bnb_weight_attrs: dict[str, Any]):
     quant_state_l = {0: bnb_weight_attrs["bnb_quant_state"][0]}
     quant_state_r = {
         i - 1: bnb_weight_attrs["bnb_quant_state"][i]
-        for i in range(1,
-                       len(shard_offsets) - 1)
+        for i in range(1, len(shard_offsets) - 1)
     }
     left = dict(bnb_shard_offsets=offset_l, bnb_quant_state=quant_state_l)
     right = dict(bnb_shard_offsets=offset_r, bnb_quant_state=quant_state_r)
@@ -155,18 +156,23 @@ class LinearMethodBase(QuantizeMethodBase):
     """Base class for different (maybe quantized) linear methods."""
 
     @abstractmethod
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
-        """Create weights for a linear layer. 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Create weights for a linear layer.
            The weights will be set as attributes of the layer.
 
         Args:
             layer: The layer that is using the LinearMethodBase factory.
             input_size_per_partition: Size of the weight input dim on rank X.
-            output_partition_sizes: Sizes of the output dim of each logical 
+            output_partition_sizes: Sizes of the output dim of each logical
                 weight on rank X. E.g., output_partition_sizes for QKVLinear
                 is a list contains the width of Wq, Wk, Wv on rank X.
             input_size: Size of the input dim of the weight across all ranks.
@@ -176,10 +182,12 @@ class LinearMethodBase(QuantizeMethodBase):
         raise NotImplementedError
 
     @abstractmethod
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         """Apply the weights in layer to the input tensor.
         Expects create_weights to have been called before on the layer."""
         raise NotImplementedError
@@ -188,51 +196,47 @@ class LinearMethodBase(QuantizeMethodBase):
 class UnquantizedLinearMethod(LinearMethodBase):
     """Linear method without quantization."""
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         # This method creates unquantized linear weights.
         # The weights are not quantized, and they are not sharded.
         # The amount of memory allocated for the weights is
         # sum(output_partition_sizes) * input_size_per_partition.
-        try:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(data=torch.empty(
+        weight_loader = extra_weight_attrs.pop("weight_loader")
+        weight = ModelWeightParameter(
+            data=torch.empty(
                 sum(output_partition_sizes),
                 input_size_per_partition,
-                dtype=params_dtype),
-                                          input_dim=1,
-                                          output_dim=0,
-                                          weight_loader=weight_loader)
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to create unquantized linear weights: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug("Allocated: %.2f GiB",
-                             torch.cuda.memory_allocated() / GiB_bytes)
-                logger.debug("Reserved: %.2f GiB",
-                             torch.cuda.memory_reserved() / GiB_bytes)
-            raise RuntimeError(
-                "Failed to create unquantized linear weights. "
-                "This may be caused by insufficient memory to allocate "
-                "the weight.") from e
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if current_platform.is_cpu():
-            from vllm.model_executor.layers.utils import (
-                dispatch_cpu_unquantized_gemm)
-            dispatch_cpu_unquantized_gemm(layer, remove_weight=True)
+            from vllm.model_executor.layers.utils import dispatch_cpu_unquantized_gemm
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+            dispatch_cpu_unquantized_gemm(layer, remove_weight=True)
 
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
 
 
@@ -255,8 +259,8 @@ class LinearBase(CustomOp):
         input_size: int,
         output_size: int,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -274,17 +278,13 @@ class LinearBase(CustomOp):
         self.quant_config = quant_config
         self.prefix = prefix
         if quant_config is None:
-            self.quant_method: Optional[
-                QuantizeMethodBase] = UnquantizedLinearMethod()
+            self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod()
         else:
-            self.quant_method = quant_config.get_quant_method(self,
-                                                              prefix=prefix)
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
         self.return_bias = return_bias
         self.disable_tp = disable_tp
-        self.tp_rank = (get_tensor_model_parallel_rank()
-                        if not disable_tp else 0)
-        self.tp_size = (get_tensor_model_parallel_world_size()
-                        if not disable_tp else 1)
+        self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
+        self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
 
     def update_param_tp_status(self):
         for param in self.parameters():
@@ -316,8 +316,8 @@ class ReplicatedLinear(LinearBase):
         output_size: int,
         bias: bool = True,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -329,32 +329,40 @@ class ReplicatedLinear(LinearBase):
         else:
             self.output_partition_sizes = [output_size]
 
-        super().__init__(input_size,
-                         output_size,
-                         skip_bias_add,
-                         params_dtype,
-                         quant_config,
-                         prefix=prefix,
-                         return_bias=return_bias,
-                         disable_tp=disable_tp)
+        super().__init__(
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix=prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )
 
         # All the linear layer supports quant method.
         assert self.quant_method is not None
-        self.quant_method.create_weights(self,
-                                         self.input_size,
-                                         self.output_partition_sizes,
-                                         self.input_size,
-                                         self.output_size,
-                                         self.params_dtype,
-                                         weight_loader=self.weight_loader)
+        self.quant_method.create_weights(
+            self,
+            self.input_size,
+            self.output_partition_sizes,
+            self.input_size,
+            self.output_size,
+            self.params_dtype,
+            weight_loader=self.weight_loader,
+        )
 
         if bias:
             self.bias = Parameter(
-                torch.empty(self.output_size, dtype=self.params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+                torch.empty(self.output_size, dtype=self.params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
         else:
             self.register_parameter("bias", None)
 
@@ -377,13 +385,14 @@ class ReplicatedLinear(LinearBase):
 
         assert param.size() == loaded_weight.size(), (
             f"Tried to load weights of size {loaded_weight.size()}"
-            f"to a parameter of size {param.size()}")
+            f"to a parameter of size {param.size()}"
+        )
         param.data.copy_(loaded_weight)
 
     def forward(
         self,
         x: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
 
@@ -423,7 +432,7 @@ class ColumnParallelLinear(LinearBase):
         output_sizes: list of output sizes packed into one output, like for QKV
                        the list would be size 3.
         prefix: The name of the layer in the state dict, including all parents
-                        (e.g. model.layers.0.qkv_proj) 
+                        (e.g. model.layers.0.qkv_proj)
         return_bias: If true, return bias together with outputs in forward pass.
         disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
@@ -435,37 +444,36 @@ class ColumnParallelLinear(LinearBase):
         bias: bool = True,
         gather_output: bool = False,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        output_sizes: Optional[list[int]] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        output_sizes: list[int] | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
         disable_tp: bool = False,
     ):
         # Divide the weight matrix along the last dimension.
-        self.tp_rank = (get_tensor_model_parallel_rank()
-                        if not disable_tp else 0)
-        self.tp_size = (get_tensor_model_parallel_world_size()
-                        if not disable_tp else 1)
+        self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
+        self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
         self.input_size_per_partition = input_size
         self.output_size_per_partition = divide(output_size, self.tp_size)
         self.output_partition_sizes = [self.output_size_per_partition]
         # If QKV or MergedColumn, use output size of each partition.
         if hasattr(self, "output_sizes"):
             self.output_partition_sizes = [
-                divide(output_size, self.tp_size)
-                for output_size in self.output_sizes
+                divide(output_size, self.tp_size) for output_size in self.output_sizes
             ]
 
-        super().__init__(input_size,
-                         output_size,
-                         skip_bias_add,
-                         params_dtype,
-                         quant_config,
-                         prefix,
-                         return_bias=return_bias,
-                         disable_tp=disable_tp)
+        super().__init__(
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )
 
         self.gather_output = gather_output
 
@@ -481,22 +489,27 @@ class ColumnParallelLinear(LinearBase):
             output_size=self.output_size,
             params_dtype=self.params_dtype,
             weight_loader=(
-                self.weight_loader_v2 if self.quant_method.__class__.__name__
-                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
         if bias:
             self.bias = Parameter(
-                torch.empty(self.output_size_per_partition,
-                            dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+                torch.empty(self.output_size_per_partition, dtype=params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
         else:
             self.register_parameter("bias", None)
         self.update_param_tp_status()
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-
         output_dim = getattr(param, "output_dim", None)
 
         is_sharded_weight = getattr(param, "is_sharded_weight", False)
@@ -516,16 +529,14 @@ class ColumnParallelLinear(LinearBase):
             final_shape = list(loaded_weight.shape)
             if output_dim is not None:
                 assert final_shape[output_dim] % self.tp_size == 0
-                final_shape[output_dim] = (final_shape[output_dim] //
-                                           self.tp_size)
+                final_shape[output_dim] = final_shape[output_dim] // self.tp_size
             param.materialize(final_shape, dtype=loaded_weight.dtype)
 
         param_data = param.data
         if output_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[output_dim]
             start_idx = self.tp_rank * shard_size
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
@@ -535,8 +546,7 @@ class ColumnParallelLinear(LinearBase):
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
-    def weight_loader_v2(self, param: BasevLLMParameter,
-                         loaded_weight: torch.Tensor):
+    def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor):
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
         if len(loaded_weight.shape) == 0:
@@ -547,7 +557,7 @@ class ColumnParallelLinear(LinearBase):
     def forward(
         self,
         input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         bias = self.bias if not self.skip_bias_add else None
 
         # Matrix multiply.
@@ -606,37 +616,37 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         bias: bool = True,
         gather_output: bool = False,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
         disable_tp: bool = False,
     ):
         self.output_sizes = output_sizes
-        self.tp_size = (get_tensor_model_parallel_world_size()
-                        if not disable_tp else 1)
-        self.tp_rank = (get_tensor_model_parallel_rank()
-                        if not disable_tp else 0)
-
-        assert all(output_size % self.tp_size == 0
-                   for output_size in output_sizes)
-        super().__init__(input_size=input_size,
-                         output_size=sum(output_sizes),
-                         bias=bias,
-                         gather_output=gather_output,
-                         skip_bias_add=skip_bias_add,
-                         params_dtype=params_dtype,
-                         quant_config=quant_config,
-                         prefix=prefix,
-                         return_bias=return_bias,
-                         disable_tp=disable_tp)
-
-    def weight_loader(self,
-                      param: Parameter,
-                      loaded_weight: torch.Tensor,
-                      loaded_shard_id: Optional[int] = None):
+        self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
+        self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
+
+        assert all(output_size % self.tp_size == 0 for output_size in output_sizes)
+        super().__init__(
+            input_size=input_size,
+            output_size=sum(output_sizes),
+            bias=bias,
+            gather_output=gather_output,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )
 
+    def weight_loader(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: int | None = None,
+    ):
         # Special case for GGUF
         # initialize GGUF param after we know the quantize type
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
@@ -647,20 +657,17 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             else:
                 param.shard_weight_type = {
-                    i: loaded_weight.item()
-                    for i, _ in enumerate(self.output_sizes)
+                    i: loaded_weight.item() for i, _ in enumerate(self.output_sizes)
                 }
             return
 
         if is_gguf_weight:
-
             output_dim = getattr(param, "output_dim", None)
             shard_size = loaded_weight.size(output_dim) // self.tp_size
             start_idx = self.tp_rank * shard_size
 
             if loaded_shard_id is not None:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                     shard_size)
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
                 param.shard_id.append(loaded_shard_id)
                 param.shard_id_map[loaded_shard_id] = len(param.data_container)
                 param.data_container.append(loaded_weight)
@@ -677,14 +684,14 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             if output_dim is None:
                 if needs_scalar_to_array:
                     param_data, loaded_weight = adjust_scalar_to_fused_array(
-                        param_data, loaded_weight, 0)
+                        param_data, loaded_weight, 0
+                    )
 
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
                 return
             current_shard_offset = 0
-            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
-                                            False)
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
             shard_offsets: list[tuple[int, int, int]] = []
             for i, output_size in enumerate(self.output_sizes):
                 shard_offsets.append((i, current_shard_offset, output_size))
@@ -699,10 +706,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                     shard_offset = shard_offset // param.packed_factor
                     # Special case for Marlin.
                     shard_size, shard_offset = adjust_marlin_shard(
-                        param, shard_size, shard_offset)
+                        param, shard_size, shard_offset
+                    )
 
                 shard_size, shard_offset = adjust_bitblas_shard(
-                    param, shard_size, shard_offset)
+                    param, shard_size, shard_offset
+                )
 
                 if use_bitsandbytes_4bit:
                     index = list(itertools.accumulate([0] + self.output_sizes))
@@ -712,17 +721,18 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                     }
                     orig_offsets["total"] = (self.output_size, 0)
                     shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
-                        param, orig_offsets, str(shard_id))
+                        param, orig_offsets, str(shard_id)
+                    )
 
                 loaded_weight_shard = loaded_weight.narrow(
-                    output_dim, shard_offset, shard_size)
+                    output_dim, shard_offset, shard_size
+                )
                 self.weight_loader(param, loaded_weight_shard, shard_id)
             return
 
         assert loaded_shard_id < len(self.output_sizes)
         if output_dim is not None:
-            shard_offset = (sum(self.output_sizes[:loaded_shard_id]) //
-                            self.tp_size)
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
             shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
             # Special case for quantization.
             # If quantized, we need to adjust the offset and size to account
@@ -733,12 +743,13 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 shard_offset = shard_offset // param.packed_factor
                 # Special case for Marlin.
                 shard_size, shard_offset = adjust_marlin_shard(
-                    param, shard_size, shard_offset)
+                    param, shard_size, shard_offset
+                )
             shard_size, shard_offset = adjust_bitblas_shard(
-                param, shard_size, shard_offset)
+                param, shard_size, shard_offset
+            )
 
-            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
-                                            False)
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
             is_sharded_weight = getattr(param, "is_sharded_weight", False)
             # bitsandbytes loads the weights of the specific portion
             # no need to narrow
@@ -746,19 +757,17 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
             if use_bitsandbytes_4bit:
                 shard_size = loaded_weight.shape[output_dim]
-                shard_offset = loaded_weight.shape[output_dim] * \
-                    loaded_shard_id
+                shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id
 
-            param_data = param_data.narrow(output_dim, shard_offset,
-                                           shard_size)
+            param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             start_idx = self.tp_rank * shard_size
             if not is_sharded_weight:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                     shard_size)
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
         # Special case for per-tensor scales in fused case.
         elif needs_scalar_to_array:
             param_data, loaded_weight = adjust_scalar_to_fused_array(
-                param_data, loaded_weight, loaded_shard_id)
+                param_data, loaded_weight, loaded_shard_id
+            )
 
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
@@ -766,13 +775,15 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 logger.warning(
                     "Loading a weight without `output_dim` attribute in "
                     "MergedColumnParallelLinear, assume the weight is "
-                    "the same for all partitions.")
+                    "the same for all partitions."
+                )
 
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
-    def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter,
-                                           loaded_weight: torch.Tensor):
+    def _load_fused_module_from_checkpoint(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
         """
         Handle special case for models where MLP layers are already
         fused on disk. In this case, we have no shard id. This function
@@ -793,25 +804,28 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             # Special case for Quantization.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
-            if isinstance(param, (PackedColumnParameter, PackedvLLMParameter
-                                  )) and param.packed_dim == param.output_dim:
-                shard_size, shard_offset = \
-                    param.adjust_shard_indexes_for_packing(
-                    shard_size=shard_size, shard_offset=shard_offset)
-
-            loaded_weight_shard = loaded_weight.narrow(param.output_dim,
-                                                       shard_offset,
-                                                       shard_size)
+            if (
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
+                and param.packed_dim == param.output_dim
+            ):
+                shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
+                    shard_size=shard_size, shard_offset=shard_offset
+                )
+
+            loaded_weight_shard = loaded_weight.narrow(
+                param.output_dim, shard_offset, shard_size
+            )
             self.weight_loader_v2(param, loaded_weight_shard, shard_id)
 
-    def weight_loader_v2(self,
-                         param: BasevLLMParameter,
-                         loaded_weight: torch.Tensor,
-                         loaded_shard_id: Optional[int] = None):
+    def weight_loader_v2(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: int | None = None,
+    ):
         if loaded_shard_id is None:
             if isinstance(param, PerTensorScaleParameter):
-                param.load_merged_column_weight(loaded_weight=loaded_weight,
-                                                shard_id=0)
+                param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
             elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight)
@@ -830,20 +844,24 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             assert weight_block_size is not None
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (
-                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) //
-                block_n) // self.tp_size
-            shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) //
-                          block_n // self.tp_size)
+                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
+            ) // self.tp_size
+            shard_size = (
+                (self.output_sizes[loaded_shard_id] + block_n - 1)
+                // block_n
+                // self.tp_size
+            )
         else:
-            shard_offset = sum(
-                self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
             shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
 
-        param.load_merged_column_weight(loaded_weight=loaded_weight,
-                                        shard_id=loaded_shard_id,
-                                        shard_offset=shard_offset,
-                                        shard_size=shard_size,
-                                        tp_rank=self.tp_rank)
+        param.load_merged_column_weight(
+            loaded_weight=loaded_weight,
+            shard_id=loaded_shard_id,
+            shard_offset=shard_offset,
+            shard_size=shard_size,
+            tp_rank=self.tp_rank,
+        )
 
 
 class QKVParallelLinear(ColumnParallelLinear):
@@ -879,11 +897,11 @@ class QKVParallelLinear(ColumnParallelLinear):
         hidden_size: int,
         head_size: int,
         total_num_heads: int,
-        total_num_kv_heads: Optional[int] = None,
+        total_num_kv_heads: int | None = None,
         bias: bool = True,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -896,42 +914,43 @@ class QKVParallelLinear(ColumnParallelLinear):
             total_num_kv_heads = total_num_heads
         self.total_num_kv_heads = total_num_kv_heads
         # Divide the weight matrix along the last dimension.
-        tp_size = (get_tensor_model_parallel_world_size()
-                   if not disable_tp else 1)
+        tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
         self.num_heads = divide(self.total_num_heads, tp_size)
         if tp_size >= self.total_num_kv_heads:
             self.num_kv_heads = 1
-            self.num_kv_head_replicas = divide(tp_size,
-                                               self.total_num_kv_heads)
+            self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads)
         else:
             self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
             self.num_kv_head_replicas = 1
         input_size = self.hidden_size
-        output_size = (self.num_heads +
-                       2 * self.num_kv_heads) * tp_size * self.head_size
+        output_size = (
+            (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
+        )
         self.output_sizes = [
             self.num_heads * self.head_size * tp_size,  # q_proj
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
-            self.num_kv_heads * self.head_size * tp_size,  # v_proj 
+            self.num_kv_heads * self.head_size * tp_size,  # v_proj
         ]
 
-        super().__init__(input_size=input_size,
-                         output_size=output_size,
-                         bias=bias,
-                         gather_output=False,
-                         skip_bias_add=skip_bias_add,
-                         params_dtype=params_dtype,
-                         quant_config=quant_config,
-                         prefix=prefix,
-                         return_bias=return_bias,
-                         disable_tp=disable_tp)
+        super().__init__(
+            input_size=input_size,
+            output_size=output_size,
+            bias=bias,
+            gather_output=False,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )
 
     def _get_shard_offset_mapping(self, loaded_shard_id: str):
         shard_offset_mapping = {
             "q": 0,
             "k": self.num_heads * self.head_size,
             "v": (self.num_heads + self.num_kv_heads) * self.head_size,
-            "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size
+            "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
         }
         return shard_offset_mapping.get(loaded_shard_id)
 
@@ -943,10 +962,11 @@ class QKVParallelLinear(ColumnParallelLinear):
         }
         return shard_size_mapping.get(loaded_shard_id)
 
-    def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter,
-                                           loaded_weight: torch.Tensor):
+    def _load_fused_module_from_checkpoint(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
         """
-        Handle special case for models where QKV layers are already 
+        Handle special case for models where QKV layers are already
         fused on disk. In this case, we have no shard id. This function
         determines the shard id by splitting these layers and then calls
         the weight loader using the shard id.
@@ -957,41 +977,49 @@ class QKVParallelLinear(ColumnParallelLinear):
         shard_offsets = [
             # (shard_id, shard_offset, shard_size)
             ("q", 0, self.total_num_heads * self.head_size),
-            ("k", self.total_num_heads * self.head_size,
-             self.total_num_kv_heads * self.head_size),
-            ("v",
-             (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
-             self.total_num_kv_heads * self.head_size),
+            (
+                "k",
+                self.total_num_heads * self.head_size,
+                self.total_num_kv_heads * self.head_size,
+            ),
+            (
+                "v",
+                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
+                self.total_num_kv_heads * self.head_size,
+            ),
         ]
 
         for shard_id, shard_offset, shard_size in shard_offsets:
             # Special case for Quantization.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
-            if isinstance(param, (PackedColumnParameter, PackedvLLMParameter
-                                  )) and param.packed_dim == param.output_dim:
-                shard_size, shard_offset = \
-                    param.adjust_shard_indexes_for_packing(
-                    shard_size=shard_size, shard_offset=shard_offset)
-
-            loaded_weight_shard = loaded_weight.narrow(param.output_dim,
-                                                       shard_offset,
-                                                       shard_size)
+            if (
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
+                and param.packed_dim == param.output_dim
+            ):
+                shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
+                    shard_size=shard_size, shard_offset=shard_offset
+                )
+
+            loaded_weight_shard = loaded_weight.narrow(
+                param.output_dim, shard_offset, shard_size
+            )
             self.weight_loader_v2(param, loaded_weight_shard, shard_id)
 
-    def weight_loader_v2(self,
-                         param: BasevLLMParameter,
-                         loaded_weight: torch.Tensor,
-                         loaded_shard_id: Optional[str] = None):
+    def weight_loader_v2(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: str | None = None,
+    ):
         if loaded_shard_id is None:  # special case for certain models
             if isinstance(param, PerTensorScaleParameter):
-                param.load_qkv_weight(loaded_weight=loaded_weight,
-                                      shard_id=0,
-                                      tp_rank=self.tp_rank)
+                param.load_qkv_weight(
+                    loaded_weight=loaded_weight, shard_id=0, tp_rank=self.tp_rank
+                )
                 return
             elif type(param) in (RowvLLMParameter, BasevLLMParameter):
-                param.load_qkv_weight(loaded_weight=loaded_weight,
-                                      tp_rank=self.tp_rank)
+                param.load_qkv_weight(loaded_weight=loaded_weight, tp_rank=self.tp_rank)
                 return
             # TODO: @dsikka - move to parameter.py
             self._load_fused_module_from_checkpoint(param, loaded_weight)
@@ -1013,18 +1041,21 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_offset = (shard_offset + block_n - 1) // block_n
             shard_size = (shard_size + block_n - 1) // block_n
 
-        param.load_qkv_weight(loaded_weight=loaded_weight,
-                              num_heads=self.num_kv_head_replicas,
-                              shard_id=loaded_shard_id,
-                              shard_offset=shard_offset,
-                              shard_size=shard_size,
-                              tp_rank=self.tp_rank)
-
-    def weight_loader(self,
-                      param: Parameter,
-                      loaded_weight: torch.Tensor,
-                      loaded_shard_id: Optional[str] = None):
+        param.load_qkv_weight(
+            loaded_weight=loaded_weight,
+            num_heads=self.num_kv_head_replicas,
+            shard_id=loaded_shard_id,
+            shard_offset=shard_offset,
+            shard_size=shard_size,
+            tp_rank=self.tp_rank,
+        )
 
+    def weight_loader(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: str | None = None,
+    ):
         # Special case for GGUF
         # initialize GGUF param after we know the quantize type
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
@@ -1035,10 +1066,7 @@ class QKVParallelLinear(ColumnParallelLinear):
                 param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
                 param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             else:
-                param.shard_weight_type = {
-                    k: loaded_weight.item()
-                    for k in idx_map
-                }
+                param.shard_weight_type = {k: loaded_weight.item() for k in idx_map}
             return
 
         if is_gguf_weight:
@@ -1047,8 +1075,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             start_idx = self.tp_rank * shard_size
 
             if loaded_shard_id is not None:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                     shard_size)
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
                 param.shard_id.append(loaded_shard_id)
                 param.shard_id_map[loaded_shard_id] = len(param.data_container)
                 param.data_container.append(loaded_weight)
@@ -1066,7 +1093,8 @@ class QKVParallelLinear(ColumnParallelLinear):
             if output_dim is None:
                 if needs_scalar_to_array:
                     param_data, loaded_weight = adjust_scalar_to_fused_array(
-                        param_data, loaded_weight, 0)
+                        param_data, loaded_weight, 0
+                    )
 
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
@@ -1074,13 +1102,18 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_offsets = [
                 # (shard_id, shard_offset, shard_size)
                 ("q", 0, self.total_num_heads * self.head_size),
-                ("k", self.total_num_heads * self.head_size,
-                 self.total_num_kv_heads * self.head_size),
-                ("v", (self.total_num_heads + self.total_num_kv_heads) *
-                 self.head_size, self.total_num_kv_heads * self.head_size),
+                (
+                    "k",
+                    self.total_num_heads * self.head_size,
+                    self.total_num_kv_heads * self.head_size,
+                ),
+                (
+                    "v",
+                    (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
+                    self.total_num_kv_heads * self.head_size,
+                ),
             ]
-            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
-                                            False)
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
 
             packed_dim = getattr(param, "packed_dim", None)
             for shard_id, shard_offset, shard_size in shard_offsets:
@@ -1093,27 +1126,35 @@ class QKVParallelLinear(ColumnParallelLinear):
 
                     # Special case for Marlin.
                     shard_size, shard_offset = adjust_marlin_shard(
-                        param, shard_size, shard_offset)
+                        param, shard_size, shard_offset
+                    )
 
                 if use_bitsandbytes_4bit:
                     orig_qkv_offsets = {
                         "q": (0, self.total_num_heads * self.head_size),
-                        "k": (self.total_num_heads * self.head_size,
-                              self.total_num_kv_heads * self.head_size),
-                        "v":
-                        ((self.total_num_heads + self.total_num_kv_heads) *
-                         self.head_size,
-                         self.total_num_kv_heads * self.head_size),
-                        "total":
-                        ((self.total_num_heads + 2 * self.total_num_kv_heads) *
-                         self.head_size, 0)
+                        "k": (
+                            self.total_num_heads * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "v": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "total": (
+                            (self.total_num_heads + 2 * self.total_num_kv_heads)
+                            * self.head_size,
+                            0,
+                        ),
                     }
 
                     shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
-                        param, orig_qkv_offsets, shard_id)
+                        param, orig_qkv_offsets, shard_id
+                    )
 
                 loaded_weight_shard = loaded_weight.narrow(
-                    output_dim, shard_offset, shard_size)
+                    output_dim, shard_offset, shard_size
+                )
                 self.weight_loader(param, loaded_weight_shard, shard_id)
             return
 
@@ -1128,8 +1169,7 @@ class QKVParallelLinear(ColumnParallelLinear):
                 shard_offset = self.num_heads * self.head_size
                 shard_size = self.num_kv_heads * self.head_size
             elif loaded_shard_id == "v":
-                shard_offset = (self.num_heads +
-                                self.num_kv_heads) * self.head_size
+                shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size
                 shard_size = self.num_kv_heads * self.head_size
             # Special case for Quantized Weights.
             # If quantized, we need to adjust the offset and size to account
@@ -1141,10 +1181,10 @@ class QKVParallelLinear(ColumnParallelLinear):
 
                 # Special case for Marlin.
                 shard_size, shard_offset = adjust_marlin_shard(
-                    param, shard_size, shard_offset)
+                    param, shard_size, shard_offset
+                )
 
-            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
-                                            False)
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
             is_sharded_weight = getattr(param, "is_sharded_weight", False)
             # bitsandbytes loads the weights of the specific portion
             # no need to narrow
@@ -1153,41 +1193,46 @@ class QKVParallelLinear(ColumnParallelLinear):
             if use_bitsandbytes_4bit:
                 orig_qkv_offsets = {
                     "q": (0, self.num_heads * self.head_size),
-                    "k": (self.num_heads * self.head_size,
-                          self.num_kv_heads * self.head_size),
-                    "v":
-                    ((self.num_heads + self.num_kv_heads) * self.head_size,
-                     self.num_kv_heads * self.head_size),
-                    "total":
-                    ((self.num_heads + 2 * self.num_kv_heads) * self.head_size,
-                     0)
+                    "k": (
+                        self.num_heads * self.head_size,
+                        self.num_kv_heads * self.head_size,
+                    ),
+                    "v": (
+                        (self.num_heads + self.num_kv_heads) * self.head_size,
+                        self.num_kv_heads * self.head_size,
+                    ),
+                    "total": (
+                        (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+                        0,
+                    ),
                 }
                 shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
-                    param, orig_qkv_offsets, loaded_shard_id)
+                    param, orig_qkv_offsets, loaded_shard_id
+                )
 
-            param_data = param_data.narrow(output_dim, shard_offset,
-                                           shard_size)
+            param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             if loaded_shard_id == "q":
-                shard_id = self.tp_rank
+                shard_rank = self.tp_rank
             else:
-                shard_id = self.tp_rank // self.num_kv_head_replicas
-            start_idx = shard_id * shard_size
+                shard_rank = self.tp_rank // self.num_kv_head_replicas
+            start_idx = shard_rank * shard_size
 
             if not is_sharded_weight:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                     shard_size)
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         # Special case for per-tensor scales in fused case.
         elif needs_scalar_to_array:
             param_data, loaded_weight = adjust_scalar_to_fused_array(
-                param_data, loaded_weight, loaded_shard_id)
+                param_data, loaded_weight, loaded_shard_id
+            )
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
                 logger.warning(
                     "Loading a weight without `output_dim` attribute in "
                     "QKVParallelLinear, assume the weight is the same "
-                    "for all partitions.")
+                    "for all partitions."
+                )
 
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
@@ -1234,31 +1279,31 @@ class RowParallelLinear(LinearBase):
         bias: bool = True,
         input_is_parallel: bool = True,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
         reduce_results: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
         disable_tp: bool = False,
     ):
         # Divide the weight matrix along the first dimension.
-        self.tp_rank = (get_tensor_model_parallel_rank()
-                        if not disable_tp else 0)
-        self.tp_size = (get_tensor_model_parallel_world_size()
-                        if not disable_tp else 1)
+        self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
+        self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
         self.input_size_per_partition = divide(input_size, self.tp_size)
         self.output_size_per_partition = output_size
         self.output_partition_sizes = [output_size]
 
-        super().__init__(input_size,
-                         output_size,
-                         skip_bias_add,
-                         params_dtype,
-                         quant_config,
-                         prefix,
-                         return_bias=return_bias,
-                         disable_tp=disable_tp)
+        super().__init__(
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )
 
         self.input_is_parallel = input_is_parallel
         self.reduce_results = reduce_results
@@ -1272,19 +1317,26 @@ class RowParallelLinear(LinearBase):
             output_size=self.output_size,
             params_dtype=self.params_dtype,
             weight_loader=(
-                self.weight_loader_v2 if self.quant_method.__class__.__name__
-                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
         if not reduce_results and (bias and not skip_bias_add):
-            raise ValueError("When not reduce the results, adding bias to the "
-                             "results can lead to incorrect results")
+            raise ValueError(
+                "When not reduce the results, adding bias to the "
+                "results can lead to incorrect results"
+            )
 
         if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size, dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+            self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
         else:
             self.register_parameter("bias", None)
         self.update_param_tp_status()
@@ -1307,16 +1359,14 @@ class RowParallelLinear(LinearBase):
         if is_gguf_weight and isinstance(param, UninitializedParameter):
             weight_shape = list(loaded_weight.shape)
             if input_dim:
-                weight_shape[input_dim] = (weight_shape[input_dim] //
-                                           self.tp_size)
+                weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size
             param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
 
         param_data = param.data
         if input_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[input_dim]
             start_idx = self.tp_rank * shard_size
-            loaded_weight = loaded_weight.narrow(input_dim, start_idx,
-                                                 shard_size)
+            loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
@@ -1326,9 +1376,7 @@ class RowParallelLinear(LinearBase):
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
-    def weight_loader_v2(self, param: BasevLLMParameter,
-                         loaded_weight: torch.Tensor):
-
+    def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor):
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
         if len(loaded_weight.shape) == 0:
@@ -1340,12 +1388,13 @@ class RowParallelLinear(LinearBase):
     def forward(
         self,
         input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         if self.input_is_parallel:
             input_parallel = input_
         else:
             splitted_input = split_tensor_along_last_dim(
-                input_, num_partitions=self.tp_size)
+                input_, num_partitions=self.tp_size
+            )
             input_parallel = splitted_input[self.tp_rank].contiguous()
 
         # Matrix multiply.
@@ -1373,231 +1422,3 @@ class RowParallelLinear(LinearBase):
         s += f", tp_size={self.tp_size}"
         s += f", reduce_results={self.reduce_results}"
         return s
-
-
-@CustomOp.register("qkv_cross_parallel_linear")
-class QKVCrossParallelLinear(LinearBase):
-    """Linear layers for efficient cross-attention's QKV transformation.
-
-    Args:
-        hidden_size: input hidden state size of the transformer.
-        head_size: size of each attention head.
-        total_num_heads: total number of attention query heads.
-        total_num_kv_heads: total number of attention key/value heads. If
-                            None, assume total_num_kv_heads = total_num_heads.
-        bias: If true, add bias.
-        skip_bias_add: This was added to enable performance optimizations where
-                       bias can be fused with other element-wise operations. we
-                       skip adding bias but instead return it.
-        params_dtype: Data type for the parameters.
-        quant_config: Quantization configure.
-        prefix: The name of the layer in the state dict, including all parents
-                        (e.g. model.layers.0.qkv_proj)
-    """
-
-    def __init__(self,
-                 hidden_size: int,
-                 head_size: int,
-                 total_num_heads: int,
-                 total_num_kv_heads: Optional[int] = None,
-                 bias: bool = True,
-                 skip_bias_add: bool = False,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
-        # input_size and output_size are not used, just for alignment
-        input_size = hidden_size
-        output_size = (total_num_heads + (total_num_kv_heads or 0)) * head_size
-        super().__init__(input_size=input_size,
-                         output_size=output_size,
-                         skip_bias_add=skip_bias_add,
-                         params_dtype=params_dtype,
-                         quant_config=quant_config,
-                         prefix=prefix)
-
-        self.quant_config = quant_config
-
-        # Empty placeholders for loading as a single module.
-        placeholder_size = 0
-        assert self.quant_method is not None
-        self.quant_method.create_weights(self,
-                                         placeholder_size, [placeholder_size],
-                                         placeholder_size,
-                                         placeholder_size,
-                                         self.params_dtype,
-                                         weight_loader=self.weight_loader)
-
-        # Use a dictionary to avoid submodules parameters auto-registration:
-        # drop-in replacement for a `QKVParallelLinear` module.
-        self.proj = dict()
-        self.proj["q_proj_decoder"] = ColumnParallelLinear(
-            input_size=hidden_size,
-            output_size=total_num_heads * head_size,
-            bias=bias,
-            quant_config=quant_config,
-            skip_bias_add=skip_bias_add,
-            params_dtype=params_dtype,
-            prefix=f"{prefix}.q_proj_decoder")
-
-        self.proj["kv_proj_encoder"] = QKVParallelLinear(
-            hidden_size=hidden_size,
-            head_size=head_size,
-            total_num_heads=0,
-            total_num_kv_heads=total_num_kv_heads,
-            bias=bias,
-            quant_config=quant_config,
-            skip_bias_add=skip_bias_add,
-            params_dtype=params_dtype,
-            prefix=f"{prefix}.kv_proj_encoder")
-
-        # `kv_proj_encoder.num_kv_heads` accounts for sharding with tp>1.
-        self.q_size = self.q_proj_decoder.output_size_per_partition
-        self.kv_size = self.kv_proj_encoder.num_kv_heads * head_size
-
-        if bias:
-            self.bias = torch.nn.Parameter()
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader_v1,
-            })
-        else:
-            self.bias = None
-
-    def process_weights_after_loading(self):
-        for layer in self.proj.values():
-            if self.quant_method is not None:
-                self.quant_method.process_weights_after_loading(layer)
-
-    @property
-    def q_proj_decoder(self) -> ColumnParallelLinear:
-        layer = self.proj["q_proj_decoder"]
-        for name, param in self.named_parameters():
-            target_param = getattr(layer, name, None)
-            if target_param is not None:
-                self.sync_weight_attrs(param,
-                                       target_param,
-                                       mode="q_proj_decoder")
-        return layer
-
-    @property
-    def kv_proj_encoder(self) -> QKVParallelLinear:
-        layer = self.proj["kv_proj_encoder"]
-        for name, param in self.named_parameters():
-            target_param = getattr(layer, name, None)
-            if target_param is not None:
-                self.sync_weight_attrs(param,
-                                       target_param,
-                                       mode="kv_proj_encoder")
-        return layer
-
-    def sync_weight_attrs(
-        self,
-        src_param: nn.Parameter,
-        tgt_param: nn.Parameter,
-        mode: Literal["q_proj_decoder", "kv_proj_encoder"],
-    ):
-        missing_attrs_dict = {
-            k: getattr(src_param, k)
-            for k in (set(vars(src_param).keys()) -
-                      set(vars(tgt_param).keys()))
-        }
-        # TODO(Isotr0py): handle bitsandbytes 8bit
-        use_bitsandbytes_4bit = getattr(src_param, "use_bitsandbytes_4bit",
-                                        False)
-        if (missing_attrs_dict and use_bitsandbytes_4bit):
-            q_proj_attrs, kv_proj_attrs = left_shift_bitsandbytes_4bit_shard(
-                missing_attrs_dict)
-            if mode == "q_proj_decoder":
-                set_weight_attrs(tgt_param, q_proj_attrs)
-            elif mode == "kv_proj_encoder":
-                set_weight_attrs(tgt_param, kv_proj_attrs)
-        else:
-            set_weight_attrs(tgt_param, missing_attrs_dict)
-
-    def _is_same_param(
-        self,
-        src_param: torch.nn.Parameter,
-        map_param: torch.nn.Parameter,
-    ) -> bool:
-        """Check if two parameters are exactly pointing to same things."""
-        # ignore weight_loader because it's always different
-        key_to_ignore = ["weight_loader", "_weight_loader"]
-        has_same_type_name = type(src_param) is type(map_param)
-        src_param_attrs = {
-            k: v
-            for k, v in src_param.__dict__.items() if k not in key_to_ignore
-        }
-        map_param_attrs = {
-            k: v
-            for k, v in map_param.__dict__.items() if k not in key_to_ignore
-        }
-        has_same_attrs = src_param_attrs == map_param_attrs
-        return has_same_type_name and has_same_attrs
-
-    def select_proj_params(
-        self,
-        layer: nn.Module,
-        param: nn.Parameter,
-    ) -> nn.Parameter:
-        """
-        Given the placeholder param, 
-        return the corresponding param in the proj layers.
-        """
-        target_param_list = [
-            v for _, v in layer.named_parameters()
-            if self._is_same_param(param, v)
-        ]
-        assert len(target_param_list) == 1
-        target_param = target_param_list[0]
-        return target_param
-
-    def forward(  # type: ignore[override]
-        self,
-        decoder_hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-    ) -> tuple[torch.Tensor, ...]:
-        q, _ = self.q_proj_decoder(decoder_hidden_states)
-        if encoder_hidden_states is None:
-            # Encoder KV already cached.
-            k = None
-            v = None
-        else:
-            # Prefill phase, encoder KV cached here.
-            kv_enc, _ = self.kv_proj_encoder(encoder_hidden_states)
-            # Split kv in half
-            k, v = kv_enc.split(self.kv_size, dim=-1)
-        return q, k, v
-
-    def weight_loader_v1(self,
-                         param: torch.nn.Parameter,
-                         loaded_weight: torch.Tensor,
-                         loaded_shard_id: Optional[str] = None):
-        # just like all other parameters, does not yet
-        # support loading bias with weight_loader_v2
-        layer = (self.q_proj_decoder
-                 if loaded_shard_id == "q" else self.kv_proj_encoder)
-        target_param = self.select_proj_params(layer, param)
-        shard_id_args = (loaded_shard_id, ) if loaded_shard_id != "q" else ()
-        layer.weight_loader(target_param, loaded_weight, *shard_id_args)
-
-    def weight_loader(self,
-                      param: torch.nn.Parameter,
-                      loaded_weight: torch.Tensor,
-                      loaded_shard_id: Optional[str] = None):
-        layer = (self.q_proj_decoder
-                 if loaded_shard_id == "q" else self.kv_proj_encoder)
-        target_param = self.select_proj_params(layer, param)
-        shard_id_args = (loaded_shard_id, ) if loaded_shard_id != "q" else ()
-        if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED:
-            layer.weight_loader_v2(target_param, loaded_weight, *shard_id_args)
-        else:
-            layer.weight_loader(target_param, loaded_weight, *shard_id_args)
-
-    def extra_repr(self) -> str:
-        s = f"in_features={self.input_size}"
-        s += f", q_size={self.q_size}"
-        s += f", kv_size={self.kv_size}"
-        s += f", bias={self.bias is not None}"
-        s += f", tp_size={get_tensor_model_parallel_world_size()}"
-        s += ", gather_output=False"
-        return s
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 2110aa2769b93dabcfd75b2da810549f0c2c9ff8..c8d57f597d1ca24636af3b245299afa4d08a92c1 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -1,15 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that compute logits from hidden_stats."""
-from typing import Optional
 
 import torch
 
-from vllm.distributed import (tensor_model_parallel_all_gather,
-                              tensor_model_parallel_gather)
+from vllm.distributed import (
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_gather,
+)
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.platforms import current_platform
 
 
@@ -23,12 +23,14 @@ class LogitsProcessor(CustomOp):
     3. Apply logits processors (if any).
     """
 
-    def __init__(self,
-                 vocab_size: int,
-                 org_vocab_size: Optional[int] = None,
-                 scale: float = 1.0,
-                 logits_as_input: bool = False,
-                 soft_cap: Optional[float] = None) -> None:
+    def __init__(
+        self,
+        vocab_size: int,
+        org_vocab_size: int | None = None,
+        scale: float = 1.0,
+        logits_as_input: bool = False,
+        soft_cap: float | None = None,
+    ) -> None:
         """
         Args:
             scale: A scaling factor to apply to the logits.
@@ -49,8 +51,8 @@ class LogitsProcessor(CustomOp):
         self,
         lm_head: VocabParallelEmbedding,
         hidden_states: torch.Tensor,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> Optional[torch.Tensor]:
+        embedding_bias: torch.Tensor | None = None,
+    ) -> torch.Tensor | None:
         if self.logits_as_input:
             logits = hidden_states
         else:
@@ -84,19 +86,17 @@ class LogitsProcessor(CustomOp):
         self,
         hidden_states: torch.Tensor,
         lm_head: VocabParallelEmbedding,
-        embedding_bias: Optional[torch.Tensor],
-    ) -> Optional[torch.Tensor]:
+        embedding_bias: torch.Tensor | None,
+    ) -> torch.Tensor | None:
         # Get the logits for the next tokens.
-        logits = lm_head.quant_method.apply(lm_head,
-                                            hidden_states,
-                                            bias=embedding_bias)
+        logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
 
         # Gather logits for TP
         logits = self._gather_logits(logits)
 
         # Remove paddings in vocab (if any).
         if logits is not None:
-            logits = logits[..., :self.org_vocab_size]
+            logits = logits[..., : self.org_vocab_size]
         return logits
 
     def extra_repr(self) -> str:
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index 6da62b5426bb6b35cb71d5753348c13e5ed7b1bd..e68b09b4d81f52aac3ee73fc7809eed527c5b0f0 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
 
 import torch
 
+from vllm.config import VllmConfig
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -40,3 +42,30 @@ class MambaBase(AttentionLayerBase):
     def get_attn_backend(self) -> type["AttentionBackend"]:
         """Get the attention backend class for this Mamba layer."""
         pass
+
+    @abstractmethod
+    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
+        pass
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None:
+        if (
+            vllm_config.speculative_config is not None
+            and vllm_config.model_config.hf_config.model_type not in ["qwen3_next"]
+        ):
+            raise NotImplementedError(
+                "Mamba with speculative decoding is not supported yet."
+            )
+        mamba_block_size = vllm_config.cache_config.mamba_block_size
+        page_size_padded = vllm_config.cache_config.mamba_page_size_padded
+        return MambaSpec(
+            shapes=self.get_state_shape(),
+            dtypes=self.get_state_dtype(),
+            block_size=mamba_block_size,
+            page_size_padded=page_size_padded,
+            mamba_type=self.mamba_type,
+            num_speculative_blocks=(
+                vllm_config.speculative_config.num_speculative_tokens
+                if vllm_config.speculative_config
+                else 0
+            ),
+        )
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 410cbef4f6bc0db396ef3e39131efb40aa47d4ff..0a2742ff49a449b3b294900149269f3fa0412bcd 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -10,7 +10,6 @@ if TYPE_CHECKING:
 from typing import TYPE_CHECKING
 
 import torch
-import torch.distributed
 import torch.nn.functional as F
 from einops import rearrange
 from torch import nn
@@ -19,27 +18,28 @@ from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.lightning_attn import (
-    lightning_attention, linear_decode_forward_triton)
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+    lightning_attention,
+    linear_decode_forward_triton,
+)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.utils import direct_register_custom_op
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
-import torch
-import torch.distributed
-
 
 class MiniMaxText01RMSNormTP(CustomOp):
     name = "MiniMaxText01RMSNormTP"
@@ -48,8 +48,7 @@ class MiniMaxText01RMSNormTP(CustomOp):
         super().__init__()
         self.tp_world = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
-        self.weight = nn.Parameter(torch.ones(int(hidden_size /
-                                                  self.tp_world)))
+        self.weight = nn.Parameter(torch.ones(int(hidden_size / self.tp_world)))
 
         self.weight.weight_loader = self.weight_loader
         self.variance_epsilon = eps
@@ -76,33 +75,32 @@ class MiniMaxText01RMSNormTP(CustomOp):
         x = x.to(torch.float32)
         variance = x.pow(2).mean(dim=-1, keepdim=True, dtype=torch.float32)
         if self.tp_world > 1:
-            variance = tensor_model_parallel_all_reduce(
-                variance) / self.tp_world
+            variance = tensor_model_parallel_all_reduce(variance) / self.tp_world
         x = x * torch.rsqrt(variance + self.variance_epsilon)
-        x = x.to(orig_dtype) * self.weight
+        x = (x * self.weight).to(orig_dtype)
         return x
 
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert residual is None, "RMSNorm does not support residual connection."
         return self._forward(x)
 
 
 class MiniMaxText01LinearKernel:
-
     @staticmethod
-    def jit_linear_forward_prefix(q: torch.Tensor,
-                                  k: torch.Tensor,
-                                  v: torch.Tensor,
-                                  kv_caches: torch.Tensor,
-                                  slope_rate: torch.Tensor,
-                                  block_size: int,
-                                  layer_idx: Optional[int] = None,
-                                  **kwargs) -> torch.Tensor:
-
+    def jit_linear_forward_prefix(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        kv_caches: torch.Tensor,
+        slope_rate: torch.Tensor,
+        block_size: int,
+        layer_idx: int | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
         slope_rate = slope_rate.to(torch.float32)
         should_pad_dim = q.dim() == 3
         if should_pad_dim:
@@ -112,26 +110,22 @@ class MiniMaxText01LinearKernel:
         b, h, n, d = q.shape
         e = d
         kv_history = kv_caches.reshape(1, h, d, e).contiguous()
-        output, kv_history = lightning_attention(q,
-                                                 k,
-                                                 v,
-                                                 slope_rate,
-                                                 block_size=block_size,
-                                                 kv_history=kv_history)
+        output, kv_history = lightning_attention(
+            q, k, v, slope_rate, block_size=block_size, kv_history=kv_history
+        )
         kv_caches.copy_(kv_history[:, :, -1, :, :].reshape(h, d, e))
         assert output.shape[0] == 1, "batch size must be 1"
         return rearrange(output.squeeze(0), "h n d -> n (h d)")
 
 
 class MiniMaxText01LinearAttention(nn.Module, MambaBase):
-
     @property
     def mamba_type(self) -> str:
         return "linear_attention"
 
     def get_attn_backend(self) -> type["AttentionBackend"]:
-        from vllm.v1.attention.backends.linear_attn import (
-            LinearAttentionBackend)
+        from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend
+
         return LinearAttentionBackend
 
     def get_state_dtype(self) -> tuple[torch.dtype]:
@@ -144,9 +138,8 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
 
     def get_state_shape(self) -> tuple[tuple[int, int, int], ...]:
         return MambaStateShapeCalculator.linear_attention_state_shape(
-            num_heads=self.num_heads,
-            tp_size=self.tp_size,
-            head_dim=self.head_dim)
+            num_heads=self.num_heads, tp_size=self.tp_size, head_dim=self.head_dim
+        )
 
     def __init__(
         self,
@@ -157,9 +150,9 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
         max_position: int,
         block_size: int,
         num_hidden_layer: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         layer_idx: int = 0,
         linear_layer_idx: int = 0,
         prefix: str = "linear_attn",
@@ -210,16 +203,16 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
             eps=1e-5,
         )
 
-        slope_rate = MiniMaxText01LinearAttention._build_slope_tensor(
-            self.num_heads)
+        slope_rate = MiniMaxText01LinearAttention._build_slope_tensor(self.num_heads)
         if num_hidden_layer <= 1:
             self.slope_rate = slope_rate * (1 + 1e-5)
         else:
-            self.slope_rate = slope_rate * (1 - layer_idx /
-                                            (num_hidden_layer - 1) + 1e-5)
-        self.tp_slope = self.slope_rate[self.tp_rank *
-                                        self.tp_heads:(self.tp_rank + 1) *
-                                        self.tp_heads].contiguous()
+            self.slope_rate = slope_rate * (
+                1 - layer_idx / (num_hidden_layer - 1) + 1e-5
+            )
+        self.tp_slope = self.slope_rate[
+            self.tp_rank * self.tp_heads : (self.tp_rank + 1) * self.tp_heads
+        ].contiguous()
 
         compilation_config = get_current_vllm_config().compilation_config
         if prefix in compilation_config.static_forward_context:
@@ -227,36 +220,36 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
         compilation_config.static_forward_context[prefix] = self
 
     @staticmethod
-    def weight_direct_load(param: torch.Tensor,
-                           loaded_weight: torch.Tensor) -> None:
+    def weight_direct_load(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
         assert param.size() == loaded_weight.size()
         param.data.copy_(loaded_weight)
         return
 
     @staticmethod
     def _build_slope_tensor(n_attention_heads: int):
-
         def get_slopes(n):
-
             def get_slopes_power_of_2(n):
-                start = 2**(-(2**-(math.log2(n) - 3)))
+                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
                 ratio = start
                 return [start * ratio**i for i in range(n)]
 
             if math.log2(n).is_integer():
                 return get_slopes_power_of_2(n)
             else:
-                closest_power_of_2 = 2**math.floor(math.log2(n))
-                return (get_slopes_power_of_2(closest_power_of_2) + get_slopes(
-                    2 * closest_power_of_2)[0::2][:n - closest_power_of_2])
-
-        slopes = torch.tensor(get_slopes(n_attention_heads),
-                              dtype=torch.float32).reshape(
-                                  n_attention_heads, 1, 1)
+                closest_power_of_2 = 2 ** math.floor(math.log2(n))
+                return (
+                    get_slopes_power_of_2(closest_power_of_2)
+                    + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+                )
+
+        slopes = torch.tensor(
+            get_slopes(n_attention_heads), dtype=torch.float32
+        ).reshape(n_attention_heads, 1, 1)
         return slopes
 
-    def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor,
-                               attn_metadata):
+    def _prefill_and_mix_infer(
+        self, q, k, v, kv_cache, state_indices_tensor, attn_metadata
+    ):
         hidden = []
         for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)):
             if _prefill_idx >= len(attn_metadata.query_start_loc):
@@ -279,12 +272,13 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
                 slice_layer_cache,
                 self.tp_slope,
                 self.BLOCK,
-                layer_idx=self.layer_idx)
+                layer_idx=self.layer_idx,
+            )
             hidden.append(out_slice.contiguous())
         if attn_metadata.num_decode_tokens > 0:
-            hidden_decode = self._decode_infer(q, k, v, kv_cache,
-                                               state_indices_tensor,
-                                               attn_metadata)
+            hidden_decode = self._decode_infer(
+                q, k, v, kv_cache, state_indices_tensor, attn_metadata
+            )
             hidden.insert(0, hidden_decode)
 
         if not hidden:
@@ -293,18 +287,19 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
         hidden = torch.concat(hidden, dim=0).contiguous()
         return hidden
 
-    def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor,
-                      attn_metadata):
-        q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-        k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-        v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-        slot_id = state_indices_tensor[:attn_metadata.num_decodes]
-        hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope,
-                                              slot_id, 32)
+    def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
+        q = q[: attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
+        k = k[: attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
+        v = v[: attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
+        slot_id = state_indices_tensor[: attn_metadata.num_decodes]
+        hidden = linear_decode_forward_triton(
+            q, k, v, kv_cache, self.tp_slope, slot_id, 32
+        )
         return hidden
 
-    def forward(self, hidden_states: torch.Tensor, output: torch.Tensor,
-                positions: torch.Tensor) -> None:
+    def forward(
+        self, hidden_states: torch.Tensor, output: torch.Tensor, positions: torch.Tensor
+    ) -> None:
         torch.ops.vllm.linear_attention(
             hidden_states,
             output,
@@ -312,16 +307,18 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
             self.prefix,
         )
 
-    def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor,
-                 positions: torch.Tensor) -> None:
+    def _forward(
+        self, hidden_states: torch.Tensor, output: torch.Tensor, positions: torch.Tensor
+    ) -> None:
         forward_context = get_forward_context()
         attn_metadata: AttentionMetadata = forward_context.attn_metadata
         if attn_metadata is not None:
             assert isinstance(attn_metadata, dict)
             attn_metadata = attn_metadata[self.prefix]
             assert isinstance(attn_metadata, LinearAttentionMetadata)
-            num_actual_tokens = attn_metadata.num_prefill_tokens + \
-                attn_metadata.num_decode_tokens
+            num_actual_tokens = (
+                attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
+            )
         else:
             num_actual_tokens = hidden_states.shape[0]
 
@@ -336,35 +333,39 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
 
             num_prefills = getattr(attn_metadata, "num_prefills", 0)
             if num_prefills > 0:
-                num_decode_tokens = getattr(attn_metadata, "num_decode_tokens",
-                                            0)
+                num_decode_tokens = getattr(attn_metadata, "num_decode_tokens", 0)
                 for prefill_idx in range(num_prefills):
-                    q_start = attn_metadata.query_start_loc[num_decode_tokens +
-                                                            prefill_idx]
-                    q_end = attn_metadata.query_start_loc[num_decode_tokens +
-                                                          prefill_idx + 1]
+                    q_start = attn_metadata.query_start_loc[
+                        num_decode_tokens + prefill_idx
+                    ]
+                    q_end = attn_metadata.query_start_loc[
+                        num_decode_tokens + prefill_idx + 1
+                    ]
                     query_len = q_end - q_start
-                    context_len = attn_metadata.seq_lens[
-                        num_decode_tokens + prefill_idx] - query_len
+                    context_len = (
+                        attn_metadata.seq_lens[num_decode_tokens + prefill_idx]
+                        - query_len
+                    )
                     if context_len == 0:
-                        block_to_clear = state_indices_tensor[num_decode_tokens
-                                                              + prefill_idx]
+                        block_to_clear = state_indices_tensor[
+                            num_decode_tokens + prefill_idx
+                        ]
                         kv_cache[block_to_clear, ...] = 0
 
         decode_only = getattr(attn_metadata, "num_prefills", 0) == 0
         if attn_metadata is None:
-            hidden = torch.empty((q.shape[0], q.shape[1] * q.shape[2]),
-                                 device=q.device,
-                                 dtype=q.dtype)
+            hidden = torch.empty(
+                (q.shape[0], q.shape[1] * q.shape[2]), device=q.device, dtype=q.dtype
+            )
         else:
             if not decode_only:
-                hidden = self._prefill_and_mix_infer(q, k, v, kv_cache,
-                                                     state_indices_tensor,
-                                                     attn_metadata)
+                hidden = self._prefill_and_mix_infer(
+                    q, k, v, kv_cache, state_indices_tensor, attn_metadata
+                )
             else:
-                hidden = self._decode_infer(q, k, v, kv_cache,
-                                            state_indices_tensor,
-                                            attn_metadata)
+                hidden = self._decode_infer(
+                    q, k, v, kv_cache, state_indices_tensor, attn_metadata
+                )
         hidden = self.norm._forward(hidden)
         gate, _ = self.output_gate(hidden_states[:num_actual_tokens])
         hidden = F.sigmoid(gate) * hidden
@@ -381,9 +382,7 @@ def linear_attention(
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    self._forward(hidden_states=hidden_states,
-                  output=output,
-                  positions=positions)
+    self._forward(hidden_states=hidden_states, output=output, positions=positions)
 
 
 def linear_attention_fake(
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index d64854cdb3818bd7e73b215ee514d487c03b6c34..b6345b8af7f0a383abd1aaaa90272f106b68b338 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, NamedTuple, Optional
+from typing import TYPE_CHECKING, NamedTuple
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -12,22 +12,32 @@ from torch.nn.parameter import Parameter
 
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_scan_fn, selective_state_update)
+    selective_scan_fn,
+    selective_state_update,
+)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
 
 
@@ -44,22 +54,24 @@ class MambaMixer(MambaBase, CustomOp):
     **selective** state spaces)
     """
 
-    def __init__(self,
-                 hidden_size: int,
-                 ssm_state_size: int,
-                 conv_kernel_size: int,
-                 intermediate_size: int,
-                 time_step_rank: int,
-                 use_conv_bias: bool,
-                 use_bias: bool,
-                 use_rms_norm: bool,
-                 rms_norm_has_weight: bool = True,
-                 rms_norm_eps: float = 1e-5,
-                 activation="silu",
-                 is_lora_enabled: bool = False,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        ssm_state_size: int,
+        conv_kernel_size: int,
+        intermediate_size: int,
+        time_step_rank: int,
+        use_conv_bias: bool,
+        use_bias: bool,
+        use_rms_norm: bool,
+        rms_norm_has_weight: bool = True,
+        rms_norm_eps: float = 1e-5,
+        activation="silu",
+        is_lora_enabled: bool = False,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.time_step_rank = time_step_rank
         self.ssm_state_size = ssm_state_size
@@ -80,9 +92,9 @@ class MambaMixer(MambaBase, CustomOp):
         # doesn't allow to override it
         self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
 
-        self.in_proj = MergedColumnParallelLinear(hidden_size,
-                                                  [intermediate_size] * 2,
-                                                  bias=use_bias)
+        self.in_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2, bias=use_bias
+        )
 
         # selective projection used to make dt, B and C input dependent
         self.x_proj = RowParallelLinear(
@@ -93,17 +105,18 @@ class MambaMixer(MambaBase, CustomOp):
         # time step projection (discretization) -
         # In the forward we need to apply dt_proj without the bias,
         # as the bias is added in the selective scan kernel.
-        self.dt_proj = ColumnParallelLinear(time_step_rank,
-                                            intermediate_size,
-                                            bias=True,
-                                            skip_bias_add=True)
+        self.dt_proj = ColumnParallelLinear(
+            time_step_rank, intermediate_size, bias=True, skip_bias_add=True
+        )
 
         def weight_loader(param: Parameter, loaded_weight: torch.Tensor):
             tp_rank = get_tensor_model_parallel_rank()
             tp_size = get_tensor_model_parallel_world_size()
             param.data.copy_(
-                loaded_weight.data.split(loaded_weight.shape[0] // tp_size,
-                                         dim=0)[tp_rank])
+                loaded_weight.data.split(loaded_weight.shape[0] // tp_size, dim=0)[
+                    tp_rank
+                ]
+            )
 
         def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor):
             weight_loader(param, -torch.exp(loaded_weight.float()))
@@ -114,7 +127,8 @@ class MambaMixer(MambaBase, CustomOp):
                 intermediate_size // tp_size,
                 ssm_state_size,
                 dtype=torch.float32,
-            ))
+            )
+        )
         self.D = nn.Parameter(torch.ones(intermediate_size // tp_size))
 
         set_weight_attrs(self.D, {"weight_loader": weight_loader})
@@ -127,23 +141,35 @@ class MambaMixer(MambaBase, CustomOp):
             input_is_parallel=True,
         )
 
-        self.dt_layernorm = RMSNorm(
-            time_step_rank,
-            eps=rms_norm_eps,
-            has_weight=rms_norm_has_weight,
-        ) if use_rms_norm else None
+        self.dt_layernorm = (
+            RMSNorm(
+                time_step_rank,
+                eps=rms_norm_eps,
+                has_weight=rms_norm_has_weight,
+            )
+            if use_rms_norm
+            else None
+        )
 
-        self.b_layernorm = RMSNorm(
-            ssm_state_size,
-            eps=rms_norm_eps,
-            has_weight=rms_norm_has_weight,
-        ) if use_rms_norm else None
+        self.b_layernorm = (
+            RMSNorm(
+                ssm_state_size,
+                eps=rms_norm_eps,
+                has_weight=rms_norm_has_weight,
+            )
+            if use_rms_norm
+            else None
+        )
 
-        self.c_layernorm = RMSNorm(
-            ssm_state_size,
-            eps=rms_norm_eps,
-            has_weight=rms_norm_has_weight,
-        ) if use_rms_norm else None
+        self.c_layernorm = (
+            RMSNorm(
+                ssm_state_size,
+                eps=rms_norm_eps,
+                has_weight=rms_norm_has_weight,
+            )
+            if use_rms_norm
+            else None
+        )
 
         compilation_config = get_current_vllm_config().compilation_config
         if prefix in compilation_config.static_forward_context:
@@ -157,7 +183,7 @@ class MambaMixer(MambaBase, CustomOp):
         self.prefix = prefix
 
     def _ssm_transform(
-            self, x: torch.Tensor
+        self, x: torch.Tensor
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         if self.is_lora_enabled:
             #  Lora kernel requires contiguous tensor.
@@ -167,7 +193,8 @@ class MambaMixer(MambaBase, CustomOp):
         time_step, B, C = torch.split(
             ssm_params,
             [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
-            dim=-1)
+            dim=-1,
+        )
         if self.use_rms_norm:
             assert self.dt_layernorm is not None
             assert self.b_layernorm is not None
@@ -185,8 +212,7 @@ class MambaMixer(MambaBase, CustomOp):
             self.prefix,
         )
 
-    def forward_native(self, hidden_states: torch.Tensor,
-                       output: torch.Tensor):
+    def forward_native(self, hidden_states: torch.Tensor, output: torch.Tensor):
         pass
 
     def forward_cuda(self, hidden_states: torch.Tensor, output: torch.Tensor):
@@ -215,25 +241,29 @@ class MambaMixer(MambaBase, CustomOp):
         forward_context: ForwardContext = get_forward_context()
         attn_metadata = forward_context.attn_metadata
 
+        assert self.cache_config is not None
+        mamba_block_size = self.cache_config.mamba_block_size
+        prefix_caching_enabled = self.cache_config.enable_prefix_caching
+
         if attn_metadata is not None:
             assert isinstance(attn_metadata, dict)
             attn_metadata = attn_metadata[self.prefix]
-            mamba1_metadata = attn_metadata
-            assert isinstance(mamba1_metadata, Mamba1AttentionMetadata)
-            query_start_loc = mamba1_metadata.query_start_loc
-            state_indices_tensor = mamba1_metadata.state_indices_tensor
+            assert isinstance(attn_metadata, Mamba1AttentionMetadata)
+            query_start_loc_p = attn_metadata.query_start_loc_p
+            state_indices_tensor = attn_metadata.state_indices_tensor
             self_kv_cache = self.kv_cache[forward_context.virtual_engine]
             conv_state = self_kv_cache[0].transpose(-1, -2)
             ssm_state = self_kv_cache[1]
-            has_initial_states = mamba1_metadata.has_initial_states
-            num_padded_decodes = mamba1_metadata.num_padded_decodes
+            has_initial_states_p = attn_metadata.has_initial_states_p
+            num_padded_decodes = attn_metadata.num_padded_decodes
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
         hidden_states_BC, gate = projected_states.chunk(2, dim=-2)
 
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
 
         if attn_metadata is None:
             # V1 profile run
@@ -252,12 +282,8 @@ class MambaMixer(MambaBase, CustomOp):
             hidden_states_BC,
             gate,
             state_indices_tensor,
-            query_start_loc,
-            has_initial_states,
             num_prefill_tokens,
-            num_decode_tokens,
             num_prefills,
-            num_decodes,
             num_padded_decodes,
         )
         hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p
@@ -266,8 +292,34 @@ class MambaMixer(MambaBase, CustomOp):
         gate_d = prefill_decode_split.gate_d
         state_indices_tensor_p = prefill_decode_split.state_indices_tensor_p
         state_indices_tensor_d = prefill_decode_split.state_indices_tensor_d
-        query_start_loc_p = prefill_decode_split.query_start_loc_p
-        has_initial_states_p = prefill_decode_split.has_initial_states_p
+
+        if prefix_caching_enabled:
+            block_idx_last_computed_token_d, block_idx_last_computed_token_p = (
+                torch.split(
+                    attn_metadata.block_idx_last_computed_token,
+                    [num_decodes, num_prefills],
+                    dim=0,
+                )
+            )
+            block_idx_last_scheduled_token_d, block_idx_last_scheduled_token_p = (
+                torch.split(
+                    attn_metadata.block_idx_last_scheduled_token,
+                    [num_decodes, num_prefills],
+                    dim=0,
+                )
+            )
+
+            block_idx_first_scheduled_token_p = (
+                attn_metadata.block_idx_first_scheduled_token_p
+            )
+            num_computed_tokens_p = attn_metadata.num_computed_tokens_p
+        else:
+            block_idx_last_computed_token_d = None
+            block_idx_last_computed_token_p = None
+            block_idx_last_scheduled_token_d = None
+            block_idx_last_scheduled_token_p = None
+            block_idx_first_scheduled_token_p = None
+            num_computed_tokens_p = None
 
         ssm_outputs = []
 
@@ -281,10 +333,17 @@ class MambaMixer(MambaBase, CustomOp):
                 conv_states=conv_state,
                 has_initial_state=has_initial_states_p,
                 cache_indices=state_indices_tensor_p,
-                query_start_loc=query_start_loc_p)
+                query_start_loc=query_start_loc_p,
+                block_idx_first_scheduled_token=block_idx_first_scheduled_token_p,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_p,
+                initial_state_idx=block_idx_last_computed_token_p,
+                num_computed_tokens=num_computed_tokens_p,
+                block_size_to_align=mamba_block_size,
+            )
             # 3. State Space Model sequence transformations.
             discrete_time_step_p, B_p, C_p = self._ssm_transform(
-                conv_out_p.transpose(-2, -1))
+                conv_out_p.transpose(-2, -1)
+            )
             time_proj_bias = self._time_proj_bias()
 
             # 4. Perform the recurrence y ← SSM(A, B, C, Δ)(x)
@@ -301,10 +360,25 @@ class MambaMixer(MambaBase, CustomOp):
                 delta_softplus=True,
                 cache_indices=state_indices_tensor_p,
                 has_initial_state=has_initial_states_p,
-                query_start_loc=query_start_loc_p)
+                query_start_loc=query_start_loc_p,
+                block_size=mamba_block_size,
+                block_idx_first_scheduled_token=block_idx_first_scheduled_token_p,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_p,
+                initial_state_idx=block_idx_last_computed_token_p,
+            )
             ssm_outputs.append(scan_out_p)
 
         if has_decode:
+            if prefix_caching_enabled:
+                state_indices_tensor_d_input = state_indices_tensor_d.gather(
+                    1, block_idx_last_computed_token_d.unsqueeze(1)
+                ).squeeze(1)
+                state_indices_tensor_d_output = state_indices_tensor_d.gather(
+                    1, block_idx_last_scheduled_token_d.unsqueeze(1)
+                ).squeeze(1)
+            else:
+                state_indices_tensor_d_input = state_indices_tensor_d
+                state_indices_tensor_d_output = state_indices_tensor_d
             # 2. Convolution sequence transformation
             conv_out_d = causal_conv1d_update(
                 hidden_states_BC_d.transpose(0, 1),
@@ -312,39 +386,45 @@ class MambaMixer(MambaBase, CustomOp):
                 conv_weights,
                 self.conv1d.bias,
                 self.activation,
-                conv_state_indices=state_indices_tensor_d).transpose(0, 1)
+                conv_state_indices=state_indices_tensor_d,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_d,
+                initial_state_idx=block_idx_last_computed_token_d,
+            ).transpose(0, 1)
 
             # 3. State Space Model sequence transformation.
             discrete_time_step_d, B_d, C_d = self._ssm_transform(
-                conv_out_d.transpose(-2, -1))
+                conv_out_d.transpose(-2, -1)
+            )
             time_proj_bias = self._time_proj_bias()
 
             # 4. Perform the recurrence y ← SSM(A, B, C, Δ)(x)
-            scan_outputs_d = torch.empty_like(
-                hidden_states_BC_d.transpose(0, 1))
-            selective_state_update(ssm_state,
-                                   conv_out_d.transpose(0, 1),
-                                   discrete_time_step_d.transpose(0, 1),
-                                   self.A,
-                                   B_d,
-                                   C_d,
-                                   self.D,
-                                   gate_d.transpose(0, 1),
-                                   time_proj_bias,
-                                   dt_softplus=True,
-                                   state_batch_indices=state_indices_tensor_d,
-                                   out=scan_outputs_d)
+            scan_outputs_d = torch.empty_like(hidden_states_BC_d.transpose(0, 1))
+            selective_state_update(
+                ssm_state,
+                conv_out_d.transpose(0, 1),
+                discrete_time_step_d.transpose(0, 1),
+                self.A,
+                B_d,
+                C_d,
+                self.D,
+                gate_d.transpose(0, 1),
+                time_proj_bias,
+                dt_softplus=True,
+                state_batch_indices=state_indices_tensor_d_input,
+                dst_state_batch_indices=state_indices_tensor_d_output,
+                out=scan_outputs_d,
+            )
             scan_outputs_d = scan_outputs_d.transpose(0, 1)
 
             ssm_outputs.insert(0, scan_outputs_d)
 
-        scan_outputs_combined = ssm_outputs[0] if len(
-            ssm_outputs) == 1 else torch.cat(ssm_outputs, dim=-1)
+        scan_outputs_combined = (
+            ssm_outputs[0] if len(ssm_outputs) == 1 else torch.cat(ssm_outputs, dim=-1)
+        )
 
         # 5. Final output projection
         if self.is_lora_enabled:  # Lora kernel requires contiguous tensor.
-            scan_outputs_combined = scan_outputs_combined.transpose(
-                -2, -1).contiguous()
+            scan_outputs_combined = scan_outputs_combined.transpose(-2, -1).contiguous()
             out = self.out_proj(scan_outputs_combined)[0]
         else:
             out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0]
@@ -373,11 +453,11 @@ class MambaMixer(MambaBase, CustomOp):
         return "mamba1"
 
     def get_attn_backend(self) -> type["AttentionBackend"]:
-        from vllm.v1.attention.backends.mamba1_attn import (
-            Mamba1AttentionBackend)
+        from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend
+
         return Mamba1AttentionBackend
 
-    def _time_proj_bias(self) -> Optional[torch.Tensor]:
+    def _time_proj_bias(self) -> torch.Tensor | None:
         if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None:
             return self.dt_proj.bias.float()
         return None
@@ -390,43 +470,34 @@ class PrefillDecodeSplit(NamedTuple):
     gate_d: torch.Tensor
     state_indices_tensor_p: torch.Tensor
     state_indices_tensor_d: torch.Tensor
-    query_start_loc_p: Optional[torch.Tensor]
-    has_initial_states_p: Optional[torch.Tensor]
 
 
 def split_batch_to_prefill_and_decode(
     hidden_states_BC: torch.Tensor,
     gate: torch.Tensor,
     state_indices_tensor: torch.Tensor,
-    query_start_loc: torch.Tensor,
-    has_initial_states: Optional[torch.Tensor],
     num_prefill_tokens: int,
-    num_decode_tokens: int,
     num_prefills: int,
-    num_decodes: int,
     num_padded_decodes: int,
 ) -> PrefillDecodeSplit:
-
     num_actual_tokens = num_prefill_tokens + num_padded_decodes
 
     # In v1, decode tokens come first, then prefill tokens.
     hidden_states_BC_d, hidden_states_BC_p = torch.split(
         hidden_states_BC[..., :num_actual_tokens],
         [num_padded_decodes, num_prefill_tokens],
-        dim=-1)
-    gate_d, gate_p = torch.split(gate[..., :num_actual_tokens],
-                                 [num_padded_decodes, num_prefill_tokens],
-                                 dim=-1)
+        dim=-1,
+    )
+    gate_d, gate_p = torch.split(
+        gate[..., :num_actual_tokens], [num_padded_decodes, num_prefill_tokens], dim=-1
+    )
 
     # num_padded_decodes accounts for CUDA graph padding when applicable
     state_indices_tensor_d, state_indices_tensor_p = torch.split(
-        state_indices_tensor[:num_padded_decodes + num_prefills],
+        state_indices_tensor[: num_padded_decodes + num_prefills],
         [num_padded_decodes, num_prefills],
-        dim=0)
-    query_start_loc_p = (query_start_loc[-num_prefills - 1:] -
-                         num_padded_decodes if num_prefills > 0 else None)
-    has_initial_states_p = has_initial_states[-num_prefills:] if (
-        has_initial_states is not None and num_prefills > 0) else None
+        dim=0,
+    )
 
     return PrefillDecodeSplit(
         hidden_states_BC_p=hidden_states_BC_p,
@@ -435,8 +506,6 @@ def split_batch_to_prefill_and_decode(
         gate_d=gate_d,
         state_indices_tensor_p=state_indices_tensor_p,
         state_indices_tensor_d=state_indices_tensor_d,
-        query_start_loc_p=query_start_loc_p,
-        has_initial_states_p=has_initial_states_p,
     )
 
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 6dd09fad7a907142c3d7c46ec31bacdddcf65684..fb45afa33dad683fc7f0ad72f5f383403a23358c 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -11,30 +11,42 @@ from torch import nn
 
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
 from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
-from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_state_update)
+from vllm.model_executor.layers.mamba.ops.mamba_ssm import selective_state_update
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
-    mamba_chunk_scan_combined_varlen)
+    mamba_chunk_scan_combined_varlen,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import (
-    LoaderFunction, composed_weight_loader, sharded_weight_loader)
+    LoaderFunction,
+    composed_weight_loader,
+    sharded_weight_loader,
+)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 
 # Added by the IBM Team, 2024
@@ -43,12 +55,13 @@ from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 # Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated
 @CustomOp.register("mixer2_gated_rms_norm")
 class Mixer2RMSNormGated(CustomOp):
-
-    def __init__(self,
-                 full_hidden_size: int,
-                 full_n_groups: int,
-                 use_rms_norm: bool = True,
-                 eps: float = 1e-6):
+    def __init__(
+        self,
+        full_hidden_size: int,
+        full_n_groups: int,
+        use_rms_norm: bool = True,
+        eps: float = 1e-6,
+    ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
@@ -62,13 +75,13 @@ class Mixer2RMSNormGated(CustomOp):
         if self.use_rms_norm:
             # Register norm weight only if we're actually applying RMSNorm
             self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size))
-            set_weight_attrs(self.weight,
-                             {"weight_loader": sharded_weight_loader(0)})
+            set_weight_attrs(self.weight, {"weight_loader": sharded_weight_loader(0)})
         else:
             # Avoid checkpoint mismatch by skipping unused parameter
             self.register_parameter("weight", None)
-        assert (self.full_hidden_size % self.tp_size == 0
-                ), "Tensor parallel world size must divide hidden size."
+        assert self.full_hidden_size % self.tp_size == 0, (
+            "Tensor parallel world size must divide hidden size."
+        )
 
     def forward_native(
         self,
@@ -111,8 +124,7 @@ class Mixer2RMSNormGated(CustomOp):
             group_count = hidden_dim // self.group_size
             x_grouped = x.view(*prefix_dims, group_count, self.group_size)
             variance = x_grouped.pow(2).mean(-1, keepdim=True)
-            x_grouped = x_grouped * torch.rsqrt(variance +
-                                                self.variance_epsilon)
+            x_grouped = x_grouped * torch.rsqrt(variance + self.variance_epsilon)
             x = x_grouped.view(*prefix_dims, hidden_dim)
 
             if redundant_tp:
@@ -126,22 +138,23 @@ class Mixer2RMSNormGated(CustomOp):
         self,
         x: torch.Tensor,
         gate: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         input_dtype = x.dtype
         if not self.use_rms_norm:
             # Keep gate in float32 for numerical stability during silu
-            return x * nn.functional.silu(gate.to(
-                torch.float32)).to(input_dtype)
+            return x * nn.functional.silu(gate.to(torch.float32)).to(input_dtype)
 
-        if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
+        if ((self.n_groups % self.tp_size) != 0) or self.n_groups != 1:
             return self.forward_native(x, gate)
 
-        return rms_norm_gated(x,
-                              self.weight.data,
-                              bias=None,
-                              z=gate,
-                              eps=self.variance_epsilon,
-                              norm_before_gate=False)
+        return rms_norm_gated(
+            x,
+            self.weight.data,
+            bias=None,
+            z=gate,
+            eps=self.variance_epsilon,
+            norm_before_gate=False,
+        )
 
 
 def mamba_v2_sharded_weight_loader(
@@ -156,7 +169,6 @@ def mamba_v2_sharded_weight_loader(
     """
 
     def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
-
         # - track boundary of (sharded) param, and loaded_weight, respectively
         boundary, loaded_boundary = 0, 0
 
@@ -191,11 +203,12 @@ def mamba_v2_sharded_weight_loader(
             #   seem to handle slices well.
             # https://github.com/python/mypy/issues/2410
             param.data[
-                boundary:(boundary + take),
-                ...  # type: ignore[misc]
-            ] = loaded_weight[loaded_start_idx:(loaded_start_idx +
-                                                take)  # type: ignore[misc]
-                              ]  # type: ignore[misc]
+                boundary : (boundary + take), ...  # type: ignore[misc]
+            ] = loaded_weight[
+                loaded_start_idx : (
+                    loaded_start_idx + take
+                )  # type: ignore[misc]
+            ]  # type: ignore[misc]
 
             # move indexing boundaries
             boundary += shard_size
@@ -217,23 +230,25 @@ class MambaMixer2(MambaBase, CustomOp):
     **selective** state spaces)
     """
 
-    def __init__(self,
-                 hidden_size: int,
-                 ssm_state_size: int,
-                 conv_kernel_size: int,
-                 intermediate_size: int,
-                 use_conv_bias: bool,
-                 use_bias: bool,
-                 n_groups: int = 1,
-                 num_heads: int = 128,
-                 head_dim: int = 64,
-                 rms_norm_eps: float = 1e-5,
-                 activation: str = "silu",
-                 use_rms_norm: bool = True,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        ssm_state_size: int,
+        conv_kernel_size: int,
+        intermediate_size: int,
+        use_conv_bias: bool,
+        use_bias: bool,
+        n_groups: int = 1,
+        num_heads: int = 128,
+        head_dim: int = 64,
+        rms_norm_eps: float = 1e-5,
+        activation: str = "silu",
+        use_rms_norm: bool = True,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         # For TP, the sharding plan is as follows:
@@ -253,15 +268,18 @@ class MambaMixer2(MambaBase, CustomOp):
         self.tp_size = get_tensor_model_parallel_world_size()
         tp_rank = get_tensor_model_parallel_rank()
 
-        assert (num_heads % self.tp_size == 0
-                ), "Tensor parallel world size must divide num heads."
+        assert num_heads % self.tp_size == 0, (
+            "Tensor parallel world size must divide num heads."
+        )
 
         assert (n_groups % self.tp_size) == 0 or n_groups == 1, (
             "If tensor parallel world size does not divide num_groups, "
-            "then num_groups must equal 1.")
+            "then num_groups must equal 1."
+        )
 
-        assert (n_groups % self.tp_size == 0) or self.tp_size == 1 or \
-            quant_config is None, (
+        assert (
+            (n_groups % self.tp_size == 0) or self.tp_size == 1 or quant_config is None
+        ), (
             "Tensor parallel currently supported for quantized models only "
             "if tensor parallel world size divides num groups."
         )
@@ -280,7 +298,8 @@ class MambaMixer2(MambaBase, CustomOp):
             # - but if n_groups cannot divide tp_size, we need to
             #   extend some extra groups
             groups = MambaStateShapeCalculator.extra_groups_for_head_shards(
-                n_groups, self.tp_size)
+                n_groups, self.tp_size
+            )
             self.n_groups = n_groups + groups
 
         self.groups_ssm_state_size = self.n_groups * self.ssm_state_size
@@ -340,8 +359,7 @@ class MambaMixer2(MambaBase, CustomOp):
             #   to the head shards
             group_shard_settings = (
                 self.groups_ssm_state_size,  # expected model size
-                (self.n_groups - n_groups) *
-                self.ssm_state_size,  # extra dims assigned
+                (self.n_groups - n_groups) * self.ssm_state_size,  # extra dims assigned
                 n_groups == 1,  # if there was only one group
             )
             intermediate_settings = (intermediate_size, 0, False)
@@ -355,8 +373,7 @@ class MambaMixer2(MambaBase, CustomOp):
             set_weight_attrs(
                 self.conv1d.bias,
                 {
-                    "weight_loader":
-                    mamba_v2_sharded_weight_loader(
+                    "weight_loader": mamba_v2_sharded_weight_loader(
                         [
                             intermediate_settings,
                             group_shard_settings,
@@ -372,8 +389,7 @@ class MambaMixer2(MambaBase, CustomOp):
             set_weight_attrs(
                 self.conv1d.weight,
                 {
-                    "weight_loader":
-                    mamba_v2_sharded_weight_loader(
+                    "weight_loader": mamba_v2_sharded_weight_loader(
                         [
                             intermediate_settings,
                             group_shard_settings,
@@ -391,8 +407,7 @@ class MambaMixer2(MambaBase, CustomOp):
                 set_weight_attrs(
                     self.in_proj.weight,
                     {
-                        "weight_loader":
-                        mamba_v2_sharded_weight_loader(
+                        "weight_loader": mamba_v2_sharded_weight_loader(
                             [
                                 intermediate_settings,  # for gate
                                 intermediate_settings,
@@ -418,17 +433,18 @@ class MambaMixer2(MambaBase, CustomOp):
             torch.empty(
                 divide(num_heads, self.tp_size),
                 dtype=torch.float32,
-            ))
+            )
+        )
         self.D = nn.Parameter(torch.ones(num_heads // self.tp_size))
         self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size))
         self.use_rms_norm = use_rms_norm
 
         set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)})
         a_weight_loader = composed_weight_loader(
-            sharded_weight_loader(0), lambda x: -torch.exp(x.float()))
+            sharded_weight_loader(0), lambda x: -torch.exp(x.float())
+        )
         set_weight_attrs(self.A, {"weight_loader": a_weight_loader})
-        set_weight_attrs(self.dt_bias,
-                         {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
 
         self.out_proj = RowParallelLinear(
             intermediate_size,
@@ -439,10 +455,9 @@ class MambaMixer2(MambaBase, CustomOp):
             prefix=f"{prefix}.out_proj",
         )
 
-        self.norm = Mixer2RMSNormGated(intermediate_size,
-                                       n_groups,
-                                       self.use_rms_norm,
-                                       eps=rms_norm_eps)
+        self.norm = Mixer2RMSNormGated(
+            intermediate_size, n_groups, self.use_rms_norm, eps=rms_norm_eps
+        )
 
         compilation_config = get_current_vllm_config().compilation_config
         if prefix in compilation_config.static_forward_context:
@@ -459,7 +474,7 @@ class MambaMixer2(MambaBase, CustomOp):
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
-        mup_vector: Optional[torch.Tensor] = None,
+        mup_vector: torch.Tensor | None = None,
     ):
         pass
 
@@ -467,7 +482,7 @@ class MambaMixer2(MambaBase, CustomOp):
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
-        mup_vector: Optional[torch.Tensor] = None,
+        mup_vector: torch.Tensor | None = None,
     ):
         torch.ops.vllm.mamba_mixer2(
             hidden_states,
@@ -480,7 +495,7 @@ class MambaMixer2(MambaBase, CustomOp):
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
-        mup_vector: Optional[torch.Tensor] = None,
+        mup_vector: torch.Tensor | None = None,
     ):
         forward_context = get_forward_context()
         # attn_metadata contains metadata necessary for the mamba2 triton
@@ -489,6 +504,9 @@ class MambaMixer2(MambaBase, CustomOp):
         # stay the same and reused for all mamba layers in the same iteration
         attn_metadata: AttentionMetadata = forward_context.attn_metadata
 
+        assert self.cache_config is not None
+        mamba_block_size = self.cache_config.mamba_block_size
+        prefix_caching_enabled = self.cache_config.enable_prefix_caching
         if attn_metadata is not None:
             assert isinstance(attn_metadata, dict)
             attn_metadata = attn_metadata[self.prefix]
@@ -502,9 +520,9 @@ class MambaMixer2(MambaBase, CustomOp):
             prep_initial_states = attn_metadata.prep_initial_states
             chunk_size = attn_metadata.chunk_size
             seq_idx_p = attn_metadata.seq_idx_p
-            chunk_indices_p = attn_metadata.chunk_indices_p
-            chunk_offsets_p = attn_metadata.chunk_offsets_p
             query_start_loc_p = attn_metadata.query_start_loc_p
+            cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p
+            last_chunk_indices_p = attn_metadata.last_chunk_indices_p
 
         # 1. Gated MLP's linear projection
         projected_states, _ = self.in_proj(hidden_states)
@@ -522,8 +540,9 @@ class MambaMixer2(MambaBase, CustomOp):
             dim=-1,
         )
 
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
 
         # - get hidden_states, B and C after depthwise convolution.
         split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split(
@@ -538,10 +557,10 @@ class MambaMixer2(MambaBase, CustomOp):
 
         if attn_metadata is None:
             # profile run
-            hidden_states_B_C = (hidden_states_B_C.transpose(
-                0, 1).clone().transpose(0, 1)).contiguous()
-            hidden_states, _B, _C = split_hidden_states_B_C_fn(
-                hidden_states_B_C)
+            hidden_states_B_C = (
+                hidden_states_B_C.transpose(0, 1).clone().transpose(0, 1)
+            ).contiguous()
+            hidden_states, _B, _C = split_hidden_states_B_C_fn(hidden_states_B_C)
             hidden_states = self.norm(hidden_states, gate)
             out, _ = self.out_proj(hidden_states)
             return out
@@ -573,12 +592,42 @@ class MambaMixer2(MambaBase, CustomOp):
             dim=0,
         )
 
+        if prefix_caching_enabled:
+            # If prefix caching is enabled, retrieve the relevant variables
+            # for prefill and decode
+            block_idx_last_computed_token_d, block_idx_last_computed_token_p = (
+                torch.split(
+                    attn_metadata.block_idx_last_computed_token,
+                    [num_decodes, num_prefills],
+                    dim=0,
+                )
+            )
+            block_idx_last_scheduled_token_d, block_idx_last_scheduled_token_p = (
+                torch.split(
+                    attn_metadata.block_idx_last_scheduled_token,
+                    [num_decodes, num_prefills],
+                    dim=0,
+                )
+            )
+            # Prefill-only variables:
+            block_idx_first_scheduled_token_p = (
+                attn_metadata.block_idx_first_scheduled_token_p
+            )
+            num_computed_tokens_p = attn_metadata.num_computed_tokens_p
+        else:
+            block_idx_last_computed_token_d = None
+            block_idx_last_computed_token_p = None
+            block_idx_last_scheduled_token_d = None
+            block_idx_last_scheduled_token_p = None
+            block_idx_first_scheduled_token_p = None
+            num_computed_tokens_p = None
+
         # Preallocate output tensor to avoid memcpy cost for merging prefill
         # and decode outputs
         preallocated_ssm_out = torch.empty(
             [
                 num_prefill_tokens + num_decodes,
-                (self.num_heads // self.tp_size) * self.head_dim
+                (self.num_heads // self.tp_size) * self.head_dim,
             ],
             dtype=hidden_states.dtype,
             device=hidden_states.device,
@@ -592,10 +641,21 @@ class MambaMixer2(MambaBase, CustomOp):
         # Process prefill requests
         if has_prefill:
             # 2. Convolution sequence transformation
-            # - "cache_indices" updates the conv_state cache in positions
-            #   pointed to by "state_indices_tensor"
+            # - It will read the initial states for every sequence,
+            #   that has "has_initial_states_p" == True,
+            #   from "cache_indices", using "state_indices_tensor_p".
+            # - It updates the "conv_state" cache in positions pointed
+            #   to by "state_indices_tensor_p".
+            #   In particular, it will always write the state at the
+            #   sequence end.
+            #   In addition, "block_idx_first_scheduled_token_p" and
+            #   "block_idx_last_scheduled_token_p"
+            #   are provided (which are pointers into
+            #   "state_indices_tensor_p"), it will write additional cache
+            #   states aligned at "block_size_to_align".
             x = hidden_states_B_C_p.transpose(
-                0, 1)  # this is the form that causal-conv see
+                0, 1
+            )  # this is the form that causal-conv see
             hidden_states_B_C_p = causal_conv1d_fn(
                 x,
                 conv_weights,
@@ -604,52 +664,153 @@ class MambaMixer2(MambaBase, CustomOp):
                 conv_states=conv_state,
                 has_initial_state=has_initial_states_p,
                 cache_indices=state_indices_tensor_p,
+                block_idx_first_scheduled_token=block_idx_first_scheduled_token_p,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_p,
+                initial_state_idx=block_idx_last_computed_token_p,
+                num_computed_tokens=num_computed_tokens_p,
+                block_size_to_align=mamba_block_size,
                 metadata=attn_metadata,
-                query_start_loc=query_start_loc_p).transpose(
-                    0, 1)[:num_prefill_tokens]
+                query_start_loc=query_start_loc_p,
+            ).transpose(0, 1)[:num_prefill_tokens]
 
-            hidden_states_p, B_p, C_p = split_hidden_states_B_C_fn(
-                hidden_states_B_C_p)
+            hidden_states_p, B_p, C_p = split_hidden_states_B_C_fn(hidden_states_B_C_p)
 
             # 3. State Space Model sequence transformation
             initial_states = None
-            if (has_initial_states_p is not None and prep_initial_states):
+            if has_initial_states_p is not None and prep_initial_states:
+                kernel_ssm_indices = state_indices_tensor_p
+                if prefix_caching_enabled:
+                    kernel_ssm_indices = state_indices_tensor_p.gather(
+                        1, block_idx_last_computed_token_p.unsqueeze(1)
+                    ).squeeze(1)
                 initial_states = torch.where(
                     has_initial_states_p[:, None, None, None],
-                    ssm_state[state_indices_tensor_p], 0)
+                    ssm_state[kernel_ssm_indices],
+                    0,
+                )
 
             # NOTE: final output is an in-place update of out tensor
             varlen_states = mamba_chunk_scan_combined_varlen(
-                hidden_states_p.view(num_prefill_tokens,
-                                     self.num_heads // self.tp_size,
-                                     self.head_dim),
+                hidden_states_p.view(
+                    num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim
+                ),
                 dt_p,
                 self.A,
-                B_p.view(num_prefill_tokens, self.n_groups // self.tp_size,
-                         -1),
-                C_p.view(num_prefill_tokens, self.n_groups // self.tp_size,
-                         -1),
+                B_p.view(num_prefill_tokens, self.n_groups // self.tp_size, -1),
+                C_p.view(num_prefill_tokens, self.n_groups // self.tp_size, -1),
                 chunk_size=chunk_size,
                 D=self.D,
                 z=None,
                 dt_bias=self.dt_bias,
                 seq_idx=seq_idx_p,
-                chunk_indices=chunk_indices_p,
-                chunk_offsets=chunk_offsets_p,
                 cu_seqlens=query_start_loc_p,
+                cu_chunk_seqlens=cu_chunk_seqlen_p,
+                last_chunk_indices=last_chunk_indices_p,
                 initial_states=initial_states,
+                return_intermediate_states=prefix_caching_enabled,
                 dt_softplus=True,
                 dt_limit=(0.0, float("inf")),
-                out=preallocated_ssm_out_p.view(num_prefill_tokens, -1,
-                                                self.head_dim),
-                state_dtype=ssm_state.dtype)
+                out=preallocated_ssm_out_p.view(num_prefill_tokens, -1, self.head_dim),
+                state_dtype=ssm_state.dtype,
+            )
+
+            if prefix_caching_enabled:
+                # The chunk_stride is the number of chunks per mamba block
+                # e.g., if mamba_block_size = 512 and chunk_size = 256,
+                # then chunk_stride = 2
+                chunk_stride = mamba_block_size // chunk_size
+
+                # Save state for sequences with more than just final state
+                for seq_idx in range(num_prefills):
+                    # Block index for the first scheduled token
+                    block_idx_first_scheduled_token = block_idx_first_scheduled_token_p[
+                        seq_idx
+                    ]
+
+                    # Block index for the last scheduled token
+                    block_idx_last_scheduled_token = block_idx_last_scheduled_token_p[
+                        seq_idx
+                    ]
+
+                    # Number of blocks that need to be written
+                    n_blocks_to_fill = (
+                        block_idx_last_scheduled_token - block_idx_first_scheduled_token
+                    )
+
+                    # Skip sequences that don't have any blocks to fill
+                    if n_blocks_to_fill == 0:
+                        continue
+
+                    # Look up the state indices
+                    cache_blocks_to_fill = state_indices_tensor_p[
+                        seq_idx,
+                        block_idx_first_scheduled_token:block_idx_last_scheduled_token,
+                    ]
+
+                    # First chunk index for this sequence
+                    if seq_idx == 0:
+                        first_chunk = 0
+                    else:
+                        first_chunk = 1 + last_chunk_indices_p[seq_idx - 1]
+
+                    # First chunk that is aligned on the mamba block boundary
+                    first_aligned_chunk = first_chunk + chunk_stride - 1
+
+                    # Calculate the number of computed tokens that were not
+                    # already cached
+                    num_unaligned_computed_tokens = (
+                        num_computed_tokens_p[seq_idx] % mamba_block_size
+                    )
 
-            # update ssm states
-            # - varlen state is a (num_prefills, nheads, headdim, dstate) tensor
-            ssm_state[state_indices_tensor_p] = varlen_states
+                    if num_unaligned_computed_tokens > 0:
+                        # If the number of computed tokens is not block aligned,
+                        # then we need to shift the index accordingly
+                        first_aligned_chunk -= (
+                            num_unaligned_computed_tokens // chunk_size
+                        )
+
+                    # Get states to write
+                    from_where = varlen_states[
+                        first_aligned_chunk : first_aligned_chunk
+                        + n_blocks_to_fill * chunk_stride : chunk_stride
+                    ]
+
+                    # Write the states
+                    ssm_state[cache_blocks_to_fill] = from_where
+
+                # For all seqs, store the last state (note: might be partial):
+                ssm_state[
+                    state_indices_tensor_p.gather(
+                        1, block_idx_last_scheduled_token_p.unsqueeze(1)
+                    ).squeeze(1)
+                ] = varlen_states[last_chunk_indices_p]
+
+            else:
+                # update ssm states
+                # - varlen state is a (num_prefills, nheads, headdim, dstate)
+                #   tensor
+                ssm_state[state_indices_tensor_p] = varlen_states
 
         # Process decode requests
         if has_decode:
+            if prefix_caching_enabled:
+                state_indices_tensor_d_input = state_indices_tensor_d.gather(
+                    1, block_idx_last_computed_token_d.unsqueeze(1)
+                ).squeeze(1)
+                state_indices_tensor_d_output = state_indices_tensor_d.gather(
+                    1, block_idx_last_scheduled_token_d.unsqueeze(1)
+                ).squeeze(1)
+                # for decode:
+                #   block_idx_first_scheduled_token_d ==
+                #       block_idx_last_scheduled_token_d
+                # at block boundaries:
+                #   block_idx_first_scheduled_token_d >
+                #       block_idx_last_computed_token_d
+            else:
+                # Without caching, read and write in-place to the same blocks:
+                state_indices_tensor_d_input = state_indices_tensor_d
+                state_indices_tensor_d_output = state_indices_tensor_d
+
             # 2. Convolution sequence transformation
             hidden_states_B_C_d = causal_conv1d_update(
                 hidden_states_B_C_d,
@@ -657,22 +818,28 @@ class MambaMixer2(MambaBase, CustomOp):
                 conv_weights,
                 self.conv1d.bias,
                 self.activation,
-                conv_state_indices=state_indices_tensor_d)
+                conv_state_indices=state_indices_tensor_d,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_d,
+                initial_state_idx=block_idx_last_computed_token_d,
+            )
 
-            hidden_states_d, B_d, C_d = split_hidden_states_B_C_fn(
-                hidden_states_B_C_d)
+            hidden_states_d, B_d, C_d = split_hidden_states_B_C_fn(hidden_states_B_C_d)
 
             # 3. State Space Model sequence transformation
             n_groups = self.n_groups // self.tp_size
-            A_d = self.A[:, None, ...][:, :, None].expand(
-                -1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            A_d = (
+                self.A[:, None, ...][:, :, None]
+                .expand(-1, self.head_dim, self.ssm_state_size)
+                .to(dtype=torch.float32)
+            )
             dt_d = dt_d[:, :, None].expand(-1, -1, self.head_dim)
             dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
             D_d = self.D[:, None, ...].expand(-1, self.head_dim)
             B_d = B_d.view(-1, n_groups, B_d.shape[1] // n_groups)
             C_d = C_d.view(-1, n_groups, C_d.shape[1] // n_groups)
             hidden_states_d = hidden_states_d.view(
-                -1, self.num_heads // self.tp_size, self.head_dim)
+                -1, self.num_heads // self.tp_size, self.head_dim
+            )
 
             # - the hidden is reshaped into (bs, num_heads, head_dim)
             # - mamba_cache_params.ssm_state's slots will be selected
@@ -689,17 +856,16 @@ class MambaMixer2(MambaBase, CustomOp):
                 z=None,
                 dt_bias=dt_bias,
                 dt_softplus=True,
-                state_batch_indices=state_indices_tensor_d,
-                out=preallocated_ssm_out_d.view(num_decodes, -1,
-                                                self.head_dim),
+                state_batch_indices=state_indices_tensor_d_input,
+                dst_state_batch_indices=state_indices_tensor_d_output,
+                out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim),
             )
 
         # 4. gated MLP
         # GatedRMSNorm internally applying SiLU to the gate
         # SiLU is applied internally before normalization, unlike standard
         # norm usage
-        hidden_states = self.norm(preallocated_ssm_out,
-                                  gate[:num_actual_tokens])
+        hidden_states = self.norm(preallocated_ssm_out, gate[:num_actual_tokens])
 
         # 5. Final linear projection
         output[:num_actual_tokens], _ = self.out_proj(hidden_states)
@@ -729,8 +895,8 @@ class MambaMixer2(MambaBase, CustomOp):
         return "mamba2"
 
     def get_attn_backend(self) -> type["AttentionBackend"]:
-        from vllm.v1.attention.backends.mamba2_attn import (
-            Mamba2AttentionBackend)
+        from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend
+
         return Mamba2AttentionBackend
 
 
@@ -738,20 +904,18 @@ def mamba_mixer2(
     hidden_states: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    mup_vector: Optional[torch.Tensor] = None,
+    mup_vector: torch.Tensor | None = None,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    self.forward_cuda(hidden_states=hidden_states,
-                      output=output,
-                      mup_vector=mup_vector)
+    self.forward_cuda(hidden_states=hidden_states, output=output, mup_vector=mup_vector)
 
 
 def mamba_mixer2_fake(
     hidden_states: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    mup_vector: Optional[torch.Tensor] = None,
+    mup_vector: torch.Tensor | None = None,
 ) -> None:
     return
 
diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py
index 677a4b9d87fc7b6a066deb9124454626458252e1..831dab2fbb01c49b446576864d2151446f00ffc4 100644
--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -1,87 +1,96 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Union
 
 import torch
 
-from vllm.config import MambaDType, ModelDType
+from vllm.config.cache import MambaDType
+from vllm.config.model import ModelDType
 from vllm.distributed import divide
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_kv_cache_torch_dtype
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    get_kv_cache_torch_dtype,
+)
 
 
 class MambaStateDtypeCalculator:
-
     @classmethod
     def linear_attention_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
         # TODO (tdoublep) requires testing
         if mamba_cache_dtype == "float32":
             raise ValueError("fp32 state for minimax is not yet supported")
         state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
-        return (state_dtype, )
+        return (state_dtype,)
 
     @classmethod
     def mamba1_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        return cls._mamba_state_dtype(model_dtype, mamba_cache_dtype,
-                                      mamba_ssm_cache_dtype)
+        return cls._mamba_state_dtype(
+            model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype
+        )
 
     @classmethod
     def mamba2_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        return cls._mamba_state_dtype(model_dtype, mamba_cache_dtype,
-                                      mamba_ssm_cache_dtype)
+        return cls._mamba_state_dtype(
+            model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype
+        )
 
     @classmethod
     def _mamba_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype,
-                                                    model_dtype)
+        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
         if mamba_ssm_cache_dtype == "auto":
             temporal_state_dtype = conv_state_dtype
         else:
-            temporal_state_dtype = (
-                STR_DTYPE_TO_TORCH_DTYPE[mamba_ssm_cache_dtype])
+            temporal_state_dtype = STR_DTYPE_TO_TORCH_DTYPE[mamba_ssm_cache_dtype]
 
         return (conv_state_dtype, temporal_state_dtype)
 
     @classmethod
     def short_conv_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype,
-                                                    model_dtype)
-        return (conv_state_dtype, )
+        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
+        return (conv_state_dtype,)
 
     @classmethod
     def gated_delta_net_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, torch.dtype]:
         state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
         return (state_dtype, state_dtype)
 
+    @classmethod
+    def kda_state_dtype(
+        cls,
+        model_dtype: ModelDType | torch.dtype,
+        mamba_cache_dtype: MambaDType,
+    ):
+        state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
+        return (state_dtype, state_dtype, state_dtype, torch.float32)
+
 
 class MambaStateShapeCalculator:
-
     @classmethod
     def linear_attention_state_shape(
         cls,
@@ -89,9 +98,8 @@ class MambaStateShapeCalculator:
         tp_size: int,
         head_dim: int,
     ) -> tuple[tuple[int, int, int], ...]:
-
         state_shape = (num_heads // tp_size, head_dim, head_dim)
-        return (state_shape, )
+        return (state_shape,)
 
     @classmethod
     def mamba1_state_shape(
@@ -101,11 +109,9 @@ class MambaStateShapeCalculator:
         state_size: int,
         conv_kernel: int,
     ) -> tuple[tuple[int, int], tuple[int, int]]:
-        conv_state_shape = (divide(intermediate_size,
-                                   tp_world_size), conv_kernel - 1)
+        conv_state_shape = (divide(intermediate_size, tp_world_size), conv_kernel - 1)
 
-        temporal_state_shape = (divide(intermediate_size,
-                                       tp_world_size), state_size)
+        temporal_state_shape = (divide(intermediate_size, tp_world_size), state_size)
 
         conv_state_shape = conv_state_shape[1], conv_state_shape[0]
 
@@ -124,8 +130,7 @@ class MambaStateShapeCalculator:
     ) -> tuple[tuple[int, int], tuple[int, int, int]]:
         # if n_groups is not divisible by world_size, need to extend the shards
         # to ensure all groups needed by a head is sharded along with it
-        n_groups = n_groups + cls.extra_groups_for_head_shards(
-            n_groups, tp_world_size)
+        n_groups = n_groups + cls.extra_groups_for_head_shards(n_groups, tp_world_size)
         # heads and n_groups are TP-ed
         conv_dim = intermediate_size + 2 * n_groups * state_size
 
@@ -135,8 +140,7 @@ class MambaStateShapeCalculator:
         # These are not TP-ed as they depend on A, dt_bias, D
         # - they are typically small
         #   e.g., (h_heads, head_dim, state_size) = (128, 64, 128)
-        temporal_state_shape = (divide(num_heads,
-                                       tp_world_size), head_dim, state_size)
+        temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
         return conv_state_shape, temporal_state_shape
 
     @classmethod
@@ -148,7 +152,7 @@ class MambaStateShapeCalculator:
     ) -> tuple[tuple[int, int]]:
         conv_dim = divide(intermediate_size, tp_world_size)
         conv_state_shape = (conv_kernel - 1, conv_dim)
-        return (conv_state_shape, )
+        return (conv_state_shape,)
 
     @classmethod
     def extra_groups_for_head_shards(cls, ngroups: int, tp_size: int):
@@ -173,7 +177,7 @@ class MambaStateShapeCalculator:
         conv_kernel_size: int,
         num_spec: int = 0,
     ):
-        conv_dim = (head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads)
+        conv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads
         conv_state_shape = (
             divide(conv_dim, tp_world_size),
             conv_kernel_size - 1 + num_spec,
@@ -181,6 +185,41 @@ class MambaStateShapeCalculator:
 
         conv_state_shape = conv_state_shape[1], conv_state_shape[0]
 
-        temporal_state_shape = (divide(num_v_heads,
-                                       tp_world_size), head_k_dim, head_v_dim)
+        temporal_state_shape = (
+            divide(num_v_heads, tp_world_size),
+            head_k_dim,
+            head_v_dim,
+        )
         return conv_state_shape, temporal_state_shape
+
+    @classmethod
+    def kda_state_shape(
+        cls,
+        tp_world_size: int,
+        num_heads: int,
+        head_dim: int,
+        num_k_heads: int | None = None,
+        head_k_dim: int | None = None,
+        conv_kernel_size: int = 4,
+        num_spec: int = 0,
+    ) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int, int]]:
+        if num_k_heads is None:
+            num_k_heads = num_heads
+        if head_k_dim is None:
+            head_k_dim = head_dim
+
+        proj_size = num_heads * head_dim
+        proj_k_size = num_k_heads * head_k_dim
+
+        conv_state_shape = (divide(proj_size, tp_world_size), conv_kernel_size - 1)
+        conv_state_k_shape = (divide(proj_k_size, tp_world_size), conv_kernel_size - 1)
+        recurrent_state_shape = (divide(num_heads, tp_world_size), head_dim, head_dim)
+
+        conv_state_shape = conv_state_shape[1], conv_state_shape[0]
+        conv_state_k_shape = conv_state_k_shape[1], conv_state_k_shape[0]
+        return (
+            conv_state_shape,
+            conv_state_k_shape,
+            conv_state_k_shape,
+            recurrent_state_shape,
+        )
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index c4102c4753c7e14484e810cc8dfaf32a500d1158..83c2c5f11e187b262af36daabc159912aa6510da 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -4,7 +4,6 @@
 # Copyright (c) 2024, Tri Dao.
 # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
 
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -20,40 +19,41 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
     w_ptr,  # (dim, width)
     bias_ptr,
     initial_states_ptr,  # conv_states_ptr
-    cache_indices_ptr,  # conv_state_indices_ptr
+    cache_indices_ptr,  # (batch, n_blocks + padding) The second dimension contains
+    # the block indices relevant for each sequence
+    # plus potential 0-padding at the beginning and at the end
     has_initial_states_ptr,
     query_start_loc_ptr,
     batch_ptr,
     token_chunk_offset_ptr,
+    block_idx_first_scheduled_token,  # (batch,)
+    block_idx_last_scheduled_token,  # (batch,)
+    initial_state_idx,  # (batch,)
+    num_computed_tokens,  # (batch,)
     o_ptr,  # (dim, seqlen) - actually pointing to x_ptr
     # Matrix dimensions
-    batch: tl.int32,  # actually padded_batch
     dim: tl.constexpr,
     seqlen: tl.int32,  # cu_seqlen
     num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
     # Strides
-    stride_x_seq: tl.constexpr,  # stride to get to next sequence,
     stride_x_dim: tl.constexpr,  # stride to get to next feature-value,
-    stride_x_token: tl.
-    constexpr,  # stride to get to next token (same feature-index, same sequence-index)
+    stride_x_token: tl.constexpr,  # stride to get to next token (same feature-index, same sequence-index)
     stride_w_dim: tl.constexpr,  # stride to get to next dim-axis value
     stride_w_width: tl.constexpr,  # stride to get to next width-axis value
     stride_istate_seq: tl.constexpr,
     stride_istate_dim: tl.constexpr,
     stride_istate_token: tl.constexpr,
     stride_cache_indices: tl.constexpr,
-    stride_o_seq: tl.constexpr,
     stride_o_dim: tl.constexpr,
     stride_o_token: tl.constexpr,
+    stride_block_m: tl.constexpr,  # Stride block to align divided by BLOCK_M
     # others
     pad_slot_id: tl.constexpr,
     # Meta-parameters
     HAS_BIAS: tl.constexpr,
     KERNEL_WIDTH: tl.constexpr,
     SILU_ACTIVATION: tl.constexpr,
-    HAS_INITIAL_STATES: tl.constexpr,
-    HAS_CACHE: tl.constexpr,
-    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_APC_ENABLED: tl.constexpr,
     USE_PAD_SLOT: tl.constexpr,
     NP2_STATELEN: tl.constexpr,
     BLOCK_M: tl.constexpr,
@@ -64,7 +64,9 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
     stride_conv_state_seq = stride_istate_seq
     stride_conv_state_dim = stride_istate_dim
     stride_conv_state_tok = stride_istate_token
-    state_len = KERNEL_WIDTH - 1  # can be passed via argument if it's not the same as this value
+    state_len = (
+        KERNEL_WIDTH - 1
+    )  # can be passed via argument if it's not the same as this value
 
     # one program handles one chunk in a single sequence
     # rather than mixing sequences - to make updating initial_states across sequences efficiently
@@ -84,27 +86,62 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
     # find the actual sequence length
     seqlen = sequence_end_index - sequence_start_index
 
+    B_size: tl.constexpr = stride_block_m * BLOCK_M
+
+    if IS_APC_ENABLED:
+        # Handle the case if prefix caching is enabled.
+        # In particular, if prefix caching is enabled, the program write additional cache states to "cache_indices_ptr"
+
+        # Get the length of the completed sequence so far and compute the offset.
+        current_first_index = tl.load(block_idx_first_scheduled_token + idx_seq)
+        current_last_index = tl.load(block_idx_last_scheduled_token + idx_seq)
+        sequence_completed_index = tl.load(num_computed_tokens + idx_seq)
+
+        # Compute the offset where the first stride_block_m-aligned first full block is
+        # Value in "token-space"
+        sequence_completed_offset_token = sequence_completed_index % B_size
+        seq_completed_offset = B_size - sequence_completed_offset_token
+        seq_end_offset = (seqlen - seq_completed_offset) % B_size
+        last_full_block_token_index = sequence_end_index - seq_end_offset
+        # If the sequence without the sequence_offset_index is stride_cache_chunk-aligned, then the last full chunk is the second-to-last one
+        if seq_end_offset == 0:
+            last_full_block_token_index = last_full_block_token_index - B_size
+
+        # Get the number of blocks to be filled for the current sequence
+        # If n_block_to_fill = 0, then only the state at the sequence end is stored
+        n_block_to_fill = current_last_index - current_first_index
+
+        # Get the index of the init block
+        conv_state_init_index = tl.load(initial_state_idx + idx_seq)
+    else:
+        n_block_to_fill = 0
+        current_last_index = 0
+        conv_state_init_index = 0
+        current_first_index = 0
+        last_full_block_token_index = 0
+
     token_offset = BLOCK_M * chunk_offset
     segment_len = min(BLOCK_M, seqlen - token_offset)
 
     # base of the sequence
-    x_base = x_ptr + sequence_start_index * stride_x_token + idx_feats * stride_x_dim  # [BLOCK_N,]
+    x_base = (
+        x_ptr + sequence_start_index * stride_x_token + idx_feats * stride_x_dim
+    )  # [BLOCK_N,]
+
+    # cache_idx
+    conv_states_input_coord = tl.load(
+        conv_state_indices_ptr + idx_seq * stride_cache_indices + conv_state_init_index
+    ).to(tl.int64)
 
-    if IS_CONTINUOUS_BATCHING:
-        # cache_idx
-        conv_state_batch_coord = tl.load(conv_state_indices_ptr +
-                                         idx_seq * stride_cache_indices).to(
-                                             tl.int64)
-    else:
-        # cache_idx
-        conv_state_batch_coord = idx_seq
     if USE_PAD_SLOT:  # noqa
-        if conv_state_batch_coord == pad_slot_id:
+        if conv_states_input_coord == pad_slot_id:
             # not processing as this is not the actual sequence
             return
-    conv_states_base = (conv_states_ptr +
-                        (conv_state_batch_coord * stride_conv_state_seq) +
-                        (idx_feats * stride_conv_state_dim))  # [BLOCK_N,]
+    conv_states_base = (
+        conv_states_ptr
+        + (conv_states_input_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )  # [BLOCK_N,]
 
     w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
 
@@ -113,14 +150,10 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
     # 2. update conv_state with new data [only by the Triton program handles chunk_offset=0]
     if chunk_offset == 0:
         # read from conv_states
-        load_init_state = False
-        if HAS_INITIAL_STATES:  # the new HAS_INITIAL_STATES
-            load_init_state = tl.load(has_initial_states_ptr + idx_seq).to(
-                tl.int1)
+        load_init_state = tl.load(has_initial_states_ptr + idx_seq).to(tl.int1)
         if load_init_state:
             # load from conv_states
-            prior_tokens = conv_states_base + (state_len -
-                                               1) * stride_conv_state_tok
+            prior_tokens = conv_states_base + (state_len - 1) * stride_conv_state_tok
             mask_w = idx_feats < dim
             if KERNEL_WIDTH == 2:
                 conv_states_ptrs = prior_tokens  # [BLOCK_N]
@@ -150,40 +183,56 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
             # prior-tokens are zeros
             if KERNEL_WIDTH >= 2:  # STRATEGY1
                 # first chunk and does not have prior-token, so just set to 0
-                col0 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty)
+                col0 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
             if KERNEL_WIDTH >= 3:  # STRATEGY1
-                col1 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty)
+                col1 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
             if KERNEL_WIDTH >= 4:  # STRATEGY1
-                col2 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty)
+                col2 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
             if KERNEL_WIDTH >= 5:  # STRATEGY1
-                col3 = tl.zeros((BLOCK_N, ), dtype=x_ptr.dtype.element_ty)
+                col3 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
 
         # STEP 2:
         # here prepare data for updating conv_state
-        if state_len <= seqlen:  # SMALL_CACHE=True (only move part of 'x' into conv_state cache)
+        if (
+            state_len <= seqlen
+        ):  # SMALL_CACHE=True (only move part of 'x' into conv_state cache)
             # just read from 'x'
             # copy 'x' data to conv_state
             # load only 'x' data (and set 0 before 'x' if seqlen < state_len)
             idx_tokens_last = (seqlen - state_len) + tl.arange(
-                0, NP2_STATELEN)  # [BLOCK_M]
-            x_ptrs = x_ptr + (
-                (sequence_start_index + idx_tokens_last) *
-                stride_x_token)[:, None] + (
-                    idx_feats * stride_x_dim)[None, :]  # [BLOCK_M,BLOCK_N,]
-            mask_x = ((idx_tokens_last >= 0)[:, None] &
-                      (idx_tokens_last < seqlen)[:, None] &
-                      (idx_feats < dim)[None, :]
-                      )  # token-index  # token-index  # feature-index
+                0, NP2_STATELEN
+            )  # [BLOCK_M]
+            x_ptrs = (
+                x_ptr
+                + ((sequence_start_index + idx_tokens_last) * stride_x_token)[:, None]
+                + (idx_feats * stride_x_dim)[None, :]
+            )  # [BLOCK_M,BLOCK_N,]
+            mask_x = (
+                (idx_tokens_last >= 0)[:, None]
+                & (idx_tokens_last < seqlen)[:, None]
+                & (idx_feats < dim)[None, :]
+            )  # token-index  # token-index  # feature-index
             loaded_x = tl.load(x_ptrs, mask_x, 0.0)
-            new_conv_state = tl.load(x_ptrs, mask_x, 0.0)
             idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
-            conv_states_ptrs_target = conv_states_base[None, :] + (
-                idx_tokens_conv * stride_conv_state_tok)[:, None]
 
-            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats
-                                                             < dim)[None, :]
+            # Compute the offset where the last block should be written in the conv_states
+            conv_states_output_coord = tl.load(
+                conv_state_indices_ptr
+                + idx_seq * stride_cache_indices
+                + current_last_index
+            ).to(tl.int64)
+
+            conv_states_ptrs_target = (
+                conv_states_ptr
+                + (conv_states_output_coord * stride_conv_state_seq)  # Offset from seq
+                + (idx_feats * stride_conv_state_dim)
+            )[None, :] + (  # [BLOCK_N,]
+                idx_tokens_conv * stride_conv_state_tok
+            )[:, None]
+
+            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[None, :]
             tl.debug_barrier()  #  NOTE: use this due to bug in Triton compiler
-            tl.store(conv_states_ptrs_target, new_conv_state, mask)
+            tl.store(conv_states_ptrs_target, loaded_x, mask)
 
         else:
             if load_init_state:
@@ -191,39 +240,43 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
                 idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
 
                 conv_states_ptrs_source = (
-                    conv_states_ptr +
-                    (conv_state_batch_coord * stride_conv_state_seq) +
-                    (idx_feats * stride_conv_state_dim)[None, :] +
-                    ((idx_tokens_conv + seqlen) * stride_conv_state_tok)[:,
-                                                                         None]
+                    conv_states_ptr
+                    + (conv_states_input_coord * stride_conv_state_seq)
+                    + (idx_feats * stride_conv_state_dim)[None, :]
+                    + ((idx_tokens_conv + seqlen) * stride_conv_state_tok)[:, None]
                 )  # [BLOCK_M, BLOCK_N]
-                mask = ((conv_state_batch_coord < num_cache_lines)
-                        & ((idx_tokens_conv + seqlen) < state_len)[:, None]
-                        & (idx_feats < dim)[None, :])
+                mask = (
+                    (conv_states_input_coord < num_cache_lines)
+                    & ((idx_tokens_conv + seqlen) < state_len)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )
                 conv_state = tl.load(conv_states_ptrs_source, mask, other=0.0)
 
                 VAL = state_len - seqlen
 
-                x_ptrs = x_base[None, :] + (
-                    (idx_tokens_conv - VAL) *
-                    stride_x_token)[:, None]  # [BLOCK_M, BLOCK_N]
+                x_ptrs = (
+                    x_base[None, :]
+                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
 
-                mask_x = ((idx_tokens_conv - VAL >= 0)[:, None] &
-                          (idx_tokens_conv - VAL < seqlen)[:, None] &
-                          (idx_feats < dim)[None, :]
-                          )  # token-index  # token-index  # feature-index
+                mask_x = (
+                    (idx_tokens_conv - VAL >= 0)[:, None]
+                    & (idx_tokens_conv - VAL < seqlen)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )  # token-index  # token-index  # feature-index
                 loaded_x = tl.load(x_ptrs, mask_x, 0.0)
 
-                tl.debug_barrier(
-                )  # need this due to the bug in tl.where not enforcing this when data is the result of another tl.load
+                tl.debug_barrier()  # need this due to the bug in tl.where not enforcing this when data is the result of another tl.load
                 new_conv_state = tl.where(
                     mask, conv_state, loaded_x
                 )  # BUG in 'tl.where'  which requires a barrier before this
-                conv_states_ptrs_target = conv_states_base + (
-                    idx_tokens_conv *
-                    stride_conv_state_tok)[:, None]  # [BLOCK_M, BLOCK_N]
-                mask = (idx_tokens_conv
-                        < state_len)[:, None] & (idx_feats < dim)[None, :]
+                conv_states_ptrs_target = (
+                    conv_states_base
+                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
+                    None, :
+                ]
                 tl.store(conv_states_ptrs_target, new_conv_state, mask)
             else:  # load_init_state == False
                 # update conv_state by shifting left, BUT
@@ -232,21 +285,25 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
 
                 VAL = state_len - seqlen
 
-                x_ptrs = x_base[None, :] + (
-                    (idx_tokens_conv - VAL) *
-                    stride_x_token)[:, None]  # [BLOCK_M, BLOCK_N]
+                x_ptrs = (
+                    x_base[None, :]
+                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
 
-                mask_x = ((idx_tokens_conv - VAL >= 0)[:, None] &
-                          (idx_tokens_conv - VAL < seqlen)[:, None] &
-                          (idx_feats < dim)[None, :]
-                          )  # token-index  # token-index  # feature-index
+                mask_x = (
+                    (idx_tokens_conv - VAL >= 0)[:, None]
+                    & (idx_tokens_conv - VAL < seqlen)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )  # token-index  # token-index  # feature-index
                 new_conv_state = tl.load(x_ptrs, mask_x, 0.0)
 
-                conv_states_ptrs_target = conv_states_base + (
-                    idx_tokens_conv *
-                    stride_conv_state_tok)[:, None]  # [BLOCK_M, BLOCK_N]
-                mask = (idx_tokens_conv
-                        < state_len)[:, None] & (idx_feats < dim)[None, :]
+                conv_states_ptrs_target = (
+                    conv_states_base
+                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
+                    None, :
+                ]
                 tl.store(conv_states_ptrs_target, new_conv_state, mask)
 
     else:  # chunk_offset > 0
@@ -256,37 +313,84 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
         mask_w = idx_feats < dim
         if KERNEL_WIDTH == 2:
             conv_states_ptrs = prior_tokens  # [BLOCK_N]
-            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
         if KERNEL_WIDTH == 3:
             conv_states_ptrs = prior_tokens  # [BLOCK_N]
-            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
             conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
-            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
         if KERNEL_WIDTH == 4:
             conv_states_ptrs = prior_tokens  # [BLOCK_N]
-            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
             conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
-            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
             conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
-            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
         if KERNEL_WIDTH == 5:
             # ruff: noqa: F841
             conv_states_ptrs = prior_tokens  # [BLOCK_N]
-            col3 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col3 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
             conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
-            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
             conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
-            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
             conv_states_ptrs = prior_tokens - 3 * stride_x_token  # [BLOCK_N]
-            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier='.ca')
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+
+        # Store intermediate states aligned with stride_block_m
+        # The additional states are cached starting from the last stride_block_m.
+        # For example:
+        # If n_block_to_fill = 0, then only the state at the sequence end is cached and the process below is not involved.
+        # If n_block_to_fill > 0, then the states at the sequence end and at the n_block_to_fill-last
+        # stride_block_m are cached.
+        # For example chunk_offset = n_block_to_fill stores the state at last_full_block
+        if (chunk_offset - 1) < n_block_to_fill:
+            # Store the states at the chunk boundaries from the start of the sequence
+            idx_tokens_last = (
+                last_full_block_token_index
+                - (n_block_to_fill - chunk_offset) * B_size
+                - state_len
+            ) + tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+            x_ptrs = (
+                x_ptr
+                + (idx_tokens_last * stride_x_token)[:, None]
+                + (idx_feats * stride_x_dim)[None, :]
+            )  # [BLOCK_M,BLOCK_N,]
+
+            mask_x = (idx_tokens_last >= 0)[:, None] & (idx_feats < dim)[
+                None, :
+            ]  # token-index  # token-index  # feature-index
+            loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+            idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+            # cache_idx
+            conv_states_output_coord = tl.load(
+                conv_state_indices_ptr
+                + idx_seq * stride_cache_indices
+                + current_first_index
+                + (chunk_offset - 1)
+            ).to(tl.int64)
+
+            conv_states_ptrs_target = (
+                conv_states_ptr
+                + (conv_states_output_coord * stride_conv_state_seq)  # Offset from seq
+                + (idx_feats * stride_conv_state_dim)
+            )[None, :] + (  # [BLOCK_N,]
+                idx_tokens_conv * stride_conv_state_tok
+            )[:, None]
+
+            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[None, :]
+            tl.debug_barrier()  #  NOTE: use this due to bug in Triton compiler
+            tl.store(conv_states_ptrs_target, loaded_x, mask)
 
     if HAS_BIAS:
         bias = bias_ptr + idx_feats
         mask_bias = idx_feats < dim
-        acc_preload = tl.load(bias, mask=mask_bias,
-                              other=0.0).to(tl.float32)  # [BLOCK_N]
+        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
+            tl.float32
+        )  # [BLOCK_N]
     else:
-        acc_preload = tl.zeros((BLOCK_N, ), dtype=tl.float32)
+        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)
 
     x_base_1d = x_base + token_offset * stride_x_token  # starting of chunk
 
@@ -310,7 +414,6 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
         matrix_w = w_col0
         matrix_x = col0
         for j in tl.static_range(KERNEL_WIDTH):
-
             if KERNEL_WIDTH == 2:
                 if j == 1:  # KERNEL_WIDTH-1:
                     matrix_w = w_col1
@@ -351,9 +454,13 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
         if SILU_ACTIVATION:
             acc = acc / (1 + tl.exp(-acc))
         mask_1d = (idx_token < segment_len) & (
-            idx_feats < dim)  # token-index  # feature-index
-        o_ptrs = o_ptr + (sequence_start_index + token_offset + idx_token
-                          ) * stride_o_token + (idx_feats * stride_o_dim)
+            idx_feats < dim
+        )  # token-index  # feature-index
+        o_ptrs = (
+            o_ptr
+            + (sequence_start_index + token_offset + idx_token) * stride_o_token
+            + (idx_feats * stride_o_dim)
+        )
 
         tl.store(o_ptrs, acc, mask=mask_1d)
 
@@ -361,13 +468,18 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
 def causal_conv1d_fn(
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Union[torch.Tensor, None],
+    bias: torch.Tensor | None,
     conv_states: torch.Tensor,
     query_start_loc: torch.Tensor,
-    cache_indices: Optional[torch.Tensor] = None,
-    has_initial_state: Optional[torch.Tensor] = None,
-    activation: Optional[str] = "silu",
+    cache_indices: torch.Tensor | None = None,
+    has_initial_state: torch.Tensor | None = None,
+    activation: str | None = "silu",
     pad_slot_id: int = PAD_SLOT_ID,
+    block_idx_first_scheduled_token: torch.Tensor | None = None,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
+    num_computed_tokens: torch.Tensor | None = None,
+    block_size_to_align=0,
     metadata=None,
     validate_data=False,
 ):
@@ -378,7 +490,7 @@ def causal_conv1d_fn(
         sequences are concatenated from left to right for varlen
     weight: (dim, width)
     conv_states: (...,dim,width - 1) itype
-        updated inplace if provided
+        updated inplace if cache_indices are not provided
         [it use `cache_indices` to get the index to the cache of conv_state for that sequence
 
         conv_state[cache_indices[i]] for seq-i - to be used as initial_state when has_initial_state[i] = True
@@ -410,7 +522,16 @@ def causal_conv1d_fn(
         for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
         in this case, the kernel will not process entries at
         indices 0 and 3
-
+    block_idx_first_scheduled_token: (batch,), dtype int32
+        The pointer into cache_indices, where the first cache block to be filled is located.
+    block_idx_last_scheduled_token: (batch,), dtype int32
+        The pointer into cache_indices, where the last cache block to be filled is located.
+    initial_state_idx: (batch,), dtype int32
+        The pointer into cache_indices, where the cache block containing the initial state is located.
+    num_computed_tokens: (batch,), dtype int32
+        The number of tokens already completed for each sequence
+    block_size_to_align: int
+        The block size to align the cached states to
     out: same shape as `x`
     """
     if isinstance(activation, bool) and activation:
@@ -427,21 +548,15 @@ def causal_conv1d_fn(
         batch_ptr = metadata.batch_ptr
         token_chunk_offset_ptr = metadata.token_chunk_offset_ptr
     else:
-        seqlens = query_start_loc.diff().to('cpu')
+        seqlens = query_start_loc.diff().to("cpu")
         args = seqlens
         MAX_NUM_PROGRAMS = 1024
 
         batch_ptr = torch.full(
-            (MAX_NUM_PROGRAMS, ),
-            PAD_SLOT_ID,
-            dtype=torch.int32,
-            device=x.device
+            (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=x.device
         )  # tracking which seq-idx the Triton program is handling
         token_chunk_offset_ptr = torch.full(
-            (MAX_NUM_PROGRAMS, ),
-            PAD_SLOT_ID,
-            dtype=torch.int32,
-            device=x.device
+            (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=x.device
         )  # tracking BLOCK_M-based index in the sequence the Triton program is handling
 
     is_channel_last = (x.stride(0) == 1) & (x.stride(1) > 1)
@@ -451,7 +566,6 @@ def causal_conv1d_fn(
     np2_statelen = triton.next_power_of_2(state_len)
 
     padded_batch = query_start_loc.size(0) - 1
-    stride_x_seq = 0
     stride_x_dim = x.stride(0)
     stride_x_token = x.stride(1)
     stride_w_dim = weight.stride(0)
@@ -460,6 +574,7 @@ def causal_conv1d_fn(
     stride_istate_dim = 0
     stride_istate_token = 0
     num_cache_lines = 0
+    BLOCK_M = 8
     if conv_states is not None:
         # extensions to support vLLM:
         # 1. conv_states is used to replaced initial_states
@@ -467,23 +582,22 @@ def causal_conv1d_fn(
         # 3. mapping from sequence x[idx] to a cache line at index as specified via cache_indices[idx]
         # 4. computation can be skipped if cache_indices[idx] == pad_slot_id
         num_cache_lines = conv_states.size(0)
-        assert (num_cache_lines == conv_states.shape[0]
-                and dim == conv_states.shape[1]
-                and width - 1 <= conv_states.shape[2])
+        assert (
+            num_cache_lines == conv_states.shape[0]
+            and dim == conv_states.shape[1]
+            and width - 1 <= conv_states.shape[2]
+        )
         stride_istate_seq = conv_states.stride(0)
         stride_istate_dim = conv_states.stride(1)
         stride_istate_token = conv_states.stride(2)
         assert stride_istate_dim == 1
     if out.dim() == 2:
-        stride_o_seq = 0
         stride_o_dim = out.stride(0)
         stride_o_token = out.stride(1)
     else:
-        stride_o_seq = out.stride(0)
         stride_o_dim = out.stride(1)
         stride_o_token = out.stride(2)
-    stride_cache_indices = cache_indices.stride(
-        0) if cache_indices is not None else 0
+    stride_cache_indices = cache_indices.stride(0) if cache_indices is not None else 0
 
     if validate_data:
         assert x.dim() == 2
@@ -497,11 +611,19 @@ def causal_conv1d_fn(
             assert cache_indices.dim() == 1
             assert padded_batch == cache_indices.size(0)
         if has_initial_state is not None:
-            assert has_initial_state.size() == (padded_batch, )
-            assert conv_states is not None, "ERROR: `has_initial_state` is used, which needs also `conv_states`"
+            assert has_initial_state.size() == (padded_batch,)
+            assert conv_states is not None, (
+                "ERROR: `has_initial_state` is used, which needs also `conv_states`"
+            )
         assert weight.stride(1) == 1
         assert (dim, width) == weight.shape
         assert is_channel_last, "Need to run in channel-last layout"
+        if block_size_to_align is not None and block_size_to_align > 0:
+            assert (block_size_to_align % BLOCK_M) == 0, (
+                "The mamba block size needs to be divisible by the BLOCK_M"
+            )
+        else:
+            block_size_to_align = BLOCK_M
 
     if metadata is None:
 
@@ -523,44 +645,45 @@ def causal_conv1d_fn(
             if META["batch_ptr"].nelement() < len(mlist):
                 newlen = len(mlist) + 1
                 META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
-                META["token_chunk_offset_ptr"].resize_(newlen).fill_(
-                    PAD_SLOT_ID)
+                META["token_chunk_offset_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
 
             if META["batch_ptr"].nelement() >= len(mlist):
-                META["batch_ptr"][0:len(mlist)].copy_(
-                    torch.from_numpy(np.array(mlist)))
-                META["token_chunk_offset_ptr"][0:len(mlist)].copy_(
-                    torch.from_numpy(np.array(offsetlist)))
+                META["batch_ptr"][0 : len(mlist)].copy_(
+                    torch.from_numpy(np.array(mlist))
+                )
+                META["token_chunk_offset_ptr"][0 : len(mlist)].copy_(
+                    torch.from_numpy(np.array(offsetlist))
+                )
 
             META["batch_ptr"] = META["batch_ptr"].to(META["x_ptr"].device)
             META["token_chunk_offset_ptr"] = META["token_chunk_offset_ptr"].to(
-                META["x_ptr"].device)
+                META["x_ptr"].device
+            )
             return tot
     else:
 
         def num_program(META, nums_dict):
-            tot = nums_dict[META["BLOCK_M"]]['tot']
+            tot = nums_dict[META["BLOCK_M"]]["tot"]
 
-            mlist = nums_dict[META["BLOCK_M"]]['mlist']
-            mlist_len = nums_dict[META["BLOCK_M"]]['mlist_len']
+            mlist = nums_dict[META["BLOCK_M"]]["mlist"]
+            mlist_len = nums_dict[META["BLOCK_M"]]["mlist_len"]
 
-            offsetlist = nums_dict[META["BLOCK_M"]]['offsetlist']
+            offsetlist = nums_dict[META["BLOCK_M"]]["offsetlist"]
 
             if nums_dict[META["BLOCK_M"]]["batch_ptr"] is not None:
                 META["batch_ptr"] = nums_dict[META["BLOCK_M"]]["batch_ptr"]
-                META["token_chunk_offset_ptr"] = nums_dict[
-                    META["BLOCK_M"]]["token_chunk_offset_ptr"]
+                META["token_chunk_offset_ptr"] = nums_dict[META["BLOCK_M"]][
+                    "token_chunk_offset_ptr"
+                ]
             else:
                 if META["batch_ptr"].nelement() < mlist_len:
                     newlen = mlist_len + 1
                     META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
-                    META["token_chunk_offset_ptr"].resize_(newlen).fill_(
-                        PAD_SLOT_ID)
+                    META["token_chunk_offset_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
 
                 if META["batch_ptr"].nelement() >= mlist_len:
                     META["batch_ptr"][0:mlist_len].copy_(mlist)
-                    META["token_chunk_offset_ptr"][0:mlist_len].copy_(
-                        offsetlist)
+                    META["token_chunk_offset_ptr"][0:mlist_len].copy_(offsetlist)
             return tot
 
     def grid(META):
@@ -584,14 +707,16 @@ def causal_conv1d_fn(
         query_start_loc,
         batch_ptr,
         token_chunk_offset_ptr,
+        block_idx_first_scheduled_token,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
+        num_computed_tokens,
         out,
         # Matrix dimensions
-        padded_batch,
         dim,
         cu_seqlen,
         num_cache_lines,
         # stride
-        stride_x_seq,
         stride_x_dim,
         stride_x_token,
         stride_w_dim,
@@ -600,22 +725,20 @@ def causal_conv1d_fn(
         stride_istate_dim,
         stride_istate_token,
         stride_cache_indices,
-        stride_o_seq,
         stride_o_dim,
         stride_o_token,
+        block_size_to_align // BLOCK_M,
         # others
         pad_slot_id,
         # META
         HAS_BIAS=bias is not None,
         KERNEL_WIDTH=width,
         SILU_ACTIVATION=activation in ["silu", "swish"],
-        HAS_INITIAL_STATES=has_initial_state is not None,
-        HAS_CACHE=conv_states is not None,
-        IS_CONTINUOUS_BATCHING=cache_indices is not None,
+        IS_APC_ENABLED=block_idx_last_scheduled_token is not None,
         USE_PAD_SLOT=pad_slot_id is not None,
         NP2_STATELEN=np2_statelen,
-        #launch_cooperative_grid=True
-        BLOCK_M=8,
+        # launch_cooperative_grid=True
+        BLOCK_M=BLOCK_M,
         BLOCK_N=256,
         num_stages=2,
     )
@@ -629,10 +752,11 @@ def _causal_conv1d_update_kernel(
     w_ptr,  # (dim, width)
     bias_ptr,
     conv_state_ptr,
-    cache_seqlens_ptr,  # circular buffer
     conv_state_indices_ptr,
     num_accepted_tokens_ptr,
     query_start_loc_ptr,  # (batch + 1)
+    block_idx_last_scheduled_token,  # (batch,)
+    initial_state_idx,  # (batch,)
     o_ptr,  # (batch, dim, seqlen)
     # Matrix dimensions
     batch: int,
@@ -660,7 +784,7 @@ def _causal_conv1d_update_kernel(
     KERNEL_WIDTH: tl.constexpr,
     SILU_ACTIVATION: tl.constexpr,
     IS_VARLEN: tl.constexpr,
-    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_APC_ENABLED: tl.constexpr,
     IS_SPEC_DECODING: tl.constexpr,
     NP2_STATELEN: tl.constexpr,
     USE_PAD_SLOT: tl.constexpr,
@@ -674,25 +798,29 @@ def _causal_conv1d_update_kernel(
     # [BLOCK_N,] elements along the feature-dimension (channel)
     idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 
-    if IS_CONTINUOUS_BATCHING:
-        # mask = idx_seq < batch
-        conv_state_batch_coord = tl.load(conv_state_indices_ptr +
-                                         idx_seq * stride_state_indices).to(
-                                             tl.int64)
+    if IS_APC_ENABLED:
+        # Get the state from the initial_state_idx
+        conv_state_init = tl.load(initial_state_idx + idx_seq)
+        current_last_index = tl.load(block_idx_last_scheduled_token + idx_seq)
     else:
-        conv_state_batch_coord = idx_seq
+        conv_state_init = 0
+        current_last_index = 0
+
+    # cache_idx
+    conv_states_input_coord = tl.load(
+        conv_state_indices_ptr + idx_seq * stride_state_indices + conv_state_init
+    ).to(tl.int64)
+
     if USE_PAD_SLOT:  # noqa
-        if conv_state_batch_coord == pad_slot_id:
+        if conv_states_input_coord == pad_slot_id:
             # not processing as this is not the actual sequence
             return
 
     if IS_VARLEN:
         query_start_index = tl.load(query_start_loc_ptr + idx_seq).to(tl.int64)
-        query_end_index = tl.load(query_start_loc_ptr + (idx_seq + 1)).to(
-            tl.int64)
+        query_end_index = tl.load(query_start_loc_ptr + (idx_seq + 1)).to(tl.int64)
         # revise state_len and seqlen
-        state_len = state_len - (seqlen -
-                                 (query_end_index - query_start_index))
+        state_len = state_len - (seqlen - (query_end_index - query_start_index))
         seqlen = query_end_index - query_start_index
         x_offset = query_start_index * stride_x_token
         o_offset = query_start_index * stride_o_token
@@ -720,14 +848,17 @@ def _causal_conv1d_update_kernel(
         # - accept 2 tokens: [history3, ..., historyM, draft1, draft2]
         # - and so on.
         conv_state_token_offset = (
-            tl.load(num_accepted_tokens_ptr + idx_seq).to(tl.int64) - 1)
+            tl.load(num_accepted_tokens_ptr + idx_seq).to(tl.int64) - 1
+        )
     else:
         conv_state_token_offset = 0
 
     # STEP 1: READ init_state data
-    conv_states_base = (conv_state_ptr +
-                        (conv_state_batch_coord * stride_conv_state_seq) +
-                        (idx_feats * stride_conv_state_dim))
+    conv_states_base = (
+        conv_state_ptr
+        + (conv_states_input_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )
     mask_w = idx_feats < dim
 
     prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok
@@ -754,35 +885,50 @@ def _causal_conv1d_update_kernel(
     # window manner, at each forward pass, the tokens are shift by 1, so we
     # load since idx_tokens + 1.
     conv_state_ptrs_source = (
-        conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) +
-        conv_state_token_offset * stride_conv_state_tok +
-        (idx_feats * stride_conv_state_dim)[None, :] +
-        ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) *
-         stride_conv_state_tok)[:, None])  # [BLOCK_M, BLOCK_N]
-    mask = ((conv_state_batch_coord < num_cache_lines)
-            & ((idx_tokens + seqlen) < state_len)[:, None]
-            & (idx_feats < dim)[None, :])
+        conv_state_ptr
+        + (conv_states_input_coord * stride_conv_state_seq)
+        + conv_state_token_offset * stride_conv_state_tok
+        + (idx_feats * stride_conv_state_dim)[None, :]
+        + ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) * stride_conv_state_tok)[
+            :, None
+        ]
+    )  # [BLOCK_M, BLOCK_N]
+    mask = (
+        (conv_states_input_coord < num_cache_lines)
+        & ((idx_tokens + seqlen) < state_len)[:, None]
+        & (idx_feats < dim)[None, :]
+    )
     conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0)
 
     VAL = state_len - seqlen
     x_base = x_ptr + x_offset + (idx_feats * stride_x_dim)  # [BLOCK_N]
 
-    x_ptrs = x_base[None, :] + (
-        (idx_tokens - VAL) * stride_x_token)[:, None]  # [BLOCK_M, BLOCK_N]
+    x_ptrs = (
+        x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
 
-    mask_x = ((idx_tokens - VAL >= 0)[:, None] &
-              (idx_tokens - VAL < seqlen)[:, None] & (idx_feats < dim)[None, :]
-              )  # token-index  # token-index  # feature-index
+    mask_x = (
+        (idx_tokens - VAL >= 0)[:, None]
+        & (idx_tokens - VAL < seqlen)[:, None]
+        & (idx_feats < dim)[None, :]
+    )  # token-index  # token-index  # feature-index
     loaded_x = tl.load(x_ptrs, mask_x, 0.0)
     tl.debug_barrier()
 
     new_conv_state = tl.where(mask, conv_state, loaded_x)
 
-    conv_state_base = (conv_state_ptr +
-                       (conv_state_batch_coord * stride_conv_state_seq) +
-                       (idx_feats * stride_conv_state_dim))  # [BLOCK_N,]
-    conv_state_ptrs_target = conv_state_base + (
-        idx_tokens * stride_conv_state_tok)[:, None]  # [BLOCK_M, BLOCK_N]
+    # Get the state from the initial_state_idx
+    # cache_idx
+    conv_states_offset = tl.load(
+        conv_state_indices_ptr + idx_seq * stride_state_indices + current_last_index
+    ).to(tl.int64)
+    conv_state_ptrs_target = (
+        conv_state_ptr
+        + (conv_states_offset * stride_conv_state_seq)  # Offset from seq
+        + (idx_feats * stride_conv_state_dim)
+    )[None, :] + (  # [BLOCK_N,]
+        idx_tokens * stride_conv_state_tok
+    )[:, None]
     mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :]
     tl.store(conv_state_ptrs_target, new_conv_state, mask)
 
@@ -790,10 +936,11 @@ def _causal_conv1d_update_kernel(
     if HAS_BIAS:
         bias = bias_ptr + idx_feats
         mask_bias = idx_feats < dim
-        acc_preload = tl.load(bias, mask=mask_bias,
-                              other=0.0).to(tl.float32)  # [BLOCK_N]
+        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
+            tl.float32
+        )  # [BLOCK_N]
     else:
-        acc_preload = tl.zeros((BLOCK_N, ), dtype=tl.float32)
+        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)
 
     # STEP 4:
     # PRE-LOAD WEIGHTS
@@ -909,10 +1056,12 @@ def _causal_conv1d_update_kernel(
 
         if SILU_ACTIVATION:
             acc = acc / (1 + tl.exp(-acc))
-        mask_1d = (idx_token < seqlen) & (idx_feats < dim
-                                          )  # token-index  # feature-index
-        o_ptrs = o_ptr + o_offset + idx_token * stride_o_token + (idx_feats *
-                                                                  stride_o_dim)
+        mask_1d = (idx_token < seqlen) & (
+            idx_feats < dim
+        )  # token-index  # feature-index
+        o_ptrs = (
+            o_ptr + o_offset + idx_token * stride_o_token + (idx_feats * stride_o_dim)
+        )
 
         tl.store(o_ptrs, acc, mask=mask_1d)
 
@@ -921,14 +1070,15 @@ def causal_conv1d_update(
     x: torch.Tensor,
     conv_state: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    activation: Union[bool, str, None] = None,
-    cache_seqlens: Optional[torch.Tensor] = None,
-    conv_state_indices: Optional[torch.Tensor] = None,
-    num_accepted_tokens: Optional[torch.Tensor] = None,
-    query_start_loc: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
+    activation: bool | str | None = None,
+    conv_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
+    query_start_loc: torch.Tensor | None = None,
     max_query_len: int = -1,
     pad_slot_id: int = PAD_SLOT_ID,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
     validate_data=False,
 ):
     """
@@ -942,15 +1092,14 @@ def causal_conv1d_update(
     conv_state: (..., dim, state_len), where state_len >= width - 1
     weight: (dim, width)
     bias: (dim,)
-    cache_seqlens: (batch,), dtype int32.
-        If not None, the conv_state is treated as a circular buffer.
-        The conv_state will be updated by copying x to the conv_state
-        starting at the index
-        @cache_seqlens % state_len.
     conv_state_indices: (batch,), dtype int32
         If not None, the conv_state is a larger tensor along the batch dim,
         and we are selecting the batch coords specified by conv_state_indices.
         Useful for a continuous batching scenario.
+    block_idx_last_scheduled_token: (batch,), dtype int32
+        The pointer into conv_state_indices, where the last cache block to be filled is located.
+    initial_state_idx: (batch,), dtype int32
+        The pointer into conv_state_indices, where the cache block containing the initial state is located.
     num_accepted_tokens: (batch,), dtype int32
         If not None, it indicates the number of accepted tokens for each
         sequence in the batch.
@@ -963,15 +1112,14 @@ def causal_conv1d_update(
         If query_start_loc is not None, this indicates the maximum query
         length in the batch.
     pad_slot_id: int
-            if cache_indices is passed, lets the kernel identify padded
+            if conv_state_indices is passed, lets the kernel identify padded
             entries that will not be processed,
-            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            for example: conv_state_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
             in this case, the kernel will not process entries at
             indices 0 and 3
     out: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim), same shape as `x`
     """
     if validate_data:
-        assert cache_seqlens is None  # not implemented yet - ok for vLLM
         assert pad_slot_id is not None
         assert x.stride(1) == 1
     if isinstance(activation, bool):
@@ -998,20 +1146,19 @@ def causal_conv1d_update(
 
     if validate_data:
         assert dim == weight.size(0)
-        assert conv_state.stride(
-            -2
-        ) == 1, f"ERROR: expect contiguous along feat-dim of conv_state (currently stride={conv_state.stride()})"
+        assert conv_state.stride(-2) == 1, (
+            f"ERROR: expect contiguous along feat-dim of conv_state (currently stride={conv_state.stride()})"
+        )
         assert state_len >= width - 1
         # when above happens, we don't shift-left to keep any records in conv_state
         assert dim == conv_state.size(1)
         if conv_state_indices is None:
             assert conv_state.size(0) >= batch
         else:
-            assert (batch, ) == conv_state_indices.shape
+            assert (batch,) == conv_state_indices.shape
 
         assert num_cache_lines >= batch
         assert weight.stride(1) == 1  # Need this
-        assert cache_seqlens is None  # not needed for vLLM - circular buffer
 
     # adopt the strategy in vLLM that overwrite on 'x' directly, rather than creating a new tensor 'o'
     out = x
@@ -1028,10 +1175,10 @@ def causal_conv1d_update(
         stride_o_token, stride_o_dim = out.stride()
         stride_o_seq = 0
 
-    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride(
+    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride()
+    stride_state_indices = (
+        conv_state_indices.stride(0) if conv_state_indices is not None else 0
     )
-    stride_state_indices = conv_state_indices.stride(
-        0) if conv_state_indices is not None else 0
     if num_accepted_tokens is not None:
         state_len = width - 1 + (seqlen - 1)  # effective state_len needed
     else:
@@ -1050,10 +1197,11 @@ def causal_conv1d_update(
         weight,
         bias,
         conv_state,
-        cache_seqlens,
         conv_state_indices,
         num_accepted_tokens,
         query_start_loc,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
         out,
         # Matrix dimensions
         batch,
@@ -1081,7 +1229,7 @@ def causal_conv1d_update(
         KERNEL_WIDTH=width,
         SILU_ACTIVATION=activation in ["silu", "swish"],
         IS_VARLEN=query_start_loc is not None,
-        IS_CONTINUOUS_BATCHING=conv_state_indices is not None,
+        IS_APC_ENABLED=block_idx_last_scheduled_token is not None,
         IS_SPEC_DECODING=num_accepted_tokens is not None,
         NP2_STATELEN=np2_statelen,
         USE_PAD_SLOT=pad_slot_id is not None,
diff --git a/vllm/model_executor/layers/mamba/ops/layernorm_gated.py b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
index f3a45ab097c34cc55293e654857105aa0ec83b31..b592906c6f130965490e6637354abe961656daf1 100644
--- a/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
+++ b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
@@ -46,17 +46,17 @@ def _layer_norm_fwd_1pass_kernel(
         B += group * N
     # Compute mean and variance
     cols = tl.arange(0, BLOCK_N)
-    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
+    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
     if HAS_Z and not NORM_BEFORE_GATE:
         z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
         x *= z * tl.sigmoid(z)
     if not IS_RMS_NORM:
         mean = tl.sum(x, axis=0) / N
         tl.store(Mean + row, mean)
-        xbar = tl.where(cols < N, x - mean, 0.)
+        xbar = tl.where(cols < N, x - mean, 0.0)
         var = tl.sum(xbar * xbar, axis=0) / N
     else:
-        xbar = tl.where(cols < N, x, 0.)
+        xbar = tl.where(cols < N, x, 0.0)
         var = tl.sum(xbar * xbar, axis=0) / N
     rstd = 1 / tl.sqrt(var + eps)
     tl.store(Rstd + row, rstd)
@@ -74,15 +74,17 @@ def _layer_norm_fwd_1pass_kernel(
     tl.store(Y + cols, y, mask=mask)
 
 
-def _layer_norm_fwd(x,
-                    weight,
-                    bias,
-                    eps,
-                    z=None,
-                    out=None,
-                    group_size=None,
-                    norm_before_gate=True,
-                    is_rms_norm=False):
+def _layer_norm_fwd(
+    x,
+    weight,
+    bias,
+    eps,
+    z=None,
+    out=None,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
     M, N = x.shape
     if group_size is None:
         group_size = N
@@ -92,57 +94,57 @@ def _layer_norm_fwd(x,
     if z is not None:
         assert z.stride(-1) == 1
         assert z.shape == (M, N)
-    assert weight.shape == (N, )
+    assert weight.shape == (N,)
     assert weight.stride(-1) == 1
     if bias is not None:
         assert bias.stride(-1) == 1
-        assert bias.shape == (N, )
+        assert bias.shape == (N,)
     # allocate output
     if out is not None:
         assert out.shape == x.shape
     else:
         out = torch.empty_like(x)
     assert out.stride(-1) == 1
-    mean = torch.empty((ngroups * M, ), dtype=torch.float32,
-                       device=x.device) if not is_rms_norm else None
-    rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+    mean = (
+        torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
     # Less than 64KB per feature: enqueue fused kernel
     MAX_FUSED_SIZE = 65536 // x.element_size()
     BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
     if group_size > BLOCK_N:
-        raise RuntimeError(
-            "This layer norm doesn't support feature dim >= 64KB.")
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     # heuristics for number of warps
     num_warps = min(max(BLOCK_N // 256, 1), 8)
     grid = (M, ngroups)
     with torch.cuda.device(x.device.index):
-        _layer_norm_fwd_1pass_kernel[grid](x,
-                                           out,
-                                           weight,
-                                           bias,
-                                           z,
-                                           mean,
-                                           rstd,
-                                           x.stride(0),
-                                           out.stride(0),
-                                           z.stride(0) if z is not None else 0,
-                                           M,
-                                           group_size,
-                                           eps,
-                                           BLOCK_N=BLOCK_N,
-                                           NORM_BEFORE_GATE=norm_before_gate,
-                                           IS_RMS_NORM=is_rms_norm,
-                                           num_warps=num_warps)
+        _layer_norm_fwd_1pass_kernel[grid](
+            x,
+            out,
+            weight,
+            bias,
+            z,
+            mean,
+            rstd,
+            x.stride(0),
+            out.stride(0),
+            z.stride(0) if z is not None else 0,
+            M,
+            group_size,
+            eps,
+            BLOCK_N=BLOCK_N,
+            NORM_BEFORE_GATE=norm_before_gate,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=num_warps,
+        )
     return out, mean, rstd
 
 
-def rms_norm_gated(x,
-                   weight,
-                   bias,
-                   z=None,
-                   eps=1e-6,
-                   group_size=None,
-                   norm_before_gate=True):
+def rms_norm_gated(
+    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
+):
     x_shape_og = x.shape
     # reshape input data into 2D tensor
     x = x.reshape(-1, x.shape[-1])
@@ -156,13 +158,15 @@ def rms_norm_gated(x,
     weight = weight.contiguous()
     if bias is not None:
         bias = bias.contiguous()
-    y, _, _ = _layer_norm_fwd(x,
-                              weight,
-                              bias,
-                              eps,
-                              z=z,
-                              group_size=group_size,
-                              norm_before_gate=norm_before_gate,
-                              is_rms_norm=True)
+    y, _, _ = _layer_norm_fwd(
+        x,
+        weight,
+        bias,
+        eps,
+        z=z,
+        group_size=group_size,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=True,
+    )
 
     return y.reshape(x_shape_og)
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 838290a9f5fb2b5d51961c04dfd2b89625be4eee..53fd5d5458b0966da5667868fbff6c9853bf49d2 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -11,8 +11,7 @@ from vllm import _custom_ops as ops
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.triton_utils import HAS_TRITON, tl, triton
 
-TRITON3 = HAS_TRITON and (version.parse(triton.__version__)
-                          >= version.parse("3.0.0"))
+TRITON3 = HAS_TRITON and (version.parse(triton.__version__) >= version.parse("3.0.0"))
 
 if TRITON3:
 
@@ -28,16 +27,18 @@ else:
         return dt
 
 
-@triton.heuristics(
-    {"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
+@triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
 @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
 @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
-@triton.heuristics({
-    "HAS_STATE_BATCH_INDICES":
-    lambda args: args["state_batch_indices_ptr"] is not None
-})
 @triton.heuristics(
-    {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])})
+    {
+        "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
+        is not None
+    }
+)
+@triton.heuristics(
+    {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
+)
 @triton.jit
 def _selective_scan_update_kernel(
     # Pointers to matrices
@@ -52,6 +53,7 @@ def _selective_scan_update_kernel(
     z_ptr,
     out_ptr,
     state_batch_indices_ptr,
+    dst_state_batch_indices_ptr,
     pad_slot_id,
     # Matrix dimensions
     batch,
@@ -107,11 +109,18 @@ def _selective_scan_update_kernel(
     # is taken from the state_batch_indices_ptr Otherwise, the state coordinate
     # is the same as the batch id.
     if HAS_STATE_BATCH_INDICES:
+        dst_state_batch_indices_ptr += pid_b
+        dst_state_batch_idx = tl.load(dst_state_batch_indices_ptr).to(tl.int64)
+        dst_state_ptr = state_ptr + (
+            dst_state_batch_idx * stride_state_batch + pid_h * stride_state_head
+        )
         state_batch_indices_ptr += pid_b
         state_batch_idx = tl.load(state_batch_indices_ptr).to(tl.int64)
-        state_ptr += (state_batch_idx * stride_state_batch +
-                      pid_h * stride_state_head)
+        state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
     else:
+        dst_state_ptr = (
+            state_ptr + pid_b * stride_state_batch + pid_h * stride_state_head
+        )
         state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
 
     x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
@@ -119,26 +128,29 @@ def _selective_scan_update_kernel(
     if HAS_DT_BIAS:
         dt_bias_ptr += pid_h * stride_dt_bias_head
     A_ptr += pid_h * stride_A_head
-    B_ptr += pid_b * stride_B_batch + (pid_h //
-                                       nheads_ngroups_ratio) * stride_B_group
-    C_ptr += pid_b * stride_C_batch + (pid_h //
-                                       nheads_ngroups_ratio) * stride_C_group
+    B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
+    C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
     if HAS_Z:
         z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
     out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
 
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
-    state_ptrs = state_ptr + (offs_m[:, None] * stride_state_dim +
-                              offs_n[None, :] * stride_state_dstate)
+    state_ptrs = state_ptr + (
+        offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
+    )
+    dst_state_ptrs = dst_state_ptr + (
+        offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
+    )
     x_ptrs = x_ptr + offs_m * stride_x_dim
     dt_ptrs = dt_ptr + offs_m * stride_dt_dim
     if HAS_DT_BIAS:
         dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
     if HAS_D:
         D_ptr += pid_h * stride_D_head
-    A_ptrs = A_ptr + (offs_m[:, None] * stride_A_dim +
-                      offs_n[None, :] * stride_A_dstate)
+    A_ptrs = A_ptr + (
+        offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
+    )
     B_ptrs = B_ptr + offs_n * stride_B_dstate
     C_ptrs = C_ptr + offs_n * stride_C_dstate
     if HAS_D:
@@ -148,20 +160,19 @@ def _selective_scan_update_kernel(
     out_ptrs = out_ptr + offs_m * stride_out_dim
     mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
     if HAS_STATE_BATCH_INDICES:
-        mask &= (state_batch_idx != pad_slot_id)
+        mask &= state_batch_idx != pad_slot_id
     state = tl.load(state_ptrs, mask=mask, other=0.0)
 
     x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
     if not TIE_HDIM:
         dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
         if HAS_DT_BIAS:
-            dt += tl.load(dt_bias_ptrs, mask=offs_m < dim,
-                          other=0.0).to(tl.float32)
+            dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
         if DT_SOFTPLUS:
             dt = softplus(dt)
-        A = tl.load(A_ptrs,
-                    mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate),
-                    other=0.0).to(tl.float32)
+        A = tl.load(
+            A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
+        ).to(tl.float32)
         dA = tl.exp(A * dt[:, None])
     else:
         dt = tl.load(dt_ptr).to(tl.float32)
@@ -184,8 +195,8 @@ def _selective_scan_update_kernel(
 
     mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
     if HAS_STATE_BATCH_INDICES:
-        mask &= (state_batch_idx != pad_slot_id)
-    tl.store(state_ptrs, state, mask=mask)
+        mask &= state_batch_idx != pad_slot_id
+    tl.store(dst_state_ptrs, state, mask=mask)
     out = tl.sum(state * C[None, :], axis=1)
     if HAS_D:
         out += x * D
@@ -194,19 +205,22 @@ def _selective_scan_update_kernel(
     tl.store(out_ptrs, out, mask=offs_m < dim)
 
 
-def selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=None,
-                           z=None,
-                           dt_bias=None,
-                           dt_softplus=False,
-                           state_batch_indices=None,
-                           pad_slot_id=PAD_SLOT_ID,
-                           out=None):
+def selective_state_update(
+    state,
+    x,
+    dt,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    dt_bias=None,
+    dt_softplus=False,
+    state_batch_indices=None,
+    dst_state_batch_indices=None,
+    pad_slot_id=PAD_SLOT_ID,
+    out=None,
+):
     """
     Argument:
         state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
@@ -219,12 +233,12 @@ def selective_state_update(state,
         z: (batch, dim) or (batch, nheads, dim)
         dt_bias: (dim,) or (nheads, dim)
         pad_slot_id: int
-            if cache_indices is passed, lets the kernel identify padded 
-            entries that will not be processed, 
-            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id] 
-            in this case, the kernel will not process entries at 
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+            in this case, the kernel will not process entries at
             indices 0 and 3
-        out: Preallocated ssm output tensor. Assume same shape as x. 
+        out: Preallocated ssm output tensor. Assume same shape as x.
              In-place updated.
     """
     if state.dim() == 3:
@@ -265,20 +279,33 @@ def selective_state_update(state,
     if dt_bias is not None:
         assert dt_bias.shape == (nheads, dim)
     if state_batch_indices is not None:
-        assert state_batch_indices.shape == (batch, )
+        assert state_batch_indices.shape == (batch,)
+    if dst_state_batch_indices is not None:
+        assert dst_state_batch_indices.shape == (batch,)
+    else:
+        # revert to the default behavior of in-place state updates
+        dst_state_batch_indices = state_batch_indices
     assert out.shape == x.shape
 
-    grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE_M']), batch, nheads)
-    z_strides = ((z.stride(0), z.stride(1), z.stride(2)) if z is not None else
-                 (0, 0, 0))
+    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
+    z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
     # We don't want autotune since it will overwrite the state
     # We instead tune by hand.
-    BLOCK_SIZE_M, num_warps = ((32, 4) if dstate <= 16 else
-                               ((16, 4) if dstate <= 32 else
-                                ((8, 4) if dstate <= 64 else
-                                 ((4, 4) if dstate <= 128 else ((4, 8))))))
-    tie_hdim = A.stride(-1) == 0 and A.stride(-2) == 0 and dt.stride(
-        -1) == 0 and dt_bias.stride(-1) == 0
+    BLOCK_SIZE_M, num_warps = (
+        (32, 4)
+        if dstate <= 16
+        else (
+            (16, 4)
+            if dstate <= 32
+            else ((8, 4) if dstate <= 64 else ((4, 4) if dstate <= 128 else ((4, 8))))
+        )
+    )
+    tie_hdim = (
+        A.stride(-1) == 0
+        and A.stride(-2) == 0
+        and dt.stride(-1) == 0
+        and dt_bias.stride(-1) == 0
+    )
     with torch.cuda.device(x.device.index):
         _selective_scan_update_kernel[grid](
             state,
@@ -292,6 +319,7 @@ def selective_state_update(state,
             z,
             out,
             state_batch_indices,
+            dst_state_batch_indices,
             pad_slot_id,
             batch,
             nheads,
@@ -308,8 +336,7 @@ def selective_state_update(state,
             dt.stride(0),
             dt.stride(1),
             dt.stride(2),
-            *(dt_bias.stride(0),
-              dt_bias.stride(1)) if dt_bias is not None else 0,
+            *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
             A.stride(0),
             A.stride(1),
             A.stride(2),
@@ -333,54 +360,74 @@ def selective_state_update(state,
         )
 
 
-def selective_scan_fn(u,
-                      ssm_states,
-                      delta,
-                      A,
-                      B,
-                      C,
-                      D=None,
-                      z=None,
-                      delta_bias=None,
-                      delta_softplus=False,
-                      query_start_loc=None,
-                      cache_indices=None,
-                      has_initial_state=None,
-                      pad_slot_id=PAD_SLOT_ID) -> torch.Tensor:
+def selective_scan_fn(
+    u,
+    ssm_states,
+    delta,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    delta_bias=None,
+    delta_softplus=False,
+    query_start_loc=None,
+    cache_indices=None,
+    has_initial_state=None,
+    pad_slot_id=PAD_SLOT_ID,
+    block_size=1024,
+    block_idx_first_scheduled_token=None,
+    block_idx_last_scheduled_token=None,
+    initial_state_idx=None,
+) -> torch.Tensor:
     """
-    u: (dim, total_length) for varlen or (batch, dim, seqlen) 
+    u: (dim, total_length) for varlen or (batch, dim, seqlen)
         applies changes in place.
     ssm_states: (batch, dim, dstate) or (batch, nheads, dim, dstate)
         applies changes in place.
     delta: (dim, total_length) for varlen or (batch, dim, seqlen)
-    A: (dim, dstate) 
-    B: (ngroups, dstate, total_length) for varlen or 
+    A: (dim, dstate)
+    B: (ngroups, dstate, total_length) for varlen or
                                         (batch,ngroups,dstate,seqlen)
-    C: (ngroups, dstate, total_length) for varlen or 
+    C: (ngroups, dstate, total_length) for varlen or
                                         (batch,ngroups,dstate,seqlen)
-    D: (dim,) 
-    z: (dim, total_length) for varlen or (batch, dim, seqlen) 
+    D: (dim,)
+    z: (dim, total_length) for varlen or (batch, dim, seqlen)
     dt_bias: (dim,) or (dim)
     query_start_loc: (batch + 1) int32
         The cumulative sequence lengths of the sequences in
         the batch, used to index into sequence. prepended with 0.
-        for example: query_start_loc = torch.Tensor([0,10,16,17]), 
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
         x.shape=(dim,17)
     cache_indices: (batch) int32
-        A tensor with each cell is a correspondent 
-        input and output ssm_state index
+        A tensor with each cell is a correspondent
+        input and output ssm_state indices
+      - Without APC: (batch,) - single state index per batch item
+      - With APC: (batch, max_positions) - cache block indices for read/write
+        Each non-zero value indicates a cache block to load from and/or write to.
     has_initial_state: (batch) bool
-        A tensor populated with ones and zeros, 
-        indicate if the ssm_state at the corresponding index should be 
-        used as initial state. Not providing argument assumes 
+        A tensor populated with ones and zeros,
+        indicate if the ssm_state at the corresponding index should be
+        used as initial state. Not providing argument assumes
         there's no initial state
     pad_slot_id: int
-        if cache_indices is passed, lets the kernel identify padding entries 
-        that will not be processed, 
-        for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id] 
+        if cache_indices is passed, lets the kernel identify padding entries
+        that will not be processed,
+        for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
         in this case, the kernel will not process entries at indices 0 and 3
+    block_size: int
+        The block size to align the cached states to
+    block_idx_first_scheduled_token: (batch,), dtype int32
+        The pointer into cache_indices, where the first
+        cache block to be filled is located.
+    block_idx_last_scheduled_token: (batch,), dtype int32
+        The pointer into cache_indices, where the last cache block
+        to be filled is located.
+    initial_state_idx: (batch,), dtype int32
+        The pointer into cache_indices, where the cache block
+        containing the initial state is located.
     returns
-        output: (dim, total_length) for varlen or (batch, dim, seqlen) 
+        output: (dim, total_length) for varlen or (batch, dim, seqlen)
                 supports inplace replacement
     """
     if u.stride(-1) != 1:
@@ -404,9 +451,26 @@ def selective_scan_fn(u,
     if C.dim() == 2 and query_start_loc is not None:
         C = C.unsqueeze(0)
 
-    ops.selective_scan_fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus,
-                           query_start_loc, cache_indices, has_initial_state,
-                           ssm_states, pad_slot_id)
+    ops.selective_scan_fwd(
+        u,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias,
+        delta_softplus,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        ssm_states,
+        pad_slot_id,
+        block_size,
+        block_idx_first_scheduled_token,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
+    )
 
     if z is None:
         return delta  # output written inplace to delta
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
index 601b71ab2a51bcfea8802f1e0931e042ac0b07a8..ac5ffc10f29504e675eab5fc8539a20219cabf4b 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
@@ -6,8 +6,6 @@
 
 # ruff: noqa: E501,SIM102
 
-import math
-
 import torch
 
 from vllm.triton_utils import tl, triton
@@ -16,79 +14,52 @@ from vllm.triton_utils import tl, triton
 @triton.autotune(
     configs=[
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 64
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
             num_stages=3,
-            num_warps=8),
+            num_warps=8,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=2),
+            num_warps=2,
+        ),
     ],
-    key=['chunk_size', 'K', 'IS_CAUSAL'],
+    key=["chunk_size", "K", "IS_CAUSAL"],
 )
 @triton.jit
 def _bmm_chunk_fwd_kernel(
@@ -96,7 +67,7 @@ def _bmm_chunk_fwd_kernel(
     a_ptr,
     b_ptr,
     out_ptr,
-    seq_idx_ptr,
+    cu_chunk_seqlens_ptr,
     # Matrix dimensions
     seqlen,
     chunk_size: tl.constexpr,
@@ -112,7 +83,6 @@ def _bmm_chunk_fwd_kernel(
     stride_out_head: tl.int64,
     stride_outm: tl.int64,
     stride_outn: tl.constexpr,
-    stride_seq_idx_seqlen: tl.constexpr,
     # Meta-parameters
     IS_CAUSAL: tl.constexpr,
     dot_dtype: tl.constexpr,
@@ -129,32 +99,36 @@ def _bmm_chunk_fwd_kernel(
     if IS_CAUSAL:
         if pid_n * BLOCK_SIZE_N >= (pid_m + 1) * BLOCK_SIZE_M:
             return
-    a_ptr += pid_c * chunk_size * stride_a_seqlen + pid_h * stride_a_head
-    b_ptr += pid_c * chunk_size * stride_b_seqlen + pid_h * stride_b_head
 
-    seq_idx_ptr += pid_c * chunk_size * stride_seq_idx_seqlen
+    chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c)
+    chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1)
+
+    a_ptr += chunk_seqlen_start * stride_a_seqlen + pid_h * stride_a_head
+    b_ptr += chunk_seqlen_start * stride_b_seqlen + pid_h * stride_b_head
 
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_m[:, None] * stride_a_seqlen +
-                      offs_k[None, :] * stride_ak)
-    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk +
-                      offs_n[None, :] * stride_b_seqlen)
-    chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
+    a_ptrs = a_ptr + (offs_m[:, None] * stride_a_seqlen + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_b_seqlen)
+    chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start
 
     acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
 
     # compute a * b.T
     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        a = tl.load(a_ptrs,
-                    mask=(offs_m[:, None] < chunk_size_limit) &
-                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
-                    other=0.0).to(dot_dtype)
-        b = tl.load(b_ptrs,
-                    mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K) &
-                    (offs_n[None, :] < chunk_size_limit),
-                    other=0.0).to(dot_dtype)
+        a = tl.load(
+            a_ptrs,
+            mask=(offs_m[:, None] < chunk_size_limit)
+            & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        ).to(dot_dtype)
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K)
+            & (offs_n[None, :] < chunk_size_limit),
+            other=0.0,
+        ).to(dot_dtype)
         acc += tl.dot(a, b)
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
@@ -162,32 +136,23 @@ def _bmm_chunk_fwd_kernel(
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
 
-    # Zero out the results that are not from the same request
-    # in the varlen batch
-    seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
-                        mask=offs_m < chunk_size_limit,
-                        other=-1)
-    seq_idx_n = tl.load(seq_idx_ptr + offs_n * stride_seq_idx_seqlen,
-                        mask=offs_n < chunk_size_limit,
-                        other=-2)
-    acc = tl.where(seq_idx_m[:, None] == seq_idx_n[None, :], acc, 0.0)
-
     out = acc.to(out_ptr.dtype.element_ty)
     out_ptr += pid_c * stride_out_chunk + pid_h * stride_out_head
-    out_ptrs = out_ptr + (stride_outm * offs_m[:, None] +
-                          offs_n[None, :] * stride_outn)
-    tl.store(out_ptrs,
-             out,
-             mask=(offs_m[:, None] < chunk_size) &
-             (offs_n[None, :] < chunk_size))
+    out_ptrs = out_ptr + (stride_outm * offs_m[:, None] + offs_n[None, :] * stride_outn)
+    tl.store(
+        out_ptrs,
+        out,
+        mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size),
+    )
 
 
-def _bmm_chunk_fwd(a, b, chunk_size, seq_idx, causal=False, output_dtype=None):
+def _bmm_chunk_fwd(a, b, chunk_size, cu_chunk_seqlens, causal=False, output_dtype=None):
     """
     Argument:
         a: (seqlen, ngroups, k)
         b: (seqlen, ngroups, k)
-        seq_idx: (seqlen,). out[i, j] for seq_idx[i] != seq_idx[j] will be zeroed out.
+        chunk_size: int
+        cu_chunk_seq_lens: (nchunks+1,)
         causal: if True, then out[i, j] for i > j will be arbitrary, only out[i, j] for i <= j are
             guaranteed to be correct.
     Return:
@@ -195,32 +160,37 @@ def _bmm_chunk_fwd(a, b, chunk_size, seq_idx, causal=False, output_dtype=None):
     """
     seqlen, ngroups, k = a.shape
     assert b.shape == a.shape
-    assert seq_idx is not None
-    assert seq_idx.shape == (seqlen, )
     if a.stride(-1) != 1 and a.stride(0) != 1:
         a = a.contiguous()
     if b.stride(-1) != 1 and b.stride(0) != 1:
         b = b.contiguous()
 
-    nchunks = math.ceil(seqlen / chunk_size)
+    nchunks = len(cu_chunk_seqlens) - 1
     # Allocates output.
     out_dtype = a.dtype if output_dtype is None else output_dtype
-    out = torch.empty((nchunks, ngroups, chunk_size, chunk_size),
-                      device=a.device,
-                      dtype=out_dtype)
-    dot_dtype = (tl.bfloat16
-                 if a.dtype == torch.bfloat16 or b.dtype == torch.bfloat16 else
-                 (tl.float16 if a.dtype == torch.float16
-                  or b.dtype == torch.float16 else tl.float32))
-    grid = lambda META: (triton.cdiv(
-        chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(
-            chunk_size, META['BLOCK_SIZE_N']), nchunks * ngroups)
+    out = torch.empty(
+        (nchunks, ngroups, chunk_size, chunk_size), device=a.device, dtype=out_dtype
+    )
+    dot_dtype = (
+        tl.bfloat16
+        if a.dtype == torch.bfloat16 or b.dtype == torch.bfloat16
+        else (
+            tl.float16
+            if a.dtype == torch.float16 or b.dtype == torch.float16
+            else tl.float32
+        )
+    )
+    grid = lambda META: (
+        triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
+        * triton.cdiv(chunk_size, META["BLOCK_SIZE_N"]),
+        nchunks * ngroups,
+    )
     with torch.cuda.device(a.device.index):
         _bmm_chunk_fwd_kernel[grid](
             a_ptr=a,
             b_ptr=b,
             out_ptr=out,
-            seq_idx_ptr=seq_idx,
+            cu_chunk_seqlens_ptr=cu_chunk_seqlens,
             seqlen=seqlen,
             chunk_size=chunk_size,
             K=k,
@@ -235,7 +205,6 @@ def _bmm_chunk_fwd(a, b, chunk_size, seq_idx, causal=False, output_dtype=None):
             stride_out_head=out.stride(1),
             stride_outm=out.stride(-2),
             stride_outn=out.stride(-1),
-            stride_seq_idx_seqlen=seq_idx.stride(0),
             IS_CAUSAL=causal,
             dot_dtype=dot_dtype,
         )
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
index add72617fcea5fe2ef9521cb56ad8e0cb3f949ca..661c884627b00a9cd660954f9ad52a6e542cc5fd 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
@@ -10,101 +10,68 @@ from packaging import version
 
 from vllm.triton_utils import tl, triton
 
-TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0')
+TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
 
 
 @triton.autotune(
     configs=[
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 64
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
             num_stages=3,
-            num_warps=8),
+            num_warps=8,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 64
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 64
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=2),
+            num_warps=2,
+        ),
     ],
-    key=['chunk_size', 'hdim', 'dstate', 'IS_CAUSAL'],
+    key=["chunk_size", "hdim", "dstate", "IS_CAUSAL"],
 )
 @triton.jit
 def _chunk_scan_fwd_kernel(
@@ -120,9 +87,7 @@ def _chunk_scan_fwd_kernel(
     states_ptr,
     D_ptr,
     initstates_ptr,
-    chunk_indices_ptr,
-    chunk_offsets_ptr,
-    chunk_meta_num,
+    cu_chunk_seqlens_ptr,
     # Matrix dimensions
     chunk_size: tl.constexpr,
     hdim: tl.constexpr,
@@ -149,7 +114,7 @@ def _chunk_scan_fwd_kernel(
     stride_dA_cs_chunk: tl.int64,
     stride_dA_cs_head: tl.int64,
     stride_dA_cs_csize: tl.constexpr,
-    stride_seq_idx_seqlen: tl.constexpr,
+    stride_seq_idx_chunk: tl.constexpr,
     stride_C_seqlen: tl.int64,
     stride_C_head: tl.int64,
     stride_C_dstate: tl.constexpr,
@@ -175,240 +140,209 @@ def _chunk_scan_fwd_kernel(
     HAS_INITSTATES: tl.constexpr,
 ):
     pid_c = tl.program_id(axis=1).to(tl.int64)
-    if not HAS_INITSTATES:
-        c_idx = pid_c
-        c_off = 0
-    else:
-        c_idx = tl.load(chunk_indices_ptr + pid_c, mask=pid_c > -1, other=0)
-        c_off = tl.load(chunk_offsets_ptr + pid_c, mask=pid_c > -1, other=0)
-
     pid_h = tl.program_id(axis=2)
     num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
     pid_m = tl.program_id(axis=0) // num_pid_n
     pid_n = tl.program_id(axis=0) % num_pid_n
-    cb_ptr += c_idx * stride_cb_chunk + (pid_h //
-                                         nheads_ngroups_ratio) * stride_cb_head
-    x_ptr += c_idx * chunk_size * stride_x_seqlen + pid_h * stride_x_head
-    dt_ptr += c_idx * stride_dt_chunk + pid_h * stride_dt_head
-    dA_cumsum_ptr += c_idx * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
-    C_ptr += c_idx * chunk_size * stride_C_seqlen + (
-        pid_h // nheads_ngroups_ratio) * stride_C_head
+    cb_ptr += pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
+    chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c)
+    chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1)
+    x_ptr += chunk_seqlen_start * stride_x_seqlen + pid_h * stride_x_head
+    dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
+    dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
+    C_ptr += (
+        chunk_seqlen_start * stride_C_seqlen
+        + (pid_h // nheads_ngroups_ratio) * stride_C_head
+    )
 
     # M-block offsets and prev states
     #  - logic in next block may override these if there is an active offset
-    offs_m = pid_m * BLOCK_SIZE_M + c_off + tl.arange(0, BLOCK_SIZE_M)
-    prev_states_ptr = states_ptr + c_idx * stride_states_chunk + pid_h * stride_states_head
-    prev_states_hdim = stride_states_hdim
-    prev_states_dstate = stride_states_dstate
-
-    chunk_size_limit = min(chunk_size, seqlen - c_idx * chunk_size)
-
-    seq_idx_ptr += c_idx * chunk_size * stride_seq_idx_seqlen
-    # - we only need seq_idx_prev to be aligned to chunk boundary
-    seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen,
-                           mask=c_idx >= 1,
-                           other=0)
-
-    if HAS_INITSTATES:
-        # if there are init states, we only need seq_idx_m to point
-        # what is the current seq_idx
-
-        # get current seq idx
-        if (pid_m * BLOCK_SIZE_M + c_off) < chunk_size_limit:
-            seq_idx_m = tl.load(
-                seq_idx_ptr +
-                (pid_m * BLOCK_SIZE_M + c_off) * stride_seq_idx_seqlen, )
-
-            # - recall that in ssd_state_passing, for the case c_off == 0
-            # i.e., the very first sequence, we made states_ptr hold its initial state
-            # so this edge case is taken care of
-            if ((c_off == 0) and (seq_idx_prev != seq_idx_m
-                                  )  # if a seq is changed exactly on boundary
-                    or (c_off > 0)  # implies a new example (pseudo chunk)
-                ):
-
-                # - replace prev_states_ptr with init_states
-                prev_states_ptr = initstates_ptr + seq_idx_m * stride_init_states_batch + pid_h * stride_init_states_head
-                prev_states_hdim = stride_init_states_hdim  # override strides
-                prev_states_dstate = stride_init_states_dstate
-
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize,
-                      mask=offs_m < chunk_size,
-                      other=0.0).to(tl.float32)
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
 
-    # - handle chunk state limit
-    if HAS_INITSTATES:
-        # have to split this if otherwise compilation will have problems
-        dA_cs_m_boundary = 0.0
+    seq_idx_ptr += pid_c * stride_seq_idx_chunk
+    seq_idx = tl.load(seq_idx_ptr)
+    seq_idx_prev = tl.load(
+        seq_idx_ptr - stride_seq_idx_chunk, mask=pid_c >= 1, other=-1
+    )
 
-        # get the c_idx for the next (logica) chunk
-        c_idx_n = tl.load(
-            chunk_indices_ptr + (pid_c + 1),
-            mask=pid_c > -1 and (pid_c + 1) < chunk_meta_num,
-            other=-1  # to trigger different chunk
+    if HAS_INITSTATES and (seq_idx != seq_idx_prev):
+        prev_states_ptr = (
+            initstates_ptr
+            + seq_idx * stride_init_states_batch
+            + pid_h * stride_init_states_head
         )
+        prev_states_hdim = stride_init_states_hdim
+        prev_states_dstate = stride_init_states_dstate
+    else:
+        prev_states_ptr = (
+            states_ptr + (pid_c - 1) * stride_states_chunk + pid_h * stride_states_head
+        )
+        prev_states_hdim = stride_states_hdim
+        prev_states_dstate = stride_states_dstate
 
-        # - there are things to consider
-        # A. if c_off > 0 then we need to move the dA_cs boundary to ensure correct
-        #    contribution of past states
-        # B. if c_off_n < chunk_size_limit, then we need to adjust this so as not to
-        #    encroach into the next sequence, where c_off_n is the offset of the next
-        #    (logical) chunk.
-        # An equivalent check for B is c_idx == c_idx_n, where there is repetition in
-        # (logical) chunk indices.
-
-        if (c_idx == c_idx_n) or c_off > 0:
-
-            # get the next offset
-            c_off_n = tl.load(chunk_offsets_ptr + (pid_c + 1),
-                              mask=pid_c > -1 and (pid_c + 1) < chunk_meta_num,
-                              other=chunk_size)
-
-            # in this case, adjust down the chunk_size_limit
-            if c_idx == c_idx_n:
-                chunk_size_limit = min(c_off_n, chunk_size_limit)
+    chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start
 
-            # get the cs at the offset boundary
-            # - c_off == 0 is a passthrough
-            # - We need dA_cs at the boundary, defined by c_off - no need
-            #   to increase pointer by pid_m (it is a constant offset,
-            #   i.e. the same for all blocks)
-            dA_cs_m_boundary = tl.load(
-                dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize,
-                mask=(((c_off - 1) > -1) and ((c_off) < chunk_size)),
-                other=0.0).to(tl.float32)
-    else:
-        # - handle seq idx when HAS_INITSTATES==False
-        seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
-                            mask=offs_m < chunk_size_limit,
-                            other=-1)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    dA_cs_m = tl.load(
+        dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0
+    ).to(tl.float32)
 
     acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
 
-    # Without the if (pid_c > -1), with Triton 2.1.0, I get
-    # Assertion `!(srcMmaLayout && dstMmaLayout) && "Unexpected mma -> mm a layout conversion"' failed.
-    # With Triton 2.2.0, this works
-    if IS_TRITON_22 or c_idx > -1:
-        # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
-        offs_k_dstate = tl.arange(
-            0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
-        C_ptrs = C_ptr + (offs_m[:, None] * stride_C_seqlen +
-                          offs_k_dstate[None, :] * stride_C_dstate)
+    offs_out_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
 
-        prev_states_ptrs = prev_states_ptr + (
-            offs_n[None, :] * prev_states_hdim +
-            offs_k_dstate[:, None] * prev_states_dstate)
+    # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
+    offs_k_dstate = tl.arange(
+        0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
+    )
+    C_ptrs = C_ptr + (
+        offs_m[:, None] * stride_C_seqlen + offs_k_dstate[None, :] * stride_C_dstate
+    )
 
-        if not HAS_INITSTATES:
-            # - this is for continuous batching where there is no init states
-            scale_m = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), 0.0)
+    scale_m = tl.exp(dA_cs_m)
+    if BLOCK_SIZE_DSTATE <= 128:
+        C = tl.load(
+            C_ptrs,
+            mask=(offs_m[:, None] < chunk_size_limit)
+            & (offs_k_dstate[None, :] < dstate),
+            other=0.0,
+        )
+
+        if not HAS_INITSTATES and (seq_idx != seq_idx_prev):
+            # if no init states AND starting a new sequence, we need zeros
+            prev_states = tl.zeros(
+                (BLOCK_SIZE_DSTATE, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty
+            )
         else:
-            # - if there is initstates, we will rely on prev_states, no zeroing
-            #   required.
-            scale_m = tl.exp(dA_cs_m - dA_cs_m_boundary)
+            # otherwise read the previous state
+            prev_states_ptrs = (
+                prev_states_ptr
+                + offs_n[None, :] * prev_states_hdim
+                + offs_k_dstate[:, None] * prev_states_dstate
+            )
+            prev_states = tl.load(
+                prev_states_ptrs,
+                mask=(offs_k_dstate[:, None] < dstate) & (offs_n[None, :] < hdim),
+                other=0.0,
+            )
+            prev_states = prev_states.to(C_ptr.dtype.element_ty)
 
-        if BLOCK_SIZE_DSTATE <= 128:
-            C = tl.load(C_ptrs,
-                        mask=(offs_m[:, None] < chunk_size_limit) &
-                        (offs_k_dstate[None, :] < dstate),
-                        other=0.0)
+        acc = tl.dot(C, prev_states) * scale_m[:, None]
 
-            prev_states = tl.load(prev_states_ptrs,
-                                  mask=(offs_k_dstate[:, None] < dstate) &
-                                  (offs_n[None, :] < hdim),
-                                  other=0.0)
-            prev_states = prev_states.to(C_ptr.dtype.element_ty)
-            acc = tl.dot(C, prev_states) * scale_m[:, None]
-        else:
-            for k in range(0, dstate, BLOCK_SIZE_K):
-                C = tl.load(C_ptrs,
-                            mask=(offs_m[:, None] < chunk_size_limit) &
-                            (offs_k_dstate[None, :] < dstate - k),
-                            other=0.0)
-                # C = (C * scale_m[:, None]).to(C_ptr.dtype.element_ty)
+    else:
+        prev_states_ptrs = (
+            prev_states_ptr
+            + offs_n[None, :] * prev_states_hdim
+            + offs_k_dstate[:, None] * prev_states_dstate
+        )
+        for k in range(0, dstate, BLOCK_SIZE_K):
+            C = tl.load(
+                C_ptrs,
+                mask=(offs_m[:, None] < chunk_size_limit)
+                & (offs_k_dstate[None, :] < dstate - k),
+                other=0.0,
+            )
+            if not HAS_INITSTATES and (seq_idx != seq_idx_prev):
+                prev_states = tl.zeros(
+                    (BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty
+                )
+            else:
                 prev_states = tl.load(
                     prev_states_ptrs,
-                    mask=(offs_k_dstate[:, None] < dstate - k) &
-                    (offs_n[None, :] < hdim),
-                    other=0.0)
+                    mask=(offs_k_dstate[:, None] < dstate - k)
+                    & (offs_n[None, :] < hdim),
+                    other=0.0,
+                )
                 prev_states = prev_states.to(C_ptr.dtype.element_ty)
-                acc += tl.dot(C, prev_states)
-                C_ptrs += BLOCK_SIZE_K
-                prev_states_ptrs += BLOCK_SIZE_K
-            acc *= scale_m[:, None]
-
-    offs_k = tl.arange(0, BLOCK_SIZE_K) + c_off
-    cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m +
-                        offs_k[None, :] * stride_cb_csize_k)
-    x_ptrs = x_ptr + (offs_k[:, None] * stride_x_seqlen +
-                      offs_n[None, :] * stride_x_hdim)
+            acc += tl.dot(C, prev_states)
+            C_ptrs += BLOCK_SIZE_K
+            prev_states_ptrs += BLOCK_SIZE_K
+        acc *= scale_m[:, None]
+
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    cb_ptrs = cb_ptr + (
+        offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k
+    )
+    x_ptrs = x_ptr + (
+        offs_k[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
+    )
     dt_ptrs = dt_ptr + offs_k * stride_dt_csize
     dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
-    K_MAX = chunk_size_limit if not IS_CAUSAL else min(
-        (pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit)
+    K_MAX = (
+        chunk_size_limit
+        if not IS_CAUSAL
+        else min((pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit)
+    )
     for k in range(0, K_MAX, BLOCK_SIZE_K):
-        cb = tl.load(cb_ptrs,
-                     mask=(offs_m[:, None] < chunk_size) &
-                     (offs_k[None, :] < chunk_size - k),
-                     other=0.0).to(tl.float32)
-        dA_cs_k = tl.load(dA_cumsum_ptrs,
-                          mask=offs_k < chunk_size - k,
-                          other=0.0).to(tl.float32)
+        cb = tl.load(
+            cb_ptrs,
+            mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < chunk_size - k),
+            other=0.0,
+        ).to(tl.float32)
+        dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(
+            tl.float32
+        )
         # If there's seq_idx, we already set cb[i, j] = 0 for seq_idx[i] != seq_idx[j].
         # So we don't need masking wrt seq_idx here.
         cb *= tl.exp(dA_cs_m[:, None] - dA_cs_k[None, :])
-        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size - k,
-                       other=0.0).to(tl.float32)
+        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(tl.float32)
         cb *= dt_k
         if IS_CAUSAL:
             mask = offs_m[:, None] >= k + offs_k[None, :]
             cb = tl.where(mask, cb, 0.0)
         cb = cb.to(x_ptr.dtype.element_ty)
-        x = tl.load(x_ptrs,
-                    mask=(offs_k[:, None] < chunk_size_limit - k) &
-                    (offs_n[None, :] < hdim),
-                    other=0.0)
+        x = tl.load(
+            x_ptrs,
+            mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < hdim),
+            other=0.0,
+        )
         acc += tl.dot(cb, x)
         cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
         x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
         dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
         dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
 
-    offs_out_m = pid_m * BLOCK_SIZE_M + c_off + tl.arange(0, BLOCK_SIZE_M)
+    offs_out_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
 
     if HAS_D:
         if D_HAS_HDIM:
-            D = tl.load(D_ptr + pid_h * stride_D_head + offs_n,
-                        mask=offs_n < hdim,
-                        other=0.0).to(tl.float32)
+            D = tl.load(
+                D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0
+            ).to(tl.float32)
         else:
             D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
-        x_residual = tl.load(x_ptr + (offs_m[:, None] * stride_x_seqlen +
-                                      offs_n[None, :] * stride_x_hdim),
-                             mask=(offs_m[:, None] < chunk_size_limit) &
-                             (offs_n[None, :] < hdim),
-                             other=0.0).to(tl.float32)
+        x_residual = tl.load(
+            x_ptr
+            + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim),
+            mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
+            other=0.0,
+        ).to(tl.float32)
         acc += x_residual * D
 
     if HAS_Z:
-        z_ptr += c_idx * chunk_size * stride_z_seqlen + pid_h * stride_z_head
-        z_ptrs = z_ptr + (stride_z_seqlen * offs_out_m[:, None] +
-                          stride_z_hdim * offs_out_n[None, :])
-        z = tl.load(z_ptrs,
-                    mask=(offs_out_m[:, None] < chunk_size_limit) &
-                    (offs_out_n[None, :] < hdim),
-                    other=0.0).to(tl.float32)
+        z_ptr += chunk_seqlen_start * stride_z_seqlen + pid_h * stride_z_head
+        z_ptrs = z_ptr + (
+            stride_z_seqlen * offs_out_m[:, None] + stride_z_hdim * offs_out_n[None, :]
+        )
+        z = tl.load(
+            z_ptrs,
+            mask=(offs_out_m[:, None] < chunk_size_limit)
+            & (offs_out_n[None, :] < hdim),
+            other=0.0,
+        ).to(tl.float32)
         acc *= z * tl.sigmoid(z)
 
-    out_ptr += c_idx * chunk_size * stride_out_seqlen + pid_h * stride_out_head
-    out_ptrs = out_ptr + (stride_out_seqlen * offs_out_m[:, None] +
-                          offs_out_n[None, :] * stride_out_hdim)
-    tl.store(out_ptrs,
-             acc,
-             mask=(offs_out_m[:, None] < chunk_size_limit) &
-             (offs_out_n[None, :] < hdim))
+    out_ptr += chunk_seqlen_start * stride_out_seqlen + pid_h * stride_out_head
+    out_ptrs = out_ptr + (
+        stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :] * stride_out_hdim
+    )
+    tl.store(
+        out_ptrs,
+        acc,
+        mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim),
+    )
 
 
 def _chunk_scan_fwd(
@@ -418,12 +352,11 @@ def _chunk_scan_fwd(
     dA_cumsum,
     C,
     states,
+    cu_chunk_seqlens,
     out,
     seq_idx,
     D=None,
     z=None,
-    chunk_indices=None,
-    chunk_offsets=None,
     initial_states=None,
 ):
     assert seq_idx is not None, "this implementation requires seq_idx"
@@ -435,34 +368,32 @@ def _chunk_scan_fwd(
     assert C.shape == (seqlen, ngroups, dstate)
     assert cb.shape == (nchunks, ngroups, chunk_size, chunk_size)
     if D is not None:
-        assert D.shape == (nheads, headdim) or D.shape == (nheads, )
+        assert D.shape == (nheads, headdim) or D.shape == (nheads,)
     if z is not None:
         assert z.shape == x.shape
     assert dt.shape == (nheads, nchunks, chunk_size)
     assert dA_cumsum.shape == (nheads, nchunks, chunk_size)
     assert states.shape == (nchunks, nheads, headdim, dstate)
-    assert seq_idx.shape == (seqlen, )
-
-    if initial_states is not None:
-        # with initial states, we need to take care of how
-        # seq_idx crosses the boundaries
-        assert chunk_indices is not None and chunk_offsets is not None, \
-            "chunk_indices and chunk_offsets should have been set"
-    else:
-        chunk_indices, chunk_offsets = None, None
+    assert seq_idx.shape == (nchunks,)
 
     grid = lambda META: (
-        triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(
-            headdim, META['BLOCK_SIZE_N']), nchunks
-        if chunk_offsets is None else len(chunk_offsets), nheads)
+        triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
+        * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
+        nchunks,
+        nheads,
+    )
 
-    z_strides = ((z.stride(0), z.stride(1), z.stride(2)) if z is not None else
-                 (0, 0, 0))
-    initial_states_strides = ((initial_states.stride(0),
-                               initial_states.stride(1),
-                               initial_states.stride(2),
-                               initial_states.stride(3))
-                              if initial_states is not None else (0, 0, 0, 0))
+    z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
+    initial_states_strides = (
+        (
+            initial_states.stride(0),
+            initial_states.stride(1),
+            initial_states.stride(2),
+            initial_states.stride(3),
+        )
+        if initial_states is not None
+        else (0, 0, 0, 0)
+    )
 
     _chunk_scan_fwd_kernel[grid](
         cb_ptr=cb,
@@ -476,9 +407,7 @@ def _chunk_scan_fwd(
         states_ptr=states,
         D_ptr=D,
         initstates_ptr=initial_states,
-        chunk_indices_ptr=chunk_indices,
-        chunk_offsets_ptr=chunk_offsets,
-        chunk_meta_num=len(chunk_indices) if chunk_indices is not None else 0,
+        cu_chunk_seqlens_ptr=cu_chunk_seqlens,
         chunk_size=chunk_size,
         hdim=headdim,
         dstate=dstate,
@@ -503,7 +432,7 @@ def _chunk_scan_fwd(
         stride_dA_cs_chunk=dA_cumsum.stride(1),
         stride_dA_cs_head=dA_cumsum.stride(0),
         stride_dA_cs_csize=dA_cumsum.stride(2),
-        stride_seq_idx_seqlen=seq_idx.stride(0),
+        stride_seq_idx_chunk=seq_idx.stride(0),
         stride_C_seqlen=C.stride(0),
         stride_C_head=C.stride(1),
         stride_C_dstate=C.stride(2),
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
index 8ee41f2cbc1b81696c04bcb265d24ec3f91f2745..11cc125bf219cd4595574e4252cfa365f5d8aa9e 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
@@ -6,8 +6,6 @@
 
 # ruff: noqa: E501
 
-import math
-
 import torch
 
 from vllm.triton_utils import tl, triton
@@ -17,14 +15,14 @@ from .mamba_ssm import softplus
 
 @triton.autotune(
     configs=[
-        triton.Config({'BLOCK_SIZE_H': 2}),
-        triton.Config({'BLOCK_SIZE_H': 4}),
-        triton.Config({'BLOCK_SIZE_H': 8}),
-        triton.Config({'BLOCK_SIZE_H': 16}),
-        triton.Config({'BLOCK_SIZE_H': 32}),
-        triton.Config({'BLOCK_SIZE_H': 64}),
+        triton.Config({"BLOCK_SIZE_H": 2}),
+        triton.Config({"BLOCK_SIZE_H": 4}),
+        triton.Config({"BLOCK_SIZE_H": 8}),
+        triton.Config({"BLOCK_SIZE_H": 16}),
+        triton.Config({"BLOCK_SIZE_H": 32}),
+        triton.Config({"BLOCK_SIZE_H": 64}),
     ],
-    key=['chunk_size', 'nheads'],
+    key=["chunk_size", "nheads"],
 )
 @triton.jit
 def _chunk_cumsum_fwd_kernel(
@@ -34,6 +32,7 @@ def _chunk_cumsum_fwd_kernel(
     dt_bias_ptr,
     dt_out_ptr,
     dA_cumsum_ptr,
+    cu_chunk_seqlens_ptr,
     # Matrix dimension
     seqlen,
     nheads: tl.constexpr,
@@ -61,124 +60,109 @@ def _chunk_cumsum_fwd_kernel(
     # https://github.com/triton-lang/triton/issues/1058
     pid_c = tl.program_id(axis=0).to(tl.int64)
     pid_h = tl.program_id(axis=1)
-    dt_ptr += pid_c * chunk_size * stride_dt_seqlen
+
+    chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c)
+    chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1)
+
+    dt_ptr += chunk_seqlen_start * stride_dt_seqlen
     dt_out_ptr += pid_c * stride_dt_out_chunk
     dA_cumsum_ptr += pid_c * stride_dA_cs_chunk
 
     offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
     offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
-    dt_ptrs = dt_ptr + (offs_h[:, None] * stride_dt_head +
-                        offs_c[None, :] * stride_dt_seqlen)
+    dt_ptrs = dt_ptr + (
+        offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
+    )
     A_ptrs = A_ptr + offs_h * stride_A_head
-    dt_out_ptrs = dt_out_ptr + (offs_h[:, None] * stride_dt_out_head +
-                                offs_c[None, :] * stride_dt_out_csize)
-    dA_cs_ptrs = dA_cumsum_ptr + (offs_h[:, None] * stride_dA_cs_head +
-                                  offs_c[None, :] * stride_dA_cs_csize)
-    chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
-
-    dt = tl.load(dt_ptrs,
-                 mask=(offs_h[:, None] < nheads) &
-                 (offs_c[None, :] < chunk_size_limit),
-                 other=0.0).to(tl.float32)
+    dt_out_ptrs = dt_out_ptr + (
+        offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize
+    )
+    dA_cs_ptrs = dA_cumsum_ptr + (
+        offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize
+    )
+    chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start
+
+    dt = tl.load(
+        dt_ptrs,
+        mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
+        other=0.0,
+    ).to(tl.float32)
     if HAS_DT_BIAS:
-        dt_bias = tl.load(dt_bias_ptr + offs_h * stride_dt_bias_head,
-                          mask=offs_h < nheads,
-                          other=0.0).to(tl.float32)
+        dt_bias = tl.load(
+            dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
+        ).to(tl.float32)
         dt += dt_bias[:, None]
     if DT_SOFTPLUS:
         dt = tl.where(dt <= 20.0, softplus(dt), dt)
 
     dt = tl.clamp(dt, dt_min, dt_max)
     dt = tl.where(
-        (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt,
-        0.0)
-    tl.store(dt_out_ptrs,
-             dt,
-             mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size))
+        (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
+    )
+    tl.store(
+        dt_out_ptrs,
+        dt,
+        mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
+    )
     A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
     dA = dt * A[:, None]
     dA_cs = tl.cumsum(dA, axis=1)
-    tl.store(dA_cs_ptrs,
-             dA_cs,
-             mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size))
+    tl.store(
+        dA_cs_ptrs,
+        dA_cs,
+        mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
+    )
 
 
 @triton.autotune(
     configs=[
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 64
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
             num_stages=3,
-            num_warps=8),
+            num_warps=8,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=2),
+            num_warps=2,
+        ),
     ],
-    key=['hdim', 'dstate', 'chunk_size'],
+    key=["hdim", "dstate", "chunk_size"],
 )
 @triton.jit
 def _chunk_state_fwd_kernel(
@@ -188,7 +172,7 @@ def _chunk_state_fwd_kernel(
     states_ptr,
     dt_ptr,
     dA_cumsum_ptr,
-    seq_idx_ptr,
+    cu_chunk_seqlens_ptr,
     # Matrix dimensions
     hdim: tl.constexpr,
     dstate: tl.constexpr,
@@ -212,7 +196,6 @@ def _chunk_state_fwd_kernel(
     stride_dA_cs_head: tl.int64,
     stride_dA_cs_chunk: tl.int64,
     stride_dA_cs_csize: tl.constexpr,
-    stride_seq_idx_seqlen: tl.constexpr,
     # Meta-parameters
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -223,53 +206,52 @@ def _chunk_state_fwd_kernel(
     num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
     pid_m = tl.program_id(axis=0) // num_pid_n
     pid_n = tl.program_id(axis=0) % num_pid_n
-    b_ptr += pid_c * chunk_size * stride_b_seqlen + (
-        pid_h // nheads_ngroups_ratio) * stride_b_head
-    x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
+    chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c)
+    chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1)
+    b_ptr += (
+        chunk_seqlen_start * stride_b_seqlen
+        + (pid_h // nheads_ngroups_ratio) * stride_b_head
+    )
+    x_ptr += chunk_seqlen_start * stride_x_seqlen + pid_h * stride_x_head
     dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
     dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
 
-    seq_idx_ptr += pid_c * chunk_size * stride_seq_idx_seqlen
-
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim +
-                      offs_k[None, :] * stride_x_seqlen)
-    b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate +
-                      offs_k[:, None] * stride_b_seqlen)
+    x_ptrs = x_ptr + (
+        offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
+    )
+    b_ptrs = b_ptr + (
+        offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
+    )
     dt_ptrs = dt_ptr + offs_k * stride_dt_csize
-    dA_cs_last = tl.load(dA_cumsum_ptr +
-                         (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
+    dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
+        tl.float32
+    )
     dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
 
-    seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
-    chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
-    seq_idx_last = tl.load(seq_idx_ptr +
-                           (chunk_size_limit - 1) * stride_seq_idx_seqlen)
+    chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start
 
     acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
-        x = tl.load(x_ptrs,
-                    mask=(offs_m[:, None] < hdim) &
-                    (offs_k[None, :] < chunk_size_limit - k),
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=(offs_k[:, None] < chunk_size_limit - k) &
-                    (offs_n[None, :] < dstate),
-                    other=0.0).to(tl.float32)
-        dA_cs_k = tl.load(dA_cumsum_ptrs,
-                          mask=offs_k < chunk_size_limit - k,
-                          other=0.0).to(tl.float32)
-
-        seq_idx_k = tl.load(seq_idx_ptrs,
-                            mask=offs_k < chunk_size_limit - k,
-                            other=-1)
-        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k,
-                       other=0.0).to(tl.float32)
-
-        scale = tl.where(seq_idx_k == seq_idx_last,
-                         tl.exp(dA_cs_last - dA_cs_k) * dt_k, 0.0)
+        x = tl.load(
+            x_ptrs,
+            mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k),
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate),
+            other=0.0,
+        ).to(tl.float32)
+        dA_cs_k = tl.load(
+            dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
+        ).to(tl.float32)
+        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
+            tl.float32
+        )
+        scale = tl.exp(dA_cs_last - dA_cs_k) * dt_k
         b *= scale[:, None]
         b = b.to(x_ptr.dtype.element_ty)
         acc += tl.dot(x, b)
@@ -278,15 +260,15 @@ def _chunk_state_fwd_kernel(
         b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
         dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
         dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
-        seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
 
     states = acc.to(states_ptr.dtype.element_ty)
 
     states_ptr += pid_c * stride_states_chunk + pid_h * stride_states_head
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim +
-                                offs_n[None, :] * stride_states_dstate)
+    states_ptrs = states_ptr + (
+        offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
+    )
     c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
     tl.store(states_ptrs, states, mask=c_mask)
 
@@ -294,79 +276,52 @@ def _chunk_state_fwd_kernel(
 @triton.autotune(
     configs=[
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 64
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
             num_stages=3,
-            num_warps=8),
+            num_warps=8,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 256,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 128,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=4),
+            num_warps=4,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=5,
-            num_warps=2),
+            num_warps=2,
+        ),
         triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32
-            },
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
             num_stages=4,
-            num_warps=2),
+            num_warps=2,
+        ),
     ],
-    key=['hdim', 'dstate', 'chunk_size'],
+    key=["hdim", "dstate", "chunk_size"],
 )
 @triton.jit
 def _chunk_state_varlen_kernel(
@@ -422,12 +377,16 @@ def _chunk_state_varlen_kernel(
     pid_n = tl.program_id(axis=0) % num_pid_n
     end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
     pid_c = (end_idx - 1) // chunk_size
-    b_ptr += pid_c * chunk_size * stride_b_seqlen + (
-        pid_h // nheads_ngroups_ratio) * stride_b_head
+    b_ptr += (
+        pid_c * chunk_size * stride_b_seqlen
+        + (pid_h // nheads_ngroups_ratio) * stride_b_head
+    )
     x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
     dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
     dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
-    chunk_states_ptr += pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
+    chunk_states_ptr += (
+        pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
+    )
 
     if HAS_INITSTATES:
         # if there are init states provided, we differentiate between states (which
@@ -438,13 +397,16 @@ def _chunk_state_varlen_kernel(
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim +
-                      offs_k[None, :] * stride_x_seqlen)
-    b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate +
-                      offs_k[:, None] * stride_b_seqlen)
+    x_ptrs = x_ptr + (
+        offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
+    )
+    b_ptrs = b_ptr + (
+        offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
+    )
     dt_ptrs = dt_ptr + offs_k * stride_dt_csize
-    dA_cs_last = tl.load(dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) *
-                         stride_dA_cs_csize).to(tl.float32)
+    dA_cs_last = tl.load(
+        dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
+    ).to(tl.float32)
     dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
 
     chunk_size_limit = end_idx - pid_c * chunk_size
@@ -453,24 +415,31 @@ def _chunk_state_varlen_kernel(
 
     acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
-        x = tl.load(x_ptrs,
-                    mask=(offs_m[:, None] < hdim) &
-                    (offs_k[None, :] < chunk_size_limit - k) &
-                    (offs_k[None, :] >= start_idx_cur - k),
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=(offs_k[:, None] < chunk_size_limit - k) &
-                    (offs_n[None, :] < dstate) &
-                    (offs_k[:, None] >= start_idx_cur - k),
-                    other=0.0).to(tl.float32)
-        dA_cs_k = tl.load(dA_cumsum_ptrs,
-                          mask=offs_k < chunk_size_limit - k,
-                          other=0.0).to(tl.float32)
-        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k,
-                       other=0.0).to(tl.float32)
+        x = tl.load(
+            x_ptrs,
+            mask=(offs_m[:, None] < hdim)
+            & (offs_k[None, :] < chunk_size_limit - k)
+            & (offs_k[None, :] >= start_idx_cur - k),
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < chunk_size_limit - k)
+            & (offs_n[None, :] < dstate)
+            & (offs_k[:, None] >= start_idx_cur - k),
+            other=0.0,
+        ).to(tl.float32)
+        dA_cs_k = tl.load(
+            dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
+        ).to(tl.float32)
+        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
+            tl.float32
+        )
         scale = tl.where(
             (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
-            tl.exp(dA_cs_last - dA_cs_k) * dt_k, 0.0)
+            tl.exp(dA_cs_last - dA_cs_k) * dt_k,
+            0.0,
+        )
         b *= scale[:, None]
         b = b.to(x_ptr.dtype.element_ty)
         acc += tl.dot(x, b)
@@ -483,39 +452,43 @@ def _chunk_state_varlen_kernel(
     # If HAS_INITSTATES==True need to consider two possibilities
     # - if start_idx < pid_c * chunk_size, then we need to take the past_states_ptrs
     # - if state_idx >= pid * chunk_size, then we need to insert initstates
-    if ((start_idx < pid_c * chunk_size)  # first chunk
-            or (HAS_INITSTATES)):
-
+    if (
+        (start_idx < pid_c * chunk_size)  # first chunk
+        or (HAS_INITSTATES)
+    ):
         dA_cs_boundary = 0.0  # default
 
         if not HAS_INITSTATES:
             past_states_ptrs = chunk_states_ptr + (
-                offs_m[:, None] * stride_chunk_states_hdim +
-                offs_n[None, :] * stride_chunk_states_dstate)
+                offs_m[:, None] * stride_chunk_states_hdim
+                + offs_n[None, :] * stride_chunk_states_dstate
+            )
         else:
-
             # - this seems repetitive, buts its to help the compiler
             if start_idx < pid_c * chunk_size:
                 past_states_ptrs = chunk_states_ptr + (
-                    offs_m[:, None] * stride_chunk_states_hdim +
-                    offs_n[None, :] * stride_chunk_states_dstate)
+                    offs_m[:, None] * stride_chunk_states_hdim
+                    + offs_n[None, :] * stride_chunk_states_dstate
+                )
             else:
                 past_states_ptrs = initstates_ptr + (
-                    pid_b * stride_init_states_batch +
-                    offs_m[:, None] * stride_init_states_hdim +
-                    offs_n[None, :] * stride_init_states_dstate)
+                    pid_b * stride_init_states_batch
+                    + offs_m[:, None] * stride_init_states_hdim
+                    + offs_n[None, :] * stride_init_states_dstate
+                )
 
                 # need to adjust the boundary
                 if start_idx > pid_c * chunk_size:
-                    dA_cs_boundary = tl.load(dA_cumsum_ptr +
-                                             (start_idx - pid_c * chunk_size -
-                                              1) * stride_dA_cs_csize).to(
-                                                  tl.float32)
+                    dA_cs_boundary = tl.load(
+                        dA_cumsum_ptr
+                        + (start_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
+                    ).to(tl.float32)
 
-        past_states = tl.load(past_states_ptrs,
-                              mask=(offs_m[:, None] < hdim) &
-                              (offs_n[None, :] < dstate),
-                              other=0.0).to(tl.float32)
+        past_states = tl.load(
+            past_states_ptrs,
+            mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate),
+            other=0.0,
+        ).to(tl.float32)
 
         scale = tl.exp(dA_cs_last - dA_cs_boundary)
         acc += past_states * scale
@@ -525,35 +498,34 @@ def _chunk_state_varlen_kernel(
     states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim +
-                                offs_n[None, :] * stride_states_dstate)
+    states_ptrs = states_ptr + (
+        offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
+    )
     c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
     tl.store(states_ptrs, states, mask=c_mask)
 
 
-def _chunk_cumsum_fwd(dt,
-                      A,
-                      chunk_size,
-                      dt_bias=None,
-                      dt_softplus=False,
-                      dt_limit=(0.0, float("inf"))):
+def _chunk_cumsum_fwd(
+    dt,
+    A,
+    chunk_size,
+    cu_chunk_seqlens,
+    dt_bias=None,
+    dt_softplus=False,
+    dt_limit=(0.0, float("inf")),
+):
     seqlen, nheads = dt.shape
-    assert A.shape == (nheads, )
+    assert A.shape == (nheads,)
     if dt_bias is not None:
-        assert dt_bias.shape == (nheads, )
-    nchunks = math.ceil(seqlen / chunk_size)
-    dt_out = torch.empty(nheads,
-                         nchunks,
-                         chunk_size,
-                         device=dt.device,
-                         dtype=torch.float32)
-    dA_cumsum = torch.empty(nheads,
-                            nchunks,
-                            chunk_size,
-                            device=dt.device,
-                            dtype=torch.float32)
-    grid_chunk_cs = lambda META: (nchunks,
-                                  triton.cdiv(nheads, META['BLOCK_SIZE_H']))
+        assert dt_bias.shape == (nheads,)
+    nchunks = cu_chunk_seqlens.shape[0] - 1
+    dt_out = torch.empty(
+        nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
+    )
+    dA_cumsum = torch.empty(
+        nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
+    )
+    grid_chunk_cs = lambda META: (nchunks, triton.cdiv(nheads, META["BLOCK_SIZE_H"]))
     with torch.cuda.device(dt.device.index):
         _chunk_cumsum_fwd_kernel[grid_chunk_cs](
             dt_ptr=dt,
@@ -561,6 +533,7 @@ def _chunk_cumsum_fwd(dt,
             dt_bias_ptr=dt_bias,
             dt_out_ptr=dt_out,
             dA_cumsum_ptr=dA_cumsum,
+            cu_chunk_seqlens_ptr=cu_chunk_seqlens,
             seqlen=seqlen,
             nheads=nheads,
             chunk_size=chunk_size,
@@ -569,8 +542,7 @@ def _chunk_cumsum_fwd(dt,
             stride_dt_seqlen=dt.stride(0),
             stride_dt_head=dt.stride(1),
             stride_A_head=A.stride(0),
-            stride_dt_bias_head=dt_bias.stride(0)
-            if dt_bias is not None else 0,
+            stride_dt_bias_head=dt_bias.stride(0) if dt_bias is not None else 0,
             stride_dt_out_head=dt_out.stride(0),
             stride_dt_out_chunk=dt_out.stride(1),
             stride_dt_out_csize=dt_out.stride(2),
@@ -584,13 +556,9 @@ def _chunk_cumsum_fwd(dt,
     return dA_cumsum, dt_out
 
 
-def _chunk_state_fwd(B,
-                     x,
-                     dt,
-                     dA_cumsum,
-                     seq_idx=None,
-                     states=None,
-                     states_in_fp32=True):
+def _chunk_state_fwd(
+    B, x, dt, dA_cumsum, cu_chunk_seqlens, states=None, states_in_fp32=True
+):
     seqlen, nheads, headdim = x.shape
     _, nchunks, chunk_size = dt.shape
     _, ngroups, dstate = B.shape
@@ -599,19 +567,20 @@ def _chunk_state_fwd(B,
     assert dt.shape == (nheads, nchunks, chunk_size)
     assert dA_cumsum.shape == dt.shape
 
-    assert seq_idx is not None
-    assert seq_idx.shape == (seqlen, )
-
     if states is not None:
         assert states.shape == (nchunks, nheads, headdim, dstate)
     else:
         states_dtype = torch.float32 if states_in_fp32 else B.dtype
-        states = torch.empty((nchunks, nheads, headdim, dstate),
-                             device=x.device,
-                             dtype=states_dtype)
+        states = torch.empty(
+            (nchunks, nheads, headdim, dstate), device=x.device, dtype=states_dtype
+        )
 
-    grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.
-                         cdiv(dstate, META['BLOCK_SIZE_N']), nchunks, nheads)
+    grid = lambda META: (
+        triton.cdiv(headdim, META["BLOCK_SIZE_M"])
+        * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
+        nchunks,
+        nheads,
+    )
     with torch.cuda.device(x.device.index):
         _chunk_state_fwd_kernel[grid](
             x_ptr=x,
@@ -619,7 +588,7 @@ def _chunk_state_fwd(B,
             states_ptr=states,
             dt_ptr=dt,
             dA_cumsum_ptr=dA_cumsum,
-            seq_idx_ptr=seq_idx,
+            cu_chunk_seqlens_ptr=cu_chunk_seqlens,
             hdim=headdim,
             dstate=dstate,
             chunk_size=chunk_size,
@@ -641,18 +610,13 @@ def _chunk_state_fwd(B,
             stride_dA_cs_head=dA_cumsum.stride(0),
             stride_dA_cs_chunk=dA_cumsum.stride(1),
             stride_dA_cs_csize=dA_cumsum.stride(2),
-            stride_seq_idx_seqlen=seq_idx.stride(0),
         )
     return states
 
 
-def chunk_state_varlen(B,
-                       x,
-                       dt,
-                       dA_cumsum,
-                       cu_seqlens,
-                       chunk_states,
-                       initial_states=None):
+def chunk_state_varlen(
+    B, x, dt, dA_cumsum, cu_seqlens, chunk_states, initial_states=None
+):
     total_seqlen, nheads, headdim = x.shape
     _, nchunks, chunk_size = dt.shape
     _, ngroups, dstate = B.shape
@@ -667,21 +631,32 @@ def chunk_state_varlen(B,
     if initial_states is not None:
         assert initial_states.shape == (batch, nheads, headdim, dstate)
 
-    states = torch.empty(batch,
-                         nheads,
-                         headdim,
-                         dstate,
-                         dtype=chunk_states.dtype,
-                         device=chunk_states.device)
-
-    initial_states_strides = ((initial_states.stride(0),
-                               initial_states.stride(1),
-                               initial_states.stride(2),
-                               initial_states.stride(3))
-                              if initial_states is not None else (0, 0, 0, 0))
-
-    grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.
-                         cdiv(dstate, META['BLOCK_SIZE_N']), batch, nheads)
+    states = torch.empty(
+        batch,
+        nheads,
+        headdim,
+        dstate,
+        dtype=chunk_states.dtype,
+        device=chunk_states.device,
+    )
+
+    initial_states_strides = (
+        (
+            initial_states.stride(0),
+            initial_states.stride(1),
+            initial_states.stride(2),
+            initial_states.stride(3),
+        )
+        if initial_states is not None
+        else (0, 0, 0, 0)
+    )
+
+    grid = lambda META: (
+        triton.cdiv(headdim, META["BLOCK_SIZE_M"])
+        * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
+        batch,
+        nheads,
+    )
     with torch.cuda.device(x.device.index):
         _chunk_state_varlen_kernel[grid](
             x_ptr=x,
@@ -720,5 +695,6 @@ def chunk_state_varlen(B,
             stride_init_states_head=initial_states_strides[1],
             stride_init_states_hdim=initial_states_strides[2],
             stride_init_states_dstate=initial_states_strides[3],
-            HAS_INITSTATES=initial_states is not None)
+            HAS_INITSTATES=initial_states is not None,
+        )
     return states
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
index 37d6c2870812f4d1104bc0ad2a381a8fda0e4505..ac905ada7229bb781e0543a6fc42a7c34a471f2e 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
@@ -14,66 +14,69 @@ from vllm.triton_utils import triton
 
 from .ssd_bmm import _bmm_chunk_fwd
 from .ssd_chunk_scan import _chunk_scan_fwd
-from .ssd_chunk_state import (_chunk_cumsum_fwd, _chunk_state_fwd,
-                              chunk_state_varlen)
+from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_state_fwd
 from .ssd_state_passing import _state_passing_fwd
 
-TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0')
+TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
 
 
 def is_int_pow_2(n):
     return isinstance(n, int) and n > 0 and (n & (n - 1)) == 0
 
 
-def _mamba_chunk_scan_combined_fwd(x,
-                                   dt,
-                                   A,
-                                   B,
-                                   C,
-                                   chunk_size,
-                                   out,
-                                   D=None,
-                                   z=None,
-                                   dt_bias=None,
-                                   initial_states=None,
-                                   seq_idx=None,
-                                   chunk_indices=None,
-                                   chunk_offsets=None,
-                                   cu_seqlens=None,
-                                   dt_softplus=False,
-                                   dt_limit=(0.0, float("inf")),
-                                   state_dtype=None):
+def _mamba_chunk_scan_combined_fwd(
+    x,
+    dt,
+    A,
+    B,
+    C,
+    chunk_size,
+    out,
+    D=None,
+    z=None,
+    dt_bias=None,
+    initial_states=None,
+    return_intermediate_states=False,
+    seq_idx=None,
+    cu_seqlens=None,
+    cu_chunk_seqlens=None,
+    last_chunk_indices=None,
+    dt_softplus=False,
+    dt_limit=(0.0, float("inf")),
+    state_dtype=None,
+):
     assert is_int_pow_2(chunk_size), "chunk_size must be integer power of 2"
     seqlen, nheads, headdim = x.shape
     _, ngroups, dstate = B.shape
     assert nheads % ngroups == 0
     assert B.shape == (seqlen, ngroups, dstate)
     assert dt.shape == (seqlen, nheads)
-    assert A.shape == (nheads, )
+    assert A.shape == (nheads,)
     assert C.shape == B.shape
     if z is not None:
         assert z.shape == x.shape
     if D is not None:
-        assert D.shape == (nheads, headdim) or D.shape == (nheads, )
+        assert D.shape == (nheads, headdim) or D.shape == (nheads,)
     if seq_idx is not None:
-        assert seq_idx.shape == (seqlen, )
+        assert seq_idx.shape == (cu_chunk_seqlens.shape[0] - 1,)
     if B.stride(-1) != 1:
         B = B.contiguous()
     if C.stride(-1) != 1:
         C = C.contiguous()
-    if x.stride(-1) != 1 and x.stride(
-            0) != 1:  # Either M or K dimension should be contiguous
+    if (
+        x.stride(-1) != 1 and x.stride(0) != 1
+    ):  # Either M or K dimension should be contiguous
         x = x.contiguous()
-    if z is not None and z.stride(-1) != 1 and z.stride(
-            0) != 1:  # Either M or K dimension should be contiguous
+    if (
+        z is not None and z.stride(-1) != 1 and z.stride(0) != 1
+    ):  # Either M or K dimension should be contiguous
         z = z.contiguous()
     if D is not None and D.stride(-1) != 1:
         D = D.contiguous()
     assert cu_seqlens is not None, "Assuming varlen input - must supply cu_seqlens"
 
     if initial_states is not None:
-        assert initial_states.shape == (len(cu_seqlens) - 1, nheads, headdim,
-                                        dstate)
+        assert initial_states.shape == (len(cu_seqlens) - 1, nheads, headdim, dstate)
 
     # This function executes 5 sub-functions for computing mamba
     # - a good resource is the blog https://goombalab.github.io/blog/2024/mamba2-part3-algorithm/
@@ -86,50 +89,42 @@ def _mamba_chunk_scan_combined_fwd(x,
 
     # 1. Compute chunked cumsum of A * dt
     # - here dt may go through a softplus activation
-    dA_cumsum, dt = _chunk_cumsum_fwd(dt,
-                                      A,
-                                      chunk_size,
-                                      dt_bias=dt_bias,
-                                      dt_softplus=dt_softplus,
-                                      dt_limit=dt_limit)
+    dA_cumsum, dt = _chunk_cumsum_fwd(
+        dt,
+        A,
+        chunk_size,
+        cu_chunk_seqlens,
+        dt_bias=dt_bias,
+        dt_softplus=dt_softplus,
+        dt_limit=dt_limit,
+    )
 
     # 2. Compute the state for each intra-chunk
     # (right term of low-rank factorization of off-diagonal blocks; B terms)
-    states = _chunk_state_fwd(B,
-                              x,
-                              dt,
-                              dA_cumsum,
-                              seq_idx=seq_idx,
-                              states_in_fp32=True)
+    states = _chunk_state_fwd(
+        B, x, dt, dA_cumsum, cu_chunk_seqlens, states_in_fp32=True
+    )
 
     # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
     # (middle term of factorization of off-diag blocks; A terms)
-    # - for handling chunked prefill, this requires i) initial_states
-    #   ii) seq_idx iii) is_cont_batched and (iv) chunk_offsets to be all specified.
+    # - for handling chunked prefill, this requires i) initial_states and
+    #   ii) seq_idx to be all specified.
     # - When a new seq_idx is detected, we will stop passing the prev_state
     #   and switch accordingly to the init_state corresponding to the new seq_idx.
-    # - We will also make sure that the dA_cumsum is taken only from the start of the
-    #   sequence (hence we need the full dA_cumsum tensor and not just the values at chunk boundaries)
-    # - this will ensure that states will be updated with the rightmost flushed seq_idx
-    #   of the previous chunk. This implies that the first chunk of states is either 0
-    #   or equal to init_states of the first example.
     states = _state_passing_fwd(
         rearrange(states, "... p n -> ... (p n)"),
         dA_cumsum,  # (nheads, nchunks, chunk_size)
+        cu_chunk_seqlens,
         initial_states=rearrange(initial_states, "... p n -> ... (p n)")
-        if initial_states is not None else
-        None,  # (batch, nheads, headdim*dstate)
+        if initial_states is not None
+        else None,  # (batch, nheads, headdim*dstate)
         seq_idx=seq_idx,
         out_dtype=state_dtype if state_dtype is not None else C.dtype,
-        chunk_offsets=chunk_offsets)
+    )
     states = rearrange(states, "... (p n) -> ... p n", n=dstate)
 
     # 4. Compute batched matrix multiply for C_j^T B_i terms
-    CB = _bmm_chunk_fwd(C,
-                        B,
-                        chunk_size,
-                        seq_idx=seq_idx,
-                        output_dtype=torch.float32)
+    CB = _bmm_chunk_fwd(C, B, chunk_size, cu_chunk_seqlens, output_dtype=torch.float32)
 
     # 5. Scan and compute the diagonal blocks, taking into
     #    account past causal states.
@@ -148,47 +143,40 @@ def _mamba_chunk_scan_combined_fwd(x,
         dA_cumsum,
         C,
         states,
+        cu_chunk_seqlens,
         out,  # in-place update
         seq_idx,
         D=D,
         z=z,
-        chunk_indices=chunk_indices,
-        chunk_offsets=chunk_offsets,
-        initial_states=initial_states,
-    )
-
-    varlen_states = chunk_state_varlen(
-        B,
-        x,
-        dt,
-        dA_cumsum,
-        cu_seqlens,
-        states,
         initial_states=initial_states,
     )
 
-    return varlen_states
+    if return_intermediate_states:
+        return states
+    else:
+        return states[last_chunk_indices]
 
 
 def mamba_chunk_scan_combined_varlen(
-        x,
-        dt,
-        A,
-        B,
-        C,
-        chunk_size,
-        cu_seqlens,
-        seq_idx,
-        out,
-        D=None,
-        z=None,
-        dt_bias=None,
-        initial_states=None,
-        chunk_indices=None,
-        chunk_offsets=None,
-        dt_softplus=False,
-        dt_limit=(0.0, float("inf")),
-        state_dtype=None,
+    x,
+    dt,
+    A,
+    B,
+    C,
+    chunk_size,
+    cu_seqlens,
+    cu_chunk_seqlens,
+    last_chunk_indices,
+    seq_idx,
+    out,
+    D=None,
+    z=None,
+    dt_bias=None,
+    initial_states=None,
+    dt_softplus=False,
+    dt_limit=(0.0, float("inf")),
+    return_intermediate_states=False,
+    state_dtype=None,
 ):
     """
     Argument:
@@ -198,8 +186,10 @@ def mamba_chunk_scan_combined_varlen(
         B: (seqlen, ngroups, dstate)
         C: (seqlen, ngroups, dstate)
         chunk_size: int
-        seq_idx: (seqlen)
-        cu_seqlens: (batch + 1)
+        cu_seqlens: (batch + 1,)
+        cu_chunk_seqlens: (nchunks + 1,)
+        last_chunk_indices: (batch,)
+        seq_idx: (nchunks,)
         out: (seqlen, nheads, headdim) preallocated output tensor
         D: (nheads, headdim) or (nheads,)
         z: (seqlen, nheads, headdim)
@@ -227,12 +217,14 @@ def mamba_chunk_scan_combined_varlen(
         z=z,
         dt_bias=dt_bias,
         initial_states=initial_states,
+        return_intermediate_states=return_intermediate_states,
         seq_idx=seq_idx,
-        chunk_indices=chunk_indices,
-        chunk_offsets=chunk_offsets,
         cu_seqlens=cu_seqlens,
+        cu_chunk_seqlens=cu_chunk_seqlens,
+        last_chunk_indices=last_chunk_indices,
         dt_softplus=dt_softplus,
         dt_limit=dt_limit,
-        state_dtype=state_dtype)
+        state_dtype=state_dtype,
+    )
 
     return varlen_states
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
index 71a8a4b0a1c8697eb3f546d05a9e877660b914c3..5481bab17e5a7c67531c9c66724b7f6413c884ba 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
@@ -13,14 +13,14 @@ from vllm.triton_utils import tl, triton
 
 @triton.autotune(
     configs=[
-        triton.Config({'BLOCK_SIZE': 64}),
-        triton.Config({'BLOCK_SIZE': 128}),
-        triton.Config({'BLOCK_SIZE': 256}),
-        triton.Config({'BLOCK_SIZE': 512}),
-        triton.Config({'BLOCK_SIZE': 1024}),
-        triton.Config({'BLOCK_SIZE': 2048}),
+        triton.Config({"BLOCK_SIZE": 64}),
+        triton.Config({"BLOCK_SIZE": 128}),
+        triton.Config({"BLOCK_SIZE": 256}),
+        triton.Config({"BLOCK_SIZE": 512}),
+        triton.Config({"BLOCK_SIZE": 1024}),
+        triton.Config({"BLOCK_SIZE": 2048}),
     ],
-    key=['dim'],
+    key=["dim"],
 )
 @triton.jit
 def _state_passing_fwd_kernel(
@@ -30,8 +30,7 @@ def _state_passing_fwd_kernel(
     dA_cs_ptr,
     initstates_ptr,
     seq_idx_ptr,
-    chunk_offsets_ptr,
-    chunk_meta_num,
+    cu_chunk_seqlens_ptr,
     # Matrix dimensions
     dim: tl.constexpr,
     nchunks,
@@ -50,94 +49,55 @@ def _state_passing_fwd_kernel(
     stride_initstates_batch: tl.int64,
     stride_initstates_head: tl.int64,
     stride_initstates_dim: tl.constexpr,
-    stride_seq_idx_seqlen: tl.constexpr,
+    stride_seq_idx_chunk: tl.constexpr,
     # Meta-parameters
     HAS_INITSTATES: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     pid_h = tl.program_id(axis=1)
     pid_m = tl.program_id(axis=0)
+
     states_ptr += pid_h * stride_states_head
-    dA_cs_ptr += pid_h * stride_dA_cs_head + (chunk_size -
-                                              1) * stride_dA_cs_csize
+    dA_cs_ptr += pid_h * stride_dA_cs_head + (chunk_size - 1) * stride_dA_cs_csize
     out_ptr += pid_h * stride_out_head
-    if HAS_INITSTATES:
-        initstates_ptr += pid_h * stride_initstates_head
 
     offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
     states_ptrs = states_ptr + offs_m * stride_states_dim
     out_ptrs = out_ptr + offs_m * stride_out_dim
 
-    # - states will be the past state of the sequence that continues on the current check
-    if not HAS_INITSTATES:
-        states = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32)
+    if HAS_INITSTATES:
+        initstates_ptrs = (
+            initstates_ptr
+            + pid_h * stride_initstates_head
+            + offs_m * stride_initstates_dim
+        )
+
+        states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
     else:
-        initstates_ptr += offs_m * stride_initstates_dim
-        initstates_ptrs = initstates_ptr
-        # - for cont batches, for the first chunk mean it will be the first batch's
-        #   init state
-        states = tl.load(initstates_ptrs, mask=offs_m < dim,
-                         other=0.0).to(tl.float32)
-
-    tl.store(out_ptrs, states, mask=offs_m < dim)
-    out_ptrs += stride_out_chunk
-    prev_seq_idx_chunk_end = 0
-    logical_chunk_idx = 0
-    for c in range(nchunks - 1):
-        new_states = tl.load(states_ptrs, mask=offs_m < dim,
-                             other=0.0).to(tl.float32)
+        states = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+
+    prev_seq_idx = 0
+    for c in range(nchunks):
+        new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
         dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
-        scale_mask = True
-        # - the seq to pass forward is the one that is flushed to the right
-        #   boundary.
-        # - that is given by seq_idx_chunk_end below: the sequence index at the end of the chunk.
-        seq_idx_chunk_end = tl.load(seq_idx_ptr +
-                                    (min((c + 1) * chunk_size, seqlen) - 1) *
-                                    stride_seq_idx_seqlen)
-
-        if HAS_INITSTATES:
-            if prev_seq_idx_chunk_end != seq_idx_chunk_end:
-                # this means in the current chunk the rightmost flushed seq
-                # has changed.
-                # - so we do not propagate the state from previous chunk
-                # - but rather we load that sequence's init state
-                initstates_ptrs = initstates_ptr + seq_idx_chunk_end * stride_initstates_batch
-
-                # - update state with seq_idx_new's init state
-                states = tl.load(initstates_ptrs, mask=offs_m < dim,
-                                 other=0.0).to(tl.float32)
-
-                # - we need to consider the cumsum only of the last sequence in the chunk
-                # - find its starting position (given by c_off of the logical chunk index)
-                # - and subtract the cumsum just before that position from the total cumsum
-                # - first, update the logical chunk index (add the number of sequences in the current physical chunk):
-                # sequence index at the start of the current chunk
-                seq_idx_chunk_start = tl.load(seq_idx_ptr +
-                                              min(c * chunk_size, seqlen) *
-                                              stride_seq_idx_seqlen)
-                logical_chunk_idx += seq_idx_chunk_end - seq_idx_chunk_start
-                # - load the chunk offset:
-                c_off = tl.load(chunk_offsets_ptr + logical_chunk_idx,
-                                mask=logical_chunk_idx < chunk_meta_num,
-                                other=0)
-                # - if offset is 0, then the sequence starts at the beginning of the chunk, and we don't need to subtract anything
-                if c_off > 0:
-                    # - dA_cs_ptr currently points to the cumsum at the end of the chunk - subtract the chunk size and add the offset
-                    dA_cs_boundary = tl.load(
-                        dA_cs_ptr - (chunk_size - 1) * stride_dA_cs_csize +
-                        (c_off - 1) * stride_dA_cs_csize,
-                        mask=(c_off - 1) > -1 and c_off < chunk_size,
-                        other=0.0)
-                    dA_cs -= dA_cs_boundary
-
-            # - increment logical chunk index for every physical chunk
-            logical_chunk_idx += 1
-        else:
-            scale_mask = seq_idx_chunk_end == prev_seq_idx_chunk_end
-        prev_seq_idx_chunk_end = seq_idx_chunk_end
-
-        scale = tl.where(scale_mask, tl.exp(dA_cs), 0.0)
-        states = scale * states + new_states
+        seq_idx = tl.load(seq_idx_ptr + c * stride_seq_idx_chunk)
+        # we have started a new sequence
+        if prev_seq_idx != seq_idx:
+            if HAS_INITSTATES:
+                initstates_ptrs = (
+                    initstates_ptr
+                    + seq_idx * stride_initstates_batch
+                    + pid_h * stride_initstates_head
+                    + offs_m * stride_initstates_dim
+                )
+                states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(
+                    tl.float32
+                )
+            else:
+                states = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+
+        prev_seq_idx = seq_idx
+        states = tl.exp(dA_cs) * states + new_states
         tl.store(out_ptrs, states, mask=offs_m < dim)
 
         states_ptrs += stride_states_chunk
@@ -148,8 +108,8 @@ def _state_passing_fwd_kernel(
 def _state_passing_fwd(
     states,
     dA_cumsum,
+    cu_chunk_seqlens,
     seq_idx,
-    chunk_offsets,
     initial_states=None,
     out_dtype=None,
 ):
@@ -158,16 +118,15 @@ def _state_passing_fwd(
     assert dA_cumsum.shape == (nheads, nchunks, chunk_size)
     seqlen = seq_idx.shape[-1]
     out_dtype = states.dtype if out_dtype is None else out_dtype
-    out = torch.empty((nchunks, nheads, dim),
-                      device=states.device,
-                      dtype=out_dtype)
+    out = torch.empty((nchunks, nheads, dim), device=states.device, dtype=out_dtype)
 
-    initial_states_strides = ((initial_states.stride(0),
-                               initial_states.stride(1),
-                               initial_states.stride(2))
-                              if initial_states is not None else (0, 0, 0))
+    initial_states_strides = (
+        (initial_states.stride(0), initial_states.stride(1), initial_states.stride(2))
+        if initial_states is not None
+        else (0, 0, 0)
+    )
 
-    grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), nheads)
+    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE"]), nheads)
     with torch.cuda.device(states.device.index):
         _state_passing_fwd_kernel[grid](
             states_ptr=states,
@@ -175,9 +134,7 @@ def _state_passing_fwd(
             dA_cs_ptr=dA_cumsum,
             initstates_ptr=initial_states,
             seq_idx_ptr=seq_idx,
-            chunk_offsets_ptr=chunk_offsets,
-            chunk_meta_num=len(chunk_offsets)
-            if chunk_offsets is not None else 0,
+            cu_chunk_seqlens_ptr=cu_chunk_seqlens,
             dim=dim,
             nchunks=nchunks,
             seqlen=seqlen if seq_idx is not None else 0,
@@ -194,7 +151,7 @@ def _state_passing_fwd(
             stride_initstates_batch=initial_states_strides[0],
             stride_initstates_head=initial_states_strides[1],
             stride_initstates_dim=initial_states_strides[2],
-            stride_seq_idx_seqlen=seq_idx.stride(0),
+            stride_seq_idx_chunk=seq_idx.stride(0),
             HAS_INITSTATES=initial_states is not None,
         )
     return out
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index cc424760e229fd335904568a44b7b13f95ecae24..04efa8a8b37343985f2a9ec019551ba7e2ea980a 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -13,29 +13,35 @@ from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
-from vllm.utils import direct_register_custom_op
-from vllm.v1.attention.backends.short_conv_attn import (
-    ShortConvAttentionMetadata)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
 
 
 @CustomOp.register("short_conv")
 class ShortConv(MambaBase, CustomOp):
-
-    def __init__(self,
-                 config,
-                 dim: int,
-                 layer_idx: int,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config,
+        dim: int,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -72,7 +78,7 @@ class ShortConv(MambaBase, CustomOp):
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
-        self.kv_cache = (torch.tensor([]), )
+        self.kv_cache = (torch.tensor([]),)
 
         self.model_config = model_config
         self.cache_config = cache_config
@@ -115,14 +121,15 @@ class ShortConv(MambaBase, CustomOp):
             self_kv_cache = self.kv_cache[forward_context.virtual_engine]
             conv_state = self_kv_cache[0].transpose(-1, -2)
             state_indices_tensor = attn_metadata.state_indices_tensor
-            has_initial_states_p = attn_metadata.has_initial_states
+            has_initial_states_p = attn_metadata.has_initial_states_p
 
         BCx, _ = self.in_proj(hidden_states)
 
         B, C, x = BCx.chunk(3, dim=-1)
 
-        conv_weights = self.conv.weight.view(self.conv.weight.size(0),
-                                             self.conv.weight.size(2))
+        conv_weights = self.conv.weight.view(
+            self.conv.weight.size(0), self.conv.weight.size(2)
+        )
 
         if attn_metadata is None:
             # V1 profile run
@@ -163,23 +170,26 @@ class ShortConv(MambaBase, CustomOp):
             dim=0,
         )
         query_start_loc_p = (
-            attn_metadata.query_start_loc[-num_prefills - 1:] -
-            num_decodes if has_prefill else None)
+            attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
+            if has_prefill
+            else None
+        )
 
         conv_output_list = []
 
         if has_prefill:
             Bx_p = (B_p * x_p).transpose(0, 1)
-            Bx = causal_conv1d_fn(Bx_p,
-                                  conv_weights,
-                                  self.conv.bias,
-                                  activation=None,
-                                  conv_states=conv_state,
-                                  has_initial_state=has_initial_states_p,
-                                  cache_indices=state_indices_tensor_p,
-                                  metadata=attn_metadata,
-                                  query_start_loc=query_start_loc_p).transpose(
-                                      0, 1)[:num_prefill_tokens]
+            Bx = causal_conv1d_fn(
+                Bx_p,
+                conv_weights,
+                self.conv.bias,
+                activation=None,
+                conv_states=conv_state,
+                has_initial_state=has_initial_states_p,
+                cache_indices=state_indices_tensor_p,
+                metadata=attn_metadata,
+                query_start_loc=query_start_loc_p,
+            ).transpose(0, 1)[:num_prefill_tokens]
 
             y = C_p * Bx
             conv_output_list.append(y)
@@ -192,7 +202,8 @@ class ShortConv(MambaBase, CustomOp):
                 conv_weights,
                 self.conv.bias,
                 activation=None,
-                conv_state_indices=state_indices_tensor_d)
+                conv_state_indices=state_indices_tensor_d,
+            )
             y = C_d * Bx
             conv_output_list.insert(0, y)
 
@@ -222,8 +233,8 @@ class ShortConv(MambaBase, CustomOp):
         return "short_conv"
 
     def get_attn_backend(self) -> type["AttentionBackend"]:
-        from vllm.v1.attention.backends.short_conv_attn import (
-            ShortConvAttentionBackend)
+        from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend
+
         return ShortConvAttentionBackend
 
 
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 66bf3823e19145f99a6c181184e74a259e73df6d..3ff975db5f87d309cca7c15e3cb9019a0508bce8 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
-from vllm.attention import Attention
+from vllm.attention.layer import MLAAttention
 from vllm.config import CacheConfig
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -13,30 +12,31 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 
 @dataclass
 class MLAModules:
-    """Modules used in MLA.
-    """
+    """Modules used in MLA."""
+
     kv_a_layernorm: torch.nn.Module
     kv_b_proj: torch.nn.Module
     rotary_emb: torch.nn.Module
     o_proj: torch.nn.Module
-    fused_qkv_a_proj: Optional[torch.nn.Module]
-    kv_a_proj_with_mqa: Optional[torch.nn.Module]
-    q_a_layernorm: Optional[torch.nn.Module]
-    q_b_proj: Optional[torch.nn.Module]
-    q_proj: Optional[torch.nn.Module]
-    indexer: Optional[torch.nn.Module]
+    fused_qkv_a_proj: torch.nn.Module | None
+    kv_a_proj_with_mqa: torch.nn.Module | None
+    q_a_layernorm: torch.nn.Module | None
+    q_b_proj: torch.nn.Module | None
+    q_proj: torch.nn.Module | None
+    indexer: torch.nn.Module | None
     is_sparse: bool
-    topk_indices_buffer: Optional[torch.Tensor]
+    topk_indices_buffer: torch.Tensor | None
 
 
 @CustomOp.register("multi_head_latent_attention")
-class MultiHeadLatentAttention(CustomOp):
-    """MLA layer registered as CustomOp.
+class MultiHeadLatentAttentionWrapper(CustomOp):
+    """MLA layer registered as CustomOp to allow OOT backends to add
+    custom implementations of the outer MLA layer (including rope & o_proj).
     Note that currently MLA ignores the enable/disable mechanism of CustomOp
     because there is only one in-tree implementation in forward_native.
     TODO: implement this with a new PluggableLayer mechanism.
 
-    This class takes positions and hidden_states as input. 
+    This class takes positions and hidden_states as input.
     The input tensors can either contain prefill tokens or decode tokens.
     The class does the following:
 
@@ -54,11 +54,11 @@ class MultiHeadLatentAttention(CustomOp):
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
         v_head_dim: int,
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         mla_modules: MLAModules,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -87,30 +87,19 @@ class MultiHeadLatentAttention(CustomOp):
             self.topk_tokens = self.indexer.topk_tokens
             self.topk_indices_buffer = mla_modules.topk_indices_buffer
 
-        # In the MLA backend, kv_cache includes both k_c and
-        # pe (i.e. decoupled position embeddings). In particular,
-        # the concat_and_cache_mla op requires
-        #     k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
-        # i.e.
-        #     kv_lora_rank + qk_rope_head_dim == head_size
-        self.mla_attn = Attention(
+        self.mla_attn = MLAAttention(
             num_heads=self.num_heads,
-            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
             scale=scale,
-            num_kv_heads=1,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-            use_mla=True,
-            use_sparse=mla_modules.is_sparse,
-            # MLA Args
-            q_lora_rank=self.q_lora_rank,
-            kv_lora_rank=self.kv_lora_rank,
             qk_nope_head_dim=self.qk_nope_head_dim,
             qk_rope_head_dim=self.qk_rope_head_dim,
-            qk_head_dim=self.qk_head_dim,
             v_head_dim=self.v_head_dim,
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
             kv_b_proj=self.kv_b_proj,
+            use_sparse=self.is_sparse,
             indexer=self.indexer,
         )
 
@@ -125,12 +114,15 @@ class MultiHeadLatentAttention(CustomOp):
         kv_lora = None
 
         if self.q_lora_rank is not None:
-            assert self.fused_qkv_a_proj is not None, \
+            assert self.fused_qkv_a_proj is not None, (
                 "fused_qkv_a_proj is required when q_lora_rank is not None"
-            assert self.q_a_layernorm is not None, \
+            )
+            assert self.q_a_layernorm is not None, (
                 "q_a_layernorm is required when q_lora_rank is not None"
-            assert self.q_b_proj is not None, \
+            )
+            assert self.q_b_proj is not None, (
                 "q_b_proj is required when q_lora_rank is not None"
+            )
             qkv_lora = self.fused_qkv_a_proj(hidden_states)[0]
             q_c, kv_lora = qkv_lora.split(
                 [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
@@ -139,23 +131,29 @@ class MultiHeadLatentAttention(CustomOp):
             q_c = self.q_a_layernorm(q_c)
             q = self.q_b_proj(q_c)[0]
         else:
-            assert self.kv_a_proj_with_mqa is not None, \
+            assert self.kv_a_proj_with_mqa is not None, (
                 "kv_a_proj_with_mqa is required when q_lora_rank is None"
-            assert self.q_proj is not None, \
+            )
+            assert self.q_proj is not None, (
                 "q_proj is required when q_lora_rank is None"
+            )
             kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0]
             q = self.q_proj(hidden_states)[0]
 
-        kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim],
-                                   dim=-1)
+        kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
         kv_c_normed = self.kv_a_layernorm(kv_c)
 
         q = q.view(-1, self.num_heads, self.qk_head_dim)
         # Add head dim of 1 to k_pe
         k_pe = k_pe.unsqueeze(1)
 
-        q[..., self.qk_nope_head_dim:], k_pe = self.rotary_emb(
-            positions, q[..., self.qk_nope_head_dim:], k_pe)
+        if self.rotary_emb is not None:
+            q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb(
+                positions, q[..., self.qk_nope_head_dim :], k_pe
+            )
+
+        if self.indexer and self.is_sparse:
+            _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb)
 
         if self.indexer and self.is_sparse:
             _topk_indices = self.indexer(hidden_states, q_c, positions,
@@ -165,8 +163,9 @@ class MultiHeadLatentAttention(CustomOp):
             q,
             kv_c_normed,
             k_pe,
-            output_shape=(hidden_states.shape[0],
-                          self.num_heads * self.v_head_dim))
+            output_shape=(hidden_states.shape[0], self.num_heads * self.v_head_dim),
+        )
+
         return self.o_proj(attn_out)[0]
 
     def forward_cuda(self, *args, **kwargs):
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 139011ce10be55c0e714a03320b97b30f52add61..7dd02e32ff211a46e38d84b1092ad228fd5a098b 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from collections.abc import Mapping, Set
+from collections.abc import Callable, Mapping, Set
 from dataclasses import dataclass
 from enum import IntEnum
 from itertools import groupby
-from typing import Callable, Optional, TypeVar, Union
+from typing import TypeVar
 
 import torch
 import torch.nn as nn
@@ -17,20 +17,22 @@ from vllm.logger import init_logger
 from vllm.model_executor.models.adapters import _load_st_projector
 from vllm.pooling_params import PoolingParams
 from vllm.tasks import PoolingTask
-from vllm.utils import resolve_obj_by_qualname
+from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.v1.outputs import PoolerOutput
 from vllm.v1.pool.metadata import PoolingCursor, PoolingMetadata
 
 logger = init_logger(__name__)
 
 PoolingFn = Callable[
-    [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
-    Union[torch.Tensor, list[torch.Tensor]]]
+    [torch.Tensor | list[torch.Tensor], PoolingMetadata],
+    torch.Tensor | list[torch.Tensor],
+]
 ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
 
 
 class PoolingType(IntEnum):
     """Enumeration for different types of pooling methods."""
+
     LAST = 0
     ALL = 1
     CLS = 2
@@ -50,8 +52,7 @@ class ResolvedPoolingConfig:
         pooler_config: PoolerConfig,
     ) -> "ResolvedPoolingConfig":
         assert pooler_config.pooling_type is not None
-        return cls(task=task,
-                   pooling_type=PoolingType[pooler_config.pooling_type])
+        return cls(task=task, pooling_type=PoolingType[pooler_config.pooling_type])
 
 
 @dataclass(frozen=True)
@@ -63,76 +64,17 @@ class PoolingParamsUpdate:
         params.requires_token_ids = self.requires_token_ids
 
 
-class Pooler(nn.Module, ABC):
-    """The interface required for all poolers used in pooling models in vLLM."""
-
-    @staticmethod
-    def for_encode(pooler_config: PoolerConfig):
-        if pooler_config.pooling_type == "STEP":
-            return StepPooler()
-
-        resolved_config = ResolvedPoolingConfig(task="encode",
-                                                pooling_type=PoolingType.ALL)
-
-        return SimplePooler.from_config(resolved_config)
-
-    @staticmethod
-    def for_embed(pooler_config: PoolerConfig):
-        resolved_config = ResolvedPoolingConfig.from_config(
-            task="embed",
-            pooler_config=pooler_config,
-        )
-
-        return SimplePooler.from_config(resolved_config)
-
-    @staticmethod
-    def for_classify(
-        pooler_config: PoolerConfig,
-        classifier: Optional[ClassifierFn],
-    ):
-        resolved_config = ResolvedPoolingConfig.from_config(
-            task="classify",
-            pooler_config=pooler_config,
-        )
-
-        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
-
-        return ClassifierPooler(
-            pooling=pooling,
-            classifier=classifier,
-        )
-
-    @abstractmethod
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        """Determine which pooling tasks are supported."""
-        raise NotImplementedError
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        """
-        Construct the updated pooling parameters to use for a supported task.
-        """
-        return PoolingParamsUpdate()
-
-    @abstractmethod
-    def forward(
-        self,
-        hidden_states: Union[list[torch.Tensor], torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        raise NotImplementedError
-
-
 def get_prompt_lens(
-    hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+    hidden_states: torch.Tensor | list[torch.Tensor],
     pooling_metadata: PoolingMetadata,
 ) -> torch.Tensor:
     return pooling_metadata.prompt_lens
 
 
-def get_prompt_token_ids(
-        pooling_metadata: PoolingMetadata) -> list[torch.Tensor]:
+def get_prompt_token_ids(pooling_metadata: PoolingMetadata) -> list[torch.Tensor]:
     assert pooling_metadata.prompt_token_ids is not None, (
-        "Please set `requires_token_ids=True` in `get_pooling_updates`")
+        "Please set `requires_token_ids=True` in `get_pooling_updates`"
+    )
 
     return [
         pooling_metadata.prompt_token_ids[i, :num]
@@ -140,8 +82,7 @@ def get_prompt_token_ids(
     ]
 
 
-def get_pooling_params(
-        pooling_metadata: PoolingMetadata) -> list[PoolingParams]:
+def get_pooling_params(pooling_metadata: PoolingMetadata) -> list[PoolingParams]:
     pooling_params = pooling_metadata.pooling_params
     return pooling_params
 
@@ -150,7 +91,8 @@ def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]:
     pooling_params = get_pooling_params(pooling_metadata)
 
     tasks: list[PoolingTask] = [
-        task for pooling_param in pooling_params
+        task
+        for pooling_param in pooling_params
         if (task := pooling_param.task) is not None
     ]
     assert len(pooling_params) == len(tasks)
@@ -172,18 +114,23 @@ def get_classification_activation_function(config: PretrainedConfig):
 
 
 def get_cross_encoder_activation_function(config: PretrainedConfig):
-    function_name: Optional[str] = None
-    if (hasattr(config, "sentence_transformers")
-            and "activation_fn" in config.sentence_transformers):
+    function_name: str | None = None
+    if (
+        hasattr(config, "sentence_transformers")
+        and "activation_fn" in config.sentence_transformers
+    ):
         function_name = config.sentence_transformers["activation_fn"]
-    elif (hasattr(config, "sbert_ce_default_activation_function")
-          and config.sbert_ce_default_activation_function is not None):
+    elif (
+        hasattr(config, "sbert_ce_default_activation_function")
+        and config.sbert_ce_default_activation_function is not None
+    ):
         function_name = config.sbert_ce_default_activation_function
 
     if function_name is not None:
         assert function_name.startswith("torch.nn.modules."), (
             "Loading of activation functions is restricted to "
-            "torch.nn.modules for security reasons")
+            "torch.nn.modules for security reasons"
+        )
         fn = resolve_obj_by_qualname(function_name)()
         return PoolerActivation.wraps(fn)
 
@@ -191,7 +138,6 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
 
 
 class PoolingMethod(nn.Module, ABC):
-
     @staticmethod
     def from_pooling_type(pooling_type: PoolingType) -> "PoolingMethod":
         if pooling_type == PoolingType.LAST:
@@ -217,83 +163,81 @@ class PoolingMethod(nn.Module, ABC):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         raise NotImplementedError
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         pooling_cursor = pooling_metadata.pooling_cursor
         return self.forward_all(hidden_states, pooling_cursor)
 
 
 class CLSPool(PoolingMethod):
-
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
 
     def forward_all(
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
-        assert not pooling_cursor.is_partial_prefill(), \
+    ) -> list[torch.Tensor] | torch.Tensor:
+        assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with CLS pooling"
+        )
 
         return hidden_states[pooling_cursor.first_token_indices_gpu]
 
 
 class LastPool(PoolingMethod):
-
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
 
     def forward_all(
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         return hidden_states[pooling_cursor.last_token_indices_gpu]
 
 
 class AllPool(PoolingMethod):
-
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode"}
+        return {"token_embed", "token_classify"}
 
     def forward_all(
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
-
-        assert not pooling_cursor.is_partial_prefill(), \
+    ) -> list[torch.Tensor] | torch.Tensor:
+        assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with ALL pooling"
+        )
 
         hidden_states_lst = list(
-            hidden_states.split(
-                pooling_cursor.num_scheduled_tokens_cpu.tolist()))
+            hidden_states.split(pooling_cursor.num_scheduled_tokens_cpu.tolist())
+        )
         return [hidden_states_lst[i] for i in pooling_cursor.index]
 
 
 class MeanPool(PoolingMethod):
-
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
 
     def forward_all(
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
-
-        assert not pooling_cursor.is_partial_prefill(), \
+    ) -> list[torch.Tensor] | torch.Tensor:
+        assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with MEAN pooling"
+        )
 
-        prompt_lens = pooling_cursor.prompt_lens_cpu.to(hidden_states.device,
-                                                        non_blocking=True)
+        prompt_lens = pooling_cursor.prompt_lens_cpu.to(
+            hidden_states.device, non_blocking=True
+        )
 
         # Use float32 for torch.cumsum in MeanPool,
         # otherwise precision will be lost significantly.
@@ -301,15 +245,15 @@ class MeanPool(PoolingMethod):
 
         start_indices = pooling_cursor.first_token_indices_gpu
         end_indices = pooling_cursor.last_token_indices_gpu
-        return (cumsum[end_indices] - cumsum[start_indices] +
-                hidden_states[start_indices]) / prompt_lens.unsqueeze(1)
+        return (
+            cumsum[end_indices] - cumsum[start_indices] + hidden_states[start_indices]
+        ) / prompt_lens.unsqueeze(1)
 
 
 _T = TypeVar("_T", torch.Tensor, list[torch.Tensor])
 
 
 class BasePoolerActivation(nn.Module, ABC):
-
     @abstractmethod
     def forward(self, pooled_data: _T) -> _T:
         # shape:
@@ -320,7 +264,6 @@ class BasePoolerActivation(nn.Module, ABC):
 
 
 class PoolerActivation(BasePoolerActivation):
-
     @staticmethod
     def wraps(module: nn.Module):
         if isinstance(module, nn.Identity):
@@ -342,42 +285,42 @@ class PoolerActivation(BasePoolerActivation):
 
 
 class PoolerIdentity(PoolerActivation):
-
     def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
         return pooled_data
 
 
 class PoolerNormalize(PoolerActivation):
-
     def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
         return F.normalize(pooled_data, p=2, dim=-1)
 
 
 class PoolerMultiLabelClassify(PoolerActivation):
-
     def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
         return F.sigmoid(pooled_data)
 
 
 class PoolerClassify(PoolerActivation):
-
     def __init__(self, *, static_num_labels: bool = True) -> None:
         super().__init__()
 
         if static_num_labels:
             vllm_config = get_current_vllm_config()
-            self.num_labels = getattr(vllm_config.model_config.hf_config,
-                                      "num_labels", 0)
+            self.num_labels = getattr(
+                vllm_config.model_config.hf_config, "num_labels", 0
+            )
             if self.num_labels == 0:
-                logger.warning("num_labels should be > 0 for classification"
-                               "models, falling back to softmax. "
-                               "Please check if the configuration is correct.")
+                logger.warning(
+                    "num_labels should be > 0 for classification"
+                    "models, falling back to softmax. "
+                    "Please check if the configuration is correct."
+                )
         else:
             self.num_labels = None
 
     def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        num_labels = (self.num_labels if self.num_labels is not None else
-                      pooled_data.shape[-1])
+        num_labels = (
+            self.num_labels if self.num_labels is not None else pooled_data.shape[-1]
+        )
 
         if num_labels < 2:
             return F.sigmoid(pooled_data)
@@ -386,7 +329,6 @@ class PoolerClassify(PoolerActivation):
 
 
 class LambdaPoolerActivation(PoolerActivation):
-
     def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
         super().__init__()
 
@@ -396,33 +338,123 @@ class LambdaPoolerActivation(PoolerActivation):
         return self.fn(pooled_data)
 
 
-class PoolerHead(nn.Module):
+class Pooler(nn.Module, ABC):
+    """The interface required for all poolers used in pooling models in vLLM."""
+
+    @staticmethod
+    def for_token_embed(pooler_config: PoolerConfig):
+        head = TokenEmbeddingPoolerHead()
+
+        if pooler_config.pooling_type == "STEP":
+            return StepPooler(head=head)
+
+        return AllPooler(head=head)
+
+    @staticmethod
+    def for_token_classify(
+        pooler_config: PoolerConfig,
+        classifier: ClassifierFn | None = None,
+        act_fn: PoolerActivation | str | None = None,
+    ):
+        head = TokenClassifierPoolerHead(classifier=classifier, act_fn=act_fn)
+
+        if pooler_config.pooling_type == "STEP":
+            return StepPooler(head=head)
+
+        return AllPooler(head=head)
+
+    @staticmethod
+    def for_embed(pooler_config: PoolerConfig):
+        resolved_config = ResolvedPoolingConfig.from_config(
+            task="embed",
+            pooler_config=pooler_config,
+        )
+
+        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
+        head = EmbeddingPoolerHead()
+
+        return SimplePooler(pooling=pooling, head=head)
+
+    @staticmethod
+    def for_classify(
+        pooler_config: PoolerConfig,
+        classifier: ClassifierFn | None,
+        act_fn: PoolerActivation | str | None = None,
+    ):
+        resolved_config = ResolvedPoolingConfig.from_config(
+            task="classify",
+            pooler_config=pooler_config,
+        )
+
+        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
+
+        return ClassifierPooler(
+            pooling=pooling,
+            classifier=classifier,
+            act_fn=act_fn,
+        )
+
+    @abstractmethod
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        """Determine which pooling tasks are supported."""
+        raise NotImplementedError
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        """
+        Construct the updated pooling parameters to use for a supported task.
+        """
+        return PoolingParamsUpdate()
 
+    @abstractmethod
+    def forward(
+        self,
+        hidden_states: list[torch.Tensor] | torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        raise NotImplementedError
+
+
+class DummyPooler(Pooler):
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"plugin", "score"}
+
+    def forward(
+        self,
+        hidden_states: list[torch.Tensor] | torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        return hidden_states
+
+
+class PoolerHead(nn.Module):
     def __init__(self, activation: PoolerActivation) -> None:
         super().__init__()
         self.activation = activation
 
-    def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
-                pooling_metadata: PoolingMetadata):
-
+    def forward(
+        self,
+        pooled_data: list[torch.Tensor] | torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ):
         return self.activation(pooled_data)
 
 
 class EmbeddingPoolerHead(PoolerHead):
-
     def __init__(self) -> None:
         super().__init__(activation=PoolerNormalize())
 
         # Load ST projector if available
-
         vllm_config = get_current_vllm_config()
-        self.projector: Optional[nn.Module] = _load_st_projector(
-            vllm_config.model_config) if vllm_config else None
+        self.projector: nn.Module | None = (
+            _load_st_projector(vllm_config.model_config) if vllm_config else None
+        )
         self.head_dtype = vllm_config.model_config.head_dtype
 
-    def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
-                pooling_metadata: PoolingMetadata):
-
+    def forward(
+        self,
+        pooled_data: list[torch.Tensor] | torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ):
         if isinstance(pooled_data, list):
             pooled_data = torch.stack(pooled_data)
         # pooled_data shape: [batchsize, hidden_dimension]
@@ -437,14 +469,11 @@ class EmbeddingPoolerHead(PoolerHead):
         pooling_params = get_pooling_params(pooling_metadata)
 
         # for matryoshka representation
-        dimensions_list = [
-            pooling_param.dimensions for pooling_param in pooling_params
-        ]
+        dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
         if any(d is not None for d in dimensions_list):
             # change the output dimension
             assert len(pooled_data) == len(dimensions_list)
-            if len(set(dimensions_list)) == 1 and not isinstance(
-                    pooled_data, list):
+            if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
                 # if all dimensions are the same
                 d = dimensions_list[0]
                 pooled_data = pooled_data[..., :d]
@@ -469,38 +498,6 @@ class EmbeddingPoolerHead(PoolerHead):
         return pooled_data
 
 
-class RewardPoolerHead(PoolerHead):
-
-    def __init__(self) -> None:
-        super().__init__(activation=PoolerClassify(static_num_labels=False))
-
-        vllm_config = get_current_vllm_config()
-        self.head_dtype = vllm_config.model_config.head_dtype
-
-    def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
-                pooling_metadata: PoolingMetadata):
-
-        if isinstance(pooled_data, list):
-            pooled_data = [p.to(self.head_dtype) for p in pooled_data]
-        else:
-            pooled_data = pooled_data.to(self.head_dtype)
-
-        pooling_params = get_pooling_params(pooling_metadata)
-
-        # for softmax
-        flags = [p.softmax for p in pooling_params]
-        if len(set(flags)) == 1:
-            if flags[0]:
-                pooled_data = self.activation(pooled_data)
-        else:
-            pooled_data = [
-                self.activation(vecs) if f else vecs
-                for vecs, f in zip(pooled_data, flags)
-            ]
-
-        return pooled_data
-
-
 class SimplePooler(Pooler):
     """A layer that pools specific information from hidden states.
 
@@ -510,20 +507,6 @@ class SimplePooler(Pooler):
     3. Returns structured results as `PoolerOutput`.
     """
 
-    @classmethod
-    def from_config(
-        cls,
-        pooler_config: ResolvedPoolingConfig,
-    ) -> "SimplePooler":
-        pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type)
-        if pooler_config.task == "embed":
-            head = EmbeddingPoolerHead()
-        elif pooler_config.task == "encode":
-            head = RewardPoolerHead()
-        else:
-            raise NotImplementedError(f"Unknown task: {pooler_config.task}")
-        return cls(pooling, head)
-
     def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None:
         super().__init__()
 
@@ -538,7 +521,7 @@ class SimplePooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.pooling(hidden_states, pooling_metadata)
@@ -546,57 +529,6 @@ class SimplePooler(Pooler):
         return pooled_data
 
 
-class StepPooler(Pooler):
-
-    def __init__(self, ) -> None:
-        super().__init__()
-
-        self.pooling = AllPool()
-        self.head = RewardPoolerHead()
-
-    def extract_states(
-        self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
-        pooling_metadata: PoolingMetadata,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
-        pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
-        prompt_token_ids = get_prompt_token_ids(pooling_metadata)
-
-        pooled_data = list[torch.Tensor]()
-
-        pooling_params = get_pooling_params(pooling_metadata)
-
-        for data, token_id, pooling_param in zip(pooled_data_lst,
-                                                 prompt_token_ids,
-                                                 pooling_params):
-            step_tag_id = pooling_param.step_tag_id
-            returned_token_ids = pooling_param.returned_token_ids
-
-            if returned_token_ids is not None and len(returned_token_ids) > 0:
-                data = data[:, returned_token_ids]
-
-            if step_tag_id is not None:
-                data = data[token_id == step_tag_id]
-            pooled_data.append(data)
-
-        return pooled_data
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode"}
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return PoolingParamsUpdate(requires_token_ids=True)
-
-    def forward(
-        self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.extract_states(hidden_states, pooling_metadata)
-        pooled_data = self.head(pooled_data, pooling_metadata)
-        return pooled_data
-
-
 class ClassifierPooler(Pooler):
     """A pooling layer for classification tasks.
 
@@ -607,28 +539,49 @@ class ClassifierPooler(Pooler):
     """
 
     @staticmethod
-    def act_fn_for_seq_cls(config: ModelConfig):
-        return get_classification_activation_function(config.hf_config)
+    def act_fn_for_seq_cls(model_config: ModelConfig):
+        return get_classification_activation_function(model_config.hf_config)
 
     @staticmethod
-    def act_fn_for_cross_encoder(config: ModelConfig):
-        return get_cross_encoder_activation_function(config.hf_config)
+    def act_fn_for_cross_encoder(model_config: ModelConfig):
+        return get_cross_encoder_activation_function(model_config.hf_config)
+
+    @staticmethod
+    def resolve_act_fn(
+        model_config: ModelConfig,
+        static_num_labels: bool = True,
+        act_fn: PoolerActivation | str | None = None,
+    ):
+        if isinstance(act_fn, str):
+            if act_fn == "classify":
+                return ClassifierPooler.act_fn_for_seq_cls(model_config)
+            elif act_fn == "score":
+                return ClassifierPooler.act_fn_for_cross_encoder(model_config)
+            else:
+                raise ValueError(f"act_fn [{act_fn=}] not supported.")
+        elif act_fn is None:
+            return PoolerClassify(static_num_labels=static_num_labels)
+        else:
+            assert callable(act_fn)
+            return act_fn
 
     def __init__(
         self,
         pooling: PoolingFn,
-        classifier: Optional[ClassifierFn],
-        act_fn: Optional[PoolerActivation] = None,
+        classifier: ClassifierFn | None,
+        act_fn: PoolerActivation | str | None = None,
     ) -> None:
         super().__init__()
 
         vllm_config = get_current_vllm_config()
-
         self.pooling = pooling
         self.classifier = classifier
-        self.act_fn = act_fn or PoolerClassify()
-        self.logit_bias: Optional[
-            float] = vllm_config.model_config.pooler_config.logit_bias
+        self.act_fn = self.resolve_act_fn(
+            vllm_config.model_config, static_num_labels=True, act_fn=act_fn
+        )
+        self.logit_bias: float | None = (
+            vllm_config.model_config.pooler_config.logit_bias
+        )
         self.head_dtype = vllm_config.model_config.head_dtype
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -636,7 +589,7 @@ class ClassifierPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.pooling(hidden_states, pooling_metadata)
@@ -654,20 +607,163 @@ class ClassifierPooler(Pooler):
             pooled_data -= self.logit_bias
 
         pooling_params = get_pooling_params(pooling_metadata)
-        flags = [p.activation for p in pooling_params]
+        flags = [p.use_activation for p in pooling_params]
 
         if len(set(flags)) == 1:
             scores = self.act_fn(pooled_data) if flags[0] else pooled_data
         else:
             scores = [
-                self.act_fn(vecs) if f else vecs
-                for vecs, f in zip(pooled_data, flags)
+                self.act_fn(vecs) if f else vecs for vecs, f in zip(pooled_data, flags)
             ]
 
         # scores shape: [batchsize, num_labels]
         return scores
 
 
+class TokenEmbeddingPoolerHead(EmbeddingPoolerHead):
+    def forward(
+        self, pooled_data: torch.Tensor, pooling_param: PoolingParams
+    ) -> torch.Tensor:
+        pooled_data = pooled_data.to(self.head_dtype)
+        # pooled_data shape: [n_tokens, hidden_dimension]
+
+        # Apply ST projector
+        if self.projector is not None:
+            pooled_data = self.projector(pooled_data)
+        # pooled_data shape: [n_tokens, embedding_dimension]
+
+        # for matryoshka representation
+        pooled_data = pooled_data[..., : pooling_param.dimensions]
+
+        # for normalize
+        if pooling_param.normalize:
+            pooled_data = self.activation(pooled_data)
+
+        # pooled_data shape: [n_tokens, embedding_dimension]
+        return pooled_data
+
+
+class TokenClassifierPoolerHead(nn.Module):
+    def __init__(
+        self,
+        classifier: ClassifierFn | None,
+        act_fn: PoolerActivation | str | None = None,
+    ) -> None:
+        super().__init__()
+        vllm_config = get_current_vllm_config()
+
+        self.classifier = classifier
+        self.act_fn = ClassifierPooler.resolve_act_fn(
+            vllm_config.model_config, static_num_labels=False, act_fn=act_fn
+        )
+        self.logit_bias: float | None = (
+            vllm_config.model_config.pooler_config.logit_bias
+        )
+        self.head_dtype = vllm_config.model_config.head_dtype
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_classify"}
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_param: PoolingParams,
+    ) -> torch.Tensor:
+        hidden_states = hidden_states.to(self.head_dtype)
+        # hidden_states shape: [n_token, hidden_size]
+
+        if self.classifier is not None:
+            scores = self.classifier(hidden_states)
+        else:
+            scores = hidden_states
+        # scores shape: [n_token, num_labels]
+
+        if self.logit_bias is not None:
+            scores -= self.logit_bias
+
+        if pooling_param.use_activation:
+            scores = self.act_fn(scores)
+
+        # scores shape: [n_token, num_labels]
+        return scores
+
+
+class AllPooler(Pooler):
+    def __init__(self, head: nn.Module | PoolerHead) -> None:
+        super().__init__()
+
+        self.pooling = AllPool()
+        self.head = head
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed", "token_classify"}
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        pooled_data = self.pooling(hidden_states, pooling_metadata)
+        pooling_params = get_pooling_params(pooling_metadata)
+        assert len(pooled_data) == len(pooling_params)
+
+        pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
+        return pooled_data
+
+
+class StepPooler(Pooler):
+    def __init__(self, head: nn.Module | PoolerHead) -> None:
+        super().__init__()
+
+        self.pooling = AllPool()
+        self.head = head
+
+    def extract_states(
+        self,
+        hidden_states: torch.Tensor | list[torch.Tensor],
+        pooling_metadata: PoolingMetadata,
+    ) -> torch.Tensor | list[torch.Tensor]:
+        pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
+        prompt_token_ids = get_prompt_token_ids(pooling_metadata)
+
+        pooled_data = list[torch.Tensor]()
+
+        pooling_params = get_pooling_params(pooling_metadata)
+
+        for data, token_id, pooling_param in zip(
+            pooled_data_lst, prompt_token_ids, pooling_params
+        ):
+            step_tag_id = pooling_param.step_tag_id
+            returned_token_ids = pooling_param.returned_token_ids
+
+            if returned_token_ids is not None and len(returned_token_ids) > 0:
+                data = data[:, returned_token_ids]
+
+            if step_tag_id is not None:
+                data = data[token_id == step_tag_id]
+            pooled_data.append(data)
+
+        return pooled_data
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed", "token_classify"}
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate(requires_token_ids=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor | list[torch.Tensor],
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        pooled_data = self.extract_states(hidden_states, pooling_metadata)
+        pooling_params = get_pooling_params(pooling_metadata)
+        assert len(pooled_data) == len(pooling_params)
+
+        pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
+        return pooled_data
+
+
 class DispatchPooler(Pooler):
     """Dispatches calls to a sub-pooler based on the pooling task."""
 
@@ -678,7 +774,8 @@ class DispatchPooler(Pooler):
             if task not in pooler.get_supported_tasks():
                 raise ValueError(
                     f"{pooler=} does not support {task=}. "
-                    f"Supported tasks: {pooler.get_supported_tasks()}")
+                    f"Supported tasks: {pooler.get_supported_tasks()}"
+                )
 
         self.poolers_by_task = poolers_by_task
 
@@ -690,7 +787,7 @@ class DispatchPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         poolers_by_task = self.poolers_by_task
@@ -701,12 +798,13 @@ class DispatchPooler(Pooler):
             if not (pooler := poolers_by_task.get(task)):
                 raise ValueError(
                     f"Unsupported task: {task} "
-                    f"Supported tasks: {self.get_supported_tasks()}")
+                    f"Supported tasks: {self.get_supported_tasks()}"
+                )
 
             num_items = len(list(group))
             group_output: PoolerOutput = pooler(
                 hidden_states,
-                pooling_metadata[offset:offset + num_items],
+                pooling_metadata[offset : offset + num_items],
             )
 
             outputs.extend(group_output)
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 8cac47b5a39a347d0a9c8c4cd415f6044484acf6..bb42b10f871868a6e5a4f93ac1a66b5606bfe9aa 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -3,8 +3,10 @@
 
 from typing import Literal, get_args
 
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+
+logger = init_logger(__name__)
 
 QuantizationMethods = Literal[
     "awq",
@@ -13,6 +15,7 @@ QuantizationMethods = Literal[
     "fp8",
     "ptpc_fp8",
     "fbgemm_fp8",
+    "fp_quant",
     "modelopt",
     "modelopt_fp4",
     "bitblas",
@@ -52,9 +55,13 @@ def register_quantization_config(quantization: str):
         quantization (str): The quantization method name.
 
     Examples:
-        >>> from vllm.model_executor.layers.quantization import register_quantization_config
+        >>> from vllm.model_executor.layers.quantization import (
+        ...     register_quantization_config,
+        ... )
         >>> from vllm.model_executor.layers.quantization import get_quantization_config
-        >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+        >>> from vllm.model_executor.layers.quantization.base_config import (
+        ...     QuantizationConfig,
+        ... )
         >>>
         >>> @register_quantization_config("my_quant")
         ... class MyQuantConfig(QuantizationConfig):
@@ -66,13 +73,20 @@ def register_quantization_config(quantization: str):
 
     def _wrapper(quant_config_cls):
         if quantization in QUANTIZATION_METHODS:
-            raise ValueError(
-                f"The quantization method `{quantization}` is already exists.")
+            logger.warning(
+                "The quantization method '%s' already exists and will be "
+                "overwritten by the quantization config %s.",
+                quantization,
+                quant_config_cls,
+            )
+        else:
+            QUANTIZATION_METHODS.append(quantization)
+
         if not issubclass(quant_config_cls, QuantizationConfig):
-            raise ValueError("The quantization config must be a subclass of "
-                             "`QuantizationConfig`.")
+            raise ValueError(
+                "The quantization config must be a subclass of `QuantizationConfig`."
+            )
         _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls
-        QUANTIZATION_METHODS.append(quantization)
         return quant_config_cls
 
     return _wrapper
@@ -90,12 +104,14 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .awq_marlin import AWQMarlinConfig
     from .bitblas import BitBLASConfig
     from .bitsandbytes import BitsAndBytesConfig
-    from .compressed_tensors.compressed_tensors import (  # noqa: E501
-        CompressedTensorsConfig)
+    from .compressed_tensors.compressed_tensors import (
+        CompressedTensorsConfig,
+    )
     from .deepspeedfp import DeepSpeedFPConfig
     from .experts_int8 import ExpertsInt8Config
     from .fbgemm_fp8 import FBGEMMFp8Config
     from .fp8 import Fp8Config
+    from .fp_quant import FPQuantConfig
     from .gguf import GGUFConfig
     from .gptq import GPTQConfig
     from .gptq_bitblas import GPTQBitBLASConfig
@@ -119,6 +135,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "tpu_int8": Int8TpuConfig,
         "fp8": Fp8Config,
         "fbgemm_fp8": FBGEMMFp8Config,
+        "fp_quant": FPQuantConfig,
         "modelopt": ModelOptFp8Config,
         "modelopt_fp4": ModelOptNvFp4Config,
         "bitblas": BitBLASConfig,
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index bf5141fa48943322e65698b2e2f5c193b3811a00..f1943d4611877d61235491fa833e582f23db2fba 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -2,16 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
+import regex as re
 import torch
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -46,43 +47,52 @@ class AutoRoundConfig(QuantizationConfig):
         group_size: int,
         sym: bool = True,
         packing_format: str = "auto_round:auto_gptq",
-        block_name_to_quantize: Optional[Union[str, list[str]]] = None,
-        extra_config: Optional[dict[str, Any]] = None,
+        block_name_to_quantize: str | list[str] | None = None,
+        extra_config: dict[str, Any] | None = None,
         data_type: str = "int",
         backend: str = "auto",
     ) -> None:
         super().__init__()
         if weight_bits not in self.SUPPORTED_BITS:
-            raise ValueError(f"Unsupported weight_bits: {weight_bits}, "
-                             f"currently only support  {self.SUPPORTED_BITS}")
+            raise ValueError(
+                f"Unsupported weight_bits: {weight_bits}, "
+                f"currently only support  {self.SUPPORTED_BITS}"
+            )
         if data_type not in self.SUPPORTED_DTYPES:
             raise ValueError(
                 f"Unsupported data_type: {data_type},"
-                f" currently only support  {self.SUPPORTED_DTYPES}")
+                f" currently only support  {self.SUPPORTED_DTYPES}"
+            )
         if packing_format not in self.SUPPORTED_FORMATS:
             raise ValueError(
                 f"Unsupported packing_format: {packing_format}, "
-                f"currently only support  {self.SUPPORTED_FORMATS}")
+                f"currently only support  {self.SUPPORTED_FORMATS}"
+            )
         if backend not in self.SUPPORTED_BACKENDS:
             raise ValueError(
                 f"Unsupported backend: {backend},  "
-                f"currently only support  {self.SUPPORTED_BACKENDS}")
+                f"currently only support  {self.SUPPORTED_BACKENDS}"
+            )
 
         self.weight_bits = weight_bits
         self.group_size = group_size
         self.sym = sym
         self.packing_format = packing_format
-        self.block_name_to_quantize = (block_name_to_quantize.split(",") if
-                                       isinstance(block_name_to_quantize, str)
-                                       else block_name_to_quantize)
+        self.block_name_to_quantize = (
+            block_name_to_quantize.split(",")
+            if isinstance(block_name_to_quantize, str)
+            else block_name_to_quantize
+        )
         self.extra_config = extra_config
         self.data_type = data_type
         self.backend = backend
         self.pack_factor = Fraction(32, weight_bits)
 
     def __repr__(self) -> str:
-        return (f"AutoRoundConfig(weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size}, sym={self.sym})")
+        return (
+            f"AutoRoundConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, sym={self.sym})"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -106,25 +116,57 @@ class AutoRoundConfig(QuantizationConfig):
             weight_bits=cls.get_from_keys(config, ["bits"]),
             group_size=cls.get_from_keys(config, ["group_size"]),
             sym=cls.get_from_keys(config, ["sym"]),
-            packing_format=cls.get_from_keys_or(config, ["packing_format"],
-                                                "auto_round:auto_gptq"),
+            packing_format=cls.get_from_keys_or(
+                config, ["packing_format"], "auto_round:auto_gptq"
+            ),
             block_name_to_quantize=cls.get_from_keys_or(
-                config, ["block_name_to_quantize", "to_quant_block_names"],
-                None),
+                config, ["block_name_to_quantize", "to_quant_block_names"], None
+            ),
             extra_config=cls.get_from_keys_or(config, ["extra_config"], None),
             data_type=cls.get_from_keys_or(config, ["data_type"], "int"),
-            backend=cls.get_from_keys_or(config, ["backend", "vllm_backend"],
-                                         "auto"),
+            backend=cls.get_from_keys_or(config, ["backend", "vllm_backend"], "auto"),
         )
 
     def get_layer_config(self, layer, layer_name: str):
-
         def get_config(name: str, quantized: bool = True):
-            cfg = self.extra_config.get(name, {}) if self.extra_config else {}
+            if not self.extra_config:
+                return (
+                    self.weight_bits if quantized else 16,
+                    self.group_size if quantized else -1,
+                    self.sym if quantized else True,
+                )
+
+            # exact match first
+            if name in self.extra_config:
+                cfg = self.extra_config[name]
+                return (
+                    cfg.get("bits", self.weight_bits if quantized else 16),
+                    cfg.get("group_size", self.group_size if quantized else -1),
+                    cfg.get("sym", self.sym if quantized else True),
+                )
+
+            REGEX_SPECIAL_CHARS = set(r"*+?^$()[]{}|\\")
+            for pattern, cfg in self.extra_config.items():
+                if not isinstance(pattern, str) or not any(
+                    c in REGEX_SPECIAL_CHARS for c in pattern
+                ):
+                    continue
+
+                try:
+                    if re.search(re.compile(pattern), name) is not None:
+                        return (
+                            cfg.get("bits", self.weight_bits if quantized else 16),
+                            cfg.get("group_size", self.group_size if quantized else -1),
+                            cfg.get("sym", self.sym if quantized else True),
+                        )
+                except re.error:
+                    # Invalid regex, ignore.
+                    continue
+
             return (
-                cfg.get("bits", self.weight_bits if quantized else 16),
-                cfg.get("group_size", self.group_size if quantized else -1),
-                cfg.get("sym", self.sym if quantized else True),
+                self.weight_bits if quantized else 16,
+                self.group_size if quantized else -1,
+                self.sym if quantized else True,
             )
 
         # 1. Exact match from config
@@ -135,41 +177,40 @@ class AutoRoundConfig(QuantizationConfig):
         quantized = not isinstance(layer, ParallelLMHead)
         if self.block_name_to_quantize:
             quantized = any(
-                layer_name.startswith(name)
-                for name in self.block_name_to_quantize)
+                layer_name.startswith(name) for name in self.block_name_to_quantize
+            )
 
         # 3. Handle fused MoE
-        if self.extra_config and "fusedmoe" in layer.__class__.__name__.lower(
-        ):
+        if self.extra_config and "fusedmoe" in layer.__class__.__name__.lower():
             moe_configs = [
-                get_config(name, quantized) for name in self.extra_config
+                get_config(name, quantized)
+                for name in self.extra_config
                 if name.startswith(layer_name)
             ]
             if moe_configs:
                 if len(set(moe_configs)) == 1:
                     return moe_configs[0]
-                raise ValueError(f"Fused MoE layer '{layer_name}' requires "
-                                 f"consistent quant config for all sub-layers")
+                raise ValueError(
+                    f"Fused MoE layer '{layer_name}' requires "
+                    f"consistent quant config for all sub-layers"
+                )
 
         # 4. Handle fused QKV or other patterns
         if self.extra_config:
             for fusion_key, sub_keys in self.packed_modules_mapping.items():
-                if fusion_key in layer_name and layer_name.count(
-                        fusion_key) == 1:
+                if fusion_key in layer_name and layer_name.count(fusion_key) == 1:
                     sub_names = [
-                        layer_name.replace(fusion_key, sub_key)
-                        for sub_key in sub_keys
-                    ]
-                    sub_configs = [
-                        get_config(name, quantized) for name in sub_names
+                        layer_name.replace(fusion_key, sub_key) for sub_key in sub_keys
                     ]
+                    sub_configs = [get_config(name, quantized) for name in sub_names]
                     if len(set(sub_configs)) == 1:
                         return sub_configs[0]
                     raise ValueError(
                         f"Fused module '{layer_name}' requires "
-                        f"consistent quant config for {sub_names}")
+                        f"consistent quant config for {sub_names}"
+                    )
 
-        # 5. Fallback
+        # 5. Fallback or try a regular expression match
         return get_config(layer_name, quantized)
 
     def check_quantized(self, weight_bits: int) -> bool:
@@ -178,14 +219,17 @@ class AutoRoundConfig(QuantizationConfig):
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         if self.block_name_to_quantize is not None:
             self.block_name_to_quantize = hf_to_vllm_mapper.apply_list(
-                self.block_name_to_quantize)
+                self.block_name_to_quantize
+            )
         if self.extra_config is not None:
             self.extra_config = hf_to_vllm_mapper.apply_dict(self.extra_config)
 
     def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
         from vllm.model_executor.layers.fused_moe import FusedMoE
         from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-            check_marlin_supported, check_moe_marlin_supports_layer)
+            check_marlin_supported,
+            check_moe_marlin_supports_layer,
+        )
 
         weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
         if not self.check_quantized(weight_bits):
@@ -207,19 +251,23 @@ class AutoRoundConfig(QuantizationConfig):
                 4: scalar_types.uint4,
                 8: scalar_types.uint8,
             }
-            use_marlin = (weight_bits
-                          in AWQ_TYPE_MAP) and check_marlin_supported(
-                              AWQ_TYPE_MAP[weight_bits], group_size, not sym)
+            use_marlin = (weight_bits in AWQ_TYPE_MAP) and check_marlin_supported(
+                AWQ_TYPE_MAP[weight_bits], group_size, not sym
+            )
 
             if isinstance(layer, FusedMoE):
                 use_marlin = use_marlin and check_moe_marlin_supports_layer(
-                    layer, group_size)
+                    layer, group_size
+                )
 
         else:
             use_marlin = False
         if use_marlin:
             from vllm.model_executor.layers.quantization.awq_marlin import (
-                AWQMarlinConfig, AWQMarlinLinearMethod, AWQMoEMethod)
+                AWQMarlinConfig,
+                AWQMarlinLinearMethod,
+                AWQMoEMethod,
+            )
 
             quant_args_marlin = AWQMarlinConfig(
                 weight_bits=weight_bits,
@@ -231,7 +279,9 @@ class AutoRoundConfig(QuantizationConfig):
             )
         else:
             from vllm.model_executor.layers.quantization.awq import (
-                AWQConfig, AWQLinearMethod)
+                AWQConfig,
+                AWQLinearMethod,
+            )
 
             quant_args = AWQConfig(
                 weight_bits=weight_bits,
@@ -242,8 +292,7 @@ class AutoRoundConfig(QuantizationConfig):
         if isinstance(layer, FusedMoE):
             if use_marlin:
                 return AWQMoEMethod(quant_args_marlin, layer.moe_config)
-            from vllm.model_executor.layers.quantization.moe_wna16 import (
-                MoeWNA16Config)
+            from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config
 
             config = {
                 "quant_method": "awq",
@@ -252,8 +301,7 @@ class AutoRoundConfig(QuantizationConfig):
                 "zero_point": not sym,
                 "lm_head": False,
             }
-            return MoeWNA16Config.from_config(config).get_quant_method(
-                layer, prefix)
+            return MoeWNA16Config.from_config(config).get_quant_method(layer, prefix)
 
         if isinstance(layer, (LinearBase, ParallelLMHead)):
             if use_marlin:
@@ -262,13 +310,12 @@ class AutoRoundConfig(QuantizationConfig):
                 return AWQLinearMethod(quant_args)
         return None
 
-    def apply_gptq_quant_layer(self,
-                               layer,
-                               prefix: str,
-                               backend: str = "auto"):
+    def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
         from vllm.model_executor.layers.fused_moe import FusedMoE
         from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-            check_marlin_supported, check_moe_marlin_supports_layer)
+            check_marlin_supported,
+            check_moe_marlin_supports_layer,
+        )
 
         weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
         if not self.check_quantized(weight_bits):
@@ -290,19 +337,21 @@ class AutoRoundConfig(QuantizationConfig):
                 (4, True): scalar_types.uint4b8,
                 (8, True): scalar_types.uint8b128,
             }
-            use_marlin = (weight_bits,
-                          sym) in GPTQ_TYPE_MAP and check_marlin_supported(
-                              GPTQ_TYPE_MAP[(weight_bits, sym)],
-                              group_size,
-                              has_zp=not sym)
+            use_marlin = (weight_bits, sym) in GPTQ_TYPE_MAP and check_marlin_supported(
+                GPTQ_TYPE_MAP[(weight_bits, sym)], group_size, has_zp=not sym
+            )
             if isinstance(layer, FusedMoE):
                 use_marlin = use_marlin and check_moe_marlin_supports_layer(
-                    layer, group_size)
+                    layer, group_size
+                )
         else:
             use_marlin = False
         if use_marlin:
             from vllm.model_executor.layers.quantization.gptq_marlin import (
-                GPTQMarlinConfig, GPTQMarlinLinearMethod, GPTQMarlinMoEMethod)
+                GPTQMarlinConfig,
+                GPTQMarlinLinearMethod,
+                GPTQMarlinMoEMethod,
+            )
 
             quant_args_marlin = GPTQMarlinConfig(
                 weight_bits=weight_bits,
@@ -315,7 +364,9 @@ class AutoRoundConfig(QuantizationConfig):
             )
         else:
             from vllm.model_executor.layers.quantization.gptq import (
-                GPTQConfig, GPTQLinearMethod)
+                GPTQConfig,
+                GPTQLinearMethod,
+            )
 
             quant_args = GPTQConfig(
                 weight_bits=weight_bits,
@@ -330,7 +381,8 @@ class AutoRoundConfig(QuantizationConfig):
                 return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config)
             else:
                 from vllm.model_executor.layers.quantization.moe_wna16 import (
-                    MoeWNA16Config)
+                    MoeWNA16Config,
+                )
 
                 config = {
                     "quant_method": "gptq",
@@ -340,7 +392,8 @@ class AutoRoundConfig(QuantizationConfig):
                     "lm_head": False,
                 }
                 return MoeWNA16Config.from_config(config).get_quant_method(
-                    layer, prefix)
+                    layer, prefix
+                )
 
         if isinstance(layer, (LinearBase, ParallelLMHead)):
             if use_marlin:
@@ -358,29 +411,42 @@ class AutoRoundConfig(QuantizationConfig):
             else:
                 return None
         from vllm.model_executor.layers.quantization.ipex_quant import (
-            IPEXAWQLinearMethod, IPEXConfig, IPEXGPTQLinearMethod)
+            IPEXAWQLinearMethod,
+            IPEXConfig,
+            IPEXGPTQLinearMethod,
+        )
 
         if isinstance(layer, (LinearBase, ParallelLMHead)):
             if "awq" in self.packing_format:
-                config = IPEXConfig(method="awq",
-                                    weight_bits=weight_bits,
-                                    group_size=group_size)
+                config = IPEXConfig(
+                    method="awq", weight_bits=weight_bits, group_size=group_size
+                )
                 return IPEXAWQLinearMethod(config)
             elif "gptq" in self.packing_format:
-                config = IPEXConfig(method="gptq",
-                                    weight_bits=weight_bits,
-                                    group_size=group_size)
+                config = IPEXConfig(
+                    method="gptq", weight_bits=weight_bits, group_size=group_size
+                )
                 return IPEXGPTQLinearMethod(config)
             else:
                 raise ValueError(
                     f"ipex backend only supports awq "
-                    f"and gtpq format,but got {self.packing_format}")
+                    f"and gtpq format,but got {self.packing_format}"
+                )
         else:
             return None
 
     def get_quant_method(self, layer: torch.nn.Module, prefix: str):
-        if (current_platform.is_cpu() or current_platform.is_xpu()
-                or self.backend == "ipex"):
+        if prefix and self.extra_config:
+            for layer_name in self.extra_config:
+                if (
+                    layer_name == prefix or layer_name == f"model.{prefix}"
+                ) and self.extra_config[layer_name].get("bits", 16) >= 16:
+                    return UnquantizedLinearMethod()
+        if (
+            current_platform.is_cpu()
+            or current_platform.is_xpu()
+            or self.backend == "ipex"
+        ):
             return self.apply_ipex_quant_layer(layer, prefix)
         if "gptq" in self.packing_format or "gptq" in self.backend:
             return self.apply_gptq_quant_layer(layer, prefix)
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index af602eb9aca389e41f5b8f2ae411222f54347fe6..0cf8b69f9f6ba7db93706b64bfbbbcc9fb379faa 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -1,20 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Union
 
 import torch
+from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
-from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.parameter import (GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
+from vllm.model_executor.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from vllm.transformers_utils.config import get_safetensors_params_metadata
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.model_executor.models.utils import WeightsMapper
 
 logger = init_logger(__name__)
 
@@ -30,7 +40,7 @@ class AWQConfig(QuantizationConfig):
         weight_bits: int,
         group_size: int,
         zero_point: bool,
-        modules_to_not_convert: Optional[list[str]] = None,
+        modules_to_not_convert: list[str] | None = None,
     ) -> None:
         super().__init__()
         self.weight_bits = weight_bits
@@ -41,16 +51,19 @@ class AWQConfig(QuantizationConfig):
         if self.weight_bits != 4:
             raise ValueError(
                 "Currently, only 4-bit weight quantization is supported for "
-                f"AWQ, but got {self.weight_bits} bits.")
+                f"AWQ, but got {self.weight_bits} bits."
+            )
         self.pack_factor = 32 // self.weight_bits
 
     def __repr__(self) -> str:
-        return (f"AWQConfig(weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size}, "
-                f"zero_point={self.zero_point}, "
-                f"modules_to_not_convert={self.modules_to_not_convert})")
-
-    def get_name(self) -> QuantizationMethods:
+        return (
+            f"AWQConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"zero_point={self.zero_point}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
+
+    def get_name(self) -> "QuantizationMethods":
         return "awq"
 
     def get_supported_act_dtypes(self) -> list[torch.dtype]:
@@ -75,14 +88,20 @@ class AWQConfig(QuantizationConfig):
         group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
         zero_point = cls.get_from_keys(config, ["zero_point"])
         modules_to_not_convert = cls.get_from_keys_or(
-            config, ["modules_to_not_convert"], None)
+            config, ["modules_to_not_convert"], None
+        )
         return cls(weight_bits, group_size, zero_point, modules_to_not_convert)
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["LinearMethodBase", "QuantizeMethodBase"]]:
+    ) -> Union["LinearMethodBase", "QuantizeMethodBase"] | None:
         if isinstance(layer, LinearBase):
-            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+            if is_layer_skipped(
+                prefix,
+                self.modules_to_not_convert,
+                self.packed_modules_mapping,
+                skip_with_substr=True,
+            ):
                 return UnquantizedLinearMethod()
             return AWQLinearMethod(self)
         elif isinstance(layer, FusedMoE):
@@ -90,10 +109,12 @@ class AWQConfig(QuantizationConfig):
             from .awq_marlin import AWQMarlinConfig, AWQMoEMethod
             from .moe_wna16 import MoeWNA16Config
             from .utils.marlin_utils import check_moe_marlin_supports_layer
+
             if not check_moe_marlin_supports_layer(layer, self.group_size):
                 logger.warning_once(
                     f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
-                    "Falling back to Moe WNA16 kernels.")
+                    "Falling back to Moe WNA16 kernels."
+                )
                 config = {
                     "quant_method": "awq",
                     "bits": self.weight_bits,
@@ -102,7 +123,8 @@ class AWQConfig(QuantizationConfig):
                     "lm_head": False,
                 }
                 return MoeWNA16Config.from_config(config).get_quant_method(
-                    layer, prefix)
+                    layer, prefix
+                )
             marlin_compatible_config_dict = {
                 "quant_method": "awq",
                 "bits": self.weight_bits,
@@ -112,13 +134,31 @@ class AWQConfig(QuantizationConfig):
                 "modules_to_not_convert": self.modules_to_not_convert,
             }
             awq_marlin_config = AWQMarlinConfig.from_config(
-                marlin_compatible_config_dict)
+                marlin_compatible_config_dict
+            )
             return AWQMoEMethod(awq_marlin_config, layer.moe_config)
         return None
 
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        if self.modules_to_not_convert:
+            self.modules_to_not_convert = hf_to_vllm_mapper.apply_list(
+                self.modules_to_not_convert
+            )
 
-def is_layer_skipped_awq(prefix: str, modules_to_not_convert: list[str]):
-    return any(module_name in prefix for module_name in modules_to_not_convert)
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
+        if self.modules_to_not_convert:
+            return
+
+        unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+        metadata = get_safetensors_params_metadata(model_name, revision=revision)
+        layers = {param_name.rsplit(".", 1)[0] for param_name in metadata}
+        quant_layers: set[str] = {
+            param_name.rsplit(".", 1)[0]
+            for param_name, info in metadata.items()
+            if (dtype := info.get("dtype", None))
+            and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
+        }
+        self.modules_to_not_convert = list(layers - quant_layers)
 
 
 class AWQLinearMethod(LinearMethodBase):
@@ -131,11 +171,16 @@ class AWQLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: AWQConfig):
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         # Normalize group_size
         if self.quant_config.group_size != -1:
             group_size = self.quant_config.group_size
@@ -146,14 +191,16 @@ class AWQLinearMethod(LinearMethodBase):
             raise ValueError(
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
-                "tensor parallel size.")
+                "tensor parallel size."
+            )
 
         output_size_per_partition = sum(output_partition_sizes)
         if output_size_per_partition % self.quant_config.pack_factor != 0:
             raise ValueError(
                 "The output size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
-                "tensor parallel size.")
+                "tensor parallel size."
+            )
 
         weight_loader = extra_weight_attrs.get("weight_loader")
         qweight = PackedvLLMParameter(
@@ -166,7 +213,8 @@ class AWQLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=1,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
         num_groups = input_size_per_partition // group_size
 
@@ -180,38 +228,40 @@ class AWQLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=1,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
-        scales = GroupQuantScaleParameter(data=torch.empty(
-            num_groups,
-            output_size_per_partition,
-            dtype=params_dtype,
-        ),
-                                          input_dim=0,
-                                          output_dim=1,
-                                          weight_loader=weight_loader)
+        scales = GroupQuantScaleParameter(
+            data=torch.empty(
+                num_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=0,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("qzeros", qzeros)
         layer.register_parameter("scales", scales)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.qweight = torch.nn.Parameter(layer.qweight.data,
-                                           requires_grad=False)
-        layer.qzeros = torch.nn.Parameter(layer.qzeros.data,
-                                          requires_grad=False)
-        layer.scales = torch.nn.Parameter(layer.scales.data,
-                                          requires_grad=False)
-
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         qweight = layer.qweight
         scales = layer.scales
         qzeros = layer.qzeros
         pack_factor = self.quant_config.pack_factor
-        out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
+        out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
         reshaped_x = x.reshape(-1, x.shape[-1])
 
         # num_tokens >= threshold
@@ -221,8 +271,7 @@ class AWQLinearMethod(LinearMethodBase):
             out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
             out = torch.matmul(reshaped_x, out)
         else:
-            out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros,
-                               pack_factor)
+            out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor)
         if bias is not None:
             out.add_(bias)
         return out.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 6bf6ea9146514e7a67e9ee563ad1e801be1474d5..3f6ea68072b40c32365176b9602d96f8ae307e80 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,40 +1,64 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
+from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.nn import Parameter
 
 import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
-    UnquantizedFusedMoEMethod)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod,
-                                               set_weight_attrs)
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.awq import (AWQConfig,
-                                                         is_layer_skipped_awq)
+    FusedMoE,
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+    UnquantizedFusedMoEMethod,
+)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+    set_weight_attrs,
+)
+from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported,
-    check_marlin_supports_layer, check_moe_marlin_supports_layer,
-    marlin_make_empty_g_idx, marlin_make_workspace_new,
-    marlin_moe_permute_scales, marlin_permute_bias, marlin_permute_scales,
-    moe_awq_to_marlin_zero_points, verify_marlin_supported,
-    verify_marlin_supports_shape)
+    apply_awq_marlin_linear,
+    awq_to_marlin_zero_points,
+    check_marlin_supported,
+    check_marlin_supports_layer,
+    check_moe_marlin_supports_layer,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace_new,
+    marlin_moe_permute_scales,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    moe_awq_to_marlin_zero_points,
+    verify_marlin_supported,
+    verify_marlin_supports_shape,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.parameter import (GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+from vllm.model_executor.parameter import GroupQuantScaleParameter, PackedvLLMParameter
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
+from vllm.transformers_utils.config import get_safetensors_params_metadata
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.model_executor.models.utils import WeightsMapper
 
 logger = init_logger(__name__)
 
@@ -48,10 +72,15 @@ class AWQMarlinConfig(QuantizationConfig):
         8: scalar_types.uint8,
     }
 
-    def __init__(self, weight_bits: int, group_size: int, zero_point: bool,
-                 lm_head_quantized: bool,
-                 modules_to_not_convert: Optional[list[str]],
-                 full_config: dict[str, Any]) -> None:
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        zero_point: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: list[str] | None,
+        full_config: dict[str, Any],
+    ) -> None:
         super().__init__()
         self.pack_factor = 32 // weight_bits  # packed into int32
         self.group_size = group_size
@@ -62,24 +91,28 @@ class AWQMarlinConfig(QuantizationConfig):
         self.full_config = full_config
 
         if self.weight_bits not in self.TYPE_MAP:
-            raise ValueError(f"Unsupported num_bits = {self.weight_bits}. "
-                             f"Supported num_bits = {self.TYPE_MAP.keys()}")
+            raise ValueError(
+                f"Unsupported num_bits = {self.weight_bits}. "
+                f"Supported num_bits = {self.TYPE_MAP.keys()}"
+            )
 
         self.quant_type = self.TYPE_MAP[self.weight_bits]
 
-        verify_marlin_supported(self.quant_type,
-                                group_size=self.group_size,
-                                has_zp=self.zero_point)
+        verify_marlin_supported(
+            self.quant_type, group_size=self.group_size, has_zp=self.zero_point
+        )
 
     def __repr__(self) -> str:
-        return (f"AWQMarlinConfig(quant_type={self.quant_type}, "
-                f"group_size={self.group_size}, "
-                f"zero_point={self.zero_point}, "
-                f"lm_head_quantized={self.lm_head_quantized}, "
-                f"modules_to_not_convert={self.modules_to_not_convert})")
+        return (
+            f"AWQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size}, "
+            f"zero_point={self.zero_point}, "
+            f"lm_head_quantized={self.lm_head_quantized}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
 
     @classmethod
-    def get_name(cls) -> QuantizationMethods:
+    def get_name(cls) -> "QuantizationMethods":
         return "awq_marlin"
 
     @classmethod
@@ -99,38 +132,57 @@ class AWQMarlinConfig(QuantizationConfig):
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
         zero_point = cls.get_from_keys(config, ["zero_point"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
         modules_to_not_convert = cls.get_from_keys_or(
-            config, ["modules_to_not_convert"], None)
-        return cls(weight_bits, group_size, zero_point, lm_head_quantized,
-                   modules_to_not_convert, config)
+            config, ["modules_to_not_convert"], None
+        )
+        return cls(
+            weight_bits,
+            group_size,
+            zero_point,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> Optional["QuantizationMethods"]:
         can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
-        is_valid_user_quant = (user_quant is None or user_quant == "marlin"
-                               or user_quant == "awq_marlin")
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "awq_marlin"
+        )
 
         if can_convert and is_valid_user_quant:
-            msg = ("The model is convertible to {} during runtime."
-                   " Using {} kernel.".format(cls.get_name(), cls.get_name()))
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
             logger.info(msg)
             return cls.get_name()
 
         if can_convert and user_quant == "awq":
-            logger.info("Detected that the model can run with awq_marlin"
-                        ", however you specified quantization=awq explicitly,"
-                        " so forcing awq. Use quantization=awq_marlin for"
-                        " faster inference")
+            logger.info(
+                "Detected that the model can run with awq_marlin"
+                ", however you specified quantization=awq explicitly,"
+                " so forcing awq. Use quantization=awq_marlin for"
+                " faster inference"
+            )
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
-        if (isinstance(layer, LinearBase) or
-            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
-            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
+            if is_layer_skipped(
+                prefix,
+                self.modules_to_not_convert,
+                self.packed_modules_mapping,
+                skip_with_substr=True,
+            ):
                 return UnquantizedLinearMethod()
             # Check if the layer is supported by AWQMarlin.
             if not check_marlin_supports_layer(layer, self.group_size):
@@ -138,21 +190,27 @@ class AWQMarlinConfig(QuantizationConfig):
                     "Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.",  # noqa: E501
                     prefix,
                 )
-                return AWQConfig.from_config(
-                    self.full_config).get_quant_method(layer, prefix)
+                return AWQConfig.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
             return AWQMarlinLinearMethod(self)
         elif isinstance(layer, FusedMoE):
-            from vllm.model_executor.layers.quantization.moe_wna16 import (
-                MoeWNA16Config)
-            if is_layer_skipped_awq(
-                    prefix, getattr(self, "modules_to_not_convert", [])):
+            from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config
+
+            if is_layer_skipped(
+                prefix,
+                getattr(self, "modules_to_not_convert", []),
+                skip_with_substr=True,
+            ):
                 return UnquantizedFusedMoEMethod(layer.moe_config)
             if not check_moe_marlin_supports_layer(layer, self.group_size):
                 logger.warning_once(
                     f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
-                    "Falling back to Moe WNA16 kernels.")
-                return MoeWNA16Config.from_config(
-                    self.full_config).get_quant_method(layer, prefix)
+                    "Falling back to Moe WNA16 kernels."
+                )
+                return MoeWNA16Config.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
             return AWQMoEMethod(self, layer.moe_config)
         return None
 
@@ -171,15 +229,36 @@ class AWQMarlinConfig(QuantizationConfig):
             return False
 
         # If we cannot find the info needed in the config, cannot convert.
-        if (num_bits is None or group_size is None or zero_point is None):
+        if num_bits is None or group_size is None or zero_point is None:
             return False
 
         if num_bits not in cls.TYPE_MAP:
             return False
 
-        return check_marlin_supported(quant_type=cls.TYPE_MAP[num_bits],
-                                      group_size=group_size,
-                                      has_zp=zero_point)
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point
+        )
+
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        if self.modules_to_not_convert:
+            self.modules_to_not_convert = hf_to_vllm_mapper.apply_list(
+                self.modules_to_not_convert
+            )
+
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
+        if self.modules_to_not_convert:
+            return
+
+        unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+        metadata = get_safetensors_params_metadata(model_name, revision=revision)
+        layers = {param_name.rsplit(".", 1)[0] for param_name in metadata}
+        quant_layers: set[str] = {
+            param_name.rsplit(".", 1)[0]
+            for param_name, info in metadata.items()
+            if (dtype := info.get("dtype", None))
+            and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
+        }
+        self.modules_to_not_convert = list(layers - quant_layers)
 
 
 class AWQMarlinLinearMethod(LinearMethodBase):
@@ -216,7 +295,8 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             output_size_per_partition=output_size_per_partition,
             input_size_per_partition=input_size_per_partition,
             input_size=input_size,
-            group_size=group_size)
+            group_size=group_size,
+        )
 
         qweight = PackedvLLMParameter(
             data=torch.empty(
@@ -228,7 +308,8 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=1,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
         num_groups = input_size_per_partition // group_size
 
@@ -242,16 +323,19 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=1,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
-        scales = GroupQuantScaleParameter(data=torch.empty(
-            num_groups,
-            output_size_per_partition,
-            dtype=params_dtype,
-        ),
-                                          input_dim=0,
-                                          output_dim=1,
-                                          weight_loader=weight_loader)
+        scales = GroupQuantScaleParameter(
+            data=torch.empty(
+                num_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=0,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("qzeros", qzeros)
@@ -267,12 +351,9 @@ class AWQMarlinLinearMethod(LinearMethodBase):
     # Here, we handle the repacking
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         device = layer.qweight.device
-        layer.qweight = torch.nn.Parameter(layer.qweight.data,
-                                           requires_grad=False)
-        layer.qzeros = torch.nn.Parameter(layer.qzeros.data,
-                                          requires_grad=False)
-        layer.scales = torch.nn.Parameter(layer.scales.data,
-                                          requires_grad=False)
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
 
         # Allocate marlin workspace
         layer.workspace = marlin_make_workspace_new(device)
@@ -282,7 +363,8 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             layer.qweight,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
-            num_bits=self.quant_config.quant_type.size_bits)
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
         replace_parameter(layer, "qweight", marlin_qweight)
 
         # Permute scales from AWQ format to marlin format.
@@ -290,7 +372,8 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             layer.scales,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
-            group_size=self.quant_config.group_size)
+            group_size=self.quant_config.group_size,
+        )
         replace_parameter(layer, "scales", marlin_scales)
 
         # Permute zero-points from AWQ format to marlin format.
@@ -298,7 +381,8 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             layer.qzeros,
             size_k=layer.num_groups,
             size_n=layer.output_size_per_partition,
-            num_bits=self.quant_config.quant_type.size_bits)
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
         replace_parameter(layer, "qzeros", marlin_zp)
 
         # Not-used
@@ -312,7 +396,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return apply_awq_marlin_linear(
             input=x,
@@ -325,11 +409,11 @@ class AWQMarlinLinearMethod(LinearMethodBase):
             quant_type=self.quant_config.quant_type,
             output_size_per_partition=layer.output_size_per_partition,
             input_size_per_partition=layer.input_size_per_partition,
-            bias=bias)
+            bias=bias,
+        )
 
 
 class AWQMoEMethod(FusedMoEMethodBase):
-
     def __init__(
         self,
         quant_config: AWQMarlinConfig,
@@ -340,76 +424,95 @@ class AWQMoEMethod(FusedMoEMethodBase):
         if self.quant_config.weight_bits != 4:
             raise ValueError("AWQMoEMethod only supports 4bit now.")
         self.quant_type = scalar_types.uint4
+        self.use_marlin = True
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-        extra_weight_attrs.update({
-            "is_transposed":
-            True,
-            "quant_method":
-            FusedMoeWeightScaleSupported.GROUP.value,
-        })
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        extra_weight_attrs.update(
+            {
+                "is_transposed": True,
+                "quant_method": FusedMoeWeightScaleSupported.GROUP.value,
+            }
+        )
 
         w13_qweight = Parameter(
-            torch.empty(num_experts,
-                        hidden_size,
-                        2 * intermediate_size_per_partition //
-                        self.quant_config.pack_factor,
-                        dtype=torch.int32),
-            requires_grad=False)
+            torch.empty(
+                num_experts,
+                hidden_size,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_qweight", w13_qweight)
         set_weight_attrs(w13_qweight, extra_weight_attrs)
 
-        w2_qweight = Parameter(torch.empty(num_experts,
-                                           intermediate_size_per_partition,
-                                           hidden_size //
-                                           self.quant_config.pack_factor,
-                                           dtype=torch.int32),
-                               requires_grad=False)
+        w2_qweight = Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_qweight", w2_qweight)
         set_weight_attrs(w2_qweight, extra_weight_attrs)
 
         num_groups_w13 = hidden_size // self.quant_config.group_size
-        num_groups_w2 = (intermediate_size_per_partition //
-                         self.quant_config.group_size)
+        num_groups_w2 = intermediate_size_per_partition // self.quant_config.group_size
 
         # WEIGHT_SCALES
         # Allocate 2 scales for w1 and w3 respectively.
-        w13_scales = Parameter(torch.empty(num_experts,
-                                           num_groups_w13,
-                                           intermediate_size_per_partition * 2,
-                                           dtype=params_dtype),
-                               requires_grad=False)
+        w13_scales = Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                intermediate_size_per_partition * 2,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_scales", w13_scales)
         set_weight_attrs(w13_scales, extra_weight_attrs)
 
-        w2_scales = Parameter(torch.empty(num_experts,
-                                          num_groups_w2,
-                                          hidden_size,
-                                          dtype=params_dtype),
-                              requires_grad=False)
+        w2_scales = Parameter(
+            torch.empty(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
 
         # WEIGHT_ZERO_POINT
         # Allocate 2 zero points for w1 and w3 respectively.
         w13_qzeros = Parameter(
-            torch.empty(num_experts,
-                        num_groups_w13,
-                        2 * intermediate_size_per_partition //
-                        self.quant_config.pack_factor,
-                        dtype=torch.int32),
-            requires_grad=False)
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_qzeros", w13_qzeros)
         set_weight_attrs(w13_qzeros, extra_weight_attrs)
 
-        w2_qzeros = Parameter(torch.empty(num_experts,
-                                          num_groups_w2,
-                                          hidden_size //
-                                          self.quant_config.pack_factor,
-                                          dtype=torch.int32),
-                              requires_grad=False)
+        w2_qzeros = Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_qzeros", w2_qzeros)
         set_weight_attrs(w2_qzeros, extra_weight_attrs)
 
@@ -469,14 +572,16 @@ class AWQMoEMethod(FusedMoEMethodBase):
             layer.w13_qzeros,
             size_k=layer.w13_qzeros.shape[1],
             size_n=layer.w13_qzeros.shape[2] * self.quant_config.pack_factor,
-            num_bits=self.quant_config.weight_bits)
+            num_bits=self.quant_config.weight_bits,
+        )
         replace_parameter(layer, "w13_qzeros", marlin_w13_zp)
 
         marlin_w2_zp = moe_awq_to_marlin_zero_points(
             layer.w2_qzeros,
             size_k=layer.w2_qzeros.shape[1],
             size_n=layer.w2_qzeros.shape[2] * self.quant_config.pack_factor,
-            num_bits=self.quant_config.weight_bits)
+            num_bits=self.quant_config.weight_bits,
+        )
         replace_parameter(layer, "w2_qzeros", marlin_w2_zp)
 
         if hasattr(layer, "w13_bias") and layer.w13_bias is not None:
@@ -486,7 +591,8 @@ class AWQMoEMethod(FusedMoEMethodBase):
             layer.w2_bias.data = marlin_permute_bias(layer.w2_bias)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -497,26 +603,23 @@ class AWQMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `AWQMoEMethod` yet.")
+            raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
 
         assert activation == "silu", "Only SiLU activation is supported."
 
@@ -532,9 +635,10 @@ class AWQMoEMethod(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_qweight,
             layer.w2_qweight,
@@ -551,4 +655,5 @@ class AWQMoEMethod(FusedMoEMethodBase):
             expert_map=expert_map,
             w1_zeros=layer.w13_qzeros,
             w2_zeros=layer.w2_qzeros,
-            workspace=layer.workspace)
+            workspace=layer.workspace,
+        )
diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py
index 2e8894436a98522c396a1698c037c6a95fbcb1ac..67b4dbbfd4d82ec89d605f5352b24e142507c221 100644
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -10,15 +10,16 @@ AWQ_TRITON_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
 
 @triton.jit
 def awq_dequantize_kernel(
-        qweight_ptr,  # quantized matrix
-        scales_ptr,  # scales, per group
-        zeros_ptr,  # zeros, per group
-        group_size,  # Should always be one of the supported group sizes
-        result_ptr,  # Output matrix
-        num_cols,  # input num cols in qweight
-        num_rows,  # input num rows in qweight
-        BLOCK_SIZE_X: tl.constexpr,
-        BLOCK_SIZE_Y: tl.constexpr):
+    qweight_ptr,  # quantized matrix
+    scales_ptr,  # scales, per group
+    zeros_ptr,  # zeros, per group
+    group_size,  # Should always be one of the supported group sizes
+    result_ptr,  # Output matrix
+    num_cols,  # input num cols in qweight
+    num_rows,  # input num rows in qweight
+    BLOCK_SIZE_X: tl.constexpr,
+    BLOCK_SIZE_Y: tl.constexpr,
+):
     # Set up the pids.
     pid_x = tl.program_id(axis=0)
     pid_y = tl.program_id(axis=1)
@@ -35,10 +36,10 @@ def awq_dequantize_kernel(
 
     # Compute offsets and masks for result output ptr.
     result_offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
-    result_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(
-        0, BLOCK_SIZE_X * 8)
-    result_offsets = (8 * num_cols * result_offsets_y[:, None] +
-                      result_offsets_x[None, :])
+    result_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
+    result_offsets = (
+        8 * num_cols * result_offsets_y[:, None] + result_offsets_x[None, :]
+    )
 
     result_masks_y = result_offsets_y < num_rows
     result_masks_x = result_offsets_x < num_cols * 8
@@ -52,8 +53,9 @@ def awq_dequantize_kernel(
 
     # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
     # that will map given indices to the correct order.
-    reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] +
-                                tl.arange(0, 4)[:, None]).reshape(8)
+    reverse_awq_order_tensor = (
+        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
+    ).reshape(8)
 
     # Use this to compute a set of shifts that can be used to unpack and
     # reorder the values in iweights and zeros.
@@ -85,10 +87,8 @@ def awq_dequantize_kernel(
 
     # Compute scale offsets and masks.
     scale_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
-    scale_offsets_x = (pid_x * BLOCK_SIZE_X * 8 +
-                       tl.arange(0, BLOCK_SIZE_X * 8))
-    scale_offsets = (num_cols * 8 * scale_offsets_y[:, None] +
-                     scale_offsets_x[None, :])
+    scale_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
+    scale_offsets = num_cols * 8 * scale_offsets_y[:, None] + scale_offsets_x[None, :]
     scale_masks_y = scale_offsets_y < num_rows // group_size
     scale_masks_x = scale_offsets_x < num_cols * 8
     scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
@@ -106,10 +106,21 @@ def awq_dequantize_kernel(
 
 
 @triton.jit
-def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
-                    group_size, BLOCK_SIZE_M: tl.constexpr,
-                    BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                    SPLIT_K: tl.constexpr):
+def awq_gemm_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    zeros_ptr,
+    scales_ptr,
+    M,
+    N,
+    K,
+    group_size,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+):
     pid = tl.program_id(axis=0)
     pid_z = tl.program_id(1)
 
@@ -128,18 +139,17 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
     # (BLOCK_SIZE_M, BLOCK_SIZE_N))
     # accumulator = accumulator & 0x0
     # accumulator = accumulator.to(accumulator_dtype)
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
-                           dtype=accumulator_dtype)
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
 
     # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
     # that will map given indices to the correct order.
-    reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] +
-                                tl.arange(0, 4)[:, None]).reshape(8)
+    reverse_awq_order_tensor = (
+        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
+    ).reshape(8)
 
     # Create the necessary shifts to use to unpack.
     shifts = reverse_awq_order_tensor * 4
-    shifts = tl.broadcast_to(shifts[None, :],
-                             (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
+    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
     shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
 
     # Offsets and masks.
@@ -178,8 +188,8 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
 
         # Dequantize b.
         offsets_szk = (
-            (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size +
-            tl.arange(0, 1))
+            BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K
+        ) // group_size + tl.arange(0, 1)
         offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
         masks_zk = offsets_szk < K // group_size
         masks_z = masks_zk[:, None] & masks_zn[None, :]
@@ -220,11 +230,13 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
 # qweights - [K     , M // 8], int32
 # scales   - [K // G, M     ], float16
 # zeros    - [K // G, M // 8], int32
-def awq_dequantize_triton(qweight: torch.Tensor,
-                          scales: torch.Tensor,
-                          zeros: torch.Tensor,
-                          block_size_x: int = 32,
-                          block_size_y: int = 32) -> torch.Tensor:
+def awq_dequantize_triton(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+    block_size_x: int = 32,
+    block_size_y: int = 32,
+) -> torch.Tensor:
     K = qweight.shape[0]
     M = scales.shape[1]
     group_size = qweight.shape[0] // scales.shape[0]
@@ -238,27 +250,31 @@ def awq_dequantize_triton(qweight: torch.Tensor,
     # Result tensor:
     # number of rows = same as input tensor
     # number of cols = 8 x input tensor num cols
-    result = torch.empty(qweight.shape[0],
-                         qweight.shape[1] * 8,
-                         device=qweight.device,
-                         dtype=scales.dtype)
+    result = torch.empty(
+        qweight.shape[0],
+        qweight.shape[1] * 8,
+        device=qweight.device,
+        dtype=scales.dtype,
+    )
 
     Y = qweight.shape[0]  # num rows
     X = qweight.shape[1]  # num cols
 
     grid = lambda META: (
-        triton.cdiv(X, META['BLOCK_SIZE_X']),
-        triton.cdiv(Y, META['BLOCK_SIZE_Y']),
+        triton.cdiv(X, META["BLOCK_SIZE_X"]),
+        triton.cdiv(Y, META["BLOCK_SIZE_Y"]),
+    )
+    awq_dequantize_kernel[grid](
+        qweight,
+        scales,
+        zeros,
+        group_size,
+        result,
+        X,
+        Y,
+        BLOCK_SIZE_X=block_size_x,
+        BLOCK_SIZE_Y=block_size_y,
     )
-    awq_dequantize_kernel[grid](qweight,
-                                scales,
-                                zeros,
-                                group_size,
-                                result,
-                                X,
-                                Y,
-                                BLOCK_SIZE_X=block_size_x,
-                                BLOCK_SIZE_Y=block_size_y)
 
     return result
 
@@ -268,14 +284,16 @@ def awq_dequantize_triton(qweight: torch.Tensor,
 # qzeros  - [K // G, N // 8]
 # scales  - [K // G, N]
 # split_k_iters - parallelism along K-dimension, int, power of 2.
-def awq_gemm_triton(input: torch.Tensor,
-                    qweight: torch.Tensor,
-                    scales: torch.Tensor,
-                    qzeros: torch.Tensor,
-                    split_k_iters: int,
-                    block_size_m: int = 32,
-                    block_size_n: int = 32,
-                    block_size_k: int = 32) -> torch.Tensor:
+def awq_gemm_triton(
+    input: torch.Tensor,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+    split_k_iters: int,
+    block_size_m: int = 32,
+    block_size_n: int = 32,
+    block_size_k: int = 32,
+) -> torch.Tensor:
     M, K = input.shape
     N = qweight.shape[1] * 8
     group_size = qweight.shape[0] // qzeros.shape[0]
@@ -290,30 +308,29 @@ def awq_gemm_triton(input: torch.Tensor,
     assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
 
     grid = lambda META: (
-        triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
-            N, META['BLOCK_SIZE_N']),
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
         split_k_iters,
     )
 
-    result = torch.zeros((split_k_iters, M, N),
-                         dtype=scales.dtype,
-                         device=input.device)
+    result = torch.zeros((split_k_iters, M, N), dtype=scales.dtype, device=input.device)
 
     # A = input, B = qweight, C = result
     # A = M x K, B = K x N, C = M x N
-    awq_gemm_kernel[grid](input,
-                          qweight,
-                          result,
-                          qzeros,
-                          scales,
-                          M,
-                          N,
-                          K,
-                          group_size,
-                          BLOCK_SIZE_M=block_size_m,
-                          BLOCK_SIZE_N=block_size_n,
-                          BLOCK_SIZE_K=block_size_k,
-                          SPLIT_K=split_k_iters)
+    awq_gemm_kernel[grid](
+        input,
+        qweight,
+        result,
+        qzeros,
+        scales,
+        M,
+        N,
+        K,
+        group_size,
+        BLOCK_SIZE_M=block_size_m,
+        BLOCK_SIZE_N=block_size_n,
+        BLOCK_SIZE_K=block_size_k,
+        SPLIT_K=split_k_iters,
+    )
 
     result = result.sum(0)
 
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 807a9866a18b22b7b4e5a05681b5a0a8fc2ac193..c8a8424eb5c88e96a20c9905b3347acac4523789 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -3,7 +3,7 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import nn
@@ -19,8 +19,9 @@ class QuantizeMethodBase(ABC):
     """Base class for different quantized methods."""
 
     @abstractmethod
-    def create_weights(self, layer: torch.nn.Module, *weight_args,
-                       **extra_weight_attrs):
+    def create_weights(
+        self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
+    ):
         """Create weights for a layer.
 
         The weights will be set as attributes of the layer."""
@@ -34,8 +35,7 @@ class QuantizeMethodBase(ABC):
         raise NotImplementedError
 
     # Not required functions
-    def embedding(self, layer: torch.nn.Module, *args,
-                  **kwargs) -> torch.Tensor:
+    def embedding(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
         """Gather embeddings in the layer based on indices in the input tensor.
 
         Expects create_weights to have been called before on the layer."""
@@ -49,19 +49,16 @@ class QuantizeMethodBase(ABC):
         return
 
 
-def method_has_implemented_embedding(
-        method_class: type[QuantizeMethodBase]) -> bool:
+def method_has_implemented_embedding(method_class: type[QuantizeMethodBase]) -> bool:
     """
     Not all quant methods have embedding implemented, so we need to check that
     it exists for our given method. We check this by making sure the function
     has been changed from the base implementation.
     """
-    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
-                                            None)
+    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
     class_embedding = inspect.getattr_static(method_class, "embedding", None)
 
-    return (class_embedding is not None
-            and class_embedding is not base_embedding)
+    return class_embedding is not None and class_embedding is not base_embedding
 
 
 class QuantizationConfig(ABC):
@@ -107,12 +104,13 @@ class QuantizationConfig(ABC):
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         """
-           Detects if this quantization method can support a given checkpoint
-           format by overriding the user specified quantization method -- 
-           this method should only be overwritten by subclasses in exceptional 
-           circumstances
+        Detects if this quantization method can support a given checkpoint
+        format by overriding the user specified quantization method --
+        this method should only be overwritten by subclasses in exceptional
+        circumstances
         """
         return None
 
@@ -122,12 +120,12 @@ class QuantizationConfig(ABC):
         for key in keys:
             if key in config:
                 return config[key]
-        raise ValueError(f"Cannot find any of {keys} in the model's "
-                         "quantization config.")
+        raise ValueError(
+            f"Cannot find any of {keys} in the model's quantization config."
+        )
 
     @staticmethod
-    def get_from_keys_or(config: dict[str, Any], keys: list[str],
-                         default: Any) -> Any:
+    def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any:
         """Get an optional value from the model's quantization config."""
         try:
             return QuantizationConfig.get_from_keys(config, keys)
@@ -135,10 +133,11 @@ class QuantizationConfig(ABC):
             return default
 
     @abstractmethod
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional[QuantizeMethodBase]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> QuantizeMethodBase | None:
         """Get the quantize method to use for the quantized layer.
-        
+
         Args:
             layer: The layer for the quant method.
             prefix: The full name of the layer in the state dict
@@ -148,11 +147,12 @@ class QuantizationConfig(ABC):
         """
         raise NotImplementedError
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         return None
 
     def apply_vllm_mapper(  # noqa: B027
-            self, hf_to_vllm_mapper: "WeightsMapper"):
+        self, hf_to_vllm_mapper: "WeightsMapper"
+    ):
         """
         Interface for models to update module names referenced in
         quantization configs in order to reflect the vllm model structure
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py
index d05c0c0d5473cb2d92724ca8e48144e48848a79a..be15f20cac21e28258683f8045d4d6af60b89d10 100644
--- a/vllm/model_executor/layers/quantization/bitblas.py
+++ b/vllm/model_executor/layers/quantization/bitblas.py
@@ -7,17 +7,23 @@ from packaging import version
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    BITBLAS_OPTIMIZE_FEATURES, BITBLAS_SUPPORTED_NUM_BITS,
-    BITBLAS_SUPPORTED_SYM, MINIMUM_BITBLAS_VERSION)
+    BITBLAS_OPTIMIZE_FEATURES,
+    BITBLAS_SUPPORTED_NUM_BITS,
+    BITBLAS_SUPPORTED_SYM,
+    MINIMUM_BITBLAS_VERSION,
+)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
 from vllm.model_executor.utils import set_weight_attrs
 
 logger = init_logger(__name__)
@@ -28,6 +34,7 @@ class BitBLASConfig(QuantizationConfig):
 
     Reference: https://github.com/Microsoft/BitBLAS
     """
+
     TORCH_DTYPE = torch.float16
     STORAGE_DTYPE = "int8"  # assume int8 storage
     TORCH_STORAGE_DTYPE = getattr(torch, STORAGE_DTYPE)
@@ -38,19 +45,22 @@ class BitBLASConfig(QuantizationConfig):
     def __init__(
         self,
         weight_bits: int,
-        group_size: Optional[int],
-        desc_act: Optional[bool],
-        is_sym: Optional[bool],
-        quant_method: Optional[str],
+        group_size: int | None,
+        desc_act: bool | None,
+        is_sym: bool | None,
+        quant_method: str | None,
         lm_head_quantized: bool,
     ) -> None:
         try:
             import bitblas
+
             if version.parse(bitblas.__version__) < version.parse(
-                    MINIMUM_BITBLAS_VERSION):
+                MINIMUM_BITBLAS_VERSION
+            ):
                 raise ImportError(
                     "bitblas version is wrong. Please "
-                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
+                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
+                )
         except ImportError as e:
             bitblas_import_exception = e
             raise ValueError(
@@ -78,12 +88,14 @@ class BitBLASConfig(QuantizationConfig):
             raise ValueError(
                 f"BitBLAS does not support weight_bits = {self.weight_bits}. "
                 f"Only weight_bits = {BITBLAS_SUPPORTED_NUM_BITS} "
-                "are supported.")
+                "are supported."
+            )
 
         if self.is_sym not in BITBLAS_SUPPORTED_SYM:
             raise ValueError(
                 f"BitBLAS does not support is_sym = {self.is_sym}. "
-                f"Only sym = {BITBLAS_SUPPORTED_SYM} are supported.")
+                f"Only sym = {BITBLAS_SUPPORTED_SYM} are supported."
+            )
 
         storage_dtype = self.STORAGE_DTYPE
         storage_nbit = int("".join(c for c in storage_dtype if c.isdigit()))
@@ -98,11 +110,13 @@ class BitBLASConfig(QuantizationConfig):
         self.zeros_mode = self.ZEROS_MODE
 
     def __repr__(self) -> str:
-        return (f"BitBLASConfig(weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size}, "
-                f"desc_act={self.desc_act}, "
-                f"is_sym={self.is_sym}, "
-                f"quant_method={self.quant_method})")
+        return (
+            f"BitBLASConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}, "
+            f"is_sym={self.is_sym}, "
+            f"quant_method={self.quant_method})"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -122,9 +136,9 @@ class BitBLASConfig(QuantizationConfig):
         return ["quantize_config.json"]
 
     @staticmethod
-    def get_from_keys(config: dict[str, Any],
-                      keys: list[str],
-                      default: Any = None) -> Any:
+    def get_from_keys(
+        config: dict[str, Any], keys: list[str], default: Any = None
+    ) -> Any:
         """Get a value from the model's quantization config."""
         for key in keys:
             if key in config:
@@ -138,34 +152,40 @@ class BitBLASConfig(QuantizationConfig):
         desc_act = cls.get_from_keys(config, ["desc_act"], False)
         is_sym = cls.get_from_keys(config, ["sym"], False)
         quant_method = cls.get_from_keys(config, ["quant_method"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
-        return cls(weight_bits, group_size, desc_act, is_sym, quant_method,
-                   lm_head_quantized)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(
+            weight_bits, group_size, desc_act, is_sym, quant_method, lm_head_quantized
+        )
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         # compat: autogptq >=0.8.0 use checkpoint_format: str
         # compat: autogptq <=0.7.1 is_bitblas_format: bool
-        is_bitblas_format = (hf_quant_cfg.get("checkpoint_format") == "bitblas"
-                             or hf_quant_cfg.get("is_bitblas_format", False))
+        is_bitblas_format = hf_quant_cfg.get(
+            "checkpoint_format"
+        ) == "bitblas" or hf_quant_cfg.get("is_bitblas_format", False)
 
-        is_valid_user_quant = (user_quant is None or user_quant == "gptq"
-                               or user_quant == "bitblas")
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "gptq" or user_quant == "bitblas"
+        )
 
         if is_bitblas_format and is_valid_user_quant:
-            msg = ("The model is serialized in {} format. Using {} kernel.".
-                   format(cls.get_name(), cls.get_name()))
+            msg = "The model is serialized in {} format. Using {} kernel.".format(
+                cls.get_name(), cls.get_name()
+            )
             logger.info(msg)
             return cls.get_name()
 
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["BitBLASLinearMethod"]:
-        if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
-                                             and self.lm_head_quantized):
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["BitBLASLinearMethod"]:
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
             return BitBLASLinearMethod(self)
         return None
 
@@ -176,6 +196,7 @@ class BitBLASLinearMethod(LinearMethodBase):
     Args:
         quant_config: The BitBLAS quantization config.
     """
+
     # USE BITBLAS_OPTIMIZE_FEATURES_CONTIGUOUS
     # Instead of BITBLAS_OPTIMIZE_FEATURES
     # If you want to high contiguous batching
@@ -205,7 +226,7 @@ class BitBLASLinearMethod(LinearMethodBase):
     ) -> None:
         """Creates quantized weights for use in linear operations.
 
-        The function initializes and returns a dictionary containing quantized 
+        The function initializes and returns a dictionary containing quantized
         weights, scales, and zeros
         for performing quantized matrix multiplication operations.
 
@@ -214,11 +235,11 @@ class BitBLASLinearMethod(LinearMethodBase):
             output_partition_sizes: List of output partition sizes.
             input_size: The total size of the input (unused).
             output_size: The total size of the output (unused).
-            params_dtype: 
+            params_dtype:
                 The data type of the parameters (expected to be torch.float16).
 
         Returns:
-            A dictionary containing the quantized weights ('qweight'), 
+            A dictionary containing the quantized weights ('qweight'),
             scales ('scales'), and zeros ('zeros').
 
         Raises:
@@ -230,17 +251,19 @@ class BitBLASLinearMethod(LinearMethodBase):
         weight_loader = extra_weight_attrs["weight_loader"]
 
         if params_dtype not in self.quant_config.get_supported_act_dtypes():
-            raise ValueError("Parameter data type must be torch.float16, "
-                             f"but got {params_dtype}")
+            raise ValueError(
+                f"Parameter data type must be torch.float16, but got {params_dtype}"
+            )
         group_size = self.quant_config.group_size
         if group_size is None:
             group_size = -1
         # Validate output_size_per_partition
         output_size_per_partition = sum(output_partition_sizes)
-        if (group_size != -1 and input_size_per_partition % group_size != 0):
+        if group_size != -1 and input_size_per_partition % group_size != 0:
             raise ValueError(
                 f"Input size per partition ({input_size_per_partition}) must "
-                f"be divisible by group size ({group_size}).")
+                f"be divisible by group size ({group_size})."
+            )
 
         # Initialize or retrieve the BitBLAS matrix multiplication operator.
         self._configure_bitblas_matmul(
@@ -266,34 +289,33 @@ class BitBLASLinearMethod(LinearMethodBase):
             output_dim=0,
             packed_dim=1,
             packed_factor=self.quant_config.pack_factor,
-            bitblas_tile_size=(self.bitblas_matmul.retrieve_weight_shape()[-2]
-                               if self.bitblas_matmul.propagate_b else None),
+            bitblas_tile_size=(
+                self.bitblas_matmul.retrieve_weight_shape()[-2]
+                if self.bitblas_matmul.propagate_b
+                else None
+            ),
             weight_loader=weight_loader,
         )
 
         # Compute the number of input groups for channel-wise quantization.
-        input_groups = (1 if group_size == -1 else input_size_per_partition //
-                        group_size)
+        input_groups = 1 if group_size == -1 else input_size_per_partition // group_size
 
         # Initialize scales and zeros for the quantized weights.
         weight_scale_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 output_size_per_partition,
                 input_groups,
                 device="cuda",
                 dtype=params_dtype,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
         if input_groups == 1:
-            scales = ChannelQuantScaleParameter(output_dim=0,
-                                                **weight_scale_args)
+            scales = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args)
         else:
-            scales = GroupQuantScaleParameter(output_dim=0,
-                                              input_dim=1,
-                                              **weight_scale_args)
+            scales = GroupQuantScaleParameter(
+                output_dim=0, input_dim=1, **weight_scale_args
+            )
 
         if self.quant_config.zeros_mode == "quantized":
             zeros = PackedvLLMParameter(
@@ -313,17 +335,22 @@ class BitBLASLinearMethod(LinearMethodBase):
 
         else:
             zeros = BasevLLMParameter(
-                torch.empty(output_size_per_partition,
-                            input_groups,
-                            device="cuda",
-                            dtype=params_dtype),
+                torch.empty(
+                    output_size_per_partition,
+                    input_groups,
+                    device="cuda",
+                    dtype=params_dtype,
+                ),
                 weight_loader=weight_loader,
             )
             # Set attributes to indicate how scales and zeros are applied.
-            set_weight_attrs(zeros, {
-                "input_dim": None if input_groups == 1 else 1,
-                "output_dim": 0,
-            })
+            set_weight_attrs(
+                zeros,
+                {
+                    "input_dim": None if input_groups == 1 else 1,
+                    "output_dim": 0,
+                },
+            )
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("scales", scales)
@@ -340,13 +367,19 @@ class BitBLASLinearMethod(LinearMethodBase):
         **extra_weight_attrs,
     ):
         if self.quant_config.quant_method == "gptq":
-            return self.create_weights_gptq(layer, input_size_per_partition,
-                                            output_partition_sizes, input_size,
-                                            output_size, params_dtype,
-                                            **extra_weight_attrs)
+            return self.create_weights_gptq(
+                layer,
+                input_size_per_partition,
+                output_partition_sizes,
+                input_size,
+                output_size,
+                params_dtype,
+                **extra_weight_attrs,
+            )
         else:
             raise ValueError(
-                f"Unsupported quant_method {self.quant_config.quant_method}")
+                f"Unsupported quant_method {self.quant_config.quant_method}"
+            )
 
     def _configure_bitblas_matmul(
         self,
@@ -360,6 +393,7 @@ class BitBLASLinearMethod(LinearMethodBase):
         out_dtype="float16",
     ):
         from bitblas import MatmulConfig
+
         bitblas_dtype = self.BITBLAS_DTYPES[params_dtype]
 
         with_scaling = False
@@ -375,7 +409,8 @@ class BitBLASLinearMethod(LinearMethodBase):
                 W_dtype = f"int{bits}"
         else:
             raise ValueError(
-                f"Unsupported quant_method {self.quant_config.quant_method}")
+                f"Unsupported quant_method {self.quant_config.quant_method}"
+            )
 
         matmul_config = MatmulConfig(
             N=outfeatures,
@@ -393,38 +428,40 @@ class BitBLASLinearMethod(LinearMethodBase):
             zeros_mode=zeros_mode,
         )
         self.bitblas_matmul = self._get_or_create_bitblas_operator(
-            matmul_config, enable_tuning)
+            matmul_config, enable_tuning
+        )
 
     def _get_or_create_bitblas_operator(self, config, enable_tuning):
         from bitblas import Matmul, auto_detect_nvidia_target
         from bitblas.cache import get_database_path, global_operator_cache
+
         BITBLAS_DATABASE_PATH = get_database_path()
         BITBLAS_TARGET = auto_detect_nvidia_target()
         if global_operator_cache.size() == 0:
-            global_operator_cache.load_from_database(BITBLAS_DATABASE_PATH,
-                                                     BITBLAS_TARGET)
+            global_operator_cache.load_from_database(
+                BITBLAS_DATABASE_PATH, BITBLAS_TARGET
+            )
 
         bitblas_matmul = global_operator_cache.get(config)
         if bitblas_matmul is None:
-            bitblas_matmul = Matmul(config,
-                                    target=BITBLAS_TARGET,
-                                    enable_tuning=False)
+            bitblas_matmul = Matmul(config, target=BITBLAS_TARGET, enable_tuning=False)
             if enable_tuning:
-                TUNING_MESSAGE = (f"BitBLAS Operator {config} is tuning ...")
+                TUNING_MESSAGE = f"BitBLAS Operator {config} is tuning ..."
                 logger.info(TUNING_MESSAGE)
                 bitblas_matmul.hardware_aware_finetune(topk=20)
                 global_operator_cache.add(config, bitblas_matmul)
                 global_operator_cache.save_into_database(
-                    BITBLAS_DATABASE_PATH, BITBLAS_TARGET)
+                    BITBLAS_DATABASE_PATH, BITBLAS_TARGET
+                )
                 TUNED_MESSAGE = (
-                    f"BitBLAS Operator {config} tuned and saved to database.")
+                    f"BitBLAS Operator {config} tuned and saved to database."
+                )
                 logger.info(TUNED_MESSAGE)
             else:
                 _message = f"BitBLAS Operator {config} created."
                 logger.info(_message)
         else:
-            _message = (
-                f"BitBLAS Operator {config} found in global_operator_cache.")
+            _message = f"BitBLAS Operator {config} found in global_operator_cache."
             logger.info(_message)
         return bitblas_matmul
 
@@ -432,7 +469,7 @@ class BitBLASLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         qweight = layer.qweight
         scales = layer.scales
@@ -445,7 +482,7 @@ class BitBLASLinearMethod(LinearMethodBase):
         else:
             output_2d = self.bitblas_matmul(x_2d, qweight, scales, qzeros)
 
-        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],))
 
         if bias is not None:
             output.add_(bias)  # In-place add
@@ -461,4 +498,5 @@ class BitBLASLinearMethod(LinearMethodBase):
             return self.apply_gptq(*args, **kwargs)
         else:
             raise ValueError(
-                f"Unsupported quant_method {self.quant_config.quant_method}")
+                f"Unsupported quant_method {self.quant_config.quant_method}"
+            )
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 29584188630f80869a7c5c21bf98a203d6a4a899..e5a741e639ad9b1d88648beb3dcd9e3adc41a978 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -1,23 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Union
 
 import torch
 from packaging import version
 
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
-from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
-                                                        FusedMoEMethodBase)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod,
-                                               set_weight_attrs)
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+    set_weight_attrs,
+)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 
 
 class BitsAndBytesConfig(QuantizationConfig):
@@ -36,7 +42,7 @@ class BitsAndBytesConfig(QuantizationConfig):
         bnb_4bit_use_double_quant: bool = False,
         llm_int8_enable_fp32_cpu_offload: bool = False,
         llm_int8_has_fp16_weight: bool = False,
-        llm_int8_skip_modules: Optional[list[str]] = None,
+        llm_int8_skip_modules: list[str] | None = None,
         llm_int8_threshold: float = 6.0,
     ) -> None:
         super().__init__()
@@ -52,16 +58,19 @@ class BitsAndBytesConfig(QuantizationConfig):
         self.llm_int8_threshold = llm_int8_threshold
 
         if self.bnb_4bit_quant_storage not in ["uint8"]:
-            raise ValueError("Unsupported bnb_4bit_quant_storage: "
-                             f"{self.bnb_4bit_quant_storage}")
+            raise ValueError(
+                f"Unsupported bnb_4bit_quant_storage: {self.bnb_4bit_quant_storage}"
+            )
 
     def __repr__(self) -> str:
-        return (f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, "
-                f"load_in_4bit={self.load_in_4bit}, "
-                f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, "
-                f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, "
-                f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, "
-                f"llm_int8_skip_modules={self.llm_int8_skip_modules})")
+        return (
+            f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, "
+            f"load_in_4bit={self.load_in_4bit}, "
+            f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, "
+            f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, "
+            f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, "
+            f"llm_int8_skip_modules={self.llm_int8_skip_modules})"
+        )
 
     @classmethod
     def get_name(self) -> QuantizationMethods:
@@ -81,7 +90,6 @@ class BitsAndBytesConfig(QuantizationConfig):
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig":
-
         def get_safe_value(config, keys, default_value=None):
             try:
                 value = cls.get_from_keys(config, keys)
@@ -89,30 +97,32 @@ class BitsAndBytesConfig(QuantizationConfig):
             except ValueError:
                 return default_value
 
-        load_in_8bit = get_safe_value(config, ["load_in_8bit"],
-                                      default_value=False)
-        load_in_4bit = get_safe_value(config, ["load_in_4bit"],
-                                      default_value=True)
-        bnb_4bit_compute_dtype = get_safe_value(config,
-                                                ["bnb_4bit_compute_dtype"],
-                                                default_value="float32")
-        bnb_4bit_quant_storage = get_safe_value(config,
-                                                ["bnb_4bit_quant_storage"],
-                                                default_value="uint8")
-        bnb_4bit_quant_type = get_safe_value(config, ["bnb_4bit_quant_type"],
-                                             default_value="fp4")
+        load_in_8bit = get_safe_value(config, ["load_in_8bit"], default_value=False)
+        load_in_4bit = get_safe_value(config, ["load_in_4bit"], default_value=True)
+        bnb_4bit_compute_dtype = get_safe_value(
+            config, ["bnb_4bit_compute_dtype"], default_value="float32"
+        )
+        bnb_4bit_quant_storage = get_safe_value(
+            config, ["bnb_4bit_quant_storage"], default_value="uint8"
+        )
+        bnb_4bit_quant_type = get_safe_value(
+            config, ["bnb_4bit_quant_type"], default_value="fp4"
+        )
         bnb_4bit_use_double_quant = get_safe_value(
-            config, ["bnb_4bit_use_double_quant"], default_value=False)
+            config, ["bnb_4bit_use_double_quant"], default_value=False
+        )
         llm_int8_enable_fp32_cpu_offload = get_safe_value(
-            config, ["llm_int8_enable_fp32_cpu_offload"], default_value=False)
-        llm_int8_has_fp16_weight = get_safe_value(config,
-                                                  ["llm_int8_has_fp16_weight"],
-                                                  default_value=False)
-        llm_int8_skip_modules = get_safe_value(config,
-                                               ["llm_int8_skip_modules"],
-                                               default_value=[])
-        llm_int8_threshold = get_safe_value(config, ["llm_int8_threshold"],
-                                            default_value=6.0)
+            config, ["llm_int8_enable_fp32_cpu_offload"], default_value=False
+        )
+        llm_int8_has_fp16_weight = get_safe_value(
+            config, ["llm_int8_has_fp16_weight"], default_value=False
+        )
+        llm_int8_skip_modules = get_safe_value(
+            config, ["llm_int8_skip_modules"], default_value=[]
+        )
+        llm_int8_threshold = get_safe_value(
+            config, ["llm_int8_threshold"], default_value=6.0
+        )
 
         return cls(
             load_in_8bit=load_in_8bit,
@@ -124,11 +134,12 @@ class BitsAndBytesConfig(QuantizationConfig):
             llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload,
             llm_int8_has_fp16_weight=llm_int8_has_fp16_weight,
             llm_int8_skip_modules=llm_int8_skip_modules,
-            llm_int8_threshold=llm_int8_threshold)
+            llm_int8_threshold=llm_int8_threshold,
+        )
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["LinearMethodBase", "BitsAndBytesMoEMethod"]]:
+    ) -> Union["LinearMethodBase", "BitsAndBytesMoEMethod"] | None:
         if isinstance(layer, LinearBase):
             if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules):
                 return UnquantizedLinearMethod()
@@ -140,15 +151,15 @@ class BitsAndBytesConfig(QuantizationConfig):
 
 def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: list[str]):
     # Split the prefix into its dot-separated components
-    components = prefix.split('.')
+    components = prefix.split(".")
 
     # Check if any of the skip modules exactly matches any component
-    substr_check = any(module_name in components
-                       for module_name in llm_int8_skip_modules)
+    substr_check = any(
+        module_name in components for module_name in llm_int8_skip_modules
+    )
 
     # Allow certain layers to not be quantized
-    set_components = set(".".join(components[:i + 1])
-                         for i in range(len(components)))
+    set_components = set(".".join(components[: i + 1]) for i in range(len(components)))
     set_llm_int8_skip_modules = set(llm_int8_skip_modules)
     prefix_check = len(set_llm_int8_skip_modules & set_components) != 0
 
@@ -172,39 +183,53 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if version.parse(
-                    bitsandbytes.__version__) < version.parse("0.46.1"):
-                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.46.1.")
+
+            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.46.1."
+                )
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.46.1 via "
-                              "`pip install bitsandbytes>=0.46.1` to use "
-                              "bitsandbytes quantizer.") from err
+            raise ImportError(
+                "Please install bitsandbytes>=0.46.1 via "
+                "`pip install bitsandbytes>=0.46.1` to use "
+                "bitsandbytes quantizer."
+            ) from err
 
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         from bitsandbytes.nn import Int8Params
 
         def create_qweight_for_8bit():
             qweight = Int8Params(
-                data=torch.empty(sum(output_partition_sizes),
-                                 input_size_per_partition,
-                                 dtype=torch.int8),
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition,
+                    dtype=torch.int8,
+                ),
                 has_fp16_weights=self.quant_config.llm_int8_has_fp16_weight,
-                requires_grad=False)
+                requires_grad=False,
+            )
             set_weight_attrs(
-                qweight, {
+                qweight,
+                {
                     "input_dim": 0,
                     "output_dim": 0,
                     "pack_factor": 1,
                     "use_bitsandbytes_8bit": True,
-                    "generation": 0
-                })
+                    "generation": 0,
+                },
+            )
             return qweight
 
         def create_qweight_for_4bit():
@@ -213,20 +238,22 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
             total_size = input_size_per_partition * sum(output_partition_sizes)
             if total_size % quant_ratio != 0:
                 raise ValueError(
-                    "The input size is not aligned with the quantized "
-                    "weight shape.")
+                    "The input size is not aligned with the quantized weight shape."
+                )
 
-            qweight = torch.nn.Parameter(torch.empty(total_size // quant_ratio,
-                                                     1,
-                                                     dtype=torch.uint8),
-                                         requires_grad=False)
+            qweight = torch.nn.Parameter(
+                torch.empty(total_size // quant_ratio, 1, dtype=torch.uint8),
+                requires_grad=False,
+            )
             set_weight_attrs(
-                qweight, {
+                qweight,
+                {
                     "input_dim": 0,
                     "output_dim": 0,
                     "pack_factor": quant_ratio,
-                    "use_bitsandbytes_4bit": True
-                })
+                    "use_bitsandbytes_4bit": True,
+                },
+            )
             return qweight
 
         if self.quant_config.load_in_8bit:
@@ -238,22 +265,23 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
         layer.register_parameter("weight", qweight)
         set_weight_attrs(qweight, extra_weight_attrs)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         if self.quant_config.load_in_8bit:
             return self._apply_8bit_weight(layer, x, bias)
         else:
             return self._apply_4bit_weight(layer, x, bias)
 
     def _apply_8bit_weight(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         # only load the bitsandbytes module when needed
         from bitsandbytes import MatmulLtState, matmul
 
@@ -273,11 +301,9 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
 
         out_dim_0 = x.shape[0]
         out_dim_1 = sum(
-            [quant_state[1].shape[0] for quant_state in quant_states.items()])
-        out = torch.empty(out_dim_0,
-                          out_dim_1,
-                          dtype=torch.float16,
-                          device=x.device)
+            [quant_state[1].shape[0] for quant_state in quant_states.items()]
+        )
+        out = torch.empty(out_dim_0, out_dim_1, dtype=torch.float16, device=x.device)
 
         current_index = 0
         for i in range(len(quant_states)):
@@ -287,33 +313,36 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
             # create new matmul_states
             if generation == 0 or generation == 1:
                 matmul_states[i] = MatmulLtState()
-                matmul_states[i].CB = qweight[offsets[i]:offsets[i + 1]]
+                matmul_states[i].CB = qweight[offsets[i] : offsets[i + 1]]
                 matmul_states[i].SCB = quant_states[i].to(x.device)
-                matmul_states[i].threshold = (
-                    self.quant_config.llm_int8_threshold)
-                matmul_states[i].has_fp16_weights = (
-                    self.quant_config.llm_int8_has_fp16_weight)
+                matmul_states[i].threshold = self.quant_config.llm_int8_threshold
+                matmul_states[
+                    i
+                ].has_fp16_weights = self.quant_config.llm_int8_has_fp16_weight
                 matmul_states[i].is_training = False
-                if matmul_states[i].threshold > 0.0 and not matmul_states[
-                        i].has_fp16_weights:
+                if (
+                    matmul_states[i].threshold > 0.0
+                    and not matmul_states[i].has_fp16_weights
+                ):
                     matmul_states[i].use_pool = True
 
             new_x = bf_x.unsqueeze(0)
 
-            out[:, current_index:current_index + output_size] = matmul(
-                new_x,
-                qweight[offsets[i]:offsets[i + 1]],
-                state=matmul_states[i])
+            out[:, current_index : current_index + output_size] = matmul(
+                new_x, qweight[offsets[i] : offsets[i + 1]], state=matmul_states[i]
+            )
 
             current_index += output_size
 
             # only update the matmul_states if it is not profile_run
-            if (generation > 0
-                    and not self.quant_config.llm_int8_has_fp16_weight
-                    and matmul_states[i].CB is not None
-                    and matmul_states[i].CxB is not None):
+            if (
+                generation > 0
+                and not self.quant_config.llm_int8_has_fp16_weight
+                and matmul_states[i].CB is not None
+                and matmul_states[i].CxB is not None
+            ):
                 del matmul_states[i].CB
-                qweight[offsets[i]:offsets[i + 1]] = matmul_states[i].CxB
+                qweight[offsets[i] : offsets[i + 1]] = matmul_states[i].CxB
 
         out = out.to(original_type)
 
@@ -328,11 +357,11 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
         return out
 
     def _apply_4bit_weight(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         original_type = x.dtype
         original_shape = x.shape
         reshape_after_matmul = False
@@ -347,11 +376,9 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
 
         out_dim_0 = x.shape[0]
         out_dim_1 = sum(
-            [quant_state[1].shape[0] for quant_state in quant_states.items()])
-        out = torch.empty(out_dim_0,
-                          out_dim_1,
-                          dtype=torch.bfloat16,
-                          device=x.device)
+            [quant_state[1].shape[0] for quant_state in quant_states.items()]
+        )
+        out = torch.empty(out_dim_0, out_dim_1, dtype=torch.bfloat16, device=x.device)
         apply_bnb_4bit(bf_x, qweight, offsets, out)
         out = out.to(original_type)
 
@@ -372,6 +399,7 @@ def _apply_bnb_4bit(
 ) -> None:
     # only load the bitsandbytes module when needed
     from bitsandbytes import matmul_4bit
+
     quant_states = weight.bnb_quant_state
     current_index = 0
     for i in range(len(quant_states)):
@@ -380,8 +408,9 @@ def _apply_bnb_4bit(
         # matmul_4bit(..., out = ...).  Infeasible now due to the bug
         # https://github.com/TimDettmers/bitsandbytes/issues/1235.
         # Need to change  after the bug is fixed.
-        out[:, current_index:current_index + output_size] = matmul_4bit(
-            x, weight[offsets[i]:offsets[i + 1]].t(), quant_states[i])
+        out[:, current_index : current_index + output_size] = matmul_4bit(
+            x, weight[offsets[i] : offsets[i + 1]].t(), quant_states[i]
+        )
         current_index += output_size
 
 
@@ -395,11 +424,13 @@ def _apply_bnb_4bit_fake(
 
 
 try:
-    direct_register_custom_op(op_name="apply_bnb_4bit",
-                              op_func=_apply_bnb_4bit,
-                              mutates_args=["out"],
-                              fake_impl=_apply_bnb_4bit_fake,
-                              dispatch_key=current_platform.dispatch_key)
+    direct_register_custom_op(
+        op_name="apply_bnb_4bit",
+        op_func=_apply_bnb_4bit,
+        mutates_args=["out"],
+        fake_impl=_apply_bnb_4bit_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
     apply_bnb_4bit = torch.ops.vllm.apply_bnb_4bit
 
 except AttributeError as error:
@@ -421,14 +452,18 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         super().__init__(moe)
         try:
             import bitsandbytes
-            if version.parse(
-                    bitsandbytes.__version__) < version.parse("0.46.1"):
-                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.46.1.")
+
+            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.46.1."
+                )
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.46.1 via "
-                              "`pip install bitsandbytes>=0.46.1` to use "
-                              "bitsandbytes quantizer.") from err
+            raise ImportError(
+                "Please install bitsandbytes>=0.46.1 via "
+                "`pip install bitsandbytes>=0.46.1` to use "
+                "bitsandbytes quantizer."
+            ) from err
         self.quant_config = quant_config
 
     def create_weights(
@@ -454,7 +489,8 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         )
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -465,27 +501,28 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
-        assert self.fused_experts is None
 
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `BitsAndBytesMoEMethod` yet.")
+                "EPLB not supported for `BitsAndBytesMoEMethod` yet."
+            )
+
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -498,7 +535,8 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
         if self.quant_config.load_in_8bit:
             w13, w2 = self._apply_8bit_dequant(layer)
         else:
@@ -528,8 +566,9 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
     ):
         quant_ratio = calculate_quant_ratio(params_dtype)
         # Fused gate_up_proj (column parallel)
-        w13_total_size = (hidden_size * 2 *
-                          intermediate_size_per_partition) // quant_ratio
+        w13_total_size = (
+            hidden_size * 2 * intermediate_size_per_partition
+        ) // quant_ratio
         w13_qweight = torch.nn.Parameter(
             torch.empty(
                 num_experts,
@@ -544,26 +583,20 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         set_weight_attrs(
             w13_qweight,
             {
-                "num_experts":
-                num_experts,
-                "input_dim":
-                hidden_size,
-                "output_dim":
-                2 * intermediate_size_per_partition,
+                "num_experts": num_experts,
+                "input_dim": hidden_size,
+                "output_dim": 2 * intermediate_size_per_partition,
                 "experts_shape": (
                     num_experts,
                     intermediate_size_per_partition * 2,
                     hidden_size,
                 ),
-                "pack_factor":
-                quant_ratio,
-                "use_bitsandbytes_4bit":
-                True,
+                "pack_factor": quant_ratio,
+                "use_bitsandbytes_4bit": True,
             },
         )
         # down_proj (row parallel)
-        w2_total_size = (hidden_size *
-                         intermediate_size_per_partition) // quant_ratio
+        w2_total_size = (hidden_size * intermediate_size_per_partition) // quant_ratio
         w2_qweight = torch.nn.Parameter(
             torch.empty(
                 num_experts,
@@ -576,21 +609,16 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         set_weight_attrs(
             w2_qweight,
             {
-                "num_experts":
-                num_experts,
-                "input_dim":
-                intermediate_size_per_partition,
-                "output_dim":
-                hidden_size,
+                "num_experts": num_experts,
+                "input_dim": intermediate_size_per_partition,
+                "output_dim": hidden_size,
                 "experts_shape": (
                     num_experts,
                     hidden_size,
                     intermediate_size_per_partition,
                 ),
-                "pack_factor":
-                quant_ratio,
-                "use_bitsandbytes_4bit":
-                True,
+                "pack_factor": quant_ratio,
+                "use_bitsandbytes_4bit": True,
             },
         )
         layer.register_parameter("w2_weight", w2_qweight)
@@ -608,8 +636,10 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         raise NotImplementedError
 
     def _apply_4bit_dequnt(
-            self, layer: torch.nn.Module) -> tuple[torch.Tensor, torch.Tensor]:
+        self, layer: torch.nn.Module
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         from bitsandbytes.functional import dequantize_4bit
+
         w13 = dequantize_4bit(
             layer.w13_weight.reshape(-1, 1),
             layer.w13_weight.bnb_quant_state,
@@ -623,5 +653,6 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         return w13, w2
 
     def _apply_8bit_dequant(
-            self, layer: torch.nn.Module) -> tuple[torch.Tensor, torch.Tensor]:
+        self, layer: torch.nn.Module
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6655f89136238c74e0b55ec0daecd929da3908aa 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index d6550dd16892f6707f39896b105854b28874a851..6c7d4cd7bd9ab93cecf25c1bb2b8d1474451e69f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -5,39 +5,62 @@ from contextlib import suppress
 from typing import TYPE_CHECKING, Any, Literal, Optional, cast
 
 import torch
-from compressed_tensors.config import (CompressionFormat,
-                                       SparsityCompressionConfig,
-                                       SparsityStructure)
-from compressed_tensors.quantization import (QuantizationArgs,
-                                             QuantizationStrategy,
-                                             QuantizationType)
+from compressed_tensors.config import (
+    CompressionFormat,
+    SparsityCompressionConfig,
+    SparsityStructure,
+)
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
 from compressed_tensors.transform import TransformConfig
 
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
-    CompressedTensorsMoEMethod)
+    CompressedTensorsMoEMethod,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensors24,
-    CompressedTensorsScheme, CompressedTensorsW4A4Fp4,
-    CompressedTensorsW4A8Fp8, CompressedTensorsW4A8Int,
-    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    W4A16SPARSE24_SUPPORTED_BITS,
+    WNA16_SUPPORTED_BITS,
+    CompressedTensors24,
+    CompressedTensorsScheme,
+    CompressedTensorsW4A4Fp4,
+    CompressedTensorsW4A8Fp8,
+    CompressedTensorsW4A8Int,
+    CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import (  # noqa: E501
-    CompressedTensorsLinearTransformMethod, get_linear_transform_schemes)
+    CompressedTensorsLinearTransformMethod,
+    get_linear_transform_schemes,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target, is_activation_quantization_format,
-    should_ignore_layer)
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported)
+    cutlass_fp4_supported,
+)
 from vllm.platforms import current_platform
 
 if TYPE_CHECKING:
@@ -48,11 +71,10 @@ logger = init_logger(__name__)
 __all__ = ["CompressedTensorsLinearMethod"]
 
 SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
-QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]]
+QUANTIZATION_SCHEME_MAP_TYPE = dict[str, dict[str, QuantizationArgs] | None]
 
 
 class CompressedTensorsConfig(QuantizationConfig):
-
     def __init__(
         self,
         target_scheme_map: dict[str, Any],
@@ -60,9 +82,9 @@ class CompressedTensorsConfig(QuantizationConfig):
         quant_format: str,
         sparsity_scheme_map: dict[str, SparsityCompressionConfig],
         sparsity_ignore_list: list[str],
-        kv_cache_scheme: Optional[dict[str, Any]] = None,
-        config: Optional[dict[str, Any]] = None,
-        transform_config: Optional[dict[str, Any]] = None,
+        kv_cache_scheme: dict[str, Any] | None = None,
+        config: dict[str, Any] | None = None,
+        transform_config: dict[str, Any] | None = None,
     ):
         super().__init__()
         self.ignore = ignore
@@ -75,8 +97,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         self.config = config
 
         if transform_config:
-            self.transform_config = TransformConfig.model_validate(
-                transform_config)
+            self.transform_config = TransformConfig.model_validate(transform_config)
         else:
             self.transform_config = None
 
@@ -94,16 +115,16 @@ class CompressedTensorsConfig(QuantizationConfig):
         return "compressed-tensors"
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
-        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
-            self.target_scheme_map)
+        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(self.target_scheme_map)
         self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)
         self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict(
-            self.sparsity_scheme_map)
+            self.sparsity_scheme_map
+        )
         self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list(
-            self.sparsity_ignore_list)
+            self.sparsity_ignore_list
+        )
         if self.kv_cache_scheme is not None:
-            self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(
-                self.kv_cache_scheme)
+            self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(self.kv_cache_scheme)
 
     def get_quant_method(
         self,
@@ -116,8 +137,8 @@ class CompressedTensorsConfig(QuantizationConfig):
             # collect schemes
             quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
             input_tfms, output_tfms = get_linear_transform_schemes(
-                layer, prefix, self.transform_config,
-                self.packed_modules_mapping)
+                layer, prefix, self.transform_config, self.packed_modules_mapping
+            )
 
             # choose quantization method
             quant_method: LinearMethodBase = UnquantizedLinearMethod()
@@ -128,7 +149,8 @@ class CompressedTensorsConfig(QuantizationConfig):
             # choose transform method
             if any((input_tfms, output_tfms)):
                 return CompressedTensorsLinearTransformMethod.from_schemes(
-                    quant_method, quant_scheme, input_tfms, output_tfms)
+                    quant_method, quant_scheme, input_tfms, output_tfms
+                )
 
             else:
                 return quant_method
@@ -143,10 +165,10 @@ class CompressedTensorsConfig(QuantizationConfig):
     def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig":
         ignore: list[str] = cast(list[str], config.get("ignore", []))
         quant_format = cast(str, config.get("format"))
-        target_scheme_map = cls._quantization_scheme_map_from_config(
-            config=config)
+        target_scheme_map = cls._quantization_scheme_map_from_config(config=config)
         sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
-            config=config)
+            config=config
+        )
         transform_config = config.get("transform_config")
 
         return cls(
@@ -173,18 +195,17 @@ class CompressedTensorsConfig(QuantizationConfig):
         if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
             return dict(), []
 
-        sparsity_config = SparsityCompressionConfig.model_validate(
-            sparsity_config)
+        sparsity_config = SparsityCompressionConfig.model_validate(sparsity_config)
         sparse_scheme_map: dict[str, SparsityCompressionConfig] = {
-            target: sparsity_config
-            for target in sparsity_config.targets or list()
+            target: sparsity_config for target in sparsity_config.targets or list()
         }
         sparsity_ignore_list = sparsity_config.ignore or list()
         return sparse_scheme_map, sparsity_ignore_list
 
     @classmethod
     def _quantization_scheme_map_from_config(
-            cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE:
+        cls, config: dict[str, Any]
+    ) -> QUANTIZATION_SCHEME_MAP_TYPE:
         """
         :param config: The `quantization_config` dictionary from config.json
         :return: A dictionary mapping target layer names to their corresponding
@@ -207,19 +228,19 @@ class CompressedTensorsConfig(QuantizationConfig):
             targets = quant_config.get("targets")
             for target in targets:
                 target_scheme_map[target] = {}
-                target_scheme_map[target][
-                    "weights"] = QuantizationArgs.model_validate(
-                        quant_config.get("weights"))
+                target_scheme_map[target]["weights"] = QuantizationArgs.model_validate(
+                    quant_config.get("weights")
+                )
 
                 target_scheme_map[target]["input_activations"] = None
-                target_scheme_map[target]["format"] = quant_config.get(
-                    "format")
+                target_scheme_map[target]["format"] = quant_config.get("format")
                 format = target_scheme_map[target].get("format")
                 # If no per-config format defined, use global format in config
-                act_quant_format = is_activation_quantization_format(
-                    format
-                ) if format is not None else is_activation_quantization_format(
-                    quant_format)
+                act_quant_format = (
+                    is_activation_quantization_format(format)
+                    if format is not None
+                    else is_activation_quantization_format(quant_format)
+                )
                 # TODO(czhu): w4a8fp8 is in packed-quantized format
                 # but needs input activation quantization
                 input_activations = quant_config.get("input_activations")
@@ -229,22 +250,25 @@ class CompressedTensorsConfig(QuantizationConfig):
                     # should be w8a16fp8 w8a16fp8 can also run for cases where
                     # there is an input_quant but it is ignored
                     if not input_activations:
-                        assert target_scheme_map[target][
-                            "weights"].type == QuantizationType.FLOAT
+                        assert (
+                            target_scheme_map[target]["weights"].type
+                            == QuantizationType.FLOAT
+                        )
                     else:
-                        target_scheme_map[target][
-                            "input_activations"] = QuantizationArgs.model_validate(  # noqa: E501
-                                quant_config.get("input_activations"))
+                        target_scheme_map[target]["input_activations"] = (
+                            QuantizationArgs.model_validate(
+                                quant_config.get("input_activations")
+                            )
+                        )
         return target_scheme_map
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
         return []
 
-    def _check_scheme_supported(self,
-                                min_capability: int,
-                                error: bool = True,
-                                match_exact: bool = False) -> bool:
+    def _check_scheme_supported(
+        self, min_capability: int, error: bool = True, match_exact: bool = False
+    ) -> bool:
         capability_tuple = current_platform.get_device_capability()
 
         if capability_tuple is not None:
@@ -255,115 +279,155 @@ class CompressedTensorsConfig(QuantizationConfig):
                     raise RuntimeError(
                         "Quantization scheme is not supported for ",
                         "the current GPU. Required capability: ",
-                        f"{min_capability}. Current capability: {capability}.")
+                        f"{min_capability}. Current capability: {capability}.",
+                    )
             else:
                 supported = capability >= min_capability
                 if error and not supported:
                     raise RuntimeError(
                         "Quantization scheme is not supported for ",
                         f"the current GPU. Min capability: {min_capability}. ",
-                        f"Current capability: {capability}.")
+                        f"Current capability: {capability}.",
+                    )
             return supported
         else:
             return False
 
-    def _is_fp4a4_nvfp4(self, weight_quant: QuantizationArgs,
-                        input_quant: QuantizationArgs):
-
+    def _is_fp4a4_nvfp4(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ):
         if weight_quant is None or input_quant is None:
             return False
 
-        is_tensor_group_quant = (weight_quant.strategy
-                                 == QuantizationStrategy.TENSOR_GROUP.value
-                                 and input_quant.strategy
-                                 == QuantizationStrategy.TENSOR_GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
+            and input_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
+        )
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
 
-        is_group_size_16 = (weight_quant.group_size == 16
-                            and input_quant.group_size == 16)
-        is_float_type = (weight_quant.type == QuantizationType.FLOAT
-                         and input_quant.type == QuantizationType.FLOAT.value)
+        is_group_size_16 = (
+            weight_quant.group_size == 16 and input_quant.group_size == 16
+        )
+        is_float_type = (
+            weight_quant.type == QuantizationType.FLOAT
+            and input_quant.type == QuantizationType.FLOAT
+        )
         is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4
 
-        return (is_tensor_group_quant and is_float_type and is_4_bits
-                and is_group_size_16 and is_symmetric)
-
-    def _is_fp4a16_nvfp4(self, weight_quant: QuantizationArgs,
-                         input_quant: QuantizationArgs):
+        return (
+            is_tensor_group_quant
+            and is_float_type
+            and is_4_bits
+            and is_group_size_16
+            and is_symmetric
+        )
 
+    def _is_fp4a16_nvfp4(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ):
         is_weight_only = weight_quant is not None and input_quant is None
         is_tensor_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
+        )
         is_symmetric = weight_quant.symmetric
 
         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4
 
-        return (is_weight_only and is_tensor_group_quant and is_float_type
-                and is_4_bits and is_group_size_16 and is_symmetric)
+        return (
+            is_weight_only
+            and is_tensor_group_quant
+            and is_float_type
+            and is_4_bits
+            and is_group_size_16
+            and is_symmetric
+        )
 
-    def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs,
-                               input_quant: QuantizationArgs) -> bool:
+    def _is_static_tensor_w8a8(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
-        is_tensor = (weight_strategy and input_quant.strategy
-                     == QuantizationStrategy.TENSOR.value)
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_tensor = (
+            weight_strategy
+            and input_quant.strategy == QuantizationStrategy.TENSOR.value
+        )
         is_static = not weight_quant.dynamic and not input_quant.dynamic
 
         # Both symmetric and asymmetric input quantization supported.
         # Only symmetric weight quantization supported.
         return is_8_bits and is_tensor and weight_quant.symmetric and is_static
 
-    def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs,
-                               input_quant: QuantizationArgs) -> bool:
+    def _is_dynamic_token_w8a8(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
-        is_token = (weight_strategy and input_quant.strategy
-                    == QuantizationStrategy.TOKEN.value)
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
         # Both symmetric and asymmetric input quantization supported.
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
 
-    def _is_dynamic_token_w4a8_int(self, weight_quant: QuantizationArgs,
-                                   input_quant: QuantizationArgs) -> bool:
+    def _is_dynamic_token_w4a8_int(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         is_weight_4_bits = weight_quant.num_bits == 4
         is_activation_8_bits = input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.GROUP.value
-            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
-        is_token = (weight_strategy and input_quant.strategy
-                    == QuantizationStrategy.TOKEN.value)
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
         # Both symmetric and asymmetric input quantization supported.
         # Only symmetric weight quantization supported.
-        return (is_weight_4_bits and is_activation_8_bits and is_token
-                and weight_quant.symmetric and is_dynamic)
+        return (
+            is_weight_4_bits
+            and is_activation_8_bits
+            and is_token
+            and weight_quant.symmetric
+            and is_dynamic
+        )
 
-    def _is_fp8_w8a8(self, weight_quant: QuantizationArgs,
-                     input_quant: QuantizationArgs) -> bool:
+    def _is_fp8_w8a8(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         # Confirm weights and activations quantized.
         if weight_quant is None or input_quant is None:
             return False
 
         # Confirm weight scheme is supported.
-        is_floating_point = (weight_quant.type == QuantizationType.FLOAT
-                             and input_quant.type == QuantizationType.FLOAT)
+        is_floating_point = (
+            weight_quant.type == QuantizationType.FLOAT
+            and input_quant.type == QuantizationType.FLOAT
+        )
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [
-            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL,
-            QuantizationStrategy.BLOCK
-        ])
-        if not (is_floating_point and is_symmetric_weight and is_static_weight
-                and is_tensor_or_channel_or_block_weight):
+        is_tensor_or_channel_or_block_weight = weight_quant.strategy in [
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK,
+        ]
+        if not (
+            is_floating_point
+            and is_symmetric_weight
+            and is_static_weight
+            and is_tensor_or_channel_or_block_weight
+        ):
             return False
 
         # Dynamic quantization is always supported if weights supported.
@@ -372,45 +436,56 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         # Confirm activation scheme is supported.
         is_symmetric_activation = input_quant.symmetric
-        is_per_tensor_activation = (
-            input_quant.strategy == QuantizationStrategy.TENSOR)
+        is_per_tensor_activation = input_quant.strategy == QuantizationStrategy.TENSOR
         return is_symmetric_activation and is_per_tensor_activation
 
-    def _is_fp8_w4a8(self, weight_quant: QuantizationArgs,
-                     input_quant: QuantizationArgs) -> bool:
+    def _is_fp8_w4a8(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         if not weight_quant or not input_quant:
             return False
         is_weight_4_bits = weight_quant.num_bits == 4
         is_activation_8_bits = input_quant.num_bits == 8
-        weight_strategy = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
-        is_token = (weight_strategy and input_quant.strategy
-                    == QuantizationStrategy.TOKEN.value)
+        weight_strategy = weight_quant.strategy == QuantizationStrategy.GROUP.value
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         # Only per-group symmetric weight (4bit)
         # + per-tok symmetric activation (8bit) quantization supported.
-        return (is_weight_4_bits and is_activation_8_bits and is_token
-                and is_symmetric and is_dynamic)
-
-    def _is_fp8_w4a8_sm90(self, weight_quant: QuantizationArgs,
-                          input_quant: QuantizationArgs) -> bool:
-        return (self._check_scheme_supported(90, error=False, match_exact=True)
-                and self._is_fp8_w4a8(weight_quant, input_quant))
-
-    def _is_fp8_w8a8_sm90(self, weight_quant: QuantizationArgs,
-                          input_quant: QuantizationArgs) -> bool:
-        return (self._check_scheme_supported(90, error=False, match_exact=True)
-                and self._is_fp8_w8a8(weight_quant, input_quant))
-
-    def _is_fp8_w8a8_sm100(self, weight_quant: QuantizationArgs,
-                           input_quant: QuantizationArgs) -> bool:
-        return (self._check_scheme_supported(
-            100, error=False, match_exact=True)
-                and self._is_fp8_w8a8(weight_quant, input_quant))
-
-    def _is_fp8_w8a16(self, weight_quant: QuantizationArgs,
-                      input_quant: QuantizationArgs) -> bool:
+        return (
+            is_weight_4_bits
+            and is_activation_8_bits
+            and is_token
+            and is_symmetric
+            and is_dynamic
+        )
+
+    def _is_fp8_w4a8_sm90(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
+        return self._check_scheme_supported(
+            90, error=False, match_exact=True
+        ) and self._is_fp8_w4a8(weight_quant, input_quant)
+
+    def _is_fp8_w8a8_sm90(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
+        return self._check_scheme_supported(
+            90, error=False, match_exact=True
+        ) and self._is_fp8_w8a8(weight_quant, input_quant)
+
+    def _is_fp8_w8a8_sm100(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
+        return self._check_scheme_supported(
+            100, error=False, match_exact=True
+        ) and self._is_fp8_w8a8(weight_quant, input_quant)
+
+    def _is_fp8_w8a16(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         # Confirm weights quantized.
         if weight_quant is None:
             return False
@@ -422,33 +497,35 @@ class CompressedTensorsConfig(QuantizationConfig):
         # Confirm weight scheme is supported.
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [
-            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL,
-            QuantizationStrategy.BLOCK
-        ])
-        if not (is_symmetric_weight and is_static_weight  # noqa: SIM103
-                and is_tensor_or_channel_or_block_weight):
-            return False
-
-        # All conditions satisfied.
-        return True
+        is_tensor_or_channel_or_block_weight = weight_quant.strategy in [
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK,
+        ]
+        return (
+            is_symmetric_weight
+            and is_static_weight
+            and is_tensor_or_channel_or_block_weight
+        )
 
-    def _is_wNa16_group_channel(self, weight_quant: QuantizationArgs,
-                                input_quant: QuantizationArgs) -> bool:
+    def _is_wNa16_group_channel(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
         input_quant_none = input_quant is None
         is_channel_group = (
             weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-            or weight_quant.strategy == QuantizationStrategy.GROUP.value)
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value
+        )
         is_static = not weight_quant.dynamic
 
-        return (is_channel_group and input_quant_none and is_static)
+        return is_channel_group and input_quant_none and is_static
 
     def _get_scheme_from_parts(
-            self,
-            weight_quant: QuantizationArgs,
-            input_quant: QuantizationArgs,
-            format: Optional[str] = None) -> "CompressedTensorsScheme":
-
+        self,
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
+        format: str | None = None,
+    ) -> "CompressedTensorsScheme":
         # use the per-layer format if defined, otherwise, use global format
         format = format if format is not None else self.quant_format
 
@@ -457,94 +534,105 @@ class CompressedTensorsConfig(QuantizationConfig):
             return CompressedTensorsW4A16Fp4()
 
         if self._is_fp8_w4a8_sm90(weight_quant, input_quant):
-            return CompressedTensorsW4A8Fp8(num_bits=weight_quant.num_bits,
-                                            strategy=weight_quant.strategy,
-                                            symmetric=weight_quant.symmetric,
-                                            group_size=weight_quant.group_size,
-                                            actorder=weight_quant.actorder)
+            return CompressedTensorsW4A8Fp8(
+                num_bits=weight_quant.num_bits,
+                strategy=weight_quant.strategy,
+                symmetric=weight_quant.symmetric,
+                group_size=weight_quant.group_size,
+                actorder=weight_quant.actorder,
+            )
 
         if self._is_wNa16_group_channel(weight_quant, input_quant):
-            if (format == CompressionFormat.marlin_24.value
-                    and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
+            if (
+                format == CompressionFormat.marlin_24.value
+                and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS
+            ):
                 assert weight_quant.symmetric
                 return CompressedTensorsW4A16Sparse24(
                     strategy=weight_quant.strategy,
                     num_bits=weight_quant.num_bits,
-                    group_size=weight_quant.group_size)
-            if (format == CompressionFormat.pack_quantized.value
-                    and weight_quant.num_bits in WNA16_SUPPORTED_BITS):
+                    group_size=weight_quant.group_size,
+                )
+            if (
+                format == CompressionFormat.pack_quantized.value
+                and weight_quant.num_bits in WNA16_SUPPORTED_BITS
+            ):
                 return CompressedTensorsWNA16(
                     num_bits=weight_quant.num_bits,
                     strategy=weight_quant.strategy,
                     symmetric=weight_quant.symmetric,
                     group_size=weight_quant.group_size,
-                    actorder=weight_quant.actorder)
+                    actorder=weight_quant.actorder,
+                )
 
         act_quant_format = is_activation_quantization_format(format)
         if act_quant_format:
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
-                if cutlass_fp4_supported(
-                ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+                if cutlass_fp4_supported() or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
                         "Current platform does not support cutlass NVFP4."
-                        " Running CompressedTensorsW4A16Fp4.")
-                    return CompressedTensorsW4A16Fp4(
-                        has_input_global_scale=True)
+                        " Running CompressedTensorsW4A16Fp4."
+                    )
+                    return CompressedTensorsW4A16Fp4(has_input_global_scale=True)
 
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 is_fp8_w8a8_supported = self._check_scheme_supported(
-                    CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
+                    CompressedTensorsW8A8Fp8.get_min_capability(), error=False
+                )
                 if is_fp8_w8a8_supported:
                     return CompressedTensorsW8A8Fp8(
                         weight_quant=weight_quant,
-                        is_static_input_scheme=(input_quant
-                                                and not input_quant.dynamic))
+                        is_static_input_scheme=(
+                            input_quant and not input_quant.dynamic
+                        ),
+                    )
                 else:
                     # note: input_quant will be present for converted models;
                     # will be ignored during inference post loading
                     return CompressedTensorsW8A16Fp8(
                         strategy=weight_quant.strategy,
-                        is_static_input_scheme=not input_quant.dynamic)
+                        is_static_input_scheme=not input_quant.dynamic,
+                    )
 
             # note: input_quant can be None
             if self._is_fp8_w8a16(weight_quant, input_quant):
-                is_static_input_scheme = (input_quant
-                                          and not input_quant.dynamic)
+                is_static_input_scheme = input_quant and not input_quant.dynamic
                 return CompressedTensorsW8A16Fp8(
                     strategy=weight_quant.strategy,
-                    is_static_input_scheme=is_static_input_scheme)
+                    is_static_input_scheme=is_static_input_scheme,
+                )
 
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
                 return CompressedTensorsW8A8Int8(
                     strategy=weight_quant.strategy,
                     is_static_input_scheme=True,
-                    input_symmetric=input_quant.symmetric)
+                    input_symmetric=input_quant.symmetric,
+                )
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                 return CompressedTensorsW8A8Int8(
                     strategy=weight_quant.strategy,
                     is_static_input_scheme=False,
-                    input_symmetric=input_quant.symmetric)
+                    input_symmetric=input_quant.symmetric,
+                )
 
             if self._is_dynamic_token_w4a8_int(weight_quant, input_quant):
-                is_static_input_scheme = (input_quant
-                                          and not input_quant.dynamic)
+                is_static_input_scheme = input_quant and not input_quant.dynamic
                 return CompressedTensorsW4A8Int(
                     num_bits=weight_quant.num_bits,
                     strategy=weight_quant.strategy,
                     group_size=weight_quant.group_size,
                     is_static_input_scheme=is_static_input_scheme,
-                    input_symmetric=input_quant.symmetric)
+                    input_symmetric=input_quant.symmetric,
+                )
 
-        raise NotImplementedError(
-            "No compressed-tensors compatible scheme was found.")
+        raise NotImplementedError("No compressed-tensors compatible scheme was found.")
 
-    def get_scheme(self,
-                   layer: torch.nn.Module,
-                   layer_name: Optional[str] = None
-                   ) -> Optional["CompressedTensorsScheme"]:
+    def get_scheme(
+        self, layer: torch.nn.Module, layer_name: str | None = None
+    ) -> Optional["CompressedTensorsScheme"]:
         """
         compressed-tensors supports non uniform in the following way:
 
@@ -561,9 +649,9 @@ class CompressedTensorsConfig(QuantizationConfig):
         # Find the "target" in the compressed-tensors config
         # that our layer conforms to.
         # TODO (@kylesayrs): support ignore module names with ct matching utils
-        if should_ignore_layer(layer_name,
-                               ignore=self.ignore,
-                               fused_mapping=self.packed_modules_mapping):
+        if should_ignore_layer(
+            layer_name, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
             return None
 
         # Will be empty for models with only sparsity
@@ -573,7 +661,8 @@ class CompressedTensorsConfig(QuantizationConfig):
                 layer_name=layer_name,
                 module=layer,
                 targets=self.target_scheme_map.keys(),
-                fused_mapping=self.packed_modules_mapping)
+                fused_mapping=self.packed_modules_mapping,
+            )
 
             scheme_dict = self.target_scheme_map[matched_target]
             weight_quant = scheme_dict.get("weights")
@@ -582,25 +671,31 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         # Find the sparsity scheme of the layer
         # assume that fused layers inherit first component's sparsity scheme
-        sparsity_targets = (self.sparsity_scheme_map.keys() -
-                            set(self.sparsity_ignore_list))
-        sparsity_scheme: Optional[SparsityCompressionConfig] = None
+        sparsity_targets = self.sparsity_scheme_map.keys() - set(
+            self.sparsity_ignore_list
+        )
+        sparsity_scheme: SparsityCompressionConfig | None = None
         with suppress(ValueError):
             matched_target = find_matched_target(
                 layer_name=layer_name,
                 module=layer,
                 targets=sparsity_targets,
-                fused_mapping=self.packed_modules_mapping)
+                fused_mapping=self.packed_modules_mapping,
+            )
             sparsity_scheme = self.sparsity_scheme_map[matched_target]
 
-        if self.supports_cutlass_24(weight_quant=weight_quant,
-                                    input_quant=input_quant,
-                                    sparsity_scheme=sparsity_scheme):
+        if self.supports_cutlass_24(
+            weight_quant=weight_quant,
+            input_quant=input_quant,
+            sparsity_scheme=sparsity_scheme,
+        ):
             # Have a valid sparsity scheme
             # Validate layer is supported by Cutlass 2:4 Kernel
-            model_compression_config = (None if sparsity_scheme is None
-                                        or sparsity_scheme.format == "dense"
-                                        else self.config)
+            model_compression_config = (
+                None
+                if sparsity_scheme is None or sparsity_scheme.format == "dense"
+                else self.config
+            )
 
             scheme = CompressedTensors24(
                 quantized=weight_quant is not None or input_quant is not None,
@@ -609,26 +704,26 @@ class CompressedTensorsConfig(QuantizationConfig):
                 model_compression_config=model_compression_config,
             )
         elif weight_quant is None:
-            logger.warning_once("Acceleration for non-quantized schemes is "
-                                "not supported by Compressed Tensors. "
-                                "Falling back to UnquantizedLinearMethod")
+            logger.warning_once(
+                "Acceleration for non-quantized schemes is "
+                "not supported by Compressed Tensors. "
+                "Falling back to UnquantizedLinearMethod"
+            )
             return None
 
         else:
             # Find the quant_scheme
             scheme = self._get_scheme_from_parts(  # type: ignore
-                weight_quant=weight_quant,
-                input_quant=input_quant,
-                format=format)
+                weight_quant=weight_quant, input_quant=input_quant, format=format
+            )
 
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
         self._check_scheme_supported(scheme.get_min_capability())
-        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__,
-                     layer_name)
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
         return scheme
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         """
         Check whether the param name matches the format for k/v cache scales
         in compressed-tensors. If this is the case, return its equivalent
@@ -644,11 +739,21 @@ class CompressedTensorsConfig(QuantizationConfig):
         # If no matches, return None
         return None
 
+    def has_blocked_weights(self) -> bool:
+        for scheme in self.target_scheme_map.values():
+            weight_quant = scheme.get("weights")
+            if (
+                weight_quant is not None
+                and weight_quant.strategy == QuantizationStrategy.BLOCK
+            ):
+                return True
+        return False
+
     @staticmethod
     def supports_cutlass_24(
-            weight_quant: Optional[QuantizationArgs],
-            input_quant: Optional[QuantizationArgs],
-            sparsity_scheme: Optional[SparsityCompressionConfig] = None
+        weight_quant: QuantizationArgs | None,
+        input_quant: QuantizationArgs | None,
+        sparsity_scheme: SparsityCompressionConfig | None = None,
     ) -> bool:
         """
         Check if the layer is supported by the Cutlass 2:4 Kernel
@@ -658,7 +763,7 @@ class CompressedTensorsConfig(QuantizationConfig):
             - Weight only quantization is not-supported
             - Supported weight quantization strategies are TENSOR and CHANNEL
             - Supported input quantization strategies are TENSOR and TOKEN
-            - Only 8 bit quantization is supported 
+            - Only 8 bit quantization is supported
 
         :return: True if the layer is supported by the Cutlass 2:4 Kernel
             False otherwise
@@ -667,16 +772,17 @@ class CompressedTensorsConfig(QuantizationConfig):
             return False
 
         is_valid_sparsity_structure: bool = (
-            sparsity_scheme.sparsity_structure ==
-            SparsityStructure.TWO_FOUR.value)
+            sparsity_scheme.sparsity_structure == SparsityStructure.TWO_FOUR.value
+        )
 
         valid_compressors = {
             CompressionFormat.dense.value,
-            CompressionFormat.sparse_24_bitmask.value
+            CompressionFormat.sparse_24_bitmask.value,
         }
 
-        is_valid_sparsity = (is_valid_sparsity_structure
-                             and sparsity_scheme.format in valid_compressors)
+        is_valid_sparsity = (
+            is_valid_sparsity_structure and sparsity_scheme.format in valid_compressors
+        )
 
         if not is_valid_sparsity:
             return False
@@ -691,7 +797,7 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         supported_weight_quant_strategies = [
             QuantizationStrategy.TENSOR.value,
-            QuantizationStrategy.CHANNEL.value
+            QuantizationStrategy.CHANNEL.value,
         ]
 
         assert weight_quant is not None
@@ -700,7 +806,8 @@ class CompressedTensorsConfig(QuantizationConfig):
             return False
 
         supported_input_quant_strategies = [
-            QuantizationStrategy.TENSOR.value, QuantizationStrategy.TOKEN.value
+            QuantizationStrategy.TENSOR.value,
+            QuantizationStrategy.TOKEN.value,
         ]
 
         if input_quant.strategy not in supported_input_quant_strategies:
@@ -710,18 +817,22 @@ class CompressedTensorsConfig(QuantizationConfig):
 
 
 class CompressedTensorsLinearMethod(LinearMethodBase):
-
     def __init__(self, quantization_config: CompressedTensorsConfig):
         self.quantization_config = quantization_config
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.scheme.process_weights_after_loading(layer)
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         """
         Use the CompressedTensorsScheme associated with each layer to create
         the necessary parameters for the layer. See LinearMethodBase for param
@@ -735,12 +846,15 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
             output_partition_sizes=output_partition_sizes,
             output_size=output_size,
             params_dtype=params_dtype,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None):
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ):
         """
         Use the output of create_weights and the CompressedTensorsScheme
         associated with the layer to apply the forward pass with the
@@ -764,7 +878,7 @@ class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
         super().__init__(quant_config)
 
     @staticmethod
-    def validate_kv_cache_scheme(kv_cache_scheme: Optional[dict[str, Any]]):
+    def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None):
         """
         Validator for the kv cache scheme. Useful for controlling the
         kv cache quantization schemes, that are being supported in vLLM
@@ -780,18 +894,21 @@ class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
             raise NotImplementedError(
                 "Currently supported kv cache quantization is "
                 "num_bits=8, type=float, however "
-                f"received num_bits={num_bits}, type={type_}")
+                f"received num_bits={num_bits}, type={type_}"
+            )
 
         strategy = kv_cache_scheme.get("strategy")
         if strategy != "tensor":
             raise NotImplementedError(
                 "Only support per-tensor scaling factor "
                 "for compressed-tensors KV cache. "
-                f"Expected strategy: tensor, found strategy: {strategy}")
+                f"Expected strategy: tensor, found strategy: {strategy}"
+            )
 
         is_symmetric = kv_cache_scheme.get("symmetric")
         if not is_symmetric:
             raise NotImplementedError(
                 "Only support symmetric scaling factor "
                 "for compressed-tensors KV cache. "
-                f"However found symmetric: {is_symmetric}")
+                f"However found symmetric: {is_symmetric}"
+            )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 8504ba73defb38c7125a39726af5e203a6977e31..06ee96d55419c68f2caf9e5b19bac7ba1f06be3e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -2,55 +2,85 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
+from collections.abc import Callable
 from enum import Enum
-from typing import Callable, Optional, Union
 
 import torch
 from compressed_tensors import CompressionFormat
-from compressed_tensors.quantization import (ActivationOrdering,
-                                             QuantizationStrategy)
+from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
-    FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
-    FusedMoEPermuteExpertsUnpermute, FusedMoeWeightScaleSupported)
+    FusedMoE,
+    FusedMoEActivationFormat,
+    FusedMoEConfig,
+    FusedMoEMethodBase,
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoeWeightScaleSupported,
+)
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, fp8_w8a8_moe_quant_config,
-    int4_w4a16_moe_quant_config, int8_w8a8_moe_quant_config,
-    int8_w8a16_moe_quant_config, nvfp4_moe_quant_config)
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+    int4_w4a16_moe_quant_config,
+    int8_w8a8_moe_quant_config,
+    int8_w8a16_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cpu_fused_moe import select_experts
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    is_valid_flashinfer_cutlass_fused_moe)
+    is_valid_flashinfer_cutlass_fused_moe,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
+    MarlinExperts,
+    fused_marlin_moe,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
-    WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
+    WNA16_SUPPORTED_BITS,
+    WNA16_SUPPORTED_TYPES_MAP,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target)
+    find_matched_target,
+)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
-    build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1,
-    select_nvfp4_gemm_impl)
+    build_flashinfer_fp4_cutlass_moe_prepare_finalize,
+    reorder_w1w3_to_w3w1,
+    select_nvfp4_gemm_impl,
+)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    expert_weight_is_col_major, requant_weight_ue8m0_inplace)
+    expert_weight_is_col_major,
+    requant_weight_ue8m0_inplace,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    check_moe_marlin_supports_layer, marlin_make_workspace_new,
-    marlin_moe_permute_scales)
+    check_moe_marlin_supports_layer,
+    marlin_make_workspace_new,
+    marlin_moe_permute_scales,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    prepare_moe_fp4_layer_for_marlin)
+    prepare_moe_fp4_layer_for_marlin,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    prepare_moe_fp8_layer_for_marlin)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    swizzle_blockscale)
+    prepare_moe_fp8_layer_for_marlin,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import swizzle_blockscale
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
+    all_close_1d,
+    normalize_e4m3fn_to_e4m3fnuz,
+    per_tensor_dequantize,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.scalar_type import scalar_types
-from vllm.utils.deep_gemm import (get_col_major_tma_aligned_tensor,
-                                  is_deep_gemm_e8m0_used)
+from vllm.utils.deep_gemm import (
+    get_col_major_tma_aligned_tensor,
+    is_deep_gemm_e8m0_used,
+)
 
 logger = init_logger(__name__)
 
@@ -61,22 +91,21 @@ class GPTQMarlinState(Enum):
 
 
 __all__ = [
-    "CompressedTensorsMoEMethod", "CompressedTensorsW8A8Fp8MoEMethod",
+    "CompressedTensorsMoEMethod",
+    "CompressedTensorsW8A8Fp8MoEMethod",
     "CompressedTensorsW8A8Int8MoEMethod",
-    "CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MoEMethod",
-    "CompressedTensorsW4A4MoeMethod", "CompressedTensorsW4A8Int8MoEMethod"
+    "CompressedTensorsWNA16MarlinMoEMethod",
+    "CompressedTensorsWNA16MoEMethod",
+    "CompressedTensorsW4A4MoeMethod",
+    "CompressedTensorsW4A8Int8MoEMethod",
 ]
 
 
 class CompressedTensorsMoEMethod(FusedMoEMethodBase):
-
-    def __init_(self, moe: FusedMoEConfig):
-        super().__init__(moe)
-
     @staticmethod
     def get_moe_method(
         quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
-        layer: torch.nn.Module
+        layer: torch.nn.Module,
     ) -> "CompressedTensorsMoEMethod":
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
@@ -86,9 +115,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
         else:
             # May have instead defined the linear layers in the fused model
 
-            fused_layers = [
-                "re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"
-            ]
+            fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
             current_scheme = None
             for fused_layer in fused_layers:
                 # Check if one of the fused layers are defined in quant_config
@@ -96,64 +123,70 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
                     layer_name=fused_layer,
                     module=layer,
                     targets=quant_config.target_scheme_map.keys(),
-                    fused_mapping=quant_config.packed_modules_mapping)
+                    fused_mapping=quant_config.packed_modules_mapping,
+                )
 
                 # Only valid if down_proj, gate_proj, and up_proj
                 # are mapped to the same quant scheme in the quant_config
                 if current_scheme is None:
-                    current_scheme = quant_config.target_scheme_map.get(
-                        matched_target)
+                    current_scheme = quant_config.target_scheme_map.get(matched_target)
                 else:
                     assert current_scheme == quant_config.target_scheme_map.get(
-                        matched_target)
+                        matched_target
+                    )
 
-        weight_quant = quant_config.target_scheme_map[matched_target].get(
-            "weights")
+        weight_quant = quant_config.target_scheme_map[matched_target].get("weights")
         input_quant = quant_config.target_scheme_map[matched_target].get(
-            "input_activations")
+            "input_activations"
+        )
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
             # group_size=None means channelwise
             group_size = weight_quant.group_size or -1
             # Prefer to use the MarlinMoE kernel when it is supported.
-            if not check_moe_marlin_supports_layer(layer, group_size):
-                if (weight_quant.strategy in QuantizationStrategy.GROUP and
-                        weight_quant.actorder in (ActivationOrdering.GROUP,
-                                                  ActivationOrdering.DYNAMIC)):
+            if (
+                not check_moe_marlin_supports_layer(layer, group_size)
+                or current_platform.is_rocm()
+            ):
+                if (
+                    weight_quant.strategy == QuantizationStrategy.GROUP
+                    and weight_quant.actorder
+                    in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC)
+                ):
                     raise ValueError(
                         "WNA16MoE is not supported with actorder=group/dynamic."
                     )
                 logger.info_once("Using CompressedTensorsWNA16MoEMethod")
-                return CompressedTensorsWNA16MoEMethod(quant_config,
-                                                       layer.moe_config)
+                return CompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config)
             else:
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MarlinMoEMethod(
-                    quant_config, layer.moe_config)
+                    quant_config, layer.moe_config
+                )
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             return CompressedTensorsW4A4MoeMethod(layer.moe_config)
-        elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
-              or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
-              or quant_config._is_fp8_w8a8(weight_quant, input_quant)):
-            return CompressedTensorsW8A8Fp8MoEMethod(quant_config,
-                                                     layer.moe_config)
+        elif (
+            quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
+            or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
+            or quant_config._is_fp8_w8a8(weight_quant, input_quant)
+        ):
+            return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config)
         elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8Int8MoEMethod(quant_config,
-                                                      layer.moe_config)
-        elif quant_config._is_dynamic_token_w4a8_int(weight_quant,
-                                                     input_quant):
-            return CompressedTensorsW4A8Int8MoEMethod(quant_config,
-                                                      layer.moe_config)
+            return CompressedTensorsW8A8Int8MoEMethod(quant_config, layer.moe_config)
+        elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
+            return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config)
         else:
             raise RuntimeError(
-                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}")
+                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+            )
 
 
 class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
-
     def __init__(self, moe: FusedMoEConfig):
         from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
-            detect_nvfp4_moe_support)
+            detect_nvfp4_moe_support,
+        )
+
         super().__init__(moe)
         _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
         self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
@@ -161,10 +194,15 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         self.use_marlin = _nvfp4.use_marlin
         self.group_size = 16
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         layer.num_experts = num_experts
         layer.params_dtype = params_dtype
 
@@ -175,8 +213,10 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 # 2 fp4 items are packed in the input dimension
                 hidden_size // 2,
                 requires_grad=False,
-                dtype=torch.uint8),
-            requires_grad=False)
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_packed", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
@@ -186,8 +226,10 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 hidden_size,
                 # 2 fp4 items are packed in the input dimension
                 intermediate_size_per_partition // 2,
-                dtype=torch.uint8),
-            requires_grad=False)
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_packed", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
@@ -198,11 +240,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 2 * intermediate_size_per_partition,
                 # 2 fp4 items are packed in the input dimension
                 hidden_size // self.group_size,
-                dtype=torch.float8_e4m3fn),
-            requires_grad=False)
+                dtype=torch.float8_e4m3fn,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value})
+            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+        )
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
 
         w2_weight_scale = torch.nn.Parameter(
@@ -211,120 +256,137 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 hidden_size,
                 # 2 fp4 items are packed in the input dimension
                 intermediate_size_per_partition // self.group_size,
-                dtype=torch.float8_e4m3fn),
-            requires_grad=False)
+                dtype=torch.float8_e4m3fn,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value})
+            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+        )
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         # Weight Global Scales
-        w13_weight_scale_2 = torch.nn.Parameter(torch.empty(
-            num_experts, 2, dtype=torch.float32),
-                                                requires_grad=False)
+        w13_weight_scale_2 = torch.nn.Parameter(
+            torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
         layer.register_parameter("w13_weight_global_scale", w13_weight_scale_2)
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
         set_weight_attrs(w13_weight_scale_2, extra_weight_attrs)
 
-        w2_weight_scale_2 = torch.nn.Parameter(torch.empty(
-            num_experts, dtype=torch.float32),
-                                               requires_grad=False)
+        w2_weight_scale_2 = torch.nn.Parameter(
+            torch.empty(num_experts, dtype=torch.float32), requires_grad=False
+        )
         layer.register_parameter("w2_weight_global_scale", w2_weight_scale_2)
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
         set_weight_attrs(w2_weight_scale_2, extra_weight_attrs)
 
         # Input Global Scales
-        w13_input_scale = torch.nn.Parameter(torch.empty(num_experts,
-                                                         2,
-                                                         dtype=torch.float32),
-                                             requires_grad=False)
+        w13_input_scale = torch.nn.Parameter(
+            torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
         layer.register_parameter("w13_input_global_scale", w13_input_scale)
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
         set_weight_attrs(w13_input_scale, extra_weight_attrs)
 
-        w2_input_scale = torch.nn.Parameter(torch.empty(num_experts,
-                                                        dtype=torch.float32),
-                                            requires_grad=False)
+        w2_input_scale = torch.nn.Parameter(
+            torch.empty(num_experts, dtype=torch.float32), requires_grad=False
+        )
         layer.register_parameter("w2_input_global_scale", w2_input_scale)
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
         set_weight_attrs(w2_input_scale, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-
         # From packed to weight
-        layer.w13_weight = torch.nn.Parameter(layer.w13_weight_packed.data,
-                                              requires_grad=False)
+        layer.w13_weight = torch.nn.Parameter(
+            layer.w13_weight_packed.data, requires_grad=False
+        )
+        delattr(layer, "w13_weight_packed")
 
-        layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data,
-                                             requires_grad=False)
+        layer.w2_weight = torch.nn.Parameter(
+            layer.w2_weight_packed.data, requires_grad=False
+        )
+        delattr(layer, "w2_weight_packed")
 
         # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
         if self.allow_flashinfer:
-            w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data,
-                                        layer.w13_weight_scale.data,
-                                        dim=-2)
+            w, s = reorder_w1w3_to_w3w1(
+                layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2
+            )
             layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
             layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)
 
-        if not torch.allclose(layer.w13_weight_global_scale[:, 0],
-                              layer.w13_weight_global_scale[:, 1]):
+        if not torch.allclose(
+            layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]
+        ):
             logger.warning_once(
                 "w1_weight_global_scale must match w3_weight_global_scale. "
-                "Accuracy may be affected.")
+                "Accuracy may be affected."
+            )
 
         # Take inverse of global scale saved to disk
         layer.w13_weight_scale_2 = torch.nn.Parameter(
-            1 / layer.w13_weight_global_scale[:, 0], requires_grad=False)
+            1 / layer.w13_weight_global_scale[:, 0], requires_grad=False
+        )
 
         layer.w2_weight_scale_2 = torch.nn.Parameter(
-            1 / layer.w2_weight_global_scale.data, requires_grad=False)
+            1 / layer.w2_weight_global_scale.data, requires_grad=False
+        )
 
         if self.use_marlin:
             prepare_moe_fp4_layer_for_marlin(layer)
             return
 
         # swizzle weight scales
-        layer.w13_weight_scale = torch.nn.Parameter(swizzle_blockscale(
-            layer.w13_weight_scale),
-                                                    requires_grad=False)
+        layer.w13_weight_scale = torch.nn.Parameter(
+            swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
+        )
 
-        layer.w2_weight_scale = torch.nn.Parameter(swizzle_blockscale(
-            layer.w2_weight_scale),
-                                                   requires_grad=False)
+        layer.w2_weight_scale = torch.nn.Parameter(
+            swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
+        )
 
         # w13
-        w13_input_global_scale = layer.w13_input_global_scale.max(
-            dim=1).values.to(torch.float32)
+        w13_input_global_scale = layer.w13_input_global_scale.max(dim=1).values.to(
+            torch.float32
+        )
 
         layer.g1_alphas = torch.nn.Parameter(
             ((1 / w13_input_global_scale) * layer.w13_weight_scale_2),
-            requires_grad=False)
+            requires_grad=False,
+        )
 
         layer.w13_input_scale_quant = torch.nn.Parameter(
-            (w13_input_global_scale), requires_grad=False)
+            (w13_input_global_scale), requires_grad=False
+        )
 
         # w2
         layer.g2_alphas = torch.nn.Parameter(
             ((1 / layer.w2_input_global_scale) * layer.w2_weight_scale_2).to(
-                torch.float32),
-            requires_grad=False)
+                torch.float32
+            ),
+            requires_grad=False,
+        )
 
         layer.w2_input_scale_quant = torch.nn.Parameter(
-            (layer.w2_input_global_scale), requires_grad=False)
+            (layer.w2_input_global_scale), requires_grad=False
+        )
 
-    def maybe_make_prepare_finalize(
-            self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
         if self.use_marlin:
             return None
         elif not self.allow_flashinfer:
             return super().maybe_make_prepare_finalize()
 
-        prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(
-            self.moe)
+        prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)
         logger.debug_once("%s", prepare_finalize.__class__.__name__)
         return prepare_finalize
 
@@ -344,7 +406,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         return experts
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         if self.use_marlin:
             return None
 
@@ -365,24 +428,25 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError("EPLB not supported for "
-                                      "`CompressedTensorsW4A4MoeMethod` yet.")
+            raise NotImplementedError(
+                "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
+            )
         assert activation == "silu", "Only SiLU activation is supported."
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
@@ -400,13 +464,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin.
-        #
         if self.use_marlin:
-            assert self.fused_experts is None
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -423,34 +482,18 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
-                workspace=layer.workspace)
-
-        elif self.fused_experts is not None:
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight), (
-                    "Flashinfer CUTLASS Fused MoE not applicable!")
-
-            return self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                workspace=layer.workspace,
             )
 
         # FlashInfer fused experts path
         elif self.allow_flashinfer:
             from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-                flashinfer_cutlass_moe_fp4)
+                flashinfer_cutlass_moe_fp4,
+            )
 
             assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight), (
-                    "Flashinfer CUTLASS Fused MoE not applicable!")
+                x, layer.w13_weight, layer.w2_weight
+            ), "Flashinfer CUTLASS Fused MoE not applicable!"
 
             assert self.moe_quant_config is not None
 
@@ -468,12 +511,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
         else:
-            from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-                cutlass_moe_fp4)
+            from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 
-            assert expert_map is None, ("Expert Parallelism / expert_map "
-                                        "is currently not supported for "
-                                        "CompressedTensorsW4A4MoeMethod.")
+            assert expert_map is None, (
+                "Expert Parallelism / expert_map "
+                "is currently not supported for "
+                "CompressedTensorsW4A4MoeMethod."
+            )
             assert self.moe_quant_config is not None
 
             # Cutlass moe takes in activations in BF16/Half precision
@@ -495,7 +539,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
 
 
 class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
-
     def __init__(
         self,
         quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
@@ -503,17 +546,19 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
     ):
         super().__init__(moe)
         self.quant_config = quant_config
-        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get(
-            "weights")
+        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
         self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
-            "input_activations")
+            "input_activations"
+        )
 
-        per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR
-                      and self.input_quant.strategy
-                      == QuantizationStrategy.TENSOR)
+        per_tensor = (
+            self.weight_quant.strategy == QuantizationStrategy.TENSOR
+            and self.input_quant.strategy == QuantizationStrategy.TENSOR
+        )
         per_channel = (
             self.weight_quant.strategy == QuantizationStrategy.CHANNEL
-            and self.input_quant.strategy == QuantizationStrategy.TOKEN)
+            and self.input_quant.strategy == QuantizationStrategy.TOKEN
+        )
         if not (per_tensor or per_channel):
             assert self.weight_quant.strategy == QuantizationStrategy.BLOCK
             self.weight_block_size = self.weight_quant.block_structure
@@ -526,33 +571,41 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         if self.static_input_scales and per_channel:
             raise ValueError(
                 "For FP8 Fused MoE layer, we require either per tensor or "
-                "channelwise, dynamic per token quantization.")
+                "channelwise, dynamic per token quantization."
+            )
 
         # For GPUs that lack FP8 hardware support, we can leverage the Marlin
         # kernel for fast weight-only FP8 quantization
-        self.use_marlin = (not current_platform.has_device_capability(89)
-                           or envs.VLLM_TEST_FORCE_FP8_MARLIN
-                           and not self.block_quant)
+        self.use_marlin = (
+            not current_platform.has_device_capability(89)
+            or envs.VLLM_TEST_FORCE_FP8_MARLIN
+            and not self.block_quant
+        )
         # Disable marlin for rocm
         if current_platform.is_rocm():
             self.use_marlin = False
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            is_rocm_aiter_moe_enabled)
 
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
         # cutlass path
         self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100(
-            self.weight_quant, self.input_quant)
+            self.weight_quant, self.input_quant
+        )
         self.use_cutlass = not self.block_quant and (
             quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant)
-            or self.is_fp8_w8a8_sm100)
+            or self.is_fp8_w8a8_sm100
+        )
         self.disable_expert_map = False
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.hidden_size = hidden_size
         layer.num_experts = num_experts
@@ -577,31 +630,38 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
                     f"{intermediate_size_per_partition} is not divisible by "
-                    f"weight quantization block_n = {block_n}.")
-            if (tp_size > 1
-                    and intermediate_size_per_partition % block_k != 0):
+                    f"weight quantization block_n = {block_n}."
+                )
+            if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
                 # Required by row parallel
                 raise ValueError(
                     f"The input_size of down's weight = "
                     f"{intermediate_size_per_partition} is not divisible by "
-                    f"weight quantization block_k = {block_k}.")
+                    f"weight quantization block_k = {block_k}."
+                )
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
@@ -609,70 +669,83 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
             # Allocate 2 scales for w1 and w3 respectively.
             # They are combined to a single scale after weight loading.
-            w13_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, 2, dtype=torch.float32),
-                                                  requires_grad=False)
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            w2_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                 requires_grad=False)
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
             # Add PER-TENSOR quantization for FusedMoE.weight_loader.
             extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            )
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
-            w13_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                1,
-                dtype=torch.float32),
-                                                  requires_grad=False)
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    1,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            w2_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, hidden_size, 1, dtype=torch.float32),
-                                                 requires_grad=False)
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+                requires_grad=False,
+            )
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
             # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
             extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value})
+                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+            )
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         elif self.weight_quant.strategy == QuantizationStrategy.BLOCK:
-            w13_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts,
-                2 *
-                ((intermediate_size_per_partition + block_n - 1) // block_n),
-                (hidden_size + block_k - 1) // block_k,
-                dtype=torch.float32),
-                                                  requires_grad=False)
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+                    (hidden_size + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            w2_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, (hidden_size + block_n - 1) // block_n,
-                (intermediate_size_per_partition + block_k - 1) // block_k,
-                dtype=torch.float32),
-                                                 requires_grad=False)
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    (hidden_size + block_n - 1) // block_n,
+                    (intermediate_size_per_partition + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
             # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
             extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value})
+                {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+            )
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         # INPUT_SCALES
         if self.static_input_scales:
-            w13_input_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                 requires_grad=False)
+            w13_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w13_input_scale", w13_input_scale)
             set_weight_attrs(w13_input_scale, extra_weight_attrs)
 
-            w2_input_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                requires_grad=False)
+            w2_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w2_input_scale", w2_input_scale)
             set_weight_attrs(w2_input_scale, extra_weight_attrs)
         else:
@@ -684,46 +757,53 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         # We take the max of all the scales in case they differ.
         if self.static_input_scales:
             assert self.input_quant.strategy == QuantizationStrategy.TENSOR
-            if (layer.w13_input_scale is None or layer.w2_input_scale is None):
+            if layer.w13_input_scale is None or layer.w2_input_scale is None:
                 raise ValueError(
                     "QuantConfig has static quantization, but found "
-                    "activation scales are None.")
-            if (not all_close_1d(layer.w13_input_scale)
-                    or not all_close_1d(layer.w2_input_scale)):
+                    "activation scales are None."
+                )
+            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                layer.w2_input_scale
+            ):
                 logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer.")
+                    "for each layer."
+                )
             layer.w13_input_scale = torch.nn.Parameter(
-                layer.w13_input_scale.max(), requires_grad=False)
+                layer.w13_input_scale.max(), requires_grad=False
+            )
             layer.w2_input_scale = torch.nn.Parameter(
-                layer.w2_input_scale.max(), requires_grad=False)
+                layer.w2_input_scale.max(), requires_grad=False
+            )
 
         if current_platform.is_fp8_fnuz():
             # Normalize the weights and scales
-            w13_weight, w13_weight_scale, w13_input_scale = \
-                normalize_e4m3fn_to_e4m3fnuz(
-                    layer.w13_weight, layer.w13_weight_scale,
-                    layer.w13_input_scale)
-            w2_weight, w2_weight_scale, w2_input_scale = \
+            w13_weight, w13_weight_scale, w13_input_scale = (
                 normalize_e4m3fn_to_e4m3fnuz(
-                    layer.w2_weight, layer.w2_weight_scale,
-                    layer.w2_input_scale)
+                    layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+                )
+            )
+            w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+            )
             # Reset the parameter
-            layer.w13_weight = torch.nn.Parameter(w13_weight,
-                                                  requires_grad=False)
-            layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale,
-                                                        requires_grad=False)
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                w13_weight_scale, requires_grad=False
+            )
             if w13_input_scale is not None:
-                layer.w13_input_scale = torch.nn.Parameter(w13_input_scale,
-                                                           requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight,
-                                                 requires_grad=False)
-            layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
-                                                       requires_grad=False)
+                layer.w13_input_scale = torch.nn.Parameter(
+                    w13_input_scale, requires_grad=False
+                )
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale = torch.nn.Parameter(
+                w2_weight_scale, requires_grad=False
+            )
             if w2_input_scale is not None:
-                layer.w2_input_scale = torch.nn.Parameter(w2_input_scale,
-                                                          requires_grad=False)
+                layer.w2_input_scale = torch.nn.Parameter(
+                    w2_input_scale, requires_grad=False
+                )
 
         # For Per-TENSOR case, Fp8 moe kernel needs single weight scale
         # for w13 per expert. Use max then dequant and requant each expert.
@@ -735,29 +815,26 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 start = 0
                 for shard_id in range(2):
                     dq_weight = per_tensor_dequantize(
-                        layer.w13_weight[expert_id][start:start +
-                                                    shard_size, :],
-                        layer.w13_weight_scale[expert_id][shard_id])
-                    layer.w13_weight[expert_id][
-                        start:start + shard_size, :], _ = ops.scaled_fp8_quant(
-                            dq_weight, max_w13_scales[expert_id])
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
+                    )
+                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
+                        ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    )
                     start += shard_size
-            layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
-                                                        requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )
 
         # Property to determine if AITER is used
         if self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
-                rocm_aiter_fused_experts, shuffle_weights)
-
             # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = shuffle_weights(
-                layer.w13_weight.data, layer.w2_weight.data)
+            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                layer.w13_weight.data, layer.w2_weight.data
+            )
 
-            layer.w13_weight = torch.nn.Parameter(shuffled_w13,
-                                                  requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(shuffled_w2,
-                                                 requires_grad=False)
+            layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
         elif self.use_marlin:
             prepare_moe_fp8_layer_for_marlin(layer, False)
@@ -770,20 +847,23 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             device = layer.w13_weight.device
             # ab_strides1 and c_strides2 are the same
             self.ab_strides1_c_strides2 = torch.full(
-                (layer.local_num_experts, ),
+                (layer.local_num_experts,),
                 layer.hidden_size,
                 device=device,
-                dtype=torch.int64)
+                dtype=torch.int64,
+            )
             self.ab_strides2 = torch.full(
-                (layer.local_num_experts, ),
+                (layer.local_num_experts,),
                 layer.intermediate_size_per_partition,
                 device=device,
-                dtype=torch.int64)
+                dtype=torch.int64,
+            )
             self.c_strides1 = torch.full(
-                (layer.local_num_experts, ),
+                (layer.local_num_experts,),
                 2 * layer.intermediate_size_per_partition,
                 device=device,
-                dtype=torch.int64)
+                dtype=torch.int64,
+            )
 
         if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
@@ -803,13 +883,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             # Ensure column-major TMA alignment expected by DeepGEMM.
             if expert_weight_is_col_major(layer.w13_weight_scale):
                 layer.w13_weight_scale = get_col_major_tma_aligned_tensor(
-                    layer.w13_weight_scale)
+                    layer.w13_weight_scale
+                )
             if expert_weight_is_col_major(layer.w2_weight_scale):
                 layer.w2_weight_scale = get_col_major_tma_aligned_tensor(
-                    layer.w2_weight_scale)
+                    layer.w2_weight_scale
+                )
 
-    def maybe_make_prepare_finalize(
-            self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
         if self.use_marlin or self.rocm_aiter_moe_enabled:
             return None
         else:
@@ -824,16 +905,19 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         assert self.moe_quant_config is not None
         if self.use_cutlass:
             from vllm.model_executor.layers.fused_moe import (
-                CutlassBatchedExpertsFp8, CutlassExpertsFp8)
+                CutlassBatchedExpertsFp8,
+                CutlassExpertsFp8,
+            )
 
             experts: FusedMoEPermuteExpertsUnpermute
 
             num_dispatchers = prepare_finalize.num_dispatchers()
 
-            if (prepare_finalize.activation_format ==
-                    FusedMoEActivationFormat.BatchedExperts):
-                logger.debug("CutlassBatchedExpertsFp8(%s)",
-                             self.__class__.__name__)
+            if (
+                prepare_finalize.activation_format
+                == FusedMoEActivationFormat.BatchedExperts
+            ):
+                logger.debug("CutlassBatchedExpertsFp8(%s)", self.__class__.__name__)
                 experts = CutlassBatchedExpertsFp8(
                     self.moe.num_local_experts,
                     num_dispatchers,
@@ -855,23 +939,27 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                     quant_config=self.moe_quant_config,
                 )
 
-            self.disable_expert_map = (num_dispatchers > 1
-                                       or not experts.supports_expert_map())
+            self.disable_expert_map = (
+                num_dispatchers > 1 or not experts.supports_expert_map()
+            )
 
             return experts
 
         # triton path
         from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-            BatchedTritonOrDeepGemmExperts)
+            BatchedTritonOrDeepGemmExperts,
+        )
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-            TritonOrDeepGemmExperts)
+            TritonOrDeepGemmExperts,
+        )
 
         assert not self.rocm_aiter_moe_enabled and not self.use_marlin
 
-        if (prepare_finalize.activation_format ==
-                FusedMoEActivationFormat.BatchedExperts):
-            max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank(
-            )
+        if (
+            prepare_finalize.activation_format
+            == FusedMoEActivationFormat.BatchedExperts
+        ):
+            max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
             assert max_num_tokens_per_rank is not None
 
             logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
@@ -879,22 +967,27 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 max_num_tokens=max_num_tokens_per_rank,
                 num_dispatchers=prepare_finalize.num_dispatchers(),
                 quant_config=self.moe_quant_config,
+                allow_deep_gemm=(
+                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+                ),
             )
         else:
-            logger.debug("TritonOrDeepGemmExperts(%s)",
-                         self.__class__.__name__)
-            return TritonOrDeepGemmExperts(self.moe_quant_config,
-                                           allow_deep_gemm=True)
+            logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
+            return TritonOrDeepGemmExperts(
+                self.moe_quant_config,
+                allow_deep_gemm=(
+                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+                ),
+            )
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         if self.use_marlin:
             return None
 
-        per_act_token = (
-            self.input_quant.strategy == QuantizationStrategy.TOKEN)
-        per_channel_quant = (
-            self.weight_quant.strategy == QuantizationStrategy.CHANNEL)
+        per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
+        per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
 
         return fp8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
@@ -914,25 +1007,26 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for "
-                "`CompressedTensorsW8A8Fp8MoEMethod` yet.")
+            assert expert_load_view is not None
+            assert logical_to_physical_map is not None
+            assert logical_replica_count is not None
+            assert isinstance(layer, FusedMoE)
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -947,22 +1041,20 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
             indices_type=self.topk_indices_dtype,
+            num_fused_shared_experts=layer.num_fused_shared_experts,
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
         )
 
-        per_act_token = (
-            self.input_quant.strategy == QuantizationStrategy.TOKEN)
-        per_channel_quant = (
-            self.weight_quant.strategy == QuantizationStrategy.CHANNEL)
+        per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
+        per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # cutlass fp8 or fused_experts but not marlin or rocm.
-        #
         if self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
-            assert self.fused_experts is None
-            return torch.ops.vllm.fused_marlin_moe(
+            assert activation == "silu", f"{activation} not supported for Marlin MoE."
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -977,14 +1069,16 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
-                workspace=layer.workspace)
+                workspace=layer.workspace,
+            )
 
         elif self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
-                rocm_aiter_fused_experts)
+                rocm_aiter_fused_experts,
+            )
+
             assert per_act_token == per_channel_quant
             assert self.moe_quant_config is not None
-            assert self.fused_experts is None
             return rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -997,18 +1091,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 quant_config=self.moe_quant_config,
             )
 
-        elif self.fused_experts is not None:
-            return self.fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=None if self.disable_expert_map else expert_map,
-            )
-
         # cutlass path
         elif self.use_cutlass:
             assert self.moe_quant_config is not None
@@ -1016,6 +1098,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             # small-batch fallback on SM100
             if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8:
                 from vllm.model_executor.layers.fused_moe import fused_experts
+
                 assert per_act_token == per_channel_quant
                 return fused_experts(
                     hidden_states=x,
@@ -1032,7 +1115,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 )
             else:
                 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-                    cutlass_moe_fp8)
+                    cutlass_moe_fp8,
+                )
+
                 assert per_act_token == per_channel_quant
                 assert self.moe_quant_config is not None
                 return cutlass_moe_fp8(
@@ -1053,6 +1138,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
+
             assert per_act_token == per_channel_quant
             assert self.moe_quant_config is not None
             return fused_experts(
@@ -1069,9 +1155,12 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 quant_config=self.moe_quant_config,
             )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
 
-class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
 
+class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     def __init__(
         self,
         quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
@@ -1079,69 +1168,83 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     ):
         super().__init__(moe)
         self.quant_config = quant_config
-        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get(
-            "weights")
+        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
         self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
-            "input_activations")
+            "input_activations"
+        )
 
         per_channel = (
             self.weight_quant.strategy == QuantizationStrategy.CHANNEL
-            and self.input_quant.strategy == QuantizationStrategy.TOKEN)
+            and self.input_quant.strategy == QuantizationStrategy.TOKEN
+        )
         if not per_channel:
             raise ValueError(
                 "For INT8 Fused MoE layers, we require channelwise, "
                 "dynamic per token quantization. Found "
-                f"{self.weight_quant}, {self.input_quant}")
+                f"{self.weight_quant}, {self.input_quant}"
+            )
 
         self.static_input_scales = not self.input_quant.dynamic
         if self.static_input_scales:
             raise ValueError(
                 "For INT8 Fused MoE layers, we require channelwise, "
-                "dynamic per token quantization. Found static input scales.")
-
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+                "dynamic per token quantization. Found static input scales."
+            )
 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         params_dtype = torch.int8
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # WEIGHT_SCALES
         assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL
-        w13_weight_scale = torch.nn.Parameter(torch.ones(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            1,
-            dtype=torch.float32),
-                                              requires_grad=False)
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                        hidden_size,
-                                                        1,
-                                                        dtype=torch.float32),
-                                             requires_grad=False)
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
         # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value})
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
@@ -1154,7 +1257,8 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
         pass
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return int8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
@@ -1171,27 +1275,25 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for "
-                "`CompressedTensorsW8A8Int8MoEMethod` yet.")
+                "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
+            )
 
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -1207,7 +1309,8 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
         return fused_experts(
             hidden_states=x,
@@ -1225,7 +1328,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
 
 
 class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
-
     def __init__(
         self,
         quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
@@ -1241,58 +1343,72 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         self.strategy = config.strategy
         self.group_size = config.group_size
         self.actorder = config.actorder
-        assert config.symmetric, (
-            "Only symmetric quantization is supported for MoE")
-
-        if not (self.quant_config.quant_format
-                == CompressionFormat.pack_quantized.value
-                and self.num_bits in WNA16_SUPPORTED_BITS):
-            raise ValueError("For Fused MoE layers, only ",
-                             f"{CompressionFormat.pack_quantized.value} ",
-                             "is supported for the following bits: ",
-                             f"{WNA16_SUPPORTED_BITS}")
-        self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
+        assert config.symmetric, "Only symmetric quantization is supported for MoE"
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+        if not (
+            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
+            and self.num_bits in WNA16_SUPPORTED_BITS
+        ):
+            raise ValueError(
+                "For Fused MoE layers, only ",
+                f"{CompressionFormat.pack_quantized.value} ",
+                "is supported for the following bits: ",
+                f"{WNA16_SUPPORTED_BITS}",
+            )
+        self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
+        self.use_marlin = True
 
-        intermediate_size_full = extra_weight_attrs.pop(
-            "intermediate_size_full")
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full")
 
         # Will transpose the loaded weight along the
         # intermediate and hidden dim sizes. Will
         # shard for TP along the transposed dims
-        extra_weight_attrs.update({
-            "is_transposed": True,
-            "quant_method": self.strategy
-        })
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size // self.packed_factor,
-            2 * intermediate_size_per_partition,
-            dtype=torch.int32),
-                                        requires_grad=False)
+        extra_weight_attrs.update(
+            {"is_transposed": True, "quant_method": self.strategy}
+        )
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.packed_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_packed", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            intermediate_size_per_partition // self.packed_factor,
-            hidden_size,
-            dtype=torch.int32),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // self.packed_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_packed", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # In the case where we have actorder/g_idx,
         # we do not partition the w2 scales
         load_full_w2 = self.actorder and self.group_size != -1
-        w2_scales_size = (intermediate_size_full
-                          if load_full_w2 else intermediate_size_per_partition)
+        w2_scales_size = (
+            intermediate_size_full if load_full_w2 else intermediate_size_per_partition
+        )
 
         self.is_k_full = (not self.actorder) or (
-            intermediate_size_per_partition == intermediate_size_full)
+            intermediate_size_per_partition == intermediate_size_full
+        )
 
         if self.strategy == "channel":
             num_groups_w2 = num_groups_w13 = 1
@@ -1301,30 +1417,34 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             num_groups_w2 = w2_scales_size // self.group_size
             num_groups_w13 = hidden_size // self.group_size
 
-        w13_scale = torch.nn.Parameter(torch.ones(
-            num_experts,
-            num_groups_w13,
-            2 * intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w13_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_scale", w13_scale)
         set_weight_attrs(w13_scale, extra_weight_attrs)
 
-        w2_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                 num_groups_w2,
-                                                 hidden_size,
-                                                 dtype=params_dtype),
-                                      requires_grad=False)
+        w2_scale = torch.nn.Parameter(
+            torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_scale", w2_scale)
         set_weight_attrs(w2_scale, extra_weight_attrs)
         set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})
 
-        w2_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
-                                             requires_grad=False)
+        w2_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
         layer.register_parameter("w2_weight_shape", w2_weight_shape)
         set_weight_attrs(w2_weight_shape, extra_weight_attrs)
-        w13_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
-                                              requires_grad=False)
+        w13_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
 
         layer.register_parameter("w13_weight_shape", w13_weight_shape)
         set_weight_attrs(w13_weight_shape, extra_weight_attrs)
@@ -1359,8 +1479,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w13_g_idx_sort_indices",
-                                 w13_g_idx_sort_indices)
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
         set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
 
         w2_g_idx_sort_indices = torch.nn.Parameter(
@@ -1371,8 +1490,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w2_g_idx_sort_indices",
-                                 w2_g_idx_sort_indices)
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
         set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
 
         layer.a13_scale = None
@@ -1392,41 +1510,37 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)
 
             for e in range(num_experts):
-                w13_g_idx_sort_indices[e] = torch.argsort(
-                    layer.w13_weight_g_idx[e]).to(torch.int32)
-                w2_g_idx_sort_indices[e] = torch.argsort(
-                    layer.w2_weight_g_idx[e]).to(torch.int32)
+                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_weight_g_idx[e]).to(
+                    torch.int32
+                )
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_weight_g_idx[e]).to(
+                    torch.int32
+                )
                 w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
-                    w13_g_idx_sort_indices[e]]
-                w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][
-                    w2_g_idx_sort_indices[e]]
+                    w13_g_idx_sort_indices[e]
+                ]
+                w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][w2_g_idx_sort_indices[e]]
 
             replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
             replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
-            replace_parameter(layer, "w13_g_idx_sort_indices",
-                              w13_g_idx_sort_indices)
-            replace_parameter(layer, "w2_g_idx_sort_indices",
-                              w2_g_idx_sort_indices)
+            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
 
         else:
             layer.w13_weight_g_idx = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
             layer.w2_weight_g_idx = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
             layer.w13_g_idx_sort_indices = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
             layer.w2_g_idx_sort_indices = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
 
@@ -1456,8 +1570,8 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         replace_parameter(layer, "w13_weight_scale", marlin_w13_scales)
         marlin_w2_scales = marlin_moe_permute_scales(
             s=layer.w2_weight_scale,
-            size_k=layer.w2_weight_scale.shape[1] *
-            (self.group_size if self.group_size != -1 else self.packed_factor),
+            size_k=layer.w2_weight_scale.shape[1]
+            * (self.group_size if self.group_size != -1 else self.packed_factor),
             size_n=layer.w2_weight_scale.shape[2],
             group_size=self.group_size,
         )
@@ -1466,8 +1580,53 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         layer.workspace = marlin_make_workspace_new(device, 4)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        return None
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        if self.num_bits != 4:
+            return None
+        return int4_w4a16_moe_quant_config(
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            w1_zp=None,
+            w2_zp=None,
+            block_shape=[0, self.group_size],
+        )
+
+    def select_gemm_impl(
+        self,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        assert self.num_bits == 4, "only supporting w4"
+        layer.w13_weight = layer.w13_weight_packed
+        layer.w2_weight = layer.w2_weight_packed
+        assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]])
+        assert self.moe_quant_config is not None
+        if (
+            prepare_finalize.activation_format
+            == mk.FusedMoEActivationFormat.BatchedExperts
+        ):
+            max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+            assert max_num_tokens_per_rank is not None
+            return BatchedMarlinExperts(
+                max_num_tokens=max_num_tokens_per_rank,
+                num_dispatchers=prepare_finalize.num_dispatchers(),
+                quant_config=self.moe_quant_config,
+                w13_g_idx=layer.w13_weight_g_idx,
+                w2_g_idx=layer.w2_weight_g_idx,
+                w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
+                w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
+                is_k_full=self.is_k_full,
+            )
+        else:
+            return MarlinExperts(
+                quant_config=self.moe_quant_config,
+                w13_g_idx=layer.w13_weight_g_idx,
+                w2_g_idx=layer.w2_weight_g_idx,
+                w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
+                w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
+                is_k_full=self.is_k_full,
+            )
 
     def apply(
         self,
@@ -1477,30 +1636,27 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for "
-                "`CompressedTensorsWNA16MarlinMoEMethod` yet.")
+                "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
+            )
 
-        assert activation == "silu", (
-            f"{activation} not supported for Marlin MoE.")
+        assert activation == "silu", f"{activation} not supported for Marlin MoE."
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -1514,9 +1670,10 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,
@@ -1536,11 +1693,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             sort_indices1=layer.w13_g_idx_sort_indices,
             sort_indices2=layer.w2_g_idx_sort_indices,
             workspace=layer.workspace,
-            is_k_full=self.is_k_full)
+            is_k_full=self.is_k_full,
+        )
 
 
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
-
     def __init__(
         self,
         quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
@@ -1559,43 +1716,55 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         self.group_size = config.group_size
         # grouped actorder isn't supported by this kernel
         assert config.actorder != "group"
-        assert config.symmetric, (
-            "Only symmetric quantization is supported for MoE")
+        assert config.symmetric, "Only symmetric quantization is supported for MoE"
 
-        if not (self.quant_config.quant_format
-                == CompressionFormat.pack_quantized.value
-                and self.num_bits in WNA16_SUPPORTED_BITS):
-            raise ValueError("For Fused MoE layers, only ",
-                             f"{CompressionFormat.pack_quantized.value} ",
-                             "is supported for the following bits: ",
-                             f"{WNA16_SUPPORTED_BITS}")
-
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+        if not (
+            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
+            and self.num_bits in WNA16_SUPPORTED_BITS
+        ):
+            raise ValueError(
+                "For Fused MoE layers, only ",
+                f"{CompressionFormat.pack_quantized.value} ",
+                "is supported for the following bits: ",
+                f"{WNA16_SUPPORTED_BITS}",
+            )
 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         # Will transpose the loaded weight along the
         # intermediate and hidden dim sizes. Will
         # shard for TP along the transposed dims
-        extra_weight_attrs.update({
-            "is_transposed": True,
-            "quant_method": self.strategy
-        })
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size // self.packed_factor,
-            2 * intermediate_size_per_partition,
-            dtype=torch.int32),
-                                        requires_grad=False)
+        extra_weight_attrs.update(
+            {"is_transposed": True, "quant_method": self.strategy}
+        )
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.packed_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_packed", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            intermediate_size_per_partition // self.packed_factor,
-            hidden_size,
-            dtype=torch.int32),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // self.packed_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_packed", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
@@ -1608,30 +1777,34 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             num_groups_w2 = w2_scales_size // self.group_size
             num_groups_w13 = hidden_size // self.group_size
 
-        w13_scale = torch.nn.Parameter(torch.ones(
-            num_experts,
-            num_groups_w13,
-            2 * intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w13_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight_scale", w13_scale)
         set_weight_attrs(w13_scale, extra_weight_attrs)
 
-        w2_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                 num_groups_w2,
-                                                 hidden_size,
-                                                 dtype=params_dtype),
-                                      requires_grad=False)
+        w2_scale = torch.nn.Parameter(
+            torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight_scale", w2_scale)
         set_weight_attrs(w2_scale, extra_weight_attrs)
         set_weight_attrs(w2_scale, {"load_full_w2": False})
 
-        w2_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
-                                             requires_grad=False)
+        w2_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
         layer.register_parameter("w2_weight_shape", w2_weight_shape)
         set_weight_attrs(w2_weight_shape, extra_weight_attrs)
-        w13_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
-                                              requires_grad=False)
+        w13_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
 
         layer.register_parameter("w13_weight_shape", w13_weight_shape)
         set_weight_attrs(w13_weight_shape, extra_weight_attrs)
@@ -1666,8 +1839,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w13_g_idx_sort_indices",
-                                 w13_g_idx_sort_indices)
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
         set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
 
         w2_g_idx_sort_indices = torch.nn.Parameter(
@@ -1678,8 +1850,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w2_g_idx_sort_indices",
-                                 w2_g_idx_sort_indices)
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
         set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
 
         layer.a13_scale = None
@@ -1688,25 +1859,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Reconfigure packed weights and scales to match moe_wna16 format
         layer.w13_weight_packed = torch.nn.Parameter(
-            layer.w13_weight_packed.transpose(1, 2).contiguous().view(
-                torch.uint8),
-            requires_grad=False)
+            layer.w13_weight_packed.transpose(1, 2).contiguous().view(torch.uint8),
+            requires_grad=False,
+        )
         layer.w2_weight_packed = torch.nn.Parameter(
-            layer.w2_weight_packed.transpose(1,
-                                             2).contiguous().view(torch.uint8),
-            requires_grad=False)
+            layer.w2_weight_packed.transpose(1, 2).contiguous().view(torch.uint8),
+            requires_grad=False,
+        )
         layer.w13_weight_scale = torch.nn.Parameter(
-            layer.w13_weight_scale.transpose(1, 2).contiguous(),
-            requires_grad=False)
+            layer.w13_weight_scale.transpose(1, 2).contiguous(), requires_grad=False
+        )
         layer.w2_weight_scale = torch.nn.Parameter(
-            layer.w2_weight_scale.transpose(1, 2).contiguous(),
-            requires_grad=False)
+            layer.w2_weight_scale.transpose(1, 2).contiguous(), requires_grad=False
+        )
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         assert self.num_bits == 4 or self.num_bits == 8
-        config_builder = (int4_w4a16_moe_quant_config if self.num_bits == 4
-                          else int8_w8a16_moe_quant_config)
+        config_builder = (
+            int4_w4a16_moe_quant_config
+            if self.num_bits == 4
+            else int8_w8a16_moe_quant_config
+        )
 
         return config_builder(
             w1_scale=layer.w13_weight_scale,
@@ -1724,26 +1899,25 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError("EPLB not supported for "
-                                      "`CompressedTensorsWNA16MoEMethod` yet.")
+            raise NotImplementedError(
+                "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."
+            )
 
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -1759,7 +1933,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
         return fused_experts(
             x,
@@ -1787,9 +1962,10 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
     """
 
     def __init__(
-            self,
-            quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
-            moe: FusedMoEConfig):
+        self,
+        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        moe: FusedMoEConfig,
+    ):
         super().__init__(moe)
         self.has_bias = self.moe.has_bias
         self.quant_config = quant_config
@@ -1797,8 +1973,7 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         # Validate scheme: weights=W4 (channel or group),
         # activations=dynamic TOKEN (A8)
         wq = self.quant_config.target_scheme_map["Linear"].get("weights")
-        aq = self.quant_config.target_scheme_map["Linear"].get(
-            "input_activations")
+        aq = self.quant_config.target_scheme_map["Linear"].get("input_activations")
 
         # Must be dynamic per-token activations
         if aq.strategy != QuantizationStrategy.TOKEN or not aq.dynamic:
@@ -1809,8 +1984,7 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         # Weight can be channel-wise (group_size=None) or group-wise
         self.group_size = wq.group_size if (wq.group_size is not None) else -1
         if wq.num_bits != 4:
-            raise ValueError(
-                "This method only supports 4-bit weights (num_bits=4).")
+            raise ValueError("This method only supports 4-bit weights (num_bits=4).")
 
         # CPU only
         if not current_platform.is_cpu():
@@ -1824,14 +1998,20 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
             except AttributeError as err:
                 raise RuntimeError(
                     f"""PyTorch {torch.__version__} lacks _dyn_quant_* 4bit ops;
-                    install a newer build.""") from err
+                    install a newer build."""
+                ) from err
         self.static_input_scales = False  # always dynamic per token
 
     # ---- parameter creation ----
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         # Shapes per local rank (TP/EP):
         #   w13: [E, 2*I_local, H]  int8  (int4 values in [-8,7])
         #   w2 : [E, H, I_local]    int8
@@ -1850,13 +2030,15 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
             return 1 if g == -1 else (in_features // g)
 
         # Register unpacked int4-as-int8 weights the loader will fill.
-        w13 = torch.nn.Parameter(torch.empty(E, 2 * IN, H, dtype=torch.int8),
-                                 requires_grad=False)
+        w13 = torch.nn.Parameter(
+            torch.empty(E, 2 * IN, H, dtype=torch.int8), requires_grad=False
+        )
         set_weight_attrs(w13, extra_weight_attrs)
         layer.register_parameter("w13_weight", w13)
 
-        w2 = torch.nn.Parameter(torch.empty(E, H, IN, dtype=torch.int8),
-                                requires_grad=False)
+        w2 = torch.nn.Parameter(
+            torch.empty(E, H, IN, dtype=torch.int8), requires_grad=False
+        )
         set_weight_attrs(w2, extra_weight_attrs)
         layer.register_parameter("w2_weight", w2)
 
@@ -1865,54 +2047,48 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         # KleidiAI groupwise kernels accepts bfloat16 scales
         scale_dtype = torch.float32 if g == -1 else torch.bfloat16
 
-        w13_s = torch.nn.Parameter(torch.ones(E,
-                                              2 * IN,
-                                              _n_scale_cols(H),
-                                              dtype=scale_dtype),
-                                   requires_grad=False)
+        w13_s = torch.nn.Parameter(
+            torch.ones(E, 2 * IN, _n_scale_cols(H), dtype=scale_dtype),
+            requires_grad=False,
+        )
         set_weight_attrs(
-            w13_s, {
-                "quant_method": "channel" if g == -1 else "group",
-                **extra_weight_attrs
-            })
+            w13_s,
+            {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs},
+        )
         layer.register_parameter("w13_weight_scale", w13_s)
 
-        w2_s = torch.nn.Parameter(torch.ones(E,
-                                             H,
-                                             _n_scale_cols(IN),
-                                             dtype=scale_dtype),
-                                  requires_grad=False)
+        w2_s = torch.nn.Parameter(
+            torch.ones(E, H, _n_scale_cols(IN), dtype=scale_dtype), requires_grad=False
+        )
         set_weight_attrs(
-            w2_s, {
-                "quant_method": "channel" if g == -1 else "group",
-                **extra_weight_attrs
-            })
+            w2_s,
+            {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs},
+        )
         layer.register_parameter("w2_weight_scale", w2_s)
 
         if self.has_bias:
-            w13_bias = torch.nn.Parameter(torch.zeros(E,
-                                                      2 * IN,
-                                                      dtype=params_dtype),
-                                          requires_grad=False)
+            w13_bias = torch.nn.Parameter(
+                torch.zeros(E, 2 * IN, dtype=params_dtype), requires_grad=False
+            )
             layer.register_parameter("w13_bias", w13_bias)
             set_weight_attrs(w13_bias, extra_weight_attrs)
 
-            w2_bias = torch.nn.Parameter(torch.zeros(num_experts,
-                                                     hidden_size,
-                                                     dtype=params_dtype),
-                                         requires_grad=False)
+            w2_bias = torch.nn.Parameter(
+                torch.zeros(num_experts, hidden_size, dtype=params_dtype),
+                requires_grad=False,
+            )
             layer.register_parameter("w2_bias", w2_bias)
             set_weight_attrs(w2_bias, extra_weight_attrs)
 
         # Placeholders for packed weights (will be replaced after packing)
         layer.register_parameter(
-            "w13_weight_packed",
-            torch.nn.Parameter(torch.empty(0), requires_grad=False))
+            "w13_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False)
+        )
         set_weight_attrs(layer.w13_weight_packed, extra_weight_attrs)
 
         layer.register_parameter(
-            "w2_weight_packed",
-            torch.nn.Parameter(torch.empty(0), requires_grad=False))
+            "w2_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False)
+        )
         set_weight_attrs(layer.w2_weight_packed, extra_weight_attrs)
 
         # dims for 4 bit fused matmuls
@@ -1930,15 +2106,19 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         IN = layer.w2_in_features
         g = layer.group_size
 
-        def _pack_matrix(int4_as_int8_2d: torch.Tensor,
-                         scales_2d: torch.Tensor,
-                         bias_1d: Optional[torch.Tensor], in_features: int,
-                         out_features: int) -> torch.Tensor:
+        def _pack_matrix(
+            int4_as_int8_2d: torch.Tensor,
+            scales_2d: torch.Tensor,
+            bias_1d: torch.Tensor | None,
+            in_features: int,
+            out_features: int,
+        ) -> torch.Tensor:
             # int4 values are stored as int8 in [-8,7].
             # Shift to unsigned nibble and pack pairs along input-dim.
             tmp = int4_as_int8_2d.add(8)  # [out, in]
             uint8_nibbles = ((tmp[:, 1::2] << 4) | tmp[:, ::2]).to(
-                torch.uint8)  # [out, in//2]
+                torch.uint8
+            )  # [out, in//2]
 
             # KleidiAI groupwise kernels accepts float32 scales
             # KleidiAI groupwise kernels accepts bfloat16 scales
@@ -1946,15 +2126,19 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
             scales = scales_2d.to(scale_dtype)
             bias = None if bias_1d is None else bias_1d.to(torch.float32)
             return torch.ops.aten._dyn_quant_pack_4bit_weight(
-                uint8_nibbles, scales, bias, g if g != -1 else in_features,
-                in_features, out_features)
+                uint8_nibbles,
+                scales,
+                bias,
+                g if g != -1 else in_features,
+                in_features,
+                out_features,
+            )
 
         # Pack per expert
         w13_packed_list = []
         w2_packed_list = []
 
-        has_w13_bias = hasattr(layer,
-                               "w13_bias") and layer.w13_bias is not None
+        has_w13_bias = hasattr(layer, "w13_bias") and layer.w13_bias is not None
         has_w2_bias = hasattr(layer, "w2_bias") and layer.w2_bias is not None
 
         for e in range(E):
@@ -1964,7 +2148,9 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
                     layer.w13_weight_scale[e],  # [2I, H/g or 1]
                     layer.w13_bias[e] if has_w13_bias else None,  # [2I]
                     H,
-                    I2))
+                    I2,
+                )
+            )
             w2_packed_list.append(
                 _pack_matrix(
                     # w2 shape is [H, IN]; we need [out, in] == [H, IN].
@@ -1972,42 +2158,58 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
                     layer.w2_weight_scale[e],  # [H, IN/g or 1]
                     layer.w2_bias[e] if has_w2_bias else None,  # [H]
                     IN,
-                    layer.w2_out_features  # in_features=IN, out_features=H
-                ))
+                    layer.w2_out_features,  # in_features=IN, out_features=H
+                )
+            )
 
         # each packed tensor has identical shape per expert; stack on dim 0
         w13_packed = torch.stack(w13_packed_list, dim=0)
         w2_packed = torch.stack(w2_packed_list, dim=0)
 
-        replace_parameter(layer, "w13_weight_packed",
-                          torch.nn.Parameter(w13_packed, requires_grad=False))
-        replace_parameter(layer, "w2_weight_packed",
-                          torch.nn.Parameter(w2_packed, requires_grad=False))
+        replace_parameter(
+            layer,
+            "w13_weight_packed",
+            torch.nn.Parameter(w13_packed, requires_grad=False),
+        )
+        replace_parameter(
+            layer,
+            "w2_weight_packed",
+            torch.nn.Parameter(w2_packed, requires_grad=False),
+        )
 
         # free raw tensors/scales/bias now that they're packed into the payload.
         replace_parameter(
-            layer, "w13_weight",
-            torch.nn.Parameter(torch.empty(0), requires_grad=False))
+            layer, "w13_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False)
+        )
         replace_parameter(
-            layer, "w2_weight",
-            torch.nn.Parameter(torch.empty(0), requires_grad=False))
+            layer, "w2_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False)
+        )
         replace_parameter(
-            layer, "w13_weight_scale",
-            torch.nn.Parameter(torch.empty(0), requires_grad=False))
+            layer,
+            "w13_weight_scale",
+            torch.nn.Parameter(torch.empty(0), requires_grad=False),
+        )
         replace_parameter(
-            layer, "w2_weight_scale",
-            torch.nn.Parameter(torch.empty(0), requires_grad=False))
+            layer,
+            "w2_weight_scale",
+            torch.nn.Parameter(torch.empty(0), requires_grad=False),
+        )
         if has_w13_bias:
             replace_parameter(
-                layer, "w13_bias",
-                torch.nn.Parameter(torch.empty(0), requires_grad=False))
+                layer,
+                "w13_bias",
+                torch.nn.Parameter(torch.empty(0), requires_grad=False),
+            )
         if has_w2_bias:
             replace_parameter(
-                layer, "w2_bias",
-                torch.nn.Parameter(torch.empty(0), requires_grad=False))
+                layer,
+                "w2_bias",
+                torch.nn.Parameter(torch.empty(0), requires_grad=False),
+            )
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         # CPU dynamic 4-bit MoE path does not use modular kernels or
         # fused_experts; quant config is not needed.
         return None
@@ -2020,25 +2222,25 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet."
-        assert activation in (
-            "silu", "swigluoai",
-            "swiglu"), "Only SiLU/SwiGLUGU/SwiGLUUG are supported."
+        assert activation in ("silu", "swigluoai", "swiglu"), (
+            "Only SiLU/SwiGLUGU/SwiGLUUG are supported."
+        )
         assert expert_map is None, """expert_map/EP not implemented
         for CPU dyn-4bit MoE."""
 
@@ -2068,7 +2270,15 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         )
 
         return torch.ops._C.dynamic_4bit_int_moe(
-            x, topk_ids.to(torch.long), topk_weights, layer.w13_weight_packed,
-            layer.w2_weight_packed, layer.w2_out_features,
-            layer.w2_in_features, layer.w13_out_features, layer.group_size,
-            apply_router_weight_on_input, int(_act_kind(activation)))
\ No newline at end of file
+            x,
+            topk_ids.to(torch.long),
+            topk_weights,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            layer.w2_out_features,
+            layer.w2_in_features,
+            layer.w13_out_features,
+            layer.group_size,
+            apply_router_weight_on_input,
+            int(_act_kind(activation)),
+        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index cac65cca5093fddef6df7cf189dc3a42407dfd27..ca286675ebd0cace14ddb53c87f869982a9f104b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -5,23 +5,31 @@ from .compressed_tensors_scheme import CompressedTensorsScheme
 from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4
 from .compressed_tensors_w4a8_fp8 import CompressedTensorsW4A8Fp8
 from .compressed_tensors_w4a8_int import CompressedTensorsW4A8Int
-from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
-                                          CompressedTensorsW4A16Sparse24)
+from .compressed_tensors_w4a16_24 import (
+    W4A16SPARSE24_SUPPORTED_BITS,
+    CompressedTensorsW4A16Sparse24,
+)
 from .compressed_tensors_w4a16_nvfp4 import CompressedTensorsW4A16Fp4
 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
 from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
-from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
-                                       CompressedTensorsWNA16)
+from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
 
+# This avoids circular import error
 from .compressed_tensors_24 import CompressedTensors24  # isort: skip
 
 __all__ = [
-    "CompressedTensorsScheme", "CompressedTensorsWNA16",
-    "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24",
-    "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8",
-    "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS",
-    "CompressedTensors24", "CompressedTensorsW4A16Fp4",
-    "CompressedTensorsW4A4Fp4", "CompressedTensorsW4A8Int",
-    "CompressedTensorsW4A8Fp8"
+    "CompressedTensorsScheme",
+    "CompressedTensorsWNA16",
+    "CompressedTensorsW8A16Fp8",
+    "CompressedTensorsW4A16Sparse24",
+    "CompressedTensorsW8A8Int8",
+    "CompressedTensorsW8A8Fp8",
+    "WNA16_SUPPORTED_BITS",
+    "W4A16SPARSE24_SUPPORTED_BITS",
+    "CompressedTensors24",
+    "CompressedTensorsW4A16Fp4",
+    "CompressedTensorsW4A4Fp4",
+    "CompressedTensorsW4A8Int",
+    "CompressedTensorsW4A8Fp8",
 ]
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index 168b221a9cfe96be21f7e5401cc6e64543cffcd3..571ce267f3fa6acc783ee1a765f6793e4d0cab6a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -1,29 +1,38 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any
 
 import torch
 from compressed_tensors import CompressionFormat, ModelCompressor
-from compressed_tensors.quantization import (QuantizationArgs,
-                                             QuantizationStrategy,
-                                             QuantizationType)
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
 from compressed_tensors.utils import combine_shards
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    convert_to_channelwise, sparse_cutlass_supported)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    convert_to_channelwise,
+    sparse_cutlass_supported,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 
 __all__ = ["CompressedTensors24"]
 
@@ -31,27 +40,32 @@ from vllm.platforms import current_platform
 
 
 class CompressedTensors24(CompressedTensorsScheme):
-
     def __init__(
         self,
         quantized: bool = False,
-        weight_quant: Optional[QuantizationArgs] = None,
-        input_quant: Optional[QuantizationArgs] = None,
-        model_compression_config: Optional[dict[str, Any]] = None,
+        weight_quant: QuantizationArgs | None = None,
+        input_quant: QuantizationArgs | None = None,
+        model_compression_config: dict[str, Any] | None = None,
     ):
         self.quantized = quantized
         self.weight_quant = weight_quant
         self.input_quant = input_quant
-        self.model_compressor = (
-            ModelCompressor.from_compression_config(model_compression_config)
-            if model_compression_config is not None else None)
+        model_compressor = ModelCompressor.from_compression_config(
+            model_compression_config
+        )
         self.do_sparse_decompress = (
-            self.model_compressor is not None
-            and self.model_compressor.sparsity_config.format
-            == CompressionFormat.sparse_24_bitmask.value)
+            model_compressor is not None
+            and model_compressor.sparsity_config.format
+            == CompressionFormat.sparse_24_bitmask.value
+        )
+        if self.do_sparse_decompress:
+            self.model_compressor = model_compressor
 
-        if quantized and input_quant is not None and \
-                self._get_quant_dtype() == current_platform.fp8_dtype():
+        if (
+            quantized
+            and input_quant is not None
+            and self._get_quant_dtype() == current_platform.fp8_dtype()
+        ):
             static = not input_quant.dynamic
             g_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
             self.quant_fp8 = QuantFP8(static, g_shape)
@@ -74,7 +88,8 @@ class CompressedTensors24(CompressedTensorsScheme):
         if not sparse_cutlass_supported():
             raise ValueError(
                 "Sparse CUTLASS not supported. vLLM must be built with "
-                "CUDA 12.2 or later to use this feature")
+                "CUDA 12.2 or later to use this feature"
+            )
 
         layer.logical_widths = output_partition_sizes
         layer.input_size = input_size
@@ -93,9 +108,9 @@ class CompressedTensors24(CompressedTensorsScheme):
             weight_loader=weight_loader,
         )
         if self.do_sparse_decompress:
-            assert all(partition_size % 8 == 0
-                       for partition_size in output_partition_sizes
-                       ), "All partitions must be divisible by 8 for "
+            assert all(
+                partition_size % 8 == 0 for partition_size in output_partition_sizes
+            ), "All partitions must be divisible by 8 for "
             "2:4 sparse compressed models"
 
             shape = BasevLLMParameter(
@@ -130,20 +145,24 @@ class CompressedTensors24(CompressedTensorsScheme):
 
         # Check if quantized, not just 2:4 Sparse
         if self.quantized:
-            if (self.weight_quant and self.weight_quant.strategy
-                    == QuantizationStrategy.CHANNEL.value):
+            if (
+                self.weight_quant
+                and self.weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            ):
                 weight_scale = ChannelQuantScaleParameter(
-                    data=torch.empty((sum(output_partition_sizes), 1),
-                                     dtype=torch.float32),
+                    data=torch.empty(
+                        (sum(output_partition_sizes), 1), dtype=torch.float32
+                    ),
                     output_dim=0,
                     weight_loader=weight_loader,
                 )
             else:
-                assert (self.weight_quant and self.weight_quant.strategy
-                        == QuantizationStrategy.TENSOR.value)
+                assert (
+                    self.weight_quant
+                    and self.weight_quant.strategy == QuantizationStrategy.TENSOR.value
+                )
                 weight_scale = PerTensorScaleParameter(
-                    data=torch.empty(len(output_partition_sizes),
-                                     dtype=torch.float32),
+                    data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
                     weight_loader=weight_loader,
                 )
 
@@ -152,8 +171,7 @@ class CompressedTensors24(CompressedTensorsScheme):
             # input quant will be non-none
             if self.input_quant and not self.input_quant.dynamic:
                 # register input quant scale
-                assert (self.input_quant.strategy ==
-                        QuantizationStrategy.TENSOR.value)
+                assert self.input_quant.strategy == QuantizationStrategy.TENSOR.value
                 input_scale = BasevLLMParameter(
                     data=torch.empty(1, dtype=torch.float32),
                     weight_loader=weight_loader,
@@ -163,12 +181,12 @@ class CompressedTensors24(CompressedTensorsScheme):
 
         else:
             # for sparse-only, pass in 1 for weight/input scales
-            weight_scale = torch.nn.Parameter(data=torch.ones(
-                1, dtype=torch.float32),
-                                              requires_grad=False)
-            input_scale = torch.nn.Parameter(data=torch.ones(
-                1, dtype=torch.float32),
-                                             requires_grad=False)
+            weight_scale = torch.nn.Parameter(
+                data=torch.ones(1, dtype=torch.float32), requires_grad=False
+            )
+            input_scale = torch.nn.Parameter(
+                data=torch.ones(1, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("input_scale", input_scale)
             layer.register_parameter("weight_scale", weight_scale)
 
@@ -199,8 +217,9 @@ class CompressedTensors24(CompressedTensorsScheme):
 
         # torch.compile workaround
         if hasattr(layer, "input_scale"):
-            layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
-                                                   requires_grad=False)
+            layer.input_scale = torch.nn.Parameter(
+                layer.input_scale.data, requires_grad=False
+            )
 
         if self.weight_quant:
             if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value:
@@ -214,11 +233,11 @@ class CompressedTensors24(CompressedTensorsScheme):
             else:
                 # torch.compile workaround
                 layer.weight_scale = torch.nn.Parameter(
-                    layer.weight_scale.data, requires_grad=False)
+                    layer.weight_scale.data, requires_grad=False
+                )
 
         # Set all negative zero values to 0 prior to compression
-        if (layer.weight.dtype.is_floating_point
-                and layer.weight.dtype.itemsize >= 2):
+        if layer.weight.dtype.is_floating_point and layer.weight.dtype.itemsize >= 2:
             layer.weight.data[layer.weight.data == -0.0] = 0.0
 
         w_compressed, meta = ops.cutlass_sparse_compress(layer.weight.data)
@@ -229,7 +248,7 @@ class CompressedTensors24(CompressedTensorsScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """
         Returns the output tensor for the layer with 2:4
@@ -243,7 +262,7 @@ class CompressedTensors24(CompressedTensorsScheme):
         :return: The output tensor of the layer
         """
         if self.quantized:
-            scale = getattr(layer, 'input_scale', None)
+            scale = getattr(layer, "input_scale", None)
 
             if self.weights_dtype == torch.int8:
                 ops_output = ops.scaled_int8_quant(x, scale=scale)
@@ -286,12 +305,16 @@ class CompressedTensors24(CompressedTensorsScheme):
         if not is_8_bits:
             raise ValueError("Cutlass only supports 8-bit quantization")
 
-        if (self.weight_quant.type == QuantizationType.FLOAT
-                and self.input_quant.type == QuantizationType.FLOAT):
+        if (
+            self.weight_quant.type == QuantizationType.FLOAT
+            and self.input_quant.type == QuantizationType.FLOAT
+        ):
             return torch.float8_e4m3fn
 
-        if (self.weight_quant.type == QuantizationType.INT
-                and self.input_quant.type == QuantizationType.INT):
+        if (
+            self.weight_quant.type == QuantizationType.INT
+            and self.input_quant.type == QuantizationType.INT
+        ):
             return torch.int8
 
         raise ValueError("Quantization type not supported by Cutlass")
@@ -317,7 +340,7 @@ class CompressedTensors24(CompressedTensorsScheme):
         :param bitmask: The 2:4 bitmask associated with the compressed weights,
             representing the positions of non-zero elements in the compressed
             tensor.
-        :param layer: The layer whose weights need to be processed after 
+        :param layer: The layer whose weights need to be processed after
             loading.
         :return: The decompressed 2:4 sparse weight tensor.
         """
@@ -343,14 +366,16 @@ class CompressedTensors24(CompressedTensorsScheme):
         if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)):
             split_weights = torch.split(compressed, layer.logical_widths)
             split_bitmask = torch.split(bitmask, layer.logical_widths)
-            split_shape = [(out, layer.input_size_per_partition)
-                           for out in layer.logical_widths]
+            split_shape = [
+                (out, layer.input_size_per_partition) for out in layer.logical_widths
+            ]
 
         if split_weights:
             decompressed_shards = [
                 _process_split(compressed_weight, shape, bitmask)
                 for compressed_weight, shape, bitmask in zip(
-                    split_weights, split_shape, split_bitmask)
+                    split_weights, split_shape, split_bitmask
+                )
             ]
             decompressed = combine_shards(decompressed_shards)
         else:
@@ -362,5 +387,6 @@ class CompressedTensors24(CompressedTensorsScheme):
                         layer.input_size_per_partition,
                     ),
                     bitmask=bitmask,
-                ))
+                )
+            )
         return decompressed
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index a5d48f23567446e7f7d807e66fc04d0ab05decab..a7f9076db7e954facfc230d9c1b61c406c537c7e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -11,7 +10,7 @@ __all__ = ["CompressedTensorsScheme"]
 
 class CompressedTensorsScheme(ABC):
     """
-    Abstract class used to describe the weight creation and forward pass 
+    Abstract class used to describe the weight creation and forward pass
     of different quantization schemes supported by CompressedTensors.
     """
 
@@ -26,20 +25,21 @@ class CompressedTensorsScheme(ABC):
     @abstractmethod
     def create_weights(self, *args, **kwargs):
         """
-        Weight creation for the particular scheme. Inputs to this function 
+        Weight creation for the particular scheme. Inputs to this function
 
         """
         raise NotImplementedError
 
     @abstractmethod
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]):
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ):
         """
-        Run the forward pass for the particular scheme. This is where 
+        Run the forward pass for the particular scheme. This is where
         scheme-specific dequant/quant steps/kernels should be applied.
 
-        :param layer: torch.nn.Module with the registered weights and 
-            other parameters relevant to the particular scheme. 
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
         :param x: input to the layer
         :param bias: bias parameter
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 3f3e7668fcf74e45956a28ea631d7afdeb75b129..dd0f4b3d868d1104dd2b4d96a3a6f6714da66f6a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -1,20 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn import Parameter
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
-    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+    GPTQ_MARLIN_24_MAX_PARALLEL,
+    GPTQ_MARLIN_24_MIN_THREAD_N,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
 from vllm.scalar_type import scalar_types
 
 __all__ = ["CompressedTensorsW4A16Sparse24"]
@@ -25,11 +30,7 @@ W4A16SPARSE24_SUPPORTED_BITS = list(W4A16SPARSE24_SUPPORTED_TYPES_MAP.keys())
 
 
 class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
-
-    def __init__(self,
-                 strategy: str,
-                 num_bits: int,
-                 group_size: Optional[int] = None):
+    def __init__(self, strategy: str, num_bits: int, group_size: int | None = None):
         self.strategy = strategy
         self.group_size = group_size
         self.tile_size = 16
@@ -37,13 +38,13 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
         if num_bits not in W4A16SPARSE24_SUPPORTED_TYPES_MAP:
             raise ValueError(
                 f"Unsupported num_bits = {num_bits}. "
-                f"Supported num_bits = {W4A16SPARSE24_SUPPORTED_BITS}")
+                f"Supported num_bits = {W4A16SPARSE24_SUPPORTED_BITS}"
+            )
 
         self.quant_type = W4A16SPARSE24_SUPPORTED_TYPES_MAP[num_bits]
 
         if self.strategy == "group" and self.group_size is None:
-            raise ValueError(
-                "group_size must be given when using strategy group")
+            raise ValueError("group_size must be given when using strategy group")
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -52,18 +53,20 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # required by torch.compile to be torch.nn.Parameter
-        layer.weight_packed = Parameter(layer.weight_packed.data,
-                                        requires_grad=False)
-        layer.scale_packed = Parameter(layer.scale_packed.data,
-                                       requires_grad=False)
+        layer.weight_packed = Parameter(layer.weight_packed.data, requires_grad=False)
+        layer.scale_packed = Parameter(layer.scale_packed.data, requires_grad=False)
         layer.meta = Parameter(layer.meta.data, requires_grad=False)
 
-    def create_weights(self, layer: torch.nn.Module, input_size: int,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         assert params_dtype == torch.float16, (
             "float16 is required for marlin24 compressed models. Set dtype=torch.float16"  # noqa: E501
         )
@@ -71,55 +74,59 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
         pack_factor = 32 // self.quant_type.size_bits
         output_size_per_partition = sum(output_partition_sizes)
 
-        qweight = PackedvLLMParameter(data=torch.empty(
-            input_size_per_partition // self.tile_size // 2,
-            output_size_per_partition * self.tile_size // pack_factor,
-            dtype=torch.int32,
-        ),
-                                      input_dim=0,
-                                      output_dim=1,
-                                      packed_dim=1,
-                                      packed_factor=pack_factor,
-                                      marlin_tile_size=self.tile_size,
-                                      weight_loader=weight_loader)
-
-        input_groups = (1 if self.group_size is None else
-                        input_size_per_partition // self.group_size)
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.tile_size // 2,
+                output_size_per_partition * self.tile_size // pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=pack_factor,
+            marlin_tile_size=self.tile_size,
+            weight_loader=weight_loader,
+        )
+
+        input_groups = (
+            1
+            if self.group_size is None
+            else input_size_per_partition // self.group_size
+        )
 
         weight_scale_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 input_groups,
                 output_size_per_partition,
                 dtype=params_dtype,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
 
         if self.group_size is not None:
-            scales = GroupQuantScaleParameter(output_dim=1,
-                                              input_dim=0,
-                                              **weight_scale_args)
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
         else:
-            scales = ChannelQuantScaleParameter(output_dim=1,
-                                                **weight_scale_args)
-
-        weight_shape = BasevLLMParameter(data=torch.empty(2,
-                                                          dtype=torch.int64),
-                                         weight_loader=weight_loader)
-
-        meta = PackedvLLMParameter(data=torch.empty(
-            input_size_per_partition // 8 // 2 // 2,
-            output_size_per_partition * 2,
-            dtype=torch.int16,
-        ),
-                                   input_dim=0,
-                                   output_dim=1,
-                                   packed_dim=1,
-                                   packed_factor=1,
-                                   marlin_tile_size=2,
-                                   weight_loader=weight_loader)
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+
+        weight_shape = BasevLLMParameter(
+            data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader
+        )
+
+        meta = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // 8 // 2 // 2,
+                output_size_per_partition * 2,
+                dtype=torch.int16,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=1,
+            marlin_tile_size=2,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight_packed", qweight)
         layer.register_parameter("weight_shape", weight_shape)
@@ -127,16 +134,17 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
         layer.register_parameter("meta", meta)
 
         max_workspace_size = (
-            output_size_per_partition //
-            GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL
+            output_size_per_partition // GPTQ_MARLIN_24_MIN_THREAD_N
+        ) * GPTQ_MARLIN_24_MAX_PARALLEL
 
-        workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int),
-                              requires_grad=False)
+        workspace = Parameter(
+            torch.zeros(max_workspace_size, dtype=torch.int), requires_grad=False
+        )
         layer.workspace = workspace
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
-
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ) -> torch.Tensor:
         qweight = layer.weight_packed
         meta = layer.meta
         scales = layer.scale_packed
@@ -148,11 +156,19 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
         size_k = x_2d.shape[1]
         size_n = scales.shape[1]
 
-        output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales,
-                                            workspace, self.quant_type, size_m,
-                                            size_n, size_k)
+        output_2d = ops.gptq_marlin_24_gemm(
+            x_2d,
+            qweight,
+            meta,
+            scales,
+            workspace,
+            self.quant_type,
+            size_m,
+            size_n,
+            size_k,
+        )
 
-        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],))
 
         if bias is not None:
             output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
index 96dccf04d490f511468a8c1911d69a6e652e88af..3afadc6eb7e5bf755d39d1970471cd0909198340 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
@@ -1,23 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn.parameter import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    apply_fp4_marlin_linear, prepare_fp4_layer_for_marlin)
-from vllm.model_executor.parameter import (GroupQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    apply_fp4_marlin_linear,
+    prepare_fp4_layer_for_marlin,
+)
+from vllm.model_executor.parameter import (
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 
 __all__ = ["CompressedTensorsW4A16Fp4"]
 
 
 class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
-
     def __init__(self, has_input_global_scale: bool = False):
         self.has_input_global_scale = has_input_global_scale
         self.group_size = 16
@@ -27,49 +31,59 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
         # dont restrict as emulations
         return 80
 
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
 
         # Weight
-        weight = ModelWeightParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition // 2,
-            dtype=torch.uint8),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight_packed", weight)
 
         # Global Weight Scale
         weight_global_scale = PerTensorScaleParameter(
             data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight_global_scale", weight_global_scale)
 
         # Per Group Weight Scale
-        weight_scale = GroupQuantScaleParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition // self.group_size,
-            dtype=torch.float8_e4m3fn,
-        ),
-                                                input_dim=1,
-                                                output_dim=0,
-                                                weight_loader=weight_loader)
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition // self.group_size,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight_scale", weight_scale)
 
         if self.has_input_global_scale:
             input_global_scale = PerTensorScaleParameter(
-                data=torch.empty(len(output_partition_sizes),
-                                 dtype=torch.float32),
-                weight_loader=weight_loader)
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
             layer.register_parameter("input_global_scale", input_global_scale)
 
     def process_weights_after_loading(self, layer) -> None:
@@ -81,25 +95,30 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
         # Rename weight_global_scale to weight_scale_2 that marlin expects
         # Note: ct stores the inverse of what is expected by the marlin kernel
         layer.weight_scale_2 = Parameter(
-            1 / layer.weight_global_scale.max().to(torch.float32),
-            requires_grad=False)
+            1 / layer.weight_global_scale.max().to(torch.float32), requires_grad=False
+        )
         del layer.weight_global_scale
 
         if self.has_input_global_scale:
             layer.input_global_scale = torch.nn.Parameter(
-                layer.input_global_scale.data, requires_grad=False)
+                layer.input_global_scale.data, requires_grad=False
+            )
 
         prepare_fp4_layer_for_marlin(layer)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return apply_fp4_marlin_linear(input=x,
-                                       weight=layer.weight,
-                                       weight_scale=layer.weight_scale,
-                                       weight_scale_2=layer.weight_scale_2,
-                                       workspace=layer.workspace,
-                                       size_n=layer.output_size_per_partition,
-                                       size_k=layer.input_size_per_partition,
-                                       bias=bias)
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return apply_fp4_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            weight_scale_2=layer.weight_scale_2,
+            workspace=layer.workspace,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index d472427756d46fb16b22ba8c2d67dd28033d6ee8..b603bdb13280bbfdec4d111b5da79b0e030b6583 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn.parameter import Parameter
@@ -9,14 +9,20 @@ import vllm.envs as envs
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
-    run_nvfp4_emulations)
+    run_nvfp4_emulations,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    swizzle_blockscale)
-from vllm.model_executor.parameter import (GroupQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    cutlass_fp4_supported,
+    swizzle_blockscale,
+)
+from vllm.model_executor.parameter import (
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer
 
 logger = init_logger(__name__)
@@ -25,12 +31,13 @@ __all__ = ["CompressedTensorsW4A4Fp4"]
 
 
 class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
-
     def __init__(self):
-        if envs.VLLM_USE_TRTLLM_FP4_GEMM:
-            assert has_flashinfer(), "TRTLLM FP4 GEMM requires FlashInfer"
-            self.backend = "flashinfer-trtllm"
-            logger.info_once("Using flashinfer-trtllm for FP4")
+        self.backend = "none"
+        if envs.VLLM_NVFP4_GEMM_BACKEND is None:
+            if has_flashinfer():
+                self.backend = "flashinfer-cutlass"
+            elif cutlass_fp4_supported():
+                self.backend = "cutlass"
         elif envs.VLLM_USE_FBGEMM:
             self.backend = "fbgemm"
             try:
@@ -40,12 +47,20 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
                     "Backend fbgemm requires fbgemm.f4f4bf16 operator, "
                     "Please install with: pip install fbgemm-gpu-genai"
                 ) from exc
-            logger.info_once("Using FGBEMM-GPU-GENAI for FP4")
-        elif has_flashinfer():
-            self.backend = "flashinfer-cutlass"
-            logger.info_once("Using flashinfer-cutlass for FP4")
-        else:
+        elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
+            self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
+            assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
             self.backend = "cutlass"
+            assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
+
+        if self.backend == "none":
+            raise ValueError(
+                "No valid NVFP4 GEMM backend found. "
+                "Please check your platform capability."
+            )
+
+        logger.info_once(f"Using {self.backend} for NVFP4 GEMM")
         self.group_size = 16
 
     @classmethod
@@ -54,58 +69,67 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
             return 80
         return 100
 
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
 
         # Weight
-        weight = ModelWeightParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition // 2,
-            dtype=torch.uint8),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight_packed", weight)
 
         # Global Weight Scale
         weight_global_scale = PerTensorScaleParameter(
             data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight_global_scale", weight_global_scale)
 
         # Per Group Weight Scale
-        weight_scale = GroupQuantScaleParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition // self.group_size,
-            dtype=torch.float8_e4m3fn,
-        ),
-                                                input_dim=1,
-                                                output_dim=0,
-                                                weight_loader=weight_loader)
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition // self.group_size,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight_scale", weight_scale)
 
         input_global_scale = PerTensorScaleParameter(
             data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("input_global_scale", input_global_scale)
 
     def process_weights_after_loading(self, layer) -> None:
-
         global_input_scale = layer.input_global_scale.max().to(torch.float32)
-        layer.input_global_scale = Parameter(global_input_scale,
-                                             requires_grad=False)
+        layer.input_global_scale = Parameter(global_input_scale, requires_grad=False)
 
         layer.weight_global_scale = Parameter(
-            layer.weight_global_scale.max().to(torch.float32),
-            requires_grad=False)
+            layer.weight_global_scale.max().to(torch.float32), requires_grad=False
+        )
 
         if self.backend == "flashinfer-trtllm":
             # FlashInfer TRTLLM FP4 GEMM requires a different weight layout.
@@ -118,40 +142,43 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
             weight_scale = layer.weight_scale.data
 
             epilogue_tile_m = 128
-            weight = shuffle_matrix_a(weight.view(torch.uint8),
-                                      epilogue_tile_m)
-            weight_scale = (shuffle_matrix_sf_a(weight_scale.view(
-                torch.uint8), epilogue_tile_m).reshape(
-                    weight_scale.shape).view(torch.float8_e4m3fn))
+            weight = shuffle_matrix_a(weight.view(torch.uint8), epilogue_tile_m)
+            weight_scale = (
+                shuffle_matrix_sf_a(weight_scale.view(torch.uint8), epilogue_tile_m)
+                .reshape(weight_scale.shape)
+                .view(torch.float8_e4m3fn)
+            )
 
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
             layer.weight_packed = Parameter(weight, requires_grad=False)
         else:
             swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
             if self.backend == "fbgemm":
-                swizzled_weight_scale = swizzled_weight_scale.view(-1).view(
-                    torch.uint8)
-            layer.weight_scale = Parameter(swizzled_weight_scale,
-                                           requires_grad=False)
-            layer.weight_packed = Parameter(layer.weight_packed.data,
-                                            requires_grad=False)
+                swizzled_weight_scale = swizzled_weight_scale.view(-1).view(torch.uint8)
+            layer.weight_scale = Parameter(swizzled_weight_scale, requires_grad=False)
+            layer.weight_packed = Parameter(
+                layer.weight_packed.data, requires_grad=False
+            )
 
         layer.alpha = Parameter(
             1 / (layer.input_global_scale * layer.weight_global_scale),
-            requires_grad=False)
-
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+            requires_grad=False,
+        )
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
             out = run_nvfp4_emulations(
                 x=x,
                 input_global_scale=layer.input_global_scale,
                 weight=layer.weight_packed,
                 weight_scale_swizzled=layer.weight_scale,
-                weight_global_scale=layer.weight_global_scale)
+                weight_global_scale=layer.weight_global_scale,
+            )
             if bias is not None:
                 out = out + bias
             return out
@@ -162,12 +189,17 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
         x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
 
-        mm_args = (x_fp4, layer.weight_packed, x_blockscale,
-                   layer.weight_scale, layer.alpha, output_dtype)
-        if self.backend == "flashinfer-trtllm":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm")
-        elif self.backend == "flashinfer-cutlass":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="cutlass")
+        mm_args = (
+            x_fp4,
+            layer.weight_packed,
+            x_blockscale,
+            layer.weight_scale,
+            layer.alpha,
+            output_dtype,
+        )
+        if self.backend.startswith("flashinfer-"):
+            backend_name = self.backend[len("flashinfer-") :]
+            out = flashinfer_scaled_fp4_mm(*mm_args, backend=backend_name)
         elif self.backend == "fbgemm":
             out = torch.ops.fbgemm.f4f4bf16(
                 x_fp4,
@@ -178,6 +210,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
                 use_mx=False,
             ).to(output_dtype)
         else:
+            assert self.backend == "cutlass"
             out = cutlass_scaled_fp4_mm(*mm_args)
 
         if bias is not None:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
index 3d9827058803e47dc82ebf40569edfb36deba880..a23961e8975344117f7226875c0b944f932f9dce 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
@@ -1,25 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import ActivationOrdering
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
-    MPLinearLayerConfig, choose_mp_linear_kernel)
+    MPLinearLayerConfig,
+    choose_mp_linear_kernel,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    marlin_repeat_scales_on_all_ranks)
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
-# yapf: enable
+    marlin_repeat_scales_on_all_ranks,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -34,13 +37,14 @@ W4A8_SUPPORTED_BITS = list(W4A8_SUPPORTED_TYPES_MAP.keys())
 class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
     _kernel_backends_being_used: set[str] = set()
 
-    def __init__(self,
-                 strategy: str,
-                 num_bits: int,
-                 group_size: Optional[int] = None,
-                 symmetric: Optional[bool] = True,
-                 actorder: Optional[ActivationOrdering] = None):
-
+    def __init__(
+        self,
+        strategy: str,
+        num_bits: int,
+        group_size: int | None = None,
+        symmetric: bool | None = True,
+        actorder: ActivationOrdering | None = None,
+    ):
         self.pack_factor = 32 // num_bits
         self.strategy = strategy
         self.symmetric = symmetric
@@ -48,13 +52,15 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
         self.has_g_idx = actorder == ActivationOrdering.GROUP
 
         if self.group_size != 128 or self.strategy != "group":
-            raise ValueError("W4A8 kernels require group quantization " \
-            "with group size 128")
+            raise ValueError(
+                "W4A8 kernels require group quantization with group size 128"
+            )
 
         if num_bits not in W4A8_SUPPORTED_TYPES_MAP:
             raise ValueError(
                 f"Unsupported num_bits = {num_bits}. "
-                f"Supported num_bits = {W4A8_SUPPORTED_TYPES_MAP.keys()}")
+                f"Supported num_bits = {W4A8_SUPPORTED_TYPES_MAP.keys()}"
+            )
 
         self.quant_type = W4A8_SUPPORTED_TYPES_MAP[num_bits]
 
@@ -63,38 +69,45 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
         # hopper
         return 90
 
-    def create_weights(self, layer: torch.nn.Module, output_size: int,
-                       input_size: int, output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_size: int,
+        input_size: int,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
 
         mp_linear_kernel_config = MPLinearLayerConfig(
             full_weight_shape=(input_size, output_size),
-            partition_weight_shape=\
-                (input_size_per_partition, output_size_per_partition),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
             weight_type=self.quant_type,
             act_type=torch.float8_e4m3fn,  # always use fp8(e4m3)
             group_size=self.group_size,
             zero_points=not self.symmetric,
             has_g_idx=self.has_g_idx,
-            out_type=params_dtype
+            out_type=params_dtype,
         )
 
         kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config)
 
         if kernel_type.__name__ not in self._kernel_backends_being_used:
-            logger.info("Using %s for CompressedTensorsW4A8Fp8",
-                        kernel_type.__name__)
+            logger.info("Using %s for CompressedTensorsW4A8Fp8", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # If group_size is -1, we are in channelwise case.
         group_size = self.group_size if self.group_size != -1 else input_size
-        row_parallel = (input_size != input_size_per_partition)
+        row_parallel = input_size != input_size_per_partition
         partition_scales = not marlin_repeat_scales_on_all_ranks(
-            self.has_g_idx, self.group_size, row_parallel)
+            self.has_g_idx, self.group_size, row_parallel
+        )
 
         scales_and_zp_size = input_size // group_size
 
@@ -102,68 +115,69 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
             assert input_size_per_partition % group_size == 0
             scales_and_zp_size = input_size_per_partition // group_size
 
-        weight = PackedvLLMParameter(input_dim=1,
-                                     output_dim=0,
-                                     weight_loader=weight_loader,
-                                     packed_factor=self.pack_factor,
-                                     packed_dim=1,
-                                     data=torch.empty(
-                                         output_size_per_partition,
-                                         input_size_per_partition //
-                                         self.pack_factor,
-                                         dtype=torch.int32,
-                                     ))
+        weight = PackedvLLMParameter(
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+            packed_factor=self.pack_factor,
+            packed_dim=1,
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.pack_factor,
+                dtype=torch.int32,
+            ),
+        )
 
         # TODO(czhu): allocate the packed fp8 scales memory here?
         # the scales will be expanded by 8x via `cutlass_pack_scale_fp8`
         weight_scale_args = {
-            "weight_loader":
-            weight_loader,
-            "data":
-            torch.empty(
+            "weight_loader": weight_loader,
+            "data": torch.empty(
                 output_size_per_partition,
                 scales_and_zp_size,
                 dtype=torch.float8_e4m3fn,
-            )
+            ),
         }
 
         if not partition_scales:
-            weight_scale = ChannelQuantScaleParameter(output_dim=0,
-                                                      **weight_scale_args)
+            weight_scale = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args)
         else:
-            weight_scale = GroupQuantScaleParameter(output_dim=0,
-                                                    input_dim=1,
-                                                    **weight_scale_args)
+            weight_scale = GroupQuantScaleParameter(
+                output_dim=0, input_dim=1, **weight_scale_args
+            )
 
         # A 2D array defining the original shape of the weights
         # before packing
-        weight_shape = BasevLLMParameter(data=torch.empty(2,
-                                                          dtype=torch.int64),
-                                         weight_loader=weight_loader)
+        weight_shape = BasevLLMParameter(
+            data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader
+        )
 
         # per-channel scales
         weight_chan_scale = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1),
-                             dtype=torch.float32),
+            data=torch.empty((output_size_per_partition, 1), dtype=torch.float32),
             output_dim=0,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight_packed", weight)
         layer.register_parameter("weight_scale", weight_scale)
         layer.register_parameter("weight_shape", weight_shape)
         layer.register_parameter("weight_chan_scale", weight_chan_scale)
 
-        self.kernel = kernel_type(mp_linear_kernel_config,
-                                  w_q_param_name="weight_packed",
-                                  w_s_param_name="weight_scale",
-                                  w_zp_param_name="weight_zero_point",
-                                  w_gidx_param_name="weight_g_idx")
+        self.kernel = kernel_type(
+            mp_linear_kernel_config,
+            w_q_param_name="weight_packed",
+            w_s_param_name="weight_scale",
+            w_zp_param_name="weight_zero_point",
+            w_gidx_param_name="weight_g_idx",
+        )
 
     # Checkpoints are serialized in compressed-tensors format, which is
     # different from the format the kernel may want. Handle repacking here.
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
index f1fca85508a6b4948d685accd52ac9061e0d0d2f..aa0c52beda2b59844fea4472d573100c1cecafe0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
@@ -1,18 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
-    MPLinearLayerConfig, choose_mp_linear_kernel)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           ModelWeightParameter)
+    MPLinearLayerConfig,
+    choose_mp_linear_kernel,
+)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -27,12 +32,14 @@ W4A8_SUPPORTED_BITS = list(W4A8_SUPPORTED_TYPES_MAP.keys())
 class CompressedTensorsW4A8Int(CompressedTensorsScheme):
     _kernel_backends_being_used: set[str] = set()
 
-    def __init__(self,
-                 strategy: str,
-                 num_bits: int,
-                 group_size: Optional[int] = None,
-                 is_static_input_scheme: bool = False,
-                 input_symmetric: bool = True):
+    def __init__(
+        self,
+        strategy: str,
+        num_bits: int,
+        group_size: int | None = None,
+        is_static_input_scheme: bool = False,
+        input_symmetric: bool = True,
+    ):
         self.strategy = strategy
         self.group_size = -1 if group_size is None else group_size
         self.is_static_input_scheme = is_static_input_scheme
@@ -41,42 +48,53 @@ class CompressedTensorsW4A8Int(CompressedTensorsScheme):
         if num_bits not in W4A8_SUPPORTED_TYPES_MAP:
             raise ValueError(
                 f"Unsupported num_bits = {num_bits}."
-                f"Supported num_bits = {W4A8_SUPPORTED_TYPES_MAP.keys()}")
+                f"Supported num_bits = {W4A8_SUPPORTED_TYPES_MAP.keys()}"
+            )
         self.quant_type = W4A8_SUPPORTED_TYPES_MAP[num_bits]
 
     @classmethod
     def get_min_capability(cls) -> int:
         return 1
 
-    def create_weights(self, layer: torch.nn.Module, output_size: int,
-                       input_size: int, output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_size: int,
+        input_size: int,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
-        row_parallel = (input_size != input_size_per_partition)
+        row_parallel = input_size != input_size_per_partition
 
         # Compute effective group_size
         if self.group_size == -1:
-            effective_group_size = (input_size_per_partition
-                                    if row_parallel else input_size)
+            effective_group_size = (
+                input_size_per_partition if row_parallel else input_size
+            )
         else:
             effective_group_size = self.group_size
 
         # Ensure group_size divides input_size_per_partition
         assert input_size_per_partition % effective_group_size == 0, (
             f"input_size_per_partition {input_size_per_partition}"
-            f" not divisible by group_size {effective_group_size}")
+            f" not divisible by group_size {effective_group_size}"
+        )
 
         # Determine scale partitioning
-        is_channelwise = (self.group_size == -1)
-        repeat_scales = (is_channelwise and row_parallel)
+        is_channelwise = self.group_size == -1
+        repeat_scales = is_channelwise and row_parallel
         partition_scales = not repeat_scales
 
         mp_linear_kernel_config = MPLinearLayerConfig(
             full_weight_shape=(input_size, output_size),
-            partition_weight_shape=(input_size_per_partition,
-                                    output_size_per_partition),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
             weight_type=self.quant_type,
             act_type=params_dtype,
             group_size=effective_group_size,
@@ -86,50 +104,50 @@ class CompressedTensorsW4A8Int(CompressedTensorsScheme):
 
         kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config)
         if kernel_type.__name__ not in self._kernel_backends_being_used:
-            logger.info("Using %s for CompressedTensorsW4A8Int",
-                        kernel_type.__name__)
+            logger.info("Using %s for CompressedTensorsW4A8Int", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         scales_and_zp_size = input_size_per_partition // effective_group_size
 
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=torch.int8),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
         weight_scale_args = {
-            "weight_loader":
-            weight_loader,
-            "data":
-            torch.empty(output_size_per_partition,
-                        scales_and_zp_size,
-                        dtype=params_dtype)
+            "weight_loader": weight_loader,
+            "data": torch.empty(
+                output_size_per_partition, scales_and_zp_size, dtype=params_dtype
+            ),
         }
 
         if partition_scales:
-            weight_scale = GroupQuantScaleParameter(output_dim=0,
-                                                    input_dim=1,
-                                                    **weight_scale_args)
+            weight_scale = GroupQuantScaleParameter(
+                output_dim=0, input_dim=1, **weight_scale_args
+            )
         else:
-            weight_scale = ChannelQuantScaleParameter(output_dim=0,
-                                                      **weight_scale_args)
+            weight_scale = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args)
 
         layer.register_parameter("weight_packed", weight)
         layer.register_parameter("weight_scale", weight_scale)
 
-        self.kernel = kernel_type(mp_linear_kernel_config,
-                                  w_q_param_name="weight_packed",
-                                  w_s_param_name="weight_scale",
-                                  w_zp_param_name=None,
-                                  w_gidx_param_name=None)
+        self.kernel = kernel_type(
+            mp_linear_kernel_config,
+            w_q_param_name="weight_packed",
+            w_s_param_name="weight_scale",
+            w_zp_param_name=None,
+            w_gidx_param_name=None,
+        )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 01a87a088899679abc4f1f633a955ad3a86caacf..904a9f5d4907d79ad3bf711c492622c09ac8074c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,30 +1,33 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    convert_to_channelwise)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    convert_to_channelwise,
+)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 
 __all__ = ["CompressedTensorsW8A16Fp8"]
 
-SUPPORTED_STRATEGIES = [
-    QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR
-]
+SUPPORTED_STRATEGIES = [QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR]
 
 
 class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
-
     def __init__(self, strategy: str, is_static_input_scheme: bool):
         self.strategy = strategy
         self.is_static_input_scheme = is_static_input_scheme
@@ -39,31 +42,36 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
     # we expand each scale to its shard's channels.
     def process_weights_after_loading(self, layer) -> None:
         if self.strategy == QuantizationStrategy.TENSOR:
-            ws_channelwise = convert_to_channelwise(layer.weight_scale,
-                                                    layer.logical_widths)
-            layer.weight_scale = torch.nn.Parameter(ws_channelwise,
-                                                    requires_grad=False)
+            ws_channelwise = convert_to_channelwise(
+                layer.weight_scale, layer.logical_widths
+            )
+            layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False)
         else:
             # required by torch.compile to be torch.nn.Parameter
-            layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
-                                                    requires_grad=False)
+            layer.weight_scale = torch.nn.Parameter(
+                layer.weight_scale.data, requires_grad=False
+            )
 
         # Weights must be transposed for marlin
-        layer.weight = torch.nn.Parameter(layer.weight.t(),
-                                          requires_grad=False)
+        layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False)
 
         if self.is_static_input_scheme:
             # required by torch.compile to be torch.nn.Parameter
-            layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
-                                                   requires_grad=False)
+            layer.input_scale = torch.nn.Parameter(
+                layer.input_scale.data, requires_grad=False
+            )
         prepare_fp8_layer_for_marlin(layer)
 
-    def create_weights(self, layer: torch.nn.Module, input_size: int,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes
         layer.input_size_per_partition = input_size_per_partition
@@ -72,50 +80,59 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
         layer.weight_block_size = None
 
         # WEIGHT
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=torch.float8_e4m3fn),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
         # WEIGHT SCALE
         if self.strategy == QuantizationStrategy.CHANNEL:
             weight_scale = ChannelQuantScaleParameter(
-                data=torch.empty((sum(output_partition_sizes), 1),
-                                 dtype=torch.float32),
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
                 output_dim=0,
-                weight_loader=weight_loader)
+                weight_loader=weight_loader,
+            )
         elif self.strategy == QuantizationStrategy.TENSOR:
-            weight_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                   weight_loader=weight_loader)
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
         else:
             raise ValueError(
                 f"Unsupported weight strategy={self.strategy}, "
-                f"supported strategies are {SUPPORTED_STRATEGIES}")
+                f"supported strategies are {SUPPORTED_STRATEGIES}"
+            )
 
         weight_scale[:] = torch.finfo(torch.float32).min
         layer.register_parameter("weight_scale", weight_scale)
 
         # INPUT SCALE (to deal with converted checkpoints)
         if self.is_static_input_scheme:
-            input_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                  weight_loader=weight_loader)
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
             layer.register_parameter("input_scale", input_scale)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-        return apply_fp8_marlin_linear(input=x,
-                                       weight=layer.weight,
-                                       weight_scale=layer.weight_scale,
-                                       workspace=layer.workspace,
-                                       size_n=layer.output_size_per_partition,
-                                       size_k=layer.input_size_per_partition,
-                                       bias=bias)
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return apply_fp8_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            workspace=layer.workspace,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index d42ae22c51393c44bc930ef88c27b6baa52c9f66..ee99572f5f4999723bc33e3f3cd6d44fedfdeffc 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,28 +1,38 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
-from compressed_tensors.quantization import (QuantizationArgs,
-                                             QuantizationStrategy)
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from torch.nn import Parameter
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_fp8_block_linear, check_aiter_fp8_linear_support,
-    create_fp8_input_scale, create_fp8_scale_parameter,
-    create_fp8_weight_parameter, maybe_post_process_fp8_weight_block,
-    process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy,
-    process_fp8_weight_tensor_strategy, validate_fp8_block_shape)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+    W8A8BlockFp8LinearOp,
+    create_fp8_input_scale,
+    create_fp8_scale_parameter,
+    create_fp8_weight_parameter,
+    maybe_post_process_fp8_weight_block,
+    process_fp8_weight_block_strategy,
+    process_fp8_weight_channel_strategy,
+    process_fp8_weight_tensor_strategy,
+    validate_fp8_block_shape,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, cutlass_block_fp8_supported, maybe_create_device_identity)
-from vllm.model_executor.parameter import (BlockQuantScaleParameter,
-                                           ChannelQuantScaleParameter,
-                                           PerTensorScaleParameter)
+    Fp8LinearOp,
+    cutlass_block_fp8_supported,
+    maybe_create_device_identity,
+)
+from vllm.model_executor.parameter import (
+    BlockQuantScaleParameter,
+    ChannelQuantScaleParameter,
+    PerTensorScaleParameter,
+)
 
 __all__ = ["CompressedTensorsW8A8Fp8"]
 
@@ -34,85 +44,117 @@ strategy_to_parameter_type = {
 
 
 class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
-
-    def __init__(self, weight_quant: QuantizationArgs,
-                 is_static_input_scheme: bool):
+    def __init__(self, weight_quant: QuantizationArgs, is_static_input_scheme: bool):
         self.weight_quant = weight_quant
         self.strategy = weight_quant.strategy
         self.out_dtype = torch.get_default_dtype()
         self.is_static_input_scheme = is_static_input_scheme
-        self.act_q_group_shape = GroupShape.PER_TENSOR \
-            if is_static_input_scheme else GroupShape.PER_TOKEN
-        self.fp8_linear = Fp8LinearOp(
-            act_quant_static=self.is_static_input_scheme,
-            act_quant_group_shape=self.act_q_group_shape)
 
         self.weight_block_size = self.weight_quant.block_structure
+        if self.weight_block_size is not None:
+            self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
+        else:
+            self.act_q_group_shape = (
+                GroupShape.PER_TENSOR
+                if is_static_input_scheme
+                else GroupShape.PER_TOKEN
+            )
+
         self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
-        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
+
+        if self.weight_block_size is not None:
+            assert not self.is_static_input_scheme
+            self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
+                weight_group_shape=GroupShape(*self.weight_block_size),
+                act_quant_group_shape=self.act_q_group_shape,
+                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
+                use_aiter_and_is_supported=self.use_aiter_and_is_supported,
+            )
+        else:
+            self.fp8_linear = Fp8LinearOp(
+                act_quant_static=self.is_static_input_scheme,
+                act_quant_group_shape=self.act_q_group_shape,
+            )
 
     @classmethod
     def get_min_capability(cls) -> int:
         # lovelace and up
         return 89
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       weight_loader: Callable, **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         maybe_create_device_identity()
 
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes
         layer.weight_block_size = None
+        layer.orig_dtype = params_dtype
 
         if self.strategy == QuantizationStrategy.BLOCK:
             assert self.weight_block_size is not None
             layer.weight_block_size = self.weight_block_size
             # Validate block quantization shapes
-            validate_fp8_block_shape(layer, input_size, output_size,
-                                     input_size_per_partition,
-                                     output_partition_sizes,
-                                     self.weight_block_size)
+            validate_fp8_block_shape(
+                layer,
+                input_size,
+                output_size,
+                input_size_per_partition,
+                output_partition_sizes,
+                self.weight_block_size,
+            )
 
         # WEIGHT
-        weight = create_fp8_weight_parameter(output_size_per_partition,
-                                             input_size_per_partition,
-                                             weight_loader)
+        weight = create_fp8_weight_parameter(
+            output_size_per_partition, input_size_per_partition, weight_loader
+        )
         layer.register_parameter("weight", weight)
 
         # WEIGHT SCALE
         weight_scale = create_fp8_scale_parameter(
-            strategy_to_parameter_type[self.strategy], output_partition_sizes,
-            input_size_per_partition, layer.weight_block_size, weight_loader)
+            strategy_to_parameter_type[self.strategy],
+            output_partition_sizes,
+            input_size_per_partition,
+            layer.weight_block_size,
+            weight_loader,
+        )
         layer.register_parameter("weight_scale", weight_scale)
 
         # INPUT SCALE
         if self.is_static_input_scheme:
-            input_scale = create_fp8_input_scale(output_partition_sizes,
-                                                 weight_loader)
+            input_scale = create_fp8_input_scale(output_partition_sizes, weight_loader)
             layer.register_parameter("input_scale", input_scale)
 
     def process_weights_after_loading(self, layer) -> None:
         if self.strategy == QuantizationStrategy.TENSOR:
-            weight, weight_scale, input_scale = (
-                process_fp8_weight_tensor_strategy(
-                    layer.weight, layer.weight_scale, layer.logical_widths,
-                    getattr(layer, 'input_scale', None)))
+            weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(
+                layer.weight,
+                layer.weight_scale,
+                layer.logical_widths,
+                getattr(layer, "input_scale", None),
+            )
             weight = weight.t()
 
         elif self.strategy == QuantizationStrategy.CHANNEL:
-            weight, weight_scale, input_scale = (
-                process_fp8_weight_channel_strategy(
-                    layer.weight, layer.weight_scale,
-                    getattr(layer, 'input_scale', None)))
+            weight, weight_scale, input_scale = process_fp8_weight_channel_strategy(
+                layer.weight, layer.weight_scale, getattr(layer, "input_scale", None)
+            )
             weight = weight.t()
 
         elif self.strategy == QuantizationStrategy.BLOCK:
             assert self.is_static_input_scheme is False
             weight, weight_scale = process_fp8_weight_block_strategy(
-                layer.weight, layer.weight_scale)
+                layer.weight, layer.weight_scale
+            )
             input_scale = None
 
         else:
@@ -122,36 +164,37 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
         layer.weight = Parameter(weight.data, requires_grad=False)
         layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
         if input_scale is not None:
-            layer.input_scale = Parameter(input_scale.data,
-                                          requires_grad=False)
+            layer.input_scale = Parameter(input_scale.data, requires_grad=False)
 
         # INPUT SCALE
-        if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
-            layer.input_scale = Parameter(layer.input_scale.max(),
-                                          requires_grad=False)
+        if self.is_static_input_scheme and hasattr(layer, "input_scale"):
+            layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
         else:
             layer.input_scale = None
 
         if self.strategy == QuantizationStrategy.BLOCK:
-            maybe_post_process_fp8_weight_block(
-                layer, self.cutlass_block_fp8_supported)
-
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-        if layer.weight_block_size is not None:
-            return apply_fp8_block_linear(
-                layer,
+            maybe_post_process_fp8_weight_block(layer)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if self.weight_block_size is not None:
+            return self.w8a8_block_fp8_linear.apply(
                 input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                input_scale=layer.input_scale,
                 bias=bias,
-                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
-                use_aiter_and_is_supported=self.use_aiter_and_is_supported)
-
-        return self.fp8_linear.apply(input=x,
-                                     weight=layer.weight,
-                                     weight_scale=layer.weight_scale,
-                                     out_dtype=self.out_dtype,
-                                     input_scale=layer.input_scale,
-                                     bias=bias)
+            )
+
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            out_dtype=self.out_dtype,
+            input_scale=layer.input_scale,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 6189f0609d85d19ec94151cb6220f757a78bb5b3..6fd0a6a1c822ca9b1ab34ce4d488c8cb797b22c4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,20 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
-    ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    ScaledMMLinearLayerConfig,
+    choose_scaled_mm_linear_kernel,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 
 logger = init_logger(__name__)
 
@@ -22,8 +27,9 @@ logger = init_logger(__name__)
 class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
     _kernel_backends_being_used: set[str] = set()
 
-    def __init__(self, strategy: str, is_static_input_scheme: bool,
-                 input_symmetric: bool):
+    def __init__(
+        self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
+    ):
         self.strategy = strategy
         self.is_static_input_scheme = is_static_input_scheme
         self.input_symmetric = input_symmetric
@@ -33,56 +39,61 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
         # turing and up
         return 75
 
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         layer.logical_widths = output_partition_sizes
 
         scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig(
             is_channelwise=(self.strategy == QuantizationStrategy.CHANNEL),
             is_static_input_scheme=self.is_static_input_scheme,
-            input_symmetric=self.input_symmetric)
+            input_symmetric=self.input_symmetric,
+        )
 
-        kernel_type = choose_scaled_mm_linear_kernel(
-            scaled_mm_linear_kernel_config)
+        kernel_type = choose_scaled_mm_linear_kernel(scaled_mm_linear_kernel_config)
 
         if kernel_type.__name__ not in self._kernel_backends_being_used:
-            logger.info("Using %s for CompressedTensorsW8A8Int8",
-                        kernel_type.__name__)
+            logger.info("Using %s for CompressedTensorsW8A8Int8", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # WEIGHT
-        weight = ModelWeightParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition,
-            dtype=torch.int8),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight", weight)
 
         # WEIGHT SCALE
         if self.strategy == QuantizationStrategy.CHANNEL:
             weight_scale = ChannelQuantScaleParameter(
-                data=torch.empty((sum(output_partition_sizes), 1),
-                                 dtype=torch.float32),
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
                 output_dim=0,
-                weight_loader=weight_loader)
+                weight_loader=weight_loader,
+            )
         else:
             assert self.strategy == QuantizationStrategy.TENSOR
-            weight_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                   weight_loader=weight_loader)
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
         layer.register_parameter("weight_scale", weight_scale)
 
         # INPUT SCALE
         if self.is_static_input_scheme:
-            input_scale = BasevLLMParameter(data=torch.empty(
-                1, dtype=torch.float32),
-                                            weight_loader=weight_loader)
+            input_scale = BasevLLMParameter(
+                data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader
+            )
             layer.register_parameter("input_scale", input_scale)
 
             if not self.input_symmetric:
@@ -90,22 +101,25 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
                 # as the weights
                 # AZP loaded as int8 but used as int32
                 input_zero_point = BasevLLMParameter(
-                    data=torch.empty(1, dtype=torch.int8),
-                    weight_loader=weight_loader)
+                    data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader
+                )
                 layer.register_parameter("input_zero_point", input_zero_point)
 
-        self.kernel = kernel_type(c=scaled_mm_linear_kernel_config,
-                                  w_q_param_name="weight",
-                                  w_s_param_name="weight_scale",
-                                  i_s_param_name="input_scale",
-                                  i_zp_param_name="input_zero_point",
-                                  azp_adj_param_name="azp_adj")
+        self.kernel = kernel_type(
+            c=scaled_mm_linear_kernel_config,
+            w_q_param_name="weight",
+            w_s_param_name="weight_scale",
+            i_s_param_name="input_scale",
+            i_zp_param_name="input_zero_point",
+            azp_adj_param_name="azp_adj",
+        )
 
     # Checkpoints are serialized in compressed-tensors format, which is
     # different from the format the kernel may want. Handle repacking here.
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 74787603e00295a8bf3561425473e004a9fd36f4..2267395fe67d331534fa74c92a4ce829180b73a7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,36 +1,36 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import ActivationOrdering
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
-    MPLinearLayerConfig, choose_mp_linear_kernel)
+    MPLinearLayerConfig,
+    choose_mp_linear_kernel,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    marlin_repeat_scales_on_all_ranks)
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedColumnParameter,
-                                           PackedvLLMParameter,
-                                           RowvLLMParameter)
-# yapf: enable
+    marlin_repeat_scales_on_all_ranks,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    RowvLLMParameter,
+)
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
 
 __all__ = ["CompressedTensorsWNA16"]
-WNA16_SUPPORTED_TYPES_MAP = {
-    4: scalar_types.uint4b8,
-    8: scalar_types.uint8b128
-}
+WNA16_SUPPORTED_TYPES_MAP = {4: scalar_types.uint4b8, 8: scalar_types.uint8b128}
 WNA16_ZP_SUPPORTED_TYPES_MAP = {4: scalar_types.uint4, 8: scalar_types.uint8}
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 
@@ -38,13 +38,14 @@ WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 class CompressedTensorsWNA16(CompressedTensorsScheme):
     _kernel_backends_being_used: set[str] = set()
 
-    def __init__(self,
-                 strategy: str,
-                 num_bits: int,
-                 group_size: Optional[int] = None,
-                 symmetric: Optional[bool] = True,
-                 actorder: Optional[ActivationOrdering] = None):
-
+    def __init__(
+        self,
+        strategy: str,
+        num_bits: int,
+        group_size: int | None = None,
+        symmetric: bool | None = True,
+        actorder: ActivationOrdering | None = None,
+    ):
         self.pack_factor = 32 // num_bits
         self.strategy = strategy
         self.symmetric = symmetric
@@ -52,55 +53,67 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
         self.has_g_idx = actorder == ActivationOrdering.GROUP
 
         if self.group_size == -1 and self.strategy != "channel":
-            raise ValueError("Marlin kernels require group quantization or "
-                             "channelwise quantization, but found no group "
-                             "size and strategy is not channelwise.")
+            raise ValueError(
+                "Marlin kernels require group quantization or "
+                "channelwise quantization, but found no group "
+                "size and strategy is not channelwise."
+            )
 
         if num_bits not in WNA16_SUPPORTED_TYPES_MAP:
             raise ValueError(
                 f"Unsupported num_bits = {num_bits}. "
-                f"Supported num_bits = {WNA16_SUPPORTED_TYPES_MAP.keys()}")
+                f"Supported num_bits = {WNA16_SUPPORTED_TYPES_MAP.keys()}"
+            )
 
-        self.quant_type = (WNA16_ZP_SUPPORTED_TYPES_MAP[num_bits]
-                           if not self.symmetric else
-                           WNA16_SUPPORTED_TYPES_MAP[num_bits])
+        self.quant_type = (
+            WNA16_ZP_SUPPORTED_TYPES_MAP[num_bits]
+            if not self.symmetric
+            else WNA16_SUPPORTED_TYPES_MAP[num_bits]
+        )
 
     @classmethod
     def get_min_capability(cls) -> int:
         # ampere and up
         return 80
 
-    def create_weights(self, layer: torch.nn.Module, output_size: int,
-                       input_size: int, output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_size: int,
+        input_size: int,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
 
         mp_linear_kernel_config = MPLinearLayerConfig(
             full_weight_shape=(input_size, output_size),
-            partition_weight_shape=\
-                (input_size_per_partition, output_size_per_partition),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
             weight_type=self.quant_type,
             act_type=params_dtype,
             group_size=self.group_size,
             zero_points=not self.symmetric,
-            has_g_idx=self.has_g_idx
+            has_g_idx=self.has_g_idx,
         )
 
         kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config)
 
         if kernel_type.__name__ not in self._kernel_backends_being_used:
-            logger.info("Using %s for CompressedTensorsWNA16",
-                        kernel_type.__name__)
+            logger.info("Using %s for CompressedTensorsWNA16", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # If group_size is -1, we are in channelwise case.
         group_size = self.group_size if self.group_size != -1 else input_size
-        row_parallel = (input_size != input_size_per_partition)
+        row_parallel = input_size != input_size_per_partition
         partition_scales = not marlin_repeat_scales_on_all_ranks(
-            self.has_g_idx, self.group_size, row_parallel)
+            self.has_g_idx, self.group_size, row_parallel
+        )
 
         scales_and_zp_size = input_size // group_size
 
@@ -108,65 +121,65 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
             assert input_size_per_partition % group_size == 0
             scales_and_zp_size = input_size_per_partition // group_size
 
-        weight = PackedvLLMParameter(input_dim=1,
-                                     output_dim=0,
-                                     weight_loader=weight_loader,
-                                     packed_factor=self.pack_factor,
-                                     packed_dim=1,
-                                     data=torch.empty(
-                                         output_size_per_partition,
-                                         input_size_per_partition //
-                                         self.pack_factor,
-                                         dtype=torch.int32,
-                                     ))
+        weight = PackedvLLMParameter(
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+            packed_factor=self.pack_factor,
+            packed_dim=1,
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.pack_factor,
+                dtype=torch.int32,
+            ),
+        )
 
         weight_scale_args = {
-            "weight_loader":
-            weight_loader,
-            "data":
-            torch.empty(
+            "weight_loader": weight_loader,
+            "data": torch.empty(
                 output_size_per_partition,
                 scales_and_zp_size,
                 dtype=params_dtype,
-            )
+            ),
         }
 
         zeros_args = {
-            "weight_loader":
-            weight_loader,
-            "data":
-            torch.zeros(
+            "weight_loader": weight_loader,
+            "data": torch.zeros(
                 output_size_per_partition // self.pack_factor,
                 scales_and_zp_size,
                 dtype=torch.int32,
-            )
+            ),
         }
 
         if not partition_scales:
-            weight_scale = ChannelQuantScaleParameter(output_dim=0,
-                                                      **weight_scale_args)
+            weight_scale = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args)
 
             if not self.symmetric:
-                qzeros = PackedColumnParameter(output_dim=0,
-                                               packed_dim=0,
-                                               packed_factor=self.pack_factor,
-                                               **zeros_args)
+                qzeros = PackedColumnParameter(
+                    output_dim=0,
+                    packed_dim=0,
+                    packed_factor=self.pack_factor,
+                    **zeros_args,
+                )
         else:
-            weight_scale = GroupQuantScaleParameter(output_dim=0,
-                                                    input_dim=1,
-                                                    **weight_scale_args)
+            weight_scale = GroupQuantScaleParameter(
+                output_dim=0, input_dim=1, **weight_scale_args
+            )
             if not self.symmetric:
-                qzeros = PackedvLLMParameter(input_dim=1,
-                                             output_dim=0,
-                                             packed_dim=0,
-                                             packed_factor=self.pack_factor,
-                                             **zeros_args)
+                qzeros = PackedvLLMParameter(
+                    input_dim=1,
+                    output_dim=0,
+                    packed_dim=0,
+                    packed_factor=self.pack_factor,
+                    **zeros_args,
+                )
 
         # A 2D array defining the original shape of the weights
         # before packing
-        weight_shape = BasevLLMParameter(data=torch.empty(2,
-                                                          dtype=torch.int64),
-                                         weight_loader=weight_loader)
+        weight_shape = BasevLLMParameter(
+            data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader
+        )
 
         layer.register_parameter("weight_packed", weight)
         layer.register_parameter("weight_scale", weight_scale)
@@ -177,25 +190,30 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
 
         # group index (for activation reordering)
         if self.has_g_idx:
-            weight_g_idx = RowvLLMParameter(data=torch.empty(
-                input_size_per_partition,
-                dtype=torch.int32,
-            ),
-                                            input_dim=0,
-                                            weight_loader=weight_loader)
+            weight_g_idx = RowvLLMParameter(
+                data=torch.empty(
+                    input_size_per_partition,
+                    dtype=torch.int32,
+                ),
+                input_dim=0,
+                weight_loader=weight_loader,
+            )
             layer.register_parameter("weight_g_idx", weight_g_idx)
 
-        self.kernel = kernel_type(mp_linear_kernel_config,
-                                  w_q_param_name="weight_packed",
-                                  w_s_param_name="weight_scale",
-                                  w_zp_param_name="weight_zero_point",
-                                  w_gidx_param_name="weight_g_idx")
+        self.kernel = kernel_type(
+            mp_linear_kernel_config,
+            w_q_param_name="weight_packed",
+            w_s_param_name="weight_scale",
+            w_zp_param_name="weight_zero_point",
+            w_gidx_param_name="weight_g_idx",
+        )
 
     # Checkpoints are serialized in compressed-tensors format, which is
     # different from the format the kernel may want. Handle repacking here.
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
index d098185146e41c34a492b2102ebdc38c80751de7..bd1964e667d9a52f90a2448d304e435f2f92eb7a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
@@ -1,23 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Generator
+from collections.abc import Callable, Generator
 from itertools import accumulate
-from typing import Callable, Optional
 
 import torch
-from compressed_tensors.transform import (TransformArgs, TransformConfig,
-                                          TransformLocation, TransformScheme)
+from compressed_tensors.transform import (
+    TransformArgs,
+    TransformConfig,
+    TransformLocation,
+    TransformScheme,
+)
 from compressed_tensors.utils import is_match
 
-from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED,
-                                               LinearMethodBase,
-                                               QKVCrossParallelLinear)
+from vllm.model_executor.layers.linear import (
+    WEIGHT_LOADER_V2_SUPPORTED,
+    LinearMethodBase,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsScheme)
+    CompressedTensorsScheme,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import (  # noqa: E501
-    HadamardTransform)
+    HadamardTransform,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import (  # noqa: E501
-    TransformTuple)
+    TransformTuple,
+)
 
 
 class CompressedTensorsLinearTransformMethod(LinearMethodBase):
@@ -30,42 +37,49 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
     def from_schemes(
         cls,
         quant_method: LinearMethodBase,
-        quant_scheme: Optional[CompressedTensorsScheme],
+        quant_scheme: CompressedTensorsScheme | None,
         input_tfms: dict[int, TransformTuple],
         output_tfms: dict[int, TransformTuple],
     ) -> "CompressedTensorsLinearTransformMethod":
         from vllm.model_executor.layers.quantization.compressed_tensors.transform.schemes.linear_qutlass_nvfp4 import (  # noqa: E501
-            QutlassNvFP4LinearMethod, is_qutlass_fp4_scheme)
+            QutlassNvFP4LinearMethod,
+            is_qutlass_fp4_scheme,
+        )
 
         assert input_tfms or output_tfms
 
         if is_qutlass_fp4_scheme(quant_scheme, input_tfms):
-            return QutlassNvFP4LinearMethod(quant_method, input_tfms,
-                                            output_tfms)
+            return QutlassNvFP4LinearMethod(quant_method, input_tfms, output_tfms)
 
         # hadacore or dense gemm is selected by Transform module
 
         return cls(quant_method, input_tfms, output_tfms)
 
-    def __init__(self, quant_method: LinearMethodBase,
-                 input_tfms: dict[int, TransformTuple],
-                 output_tfms: dict[int, TransformTuple]):
+    def __init__(
+        self,
+        quant_method: LinearMethodBase,
+        input_tfms: dict[int, TransformTuple],
+        output_tfms: dict[int, TransformTuple],
+    ):
         self.quant_method = quant_method
         self.input_tfms = input_tfms
         self.output_tfms = output_tfms
 
-        self.input_transform: Optional[HadamardTransform] = None
-        self.output_transform: Optional[HadamardTransform] = None
-
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
-
+        self.input_transform: HadamardTransform | None = None
+        self.output_transform: HadamardTransform | None = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         # get weight loader for transforms
-        weight_loader: Callable = extra_weight_attrs.get(
-            "weight_loader")  # type: ignore[assignment]
+        weight_loader: Callable = extra_weight_attrs.get("weight_loader")  # type: ignore[assignment]
 
         # HACK: UnquantizedLinearMethod does not support weight loader v2, but
         # transforms (specifically SharedWeightParameter) requires
@@ -73,10 +87,7 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
         # hack around this by getting weight loader v1 so ULM can load correctly
         quant_method_name = self.quant_method.__class__.__name__
         if quant_method_name not in WEIGHT_LOADER_V2_SUPPORTED:
-            if isinstance(layer, QKVCrossParallelLinear):
-                weight_loader_v1 = layer.weight_loader_v1
-            else:
-                weight_loader_v1 = layer.weight_loader
+            weight_loader_v1 = layer.weight_loader
             extra_weight_attrs["weight_loader"] = weight_loader_v1
 
         self.quant_method.create_weights(
@@ -86,7 +97,8 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
             input_size=input_size,
             output_size=output_size,
             params_dtype=params_dtype,
-            **extra_weight_attrs)
+            **extra_weight_attrs,
+        )
 
         # validate schemes
         num_partitions = len(output_partition_sizes)
@@ -98,10 +110,13 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
             location = list(self.input_tfms.values())[0].args.location
             transform_name = f"{scheme_name}_{location}"
 
-            transform = HadamardTransform(self.input_tfms, layer,
-                                          weight_loader,
-                                          input_size_per_partition,
-                                          output_partition_sizes)
+            transform = HadamardTransform(
+                self.input_tfms,
+                layer,
+                weight_loader,
+                input_size_per_partition,
+                output_partition_sizes,
+            )
             layer.register_module(transform_name, transform)
             self.input_transform = transform
 
@@ -110,10 +125,13 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
             location = list(self.output_tfms.values())[0].args.location
             transform_name = f"{scheme_name}_{location}"
 
-            transform = HadamardTransform(self.output_tfms, layer,
-                                          weight_loader,
-                                          input_size_per_partition,
-                                          output_partition_sizes)
+            transform = HadamardTransform(
+                self.output_tfms,
+                layer,
+                weight_loader,
+                input_size_per_partition,
+                output_partition_sizes,
+            )
             layer.register_module(transform_name, transform)
             self.output_transform = transform
 
@@ -128,11 +146,12 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
             if isinstance(submodule, HadamardTransform):
                 submodule.process_weights_after_loading()
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         if self.input_transform is not None:
             x = self.input_transform(x)
 
@@ -143,8 +162,9 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
         # (@ksayers): confirm that this is done concurrently
         if self.output_transform is not None:
             for part_id, (start, length) in enumerate(self.partition_ranges):
-                x[:, start:start + length] = self.output_transform(
-                    x[:, start:start + length].contiguous(), part_id=part_id)
+                x[:, start : start + length] = self.output_transform(
+                    x[:, start : start + length].clone(), part_id=part_id
+                )
 
         return x
 
@@ -171,39 +191,41 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
 
 
 def get_linear_transform_schemes(
-    layer: torch.nn.Module, layer_name: str,
-    transform_config: Optional[TransformConfig],
-    packed_modules_mapping: dict[str, list[str]]
-) -> tuple[dict[int, TransformTuple], dict[
-        int, TransformTuple]]:  # [input_transform, [output_transform, ...]]
+    layer: torch.nn.Module,
+    layer_name: str,
+    transform_config: TransformConfig | None,
+    packed_modules_mapping: dict[str, list[str]],
+) -> tuple[
+    dict[int, TransformTuple], dict[int, TransformTuple]
+]:  # [input_transform, [output_transform, ...]]
     # there can only be one transform input scheme per (fused) module
     input_tfms = {}
     output_tfms = {}
 
-    partition_names = get_layer_partition_names(layer_name,
-                                                packed_modules_mapping)
+    partition_names = get_layer_partition_names(layer_name, packed_modules_mapping)
 
     for scheme_name, scheme, args in get_schemes_args(transform_config):
         for part_index, part_name in enumerate(partition_names):
-            if is_match(part_name, layer, args.targets,
-                        args.ignore) and args.is_online():
+            if (
+                is_match(part_name, layer, args.targets, args.ignore)
+                and args.is_online()
+            ):
                 if args.location == TransformLocation.INPUT:
-                    input_tfms[part_index] = TransformTuple(
-                        scheme_name, scheme, args)
+                    input_tfms[part_index] = TransformTuple(scheme_name, scheme, args)
 
                 elif args.location == TransformLocation.OUTPUT:
-                    output_tfms[part_index] = TransformTuple(
-                        scheme_name, scheme, args)
+                    output_tfms[part_index] = TransformTuple(scheme_name, scheme, args)
 
                 else:
-                    raise ValueError(f"Cannot apply `{args.location}` "
-                                     f"transform to `{layer_name}`")
+                    raise ValueError(
+                        f"Cannot apply `{args.location}` transform to `{layer_name}`"
+                    )
 
     return (input_tfms, output_tfms)
 
 
 def get_schemes_args(
-    transform_config: Optional[TransformConfig]
+    transform_config: TransformConfig | None,
 ) -> Generator[tuple[str, TransformScheme, TransformArgs]]:
     if transform_config is None:
         return
@@ -214,20 +236,20 @@ def get_schemes_args(
 
 
 def get_layer_partition_names(
-        layer_name: str, packed_modules_mapping: dict[str,
-                                                      list[str]]) -> list[str]:
+    layer_name: str, packed_modules_mapping: dict[str, list[str]]
+) -> list[str]:
     """
     Get all partition names associated with this layer.
     Names are returned in order of their partition indices.
-    
+
     ```python
     mapping = {"gate_up_proj", "gate_proj", "up_proj"}
 
-    assert get_layer_partition_names(
-        "mlp.gate_up_proj", mapping) == ["gate_proj", "up_proj"]
-    assert get_layer_partition_names(
-        "mlp.down_proj", mapping) == ["down_proj"]
-    """
+    assert get_layer_partition_names("mlp.gate_up_proj", mapping) == [
+        "gate_proj",
+        "up_proj",
+    ]
+    assert get_layer_partition_names("mlp.down_proj", mapping) == ["down_proj"]"""
     for fused_suffix, part_suffixes in packed_modules_mapping.items():
         if layer_name.endswith(fused_suffix):
             return [
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
index 5e863354715e4c2230c074037dd998cc92c854f1..f5589c8c07fa6aed5dad18bbda70d06d492877ff 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
@@ -1,23 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
-from collections.abc import Hashable
-from typing import Callable
+from collections.abc import Callable, Hashable
 
 import torch
-from compressed_tensors.transform import (TransformArgs, TransformLocation,
-                                          TransformScheme)
+from compressed_tensors.transform import (
+    TransformArgs,
+    TransformLocation,
+    TransformScheme,
+)
 from torch import Tensor
 
 import vllm._custom_ops as ops
-from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_world_size)
+from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import (  # noqa: E501
-    TransformTuple)
+    TransformTuple,
+)
 from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.parameter import SharedWeightParameter
 
 
@@ -27,22 +28,28 @@ class HadamardTransform(torch.nn.Module):
     transforms. Meant to be used with `CompressedTensorsLinearTransformMethod`
     and attention transforms method (not implemented yet)
     """
+
     transforms: dict[int, TransformTuple]  # info parsed from transforms config
     weight: SharedWeightParameter  # container for shared tensors
 
     scales: dict[int, float]  # hadamard scale, usually sqrt(matrix.size(0))
 
-    def __init__(self, transforms: dict[int, TransformTuple],
-                 layer: torch.nn.Module, weight_loader: Callable,
-                 input_size_per_partition: int,
-                 output_partition_sizes: list[int]):
+    def __init__(
+        self,
+        transforms: dict[int, TransformTuple],
+        layer: torch.nn.Module,
+        weight_loader: Callable,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+    ):
         super().__init__()
         self.transforms = transforms
         self.scales = {}
 
         if get_tensor_model_parallel_world_size() > 1:
-            raise NotImplementedError("Online transforms with tensor "
-                                      "parallelism is not supported")
+            raise NotImplementedError(
+                "Online transforms with tensor parallelism is not supported"
+            )
 
         # Similar to row/col parallel params, but tensors are separate
         # to allow for loading with shared memory
@@ -50,11 +57,11 @@ class HadamardTransform(torch.nn.Module):
 
         # create shared partition data for each partition of the original weight
         input_size = input_size_per_partition
-        for part_index, (_scheme_name, scheme,
-                         args) in self.transforms.items():
+        for part_index, (_scheme_name, scheme, args) in self.transforms.items():
             output_size = output_partition_sizes[part_index]
-            weight_size = self._get_weight_size(layer, scheme, args,
-                                                input_size, output_size)
+            weight_size = self._get_weight_size(
+                layer, scheme, args, input_size, output_size
+            )
 
             data_key = self._get_data_key(scheme, weight_size)
             self.weight.add_partition(
@@ -101,28 +108,41 @@ class HadamardTransform(torch.nn.Module):
         # fall back to dense
         else:
             weight = self.weight.partitions[part_id]
-            weight = weight if self.transforms[
-                part_id].args.inverse else weight.T  # linear := x(W.T)
+            weight = (
+                weight if self.transforms[part_id].args.inverse else weight.T
+            )  # linear := x(W.T)
             scale = self.scales[part_id]
 
             if self.transforms[part_id].scheme.head_dim is not None:
                 value = value.unflatten(-1, (-1, weight.size(0)))
-                value = dispatch_unquantized_gemm()(self, value.to(
-                    weight.dtype), weight, None).to(value.dtype) * scale
+                value = (
+                    dispatch_unquantized_gemm()(
+                        self, value.to(weight.dtype), weight, None
+                    ).to(value.dtype)
+                    * scale
+                )
                 value = value.flatten(-2, -1)
 
                 return value
 
-            return dispatch_unquantized_gemm()(self, value.to(
-                weight.dtype), weight, None).to(value.dtype) * scale
+            return (
+                dispatch_unquantized_gemm()(
+                    self, value.to(weight.dtype), weight, None
+                ).to(value.dtype)
+                * scale
+            )
 
-    def _get_data_key(self, scheme: TransformScheme,
-                      weight_size: int) -> Hashable:
+    def _get_data_key(self, scheme: TransformScheme, weight_size: int) -> Hashable:
         return (id(scheme), weight_size)
 
-    def _get_weight_size(self, layer: torch.nn.Module, scheme: TransformScheme,
-                         args: TransformArgs, input_size: int,
-                         output_size: int) -> int:
+    def _get_weight_size(
+        self,
+        layer: torch.nn.Module,
+        scheme: TransformScheme,
+        args: TransformArgs,
+        input_size: int,
+        output_size: int,
+    ) -> int:
         if scheme.head_dim is not None:
             return scheme.head_dim
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
index 69b39f31eec1e95c152d30555c1322ec6a8d7dd6..f0bb47a728ad5bd1f803d8727f0fc5a4b3bd3a50 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
@@ -1,46 +1,64 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsScheme, CompressedTensorsW4A4Fp4)
+    CompressedTensorsScheme,
+    CompressedTensorsW4A4Fp4,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import (  # noqa: E501
-    CompressedTensorsLinearTransformMethod, TransformTuple)
+    CompressedTensorsLinearTransformMethod,
+    TransformTuple,
+)
 
 __all__ = ["is_qutlass_fp4_scheme", "QutlassNvFP4LinearMethod"]
 
 
-def is_qutlass_fp4_scheme(quant_scheme: Optional[CompressedTensorsScheme],
-                          input_tfms: dict[int, TransformTuple]) -> bool:
-    return isinstance(
-        quant_scheme,
-        (CompressedTensorsW4A4Fp4, )) and len(input_tfms) == 1 and input_tfms[
-            0].scheme.head_dim == quant_scheme.group_size
+def is_qutlass_fp4_scheme(
+    quant_scheme: CompressedTensorsScheme | None,
+    input_tfms: dict[int, TransformTuple],
+) -> bool:
+    return (
+        isinstance(quant_scheme, (CompressedTensorsW4A4Fp4,))
+        and len(input_tfms) == 1
+        and input_tfms[0].scheme.head_dim == quant_scheme.group_size
+    )
 
 
 class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod):
-
-    def create_weights(self, layer, input_size_per_partition,
-                       output_partition_sizes, input_size, output_size,
-                       params_dtype, **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer,
+        input_size_per_partition,
+        output_partition_sizes,
+        input_size,
+        output_size,
+        params_dtype,
+        **extra_weight_attrs,
+    ):
         # initializes fp4 qparams
-        assert isinstance(layer.scheme, (CompressedTensorsW4A4Fp4, ))
-        ret = super().create_weights(layer, input_size_per_partition,
-                                     output_partition_sizes, input_size,
-                                     output_size, params_dtype,
-                                     **extra_weight_attrs)
+        assert isinstance(layer.scheme, (CompressedTensorsW4A4Fp4,))
+        ret = super().create_weights(
+            layer,
+            input_size_per_partition,
+            output_partition_sizes,
+            input_size,
+            output_size,
+            params_dtype,
+            **extra_weight_attrs,
+        )
 
         assert self.input_transform is not None
         assert len(self.input_transform.weight) == 1
-        assert self.input_transform.weight[0].size(
-            0) == layer.scheme.group_size
+        assert self.input_transform.weight[0].size(0) == layer.scheme.group_size
 
         return ret
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         raise NotImplementedError()
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index d926b4c12db14cac5703af5efd27f85b29113d00..25c7d335da200cacd7fc08d16e209d863b369348 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -17,13 +16,29 @@ def is_weak_contiguous(x: torch.Tensor):
 
 
 @triton.jit
-def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr,
-                     M, N, K, stride_am, stride_ak, stride_bk, stride_bn,
-                     stride_cm, stride_cn, ACCUMULATOR_DTYPE: tl.constexpr,
-                     BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
-                     BLOCK_SIZE_K: tl.constexpr,
-                     BLOCK_SIZE_SCALE_A: tl.constexpr,
-                     BLOCK_SIZE_SCALE_B: tl.constexpr):
+def scaled_mm_kernel(
+    a_ptr,
+    b_ptr,
+    scale_a_ptr,
+    scale_b_ptr,
+    c_ptr,
+    bias_ptr,
+    M,
+    N,
+    K,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    ACCUMULATOR_DTYPE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    BLOCK_SIZE_SCALE_A: tl.constexpr,
+    BLOCK_SIZE_SCALE_B: tl.constexpr,
+):
     pid = tl.program_id(axis=0)
 
     num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
@@ -32,8 +47,7 @@ def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr,
     pid_n = pid % num_pid_n
 
     accumulator_dtype = ACCUMULATOR_DTYPE
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
-                           dtype=accumulator_dtype)
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
 
     # NOTE: Some tensor inputs are so large, they will cause int32 overflow
     # so it is necessary to use tl.int64 for all the offsets, else SEGV will
@@ -47,20 +61,22 @@ def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr,
     masks_bn = offsets_bn < N
 
     offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64)
-    offsets_a = (stride_am * offsets_am[:, None] +
-                 stride_ak * offsets_k[None, :])
-    offsets_b = (stride_bk * offsets_k[:, None] +
-                 stride_bn * offsets_bn[None, :])
+    offsets_a = stride_am * offsets_am[:, None] + stride_ak * offsets_k[None, :]
+    offsets_b = stride_bk * offsets_k[:, None] + stride_bn * offsets_bn[None, :]
 
     # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create
     # appropriate offsets and masks for each case. Same goes for
     # BLOCK_SIZE_SCALE_B.
-    offsets_scale_am = (tl.arange(0, BLOCK_SIZE_SCALE_A) +
-                        (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M)
+    offsets_scale_am = (
+        tl.arange(0, BLOCK_SIZE_SCALE_A)
+        + (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M
+    )
     masks_scale_am = offsets_scale_am < M
 
-    offsets_scale_bn = (tl.arange(0, BLOCK_SIZE_SCALE_B) +
-                        (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N)
+    offsets_scale_bn = (
+        tl.arange(0, BLOCK_SIZE_SCALE_B)
+        + (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N
+    )
     masks_scale_bn = offsets_scale_bn < N
 
     a_ptrs = a_ptr + offsets_a
@@ -114,8 +130,7 @@ def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr,
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
     offs_cm = offs_cm.to(tl.int64)
     offs_cn = offs_cn.to(tl.int64)
-    c_ptrs = (c_ptr + stride_cm * offs_cm[:, None] +
-              stride_cn * offs_cn[None, :])
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
     c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
 
     tl.store(c_ptrs, c, mask=c_mask)
@@ -123,16 +138,18 @@ def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr,
 
 # input   - [M, K]
 # weight - [K, N]
-def triton_scaled_mm(input: torch.Tensor,
-                     weight: torch.Tensor,
-                     scale_a: torch.Tensor,
-                     scale_b: torch.Tensor,
-                     out_dtype: type[torch.dtype],
-                     bias: Optional[torch.Tensor] = None,
-                     block_size_m: int = 32,
-                     block_size_n: int = 32,
-                     block_size_k: int = 32,
-                     use_heuristic=True) -> torch.Tensor:
+def triton_scaled_mm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: type[torch.dtype],
+    bias: torch.Tensor | None = None,
+    block_size_m: int = 32,
+    block_size_n: int = 32,
+    block_size_k: int = 32,
+    use_heuristic=True,
+) -> torch.Tensor:
     M, K = input.shape
     N = weight.shape[1]
 
@@ -144,17 +161,16 @@ def triton_scaled_mm(input: torch.Tensor,
     scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b
 
     assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point()
-    assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1
-                                      or scale_a.shape[0] == M)
-    assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1
-                                      or scale_b.shape[0] == N)
+    assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1 or scale_a.shape[0] == M)
+    assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1 or scale_b.shape[0] == N)
     assert out_dtype.is_floating_point
     assert bias is None or bias.is_floating_point()
     assert is_weak_contiguous(input)
     assert is_weak_contiguous(weight)
 
-    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
-        N, META['BLOCK_SIZE_N']), )
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+    )
 
     result = torch.empty((M, N), dtype=out_dtype, device=input.device)
 
@@ -181,26 +197,28 @@ def triton_scaled_mm(input: torch.Tensor,
 
     # A = input, B = weight, C = result
     # A = M x K, B = K x N, C = M x N
-    scaled_mm_kernel[grid](input,
-                           weight,
-                           scale_a,
-                           scale_b,
-                           result,
-                           bias,
-                           M,
-                           N,
-                           K,
-                           input.stride(0),
-                           input.stride(1),
-                           weight.stride(0),
-                           weight.stride(1),
-                           result.stride(0),
-                           result.stride(1),
-                           accumulator_dtype,
-                           BLOCK_SIZE_M=block_size_m,
-                           BLOCK_SIZE_N=block_size_n,
-                           BLOCK_SIZE_K=block_size_k,
-                           BLOCK_SIZE_SCALE_A=block_size_sa,
-                           BLOCK_SIZE_SCALE_B=block_size_sb)
+    scaled_mm_kernel[grid](
+        input,
+        weight,
+        scale_a,
+        scale_b,
+        result,
+        bias,
+        M,
+        N,
+        K,
+        input.stride(0),
+        input.stride(1),
+        weight.stride(0),
+        weight.stride(1),
+        result.stride(0),
+        result.stride(1),
+        accumulator_dtype,
+        BLOCK_SIZE_M=block_size_m,
+        BLOCK_SIZE_N=block_size_n,
+        BLOCK_SIZE_K=block_size_k,
+        BLOCK_SIZE_SCALE_A=block_size_sa,
+        BLOCK_SIZE_SCALE_B=block_size_sb,
+    )
 
     return result.to(out_dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index b2dd2501095f8031621070497b72a3ba681e2ec7..f88092169110ba934622b6bd48269c803446c8f2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
-from typing import Optional
 
 import regex as re
 from compressed_tensors import CompressionFormat
@@ -15,15 +14,15 @@ def is_activation_quantization_format(format: str) -> bool:
         CompressionFormat.naive_quantized.value,
         CompressionFormat.int_quantized.value,
         CompressionFormat.float_quantized.value,
-        CompressionFormat.nvfp4_pack_quantized.value
+        CompressionFormat.nvfp4_pack_quantized.value,
     ]
     return format in _ACTIVATION_QUANTIZATION_FORMATS
 
 
 def should_ignore_layer(
-    layer_name: Optional[str],
+    layer_name: str | None,
     ignore: Iterable[str] = tuple(),
-    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
 ) -> bool:
     if layer_name is None:
         return False
@@ -49,7 +48,8 @@ def should_ignore_layer(
         should_ignore_layer = None
         for shard_name in shard_names:
             should_ignore_shard = check_equal_or_regex_match(
-                layer_name=shard_name, targets=ignore)
+                layer_name=shard_name, targets=ignore
+            )
 
             # If shard_idx=0, set layer ignore to match shard.
             if should_ignore_layer is None:
@@ -57,37 +57,36 @@ def should_ignore_layer(
 
             # If shard_idx=1+ confirm scheme matches prior shards.
             elif should_ignore_shard != should_ignore_layer:
-                raise ValueError(f"Found a different quantization schemes for "
-                                 f"{shard_proj_names} in {layer_name}. vLLM "
-                                 "requires all to use the same scheme.")
+                raise ValueError(
+                    f"Found a different quantization schemes for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
 
     # Unfused layers like down_proj and o_proj will match
     # the safetensors checkpoint already.
     else:
-        should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name,
-                                                         targets=ignore)
+        should_ignore_layer = check_equal_or_regex_match(
+            layer_name=layer_name, targets=ignore
+        )
 
     assert should_ignore_layer is not None
     return should_ignore_layer
 
 
-def check_equal_or_regex_match(layer_name: str,
-                               targets: Iterable[str]) -> bool:
+def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
     """
     Checks whether a layer_name is exactly equal or a regex match for
     if target starts with 're:' to any target in list.
     """
-    for target in targets:
-        if _is_equal_or_regex_match(layer_name, target):
-            return True
-    return False
+    return any(_is_equal_or_regex_match(layer_name, target) for target in targets)
 
 
 def find_matched_target(
-    layer_name: Optional[str],
+    layer_name: str | None,
     module: Module,
     targets: Iterable[str],
-    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
 ) -> str:
     """
     Helper function to look up which "target" in the compressed-tensors
@@ -120,19 +119,21 @@ def find_matched_target(
     matched_target = (
         _find_first_match(layer_name, targets)
         or _find_first_match(module.__class__.__name__, targets, True)
-        or _match_fused_layer(layer_name, targets, fused_mapping))
+        or _match_fused_layer(layer_name, targets, fused_mapping)
+    )
 
     if matched_target is None:
         raise ValueError(
             f"Unable to find matching target for {layer_name} in the "
-            "compressed-tensors config.")
+            "compressed-tensors config."
+        )
 
     return matched_target
 
 
-def _find_first_match(value: str,
-                      targets: Iterable[str],
-                      check_contains: bool = False) -> Optional[str]:
+def _find_first_match(
+    value: str, targets: Iterable[str], check_contains: bool = False
+) -> str | None:
     """
     Returns first element of target that matches value either
     exactly or as a regex after 're:'. If check_contains is set to True,
@@ -144,16 +145,14 @@ def _find_first_match(value: str,
     """
 
     for target in targets:
-        if _is_equal_or_regex_match(value,
-                                    target,
-                                    check_contains=check_contains):
+        if _is_equal_or_regex_match(value, target, check_contains=check_contains):
             return target
     return None
 
 
-def _is_equal_or_regex_match(value: str,
-                             target: str,
-                             check_contains: bool = False) -> bool:
+def _is_equal_or_regex_match(
+    value: str, target: str, check_contains: bool = False
+) -> bool:
     """
     Checks whether a value is exactly equal or a regex match for target
     if target starts with 're:'. If check_contains is set to True,
@@ -173,10 +172,12 @@ def _is_equal_or_regex_match(value: str,
 
 
 def _match_fused_layer(
-        layer_name: str, target_layers: Iterable[str],
-        fused_mapping: Mapping[str, list[str]]) -> Optional[str]:
+    layer_name: str,
+    target_layers: Iterable[str],
+    fused_mapping: Mapping[str, list[str]],
+) -> str | None:
     """
-    Match a fused layer name to its corresponding individual layer in 
+    Match a fused layer name to its corresponding individual layer in
     target_layers. Returns first value in fused_mapping which matches targets
 
     Implements an "all" matching strategy where a fused layer matches iff
@@ -193,8 +194,7 @@ def _match_fused_layer(
                         "model.layers.0.self_attn.v_proj"]
     """
     # find layer_name in mapping
-    fused = next((key for key in fused_mapping if layer_name.endswith(key)),
-                 None)
+    fused = next((key for key in fused_mapping if layer_name.endswith(key)), None)
     if fused is None:
         return None
 
@@ -204,7 +204,7 @@ def _match_fused_layer(
     ]
 
     # for each unfused component, find a match in targets
-    unfused_matches: list[Optional[str]] = []
+    unfused_matches: list[str | None] = []
     for unfused in unfused_paths:
         for target in target_layers:
             if _is_equal_or_regex_match(unfused, target):
diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py
index 2922aef32939a468a7ec06b95f84bb151f25ac0c..4f742d83457395a1a48ea3030594eb062a0222cc 100644
--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -9,16 +9,17 @@ import torch.nn.functional as F
 from packaging import version
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from vllm.model_executor.utils import set_weight_attrs
 
 
 class DeepSpeedFPConfig(QuantizationConfig):
     """Config for DeepSpeed FP quantizer. It supports fp6 and fp8.
-    
-    Args: 
+
+    Args:
         weight_bits: the target quantization bits, 6 or 8.
         group_size: group size for quantizaiton, default to 128.
     """
@@ -37,11 +38,14 @@ class DeepSpeedFPConfig(QuantizationConfig):
             raise ValueError(
                 "Currently, only 6-bit or 8-bit weight quantization are "
                 f"supported for DeepSpeed FP quantizaiton, but got "
-                f"{self.weight_bits} bits.")
+                f"{self.weight_bits} bits."
+            )
 
     def __repr__(self) -> str:
-        return (f"DeepSpeedFPConfig(weight_bits={self.weight_bits}), "
-                f"group_size={self.group_size}")
+        return (
+            f"DeepSpeedFPConfig(weight_bits={self.weight_bits}), "
+            f"group_size={self.group_size}"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -72,8 +76,9 @@ class DeepSpeedFPConfig(QuantizationConfig):
             "quantize_config.json",
         ]
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["DeepSpeedFPLinearMethod"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["DeepSpeedFPLinearMethod"]:
         if isinstance(layer, LinearBase):
             return DeepSpeedFPLinearMethod(self)
         return None
@@ -90,15 +95,17 @@ class DeepSpeedFPLinearMethod(LinearMethodBase):
         self.quant_config = quant_config
         self.weight = None
 
-    def create_weights(self,
-                       layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int],
-                       input_size: int,
-                       output_size: int,
-                       params_dtype: torch.dtype,
-                       weight_loader=None,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        weight_loader=None,
+        **extra_weight_attrs,
+    ):
         del output_size
         del input_size
         output_size_per_partition = sum(output_partition_sizes)
@@ -107,10 +114,13 @@ class DeepSpeedFPLinearMethod(LinearMethodBase):
             params_dtype=params_dtype,
             quant_config=self.quant_config,
         )
-        set_weight_attrs(weight, {
-            "input_dim": 1,
-            "output_dim": 0,
-        })
+        set_weight_attrs(
+            weight,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+            },
+        )
         layer.register_parameter("weight", weight)
 
         def quant_weight_loader(param, loaded_weight, *args, **kwargs):
@@ -126,10 +136,12 @@ class DeepSpeedFPLinearMethod(LinearMethodBase):
         extra_weight_attrs["weight_loader"] = quant_weight_loader
         set_weight_attrs(weight, extra_weight_attrs)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         weight = layer.weight
         y = weight.ds_dequantize()
         return F.linear(x, y, bias)
@@ -142,23 +154,33 @@ class DeepSpeedFPParameter(nn.Parameter):
     GPUs, and can be dequantized on-the-fly when needed by the model.
     """
 
-    def __new__(cls, orig_shape: torch.Size, params_dtype: torch.dtype,
-                quant_config: DeepSpeedFPConfig):
+    def __new__(
+        cls,
+        orig_shape: torch.Size,
+        params_dtype: torch.dtype,
+        quant_config: DeepSpeedFPConfig,
+    ):
         try:
             import deepspeed
+
             if version.parse(deepspeed.__version__) < version.parse("0.14.2"):
-                raise ImportError("deepspeed version is wrong. Please "
-                                  "install deepspeed>=0.14.2.")
+                raise ImportError(
+                    "deepspeed version is wrong. Please install deepspeed>=0.14.2."
+                )
             from deepspeed.ops.fp_quantizer import FP_Quantize
         except ImportError as err:
-            raise ImportError("Please install deepspeed>=0.14.2 via "
-                              "`pip install deepspeed>=0.14.2` to use "
-                              "deepspeedfp quantizer.") from err
-        data = torch.empty((
-            orig_shape.numel() // quant_config.group_size,
-            quant_config.group_size * quant_config.weight_bits // 8 + 4,
-        ),
-                           dtype=torch.int8)
+            raise ImportError(
+                "Please install deepspeed>=0.14.2 via "
+                "`pip install deepspeed>=0.14.2` to use "
+                "deepspeedfp quantizer."
+            ) from err
+        data = torch.empty(
+            (
+                orig_shape.numel() // quant_config.group_size,
+                quant_config.group_size * quant_config.weight_bits // 8 + 4,
+            ),
+            dtype=torch.int8,
+        )
         self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
         self.orig_shape = orig_shape
         self.quant_config = quant_config
@@ -173,7 +195,8 @@ class DeepSpeedFPParameter(nn.Parameter):
             self.fp_quantizer.quantize(
                 tensor.data,
                 q_bits=self.quant_config.weight_bits,
-            ))
+            )
+        )
 
     def ds_dequantize(self, fp_out=None) -> torch.Tensor:
         """
@@ -181,7 +204,8 @@ class DeepSpeedFPParameter(nn.Parameter):
         """
         assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
         return self.fp_quantizer.dequantize(
-            self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits)
+            self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits
+        )
 
     def ds_selective_dequantize(self, indices, fp_out=None) -> torch.Tensor:
         """
@@ -190,7 +214,5 @@ class DeepSpeedFPParameter(nn.Parameter):
         """
         assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
         return self.fp_quantizer.selective_dequantize(
-            self.data,
-            indices,
-            fp_out=fp_out,
-            q_bits=self.quant_config.weight_bits)
+            self.data, indices, fp_out=fp_out, q_bits=self.quant_config.weight_bits
+        )
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 38d7e200b3035f04558c3abdf90e0ae1f159f903..5241f9a2301bef7ca12811f5d573c82f7ad420de 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,20 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 
 from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
-from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
-                                                  FusedMoEMethodBase)
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoE,
+    FusedMoEConfig,
+    FusedMoEMethodBase,
+)
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, int8_w8a16_moe_quant_config)
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
+    FusedMoEQuantConfig,
+    int8_w8a16_moe_quant_config,
+)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.utils import set_weight_attrs
 
 
@@ -44,8 +51,9 @@ class ExpertsInt8Config(QuantizationConfig):
     def from_config(cls, config: dict[str, Any]) -> "ExpertsInt8Config":
         return cls()
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             return UnquantizedLinearMethod()
         elif isinstance(layer, FusedMoE):
@@ -54,7 +62,6 @@ class ExpertsInt8Config(QuantizationConfig):
 
 
 class ExpertsInt8MoEMethod(FusedMoEMethodBase):
-
     def __init__(
         self,
         quant_config: ExpertsInt8Config,
@@ -63,57 +70,70 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         super().__init__(moe)
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         int8_dtype = torch.int8
 
-        assert 'weight_loader' in extra_weight_attrs
-        weight_loader = extra_weight_attrs['weight_loader']
+        assert "weight_loader" in extra_weight_attrs
+        weight_loader = extra_weight_attrs["weight_loader"]
         wrapped_weight_loader = ExpertsInt8MoEMethod.quantizing_weight_loader(
-            layer, weight_loader)
-        extra_weight_attrs['weight_loader'] = wrapped_weight_loader
+            layer, weight_loader
+        )
+        extra_weight_attrs["weight_loader"] = wrapped_weight_loader
 
         # Fused gate_up_proj (column parallel)
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size,
-            dtype=int8_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=int8_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
         # down_proj (row parallel)
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition,
-            dtype=int8_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=int8_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
-        w13_scale = torch.nn.Parameter(torch.zeros(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            dtype=torch.float32),
-                                       requires_grad=False)
+        w13_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts, 2 * intermediate_size_per_partition, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_scale", w13_scale)
 
-        w2_scale = torch.nn.Parameter(torch.zeros(num_experts,
-                                                  hidden_size,
-                                                  dtype=torch.float32),
-                                      requires_grad=False)
+        w2_scale = torch.nn.Parameter(
+            torch.zeros(num_experts, hidden_size, dtype=torch.float32),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_scale", w2_scale)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        return int8_w8a16_moe_quant_config(w1_scale=layer.w13_scale,
-                                           w2_scale=layer.w2_scale,
-                                           w1_zp=None,
-                                           w2_zp=None)
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return int8_w8a16_moe_quant_config(
+            w1_scale=layer.w13_scale, w2_scale=layer.w2_scale, w1_zp=None, w2_zp=None
+        )
 
     def apply(
         self,
@@ -123,26 +143,25 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `ExpertsInt8MoEMethod` yet.")
+                "EPLB not supported for `ExpertsInt8MoEMethod` yet."
+            )
 
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -158,7 +177,8 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
         return fused_experts(
             x,
@@ -176,11 +196,13 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
 
     @staticmethod
     def quantizing_weight_loader(layer, weight_loader):
-
-        def quantize_and_call_weight_loader(param: torch.nn.Parameter,
-                                            loaded_weight: torch.Tensor,
-                                            weight_name: str, shard_id: int,
-                                            expert_id: int):
+        def quantize_and_call_weight_loader(
+            param: torch.nn.Parameter,
+            loaded_weight: torch.Tensor,
+            weight_name: str,
+            shard_id: int,
+            expert_id: int,
+        ):
             tp_rank = get_tensor_model_parallel_rank()
             shard_size = layer.intermediate_size_per_partition
             shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
@@ -188,33 +210,28 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
             loaded_weight = loaded_weight.to(device)
             # w1, gate_proj case: Load into first shard of w13.
             if shard_id == "w1":
-                scales = quantize_in_place_and_get_scales(
-                    loaded_weight[shard, :])
-                layer.w13_scale.data[expert_id, 0:shard_size].copy_(scales[:,
-                                                                           0])
+                scales = quantize_in_place_and_get_scales(loaded_weight[shard, :])
+                layer.w13_scale.data[expert_id, 0:shard_size].copy_(scales[:, 0])
             # w3, up_proj case: Load into second shard of w13.
             elif shard_id == "w3":
-                scales = quantize_in_place_and_get_scales(
-                    loaded_weight[shard, :])
-                layer.w13_scale.data[expert_id, shard_size:2 *
-                                     shard_size].copy_(scales[:, 0])
+                scales = quantize_in_place_and_get_scales(loaded_weight[shard, :])
+                layer.w13_scale.data[expert_id, shard_size : 2 * shard_size].copy_(
+                    scales[:, 0]
+                )
             # w2, down_proj case: Load into only shard of w2.
             elif shard_id == "w2":
-                scales = quantize_in_place_and_get_scales(loaded_weight[:,
-                                                                        shard])
+                scales = quantize_in_place_and_get_scales(loaded_weight[:, shard])
                 layer.w2_scale.data[expert_id, :].copy_(scales[:, 0])
             else:
-                raise ValueError(
-                    f"Shard id must be in [0,1,2] but got {shard_id}")
-            weight_loader(param, loaded_weight, weight_name, shard_id,
-                          expert_id)
+                raise ValueError(f"Shard id must be in [0,1,2] but got {shard_id}")
+            weight_loader(param, loaded_weight, weight_name, shard_id, expert_id)
 
         return quantize_and_call_weight_loader
 
 
 def quantize_in_place_and_get_scales(weight: torch.Tensor) -> torch.Tensor:
     vmax = torch.iinfo(torch.int8).max
-    scales = (torch.max(torch.abs(weight), dim=1, keepdim=True)[0] / vmax)
+    scales = torch.max(torch.abs(weight), dim=1, keepdim=True)[0] / vmax
 
     weight.div_(scales)
     weight.round_()
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index b2cab7d4614adad5b570bc4ff5a133b87ea6a0c5..6ba18e59e4d547345e68cec293f64939529f43bf 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -8,19 +8,33 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, is_layer_skipped)
+    GroupShape,
+    is_layer_skipped,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, maybe_create_device_identity, normalize_e4m3fn_to_e4m3fnuz)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           ModelWeightParameter)
+    Fp8LinearOp,
+    maybe_create_device_identity,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -60,23 +74,26 @@ class FBGEMMFp8Config(QuantizationConfig):
         input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"])
         return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub)
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix=prefix,
-                                ignored_layers=self.ignore_list,
-                                fused_mapping=self.packed_modules_mapping):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignore_list,
+                fused_mapping=self.packed_modules_mapping,
+            ):
                 return UnquantizedLinearMethod()
             return FBGEMMFp8LinearMethod(self)
         return None
 
 
 class FBGEMMFp8LinearMethod(LinearMethodBase):
-
     def __init__(self, quant_config: FBGEMMFp8Config):
         self.quant_config = quant_config
         self.fp8_linear = Fp8LinearOp(
-            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN)
+            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN
+        )
         self.out_dtype = torch.get_default_dtype()
 
     def create_weights(
@@ -101,43 +118,45 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
         layer.orig_dtype = params_dtype
 
         # WEIGHT
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=torch.float8_e4m3fn),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
         # WEIGHT SCALE
-        weight_scale = ChannelQuantScaleParameter(data=torch.empty(
-            (sum(output_partition_sizes), 1), dtype=torch.float32),
-                                                  output_dim=0,
-                                                  weight_loader=weight_loader)
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         weight_scale[:] = torch.finfo(torch.float32).min
         layer.register_parameter("weight_scale", weight_scale)
 
         # INPUT SCALE UPPER BOUND
-        input_scale_ub = torch.nn.Parameter(torch.tensor(
-            (self.quant_config.input_scale_ub), dtype=torch.float32),
-                                            requires_grad=False)
+        input_scale_ub = torch.nn.Parameter(
+            torch.tensor((self.quant_config.input_scale_ub), dtype=torch.float32),
+            requires_grad=False,
+        )
         layer.input_scale_ub = input_scale_ub
 
     def process_weights_after_loading(self, layer: Module) -> None:
         # required by torch.compile
-        layer.weight_scale = Parameter(layer.weight_scale.data,
-                                       requires_grad=False)
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
         layer.weight = Parameter(layer.weight.data, requires_grad=False)
 
         weight = layer.weight
 
         if current_platform.is_fp8_fnuz():
-            weight, weight_scale, input_scale = \
-                normalize_e4m3fn_to_e4m3fnuz(
-                    weight=weight,
-                    weight_scale=layer.weight_scale,
-                    input_scale=None)
+            weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                weight=weight, weight_scale=layer.weight_scale, input_scale=None
+            )
             if input_scale is not None:
                 layer.input_scale = Parameter(input_scale, requires_grad=False)
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
@@ -148,11 +167,12 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
             # Activations not quantized for marlin.
             del layer.input_scale_ub
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         if self.quant_config.use_marlin:
             return apply_fp8_marlin_linear(
                 input=x,
@@ -161,12 +181,15 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
                 workspace=layer.workspace,
                 size_n=layer.output_size_per_partition,
                 size_k=layer.input_size_per_partition,
-                bias=bias)
-
-        return self.fp8_linear.apply(input=x,
-                                     weight=layer.weight,
-                                     weight_scale=layer.weight_scale,
-                                     out_dtype=self.out_dtype,
-                                     input_scale=None,
-                                     input_scale_ub=layer.input_scale_ub,
-                                     bias=bias)
+                bias=bias,
+            )
+
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            out_dtype=self.out_dtype,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index fb448de3c23415926d9ccf771afd28bfec995fc4..0479bec3384080543d09ad2d1af2dcac464e0e8e 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from collections.abc import Callable
+from enum import Enum
+from functools import partial
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from torch.nn import Module
@@ -10,55 +13,92 @@ from torch.nn.parameter import Parameter
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.model_executor.layers.fused_moe import (
-    FusedMoE, FusedMoEActivationFormat, FusedMoEMethodBase,
-    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
-    FusedMoeWeightScaleSupported)
+    FusedMoE,
+    FusedMoEActivationFormat,
+    FusedMoEMethodBase,
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize,
+    FusedMoeWeightScaleSupported,
+)
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, fp8_w8a8_moe_quant_config)
-from vllm.model_executor.layers.fused_moe.layer import (
-    UnquantizedFusedMoEMethod)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+    FusedMoEQuantConfig,
+    RoutingMethodType,
+    fp8_w8a8_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    FlashinferMoeBackend, apply_flashinfer_per_tensor_scale_fp8,
+    FlashinferMoeBackend,
+    apply_flashinfer_per_tensor_scale_fp8,
     build_flashinfer_fp8_cutlass_moe_prepare_finalize,
-    flashinfer_cutlass_moe_fp8, get_flashinfer_moe_backend,
-    register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
-    select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
+    flashinfer_cutlass_moe_fp8,
+    get_flashinfer_moe_backend,
+    register_moe_scaling_factors,
+    rotate_flashinfer_fp8_moe_weights,
+    select_cutlass_fp8_gemm_impl,
+    swap_w13_to_w31,
+)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_fp8_block_linear, check_aiter_fp8_linear_support,
-    create_fp8_input_scale, create_fp8_scale_parameter,
-    create_fp8_weight_parameter, expert_weight_is_col_major,
-    maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy,
-    process_fp8_weight_tensor_strategy, requant_weight_ue8m0_inplace,
-    validate_fp8_block_shape)
+    W8A8BlockFp8LinearOp,
+    create_fp8_input_scale,
+    create_fp8_scale_parameter,
+    create_fp8_weight_parameter,
+    deepgemm_post_process_fp8_weight_block,
+    maybe_post_process_fp8_weight_block,
+    process_fp8_weight_block_strategy,
+    process_fp8_weight_tensor_strategy,
+    validate_fp8_block_shape,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin,
-    prepare_moe_fp8_layer_for_marlin)
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+    prepare_moe_fp8_layer_for_marlin,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, is_layer_skipped)
+    GroupShape,
+    is_layer_skipped,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, all_close_1d, cutlass_block_fp8_supported,
-    cutlass_fp8_supported, maybe_create_device_identity,
-    normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
-from vllm.model_executor.parameter import (BlockQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    Fp8LinearOp,
+    all_close_1d,
+    cutlass_block_fp8_supported,
+    cutlass_fp8_supported,
+    maybe_create_device_identity,
+    normalize_e4m3fn_to_e4m3fnuz,
+    per_tensor_dequantize,
+)
+from vllm.model_executor.parameter import (
+    BlockQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
-from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import (get_col_major_tma_aligned_tensor,
-                                  is_deep_gemm_e8m0_used,
-                                  is_deep_gemm_supported)
+from vllm.utils.deep_gemm import (
+    is_deep_gemm_e8m0_used,
+    is_deep_gemm_supported,
+)
 from vllm.utils.flashinfer import has_flashinfer_moe
+from vllm.utils.import_utils import has_deep_gemm
 
 if TYPE_CHECKING:
     from vllm.model_executor.models.utils import WeightsMapper
@@ -68,6 +108,79 @@ ACTIVATION_SCHEMES = ["static", "dynamic"]
 logger = init_logger(__name__)
 
 
+class Fp8MoeBackend(Enum):
+    NONE = 0
+    FLASHINFER_TRTLLM = 1
+    FLASHINFER_CUTLASS = 2
+    DEEPGEMM = 3
+    CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
+    MARLIN = 5
+    TRITON = 6
+
+
+def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
+    """
+    Select the primary FP8 MoE backend
+    Note: Shape-specific fallbacks may still occur at runtime.
+    """
+    # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
+    if (
+        current_platform.is_cuda()
+        and (
+            current_platform.is_device_capability(100)
+            or current_platform.is_device_capability(90)
+        )
+        and envs.VLLM_USE_FLASHINFER_MOE_FP8
+        and has_flashinfer_moe()
+    ):
+        backend = get_flashinfer_moe_backend()
+        if backend == FlashinferMoeBackend.TENSORRT_LLM:
+            logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
+            return Fp8MoeBackend.FLASHINFER_TRTLLM
+        else:
+            if block_quant and current_platform.is_device_capability(100):
+                raise ValueError(
+                    "FlashInfer FP8 MoE throughput backend does not "
+                    "support block quantization. Please use "
+                    "VLLM_FLASHINFER_MOE_BACKEND=latency "
+                    "instead."
+                )
+            logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM90/SM100")
+            return Fp8MoeBackend.FLASHINFER_CUTLASS
+
+    # weight-only path for older GPUs without native FP8
+    use_marlin = (
+        not current_platform.has_device_capability(89)
+        or envs.VLLM_TEST_FORCE_FP8_MARLIN
+    )
+    if current_platform.is_rocm():
+        use_marlin = False
+    if use_marlin:
+        logger.info_once("Using Marlin backend for FP8 MoE")
+        return Fp8MoeBackend.MARLIN
+
+    # deepGEMM on supported platforms with block-quantized weights
+    if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
+        if not has_deep_gemm():
+            logger.warning_once("DeepGEMM backend requested but not available.")
+        elif is_deep_gemm_supported():
+            logger.info_once("Using DeepGEMM backend for FP8 MoE")
+            return Fp8MoeBackend.DEEPGEMM
+
+    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
+    if (
+        current_platform.is_cuda()
+        and current_platform.is_device_capability(100)
+        and block_quant
+    ):
+        logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
+        return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
+
+    # default to Triton
+    logger.info_once("Using Triton backend for FP8 MoE")
+    return Fp8MoeBackend.TRITON
+
+
 class Fp8Config(QuantizationConfig):
     """Config class for FP8."""
 
@@ -75,31 +188,34 @@ class Fp8Config(QuantizationConfig):
         self,
         is_checkpoint_fp8_serialized: bool = False,
         activation_scheme: str = "dynamic",
-        ignored_layers: Optional[list[str]] = None,
-        weight_block_size: Optional[list[int]] = None,
+        ignored_layers: list[str] | None = None,
+        weight_block_size: list[int] | None = None,
     ) -> None:
         super().__init__()
 
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
 
         if activation_scheme not in ACTIVATION_SCHEMES:
-            raise ValueError(
-                f"Unsupported activation scheme {activation_scheme}")
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
         self.ignored_layers = ignored_layers or []
         if weight_block_size is not None:
             if not is_checkpoint_fp8_serialized:
                 raise ValueError(
                     "The block-wise quantization only supports fp8-serialized "
-                    "checkpoint for now.")
+                    "checkpoint for now."
+                )
             if len(weight_block_size) != 2:
                 raise ValueError(
                     "The quantization block size of weight must have 2 "
-                    f"dimensions, but got {len(weight_block_size)} dimensions")
+                    f"dimensions, but got {len(weight_block_size)} dimensions"
+                )
             if activation_scheme != "dynamic":
-                raise ValueError("The block-wise quantization only supports "
-                                 "dynamic activation scheme for now, but got "
-                                 f"{activation_scheme} activation scheme.")
+                raise ValueError(
+                    "The block-wise quantization only supports "
+                    "dynamic activation scheme for now, but got "
+                    f"{activation_scheme} activation scheme."
+                )
         self.weight_block_size = weight_block_size
 
     @classmethod
@@ -120,41 +236,48 @@ class Fp8Config(QuantizationConfig):
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         if self.ignored_layers is not None:
-            self.ignored_layers = hf_to_vllm_mapper.apply_list(
-                self.ignored_layers)
+            self.ignored_layers = hf_to_vllm_mapper.apply_list(self.ignored_layers)
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "Fp8Config":
         quant_method = cls.get_from_keys(config, ["quant_method"])
-        is_checkpoint_fp8_serialized = ("fp8" in quant_method)
+        is_checkpoint_fp8_serialized = "fp8" in quant_method
         activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
         ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
-        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"],
-                                                 None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
         if not ignored_layers:
-            ignored_layers = cls.get_from_keys_or(config,
-                                                  ["modules_to_not_convert"],
-                                                  None)
-        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
-                   activation_scheme=activation_scheme,
-                   ignored_layers=ignored_layers,
-                   weight_block_size=weight_block_size)
-
-    def get_xpu_quant_method(self, layer: torch.nn.Module,
-                             prefix: str) -> Optional["QuantizeMethodBase"]:
+            ignored_layers = cls.get_from_keys_or(
+                config, ["modules_to_not_convert"], None
+            )
+        return cls(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            activation_scheme=activation_scheme,
+            ignored_layers=ignored_layers,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_xpu_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention
         from vllm.model_executor.layers.quantization.ipex_quant import (
-            XPUFp8LinearMethod, XPUFp8MoEMethod)
+            XPUFp8LinearMethod,
+            XPUFp8MoEMethod,
+        )
+
         fp8_config = Fp8Config(
             is_checkpoint_fp8_serialized=self.is_checkpoint_fp8_serialized,
             activation_scheme=self.activation_scheme,
             ignored_layers=self.ignored_layers,
-            weight_block_size=self.weight_block_size)
+            weight_block_size=self.weight_block_size,
+        )
 
         if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix=prefix,
-                                ignored_layers=self.ignored_layers,
-                                fused_mapping=self.packed_modules_mapping):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
                 return UnquantizedLinearMethod()
             return XPUFp8LinearMethod(fp8_config)
         elif isinstance(layer, FusedMoE):
@@ -163,29 +286,34 @@ class Fp8Config(QuantizationConfig):
             return Fp8KVCacheMethod(self)
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
 
         if current_platform.is_xpu():
             return self.get_xpu_quant_method(layer, prefix)
         if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix=prefix,
-                                ignored_layers=self.ignored_layers,
-                                fused_mapping=self.packed_modules_mapping):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
                 return UnquantizedLinearMethod()
             return Fp8LinearMethod(self)
         elif isinstance(layer, FusedMoE):
-            if is_layer_skipped(prefix=prefix,
-                                ignored_layers=self.ignored_layers,
-                                fused_mapping=self.packed_modules_mapping):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
                 return UnquantizedFusedMoEMethod(layer.moe_config)
             return Fp8MoEMethod(self, layer)
         elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
         return None
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         """
         Check whether the param name matches the format for k/v cache scales
         in compressed-tensors. If this is the case, return its equivalent
@@ -231,26 +359,45 @@ class Fp8LinearMethod(LinearMethodBase):
 
         # For GPUs that lack FP8 hardware support, we can leverage the Marlin
         # kernel for fast weight-only FP8 quantization
-        self.use_marlin = (not current_platform.has_device_capability(89)
-                           or envs.VLLM_TEST_FORCE_FP8_MARLIN)
+        self.use_marlin = (
+            not current_platform.has_device_capability(89)
+            or envs.VLLM_TEST_FORCE_FP8_MARLIN
+        )
         # Disable marlin for rocm
         if current_platform.is_rocm():
             self.use_marlin = False
+        if vllm_is_batch_invariant():
+            self.use_marlin = False
 
-        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
+        self.use_deep_gemm = is_deep_gemm_supported()
 
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant = self.weight_block_size is not None
         self.act_q_static = self.quant_config.activation_scheme == "static"
-        # Use per-token quantization for better perf if dynamic and cutlass
-        if not self.act_q_static and cutlass_fp8_supported():
-            self.act_q_group_shape = GroupShape.PER_TOKEN
+        if self.weight_block_size:
+            self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
         else:
-            self.act_q_group_shape = GroupShape.PER_TENSOR
+            # Use per-token quantization for better perf if dynamic and cutlass
+            if not self.act_q_static and cutlass_fp8_supported():
+                self.act_q_group_shape = GroupShape.PER_TOKEN
+            else:
+                self.act_q_group_shape = GroupShape.PER_TENSOR
 
-        self.fp8_linear = Fp8LinearOp(
-            act_quant_static=self.act_q_static,
-            act_quant_group_shape=self.act_q_group_shape)
+        if self.block_quant:
+            assert not self.act_q_static
+            assert self.weight_block_size is not None
+            self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
+                weight_group_shape=GroupShape(*self.weight_block_size),
+                act_quant_group_shape=self.act_q_group_shape,
+                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
+                use_aiter_and_is_supported=self.use_aiter_and_is_supported,
+            )
+        else:
+            self.fp8_linear = Fp8LinearOp(
+                act_quant_static=self.act_q_static,
+                act_quant_group_shape=self.act_q_group_shape,
+            )
 
     def create_weights(
         self,
@@ -275,25 +422,32 @@ class Fp8LinearMethod(LinearMethodBase):
         if self.block_quant:
             assert self.weight_block_size is not None
             layer.weight_block_size = self.weight_block_size
-            validate_fp8_block_shape(layer, input_size, output_size,
-                                     input_size_per_partition,
-                                     output_partition_sizes,
-                                     self.weight_block_size)
+            validate_fp8_block_shape(
+                layer,
+                input_size,
+                output_size,
+                input_size_per_partition,
+                output_partition_sizes,
+                self.weight_block_size,
+            )
 
         # WEIGHT
         if self.quant_config.is_checkpoint_fp8_serialized:
-            weight = create_fp8_weight_parameter(output_size_per_partition,
-                                                 input_size_per_partition,
-                                                 weight_loader)
+            weight = create_fp8_weight_parameter(
+                output_size_per_partition, input_size_per_partition, weight_loader
+            )
         else:
             # For non-serialized checkpoints, use original dtype
-            weight = ModelWeightParameter(data=torch.empty(
-                output_size_per_partition,
-                input_size_per_partition,
-                dtype=params_dtype),
-                                          input_dim=1,
-                                          output_dim=0,
-                                          weight_loader=weight_loader)
+            weight = ModelWeightParameter(
+                data=torch.empty(
+                    output_size_per_partition,
+                    input_size_per_partition,
+                    dtype=params_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
         layer.register_parameter("weight", weight)
 
         # If checkpoint is serialized fp8, load them.
@@ -301,28 +455,32 @@ class Fp8LinearMethod(LinearMethodBase):
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
             if not self.block_quant:
-                scale = create_fp8_scale_parameter(PerTensorScaleParameter,
-                                                   output_partition_sizes,
-                                                   input_size_per_partition,
-                                                   None, weight_loader)
+                scale = create_fp8_scale_parameter(
+                    PerTensorScaleParameter,
+                    output_partition_sizes,
+                    input_size_per_partition,
+                    None,
+                    weight_loader,
+                )
                 set_weight_attrs(scale, {"scale_type": "weight_scale"})
                 layer.register_parameter("weight_scale", scale)
             else:
                 assert not self.act_q_static
                 assert self.weight_block_size is not None
-                scale = create_fp8_scale_parameter(BlockQuantScaleParameter,
-                                                   output_partition_sizes,
-                                                   input_size_per_partition,
-                                                   self.weight_block_size,
-                                                   weight_loader)
+                scale = create_fp8_scale_parameter(
+                    BlockQuantScaleParameter,
+                    output_partition_sizes,
+                    input_size_per_partition,
+                    self.weight_block_size,
+                    weight_loader,
+                )
                 set_weight_attrs(scale, {"scale_type": "weight_scale"})
                 # The weight_scale_inv name is intentional for deepseekv3
                 layer.register_parameter("weight_scale_inv", scale)
 
             # INPUT ACTIVATION SCALE
             if self.act_q_static:
-                scale = create_fp8_input_scale(output_partition_sizes,
-                                               weight_loader)
+                scale = create_fp8_input_scale(output_partition_sizes, weight_loader)
                 set_weight_attrs(scale, {"scale_type": "input_scale"})
                 layer.register_parameter("input_scale", scale)
             else:
@@ -337,15 +495,15 @@ class Fp8LinearMethod(LinearMethodBase):
             size_k_first = False
 
             weight, weight_scale = process_fp8_weight_block_strategy(
-                layer.weight, layer.weight_scale_inv)
+                layer.weight, layer.weight_scale_inv
+            )
             # Delete the weight_scale_inv parameter to avoid confusion
             # with the weight_scale parameter
             del layer.weight_scale_inv
 
         # If checkpoint not serialized fp8, quantize the weights.
         elif not self.quant_config.is_checkpoint_fp8_serialized:
-            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
-                                                         scale=None)
+            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
             weight = qweight.t()
 
         # If checkpoint is fp8 per-tensor, handle that there are N scales for N
@@ -357,10 +515,12 @@ class Fp8LinearMethod(LinearMethodBase):
             # If using w8a8, torch._scaled_mm needs per tensor, so
             # requantize the logical shards as a single weight.
             if not self.use_marlin:
-                weight, weight_scale, input_scale = (
-                    process_fp8_weight_tensor_strategy(
-                        weight, weight_scale, layer.logical_widths,
-                        getattr(layer, 'input_scale', None)))
+                weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(
+                    weight,
+                    weight_scale,
+                    layer.logical_widths,
+                    getattr(layer, "input_scale", None),
+                )
                 if self.act_q_static:
                     assert input_scale is not None
                     input_scale = input_scale.max()
@@ -369,9 +529,11 @@ class Fp8LinearMethod(LinearMethodBase):
         # Update layer with new values.
         layer.weight = Parameter(weight.data, requires_grad=False)
         layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
-        layer.input_scale = Parameter(
-            input_scale,
-            requires_grad=False) if input_scale is not None else None
+        layer.input_scale = (
+            Parameter(input_scale, requires_grad=False)
+            if input_scale is not None
+            else None
+        )
 
         if self.use_marlin:
             prepare_fp8_layer_for_marlin(layer, size_k_first)
@@ -380,13 +542,49 @@ class Fp8LinearMethod(LinearMethodBase):
             return
 
         if self.block_quant:
-            maybe_post_process_fp8_weight_block(
-                layer, self.cutlass_block_fp8_supported)
+            maybe_post_process_fp8_weight_block(layer)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        # if batch invariant mode is enabled, prefer DeepGEMM FP8 path
+        # we will use BF16 dequant when DeepGEMM is not supported.
+        if vllm_is_batch_invariant():
+            if self.block_quant:
+                assert self.weight_block_size is not None
+                return self.w8a8_block_fp8_linear.apply(
+                    input=x,
+                    weight=layer.weight,
+                    weight_scale=layer.weight_scale,
+                    input_scale=layer.input_scale,
+                    bias=bias,
+                )
+            else:
+                # per-tensor/channel: dequant to BF16 and run GEMM
+                weight_fp8 = layer.weight.to(torch.bfloat16)
+                weight_scale = layer.weight_scale.to(torch.bfloat16)
+                if weight_scale.numel() == 1:
+                    # Per-tensor: simple scalar multiplication
+                    weight_bf16 = weight_fp8 * weight_scale
+                else:
+                    # Multiple scales (fused modules like QKV)
+                    # Try to infer correct broadcasting
+                    # weight is [K, N], scale could be [num_logical_weights]
+                    # Need to figure out how to broadcast - for now just try
+                    # direct multiplication
+                    if (
+                        weight_scale.dim() == 1
+                        and weight_scale.shape[0] == weight_fp8.shape[0]
+                    ):
+                        # Per-row scaling
+                        weight_bf16 = weight_fp8 * weight_scale.unsqueeze(1)
+                    else:
+                        # Fallback
+                        weight_bf16 = weight_fp8 * weight_scale
+                return torch.nn.functional.linear(x, weight_bf16.t(), bias)
 
         if self.use_marlin:
             return apply_fp8_marlin_linear(
@@ -396,22 +594,28 @@ class Fp8LinearMethod(LinearMethodBase):
                 workspace=layer.workspace,
                 size_n=layer.output_size_per_partition,
                 size_k=layer.input_size_per_partition,
-                bias=bias)
+                bias=bias,
+            )
 
         if self.block_quant:
-            return apply_fp8_block_linear(
-                layer,
+            assert self.weight_block_size is not None
+
+            return self.w8a8_block_fp8_linear.apply(
                 input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                input_scale=layer.input_scale,
                 bias=bias,
-                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
-                use_aiter_and_is_supported=self.use_aiter_and_is_supported)
+            )
 
-        return self.fp8_linear.apply(input=x,
-                                     weight=layer.weight,
-                                     weight_scale=layer.weight_scale,
-                                     out_dtype=self.out_dtype,
-                                     input_scale=layer.input_scale,
-                                     bias=bias)
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            out_dtype=self.out_dtype,
+            input_scale=layer.input_scale,
+            bias=bias,
+        )
 
 
 class Fp8MoEMethod(FusedMoEMethodBase):
@@ -432,59 +636,40 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         self.layer = layer
         self.quant_config = quant_config
         self.weight_block_size = self.quant_config.weight_block_size
-        self.block_quant = self.weight_block_size is not None
-
-        self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None
-        self.fused_experts: Optional[
-            mk.FusedMoEModularKernel] = None  # type: ignore
-        if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
-            self.flashinfer_moe_backend = get_flashinfer_moe_backend()
-            logger.info_once(
-                f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
-            )
-        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
-        # kernel for fast weight-only FP8 quantization
-        self.use_marlin = (not current_platform.has_device_capability(89)
-                           or envs.VLLM_TEST_FORCE_FP8_MARLIN)
-        # Disable marlin for rocm
-        if current_platform.is_rocm():
-            self.use_marlin = False
-
-        # Check for DeepGemm support.
-        self.allow_deep_gemm = False
-        if envs.VLLM_USE_DEEP_GEMM:
-            if not has_deep_gemm():
-                logger.warning_once("Failed to import DeepGemm kernels.")
-            elif not self.block_quant:
-                logger.warning_once("Model is not block quantized. Not using "
-                                    "DeepGemm kernels")
-            elif (is_deep_gemm_supported()):
-                logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.")
-                self.allow_deep_gemm = True
-            else:
-                logger.warning_once(
-                    "DeepGemm not supported on the current platform.")
-
-        # Check for CutlassBlockScaledGroupedGemm support.
-        self.allow_cutlass_block_scaled_grouped_gemm = False
-        if not self.block_quant:
-            logger.debug_once("Model is not block quantized. Not using "
-                              "CutlassBlockScaledGroupedGemm kernels")
-        elif (current_platform.is_cuda()
-              and current_platform.is_device_capability(100)):
-            logger.info_once(
-                "Using CutlassBlockScaledGroupedGemm kernels for Fp8MoEMethod."
+        self.block_quant: bool = self.weight_block_size is not None
+        self.fp8_backend = get_fp8_moe_backend(self.block_quant)
+
+        self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
+        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
+        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
+            self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM
+        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
+            self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS
+            if self.block_quant:
+                assert self.weight_block_size == [128, 128], (
+                    f"Only support weight_block_size == [128, 128], "
+                    f"got {self.weight_block_size}"
+                )
+            self.flashinfer_moe_fn = partial(
+                flashinfer_cutlass_moe_fp8,
+                moe=self.moe,
+                use_deepseek_fp8_block_scale=self.block_quant,
             )
-            self.allow_cutlass_block_scaled_grouped_gemm = True
-        else:
-            logger.warning_once(
-                "CutlassBlockScaledGroupedGemm not supported on the current "
-                "platform.")
 
-    def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
-                       intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+        self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
+        self.allow_cutlass_block_scaled_grouped_gemm = (
+            self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
+        )
 
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.hidden_size = hidden_size
         layer.num_experts = num_experts
@@ -509,31 +694,38 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
                     f"{intermediate_size_per_partition} is not divisible by "
-                    f"weight quantization block_n = {block_n}.")
-            if (tp_size > 1
-                    and intermediate_size_per_partition % block_k != 0):
+                    f"weight quantization block_n = {block_n}."
+                )
+            if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
                 # Required by row parallel
                 raise ValueError(
                     f"The input_size of down's weight = "
                     f"{intermediate_size_per_partition} is not divisible by "
-                    f"weight quantization block_k = {block_k}.")
+                    f"weight quantization block_k = {block_k}."
+                )
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
@@ -541,20 +733,19 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         if not self.block_quant:
             # Allocate 2 scales for w1 and w3 respectively.
             # They will be combined to a single scale after weight loading.
-            w13_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, 2, dtype=torch.float32),
-                                                  requires_grad=False)
-            w2_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                 requires_grad=False)
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
         else:
             w13_weight_scale = torch.nn.Parameter(
                 torch.ones(
                     num_experts,
-                    2 * ((intermediate_size_per_partition + block_n - 1) //
-                         block_n),
+                    2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
                     (hidden_size + block_k - 1) // block_k,
                     dtype=torch.float32,
                 ),
@@ -576,9 +767,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.
-             value} if self.block_quant else
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+            if self.block_quant
+            else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
         # If loading fp8 checkpoint, pass the weight loaders.
         # If loading an fp16 checkpoint, do not (we will quantize in
         #   process_weights_after_loading()
@@ -591,17 +783,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             if not self.quant_config.is_checkpoint_fp8_serialized:
                 raise ValueError(
                     "Found static activation scheme for checkpoint that "
-                    "was not serialized fp8.")
+                    "was not serialized fp8."
+                )
 
-            w13_input_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                 requires_grad=False)
+            w13_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w13_input_scale", w13_input_scale)
             set_weight_attrs(w13_input_scale, extra_weight_attrs)
 
-            w2_input_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                requires_grad=False)
+            w2_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w2_input_scale", w2_input_scale)
             set_weight_attrs(w2_input_scale, extra_weight_attrs)
 
@@ -609,31 +802,34 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             layer.w13_input_scale = None
             layer.w2_input_scale = None
 
+        self.rocm_aiter_moe_enabled = False
+
     def process_weights_after_loading(self, layer: Module) -> None:
         # Lazy import to avoid importing triton too early.
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            is_rocm_aiter_moe_enabled, shuffle_weights)
 
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
         # TODO (rob): refactor block quant into separate class.
         if self.block_quant:
             assert self.quant_config.activation_scheme == "dynamic"
             if current_platform.is_fp8_fnuz():
-                w13_weight, w13_weight_scale_inv, w13_input_scale = \
+                w13_weight, w13_weight_scale_inv, w13_input_scale = (
                     normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w13_weight, layer.w13_weight_scale_inv,
-                        layer.w13_input_scale)
-                w2_weight, w2_weight_scale_inv, w2_input_scale = \
+                        layer.w13_weight,
+                        layer.w13_weight_scale_inv,
+                        layer.w13_input_scale,
+                    )
+                )
+                w2_weight, w2_weight_scale_inv, w2_input_scale = (
                     normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w2_weight, layer.w2_weight_scale_inv,
-                        layer.w2_input_scale)
+                        layer.w2_weight, layer.w2_weight_scale_inv, layer.w2_input_scale
+                    )
+                )
             elif self.flashinfer_moe_backend is not None:
                 # NOTE: weights have to be swapped since the activation is
                 # applied on different half for flashinfer vs vllm
                 w13_weight = swap_w13_to_w31(layer.w13_weight.data)
-                w13_weight_scale_inv = swap_w13_to_w31(
-                    layer.w13_weight_scale_inv.data)
+                w13_weight_scale_inv = swap_w13_to_w31(layer.w13_weight_scale_inv.data)
                 w2_weight = layer.w2_weight.data
                 w2_weight_scale_inv = layer.w2_weight_scale_inv.data
             else:
@@ -644,65 +840,83 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
             # torch.compile() cannot use Parameter subclasses.
             layer.w13_weight = Parameter(w13_weight, requires_grad=False)
-            layer.w13_weight_scale_inv = Parameter(w13_weight_scale_inv,
-                                                   requires_grad=False)
+            layer.w13_weight_scale_inv = Parameter(
+                w13_weight_scale_inv, requires_grad=False
+            )
             layer.w2_weight = Parameter(w2_weight, requires_grad=False)
-            layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv,
-                                                  requires_grad=False)
+            layer.w2_weight_scale_inv = Parameter(
+                w2_weight_scale_inv, requires_grad=False
+            )
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
-                shuffled_w13, shuffled_w2 = shuffle_weights(
-                    layer.w13_weight.data, layer.w2_weight.data)
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                    layer.w13_weight.data, layer.w2_weight.data
+                )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13,
-                                                      requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2,
-                                                     requires_grad=False)
+                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
+                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
             # DeepGemm scales need to be transposed and aligned. We try to do
             # it ahead of time for performance reasons.
-            if self.allow_deep_gemm and not is_deep_gemm_e8m0_used():
-                if expert_weight_is_col_major(layer.w13_weight_scale_inv):
-                    layer.w13_weight_scale_inv = \
-                        get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv)
-                if expert_weight_is_col_major(layer.w2_weight_scale_inv):
-                    layer.w2_weight_scale_inv = \
-                        get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv)
+            if self.allow_deep_gemm:
+                dg_w13_weight, dg_w13_weight_scale_inv = (
+                    deepgemm_post_process_fp8_weight_block(
+                        wq=layer.w13_weight.data,
+                        ws=layer.w13_weight_scale_inv.data,
+                        quant_block_shape=tuple(layer.weight_block_size),
+                        use_e8m0=is_deep_gemm_e8m0_used(),
+                    )
+                )
+                dg_w2_weight, dg_w2_weight_scale_inv = (
+                    deepgemm_post_process_fp8_weight_block(
+                        wq=layer.w2_weight.data,
+                        ws=layer.w2_weight_scale_inv.data,
+                        quant_block_shape=tuple(layer.weight_block_size),
+                        use_e8m0=is_deep_gemm_e8m0_used(),
+                    )
+                )
+                layer.w13_weight = Parameter(dg_w13_weight, requires_grad=False)
+                layer.w13_weight_scale_inv = Parameter(
+                    dg_w13_weight_scale_inv, requires_grad=False
+                )
+                layer.w2_weight = Parameter(dg_w2_weight, requires_grad=False)
+                layer.w2_weight_scale_inv = Parameter(
+                    dg_w2_weight_scale_inv, requires_grad=False
+                )
 
         # If checkpoint is fp16, quantize in place.
         elif not self.quant_config.is_checkpoint_fp8_serialized:
             fp8_dtype = current_platform.fp8_dtype()
-            w13_weight = torch.empty_like(layer.w13_weight.data,
-                                          dtype=fp8_dtype)
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
             w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
 
             # Re-initialize w13_scale because we directly quantize
             # merged w13 weights and generate a single scaling factor.
-            layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
-                layer.local_num_experts,
-                dtype=torch.float32,
-                device=w13_weight.device),
-                                                        requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    layer.local_num_experts,
+                    dtype=torch.float32,
+                    device=w13_weight.device,
+                ),
+                requires_grad=False,
+            )
             for expert in range(layer.local_num_experts):
-                w13_weight[expert, :, :], layer.w13_weight_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        layer.w13_weight.data[expert, :, :])
-                w2_weight[expert, :, :], layer.w2_weight_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        layer.w2_weight.data[expert, :, :])
-            layer.w13_weight = torch.nn.Parameter(w13_weight,
-                                                  requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight,
-                                                 requires_grad=False)
+                w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
+                    ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+                )
+                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
+                    ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+                )
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
-                shuffled_w13, shuffled_w2 = shuffle_weights(
-                    layer.w13_weight, layer.w2_weight)
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                    layer.w13_weight, layer.w2_weight
+                )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13,
-                                                      requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2,
-                                                     requires_grad=False)
+                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
+                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
         # If checkpoint is fp8, we need to handle that the
         # MoE kernels require single activation scale and single weight
         # scale for w13 per expert.
@@ -710,46 +924,54 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             # Fp8 moe kernels require a single activation scale.
             # We take the max of all the scales in case they differ.
             if self.quant_config.activation_scheme == "static":
-                if (layer.w13_input_scale is None
-                        or layer.w2_input_scale is None):
+                if layer.w13_input_scale is None or layer.w2_input_scale is None:
                     raise ValueError(
                         "QuantConfig has static quantization, but found "
-                        "activation scales are None.")
-                if (not all_close_1d(layer.w13_input_scale)
-                        or not all_close_1d(layer.w2_input_scale)):
+                        "activation scales are None."
+                    )
+                if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                    layer.w2_input_scale
+                ):
                     logger.warning_once(
                         "Found input_scales that are not equal for "
                         "fp8 MoE layer. Using the maximum across experts "
-                        "for each layer.")
+                        "for each layer."
+                    )
                 layer.w13_input_scale = torch.nn.Parameter(
-                    layer.w13_input_scale.max(), requires_grad=False)
+                    layer.w13_input_scale.max(), requires_grad=False
+                )
                 layer.w2_input_scale = torch.nn.Parameter(
-                    layer.w2_input_scale.max(), requires_grad=False)
+                    layer.w2_input_scale.max(), requires_grad=False
+                )
             if current_platform.is_fp8_fnuz():
                 # Normalize the weights and scales
-                w13_weight, w13_weight_scale, w13_input_scale = \
+                w13_weight, w13_weight_scale, w13_input_scale = (
                     normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w13_weight, layer.w13_weight_scale,
-                        layer.w13_input_scale)
-                w2_weight, w2_weight_scale, w2_input_scale = \
+                        layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+                    )
+                )
+                w2_weight, w2_weight_scale, w2_input_scale = (
                     normalize_e4m3fn_to_e4m3fnuz(
-                        layer.w2_weight, layer.w2_weight_scale,
-                        layer.w2_input_scale)
+                        layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+                    )
+                )
                 # Reset the parameter
-                layer.w13_weight = torch.nn.Parameter(w13_weight,
-                                                      requires_grad=False)
+                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
                 layer.w13_weight_scale = torch.nn.Parameter(
-                    w13_weight_scale, requires_grad=False)
+                    w13_weight_scale, requires_grad=False
+                )
                 if w13_input_scale is not None:
                     layer.w13_input_scale = torch.nn.Parameter(
-                        w13_input_scale, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(w2_weight,
-                                                     requires_grad=False)
-                layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
-                                                           requires_grad=False)
+                        w13_input_scale, requires_grad=False
+                    )
+                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                layer.w2_weight_scale = torch.nn.Parameter(
+                    w2_weight_scale, requires_grad=False
+                )
                 if w2_input_scale is not None:
                     layer.w2_input_scale = torch.nn.Parameter(
-                        w2_input_scale, requires_grad=False)
+                        w2_input_scale, requires_grad=False
+                    )
 
             # Fp8 moe kernel needs single weight scale for w13 per expert.
             # We take the max then dequant and requant each expert.
@@ -760,25 +982,25 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 start = 0
                 for shard_id in range(2):
                     dq_weight = per_tensor_dequantize(
-                        layer.w13_weight[expert_id][start:start +
-                                                    shard_size, :],
-                        layer.w13_weight_scale[expert_id][shard_id])
-                    layer.w13_weight[expert_id][
-                        start:start + shard_size, :], _ = ops.scaled_fp8_quant(
-                            dq_weight, max_w13_scales[expert_id])
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
+                    )
+                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
+                        ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    )
                     start += shard_size
 
             if self.rocm_aiter_moe_enabled:
-                shuffled_w13, shuffled_w2 = shuffle_weights(
-                    layer.w13_weight, layer.w2_weight)
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                    layer.w13_weight, layer.w2_weight
+                )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13,
-                                                      requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2,
-                                                     requires_grad=False)
+                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
+                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
-            layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
-                                                        requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )
 
             if self.flashinfer_moe_backend is not None:
                 # NOTE: weights have to be swapped since the activation is
@@ -786,8 +1008,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 assert not self.block_quant
                 register_moe_scaling_factors(layer)
                 w13_weight = swap_w13_to_w31(layer.w13_weight.data)
-                if self.flashinfer_moe_backend == \
-                    FlashinferMoeBackend.TENSORRT_LLM:
+                if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
                     rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight)
                 layer.w13_weight.data = w13_weight.data
 
@@ -797,38 +1018,24 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w13_input_scale
             del layer.w2_input_scale
 
-        if is_deep_gemm_e8m0_used() and self.block_quant:
-            assert layer.weight_block_size is not None
-            # Re-quantise the expert weights so their scales are UE8M0.
-            block_sz = tuple(layer.weight_block_size)
-            requant_weight_ue8m0_inplace(
-                layer.w13_weight.data,
-                layer.w13_weight_scale_inv.data,
-                block_sz,
-            )
-            requant_weight_ue8m0_inplace(
-                layer.w2_weight.data,
-                layer.w2_weight_scale_inv.data,
-                block_sz,
-            )
-
-            # Ensure column-major TMA alignment expected by DeepGEMM.
-            if expert_weight_is_col_major(layer.w13_weight_scale_inv):
-                layer.w13_weight_scale_inv = get_col_major_tma_aligned_tensor(
-                    layer.w13_weight_scale_inv)
-            if expert_weight_is_col_major(layer.w2_weight_scale_inv):
-                layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor(
-                    layer.w2_weight_scale_inv)
-
-    def maybe_make_prepare_finalize(
-            self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
-        if (self.rocm_aiter_moe_enabled or self.use_marlin
-                or self.flashinfer_moe_backend
-                == FlashinferMoeBackend.TENSORRT_LLM):
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+        if (
+            self.rocm_aiter_moe_enabled
+            or self.use_marlin
+            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+        ):
             return None
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            prepare_finalize = (
-                build_flashinfer_fp8_cutlass_moe_prepare_finalize(self.moe))
+            if self.block_quant:
+                assert self.weight_block_size == [128, 128], (
+                    f"Only support weight_block_size == [128, 128], "
+                    f"got {self.weight_block_size}"
+                )
+            # Wire block-scale flag through prepare/finalize when using CUTLASS
+            prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
+                self.moe,
+                use_deepseek_fp8_block_scale=self.block_quant,
+            )
             logger.debug_once("%s", prepare_finalize.__class__.__name__)
             return prepare_finalize
         else:
@@ -840,60 +1047,90 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         layer: torch.nn.Module,
     ) -> FusedMoEPermuteExpertsUnpermute:
         from vllm.model_executor.layers.fused_moe import (
-            BatchedTritonOrDeepGemmExperts, TritonOrDeepGemmExperts)
+            BatchedDeepGemmExperts,
+            BatchedTritonExperts,
+            TritonOrDeepGemmExperts,
+        )
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
-            "Marlin and ROCm AITER are not supported with all2all yet.")
+            "Marlin and ROCm AITER are not supported with all2all yet."
+        )
 
         assert self.moe_quant_config is not None
 
-        if (prepare_finalize.activation_format ==
-                FusedMoEActivationFormat.BatchedExperts):
-            max_num_tokens_per_rank = (
-                prepare_finalize.max_num_tokens_per_rank())
+        if (
+            prepare_finalize.activation_format
+            == FusedMoEActivationFormat.BatchedExperts
+        ):
+            max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
             assert max_num_tokens_per_rank is not None
+
+            experts_impl = (
+                BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts
+            )
             logger.debug(
-                "BatchedTritonOrDeepGemmExperts(%s): "
-                "max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
-                self.__class__.__name__, max_num_tokens_per_rank,
-                self.weight_block_size, False)
-            return BatchedTritonOrDeepGemmExperts(
+                "%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
+                experts_impl.__name__,
+                self.__class__.__name__,
+                max_num_tokens_per_rank,
+                self.weight_block_size,
+                False,
+            )
+            return experts_impl(
                 max_num_tokens=max_num_tokens_per_rank,
                 num_dispatchers=prepare_finalize.num_dispatchers(),
                 quant_config=self.moe_quant_config,
-                allow_deep_gemm=self.allow_deep_gemm,
             )
+
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+            # Select GEMM experts with block-scale when weights are block-quantized
             experts = select_cutlass_fp8_gemm_impl(
                 self.moe,
                 self.moe_quant_config,
+                use_deepseek_fp8_block_scale=self.block_quant,
             )
             logger.debug_once("Using %s", experts.__class__.__name__)
             return experts
         else:
             logger.debug(
                 "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s",
-                self.__class__.__name__, self.weight_block_size, False)
+                self.__class__.__name__,
+                self.weight_block_size,
+                False,
+            )
             return TritonOrDeepGemmExperts(
                 quant_config=self.moe_quant_config,
                 allow_deep_gemm=self.allow_deep_gemm,
             )
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         if self.use_marlin:
             return None
 
         return fp8_w8a8_moe_quant_config(
-            w1_scale=(layer.w13_weight_scale_inv
-                      if self.block_quant else layer.w13_weight_scale),
-            w2_scale=(layer.w2_weight_scale_inv
-                      if self.block_quant else layer.w2_weight_scale),
+            w1_scale=(
+                layer.w13_weight_scale_inv
+                if self.block_quant
+                else layer.w13_weight_scale
+            ),
+            w2_scale=(
+                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
+            ),
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             block_shape=self.weight_block_size,
         )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -902,40 +1139,45 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
             assert logical_replica_count is not None
             assert isinstance(layer, FusedMoE)
 
-        if (self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-                and self.fused_experts is None):
-            assert activation == 'silu', (
-                f"Expected 'silu' activation but got {activation}")
-            assert scoring_func == 'sigmoid', (
-                f"Expected 'sigmoid' scoring func but got {scoring_func}")
+        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+            assert activation == "silu", (
+                f"Expected 'silu' activation but got {activation}"
+            )
+
             if self.block_quant:
                 import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe  # noqa: E501, F401
-                assert (renormalize and use_grouped_topk
-                        and custom_routing_function is None)
-                result = torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
-                    routing_logits=router_logits.to(torch.float32),
+
+                e_score_correction_bias = (
+                    e_score_correction_bias.to(x.dtype)
+                    if e_score_correction_bias is not None
+                    else None
+                )
+                routing_method_type = layer.routing_method_type
+                return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
+                    routing_logits=router_logits.to(torch.float32)
+                    if routing_method_type == RoutingMethodType.DeepSeekV3
+                    else router_logits,
                     routing_bias=e_score_correction_bias,
                     x=x,
                     w13_weight=layer.w13_weight,
@@ -950,11 +1192,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     expert_offset=layer.ep_rank * layer.local_num_experts,
                     local_num_experts=layer.local_num_experts,
                     block_shape=self.weight_block_size,
+                    routing_method_type=routing_method_type,
                     routed_scaling=routed_scaling_factor,
                 )
             else:
-                assert (not renormalize
-                        and custom_routing_function is not None)
+                assert not renormalize and custom_routing_function is not None
                 result = apply_flashinfer_per_tensor_scale_fp8(
                     layer=layer,
                     hidden_states=x,
@@ -964,10 +1206,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     top_k=top_k,
                     num_expert_group=num_expert_group,
                     topk_group=topk_group,
-                    apply_router_weight_on_input=apply_router_weight_on_input)
+                    apply_router_weight_on_input=apply_router_weight_on_input,
+                )
 
-        zero_expert_num = getattr(layer, 'zero_expert_num', 0)
-        zero_expert_type = getattr(layer, 'zero_expert_type', None)
+        zero_expert_num = getattr(layer, "zero_expert_num", 0)
+        zero_expert_type = getattr(layer, "zero_expert_type", None)
 
         select_result = FusedMoE.select_experts(
             hidden_states=x,
@@ -990,18 +1233,16 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             global_num_experts=global_num_experts,
             zero_expert_num=zero_expert_num,
             zero_expert_type=zero_expert_type,
+            num_fused_shared_experts=layer.num_fused_shared_experts,
         )
 
-        #
-        # Note: the order of checks is important since self.fused_experts
-        # can override fused_experts or cutlass but not rocm or marlin.
-        #
         topk_weights, topk_ids, zero_expert_result = select_result
 
         if self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-                rocm_aiter_fused_experts)
-            assert self.fused_experts is None
+                rocm_aiter_fused_experts,
+            )
+
             result = rocm_aiter_fused_experts(
                 x,
                 layer.w13_weight,
@@ -1011,12 +1252,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 expert_map=expert_map,
-                quant_config=self.moe_quant_config)
+                quant_config=self.moe_quant_config,
+            )
         elif self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
-            assert self.fused_experts is None
-            result = torch.ops.vllm.fused_marlin_moe(
+            assert activation == "silu", f"{activation} not supported for Marlin MoE."
+            result = fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -1031,29 +1271,20 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
-                workspace=layer.workspace)
-        elif self.fused_experts:
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
+                workspace=layer.workspace,
             )
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            assert self.block_quant is None
-            assert (not renormalize and custom_routing_function is not None)
-            assert activation == 'silu', (
-                f"Expected 'silu' activation but got {activation}")
-            assert scoring_func == 'sigmoid', (
-                f"Expected 'sigmoid' scoring func but got {scoring_func}")
-
-            result = flashinfer_cutlass_moe_fp8(
+            assert activation == "silu", (
+                f"Expected 'silu' activation but got {activation}"
+            )
+            if not self.block_quant:
+                assert not renormalize and custom_routing_function is not None
+                assert scoring_func == "sigmoid", (
+                    f"Expected 'sigmoid' scoring func but got {scoring_func}"
+                )
+            # Delegate to CUTLASS FlashInfer path; function already bound with
+            # use_deepseek_fp8_block_scale for block-quant when applicable
+            result = self.flashinfer_moe_fn(
                 x,
                 layer,
                 topk_weights,
@@ -1066,6 +1297,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
+
             result = fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -1080,10 +1312,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 quant_config=self.moe_quant_config,
                 allow_deep_gemm=self.allow_deep_gemm,
                 allow_cutlass_block_scaled_grouped_gemm=(
-                    self.allow_cutlass_block_scaled_grouped_gemm))
+                    self.allow_cutlass_block_scaled_grouped_gemm
+                ),
+            )
         if zero_expert_num != 0 and zero_expert_type is not None:
-            assert not isinstance(result, tuple), \
+            assert not isinstance(result, tuple), (
                 "Shared + zero experts are mutually exclusive not yet supported"
+            )
             return result, zero_expert_result
         else:
             return result
diff --git a/vllm/model_executor/layers/quantization/fp_quant.py b/vllm/model_executor/layers/quantization/fp_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..15a253cef0b7b3f3cd641fe995b99c1bf32c64a8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/fp_quant.py
@@ -0,0 +1,420 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Supports FP-Quant compression, see https://arxiv.org/abs/2509.23202
+
+from typing import Any
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._custom_ops import (
+    cutlass_scaled_fp4_mm,
+    fusedQuantizeMx,
+    fusedQuantizeNv,
+    matmul_mxf4_bf16_tn,
+)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import direct_register_custom_op
+
+
+class FPQuantConfig(QuantizationConfig):
+    """Config class for FPQuant."""
+
+    def __init__(
+        self,
+        hadamard_group_size: int = 32,
+        forward_dtype: str = "mxfp4",
+        forward_method: str = "abs_max",
+        pseudoquantization: bool = False,
+        modules_to_not_convert: list[str] | None = None,
+    ) -> None:
+        super().__init__()
+        self.hadamard_group_size = hadamard_group_size
+        self.forward_dtype = forward_dtype
+        self.forward_method = forward_method
+        self.pseudoquantization = pseudoquantization
+        self.modules_to_not_convert = modules_to_not_convert
+
+        if pseudoquantization:
+            raise ValueError("Pseudoquantization is not supported for vLLM")
+
+    def __repr__(self) -> str:
+        return (
+            f"FPQuantConfig(hadamard_group_size={self.hadamard_group_size}, "
+            f"forward_dtype={self.forward_dtype}, "
+            f"forward_method={self.forward_method}, "
+            f"pseudoquantization={self.pseudoquantization}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
+
+    @classmethod
+    def get_name(cls) -> QuantizationMethods:
+        return "fp_quant"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 100
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []  # no extra configs.
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "FPQuantConfig":
+        hadamard_group_size = cls.get_from_keys(config, ["hadamard_group_size"])
+        forward_dtype = cls.get_from_keys(config, ["forward_dtype"])
+        forward_method = cls.get_from_keys(config, ["forward_method"])
+        pseudoquantization = cls.get_from_keys(config, ["pseudoquantization"])
+        modules_to_not_convert = cls.get_from_keys(config, ["modules_to_not_convert"])
+        return cls(
+            hadamard_group_size,
+            forward_dtype,
+            forward_method,
+            pseudoquantization,
+            modules_to_not_convert,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> LinearMethodBase | None:
+        if self.modules_to_not_convert is not None and any(
+            prefix.endswith(module) for module in self.modules_to_not_convert
+        ):
+            return UnquantizedLinearMethod()
+
+        if isinstance(layer, LinearBase):
+            return FPQuantLinearMethod(self)
+        return None
+
+
+class FPQuantLinearMethod(LinearMethodBase):
+    """Linear method for FPQuant.
+
+    Args:
+        quant_config: The FPQuant quantization config.
+    """
+
+    def __init__(self, quant_config: FPQuantConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del output_size  # Unused.
+        del input_size  # Unused.
+
+        if params_dtype != torch.bfloat16:
+            raise ValueError("Only bfloat16 is currently supported by FPQuant")
+        if input_size_per_partition % self.quant_config.hadamard_group_size != 0:  # noqa: E501
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size. Or other skill issues."
+            )
+
+        assert self.quant_config.forward_dtype in ["mxfp4", "nvfp4"], (
+            "Only mxfp4 and nvfp4 are supported for now"
+        )
+        if self.quant_config.forward_dtype == "mxfp4":
+            group_size = 32
+        elif self.quant_config.forward_dtype == "nvfp4":
+            group_size = 16
+        else:
+            raise ValueError(
+                f"Unsupported forward_dtype: {self.quant_config.forward_dtype}"
+            )
+
+        qweight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+                "packed_dim": 1,
+                "pack_factor": 2,
+            }
+            | extra_weight_attrs,
+        )
+        layer.register_parameter("qweight", qweight)
+
+        scales = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition // group_size,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            scales,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+                "packed_dim": 1,
+                "pack_factor": group_size,
+            }
+            | extra_weight_attrs,
+        )
+        layer.register_parameter("scales", scales)
+
+        weight_global_scale = Parameter(
+            torch.empty(1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            weight_global_scale, {"ignore_warning": True} | extra_weight_attrs
+        )
+        layer.register_parameter("weight_global_scale", weight_global_scale)
+
+        act_global_scale = Parameter(
+            torch.empty(1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            act_global_scale, {"ignore_warning": True} | extra_weight_attrs
+        )
+        layer.register_parameter("act_global_scale", act_global_scale)
+
+        forward_hadamard_matrix = Parameter(
+            torch.empty(
+                self.quant_config.hadamard_group_size,
+                self.quant_config.hadamard_group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            forward_hadamard_matrix, {"ignore_warning": True} | extra_weight_attrs
+        )
+        layer.register_parameter("forward_hadamard_matrix", forward_hadamard_matrix)
+
+        backward_hadamard_matrix = Parameter(
+            torch.empty(
+                self.quant_config.hadamard_group_size,
+                self.quant_config.hadamard_group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            backward_hadamard_matrix, {"ignore_warning": True} | extra_weight_attrs
+        )
+        layer.register_parameter("backward_hadamard_matrix", backward_hadamard_matrix)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return quantized_forward(
+            x,
+            layer.qweight,
+            layer.scales,
+            layer.weight_global_scale,
+            layer.act_global_scale,
+            bias,
+            layer.forward_hadamard_matrix,
+            self.quant_config.forward_method,
+            self.quant_config.forward_dtype,
+        )
+
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+def fused_quantize_mx(
+    x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return fusedQuantizeMx(x_flat, hadamard_matrix, method=forward_method)
+
+
+def fused_quantize_mx_fake(x_flat, hadamard_matrix, forward_method):
+    rows, cols = x_flat.size(0), x_flat.size(1) // 32
+    padded_rows = ((rows + 128 - 1) // 128) * 128
+    padded_cols = ((cols + 4 - 1) // 4) * 4
+
+    xh_e2m1 = torch.empty(
+        x_flat.size(0), x_flat.size(1) // 2, dtype=torch.uint8, device=x_flat.device
+    )
+    xh_e8m0 = torch.empty(
+        padded_rows, padded_cols, dtype=torch.float8_e8m0fnu, device=x_flat.device
+    )
+
+    return xh_e2m1, xh_e8m0
+
+
+direct_register_custom_op(
+    op_name="fused_quantize_mx",
+    op_func=fused_quantize_mx,
+    mutates_args=[],
+    fake_impl=fused_quantize_mx_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
+
+
+def matmul_mxf4_bf16(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    xs: torch.Tensor,
+    ws: torch.Tensor,
+    alpha: torch.Tensor,
+) -> torch.Tensor:
+    return matmul_mxf4_bf16_tn(
+        x,
+        w,
+        to_blocked(xs, backend="triton").view(torch.float8_e8m0fnu),
+        to_blocked(ws, backend="triton").view(torch.float8_e8m0fnu),
+        alpha,
+    )
+
+
+def matmul_mxf4_bf16_fake(x, w, xs, ws, alpha):
+    return torch.empty(*x.shape[:-1], w.shape[0], dtype=torch.bfloat16, device=x.device)
+
+
+direct_register_custom_op(
+    op_name="matmul_mxf4_bf16",
+    op_func=matmul_mxf4_bf16,
+    mutates_args=[],
+    fake_impl=matmul_mxf4_bf16_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
+
+
+def fused_quantize_nv(
+    x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, global_scale: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return fusedQuantizeNv(x_flat, hadamard_matrix, global_scale)
+
+
+def fused_quantize_nv_fake(x_flat, hadamard_matrix, global_scale):
+    rows, cols = x_flat.size(0), x_flat.size(1) // 16
+    padded_rows = ((rows + 128 - 1) // 128) * 128
+    padded_cols = ((cols + 4 - 1) // 4) * 4
+
+    xh_e2m1 = torch.empty(
+        x_flat.size(0), x_flat.size(1) // 2, dtype=torch.uint8, device=x_flat.device
+    )
+    xh_e8m0 = torch.empty(
+        padded_rows, padded_cols, dtype=torch.float8_e4m3fn, device=x_flat.device
+    )
+
+    return xh_e2m1, xh_e8m0
+
+
+direct_register_custom_op(
+    op_name="fused_quantize_nv",
+    op_func=fused_quantize_nv,
+    mutates_args=[],
+    fake_impl=fused_quantize_nv_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
+
+
+def matmul_nvf4_bf16(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    xs: torch.Tensor,
+    ws: torch.Tensor,
+    alpha: torch.Tensor,
+) -> torch.Tensor:
+    return cutlass_scaled_fp4_mm(
+        x,
+        w,
+        to_blocked(xs, backend="triton")
+        .view(torch.float8_e4m3fn)
+        .view(-1, x.shape[1] // 8),  # *2//16
+        to_blocked(ws, backend="triton")
+        .view(torch.float8_e4m3fn)
+        .view(-1, x.shape[1] // 8),
+        alpha,
+        torch.bfloat16,
+    )
+
+
+def matmul_nvf4_bf16_fake(x, w, xs, ws, alpha):
+    return torch.empty(*x.shape[:-1], w.shape[0], dtype=torch.bfloat16, device=x.device)
+
+
+direct_register_custom_op(
+    op_name="matmul_nvf4_bf16",
+    op_func=matmul_nvf4_bf16,
+    mutates_args=[],
+    fake_impl=matmul_nvf4_bf16_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
+
+
+def quantized_forward(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_global_scale: torch.Tensor,
+    act_global_scale: torch.Tensor,
+    bias: torch.Tensor | None,
+    forward_hadamard_matrix: torch.Tensor,
+    forward_method: str,
+    forward_dtype: str,
+) -> torch.Tensor:
+    x_flat = x.contiguous().flatten(end_dim=-2)
+
+    if forward_dtype == "mxfp4":
+        x_flat_q, x_flat_scales = torch.ops.vllm.fused_quantize_mx(
+            x_flat, forward_hadamard_matrix, forward_method
+        )
+        y = torch.ops.vllm.matmul_mxf4_bf16(
+            x_flat_q,
+            qweight,
+            x_flat_scales,
+            weight_scales,
+            1 / (weight_global_scale * act_global_scale),
+        )
+    elif forward_dtype == "nvfp4":
+        x_flat_q, x_flat_scales = torch.ops.vllm.fused_quantize_nv(
+            x_flat, forward_hadamard_matrix, act_global_scale
+        )
+        y = torch.ops.vllm.matmul_nvf4_bf16(
+            x_flat_q,
+            qweight,
+            x_flat_scales,
+            weight_scales,
+            1 / (weight_global_scale * act_global_scale),
+        )
+    else:
+        raise ValueError(f"Unsupported forward_dtype: {forward_dtype}")
+
+    y = y.view(*x.shape[:-1], y.shape[-1])
+    if bias is not None:
+        y += bias
+
+    return y
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index da1688808bb5ac4af5658a295e4cfcdd5e02f793..caabcd0ca0ee5581b540615b3af9a457c5d33c2f 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import gguf
 import torch
@@ -10,19 +11,24 @@ from torch.nn.parameter import Parameter, UninitializedParameter
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
-from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
-                                                        FusedMoEMethodBase)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 
 logger = init_logger(__name__)
 
@@ -30,13 +36,12 @@ logger = init_logger(__name__)
 class GGUFConfig(QuantizationConfig):
     """Config class for GGUF."""
 
-    def __init__(self,
-                 unquantized_modules: Optional[list[str]] = None) -> None:
+    def __init__(self, unquantized_modules: list[str] | None = None) -> None:
         super().__init__()
         self.unquantized_modules = unquantized_modules or []
 
     def __repr__(self) -> str:
-        return ("GGUFConfig()")
+        return "GGUFConfig()"
 
     def get_name(self) -> QuantizationMethods:
         return "gguf"
@@ -56,8 +61,9 @@ class GGUFConfig(QuantizationConfig):
     def from_config(cls, config: dict[str, Any]) -> "GGUFConfig":
         return cls()
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             if is_layer_skipped_gguf(prefix, self.unquantized_modules):
                 return UnquantizedLinearMethod()
@@ -108,8 +114,9 @@ MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
 MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
 
 
-def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor,
-                        qweight_type: int) -> torch.Tensor:
+def _fused_mul_mat_gguf(
+    x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
+) -> torch.Tensor:
     if qweight_type in IMATRIX_QUANT_TYPES:
         mmvq_safe = 8 if qweight.shape[0] > 5120 else 16
     else:
@@ -117,10 +124,7 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor,
     # HACK: when doing chunked prefill we don't generate output tokens
     # so input to logits generator is empty which causes invalid parameter
     if x.shape[0] == 0:
-        return torch.empty(x.shape[0],
-                           qweight.shape[0],
-                           dtype=x.dtype,
-                           device=x.device)
+        return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
         return x @ qweight.T
@@ -141,8 +145,7 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor,
         # Might be useful if llama.cpp adds a new quantization type.
         # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
         qweight_type = WeightType(qweight_type)
-        raise NotImplementedError(
-            f"Unsupported GGUF quantization type: {qweight_type}")
+        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
     return y
 
 
@@ -151,10 +154,7 @@ def _fused_mul_mat_gguf_fake(
     qweight: torch.Tensor,
     qweight_type: int,
 ) -> torch.Tensor:
-    return torch.empty(x.shape[0],
-                       qweight.shape[0],
-                       dtype=x.dtype,
-                       device=x.device)
+    return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
 
 
 try:
@@ -179,10 +179,9 @@ def _fused_moe_gguf(
     qweight_type2: int,
     activation: str,
 ) -> torch.Tensor:
-
     def act(x: torch.Tensor):
         d = x.shape[-1] // 2
-        output_shape = (x.shape[:-1] + (d, ))
+        output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
         if activation == "silu":
             torch.ops._C.silu_and_mul(out, x)
@@ -193,50 +192,73 @@ def _fused_moe_gguf(
         return out
 
     # lazy import to avoid triggering triton import in CPU backend
-    from vllm.model_executor.layers.fused_moe.fused_moe import (
-        moe_align_block_size)
+    from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
 
     out_hidden_states = torch.empty_like(x)
     # unless we decent expert reuse we are better off running moe_vec kernel
-    if (qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES
-            and x.shape[0] > 64):
+    if (
+        qweight_type2 in MMQ_QUANT_TYPES
+        and qweight_type in MMQ_QUANT_TYPES
+        and x.shape[0] > 64
+    ):
         num_tokens, _ = x.shape
         E, N, _ = w1.shape
         top_k = topk_ids.shape[1]
         BLOCK_SIZE = ops.ggml_moe_get_block_size(qweight_type)
 
-        sorted_token_ids, expert_ids, num_tokens_post_padded = \
-                moe_align_block_size(topk_ids, BLOCK_SIZE, E)
-        out = ops.ggml_moe_a8(x, w1, sorted_token_ids, expert_ids,
-                              num_tokens_post_padded, qweight_type, N, top_k,
-                              num_tokens)
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            topk_ids, BLOCK_SIZE, E
+        )
+        out = ops.ggml_moe_a8(
+            x,
+            w1,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            qweight_type,
+            N,
+            top_k,
+            num_tokens,
+        )
         out = act(out)
-        out = ops.ggml_moe_a8(out, w2, sorted_token_ids, expert_ids,
-                              num_tokens_post_padded, qweight_type2,
-                              w2.shape[1], 1, num_tokens * top_k)
+        out = ops.ggml_moe_a8(
+            out,
+            w2,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            qweight_type2,
+            w2.shape[1],
+            1,
+            num_tokens * top_k,
+        )
         out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
-            topk_weights.view(num_tokens, top_k, 1))
+            topk_weights.view(num_tokens, top_k, 1)
+        )
         ops.moe_sum(out, out_hidden_states)
     elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES:
         num_tokens, _ = x.shape
         E, N, _ = w1.shape
         top_k = topk_ids.shape[1]
 
-        out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N,
-                                  num_tokens)
+        out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N, num_tokens)
         out = act(out)
 
-        out = ops.ggml_moe_a8_vec(out, w2, topk_ids, 1, qweight_type2,
-                                  w2.shape[1], num_tokens * top_k)
+        out = ops.ggml_moe_a8_vec(
+            out, w2, topk_ids, 1, qweight_type2, w2.shape[1], num_tokens * top_k
+        )
         out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
-            topk_weights.view(num_tokens, top_k, 1))
+            topk_weights.view(num_tokens, top_k, 1)
+        )
         ops.moe_sum(out, out_hidden_states)
     else:
-        logger.warning_once("There is no support for fast MoE kernel "
-                            "for current quantization method. "
-                            "Falling back to slow implementation. ")
+        logger.warning_once(
+            "There is no support for fast MoE kernel "
+            "for current quantization method. "
+            "Falling back to slow implementation. "
+        )
         for tok, (w, idx) in enumerate(zip(topk_weights, topk_ids)):
-            inp = x[tok].reshape((1, ) + x.shape[1:])
+            inp = x[tok].reshape((1,) + x.shape[1:])
             current_hidden_state = None
             for ww, ii in zip(w, idx):
                 expert_up = w1[ii]
@@ -245,8 +267,9 @@ def _fused_moe_gguf(
                 out = act(out)
 
                 expert_down = w2[ii]
-                current_state = fused_mul_mat_gguf(out, expert_down,
-                                                   qweight_type2).mul_(ww)
+                current_state = fused_mul_mat_gguf(
+                    out, expert_down, qweight_type2
+                ).mul_(ww)
                 if current_hidden_state is None:
                     current_hidden_state = current_state
                 else:
@@ -285,22 +308,22 @@ def _apply_gguf_embedding(
     qweight: torch.Tensor,
     qweight_type: int,
     hidden_size: int,
-    dtype: Optional[torch.dtype] = None,
+    dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     if qweight_type in UNQUANTIZED_TYPES:
         return torch.embedding(qweight, x)
     elif qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         x_flat = x.flatten()
-        assert (hidden_size == qweight.shape[1] // type_size * block_size)
+        assert hidden_size == qweight.shape[1] // type_size * block_size
         quant = torch.index_select(qweight, dim=0, index=x_flat)
-        dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size,
-                                      x_flat.shape[0], dtype)
+        dequant = ops.ggml_dequantize(
+            quant, qweight_type, hidden_size, x_flat.shape[0], dtype
+        )
         return dequant.view(*x.shape, hidden_size)
     else:
         qweight_type = WeightType(qweight_type)
-        raise NotImplementedError(
-            f"Unsupported GGUF quantization type: {qweight_type}")
+        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
 
 
 def _apply_gguf_embedding_fake(
@@ -308,7 +331,7 @@ def _apply_gguf_embedding_fake(
     qweight: torch.Tensor,
     qweight_type: int,
     hidden_size: int,
-    dtype: Optional[torch.dtype] = None,
+    dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device)
 
@@ -335,18 +358,24 @@ class GGUFLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: GGUFConfig):
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         self.params_dtype = params_dtype
         output_size_per_partition = sum(output_partition_sizes)
 
         tensor_shape = (output_size_per_partition, input_size_per_partition)
         qweight = GGUFUninitializedParameter(requires_grad=False)
         set_weight_attrs(
-            qweight, {
+            qweight,
+            {
                 "input_dim": 1,
                 "output_dim": 0,
                 "tensor_shape": tensor_shape,
@@ -354,31 +383,34 @@ class GGUFLinearMethod(LinearMethodBase):
                 "data_container": [],
                 "shard_id": [],
                 "shard_id_map": {},
-            })
+            },
+        )
         set_weight_attrs(qweight, extra_weight_attrs)
         layer.register_parameter("qweight", qweight)
 
-        qweight_type = Parameter(torch.empty(len(output_partition_sizes),
-                                             dtype=torch.uint8),
-                                 requires_grad=False)
+        qweight_type = Parameter(
+            torch.empty(len(output_partition_sizes), dtype=torch.uint8),
+            requires_grad=False,
+        )
         set_weight_attrs(
-            qweight_type, {
+            qweight_type,
+            {
                 "is_gguf_weight_type": True,
                 "weight_type": 0,
                 "shard_weight_type": {},
-                "ignore_warning": True
-            })
+                "ignore_warning": True,
+            },
+        )
         set_weight_attrs(qweight_type, extra_weight_attrs)
         layer.register_parameter("qweight_type", qweight_type)
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         qweight_type = layer.qweight_type.weight_type
-        if not (qweight_type in UNQUANTIZED_TYPES
-                or qweight_type in DEQUANT_TYPES):
+        if not (qweight_type in UNQUANTIZED_TYPES or qweight_type in DEQUANT_TYPES):
             qweight_type = WeightType(qweight_type)
             raise ValueError(
-                f"Unsupported GGUF quantization type {qweight_type} in "
-                f"layer {layer}.")
+                f"Unsupported GGUF quantization type {qweight_type} in layer {layer}."
+            )
         # For MergedColumnParallelLinear and QKVParallelLinear, we need to
         # materialize the padded weight parameter for CUDA Graph compatibility.
         self._create_padded_weight_param(layer)
@@ -391,22 +423,22 @@ class GGUFLinearMethod(LinearMethodBase):
         if len(data_container := qweight.data_container) > 1:
             dtype = {data.dtype for data in data_container}
             assert len(dtype) == 1, ValueError(
-                f"Data container has mixed dtypes: {dtype}")
+                f"Data container has mixed dtypes: {dtype}"
+            )
             dtype = next(iter(dtype))
             # concat dim0 and pad dim1
             padded_side = max(x.size(1) for x in data_container)
             concat_side = sum(x.size(0) for x in data_container)
             # Pad the quantized weights to dense tensor, and create a map
             # with the location of each shard in the padded tensor.
-            padded_data = torch.zeros((concat_side, padded_side),
-                                      dtype=dtype,
-                                      device=qweight.device)
+            padded_data = torch.zeros(
+                (concat_side, padded_side), dtype=dtype, device=qweight.device
+            )
             # (dim0_start, dim0_end, dim1_size)
             shard_offset_map = dict[str, tuple[int, int, int]]()
             for idx in shard_id:
                 id_in_container = shard_id_map[idx]
-                start = sum(
-                    x.size(0) for x in data_container[:id_in_container])
+                start = sum(x.size(0) for x in data_container[:id_in_container])
                 end = start + data_container[id_in_container].size(0)
                 size = data_container[id_in_container].size(1)
                 padded_data[start:end, :size] = data_container[id_in_container]
@@ -414,14 +446,15 @@ class GGUFLinearMethod(LinearMethodBase):
             qweight.data_container.clear()
             padded_param = Parameter(padded_data, requires_grad=False)
             set_weight_attrs(padded_param, vars(qweight))
-            set_weight_attrs(padded_param,
-                             {"shard_offset_map": shard_offset_map})
+            set_weight_attrs(padded_param, {"shard_offset_map": shard_offset_map})
             layer.register_parameter("qweight", padded_param)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         shard_id = layer.qweight.shard_id
 
         if shard_id:
@@ -434,8 +467,9 @@ class GGUFLinearMethod(LinearMethodBase):
                 qweight_type = layer.qweight_type.shard_weight_type[idx]
                 result.append(
                     fused_mul_mat_gguf(
-                        x, qweight[start:end, :offset].contiguous(),
-                        qweight_type))
+                        x, qweight[start:end, :offset].contiguous(), qweight_type
+                    )
+                )
             out = torch.cat(result, axis=1)
         else:
             qweight = layer.qweight
@@ -461,63 +495,71 @@ class GGUFMoEMethod(FusedMoEMethodBase):
         super().__init__(moe)
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
-        tensor_shape = (num_experts, 2 * intermediate_size_per_partition,
-                        hidden_size)
-        #gate up proj
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        tensor_shape = (num_experts, 2 * intermediate_size_per_partition, hidden_size)
+        # gate up proj
         w13_qweight = GGUFUninitializedParameter(requires_grad=False)
         set_weight_attrs(
-            w13_qweight, {
+            w13_qweight,
+            {
                 "input_dim": 1,
                 "output_dim": 0,
                 "tensor_shape": tensor_shape,
                 "is_gguf_weight": True,
                 "data_container": [],
-            })
+            },
+        )
         set_weight_attrs(w13_qweight, extra_weight_attrs)
         layer.register_parameter("w13_qweight", w13_qweight)
 
-        w13_qweight_type = Parameter(torch.empty(1, dtype=torch.uint8),
-                                     requires_grad=False)
-        set_weight_attrs(w13_qweight_type, {
-            "is_gguf_weight_type": True,
-            "weight_type": 0,
-            "ignore_warning": True
-        })
+        w13_qweight_type = Parameter(
+            torch.empty(1, dtype=torch.uint8), requires_grad=False
+        )
+        set_weight_attrs(
+            w13_qweight_type,
+            {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True},
+        )
         set_weight_attrs(w13_qweight_type, extra_weight_attrs)
         layer.register_parameter("w13_qweight_type", w13_qweight_type)
 
-        tensor_shape = (num_experts, intermediate_size_per_partition,
-                        hidden_size)
-        #gate down proj
+        tensor_shape = (num_experts, intermediate_size_per_partition, hidden_size)
+        # gate down proj
         w2_qweight = GGUFUninitializedParameter(requires_grad=False)
         set_weight_attrs(
-            w2_qweight, {
+            w2_qweight,
+            {
                 "input_dim": 1,
                 "output_dim": 0,
                 "tensor_shape": tensor_shape,
                 "is_gguf_weight": True,
                 "data_container": [],
-            })
+            },
+        )
         set_weight_attrs(w2_qweight, extra_weight_attrs)
         layer.register_parameter("w2_qweight", w2_qweight)
 
-        w2_qweight_type = Parameter(torch.empty(1, dtype=torch.uint8),
-                                    requires_grad=False)
-        set_weight_attrs(w2_qweight_type, {
-            "is_gguf_weight_type": True,
-            "weight_type": 0,
-            "ignore_warning": True
-        })
+        w2_qweight_type = Parameter(
+            torch.empty(1, dtype=torch.uint8), requires_grad=False
+        )
+        set_weight_attrs(
+            w2_qweight_type,
+            {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True},
+        )
 
         set_weight_attrs(w2_qweight_type, extra_weight_attrs)
         layer.register_parameter("w2_qweight_type", w2_qweight_type)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -528,32 +570,30 @@ class GGUFMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `GGUFMoEMethod` yet.")
+            raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
 
         assert activation == "silu", "Only SiLU activation is supported."
         if apply_router_weight_on_input:
             raise NotImplementedError(
                 "Apply router weight on input is not supported for"
-                "fused GGUF MoE method.")
+                "fused GGUF MoE method."
+            )
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -567,11 +607,18 @@ class GGUFMoEMethod(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
-        return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight,
-                              topk_weights, topk_ids,
-                              layer.w13_qweight_type.weight_type,
-                              layer.w2_qweight_type.weight_type, activation)
+            indices_type=self.topk_indices_dtype,
+        )
+        return fused_moe_gguf(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            topk_weights,
+            topk_ids,
+            layer.w13_qweight_type.weight_type,
+            layer.w2_qweight_type.weight_type,
+            activation,
+        )
 
 
 class GGUFEmbeddingMethod(GGUFLinearMethod):
@@ -581,17 +628,14 @@ class GGUFEmbeddingMethod(GGUFLinearMethod):
         quant_config: The GGUF quantization config.
     """
 
-    def embedding(self, layer: torch.nn.Module,
-                  x: torch.Tensor) -> torch.Tensor:
+    def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
         qweight = layer.qweight
         qweight_type = layer.qweight_type.weight_type
         hidden_size = qweight.tensor_shape[1]
 
-        return apply_gguf_embedding(x,
-                                    qweight,
-                                    qweight_type,
-                                    hidden_size,
-                                    dtype=self.params_dtype)
+        return apply_gguf_embedding(
+            x, qweight, qweight_type, hidden_size, dtype=self.params_dtype
+        )
 
 
 class GGUFUninitializedParameter(UninitializedParameter):
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index 0335b9c46b4d86ce7f5125487d0e913a402f14a1..2ad28048cdce4bf0d74064dd184234d03b545c42 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -4,27 +4,40 @@
 import enum
 from enum import Enum
 from fractions import Fraction
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Union
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.linear import LinearMethodBase
-from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
-    get_linear_quant_method)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedColumnParameter,
-                                           PackedvLLMParameter,
-                                           RowvLLMParameter)
+    get_linear_quant_method,
+)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    RowvLLMParameter,
+)
 from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.model_executor.models.utils import WeightsMapper
+else:
+    QuantizationMethods = str
+
+logger = init_logger(__name__)
 
 
 class GPTQConfig(QuantizationConfig):
@@ -39,9 +52,10 @@ class GPTQConfig(QuantizationConfig):
         group_size: int,
         desc_act: bool,
         lm_head_quantized: bool,
-        dynamic: dict[str, dict[str, Union[int, bool]]],
+        dynamic: dict[str, dict[str, int | bool]],
         autoround_version: str = "",
-        modules_in_block_to_quantize: Optional[list[str]] = None,
+        modules_in_block_to_quantize: list[str] | None = None,
+        checkpoint_format: str = "",
     ) -> None:
         # GPTQModel use `dynamic` config property to allow per module
         # quantization config so each module can be individually optimized.
@@ -77,13 +91,26 @@ class GPTQConfig(QuantizationConfig):
         if self.weight_bits not in [2, 3, 4, 8]:
             raise ValueError(
                 "Currently, only 2/3/4/8-bit weight quantization is "
-                f"supported for GPTQ, but got {self.weight_bits} bits.")
+                f"supported for GPTQ, but got {self.weight_bits} bits."
+            )
+        # Somehow gptq_gemm 4-bit is buggy, maybe fix it in the future.
+        # For now, show a warning, since gptq_marlin will be used by default.
+        if self.weight_bits == 4:
+            logger.warning_once(
+                "Currently, the 4-bit gptq_gemm kernel for GPTQ is buggy. "
+                "Please switch to gptq_marlin or gptq_bitblas."
+            )
 
         self.modules_in_block_to_quantize = modules_in_block_to_quantize or []
 
         # used to identify GPTQ model quantized by autoround
         self.autoround_version = autoround_version
 
+        # GPTQ v1 and v2 format deals with zero points differently.
+        # Currently GPTQModel stores v1 format checkpoints by default,
+        # but provides the option to set `format="gptq_v2"` in `QuantizeConfig`.
+        self.checkpoint_format = checkpoint_format
+
     def __repr__(self) -> str:
         return (
             f"GPTQConfig(weight_bits={self.weight_bits}, "
@@ -91,7 +118,8 @@ class GPTQConfig(QuantizationConfig):
             f"desc_act={self.desc_act}), "
             f"lm_head_quantized={self.lm_head_quantized}, "
             f"dynamic={self.dynamic}, "
-            f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})"
+            f"modules_in_block_to_quantize={self.modules_in_block_to_quantize}), "
+            f"checkpoint_format={self.checkpoint_format})"
         )
 
     @classmethod
@@ -119,22 +147,35 @@ class GPTQConfig(QuantizationConfig):
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
-        autoround_version = cls.get_from_keys_or(config, ["autoround_version"],
-                                                 default="")
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        autoround_version = cls.get_from_keys_or(
+            config, ["autoround_version"], default=""
+        )
         modules_in_block_to_quantize = cls.get_from_keys_or(
-            config, ["modules_in_block_to_quantize"], default=None)
-        return cls(weight_bits, group_size, desc_act, lm_head_quantized,
-                   dynamic, autoround_version, modules_in_block_to_quantize)
+            config, ["modules_in_block_to_quantize"], default=None
+        )
+        checkpoint_format = cls.get_from_keys_or(
+            config, ["checkpoint_format"], default=""
+        )
+        return cls(
+            weight_bits,
+            group_size,
+            desc_act,
+            lm_head_quantized,
+            dynamic,
+            autoround_version,
+            modules_in_block_to_quantize,
+            checkpoint_format,
+        )
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
+    ) -> Union["GPTQLinearMethod", "QuantizeMethodBase"] | None:
         if isinstance(layer, FusedMoE):
             # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
             from .moe_wna16 import MoeWNA16Config
 
+            # TODO: maybe update this for GPTQv2 format checkpoints
             config = {
                 "quant_method": "gptq",
                 "bits": self.weight_bits,
@@ -142,43 +183,40 @@ class GPTQConfig(QuantizationConfig):
                 "sym": True,  # GPTQ typically uses symmetric quantization
                 "lm_head": False,
             }
-            return MoeWNA16Config.from_config(config).get_quant_method(
-                layer, prefix)
+            return MoeWNA16Config.from_config(config).get_quant_method(layer, prefix)
 
         return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
 
-    def apply_vllm_mapper(self, hf_to_vllm_mapper):
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         if self.modules_in_block_to_quantize is not None:
             self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list(
-                self.modules_in_block_to_quantize)
+                self.modules_in_block_to_quantize
+            )
 
-    def maybe_update_config(self,
-                            model_name: str,
-                            revision: Optional[str] = None):
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
         if self.modules_in_block_to_quantize:
             if is_list_of(self.modules_in_block_to_quantize, list):
                 # original modules_in_block_to_quantize: list[list[str]]
                 # flatten original modules_in_block_to_quantize
                 self.modules_in_block_to_quantize = [
-                    item for sublist in self.modules_in_block_to_quantize
+                    item
+                    for sublist in self.modules_in_block_to_quantize
                     for item in sublist
                 ]
             return
 
         unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-        metadata = get_safetensors_params_metadata(model_name,
-                                                   revision=revision)
+        metadata = get_safetensors_params_metadata(model_name, revision=revision)
         quant_layers: set[str] = {
             param_name.rsplit(".", 1)[0]
             for param_name, info in metadata.items()
-            if (dtype := info.get('dtype', None))
+            if (dtype := info.get("dtype", None))
             and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
         }
         self.modules_in_block_to_quantize = list(quant_layers)
 
 
 class ExllamaState(Enum):
-
     UNUSED = enum.auto()
     UNINITIALIZED = enum.auto()
     READY = enum.auto()
@@ -194,6 +232,9 @@ class GPTQLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: GPTQConfig):
         self.quant_config = quant_config
 
+        # GPTQ v1 and v2 format deals with zero points differently
+        self.use_v2_format = quant_config.checkpoint_format == "gptq_v2"
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -210,14 +251,15 @@ class GPTQLinearMethod(LinearMethodBase):
             raise ValueError(
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
-                "tensor parallel size.")
+                "tensor parallel size."
+            )
         output_size_per_partition = sum(output_partition_sizes)
-        if (output_size_per_partition % self.quant_config.pack_factor.numerator
-                != 0):
+        if output_size_per_partition % self.quant_config.pack_factor.numerator != 0:
             raise ValueError(
                 "The output size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
-                "tensor parallel size.")
+                "tensor parallel size."
+            )
 
         if self.quant_config.group_size != -1:
             group_size = self.quant_config.group_size
@@ -226,8 +268,10 @@ class GPTQLinearMethod(LinearMethodBase):
         exllama_state = ExllamaState.UNINITIALIZED
         scale_and_zero_size = input_size // group_size
         scale_and_zero_input_dim = None
-        if (input_size != input_size_per_partition
-                and self.quant_config.group_size != -1):
+        if (
+            input_size != input_size_per_partition
+            and self.quant_config.group_size != -1
+        ):
             # For act-order models, we cannot use Exllama for row parallel layer
             if self.quant_config.desc_act:
                 exllama_state = ExllamaState.UNUSED
@@ -246,56 +290,56 @@ class GPTQLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=0,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
-
-        g_idx = RowvLLMParameter(data=torch.tensor(
-            [
-                i // self.quant_config.group_size
-                for i in range(input_size_per_partition)
-            ],
-            dtype=torch.int32,
-        ),
-                                 input_dim=0,
-                                 weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
+
+        g_idx = RowvLLMParameter(
+            data=torch.tensor(
+                [
+                    i // self.quant_config.group_size
+                    for i in range(input_size_per_partition)
+                ],
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
         qzeros_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 scale_and_zero_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
         weight_scale_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 scale_and_zero_size,
                 output_size_per_partition,
                 dtype=params_dtype,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
         if scale_and_zero_input_dim is None:
-            scales = ChannelQuantScaleParameter(output_dim=1,
-                                                **weight_scale_args)
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
             qzeros = PackedColumnParameter(
                 output_dim=1,
                 packed_dim=1,
                 packed_factor=self.quant_config.pack_factor,
-                **qzeros_args)
+                **qzeros_args,
+            )
 
         else:
-            scales = GroupQuantScaleParameter(output_dim=1,
-                                              input_dim=0,
-                                              **weight_scale_args)
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
             qzeros = PackedvLLMParameter(
                 input_dim=0,
                 output_dim=1,
                 packed_dim=1,
                 packed_factor=self.quant_config.pack_factor,
-                **qzeros_args)
+                **qzeros_args,
+            )
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("g_idx", g_idx)
@@ -317,24 +361,33 @@ class GPTQLinearMethod(LinearMethodBase):
             if self.quant_config.desc_act:
                 layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int)
             else:
-                layer.g_idx.data = torch.empty((0, ),
-                                               dtype=torch.int,
-                                               device=layer.g_idx.device)
+                layer.g_idx.data = torch.empty(
+                    (0,), dtype=torch.int, device=layer.g_idx.device
+                )
             layer.exllama_state = ExllamaState.READY
-            ops.gptq_shuffle(layer.qweight, layer.g_idx,
-                             self.quant_config.weight_bits)
-
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
+            ops.gptq_shuffle(layer.qweight, layer.g_idx, self.quant_config.weight_bits)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1],)
         reshaped_x = x.reshape(-1, x.shape[-1])
 
-        output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
-                               layer.scales, layer.g_idx,
-                               layer.exllama_state == ExllamaState.READY,
-                               self.quant_config.weight_bits)
+        # GPTQ v1 and v2 format checkpoints deals with zero points differently,
+        # and require different gemm kernels.
+        output = ops.gptq_gemm(
+            reshaped_x,
+            layer.qweight,
+            layer.qzeros,
+            layer.scales,
+            layer.g_idx,
+            layer.exllama_state == ExllamaState.READY,
+            self.use_v2_format,
+            self.quant_config.weight_bits,
+        )
         if bias is not None:
             output.add_(bias)
         return output.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index 64622925864820bbd8c0d3313a74d19437994f1a..92f10bfd5c02df14e9da97e055e4cd1613de8584 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -7,26 +7,39 @@ from packaging import version
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               set_weight_attrs)
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    set_weight_attrs,
+)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
-    BitBLASLinearKernel, MPLinearLayerConfig)
+    BitBLASLinearKernel,
+    MPLinearLayerConfig,
+)
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    BITBLAS_SUPPORTED_NUM_BITS as GPTQ_BITBLAS_SUPPORTED_NUM_BITS)
+    BITBLAS_SUPPORTED_NUM_BITS as GPTQ_BITBLAS_SUPPORTED_NUM_BITS,
+)
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    BITBLAS_SUPPORTED_SYM as GPTQ_BITBLAS_SUPPORTED_SYM)
+    BITBLAS_SUPPORTED_SYM as GPTQ_BITBLAS_SUPPORTED_SYM,
+)
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    MINIMUM_BITBLAS_VERSION, bitblas_repeat_scales_on_all_ranks,
-    check_bitblas_supported, verify_bitblas_supported)
+    MINIMUM_BITBLAS_VERSION,
+    bitblas_repeat_scales_on_all_ranks,
+    check_bitblas_supported,
+    verify_bitblas_supported,
+)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedColumnParameter,
-                                           PackedvLLMParameter,
-                                           RowvLLMParameter)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    RowvLLMParameter,
+)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
@@ -58,17 +71,19 @@ class GPTQBitBLASConfig(QuantizationConfig):
         group_size: int,
         desc_act: bool,
         is_sym: bool,
-        quant_method: Optional[str],
+        quant_method: str | None,
         lm_head_quantized: bool,
     ) -> None:
-
         try:
             import bitblas
+
             if version.parse(bitblas.__version__) < version.parse(
-                    MINIMUM_BITBLAS_VERSION):
+                MINIMUM_BITBLAS_VERSION
+            ):
                 raise ImportError(
                     "bitblas version is wrong. Please "
-                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
+                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
+                )
         except ImportError as e:
             bitblas_import_exception = e
             raise ValueError(
@@ -96,17 +111,20 @@ class GPTQBitBLASConfig(QuantizationConfig):
             raise ValueError(
                 f"BitBLAS does not support weight_bits = {self.weight_bits}. "
                 f"Only weight_bits = {GPTQ_BITBLAS_SUPPORTED_NUM_BITS} "
-                "are supported.")
+                "are supported."
+            )
 
         if self.is_sym not in GPTQ_BITBLAS_SUPPORTED_SYM:
             raise ValueError(
                 f"BitBLAS does not support is_sym = {self.is_sym}. "
-                f"Only sym = {GPTQ_BITBLAS_SUPPORTED_SYM} are supported.")
+                f"Only sym = {GPTQ_BITBLAS_SUPPORTED_SYM} are supported."
+            )
 
         self.storage_dtype = self.GPTQ_BITBLAS_STORAGE_DTYPE
 
-        storage_nbit = int("".join(c for c in self.GPTQ_CKPT_STORAGE_DTYPE
-                                   if c.isdigit()))
+        storage_nbit = int(
+            "".join(c for c in self.GPTQ_CKPT_STORAGE_DTYPE if c.isdigit())
+        )
 
         # 4 Bits packed into 32 bit datatype.
         self.pack_factor = storage_nbit // weight_bits
@@ -116,17 +134,20 @@ class GPTQBitBLASConfig(QuantizationConfig):
         self.zeros_mode = self.ZEROS_MODE
 
         if (weight_bits, is_sym) not in self.TYPE_MAP:
-            raise ValueError("Unsupported quantization config: "
-                             f"bits={weight_bits}, sym={is_sym}")
+            raise ValueError(
+                f"Unsupported quantization config: bits={weight_bits}, sym={is_sym}"
+            )
 
         self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
 
     def __repr__(self) -> str:
-        return (f"GPTQBitBLASConfig(weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size}, "
-                f"desc_act={self.desc_act})"
-                f"is_sym={self.is_sym}, "
-                f"quant_method={self.quant_method})")
+        return (
+            f"GPTQBitBLASConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act})"
+            f"is_sym={self.is_sym}, "
+            f"quant_method={self.quant_method})"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -151,36 +172,46 @@ class GPTQBitBLASConfig(QuantizationConfig):
         desc_act = cls.get_from_keys(config, ["desc_act"])
         is_sym = cls.get_from_keys(config, ["sym"])
         quant_method = cls.get_from_keys(config, ["quant_method"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
-        return cls(weight_bits, group_size, desc_act, is_sym, quant_method,
-                   lm_head_quantized)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(
+            weight_bits, group_size, desc_act, is_sym, quant_method, lm_head_quantized
+        )
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_gptq_bitblas_compatible(hf_quant_cfg)
 
-        is_valid_user_quant = (user_quant is None or user_quant == "bitblas"
-                               or user_quant == "gptq_bitblas")
+        is_valid_user_quant = (
+            user_quant is None
+            or user_quant == "bitblas"
+            or user_quant == "gptq_bitblas"
+        )
 
         if can_convert and is_valid_user_quant:
-            msg = ("The model is convertible to {} during runtime."
-                   " Using {} kernel.".format(cls.get_name(), cls.get_name()))
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
             logger.info(msg)
             return cls.get_name()
 
         if can_convert and user_quant == "gptq":
-            logger.info("Detected that the model can run with gptq_bitblas"
-                        ", however you specified quantization=gptq explicitly,"
-                        " so forcing gptq. Use quantization=gptq_bitblas for"
-                        " faster inference")
+            logger.info(
+                "Detected that the model can run with gptq_bitblas"
+                ", however you specified quantization=gptq explicitly,"
+                " so forcing gptq. Use quantization=gptq_bitblas for"
+                " faster inference"
+            )
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["GPTQBitBLASLinearMethod"]:
-        if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
-                                             and self.lm_head_quantized):
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["GPTQBitBLASLinearMethod"]:
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
             return GPTQBitBLASLinearMethod(self)
         return None
 
@@ -201,8 +232,7 @@ class GPTQBitBLASConfig(QuantizationConfig):
             return False
 
         # If we cannot find the info needed in the config, cannot convert.
-        if (num_bits is None or group_size is None or sym is None
-                or desc_act is None):
+        if num_bits is None or group_size is None or sym is None or desc_act is None:
             return False
 
         if (num_bits, sym) not in cls.TYPE_MAP:
@@ -215,9 +245,9 @@ class GPTQBitBLASConfig(QuantizationConfig):
             return False
 
         # Otherwise, can convert if model satisfies bitblas constraints.
-        return check_bitblas_supported(quant_type=cls.TYPE_MAP[(num_bits,
-                                                                sym)],
-                                       group_size=group_size)
+        return check_bitblas_supported(
+            quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
+        )
 
 
 class GPTQBitBLASLinearMethod(LinearMethodBase):
@@ -233,8 +263,10 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: GPTQBitBLASConfig) -> None:
         self.quant_config = quant_config
         # Verify supported on platform.
-        verify_bitblas_supported(quant_type=self.quant_config.quant_type,
-                                 group_size=self.quant_config.group_size)
+        verify_bitblas_supported(
+            quant_type=self.quant_config.quant_type,
+            group_size=self.quant_config.group_size,
+        )
 
     def create_weights(
         self,
@@ -248,7 +280,7 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
     ) -> None:
         """Creates quantized weights for use in linear operations.
 
-        The function initializes and returns a dictionary containing 
+        The function initializes and returns a dictionary containing
         quantized weights, scales, and zeros
         for performing quantized matrix multiplication operations.
 
@@ -257,11 +289,11 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
             output_partition_sizes: The size of the output partition.
             input_size: The total size of the input (unused).
             output_size: The total size of the output (unused).
-            params_dtype: 
+            params_dtype:
                 The data type of the parameters (expected to be torch.float16).
 
         Returns:
-            A dictionary containing the quantized weights ('qweight'), 
+            A dictionary containing the quantized weights ('qweight'),
             scales ('scales'), and zeros ('zeros').
 
         Raises:
@@ -270,8 +302,9 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
                 in `quant_config`.
         """
         if params_dtype != torch.float16:
-            raise ValueError("Parameter data type must be torch.float16, "
-                             f"but got {params_dtype}")
+            raise ValueError(
+                f"Parameter data type must be torch.float16, but got {params_dtype}"
+            )
 
         # Normalize group_size
         if self.quant_config.group_size != -1:
@@ -294,18 +327,19 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
 
         mp_linear_kernel_config = MPLinearLayerConfig(
             full_weight_shape=(input_size, output_size),
-            partition_weight_shape=\
-                (input_size_per_partition, output_size_per_partition),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
             weight_type=self.quant_config.quant_type,
             act_type=params_dtype,
             group_size=self.quant_config.group_size,
             zero_points=False,
-            has_g_idx=self.quant_config.desc_act
+            has_g_idx=self.quant_config.desc_act,
         )
 
         if kernel_type.__name__ not in self._kernel_backends_being_used:
-            logger.info("Using %s for GPTQBitBLASLinearMethod",
-                        kernel_type.__name__)
+            logger.info("Using %s for GPTQBitBLASLinearMethod", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # Normalize group_size
@@ -315,9 +349,9 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
             group_size = input_size
 
         # Determine sharding
-        if bitblas_repeat_scales_on_all_ranks(self.quant_config.desc_act,
-                                              self.quant_config.group_size,
-                                              is_row_parallel):
+        if bitblas_repeat_scales_on_all_ranks(
+            self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel
+        ):
             # By setting scale_dim == None, weight_loader will
             # repeat the scales on each GPU in TP>1 case.
             scales_and_zp_input_dim = None
@@ -340,16 +374,19 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=0,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
         # Activation order
         # Ignore warning from fused linear layers such as QKVParallelLinear.
-        g_idx = RowvLLMParameter(data=torch.empty(
-            input_size_per_partition,
-            dtype=torch.int32,
-        ),
-                                 input_dim=0,
-                                 weight_loader=weight_loader)
+        g_idx = RowvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
 
         # Scales
         scales = Parameter(
@@ -371,45 +408,42 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
 
         # Quantized zero-points
         qzeros_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
         weight_scale_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 scales_and_zp_size,
                 output_size_per_partition,
                 dtype=params_dtype,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
 
         if scales_and_zp_input_dim is None:
-            scales = ChannelQuantScaleParameter(output_dim=1,
-                                                **weight_scale_args)
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
             qzeros = PackedColumnParameter(
                 output_dim=1,
                 packed_dim=1,
                 packed_factor=self.quant_config.pack_factor,
-                **qzeros_args)
+                **qzeros_args,
+            )
 
         else:
-            scales = GroupQuantScaleParameter(output_dim=1,
-                                              input_dim=0,
-                                              **weight_scale_args)
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
             qzeros = PackedvLLMParameter(
                 input_dim=0,
                 output_dim=1,
                 packed_dim=1,
                 packed_factor=self.quant_config.pack_factor,
-                **qzeros_args)
+                **qzeros_args,
+            )
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("g_idx", g_idx)
@@ -440,7 +474,7 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         out = self.kernel.apply_gptq_bitblas_linear(layer, x)
         if bias is not None:
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 967e46c24378ea58a62598359b924c35081a57ec..68a122fd46c6bdf31a1da2b7d86b0aeb0a4e07e1 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -10,40 +11,59 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
-    UnquantizedFusedMoEMethod)
-from vllm.model_executor.layers.linear import (LinearMethodBase,
-                                               set_weight_attrs)
+    FusedMoE,
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+    UnquantizedFusedMoEMethod,
+)
+from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
-    MPLinearLayerConfig, choose_mp_linear_kernel)
+    MPLinearLayerConfig,
+    choose_mp_linear_kernel,
+)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
-    get_dynamic_override, get_linear_quant_method, override_config)
+    get_dynamic_override,
+    get_linear_quant_method,
+    override_config,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    check_marlin_supported, check_moe_marlin_supports_layer,
-    marlin_make_workspace_new, marlin_moe_permute_scales, marlin_permute_bias,
-    marlin_repeat_scales_on_all_ranks, verify_marlin_supported)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedColumnParameter,
-                                           PackedvLLMParameter,
-                                           RowvLLMParameter)
+    check_marlin_supported,
+    check_moe_marlin_supports_layer,
+    marlin_make_workspace_new,
+    marlin_moe_permute_scales,
+    marlin_permute_bias,
+    marlin_repeat_scales_on_all_ranks,
+    verify_marlin_supported,
+)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    RowvLLMParameter,
+)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
 
 logger = init_logger(__name__)
 
 
 def get_moe_quant_method(
-    config: QuantizationConfig,
+    config: "GPTQMarlinConfig",
     layer: torch.nn.Module,
     prefix: str,
     moe_method_cls: type,
@@ -52,9 +72,13 @@ def get_moe_quant_method(
 
     if isinstance(layer, FusedMoE):
         # False = skip module, None = no override, else = Positive match
-        if get_dynamic_override(  # noqa: E712
+        if (
+            get_dynamic_override(  # noqa: E712
                 cloned_config,  # noqa: E712
-                layer_name=prefix) == False:  # noqa: E712
+                layer_name=prefix,
+            )
+            == False
+        ):  # noqa: E712
             return UnquantizedFusedMoEMethod(layer.moe_config)
 
         if prefix:
@@ -75,15 +99,16 @@ class GPTQMarlinConfig(QuantizationConfig):
     }
 
     def __init__(
-            self,
-            weight_bits: int,
-            group_size: int,
-            desc_act: bool,
-            is_sym: bool,
-            lm_head_quantized: bool,
-            dynamic: dict[str, dict[str, Union[int, bool]]],
-            full_config: dict[str, Any],
-            modules_in_block_to_quantize: Optional[list[str]] = None) -> None:
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+        dynamic: dict[str, dict[str, int | bool]],
+        full_config: dict[str, Any],
+        modules_in_block_to_quantize: list[str] | None = None,
+    ) -> None:
         super().__init__()
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
@@ -125,8 +150,9 @@ class GPTQMarlinConfig(QuantizationConfig):
         self.full_config = full_config
 
         if (weight_bits, is_sym) not in self.TYPE_MAP:
-            raise ValueError("Unsupported quantization config: "
-                             f"bits={weight_bits}, sym={is_sym}")
+            raise ValueError(
+                f"Unsupported quantization config: bits={weight_bits}, sym={is_sym}"
+            )
 
         self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
 
@@ -169,50 +195,64 @@ class GPTQMarlinConfig(QuantizationConfig):
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
         is_sym = cls.get_from_keys(config, ["sym"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
         modules_in_block_to_quantize = cls.get_from_keys_or(
-            config, ["modules_in_block_to_quantize"], default=None)
-        return cls(weight_bits, group_size, desc_act, is_sym,
-                   lm_head_quantized, dynamic, config,
-                   modules_in_block_to_quantize)
+            config, ["modules_in_block_to_quantize"], default=None
+        )
+        return cls(
+            weight_bits,
+            group_size,
+            desc_act,
+            is_sym,
+            lm_head_quantized,
+            dynamic,
+            config,
+            modules_in_block_to_quantize,
+        )
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)
 
-        is_valid_user_quant = (user_quant is None or user_quant == "marlin"
-                               or user_quant == "gptq_marlin")
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin"
+        )
 
         if can_convert and is_valid_user_quant:
-            msg = ("The model is convertible to {} during runtime."
-                   " Using {} kernel.".format(cls.get_name(), cls.get_name()))
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
             logger.info(msg)
             return cls.get_name()
 
         if can_convert and user_quant == "gptq":
-            logger.info("Detected that the model can run with gptq_marlin"
-                        ", however you specified quantization=gptq explicitly,"
-                        " so forcing gptq. Use quantization=gptq_marlin for"
-                        " faster inference")
+            logger.info(
+                "Detected that the model can run with gptq_marlin"
+                ", however you specified quantization=gptq explicitly,"
+                " so forcing gptq. Use quantization=gptq_marlin for"
+                " faster inference"
+            )
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, FusedMoE):
-            from vllm.model_executor.layers.quantization.moe_wna16 import (
-                MoeWNA16Config)
+            from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config
+
             if not check_moe_marlin_supports_layer(layer, self.group_size):
                 logger.warning_once(
                     f"Layer '{prefix}' is not supported by GPTQMoeMarlin. "
-                    "Falling back to Moe WNA16 kernels.")
-                return MoeWNA16Config.from_config(
-                    self.full_config).get_quant_method(layer, prefix)
-            return get_moe_quant_method(self, layer, prefix,
-                                        GPTQMarlinMoEMethod)
-        return get_linear_quant_method(self, layer, prefix,
-                                       GPTQMarlinLinearMethod)
+                    "Falling back to Moe WNA16 kernels."
+                )
+                return MoeWNA16Config.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            return get_moe_quant_method(self, layer, prefix, GPTQMarlinMoEMethod)
+        return get_linear_quant_method(self, layer, prefix, GPTQMarlinLinearMethod)
 
     @classmethod
     def is_gptq_marlin_compatible(cls, quant_config: dict[str, Any]):
@@ -229,41 +269,40 @@ class GPTQMarlinConfig(QuantizationConfig):
             return False
 
         # Marlin conversion is only valid if required properties are found
-        if (num_bits is None or group_size is None or sym is None
-                or desc_act is None):
+        if num_bits is None or group_size is None or sym is None or desc_act is None:
             return False
 
         if (num_bits, sym) not in cls.TYPE_MAP:
             return False
 
-        return check_marlin_supported(quant_type=cls.TYPE_MAP[(num_bits, sym)],
-                                      group_size=group_size)
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
+        )
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper):
         if self.modules_in_block_to_quantize is not None:
             self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list(
-                self.modules_in_block_to_quantize)
+                self.modules_in_block_to_quantize
+            )
 
-    def maybe_update_config(self,
-                            model_name: str,
-                            revision: Optional[str] = None):
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
         if self.modules_in_block_to_quantize:
             if is_list_of(self.modules_in_block_to_quantize, list):
                 # original modules_in_block_to_quantize: list[list[str]]
                 # flatten original modules_in_block_to_quantize
                 self.modules_in_block_to_quantize = [
-                    item for sublist in self.modules_in_block_to_quantize
+                    item
+                    for sublist in self.modules_in_block_to_quantize
                     for item in sublist
                 ]
             return
 
         unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-        metadata = get_safetensors_params_metadata(model_name,
-                                                   revision=revision)
+        metadata = get_safetensors_params_metadata(model_name, revision=revision)
         quant_layers: set[str] = {
             param_name.rsplit(".", 1)[0]
             for param_name, info in metadata.items()
-            if (dtype := info.get('dtype', None))
+            if (dtype := info.get("dtype", None))
             and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
         }
         self.modules_in_block_to_quantize = list(quant_layers)
@@ -282,8 +321,10 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         self.quant_config = quant_config
 
         # Verify supported on platform.
-        verify_marlin_supported(quant_type=self.quant_config.quant_type,
-                                group_size=self.quant_config.group_size)
+        verify_marlin_supported(
+            quant_type=self.quant_config.quant_type,
+            group_size=self.quant_config.group_size,
+        )
 
     def create_weights(
         self,
@@ -301,20 +342,21 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
 
         mp_linear_kernel_config = MPLinearLayerConfig(
             full_weight_shape=(input_size, output_size),
-            partition_weight_shape=\
-                (input_size_per_partition, output_size_per_partition),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
             weight_type=self.quant_config.quant_type,
             act_type=params_dtype,
             group_size=self.quant_config.group_size,
             zero_points=False,
-            has_g_idx=self.quant_config.desc_act
+            has_g_idx=self.quant_config.desc_act,
         )
 
         kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config)
 
         if kernel_type.__name__ not in self._kernel_backends_being_used:
-            logger.info("Using %s for GPTQMarlinLinearMethod",
-                        kernel_type.__name__)
+            logger.info("Using %s for GPTQMarlinLinearMethod", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # Normalize group_size
@@ -324,9 +366,9 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
             group_size = input_size
 
         # Determine sharding
-        if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act,
-                                             self.quant_config.group_size,
-                                             is_row_parallel):
+        if marlin_repeat_scales_on_all_ranks(
+            self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel
+        ):
             # By setting scale_dim == None, weight_loader will
             # repeat the scales on each GPU in TP>1 case.
             scales_and_zp_input_dim = None
@@ -348,67 +390,69 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
             output_dim=1,
             packed_dim=0,
             packed_factor=self.quant_config.pack_factor,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
         # Activation order
-        g_idx = RowvLLMParameter(data=torch.empty(
-            input_size_per_partition,
-            dtype=torch.int32,
-        ),
-                                 input_dim=0,
-                                 weight_loader=weight_loader)
+        g_idx = RowvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
 
         qzeros_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
         weight_scale_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 scales_and_zp_size,
                 output_size_per_partition,
                 dtype=params_dtype,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
 
         if scales_and_zp_input_dim is None:
-            scales = ChannelQuantScaleParameter(output_dim=1,
-                                                **weight_scale_args)
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
             qzeros = PackedColumnParameter(
                 output_dim=1,
                 packed_dim=1,
                 packed_factor=self.quant_config.pack_factor,
-                **qzeros_args)
+                **qzeros_args,
+            )
 
         else:
-            scales = GroupQuantScaleParameter(output_dim=1,
-                                              input_dim=0,
-                                              **weight_scale_args)
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
             qzeros = PackedvLLMParameter(
                 input_dim=0,
                 output_dim=1,
                 packed_dim=1,
                 packed_factor=self.quant_config.pack_factor,
-                **qzeros_args)
+                **qzeros_args,
+            )
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("g_idx", g_idx)
         layer.register_parameter("scales", scales)
         layer.register_parameter("qzeros", qzeros)
 
-        self.kernel = kernel_type(mp_linear_kernel_config,
-                                  w_q_param_name="qweight",
-                                  w_s_param_name="scales",
-                                  w_zp_param_name="qzeros",
-                                  w_gidx_param_name="g_idx")
+        self.kernel = kernel_type(
+            mp_linear_kernel_config,
+            w_q_param_name="qweight",
+            w_s_param_name="scales",
+            w_zp_param_name="qzeros",
+            w_gidx_param_name="g_idx",
+        )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.kernel.process_weights_after_loading(layer)
@@ -417,7 +461,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
 
@@ -437,8 +481,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         elif self.quant_config.quant_type.size_bits == 8:
             self.quant_type = scalar_types.uint8b128
         else:
-            raise ValueError(
-                "GPTQMarlinMoEMethod only supports int4 and int8 now.")
+            raise ValueError("GPTQMarlinMoEMethod only supports int4 and int8 now.")
+        self.use_marlin = True
 
     def create_weights(
         self,
@@ -449,28 +493,27 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
-        intermediate_size_full = extra_weight_attrs.pop(
-            "intermediate_size_full")
+        intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full")
 
         self.is_k_full = (not self.quant_config.desc_act) or (
-            intermediate_size_per_partition == intermediate_size_full)
+            intermediate_size_per_partition == intermediate_size_full
+        )
 
         if self.quant_config.group_size != -1:
             scales_size13 = hidden_size // self.quant_config.group_size
-            w2_scales_size = (intermediate_size_full
-                              if self.quant_config.desc_act else
-                              intermediate_size_per_partition)
-            scales_size2 = (w2_scales_size // self.quant_config.group_size)
+            w2_scales_size = (
+                intermediate_size_full
+                if self.quant_config.desc_act
+                else intermediate_size_per_partition
+            )
+            scales_size2 = w2_scales_size // self.quant_config.group_size
             strategy = FusedMoeWeightScaleSupported.GROUP.value
         else:
             scales_size13 = 1
             scales_size2 = 1
             strategy = FusedMoeWeightScaleSupported.CHANNEL.value
 
-        extra_weight_attrs.update({
-            "quant_method": strategy,
-            "is_transposed": True
-        })
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": True})
         # Fused gate_up_proj (column parallel)
         w13_qweight = torch.nn.Parameter(
             torch.empty(
@@ -487,8 +530,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         w2_qweight = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                intermediate_size_per_partition //
-                self.quant_config.pack_factor,
+                intermediate_size_per_partition // self.quant_config.pack_factor,
                 hidden_size,
                 dtype=torch.int32,
             ),
@@ -498,51 +540,51 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         set_weight_attrs(w2_qweight, extra_weight_attrs)
         # up_proj scales
         w13_scales = torch.nn.Parameter(
-            torch.empty(num_experts,
-                        scales_size13,
-                        2 * intermediate_size_per_partition,
-                        dtype=params_dtype),
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
             requires_grad=False,
         )
         layer.register_parameter("w13_scales", w13_scales)
         set_weight_attrs(w13_scales, extra_weight_attrs)
         # down_proj scales
         w2_scales = torch.nn.Parameter(
-            torch.empty(num_experts,
-                        scales_size2,
-                        hidden_size,
-                        dtype=params_dtype),
+            torch.empty(num_experts, scales_size2, hidden_size, dtype=params_dtype),
             requires_grad=False,
         )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
         # don't shard the w2 scales when running act order
-        set_weight_attrs(w2_scales,
-                         {"load_full_w2": self.quant_config.desc_act})
+        set_weight_attrs(w2_scales, {"load_full_w2": self.quant_config.desc_act})
         # up_proj scales
         w13_qzeros = torch.nn.Parameter(
-            torch.empty(num_experts,
-                        scales_size13,
-                        2 * intermediate_size_per_partition //
-                        self.quant_config.pack_factor,
-                        dtype=params_dtype),
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
             requires_grad=False,
         )
         layer.register_parameter("w13_qzeros", w13_qzeros)
         set_weight_attrs(w13_qzeros, extra_weight_attrs)
         # down_proj scales
         w2_qzeros = torch.nn.Parameter(
-            torch.empty(num_experts,
-                        scales_size2,
-                        hidden_size // self.quant_config.pack_factor,
-                        dtype=params_dtype),
+            torch.empty(
+                num_experts,
+                scales_size2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
             requires_grad=False,
         )
         layer.register_parameter("w2_qzeros", w2_qzeros)
         set_weight_attrs(w2_qzeros, extra_weight_attrs)
         # don't shard the w2 scales when running act order
-        set_weight_attrs(w2_qzeros,
-                         {"load_full_w2": self.quant_config.desc_act})
+        set_weight_attrs(w2_qzeros, {"load_full_w2": self.quant_config.desc_act})
         w13_g_idx = torch.nn.Parameter(
             torch.empty(
                 num_experts,
@@ -571,8 +613,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w13_g_idx_sort_indices",
-                                 w13_g_idx_sort_indices)
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
         set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
         w2_g_idx_sort_indices = torch.nn.Parameter(
             torch.empty(
@@ -582,15 +623,13 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w2_g_idx_sort_indices",
-                                 w2_g_idx_sort_indices)
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
         set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
 
         device = layer.w13_qweight.device
         layer.workspace = marlin_make_workspace_new(device, 4)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-
         # Process act_order
         if self.quant_config.desc_act:
             # Get sorting based on g_idx
@@ -600,42 +639,36 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
             w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
             for e in range(num_experts):
-                w13_g_idx_sort_indices[e] = torch.argsort(
-                    layer.w13_g_idx[e]).to(torch.int32)
+                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to(
+                    torch.int32
+                )
                 w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
-                    torch.int32)
-                w13_sorted_g_idx[e] = layer.w13_g_idx[e][
-                    w13_g_idx_sort_indices[e]]
-                w2_sorted_g_idx[e] = layer.w2_g_idx[e][
-                    w2_g_idx_sort_indices[e]]
+                    torch.int32
+                )
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]]
             replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx)
             replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx)
-            replace_parameter(layer, "w13_g_idx_sort_indices",
-                              w13_g_idx_sort_indices)
-            replace_parameter(layer, "w2_g_idx_sort_indices",
-                              w2_g_idx_sort_indices)
+            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
         else:
             # Reset g_idx related tensors
             num_experts = layer.w13_g_idx.shape[0]
             device = layer.w13_g_idx.device
             layer.w13_g_idx = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
             layer.w2_g_idx = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
             layer.w13_g_idx_sort_indices = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
             layer.w2_g_idx_sort_indices = torch.nn.Parameter(
-                torch.empty((num_experts, 0), dtype=torch.int32,
-                            device=device),
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                 requires_grad=False,
             )
         # Repack weights
@@ -665,9 +698,12 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         replace_parameter(layer, "w13_scales", marlin_w13_scales)
         marlin_w2_scales = marlin_moe_permute_scales(
             s=layer.w2_scales,
-            size_k=layer.w2_scales.shape[1] *
-            (self.quant_config.group_size if self.quant_config.group_size != -1
-             else self.quant_config.pack_factor),
+            size_k=layer.w2_scales.shape[1]
+            * (
+                self.quant_config.group_size
+                if self.quant_config.group_size != -1
+                else self.quant_config.pack_factor
+            ),
             size_n=layer.w2_scales.shape[2],
             group_size=self.quant_config.group_size,
         )
@@ -680,7 +716,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             layer.w2_bias.data = marlin_permute_bias(layer.w2_bias)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -691,26 +728,25 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `GPTQMarlinMoEMethod` yet.")
+                "EPLB not supported for `GPTQMarlinMoEMethod` yet."
+            )
 
         assert activation == "silu", "Only SiLU activation is supported."
 
@@ -726,9 +762,10 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_qweight,
             layer.w2_qweight,
@@ -748,4 +785,5 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             sort_indices1=layer.w13_g_idx_sort_indices,
             sort_indices2=layer.w2_g_idx_sort_indices,
             workspace=layer.workspace,
-            is_k_full=self.is_k_full)
+            is_k_full=self.is_k_full,
+        )
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
index eba917d8541181919cd1fb9e51550de530ccfe3d..2fb614b4746ea4f786a56a9f86db5ef068be226a 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
@@ -9,13 +9,16 @@ from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -25,15 +28,12 @@ GPTQ_MARLIN_24_MIN_THREAD_N = 128
 GPTQ_MARLIN_24_MIN_THREAD_K = 128
 GPTQ_MARLIN_24_MAX_PARALLEL = 64
 
-GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES = [
-    scalar_types.uint4b8, scalar_types.uint8b128
-]
+GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
 GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]
 
 
 class GPTQMarlin24Config(QuantizationConfig):
-    """Config class for Marlin24.
-    """
+    """Config class for Marlin24."""
 
     def __init__(
         self,
@@ -49,17 +49,18 @@ class GPTQMarlin24Config(QuantizationConfig):
         self.group_size = group_size
 
         # Verify
-        if quant_type is None or \
-            quant_type not in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES:
+        if quant_type is None or quant_type not in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES:
             raise ValueError(
                 f"Marlin_24 does not support quant_type = {quant_type}. "
                 f"Only weight_bits = {GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES} "
-                "are supported.")
+                "are supported."
+            )
         if self.group_size not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES:
             raise ValueError(
                 f"Marlin_24 does not support group_size = {self.group_size}. "
                 f"Only group_sizes = {GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES} "
-                "are supported.")
+                "are supported."
+            )
 
         self.quant_type = quant_type
 
@@ -84,7 +85,8 @@ class GPTQMarlin24Config(QuantizationConfig):
 
     def __repr__(self) -> str:
         return "Marlin24Config(quant_type={}, group_size={})".format(
-            self.quant_type, self.group_size)
+            self.quant_type, self.group_size
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -111,23 +113,26 @@ class GPTQMarlin24Config(QuantizationConfig):
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
-        is_marlin_24_format = (
-            hf_quant_cfg.get("checkpoint_format") == "marlin_24")
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
+        is_marlin_24_format = hf_quant_cfg.get("checkpoint_format") == "marlin_24"
 
-        is_valid_user_quant = (user_quant is None or user_quant == "gptq"
-                               or user_quant == "gptq_marlin_24")
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "gptq" or user_quant == "gptq_marlin_24"
+        )
 
         if is_marlin_24_format and is_valid_user_quant:
-            msg = ("The model is serialized in {} format. "
-                   "Using {} kernel.".format(cls.get_name(), cls.get_name()))
+            msg = "The model is serialized in {} format. Using {} kernel.".format(
+                cls.get_name(), cls.get_name()
+            )
             logger.info(msg)
             return cls.get_name()
 
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["GPTQMarlin24LinearMethod"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["GPTQMarlin24LinearMethod"]:
         if isinstance(layer, LinearBase):
             return GPTQMarlin24LinearMethod(self)
         return None
@@ -157,7 +162,8 @@ class GPTQMarlin24LinearMethod(LinearMethodBase):
         weight_loader = extra_weight_attrs["weight_loader"]
         if params_dtype != torch.float16:
             raise ValueError(
-                f"The params dtype must be float16, but got {params_dtype}")
+                f"The params dtype must be float16, but got {params_dtype}"
+            )
 
         # Validate output_size_per_partition
         output_size_per_partition = sum(output_partition_sizes)
@@ -165,38 +171,46 @@ class GPTQMarlin24LinearMethod(LinearMethodBase):
             raise ValueError(
                 f"Weight output_size_per_partition = "
                 f"{output_size_per_partition} is not divisible by "
-                f"min_n_threads = {self.quant_config.min_n_threads}.")
+                f"min_n_threads = {self.quant_config.min_n_threads}."
+            )
         if output_size_per_partition % self.quant_config.pack_factor != 0:
             raise ValueError(
                 f"Weight output_size_per_partition = "
                 f"{output_size_per_partition} is not divisible by "
-                f"pack_factor = {self.quant_config.pack_factor}.")
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
 
         # Validate input_size_per_partition
         if input_size_per_partition % self.quant_config.min_k_threads != 0:
             raise ValueError(
                 f"Weight input_size_per_partition = "
                 f"{input_size_per_partition} is not divisible by "
-                f"min_k_threads = {self.quant_config.min_k_threads}.")
-        if (self.quant_config.group_size != -1 and
-                input_size_per_partition % self.quant_config.group_size != 0):
-            raise ValueError(f"Weight input_size_per_partition = "
-                             f"{input_size_per_partition} is not divisible by "
-                             f"group_size = {self.quant_config.group_size}.")
+                f"min_k_threads = {self.quant_config.min_k_threads}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
 
         # Check that we have at least 4 tiles horizontally in the shard
         num_tiles_per_perm = self.quant_config.perm_len // (
-            self.quant_config.tile_size**2)
+            self.quant_config.tile_size**2
+        )
         if output_size_per_partition % num_tiles_per_perm != 0:
-            raise ValueError(
-                "Each permutation group must reside on the same gpu")
+            raise ValueError("Each permutation group must reside on the same gpu")
 
         # Quantized 4Bit weights packed into Int32.
         qweight = PackedvLLMParameter(
             data=torch.empty(
                 input_size_per_partition // self.quant_config.tile_size // 2,
-                output_size_per_partition * self.quant_config.tile_size //
-                self.quant_config.pack_factor,
+                output_size_per_partition
+                * self.quant_config.tile_size
+                // self.quant_config.pack_factor,
                 device="cuda",
                 dtype=torch.int32,
             ),
@@ -205,55 +219,57 @@ class GPTQMarlin24LinearMethod(LinearMethodBase):
             packed_dim=1,
             packed_factor=self.quant_config.pack_factor,
             marlin_tile_size=self.quant_config.tile_size,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
         # Meta
-        meta = PackedvLLMParameter(data=torch.empty(
-            input_size_per_partition // 8 // 2 // 2,
-            output_size_per_partition * 2,
-            device="cuda",
-            dtype=torch.int16,
-        ),
-                                   input_dim=0,
-                                   output_dim=1,
-                                   packed_dim=1,
-                                   packed_factor=1,
-                                   marlin_tile_size=2,
-                                   weight_loader=weight_loader)
+        meta = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // 8 // 2 // 2,
+                output_size_per_partition * 2,
+                device="cuda",
+                dtype=torch.int16,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=1,
+            marlin_tile_size=2,
+            weight_loader=weight_loader,
+        )
 
         # Determine if channelwise or not
-        input_groups = (1 if self.quant_config.group_size == -1 else
-                        input_size_per_partition //
-                        self.quant_config.group_size)
+        input_groups = (
+            1
+            if self.quant_config.group_size == -1
+            else input_size_per_partition // self.quant_config.group_size
+        )
 
         weight_scale_args = {
-            "data":
-            torch.empty(
+            "data": torch.empty(
                 input_groups,
                 output_size_per_partition,
                 device="cuda",
                 dtype=params_dtype,
             ),
-            "weight_loader":
-            weight_loader
+            "weight_loader": weight_loader,
         }
         if input_groups == 1:
-            scales = ChannelQuantScaleParameter(output_dim=1,
-                                                **weight_scale_args)
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
         else:
-            scales = GroupQuantScaleParameter(output_dim=1,
-                                              input_dim=0,
-                                              **weight_scale_args)
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
 
         # Allocate workspace (Used for internal locking mechanism)
         max_workspace_size = (
-            output_size_per_partition //
-            self.quant_config.min_n_threads) * self.quant_config.max_parallel
+            output_size_per_partition // self.quant_config.min_n_threads
+        ) * self.quant_config.max_parallel
 
-        workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size,
-                                                       device="cuda",
-                                                       dtype=torch.int),
-                                      weight_loader=weight_loader)
+        workspace = BasevLLMParameter(
+            data=torch.zeros(max_workspace_size, device="cuda", dtype=torch.int),
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("B_24", qweight)
         layer.register_parameter("B_meta", meta)
@@ -271,7 +287,7 @@ class GPTQMarlin24LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         qweight = layer.B_24
         meta = layer.B_meta
@@ -284,12 +300,19 @@ class GPTQMarlin24LinearMethod(LinearMethodBase):
         size_k = x_2d.shape[1]
         size_n = scales.shape[1]
 
-        output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales,
-                                            workspace,
-                                            self.quant_config.quant_type,
-                                            size_m, size_n, size_k)
-
-        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+        output_2d = ops.gptq_marlin_24_gemm(
+            x_2d,
+            qweight,
+            meta,
+            scales,
+            workspace,
+            self.quant_config.quant_type,
+            size_m,
+            size_n,
+            size_k,
+        )
+
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],))
 
         if bias is not None:
             output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py
index 8385ccac32a28608b398c8b48cac7d920c3271c7..5fb67c35378bea3a6aa431caa24d55ae08af37f4 100644
--- a/vllm/model_executor/layers/quantization/hqq_marlin.py
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -7,20 +7,32 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    marlin_make_empty_g_idx, marlin_permute_bias, marlin_permute_scales)
+    GPTQ_MARLIN_MAX_PARALLEL,
+    GPTQ_MARLIN_MIN_THREAD_N,
+    marlin_make_empty_g_idx,
+    marlin_permute_bias,
+    marlin_permute_scales,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
-    MarlinWorkspace)
+    MarlinWorkspace,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -33,13 +45,13 @@ class HQQMarlinConfig(QuantizationConfig):
         self,
         weight_bits: int,
         group_size: int,
-        skip_modules: Optional[list[str]] = None,
+        skip_modules: list[str] | None = None,
     ) -> None:
         super().__init__()
-        assert group_size == 64, ("The only supported HQQ group size is "
-                                  "currently 64.")
-        assert weight_bits == 4, ("The only supported HQQ quantization "
-                                  "bitsize is currently 4.")
+        assert group_size == 64, "The only supported HQQ group size is currently 64."
+        assert weight_bits == 4, (
+            "The only supported HQQ quantization bitsize is currently 4."
+        )
 
         self.weight_bits = weight_bits
         self.group_size = group_size
@@ -48,8 +60,10 @@ class HQQMarlinConfig(QuantizationConfig):
         self.skip_modules = skip_modules
 
     def __repr__(self) -> str:
-        return (f"HQQMarlinConfig(quant_type={self.quant_type}, "
-                f"group_size={self.group_size})")
+        return (
+            f"HQQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size})"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -69,7 +83,7 @@ class HQQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "HQQMarlinConfig":
-        wq_params = (config["quant_config"]["weight_quant_params"])
+        wq_params = config["quant_config"]["weight_quant_params"]
         weight_bits = cls.get_from_keys(wq_params, ["nbits"])
         group_size = cls.get_from_keys(wq_params, ["group_size"])
         skip_modules = config["skip_modules"]
@@ -77,14 +91,16 @@ class HQQMarlinConfig(QuantizationConfig):
 
     def is_layer_skipped(self, prefix: str) -> bool:
         # Split the prefix into its dot-separated components
-        components = prefix.split('.')
+        components = prefix.split(".")
 
         # Check if any of the skip modules exactly matches any component
         return self.skip_modules is not None and any(
-            module_name in components for module_name in self.skip_modules)
+            module_name in components for module_name in self.skip_modules
+        )
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             if self.is_layer_skipped(prefix):
                 return UnquantizedLinearMethod()
@@ -94,7 +110,6 @@ class HQQMarlinConfig(QuantizationConfig):
 
 # Empty HQQ parameter, will be ignored during loading
 class HQQEmptyParameter(BasevLLMParameter):
-
     def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
         pass
 
@@ -112,23 +127,18 @@ def error_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
 # HQQ packing creates issues with sharding - therefore, prior to loading, we
 # repack to GPTQ. We also reshape the weights to their proper GPTQ shape.
 class HQQweightParameter(PackedvLLMParameter):
-
     # unpack function from https://github.com/mobiusml/hqq
-    def unpack_4bit_u8(self,
-                       W_q: torch.Tensor) -> torch.Tensor:  # uint8/2 > uint8
+    def unpack_4bit_u8(self, W_q: torch.Tensor) -> torch.Tensor:  # uint8/2 > uint8
         assert self.weight_bits == 4, "Unsupported quant bitsize (must be 4)"
 
         dtype = torch.uint8
         step = W_q.shape[0]
-        tmp = torch.empty([2 * step, W_q.shape[1]],
-                          dtype=dtype,
-                          device=W_q.device)
+        tmp = torch.empty([2 * step, W_q.shape[1]], dtype=dtype, device=W_q.device)
         tmp[:step] = (W_q & 0b11110000) >> 4
         tmp[step:] = W_q & 0b00001111
         return tmp
 
-    def __init__(self, packed_factor: int, packed_dim: int, weight_bits: int,
-                 **kwargs):
+    def __init__(self, packed_factor: int, packed_dim: int, weight_bits: int, **kwargs):
         super().__init__(packed_factor, packed_dim, None, **kwargs)
         self.weight_bits = weight_bits
         self.input_shape = self.shape[self.input_dim] * self.packed_factor
@@ -136,36 +146,41 @@ class HQQweightParameter(PackedvLLMParameter):
 
     def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
         loaded_weight = self.unpack_4bit_u8(loaded_weight)
-        loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose(
-            1, 0)
-        loaded_weight = gptq_pack(loaded_weight, self.weight_bits,
-                                  loaded_weight.shape[0],
-                                  loaded_weight.shape[1])
+        loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose(1, 0)
+        loaded_weight = gptq_pack(
+            loaded_weight,
+            self.weight_bits,
+            loaded_weight.shape[0],
+            loaded_weight.shape[1],
+        )
         super().load_merged_column_weight(loaded_weight, **kwargs)
 
     def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
         loaded_weight = self.unpack_4bit_u8(loaded_weight)
-        loaded_weight = loaded_weight.reshape(self.output_shape,
-                                              -1).transpose(1, 0)
-        loaded_weight = gptq_pack(loaded_weight, self.weight_bits,
-                                  loaded_weight.shape[0],
-                                  loaded_weight.shape[1])
+        loaded_weight = loaded_weight.reshape(self.output_shape, -1).transpose(1, 0)
+        loaded_weight = gptq_pack(
+            loaded_weight,
+            self.weight_bits,
+            loaded_weight.shape[0],
+            loaded_weight.shape[1],
+        )
         super().load_row_parallel_weight(loaded_weight)
 
     def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
         loaded_weight = self.unpack_4bit_u8(loaded_weight)
-        loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose(
-            1, 0)
-        loaded_weight = gptq_pack(loaded_weight, self.weight_bits,
-                                  loaded_weight.shape[0],
-                                  loaded_weight.shape[1])
+        loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose(1, 0)
+        loaded_weight = gptq_pack(
+            loaded_weight,
+            self.weight_bits,
+            loaded_weight.shape[0],
+            loaded_weight.shape[1],
+        )
         super().load_qkv_weight(loaded_weight, **kwargs)
 
 
 # Zero points and scales in HQQ must also be reshaped to correspond to W_q's
 # GPTQ shape (transposed - we transpose them too when processing weights).
 class HQQZeroScaleParameter(GroupQuantScaleParameter):
-
     def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
         loaded_weight = loaded_weight.reshape(-1, self.shape[1])
         super().load_merged_column_weight(loaded_weight, **kwargs)
@@ -180,8 +195,7 @@ class HQQZeroScaleParameter(GroupQuantScaleParameter):
 
 
 class HQQMarlinMethod(LinearMethodBase):
-    """Linear method for HQQ Marlin.
-    """
+    """Linear method for HQQ Marlin."""
 
     def __init__(
         self,
@@ -204,8 +218,9 @@ class HQQMarlinMethod(LinearMethodBase):
 
         weight_loader = extra_weight_attrs.get("weight_loader", error_loader)
 
-        self.scales_and_zp_size = (input_size_per_partition //
-                                   self.quant_config.group_size)
+        self.scales_and_zp_size = (
+            input_size_per_partition // self.quant_config.group_size
+        )
 
         qweight = HQQweightParameter(
             data=torch.empty(
@@ -218,25 +233,30 @@ class HQQMarlinMethod(LinearMethodBase):
             packed_dim=0,
             packed_factor=self.quant_config.pack_factor,
             weight_bits=self.quant_config.weight_bits,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
 
-        zeros = HQQZeroScaleParameter(data=torch.empty(
-            self.output_size_per_partition,
-            self.scales_and_zp_size,
-            dtype=params_dtype,
-        ),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
-
-        scales = HQQZeroScaleParameter(data=torch.empty(
-            self.output_size_per_partition,
-            self.scales_and_zp_size,
-            dtype=params_dtype,
-        ),
-                                       input_dim=1,
-                                       output_dim=0,
-                                       weight_loader=weight_loader)
+        zeros = HQQZeroScaleParameter(
+            data=torch.empty(
+                self.output_size_per_partition,
+                self.scales_and_zp_size,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        scales = HQQZeroScaleParameter(
+            data=torch.empty(
+                self.output_size_per_partition,
+                self.scales_and_zp_size,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("W_q", qweight)
         layer.register_parameter("zero", zeros)
@@ -244,17 +264,29 @@ class HQQMarlinMethod(LinearMethodBase):
 
         # Ignore extra parameters in the HQQ model.
         # To be added as needed.
-        ignore_parameters = ("axis", "channel_wise", "compute_dtype",
-                             "encoded_state_dict", "group_size", "nbits",
-                             "offload_meta", "optimize", "packing",
-                             "quant_scale", "quant_zero", "round_zero",
-                             "shape", "stores_quant_config",
-                             "unpack_view_dtype", "view_as_float")
+        ignore_parameters = (
+            "axis",
+            "channel_wise",
+            "compute_dtype",
+            "encoded_state_dict",
+            "group_size",
+            "nbits",
+            "offload_meta",
+            "optimize",
+            "packing",
+            "quant_scale",
+            "quant_zero",
+            "round_zero",
+            "shape",
+            "stores_quant_config",
+            "unpack_view_dtype",
+            "view_as_float",
+        )
         for name in ignore_parameters:
             layer.register_parameter(
                 name,
-                HQQEmptyParameter(data=torch.empty(0),
-                                  weight_loader=weight_loader))
+                HQQEmptyParameter(data=torch.empty(0), weight_loader=weight_loader),
+            )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         dev = layer.W_q.device
@@ -268,14 +300,18 @@ class HQQMarlinMethod(LinearMethodBase):
             self.output_size_per_partition,
             self.quant_config.weight_bits,
         ).to(dev)
-        marlin_s = marlin_permute_scales(layer.scale.transpose(1, 0),
-                                         self.input_size_per_partition,
-                                         self.output_size_per_partition,
-                                         self.quant_config.group_size).to(dev)
-        marlin_zp = marlin_permute_scales(layer.zero.transpose(1, 0),
-                                          self.input_size_per_partition,
-                                          self.output_size_per_partition,
-                                          self.quant_config.group_size).to(dev)
+        marlin_s = marlin_permute_scales(
+            layer.scale.transpose(1, 0),
+            self.input_size_per_partition,
+            self.output_size_per_partition,
+            self.quant_config.group_size,
+        ).to(dev)
+        marlin_zp = marlin_permute_scales(
+            layer.zero.transpose(1, 0),
+            self.input_size_per_partition,
+            self.output_size_per_partition,
+            self.quant_config.group_size,
+        ).to(dev)
 
         layer.g_idx = marlin_make_empty_g_idx(dev)
         layer.g_idx_sort_indices = marlin_make_empty_g_idx(dev)
@@ -291,11 +327,13 @@ class HQQMarlinMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        workspace = MarlinWorkspace(self.output_size_per_partition,
-                                    GPTQ_MARLIN_MIN_THREAD_N,
-                                    GPTQ_MARLIN_MAX_PARALLEL)
+        workspace = MarlinWorkspace(
+            self.output_size_per_partition,
+            GPTQ_MARLIN_MIN_THREAD_N,
+            GPTQ_MARLIN_MAX_PARALLEL,
+        )
 
         scales = layer.marlin_scales
         zeros = layer.marlin_zeros
diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py
index 8aa1f1a14bfc97d50b2c02e0d4543cce19b4c487..4e736378e9dace8906f3ea202dbdd2bdbda2e69c 100644
--- a/vllm/model_executor/layers/quantization/inc.py
+++ b/vllm/model_executor/layers/quantization/inc.py
@@ -21,12 +21,15 @@ from typing import Any, Optional
 import torch
 
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod)
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
+    FusedMoE,
+    UnquantizedFusedMoEMethod,
+)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 
 
 class INCConfig(QuantizationConfig):
@@ -44,8 +47,9 @@ class INCConfig(QuantizationConfig):
     def from_config(cls, config: dict[str, Any]) -> "INCConfig":
         raise AssertionError
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             return UnquantizedLinearMethod()
         elif isinstance(layer, FusedMoE):
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 31182f40b48f6dd80d2571b8713ab66bfce449c5..7ded8eea79060e0021cd52e2f099471b9af598fa 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
 
 # Using the default value (240.0) from pytorch will cause accuracy
@@ -27,11 +25,14 @@ class QuantFP8(CustomOp):
     This CustomOp supports both static and dynamic quantization.
     """
 
-    def __init__(self,
-                 static: bool,
-                 group_shape: GroupShape,
-                 num_token_padding: Optional[int] = None,
-                 column_major_scales: bool = False):
+    def __init__(
+        self,
+        static: bool,
+        group_shape: GroupShape,
+        num_token_padding: int | None = None,
+        column_major_scales: bool = False,
+        use_ue8m0: bool | None = None,  # for Torch compile
+    ):
         """
         :param static: static or dynamic quantization
         :param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR,
@@ -46,6 +47,7 @@ class QuantFP8(CustomOp):
         self.group_shape = group_shape
         self.num_token_padding = num_token_padding
         self.column_major_scales = column_major_scales
+        self.use_ue8m0 = use_ue8m0
 
         self.is_group_quant = group_shape.is_per_group()
         if self.is_group_quant:
@@ -53,50 +55,59 @@ class QuantFP8(CustomOp):
             self.group_size = group_shape.col
         else:
             assert group_shape in {GroupShape.PER_TOKEN, GroupShape.PER_TENSOR}
-            assert not static or group_shape == GroupShape.PER_TENSOR, \
+            assert not static or group_shape == GroupShape.PER_TENSOR, (
                 "Only per-tensor scales supported for static quantization."
+            )
             self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
 
     def forward_cuda(
         self,
         x: torch.Tensor,
-        scale: Optional[torch.Tensor] = None,
-        scale_ub: Optional[torch.Tensor] = None,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.is_group_quant:
             assert scale is None, "Group quantization is always dynamic"
             from vllm.model_executor.layers.quantization.utils import fp8_utils
+
             return fp8_utils.per_token_group_quant_fp8(
                 x,
                 group_size=self.group_size,
                 column_major_scales=self.column_major_scales,
-                dtype=_FP8_DTYPE)
+                dtype=_FP8_DTYPE,
+                use_ue8m0=self.use_ue8m0,
+            )
 
         assert (scale is not None) == self.static
-        assert scale_ub is None or (not self.static and self.group_shape
-                                    == GroupShape.PER_TOKEN
-                                    and scale_ub.numel() == 1)
+        assert scale_ub is None or (
+            not self.static
+            and self.group_shape == GroupShape.PER_TOKEN
+            and scale_ub.numel() == 1
+        )
         return ops.scaled_fp8_quant(
             x,
             scale,
             num_token_padding=self.num_token_padding,
             scale_ub=scale_ub,
-            use_per_token_if_dynamic=self.use_per_token_if_dynamic)
+            use_per_token_if_dynamic=self.use_per_token_if_dynamic,
+        )
 
     def forward_native(
         self,
         x: torch.Tensor,
-        scale: Optional[torch.Tensor] = None,
-        scale_ub: Optional[torch.Tensor] = None,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
     ):
         if self.is_group_quant:
             assert scale is None, "Group quantization is always dynamic"
             return self._quantize_group_native(x)
 
         assert (scale is not None) == self.static
-        assert scale_ub is None or (not self.static and self.group_shape
-                                    == GroupShape.PER_TOKEN
-                                    and scale_ub.numel() == 1)
+        assert scale_ub is None or (
+            not self.static
+            and self.group_shape == GroupShape.PER_TOKEN
+            and scale_ub.numel() == 1
+        )
 
         if scale is None:
             if self.group_shape == GroupShape.PER_TOKEN:
@@ -125,7 +136,8 @@ class QuantFP8(CustomOp):
         return out, scale
 
     def _quantize_group_native(
-            self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         orig_shape = x.shape
         hidden_dim = x.shape[-1]
         num_groups = (hidden_dim + self.group_size - 1) // self.group_size
@@ -133,11 +145,14 @@ class QuantFP8(CustomOp):
 
         if padded_dim != hidden_dim:
             padding = padded_dim - hidden_dim
-            x = F.pad(x, (0, padding), mode='constant', value=0.0)
+            x = F.pad(x, (0, padding), mode="constant", value=0.0)
 
         x_grouped = x.view(-1, num_groups, self.group_size)
         absmax = x_grouped.abs().max(dim=-1, keepdim=True)[0].float()
-        scales = (absmax / _FP8_MAX).clamp(min=_FP8_MIN_SCALING_FACTOR)
+        scales_raw = absmax / _FP8_MAX
+        if self.use_ue8m0:
+            scales_raw = torch.exp2(torch.ceil(torch.log2(scales_raw)))
+        scales = (scales_raw).clamp(min=_FP8_MIN_SCALING_FACTOR)
 
         x_scaled = x_grouped / scales
         x_quant = x_scaled.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE)
@@ -148,9 +163,9 @@ class QuantFP8(CustomOp):
         x_quant = x_quant.view(orig_shape)
 
         scales = scales.squeeze(-1)
-        scales = scales.reshape(orig_shape[:-1] + (num_groups, ))
+        scales = scales.reshape(orig_shape[:-1] + (num_groups,))
 
         if self.column_major_scales:
-            scales = scales.transpose(-2, -1).contiguous()
+            scales = scales.transpose(-2, -1).contiguous().transpose(-1, -2)
 
         return x_quant, scales
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index c83b0b47a4b7eeb42d6b8adbacc8439fa0fe8114..5ca9167faec80ba1329a3d1c51f0247c2cb168b0 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 from packaging import version
@@ -9,19 +10,24 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter
 
 from vllm._ipex_ops import ipex_ops as ops
-from vllm.model_executor.layers.fused_moe import (FusedMoEMethodBase,
-                                                  FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+)
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.awq import (AWQLinearMethod,
-                                                         is_layer_skipped_awq)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.model_executor.layers.quantization.fp8 import (Fp8Config,
-                                                         Fp8LinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
+from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
+from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
@@ -43,9 +49,10 @@ class IPEXConfig(QuantizationConfig):
         method: str,
         weight_bits: int,
         group_size: int,
-        modules_to_not_convert: Optional[list[str]] = None,
-        desc_act: Optional[bool] = None,
-        lm_head_quantized: Optional[bool] = None,
+        modules_to_not_convert: list[str] | None = None,
+        desc_act: bool | None = None,
+        lm_head_quantized: bool | None = None,
+        is_sym: bool | None = None,
     ) -> None:
         super().__init__()
         self.method = method
@@ -54,20 +61,26 @@ class IPEXConfig(QuantizationConfig):
         self.modules_to_not_convert = modules_to_not_convert or []
         self.desc_act = desc_act
         self.lm_head_quantized = lm_head_quantized
+        self.is_sym = is_sym
         self.pack_factor = 32 // self.weight_bits
 
         if self.weight_bits not in [4]:
-            raise ValueError(f"IPEX quantization supports weight bits [4], "
-                             f"but got {self.weight_bits}.")
+            raise ValueError(
+                f"IPEX quantization supports weight bits [4], "
+                f"but got {self.weight_bits}."
+            )
 
         if self.method not in ["awq", "gptq"]:
-            raise ValueError(f"IPEX quantization supports [awq, gptq], "
-                             f"but got {self.method}.")
+            raise ValueError(
+                f"IPEX quantization supports [awq, gptq], but got {self.method}."
+            )
 
     def __repr__(self) -> str:
-        return (f"IPEXConfig(method={self.method},"
-                f"weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size})")
+        return (
+            f"IPEXConfig(method={self.method},"
+            f"weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size})"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -93,24 +106,34 @@ class IPEXConfig(QuantizationConfig):
         method = cls.get_from_keys(config, ["quant_method"]).lower()
         if method == "awq":
             weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
-            group_size = cls.get_from_keys(config,
-                                           ["q_group_size", "group_size"])
+            group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
             modules_to_not_convert = cls.get_from_keys_or(
-                config, ["modules_to_not_convert"], None)
-            return cls(method, weight_bits, group_size, modules_to_not_convert,
-                       False, False)
+                config, ["modules_to_not_convert"], None
+            )
+            is_sym = not cls.get_from_keys_or(config, ["zero_point"], default=False)
+            return cls(
+                method,
+                weight_bits,
+                group_size,
+                modules_to_not_convert,
+                False,
+                False,
+                is_sym,
+            )
         # otherwise for gptq
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
         desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False)
-        return cls(method, weight_bits, group_size, [], desc_act,
-                   lm_head_quantized)
+        is_sym = cls.get_from_keys_or(config, ["sym"], default=True)
+        return cls(
+            method, weight_bits, group_size, [], desc_act, lm_head_quantized, is_sym
+        )
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         if not current_platform.is_cpu() and not current_platform.is_xpu():
             return None
 
@@ -121,11 +144,14 @@ class IPEXConfig(QuantizationConfig):
 
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["LinearMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["LinearMethodBase"]:
         if isinstance(layer, LinearBase):
             if self.method == "awq":
-                if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                if is_layer_skipped(
+                    prefix, self.modules_to_not_convert, self.packed_modules_mapping
+                ):
                     return UnquantizedLinearMethod()
                 return IPEXAWQLinearMethod(self)
             if self.method == "gptq":
@@ -134,8 +160,7 @@ class IPEXConfig(QuantizationConfig):
 
 
 class IPEXGPTQLinearMethod(GPTQLinearMethod):
-    """GPTQ linear method using IPEX for the CPU/XPU backend.
-    """
+    """GPTQ linear method using IPEX for the CPU/XPU backend."""
 
     def __init__(self, quant_config: IPEXConfig):
         self.quant_config = quant_config  # type: ignore
@@ -145,18 +170,20 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod):
 
         try:
             import intel_extension_for_pytorch as ipex
-            if version.parse(
-                    ipex.__version__) < version.parse(MIN_IPEX_VERSION):
+
+            if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION):
                 raise ImportError(
                     "intel_extension_for_pytorch version is "
                     "wrong. Please install "
-                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}.")
+                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}."
+                )
         except ImportError as err:
             raise ImportError(
                 "Please install "
                 f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via "
                 f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`"
-                " to use IPEX-AWQ linear method.") from err
+                " to use IPEX-AWQ linear method."
+            ) from err
         # Using the compute dtype (lowp_mode) as INT8 to leverage instructions
         # with better performance.
         lowp_mode = ipex.quantization.WoqLowpMode.INT8
@@ -165,6 +192,7 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod):
         # The float activation will be quantized (dynamic, per-token) to INT8.
         act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK
 
+        assert isinstance(self.quant_config, IPEXConfig)
         qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
             weight_dtype=weight_dtype,
             lowp_mode=lowp_mode,
@@ -173,32 +201,35 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod):
         )
         layer.ipex_output_size = layer.qweight.shape[-1]
         g_idx = layer.g_idx if self.quant_config.desc_act else None
-        layer.ipex_qlinear = ipex.llm.quantization.woq_linear. \
-            IPEXWeightOnlyQuantizedLinear.from_weight(
-            layer.qweight,
-            layer.scales,
-            layer.qzeros,
-            layer.qweight.size(0),
-            layer.ipex_output_size,
-            qconfig=qconfig,
-            g_idx=g_idx,
-            bias=bias,
-            group_size=self.quant_config.group_size,
-            quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"]
+        layer.ipex_qlinear = (
+            ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight(
+                layer.qweight,
+                layer.scales,
+                layer.qzeros,
+                layer.qweight.size(0),
+                layer.ipex_output_size,
+                qconfig=qconfig,
+                g_idx=g_idx,
+                bias=bias,
+                group_size=self.quant_config.group_size,
+                quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"],
+                weight_qscheme="sym" if self.quant_config.is_sym else "asym",
+            )
         )
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         reshaped_x = x.reshape(-1, x.shape[-1])
         out = layer.ipex_qlinear(reshaped_x)
-        return out.reshape(x.shape[:-1] + (layer.ipex_output_size, ))
+        return out.reshape(x.shape[:-1] + (layer.ipex_output_size,))
 
 
 class IPEXAWQLinearMethod(AWQLinearMethod):
-    """AWQ linear method using IPEX for the CPU/XPU backend.
-    """
+    """AWQ linear method using IPEX for the CPU/XPU backend."""
 
     def __init__(self, quant_config: IPEXConfig):
         self.quant_config = quant_config  # type: ignore
@@ -210,18 +241,20 @@ class IPEXAWQLinearMethod(AWQLinearMethod):
 
         try:
             import intel_extension_for_pytorch as ipex
-            if version.parse(
-                    ipex.__version__) < version.parse(MIN_IPEX_VERSION):
+
+            if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION):
                 raise ImportError(
                     "intel_extension_for_pytorch version is "
                     "wrong. Please install "
-                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}.")
+                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}."
+                )
         except ImportError as err:
             raise ImportError(
                 "Please install "
                 f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via "
                 f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`"
-                " to use IPEX-AWQ linear method.") from err
+                " to use IPEX-AWQ linear method."
+            ) from err
 
         # Using the compute dtype (lowp_mode) as INT8 to leverage instructions
         # with better performance.
@@ -231,6 +264,7 @@ class IPEXAWQLinearMethod(AWQLinearMethod):
         # The float activation will be quantized (dynamic, per-token) to INT8.
         act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH
 
+        assert isinstance(self.quant_config, IPEXConfig)
         qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
             weight_dtype=weight_dtype,
             lowp_mode=lowp_mode,
@@ -238,104 +272,118 @@ class IPEXAWQLinearMethod(AWQLinearMethod):
             group_size=self.quant_config.group_size,
         )
 
-        layer.ipex_output_size = layer.qweight.size(
-            1) * self.quant_config.pack_factor
-        layer.ipex_qlinear = ipex.llm.quantization.woq_linear. \
-            IPEXWeightOnlyQuantizedLinear.from_weight(
-            layer.qweight,
-            layer.scales,
-            layer.qzeros,
-            layer.qweight.size(0),
-            layer.ipex_output_size,
-            qconfig=qconfig,
-            bias=bias,
-            group_size=self.quant_config.group_size,
-            quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"]  # type: ignore
+        layer.ipex_output_size = layer.qweight.size(1) * self.quant_config.pack_factor
+        layer.ipex_qlinear = (
+            ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight(
+                layer.qweight,
+                layer.scales,
+                layer.qzeros,
+                layer.qweight.size(0),
+                layer.ipex_output_size,
+                qconfig=qconfig,
+                bias=bias,
+                group_size=self.quant_config.group_size,
+                quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"],  # type: ignore
+                weight_qscheme="sym" if self.quant_config.is_sym else "asym",
+            )
         )
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         reshaped_x = x.reshape(-1, x.shape[-1])
         out = layer.ipex_qlinear(reshaped_x)
-        return out.reshape(x.shape[:-1] + (layer.ipex_output_size, ))
+        return out.reshape(x.shape[:-1] + (layer.ipex_output_size,))
 
 
 class XPUFp8LinearMethod(Fp8LinearMethod):
-
     def __init__(self, quant_config: Fp8Config):
         super().__init__(quant_config)
 
     def process_weights_after_loading(self, layer: Module) -> None:
         # If checkpoint not serialized fp8, quantize the weights.
         if not self.quant_config.is_checkpoint_fp8_serialized:
-            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
-                                                         scale=None)
+            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
             # Update the layer with the new values.
             layer.weight = Parameter(qweight, requires_grad=False)
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
             layer.input_scale = None
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         weight = layer.weight.data
         weight_scale = layer.weight_scale.data
-        output = torch.ops.torch_ipex.fp8_gemm_w8a16(x, weight, True,
-                                                     weight_scale, bias)
+        output = torch.ops.torch_ipex.fp8_gemm_w8a16(
+            x, weight, True, weight_scale, bias
+        )
         return output
 
 
 class XPUFp8MoEMethod(FusedMoEMethodBase):
-
     def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
         super().__init__(layer.moe_config)
         self.quant_config = quant_config
 
-    def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
-                       intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.hidden_size = hidden_size
         layer.num_experts = num_experts
         layer.orig_dtype = params_dtype
         layer.weight_block_size = None
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # Allocate 2 scales for w1 and w3 respectively.
         # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                         2,
-                                                         dtype=torch.float32),
-                                              requires_grad=False)
-        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                        dtype=torch.float32),
-                                             requires_grad=False)
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+        )
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
 
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
         # INPUT_SCALES
         layer.w13_input_scale = None
         layer.w2_input_scale = None
@@ -343,29 +391,31 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
     def process_weights_after_loading(self, layer: Module) -> None:
         if not self.quant_config.is_checkpoint_fp8_serialized:
             fp8_dtype = current_platform.fp8_dtype()
-            w13_weight = torch.empty_like(layer.w13_weight.data,
-                                          dtype=fp8_dtype)
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
             w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
 
             # Re-initialize w13_scale because we directly quantize
             # merged w13 weights and generate a single scaling factor.
-            layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
-                layer.local_num_experts,
-                dtype=torch.float32,
-                device=w13_weight.device),
-                                                        requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    layer.local_num_experts,
+                    dtype=torch.float32,
+                    device=w13_weight.device,
+                ),
+                requires_grad=False,
+            )
             for expert in range(layer.local_num_experts):
-                w13_weight[expert, :, :], layer.w13_weight_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        layer.w13_weight.data[expert, :, :])
-                w2_weight[expert, :, :], layer.w2_weight_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        layer.w2_weight.data[expert, :, :])
-            layer.w13_weight = torch.nn.Parameter(w13_weight,
-                                                  requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight,
-                                                 requires_grad=False)
+                w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
+                    ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+                )
+                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
+                    ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+                )
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
         import intel_extension_for_pytorch as ipex
+
+        ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
         layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
             layer.w13_weight,
             layer.w2_weight,
@@ -374,10 +424,12 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
             a1_scale_inv=layer.w13_input_scale,
             a2_scale_inv=layer.w2_input_scale,
             use_prepack=True,
+            experts_start_id=ep_rank_start,
         )
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -388,20 +440,20 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return layer.ipex_fusion(
             x,
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
index 1280f5f1eadf7b894506d5bb4ad69b21cc688544..7aeb1f86c2794beaab17e01fc16299144a18fe52 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Callable, Optional
 
 import torch
 
@@ -20,11 +20,10 @@ class MPLinearLayerConfig:
     group_size: int
     zero_points: bool
     has_g_idx: bool
-    out_type: Optional[torch.dtype] = None
+    out_type: torch.dtype | None = None
 
 
 class MPLinearKernel(ABC):
-
     @classmethod
     @abstractmethod
     def get_min_capability(cls) -> int:
@@ -32,16 +31,17 @@ class MPLinearKernel(ABC):
 
     @classmethod
     @abstractmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         raise NotImplementedError
 
-    def __init__(self,
-                 c: MPLinearLayerConfig,
-                 w_q_param_name: str,
-                 w_s_param_name: str,
-                 w_zp_param_name: Optional[str] = None,
-                 w_gidx_param_name: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        c: MPLinearLayerConfig,
+        w_q_param_name: str,
+        w_s_param_name: str,
+        w_zp_param_name: str | None = None,
+        w_gidx_param_name: str | None = None,
+    ) -> None:
         assert self.can_implement(c)
         self.config = c
         self.w_q_name = w_q_param_name
@@ -58,31 +58,34 @@ class MPLinearKernel(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
-    def _transform_param(self, layer: torch.nn.Module, name: Optional[str],
-                         fn: Callable) -> None:
+    def _transform_param(
+        self, layer: torch.nn.Module, name: str | None, fn: Callable
+    ) -> None:
         if name is not None and getattr(layer, name, None) is not None:
-
             old_param = getattr(layer, name)
             new_param = fn(old_param)
             # replace the parameter with torch.nn.Parameter for TorchDynamo
             # compatibility
             replace_parameter(
-                layer, name,
-                torch.nn.Parameter(new_param.data, requires_grad=False))
+                layer, name, torch.nn.Parameter(new_param.data, requires_grad=False)
+            )
 
     def _get_weight_params(
-            self, layer: torch.nn.Module) -> tuple[
-                torch.Tensor,  # w_q
-                torch.Tensor,  # w_s
-                Optional[torch.Tensor],  # w_zp, 
-                Optional[torch.Tensor]  # w_gidx
-            ]:
+        self, layer: torch.nn.Module
+    ) -> tuple[
+        torch.Tensor,  # w_q
+        torch.Tensor,  # w_s
+        torch.Tensor | None,  # w_zp,
+        torch.Tensor | None,  # w_gidx
+    ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index f10d20999bee3571c3404d7fcbc51b35ecaeb7bb..0cf3f12af55225ff7d1cd26654ca89feec6ee8ba 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -1,27 +1,35 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
-
 import vllm.envs as envs
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.allspark import (  # noqa: E501
-    AllSparkLinearKernel)
+    AllSparkLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.bitblas import (  # noqa: E501
-    BitBLASLinearKernel)
+    BitBLASLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.conch import (  # noqa: E501
-    ConchLinearKernel)
+    ConchLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.cutlass import (  # noqa: E501
-    CutlassW4A8LinearKernel)
+    CutlassW4A8LinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.dynamic_4bit import (  # noqa: E501
-    Dynamic4bitLinearKernel)
+    Dynamic4bitLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import (  # noqa: E501
-    ExllamaLinearKernel)
+    ExllamaLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import (  # noqa: E501
-    MacheteLinearKernel)
+    MacheteLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import (  # noqa: E501
-    MarlinLinearKernel)
+    MarlinLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import (  # noqa: E501
-    MPLinearKernel, MPLinearLayerConfig)
+    MPLinearKernel,
+    MPLinearLayerConfig,
+)
 from vllm.platforms import current_platform
 
 # in priority/performance order (when available)
@@ -38,11 +46,11 @@ _POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [
 
 
 def choose_mp_linear_kernel(
-        config: MPLinearLayerConfig,
-        compute_capability: Optional[int] = None) -> type[MPLinearKernel]:
+    config: MPLinearLayerConfig, compute_capability: int | None = None
+) -> type[MPLinearKernel]:
     """
     Choose an MPLinearKernel that can implement the given config for the given
-     compute capability. Attempts to choose the best kernel in terms of 
+     compute capability. Attempts to choose the best kernel in terms of
      performance.
 
     Args:
@@ -69,14 +77,18 @@ def choose_mp_linear_kernel(
     for kernel in _POSSIBLE_KERNELS:
         if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
             failure_reasons.append(
-                f' {kernel.__name__} disabled by environment variable')
+                f" {kernel.__name__} disabled by environment variable"
+            )
             continue
-        if (compute_capability is not None
-                and kernel.get_min_capability() > compute_capability):
+        if (
+            compute_capability is not None
+            and kernel.get_min_capability() > compute_capability
+        ):
             failure_reasons.append(
                 f"{kernel.__name__} requires capability "
                 f"{kernel.get_min_capability()}, current compute "
-                f" capability is {compute_capability}")
+                f" capability is {compute_capability}"
+            )
             continue
 
         can_implement, failure_reason = kernel.can_implement(config)
@@ -84,10 +96,10 @@ def choose_mp_linear_kernel(
             return kernel
         else:
             failure_reasons.append(
-                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+                f" {kernel.__name__} cannot implement due to: {failure_reason}"
             )
 
     raise ValueError(
-        "Failed to find a kernel that can implement the "\
-        "WNA16 linear layer. Reasons: \n"
-        + '\n'.join(failure_reasons))
+        "Failed to find a kernel that can implement the "
+        "WNA16 linear layer. Reasons: \n" + "\n".join(failure_reasons)
+    )
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
index 785e559df8f75070727e27c00a7364311f170569..3baef454251a063e371d2fbedd03df64ab64baeb 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
@@ -1,29 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.allspark_utils import (
-    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, check_allspark_supported_dtype_shape)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           permute_param_layout_)
+    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+    check_allspark_supported_dtype_shape,
+)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 
 class AllSparkLinearKernel(MPLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 80
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if c.has_g_idx:
             return False, "Act reordering currently not supported by AllSpark"
 
@@ -35,7 +33,8 @@ class AllSparkLinearKernel(MPLinearKernel):
             c.partition_weight_shape[1],  # out_features
             c.group_size,
             c.weight_type,
-            c.act_type)
+            c.act_type,
+        )
 
     # note assumes that
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
@@ -49,8 +48,8 @@ class AllSparkLinearKernel(MPLinearKernel):
         sm_count = properties.multi_processor_count
         sm_version = properties.major * 10 + properties.minor
         gemm_args = {}
-        gemm_args['sm_count'] = sm_count
-        gemm_args['sm_version'] = sm_version
+        gemm_args["sm_count"] = sm_count
+        gemm_args["sm_version"] = sm_version
 
         self.gemm_args = gemm_args
 
@@ -59,43 +58,42 @@ class AllSparkLinearKernel(MPLinearKernel):
         old_scale_param = getattr(layer, self.w_s_name)
 
         assert isinstance(old_weight_param, BasevLLMParameter)
-        permute_param_layout_(old_weight_param,
-                              input_dim=0,
-                              output_dim=1,
-                              packed_dim=0)
+        permute_param_layout_(old_weight_param, input_dim=0, output_dim=1, packed_dim=0)
 
         assert isinstance(old_scale_param, BasevLLMParameter)
         permute_param_layout_(old_scale_param, input_dim=0, output_dim=1)
 
         # unpack weight from K / 4 x N int32 to K x N uint8
-        new_weight_param = torch.nn.Parameter(old_weight_param.data,
-                                              requires_grad=False)
-        new_weight_param.data = new_weight_param.data.t().contiguous().view(
-            dtype=torch.uint8)
+        new_weight_param = torch.nn.Parameter(
+            old_weight_param.data, requires_grad=False
+        )
+        new_weight_param.data = (
+            new_weight_param.data.t().contiguous().view(dtype=torch.uint8)
+        )
         new_weight_param.data = new_weight_param.data.t().contiguous()
 
-        new_scale_param = torch.nn.Parameter(old_scale_param.data,
-                                             requires_grad=False)
+        new_scale_param = torch.nn.Parameter(old_scale_param.data, requires_grad=False)
 
         # reorder K x N weight as N32K16 format for Ampere W8A16
-        new_weight_param.data, new_scale_param.data, _ = \
-            ops.allspark_repack_weight(
-                new_weight_param.data, new_scale_param.data, None,
-                c.zero_points)
+        new_weight_param.data, new_scale_param.data, _ = ops.allspark_repack_weight(
+            new_weight_param.data, new_scale_param.data, None, c.zero_points
+        )
 
         replace_parameter(layer, self.w_q_name, new_weight_param.data)
         replace_parameter(layer, self.w_s_name, new_scale_param.data)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         c = self.config
         gemm_args = self.gemm_args
         w_q, w_s, _, _ = self._get_weight_params(layer)
 
         reshaped_x = x.reshape(-1, x.shape[-1])
-        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
 
         output = ops.allspark_w8a16_gemm(
             a=reshaped_x,
@@ -104,11 +102,12 @@ class AllSparkLinearKernel(MPLinearKernel):
             b_qzeros=None,
             n=c.partition_weight_shape[1],
             group_size=c.group_size,
-            sm_count=gemm_args['sm_count'],
-            sm_version=gemm_args['sm_version'],
+            sm_count=gemm_args["sm_count"],
+            sm_version=gemm_args["sm_version"],
             CUBLAS_M_THRESHOLD=ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
             has_zp=c.zero_points,
-            n32k16_reorder=True)
+            n32k16_reorder=True,
+        )
 
         if bias is not None:
             output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
index 0eca3b4c024e7bffa5bae8304fd2b749beb9cce2..59c6a4f9615478f6639460415f0dc78f4e394bef 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
@@ -1,20 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 from packaging import version
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    BITBLAS_OPTIMIZE_FEATURES, BITBLAS_SUPPORTED_GROUP_SIZES,
-    MINIMUM_BITBLAS_VERSION, bitblas_make_empty_g_idx, bitblas_sort_g_idx,
-    check_bitblas_supports_shape, query_bitblas_supported_quant_types,
-    unpack_gptq_qweight, unpack_gptq_qzeros)
+    BITBLAS_OPTIMIZE_FEATURES,
+    BITBLAS_SUPPORTED_GROUP_SIZES,
+    MINIMUM_BITBLAS_VERSION,
+    bitblas_make_empty_g_idx,
+    bitblas_sort_g_idx,
+    check_bitblas_supports_shape,
+    query_bitblas_supported_quant_types,
+    unpack_gptq_qweight,
+    unpack_gptq_qzeros,
+)
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
@@ -22,7 +26,6 @@ logger = init_logger(__name__)
 
 
 class BitBLASLinearKernel(MPLinearKernel):
-
     OPT_FEATURES: list[int] = BITBLAS_OPTIMIZE_FEATURES
     ENABLE_TUNING: bool = True
     MATMUL_LAYOUT: str = "nt"
@@ -40,34 +43,34 @@ class BitBLASLinearKernel(MPLinearKernel):
         c: MPLinearLayerConfig,
         w_q_param_name: str,
         w_s_param_name: str,
-        w_zp_param_name: Optional[str] = None,
-        w_gidx_param_name: Optional[str] = None,
-        bitblas_quant_config: Optional[QuantizationConfig] = None,
+        w_zp_param_name: str | None = None,
+        w_gidx_param_name: str | None = None,
+        bitblas_quant_config: QuantizationConfig | None = None,
     ):
         self.quant_config = bitblas_quant_config
-        super().__init__(c, w_q_param_name, w_s_param_name, w_zp_param_name,
-                         w_gidx_param_name)
+        super().__init__(
+            c, w_q_param_name, w_s_param_name, w_zp_param_name, w_gidx_param_name
+        )
 
     def repack_bitblas_from_gptq(
         self,
         b_q_weight: torch.Tensor,
         scales: torch.Tensor,
-        qzeros: Optional[torch.Tensor] = None,
+        qzeros: torch.Tensor | None = None,
     ):
         from bitblas.quantization.utils import general_compress
+
         assert self.bitblas_matmul is not None, "bitblas_matmul is None"
 
         quant_config = self.quant_config
         # qweight in gptq old quant linear stored with
         # (outfeatures, infeatures), should be transposed.
-        qweight = b_q_weight.T.contiguous().view(
-            quant_config.torch_storage_dtype)  # type: ignore[union-attr]
-        intweight = unpack_gptq_qweight(
-            qweight,
-            quant_config.weight_bits).contiguous()  # type: ignore[union-attr]
+        qweight = b_q_weight.T.contiguous().view(quant_config.torch_storage_dtype)  # type: ignore[union-attr]
+        intweight = unpack_gptq_qweight(qweight, quant_config.weight_bits).contiguous()  # type: ignore[union-attr]
         if self.bitblas_matmul.weight_transform is not None:  # type: ignore[attr-defined]
             qweight = self.bitblas_matmul.weight_transform(  # type: ignore[attr-defined]
-                intweight.cpu()).cuda()
+                intweight.cpu()
+            ).cuda()
         # scales in gptq old quant linear stored with
         # (infeatures // group_size, outfeatures), should be transposed.
         scales = scales.T.contiguous()
@@ -78,7 +81,7 @@ class BitBLASLinearKernel(MPLinearKernel):
         # qzeros should be de-quantized to int zeros.
         weight_bits = quant_config.weight_bits  # type: ignore[union-attr]
         intzeros = unpack_gptq_qzeros(qzeros, weight_bits).T.contiguous()
-        zeros: Optional[torch.Tensor] = None
+        zeros: torch.Tensor | None = None
         zeros_mode = self.bitblas_matmul.config.zeros_mode  # type: ignore[attr-defined]
         if zeros_mode == "original":
             zeros = intzeros.to(torch.float16).contiguous()
@@ -91,9 +94,14 @@ class BitBLASLinearKernel(MPLinearKernel):
                     general_compress(
                         intzeros.T.contiguous().cpu().numpy(),
                         weight_bits,
-                    )).to(qweight.device).
-                to(quant_config.torch_storage_dtype  # type: ignore[union-attr]
-                   ).contiguous())
+                    )
+                )
+                .to(qweight.device)
+                .to(
+                    quant_config.torch_storage_dtype  # type: ignore[union-attr]
+                )
+                .contiguous()
+            )
         else:
             raise ValueError("Unsupported zeros type: {}".format(zeros_mode))
 
@@ -104,41 +112,50 @@ class BitBLASLinearKernel(MPLinearKernel):
         return 70
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
-
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         is_bitblas_installed = True
 
         try:
             import bitblas
+
             if version.parse(bitblas.__version__) < version.parse(
-                    MINIMUM_BITBLAS_VERSION):
+                MINIMUM_BITBLAS_VERSION
+            ):
                 raise ImportError(
                     "bitblas version is wrong. Please "
-                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
+                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
+                )
         except ImportError:
             is_bitblas_installed = False
 
         if not is_bitblas_installed:
-            return False, "bitblas is not installed. Please install bitblas "\
-                          "by running `pip install bitblas>="\
-                           f"{MINIMUM_BITBLAS_VERSION}`"
+            return (
+                False,
+                "bitblas is not installed. Please install bitblas "
+                "by running `pip install bitblas>="
+                f"{MINIMUM_BITBLAS_VERSION}`",
+            )
 
         quant_types = query_bitblas_supported_quant_types(c.zero_points)
         if c.weight_type not in quant_types:
-            return False, (f"Quant type ({c.weight_type}) not supported by"
-                           f"  BitBLAS, supported types are: {quant_types}")
+            return False, (
+                f"Quant type ({c.weight_type}) not supported by"
+                f"  BitBLAS, supported types are: {quant_types}"
+            )
 
         if c.group_size not in BITBLAS_SUPPORTED_GROUP_SIZES:
-            return False, (f"Group size ({c.group_size}) not supported by "
-                           "BitBLAS, supported group sizes are: "
-                           f"{BITBLAS_SUPPORTED_GROUP_SIZES}")
+            return False, (
+                f"Group size ({c.group_size}) not supported by "
+                "BitBLAS, supported group sizes are: "
+                f"{BITBLAS_SUPPORTED_GROUP_SIZES}"
+            )
 
         return check_bitblas_supports_shape(
             c.partition_weight_shape[1],  # out_features
             c.partition_weight_shape[0],  # in_features
             c.full_weight_shape[0],  # in_features
-            c.group_size)
+            c.group_size,
+        )
 
     # note assumes that
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
@@ -150,14 +167,15 @@ class BitBLASLinearKernel(MPLinearKernel):
 
         # Default names since bitblas requires empty parameters for these,
         # TODO: remove this requirement from bitblas (allow optional tensors)
-        if self.w_gidx_name is None:
-            self.w_gidx_name = "g_idx"
-        if self.w_zp_name is None:
-            self.w_zp_name = "qzeros"
+        if getattr(self, "w_gidx_name", None) is None:
+            self.w_gidx_name: str = "g_idx"
+        if getattr(self, "w_zp_name", None) is None:
+            self.w_zp_name: str = "qzeros"
 
         if c.has_g_idx:
             g_idx, g_idx_sort_indices = bitblas_sort_g_idx(
-                getattr(layer, self.w_gidx_name))
+                getattr(layer, self.w_gidx_name)
+            )
             self._transform_param(layer, self.w_gidx_name, lambda _: g_idx)
             layer.g_idx_sort_indices = g_idx_sort_indices
         else:
@@ -170,13 +188,11 @@ class BitBLASLinearKernel(MPLinearKernel):
             setattr(layer, self.w_zp_name, bitblas_make_empty_g_idx(device))
 
         # Repack weights
-        bitblas_qweight, bitblas_scales, bitblas_qzeros = (
-            self.repack_bitblas_from_gptq(
-                layer.qweight,
-                layer.scales,
-                None if quant_config.is_sym else  # type: ignore[union-attr]
-                layer.qzeros,  # type: ignore[union-attr]
-            ))
+        bitblas_qweight, bitblas_scales, bitblas_qzeros = self.repack_bitblas_from_gptq(
+            layer.qweight,
+            layer.scales,
+            None if quant_config.is_sym else layer.qzeros,  # type: ignore[union-attr]
+        )
         replace_parameter(layer, self.w_q_name, bitblas_qweight)
         replace_parameter(layer, self.w_s_name, bitblas_scales)
         if bitblas_qzeros is not None:
@@ -213,6 +229,7 @@ class BitBLASLinearKernel(MPLinearKernel):
         bits,
     ):
         from bitblas import MatmulConfig
+
         bitblas_dtype = self.BITBLAS_DTYPES[params_dtype]
         quant_config = self.quant_config
         with_scaling = False
@@ -249,30 +266,33 @@ class BitBLASLinearKernel(MPLinearKernel):
             zeros_mode=zeros_mode,
         )
         self.bitblas_matmul = self._get_or_create_bitblas_operator(
-            matmul_config, enable_tuning)
+            matmul_config, enable_tuning
+        )
 
     def _get_or_create_bitblas_operator(self, config, enable_tuning):
         from bitblas import Matmul, auto_detect_nvidia_target
         from bitblas.cache import get_database_path, global_operator_cache
+
         BITBLAS_DATABASE_PATH = get_database_path()
         BITBLAS_TARGET = auto_detect_nvidia_target()
 
         if global_operator_cache.size() == 0:
-            global_operator_cache.load_from_database(BITBLAS_DATABASE_PATH,
-                                                     BITBLAS_TARGET)
+            global_operator_cache.load_from_database(
+                BITBLAS_DATABASE_PATH, BITBLAS_TARGET
+            )
 
         bitblas_matmul = global_operator_cache.get(config)
         if bitblas_matmul is None:
-            bitblas_matmul = Matmul(config,
-                                    target=BITBLAS_TARGET,
-                                    enable_tuning=False)
+            bitblas_matmul = Matmul(config, target=BITBLAS_TARGET, enable_tuning=False)
             if enable_tuning:
                 bitblas_matmul.hardware_aware_finetune(topk=20)
                 global_operator_cache.add(config, bitblas_matmul)
                 global_operator_cache.save_into_database(
-                    BITBLAS_DATABASE_PATH, BITBLAS_TARGET)
+                    BITBLAS_DATABASE_PATH, BITBLAS_TARGET
+                )
                 TUNING_MESSAGE = (
-                    f"BitBLAS Operator {config} tuned and saved to database.")
+                    f"BitBLAS Operator {config} tuned and saved to database."
+                )
                 logger.info(TUNING_MESSAGE)
             else:
                 _message = f"BitBLAS Operator {config} created without tuning. "
@@ -288,7 +308,7 @@ class BitBLASLinearKernel(MPLinearKernel):
         x: torch.Tensor,
     ) -> torch.Tensor:
         output_size_per_partition = self.config.partition_weight_shape[1]
-        out_shape = x.shape[:-1] + (output_size_per_partition, )
+        out_shape = x.shape[:-1] + (output_size_per_partition,)
         args = [x, layer.qweight, layer.scales]
         if self.bitblas_matmul.config.with_zeros:  # type: ignore[attr-defined]
             args.append(layer.qzeros)
@@ -298,5 +318,6 @@ class BitBLASLinearKernel(MPLinearKernel):
     def apply_weights(self, layer, x, bias=None):
         NOT_IMPLEMENT_MESSAGE = (
             f"{self.__class__.__name__}.apply_weights is not implemented. "
-            "Please use BitBLASLinearKernel.apply_gptq_bitblas_linear instead")
+            "Please use BitBLASLinearKernel.apply_gptq_bitblas_linear instead"
+        )
         raise NotImplementedError(NOT_IMPLEMENT_MESSAGE)
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py
index f80af548f0199f8123e182eb5faced1897546e92..53b2e15df76db3f7ddee73f3509c3f1ff4661aca 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py
@@ -2,48 +2,53 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from importlib.util import find_spec
-from typing import Final, Optional
+from typing import Final
 
 import torch
 
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           permute_param_layout_)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 from vllm.scalar_type import scalar_types
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 _CONCH_SUPPORTED_WEIGHT_TYPES: Final = [
-    scalar_types.uint4, scalar_types.uint8, scalar_types.uint4b8,
-    scalar_types.uint8b128
+    scalar_types.uint4,
+    scalar_types.uint8,
+    scalar_types.uint4b8,
+    scalar_types.uint8b128,
 ]
 _CONCH_SUPPORTED_GROUP_SIZES: Final = [-1, 128]
 
 
 class ConchLinearKernel(MPLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 80
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if c.weight_type not in _CONCH_SUPPORTED_WEIGHT_TYPES:
-            error_msg = f"Weight type ({c.weight_type}) not supported by "\
-                        "ConchLinearKernel, supported types are: " \
-                        f"{_CONCH_SUPPORTED_WEIGHT_TYPES}"
+            error_msg = (
+                f"Weight type ({c.weight_type}) not supported by "
+                "ConchLinearKernel, supported types are: "
+                f"{_CONCH_SUPPORTED_WEIGHT_TYPES}"
+            )
             return False, error_msg
 
         if c.group_size not in _CONCH_SUPPORTED_GROUP_SIZES:
-            error_msg = f"Group size ({c.group_size}) not supported by "\
-                        "ConchLinearKernel, supported group sizes are: " \
-                        f"{_CONCH_SUPPORTED_GROUP_SIZES}"
+            error_msg = (
+                f"Group size ({c.group_size}) not supported by "
+                "ConchLinearKernel, supported group sizes are: "
+                f"{_CONCH_SUPPORTED_GROUP_SIZES}"
+            )
             return False, error_msg
 
         if find_spec("conch") is None:
-            error_msg = "conch-triton-kernels is not installed, please "\
-                        "install it via `pip install conch-triton-kernels` "\
-                        "and try again!"
+            error_msg = (
+                "conch-triton-kernels is not installed, please "
+                "install it via `pip install conch-triton-kernels` "
+                "and try again!"
+            )
             return False, error_msg
 
         return True, None
@@ -52,7 +57,6 @@ class ConchLinearKernel(MPLinearKernel):
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
     #  `weight_scale` is: {input_dim = 0, output_dim = 1}
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
@@ -68,10 +72,12 @@ class ConchLinearKernel(MPLinearKernel):
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         from conch.ops.quantization.gemm import mixed_precision_gemm
 
         w_q, w_s, w_zp, _ = self._get_weight_params(layer)
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
index 9e23c0dd3595b037ea1e74c1a26458d8302d4efe..8ef6457c952f1fac860393749a3c2ef5b418d145 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
@@ -1,16 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           permute_param_layout_)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
@@ -18,26 +15,22 @@ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 
 class CutlassW4A8LinearKernel(MPLinearKernel):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # dynamic per-tok fp8 activation quantization
-        self.quant_fp8 = QuantFP8(static=False,
-                                  group_shape=GroupShape.PER_TOKEN)
+        self.quant_fp8 = QuantFP8(static=False, group_shape=GroupShape.PER_TOKEN)
 
     @classmethod
     def get_min_capability(cls) -> int:
         return 90
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cuda():
             return False, "CUTLASS only supported on CUDA"
 
         if not current_platform.is_device_capability(90):
-            return False, "CUTLASS W4A8 requires compute capability of 90 "\
-                "(Hopper)"
+            return False, "CUTLASS W4A8 requires compute capability of 90 (Hopper)"
 
         if c.act_type != torch.float8_e4m3fn:
             return False, "CUTLASS W4A8 only supports FP8 (e4m3) activations"
@@ -49,8 +42,11 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
             return False, "Zero points not supported by CUTLASS W4A8"
 
         if c.weight_type != scalar_types.int4:
-            return False, f"Quant type ({c.weight_type}) not supported by "\
-                           "CUTLASS W4A8, only supported int4"
+            return (
+                False,
+                f"Quant type ({c.weight_type}) not supported by "
+                "CUTLASS W4A8, only supported int4",
+            )
 
         # TODO(czhu): support -1 (column-wise)
         if c.group_size != 128:
@@ -58,12 +54,16 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
 
         in_features, out_features = c.partition_weight_shape
         if in_features % 128 or out_features % 128:
-            return False, "K and N must be divisible by 128, got "\
-                           f"{c.partition_weight_shape}"
+            return (
+                False,
+                f"K and N must be divisible by 128, got {c.partition_weight_shape}",
+            )
 
         if c.out_type != torch.bfloat16:
-            return False, "Only bfloat16 output type currently supported"\
-                           f"got {c.out_type=}"
+            return (
+                False,
+                f"Only bfloat16 output type currently supportedgot {c.out_type=}",
+            )
 
         return True, None
 
@@ -71,13 +71,11 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
     #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
     def process_weights_after_loading(self, layer: torch.nn.Module):
-
         # TODO(czhu): optimize speed/mem usage
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
-            x.data = ops.cutlass_encode_and_reorder_int4b(
-                x.data.t().contiguous().t())
+            x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t())
             return x
 
         def transform_w_s(x):
@@ -92,24 +90,28 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
         self._transform_param(layer, self.w_s_name, transform_w_s)
         self._transform_param(layer, "weight_chan_scale", lambda x: x)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         c = self.config
         w_q, w_s, _, _ = self._get_weight_params(layer)
         w_ch_s = layer.weight_chan_scale
 
         x_2d = x.reshape(-1, x.shape[-1])
-        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
 
         x_2d, act_scales = self.quant_fp8(x_2d)
-        output = ops.cutlass_w4a8_mm(a=x_2d,
-                                     b_q=w_q,
-                                     b_group_scales=w_s,
-                                     b_group_size=c.group_size,
-                                     a_token_scales=act_scales,
-                                     b_channel_scales=w_ch_s)
+        output = ops.cutlass_w4a8_mm(
+            a=x_2d,
+            b_q=w_q,
+            b_group_scales=w_s,
+            b_group_size=c.group_size,
+            a_token_scales=act_scales,
+            b_channel_scales=w_ch_s,
+        )
 
         if bias is not None:
             output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
index 7bd326f47f9e4f7b0ff91c18a6bc506e7998a083..d09bd86a7274ad1372bf849c1c00303aa3452733 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -20,37 +19,45 @@ class Dynamic4bitLinearKernel(MPLinearKernel):
         return 1
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cpu():
             return False, "Only CPU is supported"
         if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
             return False, f"Unsupported quant type {c.weight_type}"
-        if current_platform.get_cpu_architecture(
-        ) == CpuArchEnum.ARM and c.act_type not in [
+        if (
+            current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+            and c.act_type
+            not in [
                 torch.float32,
-        ]:
-            return False, "Dynamic4bitLinearKernel on Arm requires"\
-                " Float32 activations"
+            ]
+        ):
+            return False, "Dynamic4bitLinearKernel on Arm requires Float32 activations"
         if c.full_weight_shape[0] % c.group_size != 0:
-            return False, f"Group size ({c.group_size}) does not evenly divide"\
-                " the number of input features "\
-                f"({c.full_weight_shape[0]})"
+            return (
+                False,
+                f"Group size ({c.group_size}) does not evenly divide"
+                " the number of input features "
+                f"({c.full_weight_shape[0]})",
+            )
         if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
             try:
                 # Attempt to retrieve the operation
                 _ = torch.ops.aten._dyn_quant_matmul_4bit
             except AttributeError:
-                return False, f"PyTorch {torch.__version__} does not support"\
-                    " _dyn_quant_matmul_4bit. Install a newer version"
+                return (
+                    False,
+                    f"PyTorch {torch.__version__} does not support"
+                    " _dyn_quant_matmul_4bit. Install a newer version",
+                )
         return True, None
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         c = self.config
         packed_weight = getattr(layer, self.w_q_name)
         packed_weight = packed_weight.add(8)
-        uint8_packed = (packed_weight[::, 1::2] << 4
-                        | packed_weight[::, ::2]).to(torch.uint8)
+        uint8_packed = (packed_weight[::, 1::2] << 4 | packed_weight[::, ::2]).to(
+            torch.uint8
+        )
 
         scales = getattr(layer, self.w_s_name)
         block_size = c.group_size
@@ -71,22 +78,34 @@ class Dynamic4bitLinearKernel(MPLinearKernel):
 
         # Repack weights as per kernel requirement
         w = torch.ops.aten._dyn_quant_pack_4bit_weight(
-            uint8_packed, scales, layer.bias, block_size,
-            c.partition_weight_shape[0], c.partition_weight_shape[1])
-        replace_parameter(layer, self.w_q_name,
-                          torch.nn.Parameter(w, requires_grad=False))
+            uint8_packed,
+            scales,
+            layer.bias,
+            block_size,
+            c.partition_weight_shape[0],
+            c.partition_weight_shape[1],
+        )
+        replace_parameter(
+            layer, self.w_q_name, torch.nn.Parameter(w, requires_grad=False)
+        )
         setattr(layer, self.w_s_name, None)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         c = self.config
         x_2d = x.reshape(-1, x.shape[-1])
-        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
 
         w_q = getattr(layer, self.w_q_name)
         output = torch.ops.aten._dyn_quant_matmul_4bit(
-            x_2d, w_q, c.group_size, c.partition_weight_shape[0],
-            c.partition_weight_shape[1])
+            x_2d,
+            w_q,
+            c.group_size,
+            c.partition_weight_shape[0],
+            c.partition_weight_shape[1],
+        )
         return output.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
index fef333e862d5a85d095b3e8a6f2683dadca0f820..9fba4aafb05a7c21f8277ecb73edd728cb0b852d 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -1,15 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    pack_quantized_values_into_int32)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           permute_param_layout_)
+    pack_quantized_values_into_int32,
+)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 from vllm.scalar_type import scalar_types
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
@@ -25,31 +24,41 @@ class ExllamaLinearKernel(MPLinearKernel):
         return 60
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
-        if c.has_g_idx and\
-            c.partition_weight_shape[0] != c.full_weight_shape[0]:
-            return False, "Act reordering currently not supported by Exllama, "\
-                          "when the input features are partitioned across "\
-                          "devices"
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        if c.has_g_idx and c.partition_weight_shape[0] != c.full_weight_shape[0]:
+            return (
+                False,
+                "Act reordering currently not supported by Exllama, "
+                "when the input features are partitioned across "
+                "devices",
+            )
 
         if c.partition_weight_shape[1] % (32 // c.weight_type.size_bits) != 0:
-            return False, "Output features must be a multiple of the pack " \
-                            "factor (32 / num_bits) so that we can correctly " \
-                            "pack the zero points"
+            return (
+                False,
+                "Output features must be a multiple of the pack "
+                "factor (32 / num_bits) so that we can correctly "
+                "pack the zero points",
+            )
 
         if c.act_type != torch.float16:
             return False, "Exllama only supports float16 activations"
 
         if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
-            return False, f"Quant type ({c.weight_type}) not supported by "\
-                           "Exllama, supported types are: "\
-                           f"{cls.SUPPORTED_QUANT_TYPES}"
+            return (
+                False,
+                f"Quant type ({c.weight_type}) not supported by "
+                "Exllama, supported types are: "
+                f"{cls.SUPPORTED_QUANT_TYPES}",
+            )
 
         if c.full_weight_shape[0] % c.group_size != 0:
-            return False, f"Group size ({c.group_size}) does not evenly divide"\
-                           " the number of input features "\
-                           f"({c.full_weight_shape[0]})"
+            return (
+                False,
+                f"Group size ({c.group_size}) does not evenly divide"
+                " the number of input features "
+                f"({c.full_weight_shape[0]})",
+            )
 
         return True, None
 
@@ -70,21 +79,23 @@ class ExllamaLinearKernel(MPLinearKernel):
                 # exllama kernel adding 1 to the zero points during inference)
                 # Documentation of the bug can be found here:
                 #  https://garden.danieldk.eu/GPTQ-Checkpoint-Format
-                zeros = torch.full((groups, out_features),
-                                   c.weight_type.bias - 1,
-                                   dtype=torch.int32,
-                                   device=device)
+                zeros = torch.full(
+                    (groups, out_features),
+                    c.weight_type.bias - 1,
+                    dtype=torch.int32,
+                    device=device,
+                )
             else:
                 raise NotImplementedError(
                     "A 0 zero-point is not supported by Exllama due to "
                     "a bug in the original GPTQ checkpoint format leading to "
                     "exllama kernel adding 1 to the zero points during "
-                    "inference")
-            zeros = pack_quantized_values_into_int32(zeros,
-                                                     c.weight_type,
-                                                     packed_dim=1)
-            setattr(layer, self.w_zp_name,
-                    torch.nn.Parameter(zeros, requires_grad=False))
+                    "inference"
+                )
+            zeros = pack_quantized_values_into_int32(zeros, c.weight_type, packed_dim=1)
+            setattr(
+                layer, self.w_zp_name, torch.nn.Parameter(zeros, requires_grad=False)
+            )
 
         if c.has_g_idx:
 
@@ -96,10 +107,9 @@ class ExllamaLinearKernel(MPLinearKernel):
             self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
         else:
             self.w_gidx_name = "g_idx"
-            empty_g_idx = torch.nn.Parameter(torch.empty((0, ),
-                                                         dtype=torch.int,
-                                                         device=device),
-                                             requires_grad=False)
+            empty_g_idx = torch.nn.Parameter(
+                torch.empty((0,), dtype=torch.int, device=device), requires_grad=False
+            )
             setattr(layer, self.w_gidx_name, empty_g_idx)
 
         def transform_w_q(x):
@@ -122,21 +132,29 @@ class ExllamaLinearKernel(MPLinearKernel):
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         c = self.config
 
         x_2d = x.reshape(-1, x.shape[-1])
-        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
 
         w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
 
+        # gptq_gemm supports GPTQv2 format by passing use_v2_format=True.
+        # However, the MPLinearLayerConfig doesn't contain format info.
+        # So hardcode GPTQv1 format here, to keep its behavior unchanged.
+        use_v2_format = False
+
         assert w_zp is not None, "Zero points are required by Exllama"
         assert w_g_idx is not None, "Group index is required by Exllama"
-        output = ops.gptq_gemm(x_2d, w_q, w_zp, w_s, w_g_idx, True,
-                               c.weight_type.size_bits)
+        output = ops.gptq_gemm(
+            x_2d, w_q, w_zp, w_s, w_g_idx, True, use_v2_format, c.weight_type.size_bits
+        )
 
         if bias is not None:
             output.add_(bias)
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
index da951ddab2e4e4bb2d0e63388df11e059443e2dc..7953ed5e8ee475a9e2468edf2a6b2a96a73bd9b1 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
@@ -2,32 +2,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.machete_utils import (
-    check_machete_supports_shape, query_machete_supported_group_sizes,
-    query_machete_supported_quant_types)
+    check_machete_supports_shape,
+    query_machete_supported_group_sizes,
+    query_machete_supported_quant_types,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    pack_quantized_values_into_int32, unpack_quantized_values_into_int32)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           permute_param_layout_)
+    pack_quantized_values_into_int32,
+    unpack_quantized_values_into_int32,
+)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 from vllm.platforms import current_platform
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 
 class MacheteLinearKernel(MPLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 90
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         # Machete uses CUTLASS, so it can only be compatible with Nvidia
         if not current_platform.is_cuda():
             return False, "Machete only supported on CUDA"
@@ -35,25 +35,33 @@ class MacheteLinearKernel(MPLinearKernel):
         if not current_platform.is_device_capability(90):
             return False, "Machete requires compute capability of 90 (Hopper)"
 
-        if c.has_g_idx and\
-            c.partition_weight_shape[0] != c.full_weight_shape[0]:
-            return False, "Act reordering currently not supported by Machete, "\
-                          "when the input features are partitioned across "\
-                          "devices"
-
-        if c.weight_type not in query_machete_supported_quant_types(
-                c.zero_points):
-            return False, f"Quant type ({c.weight_type}) not supported by "\
-                           "Machete, supported types are: "\
-                           f"{query_machete_supported_quant_types(c.zero_points)}"
+        if c.has_g_idx and c.partition_weight_shape[0] != c.full_weight_shape[0]:
+            return (
+                False,
+                "Act reordering currently not supported by Machete, "
+                "when the input features are partitioned across "
+                "devices",
+            )
+
+        if c.weight_type not in query_machete_supported_quant_types(c.zero_points):
+            return (
+                False,
+                f"Quant type ({c.weight_type}) not supported by "
+                "Machete, supported types are: "
+                f"{query_machete_supported_quant_types(c.zero_points)}",
+            )
 
         if c.group_size not in query_machete_supported_group_sizes(c.act_type):
-            return False, f"Group size ({c.group_size}) not supported by "\
-                            "Machete, supported group sizes are: "\
-                            f"{query_machete_supported_group_sizes(c.act_type)}"
+            return (
+                False,
+                f"Group size ({c.group_size}) not supported by "
+                "Machete, supported group sizes are: "
+                f"{query_machete_supported_group_sizes(c.act_type)}",
+            )
 
-        return check_machete_supports_shape(c.partition_weight_shape[0],
-                                            c.partition_weight_shape[1])
+        return check_machete_supports_shape(
+            c.partition_weight_shape[0], c.partition_weight_shape[1]
+        )
 
     # note assumes that
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
@@ -64,30 +72,33 @@ class MacheteLinearKernel(MPLinearKernel):
 
         if c.has_g_idx:
             assert self.w_gidx_name is not None
-            perm = torch.argsort(getattr(layer, self.w_gidx_name))\
-                .to(torch.int)
+            perm = torch.argsort(getattr(layer, self.w_gidx_name)).to(torch.int)
 
             self.act_perm = lambda x: x[:, perm]
             # use `ops.permute_cols` if possible
-            if c.act_type in [torch.float16, torch.bfloat16] \
-                and c.partition_weight_shape[0] % 8 == 0:
+            if (
+                c.act_type in [torch.float16, torch.bfloat16]
+                and c.partition_weight_shape[0] % 8 == 0
+            ):
                 self.act_perm = partial(ops.permute_cols, perm=perm)
 
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
             if c.has_g_idx:
-                x_unpacked = unpack_quantized_values_into_int32(x.data,
-                                                                c.weight_type,
-                                                                packed_dim=0)
+                x_unpacked = unpack_quantized_values_into_int32(
+                    x.data, c.weight_type, packed_dim=0
+                )
                 x_perm = x_unpacked[perm, :]
-                x.data = pack_quantized_values_into_int32(x_perm,
-                                                          c.weight_type,
-                                                          packed_dim=0)
-            x.data = ops.machete_prepack_B(x.data.t().contiguous().t(),
-                                           a_type=c.act_type,
-                                           b_type=c.weight_type,
-                                           group_scales_type=c.act_type)
+                x.data = pack_quantized_values_into_int32(
+                    x_perm, c.weight_type, packed_dim=0
+                )
+            x.data = ops.machete_prepack_B(
+                x.data.t().contiguous().t(),
+                a_type=c.act_type,
+                b_type=c.weight_type,
+                group_scales_type=c.act_type,
+            )
             return x
 
         def transform_w_s(x):
@@ -99,9 +110,9 @@ class MacheteLinearKernel(MPLinearKernel):
         def transform_w_zp(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=1)
-            x_unpacked = unpack_quantized_values_into_int32(x.data,
-                                                            c.weight_type,
-                                                            packed_dim=1)
+            x_unpacked = unpack_quantized_values_into_int32(
+                x.data, c.weight_type, packed_dim=1
+            )
             w_s = getattr(layer, self.w_s_name).data
             # pre-apply scales to zero-points
             x.data = (-1.0 * w_s * (x_unpacked.to(w_s.dtype))).contiguous()
@@ -113,15 +124,17 @@ class MacheteLinearKernel(MPLinearKernel):
         if c.zero_points:
             self._transform_param(layer, self.w_zp_name, transform_w_zp)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         c = self.config
         w_q, w_s, w_zp, _ = self._get_weight_params(layer)
 
         x_2d = x.reshape(-1, x.shape[-1])
-        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
 
         if c.has_g_idx:
             x_2d = self.act_perm(x_2d)
@@ -131,12 +144,14 @@ class MacheteLinearKernel(MPLinearKernel):
         else:
             w_zp = None
 
-        output = ops.machete_mm(a=x_2d,
-                                b_q=w_q,
-                                b_type=c.weight_type,
-                                b_group_zeros=w_zp,
-                                b_group_scales=w_s,
-                                b_group_size=c.group_size)
+        output = ops.machete_mm(
+            a=x_2d,
+            b_q=w_q,
+            b_type=c.weight_type,
+            b_group_zeros=w_zp,
+            b_group_scales=w_s,
+            b_group_size=c.group_size,
+        )
 
         if bias is not None:
             output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
index 5eb99383097b53a152f7fa0b87d61b83013988d0..ac21286eeffac5db05d56681f076bde1ece5369c 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -1,52 +1,63 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
-    check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
-    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
-    marlin_sort_g_idx, marlin_zero_points, query_marlin_supported_quant_types,
-    unpack_cols)
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           permute_param_layout_)
+    MARLIN_SUPPORTED_GROUP_SIZES,
+    apply_gptq_marlin_linear,
+    check_marlin_supports_shape,
+    marlin_is_k_full,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace_new,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    marlin_sort_g_idx,
+    marlin_zero_points,
+    query_marlin_supported_quant_types,
+    unpack_cols,
+)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 from vllm.platforms import current_platform
 
 from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 
 class MarlinLinearKernel(MPLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 80
 
     @classmethod
-    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         # Marlin uses inline PTX, so it can only be compatible with Nvidia
         if not current_platform.is_cuda():
             return False, "Marlin only supported on CUDA"
 
         quant_types = query_marlin_supported_quant_types(c.zero_points)
         if c.weight_type not in quant_types:
-            return False, f"Quant type ({c.weight_type}) not supported by"\
-                          f"  Marlin, supported types are: {quant_types}"
+            return (
+                False,
+                f"Quant type ({c.weight_type}) not supported by"
+                f"  Marlin, supported types are: {quant_types}",
+            )
 
         if c.group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
-            return False, f"Group size ({c.group_size}) not supported by "\
-                            "Marlin, supported group sizes are: "\
-                            f"{MARLIN_SUPPORTED_GROUP_SIZES}"
+            return (
+                False,
+                f"Group size ({c.group_size}) not supported by "
+                "Marlin, supported group sizes are: "
+                f"{MARLIN_SUPPORTED_GROUP_SIZES}",
+            )
 
         return check_marlin_supports_shape(
             c.partition_weight_shape[1],  # out_features
             c.partition_weight_shape[0],  # in_features
             c.full_weight_shape[0],  # in_features
-            c.group_size)
+            c.group_size,
+        )
 
     # note assumes that
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
@@ -55,7 +66,7 @@ class MarlinLinearKernel(MPLinearKernel):
         device = getattr(layer, self.w_q_name).device
         c = self.config
 
-        row_parallel = (c.partition_weight_shape[0] != c.full_weight_shape[0])
+        row_parallel = c.partition_weight_shape[0] != c.full_weight_shape[0]
         self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)
 
         # Allocate marlin workspace.
@@ -71,25 +82,30 @@ class MarlinLinearKernel(MPLinearKernel):
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
-            x.data = ops.gptq_marlin_repack(x.data.contiguous(),
-                                            perm=layer.g_idx_sort_indices,
-                                            size_k=c.partition_weight_shape[0],
-                                            size_n=c.partition_weight_shape[1],
-                                            num_bits=c.weight_type.size_bits)
+            x.data = ops.gptq_marlin_repack(
+                x.data.contiguous(),
+                perm=layer.g_idx_sort_indices,
+                size_k=c.partition_weight_shape[0],
+                size_n=c.partition_weight_shape[1],
+                num_bits=c.weight_type.size_bits,
+            )
             return x
 
         def transform_w_s(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1)
-            x.data = marlin_permute_scales(x.data.contiguous(),
-                                           size_k=c.partition_weight_shape[0],
-                                           size_n=c.partition_weight_shape[1],
-                                           group_size=c.group_size)
+            x.data = marlin_permute_scales(
+                x.data.contiguous(),
+                size_k=c.partition_weight_shape[0],
+                size_n=c.partition_weight_shape[1],
+                group_size=c.group_size,
+            )
             return x
 
         if c.has_g_idx:
             g_idx, g_idx_sort_indices = marlin_sort_g_idx(
-                getattr(layer, self.w_gidx_name))
+                getattr(layer, self.w_gidx_name)
+            )
             self._transform_param(layer, self.w_gidx_name, lambda _: g_idx)
             layer.g_idx_sort_indices = g_idx_sort_indices
         else:
@@ -97,16 +113,24 @@ class MarlinLinearKernel(MPLinearKernel):
             layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
 
         if c.zero_points:
-            grouped_k = (c.partition_weight_shape[0] //
-                         c.group_size if c.group_size != -1 else 1)
-            self._transform_param(layer, self.w_zp_name, lambda x: \
-                marlin_zero_points(
-                    unpack_cols(x.t(), c.weight_type.size_bits,
-                                grouped_k,
-                                c.partition_weight_shape[1]),
+            grouped_k = (
+                c.partition_weight_shape[0] // c.group_size if c.group_size != -1 else 1
+            )
+            self._transform_param(
+                layer,
+                self.w_zp_name,
+                lambda x: marlin_zero_points(
+                    unpack_cols(
+                        x.t(),
+                        c.weight_type.size_bits,
+                        grouped_k,
+                        c.partition_weight_shape[1],
+                    ),
                     size_k=grouped_k,
                     size_n=c.partition_weight_shape[1],
-                    num_bits=c.weight_type.size_bits))
+                    num_bits=c.weight_type.size_bits,
+                ),
+            )
         else:
             setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
         self._transform_param(layer, self.w_q_name, transform_w_q)
@@ -115,10 +139,12 @@ class MarlinLinearKernel(MPLinearKernel):
         if hasattr(layer, "bias") and layer.bias is not None:
             layer.bias.data = marlin_permute_bias(layer.bias)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         c = self.config
         w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)
 
@@ -136,4 +162,5 @@ class MarlinLinearKernel(MPLinearKernel):
             input_size_per_partition=c.partition_weight_shape[0],
             output_size_per_partition=c.partition_weight_shape[1],
             is_k_full=self.is_k_full,
-            bias=bias)
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
index 9ebf5f30379221fc3fbcb8c6487a3eefcc94e7a8..2a885ec8994582c512da8462501c88a19d17cd64 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -3,7 +3,6 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -16,7 +15,6 @@ class ScaledMMLinearLayerConfig:
 
 
 class ScaledMMLinearKernel(ABC):
-
     @classmethod
     @abstractmethod
     def get_min_capability(cls) -> int:
@@ -24,13 +22,18 @@ class ScaledMMLinearKernel(ABC):
 
     @classmethod
     @abstractmethod
-    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         raise NotImplementedError
 
-    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
-                 w_s_param_name: str, i_s_param_name: str,
-                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
+    def __init__(
+        self,
+        c: ScaledMMLinearLayerConfig,
+        w_q_param_name: str,
+        w_s_param_name: str,
+        i_s_param_name: str,
+        i_zp_param_name: str,
+        azp_adj_param_name: str,
+    ) -> None:
         assert self.can_implement(c)
         self.config = c
         self.w_q_name = w_q_param_name
@@ -44,20 +47,23 @@ class ScaledMMLinearKernel(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
     def _get_weight_params(
-            self, layer: torch.nn.Module) -> tuple[
-                torch.Tensor,  # weight
-                torch.Tensor,  # weight_scale
-                Optional[torch.Tensor],  # input_scale, 
-                Optional[torch.Tensor],  # input_zp
-                Optional[torch.Tensor],  # azp_adj
-            ]:
+        self, layer: torch.nn.Module
+    ) -> tuple[
+        torch.Tensor,  # weight
+        torch.Tensor,  # weight_scale
+        torch.Tensor | None,  # input_scale,
+        torch.Tensor | None,  # input_zp
+        torch.Tensor | None,  # azp_adj
+    ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index 2bc68ab3ebd186771710a9a808f166769473244a..dd59e5d935dcbdc4be10ce897a65006e8f3abc2f 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -2,20 +2,26 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional
 
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
-    AiterScaledMMLinearKernel)
+    AiterScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import (
-    CPUScaledMMLinearKernel)
+    CPUScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
-    CutlassScaledMMLinearKernel)
+    CutlassScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
-    ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
+    ScaledMMLinearKernel,
+    ScaledMMLinearLayerConfig,
+)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
-    TritonScaledMMLinearKernel)
+    TritonScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
-    XLAScaledMMLinearKernel)
+    XLAScaledMMLinearKernel,
+)
 from vllm.platforms import PlatformEnum, current_platform
 
 # in priority/performance order (when available)
@@ -28,19 +34,18 @@ _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = {
 
 
 def choose_scaled_mm_linear_kernel(
-        config: ScaledMMLinearLayerConfig,
-        compute_capability: Optional[int] = None
+    config: ScaledMMLinearLayerConfig, compute_capability: int | None = None
 ) -> type[ScaledMMLinearKernel]:
     """
-    Choose an ScaledMMLinearKernel that can implement the given config for the 
-    given compute capability. Attempts to choose the best kernel in terms of 
+    Choose an ScaledMMLinearKernel that can implement the given config for the
+    given compute capability. Attempts to choose the best kernel in terms of
     performance.
 
     Args:
-        config (ScaledMMLinearLayerConfig): Description of the linear layer 
+        config (ScaledMMLinearLayerConfig): Description of the linear layer
             to be implemented.
         compute_capability (Optional[int], optional): The compute capability of
-            the target device, if None uses `current_platform` to get the 
+            the target device, if None uses `current_platform` to get the
             compute capability. Defaults to None.
 
     Raises:
@@ -57,22 +62,25 @@ def choose_scaled_mm_linear_kernel(
 
     failure_reasons = []
     for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
-        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "")\
-            .split(","):
+        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "").split(","):
             failure_reasons.append(
-                f' {kernel.__name__} disabled by environment variable')
+                f" {kernel.__name__} disabled by environment variable"
+            )
             continue
 
         # If the current platform uses compute_capability,
         # make sure the kernel supports the compute cability.
         if compute_capability is not None:
             kernel_min_capability = kernel.get_min_capability()
-            if (kernel_min_capability is not None
-                    and kernel_min_capability > compute_capability):
+            if (
+                kernel_min_capability is not None
+                and kernel_min_capability > compute_capability
+            ):
                 failure_reasons.append(
                     f"{kernel.__name__} requires capability "
                     f"{kernel_min_capability}, current compute capability "
-                    f"is {compute_capability}")
+                    f"is {compute_capability}"
+                )
                 continue
 
         can_implement, failure_reason = kernel.can_implement(config)
@@ -80,10 +88,10 @@ def choose_scaled_mm_linear_kernel(
             return kernel
         else:
             failure_reasons.append(
-                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+                f" {kernel.__name__} cannot implement due to: {failure_reason}"
             )
 
     raise ValueError(
-        "Failed to find a kernel that can implement the "\
-        "ScaledMM linear layer. Reasons: \n"
-        + '\n'.join(failure_reasons))
+        "Failed to find a kernel that can implement the "
+        "ScaledMM linear layer. Reasons: \n" + "\n".join(failure_reasons)
+    )
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index e8e950a4bb7b6abb034d3a742850b6cb02a0ee9e..038a92c516cecbfc8aa1658888854b35312cd729 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -1,105 +1,65 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
-import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
 
 from .cutlass import CutlassScaledMMLinearKernel
 from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
 
 
-def rocm_aiter_gemm_w8a8_impl(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-
-    from aiter import gemm_a8w8_CK
-
-    # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
-    # a to be [M, K]
-    # b to be [N, K]
-    # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
-    return gemm_a8w8_CK(A, B, As, Bs, bias, output_dtype)
-
-
-def rocm_aiter_gemm_w8a8_fake(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-
-    m = A.shape[0]
-    n = B.shape[0]
-    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
-    return Y
-
-
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_gemm_w8a8",
-        op_func=rocm_aiter_gemm_w8a8_impl,
-        fake_impl=rocm_aiter_gemm_w8a8_fake,
-    )
-
-
 class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 90
 
     @classmethod
-    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_rocm():
             return (
                 False,
-                "AiterScaledMMLinearKernel requires `aiter` which is not " +
-                "currently supported on non-ROCm platform.")
+                "AiterScaledMMLinearKernel requires `aiter` which is not "
+                + "currently supported on non-ROCm platform.",
+            )
 
         try:
             import aiter  # noqa: F401 # deliberately attempt to import aiter
         except Exception:
             return (
                 False,
-                "AiterScaledMMLinearKernel requires `aiter` which is not " +
-                "installed on ROCm.")
+                "AiterScaledMMLinearKernel requires `aiter` which is not "
+                + "installed on ROCm.",
+            )
         # Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled
-        if not (
-            envs.VLLM_ROCM_USE_AITER_LINEAR \
-            and envs.VLLM_ROCM_USE_AITER
-        ):
-            return (False, "AiterScaledMMLinearKernel is disabled. " +
-                    "Enable by setting `VLLM_ROCM_USE_AITER=1` " +
-                    "and `VLLM_ROCM_USE_AITER_LINEAR=1`. " +
-                    "`VLLM_ROCM_USE_AITER_LINEAR` default is True.")
+        if not (rocm_aiter_ops.is_linear_enabled()):
+            return (
+                False,
+                "AiterScaledMMLinearKernel is disabled. "
+                + "Enable by setting `VLLM_ROCM_USE_AITER=1` "
+                + "and `VLLM_ROCM_USE_AITER_LINEAR=1`. "
+                + "`VLLM_ROCM_USE_AITER_LINEAR` default is True.",
+            )
 
         if not c.input_symmetric:
-            return (False,
-                    "AiterScaledMMLinearKernel only supports symmetric " +
-                    "quantization.")
+            return (
+                False,
+                "AiterScaledMMLinearKernel only supports symmetric " + "quantization.",
+            )
         return True, None
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         super().process_weights_after_loading(layer)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         """
         `AiterScaledMMLinearKernel` implements a fused version of
             `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
@@ -116,29 +76,27 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         # * dynamic, i_s is None and x_s computed from x.
         # * static, i_s is scalar and x_s is i_s.
         symmetric = azp_adj is None
-        assert symmetric, ("AiterScaledMMLinearKernel only supports"
-                           " symmetric quantization.")
-        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
-                                               i_s,
-                                               i_zp,
-                                               symmetric=symmetric)
-
-        assert x_zp is None, ("AiterScaledMMLinearKernel only supports"
-                              " symmetric quantization.")
+        assert symmetric, (
+            "AiterScaledMMLinearKernel only supports symmetric quantization."
+        )
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x, i_s, i_zp, symmetric=symmetric)
+
+        assert x_zp is None, (
+            "AiterScaledMMLinearKernel only supports symmetric quantization."
+        )
         out_dtype = x.dtype
 
-        assert (w_q.shape[0] % 16 == 0 and w_q.shape[1] % 16 == 0)
-        assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
-        assert bias is None or bias.shape[0] == w_q.shape[
-            1] and bias.dtype == out_dtype
+        assert w_q.shape[0] % 16 == 0 and w_q.shape[1] % 16 == 0
+        assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
+        assert bias is None or bias.shape[0] == w_q.shape[1] and bias.dtype == out_dtype
 
         m = x_q.shape[0]  # a
         n = w_q.shape[1]  # b
 
-        per_tensor_scale_a = (x_s.numel() == 1)
-        per_tensor_scale_b = (w_s.numel() == 1)
-        per_token_scale_a = (x_s.numel() == m)
-        per_channel_scale_b = (w_s.numel() == n)
+        per_tensor_scale_a = x_s.numel() == 1
+        per_tensor_scale_b = w_s.numel() == 1
+        per_token_scale_a = x_s.numel() == m
+        per_channel_scale_b = w_s.numel() == n
 
         # @TODO:
         # Maybe broadcast the per-tensor-scale into per-channel-scale
@@ -146,16 +104,17 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         # For now, it only supports:
         # - per-tensor-per-tensor a8w8 scaled GEMM, and
         # - per-token-per-channel a8w8 scaled GEMM
-        assert ((per_tensor_scale_a and per_tensor_scale_b)
-                or (per_token_scale_a and per_channel_scale_b)), (
-                    "Currently only support per-tensor-per-tensor GEMM " +
-                    " and per-token-per-channel GEMM through AITER"
-                    " w8a8 scaled gemm. `AiterScaledMMLinearKernel` " +
-                    "does not support AITER block scaled GEMM.")
+        assert (per_tensor_scale_a and per_tensor_scale_b) or (
+            per_token_scale_a and per_channel_scale_b
+        ), (
+            "Currently only support per-tensor-per-tensor GEMM "
+            + " and per-token-per-channel GEMM through AITER"
+            " w8a8 scaled gemm. `AiterScaledMMLinearKernel` "
+            + "does not support AITER block scaled GEMM."
+        )
 
         # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
         # a to be [M, K]
         # b to be [N, K]
         # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
-        return torch.ops.vllm.rocm_aiter_gemm_w8a8(x_q, w_q.t(), x_s, w_s,
-                                                   bias, out_dtype)
+        return rocm_aiter_ops.gemm_a8w8(x_q, w_q.t(), x_s, w_s, bias, out_dtype)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
index 59d2b5bce962ecb018313270bd8a72fd5390e5fe..feb1e0bee1aaf84b35aed23a812270ba69fd0e50 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -9,24 +8,22 @@ from vllm import _custom_ops as ops
 from vllm import envs
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    convert_to_channelwise)
+    convert_to_channelwise,
+)
 from vllm.model_executor.layers.utils import check_cpu_sgl_kernel
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 
-from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
-                                   ScaledMMLinearLayerConfig)
+from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig
 
 
 class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 75
 
     @classmethod
-    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cpu():
             return False, "CPUScaledMM requires running on CPU."
 
@@ -36,9 +33,12 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         weight = getattr(layer, self.w_q_name)
         dtype = weight.dtype
         N, K = weight.size()
-        if (current_platform.get_cpu_architecture() == CpuArchEnum.X86
-                and envs.VLLM_CPU_SGL_KERNEL and self.config.input_symmetric
-                and check_cpu_sgl_kernel(N, K, dtype)):
+        if (
+            current_platform.get_cpu_architecture() == CpuArchEnum.X86
+            and envs.VLLM_CPU_SGL_KERNEL
+            and self.config.input_symmetric
+            and check_cpu_sgl_kernel(N, K, dtype)
+        ):
             self.linear_method = self._apply_weights_sgl
             self.process_weights_for_sgl(layer)
         else:
@@ -50,8 +50,10 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         # Transpose to [K, N] for convenience
         weight = getattr(layer, self.w_q_name)
         replace_parameter(
-            layer, self.w_q_name,
-            torch.nn.Parameter(weight.t().data, requires_grad=False))
+            layer,
+            self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False),
+        )
 
         # WEIGHT SCALE
         # oneDNN kernels support only per-tensor and per-channel.
@@ -60,11 +62,12 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         is_fused_module = len(layer.logical_widths) > 1
         weight_scale = getattr(layer, self.w_s_name)
         if is_fused_module and not self.config.is_channelwise:
-            weight_scale = convert_to_channelwise(weight_scale,
-                                                  layer.logical_widths)
+            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)
         replace_parameter(
-            layer, self.w_s_name,
-            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+            layer,
+            self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False),
+        )
 
         # INPUT SCALE
         if self.config.is_static_input_scheme:
@@ -72,8 +75,10 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
 
             if self.config.input_symmetric:
                 replace_parameter(
-                    layer, self.i_s_name,
-                    torch.nn.Parameter(input_scale.max(), requires_grad=False))
+                    layer,
+                    self.i_s_name,
+                    torch.nn.Parameter(input_scale.max(), requires_grad=False),
+                )
                 setattr(layer, self.i_zp_name, None)
             else:
                 input_zero_point = getattr(layer, self.i_zp_name)
@@ -84,16 +89,17 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
                 range_max = (input_scale * (int8_traits.max - azps)).max()
                 range_min = (input_scale * (int8_traits.min - azps)).min()
 
-                scale = (range_max - range_min) / (int8_traits.max -
-                                                   int8_traits.min)
+                scale = (range_max - range_min) / (int8_traits.max - int8_traits.min)
                 replace_parameter(
-                    layer, self.i_s_name,
-                    torch.nn.Parameter(scale, requires_grad=False))
+                    layer, self.i_s_name, torch.nn.Parameter(scale, requires_grad=False)
+                )
 
-                azp = (int8_traits.min -
-                       range_min / scale).round().to(dtype=torch.int32)
-                replace_parameter(layer, self.i_zp_name,
-                                  torch.nn.Parameter(azp, requires_grad=False))
+                azp = (
+                    (int8_traits.min - range_min / scale).round().to(dtype=torch.int32)
+                )
+                replace_parameter(
+                    layer, self.i_zp_name, torch.nn.Parameter(azp, requires_grad=False)
+                )
 
         else:
             setattr(layer, self.i_s_name, None)
@@ -105,14 +111,16 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         # s_a * s_b * [(A - zp_a)B] + bias =
         # s_a * (s_b * AB) - s_a * s_b * zp_a * B + bias =
         # s_a * GEMM_output - s_a * zp_a * adj + bias
-        if not (self.config.input_symmetric
-                and self.config.is_static_input_scheme):
+        if not (self.config.input_symmetric and self.config.is_static_input_scheme):
             weight = getattr(layer, self.w_q_name)
             weight_scale = getattr(layer, self.w_s_name)
             azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.float32)
             azp_adj = azp_adj * weight_scale.squeeze()
-            setattr(layer, self.azp_adj_name,
-                    torch.nn.Parameter(azp_adj, requires_grad=False))
+            setattr(
+                layer,
+                self.azp_adj_name,
+                torch.nn.Parameter(azp_adj, requires_grad=False),
+            )
         else:
             setattr(layer, self.azp_adj_name, None)
 
@@ -135,34 +143,37 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         weight = getattr(layer, self.w_q_name)
         packed_weight = torch.ops._C.convert_weight_packed(weight)
         replace_parameter(
-            layer, self.w_q_name,
-            torch.nn.Parameter(packed_weight, requires_grad=False))
+            layer, self.w_q_name, torch.nn.Parameter(packed_weight, requires_grad=False)
+        )
 
         if layer.bias is not None:
             bias = layer.bias
             layer.register_parameter(
-                "bias_fp32",
-                torch.nn.Parameter(bias.float().data, requires_grad=False))
+                "bias_fp32", torch.nn.Parameter(bias.float().data, requires_grad=False)
+            )
 
         # WEIGHT SCALE
         # CPU SGL kernels only support per-channel.
         # For per-tensor quant, convert to the per-channel case.
         weight_scale = getattr(layer, self.w_s_name)
         if not self.config.is_channelwise:
-            weight_scale = convert_to_channelwise(weight_scale,
-                                                  layer.logical_widths)
+            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)
         replace_parameter(
-            layer, self.w_s_name,
-            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+            layer,
+            self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False),
+        )
 
         setattr(layer, self.i_s_name, None)
         setattr(layer, self.i_zp_name, None)
         setattr(layer, self.azp_adj_name, None)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         return self.linear_method(
             layer,
             x,
@@ -170,31 +181,33 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         )
 
     def _apply_weights_onednn(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
 
         # ops.scaled_int8_quant supports both dynamic and static quant:
         # * dynamic, i_s is None and x_s computed from x.
         # * static, i_s is scalar and x_s is i_s.
         x_q, x_s, x_zp = ops.onednn_scaled_int8_quant(
-            x, i_s, i_zp, self.config.input_symmetric)
+            x, i_s, i_zp, self.config.input_symmetric
+        )
 
         m = x.size(0)
         n = self.dnnl_handler.n
         out = torch.empty((m, n), dtype=x.dtype)
-        ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj,
-                             bias)
+        ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj, bias)
 
         return out
 
     def _apply_weights_sgl(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         w_q, w_s, _, _, _ = self._get_weight_params(layer)
         return torch.ops._C.int8_scaled_mm_with_quant(
             x,
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 2f982f96b0d049445e3f393ff4bbbdf088b5b5ab..e8769916b4cefed828999431cdd33f43aa493c38 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -1,30 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    convert_to_channelwise)
+    convert_to_channelwise,
+)
 from vllm.platforms import current_platform
 
-from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
-                                   ScaledMMLinearLayerConfig)
+from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig
 
 
 class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 75
 
     @classmethod
-    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
-
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cuda():
             return False, "CutlassScaledMM requires running on CUDA."
 
@@ -35,8 +31,10 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
         # Cutlass kernels need transposed weight.
         weight = getattr(layer, self.w_q_name)
         replace_parameter(
-            layer, self.w_q_name,
-            torch.nn.Parameter(weight.t().data, requires_grad=False))
+            layer,
+            self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False),
+        )
 
         # WEIGHT SCALE
         # Cutlass kernels support only per-tensor and per-channel.
@@ -45,11 +43,12 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
         is_fused_module = len(layer.logical_widths) > 1
         weight_scale = getattr(layer, self.w_s_name)
         if is_fused_module and not self.config.is_channelwise:
-            weight_scale = convert_to_channelwise(weight_scale,
-                                                  layer.logical_widths)
+            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)
         replace_parameter(
-            layer, self.w_s_name,
-            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+            layer,
+            self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False),
+        )
 
         # INPUT SCALE
         if self.config.is_static_input_scheme:
@@ -57,8 +56,10 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
 
             if self.config.input_symmetric:
                 replace_parameter(
-                    layer, self.i_s_name,
-                    torch.nn.Parameter(input_scale.max(), requires_grad=False))
+                    layer,
+                    self.i_s_name,
+                    torch.nn.Parameter(input_scale.max(), requires_grad=False),
+                )
                 setattr(layer, self.i_zp_name, None)
             else:
                 input_zero_point = getattr(layer, self.i_zp_name)
@@ -69,17 +70,16 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
                 range_max = (input_scale * (int8_traits.max - azps)).max()
                 range_min = (input_scale * (int8_traits.min - azps)).min()
 
-                scale = (range_max - range_min) / (int8_traits.max -
-                                                   int8_traits.min)
+                scale = (range_max - range_min) / (int8_traits.max - int8_traits.min)
                 replace_parameter(
-                    layer, self.i_s_name,
-                    torch.nn.Parameter(scale, requires_grad=False))
+                    layer, self.i_s_name, torch.nn.Parameter(scale, requires_grad=False)
+                )
 
                 # AZP loaded as int8 but used as int32
-                azp = (int8_traits.min -
-                       range_min / scale).to(dtype=torch.int32)
-                replace_parameter(layer, self.i_zp_name,
-                                  torch.nn.Parameter(azp, requires_grad=False))
+                azp = (int8_traits.min - range_min / scale).to(dtype=torch.int32)
+                replace_parameter(
+                    layer, self.i_zp_name, torch.nn.Parameter(azp, requires_grad=False)
+                )
 
         else:
             setattr(layer, self.i_s_name, None)
@@ -88,8 +88,8 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
         # azp_adj is the AZP adjustment term, used to account for weights.
         # It does not depend on scales or azp, so it is the same for
         # static and dynamic quantization.
-        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
-        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+        # For more details, see csrc/quantization/w8a8/cutlass/Epilogues.md
+        # https://github.com/vllm-project/vllm/blob/main/csrc/quantization/w8a8/cutlass/Epilogues.md
         if not self.config.input_symmetric:
             weight = getattr(layer, self.w_q_name)
             azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
@@ -97,41 +97,44 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
                 # cutlass_w8a8 requires azp to be folded into azp_adj
                 # in the per-tensor case
                 azp_adj = getattr(layer, self.i_zp_name) * azp_adj
-            setattr(layer, self.azp_adj_name,
-                    torch.nn.Parameter(azp_adj, requires_grad=False))
+            setattr(
+                layer,
+                self.azp_adj_name,
+                torch.nn.Parameter(azp_adj, requires_grad=False),
+            )
         else:
             setattr(layer, self.azp_adj_name, None)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
 
         # ops.scaled_int8_quant supports both dynamic and static quant:
         # * dynamic, i_s is None and x_s computed from x.
         # * static, i_s is scalar and x_s is i_s.
         symmetric = azp_adj is None
-        x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(),
-                                               i_s,
-                                               i_zp,
-                                               symmetric=symmetric)
+        x_q, x_s, x_zp = ops.scaled_int8_quant(
+            x.contiguous(), i_s, i_zp, symmetric=symmetric
+        )
 
         if x_zp is not None:
             # Currently, static is always per-tensor and dynamic is per-token
             static = i_zp is not None
             azp = None if static else x_zp
-            return ops.cutlass_scaled_mm_azp(x_q,
-                                             w_q,
-                                             scale_a=x_s,
-                                             scale_b=w_s,
-                                             out_dtype=x.dtype,
-                                             azp_adj=azp_adj,
-                                             azp=azp,
-                                             bias=bias)
-        return ops.cutlass_scaled_mm(x_q,
-                                     w_q,
-                                     scale_a=x_s,
-                                     scale_b=w_s,
-                                     out_dtype=x.dtype,
-                                     bias=bias)
+            return ops.cutlass_scaled_mm_azp(
+                x_q,
+                w_q,
+                scale_a=x_s,
+                scale_b=w_s,
+                out_dtype=x.dtype,
+                azp_adj=azp_adj,
+                azp=azp,
+                bias=bias,
+            )
+        return ops.cutlass_scaled_mm(
+            x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias
+        )
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
index 817565cf2827753d067d242967c84de5649e9101..3f4ec7f2a738b62006c42d071df1124f00745ce8 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -12,30 +11,32 @@ from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
 
 
 class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 75
 
     @classmethod
-    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if current_platform.is_cpu():
             return (
                 False,
-                "TritonScaledMMLinearKernel requires Triton which is not " +
-                "currently supported on CPU.")
+                "TritonScaledMMLinearKernel requires Triton which is not "
+                + "currently supported on CPU.",
+            )
         if not c.input_symmetric:
-            return (False,
-                    "TritonScaledMMLinearKernel only supports symmetric " +
-                    "quantization.")
+            return (
+                False,
+                "TritonScaledMMLinearKernel only supports symmetric " + "quantization.",
+            )
         return True, None
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         super().process_weights_after_loading(layer)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         return super().apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
index 0b931b2d8b8156be004b4db0896438bc5416ced0..ddac9f13cf4f30eb87bbdd630398f5168829af7e 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -2,32 +2,29 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from typing import Optional
 
 import torch
 from functorch.experimental.control_flow import cond  # noqa: F401
 
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    convert_to_channelwise)
+    convert_to_channelwise,
+)
 from vllm.platforms import current_platform
 
-from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
-                                   ScaledMMLinearLayerConfig)
+from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig
 
 
 class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
-
     @classmethod
     def get_min_capability(cls) -> int:
         raise NotImplementedError(
             "TPU platform does have a concept of compute capability, "
-            "this method should not be called.")
+            "this method should not be called."
+        )
 
     @classmethod
-    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
-
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_tpu():
             return False, "ScaledMMXLA requires running on TPU."
 
@@ -46,8 +43,9 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
         # WEIGHT
         # [out, in] (different than cutlass_scaled_mm)
         weight = getattr(layer, self.w_q_name)
-        replace_parameter(layer, self.w_q_name,
-                          torch.nn.Parameter(weight.data, requires_grad=False))
+        replace_parameter(
+            layer, self.w_q_name, torch.nn.Parameter(weight.data, requires_grad=False)
+        )
 
         # WEIGHT SCALE
         # XLA kernels support only per-tensor and per-channel.
@@ -56,14 +54,15 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
         is_fused_module = len(layer.logical_widths) > 1
         weight_scale = getattr(layer, self.w_s_name)
         if is_fused_module and not self.config.is_channelwise:
-            weight_scale = convert_to_channelwise(weight_scale,
-                                                  layer.logical_widths)
+            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)
 
         # [out_channel,] (different than cutlass_scaled_mm)
         weight_scale = weight_scale.squeeze(-1)
         replace_parameter(
-            layer, self.w_s_name,
-            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+            layer,
+            self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False),
+        )
 
         # Only support symmetric dynamic activation quantization.
         setattr(layer, self.i_s_name, None)
@@ -74,24 +73,26 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
         # to specialize the graph since bias is not dynamic.
         warnings.filterwarnings(
             "ignore",
-            message=
-            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
+            message="Pred is a Python constant. When used with torch.cond, it specializes on one of the branches.",  # noqa: E501
         )
 
-    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+    def no_add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
         return x
 
-    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+    def add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
         return x + bias
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         w_q, w_s, _, _, _ = self._get_weight_params(layer)
 
         # Required to register custom ops.
         import torch_xla.experimental.custom_kernel  # noqa: F401
+
         out = torch.ops.xla.quantized_matmul_int8(
             x,
             w_q,
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index 275a1c43fdd2bd17d89eb6d5ff048ffeeca8be45..78456dcf1ca56f9a30595bd8e6e82b9ed722a3ca 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -5,7 +5,9 @@ import torch
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -14,12 +16,12 @@ logger = init_logger(__name__)
 class BaseKVCacheMethod(QuantizeMethodBase):
     """
     Quant method that adds `_k_scale` and `_v_scale` attributes to the
-    Attention layer to support loading those scaling factors from checkpoints. 
+    Attention layer to support loading those scaling factors from checkpoints.
     The k/v_scale will be used to:
         - quantize k/v_cache entries before saving them to the cache
         - dequantize k/v_cache entries before fetching them from the cache
 
-    :param quant_config: the appropriate QuantizationConfig 
+    :param quant_config: the appropriate QuantizationConfig
     """
 
     def __init__(self, quant_config: QuantizationConfig):
@@ -33,19 +35,14 @@ class BaseKVCacheMethod(QuantizeMethodBase):
         # Initialize the Q and KV cache scales to -1.0, an invalid value.
         # If the q and k/v_scales appear in the checkpoint, it will be
         # overwritten when loading weights.
-        layer.q_scale = torch.nn.Parameter(torch.tensor(-1.0),
-                                           requires_grad=False)
-        layer.k_scale = torch.nn.Parameter(torch.tensor(-1.0),
-                                           requires_grad=False)
-        layer.v_scale = torch.nn.Parameter(torch.tensor(-1.0),
-                                           requires_grad=False)
+        layer.q_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False)
+        layer.k_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False)
+        layer.v_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False)
         # Initialize P = softmax(QK^T) scales
-        layer.prob_scale = torch.nn.Parameter(torch.tensor(-1.0),
-                                              requires_grad=False)
+        layer.prob_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False)
 
     def apply(self, layer: torch.nn.Module) -> torch.Tensor:
-        raise RuntimeError(
-            f"{self.__class__.__name__}.apply should not be called.")
+        raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
@@ -77,16 +74,17 @@ class BaseKVCacheMethod(QuantizeMethodBase):
                     k_scale *= 2
                     v_scale *= 2
 
-            if not isinstance(k_scale, float) or not isinstance(
-                    v_scale, float):
-                raise ValueError("Only support per-tensor scaling factor "
-                                 "for fp8 KV cache")
+            if not isinstance(k_scale, float) or not isinstance(v_scale, float):
+                raise ValueError(
+                    "Only support per-tensor scaling factor for fp8 KV cache"
+                )
 
             if layer.q_scale < 0.0:
                 logger.warning_once(
                     "Checkpoint does not provide a q scaling factor. "
                     "Setting it to k_scale. This only matters for "
-                    "the flash-attn backend.")
+                    "FP8 Attention backends (flash-attn or flashinfer)."
+                )
                 layer._q_scale.copy_(k_scale)
                 layer._q_scale_float = k_scale
 
@@ -95,12 +93,12 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             layer._v_scale.copy_(v_scale)
             layer._k_scale_float = k_scale
             layer._v_scale_float = v_scale
-            if (k_scale == 1.0 and v_scale == 1.0
-                    and "e5m2" not in layer.kv_cache_dtype):
+            if k_scale == 1.0 and v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype:
                 logger.warning_once(
-                    "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
-                    "may cause accuracy issues. Please make sure k/v_scale "
-                    "scaling factors are available in the fp8 checkpoint.")
+                    "Using KV cache scaling factor 1.0 for fp8_e4m3. "
+                    "If this is unintended, verify that k/v_scale "
+                    "scaling factors are properly set in the checkpoint."
+                )
 
         if layer.q_scale > 0.0:
             q_scale = layer.q_scale
@@ -116,26 +114,31 @@ class BaseKVCacheMethod(QuantizeMethodBase):
         else:
             prob_scale = 1.0
 
-        is_singleton_float = lambda x: isinstance(x, float) or isinstance(
-            x, torch.Tensor) and x.numel() == 1 and x.is_floating_point()
-        if not is_singleton_float(q_scale) or not is_singleton_float(
-                prob_scale):
-            raise ValueError("Only support per-tensor scaling factor"
-                             "for fp8-quantized Q/prob")
+        is_singleton_float = (
+            lambda x: isinstance(x, float)
+            or isinstance(x, torch.Tensor)
+            and x.numel() == 1
+            and x.is_floating_point()
+        )
+        if not is_singleton_float(q_scale) or not is_singleton_float(prob_scale):
+            raise ValueError(
+                "Only support per-tensor scaling factorfor fp8-quantized Q/prob"
+            )
 
         # These are used in the final Attention.forward()
         layer._q_scale.copy_(q_scale)
-        layer._q_scale_float = q_scale.item() if isinstance(
-            q_scale, torch.Tensor) else q_scale
+        layer._q_scale_float = (
+            q_scale.item() if isinstance(q_scale, torch.Tensor) else q_scale
+        )
 
         layer._prob_scale.copy_(prob_scale)
-        if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0
-                                              or prob_scale == 1.0):
+        if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0 or prob_scale == 1.0):
             logger.warning_once(
                 f"Using uncalibrated q_scale {q_scale} and/or prob_scale "
                 f"{prob_scale} with fp8 attention. This may cause accuracy "
                 "issues. Please make sure q/prob scaling factors are "
-                "available in the fp8 checkpoint.")
+                "available in the fp8 checkpoint."
+            )
 
         del layer.k_scale
         del layer.v_scale
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 0be43da00b5337986c0f030e19342e0efb2eeaeb..476521813f464c5cd4a447b604557dad173cf60d 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from torch.nn import Module
@@ -12,40 +13,69 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig, FusedMoEQuantConfig, fp8_w8a8_moe_quant_config,
-    nvfp4_moe_quant_config)
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    is_valid_flashinfer_cutlass_fused_moe)
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+    RoutingMethodType,
+    fp8_w8a8_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+    FusedMoE,
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
-    build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1,
-    select_nvfp4_gemm_impl)
+    build_flashinfer_fp4_cutlass_moe_prepare_finalize,
+    reorder_w1w3_to_w3w1,
+    select_nvfp4_gemm_impl,
+)
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    FlashinferMoeBackend, apply_flashinfer_per_tensor_scale_fp8,
+    FlashinferMoeBackend,
+    apply_flashinfer_per_tensor_scale_fp8,
     build_flashinfer_fp8_cutlass_moe_prepare_finalize,
-    flashinfer_cutlass_moe_fp8, get_flashinfer_moe_backend,
-    register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
-    select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
+    flashinfer_cutlass_moe_fp8,
+    get_flashinfer_moe_backend,
+    is_flashinfer_supporting_global_sf,
+    register_moe_scaling_factors,
+    rotate_flashinfer_fp8_moe_weights,
+    select_cutlass_fp8_gemm_impl,
+    swap_w13_to_w31,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    apply_fp4_marlin_linear, is_fp4_marlin_supported,
-    prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
+    apply_fp4_marlin_linear,
+    is_fp4_marlin_supported,
+    prepare_fp4_layer_for_marlin,
+    prepare_moe_fp4_layer_for_marlin,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
+    GroupShape,
+    cutlass_fp4_supported,
+    is_layer_skipped,
+    swizzle_blockscale,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, requantize_with_max_scale)
-from vllm.model_executor.parameter import (ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    Fp8LinearOp,
+    requantize_with_max_scale,
+)
+from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
 from vllm.scalar_type import scalar_types
-from vllm.utils import next_power_of_2
-from vllm.utils.flashinfer import (flashinfer_scaled_fp4_mm, has_flashinfer,
-                                   has_flashinfer_moe)
+from vllm.utils.flashinfer import (
+    flashinfer_scaled_fp4_mm,
+    has_flashinfer,
+    has_flashinfer_moe,
+)
 
 if TYPE_CHECKING:
     from vllm.model_executor.models.utils import WeightsMapper
@@ -62,16 +92,18 @@ class ModelOptFp8Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_fp8_serialized: bool = False,
-        kv_cache_quant_method: Optional[str] = None,
-        exclude_modules: Optional[list[str]] = None,
+        kv_cache_quant_method: str | None = None,
+        exclude_modules: list[str] | None = None,
     ) -> None:
         super().__init__()
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         self.kv_cache_quant_method = kv_cache_quant_method
         self.exclude_modules = exclude_modules or []
         if is_checkpoint_fp8_serialized:
-            logger.warning("Detected ModelOpt fp8 checkpoint. Please note that"
-                           " the format is experimental and could change.")
+            logger.warning(
+                "Detected ModelOpt fp8 checkpoint. Please note that"
+                " the format is experimental and could change."
+            )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -91,12 +123,12 @@ class ModelOptFp8Config(QuantizationConfig):
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         if self.exclude_modules is not None:
-            self.exclude_modules = hf_to_vllm_mapper.apply_list(
-                self.exclude_modules)
+            self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         """Detect if this ModelOpt config should be used based on
         quantization config."""
 
@@ -132,30 +164,31 @@ class ModelOptFp8Config(QuantizationConfig):
             # ModelOpt format: {"quantization": {"quant_algo": "..."}}
             quant_config = cls.get_from_keys(config, ["quantization"])
             if not isinstance(quant_config, dict):
-                raise ValueError(
-                    "Expected 'quantization' to be a dictionary in config")
+                raise ValueError("Expected 'quantization' to be a dictionary in config")
             quant_method = quant_config.get("quant_algo", "")
             if not quant_method:
                 raise ValueError("Missing 'quant_algo' in quantization config")
             kv_cache_quant_method = quant_config.get("kv_cache_quant_algo")
+            # "exclude_modules" is the key in the legacy hf_quant_config.json
             exclude_modules = quant_config.get("exclude_modules")
         else:
             # Compressed-tensors style format:
             # {"quant_algo": "...", "quant_method": "modelopt"}
             quant_method = config.get("quant_algo", "")
             kv_cache_quant_method = config.get("kv_cache_quant_algo")
-            exclude_modules = config.get("exclude_modules")
+            # "ignore" is the key in config.json
+            exclude_modules = config.get("ignore")
 
         if quant_method not in QUANT_ALGOS:
             raise ValueError(
                 f"ModelOpt currently only supports: {QUANT_ALGOS} "
                 "quantizations in vLLM. Please check the "
                 "`hf_quant_config.json` file for your model's "
-                "quant configuration.")
-        is_checkpoint_fp8_serialized = ("FP8" in quant_method)
+                "quant configuration."
+            )
+        is_checkpoint_fp8_serialized = "FP8" in quant_method
 
-        return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method,
-                   exclude_modules)
+        return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules)
 
     def is_layer_excluded(self, prefix: str) -> bool:
         """
@@ -170,31 +203,38 @@ class ModelOptFp8Config(QuantizationConfig):
             return False
 
         # First check exact matching with fused layer support
-        if is_layer_skipped(prefix, self.exclude_modules,
-                            self.packed_modules_mapping):
+        if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping):
             return True
 
         # Then check substring matching for patterns not caught by exact match
         for module in self.exclude_modules:
             # Skip exact matches already handled above
-            if (module != prefix and
-                (module in prefix or
-                 (prefix.startswith("language_model.")
-                  and module in prefix.removeprefix("language_model.")))):
+            if module != prefix and (
+                module in prefix
+                or (
+                    prefix.startswith("language_model.")
+                    and module in prefix.removeprefix("language_model.")
+                )
+            ):
                 return True
         return False
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import (  # Avoid circular import
+            Attention,
+            MLAAttention,
+        )
+
         if isinstance(layer, LinearBase):
             if self.is_layer_excluded(prefix):
                 return UnquantizedLinearMethod()
             # Check if this is a vision model layer that should not be quantized
-            if ("vision_tower" in prefix or "vision_model" in prefix):
+            if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptFp8LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, (Attention, MLAAttention)):
             return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
             return ModelOptFp8MoEMethod(self, layer)
@@ -216,7 +256,8 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
     def __init__(self, quant_config: ModelOptFp8Config) -> None:
         self.quant_config = quant_config
         self.fp8_linear = Fp8LinearOp(
-            act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR)
+            act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR
+        )
 
     def create_weights(
         self,
@@ -234,29 +275,34 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
         layer.logical_widths = output_partition_sizes
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
-        weight_dtype = (torch.float8_e4m3fn
-                        if self.quant_config.is_checkpoint_fp8_serialized else
-                        params_dtype)
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=weight_dtype),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=weight_dtype
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
-            weight_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                   weight_loader=weight_loader)
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
             weight_scale[:] = torch.finfo(torch.float32).min
             layer.register_parameter("weight_scale", weight_scale)
             # INPUT SCALE
-            scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                            weight_loader=weight_loader)
+            scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
 
             scale[:] = torch.finfo(torch.float32).min
             layer.register_parameter("input_scale", scale)
@@ -266,23 +312,25 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
         max_w_scale = layer.weight_scale.max()
         if not (layer.weight_scale == layer.weight_scale[0]).all():
             max_w_scale, weight = requantize_with_max_scale(
-                layer.weight, layer.weight_scale, layer.logical_widths)
+                layer.weight, layer.weight_scale, layer.logical_widths
+            )
         layer.weight = Parameter(weight.t(), requires_grad=False)
         layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
-        layer.input_scale = Parameter(layer.input_scale.max(),
-                                      requires_grad=False)
+        layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
 
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        return self.fp8_linear.apply(input=x,
-                                     weight=layer.weight,
-                                     weight_scale=layer.weight_scale,
-                                     input_scale=layer.input_scale,
-                                     bias=bias)
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+        )
 
 
 class ModelOptFp8MoEMethod(FusedMoEMethodBase):
@@ -302,23 +350,37 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         self.layer = layer
         self.quant_config = quant_config
         from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-            cutlass_fp8_supported)
+            cutlass_fp8_supported,
+        )
+
         self.cutlass_fp8_supported = cutlass_fp8_supported()
-        self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None
+        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
         if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
             self.flashinfer_moe_backend = get_flashinfer_moe_backend()
+            if (
+                self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+                and not self.moe.is_act_and_mul
+            ):
+                logger.info_once(
+                    "Non-gated MoE is not supported for min-latency mode,"
+                    "falling back to high-throughput mode"
+                )
+                self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS
+
             logger.info_once(
                 f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
             )
 
     def maybe_make_prepare_finalize(
-        self, ) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+        self,
+    ) -> mk.FusedMoEPrepareAndFinalize | None:
         # TRT LLM not supported with all2all yet.
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             return None
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            prepare_finalize = (
-                build_flashinfer_fp8_cutlass_moe_prepare_finalize(self.moe))
+            prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(
+                self.moe
+            )
             logger.debug_once("%s", prepare_finalize.__class__.__name__)
             return prepare_finalize
         else:
@@ -346,18 +408,26 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
-
         # Use FP8 dtype if checkpoint is serialized
-        weight_dtype = (torch.float8_e4m3fn
-                        if self.quant_config.is_checkpoint_fp8_serialized else
-                        params_dtype)
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
         weight_loader = extra_weight_attrs.get("weight_loader")
 
+        if self.moe.is_act_and_mul:
+            w13_up_dim = 2 * intermediate_size_per_partition
+        else:
+            w13_up_dim = intermediate_size_per_partition
+
         w13_weight = ModelWeightParameter(
-            data=torch.empty(num_experts,
-                             2 * intermediate_size_per_partition,
-                             hidden_size,
-                             dtype=weight_dtype),
+            data=torch.empty(
+                num_experts,
+                w13_up_dim,
+                hidden_size,
+                dtype=weight_dtype,
+            ),
             input_dim=2,
             output_dim=1,
             weight_loader=weight_loader,
@@ -365,10 +435,12 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         layer.register_parameter("w13_weight", w13_weight)
 
         w2_weight = ModelWeightParameter(
-            data=torch.empty(num_experts,
-                             hidden_size,
-                             intermediate_size_per_partition,
-                             dtype=weight_dtype),
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=weight_dtype,
+            ),
             input_dim=2,
             output_dim=1,
             weight_loader=weight_loader,
@@ -377,18 +449,23 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
 
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALES - Per-tensor scaling for ModelOpts
-            # Allocate 2 scales for w1 and w3 respectively.
+            # For gated MoE, allocate 2 scales for w1 and w3 respectively.
             # They will be combined to a single scale after weight loading.
+            # For non-gated MoE, allocate 1 scale for w13.
+            if self.moe.is_act_and_mul:
+                w13_weight_scale_shape = (num_experts, 2)
+            else:
+                w13_weight_scale_shape = (num_experts, 1)
             w13_weight_scale = PerTensorScaleParameter(
                 data=torch.full(
-                    (num_experts, 2),
+                    w13_weight_scale_shape,
                     1.0,
                     dtype=torch.float32,
                 ),
                 weight_loader=weight_loader,
             )
             w2_weight_scale = PerTensorScaleParameter(
-                data=torch.full((num_experts, ), 1.0, dtype=torch.float32),
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
                 weight_loader=weight_loader,
             )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
@@ -396,15 +473,16 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
 
             # Set weight loader attributes for scales
             extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            )
 
             # INPUT SCALES - Per-tensor scaling for ModelOpt
             w13_input_scale = PerTensorScaleParameter(
-                data=torch.full((num_experts, ), 1.0, dtype=torch.float32),
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
                 weight_loader=weight_loader,
             )
             w2_input_scale = PerTensorScaleParameter(
-                data=torch.full((num_experts, ), 1.0, dtype=torch.float32),
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
                 weight_loader=weight_loader,
             )
             layer.register_parameter("w13_input_scale", w13_input_scale)
@@ -415,22 +493,27 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         Only supports pre-quantized checkpoints with FP8 weights and scales.
         """
 
-        layer.w13_weight = Parameter(layer.w13_weight.data,
-                                     requires_grad=False)
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
         layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
         from vllm._custom_ops import scaled_fp8_quant
         from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-            per_tensor_dequantize)
+            per_tensor_dequantize,
+        )
 
         # Handle scale parameters
-        if hasattr(layer,
-                   "w13_weight_scale") and layer.w13_weight_scale is not None:
+        if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None:
             # Fp8 moe kernel needs single weight scale for w13 per expert.
             # We take the max of the w1 and w3 scales
             # then dequant and requant each expert.
-            if layer.w13_weight_scale.dim() == 2:
-
+            if (
+                layer.w13_weight_scale.dim() == 2
+                and layer.w13_weight_scale.shape[1] == 2
+            ):
+                assert self.moe.is_act_and_mul, (
+                    "w13_weight_scale should have 2 elements per expert "
+                    "only for gated MoE"
+                )
                 # Get the maximum scale across w1 and w3 for each expert
                 max_w13_scales = layer.w13_weight_scale.max(dim=1).values
 
@@ -443,59 +526,65 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
                     for shard_id in range(2):  # w1 and w3
                         # Dequantize using the original scale for this shard
                         dq_weight = per_tensor_dequantize(
-                            layer.w13_weight[expert_id][start:start +
-                                                        intermediate_size, :],
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size, :
+                            ],
                             layer.w13_weight_scale[expert_id][shard_id],
                         )
                         # Requantize using the combined max scale
 
                         (
-                            layer.w13_weight[expert_id][start:start +
-                                                        intermediate_size, :],
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size, :
+                            ],
                             _,
-                        ) = scaled_fp8_quant(dq_weight,
-                                             max_w13_scales[expert_id])
+                        ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
 
                         start += intermediate_size
 
                 # Update the scale parameter to be per-expert
-                layer.w13_weight_scale = Parameter(max_w13_scales,
-                                                   requires_grad=False)
+                layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False)
             else:
-                layer.w13_weight_scale = Parameter(layer.w13_weight_scale.data,
-                                                   requires_grad=False)
+                layer.w13_weight_scale = Parameter(
+                    layer.w13_weight_scale.data, requires_grad=False
+                )
 
-        if hasattr(layer,
-                   "w2_weight_scale") and layer.w2_weight_scale is not None:
-            layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data,
-                                              requires_grad=False)
+        if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None:
+            layer.w2_weight_scale = Parameter(
+                layer.w2_weight_scale.data, requires_grad=False
+            )
         # Input scales must be equal for each expert in fp8 MoE layers.
-        if hasattr(layer,
-                   "w13_input_scale") and layer.w13_input_scale is not None:
-            layer.w13_input_scale = Parameter(layer.w13_input_scale.max(),
-                                              requires_grad=False)
-        if hasattr(layer,
-                   "w2_input_scale") and layer.w2_input_scale is not None:
-            layer.w2_input_scale = Parameter(layer.w2_input_scale.max(),
-                                             requires_grad=False)
+        if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None:
+            layer.w13_input_scale = Parameter(
+                layer.w13_input_scale.max(), requires_grad=False
+            )
+        if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None:
+            layer.w2_input_scale = Parameter(
+                layer.w2_input_scale.max(), requires_grad=False
+            )
 
         if self.flashinfer_moe_backend is not None:
-            layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
-            register_moe_scaling_factors(layer)
+            if self.moe.is_act_and_mul:
+                layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
             if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-                rotate_flashinfer_fp8_moe_weights(layer.w13_weight,
-                                                  layer.w2_weight)
+                rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
+        register_moe_scaling_factors(layer)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             return None
 
         return fp8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
+            g1_alphas=layer.output1_scales_gate_scalar.squeeze(),
             w2_scale=layer.w2_weight_scale,
+            g2_alphas=layer.output2_scales_scalar.squeeze(),
             a1_scale=layer.w13_input_scale,
+            a1_gscale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
+            a2_gscale=layer.w2_input_scale_inv,
             per_act_token_quant=False,
         )
 
@@ -507,29 +596,30 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `ModelOptFp8MoEMethod` yet.")
+                "EPLB not supported for `ModelOptFp8MoEMethod` yet."
+            )
 
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-            assert self.fused_experts is None
-            assert activation == 'silu', (
-                f"Expected 'silu' activation but got {activation}")
+            assert activation == "silu", (
+                f"Expected 'silu' activation but got {activation}"
+            )
             assert not renormalize
             return apply_flashinfer_per_tensor_scale_fp8(
                 layer=layer,
@@ -540,7 +630,8 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
                 top_k=top_k,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-                apply_router_weight_on_input=apply_router_weight_on_input)
+                apply_router_weight_on_input=apply_router_weight_on_input,
+            )
 
         # Expert selection
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
@@ -558,27 +649,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # cutlass or fused_experts.
-        #
-        if self.fused_experts is not None:
-            return self.fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                inplace=False,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+        if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+            assert activation in ("silu", "relu2_no_mul"), (
+                "Expected activation to be in ('silu', 'relu2_no_mul'),"
+                f"but got {activation}"
             )
-        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            assert not renormalize
-            assert activation == 'silu', (
-                f"Expected 'silu' activation but got {activation}")
             return flashinfer_cutlass_moe_fp8(
                 x,
                 layer,
@@ -591,8 +666,8 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
         else:
-            from vllm.model_executor.layers.fused_moe.fused_moe import (
-                fused_experts)
+            from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
+
             assert self.moe_quant_config is not None
 
             return fused_experts(
@@ -616,7 +691,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_nvfp4_serialized: bool,
-        kv_cache_quant_algo: Optional[str],
+        kv_cache_quant_algo: str | None,
         exclude_modules: list[str],
         group_size: int = 16,
     ) -> None:
@@ -625,7 +700,8 @@ class ModelOptNvFp4Config(QuantizationConfig):
         if is_checkpoint_nvfp4_serialized:
             logger.warning(
                 "Detected ModelOpt NVFP4 checkpoint. Please note that"
-                " the format is experimental and could change in future.")
+                " the format is experimental and could change in future."
+            )
 
             self.group_size = group_size
             self.kv_cache_quant_algo = kv_cache_quant_algo
@@ -649,12 +725,12 @@ class ModelOptNvFp4Config(QuantizationConfig):
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         if self.exclude_modules is not None:
-            self.exclude_modules = hf_to_vllm_mapper.apply_list(
-                self.exclude_modules)
+            self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         """Detect if this ModelOpt FP4 config should be used based on
         quantization config."""
         if hf_quant_cfg is None:
@@ -692,8 +768,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
             # {"quantization": {"quant_algo": "..."}}
             quant_config = cls.get_from_keys(config, ["quantization"])
             if not isinstance(quant_config, dict):
-                raise ValueError(
-                    "Expected 'quantization' to be a dictionary in config")
+                raise ValueError("Expected 'quantization' to be a dictionary in config")
 
             quant_method = quant_config.get("quant_algo", "")
             if not quant_method:
@@ -707,8 +782,10 @@ class ModelOptNvFp4Config(QuantizationConfig):
             elif isinstance(kv_cache_quant_algo_raw, str):
                 kv_cache_quant_algo = kv_cache_quant_algo_raw
             else:
-                raise ValueError(f"kv_cache_quant_algo must be a string, got "
-                                 f"{type(kv_cache_quant_algo_raw)}")
+                raise ValueError(
+                    f"kv_cache_quant_algo must be a string, got "
+                    f"{type(kv_cache_quant_algo_raw)}"
+                )
 
             # Handle group_size with proper type validation
             group_size_raw = quant_config.get("group_size")
@@ -720,13 +797,16 @@ class ModelOptNvFp4Config(QuantizationConfig):
                 try:
                     group_size = int(group_size_raw)
                 except (ValueError, TypeError):
-                    raise ValueError(f"group_size must be an integer, got "
-                                     f"{type(group_size_raw)}") from None
+                    raise ValueError(
+                        f"group_size must be an integer, got {type(group_size_raw)}"
+                    ) from None
 
+            # "exclude_modules" is the key in the legacy hf_quant_config.json
             exclude_modules = quant_config.get("exclude_modules", [])
             if not isinstance(exclude_modules, list):
-                raise ValueError(f"exclude_modules must be a list, got "
-                                 f"{type(exclude_modules)}")
+                raise ValueError(
+                    f"exclude_modules must be a list, got {type(exclude_modules)}"
+                )
         else:
             # Compressed-tensors style format:
             # {"quant_algo": "...", "quant_method": "modelopt"}
@@ -740,8 +820,10 @@ class ModelOptNvFp4Config(QuantizationConfig):
             elif isinstance(kv_cache_quant_algo_raw, str):
                 kv_cache_quant_algo = kv_cache_quant_algo_raw
             else:
-                raise ValueError(f"kv_cache_quant_algo must be a string, got "
-                                 f"{type(kv_cache_quant_algo_raw)}")
+                raise ValueError(
+                    f"kv_cache_quant_algo must be a string, got "
+                    f"{type(kv_cache_quant_algo_raw)}"
+                )
 
             # Handle group_size with proper type validation
             group_size_raw = config.get("group_size")
@@ -753,39 +835,46 @@ class ModelOptNvFp4Config(QuantizationConfig):
                 try:
                     group_size = int(group_size_raw)
                 except (ValueError, TypeError):
-                    raise ValueError(f"group_size must be an integer, got "
-                                     f"{type(group_size_raw)}") from None
+                    raise ValueError(
+                        f"group_size must be an integer, got {type(group_size_raw)}"
+                    ) from None
 
-            exclude_modules = config.get("exclude_modules", [])
+            # "ignore" is the key in config.json
+            exclude_modules = config.get("ignore", [])
             if not isinstance(exclude_modules, list):
-                raise ValueError(f"exclude_modules must be a list, got "
-                                 f"{type(exclude_modules)}")
+                raise ValueError(
+                    f"exclude_modules must be a list, got {type(exclude_modules)}"
+                )
 
         if quant_method not in QUANT_ALGOS:
             raise ValueError(
                 f"ModelOpt currently only supports: {QUANT_ALGOS} "
                 "quantizations in vLLM. Please check the "
                 "`hf_quant_config.json` file for your model's "
-                "quant configuration.")
-        is_checkpoint_nvfp4_serialized = ("NVFP4" in quant_method)
+                "quant configuration."
+            )
+        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
 
         # For FP4, these fields are required
         if is_checkpoint_nvfp4_serialized and "quantization" in config:
             # Check if required fields are present in the quantization config
             quant_config = config["quantization"]
-            required_fields = [
-                "group_size", "kv_cache_quant_algo", "exclude_modules"
-            ]
+            required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"]
             missing_fields = [
                 field for field in required_fields if field not in quant_config
             ]
             if missing_fields:
                 raise ValueError(
                     f"NVFP4 quantization requires the following fields in "
-                    f"hf_quant_config.json: {missing_fields}")
-
-        return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo,
-                   exclude_modules, group_size)
+                    f"hf_quant_config.json: {missing_fields}"
+                )
+
+        return cls(
+            is_checkpoint_nvfp4_serialized,
+            kv_cache_quant_algo,
+            exclude_modules,
+            group_size,
+        )
 
     def is_layer_excluded(self, prefix: str) -> bool:
         """
@@ -793,33 +882,41 @@ class ModelOptNvFp4Config(QuantizationConfig):
         Handles both exact matching (for fused layers) and pattern matching.
         """
         # First check exact matching with fused layer support
-        if is_layer_skipped(prefix, self.exclude_modules,
-                            self.packed_modules_mapping):
+        if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping):
             return True
 
         # Check regex pattern matching for patterns not caught by exact match
         import regex as re
+
         for pattern in self.exclude_modules:
             # Skip patterns that would be caught by exact matching
-            if '*' in pattern or '.' in pattern:
-                regex_str = pattern.replace('.', r'\.').replace('*', r'.*')
+            if "*" in pattern or "." in pattern:
+                regex_str = pattern.replace(".", r"\.").replace("*", r".*")
                 if re.fullmatch(regex_str, prefix):
                     return True
         return False
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import (  # Avoid circular import
+            Attention,
+            MLAAttention,
+        )
+
+        skip_layer = self.is_layer_excluded(prefix)
         if isinstance(layer, LinearBase):
-            if self.is_layer_excluded(prefix):
+            if skip_layer:
                 return UnquantizedLinearMethod()
             # Check if this is a vision model layer that should not be quantized
-            if ("vision_tower" in prefix or "vision_model" in prefix):
+            if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptNvFp4LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, (Attention, MLAAttention)):
             return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
+            if skip_layer:
+                return None
             return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer)
         return None
 
@@ -829,8 +926,7 @@ class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
     Supports loading kv-cache scaling factors from FP8 checkpoints.
     """
 
-    def __init__(self, quant_config: Union[ModelOptFp8Config,
-                                           ModelOptNvFp4Config]):
+    def __init__(self, quant_config: ModelOptFp8Config | ModelOptNvFp4Config):
         super().__init__(quant_config)
 
 
@@ -848,19 +944,28 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
     def __init__(self, quant_config: ModelOptNvFp4Config) -> None:
         self.quant_config = quant_config
 
-        if envs.VLLM_USE_TRTLLM_FP4_GEMM:
-            assert has_flashinfer(), "TRTLLM FP4 GEMM requires FlashInfer"
-            self.backend = "flashinfer-trtllm"
-        elif has_flashinfer():
-            self.backend = "flashinfer-cutlass"
-        elif cutlass_fp4_supported():
+        self.backend = "none"
+        if envs.VLLM_NVFP4_GEMM_BACKEND is None:
+            if has_flashinfer():
+                self.backend = "flashinfer-cutlass"
+            elif cutlass_fp4_supported():
+                self.backend = "cutlass"
+            elif is_fp4_marlin_supported():
+                self.backend = "marlin"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
+            self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
+            assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
             self.backend = "cutlass"
-        elif is_fp4_marlin_supported():
-            self.backend = "marlin"
-        else:
-            raise ValueError("Current platform does not support NVFP4"
-                             " quantization. Please use Blackwell and"
-                             " above.")
+            assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
+
+        if self.backend == "none":
+            raise ValueError(
+                "No valid NVFP4 GEMM backend found. "
+                "Please check your platform capability."
+            )
+
+        logger.info_once(f"Using {self.backend} for NVFP4 GEMM")
 
     def create_weights(
         self,
@@ -874,59 +979,69 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
     ):
         del input_size, output_size
         if not self.quant_config.is_checkpoint_nvfp4_serialized:
-            raise ValueError("NVFP4 quantization was selected, "
-                             " dynamic quantization is not supported.")
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
         layer.logical_widths = output_partition_sizes
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
 
-        if (input_size_per_partition % 16 != 0):
-            raise ValueError("Unsupported model when in features size is "
-                             "not multiple of 16")
+        if input_size_per_partition % 16 != 0:
+            raise ValueError(
+                "Unsupported model when in features size is not multiple of 16"
+            )
         # The nvfp4 weight is still represented as
-        weight_dtype = (torch.float8_e4m3fn
-                        if self.quant_config.is_checkpoint_nvfp4_serialized
-                        else params_dtype)
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_nvfp4_serialized
+            else params_dtype
+        )
         # Weight
         weight = ModelWeightParameter(
             data=torch.empty(
                 # 2 fp4 items are packed in the input dimension
                 layer.output_size_per_partition,
                 layer.input_size_per_partition // 2,
-                dtype=torch.uint8),
+                dtype=torch.uint8,
+            ),
             input_dim=1,
             output_dim=0,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
         # Input Weight Scale
-        input_scale = PerTensorScaleParameter(data=torch.empty(
-            len(output_partition_sizes), dtype=torch.float32),
-                                              weight_loader=weight_loader)
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("input_scale", input_scale)
 
         # Global Weight Scale
-        weight_scale_2 = PerTensorScaleParameter(data=torch.empty(
-            len(output_partition_sizes), dtype=torch.float32),
-                                                 weight_loader=weight_loader)
+        weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight_scale_2", weight_scale_2)
 
         # Per Block Weight Scale
-        weight_scale = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition // self.quant_config.group_size,
-            dtype=weight_dtype,
-        ),
-                                            input_dim=1,
-                                            output_dim=0,
-                                            weight_loader=weight_loader)
+        weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.group_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight_scale", weight_scale)
 
     def process_weights_after_loading(self, layer: Module) -> None:
-
         # global scales:
         input_scale_2 = layer.input_scale.max().to(torch.float32)
         layer.input_scale = Parameter(input_scale_2, requires_grad=False)
@@ -934,18 +1049,21 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
         layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
 
-        layer.alpha = Parameter(layer.input_scale * layer.weight_scale_2,
-                                requires_grad=False)
+        layer.alpha = Parameter(
+            layer.input_scale * layer.weight_scale_2, requires_grad=False
+        )
 
         # Calculate `1 / input_scale` so that we don't need to do so at runtime
         layer.input_scale_inv = Parameter(
-            (1 / layer.input_scale).to(torch.float32), requires_grad=False)
+            (1 / layer.input_scale).to(torch.float32), requires_grad=False
+        )
 
         # Swizzle the weight blockscale.
         # contracting dimension is input dimension
         # block_size = 16;
-        assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
-            "Weight Block scale must be represented as FP8-E4M3")
+        assert layer.weight_scale.dtype == torch.float8_e4m3fn, (
+            "Weight Block scale must be represented as FP8-E4M3"
+        )
 
         if self.backend == "marlin":
             prepare_fp4_layer_for_marlin(layer)
@@ -962,25 +1080,25 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
             weight_scale = layer.weight_scale.data
 
             epilogue_tile_m = 128
-            weight = shuffle_matrix_a(weight.view(torch.uint8),
-                                      epilogue_tile_m)
-            weight_scale = (shuffle_matrix_sf_a(weight_scale.view(
-                torch.uint8), epilogue_tile_m).reshape(
-                    weight_scale.shape).view(torch.float8_e4m3fn))
+            weight = shuffle_matrix_a(weight.view(torch.uint8), epilogue_tile_m)
+            weight_scale = (
+                shuffle_matrix_sf_a(weight_scale.view(torch.uint8), epilogue_tile_m)
+                .reshape(weight_scale.shape)
+                .view(torch.float8_e4m3fn)
+            )
 
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
             layer.weight = Parameter(weight, requires_grad=False)
         else:
             swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
-            layer.weight_scale = Parameter(swizzled_weight_scale,
-                                           requires_grad=False)
+            layer.weight_scale = Parameter(swizzled_weight_scale, requires_grad=False)
             layer.weight = Parameter(layer.weight.data, requires_grad=False)
 
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.backend == "marlin":
             return apply_fp4_marlin_linear(
@@ -991,7 +1109,8 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
                 workspace=layer.workspace,
                 size_n=layer.output_size_per_partition,
                 size_k=layer.input_size_per_partition,
-                bias=bias)
+                bias=bias,
+            )
 
         output_dtype = x.dtype
         output_shape = [x.shape[0], layer.weight.shape[0]]
@@ -1001,11 +1120,11 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
 
         # validate dtypes of quantized input, input block scale,
         # weight and weight_blockscale
-        assert (x_fp4.dtype == torch.uint8)
-        assert (layer.weight.dtype == torch.uint8)
-        assert (x_blockscale.dtype == torch.float8_e4m3fn)
-        assert (layer.weight_scale.dtype == torch.float8_e4m3fn)
-        assert (layer.alpha.dtype == torch.float32)
+        assert x_fp4.dtype == torch.uint8
+        assert layer.weight.dtype == torch.uint8
+        assert x_blockscale.dtype == torch.float8_e4m3fn
+        assert layer.weight_scale.dtype == torch.float8_e4m3fn
+        assert layer.alpha.dtype == torch.float32
 
         mm_args = (
             x_fp4,
@@ -1015,11 +1134,11 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
             layer.alpha,
             output_dtype,
         )
-        if self.backend == "flashinfer-trtllm":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm")
-        elif self.backend == "flashinfer-cutlass":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="cutlass")
+        if self.backend.startswith("flashinfer-"):
+            backend_name = self.backend[len("flashinfer-") :]
+            out = flashinfer_scaled_fp4_mm(*mm_args, backend=backend_name)
         else:
+            assert self.backend == "cutlass"
             out = cutlass_scaled_fp4_mm(*mm_args)
 
         if bias is not None:
@@ -1027,16 +1146,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         return out.view(*output_shape)
 
 
-def _get_tile_tokens_dim(num_tokens: int, top_k: int, num_experts: int) -> int:
-    # Guess tokens per expert assuming perfect expert distribution first.
-    num_tokens_per_expert = (num_tokens * top_k) // num_experts
-    # And pad the number to the next power of 2.
-    tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
-    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
-    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-    return tile_tokens_dim
-
-
 class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     """
     MoE Method for FP4 Quantization.
@@ -1050,8 +1159,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         moe: FusedMoEConfig,
         layer: torch.nn.Module,
     ) -> None:
-        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
-            detect_nvfp4_moe_support)
+        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (
+            detect_nvfp4_moe_support,  # noqa: E501
+        )
+
         super().__init__(moe)
         self.quant_config = quant_config
         self.layer = layer
@@ -1060,24 +1171,28 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         self.allow_flashinfer = _nvfp4.allow_flashinfer
         self.use_marlin = _nvfp4.use_marlin
         self.flashinfer_moe_backend = None
-
+        self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
         if self.allow_flashinfer:
             self.flashinfer_moe_backend = get_flashinfer_moe_backend()
             logger.info_once(
                 f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
-                " for ModelOptNvFp4FusedMoE.")
+                " for ModelOptNvFp4FusedMoE."
+            )
 
-    def maybe_make_prepare_finalize(
-            self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
-        if (self.use_marlin
-                or (self.allow_flashinfer and self.flashinfer_moe_backend
-                    == FlashinferMoeBackend.TENSORRT_LLM)):
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+        if self.use_marlin or (
+            self.allow_flashinfer
+            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+        ):
             return None
-        elif (self.allow_flashinfer
-              and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS):
+        elif (
+            self.allow_flashinfer
+            and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+        ):
             # For now, fp4 moe only works with the flashinfer dispatcher.
-            prepare_finalize = (
-                build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe))
+            prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(
+                self.moe
+            )
             logger.debug_once("%s", prepare_finalize.__class__.__name__)
             return prepare_finalize
         else:
@@ -1103,12 +1218,20 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         """
         return True
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         if not self.quant_config.is_checkpoint_nvfp4_serialized:
-            raise ValueError("NVFP4 quantization was selected, "
-                             " dynamic quantization is not supported.")
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
 
         layer.num_experts = num_experts
         layer.params_dtype = params_dtype
@@ -1116,6 +1239,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         weight_dtype = torch.uint8
         weight_scale_dtype = torch.float8_e4m3fn
         weight_loader = extra_weight_attrs.get("weight_loader")
+        global_num_experts = extra_weight_attrs.get("global_num_experts")
         # GEMM 1
         w13_weight = ModelWeightParameter(
             data=torch.empty(
@@ -1123,10 +1247,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 2 * intermediate_size_per_partition,
                 # 2 fp4 items are packed in the input dimension
                 hidden_size // 2,
-                dtype=weight_dtype),
+                dtype=weight_dtype,
+            ),
             input_dim=1,
             output_dim=2,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w13_weight", w13_weight)
 
         # GEMM 2
@@ -1136,10 +1262,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 hidden_size,
                 # 2 fp4 items are packed in the input dimension
                 intermediate_size_per_partition // 2,
-                dtype=weight_dtype),
+                dtype=weight_dtype,
+            ),
             input_dim=1,
             output_dim=2,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w2_weight", w2_weight)
 
         w13_weight_scale = ModelWeightParameter(
@@ -1148,10 +1276,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 2 * intermediate_size_per_partition,
                 # 2 fp4 items are packed in the input dimension
                 hidden_size // self.quant_config.group_size,
-                dtype=weight_scale_dtype),
+                dtype=weight_scale_dtype,
+            ),
             input_dim=1,
             output_dim=2,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
 
         w2_weight_scale = ModelWeightParameter(
@@ -1159,128 +1289,175 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 num_experts,
                 hidden_size,
                 # 2 fp4 items are packed in the input dimension
-                intermediate_size_per_partition //
-                self.quant_config.group_size,
-                dtype=weight_scale_dtype),
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
             input_dim=1,
             output_dim=2,
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
 
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value})
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
 
         w13_weight_scale_2 = PerTensorScaleParameter(
             data=torch.empty(num_experts, 2, dtype=torch.float32),
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
 
         w2_weight_scale_2 = PerTensorScaleParameter(
             data=torch.empty(num_experts, dtype=torch.float32),
-            weight_loader=weight_loader)
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
 
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
 
-        w13_input_scale = PerTensorScaleParameter(data=torch.empty(
-            num_experts, 2, dtype=torch.float32),
-                                                  weight_loader=weight_loader)
+        use_global_sf = self.allow_flashinfer and is_flashinfer_supporting_global_sf(
+            self.flashinfer_moe_backend
+        )
+        global_scale_num_experts = global_num_experts if use_global_sf else num_experts
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(global_scale_num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w13_input_scale", w13_input_scale)
 
-        w2_input_scale = PerTensorScaleParameter(data=torch.empty(
-            num_experts, dtype=torch.float32),
-                                                 weight_loader=weight_loader)
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(global_scale_num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("w2_input_scale", w2_input_scale)
 
-    def prepare_static_weight_layouts_for_trtllm_moe(
+    def prepare_static_weights_for_trtllm_fp4_moe(
         self,
-        gemm1_weights: torch.Tensor,
-        gemm2_weights: torch.Tensor,
-        gemm1_scales_linear_fp4_bytes: torch.Tensor,
-        gemm2_scales_linear_fp4_bytes: torch.Tensor,
-        hidden_size: int,
-        intermediate_size: int,
-        num_experts: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # args_dequant,
+        # args,
+        gemm1_weights,
+        gemm2_weights,
+        gemm1_scales_linear_fp4_bytes,
+        gemm2_scales_linear_fp4_bytes,
+        hidden_size,
+        intermediate_size,
+        num_experts,
+    ):
+        from flashinfer import nvfp4_block_scale_interleave
+        from flashinfer.fused_moe.core import (
+            _maybe_get_cached_w3_w1_permute_indices,
+            get_w2_permute_indices_with_cache,
+        )
+
         """Prepare quantized weights for kernel (done offline with weights)."""
-        from flashinfer import (reorder_rows_for_gated_act_gemm,
-                                shuffle_matrix_a, shuffle_matrix_sf_a)
         epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
 
         # Convert quantized weights to proper formats
         gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape(
-            num_experts, 2 * intermediate_size, hidden_size // 2)  # packed fp4
+            num_experts, 2 * intermediate_size, hidden_size // 2
+        )  # packed fp4
         gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view(
-            torch.float8_e4m3fn).reshape(num_experts, 2 * intermediate_size,
-                                         hidden_size //
-                                         16)  # fp8 scaling factors
+            torch.float8_e4m3fn
+        ).reshape(
+            num_experts, 2 * intermediate_size, hidden_size // 16
+        )  # fp8 scaling factors
 
         gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape(
-            num_experts, hidden_size, intermediate_size // 2)  # packed fp4
+            num_experts, hidden_size, intermediate_size // 2
+        )  # packed fp4
         gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view(
-            torch.float8_e4m3fn).reshape(num_experts, hidden_size,
-                                         intermediate_size //
-                                         16)  # fp8 scaling factors
+            torch.float8_e4m3fn
+        ).reshape(
+            num_experts, hidden_size, intermediate_size // 16
+        )  # fp8 scaling factors
 
-        # Reorder rows of W1 and scales for fused gated activation
-        gemm1_weights_fp4_interleaved = []
-        gemm1_scales_fp4_interleaved = []
-        for i in range(num_experts):
-            gemm1_weights_fp4_interleaved.append(
-                reorder_rows_for_gated_act_gemm(gemm1_weights_fp4[i].clone()))
-            gemm1_scales_fp4_interleaved.append(
-                reorder_rows_for_gated_act_gemm(
-                    gemm1_scales_linear_fp4[i].clone()))
-
-        # Stack weights and scales for all experts
-        gemm1_weights_fp4_interleaved = torch.stack(
-            gemm1_weights_fp4_interleaved).reshape(num_experts,
-                                                   2 * intermediate_size,
-                                                   hidden_size // 2)
-        gemm1_scales_fp4_interleaved = torch.stack(
-            gemm1_scales_fp4_interleaved).reshape(num_experts,
-                                                  2 * intermediate_size,
-                                                  hidden_size // 16)
-
-        # Shuffle weights and scaling factors for transposed mma output
         gemm1_weights_fp4_shuffled = []
         gemm1_scales_fp4_shuffled = []
         gemm2_weights_fp4_shuffled = []
         gemm2_scales_fp4_shuffled = []
         for i in range(num_experts):
+            # Calculate the permute indices for the following:
+            # 1. Reorder rows of W1 and scales for fused gated activation
+            # 2. Shuffle weights and scaling factors for transposed mma output
+            # for both w3_w1 and w2 weights and scale factors
+            permute_indices = _maybe_get_cached_w3_w1_permute_indices(
+                self._cache_permute_indices,
+                gemm1_weights_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
             gemm1_weights_fp4_shuffled.append(
-                shuffle_matrix_a(
-                    gemm1_weights_fp4_interleaved[i].view(torch.uint8),
-                    epilogue_tile_m))
+                gemm1_weights_fp4[i]
+                .view(torch.uint8)[permute_indices.to(gemm1_weights_fp4.device)]
+                .contiguous()
+            )
+
+            permute_sf_indices = _maybe_get_cached_w3_w1_permute_indices(
+                self._cache_permute_indices,
+                gemm1_scales_linear_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
             gemm1_scales_fp4_shuffled.append(
-                shuffle_matrix_sf_a(
-                    gemm1_scales_fp4_interleaved[i].view(torch.uint8),
-                    epilogue_tile_m))
+                nvfp4_block_scale_interleave(
+                    gemm1_scales_linear_fp4[i]
+                    .view(torch.uint8)[
+                        permute_sf_indices.to(gemm1_scales_linear_fp4.device)
+                    ]
+                    .contiguous()
+                )
+            )
 
+            permute_indices = get_w2_permute_indices_with_cache(
+                self._cache_permute_indices,
+                gemm2_weights_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
             gemm2_weights_fp4_shuffled.append(
-                shuffle_matrix_a(gemm2_weights_fp4[i].view(torch.uint8),
-                                 epilogue_tile_m))
+                gemm2_weights_fp4[i]
+                .view(torch.uint8)[permute_indices.to(gemm2_weights_fp4.device)]
+                .contiguous()
+            )
+
+            permute_sf_indices = get_w2_permute_indices_with_cache(
+                self._cache_permute_indices,
+                gemm2_scales_linear_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
             gemm2_scales_fp4_shuffled.append(
-                shuffle_matrix_sf_a(
-                    gemm2_scales_linear_fp4[i].view(torch.uint8),
-                    epilogue_tile_m))
+                nvfp4_block_scale_interleave(
+                    gemm2_scales_linear_fp4[i]
+                    .view(torch.uint8)[
+                        permute_sf_indices.to(gemm2_scales_linear_fp4.device)
+                    ]
+                    .contiguous()
+                )
+            )
 
         # Stack weights for all experts
         gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled)
         gemm1_scales_fp4_shuffled = (
-            torch.stack(gemm1_scales_fp4_shuffled).view(
-                torch.float8_e4m3fn).reshape(num_experts,
-                                             2 * intermediate_size,
-                                             hidden_size // 16))
+            torch.stack(gemm1_scales_fp4_shuffled)
+            .view(torch.float8_e4m3fn)
+            .reshape(num_experts, 2 * intermediate_size, hidden_size // 16)
+        )
 
         gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled)
         gemm2_scales_fp4_shuffled = (
-            torch.stack(gemm2_scales_fp4_shuffled).view(
-                torch.float8_e4m3fn).reshape(num_experts, hidden_size,
-                                             intermediate_size // 16))
-        return (gemm1_weights_fp4_shuffled, gemm1_scales_fp4_shuffled,
-                gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled)
+            torch.stack(gemm2_scales_fp4_shuffled)
+            .view(torch.float8_e4m3fn)
+            .reshape(num_experts, hidden_size, intermediate_size // 16)
+        )
+        return (
+            gemm1_weights_fp4_shuffled,
+            gemm1_scales_fp4_shuffled,
+            gemm2_weights_fp4_shuffled,
+            gemm2_scales_fp4_shuffled,
+        )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # GEMM 1 processing
@@ -1289,72 +1466,104 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
 
         if self.allow_flashinfer:
             gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1(
-                gemm1_weight, gemm1_weight_scale, dim=-2)
+                gemm1_weight, gemm1_weight_scale, dim=-2
+            )
 
         layer.w13_weight = Parameter(gemm1_weight, requires_grad=False)
-        layer.w13_weight_scale = Parameter(gemm1_weight_scale,
-                                           requires_grad=False)
+        layer.w13_weight_scale = Parameter(gemm1_weight_scale, requires_grad=False)
 
         # Common processing for w13_weight_scale_2
-        if not torch.allclose(layer.w13_weight_scale_2[:, 0],
-                              layer.w13_weight_scale_2[:, 1]):
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
             logger.warning_once(
                 "w1_weight_scale_2 must match w3_weight_scale_2. "
-                "Accuracy may be affected.")
+                "Accuracy may be affected."
+            )
 
         w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
-        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2,
-                                             requires_grad=False)
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
 
         # Common processing for input scales and alphas
-        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(
-            torch.float32)
+        use_global_sf = self.allow_flashinfer and is_flashinfer_supporting_global_sf(
+            self.flashinfer_moe_backend
+        )
+        if use_global_sf:
+            # For backends provide by Flashinfer, the input global scales are
+            # shared across all experts.
+            w13_input_scale = (
+                layer.w13_input_scale.max().to(torch.float32).expand(layer.num_experts)
+            )
+        else:
+            w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
         layer.g1_alphas = Parameter(
             (w13_input_scale * w13_weight_scale_2).to(torch.float32),
-            requires_grad=False)
+            requires_grad=False,
+        )
 
         # This is for quantization, so we need to invert it.
         layer.w13_input_scale_quant = Parameter(
-            (1 / w13_input_scale).to(torch.float32), requires_grad=False)
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
 
         # GEMM 2 processing
+        if use_global_sf:
+            # For backends provide by Flashinfer, the input global scales are
+            # shared across all experts.
+            w2_input_scale = (
+                layer.w2_input_scale.max().to(torch.float32).expand(layer.num_experts)
+            )
+        else:
+            w2_input_scale = layer.w2_input_scale
         layer.g2_alphas = Parameter(
-            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
-            requires_grad=False)
+            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
 
         # This is for quantization, so we need to invert it.
         layer.w2_input_scale_quant = Parameter(
-            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False)
+            (1 / w2_input_scale).to(torch.float32), requires_grad=False
+        )
 
         # TensorRT-LLM specific processing
-        if self.allow_flashinfer and \
-            self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+        if (
+            self.allow_flashinfer
+            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+        ):
             # Prepare static weights for TRT-LLM kernel
-            (gemm1_weights_fp4_shuffled, gemm1_scales_fp4_shuffled,
-             gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled
-             ) = self.prepare_static_weight_layouts_for_trtllm_moe(
-                 layer.w13_weight,
-                 layer.w2_weight,
-                 layer.w13_weight_scale,
-                 layer.w2_weight_scale,
-                 layer.w2_weight.size(-2),  # hidden_size
-                 layer.w13_weight.size(-2) // 2,  # intermediate_size
-                 layer.w13_weight.size(0),  # num_experts
-             )
+            # alternate: prepare_static_weight_layouts_for_trtllm_moe
+            (
+                gemm1_weights_fp4_shuffled,
+                gemm1_scales_fp4_shuffled,
+                gemm2_weights_fp4_shuffled,
+                gemm2_scales_fp4_shuffled,
+            ) = self.prepare_static_weights_for_trtllm_fp4_moe(
+                layer.w13_weight,
+                layer.w2_weight,
+                layer.w13_weight_scale,
+                layer.w2_weight_scale,
+                layer.w2_weight.size(-2),  # hidden_size
+                layer.w13_weight.size(-2) // 2,  # intermediate_size
+                layer.w13_weight.size(0),  # num_experts
+            )
+            logger.debug_once("Finished shuffling weights for TRT-LLM MOE")
 
             layer.gemm1_weights_fp4_shuffled = Parameter(
-                gemm1_weights_fp4_shuffled, requires_grad=False)
+                gemm1_weights_fp4_shuffled, requires_grad=False
+            )
             layer.gemm2_weights_fp4_shuffled = Parameter(
-                gemm2_weights_fp4_shuffled, requires_grad=False)
+                gemm2_weights_fp4_shuffled, requires_grad=False
+            )
             layer.gemm1_scales_fp4_shuffled = Parameter(
-                gemm1_scales_fp4_shuffled, requires_grad=False)
+                gemm1_scales_fp4_shuffled, requires_grad=False
+            )
             layer.gemm2_scales_fp4_shuffled = Parameter(
-                gemm2_scales_fp4_shuffled, requires_grad=False)
+                gemm2_scales_fp4_shuffled, requires_grad=False
+            )
 
             # Additional parameter needed for TRT-LLM
             layer.g1_scale_c = Parameter(
-                (layer.w2_input_scale_quant * layer.g1_alphas).to(
-                    torch.float32),
+                (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
                 requires_grad=False,
             )
 
@@ -1372,29 +1581,24 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             del layer.w2_input_scale_quant
         else:
             # Non-TRT-LLM processing (Cutlass or non-flashinfer)
-            assert (layer.w13_weight_scale.shape[2] % 16 == 0), (
-                "Expected weight_scale.dim(1) to be divisible by 16")
-            assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
-                "Weight Blockscale must be represented as FP8-E4M3")
-            w13_blockscale_swizzled = swizzle_blockscale(
-                layer.w13_weight_scale)
-            layer.w13_weight_scale = Parameter(w13_blockscale_swizzled,
-                                               requires_grad=False)
-
-            assert (layer.w2_weight_scale.shape[2] % 16 == 0), (
-                "Expected weight_scale.dim(1) to be divisible by 16")
-            assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
-                "Weight Blockscale must be represented as FP8-E4M3")
+            w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
+            layer.w13_weight_scale = Parameter(
+                w13_blockscale_swizzled, requires_grad=False
+            )
+
             w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
-            layer.w2_weight_scale = Parameter(w2_blockscale_swizzled,
-                                              requires_grad=False)
-            layer.w2_weight = Parameter(layer.w2_weight.data,
-                                        requires_grad=False)
+            layer.w2_weight_scale = Parameter(
+                w2_blockscale_swizzled, requires_grad=False
+            )
+            layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        if (self.use_marlin or self.flashinfer_moe_backend
-                == FlashinferMoeBackend.TENSORRT_LLM):
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        if (
+            self.use_marlin
+            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+        ):
             return None
 
         return nvfp4_moe_quant_config(
@@ -1414,81 +1618,89 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.")
+                "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
+            )
         assert activation == "silu", "Only SiLU activation is supported."
 
-        if (self.allow_flashinfer and self.flashinfer_moe_backend
-                == FlashinferMoeBackend.TENSORRT_LLM):
+        if (
+            self.allow_flashinfer
+            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
+        ):
             import flashinfer
 
             from vllm.model_executor.models.llama4 import Llama4MoE
 
-            assert self.fused_experts is None
-
             a1_gscale = layer.w13_input_scale_quant
-            (hidden_states_fp4,
-             hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
-                 x,
-                 a1_gscale,
-                 is_sf_swizzled_layout=False,
-             )
-            use_llama4_routing = \
+            (hidden_states_fp4, hidden_states_scale_linear_fp4) = (
+                flashinfer.fp4_quantize(
+                    x,
+                    a1_gscale,
+                    is_sf_swizzled_layout=False,
+                )
+            )
+            use_llama4_routing = (
                 custom_routing_function is Llama4MoE.custom_routing_function
-            routing_method_type = flashinfer.RoutingMethodType.DeepSeekV3
+            )
+            routing_method_type = layer.routing_method_type
             if use_llama4_routing:
-                routing_method_type = flashinfer.RoutingMethodType.Llama4
+                routing_method_type = RoutingMethodType.Llama4
+            router_logits = (
+                router_logits.to(torch.float32)
+                if routing_method_type == RoutingMethodType.DeepSeekV3
+                else router_logits
+            )
             routing_bias = e_score_correction_bias
             if routing_bias is not None:
                 routing_bias = routing_bias.to(torch.bfloat16)
             out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
-                routing_logits=router_logits
-                if use_llama4_routing else router_logits.to(torch.float32),
+                routing_logits=router_logits,
                 routing_bias=routing_bias,
                 hidden_states=hidden_states_fp4,
                 hidden_states_scale=hidden_states_scale_linear_fp4.view(
-                    torch.float8_e4m3fn).flatten(),
+                    torch.float8_e4m3fn
+                ).flatten(),
                 gemm1_weights=layer.gemm1_weights_fp4_shuffled.data,
                 gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view(
-                    torch.float8_e4m3fn),
+                    torch.float8_e4m3fn
+                ),
                 gemm1_bias=None,
                 gemm1_alpha=None,
                 gemm1_beta=None,
                 gemm1_clamp_limit=None,
                 gemm2_weights=layer.gemm2_weights_fp4_shuffled.data,
                 gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view(
-                    torch.float8_e4m3fn),
+                    torch.float8_e4m3fn
+                ),
                 gemm2_bias=None,
                 output1_scale_scalar=layer.g1_scale_c.data,
                 output1_scale_gate_scalar=layer.g1_alphas.data,
                 output2_scale_scalar=layer.g2_alphas.data,
                 num_experts=global_num_experts,
                 top_k=top_k,
-                n_group=num_expert_group
-                if num_expert_group is not None else 0,
-                topk_group=topk_group if topk_group is not None else 0,
+                n_group=num_expert_group,
+                topk_group=topk_group,
                 intermediate_size=layer.intermediate_size_per_partition,
                 local_expert_offset=layer.ep_rank * layer.local_num_experts,
                 local_num_experts=layer.local_num_experts,
                 routed_scaling_factor=None,
-                tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
-                                                     layer.local_num_experts),
+                tile_tokens_dim=None,
                 routing_method_type=routing_method_type,
                 do_finalize=True,
             )[0]
@@ -1506,16 +1718,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin or
-        # trtllm.
-        #
         if self.use_marlin:
-            assert self.fused_experts is None
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -1532,32 +1739,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
-                workspace=layer.workspace)
-
-        elif self.fused_experts is not None:
-            assert self.allow_flashinfer and \
-               self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
-
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight), (
-                    "Flashinfer CUTLASS Fused MoE not applicable!")
-
-            return self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                workspace=layer.workspace,
             )
-        elif (self.allow_flashinfer
-              and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS):
+
+        elif (
+            self.allow_flashinfer
+            and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+        ):
             from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-                flashinfer_cutlass_moe_fp4)
+                flashinfer_cutlass_moe_fp4,
+            )
+
             assert self.moe_quant_config is not None
 
             return flashinfer_cutlass_moe_fp4(
@@ -1576,8 +1768,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         else:
             # If no modular kernel is provided, use cutlass_moe_fp4 for TP case
             # only (no EP).
-            from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-                cutlass_moe_fp4)
+            from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
+
             assert self.moe_quant_config is not None
             return cutlass_moe_fp4(
                 a=x,
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index ee8d33e636f972e64e04e5fbf4ae6e879a9000b2..2090c86f78dc8f58a1bb7f4e7ddd28bee98986f4 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,23 +1,32 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 
 from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, int4_w4a16_moe_quant_config,
-    int8_w8a16_moe_quant_config)
+    FusedMoEQuantConfig,
+    int4_w4a16_moe_quant_config,
+    int8_w8a16_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
+    FusedMoE,
+    FusedMoEConfig,
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    check_marlin_supports_layer)
+    check_marlin_supports_layer,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
@@ -25,10 +34,16 @@ from vllm.platforms import current_platform
 class MoeWNA16Config(QuantizationConfig):
     """Config class for MOE WNA16 (W8A16/W4A16) quantization."""
 
-    def __init__(self, linear_quant_method: str, weight_bits: int,
-                 group_size: int, has_zp: bool, lm_head_quantized: bool,
-                 modules_to_not_convert: Optional[list[str]],
-                 full_config: dict[str, Any]) -> None:
+    def __init__(
+        self,
+        linear_quant_method: str,
+        weight_bits: int,
+        group_size: int,
+        has_zp: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: list[str] | None,
+        full_config: dict[str, Any],
+    ) -> None:
         super().__init__()
         self.weight_bits = weight_bits
         self.group_size = group_size
@@ -40,26 +55,25 @@ class MoeWNA16Config(QuantizationConfig):
         self.use_marlin = False
         # Avoid circular import
         from vllm.model_executor.layers.quantization.awq import AWQConfig
-        from vllm.model_executor.layers.quantization.awq_marlin import (
-            AWQMarlinConfig)
-        from vllm.model_executor.layers.quantization.gptq_marlin import (
-            GPTQMarlinConfig)
+        from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
+        from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
+
         if self.linear_quant_method == "gptq":
-            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(
-                full_config)
+            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
         elif self.linear_quant_method == "awq":
             capability_tuple = current_platform.get_device_capability()
-            device_capability = (-1 if capability_tuple is None else
-                                 capability_tuple.to_int())
+            device_capability = (
+                -1 if capability_tuple is None else capability_tuple.to_int()
+            )
             awq_min_capability = AWQConfig.get_min_capability()
             if device_capability < awq_min_capability:
                 raise ValueError(
                     "The quantization method moe_wna16 + awq is not supported "
                     "for the current GPU. "
                     f"Minimum capability: {awq_min_capability}. "
-                    f"Current capability: {device_capability}.")
-            self.use_marlin = AWQMarlinConfig.is_awq_marlin_compatible(
-                full_config)
+                    f"Current capability: {device_capability}."
+                )
+            self.use_marlin = AWQMarlinConfig.is_awq_marlin_compatible(full_config)
         else:
             raise ValueError("moe_wna16 only support gptq and awq.")
 
@@ -89,24 +103,32 @@ class MoeWNA16Config(QuantizationConfig):
         linear_quant_method = cls.get_from_keys(config, ["quant_method"])
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
-        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
-                                                 default=False)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
         if linear_quant_method == "gptq":
             has_zp = not cls.get_from_keys(config, ["sym"])
             modules_to_not_convert = []
         elif linear_quant_method == "awq":
             has_zp = cls.get_from_keys(config, ["zero_point"])
             modules_to_not_convert = cls.get_from_keys_or(
-                config, ["modules_to_not_convert"], None)
+                config, ["modules_to_not_convert"], None
+            )
         else:
             raise ValueError("moe_wna16 only support gptq and awq.")
 
-        return cls(linear_quant_method, weight_bits, group_size, has_zp,
-                   lm_head_quantized, modules_to_not_convert, config)
+        return cls(
+            linear_quant_method,
+            weight_bits,
+            group_size,
+            has_zp,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg)
         if can_convert and user_quant == "moe_wna16":
             return cls.get_name()
@@ -120,46 +142,59 @@ class MoeWNA16Config(QuantizationConfig):
         desc_act = quant_config.get("desc_act")
 
         capability_tuple = current_platform.get_device_capability()
-        device_capability = (-1 if capability_tuple is None else
-                             capability_tuple.to_int())
+        device_capability = (
+            -1 if capability_tuple is None else capability_tuple.to_int()
+        )
         # Avoid circular import
         from vllm.model_executor.layers.quantization.awq import AWQConfig
+
         awq_min_capability = AWQConfig.get_min_capability()
 
-        gptq_compatible = quant_method == "gptq" and \
-            not desc_act and num_bits in [4, 8]
-        awq_compatible = quant_method == "awq" and num_bits == 4 and \
-            device_capability >= awq_min_capability
+        gptq_compatible = quant_method == "gptq" and not desc_act and num_bits in [4, 8]
+        awq_compatible = (
+            quant_method == "awq"
+            and num_bits == 4
+            and device_capability >= awq_min_capability
+        )
 
         return gptq_compatible or awq_compatible
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
             return UnquantizedLinearMethod()
         elif isinstance(layer, LinearBase):
             # Avoid circular import
             from vllm.model_executor.layers.quantization.awq import AWQConfig
             from vllm.model_executor.layers.quantization.awq_marlin import (
-                AWQMarlinConfig)
+                AWQMarlinConfig,
+            )
             from vllm.model_executor.layers.quantization.gptq import GPTQConfig
             from vllm.model_executor.layers.quantization.gptq_marlin import (
-                GPTQMarlinConfig)
+                GPTQMarlinConfig,
+            )
+
             if self.linear_quant_method == "gptq":
                 if self.use_marlin:
                     return GPTQMarlinConfig.from_config(
-                        self.full_config).get_quant_method(layer, prefix)
+                        self.full_config
+                    ).get_quant_method(layer, prefix)
                 else:
-                    return GPTQConfig.from_config(
-                        self.full_config).get_quant_method(layer, prefix)
+                    return GPTQConfig.from_config(self.full_config).get_quant_method(
+                        layer, prefix
+                    )
             elif self.linear_quant_method == "awq":
                 if self.use_marlin and check_marlin_supports_layer(
-                        layer, self.group_size):
+                    layer, self.group_size
+                ):
                     return AWQMarlinConfig.from_config(
-                        self.full_config).get_quant_method(layer, prefix)
+                        self.full_config
+                    ).get_quant_method(layer, prefix)
                 else:
-                    return AWQConfig.from_config(
-                        self.full_config).get_quant_method(layer, prefix)
+                    return AWQConfig.from_config(self.full_config).get_quant_method(
+                        layer, prefix
+                    )
             else:
                 raise ValueError("moe_wna16 only support gptq and awq.")
         elif isinstance(layer, FusedMoE):
@@ -178,16 +213,19 @@ class MoeWNA16Method(FusedMoEMethodBase):
         quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
     """
 
-    def __init__(self, quant_config: MoeWNA16Config,
-                 moe: "FusedMoEConfig") -> None:
+    def __init__(self, quant_config: MoeWNA16Config, moe: "FusedMoEConfig") -> None:
         super().__init__(moe)
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
-        self.moe = layer
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         layer.quant_config = self.quant_config
         bit8_pack_factor = self.quant_config.bit8_pack_factor
         group_size = self.quant_config.group_size
@@ -196,8 +234,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
         # make intermediate_size and hidden_size divisible by group_size
         # we reduce the group size to ensure that
         # and we would repeat the loaded_weight later
-        while intermediate_size_per_partition % group_size or \
-                hidden_size % group_size:
+        while intermediate_size_per_partition % group_size or hidden_size % group_size:
             group_size = group_size // 2
             group_size_div_factor *= 2
             assert group_size >= 32
@@ -205,71 +242,85 @@ class MoeWNA16Method(FusedMoEMethodBase):
         layer.group_size_div_factor = group_size_div_factor
 
         strategy = FusedMoeWeightScaleSupported.GROUP.value
-        extra_weight_attrs.update({
-            "quant_method": strategy,
-            "is_transposed": False
-        })
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": False})
 
-        assert 'weight_loader' in extra_weight_attrs
-        weight_loader = extra_weight_attrs['weight_loader']
-        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(
-            layer, weight_loader)
-        extra_weight_attrs['weight_loader'] = wrapped_weight_loader
+        assert "weight_loader" in extra_weight_attrs
+        weight_loader = extra_weight_attrs["weight_loader"]
+        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(layer, weight_loader)
+        extra_weight_attrs["weight_loader"] = wrapped_weight_loader
 
         # Fused gate_up_proj (column parallel)
-        w13_qweight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size // bit8_pack_factor,
-            dtype=torch.uint8),
-                                         requires_grad=False)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_qweight", w13_qweight)
         set_weight_attrs(w13_qweight, extra_weight_attrs)
 
         # down_proj (row parallel)
-        w2_qweight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition // bit8_pack_factor,
-            dtype=torch.uint8),
-                                        requires_grad=False)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_qweight", w2_qweight)
         set_weight_attrs(w2_qweight, extra_weight_attrs)
 
-        w13_scales = torch.nn.Parameter(torch.zeros(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size // group_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_scales", w13_scales)
         set_weight_attrs(w13_scales, extra_weight_attrs)
 
-        w2_scales = torch.nn.Parameter(torch.zeros(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition // group_size,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
 
         if self.quant_config.has_zp:
-            w13_qzeros = torch.nn.Parameter(torch.zeros(
-                num_experts,
-                2 * intermediate_size_per_partition // bit8_pack_factor,
-                hidden_size // group_size,
-                dtype=torch.uint8),
-                                            requires_grad=False)
+            w13_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    2 * intermediate_size_per_partition // bit8_pack_factor,
+                    hidden_size // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
             layer.register_parameter("w13_qzeros", w13_qzeros)
             set_weight_attrs(w13_qzeros, extra_weight_attrs)
 
-            w2_qzeros = torch.nn.Parameter(torch.zeros(
-                num_experts,
-                hidden_size // bit8_pack_factor,
-                intermediate_size_per_partition // group_size,
-                dtype=torch.uint8),
-                                           requires_grad=False)
+            w2_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    hidden_size // bit8_pack_factor,
+                    intermediate_size_per_partition // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
             layer.register_parameter("w2_qzeros", w2_qzeros)
             set_weight_attrs(w2_qzeros, extra_weight_attrs)
 
@@ -280,19 +331,23 @@ class MoeWNA16Method(FusedMoEMethodBase):
             if not self.quant_config.has_zp:
                 invalid_param_keys += ["w13_qzeros", "w2_qzeros"]
             for key in invalid_param_keys:
-                param = torch.nn.Parameter(torch.empty((0, ),
-                                                       dtype=torch.int32),
-                                           requires_grad=False)
+                param = torch.nn.Parameter(
+                    torch.empty((0,), dtype=torch.int32), requires_grad=False
+                )
                 layer.register_parameter(key, param)
                 set_weight_attrs(param, extra_weight_attrs)
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         weight_bits = self.quant_config.weight_bits
         has_zp = self.quant_config.has_zp
         assert weight_bits == 4 or weight_bits == 8
-        config_builder = (int4_w4a16_moe_quant_config
-                          if weight_bits == 4 else int8_w8a16_moe_quant_config)
+        config_builder = (
+            int4_w4a16_moe_quant_config
+            if weight_bits == 4
+            else int8_w8a16_moe_quant_config
+        )
 
         return config_builder(
             w1_scale=layer.w13_scales,
@@ -310,27 +365,26 @@ class MoeWNA16Method(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `MoeWNA16Method` yet.")
+            raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
 
         from vllm.model_executor.layers.fused_moe import fused_experts
+
         assert activation == "silu", "Only SiLU activation is supported."
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -344,7 +398,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
         return fused_experts(
             x,
@@ -361,7 +416,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
 
     @staticmethod
     def get_weight_loader(layer, weight_loader):
-
         def convert_awq_tensor(tensor, tensor_type):
             # convert awq qweight/qzeros to a standard format (assume int4)
             # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8)
@@ -377,9 +431,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
 
             # 2. unpack to uint4 (only when weight_bits == 4)
             #    shape (a, 4 * b) -> (a, 4 * b, 2)
-            shifter = torch.tensor([0, 4],
-                                   dtype=torch.uint8,
-                                   device=tensor.device)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
             tensor = (tensor[:, :, None] >> shifter) & 0xF
 
             # 3. change order, see
@@ -404,20 +456,20 @@ class MoeWNA16Method(FusedMoEMethodBase):
 
         def convert_gptq_int4_qzeros(tensor):
             tensor = tensor.view(torch.uint8)
-            shifter = torch.tensor([0, 4],
-                                   dtype=torch.uint8,
-                                   device=tensor.device)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
             tensor = (tensor[:, :, None] >> shifter) & 0xF
             tensor = tensor + 1
             tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16
             return tensor
 
-        def moe_wna16_weight_loader(param: torch.nn.Parameter,
-                                    loaded_weight: torch.Tensor,
-                                    weight_name: str,
-                                    shard_id: str,
-                                    expert_id: int,
-                                    return_success: bool = False):
+        def moe_wna16_weight_loader(
+            param: torch.nn.Parameter,
+            loaded_weight: torch.Tensor,
+            weight_name: str,
+            shard_id: str,
+            expert_id: int,
+            return_success: bool = False,
+        ):
             if "g_idx" in weight_name:
                 return False if return_success else None
             if not layer.quant_config.has_zp and "qzeros" in weight_name:
@@ -432,8 +484,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
             if layer.quant_config.linear_quant_method == "awq":
                 assert layer.quant_config.weight_bits == 4
                 if "weight" in weight_name:
-                    loaded_weight = convert_awq_tensor(loaded_weight,
-                                                       "qweight")
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
                 elif "zeros" in weight_name:
                     loaded_weight = convert_awq_tensor(loaded_weight, "qzeros")
                 else:
@@ -441,44 +492,50 @@ class MoeWNA16Method(FusedMoEMethodBase):
             elif layer.quant_config.linear_quant_method == "gptq":
                 assert layer.quant_config.weight_bits in [4, 8]
                 if "weight" in weight_name:
-                    loaded_weight = loaded_weight.T.contiguous().view(
-                        torch.uint8)
+                    loaded_weight = loaded_weight.T.contiguous().view(torch.uint8)
                 elif "zeros" in weight_name:
                     # add 1 to gptq qzeros to align with awq
                     loaded_weight = loaded_weight.view(torch.uint8)
                     if layer.quant_config.weight_bits == 4:
-                        loaded_weight = convert_gptq_int4_qzeros(
-                            loaded_weight).T
+                        loaded_weight = convert_gptq_int4_qzeros(loaded_weight).T
                     else:
                         loaded_weight = loaded_weight.T + 1
                 else:
                     loaded_weight = loaded_weight.T
 
             # repeat the qzeros/scales to fit new group size
-            if layer.group_size_div_factor > 1 and \
-                    "qzeros" in weight_name or "scales" in weight_name:
+            if (
+                layer.group_size_div_factor > 1
+                and "qzeros" in weight_name
+                or "scales" in weight_name
+            ):
                 loaded_weight = loaded_weight.repeat_interleave(
-                    layer.group_size_div_factor, 1)
+                    layer.group_size_div_factor, 1
+                )
 
             if "w13_qzeros" in weight_name:
-                tensor = loaded_weight.view(layer.tp_size, -1,
-                                            loaded_weight.size(1))[tp_rank]
+                tensor = loaded_weight.view(layer.tp_size, -1, loaded_weight.size(1))[
+                    tp_rank
+                ]
                 if shard_id == "w1":
-                    param.data[expert_id, :shard_size // 2] = tensor
+                    param.data[expert_id, : shard_size // 2] = tensor
                 else:
-                    param.data[expert_id, shard_size // 2:] = tensor
+                    param.data[expert_id, shard_size // 2 :] = tensor
                 return True if return_success else None
             elif "w2_qzeros" in weight_name:
                 param.data[expert_id] = loaded_weight.view(
-                    loaded_weight.size(0), layer.tp_size, -1)[:, tp_rank]
+                    loaded_weight.size(0), layer.tp_size, -1
+                )[:, tp_rank]
                 return True if return_success else None
             else:
                 # Delegate to the original loader, passing return_success
-                return weight_loader(param,
-                                     loaded_weight,
-                                     weight_name,
-                                     shard_id,
-                                     expert_id,
-                                     return_success=return_success)
+                return weight_loader(
+                    param,
+                    loaded_weight,
+                    weight_name,
+                    shard_id,
+                    expert_id,
+                    return_success=return_success,
+                )
 
         return moe_wna16_weight_loader
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 54194b2e7d5b08f0870004d2ce02fb6994455221..b95d1a6b3a1f55b5c73fb1bae4e77ca73b21ad79 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
 from enum import Enum
-from typing import Callable, Optional, Union
+from typing import Optional
 
 import torch
 from torch.nn.parameter import Parameter
@@ -9,32 +10,49 @@ from torch.nn.parameter import Parameter
 from vllm import envs
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
-                                                  FusedMoEMethodBase)
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoE,
+    FusedMoEConfig,
+    FusedMoEMethodBase,
+)
 from vllm.model_executor.layers.fused_moe import modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config,
-    mxfp4_w4a16_moe_quant_config)
+    FusedMoEQuantConfig,
+    mxfp4_mxfp8_moe_quant_config,
+    mxfp4_w4a16_moe_quant_config,
+    ocp_mx_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
+    MarlinExperts,
+    fused_marlin_moe,
+)
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
-    OAITritonExperts)
+    OAITritonExperts,
+)
 from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    prepare_moe_fp4_layer_for_marlin)
+    prepare_moe_fp4_layer_for_marlin,
+)
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    _can_support_mxfp4, _swizzle_mxfp4)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    is_layer_skipped)
+    _can_support_mxfp4,
+    _swizzle_mxfp4,
+    get_padding_alignment,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
-from vllm.utils import (has_triton_kernels, is_torch_equal_or_newer,
-                        next_power_of_2, round_up)
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 logger = init_logger(__name__)
 
@@ -56,50 +74,78 @@ class Mxfp4Backend(Enum):
     TRITON = 6
 
 
-def get_mxfp4_backend():
+def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
+    """
+    Not all MXFP4 backends support LoRA. Select backends that are known to
+    have LoRA support.
+    """
+    if not current_platform.is_cuda():
+        return Mxfp4Backend.NONE
+
+    logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
+    return Mxfp4Backend.MARLIN
+
+
+def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
     # Backend Selection
+
+    if with_lora_support:
+        return get_mxfp4_backend_with_lora()
+
     if current_platform.is_cuda():
-        if (current_platform.is_device_capability(90) and has_flashinfer()
-                and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        if (
+            current_platform.is_device_capability(90)
+            and has_flashinfer()
+            and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
+        ):
             logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90")
             return Mxfp4Backend.SM90_FI_MXFP4_BF16
-        elif (current_platform.is_device_capability(100) and has_flashinfer()
-              and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS):
-            logger.info_once(
-                "Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
+        elif (
+            current_platform.is_device_capability(100)
+            and has_flashinfer()
+            and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
+        ):
+            logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
             return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-        elif (current_platform.is_device_capability(100) and has_flashinfer()
-              and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8):
-            logger.info_once(
-                "Using FlashInfer MXFP4 MXFP8 TRTLLM backend for SM100, "
-                "for high concurrency throughput workloads consider setting "
-                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS=1 for better "
-                "performance")
+        elif (
+            current_platform.is_device_capability(100)
+            and has_flashinfer()
+            and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+        ):
             return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
         elif current_platform.is_device_capability(100) and has_flashinfer():
             logger.info_once(
                 "Using FlashInfer MXFP4 BF16 backend for SM100, "
                 "For faster performance on SM100, consider setting "
                 "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact "
-                "accuracy.")
+                "accuracy."
+            )
             return Mxfp4Backend.SM100_FI_MXFP4_BF16
-        elif ((current_platform.is_device_capability(100)
-               or current_platform.is_device_capability(90))
-              and not has_flashinfer()):
+        elif (
+            current_platform.is_device_capability(100)
+            or current_platform.is_device_capability(90)
+        ) and not has_flashinfer():
             logger.warning_once(
                 "MXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer "
                 "is not available. This may result in degraded performance. "
-                "Please `pip install vllm[flashinfer]` for best results.")
+                "Please `pip install vllm[flashinfer]` for best results."
+            )
 
         # If FlashInfer is not available, try either Marlin or Triton
-        if current_platform.get_device_capability(
-        )[0] < 9 or not has_triton_kernels() or not is_torch_equal_or_newer(
-                "2.8.0"):
+        if (
+            envs.VLLM_MXFP4_USE_MARLIN
+            or current_platform.get_device_capability()[0] < 9
+            or not has_triton_kernels()
+            or not is_torch_equal_or_newer("2.8.0")
+        ):
             logger.info_once("Using Marlin backend")
             return Mxfp4Backend.MARLIN
         else:
             logger.info_once("Using Triton backend")
             return Mxfp4Backend.TRITON
+    elif current_platform.is_xpu():
+        logger.info_once("Using ipex marlin backend on XPU")
+        return Mxfp4Backend.MARLIN
     elif current_platform.is_rocm() and has_triton_kernels():
         logger.info_once("Using Triton backend")
         return Mxfp4Backend.TRITON
@@ -108,8 +154,7 @@ def get_mxfp4_backend():
 
 
 class Mxfp4Config(QuantizationConfig):
-
-    def __init__(self, ignored_layers: Optional[list[str]] = None):
+    def __init__(self, ignored_layers: list[str] | None = None):
         super().__init__()
         self.ignored_layers = ignored_layers
 
@@ -133,43 +178,65 @@ class Mxfp4Config(QuantizationConfig):
     def get_config_filenames(cls) -> list[str]:
         return []
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
 
         if isinstance(layer, LinearBase):
             if self.ignored_layers and is_layer_skipped(
-                    prefix=prefix,
-                    ignored_layers=self.ignored_layers,
-                    fused_mapping=self.packed_modules_mapping):
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
                 return UnquantizedLinearMethod()
-            raise NotImplementedError("Mxfp4 linear layer is not implemented")
+            # TODO: Add support for MXFP4 Linear Method.
+            # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation
+            # if you are interested in enabling MXFP4 here.
+            logger.warning_once(
+                "MXFP4 linear layer is not implemented - falling back to "
+                "UnquantizedLinearMethod."
+            )
+            return UnquantizedLinearMethod()
         elif isinstance(layer, FusedMoE):
-            return Mxfp4MoEMethod(layer.moe_config)
+            if current_platform.is_xpu():
+                return IpexMxfp4MoEMethod(layer.moe_config)
+            else:
+                return Mxfp4MoEMethod(layer.moe_config)
         elif isinstance(layer, Attention):
-            raise NotImplementedError(
-                "Mxfp4 attention layer is not implemented")
+            # TODO: Add support for MXFP4 Attention.
+            logger.warning_once(
+                "MXFP4 attention layer is not implemented. "
+                "Skipping quantization for this layer."
+            )
         return None
 
 
 class Mxfp4MoEMethod(FusedMoEMethodBase):
-
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
-        self.topk_indices_dtype = None
-        self.moe = moe
-        self.mxfp4_backend = get_mxfp4_backend()
-        self.max_capture_size = get_current_vllm_config(
-        ).compilation_config.max_capture_size
+        self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
+        self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN
+        self.max_capture_size = (
+            get_current_vllm_config().compilation_config.max_cudagraph_capture_size
+        )
 
         assert self.mxfp4_backend != Mxfp4Backend.NONE, (
-            "No MXFP4 MoE backend (FlashInfer/Marlin/Triton) available."
-            "Please check your environment and try again.")
+            f"get_mxfp4_backend(with_lora_support={moe.is_lora_enabled}) found"
+            "no compatible MXFP4 MoE backend (FlashInfer/Marlin/Triton)."
+            "Please check your environment and try again."
+        )
         self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         self.num_experts = num_experts
         weight_dtype = torch.uint8
         scale_dtype = torch.uint8
@@ -184,8 +251,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
 
         mxfp4_block = 32
 
-        intermediate_size_per_partition_after_pad = \
-            intermediate_size_per_partition
+        intermediate_size_per_partition_after_pad = intermediate_size_per_partition
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             # The moe marlin kernel requires that for each linear
             # n % 256 == 0 and k % 128 == 0.
@@ -196,34 +262,48 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             #    n = hidden_size
             #    k = intermediate_size_per_partition_after_pad
             intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 128)
-            hidden_size = round_up(hidden_size, 256)
+                intermediate_size_per_partition, 128
+            )
+            if current_platform.is_xpu():
+                hidden_size = round_up(hidden_size, 128)
+            else:
+                hidden_size = round_up(hidden_size, 256)
 
             layer.params_dtype = params_dtype
             layer.num_experts = num_experts
             layer.hidden_size = hidden_size
-            layer.intermediate_size_per_partition = \
+            layer.intermediate_size_per_partition = (
                 intermediate_size_per_partition_after_pad
-        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-              or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
+            )
+        elif (
+            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
+        ):
             # pad the intermediate size to be a multiple of 2 * mxfp4_block
             # for to hold non-uniform sharded tensor as well as swizzling
             # other padding to increase performance
             intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 256)
+                intermediate_size_per_partition, 256
+            )
             hidden_size = round_up(hidden_size, 256)
-        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-              or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
+        elif (
+            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+            or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
+        ):
             intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 128)
+                intermediate_size_per_partition, 128
+            )
             hidden_size = round_up(hidden_size, 128)
         elif current_platform.is_rocm():
+            pad_align = get_padding_alignment()
             intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 256)
-            hidden_size = round_up(hidden_size, 256)
+                intermediate_size_per_partition, pad_align
+            )
+            hidden_size = round_up(hidden_size, pad_align)
         else:
             intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 64)
+                intermediate_size_per_partition, 64
+            )
 
         self.intermediate_size = intermediate_size_per_partition_after_pad
         self.hidden_size = hidden_size
@@ -302,47 +382,61 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
     def process_weights_after_loading(self, layer):
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             prepare_moe_fp4_layer_for_marlin(layer)
-        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-              or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
-            from flashinfer.fp4_quantization import (
-                nvfp4_block_scale_interleave)
-            from flashinfer.fused_moe.core import (
-                _maybe_get_cached_w2_permute_indices)
-            layer.gemm1_alpha = Parameter(torch.tensor(
-                [1.702] * self.num_experts, dtype=torch.float32).cuda(),
-                                          requires_grad=False)
-            layer.gemm1_beta = Parameter(torch.tensor(
-                [1.0] * self.num_experts, dtype=torch.float32).cuda(),
-                                         requires_grad=False)
-            layer.gemm1_clamp_limit = Parameter(torch.tensor(
-                [7.0] * self.num_experts, dtype=torch.float32).cuda(),
-                                                requires_grad=False)
+        elif (
+            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
+        ):
+            from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
+            from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache
+
+            layer.gemm1_alpha = Parameter(
+                torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_beta = Parameter(
+                torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_clamp_limit = Parameter(
+                torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
             sf_block_size = 32  # mxfp4 block size
 
-            assert (layer.w13_weight.dim() == 3
-                    and layer.w13_weight.shape[0] == self.num_experts
-                    and layer.w13_weight.shape[1] == self.intermediate_size * 2
-                    and layer.w13_weight.shape[2] == self.hidden_size // 2)
-            assert (layer.w13_weight_scale.dim() == 3
-                    and layer.w13_weight_scale.shape[0] == self.num_experts
-                    and layer.w13_weight_scale.shape[1]
-                    == self.intermediate_size * 2
-                    and layer.w13_weight_scale.shape[2]
-                    == self.hidden_size // sf_block_size)
-            assert (layer.w2_weight.dim() == 3
-                    and layer.w2_weight.shape[0] == self.num_experts
-                    and layer.w2_weight.shape[1] == self.hidden_size and
-                    layer.w2_weight.shape[2] == self.intermediate_size // 2)
-            assert (layer.w2_weight_scale.dim() == 3
-                    and layer.w2_weight_scale.shape[1] == self.hidden_size
-                    and layer.w2_weight_scale.shape[2]
-                    == self.intermediate_size // sf_block_size)
-            assert (layer.w13_bias.dim() == 2
-                    and layer.w13_bias.shape[0] == self.num_experts
-                    and layer.w13_bias.shape[1] == self.intermediate_size * 2)
-            assert (layer.w2_bias.dim() == 2
-                    and layer.w2_bias.shape[0] == self.num_experts
-                    and layer.w2_bias.shape[1] == self.hidden_size)
+            assert (
+                layer.w13_weight.dim() == 3
+                and layer.w13_weight.shape[0] == self.num_experts
+                and layer.w13_weight.shape[1] == self.intermediate_size * 2
+                and layer.w13_weight.shape[2] == self.hidden_size // 2
+            )
+            assert (
+                layer.w13_weight_scale.dim() == 3
+                and layer.w13_weight_scale.shape[0] == self.num_experts
+                and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2
+                and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size
+            )
+            assert (
+                layer.w2_weight.dim() == 3
+                and layer.w2_weight.shape[0] == self.num_experts
+                and layer.w2_weight.shape[1] == self.hidden_size
+                and layer.w2_weight.shape[2] == self.intermediate_size // 2
+            )
+            assert (
+                layer.w2_weight_scale.dim() == 3
+                and layer.w2_weight_scale.shape[1] == self.hidden_size
+                and layer.w2_weight_scale.shape[2]
+                == self.intermediate_size // sf_block_size
+            )
+            assert (
+                layer.w13_bias.dim() == 2
+                and layer.w13_bias.shape[0] == self.num_experts
+                and layer.w13_bias.shape[1] == self.intermediate_size * 2
+            )
+            assert (
+                layer.w2_bias.dim() == 2
+                and layer.w2_bias.shape[0] == self.num_experts
+                and layer.w2_bias.shape[1] == self.hidden_size
+            )
 
             w13_weight_scale = layer.w13_weight_scale.data
             w2_weight_scale = layer.w2_weight_scale.data
@@ -385,126 +479,172 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
             for i in range(self.num_experts):
                 # w13 weight shuffling
-                permute_indices = _maybe_get_cached_w2_permute_indices(
+                permute_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
                     w13_weight[i].view(torch.uint8),
                     epilogue_tile_m,
                 )
-                gemm1_weights_mxfp4_shuffled.append(w13_weight[i].view(
-                    torch.uint8)[permute_indices.to(
-                        w13_weight.device)].contiguous())
+                gemm1_weights_mxfp4_shuffled.append(
+                    w13_weight[i]
+                    .view(torch.uint8)[permute_indices.to(w13_weight.device)]
+                    .contiguous()
+                )
                 # w13 scale shuffling
-                permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                permute_sf_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
                     w13_weight_scale[i].view(torch.uint8),
                     epilogue_tile_m,
                     num_elts_per_sf=16,
                 )
                 gemm1_scales_mxfp4_shuffled.append(
-                    nvfp4_block_scale_interleave(w13_weight_scale[i].view(
-                        torch.uint8)[permute_sf_indices.to(
-                            w13_weight_scale.device)].contiguous()))
+                    nvfp4_block_scale_interleave(
+                        w13_weight_scale[i]
+                        .view(torch.uint8)[
+                            permute_sf_indices.to(w13_weight_scale.device)
+                        ]
+                        .contiguous()
+                    )
+                )
                 # w13 bias shuffling
-                permute_bias_indices = _maybe_get_cached_w2_permute_indices(
+                permute_bias_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
                     w13_bias[i].clone().reshape(-1, 1),
                     epilogue_tile_m,
                 )
-                gemm1_bias_shuffled.append(w13_bias[i].clone().reshape(
-                    -1,
-                    1)[permute_bias_indices.to(w13_bias.device)].contiguous())
+                gemm1_bias_shuffled.append(
+                    w13_bias[i]
+                    .clone()
+                    .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)]
+                    .contiguous()
+                )
                 # w2 weight shuffling
-                permute_indices = _maybe_get_cached_w2_permute_indices(
+                permute_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
                     w2_weight[i].view(torch.uint8),
                     epilogue_tile_m,
                 )
-                gemm2_weights_mxfp4_shuffled.append(w2_weight[i].view(
-                    torch.uint8)[permute_indices.to(
-                        w2_weight.device)].contiguous())
+                gemm2_weights_mxfp4_shuffled.append(
+                    w2_weight[i]
+                    .view(torch.uint8)[permute_indices.to(w2_weight.device)]
+                    .contiguous()
+                )
                 # w2 scale shuffling
-                permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                permute_sf_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
                     w2_weight_scale[i].view(torch.uint8),
                     epilogue_tile_m,
                     num_elts_per_sf=16,
                 )
                 gemm2_scales_mxfp4_shuffled.append(
-                    nvfp4_block_scale_interleave(w2_weight_scale[i].view(
-                        torch.uint8)[permute_sf_indices.to(
-                            w2_weight_scale.device)].contiguous()))
+                    nvfp4_block_scale_interleave(
+                        w2_weight_scale[i]
+                        .view(torch.uint8)[
+                            permute_sf_indices.to(w2_weight_scale.device)
+                        ]
+                        .contiguous()
+                    )
+                )
                 # w2 bias shuffling
-                permute_indices = _maybe_get_cached_w2_permute_indices(
+                permute_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
                     w2_bias[i].clone().reshape(-1, 1),
                     epilogue_tile_m,
                 )
-                gemm2_bias_shuffled.append(w2_bias[i].clone().reshape(
-                    -1, 1)[permute_indices.to(w2_bias.device)].contiguous())
+                gemm2_bias_shuffled.append(
+                    w2_bias[i]
+                    .clone()
+                    .reshape(-1, 1)[permute_indices.to(w2_bias.device)]
+                    .contiguous()
+                )
 
             w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled)
-            w13_weight_scale = torch.stack(
-                gemm1_scales_mxfp4_shuffled).reshape(
-                    self.num_experts, 2 * self.intermediate_size,
-                    self.hidden_size // sf_block_size).view(
-                        torch.float8_e4m3fn)
+            w13_weight_scale = (
+                torch.stack(gemm1_scales_mxfp4_shuffled)
+                .reshape(
+                    self.num_experts,
+                    2 * self.intermediate_size,
+                    self.hidden_size // sf_block_size,
+                )
+                .view(torch.float8_e4m3fn)
+            )
 
             w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled)
-            w2_weight_scale = torch.stack(gemm2_scales_mxfp4_shuffled).reshape(
-                self.num_experts, self.hidden_size, self.intermediate_size //
-                sf_block_size).view(torch.float8_e4m3fn)
+            w2_weight_scale = (
+                torch.stack(gemm2_scales_mxfp4_shuffled)
+                .reshape(
+                    self.num_experts,
+                    self.hidden_size,
+                    self.intermediate_size // sf_block_size,
+                )
+                .view(torch.float8_e4m3fn)
+            )
 
             layer.w13_weight = Parameter(w13_weight, requires_grad=False)
-            layer.w13_weight_scale = Parameter(w13_weight_scale,
-                                               requires_grad=False)
+            layer.w13_weight_scale = Parameter(w13_weight_scale, requires_grad=False)
             layer.w2_weight = Parameter(w2_weight, requires_grad=False)
-            layer.w2_weight_scale = Parameter(w2_weight_scale,
-                                              requires_grad=False)
+            layer.w2_weight_scale = Parameter(w2_weight_scale, requires_grad=False)
             layer.w13_bias = Parameter(
                 torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1),
-                requires_grad=False)
-            layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape(
-                self.num_experts, -1),
-                                      requires_grad=False)
-        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-              or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
-            layer.gemm1_alpha = Parameter(torch.tensor(
-                [1.702] * self.num_experts, dtype=torch.float32).cuda(),
-                                          requires_grad=False)
-            layer.gemm1_beta = Parameter(torch.tensor(
-                [1.0] * self.num_experts, dtype=torch.float32).cuda(),
-                                         requires_grad=False)
-            layer.gemm1_clamp_limit = Parameter(torch.tensor(
-                [7.0] * self.num_experts, dtype=torch.float32).cuda(),
-                                                requires_grad=False)
+                requires_grad=False,
+            )
+            layer.w2_bias = Parameter(
+                torch.stack(gemm2_bias_shuffled).reshape(self.num_experts, -1),
+                requires_grad=False,
+            )
+        elif (
+            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+            or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
+        ):
+            layer.gemm1_alpha = Parameter(
+                torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_beta = Parameter(
+                torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_clamp_limit = Parameter(
+                torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
 
             sf_block_size = 32  # mxfp4 block size
 
             # Common shape assertions
-            assert (layer.w13_weight.dim() == 3
-                    and layer.w13_weight.shape[0] == self.num_experts
-                    and layer.w13_weight.shape[1] == self.intermediate_size * 2
-                    and layer.w13_weight.shape[2] == self.hidden_size // 2)
-            assert (layer.w13_weight_scale.dim() == 3
-                    and layer.w13_weight_scale.shape[0] == self.num_experts
-                    and layer.w13_weight_scale.shape[1]
-                    == self.intermediate_size * 2
-                    and layer.w13_weight_scale.shape[2]
-                    == self.hidden_size // sf_block_size)
-            assert (layer.w2_weight.dim() == 3
-                    and layer.w2_weight.shape[0] == self.num_experts
-                    and layer.w2_weight.shape[1] == self.hidden_size and
-                    layer.w2_weight.shape[2] == self.intermediate_size // 2)
-            assert (layer.w2_weight_scale.dim() == 3
-                    and layer.w2_weight_scale.shape[1] == self.hidden_size
-                    and layer.w2_weight_scale.shape[2]
-                    == self.intermediate_size // sf_block_size)
-            assert (layer.w13_bias.dim() == 2
-                    and layer.w13_bias.shape[0] == self.num_experts
-                    and layer.w13_bias.shape[1] == self.intermediate_size * 2)
-            assert (layer.w2_bias.dim() == 2
-                    and layer.w2_bias.shape[0] == self.num_experts
-                    and layer.w2_bias.shape[1] == self.hidden_size)
+            assert (
+                layer.w13_weight.dim() == 3
+                and layer.w13_weight.shape[0] == self.num_experts
+                and layer.w13_weight.shape[1] == self.intermediate_size * 2
+                and layer.w13_weight.shape[2] == self.hidden_size // 2
+            )
+            assert (
+                layer.w13_weight_scale.dim() == 3
+                and layer.w13_weight_scale.shape[0] == self.num_experts
+                and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2
+                and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size
+            )
+            assert (
+                layer.w2_weight.dim() == 3
+                and layer.w2_weight.shape[0] == self.num_experts
+                and layer.w2_weight.shape[1] == self.hidden_size
+                and layer.w2_weight.shape[2] == self.intermediate_size // 2
+            )
+            assert (
+                layer.w2_weight_scale.dim() == 3
+                and layer.w2_weight_scale.shape[1] == self.hidden_size
+                and layer.w2_weight_scale.shape[2]
+                == self.intermediate_size // sf_block_size
+            )
+            assert (
+                layer.w13_bias.dim() == 2
+                and layer.w13_bias.shape[0] == self.num_experts
+                and layer.w13_bias.shape[1] == self.intermediate_size * 2
+            )
+            assert (
+                layer.w2_bias.dim() == 2
+                and layer.w2_bias.shape[0] == self.num_experts
+                and layer.w2_bias.shape[1] == self.hidden_size
+            )
 
             # De-interleave and swap for w13 weight, bias, and scales
             w13_w = layer.w13_weight.data
@@ -530,51 +670,55 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
 
                 orig_shape = w13_scale_swapped.shape
                 w13_scale_interleaved = block_scale_interleave(
-                    w13_scale_swapped.view(torch.uint8)).reshape(orig_shape)
+                    w13_scale_swapped.view(torch.uint8)
+                ).reshape(orig_shape)
 
                 w2_s = layer.w2_weight_scale.data
                 orig_shape = w2_s.shape
                 w2_scale_interleaved = block_scale_interleave(
-                    w2_s.view(torch.uint8)).reshape(orig_shape)
-
-                layer.w13_weight = Parameter(w13_weight_swapped,
-                                             requires_grad=False)
-                layer.w13_weight_scale = Parameter(w13_scale_interleaved,
-                                                   requires_grad=False)
-                layer.w13_bias = Parameter(w13_bias_swapped,
-                                           requires_grad=False)
-                layer.w2_weight_scale = Parameter(w2_scale_interleaved,
-                                                  requires_grad=False)
+                    w2_s.view(torch.uint8)
+                ).reshape(orig_shape)
+
+                layer.w13_weight = Parameter(w13_weight_swapped, requires_grad=False)
+                layer.w13_weight_scale = Parameter(
+                    w13_scale_interleaved, requires_grad=False
+                )
+                layer.w13_bias = Parameter(w13_bias_swapped, requires_grad=False)
+                layer.w2_weight_scale = Parameter(
+                    w2_scale_interleaved, requires_grad=False
+                )
             elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16:
 
                 def _interleave_mxfp4_cutlass_sm90(w):
                     w_shape = w.shape
-                    w_interleaved = w.reshape(w_shape[0], w_shape[1],
-                                              (w_shape[2] // 4), 4)
+                    w_interleaved = w.reshape(
+                        w_shape[0], w_shape[1], (w_shape[2] // 4), 4
+                    )
                     w_interleaved = w_interleaved.permute(0, 2, 1, 3)
                     w_interleaved = w_interleaved.reshape(
-                        w_shape[0], w_shape[2] // 4, w_shape[1] * 4)
+                        w_shape[0], w_shape[2] // 4, w_shape[1] * 4
+                    )
                     return w_interleaved
 
-                w31_scales = w13_scale_swapped.to(torch.uint8).view(
-                    torch.uint8)
-                w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(
-                    w31_scales)
+                w31_scales = w13_scale_swapped.to(torch.uint8).view(torch.uint8)
+                w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w31_scales)
 
                 w2_weight_scale = layer.w2_weight_scale.data
                 w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8)
-                w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90(
-                    w2_scales)
-
-                layer.w13_weight = torch.nn.Parameter(torch.cat([w3_w, w1_w],
-                                                                dim=1),
-                                                      requires_grad=False)
-                layer.w13_bias = torch.nn.Parameter(w13_bias_swapped,
-                                                    requires_grad=False)
+                w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w2_scales)
+
+                layer.w13_weight = torch.nn.Parameter(
+                    torch.cat([w3_w, w1_w], dim=1), requires_grad=False
+                )
+                layer.w13_bias = torch.nn.Parameter(
+                    w13_bias_swapped, requires_grad=False
+                )
                 layer.w13_weight_scale = torch.nn.Parameter(
-                    w31_scales_interleaved, requires_grad=False)
+                    w31_scales_interleaved, requires_grad=False
+                )
                 layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_scales_interleaved, requires_grad=False)
+                    w2_scales_interleaved, requires_grad=False
+                )
         elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 
@@ -589,66 +733,44 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             # batched activation format. As self.fused_experts is not
             # initialized at this point, we resort to checking the MoE config
             # directly.
-            is_batched_moe = (self.moe.use_pplx_kernels
-                              or self.moe.use_deepep_ll_kernels)
+            is_batched_moe = self.moe.use_pplx_kernels or self.moe.use_deepep_ll_kernels
             if is_batched_moe:
                 num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8
             else:
                 num_warps = 8
 
             w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
-                layer.w13_weight, layer.w13_weight_scale, num_warps)
+                layer.w13_weight, layer.w13_weight_scale, num_warps
+            )
             w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
-                layer.w2_weight, layer.w2_weight_scale, num_warps)
+                layer.w2_weight, layer.w2_weight_scale, num_warps
+            )
 
             self.w13_precision_config = PrecisionConfig(
-                weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex))
+                weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
+            )
             self.w2_precision_config = PrecisionConfig(
-                weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex))
-
-            self.w13_weight_triton_tensor = w13_weight
-            self.w2_weight_triton_tensor = w2_weight
+                weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
+            )
 
-            # need to delete the original weights to save memory on single GPU
-            del layer.w13_weight
-            del layer.w2_weight
-            layer.w13_weight = None
-            layer.w2_weight = None
-            torch.cuda.empty_cache()
+            self.w13_weight = w13_weight
+            self.w2_weight = w2_weight
+            layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False)
+            layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False)
         else:
             raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
 
-    def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int):
-        # Number of tokens in the input tensor.
-        num_tokens = x.shape[0]
-        # Factor to account for the imbalance of the experts.
-        # factor equals to the
-        # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert
-        # - 1.0 means perfect expert distribution.
-        # - > 1.0 means some experts have more
-        #     tokens than the perfect distribution.
-        # - < 1.0 does not make sense.
-        imbalance_factor = 1.3
-        # Calculate the number of tokens per expert
-        # assuming perfect distribution.
-        num_tokens_per_expert = (num_tokens * top_k) // self.num_experts
-        # Apply the imbalance factor.
-        num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
-        # And pad the number to the next power of 2.
-        tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
-        # Cap to 8-64 tokens per CTA tile
-        # as it's the range supported by the kernel.
-        tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-
-        return tile_tokens_dim
-
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-            return None
-
-        if self.mxfp4_backend == Mxfp4Backend.TRITON:
+            return mxfp4_w4a16_moe_quant_config(
+                w1_bias=layer.w13_bias,
+                w2_bias=layer.w2_bias,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+            )
+        elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             w1_scale = self.w13_precision_config
             w2_scale = self.w2_precision_config
             return mxfp4_w4a16_moe_quant_config(
@@ -657,10 +779,28 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 w1_scale=w1_scale,
                 w2_scale=w2_scale,
             )
+        elif self.mxfp4_backend in [
+            Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM,
+            Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS,
+        ]:
+            return mxfp4_mxfp8_moe_quant_config(
+                w1_bias=layer.w13_bias,
+                w2_bias=layer.w2_bias,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+            )
+        elif self.mxfp4_backend in [Mxfp4Backend.SM100_FI_MXFP4_BF16]:
+            return mxfp4_w4a16_moe_quant_config(
+                w1_bias=layer.w13_bias,
+                w2_bias=layer.w2_bias,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+            )
         else:
             w1_scale = layer.w13_weight_scale
             w2_scale = layer.w2_weight_scale
-            return mxfp4_w4a4_moe_quant_config(
+            return ocp_mx_moe_quant_config(
+                quant_dtype="mxfp4",
                 w1_bias=layer.w13_bias,
                 w2_bias=layer.w2_bias,
                 w1_scale=w1_scale,
@@ -672,14 +812,30 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         prepare_finalize: mk.FusedMoEPrepareAndFinalize,
         layer: torch.nn.Module,
     ) -> mk.FusedMoEPermuteExpertsUnpermute:
-        if (prepare_finalize.activation_format ==
-                mk.FusedMoEActivationFormat.BatchedExperts):
-            raise NotImplementedError(
-                "Mxfp4 does not support batched experts format for EP")
+        if (
+            prepare_finalize.activation_format
+            == mk.FusedMoEActivationFormat.BatchedExperts
+        ):
+            if self.mxfp4_backend == Mxfp4Backend.MARLIN:
+                max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+                assert max_num_tokens_per_rank is not None
+                assert self.moe_quant_config is not None
+                return BatchedMarlinExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+            else:
+                raise NotImplementedError(
+                    f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for "
+                    "EP batched experts format"
+                )
         else:
             assert self.moe_quant_config is not None
-            if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-                    or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
+            if (
+                self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+                or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
+            ):
                 # B200 code-path
                 kwargs = {
                     "gemm1_alpha": layer.gemm1_alpha,
@@ -688,72 +844,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                     # TODO(bnell): part of quant_config
                     "max_capture_size": self.max_capture_size,
                 }
-                return TrtLlmGenExperts(self.moe, self.moe_quant_config,
-                                        **kwargs)
-            else:
+                return TrtLlmGenExperts(self.moe, self.moe_quant_config, **kwargs)
+            elif self.mxfp4_backend == Mxfp4Backend.MARLIN:
+                return MarlinExperts(self.moe_quant_config)
+            elif self.mxfp4_backend == Mxfp4Backend.TRITON:
                 return OAITritonExperts(self.moe_quant_config)
+            else:
+                raise NotImplementedError(
+                    f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for EP"
+                )
 
-    def _route_and_experts(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            router_logits: torch.Tensor,
-            top_k: int,
-            renormalize: bool,
-            use_grouped_topk: bool = False,
-            topk_group: Optional[int] = None,
-            num_expert_group: Optional[int] = None,
-            global_num_experts: int = -1,
-            expert_map: Optional[torch.Tensor] = None,
-            custom_routing_function: Optional[Callable] = None,
-            scoring_func: str = "softmax",
-            e_score_correction_bias: Optional[torch.Tensor] = None,
-            apply_router_weight_on_input: bool = False,
-            activation: str = "silu",
-            enable_eplb: bool = False,
-            expert_load_view: Optional[torch.Tensor] = None,
-            logical_to_physical_map: Optional[torch.Tensor] = None,
-            logical_replica_count: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
-
-        assert isinstance(self.fused_experts, mk.FusedMoEModularKernel)
-
-        topk_weights, topk_ids, _ = FusedMoE.select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype,
-            enable_eplb=enable_eplb,
-            expert_map=expert_map,
-            expert_load_view=expert_load_view,
-            logical_to_physical_map=logical_to_physical_map,
-            logical_replica_count=logical_replica_count)
-
-        w13_weight = (self.w13_weight_triton_tensor
-                      if layer.w13_weight is None else layer.w13_weight)
-        w2_weight = (self.w2_weight_triton_tensor
-                     if layer.w2_weight is None else layer.w2_weight)
-        assert all([w is not None for w in [w13_weight, w2_weight]])
-
-        return self.fused_experts(
-            hidden_states=x,
-            w1=w13_weight,
-            w2=w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-        )
+    @property
+    def allow_inplace(self) -> bool:
+        return True
 
     def apply(
         self,
@@ -763,22 +866,21 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
@@ -794,9 +896,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 custom_routing_function=custom_routing_function,
                 scoring_func=scoring_func,
                 routed_scaling_factor=routed_scaling_factor,
-                e_score_correction_bias=e_score_correction_bias)
+                e_score_correction_bias=e_score_correction_bias,
+            )
 
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -813,51 +916,39 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
                 activation=activation,
-                expert_map=expert_map)
-
-        if self.fused_experts is not None:
-            return self._route_and_experts(
-                layer,
-                x,
-                router_logits,
-                top_k,
-                renormalize,
-                use_grouped_topk,
-                topk_group,
-                num_expert_group,
-                global_num_experts,
-                expert_map,
-                custom_routing_function,
-                scoring_func,
-                e_score_correction_bias,
-                apply_router_weight_on_input,
-                activation,
-                enable_eplb,
-                expert_load_view,
-                logical_to_physical_map,
-                logical_replica_count,
+                expert_map=expert_map,
             )
 
         assert _can_support_mxfp4(
-            use_grouped_topk, topk_group, num_expert_group, expert_map,
-            custom_routing_function, e_score_correction_bias,
-            apply_router_weight_on_input, scoring_func, activation,
-            expert_load_view, logical_to_physical_map,
-            logical_replica_count), (
-                "MXFP4 are not supported with this configuration.")
-
-        if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-                or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
+            use_grouped_topk,
+            topk_group,
+            num_expert_group,
+            expert_map,
+            custom_routing_function,
+            e_score_correction_bias,
+            apply_router_weight_on_input,
+            scoring_func,
+            activation,
+            expert_load_view,
+            logical_to_physical_map,
+            logical_replica_count,
+        ), "MXFP4 are not supported with this configuration."
+
+        if (
+            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
+        ):
             from flashinfer import trtllm_fp4_block_scale_moe
+
             if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16:
                 assert x.dtype == torch.bfloat16
                 x_quant = x
                 x_scale = None
             elif self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM:
                 from flashinfer import mxfp8_quantize
+
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
-                    *x.shape[:-1], -1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)
 
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
@@ -884,14 +975,16 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 layer.ep_rank * layer.local_num_experts,  # local_expert_offset
                 self.num_experts,  # local num experts
                 None,
-                self._get_tile_tokens_dim(x, top_k),
+                None,
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
             )[0]
             return trtllm_gen_output
-        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-              or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
+        elif (
+            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+            or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
+        ):
             from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
 
             topk_weights, topk_ids, _ = FusedMoE.select_experts(
@@ -909,13 +1002,11 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
 
             # Backend-specific preparation
             if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS:
-
                 from flashinfer import mxfp8_quantize
 
                 x_quant, x_scale = mxfp8_quantize(x, True, 32)
 
-                fake_input_scale = torch.ones(self.num_experts,
-                                              device=x.device)
+                fake_input_scale = torch.ones(self.num_experts, device=x.device)
                 quant_scales = [
                     layer.w13_weight_scale.contiguous().view(torch.int32),
                     fake_input_scale,
@@ -927,10 +1018,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 extra_kwargs = dict(
                     use_mxfp8_act_scaling=True,
                     input_sf=x_scale,
-                    fc1_expert_weights=layer.w13_weight.contiguous().view(
-                        torch.long),
-                    fc2_expert_weights=layer.w2_weight.contiguous().view(
-                        torch.long),
+                    fc1_expert_weights=layer.w13_weight.contiguous().view(torch.long),
+                    fc2_expert_weights=layer.w2_weight.contiguous().view(torch.long),
                 )
             elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16:
                 assert x.dtype == torch.bfloat16
@@ -964,18 +1053,20 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 tp_rank=self.moe.tp_rank,
                 ep_size=self.moe.ep_size,
                 ep_rank=self.moe.ep_rank,
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
                 **extra_kwargs,
             )
 
             return output
         elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (  # noqa: E501
-                triton_kernel_moe_forward)
+                triton_kernel_moe_forward,
+            )
+
             return triton_kernel_moe_forward(
                 hidden_states=x,
-                w1=self.w13_weight_triton_tensor,
-                w2=self.w2_weight_triton_tensor,
+                w1=self.w13_weight,
+                w2=self.w2_weight,
                 gating_output=router_logits,
                 topk=top_k,
                 renormalize=renormalize,
@@ -986,3 +1077,86 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             )
         else:
             raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
+
+
+class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
+    def __init__(self, moe_config: FusedMoEConfig):
+        super().__init__(moe_config)
+        self.moe_config = moe_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        super().create_weights(
+            layer,
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            params_dtype,
+            **extra_weight_attrs,
+        )
+        self.original_hidden_size = hidden_size
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        import intel_extension_for_pytorch as ipex
+
+        layer.w13_weight.data = layer.w13_weight.data.view(torch.int32)
+        layer.w2_weight.data = layer.w2_weight.data.view(torch.int32)
+        ep_rank_start = self.moe_config.ep_rank * self.moe_config.num_local_experts
+        layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+            layer.w13_weight,
+            layer.w2_weight,
+            w1_scale_inv=layer.w13_weight_scale,
+            w2_scale_inv=layer.w2_weight_scale,
+            w13_bias=layer.w13_bias,
+            w2_bias=layer.w2_bias,
+            is_mxfp4=True,
+            experts_start_id=ep_rank_start,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert activation == "swigluoai", (
+            "Only swiglu_oai activation is supported for IPEX MXFP4 MoE"
+        )
+        hidden_size_pad = round_up(self.original_hidden_size, 128)
+        x_pad = torch.nn.functional.pad(x, (0, hidden_size_pad - x.size(-1)))
+        hidden_states = layer.ipex_fusion(
+            x_pad,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            activation="swiglu_oai",
+        )
+        hidden_states = hidden_states[..., : self.original_hidden_size].contiguous()
+        return hidden_states
diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py
index 5b9fee69bb02119940b19e7b99cb3d051bdcf0d7..402cebc38c215579a3904c481dc4893fcd6c5e34 100644
--- a/vllm/model_executor/layers/quantization/petit.py
+++ b/vllm/model_executor/layers/quantization/petit.py
@@ -9,19 +9,24 @@ import torch
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.petit_utils import (
-    apply_petit_nvfp4_linear, prepare_nvfp4_layer_for_petit,
-    verify_petit_nvfp4_supported)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    is_layer_skipped)
-from vllm.model_executor.parameter import (ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    apply_petit_nvfp4_linear,
+    prepare_nvfp4_layer_for_petit,
+    verify_petit_nvfp4_supported,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
+from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
 from vllm.platforms import current_platform
 
 # Initialize logger for the module
@@ -36,15 +41,17 @@ class PetitNvFp4Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_nvfp4_serialized: bool = False,
-        kv_cache_quant_algo: Optional[str] = None,
-        group_size: Optional[int] = None,
-        exclude_modules: Optional[list[str]] = None,
+        kv_cache_quant_algo: str | None = None,
+        group_size: int | None = None,
+        exclude_modules: list[str] | None = None,
     ) -> None:
         self._check_hardware_support()
         self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
         if is_checkpoint_nvfp4_serialized:
-            logger.warning("Detected nvfp4 checkpoint. Please note that the "
-                           "format is experimental and subject to change.")
+            logger.warning(
+                "Detected nvfp4 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
         self.group_size = group_size
         self.kv_cache_quant_algo = kv_cache_quant_algo
         self.exclude_modules = exclude_modules
@@ -61,7 +68,8 @@ class PetitNvFp4Config(QuantizationConfig):
                 "The 'petit' quantization backend is designed for AMD GPUs "
                 "and is not supported on the CUDA platform. For NVIDIA GPUs, "
                 "please use a different quantization method such as FP8, AWQ, "
-                "or GPTQ.")
+                "or GPTQ."
+            )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -86,8 +94,7 @@ class PetitNvFp4Config(QuantizationConfig):
 
         quant_method_raw = qc.get("quant_algo")
         if not isinstance(quant_method_raw, str) or not quant_method_raw:
-            raise ValueError(
-                "Missing or invalid 'quant_algo' in quantization config.")
+            raise ValueError("Missing or invalid 'quant_algo' in quantization config.")
         quant_method = quant_method_raw.upper()
 
         group_size_raw = qc.get("group_size")
@@ -101,19 +108,18 @@ class PetitNvFp4Config(QuantizationConfig):
 
         kv_cache_quant_algo_raw = qc.get("kv_cache_quant_algo") or "auto"
         if not isinstance(kv_cache_quant_algo_raw, str):
-            raise ValueError(
-                "'kv_cache_quant_algo' must be a string if provided.")
+            raise ValueError("'kv_cache_quant_algo' must be a string if provided.")
         kv_cache_quant_algo = kv_cache_quant_algo_raw
 
         exclude_raw = qc.get("exclude_modules", [])
         if exclude_raw is None:
             exclude_modules: list[str] = []
         elif isinstance(exclude_raw, list) and all(
-                isinstance(x, str) for x in exclude_raw):
+            isinstance(x, str) for x in exclude_raw
+        ):
             exclude_modules = exclude_raw
         else:
-            raise ValueError(
-                "'exclude_modules' must be a list[str] (or omitted).")
+            raise ValueError("'exclude_modules' must be a list[str] (or omitted).")
 
         is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
 
@@ -126,7 +132,8 @@ class PetitNvFp4Config(QuantizationConfig):
 
     @classmethod
     def override_quantization_method(
-            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
         if not current_platform.is_rocm():
             return None
 
@@ -142,23 +149,24 @@ class PetitNvFp4Config(QuantizationConfig):
         algo = (qc.get("quant_algo") or qc.get("quant_method") or "").upper()
         return algo == "NVFP4"
 
-    def is_layer_excluded(self, prefix: str,
-                          exclude_modules: list[str]) -> bool:
+    def is_layer_excluded(self, prefix: str, exclude_modules: list[str]) -> bool:
         for pattern in exclude_modules:
             regex_str = pattern.replace(".", r"\.").replace("*", r".*")
             if re.fullmatch(regex_str, prefix):
                 return True
         return False
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
 
         exclude = self.require_exclude_modules()
 
         if isinstance(layer, LinearBase):
             if is_layer_skipped(prefix, exclude) or self.is_layer_excluded(
-                    prefix, exclude):
+                prefix, exclude
+            ):
                 return UnquantizedLinearMethod()
             return PetitNvFp4LinearMethod(self)
         elif isinstance(layer, Attention):
@@ -220,8 +228,10 @@ class PetitNvFp4LinearMethod(LinearMethodBase):
     ):
         del input_size, output_size
         if not self.quant_config.is_checkpoint_nvfp4_serialized:
-            raise ValueError("NVFP4 quantization was selected, "
-                             " dynamic quantization is not supported.")
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
 
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
@@ -231,12 +241,15 @@ class PetitNvFp4LinearMethod(LinearMethodBase):
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
         if input_size_per_partition % 16 != 0:
-            raise ValueError("Unsupported model when in features size is "
-                             "not multiple of 16")
+            raise ValueError(
+                "Unsupported model when in features size is not multiple of 16"
+            )
 
-        weight_dtype = (torch.float8_e4m3fn
-                        if self.quant_config.is_checkpoint_nvfp4_serialized
-                        else params_dtype)
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_nvfp4_serialized
+            else params_dtype
+        )
 
         weight = ModelWeightParameter(
             data=torch.empty(
@@ -283,8 +296,9 @@ class PetitNvFp4LinearMethod(LinearMethodBase):
         weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
         layer.input_scale = Parameter(input_scale_2, requires_grad=False)
         layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
-        layer.alpha = Parameter(layer.input_scale * layer.weight_scale_2,
-                                requires_grad=False)
+        layer.alpha = Parameter(
+            layer.input_scale * layer.weight_scale_2, requires_grad=False
+        )
 
         prepare_nvfp4_layer_for_petit(layer)
         del layer.input_scale
@@ -293,7 +307,7 @@ class PetitNvFp4LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return apply_petit_nvfp4_linear(
             input=x,
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 45ea8e3520f1d85df1c93a15ab3b2851f6dcf30c..26ba8e5b16bc0839b563124eb169140e9277b96e 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -8,18 +8,19 @@ from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.fp8 import (Fp8Config,
-                                                         Fp8KVCacheMethod,
-                                                         Fp8LinearMethod)
+from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
+from vllm.model_executor.layers.quantization.fp8 import (
+    Fp8Config,
+    Fp8KVCacheMethod,
+    Fp8LinearMethod,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, is_layer_skipped)
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp)
+    GroupShape,
+    is_layer_skipped,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
@@ -33,23 +34,23 @@ class PTPCFp8Config(Fp8Config):
     def __init__(
         self,
         activation_scheme: str = "dynamic",
-        ignored_layers: Optional[list[str]] = None,
+        ignored_layers: list[str] | None = None,
     ) -> None:
         if not current_platform.is_rocm():
-            raise ValueError(
-                "ptpc_fp8 quantization is supported only on ROCm.")
+            raise ValueError("ptpc_fp8 quantization is supported only on ROCm.")
 
         if not current_platform.has_device_capability(94):
             raise ValueError(
                 "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."  # noqa: E501
             )
         if activation_scheme == "static":
-            raise ValueError(
-                "ptpc_fp8 as of now only support dynamic quantization.")
+            raise ValueError("ptpc_fp8 as of now only support dynamic quantization.")
 
-        super().__init__(is_checkpoint_fp8_serialized=False,
-                         activation_scheme=activation_scheme,
-                         ignored_layers=ignored_layers)
+        super().__init__(
+            is_checkpoint_fp8_serialized=False,
+            activation_scheme=activation_scheme,
+            ignored_layers=ignored_layers,
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -59,11 +60,11 @@ class PTPCFp8Config(Fp8Config):
     def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config":
         activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
         ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
-        return cls(activation_scheme=activation_scheme,
-                   ignored_layers=ignored_layers)
+        return cls(activation_scheme=activation_scheme, ignored_layers=ignored_layers)
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
 
         if isinstance(layer, LinearBase):
@@ -79,7 +80,7 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
     """Linear method for Per-Token and Per-Channel FP8 Quantization.
     Only supports loading quantized BF16 model checkpoints with dynamic
     activation scaling. To load FP16 model checkpoints, user must specify
-    to convert the FP16 model weight loading into BF16. 
+    to convert the FP16 model weight loading into BF16.
     The weight scaling factor will be initialized after
     the model weights are loaded.
 
@@ -92,38 +93,45 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
     """
 
     def __init__(self, quant_config: PTPCFp8Config):
-        assert current_platform.is_rocm(), \
+        assert current_platform.is_rocm(), (
             "PTPCFp8LinearMethod is only supported on ROCm."
+        )
         super().__init__(quant_config=quant_config)
         # Force weight quantization
         self.quant_config.is_checkpoint_fp8_serialized = False
         self.fp8_linear = Fp8LinearOp(
-            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN)
+            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN
+        )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.weight = torch.nn.Parameter(layer.weight.data,
-                                          requires_grad=False)
+        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
 
-        assert layer.weight.data.dtype == torch.bfloat16, \
-            f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501
+        assert layer.weight.data.dtype == torch.bfloat16, (
+            f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."  # noqa: E501
+        )
         # Quantize the weights.
         qweight, weight_scale = ops.scaled_fp8_quant(
-            layer.weight, scale=None, use_per_token_if_dynamic=True)
+            layer.weight, scale=None, use_per_token_if_dynamic=True
+        )
 
         # Update the layer with the new values.
         layer.weight = Parameter(
-            qweight.t(), requires_grad=False)  # Pretranspose the weight
+            qweight.t(), requires_grad=False
+        )  # Pretranspose the weight
         layer.weight_scale = Parameter(weight_scale, requires_grad=False)
         layer.input_scale = None
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-        return self.fp8_linear.apply(input=x,
-                                     weight=layer.weight,
-                                     weight_scale=layer.weight_scale,
-                                     input_scale=None,
-                                     input_scale_ub=None,
-                                     bias=bias)
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=None,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index c65212c01819fab18271a8e6eaf77082eb6ead59..f59e5e2a0af7a858831927ea5897c73d927e8bb4 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -2,38 +2,55 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
-from typing import Any, Optional, cast
+from typing import TYPE_CHECKING, Any, Optional, cast
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
-    QuarkMoEMethod)
+    QuarkMoEMethod,
+)
 from vllm.model_executor.layers.quantization.quark.schemes import (
-    QuarkScheme, QuarkW4A4MXFP4, QuarkW8A8Fp8, QuarkW8A8Int8)
+    QuarkOCP_MX,
+    QuarkScheme,
+    QuarkW8A8Fp8,
+    QuarkW8A8Int8,
+)
 from vllm.model_executor.layers.quantization.quark.utils import (
-    deep_compare, should_ignore_layer)
+    deep_compare,
+    should_ignore_layer,
+)
+from vllm.model_executor.models.utils import WeightsMapper
 from vllm.platforms import current_platform
 
+if TYPE_CHECKING:
+    from vllm.model_executor.models.utils import WeightsMapper
+
 __all__ = ["QuarkLinearMethod"]
 
 logger = init_logger(__name__)
 
 
 class QuarkConfig(QuantizationConfig):
-
-    def __init__(self,
-                 quant_config: dict[str, Any],
-                 kv_cache_group: Optional[list[str]] = None,
-                 kv_cache_config: Optional[dict[str, Any]] = None,
-                 pack_method: str = "reorder"):
+    def __init__(
+        self,
+        quant_config: dict[str, Any],
+        kv_cache_group: list[str] | None = None,
+        kv_cache_config: dict[str, Any] | None = None,
+        pack_method: str = "reorder",
+    ):
         super().__init__()
         if kv_cache_group is None:
             kv_cache_group = []
@@ -55,15 +72,43 @@ class QuarkConfig(QuantizationConfig):
     def get_name(self) -> QuantizationMethods:
         return "quark"
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def apply_vllm_mapper(  # noqa: B027
+        self, hf_to_vllm_mapper: "WeightsMapper"
+    ):
+        """
+        Interface for models to update module names referenced in
+        quantization configs in order to reflect the vllm model structure
+
+        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
+            structure of the qconfig) to vllm model structure
+        """
+        quant_config_with_hf_to_vllm_mapper = {}
+
+        for k, v in self.quant_config.items():
+            if isinstance(v, list):
+                quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_list(v)
+            elif isinstance(v, dict):
+                quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_dict(v)
+            else:
+                if isinstance(v, str):
+                    mapped_v_list = hf_to_vllm_mapper.apply_list([v])
+                    if mapped_v_list:
+                        quant_config_with_hf_to_vllm_mapper[k] = mapped_v_list[0]
+                else:
+                    quant_config_with_hf_to_vllm_mapper[k] = v
+
+        self.quant_config = quant_config_with_hf_to_vllm_mapper
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
 
         # Check if the layer is skipped for quantization.
         exclude_layers = cast(list[str], self.quant_config.get("exclude"))
-        if should_ignore_layer(prefix,
-                               ignore=exclude_layers,
-                               fused_mapping=self.packed_modules_mapping):
+        if should_ignore_layer(
+            prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping
+        ):
             return UnquantizedLinearMethod()
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
@@ -73,17 +118,17 @@ class QuarkConfig(QuantizationConfig):
             return QuarkKVCacheMethod(self)
 
         if isinstance(layer, FusedMoE):
-            return QuarkMoEMethod.get_moe_method(self,
-                                                 module=layer,
-                                                 layer_name=prefix)
+            return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix)
         return None
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "QuarkConfig":
         export_config = config.get("export")
         if export_config is None:
-            raise ValueError("The export key should be included in "
-                             "the configurations of Quark quantized model")
+            raise ValueError(
+                "The export key should be included in "
+                "the configurations of Quark quantized model"
+            )
         kv_cache_group = cast(list[str], export_config.get("kv_cache_group"))
         pack_method = cast(str, export_config.get("pack_method"))
 
@@ -96,33 +141,44 @@ class QuarkConfig(QuantizationConfig):
             kv_cache_config = None
         else:
             kv_cache_set = set(kv_cache_group)
-            layer_quant_config = cast(dict[str, Any],
-                                      config.get("layer_quant_config"))
+            layer_quant_config = cast(dict[str, Any], config.get("layer_quant_config"))
             layer_quant_names = list(layer_quant_config.keys())
             layer_quant_set = set(layer_quant_names)
 
-            if not kv_cache_set.issubset(layer_quant_set):
-                raise ValueError("The Quark quantized model has the "
-                                 "kv_cache_group parameter setting, "
-                                 "but no kv_cache quantization settings "
-                                 "were found in the quantization "
-                                 "configuration.")
+            if not (
+                kv_cache_set.issubset(layer_quant_set)
+                or any(
+                    fnmatch.fnmatchcase(layer_quant, pat)
+                    for layer_quant in list(layer_quant_set)
+                    for pat in list(kv_cache_set)
+                )
+            ):
+                raise ValueError(
+                    "The Quark quantized model has the "
+                    "kv_cache_group parameter setting, "
+                    "but no kv_cache quantization settings "
+                    "were found in the quantization "
+                    "configuration."
+                )
 
             q_configs = [
-                cast(dict[str, Any], layer_quant_config.get(name))
-                for name in kv_cache_group
+                quant_cfg
+                for name, quant_cfg in layer_quant_config.items()
+                if any(fnmatch.fnmatchcase(name, pattern) for pattern in kv_cache_group)
             ]
+
             if not all(
-                    deep_compare(q_config, q_configs[0])
-                    for q_config in q_configs):
+                deep_compare(q_config["output_tensors"], q_configs[0]["output_tensors"])
+                for q_config in q_configs
+            ):
                 raise ValueError(
                     "The quantization method used for kv_cache should "
                     "be the same, but the quantization method for the "
-                    "kv_cache layer in the config is different.")
+                    "kv_cache layer in the config is different."
+                )
             kv_cache_config = q_configs[0].get("output_tensors")
             if kv_cache_config is None:
-                raise ValueError(
-                    "The kv_cache quantization configuration is empty.")
+                raise ValueError("The kv_cache quantization configuration is empty.")
 
             # Since we have already set kv_cache quantization configurations,
             # we will remove the quantization configuration for the
@@ -132,23 +188,22 @@ class QuarkConfig(QuantizationConfig):
 
             # In case q_proj output is also quantized, remove the configuration
             # to keep qkv consistency.
-            q_proj_q_config = cast(dict[str, Any],
-                                   layer_quant_config.get("*q_proj"))
+            q_proj_q_config = cast(dict[str, Any], layer_quant_config.get("*q_proj"))
             if q_proj_q_config is not None:
                 q_proj_q_config["output_tensors"] = None
 
-        return cls(quant_config=config,
-                   kv_cache_group=kv_cache_group,
-                   kv_cache_config=kv_cache_config,
-                   pack_method=pack_method)
+        return cls(
+            quant_config=config,
+            kv_cache_group=kv_cache_group,
+            kv_cache_config=kv_cache_config,
+            pack_method=pack_method,
+        )
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
         return []
 
-    def _check_scheme_supported(self,
-                                min_capability: int,
-                                error: bool = True) -> bool:
+    def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bool:
         capability_tuple = current_platform.get_device_capability()
 
         if capability_tuple is not None:
@@ -158,26 +213,33 @@ class QuarkConfig(QuantizationConfig):
                 raise RuntimeError(
                     "Quantization scheme is not supported for ",
                     f"the current GPU. Min capability: {min_capability}. ",
-                    f"Current capability: {capability}.")
+                    f"Current capability: {capability}.",
+                )
             return supported
         else:
             return False
 
-    def _is_fp8_w8a8(self, weight_quant: Optional[dict[str, Any]],
-                     input_quant: Optional[dict[str, Any]]) -> bool:
+    def _is_fp8_w8a8(
+        self,
+        weight_quant: dict[str, Any] | None,
+        input_quant: dict[str, Any] | None,
+    ) -> bool:
         # Confirm weights and input quantized.
         if weight_quant is None or input_quant is None:
             return False
 
         # Confirm weight scheme is supported
-        is_fp8_dtype = (weight_quant.get("dtype") == "fp8_e4m3"
-                        and input_quant.get("dtype") == "fp8_e4m3")
+        is_fp8_dtype = (
+            weight_quant.get("dtype") == "fp8_e4m3"
+            and input_quant.get("dtype") == "fp8_e4m3"
+        )
         is_static_weight = not weight_quant.get("is_dynamic")
-        is_per_tensor_or_channel_weight = (weight_quant.get("qscheme")
-                                           in ["per_tensor", "per_channel"])
+        is_per_tensor_or_channel_weight = weight_quant.get("qscheme") in [
+            "per_tensor",
+            "per_channel",
+        ]
 
-        if not (is_fp8_dtype and is_static_weight
-                and is_per_tensor_or_channel_weight):
+        if not (is_fp8_dtype and is_static_weight and is_per_tensor_or_channel_weight):
             return False
 
         # Dynamic quantization is always supported if weights supported.
@@ -185,76 +247,88 @@ class QuarkConfig(QuantizationConfig):
             return True
 
         # Confirm activation scheme is supported.
-        is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor")
+        is_per_tensor_activation = input_quant.get("qscheme") == "per_tensor"
         return is_per_tensor_activation
 
-    def _is_static_tensor_w8a8(self, weight_quant: Optional[dict[str, Any]],
-                               input_quant: Optional[dict[str, Any]]) -> bool:
+    def _is_static_tensor_w8a8(
+        self,
+        weight_quant: dict[str, Any] | None,
+        input_quant: dict[str, Any] | None,
+    ) -> bool:
         # Confirm weights and input quantized.
         if weight_quant is None or input_quant is None:
             return False
 
-        is_int8_dtype = (weight_quant.get("dtype") == "int8"
-                         and input_quant.get("dtype") == "int8")
+        is_int8_dtype = (
+            weight_quant.get("dtype") == "int8" and input_quant.get("dtype") == "int8"
+        )
 
-        is_tensor = (weight_quant.get("qscheme")
-                     in ["per_tensor", "per_channel"]
-                     and input_quant.get("qscheme") == "per_tensor")
+        is_tensor = (
+            weight_quant.get("qscheme") in ["per_tensor", "per_channel"]
+            and input_quant.get("qscheme") == "per_tensor"
+        )
 
-        is_static = (not weight_quant.get("is_dynamic")
-                     and not input_quant.get("is_dynamic"))
+        is_static = not weight_quant.get("is_dynamic") and not input_quant.get(
+            "is_dynamic"
+        )
 
-        is_weight_symmetric = (weight_quant.get("symmetric") is True)
+        is_weight_symmetric = weight_quant.get("symmetric") is True
 
         # Both symmetric and asymmetric input quantization supported.
         # Only symmetric weight quantization supported.
         return is_int8_dtype and is_tensor and is_weight_symmetric and is_static
 
-    def _is_mx_fp4(self, weight_quant: Optional[dict[str, Any]],
-                   input_quant: Optional[dict[str, Any]]) -> bool:
+    def _is_ocp_mx(
+        self,
+        weight_quant: dict[str, Any] | None,
+        input_quant: dict[str, Any] | None,
+    ) -> bool:
         # Confirm weights and input quantized.
         if weight_quant is None or input_quant is None:
-            logger.debug("Quark model is not in MX-FP4 format: "
-                         "weight_quant or input_quant not set")
-            return False
-
-        # Input and weight dtype needs to be fp4.
-        if weight_quant.get("dtype") != "fp4" or input_quant.get(
-                "dtype") != "fp4":
-            logger.debug("Quark model is not in MX-FP4 format: dtype not fp4")
+            logger.debug(
+                "Quark model is not in OCP MX format: "
+                "weight_quant or input_quant not set"
+            )
             return False
 
         # Input and weight qscheme needs to be per group.
-        if weight_quant.get("qscheme") != "per_group" or input_quant.get(
-                "qscheme") != "per_group":
-            logger.debug("Quark model is not in MX-FP4 format: not per_group")
+        if (
+            weight_quant.get("qscheme") != "per_group"
+            or input_quant.get("qscheme") != "per_group"
+        ):
+            logger.debug("Quark model is not in OCP MX format: not per_group")
             return False
 
         # Input and weight group size needs to be 32.
-        if weight_quant.get("group_size") != 32 or input_quant.get(
-                "group_size") != 32:
-            logger.debug(
-                "Quark model is not in MX-FP4 format: not group_size=32")
+        if weight_quant.get("group_size") != 32 or input_quant.get("group_size") != 32:
+            logger.debug("Quark model is not in OCP MX format: not group_size=32")
             return False
 
-        # Activations need to use dynamic quantization.
-        if input_quant.get("is_dynamic") is False:
-            logger.debug(
-                "Quark model is not in MX-FP4 format: not activation dynamic")
+        # Activations and weight scales need to be in e8m0 format.
+        if (
+            weight_quant.get("scale_format") != "e8m0"
+            or input_quant.get("scale_format") != "e8m0"
+        ):
+            logger.debug("Quark model is not in OCP MX format: not scale_format e8m0")
             return False
 
-        # Activations and weight scales need to be in e8m0 format.
-        if weight_quant.get("scale_format") != "e8m0" or input_quant.get(
-                "scale_format") != "e8m0":
+        # Input and weight dtypes need to be any of fp4,
+        # fp6_e3m2 or fp6_e3m2, possibly mixed.
+        if weight_quant.get("dtype") not in {
+            "fp4",
+            "fp6_e3m2",
+            "fp6_e2m3",
+        } or input_quant.get("dtype") not in {"fp4", "fp6_e3m2", "fp6_e2m3"}:
             logger.debug(
-                "Quark model is not in MX-FP4 format: not scale_format e8m0")
+                "Quark model is not in OCP MX format: dtype not fp4, fp6_e3m2, fp6_e2m3"
+            )
             return False
 
         return True
 
-    def _find_matched_config(self, layer_name: str,
-                             module: torch.nn.Module) -> dict[str, Any]:
-
+    def _find_matched_config(
+        self, layer_name: str, module: torch.nn.Module
+    ) -> dict[str, Any]:
         proj_name = layer_name.split(".")[-1]
         if proj_name in self.packed_modules_mapping:
             shard_proj_names = self.packed_modules_mapping[proj_name]
@@ -269,59 +343,72 @@ class QuarkConfig(QuantizationConfig):
                 for shard_name in shard_names
             ]
             if not all(
-                    deep_compare(q_config, shard_configs[0])
-                    for q_config in shard_configs):
+                deep_compare(q_config, shard_configs[0]) for q_config in shard_configs
+            ):
                 raise ValueError(
                     f"Found a different quantization configuration for "
                     f"{shard_proj_names} in {layer_name}. vLLM "
-                    "requires all to use the same scheme.")
+                    "requires all to use the same scheme."
+                )
             return shard_configs[0]
         else:
             layer_quant_config = cast(
-                dict[str, Any], self.quant_config.get("layer_quant_config"))
-            for name_pattern in layer_quant_config:
-                if fnmatch.fnmatch(layer_name, name_pattern):
-                    return layer_quant_config[name_pattern]
+                dict[str, Any], self.quant_config.get("layer_quant_config")
+            )
+
+            def _matches_pattern(layer_name, pattern):
+                if "*" not in pattern:
+                    return layer_name in pattern
+                return fnmatch.fnmatch(layer_name, pattern)
+
+            for name_pattern, config in layer_quant_config.items():
+                if _matches_pattern(layer_name, name_pattern):
+                    return config
 
             layer_type = cast(str, type(module))
             layer_type_quant_config = cast(
-                dict[str, Any],
-                self.quant_config.get("layer_type_quant_config"))
+                dict[str, Any], self.quant_config.get("layer_type_quant_config")
+            )
             if layer_type in layer_type_quant_config:
                 return layer_type_quant_config[layer_type]
 
             global_quant_config = cast(
-                dict[str, Any], self.quant_config.get("global_quant_config"))
+                dict[str, Any], self.quant_config.get("global_quant_config")
+            )
             return global_quant_config
 
     def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme":
         if config.get("output_tensors") or config.get("bias"):
             raise NotImplementedError(
                 "Currently, Quark models with output_tensors "
-                "and bias quantized are not supported")
+                "and bias quantized are not supported"
+            )
         weight_config = cast(dict[str, Any], config.get("weight"))
         input_config = cast(dict[str, Any], config.get("input_tensors"))
 
         if self._is_fp8_w8a8(weight_config, input_config):
             is_fp8_w8a8_supported = self._check_scheme_supported(
-                QuarkW8A8Fp8.get_min_capability(), error=False)
+                QuarkW8A8Fp8.get_min_capability(), error=False
+            )
             if is_fp8_w8a8_supported:
                 return QuarkW8A8Fp8(weight_config, input_config)
         elif self._is_static_tensor_w8a8(weight_config, input_config):
             weight_qscheme = cast(str, weight_config.get("qscheme"))
-            return QuarkW8A8Int8(qscheme=weight_qscheme,
-                                 is_static_input_scheme=True,
-                                 input_symmetric=input_config.get("symmetric"))
-        elif self._is_mx_fp4(weight_config, input_config):
-            return QuarkW4A4MXFP4(weight_config, input_config)
-
-        raise NotImplementedError("No quark compatible scheme was found. "
-                                  f"Weight config: {weight_config}, "
-                                  f"Input config: {input_config}")
-
-    def get_scheme(self, layer: torch.nn.Module,
-                   layer_name: str) -> "QuarkScheme":
-
+            return QuarkW8A8Int8(
+                qscheme=weight_qscheme,
+                is_static_input_scheme=True,
+                input_symmetric=input_config.get("symmetric"),
+            )
+        elif self._is_ocp_mx(weight_config, input_config):
+            return QuarkOCP_MX(weight_config, input_config)
+
+        raise NotImplementedError(
+            "No quark compatible scheme was found. "
+            f"Weight config: {weight_config}, "
+            f"Input config: {input_config}"
+        )
+
+    def get_scheme(self, layer: torch.nn.Module, layer_name: str) -> "QuarkScheme":
         layer_quant_config = self._find_matched_config(layer_name, layer)
 
         # Find the quant_scheme
@@ -332,10 +419,10 @@ class QuarkConfig(QuantizationConfig):
 
         return scheme
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         """
         Check whether the param name matches the format for k/v cache scales
-        in quark. If this is the case, return its equivalent param name 
+        in quark. If this is the case, return its equivalent param name
         expected by vLLM
 
         :param name: param name
@@ -355,18 +442,22 @@ class QuarkConfig(QuantizationConfig):
 
 
 class QuarkLinearMethod(LinearMethodBase):
-
     def __init__(self, quantization_config: QuarkConfig):
         self.quantization_config = quantization_config
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.scheme.process_weights_after_loading(layer)
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         """
         Use the CompressedTensorsScheme associated with each layer to create
         the necessary parameters for the layer. See LinearMethodBase for param
@@ -380,12 +471,15 @@ class QuarkLinearMethod(LinearMethodBase):
             output_partition_sizes=output_partition_sizes,
             output_size=output_size,
             params_dtype=params_dtype,
-            weight_loader=weight_loader)
-
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None):
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ):
         """
         Use the output of create_weights and the CompressedTensorsScheme
         associated with the layer to apply the forward pass with the
@@ -409,7 +503,7 @@ class QuarkKVCacheMethod(BaseKVCacheMethod):
         super().__init__(quant_config)
 
     @staticmethod
-    def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]):
+    def validate_kv_cache_config(kv_cache_config: dict[str, Any] | None):
         """
         Validator for the kv cache configuration. Useful for controlling the
         kv cache quantization schemes, that are being supported in vLLM
@@ -422,11 +516,13 @@ class QuarkKVCacheMethod(BaseKVCacheMethod):
         if dtype != "fp8_e4m3":
             raise NotImplementedError(
                 "Currently supported kv cache quantization is "
-                f"dtype=fp8_e4m3, however received {dtype}")
+                f"dtype=fp8_e4m3, however received {dtype}"
+            )
 
         qscheme = kv_cache_config.get("qscheme")
         if qscheme != "per_tensor":
             raise NotImplementedError(
                 "Only support per-tensor scaling factor "
                 "for quark KV cache. "
-                f"Expected qscheme: per_tensor, found qscheme: {qscheme}")
+                f"Expected qscheme: per_tensor, found qscheme: {qscheme}"
+            )
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 24497cc756c1ea9e3d7983f5cdaa37f8329b6282..30772c3665b06163b8944f6e133617921dcab577 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,73 +1,79 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
-                                                  FusedMoEMethodBase,
-                                                  FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoE,
+    FusedMoEConfig,
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+)
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, fp8_w8a8_moe_quant_config,
-    mxfp4_w4a4_moe_quant_config)
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    is_rocm_aiter_moe_enabled)
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+    ocp_mx_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    prepare_moe_fp8_layer_for_marlin)
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    OCP_MX_BLOCK_SIZE)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+    prepare_moe_fp8_layer_for_marlin,
+)
+from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
+    OCP_MX_BLOCK_SIZE,
+    OCP_MX_Scheme,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
+    all_close_1d,
+    normalize_e4m3fn_to_e4m3fnuz,
+    per_tensor_dequantize,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
 
-__all__ = [
-    "QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod", "QuarkW4A4MXFp4MoEMethod"
-]
+__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod", "QuarkOCP_MX_MoEMethod"]
 
 
 class QuarkMoEMethod(FusedMoEMethodBase):
-
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
 
     @staticmethod
     def get_moe_method(
-            quant_config: "QuarkConfig",  # type: ignore # noqa E501 # noqa F821
-            module: torch.nn.Module,
-            layer_name: str) -> "QuarkMoEMethod":
-        layer_quant_config = quant_config._find_matched_config(
-            layer_name, module)
-
-        if (layer_quant_config.get("output_tensors")
-                or layer_quant_config.get("bias")):
-            raise NotImplementedError("Currently, Quark models with "
-                                      "output_tensors and bias "
-                                      "quantized are not supported")
+        quant_config: "QuarkConfig",  # type: ignore # noqa E501 # noqa F821
+        module: torch.nn.Module,
+        layer_name: str,
+    ) -> "QuarkMoEMethod":
+        layer_quant_config = quant_config._find_matched_config(layer_name, module)
+
+        if layer_quant_config.get("output_tensors") or layer_quant_config.get("bias"):
+            raise NotImplementedError(
+                "Currently, Quark models with "
+                "output_tensors and bias "
+                "quantized are not supported"
+            )
         weight_config = layer_quant_config.get("weight")
         input_config = layer_quant_config.get("input_tensors")
 
         if quant_config._is_fp8_w8a8(weight_config, input_config):
-            return QuarkW8A8Fp8MoEMethod(weight_config, input_config,
-                                         module.moe_config)
-        elif quant_config._is_mx_fp4(weight_config, input_config):
-            return QuarkW4A4MXFp4MoEMethod(weight_config, input_config,
-                                           module.moe_config)
+            return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config)
+        elif quant_config._is_ocp_mx(weight_config, input_config):
+            return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config)
         else:
             raise RuntimeError("Unsupported FusedMoe scheme")
 
 
 class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
-
     def __init__(
         self,
         weight_config: dict[str, Any],
@@ -80,38 +86,50 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
 
         self.weight_qscheme = self.weight_quant.get("qscheme")
         self.input_qscheme = self.input_quant.get("qscheme")
-        per_tensor = (self.weight_qscheme == "per_tensor"
-                      and self.input_qscheme == "per_tensor")
-        per_channel = (self.weight_qscheme == "per_channel"
-                       and self.input_qscheme == "per_channel")
-        self.act_quant_group_shape = GroupShape.PER_TOKEN \
-            if per_channel else GroupShape.PER_TENSOR
+        per_tensor = (
+            self.weight_qscheme == "per_tensor" and self.input_qscheme == "per_tensor"
+        )
+        per_channel = (
+            self.weight_qscheme == "per_channel" and self.input_qscheme == "per_channel"
+        )
+        self.act_quant_group_shape = (
+            GroupShape.PER_TOKEN if per_channel else GroupShape.PER_TENSOR
+        )
         if not (per_tensor or per_channel):
             raise ValueError(
                 "For FP8 Fused MoE layers, only per-tensor and per-channel "
                 "scales for weights and activations are supported. Found "
-                f"{self.weight_qscheme}, {self.input_qscheme}")  # noqa E501
+                f"{self.weight_qscheme}, {self.input_qscheme}"
+            )  # noqa E501
 
         self.static_input_scales = not self.input_quant.get("is_dynamic")
         if self.static_input_scales and per_channel:
             raise ValueError(
                 "For FP8 Fused MoE layer, we require either per tensor or "
-                "channelwise, dynamic per token quantization.")
+                "channelwise, dynamic per token quantization."
+            )
 
         # For GPUs that lack FP8 hardware support, we can leverage the Marlin
         # kernel for fast weight-only FP8 quantization
-        self.use_marlin = (not current_platform.has_device_capability(89)
-                           or envs.VLLM_TEST_FORCE_FP8_MARLIN)
+        self.use_marlin = (
+            not current_platform.has_device_capability(89)
+            or envs.VLLM_TEST_FORCE_FP8_MARLIN
+        )
         # Disable marlin for rocm
         if current_platform.is_rocm():
             self.use_marlin = False
 
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
-
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.hidden_size = hidden_size
         layer.num_experts = num_experts
@@ -120,21 +138,27 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         params_dtype = torch.float8_e4m3fn
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
@@ -142,48 +166,54 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         if self.weight_qscheme == "per_tensor":
             # Allocate 2 scales for w1 and w3 respectively.
             # They are combined to a single scale after weight loading.
-            w13_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, 2, dtype=torch.float32),
-                                                  requires_grad=False)
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            w2_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                 requires_grad=False)
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
             # Add PER-TENSOR quantization for FusedMoE.weight_loader.
             extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            )
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
         elif self.weight_qscheme == "per_channel":
             # quark's scale is 1 dim.
-            w13_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                dtype=torch.float32),
-                                                  requires_grad=False)
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
-            w2_weight_scale = torch.nn.Parameter(torch.ones(
-                num_experts, hidden_size, dtype=torch.float32),
-                                                 requires_grad=False)
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
             # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
             extra_weight_attrs.update(
-                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value})
+                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+            )
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         # INPUT_SCALES
         if self.static_input_scales:
-            w13_input_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                 requires_grad=False)
+            w13_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w13_input_scale", w13_input_scale)
             set_weight_attrs(w13_input_scale, extra_weight_attrs)
 
-            w2_input_scale = torch.nn.Parameter(torch.ones(
-                num_experts, dtype=torch.float32),
-                                                requires_grad=False)
+            w2_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
             layer.register_parameter("w2_input_scale", w2_input_scale)
             set_weight_attrs(w2_input_scale, extra_weight_attrs)
         else:
@@ -194,46 +224,53 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         # Fp8 moe kernels require a single activation scale.
         # We take the max of all the scales in case they differ.
         if self.static_input_scales:
-            if (layer.w13_input_scale is None or layer.w2_input_scale is None):
+            if layer.w13_input_scale is None or layer.w2_input_scale is None:
                 raise ValueError(
                     "QuantConfig has static quantization, but found "
-                    "activation scales are None.")
-            if (not all_close_1d(layer.w13_input_scale)
-                    or not all_close_1d(layer.w2_input_scale)):
+                    "activation scales are None."
+                )
+            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                layer.w2_input_scale
+            ):
                 logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer. ")
+                    "for each layer. "
+                )
             layer.w13_input_scale = torch.nn.Parameter(
-                layer.w13_input_scale.max(), requires_grad=False)
+                layer.w13_input_scale.max(), requires_grad=False
+            )
             layer.w2_input_scale = torch.nn.Parameter(
-                layer.w2_input_scale.max(), requires_grad=False)
+                layer.w2_input_scale.max(), requires_grad=False
+            )
 
         if current_platform.is_fp8_fnuz():
             # Normalize the weights and scales
-            w13_weight, w13_weight_scale, w13_input_scale = \
-                normalize_e4m3fn_to_e4m3fnuz(
-                    layer.w13_weight, layer.w13_weight_scale,
-                    layer.w13_input_scale)
-            w2_weight, w2_weight_scale, w2_input_scale = \
+            w13_weight, w13_weight_scale, w13_input_scale = (
                 normalize_e4m3fn_to_e4m3fnuz(
-                    layer.w2_weight, layer.w2_weight_scale,
-                    layer.w2_input_scale)
+                    layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+                )
+            )
+            w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+            )
             # Reset the parameter
-            layer.w13_weight = torch.nn.Parameter(w13_weight,
-                                                  requires_grad=False)
-            layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale,
-                                                        requires_grad=False)
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                w13_weight_scale, requires_grad=False
+            )
             if w13_input_scale is not None:
-                layer.w13_input_scale = torch.nn.Parameter(w13_input_scale,
-                                                           requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight,
-                                                 requires_grad=False)
-            layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
-                                                       requires_grad=False)
+                layer.w13_input_scale = torch.nn.Parameter(
+                    w13_input_scale, requires_grad=False
+                )
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale = torch.nn.Parameter(
+                w2_weight_scale, requires_grad=False
+            )
             if w2_input_scale is not None:
-                layer.w2_input_scale = torch.nn.Parameter(w2_input_scale,
-                                                          requires_grad=False)
+                layer.w2_input_scale = torch.nn.Parameter(
+                    w2_input_scale, requires_grad=False
+                )
 
         # For per-tensor case, Fp8 moe kernel needs single weight scale
         # for w13 per expert. Use max then dequant and requant each expert.
@@ -245,59 +282,54 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 start = 0
                 for shard_id in range(2):
                     dq_weight = per_tensor_dequantize(
-                        layer.w13_weight[expert_id][start:start +
-                                                    shard_size, :],
-                        layer.w13_weight_scale[expert_id][shard_id])
-                    layer.w13_weight[expert_id][
-                        start:start + shard_size, :], _ = ops.scaled_fp8_quant(
-                            dq_weight, max_w13_scales[expert_id])
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
+                    )
+                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
+                        ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    )
                     start += shard_size
 
-            layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
-                                                        requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )
         # quark's scale is 1 dim.
         elif self.weight_qscheme == "per_channel":
             if self.act_quant_group_shape == GroupShape.PER_TOKEN:
                 w13_weight_scale = layer.w13_weight_scale.unsqueeze(-1)
                 layer.w13_weight_scale = torch.nn.Parameter(
-                    w13_weight_scale, requires_grad=False)
+                    w13_weight_scale, requires_grad=False
+                )
                 w2_weight_scale = layer.w2_weight_scale.unsqueeze(-1)
-                layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
-                                                           requires_grad=False)
+                layer.w2_weight_scale = torch.nn.Parameter(
+                    w2_weight_scale, requires_grad=False
+                )
         # Property to determine if AITER is used
         if self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
-                rocm_aiter_fused_experts, shuffle_weights)
-
             # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = shuffle_weights(
-                layer.w13_weight.data, layer.w2_weight.data)
+            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
+                layer.w13_weight.data, layer.w2_weight.data
+            )
 
-            layer.w13_weight = torch.nn.Parameter(shuffled_w13,
-                                                  requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(shuffled_w2,
-                                                 requires_grad=False)
+            layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
-            self.rocm_aiter_fused_experts_func = rocm_aiter_fused_experts
         elif self.use_marlin:
-
             prepare_moe_fp8_layer_for_marlin(layer, False)
             # Activations not quantized for marlin.
             del layer.w13_input_scale
             del layer.w2_input_scale
-            self.fused_experts_func = None
-        else:
-            from vllm.model_executor.layers.fused_moe import fused_experts
-            self.fused_experts_func = fused_experts
 
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
         return fp8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
-            per_act_token_quant=self.weight_qscheme == "per_channel",
+            per_act_token_quant=self.input_qscheme == "per_channel",
+            per_out_ch_quant=self.weight_qscheme == "per_channel",
         )
 
     def apply(
@@ -308,26 +340,25 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet.")
+                "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet."
+            )
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -341,10 +372,15 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
         if self.rocm_aiter_moe_enabled:
-            return self.rocm_aiter_fused_experts_func(
+            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+                rocm_aiter_fused_experts,
+            )
+
+            return rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
@@ -353,11 +389,11 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 quant_config=self.moe_quant_config,
-                expert_map=expert_map)
-        if self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
-            return torch.ops.vllm.fused_marlin_moe(
+                expert_map=expert_map,
+            )
+        elif self.use_marlin:
+            assert activation == "silu", f"{activation} not supported for Marlin MoE."
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -371,26 +407,27 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
-                expert_map=expert_map)
-
-        assert self.fused_experts_func is not None
-
-        return self.fused_experts_func(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            quant_config=self.moe_quant_config)
+                expert_map=expert_map,
+            )
+        else:
+            from vllm.model_executor.layers.fused_moe import fused_experts
 
+            return fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+                quant_config=self.moe_quant_config,
+            )
 
-class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
 
+class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
     def __init__(
         self,
         weight_config: dict[str, Any],
@@ -403,64 +440,98 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
 
         weight_qscheme = self.weight_quant.get("qscheme")
         input_qscheme = self.input_quant.get("qscheme")
-        if not (weight_qscheme == "per_group"
-                and input_qscheme == "per_group"):
+        if not (weight_qscheme == "per_group" and input_qscheme == "per_group"):
             raise ValueError(
                 "For MX(FP4) Fused MoE layers, only per-group scales "
                 "for weights and activations are supported. Found "
-                f"{weight_qscheme}, {input_qscheme}")  # noqa E501
+                f"{weight_qscheme}, {input_qscheme}"
+            )  # noqa E501
 
         self.static_input_scales = not self.input_quant.get("is_dynamic")
 
+        self.weight_dtype = self.weight_quant["dtype"].replace("fp", "mxfp")
+        self.input_dtype = self.input_quant["dtype"].replace("fp", "mxfp")
+        self.fp4_dtype = getattr(torch, "float4_e2m1fn_x2", None)
+
+        self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype(
+            self.input_dtype, self.weight_dtype
+        )
+
         if self.static_input_scales:
             raise NotImplementedError(
-                "QuarkW4A4MXFp4MoEMethod with static input scales is currently "
-                "not implemented. Please open an issue.")
+                "QuarkOCP_MX_MoEMethod with static input scales is currently "
+                "not implemented. Please open an issue."
+            )
+
+        self.use_rocm_aiter_moe = rocm_aiter_ops.is_fused_moe_enabled()
 
-        if not current_platform.supports_mx():
-            self.emulate = True
+        self.emulate = not current_platform.supports_mx() or not (
+            self.use_rocm_aiter_moe and self.ocp_mx_scheme == "w_mxfp4_a_mxfp4"
+        )
+        if self.emulate:
             logger.warning_once(
-                "The current platform does not support native MXFP4 "
+                f"The current mode (supports_mx={current_platform.supports_mx()}, "
+                f"use_mxfp4_aiter_moe={self.use_rocm_aiter_moe}, "
+                f"ocp_mx_scheme={self.ocp_mx_scheme}) "
+                "does not support native MXFP4/MXFP6 "
                 "computation. Simulated weight dequantization and activation "
                 "QDQ (quantize and dequantize) will be used, with the linear "
-                "layers computed in high precision.")
+                "layers computed in high precision."
+            )
         else:
-            self.emulate = True
             logger.warning_once(
-                "The current platform supports native MXFP4 "
-                "computation, but kernels are not yet integrated in vLLM. "
-                "Simulated weight dequantization and activation "
-                "QDQ (quantize and dequantize) will be used, with the linear "
-                "layers computed in high precision.")
+                "The current mode supports native MoE MXFP4 computation"
+            )
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
+    def get_packed_dim(self, dim: int, quant_dtype: str):
+        if quant_dtype == "mxfp4":
+            assert dim % 2 == 0
+            return dim // 2
+        else:
+            # FP6 packs 4 * 6 = 24 bits on 3 bytes.
+            assert (dim * 3) % 4 == 0
+            return (dim * 3) // 4
 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
         extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value})
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
 
         params_dtype = torch.uint8
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size // 2,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                self.get_packed_dim(hidden_size, self.weight_dtype),
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_weight", w13_weight)
 
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition // 2,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                self.get_packed_dim(intermediate_size_per_partition, self.weight_dtype),
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_weight", w2_weight)
 
         set_weight_attrs(w2_weight, extra_weight_attrs)
@@ -490,9 +561,41 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
 
+    def process_weights_after_loading(self, layer):
+        if self.emulate:
+            return
+
+        from aiter.utility.fp4_utils import e8m0_shuffle
+
+        # Pre-shuffle weight scales
+        s0, s1, _ = layer.w13_weight_scale.shape
+        w13_weight_scale = layer.w13_weight_scale.view(s0 * s1, -1)
+        w13_weight_scale = e8m0_shuffle(w13_weight_scale)
+        layer.w13_weight_scale.data = w13_weight_scale.view(s0, s1, -1)
+
+        s0, s1, _ = layer.w2_weight_scale.shape
+        w2_weight_scale = layer.w2_weight_scale.view(s0 * s1, -1)
+        w2_weight_scale = e8m0_shuffle(w2_weight_scale)
+        layer.w2_weight_scale.data = w2_weight_scale.view(s0, s1, -1)
+
+        if self.fp4_dtype is not None:
+            layer.w13_weight = torch.nn.Parameter(
+                layer.w13_weight.view(self.fp4_dtype),
+                requires_grad=layer.w13_weight.requires_grad,
+            )
+            layer.w2_weight = torch.nn.Parameter(
+                layer.w2_weight.view(self.fp4_dtype),
+                requires_grad=layer.w2_weight.requires_grad,
+            )
+
+        torch.cuda.empty_cache()
+
     def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        return mxfp4_w4a4_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return ocp_mx_moe_quant_config(
+            quant_dtype=self.input_dtype,
+            weight_dtype=self.weight_dtype,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             a1_scale=None,
@@ -500,6 +603,10 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
             block_shape=None,
         )
 
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -508,28 +615,25 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
-                "EPLB not supported for `QuarkW4A4MXFp4MoEMethod` yet.")
-
-        from vllm.model_executor.layers.fused_moe import fused_experts
+                "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet."
+            )
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -543,19 +647,37 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
-
-        out = fused_experts(
-            x,
-            layer.w13_weight,
-            layer.w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            expert_map=expert_map,
-            quant_config=self.moe_quant_config,
+            indices_type=self.topk_indices_dtype,
         )
+
+        if not self.emulate:
+            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+                rocm_aiter_fused_experts,
+            )
+
+            out = rocm_aiter_fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=activation,
+                quant_config=self.moe_quant_config,
+            )
+        else:
+            from vllm.model_executor.layers.fused_moe import fused_experts
+
+            out = fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                activation=activation,
+                global_num_experts=global_num_experts,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                expert_map=expert_map,
+                quant_config=self.moe_quant_config,
+            )
         return out
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
index ec09d9b2ac26f7c2e96ca9dc06b6f0085911ed67..7620d6e41b58a532d3ed4f24d2fcdebbe7336183 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from .quark_ocp_mx import QuarkOCP_MX
 from .quark_scheme import QuarkScheme
-from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
 from .quark_w8a8_fp8 import QuarkW8A8Fp8
 from .quark_w8a8_int8 import QuarkW8A8Int8
 
-__all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkW4A4MXFP4"]
+__all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkOCP_MX"]
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
new file mode 100644
index 0000000000000000000000000000000000000000..007e78e68d5cdcb0ba01ad96d04fe499610bfa24
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -0,0 +1,306 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+from fractions import Fraction
+from functools import cache, partial
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
+    dequant_mxfp4,
+    quant_dequant_mxfp4,
+)
+from vllm.model_executor.layers.quantization.utils.mxfp6_utils import (
+    dequant_mxfp6,
+    quant_dequant_mxfp6,
+)
+from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
+    OCP_MX_BLOCK_SIZE,
+    OCP_MX_Scheme,
+)
+from vllm.model_executor.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from vllm.platforms import current_platform
+
+from .quark_scheme import QuarkScheme
+
+logger = init_logger(__name__)
+
+
+# TODO: move registration of custom op to aiter_ops.py
+# `from vllm._aiter_ops import rocm_aiter_ops`
+# use `rocm_aiter_ops.is_asm_fp4_gemm_dynamic_quant_enabled()`
+# for envs checks which does not require @cache anymore.
+# triton kernel is torch compile compatible.
+# does not require direct registeration.
+# use `rocm_aiter_ops.triton_fp4_gemm_dynamic_qaunt`.
+@cache
+def is_rocm_aiter_fp4_asm_gemm_enabled() -> bool:
+    return (
+        current_platform.is_rocm()
+        and envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM
+        and envs.VLLM_ROCM_USE_AITER
+    )
+
+
+try:
+    from aiter.ops.shuffle import shuffle_weight
+    from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
+    from aiter.ops.triton.quant import dynamic_mxfp4_quant
+
+    from vllm.utils.torch_utils import direct_register_custom_op
+
+    if is_rocm_aiter_fp4_asm_gemm_enabled():
+        from aiter import gemm_a4w4, per_1x32_f4_quant_hip
+
+    def gemm_with_dynamic_quant(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        rocm_use_aiter_fp4_asm_gemm: bool = False,
+        out_dtype: torch.dtype | None = torch.bfloat16,
+        x_scales: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        M = x.shape[0]
+        if rocm_use_aiter_fp4_asm_gemm:
+            if x_scales is None:
+                # use hip quant kernel for performance
+                x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True)
+            else:
+                x_q = x
+                x_s = x_scales
+
+            # 32 alignment is enough for dim0 padding of output for
+            # gemm_a4w4 kernel
+            y = torch.empty(
+                (M + 31) // 32 * 32, weight.shape[0], device=x_q.device, dtype=out_dtype
+            )
+
+            gemm_a4w4(
+                x_q, weight, x_s, weight_scale.view(x_s.dtype), y, bpreshuffle=True
+            )
+            return y[:M]
+        else:
+            if x_scales is None:
+                x_q, x_s = dynamic_mxfp4_quant(x)
+            else:
+                x_q = x
+                x_s = x_scales
+            y = torch.empty(
+                x_q.shape[0], weight.shape[0], device=x_q.device, dtype=out_dtype
+            )
+
+            gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y)
+            return y
+
+    def gemm_with_dynamic_quant_fake(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        x_scales: torch.Tensor = None,
+        rocm_use_aiter_fp4_asm_gemm: bool = False,
+        out_dtype: torch.dtype | None = torch.bfloat16,
+    ) -> torch.Tensor:
+        return torch.empty(
+            (*x.shape[:-1], weight.shape[0]), dtype=out_dtype, device=x.device
+        )
+
+    direct_register_custom_op(
+        op_name="gemm_with_dynamic_quant",
+        op_func=gemm_with_dynamic_quant,
+        mutates_args=[],
+        fake_impl=gemm_with_dynamic_quant_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+except (ImportError, AttributeError):
+    dynamic_mxfp4_quant = gemm_afp4wfp4 = None
+
+
+class QuarkOCP_MX(QuarkScheme):
+    def __init__(
+        self, weight_quant_spec: dict[str, Any], input_quant_spec: dict[str, Any]
+    ):
+        self.out_dtype = torch.get_default_dtype()
+        self.qscheme = "per_group"
+        self.weight_quant_spec = weight_quant_spec
+        self.input_quant_spec = input_quant_spec
+
+        self.weight_dtype = weight_quant_spec["dtype"].replace("fp", "mxfp")
+        self.input_dtype = input_quant_spec["dtype"].replace("fp", "mxfp")
+
+        self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype(
+            self.input_dtype, self.weight_dtype
+        )
+
+        if self.weight_dtype == "mxfp4":
+            self.packed_factor: int | Fraction = 2
+            self.dequant_func = dequant_mxfp4
+        else:
+            self.packed_factor = Fraction(numerator=8, denominator=6)
+            self.dequant_func = partial(
+                dequant_mxfp6, quant_dtype=self.weight_dtype.replace("mx", "")
+            )
+
+        if self.input_dtype == "mxfp4":
+            self.quant_dequant_func = quant_dequant_mxfp4
+        else:
+            self.quant_dequant_func = partial(
+                quant_dequant_mxfp6, quant_dtype=self.input_dtype.replace("mx", "")
+            )
+
+        self.static_input_scales = not input_quant_spec.get("is_dynamic")
+
+        if self.static_input_scales:
+            raise NotImplementedError(
+                "QuarkOCP_MX with static input scales is currently not "
+                "implemented. Please open an issue."
+            )
+
+        # TODO: integrate (or test) mixed-precision kernel.
+        self.emulate = not current_platform.supports_mx() or (
+            self.input_dtype != "mxfp4" or self.weight_dtype != "mxfp4"
+        )
+
+        self.rocm_use_aiter_fp4_asm_gemm = is_rocm_aiter_fp4_asm_gemm_enabled()
+
+        if not self.emulate and (dynamic_mxfp4_quant is None or gemm_afp4wfp4 is None):
+            # Currently need these kernels if not emulating
+            raise NotImplementedError(
+                f"{self.__class__.__name__} requires AITER to be installed "
+                "for non-emulation mode! Please refer to "
+                "https://github.com/ROCm/aiter for installation details."
+            )
+
+        if not current_platform.supports_mx():
+            logger.warning_once(
+                "The current platform does not support native MXFP4/MXFP6 "
+                "computation. Simulated weight dequantization and activation "
+                "QDQ (quantize and dequantize) will be used, with the linear "
+                "layers computed in high precision."
+            )
+
+        if current_platform.supports_mx() and (
+            self.input_dtype != "mxfp4" or self.weight_dtype != "mxfp4"
+        ):
+            logger.warning_once(
+                "The current platform supports native MXFP4/MXFP6 "
+                f"computation, but kernels for input_dtype={self.input_dtype} "
+                f"and weight_dtype={self.weight_dtype} are not yet integrated "
+                "in vLLM. Simulated weight dequantization and activation "
+                "QDQ (quantize and dequantize) will be used, with the linear "
+                "layers computed in high precision."
+            )
+
+    def get_packed_dim(self, dim: int, quant_dtype: str):
+        if quant_dtype == "mxfp4":
+            assert dim % 2 == 0
+            return dim // 2
+        elif quant_dtype in {"mxfp6_e3m2", "mxfp6_e2m3"}:
+            # FP6 packs 4 * 6 = 24 bits on 3 bytes.
+            assert (dim * 3) % 4 == 0
+            return (dim * 3) // 4
+        else:
+            raise NotImplementedError(
+                "Unsupported quant_dtype in QuarkOCP_MX.get_packed_dim, "
+                f"got quant_dtype={quant_dtype}. Something is wrong, please "
+                "open an issue."
+            )
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
+
+        if self.emulate:
+            layer.weight_scale = torch.nn.Parameter(
+                layer.weight_scale.data, requires_grad=False
+            )
+        else:
+            if self.rocm_use_aiter_fp4_asm_gemm:
+                # shuffle weight scale
+                weight_scale_shuffle = layer.weight_scale.data
+                sm, sn = weight_scale_shuffle.shape
+                weight_scale_shuffle = weight_scale_shuffle.view(
+                    sm // 32, 2, 16, sn // 8, 2, 4, 1
+                )
+                weight_scale_shuffle = weight_scale_shuffle.permute(
+                    0, 3, 5, 2, 4, 1, 6
+                ).contiguous()
+                weight_scale_shuffle = weight_scale_shuffle.view(sm, sn)
+                layer.weight_scale = torch.nn.Parameter(
+                    weight_scale_shuffle, requires_grad=False
+                )
+
+                # shuffle weight
+                weight_shuffle = layer.weight.data
+                weight_shuffle = shuffle_weight(weight_shuffle, layout=(16, 16))
+                layer.weight = torch.nn.Parameter(weight_shuffle, requires_grad=False)
+            else:
+                layer.weight_scale = torch.nn.Parameter(
+                    layer.weight_scale.data.T.contiguous(), requires_grad=False
+                )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = PackedvLLMParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                self.get_packed_dim(input_size_per_partition, self.weight_dtype),
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            packed_dim=1,
+            packed_factor=self.packed_factor,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // OCP_MX_BLOCK_SIZE,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if self.emulate:
+            dq_w = self.dequant_func(layer.weight, layer.weight_scale, x.dtype)
+            qdq_x = self.quant_dequant_func(x)
+            return F.linear(qdq_x, dq_w, bias)
+        else:
+            return torch.ops.vllm.gemm_with_dynamic_quant(
+                x,
+                layer.weight,
+                layer.weight_scale,
+                self.rocm_use_aiter_fp4_asm_gemm,
+                self.out_dtype,
+            )
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
index c167e949ac262df1eab312116a2d5bd63e11c3dc..412a07a85fe751c1b4fbad342da61be0a7a1fcfc 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -11,7 +10,7 @@ __all__ = ["QuarkScheme"]
 
 class QuarkScheme(ABC):
     """
-    Abstract class used to describe the weight creation and forward pass 
+    Abstract class used to describe the weight creation and forward pass
     of different quantization schemes supported by Quark.
     """
 
@@ -26,20 +25,21 @@ class QuarkScheme(ABC):
     @abstractmethod
     def create_weights(self, *args, **kwargs):
         """
-        Weight creation for the particular scheme. Inputs to this function 
+        Weight creation for the particular scheme. Inputs to this function
 
         """
         raise NotImplementedError
 
     @abstractmethod
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]):
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ):
         """
-        Run the forward pass for the particular scheme. This is where 
+        Run the forward pass for the particular scheme. This is where
         scheme-specific dequant/quant steps/kernels should be applied.
 
-        :param layer: torch.nn.Module with the registered weights and 
-            other parameters relevant to the particular scheme. 
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
         :param x: input to the layer
         :param bias: bias parameter
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
deleted file mode 100644
index f8628a82277b5c61c0d41dc2cf45d7f25b709e73..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from functools import cache
-from typing import Any, Callable, Optional
-
-import torch
-import torch.nn.functional as F
-
-from vllm import envs
-from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    OCP_MX_BLOCK_SIZE, dequant_mxfp4, quant_dequant_mxfp4)
-from vllm.model_executor.parameter import (GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
-from vllm.platforms import current_platform
-
-
-@cache
-def is_rocm_aiter_fp4_asm_gemm_enabled() -> bool:
-    return current_platform.is_rocm() \
-        and envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM \
-        and envs.VLLM_ROCM_USE_AITER
-
-
-try:
-    from aiter.ops.shuffle import shuffle_weight
-    from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
-    from aiter.ops.triton.quant import dynamic_mxfp4_quant
-
-    from vllm.utils import direct_register_custom_op
-    if is_rocm_aiter_fp4_asm_gemm_enabled():
-        from aiter import gemm_a4w4, per_1x32_f4_quant_hip
-
-    def gemm_with_dynamic_quant(
-        x: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        rocm_use_aiter_fp4_asm_gemm: bool = False,
-        out_dtype: Optional[torch.dtype] = torch.bfloat16,
-        x_scales: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        M = x.shape[0]
-        if rocm_use_aiter_fp4_asm_gemm:
-            if x_scales is None:
-                # use hip quant kernel for performance
-                x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True)
-            else:
-                x_q = x
-                x_s = x_scales
-
-            # 32 alignment is enough for dim0 padding of output for
-            # gemm_a4w4 kernel
-            y = torch.empty((M + 31) // 32 * 32,
-                            weight.shape[0],
-                            device=x_q.device,
-                            dtype=out_dtype)
-
-            gemm_a4w4(x_q,
-                      weight,
-                      x_s,
-                      weight_scale.view(x_s.dtype),
-                      y,
-                      bpreshuffle=True)
-            return y[:M]
-        else:
-            if x_scales is None:
-                x_q, x_s = dynamic_mxfp4_quant(x)
-            else:
-                x_q = x
-                x_s = x_scales
-            y = torch.empty(x_q.shape[0],
-                            weight.shape[0],
-                            device=x_q.device,
-                            dtype=out_dtype)
-
-            gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y)
-            return y
-
-    def gemm_with_dynamic_quant_fake(
-        x: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        x_scales: torch.Tensor = None,
-        rocm_use_aiter_fp4_asm_gemm: bool = False,
-        out_dtype: Optional[torch.dtype] = torch.bfloat16,
-    ) -> torch.Tensor:
-        return torch.empty((*x.shape[:-1], weight.shape[0]),
-                           dtype=out_dtype,
-                           device=x.device)
-
-    direct_register_custom_op(
-        op_name="gemm_with_dynamic_quant",
-        op_func=gemm_with_dynamic_quant,
-        mutates_args=[],
-        fake_impl=gemm_with_dynamic_quant_fake,
-        dispatch_key=current_platform.dispatch_key,
-    )
-
-except ImportError:
-    dynamic_mxfp4_quant = gemm_afp4wfp4 = None
-
-__all__ = ["QuarkW4A4MXFP4"]
-
-
-class QuarkW4A4MXFP4(QuarkScheme):
-
-    def __init__(self, weight_quant_spec: dict[str, Any],
-                 input_quant_spec: dict[str, Any]):
-        self.out_dtype = torch.get_default_dtype()
-        self.qscheme = "per_group"
-        self.weight_quant_spec = weight_quant_spec
-        self.input_quant_spec = input_quant_spec
-        self.emulate = not current_platform.supports_mx()
-        self.rocm_use_aiter_fp4_asm_gemm = is_rocm_aiter_fp4_asm_gemm_enabled()
-        if not self.emulate and (dynamic_mxfp4_quant is None
-                                 or gemm_afp4wfp4 is None):
-            # Currently need these kernels if not emulating
-            raise NotImplementedError(
-                f"{self.__class__.__name__} requires AITER to be installed "
-                "for non-emulation mode! Please refer to "
-                "https://github.com/ROCm/aiter for installation details.")
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        return 70
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.weight = torch.nn.Parameter(layer.weight.data,
-                                          requires_grad=False)
-
-        if self.emulate:
-            layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
-                                                    requires_grad=False)
-            try:
-                from quark.torch.export.nn.modules import realquantizer
-                from quark.torch.quantization.config.config import (
-                    QuantizationSpec)
-            except ImportError as err:
-                raise ImportError(
-                    "The package `amd-quark` is required to use AMD Quark "
-                    "MX-FP4 models. Please install it with `pip install "
-                    "amd-quark`.") from err
-
-            weight_quant_spec = QuantizationSpec.from_dict(
-                self.weight_quant_spec)
-
-            weight_quantizer = realquantizer.get_real_quantizer(
-                qspec=weight_quant_spec,
-                quantizer=None,
-                real_quantized=True,
-                reorder=False,
-                float_dtype=self.out_dtype,
-                scale_shape=layer.weight_scale.shape,
-                zero_point_shape=None,
-            )
-            weight_quantizer.scale.data = layer.weight_scale.data
-
-            layer.weight = torch.nn.Parameter(
-                weight_quantizer(layer.weight.data).to(self.out_dtype),
-                requires_grad=False,
-            )
-            layer.weight_scale = None
-
-            # This call is necessary to release the scales memory.
-            torch.cuda.empty_cache()
-        else:
-            if self.rocm_use_aiter_fp4_asm_gemm:
-                # shuffle weight scale
-                weight_scale_shuffle = layer.weight_scale.data
-                sm, sn = weight_scale_shuffle.shape
-                weight_scale_shuffle = weight_scale_shuffle.view(
-                    sm // 32, 2, 16, sn // 8, 2, 4, 1)
-                weight_scale_shuffle = weight_scale_shuffle.permute(
-                    0, 3, 5, 2, 4, 1, 6).contiguous()
-                weight_scale_shuffle = weight_scale_shuffle.view(sm, sn)
-                layer.weight_scale = torch.nn.Parameter(weight_scale_shuffle,
-                                                        requires_grad=False)
-
-                # shuffle weight
-                weight_shuffle = layer.weight.data
-                weight_shuffle = shuffle_weight(weight_shuffle,
-                                                layout=(16, 16))
-                layer.weight = torch.nn.Parameter(weight_shuffle,
-                                                  requires_grad=False)
-            else:
-                layer.weight_scale = torch.nn.Parameter(
-                    layer.weight_scale.data.T.contiguous(),
-                    requires_grad=False)
-
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-        output_size_per_partition = sum(output_partition_sizes)
-        layer.logical_widths = output_partition_sizes
-
-        # WEIGHT
-        weight = PackedvLLMParameter(
-            data=torch.empty(
-                output_size_per_partition,
-                input_size_per_partition // 2,
-                dtype=torch.uint8,
-            ),
-            input_dim=1,
-            output_dim=0,
-            packed_dim=1,
-            packed_factor=2,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-        # WEIGHT SCALE
-        weight_scale = GroupQuantScaleParameter(
-            data=torch.empty(
-                output_size_per_partition,
-                input_size_per_partition // OCP_MX_BLOCK_SIZE,
-                dtype=torch.uint8,
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale", weight_scale)
-
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-        if self.emulate:
-            dq_w = dequant_mxfp4(layer.weight, layer.weight_scale, x.dtype)
-            x = quant_dequant_mxfp4(x)
-            return F.linear(x, dq_w, bias)
-        else:
-            return torch.ops.vllm.gemm_with_dynamic_quant(
-                x, layer.weight, layer.weight_scale,
-                self.rocm_use_aiter_fp4_asm_gemm, self.out_dtype)
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
index 2cb35249f49ef46110d73d3ee3b84376e1ad1bac..1e5ee93b61f2b4e9a66be61c88654c03b825ed40 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -1,43 +1,50 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, cast
+from collections.abc import Callable
+from typing import Any, cast
 
 import torch
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, normalize_e4m3fn_to_e4m3fnuz, requantize_with_max_scale)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+    Fp8LinearOp,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requantize_with_max_scale,
+)
+from vllm.model_executor.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 from vllm.platforms import current_platform
 
 __all__ = ["QuarkW8A8Fp8"]
 
 
 class QuarkW8A8Fp8(QuarkScheme):
-
-    def __init__(self, weight_config: dict[str, Any],
-                 input_config: Optional[dict[str, Any]]):
+    def __init__(
+        self, weight_config: dict[str, Any], input_config: dict[str, Any] | None
+    ):
         self.weight_qscheme = cast(str, weight_config.get("qscheme"))
         self.is_static_input_scheme: bool = False
-        self.input_qscheme: Optional[str] = None
+        self.input_qscheme: str | None = None
         if input_config is not None:
-            self.is_static_input_scheme = not cast(
-                bool, input_config.get("is_dynamic"))
+            self.is_static_input_scheme = not cast(bool, input_config.get("is_dynamic"))
             self.input_qscheme = cast(str, input_config.get("qscheme"))
 
-        per_token = (not self.is_static_input_scheme
-                     and self.input_qscheme == "per_channel")
-        self.act_quant_group_shape = GroupShape.PER_TOKEN \
-            if per_token else GroupShape.PER_TENSOR
+        per_token = (
+            not self.is_static_input_scheme and self.input_qscheme == "per_channel"
+        )
+        self.act_quant_group_shape = (
+            GroupShape.PER_TOKEN if per_token else GroupShape.PER_TENSOR
+        )
         self.fp8_linear = Fp8LinearOp(
             act_quant_static=self.is_static_input_scheme,
-            act_quant_group_shape=self.act_quant_group_shape)
+            act_quant_group_shape=self.act_quant_group_shape,
+        )
         self.out_dtype = torch.get_default_dtype()
 
     @classmethod
@@ -51,14 +58,14 @@ class QuarkW8A8Fp8(QuarkScheme):
         # requantize so we can always run per tensor
         if self.weight_qscheme == "per_tensor":
             if current_platform.is_fp8_fnuz():
-                input_scale = getattr(layer, 'input_scale', None)
+                input_scale = getattr(layer, "input_scale", None)
                 weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                     weight=layer.weight,
                     weight_scale=layer.weight_scale,
-                    input_scale=input_scale)
+                    input_scale=input_scale,
+                )
                 if input_scale is not None:
-                    layer.input_scale = Parameter(input_scale,
-                                                  requires_grad=False)
+                    layer.input_scale = Parameter(input_scale, requires_grad=False)
             else:
                 max_w_scale = layer.weight_scale
                 weight = layer.weight
@@ -77,15 +84,14 @@ class QuarkW8A8Fp8(QuarkScheme):
             weight = layer.weight
 
             if current_platform.is_fp8_fnuz():
-                input_scale = getattr(layer, 'input_scale', None)
-                weight, weight_scale, input_scale = \
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        weight=weight,
-                        weight_scale=layer.weight_scale,
-                        input_scale=input_scale)
+                input_scale = getattr(layer, "input_scale", None)
+                weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight,
+                    weight_scale=layer.weight_scale,
+                    input_scale=input_scale,
+                )
                 if input_scale is not None:
-                    layer.input_scale = Parameter(input_scale,
-                                                  requires_grad=False)
+                    layer.input_scale = Parameter(input_scale, requires_grad=False)
             else:
                 weight_scale = layer.weight_scale.data
             if self.act_quant_group_shape == GroupShape.PER_TOKEN:
@@ -95,32 +101,37 @@ class QuarkW8A8Fp8(QuarkScheme):
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
 
         else:
-            raise ValueError(
-                f"Unknown quantization scheme {self.weight_qscheme}")
+            raise ValueError(f"Unknown quantization scheme {self.weight_qscheme}")
 
         # INPUT SCALE
         if self.is_static_input_scheme:
-            layer.input_scale = Parameter(layer.input_scale.max(),
-                                          requires_grad=False)
+            layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
         else:
             layer.input_scale = None
 
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes
 
         # WEIGHT
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=torch.float8_e4m3fn),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
         # WEIGHT SCALE
@@ -128,15 +139,16 @@ class QuarkW8A8Fp8(QuarkScheme):
         # the newly added parameters
         if self.weight_qscheme == "per_channel":
             weight_scale = ChannelQuantScaleParameter(
-                data=torch.empty((sum(output_partition_sizes)),
-                                 dtype=torch.float32),
+                data=torch.empty((sum(output_partition_sizes)), dtype=torch.float32),
                 output_dim=0,
-                weight_loader=weight_loader)
+                weight_loader=weight_loader,
+            )
         else:
             assert self.weight_qscheme == "per_tensor"
-            weight_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                   weight_loader=weight_loader)
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
 
         # min requirement for fp8 kernels
         weight_scale[:] = torch.finfo(torch.float32).min
@@ -144,20 +156,24 @@ class QuarkW8A8Fp8(QuarkScheme):
 
         # INPUT SCALE
         if self.is_static_input_scheme:
-            input_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                  weight_loader=weight_loader)
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
             input_scale[:] = torch.finfo(torch.float32).min
             layer.register_parameter("input_scale", input_scale)
 
-    def apply_weights(self,
-                      layer: torch.nn.Module,
-                      x: torch.Tensor,
-                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-        return self.fp8_linear.apply(input=x,
-                                     weight=layer.weight,
-                                     weight_scale=layer.weight_scale,
-                                     out_dtype=self.out_dtype,
-                                     input_scale=layer.input_scale,
-                                     bias=bias)
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.fp8_linear.apply(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            out_dtype=self.out_dtype,
+            input_scale=layer.input_scale,
+            bias=bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
index ae68d5bbc26800a485a83bb3e7523713470dc9d3..42d2ed2e85ed9aa7f098355a769cf1193c0bd373 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -1,18 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
-    ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel)
+    ScaledMMLinearLayerConfig,
+    choose_scaled_mm_linear_kernel,
+)
 from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
-from vllm.model_executor.parameter import (BasevLLMParameter,
-                                           ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
-                                           PerTensorScaleParameter)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
 
 logger = init_logger(__name__)
 
@@ -20,8 +24,12 @@ logger = init_logger(__name__)
 class QuarkW8A8Int8(QuarkScheme):
     _kernel_backends_being_used: set[str] = set()
 
-    def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool],
-                 input_symmetric: Optional[bool]):
+    def __init__(
+        self,
+        qscheme: str,
+        is_static_input_scheme: bool | None,
+        input_symmetric: bool | None,
+    ):
         self.qscheme = qscheme
         self.is_static_input_scheme = is_static_input_scheme
         self.input_symmetric = input_symmetric
@@ -31,92 +39,101 @@ class QuarkW8A8Int8(QuarkScheme):
         # turing and up
         return 75
 
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
         layer.logical_widths = output_partition_sizes
 
         scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig(
             is_channelwise=(self.qscheme == "per_channel"),
             is_static_input_scheme=(self.is_static_input_scheme is True),
-            input_symmetric=(self.input_symmetric is True))
+            input_symmetric=(self.input_symmetric is True),
+        )
 
-        kernel_type = choose_scaled_mm_linear_kernel(
-            scaled_mm_linear_kernel_config)
+        kernel_type = choose_scaled_mm_linear_kernel(scaled_mm_linear_kernel_config)
 
         if kernel_type.__name__ not in self._kernel_backends_being_used:
             logger.info("Using %s for QuarkW8A8Int8", kernel_type.__name__)
             self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # WEIGHT
-        weight = ModelWeightParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition,
-            dtype=torch.int8),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
 
         layer.register_parameter("weight", weight)
 
         # WEIGHT SCALE
         if self.qscheme == "per_channel":
             weight_scale = ChannelQuantScaleParameter(
-                data=torch.empty((sum(output_partition_sizes)),
-                                 dtype=torch.float32),
+                data=torch.empty((sum(output_partition_sizes)), dtype=torch.float32),
                 output_dim=0,
-                weight_loader=weight_loader)
+                weight_loader=weight_loader,
+            )
             ChannelQuantZPParameter = ChannelQuantScaleParameter
             weight_zero_point = ChannelQuantZPParameter(
-                data=torch.empty((sum(output_partition_sizes)),
-                                 dtype=torch.int8),
+                data=torch.empty((sum(output_partition_sizes)), dtype=torch.int8),
                 output_dim=0,
-                weight_loader=weight_loader)
+                weight_loader=weight_loader,
+            )
         else:
             assert self.qscheme == "per_tensor"
-            weight_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                   weight_loader=weight_loader)
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
             PerTensorZPParameter = PerTensorScaleParameter
             weight_zero_point = PerTensorZPParameter(
-                data=torch.empty(len(output_partition_sizes),
-                                 dtype=torch.int8),
-                weight_loader=weight_loader)
+                data=torch.empty(len(output_partition_sizes), dtype=torch.int8),
+                weight_loader=weight_loader,
+            )
         layer.register_parameter("weight_scale", weight_scale)
         layer.register_parameter("weight_zero_point", weight_zero_point)
 
         # INPUT SCALE
         if self.is_static_input_scheme:
-            input_scale = BasevLLMParameter(data=torch.empty(
-                1, dtype=torch.float32),
-                                            weight_loader=weight_loader)
+            input_scale = BasevLLMParameter(
+                data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader
+            )
             layer.register_parameter("input_scale", input_scale)
 
-            input_zero_point = BasevLLMParameter(data=torch.empty(
-                1, dtype=torch.int8),
-                                                 weight_loader=weight_loader)
+            input_zero_point = BasevLLMParameter(
+                data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader
+            )
             layer.register_parameter("input_zero_point", input_zero_point)
 
-        self.kernel = kernel_type(c=scaled_mm_linear_kernel_config,
-                                  w_q_param_name="weight",
-                                  w_s_param_name="weight_scale",
-                                  i_s_param_name="input_scale",
-                                  i_zp_param_name="input_zero_point",
-                                  azp_adj_param_name="azp_adj")
+        self.kernel = kernel_type(
+            c=scaled_mm_linear_kernel_config,
+            w_q_param_name="weight",
+            w_s_param_name="weight_scale",
+            i_s_param_name="input_scale",
+            i_zp_param_name="input_zero_point",
+            azp_adj_param_name="azp_adj",
+        )
 
     # Checkpoints are serialized in quark format, which is
     # different from the format the kernel may want. Handle repacking here.
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.register_parameter("weight_zero_point", None)
-        delattr(layer, 'weight_zero_point')
+        delattr(layer, "weight_zero_point")
         if self.input_symmetric:
             layer.register_parameter("input_zero_point", None)
-            delattr(layer, 'input_zero_point')
+            delattr(layer, "input_zero_point")
 
         self.kernel.process_weights_after_loading(layer)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
-                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
+    ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index 99f5ec15933abc95382ebd8243e85059f64beb1b..dc82f94ebbbfb86b9d8d019bf50e3966a881d883 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
-from typing import Any, Optional
+from typing import Any
 
 import regex as re
 
@@ -22,9 +22,9 @@ def deep_compare(dict1: Any, dict2: Any) -> bool:
 
 
 def should_ignore_layer(
-    layer_name: Optional[str],
+    layer_name: str | None,
     ignore: Iterable[str],
-    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
 ) -> bool:
     if layer_name is None:
         return False
@@ -50,7 +50,8 @@ def should_ignore_layer(
         should_ignore_layer = None
         for shard_name in shard_names:
             should_ignore_shard = check_equal_or_regex_match(
-                layer_name=shard_name, targets=ignore)
+                layer_name=shard_name, targets=ignore
+            )
 
             # If shard_idx=0, set layer ignore to match shard.
             if should_ignore_layer is None:
@@ -58,35 +59,34 @@ def should_ignore_layer(
 
             # If shard_idx=1+ confirm scheme matches prior shards.
             elif should_ignore_shard != should_ignore_layer:
-                raise ValueError(f"Found a different quantization schemes for "
-                                 f"{shard_proj_names} in {layer_name}. vLLM "
-                                 "requires all to use the same scheme.")
+                raise ValueError(
+                    f"Found a different quantization schemes for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
 
     # Unfused layers like down_proj and o_proj will match
     # the safetensors checkpoint already.
     else:
-        should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name,
-                                                         targets=ignore)
+        should_ignore_layer = check_equal_or_regex_match(
+            layer_name=layer_name, targets=ignore
+        )
 
     assert should_ignore_layer is not None
     return should_ignore_layer
 
 
-def check_equal_or_regex_match(layer_name: str,
-                               targets: Iterable[str]) -> bool:
+def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
     """
-    Checks whether a layer_name is exactly equal or a regex match for 
+    Checks whether a layer_name is exactly equal or a regex match for
     if target starts with 're:' to any target in list.
     """
-    for target in targets:
-        if _is_equal_or_regex_match(layer_name, target):
-            return True
-    return False
+    return any(_is_equal_or_regex_match(layer_name, target) for target in targets)
 
 
-def _is_equal_or_regex_match(value: str,
-                             target: str,
-                             check_contains: bool = False) -> bool:
+def _is_equal_or_regex_match(
+    value: str, target: str, check_contains: bool = False
+) -> bool:
     """
     Checks whether a value is exactly equal or a regex match for target
     if target starts with 're:'. If check_contains is set to True,
diff --git a/vllm/model_executor/layers/quantization/qutlass_utils.py b/vllm/model_executor/layers/quantization/qutlass_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..555bb50da199ebcf5a587b8120d6f07afc216911
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/qutlass_utils.py
@@ -0,0 +1,185 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Modified by Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+#
+# Copied from https://github.com/pytorch/ao/tree/main/torchao/prototype/mx_formats
+#
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Literal
+
+import torch
+from torch.library import wrap_triton
+
+from vllm.triton_utils import tl, triton
+
+
+@triton.jit
+def triton_scale_swizzle(
+    scale_ptr: torch.Tensor,
+    scale_rows: int,
+    scale_cols: int,
+    output_ptr: torch.Tensor,
+    input_row_stride: int,
+    output_block_stride: int,
+    BLOCK_ROWS: tl.constexpr,
+    BLOCK_COLS: tl.constexpr,
+):
+    """
+    Rearranges tensor data from row-major to block-scaled swizzle format.
+
+    Args:
+        scale_ptr: Pointer to the input scale tensor
+        scale_rows: Number of rows in the scale tensor
+        scale_cols: Number of columns in the scale tensor
+        output_ptr: Pointer to the output tensor
+        input_row_stride: Stride between rows in the input tensor
+        output_block_stride: Stride between blocks in the output tensor
+        BLOCK_ROWS: Number of rows in a tile (compile-time constant)
+        BLOCK_COLS: Number of columns in a tile (compile-time constant)
+    """
+    pid_row = tl.program_id(0)
+    pid_col = tl.program_id(1)
+
+    rows = tl.arange(0, BLOCK_ROWS)[:, None]
+    cols = tl.arange(0, BLOCK_COLS)[None, :]
+
+    # Calculate starting row and column for this tile
+    start_row = pid_row * BLOCK_ROWS
+    start_col = pid_col * BLOCK_COLS
+    global_rows = start_row + rows
+    global_cols = start_col + cols
+
+    mask = (global_rows < scale_rows) & (global_cols < scale_cols)
+
+    input_scales = tl.load(
+        scale_ptr + global_rows * input_row_stride + global_cols,
+        mask=mask,
+        other=0.0,
+    )
+
+    r_div_32 = rows // 32
+    r_mod_32 = rows % 32
+
+    # 2) Rearrange to (32, 4, 4) then to final (32, 16) coordinates
+    dest_indices = r_mod_32 * 16 + r_div_32 * 4 + cols
+
+    # Flatten
+    dest_indices_flat = tl.reshape(dest_indices, (BLOCK_ROWS * BLOCK_COLS))
+    scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS))
+
+    # Calculate block offset using provided output block stride
+    LOCAL_NUMEL = BLOCK_ROWS * BLOCK_COLS
+    block_offset = pid_col * LOCAL_NUMEL + (pid_row * output_block_stride)
+
+    tl.store(
+        output_ptr + block_offset + dest_indices_flat,
+        scales_flat,
+    )
+
+
+def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Rearranges an E8M0 tensor scale from row-major format to
+    block-scaled swizzle format.
+
+    This format is suitable for Tmem as described in NVIDIA documentation:
+    https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
+
+    Args:
+        scale_tensor: Input tensor in row-major format with 8-bit elements
+
+    Returns:
+        Rearranged tensor in block-scaled swizzle format
+    """
+    assert scale_tensor.element_size() == 1, (
+        "Expected element size to be 1 byte (8 bits)"
+    )
+    assert scale_tensor.is_contiguous(), "Input tensor must be contiguous"
+
+    rows, cols = scale_tensor.shape
+
+    # Calculate blocks needed
+    n_row_blocks = triton.cdiv(rows, 128)
+    n_col_blocks = triton.cdiv(cols, 4)
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+
+    out = scale_tensor.new_empty((padded_rows, padded_cols))
+
+    # Input stride (for row-major format)
+    input_row_stride = cols
+
+    # We probably want handle multiple blocks per tile but
+    # for now keep it simple
+    BLOCK_ROWS, BLOCK_COLS = 128, 4
+
+    # Output block stride for the rearranged format
+    output_block_stride = BLOCK_ROWS * BLOCK_COLS * (padded_cols // BLOCK_COLS)
+
+    grid = lambda META: (
+        triton.cdiv(padded_rows, BLOCK_ROWS),
+        triton.cdiv(padded_cols, BLOCK_COLS),
+    )
+
+    wrap_triton(triton_scale_swizzle)[grid](
+        scale_tensor.view(torch.uint8),
+        rows,
+        cols,
+        out.view(torch.uint8),
+        input_row_stride,
+        output_block_stride,
+        BLOCK_ROWS=BLOCK_ROWS,
+        BLOCK_COLS=BLOCK_COLS,
+    )
+
+    return out
+
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+def to_blocked(
+    input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
+) -> torch.Tensor:
+    """
+    Rearrange a large matrix by breaking it into blocks and applying
+    the rearrangement pattern.
+
+    See:
+        https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
+
+    Args:
+        input_matrix: Input tensor of shape (H, W)
+        backend: "torch" (PyTorch path) or "triton" (Triton kernel)
+
+    Returns:
+        Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
+    """
+    if backend == "triton":
+        return triton_mx_block_rearrange(input_matrix).flatten()
+    elif backend != "torch":
+        raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
+
+    rows, cols = input_matrix.shape
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+
+    # Calculate the padded shape
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+
+    padded = input_matrix
+    assert (rows, cols) == (padded_rows, padded_cols)
+
+    # Rearrange the blocks
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index 015dc136bb82c57a9490efa52239aaa81e9c9a20..52656263a601b546df7a17c6e5a13d9dcee823e5 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -3,43 +3,58 @@
 # Copyright © 2025, Oracle and/or its affiliates.
 
 import os
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
+import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
-                                                  FusedMoEMethodBase)
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig, int4_w4a16_moe_quant_config,
-    int8_w8a16_moe_quant_config)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               set_weight_attrs)
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    set_weight_attrs,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    apply_rtn_marlin_linear,
+    marlin_make_workspace_new,
+)
+from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
 """By default, use 8 bit as target precision, but it can be 
 overridden by setting the RTN_NUM_BITS envvar
 """
-NUM_BITS = os.getenv('RTN_NUM_BITS', "8")
+NUM_BITS = os.getenv("RTN_NUM_BITS", "8")
 """By default, use group size of 128 parameters, but it can be 
 overridden by setting the RTN_GROUP_SIZE envvar
 """
-GROUP_SIZE = os.getenv('RTN_GROUP_SIZE', "128")
+GROUP_SIZE = os.getenv("RTN_GROUP_SIZE", "128")
+"""Global Marlin workspace shared by all modules
+"""
+workspace = None
 
 
 class RTNConfig(QuantizationConfig):
-    """Config class for RTN.
-    """
+    """Config class for RTN."""
 
     def __init__(
-            self,
-            weight_bits: int = int(NUM_BITS),
-            group_size: int = int(GROUP_SIZE),
+        self,
+        weight_bits: int = int(NUM_BITS),
+        group_size: int = int(GROUP_SIZE),
     ) -> None:
         self.weight_bits = weight_bits
         self.group_size = group_size
@@ -47,11 +62,17 @@ class RTNConfig(QuantizationConfig):
         if self.weight_bits != 4 and self.weight_bits != 8:
             raise ValueError(
                 "Currently, only 4-bit or 8-bit weight quantization is "
-                f"supported for RTN, but got {self.weight_bits} bits.")
+                f"supported for RTN, but got {self.weight_bits} bits."
+            )
+
+        self.quant_type = (
+            scalar_types.uint8b128 if self.weight_bits == 8 else scalar_types.uint4b8
+        )
 
     def __repr__(self) -> str:
-        return (f"RTNConfig(weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size})")
+        return (
+            f"RTNConfig(weight_bits={self.weight_bits}, group_size={self.group_size})"
+        )
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -75,8 +96,9 @@ class RTNConfig(QuantizationConfig):
         group_size = cls.get_from_keys(config, ["group_size"])
         return cls(weight_bits, group_size)
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             return RTNLinearMethod(self)
         elif isinstance(layer, FusedMoE):
@@ -89,8 +111,9 @@ class RTNTensor:
     overloading the copy_ method.
     """
 
-    def __init__(self, data: torch.Tensor, scale: torch.Tensor,
-                 quant_config: RTNConfig) -> None:
+    def __init__(
+        self, data: torch.Tensor, scale: torch.Tensor, quant_config: RTNConfig
+    ) -> None:
         self.data = data
         self.scale = scale
         self.quant_config = quant_config
@@ -99,7 +122,9 @@ class RTNTensor:
         factor = 1 if self.quant_config.weight_bits == 8 else 2
         return RTNTensor(
             self.data.narrow(dim, start // factor, length // factor),
-            self.scale.narrow(dim, start, length), self.quant_config)
+            self.scale.narrow(dim, start, length),
+            self.quant_config,
+        )
 
     def __getitem__(self, key):
         return RTNTensor(self.data[key], self.scale[key], self.quant_config)
@@ -115,9 +140,11 @@ class RTNTensor:
             return torch.Size((shape[0] * factor, shape[1]))
 
     def copy_(self, loaded_weight: torch.Tensor) -> None:
-        qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
-                                             self.quant_config.weight_bits,
-                                             self.quant_config.group_size)
+        qweight, weight_scale = rtn_quantize(
+            loaded_weight.cuda(),
+            self.quant_config.weight_bits,
+            self.quant_config.group_size,
+        )
 
         self.data.copy_(qweight)
         self.scale.data.copy_(weight_scale)
@@ -133,8 +160,9 @@ class RTNParameter(Parameter):
     def __new__(cls, data: torch.Tensor, **kwargs):
         return super().__new__(cls, data=data, requires_grad=False)
 
-    def __init__(self, data: torch.Tensor, scale: torch.Tensor,
-                 quant_config: RTNConfig) -> None:
+    def __init__(
+        self, data: torch.Tensor, scale: torch.Tensor, quant_config: RTNConfig
+    ) -> None:
         self.scale = scale
         self.quant_config = quant_config
 
@@ -164,128 +192,167 @@ class RTNLinearMethod(LinearMethodBase):
         **extra_weight_attrs,
     ):
         output_size_per_partition = sum(output_partition_sizes)
-        num_groups_per_col = (input_size_per_partition //
-                              self.quant_config.group_size
-                              if self.quant_config.group_size != -1 else 1)
+        num_groups_per_col = (
+            input_size_per_partition // self.quant_config.group_size
+            if self.quant_config.group_size != -1
+            else 1
+        )
 
         scale = Parameter(
-            torch.empty(output_size_per_partition,
-                        num_groups_per_col,
-                        dtype=params_dtype),
+            torch.empty(
+                output_size_per_partition, num_groups_per_col, dtype=params_dtype
+            ),
             requires_grad=False,
         )
         factor = 1 if self.quant_config.weight_bits == 8 else 2
 
-        weight = RTNParameter(data=torch.empty(output_size_per_partition //
-                                               factor,
-                                               input_size_per_partition,
-                                               dtype=torch.uint8),
-                              scale=scale,
-                              quant_config=self.quant_config)
+        weight = RTNParameter(
+            data=torch.empty(
+                output_size_per_partition // factor,
+                input_size_per_partition,
+                dtype=torch.uint8,
+            ),
+            scale=scale,
+            quant_config=self.quant_config,
+        )
 
         layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {
-            **extra_weight_attrs,
-            "input_dim": 1,
-            "output_dim": 0,
-        })
+        set_weight_attrs(
+            weight,
+            {
+                **extra_weight_attrs,
+                "input_dim": 1,
+                "output_dim": 0,
+            },
+        )
 
         layer.register_parameter("scale", scale)
         layer.output_size_per_partition = output_size_per_partition
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        fix_weights(layer, "weight")
+        """Repack weights and scales for Marlin kernels."""
+        weight_bits = self.quant_config.weight_bits
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qweight = layer.weight
-        scale = layer.scale
+        weight, scale = repack_weights(layer.weight, layer.scale, weight_bits)
 
-        weight = rtn_dequantize(qweight, scale)
-        out = F.linear(x, weight)
-        del weight
-        if bias is not None:
-            out.add_(bias)
+        replace_parameter(layer, "weight", weight)
+        replace_parameter(layer, "scale", scale)
 
-        return out
+        init_workspace(layer.weight.device)
 
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return apply_rtn_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.scale,
+            workspace=workspace,
+            quant_type=self.quant_config.quant_type,
+            output_size_per_partition=layer.output_size_per_partition,
+            input_size_per_partition=layer.input_size_per_partition,
+            bias=bias,
+        )
 
-class RTNMoEMethod(FusedMoEMethodBase):
 
+class RTNMoEMethod(FusedMoEMethodBase):
     def __init__(self, quant_config: RTNConfig, moe: FusedMoEConfig):
         super().__init__(moe)
         self.quant_config = quant_config
 
-    def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size_per_partition: int,
-                       params_dtype: torch.dtype, **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         factor = 1 if self.quant_config.weight_bits == 8 else 2
 
         # Fused gate_up_proj (column parallel)
-        num_groups_per_col = (hidden_size // self.quant_config.group_size
-                              if self.quant_config.group_size != -1 else 1)
+        num_groups_per_col = (
+            hidden_size // self.quant_config.group_size
+            if self.quant_config.group_size != -1
+            else 1
+        )
         w13_scale = Parameter(
-            torch.empty(num_experts,
-                        2 * intermediate_size_per_partition,
-                        num_groups_per_col,
-                        dtype=params_dtype),
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                num_groups_per_col,
+                dtype=params_dtype,
+            ),
             requires_grad=False,
         )
         layer.register_parameter("w13_scale", w13_scale)
 
-        w13_weight = RTNParameter(data=torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition // factor,
-            hidden_size,
-            dtype=torch.uint8),
-                                  scale=w13_scale,
-                                  quant_config=self.quant_config)
+        w13_weight = RTNParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition // factor,
+                hidden_size,
+                dtype=torch.uint8,
+            ),
+            scale=w13_scale,
+            quant_config=self.quant_config,
+        )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
         # down_proj (row parallel)
-        num_groups_per_col = (intermediate_size_per_partition //
-                              self.quant_config.group_size
-                              if self.quant_config.group_size != -1 else 1)
-        w2_scale = Parameter(torch.zeros(num_experts,
-                                         hidden_size,
-                                         num_groups_per_col,
-                                         dtype=params_dtype),
-                             requires_grad=False)
+        num_groups_per_col = (
+            intermediate_size_per_partition // self.quant_config.group_size
+            if self.quant_config.group_size != -1
+            else 1
+        )
+        w2_scale = Parameter(
+            torch.zeros(
+                num_experts, hidden_size, num_groups_per_col, dtype=params_dtype
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_scale", w2_scale)
 
-        w2_weight = RTNParameter(data=torch.empty(
-            num_experts,
-            hidden_size // factor,
-            intermediate_size_per_partition,
-            dtype=torch.uint8),
-                                 scale=w2_scale,
-                                 quant_config=self.quant_config)
+        w2_weight = RTNParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size // factor,
+                intermediate_size_per_partition,
+                dtype=torch.uint8,
+            ),
+            scale=w2_scale,
+            quant_config=self.quant_config,
+        )
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Repack weights and scales for Marlin kernels."""
         weight_bits = self.quant_config.weight_bits
-        fix_weights(layer, "w13_weight", weight_bits == 4)
-        fix_weights(layer, "w2_weight", weight_bits == 4)
 
-    def get_fused_moe_quant_config(
-            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
-        weight_bits = self.quant_config.weight_bits
-        group_size = self.quant_config.group_size
-        assert weight_bits == 4 or weight_bits == 8
-        config_builder = (int4_w4a16_moe_quant_config
-                          if weight_bits == 4 else int8_w8a16_moe_quant_config)
-        return config_builder(
-            w1_scale=layer.w13_scale,
-            w2_scale=layer.w2_scale,
-            w1_zp=None,
-            w2_zp=None,
-            block_shape=[0, group_size],
+        w13_weight, w13_scale = repack_weights(
+            layer.w13_weight, layer.w13_scale, weight_bits
+        )
+        replace_parameter(layer, "w13_weight", w13_weight)
+        replace_parameter(layer, "w13_scale", w13_scale)
+
+        w2_weight, w2_scale = repack_weights(
+            layer.w2_weight, layer.w2_scale, weight_bits
         )
+        replace_parameter(layer, "w2_weight", w2_weight)
+        replace_parameter(layer, "w2_scale", w2_scale)
+
+        init_workspace(layer.w13_weight.device)
+
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return None
 
     def apply(
         self,
@@ -295,28 +362,23 @@ class RTNMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert self.fused_experts is None
-
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `RTNMoEMethod` yet.")
-
-        from vllm.model_executor.layers.fused_moe import fused_experts
+            raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.")
 
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -330,32 +392,38 @@ class RTNMoEMethod(FusedMoEMethodBase):
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+        )
 
-        return fused_experts(
+        return fused_marlin_moe(
             x,
             layer.w13_weight,
             layer.w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            activation=activation,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
+            layer.w13_scale,
+            layer.w2_scale,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            quant_type_id=self.quant_config.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
             expert_map=expert_map,
-            quant_config=self.moe_quant_config,
+            workspace=workspace,
         )
 
 
-def rtn_quantize(tensor: torch.Tensor, num_bits: int,
-                 group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
+def rtn_quantize(
+    tensor: torch.Tensor, num_bits: int, group_size: int
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize a tensor using per-group static scaling factor.
 
     Args:
         tensor: The input tensor.
         num_bits: Target precision for the result (supported values are
                   8 or 4).
-        group_size: Quantization granularity. 
+        group_size: Quantization granularity.
                     If equal to -1, each row in the input tensor is treated
                     as one group.
     """
@@ -364,15 +432,18 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
         tensor = tensor.unsqueeze(0)
 
     q_range = 2**num_bits
-    num_groups = (tensor.shape[1] * tensor.shape[2] //
-                  group_size if group_size != -1 else tensor.shape[1])
+    num_groups = (
+        tensor.shape[1] * tensor.shape[2] // group_size
+        if group_size != -1
+        else tensor.shape[1]
+    )
     """Calculate a scaling factor per input group.
     """
     input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
     input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
     input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
     input_max_abs = torch.max(input_min.abs(), input_max.abs())
-    scale = (input_max_abs * 2.0 / (q_range - 1))
+    scale = input_max_abs * 2.0 / (q_range - 1)
     """Scale each input group, round to the nearest integer, shift 
     the range and truncate.
     """
@@ -388,9 +459,10 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
     if num_bits == 4:
         """Pack two 4-bit values into each byte.
         """
-        inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
-        inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
-                                    tensor.shape[2])
+        inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xF)
+        inputs_q = inputs_q.reshape(
+            tensor.shape[0], tensor.shape[1] // 2, tensor.shape[2]
+        )
         inputs_q = inputs_q.contiguous()
 
     if not batch_present:
@@ -420,9 +492,9 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     if num_bits == 4:
         input_dim *= 2
 
-    data = torch.empty((batch, input_dim, output_dim),
-                       dtype=scale.dtype,
-                       device=tensor.device)
+    data = torch.empty(
+        (batch, input_dim, output_dim), dtype=scale.dtype, device=tensor.device
+    )
 
     if num_bits == 8:
         data.copy_(tensor)
@@ -432,8 +504,9 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
         """
         tensor = tensor.reshape(batch, input_dim, output_dim // 2)
         for i in range(2):
-            data[:, :, i::2] = ((tensor << 4 *
-                                 (1 - i)) >> 4).to(torch.int8) - q_range // 2
+            data[:, :, i::2] = ((tensor << 4 * (1 - i)) >> 4).to(
+                torch.int8
+            ) - q_range // 2
     """Scale each input group with its scaling factor.
     """
     scale = scale.reshape(batch, num_groups, -1)
@@ -447,20 +520,133 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     return input_deq
 
 
-def fix_weights(layer: torch.nn.Module,
-                param_name: str,
-                reshape: bool = False):
-    """torch.compile does not know how to deal with a Parameter subclass
-    (aka RTNParameter). As we don't really need RTNParameters for the
-    forward pass, we replace them with equivalent instances of Parameters.
+def _get_perms():
+    perm = []
+    for i in range(32):
+        perm1 = []
+        col = i // 4
+        for block in [0, 1]:
+            for row in [
+                2 * (i % 4),
+                2 * (i % 4) + 1,
+                2 * (i % 4 + 4),
+                2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col + 8 * block)
+        for j in range(4):
+            perm.extend([p + 256 * j for p in perm1])
+
+    perm_arr = np.array(perm)
+    interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
+    perm_arr = perm_arr.reshape((-1, 8))[:, interleave].ravel()
+    perm_tensor = torch.from_numpy(perm_arr)
+    scale_perm = []
+    for i in range(8):
+        scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single = []
+    for i in range(4):
+        scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+    return perm_tensor, scale_perm, scale_perm_single
+
+
+_perm, _scale_perm, _scale_perm_single = _get_perms()
+
+
+def pack_for_marlin(weight, scale, qbits):
+    batch = weight.shape[0]
+
+    n = weight.size(1)
+    k = weight.size(2)
+    groupsize = k // scale.size(2)
+
+    tile = 16
+    s = scale.permute(0, 2, 1)  # transpose
+    w = weight.permute(0, 2, 1)  # transpose
+    if groupsize != k:
+        w = w.reshape((batch, -1, groupsize, n))
+        w = w.permute(0, 2, 1, 3)
+        w = w.reshape((batch, groupsize, -1))
+        s = s.reshape((batch, 1, -1))
+
+    if groupsize != k:
+        w = w.reshape((batch, groupsize, -1, n))
+        w = w.permute(0, 2, 1, 3)
+        w = w.reshape((batch, k, n)).contiguous()
+        s = s.reshape((batch, -1, len(_scale_perm)))[:, :, _scale_perm]
+    else:
+        s = s.reshape((batch, -1, len(_scale_perm_single)))[:, :, _scale_perm_single]
+    s = s.reshape((batch, -1, n)).contiguous()
+    w = w.reshape((batch, k // tile, tile, n // tile, tile))
+    w = w.permute((0, 1, 3, 2, 4))
+    w = w.reshape((batch, k // tile, n * tile))
+    res = w
+    res = res.reshape((batch, -1, _perm.numel()))[:, :, _perm].reshape(res.shape)
+    if qbits == 4:
+        q = torch.zeros(
+            (batch, res.shape[1], res.shape[2] // 2), dtype=torch.int8, device=w.device
+        )
+        for i in range(2):
+            q |= res[:, :, i::2] << 4 * i
+        q = q.reshape(batch, -1, n).contiguous()
+    else:
+        q = res.clone()
+        q[:, :, 2::8] = res[:, :, 4::8]
+        q[:, :, 3::8] = res[:, :, 5::8]
+        q[:, :, 4::8] = res[:, :, 2::8]
+        q[:, :, 5::8] = res[:, :, 3::8]
+        q = q.reshape(batch, -1, n).to(torch.int8).contiguous()
+
+    return q, s
+
+
+def repack_8bit_into_32bit(input):
+    output = torch.zeros(
+        (input.shape[0], input.shape[1], input.shape[2] // 4),
+        dtype=torch.int32,
+        device=input.device,
+    )
+    for i in range(4):
+        output |= (input[:, :, i::4] & 0xFF).to(torch.int32) << 8 * i
+
+    return output
+
+
+def repack_weights(qweight, scale, weight_bits):
+    batch_present = len(qweight.shape) == 3
+    if not batch_present:
+        qweight = qweight.unsqueeze(0)
+        scale = scale.unsqueeze(0)
+
+    if weight_bits == 4:
+        """Unpack two 4-bit values from each byte.
+        """
+        qweight_unpacked = torch.empty(
+            (qweight.shape[0], qweight.shape[1] * 2, qweight.shape[2]),
+            dtype=torch.uint8,
+            device=qweight.device,
+        )
+        for i in range(2):
+            qweight_unpacked[:, :, i::2] = ((qweight << 4 * (1 - i)) >> 4).reshape(
+                qweight.shape[0], qweight.shape[1] * 2, qweight.shape[2] // 2
+            )
+    else:
+        qweight_unpacked = qweight
+
+    qweight_packed, scale_packed = pack_for_marlin(qweight_unpacked, scale, weight_bits)
+    """Marlin kernels expect tensors in int32 format in a certain shape
     """
-    old_weight = getattr(layer, param_name)
-    assert isinstance(old_weight, RTNParameter)
-    data = old_weight.data.data
+    qweight_repacked = repack_8bit_into_32bit(qweight_packed.to(torch.uint8))
+    qweight_reshaped = qweight_repacked.reshape(
+        qweight.shape[0], qweight.shape[2] // 16, -1
+    )
+    if not batch_present:
+        qweight_reshaped = qweight_reshaped.squeeze(0)
+        scale_packed = scale_packed.squeeze(0)
+
+    return qweight_reshaped, scale_packed
 
-    delattr(layer, param_name)
 
-    if reshape:
-        data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1)
-    new_weight = Parameter(data=data, requires_grad=False)
-    layer.register_parameter(param_name, new_weight)
+def init_workspace(device):
+    global workspace
+    if workspace is None:
+        workspace = marlin_make_workspace_new(device, 4)
diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py
index a108152929d9a7fd630db657b82a4c545c1264fa..669bd9d6ed83768c5b5e0c79133dfbf4f05238d5 100644
--- a/vllm/model_executor/layers/quantization/schema.py
+++ b/vllm/model_executor/layers/quantization/schema.py
@@ -13,8 +13,6 @@ possible on ROCm), the model can be optionally augmented with KV cache
 scaling factors.
 """
 
-from typing import Optional
-
 from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
 
 
@@ -30,7 +28,8 @@ class KVCacheQuantSchema(BaseModel):
     def check_is_fp8(self) -> "KVCacheQuantSchema":
         assert self.dtype == "float8_e4m3fn", (
             "Loaded scaling factors intended for KV cache dtype = "
-            f"{self.dtype} rather than float8_e4m3fn!")
+            f"{self.dtype} rather than float8_e4m3fn!"
+        )
         return self
 
     @model_validator(mode="after")
@@ -41,15 +40,18 @@ class KVCacheQuantSchema(BaseModel):
             num_hidden_layers = context["num_hidden_layers"]
             assert len(self.scaling_factor) == tp_size, (
                 f"Loaded dictionary has TP size {len(self.scaling_factor)} "
-                f"but LLM engine is currently running with TP size {tp_size}.")
+                f"but LLM engine is currently running with TP size {tp_size}."
+            )
             for tp_rank, layer_maps in self.scaling_factor.items():
                 assert len(layer_maps) == num_hidden_layers, (
                     f"KV cache scales map for TP rank {tp_rank} is malformed. "
                     f"Expected {num_hidden_layers} layers, got "
-                    f"{len(layer_maps)}.")
+                    f"{len(layer_maps)}."
+                )
             for i in range(tp_size):
                 assert i in self.scaling_factor, (
-                    f"KV cache scales map for TP rank {i} not found.")
+                    f"KV cache scales map for TP rank {i} not found."
+                )
         return self
 
     @model_validator(mode="after")
@@ -62,7 +64,8 @@ class KVCacheQuantSchema(BaseModel):
             for i in range(num_hidden_layers):
                 assert i in layer_scales_map, (
                     f"Could not find KV cache scales for layer {i} in "
-                    f"TP rank {tp_rank}.")
+                    f"TP rank {tp_rank}."
+                )
         return self
 
 
@@ -70,7 +73,7 @@ class QuantParamSchema(BaseModel):
     # TODO: Generalize and extend with more fields
     # (e.g. weights/activations params) once functionality is enabled
     model_config = ConfigDict(protected_namespaces=())
-    model_type: Optional[str]
+    model_type: str | None
     kv_cache: KVCacheQuantSchema
 
     @model_validator(mode="after")
@@ -82,5 +85,6 @@ class QuantParamSchema(BaseModel):
                 assert model_type == self.model_type, (
                     f"Model type is {model_type} but loaded "
                     f"scaling factors belonging to different "
-                    f"model type {self.model_type}!")
+                    f"model type {self.model_type}!"
+                )
         return self
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 2efb605f203fd6a4ac9fa004d3decb3ccdaba111..3fee71e193db59b303b97a81c02bddc3dd2851df 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -1,22 +1,78 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
+import json
+import types
+from importlib.util import find_spec
 from typing import Any, Optional
 
+import regex as re
 import torch
 import torch.nn.functional as F
+from packaging import version
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.utils import set_weight_attrs
 
 logger = init_logger(__name__)
 
 
+def _bond_method_to_cls(func, obj):
+    if hasattr(func, "__self__") or not callable(func):
+        # If the function is already bound to an instance, return it as is
+        return func
+    else:
+        return types.MethodType(func, obj)
+
+
+def _get_weight_attrs(param):
+    # record attributes attached to the weight, so we can
+    # recover later
+    recorded_weight_attr = {}
+    for key in param.__dict__:
+        if hasattr(param, key):
+            attr = getattr(param, key)
+            if not callable(attr):
+                recorded_weight_attr[key] = attr
+            elif hasattr(attr, "__self__") and param is attr.__self__:
+                # if attr is a bonded method for an instance, and
+                # attr.__self__ points to the instance (param)
+                # we'll record the underlying function object
+                recorded_weight_attr[key] = attr.__func__
+            else:
+                recorded_weight_attr[key] = attr
+    return recorded_weight_attr
+
+
+def _restore_weight_attrs(param, recorded_weight_attr):
+    for attr_name, attr in recorded_weight_attr.items():
+        if not hasattr(param, attr_name):
+            setattr(param, attr_name, _bond_method_to_cls(attr, param))
+
+
+def torchao_version_at_least(torchao_version: str) -> bool:
+    if find_spec("torchao"):
+        try:
+            if version.parse(importlib.metadata.version("torchao")) >= version.parse(
+                torchao_version
+            ):
+                return True
+        except (ImportError, version.InvalidVersion):
+            return False
+    return False
+
+
 def should_skip(prefix: str, skip_modules: list[str]) -> bool:
     """
     Robust skipping logic:
@@ -35,12 +91,23 @@ def should_skip(prefix: str, skip_modules: list[str]) -> bool:
     return False
 
 
+if torchao_version_at_least("0.15.0"):
+    from torchao.prototype.tensor_conversion.api import (
+        convert_to_packed_tensor_based_on_current_hardware,
+    )
+else:
+    convert_to_packed_tensor_based_on_current_hardware = lambda t: t
+
+
 class TorchAOConfig(QuantizationConfig):
     """Config class for torchao."""
 
-    def __init__(self,
-                 torchao_config,
-                 skip_modules: Optional[list[str]] = None) -> None:
+    def __init__(
+        self,
+        torchao_config,
+        skip_modules: list[str] | None = None,
+        is_checkpoint_torchao_serialized: bool = False,
+    ) -> None:
         """
         # TorchAO quantization relies on tensor subclasses. In order,
         # to enable proper caching this needs standalone compile
@@ -58,9 +125,13 @@ class TorchAOConfig(QuantizationConfig):
         super().__init__()
         self.torchao_config = torchao_config
         self.skip_modules = skip_modules or []
+        self.is_checkpoint_torchao_serialized = is_checkpoint_torchao_serialized
 
     def __repr__(self) -> str:
-        return f"TorchAOConfig({self.torchao_config})"
+        return (
+            f"TorchAOConfig({self.torchao_config=}, {self.skip_modules=}, "
+            f"{self.is_checkpoint_torchao_serialized=})"
+        )
 
     def get_name(self) -> QuantizationMethods:
         return "torchao"
@@ -74,7 +145,10 @@ class TorchAOConfig(QuantizationConfig):
 
     @staticmethod
     def get_config_filenames() -> list[str]:
-        return ["config.json"]
+        """torchao doesn't require additional config files, we use
+        `config.json` from huggingface: `model_config.hf_config`
+        """
+        return []
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "TorchAOConfig":
@@ -87,10 +161,16 @@ class TorchAOConfig(QuantizationConfig):
                 "`pip install torchao>=0.10.0` to use torchao quantization."
             ) from err
 
+        quant_method = cls.get_from_keys_or(config, ["quant_method"], None)
+        is_checkpoint_torchao_serialized = (
+            quant_method is not None and "torchao" in quant_method
+        )
+
         hf_config = cls.get_from_keys_or(config, ["quant_type"], None)
         assert hf_config is not None, "quant_type must be specified"
         assert len(hf_config) == 1 and "default" in hf_config, (
-            "Expected only one key 'default' in quant_type dictionary")
+            "Expected only one key 'default' in quant_type dictionary"
+        )
         quant_type = hf_config["default"]
         ao_config = config_from_dict(quant_type)
 
@@ -110,10 +190,40 @@ class TorchAOConfig(QuantizationConfig):
             if layer_cfg is None:
                 skip_modules.append(layer)
 
-        return cls(ao_config, skip_modules)
+        return cls(ao_config, skip_modules, is_checkpoint_torchao_serialized)
+
+    @classmethod
+    def from_config_file(cls, config_file: str) -> "TorchAOConfig":
+        """Initialize class from a config file. Example:
+        ```
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        fn = "torchao_config.json"
+
+        with open(fn, "w") as f:
+            f.write(json.dumps(config_to_dict(config)))
+        ```
+        """
+        with open(config_file) as f:
+            f.seek(0)
+            f_read = f.read()
+            config_dict = json.loads(f_read)
+
+        hf_config = {"quant_type": {"default": config_dict}}
+        return cls.from_config(hf_config)
+
+    @classmethod
+    def from_config_dict_json(cls, config_dict_json: str) -> "TorchAOConfig":
+        """Iniitalize class from a config_dict json string, got from
+        torchao_config_object = some AOBaseConfig object
+        json.dumps(config_to_dict(torchao_config_object))
+        """
+        config_dict = json.loads(config_dict_json)
+        hf_config = {"quant_type": {"default": config_dict}}
+        return cls.from_config(hf_config)
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
         if not isinstance(layer, LinearBase):
             return None
 
@@ -125,10 +235,30 @@ class TorchAOConfig(QuantizationConfig):
         module_fqn = prefix
         if isinstance(self.torchao_config, ModuleFqnToConfig):
             module_fqn_to_config = self.torchao_config.module_fqn_to_config
-            c = module_fqn_to_config.get(
-                module_fqn) or module_fqn_to_config.get("_default", None)
+            c = None
+            if module_fqn in module_fqn_to_config:
+                assert not module_fqn.startswith("re:"), (
+                    "module fqn should not start with"
+                    "`re:`, which is used for specifying regex"
+                )
+                c = module_fqn_to_config[module_fqn]
+            else:
+                for maybe_module_fqn_pattern in module_fqn_to_config:
+                    if not maybe_module_fqn_pattern.startswith("re:"):
+                        continue
+                    elif re.fullmatch(maybe_module_fqn_pattern[3:], module_fqn):
+                        # we'll apply the config for first fully matched pattern
+                        c = module_fqn_to_config[maybe_module_fqn_pattern]
+                        break
+                else:
+                    # fallback to use default if no module specific
+                    # config is provided
+                    c = module_fqn_to_config.get("_default", None)
+
             if c is not None:
-                current_torchao_config = TorchAOConfig(c, self.skip_modules)
+                current_torchao_config = TorchAOConfig(
+                    c, self.skip_modules, self.is_checkpoint_torchao_serialized
+                )
                 return TorchAOLinearMethod(current_torchao_config)
             else:
                 return UnquantizedLinearMethod()
@@ -139,8 +269,9 @@ class TorchAOConfig(QuantizationConfig):
         return []
 
 
-def torchao_quantize_param_data(param: torch.Tensor,
-                                torchao_config: Any) -> torch.nn.Parameter:
+def torchao_quantize_param_data(
+    param: torch.Tensor, torchao_config: Any
+) -> torch.nn.Parameter:
     """Quantize a Tensor with torchao quantization specified by torchao_config
 
     Args:
@@ -161,7 +292,8 @@ def torchao_quantize_param_data(param: torch.Tensor,
         # while some of our configs need to do module swap, and only non-top
         # level modules support module swap
         dummy_linear = torch.nn.Sequential(
-            torch.nn.Linear(param.shape[1], param.shape[0], bias=False))
+            torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
+        )
 
     dummy_linear[0].weight = param
     quantize_(dummy_linear, torchao_config)
@@ -172,7 +304,7 @@ class TorchAOLinearMethod(LinearMethodBase):
     """Linear method for torchao.
 
     Args:
-        quant_config: The torchao quantization config, a string that encodes 
+        quant_config: The torchao quantization config, a string that encodes
             the type of quantization and all relevant arguments.
     """
 
@@ -197,8 +329,10 @@ class TorchAOLinearMethod(LinearMethodBase):
             ),
             requires_grad=False,
         )
-        weight = torchao_quantize_param_data(weight,
-                                             self.quant_config.torchao_config)
+        if self.quant_config.is_checkpoint_torchao_serialized:
+            weight = torchao_quantize_param_data(
+                weight, self.quant_config.torchao_config
+            )
 
         set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
 
@@ -209,6 +343,38 @@ class TorchAOLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return F.linear(x, layer.weight, bias)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if self.quant_config.is_checkpoint_torchao_serialized:
+            if not hasattr(layer, "weight"):
+                return
+
+            # record attributes attached to the weight, so we can
+            # recover later
+            recorded_weight_attr = _get_weight_attrs(layer.weight)
+
+            layer.weight = Parameter(
+                convert_to_packed_tensor_based_on_current_hardware(layer.weight),
+                requires_grad=layer.weight.requires_grad,
+            )
+
+            _restore_weight_attrs(layer.weight, recorded_weight_attr)
+            return
+
+        # online quantize the weight if the checkpoint is not already
+        # quantized by torchao
+        recorded_weight_attr = _get_weight_attrs(layer.weight)
+
+        weight = torchao_quantize_param_data(
+            layer.weight, self.quant_config.torchao_config
+        )
+        weight = torch.nn.Parameter(
+            convert_to_packed_tensor_based_on_current_hardware(weight),
+            weight.requires_grad,
+        )
+
+        _restore_weight_attrs(weight, recorded_weight_attr)
+        layer.register_parameter("weight", weight)
diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py
index 38de4b54fb191a54eb5ec8f5e80ddd3261bb0649..64bfa8fb80eb21c73dde5aa856da2472c1be97ed 100644
--- a/vllm/model_executor/layers/quantization/tpu_int8.py
+++ b/vllm/model_executor/layers/quantization/tpu_int8.py
@@ -8,9 +8,10 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from vllm.model_executor.parameter import ModelWeightParameter
 
 ACTIVATION_SCHEMES = ["none", "dynamic"]
@@ -25,8 +26,7 @@ class Int8TpuConfig(QuantizationConfig):
     ) -> None:
         super().__init__()
         if activation_scheme not in ACTIVATION_SCHEMES:
-            raise ValueError(
-                f"Unsupported activation scheme {activation_scheme}")
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
 
     def get_name(self) -> QuantizationMethods:
@@ -37,8 +37,7 @@ class Int8TpuConfig(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        raise NotImplementedError(
-            "This function should not be called with TPU Backend")
+        raise NotImplementedError("This function should not be called with TPU Backend")
 
     @staticmethod
     def get_config_filenames() -> list[str]:
@@ -49,50 +48,61 @@ class Int8TpuConfig(QuantizationConfig):
         activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
         return cls(activation_scheme=activation_scheme)
 
-    def get_quant_method(self, layer: Module,
-                         prefix: str) -> Optional["TPUInt8LinearMethod"]:
+    def get_quant_method(
+        self, layer: Module, prefix: str
+    ) -> Optional["TPUInt8LinearMethod"]:
         if isinstance(layer, LinearBase):
             return TPUInt8LinearMethod(self)
         return None
 
 
 class TPUInt8LinearMethod(LinearMethodBase):
-    """Int8 Linear method for TPU Quant. """
+    """Int8 Linear method for TPU Quant."""
 
     def __init__(self, quant_config: Int8TpuConfig):
         self.quant_config = quant_config
         self.quantize_activation = False
-        if self.quant_config.activation_scheme == 'dynamic':
+        if self.quant_config.activation_scheme == "dynamic":
             self.quantize_activation = True
 
-    def create_weights(self, layer: Module, input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
-
+    def create_weights(
+        self,
+        layer: Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         weight_loader = extra_weight_attrs.get("weight_loader")
-        weight = ModelWeightParameter(data=torch.empty(
-            sum(output_partition_sizes),
-            input_size_per_partition,
-            dtype=params_dtype),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
 
     def _quantize_weight(
-            self, weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self, weight: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         weight_dtype = weight.dtype
         weight = weight.cpu().to(torch.float32)
         n_bit = 8
         eps = 1e-5
-        max_int = 2**(n_bit - 1) - 1
-        min_int = -(2**(n_bit - 1))
+        max_int = 2 ** (n_bit - 1) - 1
+        min_int = -(2 ** (n_bit - 1))
         max_val = weight.abs().amax(dim=-1, keepdim=True)
         max_val = max_val.clamp(min=eps)
         qscale = max_val / max_int
-        qweight = torch.clamp(torch.round(weight * (1.0 / qscale)), min_int,
-                              max_int).to(torch.int8)
+        qweight = torch.clamp(
+            torch.round(weight * (1.0 / qscale)), min_int, max_int
+        ).to(torch.int8)
         qscale = qscale.squeeze().to(weight_dtype)
         return qweight, qscale
 
@@ -105,21 +115,25 @@ class TPUInt8LinearMethod(LinearMethodBase):
         layer.weight = Parameter(qweight, requires_grad=False)
         layer.scale = Parameter(qscale, requires_grad=False)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         try:
             import torch_xla.experimental.custom_kernel  # noqa: F401
         except ImportError as err:
             raise ImportError(
                 "Please install torch_xla by following the instructions at "
                 "https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html "  # noqa: E501
-                "to run vLLM on TPU.") from err
+                "to run vLLM on TPU."
+            ) from err
         weight = layer.weight
         scale = layer.scale
         out = torch.ops.xla.quantized_matmul_int8(
-            x, weight, scale, quantize_activation=self.quantize_activation)
+            x, weight, scale, quantize_activation=self.quantize_activation
+        )
         if bias is not None:
             out = out + bias
         return out
diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py
index 6ad56bae3dca0cfd3acd380d16d20583153e1330..07c18029fb4de444f0b5a749ff5d5c120739efb8 100644
--- a/vllm/model_executor/layers/quantization/utils/__init__.py
+++ b/vllm/model_executor/layers/quantization/utils/__init__.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .layer_utils import replace_parameter, update_tensor_inplace
-
-__all__ = ['update_tensor_inplace', 'replace_parameter']
+from .layer_utils import replace_parameter, update_tensor_inplace
+
+__all__ = ["update_tensor_inplace", "replace_parameter"]
diff --git a/vllm/model_executor/layers/quantization/utils/allspark_utils.py b/vllm/model_executor/layers/quantization/utils/allspark_utils.py
index 1992b4d201478ca1062f8d6ac4867c8968985dd8..4c324682e5e62e6c54c4d79a443dafcaa1806509 100644
--- a/vllm/model_executor/layers/quantization/utils/allspark_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/allspark_utils.py
@@ -12,41 +12,56 @@ ALLSPARK_AMPERE_N_ALIGN = 16
 ALLSPARK_AMPERE_K_ALIGN = 16
 
 
-def check_allspark_supported_dtype_shape(input_size_per_partition: int,
-                                         output_size_per_partition: int,
-                                         group_size: int,
-                                         weight_dtype: ScalarType,
-                                         act_dtype: torch.dtype):
+def check_allspark_supported_dtype_shape(
+    input_size_per_partition: int,
+    output_size_per_partition: int,
+    group_size: int,
+    weight_dtype: ScalarType,
+    act_dtype: torch.dtype,
+):
     capability_tuple = current_platform.get_device_capability()
-    device_capability = (-1 if capability_tuple is None else
-                         capability_tuple.to_int())
+    device_capability = -1 if capability_tuple is None else capability_tuple.to_int()
 
     # For Ampere GPU
     if device_capability >= 80 and device_capability < 90:
         if group_size != -1:
-            return False, \
-                "For Ampere GPU, AllSpark does not support group_size "\
-                f"= {group_size}. Only group_size = -1 are supported."
+            return (
+                False,
+                "For Ampere GPU, AllSpark does not support group_size "
+                f"= {group_size}. Only group_size = -1 are supported.",
+            )
 
         if weight_dtype not in ALLSPARK_SUPPORTED_QUANT_TYPES:
-            return False, "For Ampere GPU, AllSpark does not support "\
-                f"quant type ({weight_dtype}). Only quant type "\
-                f"({ALLSPARK_SUPPORTED_QUANT_TYPES}) are supported."
-
-        if input_size_per_partition % ALLSPARK_AMPERE_K_ALIGN != 0 \
-            or output_size_per_partition % ALLSPARK_AMPERE_N_ALIGN != 0:
-            return False, \
-                "AllSpark needs input_size_per_partition % "\
-                f"{ALLSPARK_AMPERE_K_ALIGN} = 0 and "\
-                f"output_size_per_partition % {ALLSPARK_AMPERE_N_ALIGN} = 0 "\
-                "for Ampere GPU optimized kernels."
+            return (
+                False,
+                "For Ampere GPU, AllSpark does not support "
+                f"quant type ({weight_dtype}). Only quant type "
+                f"({ALLSPARK_SUPPORTED_QUANT_TYPES}) are supported.",
+            )
+
+        if (
+            input_size_per_partition % ALLSPARK_AMPERE_K_ALIGN != 0
+            or output_size_per_partition % ALLSPARK_AMPERE_N_ALIGN != 0
+        ):
+            return (
+                False,
+                "AllSpark needs input_size_per_partition % "
+                f"{ALLSPARK_AMPERE_K_ALIGN} = 0 and "
+                f"output_size_per_partition % {ALLSPARK_AMPERE_N_ALIGN} = 0 "
+                "for Ampere GPU optimized kernels.",
+            )
 
         if act_dtype != torch.float16 and act_dtype != torch.bfloat16:
-            return False, \
-                "AllSpark only supports act_dtype = float16 or bfloat16,"\
-                f"for Ampere GPU, but got act_dtype = {act_dtype}."
+            return (
+                False,
+                "AllSpark only supports act_dtype = float16 or bfloat16,"
+                f"for Ampere GPU, but got act_dtype = {act_dtype}.",
+            )
     else:
-        return False, "AllSpark currently does not support "\
-            f"device_capability = {device_capability}."
+        return (
+            False,
+            "AllSpark currently does not support "
+            f"device_capability = {device_capability}.",
+        )
 
     return True, None
diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
index 4c2e54873586948a2b5414be6202741c8ad1c6bc..62a4f90366887e96312d23c89db451c54d976f9e 100644
--- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 from packaging import version
@@ -28,13 +27,14 @@ BITBLAS_SUPPORTED_SYM = [False, True]
 
 # Determines the supported quantization types for BitBLAS based on the
 # device's capability and whether zero-point (zp) is used.
-def query_bitblas_supported_quant_types(has_zp: bool,
-                                        device_capability: Optional[int] = None
-                                        ):
+def query_bitblas_supported_quant_types(
+    has_zp: bool, device_capability: int | None = None
+):
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
-        device_capability = (-1 if capability_tuple is None else
-                             capability_tuple.to_int())
+        device_capability = (
+            -1 if capability_tuple is None else capability_tuple.to_int()
+        )
 
     if device_capability < 70:
         return []
@@ -50,97 +50,116 @@ def query_bitblas_supported_quant_types(has_zp: bool,
 
 
 def _check_bitblas_supported(
-        quant_type: ScalarType,
-        group_size: Optional[int],
-        has_zp: bool,
-        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:
-
+    quant_type: ScalarType,
+    group_size: int | None,
+    has_zp: bool,
+    device_capability: int | None = None,
+) -> tuple[bool, str | None]:
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
-        device_capability = (-1 if capability_tuple is None else
-                             capability_tuple.to_int())
+        device_capability = (
+            -1 if capability_tuple is None else capability_tuple.to_int()
+        )
 
-    supported_types = query_bitblas_supported_quant_types(
-        has_zp, device_capability)
+    supported_types = query_bitblas_supported_quant_types(has_zp, device_capability)
 
     if quant_type not in supported_types:
-        return (False, f"BitBLAS does not support weight_bits = {quant_type}. "
-                f"Only types = {supported_types} "
-                f"are supported (for group_size = {group_size}, "
-                f"device_capability = {device_capability}, zp = {has_zp}).")
-    if (group_size is None or group_size not in BITBLAS_SUPPORTED_GROUP_SIZES):
-        return (False, f"BitBLAS does not support group_size = {group_size}. "
-                f"Only group_sizes = {BITBLAS_SUPPORTED_GROUP_SIZES} "
-                "are supported.")
+        return (
+            False,
+            f"BitBLAS does not support weight_bits = {quant_type}. "
+            f"Only types = {supported_types} "
+            f"are supported (for group_size = {group_size}, "
+            f"device_capability = {device_capability}, zp = {has_zp}).",
+        )
+    if group_size is None or group_size not in BITBLAS_SUPPORTED_GROUP_SIZES:
+        return (
+            False,
+            f"BitBLAS does not support group_size = {group_size}. "
+            f"Only group_sizes = {BITBLAS_SUPPORTED_GROUP_SIZES} "
+            "are supported.",
+        )
 
     # Finally, check if bitblas is installed
     try:
         import bitblas
-        if version.parse(
-                bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
-            raise ImportError("bitblas version is wrong. Please "
-                              f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
+
+        if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
+            raise ImportError(
+                "bitblas version is wrong. Please "
+                f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
+            )
     except ImportError:
         return False, "BitBLAS is not installed."
 
     return True, None
 
 
-def check_bitblas_supported(quant_type: ScalarType,
-                            group_size: int,
-                            has_zp: bool = False,
-                            device_capability: Optional[int] = None) -> bool:
-    cond, _ = _check_bitblas_supported(quant_type, group_size, has_zp,
-                                       device_capability)
+def check_bitblas_supported(
+    quant_type: ScalarType,
+    group_size: int,
+    has_zp: bool = False,
+    device_capability: int | None = None,
+) -> bool:
+    cond, _ = _check_bitblas_supported(
+        quant_type, group_size, has_zp, device_capability
+    )
     return cond
 
 
-def verify_bitblas_supported(quant_type: ScalarType,
-                             group_size: int,
-                             has_zp: bool = False) -> None:
+def verify_bitblas_supported(
+    quant_type: ScalarType, group_size: int, has_zp: bool = False
+) -> None:
     cond, err_msg = _check_bitblas_supported(quant_type, group_size, has_zp)
     if not cond:
         assert err_msg is not None
         raise ValueError(err_msg)
 
 
-def verify_bitblas_supports_shape(output_size_per_partition: int,
-                                  input_size_per_partition: int,
-                                  input_size: int, group_size: int) -> None:
-
+def verify_bitblas_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> None:
     # Validate output_size_per_partition
     if output_size_per_partition % BITBLAS_MIN_WEIGHT_SIZE_N != 0:
-        raise ValueError(f"Weight output_size_per_partition = "
-                         f"{output_size_per_partition} is not divisible by "
-                         f" min_thread_n = {BITBLAS_MIN_WEIGHT_SIZE_N}. "
-                         "Consider reducing tensor_parallel_size or running "
-                         "with --quantization gptq.")
+        raise ValueError(
+            f"Weight output_size_per_partition = "
+            f"{output_size_per_partition} is not divisible by "
+            f" min_thread_n = {BITBLAS_MIN_WEIGHT_SIZE_N}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
 
     # Validate input_size_per_partition
     if input_size_per_partition % BITBLAS_MIN_WEIGHT_SIZE_K != 0:
-        raise ValueError(f"Weight input_size_per_partition = "
-                         f"{input_size_per_partition} is not divisible "
-                         f"by min_thread_k = {BITBLAS_MIN_WEIGHT_SIZE_K}. "
-                         "Consider reducing tensor_parallel_size or running "
-                         "with --quantization gptq.")
-
-    if (group_size < input_size
-            and input_size_per_partition % group_size != 0):
+        raise ValueError(
+            f"Weight input_size_per_partition = "
+            f"{input_size_per_partition} is not divisible "
+            f"by min_thread_k = {BITBLAS_MIN_WEIGHT_SIZE_K}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+    if group_size < input_size and input_size_per_partition % group_size != 0:
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition}"
             f" is not divisible by group_size = {group_size}."
             "Consider reducing tensor_parallel_size or running "
-            "with --quantization gptq.")
+            "with --quantization gptq."
+        )
 
 
-def check_bitblas_supports_shape(output_size_per_partition: int,
-                                input_size_per_partition: int,
-                                input_size: int, group_size: int) \
-                                    -> tuple[bool, Optional[str]]:
+def check_bitblas_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> tuple[bool, str | None]:
     try:
-        verify_bitblas_supports_shape(output_size_per_partition,
-                                      input_size_per_partition, input_size,
-                                      group_size)
+        verify_bitblas_supports_shape(
+            output_size_per_partition, input_size_per_partition, input_size, group_size
+        )
     except ValueError as e:
         return False, e.__str__()
     return True, None
@@ -150,8 +169,9 @@ def bitblas_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
     return (not act_order) or (act_order and not is_row_parallel)
 
 
-def bitblas_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
-                                       is_row_parallel: bool) -> bool:
+def bitblas_repeat_scales_on_all_ranks(
+    act_order: bool, group_size: int, is_row_parallel: bool
+) -> bool:
     # Need to repeat scales on every rank if act_ordering or
     # channelwise and RowParallelLinear
     is_channelwise = group_size == -1
@@ -159,17 +179,18 @@ def bitblas_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
 
 
 def bitblas_make_empty_g_idx(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
-                              requires_grad=False)
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
 
 
 def bitblas_make_empty_zp(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
-                              requires_grad=False)
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
 
 
-def bitblas_sort_g_idx(
-        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+def bitblas_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices
 
@@ -186,8 +207,7 @@ def unpack_gptq_qzeros(qzeros, bits, is_gptq_v2=False) -> torch.Tensor:
 
     for col in range(unpacked_zeros.shape[1]):
         i = col % elems_per_int32
-        unpacked_zeros[:, col] = (qzeros[:, col // elems_per_int32] >>
-                                  (bits * i)) & 0xF
+        unpacked_zeros[:, col] = (qzeros[:, col // elems_per_int32] >> (bits * i)) & 0xF
     if not is_gptq_v2:
         return unpacked_zeros + 1
     return unpacked_zeros
@@ -204,7 +224,6 @@ def unpack_gptq_qweight(qweight, bits):
     )
     for col in range(unpacked_weight.shape[1]):
         i = col % elems_per_int8
-        unpacked_weight[:, col] = (qweight[:, col // elems_per_int8] >>
-                                   (bits * i))
+        unpacked_weight[:, col] = qweight[:, col // elems_per_int8] >> (bits * i)
 
     return torch.bitwise_and(unpacked_weight, 2**bits - 1)
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index a520302c62d9fa9822a9192d69f425d9d624f963..fdf330329e20c5029cbba869cabac2a0843d6249 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -1,18 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility helpers for NVFP4 + FlashInfer fused-MoE path"""
-from __future__ import annotations
 
 import torch
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    FlashInferExperts)
+    FlashInferExperts,
+)
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-    create_flashinfer_prepare_finalize)
+    create_flashinfer_prepare_finalize,
+)
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
@@ -24,16 +27,18 @@ __all__ = [
 
 
 def is_flashinfer_fp4_cutlass_moe_available() -> bool:
-    """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
-    return (envs.VLLM_USE_FLASHINFER_MOE_FP4
-            and has_flashinfer_cutlass_fused_moe()
-            and current_platform.is_cuda()
-            and current_platform.is_device_capability(100))
+    """Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
+    return (
+        envs.VLLM_USE_FLASHINFER_MOE_FP4
+        and has_flashinfer_cutlass_fused_moe()
+        and current_platform.is_cuda()
+        and current_platform.has_device_capability(100)
+    )
 
 
-def reorder_w1w3_to_w3w1(weight: torch.Tensor,
-                         scale: torch.Tensor,
-                         dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]:
+def reorder_w1w3_to_w3w1(
+    weight: torch.Tensor, scale: torch.Tensor, dim: int = -2
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`"""
     size = weight.size(dim)
     assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
@@ -42,18 +47,21 @@ def reorder_w1w3_to_w3w1(weight: torch.Tensor,
     w1, w3 = weight.split(half, dim=dim)
     s1, s3 = scale.split(half, dim=dim)
 
-    return (torch.cat([w3, w1],
-                      dim=dim).contiguous(), torch.cat([s3, s1],
-                                                       dim=dim).contiguous())
+    return (
+        torch.cat([w3, w1], dim=dim).contiguous(),
+        torch.cat([s3, s1], dim=dim).contiguous(),
+    )
 
 
 def build_flashinfer_fp4_cutlass_moe_prepare_finalize(
-        moe: FusedMoEConfig) -> mk.FusedMoEPrepareAndFinalize:
+    moe: FusedMoEConfig,
+) -> mk.FusedMoEPrepareAndFinalize:
     """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel"""
     use_dp = moe.moe_parallel_config.dp_size > 1
-    enable_alltoallv = envs.VLLM_ALL2ALL_BACKEND == "flashinfer_all2allv"
+    enable_alltoallv = moe.moe_parallel_config.all2all_backend == "flashinfer_all2allv"
     return create_flashinfer_prepare_finalize(
-        use_dp=use_dp, use_nvfp4=True, enable_alltoallv=enable_alltoallv)
+        use_dp=use_dp, use_nvfp4=True, enable_alltoallv=enable_alltoallv
+    )
 
 
 def select_nvfp4_gemm_impl(
@@ -71,9 +79,11 @@ def select_nvfp4_gemm_impl(
             ep_size=moe.moe_parallel_config.ep_size,
             tp_rank=moe.moe_parallel_config.tp_rank,
             tp_size=moe.moe_parallel_config.tp_size,
+            use_dp=moe.moe_parallel_config.dp_size > 1,
         )
 
     # native cutlass experts currently don't support DP; TP case won't call this
     raise ValueError(
         "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS "
-        "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)")
+        "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)"
+    )
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index b779a5355b679b55afc979603e821b8a373d8c92..f22e17945d1f665d655178aa96d6986f9584f928 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -1,19 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import Enum
-from typing import Optional
 
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import envs
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
-                                                         FusedMoEQuantConfig)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    FlashInferExperts)
+    FlashInferExperts,
+)
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-    create_flashinfer_prepare_finalize)
+    create_flashinfer_prepare_finalize,
+)
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -24,33 +28,40 @@ class FlashinferMoeBackend(Enum):
 
 
 def calculate_tile_tokens_dim(num_tokens, top_k, num_experts):
+    from flashinfer import next_positive_power_of_2
 
     # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now.
     # TODO: Revert this to dynamic calculation once a new version of FlashInfer
     # with the necessary kernels is released.
     tile_tokens_dim = 8
 
-    # from flashinfer import next_positive_power_of_2
-
-    # # Guess tokens per expert assuming perfect expert distribution first.
-    # num_tokens_per_expert = (num_tokens * top_k) // num_experts
-    # # And pad the number to the next power of 2.
-    # tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-    # # Cap to 8-64 tokens per CTA tile as it's the range supported by the
-    # # kernel.
-    # tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    # A factor considering tokens are not perfectly balanced among experts.
+    imbalance_factor = 1.3
+    # Calculate the number of tokens per expert
+    # assuming perfect distribution.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # Apply the imbalance factor.
+    num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
+    # Cap to 8-max_tile_tokens_dim tokens per CTA tile
+    # as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
 
     return tile_tokens_dim
 
 
 def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor:
-    return x.reshape(-1, 2, x.shape[-2] // 2,
-                     x.shape[-1]).flip(dims=[1]).reshape(x.shape)
+    return (
+        x.reshape(-1, 2, x.shape[-2] // 2, x.shape[-1]).flip(dims=[1]).reshape(x.shape)
+    )
 
 
-def rotate_flashinfer_fp8_moe_weights(gemm1_weights: torch.Tensor,
-                                      gemm2_weights: torch.Tensor):
+def rotate_flashinfer_fp8_moe_weights(
+    gemm1_weights: torch.Tensor, gemm2_weights: torch.Tensor
+):
     from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a
+
     epilogue_tile_m = 128
     num_experts = gemm1_weights.shape[0]
     hidden_size = gemm1_weights.shape[-1]
@@ -60,13 +71,13 @@ def rotate_flashinfer_fp8_moe_weights(gemm1_weights: torch.Tensor,
     gemm1_weights_fp8_interleaved = []
     for i in range(num_experts):
         gemm1_weights_fp8_interleaved.append(
-            reorder_rows_for_gated_act_gemm(gemm1_weights[i]))
+            reorder_rows_for_gated_act_gemm(gemm1_weights[i])
+        )
 
     # Stack weights and scales for all experts
-    gemm1_weights_fp8_interleaved = torch.stack(
-        gemm1_weights_fp8_interleaved).reshape(num_experts,
-                                               2 * intermediate_size,
-                                               hidden_size)
+    gemm1_weights_fp8_interleaved = torch.stack(gemm1_weights_fp8_interleaved).reshape(
+        num_experts, 2 * intermediate_size, hidden_size
+    )
 
     # Shuffle weights and scaling factors for transposed mma output
     gemm1_weights_fp8_shuffled = []
@@ -74,44 +85,53 @@ def rotate_flashinfer_fp8_moe_weights(gemm1_weights: torch.Tensor,
     for i in range(num_experts):
         gemm1_weights_fp8_shuffled.append(
             shuffle_matrix_a(
-                gemm1_weights_fp8_interleaved[i].view(torch.uint8),
-                epilogue_tile_m))
+                gemm1_weights_fp8_interleaved[i].view(torch.uint8), epilogue_tile_m
+            )
+        )
 
         gemm2_weights_fp8_shuffled.append(
-            shuffle_matrix_a(gemm2_weights[i].view(torch.uint8),
-                             epilogue_tile_m))
+            shuffle_matrix_a(gemm2_weights[i].view(torch.uint8), epilogue_tile_m)
+        )
 
     # Stack weights for all experts
     gemm1_weights.data = torch.stack(gemm1_weights_fp8_shuffled).view(
-        torch.float8_e4m3fn)
+        torch.float8_e4m3fn
+    )
     gemm2_weights.data = torch.stack(gemm2_weights_fp8_shuffled).view(
-        torch.float8_e4m3fn)
+        torch.float8_e4m3fn
+    )
 
 
 def apply_flashinfer_per_tensor_scale_fp8(
     layer: torch.nn.Module,
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
-    routing_bias: Optional[torch.Tensor],
+    routing_bias: torch.Tensor | None,
     top_k: int,
-    num_expert_group: Optional[int],
-    topk_group: Optional[int],
+    num_expert_group: int | None,
+    topk_group: int | None,
     global_num_experts: int,
     apply_router_weight_on_input: bool,
 ) -> torch.Tensor:
     from flashinfer.fused_moe import RoutingMethodType
 
     import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe  # noqa: E501, F401
+
     assert layer.output1_scales_scalar is not None, (
-        "Expected output1_scales_scalar to be initialized")
+        "Expected output1_scales_scalar to be initialized"
+    )
     assert layer.output1_scales_scalar is not None, (
-        "Expected output1_scales_gate_scalar to be initialized")
+        "Expected output1_scales_gate_scalar to be initialized"
+    )
     assert layer.output1_scales_scalar is not None, (
-        "Expected output2_scales_scalar to be initialized")
+        "Expected output2_scales_scalar to be initialized"
+    )
 
     from vllm.model_executor.models.llama4 import Llama4MoE
-    assert layer.custom_routing_function == Llama4MoE.custom_routing_function, \
+
+    assert layer.custom_routing_function == Llama4MoE.custom_routing_function, (
         "FusedMoE flashinfer kernels are only supported for Llama4"
+    )
     return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8(
         routing_logits=router_logits,
         routing_bias=routing_bias,
@@ -140,46 +160,53 @@ def get_moe_scaling_factors(
     activation_scale: torch.Tensor,
     gemm2_weights_scale: torch.Tensor,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    output1_scales_scalar = gemm1_weights_scale * input_scale * (
-        1.0 / activation_scale)
+    output1_scales_scalar = gemm1_weights_scale * input_scale * (1.0 / activation_scale)
     output1_scales_gate_scalar = gemm1_weights_scale * input_scale
     output2_scales_scalar = activation_scale * gemm2_weights_scale
 
-    return output1_scales_scalar, output1_scales_gate_scalar, \
-        output2_scales_scalar
+    return output1_scales_scalar, output1_scales_gate_scalar, output2_scales_scalar
 
 
 def register_moe_scaling_factors(layer: torch.nn.Module) -> None:
-    output1_scales, output1_gate_scales, output2_scales = \
-        get_moe_scaling_factors(
-            layer.w13_input_scale, layer.w13_weight_scale,
-            layer.w2_input_scale, layer.w2_weight_scale
-        )
+    output1_scales, output1_gate_scales, output2_scales = get_moe_scaling_factors(
+        layer.w13_input_scale,
+        layer.w13_weight_scale,
+        layer.w2_input_scale,
+        layer.w2_weight_scale,
+    )
     layer.register_parameter(
-        'output1_scales_scalar',
-        torch.nn.Parameter(output1_scales, requires_grad=False))
+        "output1_scales_scalar", torch.nn.Parameter(output1_scales, requires_grad=False)
+    )
     layer.register_parameter(
-        'output1_scales_gate_scalar',
-        torch.nn.Parameter(output1_gate_scales, requires_grad=False))
+        "output1_scales_gate_scalar",
+        torch.nn.Parameter(output1_gate_scales, requires_grad=False),
+    )
     layer.register_parameter(
-        'output2_scales_scalar',
-        torch.nn.Parameter(output2_scales, requires_grad=False))
+        "output2_scales_scalar", torch.nn.Parameter(output2_scales, requires_grad=False)
+    )
     layer.register_parameter(
-        'w2_input_scale_inv',
-        torch.nn.Parameter(1.0 / layer.w2_input_scale, requires_grad=False))
+        "w2_input_scale_inv",
+        torch.nn.Parameter(1.0 / layer.w2_input_scale, requires_grad=False),
+    )
 
 
 def build_flashinfer_fp8_cutlass_moe_prepare_finalize(
-    moe: Optional[FusedMoEConfig], ) -> mk.FusedMoEPrepareAndFinalize:
+    moe: FusedMoEConfig | None, use_deepseek_fp8_block_scale: bool = False
+) -> mk.FusedMoEPrepareAndFinalize:
     """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel"""
     use_dp = moe.moe_parallel_config.dp_size > 1 if moe is not None else False
-    return create_flashinfer_prepare_finalize(use_dp)
+    # Propagate block-scale flag so prepare/finalize can skip act quantization
+    # and inform the kernel to consume per-block weight scales.
+    return create_flashinfer_prepare_finalize(
+        use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
+    )
 
 
 def select_cutlass_fp8_gemm_impl(
-    moe: Optional[FusedMoEConfig],
+    moe: FusedMoEConfig | None,
     quant_config: FusedMoEQuantConfig,
-    out_dtype: Optional[torch.dtype] = None,
+    out_dtype: torch.dtype | None = None,
+    use_deepseek_fp8_block_scale: bool = False,
 ) -> mk.FusedMoEPermuteExpertsUnpermute:
     """Return a GEMM *experts* implementation for fused-MoE layers"""
 
@@ -191,13 +218,14 @@ def select_cutlass_fp8_gemm_impl(
             ep_size=moe.moe_parallel_config.ep_size,
             tp_rank=moe.moe_parallel_config.tp_rank,
             tp_size=moe.moe_parallel_config.tp_size,
+            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
         )
 
-    assert out_dtype is not None, (
-        "If moe config is None, out_dtype must be passed")
+    assert out_dtype is not None, "If moe config is None, out_dtype must be passed"
     return FlashInferExperts(
         out_dtype=out_dtype,
         quant_config=quant_config,
+        use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
     )
 
 
@@ -209,17 +237,26 @@ def flashinfer_cutlass_moe_fp8(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
+    use_deepseek_fp8_block_scale: bool = False,
+    moe: FusedMoEConfig | None = None,
 ) -> torch.Tensor:
     quant_config = layer.quant_method.get_fused_moe_quant_config(layer)
     assert quant_config is not None
 
+    # Construct modular kernel with block-scale support when requested.
     fused_experts = mk.FusedMoEModularKernel(
-        build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe=None),
-        select_cutlass_fp8_gemm_impl(moe=None,
-                                     quant_config=quant_config,
-                                     out_dtype=hidden_states.dtype))
+        build_flashinfer_fp8_cutlass_moe_prepare_finalize(
+            moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
+        ),
+        select_cutlass_fp8_gemm_impl(
+            moe=moe,
+            quant_config=quant_config,
+            out_dtype=hidden_states.dtype,
+            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
+        ),
+    )
 
     return fused_experts(
         hidden_states,
@@ -237,7 +274,10 @@ def flashinfer_cutlass_moe_fp8(
 
 def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
     flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND
-    if flashinfer_moe_backend == "throughput":
+    # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations
+    if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability(
+        90
+    ):
         return FlashinferMoeBackend.CUTLASS
     elif flashinfer_moe_backend == "latency":
         return FlashinferMoeBackend.TENSORRT_LLM
@@ -245,4 +285,14 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
     allowed_backends = ["throughput", "latency"]
     raise ValueError(
         f"Unknown flashinfer moe backend: {flashinfer_moe_backend}"
-        f" expected one of {allowed_backends}")
+        f" expected one of {allowed_backends}"
+    )
+
+
+def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool:
+    # TODO(shuw@nvidia): Update when new backends are added.
+    backends_supporting_global_sf = (
+        FlashinferMoeBackend.CUTLASS,
+        FlashinferMoeBackend.TENSORRT_LLM,
+    )
+    return backend in backends_supporting_global_sf
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 7e5a50dca9012b679d3e6a4be38ba720814959c9..37b0c659696dc773110c5b79d3e23c5561df5bd5 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -5,37 +5,50 @@
 import functools
 import json
 import os
-
-from collections.abc import Sequence
-from typing import Any, Callable, Optional, Union, List
+from collections.abc import Callable, Sequence
+from typing import Any
 
 import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    group_broadcast)
+    GroupShape,
+    group_broadcast,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    CUTLASS_BLOCK_FP8_SUPPORTED)
-from vllm.model_executor.parameter import (BlockQuantScaleParameter,
-                                           ChannelQuantScaleParameter,
-                                           PerTensorScaleParameter)
+    CUTLASS_BLOCK_FP8_SUPPORTED,
+)
+from vllm.model_executor.parameter import (
+    BlockQuantScaleParameter,
+    ChannelQuantScaleParameter,
+    PerTensorScaleParameter,
+)
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op
-from vllm.utils.deep_gemm import (fp8_gemm_nt, is_deep_gemm_e8m0_used,
-                                  should_use_deepgemm_for_fp8_linear)
+from vllm.utils.deep_gemm import (
+    fp8_gemm_nt,
+    is_deep_gemm_e8m0_used,
+    is_deep_gemm_supported,
+    should_use_deepgemm_for_fp8_linear,
+    transform_sf_into_required_layout,
+)
+from vllm.utils.torch_utils import direct_register_custom_op
 
 logger = init_logger(__name__)
 
 
-def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool:
+def is_fp8(x: torch.dtype | torch.Tensor) -> bool:
     if isinstance(x, torch.Tensor):
         x = x.dtype
     return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz
 
 
+# We need to pass in the is_hopper flag as argument because the function
+# current_platform.is_device_capability() is not supported by Torch compiler.
 def cutlass_scaled_mm(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -49,168 +62,355 @@ def cutlass_scaled_mm(
         B.T,
         out_dtype=output_dtype,
         scale_a=As,
-        # SM90 block FP8 requires row-major scale_b, which we do ahead of time
-        scale_b=Bs if block_size is not None
-        and current_platform.is_device_capability(90) else Bs.T)
+        scale_b=Bs.T,
+    )
 
 
-def rocm_aiter_gemm_w8a8_blockscale_impl(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    block_size: List[int],
-    output_dtype: torch.dtype = torch.float16,
+# TODO we should be able to change the type of block_size to GroupShape
+# after we resolve GroupShape compilation issue
+# https://github.com/vllm-project/vllm/issues/25270
+def _w8a8_triton_block_scaled_mm_func(
+    qx: torch.Tensor,
+    weight: torch.Tensor,
+    x_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+    block_size: list[int],
+    output_dtype: torch.dtype,
 ) -> torch.Tensor:
-    import aiter as rocm_aiter
-
-    return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
+    return w8a8_triton_block_scaled_mm(
+        qx, weight, x_scale, weight_scale, block_size, output_dtype
+    )
 
 
-def rocm_aiter_gemm_w8a8_blockscale_fake(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    block_size: List[int],
-    output_dtype: torch.dtype = torch.float16,
+def _w8a8_triton_block_scaled_mm_fake(
+    qx: torch.Tensor,
+    weight: torch.Tensor,
+    x_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+    block_size: list[int],
+    output_dtype: torch.dtype,
 ) -> torch.Tensor:
+    return torch.empty(
+        (qx.size(0), weight.size(0)), dtype=output_dtype, device=qx.device
+    )
 
-    m = A.shape[0]
-    n = B.shape[0]
-    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
-    return Y
 
+direct_register_custom_op(
+    "w8a8_triton_block_scaled_mm_func",
+    _w8a8_triton_block_scaled_mm_func,
+    fake_impl=_w8a8_triton_block_scaled_mm_fake,
+)
 
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_gemm_w8a8_blockscale",
-        op_func=rocm_aiter_gemm_w8a8_blockscale_impl,
-        fake_impl=rocm_aiter_gemm_w8a8_blockscale_fake,
+
+def _padded_cutlass(
+    qx: torch.Tensor,
+    weight: torch.Tensor,
+    x_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+    block_size: list[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    pad_multiple = 4
+    dim = qx.shape[0]
+    padded = (
+        dim if dim % pad_multiple == 0 else dim + pad_multiple - (dim % pad_multiple)
     )
-    if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_LINEAR
-            and current_platform.is_fp8_fnuz()):
 
-        import aiter as rocm_aiter
-        from aiter import get_hip_quant
+    has_pad = padded > dim
 
-        aiter_per1x128_quant = get_hip_quant(rocm_aiter.QuantType.per_1x128)
+    if has_pad:
+        padded_shape = [padded, *qx.shape[1:]]
+        padded_qx = torch.zeros(padded_shape, device=qx.device, dtype=qx.dtype)
+        padded_qx[0 : qx.shape[0], ...].copy_(qx)
 
+        padded_x_scale_shape = [*x_scale.shape[1:], padded]
+        padded_x_scale = torch.ones(
+            padded_x_scale_shape, device=x_scale.device, dtype=x_scale.dtype
+        ).permute(-1, -2)
+        padded_x_scale[0 : x_scale.shape[0], ...].copy_(x_scale)
 
-def dispatch_w8a8_blockscale_func(
-    use_cutlass: bool, use_aiter_and_is_supported: bool
-) -> Callable[[
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        List[int],
-        torch.dtype,
-], torch.Tensor]:
-    if use_cutlass:
-        return cutlass_scaled_mm
-    if (use_aiter_and_is_supported):
-        return torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale
-    return w8a8_block_fp8_matmul
+        output = cutlass_scaled_mm(
+            padded_qx, weight, padded_x_scale, weight_scale, block_size, output_dtype
+        )
+        return output[0 : qx.shape[0], ...]
+    else:
+        return cutlass_scaled_mm(
+            qx, weight, x_scale, weight_scale, block_size, output_dtype
+        )
 
 
-# TODO fix ROCm->Triton custom path:
-#  https://github.com/vllm-project/vllm/issues/14397
-def apply_w8a8_block_fp8_linear(
-    input: torch.Tensor,
+def _padded_cutlass_fake(
+    qx: torch.Tensor,
     weight: torch.Tensor,
-    block_size: List[int],
+    x_scale: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
-    cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
-    use_aiter_and_is_supported: bool = False,
+    block_size: list[int],
+    output_dtype: torch.dtype,
 ) -> torch.Tensor:
-    assert input_scale is None
-    # View input as 2D matrix for fp8 methods
-    input_2d = input.view(-1, input.shape[-1])
-    output_shape = [*input.shape[:-1], weight.shape[0]]
-    output_dtype = input.dtype
+    return torch.empty(
+        (qx.size(0), weight.size(0)), dtype=output_dtype, device=qx.device
+    )
+
 
-    if should_use_deepgemm_for_fp8_linear(output_dtype, weight):
+direct_register_custom_op(
+    "padded_cutlass",
+    _padded_cutlass,
+    fake_impl=_padded_cutlass_fake,
+)
 
+
+def _fp8_gemm_nt_op(
+    q_input: torch.Tensor,
+    input_scale: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    output: torch.Tensor,
+    use_deep_gemm_e8m0: bool,
+) -> None:
+    fp8_gemm_nt(
+        (q_input, input_scale),
+        (weight, weight_scale),
+        output,
+        is_deep_gemm_e8m0_used=use_deep_gemm_e8m0,
+    )
+
+
+def _fp8_gemm_nt_op_fake(
+    q_input: torch.Tensor,
+    input_scale: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    output: torch.Tensor,
+    use_deep_gemm_e8m0: bool,
+) -> None:
+    return None
+
+
+direct_register_custom_op(
+    "fp8_gemm_nt_op",
+    _fp8_gemm_nt_op,
+    mutates_args=["output"],
+    fake_impl=_fp8_gemm_nt_op_fake,
+)
+
+
+# TODO fix ROCm->Triton custom path:
+#  https://github.com/vllm-project/vllm/issues/14397
+class W8A8BlockFp8LinearOp:
+    """
+    This class executes a Blocked FP8 linear layer using cutlass if supported
+    and torch.scaled_mm otherwise.
+    """
+
+    def __init__(
+        self,
+        weight_group_shape: GroupShape,
+        act_quant_group_shape: GroupShape,
+        cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
+        use_aiter_and_is_supported: bool = False,
+    ):
+        self.weight_group_shape = weight_group_shape
+        self.act_quant_group_shape = act_quant_group_shape
+        self.is_deep_gemm_supported = is_deep_gemm_supported()
+        self.is_hopper = current_platform.is_device_capability(90)
+        self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
+
+        # Get the correct blockscale mul and input quant operations.
+        # We can't use _dispatch_w8a8_blockscale_op to figure out if we want
+        # to use deepgemm because we don't know the shape of weights (and
+        # whether deepgemm supports it) at the init time.
+        self.w8a8_blockscale_op, self.input_quant_op = (
+            self._dispatch_w8a8_blockscale_op(
+                cutlass_block_fp8_supported, use_aiter_and_is_supported
+            )
+        )
+        self.deepgemm_input_quant_op = (
+            QuantFP8(
+                False,
+                self.act_quant_group_shape,
+                column_major_scales=True,
+                use_ue8m0=self.use_deep_gemm_e8m0,
+            )
+            if self.is_deep_gemm_supported
+            else None
+        )
+
+    def apply(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert input_scale is None
+        # View input as 2D matrix for fp8 methods
         input_2d = input.view(-1, input.shape[-1])
         output_shape = [*input.shape[:-1], weight.shape[0]]
+        output_dtype = input.dtype
+
+        if should_use_deepgemm_for_fp8_linear(
+            output_dtype, weight, self.is_deep_gemm_supported
+        ):
+            output = self._run_deepgemm(input_2d, weight, weight_scale)
+        else:
+            output = self.w8a8_blockscale_op(
+                input_2d, weight, weight_scale, input_scale
+            )
 
-        q_input, x_scale = per_token_group_quant_fp8(
-            input_2d,
-            block_size[1],
-            column_major_scales=True,
-        )
-        output = torch.empty((q_input.shape[0], weight.shape[0]),
-                             dtype=torch.bfloat16,
-                             device=q_input.device)
-        fp8_gemm_nt((q_input, x_scale), (weight, weight_scale), output)
         if bias is not None:
-            output += bias
-        return output.to(dtype=output_dtype).view(*output_shape)
-
-    w8a8_blockscale_func = dispatch_w8a8_blockscale_func(
-        cutlass_block_fp8_supported, use_aiter_and_is_supported)
-    if cutlass_block_fp8_supported:
-        num_pad = 0
-        if current_platform.is_device_capability(90):
-            # pad first dimension to be divisible by 4 due to
-            # cutlass blockwise gemm limitation for hopper
-            num_pad = 4 - (input_2d.shape[0] % 4)
-            if num_pad > 0:
-                input_2d = torch.nn.functional.pad(input_2d,
-                                                   (0, 0, 0, num_pad),
-                                                   "constant", 0)
-        q_input, x_scale = per_token_group_quant_fp8(input_2d,
-                                                     block_size[1],
-                                                     column_major_scales=True)
-        output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
-                                      block_size, input.dtype)
-        if num_pad > 0:
-            output = output[:-num_pad]
-    else:
-        if use_aiter_and_is_supported:
-            q_input, x_scale = aiter_per1x128_quant(
-                input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8)
+            output = output + bias
+        return output.to(dtype=input.dtype).view(*output_shape)
+
+    def _run_deepgemm(
+        self,
+        input_2d: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+    ) -> torch.Tensor:
+        assert self.deepgemm_input_quant_op is not None
+        q_input, input_scale = self.deepgemm_input_quant_op(input_2d)
+        output = torch.empty(
+            (q_input.shape[0], weight.shape[0]),
+            dtype=torch.bfloat16,
+            device=q_input.device,
+        )
+        torch.ops.vllm.fp8_gemm_nt_op(
+            q_input, input_scale, weight, weight_scale, output, self.use_deep_gemm_e8m0
+        )
+        return output
+
+    def _run_cutlass(
+        self,
+        input_2d: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert input_scale is None
+        assert self.input_quant_op is not None
+        q_input, input_scale = self.input_quant_op(input_2d)
+        if self.is_hopper:
+            return torch.ops.vllm.padded_cutlass(
+                q_input,
+                weight,
+                input_scale,
+                weight_scale,
+                list(self.weight_group_shape),
+                input_2d.dtype,
+            )
         else:
-            q_input, x_scale = per_token_group_quant_fp8(
-                input_2d, block_size[1], column_major_scales=False)
+            return cutlass_scaled_mm(
+                q_input,
+                weight,
+                input_scale,
+                weight_scale,
+                list(self.weight_group_shape),
+                input_2d.dtype,
+            )
 
-        output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
-                                      block_size, input.dtype)
+    def _run_aiter(
+        self,
+        input_2d: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert self.act_quant_group_shape == GroupShape(1, 128)
 
-    if bias is not None:
-        output = output + bias
-    return output.to(dtype=input.dtype).view(*output_shape)
+        n, k = weight.shape
 
+        use_triton = (
+            not current_platform.is_fp8_fnuz()
+            and rocm_aiter_ops.is_triton_gemm_w8a8_tuned(n, k)
+        )
 
-def apply_w8a8_block_fp8_linear_fake(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    block_size: List[int],
-    weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
-    cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
-    use_aiter_and_is_supported: bool = False,
-) -> torch.Tensor:
-    output_shape = [*input.shape[:-1], weight.shape[0]]
-    return torch.empty(output_shape, dtype=input.dtype, device=input.device)
+        if use_triton:
+            gemm_a8w8_blockscale_op = rocm_aiter_ops.triton_gemm_a8w8_blockscale
+        else:
+            gemm_a8w8_blockscale_op = rocm_aiter_ops.gemm_a8w8_blockscale
+
+        if input_scale is not None:
+            q_input = input_2d
+        # MI350 case uses triton kernel
+        elif use_triton:
+            q_input, input_scale = per_token_group_quant_fp8(
+                input_2d,
+                self.act_quant_group_shape.col,
+                column_major_scales=False,
+                use_ue8m0=False,
+            )
+        # MI300 uses tuned AITER ASM/C++ kernel
+        else:
+            q_input, input_scale = rocm_aiter_ops.group_fp8_quant(input_2d)
+
+        return gemm_a8w8_blockscale_op(
+            q_input,
+            weight,
+            input_scale,
+            weight_scale,
+            list(self.weight_group_shape),
+            output_dtype=input_2d.dtype,
+        )
 
+    def _run_triton(
+        self,
+        input_2d: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert input_scale is None
+        assert self.input_quant_op is not None
+        q_input, input_scale = self.input_quant_op(input_2d)
+        return torch.ops.vllm.w8a8_triton_block_scaled_mm_func(
+            q_input,
+            weight,
+            input_scale,
+            weight_scale,
+            list(self.weight_group_shape),
+            input_2d.dtype,
+        )
 
-if not current_platform.is_cpu():
-    direct_register_custom_op(
-        op_name="apply_w8a8_block_fp8_linear",
-        op_func=apply_w8a8_block_fp8_linear,
-        mutates_args=[],
-        fake_impl=apply_w8a8_block_fp8_linear_fake,
-    )
+    def _dispatch_w8a8_blockscale_op(
+        self,
+        use_cutlass: bool,
+        use_aiter_and_is_supported: bool,
+    ) -> tuple[
+        Callable[
+            [
+                torch.Tensor,
+                torch.Tensor,
+                torch.Tensor,
+                torch.Tensor | None,
+            ],
+            torch.Tensor,
+        ],
+        QuantFP8 | None,
+    ]:
+        if use_cutlass:
+            return self._run_cutlass, (
+                QuantFP8(
+                    False,
+                    self.act_quant_group_shape,
+                    column_major_scales=True,
+                    use_ue8m0=False,
+                )
+            )
+        if use_aiter_and_is_supported:
+            return self._run_aiter, None
+        return self._run_triton, (
+            QuantFP8(
+                False,
+                self.act_quant_group_shape,
+                column_major_scales=False,
+                use_ue8m0=False,
+            )
+        )
 
 
 def input_to_float8(
-        x: torch.Tensor,
-        dtype: Optional[torch.dtype] = None
+    x: torch.Tensor, dtype: torch.dtype | None = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""
@@ -269,8 +469,9 @@ def _per_token_group_quant_fp8(
     row_g_id = g_id % groups_per_row
 
     # Ensure offset calculations use int64 to prevent overflow
-    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
-                                                        group_size)
+    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (
+        row_g_id.to(tl.int64) * group_size
+    )
     y_ptr += y_ptr_offset
 
     y_q_ptr_offset = g_id.to(tl.int64) * group_size
@@ -324,8 +525,9 @@ def _per_token_group_quant_fp8_colmajor(
     row_g_id = g_id % groups_per_row
 
     # Ensure offset calculations use int64 to prevent overflow
-    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
-                                                        group_size)
+    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (
+        row_g_id.to(tl.int64) * group_size
+    )
     y_ptr += y_ptr_offset
 
     y_q_ptr_offset = g_id.to(tl.int64) * group_size
@@ -337,8 +539,7 @@ def _per_token_group_quant_fp8_colmajor(
     scale_col = g_id % blocks_per_row
     scale_row = g_id // blocks_per_row
     # Ensure offset calculation uses int64 for y_s_ptr
-    y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(
-        tl.int64)
+    y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(tl.int64)
     y_s_ptr += y_s_ptr_offset
 
     cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
@@ -359,10 +560,10 @@ def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
     eps: float = 1e-10,
-    dtype: Optional[torch.dtype] = None,
+    dtype: torch.dtype | None = None,
     column_major_scales: bool = False,
-    out_q: Optional[torch.Tensor] = None,
-    use_ue8m0: Optional[bool] = None,
+    out_q: torch.Tensor | None = None,
+    use_ue8m0: bool | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -382,9 +583,10 @@ def per_token_group_quant_fp8(
     if use_ue8m0 is None:
         use_ue8m0 = is_deep_gemm_e8m0_used()
     dtype = current_platform.fp8_dtype() if dtype is None else dtype
-    assert (x.shape[-1] % group_size == 0), (
+    assert x.shape[-1] % group_size == 0, (
         f"the last dimension of `x` {x.shape[-1]} must be divisible "
-        f"by `group_size` {group_size}")
+        f"by `group_size` {group_size}"
+    )
     assert x.stride(-1) == 1, "`x` groups must be contiguous"
 
     finfo = torch.finfo(dtype)
@@ -398,18 +600,18 @@ def per_token_group_quant_fp8(
 
     # Allocate the scale tensor in either row- or column-major format.
     if column_major_scales:
-        shape = (x.shape[-1] // group_size, ) + x.shape[:-1]
-        x_s = torch.empty(shape, device=x.device,
-                          dtype=torch.float32).permute(-1, -2)
+        shape = (x.shape[-1] // group_size,) + x.shape[:-1]
+        x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
     else:
-        shape = x.shape[:-1] + (x.shape[-1] // group_size, )
+        shape = x.shape[:-1] + (x.shape[-1] // group_size,)
         x_s = torch.empty(shape, device=x.device, dtype=torch.float32)
 
     # prefer CUDA kernel if available
     # TODO(bnell): this causes some fp8 moe test to fail.
     if current_platform.is_cuda() and x.is_contiguous():
-        torch.ops._C.per_token_group_fp8_quant(x, x_q, x_s, group_size, eps,
-                                               fp8_min, fp8_max, use_ue8m0)
+        torch.ops._C.per_token_group_fp8_quant(
+            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, use_ue8m0
+        )
         return x_q, x_s
 
     # TRITON FALLBACK
@@ -420,7 +622,7 @@ def per_token_group_quant_fp8(
     num_warps = min(max(BLOCK // 256, 1), 8)
     num_stages = 1
     if column_major_scales:
-        _per_token_group_quant_fp8_colmajor[(M, )](
+        _per_token_group_quant_fp8_colmajor[(M,)](
             x,
             x_q,
             x_s,
@@ -437,7 +639,7 @@ def per_token_group_quant_fp8(
             num_stages=num_stages,
         )
     else:
-        _per_token_group_quant_fp8[(M, )](
+        _per_token_group_quant_fp8[(M,)](
             x,
             x_q,
             x_s,
@@ -457,7 +659,7 @@ def per_token_group_quant_fp8(
 
 
 @triton.jit
-def _w8a8_block_fp8_matmul(
+def _w8a8_triton_block_scaled_mm(
     # Pointers to inputs and output
     A,
     B,
@@ -515,12 +717,8 @@ def _w8a8_block_fp8_matmul(
 
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        a = tl.load(a_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
 
         k_start = k * BLOCK_SIZE_K
         offs_ks = k_start // group_k
@@ -546,8 +744,9 @@ def _w8a8_block_fp8_matmul(
 
 
 @functools.lru_cache
-def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int,
-                               block_k: int) -> Optional[dict[int, Any]]:
+def get_w8a8_block_fp8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> dict[int, Any] | None:
     """
     Return optimized configurations for the w8a8 block fp8 kernel.
     The return value will be a dictionary that maps an irregular grid of
@@ -562,7 +761,8 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int,
     json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n},{block_k}].json"  # noqa: E501
 
     config_file_path = os.path.join(
-        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
             logger.info(
@@ -582,7 +782,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int,
     return None
 
 
-def w8a8_block_fp8_matmul(
+def w8a8_triton_block_scaled_mm(
     A: torch.Tensor,
     B: torch.Tensor,
     As: torch.Tensor,
@@ -618,7 +818,7 @@ def w8a8_block_fp8_matmul(
     assert triton.cdiv(N, block_n) == Bs.shape[0]
     assert triton.cdiv(K, block_k) == Bs.shape[1]
 
-    C_shape = A.shape[:-1] + (N, )
+    C_shape = A.shape[:-1] + (N,)
     C = A.new_empty(C_shape, dtype=output_dtype)
 
     configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
@@ -639,10 +839,11 @@ def w8a8_block_fp8_matmul(
         }
 
     def grid(META):
-        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
-                triton.cdiv(N, META["BLOCK_SIZE_N"]), )
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
 
-    _w8a8_block_fp8_matmul[grid](
+    _w8a8_triton_block_scaled_mm[grid](
         A,
         B,
         C,
@@ -670,27 +871,28 @@ def w8a8_block_fp8_matmul(
 
 
 def requant_weight_ue8m0_inplace(
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        block_size: Sequence[int] = (128, 128),
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    block_size: Sequence[int] = (128, 128),
 ) -> None:
     """Re-quantise *weight* so that its per-block scaling factors are in the
     UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace.
 
     Args:
-        weight: Block-quantised weight tensor stored in ``torch.float8_e4m3fn``.
-            Expected shape ``(..., M, K)``.
-        weight_scale: Corresponding per-block scale tensor (``torch.float32``)
-            with shape ``(..., M // block_size[0], K // block_size[1])``.
-        block_size: 2-element iterable ``[block_m, block_k]`` describing the
+        weight: Block-quantised weight tensor stored in `torch.float8_e4m3fn`.
+            Expected shape `(..., M, K)`.
+        weight_scale: Corresponding per-block scale tensor (`torch.float32`)
+            with shape `(..., M // block_size[0], K // block_size[1])`.
+        block_size: 2-element iterable `[block_m, block_k]` describing the
             block quantisation granularity.
     """
     if weight.numel() == 0:
         return
 
     if weight.dtype != torch.float8_e4m3fn:
-        raise ValueError("Expected *weight* to be torch.float8_e4m3fn, got "
-                         f"{weight.dtype} instead.")
+        raise ValueError(
+            f"Expected *weight* to be torch.float8_e4m3fn, got {weight.dtype} instead."
+        )
 
     from vllm.utils.deep_gemm import per_block_cast_to_fp8
 
@@ -719,39 +921,84 @@ def requant_weight_ue8m0_inplace(
         s_exp = s_exp[:m_cur, :k_cur]
         w_dq = w_q.to(torch.float32) * s_exp
         # Re-quantise using power-of-two scaling (UE8M0).
-        w_requant, s_requant = per_block_cast_to_fp8(w_dq, [block_m, block_k],
-                                                     use_ue8m0=True)
+        w_requant, s_requant = per_block_cast_to_fp8(
+            w_dq, [block_m, block_k], use_ue8m0=True
+        )
 
         # Write back the results in-place.
         w_q.copy_(w_requant)
         s_old.copy_(s_requant)
 
 
-def check_aiter_fp8_linear_support() -> bool:
-    """AITER is only supported on ROCm and only for FP8_FNUZ
-    and at the moment are MI300 series"""
-    return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER
-            and envs.VLLM_ROCM_USE_AITER_LINEAR
-            and current_platform.is_fp8_fnuz())
+def deepgemm_post_process_fp8_weight_block(
+    wq: torch.Tensor, ws: torch.Tensor, quant_block_shape: tuple[int], use_e8m0: bool
+) -> tuple[torch.Tensor, torch.Tensor]:
+    assert wq.dtype == torch.float8_e4m3fn, (
+        "Expected quantized tensor dtype "
+        f"to be torch.float8_e4m3fn, got {wq.dtype} instead."
+    )
+    assert ws.dtype == torch.float32, (
+        f"Expected tensor scales dtype to be torch.float32, got {ws.dtype} instead"
+    )
+
+    if use_e8m0:
+        requant_weight_ue8m0_inplace(wq, ws, block_size=quant_block_shape)
+
+    original_ndim = wq.ndim
+    if wq.ndim == 2:
+        assert ws.ndim == 2
+        wq = wq.unsqueeze(0)
+        ws = ws.unsqueeze(0)
+
+    # From https://github.com/deepseek-ai/DeepGEMM/blob/c9f8b34dcdacc20aa746b786f983492c51072870/csrc/utils/layout.hpp#L46
+    recipe = (1, 128, 128)
+
+    # Ref : https://github.com/deepseek-ai/DeepGEMM/blob/c9f8b34dcdacc20aa746b786f983492c51072870/csrc/apis/gemm.hpp
+    # DeepGemm uses the `transform_sf_into_required_layout` function to
+    # represent scales in the correct format.
+    dg_ws = transform_sf_into_required_layout(
+        sf=ws,
+        mn=wq.size(1),
+        k=wq.size(2),
+        recipe=recipe,
+        num_groups=wq.size(0),
+        # is the scale factors for A in (Refers to the argument A in A @ B).
+        # Weights are B.
+        is_sfa=False,
+    )
+
+    if original_ndim == 2:
+        wq = wq.squeeze(0)
+        dg_ws = dg_ws.squeeze(0)
+
+    return wq, dg_ws
 
 
 def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor:
     """Pad the weight tensor. This is an optimization on ROCm platform, which
     can benefit from tensors located far enough from one another in memory"""
-    if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm()
-            and weight.stride(-1) == 1
-            and (weight.stride(-2) * weight.element_size()) % 512 == 0):
+    if (
+        envs.VLLM_ROCM_FP8_PADDING
+        and current_platform.is_rocm()
+        and weight.stride(-1) == 1
+        and (weight.stride(-2) * weight.element_size()) % 512 == 0
+    ):
         num_pad = 256 // weight.element_size()
         import torch.nn.functional as F
+
         weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
         torch.cuda.empty_cache()
     return weight
 
 
-def validate_fp8_block_shape(layer: torch.nn.Module, input_size: int,
-                             output_size: int, input_size_per_partition: int,
-                             output_partition_sizes: list[int],
-                             block_size: list[int]) -> None:
+def validate_fp8_block_shape(
+    layer: torch.nn.Module,
+    input_size: int,
+    output_size: int,
+    input_size_per_partition: int,
+    output_partition_sizes: list[int],
+    block_size: list[int],
+) -> None:
     """Validate block quantization shapes for tensor parallelism."""
     from vllm.distributed import get_tensor_model_parallel_world_size
 
@@ -759,15 +1006,18 @@ def validate_fp8_block_shape(layer: torch.nn.Module, input_size: int,
     block_n, block_k = block_size[0], block_size[1]
 
     # Required by row parallel
-    if (tp_size > 1 and input_size // input_size_per_partition == tp_size
-            and input_size_per_partition % block_k != 0):
+    if (
+        tp_size > 1
+        and input_size // input_size_per_partition == tp_size
+        and input_size_per_partition % block_k != 0
+    ):
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition} "
-            f"is not divisible by weight quantization block_k = {block_k}.")
+            f"is not divisible by weight quantization block_k = {block_k}."
+        )
 
     # Required by column parallel or enabling merged weights
-    is_tp_split = (tp_size > 1
-                   and output_size // sum(output_partition_sizes) == tp_size)
+    is_tp_split = tp_size > 1 and output_size // sum(output_partition_sizes) == tp_size
     is_merged_gemm = len(output_partition_sizes) > 1
     if is_tp_split or is_merged_gemm:
         sizes_to_check = output_partition_sizes
@@ -780,33 +1030,44 @@ def validate_fp8_block_shape(layer: torch.nn.Module, input_size: int,
                 raise ValueError(
                     f"Weight output_partition_size = "
                     f"{output_partition_size} is not divisible by "
-                    f"weight quantization block_n = {block_n}.")
+                    f"weight quantization block_n = {block_n}."
+                )
 
 
 def create_fp8_weight_parameter(
-        output_size_per_partition: int, input_size_per_partition: int,
-        weight_loader: Optional[Callable]) -> torch.nn.Parameter:
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    weight_loader: Callable | None,
+) -> torch.nn.Parameter:
     """Create FP8 weight parameter."""
     from vllm.model_executor.parameter import ModelWeightParameter
 
-    return ModelWeightParameter(data=torch.empty(output_size_per_partition,
-                                                 input_size_per_partition,
-                                                 dtype=torch.float8_e4m3fn),
-                                input_dim=1,
-                                output_dim=0,
-                                weight_loader=weight_loader)
+    return ModelWeightParameter(
+        data=torch.empty(
+            output_size_per_partition,
+            input_size_per_partition,
+            dtype=torch.float8_e4m3fn,
+        ),
+        input_dim=1,
+        output_dim=0,
+        weight_loader=weight_loader,
+    )
 
 
 def create_fp8_scale_parameter(
-        parameter_type: torch.nn.Parameter, output_partition_sizes: list[int],
-        input_size_per_partition: int, block_size: Optional[list[int]],
-        weight_loader: Optional[Callable]) -> torch.nn.Parameter:
+    parameter_type: torch.nn.Parameter,
+    output_partition_sizes: list[int],
+    input_size_per_partition: int,
+    block_size: list[int] | None,
+    weight_loader: Callable | None,
+) -> torch.nn.Parameter:
     """Create scale parameter based on quantization strategy."""
     if parameter_type == ChannelQuantScaleParameter:
-        scale = parameter_type(data=torch.empty(
-            (sum(output_partition_sizes), 1), dtype=torch.float32),
-                               output_dim=0,
-                               weight_loader=weight_loader)
+        scale = parameter_type(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
     elif parameter_type == BlockQuantScaleParameter:
         assert block_size is not None
         block_n, block_k = block_size[0], block_size[1]
@@ -822,9 +1083,10 @@ def create_fp8_scale_parameter(
             weight_loader=weight_loader,
         )
     elif parameter_type == PerTensorScaleParameter:
-        scale = parameter_type(data=torch.empty(len(output_partition_sizes),
-                                                dtype=torch.float32),
-                               weight_loader=weight_loader)
+        scale = parameter_type(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
     else:
         raise ValueError(f"Unknown parameter type: {parameter_type}")
 
@@ -833,14 +1095,15 @@ def create_fp8_scale_parameter(
 
 
 def create_fp8_input_scale(
-        output_partition_sizes: list[int],
-        weight_loader: Optional[Callable]) -> torch.nn.Parameter:
+    output_partition_sizes: list[int], weight_loader: Callable | None
+) -> torch.nn.Parameter:
     """Create input scale parameter for static activation quantization."""
     from vllm.model_executor.parameter import PerTensorScaleParameter
 
-    scale = PerTensorScaleParameter(data=torch.empty(
-        len(output_partition_sizes), dtype=torch.float32),
-                                    weight_loader=weight_loader)
+    scale = PerTensorScaleParameter(
+        data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+        weight_loader=weight_loader,
+    )
     scale[:] = torch.finfo(torch.float32).min
     return scale
 
@@ -849,15 +1112,18 @@ def process_fp8_weight_tensor_strategy(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
     logical_widths: list[int],
-    input_scale: Optional[torch.Tensor] = None
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    input_scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """Process weights for tensor-wise quantization strategy."""
     from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-        normalize_e4m3fn_to_e4m3fnuz, requantize_with_max_scale)
+        normalize_e4m3fn_to_e4m3fnuz,
+        requantize_with_max_scale,
+    )
 
     if current_platform.is_fp8_fnuz():
         weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
-            weight=weight, weight_scale=weight_scale, input_scale=input_scale)
+            weight=weight, weight_scale=weight_scale, input_scale=input_scale
+        )
 
     # Requantize with max scale
     weight_scale, weight = requantize_with_max_scale(
@@ -873,15 +1139,17 @@ def process_fp8_weight_tensor_strategy(
 def process_fp8_weight_channel_strategy(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    input_scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """Process weights for channel-wise quantization strategy."""
     from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-        normalize_e4m3fn_to_e4m3fnuz)
+        normalize_e4m3fn_to_e4m3fnuz,
+    )
 
     if current_platform.is_fp8_fnuz():
         weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
-            weight=weight, weight_scale=weight_scale, input_scale=input_scale)
+            weight=weight, weight_scale=weight_scale, input_scale=input_scale
+        )
 
     return weight, weight_scale, input_scale
 
@@ -892,56 +1160,41 @@ def process_fp8_weight_block_strategy(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Process weights for block-wise quantization strategy."""
     from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-        normalize_e4m3fn_to_e4m3fnuz)
+        normalize_e4m3fn_to_e4m3fnuz,
+    )
 
     if current_platform.is_fp8_fnuz():
         weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
-            weight=weight, weight_scale=weight_scale)
+            weight=weight, weight_scale=weight_scale
+        )
 
     weight = _maybe_pad_fp8_weight(weight)
     return weight, weight_scale
 
 
-def maybe_post_process_fp8_weight_block(layer: torch.nn.Module,
-                                        cutlass_block_fp8_supported: bool):
+def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
     assert layer.weight_block_size is not None
 
-    from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used,
-                                      should_use_deepgemm_for_fp8_linear)
+    from vllm.utils.deep_gemm import (
+        is_deep_gemm_e8m0_used,
+        should_use_deepgemm_for_fp8_linear,
+    )
 
     # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
     # requantize the weight and input to the specific scale
     # at the same time.
     should_use_deepgemm = should_use_deepgemm_for_fp8_linear(
-        layer.orig_dtype, layer.weight)
-    if is_deep_gemm_e8m0_used() and should_use_deepgemm:
-        block_sz = tuple(layer.weight_block_size)
-        requant_weight_ue8m0_inplace(layer.weight.data,
-                                     layer.weight_scale.data, block_sz)
-    # SM90 Block FP8 CUTLASS requires row-major weight scales
-    elif (current_platform.is_device_capability(90)
-          and cutlass_block_fp8_supported and not should_use_deepgemm):
-        layer.weight_scale = torch.nn.Parameter(
-            layer.weight_scale.data.T.contiguous(), requires_grad=False)
-
-
-def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor,
-                           bias: Optional[torch.Tensor],
-                           cutlass_block_fp8_supported: bool,
-                           use_aiter_and_is_supported: bool) -> torch.Tensor:
-    """Apply block-wise FP8 linear operation."""
-    assert layer.weight_block_size is not None
-
-    return torch.ops.vllm.apply_w8a8_block_fp8_linear(
-        input=input,
-        weight=layer.weight,
-        block_size=layer.weight_block_size,
-        weight_scale=layer.weight_scale,
-        input_scale=layer.input_scale,
-        bias=bias,
-        cutlass_block_fp8_supported=cutlass_block_fp8_supported,
-        use_aiter_and_is_supported=use_aiter_and_is_supported,
+        layer.orig_dtype, layer.weight
     )
+    if should_use_deepgemm:
+        dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
+            wq=layer.weight.data,
+            ws=layer.weight_scale.data,
+            quant_block_shape=tuple(layer.weight_block_size),
+            use_e8m0=is_deep_gemm_e8m0_used(),
+        )
+        layer.weight = torch.nn.Parameter(dg_weight, requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(dg_weight_scale, requires_grad=False)
 
 
 def expert_weight_is_col_major(x: torch.Tensor) -> bool:
diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
index 41b833725b3017282336f802915500ce3cb4d784..dfebeca933920c09a7540734639e713101fbd1d8 100644
--- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
@@ -4,59 +4,67 @@ from collections.abc import Mapping
 from copy import deepcopy
 from fractions import Fraction
 from types import MappingProxyType
-from typing import Optional, Union
+from typing import TYPE_CHECKING
 
 import regex as re
 import torch
 
-from vllm.config import QuantizationConfig
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, UnquantizedEmbeddingMethod)
+    ParallelLMHead,
+    UnquantizedEmbeddingMethod,
+)
+
+if TYPE_CHECKING:
+    from ..gptq import GPTQConfig
+    from ..gptq_marlin import GPTQMarlinConfig
+else:
+    GPTQConfig = object
+    GPTQMarlinConfig = object
 
 
 # Match dynamic rules with module name (prefix) and override quantize
 # config if module (prefix) matches a rule
-def override_config(config: QuantizationConfig, prefix: str):
-    weight_bits = get_dynamic_override(config, prefix, "bits",
-                                       config.weight_bits)
+def override_config(config: GPTQConfig | GPTQMarlinConfig, prefix: str):
+    weight_bits = get_dynamic_override(config, prefix, "bits", config.weight_bits)
     if isinstance(weight_bits, int):
         config.weight_bits = weight_bits
-    group_size = get_dynamic_override(config, prefix, "group_size",
-                                      config.group_size)
+    group_size = get_dynamic_override(config, prefix, "group_size", config.group_size)
     if isinstance(group_size, int):
         config.group_size = group_size
-    desc_act = get_dynamic_override(config, prefix, "desc_act",
-                                    config.desc_act)
+    desc_act = get_dynamic_override(config, prefix, "desc_act", config.desc_act)
     if isinstance(desc_act, bool):
         config.desc_act = desc_act
 
     config.pack_factor = Fraction(32, config.weight_bits)  # packed into int32
     if config.get_name() == "gptq_marlin":
+        assert isinstance(config, GPTQMarlinConfig)
         is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym)
         if isinstance(is_sym, bool):
             config.is_sym = is_sym
 
         if (config.weight_bits, config.is_sym) not in config.TYPE_MAP:
-            raise ValueError("Unsupported quantization config: "
-                             f"bits={config.weight_bits}, sym={config.is_sym}")
+            raise ValueError(
+                "Unsupported quantization config: "
+                f"bits={config.weight_bits}, sym={config.is_sym}"
+            )
 
-        config.quant_type = config.TYPE_MAP[(config.weight_bits,
-                                             config.is_sym)]
+        config.quant_type = config.TYPE_MAP[(config.weight_bits, config.is_sym)]
     elif config.get_name() == "gptq":
+        assert isinstance(config, GPTQConfig)
         if config.weight_bits not in [2, 3, 4, 8]:
             raise ValueError(
                 "Currently, only 2/3/4/8-bit weight quantization is "
-                f"supported for GPTQ, but got {config.weight_bits} bits.")
+                f"supported for GPTQ, but got {config.weight_bits} bits."
+            )
 
 
 def get_dynamic_override(
-    config: QuantizationConfig,
+    config: GPTQConfig | GPTQMarlinConfig,
     layer_name: str,
-    key: Optional[str] = None,
-    default_value: Union[int, bool,
-                         None] = None) -> Union[dict, int, bool, None]:
+    key: str | None = None,
+    default_value: int | bool | None = None,
+) -> dict | int | bool | None:
     for pattern, pattern_dict in config.dynamic.items():
         # Negative match: matched modules are excluded from quantized init
         if pattern.startswith("-:"):
@@ -75,7 +83,7 @@ def get_dynamic_override(
 def is_layer_gptq_quantized(
     prefix: str,
     quantized_layers: list[str],
-    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
 ) -> bool:
     # prefix: model.layers.0.self_attn.q_proj
     # proj_name: q_proj
@@ -98,8 +106,9 @@ def is_layer_gptq_quantized(
 
         is_quantized = None
         for shard_prefix in shard_prefixes:
-            is_shard_quantized = any(layer in shard_prefix
-                                     for layer in quantized_layers)
+            is_shard_quantized = any(
+                layer in shard_prefix for layer in quantized_layers
+            )
 
             if is_quantized is None:
                 is_quantized = is_shard_quantized
@@ -107,7 +116,8 @@ def is_layer_gptq_quantized(
                 raise ValueError(
                     f"Detected some but not all shards of {prefix} "
                     "are quantized. All shards of fused layers "
-                    "to have the same precision.")
+                    "to have the same precision."
+                )
     else:
         is_quantized = any(layer in prefix for layer in quantized_layers)
 
@@ -116,24 +126,26 @@ def is_layer_gptq_quantized(
 
 
 def get_linear_quant_method(
-    config: QuantizationConfig,
+    config: GPTQConfig | GPTQMarlinConfig,
     layer: torch.nn.Module,
     prefix: str,
     linear_method_cls: type,
 ):
     cloned_config = deepcopy(config)
-    parallel_lm_head_quantized = isinstance(
-        layer, ParallelLMHead) and cloned_config.lm_head_quantized
+    parallel_lm_head_quantized = (
+        isinstance(layer, ParallelLMHead) and cloned_config.lm_head_quantized
+    )
     if isinstance(layer, LinearBase) or parallel_lm_head_quantized:
         is_layer_quantized = is_layer_gptq_quantized(
             prefix=prefix,
             quantized_layers=cloned_config.modules_in_block_to_quantize,
-            fused_mapping=cloned_config.packed_modules_mapping)
+            fused_mapping=cloned_config.packed_modules_mapping,
+        )
         # False = skip module, None = no override, else = Positive match
         if get_dynamic_override(  # noqa: E712
-                cloned_config,  # noqa: E712
-                layer_name=prefix) == False or (
-                    not is_layer_quantized):  # noqa: E712
+            cloned_config,  # noqa: E712
+            layer_name=prefix,
+        ) == False or (not is_layer_quantized):  # noqa: E712
             if parallel_lm_head_quantized:
                 return UnquantizedEmbeddingMethod()
             return UnquantizedLinearMethod()
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 62e458ec3c93ede837e31b37671c03b5a42ca191..925d0a516ce631eebd6a372904f12b2390a94ede 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -6,7 +6,7 @@ import functools
 import json
 import logging
 import os
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -21,8 +21,8 @@ def apply_w8a8_block_int8_linear(
     weight: torch.Tensor,
     block_size: list[int],
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
+    input_scale: torch.Tensor | None = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     assert input_scale is None
     # View input as 2D matrix for fp8 methods
@@ -30,12 +30,9 @@ def apply_w8a8_block_int8_linear(
     output_shape = [*input.shape[:-1], weight.shape[0]]
 
     q_input, x_scale = per_token_group_quant_int8(input_2d, block_size[1])
-    output = w8a8_block_int8_matmul(q_input,
-                                    weight,
-                                    x_scale,
-                                    weight_scale,
-                                    block_size,
-                                    output_dtype=input.dtype)
+    output = w8a8_block_int8_matmul(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+    )
 
     if bias is not None:
         output = output + bias
@@ -43,8 +40,8 @@ def apply_w8a8_block_int8_linear(
 
 
 def input_to_int8(
-        x: torch.Tensor,
-        dtype: torch.dtype = torch.int8) -> tuple[torch.Tensor, torch.Tensor]:
+    x: torch.Tensor, dtype: torch.dtype = torch.int8
+) -> tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to int8 values with
     tensor-wise quantization."""
     iinfo = torch.iinfo(dtype)
@@ -78,8 +75,8 @@ def block_dequant(
     for i in range(k_tiles):
         for j in range(n_tiles):
             x_dq_block[
-                j * block_n:min((j + 1) * block_n, n),
-                i * block_k:min((i + 1) * block_k, k),
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
             ] *= x_s[j][i]
 
     return x_dq_block
@@ -91,15 +88,17 @@ if current_platform.is_rocm():
     # NOTE: This can be removed when hip.libdevice.round() is available.
     @core.extern
     def round_f32(arg0, _builder=None):
-        return core.extern_elementwise("",
-                                       "", [arg0], {
-                                           (core.dtype("fp32"), ):
-                                           ("llvm.round", core.dtype("fp32")),
-                                           (core.dtype("fp64"), ):
-                                           ("llvm.round", core.dtype("fp64")),
-                                       },
-                                       is_pure=True,
-                                       _builder=_builder)
+        return core.extern_elementwise(
+            "",
+            "",
+            [arg0],
+            {
+                (core.dtype("fp32"),): ("llvm.round", core.dtype("fp32")),
+                (core.dtype("fp64"),): ("llvm.round", core.dtype("fp64")),
+            },
+            is_pure=True,
+            _builder=_builder,
+        )
 
     @triton.jit
     def round_int8(x):
@@ -127,8 +126,7 @@ def _per_token_quant_int8(
     cols = tl.arange(0, BLOCK)
     mask = cols < N
 
-    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask,
-                other=0.0).to(tl.float32)
+    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
     absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
     scale_x = absmax / 127
     x_q = x * (127 / absmax)
@@ -142,15 +140,13 @@ def per_token_quant_int8(x):
     M = x.numel() // x.shape[-1]
     N = x.shape[-1]
     x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
-    scales = torch.empty(x.shape[:-1] + (1, ),
-                         device=x.device,
-                         dtype=torch.float32)
+    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32)
     BLOCK = triton.next_power_of_2(N)
     # heuristics for number of warps
     num_warps = min(max(BLOCK // 256, 1), 8)
 
     assert x.is_contiguous()
-    _per_token_quant_int8[(M, )](
+    _per_token_quant_int8[(M,)](
         x,
         x_q,
         scales,
@@ -229,8 +225,9 @@ def per_token_group_quant_int8(
         tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
             scaling factor for quantization.
     """
-    assert (x.shape[-1] % group_size == 0
-            ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.shape[-1] % group_size == 0, (
+        "the last dimension of `x` cannot be divisible by `group_size`"
+    )
     assert x.is_contiguous(), "`x` is not contiguous"
 
     iinfo = torch.iinfo(dtype)
@@ -239,15 +236,15 @@ def per_token_group_quant_int8(
 
     x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     x_s = torch.empty(
-        x.shape[:-1] + (x.shape[-1] // group_size, ),
+        x.shape[:-1] + (x.shape[-1] // group_size,),
         device=x.device,
         dtype=torch.float32,
     )
     # prefer CUDA kernel if available
     if current_platform.is_cuda():
-        torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
-                                                float(int8_min),
-                                                float(int8_max))
+        torch.ops._C.per_token_group_quant_int8(
+            x, x_q, x_s, group_size, eps, float(int8_min), float(int8_max)
+        )
         return x_q, x_s
 
     M = x.numel() // group_size
@@ -257,7 +254,7 @@ def per_token_group_quant_int8(
     # heuristics for number of warps
     num_warps = min(max(BLOCK // 256, 1), 8)
     num_stages = 1
-    _per_token_group_quant_int8[(M, )](
+    _per_token_group_quant_int8[(M,)](
         x,
         x_q,
         x_s,
@@ -333,20 +330,15 @@ def _w8a8_block_int8_matmul(
 
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        a = tl.load(a_ptrs,
-                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
 
         k_start = k * BLOCK_SIZE_K
         offs_ks = k_start // group_k
         a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
         b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
 
-        accumulator += tl.dot(a, b).to(tl.float32) * a_s[:,
-                                                         None] * b_s[None, :]
+        accumulator += tl.dot(a, b).to(tl.float32) * a_s[:, None] * b_s[None, :]
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
 
@@ -365,8 +357,9 @@ def _w8a8_block_int8_matmul(
 
 
 @functools.lru_cache
-def get_w8a8_block_int8_configs(N: int, K: int, block_n: int,
-                                block_k: int) -> Optional[dict[int, Any]]:
+def get_w8a8_block_int8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> dict[int, Any] | None:
     """
     Return optimized configurations for the w8a8 block fp8 kernel.
 
@@ -382,7 +375,8 @@ def get_w8a8_block_int8_configs(N: int, K: int, block_n: int,
     json_file_name = f"N={N},K={K},device_name={device_name},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json"  # noqa: E501
 
     config_file_path = os.path.join(
-        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
             logger.info(
@@ -395,8 +389,10 @@ def get_w8a8_block_int8_configs(N: int, K: int, block_n: int,
     # If no optimized configuration is available, we will use the default
     # configuration
     logger.warning(
-        ("Using default W8A8 Block INT8 kernel config. Performance might "
-         "be sub-optimal! Config file not found at %s"),
+        (
+            "Using default W8A8 Block INT8 kernel config. Performance might "
+            "be sub-optimal! Config file not found at %s"
+        ),
         config_file_path,
     )
     return None
@@ -441,7 +437,7 @@ def w8a8_block_int8_matmul(
     assert triton.cdiv(N, block_n) == Bs.shape[0]
     assert triton.cdiv(K, block_k) == Bs.shape[1]
 
-    C_shape = A.shape[:-1] + (N, )
+    C_shape = A.shape[:-1] + (N,)
     C = A.new_empty(C_shape, dtype=output_dtype)
 
     configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1])
@@ -462,8 +458,9 @@ def w8a8_block_int8_matmul(
         }
 
     def grid(META):
-        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
-                triton.cdiv(N, META["BLOCK_SIZE_N"]), )
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
 
     _w8a8_block_int8_matmul[grid](
         A,
diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py
index fbc0f23acb59ae6769ffb8925954d1fb38754f09..3b8c9a8b6ca1fc38e834e5876135b4b36b1e717f 100644
--- a/vllm/model_executor/layers/quantization/utils/layer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Union
 
 import torch
 
@@ -20,12 +19,15 @@ def update_tensor_inplace(dst: torch.Tensor, src: torch.Tensor):
 
 # Newly generated tensors need to replace existing tensors that are
 # already registered as parameters by vLLM (and won't be freed)
-def replace_parameter(mod: torch.nn.Module, name: str,
-                      new: Union[torch.Tensor, torch.nn.Parameter]) -> None:
-
+def replace_parameter(
+    mod: torch.nn.Module, name: str, new: torch.Tensor | torch.nn.Parameter
+) -> None:
     old = getattr(mod, name)
-    if type(old) is type(new) and old.dtype == new.dtype and \
-        old.untyped_storage().nbytes() == new.untyped_storage().nbytes():
+    if (
+        type(old) is type(new)
+        and old.dtype == new.dtype
+        and old.untyped_storage().nbytes() == new.untyped_storage().nbytes()
+    ):
         # If we can just update in-place to avoid re-registering
         #   can be faster if the underlying storage is the same
         update_tensor_inplace(old, new)
@@ -36,5 +38,4 @@ def replace_parameter(mod: torch.nn.Module, name: str,
         # parameters for `torch.compile` compatibility
         if not isinstance(new, torch.nn.Parameter):
             new = torch.nn.Parameter(new, requires_grad=False)
-        mod.register_parameter(name,
-                               torch.nn.Parameter(new, requires_grad=False))
+        mod.register_parameter(name, torch.nn.Parameter(new, requires_grad=False))
diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py
index fbb850d227765911204916a1b05863e1f86bfbcb..ccfcdac1ec0fec4c2b3844a3d19f1da2a2342262 100644
--- a/vllm/model_executor/layers/quantization/utils/machete_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -39,12 +38,19 @@ def query_machete_supported_group_sizes(act_type: torch.dtype) -> list[int]:
         return [-1, 128]
 
 
-def check_machete_supports_shape(in_features: int, out_featrues: int) \
-    -> tuple[bool, Optional[str]]:
+def check_machete_supports_shape(
+    in_features: int, out_featrues: int
+) -> tuple[bool, str | None]:
     if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0:
-        return False, "Input features size must be divisible by "\
-            f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}"
+        return (
+            False,
+            "Input features size must be divisible by "
+            f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}",
+        )
     if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0:
-        return False, "Output features size must be divisible by "\
-            f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}"
+        return (
+            False,
+            "Output features size must be divisible by "
+            f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}",
+        )
     return True, None
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 317ad079b392df967c02b697c39bc5d8c4b9f404..071fb4ba16867456720425bd382c8184079e7f2b 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import numpy
 import torch
@@ -34,14 +33,15 @@ USE_FP32_REDUCE_DEFAULT = True
 #  without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
 #  TODO: we may want to move this into the C++ so its closer to the actual impl
 def query_marlin_supported_quant_types(
-    has_zp: Optional[bool] = None,
+    has_zp: bool | None = None,
     include_fp_type: bool = True,
-    device_capability: Optional[int] = None,
+    device_capability: int | None = None,
 ):
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
-        device_capability = (-1 if capability_tuple is None else
-                             capability_tuple.to_int())
+        device_capability = (
+            -1 if capability_tuple is None else capability_tuple.to_int()
+        )
 
     if device_capability < 80:
         return []
@@ -50,10 +50,12 @@ def query_marlin_supported_quant_types(
     # - has_zp is False: return quant_types that has not zero points
     # - has_zp is None: both
     if has_zp is None:
-        types0 = query_marlin_supported_quant_types(False, include_fp_type,
-                                                    device_capability)
-        types1 = query_marlin_supported_quant_types(True, include_fp_type,
-                                                    device_capability)
+        types0 = query_marlin_supported_quant_types(
+            False, include_fp_type, device_capability
+        )
+        types1 = query_marlin_supported_quant_types(
+            True, include_fp_type, device_capability
+        )
         return types0 + types1
 
     if has_zp:
@@ -68,108 +70,126 @@ def query_marlin_supported_quant_types(
 
 
 def _check_marlin_supported(
-        quant_type: ScalarType,
-        group_size: Optional[int],
-        has_zp: bool,
-        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:
-
+    quant_type: ScalarType,
+    group_size: int | None,
+    has_zp: bool,
+    device_capability: int | None = None,
+) -> tuple[bool, str | None]:
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
-        device_capability = (-1 if capability_tuple is None else
-                             capability_tuple.to_int())
+        device_capability = (
+            -1 if capability_tuple is None else capability_tuple.to_int()
+        )
 
     supported_types = query_marlin_supported_quant_types(
-        has_zp, True, device_capability)
+        has_zp, True, device_capability
+    )
 
     if quant_type not in supported_types:
-        return (False, f"Marlin does not support weight_bits = {quant_type}. "
-                f"Only types = {supported_types} "
-                f"are supported (for group_size = {group_size}, "
-                f"device_capability = {device_capability}, zp = {has_zp}).")
-    if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
-        return (False, f"Marlin does not support group_size = {group_size}. "
-                f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
-                "are supported.")
+        return (
+            False,
+            f"Marlin does not support weight_bits = {quant_type}. "
+            f"Only types = {supported_types} "
+            f"are supported (for group_size = {group_size}, "
+            f"device_capability = {device_capability}, zp = {has_zp}).",
+        )
+    if group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
+        return (
+            False,
+            f"Marlin does not support group_size = {group_size}. "
+            f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+            "are supported.",
+        )
 
     return True, None
 
 
-def check_marlin_supported(quant_type: ScalarType,
-                           group_size: int,
-                           has_zp: bool = False,
-                           device_capability: Optional[int] = None) -> bool:
-    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
-                                      device_capability)
+def check_marlin_supported(
+    quant_type: ScalarType,
+    group_size: int,
+    has_zp: bool = False,
+    device_capability: int | None = None,
+) -> bool:
+    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
     return cond
 
 
-def verify_marlin_supported(quant_type: ScalarType,
-                            group_size: int,
-                            has_zp: bool = False) -> None:
+def verify_marlin_supported(
+    quant_type: ScalarType, group_size: int, has_zp: bool = False
+) -> None:
     cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
     if not cond:
         assert err_msg is not None
         raise ValueError(err_msg)
 
 
-def verify_marlin_supports_shape(output_size_per_partition: int,
-                                 input_size_per_partition: int,
-                                 input_size: int, group_size: int) -> None:
-
+def verify_marlin_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> None:
     # Validate output_size_per_partition
     if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
-        raise ValueError(f"Weight output_size_per_partition = "
-                         f"{output_size_per_partition} is not divisible by "
-                         f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
-                         "Consider reducing tensor_parallel_size or running "
-                         "with --quantization gptq.")
+        raise ValueError(
+            f"Weight output_size_per_partition = "
+            f"{output_size_per_partition} is not divisible by "
+            f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
 
     # Validate input_size_per_partition
     if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
-        raise ValueError(f"Weight input_size_per_partition = "
-                         f"{input_size_per_partition} is not divisible "
-                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
-                         "Consider reducing tensor_parallel_size or running "
-                         "with --quantization gptq.")
-
-    if (group_size < input_size
-            and input_size_per_partition % group_size != 0):
+        raise ValueError(
+            f"Weight input_size_per_partition = "
+            f"{input_size_per_partition} is not divisible "
+            f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+    if group_size < input_size and input_size_per_partition % group_size != 0:
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition}"
             f" is not divisible by group_size = {group_size}. "
             "Consider reducing tensor_parallel_size or running "
-            "with --quantization gptq.")
+            "with --quantization gptq."
+        )
 
 
-def check_marlin_supports_shape(output_size_per_partition: int,
-                                input_size_per_partition: int,
-                                input_size: int, group_size: int) \
-                                    -> tuple[bool, Optional[str]]:
+def check_marlin_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> tuple[bool, str | None]:
     try:
-        verify_marlin_supports_shape(output_size_per_partition,
-                                     input_size_per_partition, input_size,
-                                     group_size)
+        verify_marlin_supports_shape(
+            output_size_per_partition, input_size_per_partition, input_size, group_size
+        )
     except ValueError as e:
         return False, e.__str__()
     return True, None
 
 
-def check_marlin_supports_layer(layer: LinearBase, group_size: int) \
-                                    -> bool:
-    output_size_per_partition = getattr(layer, "output_size_per_partition",
-                                        None) or layer.output_size
-    input_size_per_partition = getattr(layer, "input_size_per_partition",
-                                       None) or layer.input_size
+def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
+    output_size_per_partition = (
+        getattr(layer, "output_size_per_partition", None) or layer.output_size
+    )
+    input_size_per_partition = (
+        getattr(layer, "input_size_per_partition", None) or layer.input_size
+    )
 
     return check_marlin_supports_shape(
         output_size_per_partition=output_size_per_partition,
         input_size_per_partition=input_size_per_partition,
         input_size=layer.input_size,
-        group_size=group_size)[0]
+        group_size=group_size,
+    )[0]
 
 
-def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
-                                    -> bool:
+def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
     hidden_size = layer.hidden_size
     intermediate_size_per_partition = layer.intermediate_size_per_partition
     # apply_router_weight_on_input is not supported for moe marlin
@@ -180,41 +200,58 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
     # gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
     # down: (n, k) = (hidden_size, intermediate_size_per_partition)
     # moe marlin requires n % 128 == 0 and k % 64 == 0
-    supports_shape = hidden_size % 128 == 0 and \
-        intermediate_size_per_partition % max(64, group_size) == 0
+    supports_shape = (
+        hidden_size % 128 == 0
+        and intermediate_size_per_partition % max(64, group_size) == 0
+    )
     supports_group_size = group_size in [-1, 32, 64, 128]
-    return supports_shape and supports_group_size and \
-        supports_router_weight and supports_activation
+    return (
+        supports_shape
+        and supports_group_size
+        and supports_router_weight
+        and supports_activation
+    )
 
 
-def marlin_make_workspace(output_size_per_partition: int,
-                          device: torch.device) -> torch.Tensor:
-    max_workspace_size = (output_size_per_partition //
-                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
+def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tensor):
+    """
+    Given Marlin packed weight matrices w1_packed, and w2_packed,
+    return the MoE intermediate size N
+    """
+    marlin_tile_size = 16
+    return w2_packed.size(1) * marlin_tile_size
 
-    return torch.zeros(max_workspace_size,
-                       dtype=torch.int,
-                       device=device,
-                       requires_grad=False)
 
+def marlin_make_workspace(
+    output_size_per_partition: int, device: torch.device
+) -> torch.Tensor:
+    max_workspace_size = (
+        output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
+    ) * GPTQ_MARLIN_MAX_PARALLEL
 
-def marlin_make_workspace_new(device: torch.device,
-                              max_blocks_per_sm: int = 1) -> torch.Tensor:
+    return torch.zeros(
+        max_workspace_size, dtype=torch.int, device=device, requires_grad=False
+    )
+
+
+def marlin_make_workspace_new(
+    device: torch.device, max_blocks_per_sm: int = 1
+) -> torch.Tensor:
     # In the new marlin kernel, we use the num of threadblocks as workspace
     # size. The num of threadblocks is sms_count * max_blocks_per_sm.
     sms = torch.cuda.get_device_properties(device).multi_processor_count
-    return torch.zeros(sms * max_blocks_per_sm,
-                       dtype=torch.int,
-                       device=device,
-                       requires_grad=False)
+    return torch.zeros(
+        sms * max_blocks_per_sm, dtype=torch.int, device=device, requires_grad=False
+    )
 
 
 def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
     return (not act_order) or (act_order and not is_row_parallel)
 
 
-def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
-                                      is_row_parallel: bool) -> bool:
+def marlin_repeat_scales_on_all_ranks(
+    act_order: bool, group_size: int, is_row_parallel: bool
+) -> bool:
     # Need to repeat scales on every rank if act_ordering or
     # channelwise and RowParallelLinear
     is_channelwise = group_size == -1
@@ -222,17 +259,18 @@ def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
 
 
 def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
-                              requires_grad=False)
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
 
 
 def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
-                              requires_grad=False)
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
 
 
-def marlin_sort_g_idx(
-        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices
 
@@ -243,14 +281,13 @@ def get_scale_perms():
         scale_perm.extend([i + 8 * j for j in range(8)])
     scale_perm_single: list[int] = []
     for i in range(4):
-        scale_perm_single.extend(
-            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+        scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return scale_perm, scale_perm_single
 
 
-def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
-                          group_size: int) -> torch.Tensor:
-
+def marlin_permute_scales(
+    s: torch.Tensor, size_k: int, size_n: int, group_size: int
+) -> torch.Tensor:
     scale_perm, scale_perm_single = get_scale_perms()
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
@@ -286,8 +323,9 @@ def marlin_moe_permute_scales(
     return output
 
 
-def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
-                       num_bits: int) -> torch.Tensor:
+def marlin_zero_points(
+    zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
     # "single" permutation, since zero-points are applied on every MMA
     scale_perm, _ = get_scale_perms()
@@ -308,8 +346,9 @@ def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
     return zp
 
 
-def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
-                              size_n: int, num_bits: int) -> torch.Tensor:
+def awq_to_marlin_zero_points(
+    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
     # AWQ zero-points are quantized and packed on the column dim.
     # In addition, the values are permuted based on dequantizer.
     # Here we undo both of these, and then apply marlin permutation
@@ -331,8 +370,9 @@ def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
     return marlin_zp
 
 
-def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
-                                  size_n: int, num_bits: int):
+def moe_awq_to_marlin_zero_points(
+    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
+):
     num_experts = q_zp_packed.shape[0]
     output = torch.empty(
         (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
@@ -340,8 +380,7 @@ def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
         dtype=q_zp_packed.dtype,
     )
     for e in range(num_experts):
-        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
-                                              num_bits)
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n, num_bits)
     return output
 
 
@@ -353,7 +392,8 @@ def maybe_warn_marlin_atomic_add(device, dtype):
         logger.info_once(
             "You are running Marlin kernel with bf16 on GPUs before SM90. "
             "You can consider change to fp16 to achieve better performance "
-            "if possible.")
+            "if possible."
+        )
 
 
 def maybe_warn_marlin_atomic_add_env():
@@ -365,12 +405,13 @@ def maybe_warn_marlin_atomic_add_env():
         "Marlin kernel can achieve better performance for small size_n "
         "with experimental use_atomic_add feature. "
         "You can consider set environment variable "
-        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")
-
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible."
+    )
 
-def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
-                                 dtype: torch.dtype) -> bool:
 
+def should_use_atomic_add_reduce(
+    m: int, n: int, k: int, device: torch.device, dtype: torch.dtype
+) -> bool:
     # the performance of atomicAdd is better than global reduce
     # only when m*n is small and k is large
     if n >= 2048 or k < 2048 or device.type != "cuda":
@@ -392,88 +433,143 @@ def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
 
 
 def apply_gptq_marlin_linear(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        weight_zp: torch.Tensor,
-        g_idx: torch.Tensor,
-        g_idx_sort_indices: torch.Tensor,
-        workspace: torch.Tensor,
-        wtype: ScalarType,
-        output_size_per_partition: int,
-        input_size_per_partition: int,
-        is_k_full: bool,
-        bias: Optional[torch.Tensor] = None,
-        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    wtype: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    is_k_full: bool,
+    bias: torch.Tensor | None = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition, )
-
-    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
-                                                  n=output_size_per_partition,
-                                                  k=reshaped_x.size(1),
-                                                  device=input.device,
-                                                  dtype=input.dtype)
-
-    output = ops.gptq_marlin_gemm(reshaped_x,
-                                  None,
-                                  weight,
-                                  bias,
-                                  weight_scale,
-                                  None,
-                                  weight_zp,
-                                  g_idx,
-                                  g_idx_sort_indices,
-                                  workspace,
-                                  wtype,
-                                  size_m=reshaped_x.shape[0],
-                                  size_n=output_size_per_partition,
-                                  size_k=input_size_per_partition,
-                                  is_k_full=is_k_full,
-                                  use_atomic_add=use_atomic_add,
-                                  use_fp32_reduce=use_fp32_reduce,
-                                  is_zp_float=False)
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    output = ops.gptq_marlin_gemm(
+        reshaped_x,
+        None,
+        weight,
+        bias,
+        weight_scale,
+        None,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        wtype,
+        size_m=reshaped_x.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        is_k_full=is_k_full,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
+    )
 
     return output.reshape(out_shape)
 
 
 def apply_awq_marlin_linear(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        weight_zp: torch.Tensor,
-        g_idx: torch.Tensor,
-        g_idx_sort_indices: torch.Tensor,
-        workspace: torch.Tensor,
-        quant_type: ScalarType,
-        output_size_per_partition: int,
-        input_size_per_partition: int,
-        bias: Optional[torch.Tensor] = None,
-        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    quant_type: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    bias: torch.Tensor | None = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition, )
-
-    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
-                                                  n=output_size_per_partition,
-                                                  k=reshaped_x.size(1),
-                                                  device=input.device,
-                                                  dtype=input.dtype)
-
-    output = ops.gptq_marlin_gemm(reshaped_x,
-                                  None,
-                                  weight,
-                                  bias,
-                                  weight_scale,
-                                  None,
-                                  weight_zp,
-                                  g_idx,
-                                  g_idx_sort_indices,
-                                  workspace,
-                                  quant_type,
-                                  size_m=reshaped_x.shape[0],
-                                  size_n=output_size_per_partition,
-                                  size_k=input_size_per_partition,
-                                  use_atomic_add=use_atomic_add,
-                                  use_fp32_reduce=use_fp32_reduce,
-                                  is_zp_float=False)
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    output = ops.gptq_marlin_gemm(
+        reshaped_x,
+        None,
+        weight,
+        bias,
+        weight_scale,
+        None,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        quant_type,
+        size_m=reshaped_x.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
+    )
+
+    return output.reshape(out_shape)
+
+
+def apply_rtn_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    workspace: torch.Tensor,
+    quant_type: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    bias: torch.Tensor | None = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    output = ops.gptq_marlin_gemm(
+        reshaped_x,
+        None,
+        weight,
+        bias,
+        weight_scale,
+        None,
+        None,
+        None,
+        None,
+        workspace,
+        quant_type,
+        size_m=reshaped_x.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
+    )
 
     return output.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index 94ffdcd26ecde1df6f4967cf0935b27689fc7e93..842fb9b62267abe312108301bb999b4171c72eca 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -1,15 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias,
-    marlin_permute_scales, should_use_atomic_add_reduce)
+    USE_FP32_REDUCE_DEFAULT,
+    marlin_make_workspace_new,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    should_use_atomic_add_reduce,
+)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
@@ -28,7 +31,8 @@ def nvfp4_marlin_process_scales(marlin_scales):
             "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
             "negative scales. Accuracy will likely be degraded. This is "
             "because it changes the scales from FP8-S1E4M3 to a special "
-            "FP8-S0E5M3 format to speedup the dequantization.")
+            "FP8-S0E5M3 format to speedup the dequantization."
+        )
 
     # convert to half first, we would convert to fp8 later
     marlin_scales = marlin_scales.to(torch.half)
@@ -36,11 +40,13 @@ def nvfp4_marlin_process_scales(marlin_scales):
     # 8 is the number of scale number using by one thread
     marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
     marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
-        marlin_scales.size(0) * 2, -1)
+        marlin_scales.size(0) * 2, -1
+    )
 
     # fit the layout of fp8 dequantization
     marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
-        marlin_scales.size(0), -1)
+        marlin_scales.size(0), -1
+    )
 
     # We assume that weight_scale (FP8-S1E4M3) is always greater
     # than or equal to 0. So we can convert
@@ -60,11 +66,13 @@ def mxfp4_marlin_process_scales(marlin_scales):
     # 8 is the number of scale number using by one thread
     marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
     marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
-        marlin_scales.size(0) * 2, -1)
+        marlin_scales.size(0) * 2, -1
+    )
 
     # fit the layout of fp8 dequantization
     marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
-        marlin_scales.size(0), -1)
+        marlin_scales.size(0), -1
+    )
     marlin_scales = marlin_scales.to(torch.float8_e8m0fnu)
     return marlin_scales
 
@@ -78,48 +86,49 @@ def nvfp4_marlin_process_global_scale(global_scale):
         target_exponent = 8
     # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
     # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
-    exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
-    return global_scale * (2.0**(exponent_bias - 7))
+    exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp4_exponent - 1)
+    return global_scale * (2.0 ** (exponent_bias - 7))
 
 
 def apply_fp4_marlin_linear(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        weight_scale_2: Optional[torch.Tensor],
-        workspace: torch.Tensor,
-        size_n: int,
-        size_k: int,
-        bias: Optional[torch.Tensor] = None,
-        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_scale_2: torch.Tensor | None,
+    workspace: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: torch.Tensor | None = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
     # For GPUs that lack FP4 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP4 quantization
 
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n, )
-
-    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
-                                                  n=size_n,
-                                                  k=size_k,
-                                                  device=input.device,
-                                                  dtype=input.dtype)
-
-    output = ops.gptq_marlin_gemm(a=reshaped_x,
-                                  c=None,
-                                  b_q_weight=weight,
-                                  b_bias=bias,
-                                  b_scales=weight_scale,
-                                  global_scale=weight_scale_2,
-                                  b_zeros=None,
-                                  g_idx=None,
-                                  perm=None,
-                                  workspace=workspace,
-                                  b_q_type=scalar_types.float4_e2m1f,
-                                  size_m=reshaped_x.size(0),
-                                  size_n=size_n,
-                                  size_k=size_k,
-                                  use_atomic_add=use_atomic_add,
-                                  use_fp32_reduce=use_fp32_reduce)
+    out_shape = input.shape[:-1] + (size_n,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0), n=size_n, k=size_k, device=input.device, dtype=input.dtype
+    )
+
+    output = ops.gptq_marlin_gemm(
+        a=reshaped_x,
+        c=None,
+        b_q_weight=weight,
+        b_bias=bias,
+        b_scales=weight_scale,
+        global_scale=weight_scale_2,
+        b_zeros=None,
+        g_idx=None,
+        perm=None,
+        workspace=workspace,
+        b_q_type=scalar_types.float4_e2m1f,
+        size_m=reshaped_x.size(0),
+        size_n=size_n,
+        size_k=size_k,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+    )
 
     return output.reshape(out_shape)
 
@@ -129,7 +138,8 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
         "Your GPU does not have native support for FP4 computation but "
         "FP4 quantization is being used. Weight-only FP4 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads.")
+        "performance for compute-heavy workloads."
+    )
 
     is_nvfp4 = hasattr(layer, "weight_scale_2")
     group_size = 16 if is_nvfp4 else 32
@@ -150,11 +160,13 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
     perm = torch.empty(0, dtype=torch.int, device=device)
     qweight = layer.weight.view(torch.int32).T.contiguous()
 
-    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
-                                            perm=perm,
-                                            size_k=part_size_k,
-                                            size_n=part_size_n,
-                                            num_bits=4)
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=qweight,
+        perm=perm,
+        size_k=part_size_k,
+        size_n=part_size_n,
+        num_bits=4,
+    )
     layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
 
     # WEIGHT SCALES
@@ -165,27 +177,23 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
         weight_scale = weight_scale.view(torch.float8_e8m0fnu)
 
     weight_scale = weight_scale.to(param_dtype)
-    weight_scale = marlin_permute_scales(s=weight_scale,
-                                         size_k=part_size_k,
-                                         size_n=part_size_n,
-                                         group_size=group_size)
+    weight_scale = marlin_permute_scales(
+        s=weight_scale, size_k=part_size_k, size_n=part_size_n, group_size=group_size
+    )
 
     if is_nvfp4:
         weight_scale = nvfp4_marlin_process_scales(weight_scale)
-        layer.weight_scale = torch.nn.Parameter(weight_scale,
-                                                requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
 
         weight_scale_2 = layer.weight_scale_2.to(param_dtype)
         weight_scale_2 = nvfp4_marlin_process_global_scale(weight_scale_2)
-        layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
-                                                  requires_grad=False)
+        layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2, requires_grad=False)
     else:
         weight_scale = mxfp4_marlin_process_scales(weight_scale)
-        layer.weight_scale = torch.nn.Parameter(weight_scale,
-                                                requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
 
     if hasattr(layer, "bias") and layer.bias is not None:
-        assert layer.bias.shape == (part_size_n, )
+        assert layer.bias.shape == (part_size_n,)
         bias = marlin_permute_bias(layer.bias)
         layer.bias = torch.nn.Parameter(bias, requires_grad=False)
 
@@ -197,7 +205,8 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
         "Your GPU does not have native support for FP4 computation but "
         "FP4 quantization is being used. Weight-only FP4 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads.")
+        "performance for compute-heavy workloads."
+    )
 
     is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
     group_size = 16 if is_nvfp4 else 32
@@ -227,11 +236,9 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
         for i in range(e):
             qweight = weight[i].view(torch.int32).T.contiguous()
 
-            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
-                                                    perm=perm,
-                                                    size_k=size_k,
-                                                    size_n=size_n,
-                                                    num_bits=4)
+            marlin_qweight = ops.gptq_marlin_repack(
+                b_q_weight=qweight, perm=perm, size_k=size_k, size_n=size_n, num_bits=4
+            )
             tensor_list.append(marlin_qweight)
 
         weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
@@ -247,8 +254,7 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
             scales = scales.view(torch.float8_e8m0fnu)
         scales = scales.to(param_dtype)
         if is_nvfp4:
-            global_scale = getattr(layer,
-                                   name + "_weight_scale_2").to(param_dtype)
+            global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)
 
         tensor_list = []
         if "w13" in name:
@@ -259,10 +265,9 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
         for i in range(e):
             scale = scales[i].T
 
-            marlin_scales = marlin_permute_scales(s=scale,
-                                                  size_k=size_k,
-                                                  size_n=size_n,
-                                                  group_size=group_size)
+            marlin_scales = marlin_permute_scales(
+                s=scale, size_k=size_k, size_n=size_n, group_size=group_size
+            )
             if is_nvfp4:
                 marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
             else:
@@ -275,8 +280,7 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
 
         if is_nvfp4:
             global_scale = nvfp4_marlin_process_global_scale(global_scale)
-            global_scale = torch.nn.Parameter(global_scale,
-                                              requires_grad=False)
+            global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
             setattr(layer, name + "_weight_scale_2", global_scale)
 
     # BIAS
@@ -306,26 +310,26 @@ def rand_marlin_weight_nvfp4_like(weight, group_size):
     global_scale = scales.max() / 448
     scales = (scales / global_scale).to(torch.float8_e4m3fn)
 
-    fp4_weight = torch.randint(0,
-                               256, (size_n, size_k // 2),
-                               dtype=torch.uint8,
-                               device=weight.device)
-    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
-                         ((fp4_weight & 0b01110000) >> 2))
+    fp4_weight = torch.randint(
+        0, 256, (size_n, size_k // 2), dtype=torch.uint8, device=weight.device
+    )
+    fp4_weight_part_1 = (fp4_weight & 0b10000000) | ((fp4_weight & 0b01110000) >> 2)
     fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
     fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)
 
     fp4_weight2 = fp4_weight << 4
-    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
-                         ((fp4_weight2 & 0b01110000) >> 2))
+    fp4_weight_part_2 = (fp4_weight2 & 0b10000000) | ((fp4_weight2 & 0b01110000) >> 2)
     fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
     fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)
 
     weight_ref = torch.cat(
-        [fp4_weight_part_2.unsqueeze(2),
-         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
-    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
-        scales.repeat_interleave(group_size, 1).to(weight.dtype)
+        [fp4_weight_part_2.unsqueeze(2), fp4_weight_part_1.unsqueeze(2)], 2
+    ).view(size_n, size_k)
+    weight_ref = (
+        weight_ref
+        * global_scale.to(weight.dtype)
+        * scales.repeat_interleave(group_size, 1).to(weight.dtype)
+    )
 
     marlin_qweight = ops.gptq_marlin_repack(
         b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
@@ -335,10 +339,9 @@ def rand_marlin_weight_nvfp4_like(weight, group_size):
         num_bits=4,
     )
 
-    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
-                                          size_k=size_k,
-                                          size_n=size_n,
-                                          group_size=group_size)
+    marlin_scales = marlin_permute_scales(
+        s=scales.T.to(weight.dtype), size_k=size_k, size_n=size_n, group_size=group_size
+    )
     marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
 
     global_scale = nvfp4_marlin_process_global_scale(global_scale)
@@ -351,32 +354,31 @@ def rand_marlin_weight_mxfp4_like(weight, group_size):
     size_n, size_k = weight.shape
     device = weight.device
 
-    scales = torch.randint(100,
-                           125, (size_n, size_k // group_size),
-                           dtype=torch.uint8,
-                           device=weight.device)
+    scales = torch.randint(
+        100,
+        125,
+        (size_n, size_k // group_size),
+        dtype=torch.uint8,
+        device=weight.device,
+    )
     scales = scales.view(torch.float8_e8m0fnu)
 
-    fp4_weight = torch.randint(0,
-                               256, (size_n, size_k // 2),
-                               dtype=torch.uint8,
-                               device=weight.device)
-    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
-                         ((fp4_weight & 0b01110000) >> 2))
+    fp4_weight = torch.randint(
+        0, 256, (size_n, size_k // 2), dtype=torch.uint8, device=weight.device
+    )
+    fp4_weight_part_1 = (fp4_weight & 0b10000000) | ((fp4_weight & 0b01110000) >> 2)
     fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
     fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)
 
     fp4_weight2 = fp4_weight << 4
-    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
-                         ((fp4_weight2 & 0b01110000) >> 2))
+    fp4_weight_part_2 = (fp4_weight2 & 0b10000000) | ((fp4_weight2 & 0b01110000) >> 2)
     fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
     fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)
 
     weight_ref = torch.cat(
-        [fp4_weight_part_2.unsqueeze(2),
-         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
-    weight_ref = weight_ref * \
-        scales.repeat_interleave(group_size, 1).to(weight.dtype)
+        [fp4_weight_part_2.unsqueeze(2), fp4_weight_part_1.unsqueeze(2)], 2
+    ).view(size_n, size_k)
+    weight_ref = weight_ref * scales.repeat_interleave(group_size, 1).to(weight.dtype)
 
     marlin_qweight = ops.gptq_marlin_repack(
         b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
@@ -386,10 +388,9 @@ def rand_marlin_weight_mxfp4_like(weight, group_size):
         num_bits=4,
     )
 
-    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
-                                          size_k=size_k,
-                                          size_n=size_n,
-                                          group_size=group_size)
+    marlin_scales = marlin_permute_scales(
+        s=scales.T.to(weight.dtype), size_k=size_k, size_n=size_n, group_size=group_size
+    )
 
     marlin_scales = mxfp4_marlin_process_scales(marlin_scales)
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 511e19545d5ae32bca03cfa09c7ef7692da9a799..8c96848a853974a6d68ccb370bd5a011d5b751e8 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -1,15 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
 import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias,
-    marlin_permute_scales, should_use_atomic_add_reduce)
+    USE_FP32_REDUCE_DEFAULT,
+    marlin_make_workspace_new,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    should_use_atomic_add_reduce,
+)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
@@ -28,60 +31,63 @@ def fp8_fused_exponent_bias_into_scales(scales):
         target_exponent = 8
     # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
     # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
-    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
+    exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp8_exponent - 1)
     s = torch.ones_like(scales) * 2
     s = s**exponent_bias
     return scales * s
 
 
 def apply_fp8_marlin_linear(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        weight_scale: torch.Tensor,
-        workspace: torch.Tensor,
-        size_n: int,
-        size_k: int,
-        bias: Optional[torch.Tensor],
-        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    workspace: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: torch.Tensor | None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP8 quantization
 
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n, )
-
-    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
-                                                  n=size_n,
-                                                  k=size_k,
-                                                  device=input.device,
-                                                  dtype=input.dtype)
-
-    output = ops.gptq_marlin_gemm(a=reshaped_x,
-                                  c=None,
-                                  b_q_weight=weight,
-                                  b_bias=bias,
-                                  b_scales=weight_scale,
-                                  global_scale=None,
-                                  b_zeros=None,
-                                  g_idx=None,
-                                  perm=None,
-                                  workspace=workspace,
-                                  b_q_type=scalar_types.float8_e4m3fn,
-                                  size_m=reshaped_x.size(0),
-                                  size_n=size_n,
-                                  size_k=size_k,
-                                  use_atomic_add=use_atomic_add,
-                                  use_fp32_reduce=use_fp32_reduce)
+    out_shape = input.shape[:-1] + (size_n,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0), n=size_n, k=size_k, device=input.device, dtype=input.dtype
+    )
+
+    output = ops.gptq_marlin_gemm(
+        a=reshaped_x,
+        c=None,
+        b_q_weight=weight,
+        b_bias=bias,
+        b_scales=weight_scale,
+        global_scale=None,
+        b_zeros=None,
+        g_idx=None,
+        perm=None,
+        workspace=workspace,
+        b_q_type=scalar_types.float8_e4m3fn,
+        size_m=reshaped_x.size(0),
+        size_n=size_n,
+        size_k=size_k,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+    )
 
     return output.reshape(out_shape)
 
 
-def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
-                                 size_k_first: bool = True) -> None:
+def prepare_fp8_layer_for_marlin(
+    layer: torch.nn.Module, size_k_first: bool = True
+) -> None:
     logger.warning_once(
         "Your GPU does not have native support for FP8 computation but "
         "FP8 quantization is being used. Weight-only FP8 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads.")
+        "performance for compute-heavy workloads."
+    )
 
     part_size_n = layer.output_size_per_partition
     part_size_k = layer.input_size_per_partition
@@ -104,11 +110,13 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
     if not size_k_first:
         qweight = qweight.T.contiguous()
 
-    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
-                                            perm=perm,
-                                            size_k=part_size_k,
-                                            size_n=part_size_n,
-                                            num_bits=8)
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=qweight,
+        perm=perm,
+        size_k=part_size_k,
+        size_n=part_size_n,
+        num_bits=8,
+    )
     layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
 
     # WEIGHT SCALES
@@ -151,26 +159,27 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
         # size_n may not divisible by block_size[0]
         scales = scales[:, :part_size_n]
 
-    marlin_scales = marlin_permute_scales(s=scales,
-                                          size_k=part_size_k,
-                                          size_n=part_size_n,
-                                          group_size=group_size)
+    marlin_scales = marlin_permute_scales(
+        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=group_size
+    )
     marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
     layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
 
     if hasattr(layer, "bias") and layer.bias is not None:
-        assert layer.bias.shape == (part_size_n, )
+        assert layer.bias.shape == (part_size_n,)
         bias = marlin_permute_bias(layer.bias)
         layer.bias = torch.nn.Parameter(bias, requires_grad=False)
 
 
-def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
-                                     size_k_first: bool = True) -> None:
+def prepare_moe_fp8_layer_for_marlin(
+    layer: torch.nn.Module, size_k_first: bool = True
+) -> None:
     logger.warning_once(
         "Your GPU does not have native support for FP8 computation but "
         "FP8 quantization is being used. Weight-only FP8 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads.")
+        "performance for compute-heavy workloads."
+    )
 
     e = layer.num_experts
     k = layer.hidden_size
@@ -202,11 +211,9 @@ def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
             if not size_k_first:
                 qweight = qweight.T.contiguous()
 
-            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
-                                                    perm=perm,
-                                                    size_k=size_k,
-                                                    size_n=size_n,
-                                                    num_bits=8)
+            marlin_qweight = ops.gptq_marlin_repack(
+                b_q_weight=qweight, perm=perm, size_k=size_k, size_n=size_n, num_bits=8
+            )
             tensor_list.append(marlin_qweight)
 
         weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
@@ -265,10 +272,9 @@ def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
             scales = scales[..., :size_n].contiguous()
 
         for i in range(e):
-            marlin_scales = marlin_permute_scales(s=scales[i],
-                                                  size_k=size_k,
-                                                  size_n=size_n,
-                                                  group_size=group_size)
+            marlin_scales = marlin_permute_scales(
+                s=scales[i], size_k=size_k, size_n=size_n, group_size=group_size
+            )
             tensor_list.append(marlin_scales)
 
         scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
@@ -295,8 +301,9 @@ def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
         setattr(layer, name, bias)
 
 
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
-                      size_k_first: bool = True) -> torch.Tensor:
+def pack_fp8_to_int32(
+    fp8_tensor: torch.Tensor, size_k_first: bool = True
+) -> torch.Tensor:
     """
     Repack FP8 weights to gptq format (packed int32 elements)
     """
@@ -335,10 +342,9 @@ def marlin_quant_fp8_torch(weight, group_size):
         num_bits=8,
     )
 
-    marlin_scales = marlin_permute_scales(s=scales.T,
-                                          size_k=size_k,
-                                          size_n=size_n,
-                                          group_size=group_size)
+    marlin_scales = marlin_permute_scales(
+        s=scales.T, size_k=size_k, size_n=size_n, group_size=group_size
+    )
 
     marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index f5acd03cc66227720a1775b60a331da6e4349361..89756c45ef556e097dd4965a5a97d2ed96f5ae3f 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -2,31 +2,31 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility functions used for tests and benchmarks"""
 
-from typing import Optional
-
 import numpy as np
 import torch
 
 from vllm.scalar_type import ScalarType
 
-from .marlin_utils import (GPTQ_MARLIN_TILE, marlin_permute_scales,
-                           marlin_zero_points)
-from .quant_utils import (get_pack_factor, gptq_quantize_weights,
-                          quantize_weights, sort_weights)
+from .marlin_utils import GPTQ_MARLIN_TILE, marlin_permute_scales, marlin_zero_points
+from .quant_utils import (
+    get_pack_factor,
+    gptq_quantize_weights,
+    quantize_weights,
+    sort_weights,
+)
 
 
 class MarlinWorkspace:
-
     def __init__(self, out_features, min_thread_n, max_parallel):
-        assert (out_features % min_thread_n == 0), (
+        assert out_features % min_thread_n == 0, (
             "out_features = {} is indivisible by min_thread_n = {}".format(
-                out_features, min_thread_n))
+                out_features, min_thread_n
+            )
+        )
 
-        max_workspace_size = ((out_features // min_thread_n) * max_parallel)
+        max_workspace_size = (out_features // min_thread_n) * max_parallel
 
-        self.scratch = torch.zeros(max_workspace_size,
-                                   dtype=torch.int,
-                                   device="cuda")
+        self.scratch = torch.zeros(max_workspace_size, dtype=torch.int, device="cuda")
 
 
 def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
@@ -54,8 +54,7 @@ def marlin_weights(q_w, size_k, size_n, num_bits, perm):
 
     q_w = q_w.cpu().numpy().astype(np.uint32)
 
-    q_packed = np.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
-                        dtype=np.uint32)
+    q_packed = np.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=np.uint32)
     for i in range(pack_factor):
         q_packed |= q_w[:, i::pack_factor] << num_bits * i
 
@@ -71,10 +70,10 @@ def get_weight_perm(num_bits: int):
         col = i // 4
         for block in [0, 1]:
             for row in [
-                    2 * (i % 4),
-                    2 * (i % 4) + 1,
-                    2 * (i % 4 + 4),
-                    2 * (i % 4 + 4) + 1,
+                2 * (i % 4),
+                2 * (i % 4) + 1,
+                2 * (i % 4 + 4),
+                2 * (i % 4 + 4) + 1,
             ]:
                 perm1.append(16 * row + col + 8 * block)
         for j in range(4):
@@ -94,11 +93,13 @@ def get_weight_perm(num_bits: int):
     return perm
 
 
-def marlin_quantize(w: torch.Tensor,
-                    quant_type: ScalarType,
-                    group_size: int,
-                    act_order: bool,
-                    test_perm: Optional[torch.Tensor] = None):
+def marlin_quantize(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: int,
+    act_order: bool,
+    test_perm: torch.Tensor | None = None,
+):
     size_k, size_n = w.shape
     num_bits = quant_type.size_bits
 
@@ -109,7 +110,8 @@ def marlin_quantize(w: torch.Tensor,
 
     # Quantize (and apply act_order if provided)
     w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
-        w, quant_type, group_size, act_order, test_perm)
+        w, quant_type, group_size, act_order, test_perm
+    )
 
     # For act_order, sort the "weights" and "g_idx" so that group ids are
     # increasing
@@ -130,8 +132,7 @@ def marlin_quantize(w: torch.Tensor,
     return res_list
 
 
-def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType,
-                        group_size: int):
+def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int):
     size_k, size_n = w.shape
 
     # Normalize group_size
@@ -144,18 +145,13 @@ def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType,
     num_groups = size_k // group_size
 
     # Quantize with zp
-    w_ref, q_w, s, zp = quantize_weights(w,
-                                         quant_type,
-                                         group_size,
-                                         zero_points=True)
+    w_ref, q_w, s, zp = quantize_weights(w, quant_type, group_size, zero_points=True)
 
     # Reformat to marlin
     weight_perm = get_weight_perm(quant_type.size_bits)
-    marlin_q_w = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
-                                weight_perm)
+    marlin_q_w = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, weight_perm)
     marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
-    marlin_zp = marlin_zero_points(zp, num_groups, size_n,
-                                   quant_type.size_bits)
+    marlin_zp = marlin_zero_points(zp, num_groups, size_n, quant_type.size_bits)
 
     # Create result
     res_list = [w_ref, marlin_q_w, marlin_s, marlin_zp]
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
index 1c93c364679dae9d92da80ec5e7d5c19b2250657..90011f116bb0bd2e46f6fb6e91364c0295900880 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
@@ -26,8 +26,7 @@ from .quant_utils import gptq_quantize_weights
 # matrix elements into reordered metadata matrix elements (or,
 # equivalently, for gathering reordered metadata matrix element back
 # into metadata matrix elements).
-def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
-                                               device):
+def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype, device):
     dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
     dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)
 
@@ -35,9 +34,13 @@ def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
     group_x = 64
     group_y = 32 if meta_dtype.itemsize == 2 else 16
 
-    dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
-                (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
-                ((dst_rows % group_x) // 8) * 4)
+    dst_rows = (
+        dst_rows // group_x * group_x
+        + (dst_rows % 2) * 2
+        + (dst_rows % 8) // 4
+        + ((dst_rows % group_y) % 4) // 2 * 32
+        + ((dst_rows % group_x) // 8) * 4
+    )
 
     topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
     bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
@@ -50,8 +53,7 @@ def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
     interleave = 2
     cols_maj = dst_cols // interleave
     cols_min = dst_cols % interleave
-    return (cols_maj * m * interleave + dst_rows * interleave +
-            cols_min).view(-1)
+    return (cols_maj * m * interleave + dst_rows * interleave + cols_min).view(-1)
 
 
 # This function converts dense matrix into sparse semi-structured
@@ -75,17 +77,18 @@ def sparse_semi_structured_from_dense_cutlass(dense):
         raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
     quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
     if quadbits_per_meta_elem not in (4, 8):
-        raise RuntimeError(
-            "Invalid number of elements per meta element calculated")
+        raise RuntimeError("Invalid number of elements per meta element calculated")
 
     if meta_dtype == torch.int32:
         if m % 16 != 0:
             raise RuntimeError(
-                f"Number of rows of dense matrix {m} must be divisible by 16")
+                f"Number of rows of dense matrix {m} must be divisible by 16"
+            )
     else:
         if m % 32 != 0:
             raise RuntimeError(
-                f"Number of rows of dense matrix {m} must be divisible by 32")
+                f"Number of rows of dense matrix {m} must be divisible by 32"
+            )
     if k % (4 * quadbits_per_meta_elem) != 0:
         raise RuntimeError(
             f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
@@ -146,40 +149,39 @@ def sparse_semi_structured_from_dense_cutlass(dense):
     idxs1 = bit2 | (bit3.to(torch.int64) << 1)
 
     if dense.dtype != torch.float:
-        sparse0 = dense_4.gather(
-            -1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
+        sparse0 = dense_4.gather(-1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
         sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
         sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
     else:
-        sparse = dense_2.gather(-1,
-                                idxs0.unsqueeze(-1) // 2).view(
-                                    m,
-                                    k // 2)  # type: ignore[possibly-undefined]
+        sparse = dense_2.gather(-1, idxs0.unsqueeze(-1) // 2).view(m, k // 2)  # type: ignore[possibly-undefined]
 
     meta_4 = idxs0 | (idxs1 << 2)
-    meta_n = meta_4.view(
-        (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)
+    meta_n = meta_4.view((-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)
 
     if quadbits_per_meta_elem == 4:
-        meta = (meta_n[:, :, 0]
-                | (meta_n[:, :, 1] << 4)
-                | (meta_n[:, :, 2] << 8)
-                | (meta_n[:, :, 3] << 12))
+        meta = (
+            meta_n[:, :, 0]
+            | (meta_n[:, :, 1] << 4)
+            | (meta_n[:, :, 2] << 8)
+            | (meta_n[:, :, 3] << 12)
+        )
     elif quadbits_per_meta_elem == 8:
-        meta = (meta_n[:, :, 0]
-                | (meta_n[:, :, 1] << 4)
-                | (meta_n[:, :, 2] << 8)
-                | (meta_n[:, :, 3] << 12)
-                | (meta_n[:, :, 4] << 16)
-                | (meta_n[:, :, 5] << 20)
-                | (meta_n[:, :, 6] << 24)
-                | (meta_n[:, :, 7] << 28))
+        meta = (
+            meta_n[:, :, 0]
+            | (meta_n[:, :, 1] << 4)
+            | (meta_n[:, :, 2] << 8)
+            | (meta_n[:, :, 3] << 12)
+            | (meta_n[:, :, 4] << 16)
+            | (meta_n[:, :, 5] << 20)
+            | (meta_n[:, :, 6] << 24)
+            | (meta_n[:, :, 7] << 28)
+        )
 
     # Reorder meta tensor elements.
-    meta_reordered = meta.new_empty(
-        (m * meta_ncols, ))  # type: ignore[possibly-undefined]
+    meta_reordered = meta.new_empty((m * meta_ncols,))  # type: ignore[possibly-undefined]
     meta_offsets = _calculate_meta_reordering_scatter_offsets(
-        m, meta_ncols, meta_dtype, device)
+        m, meta_ncols, meta_dtype, device
+    )
     meta_reordered.scatter_(0, meta_offsets, meta.view(-1))
 
     return (sparse, meta_reordered.view(m, meta_ncols))
@@ -222,13 +224,14 @@ def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
     if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
         raise RuntimeError(
             f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, "  # noqa: E501
-            "expected according to the number of columns of meta matrix")
+            "expected according to the number of columns of meta matrix"
+        )
 
     # Undo meta tensor elements reordering.
     meta_offsets = _calculate_meta_reordering_scatter_offsets(
-        m, meta_ncols, meta_dtype, device)
-    meta = torch.gather(meta_reordered.view(-1), 0,
-                        meta_offsets).view(m, meta_ncols)
+        m, meta_ncols, meta_dtype, device
+    )
+    meta = torch.gather(meta_reordered.view(-1), 0, meta_offsets).view(m, meta_ncols)
 
     # Unpack sparse tensor back to original dense tensor, using
     # information provided by meta tensor.  Note that torch.float
@@ -270,16 +273,17 @@ def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
         meta_2[:, :, 15] = (meta >> 30) & 0b11
 
     dense_offsets = meta_2.view(-1) + (
-        torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
-            -1, 1).repeat(1, 2).view(-1)
+        torch.arange(0, 2 * m * k // ksparse, device=device) * 4
+    ).view(-1, 1).repeat(1, 2).view(-1)
 
-    dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
+    dense = torch.zeros((m * 2 * k,), dtype=sparse.dtype, device=device)
     if sparse.dtype != torch.float:
         # dense.scatter_(0, dense_offsets, sparse.view(-1))
         dense.scatter_(0, dense_offsets, sparse.reshape(-1))
     else:
-        dense.view(torch.half).scatter_(0, dense_offsets,
-                                        sparse.view(torch.half).view(-1))
+        dense.view(torch.half).scatter_(
+            0, dense_offsets, sparse.view(torch.half).view(-1)
+        )
 
     return dense.view(m, 2 * k)
 
@@ -287,8 +291,8 @@ def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
 def mask_creator(tensor):
     """
     Class for creating N:M sparsity masks.
-    Masks will be created using the N:M ratio, where for every block of 
-    M weights, N will be pruned based on ranked weight value. Each mask 
+    Masks will be created using the N:M ratio, where for every block of
+    M weights, N will be pruned based on ranked weight value. Each mask
     will correspond to the given tensor.
 
     :param N: The number of weights in a group to keep
@@ -301,14 +305,14 @@ def mask_creator(tensor):
     # for i, tensor in enumerate(tensors):
     if tensor.numel() % M != 0:
         raise ValueError(
-            f"Tensor of size {tensor.shape} can't be evenly divided into "
-            f"{M} groups")
+            f"Tensor of size {tensor.shape} can't be evenly divided into {M} groups"
+        )
 
     num_groups = tensor.numel() // M
 
     # N:M sparsity for linear layers
     tensor_temp = tensor.detach().abs().reshape(num_groups, M)
-    index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]
+    index = torch.argsort(tensor_temp, dim=1)[:, : int(M - N)]
 
     w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
     mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)
@@ -342,7 +346,7 @@ def check_24(w, num_rows_to_sample=50, _verbose=False):
     for i in sampled_row_idxs:
         for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
             total_segments += 1
-            block = w[i, j:j + BLOCK_SIZE]
+            block = w[i, j : j + BLOCK_SIZE]
             num_nonzero = torch.count_nonzero(block)
             if num_nonzero > MAX_NON_ZEROS:
                 print("i = {} j = {} block = {}".format(i, j, block))
@@ -359,8 +363,7 @@ def compress_quantized_24_weight(q_24, size_k, size_n, wtype: ScalarType):
 
     # Compress
     q_24_no_zp = q_24_no_zp.t().contiguous()
-    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
-        q_24_no_zp)
+    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(q_24_no_zp)
     q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()
 
     # Restore bias
@@ -390,13 +393,12 @@ def get_weight_perm_24(num_bits: int):
         col_o = col // 2
         for block in [0, 1]:
             for row in [
-                    2 * (i % 4),
-                    2 * (i % 4) + 1,
-                    2 * (i % 4 + 4),
-                    2 * (i % 4 + 4) + 1,
+                2 * (i % 4),
+                2 * (i % 4) + 1,
+                2 * (i % 4 + 4),
+                2 * (i % 4 + 4) + 1,
             ]:
-                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
-                             4 * block)
+                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) + 4 * block)
         for j in range(4):
             perm_list.extend([p + 1 * j for p in perm1])
     perm = numpy.array(perm_list)
@@ -413,9 +415,9 @@ def get_weight_perm_24(num_bits: int):
     return perm
 
 
-def marlin_permute_scales_24(s: torch.Tensor, size_k: int, size_n: int,
-                             group_size: int) -> torch.Tensor:
-
+def marlin_permute_scales_24(
+    s: torch.Tensor, size_k: int, size_n: int, group_size: int
+) -> torch.Tensor:
     scale_perm, scale_perm_single = get_scale_perms_24()
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
@@ -443,17 +445,18 @@ def marlin_24_quantize(
 
     # Quantize
     w_24_ref, q_w_24, s, g_idx, rand_perm = gptq_quantize_weights(
-        w_24, quant_type, group_size, act_order=False)
+        w_24, quant_type, group_size, act_order=False
+    )
 
     # Compress quantized weight
-    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
-                                                     quant_type)
+    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n, quant_type)
     size_k_comp = size_k // 2
 
     # Reformat to marlin
     weight_perm = get_weight_perm_24(quant_type.size_bits)
-    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
-                                        quant_type.size_bits, weight_perm)
+    marlin_24_q_w_comp = marlin_weights(
+        q_w_24_comp, size_k_comp, size_n, quant_type.size_bits, weight_perm
+    )
     marlin_24_s = marlin_permute_scales_24(s, size_k, size_n, group_size)
 
     # Create result
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
index fb1d041f34499b9b0e914eddbb05876817012f0d..34a31bcf6a747ebdb98ca030a7db82bc7df52007 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
@@ -1,21 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
+from vllm.triton_utils import triton
+from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
 
 logger = init_logger(__name__)
 
-OCP_MX_BLOCK_SIZE = 32
-
 
 def _swizzle_mxfp4(quant_tensor, scale, num_warps):
-    """ weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel
-    """
+    """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel"""
     import triton_kernels.matmul_ogs_details.opt_flags as opt_flags
     from triton_kernels.numerics import InFlexData
     from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
@@ -25,30 +24,38 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
     value_layout_opts: dict[str, Any] = {}
     scale_layout_opts: dict[str, Any] = {}
 
-    if (current_platform.is_cuda()
-            and current_platform.is_device_capability(90)
-            and not is_torch_equal_or_newer("2.8.1")):
+    if (
+        current_platform.is_cuda()
+        and current_platform.is_device_capability(90)
+        and not is_torch_equal_or_newer("2.8.1")
+    ):
         logger.warning_once(
             "Mxfp4 on hopper is running on torch < 2.8.1, "
             "this cause swizling to be disabled, which may "
-            "cause performance degradation. Please upgrade to torch nightly")
+            "cause performance degradation. Please upgrade to torch nightly"
+        )
         value_layout = StridedLayout
         scale_layout = StridedLayout
     elif current_platform.is_rocm():
-        from triton_kernels.tensor_details.layout import (GFX950MXScaleLayout,
-                                                          StridedLayout)
+        from triton_kernels.tensor_details.layout import (
+            GFX950MXScaleLayout,
+            StridedLayout,
+        )
 
         from vllm.platforms.rocm import on_gfx950
+
         value_layout = StridedLayout
         scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout
     else:
-        value_layout, value_layout_opts = \
-            layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
+        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
+            mx_axis=1
+        )
         scale_layout, scale_layout_opts = (
             layout.make_default_matmul_mxfp4_w_scale_layout(
-                mx_axis=1, num_warps=num_warps))
-    if current_platform.is_cuda() and \
-        current_platform.is_device_capability(100):
+                mx_axis=1, num_warps=num_warps
+            )
+        )
+    if current_platform.is_cuda() and current_platform.is_device_capability(100):
         constraints = {
             "is_persistent": True,
             "epilogue_subtile": 1,
@@ -57,69 +64,102 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
     # transpose the tensor so that the quantization axis is on dim1
     quant_tensor = quant_tensor.transpose(-2, -1)
     scale = scale.transpose(-2, -1)
-    quant_tensor = convert_layout(wrap_torch_tensor(quant_tensor, dtype=FP4),
-                                  value_layout, **value_layout_opts)
-    scale = convert_layout(wrap_torch_tensor(scale), scale_layout,
-                           **scale_layout_opts)
+    quant_tensor = convert_layout(
+        wrap_torch_tensor(quant_tensor, dtype=FP4), value_layout, **value_layout_opts
+    )
+    scale = convert_layout(wrap_torch_tensor(scale), scale_layout, **scale_layout_opts)
     return quant_tensor, InFlexData(), scale
 
 
-def _can_support_mxfp4(use_grouped_topk: bool = False,
-                       topk_group: Optional[int] = None,
-                       num_expert_group: Optional[int] = None,
-                       expert_map: Optional[torch.Tensor] = None,
-                       custom_routing_function: Optional[Callable] = None,
-                       e_score_correction_bias: Optional[torch.Tensor] = None,
-                       apply_router_weight_on_input: bool = False,
-                       scoring_func: str = "softmax",
-                       activation: str = "swigluoai",
-                       expert_load_view: Optional[torch.Tensor] = None,
-                       logical_to_physical_map: Optional[torch.Tensor] = None,
-                       logical_replica_count: Optional[torch.Tensor] = None):
-    return not (use_grouped_topk or topk_group or num_expert_group
-                or custom_routing_function or e_score_correction_bias
-                or apply_router_weight_on_input or scoring_func != "softmax"
-                or activation != "swigluoai" or expert_load_view
-                or logical_to_physical_map or logical_replica_count)
-
-
-def _dequant_mxfp4(x: torch.Tensor, scale: torch.Tensor,
-                   float_dtype: torch.dtype) -> torch.Tensor:
+def _can_support_mxfp4(
+    use_grouped_topk: bool = False,
+    topk_group: int | None = None,
+    num_expert_group: int | None = None,
+    expert_map: torch.Tensor | None = None,
+    custom_routing_function: Callable | None = None,
+    e_score_correction_bias: torch.Tensor | None = None,
+    apply_router_weight_on_input: bool = False,
+    scoring_func: str = "softmax",
+    activation: str = "swigluoai",
+    expert_load_view: torch.Tensor | None = None,
+    logical_to_physical_map: torch.Tensor | None = None,
+    logical_replica_count: torch.Tensor | None = None,
+):
+    return not (
+        use_grouped_topk
+        or topk_group
+        or num_expert_group
+        or custom_routing_function
+        or e_score_correction_bias
+        or apply_router_weight_on_input
+        or scoring_func != "softmax"
+        or activation != "swigluoai"
+        or expert_load_view
+        or logical_to_physical_map
+        or logical_replica_count
+    )
+
+
+def get_padding_alignment():
+    return (
+        256
+        if triton.runtime.driver.active.get_current_target().arch in ("gfx950",)
+        else 128
+    )
+
+
+def _dequant_mxfp4(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
     try:
         from quark.torch.kernel import mx
     except ImportError as err:
-        raise ImportError("The package `amd-quark` is required to use "
-                          "MX-FP4 models. Please install it with `pip install "
-                          "amd-quark`.") from err
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP4 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
 
     return mx.dq_mxfp4(x, scale, float_dtype)
 
 
-def _dequant_mxfp4_fake(x: torch.Tensor, scale: torch.Tensor,
-                        float_dtype: torch.dtype) -> torch.Tensor:
-    return torch.empty((*x.shape[:-1], x.shape[-1] * 2),
-                       dtype=float_dtype,
-                       device=x.device)
+def _dequant_mxfp4_fake(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
+    return torch.empty(
+        (*x.shape[:-1], x.shape[-1] * 2), dtype=float_dtype, device=x.device
+    )
 
 
-def _quant_dequant_mxfp4(x: torch.Tensor,
-                         scale_calculation_mode: str = "even") -> torch.Tensor:
+def _quant_dequant_mxfp4(
+    x: torch.Tensor, scale_calculation_mode: str = "even"
+) -> torch.Tensor:
     try:
         from quark.torch.kernel import mx
     except ImportError as err:
-        raise ImportError("The package `amd-quark` is required to use "
-                          "MX-FP4 models. Please install it with `pip install "
-                          "amd-quark`.") from err
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP4 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
 
     return mx.qdq_mxfp4(x, scale_calculation_mode)
 
 
-def _quant_dequant_mxfp4_fake(x: torch.Tensor,
-                              scale_calculation_mode: str = "even"
-                              ) -> torch.Tensor:
+def _quant_dequant_mxfp4_fake(
+    x: torch.Tensor, scale_calculation_mode: str = "even"
+) -> torch.Tensor:
     return torch.empty_like(x)
 
 
+# Protect these operations into a torch custom op to avoid errors as
+# torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped
+# Explanation: Dynamo does not know how to trace the builtin
+# `kernel_ext.PyCapsule.dq_uint8_mxfp4_to_half.` This function is either a
+# Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python
+# extension (perhaps created with pybind).
+# TODO: Make sure there is no way to avoid having these functions
+# marked as skipped by dynamo.
 try:
     direct_register_custom_op(
         op_name="dequant_mxfp4",
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp6_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp6_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b5659e3009709d64fa4617804f52106801c0f7f
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/mxfp6_utils.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_BLOCK_SIZE
+from vllm.utils.torch_utils import direct_register_custom_op
+
+
+def _quant_dequant_mxfp6(
+    x: torch.Tensor,
+    quant_dtype: str,
+    scale_calculation_mode: str = "even",
+) -> torch.Tensor:
+    try:
+        from quark.torch.kernel.hw_emulation.hw_emulation_interface import (
+            fake_quantize_fp4_fp6_per_group_with_scale,
+        )
+        from quark.torch.quantization.utils import even_round, reshape_to_blocks
+    except ImportError as err:
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP6 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
+
+    axis = -1
+    block_x = reshape_to_blocks(x, OCP_MX_BLOCK_SIZE, axis)
+    amax, _ = torch.max(torch.abs(block_x), dim=-1, keepdim=True)
+    amax = amax.squeeze(-1)
+
+    # TODO: there are other rounding strategies supported in quark and in the
+    # config.json that we do not check for here!
+    if scale_calculation_mode != "even":
+        raise NotImplementedError(
+            f"Scale calculation mode {scale_calculation_mode} is not yet "
+            "supported in MX-FP6 quantization"
+        )
+    scale = even_round(amax, quant_dtype)
+
+    # Apply dequantize(quantize(x)).
+    x = fake_quantize_fp4_fp6_per_group_with_scale(
+        x,
+        scale.to(x.device),
+        axis=axis,
+        group_size=OCP_MX_BLOCK_SIZE,
+        quant_dtype=quant_dtype,
+    )
+
+    return x
+
+
+def _quant_dequant_mxfp6_fake(
+    x: torch.Tensor,
+    quant_dtype: str,
+    scale_calculation_mode: str = "even",
+) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+def _dequant_mxfp6(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype, quant_dtype: str
+) -> torch.Tensor:
+    try:
+        from quark.torch.kernel.hw_emulation.hw_emulation_interface import (
+            dequantize_fp4_fp6_per_group,
+        )
+        from quark.torch.utils.pack import create_pack_method
+    except ImportError as e:
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP6 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from e
+
+    pack_method = create_pack_method(None, dtype=quant_dtype)
+    unpacked_x = pack_method.unpack(x, reorder=False)
+
+    scale = 2 ** (scale.view(torch.uint8).to(torch.int16) - 127).to(float_dtype)
+
+    # TODO: `dequantize_fp4_fp6_per_group` and `prepare_inputs_per_group`
+    # always return fp32.
+    return dequantize_fp4_fp6_per_group(
+        unpacked_x,
+        scale,
+        axis=-1,
+        group_size=OCP_MX_BLOCK_SIZE,
+        quant_dtype=quant_dtype,
+    ).to(float_dtype)
+
+
+def _dequant_mxfp6_fake(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype, quant_dtype: str
+) -> torch.Tensor:
+    assert (x.shape[-1] * 4) % 3 == 0
+    return torch.empty(
+        (*x.shape[:-1], (x.shape[-1] * 4) // 3), dtype=float_dtype, device=x.device
+    )
+
+
+# Protect these operations into a torch custom op to avoid errors as
+# torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped
+# Explanation: Dynamo does not know how to trace the builtin
+# `kernel_ext.PyCapsule.dq_uint8_mxfp4_to_half.` This function is either a
+# Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python
+# extension (perhaps created with pybind).
+# TODO: Make sure there is no way to avoid having these functions
+# marked as skipped by dynamo.
+try:
+    direct_register_custom_op(
+        op_name="quant_dequant_mxfp6",
+        op_func=_quant_dequant_mxfp6,
+        mutates_args=[],
+        fake_impl=_quant_dequant_mxfp6_fake,
+    )
+except AttributeError as error:
+    raise error
+
+
+# Expose keyword arguments.
+def quant_dequant_mxfp6(
+    x: torch.Tensor,
+    quant_dtype: str,
+    scale_calculation_mode: str = "even",
+) -> torch.Tensor:
+    return torch.ops.vllm.quant_dequant_mxfp6(x, quant_dtype, scale_calculation_mode)
+
+
+try:
+    direct_register_custom_op(
+        op_name="dequant_mxfp6",
+        op_func=_dequant_mxfp6,
+        mutates_args=[],
+        fake_impl=_dequant_mxfp6_fake,
+    )
+except AttributeError as error:
+    raise error
+
+
+def dequant_mxfp6(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype, quant_dtype: str
+) -> torch.Tensor:
+    return torch.ops.vllm.dequant_mxfp6(x, scale, float_dtype, quant_dtype)
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
index 2a6b21c918f463c522bfc6e955c4d840a2d14636..bed771fd1c4d7353644c88f08f210b7bdabfd1ae 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
@@ -8,13 +8,17 @@ from vllm.logger import init_logger
 logger = init_logger(__name__)
 
 
-def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-
+def mxfp8_e4m3_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     try:
-        from flashinfer import mxfp8_quantize
+        from flashinfer import mxfp8_quantize as mxfp8_e4m3_quantize
     except ImportError as err:
-        raise ImportError("The package `flashinfer` is required to do "
-                          "MX-FP8 quantization. Please install it with" \
-                          "`pip install flashinfer`") from err
+        raise ImportError(
+            "The package `flashinfer` is required to do "
+            "MX-FP8 quantization. Please install it with"
+            "`pip install flashinfer`"
+        ) from err
 
-    return mxfp8_quantize(x, is_sf_swizzled_layout=False)
+    x_q, x_scales = mxfp8_e4m3_quantize(x, is_sf_swizzled_layout=False)
+    if x_scales.ndim == 1:
+        x_scales = x_scales.view(x.size(0), -1)
+    return x_q, x_scales
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index 8648771cb0177fb3898e955f996afb4d1340b004..62b480210fc06827d04ea618f99576291c961e71 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -12,8 +12,9 @@ __all__ = [
 
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 
-kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
-                            dtype=torch.float32)
+kE2M1ToFloat = torch.tensor(
+    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
+)
 
 
 def break_fp4_bytes(a, dtype):
@@ -45,12 +46,9 @@ def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
     return out[0:m, 0:k]
 
 
-def dequantize_to_dtype(tensor_fp4,
-                        tensor_sf,
-                        global_scale,
-                        dtype,
-                        device,
-                        block_size=16):
+def dequantize_to_dtype(
+    tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16
+):
     """Dequantize the fp4 tensor back to high precision."""
     # Two fp4 values are packed into one uint8.
     assert tensor_fp4.dtype == torch.uint8
@@ -95,8 +93,7 @@ def ref_nvfp4_quant(x, global_scale, block_size):
     assert x.ndim == 2
     m, n = x.shape
     x = torch.reshape(x, (m, n // block_size, block_size))
-    vec_max = torch.max(torch.abs(x), dim=-1,
-                        keepdim=True)[0].to(torch.float32)
+    vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0].to(torch.float32)
     scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
     scale = torch.clamp(scale, max=448, min=-448)
     scale = scale.to(torch.float8_e4m3fn).to(torch.float32)
@@ -108,10 +105,13 @@ def ref_nvfp4_quant(x, global_scale, block_size):
     return cast_to_fp4(clipped_x), scale.squeeze(-1)
 
 
-def run_nvfp4_emulations(x: torch.Tensor, input_global_scale: torch.Tensor,
-                         weight: torch.Tensor,
-                         weight_scale_swizzled: torch.Tensor,
-                         weight_global_scale: torch.Tensor):
+def run_nvfp4_emulations(
+    x: torch.Tensor,
+    input_global_scale: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale_swizzled: torch.Tensor,
+    weight_global_scale: torch.Tensor,
+):
     group_size = 16
     x_m, x_k = x.shape
     output_dtype = x.dtype
@@ -127,9 +127,14 @@ def run_nvfp4_emulations(x: torch.Tensor, input_global_scale: torch.Tensor,
 
     # dequantize weight
     w_fp4 = weight.data.view(torch.uint8)
-    w_dq = dequantize_to_dtype(w_fp4, weight_scale_swizzled.data,
-                               weight_global_scale, output_dtype, x.device,
-                               group_size)
+    w_dq = dequantize_to_dtype(
+        w_fp4,
+        weight_scale_swizzled.data,
+        weight_global_scale,
+        output_dtype,
+        x.device,
+        group_size,
+    )
 
     # matmul
     out = torch.matmul(x_dq, w_dq.t())
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
index 21af74c6b72b5f093aae07b6457eedd74a28a0fa..c3f26cc774118b40df9bd975b856dd9dc5a81541 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -5,11 +5,14 @@ from dataclasses import dataclass
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
-    is_flashinfer_fp4_cutlass_moe_available)
+    is_flashinfer_fp4_cutlass_moe_available,
+)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    is_fp4_marlin_supported)
+    is_fp4_marlin_supported,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported)
+    cutlass_fp4_supported,
+)
 
 __all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]
 
@@ -29,12 +32,12 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
     """Detect platform support for NV-FP4 fused-MoE path"""
     cutlass_supported = cutlass_fp4_supported()
 
-    allow_flashinfer = (cutlass_supported
-                        and is_flashinfer_fp4_cutlass_moe_available())
+    allow_flashinfer = cutlass_supported and is_flashinfer_fp4_cutlass_moe_available()
 
     if allow_flashinfer:
-        _logger.info_once("Using FlashInfer kernels for %s.", class_name
-                          or "NVFP4 path")
+        _logger.info_once(
+            "Using FlashInfer kernels for %s.", class_name or "NVFP4 path"
+        )
     else:
         if envs.VLLM_USE_FLASHINFER_MOE_FP4:
             _logger.warning_once(
@@ -50,7 +53,8 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
         else:
             raise ValueError(
                 "Current platform does not support NVFP4 quantization. "
-                "Please use Blackwell GPUs or enable FlashInfer.")
+                "Please use Blackwell GPUs or enable FlashInfer."
+            )
 
     return NvFp4Support(
         cutlass_supported=cutlass_supported,
diff --git a/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py b/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7752324f41fee1568c3b6aa3e1a3e786d2fdab98
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from enum import Enum
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+OCP_MX_BLOCK_SIZE = 32
+
+OCP_MX_DTYPES = {
+    "mxfp4",
+    "mxfp6_e3m2",
+    "mxfp6_e2m3",
+    "mxfp8_e4m3",
+    "mxfp8_e5m2",
+    "mxint8",
+}
+SUPPORTED_OCP_MX_DTYPES = {"mxfp4", "mxfp6_e3m2", "mxfp6_e2m3"}
+
+
+class OCP_MX_Scheme(str, Enum):
+    w_mxfp4_a_mxfp4 = "w_mxfp4_a_mxfp4"
+    w_mxfp4_a_mxfp6_e3m2 = "w_mxfp4_a_mxfp6_e3m2"
+    w_mxfp4_a_mxfp6_e2m3 = "w_mxfp4_a_mxfp6_e2m3"
+    w_mxfp6_e3m2_a_mxfp6_e3m2 = "w_mxfp6_e3m2_a_mxfp6_e3m2"
+    w_mxfp6_e2m3_a_mxfp6_e2m3 = "w_mxfp6_e2m3_a_mxfp6_e2m3"
+
+    @classmethod
+    def from_quant_dtype(cls, input_dtype: str | None, weight_dtype: str | None):
+        if input_dtype not in OCP_MX_DTYPES or weight_dtype not in OCP_MX_DTYPES:
+            return None
+        elif input_dtype == "mxfp4" and weight_dtype == "mxfp4":
+            return cls.w_mxfp4_a_mxfp4
+        elif input_dtype == "mxfp6_e3m2" and weight_dtype == "mxfp4":
+            return cls.w_mxfp4_a_mxfp6_e3m2
+        elif input_dtype == "mxfp6_e2m3" and weight_dtype == "mxfp4":
+            return cls.w_mxfp4_a_mxfp6_e2m3
+        elif input_dtype == "mxfp6_e3m2" and weight_dtype == "mxfp6_e3m2":
+            return cls.w_mxfp6_e3m2_a_mxfp6_e3m2
+        elif input_dtype == "mxfp6_e2m3" and weight_dtype == "mxfp6_e2m3":
+            return cls.w_mxfp6_e2m3_a_mxfp6_e2m3
+        else:
+            logger.warning(
+                "input_dtype='%s' and"
+                " weight_dtype='%s' is not supported "
+                "in OCP_MX_Scheme at the moment.",
+                input_dtype,
+                weight_dtype,
+            )
+            return None
diff --git a/vllm/model_executor/layers/quantization/utils/petit_utils.py b/vllm/model_executor/layers/quantization/utils/petit_utils.py
index 00d3def1db81e40da3d36333c3f10a8891cc8a8c..081f53eac9390d4329839366b5df7fdbb3106361 100644
--- a/vllm/model_executor/layers/quantization/utils/petit_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/petit_utils.py
@@ -11,14 +11,15 @@ if TYPE_CHECKING:
 # 1. Create a global variable as a placeholder for the module
 _petit_kernel: Optional["ModuleType"] = None
 
-_PETIT_INSTALL_MSG = ("Petit is not installed. Please install it with "
-                      "`pip install petit-kernel`.")
+_PETIT_INSTALL_MSG = (
+    "Petit is not installed. Please install it with `pip install petit-kernel`."
+)
 
 
 def _import_petit_kernel() -> "ModuleType":
     """
     A helper function to handle the lazy import.
-    The first time this function is called, it will import the petit_kernel 
+    The first time this function is called, it will import the petit_kernel
     library and store it in the global _petit_kernel variable.
     Subsequent calls will return the already-loaded module directly.
     """
@@ -28,6 +29,7 @@ def _import_petit_kernel() -> "ModuleType":
 
     try:
         import petit_kernel
+
         _petit_kernel = petit_kernel
         return _petit_kernel
     except ImportError:
@@ -41,14 +43,16 @@ _require_petit = _import_petit_kernel
 
 
 def _check_petit_nvfp4_supported(
-        quant_method: str,
-        group_size: Optional[int]) -> tuple[bool, Optional[str]]:
+    quant_method: str, group_size: int | None
+) -> tuple[bool, str | None]:
     if quant_method != "NVFP4":
         return (
             False,
-            ("Petit currently only supports: NVFP4 quantizations in sglang. "
-             "Please check the `hf_quant_config.json` file for your model's "
-             "quant configuration."),
+            (
+                "Petit currently only supports: NVFP4 quantizations in sglang. "
+                "Please check the `hf_quant_config.json` file for your model's "
+                "quant configuration."
+            ),
         )
     if group_size is not None and group_size != 16:
         return (
@@ -58,10 +62,8 @@ def _check_petit_nvfp4_supported(
     return (True, None)
 
 
-def verify_petit_nvfp4_supported(quant_method: str,
-                                 group_size: Optional[int]) -> None:
-    supported, error_msg = _check_petit_nvfp4_supported(
-        quant_method, group_size)
+def verify_petit_nvfp4_supported(quant_method: str, group_size: int | None) -> None:
+    supported, error_msg = _check_petit_nvfp4_supported(quant_method, group_size)
     if not supported:
         assert error_msg is not None
         raise ValueError(error_msg)
@@ -77,15 +79,15 @@ def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
     qweight = layer.weight.view(torch.int32).contiguous()
 
     # 3. Call functions through the imported module variable.
-    petit_qweight = petit_kernel.repack_nvfp4(qweight,
-                                              size_n=part_size_n,
-                                              size_k=part_size_k)
+    petit_qweight = petit_kernel.repack_nvfp4(
+        qweight, size_n=part_size_n, size_k=part_size_k
+    )
     layer.weight = torch.nn.Parameter(petit_qweight, requires_grad=False)
 
     # Permute scales
-    weight_scale = petit_kernel.process_nvfp4_scales(scales=layer.weight_scale,
-                                                     size_k=part_size_k,
-                                                     size_n=part_size_n)
+    weight_scale = petit_kernel.process_nvfp4_scales(
+        scales=layer.weight_scale, size_k=part_size_k, size_n=part_size_n
+    )
     layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
 
 
@@ -96,13 +98,13 @@ def apply_petit_nvfp4_linear(
     weight_scale_2: torch.Tensor,
     size_n: int,
     size_k: int,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     # Trigger (or get) the import here as well.
     petit_kernel = _import_petit_kernel()
 
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n, )
+    out_shape = input.shape[:-1] + (size_n,)
 
     # TODO: Use auto-tuning to find the performant solution_id
     # Call the function via the module variable.
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index acd9058fe694165bb58cfb407f1b0a122e7db806..d056d3404385a5004be19373c1c9e0c177c8ae63 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This file is used for /tests and /benchmarks"""
+
 from collections.abc import Mapping
 from dataclasses import dataclass
 from types import MappingProxyType
-from typing import ClassVar, NamedTuple, Optional
+from typing import ClassVar, NamedTuple
 
 import numpy
 import torch
@@ -31,8 +32,8 @@ class GroupShape(_GroupShape):
     """
 
     # Aliases for common quantization group shapes
-    PER_TENSOR: ClassVar['GroupShape']
-    PER_TOKEN: ClassVar['GroupShape']
+    PER_TENSOR: ClassVar["GroupShape"]
+    PER_TOKEN: ClassVar["GroupShape"]
 
     def is_per_tensor(self) -> bool:
         return self.row == -1 and self.col == -1
@@ -56,18 +57,26 @@ class ScaleDesc:
     static: static scale if True, dynamic if False
     group_shape: group shape of the scale
     """
+
     dtype: torch.dtype
     static: bool
     group_shape: GroupShape
 
     def __str__(self):
-        group_shape = ('per_tensor'
-                       if self.group_shape == GroupShape.PER_TENSOR else
-                       ('per_token' if self.group_shape == GroupShape.PER_TOKEN
-                        else str(self.group_shape)))
-
-        return (f"{fx.graph.dtype_abbrs[self.dtype]},"
-                f"{'static' if self.static else 'dynamic'},{group_shape}")
+        group_shape = (
+            "per_tensor"
+            if self.group_shape == GroupShape.PER_TENSOR
+            else (
+                "per_token"
+                if self.group_shape == GroupShape.PER_TOKEN
+                else str(self.group_shape)
+            )
+        )
+
+        return (
+            f"{fx.graph.dtype_abbrs[self.dtype]},"
+            f"{'static' if self.static else 'dynamic'},{group_shape}"
+        )
 
 
 @dataclass(frozen=True)
@@ -79,16 +88,19 @@ class QuantKey:
     scale2: second-level scale descriptor
     symmetric: symmetric if True, asymmetric if False
     """
+
     dtype: torch.dtype
     scale: ScaleDesc
-    scale2: Optional[ScaleDesc] = None
+    scale2: ScaleDesc | None = None
     symmetric: bool = True
 
     def __str__(self):
         scale2_str = f"scale2({self.scale2})," if self.scale2 else ""
-        return (f"QuantKey({fx.graph.dtype_abbrs[self.dtype]},"
-                f"scale({self.scale}),{scale2_str}"
-                f"{'a' if not self.symmetric else ''}symmetric)")
+        return (
+            f"QuantKey({fx.graph.dtype_abbrs[self.dtype]},"
+            f"scale({self.scale}),{scale2_str}"
+            f"{'a' if not self.symmetric else ''}symmetric)"
+        )
 
 
 kStaticTensorScale = ScaleDesc(torch.float32, True, GroupShape.PER_TENSOR)
@@ -101,16 +113,16 @@ kDynamicTokenScale = ScaleDesc(torch.float32, False, GroupShape.PER_TOKEN)
 kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, kDynamicTokenScale, symmetric=True)
 
 kNvfp4GroupScale = ScaleDesc(FP8_DTYPE, False, GroupShape(1, 16))
-kNvfp4Quant = QuantKey(FP4_DTYPE,
-                       scale=kNvfp4GroupScale,
-                       scale2=kStaticTensorScale)
+kNvfp4Quant = QuantKey(FP4_DTYPE, scale=kNvfp4GroupScale, scale2=kStaticTensorScale)
 
 
 # Normalize the group_shape to the full extent for any dims that are -1
 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape):
     # -1 means full extent
-    return (group_shape[0] if group_shape[0] > 0 else x.shape[-2],
-            group_shape[1] if group_shape[1] > 0 else x.shape[-1])
+    return (
+        group_shape[0] if group_shape[0] > 0 else x.shape[-2],
+        group_shape[1] if group_shape[1] > 0 else x.shape[-1],
+    )
 
 
 # Useful when treating N-dimensional group scaling as extended numpy-style
@@ -131,9 +143,11 @@ def group_broadcast(t, shape):
     for i, s in enumerate(shape):
         if t.shape[i] != s and t.shape[i] != 1:
             assert s % t.shape[i] == 0
-            t = t.unsqueeze(i + 1)\
-                .expand(*t.shape[:i+1], s // t.shape[i], *t.shape[i+1:])\
+            t = (
+                t.unsqueeze(i + 1)
+                .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :])
                 .flatten(i, i + 1)
+            )
     return t
 
 
@@ -151,9 +165,10 @@ def scaled_quantize(
     quant_dtype: torch.dtype,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     group_shape = _normalize_quant_group_shape(x, group_shape)
-    assert quant_dtype.is_floating_point, \
-        "currently `scaled_quantize` only supports floating point dtypes " \
+    assert quant_dtype.is_floating_point, (
+        "currently `scaled_quantize` only supports floating point dtypes "
         "but could be extended to support other dtypes"
+    )
 
     finfo = torch.finfo(quant_dtype)
 
@@ -175,11 +190,13 @@ def scaled_quantize(
 
     # Apply scale and convert form:
     # (BLK_M, BLK_N, BLOCK_SIZE_M * BLOCK_SIZE_N) to (M, N)
-    x_scl_sat = (x_blkd_permd * scale.unsqueeze(-1))\
-        .clamp(min=finfo.min, max=finfo.max)\
-        .reshape(blk_m, blk_n, group_shape[0], group_shape[1])\
-        .permute(0, 2, 1, 3)\
+    x_scl_sat = (
+        (x_blkd_permd * scale.unsqueeze(-1))
+        .clamp(min=finfo.min, max=finfo.max)
+        .reshape(blk_m, blk_n, group_shape[0], group_shape[1])
+        .permute(0, 2, 1, 3)
         .reshape(x.shape)
+    )
 
     return x_scl_sat.to(quant_dtype).contiguous(), scale.float().reciprocal()
 
@@ -188,7 +205,7 @@ def scaled_quantize(
 def scaled_dequantize(
     x_q: torch.Tensor,
     x_s: torch.Tensor,
-    group_shape: Optional[GroupShape] = None,
+    group_shape: GroupShape | None = None,
     out_dtype: torch.dtype = torch.float32,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     if group_shape is not None:
@@ -200,7 +217,8 @@ def scaled_dequantize(
         if group_shape is None:
             raise AssertionError(
                 "if x_s is 1D tensor, group_shape must be provided otherwise "
-                "its ambiguous which dimension to broadcast x_s to")
+                "its ambiguous which dimension to broadcast x_s to"
+            )
         # unsqueeze the scales for the dimension where we want to broadcast
         # across the full extent
         if group_shape[0] == x_q.shape[-2]:
@@ -210,7 +228,8 @@ def scaled_dequantize(
         else:
             raise AssertionError(
                 "if x_s is a vector we should be broadcasting it to the full "
-                "extent of one of the dimensions")
+                "extent of one of the dimensions"
+            )
 
     if group_shape is not None:
         assert x_s.shape[-1] == x_q.shape[-1] // group_shape[1]
@@ -219,9 +238,9 @@ def scaled_dequantize(
     return (x_q.to(torch.float32) * x_s).to(out_dtype)
 
 
-def pack_quantized_values_into_int32(w_q: torch.Tensor,
-                                     wtype: ScalarType,
-                                     packed_dim: int = 0):
+def pack_quantized_values_into_int32(
+    w_q: torch.Tensor, wtype: ScalarType, packed_dim: int = 0
+):
     # move dim to pack to the end
     perm = (*[i for i in range(len(w_q.shape)) if i != packed_dim], packed_dim)
     inv_perm = tuple(perm.index(i) for i in range(len(perm)))
@@ -241,9 +260,9 @@ def pack_quantized_values_into_int32(w_q: torch.Tensor,
     return res.permute(inv_perm)
 
 
-def unpack_quantized_values_into_int32(w_q: torch.Tensor,
-                                       wtype: ScalarType,
-                                       packed_dim: int = 0):
+def unpack_quantized_values_into_int32(
+    w_q: torch.Tensor, wtype: ScalarType, packed_dim: int = 0
+):
     # move dim to pack to the end
     perm = (*[i for i in range(len(w_q.shape)) if i != packed_dim], packed_dim)
     inv_perm = tuple(perm.index(i) for i in range(len(perm)))
@@ -265,8 +284,19 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor,
 def is_layer_skipped(
     prefix: str,
     ignored_layers: list[str],
-    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
+    *,
+    skip_with_substr: bool = False,
 ) -> bool:
+    def prefix_full_match(prefix: str, ignored_layers: list[str]) -> bool:
+        return prefix in ignored_layers
+
+    # For case like: ignored_layers = ["self_attn"]
+    def substr_match(prefix: str, ignored_layers: list[str]) -> bool:
+        return any(layer in prefix for layer in ignored_layers)
+
+    match_func = substr_match if skip_with_substr else prefix_full_match
+
     # prefix: model.layers.0.self_attn.q_proj
     # proj_name: q_proj
     proj_name = prefix.split(".")[-1]
@@ -283,7 +313,7 @@ def is_layer_skipped(
 
         is_skipped = None
         for shard_prefix in shard_prefixes:
-            is_shard_skipped = shard_prefix in ignored_layers
+            is_shard_skipped = match_func(shard_prefix, ignored_layers)
 
             if is_skipped is None:
                 is_skipped = is_shard_skipped
@@ -291,14 +321,18 @@ def is_layer_skipped(
                 raise ValueError(
                     f"Detected some but not all shards of {prefix} "
                     "are quantized. All shards of fused layers "
-                    "to have the same precision.")
-    elif "experts" in prefix:
-        return any([
-            prefix in layer_name for layer_name in ignored_layers
-            if "experts" in layer_name
-        ])
+                    "to have the same precision."
+                )
+    elif "experts" in prefix and not skip_with_substr:
+        expert_ignore_layers = filter(
+            lambda layer_name: "experts" in layer_name, ignored_layers
+        )
+        return any(
+            prefix in layer_name if not skip_with_substr else layer_name in prefix
+            for layer_name in expert_ignore_layers
+        )
     else:
-        is_skipped = prefix in ignored_layers
+        is_skipped = match_func(prefix, ignored_layers)
 
     assert is_skipped is not None
     return is_skipped
@@ -309,16 +343,18 @@ def get_pack_factor(num_bits):
     return 32 // num_bits
 
 
-def permute_rows(q_w: torch.Tensor,
-                 w_ref: torch.Tensor,
-                 group_size: int,
-                 test_perm: Optional[torch.Tensor] = None):
+def permute_rows(
+    q_w: torch.Tensor,
+    w_ref: torch.Tensor,
+    group_size: int,
+    test_perm: torch.Tensor | None = None,
+):
     assert q_w.shape == w_ref.shape
 
     orig_device = q_w.device
     k_size, _ = q_w.shape
 
-    g_idx = torch.zeros((k_size, ), dtype=torch.int32)
+    g_idx = torch.zeros((k_size,), dtype=torch.int32)
     for i in range(k_size):
         g_idx[i] = i // group_size
 
@@ -337,16 +373,20 @@ def permute_rows(q_w: torch.Tensor,
     )
 
 
-def quantize_weights(w: torch.Tensor,
-                     quant_type: ScalarType,
-                     group_size: Optional[int],
-                     zero_points: bool = False,
-                     ref_zero_points_after_scales: bool = False):
-    assert quant_type.is_integer(), \
+def quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: int | None,
+    zero_points: bool = False,
+    ref_zero_points_after_scales: bool = False,
+):
+    assert quant_type.is_integer(), (
         "Floating point quantization may work but has not been tested"
-    assert not zero_points or group_size is not None, \
-        "to have group zero points, group_size must be provided "\
+    )
+    assert not zero_points or group_size is not None, (
+        "to have group zero points, group_size must be provided "
         "(-1 group_size is channelwise)"
+    )
 
     orig_device = w.device
     orig_type = w.dtype
@@ -376,14 +416,16 @@ def quantize_weights(w: torch.Tensor,
         if zero_points:
             assert not quant_type.is_signed() and quant_type.max() > 0
             w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max()
-            maybe_w_zp = torch.round(torch.abs(min_val / w_s)) \
-                .clamp(min_q_val, max_q_val).int()
+            maybe_w_zp = (
+                torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
+            )
         else:
             # If the bias is such that there are no possible negative/positive
             #  values, set the max value to inf to avoid divide by 0
             w_s = torch.max(
                 abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
-                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)))
+                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
+            )
 
     # Quantize
     w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
@@ -430,19 +472,22 @@ SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
 SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
 
 
-def gptq_quantize_weights(w: torch.Tensor,
-                          quant_type: ScalarType,
-                          group_size: int,
-                          act_order: bool,
-                          test_perm: Optional[torch.Tensor] = None):
+def gptq_quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: int,
+    act_order: bool,
+    test_perm: torch.Tensor | None = None,
+):
     size_k, _ = w.shape
 
     assert w.is_floating_point(), "w must be float"
-    assert quant_type in SUPPORTED_GPTQ_QUANT_TYPES, \
+    assert quant_type in SUPPORTED_GPTQ_QUANT_TYPES, (
         f"Unsupported gptq type = {quant_type}"
-    assert group_size in SUPPORTED_GROUP_SIZES + [
-        size_k
-    ], f"Unsupported groupsize = {group_size}"
+    )
+    assert group_size in SUPPORTED_GROUP_SIZES + [size_k], (
+        f"Unsupported groupsize = {group_size}"
+    )
 
     w_ref, w_q, w_s, _ = quantize_weights(w, quant_type, group_size)
 
@@ -450,13 +495,13 @@ def gptq_quantize_weights(w: torch.Tensor,
     g_idx = torch.empty(0, dtype=torch.int, device=w.device)
     rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
     if act_order:
-        assert (
-            group_size < size_k
-        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
-            group_size, size_k)
+        assert group_size < size_k, (
+            "For act_order, groupsize = {} must be less than size_k = {}".format(
+                group_size, size_k
+            )
+        )
 
-        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size,
-                                                    test_perm)
+        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, test_perm)
 
     return w_ref, w_q, w_s, g_idx, rand_perm
 
@@ -464,8 +509,7 @@ def gptq_quantize_weights(w: torch.Tensor,
 def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
     orig_device = q_w.device
 
-    sort_indices = torch.argsort(g_idx).to(
-        dtype=torch.int32)  # Sort based on g_idx
+    sort_indices = torch.argsort(g_idx).to(dtype=torch.int32)  # Sort based on g_idx
 
     g_idx = g_idx[sort_indices].contiguous()
     q_w = q_w[sort_indices, :].contiguous()
@@ -535,10 +579,11 @@ def unpack_cols(
 ):
     pack_factor = get_pack_factor(num_bits)
     assert size_n % pack_factor == 0
-    assert packed_q_w.shape == (
-        size_k, size_n // pack_factor
-    ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
-        packed_q_w.shape, size_k, size_n, pack_factor)
+    assert packed_q_w.shape == (size_k, size_n // pack_factor), (
+        "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
+            packed_q_w.shape, size_k, size_n, pack_factor
+        )
+    )
 
     orig_device = packed_q_w.device
 
@@ -604,7 +649,8 @@ def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
     """
     assert scale.dtype == torch.float8_e4m3fn, (
         "swizzle_blockscale expects the input tensor to be in "
-        "torch.float8_e4m3fn format.")
+        "torch.float8_e4m3fn format."
+    )
 
     scale_ndim = scale.ndim
     if scale_ndim == 2:
@@ -619,9 +665,9 @@ def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
     M_padded = _round_up(M, 128)
     K_padded = _round_up(K, 4)
 
-    padded = torch.zeros((B, M_padded, K_padded),
-                         dtype=scale.dtype,
-                         device=scale.device)
+    padded = torch.zeros(
+        (B, M_padded, K_padded), dtype=scale.dtype, device=scale.device
+    )
     padded[:B, :M, :K] = scale
 
     # Reshape / permute to the layout required by the kernel.
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index b434b7acfea838101f747214f451f39275b4dc71..fceed3e55c2df67005ed532a9f5c3552b8218c85 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -1,20 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 from packaging import version
 
 from vllm import _custom_ops as ops
 from vllm import envs
-from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.config import CompilationMode, get_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
 from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm, has_flashinfer
+from vllm.utils.platform_utils import get_cu_count
+from vllm.utils.torch_utils import direct_register_custom_op
 
 # Input scaling factors are no longer optional in _scaled_mm starting
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
@@ -24,9 +24,11 @@ TORCH_DEVICE_IDENTITY = None
 # torch._scaled_mm rowwise feature.
 # The condition is determined once as the operations
 # are time-consuming.
-USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse(
-    torch.__version__) >= version.parse("2.7")
-                               and current_platform.has_device_capability(94))
+USE_ROWWISE_TORCH_SCALED_MM = (
+    current_platform.is_rocm()
+    and version.parse(torch.__version__) >= version.parse("2.7")
+    and current_platform.has_device_capability(94)
+)
 
 
 def sparse_cutlass_supported() -> bool:
@@ -74,8 +76,8 @@ CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
 
 
 def per_tensor_dequantize(
-        tensor: torch.Tensor, inv_scale: Union[float,
-                                               torch.Tensor]) -> torch.Tensor:
+    tensor: torch.Tensor, inv_scale: float | torch.Tensor
+) -> torch.Tensor:
     fake_qweight = tensor.to(torch.float16)
     dq_weight = fake_qweight * inv_scale
     return dq_weight
@@ -87,12 +89,12 @@ def all_close_1d(x: torch.Tensor) -> bool:
 
 
 def convert_to_channelwise(
-        weight_scale: torch.Tensor,
-        logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]:
+    weight_scale: torch.Tensor, logical_widths: list[int]
+) -> tuple[torch.Tensor, torch.Tensor]:
     # Create channelwise buffer
-    weight_scale_channel = torch.empty((sum(logical_widths), 1),
-                                       dtype=torch.float32,
-                                       device=weight_scale.device)
+    weight_scale_channel = torch.empty(
+        (sum(logical_widths), 1), dtype=torch.float32, device=weight_scale.device
+    )
 
     # Expand each scale to match the size of each logical matrix.
     start = 0
@@ -105,8 +107,8 @@ def convert_to_channelwise(
 
 
 def requantize_with_max_scale(
-        weight: torch.Tensor, weight_scale: torch.Tensor,
-        logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]:
+    weight: torch.Tensor, weight_scale: torch.Tensor, logical_widths: list[int]
+) -> tuple[torch.Tensor, torch.Tensor]:
     # Max scale to be used for requanitzation.
     max_w_scale = weight_scale.max()
 
@@ -116,8 +118,9 @@ def requantize_with_max_scale(
     # from disk in this case. Skip requantization in this case (since)
     # we already are quantized with the single scale.
     # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
-    unfused_module_in_checkpoint = (weight_scale[-1]
-                                    > torch.finfo(torch.float8_e4m3fn).min)
+    unfused_module_in_checkpoint = (
+        weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min
+    )
 
     # If unfused checkpoint, need requanize with the single scale.
     if unfused_module_in_checkpoint:
@@ -127,10 +130,8 @@ def requantize_with_max_scale(
             if logical_width == 0:
                 continue
             end = start + logical_width
-            weight_dq = per_tensor_dequantize(weight[start:end, :],
-                                              weight_scale[idx])
-            weight[start:end, :], _ = ops.scaled_fp8_quant(
-                weight_dq, max_w_scale)
+            weight_dq = per_tensor_dequantize(weight[start:end, :], weight_scale[idx])
+            weight[start:end, :], _ = ops.scaled_fp8_quant(weight_dq, max_w_scale)
             start = end
 
     return max_w_scale, weight
@@ -143,75 +144,102 @@ def maybe_create_device_identity():
         TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
 
 
-def cutlass_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
-                           out_dtype: torch.dtype, scale_a: torch.Tensor,
-                           scale_b: torch.Tensor, bias: torch.Tensor,
-                           output_shape: list, **kwargs) -> torch.Tensor:
-
+def cutlass_w8a8_scaled_mm(
+    *,
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+    output_shape: list,
+    **kwargs,
+) -> torch.Tensor:
     # Fused GEMM_DQ
-    output = ops.cutlass_scaled_mm(qinput,
-                                   weight,
-                                   out_dtype=out_dtype,
-                                   scale_a=scale_a,
-                                   scale_b=scale_b,
-                                   bias=bias)
+    output = ops.cutlass_scaled_mm(
+        qinput, weight, out_dtype=out_dtype, scale_a=scale_a, scale_b=scale_b, bias=bias
+    )
     return output.view(*output_shape)
 
 
-def flashinfer_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
-                              out_dtype: torch.dtype, scale_a: torch.Tensor,
-                              scale_b: torch.Tensor, bias: torch.Tensor,
-                              output_shape: list, **kwargs) -> torch.Tensor:
-
-    return flashinfer_scaled_fp8_mm(qinput,
-                                    weight,
-                                    out_dtype=out_dtype,
-                                    scale_a=scale_a,
-                                    scale_b=scale_b,
-                                    bias=bias)
-
-
-def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor,
-                                        weight: torch.Tensor,
-                                        out_dtype: torch.dtype,
-                                        scale_a: torch.Tensor,
-                                        scale_b: torch.Tensor,
-                                        bias: torch.Tensor) -> torch.Tensor:
+def flashinfer_w8a8_scaled_mm(
+    *,
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+    output_shape: list,
+    **kwargs,
+) -> torch.Tensor:
+    return flashinfer_scaled_fp8_mm(
+        qinput, weight, out_dtype=out_dtype, scale_a=scale_a, scale_b=scale_b, bias=bias
+    )
+
+
+def rocm_per_tensor_w8a8_scaled_mm_impl(
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+) -> torch.Tensor:
     from vllm.platforms.rocm import on_mi3xx
-    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx() and \
-            qinput.shape[0] == 1 and \
-            qinput.shape[1] % 16 == 0 and \
-            ((bias is None) or (bias.dtype == out_dtype)) :
-        output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b,
-                               current_platform.get_cu_count(), bias)
+
+    if (
+        envs.VLLM_ROCM_USE_SKINNY_GEMM
+        and on_mi3xx()
+        and qinput.shape[0] == 1
+        and qinput.shape[1] % 16 == 0
+        and ((bias is None) or (bias.dtype == out_dtype))
+    ):
+        output = ops.wvSplitKQ(
+            weight.t(),
+            qinput,
+            out_dtype,
+            scale_a,
+            scale_b,
+            get_cu_count(),
+            bias,
+        )
     else:
-        output = torch._scaled_mm(qinput,
-                                  weight,
-                                  out_dtype=out_dtype,
-                                  scale_a=scale_a,
-                                  scale_b=scale_b,
-                                  bias=bias)
+        output = torch._scaled_mm(
+            qinput,
+            weight,
+            out_dtype=out_dtype,
+            scale_a=scale_a,
+            scale_b=scale_b,
+            bias=bias,
+        )
     return output
 
 
-def rocm_per_tensor_w8a8_scaled_mm_fake(qinput: torch.Tensor,
-                                        weight: torch.Tensor,
-                                        out_dtype: torch.dtype,
-                                        scale_a: torch.Tensor,
-                                        scale_b: torch.Tensor,
-                                        bias: torch.Tensor) -> torch.Tensor:
-    return qinput.new_empty((*qinput.shape[:-1], weight.shape[1]),
-                            dtype=out_dtype)
+def rocm_per_tensor_w8a8_scaled_mm_fake(
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+) -> torch.Tensor:
+    return qinput.new_empty((*qinput.shape[:-1], weight.shape[1]), dtype=out_dtype)
 
 
-def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
-                                   weight: torch.Tensor,
-                                   out_dtype: torch.dtype,
-                                   scale_a: torch.Tensor,
-                                   scale_b: torch.Tensor, bias: torch.Tensor,
-                                   output_shape: list) -> torch.Tensor:
+def rocm_per_tensor_w8a8_scaled_mm(
+    *,
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+    output_shape: list,
+) -> torch.Tensor:
     output = torch.ops.vllm.rocm_per_tensor_w8a8_scaled_mm_impl(
-        qinput, weight, out_dtype, scale_a, scale_b, bias)
+        qinput, weight, out_dtype, scale_a, scale_b, bias
+    )
     return torch.narrow(output, 0, 0, qinput.shape[0]).view(*output_shape)
 
 
@@ -222,18 +250,19 @@ direct_register_custom_op(
 )
 
 
-def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
-                                    weight: torch.Tensor,
-                                    out_dtype: torch.dtype,
-                                    scale_a: torch.Tensor,
-                                    scale_b: torch.Tensor, bias: torch.Tensor,
-                                    output_shape: list) -> torch.Tensor:
-    output = torch._scaled_mm(qinput,
-                              weight,
-                              out_dtype=out_dtype,
-                              scale_a=scale_a,
-                              scale_b=scale_b,
-                              bias=bias)
+def torch_per_tensor_w8a8_scaled_mm(
+    *,
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+    output_shape: list,
+) -> torch.Tensor:
+    output = torch._scaled_mm(
+        qinput, weight, out_dtype=out_dtype, scale_a=scale_a, scale_b=scale_b, bias=bias
+    )
     # A fix for discrepancy in scaled_mm which returns tuple
     # for torch < 2.5 and a single value in torch >= 2.5
     if type(output) is tuple and len(output) == 2:
@@ -242,13 +271,17 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
     return torch.narrow(output, 0, 0, qinput.shape[0]).view(*output_shape)
 
 
-def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor,
-                                   weight: torch.Tensor,
-                                   out_dtype: torch.dtype,
-                                   scale_a: torch.Tensor,
-                                   scale_b: torch.Tensor, bias: torch.Tensor,
-                                   output_shape: list,
-                                   **kwargs) -> torch.Tensor:
+def torch_per_token_w8a8_scaled_mm(
+    *,
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+    output_shape: list,
+    **kwargs,
+) -> torch.Tensor:
     # Note: Callers of this function should check USE_ROWWISE_TORCH_SCALED_MM
     #  when using it.
     #  For now it has only been validated on ROCm platform.
@@ -260,25 +293,31 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor,
     #  rowwise scaled GEMM before using it
 
     # Fused GEMM_DQ Rowwise GEMM
-    output = torch._scaled_mm(qinput,
-                              weight,
-                              out_dtype=out_dtype,
-                              scale_a=scale_a,
-                              scale_b=scale_b.t(),
-                              bias=bias)
+    output = torch._scaled_mm(
+        qinput,
+        weight,
+        out_dtype=out_dtype,
+        scale_a=scale_a,
+        scale_b=scale_b.t(),
+        bias=bias,
+    )
 
     output = torch.narrow(output, 0, 0, qinput.shape[0])
     output = output.view(*output_shape)
     return output
 
 
-def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor,
-                                     weight: torch.Tensor,
-                                     out_dtype: torch.dtype,
-                                     scale_a: torch.Tensor,
-                                     scale_b: torch.Tensor, bias: torch.Tensor,
-                                     output_shape: list,
-                                     **kwargs) -> torch.Tensor:
+def torch_channelwise_w8a8_scaled_mm(
+    *,
+    qinput: torch.Tensor,
+    weight: torch.Tensor,
+    out_dtype: torch.dtype,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    bias: torch.Tensor,
+    output_shape: list,
+    **kwargs,
+) -> torch.Tensor:
     # Use unfused DQ due to limitations with scaled_mm
 
     # Symmetric quantized GEMM by definition computes the following:
@@ -296,11 +335,13 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor,
     # GEMM
     # This computes C = (X * W).
     # Output in fp32 to allow subsequent ops to happen in-place
-    output = torch._scaled_mm(qinput,
-                              weight,
-                              scale_a=TORCH_DEVICE_IDENTITY,
-                              scale_b=TORCH_DEVICE_IDENTITY,
-                              out_dtype=torch.float32)
+    output = torch._scaled_mm(
+        qinput,
+        weight,
+        scale_a=TORCH_DEVICE_IDENTITY,
+        scale_b=TORCH_DEVICE_IDENTITY,
+        out_dtype=torch.float32,
+    )
     # A fix for discrepancy in scaled_mm which returns tuple
     # for torch < 2.5 and a single value in torch >= 2.5
     if type(output) is tuple and len(output) == 2:
@@ -318,9 +359,8 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor,
 
 
 def dispatch_w8a8_scaled_mm(
-        preferred_backend: str, per_tensor_weights: bool,
-        per_tensor_activations: bool) -> Callable[..., torch.Tensor]:
-
+    preferred_backend: str, per_tensor_weights: bool, per_tensor_activations: bool
+) -> Callable[..., torch.Tensor]:
     if per_tensor_weights and per_tensor_activations:
         if preferred_backend == "rocm":
             return rocm_per_tensor_w8a8_scaled_mm
@@ -335,8 +375,11 @@ def dispatch_w8a8_scaled_mm(
         return cutlass_w8a8_scaled_mm
 
     # If torch.scaled_mm supports per-channel (weights) per-token (inputs)
-    if not per_tensor_weights and not per_tensor_activations \
-            and USE_ROWWISE_TORCH_SCALED_MM:
+    if (
+        not per_tensor_weights
+        and not per_tensor_activations
+        and USE_ROWWISE_TORCH_SCALED_MM
+    ):
         return torch_per_token_w8a8_scaled_mm
     # Normally, torch.scaled_mm supports per tensor weights + activations only
     # so fallback to naive if per channel or per token
@@ -353,15 +396,16 @@ class Fp8LinearOp:
     in the __init__ method, as reading config is not allowed inside forward.
     """
 
-    def __init__(self,
-                 act_quant_static: bool,
-                 act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR,
-                 pad_output: Optional[bool] = None):
+    def __init__(
+        self,
+        act_quant_static: bool,
+        act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR,
+        pad_output: bool | None = None,
+    ):
         if current_platform.is_rocm():
             self.preferred_backend = "rocm"
         elif current_platform.is_cuda() and cutlass_fp8_supported():
-            if has_flashinfer() and current_platform.has_device_capability(
-                    100):
+            if has_flashinfer() and current_platform.has_device_capability(100):
                 self.preferred_backend = "flashinfer"
             else:
                 self.preferred_backend = "cutlass"
@@ -375,25 +419,29 @@ class Fp8LinearOp:
         # as it breaks with dynamic shapes.
         if pad_output is None:
             config = get_current_vllm_config().compilation_config
-            pad_output = config.level < CompilationLevel.PIECEWISE and \
-                         self.preferred_backend == "torch"
+            pad_output = (
+                config.mode < CompilationMode.VLLM_COMPILE
+                and self.preferred_backend == "torch"
+            )
 
         self.output_padding = 17 if pad_output else None
         self.act_quant_static = act_quant_static
         self.act_quant_group_shape = act_quant_group_shape
-        self.quant_fp8 = QuantFP8(static=act_quant_static,
-                                  group_shape=act_quant_group_shape,
-                                  num_token_padding=self.output_padding)
+        self.quant_fp8 = QuantFP8(
+            static=act_quant_static,
+            group_shape=act_quant_group_shape,
+            num_token_padding=self.output_padding,
+        )
 
     def apply(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
-        out_dtype: Optional[torch.dtype] = None,
-        input_scale: Optional[torch.Tensor] = None,
-        input_scale_ub: Optional[torch.Tensor] = None,
-        bias: Optional[torch.Tensor] = None,
+        out_dtype: torch.dtype | None = None,
+        input_scale: torch.Tensor | None = None,
+        input_scale_ub: torch.Tensor | None = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # ops.scaled_fp8_quant supports both dynamic and static quant.
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
@@ -417,28 +465,38 @@ class Fp8LinearOp:
         else:
             qinput, x_scale = input_2d, input_scale
 
-        per_tensor_weights = (weight_scale.numel() == 1)
-        per_tensor_activations = (x_scale.numel() == 1)
+        # Must have dim() conditions
+        # In per-token quant scenario, when the number of token is 1,
+        # the scale will only have 1 elements.
+        # Without checking the dim(),
+        # we cannot distingushes between per-tensor and per-token quant.
+        # Example:
+        # When the number of token is 1, per-token scale is [[1]]
+        # When per-tensor scale is [1] or ().
+        per_tensor_weights = weight_scale.numel() == 1
+        per_tensor_activations = (x_scale.numel() == 1) and x_scale.dim() < 2
 
         # TODO(luka) do this dispatch during init (after ScaledMM refactor)
-        w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(self.preferred_backend,
-                                                      per_tensor_weights,
-                                                      per_tensor_activations)
+        w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(
+            self.preferred_backend, per_tensor_weights, per_tensor_activations
+        )
 
-        return w8a8_scaled_mm_func(qinput=qinput,
-                                   weight=weight,
-                                   out_dtype=out_dtype,
-                                   scale_a=x_scale,
-                                   scale_b=weight_scale,
-                                   bias=bias,
-                                   output_shape=output_shape)
+        return w8a8_scaled_mm_func(
+            qinput=qinput,
+            weight=weight,
+            out_dtype=out_dtype,
+            scale_a=x_scale,
+            scale_b=weight_scale,
+            bias=bias,
+            output_shape=output_shape,
+        )
 
 
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    input_scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     assert weight.dtype == torch.float8_e4m3fn
     # The bits pattern 10000000(-128) represents zero in e4m3fn
     # but NaN in e4m3fnuz. So here we set it to 0.
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index 3f2d571777c0098c05a7be6fad428626d7365eda..c9fa8054625e472fc24666d6c82d7578fca4225d 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -32,9 +32,10 @@ related helpers for sincos positional embeddings.
 
 Example models: Qwen (Qwen-VL), MiniCPM-V 2.0
 """
+
 import math
+from collections.abc import Callable
 from functools import partial
-from typing import Callable, Optional, Union
 
 import numpy as np
 import torch
@@ -47,8 +48,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
 
 
-def get_abs_pos(abs_pos: torch.Tensor, tgt_size: Union[torch.Tensor,
-                                                       int]) -> torch.Tensor:
+def get_abs_pos(abs_pos: torch.Tensor, tgt_size: torch.Tensor | int) -> torch.Tensor:
     # abs_pos: L, C
     # tgt_size: (H, W)
     # return: M, C
@@ -56,21 +56,26 @@ def get_abs_pos(abs_pos: torch.Tensor, tgt_size: Union[torch.Tensor,
     dtype = abs_pos.dtype
     if isinstance(tgt_size, int):
         tgt_size = (tgt_size, tgt_size)
-    if (src_size == tgt_size[0] and src_size == tgt_size[1]):
+    if src_size == tgt_size[0] and src_size == tgt_size[1]:
         return abs_pos
-    return (F.interpolate(
-        abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
-        size=(tgt_size[0], tgt_size[1]),
-        mode="bicubic",
-        align_corners=False,
-    ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype))
+    return (
+        F.interpolate(
+            abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
+            size=(tgt_size[0], tgt_size[1]),
+            mode="bicubic",
+            align_corners=False,
+        )
+        .permute(0, 2, 3, 1)
+        .flatten(0, 2)
+        .to(dtype=dtype)
+    )
 
 
 # sin/cos positional embedding helpers are adapted from:
 # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
 def get_1d_sincos_pos_embed_from_grid(
-    embed_dim: int, pos: np.ndarray,
-    version: tuple[int, int] = (2, 0)) -> torch.Tensor:
+    embed_dim: int, pos: np.ndarray, version: tuple[int, int] = (2, 0)
+) -> torch.Tensor:
     """
     embed_dim: output dimension for each position
     pos: a list of positions to be encoded: size (M,) / (H, W)
@@ -96,15 +101,17 @@ def get_1d_sincos_pos_embed_from_grid(
 
 
 def get_2d_sincos_pos_embed_from_grid(
-    embed_dim: int, grid: np.ndarray,
-    version: tuple[int, int] = (2, 0)) -> torch.Tensor:
+    embed_dim: int, grid: np.ndarray, version: tuple[int, int] = (2, 0)
+) -> torch.Tensor:
     assert embed_dim % 2 == 0
 
     # use half of dimensions to encode grid_h
     emb_h = get_1d_sincos_pos_embed_from_grid(
-        embed_dim // 2, grid[0], version)  # (H*W, D/2) or (H, W, D/2)
+        embed_dim // 2, grid[0], version
+    )  # (H*W, D/2) or (H, W, D/2)
     emb_w = get_1d_sincos_pos_embed_from_grid(
-        embed_dim // 2, grid[1], version)  # (H*W, D/2) or (H, W, D/2)
+        embed_dim // 2, grid[1], version
+    )  # (H*W, D/2) or (H, W, D/2)
 
     if version == (2, 0):
         emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
@@ -114,10 +121,10 @@ def get_2d_sincos_pos_embed_from_grid(
 
 
 def get_2d_sincos_pos_embed(
-        embed_dim: int,
-        grid_size: Union[int, tuple[int, int]],
-        cls_token: bool = False,
-        version: tuple[int, int] = (2, 0),
+    embed_dim: int,
+    grid_size: int | tuple[int, int],
+    cls_token: bool = False,
+    version: tuple[int, int] = (2, 0),
 ) -> torch.Tensor:
     """
     grid_size: int of the grid height and width
@@ -134,15 +141,13 @@ def get_2d_sincos_pos_embed(
     grid_w = np.arange(grid_w_size, dtype=np.float32)
     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
     grid = np.stack(grid, axis=0)
-    assert isinstance(grid, np.ndarray) and \
-        grid.shape == (2, grid_h_size, grid_w_size)
+    assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
 
     if version == (2, 0):
         grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
         pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
         if cls_token:
-            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed],
-                                       axis=0)
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
     else:
         pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
     return pos_embed
@@ -156,15 +161,17 @@ class BaseResampler(nn.Module):
         A tensor with the shape of (grid_size**2, embed_dim)
     """
 
-    def __init__(self,
-                 num_queries: int,
-                 embed_dim: int,
-                 num_heads: int,
-                 kv_dim: Optional[int] = None,
-                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-                 do_post_projection: bool = True,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: int | None = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        do_post_projection: bool = True,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
 
         self.num_queries = num_queries
@@ -174,14 +181,16 @@ class BaseResampler(nn.Module):
         self.query = nn.Parameter(torch.empty(self.num_queries, embed_dim))
 
         if kv_dim is not None and kv_dim != embed_dim:
-            self.kv_proj = ReplicatedLinear(kv_dim,
-                                            embed_dim,
-                                            bias=False,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.kv_proj")
+            self.kv_proj = ReplicatedLinear(
+                kv_dim,
+                embed_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.kv_proj",
+            )
         else:
             # Maintain the same return value with ReplicatedLinear.forward
-            self.kv_proj = lambda *args, **kwargs: (  # type: ignore # noqa 
+            self.kv_proj = lambda *args, **kwargs: (  # type: ignore # noqa
                 nn.Identity()(*args, **kwargs),
                 None,
             )
@@ -189,10 +198,10 @@ class BaseResampler(nn.Module):
         self.ln_q = norm_layer(embed_dim)
         self.ln_kv = norm_layer(embed_dim)
         self.do_post_projection = do_post_projection
-        self.ln_post = norm_layer(embed_dim) if do_post_projection else None
-        self.proj = nn.Parameter(
-            (embed_dim**-0.5) *
-            torch.empty(embed_dim, embed_dim)) if do_post_projection else None
+        if self.do_post_projection:
+            self.ln_post = norm_layer(embed_dim)
+            data = (embed_dim**-0.5) * torch.empty(embed_dim, embed_dim)
+            self.proj = nn.Parameter(data=data)
 
     def _repeat(self, query, N: int):
         return query.unsqueeze(1).repeat(1, N, 1)
@@ -206,51 +215,55 @@ class Resampler2(BaseResampler):
     present in minicpmv2.0, but not qwen-vl.
     """
 
-    def __init__(self,
-                 grid_size: int,
-                 embed_dim: int,
-                 num_heads: int,
-                 kv_dim: Optional[int] = None,
-                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-                 adaptive: bool = False,
-                 do_post_projection: bool = True,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
-        super().__init__(grid_size**2,
-                         embed_dim,
-                         num_heads,
-                         kv_dim,
-                         norm_layer,
-                         do_post_projection=do_post_projection,
-                         quant_config=quant_config,
-                         prefix=prefix)
+    def __init__(
+        self,
+        grid_size: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: int | None = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        adaptive: bool = False,
+        do_post_projection: bool = True,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            grid_size**2,
+            embed_dim,
+            num_heads,
+            kv_dim,
+            norm_layer,
+            do_post_projection=do_post_projection,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
 
         self.adaptive = adaptive
-        pos_embed_arr = get_2d_sincos_pos_embed(embed_dim,
-                                                grid_size,
-                                                version=(2, 0))
+        pos_embed_arr = get_2d_sincos_pos_embed(embed_dim, grid_size, version=(2, 0))
 
         self.pos_embed = nn.Parameter(
-            torch.from_numpy(pos_embed_arr).requires_grad_(False))
+            torch.from_numpy(pos_embed_arr).requires_grad_(False)
+        )
 
     def forward(
         self,
         x: torch.Tensor,
-        tgt_sizes: Optional[torch.Tensor] = None,
-        attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: torch.Tensor | None = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if tgt_sizes is None:
             tgt_sizes = int(math.sqrt(x.size(1)))
         if self.adaptive:
-            pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim,
-                                                    tgt_sizes,
-                                                    version=(2, 0))
-            pos_embed = torch.from_numpy(pos_embed_arr).to(device=x.device,
-                                                           dtype=x.dtype)
+            pos_embed_arr = get_2d_sincos_pos_embed(
+                self.embed_dim, tgt_sizes, version=(2, 0)
+            )
+            pos_embed = torch.from_numpy(pos_embed_arr).to(
+                device=x.device, dtype=x.dtype
+            )
         else:
-            pos_embed = get_abs_pos(self.pos_embed,
-                                    tgt_sizes).to(device=x.device,
-                                                  dtype=x.dtype)
+            pos_embed = get_abs_pos(self.pos_embed, tgt_sizes).to(
+                device=x.device, dtype=x.dtype
+            )
 
         x, _ = self.kv_proj(x)
         x = self.ln_kv(x).permute(1, 0, 2)
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index 3576368981c7c0c4b6c961dae694c0dec4b78d3a..56c165f9c041aa3cf555ad458b7f926f1ae12dfe 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Rotary Positional Embeddings."""
-from typing import Any, Optional
+
+from typing import Any
 
 import torch
 
@@ -27,18 +28,17 @@ def get_rope(
     max_position: int,
     base: float,
     is_neox_style: bool = True,
-    rope_scaling: Optional[dict[str, Any]] = None,
-    dtype: Optional[torch.dtype] = None,
+    rope_scaling: dict[str, Any] | None = None,
+    dtype: torch.dtype | None = None,
     partial_rotary_factor: float = 1.0,
-    dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+    dual_chunk_attention_config: dict[str, Any] | None = None,
 ) -> RotaryEmbedding:
     if dtype is None:
         dtype = torch.get_default_dtype()
     if rope_scaling is not None:
         # Transforms every value that is a list into a tuple for caching calls
         rope_scaling_tuple = {
-            k: tuple(v) if isinstance(v, list) else v
-            for k, v in rope_scaling.items()
+            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
         }
         rope_scaling_args = tuple(rope_scaling_tuple.items())
     else:
@@ -56,8 +56,16 @@ def get_rope(
 
     if partial_rotary_factor < 1.0:
         rotary_dim = int(rotary_dim * partial_rotary_factor)
-    key = (head_size, rotary_dim, max_position, base, is_neox_style,
-           rope_scaling_args, dual_chunk_attention_args, dtype)
+    key = (
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling_args,
+        dual_chunk_attention_args,
+        dtype,
+    )
     if key in _ROPE_DICT:
         return _ROPE_DICT[key]
 
@@ -67,13 +75,19 @@ def get_rope(
             for k, v in dual_chunk_attention_config.items()
             if k in ("chunk_size", "local_size")
         }
-        rotary_emb = DualChunkRotaryEmbedding(head_size, rotary_dim,
-                                              max_position, base,
-                                              is_neox_style, dtype,
-                                              **extra_kwargs)
+        rotary_emb = DualChunkRotaryEmbedding(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_style,
+            dtype,
+            **extra_kwargs,
+        )
     elif not rope_scaling:
-        rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
-                                     is_neox_style, dtype)
+        rotary_emb = RotaryEmbedding(
+            head_size, rotary_dim, max_position, base, is_neox_style, dtype
+        )
     else:
         scaling_type = rope_scaling["rope_type"]
 
@@ -81,18 +95,23 @@ def get_rope(
             scaling_factor = rope_scaling["factor"]
             low_freq_factor = rope_scaling["low_freq_factor"]
             high_freq_factor = rope_scaling["high_freq_factor"]
-            original_max_position = rope_scaling[
-                "original_max_position_embeddings"]
-            rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim,
-                                               max_position, base,
-                                               is_neox_style, dtype,
-                                               scaling_factor, low_freq_factor,
-                                               high_freq_factor,
-                                               original_max_position)
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            rotary_emb = Llama3RotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                dtype,
+                scaling_factor,
+                low_freq_factor,
+                high_freq_factor,
+                original_max_position,
+            )
         elif scaling_type == "mllama4":
-            rotary_emb = Llama4VisionRotaryEmbedding(head_size, rotary_dim,
-                                                     max_position, base,
-                                                     is_neox_style, dtype)
+            rotary_emb = Llama4VisionRotaryEmbedding(
+                head_size, rotary_dim, max_position, base, is_neox_style, dtype
+            )
         elif scaling_type == "default":
             if "mrope_section" in rope_scaling:
                 rotary_emb = MRotaryEmbedding(
@@ -103,8 +122,7 @@ def get_rope(
                     is_neox_style,
                     dtype,
                     mrope_section=rope_scaling["mrope_section"],
-                    mrope_interleaved=rope_scaling.get("mrope_interleaved",
-                                                       False),
+                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
                 )
             else:
                 rotary_emb = RotaryEmbedding(
@@ -117,43 +135,72 @@ def get_rope(
                 )
         elif scaling_type == "linear":
             scaling_factor = rope_scaling["factor"]
-            rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim,
-                                                      max_position, base,
-                                                      is_neox_style,
-                                                      scaling_factor, dtype)
+            rotary_emb = LinearScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+            )
         elif scaling_type == "ntk":
             scaling_factor = rope_scaling["factor"]
-            mixed_b = rope_scaling.get('mixed_b', None)
-            rotary_emb = NTKScalingRotaryEmbedding(head_size, rotary_dim,
-                                                   max_position, base,
-                                                   is_neox_style,
-                                                   scaling_factor, dtype,
-                                                   mixed_b)
+            mixed_b = rope_scaling.get("mixed_b", None)
+            rotary_emb = NTKScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+                mixed_b,
+            )
         elif scaling_type == "dynamic":
             if "alpha" in rope_scaling:
                 scaling_alpha = rope_scaling["alpha"]
                 rotary_emb = DynamicNTKAlphaRotaryEmbedding(
-                    head_size, rotary_dim, max_position, base, is_neox_style,
-                    scaling_alpha, dtype)
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    scaling_alpha,
+                    dtype,
+                )
             elif "factor" in rope_scaling:
                 scaling_factor = rope_scaling["factor"]
                 rotary_emb = DynamicNTKScalingRotaryEmbedding(
-                    head_size, rotary_dim, max_position, base, is_neox_style,
-                    scaling_factor, dtype)
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                )
             else:
-                raise ValueError("Dynamic rope scaling must contain either "
-                                 "'alpha' or 'factor' field")
+                raise ValueError(
+                    "Dynamic rope scaling must contain either 'alpha' or 'factor' field"
+                )
         elif scaling_type == "yarn":
             scaling_factor = rope_scaling["factor"]
-            original_max_position = rope_scaling[
-                "original_max_position_embeddings"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
             extra_kwargs = {
                 k: v
                 for k, v in rope_scaling.items()
-                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
-                         "beta_slow")
+                if k
+                in (
+                    "extrapolation_factor",
+                    "attn_factor",
+                    "beta_fast",
+                    "beta_slow",
+                    "apply_yarn_scaling",
+                )
             }
             if "mrope_section" in rope_scaling:
+                extra_kwargs.pop("apply_yarn_scaling", None)
                 rotary_emb = MRotaryEmbedding(
                     head_size,
                     rotary_dim,
@@ -162,42 +209,69 @@ def get_rope(
                     is_neox_style,
                     dtype,
                     mrope_section=rope_scaling["mrope_section"],
-                    mrope_interleaved=rope_scaling.get("mrope_interleaved",
-                                                       False),
+                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
                     scaling_factor=scaling_factor,
-                    **extra_kwargs)
+                    **extra_kwargs,
+                )
             else:
                 rotary_emb = YaRNScalingRotaryEmbedding(
-                    head_size, rotary_dim, original_max_position, base,
-                    is_neox_style, scaling_factor, dtype, **extra_kwargs)
+                    head_size,
+                    rotary_dim,
+                    original_max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                    **extra_kwargs,
+                )
         elif scaling_type == "deepseek_yarn":
             scaling_factor = rope_scaling["factor"]
-            original_max_position = rope_scaling[
-                "original_max_position_embeddings"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
             # assert max_position == original_max_position * scaling_factor
             extra_kwargs = {
                 k: v
                 for k, v in rope_scaling.items()
-                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
-                         "beta_slow", "mscale", "mscale_all_dim")
+                if k
+                in (
+                    "extrapolation_factor",
+                    "attn_factor",
+                    "beta_fast",
+                    "beta_slow",
+                    "mscale",
+                    "mscale_all_dim",
+                )
             }
             rotary_emb = DeepseekScalingRotaryEmbedding(
-                head_size, rotary_dim, original_max_position, base,
-                is_neox_style, scaling_factor, dtype, **extra_kwargs)
+                head_size,
+                rotary_dim,
+                original_max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+                **extra_kwargs,
+            )
         elif scaling_type == "longrope":
             short_factor = rope_scaling["short_factor"]
             long_factor = rope_scaling["long_factor"]
-            original_max_position = rope_scaling[
-                "original_max_position_embeddings"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
             extra_kwargs = {
                 k: v
                 for k, v in rope_scaling.items()
                 if k in ("short_mscale", "long_mscale")
             }
             rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
-                head_size, rotary_dim, max_position, original_max_position,
-                base, is_neox_style, dtype, short_factor, long_factor,
-                **extra_kwargs)
+                head_size,
+                rotary_dim,
+                max_position,
+                original_max_position,
+                base,
+                is_neox_style,
+                dtype,
+                short_factor,
+                long_factor,
+                **extra_kwargs,
+            )
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
     _ROPE_DICT[key] = rotary_emb
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index 0cf634f82a8ab0951755e46a7d9377cf12c26536..ce4f40680b0a31ee61dd33e55a68948118cd15ad 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -1,19 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Rotary Positional Embeddings Base Class."""
-from typing import Optional
 
 import torch
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.custom_op import CustomOp
 
 from .common import apply_rotary_emb_torch
-from .rocm_aiter_rope_ops import (is_rocm_triton_rotary_embedding_enabled,
-                                  rocm_aiter_rotary_emb)
 
 
 @CustomOp.register("rotary_embedding")
-class RotaryEmbedding(CustomOp):
+class RotaryEmbeddingBase(CustomOp):
     """Original rotary positional embedding."""
 
     def __init__(
@@ -47,8 +45,9 @@ class RotaryEmbedding(CustomOp):
             cache = cache.to(dtype)
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
-        self.is_rocm_triton_rotary_embedding_enabled = \
-            is_rocm_triton_rotary_embedding_enabled()
+        self.is_rocm_triton_rotary_embed_enabled = (
+            rocm_aiter_ops.is_triton_rotary_embed_enabled()
+        )
 
     def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
@@ -56,8 +55,12 @@ class RotaryEmbedding(CustomOp):
         # use CPU to compute the cache and then move it to GPU. However, we
         # create the cache on GPU for faster initialization. This may cause
         # a slight numerical difference between the HF implementation and ours.
-        inv_freq = 1.0 / (base**(torch.arange(
-            0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
         return inv_freq
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
@@ -74,87 +77,136 @@ class RotaryEmbedding(CustomOp):
     def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None:
         # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
         # is expensive, so avoid calling it if possible
-        if self.cos_sin_cache.device != query.device or \
-            self.cos_sin_cache.dtype != query.dtype:
-            self.cos_sin_cache = self.cos_sin_cache.to(query.device,
-                                                       dtype=query.dtype)
+        if (
+            self.cos_sin_cache.device != query.device
+            or self.cos_sin_cache.dtype != query.dtype
+        ):
+            self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
 
-    def forward_native(
+
+class RotaryEmbedding(RotaryEmbeddingBase):
+    def __init__(
         self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: float,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    @staticmethod
+    def forward_static(
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None,
+        head_size: int,
+        rotary_dim: int,
+        cos_sin_cache: torch.Tensor,
+        is_neox_style: bool,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """A PyTorch-native implementation of forward()."""
         positions = positions.flatten()
         num_tokens = positions.shape[0]
-        cos_sin = self.cos_sin_cache.index_select(0, positions)
+        cos_sin = cos_sin_cache.index_select(0, positions)
         cos, sin = cos_sin.chunk(2, dim=-1)
 
         query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        query_rot = apply_rotary_emb_torch(query_rot, cos, sin,
-                                           self.is_neox_style)
+        query = query.view(num_tokens, -1, head_size)
+        query_rot = query[..., :rotary_dim]
+        query_pass = query[..., rotary_dim:]
+        query_rot = apply_rotary_emb_torch(query_rot, cos, sin, is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         # key may be None in some cases, e.g. cross-layer KV sharing
         if key is not None:
             key_shape = key.shape
-            key = key.view(num_tokens, -1, self.head_size)
-            key_rot = key[..., :self.rotary_dim]
-            key_pass = key[..., self.rotary_dim:]
-            key_rot = apply_rotary_emb_torch(key_rot, cos, sin,
-                                             self.is_neox_style)
+            key = key.view(num_tokens, -1, head_size)
+            key_rot = key[..., :rotary_dim]
+            key_pass = key[..., rotary_dim:]
+            key_rot = apply_rotary_emb_torch(key_rot, cos, sin, is_neox_style)
             key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """A PyTorch-native implementation of forward()."""
+        return self.forward_static(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.rotary_dim,
+            self.cos_sin_cache,
+            self.is_neox_style,
+        )
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if self.use_flashinfer:
-            torch.ops.vllm.flashinfer_rotary_embedding(positions, query, key,
-                                                       self.head_size,
-                                                       self.cos_sin_cache,
-                                                       self.is_neox_style)
+            torch.ops.vllm.flashinfer_rotary_embedding(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
             return query, key
 
         from vllm import _custom_ops as ops
+
         self._match_cos_sin_cache_dtype(query)
 
         # ops.rotary_embedding() is an in-place operation
         # that updates the query and key tensors.
-        ops.rotary_embedding(positions, query, key, self.head_size,
-                             self.cos_sin_cache, self.is_neox_style)
+        ops.rotary_embedding(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.cos_sin_cache,
+            self.is_neox_style,
+        )
         return query, key
 
     def forward_hip(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        if self.is_rocm_triton_rotary_embedding_enabled:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if self.is_rocm_triton_rotary_embed_enabled:
             self._match_cos_sin_cache_dtype(query)
-            rocm_aiter_rotary_emb(positions, query, key, self.cos_sin_cache,
-                                  self.head_size, self.rotary_dim,
-                                  self.is_neox_style)
-        else:
-            # ops.rotary_embedding() is an in-place operation
-            # that updates the query and key tensors.
-            self.forward_cuda(positions, query, key)
-        return query, key
+            rocm_aiter_ops.triton_rotary_embed(
+                positions,
+                query,
+                key,
+                self.cos_sin_cache,
+                self.head_size,
+                self.rotary_dim,
+                self.is_neox_style,
+            )
+            return query, key
+        return self.forward_cuda(positions, query, key)
 
     def forward_xpu(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         from vllm._ipex_ops import ipex_ops as ops
 
         self._match_cos_sin_cache_dtype(query)
@@ -166,8 +218,14 @@ class RotaryEmbedding(CustomOp):
             # ipex.llm.functional.rotary_embedding_batched
             return self.forward_native(positions, query, key)
         else:
-            ops.rotary_embedding(positions, query, key, self.head_size,
-                                 self.cos_sin_cache, self.is_neox_style)
+            ops.rotary_embedding(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
         return query, key
 
     def extra_repr(self) -> str:
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
index 8619651067746c3d9151aaa255d4dbd43c848349..196533b6179590462e05bc964f738bbf59a0f2dc 100644
--- a/vllm/model_executor/layers/rotary_embedding/common.py
+++ b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -2,20 +2,26 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
+from collections.abc import Callable
+from functools import cache
+from importlib.util import find_spec
 
 import torch
 
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 
 if current_platform.is_cuda():
     from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
 
+logger = init_logger(__name__)
+
 
 # common functions
 def rotate_neox(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., :x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
 
@@ -47,9 +53,9 @@ def apply_rotary_emb_torch(
         return torch.stack((o1, o2), dim=-1).flatten(-2)
 
 
-def apply_rotary_emb_dispatch(x: torch.Tensor, cos: torch.Tensor,
-                              sin: torch.Tensor,
-                              is_neox_style: bool) -> torch.Tensor:
+def apply_rotary_emb_dispatch(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, is_neox_style: bool
+) -> torch.Tensor:
     """
     Args:
         x: [num_tokens, num_heads, head_size]
@@ -59,39 +65,71 @@ def apply_rotary_emb_dispatch(x: torch.Tensor, cos: torch.Tensor,
             positional embeddings.
     """
     if current_platform.is_cuda():
-        return apply_rotary_emb(x.unsqueeze(0), cos, sin,
-                                not is_neox_style).squeeze(0)
+        return apply_rotary_emb(x.unsqueeze(0), cos, sin, not is_neox_style).squeeze(0)
     else:
         return apply_rotary_emb_torch(x, cos, sin, is_neox_style)
 
 
+@cache
+def dispatch_rotary_emb_function(
+    default: Callable[..., torch.Tensor] | None = None,
+) -> Callable[..., torch.Tensor]:
+    if current_platform.is_cuda():
+        return apply_rotary_emb
+
+    # if torch compile is not enabled
+    # use rotary embedding function from flash_attn package
+    # otherwise use the naive pytorch embedding implementation
+    # is faster when torch compile is enabled.
+    if current_platform.is_rocm() and not torch.compiler.is_compiling():
+        if find_spec("flash_attn") is not None:
+            from flash_attn.ops.triton.rotary import apply_rotary
+
+            return apply_rotary
+        else:
+            logger.warning(
+                "flash_attn is not installed. Falling back to PyTorch "
+                "implementation for rotary embeddings."
+            )
+    if default is not None:
+        return default
+
+    return apply_rotary_emb_torch
+
+
 # yarn functions
 # Inverse dim formula to find dim based on number of rotations
-def yarn_find_correction_dim(num_rotations: int,
-                             dim: int,
-                             base: float = 10000,
-                             max_position_embeddings: int = 2048) -> float:
-    return (dim * math.log(max_position_embeddings /
-                           (num_rotations * 2 * math.pi))) / (2 *
-                                                              math.log(base))
+def yarn_find_correction_dim(
+    num_rotations: int,
+    dim: int,
+    base: float = 10000,
+    max_position_embeddings: int = 2048,
+) -> float:
+    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
+        2 * math.log(base)
+    )
 
 
 # Find dim range bounds based on rotations
 def yarn_find_correction_range(
-        low_rot: int,
-        high_rot: int,
-        dim: int,
-        base: float = 10000,
-        max_position_embeddings: int = 2048) -> tuple[int, int]:
+    low_rot: int,
+    high_rot: int,
+    dim: int,
+    base: float = 10000,
+    max_position_embeddings: int = 2048,
+) -> tuple[int, int]:
     low = math.floor(
-        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
+        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
+    )
     high = math.ceil(
-        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings))
+        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
+    )
     return max(low, 0), min(high, dim - 1)  # Clamp values just in case
 
 
-def yarn_linear_ramp_mask(low: float, high: float, dim: int,
-                          dtype: torch.dtype) -> torch.Tensor:
+def yarn_linear_ramp_mask(
+    low: float, high: float, dim: int, dtype: torch.dtype
+) -> torch.Tensor:
     if low == high:
         high += 0.001  # Prevent singularity
 
@@ -115,7 +153,7 @@ def _flashinfer_rotary_embedding(
     is_neox: bool,
 ) -> None:
     """Custom op wrapper for flashinfer's rotary embedding.
-    
+
     This is an in-place operation that modifies query and key tensors directly.
     """
     from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index 736ec2c1dd3a35bd4c19d789632f21603843bafc..e72834e473c1559866eb6daaa6407c2743ed9c4f 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -2,15 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 
 from vllm.platforms import current_platform
 
-from .base import RotaryEmbedding
-from .common import (rotate_gptj, rotate_neox, yarn_find_correction_range,
-                     yarn_linear_ramp_mask)
+from .base import RotaryEmbeddingBase
+from .common import (
+    rotate_gptj,
+    rotate_neox,
+    yarn_find_correction_range,
+    yarn_linear_ramp_mask,
+)
 
 
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
@@ -19,7 +22,7 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     return 0.1 * mscale * math.log(scale) + 1.0
 
 
-class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
+class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase):
     """RotaryEmbedding extended with YaRN method.
 
     Credits to Peng et al. github.com/jquesnelle/yarn
@@ -49,42 +52,56 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         self.beta_slow = beta_slow
         # Get n-d magnitude scaling corrected for interpolation.
         self.mscale = float(
-            yarn_get_mscale(self.scaling_factor, float(mscale)) /
-            yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) *
-            attn_factor)
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+            yarn_get_mscale(self.scaling_factor, float(mscale))
+            / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
+            * attn_factor
+        )
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
-        pos_freqs = self.base**(
-            torch.arange(0,
-                         self.rotary_dim,
-                         2,
-                         dtype=torch.float,
-                         device=current_platform.device_type) /
-            self.rotary_dim)
+        pos_freqs = self.base ** (
+            torch.arange(
+                0,
+                self.rotary_dim,
+                2,
+                dtype=torch.float,
+                device=current_platform.device_type,
+            )
+            / self.rotary_dim
+        )
         inv_freq_extrapolation = 1.0 / pos_freqs
         inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
 
-        low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow,
-                                               self.rotary_dim, self.base,
-                                               self.max_position_embeddings)
+        low, high = yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
         # Get n-d rotational scaling corrected for extrapolation
-        inv_freq_mask = (1 - yarn_linear_ramp_mask(
-            low, high, self.rotary_dim // 2,
-            dtype=torch.float)) * self.extrapolation_factor
-        inv_freq = inv_freq_interpolation * (
-            1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
+        inv_freq_mask = (
+            1
+            - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
         return inv_freq
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
         inv_freq = self._compute_inv_freq(self.scaling_factor)
-        t = torch.arange(self.max_position_embeddings * self.scaling_factor,
-                         device=current_platform.device_type,
-                         dtype=torch.float32)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor,
+            device=current_platform.device_type,
+            dtype=torch.float32,
+        )
         freqs = torch.einsum("i,j -> ij", t, inv_freq)
-        cos = (freqs.cos() * self.mscale)
-        sin = (freqs.sin() * self.mscale)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
         cache = torch.cat((cos, sin), dim=-1)
         return cache
 
@@ -92,20 +109,21 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """PyTorch-native implementation equivalent to forward()."""
         assert key is not None
         self._match_cos_sin_cache_dtype(query)
-        query_rot = query[..., :self.rotary_dim]
-        key_rot = key[..., :self.rotary_dim]
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
         if self.rotary_dim < self.head_size:
-            query_pass = query[..., self.rotary_dim:]
-            key_pass = key[..., self.rotary_dim:]
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
 
-        cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
-                                     if offsets is not None else positions]
+        cos_sin = self.cos_sin_cache[
+            torch.add(positions, offsets) if offsets is not None else positions
+        ]
         cos, sin = cos_sin.chunk(2, dim=-1)
         if self.is_neox_style:
             # NOTE(woosuk): Here we assume that the positions tensor has the
@@ -128,11 +146,20 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
             key = key_rot
         return query, key
 
+    def forward_hip(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        return self.forward_native(positions, query, key, offsets)
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(positions, query, key, offsets)
diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
index 27e41dd0fa97e8cada833cb0cebfcd95275aba83..b5dd94cc7f53197ebb081f5e12aad7ca5ff58951 100644
--- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -35,18 +34,17 @@ class DualChunkRotaryEmbedding(CustomOp):
         self.local_size = local_size
         self.dtype = dtype
         self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
-        (q_cache, qc_cache, k_cache, qc_no_clamp_cache,
-         q_inter_cache) = self._compute_cos_sin_cache()
+        (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = (
+            self._compute_cos_sin_cache()
+        )
 
         self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
         self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
         self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
-        self.register_buffer("cos_sin_qc_no_clamp_cache",
-                             qc_no_clamp_cache,
-                             persistent=False)
-        self.register_buffer("cos_sin_q_inter_cache",
-                             q_inter_cache,
-                             persistent=False)
+        self.register_buffer(
+            "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
+        )
+        self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)
 
     def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
@@ -59,8 +57,12 @@ class DualChunkRotaryEmbedding(CustomOp):
         # use CPU to compute the cache and then move it to GPU. However, we
         # create the cache on GPU for faster initialization. This may cause
         # a slight numerical difference between the HF implementation and ours.
-        inv_freq = 1.0 / (base**(torch.arange(
-            0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
         return inv_freq
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
@@ -68,16 +70,15 @@ class DualChunkRotaryEmbedding(CustomOp):
         inv_freq = self._compute_inv_freq(self.base)
         chunk_len = self.chunk_size - self.local_size
         q_t = torch.arange(chunk_len, dtype=torch.float)
-        qc_t = (torch.arange(chunk_len, dtype=torch.float) +
-                chunk_len).clamp(max=self.chunk_size)
-        k_t = torch.arange(self.max_position_embeddings,
-                           dtype=torch.float) % chunk_len
+        qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
+            max=self.chunk_size
+        )
+        k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
 
         # count from chunk_len, no clamp(self.chunk_size) restriction
         qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
         # count from self.chunk_size for q_inter's rope
-        q_inter_t = torch.arange(chunk_len,
-                                 dtype=torch.float) + self.chunk_size
+        q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size
 
         q_freqs = torch.outer(q_t, inv_freq)
         qc_freqs = torch.outer(qc_t, inv_freq)
@@ -97,18 +98,21 @@ class DualChunkRotaryEmbedding(CustomOp):
         q_inter_cos = q_inter_freqs.cos()
         q_inter_sin = q_inter_freqs.sin()
 
-        q_cache = torch.cat((q_cos, q_sin), dim=-1).to(dtype=self.dtype,
-                                                       device=self.device)
-        qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(dtype=self.dtype,
-                                                          device=self.device)
-        k_cache = torch.cat((k_cos, k_sin), dim=-1).to(dtype=self.dtype,
-                                                       device=self.device)
-        qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin),
-                                      dim=-1).to(dtype=self.dtype,
-                                                 device=self.device)
-        q_inter_cache = torch.cat((q_inter_cos, q_inter_sin),
-                                  dim=-1).to(dtype=self.dtype,
-                                             device=self.device)
+        q_cache = torch.cat((q_cos, q_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        k_cache = torch.cat((k_cos, k_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
         return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
 
     def forward_native(
@@ -116,49 +120,63 @@ class DualChunkRotaryEmbedding(CustomOp):
         positions: torch.Tensor,
         query: torch.Tensor,
         key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
+        offsets: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         query = query.view(*query.shape[:-1], -1, self.head_size)
         key = key.view(*key.shape[:-1], -1, self.head_size)
-        query_rot = query[..., :self.rotary_dim]
-        key_rot = key[..., :self.rotary_dim]
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
         if self.rotary_dim < self.head_size:
-            query_pass = query[..., self.rotary_dim:]
-            key_pass = key[..., self.rotary_dim:]
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
         else:
             query_pass = None
             key_pass = None
 
-        positions_with_offsets = (torch.add(positions, offsets)
-                                  if offsets is not None else positions)
+        positions_with_offsets = (
+            torch.add(positions, offsets) if offsets is not None else positions
+        )
         key = self._apply_rotary_embedding(
-            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass)
+            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
+        )
         chunk_len = self.chunk_size - self.local_size
         query = self._apply_rotary_embedding(
             self.cos_sin_q_cache[positions_with_offsets % chunk_len],
-            query_rot, query_pass)
+            query_rot,
+            query_pass,
+        )
         query_succ = self._apply_rotary_embedding(
             self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
-            query_rot, query_pass)
+            query_rot,
+            query_pass,
+        )
         query_inter = self._apply_rotary_embedding(
             self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
-            query_rot, query_pass)
+            query_rot,
+            query_pass,
+        )
         query_succ_critical = self._apply_rotary_embedding(
             self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
-            query_rot, query_pass)
+            query_rot,
+            query_pass,
+        )
         query_inter_critical = self._apply_rotary_embedding(
             self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
-            query_rot, query_pass)
+            query_rot,
+            query_pass,
+        )
 
         # merge query into one tensor to simplify the interfaces
-        query = torch.cat((
-            query,
-            query_succ,
-            query_inter,
-            query_succ_critical,
-            query_inter_critical,
-        ),
-                          dim=-1)
+        query = torch.cat(
+            (
+                query,
+                query_succ,
+                query_inter,
+                query_succ_critical,
+                query_inter_critical,
+            ),
+            dim=-1,
+        )
         return query, key
 
     def forward_cuda(
@@ -166,7 +184,7 @@ class DualChunkRotaryEmbedding(CustomOp):
         positions: torch.Tensor,
         query: torch.Tensor,
         key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
+        offsets: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.forward_native(positions, query, key, offsets)
 
diff --git a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py
index 1da39bbd303bd893984df9ffd19d497118397380..dd9d06d4b288ff8cf0b48528f38c678459f716aa 100644
--- a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py
@@ -23,14 +23,16 @@ class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
         dtype: torch.dtype,
     ) -> None:
         self.scaling_alpha = scaling_alpha
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
         # For Hunyuan DynamicNTKAlphaRotaryEmbedding
         max_len = self.max_position_embeddings
-        base = self.base * self.scaling_alpha**(self.rotary_dim /
-                                                (self.rotary_dim - 2))
+        base = self.base * self.scaling_alpha ** (
+            self.rotary_dim / (self.rotary_dim - 2)
+        )
         inv_freq = self._compute_inv_freq(base)
         t = torch.arange(max_len, dtype=torch.float)
 
diff --git a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py
index ec2008b90cfb88274f61180df5d14fb863d108d3..28fd87ecc21fc303d9ced872c36b9b02619cd5cb 100644
--- a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py
@@ -44,8 +44,9 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
         dtype: torch.dtype,
     ) -> None:
         self.scaling_factor = scaling_factor
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
         # NOTE(woosuk): self.max_position_embeddings is the original
@@ -54,9 +55,9 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
         # self.max_position_embeddings * self.scaling_factor.
         max_len = self.max_position_embeddings * self.scaling_factor
         base = self.base * (
-            (self.scaling_factor * max_len / self.max_position_embeddings) -
-            (self.scaling_factor - 1))**(self.rotary_dim /
-                                         (self.rotary_dim - 2))
+            (self.scaling_factor * max_len / self.max_position_embeddings)
+            - (self.scaling_factor - 1)
+        ) ** (self.rotary_dim / (self.rotary_dim - 2))
         inv_freq = self._compute_inv_freq(base)
         t = torch.arange(max_len, dtype=torch.float)
 
diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
index 4960c20f4060a0cc1badf031f5838c8e8e1e3f37..749cdbe88a62ef11098f6957a5e607f057470e95 100644
--- a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -16,8 +15,8 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert positions.ndim == 1 or positions.ndim == 2
         assert key is not None
 
@@ -33,41 +32,37 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
             assert section_h == section_w
             # Split according to [h w h w h w h w... t t t...]
             section_cos_t = cos[..., -section_t:]
-            section_cos_h = cos[..., :section_h + section_w:2]
-            section_cos_w = cos[..., 1:section_h + section_w:2]
+            section_cos_h = cos[..., : section_h + section_w : 2]
+            section_cos_w = cos[..., 1 : section_h + section_w : 2]
 
-            cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[
-                1], section_cos_w[2]
-            cos_hw = torch.stack([cos_h, cos_w],
-                                 dim=-1).reshape(cos_h.shape[:-1] +
-                                                 (cos_h.shape[-1] * 2, ))
+            cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[1], section_cos_w[2]
+            cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape(
+                cos_h.shape[:-1] + (cos_h.shape[-1] * 2,)
+            )
             cos = torch.cat([cos_hw, cos_t], dim=-1)
 
             section_sin_t = sin[..., -section_t:]
-            section_sin_h = sin[..., :section_h + section_w:2]
-            section_sin_w = sin[..., 1:section_h + section_w:2]
+            section_sin_h = sin[..., : section_h + section_w : 2]
+            section_sin_w = sin[..., 1 : section_h + section_w : 2]
 
-            sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[
-                1], section_sin_w[2]
-            sin_hw = torch.stack([sin_h, sin_w],
-                                 dim=-1).reshape(sin_h.shape[:-1] +
-                                                 (sin_h.shape[-1] * 2, ))
+            sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[1], section_sin_w[2]
+            sin_hw = torch.stack([sin_h, sin_w], dim=-1).reshape(
+                sin_h.shape[:-1] + (sin_h.shape[-1] * 2,)
+            )
             sin = torch.cat([sin_hw, sin_t], dim=-1)
 
         query_shape = query.shape
         query = query.view(num_tokens, -1, self.head_size)
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin,
-                                              self.is_neox_style)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         key_shape = key.shape
         key = key.view(num_tokens, -1, self.head_size)
-        key_rot = key[..., :self.rotary_dim]
-        key_pass = key[..., self.rotary_dim:]
-        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin,
-                                            self.is_neox_style)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
@@ -75,6 +70,6 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        return self.forward_native(positions, query, key)
\ No newline at end of file
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        return self.forward_native(positions, query, key)
diff --git a/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
index 6e920991882d41ba8f81a30ae2da88919e544c7e..bb51dcf1c6f508be93340bcda06e3daa5fa83323 100644
--- a/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Union
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
@@ -65,14 +64,15 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding):
         max_position_embeddings: int,
         base: float,
         is_neox_style: bool,
-        scaling_factors: Union[list[float], float],
+        scaling_factors: list[float] | float,
         dtype: torch.dtype,
     ) -> None:
         if isinstance(scaling_factors, float):
             scaling_factors = [scaling_factors]
         self.scaling_factors: list[float] = scaling_factors  # noqa
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
         # Lazy initialized.
         self._scaling_factor_to_offset: dict[float, int]
 
diff --git a/vllm/model_executor/layers/rotary_embedding/llama3_rope.py b/vllm/model_executor/layers/rotary_embedding/llama3_rope.py
index adcef549bc4c24364d9f8287d78b0f0fc031734c..ed9a6031eb6f379bf2eb6fcf79b27e5c015f3c6b 100644
--- a/vllm/model_executor/layers/rotary_embedding/llama3_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/llama3_rope.py
@@ -9,7 +9,6 @@ from .base import RotaryEmbedding
 
 
 class Llama3RotaryEmbedding(RotaryEmbedding):
-
     def __init__(
         self,
         head_size: int,
@@ -27,8 +26,9 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         self.low_freq_factor = low_freq_factor
         self.high_freq_factor = high_freq_factor
         self.orig_max_position = orig_max_position
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_inv_freq(self, base: float) -> torch.Tensor:
         inv_freqs = super()._compute_inv_freq(base)
@@ -37,8 +37,9 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
 
         wave_len = 2 * math.pi / inv_freqs
         if self.low_freq_factor != self.high_freq_factor:
-            smooth = (self.orig_max_position / wave_len - self.low_freq_factor
-                      ) / (self.high_freq_factor - self.low_freq_factor)
+            smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / (
+                self.high_freq_factor - self.low_freq_factor
+            )
         else:
             smooth = 0
         new_freqs = torch.where(
@@ -47,8 +48,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
             torch.where(
                 wave_len > low_freq_wavelen,
                 inv_freqs / self.scaling_factor,
-                (1 - smooth) * inv_freqs / self.scaling_factor +
-                smooth * inv_freqs,
+                (1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs,
             ),
         )
         return new_freqs
diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
index 8717280353068767f8fe957349919906979970ce..9fdac309df7ee39cd56963d6afe5683f2c760c9b 100644
--- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
@@ -2,15 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 
-from .base import RotaryEmbedding
+from .base import RotaryEmbeddingBase
 
 
-class Llama4VisionRotaryEmbedding(RotaryEmbedding):
-
+class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase):
     def __init__(
         self,
         head_size: int,
@@ -20,12 +18,13 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         is_neox_style: bool,
         dtype: torch.dtype,
     ):
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_inv_freq(self, base: float) -> torch.Tensor:
         inv_freqs = super()._compute_inv_freq(base)
-        inv_freqs = inv_freqs[:(self.rotary_dim // 2)]
+        inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
         return inv_freqs
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
@@ -34,36 +33,36 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         # self.max_position_embeddings here is number of image patches
         # i.e. (image_size // patch_size) ** 2
         num_patches = self.max_position_embeddings
-        img_idx = torch.arange(num_patches,
-                    dtype=torch.int32) \
-                    .reshape(num_patches, 1)
+        img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
         img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
         img_idx[-1, -1] = -2  # set to ID_CLS_TOKEN
         num_patches_single_dim = int(math.sqrt(num_patches))
         frequencies_x = img_idx % num_patches_single_dim
         frequencies_y = img_idx // num_patches_single_dim
-        freqs_x = ((frequencies_x + 1)[..., None] *
-                   inv_freq[None, None, :]).repeat_interleave(2, dim=-1)
-        freqs_y = ((frequencies_y + 1)[..., None] *
-                   inv_freq[None, None, :]).repeat_interleave(2, dim=-1)
-        freqs = torch.cat([freqs_x, freqs_y],
-                          dim=-1).float().contiguous()[..., ::2]
+        freqs_x = (
+            (frequencies_x + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs_y = (
+            (frequencies_y + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
         freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
         cache = torch.view_as_complex(
-            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
+            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
+        )
         return cache
 
     def forward_native(  # type: ignore[override]
         self,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert key is not None
-        self._match_cos_sin_cache_dtype(query)
-        query_ = torch.view_as_complex(query.float().reshape(
-            *query.shape[:-1], -1, 2))
-        key_ = torch.view_as_complex(key.float().reshape(
-            *key.shape[:-1], -1, 2))
+        # self.cos_sin_cache here is complex tensor so we cannot cast into
+        # query's dtype directly with self._match_cos_sin_cache_dtype
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
+        query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
+        key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
         broadcast_shape = [
             d if i == 1 or i == (query_.ndim - 1) else 1
             for i, d in enumerate(query_.shape)
@@ -76,6 +75,6 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
     def forward_cuda(  # type: ignore[override]
         self,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(query, key)
diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index 9bf0d6bd15e74b04fad806f3b9ad2b8b05d821f9..0592aa8f967a645a235a3ab23c1aa38b6ed85834 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -1,16 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import itertools
-from typing import Optional, Union
 
 import numpy as np
 import torch
-from transformers import PretrainedConfig
 
 from vllm.triton_utils import tl, triton
 
-from .base import RotaryEmbedding
+from .base import RotaryEmbeddingBase
 from .common import apply_rotary_emb_dispatch
 from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale
 
@@ -62,10 +59,8 @@ def _triton_mrope_forward(
     # Updated offsets for half head_dim
     cos_offsets = tl.arange(0, pad_hd // 2)
     if is_interleaved:
-        h_mask = (((cos_offsets % 3) == 1) &
-                  (cos_offsets <= 3 * mrope_section_h))
-        w_mask = (((cos_offsets % 3) == 2) &
-                  (cos_offsets <= 3 * mrope_section_w))
+        h_mask = ((cos_offsets % 3) == 1) & (cos_offsets <= 3 * mrope_section_h)
+        w_mask = ((cos_offsets % 3) == 2) & (cos_offsets <= 3 * mrope_section_w)
         t_mask = ~(h_mask | w_mask)
     else:
         t_end = mrope_section_t
@@ -89,21 +84,25 @@ def _triton_mrope_forward(
     # program instance (i.e. for the current token) separately
     # ####################################################################
     # left half of the head
-    first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(
-        0, pad_hd // 2)[None, :]
-    first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(
-        0, pad_hd // 2)[None, :]
-    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(
-        0, pad_hd // 2)[None, :] < rd // 2)
-    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(
-        0, pad_hd // 2)[None, :] < rd // 2)
+    first_half_q_offsets = (
+        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    )
+    first_half_k_offsets = (
+        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    )
+    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
+        tl.arange(0, pad_hd // 2)[None, :] < rd // 2
+    )
+    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
+        tl.arange(0, pad_hd // 2)[None, :] < rd // 2
+    )
 
-    q_tile_1 = tl.load(q_ptr + first_half_q_offsets,
-                       mask=first_q_mask,
-                       other=0).to(sin_row.dtype)
-    k_tile_1 = tl.load(k_ptr + first_half_k_offsets,
-                       mask=first_k_mask,
-                       other=0).to(sin_row.dtype)
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
+        sin_row.dtype
+    )
 
     # right half of the head
     second_half_q_offsets = first_half_q_offsets + (rd // 2)
@@ -111,12 +110,12 @@ def _triton_mrope_forward(
     second_q_mask = first_q_mask
     second_k_mask = first_k_mask
 
-    q_tile_2 = tl.load(q_ptr + second_half_q_offsets,
-                       mask=second_q_mask,
-                       other=0).to(sin_row.dtype)
-    k_tile_2 = tl.load(k_ptr + second_half_k_offsets,
-                       mask=second_k_mask,
-                       other=0).to(sin_row.dtype)
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
+        sin_row.dtype
+    )
 
     # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
     # Since cos and sin are now half-size,
@@ -168,7 +167,7 @@ def triton_mrope(
     cos = cos.contiguous()
     sin = sin.contiguous()
 
-    _triton_mrope_forward[(n_row, )](
+    _triton_mrope_forward[(n_row,)](
         q,
         k,
         cos,
@@ -189,19 +188,18 @@ def triton_mrope(
     return q, k
 
 
-def apply_interleaved_rope(x: torch.Tensor,
-                           mrope_section: list[int]) -> torch.Tensor:
+def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.Tensor:
     """Apply interleaved MRoPE to 3D rotary embeddings.
     Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
     interleaved [THTHWHTHW...TT], preserving frequency continuity.
     """
     x_t = x[0].clone()
-    x_t[..., 1:mrope_section[1] * 3:3] = x[1, ..., 1:mrope_section[1] * 3:3]
-    x_t[..., 2:mrope_section[2] * 3:3] = x[2, ..., 2:mrope_section[2] * 3:3]
+    x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3]
+    x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3]
     return x_t
 
 
-class MRotaryEmbedding(RotaryEmbedding):
+class MRotaryEmbedding(RotaryEmbeddingBase):
     """Rotary Embedding with Multimodal Sections."""
 
     def __init__(
@@ -212,17 +210,16 @@ class MRotaryEmbedding(RotaryEmbedding):
         base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
-        mrope_section: Optional[list[int]] = None,
+        mrope_section: list[int] | None = None,
         mrope_interleaved: bool = False,
         # YaRN parameters.
         *,
-        scaling_factor: Optional[float] = None,
+        scaling_factor: float | None = None,
         extrapolation_factor: float = 1,
         attn_factor: float = 1,
         beta_fast: int = 32,
         beta_slow: int = 1,
     ) -> None:
-
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
         self.attn_factor = attn_factor
@@ -230,8 +227,7 @@ class MRotaryEmbedding(RotaryEmbedding):
         self.beta_slow = beta_slow
         if self.scaling_factor is not None:
             # Get n-d magnitude scaling corrected for interpolation
-            self.mscale = float(
-                yarn_get_mscale(self.scaling_factor) * attn_factor)
+            self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor)
         else:
             self.mscale = 1.0
 
@@ -239,8 +235,14 @@ class MRotaryEmbedding(RotaryEmbedding):
         # the input video. We enlarge max_position_embeddings to 4 times to get
         # a larger the cos and sin cache.
         self.cache_max_position_num = max_position_embeddings * 4
-        super().__init__(head_size, rotary_dim, self.cache_max_position_num,
-                         base, is_neox_style, dtype)
+        super().__init__(
+            head_size,
+            rotary_dim,
+            self.cache_max_position_num,
+            base,
+            is_neox_style,
+            dtype,
+        )
 
         self.mrope_section = mrope_section
         self.mrope_interleaved = mrope_interleaved
@@ -261,9 +263,9 @@ class MRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """PyTorch-native implementation equivalent to forward().
 
         Args:
@@ -286,31 +288,27 @@ class MRotaryEmbedding(RotaryEmbedding):
                 cos = apply_interleaved_rope(cos, self.mrope_section)
                 sin = apply_interleaved_rope(sin, self.mrope_section)
             else:
-                cos = torch.cat([
-                    m[i] for i, m in enumerate(
-                        cos.split(self.mrope_section, dim=-1))
-                ],
-                                dim=-1)
-                sin = torch.cat([
-                    m[i] for i, m in enumerate(
-                        sin.split(self.mrope_section, dim=-1))
-                ],
-                                dim=-1)
+                cos = torch.cat(
+                    [m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
+                    dim=-1,
+                )
+                sin = torch.cat(
+                    [m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
+                    dim=-1,
+                )
 
         query_shape = query.shape
         query = query.view(num_tokens, -1, self.head_size)
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin,
-                                              self.is_neox_style)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         key_shape = key.shape
         key = key.view(num_tokens, -1, self.head_size)
-        key_rot = key[..., :self.rotary_dim]
-        key_pass = key[..., self.rotary_dim:]
-        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin,
-                                            self.is_neox_style)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
@@ -318,10 +316,9 @@ class MRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert positions.ndim == 1 or positions.ndim == 2
         assert key is not None
 
@@ -348,904 +345,27 @@ class MRotaryEmbedding(RotaryEmbedding):
             return q.reshape(query_shape), k.reshape(key_shape)
 
         query = query.view(num_tokens, -1, self.head_size)
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin,
-                                              self.is_neox_style)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         key = key.view(num_tokens, -1, self.head_size)
-        key_rot = key[..., :self.rotary_dim]
-        key_pass = key[..., self.rotary_dim:]
-        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin,
-                                            self.is_neox_style)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
-    def forward_xpu(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        return self.forward_native(positions, query, key, offsets)
-
     def forward_cpu(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(positions, query, key, offsets)
 
-    @classmethod
-    def get_input_positions(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
-    ) -> tuple[list[list[int]], int]:
-        """Get mrope input positions and delta value."""
-
-        image_grid_thw = [] if image_grid_thw is None else image_grid_thw
-        video_grid_thw = [] if video_grid_thw is None else video_grid_thw
-        second_per_grid_ts = [] if second_per_grid_ts is None else \
-            second_per_grid_ts
-
-        llm_positions, mrope_position_delta = \
-            cls.get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                context_len=context_len,
-                seq_len=seq_len,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-
-        return llm_positions.tolist(), mrope_position_delta
-
-    @classmethod
-    def get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
-    ) -> tuple[torch.Tensor, int]:
-        from vllm.transformers_utils.config import thinker_uses_mrope
-        if thinker_uses_mrope(hf_config):
-            return cls._omni_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                context_len=context_len,
-                seq_len=seq_len,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-        elif hf_config.model_type in ["glm4v", "glm4v_moe"]:
-            return cls._glm4v_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        elif hf_config.model_type in ["qwen3_vl", "qwen3_vl_moe"]:
-            return cls._qwen3vl_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]:
-            return cls._ernie_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        elif "KeyeVL1_5" in hf_config.model_type:
-            return cls._keye_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        else:
-            return cls._vl_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-
-    @classmethod
-    def _glm4v_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for GLM4V."""
-
-        image_token_id = hf_config.image_token_id
-        video_start_token_id = hf_config.video_start_token_id
-        video_end_token_id = hf_config.video_end_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-        llm_pos_ids_list: list = []
-
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
-            input_token_type: list[str] = []
-            video_check_flg = False
-            for token in input_tokens:
-                if token == video_start_token_id:
-                    video_check_flg = True
-                elif token == video_end_token_id:
-                    video_check_flg = False
-
-                if (token == image_token_id) and (video_check_flg is False):
-                    input_token_type.append("image")
-                elif (token == image_token_id) and (video_check_flg is True):
-                    input_token_type.append("video")
-                else:
-                    input_token_type.append("text")
-
-            input_type_group: list[tuple[str, int, int]] = []
-            for key, group_iter in itertools.groupby(
-                    enumerate(input_token_type), lambda x: x[1]):
-                group_list = list(group_iter)
-                start_index = group_list[0][0]
-                end_index = group_list[-1][0] + 1
-                input_type_group.append((key, start_index, end_index))
-
-            video_frame_num = 1
-            mm_data_idx = 0
-            for modality_type, start_idx, end_idx in input_type_group:
-                st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                    llm_pos_ids_list) > 0 else 0
-                if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = \
-                        t, h // spatial_merge_size, w // spatial_merge_size
-
-                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
-                        -1, llm_grid_h * llm_grid_w).flatten()
-                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
-                        llm_grid_t, -1, llm_grid_w).flatten()
-                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
-                        llm_grid_t, llm_grid_h, -1).flatten()
-                    llm_pos_ids_list.append(
-                        torch.stack([t_index, h_index, w_index]) + st_idx)
-                    mm_data_idx += 1
-
-                elif modality_type == "video":
-                    t, h, w = (
-                        video_frame_num,
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = \
-                        t, h // spatial_merge_size, w // spatial_merge_size
-
-                    for t_idx in range(llm_grid_t):
-                        t_index = torch.tensor(t_idx).view(-1, 1).expand(
-                            -1, llm_grid_h * llm_grid_w).flatten()
-                        h_index = torch.arange(llm_grid_h).view(
-                            1, -1, 1).expand(1, -1, llm_grid_w).flatten()
-                        w_index = torch.arange(llm_grid_w).view(
-                            1, 1, -1).expand(1, llm_grid_h, -1).flatten()
-                        llm_pos_ids_list.append(
-                            torch.stack([t_index, h_index, w_index]) + st_idx)
-
-                    mm_data_idx += 1
-                    video_frame_num += 1
-
-                else:
-                    text_len = end_idx - start_idx
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len).view(1, -1).expand(3, -1) +
-                        st_idx)
-                    video_frame_num = 1
-
-        else:
-            text_len = len(input_tokens)
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1))
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
-        mrope_position_delta = (llm_positions.max() + 1 -
-                                len(input_tokens)).item()
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _qwen3vl_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
-
-        video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw
-                          for _ in range(t)]
-
-        image_token_id = hf_config.image_token_id
-        video_token_id = hf_config.video_token_id
-        vision_start_token_id = hf_config.vision_start_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-
-        input_tokens_tensor = torch.tensor(input_tokens)
-        vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_videos = image_nums, video_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            if image_token_id in input_tokens and remain_images > 0:
-                ed_image = input_tokens.index(image_token_id, st)
-            else:
-                ed_image = len(input_tokens) + 1
-            if video_token_id in input_tokens and remain_videos > 0:
-                ed_video = input_tokens.index(video_token_id, st)
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
-
-            llm_grid_t, llm_grid_h, llm_grid_w = \
-                t, h // spatial_merge_size, w // spatial_merge_size
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
-                -1, llm_grid_h * llm_grid_w).flatten()
-            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
-                llm_grid_t, -1, llm_grid_w).flatten()
-            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
-                llm_grid_t, llm_grid_h, -1).flatten()
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 -
-                                len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _ernie_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for Ernie VL."""
-
-        image_token_id = hf_config.im_patch_id
-        video_start_token_id = hf_config.video_start_token_id
-        video_end_token_id = hf_config.video_end_token_id
-        spatial_conv_size = hf_config.spatial_conv_size
-        temporal_conv_size = hf_config.temporal_conv_size
-        llm_pos_ids_list: list = []
-
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
-            input_token_type: list[str] = []
-            video_check_flg = False
-            for token in input_tokens:
-                if token == video_start_token_id:
-                    video_check_flg = True
-                elif token == video_end_token_id:
-                    video_check_flg = False
-
-                if (token == image_token_id) and (video_check_flg is False):
-                    input_token_type.append("image")
-                elif (token == image_token_id) and (video_check_flg is True):
-                    input_token_type.append("video")
-                else:
-                    input_token_type.append("text")
-
-            input_type_group: list[tuple[str, int, int]] = []
-            for key, group_iter in itertools.groupby(
-                    enumerate(input_token_type), lambda x: x[1]):
-                group_list = list(group_iter)
-                start_index = group_list[0][0]
-                end_index = group_list[-1][0] + 1
-                input_type_group.append((key, start_index, end_index))
-
-            video_frame_num = 1
-            mm_data_idx = 0
-            for modality_type, start_idx, end_idx in input_type_group:
-                st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                    llm_pos_ids_list) > 0 else 0
-                if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = \
-                        t, h // spatial_conv_size, w // spatial_conv_size
-
-                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
-                        -1, llm_grid_h * llm_grid_w).flatten()
-                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
-                        llm_grid_t, -1, llm_grid_w).flatten()
-                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
-                        llm_grid_t, llm_grid_h, -1).flatten()
-                    llm_pos_ids_list.append(
-                        torch.stack([t_index, h_index, w_index]) + st_idx)
-                    mm_data_idx += 1
-
-                elif modality_type == "video":
-                    t, h, w = (
-                        video_grid_thw[mm_data_idx][0],
-                        video_grid_thw[mm_data_idx][1],
-                        video_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = (t //
-                                                          temporal_conv_size,
-                                                          h //
-                                                          spatial_conv_size,
-                                                          w //
-                                                          spatial_conv_size)
-
-                    for t_idx in range(llm_grid_t):
-                        t_index = torch.tensor(t_idx).view(-1, 1).expand(
-                            -1, llm_grid_h * llm_grid_w).flatten()
-                        h_index = torch.arange(llm_grid_h).view(
-                            1, -1, 1).expand(1, -1, llm_grid_w).flatten()
-                        w_index = torch.arange(llm_grid_w).view(
-                            1, 1, -1).expand(1, llm_grid_h, -1).flatten()
-                        llm_pos_ids_list.append(
-                            torch.stack([t_index, h_index, w_index]) + st_idx)
-
-                    mm_data_idx += 1
-                    video_frame_num += 1
-
-                else:
-                    text_len = end_idx - start_idx
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len).view(1, -1).expand(3, -1) +
-                        st_idx)
-                    video_frame_num = 1
-
-        else:
-            text_len = len(input_tokens)
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1))
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
-        mrope_position_delta = (llm_positions.max() + 1 -
-                                len(input_tokens)).item()
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _keye_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
-            video_grid_thw = video_grid_thw[0]
-        """Get mrope input positions and delta value (Keye series)."""
-
-        def split_thw(
-                grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]:
-            """
-            Split grid_thw along the t dimension.
-
-            Args:
-                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].
-
-            Returns:
-                List of [1, h, w] rows, repeated t times for each original row.
-            """
-
-            if isinstance(grid_thw, list):
-                grid_thw = torch.tensor(grid_thw, dtype=torch.long)
-
-            if grid_thw.numel() == 0:
-                return []
-
-            t, hw = grid_thw[:, 0], grid_thw[:, 1:]
-            ones = torch.ones_like(hw[:, :1])  # [N,1]
-            out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0)
-            return out.tolist()
-
-        video_grid_thw = split_thw(video_grid_thw)
-
-        image_token_id = hf_config.image_token_id
-        video_token_id = hf_config.video_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-
-        image_nums = len(image_grid_thw)
-        frame_nums = len(video_grid_thw)
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_frames = image_nums, frame_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + frame_nums):
-            if remain_images > 0:
-                try:
-                    ed_image = input_tokens.index(image_token_id, st)
-                except ValueError:
-                    ed_image = len(input_tokens) + 1
-            else:
-                ed_image = len(input_tokens) + 1
-            if remain_frames > 0:
-                try:
-                    ed_video = input_tokens.index(video_token_id, st)
-                except ValueError:
-                    ed_video = len(input_tokens) + 1
-            else:
-                ed_video = len(input_tokens) + 1
-
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_index += 1
-                remain_frames -= 1
-                ed = ed_video
-
-            llm_grid_t, llm_grid_h, llm_grid_w = \
-                t, h // spatial_merge_size, w // spatial_merge_size
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
-                -1, llm_grid_h * llm_grid_w)).long().flatten()
-
-            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
-                llm_grid_t, -1, llm_grid_w).flatten()
-            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
-                llm_grid_t, llm_grid_h, -1).flatten()
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 -
-                                len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _vl_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
-
-        image_token_id = hf_config.image_token_id
-        video_token_id = hf_config.video_token_id
-        vision_start_token_id = hf_config.vision_start_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(hf_config.vision_config,
-                                    "tokens_per_second", 1.0)
-
-        input_tokens_tensor = torch.tensor(input_tokens)
-        vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_videos = image_nums, video_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            video_second_per_grid_t = 0.0
-            if remain_images > 0:
-                try:
-                    ed_image = input_tokens.index(image_token_id, st)
-                except ValueError:
-                    ed_image = len(input_tokens) + 1
-            else:
-                ed_image = len(input_tokens) + 1
-            if remain_videos > 0:
-                try:
-                    ed_video = input_tokens.index(video_token_id, st)
-                except ValueError:
-                    ed_video = len(input_tokens) + 1
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_second_per_grid_t = 1.0
-                if second_per_grid_ts:
-                    video_second_per_grid_t = second_per_grid_ts[video_index]
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
-
-            llm_grid_t, llm_grid_h, llm_grid_w = \
-                t, h // spatial_merge_size, w // spatial_merge_size
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
-                -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
-                       tokens_per_second).long().flatten()
-
-            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
-                llm_grid_t, -1, llm_grid_w).flatten()
-            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
-                llm_grid_t, llm_grid_h, -1).flatten()
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 -
-                                len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _omni_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: Optional[list[float]] = None,
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value (Qwen2.5-Omni version).
-
-        Differences from MRotaryEmbedding:
-            1. Add audio support (and related `audio_feature_lengths`).
-            2. Add `use_audio_in_video` option to read audio from video inputs.
-                In this case, audio and vision position ids will be split into
-                chunks and interleaved.
-
-        Example:
-
-            (V_i are vision position ids, A_i are audio position ids)
-
-            |V_1 ...    V_n|A_1 ...   A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
-            |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...
-        """
-
-        # TODO(fyabc): refactor and share more code with
-        #  _vl_get_input_positions_tensor.
-
-        thinker_config = hf_config.thinker_config
-        audio_token_id = thinker_config.audio_token_index
-        image_token_id = thinker_config.image_token_index
-        video_token_id = thinker_config.video_token_index
-        audio_start_token_id = thinker_config.audio_start_token_id
-        audio_end_token_id = thinker_config.audio_end_token_id
-        vision_start_token_id = thinker_config.vision_start_token_id
-        vision_end_token_id = thinker_config.vision_end_token_id
-        seconds_per_chunk = thinker_config.seconds_per_chunk
-        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(thinker_config.vision_config,
-                                    "tokens_per_second", 25)
-
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
-
-        src_item = input_tokens
-        audio_seqlens = audio_feature_lengths
-        if not second_per_grid_ts:
-            second_per_grid_ts = [1] * video_grid_thw.shape[0]
-        audio_idx = 0
-        video_idx = 0
-        image_idx = 0
-        new_src_item: list[int] = []
-        llm_pos_ids_list: list[torch.Tensor] = []
-
-        idx = 0
-        while idx < len(src_item):
-            new_src_item_len = len(new_src_item)
-            start_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
-            if src_item[idx] not in [
-                    audio_token_id, video_token_id, image_token_id
-            ]:
-                if use_audio_in_video and idx > 0:
-                    if src_item[idx] == vision_end_token_id and \
-                        src_item[idx - 1] == audio_end_token_id:
-                        # processing the <|audio_eos|> before <|vision_eos|>
-                        start_idx -= 1
-                    elif src_item[idx] == audio_start_token_id and \
-                        src_item[idx - 1] == vision_start_token_id:
-                        # processing the <|audio_bos|> after <|vision_eos|>
-                        start_idx -= 1
-                new_src_item.append(src_item[idx])
-                llm_pos_ids = torch.tensor([start_idx],
-                                           dtype=torch.long).expand(3, -1)
-                llm_pos_ids_list.append(llm_pos_ids)
-            elif src_item[idx] == audio_token_id:
-                assert audio_seqlens is not None
-                audio_seqlen = audio_seqlens[audio_idx]
-                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1)
-                new_src_item.extend([audio_token_id] * place_num)
-                llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx
-                llm_pos_ids_list.append(llm_pos_ids)
-                audio_idx += 1
-            elif src_item[idx] == image_token_id:
-                grid_t = image_grid_thw[image_idx][0]
-                grid_hs = image_grid_thw[:, 1]
-                grid_ws = image_grid_thw[:, 2]
-                t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long()
-                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    start_idx, image_idx, spatial_merge_size, t_index, grid_hs,
-                    grid_ws)
-                llm_pos_ids_list.append(llm_pos_ids)
-                vision_seqlen = image_grid_thw[image_idx].prod() // (
-                    spatial_merge_size**2)
-                new_src_item.extend([image_token_id] * vision_seqlen)
-                image_idx += 1
-            elif src_item[idx] == video_token_id and not use_audio_in_video:
-                grid_t = video_grid_thw[video_idx][0]
-                grid_hs = video_grid_thw[:, 1]
-                grid_ws = video_grid_thw[:, 2]
-                t_index = (torch.arange(grid_t) *
-                           second_per_grid_ts[video_idx] *
-                           tokens_per_second).long()
-                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    start_idx, video_idx, spatial_merge_size, t_index, grid_hs,
-                    grid_ws)
-                llm_pos_ids_list.append(llm_pos_ids)
-                vision_seqlen = video_grid_thw[video_idx].prod() // (
-                    spatial_merge_size**2)
-                new_src_item.extend([video_token_id] * vision_seqlen)
-                video_idx += 1
-            else:
-                # read audio from video
-                assert audio_seqlens is not None
-                audio_seqlen = audio_seqlens[audio_idx]
-                vision_seqlen = video_grid_thw[video_idx].prod() // (
-                    spatial_merge_size**2)
-                grid_t = video_grid_thw[video_idx][0]
-                grid_h = video_grid_thw[video_idx][1]
-                grid_w = video_grid_thw[video_idx][2]
-                grid_hs = video_grid_thw[:, 1]
-                grid_ws = video_grid_thw[:, 2]
-                t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
-                t_index = (torch.arange(grid_t) *
-                           second_per_grid_ts[video_idx] *
-                           tokens_per_second).long()
-                t_index_split_chunk = cls._split_list_into_ranges(
-                    t_index, t_ntoken_per_chunk)
-                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2
-                pure_audio_len = place_num - 2
-                added_audio_len = 0
-                audio_llm_pos_ids_list: list[torch.Tensor] = []
-                for t_chunk in t_index_split_chunk:
-                    vision_ntoken_per_chunk = len(
-                        t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
-                    new_src_item.extend([video_token_id] *
-                                        vision_ntoken_per_chunk)
-                    vision_llm_pos_ids_list = cls._get_llm_pos_ids_for_vision(
-                        start_idx, video_idx, spatial_merge_size, t_chunk,
-                        grid_hs, grid_ws).split(1, dim=1)
-                    llm_pos_ids_list.extend(vision_llm_pos_ids_list)
-                    new_src_item.extend(
-                        min(t_ntoken_per_chunk, pure_audio_len -
-                            added_audio_len) * [audio_token_id])
-                    audio_start_idx = start_idx if len(
-                        audio_llm_pos_ids_list
-                    ) == 0 else audio_llm_pos_ids_list[-1][0].item() + 1
-                    if min(t_ntoken_per_chunk,
-                           pure_audio_len - added_audio_len) > 0:
-                        audio_llm_pos_ids_list = (torch.arange(
-                            min(t_ntoken_per_chunk, pure_audio_len -
-                                added_audio_len)).expand(3, -1) +
-                                                  audio_start_idx).split(1,
-                                                                         dim=1)
-                    else:
-                        audio_llm_pos_ids_list = []
-                    added_audio_len += min(t_ntoken_per_chunk,
-                                           pure_audio_len - added_audio_len)
-                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
-                if added_audio_len < pure_audio_len:
-                    new_src_item.extend(
-                        (pure_audio_len - added_audio_len) * [audio_token_id])
-                    audio_llm_pos_ids_list = (
-                        torch.arange(pure_audio_len - added_audio_len).expand(
-                            3, -1) + llm_pos_ids_list[-1].max() + 1).split(
-                                1, dim=1)
-                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
-                audio_idx += 1
-                video_idx += 1
-            # move to the next token
-            idx += len(new_src_item) - new_src_item_len
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1)
-        mrope_position_delta = torch.cat(llm_pos_ids_list,
-                                         dim=1).max() + 1 - len(src_item)
-        llm_positions = llm_positions[:, context_len:seq_len]
-
-        return llm_positions, mrope_position_delta
-
-    @staticmethod
-    def _get_llm_pos_ids_for_vision(
-        start_idx: int,
-        vision_idx: int,
-        spatial_merge_size: int,
-        t_index: list[int],
-        grid_hs: torch.Tensor,
-        grid_ws: torch.Tensor,
-    ) -> torch.Tensor:
-        llm_pos_ids_list = []
-        llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
-        llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
-        h_index = (torch.arange(llm_grid_h).view(1, -1, 1).expand(
-            len(t_index), -1, llm_grid_w).flatten())
-        w_index = (torch.arange(llm_grid_w).view(1, 1, -1).expand(
-            len(t_index), llm_grid_h, -1).flatten())
-        t_index_tensor = torch.Tensor(t_index).to(llm_grid_h.device).view(
-            -1, 1).expand(-1, llm_grid_h * llm_grid_w).long().flatten()
-        _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index])
-        llm_pos_ids_list.append(_llm_pos_ids + start_idx)
-        llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1)
-        return llm_pos_ids
-
-    @staticmethod
-    def _split_list_into_ranges(lst: torch.Tensor,
-                                interval: int) -> list[list[int]]:
-        ranges: list[list[int]] = [[]
-                                   for _ in range((max(lst) // interval) + 1)]
-        for num in lst:
-            index = num // interval
-            ranges[index].append(num)
-        return ranges
-
     @staticmethod
     def get_next_input_positions(
         mrope_position_delta: int,
@@ -1254,68 +374,24 @@ class MRotaryEmbedding(RotaryEmbedding):
     ) -> list[list[int]]:
         return [
             list(
-                range(context_len + mrope_position_delta,
-                      seq_len + mrope_position_delta)) for _ in range(3)
+                range(
+                    context_len + mrope_position_delta, seq_len + mrope_position_delta
+                )
+            )
+            for _ in range(3)
         ]
 
     @staticmethod
-    def get_next_input_positions_tensor(out: np.ndarray, out_offset: int,
-                                        mrope_position_delta: int,
-                                        context_len: int, num_new_tokens: int):
-
-        values = np.arange(mrope_position_delta + context_len,
-                           mrope_position_delta + context_len + num_new_tokens,
-                           dtype=out.dtype)
-        out[:, out_offset:out_offset + num_new_tokens] = values
-
-    @classmethod
-    def omni_get_updates_use_audio_in_video(
-        cls,
-        thinker_config: PretrainedConfig,
-        audio_len: int,
-        video_grid_thw: Union[list[int], torch.Tensor],
-        video_second_per_grid_t: float,
-    ) -> list[int]:
-        """Get video prompt updates when `use_audio_in_video` is True.
-
-        In this case, audio and vision update ids will be split into
-        chunks and interleaved (details in `_omni_get_input_positions_tensor`).
-
-        <|video_bos|><|VIDEO|><|video_eos|> =>
-        <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|>
-        """
-
-        audio_token_id = thinker_config.audio_token_index
-        video_token_id = thinker_config.video_token_index
-        audio_start_token_id = thinker_config.audio_start_token_id
-        audio_end_token_id = thinker_config.audio_end_token_id
-        seconds_per_chunk = thinker_config.seconds_per_chunk
-        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(thinker_config.vision_config,
-                                    "tokens_per_second", 25)
-
-        grid_t = video_grid_thw[0]
-        grid_h = video_grid_thw[1]
-        grid_w = video_grid_thw[2]
-        t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
-        t_index = (torch.arange(grid_t) * video_second_per_grid_t *
-                   tokens_per_second).long()
-        t_index_split_chunk = cls._split_list_into_ranges(
-            t_index, t_ntoken_per_chunk)
-
-        updates = [audio_start_token_id]
-        added_audio_len = 0
-        for t_chunk in t_index_split_chunk:
-            vision_ntoken_per_chunk = len(t_chunk) * grid_h * grid_w // (
-                spatial_merge_size**2)
-            updates.extend([video_token_id] * vision_ntoken_per_chunk)
-
-            audio_chunk_size = min(t_ntoken_per_chunk,
-                                   audio_len - added_audio_len)
-            updates.extend(audio_chunk_size * [audio_token_id])
-            added_audio_len += audio_chunk_size
-        if added_audio_len < audio_len:
-            updates.extend((audio_len - added_audio_len) * [audio_token_id])
-        updates.extend([audio_end_token_id])
-
-        return updates
+    def get_next_input_positions_tensor(
+        out: np.ndarray,
+        out_offset: int,
+        mrope_position_delta: int,
+        context_len: int,
+        num_new_tokens: int,
+    ):
+        values = np.arange(
+            mrope_position_delta + context_len,
+            mrope_position_delta + context_len + num_new_tokens,
+            dtype=out.dtype,
+        )
+        out[:, out_offset : out_offset + num_new_tokens] = values
diff --git a/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
index 42926bad22ef6401adbd68857fcffab1a4a34d30..031a12fceba651786a9e2ce2554120be6805dede 100644
--- a/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -10,33 +9,39 @@ from .base import RotaryEmbedding
 
 class NTKScalingRotaryEmbedding(RotaryEmbedding):
     """RotaryEmbedding extended with fixed and mixed NTK scaling.
-    https://kexue.fm/archives/9706 """
-
-    def __init__(self,
-                 head_size: int,
-                 rotary_dim: int,
-                 max_position_embeddings: int,
-                 base: float,
-                 is_neox_style: bool,
-                 scaling_factor: float,
-                 dtype: torch.dtype,
-                 mixed_b: Optional[float] = None) -> None:
+    https://kexue.fm/archives/9706"""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: float,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        mixed_b: float | None = None,
+    ) -> None:
         self.scaling_factor = scaling_factor
         self.mixed_b = mixed_b
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_inv_freq(self, base: float) -> torch.Tensor:
         base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
         inv_freq = super()._compute_inv_freq(base)
 
         if self.mixed_b is None:
-            inv_freq = inv_freq / self.scaling_factor**(2 / self.rotary_dim)
+            inv_freq = inv_freq / self.scaling_factor ** (2 / self.rotary_dim)
         else:
-            a = torch.tensor(self.scaling_factor).log() / (self.rotary_dim /
-                                                           2)**self.mixed_b
-            lambda_1_m = (a * torch.arange(
-                1, self.rotary_dim // 2 + 1).float()**self.mixed_b).exp()
+            a = (
+                torch.tensor(self.scaling_factor).log()
+                / (self.rotary_dim / 2) ** self.mixed_b
+            )
+            lambda_1_m = (
+                a * torch.arange(1, self.rotary_dim // 2 + 1).float() ** self.mixed_b
+            ).exp()
             inv_freq = inv_freq / lambda_1_m
 
         return inv_freq
diff --git a/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py b/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
index 9c36d633e2a9ffe58489bc04f7e573ecec6245b2..e58c9783479bbdb5b909f9dd6ef2db608df483ca 100644
--- a/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
@@ -1,13 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
 
+from vllm.config import get_current_vllm_config
+from vllm.logger import init_logger
+
 from .common import rotate_neox
 
+logger = init_logger(__name__)
+
 
 class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
     """Phi3 family of models scaled rotary embedding.
@@ -26,8 +30,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         dtype: torch.dtype,
         short_factor: list[float],
         long_factor: list[float],
-        short_mscale: Optional[float] = None,
-        long_mscale: Optional[float] = None,
+        short_mscale: float | None = None,
+        long_mscale: float | None = None,
     ):
         super().__init__()
 
@@ -44,14 +48,29 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         self.short_factor = short_factor
         self.long_factor = long_factor
 
-        scale = self.max_position_embeddings / \
-                self.original_max_position_embeddings
+        # Force long factors if max_model_len (runtime max length) exceeds
+        # original_max_position_embeddings to prevent KV cache invalidation when
+        # sequences cross this threshold during generation
+        max_model_len = get_current_vllm_config().model_config.max_model_len
+        self.use_long_rope = max_model_len > original_max_position_embeddings
+        if self.use_long_rope:
+            logger.warning_once(
+                "Using LongRoPE scaling factors. This enables longer "
+                "contexts (%d tokens vs original %d tokens) at the cost of "
+                "some performance degradation for shorter sequences. If "
+                "this is not desired, set `max_model_len` to be at most %d.",
+                max_position_embeddings,
+                original_max_position_embeddings,
+                original_max_position_embeddings,
+            )
+
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
         if scale <= 1.0:
             scaling_factor = 1.0
         else:
             scaling_factor = math.sqrt(
-                1 + math.log(scale) /
-                math.log(self.original_max_position_embeddings))
+                1 + math.log(scale) / math.log(self.original_max_position_embeddings)
+            )
         if short_mscale is None:
             short_mscale = scaling_factor
         if long_mscale is None:
@@ -61,22 +80,32 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         self.long_mscale = long_mscale
 
         short_cache = self._compute_cos_sin_cache(
-            original_max_position_embeddings, short_factor, short_mscale)
+            original_max_position_embeddings, short_factor, short_mscale
+        )
         short_cache = short_cache.to(dtype)
 
-        long_cache = self._compute_cos_sin_cache(max_position_embeddings,
-                                                 long_factor, long_mscale)
+        long_cache = self._compute_cos_sin_cache(
+            max_position_embeddings, long_factor, long_mscale
+        )
         long_cache = long_cache.to(dtype)
 
         long_short_cache = torch.cat([short_cache, long_cache], dim=0)
-        self.register_buffer("long_short_cos_sin_cache",
-                             long_short_cache,
-                             persistent=False)
+        self.register_buffer(
+            "long_short_cos_sin_cache", long_short_cache, persistent=False
+        )
 
     def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor:
         rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
-        inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange(
-            0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)))
+        inv_freq = 1.0 / (
+            rescale_factors
+            * (
+                self.base
+                ** (
+                    torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+                    / self.rotary_dim
+                )
+            )
+        )
         return inv_freq
 
     def _compute_cos_sin_cache(
@@ -97,18 +126,19 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert key is not None
         query = query.view(*query.shape[:-1], -1, self.head_size)
         key = key.view(*key.shape[:-1], -1, self.head_size)
 
-        k = self.original_max_position_embeddings
-        long_prompt_offset = (torch.any(positions > k).float() *
-                              torch.full_like(positions, k)).long()
-        idx = (torch.add(positions, long_prompt_offset)
-               if long_prompt_offset is not None else positions)
+        if self.use_long_rope:
+            k = self.original_max_position_embeddings
+            long_prompt_offset = torch.full_like(positions, k).long()
+            idx = torch.add(positions, long_prompt_offset)
+        else:
+            idx = positions
         idx = torch.add(idx, offsets) if offsets is not None else idx
         cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
 
@@ -116,13 +146,13 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         cos = cos.repeat(1, 2).unsqueeze(-2)
         sin = sin.repeat(1, 2).unsqueeze(-2)
 
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
         query_rot = query_rot * cos + rotate_neox(query_rot) * sin
         query = torch.cat((query_rot, query_pass), dim=-1)
 
-        key_rot = key[..., :self.rotary_dim]
-        key_pass = key[..., self.rotary_dim:]
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
         key_rot = key_rot * cos + rotate_neox(key_rot) * sin
         key = torch.cat((key_rot, key_pass), dim=-1)
 
diff --git a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py
deleted file mode 100644
index da7c84cb442d86cdf044808c45f6c15ddc0c4ac9..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-import vllm.envs as envs
-from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
-
-
-def is_rocm_triton_rotary_embedding_enabled() -> bool:
-    return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER
-            and envs.VLLM_ROCM_USE_TRITON_ROPE)
-
-
-def rocm_aiter_rotary_emb_with_key_forward_triton_impl(
-    positions: torch.Tensor,
-    sin: torch.Tensor,
-    cos: torch.Tensor,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    rotate_style: int = 0,
-    is_nope_first: bool = False,
-) -> None:
-    import aiter.ops.triton.rope as ops
-    ops.rope_cached_thd_positions_2c_fwd_inplace(
-        query,
-        key,
-        cos,
-        sin,
-        positions,
-        rotate_style,
-        reuse_freqs_front_part=True,
-        nope_first=is_nope_first,
-    )
-
-
-def rocm_aiter_rotary_emb_with_key_forward_triton_fake(
-    positions: torch.Tensor,
-    sin: torch.Tensor,
-    cos: torch.Tensor,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    rotate_style: int = 0,
-    is_nope_first: bool = False,
-) -> None:
-    pass
-
-
-if is_rocm_triton_rotary_embedding_enabled():
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_rotary_emb_with_key_forward_triton",
-        op_func=rocm_aiter_rotary_emb_with_key_forward_triton_impl,
-        mutates_args=["key", "query"],
-        fake_impl=rocm_aiter_rotary_emb_with_key_forward_triton_fake,
-        dispatch_key=current_platform.dispatch_key,
-    )
-
-
-def rocm_aiter_rotary_emb(positions: torch.Tensor, query: torch.Tensor,
-                          key: torch.Tensor, cos_sin_cache: torch.Tensor,
-                          head_size: int, rotary_dim: int,
-                          is_neox_style: bool):
-    num_tokens = positions.numel()
-    cos, sin = cos_sin_cache.chunk(2, dim=-1)
-    query_shape = query.shape
-    key_shape = key.shape
-    rotate_style = 0 if is_neox_style else 1
-
-    query = query.view(num_tokens, -1, head_size)
-    key = key.view(num_tokens, -1, head_size)
-    query_ = query[..., :rotary_dim]
-    key_ = key[..., :rotary_dim]
-    positions = positions.view(*query.shape[:1])
-    torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_triton(
-        positions,
-        sin,
-        cos,
-        query_,
-        key_,
-        rotate_style,
-        False,
-    )
-    query = query.view(query_shape)
-    key = key.view(key_shape)
diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
index 851565c5667a41a2e8da62f2abe11a3b4027a164..ff46ad74b302eec022471bab1e7916aae5563256 100644
--- a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
@@ -4,8 +4,7 @@
 import torch
 
 from .base import RotaryEmbedding
-from .common import (yarn_find_correction_range, yarn_get_mscale,
-                     yarn_linear_ramp_mask)
+from .common import yarn_find_correction_range, yarn_get_mscale, yarn_linear_ramp_mask
 
 
 class YaRNScalingRotaryEmbedding(RotaryEmbedding):
@@ -28,6 +27,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         attn_factor: float = 1,
         beta_fast: int = 32,
         beta_slow: int = 1,
+        apply_yarn_scaling: bool = True,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -35,34 +35,47 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         self.beta_fast = beta_fast
         self.beta_slow = beta_slow
         # Get n-d magnitude scaling corrected for interpolation
-        self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor)
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
-                         is_neox_style, dtype)
+        self.mscale = (
+            float(yarn_get_mscale(self.scaling_factor) * attn_factor)
+            if apply_yarn_scaling
+            else float(attn_factor)
+        )
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
 
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
-        pos_freqs = self.base**(
-            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) /
-            self.rotary_dim)
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+        )
         inv_freq_extrapolation = 1.0 / pos_freqs
         inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
 
-        low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow,
-                                               self.rotary_dim, self.base,
-                                               self.max_position_embeddings)
+        low, high = yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
         # Get n-d rotational scaling corrected for extrapolation
-        inv_freq_mask = (1 - yarn_linear_ramp_mask(
-            low, high, self.rotary_dim // 2,
-            dtype=torch.float)) * self.extrapolation_factor
-        inv_freq = inv_freq_interpolation * (
-            1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
+        inv_freq_mask = (
+            1
+            - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
         return inv_freq
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
         inv_freq = self._compute_inv_freq(self.scaling_factor)
-        t = torch.arange(self.max_position_embeddings * self.scaling_factor,
-                         dtype=torch.float32)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
+        )
         freqs = torch.einsum("i,j -> ij", t, inv_freq)
-        cos = (freqs.cos() * self.mscale)
-        sin = (freqs.sin() * self.mscale)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
         cache = torch.cat((cos, sin), dim=-1)
         return cache
diff --git a/vllm/model_executor/layers/shared_fused_moe/__init__.py b/vllm/model_executor/layers/shared_fused_moe/__init__.py
deleted file mode 100644
index b87c69d3edd04d50cfedb73da028cf9677a313cd..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/shared_fused_moe/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.model_executor.layers.shared_fused_moe.shared_fused_moe import (
-    SharedFusedMoE)
-
-__all__ = ["SharedFusedMoE"]
diff --git a/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py
deleted file mode 100644
index e1e3d188d985233500e579ab7ed0d183840904e6..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
-
-import torch
-
-from vllm.distributed import tensor_model_parallel_all_reduce
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-
-
-# TODO(bnell): Add shared + fused combo function? e.g. +
-class SharedFusedMoE(FusedMoE):
-    """
-    A FusedMoE operation that also computes the results of shared experts.
-    If an all2all communicator is being used the shared expert computation
-    can be interleaved with the fused all2all dispatch communication step.
-    """
-
-    def __init__(
-        self,
-        shared_experts: torch.nn.Module,
-        use_overlapped: bool = True,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self._shared_experts = shared_experts
-        self.use_overlapped = use_overlapped
-
-    @property
-    def shared_experts(self) -> Optional[torch.nn.Module]:
-        return self._shared_experts if self.use_overlapped else None
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        router_logits: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        if not self.use_overlapped:
-            shared_out = self._shared_experts(hidden_states)
-
-            # Reduce outputs if necessary, since the MLP should
-            # have been created with reduce_results=False.
-            if (self.reduce_results and self.tp_size > 1
-                    and self.must_reduce_shared_expert_outputs()):
-                shared_out = tensor_model_parallel_all_reduce(shared_out)
-
-            fused_out = super().forward(
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
-        else:
-            shared_out, fused_out = super().forward(
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
-        return shared_out, fused_out
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 96dd58c0e4d230bbcb4cb4dc5787eaaf92314ba3..2a52d14950279950436bac4f1bd05f558c8341be 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -1,14 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility methods for model layers."""
-from typing import Callable, Optional
+
+from collections.abc import Callable
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils.platform_utils import get_cu_count
+from vllm.utils.torch_utils import direct_register_custom_op
+
+logger = init_logger(__name__)
 
 
 def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
@@ -24,8 +30,8 @@ def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
     # This will be used together with triton swiglu kernel
     shape = w.shape
     N = shape[-1]
-    first = w[..., :N // 2]
-    second = w[..., N // 2:]
+    first = w[..., : N // 2]
+    second = w[..., N // 2 :]
 
     stacked = torch.stack((first, second), dim=-1)
     w_shuffled = stacked.reshape(shape)
@@ -39,9 +45,9 @@ def get_token_bin_counts_and_mask(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     # Compute the bin counts for the tokens.
     # vocab_size + 1 for padding.
-    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
-                             dtype=torch.long,
-                             device=tokens.device)
+    bin_counts = torch.zeros(
+        (num_seqs, vocab_size + 1), dtype=torch.long, device=tokens.device
+    )
     bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
     bin_counts = bin_counts[:, :vocab_size]
     mask = bin_counts > 0
@@ -49,18 +55,21 @@ def get_token_bin_counts_and_mask(
     return bin_counts, mask
 
 
-def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
-                    output_tokens_tensor: torch.Tensor,
-                    presence_penalties: torch.Tensor,
-                    frequency_penalties: torch.Tensor,
-                    repetition_penalties: torch.Tensor) -> torch.Tensor:
+def apply_penalties(
+    logits: torch.Tensor,
+    prompt_tokens_tensor: torch.Tensor,
+    output_tokens_tensor: torch.Tensor,
+    presence_penalties: torch.Tensor,
+    frequency_penalties: torch.Tensor,
+    repetition_penalties: torch.Tensor,
+) -> torch.Tensor:
     """
     Applies penalties in place to the logits tensor
     logits : The input logits tensor of shape [num_seqs, vocab_size]
-    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts 
-        are padded to the maximum prompt length within the batch using 
-        `vocab_size` as the padding value. The value `vocab_size` is used 
-        for padding because it does not correspond to any valid token ID 
+    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts
+        are padded to the maximum prompt length within the batch using
+        `vocab_size` as the padding value. The value `vocab_size` is used
+        for padding because it does not correspond to any valid token ID
         in the vocabulary.
     output_tokens_tensor: The output tokens tensor.
     presence_penalties: The presence penalties of shape (num_seqs, )
@@ -68,15 +77,17 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
     repetition_penalties: The repetition penalties of shape (num_seqs, )
     """
     num_seqs, vocab_size = logits.shape
-    _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor,
-                                                   vocab_size, num_seqs)
+    _, prompt_mask = get_token_bin_counts_and_mask(
+        prompt_tokens_tensor, vocab_size, num_seqs
+    )
     output_bin_counts, output_mask = get_token_bin_counts_and_mask(
-        output_tokens_tensor, vocab_size, num_seqs)
+        output_tokens_tensor, vocab_size, num_seqs
+    )
 
     # Apply repetition penalties as a custom op
     from vllm._custom_ops import apply_repetition_penalties
-    apply_repetition_penalties(logits, prompt_mask, output_mask,
-                               repetition_penalties)
+
+    apply_repetition_penalties(logits, prompt_mask, output_mask, repetition_penalties)
 
     # We follow the definition in OpenAI API.
     # Refer to https://platform.openai.com/docs/api-reference/parameter-details
@@ -85,65 +96,100 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
     return logits
 
 
-def default_unquantized_gemm(layer: torch.nn.Module,
-                             x: torch.Tensor,
-                             weight: torch.Tensor,
-                             bias: Optional[torch.Tensor] = None):
+def default_unquantized_gemm(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor | None = None,
+):
     return torch.nn.functional.linear(x, weight, bias)
 
 
+def use_aiter_triton_gemm(n, m, k, dtype):
+    if (
+        not rocm_aiter_ops.is_triton_gemm_enabled()
+        # MI300's - fp8nuz=True
+        or current_platform.is_fp8_fnuz()
+        or dtype not in [torch.float16, torch.bfloat16]
+    ):
+        return False
+
+    # use hipblaslt for the larger GEMMs
+    if n > 2048 and m > 512:
+        return False
+    return (
+        (m == 5120 and k == 2880)
+        or (m == 2880 and k == 4096)
+        or (m == 128 and k == 2880)
+        or (m == 640 and k == 2880)
+        or (m == 2880 and k == 512)
+    )
+
+
 def rocm_unquantized_gemm_impl(
-        x: torch.Tensor,
-        weight: torch.Tensor,
-        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
+) -> torch.Tensor:
     from vllm.platforms.rocm import on_gfx9
+
+    n = x.numel() / x.size(-1)
+    m = weight.shape[0]
     k = weight.shape[1]
-    use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \
-                    x.dtype in [torch.float16, torch.bfloat16] \
-                    and k % 8 == 0)
+
+    if use_aiter_triton_gemm(n, m, k, x.dtype):
+        from aiter.ops.triton.gemm_a16w16 import gemm_a16w16
+
+        return gemm_a16w16(x, weight, bias)
+
+    use_skinny = (
+        envs.VLLM_ROCM_USE_SKINNY_GEMM
+        and on_gfx9()
+        and x.dtype in [torch.float16, torch.bfloat16]
+        and k % 8 == 0
+    )
 
     if use_skinny is not True:
         return torch.nn.functional.linear(x, weight, bias)
 
-    x_view = x.view(-1, x.size(-1))
-    n = x_view.shape[0]
-    m = weight.shape[0]
-    cu_count = current_platform.get_cu_count()
-
+    x_view = x.reshape(-1, x.size(-1))
     if m > 8 and 0 < n <= 4:
+        cu_count = get_cu_count()
         out = ops.wvSplitK(weight, x_view, cu_count, bias)
-        return out.view(*x.shape[:-1], weight.shape[0])
+        return out.reshape(*x.shape[:-1], weight.shape[0])
     elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None:
         out = ops.LLMM1(weight, x_view, 4)
-        return out.view(*x.shape[:-1], weight.shape[0])
+        return out.reshape(*x.shape[:-1], weight.shape[0])
     return torch.nn.functional.linear(x, weight, bias)
 
 
-def rocm_unquantized_gemm_impl_fake(
-        x: torch.Tensor,
-        weight: torch.Tensor,
-        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+def rocm_unquantized_gemm_fake(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
+) -> torch.Tensor:
     return x.new_empty((*x.shape[:-1], weight.shape[0]))
 
 
-def rocm_unquantized_gemm(layer: torch.nn.Module,
-                          x: torch.Tensor,
-                          weight: torch.Tensor,
-                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    return torch.ops.vllm.rocm_unquantized_gemm_impl(x, weight, bias)
+def rocm_unquantized_gemm(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
+    return torch.ops.vllm.rocm_unquantized_gemm(x, weight, bias)
 
 
 direct_register_custom_op(
-    op_name="rocm_unquantized_gemm_impl",
+    op_name="rocm_unquantized_gemm",
     op_func=rocm_unquantized_gemm_impl,
-    fake_impl=rocm_unquantized_gemm_impl_fake,
+    fake_impl=rocm_unquantized_gemm_fake,
 )
 
 
 def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype) -> bool:
-    return (torch._C._cpu._is_amx_tile_supported()
-            and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0
-            and n % 16 == 0)
+    return (
+        torch._C._cpu._is_amx_tile_supported()
+        and (dtype in (torch.bfloat16, torch.int8))
+        and k % 32 == 0
+        and n % 16 == 0
+    )
 
 
 def dispatch_cpu_unquantized_gemm(
@@ -158,37 +204,48 @@ def dispatch_cpu_unquantized_gemm(
             bias_f32 = layer.bias.to(torch.float32)
         else:
             bias_f32 = None
-        layer.cpu_linear = (
-            lambda x, weight, bias: torch.ops._C.weight_packed_linear(
-                x, packed_weight, bias_f32
-                if bias is not None else None, True))
-        if remove_weight:
-            layer.weight = torch.nn.Parameter(torch.empty(0),
-                                              requires_grad=False)
-    elif (ops._supports_onednn
-          and current_platform.get_cpu_architecture() == CpuArchEnum.X86):
-        origin_weight = layer.weight
+        layer.cpu_linear = lambda x, weight, bias: torch.ops._C.weight_packed_linear(
+            x, packed_weight, bias_f32 if bias is not None else None, True
+        )
         if remove_weight:
-            layer.weight = torch.nn.Parameter(torch.empty(0),
-                                              requires_grad=False)
-        handler = ops.create_onednn_mm(origin_weight.t(), 32)
-        layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm(
-            handler, x, bias)
-    else:
-        layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear(
-            x, weight, bias)
-
-
-def cpu_unquantized_gemm(layer: torch.nn.Module,
-                         x: torch.Tensor,
-                         weight: torch.Tensor,
-                         bias: Optional[torch.Tensor] = None):
+            layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
+        return
+    elif (
+        ops._supports_onednn
+        and current_platform.get_cpu_architecture() != CpuArchEnum.POWERPC
+    ):
+        try:
+            origin_weight = layer.weight
+            handler = ops.create_onednn_mm(origin_weight.t(), 32)
+            layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm(handler, x, bias)
+            if remove_weight:
+                layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
+            return
+        except RuntimeError as e:
+            logger.warning_once(
+                "Failed to create oneDNN linear, fallback to torch linear."
+                f" Exception: {e}"
+            )
+
+    # fallback case
+    layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear(
+        x, weight, bias
+    )
+
+
+def cpu_unquantized_gemm(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor | None = None,
+):
     return layer.cpu_linear(x, weight, bias)
 
 
 def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
     if current_platform.is_rocm():
-        return rocm_unquantized_gemm
+        # return rocm_unquantized_gemm
+        return default_unquantized_gemm
     elif current_platform.is_cpu():
         return cpu_unquantized_gemm
     else:
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index aa64d4e09ae180fa67dfb0e9427ba487433e231e..1abc3ad884550864679d23e74d02cf056b24c514 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -3,18 +3,23 @@
 
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter
 
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding)
+    QuantizationConfig,
+    QuantizeMethodBase,
+    method_has_implemented_embedding,
+)
 from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
 from vllm.model_executor.parameter import BasevLLMParameter
 from vllm.model_executor.utils import set_weight_attrs
@@ -26,65 +31,73 @@ DEFAULT_VOCAB_PADDING_SIZE = 64
 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         """Create weights for embedding layer."""
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=params_dtype),
-                           requires_grad=False)
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if current_platform.is_cpu():
-            from vllm.model_executor.layers.utils import (
-                dispatch_cpu_unquantized_gemm)
+            from vllm.model_executor.layers.utils import dispatch_cpu_unquantized_gemm
+
             dispatch_cpu_unquantized_gemm(layer, remove_weight=False)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
 
-    def embedding(self, layer: torch.nn.Module,
-                  input_: torch.Tensor) -> torch.Tensor:
+    def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
         return F.embedding(input_, layer.weight)
 
 
-def pad_vocab_size(vocab_size: int,
-                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
     """Pad the vocab size to the given value."""
     return ((vocab_size + pad_to - 1) // pad_to) * pad_to
 
 
 def vocab_range_from_per_partition_vocab_size(
-        per_partition_vocab_size: int,
-        rank: int,
-        offset: int = 0) -> Sequence[int]:
+    per_partition_vocab_size: int, rank: int, offset: int = 0
+) -> Sequence[int]:
     index_f = rank * per_partition_vocab_size
     index_l = index_f + per_partition_vocab_size
     return index_f + offset, index_l + offset
 
 
-def vocab_range_from_global_vocab_size(global_vocab_size: int,
-                                       rank: int,
-                                       world_size: int,
-                                       offset: int = 0) -> Sequence[int]:
+def vocab_range_from_global_vocab_size(
+    global_vocab_size: int, rank: int, world_size: int, offset: int = 0
+) -> Sequence[int]:
     per_partition_vocab_size = divide(global_vocab_size, world_size)
-    return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
-                                                     rank,
-                                                     offset=offset)
+    return vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size, rank, offset=offset
+    )
 
 
 @dataclass
 class VocabParallelEmbeddingShardIndices:
     """Indices for a shard of a vocab parallel embedding."""
+
     padded_org_vocab_start_index: int
     padded_org_vocab_end_index: int
     padded_added_vocab_start_index: int
@@ -105,13 +118,11 @@ class VocabParallelEmbeddingShardIndices:
 
     @property
     def num_org_elements_padded(self) -> int:
-        return (self.padded_org_vocab_end_index -
-                self.padded_org_vocab_start_index)
+        return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index
 
     @property
     def num_added_elements_padded(self) -> int:
-        return (self.padded_added_vocab_end_index -
-                self.padded_added_vocab_start_index)
+        return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index
 
     @property
     def num_org_vocab_padding(self) -> int:
@@ -127,17 +138,14 @@ class VocabParallelEmbeddingShardIndices:
 
     def __post_init__(self):
         # sanity checks
-        assert (self.padded_org_vocab_start_index
-                <= self.padded_org_vocab_end_index)
-        assert (self.padded_added_vocab_start_index
-                <= self.padded_added_vocab_end_index)
+        assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index
+        assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index
 
         assert self.org_vocab_start_index <= self.org_vocab_end_index
         assert self.added_vocab_start_index <= self.added_vocab_end_index
 
         assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index
-                <= self.padded_added_vocab_start_index)
+        assert self.added_vocab_start_index <= self.padded_added_vocab_start_index
         assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
         assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
 
@@ -147,20 +155,27 @@ class VocabParallelEmbeddingShardIndices:
 
 @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def get_masked_input_and_mask(
-        input_: torch.Tensor, org_vocab_start_index: int,
-        org_vocab_end_index: int, num_org_vocab_padding: int,
-        added_vocab_start_index: int,
-        added_vocab_end_index: int) -> tuple[torch.Tensor, torch.Tensor]:
+    input_: torch.Tensor,
+    org_vocab_start_index: int,
+    org_vocab_end_index: int,
+    num_org_vocab_padding: int,
+    added_vocab_start_index: int,
+    added_vocab_end_index: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
     # torch.compile will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (
-        input_ < org_vocab_end_index)
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
-        input_ < added_vocab_end_index)
-    added_offset = added_vocab_start_index - (
-        org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
-    valid_offset = (org_vocab_start_index *
-                    org_vocab_mask) + (added_offset * added_vocab_mask)
+        input_ < added_vocab_end_index
+    )
+    added_offset = (
+        added_vocab_start_index
+        - (org_vocab_end_index - org_vocab_start_index)
+        - num_org_vocab_padding
+    )
+    valid_offset = (org_vocab_start_index * org_vocab_mask) + (
+        added_offset * added_vocab_mask
+    )
     vocab_mask = org_vocab_mask | added_vocab_mask
     input_ = vocab_mask * (input_ - valid_offset)
     return input_, ~vocab_mask
@@ -206,14 +221,16 @@ class VocabParallelEmbedding(CustomOp):
         prefix: full name of the layer in the state dict
     """  # noqa: E501
 
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 params_dtype: Optional[torch.dtype] = None,
-                 org_num_embeddings: Optional[int] = None,
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        params_dtype: torch.dtype | None = None,
+        org_num_embeddings: int | None = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         # Keep the input dimensions.
@@ -223,18 +240,22 @@ class VocabParallelEmbedding(CustomOp):
         self.padding_size = padding_size
         self.org_vocab_size = org_num_embeddings or num_embeddings
         num_added_embeddings = num_embeddings - self.org_vocab_size
-        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
-                                                    self.padding_size)
+        self.org_vocab_size_padded = pad_vocab_size(
+            self.org_vocab_size, self.padding_size
+        )
         self.num_embeddings_padded = pad_vocab_size(
-            self.org_vocab_size_padded + num_added_embeddings,
-            self.padding_size)
+            self.org_vocab_size_padded + num_added_embeddings, self.padding_size
+        )
         assert self.org_vocab_size_padded <= self.num_embeddings_padded
 
-        self.shard_indices = self._get_indices(self.num_embeddings_padded,
-                                               self.org_vocab_size_padded,
-                                               self.num_embeddings,
-                                               self.org_vocab_size, tp_rank,
-                                               self.tp_size)
+        self.shard_indices = self._get_indices(
+            self.num_embeddings_padded,
+            self.org_vocab_size_padded,
+            self.num_embeddings,
+            self.org_vocab_size,
+            tp_rank,
+            self.tp_size,
+        )
         self.embedding_dim = embedding_dim
 
         quant_method = None
@@ -248,11 +269,13 @@ class VocabParallelEmbedding(CustomOp):
         # layer type like ParallelLMHead, this is not important.
         is_embedding_layer = type(self) is VocabParallelEmbedding
         quant_method_implements_embedding = method_has_implemented_embedding(
-            type(quant_method))
+            type(quant_method)
+        )
         if is_embedding_layer and not quant_method_implements_embedding:
             raise NotImplementedError(
                 f"The class {type(quant_method).__name__} must implement "
-                "the 'embedding' method, see UnquantizedEmbeddingMethod.")
+                "the 'embedding' method, see UnquantizedEmbeddingMethod."
+            )
 
         self.quant_method: QuantizeMethodBase = quant_method
 
@@ -260,58 +283,73 @@ class VocabParallelEmbedding(CustomOp):
             params_dtype = torch.get_default_dtype()
         # Divide the weight matrix along the vocabulary dimension.
         self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
-        self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
-                                                   self.tp_size)
-        assert (self.shard_indices.num_elements_padded ==
-                self.num_embeddings_per_partition)
+        self.num_embeddings_per_partition = divide(
+            self.num_embeddings_padded, self.tp_size
+        )
+        assert (
+            self.shard_indices.num_elements_padded == self.num_embeddings_per_partition
+        )
         self.num_org_embeddings_per_partition = (
-            self.shard_indices.org_vocab_end_index -
-            self.shard_indices.org_vocab_start_index)
+            self.shard_indices.org_vocab_end_index
+            - self.shard_indices.org_vocab_start_index
+        )
         self.num_added_embeddings_per_partition = (
-            self.shard_indices.added_vocab_end_index -
-            self.shard_indices.added_vocab_start_index)
-
-        self.quant_method.create_weights(self,
-                                         self.embedding_dim,
-                                         [self.num_embeddings_per_partition],
-                                         self.embedding_dim,
-                                         self.num_embeddings_padded,
-                                         params_dtype=params_dtype,
-                                         weight_loader=self.weight_loader)
+            self.shard_indices.added_vocab_end_index
+            - self.shard_indices.added_vocab_start_index
+        )
+
+        self.quant_method.create_weights(
+            self,
+            self.embedding_dim,
+            [self.num_embeddings_per_partition],
+            self.embedding_dim,
+            self.num_embeddings_padded,
+            params_dtype=params_dtype,
+            weight_loader=self.weight_loader,
+        )
 
     @classmethod
-    def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
-                     vocab_size: int, org_vocab_size: int, tp_rank: int,
-                     tp_size: int) -> VocabParallelEmbeddingShardIndices:
+    def _get_indices(
+        cls,
+        vocab_size_padded: int,
+        org_vocab_size_padded: int,
+        vocab_size: int,
+        org_vocab_size: int,
+        tp_rank: int,
+        tp_size: int,
+    ) -> VocabParallelEmbeddingShardIndices:
         """Get start and end indices for vocab parallel embedding, following the
         layout outlined in the class docstring, based on the given tp_rank and
         tp_size."""
         num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
         padded_org_vocab_start_index, padded_org_vocab_end_index = (
-            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank,
-                                               tp_size))
+            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size)
+        )
         padded_added_vocab_start_index, padded_added_vocab_end_index = (
-            vocab_range_from_global_vocab_size(num_added_embeddings_padded,
-                                               tp_rank,
-                                               tp_size,
-                                               offset=org_vocab_size))
+            vocab_range_from_global_vocab_size(
+                num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size
+            )
+        )
         # remove padding
-        org_vocab_start_index = min(padded_org_vocab_start_index,
-                                    org_vocab_size)
+        org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size)
         org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
-        added_vocab_start_index = min(padded_added_vocab_start_index,
-                                      vocab_size)
+        added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size)
         added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
         return VocabParallelEmbeddingShardIndices(
-            padded_org_vocab_start_index, padded_org_vocab_end_index,
-            padded_added_vocab_start_index, padded_added_vocab_end_index,
-            org_vocab_start_index, org_vocab_end_index,
-            added_vocab_start_index, added_vocab_end_index)
-
-    def get_sharded_to_full_mapping(self) -> Optional[list[int]]:
+            padded_org_vocab_start_index,
+            padded_org_vocab_end_index,
+            padded_added_vocab_start_index,
+            padded_added_vocab_end_index,
+            org_vocab_start_index,
+            org_vocab_end_index,
+            added_vocab_start_index,
+            added_vocab_end_index,
+        )
+
+    def get_sharded_to_full_mapping(self) -> list[int] | None:
         """Get a mapping that can be used to reindex the gathered
         logits for sampling.
-        
+
         During sampling, we gather logits from all ranks. The relationship
         of index->token_id will follow the same format as outlined in the class
         docstring. However, after the gather, we want to reindex the final
@@ -326,32 +364,49 @@ class VocabParallelEmbedding(CustomOp):
         added_embeddings: list[int] = []
         padding: list[int] = []
         for tp_rank in range(self.tp_size):
-            shard_indices = self._get_indices(self.num_embeddings_padded,
-                                              self.org_vocab_size_padded,
-                                              self.num_embeddings,
-                                              self.org_vocab_size, tp_rank,
-                                              self.tp_size)
+            shard_indices = self._get_indices(
+                self.num_embeddings_padded,
+                self.org_vocab_size_padded,
+                self.num_embeddings,
+                self.org_vocab_size,
+                tp_rank,
+                self.tp_size,
+            )
             range_start = self.num_embeddings_per_partition * tp_rank
             range_end = self.num_embeddings_per_partition * (tp_rank + 1)
             base_embeddings.extend(
-                range(range_start,
-                      range_start + shard_indices.num_org_elements))
+                range(range_start, range_start + shard_indices.num_org_elements)
+            )
             padding.extend(
-                range(range_start + shard_indices.num_org_elements,
-                      range_start + shard_indices.num_org_elements_padded))
+                range(
+                    range_start + shard_indices.num_org_elements,
+                    range_start + shard_indices.num_org_elements_padded,
+                )
+            )
             added_embeddings.extend(
                 range(
                     range_start + shard_indices.num_org_elements_padded,
-                    range_start + shard_indices.num_org_elements_padded +
-                    shard_indices.num_added_elements))
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements,
+                )
+            )
             padding.extend(
                 range(
-                    range_start + shard_indices.num_org_elements_padded +
-                    shard_indices.num_added_elements,
-                    range_start + shard_indices.num_org_elements_padded +
-                    shard_indices.num_added_elements_padded))
-            assert (range_start + shard_indices.num_org_elements_padded +
-                    shard_indices.num_added_elements_padded == range_end)
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements,
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements_padded,
+                )
+            )
+            assert (
+                range_start
+                + shard_indices.num_org_elements_padded
+                + shard_indices.num_added_elements_padded
+                == range_end
+            )
         ret = base_embeddings + added_embeddings + padding
         assert len(ret) == self.num_embeddings_padded
         return ret
@@ -385,10 +440,14 @@ class VocabParallelEmbedding(CustomOp):
         # If param packed on the same dim we are sharding on, then
         # need to adjust offsets of loaded weight by pack_factor.
         if packed_dim is not None and packed_dim == output_dim:
-            packed_factor = param.packed_factor if isinstance(
-                param, BasevLLMParameter) else param.pack_factor
-            assert loaded_weight.shape[output_dim] == (self.org_vocab_size //
-                                                       param.packed_factor)
+            packed_factor = (
+                param.packed_factor
+                if isinstance(param, BasevLLMParameter)
+                else param.pack_factor
+            )
+            assert loaded_weight.shape[output_dim] == (
+                self.org_vocab_size // param.packed_factor
+            )
             start_idx = start_idx // packed_factor
             shard_size = shard_size // packed_factor
         else:
@@ -396,23 +455,24 @@ class VocabParallelEmbedding(CustomOp):
 
         # Copy the data. Select chunk corresponding to current shard.
         loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
-        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
-        param[loaded_weight.shape[0]:].data.fill_(0)
+        param[: loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0] :].data.fill_(0)
 
     def forward_native(self, input_):
         if self.tp_size > 1:
             # Build the mask.
             masked_input, input_mask = get_masked_input_and_mask(
-                input_, self.shard_indices.org_vocab_start_index,
+                input_,
+                self.shard_indices.org_vocab_start_index,
                 self.shard_indices.org_vocab_end_index,
                 self.shard_indices.num_org_vocab_padding,
                 self.shard_indices.added_vocab_start_index,
-                self.shard_indices.added_vocab_end_index)
+                self.shard_indices.added_vocab_end_index,
+            )
         else:
             masked_input = input_
         # Get the embeddings.
-        output_parallel = self.quant_method.embedding(self,
-                                                      masked_input.long())
+        output_parallel = self.quant_method.embedding(self, masked_input.long())
         # Mask the output embedding.
         if self.tp_size > 1:
             output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
@@ -427,8 +487,8 @@ class VocabParallelEmbedding(CustomOp):
         s = f"num_embeddings={self.num_embeddings_per_partition}"
         s += f", embedding_dim={self.embedding_dim}"
         s += f", org_vocab_size={self.org_vocab_size}"
-        s += f', num_embeddings_padded={self.num_embeddings_padded}'
-        s += f', tp_size={self.tp_size}'
+        s += f", num_embeddings_padded={self.num_embeddings_padded}"
+        s += f", tp_size={self.tp_size}"
         return s
 
 
@@ -449,27 +509,38 @@ class ParallelLMHead(VocabParallelEmbedding):
         padding_size: padding size for the vocabulary.
     """
 
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 bias: bool = False,
-                 params_dtype: Optional[torch.dtype] = None,
-                 org_num_embeddings: Optional[int] = None,
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
-        super().__init__(num_embeddings, embedding_dim, params_dtype,
-                         org_num_embeddings, padding_size, quant_config,
-                         prefix)
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        bias: bool = False,
+        params_dtype: torch.dtype | None = None,
+        org_num_embeddings: int | None = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            params_dtype,
+            org_num_embeddings,
+            padding_size,
+            quant_config,
+            prefix,
+        )
         self.quant_config = quant_config
         if bias:
             self.bias = Parameter(
-                torch.empty(self.num_embeddings_per_partition,
-                            dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+                torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
         else:
             self.register_parameter("bias", None)
 
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index 138a2ff30b62276cebeeefb060071d528bdae104..301f2d00bf40450475aad2dea4751c7aa7ddd23f 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Literal, Optional
+from typing import Literal
 
 from torch import nn
 
@@ -9,18 +9,20 @@ from vllm.config import ModelConfig, VllmConfig
 from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.bitsandbytes_loader import (
-    BitsAndBytesModelLoader)
+from vllm.model_executor.model_loader.bitsandbytes_loader import BitsAndBytesModelLoader
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.dummy_loader import DummyModelLoader
 from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader
 from vllm.model_executor.model_loader.runai_streamer_loader import (
-    RunaiModelStreamerLoader)
-from vllm.model_executor.model_loader.sharded_state_loader import (
-    ShardedStateLoader)
+    RunaiModelStreamerLoader,
+)
+from vllm.model_executor.model_loader.sharded_state_loader import ShardedStateLoader
 from vllm.model_executor.model_loader.tensorizer_loader import TensorizerLoader
 from vllm.model_executor.model_loader.utils import (
-    get_architecture_class_name, get_model_architecture, get_model_cls)
+    get_architecture_class_name,
+    get_model_architecture,
+    get_model_cls,
+)
 
 logger = init_logger(__name__)
 
@@ -69,7 +71,10 @@ def register_model_loader(load_format: str):
 
     Examples:
         >>> from vllm.config.load import LoadConfig
-        >>> from vllm.model_executor.model_loader import get_model_loader, register_model_loader
+        >>> from vllm.model_executor.model_loader import (
+        ...     get_model_loader,
+        ...     register_model_loader,
+        ... )
         >>> from vllm.model_executor.model_loader.base_loader import BaseModelLoader
         >>>
         >>> @register_model_loader("my_loader")
@@ -89,14 +94,20 @@ def register_model_loader(load_format: str):
         if load_format in _LOAD_FORMAT_TO_MODEL_LOADER:
             logger.warning(
                 "Load format `%s` is already registered, and will be "
-                "overwritten by the new loader class `%s`.", load_format,
-                model_loader_cls)
+                "overwritten by the new loader class `%s`.",
+                load_format,
+                model_loader_cls,
+            )
         if not issubclass(model_loader_cls, BaseModelLoader):
-            raise ValueError("The model loader must be a subclass of "
-                             "`BaseModelLoader`.")
+            raise ValueError(
+                "The model loader must be a subclass of `BaseModelLoader`."
+            )
         _LOAD_FORMAT_TO_MODEL_LOADER[load_format] = model_loader_cls
-        logger.info("Registered model loader `%s` with load format `%s`",
-                    model_loader_cls, load_format)
+        logger.info(
+            "Registered model loader `%s` with load format `%s`",
+            model_loader_cls,
+            load_format,
+        )
         return model_loader_cls
 
     return _wrapper
@@ -110,14 +121,13 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     return _LOAD_FORMAT_TO_MODEL_LOADER[load_format](load_config)
 
 
-def get_model(*,
-              vllm_config: VllmConfig,
-              model_config: Optional[ModelConfig] = None) -> nn.Module:
+def get_model(
+    *, vllm_config: VllmConfig, model_config: ModelConfig | None = None
+) -> nn.Module:
     loader = get_model_loader(vllm_config.load_config)
     if model_config is None:
         model_config = vllm_config.model_config
-    return loader.load_model(vllm_config=vllm_config,
-                             model_config=model_config)
+    return loader.load_model(vllm_config=vllm_config, model_config=model_config)
 
 
 __all__ = [
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index ab538a3c95620a799434ded18a9779d0867f7225..94dfa478245d627607e395a086db8d8eabdd2a32 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -9,7 +9,10 @@ from vllm.config import ModelConfig, VllmConfig
 from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model,
+    process_weights_after_loading,
+)
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 logger = init_logger(__name__)
 
@@ -26,24 +29,26 @@ class BaseModelLoader(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
-        """Load weights into a model. This standalone API allows 
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
+        """Load weights into a model. This standalone API allows
         inplace weights loading for an already-initialized model"""
         raise NotImplementedError
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
+    def load_model(
+        self, vllm_config: VllmConfig, model_config: ModelConfig
+    ) -> nn.Module:
         """Load a model with the given configurations."""
         device_config = vllm_config.device_config
         load_config = vllm_config.load_config
-        load_device = device_config.device if load_config.device is None else \
-                      load_config.device
+        load_device = (
+            device_config.device if load_config.device is None else load_config.device
+        )
         target_device = torch.device(load_device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
-                model = initialize_model(vllm_config=vllm_config,
-                                         model_config=model_config)
+                model = initialize_model(
+                    vllm_config=vllm_config, model_config=model_config
+                )
 
             logger.debug("Loading weights on %s ...", load_device)
             # Quantization does not happen in `load_weights` but after it
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 4edf193b54ac50730081d780c276d09a1a30dc0f..97c7a20bc4d5a842d99d4954e16cbe48970e568f 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -6,8 +6,8 @@ import glob
 import itertools
 import math
 import os
-from collections.abc import Generator
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Generator
+from typing import Any
 
 import numpy as np
 import torch
@@ -18,38 +18,44 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
-# yapf: enable
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (LinearBase,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (ParamMapping,
-                                                    set_default_torch_dtype)
+from vllm.model_executor.model_loader.utils import ParamMapping
 from vllm.model_executor.model_loader.weight_utils import (
-    download_safetensors_index_file_from_hf, download_weights_from_hf,
-    filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
-    pt_weights_iterator, safetensors_weights_iterator)
+    download_safetensors_index_file_from_hf,
+    download_weights_from_hf,
+    filter_duplicate_safetensors_files,
+    filter_files_not_needed_for_inference,
+    pt_weights_iterator,
+    safetensors_weights_iterator,
+)
 from vllm.model_executor.models import is_pooling_model
-from vllm.model_executor.utils import (get_moe_expert_mapping,
-                                       get_packed_modules_mapping,
-                                       set_weight_attrs)
+from vllm.model_executor.utils import (
+    get_moe_expert_mapping,
+    get_packed_modules_mapping,
+    set_weight_attrs,
+)
 from vllm.platforms import current_platform
-
-# yapf conflicts with isort for this block
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 logger = init_logger(__name__)
 
 
 def is_moe_model(model: torch.nn.Module) -> bool:
     """Checks if the model contains FusedMoE layers."""
-    return bool(any(
-        isinstance(module, FusedMoE) for module in model.modules()))
+    return bool(any(isinstance(module, FusedMoE) for module in model.modules()))
 
 
 class BitsAndBytesModelLoader(BaseModelLoader):
@@ -83,7 +89,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         self,
         model_name_or_path: str,
         allowed_patterns: list[str],
-        revision: Optional[str] = None,
+        revision: str | None = None,
     ) -> tuple[str, list[str], str]:
         """Retrieve weight files. Download the files if necessary.
 
@@ -92,8 +98,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
         if is_local:
             for pattern in allowed_patterns:
-                weight_files = glob.glob(
-                    os.path.join(model_name_or_path, pattern))
+                weight_files = glob.glob(os.path.join(model_name_or_path, pattern))
                 if weight_files:
                     return model_name_or_path, weight_files, pattern
         else:
@@ -109,20 +114,24 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                         revision,
                         ignore_patterns=self.load_config.ignore_patterns,
                     )
-                    return hf_folder, glob.glob(
-                        os.path.join(hf_folder, pattern)), pattern
+                    return (
+                        hf_folder,
+                        glob.glob(os.path.join(hf_folder, pattern)),
+                        pattern,
+                    )
 
-        raise RuntimeError(
-            f"No model weights found in: `{model_name_or_path}`")
+        raise RuntimeError(f"No model weights found in: `{model_name_or_path}`")
 
-    def _prepare_weights(self, model_name_or_path: str,
-                         revision: Optional[str]) -> tuple[list[str], bool]:
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: str | None
+    ) -> tuple[list[str], bool]:
         """Prepare weight files for the model."""
 
         allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]
 
         hf_folder, hf_weights_files, matched_pattern = self._get_weight_files(
-            model_name_or_path, allowed_patterns, revision)
+            model_name_or_path, allowed_patterns, revision
+        )
 
         use_safetensors = matched_pattern == "*.safetensors"
         is_local = os.path.isdir(model_name_or_path)
@@ -141,25 +150,27 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                     revision,
                 )
             hf_weights_files = filter_duplicate_safetensors_files(
-                hf_weights_files, hf_folder, index_file)
+                hf_weights_files, hf_folder, index_file
+            )
         else:
-            hf_weights_files = filter_files_not_needed_for_inference(
-                hf_weights_files)
+            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
 
         if len(hf_weights_files) == 0:
             raise RuntimeError(
-                f"Cannot find any model weights with `{model_name_or_path}`")
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
 
         return hf_weights_files, use_safetensors
 
     def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
-
         def _maybe_pool_model(module_name: str):
             # For pool model, we need to add the prefix `model.`
             # for the weight name if possible.
-            if self.is_pool_model and self.target_modules[0]. \
-                startswith("model.") and not module_name.startswith(
-                    "model."):
+            if (
+                self.is_pool_model
+                and self.target_modules[0].startswith("model.")
+                and not module_name.startswith("model.")
+            ):
                 return "model." + module_name
 
             return module_name
@@ -186,9 +197,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
     def _get_quantized_weights_iterator(
         self,
         model_name_or_path: str,
-        revision: Optional[str],
-    ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str,
-                                                                     Any]]:
+        revision: str | None,
+    ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]:
         """Get an iterator to the model weights with bitsandbytes quantization,
         as well as the quantization state dictionary."""
 
@@ -196,37 +206,41 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         try:
             import bitsandbytes
 
-            if version.parse(
-                    bitsandbytes.__version__) < version.parse("0.46.1"):
-                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.46.1.")
+            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.46.1."
+                )
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.46.1 via "
-                              "`pip install bitsandbytes>=0.46.1` to use "
-                              "bitsandbytes quantizer.") from err
+            raise ImportError(
+                "Please install bitsandbytes>=0.46.1 via "
+                "`pip install bitsandbytes>=0.46.1` to use "
+                "bitsandbytes quantizer."
+            ) from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(
-            model_name_or_path, revision)
+            model_name_or_path, revision
+        )
 
         quant_state_dict: dict[str, Any] = {}
 
         if self.pre_quant:
             if self.load_8bit:
                 return self._quantized_8bit_generator(
-                    hf_weights_files, use_safetensors,
-                    quant_state_dict), quant_state_dict
+                    hf_weights_files, use_safetensors, quant_state_dict
+                ), quant_state_dict
             else:
                 return self._quantized_4bit_generator(
-                    hf_weights_files, use_safetensors,
-                    quant_state_dict), quant_state_dict
+                    hf_weights_files, use_safetensors, quant_state_dict
+                ), quant_state_dict
 
-        return self._unquantized_generator(hf_weights_files, use_safetensors,
-                                           quant_state_dict), quant_state_dict
+        return self._unquantized_generator(
+            hf_weights_files, use_safetensors, quant_state_dict
+        ), quant_state_dict
 
     def _is_8bit_weight_name(self, weight_name: str):
         quantized_suffix = {".scb", ".weight_format"}
-        return any(weight_name.lower().endswith(suffix)
-                   for suffix in quantized_suffix)
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
 
     def _is_4bit_weight_name(self, weight_name: str):
         quantized_suffix = {
@@ -239,12 +253,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         suffix = weight_name.split(".")[-1]
         return any(q_suffix in suffix for q_suffix in quantized_suffix)
 
-    def _quantized_8bit_generator(self, hf_weights_files, use_safetensors,
-                                  quant_state_dict) -> Generator:
+    def _quantized_8bit_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
         for (
-                org_weight_name,
-                mapped_weight_name,
-                weight_tensor,
+            org_weight_name,
+            mapped_weight_name,
+            weight_tensor,
         ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
             if not mapped_weight_name.lower().endswith(".scb"):
                 continue
@@ -253,9 +268,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             quant_state_dict[weight_key] = weight_tensor
 
         for (
-                org_weight_name,
-                mapped_weight_name,
-                weight_tensor,
+            org_weight_name,
+            mapped_weight_name,
+            weight_tensor,
         ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
             if self._is_8bit_weight_name(mapped_weight_name):
                 continue
@@ -266,18 +281,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             else:
                 yield org_weight_name, weight_tensor
 
-    def _quantized_4bit_generator(self, hf_weights_files, use_safetensors,
-                                  quant_state_dict) -> Generator:
+    def _quantized_4bit_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
         from bitsandbytes.functional import QuantState
 
         # First iterate over all quant state weights
-        weight_iterator = self._hf_weight_iter(hf_weights_files,
-                                               use_safetensors)
+        weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
         temp_state_dict = {}
         for (
-                org_weight_name,
-                mapped_weight_name,
-                weight_tensor,
+            org_weight_name,
+            mapped_weight_name,
+            weight_tensor,
         ) in weight_iterator:
             if not self._is_4bit_weight_name(mapped_weight_name):
                 continue
@@ -289,98 +304,111 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 temp_state_dict[mapped_weight_name] = weight_tensor
 
         # Closure to parse quant_state for each prequant weight
-        def _parse_quant_state(param_name: str,
-                               temp_state_dict: dict) -> QuantState:
+        def _parse_quant_state(param_name: str, temp_state_dict: dict) -> QuantState:
             quant_state = {}
             for k in temp_state_dict:
                 if param_name + "." in k:
                     quant_state[k] = temp_state_dict[k]
 
-            return QuantState.from_dict(quant_state,
-                                        device=current_platform.device_type)
+            return QuantState.from_dict(
+                quant_state, device=current_platform.device_type
+            )
 
         # Second iterate over all prequant and normal weights
         # pre quantized weights would have a quant_state
         for (
-                org_weight_name,
-                mapped_weight_name,
-                weight_tensor,
+            org_weight_name,
+            mapped_weight_name,
+            weight_tensor,
         ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
             if self._is_4bit_weight_name(mapped_weight_name):
                 continue
 
-            if (f"{mapped_weight_name}.quant_state.bitsandbytes__nf4"
-                    in temp_state_dict) or (
-                        f"{mapped_weight_name}.quant_state.bitsandbytes__fp4"
-                        in temp_state_dict):
-                quant_state = _parse_quant_state(mapped_weight_name,
-                                                 temp_state_dict)
+            if (
+                f"{mapped_weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict
+            ) or (
+                f"{mapped_weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
+            ):
+                quant_state = _parse_quant_state(mapped_weight_name, temp_state_dict)
                 quant_state_dict[mapped_weight_name] = quant_state
                 yield org_weight_name, weight_tensor
             else:
                 yield org_weight_name, weight_tensor
 
-    def _unquantized_generator(self, hf_weights_files, use_safetensors,
-                               quant_state_dict) -> Generator:
+    def _unquantized_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
         from bitsandbytes.functional import quantize_4bit
 
         global_tp_size = get_tensor_model_parallel_world_size()
         global_tp_rank = get_tensor_model_parallel_rank()
-        check_match = (lambda weight_name, module_name: weight_name.
-                       removesuffix(".weight") == module_name)
+        check_match = (
+            lambda weight_name, module_name: weight_name.removesuffix(".weight")
+            == module_name
+        )
         for (
-                org_weight_name,
-                mapped_weight_name,
-                weight_tensor,
+            org_weight_name,
+            mapped_weight_name,
+            weight_tensor,
         ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
-
             # override tp_size and tp_rank if the module has disabled TP
-            if any(tp_disabled_module in mapped_weight_name
-                   for tp_disabled_module in self.tp_disabled_modules):
+            if any(
+                tp_disabled_module in mapped_weight_name
+                for tp_disabled_module in self.tp_disabled_modules
+            ):
                 tp_size = 1
                 tp_rank = 0
             else:
                 tp_size = global_tp_size
                 tp_rank = global_tp_rank
 
-            if any(target_module in mapped_weight_name
-                   for target_module in self.target_modules
-                   ) and mapped_weight_name.endswith(".weight"):
+            if any(
+                target_module in mapped_weight_name
+                for target_module in self.target_modules
+            ) and mapped_weight_name.endswith(".weight"):
                 # Without sharding
                 if any(
-                        check_match(mapped_weight_name, module)
-                        for module in self.unsharded_weights_modules):
+                    check_match(mapped_weight_name, module)
+                    for module in self.unsharded_weights_modules
+                ):
                     weight_sub_tensor = weight_tensor
                 # Shard by column
                 elif any(
-                        check_match(mapped_weight_name, module)
-                        for module in self.column_sharded_weights_modules):
+                    check_match(mapped_weight_name, module)
+                    for module in self.column_sharded_weights_modules
+                ):
                     total_size = weight_tensor.size(-1)
                     start_index = total_size // tp_size * tp_rank
                     end_index = total_size // tp_size * (tp_rank + 1)
-                    weight_sub_tensor = weight_tensor[...,
-                                                      start_index:end_index]
+                    weight_sub_tensor = weight_tensor[..., start_index:end_index]
                 # Weights have fused on disk. In this case, we assume that the
                 # weight and module use same name.
                 elif any(
-                        check_match(mapped_weight_name, module)
-                        for module in self.maybe_fused_weights_modules):
+                    check_match(mapped_weight_name, module)
+                    for module in self.maybe_fused_weights_modules
+                ):
                     # special case for fused weights
                     # get the size of each shard weight tensor
                     total_shard_sizes = next(
-                        (sizes for module, sizes in
-                         self.maybe_fused_weights_modules.items()
-                         if check_match(mapped_weight_name, module)))
+                        (
+                            sizes
+                            for module, sizes in self.maybe_fused_weights_modules.items()  # noqa: E501
+                            if check_match(mapped_weight_name, module)
+                        )
+                    )
                     total_size = weight_tensor.size(0)
                     assert total_size == sum(total_shard_sizes)
                     # get the start/end index of each shard weight tensor
                     total_start_index = list(
-                        itertools.accumulate([0] + total_shard_sizes))[:-1]
-                    shard_weights_index = [(
-                        idx + size // tp_size * tp_rank,
-                        idx + size // tp_size * (tp_rank + 1),
-                    ) for idx, size in zip(total_start_index,
-                                           total_shard_sizes)]
+                        itertools.accumulate([0] + total_shard_sizes)
+                    )[:-1]
+                    shard_weights_index = [
+                        (
+                            idx + size // tp_size * tp_rank,
+                            idx + size // tp_size * (tp_rank + 1),
+                        )
+                        for idx, size in zip(total_start_index, total_shard_sizes)
+                    ]
                     # slice and reorder the weight tensor
                     weight_tensor = [
                         weight_tensor[start_index:end_index, ...]
@@ -392,15 +420,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                     total_size = weight_tensor.size(0)
                     start_index = total_size // tp_size * tp_rank
                     end_index = total_size // tp_size * (tp_rank + 1)
-                    weight_sub_tensor = weight_tensor[start_index:end_index,
-                                                      ...]
+                    weight_sub_tensor = weight_tensor[start_index:end_index, ...]
 
                 # bitsandbytes requires data in GPU
                 if weight_sub_tensor.is_cuda:
                     loaded_weight = weight_sub_tensor
                 else:
                     loaded_weight = weight_sub_tensor.to(
-                        device=current_platform.device_type)
+                        device=current_platform.device_type
+                    )
 
                 # remove the following after the issue is fixed:
                 # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
@@ -421,12 +449,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
     def _get_bnb_target_modules(self, model: nn.Module) -> None:
         """
-        Identify and collect all modules that support BitsAndBytes 
+        Identify and collect all modules that support BitsAndBytes
         quantization.
         """
         for name, module in model.named_modules():
-            if (isinstance(module, LinearBase)
-                    and hasattr(module.quant_method, "quant_config")):
+            if isinstance(module, LinearBase) and hasattr(
+                module.quant_method, "quant_config"
+            ):
                 if modules_info := self.modules_mapping.get_sub_modules(name):
                     # Map vllm's names to transformers's names.
                     rep_name, sub_modules = modules_info
@@ -442,45 +471,48 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 if module.disable_tp:
                     self.tp_disabled_modules.append(name)
             elif isinstance(module, FusedMoE) and hasattr(
-                    module.quant_method, "quant_config"):
+                module.quant_method, "quant_config"
+            ):
                 # TODO: support FusedMoE with prequant and 8bit.
                 if self.pre_quant and self.load_8bit:
                     raise ValueError(
                         "Prequant BitsAndBytes 8bit models with FusedMoE "
-                        "is not supported yet.")
+                        "is not supported yet."
+                    )
                 # Get the corresponding weight name using module name and
                 # expert_params_mapping.
 
                 for exp in self.expert_params_mapping:
                     weight_name = exp[1]
-                    rep_name = name.replace("experts",
-                                            "") + weight_name.removesuffix(".")
+                    rep_name = name.replace("experts", "") + weight_name.removesuffix(
+                        "."
+                    )
                     self.target_modules.append(rep_name)
 
-        assert (self.target_modules
-                ), "vLLM currently does not support BNB quantization for"
+        assert self.target_modules, (
+            "vLLM currently does not support BNB quantization for"
+        )
         f" {type(model).__name__}"
 
     def _classify_module_sharding(self, model: nn.Module):
         """
-        Categorize modules based on their weight sharding requirements 
+        Categorize modules based on their weight sharding requirements
         for tensor parallelism.
         """
         for name, module in model.named_modules():
             # Some modules like `ReplicatedLinear` should not have their weights
             # sharded. The reason for implementing it this way is to avoid new
             # static variable in the model implementation.
-            if isinstance(module, (ReplicatedLinear, )):
+            if isinstance(module, (ReplicatedLinear,)):
                 self.unsharded_weights_modules.append(name)
             # `QKVParallelLinear` and `MergedColumnParallelLinear` might have
             # fused weights on disk. We need to use the output sizes of these
             # modules to shard the weights correctly.
-            elif isinstance(module,
-                            (QKVParallelLinear, MergedColumnParallelLinear)):
+            elif isinstance(module, (QKVParallelLinear, MergedColumnParallelLinear)):
                 self.maybe_fused_weights_modules[name] = module.output_sizes
             # In TP, these weights are partitioned along the column
             # dimension (dim=-1)
-            elif isinstance(module, (RowParallelLinear, )):
+            elif isinstance(module, (RowParallelLinear,)):
                 self.column_sharded_weights_modules.append(name)
             elif isinstance(module, FusedMoE):
                 expert_mapping = self.expert_params_mapping
@@ -488,48 +520,52 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                     if exp[-1] == "w2":
                         weight_name = exp[1]
                         rep_name = name.replace(
-                            "experts", "") + weight_name.removesuffix(".")
+                            "experts", ""
+                        ) + weight_name.removesuffix(".")
                         self.column_sharded_weights_modules.append(rep_name)
 
-    def _verify_model_compatibility(self, model: nn.Module,
-                                    model_config: ModelConfig) -> None:
+    def _verify_model_compatibility(
+        self, model: nn.Module, model_config: ModelConfig
+    ) -> None:
         """
         Verify that the model is compatible with BitsAndBytes quantization.
         """
         if not hasattr(model, "load_weights"):
             raise AttributeError(
                 "The required method 'load_weights' is not defined in class"
-                f" {type(model).__name__}.")
+                f" {type(model).__name__}."
+            )
 
         if not hasattr(model, "packed_modules_mapping"):
             raise AttributeError(
                 f"Model {type(model).__name__} does not support BitsAndBytes "
-                "quantization yet. No 'packed_modules_mapping' found.")
+                "quantization yet. No 'packed_modules_mapping' found."
+            )
 
-        quant_config = getattr(model_config.hf_config, "quantization_config",
-                               None)
-        if quant_config is not None:
-            quant_method = quant_config.get("quant_method")
+        quant_config = getattr(model_config.hf_config, "quantization_config", None)
+        if quant_config and (quant_method := quant_config.get("quant_method")):
             if quant_method == "bitsandbytes":
                 self.pre_quant = True
             else:
                 raise ValueError(
-                    f"BitsAndBytes loader does not support {quant_method} "
-                    "quantization")
+                    f"BitsAndBytes loader does not support {quant_method} quantization"
+                )
 
         # The quant_states in pre_quantized models cannot work with a split
         # weight tensor. So TP does not work with pre_quantized bnb models.
         if self.pre_quant and get_tensor_model_parallel_world_size() > 1:
             raise ValueError(
                 "Prequant BitsAndBytes models with tensor parallelism is not "
-                "supported. Please try with pipeline parallelism.")
-        if self.pre_quant:
+                "supported. Please try with pipeline parallelism."
+            )
+        if quant_config and self.pre_quant:
             self.load_8bit = quant_config.get("load_in_8bit", False)
 
-    def _initialize_loader_state(self, model: nn.Module,
-                                 model_config: ModelConfig) -> None:
+    def _initialize_loader_state(
+        self, model: nn.Module, model_config: ModelConfig
+    ) -> None:
         """
-        Initialize the loader's internal state based on the model and 
+        Initialize the loader's internal state based on the model and
         configuration.
         """
         self.is_pool_model = is_pooling_model(model)
@@ -541,7 +577,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 raise AttributeError(
                     f"MoE Model {type(model).__name__} does not support "
                     "BitsAndBytes quantization yet. Ensure this model has "
-                    "'get_expert_mapping' method.")
+                    "'get_expert_mapping' method."
+                )
         # For some models like Molmo, we need to use hf_to_vllm_mapper
         # to ensure correct loading of weights.
         if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
@@ -552,22 +589,20 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
     def _dequantize_dq(self, quant_states: Any):
         """
-        When BNB employs Double Quantization, we perform the dequantization of 
-        these constants during weight loading rather than at inference time, 
-        thereby avoiding this computational overhead during inference. This 
+        When BNB employs Double Quantization, we perform the dequantization of
+        these constants during weight loading rather than at inference time,
+        thereby avoiding this computational overhead during inference. This
         comes at the cost of increased memory usage.
         """
         from bitsandbytes.functional import QuantState, dequantize_blockwise
 
         def _dequantize_single_state(quant_state):
             """Helper function to dequantize a single QuantState object."""
-            if not (isinstance(quant_state, QuantState)
-                    and quant_state.nested):
+            if not (isinstance(quant_state, QuantState) and quant_state.nested):
                 return
 
             # Copied from: https://github.com/bitsandbytes-foundation/bitsandbytes/blob/0.45.3/bitsandbytes/functional.py#L1352-#L1356
-            absmax = dequantize_blockwise(quant_state.absmax,
-                                          quant_state.state2)
+            absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
             absmax += quant_state.offset
 
             # Ensure float32 dtype
@@ -586,10 +621,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             _dequantize_single_state(quant_states)
         return quant_states
 
-    def _fuse_moe_quant_states(self, model: nn.Module,
-                               quant_states_dict: dict) -> dict:
+    def _fuse_moe_quant_states(self, model: nn.Module, quant_states_dict: dict) -> dict:
         """
-        
+
         This function consolidates individual expert quantization states into
         fused representations for w13 and w2.
         """
@@ -609,12 +643,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             for exp in expert_mapping:
                 shard_id = exp[-1]
                 if shard_id not in ("w1", "w2", "w3"):
-                    raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
-                                     f"got {shard_id}.")
+                    raise ValueError(
+                        f"shard_id must be ['w1','w2','w3'] but got {shard_id}."
+                    )
                 layer_prefix = name.split("experts")[0]
                 weight_qual_name = layer_prefix + exp[1] + "weight"
-                quant_state = self._dequantize_dq(
-                    quant_states_dict[weight_qual_name])
+                quant_state = self._dequantize_dq(quant_states_dict[weight_qual_name])
                 if shard_id == "w1":
                     w1_states_lst.append(quant_state)
                 elif shard_id == "w2":
@@ -622,14 +656,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 else:
                     w3_states_lst.append(quant_state)
                 del quant_states_dict[weight_qual_name]
-            assert (len(w1_states_lst) == len(w2_states_lst) ==
-                    len(w3_states_lst))
+            assert len(w1_states_lst) == len(w2_states_lst) == len(w3_states_lst)
             w13_absmax_lst = []
             w2_absmax_lst = []
             w13_total_dim0 = 0
             w2_total_dim0 = 0
-            for w1_qs, w2_qs, w3_qs in zip(w1_states_lst, w2_states_lst,
-                                           w3_states_lst):
+            for w1_qs, w2_qs, w3_qs in zip(w1_states_lst, w2_states_lst, w3_states_lst):
                 assert w1_qs.shape == w3_qs.shape
                 assert w1_qs.blocksize == w2_qs.blocksize == w3_qs.blocksize
                 assert w1_qs.dtype == w2_qs.dtype == w3_qs.dtype
@@ -669,12 +701,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         return expert_qs_dict
 
     def _stack_quantization_states(
-            self, model: nn.Module,
-            quant_state_dict: dict) -> dict[str, dict[int, Any]]:
+        self, model: nn.Module, quant_state_dict: dict
+    ) -> dict[str, dict[int, Any]]:
         stacked_quant_state_dict: dict[str, dict[int, Any]] = {}
         # TODO: Change this lazy import to normal import
         # after the checks are updated to run on a new version
         from vllm.model_executor.models.utils import is_pp_missing_parameter
+
         param_dict = dict(model.named_parameters())
         for quant_param_name in quant_state_dict:
             if is_pp_missing_parameter(quant_param_name, model):
@@ -684,23 +717,23 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
             shard_index = 0
             for shard_name, (
-                    weight_name,
-                    index,
+                weight_name,
+                index,
             ) in self.modules_mapping.inverse_packed_mapping.items():
                 # Some models, such as MiniCPM V2.5/2.6, contain both
                 # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
                 # from being incorrectly identified as being present in
                 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                 shard_pos = quant_param_name.find(shard_name)
-                can_correct_rename = (shard_pos
-                                      > 0) and (quant_param_name[shard_pos - 1]
-                                                == ".")
+                can_correct_rename = (shard_pos > 0) and (
+                    quant_param_name[shard_pos - 1] == "."
+                )
                 # If the quant_param_name is packed, it won't occur in the
                 # param_dict before renaming.
-                new_quant_param_name = quant_param_name.replace(
-                    shard_name, weight_name)
-                need_rename = (quant_param_name not in param_dict) \
-                              and (new_quant_param_name in param_dict)
+                new_quant_param_name = quant_param_name.replace(shard_name, weight_name)
+                need_rename = (quant_param_name not in param_dict) and (
+                    new_quant_param_name in param_dict
+                )
                 if can_correct_rename and need_rename:
                     shard_index = index
                     quant_param_name = new_quant_param_name
@@ -714,12 +747,14 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             if quant_param_name not in stacked_quant_state_dict:
                 stacked_quant_state_dict[quant_param_name] = {}
 
-            stacked_quant_state_dict[quant_param_name][shard_index] = (
-                quant_state_dict[non_stacked_param_name])
+            stacked_quant_state_dict[quant_param_name][shard_index] = quant_state_dict[
+                non_stacked_param_name
+            ]
         return stacked_quant_state_dict
 
-    def _bind_quant_states_to_params(self, model: nn.Module,
-                                     stacked_quant_state_dict: dict) -> None:
+    def _bind_quant_states_to_params(
+        self, model: nn.Module, stacked_quant_state_dict: dict
+    ) -> None:
         # save quant_states and offsets as the attributes of the parameters
         param_dict = dict(model.named_parameters())
         for param_name, param in param_dict.items():
@@ -733,13 +768,11 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
                 pack_ratio = getattr(param, "pack_factor", -1)
                 if pack_ratio == -1:
-                    raise ValueError(
-                        f"pack_factor not set for parameter {param_name}.")
+                    raise ValueError(f"pack_factor not set for parameter {param_name}.")
 
                 num_elements = [0] * len(quant_states)
                 for seq, quant_state in quant_states.items():
-                    num_elements[seq] = (math.prod(quant_state.shape) //
-                                         pack_ratio)
+                    num_elements[seq] = math.prod(quant_state.shape) // pack_ratio
 
                 offsets = np.concatenate(([0], np.cumsum(num_elements)))
                 # Make torch infer_schema happy
@@ -748,38 +781,39 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
                 if self.load_8bit:
                     set_weight_attrs(
-                        param, {"matmul_state": [None] * len(quant_states)})
-
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+                        param, {"matmul_state": [None] * len(quant_states)}
+                    )
 
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         self._verify_model_compatibility(model, model_config)
         self._initialize_loader_state(model, model_config)
 
-        logger.info("Loading weights with BitsAndBytes quantization. "
-                    "May take a while ...")
-        qweight_iterator, quant_state_dict = (
-            self._get_quantized_weights_iterator(
-                model_config.model,
-                model_config.revision,
-            ))
+        logger.info(
+            "Loading weights with BitsAndBytes quantization. May take a while ..."
+        )
+        qweight_iterator, quant_state_dict = self._get_quantized_weights_iterator(
+            model_config.model,
+            model_config.revision,
+        )
         weights_to_load = {name for name, _ in model.named_parameters()}
         loaded_weights = model.load_weights(qweight_iterator)
         # Some models may have weights loading tracker unimplemented.
         if loaded_weights is not None:
             weights_not_loaded = weights_to_load - loaded_weights
             if weights_not_loaded:
-                raise ValueError("Following weights were not initialized from "
-                                 f"checkpoint: {weights_not_loaded}")
-        expert_quant_state_dict = self._fuse_moe_quant_states(
-            model, quant_state_dict)
+                raise ValueError(
+                    "Following weights were not initialized from "
+                    f"checkpoint: {weights_not_loaded}"
+                )
+        expert_quant_state_dict = self._fuse_moe_quant_states(model, quant_state_dict)
 
         stacked_quant_state_dict = self._stack_quantization_states(
-            model, quant_state_dict)
+            model, quant_state_dict
+        )
 
         stacked_quant_state_dict = {
             **expert_quant_state_dict,
-            **stacked_quant_state_dict
+            **stacked_quant_state_dict,
         }
         self._bind_quant_states_to_params(model, stacked_quant_state_dict)
         torch.cuda.empty_cache()
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 4b7bcd37d4bc2ce8429895d4f2f881c517f57f8b..c06ac550a94aed46ac96c0bc6f0eec75d460e82c 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -5,7 +5,7 @@ import glob
 import os
 import time
 from collections.abc import Generator, Iterable
-from typing import Optional, cast
+from typing import cast
 
 import torch
 from torch import nn
@@ -14,14 +14,21 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.torchao import torchao_version_at_least
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
-    download_safetensors_index_file_from_hf, download_weights_from_hf,
-    fastsafetensors_weights_iterator, filter_duplicate_safetensors_files,
-    filter_files_not_needed_for_inference, maybe_download_from_modelscope,
+    download_safetensors_index_file_from_hf,
+    download_weights_from_hf,
+    fastsafetensors_weights_iterator,
+    filter_duplicate_safetensors_files,
+    filter_files_not_needed_for_inference,
+    maybe_download_from_modelscope,
     multi_thread_pt_weights_iterator,
-    multi_thread_safetensors_weights_iterator, np_cache_weights_iterator,
-    pt_weights_iterator, safetensors_weights_iterator)
+    multi_thread_safetensors_weights_iterator,
+    np_cache_weights_iterator,
+    pt_weights_iterator,
+    safetensors_weights_iterator,
+)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -40,7 +47,7 @@ class DefaultModelLoader(BaseModelLoader):
         model_or_path: str
         """The model ID or path."""
 
-        revision: Optional[str]
+        revision: str | None
         """The optional model revision."""
 
         prefix: str = ""
@@ -49,7 +56,7 @@ class DefaultModelLoader(BaseModelLoader):
         fall_back_to_pt: bool = True
         """Whether .pt weights can be used."""
 
-        allow_patterns_overrides: Optional[list[str]] = None
+        allow_patterns_overrides: list[str] | None = None
         """If defined, weights will load exclusively using these patterns."""
 
     counter_before_loading_weights: float = 0.0
@@ -63,22 +70,26 @@ class DefaultModelLoader(BaseModelLoader):
         unexpected_keys = set(extra_config.keys()) - allowed_keys
 
         if unexpected_keys:
-            raise ValueError(f"Unexpected extra config keys for load format "
-                             f"{load_config.load_format}: "
-                             f"{unexpected_keys}")
+            raise ValueError(
+                f"Unexpected extra config keys for load format "
+                f"{load_config.load_format}: "
+                f"{unexpected_keys}"
+            )
 
     def _prepare_weights(
         self,
         model_name_or_path: str,
-        revision: Optional[str],
+        revision: str | None,
         fall_back_to_pt: bool,
-        allow_patterns_overrides: Optional[list[str]],
+        allow_patterns_overrides: list[str] | None,
     ) -> tuple[str, list[str], bool]:
         """Prepare weights for the model.
 
         If the model is not local, it will be downloaded."""
-        model_name_or_path = (maybe_download_from_modelscope(
-            model_name_or_path, revision) or model_name_or_path)
+        model_name_or_path = (
+            maybe_download_from_modelscope(model_name_or_path, revision)
+            or model_name_or_path
+        )
 
         is_local = os.path.isdir(model_name_or_path)
         load_format = self.load_config.load_format
@@ -87,8 +98,7 @@ class DefaultModelLoader(BaseModelLoader):
         # Some quantized models use .pt files for storing the weights.
         if load_format == "auto":
             allow_patterns = ["*.safetensors", "*.bin"]
-        elif (load_format == "safetensors"
-              or load_format == "fastsafetensors"):
+        elif load_format == "safetensors" or load_format == "fastsafetensors":
             use_safetensors = True
             allow_patterns = ["*.safetensors"]
         elif load_format == "mistral":
@@ -141,25 +151,29 @@ class DefaultModelLoader(BaseModelLoader):
                     revision,
                 )
             hf_weights_files = filter_duplicate_safetensors_files(
-                hf_weights_files, hf_folder, index_file)
+                hf_weights_files, hf_folder, index_file
+            )
         else:
-            hf_weights_files = filter_files_not_needed_for_inference(
-                hf_weights_files)
+            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
 
         if len(hf_weights_files) == 0:
             raise RuntimeError(
-                f"Cannot find any model weights with `{model_name_or_path}`")
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
 
         return hf_folder, hf_weights_files, use_safetensors
 
     def _get_weights_iterator(
-            self, source: "Source"
+        self, source: "Source"
     ) -> Generator[tuple[str, torch.Tensor], None, None]:
         """Get an iterator for the model weights based on the load format."""
         extra_config = self.load_config.model_loader_extra_config
         hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
-            source.model_or_path, source.revision, source.fall_back_to_pt,
-            source.allow_patterns_overrides)
+            source.model_or_path,
+            source.revision,
+            source.fall_back_to_pt,
+            source.allow_patterns_overrides,
+        )
         if self.load_config.load_format == "npcache":
             # Currently np_cache only support *.bin checkpoints
             assert use_safetensors is False
@@ -178,13 +192,13 @@ class DefaultModelLoader(BaseModelLoader):
                 )
             else:
                 if extra_config.get("enable_multithread_load"):
-                    weights_iterator = (
-                        multi_thread_safetensors_weights_iterator(
-                            hf_weights_files,
-                            self.load_config.use_tqdm_on_load,
-                            max_workers=extra_config.get(
-                                "num_threads", self.DEFAULT_NUM_THREADS),
-                        ))
+                    weights_iterator = multi_thread_safetensors_weights_iterator(
+                        hf_weights_files,
+                        self.load_config.use_tqdm_on_load,
+                        max_workers=extra_config.get(
+                            "num_threads", self.DEFAULT_NUM_THREADS
+                        ),
+                    )
                 else:
                     weights_iterator = safetensors_weights_iterator(
                         hf_weights_files,
@@ -197,8 +211,9 @@ class DefaultModelLoader(BaseModelLoader):
                     hf_weights_files,
                     self.load_config.use_tqdm_on_load,
                     self.load_config.pt_load_map_location,
-                    max_workers=extra_config.get("num_threads",
-                                                 self.DEFAULT_NUM_THREADS),
+                    max_workers=extra_config.get(
+                        "num_threads", self.DEFAULT_NUM_THREADS
+                    ),
                 )
             else:
                 weights_iterator = pt_weights_iterator(
@@ -208,9 +223,9 @@ class DefaultModelLoader(BaseModelLoader):
                 )
 
         if current_platform.is_tpu():
-            from vllm.platforms.tpu import USE_TPU_COMMONS
+            from vllm.platforms.tpu import USE_TPU_INFERENCE
 
-            if not USE_TPU_COMMONS:
+            if not USE_TPU_INFERENCE:
                 # In PyTorch XLA, we should call `torch_xla.sync`
                 # frequently so that not too many ops are accumulated
                 # in the XLA program.
@@ -226,8 +241,7 @@ class DefaultModelLoader(BaseModelLoader):
         if self.counter_before_loading_weights == 0.0:
             self.counter_before_loading_weights = time.perf_counter()
         # Apply the prefix.
-        return ((source.prefix + name, tensor)
-                for (name, tensor) in weights_iterator)
+        return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator)
 
     def get_all_weights(
         self,
@@ -238,10 +252,8 @@ class DefaultModelLoader(BaseModelLoader):
             model_config.model,
             model_config.revision,
             prefix="",
-            fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load",
-                                    True),
-            allow_patterns_overrides=getattr(model, "allow_patterns_overrides",
-                                             None),
+            fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
+            allow_patterns_overrides=getattr(model, "allow_patterns_overrides", None),
         )
         yield from self._get_weights_iterator(primary_weights)
 
@@ -253,25 +265,63 @@ class DefaultModelLoader(BaseModelLoader):
             yield from self._get_weights_iterator(source)
 
     def download_model(self, model_config: ModelConfig) -> None:
-        self._prepare_weights(model_config.model,
-                              model_config.revision,
-                              fall_back_to_pt=True,
-                              allow_patterns_overrides=None)
+        self._prepare_weights(
+            model_config.model,
+            model_config.revision,
+            fall_back_to_pt=True,
+            allow_patterns_overrides=None,
+        )
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
+        if model_config.quantization == "torchao" and torchao_version_at_least(
+            "0.14.0"
+        ):
+            self.load_config.safetensors_load_strategy = "torchao"
         weights_to_load = {name for name, _ in model.named_parameters()}
-        loaded_weights = model.load_weights(
-            self.get_all_weights(model_config, model))
+
+        # if we don't have `model.weight_metadata_and_attr_saved` defined and
+        # set to True, it means that this is either offline quantization case
+        # or the first run of online quantization
+        # see online_quantization.py for detailed notes
+        offline_quantization_or_first_run_of_online_quantization = not getattr(
+            model, "weight_metadata_and_attr_saved", False
+        )
+
+        if model_config.quantization is None:
+            # model is not quantized
+            loaded_weights = model.load_weights(
+                self.get_all_weights(model_config, model)
+            )
+        elif offline_quantization_or_first_run_of_online_quantization:
+            # case 1: offline quantized checkpoint
+            # case 2: Step I1 first run of weight loading with
+            # online quantization
+            # see online_quantization.py for detailed notes
+            loaded_weights = model.load_weights(
+                self.get_all_weights(model_config, model)
+            )
+        else:
+            # to avoid circular dependency
+            from vllm.model_executor.model_loader.online_quantization import (
+                load_weights_and_online_quantize,
+            )
+
+            # subsequent runs of weight loading with online
+            # quantization
+            loaded_weights = load_weights_and_online_quantize(self, model, model_config)
+
         self.counter_after_loading_weights = time.perf_counter()
-        logger.info(
+        logger.info_once(
             "Loading weights took %.2f seconds",
-            self.counter_after_loading_weights -
-            self.counter_before_loading_weights)
+            self.counter_after_loading_weights - self.counter_before_loading_weights,
+            scope="local",
+        )
         # We only enable strict check for non-quantized models
         # that have loaded weights tracking currently.
         if model_config.quantization is None and loaded_weights is not None:
             weights_not_loaded = weights_to_load - loaded_weights
             if weights_not_loaded:
-                raise ValueError("Following weights were not initialized from "
-                                 f"checkpoint: {weights_not_loaded}")
+                raise ValueError(
+                    "Following weights were not initialized from "
+                    f"checkpoint: {weights_not_loaded}"
+                )
diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py
index 5b8c6268f64efaed4acaa564cc86b3f5b9cf286f..b2a934ce59497e58235005dfc9bed5a60d794ecc 100644
--- a/vllm/model_executor/model_loader/dummy_loader.py
+++ b/vllm/model_executor/model_loader/dummy_loader.py
@@ -5,8 +5,7 @@ import torch.nn as nn
 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.weight_utils import (
-    initialize_dummy_weights)
+from vllm.model_executor.model_loader.weight_utils import initialize_dummy_weights
 
 
 class DummyModelLoader(BaseModelLoader):
@@ -15,14 +14,15 @@ class DummyModelLoader(BaseModelLoader):
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
         if load_config.model_loader_extra_config:
-            raise ValueError(f"Model loader extra config is not supported for "
-                             f"load format {load_config.load_format}")
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
 
     def download_model(self, model_config: ModelConfig) -> None:
         pass  # Nothing to download
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         # NOTE(woosuk): For accurate performance evaluation, we assign
         # random values to the weights.
         initialize_dummy_weights(model)
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index aaee8f3f7635387c83093999e48137ca1df4564c..7db1fc167c4fa61e637d55fba7651ca698a60922 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -13,10 +13,15 @@ from vllm.config import ModelConfig, VllmConfig
 from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model,
+    process_weights_after_loading,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    get_gguf_extra_tensor_names, get_gguf_weight_type_map,
-    gguf_quant_weights_iterator)
+    get_gguf_extra_tensor_names,
+    get_gguf_weight_type_map,
+    gguf_quant_weights_iterator,
+)
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 
 class GGUFModelLoader(BaseModelLoader):
@@ -29,15 +34,18 @@ class GGUFModelLoader(BaseModelLoader):
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
         if load_config.model_loader_extra_config:
-            raise ValueError(f"Model loader extra config is not supported for "
-                             f"load format {load_config.load_format}")
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
 
     def _prepare_weights(self, model_name_or_path: str):
         if os.path.isfile(model_name_or_path):
             return model_name_or_path
         # for raw HTTPS link
         if model_name_or_path.startswith(
-            ("http://", "https://")) and model_name_or_path.endswith(".gguf"):
+            ("http://", "https://")
+        ) and model_name_or_path.endswith(".gguf"):
             return hf_hub_download(url=model_name_or_path)
         # repo id/filename.gguf
         if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
@@ -46,7 +54,8 @@ class GGUFModelLoader(BaseModelLoader):
         else:
             raise ValueError(
                 f"Unrecognised GGUF reference: {model_name_or_path} "
-                "(expected local file, raw URL, or <repo_id>/<filename>.gguf)")
+                "(expected local file, raw URL, or <repo_id>/<filename>.gguf)"
+            )
 
     def _get_gguf_weights_map(self, model_config: ModelConfig):
         """
@@ -63,30 +72,41 @@ class GGUFModelLoader(BaseModelLoader):
         # hack: ggufs have a different name than transformers
         if model_type == "cohere":
             model_type = "command-r"
+        if model_type == "gemma3_text":
+            # Gemma3 models use "gemma3_text" in HuggingFace but
+            # "gemma3" in GGUF architecture naming
+            model_type = "gemma3"
         if model_type in ("deepseek_v3", "deepseek_v2"):
             model_type = "deepseek2"
             # GGUF layer map assumes that we will have a merged expert weights
             # so we need to map them manually
             for idx in range(config.num_hidden_layers):
-                gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = \
-                        f"model.layers.{idx}.mlp.gate.e_score_correction_bias"
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = \
-                        f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = \
-                        f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \
-                        f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
+                gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = (
+                    f"model.layers.{idx}.mlp.gate.e_score_correction_bias"
+                )
+                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
+                    f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
+                )
+                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
+                    f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
+                )
+                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
+                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
+                )
         if model_type in ("qwen2_moe", "qwen3_moe"):
             model_type = model_type.replace("_", "")
             # GGUF layer map assumes that we will have a merged expert weights
             # so we need to map them manually
             for idx in range(config.num_hidden_layers):
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = \
-                        f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = \
-                        f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \
-                        f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
+                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
+                    f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
+                )
+                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
+                    f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
+                )
+                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
+                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
+                )
 
         arch = None
         for key, value in gguf.MODEL_ARCH_NAMES.items():
@@ -99,7 +119,8 @@ class GGUFModelLoader(BaseModelLoader):
         name_map = gguf.get_tensor_name_map(arch, num_layers)
         with torch.device("meta"):
             dummy_model = AutoModelForCausalLM.from_config(
-                config, trust_remote_code=model_config.trust_remote_code)
+                config, trust_remote_code=model_config.trust_remote_code
+            )
         state_dict = dummy_model.state_dict()
 
         for hf_name in state_dict:
@@ -111,31 +132,31 @@ class GGUFModelLoader(BaseModelLoader):
     def _get_weights_iterator(
         self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str]
     ) -> Generator[tuple[str, torch.Tensor], None, None]:
-        return gguf_quant_weights_iterator(model_name_or_path,
-                                           gguf_to_hf_name_map)
+        return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)
 
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model)
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         local_model_path = self._prepare_weights(model_config.model)
         gguf_weights_map = self._get_gguf_weights_map(model_config)
         model.load_weights(
-            self._get_weights_iterator(local_model_path, gguf_weights_map))
+            self._get_weights_iterator(local_model_path, gguf_weights_map)
+        )
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
+    def load_model(
+        self, vllm_config: VllmConfig, model_config: ModelConfig
+    ) -> nn.Module:
         device_config = vllm_config.device_config
         local_model_path = self._prepare_weights(model_config.model)
         gguf_weights_map = self._get_gguf_weights_map(model_config)
         # we can only know if tie word embeddings after mapping weights
         if "lm_head.weight" in get_gguf_extra_tensor_names(
-                local_model_path, gguf_weights_map):
+            local_model_path, gguf_weights_map
+        ):
             model_config.hf_config.update({"tie_word_embeddings": True})
 
-        weight_type_map = get_gguf_weight_type_map(model_config.model,
-                                                   gguf_weights_map)
+        weight_type_map = get_gguf_weight_type_map(model_config.model, gguf_weights_map)
 
         # filter out unquantized modules to skip
         unquant_names = [
diff --git a/vllm/model_executor/model_loader/online_quantization.py b/vllm/model_executor/model_loader/online_quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..890dd7231a0e114cbdb08223ff7bc802ca44ecb0
--- /dev/null
+++ b/vllm/model_executor/model_loader/online_quantization.py
@@ -0,0 +1,224 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import types
+
+import torch
+from torch import nn
+
+from vllm.config import ModelConfig
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
+from vllm.model_executor.model_loader.utils import process_weights_after_loading
+
+logger = init_logger(__name__)
+
+# Notes for Online Quantization
+# In terms of state of checkpoints, quantization config and their
+# correspondance to online quantization:
+# | Use Case      | Checkpoints          |  model_config.quantization |
+# | no quant      | high precision       |  None   |
+# | offline quant | quantized |  fp8, torchao etc. |
+# | online quant  | high precision | torchao etc. |
+#
+# The process for loading non-quantized checkpoint
+# 1. load non-quantized weights (load_weights)
+# 2. do any additional post processing (process_weights_after_loading)
+#
+# The process for loading offline quantized checkpoint
+# 1. load offline-quantized weights (load_weights)
+# 2. do any additional post processing (process_weights_after_loading)
+
+# The process for unquantized model reloading
+# (repeated run in RL training loop)
+# first run
+#   UI1. load_weights: load bfloat16 weights
+#   UI2. process_weights_after_loading: any additional post processing
+# subsequent run
+#   UC1: load_weights: load bfloat16 weights
+#      (shouldn't be any issues since we didn't change any attributes
+#       of the weights)
+#   UC2: process_weights_after_loading: any additional post processing
+
+# The process for weight reloading with online quantization
+# (repeated run in RL training loop)
+# first run
+#  I1. load_weights: load bfloat16 weights
+#  I2. process_weights_after_loading:
+#        record weight metadata and attributes for R1 and R2
+#        quantize weights to fp8
+# subsequent run
+#  (beginning model weight is in fp8)
+#  load_weights:
+#    R1. restore bfloat16 model weight metadata
+#    R2. restore the model weight attributes
+#    R3. reload bfloat16 weights
+#    R4. quantize weights (by calling process_weights_after_loading),
+#    also set `process_weights_after_loading_already_called` to
+#    True to stop it from running again
+#  process_weights_after_loading (if called):
+#    this will be skipped since it's already ran in
+#    load_weights
+
+
+def maybe_save_metadata_and_attributes_for_weight_reloading(
+    model: nn.Module, model_config: ModelConfig
+):
+    # following is to support on the fly quantization, currently only supported
+    # for torchao
+    if model_config.quantization != "torchao":
+        return
+
+    if getattr(model, "process_weights_after_loading_already_called", False):
+        # In case `process_weights_after_loading` is called multiple times
+        # we'll skip it at later times
+        logger.warning(
+            "process_weights_after_loading already called for model %s", model
+        )
+        return
+
+    from vllm.model_executor.model_loader.weight_utils import get_quant_config
+
+    quant_config = get_quant_config(model_config, None)
+
+    # If checkpoint is already torchao serialized, this means it's
+    # pre-quantized quantization case, we'll skip saving the metadata
+    # Otherwise, this is Step I2 of initialization steps of
+    # online quantization
+    # This step record the weights metadata and weight attributes so we can
+    # restore the bfloat16 model weights during the relad step (R1 and R2)
+    # see Notes in online_quantization.py for more details
+    if not (
+        hasattr(quant_config, "is_checkpoint_torchao_serialized")
+        and not quant_config.is_checkpoint_torchao_serialized
+    ):
+        return
+
+    # This is the I2 step of online quantiztion that saves
+    # metadata and attributes of weights so they can be used in R1 and
+    # R2 step, note that we only save these during initialization
+
+    # Includes two things
+    # 1. save floating point metadata (shape, dtype, device) for init
+    # 2. save weight attributes, e.g. `output_dim`, `weight_loader` for init
+
+    if getattr(model, "weight_metadata_and_attr_saved", False):
+        return
+
+    # save the dtype, shape and device for model parameter, used for
+    # restoring the model high precision parameters before
+    # reloading the weights
+    assert not hasattr(model, "original_weights_rebuild_keys")
+    model.original_weights_rebuild_keys = {}
+    for name, p in model.named_parameters():
+        model.original_weights_rebuild_keys[name] = {
+            "shape": p.shape,
+            "dtype": p.dtype,
+            "device": p.device,
+        }
+
+    # record the weight attributes (loader functions etc.)
+    # so these can be recovered later when we reload the weights
+    # structure: {"weight_name": {"weight_attr_key": attr}}
+    assert not hasattr(model, "recorded_weight_attr")
+    model.recorded_weight_attr = {}
+    for name, param in model.named_parameters():
+        model.recorded_weight_attr[name] = {}
+        for key in param.__dict__:
+            if hasattr(param, key):
+                attr = getattr(param, key)
+                if not callable(attr):
+                    model.recorded_weight_attr[name][key] = attr
+                elif hasattr(attr, "__self__") and param is attr.__self__:
+                    # if attr is a bonded method for an instance, and
+                    # attr.__self__ points to the instance (param)
+                    # we'll record the underlying function object
+                    model.recorded_weight_attr[name][key] = attr.__func__
+                else:
+                    model.recorded_weight_attr[name][key] = attr
+    # mark the metadata and attributes saved so we don't run it again
+    model.weight_metadata_and_attr_saved = True
+
+
+def _bond_method_to_cls(func, obj):
+    if hasattr(func, "__self__") or not callable(func):
+        # If the function is already bound to an instance, return it as is
+        return func
+    else:
+        return types.MethodType(func, obj)
+
+
+def load_weights_and_online_quantize(
+    model_loader: DefaultModelLoader, model: nn.Module, model_config: ModelConfig
+) -> set[str]:
+    # online quantization, right now only enabled for
+    # torchao
+    # R1, R2, R3, R4 in the Notes
+
+    # TODO: Add fp8 support
+    assert model_config.quantization == "torchao", (
+        "online quantization is only enabled for torchao currently"
+    )
+    # TODO: use create_weights to restore the weights to original state
+
+    # Step R1: First restore the quantized weights to original bfloat16
+    # weights, with original metadata (shape, dtype, device)
+    # and attributes, so that bfloat16 weights can be loaded properly
+    existing_param_names = dict(model.named_parameters(remove_duplicate=False)).keys()
+    named_modules = dict(model.named_modules(remove_duplicate=False))
+    model_device = None
+
+    # Step R2: recover the parameter to the state before first loading
+    for name, d in model.original_weights_rebuild_keys.items():
+        _shape = d["shape"]
+        _dtype = d["dtype"]
+        _device = d["device"]
+        if model_device is not None:
+            assert model_device == _device, (
+                "Expecting all weights "
+                "to be in the same device for now, got both: "
+                f"{model_device} and {_device}"
+            )
+        else:
+            model_device = _device
+
+        if name in existing_param_names:
+            module_name, weight_name = name.rsplit(".", 1)
+            module = named_modules[module_name]
+            setattr(
+                module,
+                weight_name,
+                torch.nn.Parameter(torch.empty(_shape, dtype=_dtype, device=_device)),
+            )
+
+    # recorded_weight_attr is
+    # {"weight_name": {"weight_attr_key": attr}}
+    # e.g.
+    # {
+    #   {
+    #     "layer.0.weight": {
+    #       "weight_loader": weight_loader_function_object,
+    #       "input_dim": 0, ...
+    #     },
+    #     "layer.1.weight": ...,
+    #    }
+    # }
+    for full_weight_name, weight_attr_dict in model.recorded_weight_attr.items():
+        for attr_name, attr in weight_attr_dict.items():
+            module_name, weight_name = full_weight_name.rsplit(".", 1)
+            module = named_modules[module_name]
+            weight = getattr(module, weight_name)
+            if not hasattr(weight, attr_name):
+                setattr(weight, attr_name, _bond_method_to_cls(attr, weight))
+
+    # Step I1: reload bfloat16 / high precision weights
+    loaded_weights = model.load_weights(
+        model_loader.get_all_weights(model_config, model)
+    )
+
+    # Step I2: online quantize the weights
+    # manually process weights after loading
+    model.process_weights_after_loading_already_called = False
+    process_weights_after_loading(model, model_config, model_device)
+    model.process_weights_after_loading_already_called = True
+    return loaded_weights
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index dc941401a04e0e4e1bab609fb7ed64de7c42d1c8..93da07c5501952fa36a4c3a34b3a7f6e5c9abcc5 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -3,7 +3,6 @@
 # ruff: noqa: SIM117
 import os
 from collections.abc import Generator
-from typing import Optional
 
 import torch
 from torch import nn
@@ -13,42 +12,53 @@ from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
-    download_safetensors_index_file_from_hf, download_weights_from_hf,
-    runai_safetensors_weights_iterator)
-from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
-                                                 list_safetensors)
+    download_safetensors_index_file_from_hf,
+    download_weights_from_hf,
+    runai_safetensors_weights_iterator,
+)
+from vllm.transformers_utils.runai_utils import is_runai_obj_uri, list_safetensors
 
 
 class RunaiModelStreamerLoader(BaseModelLoader):
     """
-        Model loader that can load safetensors
-        files from local FS or S3 bucket.
+    Model loader that can load safetensors
+    files from local FS or S3 bucket.
     """
 
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
+
+        self._is_distributed = False
         if load_config.model_loader_extra_config:
             extra_config = load_config.model_loader_extra_config
 
-            if ("concurrency" in extra_config
-                    and isinstance(extra_config.get("concurrency"), int)):
+            if "distributed" in extra_config and isinstance(
+                extra_config.get("distributed"), bool
+            ):
+                self._is_distributed = extra_config.get("distributed")
+
+            if "concurrency" in extra_config and isinstance(
+                extra_config.get("concurrency"), int
+            ):
                 os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(
-                    extra_config.get("concurrency"))
+                    extra_config.get("concurrency")
+                )
 
-            if ("memory_limit" in extra_config
-                    and isinstance(extra_config.get("memory_limit"), int)):
+            if "memory_limit" in extra_config and isinstance(
+                extra_config.get("memory_limit"), int
+            ):
                 os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(
-                    extra_config.get("memory_limit"))
+                    extra_config.get("memory_limit")
+                )
 
-            runai_streamer_s3_endpoint = os.getenv(
-                'RUNAI_STREAMER_S3_ENDPOINT')
-            aws_endpoint_url = os.getenv('AWS_ENDPOINT_URL')
-            if (runai_streamer_s3_endpoint is None
-                    and aws_endpoint_url is not None):
+            runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT")
+            aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL")
+            if runai_streamer_s3_endpoint is None and aws_endpoint_url is not None:
                 os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url
 
-    def _prepare_weights(self, model_name_or_path: str,
-                         revision: Optional[str]) -> list[str]:
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: str | None
+    ) -> list[str]:
         """Prepare weights for the model.
 
         If the model is not local, it will be downloaded."""
@@ -58,47 +68,49 @@ class RunaiModelStreamerLoader(BaseModelLoader):
         safetensors_pattern = "*.safetensors"
         index_file = SAFE_WEIGHTS_INDEX_NAME
 
-        hf_folder = (model_name_or_path if (is_local or is_object_storage_path)
-                     else download_weights_from_hf(
-                         model_name_or_path,
-                         self.load_config.download_dir,
-                         [safetensors_pattern],
-                         revision,
-                         ignore_patterns=self.load_config.ignore_patterns,
-                     ))
+        hf_folder = (
+            model_name_or_path
+            if (is_local or is_object_storage_path)
+            else download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                [safetensors_pattern],
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+        )
         hf_weights_files = list_safetensors(path=hf_folder)
 
         if not is_local and not is_object_storage_path:
             download_safetensors_index_file_from_hf(
-                model_name_or_path, index_file, self.load_config.download_dir,
-                revision)
+                model_name_or_path, index_file, self.load_config.download_dir, revision
+            )
 
         if not hf_weights_files:
             raise RuntimeError(
-                f"Cannot find any safetensors model weights with "
-                f"`{model_name_or_path}`")
+                f"Cannot find any safetensors model weights with `{model_name_or_path}`"
+            )
 
         return hf_weights_files
 
     def _get_weights_iterator(
-            self, model_or_path: str,
-            revision: str) -> Generator[tuple[str, torch.Tensor], None, None]:
+        self, model_or_path: str, revision: str
+    ) -> Generator[tuple[str, torch.Tensor], None, None]:
         """Get an iterator for the model weights based on the load format."""
         hf_weights_files = self._prepare_weights(model_or_path, revision)
         return runai_safetensors_weights_iterator(
-            hf_weights_files,
-            self.load_config.use_tqdm_on_load,
+            hf_weights_files, self.load_config.use_tqdm_on_load, self._is_distributed
         )
 
     def download_model(self, model_config: ModelConfig) -> None:
         """Download model if necessary"""
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         """Load weights into a model."""
         model_weights = model_config.model
         if hasattr(model_config, "model_weights"):
             model_weights = model_config.model_weights
         model.load_weights(
-            self._get_weights_iterator(model_weights, model_config.revision))
+            self._get_weights_iterator(model_weights, model_config.revision)
+        )
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index a85ca065d1d274870ea56af7b3e24da23ef62c32..d94dbd9f06e0b6b3f5c68769480919044fba32f2 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -5,7 +5,7 @@ import collections
 import glob
 import os
 from collections.abc import Generator
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -15,7 +15,9 @@ from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf, runai_safetensors_weights_iterator)
+    download_weights_from_hf,
+    runai_safetensors_weights_iterator,
+)
 from vllm.transformers_utils.s3_utils import glob as s3_glob
 from vllm.transformers_utils.utils import is_s3
 
@@ -36,23 +38,30 @@ class ShardedStateLoader(BaseModelLoader):
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
 
-        extra_config = ({} if load_config.model_loader_extra_config is None
-                        else load_config.model_loader_extra_config.copy())
+        extra_config = (
+            {}
+            if load_config.model_loader_extra_config is None
+            else load_config.model_loader_extra_config.copy()
+        )
         self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN)
         if extra_config:
-            raise ValueError(f"Unexpected extra config keys for load format "
-                             f"{load_config.load_format}: "
-                             f"{load_config.model_loader_extra_config.keys()}")
+            raise ValueError(
+                f"Unexpected extra config keys for load format "
+                f"{load_config.load_format}: "
+                f"{load_config.model_loader_extra_config.keys()}"
+            )
 
     @staticmethod
     def _filter_subtensors(
-        tensors: dict[str, torch.Tensor], ) -> dict[str, torch.Tensor]:
+        tensors: dict[str, torch.Tensor],
+    ) -> dict[str, torch.Tensor]:
         """
         Filter out all tensors that share the same memory or a subset of the
         memory of another tensor.
         """
         same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = (
-            collections.defaultdict(list))
+            collections.defaultdict(list)
+        )
         for key, tensor in tensors.items():
             if tensor.numel():
                 ptr = tensor.untyped_storage().data_ptr()
@@ -80,8 +89,7 @@ class ShardedStateLoader(BaseModelLoader):
                     result[k] = t
         return result
 
-    def _prepare_weights(self, model_name_or_path: str,
-                         revision: Optional[str]):
+    def _prepare_weights(self, model_name_or_path: str, revision: str | None):
         if is_s3(model_name_or_path) or os.path.isdir(model_name_or_path):
             return model_name_or_path
         else:
@@ -97,8 +105,7 @@ class ShardedStateLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         from vllm.distributed import get_tensor_model_parallel_rank
 
         model_weights = model_config.model
@@ -114,16 +121,16 @@ class ShardedStateLoader(BaseModelLoader):
 
         filepaths = []
         if is_s3(local_model_path):
-            file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}"
-            filepaths = s3_glob(path=local_model_path,
-                                allow_pattern=[file_pattern])
+            file_pattern = f"*{self.pattern.format(rank=rank, part='*')}"
+            filepaths = s3_glob(path=local_model_path, allow_pattern=[file_pattern])
         else:
             filepaths = glob.glob(pattern)
         if not filepaths:
             # TODO: support un-sharded checkpoints too
             raise ValueError(
                 f"Could not find checkpoint files '{pattern}', only "
-                f"pre-sharded checkpoints are currently supported!")
+                f"pre-sharded checkpoints are currently supported!"
+            )
         state_dict = self._filter_subtensors(model.state_dict())
         for key, tensor in self.iterate_over_files(filepaths):
             # If loading with LoRA enabled, additional padding may
@@ -136,8 +143,7 @@ class ShardedStateLoader(BaseModelLoader):
                     param_data = param_data.narrow(dim, 0, size)
             if tensor.shape != param_shape:
                 logger.warning(
-                    "loading tensor of shape %s into "
-                    "parameter '%s' of shape %s",
+                    "loading tensor of shape %s into parameter '%s' of shape %s",
                     tensor.shape,
                     key,
                     param_shape,
@@ -145,15 +151,16 @@ class ShardedStateLoader(BaseModelLoader):
             param_data.copy_(tensor)
             state_dict.pop(key)
         if state_dict:
-            raise ValueError(
-                f"Missing keys {tuple(state_dict)} in loaded state!")
+            raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
 
     def iterate_over_files(
-            self, paths) -> Generator[tuple[str, torch.Tensor], None, None]:
+        self, paths
+    ) -> Generator[tuple[str, torch.Tensor], None, None]:
         if self.load_config.load_format == "runai_streamer_sharded":
             yield from runai_safetensors_weights_iterator(paths, True)
         else:
             from safetensors.torch import safe_open
+
             for path in paths:
                 with safe_open(path, framework="pt") as f:
                     for key in f.keys():  # noqa: SIM118
@@ -164,8 +171,8 @@ class ShardedStateLoader(BaseModelLoader):
     def save_model(
         model: torch.nn.Module,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
         from safetensors.torch import save_file
 
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 13f4eebf1038efd9b52e441acdaa44b9f9574b3e..e4e530f0cea88827fa5af3e484066d38cf9672f7 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -12,7 +12,7 @@ import threading
 import time
 from collections.abc import Generator, MutableMapping
 from dataclasses import asdict, dataclass, field, fields
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Optional
 
 import regex as re
 import torch
@@ -22,23 +22,25 @@ from torch.utils._python_dispatch import TorchDispatchMode
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
-from vllm.config import (ModelConfig, ParallelConfig, VllmConfig,
-                         set_current_vllm_config)
+from vllm.config import ModelConfig, ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser, PlaceholderModule
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.import_utils import PlaceholderModule
 
 if TYPE_CHECKING:
     from vllm.engine.arg_utils import EngineArgs
 
 try:
-    from tensorizer import (DecryptionParams, EncryptionParams,
-                            TensorDeserializer, TensorSerializer)
+    from tensorizer import (
+        DecryptionParams,
+        EncryptionParams,
+        TensorDeserializer,
+        TensorSerializer,
+    )
     from tensorizer.stream_io import open_stream
-    from tensorizer.utils import (convert_bytes, get_mem_usage,
-                                  no_init_or_tensor)
+    from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
 
 except ImportError:
     tensorizer = PlaceholderModule("tensorizer")
@@ -52,15 +54,21 @@ except ImportError:
     no_init_or_tensor = tensorizer.placeholder_attr("utils.no_init_or_tensor")
 
 __all__ = [
-    'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
-    'TensorSerializer', 'open_stream', 'convert_bytes', 'get_mem_usage',
-    'no_init_or_tensor', 'TensorizerConfig'
+    "EncryptionParams",
+    "DecryptionParams",
+    "TensorDeserializer",
+    "TensorSerializer",
+    "open_stream",
+    "convert_bytes",
+    "get_mem_usage",
+    "no_init_or_tensor",
+    "TensorizerConfig",
 ]
 
 logger = init_logger(__name__)
 
 
-def is_valid_deserialization_uri(uri: Optional[str]) -> bool:
+def is_valid_deserialization_uri(uri: str | None) -> bool:
     if uri:
         scheme = uri.lower().split("://")[0]
         return scheme in {"s3", "http", "https"} or os.path.exists(uri)
@@ -73,12 +81,12 @@ def tensorizer_kwargs_arg(value):
         raise argparse.ArgumentTypeError(
             f"Not deserializable to dict: {value}. serialization_kwargs and "
             f"deserialization_kwargs must be "
-            f"deserializable from a JSON string to a dictionary. ")
+            f"deserializable from a JSON string to a dictionary. "
+        )
     return loaded
 
 
 class MetaTensorMode(TorchDispatchMode):
-
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         kwargs = kwargs or {}
 
@@ -88,8 +96,9 @@ class MetaTensorMode(TorchDispatchMode):
         return func(*args, **kwargs)
 
 
-def meta_tensor_mode(loading_code=None, ):
-
+def meta_tensor_mode(
+    loading_code=None,
+):
     if loading_code is None:
         return _NoInitOrTensorImpl.context_manager()
     elif callable(loading_code):
@@ -99,15 +108,15 @@ def meta_tensor_mode(loading_code=None, ):
         raise TypeError(
             "expected a callable to evaluate,"
             " or None if being used as a context manager;"
-            f' got an object of type "{type(loading_code).__name__}" instead.')
+            f' got an object of type "{type(loading_code).__name__}" instead.'
+        )
 
 
 class _NoInitOrTensorImpl:
     _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm)
     _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES)
 
-    is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active",
-                                       default=False)
+    is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active", default=False)
     _count_active: int = 0
     _count_active_lock = threading.Lock()
 
@@ -139,7 +148,6 @@ class _NoInitOrTensorImpl:
 
     @staticmethod
     def _disable(func):
-
         def wrapper(*args, **kwargs):
             if not _NoInitOrTensorImpl.is_active.get():
                 return func(*args, **kwargs)
@@ -149,25 +157,23 @@ class _NoInitOrTensorImpl:
 
 @dataclass
 class TensorizerConfig(MutableMapping):
-    tensorizer_uri: Optional[str] = None
-    tensorizer_dir: Optional[str] = None
-    vllm_tensorized: Optional[bool] = None
-    verify_hash: Optional[bool] = None
-    num_readers: Optional[int] = None
-    encryption_keyfile: Optional[str] = None
-    s3_access_key_id: Optional[str] = None
-    s3_secret_access_key: Optional[str] = None
-    s3_endpoint: Optional[str] = None
-    lora_dir: Optional[str] = None
-    stream_kwargs: Optional[dict[str, Any]] = None
-    serialization_kwargs: Optional[dict[str, Any]] = None
-    deserialization_kwargs: Optional[dict[str, Any]] = None
-    _extra_serialization_attrs: Optional[dict[str, Any]] = field(init=False,
-                                                                 default=None)
-    model_class: Optional[type[torch.nn.Module]] = field(init=False,
-                                                         default=None)
-    hf_config: Optional[PretrainedConfig] = field(init=False, default=None)
-    dtype: Optional[Union[str, torch.dtype]] = field(init=False, default=None)
+    tensorizer_uri: str | None = None
+    tensorizer_dir: str | None = None
+    vllm_tensorized: bool | None = None
+    verify_hash: bool | None = None
+    num_readers: int | None = None
+    encryption_keyfile: str | None = None
+    s3_access_key_id: str | None = None
+    s3_secret_access_key: str | None = None
+    s3_endpoint: str | None = None
+    lora_dir: str | None = None
+    stream_kwargs: dict[str, Any] | None = None
+    serialization_kwargs: dict[str, Any] | None = None
+    deserialization_kwargs: dict[str, Any] | None = None
+    _extra_serialization_attrs: dict[str, Any] | None = field(init=False, default=None)
+    model_class: type[torch.nn.Module] | None = field(init=False, default=None)
+    hf_config: PretrainedConfig | None = field(init=False, default=None)
+    dtype: str | torch.dtype | None = field(init=False, default=None)
     _is_sharded: bool = field(init=False, default=False)
     _fields: ClassVar[tuple[str, ...]]
     _keys: ClassVar[frozenset[str]]
@@ -220,19 +226,23 @@ class TensorizerConfig(MutableMapping):
 
     def __post_init__(self):
         # check if the configuration is for a sharded vLLM model
-        self._is_sharded = isinstance(self.tensorizer_uri, str) \
-            and re.search(r'%0\dd', self.tensorizer_uri) is not None
+        self._is_sharded = (
+            isinstance(self.tensorizer_uri, str)
+            and re.search(r"%0\dd", self.tensorizer_uri) is not None
+        )
 
         if self.tensorizer_dir and self.lora_dir:
             raise ValueError(
                 "Only one of tensorizer_dir or lora_dir may be specified. "
                 "Use lora_dir exclusively when serializing LoRA adapters, "
-                "and tensorizer_dir or tensorizer_uri otherwise.")
+                "and tensorizer_dir or tensorizer_uri otherwise."
+            )
         if self.tensorizer_dir and self.tensorizer_uri:
             logger.warning_once(
                 "Provided both tensorizer_dir and tensorizer_uri. "
                 "Inferring tensorizer_dir from tensorizer_uri as the "
-                "latter takes precedence.")
+                "latter takes precedence."
+            )
             self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
         if not self.tensorizer_uri:
             if self.lora_dir:
@@ -240,11 +250,13 @@ class TensorizerConfig(MutableMapping):
             elif self.tensorizer_dir:
                 self.tensorizer_uri = f"{self.tensorizer_dir}/model.tensors"
             else:
-                raise ValueError("Unable to resolve tensorizer_uri. "
-                                 "A valid tensorizer_uri or tensorizer_dir "
-                                 "must be provided for deserialization, and a "
-                                 "valid tensorizer_uri, tensorizer_uri, or "
-                                 "lora_dir for serialization.")
+                raise ValueError(
+                    "Unable to resolve tensorizer_uri. "
+                    "A valid tensorizer_uri or tensorizer_dir "
+                    "must be provided for deserialization, and a "
+                    "valid tensorizer_uri, tensorizer_uri, or "
+                    "lora_dir for serialization."
+                )
         else:
             self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
 
@@ -280,8 +292,12 @@ class TensorizerConfig(MutableMapping):
 
         tc_dict = {}
         for k, v in raw_tc_dict.items():
-            if (k not in blacklisted and k not in tc_dict
-                    and not k.startswith("_") and v is not None):
+            if (
+                k not in blacklisted
+                and k not in tc_dict
+                and not k.startswith("_")
+                and v is not None
+            ):
                 tc_dict[k] = v
 
         return tc_dict
@@ -293,26 +309,25 @@ class TensorizerConfig(MutableMapping):
         self,
         parallel_config: "ParallelConfig",
     ) -> None:
-        if parallel_config.tensor_parallel_size > 1 \
-            and not self._is_sharded:
+        if parallel_config.tensor_parallel_size > 1 and not self._is_sharded:
             raise ValueError(
                 "For a sharded model, tensorizer_uri should include a"
                 " string format template like '%04d' to be formatted"
-                " with the rank of the shard")
+                " with the rank of the shard"
+            )
 
     def verify_with_model_config(self, model_config: "ModelConfig") -> None:
-        if (model_config.quantization is not None
-                and self.tensorizer_uri is not None):
+        if model_config.quantization is not None and self.tensorizer_uri is not None:
             logger.warning(
                 "Loading a model using Tensorizer with quantization on vLLM"
-                " is unstable and may lead to errors.")
+                " is unstable and may lead to errors."
+            )
 
     def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
         if tensorizer_args is None:
             tensorizer_args = self._construct_tensorizer_args()
 
-        return open_stream(self.tensorizer_uri,
-                           **tensorizer_args.stream_kwargs)
+        return open_stream(self.tensorizer_uri, **tensorizer_args.stream_kwargs)
 
     def keys(self):
         return self._keys
@@ -346,42 +361,44 @@ TensorizerConfig._keys = frozenset(TensorizerConfig._fields)
 
 @dataclass
 class TensorizerArgs:
-    tensorizer_uri: Optional[str] = None
-    tensorizer_dir: Optional[str] = None
-    encryption_keyfile: Optional[str] = None
+    tensorizer_uri: str | None = None
+    tensorizer_dir: str | None = None
+    encryption_keyfile: str | None = None
 
     def __init__(self, tensorizer_config: TensorizerConfig):
         for k, v in tensorizer_config.items():
             setattr(self, k, v)
         self.file_obj = tensorizer_config.tensorizer_uri
-        self.s3_access_key_id = (tensorizer_config.s3_access_key_id
-                                 or envs.S3_ACCESS_KEY_ID)
-        self.s3_secret_access_key = (tensorizer_config.s3_secret_access_key
-                                     or envs.S3_SECRET_ACCESS_KEY)
+        self.s3_access_key_id = (
+            tensorizer_config.s3_access_key_id or envs.S3_ACCESS_KEY_ID
+        )
+        self.s3_secret_access_key = (
+            tensorizer_config.s3_secret_access_key or envs.S3_SECRET_ACCESS_KEY
+        )
         self.s3_endpoint = tensorizer_config.s3_endpoint or envs.S3_ENDPOINT_URL
 
         self.stream_kwargs = {
             "s3_access_key_id": tensorizer_config.s3_access_key_id,
             "s3_secret_access_key": tensorizer_config.s3_secret_access_key,
             "s3_endpoint": tensorizer_config.s3_endpoint,
-            **(tensorizer_config.stream_kwargs or {})
+            **(tensorizer_config.stream_kwargs or {}),
         }
 
         self.deserialization_kwargs = {
             "verify_hash": tensorizer_config.verify_hash,
             "encryption": tensorizer_config.encryption_keyfile,
             "num_readers": tensorizer_config.num_readers,
-            **(tensorizer_config.deserialization_kwargs or {})
+            **(tensorizer_config.deserialization_kwargs or {}),
         }
 
         if self.encryption_keyfile:
             with open_stream(
-                    tensorizer_config.encryption_keyfile,
-                    **self.stream_kwargs,
+                tensorizer_config.encryption_keyfile,
+                **self.stream_kwargs,
             ) as stream:
                 key = stream.read()
                 decryption_params = DecryptionParams.from_key(key)
-                self.deserialization_kwargs['encryption'] = decryption_params
+                self.deserialization_kwargs["encryption"] = decryption_params
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -389,17 +406,20 @@ class TensorizerArgs:
 
         # Tensorizer options arg group
         group = parser.add_argument_group(
-            'tensorizer options',
-            description=('Options for configuring the behavior of the'
-                         ' tensorizer deserializer when '
-                         'load_format=tensorizer is specified when '
-                         'initializing an LLMEngine, either via the CLI '
-                         'when running the vLLM OpenAI inference server '
-                         'with a JSON string passed to '
-                         '--model-loader-extra-config or as arguments given '
-                         'to TensorizerConfig when passed to '
-                         'model_loader_extra_config in the constructor '
-                         'for LLMEngine.'))
+            "tensorizer options",
+            description=(
+                "Options for configuring the behavior of the"
+                " tensorizer deserializer when "
+                "load_format=tensorizer is specified when "
+                "initializing an LLMEngine, either via the CLI "
+                "when running the vLLM OpenAI inference server "
+                "with a JSON string passed to "
+                "--model-loader-extra-config or as arguments given "
+                "to TensorizerConfig when passed to "
+                "model_loader_extra_config in the constructor "
+                "for LLMEngine."
+            ),
+        )
 
         group.add_argument(
             "--tensorizer-uri",
@@ -419,7 +439,8 @@ class TensorizerArgs:
             type=str,
             default=None,
             help="The file path to a binary file containing a binary key to "
-            "use for decryption. Can be a file path or S3 network URI.")
+            "use for decryption. Can be a file path or S3 network URI.",
+        )
         group.add_argument(
             "--num-readers",
             default=None,
@@ -427,7 +448,8 @@ class TensorizerArgs:
             help="Controls how many threads are allowed to read concurrently "
             "from the source file. Default is `None`, which will dynamically "
             "set the number of readers based on the available resources "
-            "and model size. This greatly increases performance.")
+            "and model size. This greatly increases performance.",
+        )
         group.add_argument(
             "--s3-access-key-id",
             type=str,
@@ -455,72 +477,81 @@ class TensorizerArgs:
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
         attrs = [attr.name for attr in dataclasses.fields(cls)]
-        tensorizer_args = cls(**{
-            attr: getattr(args, attr)
-            for attr in attrs if hasattr(args, attr)
-        })
+        tensorizer_args = cls(
+            **{attr: getattr(args, attr) for attr in attrs if hasattr(args, attr)}
+        )
         return tensorizer_args
 
 
 def _check_tensors_on_meta_device(model: nn.Module) -> None:
     for tensor in model.state_dict().values():
-        if tensor.device.type == 'meta':
+        if tensor.device.type == "meta":
             raise ValueError(
                 "The serialized model contains tensors on the meta device,"
                 " indicating that some tensors were not loaded properly."
                 " Please check that the parameters of the model being"
                 " specified match that of the serialized model, such as"
-                " its quantization.")
+                " its quantization."
+            )
 
 
 def _resize_lora_embeddings(model: nn.Module):
     """Modify LoRA embedding layers to use bigger tensors
     to allow for adapter added tokens."""
     for child in model.modules():
-        if (isinstance(child, VocabParallelEmbedding) and child.weight.shape[0]
-                < child.num_embeddings_per_partition):
-            new_weight = torch.empty(child.num_embeddings_per_partition,
-                                     child.embedding_dim,
-                                     dtype=child.weight.dtype,
-                                     device=child.weight.device)
-            new_weight[:child.weight.shape[0]].copy_(child.weight.data)
-            new_weight[child.weight.shape[0]:].fill_(0)
+        if (
+            isinstance(child, VocabParallelEmbedding)
+            and child.weight.shape[0] < child.num_embeddings_per_partition
+        ):
+            new_weight = torch.empty(
+                child.num_embeddings_per_partition,
+                child.embedding_dim,
+                dtype=child.weight.dtype,
+                device=child.weight.device,
+            )
+            new_weight[: child.weight.shape[0]].copy_(child.weight.data)
+            new_weight[child.weight.shape[0] :].fill_(0)
             child.weight.data = new_weight
 
 
-def init_tensorizer_model(tensorizer_config: TensorizerConfig,
-                          vllm_config: VllmConfig) -> nn.Module:
+def init_tensorizer_model(
+    tensorizer_config: TensorizerConfig, vllm_config: VllmConfig
+) -> nn.Module:
     assert tensorizer_config.hf_config is not None
     model_args = tensorizer_config.hf_config
-    model_args.torch_dtype = tensorizer_config.dtype
+    model_args.dtype = tensorizer_config.dtype
     assert tensorizer_config.model_class is not None
     # TODO: Do we need to consider old-style model class?
-    with meta_tensor_mode(), set_current_vllm_config(vllm_config,
-                                                     check_compile=True):
+    with meta_tensor_mode(), set_current_vllm_config(vllm_config, check_compile=True):
         return tensorizer_config.model_class(vllm_config=vllm_config)
 
 
-def deserialize_tensorizer_model(model: nn.Module,
-                                 tensorizer_config: TensorizerConfig) -> None:
+def deserialize_tensorizer_model(
+    model: nn.Module, tensorizer_config: TensorizerConfig
+) -> None:
     tensorizer_args = tensorizer_config._construct_tensorizer_args()
     if not is_valid_deserialization_uri(tensorizer_config.tensorizer_uri):
         raise ValueError(
             f"{tensorizer_config.tensorizer_uri} is not a valid "
             f"tensorizer URI. Please check that the URI is correct. "
             f"It must either point to a local existing file, or have a "
-            f"S3, HTTP or HTTPS scheme.")
+            f"S3, HTTP or HTTPS scheme."
+        )
     before_mem = get_mem_usage()
     start = time.perf_counter()
-    with open_stream(
-            tensorizer_config.tensorizer_uri,
-            mode="rb",
-            **tensorizer_args.stream_kwargs) as stream, TensorDeserializer(
-                stream,
-                dtype=tensorizer_config.dtype,
-                device=f'xpu:{torch.xpu.current_device()}'
-                if current_platform.is_xpu() else
-                f'cuda:{torch.cuda.current_device()}',
-                **tensorizer_args.deserialization_kwargs) as deserializer:
+    with (
+        open_stream(
+            tensorizer_config.tensorizer_uri, mode="rb", **tensorizer_args.stream_kwargs
+        ) as stream,
+        TensorDeserializer(
+            stream,
+            dtype=tensorizer_config.dtype,
+            device=f"xpu:{torch.xpu.current_device()}"
+            if current_platform.is_xpu()
+            else f"cuda:{torch.cuda.current_device()}",
+            **tensorizer_args.deserialization_kwargs,
+        ) as deserializer,
+    ):
         deserializer.load_into_module(model)
         end = time.perf_counter()
 
@@ -529,8 +560,9 @@ def deserialize_tensorizer_model(model: nn.Module,
     per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
     after_mem = get_mem_usage()
     deserializer.close()
-    logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
-                end - start, per_second)
+    logger.info(
+        "Deserialized %s in %0.2fs, %s/s", total_bytes_str, end - start, per_second
+    )
     logger.info("Memory usage before: %s", before_mem)
     logger.info("Memory usage after: %s", after_mem)
 
@@ -540,20 +572,21 @@ def deserialize_tensorizer_model(model: nn.Module,
 
 
 def tensorizer_weights_iterator(
-    tensorizer_args: "TensorizerArgs"
+    tensorizer_args: "TensorizerArgs",
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
-    logger.warning("Deserializing HuggingFace models is not optimized for "
-                   "loading on vLLM, as tensorizer is forced to load to CPU. "
-                   "Consider deserializing a vLLM model instead for faster "
-                   "load times. See the "
-                   "examples/others/tensorize_vllm_model.py example script "
-                   "for serializing vLLM models.")
+    logger.warning(
+        "Deserializing HuggingFace models is not optimized for "
+        "loading on vLLM, as tensorizer is forced to load to CPU. "
+        "Consider deserializing a vLLM model instead for faster "
+        "load times. See the "
+        "examples/others/tensorize_vllm_model.py example script "
+        "for serializing vLLM models."
+    )
 
     deserializer_args = tensorizer_args.deserialization_kwargs
     stream_kwargs = tensorizer_args.stream_kwargs
     stream = open_stream(tensorizer_args.tensorizer_uri, **stream_kwargs)
-    with TensorDeserializer(stream, **deserializer_args,
-                            device="cpu") as state:
+    with TensorDeserializer(stream, **deserializer_args, device="cpu") as state:
         yield from state.items()
     del state
 
@@ -571,41 +604,54 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
         bool: True if the model is a vLLM model, False otherwise.
     """
     tensorizer_args = tensorizer_config._construct_tensorizer_args()
-    deserializer = TensorDeserializer(open_stream(
-        tensorizer_args.tensorizer_uri, **tensorizer_args.stream_kwargs),
-                                      **tensorizer_args.deserialization_kwargs,
-                                      lazy_load=True)
+    deserializer = TensorDeserializer(
+        open_stream(tensorizer_args.tensorizer_uri, **tensorizer_args.stream_kwargs),
+        **tensorizer_args.deserialization_kwargs,
+        lazy_load=True,
+    )
     if tensorizer_config.vllm_tensorized:
         logger.warning(
             "Please note that newly serialized vLLM models are automatically "
             "inferred as vLLM models, so setting vllm_tensorized=True is "
-            "only necessary for models serialized prior to this change.")
+            "only necessary for models serialized prior to this change."
+        )
         return True
     return ".vllm_tensorized_marker" in deserializer
 
 
 def serialize_extra_artifacts(
-        tensorizer_args: TensorizerArgs,
-        served_model_name: Union[str, list[str], None]) -> None:
+    tensorizer_args: TensorizerArgs, served_model_name: str | list[str] | None
+) -> None:
     if not isinstance(served_model_name, str):
         raise ValueError(
             f"served_model_name must be a str for serialize_extra_artifacts, "
-            f"not {type(served_model_name)}.")
+            f"not {type(served_model_name)}."
+        )
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        snapshot_download(served_model_name,
-                          local_dir=tmpdir,
-                          ignore_patterns=[
-                              "*.pt", "*.safetensors", "*.bin", "*.cache",
-                              "*.gitattributes", "*.md"
-                          ])
+        snapshot_download(
+            served_model_name,
+            local_dir=tmpdir,
+            ignore_patterns=[
+                "*.pt",
+                "*.safetensors",
+                "*.bin",
+                "*.cache",
+                "*.gitattributes",
+                "*.md",
+            ],
+        )
         for artifact in os.scandir(tmpdir):
             if not artifact.is_file():
                 continue
-            with open(artifact.path, "rb") as f, open_stream(
+            with (
+                open(artifact.path, "rb") as f,
+                open_stream(
                     f"{tensorizer_args.tensorizer_dir}/{artifact.name}",
                     mode="wb+",
-                    **tensorizer_args.stream_kwargs) as stream:
+                    **tensorizer_args.stream_kwargs,
+                ) as stream,
+            ):
                 logger.info("Writing artifact %s", artifact.name)
                 stream.write(f.read())
 
@@ -617,7 +663,8 @@ def serialize_vllm_model(
 ) -> nn.Module:
     model.register_parameter(
         "vllm_tensorized_marker",
-        nn.Parameter(torch.tensor((1, ), device="meta"), requires_grad=False))
+        nn.Parameter(torch.tensor((1,), device="meta"), requires_grad=False),
+    )
 
     tensorizer_args = tensorizer_config._construct_tensorizer_args()
 
@@ -630,13 +677,17 @@ def serialize_vllm_model(
     output_file = tensorizer_args.tensorizer_uri
     if tensorizer_config._is_sharded:
         from vllm.distributed import get_tensor_model_parallel_rank
+
         output_file = output_file % get_tensor_model_parallel_rank()
 
-    with open_stream(output_file, mode="wb+",
-                     **tensorizer_args.stream_kwargs) as stream:
-        serializer = TensorSerializer(stream,
-                                      encryption=encryption_params,
-                                      **tensorizer_config.serialization_kwargs)
+    with open_stream(
+        output_file, mode="wb+", **tensorizer_args.stream_kwargs
+    ) as stream:
+        serializer = TensorSerializer(
+            stream,
+            encryption=encryption_params,
+            **tensorizer_config.serialization_kwargs,
+        )
         serializer.write_module(model)
         serializer.close()
 
@@ -646,34 +697,35 @@ def serialize_vllm_model(
     return model
 
 
-def tensorize_vllm_model(engine_args: "EngineArgs",
-                         tensorizer_config: TensorizerConfig,
-                         generate_keyfile: bool = True):
+def tensorize_vllm_model(
+    engine_args: "EngineArgs",
+    tensorizer_config: TensorizerConfig,
+    generate_keyfile: bool = True,
+):
     """Utility to load a model and then serialize it with Tensorizer
 
-       Intended to be used separately from running a vLLM server since it
-       creates its own Engine instance.
+    Intended to be used separately from running a vLLM server since it
+    creates its own Engine instance.
     """
     engine_config = engine_args.create_engine_config()
     tensorizer_config.verify_with_model_config(engine_config.model_config)
-    tensorizer_config.verify_with_parallel_config(
-        engine_config.parallel_config)
+    tensorizer_config.verify_with_parallel_config(engine_config.parallel_config)
 
     # generate the encryption key before creating the engine to support sharding
-    if generate_keyfile and (keyfile :=
-                             tensorizer_config.encryption_keyfile) is not None:
+    if (
+        generate_keyfile
+        and (keyfile := tensorizer_config.encryption_keyfile) is not None
+    ):
         encryption_params = EncryptionParams.random()
         with open_stream(
-                keyfile,
-                mode="wb+",
-                s3_access_key_id=tensorizer_config.s3_access_key_id,
-                s3_secret_access_key=tensorizer_config.s3_secret_access_key,
-                s3_endpoint=tensorizer_config.s3_endpoint,
+            keyfile,
+            mode="wb+",
+            s3_access_key_id=tensorizer_config.s3_access_key_id,
+            s3_secret_access_key=tensorizer_config.s3_secret_access_key,
+            s3_endpoint=tensorizer_config.s3_endpoint,
         ) as stream:
             stream.write(encryption_params.key)
 
-    assert envs.VLLM_USE_V1
-
     from vllm.v1.engine.llm_engine import LLMEngine
 
     engine = LLMEngine.from_vllm_config(engine_config)
@@ -683,8 +735,7 @@ def tensorize_vllm_model(engine_args: "EngineArgs",
     )
 
 
-def tensorize_lora_adapter(lora_path: str,
-                           tensorizer_config: TensorizerConfig):
+def tensorize_lora_adapter(lora_path: str, tensorizer_config: TensorizerConfig):
     """
     Uses tensorizer to serialize a LoRA adapter. Assumes that the files
     needed to load a LoRA adapter are a safetensors-format file called
@@ -720,19 +771,20 @@ def tensorize_lora_adapter(lora_path: str,
 
     tensorizer_args = tensorizer_config._construct_tensorizer_args()
 
-    with open_stream(f"{tensorizer_config.tensorizer_dir}/adapter_config.json",
-                     mode="wb+",
-                     **tensorizer_args.stream_kwargs) as f:
-
+    with open_stream(
+        f"{tensorizer_config.tensorizer_dir}/adapter_config.json",
+        mode="wb+",
+        **tensorizer_args.stream_kwargs,
+    ) as f:
         f.write(json.dumps(config).encode("utf-8"))
 
-    lora_uri = (f"{tensorizer_config.tensorizer_dir}"
-                f"/adapter_model.tensors")
-    with open_stream(lora_uri, mode="wb+",
-                     **tensorizer_args.stream_kwargs) as f:
+    lora_uri = f"{tensorizer_config.tensorizer_dir}/adapter_model.tensors"
+    with open_stream(lora_uri, mode="wb+", **tensorizer_args.stream_kwargs) as f:
         serializer = TensorSerializer(f)
         serializer.write_state_dict(tensors)
         serializer.close()
 
-    logger.info("Successfully serialized LoRA files to %s",
-                str(tensorizer_config.tensorizer_dir))
+    logger.info(
+        "Successfully serialized LoRA files to %s",
+        str(tensorizer_config.tensorizer_dir),
+    )
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index 65ea49c6429445869d8bb8e105aaf60bf8ee1e75..2b3704cfebbaa9fd2c126917eecd79b47158e86e 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -3,7 +3,6 @@
 # ruff: noqa: SIM117
 import copy
 from collections.abc import Generator
-from typing import Union
 
 import torch
 from torch import nn
@@ -13,11 +12,18 @@ from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.tensorizer import (
-    TensorizerConfig, deserialize_tensorizer_model, init_tensorizer_model,
-    is_vllm_tensorized, serialize_vllm_model, tensorizer_weights_iterator)
-from vllm.model_executor.model_loader.utils import (get_model_architecture,
-                                                    initialize_model,
-                                                    set_default_torch_dtype)
+    TensorizerConfig,
+    deserialize_tensorizer_model,
+    init_tensorizer_model,
+    is_vllm_tensorized,
+    serialize_vllm_model,
+    tensorizer_weights_iterator,
+)
+from vllm.model_executor.model_loader.utils import (
+    get_model_architecture,
+    initialize_model,
+)
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 logger = init_logger(__name__)
 
@@ -44,15 +50,18 @@ class TensorizerLoader(BaseModelLoader):
         else:
             validate_config(load_config.model_loader_extra_config)
             self.tensorizer_config = TensorizerConfig(
-                **load_config.model_loader_extra_config["tensorizer_config"])
+                **load_config.model_loader_extra_config["tensorizer_config"]
+            )
 
-    def _verify_config(self, model_config: ModelConfig,
-                       parallel_config: ParallelConfig):
+    def _verify_config(
+        self, model_config: ModelConfig, parallel_config: ParallelConfig
+    ):
         self.tensorizer_config.verify_with_model_config(model_config)
         self.tensorizer_config.verify_with_parallel_config(parallel_config)
 
     def _get_weights_iterator(
-        self, ) -> Generator[tuple[str, torch.Tensor], None, None]:
+        self,
+    ) -> Generator[tuple[str, torch.Tensor], None, None]:
         tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
         return tensorizer_weights_iterator(tensorizer_args)
 
@@ -82,8 +91,7 @@ class TensorizerLoader(BaseModelLoader):
         with self.tensorizer_config.open_stream():
             pass
 
-    def _patch_tensorizer_config(
-            self, model_config: ModelConfig) -> TensorizerConfig:
+    def _patch_tensorizer_config(self, model_config: ModelConfig) -> TensorizerConfig:
         model_class = get_model_architecture(model_config)[0]
         tensorizer_config = copy.copy(self.tensorizer_config)
         tensorizer_config.model_class = model_class
@@ -91,8 +99,7 @@ class TensorizerLoader(BaseModelLoader):
         tensorizer_config.dtype = model_config.dtype
         return tensorizer_config
 
-    def load_weights(self, model: nn.Module,
-                     model_config: ModelConfig) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         """Load serialized model weights with tensorizer.
 
         Expects a vLLM-tensorized model. See the
@@ -104,8 +111,9 @@ class TensorizerLoader(BaseModelLoader):
         else:
             model.load_weights(self._get_weights_iterator())
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
+    def load_model(
+        self, vllm_config: VllmConfig, model_config: ModelConfig
+    ) -> nn.Module:
         parallel_config = vllm_config.parallel_config
         self._verify_config(model_config, parallel_config)
 
@@ -113,8 +121,8 @@ class TensorizerLoader(BaseModelLoader):
             from vllm.distributed import get_tensor_model_parallel_rank
 
             self.tensorizer_config.tensorizer_uri = (
-                self.tensorizer_config.tensorizer_uri %
-                get_tensor_model_parallel_rank())
+                self.tensorizer_config.tensorizer_uri % get_tensor_model_parallel_rank()
+            )
 
         if is_vllm_tensorized(self.tensorizer_config):
             tensorizer_config = self._patch_tensorizer_config(model_config)
@@ -122,8 +130,8 @@ class TensorizerLoader(BaseModelLoader):
             with set_default_torch_dtype(model_config.dtype):
                 with torch.device(device_config.device):
                     model = init_tensorizer_model(
-                        tensorizer_config=tensorizer_config,
-                        vllm_config=vllm_config)
+                        tensorizer_config=tensorizer_config, vllm_config=vllm_config
+                    )
             self.load_weights(model, model_config)
             return model
         return self._load_model_serialized_cpu(vllm_config=vllm_config)
@@ -131,7 +139,7 @@ class TensorizerLoader(BaseModelLoader):
     @staticmethod
     def save_model(
         model: torch.nn.Module,
-        tensorizer_config: Union[TensorizerConfig, dict],
+        tensorizer_config: TensorizerConfig | dict,
         model_config: ModelConfig,
     ) -> None:
         if isinstance(tensorizer_config, dict):
diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py
index a70cdeb483e67d6a91b227e23d1d172b91394183..fc142f1f07fae40e58c3c0d1918bb54e4539ca6d 100644
--- a/vllm/model_executor/model_loader/tpu.py
+++ b/vllm/model_executor/model_loader/tpu.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -13,7 +12,10 @@ from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model,
+    process_weights_after_loading,
+)
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 logger = init_logger(__name__)
 
@@ -27,40 +29,38 @@ class TPUModelLoader(DefaultModelLoader):
         self,
         vllm_config: VllmConfig,
         model_config: ModelConfig,
-        mesh: Optional[xs.Mesh] = None,
+        mesh: xs.Mesh | None = None,
     ) -> nn.Module:
         # Initialize model and load weights on CPU. Then, during SPMD partition,
         # weights are sharded and transferred to TPUs.
         self.counter_before_loading_weights = time.perf_counter()
         model_config = vllm_config.model_config
         assert model_config.quantization is None, "Quantization not supported"
-        target_device = torch.device('cpu')
+        target_device = torch.device("cpu")
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
                 model = initialize_model(vllm_config=vllm_config)
 
             load_format = vllm_config.load_config.load_format
             if load_format != "dummy":
-                weights_to_load = {
-                    name
-                    for name, _ in model.named_parameters()
-                }
+                weights_to_load = {name for name, _ in model.named_parameters()}
                 all_weights = self.get_all_weights(model_config, model)
                 loaded_weights = model.load_weights(all_weights)
                 self.counter_after_loading_weights = time.perf_counter()
                 logger.info(
                     "Loading weights took %.2f seconds",
-                    self.counter_after_loading_weights -
-                    self.counter_before_loading_weights)
+                    self.counter_after_loading_weights
+                    - self.counter_before_loading_weights,
+                )
                 # We only enable strict check for non-quantized models
                 # that have loaded weights tracking currently.
-                if model_config.quantization is None and \
-                    loaded_weights is not None:
+                if model_config.quantization is None and loaded_weights is not None:
                     weights_not_loaded = weights_to_load - loaded_weights
                     if weights_not_loaded:
                         raise ValueError(
                             "Following weights were not initialized from "
-                            f"checkpoint: {weights_not_loaded}")
+                            f"checkpoint: {weights_not_loaded}"
+                        )
             else:
                 logger.info("Use dummy weight during weight loading.")
 
@@ -68,11 +68,13 @@ class TPUModelLoader(DefaultModelLoader):
 
         counter_before_partition = time.perf_counter()
         model = model.eval()
-        model = model.to('xla')
+        model = model.to("xla")
         shard_model(model, mesh)
         counter_after_partition = time.perf_counter()
-        logger.info("Partition model took %.2f seconds",
-                    counter_after_partition - counter_before_partition)
+        logger.info(
+            "Partition model took %.2f seconds",
+            counter_after_partition - counter_before_partition,
+        )
 
         # Ensure the model is properly loaded.
         self._check_model_is_loaded(mesh, model)
@@ -82,12 +84,12 @@ class TPUModelLoader(DefaultModelLoader):
         if not model_config.is_multimodal_model:
             model.model = torch.compile(model.model, backend="openxla")
         else:
-            model.language_model.model = \
-                torch.compile(model.language_model.model, backend="openxla")
+            model.language_model.model = torch.compile(
+                model.language_model.model, backend="openxla"
+            )
         return model
 
-    def _check_model_is_loaded(self, mesh: Optional[xs.Mesh],
-                               model: nn.Module) -> None:
+    def _check_model_is_loaded(self, mesh: xs.Mesh | None, model: nn.Module) -> None:
         """
         Ensure the model is properly loaded.
         1. All model parameters and buffers are on XLA device.
@@ -99,16 +101,18 @@ class TPUModelLoader(DefaultModelLoader):
         # Check parameters
         for name, param in model.named_parameters():
             assert param.device.type == device_type, (
-                f"Parameter {name} is on {param.device.type} "
-                f"instead of {device_type}")
+                f"Parameter {name} is on {param.device.type} instead of {device_type}"
+            )
 
         # Check buffers
         for name, buffer in model.named_buffers():
             assert buffer.device.type == device_type, (
-                f"Buffer {name} is on {buffer.device.type} "
-                f"instead of {device_type}")
+                f"Buffer {name} is on {buffer.device.type} instead of {device_type}"
+            )
 
         for module in model.modules():
-            if (mesh is not None) and (get_fqn(module) == 'QKVParallelLinear'):
-                raise AssertionError("QKVParallelLinear should be replaced by \
-                            XlaQKVParallelLinear under SPMD mode.")
+            if (mesh is not None) and (get_fqn(module) == "QKVParallelLinear"):
+                raise AssertionError(
+                    "QKVParallelLinear should be replaced by \
+                            XlaQKVParallelLinear under SPMD mode."
+                )
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 03202e13c280362281744469d358fc75da1a6da4..ba708a098c0da54b5d6310c630be3a1038f12d6d 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,48 +1,42 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading models."""
-import contextlib
+
 import inspect
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import Optional
 
 import torch
 from torch import nn
 from typing_extensions import assert_never
 
 from vllm.attention import Attention
+from vllm.attention.layer import MLAAttention
 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import QKVCrossParallelLinear
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
 from vllm.model_executor.models.adapters import (
-    as_embedding_model, as_reward_model, as_seq_cls_model,
-    try_create_mm_pooling_model_cls)
-from vllm.model_executor.models.interfaces import (SupportsQuant,
-                                                   supports_multimodal)
-from vllm.utils import is_pin_memory_available
+    as_embedding_model,
+    as_reward_model,
+    as_seq_cls_model,
+    try_create_mm_pooling_model_cls,
+)
+from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
 
-@contextlib.contextmanager
-def set_default_torch_dtype(dtype: torch.dtype):
-    """Sets the default torch dtype to the given dtype."""
-    old_dtype = torch.get_default_dtype()
-    torch.set_default_dtype(dtype)
-    yield
-    torch.set_default_dtype(old_dtype)
-
-
 def initialize_model(
     vllm_config: VllmConfig,
     *,
     prefix: str = "",
-    model_class: Optional[type[nn.Module]] = None,
-    model_config: Optional[ModelConfig] = None,
+    model_class: type[nn.Module] | None = None,
+    model_config: ModelConfig | None = None,
 ) -> nn.Module:
     """Initialize a model with the given configurations."""
     if model_config is None:
@@ -57,16 +51,16 @@ def initialize_model(
     all_params = [param.name for param in signatures.parameters.values()]
     if "vllm_config" in all_params and "prefix" in all_params:
         # new-style model class
-        with set_current_vllm_config(vllm_config,
-                                     check_compile=True,
-                                     prefix=prefix):
+        with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix):
             return model_class(vllm_config=vllm_config, prefix=prefix)
 
-    msg = ("vLLM model class should accept `vllm_config` and `prefix` as "
-           "input arguments. Possibly you have an old-style model class"
-           " registered from out of tree and it is used for new vLLM version. "
-           "Check https://docs.vllm.ai/en/latest/design/arch_overview.html "
-           "for the design and update the model class accordingly.")
+    msg = (
+        "vLLM model class should accept `vllm_config` and `prefix` as "
+        "input arguments. Possibly you have an old-style model class"
+        " registered from out of tree and it is used for new vLLM version. "
+        "Check https://docs.vllm.ai/en/latest/design/arch_overview.html "
+        "for the design and update the model class accordingly."
+    )
     warnings.warn(msg, DeprecationWarning, stacklevel=2)
 
     logger.warning(
@@ -87,20 +81,21 @@ def initialize_model(
         kwargs["lora_config"] = vllm_config.lora_config
     if "scheduler_config" in all_params:
         kwargs["scheduler_config"] = vllm_config.scheduler_config
-    with set_current_vllm_config(vllm_config,
-                                 check_compile=True,
-                                 prefix=prefix):
+    with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix):
         return model_class(**kwargs)
 
 
-def process_weights_after_loading(model: nn.Module, model_config: ModelConfig,
-                                  target_device: torch.device) -> None:
+def process_weights_after_loading(
+    model: nn.Module, model_config: ModelConfig, target_device: torch.device
+) -> None:
+    # to avoid circular dependency
+    from vllm.model_executor.model_loader.online_quantization import (
+        maybe_save_metadata_and_attributes_for_weight_reloading,
+    )
+
+    maybe_save_metadata_and_attributes_for_weight_reloading(model, model_config)
+
     for _, module in model.named_modules():
-        if isinstance(module, QKVCrossParallelLinear):
-            # NOTE(Isotr0py): special case for cross QKV layer because
-            # q and kv proj aren't registered as submodules intentionally
-            module.process_weights_after_loading()
-            continue
         quant_method = getattr(module, "quant_method", None)
         if isinstance(quant_method, QuantizeMethodBase):
             # When quant methods need to process weights after loading
@@ -111,20 +106,19 @@ def process_weights_after_loading(model: nn.Module, model_config: ModelConfig,
             with device_loading_context(module, target_device):
                 quant_method.process_weights_after_loading(module)
 
-    # Currently only used by MLA.
-    # NOTE: This intentionally happens after other modules so we can easily
-    # decompress the weights for MLA.
+    # Initialize post-load attention weights for both Attention and MLA.
+    # NOTE: Happens after other modules so we can easily decompress weights.
     for _, module in model.named_modules():
-        if isinstance(module, Attention) and \
-            hasattr(module, "process_weights_after_loading"):
+        if isinstance(module, (Attention, MLAAttention)) and hasattr(
+            module, "process_weights_after_loading"
+        ):
             # TODO(lucas): see if there is a way to unify the signatures
             # of process_weights_after_loading
             module.process_weights_after_loading(model_config.dtype)
 
 
 @contextmanager
-def device_loading_context(module: torch.nn.Module,
-                           target_device: torch.device):
+def device_loading_context(module: torch.nn.Module, target_device: torch.device):
     if target_device.type == "cpu":
         # If target is CPU, no need to move anything
         yield module
@@ -169,8 +163,7 @@ _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]()
 """Caches the outputs of `_get_model_architecture`."""
 
 
-def _get_model_architecture(
-        model_config: ModelConfig) -> tuple[type[nn.Module], str]:
+def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
     architectures = getattr(model_config.hf_config, "architectures", [])
 
     model_cls, arch = model_config.registry.resolve_model_cls(
@@ -184,7 +177,9 @@ def _get_model_architecture(
             logger.warning_once(
                 "%s has no vLLM implementation, falling back to Transformers "
                 "implementation. Some features may not be supported and "
-                "performance may not be optimal.", arch)
+                "performance may not be optimal.",
+                arch,
+            )
 
     convert_type = model_config.convert_type
     if convert_type != "none" and supports_multimodal(model_cls):
@@ -213,16 +208,17 @@ def _get_model_architecture(
     return model_cls, arch
 
 
-def get_model_architecture(
-        model_config: ModelConfig) -> tuple[type[nn.Module], str]:
-    key = hash((
-        model_config.model,
-        model_config.convert_type,
-        model_config.runner_type,
-        model_config.trust_remote_code,
-        model_config.model_impl,
-        tuple(getattr(model_config.hf_config, "architectures", [])),
-    ))
+def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
+    key = hash(
+        (
+            model_config.model,
+            model_config.convert_type,
+            model_config.runner_type,
+            model_config.trust_remote_code,
+            model_config.model_impl,
+            tuple(getattr(model_config.hf_config, "architectures", [])),
+        )
+    )
     if key in _MODEL_ARCH_BY_HASH:
         return _MODEL_ARCH_BY_HASH[key]
 
@@ -243,12 +239,12 @@ def get_architecture_class_name(model_config: ModelConfig) -> str:
 class ParamMapping:
     """
     A class to handle parameter mapping for model weight loading.
-    It creates a bidirectional mapping between packed parameters and their 
+    It creates a bidirectional mapping between packed parameters and their
     constituent parts.
     """
+
     packed_mapping: dict[str, list[str]]
-    inverse_packed_mapping: dict[str, tuple[str,
-                                            int]] = field(default_factory=dict)
+    inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict)
 
     def __post_init__(self):
         for packed_name, sub_params in self.packed_mapping.items():
@@ -261,16 +257,16 @@ class ParamMapping:
                     index,
                 )
 
-    def get_sub_modules(self,
-                        module_name: str) -> Optional[tuple[str, list[str]]]:
+    def get_sub_modules(self, module_name: str) -> tuple[str, list[str]] | None:
         for key, value in self.packed_mapping.items():
             if module_name.endswith(key):
                 return key, value
         return None
 
 
-def configure_quant_config(quant_config: QuantizationConfig,
-                           model_class: type[nn.Module]):
+def configure_quant_config(
+    quant_config: QuantizationConfig, model_class: type[nn.Module]
+):
     """
     Pass packed_modules_mapping by reference to quant_config so that
     quant_config can properly match fused modules
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index cad32fee1d0f6624cb1eb01e63be4d146294ba47..93986e5f2fc0aeb713e52482057cb452e8355e46 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for downloading and initializing model weights."""
+
 import concurrent.futures
 import fnmatch
 import glob
@@ -10,10 +11,10 @@ import os
 import tempfile
 import time
 from collections import defaultdict
-from collections.abc import Generator
+from collections.abc import Callable, Generator
 from contextlib import contextmanager
 from pathlib import Path
-from typing import IO, Any, Callable, Optional, Union
+from typing import IO, Any
 
 import filelock
 import huggingface_hub.constants
@@ -28,18 +29,18 @@ from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import (QuantizationConfig,
-                                                     get_quantization_config)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    get_quantization_config,
+)
 from vllm.platforms import current_platform
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 try:
     from runai_model_streamer import SafetensorsStreamer
 except ImportError:
-    runai_model_streamer = PlaceholderModule(
-        "runai_model_streamer")  # type: ignore[assignment]
-    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
-        "SafetensorsStreamer")
+    runai_model_streamer = PlaceholderModule("runai_model_streamer")  # type: ignore[assignment]
+    SafetensorsStreamer = runai_model_streamer.placeholder_attr("SafetensorsStreamer")
 
 try:
     import gguf
@@ -50,10 +51,11 @@ try:
     from fastsafetensors import SafeTensorsFileLoader, SingleGroup
 except ImportError:
     fastsafetensors = PlaceholderModule("fastsafetensors")
-    SafeTensorsFileLoader = fastsafetensors.placeholder_attr(
-        "SafeTensorsFileLoader")
+    SafeTensorsFileLoader = fastsafetensors.placeholder_attr("SafeTensorsFileLoader")
     SingleGroup = fastsafetensors.placeholder_attr("SingleGroup")
 
+from vllm.model_executor.layers.quantization.torchao import torchao_version_at_least
+
 logger = init_logger(__name__)
 
 # use system-level temp directory for file locks, so that multiple users
@@ -64,12 +66,12 @@ temp_dir = tempfile.gettempdir()
 
 
 def enable_hf_transfer():
-    """automatically activates hf_transfer
-    """
+    """automatically activates hf_transfer"""
     if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
         try:
             # enable hf hub transfer if available
             import hf_transfer  # type: ignore # noqa
+
             huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
         except ImportError:
             pass
@@ -79,13 +81,12 @@ enable_hf_transfer()
 
 
 class DisabledTqdm(tqdm):
-
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs, disable=True)
+        kwargs["disable"] = True
+        super().__init__(*args, **kwargs)
 
 
-def get_lock(model_name_or_path: Union[str, Path],
-             cache_dir: Optional[str] = None):
+def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
     lock_dir = cache_dir or temp_dir
     model_name_or_path = str(model_name_or_path)
     os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
@@ -94,15 +95,14 @@ def get_lock(model_name_or_path: Union[str, Path],
     # add hash to avoid conflict with old users' lock files
     lock_file_name = hash_name + model_name + ".lock"
     # mode 0o666 is required for the filelock to be shared across users
-    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
-                             mode=0o666)
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666)
     return lock
 
 
 @contextmanager
-def atomic_writer(filepath: Union[str, Path],
-                  mode: str = 'w',
-                  encoding: Optional[str] = None) -> Generator[IO]:
+def atomic_writer(
+    filepath: str | Path, mode: str = "w", encoding: str | None = None
+) -> Generator[IO]:
     """
     Context manager that provides an atomic file writing routine.
 
@@ -133,8 +133,8 @@ def atomic_writer(filepath: Union[str, Path],
 
     except Exception:
         logger.exception(
-            "Error during atomic write. Original file '%s' not modified",
-            filepath)
+            "Error during atomic write. Original file '%s' not modified", filepath
+        )
         raise
     finally:
         # Clean up the temporary file if it still exists.
@@ -143,16 +143,16 @@ def atomic_writer(filepath: Union[str, Path],
 
 
 def maybe_download_from_modelscope(
-        model: str,
-        revision: Optional[str] = None,
-        download_dir: Optional[str] = None,
-        ignore_patterns: Optional[Union[str, list[str]]] = None,
-        allow_patterns: Optional[Union[list[str],
-                                       str]] = None) -> Optional[str]:
+    model: str,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+    allow_patterns: list[str] | str | None = None,
+) -> str | None:
     """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
 
-        Returns the path to the downloaded model, or None if the model is not
-        downloaded from ModelScope."""
+    Returns the path to the downloaded model, or None if the model is not
+    downloaded from ModelScope."""
     if envs.VLLM_USE_MODELSCOPE:
         # download model from ModelScope hub,
         # lazy import so that modelscope is not required for normal use.
@@ -226,9 +226,9 @@ def convert_bin_to_safetensor_file(
 
 
 # TODO(woosuk): Move this to other place.
-def get_quant_config(model_config: ModelConfig,
-                     load_config: LoadConfig) -> QuantizationConfig:
-
+def get_quant_config(
+    model_config: ModelConfig, load_config: LoadConfig
+) -> QuantizationConfig:
     quant_cls = get_quantization_config(model_config.quantization)
 
     # GGUF doesn't have config file
@@ -236,27 +236,54 @@ def get_quant_config(model_config: ModelConfig,
         return quant_cls()
 
     # Read the quantization config from the HF model config, if available.
-    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
-                              None)
+    hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
     # some vision model may keep quantization_config in their text_config
     hf_text_config = getattr(model_config.hf_config, "text_config", None)
     if hf_quant_config is None and hf_text_config is not None:
         hf_quant_config = getattr(hf_text_config, "quantization_config", None)
     if hf_quant_config is None:
         # compressed-tensors uses a compressions_config
-        hf_quant_config = getattr(model_config.hf_config, "compression_config",
-                                  None)
+        hf_quant_config = getattr(model_config.hf_config, "compression_config", None)
+
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
+
+    # if hf_quant_config is None, we will try to get config from
+    # hf_overrides
+    hf_overrides = model_config.hf_overrides
+    quantization_config_file = hf_overrides.get("quantization_config_file", None)
+    if quantization_config_file is not None:
+        if hasattr(quant_cls, "from_config_file"):
+            return quant_cls.from_config_file(quantization_config_file)
+        else:
+            raise NotImplementedError(
+                "from_config_file is specified in hf_override config, "
+                "but quant_cls.from_config_file is not implemented in "
+                f"{quant_cls}"
+            )
+    quantization_config_json = hf_overrides.get("quantization_config_dict_json", None)
+    if quantization_config_json is not None:
+        if hasattr(quant_cls, "from_config_dict_json"):
+            return quant_cls.from_config_dict_json(quantization_config_json)
+        else:
+            raise NotImplementedError(
+                "from_config_dict_json is specified in hf_override config, "
+                "but quant_cls.from_config_dict_json is not implemented in "
+                f"{quant_cls}"
+            )
+
     # Inflight BNB quantization
     if model_config.quantization == "bitsandbytes":
         return quant_cls.from_config({})
-    model_name_or_path = maybe_download_from_modelscope(
-        model_config.model,
-        revision=model_config.revision,
-        download_dir=load_config.download_dir,
-        allow_patterns=["*.json"],
-    ) or model_config.model
+    model_name_or_path = (
+        maybe_download_from_modelscope(
+            model_config.model,
+            revision=model_config.revision,
+            download_dir=load_config.download_dir,
+            allow_patterns=["*.json"],
+        )
+        or model_config.model
+    )
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
@@ -281,16 +308,15 @@ def get_quant_config(model_config: ModelConfig,
     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
 
     quant_config_files = [
-        f for f in config_files if any(
-            f.endswith(x) for x in possible_config_filenames)
+        f for f in config_files if any(f.endswith(x) for x in possible_config_filenames)
     ]
     if len(quant_config_files) == 0:
-        raise ValueError(
-            f"Cannot find the config file for {model_config.quantization}")
+        raise ValueError(f"Cannot find the config file for {model_config.quantization}")
     if len(quant_config_files) > 1:
         raise ValueError(
             f"Found multiple config files for {model_config.quantization}: "
-            f"{quant_config_files}")
+            f"{quant_config_files}"
+        )
 
     quant_config_file = quant_config_files[0]
     with open(quant_config_file) as f:
@@ -304,7 +330,8 @@ def get_quant_config(model_config: ModelConfig,
             else:
                 raise ValueError(
                     f"Unsupported quantization config"
-                    f" found for {model_config.quantization} in {f}.")
+                    f" found for {model_config.quantization} in {f}."
+                )
 
     return quant_cls.from_config(config)
 
@@ -344,10 +371,10 @@ def get_sparse_attention_config(
 
 def download_weights_from_hf(
     model_name_or_path: str,
-    cache_dir: Optional[str],
+    cache_dir: str | None,
     allow_patterns: list[str],
-    revision: Optional[str] = None,
-    ignore_patterns: Optional[Union[str, list[str]]] = None,
+    revision: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
 ) -> str:
     """Download model weights from Hugging Face Hub.
 
@@ -373,9 +400,7 @@ def download_weights_from_hf(
         # so we only have to call snapshot_download once.
         try:
             fs = HfFileSystem()
-            file_list = fs.ls(model_name_or_path,
-                              detail=False,
-                              revision=revision)
+            file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
 
             # Use the first pattern found in the HF repo's files.
             for pattern in allow_patterns:
@@ -387,9 +412,12 @@ def download_weights_from_hf(
             logger.warning(
                 "Failed to get file list for '%s'. Trying each pattern in "
                 "allow_patterns individually until weights have been "
-                "downloaded. Error: %s", model_name_or_path, e)
+                "downloaded. Error: %s",
+                model_name_or_path,
+                e,
+            )
 
-    logger.info("Using model weights format %s", allow_patterns)
+    logger.debug("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
     # downloading the same model weights at the same time.
     with get_lock(model_name_or_path, cache_dir):
@@ -410,16 +438,19 @@ def download_weights_from_hf(
                 break
         time_taken = time.perf_counter() - start_time
         if time_taken > 0.5:
-            logger.info("Time spent downloading weights for %s: %.6f seconds",
-                        model_name_or_path, time_taken)
+            logger.info(
+                "Time spent downloading weights for %s: %.6f seconds",
+                model_name_or_path,
+                time_taken,
+            )
     return hf_folder
 
 
 def download_safetensors_index_file_from_hf(
     model_name_or_path: str,
     index_file: str,
-    cache_dir: Optional[str],
-    revision: Optional[str] = None,
+    cache_dir: str | None,
+    revision: str | None = None,
 ) -> None:
     """Download hf safetensors index file from Hugging Face Hub.
 
@@ -455,9 +486,9 @@ def download_safetensors_index_file_from_hf(
 # Passing both of these to the weight loader functionality breaks.
 # So, we use the index_file to
 # look up which safetensors files should be used.
-def filter_duplicate_safetensors_files(hf_weights_files: list[str],
-                                       hf_folder: str,
-                                       index_file: str) -> list[str]:
+def filter_duplicate_safetensors_files(
+    hf_weights_files: list[str], hf_folder: str, index_file: str
+) -> list[str]:
     # model.safetensors.index.json is a mapping from keys in the
     # torch state_dict to safetensors file holding that weight.
     index_file_name = os.path.join(hf_folder, index_file)
@@ -470,17 +501,13 @@ def filter_duplicate_safetensors_files(hf_weights_files: list[str],
         weight_map = json.load(f)["weight_map"]
     weight_files_in_index = set()
     for weight_name in weight_map:
-        weight_files_in_index.add(
-            os.path.join(hf_folder, weight_map[weight_name]))
+        weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name]))
     # Filter out any fields that are not found in the index file.
-    hf_weights_files = [
-        f for f in hf_weights_files if f in weight_files_in_index
-    ]
+    hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index]
     return hf_weights_files
 
 
-def filter_files_not_needed_for_inference(
-        hf_weights_files: list[str]) -> list[str]:
+def filter_files_not_needed_for_inference(hf_weights_files: list[str]) -> list[str]:
     """
     Exclude files that are not needed for inference.
 
@@ -494,8 +521,7 @@ def filter_files_not_needed_for_inference(
         "scaler.pt",
     ]
     hf_weights_files = [
-        f for f in hf_weights_files
-        if not any(f.endswith(x) for x in blacklist)
+        f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist)
     ]
     return hf_weights_files
 
@@ -508,13 +534,14 @@ _BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elap
 
 
 def enable_tqdm(use_tqdm_on_load: bool):
-    return use_tqdm_on_load and (not torch.distributed.is_initialized()
-                                 or torch.distributed.get_rank() == 0)
+    return use_tqdm_on_load and (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
 
 
 def np_cache_weights_iterator(
     model_name_or_path: str,
-    cache_dir: Optional[str],
+    cache_dir: str | None,
     hf_folder: str,
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
@@ -534,14 +561,12 @@ def np_cache_weights_iterator(
         if not os.path.exists(weight_names_file):
             weight_names: list[str] = []
             for bin_file in tqdm(
-                    hf_weights_files,
-                    desc="Loading np_cache checkpoint shards",
-                    disable=not enable_tqdm(use_tqdm_on_load),
-                    bar_format=_BAR_FORMAT,
+                hf_weights_files,
+                desc="Loading np_cache checkpoint shards",
+                disable=not enable_tqdm(use_tqdm_on_load),
+                bar_format=_BAR_FORMAT,
             ):
-                state = torch.load(bin_file,
-                                   map_location="cpu",
-                                   weights_only=True)
+                state = torch.load(bin_file, map_location="cpu", weights_only=True)
                 for name, param in state.items():
                     param_path = os.path.join(np_folder, name)
                     with open(param_path, "wb") as f:
@@ -571,15 +596,32 @@ def safetensors_weights_iterator(
         loading_desc += " (eager)"
 
     for st_file in tqdm(
-            hf_weights_files,
-            desc=loading_desc,
-            disable=not enable_tqdm(use_tqdm_on_load),
-            bar_format=_BAR_FORMAT,
+        hf_weights_files,
+        desc=loading_desc,
+        disable=not enable_tqdm(use_tqdm_on_load),
+        bar_format=_BAR_FORMAT,
     ):
         if safetensors_load_strategy == "eager":
             with open(st_file, "rb") as f:
                 state_dict = load(f.read())
             yield from state_dict.items()
+        elif safetensors_load_strategy == "torchao":
+            if not torchao_version_at_least("0.14.0"):
+                raise ValueError(
+                    "Please use torchao version >= 0.14.0 \
+                        to load torchao safetensors checkpoint"
+                )
+            from torchao.prototype.safetensors.safetensors_support import (
+                unflatten_tensor_state_dict,
+            )
+
+            with safe_open(st_file, framework="pt") as f:
+                state_dict = {}
+                for name in f.keys():  # noqa: SIM118
+                    state_dict[name] = f.get_tensor(name)
+                metadata = f.metadata()
+                updated_state_dict = unflatten_tensor_state_dict(state_dict, metadata)
+            yield from updated_state_dict.items()
         else:
             with safe_open(st_file, framework="pt") as f:
                 for name in f.keys():  # noqa: SIM118
@@ -598,12 +640,8 @@ def multi_thread_safetensors_weights_iterator(
         result = load_file(st_file, device="cpu")
         return result
 
-    with concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers) as executor:
-        futures = [
-            executor.submit(_load_file, st_file)
-            for st_file in hf_weights_files
-        ]
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(_load_file, st_file) for st_file in hf_weights_files]
         futures_iter = tqdm(
             concurrent.futures.as_completed(futures),
             total=len(hf_weights_files),
@@ -620,13 +658,26 @@ def multi_thread_safetensors_weights_iterator(
 def runai_safetensors_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
+    is_distributed: bool = False,
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model safetensor files."""
     with SafetensorsStreamer() as streamer:
-        streamer.stream_files(hf_weights_files)
+        is_cuda_alike = current_platform.is_cuda_alike()
+        device = (
+            f"cuda:{current_platform.current_device()}"
+            if is_distributed and is_cuda_alike
+            else "cpu"
+        )
+
+        streamer.stream_files(
+            hf_weights_files,
+            device=device,
+            is_distributed=is_distributed,
+        )
         total_tensors = sum(
             len(tensors_meta)
-            for tensors_meta in streamer.files_to_tensors_metadata.values())
+            for tensors_meta in streamer.files_to_tensors_metadata.values()
+        )
 
         tensor_iter = tqdm(
             streamer.get_tensors(),
@@ -634,11 +685,25 @@ def runai_safetensors_weights_iterator(
             desc="Loading safetensors using Runai Model Streamer",
             bar_format=_BAR_FORMAT,
             disable=not enable_tqdm(use_tqdm_on_load),
+            mininterval=2,
         )
 
         yield from tensor_iter
 
 
+def _init_loader(
+    pg: torch.distributed.ProcessGroup,
+    device: torch.device,
+    f_list: list[str],
+    *,
+    nogds: bool = False,
+):
+    loader = SafeTensorsFileLoader(pg, device, nogds=nogds)
+    rank_file_map = {i: [f] for i, f in enumerate(f_list)}
+    loader.add_filenames(rank_file_map)
+    return loader
+
+
 def fastsafetensors_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
@@ -650,23 +715,37 @@ def fastsafetensors_weights_iterator(
     else:
         pg = SingleGroup()
 
-    device = torch.device(f'cuda:{pg.rank()}')
+    device = torch.device(f"cuda:{pg.rank()}")
     weight_files_sub_lists = [
-        hf_weights_files[i:i + pg.size()]
+        hf_weights_files[i : i + pg.size()]
         for i in range(0, len(hf_weights_files), pg.size())
     ]
 
+    nogds = False
+
     for f_list in tqdm(
-            weight_files_sub_lists,
-            desc="Loading safetensors using Fastsafetensor loader",
-            disable=not enable_tqdm(use_tqdm_on_load),
-            bar_format=_BAR_FORMAT,
+        weight_files_sub_lists,
+        desc="Loading safetensors using Fastsafetensor loader",
+        disable=not enable_tqdm(use_tqdm_on_load),
+        bar_format=_BAR_FORMAT,
     ):
-        loader = SafeTensorsFileLoader(pg, device)
-        rank_file_map = {i: [f] for i, f in enumerate(f_list)}
-        loader.add_filenames(rank_file_map)
+        loader = _init_loader(pg, device, f_list, nogds=nogds)
         try:
-            fb = loader.copy_files_to_device()
+            try:
+                fb = loader.copy_files_to_device()
+            except RuntimeError as e:
+                if "gds" not in str(e):
+                    raise
+
+                loader.close()
+                nogds = True
+                logger.warning_once(
+                    "GDS not enabled, setting `nogds=True`.\n"
+                    "For more information, see: https://github.com/foundation-model-stack/fastsafetensors?tab=readme-ov-file#basic-api-usages"
+                )
+                loader = _init_loader(pg, device, f_list, nogds=nogds)
+                fb = loader.copy_files_to_device()
+
             try:
                 keys = list(fb.key_to_rank_lidx.keys())
                 for k in keys:
@@ -681,18 +760,18 @@ def fastsafetensors_weights_iterator(
 def pt_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu",
+    pt_load_map_location: str | dict[str, str] = "cpu",
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model bin/pt files."""
     for bin_file in tqdm(
-            hf_weights_files,
-            desc="Loading pt checkpoint shards",
-            disable=not enable_tqdm(use_tqdm_on_load),
-            bar_format=_BAR_FORMAT,
+        hf_weights_files,
+        desc="Loading pt checkpoint shards",
+        disable=not enable_tqdm(use_tqdm_on_load),
+        bar_format=_BAR_FORMAT,
     ):
-        state = torch.load(bin_file,
-                           map_location=pt_load_map_location,
-                           weights_only=True)
+        state = torch.load(
+            bin_file, map_location=pt_load_map_location, weights_only=True
+        )
         yield from state.items()
         del state
 
@@ -700,21 +779,19 @@ def pt_weights_iterator(
 def multi_thread_pt_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu",
+    pt_load_map_location: str | dict[str, str] = "cpu",
     max_workers: int = 4,
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Multi-Thread iterate over the weights in the model bin/pt files."""
 
     def _load_file(bin_file: str):
-        return torch.load(bin_file,
-                          map_location=pt_load_map_location,
-                          weights_only=True)
+        return torch.load(
+            bin_file, map_location=pt_load_map_location, weights_only=True
+        )
 
-    with concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = [
-            executor.submit(_load_file, bin_file)
-            for bin_file in hf_weights_files
+            executor.submit(_load_file, bin_file) for bin_file in hf_weights_files
         ]
         futures_iter = tqdm(
             concurrent.futures.as_completed(futures),
@@ -731,7 +808,8 @@ def multi_thread_pt_weights_iterator(
 
 
 def get_gguf_extra_tensor_names(
-        gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]:
+    gguf_file: str, gguf_to_hf_name_map: dict[str, str]
+) -> list[str]:
     reader = gguf.GGUFReader(gguf_file)
     expected_gguf_keys = set(gguf_to_hf_name_map.keys())
     exact_gguf_keys = set([tensor.name for tensor in reader.tensors])
@@ -740,14 +818,16 @@ def get_gguf_extra_tensor_names(
 
 
 def get_gguf_weight_type_map(
-        gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> dict[str, str]:
+    gguf_file: str, gguf_to_hf_name_map: dict[str, str]
+) -> dict[str, str]:
     """
     Return GGUF mapped weight's name and its quant type
     """
     reader = gguf.GGUFReader(gguf_file)
     return {
         gguf_to_hf_name_map[tensor.name]: tensor.tensor_type.name
-        for tensor in reader.tensors if tensor.name in gguf_to_hf_name_map
+        for tensor in reader.tensors
+        if tensor.name in gguf_to_hf_name_map
     }
 
 
@@ -797,8 +877,7 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
     return x
 
 
-def default_weight_loader(param: torch.Tensor,
-                          loaded_weight: torch.Tensor) -> None:
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
     """Default weight loader."""
     try:
         if param.numel() == 1 and loaded_weight.numel() == 1:
@@ -809,7 +888,8 @@ def default_weight_loader(param: torch.Tensor,
         else:
             assert param.size() == loaded_weight.size(), (
                 f"Attempted to load weight ({loaded_weight.size()}) "
-                f"into parameter ({param.size()})")
+                f"into parameter ({param.size()})"
+            )
 
             param.data.copy_(loaded_weight)
     except Exception:
@@ -818,8 +898,9 @@ def default_weight_loader(param: torch.Tensor,
         raise
 
 
-def row_parallel_weight_loader(param: torch.Tensor,
-                               loaded_weight: torch.Tensor) -> None:
+def row_parallel_weight_loader(
+    param: torch.Tensor, loaded_weight: torch.Tensor
+) -> None:
     """Load weights that are row-parallelized."""
     tp_rank = get_tensor_model_parallel_rank()
     shard_dim = 0 if param.dim() != 1 else None
@@ -851,12 +932,11 @@ def sharded_weight_loader(shard_axis: int) -> LoaderFunction:
 
 
 def composed_weight_loader(
-        loader: LoaderFunction, fn: Callable[[torch.Tensor],
-                                             torch.Tensor]) -> LoaderFunction:
+    loader: LoaderFunction, fn: Callable[[torch.Tensor], torch.Tensor]
+) -> LoaderFunction:
     """Create a weight loader that post-processes the weights after loading"""
 
-    def composed_loader(param: torch.Tensor,
-                        loaded_weight: torch.Tensor) -> None:
+    def composed_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
         loader(param, loaded_weight)
         param.data.copy_(fn(param))
         return
@@ -892,13 +972,18 @@ def initialize_dummy_weights(
                 # from a CPU tensor.
                 # Note: We avoid using torch.rank_like as it doesn't currently
                 # support the generator argument.
-                param.copy_((high - low) *
-                            torch.rand(param.shape,
-                                       generator=generator,
-                                       dtype=param.dtype,
-                                       layout=param.layout,
-                                       requires_grad=param.requires_grad,
-                                       device="cpu") + low)
+                param.copy_(
+                    (high - low)
+                    * torch.rand(
+                        param.shape,
+                        generator=generator,
+                        dtype=param.dtype,
+                        layout=param.layout,
+                        requires_grad=param.requires_grad,
+                        device="cpu",
+                    )
+                    + low
+                )
                 torch._sync(param)
                 continue
 
@@ -908,14 +993,13 @@ def initialize_dummy_weights(
                 # uniform_ doesn't support < 16-bit datatypes (FP8)
                 dtype = param.data.dtype
                 tmp_param = param.data.to(torch.float16)
-                tmp_param = tmp_param.uniform_(low, high,
-                                               generator=generator).to(dtype)
+                tmp_param = tmp_param.uniform_(low, high, generator=generator).to(dtype)
                 param.data.copy_(tmp_param)
             else:
                 param.uniform_(low, high, generator=generator)
 
 
-def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
+def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
     """Remap the name of FP8 k/v_scale parameters.
 
     This function handles the remapping of FP8 k/v_scale parameter names.
@@ -938,7 +1022,8 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
             "This format is deprecated in favor of separate k_scale and "
             "v_scale tensors and will be removed in a future release. "
             "Functionally, we will remap kv_scale to k_scale and duplicate "
-            "k_scale to v_scale")
+            "k_scale to v_scale"
+        )
         # NOTE: we remap the deprecated kv_scale to k_scale
         remapped_name = name.replace(".kv_scale", ".attn.k_scale")
         if remapped_name not in params_dict:
@@ -950,19 +1035,28 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
             return None
         return remapped_name
 
+    if any("mla_attn" in key for key in params_dict):
+        attn_str = "mla_attn.mla_attn"
+        logger.debug_once(
+            f"Found mla_attn with k_scale and v_scale in "
+            f"the checkpoint, using {attn_str} as attn_str"
+        )
+    else:
+        attn_str = "attn"
     # Define scale name mapping patterns in order of precedence
     scale_mapping_patterns = [
         # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale ->
         # .self_attn.attn.{k,v}_scale
-        (r"\.self_attn\.([kv])_proj\.([kv])_scale$",
-         r".self_attn.attn.\2_scale"),
+        (
+            r"\.self_attn\.([kv])_proj\.([kv])_scale$",
+            rf".self_attn.{attn_str}.\2_scale",
+        ),
         # QKV proj format: .self_attn.qkv_proj.{k,v}_scale ->
         # .self_attn.attn.{k,v}_scale
         (r"\.self_attn\.qkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"),
         # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
         # .self_attn.attn.{k,v}_scale
-        (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"
-         ),
+        (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"),
         # Default format: .{k,v}_scale -> .attn.{k,v}_scale
         (r"\.([kv])_scale$", r".attn.\1_scale"),
     ]
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 4ccba64f2c110b8370306da95b27c38bab9cdd3c..9f8dd042bf83a7e5062730acdfa7d800a3ca1785 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,13 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .interfaces import (HasInnerState, SupportsLoRA, SupportsMRoPE,
-                         SupportsMultiModal, SupportsPP, SupportsTranscription,
-                         SupportsV0Only, has_inner_state, supports_lora,
-                         supports_mrope, supports_multimodal, supports_pp,
-                         supports_transcription, supports_v0_only)
-from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration,
-                              is_pooling_model, is_text_generation_model)
+from .interfaces import (
+    HasInnerState,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsTranscription,
+    has_inner_state,
+    supports_lora,
+    supports_mrope,
+    supports_multimodal,
+    supports_pp,
+    supports_transcription,
+)
+from .interfaces_base import (
+    VllmModelForPooling,
+    VllmModelForTextGeneration,
+    is_pooling_model,
+    is_text_generation_model,
+)
 from .registry import ModelRegistry
 
 __all__ = [
@@ -28,6 +41,4 @@ __all__ = [
     "supports_pp",
     "SupportsTranscription",
     "supports_transcription",
-    "SupportsV0Only",
-    "supports_v0_only",
 ]
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index c4328a176a5def04177c11a1948e53cb71ae76e1..a9cc49451a1d3b2fb6a3633301945fd187311a1c 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -4,7 +4,7 @@
 import ast
 import inspect
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 import torch
 import torch.nn as nn
@@ -13,8 +13,10 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.models.config import VerifyAndUpdateConfig
-from vllm.transformers_utils.config import (get_hf_file_bytes,
-                                            get_hf_file_to_dict)
+from vllm.transformers_utils.config import (
+    get_hf_file_bytes,
+    try_get_dense_modules,
+)
 
 from .interfaces_base import VllmModelForPooling, is_pooling_model
 
@@ -33,43 +35,28 @@ _GENERATE_SUFFIXES = [
 ]
 
 
-def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]:
+def _load_st_projector(model_config: "ModelConfig") -> nn.Module | None:
     """Load Sentence-Transformers Dense projection layers."""
 
-    try:
-        modules = get_hf_file_to_dict("modules.json", model_config.model,
-                                      model_config.revision)
-        if not modules:
-            return None
-
-        if isinstance(modules, dict):
-            modules = modules.get("modules", [])
+    dense_modules = try_get_dense_modules(
+        model_config.model, revision=model_config.revision
+    )
 
-        dense_modules = [
-            m for m in modules
-            if m.get("type") == "sentence_transformers.models.Dense"
-        ]
-        if not dense_modules:
-            return None
+    if dense_modules is None:
+        return
 
+    try:
         layers = []
-        for module in dense_modules:
-            folder = module.get("path", "")
-
-            config_path = f"{folder}/config.json" if folder else "config.json"
-            layer_config = get_hf_file_to_dict(config_path, model_config.model,
-                                               model_config.revision)
-            if not layer_config:
-                continue
-
-            linear = nn.Linear(layer_config.get("in_features", 768),
-                               layer_config.get("out_features", 768),
-                               bias=layer_config.get("bias", True),
-                               dtype=model_config.head_dtype)
-
+        for layer_config in dense_modules:
+            folder = layer_config["folder"]
+            linear = nn.Linear(
+                layer_config["in_features"],
+                layer_config["out_features"],
+                bias=layer_config.get("bias", True),
+                dtype=model_config.head_dtype,
+            )
             if not _load_dense_weights(linear, folder, model_config):
                 continue
-
             layers.append(linear)
             if act_name := layer_config.get("activation_function"):
                 layers.append(get_act_fn(act_name))
@@ -80,40 +67,45 @@ def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]:
     return None
 
 
-def _load_dense_weights(linear: nn.Linear, folder: str,
-                        model_config: "ModelConfig") -> bool:
+def _load_dense_weights(
+    linear: nn.Linear, folder: str, model_config: "ModelConfig"
+) -> bool:
     """Load weights using vLLM's weight_loader pattern."""
-    from vllm.model_executor.model_loader.weight_utils import (
-        default_weight_loader)
+    from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
     for filename in ["model.safetensors", "pytorch_model.bin"]:
         file_path = f"{folder}/{filename}" if folder else filename
 
         try:
-            file_bytes = get_hf_file_bytes(file_path, model_config.model,
-                                           model_config.revision)
+            file_bytes = get_hf_file_bytes(
+                file_path, model_config.model, model_config.revision
+            )
             if not file_bytes:
                 continue
 
             if filename.endswith(".safetensors"):
                 from safetensors.torch import load as load_safetensors
+
                 state_dict = load_safetensors(file_bytes)
             else:
                 import io
-                state_dict = torch.load(io.BytesIO(file_bytes),
-                                        map_location="cpu",
-                                        weights_only=True)
+
+                state_dict = torch.load(
+                    io.BytesIO(file_bytes), map_location="cpu", weights_only=True
+                )
 
             for weight_key in ["weight", "linear.weight", "dense.weight"]:
                 if weight_key in state_dict:
-                    weight_loader = getattr(linear.weight, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        linear.weight, "weight_loader", default_weight_loader
+                    )
                     weight_loader(linear.weight, state_dict[weight_key])
 
                     bias_key = weight_key.replace("weight", "bias")
                     if linear.bias is not None and bias_key in state_dict:
-                        bias_loader = getattr(linear.bias, "weight_loader",
-                                              default_weight_loader)
+                        bias_loader = getattr(
+                            linear.bias, "weight_loader", default_weight_loader
+                        )
                         bias_loader(linear.bias, state_dict[bias_key])
                     return True
         except Exception:
@@ -133,9 +125,7 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
 
 
 def try_create_mm_pooling_model_cls(orig_cls: _T) -> _T:
-
     class CallVisitor(ast.NodeVisitor):
-
         def __init__(self):
             self.calls = []
 
@@ -150,7 +140,6 @@ def try_create_mm_pooling_model_cls(orig_cls: _T) -> _T:
         return None
 
     class ModelForPooling(orig_cls, VllmModelForPooling):
-
         is_pooling_model = True
 
         def __init__(
@@ -172,7 +161,6 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T:
     from .utils import AutoWeightsLoader, WeightsMapper
 
     class ModelForPooling(orig_cls, VllmModelForPooling):
-
         is_pooling_model = True
 
         def __init__(
@@ -198,12 +186,21 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T:
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             raise NotImplementedError
 
-        def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        def load_weights(
+            self,
+            weights: Iterable[tuple[str, torch.Tensor]],
+            load_lm_head: bool = False,
+        ):
             # TODO: Support uninitialized params tracking
 
-            # We have deleted this attribute, so don't load it
-            weights = ((name, data) for name, data in weights
-                       if not name.startswith("lm_head."))
+            # For most pooling models: We have deleted this attribute, so don't load it.
+            # For converting an LLM into a seq cls model, we need the lm_head.
+            if not load_lm_head:
+                weights = (
+                    (name, data)
+                    for name, data in weights
+                    if not name.startswith("lm_head.")
+                )
 
             # If `*ForCausalLM` defines `load_weights` on the inner model
             # and there are no other inner modules with parameters,
@@ -212,7 +209,8 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T:
                 # Whether only `self.model` contains parameters
                 model_is_only_param = all(
                     name == "model" or next(child.parameters(), None) is None
-                    for name, child in self.named_children())
+                    for name, child in self.named_children()
+                )
 
                 if model_is_only_param:
                     mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
@@ -252,19 +250,18 @@ def as_embedding_model(cls: _T) -> _T:
     from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 
     class ModelForEmbedding(_create_pooling_model_cls(cls)):
-
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             pooler_config = vllm_config.model_config.pooler_config
             assert pooler_config is not None
 
             self.pooler = DispatchPooler(
                 {
-                    "encode": Pooler.for_encode(pooler_config),
+                    "token_embed": Pooler.for_token_embed(pooler_config),
                     "embed": Pooler.for_embed(pooler_config),
-                }, )
+                },
+            )
 
-    ModelForEmbedding.__name__ = \
-        _get_pooling_model_name(cls.__name__, "ForEmbedding")
+    ModelForEmbedding.__name__ = _get_pooling_model_name(cls.__name__, "ForEmbedding")
 
     return ModelForEmbedding  # type: ignore
 
@@ -287,74 +284,53 @@ def as_seq_cls_model(cls: _T) -> _T:
 
     # Lazy import
     from vllm.model_executor.layers.linear import ReplicatedLinear
-    from vllm.model_executor.layers.pooler import (ClassifierPooler,
-                                                   DispatchPooler, Pooler,
-                                                   PoolingMethod, PoolingType)
+    from vllm.model_executor.layers.pooler import (
+        DispatchPooler,
+        Pooler,
+    )
     from vllm.model_executor.models.interfaces import SupportsCrossEncoding
-    from vllm.sequence import IntermediateTensors
-
-    from .utils import get_model_hidden_size, maybe_prefix
 
-    class ModelForSequenceClassification(_create_pooling_model_cls(cls),
-                                         SupportsCrossEncoding):
+    from .utils import maybe_prefix
 
+    class ModelForSequenceClassification(
+        _create_pooling_model_cls(cls), SupportsCrossEncoding
+    ):
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
-            config = vllm_config.model_config.hf_config
+            text_config = vllm_config.model_config.hf_config.get_text_config()
+            model_config = vllm_config.model_config
             quant_config = vllm_config.quant_config
-            hidden_size = get_model_hidden_size(config)
 
             self.score = ReplicatedLinear(
-                hidden_size,
-                config.num_labels,
+                model_config.hidden_size,
+                text_config.num_labels,
                 bias=False,
-                params_dtype=torch.float32,
+                params_dtype=vllm_config.model_config.head_dtype,
                 quant_config=quant_config,
+                return_bias=False,
                 prefix=maybe_prefix(prefix, "score"),
             )
 
             pooler_config = vllm_config.model_config.pooler_config
             assert pooler_config is not None
 
-            pooling_type_str = pooler_config.pooling_type
-            assert pooling_type_str is not None
-            pooling_type = PoolingType[pooling_type_str]
-
-            self.pooler = DispatchPooler({
-                "encode":
-                Pooler.for_encode(pooler_config),
-                "classify":
-                ClassifierPooler(
-                    pooling=PoolingMethod.from_pooling_type(pooling_type),
-                    classifier=self._classifier,
-                    act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                        vllm_config.model_config),
-                ),
-                "score":
-                ClassifierPooler(
-                    pooling=PoolingMethod.from_pooling_type(pooling_type),
-                    classifier=self._classifier,
-                    act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                        vllm_config.model_config),
-                ),
-            })
-
-        def _classifier(self, x: torch.Tensor):
-            x, _ = self.score(x.float())
-            return x
-
-        def forward(
-            self,
-            input_ids: torch.Tensor,
-            positions: torch.Tensor,
-            intermediate_tensors: Optional[IntermediateTensors] = None,
-            inputs_embeds: Optional[torch.Tensor] = None,
-        ) -> torch.Tensor:
-            return super().forward(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+            self.pooler = DispatchPooler(
+                {
+                    "token_classify": Pooler.for_token_classify(
+                        pooler_config, classifier=self.score
+                    ),
+                    "classify": Pooler.for_classify(
+                        pooler_config, classifier=self.score, act_fn="classify"
+                    ),
+                    "score": Pooler.for_classify(
+                        pooler_config, classifier=self.score, act_fn="score"
+                    ),
+                }
+            )
 
         def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-            tokens = getattr(self.config, "classifier_from_token", None)
-            method = getattr(self.config, "method", None)
+            text_config = self.config.get_text_config()
+            tokens = getattr(text_config, "classifier_from_token", None)
+            method = getattr(text_config, "method", None)
 
             if tokens is None and method is None:
                 return super().load_weights(weights)
@@ -363,9 +339,9 @@ def as_seq_cls_model(cls: _T) -> _T:
                 # ForSequenceClassification model.
                 return seq_cls_model_loader(self, weights)
 
-
-    ModelForSequenceClassification.__name__ = \
-        _get_pooling_model_name(cls.__name__, "ForSequenceClassification")
+    ModelForSequenceClassification.__name__ = _get_pooling_model_name(
+        cls.__name__, "ForSequenceClassification"
+    )
 
     return ModelForSequenceClassification  # type: ignore
 
@@ -387,28 +363,33 @@ def as_reward_model(cls: _T) -> _T:
     # Lazy import
     from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 
-    class ModelForReward(_create_pooling_model_cls(cls)):
+    from .interfaces_base import default_pooling_type
 
+    @default_pooling_type("ALL")
+    class ModelForReward(_create_pooling_model_cls(cls)):
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             pooler_config = vllm_config.model_config.pooler_config
             assert pooler_config is not None
 
             self.pooler = DispatchPooler(
-                {"encode": Pooler.for_encode(pooler_config)}, )
+                {
+                    "token_classify": Pooler.for_token_classify(
+                        pooler_config=pooler_config
+                    )
+                }
+            )
 
-    ModelForReward.__name__ = \
-        _get_pooling_model_name(cls.__name__, "ForReward")
+    ModelForReward.__name__ = _get_pooling_model_name(cls.__name__, "ForReward")
 
     return ModelForReward  # type: ignore
 
 
 class SequenceClassificationConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
-        method = getattr(config, "method", None)
-        tokens = getattr(config, "classifier_from_token", None)
+        text_config = vllm_config.model_config.hf_config.get_text_config()
+        method = getattr(text_config, "method", None)
+        tokens = getattr(text_config, "classifier_from_token", None)
 
         if method is None:
             return
@@ -418,52 +399,65 @@ class SequenceClassificationConfig(VerifyAndUpdateConfig):
 
         if method == "from_2_way_softmax":
             assert len(tokens) == 2
-            config.num_labels = 1
+            text_config.num_labels = 1
         else:
-            config.num_labels = len(tokens)
+            text_config.num_labels = len(tokens)
 
         # `llm as reranker` defaults to not using pad_token
-        use_pad_token = getattr(config, "use_pad_token", False)
-        config.use_pad_token = use_pad_token
+        use_pad_token = getattr(text_config, "use_pad_token", False)
+        text_config.use_pad_token = use_pad_token
 
 
 def load_weights_using_from_2_way_softmax(
-        model, weights: Iterable[tuple[str, torch.Tensor]]):
+    model, weights: Iterable[tuple[str, torch.Tensor]]
+):
     # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
-    from vllm.model_executor.layers.vocab_parallel_embedding import (
-        ParallelLMHead)
-    from vllm.model_executor.model_loader.weight_utils import (
-        default_weight_loader)
-    from vllm.model_executor.models.utils import AutoWeightsLoader
+    from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+    from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
     model_config = model.vllm_config.model_config
+    quant_config = model.vllm_config.quant_config
+    text_config = model.config.get_text_config()
 
-    tokens = getattr(model.config, "classifier_from_token", [])
+    tokens = getattr(text_config, "classifier_from_token", [])
     tokens = cast(list[int], tokens)
     assert len(tokens) == 2
 
-    if model.config.tie_word_embeddings:
-        model.lm_head = model.model.embed_tokens
-    else:
-        quant_config = model.vllm_config.quant_config
-        model.lm_head = ParallelLMHead(model.config.vocab_size,
-                                       model.config.hidden_size,
-                                       quant_config=quant_config)
-
-    loader = AutoWeightsLoader(model)
-    loaded_weights = loader.load_weights(weights)
+    model.lm_head = ParallelLMHead(
+        text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
+    )
+    if text_config.tie_word_embeddings:
+        # embed_tokens is the assumed name for input embeddings. If the model does not
+        # have this attribute, we fallback to get_input_embeddings(), which is used by
+        # the Transformers modeling backend.
+        embed_tokens = (
+            model.model.embed_tokens
+            if hasattr(model.model, "embed_tokens")
+            else model.model.get_input_embeddings()
+        )
+        model.lm_head = model.lm_head.tie_weights(embed_tokens)
+
+    # ModelForPooling is dynamically defined inside the _create_pooling_model_cls
+    # function, so we need use this hacky method to obtain it.
+    pooling_model_cls = next(
+        x for x in type(model).__mro__ if x.__name__ == "ModelForPooling"
+    )
+    loaded_weights = pooling_model_cls.load_weights(model, weights, load_lm_head=True)
 
     from vllm.transformers_utils.tokenizer import get_tokenizer
-    tokenizer = get_tokenizer(model_config.tokenizer,
-                              revision=model_config.tokenizer_revision,
-                              tokenizer_mode=model_config.tokenizer_mode,
-                              trust_remote_code=model_config.trust_remote_code)
+
+    tokenizer = get_tokenizer(
+        model_config.tokenizer,
+        revision=model_config.tokenizer_revision,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
+    )
 
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
     true_id = tokenizer.convert_tokens_to_ids(tokens[1])
     score_weight = model.lm_head.weight.data[[true_id]].to(
-        torch.float32) - model.lm_head.weight.data[[false_id]].to(
-            torch.float32)
+        torch.float32
+    ) - model.lm_head.weight.data[[false_id]].to(torch.float32)
 
     param = model.score.weight
     weight_loader = getattr(param, "weight_loader", default_weight_loader)
@@ -475,36 +469,43 @@ def load_weights_using_from_2_way_softmax(
     return loaded_weights
 
 
-def load_weights_no_post_processing(model,
-                                    weights: Iterable[tuple[str,
-                                                            torch.Tensor]]):
-    from vllm.model_executor.layers.vocab_parallel_embedding import (
-        ParallelLMHead)
-    from vllm.model_executor.model_loader.weight_utils import (
-        default_weight_loader)
-    from vllm.model_executor.models.utils import AutoWeightsLoader
+def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Tensor]]):
+    from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+    from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
     model_config = model.vllm_config.model_config
-    tokens = getattr(model.config, "classifier_from_token", [])
+    quant_config = model.vllm_config.quant_config
+    text_config = model.config.get_text_config()
+
+    tokens = getattr(text_config, "classifier_from_token", [])
     tokens = cast(list[int], tokens)
     assert len(tokens) > 0
 
-    if model.config.tie_word_embeddings:
-        model.lm_head = model.model.embed_tokens
-    else:
-        quant_config = model.vllm_config.quant_config
-        model.lm_head = ParallelLMHead(model.config.vocab_size,
-                                       model.config.hidden_size,
-                                       quant_config=quant_config)
-
-    loader = AutoWeightsLoader(model)
-    loaded_weights = loader.load_weights(weights)
+    model.lm_head = ParallelLMHead(
+        text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
+    )
+    if text_config.tie_word_embeddings:
+        # embed_tokens is the assumed name for input embeddings. If the model does not
+        # have this attribute, we fallback to get_input_embeddings(), which is used by
+        # the Transformers modeling backend.
+        embed_tokens = (
+            model.model.embed_tokens
+            if hasattr(model.model, "embed_tokens")
+            else model.model.get_input_embeddings()
+        )
+        model.lm_head = model.lm_head.tie_weights(embed_tokens)
+
+    # Skip ModelForSequenceClassification in MRO to avoid infinite recursion
+    loaded_weights = type(model).__mro__[1].load_weights(model, weights)
 
     from vllm.transformers_utils.tokenizer import get_tokenizer
-    tokenizer = get_tokenizer(model_config.tokenizer,
-                              revision=model_config.tokenizer_revision,
-                              tokenizer_mode=model_config.tokenizer_mode,
-                              trust_remote_code=model_config.trust_remote_code)
+
+    tokenizer = get_tokenizer(
+        model_config.tokenizer,
+        revision=model_config.tokenizer_revision,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
+    )
 
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
     score_weight = model.lm_head.weight.data[token_ids]
@@ -536,7 +537,7 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]):
     #   - GemmaForCausalLM
     #     - bge-reranker-v2-gemma
 
-    config = model.vllm_config.model_config.hf_config
-    method = getattr(config, "method", None)
+    text_config = model.vllm_config.model_config.hf_config.get_text_config()
+    method = getattr(text_config, "method", None)
     assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported"
     return SEQ_CLS_LOAD_METHODS[method](model, weights)
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f654f47495f77aaa6ed6fc5711fac3aa6ff4f46
--- /dev/null
+++ b/vllm/model_executor/models/afmoe.py
@@ -0,0 +1,711 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only AfMoE model compatible with HuggingFace weights."""
+
+import typing
+from collections.abc import Callable, Iterable
+from itertools import islice
+from typing import Any
+
+import torch
+from torch import nn
+
+from vllm.attention import Attention, AttentionType
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
+from vllm.model_executor.models.llama import LlamaMLP as AfmoeMLP
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    WeightsMapper,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+from vllm.sequence import IntermediateTensors
+
+logger = init_logger(__name__)
+
+
+class AfmoeMoE(nn.Module):
+    def __init__(
+        self,
+        config,  # AfmoeConfig
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.route_scale = config.route_scale
+        self.score_func = config.score_func
+        self.route_norm = config.route_norm
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.num_experts
+        self.n_shared_experts: int = config.num_shared_experts
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        # Router gate
+        self.gate = nn.Linear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            dtype=torch.float32,
+        )
+        self.expert_bias = nn.Parameter(
+            torch.empty(config.num_experts, dtype=torch.float32)
+        )
+
+        # Load balancing settings
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        self.shared_experts = None
+        # Shared experts
+        if config.num_shared_experts > 0:
+            intermediate_size = config.moe_intermediate_size * config.num_shared_experts
+            self.shared_experts = AfmoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+
+        # Routed experts using SharedFusedMoE
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=self.route_norm if self.score_func == "sigmoid" else False,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func=self.score_func,
+            routed_scaling_factor=self.route_scale,
+            e_score_correction_bias=self.expert_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits = self.gate(hidden_states.to(dtype=torch.float32))
+
+        fused_moe_out = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        if self.shared_experts is not None:
+            shared_output, final_hidden_states = fused_moe_out
+            final_hidden_states = final_hidden_states + shared_output
+        else:
+            final_hidden_states = fused_moe_out
+        if self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class AfmoeAttention(nn.Module):
+    def __init__(
+        self,
+        config,  # AfmoeConfig
+        layer_idx: int,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 131072,
+        head_dim: int | None = None,
+        rms_norm_eps: float = 1e-05,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        # Check if this is a local attention layer
+        self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention"
+        self.sliding_window = config.sliding_window if self.is_local_attention else None
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Gating projection
+        self.gate_proj = ColumnParallelLinear(
+            hidden_size,
+            self.total_num_heads * self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_proj",
+        )
+
+        # Q/K normalization
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        # Only create rotary embeddings for local attention
+        if self.is_local_attention:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=rope_theta,
+                rope_scaling=rope_scaling,
+                is_neox_style=True,
+            )
+        else:
+            self.rotary_emb = None
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=self.sliding_window,
+            prefix=f"{prefix}.attn",
+            attn_type=attn_type,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        gate, _ = self.gate_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # Apply Q/K normalization
+        q = self.q_norm(q.reshape(-1, self.num_heads, self.head_dim)).reshape(q.shape)
+        k = self.k_norm(k.reshape(-1, self.num_kv_heads, self.head_dim)).reshape(
+            k.shape
+        )
+
+        # Apply rotary embeddings only for local attention
+        if self.is_local_attention and self.rotary_emb is not None:
+            q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v)
+
+        # Apply gating
+        attn_output = attn_output * torch.sigmoid(gate)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class AfmoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,  # AfmoeConfig
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
+
+        # DecoderLayers are created with `make_layers` which passes the prefix
+        # with the layer's index.
+        self.layer_idx = extract_layer_index(prefix)
+
+        self.self_attn = AfmoeAttention(
+            config=config,
+            layer_idx=self.layer_idx,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=config.head_dim,
+            rms_norm_eps=config.rms_norm_eps,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        # MoE or dense FFN
+        self.moe_enabled = self.layer_idx >= config.num_dense_layers
+        if self.moe_enabled:
+            self.mlp = AfmoeMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
+            )
+        else:
+            self.mlp = AfmoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)  # attn norm b
+
+        # Fully Connected
+        hidden_states, residual = self.pre_mlp_layernorm(  # ffn norm a
+            hidden_states, residual
+        )
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)  # ffn norm b
+
+        return hidden_states, residual
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
+class AfmoeModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        enable_eplb = vllm_config.parallel_config.enable_eplb
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+        self.mup_enabled = config.mup_enabled
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size, config.hidden_size, prefix=f"{prefix}.embed_tokens"
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: AfmoeDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                enable_eplb=enable_eplb,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+
+            # Apply muP input scaling if enabled
+            if self.mup_enabled:
+                hidden_states = hidden_states * (self.config.hidden_size**0.5)
+
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def make_empty_intermediate_tensors(
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+                "residual": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return SharedFusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if (weight_name not in name) or ("self_attn.gate_proj" in name):
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+
+                    # Do not modify `name` since the loop may continue here
+                    # Instead, create a new variable
+                    name_mapped = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name_mapped, self):
+                        continue
+
+                    param = params_dict[name_mapped]
+                    # We should ask the weight loader to return success or not
+                    # here since otherwise we may skip experts with other
+                    # available replicas.
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        name = name_mapped
+                        break
+                else:
+                    if is_expert_weight:
+                        # We've checked that this is an expert weight
+                        # However it's not mapped locally to this rank
+                        # So we simply skip it
+                        continue
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        return loaded_params
+
+
+class AfmoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_suffix={
+            ".router.gate.weight": ".gate.weight",
+        },
+    )
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = AfmoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=quant_config
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+        self.expert_weights = []
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = config.num_hidden_layers - config.num_dense_layers
+        self.num_expert_groups = config.n_group
+
+        self.moe_layers: list[SharedFusedMoE] = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, AfmoeDecoderLayer)
+            if layer.moe_enabled:
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None and self.num_moe_layers > 0:
+            raise RuntimeError("No AfmoeMoE layer found in model.layers.")
+
+        if example_moe is not None:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ) -> None:
+        for layer_idx, layer in enumerate(self.moe_layers):
+            # Register the expert weights.
+            self.expert_weights.append(layer.get_expert_weights())
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index b13d863ebb744e7cf407ed3a09dd8ebe3b9beec8..5872e8196eada2b48f94b5eaf5050532f134697c 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -4,7 +4,6 @@
 # A modified implementation of the AIMv2 Transformer
 # inserted here also the image tokenizer used by Ovis2
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -14,19 +13,20 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.utils import divide
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
 class AIMv2SwiGLUFFN(nn.Module):
-
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(
+        self, config: AIMv2Config, quant_config: QuantizationConfig, prefix: str
+    ):
         super().__init__()
         hidden_features = config.intermediate_size
         in_features = config.hidden_size
@@ -56,7 +56,6 @@ class AIMv2SwiGLUFFN(nn.Module):
 
 
 class AIMv2PatchEmbed(nn.Module):
-
     def __init__(self, config: AIMv2Config):
         super().__init__()
         self.proj = nn.Conv2d(
@@ -74,14 +73,12 @@ class AIMv2PatchEmbed(nn.Module):
 
 
 class AIMv2ViTPreprocessor(nn.Module):
-
     def __init__(self, config: AIMv2Config):
         super().__init__()
-        num_patches = (config.image_size // config.patch_size)**2
+        num_patches = (config.image_size // config.patch_size) ** 2
 
         self.patchifier = AIMv2PatchEmbed(config)
-        self.pos_embed = nn.Parameter(
-            torch.zeros((1, num_patches, config.hidden_size)))
+        self.pos_embed = nn.Parameter(torch.zeros((1, num_patches, config.hidden_size)))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         tokens = self.patchifier(x)
@@ -92,9 +89,9 @@ class AIMv2ViTPreprocessor(nn.Module):
 
 
 class AIMv2Attention(nn.Module):
-
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(
+        self, config: AIMv2Config, quant_config: QuantizationConfig, prefix: str
+    ):
         super().__init__()
         self.config = config
         self.embed_dim = config.hidden_size
@@ -104,7 +101,8 @@ class AIMv2Attention(nn.Module):
             raise ValueError(
                 "embed_dim must be divisible by num_heads "
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads}).")
+                f" {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
 
         self.qkv = QKVParallelLinear(
@@ -127,8 +125,9 @@ class AIMv2Attention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = MultiHeadAttention(
+            self.num_heads_per_partition, self.head_dim, self.scale
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         qkv, _ = self.qkv(x)
@@ -140,17 +139,17 @@ class AIMv2Attention(nn.Module):
 
 
 class AIMv2Block(nn.Module):
-
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(
+        self, config: AIMv2Config, quant_config: QuantizationConfig, prefix: str
+    ):
         super().__init__()
-        self.attn = AIMv2Attention(config,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.attn")
+        self.attn = AIMv2Attention(
+            config, quant_config=quant_config, prefix=f"{prefix}.attn"
+        )
         self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.mlp = AIMv2SwiGLUFFN(config,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.mlp")
+        self.mlp = AIMv2SwiGLUFFN(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
         self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -160,24 +159,24 @@ class AIMv2Block(nn.Module):
 
 
 class AIMv2Transformer(nn.Module):
-
     def __init__(
         self,
         config: AIMv2Config,
         quant_config: QuantizationConfig,
         *,
-        require_post_norm: Optional[bool] = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ):
         super().__init__()
 
-        self.blocks = nn.ModuleList([
-            AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}")
-            for i in range(config.num_hidden_layers)
-        ])
+        self.blocks = nn.ModuleList(
+            [
+                AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}")
+                for i in range(config.num_hidden_layers)
+            ]
+        )
         if require_post_norm:
-            self.post_trunk_norm = RMSNorm(config.hidden_size,
-                                           eps=config.rms_norm_eps)
+            self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.post_trunk_norm = None
 
@@ -191,29 +190,30 @@ class AIMv2Transformer(nn.Module):
 
 
 class AIMv2Model(torch.nn.Module):
-
-    def __init__(self,
-                 config: AIMv2Config,
-                 quant_config: QuantizationConfig,
-                 *,
-                 require_post_norm: Optional[bool] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: AIMv2Config,
+        quant_config: QuantizationConfig,
+        *,
+        require_post_norm: bool | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.preprocessor = AIMv2ViTPreprocessor(config)
-        self.trunk = AIMv2Transformer(config,
-                                      quant_config=quant_config,
-                                      require_post_norm=require_post_norm,
-                                      prefix=f"{prefix}.trunk")
+        self.trunk = AIMv2Transformer(
+            config,
+            quant_config=quant_config,
+            require_post_norm=require_post_norm,
+            prefix=f"{prefix}.trunk",
+        )
 
     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
-
         x = self.preprocessor(pixel_values)
         x = self.trunk(x)
 
         return x
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".fc13", ".fc1", 0),
@@ -224,11 +224,13 @@ class AIMv2Model(torch.nn.Module):
 
         for name, loaded_weight in weights:
             # post_layernorm is optional in SiglipVisionModel
-            if (name.startswith("trunk.post_trunk_norm")
-                    and self.trunk.post_trunk_norm is None):
+            if (
+                name.startswith("trunk.post_trunk_norm")
+                and self.trunk.post_trunk_norm is None
+            ):
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -239,8 +241,7 @@ class AIMv2Model(torch.nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 6dab4ed14345f080c50c5385508aefcbc3032ffb..0a8f21abb0a35532669cecd61d444352e2293463 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -24,8 +24,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Apertus model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from itertools import islice
+from typing import Any
 
 import torch
 from torch import nn
@@ -38,33 +40,43 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import XIELU
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class ApertusMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -86,8 +98,10 @@ class ApertusMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "xielu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only xIELU is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only xIELU is supported for now."
+            )
         self.act_fn = XIELU()
 
     def forward(self, x):
@@ -98,7 +112,6 @@ class ApertusMLP(nn.Module):
 
 
 class ApertusAttention(nn.Module):
-
     def __init__(
         self,
         config: ApertusConfig,
@@ -106,12 +119,12 @@ class ApertusAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -138,8 +151,7 @@ class ApertusAttention(nn.Module):
             head_dim = self.hidden_size // self.total_num_heads
         self.head_dim = head_dim
         # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
-                                             1)
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -164,9 +176,9 @@ class ApertusAttention(nn.Module):
             prefix=f"{prefix}.o_proj",
         )
 
-        self._init_rotary_emb(config,
-                              rope_scaling=rope_scaling,
-                              quant_config=quant_config)
+        self._init_rotary_emb(
+            config, rope_scaling=rope_scaling, quant_config=quant_config
+        )
 
         sliding_window = None
         if layer_types := getattr(config, "layer_types", None):
@@ -174,8 +186,11 @@ class ApertusAttention(nn.Module):
             if is_sliding:
                 sliding_window = config.sliding_window
 
-        attn_cls = (EncoderOnlyAttention
-                    if attn_type == AttentionType.ENCODER_ONLY else Attention)
+        attn_cls = (
+            EncoderOnlyAttention
+            if attn_type == AttentionType.ENCODER_ONLY
+            else Attention
+        )
 
         self.attn = attn_cls(
             self.num_heads,
@@ -206,9 +221,12 @@ class ApertusAttention(nn.Module):
         output, _ = self.o_proj(attn_output)
         return output
 
-    def _init_rotary_emb(self, config: ApertusConfig,
-                         rope_scaling: Optional[dict[str, Any]],
-                         quant_config: Optional[QuantizationConfig]) -> None:
+    def _init_rotary_emb(
+        self,
+        config: ApertusConfig,
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
+    ) -> None:
         is_neox_style = True
         is_gguf = quant_config and quant_config.get_name() == "gguf"
         if is_gguf and config.model_type == "apertus":
@@ -226,12 +244,11 @@ class ApertusAttention(nn.Module):
 
 
 class ApertusDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: ApertusConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -239,18 +256,20 @@ class ApertusDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         bias_o_proj = attention_bias
         # support internlm/internlm3-8b with qkv_bias
-        if hasattr(config, 'qkv_bias'):
+        if hasattr(config, "qkv_bias"):
             attention_bias = config.qkv_bias
 
         # Apertus defaults to causal attention as it is a decoder-only model.
@@ -266,8 +285,9 @@ class ApertusDecoderLayer(nn.Module):
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -286,71 +306,69 @@ class ApertusDecoderLayer(nn.Module):
             bias=getattr(config, "mlp_bias", False),
             prefix=f"{prefix}.mlp",
         )
-        self.attention_layernorm = RMSNorm(config.hidden_size,
-                                           eps=config.rms_norm_eps)
-        self.feedforward_layernorm = RMSNorm(config.hidden_size,
-                                             eps=config.rms_norm_eps)
+        self.attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.attention_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.attention_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
+            hidden_states, residual = self.attention_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
 
         # Fully Connected
-        hidden_states, residual = self.feedforward_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.feedforward_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class ApertusModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 layer_type: type[nn.Module] = ApertusDecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = ApertusDecoderLayer,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
             self.embed_tokens = PPMissingLayer()
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: layer_type(config=config,
-                                      cache_config=cache_config,
-                                      quant_config=quant_config,
-                                      prefix=prefix),
+            lambda prefix: layer_type(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
         if get_pp_group().is_last_rank:
@@ -360,26 +378,25 @@ class ApertusModel(nn.Module):
 
         self.aux_hidden_state_layers = tuple[int, ...]()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor,
-                                                        list[torch.Tensor]]]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -388,16 +405,16 @@ class ApertusModel(nn.Module):
 
         aux_hidden_states = []
         for idx, layer in enumerate(
-                self.layers[self.start_layer:self.end_layer]):
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
             if idx in self.aux_hidden_state_layers:
                 aux_hidden_states.append(hidden_states + residual)
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
 
@@ -405,8 +422,7 @@ class ApertusModel(nn.Module):
             return hidden_states, aux_hidden_states
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -424,19 +440,19 @@ class ApertusModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -469,8 +485,7 @@ class ApertusModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -482,56 +497,48 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     # LoRA specific attributes
     embedding_modules = {
         "embed_tokens": "input_embeddings",
-        "lm_head": "output_embeddings"
+        "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
 
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 layer_type: type[nn.Module] = ApertusDecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = ApertusDecoderLayer,
+    ):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         self.config = config
-        self.lora_config = lora_config
 
-        self.model = self._init_model(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"),
-                                      layer_type=layer_type)
+        self.model = self._init_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            layer_type=layer_type,
+        )
 
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config else
-                    lora_config.lora_vocab_padding_size),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
             if config.tie_word_embeddings:
-                self.lm_head = self.lm_head.tie_weights(
-                    self.model.embed_tokens)
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         self.model.aux_hidden_state_layers = layers
@@ -540,40 +547,41 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         num_layers = len(self.model.layers)
         return (2, num_layers // 2, num_layers - 3)
 
-    def _init_model(self,
-                    vllm_config: VllmConfig,
-                    prefix: str = "",
-                    layer_type: type[nn.Module] = ApertusDecoderLayer):
-        return ApertusModel(vllm_config=vllm_config,
-                            prefix=prefix,
-                            layer_type=layer_type)
+    def _init_model(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = ApertusDecoderLayer,
+    ):
+        return ApertusModel(
+            vllm_config=vllm_config, prefix=prefix, layer_type=layer_type
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index 1ee378af76c9f20e4fb8856421efca1dd60b52bd..20c3ff0754506e356319cbdc8ca58b0e2dd332cd 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -10,7 +10,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -20,32 +20,42 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+)
 
 
 class ArceeMLP(nn.Module):
     """Feed-forward layer for Arcee using ReLU^2 activation
     (no gating as in LLaMA)."""
 
-    def __init__(self,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 hidden_act: str,
-                 quant_config: Optional[Any] = None,
-                 bias: bool = False,
-                 prefix: str = "",
-                 reduce_results: bool = True) -> None:
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Any | None = None,
+        bias: bool = False,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
         super().__init__()
         # Single linear projection up to intermediate size
         # (no separate gate projection)
@@ -66,8 +76,10 @@ class ArceeMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "relu2":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only 'relu2' is supported for AFM.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only 'relu2' is supported for AFM."
+            )
         # Define ReLU^2 activation: (ReLU(x))^2 elementwise
         self.act_fn = ReLUSquaredActivation()
 
@@ -82,38 +94,45 @@ class ArceeDecoderLayer(nn.Module):
     """Transformer decoder block for Arcee, with self-attention and
     ReLU^2 MLP."""
 
-    def __init__(self,
-                 config: LlamaConfig,
-                 cache_config: Optional[Any] = None,
-                 quant_config: Optional[Any] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: LlamaConfig,
+        cache_config: Any | None = None,
+        quant_config: Any | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         # Rotary embedding parameters (reuse LLaMA defaults)
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Determine if attention bias is needed (some variants use bias terms)
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         bias_o_proj = attention_bias
         if hasattr(config, "qkv_bias"):
             attention_bias = config.qkv_bias
 
         # Self-Attention (using LLaMA's attention structure)
         from vllm.model_executor.models.llama import (
-            LlamaAttention)  # import here to avoid circular import
+            LlamaAttention,  # import here to avoid circular import
+        )
+
         self.self_attn = LlamaAttention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -123,8 +142,8 @@ class ArceeDecoderLayer(nn.Module):
             cache_config=cache_config,
             prefix=f"{prefix}.self_attn",
             attn_type=getattr(
-                config, "attn_type",
-                "decoder"),  # assume decoder (causal) unless specified
+                config, "attn_type", "decoder"
+            ),  # assume decoder (causal) unless specified
         )
         # MLP with ReLU^2 activation
         self.mlp = ArceeMLP(
@@ -136,14 +155,16 @@ class ArceeDecoderLayer(nn.Module):
             prefix=f"{prefix}.mlp",
         )
         # Layer normalization layers (RMSNorm as in LLaMA)
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
-            self, positions: torch.Tensor, hidden_states: torch.Tensor,
-            residual: Optional[torch.Tensor]
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self-Attention block
         if residual is None:
@@ -151,13 +172,10 @@ class ArceeDecoderLayer(nn.Module):
             hidden_states = self.input_layernorm(hidden_states)
         else:
             # Fused residual add + layernorm if supported
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
         # Feed-forward block
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
@@ -167,11 +185,13 @@ class ArceeModel(nn.Module):
     """The transformer model backbone for Arcee (embedding layer + stacked
     decoder blocks + final norm)."""
 
-    def __init__(self,
-                 *,
-                 vllm_config,
-                 prefix: str = "",
-                 layer_type: type[nn.Module] = ArceeDecoderLayer) -> None:
+    def __init__(
+        self,
+        *,
+        vllm_config,
+        prefix: str = "",
+        layer_type: type[nn.Module] = ArceeDecoderLayer,
+    ) -> None:
         super().__init__()
         config: LlamaConfig = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
@@ -179,28 +199,28 @@ class ArceeModel(nn.Module):
         self.quant_config = quant_config
         self.config = config
         self.vocab_size = config.vocab_size
-        self.org_vocab_size = config.vocab_size
 
         # Word embeddings (parallelized if using pipeline parallel)
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
-            self.embed_tokens = PPMissingLayer(
-            )  # placeholder on non-embedding ranks
+            self.embed_tokens = PPMissingLayer()  # placeholder on non-embedding ranks
 
         # Build decoder layers across pipeline ranks
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: layer_type(config=config,
-                                      cache_config=cache_config,
-                                      quant_config=quant_config,
-                                      prefix=prefix),
+            lambda prefix: layer_type(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
         # Final RMSNorm on the last pipeline stage
@@ -215,56 +235,57 @@ class ArceeModel(nn.Module):
 
         # Prepare factory for empty intermediate tensors
         # (for pipeline scheduling)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None
-    ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor,
-                                                        list[torch.Tensor]]]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         # Embedding lookup (on first pipeline rank)
         if get_pp_group().is_first_rank:
-            hidden_states = (inputs_embeds if inputs_embeds is not None else
-                             self.get_input_embeddings(input_ids))
+            hidden_states = (
+                inputs_embeds
+                if inputs_embeds is not None
+                else self.embed_input_ids(input_ids)
+            )
             residual = None
         else:
             assert intermediate_tensors is not None, (
-                "IntermediateTensors must be provided for non-first "
-                "pipeline ranks")
+                "IntermediateTensors must be provided for non-first pipeline ranks"
+            )
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
         aux_hidden_states: list[torch.Tensor] = []
         for idx, layer in enumerate(
-                islice(self.layers, self.start_layer, self.end_layer)):
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
             if idx in self.aux_hidden_state_layers:
                 aux_hidden_states.append(
-                    hidden_states +
-                    residual)  # capture pre-layer hidden state if needed
+                    hidden_states + residual
+                )  # capture pre-layer hidden state if needed
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
             # Send intermediate results to the next pipeline stage
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         # On last rank: apply final layer norm
         hidden_states, _ = self.norm(hidden_states, residual)
         if len(aux_hidden_states) > 0:
             return hidden_states, aux_hidden_states
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         """Load weights, mapping q/k/v projections to fused qkv_proj."""
         stacked_params_mapping = [
             (".qkv_proj", ".q_proj", "q"),
@@ -278,17 +299,17 @@ class ArceeModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 continue
 
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -331,8 +352,7 @@ class ArceeModel(nn.Module):
                 continue
 
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
@@ -342,6 +362,7 @@ class ArceeModel(nn.Module):
 class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     """Arcee Model for causal language modeling, integrated with vLLM
     runtime."""
+
     # Map fused module names to their submodule components
     # (for quantization and LoRA)
     packed_modules_mapping = {
@@ -354,69 +375,65 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self.config = config
 
         # Initialize the inner Transformer model (ArceeModel)
-        self.model = ArceeModel(vllm_config=vllm_config,
-                                prefix=f"{prefix}.model")
+        self.model = ArceeModel(vllm_config=vllm_config, prefix=f"{prefix}.model")
         # On the last pipeline stage, set up the LM head and logits processor
         if get_pp_group().is_last_rank:
             # Determine vocabulary size (including any LoRA extra tokens
             # for padded LM head)
-            self.unpadded_vocab_size = config.vocab_size
 
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config=vllm_config.quant_config,
                 bias=getattr(config, "lm_head_bias", False),
                 prefix=f"{prefix}.lm_head",
             )
             if config.tie_word_embeddings:
                 # Tie output weights with input embedding matrix
-                self.lm_head = self.lm_head.tie_weights(
-                    self.model.embed_tokens)
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             # Placeholder for lm_head on non-last ranks
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids=input_ids,
-                                  positions=positions,
-                                  intermediate_tensors=intermediate_tensors,
-                                  inputs_embeds=inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
         return model_output
 
-    def compute_logits(self,
-                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         # Compute final logits from hidden states (last pipeline rank only)
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         """Load weights into the model (delegates to inner model and handles
         tied embeddings)."""
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
-            skip_substrs=["gate_proj"])
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+            skip_substrs=["gate_proj"],
+        )
         # AutoWeightLoader handles weight name remapping, including fusing
         # separate q_proj, k_proj, v_proj into qkv_proj
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 55d16fd75cebb35fefda09e3ae2fd93ced992400..b5cc07a56535dc02ca691e01e8857bf3a4e4bf49 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Snowflake Arctic model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -11,24 +11,33 @@ from torch import nn
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.deepspeedfp import (
-    DeepSpeedFPConfig, DeepSpeedFPParameter)
+    DeepSpeedFPConfig,
+    DeepSpeedFPParameter,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
@@ -36,41 +45,55 @@ from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.arctic import ArcticConfig
 
 from .interfaces import SupportsPP, SupportsQuant
-from .utils import (extract_layer_index, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class ArcticMLP(nn.Module):
-
-    def __init__(self,
-                 config: ArcticConfig,
-                 expert_id: int = -1,
-                 is_residual_mlp: bool = False,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 reduce_results: bool = True,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: ArcticConfig,
+        expert_id: int = -1,
+        is_residual_mlp: bool = False,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.expert_id = expert_id
 
-        self.ffn_dim = config.intermediate_size if not is_residual_mlp \
-            else self.hidden_size
-
-        self.w13 = MergedColumnParallelLinear(self.hidden_size,
-                                              [self.ffn_dim] * 2,
-                                              bias=False,
-                                              quant_config=quant_config)
-        self.w2 = RowParallelLinear(self.ffn_dim,
-                                    self.hidden_size,
-                                    bias=False,
-                                    reduce_results=reduce_results,
-                                    quant_config=quant_config)
+        self.ffn_dim = (
+            config.intermediate_size if not is_residual_mlp else self.hidden_size
+        )
+
+        self.w13 = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.ffn_dim] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.w13",
+        )
+        self.w2 = RowParallelLinear(
+            self.ffn_dim,
+            self.hidden_size,
+            bias=False,
+            reduce_results=reduce_results,
+            quant_config=quant_config,
+            prefix=f"{prefix}.w2",
+        )
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, hidden_states):
@@ -85,13 +108,15 @@ class ArcticMoE(nn.Module):
     Model-parallel implementation of Arctic MoE Layer.
     """
 
-    def __init__(self,
-                 config: ArcticConfig,
-                 tp_size: Optional[int] = None,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 reduce_results: bool = True,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: ArcticConfig,
+        tp_size: int | None = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ):
         super().__init__()
 
         layer_id = extract_layer_index(prefix)
@@ -111,52 +136,75 @@ class ArcticMoE(nn.Module):
         self.params_dtype = params_dtype
 
         if not self.is_moe_layer:
-            self.mlp = ArcticMLP(config,
-                                 quant_config=quant_config,
-                                 reduce_results=reduce_results,
-                                 prefix=f"{prefix}.mlp")
+            self.mlp = ArcticMLP(
+                config,
+                quant_config=quant_config,
+                reduce_results=reduce_results,
+                prefix=f"{prefix}.mlp",
+            )
         else:
-            self.gate = ReplicatedLinear(self.hidden_size,
-                                         self.num_experts,
-                                         bias=False,
-                                         params_dtype=self.params_dtype,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.gate")
+            self.gate = ReplicatedLinear(
+                self.hidden_size,
+                self.num_experts,
+                bias=False,
+                params_dtype=self.params_dtype,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate",
+            )
             if self.is_quant:
                 self.ws = DeepSpeedFPParameter(
-                    torch.Size((self.num_experts, 2 * self.intermediate_size,
-                                self.hidden_size)),
+                    torch.Size(
+                        (self.num_experts, 2 * self.intermediate_size, self.hidden_size)
+                    ),
                     params_dtype=params_dtype,
                     quant_config=quant_config,
                 )
                 self.w2s = DeepSpeedFPParameter(
-                    torch.Size((self.num_experts, self.hidden_size,
-                                self.intermediate_size)),
+                    torch.Size(
+                        (self.num_experts, self.hidden_size, self.intermediate_size)
+                    ),
                     params_dtype=params_dtype,
                     quant_config=quant_config,
                 )
             else:
                 self.ws = nn.Parameter(
-                    torch.empty(self.num_experts,
-                                2 * self.intermediate_size,
-                                self.hidden_size,
-                                device=current_platform.device_type,
-                                dtype=self.params_dtype))
+                    torch.empty(
+                        self.num_experts,
+                        2 * self.intermediate_size,
+                        self.hidden_size,
+                        device=current_platform.device_type,
+                        dtype=self.params_dtype,
+                    )
+                )
                 self.w2s = nn.Parameter(
-                    torch.empty(self.num_experts,
-                                self.hidden_size,
-                                self.intermediate_size,
-                                device=current_platform.device_type,
-                                dtype=self.params_dtype))
-            set_weight_attrs(self.ws, {
-                "weight_loader": self.weight_loader,
-            })
-            set_weight_attrs(self.w2s, {
-                "weight_loader": self.weight_loader,
-            })
-
-    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str, expert_id: int):
+                    torch.empty(
+                        self.num_experts,
+                        self.hidden_size,
+                        self.intermediate_size,
+                        device=current_platform.device_type,
+                        dtype=self.params_dtype,
+                    )
+                )
+            set_weight_attrs(
+                self.ws,
+                {
+                    "weight_loader": self.weight_loader,
+                },
+            )
+            set_weight_attrs(
+                self.w2s,
+                {
+                    "weight_loader": self.weight_loader,
+                },
+            )
+
+    def weight_loader(
+        self,
+        param: nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        expert_id: int,
+    ):
         tp_rank = get_tensor_model_parallel_rank()
         param_data = param.ds_dequantize() if self.is_quant else param.data
         shard_size = self.intermediate_size
@@ -164,8 +212,9 @@ class ArcticMoE(nn.Module):
         if weight_name.endswith("w1.weight"):
             param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
         if weight_name.endswith("w3.weight"):
-            param_data[expert_id,
-                       shard_size:2 * shard_size, :] = loaded_weight[shard, :]
+            param_data[expert_id, shard_size : 2 * shard_size, :] = loaded_weight[
+                shard, :
+            ]
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
         if self.is_quant:
@@ -178,15 +227,14 @@ class ArcticMoE(nn.Module):
         router_logits, _ = self.gate(hidden_states)
         do_normalize = self.top_k > 1
         topk_weights, topk_ids, token_expert_indices = fused_topk(
-            hidden_states, router_logits, self.top_k, renormalize=do_normalize)
+            hidden_states, router_logits, self.top_k, renormalize=do_normalize
+        )
         # topk_ids: (num_tokens, k)
         if self.is_quant:
             if 2 * num_tokens <= self.num_experts:
                 # If much fewer tokens than experts, use selective dequantize.
-                ws_dequantized = self.ws.ds_selective_dequantize(
-                    topk_ids.flatten())
-                w2s_dequantized = self.w2s.ds_selective_dequantize(
-                    topk_ids.flatten())
+                ws_dequantized = self.ws.ds_selective_dequantize(topk_ids.flatten())
+                w2s_dequantized = self.w2s.ds_selective_dequantize(topk_ids.flatten())
                 # We gathered the experts to the tokens so update the mapping.
                 topk_ids = torch.arange(
                     0,
@@ -203,10 +251,10 @@ class ArcticMoE(nn.Module):
             w2s_dequantized if self.is_quant else self.w2s,
             topk_weights,
             topk_ids,
-            inplace=True)
+            inplace=True,
+        )
         if self.reduce_results and self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states.view(num_tokens, hidden_size)
 
     def forward(self, hidden_states: torch.Tensor):
@@ -218,12 +266,11 @@ class ArcticMoE(nn.Module):
 
 
 class ArcticAttention(nn.Module):
-
     def __init__(
         self,
         config: ArcticConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -248,18 +295,22 @@ class ArcticAttention(nn.Module):
         self.rope_theta = config.rope_theta
         self.scaling = self.head_dim**-0.5
 
-        self.qkv_proj = QKVParallelLinear(self.hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=False,
-                                          quant_config=quant_config)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=False,
             reduce_results=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -270,13 +321,15 @@ class ArcticAttention(nn.Module):
             is_neox_style=True,
         )
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -292,12 +345,11 @@ class ArcticAttention(nn.Module):
 
 
 class ArcticDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: ArcticConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -305,10 +357,12 @@ class ArcticDecoderLayer(nn.Module):
         layer_idx = extract_layer_index(prefix)
         is_moe_layer = (layer_idx + 1) % config.moe_layer_frequency == 0
         self.use_residual = config.use_residual and is_moe_layer
-        self.self_attn = ArcticAttention(config,
-                                         cache_config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.self_attn")
+        self.self_attn = ArcticAttention(
+            config,
+            cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
         self.block_sparse_moe = ArcticMoE(
             config,
             quant_config=quant_config,
@@ -316,18 +370,21 @@ class ArcticDecoderLayer(nn.Module):
             prefix=f"{prefix}.block_sparse_moe",
         )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
         if self.use_residual:
-            self.residual_layernorm = RMSNorm(config.hidden_size,
-                                              eps=config.rms_norm_eps)
-            self.residual_mlp = ArcticMLP(config,
-                                          is_residual_mlp=True,
-                                          reduce_results=False,
-                                          prefix=f"{prefix}.residual_mlp")
+            self.residual_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+            self.residual_mlp = ArcticMLP(
+                config,
+                is_residual_mlp=True,
+                reduce_results=False,
+                prefix=f"{prefix}.residual_mlp",
+            )
 
     def forward(
         self,
@@ -361,7 +418,6 @@ class ArcticDecoderLayer(nn.Module):
 
 @support_torch_compile
 class ArcticModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -371,35 +427,36 @@ class ArcticModel(nn.Module):
 
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
-            self.vocab_size,
-            config.hidden_size,
-            org_num_embeddings=self.vocab_size)
+            self.vocab_size, config.hidden_size, org_num_embeddings=self.vocab_size
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: ArcticDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
         self._attn_implementation = config._attn_implementation
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -419,8 +476,9 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
-        self.model = ArcticModel(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.model = ArcticModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.vocab_size = config.vocab_size
         self.lm_head = ParallelLMHead(
             self.vocab_size,
@@ -432,35 +490,35 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
             self.lm_head.weight = self.model.embed_tokens.weight
         self.num_experts = config.num_local_experts
         self.num_experts_per_tok = config.num_experts_per_tok
-        self.unpadded_vocab_size = config.vocab_size
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -474,28 +532,47 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
 
         for layer in range(num_layers):
             mlp_params_mapping.append(
-                (f"layers.{layer}.residual_mlp.w13.weight",
-                 f"layers.{layer}.residual_mlp.w1.weight", 0))
+                (
+                    f"layers.{layer}.residual_mlp.w13.weight",
+                    f"layers.{layer}.residual_mlp.w1.weight",
+                    0,
+                )
+            )
             mlp_params_mapping.append(
-                (f"layers.{layer}.residual_mlp.w13.weight",
-                 f"layers.{layer}.residual_mlp.w3.weight", 1))
+                (
+                    f"layers.{layer}.residual_mlp.w13.weight",
+                    f"layers.{layer}.residual_mlp.w3.weight",
+                    1,
+                )
+            )
             if layer % 2 == 0:
                 # MLP layers
                 mlp_params_mapping.append(
-                    (f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
-                     f"layers.{layer}.block_sparse_moe.mlp.w1.weight", 0))
+                    (
+                        f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
+                        f"layers.{layer}.block_sparse_moe.mlp.w1.weight",
+                        0,
+                    )
+                )
                 mlp_params_mapping.append(
-                    (f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
-                     f"layers.{layer}.block_sparse_moe.mlp.w3.weight", 1))
+                    (
+                        f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
+                        f"layers.{layer}.block_sparse_moe.mlp.w3.weight",
+                        1,
+                    )
+                )
             else:
                 # MoE layers
                 for expert_id in range(self.config.num_local_experts):
                     expert_params_mapping.append(
-                        ("ws", f"experts.{expert_id}.w1.weight", expert_id))
+                        ("ws", f"experts.{expert_id}.w1.weight", expert_id)
+                    )
                     expert_params_mapping.append(
-                        ("w2s", f"experts.{expert_id}.w2.weight", expert_id))
+                        ("w2s", f"experts.{expert_id}.w2.weight", expert_id)
+                    )
                     expert_params_mapping.append(
-                        ("ws", f"experts.{expert_id}.w3.weight", expert_id))
+                        ("ws", f"experts.{expert_id}.w3.weight", expert_id)
+                    )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -503,9 +580,10 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
         logger.info(
             "It will take ~10 minutes loading from the 16-bit weights. "
             "Alternatively, use the prequantized 8-bit weights of arctic "
-            "and set load-format to `sharded_state` will accelerate loading.")
+            "and set load-format to `sharded_state` will accelerate loading."
+        )
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -530,8 +608,7 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
                     weight_loader(param, loaded_weight, shard_id)
                     break
                 else:
-                    for param_name, weight_name, shard_id \
-                            in expert_params_mapping:
+                    for param_name, weight_name, shard_id in expert_params_mapping:
                         if weight_name not in name:
                             continue
                         name = name.replace(weight_name, param_name)
@@ -539,10 +616,9 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
                             continue
                         param = params_dict[name]
                         weight_loader = param.weight_loader
-                        weight_loader(param,
-                                      loaded_weight,
-                                      weight_name,
-                                      expert_id=shard_id)
+                        weight_loader(
+                            param, loaded_weight, weight_name, expert_id=shard_id
+                        )
                         break
                     else:
                         if name.endswith(".bias") and name not in params_dict:
@@ -551,8 +627,9 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
                             continue
                         param = params_dict[name]
 
-                        weight_loader = getattr(param, "weight_loader",
-                                                default_weight_loader)
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
                         weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 4895975bc3584cd56cba846b0e4ee0771ed61960..3d07e6b612ca3d697a434e67c316f0bd4f705b2e 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -9,37 +9,48 @@ from transformers import AriaConfig, AriaTextConfig, BatchFeature
 from transformers.models.aria.modeling_aria import AriaCrossAttention
 from transformers.models.aria.processing_aria import AriaProcessor
 
-from vllm.config import QuantizationConfig, VllmConfig
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-# yapf: disable
 from .idefics2_vision_model import Idefics2VisionConfig
 from .idefics2_vision_model import (
-    Idefics2VisionTransformer as Idefics3VisionTransformer)
-# yapf: enable
+    Idefics2VisionTransformer as Idefics3VisionTransformer,
+)
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
 from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    is_pp_missing_parameter, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    maybe_prefix,
+)
 
 
 class AriaImagePixelInputs(TensorSchema):
@@ -52,13 +63,15 @@ class AriaImagePixelInputs(TensorSchema):
         - w: Width of each image
     """
 
+    type: Literal["pixel_values"]
+
     pixel_values: Annotated[
         torch.Tensor,
         TensorShape("bn", 3, "h", "w"),
     ]
 
     pixel_mask: Annotated[
-        Optional[torch.Tensor],
+        torch.Tensor | None,
         TensorShape("bn", "h", "w"),
     ]
 
@@ -69,7 +82,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(config, quant_config=quant_config, prefix=prefix)
@@ -78,8 +91,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
         # Identity layer
         self.post_layernorm = nn.Identity()
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -89,7 +101,6 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-
             # NOTE: post_layernorm is not used in Aria
             if "post_layernorm" in name:
                 continue
@@ -104,15 +115,13 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class AriaProjectorMLP(nn.Module):
-
     def __init__(
         self,
         in_features: int,
@@ -121,12 +130,8 @@ class AriaProjectorMLP(nn.Module):
     ) -> None:
         super().__init__()
 
-        self.linear_in = ColumnParallelLinear(in_features,
-                                              hidden_features,
-                                              bias=False)
-        self.linear_out = RowParallelLinear(hidden_features,
-                                            output_dim,
-                                            bias=False)
+        self.linear_in = ColumnParallelLinear(in_features, hidden_features, bias=False)
+        self.linear_out = RowParallelLinear(hidden_features, output_dim, bias=False)
         self.act = get_act_fn("gelu_new")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -160,27 +165,31 @@ class AriaProjector(nn.Module):
         self.output_dim = config.text_config.hidden_size
 
         self.query = nn.Parameter(
-            torch.empty(config.max_value_projector_patch_to_query_dict,
-                        self.in_features))
+            torch.empty(
+                config.max_value_projector_patch_to_query_dict, self.in_features
+            )
+        )
 
         self.cross_attn = AriaCrossAttention(config)
 
         self.layer_norm = nn.LayerNorm(self.in_features)
-        self.feed_forward = AriaProjectorMLP(self.in_features,
-                                             self.hidden_features,
-                                             self.output_dim)
+        self.feed_forward = AriaProjectorMLP(
+            self.in_features, self.hidden_features, self.output_dim
+        )
 
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         batch_size, num_patches = x.shape[0], x.shape[1]
 
         if num_patches not in self.patch_to_query_dict:
-            raise KeyError(f"Number of patches {num_patches} not found in "
-                           "patch_to_query_dict amongst possible values "
-                           f"{self.patch_to_query_dict.keys()}.")
+            raise KeyError(
+                f"Number of patches {num_patches} not found in "
+                "patch_to_query_dict amongst possible values "
+                f"{self.patch_to_query_dict.keys()}."
+            )
 
         query_num = self.patch_to_query_dict[num_patches]
 
@@ -197,33 +206,33 @@ class AriaProjector(nn.Module):
         return out
 
 
-class AriaFusedMoE(FusedMoE):
-
-    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      shard_id: str) -> None:
+class AriaFusedMoE(SharedFusedMoE):
+    def weight_loader(
+        self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_id: str
+    ) -> None:
         # Override the weight_loader to handle the expert weights in the Aria
         # model, which are already packed with experts, and merge the gate and
         # up weights for each expert.
         # Note: Loading expert weights with quantization is not supported
         tp_rank = get_tensor_model_parallel_rank()
-        if shard_id == 'w13':
+        if shard_id == "w13":
             # the shape of loaded_weight is
             # (num_experts, hidden_size, 2 * moe_intermediate_size)
             if self.tp_size > 1:
                 up, gate = loaded_weight.chunk(2, dim=-1)
                 up_current_rank = up.chunk(self.tp_size, dim=-1)[tp_rank]
                 gate_current_rank = gate.chunk(self.tp_size, dim=-1)[tp_rank]
-                up_and_gate = torch.cat([up_current_rank, gate_current_rank],
-                                        dim=-1).transpose(1, 2)
+                up_and_gate = torch.cat(
+                    [up_current_rank, gate_current_rank], dim=-1
+                ).transpose(1, 2)
                 param.data.copy_(up_and_gate)
             else:
                 param.data.copy_(loaded_weight.transpose(1, 2))
-        elif shard_id == 'w2':
+        elif shard_id == "w2":
             # the shape of loaded_weight is
             # (num_experts, moe_intermediate_size, hidden_size)
             if self.tp_size > 1:
-                down_current_rank = loaded_weight.chunk(self.tp_size,
-                                                        dim=1)[tp_rank]
+                down_current_rank = loaded_weight.chunk(self.tp_size, dim=1)[tp_rank]
                 param.data.copy_(down_current_rank.transpose(1, 2))
             else:
                 param.data.copy_(loaded_weight.transpose(1, 2))
@@ -241,17 +250,26 @@ class AriaTextMoELayer(nn.Module):
     def __init__(
         self,
         config: AriaTextConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
 
         self.router_weight = nn.Parameter(
-            torch.empty(
-                (self.config.moe_num_experts, self.config.hidden_size)))
+            torch.empty((self.config.moe_num_experts, self.config.hidden_size))
+        )
+
+        self.shared_experts = LlamaMLP(
+            config.hidden_size,
+            config.intermediate_size * config.moe_num_shared_experts,
+            "silu",
+            quant_config=quant_config,
+            bias=config.mlp_bias,
+        )
 
         self.experts = AriaFusedMoE(
+            shared_experts=self.shared_experts,
             num_experts=config.moe_num_experts,
             top_k=config.moe_topk,
             hidden_size=config.hidden_size,
@@ -260,13 +278,6 @@ class AriaTextMoELayer(nn.Module):
             reduce_results=True,
             prefix=f"{prefix}.experts",
         )
-        self.shared_experts = LlamaMLP(
-            config.hidden_size,
-            config.intermediate_size * config.moe_num_shared_experts,
-            "silu",
-            quant_config=quant_config,
-            bias=config.mlp_bias,
-        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """
@@ -280,15 +291,14 @@ class AriaTextMoELayer(nn.Module):
             torch.Tensor: Output tensor after passing through the MoE layer.
         """
 
-        router_output = torch.nn.functional.linear(hidden_states,
-                                                   self.router_weight)
+        router_output = torch.nn.functional.linear(hidden_states, self.router_weight)
 
-        hidden_states_copy = hidden_states.clone()
-        # NOTE: hidden_states will be modified inplace by `FusedMoE`
         sparse_expert_output = self.experts(hidden_states, router_output)
-        shared_expert_output = self.shared_experts(hidden_states_copy)
 
-        return sparse_expert_output + shared_expert_output
+        if self.shared_experts is not None:
+            return sparse_expert_output[0] + sparse_expert_output[1]
+        else:
+            return sparse_expert_output
 
 
 class AriaTextDecoderLayer(LlamaDecoderLayer):
@@ -304,9 +314,9 @@ class AriaTextDecoderLayer(LlamaDecoderLayer):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        self.mlp = AriaTextMoELayer(config,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.mlp")
+        self.mlp = AriaTextMoELayer(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
 
 
 class AriaTextModel(LlamaModel, SupportsQuant):
@@ -314,6 +324,7 @@ class AriaTextModel(LlamaModel, SupportsQuant):
     Custom LlamaModel for the AriaMoE model which modifies the standard
     LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`.
     """
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -322,14 +333,13 @@ class AriaTextModel(LlamaModel, SupportsQuant):
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=AriaTextDecoderLayer)
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, layer_type=AriaTextDecoderLayer
+        )
 
     # Adapted from LlamaModel.load_weights with the modification of adding
     # the expert weights mapping to `stacked_params_mapping`
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -337,27 +347,27 @@ class AriaTextModel(LlamaModel, SupportsQuant):
             (".qkv_proj", ".v_proj", "v"),
             (".gate_up_proj", ".gate_proj", 0),
             (".gate_up_proj", ".up_proj", 1),
-            ("experts.w13_weight", "experts.fc1.weight", 'w13'),
-            ("experts.w2_weight", "experts.fc2.weight", 'w2'),
+            ("experts.w13_weight", "experts.fc1.weight", "w13"),
+            ("experts.w2_weight", "experts.fc2.weight", "w2"),
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -389,15 +399,13 @@ class AriaTextModel(LlamaModel, SupportsQuant):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class AriaProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(AriaConfig)
 
@@ -407,7 +415,7 @@ class AriaProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(AriaProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(self) -> int:
@@ -416,7 +424,6 @@ class AriaProcessingInfo(BaseProcessingInfo):
 
 
 class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -429,22 +436,26 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         vision_config = self.info.get_vision_config()
 
         max_image_size = vision_config.image_size
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=max_image_size,
+                height=max_image_size,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -475,9 +486,11 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor,
-                                        info=AriaProcessingInfo,
-                                        dummy_inputs=AriaDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    AriaMultiModalProcessor,
+    info=AriaProcessingInfo,
+    dummy_inputs=AriaDummyInputsBuilder,
+)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     """
     Aria model for conditional generation tasks.
@@ -485,6 +498,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     This model combines a vision tower, a multi-modal projector, and a language
     model to perform tasks that involve both image and text inputs.
     """
+
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
@@ -501,7 +517,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|fim_prefix|><|img|><|fim_suffix|>"
 
@@ -528,22 +544,21 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
             vllm_config=vllm_config.with_hf_config(config.text_config),
             prefix=maybe_prefix(prefix, "language_model.model"),
         )
-        self.pad_token_id = (self.config.pad_token_id
-                             if self.config.pad_token_id is not None else -1)
-        self.unpadded_vocab_size = config.text_config.vocab_size
+        self.pad_token_id = (
+            self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        )
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            self.vocab_size,
             config.text_config.hidden_size,
-            org_num_embeddings=self.language_model.org_vocab_size,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                self.vocab_size, logit_scale)
+        self.logits_processor = LogitsProcessor(self.vocab_size, scale=logit_scale)
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
+        self, **kwargs: object
+    ) -> AriaImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         pixel_mask = kwargs.pop("pixel_mask", None)
 
@@ -551,12 +566,15 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
             return None
 
         return AriaImagePixelInputs(
-            pixel_values=flatten_bn(pixel_values, concat=True),
-            pixel_mask=flatten_bn(pixel_mask, concat=True),
+            type="pixel_values",
+            pixel_values=pixel_values,
+            pixel_mask=pixel_mask,
         )
 
     def _create_patch_attention_mask(
-            self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor:
+        self,
+        pixel_mask: torch.Tensor | None,
+    ) -> torch.Tensor | None:
         if pixel_mask is None:
             return None
 
@@ -576,8 +594,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     ) -> tuple[torch.Tensor, torch.Tensor]:
         assert self.vision_tower is not None
 
-        pixel_values = image_input['pixel_values']
-        pixel_mask = image_input['pixel_mask']
+        pixel_values = image_input["pixel_values"]
+        pixel_mask = image_input["pixel_mask"]
 
         patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
 
@@ -595,41 +613,28 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         multimodal_embeddings = self._process_image_input(image_input)
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.image_token_index)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            # always pass the input via `inputs_embeds`
-            # to make sure the computation graph is consistent
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
+            multimodal_embeddings = self.embed_multimodal(**kwargs)
+            inputs_embeds = self.embed_input_ids(
+                input_ids,
+                multimodal_embeddings,
+                is_multimodal=input_ids == self.config.image_token_index,
+            )
             input_ids = None
 
         hidden_states = self.language_model(
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 6fd8c2fb5c561b974474a92d7e57724de51f09dc..0ada2ed5028bb8543b4c2b95aa369c927fdafe66 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -10,31 +10,36 @@ from transformers import BatchFeature, GotOcr2ImageProcessor
 from transformers.activations import ACT2FN
 from transformers.image_processing_utils import get_size_dict
 from transformers.models.aya_vision import AyaVisionConfig
-from transformers.models.aya_vision.processing_aya_vision import (
-    AyaVisionProcessor)
+from transformers.models.aya_vision.processing_aya_vision import AyaVisionProcessor
 from transformers.models.got_ocr2.image_processing_got_ocr2 import (
-    get_optimal_tiled_canvas)
+    get_optimal_tiled_canvas,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalFieldConfig,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalFieldConfig,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 
 class AyaVisionImagePixelInputs(TensorSchema):
@@ -62,17 +67,17 @@ class AyaVisionImagePixelInputs(TensorSchema):
 
 
 class AyaVisionMultiModalProjector(nn.Module):
-
     def __init__(self, config: AyaVisionConfig):
         super().__init__()
         self.config = config
         self.downsample_factor = config.downsample_factor
         self.alignment_intermediate_size = getattr(
-            config, "alignment_intermediate_size",
-            config.text_config.hidden_size)
-        self.layernorm = nn.LayerNorm(config.vision_config.hidden_size *
-                                      (config.downsample_factor**2),
-                                      eps=config.adapter_layer_norm_eps)
+            config, "alignment_intermediate_size", config.text_config.hidden_size
+        )
+        self.layernorm = nn.LayerNorm(
+            config.vision_config.hidden_size * (config.downsample_factor**2),
+            eps=config.adapter_layer_norm_eps,
+        )
 
         self.linear_1 = nn.Linear(
             config.vision_config.hidden_size * (config.downsample_factor**2),
@@ -82,9 +87,11 @@ class AyaVisionMultiModalProjector(nn.Module):
 
         self.act = ACT2FN["silu"]  # SwiGLU uses SiLU activation
         # For SwiGLU, project down to half size since we split intermediate dim
-        self.linear_2 = nn.Linear(self.alignment_intermediate_size // 2,
-                                  config.text_config.hidden_size,
-                                  bias=True)
+        self.linear_2 = nn.Linear(
+            self.alignment_intermediate_size // 2,
+            config.text_config.hidden_size,
+            bias=True,
+        )
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         image_features = self.pixel_shuffle(image_features)
@@ -98,26 +105,31 @@ class AyaVisionMultiModalProjector(nn.Module):
         hidden_states = self.linear_2(hidden_states)
         return hidden_states
 
-    def pixel_shuffle(self,
-                      image_features: torch.Tensor) -> torch.Tensor:  # B, S, D
+    def pixel_shuffle(self, image_features: torch.Tensor) -> torch.Tensor:  # B, S, D
         batch_size, seq_length, _ = image_features.shape
         height = width = int(seq_length**0.5)
-        image_features = image_features.reshape(image_features.shape[0], width,
-                                                height, -1)
+        image_features = image_features.reshape(
+            image_features.shape[0], width, height, -1
+        )
         channels = image_features.shape[-1]
         image_features = image_features.reshape(
-            batch_size, width, int(height / self.downsample_factor),
-            int(channels * self.downsample_factor))
+            batch_size,
+            width,
+            int(height / self.downsample_factor),
+            int(channels * self.downsample_factor),
+        )
         image_features = image_features.permute(0, 2, 1, 3)
         image_features = image_features.reshape(
-            batch_size, int(height / self.downsample_factor),
-            int(width / self.downsample_factor), -1)
+            batch_size,
+            int(height / self.downsample_factor),
+            int(width / self.downsample_factor),
+            -1,
+        )
         image_features = image_features.permute(0, 2, 1, 3)
         return image_features
 
 
 class AyaVisionProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> AyaVisionConfig:
         return self.ctx.get_hf_config(AyaVisionConfig)
 
@@ -127,19 +139,25 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_image_size_with_most_features(self) -> ImageSize:
         image_processor = self.get_image_processor()
-        height = image_processor.size['height']
-        width = image_processor.size['width']
+        height = image_processor.size["height"]
+        width = image_processor.size["width"]
         max_patches = image_processor.max_patches
-        return ImageSize(height=height * max_patches,
-                         width=width * max_patches)
+        return ImageSize(height=height * max_patches, width=width * max_patches)
 
-    def get_num_patches(self, *, image_width: int, image_height: int,
-                        size: dict, min_patches: int, max_patches: int) -> int:
+    def get_num_patches(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        size: dict,
+        min_patches: int,
+        max_patches: int,
+    ) -> int:
         """
         Calculate the number of patches needed for a given image based on size
         constraints.  This method replicates and adjusts the logic from:
@@ -147,15 +165,16 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
         """
         size = get_size_dict(size, default_to_square=False)
         num_columns, num_rows = get_optimal_tiled_canvas(
-            (image_height, image_width), (size["height"], size["width"]),
-            min_patches, max_patches)
+            (image_height, image_width),
+            (size["height"], size["width"]),
+            min_patches,
+            max_patches,
+        )
         num_blocks = num_columns * num_rows
         return num_blocks if num_blocks == 1 else num_blocks + 1
 
 
-class AyaVisionDummyInputsBuilder(
-        BaseDummyInputsBuilder[AyaVisionProcessingInfo]):
-
+class AyaVisionDummyInputsBuilder(BaseDummyInputsBuilder[AyaVisionProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -168,22 +187,24 @@ class AyaVisionDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
-        image_size = \
-            self.info.get_image_size_with_most_features()
+        image_size = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=image_size.width,
-                                   height=image_size.height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=image_size.width,
+                height=image_size.height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class AyaVisionMultiModalProcessor(
-        BaseMultiModalProcessor[AyaVisionProcessingInfo]):
-
+class AyaVisionMultiModalProcessor(BaseMultiModalProcessor[AyaVisionProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -202,13 +223,13 @@ class AyaVisionMultiModalProcessor(
 
         # HF processor pops the `num_patches` kwarg, which is needed by vLLM
         if (images := mm_data.get("images")) is not None:
-            parsed_images = (self._get_data_parser().parse_mm_data({
-                "image":
-                images
-            }).get_items("image", ImageProcessorItems))
+            parsed_images = (
+                self._get_data_parser()
+                .parse_mm_data({"image": images})
+                .get_items("image", ImageProcessorItems)
+            )
             image_sizes = [
-                parsed_images.get_image_size(i)
-                for i in range(len(parsed_images))
+                parsed_images.get_image_size(i) for i in range(len(parsed_images))
             ]
 
             num_patches = [
@@ -217,7 +238,8 @@ class AyaVisionMultiModalProcessor(
                     image_height=image_size.height,
                     size=image_processor.size,
                     min_patches=image_processor.min_patches,
-                    max_patches=image_processor.max_patches)
+                    max_patches=image_processor.max_patches,
+                )
                 for image_size in image_sizes
             ]
             processed_outputs["num_patches"] = torch.tensor(num_patches)
@@ -231,8 +253,7 @@ class AyaVisionMultiModalProcessor(
     ) -> Mapping[str, MultiModalFieldConfig]:
         num_patches = hf_inputs.get("num_patches", torch.empty(0))
         return dict(
-            pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_patches),
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches),
             num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
         )
@@ -279,10 +300,10 @@ def _get_num_hidden_layers(hf_config: AyaVisionConfig) -> int:
         return _get_layer_index(feature_layers, num_hidden_layers)
     # If we have multiple feature layers, initialize up to the deepest m
     elif isinstance(feature_layers, (list, tuple)):
-        return max(
-            _get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
-    raise TypeError(f"vision_layer_feature type: {type(feature_layers)}"
-                    " is not supported")
+        return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
+    raise TypeError(
+        f"vision_layer_feature type: {type(feature_layers)} is not supported"
+    )
 
 
 def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
@@ -294,9 +315,10 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 @MULTIMODAL_REGISTRY.register_processor(
     AyaVisionMultiModalProcessor,
     info=AyaVisionProcessingInfo,
-    dummy_inputs=AyaVisionDummyInputsBuilder)
-class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP):
+    dummy_inputs=AyaVisionDummyInputsBuilder,
+)
+class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -305,10 +327,11 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -328,7 +351,8 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             config.vision_config,
             quant_config,
             num_hidden_layers_override=num_hidden_layers,
-            prefix=maybe_prefix(prefix, "vision_model"))
+            prefix=maybe_prefix(prefix, "vision_model"),
+        )
         self.vocab_size = config.text_config.vocab_size
         self.multi_modal_projector = AyaVisionMultiModalProjector(config)
         self.language_model = init_vllm_registered_model(
@@ -336,14 +360,14 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             hf_config=config.text_config,
             prefix=maybe_prefix(prefix, "model"),
             # Cohere2ForCausalLM and CohereForCausalLM are the same on vllm
-            architectures=["Cohere2ForCausalLM"])
+            architectures=["Cohere2ForCausalLM"],
+        )
 
     @property
     def dtype(self):
         return next(self.parameters()).dtype
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -351,44 +375,27 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         self,
         vision_tower: SiglipVisionModel,
         pixel_values: torch.Tensor,
-        **kwargs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
-        target_dtype: torch.dtype = \
-            vision_tower.get_input_embeddings().weight.dtype
-        image_features: Union[torch.Tensor, tuple[torch.Tensor, ...]] = \
-            vision_tower(pixel_values.to(dtype=target_dtype), **kwargs)
-
-        def select_features(leaf: torch.Tensor):
-            return self._select_image_features(
-                leaf,
-                strategy=self.config.vision_feature_select_strategy,
-            )
-
-        return json_map_leaves(select_features, image_features)
-
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+        return vision_tower(
+            pixel_values.to(dtype=vision_tower.dtype),
+            feature_select_strategy=self.config.vision_feature_select_strategy,
+        )
 
-    def _process_image_input(self, image_input: AyaVisionImagePixelInputs,
-                             **kwargs) -> list[torch.Tensor]:
+    def _process_image_input(
+        self, image_input: AyaVisionImagePixelInputs, **kwargs
+    ) -> list[torch.Tensor]:
         assert self.vision_tower is not None
         pixel_values = image_input["pixel_values"]
         num_patches = image_input["num_patches"]
         image_features = self._image_pixels_to_features(
-            self.vision_tower, pixel_values=pixel_values)
+            self.vision_tower, pixel_values=pixel_values
+        )
         image_embeds = self.multi_modal_projector(image_features)
-        return [
-            e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist())
-        ]
+        return [e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist())]
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[AyaVisionImagePixelInputs]:
+        self, **kwargs: object
+    ) -> AyaVisionImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -399,60 +406,35 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return AyaVisionImagePixelInputs(
             type="pixel_values",
-            pixel_values=flatten_bn(pixel_values, concat=True),
-            num_patches=flatten_bn(num_patches, concat=True),
+            pixel_values=pixel_values,
+            num_patches=num_patches,
             resolve_bindings={
                 "h": self.config.vision_config.image_size,
                 "w": self.config.vision_config.image_size,
-            })
+            },
+        )
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input, **kwargs)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                multimodal_embeddings=multimodal_embeddings,
-                placeholder_token_id=self.config.image_token_index,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
@@ -464,5 +446,5 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index db8d0a8710471c0da91e008bea17eff259677f82..8991ef4c606b69eede6c50d23cb4665a7a1cb7d4 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -20,10 +20,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only BaiChuan model compatible with HuggingFace weights."""
+
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -32,32 +32,45 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, row_parallel_weight_loader)
+    default_weight_loader,
+    row_parallel_weight_loader,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
-    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
     base = torch.tensor(
-        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
         dtype=torch.float32,
     )
     powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
@@ -65,41 +78,47 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
 
     if closest_power_of_2 != total_num_heads:
         extra_base = torch.tensor(
-            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
             dtype=torch.float32,
         )
-        num_remaining_heads = min(closest_power_of_2,
-                                  total_num_heads - closest_power_of_2)
-        extra_powers = torch.arange(start=1,
-                                    end=1 + 2 * num_remaining_heads,
-                                    step=2,
-                                    dtype=torch.int32)
-        slopes = torch.cat(
-            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+        num_remaining_heads = min(
+            closest_power_of_2, total_num_heads - closest_power_of_2
+        )
+        extra_powers = torch.arange(
+            start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
     return slopes
 
 
 class BaiChuanMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -119,18 +138,16 @@ class BaiChuanAttention(nn.Module):
         position_embedding: str,
         rope_theta: float = 10000,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = hidden_size
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
-        )
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = num_heads
         assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
         self.head_dim = hidden_size // self.total_num_heads
         self.position_embedding = position_embedding
         self.rope_theta = rope_theta
@@ -144,12 +161,14 @@ class BaiChuanAttention(nn.Module):
             self.total_num_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.W_pack",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         # Create the alibi slopes and slice them.
         if self.position_embedding == "ALIBI":
@@ -160,12 +179,14 @@ class BaiChuanAttention(nn.Module):
             alibi_slopes = alibi_slopes[head_start:head_end].tolist()
 
             scaling = self.head_dim**-0.5
-            self.attn = Attention(self.num_heads,
-                                  self.head_dim,
-                                  scaling,
-                                  alibi_slopes=alibi_slopes,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.attn")
+            self.attn = Attention(
+                self.num_heads,
+                self.head_dim,
+                scaling,
+                alibi_slopes=alibi_slopes,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+            )
         else:
             self.rotary_emb = get_rope(
                 self.head_dim,
@@ -174,12 +195,14 @@ class BaiChuanAttention(nn.Module):
                 base=self.rope_theta,
             )
             self.scaling = self.head_dim**-0.5
-            self.attn = Attention(self.num_heads,
-                                  self.head_dim,
-                                  self.scaling,
-                                  cache_config=cache_config,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.attn")
+            self.attn = Attention(
+                self.num_heads,
+                self.head_dim,
+                self.scaling,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+            )
 
     def forward(
         self,
@@ -196,18 +219,18 @@ class BaiChuanAttention(nn.Module):
 
 
 class BaiChuanDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 position_embedding: str,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = BaiChuanAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -223,40 +246,38 @@ class BaiChuanDecoderLayer(nn.Module):
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class BaiChuanModel(nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -278,33 +299,31 @@ class BaiChuanModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: BaiChuanDecoderLayer(config,
-                                                position_embedding,
-                                                cache_config,
-                                                quant_config,
-                                                prefix=prefix),
+            lambda prefix: BaiChuanDecoderLayer(
+                config, position_embedding, cache_config, quant_config, prefix=prefix
+            ),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -317,15 +336,16 @@ class BaiChuanModel(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
@@ -337,7 +357,7 @@ class BaiChuanModel(nn.Module):
             if "rotary_emb.inv_freq" in name:
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -357,15 +377,13 @@ class BaiChuanModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
-                              SupportsQuant):
+class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "W_pack": ["W_pack"],
         "gate_up_proj": [
@@ -384,53 +402,57 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
-        self.lora_config = lora_config
+
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
-        self.model = BaiChuanModel(vllm_config=vllm_config,
-                                   prefix=prefix,
-                                   position_embedding=position_embedding)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.model = BaiChuanModel(
+            vllm_config=vllm_config,
+            prefix=prefix,
+            position_embedding=position_embedding,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         self.lm_head.weight.weight_loader = self.lm_head_weight_loader
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
-    def lm_head_weight_loader(self, param: nn.Parameter,
-                              loaded_weight: torch.Tensor):
+    def lm_head_weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
         # Unlike Baichuan, Baichuan2 normalizes the head weights.
         # Refer to:
         # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
@@ -454,13 +476,13 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         if config.hidden_size == 4096:  # baichuan2 7b
-            super().__init__(vllm_config=vllm_config,
-                             prefix=prefix,
-                             position_embedding="ROPE")
+            super().__init__(
+                vllm_config=vllm_config, prefix=prefix, position_embedding="ROPE"
+            )
         else:  # baichuan 13b, baichuan2 13b
-            super().__init__(vllm_config=vllm_config,
-                             prefix=prefix,
-                             position_embedding="ALIBI")
+            super().__init__(
+                vllm_config=vllm_config, prefix=prefix, position_embedding="ALIBI"
+            )
 
 
 class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):
@@ -469,6 +491,6 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):
     """
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         position_embedding="ROPE")
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, position_embedding="ROPE"
+        )
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index 82cd4a26a1baadda06f3fc6acd4d821c21401e4d..024425bb24406e74c729d5eef4aaf0db52bab5af 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -23,9 +23,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only BailingMoE model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -35,37 +35,46 @@ from transformers.configuration_utils import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class BailingAttention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ):
@@ -76,14 +85,12 @@ class BailingAttention(nn.Module):
         tp_size = get_tensor_model_parallel_world_size()
 
         assert self.total_num_heads % tp_size == 0
-        assert self.total_kv_heads % tp_size == 0
         assert self.total_num_heads >= self.total_kv_heads
 
         self.num_heads = self.total_num_heads // tp_size
-        self.head_dim = config.head_dim or (self.hidden_size //
-                                            self.total_num_heads)
+        self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads)
         self.q_size_per_rank = self.head_dim * self.num_heads
-        self.num_kv_heads = self.total_kv_heads // tp_size
+        self.num_kv_heads = max(1, self.total_kv_heads // tp_size)
         self.kv_size_per_rank = self.num_kv_heads * self.head_dim
         self.scale = self.head_dim**-0.5
         self.use_qk_norm = getattr(config, "use_qk_norm", False)
@@ -100,12 +107,16 @@ class BailingAttention(nn.Module):
         )
 
         if self.use_qk_norm:
-            self.query_layernorm = (RMSNorm(
-                self.head_dim, eps=config.rms_norm_eps) if self.use_rmsnorm
-                                    else nn.LayerNorm(self.head_dim, eps=1e-6))
-            self.key_layernorm = (RMSNorm(
-                self.head_dim, eps=config.rms_norm_eps) if self.use_rmsnorm
-                                  else nn.LayerNorm(self.head_dim, eps=1e-6))
+            self.query_layernorm = (
+                RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+                if self.use_rmsnorm
+                else nn.LayerNorm(self.head_dim, eps=1e-6)
+            )
+            self.key_layernorm = (
+                RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+                if self.use_rmsnorm
+                else nn.LayerNorm(self.head_dim, eps=1e-6)
+            )
 
         self.dense = RowParallelLinear(
             self.total_num_heads * self.head_dim,
@@ -116,8 +127,7 @@ class BailingAttention(nn.Module):
             prefix=f"{prefix}.dense",
         )
 
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
-                                             1.0)
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
 
         self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
 
@@ -145,12 +155,10 @@ class BailingAttention(nn.Module):
         hidden_states: torch.Tensor,
         position_ids: torch.Tensor,
     ) -> torch.Tensor:
-
         qkv, _ = self.query_key_value(hidden_states)
-        q, k, v = qkv.split([
-            self.q_size_per_rank, self.kv_size_per_rank, self.kv_size_per_rank
-        ],
-                            dim=-1)
+        q, k, v = qkv.split(
+            [self.q_size_per_rank, self.kv_size_per_rank, self.kv_size_per_rank], dim=-1
+        )
 
         if self.use_qk_norm:
             q = q.view(-1, self.num_heads, self.head_dim)
@@ -169,13 +177,12 @@ class BailingAttention(nn.Module):
 
 
 class BailingMLP(nn.Module):
-
     def __init__(
         self,
         intermediate_size: int,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: Optional[bool] = True,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool | None = True,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -204,13 +211,12 @@ class BailingMLP(nn.Module):
 
 
 class BailingMoE(nn.Module):
-
     def __init__(
         self,
         intermediate_size: int,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: Optional[bool] = True,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool | None = True,
         prefix: str = "",
     ):
         super().__init__()
@@ -226,10 +232,8 @@ class BailingMoE(nn.Module):
         self.score_function = getattr(config, "score_function", None)
         self.n_group = getattr(config, "n_group", None)
         self.topk_group = getattr(config, "topk_group", None)
-        self.use_grouped_topk = (self.n_group is not None
-                                 and self.topk_group is not None)
-        self.routed_scaling_factor = getattr(config, "routed_scaling_factor",
-                                             1.0)
+        self.use_grouped_topk = self.n_group is not None and self.topk_group is not None
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
 
         router_dtype = getattr(config, "router_dtype", None)
         if router_dtype is None:
@@ -248,26 +252,45 @@ class BailingMoE(nn.Module):
 
         if getattr(config, "moe_router_enable_expert_bias", False):
             self.gate.expert_bias = nn.Parameter(
-                torch.empty((config.num_experts, ), dtype=torch.float32))
+                torch.empty((config.num_experts,), dtype=torch.float32)
+            )
         else:
             self.gate.expert_bias = None
 
-        self.correction_bias = (self.gate.expert_bias.data
-                                if self.gate.expert_bias is not None else None)
+        self.correction_bias = (
+            self.gate.expert_bias.data if self.gate.expert_bias is not None else None
+        )
 
         if self.score_function is not None:
             assert (
-                self.score_function == "softmax"
-                and self.correction_bias is None
+                self.score_function == "softmax" and self.correction_bias is None
             ) or (
-                self.score_function == "sigmoid"
-                and self.correction_bias is not None
-            ), "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"  # noqa: E501
+                self.score_function == "sigmoid" and self.correction_bias is not None
+            ), (
+                "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"  # noqa: E501
+            )
         else:
             # default value for scoring_func
             self.score_function = "softmax"
 
-        self.experts = FusedMoE(
+        if self.num_shared_experts > 0:
+            if hasattr(config, "moe_shared_expert_intermediate_size"):
+                intermediate_size = config.moe_shared_expert_intermediate_size
+            else:
+                intermediate_size = config.moe_intermediate_size
+            intermediate_size *= config.num_shared_experts
+            self.shared_experts = BailingMLP(
+                intermediate_size=intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
             num_experts=self.num_experts,
             top_k=self.top_k,
             hidden_size=self.hidden_size,
@@ -283,106 +306,89 @@ class BailingMoE(nn.Module):
             use_grouped_topk=self.use_grouped_topk,
         )
 
-        if self.num_shared_experts > 0:
-            if hasattr(config, "moe_shared_expert_intermediate_size"):
-                intermediate_size = config.moe_shared_expert_intermediate_size
-            else:
-                intermediate_size = config.moe_intermediate_size
-            intermediate_size *= config.num_shared_experts
-            self.shared_experts = BailingMLP(
-                intermediate_size=intermediate_size,
-                config=config,
-                quant_config=quant_config,
-                reduce_results=False,
-                prefix=f"{prefix}.shared_experts")
-        else:
-            self.shared_experts = None
-
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_size)
-        if self.shared_experts:
-            shared_output = self.shared_experts(hidden_states)
+
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states.to(self.router_dtype))
         router_logits = router_logits.to(hidden_states.dtype)
 
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        if self.shared_experts is not None:
+            shared_output, final_hidden_states = final_hidden_states
+        else:
+            shared_output = None
 
         final_hidden_states *= self.routed_scaling_factor
 
-        if self.shared_experts:
+        if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
 
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
         return final_hidden_states.view(num_tokens, hidden_size)
 
 
 class BailingMoeBlock(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
-        layer_idx = int(prefix.split('.')[-1])
+        layer_idx = int(prefix.split(".")[-1])
         self.config = config
         hidden_size = config.hidden_size
         intermediate_size = config.intermediate_size
 
         self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps)
-        self.attention = BailingAttention(config,
-                                          cache_config,
-                                          quant_config,
-                                          prefix=f"{prefix}.attention")
+        self.attention = BailingAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attention"
+        )
 
-        self.post_attention_layernorm = RMSNorm(hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps)
 
         # Choose MLP class based on the number of experts and layer index
         if layer_idx < config.first_k_dense_replace:
             mlp_class = BailingMLP
         else:
             mlp_class = BailingMoE
-        self.mlp = mlp_class(intermediate_size,
-                             config,
-                             quant_config,
-                             True,
-                             prefix=f"{prefix}.mlp")
+        self.mlp = mlp_class(
+            intermediate_size, config, quant_config, True, prefix=f"{prefix}.mlp"
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_ids: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.attention(
             hidden_states=hidden_states,
             position_ids=position_ids,
         )
 
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class BailingMoeModel(nn.Module):
-
     def __init__(
         self,
         *,
@@ -397,11 +403,11 @@ class BailingMoeModel(nn.Module):
         self.config = config
         self.vocab_size = config.vocab_size
         self.embed_dim = config.hidden_size
-        self.tie_word_embeddings = getattr(config, "tie_word_embeddings",
-                                           False)
+        self.tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
 
-        if get_pp_group().is_first_rank or (self.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            self.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.word_embeddings = VocabParallelEmbedding(
                 self.vocab_size,
                 self.embed_dim,
@@ -421,32 +427,33 @@ class BailingMoeModel(nn.Module):
                 quant_config=quant_config,
                 prefix=prefix,
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(self.embed_dim, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.word_embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -461,10 +468,9 @@ class BailingMoeModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         else:
             if residual is None:
                 hidden_states = self.norm(hidden_states)
@@ -473,15 +479,14 @@ class BailingMoeModel(nn.Module):
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
             num_experts=self.config.num_experts,
         )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
@@ -492,14 +497,14 @@ class BailingMoeModel(nn.Module):
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            if (hasattr(self.config, "norm_head") and self.config.norm_head
-                    and "lm_head.weight" in name):
-                loaded_weight = F.normalize(loaded_weight,
-                                            dim=0,
-                                            p=2,
-                                            eps=1e-7)
-
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            if (
+                hasattr(self.config, "norm_head")
+                and self.config.norm_head
+                and "lm_head.weight" in name
+            ):
+                loaded_weight = F.normalize(loaded_weight, dim=0, p=2, eps=1e-7)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 if "mlp.experts" in name:
@@ -549,15 +554,15 @@ class BailingMoeModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
-
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "gate_up_proj": [
@@ -577,16 +582,14 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         config = vllm_config.model_config.hf_config.get_text_config()
         vllm_config.model_config.hf_config = config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
         self.quant_config = quant_config
         self.max_position_embeddings = config.max_position_embeddings
-        self.model = BailingMoeModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "model"))
-        self.tie_word_embeddings = getattr(config, "tie_word_embeddings",
-                                           False)
+        self.model = BailingMoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
 
         if get_pp_group().is_last_rank:
             if self.tie_word_embeddings:
@@ -596,38 +599,39 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
                     config.vocab_size,
                     config.hidden_size,
                     quant_config=quant_config,
-                    prefix=f"{prefix}.lm_head",
+                    prefix=maybe_prefix(prefix, "lm_head"),
                 )
             self.logits_processor = LogitsProcessor(config.vocab_size)
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=(["lm_head."] if self.tie_word_embeddings else None),
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 4a6154dc548aa38cf6d63ae20757b224635c435c..e0a2defd5127e61c85b52f15a59b25d34ea846f2 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Bamba model."""
+
 # Added by the IBM Team, 2024
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -16,34 +16,50 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class BambaMLP(nn.Module):
-
     def __init__(
         self,
         config: BambaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -51,16 +67,20 @@ class BambaMLP(nn.Module):
             output_sizes=[config.intermediate_size] * 2,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             input_size=config.intermediate_size,
             output_size=config.hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -71,51 +91,52 @@ class BambaMLP(nn.Module):
 
 
 class BambaMixerDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: BambaConfig,
-                 layer_idx: int,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: BambaConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
-        self.mamba = MambaMixer2(hidden_size= config.hidden_size,
-                                ssm_state_size = config.mamba_d_state,
-                                conv_kernel_size = config.mamba_d_conv,
-                                intermediate_size = config.mamba_expand *\
-                                                    config.hidden_size,
-                                use_conv_bias = config.mamba_conv_bias,
-                                use_bias = config.mamba_proj_bias,
-                                n_groups=config.mamba_n_groups,
-                                num_heads=config.mamba_n_heads,
-                                head_dim=config.mamba_d_head,
-                                rms_norm_eps=config.rms_norm_eps,
-                                activation=config.hidden_act,
-                                model_config=model_config,
-                                cache_config=cache_config,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.mixer")
-
-        self.feed_forward = BambaMLP(config, quant_config=quant_config)
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
+        self.mamba = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.mamba_d_state,
+            conv_kernel_size=config.mamba_d_conv,
+            intermediate_size=config.mamba_expand * config.hidden_size,
+            use_conv_bias=config.mamba_conv_bias,
+            use_bias=config.mamba_proj_bias,
+            n_groups=config.mamba_n_groups,
+            num_heads=config.mamba_n_heads,
+            head_dim=config.mamba_d_head,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.hidden_act,
+            model_config=model_config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.feed_forward = BambaMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         output = torch.empty_like(hidden_states)
         self.mamba(hidden_states, output)
@@ -126,21 +147,19 @@ class BambaMixerDecoderLayer(nn.Module):
 
 
 class BambaAttentionDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: BambaConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.hidden_size = config.hidden_size
         tp_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = config.num_attention_heads
@@ -187,11 +206,15 @@ class BambaAttentionDecoderLayer(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        config.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config)
 
         self.attn = Attention(
             self.num_heads,
@@ -202,11 +225,11 @@ class BambaAttentionDecoderLayer(nn.Module):
             prefix=f"{prefix}.attn",
         )
 
-        self.feed_forward = BambaMLP(config, quant_config=quant_config)
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
+        self.feed_forward = BambaMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def self_attention(
         self,
@@ -226,36 +249,33 @@ class BambaAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.self_attention(
             positions=positions,
             hidden_states=hidden_states,
         )
         # Fully Connected
-        hidden_states, residual = self.pre_ff_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.pre_ff_layernorm(hidden_states, residual)
         hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual
 
 
 ALL_DECODER_LAYER_TYPES = {
     "attention": BambaAttentionDecoderLayer,
-    "mamba": BambaMixerDecoderLayer
+    "mamba": BambaMixerDecoderLayer,
 }
 
 
 @support_torch_compile
 class BambaModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -263,24 +283,19 @@ class BambaModel(nn.Module):
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
-            layer_class = ALL_DECODER_LAYER_TYPES[
-                config.layers_block_type[layer_idx]]
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[layer_idx]]
             return layer_class(
                 config,
                 layer_idx,
@@ -291,30 +306,29 @@ class BambaModel(nn.Module):
             )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-        self.final_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -330,15 +344,13 @@ class BambaModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.final_layernorm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -383,22 +395,28 @@ class BambaModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid, SupportsQuant):
+class BambaForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsQuant,
+    SupportsMambaPrefixCaching,
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
         ],
-        "gate_up_proj": ["up_proj", "down_proj"]
+        "gate_up_proj": ["up_proj", "down_proj"],
     }
 
     # LoRA specific attributes
@@ -413,7 +431,6 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -453,62 +470,53 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
-        scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "Bamba currently does not support prefix caching"
 
+        scheduler_config = vllm_config.scheduler_config
         self.quant_config = vllm_config.quant_config
 
         super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
-        self.model = BambaModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = BambaModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/bee.py b/vllm/model_executor/models/bee.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f0342df404b3df7ac4ea9aa586707362b906369
--- /dev/null
+++ b/vllm/model_executor/models/bee.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Mapping
+
+import torch
+import torch.nn as nn
+from transformers.activations import GELUActivation
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalDataDict
+
+from .llava_next import (
+    LlavaDummyInputsBuilder,
+    LlavaNextMultiModalProcessor,
+    LlavaNextProcessingInfo,
+)
+from .llava_onevision import LlavaOnevisionForConditionalGeneration
+from .utils import WeightsMapper
+
+
+class BeeProcessingInfo(LlavaNextProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(**kwargs)
+
+    def _get_num_unpadded_features(
+        self,
+        *,
+        original_height: int,
+        original_width: int,
+        npatches: int,
+        num_patch_height: int,
+        num_patch_width: int,
+    ) -> tuple[int, int]:
+        """Override to use correct max_num_patches from vision_aspect_ratio."""
+        import math
+
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
+
+        aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
+
+        if aspect_ratio > current_aspect_ratio:
+            new_height = int(
+                round(original_height * (current_width / original_width), 7)
+            )
+            padding = (current_height - new_height) // 2
+            current_height = current_height - (2 * padding)
+        else:
+            new_width = int(
+                round(original_width * (current_height / original_height), 7)
+            )
+            padding = (current_width - new_width) // 2
+            current_width = current_width - (2 * padding)
+
+        unpadded_features = current_height * current_width
+        newline_features = current_height
+
+        # Get max_num_patches from vision_aspect_ratio config
+        hf_config = self.get_hf_config()
+        vision_aspect_ratio = getattr(hf_config, "vision_aspect_ratio", "anyres_max_9")
+        max_num_patches = int(vision_aspect_ratio.replace("anyres_max_", ""))
+
+        ratio = math.sqrt(
+            current_height * current_width / (max_num_patches * npatches**2)
+        )
+        if ratio > 1.1:
+            height_factor = int(current_height // ratio)
+            width_factor = int(current_width // ratio)
+            unpadded_features = height_factor * width_factor
+            newline_features = height_factor
+
+        return (unpadded_features, newline_features)
+
+
+class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        image_token = "<image>"
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+        }
+
+
+class BeeMultiModalProjector(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=1e-06)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size,
+            config.text_config.hidden_size * 4,
+            bias=True,
+        )
+        self.act = GELUActivation()
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size * 4,
+            config.text_config.hidden_size,
+            bias=True,
+        )
+
+    def forward(self, image_feature: torch.Tensor) -> torch.Tensor:
+        image_feature = self.pre_norm(image_feature)
+        hidden_states = self.linear_1(image_feature)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaNextMultiModalProcessor,
+    info=BeeProcessingInfo,
+    dummy_inputs=BeeDummyInputsBuilder,
+)
+class BeeForConditionalGeneration(LlavaOnevisionForConditionalGeneration):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers
+            # v4.55
+            "model.language_model.": "language_model.model.",
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+            "model.image_newline": "image_newline",
+            "lm_head.": "language_model.lm_head.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        config = vllm_config.model_config.hf_config
+        self.multi_modal_projector = BeeMultiModalProjector(config)
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index ee32587f6b1b4f98021113f7a7adb01ddbcac529..2679448bce77580d278af917ec20d81b537a93a0 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Set
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -13,17 +12,21 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.pooler import (ClassifierPooler,
-                                               DispatchPooler, Pooler,
-                                               PoolingMethod,
-                                               PoolingParamsUpdate,
-                                               PoolingType)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.pooler import (
+    ClassifierPooler,
+    DispatchPooler,
+    Pooler,
+    PoolingMethod,
+    PoolingParamsUpdate,
+    PoolingType,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import PoolingTask
 from vllm.v1.pool.metadata import PoolingMetadata
@@ -34,19 +37,19 @@ from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 
 class BertEmbedding(nn.Module):
-
     def __init__(self, config: BertConfig):
-
         super().__init__()
         self.size = config.hidden_size
-        self.word_embeddings = VocabParallelEmbedding(config.vocab_size,
-                                                      config.hidden_size)
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         self.position_embeddings = VocabParallelEmbedding(
-            config.max_position_embeddings, config.hidden_size)
+            config.max_position_embeddings, config.hidden_size
+        )
         self.token_type_embeddings = VocabParallelEmbedding(
-            config.type_vocab_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
         self.register_buffer(
             "position_ids",
@@ -54,18 +57,21 @@ class BertEmbedding(nn.Module):
         )
         self.position_embedding_type = config.position_embedding_type
         if self.position_embedding_type != "absolute":
-            raise ValueError("Only 'absolute' position_embedding_type" +
-                             " is supported")
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         token_type_ids = _decode_token_type_ids(input_ids)
 
-        inputs_embeds = self.word_embeddings(input_ids)
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
         position_embeddings = self.position_embeddings(position_ids)
 
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
@@ -76,7 +82,6 @@ class BertEmbedding(nn.Module):
 
 
 class BertPooler(Pooler):
-
     def __init__(self, config: BertConfig):
         super().__init__()
 
@@ -97,9 +102,9 @@ class BertPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         pooled_output = self.pooling(hidden_states, pooling_metadata)
 
         if isinstance(pooled_output, list):
@@ -111,19 +116,22 @@ class BertPooler(Pooler):
 
 
 class BertEncoder(nn.Module):
-
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.layer = nn.ModuleList([
-            BertLayer(config=config,
-                      cache_config=cache_config,
-                      quant_config=quant_config,
-                      prefix=f"{prefix}.layer.{layer_idx}")
-            for layer_idx in range(config.num_hidden_layers)
-        ])
+        self.layer = nn.ModuleList(
+            [
+                BertLayer(
+                    config=config,
+                    cache_config=cache_config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -135,12 +143,13 @@ class BertEncoder(nn.Module):
 
 
 class BertLayer(nn.Module):
-
-    def __init__(self,
-                 config: BertConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: BertConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         self.attention = BertAttention(
@@ -149,20 +158,24 @@ class BertLayer(nn.Module):
             layer_norm_eps=config.layer_norm_eps,
             cache_config=cache_config,
             quant_config=quant_config,
-            prefix=f"{prefix}.attention")
+            prefix=f"{prefix}.attention",
+        )
 
         self.intermediate = BertIntermediate(
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
-            prefix=f"{prefix}.intermediate")
+            prefix=f"{prefix}.intermediate",
+        )
 
-        self.output = BertOutput(hidden_size=config.hidden_size,
-                                 intermediate_size=config.intermediate_size,
-                                 layer_norm_eps=config.layer_norm_eps,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.output")
+        self.output = BertOutput(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
 
     def forward(self, hidden_states: torch.Tensor):
         attn_output = self.attention(hidden_states)
@@ -172,28 +185,31 @@ class BertLayer(nn.Module):
 
 
 class BertAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_attention_heads: int,
         layer_norm_eps: float,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
 
-        self.self = BertSelfAttention(hidden_size=hidden_size,
-                                      num_attention_heads=num_attention_heads,
-                                      cache_config=cache_config,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.output")
+        self.self = BertSelfAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
 
-        self.output = BertSelfOutput(hidden_size=hidden_size,
-                                     layer_norm_eps=layer_norm_eps,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.output")
+        self.output = BertSelfOutput(
+            hidden_size=hidden_size,
+            layer_norm_eps=layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
 
     def forward(
         self,
@@ -204,13 +220,12 @@ class BertAttention(nn.Module):
 
 
 class BertSelfAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_attention_heads: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -237,15 +252,18 @@ class BertSelfAttention(nn.Module):
             total_num_kv_heads=self.total_num_kv_heads,
             bias=True,
             quant_config=quant_config,
-            prefix=f"{prefix}.qkv_proj")
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.attn = EncoderOnlyAttention(num_heads=self.num_heads,
-                                         head_size=self.head_dim,
-                                         scale=self.scaling,
-                                         num_kv_heads=self.num_kv_heads,
-                                         cache_config=cache_config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.attn")
+        self.attn = EncoderOnlyAttention(
+            num_heads=self.num_heads,
+            head_size=self.head_dim,
+            scale=self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -258,41 +276,48 @@ class BertSelfAttention(nn.Module):
 
 
 class BertSelfOutput(nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 layer_norm_eps: float,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        layer_norm_eps: float,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
-        self.dense = RowParallelLinear(input_size=hidden_size,
-                                       output_size=hidden_size,
-                                       bias=True,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.dense")
+        self.dense = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
         self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
 
-    def forward(self, hidden_states: torch.Tensor,
-                input_tensor: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
         hidden_states, _ = self.dense(hidden_states)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
 
 
 class BertIntermediate(nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 hidden_act: str,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
-        self.dense = ColumnParallelLinear(input_size=hidden_size,
-                                          output_size=intermediate_size,
-                                          bias=True,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.dense")
+        self.dense = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
         self.intermediate_act_fn = get_act_fn(hidden_act)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -302,25 +327,29 @@ class BertIntermediate(nn.Module):
 
 
 class BertOutput(nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 layer_norm_eps: float,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_norm_eps: float,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
-        self.dense = RowParallelLinear(input_size=intermediate_size,
-                                       output_size=hidden_size,
-                                       bias=True,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.dense")
+        self.dense = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
 
         self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
 
-    def forward(self, hidden_states: torch.Tensor,
-                input_tensor: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
         hidden_states, _ = self.dense(hidden_states)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
@@ -329,7 +358,6 @@ class BertOutput(nn.Module):
 @support_torch_compile
 @default_pooling_type("CLS")
 class BertModel(nn.Module, SupportsQuant):
-
     is_pooling_model = True
 
     packed_modules_mapping = {"qkv_proj": ["query", "key", "value"]}
@@ -345,21 +373,24 @@ class BertModel(nn.Module, SupportsQuant):
 
         self.config = vllm_config.model_config.hf_config
         self.embeddings = embedding_class(self.config)
-        self.encoder = BertEncoder(vllm_config=vllm_config,
-                                   prefix=f"{prefix}.encoder")
+        self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder")
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings.word_embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        if inputs_embeds is not None:
-            hidden_states = inputs_embeds
-        else:
-            hidden_states = self.embeddings(input_ids=input_ids,
-                                            position_ids=positions)
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+            inputs_embeds=inputs_embeds,
+        )
+
         return self.encoder(hidden_states)
 
     def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
@@ -374,7 +405,7 @@ class BertModel(nn.Module, SupportsQuant):
         other_weights = []
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
 
@@ -392,8 +423,7 @@ class BertModel(nn.Module, SupportsQuant):
 
         return other_weights, loaded_stacked_params
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         other_weights, loaded_stacked_params = self._load_weights(weights)
 
         loader = AutoWeightsLoader(self, skip_prefixes=["pooler."])
@@ -404,7 +434,6 @@ class BertModel(nn.Module, SupportsQuant):
 
 @default_pooling_type("ALL")
 class BertPoolingModel(BertModel):
-
     is_pooling_model = True
 
     def __init__(
@@ -423,8 +452,7 @@ class BertPoolingModel(BertModel):
         config = vllm_config.model_config.hf_config
         self.pooler = BertPooler(config)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         other_weights, loaded_stacked_params = self._load_weights(weights)
 
         loader = AutoWeightsLoader(self)
@@ -453,45 +481,50 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.model = self._build_model(vllm_config=vllm_config,
-                                       prefix=maybe_prefix(prefix, "model"))
+        self.model = self._build_model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.pooler = self._build_pooler(pooler_config)
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        return self.model(input_ids=input_ids,
-                          positions=positions,
-                          inputs_embeds=inputs_embeds,
-                          intermediate_tensors=intermediate_tensors)
+        return self.model(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         weights_list = list(weights)
 
-        has_model_prefix = any(
-            name.startswith("model.") for name, _ in weights_list)
+        has_model_prefix = any(name.startswith("model.") for name, _ in weights_list)
         if not has_model_prefix:
             mapper = WeightsMapper(orig_to_new_prefix={"": "model."})
 
         loader = AutoWeightsLoader(self, skip_prefixes=["lm_head."])
         return loader.load_weights(weights_list, mapper=mapper)
 
-    def _build_model(self,
-                     vllm_config: VllmConfig,
-                     prefix: str = "") -> BertModel:
-        return BertModel(vllm_config=vllm_config,
-                         prefix=prefix,
-                         embedding_class=BertEmbedding)
+    def _build_model(self, vllm_config: VllmConfig, prefix: str = "") -> BertModel:
+        return BertModel(
+            vllm_config=vllm_config, prefix=prefix, embedding_class=BertEmbedding
+        )
 
     def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
-        return DispatchPooler({
-            "encode": Pooler.for_encode(pooler_config),
-            "embed": Pooler.for_embed(pooler_config),
-        })
+        return DispatchPooler(
+            {
+                "token_embed": Pooler.for_token_embed(pooler_config),
+                "embed": Pooler.for_embed(pooler_config),
+            }
+        )
 
 
 # Here we encode the token type ids together with the input ids.
@@ -518,18 +551,18 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
 TOKEN_TYPE_SHIFT = 30
 
 
-def _encode_token_type_ids(input_ids: torch.Tensor,
-                           token_type_ids: torch.Tensor) -> None:
+def _encode_token_type_ids(
+    input_ids: torch.Tensor, token_type_ids: torch.Tensor
+) -> None:
     # input_ids can be padded to the right
-    input_ids[:token_type_ids.shape[0]].bitwise_or_(
-        token_type_ids << TOKEN_TYPE_SHIFT)
+    input_ids[: token_type_ids.shape[0]].bitwise_or_(token_type_ids << TOKEN_TYPE_SHIFT)
 
 
 def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
-
-    ids_mask = torch.ones_like(input_ids,
-                               dtype=torch.int32,
-                               device=input_ids.device) << TOKEN_TYPE_SHIFT
+    ids_mask = (
+        torch.ones_like(input_ids, dtype=torch.int32, device=input_ids.device)
+        << TOKEN_TYPE_SHIFT
+    )
     tokens_mask = ids_mask.bitwise_not()
 
     token_type_ids = input_ids.bitwise_and(ids_mask) >> TOKEN_TYPE_SHIFT
@@ -539,18 +572,231 @@ def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
     return token_type_ids
 
 
+class BertMLMHead(nn.Module):
+    def __init__(
+        self, hidden_size: int, vocab_size: int, layer_norm_eps: float = 1e-12
+    ):
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, hidden_size)
+        self.activation = nn.GELU()
+        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+        self.decoder = nn.Linear(hidden_size, vocab_size, bias=True)
+
+    def tie_weights_with_embeddings(self, embeddings_weight: torch.Tensor):
+        self.decoder.weight = embeddings_weight
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        x = self.dense(hidden_states)
+        x = self.activation(x)
+        x = self.layer_norm(x)
+        logits = self.decoder(x)
+        return logits
+
+
+class SPLADESparsePooler(Pooler):
+    """
+    SPLADE sparse pooling:
+    logits = mlm_head(hidden_states)
+            -> log1p(relu(logits))
+            -> (max|sum over L)
+            -> [V]
+
+    Padding is masked with an attention mask,
+    [CLS]/[SEP] is removed (selected),
+    and then pooled.
+    """
+
+    def __init__(
+        self,
+        mlm_head: nn.Module,
+        cls_token_id: int | None = 101,
+        sep_token_id: int | None = 102,
+        pooling: str = "max",
+        remove_cls_sep: bool = True,
+    ):
+        super().__init__()
+        assert pooling in ("max", "sum")
+        self.mlm_head = mlm_head
+        self.cls_token_id = cls_token_id
+        self.sep_token_id = sep_token_id
+        self.pooling = pooling
+        self.remove_cls_sep = remove_cls_sep
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"embed"}
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate(requires_token_ids=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> torch.Tensor:
+        assert isinstance(hidden_states, torch.Tensor) and hidden_states.dim() == 2
+
+        lens_tensor: torch.Tensor = pooling_metadata.prompt_lens
+        lens: list[int] = lens_tensor.tolist()
+        B: int = len(lens)
+
+        token_ids = pooling_metadata.prompt_token_ids
+        offset = 0
+        pooled_list: list[torch.Tensor] = []
+
+        for i in range(B):
+            L = int(lens[i])
+            hs = hidden_states[offset : offset + L]
+
+            start_idx = 0
+            end_idx = L
+            if self.remove_cls_sep and token_ids is not None:
+                if (
+                    self.cls_token_id is not None
+                    and token_ids[i, 0].item() == self.cls_token_id
+                ):
+                    start_idx = 1
+                if (
+                    self.sep_token_id is not None
+                    and token_ids[i, L - 1].item() == self.sep_token_id
+                ):
+                    end_idx = max(start_idx, L - 1)
+
+            if end_idx <= start_idx:
+                V = int(self.mlm_head.decoder.out_features)
+                pooled_list.append(hs.new_zeros((V,)))
+                offset += L
+                continue
+
+            logits_i = self.mlm_head(hs[start_idx:end_idx])
+            scores_i = torch.log1p(torch.relu(logits_i))
+
+            if self.pooling == "sum":
+                pooled_i = scores_i.sum(dim=0)
+            else:  # "max"
+                pooled_i = scores_i.max(dim=0).values
+
+            pooled_list.append(pooled_i.contiguous())
+            offset += L
+
+        return torch.stack(pooled_list, dim=0).contiguous()
+
+
+@default_pooling_type("CLS")
+class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
+    """
+    BertEmbeddingModel + SPLADE sparse embedding.
+    - Make logits by self.mlm_head
+    - pooler: SPLADESparsePooler(mlm_head...)
+    """
+
+    def __init__(
+        self, *, vllm_config: VllmConfig, prefix: str = "", splade_pooling: str = "max"
+    ):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        cfg = vllm_config.model_config.hf_config
+
+        # MLM head
+        self.mlm_head = BertMLMHead(
+            hidden_size=cfg.hidden_size,
+            vocab_size=cfg.vocab_size,
+            layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
+        )
+
+        self._splade_pooling = splade_pooling
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        self.pooler = self._build_pooler(pooler_config)
+
+    def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
+        cfg = self.model.config
+
+        if not hasattr(self, "mlm_head"):
+            self.mlm_head = BertMLMHead(
+                hidden_size=cfg.hidden_size,
+                vocab_size=cfg.vocab_size,
+                layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
+            )
+
+        pooling_mode = getattr(self, "_splade_pooling", "max")
+
+        cls_id = getattr(cfg, "cls_token_id", None)
+        sep_id = getattr(cfg, "sep_token_id", None)
+
+        return DispatchPooler(
+            {
+                "token_embed": Pooler.for_token_embed(pooler_config),
+                "embed": SPLADESparsePooler(
+                    mlm_head=self.mlm_head,
+                    cls_token_id=cls_id,
+                    sep_token_id=sep_id,
+                    pooling=pooling_mode,  # "max" or "sum"
+                    remove_cls_sep=True,
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        if not hasattr(self, "mlm_head"):
+            cfg = self.model.config
+            self.mlm_head = BertMLMHead(
+                hidden_size=cfg.hidden_size,
+                vocab_size=cfg.vocab_size,
+                layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
+            )
+
+        def _strip(name: str) -> str:
+            for p in ("model.", "bert."):
+                if name.startswith(p):
+                    name = name[len(p) :]
+            return name
+
+        weights_list = list(weights)
+        model_side: list[tuple[str, torch.Tensor]] = []
+        mlm_side: list[tuple[str, torch.Tensor]] = []
+
+        for k, w in weights_list:
+            name = _strip(k)
+            if name.startswith("cls.predictions."):
+                mlm_side.append((name, w))
+            else:
+                model_side.append((name, w))
+
+        loaded: set[str] = set()
+        loaded_model = self.model.load_weights(model_side)
+        loaded.update({"model." + n for n in loaded_model})
+
+        if mlm_side:
+            name_map = {
+                "cls.predictions.transform.dense.weight": "mlm_head.dense.weight",
+                "cls.predictions.transform.dense.bias": "mlm_head.dense.bias",
+                ("cls.predictions.transform.LayerNorm.weight"): (
+                    "mlm_head.layer_norm.weight"
+                ),
+                ("cls.predictions.transform.LayerNorm.bias"): (
+                    "mlm_head.layer_norm.bias"
+                ),
+                "cls.predictions.decoder.weight": "mlm_head.decoder.weight",
+                "cls.predictions.decoder.bias": "mlm_head.decoder.bias",
+            }
+            remapped = [(name_map[n], w) for n, w in mlm_side if n in name_map]
+            if remapped:
+                loaded_mlm = AutoWeightsLoader(self).load_weights(remapped)
+                loaded.update(loaded_mlm)
+
+        return loaded
+
+
 @default_pooling_type("CLS")
-class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
-                                    SupportsQuant):
+class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQuant):
     """A model that uses Bert to provide embedding functionalities.
 
-   This class encapsulates the BertModel and provides an interface for
-   embedding operations and customized pooling functions.
+    This class encapsulates the BertModel and provides an interface for
+    embedding operations and customized pooling functions.
 
-   Attributes:
-       model: An instance of BertModel used for forward operations.
-       _pooler: An instance of Pooler used for pooling operations.
-   """
+    Attributes:
+        model: An instance of BertModel used for forward operations.
+        _pooler: An instance of Pooler used for pooling operations.
+    """
 
     is_pooling_model = True
 
@@ -559,34 +805,38 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
         config = vllm_config.model_config.hf_config
 
         self.num_labels = config.num_labels
-        self.bert = BertPoolingModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "bert"),
-                                     embedding_class=BertEmbedding)
-        self.classifier = nn.Linear(config.hidden_size,
-                                    config.num_labels,
-                                    dtype=vllm_config.model_config.head_dtype)
+        self.bert = BertPoolingModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "bert"),
+            embedding_class=BertEmbedding,
+        )
+        self.classifier = nn.Linear(
+            config.hidden_size,
+            config.num_labels,
+            dtype=vllm_config.model_config.head_dtype,
+        )
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            ClassifierPooler(
-                pooling=self.bert.pooler,
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                    vllm_config.model_config),
-            ),
-            "score":
-            ClassifierPooler(
-                pooling=self.bert.pooler,
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                    vllm_config.model_config),
-            ),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
+                "classify": ClassifierPooler(
+                    pooling=self.bert.pooler,
+                    classifier=self.classifier,
+                    act_fn="classify",
+                ),
+                "score": ClassifierPooler(
+                    pooling=self.bert.pooler, classifier=self.classifier, act_fn="score"
+                ),
+            }
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.bert.embed_input_ids(input_ids)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
@@ -595,22 +845,23 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         if token_type_ids is not None:
             assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
             assert input_ids is not None
             _encode_token_type_ids(input_ids, token_type_ids)
 
-        return self.bert(input_ids=input_ids,
-                         positions=positions,
-                         inputs_embeds=inputs_embeds,
-                         intermediate_tensors=intermediate_tensors)
+        return self.bert(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
 
 
 @default_pooling_type("ALL")
@@ -622,20 +873,28 @@ class BertForTokenClassification(nn.Module):
         config = vllm_config.model_config.hf_config
         self.head_dtype = vllm_config.model_config.head_dtype
         self.num_labels = config.num_labels
-        self.bert = BertModel(vllm_config=vllm_config,
-                              prefix=maybe_prefix(prefix, "bert"),
-                              embedding_class=BertEmbedding)
-        self.classifier = nn.Linear(config.hidden_size,
-                                    config.num_labels,
-                                    dtype=self.head_dtype)
+        self.bert = BertModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "bert"),
+            embedding_class=BertEmbedding,
+        )
+        self.classifier = nn.Linear(
+            config.hidden_size, config.num_labels, dtype=self.head_dtype
+        )
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config=pooler_config
+                ),
+            }
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.bert.embed_input_ids(input_ids)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
@@ -644,22 +903,23 @@ class BertForTokenClassification(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         if token_type_ids is not None:
             assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
             assert input_ids is not None
             _encode_token_type_ids(input_ids, token_type_ids)
 
-        hidden_states = self.bert(input_ids=input_ids,
-                                  positions=positions,
-                                  inputs_embeds=inputs_embeds,
-                                  intermediate_tensors=intermediate_tensors)
+        hidden_states = self.bert(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
 
         hidden_states = hidden_states.to(self.head_dtype)
         return self.classifier(hidden_states)
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index bfc1408ddf880eba54792662354c5fd119586031..131cb68914cf35613c217d265de49d86275d5df8 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -10,25 +9,30 @@ from transformers import PretrainedConfig
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
-from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
-                                                   get_act_fn)
-from vllm.model_executor.layers.fused_moe import (activation_without_mul,
-                                                  fused_topk)
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn
+from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
-                                              maybe_prefix)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    maybe_prefix,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
@@ -40,29 +44,29 @@ from .interfaces_base import default_pooling_type
 
 
 class BertWithRopeEmbedding(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
-
         super().__init__()
         if config.position_embedding_type not in ["rope", "rotary"]:
-            raise ValueError("Only 'rotary'('rope') position_embedding_type" +
-                             " is supported")
+            raise ValueError(
+                "Only 'rotary'('rope') position_embedding_type" + " is supported"
+            )
 
-        self.word_embeddings = VocabParallelEmbedding(config.vocab_size,
-                                                      config.hidden_size)
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         if config.type_vocab_size > 0:
             self.token_type_embeddings = VocabParallelEmbedding(
-                config.type_vocab_size, config.hidden_size)
+                config.type_vocab_size, config.hidden_size
+            )
         else:
             self.token_type_embeddings = None
 
-        self.LayerNorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(
         self,
         input_ids: torch.Tensor,
-        token_type_ids: Optional[torch.Tensor] = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         input_shape = input_ids.size()
         inputs_embeds = self.word_embeddings(input_ids)
@@ -70,9 +74,9 @@ class BertWithRopeEmbedding(nn.Module):
         embeddings = inputs_embeds
         if self.token_type_embeddings is not None:
             if token_type_ids is None:
-                token_type_ids = torch.zeros(input_shape,
-                                             dtype=torch.long,
-                                             device=inputs_embeds.device)
+                token_type_ids = torch.zeros(
+                    input_shape, dtype=torch.long, device=inputs_embeds.device
+                )
 
             token_type_embeddings = self.token_type_embeddings(token_type_ids)
             embeddings += token_type_embeddings
@@ -82,15 +86,14 @@ class BertWithRopeEmbedding(nn.Module):
 
 
 class BertWithRopeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_attention_heads: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = True,
-        rotary_kwargs: Optional[dict] = None,
+        rotary_kwargs: dict | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -119,23 +122,28 @@ class BertWithRopeAttention(nn.Module):
             total_num_kv_heads=self.total_num_kv_heads,
             bias=bias,
             quant_config=quant_config,
-            prefix=f"{prefix}.qkv_proj")
+            prefix=f"{prefix}.qkv_proj",
+        )
 
         self.rotary_emb = get_rope(**rotary_kwargs)
 
-        self.attn = EncoderOnlyAttention(num_heads=self.num_heads,
-                                         head_size=self.head_dim,
-                                         scale=self.scaling,
-                                         num_kv_heads=self.num_kv_heads,
-                                         cache_config=cache_config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.attn")
+        self.attn = EncoderOnlyAttention(
+            num_heads=self.num_heads,
+            head_size=self.head_dim,
+            scale=self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
-        self.out_proj = RowParallelLinear(input_size=hidden_size,
-                                          output_size=hidden_size,
-                                          bias=bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.dense")
+        self.out_proj = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
 
     def forward(
         self,
@@ -151,14 +159,15 @@ class BertWithRopeAttention(nn.Module):
 
 
 class BertWithRopeGatedMLP(nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 hidden_act: str,
-                 bias: bool = True,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        bias: bool = True,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.act_fn = get_act_and_mul_fn(hidden_act)
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -168,11 +177,13 @@ class BertWithRopeGatedMLP(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
         )
-        self.down_proj = RowParallelLinear(input_size=intermediate_size,
-                                           output_size=hidden_size,
-                                           bias=bias,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         gate_up, _ = self.gate_up_proj(hidden_states)
@@ -182,26 +193,31 @@ class BertWithRopeGatedMLP(nn.Module):
 
 
 class BertWithRopeMLP(nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 hidden_act: str,
-                 bias: bool = True,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        bias: bool = True,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.act_fn = get_act_fn(hidden_act)
-        self.up_proj = ColumnParallelLinear(input_size=hidden_size,
-                                            output_size=intermediate_size,
-                                            bias=bias,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.up_proj")
-        self.down_proj = RowParallelLinear(input_size=intermediate_size,
-                                           output_size=hidden_size,
-                                           bias=bias,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+        self.up_proj = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.up_proj(hidden_states)
@@ -211,7 +227,6 @@ class BertWithRopeMLP(nn.Module):
 
 
 class NomicMoE(nn.Module):
-
     def __init__(
         self,
         num_experts: int,
@@ -219,8 +234,8 @@ class NomicMoE(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        params_dtype: Optional[torch.dtype] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        tp_size: int | None = None,
     ):
         super().__init__()
 
@@ -236,28 +251,40 @@ class NomicMoE(nn.Module):
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
 
-        self.router = ReplicatedLinear(self.hidden_size,
-                                       self.num_total_experts,
-                                       bias=False)
+        self.router = ReplicatedLinear(
+            self.hidden_size, self.num_total_experts, bias=False
+        )
         self.w1 = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        self.intermediate_size,
-                        self.hidden_size,
-                        device=current_platform.device_type,
-                        dtype=self.params_dtype))
+            torch.empty(
+                self.num_total_experts,
+                self.intermediate_size,
+                self.hidden_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
         self.w2 = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        self.hidden_size,
-                        self.intermediate_size,
-                        device=current_platform.device_type,
-                        dtype=self.params_dtype))
+            torch.empty(
+                self.num_total_experts,
+                self.hidden_size,
+                self.intermediate_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
         self.bias = nn.Parameter(torch.zeros(self.hidden_size))
-        set_weight_attrs(self.w1, {
-            "weight_loader": self.weight_loader,
-        })
-        set_weight_attrs(self.w2, {
-            "weight_loader": self.weight_loader,
-        })
+        set_weight_attrs(
+            self.w1,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+        set_weight_attrs(
+            self.w2,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
 
     def weight_loader(
         self,
@@ -293,10 +320,9 @@ class NomicMoE(nn.Module):
         # FIXME(Isotr0py): This implementation is too tricky,
         # we should use FusedMoE instead in the future
         # after supporting ungated activation for it.
-        topk_weights, topk_ids, _ = fused_topk(hidden_states,
-                                               router_logits,
-                                               self.top_k,
-                                               renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(
+            hidden_states, router_logits, self.top_k, renormalize=False
+        )
 
         final_hidden_states = torch.ops.vllm.outplace_fused_experts(
             hidden_states=hidden_states,
@@ -308,22 +334,22 @@ class NomicMoE(nn.Module):
         )
 
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_size) + self.bias
 
 
 class BertWithRopeBlock(nn.Module):
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 moe: bool = False,
-                 bias: bool = True,
-                 rotary_kwargs: Optional[dict] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        moe: bool = False,
+        bias: bool = True,
+        rotary_kwargs: dict | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.attn = BertWithRopeAttention(
             hidden_size=config.hidden_size,
@@ -332,14 +358,17 @@ class BertWithRopeBlock(nn.Module):
             quant_config=quant_config,
             bias=bias,
             rotary_kwargs=rotary_kwargs,
-            prefix=f"{prefix}.attention")
+            prefix=f"{prefix}.attention",
+        )
 
         if moe:
-            self.mlp = NomicMoE(num_experts=config.num_experts,
-                                top_k=config.moe_top_k,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.intermediate_size,
-                                hidden_act=config.hidden_act)
+            self.mlp = NomicMoE(
+                num_experts=config.num_experts,
+                top_k=config.moe_top_k,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+            )
         else:
             if config.hidden_act in ["silu", "geglu"]:
                 self.mlp = BertWithRopeGatedMLP(
@@ -348,7 +377,8 @@ class BertWithRopeBlock(nn.Module):
                     hidden_act=config.hidden_act,
                     bias=bias,
                     quant_config=quant_config,
-                    prefix=f"{prefix}.mlp")
+                    prefix=f"{prefix}.mlp",
+                )
             else:
                 self.mlp = BertWithRopeMLP(
                     hidden_size=config.hidden_size,
@@ -356,12 +386,11 @@ class BertWithRopeBlock(nn.Module):
                     hidden_act=config.hidden_act,
                     bias=bias,
                     quant_config=quant_config,
-                    prefix=f"{prefix}.mlp")
+                    prefix=f"{prefix}.mlp",
+                )
 
-        self.attn_ln = nn.LayerNorm(config.hidden_size,
-                                    eps=config.layer_norm_eps)
-        self.mlp_ln = nn.LayerNorm(config.hidden_size,
-                                   eps=config.layer_norm_eps)
+        self.attn_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, positions: torch.Tensor, hidden_states: torch.Tensor):
         attn_output = self.attn(positions, hidden_states)
@@ -372,27 +401,32 @@ class BertWithRopeBlock(nn.Module):
 
 
 class BertWithRopeEncoder(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 bias: bool = True,
-                 rotary_kwargs: Optional[dict] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        bias: bool = True,
+        rotary_kwargs: dict | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         every_n = getattr(config, "moe_every_n_layers", 0)
-        self.layers = nn.ModuleList([
-            BertWithRopeBlock(config=config,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              bias=bias,
-                              moe=every_n > 0 and (layer_idx % every_n == 1),
-                              rotary_kwargs=rotary_kwargs,
-                              prefix=f"{prefix}.layer.{layer_idx}")
-            for layer_idx in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                BertWithRopeBlock(
+                    config=config,
+                    cache_config=cache_config,
+                    quant_config=quant_config,
+                    bias=bias,
+                    moe=every_n > 0 and (layer_idx % every_n == 1),
+                    rotary_kwargs=rotary_kwargs,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -409,11 +443,13 @@ class BertWithRopeEncoder(nn.Module):
 class BertWithRope(nn.Module, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
 
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 add_pooling_layer: bool = False):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        add_pooling_layer: bool = False,
+    ):
         super().__init__()
         self.vllm_config = vllm_config
         self.add_pooling_layer = add_pooling_layer
@@ -423,26 +459,30 @@ class BertWithRope(nn.Module, SupportsQuant):
             vllm_config=vllm_config,
             bias=getattr(self.config, "bias", True),
             rotary_kwargs=self.config.rotary_kwargs,
-            prefix=f"{prefix}.encoder")
+            prefix=f"{prefix}.encoder",
+        )
         self.pooler = BertPooler(self.config) if add_pooling_layer else None
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
             hidden_states = inputs_embeds
         else:
-            hidden_states = self.embeddings(input_ids=input_ids,
-                                            token_type_ids=token_type_ids)
+            hidden_states = self.embeddings(
+                input_ids=input_ids, token_type_ids=token_type_ids
+            )
         return self.encoder(positions, hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         weights = self.hf_to_vllm_mapper.apply(weights)
 
         if self.config.hidden_act in ["silu", "geglu"]:
@@ -459,7 +499,7 @@ class BertWithRope(nn.Module, SupportsQuant):
         for name, loaded_weight in weights:
             if not self.add_pooling_layer and "pooler" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -475,8 +515,7 @@ class BertWithRope(nn.Module, SupportsQuant):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if name.endswith((".w1", ".w2")):
                     # Nomic-MoE has fused experts weights
                     weight_loader(param, loaded_weight, name)
@@ -503,7 +542,8 @@ class NomicBertModel(BertWithRope):
             "experts.mlp.": "",
             "experts.": "",
             "router.layer": "router",
-        })
+        }
+    )
 
 
 class GteNewModel(BertWithRope):
@@ -515,7 +555,8 @@ class GteNewModel(BertWithRope):
             "layer": "layers",
             "attention.qkv_proj": "attn.qkv_proj",
             "attention.o_proj": "attn.out_proj",
-        })
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs):
         super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
@@ -536,15 +577,13 @@ class GteNewModel(BertWithRope):
             else:
                 yield name, weight
 
-    def ignore_unnecessary_layers(self,
-                                  weights: Iterable[tuple[str, torch.Tensor]]):
+    def ignore_unnecessary_layers(self, weights: Iterable[tuple[str, torch.Tensor]]):
         for name, weight in weights:
             if name.startswith("classifier"):
                 continue
             yield name, weight
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         weights = self.ignore_unnecessary_layers(weights)
         weights = self.split_up_gate_proj(weights)
         return super().load_weights(weights)
@@ -558,7 +597,8 @@ class SnowflakeGteNewModel(GteNewModel):
             "layer": "layers",
             "attention.qkv_proj": "attn.qkv_proj",
             "attention.o_proj": "attn.out_proj",
-        })
+        }
+    )
 
 
 class JinaRobertaModel(BertWithRope):
@@ -573,11 +613,11 @@ class JinaRobertaModel(BertWithRope):
             "mlp.fc1.": "mlp.up_proj.",
             "mlp.fc2": "mlp.down_proj",
             "norm2": "mlp_ln",
-        })
+        }
+    )
 
     @torch.inference_mode()
-    def jina_merge_lora_weights(self, weights: Iterable[tuple[str,
-                                                              torch.Tensor]]):
+    def jina_merge_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         # use for jina-embeddings-v3
         # Merge Lora weights into a single weight tensor.
         # This is a temporary solution until we have a better way to handle
@@ -598,7 +638,7 @@ class JinaRobertaModel(BertWithRope):
             if o in name:
                 dtype = weights[name].dtype
                 shape = weights[name].shape
-                weight_name = name[:-len(o)]
+                weight_name = name[: -len(o)]
 
                 if "embeddings" in weight_name:
                     B = weights[weight_name + a][i].to(device).float()
@@ -607,20 +647,23 @@ class JinaRobertaModel(BertWithRope):
                     B = weights[weight_name + b][i].to(device).float()
                     A = weights[weight_name + a][i].to(device).float()
 
-                weight = (weights[weight_name + o].to(device) +
-                          torch.matmul(B, A).view(shape) * scaling)
+                weight = (
+                    weights[weight_name + o].to(device)
+                    + torch.matmul(B, A).view(shape) * scaling
+                )
                 weight = weight.cpu().to(dtype)
 
                 weights[weight_name.replace(".parametrizations", "")] = weight
 
-                del weights[weight_name + o], weights[weight_name +
-                                                      a], weights[weight_name +
-                                                                  b]
+                del (
+                    weights[weight_name + o],
+                    weights[weight_name + a],
+                    weights[weight_name + b],
+                )
 
         return [(name, weight) for name, weight in weights.items()]
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         weights = self.jina_merge_lora_weights(weights)
         return super().load_weights(weights)
 
@@ -634,9 +677,9 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        self.new = GteNewModel(vllm_config=vllm_config,
-                               prefix=prefix,
-                               add_pooling_layer=True)
+        self.new = GteNewModel(
+            vllm_config=vllm_config, prefix=prefix, add_pooling_layer=True
+        )
         self.classifier = ReplicatedLinear(
             config.hidden_size,
             config.num_labels,
@@ -644,44 +687,46 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
             quant_config=quant_config,
             params_dtype=vllm_config.model_config.head_dtype,
             prefix=maybe_prefix(prefix, "classifier"),
-            return_bias=False)
+            return_bias=False,
+        )
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            ClassifierPooler(
-                pooling=self.new.pooler,
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                    vllm_config.model_config),
-            ),
-            "score":
-            ClassifierPooler(
-                pooling=self.new.pooler,
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                    vllm_config.model_config),
-            ),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
+                "classify": ClassifierPooler(
+                    pooling=self.new.pooler,
+                    classifier=self.classifier,
+                    act_fn="classify",
+                ),
+                "score": ClassifierPooler(
+                    pooling=self.new.pooler, classifier=self.classifier, act_fn="score"
+                ),
+            }
+        )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         loaded_params = loader.load_weights(weights)
         return loaded_params
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.new.embed_input_ids(input_ids)
+
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
-        return self.new(input_ids=input_ids,
-                        positions=positions,
-                        inputs_embeds=inputs_embeds,
-                        intermediate_tensors=intermediate_tensors)
+        return self.new(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 2b457fd8a5b25eed5ee77cde02972812ebe5dab9..2e4f73312efa3e5a8d0f909588a8ece2465554e6 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Minimal implementation of BlipVisionModel intended to be only used 
+"""Minimal implementation of BlipVisionModel intended to be only used
 within a vision language model."""
+
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -12,9 +12,11 @@ from transformers import Blip2VisionConfig, BlipVisionConfig
 from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
@@ -27,15 +29,15 @@ def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
 
 
 def get_blip_num_patches(*, image_size: int, patch_size: int) -> int:
-    grid_length = get_blip_patch_grid_length(image_size=image_size,
-                                             patch_size=patch_size)
+    grid_length = get_blip_patch_grid_length(
+        image_size=image_size, patch_size=patch_size
+    )
     return grid_length * grid_length
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
 class BlipVisionEmbeddings(nn.Module):
-
-    def __init__(self, config: Union[BlipVisionConfig, Blip2VisionConfig]):
+    def __init__(self, config: BlipVisionConfig | Blip2VisionConfig):
         super().__init__()
 
         self.config = config
@@ -52,25 +54,28 @@ class BlipVisionEmbeddings(nn.Module):
             stride=self.patch_size,
         )
 
-        self.num_patches = get_blip_num_patches(image_size=self.image_size,
-                                                patch_size=self.patch_size)
+        self.num_patches = get_blip_num_patches(
+            image_size=self.image_size, patch_size=self.patch_size
+        )
         self.num_positions = self.num_patches + 1
 
         self.position_embedding = nn.Parameter(
-            torch.randn(1, self.num_positions, self.embed_dim))
+            torch.randn(1, self.num_positions, self.embed_dim)
+        )
 
     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(
-            dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
 
         class_embeds = self.class_embedding.expand(batch_size, 1, -1)
         embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
 
         position_embeds = self.position_embedding.to(target_dtype)
-        embeddings = embeddings + position_embeds[:, :embeddings.size(1), :]
+        embeddings = embeddings + position_embeds[:, : embeddings.size(1), :]
 
         return embeddings
 
@@ -80,8 +85,8 @@ class BlipAttention(nn.Module):
 
     def __init__(
         self,
-        config: Union[BlipVisionConfig, Blip2VisionConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: BlipVisionConfig | Blip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -93,7 +98,8 @@ class BlipAttention(nn.Module):
             raise ValueError(
                 "embed_dim must be divisible by num_heads "
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads}).")
+                f" {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
         self.dropout = config.attention_dropout
 
@@ -115,12 +121,16 @@ class BlipAttention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = MultiHeadAttention(
+            self.num_heads_per_partition, self.head_dim, self.scale
+        )
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads,
-                           self.head_dim).transpose(1, 2).contiguous()
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
 
     def forward(
         self,
@@ -137,11 +147,10 @@ class BlipAttention(nn.Module):
 
 
 class BlipMLP(nn.Module):
-
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -149,16 +158,20 @@ class BlipMLP(nn.Module):
         self.config = config
 
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -169,11 +182,10 @@ class BlipMLP(nn.Module):
 
 
 class BlipEncoderLayer(nn.Module):
-
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -184,13 +196,9 @@ class BlipEncoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
-        self.layer_norm1 = nn.LayerNorm(config.hidden_size,
-                                        eps=config.layer_norm_eps)
-        self.mlp = BlipMLP(config,
-                           quant_config=quant_config,
-                           prefix=f"{prefix}.mlp")
-        self.layer_norm2 = nn.LayerNorm(config.hidden_size,
-                                        eps=config.layer_norm_eps)
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = BlipMLP(config, quant_config=quant_config, prefix=f"{prefix}.mlp")
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         residual = hidden_states
@@ -209,7 +217,7 @@ class BlipEncoderLayer(nn.Module):
 
 class BlipEncoder(nn.Module):
     """
-    Transformer encoder consisting of `config.num_hidden_layers` self 
+    Transformer encoder consisting of `config.num_hidden_layers` self
     attention layers. Each layer is a [`BlipEncoderLayer`].
 
     Args:
@@ -219,8 +227,8 @@ class BlipEncoder(nn.Module):
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        num_hidden_layers_override: Optional[int] = None,
+        quant_config: QuantizationConfig | None = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -232,12 +240,16 @@ class BlipEncoder(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layers = nn.ModuleList([
-            BlipEncoderLayer(config=config,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.layers.{layer_idx}")
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                BlipEncoderLayer(
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(self, inputs_embeds: torch.Tensor):
         hidden_states = inputs_embeds
@@ -255,10 +267,10 @@ class BlipVisionModel(nn.Module, SupportsQuant):
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -284,8 +296,9 @@ class BlipVisionModel(nn.Module, SupportsQuant):
             require_post_norm = len(self.encoder.layers) == num_hidden_layers
 
         if require_post_norm:
-            self.post_layernorm = nn.LayerNorm(config.hidden_size,
-                                               eps=config.layer_norm_eps)
+            self.post_layernorm = nn.LayerNorm(
+                config.hidden_size, eps=config.layer_norm_eps
+            )
         else:
             self.post_layernorm = None
 
@@ -298,8 +311,7 @@ class BlipVisionModel(nn.Module, SupportsQuant):
 
         return self.post_layernorm(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -312,8 +324,7 @@ class BlipVisionModel(nn.Module, SupportsQuant):
 
         for name, loaded_weight in weights:
             # post_layernorm is not needed in BlipVisionModel
-            if (name.startswith("post_layernorm")
-                    and self.post_layernorm is None):
+            if name.startswith("post_layernorm") and self.post_layernorm is None:
                 continue
 
             # omit layers when num_hidden_layers_override is set
@@ -322,7 +333,7 @@ class BlipVisionModel(nn.Module, SupportsQuant):
                 if layer_idx >= layer_count:
                     continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -332,8 +343,7 @@ class BlipVisionModel(nn.Module, SupportsQuant):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index b7455fba62c0271cc4adcdca2df240e3a5d09480..f71b9c01d359d9d23daa0d2c95c813b8a74f7797 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -2,36 +2,47 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig,
-                          apply_chunking_to_forward)
+from transformers import (
+    BatchFeature,
+    Blip2Config,
+    Blip2QFormerConfig,
+    apply_chunking_to_forward,
+)
 
 from vllm.config import CacheConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptIndexTargets,
-                                        PromptInsertion, PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptIndexTargets,
+    PromptInsertion,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .blip import BlipVisionModel
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
-                         SupportsQuant)
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
-
-# We use this internally as placeholders since there is no image token
-# defined on the HuggingFace repo
-_IMAGE_TOKEN_ID = 50265
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
 
 class Blip2ImagePixelInputs(TensorSchema):
@@ -42,6 +53,7 @@ class Blip2ImagePixelInputs(TensorSchema):
         - h: Height of each image
         - w: Width of each image
     """
+
     type: Literal["pixel_values"]
     data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
@@ -53,21 +65,21 @@ class Blip2ImageEmbeddingInputs(TensorSchema):
         - f: Image feature size
         - h: Hidden size (must match the hidden size of language model backbone)
     """
+
     type: Literal["image_embeds"]
     data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
-Blip2ImageInputs = Union[Blip2ImagePixelInputs, Blip2ImageEmbeddingInputs]
+Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs
 
 
 class Blip2QFormerMultiHeadAttention(nn.Module):
-
     def __init__(
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         is_cross_attention: bool = False,
         prefix: str = "",
     ) -> None:
@@ -82,8 +94,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
             )
 
         self.num_attention_heads = config.num_attention_heads
-        self.attention_head_size = (config.hidden_size //
-                                    config.num_attention_heads)
+        self.attention_head_size = config.hidden_size // config.num_attention_heads
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.scaling = self.attention_head_size**-0.5
 
@@ -95,32 +106,30 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
         self.key = nn.Linear(kv_hidden_size, self.all_head_size)
         self.value = nn.Linear(kv_hidden_size, self.all_head_size)
 
-        self.position_embedding_type = getattr(config,
-                                               "position_embedding_type",
-                                               "absolute")
+        self.position_embedding_type = getattr(
+            config, "position_embedding_type", "absolute"
+        )
         if self.position_embedding_type != "absolute":
-            raise NotImplementedError("Unsupported position_embedding_type: "
-                                      f"{self.position_embedding_type}")
+            raise NotImplementedError(
+                f"Unsupported position_embedding_type: {self.position_embedding_type}"
+            )
 
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 
     def transpose_for_scores(self, x):
-        x = x.view(*x.size()[:-1], self.num_attention_heads,
-                   self.attention_head_size)
+        x = x.view(*x.size()[:-1], self.num_attention_heads, self.attention_head_size)
         return x.permute(0, 2, 1, 3)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: torch.FloatTensor | None = None,
     ):
         is_cross_attention = encoder_hidden_states is not None
 
         if is_cross_attention:
-            key_layer = self.transpose_for_scores(
-                self.key(encoder_hidden_states))
-            value_layer = self.transpose_for_scores(
-                self.value(encoder_hidden_states))
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
         else:
             key_layer = self.transpose_for_scores(self.key(hidden_states))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
@@ -129,10 +138,8 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
 
-        attention_scores = torch.matmul(query_layer,
-                                        key_layer.transpose(-1, -2))
-        attention_probs = torch.softmax(attention_scores * self.scaling,
-                                        dim=-1)
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_probs = torch.softmax(attention_scores * self.scaling, dim=-1)
 
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -141,20 +148,19 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
         context_layer = torch.matmul(attention_probs_dropped, value_layer)
 
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        context_layer = context_layer.view(*context_layer.size()[:-2],
-                                           self.all_head_size)
+        context_layer = context_layer.view(
+            *context_layer.size()[:-2], self.all_head_size
+        )
 
         return context_layer
 
 
 class Blip2QFormerSelfOutput(nn.Module):
-
     def __init__(self, config: Blip2QFormerConfig, prefix: str = "") -> None:
         super().__init__()
 
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(
@@ -169,13 +175,12 @@ class Blip2QFormerSelfOutput(nn.Module):
 
 
 class Blip2QFormerAttention(nn.Module):
-
     def __init__(
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         is_cross_attention: bool = False,
         prefix: str = "",
     ) -> None:
@@ -194,7 +199,7 @@ class Blip2QFormerAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: torch.FloatTensor | None = None,
     ) -> tuple[torch.Tensor]:
         self_output = self.attention(
             hidden_states,
@@ -206,7 +211,6 @@ class Blip2QFormerAttention(nn.Module):
 
 
 class Blip2QFormerIntermediate(nn.Module):
-
     def __init__(self, config: Blip2QFormerConfig, prefix: str = "") -> None:
         super().__init__()
 
@@ -220,13 +224,11 @@ class Blip2QFormerIntermediate(nn.Module):
 
 
 class Blip2QFormerOutput(nn.Module):
-
     def __init__(self, config: Blip2QFormerConfig, prefix: str = "") -> None:
         super().__init__()
 
         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(
@@ -241,13 +243,12 @@ class Blip2QFormerOutput(nn.Module):
 
 
 class Blip2QFormerLayer(nn.Module):
-
     def __init__(
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         layer_idx: int,
         prefix: str = "",
     ) -> None:
@@ -255,10 +256,12 @@ class Blip2QFormerLayer(nn.Module):
 
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
         self.seq_len_dim = 1
-        self.attention = Blip2QFormerAttention(config,
-                                               quant_config=quant_config,
-                                               cache_config=cache_config,
-                                               prefix=f"{prefix}.attention")
+        self.attention = Blip2QFormerAttention(
+            config,
+            quant_config=quant_config,
+            cache_config=cache_config,
+            prefix=f"{prefix}.attention",
+        )
 
         self.layer_idx = layer_idx
 
@@ -268,15 +271,16 @@ class Blip2QFormerLayer(nn.Module):
                 quant_config=quant_config,
                 cache_config=cache_config,
                 is_cross_attention=True,
-                prefix=f"{prefix}.crossattention")
+                prefix=f"{prefix}.crossattention",
+            )
             self.has_cross_attention = True
         else:
             self.has_cross_attention = False
 
         self.intermediate_query = Blip2QFormerIntermediate(
-            config, prefix=f"{prefix}.intermediate_query")
-        self.output_query = Blip2QFormerOutput(config,
-                                               prefix=f"{prefix}.output_query")
+            config, prefix=f"{prefix}.intermediate_query"
+        )
+        self.output_query = Blip2QFormerOutput(config, prefix=f"{prefix}.output_query")
 
     def forward(
         self,
@@ -309,8 +313,7 @@ class Blip2QFormerLayer(nn.Module):
                     self.seq_len_dim,
                     attention_output[:, query_length:, :],
                 )
-                layer_output = torch.cat([layer_output, layer_output_text],
-                                         dim=1)
+                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
         else:
             layer_output = apply_chunking_to_forward(
                 self.feed_forward_chunk,
@@ -321,41 +324,42 @@ class Blip2QFormerLayer(nn.Module):
 
         return layer_output
 
-    def feed_forward_chunk(self,
-                           attention_output: torch.Tensor) -> torch.Tensor:
+    def feed_forward_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
         return layer_output
 
-    def feed_forward_chunk_query(
-            self, attention_output: torch.Tensor) -> torch.Tensor:
+    def feed_forward_chunk_query(self, attention_output: torch.Tensor) -> torch.Tensor:
         intermediate_output = self.intermediate_query(attention_output)
         layer_output = self.output_query(intermediate_output, attention_output)
         return layer_output
 
 
 class Blip2QFormerEncoder(nn.Module):
-
     def __init__(
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
         self.config = config
 
-        self.layer = nn.ModuleList([
-            Blip2QFormerLayer(config,
-                              quant_config=quant_config,
-                              cache_config=cache_config,
-                              layer_idx=layer_idx,
-                              prefix=f"{prefix}.layer.{layer_idx}")
-            for layer_idx in range(config.num_hidden_layers)
-        ])
+        self.layer = nn.ModuleList(
+            [
+                Blip2QFormerLayer(
+                    config,
+                    quant_config=quant_config,
+                    cache_config=cache_config,
+                    layer_idx=layer_idx,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -377,27 +381,27 @@ class Blip2QFormerEncoder(nn.Module):
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025
 class Blip2QFormerModel(nn.Module):
-
     def __init__(
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
         self.config = config
 
-        self.layernorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-        self.encoder = Blip2QFormerEncoder(config,
-                                           quant_config=quant_config,
-                                           cache_config=cache_config,
-                                           prefix=f"{prefix}.encoder")
+        self.encoder = Blip2QFormerEncoder(
+            config,
+            quant_config=quant_config,
+            cache_config=cache_config,
+            prefix=f"{prefix}.encoder",
+        )
 
     def forward(
         self,
@@ -419,11 +423,10 @@ class Blip2QFormerModel(nn.Module):
 
 
 class Blip2ProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Blip2Config)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
@@ -432,7 +435,6 @@ class Blip2ProcessingInfo(BaseProcessingInfo):
 
 
 class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
 
@@ -440,6 +442,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
@@ -447,16 +450,19 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
         max_image_size = vision_config.image_size
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=max_image_size,
+                height=max_image_size,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -509,21 +515,24 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
-                                        info=Blip2ProcessingInfo,
-                                        dummy_inputs=Blip2DummyInputsBuilder)
-class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
-                                    SupportsQuant):
+@MULTIMODAL_REGISTRY.register_processor(
+    Blip2MultiModalProcessor,
+    info=Blip2ProcessingInfo,
+    dummy_inputs=Blip2DummyInputsBuilder,
+)
+class Blip2ForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
+):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
         raise ValueError("Only image modality is supported")
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-
         super().__init__()
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
@@ -536,13 +545,15 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         self.vision_model = BlipVisionModel(config.vision_config, quant_config)
 
         self.query_tokens = nn.Parameter(
-            torch.zeros(1, config.num_query_tokens,
-                        config.qformer_config.hidden_size))
+            torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)
+        )
 
-        self.qformer = Blip2QFormerModel(config.qformer_config,
-                                         cache_config=cache_config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.qformer")
+        self.qformer = Blip2QFormerModel(
+            config.qformer_config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qformer",
+        )
 
         self.language_projection = nn.Linear(
             config.qformer_config.hidden_size,
@@ -557,10 +568,12 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Blip2ImageInputs]:
+        self, **kwargs: object
+    ) -> Blip2ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -569,50 +582,44 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
 
         if pixel_values is not None:
             expected_h = expected_w = self.config.vision_config.image_size
-            return Blip2ImagePixelInputs(type="pixel_values",
-                                         data=flatten_bn(pixel_values,
-                                                         concat=True),
-                                         resolve_bindings={
-                                             "h": expected_h,
-                                             "w": expected_w
-                                         })
+            return Blip2ImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                resolve_bindings={"h": expected_h, "w": expected_w},
+            )
 
         if image_embeds is not None:
             return Blip2ImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def _image_pixels_to_features(self, vision_model: BlipVisionModel,
-                                  pixel_values: torch.Tensor) -> torch.Tensor:
-
+    def _image_pixels_to_features(
+        self, vision_model: BlipVisionModel, pixel_values: torch.Tensor
+    ) -> torch.Tensor:
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
         image_features = vision_model(pixel_values)
 
         return image_features
 
-    def _process_image_pixels(self,
-                              inputs: Blip2ImagePixelInputs) -> torch.Tensor:
+    def _process_image_pixels(self, inputs: Blip2ImagePixelInputs) -> torch.Tensor:
         assert self.vision_model is not None
 
         pixel_values = inputs["data"]
 
         return self._image_pixels_to_features(self.vision_model, pixel_values)
 
-    def _process_image_input(self,
-                             image_input: Blip2ImageInputs) -> torch.Tensor:
-
+    def _process_image_input(self, image_input: Blip2ImageInputs) -> torch.Tensor:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
         assert self.vision_model is not None
         image_features = self._process_image_pixels(image_input)
 
-        query_tokens = self.query_tokens.expand(image_features.shape[0], -1,
-                                                -1)
+        query_tokens = self.query_tokens.expand(image_features.shape[0], -1, -1)
         query_output = self.qformer(
             query_embeds=query_tokens,
             encoder_hidden_states=image_features,
@@ -623,33 +630,19 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                _IMAGE_TOKEN_ID)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         """Run forward pass for BLIP-2.
@@ -664,7 +657,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`.
 
         To reserve space in KV cache, we have to insert placeholder tokens
-        before they are inputted to the model, so the input processor prepends 
+        before they are inputted to the model, so the input processor prepends
         dummy tokens (denoted as `50265`), resulting in:
         `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`.
 
@@ -677,7 +670,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-        
+
         Info:
             [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
         """
@@ -685,28 +678,18 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 30816f72a26781da09587a9c7e38f42dc3b80aba..00fba93423d8ef4e7152c21bf8ad600edbdc06f0 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -18,10 +18,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only BLOOM model compatible with HuggingFace weights."""
+
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -30,29 +30,40 @@ from transformers import BloomConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP, SupportsQuant
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
-    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
     base = torch.tensor(
-        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
         dtype=torch.float32,
     )
     powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
@@ -60,27 +71,25 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
 
     if closest_power_of_2 != total_num_heads:
         extra_base = torch.tensor(
-            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
             dtype=torch.float32,
         )
-        num_remaining_heads = min(closest_power_of_2,
-                                  total_num_heads - closest_power_of_2)
-        extra_powers = torch.arange(start=1,
-                                    end=1 + 2 * num_remaining_heads,
-                                    step=2,
-                                    dtype=torch.int32)
-        slopes = torch.cat(
-            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+        num_remaining_heads = min(
+            closest_power_of_2, total_num_heads - closest_power_of_2
+        )
+        extra_powers = torch.arange(
+            start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
     return slopes
 
 
 class BloomAttention(nn.Module):
-
     def __init__(
         self,
         config: BloomConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -99,12 +108,14 @@ class BloomAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
 
         # Create the alibi slopes and slice them.
@@ -115,13 +126,15 @@ class BloomAttention(nn.Module):
         alibi_slopes = alibi_slopes[head_start:head_end].tolist()
 
         scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scaling,
-                              alibi_slopes=alibi_slopes,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scaling,
+            alibi_slopes=alibi_slopes,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -137,11 +150,11 @@ class BloomAttention(nn.Module):
 
 
 class BloomMLP(nn.Module):
-
     def __init__(
         self,
         config: BloomConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -149,12 +162,14 @@ class BloomMLP(nn.Module):
             hidden_size,
             4 * hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
         self.gelu_impl = get_act_fn("gelu")
         self.dense_4h_to_h = RowParallelLinear(
             4 * hidden_size,
             hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -165,28 +180,27 @@ class BloomMLP(nn.Module):
 
 
 class BloomBlock(nn.Module):
-
     def __init__(
         self,
         config: BloomConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
 
-        self.input_layernorm = nn.LayerNorm(hidden_size,
-                                            eps=config.layer_norm_epsilon)
-        self.self_attention = BloomAttention(config,
-                                             cache_config,
-                                             quant_config,
-                                             prefix=f"{prefix}.self_attention")
+        self.input_layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.self_attention = BloomAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attention"
+        )
         self.post_attention_layernorm = nn.LayerNorm(
-            hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = BloomMLP(config, quant_config)
+            hidden_size, eps=config.layer_norm_epsilon
+        )
+        self.mlp = BloomMLP(config, quant_config, prefix=f"{prefix}.mlp")
         self.apply_residual_connection_post_layernorm = (
-            config.apply_residual_connection_post_layernorm)
+            config.apply_residual_connection_post_layernorm
+        )
 
     def forward(
         self,
@@ -223,7 +237,6 @@ class BloomBlock(nn.Module):
 
 @support_torch_compile
 class BloomModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -240,36 +253,39 @@ class BloomModel(nn.Module):
             self.embed_dim,
         )
         self.word_embeddings_layernorm = nn.LayerNorm(
-            self.embed_dim, eps=config.layer_norm_epsilon)
+            self.embed_dim, eps=config.layer_norm_epsilon
+        )
 
         # Transformer blocks
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
             lambda prefix: BloomBlock(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.h")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.h",
+        )
 
         # Final Layer Norm
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.word_embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             hidden_states = self.word_embeddings_layernorm(hidden_states)
         else:
             assert intermediate_tensors is not None
@@ -281,8 +297,7 @@ class BloomModel(nn.Module):
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -300,14 +315,14 @@ class BloomModel(nn.Module):
                 if output_dim is not None:
                     loaded_weight_shape = loaded_weight.shape
                     loaded_weight = loaded_weight.view(
-                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
-                        loaded_weight_shape[output_dim + 1:])
-                    loaded_weight = loaded_weight.transpose(
-                        output_dim, output_dim + 1)
+                        loaded_weight_shape[:output_dim]
+                        + (num_heads, 3, -1)
+                        + loaded_weight_shape[output_dim + 1 :]
+                    )
+                    loaded_weight = loaded_weight.transpose(output_dim, output_dim + 1)
                     loaded_weight = loaded_weight.reshape(loaded_weight_shape)
 
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
@@ -315,60 +330,61 @@ class BloomModel(nn.Module):
 
 
 class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = BloomModel(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(
-                                          prefix, "transformer"))
+        self.transformer = BloomModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         if self.config.tie_word_embeddings:
             self.lm_head = self.transformer.word_embeddings
         else:
-            self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                self.config.vocab_size,
+                self.config.hidden_size,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self, skip_prefixes=["lm_head.weight"])
         weights = _add_transformer_prefix(weights)
         return loader.load_weights(weights)
 
 
 def _add_transformer_prefix(
-    weights: Iterable[tuple[str, torch.Tensor]]
+    weights: Iterable[tuple[str, torch.Tensor]],
 ) -> Iterable[tuple[str, torch.Tensor]]:
     for name, tensor in weights:
-        if not name.startswith('transformer.'):
-            name = 'transformer.' + name
+        if not name.startswith("transformer."):
+            name = "transformer." + name
         yield name, tensor
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 79d648d749c6a77115f36b3eca3f49e893b42746..fb7476c45fcdb4537e9cf91645e966ce8b3003c8 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -4,47 +4,72 @@
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
 from itertools import islice
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor,
-                          ChameleonVQVAEConfig)
+from transformers import (
+    BatchFeature,
+    ChameleonConfig,
+    ChameleonProcessor,
+    ChameleonVQVAEConfig,
+)
 
 from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, row_parallel_weight_loader)
+    default_weight_loader,
+    row_parallel_weight_loader,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
-                         SupportsQuant)
-from .utils import (flatten_bn, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import (
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -57,19 +82,19 @@ class ChameleonImagePixelInputs(TensorSchema):
         - h: Height of each image
         - w: Width of each image
     """
+
     type: Literal["pixel_values"]
     data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
 
 class ChameleonProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(ChameleonConfig)
 
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(ChameleonProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
@@ -77,9 +102,7 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
         return processor.image_seq_length
 
 
-class ChameleonDummyInputsBuilder(
-        BaseDummyInputsBuilder[ChameleonProcessingInfo]):
-
+class ChameleonDummyInputsBuilder(BaseDummyInputsBuilder[ChameleonProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -92,23 +115,26 @@ class ChameleonDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         config = self.info.get_hf_config()
 
         width = height = config.vq_config.resolution
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=width,
-                                   height=height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=width,
+                height=height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class ChameleonMultiModalProcessor(
-        BaseMultiModalProcessor[ChameleonProcessingInfo]):
-
+class ChameleonMultiModalProcessor(BaseMultiModalProcessor[ChameleonProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -177,50 +203,51 @@ class ChameleonMultiModalProcessor(
 
 
 class ChameleonLayerNorm(nn.LayerNorm):
-
     def __init__(self, hidden_size, *args, **kwargs):
         super().__init__(hidden_size, *args, **kwargs)
-        self.normalized_shape = (hidden_size[-1], )
+        self.normalized_shape = (hidden_size[-1],)
 
-        set_weight_attrs(self.weight,
-                         {"weight_loader": row_parallel_weight_loader})
-        set_weight_attrs(self.bias,
-                         {"weight_loader": row_parallel_weight_loader})
+        set_weight_attrs(self.weight, {"weight_loader": row_parallel_weight_loader})
+        set_weight_attrs(self.bias, {"weight_loader": row_parallel_weight_loader})
 
     def forward(self, hidden_states):
-        hidden_states = F.layer_norm(hidden_states,
-                                     self.normalized_shape,
-                                     None,
-                                     None,
-                                     eps=1e-5)
+        hidden_states = F.layer_norm(
+            hidden_states, self.normalized_shape, None, None, eps=1e-5
+        )
         hidden_states = hidden_states * self.weight + self.bias
         return hidden_states
 
 
 # Copied from vllm.model_executor.models.llama.LlamaMLP -> ChameleonMLP
 class ChameleonMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
             input_size=hidden_size,
             output_sizes=[intermediate_size] * 2,
             bias=bias,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(input_size=intermediate_size,
-                                           output_size=hidden_size,
-                                           bias=bias,
-                                           quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -232,18 +259,17 @@ class ChameleonMLP(nn.Module):
 
 # Modified from vllm.model_executor.models.llama.LlamaAttention -> ChameleonAttention #noqa
 class ChameleonAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 4096,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -276,12 +302,14 @@ class ChameleonAttention(nn.Module):
             total_num_kv_heads=self.total_num_kv_heads,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             input_size=self.total_num_heads * self.head_dim,
             output_size=hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim))
         self.k_norm = ChameleonLayerNorm((self.num_kv_heads, self.head_dim))
@@ -293,16 +321,19 @@ class ChameleonAttention(nn.Module):
             rope_scaling=rope_scaling,
         )
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
-    def _apply_qk_norm(self, q: torch.Tensor,
-                       k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # reshape for layernorm
         q = q.reshape(-1, self.num_heads, self.head_dim)
         k = k.reshape(-1, self.num_kv_heads, self.head_dim)
@@ -328,12 +359,11 @@ class ChameleonAttention(nn.Module):
 
 
 class ChameleonDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: ChameleonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -341,17 +371,19 @@ class ChameleonDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          4096)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
 
         self.self_attn = ChameleonAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -366,45 +398,42 @@ class ChameleonDecoderLayer(nn.Module):
             hidden_act=config.hidden_act,
             quant_config=quant_config,
             bias=getattr(config, "mlp_bias", False),
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
 
         return hidden_states, residual
 
 
 class ChameleonSwinDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: ChameleonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -412,17 +441,19 @@ class ChameleonSwinDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          4096)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
 
         self.self_attn = ChameleonAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -437,19 +468,19 @@ class ChameleonSwinDecoderLayer(nn.Module):
             hidden_act=config.hidden_act,
             quant_config=quant_config,
             bias=getattr(config, "mlp_bias", False),
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-
         residual = hidden_states
         hidden_states = self.self_attn(
             positions=positions,
@@ -470,7 +501,6 @@ class ChameleonSwinDecoderLayer(nn.Module):
 
 # Copied from transformers.models.chameleon.modeling_chameleon.ChameleonVQVAEVectorQuantizer #noqa
 class ChameleonVQVAEVectorQuantizer(nn.Module):
-
     def __init__(self, config: ChameleonVQVAEConfig):
         super().__init__()
         self.num_embeddings = config.num_embeddings
@@ -486,55 +516,52 @@ class ChameleonVQVAEVectorQuantizer(nn.Module):
 
         # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
         distances = (
-            torch.sum(hidden_state_flattened**2, dim=1, keepdim=True) +
-            torch.sum(self.embedding.weight**2, dim=1) -
-            2 * torch.einsum("bd,dn->bn", hidden_state_flattened,
-                             self.embedding.weight.transpose(0, 1)))
+            torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
+            + torch.sum(self.embedding.weight**2, dim=1)
+            - 2
+            * torch.einsum(
+                "bd,dn->bn",
+                hidden_state_flattened,
+                self.embedding.weight.transpose(0, 1),
+            )
+        )
 
         min_encoding_indices = torch.argmin(distances, dim=1)
         hidden_state_quant = self.embedding(min_encoding_indices).view(
-            hidden_state.shape)
+            hidden_state.shape
+        )
 
         # compute loss for embedding
-        loss = torch.mean((hidden_state_quant.detach() - hidden_state)**
-                          2) + self.beta * torch.mean(
-                              (hidden_state_quant - hidden_state.detach())**2)
+        loss = torch.mean(
+            (hidden_state_quant.detach() - hidden_state) ** 2
+        ) + self.beta * torch.mean((hidden_state_quant - hidden_state.detach()) ** 2)
 
         # preserve gradients
-        hidden_state_quant = hidden_state + (hidden_state_quant -
-                                             hidden_state).detach()
+        hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach()
 
         # reshape back to match original input shape
-        hidden_state_quant = hidden_state_quant.permute(0, 3, 1,
-                                                        2).contiguous()
+        hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous()
 
         return hidden_state_quant, loss, min_encoding_indices
 
 
 # Copied from transformers.models.chameleon.modeling_chameleon.ChameleonVQVAEEncoderConvDownsample #noqa
 class ChameleonVQVAEEncoderConvDownsample(nn.Module):
-
     def __init__(self, in_channels: int):
         super().__init__()
-        self.conv = nn.Conv2d(in_channels,
-                              in_channels,
-                              kernel_size=3,
-                              stride=2,
-                              padding=0)
+        self.conv = nn.Conv2d(
+            in_channels, in_channels, kernel_size=3, stride=2, padding=0
+        )
 
     def forward(self, hidden_states: torch.Tensor):
         # no asymmetric padding in torch conv, must do it ourselves
-        hidden_states = F.pad(hidden_states,
-                              pad=(0, 1, 0, 1),
-                              mode="constant",
-                              value=0)
+        hidden_states = F.pad(hidden_states, pad=(0, 1, 0, 1), mode="constant", value=0)
         hidden_states = self.conv(hidden_states)
         return hidden_states
 
 
 # Copied from transformers.models.chameleon.modeling_chameleon.ChameleonVQVAEEncoderResnetBlock #noqa
 class ChameleonVQVAEEncoderResnetBlock(nn.Module):
-
     def __init__(
         self,
         config: ChameleonVQVAEConfig,
@@ -544,42 +571,31 @@ class ChameleonVQVAEEncoderResnetBlock(nn.Module):
     ):
         super().__init__()
         self.in_channels = in_channels
-        self.out_channels = in_channels if out_channels is None \
-            else out_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
         self.use_conv_shortcut = conv_shortcut
 
-        self.norm1 = torch.nn.GroupNorm(num_groups=32,
-                                        num_channels=in_channels,
-                                        eps=1e-6,
-                                        affine=True)
-        self.conv1 = torch.nn.Conv2d(in_channels,
-                                     out_channels,
-                                     kernel_size=3,
-                                     stride=1,
-                                     padding=1)
-        self.norm2 = torch.nn.GroupNorm(num_groups=32,
-                                        num_channels=out_channels,
-                                        eps=1e-6,
-                                        affine=True)
+        self.norm1 = torch.nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
+        self.conv1 = torch.nn.Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        self.norm2 = torch.nn.GroupNorm(
+            num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
+        )
         self.dropout = torch.nn.Dropout(config.dropout)
-        self.conv2 = torch.nn.Conv2d(out_channels,
-                                     out_channels,
-                                     kernel_size=3,
-                                     stride=1,
-                                     padding=1)
+        self.conv2 = torch.nn.Conv2d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
         if self.in_channels != self.out_channels:
             if self.use_conv_shortcut:
-                self.conv_shortcut = torch.nn.Conv2d(in_channels,
-                                                     out_channels,
-                                                     kernel_size=3,
-                                                     stride=1,
-                                                     padding=1)
+                self.conv_shortcut = torch.nn.Conv2d(
+                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
+                )
             else:
-                self.nin_shortcut = torch.nn.Conv2d(in_channels,
-                                                    out_channels,
-                                                    kernel_size=1,
-                                                    stride=1,
-                                                    padding=0)
+                self.nin_shortcut = torch.nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
+                )
 
     def forward(self, hidden_states: torch.Tensor):
         residual = hidden_states
@@ -603,35 +619,25 @@ class ChameleonVQVAEEncoderResnetBlock(nn.Module):
 
 # Copied from transformers.models.chameleon.modeling_chameleon.ChameleonVQVAEEncoderAttnBlock #noqa
 class ChameleonVQVAEEncoderAttnBlock(nn.Module):
-
     def __init__(self, in_channels: int):
         super().__init__()
         self.in_channels = in_channels
 
-        self.norm = torch.nn.GroupNorm(num_groups=32,
-                                       num_channels=in_channels,
-                                       eps=1e-6,
-                                       affine=True)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
+        self.norm = torch.nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
+        self.q = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.k = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.v = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.proj_out = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
 
     def forward(self, hidden_states: torch.Tensor):
         residual = hidden_states
@@ -642,20 +648,20 @@ class ChameleonVQVAEEncoderAttnBlock(nn.Module):
 
         # compute attention
         batch_size, channels, height, width = query_states.shape
-        query_states = query_states.reshape(batch_size, channels,
-                                            height * width).permute(0, 2, 1)
+        query_states = query_states.reshape(
+            batch_size, channels, height * width
+        ).permute(0, 2, 1)
         key_states = key_states.reshape(batch_size, channels, height * width)
         attn_weights = torch.bmm(query_states, key_states)
-        attn_weights = attn_weights * (int(channels)**(-0.5))
+        attn_weights = attn_weights * (int(channels) ** (-0.5))
         attn_weights = F.softmax(attn_weights, dim=2)
 
         # attend to values
-        value_states = value_states.reshape(batch_size, channels,
-                                            height * width)
+        value_states = value_states.reshape(batch_size, channels, height * width)
         attn_weights = attn_weights.permute(0, 2, 1)
-        attn_output = torch.bmm(value_states,
-                                attn_weights).reshape(batch_size, channels,
-                                                      height, width)
+        attn_output = torch.bmm(value_states, attn_weights).reshape(
+            batch_size, channels, height, width
+        )
 
         attn_output = self.proj_out(attn_output)
         return residual + attn_output
@@ -663,7 +669,6 @@ class ChameleonVQVAEEncoderAttnBlock(nn.Module):
 
 # Copied from transformers.models.chameleon.modeling_chameleon.ChameleonVQVAEEncoder #noqa
 class ChameleonVQVAEEncoder(nn.Module):
-
     def __init__(self, config: ChameleonVQVAEConfig):
         super().__init__()
 
@@ -676,14 +681,12 @@ class ChameleonVQVAEEncoder(nn.Module):
         latent_channels = config.latent_channels
         channel_multiplier = config.channel_multiplier
 
-        self.conv_in = torch.nn.Conv2d(in_channels,
-                                       base_channels,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
+        self.conv_in = torch.nn.Conv2d(
+            in_channels, base_channels, kernel_size=3, stride=1, padding=1
+        )
 
         curr_res = resolution
-        in_channel_multiplier = (1, ) + tuple(channel_multiplier)
+        in_channel_multiplier = (1,) + tuple(channel_multiplier)
         self.in_channel_multiplier = in_channel_multiplier
         self.down = nn.ModuleList()
         for i_level in range(self.num_resolutions):
@@ -697,11 +700,14 @@ class ChameleonVQVAEEncoder(nn.Module):
                         config=config,
                         in_channels=block_in,
                         out_channels=block_out,
-                    ))
+                    )
+                )
                 block_in = block_out
-                if (config.attn_resolutions is not None
-                        and curr_res in config.attn_resolutions
-                        and config.attn_type == "vanilla"):
+                if (
+                    config.attn_resolutions is not None
+                    and curr_res in config.attn_resolutions
+                    and config.attn_type == "vanilla"
+                ):
                     attn.append(ChameleonVQVAEEncoderAttnBlock(block_in))
 
             down = nn.Module()
@@ -718,18 +724,20 @@ class ChameleonVQVAEEncoder(nn.Module):
             in_channels=block_in,
             out_channels=block_in,
         )
-        self.mid.attn_1 = ChameleonVQVAEEncoderAttnBlock(
-            block_in) if config.attn_type == "vanilla" else nn.Identity()
+        self.mid.attn_1 = (
+            ChameleonVQVAEEncoderAttnBlock(block_in)
+            if config.attn_type == "vanilla"
+            else nn.Identity()
+        )
         self.mid.block_2 = ChameleonVQVAEEncoderResnetBlock(
             config=config,
             in_channels=block_in,
             out_channels=block_in,
         )
 
-        self.norm_out = torch.nn.GroupNorm(num_groups=32,
-                                           num_channels=block_in,
-                                           eps=1e-6,
-                                           affine=True)
+        self.norm_out = torch.nn.GroupNorm(
+            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
+        )
         self.conv_out = torch.nn.Conv2d(
             block_in,
             2 * latent_channels if double_latent else latent_channels,
@@ -745,15 +753,12 @@ class ChameleonVQVAEEncoder(nn.Module):
         hidden_states = [self.conv_in(pixel_values)]
         for i_level in range(self.num_resolutions):
             for i_block in range(self.num_res_blocks):
-                hidden_state = self.down[i_level].block[i_block](
-                    hidden_states[-1])
+                hidden_state = self.down[i_level].block[i_block](hidden_states[-1])
                 if len(self.down[i_level].attn) > 0:
-                    hidden_state = self.down[i_level].attn[i_block](
-                        hidden_state)
+                    hidden_state = self.down[i_level].attn[i_block](hidden_state)
                 hidden_states.append(hidden_state)
             if i_level != self.num_resolutions - 1:
-                hidden_states.append(self.down[i_level].downsample(
-                    hidden_states[-1]))
+                hidden_states.append(self.down[i_level].downsample(hidden_states[-1]))
 
         # middle
         last_hidden_state = hidden_states[-1]
@@ -770,15 +775,14 @@ class ChameleonVQVAEEncoder(nn.Module):
 
 # Adapted from transformers.models.chameleon.modeling_chameleon.ChameleonVQVAE #noqa
 class ChameleonVQVAE(nn.Module):
-
     def __init__(self, config: ChameleonVQVAEConfig):
         super().__init__()
         self.encoder = ChameleonVQVAEEncoder(config)
         self.quantize = ChameleonVQVAEVectorQuantizer(config)
-        self.quant_conv = torch.nn.Conv2d(config.latent_channels,
-                                          config.embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(config.embed_dim,
-                                               config.latent_channels, 1)
+        self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(
+            config.embed_dim, config.latent_channels, 1
+        )
         self.eval()  # Chameleon's VQ model is frozen
 
     def encode(
@@ -806,10 +810,9 @@ class ChameleonImageVocabularyMapping:
 
     @cached_property
     def image_tokens(self):
-        return sorted([
-            val for name, val in self.vocab_map.items()
-            if name.startswith("IMGIMG")
-        ])
+        return sorted(
+            [val for name, val in self.vocab_map.items() if name.startswith("IMGIMG")]
+        )
 
     @cached_property
     def bpe2img(self):
@@ -817,13 +820,10 @@ class ChameleonImageVocabularyMapping:
 
         def remap(old_name: str) -> str:
             return "".join(
-                img_tkn_chr_mapping.get(c, c)
-                for c in old_name[len("IMGIMG"):-1])
+                img_tkn_chr_mapping.get(c, c) for c in old_name[len("IMGIMG") : -1]
+            )
 
-        return {
-            tok: int(remap(self.val2name[tok]))
-            for tok in self.image_tokens
-        }
+        return {tok: int(remap(self.val2name[tok])) for tok in self.image_tokens}
 
     @cached_property
     def img2bpe(self):
@@ -832,7 +832,8 @@ class ChameleonImageVocabularyMapping:
     @cached_property
     def bpe2img_search_tensors(self):
         return torch.tensor(sorted(self.bpe2img.keys())), torch.tensor(
-            sorted(self.bpe2img.values()))
+            sorted(self.bpe2img.values())
+        )
 
     @cached_property
     def img2bpe_mapping_tensor(self):
@@ -848,7 +849,6 @@ class ChameleonImageVocabularyMapping:
 
 
 class ChameleonModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -862,27 +862,31 @@ class ChameleonModel(nn.Module):
             self.vocab_size,
             config.hidden_size,
         )
-        self.vocabulary_mapping = ChameleonImageVocabularyMapping(
-            config.vocabulary_map)
-        decoder_layer = ChameleonDecoderLayer if not self.config.swin_norm \
+        self.vocabulary_mapping = ChameleonImageVocabularyMapping(config.vocabulary_map)
+        decoder_layer = (
+            ChameleonDecoderLayer
+            if not self.config.swin_norm
             else ChameleonSwinDecoderLayer
+        )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: decoder_layer(config=config,
-                                         cache_config=cache_config,
-                                         quant_config=quant_config,
-                                         prefix=prefix),
+            lambda prefix: decoder_layer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.vqmodel = ChameleonVQVAE(config.vq_config)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def get_image_tokens(self, pixel_values: torch.Tensor) -> torch.Tensor:
@@ -899,16 +903,16 @@ class ChameleonModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -921,10 +925,9 @@ class ChameleonModel(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -932,16 +935,20 @@ class ChameleonModel(nn.Module):
 @MULTIMODAL_REGISTRY.register_processor(
     ChameleonMultiModalProcessor,
     info=ChameleonProcessingInfo,
-    dummy_inputs=ChameleonDummyInputsBuilder)
-class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP, SupportsQuant):
+    dummy_inputs=ChameleonDummyInputsBuilder,
+)
+class ChameleonForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
+):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -953,11 +960,12 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multimodal_config = multimodal_config
-        self.model = ChameleonModel(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
+        self.model = ChameleonModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
@@ -965,13 +973,14 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
             self.lm_head.weight = self.model.embed_tokens.weight
 
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size, logit_scale)
+        self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[ChameleonImagePixelInputs]:
+        self, **kwargs: object
+    ) -> ChameleonImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
 
         if pixel_values is None:
@@ -980,72 +989,46 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
         vq_config: ChameleonVQVAEConfig = self.config.vq_config
         expected_h = expected_w = vq_config.resolution
 
-        return ChameleonImagePixelInputs(type="pixel_values",
-                                         data=flatten_bn(pixel_values,
-                                                         concat=True),
-                                         resolve_bindings={
-                                             "h": expected_h,
-                                             "w": expected_w
-                                         })
+        return ChameleonImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            resolve_bindings={"h": expected_h, "w": expected_w},
+        )
 
     def get_language_model(self) -> torch.nn.Module:
         return self.model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         assert self.model.vqmodel is not None
-        image_tokens = self.model.get_image_tokens(image_input["data"].to(
-            self.config.torch_dtype))
-        vision_embeddings = self.model.get_input_embeddings(image_tokens)
+        image_tokens = self.model.get_image_tokens(
+            image_input["data"].to(self.config.dtype)
+        )
+        vision_embeddings = self.model.embed_input_ids(image_tokens)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-
-        inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.model.vocabulary_mapping.image_token_id)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.model(input_ids,
-                                   positions,
-                                   intermediate_tensors,
-                                   inputs_embeds=inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
 
         # Disallow image tokens which does not include special
@@ -1056,8 +1039,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -1072,8 +1054,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
             if "rotary_emb.inv_freq" in name:
                 continue
 
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
@@ -1091,8 +1072,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                     # not vqvae for now.
                     use_default_weight_loading = True
             else:
-                for (param_name, weight_name,
-                     shard_id) in stacked_params_mapping:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
                     if weight_name not in name:
                         continue
                     name = name.replace(weight_name, param_name)
@@ -1112,7 +1092,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                     # Remapping the name of FP8 kv-scale.
                     if name.endswith("kv_scale"):
                         remapped_kv_scale_name = name.replace(
-                            ".kv_scale", ".attn.kv_scale")
+                            ".kv_scale", ".attn.kv_scale"
+                        )
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
                                 "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
@@ -1125,15 +1106,15 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             if use_default_weight_loading and name in params_dict:
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 879508400222fc3c5c826cb21e3ead9fc49eefe5..5d6f5e9125a28f2e9bf41965f462a31c3af95b94 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -3,10 +3,10 @@
 # Adapted from
 # https://github.com/zai-org/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
+
 import json
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -18,31 +18,39 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import ChatGLMConfig
 
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
-from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GLMAttention(nn.Module):
-
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -52,9 +60,11 @@ class GLMAttention(nn.Module):
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
         self.multi_query_attention = config.multi_query_attention
-        self.total_num_kv_heads = (config.multi_query_group_num
-                                   if config.multi_query_attention else
-                                   config.num_attention_heads)
+        self.total_num_kv_heads = (
+            config.multi_query_group_num
+            if config.multi_query_attention
+            else config.num_attention_heads
+        )
         if self.total_num_kv_heads >= tp_size:
             # Number of KV heads is greater than TP size, so we partition
             # the KV heads across multiple tensor parallel GPUs.
@@ -99,13 +109,15 @@ class GLMAttention(nn.Module):
             base=10000 * rope_ratio,
             is_neox_style=is_neox_style,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -131,7 +143,7 @@ class GLMMLP(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -177,31 +189,33 @@ class GLMBlock(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.apply_residual_connection_post_layernorm = (
-            config.apply_residual_connection_post_layernorm)
+            config.apply_residual_connection_post_layernorm
+        )
 
         self.fp32_residual_connection = config.fp32_residual_connection
 
         layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
         # Layernorm on the input data.
-        self.input_layernorm = layer_norm_func(config.hidden_size,
-                                               eps=config.layernorm_epsilon)
+        self.input_layernorm = layer_norm_func(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
 
         # Self attention.
-        self.self_attention = GLMAttention(config,
-                                           cache_config,
-                                           quant_config,
-                                           prefix=f"{prefix}.self_attention")
+        self.self_attention = GLMAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attention"
+        )
         self.hidden_dropout = config.hidden_dropout
 
         # Layernorm on the attention output
         self.post_attention_layernorm = layer_norm_func(
-            config.hidden_size, eps=config.layernorm_epsilon)
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
 
         # MLP
         self.mlp = GLMMLP(config, quant_config, prefix=f"{prefix}.mlp")
@@ -248,8 +262,8 @@ class GLMTransformer(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -261,8 +275,7 @@ class GLMTransformer(nn.Module):
         # Transformer layers.
         self.start_layer, self.end_layer, self.layers = make_layers(
             self.num_layers,
-            lambda prefix: GLMBlock(
-                config, cache_config, quant_config, prefix=prefix),
+            lambda prefix: GLMBlock(config, cache_config, quant_config, prefix=prefix),
             prefix=f"{prefix}.layers",
         )
 
@@ -270,20 +283,22 @@ class GLMTransformer(nn.Module):
             layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
             # Final layer norm before output.
             self.final_layernorm = layer_norm_func(
-                config.hidden_size, eps=config.layernorm_epsilon)
+                config.hidden_size, eps=config.layernorm_epsilon
+            )
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_ids: torch.Tensor,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states = layer(hidden_states=hidden_states,
-                                  position_ids=position_ids)
+            hidden_states = layer(
+                hidden_states=hidden_states, position_ids=position_ids
+            )
 
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({"hidden_states": hidden_states})
@@ -298,8 +313,10 @@ class GLMTransformer(nn.Module):
 @support_torch_compile
 class ChatGLMModel(nn.Module, SupportsQuant):
     packed_modules_mapping = {
-        "linear_proj.merged_proj":
-        ["linear_proj.gate_proj", "linear_proj.dense_h_to_4h"]
+        "linear_proj.merged_proj": [
+            "linear_proj.gate_proj",
+            "linear_proj.dense_h_to_4h",
+        ]
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -311,43 +328,47 @@ class ChatGLMModel(nn.Module, SupportsQuant):
 
         self.config = config
 
-        self.embedding = VocabParallelEmbedding(config.padded_vocab_size,
-                                                config.hidden_size,
-                                                quant_config=quant_config,
-                                                prefix=f"{prefix}.embedding")
+        self.embedding = VocabParallelEmbedding(
+            config.padded_vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embedding",
+        )
 
         self.num_layers = config.num_layers
         self.multi_query_group_num = config.multi_query_group_num
         self.kv_channels = config.kv_channels
-        self.encoder = GLMTransformer(config,
-                                      cache_config,
-                                      quant_config,
-                                      prefix=f"{prefix}.encoder")
+        self.encoder = GLMTransformer(
+            config, cache_config, quant_config, prefix=f"{prefix}.encoder"
+        )
 
-        self.output_layer = ParallelLMHead(config.padded_vocab_size,
-                                           config.hidden_size,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.output_layer")
+        self.output_layer = ParallelLMHead(
+            config.padded_vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output_layer",
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.encoder.make_empty_intermediate_tensors)
+            self.encoder.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embedding(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -360,8 +381,7 @@ class ChatGLMModel(nn.Module, SupportsQuant):
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("linear_proj.merged_proj", "linear_proj.gate_proj", 0),
@@ -371,7 +391,7 @@ class ChatGLMModel(nn.Module, SupportsQuant):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -392,8 +412,7 @@ class ChatGLMModel(nn.Module, SupportsQuant):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -401,7 +420,8 @@ class ChatGLMModel(nn.Module, SupportsQuant):
 
 class ChatGLMBaseModel(nn.Module):
     hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_substr={".word_embeddings": ""}, )
+        orig_to_new_substr={".word_embeddings": ""},
+    )
 
     def __init__(
         self,
@@ -413,30 +433,31 @@ class ChatGLMBaseModel(nn.Module):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
-        self.lora_config = lora_config
         self.multimodal_config = multimodal_config
 
         self.quant_config = quant_config
-        self.max_position_embeddings = getattr(config, "max_sequence_length",
-                                               8192)
-        self.transformer = transformer_type(vllm_config=vllm_config,
-                                            prefix=maybe_prefix(
-                                                prefix, "transformer"))
+        self.max_position_embeddings = getattr(config, "max_sequence_length", 8192)
+        self.transformer = transformer_type(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         if self.config.tie_word_embeddings:
-            self.transformer.output_layer.weight = (
-                self.transformer.embedding.weight)
+            self.transformer.output_layer.weight = self.transformer.embedding.weight
         self.lm_head = self.transformer.output_layer
         self.logits_processor = LogitsProcessor(config.padded_vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -445,11 +466,10 @@ class ChatGLMBaseModel(nn.Module):
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
-class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
-                         SupportsQuant):
+class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
-        "dense_h_to_4h": ["dense_h_to_4h"]
+        "dense_h_to_4h": ["dense_h_to_4h"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -460,7 +480,8 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
                 "The configuration of this model indicates that it supports "
                 "vision inputs, but you instantiated the text-only version "
                 "of this model. Please use the vision model by setting "
-                f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
+                f"`--hf-overrides '{json.dumps(hf_overrides)}'`"
+            )
 
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
@@ -468,9 +489,10 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index dcab0082287046a4870361f15ba8a6f9bdf7d3f6..5d611deb942d144b3950e7384d29d82554242c65 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,36 +1,89 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Minimal implementation of CLIPVisionModel intended to be only used
-within a vision language model."""
-from collections.abc import Iterable
-from typing import Optional, Union
+from collections.abc import Iterable, Mapping, Sequence
+from functools import cached_property
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionConfig
-
+from transformers import (
+    BatchFeature,
+    CLIPConfig,
+    CLIPProcessor,
+    CLIPTextConfig,
+    CLIPVisionConfig,
+)
+
+from vllm.attention import Attention
 from vllm.attention.layer import MultiHeadAttention
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.conv import Conv2dLayer
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import SupportsQuant
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptIndexTargets,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+from .interfaces_base import default_pooling_type
+from .utils import AutoWeightsLoader, maybe_prefix
+from .vision import (
+    VisionEncoderInfo,
+    VisionFeatureSelectStrategy,
+    VisionFeatureSelectStrategyStr,
+    get_num_selected_vision_tokens,
+    resolve_visual_encoder_outputs,
+)
+
+
+class CLIPImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each image
+        - w: Width of each image
+    """
 
-from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
+    type: Literal["pixel_values"]
+    data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
 
 class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
-
     def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> int:
-        return self.get_patch_grid_length()**2 + 1
+        return self.get_patch_grid_length() ** 2 + 1
 
     def get_image_size(self) -> int:
         return self.vision_config.image_size
@@ -44,9 +97,215 @@ class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
         return image_size // patch_size
 
 
-# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
-class CLIPVisionEmbeddings(nn.Module):
+_POOLING_TYPE_TO_STRATEGY: dict[str, VisionFeatureSelectStrategyStr] = {
+    "MEAN": "full",
+    "ALL": "full",
+    "CLS": "class",
+    # This lets us use the same pooling type for both text and image
+    "LAST": "class",
+}
+
+
+def _get_vision_feature_select_strategy(pooling_type: str):
+    try:
+        return _POOLING_TYPE_TO_STRATEGY[pooling_type]
+    except KeyError:
+        raise ValueError(
+            f"No feature selection strategy is defined for "
+            f"pooling_type: {pooling_type!r}"
+        ) from None
+
+
+class CLIPProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(CLIPConfig)
+
+    def get_vision_encoder_info(self):
+        return CLIPEncoderInfo(self.get_hf_config())
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(CLIPProcessor, **kwargs)
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": 1}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        vision_encoder_info = self.get_vision_encoder_info()
+
+        pooler_config = self.ctx.model_config.pooler_config
+        assert pooler_config is not None
+
+        return get_num_selected_vision_tokens(
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+            _get_vision_feature_select_strategy(pooler_config.pooling_type),
+        )
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+        )
+
+
+class CLIPDummyInputsBuilder(BaseDummyInputsBuilder[CLIPProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
+        }
+
+
+class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
+    @cached_property
+    def image_token_id(self) -> int:
+        tokenizer = self.info.get_tokenizer()
+        dummy_token_id = 0
+
+        assert dummy_token_id not in tokenizer.all_special_ids
+
+        return dummy_token_id
 
+    def apply(
+        self,
+        prompt: str | list[int],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        *,
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> MultiModalInputs:
+        if prompt and mm_data:
+            raise ValueError(
+                "CLIP accepts text-only or image-only inputs, not both! "
+                "Image-only inputs means passing an image with an empty text "
+                "prompt."
+            )
+
+        if mm_data:
+            # For multi-modal data, the prompt after processing should
+            # only contain the dummy image tokens
+            tokenization_kwargs = {
+                **(tokenization_kwargs or {}),
+                "add_special_tokens": False,
+            }
+
+        return super().apply(
+            prompt=prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
+
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        return False
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values=MultiModalFieldConfig.batched("image"))
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        image_token_id = self.image_token_id
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+
+            num_image_tokens = self.info.get_num_image_tokens(
+                image_width=image_size.width,
+                image_height=image_size.height,
+            )
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=PromptIndexTargets.start(),
+                replacement=get_replacement,
+            ),
+        ]
+
+
+# Adapted from: https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/models/clip/modeling_clip.py
+class CLIPTextEmbeddings(nn.Module):
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__()
+
+        embed_dim = config.hidden_size
+
+        self.token_embedding = VocabParallelEmbedding(config.vocab_size, embed_dim)
+        self.position_embedding = VocabParallelEmbedding(
+            config.max_position_embeddings, embed_dim
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError(
+                    "Either `input_ids` or `input_embeds` must be provided"
+                )
+
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
+
+
+class CLIPVisionEmbeddings(nn.Module):
     def __init__(self, config: CLIPVisionConfig):
         super().__init__()
         self.config = config
@@ -57,7 +316,7 @@ class CLIPVisionEmbeddings(nn.Module):
 
         self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
 
-        self.patch_embedding = nn.Conv2d(
+        self.patch_embedding = Conv2dLayer(
             in_channels=config.num_channels,
             out_channels=self.embed_dim,
             kernel_size=self.patch_size,
@@ -65,19 +324,21 @@ class CLIPVisionEmbeddings(nn.Module):
             bias=False,
         )
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
-        self.position_embedding = nn.Embedding(self.num_positions,
-                                               self.embed_dim)
-        self.register_buffer("position_ids",
-                             torch.arange(self.num_positions).expand((1, -1)),
-                             persistent=False)
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
 
     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(
-            dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
 
         class_embeds = self.class_embedding.expand(batch_size, 1, -1)
@@ -88,15 +349,16 @@ class CLIPVisionEmbeddings(nn.Module):
 
 
 class CLIPAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
     def __init__(
         self,
-        config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
         prefix: str = "",
-    ):
+        attn_cls: type[Attention] | type[MultiHeadAttention],
+    ) -> None:
         super().__init__()
+
         self.config = config
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -105,7 +367,8 @@ class CLIPAttention(nn.Module):
             raise ValueError(
                 "embed_dim must be divisible by num_heads "
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads}).")
+                f" {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
 
         self.qkv_proj = QKVParallelLinear(
@@ -126,8 +389,12 @@ class CLIPAttention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = attn_cls(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -144,26 +411,29 @@ class CLIPAttention(nn.Module):
 
 
 class CLIPMLP(nn.Module):
-
     def __init__(
         self,
-        config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -174,29 +444,26 @@ class CLIPMLP(nn.Module):
 
 
 class CLIPEncoderLayer(nn.Module):
-
     def __init__(
         self,
-        config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
         prefix: str = "",
+        attn_cls: type[Attention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
         self.self_attn = CLIPAttention(
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
+            attn_cls=attn_cls,
         )
-        self.layer_norm1 = nn.LayerNorm(config.hidden_size,
-                                        eps=config.layer_norm_eps)
-        self.mlp = CLIPMLP(config,
-                           quant_config=quant_config,
-                           prefix=f"{prefix}.mlp")
-        self.layer_norm2 = nn.LayerNorm(config.hidden_size,
-                                        eps=config.layer_norm_eps)
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = CLIPMLP(config, quant_config=quant_config, prefix=f"{prefix}.mlp")
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
@@ -222,10 +489,12 @@ class CLIPEncoder(nn.Module):
 
     def __init__(
         self,
-        config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        num_hidden_layers_override: Optional[int] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        num_hidden_layers_override: int | None = None,
+        *,
         prefix: str = "",
+        attn_cls: type[Attention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -235,16 +504,23 @@ class CLIPEncoder(nn.Module):
             num_hidden_layers = config.num_hidden_layers
         else:
             num_hidden_layers = num_hidden_layers_override
-        self.layers = nn.ModuleList([
-            CLIPEncoderLayer(config=config,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.layers.{layer_idx}")
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                CLIPEncoderLayer(
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    attn_cls=attn_cls,
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(
-        self, inputs_embeds: torch.Tensor, return_all_hidden_states: bool
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        self,
+        inputs_embeds: torch.Tensor,
+        return_all_hidden_states: bool,
+    ) -> torch.Tensor | list[torch.Tensor]:
         hidden_states_pool = [inputs_embeds]
         hidden_states = inputs_embeds
 
@@ -259,15 +535,92 @@ class CLIPEncoder(nn.Module):
         return hidden_states
 
 
-class CLIPVisionTransformer(nn.Module):
+class CLIPTextTransformer(nn.Module):
+    def __init__(
+        self,
+        config: CLIPTextConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
 
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = CLIPTextEmbeddings(config)
+
+        self.encoder = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+            attn_cls=Attention,
+        )
+
+        self.final_layer_norm = nn.LayerNorm(
+            embed_dim,
+            eps=config.layer_norm_eps,
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings.token_embedding(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+        )
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            return_all_hidden_states=False,
+        )
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        return last_hidden_state
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class CLIPVisionTransformer(nn.Module):
     def __init__(
         self,
         config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -286,6 +639,7 @@ class CLIPVisionTransformer(nn.Module):
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
+            attn_cls=MultiHeadAttention,
         )
 
         num_hidden_layers = config.num_hidden_layers
@@ -300,73 +654,47 @@ class CLIPVisionTransformer(nn.Module):
             require_post_norm = len(self.encoder.layers) == num_hidden_layers
 
         if require_post_norm:
-            self.post_layernorm = nn.LayerNorm(embed_dim,
-                                               eps=config.layer_norm_eps)
+            self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         else:
             self.post_layernorm = None
 
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
     def forward(
         self,
         pixel_values: torch.Tensor,
-        feature_sample_layers: Optional[list[int]] = None,
+        *,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
-
         hidden_states = self.embeddings(pixel_values)
         hidden_states = self.pre_layrnorm(hidden_states)
 
-        return_all_hidden_states = feature_sample_layers is not None
-
         # Produces either the last layer output or all of the hidden states,
-        # depending on if we have feature_sample_layers or not
+        # depending on if we have select_layers or not
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
-            return_all_hidden_states=return_all_hidden_states)
+            return_all_hidden_states=select_layers is not None,
+        )
 
         # Handle post-norm (if applicable) and stacks feature layers if needed
         encoder_outputs = resolve_visual_encoder_outputs(
-            encoder_outputs, feature_sample_layers, self.post_layernorm,
-            self.config.num_hidden_layers)
+            encoder_outputs,
+            self.post_layernorm,
+            select_layers=select_layers,
+            max_possible_layers=self.config.num_hidden_layers,
+            feature_select_strategy=feature_select_strategy,
+        )
 
         return encoder_outputs
 
-
-class CLIPVisionModel(nn.Module, SupportsQuant):
-    config_class = CLIPVisionConfig
-    main_input_name = "pixel_values"
-    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
-
-    def __init__(
-        self,
-        config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.vision_model = CLIPVisionTransformer(
-            config=config,
-            quant_config=quant_config,
-            num_hidden_layers_override=num_hidden_layers_override,
-            require_post_norm=require_post_norm,
-            prefix=f"{prefix}.vision_model")
-
-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        feature_sample_layers: Optional[list[int]] = None,
-    ) -> torch.Tensor:
-        return self.vision_model(pixel_values, feature_sample_layers)
-
-    @property
-    def device(self):
-        return next(self.parameters()).device
-
-    # (TODO) Add prefix argument for filtering out weights to be loaded
-    #        ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -375,21 +703,20 @@ class CLIPVisionModel(nn.Module, SupportsQuant):
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
-        layer_count = len(self.vision_model.encoder.layers)
+        layer_count = len(self.encoder.layers)
 
         for name, loaded_weight in weights:
             # post_layernorm is not needed in CLIPVisionModel
-            if (name.startswith("vision_model.post_layernorm")
-                    and self.vision_model.post_layernorm is None):
+            if name.startswith("post_layernorm") and self.post_layernorm is None:
                 continue
 
             # omit layers when num_hidden_layers_override is set
-            if name.startswith("vision_model.encoder.layers"):
-                layer_idx = int(name.split(".")[3])
+            if name.startswith("encoder.layers"):
+                layer_idx = int(name.split(".")[2])
                 if layer_idx >= layer_count:
                     continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -400,8 +727,239 @@ class CLIPVisionModel(nn.Module, SupportsQuant):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
+
+
+class CLIPVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.vision_model = CLIPVisionTransformer(
+            config=config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_override,
+            require_post_norm=require_post_norm,
+            prefix=f"{prefix}.vision_model",
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
+    ) -> torch.Tensor:
+        return self.vision_model(
+            pixel_values,
+            select_layers=select_layers,
+            feature_select_strategy=feature_select_strategy,
+        )
+
+    @property
+    def dtype(self):
+        return self.vision_model.dtype
+
+    @property
+    def device(self):
+        return self.vision_model.device
+
+
+# Assume EOS token corresponds to LAST token in text model
+@default_pooling_type("LAST")
+@MULTIMODAL_REGISTRY.register_processor(
+    CLIPMultiModalProcessor,
+    info=CLIPProcessingInfo,
+    dummy_inputs=CLIPDummyInputsBuilder,
+)
+class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
+    is_pooling_model = True
+
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
+    merge_by_field_config = True
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return None
+
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: CLIPConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = CLIPTextTransformer(
+            text_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "text_model"),
+        )
+        self.vision_model = CLIPVisionTransformer(
+            vision_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "vision_model"),
+        )
+
+        self.visual_projection = nn.Linear(
+            self.vision_embed_dim,
+            self.projection_dim,
+            bias=False,
+        )
+        self.text_projection = nn.Linear(
+            self.text_embed_dim,
+            self.projection_dim,
+            bias=False,
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        self.pooler_config = pooler_config
+
+        self.pooler = DispatchPooler(
+            {
+                "token_embed": Pooler.for_token_embed(pooler_config),
+                "embed": Pooler.for_embed(pooler_config),
+            }
+        )
+
+        # Assumes that self.forward is called after self.embed_input_ids
+        self._is_text_input = True
+
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor | None,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        pooled_output = self.text_model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+        )
+
+        text_features = self.text_projection(pooled_output)
+
+        return text_features
+
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
+    ) -> torch.Tensor:
+        if feature_select_strategy is None:
+            feature_select_strategy = _get_vision_feature_select_strategy(
+                self.pooler_config.pooling_type
+            )
+
+        pooled_output = self.vision_model(
+            pixel_values=pixel_values,
+            select_layers=None,
+            feature_select_strategy=feature_select_strategy,
+        )
+
+        image_features = self.visual_projection(pooled_output)
+
+        return image_features
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> CLIPImagePixelInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        if pixel_values is None:
+            return None
+
+        expected_h = expected_w = self.config.vision_config.image_size
+        return CLIPImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            resolve_bindings={"h": expected_h, "w": expected_w},
+        )
+
+    def _process_image_inputs(self, inputs: CLIPImagePixelInputs) -> torch.Tensor:
+        pixel_values = inputs["data"]
+
+        return self.get_image_features(pixel_values)
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.text_model
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
+    ) -> torch.Tensor:
+        self._is_text_input = (
+            multimodal_embeddings is None or len(multimodal_embeddings) == 0
+        )
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return []
+
+        vision_embeddings = self._process_image_inputs(image_input)
+        return vision_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        if intermediate_tensors is not None:
+            raise RuntimeError("PP is not supported for this model")
+
+        # Multimodal inputs
+        if not self._is_text_input:
+            return inputs_embeds
+
+        # Text inputs
+        return self.get_text_features(
+            input_ids=input_ids, position_ids=positions, inputs_embeds=inputs_embeds
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        loader = AutoWeightsLoader(
+            self,
+            skip_substrs=[".position_ids"],
+            ignore_unexpected_prefixes=["logit_scale."],
+        )
+
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index 6d67eb68d51a8f186fbd13d4f40628ccb411ed0e..139ccba9df6d8ad57b25355cf920614e165f3e46 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -4,41 +4,51 @@
 """Command-A-Vision (Cohere2Vision) multimodal model implementation for vLLM."""
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 from transformers.models.cohere2_vision import Cohere2VisionConfig
 from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import (  # noqa: E501
-    get_optimal_tiled_canvas)
+    get_optimal_tiled_canvas,
+)
 from transformers.models.cohere2_vision.processing_cohere2_vision import (
-    Cohere2VisionProcessor)
+    Cohere2VisionProcessor,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import MulAndSilu
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalFieldConfig,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalFieldConfig,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 
 class Cohere2VisionImagePixelInputs(TensorSchema):
@@ -67,7 +77,7 @@ class Cohere2VisionImagePixelInputs(TensorSchema):
 
 class Cohere2VisionMultiModalProjector(nn.Module):
     """Multimodal projector that maps vision features to text embedding space.
-    
+
     Uses pixel shuffle downsampling followed by SwiGLU activation.
     """
 
@@ -76,8 +86,7 @@ class Cohere2VisionMultiModalProjector(nn.Module):
         self.downsample_factor = config.downsample_factor
 
         # Input dimension after pixel shuffle downsampling
-        input_dim = config.vision_config.hidden_size * (
-            config.downsample_factor**2)
+        input_dim = config.vision_config.hidden_size * (config.downsample_factor**2)
         # MergedColumnParallelLinear expects the intermediate size to be a list
         # of sizes, so that it will load the weights as two separate linear
         # layers before applying any parallelism.
@@ -110,28 +119,26 @@ class Cohere2VisionMultiModalProjector(nn.Module):
 
     def pixel_shuffle(self, image_features: torch.Tensor) -> torch.Tensor:
         """Apply pixel shuffle downsampling to reduce spatial dimensions.
-        
+
         Args:
             image_features: Input tensor of shape [B, S, D] where S = H*W
-            
+
         Returns:
             Downsampled tensor with increased channel dimension
         """
-        height = width = int(image_features.shape[1]**0.5)
+        height = width = int(image_features.shape[1] ** 0.5)
         x = image_features.reshape(image_features.shape[0], width, height, -1)
         n, h, w, c = x.size()
-        scale_factor = 1. / self.downsample_factor
+        scale_factor = 1.0 / self.downsample_factor
         nh = int(h * scale_factor)
         nw = int(w * scale_factor)
-        x = x.reshape(n, nh, self.downsample_factor, nw,
-                      self.downsample_factor, c)
+        x = x.reshape(n, nh, self.downsample_factor, nw, self.downsample_factor, c)
         x = x.permute(0, 1, 3, 2, 4, 5).contiguous()
         x = x.reshape(n, nh, nw, -1)
         return x
 
 
 class Cohere2VisionProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> Cohere2VisionConfig:
         return self.ctx.get_hf_config(Cohere2VisionConfig)
 
@@ -141,13 +148,13 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object):
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_image_size_with_most_features(self) -> ImageSize:
         image_processor = self.get_image_processor()
-        height = image_processor.size['height']
-        width = image_processor.size['width']
+        height = image_processor.size["height"]
+        width = image_processor.size["width"]
         max_patches = image_processor.max_patches
         return ImageSize(height=height * max_patches, width=width)
 
@@ -156,7 +163,7 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Cohere2VisionProcessor],
+        processor: Cohere2VisionProcessor | None,
     ) -> int:
         """
         Calculate the number of image patches for a given image.
@@ -196,8 +203,8 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
 
 
 class Cohere2VisionDummyInputsBuilder(
-        BaseDummyInputsBuilder[Cohere2VisionProcessingInfo]):
-
+    BaseDummyInputsBuilder[Cohere2VisionProcessingInfo]
+):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -210,22 +217,26 @@ class Cohere2VisionDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
-        image_size = \
-            self.info.get_image_size_with_most_features()
+        image_size = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=image_size.width,
-                                   height=image_size.height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=image_size.width,
+                height=image_size.height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class Cohere2VisionMultiModalProcessor(
-        BaseMultiModalProcessor[Cohere2VisionProcessingInfo]):
-
+    BaseMultiModalProcessor[Cohere2VisionProcessingInfo]
+):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -241,22 +252,26 @@ class Cohere2VisionMultiModalProcessor(
         )
 
         # Ensure num_patches is available for proper tensor splitting
-        if "num_patches" not in processed_outputs and (
-                images := mm_data.get("images")) is not None:
+        if (
+            "num_patches" not in processed_outputs
+            and (images := mm_data.get("images")) is not None
+        ):
             hf_processor = self.info.get_hf_processor(**mm_kwargs)
 
             # Fallback calculation if HF processor didn't provide num_patches
-            parsed_images = self._get_data_parser().parse_mm_data({
-                "image":
-                images
-            }).get_items("image", ImageProcessorItems)
+            parsed_images = (
+                self._get_data_parser()
+                .parse_mm_data({"image": images})
+                .get_items("image", ImageProcessorItems)
+            )
 
             num_patches = [
                 self.info.get_num_patches(
                     image_width=parsed_images.get_image_size(i).width,
                     image_height=parsed_images.get_image_size(i).height,
                     processor=hf_processor,
-                ) for i in range(len(parsed_images))
+                )
+                for i in range(len(parsed_images))
             ]
             processed_outputs["num_patches"] = torch.tensor(num_patches)
 
@@ -269,8 +284,7 @@ class Cohere2VisionMultiModalProcessor(
     ) -> Mapping[str, MultiModalFieldConfig]:
         num_patches = hf_inputs.get("num_patches", torch.empty(0))
         return dict(
-            pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_patches),
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches),
             num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
         )
@@ -297,8 +311,7 @@ class Cohere2VisionMultiModalProcessor(
                 image_height=image_size.height,
                 processor=hf_processor,
             )
-            patch_tokens = (image_token * img_tokens_per_tile +
-                            img_line_break_token)
+            patch_tokens = image_token * img_tokens_per_tile + img_line_break_token
             repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}"
 
             return PromptUpdateDetails.select_text(repl, image_token)
@@ -315,9 +328,10 @@ class Cohere2VisionMultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     Cohere2VisionMultiModalProcessor,
     info=Cohere2VisionProcessingInfo,
-    dummy_inputs=Cohere2VisionDummyInputsBuilder)
-class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                            SupportsPP):
+    dummy_inputs=Cohere2VisionDummyInputsBuilder,
+)
+class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -325,7 +339,8 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.language_model.": "language_model.model.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -337,37 +352,39 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multimodal_config = multimodal_config
         self._patch_quant_config(config, quant_config)
 
-        self.vision_tower = SiglipVisionModel(config.vision_config,
-                                              quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "vision_tower"))
+        self.vision_tower = SiglipVisionModel(
+            config.vision_config,
+            quant_config,
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         self.vocab_size = config.text_config.vocab_size
-        self.multi_modal_projector = \
-            Cohere2VisionMultiModalProjector(
-                config, prefix=maybe_prefix(prefix, "multi_modal_projector"))
+        self.multi_modal_projector = Cohere2VisionMultiModalProjector(
+            config, prefix=maybe_prefix(prefix, "multi_modal_projector")
+        )
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
             prefix=maybe_prefix(prefix, "language_model"),
-            architectures=config.text_config.architectures)
+            architectures=config.text_config.architectures,
+        )
 
     @property
     def dtype(self):
         return next(self.parameters()).dtype
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
-    def _process_image_input(self, image_input: Cohere2VisionImagePixelInputs,
-                             **kwargs) -> list[torch.Tensor]:
+    def _process_image_input(
+        self, image_input: Cohere2VisionImagePixelInputs, **kwargs
+    ) -> list[torch.Tensor]:
         """Process image pixels through vision tower and projector.
-        
+
         Args:
-            image_input: Validated image input containing pixel values and 
+            image_input: Validated image input containing pixel values and
                          patch counts
-            
+
         Returns:
             List of flattened image embeddings, one per image
         """
@@ -383,89 +400,63 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         image_embeds = self.multi_modal_projector(image_features)
 
         # Split and flatten embeddings per image
-        return [
-            e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist())
-        ]
+        return [e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist())]
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Cohere2VisionImagePixelInputs]:
+        self, **kwargs: object
+    ) -> Cohere2VisionImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
-        assert image_embeds is None, \
-            "Cohere2Vision does not support image_embeds."
+        assert image_embeds is None, "Cohere2Vision does not support image_embeds."
 
         if pixel_values is None:
             return None
 
         return Cohere2VisionImagePixelInputs(
             type="pixel_values",
-            pixel_values=flatten_bn(pixel_values, concat=True),
-            num_patches=flatten_bn(num_patches, concat=True),
+            pixel_values=pixel_values,
+            num_patches=num_patches,
             resolve_bindings={
                 "h": self.config.vision_config.image_size,
                 "w": self.config.vision_config.image_size,
-            })
+            },
+        )
 
-    def _patch_quant_config(self, config: PretrainedConfig,
-                            quant_config: QuantizationConfig):
+    def _patch_quant_config(
+        self, config: PretrainedConfig, quant_config: QuantizationConfig
+    ):
         # the awq models from OpenGVLab missing `modules_to_not_convert`
         # patch the quant_config to add `modules_to_not_convert` back
         if isinstance(quant_config, AWQConfig):
             text_config = config.text_config
-            llm_quant_config = getattr(text_config, "quantization_config",
-                                       None)
-            if (not quant_config.modules_to_not_convert) and (llm_quant_config
-                                                              is not None):
+            llm_quant_config = getattr(text_config, "quantization_config", None)
+            if (not quant_config.modules_to_not_convert) and (
+                llm_quant_config is not None
+            ):
                 quant_config.modules_to_not_convert.append("vision_tower")
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input, **kwargs)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                multimodal_embeddings=multimodal_embeddings,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
@@ -477,5 +468,5 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index f3929ef3b59383ab34db783a8cd36a477adb2c51..77bb178519813d863c0c0561533c80f4109fdd4e 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -22,9 +22,9 @@
 
 # This file is based on the LLama model definition file in transformers
 """PyTorch Cohere model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -35,26 +35,33 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name,
-    row_parallel_weight_loader)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+    row_parallel_weight_loader,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
-from .utils import (AutoWeightsLoader, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 @torch.compile(backend=current_platform.simple_compile_backend)
@@ -63,34 +70,31 @@ def layer_norm_func(hidden_states, weight, variance_epsilon):
     hidden_states = hidden_states.to(torch.float32)
     mean = hidden_states.mean(-1, keepdim=True)
     variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
-    hidden_states = (hidden_states - mean) * torch.rsqrt(variance +
-                                                         variance_epsilon)
+    hidden_states = (hidden_states - mean) * torch.rsqrt(variance + variance_epsilon)
     hidden_states = weight.to(torch.float32) * hidden_states
     return hidden_states.to(input_dtype)
 
 
 class LayerNorm(nn.Module):
-
     def __init__(self, param_shape=None, eps=1e-5):
         super().__init__()
         self.weight = nn.Parameter(torch.ones(param_shape))
         self.variance_epsilon = eps
-        set_weight_attrs(self.weight,
-                         {"weight_loader": row_parallel_weight_loader})
+        set_weight_attrs(self.weight, {"weight_loader": row_parallel_weight_loader})
 
     def forward(self, hidden_states, residuals=None):
-        hidden_states = layer_norm_func(hidden_states, self.weight,
-                                        self.variance_epsilon)
+        hidden_states = layer_norm_func(
+            hidden_states, self.weight, self.variance_epsilon
+        )
         return hidden_states, residuals
 
 
 # Copied from transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere
 class CohereMLP(nn.Module):
-
     def __init__(
         self,
-        config: Union[CohereConfig, Cohere2Config],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CohereConfig | Cohere2Config,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -121,12 +125,11 @@ class CohereMLP(nn.Module):
 
 
 class CohereAttention(nn.Module):
-
     def __init__(
         self,
-        config: Union[CohereConfig, Cohere2Config],
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CohereConfig | Cohere2Config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -151,8 +154,8 @@ class CohereAttention(nn.Module):
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
         self.max_position_embeddings = getattr(
-            config, "model_max_length", None) or getattr(
-                config, "max_position_embeddings", 8192)
+            config, "model_max_length", None
+        ) or getattr(config, "max_position_embeddings", 8192)
         self.rope_theta = config.rope_theta
         self.rope_scaling = getattr(config, "rope_scaling", None)
         self.use_qk_norm = getattr(config, "use_qk_norm", False)
@@ -190,21 +193,24 @@ class CohereAttention(nn.Module):
             if config.layer_types[layer_idx] == "sliding_attention":
                 self.sliding_window = config.sliding_window
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              per_layer_sliding_window=self.sliding_window,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=self.sliding_window,
+            prefix=f"{prefix}.attn",
+        )
         if self.use_qk_norm:
-            self.q_norm = LayerNorm(param_shape=(self.num_heads,
-                                                 self.head_dim),
-                                    eps=config.layer_norm_eps)
-            self.k_norm = LayerNorm(param_shape=(self.num_kv_heads,
-                                                 self.head_dim),
-                                    eps=config.layer_norm_eps)
+            self.q_norm = LayerNorm(
+                param_shape=(self.num_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+            self.k_norm = LayerNorm(
+                param_shape=(self.num_kv_heads, self.head_dim),
+                eps=config.layer_norm_eps,
+            )
 
     def _apply_qk_norm(self, q, k):
         q = q.view(*q.shape[:-1], -1, self.head_dim)
@@ -232,31 +238,33 @@ class CohereAttention(nn.Module):
 
 
 class CohereDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: Union[CohereConfig, Cohere2Config],
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: CohereConfig | Cohere2Config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        self.self_attn = CohereAttention(config,
-                                         cache_config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.self_attn")
+        self.self_attn = CohereAttention(
+            config,
+            cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
 
-        self.mlp = CohereMLP(config,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.mlp")
-        self.input_layernorm = LayerNorm(param_shape=(config.hidden_size),
-                                         eps=config.layer_norm_eps)
+        self.mlp = CohereMLP(config, quant_config=quant_config, prefix=f"{prefix}.mlp")
+        self.input_layernorm = LayerNorm(
+            param_shape=(config.hidden_size), eps=config.layer_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         residual = hidden_states
@@ -274,49 +282,50 @@ class CohereDecoderLayer(nn.Module):
 
 @support_torch_compile
 class CohereModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         self.quant_config = quant_config
 
         self.config = config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: CohereDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
-        self.norm = LayerNorm(param_shape=(config.hidden_size),
-                              eps=config.layer_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = LayerNorm(
+            param_shape=(config.hidden_size), eps=config.layer_norm_eps
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -329,15 +338,13 @@ class CohereModel(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -349,14 +356,15 @@ class CohereModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -386,8 +394,7 @@ class CohereModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -412,54 +419,55 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
         # currently all existing command R models have `tie_word_embeddings`
         # enabled
         assert config.tie_word_embeddings
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+
         self.quant_config = quant_config
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size,
-                                                scale=config.logit_scale)
-        self.model = CohereModel(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size, scale=config.logit_scale
+        )
+        self.model = CohereModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
-        is_not_lora = hasattr(self.model.embed_tokens, 'weight')
+    ) -> torch.Tensor | None:
+        is_not_lora = hasattr(self.model.embed_tokens, "weight")
         if is_not_lora:
-            logits = self.logits_processor(self.model.embed_tokens,
-                                           hidden_states)
+            logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         else:
-            logits = self.logits_processor(self.model.embed_tokens.base_layer,
-                                           hidden_states)
+            logits = self.logits_processor(
+                self.model.embed_tokens.base_layer, hidden_states
+            )
 
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
-            self, skip_prefixes=["lm_head", "rotary_emb.inv_freq"])
+            self, skip_prefixes=["lm_head", "rotary_emb.inv_freq"]
+        )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 20e2bf2b28a5a0ffc3c56d2589849c67e353387d..fe66fd0c8c41b0ab88cac00ba3c04260f43393f0 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -1,31 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from copy import deepcopy
+from math import lcm
 from typing import TYPE_CHECKING
 
 import vllm.envs as envs
-from vllm.config.compilation import CUDAGraphMode
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
-from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
+from vllm.platforms import current_platform
+from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
 
 if TYPE_CHECKING:
-
     from vllm.config import VllmConfig
 
 logger = init_logger(__name__)
 
 
 class VerifyAndUpdateConfig:
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         raise NotImplementedError
 
 
 class Gemma3TextModelConfig:
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         hf_config = vllm_config.model_config.hf_config
@@ -33,7 +32,6 @@ class Gemma3TextModelConfig:
 
 
 class GteNewModelConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         config = vllm_config.model_config.hf_config
@@ -49,57 +47,63 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
             "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
             "max_position": config.max_position_embeddings,
             "base": config.rope_theta,
-            "rope_scaling": getattr(config, "rope_scaling", None)
+            "rope_scaling": getattr(config, "rope_scaling", None),
         }
 
 
 class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         pooler_config = vllm_config.model_config.pooler_config
-        if pooler_config.activation is None:
-            pooler_config.activation = False
+        if pooler_config.use_activation is None:
+            pooler_config.use_activation = False
 
 
 class JinaRobertaModelConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        config = model_config.hf_config
 
         if config.position_embedding_type == "rotary":
             assert config.__class__.__name__ == "XLMRobertaFlashConfig"
 
             head_dim = config.hidden_size // config.num_attention_heads
+            max_position = config.max_position_embeddings
+            # Jina-embeddings-v3 has max_position_embeddings=8194, which will cause
+            # out-of-bound index issue at RoPE for long prompts with torch.compile,
+            # because it can't be divided by triton num_warps(default=4 or 8).
+            # To deal with this, we increase max_position to multiple of n_warps,
+            # so that triton kernel won't hit out-of-bound index in RoPE cache.
+            if not model_config.enforce_eager:
+                max_position = round_up(max_position, 8)
+
             config.rotary_kwargs = {
                 "head_size": head_dim,
                 "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
-                "max_position": config.max_position_embeddings,
+                "max_position": max_position,
                 "base": getattr(config, "rope_theta", config.rotary_emb_base),
-                "rope_scaling": getattr(config, "rope_scaling", None)
+                "rope_scaling": getattr(config, "rope_scaling", None),
             }
 
 
 class NomicBertModelConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         config = vllm_config.model_config.hf_config
 
         assert config.__class__.__name__ == "NomicBertConfig"
         assert config.activation_function in ["swiglu", "gelu"]
-        config.position_embedding_type = getattr(config,
-                                                 "position_embedding_type",
-                                                 "rope")
+        config.position_embedding_type = getattr(
+            config, "position_embedding_type", "rope"
+        )
 
         if config.activation_function == "swiglu":
             config.hidden_act = "silu"
         else:
             config.hidden_act = config.activation_function
 
-        assert (config.mlp_fc1_bias == config.mlp_fc2_bias ==
-                config.qkv_proj_bias)
+        assert config.mlp_fc1_bias == config.mlp_fc2_bias == config.qkv_proj_bias
         config.bias = config.qkv_proj_bias
 
         assert config.rotary_emb_scale_base is None
@@ -118,7 +122,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
             "rotary_dim": rotary_emb_dim,
             "max_position": max_trained_positions,
             "base": getattr(config, "rope_theta", config.rotary_emb_base),
-            "rope_scaling": getattr(config, "rope_scaling", None)
+            "rope_scaling": getattr(config, "rope_scaling", None),
         }
 
         # we ignore config.rotary_scaling_factor so that for datasets shorter
@@ -126,15 +130,18 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
         # with SentenceTransformer.
         # The context extension uses vllm style rope_theta and rope_scaling.
         # See #17785 #18755
-        if (not vllm_config.model_config.hf_overrides
-                and vllm_config.model_config.original_max_model_len is None):
+        if (
+            not vllm_config.model_config.hf_overrides
+            and vllm_config.model_config.original_max_model_len is None
+        ):
             # Default
             # Reset max_model_len to max_trained_positions.
             # nomic-embed-text-v2-moe the length is set to 512
             # by sentence_bert_config.json.
             max_model_len_before = vllm_config.model_config.max_model_len
-            max_model_len = min(vllm_config.model_config.max_model_len,
-                                max_trained_positions)
+            max_model_len = min(
+                vllm_config.model_config.max_model_len, max_trained_positions
+            )
 
             vllm_config.recalculate_max_model_len(max_model_len)
             logger.warning(
@@ -142,7 +149,9 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
                 "Changing max_model_len from %s to %s. "
                 "To enable context extension, see: "
                 "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html",
-                max_model_len_before, vllm_config.model_config.max_model_len)
+                max_model_len_before,
+                vllm_config.model_config.max_model_len,
+            )
         else:
             # We need to re-verify max_model_len to avoid lengths
             # greater than position_embedding.
@@ -152,7 +161,8 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
             if isinstance(model_config.hf_overrides, dict):
                 # hf_overrides_kw
                 max_model_len = model_config.hf_overrides.get(
-                    "max_model_len", vllm_config.model_config.max_model_len)
+                    "max_model_len", vllm_config.model_config.max_model_len
+                )
             else:
                 # hf_overrides_fn
                 # This might be overridden by sentence_bert_config.json.
@@ -174,7 +184,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
 
 
 class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         pooler_config = vllm_config.model_config.pooler_config
@@ -184,7 +193,6 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
 
 
 class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         pooler_config = vllm_config.model_config.pooler_config
@@ -194,27 +202,26 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
 
 
 class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         config = vllm_config.model_config.hf_config
 
-        is_original_qwen3_reranker = getattr(config,
-                                             "is_original_qwen3_reranker",
-                                             False)
+        is_original_qwen3_reranker = getattr(
+            config, "is_original_qwen3_reranker", False
+        )
 
         if not is_original_qwen3_reranker:
             return
 
         tokens = getattr(config, "classifier_from_token", None)
-        assert tokens is not None and len(tokens) == 2, \
-            ("Try loading the original Qwen3 Reranker?, see: "
-             "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py")
+        assert tokens is not None and len(tokens) == 2, (
+            "Try loading the original Qwen3 Reranker?, see: "
+            "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py"
+        )
         vllm_config.model_config.hf_config.method = "from_2_way_softmax"
 
 
 class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         config = vllm_config.model_config.hf_config
@@ -225,7 +232,6 @@ class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
 
 
 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         config = vllm_config.model_config.hf_config
@@ -241,40 +247,34 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
             "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
             "max_position": config.max_position_embeddings,
             "base": config.rope_theta,
-            "rope_scaling": getattr(config, "rope_scaling", None)
+            "rope_scaling": getattr(config, "rope_scaling", None),
         }
 
 
 class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
-
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         structured_outputs_config = vllm_config.structured_outputs_config
         if structured_outputs_config.reasoning_parser == "":
             structured_outputs_config.reasoning_parser = "openai_gptoss"
 
-        # Increase the max capture size from 512 to 992 for performance.
+        # Increase the max capture size from 512 to 1024 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 81.
-        scheduler_config = vllm_config.scheduler_config
-        if len(scheduler_config.cuda_graph_sizes) == 1:
-            max_capture_size = scheduler_config.cuda_graph_sizes[0]
-            # FIXME(woosuk): When using full cuda graph with FA3, the max
-            # supported size is 992.
-            if max_capture_size < 992:
-                cuda_graph_sizes = [1, 2, 4]
-                # Step size 8 for small batch sizes
-                cuda_graph_sizes += [i for i in range(8, 256, 8)]
-                # Step size 16 for larger batch sizes
-                cuda_graph_sizes += [i for i in range(256, 993, 16)]
-                scheduler_config.cuda_graph_sizes = cuda_graph_sizes
-                logger.info(
-                    "Overriding max cuda graph capture size to "
-                    "%d for performance.", 992)
+        # from 67 to 83.
+        compilation_config = vllm_config.compilation_config
+        # Only override when the user has not set either of
+        # cudagraph_capture_sizes or max_cudagraph_capture_size.
+        if (
+            compilation_config.cudagraph_capture_sizes is None
+            and compilation_config.max_cudagraph_capture_size is None
+        ):
+            compilation_config.max_cudagraph_capture_size = 1024
+            logger.info(
+                "Overriding max cuda graph capture size to %d for performance.", 1024
+            )
 
 
 class MambaModelConfig(VerifyAndUpdateConfig):
-
     @classmethod
     def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
@@ -284,35 +284,34 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
-
-        if not envs.VLLM_USE_V1:
-            return
-
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
-        compilation_config = vllm_config.compilation_config
 
-        # TODO(tdoublep): remove once prefix caching is enabled
-        cache_config.enable_prefix_caching = False
-        logger.info("Hybrid or mamba-based model detected: disabling prefix "
-                    "caching since it is not yet supported.")
+        if cache_config.mamba_block_size is None:
+            cache_config.mamba_block_size = model_config.max_model_len
 
-        # TODO(tdoublep): remove as full cuda graph support is added
-        FCG_NOT_SUPPORTED_MODELS = [
-            "Lfm2ForCausalLM",
-            "MiniMaxText01ForCausalLM",
-        ]
-
-        if (model_config.architecture not in FCG_NOT_SUPPORTED_MODELS
-                and compilation_config.cudagraph_mode is None):
-            logger.info(
-                "Hybrid or mamba-based model detected: setting cudagraph mode "
-                "to FULL_AND_PIECEWISE in order to optimize performance.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE
+        if cache_config.enable_prefix_caching:
+            if model_config.supports_mamba_prefix_caching:
+                logger.info(
+                    "Warning: Prefix caching is currently enabled. "
+                    "Its support for Mamba layers is experimental. "
+                    "Please report any issues you may observe."
+                )
+            else:
+                logger.info(
+                    "Hybrid or mamba-based model detected without "
+                    "support for prefix caching: disabling."
+                )
+                cache_config.enable_prefix_caching = False
+
+        # TODO(tdoublep): remove once cascade attention is supported
+        logger.info(
+            "Disabling cascade attention since it is not supported for hybrid models."
+        )
+        model_config.disable_cascade_attn = True
 
 
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
-
     @classmethod
     def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
@@ -325,10 +324,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
-
-        if not envs.VLLM_USE_V1:
-            return
-
+        # Save the user input before it gets modified by MambaModelConfig
+        mamba_block_size = vllm_config.cache_config.mamba_block_size
         # Enable FULL_AND_PIECEWISE by default
         MambaModelConfig.verify_and_update_config(vllm_config)
 
@@ -342,11 +339,39 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
             kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
 
         # get attention page size (for 1 token)
-        attn_page_size_1_token = FullAttentionSpec(
-            block_size=1,
-            num_kv_heads=model_config.get_num_kv_heads(parallel_config),
-            head_size=model_config.get_head_size(),
-            dtype=kv_cache_dtype).page_size_bytes
+        # Attention backend constraints:
+        # - FlashAttention (FA) requires block size to be multiple of 16
+        # - MLA (Multi-head Latent Attention) requires larger alignment:
+        #   * CUTLASS_MLA backend: kernel_block_size 128 alignment
+        #   * Other MLA backends: kernel_block_size 64 alignment
+        if model_config.use_mla:
+            use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA"
+            kernel_block_alignment_size = 128 if use_cutlass_mla else 64
+            attn_page_size_1_token = MLAAttentionSpec(
+                block_size=1,
+                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
+                head_size=model_config.get_head_size(),
+                dtype=kv_cache_dtype,
+            ).page_size_bytes
+        else:
+            kernel_block_alignment_size = 16
+            if (
+                current_platform.is_device_capability(100)
+                and model_config.get_head_size() == 256
+                and (
+                    envs.VLLM_ATTENTION_BACKEND is None
+                    or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER"
+                )
+            ):
+                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
+                # head size 256 and block size 16 is not supported on blackwell.
+                kernel_block_alignment_size = 32
+            attn_page_size_1_token = FullAttentionSpec(
+                block_size=1,
+                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
+                head_size=model_config.get_head_size(),
+                dtype=kv_cache_dtype,
+            ).page_size_bytes
 
         model_cls, _ = ModelRegistry.resolve_model_cls(
             model_config.architecture,
@@ -360,27 +385,58 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
             block_size=model_config.max_model_len,
         ).page_size_bytes
 
-        # some attention backends (e.g. FA) only support setting
-        # block size to multiple of 16, so let's suggest a value
-        # that would work (note: FA is currently not compatible
-        # with mamba layers, use FlashInfer instead).
-        attn_block_size = 16 * cdiv(mamba_page_size,
-                                    16 * attn_page_size_1_token)
+        # Model may be marked as is_hybrid
+        #  but mamba is skipped via config,
+        #  return directly
+        if mamba_page_size == 0:
+            return
+
+        if cache_config.enable_prefix_caching:
+            # With prefix caching, select attention block size to
+            # optimize for mamba kernel performance
+
+            # Mamba2 SSD kernel uses a chunk_size, e.g. 256
+            # Align the block to the kernel: use lowest multiple of chunk_size
+            # of attention tokens that would fit mamba_page_size:
+            # e.g. for mamba page size = 788kB
+            #          attn_1_token = 2kB -> fits ~394 tokens
+            #      then round up to a mulitple of 256 -> 512 tokens
+            # End result:
+            #  attn_block_size = 512
+            #  mamba_block_size = 512 (aligned to a multiple of chunk_size)
+            # TODO(tdoublep): this constraint can be relaxed fairly
+            # easily by changing the way we layout chunks in the
+            # mamba2 kernels.
+
+            base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
+            attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
+            chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
+            attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
+            cache_config.mamba_block_size = attn_block_size
+        else:
+            # Without prefix caching, select minimum valid attention block size
+            # to minimize mamba state padding
+
+            # Calculate minimum attention block size that satisfies both:
+            # 1. Backend alignment requirements (kernel_block_alignment_size)
+            # 2. Mamba page size compatibility (attn_page_size >= mamba_page_size)
+            attn_block_size = kernel_block_alignment_size * cdiv(
+                mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
+            )
 
         # override attention block size if either (a) the
         # user has not set it or (b) the user has set it
         # too small.
-        if (cache_config.block_size is None
-                or cache_config.block_size < attn_block_size):
+        if cache_config.block_size is None or cache_config.block_size < attn_block_size:
             cache_config.block_size = attn_block_size
             logger.info(
                 "Setting attention block size to %d tokens "
                 "to ensure that attention page size is >= mamba page size.",
-                attn_block_size)
+                attn_block_size,
+            )
 
         # compute new attention page size
-        attn_page_size = \
-            cache_config.block_size * attn_page_size_1_token
+        attn_page_size = cache_config.block_size * attn_page_size_1_token
 
         assert attn_page_size >= mamba_page_size
 
@@ -389,15 +445,42 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
             return
 
         # pad mamba page size to exactly match attention
-        if (cache_config.mamba_page_size_padded is None
-                or cache_config.mamba_page_size_padded != attn_page_size):
-            cache_config.mamba_page_size_padded = (attn_page_size)
-            mamba_padding_pct = 100 * (attn_page_size -
-                                       mamba_page_size) / mamba_page_size
+        if (
+            cache_config.mamba_page_size_padded is None
+            or cache_config.mamba_page_size_padded != attn_page_size
+        ):
+            cache_config.mamba_page_size_padded = attn_page_size
+            mamba_padding_pct = (
+                100 * (attn_page_size - mamba_page_size) / mamba_page_size
+            )
             logger.info(
                 "Padding mamba page size by %.2f%% to ensure "
                 "that mamba page size and attention page size are "
-                "exactly equal.", mamba_padding_pct)
+                "exactly equal.",
+                mamba_padding_pct,
+            )
+
+
+class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
+        """
+        hf_config = vllm_config.model_config.hf_config
+
+        # Mirror the check in vllm/model_executor/models/deepseek_v2.py
+        is_v32 = hasattr(hf_config, "index_topk")
+        assert is_v32
+
+        # For DeepSeekV3.2, a custom fp8 format is used when fp8 kv-cache is enabled.
+        cache_config = vllm_config.cache_config
+        if cache_config.cache_dtype.startswith("fp8"):
+            cache_config.cache_dtype = "fp8_ds_mla"
+            logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2")
+        if cache_config.cache_dtype == "bfloat16":
+            cache_config.cache_dtype = "auto"
+            logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
 
 
 class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index f863b1da5505c68f05cb75bc12dcc4ff94579692..528ef4f76742df4a509c7c0f133e28da117e5f9e 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -11,25 +10,38 @@ from transformers import DbrxConfig
 
 from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class DbrxRouter(nn.Module):
@@ -40,7 +52,7 @@ class DbrxRouter(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -60,12 +72,11 @@ class DbrxRouter(nn.Module):
 
 
 class DbrxExperts(FusedMoE):
-
     def __init__(
         self,
         config: DbrxConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        params_dtype: Optional[torch.dtype] = None,
+        quant_config: QuantizationConfig | None = None,
+        params_dtype: torch.dtype | None = None,
         prefix: str = "",
     ):
         super().__init__(
@@ -82,12 +93,16 @@ class DbrxExperts(FusedMoE):
         )
         self.config = config
         self.d_model = config.d_model
-        self.intermediate_size = (self.config.ffn_config.ffn_hidden_size //
-                                  self.tp_size)
+        self.intermediate_size = self.config.ffn_config.ffn_hidden_size // self.tp_size
 
     # Define custom weight loader for dbrx model
-    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str, param_name: str):
+    def weight_loader(
+        self,
+        param: nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        param_name: str,
+    ):
         tp_rank = get_tensor_model_parallel_rank()
         param_data = param.data
         shard_size = self.intermediate_size
@@ -111,8 +126,9 @@ class DbrxExperts(FusedMoE):
                     loaded_weight,
                     [-1, self.intermediate_size * self.tp_size, self.d_model],
                 )
-                param_data[:, shard_size:2 *
-                           shard_size, :] = loaded_weight[:, shard, :]
+                param_data[:, shard_size : 2 * shard_size, :] = loaded_weight[
+                    :, shard, :
+                ]
             elif param_name.endswith("weight_scale"):
                 param_data[:, 1] = loaded_weight
             else:
@@ -139,8 +155,8 @@ class DbrxMoE(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        params_dtype: Optional[torch.dtype] = None,
+        quant_config: QuantizationConfig | None = None,
+        params_dtype: torch.dtype | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -151,10 +167,12 @@ class DbrxMoE(nn.Module):
 
         self.router = DbrxRouter(config, self.params_dtype)
 
-        self.experts = DbrxExperts(config=config,
-                                   quant_config=quant_config,
-                                   params_dtype=self.params_dtype,
-                                   prefix=f"{prefix}.experts")
+        self.experts = DbrxExperts(
+            config=config,
+            quant_config=quant_config,
+            params_dtype=self.params_dtype,
+            prefix=f"{prefix}.experts",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         orig_shape = hidden_states.shape
@@ -166,12 +184,11 @@ class DbrxMoE(nn.Module):
 
 
 class DbrxAttention(nn.Module):
-
     def __init__(
         self,
         config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -191,12 +208,14 @@ class DbrxAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.Wqkv",
         )
         self.out_proj = RowParallelLinear(
             self.d_model,
             self.d_model,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -222,13 +241,15 @@ class DbrxAttention(nn.Module):
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -246,20 +267,18 @@ class DbrxAttention(nn.Module):
 
 
 class DbrxFusedNormAttention(nn.Module):
-
     def __init__(
         self,
         config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.d_model = config.d_model
-        self.attn = DbrxAttention(config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.attn")
+        self.attn = DbrxAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.norm_1 = nn.LayerNorm(self.d_model)
         self.norm_2 = nn.LayerNorm(self.d_model)
 
@@ -281,20 +300,17 @@ class DbrxFusedNormAttention(nn.Module):
 
 
 class DbrxBlock(nn.Module):
-
     def __init__(
         self,
         config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.norm_attn_norm = DbrxFusedNormAttention(
-            config,
-            cache_config,
-            quant_config,
-            prefix=f"{prefix}.norm_attn_norm")
+            config, cache_config, quant_config, prefix=f"{prefix}.norm_attn_norm"
+        )
         self.ffn = DbrxMoE(config, quant_config, prefix=f"{prefix}.ffn")
 
     def forward(
@@ -312,7 +328,6 @@ class DbrxBlock(nn.Module):
 
 
 class DbrxModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -327,35 +342,33 @@ class DbrxModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.blocks = make_layers(
             config.n_layers,
-            lambda prefix: DbrxBlock(
-                config, cache_config, quant_config, prefix=prefix),
+            lambda prefix: DbrxBlock(config, cache_config, quant_config, prefix=prefix),
             prefix=f"{prefix}.blocks",
         )
         self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5)
         for module in self.modules():
-            if hasattr(module, "bias") and isinstance(module.bias,
-                                                      nn.Parameter):
+            if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
                 # Remove the bias term in Linear and LayerNorm.
                 module.register_parameter("bias", None)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.d_model))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.d_model
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors
             hidden_states = intermediate_tensors["hidden_states"]
@@ -366,24 +379,27 @@ class DbrxModel(nn.Module):
         hidden_states = self.norm_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        expert_params_mapping = [(
-            "w13" if weight_name in ["w1", "v1"] else "w2",
-            f"mlp.{weight_name}",
-        ) for weight_name in ["w1", "v1", "w2"]]
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        expert_params_mapping = [
+            (
+                "w13" if weight_name in ["w1", "v1"] else "w2",
+                f"mlp.{weight_name}",
+            )
+            for weight_name in ["w1", "v1", "w2"]
+        ]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -409,63 +425,58 @@ class DbrxModel(nn.Module):
                 if name is None:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class DbrxForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         if config.tie_word_embeddings:
-            raise ValueError(
-                "tie_word_embeddings is not supported for Dbrx models.")
+            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
         self.quant_config = quant_config
-        self.unpadded_vocab_size = config.vocab_size
-        self.transformer = DbrxModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(
-                                         prefix, "transformer"))
+
+        self.transformer = DbrxModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         self.lm_head = ParallelLMHead(
             config.vocab_size,
             config.d_model,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e62a57eccc953e6a8d982ba8c24146c1ea5533cd
--- /dev/null
+++ b/vllm/model_executor/models/deepencoder.py
@@ -0,0 +1,673 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from
+# https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from collections.abc import Iterable
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import CLIPVisionConfig
+
+from vllm.attention.layer import MultiHeadAttention
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from .clip import CLIPEncoder, CLIPVisionEmbeddings
+
+
+class MLPBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        mlp_dim: int,
+        act: type[nn.Module] = nn.GELU,
+    ) -> None:
+        super().__init__()
+        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
+        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
+        self.act = act()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.lin2(self.act(self.lin1(x)))
+
+
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+
+
+# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
+class ImageEncoderViT(nn.Module):
+    def __init__(
+        self,
+        img_size: int = 1024,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        out_chans: int = 256,
+        qkv_bias: bool = True,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
+        use_abs_pos: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        global_attn_indexes: tuple[int, ...] = (),
+    ) -> None:
+        """
+        Args:
+            img_size (int): Input image size.
+            patch_size (int): Patch size.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+            depth (int): Depth of ViT.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_abs_pos (bool): If True, use absolute positional embeddings.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks.
+            global_attn_indexes (list): Indexes for blocks using global attention.
+        """  # noqa: E501
+        super().__init__()
+        self.img_size = img_size
+
+        self.patch_embed = PatchEmbed(
+            kernel_size=(patch_size, patch_size),
+            stride=(patch_size, patch_size),
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        self.pos_embed: nn.Parameter | None = None
+        if use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            self.pos_embed = nn.Parameter(
+                torch.zeros(
+                    1, img_size // patch_size, img_size // patch_size, embed_dim
+                )
+            )
+
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            block = Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=window_size if i not in global_attn_indexes else 0,
+                input_size=(img_size // patch_size, img_size // patch_size),
+            )
+            self.blocks.append(block)
+
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                embed_dim,
+                out_chans,
+                kernel_size=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+            nn.Conv2d(
+                out_chans,
+                out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+        )
+
+        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_3 = nn.Conv2d(
+            512, 1024, kernel_size=3, stride=2, padding=1, bias=False
+        )
+
+    def get_abs_pos(self, abs_pos: torch.Tensor, tgt_size: int):
+        dtype = abs_pos.dtype
+
+        src_size = abs_pos.size(1)
+
+        if src_size != tgt_size:
+            old_pos_embed = abs_pos.permute(0, 3, 1, 2)
+            old_pos_embed = old_pos_embed.to(torch.float32)
+            new_pos_embed = F.interpolate(
+                old_pos_embed,
+                size=(tgt_size, tgt_size),
+                mode="bicubic",
+                antialias=True,
+                align_corners=False,
+            ).to(dtype)
+            new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
+            return new_pos_embed
+        else:
+            return abs_pos
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patch_embed(x)
+        if self.pos_embed is not None:
+            x = x + self.get_abs_pos(self.pos_embed, x.size(1))
+
+        for blk in self.blocks:
+            x = blk(x)
+
+        neck_output = self.neck(x.permute(0, 3, 1, 2))
+        conv2_output = self.net_2(neck_output)
+        conv3_output = self.net_3(conv2_output)
+
+        return conv3_output
+
+
+class Block(nn.Module):
+    """Transformer blocks with support of window attention and residual propagation
+    blocks"""
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        input_size: tuple[int, int] | None = None,
+    ) -> None:
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks. If it equals 0, then
+                use global attention.
+            input_size (tuple(int, int) or None): Input resolution for calculating the relative
+                positional parameter size.
+        """  # noqa: E501
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = RelPosAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=input_size if window_size == 0 else (window_size, window_size),
+        )
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = MLPBlock(
+            embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer
+        )
+
+        self.window_size = window_size
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shortcut = x
+        x = self.norm1(x)
+        # Window partition
+        if self.window_size > 0:
+            H, W = x.shape[1], x.shape[2]
+            x, pad_hw = window_partition(x, self.window_size)
+
+        x = self.attn(x)
+        # Reverse window partition
+        if self.window_size > 0:
+            x = window_unpartition(x, self.window_size, pad_hw, (H, W))
+
+        x = shortcut + x
+        x = x + self.mlp(self.norm2(x))
+
+        return x
+
+
+class RelPosAttention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        input_size: tuple[int, int] | None = None,
+    ) -> None:
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool):  If True, add a learnable bias to query, key, value.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            input_size (tuple(int, int) or None): Input resolution for calculating the relative
+                positional parameter size.
+        """  # noqa: E501
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            assert input_size is not None, (
+                "Input size must be provided if using relative positional encoding."
+            )
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, H, W, _ = x.shape
+        # qkv with shape (3, B, nHead, H * W, C)
+        qkv = (
+            self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        )
+        # q, k, v with shape (B * nHead, H * W, C)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
+
+        rel_h, rel_w = None, None
+        if self.use_rel_pos:
+            rel_h, rel_w = add_decomposed_rel_pos(
+                q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)
+            )
+
+        q = q.view(B, self.num_heads, H * W, -1)
+        k = k.view(B, self.num_heads, H * W, -1)
+        v = v.view(B, self.num_heads, H * W, -1)
+
+        if self.use_rel_pos:
+            rel_h = rel_h.view(
+                B, self.num_heads, rel_h.size(1), rel_h.size(2), rel_h.size(3)
+            )
+            rel_w = rel_w.view(
+                B, self.num_heads, rel_w.size(1), rel_w.size(2), rel_w.size(3)
+            )
+            attn_bias = (rel_h + rel_w).view(
+                B, self.num_heads, rel_h.size(2), rel_h.size(3) * rel_w.size(4)
+            )
+            x = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, attn_mask=attn_bias
+            )
+        else:
+            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+
+        x = (
+            x.view(B, self.num_heads, H, W, -1)
+            .permute(0, 2, 3, 1, 4)
+            .reshape(B, H, W, -1)
+        )
+
+        x = self.proj(x)
+
+        return x
+
+
+def window_partition(
+    x: torch.Tensor, window_size: int
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+    """  # noqa: E501
+    B, H, W, C = x.shape
+
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    if pad_h > 0 or pad_w > 0:
+        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+    Hp, Wp = H + pad_h, W + pad_w
+
+    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+    windows = (
+        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    )
+    return windows, (Hp, Wp)
+
+
+def window_unpartition(
+    windows: torch.Tensor,
+    window_size: int,
+    pad_hw: tuple[int, int],
+    hw: tuple[int, int],
+) -> torch.Tensor:
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        window_size (int): window size.
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        hw (Tuple): original height and width (H, W) before padding.
+
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """  # noqa: E501
+    Hp, Wp = pad_hw
+    H, W = hw
+    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+    x = windows.view(
+        B, Hp // window_size, Wp // window_size, window_size, window_size, -1
+    )
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+
+    if Hp > H or Wp > W:
+        x = x[:, :H, :W, :].contiguous()
+    return x
+
+
+def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+    """
+    Get relative positional embeddings according to the relative positions of
+        query and key sizes.
+    Args:
+        q_size (int): size of query q.
+        k_size (int): size of key k.
+        rel_pos (Tensor): relative position embeddings (L, C).
+
+    Returns:
+        Extracted positional embeddings according to relative positions.
+    """
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    # Interpolate rel pos if needed.
+    if rel_pos.shape[0] != max_rel_dist:
+        # Interpolate rel pos.
+        dtype = rel_pos.dtype
+        rel_pos = rel_pos.to(torch.float32)
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+            size=max_rel_dist,
+            mode="linear",
+        ).to(dtype)
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+    else:
+        rel_pos_resized = rel_pos
+
+    # Scale the coords with short length if shapes for q and k are different.
+    q_coords = torch.arange(q_size, device=rel_pos.device)[:, None] * max(
+        k_size / q_size, 1.0
+    )
+    k_coords = torch.arange(k_size, device=rel_pos.device)[None, :] * max(
+        q_size / k_size, 1.0
+    )
+    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
+
+    return rel_pos_resized[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(
+    q: torch.Tensor,
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    q_size: tuple[int, int],
+    k_size: tuple[int, int],
+) -> torch.Tensor:
+    """
+    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
+    Args:
+        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+
+    Returns:
+        attn (Tensor): attention map with added relative positional embeddings.
+    """  # noqa: E501
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    B, _, dim = q.shape
+    r_q = q.reshape(B, q_h, q_w, dim)
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+    rel_h = rel_h.unsqueeze(-1)
+    rel_w = rel_w.unsqueeze(-2)
+    rel_h = rel_h.reshape(B, q_h * q_w, k_h, 1)
+    rel_w = rel_w.reshape(B, q_h * q_w, 1, k_w)
+
+    return rel_h, rel_w
+
+
+class PatchEmbed(nn.Module):
+    """
+    Image to Patch Embedding.
+    """
+
+    def __init__(
+        self,
+        kernel_size: tuple[int, int] = (16, 16),
+        stride: tuple[int, int] = (16, 16),
+        padding: tuple[int, int] = (0, 0),
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ) -> None:
+        """
+        Args:
+            kernel_size (Tuple): kernel size of the projection layer.
+            stride (Tuple): stride of the projection layer.
+            padding (Tuple): padding size of the projection layer.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+        """
+        super().__init__()
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        # B C H W -> B H W C
+        x = x.permute(0, 2, 3, 1)
+        return x
+
+
+# TODO(Isotr0py): use vision_config to build sam model
+def build_sam_vit_b():
+    return _build_sam(
+        encoder_embed_dim=768,
+        encoder_depth=12,
+        encoder_num_heads=12,
+        encoder_global_attn_indexes=[2, 5, 8, 11],
+    )
+
+
+def _build_sam(
+    encoder_embed_dim,
+    encoder_depth,
+    encoder_num_heads,
+    encoder_global_attn_indexes,
+):
+    prompt_embed_dim = 256
+    image_size = 1024
+    vit_patch_size = 16
+    image_encoder = ImageEncoderViT(
+        depth=encoder_depth,
+        embed_dim=encoder_embed_dim,
+        img_size=image_size,
+        mlp_ratio=4,
+        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+        num_heads=encoder_num_heads,
+        patch_size=vit_patch_size,
+        qkv_bias=True,
+        use_rel_pos=True,
+        global_attn_indexes=encoder_global_attn_indexes,
+        window_size=14,
+        out_chans=prompt_embed_dim,
+    )
+    return image_encoder
+
+
+class DeepCLIPVisionEmbeddings(CLIPVisionEmbeddings):
+    def get_abs_pos(self, abs_pos: torch.Tensor, tgt_size: int):
+        # abs_pos: L, C
+        # tgt_size: M
+        # return: M, C
+        dim = abs_pos.size(-1)
+        abs_pos_new = abs_pos.squeeze(0)
+        cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:]
+
+        src_size = int(math.sqrt(abs_pos_new.shape[0] - 1))
+        tgt_size = int(math.sqrt(tgt_size))
+        dtype = abs_pos.dtype
+
+        if src_size != tgt_size:
+            old_pos_embed = (
+                old_pos_embed.view(1, src_size, src_size, dim)
+                .permute(0, 3, 1, 2)
+                .contiguous()
+            )
+            old_pos_embed = old_pos_embed.to(torch.float32)
+            new_pos_embed = F.interpolate(
+                old_pos_embed,
+                size=(tgt_size, tgt_size),
+                mode="bicubic",
+                antialias=True,
+                align_corners=False,
+            ).to(dtype)
+            new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
+            new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)
+            vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)
+            vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)
+            return vision_pos_embed
+        else:
+            return abs_pos
+
+    def forward(
+        self, pixel_values: torch.Tensor, patch_embeds: torch.Tensor | None = None
+    ) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        if patch_embeds is not None:
+            patch_embeds = patch_embeds
+        else:
+            patch_embeds = self.patch_embedding(pixel_values)
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.get_abs_pos(
+            self.position_embedding(self.position_ids), embeddings.size(1)
+        )
+        return embeddings
+
+
+class DeepCLIPVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        num_hidden_layers_override: int | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = DeepCLIPVisionEmbeddings(config)
+
+        # NOTE: This typo of "layrnorm" is not fixed on purpose to match
+        # the original transformers code and name of the model weights.
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.transformer = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_override,
+            prefix=f"{prefix}.encoder",
+            attn_cls=MultiHeadAttention,
+        )
+
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.transformer.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.transformer.layers)} layers."
+            )
+
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        patch_embeds: torch.Tensor | None = None,
+        *,
+        select_layers: list[int] | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(pixel_values, patch_embeds)
+        hidden_states = self.pre_layrnorm(hidden_states)
+
+        # Produces either the last layer output or all of the hidden states,
+        # depending on if we have select_layers or not
+        encoder_outputs = self.transformer(
+            inputs_embeds=hidden_states,
+            return_all_hidden_states=select_layers is not None,
+        )
+        return encoder_outputs
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
deleted file mode 100644
index ffc843fe033cccd3b271cf27fa8f2daf5b8793db..0000000000000000000000000000000000000000
--- a/vllm/model_executor/models/deepseek.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only Deepseek model."""
-from collections.abc import Iterable
-from itertools import islice
-from typing import Any, Optional, Union
-
-import torch
-from torch import nn
-from transformers import PretrainedConfig
-
-from vllm.attention import Attention
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.sequence import IntermediateTensors
-
-from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
-
-
-class DeepseekMLP(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: bool = True,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results)
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
-
-
-class DeepseekMoE(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.config = config
-        self.rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.n_routed_experts = config.n_routed_experts
-        self.top_k = config.num_experts_per_tok
-        if self.tp_size > self.n_routed_experts:
-            raise ValueError(
-                f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {self.n_routed_experts}.")
-
-        self.experts = nn.ModuleList([
-            DeepseekMLP(hidden_size=config.hidden_size,
-                        intermediate_size=config.moe_intermediate_size,
-                        hidden_act=config.hidden_act,
-                        quant_config=quant_config,
-                        reduce_results=False)
-            for idx in range(self.n_routed_experts)
-        ])
-        self.pack_params()
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     self.n_routed_experts,
-                                     bias=False,
-                                     quant_config=None)
-
-        if config.n_shared_experts is not None:
-            intermediate_size = (config.moe_intermediate_size *
-                                 config.n_shared_experts)
-            self.shared_experts = DeepseekMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                reduce_results=False,
-            )
-
-    def pack_params(self):
-        w1 = []
-        w2 = []
-        for expert in self.experts:
-            w1.append(expert.gate_up_proj.weight)
-            w2.append(expert.down_proj.weight)
-        self.w1 = torch._utils._flatten_dense_tensors(w1)
-        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
-        for data, param in zip(w1s, w1):
-            param.data = data
-        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
-
-        self.w2 = torch._utils._flatten_dense_tensors(w2)
-        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
-        for data, param in zip(w2s, w2):
-            param.data = data
-
-        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_dim = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_dim)
-        if self.config.n_shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states)
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        topk_weights, topk_ids, _ = fused_topk(
-            hidden_states,
-            router_logits,
-            self.top_k,
-            renormalize=self.config.norm_topk_prob)
-
-        final_hidden_states = fused_experts(hidden_states,
-                                            self.w1,
-                                            self.w2,
-                                            topk_weights,
-                                            topk_ids,
-                                            inplace=True)
-
-        if self.config.n_shared_experts is not None:
-            final_hidden_states = final_hidden_states + shared_output
-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
-
-        return final_hidden_states.view(num_tokens, hidden_dim)
-
-
-class DeepseekAttention(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.total_num_kv_heads = num_kv_heads
-        if self.total_num_kv_heads >= tp_size:
-            # Number of KV heads is greater than TP size, so we partition
-            # the KV heads across multiple tensor parallel GPUs.
-            assert self.total_num_kv_heads % tp_size == 0
-        else:
-            # Number of KV heads is less than TP size, so we replicate
-            # the KV heads across multiple tensor parallel GPUs.
-            assert tp_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
-        self.q_size = self.num_heads * self.head_dim
-        self.kv_size = self.num_kv_heads * self.head_dim
-        self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        self.qkv_proj = QKVParallelLinear(
-            hidden_size,
-            self.head_dim,
-            self.total_num_heads,
-            self.total_num_kv_heads,
-            bias=False,
-            quant_config=quant_config,
-        )
-
-        self.o_proj = RowParallelLinear(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            quant_config=quant_config,
-        )
-
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=max_position_embeddings,
-            base=rope_theta,
-            rope_scaling=rope_scaling,
-        )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
-class DeepseekDecoderLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        layer_idx = extract_layer_index(prefix)
-        self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        self.self_attn = DeepseekAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.self_attn",
-        )
-        if (config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace
-                and layer_idx % config.moe_layer_freq == 0):
-            self.mlp = DeepseekMoE(config=config,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp")
-        else:
-            self.mlp = DeepseekMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=config.intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                prefix=f"{prefix}.mlp",
-            )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-        return hidden_states, residual
-
-
-class DeepseekModel(nn.Module):
-
-    fall_back_to_pt_during_load = False
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-
-        config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
-            config.hidden_size,
-        )
-        self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers,
-            lambda prefix: DeepseekDecoderLayer(
-                config, cache_config, quant_config=quant_config, prefix=prefix
-            ),
-            prefix=f"{prefix}.layers")
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        if get_pp_group().is_first_rank:
-            if inputs_embeds is not None:
-                hidden_states = inputs_embeds
-            else:
-                hidden_states = self.get_input_embeddings(input_ids)
-            residual = None
-        else:
-            hidden_states = intermediate_tensors["hidden_states"]
-            residual = intermediate_tensors["residual"]
-        for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states, residual = layer(positions, hidden_states, residual)
-        if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
-        hidden_states, _ = self.norm(hidden_states, residual)
-        return hidden_states
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_experts." in name)
-                        and name not in params_dict):
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_experts." in name)
-                        and name not in params_dict):
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
-
-class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-    }
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-        self.model = DeepseekModel(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(
-            config.vocab_size,
-            config.hidden_size,
-            quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "lm_head"),
-        )
-        if self.config.tie_word_embeddings:
-            self.lm_head.weight = self.model.embed_tokens.weight
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index ed7e7614800fc2ee6687c5aa9b212df85881b545..3fb04c3b70dd12798309cf70a540f1a67be684f0 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -14,18 +13,26 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer,
-                                                    DeepseekV3ForCausalLM)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.deepseek_v2 import (
+    DeepseekV2DecoderLayer,
+    DeepseekV3ForCausalLM,
+)
+from vllm.utils import init_logger
 
-from .utils import AutoWeightsLoader, maybe_prefix
+from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight
+
+logger = init_logger(__name__)
 
 
 @support_torch_compile
 class DeepseekV2Model(nn.Module):
-
     def __init__(
         self,
         *,
@@ -34,8 +41,7 @@ class DeepseekV2Model(nn.Module):
         start_layer_id: int = 0,
     ) -> None:
         super().__init__()
-        self.config = vllm_config. \
-            speculative_config.draft_model_config.hf_config
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         quant_config = vllm_config.quant_config
         self.vocab_size = self.config.vocab_size
 
@@ -46,12 +52,16 @@ class DeepseekV2Model(nn.Module):
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
 
-        self.layers = nn.ModuleList([
-            DeepseekV2DecoderLayer(
-                vllm_config,
-                prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-            ) for i in range(self.config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                DeepseekV2DecoderLayer(
+                    vllm_config,
+                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                    config=self.config,
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
 
         self.fc = nn.Linear(
             self.config.model.hidden_size * 2,
@@ -59,12 +69,12 @@ class DeepseekV2Model(nn.Module):
             bias=False,
         )
 
-        self.enorm = RMSNorm(self.config.hidden_size,
-                             eps=self.config.rms_norm_eps)
-        self.hnorm = RMSNorm(self.config.hidden_size,
-                             eps=self.config.rms_norm_eps)
-        self.norm = RMSNorm(self.config.hidden_size,
-                            eps=self.config.rms_norm_eps)
+        self.enorm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        self.hnorm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
 
     def forward(
         self,
@@ -75,8 +85,8 @@ class DeepseekV2Model(nn.Module):
         input_embeds = self.embed_tokens(input_ids)
 
         inputs = torch.cat(
-            [self.enorm(input_embeds),
-             self.hnorm(hidden_states)], dim=-1)
+            [self.enorm(input_embeds), self.hnorm(hidden_states)], dim=-1
+        )
         hidden_states = self.fc(inputs)
         residual = None
         for layer in self.layers:
@@ -88,8 +98,7 @@ class DeepseekV2Model(nn.Module):
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states, hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
@@ -104,7 +113,8 @@ class DeepseekV2Model(nn.Module):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
+            num_experts=self.config.n_routed_experts,
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -129,8 +139,9 @@ class DeepseekV2Model(nn.Module):
                 # QKV fusion is optional, fall back to normal
                 # weight loading if it's not enabled
                 # if go with fusion option, then update name
-                if ((param_name == "fused_qkv_a_proj")
-                        and name_mapped not in params_dict):
+                if (
+                    param_name == "fused_qkv_a_proj"
+                ) and name_mapped not in params_dict:
                     continue
                 else:
                     name = name_mapped
@@ -162,8 +173,7 @@ class DeepseekV2Model(nn.Module):
                     break
                 else:
                     # if PP disabled then draft will share embed with target
-                    if get_pp_group().world_size == 1 and \
-                            "embed_tokens." in name:
+                    if get_pp_group().world_size == 1 and "embed_tokens." in name:
                         continue
 
                     # Skip loading extra bias for GPTQ models.
@@ -176,41 +186,51 @@ class DeepseekV2Model(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
-        self.config = vllm_config. \
-            speculative_config.draft_model_config.hf_config
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         quant_config = vllm_config.quant_config
         target_layer_num = vllm_config.model_config.get_num_layers(
-            vllm_config.parallel_config)
-        self.model = DeepseekV2Model(vllm_config=vllm_config,
-                                     prefix="model",
-                                     start_layer_id=target_layer_num)
+            vllm_config.parallel_config
+        )
+        self.model = DeepseekV2Model(
+            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+        )
 
-        self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                      self.config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.config.vocab_size,
-                                                scale=logit_scale)
+        self.logits_processor = LogitsProcessor(
+            self.config.vocab_size, scale=logit_scale
+        )
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = self.config.num_hidden_layers
+        self.set_moe_parameters()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
             raise NotImplementedError(
@@ -221,16 +241,16 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         def transform(inputs):
             name, loaded_weight = inputs
             if "lm_head" not in name:
                 name = "model." + name
+            process_eagle_weight(self, name)
             return name, loaded_weight
 
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 8156bf409ec14c9ef25dfde5f2e43857f8b225eb..e028dc497aa6aa4e8058dc483953440758472463 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -1,130 +1,161 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
-from .deepseek_v2 import (DeepseekV2DecoderLayer,
-                          get_spec_layer_idx_from_weight_name)
+from .deepseek_v2 import (
+    DeepseekV2DecoderLayer,
+    DeepseekV2MixtureOfExperts,
+    DeepseekV2MoE,
+    get_spec_layer_idx_from_weight_name,
+)
 from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
+logger = init_logger(__name__)
 
-class SharedHead(nn.Module):
 
+class SharedHead(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.head = ParallelLMHead(config.vocab_size,
-                                   config.hidden_size,
-                                   quant_config=quant_config)
+        self.head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "head"),
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return self.norm(hidden_states)
 
 
 class DeepSeekMultiTokenPredictorLayer(nn.Module):
-
     def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
         super().__init__()
 
-        config = vllm_config.model_config.hf_config
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        self.config = config
         quant_config = vllm_config.quant_config
 
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.eh_proj = nn.Linear(config.hidden_size * 2,
-                                 config.hidden_size,
-                                 bias=False)
+        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+
+        self.device = current_platform.device_type
 
-        # self.is_v32 = hasattr(config, "index_topk")
-        self.is_v32 = False
+        self.is_v32 = hasattr(config, "index_topk")
         if self.is_v32:
             topk_tokens = config.index_topk
             topk_indices_buffer = torch.empty(
                 vllm_config.scheduler_config.max_num_batched_tokens,
                 topk_tokens,
                 dtype=torch.int32,
-                device="cuda")
+                device=self.device,
+            )
         else:
             topk_indices_buffer = None
-        self.shared_head = SharedHead(config=config, quant_config=quant_config)
-        self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix,
-                                                topk_indices_buffer)
+
+        self.shared_head = SharedHead(
+            config=config, prefix=prefix, quant_config=quant_config
+        )
+        self.mtp_block = DeepseekV2DecoderLayer(
+            vllm_config,
+            prefix,
+            config=self.config,
+            topk_indices_buffer=topk_indices_buffer,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
         assert inputs_embeds is not None
         # masking inputs at position 0, as not needed by MTP
-        inputs_embeds[positions == 0] = 0
+        inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
         inputs_embeds = self.enorm(inputs_embeds)
         previous_hidden_states = self.hnorm(previous_hidden_states)
 
         hidden_states = self.eh_proj(
-            torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
+            torch.cat([inputs_embeds, previous_hidden_states], dim=-1)
+        )
 
-        hidden_states, residual = self.mtp_block(positions=positions,
-                                                 hidden_states=hidden_states,
-                                                 residual=None)
+        hidden_states, residual = self.mtp_block(
+            positions=positions, hidden_states=hidden_states, residual=None
+        )
         hidden_states = residual + hidden_states
         return hidden_states
 
 
 class DeepSeekMultiTokenPredictor(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         self.mtp_start_layer_idx = config.num_hidden_layers
         self.num_mtp_layers = config.num_nextn_predict_layers
         # to map the exact layer index from weights
-        self.layers = torch.nn.ModuleDict({
-            str(idx):
-            DeepSeekMultiTokenPredictorLayer(vllm_config,
-                                             f"{prefix}.layers.{idx}")
-            for idx in range(self.mtp_start_layer_idx,
-                             self.mtp_start_layer_idx + self.num_mtp_layers)
-        })
+
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): DeepSeekMultiTokenPredictorLayer(
+                    vllm_config, f"{prefix}.layers.{idx}"
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        current_step_idx = spec_step_idx % self.num_mtp_layers
         return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
             input_ids,
             positions,
@@ -138,45 +169,68 @@ class DeepSeekMultiTokenPredictor(nn.Module):
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        current_step_idx = (spec_step_idx % self.num_mtp_layers)
-        mtp_layer = self.layers[str(self.mtp_start_layer_idx +
-                                    current_step_idx)]
-        logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       mtp_layer.shared_head(hidden_states))
+        current_step_idx = spec_step_idx % self.num_mtp_layers
+        mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)]
+        logits = self.logits_processor(
+            mtp_layer.shared_head.head, mtp_layer.shared_head(hidden_states)
+        )
         return logits
 
 
-class DeepSeekMTP(nn.Module, SupportsPP):
-
+@support_torch_compile
+class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
-        self.model = DeepSeekMultiTokenPredictor(vllm_config=vllm_config,
-                                                 prefix=maybe_prefix(
-                                                     prefix, "model"))
+        self.model = DeepSeekMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        # Set MoE hyperparameters
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+        self.num_moe_layers = self.config.num_nextn_predict_layers
+        self.num_expert_groups = self.config.n_group
+
+        self.moe_layers = []
+        self.moe_mlp_layers = []
+        example_moe = None
+        for layer in self.model.layers.values():
+            assert isinstance(layer, DeepSeekMultiTokenPredictorLayer)
+            layer = layer.mtp_block
+            assert isinstance(layer, DeepseekV2DecoderLayer)
+            if isinstance(layer.mlp, DeepseekV2MoE):
+                example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
+                self.moe_layers.append(layer.mlp.experts)
+        self.extract_moe_parameters(example_moe)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, hidden_states,
-                                   inputs_embeds, spec_step_idx)
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.model.compute_logits(hidden_states, spec_step_idx)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
@@ -188,7 +242,8 @@ class DeepSeekMTP(nn.Module, SupportsPP):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
+            num_experts=self.config.n_routed_experts,
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -199,7 +254,7 @@ class DeepSeekMTP(nn.Module, SupportsPP):
             if spec_layer is None:
                 continue
             name = self._rewrite_spec_layer_name(spec_layer, name)
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -209,14 +264,15 @@ class DeepSeekMTP(nn.Module, SupportsPP):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name_mapped = name.replace(weight_name, param_name)
 
                 # QKV fusion is optional, fall back to normal
                 # weight loading if it's not enabled
-                if ((param_name == "fused_qkv_a_proj")
-                        and name_mapped not in params_dict):
+                if (
+                    param_name == "fused_qkv_a_proj"
+                ) and name_mapped not in params_dict:
                     continue
                 else:
                     name = name_mapped
@@ -238,26 +294,35 @@ class DeepSeekMTP(nn.Module, SupportsPP):
 
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
 
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
                     # According to DeepSeek-V3 Technical Report, MTP modules
                     # shares embedding layer. We only load the first weights.
-                    if (spec_layer != self.model.mtp_start_layer_idx
-                            and ".layers" not in name):
+                    if (
+                        spec_layer != self.model.mtp_start_layer_idx
+                        and ".layers" not in name
+                    ):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -269,7 +334,11 @@ class DeepSeekMTP(nn.Module, SupportsPP):
         and rename shared layer weights to be top level.
         """
         spec_layer_weight_names = [
-            "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head"
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
         ]
         shared_weight_names = ["embed_tokens"]
         spec_layer_weight = False
@@ -282,8 +351,9 @@ class DeepSeekMTP(nn.Module, SupportsPP):
                 break
         if not spec_layer_weight:
             # treat rest weights as weights for transformer layer block
-            name = name.replace(f"model.layers.{spec_layer}.",
-                                f"model.layers.{spec_layer}.mtp_block.")
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
         elif shared_weight:
             # treat shared weights as top level weights
             name = name.replace(f"model.layers.{spec_layer}.", "model.")
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..8179f916ff41710df8c2aebc4496c40ea3dc403a
--- /dev/null
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -0,0 +1,593 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Deepseek-OCR model compatible with HuggingFace weights."""
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Literal
+
+import torch
+import torch.nn as nn
+from transformers import BatchFeature, CLIPVisionConfig
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
+from vllm.transformers_utils.processors.deepseek_ocr import (
+    BASE_SIZE,
+    CROP_MODE,
+    IMAGE_SIZE,
+    DeepseekOCRProcessor,
+    count_tiles,
+)
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.sample.logits_processor import (
+    AdapterLogitsProcessor,
+    RequestLogitsProcessor,
+)
+
+from .deepencoder import DeepCLIPVisionTransformer, build_sam_vit_b
+from .deepseek_vl2 import MlpProjector
+
+# The image token id may be various
+_IMAGE_TOKEN = "<image>"
+
+
+class DeepseekOCRImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - n: Number of images
+        - p: Number of patches
+        - base_size: Base size of the processor
+        - image_size: Image size of the processor
+    """
+
+    type: Literal["pixel_values"]
+    data: Annotated[
+        torch.Tensor,
+        TensorShape("bn", 3, "base_size", "base_size", dynamic_dims={"bnp"}),
+    ]
+    images_crop: Annotated[
+        torch.Tensor,
+        TensorShape("bnp", 3, "image_size", "image_size", dynamic_dims={"bnp"}),
+    ]
+    images_spatial_crop: Annotated[torch.Tensor, TensorShape("bn", 2)]
+
+
+class NoRepeatNGramLogitsProcessor:
+    def __init__(
+        self,
+        ngram_size: int,
+        window_size: int,
+        whitelist_token_ids: set[int] | None = None,
+    ):
+        self.ngram_size = ngram_size
+        self.window_size = window_size
+        self.whitelist_token_ids = whitelist_token_ids or set()
+
+    def __call__(
+        self,
+        output_ids: list[int],
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        if len(output_ids) < self.ngram_size:
+            return logits
+
+        current_prefix = tuple(output_ids[-(self.ngram_size - 1) :])
+
+        search_start = max(0, len(output_ids) - self.window_size)
+        search_end = len(output_ids) - self.ngram_size + 1
+
+        banned_tokens = set()
+        for i in range(search_start, search_end):
+            ngram = tuple(output_ids[i : i + self.ngram_size])
+            if ngram[:-1] == current_prefix:
+                banned_tokens.add(ngram[-1])
+
+        banned_tokens = banned_tokens - self.whitelist_token_ids
+
+        if banned_tokens:
+            logits[list(banned_tokens)] = -float("inf")
+
+        return logits
+
+
+class NGramPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of overriding the wrapper class `__init__()` in order to utilize
+    info about the device type"""
+
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        ngram_size = params.extra_args and params.extra_args.get("ngram_size")
+        window_size = params.extra_args and params.extra_args.get("window_size", 100)
+        whitelist_token_ids = params.extra_args and params.extra_args.get(
+            "whitelist_token_ids", None
+        )
+        # if ngram_size is not provided, skip validation because the processor
+        # will not be used.
+        if ngram_size is None:
+            return None
+
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(
+                f"`ngram_size` has to be a strictly positive integer, got {ngram_size}."
+            )
+        if not isinstance(window_size, int) or window_size <= 0:
+            raise ValueError(
+                "`window_size` has to be a strictly positive integer, "
+                f"got {window_size}."
+            )
+        if whitelist_token_ids is not None and not isinstance(
+            whitelist_token_ids, Iterable
+        ):
+            raise ValueError(
+                "`whitelist_token_ids` has to be a sequence of integers, "
+                f"got {whitelist_token_ids}."
+            )
+
+    def is_argmax_invariant(self) -> bool:
+        return False
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> RequestLogitsProcessor | None:
+        ngram_size = params.extra_args and params.extra_args.get("ngram_size")
+        window_size = params.extra_args and params.extra_args.get("window_size", 100)
+        whitelist_token_ids = params.extra_args and params.extra_args.get(
+            "whitelist_token_ids", None
+        )
+        if ngram_size is None:
+            return None
+
+        whitelist_token_ids = set(whitelist_token_ids) if whitelist_token_ids else None
+        return NoRepeatNGramLogitsProcessor(
+            ngram_size=ngram_size,
+            window_size=window_size,
+            whitelist_token_ids=whitelist_token_ids,
+        )
+
+
+class DeepseekOCRProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(DeepseekVLV2Config)
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(DeepseekOCRProcessor, **kwargs)
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": None}
+
+    def get_num_image_tokens(
+        self, *, image_width: int, image_height: int, cropping: bool = True
+    ) -> int:
+        image_size = IMAGE_SIZE
+        base_size = BASE_SIZE
+        patch_size = 16
+        downsample_ratio = 4
+
+        if CROP_MODE:
+            if image_width <= 640 and image_height <= 640:
+                crop_ratio = [1, 1]
+            else:
+                # find the closest aspect ratio to the target
+                crop_ratio = count_tiles(
+                    image_width, image_height, image_size=IMAGE_SIZE
+                )
+
+            num_width_tiles, num_height_tiles = crop_ratio
+        else:
+            num_width_tiles = num_height_tiles = 1
+
+        h = w = math.ceil((base_size // patch_size) / downsample_ratio)
+
+        h2 = w2 = math.ceil((image_size // patch_size) / downsample_ratio)
+
+        global_views_tokens = h * (w + 1)
+        if num_width_tiles > 1 or num_height_tiles > 1:
+            local_views_tokens = (num_height_tiles * h2) * (num_width_tiles * w2 + 1)
+        else:
+            local_views_tokens = 0
+
+        return global_views_tokens + local_views_tokens + 1
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        if IMAGE_SIZE == 1024 and BASE_SIZE == 1280:
+            return ImageSize(width=1024 * 2, height=1024 * 2)
+        return ImageSize(width=640 * 2, height=640 * 2)
+
+
+class DeepseekOCRDummyInputsBuilder(BaseDummyInputsBuilder[DeepseekOCRProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        max_image_size = self.info.get_image_size_with_most_features()
+
+        return {
+            "image": self._get_dummy_images(
+                width=max_image_size.width,
+                height=max_image_size.height,
+                num_images=num_images,
+            )
+        }
+
+
+class DeepseekOCRMultiModalProcessor(
+    BaseMultiModalProcessor[DeepseekOCRProcessingInfo]
+):
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            processed_outputs = self.info.ctx.call_hf_processor(
+                self.info.get_hf_processor(**mm_kwargs),
+                dict(prompt=prompt, **mm_data),
+                mm_kwargs,
+            )
+
+        else:
+            tokenizer = self.info.get_tokenizer()
+            processed_outputs = tokenizer(
+                prompt, add_special_tokens=True, return_tensors="pt"
+            )
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        images_spatial_crop = hf_inputs.get("images_spatial_crop", torch.empty((0, 2)))
+        is_tiled = (images_spatial_crop[:, 0] > 1) | (images_spatial_crop[:, 1] > 1)
+        patches_per_image = torch.where(is_tiled, images_spatial_crop.prod(dim=-1), 0)
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            images_spatial_crop=MultiModalFieldConfig.batched("image"),
+            images_crop=MultiModalFieldConfig.flat_from_sizes(
+                "image", patches_per_image
+            ),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        image_token_id = hf_processor.image_token_id
+        assert isinstance(image_token_id, int)
+
+        def get_replacement_deepseek_vl2(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                size = images.get_image_size(item_idx)
+
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=size.width,
+                    image_height=size.height,
+                    cropping=CROP_MODE,
+                )
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=get_replacement_deepseek_vl2,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    DeepseekOCRMultiModalProcessor,
+    info=DeepseekOCRProcessingInfo,
+    dummy_inputs=DeepseekOCRDummyInputsBuilder,
+)
+class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # map prefix for language backbone
+            "model.embed_tokens.": "language_model.model.embed_tokens.",
+            "model.layers.": "language_model.model.layers.",
+            "model.norm.": "language_model.model.norm.",
+            "lm_head.": "language_model.lm_head.",
+            # remove "model." prefix for other components
+            "model.": "",
+        }
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<image>"
+
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: DeepseekVLV2Config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.vision_config = config.vision_config
+        self.projector_config = config.projector_config
+        self.text_config = config.text_config
+
+        model_config = vllm_config.model_config
+        tokenizer = cached_tokenizer_from_config(model_config)
+        self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
+
+        self.sam_model = build_sam_vit_b()
+        clip_vision_config = CLIPVisionConfig(
+            hidden_size=1024,
+            intermediate_size=4096,
+            num_attention_heads=16,
+            num_hidden_layers=24,
+            image_size=224,
+            patch_size=14,
+            projection_dim=512,
+            layer_norm_eps=1e-5,
+        )
+        self.vision_model = DeepCLIPVisionTransformer(
+            config=clip_vision_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "vision_model"),
+        )
+
+        self.projector = MlpProjector(self.projector_config)
+        self.tile_tag = config.tile_tag
+        self.global_view_pos = config.global_view_pos
+
+        # special token for image token sequence format
+        n_embed = self.projector_config.n_embed
+        embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
+        if self.tile_tag == "2D":
+            # <|view_separator|>, <|\n|>
+            self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
+            # This is a typo in original implementation
+            self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
+        else:
+            raise ValueError(
+                f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
+            )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=self.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> DeepseekOCRImagePixelInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        images_spatial_crop = kwargs.pop("images_spatial_crop", None)
+        images_crop = kwargs.pop("images_crop", None)
+
+        if pixel_values is None or torch.sum(pixel_values).item() == 0:
+            return None
+
+        if pixel_values is not None:
+            base_size = self.vision_config.image_size
+            return DeepseekOCRImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                images_crop=images_crop,
+                images_spatial_crop=images_spatial_crop,
+                resolve_bindings={
+                    "base_size": base_size,
+                },
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _encode_global_features(self, image_tensor: torch.Tensor) -> torch.Tensor:
+        global_features_1 = self.sam_model(image_tensor)
+        global_features_2 = self.vision_model(image_tensor, global_features_1)
+        features = torch.cat(
+            (
+                global_features_2[:, 1:],
+                global_features_1.flatten(2).permute(0, 2, 1),
+            ),
+            dim=-1,
+        )
+        features = self.projector(features)
+
+        _, hw, dim = features.shape
+        side = int(hw**0.5)
+
+        features = features.view(side, side, dim)
+        newline = self.image_newline[None, None, :].expand(side, 1, dim)
+        features = torch.cat([features, newline], dim=1)
+        return features.view(-1, dim)
+
+    def _encode_local_features(
+        self, patches: torch.Tensor, crop_shape: torch.Tensor
+    ) -> torch.Tensor | None:
+        if torch.sum(patches).item() == 0:
+            return None
+
+        local_features_1 = self.sam_model(patches)
+        local_features_2 = self.vision_model(patches, local_features_1)
+        features = torch.cat(
+            (
+                local_features_2[:, 1:],
+                local_features_1.flatten(2).permute(0, 2, 1),
+            ),
+            dim=-1,
+        )
+        features = self.projector(features)
+
+        _, hw, dim = features.shape
+        patch_side = int(hw**0.5)
+
+        width_tiles = int(crop_shape[0].item())
+        height_tiles = int(crop_shape[1].item())
+
+        features = (
+            features.view(height_tiles, width_tiles, patch_side, patch_side, dim)
+            .permute(0, 2, 1, 3, 4)
+            .reshape(height_tiles * patch_side, width_tiles * patch_side, dim)
+        )
+        newline = self.image_newline[None, None, :].expand(
+            height_tiles * patch_side, 1, dim
+        )
+        features = torch.cat([features, newline], dim=1)
+
+        return features.view(-1, dim)
+
+    def _pixel_values_to_embedding(
+        self,
+        pixel_values: torch.Tensor,
+        images_crop: torch.Tensor,
+        images_spatial_crop: torch.Tensor,
+    ) -> NestedTensors:
+        images_in_this_batch = []
+
+        is_tiled = (images_spatial_crop[:, 0] > 1) | (images_spatial_crop[:, 1] > 1)
+        patches_per_image = torch.where(is_tiled, images_spatial_crop.prod(dim=-1), 0)
+        images_crop = images_crop.split(patches_per_image.tolist())
+        for jdx in range(images_spatial_crop.size(0)):
+            patches = images_crop[jdx]
+            image_ori = pixel_values[[jdx]]
+            crop_shape = images_spatial_crop[jdx]
+
+            global_features = self._encode_global_features(image_ori)
+            local_features = self._encode_local_features(patches, crop_shape)
+
+            if local_features is not None:
+                combined = torch.cat(
+                    [local_features, global_features, self.view_seperator[None, :]],
+                    dim=0,
+                )
+            else:
+                combined = torch.cat(
+                    [global_features, self.view_seperator[None, :]], dim=0
+                )
+
+            images_in_this_batch.append(combined)
+
+        return images_in_this_batch
+
+    def _process_image_input(
+        self, image_input: DeepseekOCRImagePixelInputs
+    ) -> torch.Tensor:
+        pixel_values = image_input.data
+        images_crop = image_input.images_crop
+        images_spatial_crop = image_input.images_spatial_crop.to(dtype=torch.long)
+
+        vision_features = self._pixel_values_to_embedding(
+            pixel_values=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+        )
+
+        return vision_features
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ):
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        hidden_states = self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
+
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+        return autoloaded_weights
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 1cf0468abab22066740e409c39fd0f3a92a6194e..b62a20f3a28a5d0cb356f8247baafb79fc5c3049 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -23,10 +23,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only DeepseekV2/DeepseekV3 model."""
+
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import os
 import re
@@ -34,51 +35,66 @@ import torch
 from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CacheConfig, ParallelConfig, VllmConfig,
-                         get_current_vllm_config)
-from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttention
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
+    per_token_group_quant_fp8,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import cdiv, direct_register_custom_op
 from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits
-from vllm.v1.attention.backends.mla.indexer import (DeepseekV32IndexerBackend,
-                                                    DeepseekV32IndexerMetadata)
+from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backends.mla.indexer import (
+    DeepseekV32IndexerBackend,
+    DeepseekV32IndexerMetadata,
+)
 from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec
 
-from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
-from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
-from vllm import _custom_ops as ops
+from .interfaces import MixtureOfExperts, SupportsEagle, SupportsLoRA, SupportsPP
+from .utils import (
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 if current_platform.is_cuda_alike():
     from vllm import _custom_ops as ops
@@ -88,14 +104,99 @@ elif current_platform.is_xpu():
 logger = init_logger(__name__)
 
 
-class DeepseekV2MLP(nn.Module):
+class DeepseekAttention(nn.Module):
+    """Normal MHA implementation used by Deepseek v1."""
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        config: DeepseekV2Config | DeepseekV3Config,
+        hidden_size: int,
+        num_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
 
+class DeepseekV2MLP(nn.Module):
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         is_sequence_parallel=False,
         prefix: str = "",
@@ -107,21 +208,26 @@ class DeepseekV2MLP(nn.Module):
         # replicated and no collective ops are needed.
         # Otherwise we use standard TP with an allreduce at the end.
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            disable_tp=is_sequence_parallel,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
             quant_config=quant_config,
+            reduce_results=reduce_results,
             disable_tp=is_sequence_parallel,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           disable_tp=is_sequence_parallel,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -132,22 +238,21 @@ class DeepseekV2MLP(nn.Module):
 
 
 class DeepseekV2MoE(nn.Module):
-
     def __init__(
         self,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         parallel_config: ParallelConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
 
-        self.routed_scaling_factor = config.routed_scaling_factor
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
@@ -155,17 +260,22 @@ class DeepseekV2MoE(nn.Module):
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.n_routed_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
-        if config.topk_method == "noaux_tc":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+        if getattr(config, "topk_method", None) == "noaux_tc":
             self.gate.e_score_correction_bias = nn.Parameter(
-                torch.empty(config.n_routed_experts, dtype=torch.float32))
+                torch.empty(config.n_routed_experts, dtype=torch.float32)
+            )
         else:
             self.gate.e_score_correction_bias = None
 
@@ -175,40 +285,19 @@ class DeepseekV2MoE(nn.Module):
 
         self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
-        self.n_physical_experts = (self.n_logical_experts +
-                                   self.n_redundant_experts)
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
 
-        self.physical_expert_start = (self.ep_rank *
-                                      self.n_local_physical_experts)
-        self.physical_expert_end = (self.physical_expert_start +
-                                    self.n_local_physical_experts)
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
 
-        if config.n_shared_experts is None:
-            self.experts = FusedMoE(
-                num_experts=config.n_routed_experts,
-                top_k=config.num_experts_per_tok,
-                hidden_size=config.hidden_size,
-                intermediate_size=config.moe_intermediate_size,
-                reduce_results=False,
-                renormalize=config.norm_topk_prob,
-                quant_config=quant_config,
-                use_grouped_topk=True,
-                num_expert_group=config.n_group,
-                topk_group=config.topk_group,
-                prefix=f"{prefix}.experts",
-                scoring_func=config.scoring_func,
-                # we do scaling outside, set factor to 1.0 to avoid double mul
-                routed_scaling_factor=1.0,
-                e_score_correction_bias=self.gate.e_score_correction_bias,
-                enable_eplb=self.enable_eplb,
-                num_redundant_experts=self.n_redundant_experts,
-                is_sequence_parallel=self.is_sequence_parallel,
-            )
+        self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        if config.n_shared_experts is None or self.is_rocm_aiter_moe_enabled:
             self.shared_experts = None
         else:
-            intermediate_size = (config.moe_intermediate_size *
-                                 config.n_shared_experts)
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
 
             self.shared_experts = DeepseekV2MLP(
                 hidden_size=config.hidden_size,
@@ -220,27 +309,34 @@ class DeepseekV2MoE(nn.Module):
                 prefix=f"{prefix}.shared_experts",
             )
 
-            self.experts = SharedFusedMoE(
-                shared_experts=self.shared_experts,
-                num_experts=config.n_routed_experts,
-                top_k=config.num_experts_per_tok,
-                hidden_size=config.hidden_size,
-                intermediate_size=config.moe_intermediate_size,
-                reduce_results=False,
-                renormalize=config.norm_topk_prob,
-                quant_config=quant_config,
-                use_grouped_topk=True,
-                num_expert_group=config.n_group,
-                topk_group=config.topk_group,
-                prefix=f"{prefix}.experts",
-                scoring_func=config.scoring_func,
-                # we do scaling outside, set factor to 1.0 to avoid double mul
-                routed_scaling_factor=1.0,
-                e_score_correction_bias=self.gate.e_score_correction_bias,
-                enable_eplb=self.enable_eplb,
-                num_redundant_experts=self.n_redundant_experts,
-                is_sequence_parallel=self.is_sequence_parallel,
-            )
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            gate=self.gate,
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=getattr(config, "n_group", 1),
+            topk_group=getattr(config, "topk_group", 1),
+            prefix=f"{prefix}.experts",
+            scoring_func=getattr(config, "scoring_func", "softmax"),
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            # aiter applies routed_scaling_factor internally
+            routed_scaling_factor=1.0
+            if not self.is_rocm_aiter_moe_enabled
+            else self.routed_scaling_factor,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            n_shared_experts=config.n_shared_experts
+            if rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+            else None,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
@@ -253,25 +349,30 @@ class DeepseekV2MoE(nn.Module):
         if self.is_sequence_parallel:
             hidden_states = sequence_parallel_chunk(hidden_states)
 
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        fused_moe_out = self.experts(hidden_states=hidden_states,
-                                     router_logits=router_logits)
-
-        if self.shared_experts is not None:
-            shared_output, final_hidden_states = fused_moe_out
+        if self.experts.is_internal_router:
+            # In this case, the gate/router runs inside the FusedMoE class
+            fused_moe_out = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
         else:
-            shared_output = None
-            final_hidden_states = fused_moe_out
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            fused_moe_out = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
+
+        shared_output, final_hidden_states = fused_moe_out
+        if self.shared_experts is None:
+            assert shared_output is None
 
         # Fix FP16 overflow
         # See DeepseekV2DecoderLayer for more details.
         if hidden_states.dtype != torch.float16:
-            final_hidden_states *= self.routed_scaling_factor
+            if not self.is_rocm_aiter_moe_enabled:
+                final_hidden_states *= self.routed_scaling_factor
         elif self.shared_experts is not None:
             assert shared_output is not None
-            shared_output *= (1. / self.routed_scaling_factor)
+            shared_output *= 1.0 / self.routed_scaling_factor
 
         if self.shared_experts is not None:
             assert shared_output is not None
@@ -279,29 +380,30 @@ class DeepseekV2MoE(nn.Module):
 
         if self.is_sequence_parallel:
             final_hidden_states = tensor_model_parallel_all_gather(
-                final_hidden_states, 0)
+                final_hidden_states, 0
+            )
             final_hidden_states = final_hidden_states[:num_tokens]
         elif self.tp_size > 1:
-            final_hidden_states = (
-                self.experts.maybe_all_reduce_tensor_model_parallel(
-                    final_hidden_states))
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     import math
+
     if scale <= 1:
         return 1.0
     return 0.1 * mscale * math.log(scale) + 1.0
 
 
 class DeepseekV2Attention(nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         hidden_size: int,
         num_heads: int,
         qk_nope_head_dim: int,
@@ -310,11 +412,11 @@ class DeepseekV2Attention(nn.Module):
         q_lora_rank: int,
         kv_lora_rank: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        topk_indices_buffer: Optional[torch.Tensor] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        topk_indices_buffer: torch.Tensor | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -332,60 +434,70 @@ class DeepseekV2Attention(nn.Module):
         self.scaling = self.qk_head_dim**-0.5
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
-        assert topk_indices_buffer is None, "topk_indices_buffer is not \
+        assert topk_indices_buffer is None, (
+            "topk_indices_buffer is not \
         supported for DeepseekV2Attention"
+        )
 
         if self.q_lora_rank is not None:
-            self.q_a_proj = ReplicatedLinear(self.hidden_size,
-                                             self.q_lora_rank,
-                                             bias=False,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.q_a_proj")
-            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
-                                         eps=config.rms_norm_eps)
-            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
-                                                 self.num_heads *
-                                                 self.qk_head_dim,
-                                                 bias=False,
-                                                 quant_config=quant_config,
-                                                 prefix=f"{prefix}.q_b_proj")
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_a_proj",
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
         else:
-            self.q_proj = ColumnParallelLinear(self.hidden_size,
-                                               self.num_heads *
-                                               self.qk_head_dim,
-                                               bias=False,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.q_proj")
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
 
         self.kv_a_proj_with_mqa = ReplicatedLinear(
             self.hidden_size,
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.kv_a_proj_with_mqa")
-        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
-                                      eps=config.rms_norm_eps)
+            prefix=f"{prefix}.kv_a_proj_with_mqa",
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
             self.kv_lora_rank,
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.kv_b_proj")
+            prefix=f"{prefix}.kv_b_proj",
+        )
         # O projection.
-        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
-                                        self.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
         if rope_scaling:
-            rope_scaling["rope_type"] = 'deepseek_yarn'
+            rope_scaling["rope_type"] = "deepseek_yarn"
 
-        self.rotary_emb = get_rope(qk_rope_head_dim,
-                                   rotary_dim=qk_rope_head_dim,
-                                   max_position=max_position_embeddings,
-                                   base=rope_theta,
-                                   rope_scaling=rope_scaling,
-                                   is_neox_style=False)
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
 
         if rope_scaling:
             mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
@@ -393,13 +505,15 @@ class DeepseekV2Attention(nn.Module):
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.scaling = self.scaling * mscale * mscale
 
-        self.attn = Attention(self.num_local_heads,
-                              self.qk_head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_local_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_local_heads,
+            self.qk_head_dim,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -409,47 +523,43 @@ class DeepseekV2Attention(nn.Module):
         if self.q_lora_rank is not None:
             q = self.q_a_proj(hidden_states)[0]
             q = self.q_a_layernorm(q)
-            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads,
-                                         self.qk_head_dim)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
         else:
-            q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads,
-                                                   self.qk_head_dim)
-        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
-                               dim=-1)
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
         latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
-        kv_a, _ = latent_cache.split(
-            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
         latent_cache = latent_cache.unsqueeze(1)
         kv_a = self.kv_a_layernorm(kv_a)
         kv = self.kv_b_proj(kv_a)[0]
-        kv = kv.view(-1, self.num_local_heads,
-                     self.qk_nope_head_dim + self.v_head_dim)
+        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-        k_pe = latent_cache[:, :, self.kv_lora_rank:]
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
 
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
 
-        q[..., self.qk_nope_head_dim:] = q_pe
+        q[..., self.qk_nope_head_dim :] = q_pe
         k = torch.empty_like(q)
-        k[..., :self.qk_nope_head_dim] = k_nope
-        k[..., self.qk_nope_head_dim:] = k_pe
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
         # padding value to qk_head_dim for alignment
         v = torch.nn.functional.pad(
-            v, [0, self.qk_head_dim - self.v_head_dim],
-            value=0).view(-1, self.num_local_heads * self.qk_head_dim)
+            v, [0, self.qk_head_dim - self.v_head_dim], value=0
+        ).view(-1, self.num_local_heads * self.qk_head_dim)
         attn_output = self.attn(q, k, v)
-        attn_output = attn_output.view(
-            -1, self.num_local_heads,
-            self.qk_head_dim)[..., :self.v_head_dim].reshape(
-                -1, self.num_local_heads * self.v_head_dim)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.qk_head_dim)[
+            ..., : self.v_head_dim
+        ].reshape(-1, self.num_local_heads * self.v_head_dim)
         output, _ = self.o_proj(attn_output)
         return output
 
 
 class DeepseekV32IndexerCache(torch.nn.Module, AttentionLayerBase):
-
-    def __init__(self, head_dim: int, dtype: torch.dtype, prefix: str,
-                 cache_config: CacheConfig):
+    def __init__(
+        self, head_dim: int, dtype: torch.dtype, prefix: str, cache_config: CacheConfig
+    ):
         super().__init__()
         self.kv_cache = [torch.tensor([])]
         self.head_dim = head_dim
@@ -461,7 +571,7 @@ class DeepseekV32IndexerCache(torch.nn.Module, AttentionLayerBase):
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
 
-    def get_kv_cache_spec(self) -> KVCacheSpec:
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
         return MLAAttentionSpec(  # Only has one vector instead of K + V
             block_size=self.cache_config.block_size,
             num_kv_heads=1,
@@ -469,70 +579,12 @@ class DeepseekV32IndexerCache(torch.nn.Module, AttentionLayerBase):
             dtype=self.dtype,
         )
 
-    def forward(self):
-        ...
+    def forward(self): ...
 
     def get_attn_backend(self) -> AttentionBackend:
         return DeepseekV32IndexerBackend
 
 
-@torch.inference_mode()
-def cp_gather_indexer_k_quant_cache(
-    kv_cache,  # [num_blocks, block_size, head_dim + 1]
-    dst_value,  # [cu_seq_lens[-1], head_dim]
-    dst_scale,  # [cu_seq_lens[-1], 4]
-    block_table,  # [batch_size, num_blocks]
-    cu_seq_lens,  # [batch_size + 1, ]
-    batch_size,
-):
-    num_blocks, block_size, _ = kv_cache.shape
-    head_dim = dst_value.shape[-1]
-    kv_cache = kv_cache.view(num_blocks, -1)
-
-    expected_value = []
-    expected_scale = []
-    for b in range(batch_size):
-        s = cu_seq_lens[b + 1] - cu_seq_lens[b]
-        if s == 0:
-            continue
-        tot = cdiv(s, block_size)
-        blocks = block_table[b, :tot]
-
-        value = []
-        scale = []
-        full_block = torch.arange(tot - 1,
-                                  device=kv_cache.device,
-                                  dtype=torch.int32)
-        non_remaining_value = kv_cache[blocks[full_block], :block_size *
-                                       head_dim].view(-1, head_dim)
-        non_remaining_scale = kv_cache[blocks[full_block],
-                                       block_size * head_dim:].view(-1, 4)
-
-        remaining = s - (tot - 1) * block_size
-
-        value = torch.cat([
-            non_remaining_value,
-            kv_cache[blocks[-1], :remaining * head_dim].view(-1, head_dim)
-        ],
-                          dim=0)
-        scale = torch.cat([
-            non_remaining_scale,
-            kv_cache[blocks[-1], block_size * head_dim:block_size * head_dim +
-                     remaining * 4].view(-1, 4)
-        ],
-                          dim=0)
-
-        expected_value.append(value)
-        expected_scale.append(scale)
-
-    gather_value = torch.cat(expected_value, dim=0).view(-1, head_dim)
-    gather_scale = torch.cat(expected_scale, dim=0).view(-1, 4)
-    gather_value = gather_value.view(torch.float8_e4m3fn)
-    gather_scale = gather_scale.view(torch.float32)
-    dst_value.copy_(gather_value)
-    dst_scale.copy_(gather_scale)
-
-
 def sparse_attn_indexer(
     hidden_states: torch.Tensor,
     k_cache_prefix: str,
@@ -541,14 +593,13 @@ def sparse_attn_indexer(
     k: torch.Tensor,
     weights: torch.Tensor,
     quant_block_size: int,
-    scale_fmt: Optional[str],
+    scale_fmt: str | None,
     topk_tokens: int,
     head_dim: int,
     max_model_len: int,
     total_seq_lens: int,
-    topk_indices_buffer: Optional[torch.Tensor],
+    topk_indices_buffer: torch.Tensor | None,
 ) -> torch.Tensor:
-
     # careful! this will be None in dummy run
     attn_metadata = get_forward_context().attn_metadata
     # assert isinstance(attn_metadata, dict)
@@ -583,46 +634,48 @@ def sparse_attn_indexer(
         scale_fmt,
     )
 
-    topk_indices_buffer[:hidden_states.shape[0]] = -1
+    topk_indices_buffer[: hidden_states.shape[0]] = -1
     if has_prefill:
         prefill_metadata = attn_metadata.prefill
         for chunk in prefill_metadata.chunks:
-            k_fp8 = torch.empty([chunk.total_seq_lens, head_dim],
-                                device=k.device,
-                                dtype=torch.float8_e4m3fn)
-            k_scale = torch.empty([chunk.total_seq_lens, 1],
-                                  device=k.device,
-                                  dtype=torch.float32)
-            cp_gather_indexer_k_quant_cache(
+            k_fp8 = torch.empty(
+                [chunk.total_seq_lens, head_dim],
+                device=k.device,
+                dtype=torch.float8_e4m3fn,
+            )
+            k_scale = torch.empty(
+                [chunk.total_seq_lens, 4],
+                device=k.device,
+                dtype=torch.uint8,
+            )
+            ops.cp_gather_indexer_k_quant_cache(
                 kv_cache,
                 k_fp8,
                 k_scale,
                 chunk.block_table,
                 chunk.cu_seq_lens,
-                chunk.num_reqs,
             )
             logits = fp8_mqa_logits(
-                q_fp8[chunk.token_start:chunk.token_end],
-                (k_fp8, k_scale),
-                weights[chunk.token_start:chunk.token_end],
+                q_fp8[chunk.token_start : chunk.token_end],
+                (k_fp8, k_scale.view(torch.float32)),
+                weights[chunk.token_start : chunk.token_end],
                 chunk.cu_seqlen_ks,
                 chunk.cu_seqlen_ke,
             )
-            topk_indices = logits.topk(min(topk_tokens, logits.shape[-1]),
-                                       dim=-1)[1]
-            topk_indices -= chunk.cu_seqlen_ks[:, None]
-            mask_lo = topk_indices >= 0
-            mask_hi = topk_indices - (chunk.cu_seqlen_ke -
-                                      chunk.cu_seqlen_ks)[:, None] < 0
-            mask = torch.full_like(topk_indices,
-                                   False,
-                                   dtype=torch.bool,
-                                   device=topk_indices.device)
-            mask = mask_lo & mask_hi
-            topk_indices = topk_indices.masked_fill(~mask, -1)
-            topk_indices_buffer[
-                chunk.token_start:chunk.token_end, :topk_indices.
-                shape[-1]] = topk_indices.to(dtype=torch.int32)
+            num_rows = logits.shape[0]
+            assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
+            topk_indices = topk_indices_buffer[
+                chunk.token_start : chunk.token_end, :topk_tokens
+            ]
+            torch.ops._C.top_k_per_row(
+                logits,
+                chunk.cu_seqlen_ks,
+                chunk.cu_seqlen_ke,
+                topk_indices,
+                num_rows,
+                logits.stride(0),
+                logits.stride(1),
+            )
 
     if has_decode:
         decode_metadata = attn_metadata.decode
@@ -636,10 +689,12 @@ def sparse_attn_indexer(
             # prefill and decode by decode_threshold
             # (currently set to 1 + speculative tokens)
             padded_q_fp8_decode_tokens = pack_seq_triton(
-                q_fp8[:num_decode_tokens], decode_lens)
+                q_fp8[:num_decode_tokens], decode_lens
+            )
         else:
             padded_q_fp8_decode_tokens = q_fp8[:num_decode_tokens].reshape(
-                decode_lens.shape[0], -1, *q_fp8.shape[1:])
+                decode_lens.shape[0], -1, *q_fp8.shape[1:]
+            )
         # TODO: move and optimize below logic with triton kernels
         batch_size = padded_q_fp8_decode_tokens.shape[0]
         next_n = padded_q_fp8_decode_tokens.shape[1]
@@ -654,37 +709,29 @@ def sparse_attn_indexer(
             decode_metadata.schedule_metadata,
             max_model_len=max_model_len,
         )
-        # padded query len
-        current_device = padded_q_fp8_decode_tokens.device
-        padded_num_tokens = batch_size * next_n
-        positions = torch.arange(max_model_len,
-                                 device=current_device).unsqueeze(0).expand(
-                                     batch_size * next_n, -1)
-        row_indices = torch.arange(padded_num_tokens,
-                                   device=current_device) // next_n
-        next_n_offset = torch.arange(
-            padded_num_tokens,
-            device=padded_q_fp8_decode_tokens.device) % next_n
-        index_end_pos = (decode_metadata.seq_lens[row_indices] - next_n +
-                         next_n_offset).unsqueeze(1)
-        # index_end_pos: [B * N, 1]
-        mask = positions <= index_end_pos
-        # mask: [B * N, L]
-        logits = logits.masked_fill(~mask, float('-inf'))
-        topk_indices = logits.topk(topk_tokens,
-                                   dim=-1)[1].to(torch.int32)  # [B * N, K]
-        # ensure we don't set indices for the top k
-        # that is out of range(masked already)
-        # this will happen if context length is shorter than K
-        topk_indices[topk_indices > index_end_pos] = -1
+        num_rows = logits.shape[0]
+        assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
+        topk_indices = topk_indices_buffer[:num_decode_tokens, :topk_tokens]
+
+        torch.ops._C.top_k_per_row_decode(
+            logits,
+            next_n,
+            decode_metadata.seq_lens,
+            topk_indices,
+            num_rows,
+            logits.stride(0),
+            logits.stride(1),
+        )
         if decode_metadata.requires_padding:
             # if padded, we need to unpack
             # the topk indices removing padded tokens
             topk_indices = unpack_seq_triton(
                 topk_indices.reshape(batch_size, -1, topk_indices.shape[-1]),
-                decode_lens)
-        topk_indices_buffer[:num_decode_tokens, :topk_indices.
-                            shape[-1]] = topk_indices.to(dtype=torch.int32)
+                decode_lens,
+            )
+            topk_indices_buffer[:num_decode_tokens, : topk_indices.shape[-1]] = (
+                topk_indices
+            )
 
     return topk_indices_buffer
 
@@ -697,21 +744,20 @@ def sparse_attn_indexer_fake(
     k: torch.Tensor,
     weights: torch.Tensor,
     quant_block_size: int,
-    scale_fmt: Optional[str],
+    scale_fmt: str | None,
     topk_tokens: int,
     head_dim: int,
     max_model_len: int,
     total_seq_lens: int,
-    topk_indices_buffer: Optional[torch.Tensor],
+    topk_indices_buffer: torch.Tensor | None,
 ) -> torch.Tensor:
     # profile run
     # NOTE(Chen): create the max possible flattened_kv. So that
     # profile_run can get correct memory usage.
-    _flattened_kv = torch.empty([total_seq_lens, head_dim + 4],
-                                device=k.device,
-                                dtype=torch.uint8)
-    _k_fp8 = _flattened_kv[..., :head_dim].view(
-        torch.float8_e4m3fn).contiguous()
+    _flattened_kv = torch.empty(
+        [total_seq_lens, head_dim + 4], device=k.device, dtype=torch.uint8
+    )
+    _k_fp8 = _flattened_kv[..., :head_dim].view(torch.float8_e4m3fn).contiguous()
     _k_scale = _flattened_kv[..., head_dim:].view(torch.float32).contiguous()
     return topk_indices_buffer
 
@@ -726,16 +772,17 @@ direct_register_custom_op(
 
 
 class Indexer(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 config: Union[DeepseekV2Config, DeepseekV3Config],
-                 hidden_size: int,
-                 q_lora_rank: int,
-                 quant_config: Optional[QuantizationConfig],
-                 cache_config: Optional[CacheConfig],
-                 topk_indices_buffer: Optional[torch.Tensor],
-                 prefix: str = ""):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        config: DeepseekV2Config | DeepseekV3Config,
+        hidden_size: int,
+        q_lora_rank: int,
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
+        topk_indices_buffer: torch.Tensor | None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.vllm_config = vllm_config
         self.config = config
@@ -746,21 +793,24 @@ class Indexer(nn.Module):
         self.rope_dim = config.qk_rope_head_dim  # 64
         self.q_lora_rank = q_lora_rank  # 1536
         # no tensor parallel, just replicated
-        self.wq_b = ReplicatedLinear(self.q_lora_rank,
-                                     self.head_dim * self.n_head,
-                                     bias=False,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.wq_b")
-        self.wk = ReplicatedLinear(hidden_size,
-                                   self.head_dim,
-                                   bias=False,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.wk")
+        self.wq_b = ReplicatedLinear(
+            self.q_lora_rank,
+            self.head_dim * self.n_head,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.wq_b",
+        )
+        self.wk = ReplicatedLinear(
+            hidden_size,
+            self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.wk",
+        )
         self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
-        self.weights_proj = ReplicatedLinear(hidden_size,
-                                             self.n_head,
-                                             quant_config=None,
-                                             prefix=f"{prefix}.weights_proj")
+        self.weights_proj = ReplicatedLinear(
+            hidden_size, self.n_head, quant_config=None, prefix=f"{prefix}.weights_proj"
+        )
         self.softmax_scale = self.head_dim**-0.5
 
         self.scale_fmt = "ue8m0"
@@ -771,28 +821,31 @@ class Indexer(nn.Module):
         #       where we store value in fp8 and scale in fp32
         #       per self.quant_block_size element
         self.k_cache = DeepseekV32IndexerCache(
-            head_dim=self.head_dim +
-            self.head_dim // self.quant_block_size * 4,
+            head_dim=self.head_dim + self.head_dim // self.quant_block_size * 4,
             dtype=torch.uint8,
             prefix=f"{prefix}.k_cache",
-            cache_config=cache_config)
+            cache_config=cache_config,
+        )
         self.max_model_len = vllm_config.model_config.max_model_len
         self.prefix = prefix
-        from vllm.v1.attention.backends.mla.indexer import (
-            get_max_prefill_buffer_size)
+        from vllm.v1.attention.backends.mla.indexer import get_max_prefill_buffer_size
+
         self.max_total_seq_len = get_max_prefill_buffer_size(vllm_config)
 
-    def forward(self, hidden_states: torch.Tensor, qr: torch.Tensor, positions,
-                rotary_emb) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, qr: torch.Tensor, positions, rotary_emb
+    ) -> torch.Tensor:
         q, _ = self.wq_b(qr)
         q = q.view(-1, self.n_head, self.head_dim)
         q_pe, q_nope = torch.split(
-            q, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
+            q, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1
+        )
 
         k, _ = self.wk(hidden_states)
         k = self.k_norm(k)
         k_pe, k_nope = torch.split(
-            k, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
+            k, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1
+        )
 
         q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
         q = torch.cat([q_pe, q_nope], dim=-1)
@@ -800,17 +853,19 @@ class Indexer(nn.Module):
 
         # we only quant q here since k quant is fused with cache insertion
         q = q.view(-1, self.head_dim)
-        q_fp8, q_scale = per_token_group_quant_fp8(q,
-                                                   self.quant_block_size,
-                                                   column_major_scales=False,
-                                                   use_ue8m0=self.scale_fmt
-                                                   is not None)
+        q_fp8, q_scale = per_token_group_quant_fp8(
+            q,
+            self.quant_block_size,
+            column_major_scales=False,
+            use_ue8m0=self.scale_fmt is not None,
+        )
         q_fp8 = q_fp8.view(-1, self.n_head, self.head_dim)
         q_scale = q_scale.view(-1, self.n_head, 1)
 
         weights, _ = self.weights_proj(hidden_states)
-        weights = weights.unsqueeze(
-            -1) * q_scale * self.softmax_scale * self.n_head**-0.5
+        weights = (
+            weights.unsqueeze(-1) * q_scale * self.softmax_scale * self.n_head**-0.5
+        )
         weights = weights.squeeze(-1)
 
         return torch.ops.vllm.sparse_attn_indexer(
@@ -834,7 +889,7 @@ class DeepseekV2MLAAttention(nn.Module):
     """
     Main reference: DeepseekV2 paper, and FlashInfer Implementation
     (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
-    
+
         For more info see MLACommonImpl in:
         vllm/v1/attention/backends/mla/utils.py
     """
@@ -842,21 +897,21 @@ class DeepseekV2MLAAttention(nn.Module):
     def __init__(
         self,
         vllm_config: VllmConfig,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         hidden_size: int,
         num_heads: int,
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
         v_head_dim: int,
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
-        topk_indices_buffer: Optional[torch.Tensor] = None,
+        topk_indices_buffer: torch.Tensor | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -884,66 +939,79 @@ class DeepseekV2MLAAttention(nn.Module):
                 bias=False,
                 quant_config=quant_config,
                 prefix=f"{prefix}.fused_qkv_a_proj",
-                disable_tp=True)
+                disable_tp=True,
+            )
         else:
             self.kv_a_proj_with_mqa = ReplicatedLinear(
                 self.hidden_size,
                 self.kv_lora_rank + self.qk_rope_head_dim,
                 bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.kv_a_proj_with_mqa")
+                prefix=f"{prefix}.kv_a_proj_with_mqa",
+            )
 
         if self.q_lora_rank is not None:
-            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
-                                         eps=config.rms_norm_eps)
-            self.q_b_proj = ColumnParallelLinear(self.q_lora_rank,
-                                                 self.num_heads *
-                                                 self.qk_head_dim,
-                                                 bias=False,
-                                                 quant_config=quant_config,
-                                                 prefix=f"{prefix}.q_b_proj")
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                self.q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
         else:
-            self.q_proj = ColumnParallelLinear(self.hidden_size,
-                                               self.num_heads *
-                                               self.qk_head_dim,
-                                               bias=False,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.q_proj")
-        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
-                                      eps=config.rms_norm_eps)
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
             self.kv_lora_rank,
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.kv_b_proj")
-        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
-                                        self.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+            prefix=f"{prefix}.kv_b_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         if rope_scaling:
-            rope_scaling["rope_type"] = 'deepseek_yarn'
-        self.rotary_emb = get_rope(qk_rope_head_dim,
-                                   rotary_dim=qk_rope_head_dim,
-                                   max_position=max_position_embeddings,
-                                   base=rope_theta,
-                                   rope_scaling=rope_scaling,
-                                   is_neox_style=False)
+            rope_scaling["rope_type"] = "deepseek_yarn"
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
         if rope_scaling:
             mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
             scaling_factor = rope_scaling["factor"]
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.scaling = self.scaling * mscale * mscale
 
-        # self.is_v32 = hasattr(config, "index_topk")
-        self.is_v32 = False
+        self.is_v32 = hasattr(config, "index_topk")
 
         if self.is_v32:
-            self.indexer = Indexer(vllm_config, config, hidden_size,
-                                   q_lora_rank, quant_config, cache_config,
-                                   topk_indices_buffer, f"{prefix}.indexer")
+            self.indexer = Indexer(
+                vllm_config,
+                config,
+                hidden_size,
+                q_lora_rank,
+                quant_config,
+                cache_config,
+                topk_indices_buffer,
+                f"{prefix}.indexer",
+            )
         else:
             self.indexer = None
 
@@ -953,11 +1021,12 @@ class DeepseekV2MLAAttention(nn.Module):
             rotary_emb=self.rotary_emb,
             o_proj=self.o_proj,
             fused_qkv_a_proj=self.fused_qkv_a_proj
-            if self.q_lora_rank is not None else None,
+            if self.q_lora_rank is not None
+            else None,
             kv_a_proj_with_mqa=self.kv_a_proj_with_mqa
-            if self.q_lora_rank is None else None,
-            q_a_layernorm=self.q_a_layernorm
-            if self.q_lora_rank is not None else None,
+            if self.q_lora_rank is None
+            else None,
+            q_a_layernorm=self.q_a_layernorm if self.q_lora_rank is not None else None,
             q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
             q_proj=self.q_proj if self.q_lora_rank is None else None,
             indexer=self.indexer,
@@ -965,7 +1034,7 @@ class DeepseekV2MLAAttention(nn.Module):
             topk_indices_buffer=topk_indices_buffer,
         )
 
-        self.mla_attn = MultiHeadLatentAttention(
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
             self.hidden_size,
             self.num_local_heads,
             self.scaling,
@@ -989,14 +1058,17 @@ class DeepseekV2MLAAttention(nn.Module):
 
 
 class DeepseekV2DecoderLayer(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 prefix: str,
-                 topk_indices_buffer: Optional[torch.Tensor] = None) -> None:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str,
+        config: DeepseekV2Config | None = None,
+        topk_indices_buffer: torch.Tensor | None = None,
+    ) -> None:
         super().__init__()
 
-        config = vllm_config.model_config.hf_config
+        if config is None:
+            config = vllm_config.model_config.hf_config
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
@@ -1005,13 +1077,25 @@ class DeepseekV2DecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        moe_layer_freq = getattr(config, "moe_layer_freq", 1)
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
-        layer_idx = int(prefix.split(sep='.')[-1])
+        layer_idx = int(prefix.split(sep=".")[-1])
         self.layer_idx = layer_idx
-        if model_config.use_mla:
+
+        # verify MLA attention specific fields
+        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
+        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
+        v_head_dim = getattr(config, "v_head_dim", 0)
+        kv_lora_rank = getattr(config, "kv_lora_rank", 0)
+        use_mha = config.model_type == "deepseek" or all(
+            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
+        )
+
+        if use_mha:
+            attn_cls = DeepseekAttention
+        elif model_config.use_mla:
             attn_cls = DeepseekV2MLAAttention
         else:
             attn_cls = DeepseekV2Attention
@@ -1020,12 +1104,11 @@ class DeepseekV2DecoderLayer(nn.Module):
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            qk_nope_head_dim=config.qk_nope_head_dim,
-            qk_rope_head_dim=config.qk_rope_head_dim,
-            v_head_dim=config.v_head_dim,
-            q_lora_rank=config.q_lora_rank
-            if hasattr(config, "q_lora_rank") else None,
-            kv_lora_rank=config.kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
+            kv_lora_rank=kv_lora_rank,
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -1035,9 +1118,11 @@ class DeepseekV2DecoderLayer(nn.Module):
             topk_indices_buffer=topk_indices_buffer,
         )
 
-        if (config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace
-                and layer_idx % config.moe_layer_freq == 0):
+        if (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % moe_layer_freq == 0
+        ):
             self.mlp = DeepseekV2MoE(
                 config=config,
                 parallel_config=parallel_config,
@@ -1052,60 +1137,59 @@ class DeepseekV2DecoderLayer(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.mlp",
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-        self.routed_scaling_factor = config.routed_scaling_factor
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
             residual = hidden_states.clone()
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
-        if hidden_states.dtype == torch.float16:
+        if (
+            not isinstance(self.self_attn, DeepseekAttention)
+            and hidden_states.dtype == torch.float16
+        ):
             # Fix FP16 overflow
             # We scale both hidden_states and residual before
             # rmsnorm, and rmsnorm result would not affect by scale.
-            hidden_states *= 1. / self.routed_scaling_factor
+            hidden_states *= 1.0 / self.routed_scaling_factor
             if self.layer_idx == 0:
                 # The residual is shared by all layers, we only scale it on
                 # first layer.
-                residual *= 1. / self.routed_scaling_factor
+                residual *= 1.0 / self.routed_scaling_factor
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
 
-        if isinstance(self.mlp,
-                      DeepseekV2MLP) and hidden_states.dtype == torch.float16:
+        if isinstance(self.mlp, DeepseekV2MLP) and hidden_states.dtype == torch.float16:
             # Fix FP16 overflow
             # Scaling the DeepseekV2MLP output, it is the input of
             # input_layernorm of next decoder layer.
             # The scaling of DeepseekV2MOE output would be done in the forward
             # of DeepseekV2MOE
-            hidden_states *= 1. / self.routed_scaling_factor
+            hidden_states *= 1.0 / self.routed_scaling_factor
 
         return hidden_states, residual
 
 
 @support_torch_compile
 class DeepseekV2Model(nn.Module):
-
     fall_back_to_pt_during_load = False
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -1114,17 +1198,18 @@ class DeepseekV2Model(nn.Module):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
+        self.device = current_platform.device_type
 
         self.vocab_size = config.vocab_size
-        # self.is_v32 = hasattr(config, "index_topk")
-        self.is_v32 = False
+        self.is_v32 = hasattr(config, "index_topk")
         if self.is_v32:
             topk_tokens = config.index_topk
             topk_indices_buffer = torch.empty(
                 vllm_config.scheduler_config.max_num_batched_tokens,
                 topk_tokens,
                 dtype=torch.int32,
-                device="cuda")
+                device=self.device,
+            )
         else:
             topk_indices_buffer = None
 
@@ -1133,39 +1218,41 @@ class DeepseekV2Model(nn.Module):
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.embed_tokens")
+                prefix=f"{prefix}.embed_tokens",
+            )
         else:
             self.embed_tokens = PPMissingLayer()
-
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: DeepseekV2DecoderLayer(vllm_config, prefix,
-                                                  topk_indices_buffer),
-            prefix=f"{prefix}.layers")
+            lambda prefix: DeepseekV2DecoderLayer(
+                vllm_config, prefix, topk_indices_buffer=topk_indices_buffer
+            ),
+            prefix=f"{prefix}.layers",
+        )
 
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -1176,17 +1263,58 @@ class DeepseekV2Model(nn.Module):
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
 
-class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
-                            SupportsLoRA):
+class DeepseekV2MixtureOfExperts(MixtureOfExperts):
+    moe_mlp_layers: list[DeepseekV2MoE]
+    """
+    List of MoE MLP layers in the model.
+    """
+
+    def extract_moe_parameters(self, example_moe: DeepseekV2MoE | None):
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class DeepseekV2ForCausalLM(
+    nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA, SupportsEagle
+):
     packed_modules_mapping = {
         "gate_up_proj": ["gate_proj", "up_proj"],
     }
@@ -1198,20 +1326,31 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
         self.config = config
         self.quant_config = quant_config
 
+        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
+        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
+        self.use_mha = config.model_type == "deepseek" or all(
+            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
+        )
+
+        if self.use_mha:
+            self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
+
         # `packed_modules_mapping` needs to be modified before
         # initializing DeepseekV2Model, as it is passed inplace to
         # quantization config init and may be used to select the
         # quant_method for relevant layers during initialization.
-        self.fuse_qkv_a_proj = hasattr(
-            config, "q_lora_rank") and config.q_lora_rank is not None
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
         if self.fuse_qkv_a_proj:
             self.packed_modules_mapping["fused_qkv_a_proj"] = [
                 "q_a_proj",
                 "kv_a_proj_with_mqa",
             ]
 
-        self.model = DeepseekV2Model(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "model"))
+        self.model = DeepseekV2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(
                 config.vocab_size,
@@ -1223,15 +1362,25 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+        # Set MoE hyperparameters
+        self.num_moe_layers = (
+            self.config.num_hidden_layers - self.config.first_k_dense_replace
+        )
+        self.set_moe_parameters()
+        self.quant_method = None
+        if quant_config is not None:
+            self.quant_method=quant_config.get_name()
+        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
+
+    def set_moe_parameters(self):
         self.expert_weights = []
 
-        # Set MoE hyperparameters
-        self.num_moe_layers = (config.num_hidden_layers -
-                               config.first_k_dense_replace)
-        self.num_expert_groups = config.n_group
+        self.num_expert_groups = getattr(self.config, "n_group", 1)
 
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
+        self.moe_mlp_layers = []
         example_moe = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -1241,96 +1390,81 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
             if isinstance(layer.mlp, DeepseekV2MoE):
                 # Pick last one layer since the first ones may be dense layers.
                 example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
                 self.moe_layers.append(layer.mlp.experts)
 
-        if example_moe is None:
-            raise RuntimeError("No DeepseekV2MoE layer found in model.layers.")
-
-        self.num_logical_experts = example_moe.n_logical_experts
-        self.num_physical_experts = example_moe.n_physical_experts
-        self.num_local_physical_experts = example_moe.n_local_physical_experts
-        self.num_routed_experts = example_moe.n_routed_experts
-        self.num_shared_experts = example_moe.n_shared_experts
-        self.num_redundant_experts = example_moe.n_redundant_experts
-        
-        self.quant_method = None
-        if quant_config is not None:
-            self.quant_method=quant_config.get_name()
-        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
+        self.extract_moe_parameters(example_moe)
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
-    def update_physical_experts_metadata(
-        self,
-        num_physical_experts: int,
-        num_local_physical_experts: int,
-    ) -> None:
-        assert self.num_local_physical_experts == num_local_physical_experts
-        self.num_physical_experts = num_physical_experts
-        self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = (num_physical_experts -
-                                      self.num_logical_experts)
-        for layer in self.model.layers:
-            if isinstance(layer.mlp, DeepseekV2MoE):
-                moe = layer.mlp
-                moe.n_local_physical_experts = num_local_physical_experts
-                moe.n_physical_experts = num_physical_experts
-                moe.n_redundant_experts = self.num_redundant_experts
-                moe.experts.update_expert_map()
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return SharedFusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        rocm_aiter_moe_shared_expert_enabled = (
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        )
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
+        ]
+        mla_params_mapping = [
             ("fused_qkv_a_proj", "q_a_proj", 0),
             ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
         ]
+        mha_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        if self.use_mha:
+            stacked_params_mapping.extend(mha_params_mapping)
+        else:
+            stacked_params_mapping.extend(mla_params_mapping)
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts,
-            num_redundant_experts=self.num_redundant_experts)
+            num_experts=self.config.n_routed_experts
+            + (
+                self.config.n_shared_experts
+                if rocm_aiter_moe_shared_expert_enabled
+                else 0
+            ),
+            num_redundant_experts=self.num_redundant_experts,
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -1342,7 +1476,11 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
             if spec_layer is not None:
                 continue  # skip spec decode layers for main model
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            is_fuse_shared_experts_layer = rocm_aiter_moe_shared_expert_enabled and (
+                "mlp.shared_experts" in name
+            )
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -1352,15 +1490,18 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                if is_fuse_shared_experts_layer:
                     continue
                 name_mapped = name.replace(weight_name, param_name)
 
                 # QKV fusion is optional, fall back to normal
                 # weight loading if it's not enabled
                 # if go with fusion option, then update name
-                if ((param_name == "fused_qkv_a_proj")
-                        and name_mapped not in params_dict):
+                if (
+                    param_name == "fused_qkv_a_proj"
+                ) and name_mapped not in params_dict:
                     continue
                 else:
                     name = name_mapped
@@ -1377,66 +1518,115 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
                 break
             else:
                 is_expert_weight = False
-                for mapping in expert_params_mapping:
-                    param_name, weight_name, expert_id, shard_id = mapping
-                    if weight_name not in name:
-                        continue
-
-                    # Anyway, this is an expert weight and should not be
-                    # attempted to load as other weights later
-                    is_expert_weight = True
-
-                    # Do not modify `name` since the loop may continue here
-                    # Instead, create a new variable
-                    name_mapped = name.replace(weight_name, param_name)
-
-                    if is_pp_missing_parameter(name_mapped, self):
-                        continue
-
-                    param = params_dict[name_mapped]
-                    # We should ask the weight loader to return success or not
-                    # here since otherwise we may skip experts with other
-                    # available replicas.
-                    weight_loader = typing.cast(Callable[..., bool],
-                                                param.weight_loader)
-                    success = weight_loader(param,
-                                            loaded_weight,
-                                            name_mapped,
-                                            shard_id=shard_id,
-                                            expert_id=expert_id,
-                                            return_success=True)
-                    if success:
-                        name = name_mapped
-                        break
-                else:
-                    if is_expert_weight:
-                        # We've checked that this is an expert weight
-                        # However it's not mapped locally to this rank
-                        # So we simply skip it
-                        continue
-
-                    # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
-                        continue
+                # Special handling: when AITER fusion_shared_experts is enabled,
+                # checkpoints may provide a single widened shared_experts tensor
+                # without explicit expert indices
+                # (e.g. ...mlp.shared_experts.gate_proj.weight).
+                # For models with multiple shared experts, split that tensor
+                # evenly into per-shared-expert slices and load them into
+                # appended expert slots mlp.experts.{n_routed_experts + j}.*
+                # accordingly.
+                num_chunks = 1
+                if is_fuse_shared_experts_layer:
+                    num_chunks = getattr(self.config, "n_shared_experts", 1) or 1
+                    # Determine split axis based on op type
+                    # gate/up: ColumnParallel → split along dim 0
+                    # down: RowParallel → split along dim 1
+                    split_dim = 1 if "down_proj.weight" in name else 0
+                    total = loaded_weight.shape[split_dim]
+                    assert total % num_chunks == 0, (
+                        f"Shared expert weight dim {total} "
+                        f"not divisible by num_chunks {num_chunks}"
+                    )
+                    chunk_size = total // num_chunks
+
+                for j in range(num_chunks):
+                    chunk_name = name
+                    weight_to_load = loaded_weight
+
+                    if is_fuse_shared_experts_layer:
+                        if split_dim == 0:
+                            weight_to_load = loaded_weight[
+                                j * chunk_size : (j + 1) * chunk_size, :
+                            ]
+                        else:
+                            weight_to_load = loaded_weight[
+                                :, j * chunk_size : (j + 1) * chunk_size
+                            ]
+                        # Synthesize an expert-style name so expert mapping
+                        # can route it
+                        chunk_name = name.replace(
+                            "mlp.shared_experts",
+                            f"mlp.experts.{self.config.n_routed_experts + j}",
+                        )
+
+                    # Use expert_params_mapping to locate the destination
+                    # param and delegate to its expert-aware weight_loader
+                    # with expert_id.
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if weight_name not in chunk_name:
+                            continue
+
+                        # Anyway, this is an expert weight and should not be
+                        # attempted to load as other weights later
+                        is_expert_weight = True
+
+                        # Do not modify `name` since the loop may continue here
+                        # Instead, create a new variable
+                        name_mapped = chunk_name.replace(weight_name, param_name)
+
+                        if is_pp_missing_parameter(name_mapped, self):
+                            continue
+
+                        param = params_dict[name_mapped]
+                        # We should ask the weight loader to return success or
+                        # not here since otherwise we may skip experts with
+                        # other available replicas.
+                        weight_loader = typing.cast(
+                            Callable[..., bool], param.weight_loader
+                        )
+                        success = weight_loader(
+                            param,
+                            weight_to_load,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                            return_success=True,
+                        )
+                        if success:
+                            if not is_fuse_shared_experts_layer:
+                                name = name_mapped
+                            else:
+                                loaded_params.add(name_mapped)
+                            break
+                    else:
+                        if is_expert_weight:
+                            # We've checked that this is an expert weight
+                            # However it's not mapped locally to this rank
+                            # So we simply skip it
+                            continue
+
+                        # Skip loading extra bias for GPTQ models.
+                        if name.endswith(".bias") and name not in params_dict:
+                            continue
+
+                        # Remapping the name of FP8 kv-scale.
+                        name = maybe_remap_kv_scale_name(name, params_dict)
+                        if name is None:
+                            continue
+
+                        if is_pp_missing_parameter(name, self):
+                            continue
 
-                    # Remapping the name of FP8 kv-scale.
-                    name = maybe_remap_kv_scale_name(name, params_dict)
-                    if name is None:
-                        continue
-
-                    if is_pp_missing_parameter(name, self):
-                        continue
-
-                    try:
                         param = params_dict[name]
-                    except Exception as e:
-                        # print("params_dict[name] error in name:", name)
-                        continue
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
-                    weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-            
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+            if not is_fuse_shared_experts_layer:
+                loaded_params.add(name)
+                
         if self.use_llama_nn and self.quant_method is None:
                 lay_key_words = [
                     "self_attn.q_a_proj.weight",
@@ -1466,23 +1656,28 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
                         weight.data.copy_(_weight)
                         
                         weight.data=weight.data.reshape(ori_shape[1],-1)
-                        
         return loaded_params
 
 
+class DeepseekForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
 class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
     pass
 
 
 # Compatibility with
 # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
-def get_spec_layer_idx_from_weight_name(config: Union[DeepseekV2Config,
-                                                      DeepseekV3Config],
-                                        weight_name: str) -> Optional[int]:
-    if (hasattr(config, "num_nextn_predict_layers")
-            and config.num_nextn_predict_layers > 0):
+def get_spec_layer_idx_from_weight_name(
+    config: DeepseekV2Config | DeepseekV3Config, weight_name: str
+) -> int | None:
+    if (
+        hasattr(config, "num_nextn_predict_layers")
+        and config.num_nextn_predict_layers > 0
+    ):
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"model.layers.{layer_idx+i}."):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
                 return layer_idx + i
     return None
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index c8ed759d2e972f434c41dfb7007086f49111eeb7..e7b48e0f4e55416e05839687c0d3afb77cf3a576 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -3,9 +3,10 @@
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
+
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -14,35 +15,50 @@ from einops import rearrange, repeat
 from transformers import BatchFeature
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models.transformers import replace_linear_class
+from vllm.model_executor.models.transformers.utils import replace_linear_class
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, MultiModalUUIDDict,
-                                    NestedTensors)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalProcessingInfo,
-                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
-                                                          MlpProjectorConfig,
-                                                          VisionEncoderConfig)
-from vllm.transformers_utils.processors.deepseek_vl2 import (
-    DeepseekVLV2Processor)
+from vllm.transformers_utils.configs.deepseek_vl2 import (
+    DeepseekVLV2Config,
+    MlpProjectorConfig,
+    VisionEncoderConfig,
+)
+from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 # The image token id may be various
 _IMAGE_TOKEN = "<image>"
@@ -51,15 +67,15 @@ _IMAGE_TOKEN = "<image>"
 class DeepseekVL2ImagePixelInputs(TensorSchema):
     """
     Dimensions:
-        - bn: Batch size * number of images
+        - bnp: Batch size * number of images * number of patches
         - p: Number of patches
         - c: Number of channels (3)
         - h: Height of each image
         - w: Width of each image
     """
+
     type: Literal["pixel_values"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("bn", "p", 3, "h", "w", dynamic_dims={"p"})]
+    data: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w", dynamic_dims={"bnp"})]
     images_spatial_crop: Annotated[torch.Tensor, TensorShape("bn", 2)]
 
 
@@ -70,86 +86,88 @@ class DeepseekVL2VImageEmbeddingInputs(TensorSchema):
         - f: Image feature size
         - h: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("bn", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("bn", "f", "h")]
 
 
-DeepseekVL2ImageInputs = Union[DeepseekVL2ImagePixelInputs,
-                               DeepseekVL2VImageEmbeddingInputs]
+DeepseekVL2ImageInputs: TypeAlias = (
+    DeepseekVL2ImagePixelInputs | DeepseekVL2VImageEmbeddingInputs
+)
 
 
 class MlpProjector(nn.Module):
-
     def __init__(self, cfg: MlpProjectorConfig):
-
         super().__init__()
 
         self.cfg = cfg
-        assert not cfg.token_pooling, (
-            "Token pooling is not supported currently.")
+        self.projector_type = cfg.projector_type
+        assert not cfg.token_pooling, "Token pooling is not supported currently."
 
-        if cfg.projector_type == "downsample_mlp_gelu":
+        if self.projector_type == "downsample_mlp_gelu":
             mlp_depth = cfg.depth
             mlp_ratio = cfg.mlp_ratio
             modules = [
                 nn.Linear(
-                    cfg.input_dim * cfg.downsample_ratio *
-                    cfg.downsample_ratio, cfg.n_embed * mlp_ratio)
+                    cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio,
+                    cfg.n_embed * mlp_ratio,
+                )
             ]
             for _ in range(1, mlp_depth - 1):
                 modules.append(nn.GELU())
                 modules.append(
-                    nn.Linear(cfg.n_embed * mlp_ratio,
-                              cfg.n_embed * mlp_ratio))
+                    nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio)
+                )
             modules.append(nn.GELU())
             modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
             modules = nn.Sequential(*modules)
-
+        elif self.projector_type == "linear":
+            modules = nn.Linear(cfg.input_dim, cfg.n_embed)
         else:
             raise NotImplementedError(
-                f"Unsupported projector type: {cfg.projector_type}")
+                f"Unsupported projector type: {cfg.projector_type}"
+            )
 
         self.layers = modules
 
     def forward(self, x):
         bs, hw, input_dim = x.shape
-        h = w = int((hw)**0.5)
-        """compute padding"""
-        if h % self.cfg.downsample_ratio:
-            pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
-        else:
-            pad = 0
-        x = x.reshape(bs, h, w, input_dim)
-        if pad > 0:
-            x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
-        """4 to 1 concat"""
-        x = x.permute(0, 3, 1, 2)  # B, C, H, W
-        x = F.unfold(x,
-                     kernel_size=self.cfg.downsample_ratio,
-                     stride=self.cfg.downsample_ratio,
-                     padding=0)  # B, C*4, HW // 4
-        x = x.permute(0, 2, 1)
+        if self.projector_type == "downsample_mlp_gelu":
+            h = w = int((hw) ** 0.5)
+            """compute padding"""
+            if h % self.cfg.downsample_ratio:
+                pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
+            else:
+                pad = 0
+            x = x.reshape(bs, h, w, input_dim)
+            if pad > 0:
+                x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
+            """4 to 1 concat"""
+            x = x.permute(0, 3, 1, 2)  # B, C, H, W
+            x = F.unfold(
+                x,
+                kernel_size=self.cfg.downsample_ratio,
+                stride=self.cfg.downsample_ratio,
+                padding=0,
+            )  # B, C*4, HW // 4
+            x = x.permute(0, 2, 1)
 
         return self.layers(x)
 
 
 class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(DeepseekVLV2Config)
 
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
-    def get_num_image_tokens(self,
-                             *,
-                             image_width: int,
-                             image_height: int,
-                             cropping: bool = True) -> int:
+    def get_num_image_tokens(
+        self, *, image_width: int, image_height: int, cropping: bool = True
+    ) -> int:
         hf_processor = self.get_hf_processor()
         image_size = hf_processor.image_size
         patch_size = hf_processor.patch_size
@@ -157,9 +175,12 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
 
         if cropping:
             best_width, best_height = hf_processor.select_best_resolution(
-                (image_width, image_height))
-            num_width_tiles, num_height_tiles = (best_width // image_size,
-                                                 best_height // image_size)
+                (image_width, image_height)
+            )
+            num_width_tiles, num_height_tiles = (
+                best_width // image_size,
+                best_height // image_size,
+            )
         else:
             num_width_tiles = num_height_tiles = 1
 
@@ -172,15 +193,16 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
     def get_image_size_with_most_features(self) -> ImageSize:
         hf_config = self.get_hf_config()
         candidate_resolutions = hf_config.candidate_resolutions
-        height, width = max(candidate_resolutions,
-                            key=lambda x: self.get_num_image_tokens(
-                                image_width=x[1], image_height=x[0]))
+        height, width = max(
+            candidate_resolutions,
+            key=lambda x: self.get_num_image_tokens(
+                image_width=x[1], image_height=x[0]
+            ),
+        )
         return ImageSize(width=width, height=height)
 
 
-class DeepseekVL2DummyInputsBuilder(
-        BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
-
+class DeepseekVL2DummyInputsBuilder(BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -193,22 +215,27 @@ class DeepseekVL2DummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
         max_image_size = self.info.get_image_size_with_most_features()
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=max_image_size.width,
-                                   height=max_image_size.height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=max_image_size.width,
+                height=max_image_size.height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class DeepseekVL2MultiModalProcessor(
-        BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
-
+    BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]
+):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -218,9 +245,7 @@ class DeepseekVL2MultiModalProcessor(
     ) -> BatchFeature:
         if not mm_data:
             tokenizer = self.info.get_tokenizer()
-            return tokenizer(prompt,
-                             add_special_tokens=True,
-                             return_tensors="pt")
+            return tokenizer(prompt, add_special_tokens=True, return_tensors="pt")
 
         processed_outputs = super()._call_hf_processor(
             prompt=prompt,
@@ -229,12 +254,9 @@ class DeepseekVL2MultiModalProcessor(
             tok_kwargs=tok_kwargs,
         )
 
-        pixel_values = processed_outputs["pixel_values"]
-        # split pixel values into patches corresponding to each image
-        images_spatial_crop = processed_outputs["images_spatial_crop"]
-        patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]
-        pixel_values = pixel_values.split(patches_per_image)
-        processed_outputs["pixel_values"] = pixel_values
+        processed_outputs["num_patches"] = (
+            processed_outputs["images_spatial_crop"].prod(-1) + 1
+        )
 
         return processed_outputs
 
@@ -243,8 +265,10 @@ class DeepseekVL2MultiModalProcessor(
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
+        num_patches = hf_inputs.get("num_patches", torch.empty(0))
+
         return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches),
             images_spatial_crop=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
         )
@@ -262,7 +286,8 @@ class DeepseekVL2MultiModalProcessor(
 
         def get_replacement_deepseek_vl2(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -286,11 +311,11 @@ class DeepseekVL2MultiModalProcessor(
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
@@ -317,15 +342,19 @@ class DeepseekVL2MultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     DeepseekVL2MultiModalProcessor,
     info=DeepseekVL2ProcessingInfo,
-    dummy_inputs=DeepseekVL2DummyInputsBuilder)
+    dummy_inputs=DeepseekVL2DummyInputsBuilder,
+)
 class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "language.": "language_model.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "language.": "language_model.",
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -346,11 +375,11 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         model_config = vllm_config.model_config
         tokenizer = cached_tokenizer_from_config(model_config)
-        self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
+        self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
 
-        self.vision = self._init_vision_module(self.vision_config,
-                                               quant_config,
-                                               maybe_prefix(prefix, "vision"))
+        self.vision = self._init_vision_module(
+            self.vision_config, quant_config, maybe_prefix(prefix, "vision")
+        )
 
         self.projector = MlpProjector(self.projector_config)
         self.tile_tag = config.tile_tag
@@ -358,47 +387,42 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         # special token for image token sequence format
         embed_std = 1 / torch.sqrt(
-            torch.tensor(self.projector_config.n_embed, dtype=torch.float32))
+            torch.tensor(self.projector_config.n_embed, dtype=torch.float32)
+        )
         if self.tile_tag == "2D":
             # <|view_seperator|>, <|\n|>
             self.image_newline = nn.Parameter(
-                torch.randn(self.projector_config.n_embed) * embed_std)
+                torch.randn(self.projector_config.n_embed) * embed_std
+            )
             # This is a typo in original implementation
             self.view_seperator = nn.Parameter(
-                torch.randn(self.projector_config.n_embed) * embed_std)
+                torch.randn(self.projector_config.n_embed) * embed_std
+            )
         else:
             raise ValueError(
                 f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
             )
 
-        if self.text_config.topk_method == "noaux_tc":
-            architectures = ["DeepseekV3ForCausalLM"]
-        elif not self.text_config.use_mla:
-            architectures = ["DeepseekForCausalLM"]
-        else:
-            architectures = ["DeepseekV2ForCausalLM"]
-
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.text_config,
             prefix=maybe_prefix(prefix, "language"),
-            architectures=architectures,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _get_parent_and_attr(self, root: torch.nn.Module, dotted_name: str):
         """Return (parent_module, final_attr_name) for a dotted module path."""
-        names = dotted_name.split('.')
+        names = dotted_name.split(".")
         parent = root
         for n in names[:-1]:
             parent = getattr(parent, n)
         return parent, names[-1]
 
-    #patch for timm ViT instance to support tensor parallel
-    def patch_vit_for_tp(self, vit: torch.nn.Module,
-                         quant_config: QuantizationConfig):
+    # patch for timm ViT instance to support tensor parallel
+    def patch_vit_for_tp(self, vit: torch.nn.Module, quant_config: QuantizationConfig):
         try:
             import timm
         except ImportError as e:
@@ -408,17 +432,14 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             if isinstance(module, nn.Linear):
                 parent, attr_name = self._get_parent_and_attr(vit, name)
                 if isinstance(parent, timm.layers.Mlp) and attr_name == "fc1":
-                    new_linear = replace_linear_class(module,
-                                                      "colwise",
-                                                      quant_config,
-                                                      prefix=name)
+                    new_linear = replace_linear_class(
+                        module, "colwise", quant_config, prefix=name
+                    )
                     setattr(parent, attr_name, new_linear)
-                elif isinstance(parent,
-                                timm.layers.Mlp) and attr_name == "fc2":
-                    new_linear = replace_linear_class(module,
-                                                      "rowwise",
-                                                      quant_config,
-                                                      prefix=name)
+                elif isinstance(parent, timm.layers.Mlp) and attr_name == "fc2":
+                    new_linear = replace_linear_class(
+                        module, "rowwise", quant_config, prefix=name
+                    )
                     setattr(parent, attr_name, new_linear)
 
         return vit
@@ -426,7 +447,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def _init_vision_module(
         self,
         vision_config: VisionEncoderConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         # TODO: refactor vision model through timm wrapper from transformers
@@ -451,7 +472,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         return model
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[DeepseekVL2ImageInputs]:
+        self, **kwargs: object
+    ) -> DeepseekVL2ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         images_spatial_crop = kwargs.pop("images_spatial_crop", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -461,37 +483,31 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         if pixel_values is not None:
             expected_h = expected_w = self.vision_config.image_size
-            return DeepseekVL2ImagePixelInputs(type="pixel_values",
-                                               data=flatten_bn(pixel_values),
-                                               images_spatial_crop=flatten_bn(
-                                                   images_spatial_crop,
-                                                   concat=True),
-                                               resolve_bindings={
-                                                   "h": expected_h,
-                                                   "w": expected_w,
-                                               })
+            return DeepseekVL2ImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                images_spatial_crop=images_spatial_crop,
+                resolve_bindings={
+                    "h": expected_h,
+                    "w": expected_w,
+                },
+            )
 
         if image_embeds is not None:
             return DeepseekVL2VImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
     def _pixel_values_to_embedding(
         self,
-        pixel_values: NestedTensors,
+        pixel_values: torch.Tensor,
         images_spatial_crop: torch.Tensor,
-    ) -> NestedTensors:
-        # Pixel_values: n_image * batch_size * [patch_per_img, 3, height, width]
-        total_tiles = [x for x in pixel_values]
-
-        # [batch_all_tiles, 3, height, width]
-        total_tiles = torch.cat(total_tiles, dim=0)
-
+    ) -> list[torch.Tensor]:
         # [batch_all_tiles, vit_seq_len, c]
-        images_feature = self.vision.forward_features(total_tiles)
+        images_feature = self.vision.forward_features(pixel_values)
 
         # [batch_all_tiles, hw, D]
         images_embeds = self.projector(images_feature)
@@ -513,8 +529,9 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             global_features = images_embeds[tile_index]
 
             # [num_height_tiles * num_width_tiles, hw, D]
-            local_features = images_embeds[tile_index + 1:tile_index + 1 +
-                                           num_tiles_in_image]
+            local_features = images_embeds[
+                tile_index + 1 : tile_index + 1 + num_tiles_in_image
+            ]
             tile_index += num_tiles_in_image + 1
 
             # format global and local features
@@ -526,8 +543,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h)
 
             # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D]
-            global_features = torch.cat([global_features, new_lines_in_global],
-                                        dim=1)
+            global_features = torch.cat([global_features, new_lines_in_global], dim=1)
 
             # [h, w + 1, D] -> [h * (w + 1), D]
             global_features = global_features.view(-1, n_dim)
@@ -535,22 +551,22 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             # ----------------- local view add newline -----------------
             # [num_height_tiles * num_width_tiles, h * w, D] ->
             # [num_height_tiles * h, num_width_tiles * w, D]
-            local_features = rearrange(local_features,
-                                       "(th tw) (h w) d -> (th h) (tw w) d",
-                                       th=num_height_tiles,
-                                       tw=num_width_tiles,
-                                       h=h,
-                                       w=w)
+            local_features = rearrange(
+                local_features,
+                "(th tw) (h w) d -> (th h) (tw w) d",
+                th=num_height_tiles,
+                tw=num_width_tiles,
+                h=h,
+                w=w,
+            )
 
             # [D] -> [num_height_tiles * h, 1, D]
-            new_lines_in_local = repeat(self.image_newline,
-                                        "d -> (th h) 1 d",
-                                        th=num_height_tiles,
-                                        h=h)
+            new_lines_in_local = repeat(
+                self.image_newline, "d -> (th h) 1 d", th=num_height_tiles, h=h
+            )
 
             # [num_height_tiles * h, num_width_tiles * w + 1, D]
-            local_features = torch.cat([local_features, new_lines_in_local],
-                                       dim=1)
+            local_features = torch.cat([local_features, new_lines_in_local], dim=1)
 
             # [num_height_tiles * h, num_width_tiles * w + 1, D]
             #   --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D]
@@ -558,23 +574,28 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
             # merge global and local tiles
             if self.global_view_pos == "head":
-                global_local_features = torch.cat([
-                    global_features,
-                    self.view_seperator[None, :],
-                    local_features,
-                ])
+                global_local_features = torch.cat(
+                    [
+                        global_features,
+                        self.view_seperator[None, :],
+                        local_features,
+                    ]
+                )
             else:
-                global_local_features = torch.cat([
-                    local_features,
-                    self.view_seperator[None, :],
-                    global_features,
-                ])
+                global_local_features = torch.cat(
+                    [
+                        local_features,
+                        self.view_seperator[None, :],
+                        global_features,
+                    ]
+                )
 
             vision_embeddings.append(global_local_features)
         return vision_embeddings
 
     def _process_image_input(
-            self, image_input: DeepseekVL2ImageInputs) -> torch.Tensor:
+        self, image_input: DeepseekVL2ImageInputs
+    ) -> list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             image_data = image_input["data"]
             if is_list_of(image_data, torch.Tensor):
@@ -592,67 +613,43 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         images_spatial_crop = image_input["images_spatial_crop"]
 
         return self._pixel_values_to_embedding(
-            pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)
+            pixel_values=pixel_values, images_spatial_crop=images_spatial_crop
+        )
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
+    def forward(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.image_token_id)
-        return inputs_embeds
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object):
-
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ):
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model(input_ids,
-                                            positions,
-                                            intermediate_tensors,
-                                            inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        autoloaded_weights = loader.load_weights(weights,
-                                                 mapper=self.hf_to_vllm_mapper)
+        autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
         return autoloaded_weights
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 2a09234b59ed1119216d8de0e52c14c731346a8f..d24da0c42a254eddb04c7e0dd5abc929441da9a6 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -24,9 +24,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only dots1 model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -35,57 +36,74 @@ from transformers import Dots1Config
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
-from vllm.distributed import (get_pp_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class Dots1MLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -96,11 +114,10 @@ class Dots1MLP(nn.Module):
 
 
 class Dots1MoE(nn.Module):
-
     def __init__(
         self,
         config: Dots1Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -109,21 +126,40 @@ class Dots1MoE(nn.Module):
         self.n_shared_experts = config.n_shared_experts
 
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.n_routed_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
         if config.topk_method == "noaux_tc":
-            self.gate.e_score_correction_bias = (nn.Parameter(
-                torch.empty(config.n_routed_experts)))
+            self.gate.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.n_routed_experts)
+            )
         else:
             self.gate.e_score_correction_bias = None
 
-        self.experts = FusedMoE(
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = Dots1MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
@@ -138,39 +174,28 @@ class Dots1MoE(nn.Module):
             scoring_func=config.scoring_func,
             # we do scaling outside, set factor to 1.0 to avoid double mul
             routed_scaling_factor=1.0,
-            e_score_correction_bias=self.gate.e_score_correction_bias)
-
-        if config.n_shared_experts is not None:
-            intermediate_size = (config.moe_intermediate_size *
-                                 config.n_shared_experts)
-            self.shared_experts = Dots1MLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                reduce_results=False,
-                prefix=f"{prefix}.shared_experts",
-            )
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
-        if self.n_shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states)
+
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits) * self.routed_scaling_factor
-        if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = (
+            self.experts(hidden_states=hidden_states, router_logits=router_logits)
+            * self.routed_scaling_factor
+        )
+
+        if self.shared_experts is not None:
+            final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
+
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
 class Dots1Attention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -178,10 +203,10 @@ class Dots1Attention(nn.Module):
         num_kv_heads: int,
         config: Dots1Config,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -200,8 +225,7 @@ class Dots1Attention(nn.Module):
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = getattr(config, "head_dim",
-                                hidden_size // self.total_num_heads)
+        self.head_dim = getattr(config, "head_dim", hidden_size // self.total_num_heads)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -216,6 +240,7 @@ class Dots1Attention(nn.Module):
             self.total_num_kv_heads,
             bias=attention_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
 
         self.o_proj = RowParallelLinear(
@@ -223,6 +248,7 @@ class Dots1Attention(nn.Module):
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -244,14 +270,15 @@ class Dots1Attention(nn.Module):
         self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
         self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
-    def forward(self, positions: torch.Tensor,
-                hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, positions: torch.Tensor, hidden_states: torch.Tensor
+    ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q = self.q_norm(q.reshape(-1, self.num_heads,
-                                  self.head_dim)).reshape(q.shape)
-        k = self.k_norm(k.reshape(-1, self.num_kv_heads,
-                                  self.head_dim)).reshape(k.shape)
+        q = self.q_norm(q.reshape(-1, self.num_heads, self.head_dim)).reshape(q.shape)
+        k = self.k_norm(k.reshape(-1, self.num_kv_heads, self.head_dim)).reshape(
+            k.shape
+        )
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
@@ -259,22 +286,20 @@ class Dots1Attention(nn.Module):
 
 
 class Dots1DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Dots1Config,
         prefix: str,
         model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        layer_idx = int(prefix.split(sep='.')[-1])
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        layer_idx = int(prefix.split(sep=".")[-1])
         self.layer_idx = layer_idx
 
         self.self_attn = Dots1Attention(
@@ -289,12 +314,14 @@ class Dots1DecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
-        if (config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace
-                and layer_idx % config.moe_layer_freq == 0):
-            self.mlp = Dots1MoE(config=config,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.mlp")
+        if (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        ):
+            self.mlp = Dots1MoE(
+                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+            )
         else:
             self.mlp = Dots1MLP(
                 hidden_size=config.hidden_size,
@@ -303,35 +330,31 @@ class Dots1DecoderLayer(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.mlp",
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
         self.routed_scaling_factor = config.routed_scaling_factor
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class Dots1Model(nn.Module):
-
     fall_back_to_pt_during_load = False
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -350,7 +373,8 @@ class Dots1Model(nn.Module):
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.embed_tokens")
+                prefix=f"{prefix}.embed_tokens",
+            )
         else:
             self.embed_tokens = PPMissingLayer()
 
@@ -363,31 +387,32 @@ class Dots1Model(nn.Module):
                 cache_config=cache_config,
                 quant_config=quant_config,
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -400,22 +425,21 @@ class Dots1Model(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
+            num_experts=self.config.n_routed_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -430,10 +454,10 @@ class Dots1Model(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 if name.endswith(".bias") and name not in params_dict:
@@ -456,11 +480,13 @@ class Dots1Model(nn.Module):
 
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     if name.endswith(".bias") and name not in params_dict:
@@ -471,15 +497,15 @@ class Dots1Model(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -498,30 +524,33 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Dots1Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = Dots1Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         else:
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids,
             positions,
@@ -533,12 +562,11 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index 2db350c892ae72086fa72a35c20efba66902d1d6..f46caaa095c6a6a557da362e4e5579ac8ad39ee0 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -9,76 +9,93 @@ import torch.nn.functional as F
 from torch.nn import LayerNorm
 from transformers.models.qwen2_vl import Qwen2VLProcessor
 
-from vllm.attention.layer import check_upstream_fa_availability
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import (
+    check_upstream_fa_availability,
+    maybe_get_vit_flash_attn_backend,
+)
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import utils as dist_utils
 from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (MultiModalEmbeddings,
-                                                   SupportsLoRA,
-                                                   SupportsMultiModal,
-                                                   SupportsPP)
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
-from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention
-from vllm.model_executor.models.qwen2_vl import (Qwen2VLDummyInputsBuilder,
-                                                 Qwen2VLMultiModalProcessor,
-                                                 Qwen2VLProcessingInfo)
-from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
-                                              init_vllm_registered_model,
-                                              maybe_prefix,
-                                              merge_multimodal_embeddings)
+from vllm.model_executor.models.qwen2_vl import (
+    Qwen2VisionAttention,
+    Qwen2VLDummyInputsBuilder,
+    Qwen2VLMultiModalProcessor,
+    Qwen2VLProcessingInfo,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict
-from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.dotsocr import (DotsOCRConfig,
-                                                     DotsVisionConfig)
+from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .vision import run_dp_sharded_mrope_vision_model
 
 IMAGE_TOKEN = "<|imgpad|>"
 
 
-class DotsOCRImagePixelInputs(TypedDict):
-    type: Literal["pixel_values", "image_grid_thw"]
+class DotsOCRImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: The total number of patches over each image over each prompt in
+              the batch
+        - ni: Number of images
+        - cps: Number of channels * patch_size * patch_size
+    """
 
-    pixel_values: torch.Tensor
-    image_grid_thw: torch.Tensor
+    type: Literal["pixel_values"]
 
+    pixel_values: Annotated[torch.Tensor, TensorShape("np", "cps")]
+    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
-class DotsOCRImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds", "image_grid_thw"]
-    image_embeds: torch.Tensor
-    """Supported types:
-    - List[`torch.Tensor`]: A list of tensors holding all images' features.
-        Each tensor holds an image's features.
-    - `torch.Tensor`: A tensor holding all images' features
-        (concatenation of all images' feature tensors).
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-        the number and resolution of the images.
-    - `hidden_size` must match the hidden size of language model backbone.
+
+class DotsOCRImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - nf: Number of image features
+        - hs: Hidden size
+        - ni: Number of images
     """
 
-    image_grid_thw: torch.Tensor
+    type: Literal["image_embeds"]
 
+    image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
+    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
-DotsOCRImageInputs = Union[DotsOCRImagePixelInputs,
-                           DotsOCRImageEmbeddingInputs]
 
+DotsOCRImageInputs: TypeAlias = DotsOCRImagePixelInputs | DotsOCRImageEmbeddingInputs
 
-class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
 
+class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         return IMAGE_TOKEN * num_images
@@ -87,34 +104,37 @@ class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
         target_width, target_height = self.info.get_image_size_with_most_features(  # noqa: E501
         )
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
         }
 
 
 class DotsOCRProcessingInfo(Qwen2VLProcessingInfo):
-
     def get_hf_config(self) -> DotsOCRConfig:
         config = self.ctx.get_hf_config()
-        if not config.__class__.__name__ == 'DotsOCRConfig':
+        if not config.__class__.__name__ == "DotsOCRConfig":
             raise TypeError(f"Expected DotsOCRConfig, got {type(config)}")
 
-        if hasattr(config, "vision_config") and isinstance(
-                config.vision_config, dict):
+        if hasattr(config, "vision_config") and isinstance(config.vision_config, dict):
             config.vision_config = DotsVisionConfig(**config.vision_config)
 
         return config
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_mm_max_tokens_per_item(
@@ -129,8 +149,7 @@ class DotsOCRProcessingInfo(Qwen2VLProcessingInfo):
         self,
         **kwargs: object,
     ) -> Qwen2VLProcessor:
-        self.get_tokenizer(
-        ).image_token = IMAGE_TOKEN  # Ensure image token is set
+        self.get_tokenizer().image_token = IMAGE_TOKEN  # Ensure image token is set
         processor = self.ctx.get_hf_processor(
             Qwen2VLProcessor,
             **kwargs,
@@ -142,13 +161,14 @@ class DotsOCRProcessingInfo(Qwen2VLProcessingInfo):
 
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
-    x1 = x[..., :x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
 
-def apply_rotary_pos_emb_vision(tensor: torch.Tensor,
-                                freqs: torch.Tensor) -> torch.Tensor:
+def apply_rotary_pos_emb_vision(
+    tensor: torch.Tensor, freqs: torch.Tensor
+) -> torch.Tensor:
     orig_dtype = tensor.dtype
     tensor = tensor.float()
 
@@ -166,23 +186,20 @@ def apply_rotary_pos_emb_vision(tensor: torch.Tensor,
 
 
 class VisionRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
-        inv_freq = 1.0 / (theta
-                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
     def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen,
-                           device=self.inv_freq.device,
-                           dtype=self.inv_freq.dtype)
+        seq = torch.arange(
+            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
         freqs = torch.outer(seq, self.inv_freq)
         return freqs
 
 
 class PatchMerger(nn.Module):
-
     def __init__(
         self,
         dim: int,
@@ -201,19 +218,23 @@ class PatchMerger(nn.Module):
             self.ln_q = RMSNorm(context_dim, eps=1e-6)
 
         self.mlp = nn.Sequential(
-            ColumnParallelLinear(self.hidden_size,
-                                 self.hidden_size,
-                                 bias=True,
-                                 return_bias=False,
-                                 prefix=f"{prefix}.0",
-                                 disable_tp=use_data_parallel),
+            ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=True,
+                return_bias=False,
+                prefix=f"{prefix}.0",
+                disable_tp=use_data_parallel,
+            ),
             nn.GELU(),
-            RowParallelLinear(self.hidden_size,
-                              dim,
-                              bias=True,
-                              return_bias=False,
-                              prefix=f"{prefix}.2",
-                              disable_tp=use_data_parallel),
+            RowParallelLinear(
+                self.hidden_size,
+                dim,
+                bias=True,
+                return_bias=False,
+                prefix=f"{prefix}.2",
+                disable_tp=use_data_parallel,
+            ),
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -225,26 +246,29 @@ class PatchMerger(nn.Module):
 
 
 class DotsVisionAttention(nn.Module):
-
-    def __init__(self,
-                 config,
-                 dim: int,
-                 num_heads: int = 16,
-                 bias: bool = True,
-                 *,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False) -> None:
+    def __init__(
+        self,
+        config,
+        dim: int,
+        num_heads: int = 16,
+        bias: bool = True,
+        *,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
+    ) -> None:
         super().__init__()
 
         self.embed_dim = dim
-        self.tp_size = (1 if use_data_parallel else
-                        get_tensor_model_parallel_world_size())
-        self.tp_rank = (0 if use_data_parallel else
-                        get_tensor_model_parallel_rank())
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
         self.hidden_size_per_attention_head = dist_utils.divide(dim, num_heads)
         self.num_attention_heads_per_partition = dist_utils.divide(
-            num_heads, self.tp_size)
+            num_heads, self.tp_size
+        )
         # qkv/proj follow Qwen2-VL style; bias controlled by arg
         self.qkv = QKVParallelLinear(
             hidden_size=dim,
@@ -253,44 +277,58 @@ class DotsVisionAttention(nn.Module):
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv",
-            disable_tp=use_data_parallel)
-        self.proj = RowParallelLinear(input_size=dim,
-                                      output_size=dim,
-                                      bias=bias,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.proj",
-                                      disable_tp=use_data_parallel)
+            disable_tp=use_data_parallel,
+        )
+        self.proj = RowParallelLinear(
+            input_size=dim,
+            output_size=dim,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
+        )
         # Select attention backend
         self.attn_backend = get_vit_attn_backend(
-            self.hidden_size_per_attention_head, torch.get_default_dtype())
+            self.hidden_size_per_attention_head,
+            torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
         self.use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-                check_upstream_fa_availability(torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            self.use_upstream_fa = True
+
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
-                _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
             raise RuntimeError(
-                f"Unsupported vision attention backend: {self.attn_backend}")
+                f"Unsupported vision attention backend: {self.attn_backend}"
+            )
         self.is_flash_attn_backend = self.attn_backend in {
-            _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rotary_pos_emb: Optional[torch.Tensor] = None,
+        rotary_pos_emb: torch.Tensor | None = None,
         *,
-        max_seqlen: Optional[int] = None,
-        seqlens: Optional[list[int]] = None,
+        max_seqlen: int | None = None,
+        seqlens: list[int] | None = None,
     ) -> torch.Tensor:
         # [S, C] -> [S, B=1, C]
         x = hidden_states.unsqueeze(1)
         x, _ = self.qkv(x)
-        q, k, v = Qwen2_5_VisionAttention.split_qkv(self, x)
+        q, k, v = Qwen2VisionAttention.split_qkv(self, x)
         bs = q.shape[1]
         # [S,B,H,D] -> [B,S,H,D]
         q = q.permute(1, 0, 2, 3).contiguous()
@@ -303,29 +341,27 @@ class DotsVisionAttention(nn.Module):
             q, k = torch.chunk(qk_rotated, 2, dim=0)
 
         if self.is_flash_attn_backend:
-            if self.attn_backend == _Backend.ROCM_AITER_FA:
-                from aiter import flash_attn_varlen_func
-            else:
-                if self.use_upstream_fa:
-                    from flash_attn import flash_attn_varlen_func
-                else:
-                    from vllm.vllm_flash_attn import flash_attn_varlen_func
             q_ = q.reshape(bs * q.shape[1], q.shape[2], q.shape[3])
             k_ = k.reshape(bs * k.shape[1], k.shape[2], k.shape[3])
             v_ = v.reshape(bs * v.shape[1], v.shape[2], v.shape[3])
-            output = flash_attn_varlen_func(q_,
-                                            k_,
-                                            v_,
-                                            cu_seqlens_q=cu_seqlens,
-                                            cu_seqlens_k=cu_seqlens,
-                                            max_seqlen_q=max_seqlen,
-                                            max_seqlen_k=max_seqlen,
-                                            dropout_p=0.0,
-                                            causal=False)
-            context_layer = output.view(bs, -1,
-                                        self.num_attention_heads_per_partition,
-                                        self.hidden_size_per_attention_head)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+            output = self.flash_attn_varlen_func(
+                q_,
+                k_,
+                v_,
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+                dropout_p=0.0,
+                causal=False,
+            )
+            context_layer = output.view(
+                bs,
+                -1,
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            )
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             outputs = []
             for i in range(1, len(cu_seqlens)):
                 s = int(cu_seqlens[i - 1])
@@ -333,21 +369,20 @@ class DotsVisionAttention(nn.Module):
                 q_i = q[:, s:e].permute(0, 2, 1, 3)
                 k_i = k[:, s:e].permute(0, 2, 1, 3)
                 v_i = v[:, s:e].permute(0, 2, 1, 3)
-                out_i = F.scaled_dot_product_attention(q_i,
-                                                       k_i,
-                                                       v_i,
-                                                       dropout_p=0.0)
+                out_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
                 out_i = out_i.permute(0, 2, 1, 3)
                 outputs.append(out_i)
             context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0]
-        elif self.attn_backend == _Backend.XFORMERS:
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             from xformers import ops as xops
             from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
+
+            attn_bias = BlockDiagonalMask.from_seqlens(
+                q_seqlen=seqlens, kv_seqlen=None, device=q.device
+            )
             context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
+                q, k, v, attn_bias=attn_bias, p=0, scale=None
+            )
         else:
             raise RuntimeError("Unsupported attention backend")
 
@@ -359,31 +394,36 @@ class DotsVisionAttention(nn.Module):
 
 
 class DotsSwiGLUFFN(nn.Module):
-
-    def __init__(self,
-                 config,
-                 *,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        config,
+        *,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         hidden_features = config.intermediate_size
         in_features = config.embed_dim
         bias = config.use_bias
 
         # Referenced aimv2.py AIMv2SwiGLUFFN
-        self.fc13 = MergedColumnParallelLinear(in_features,
-                                               [hidden_features] * 2,
-                                               bias=bias,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.fc13",
-                                               disable_tp=use_data_parallel)
-        self.fc2 = RowParallelLinear(hidden_features,
-                                     in_features,
-                                     bias=bias,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2",
-                                     disable_tp=use_data_parallel)
+        self.fc13 = MergedColumnParallelLinear(
+            in_features,
+            [hidden_features] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc13",
+            disable_tp=use_data_parallel,
+        )
+        self.fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -392,8 +432,7 @@ class DotsSwiGLUFFN(nn.Module):
         x, _ = self.fc2(x)
         return x
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("fc13", "fc1", 0),
             ("fc13", "fc3", 1),
@@ -401,7 +440,6 @@ class DotsSwiGLUFFN(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -419,15 +457,13 @@ class DotsSwiGLUFFN(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class DotsPatchEmbed(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         self.num_channels = config.num_channels
@@ -444,15 +480,19 @@ class DotsPatchEmbed(nn.Module):
         self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
 
     def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
-        x = x.view(-1, self.num_channels, self.temporal_patch_size,
-                   self.patch_size, self.patch_size)[:, :, 0]
+        x = x.view(
+            -1,
+            self.num_channels,
+            self.temporal_patch_size,
+            self.patch_size,
+            self.patch_size,
+        )[:, :, 0]
         x = self.proj(x).view(-1, self.embed_dim)
         x = self.norm(x)
         return x
 
 
 class DotsViTPreprocessor(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         self.patch_h = config.patch_size
@@ -467,38 +507,45 @@ class DotsViTPreprocessor(nn.Module):
 
 
 class DotsVisionBlock(nn.Module):
-
     def __init__(
         self,
         config,
         *,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
 
-        self.attn = DotsVisionAttention(config,
-                                        config.embed_dim,
-                                        num_heads=config.num_attention_heads,
-                                        bias=config.use_bias,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.attn",
-                                        use_data_parallel=use_data_parallel)
+        self.attn = DotsVisionAttention(
+            config,
+            config.embed_dim,
+            num_heads=config.num_attention_heads,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
+        )
         self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
-        self.mlp = DotsSwiGLUFFN(config,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.mlp",
-                                 use_data_parallel=use_data_parallel)
+        self.mlp = DotsSwiGLUFFN(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
         self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                *,
-                cu_seqlens: torch.Tensor,
-                rotary_pos_emb: torch.Tensor,
-                max_seqlen: Optional[int] = None,
-                seqlens: Optional[list[int]] = None) -> torch.Tensor:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,
+        seqlens: list[int] | None = None,
+    ) -> torch.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states),
             cu_seqlens=cu_seqlens,
@@ -511,16 +558,16 @@ class DotsVisionBlock(nn.Module):
 
 
 class DotsVisionTransformer(nn.Module):
-
     def __init__(
         self,
         config: DotsVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -531,26 +578,38 @@ class DotsVisionTransformer(nn.Module):
         head_dim = config.embed_dim // config.num_attention_heads
         self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
         self.attn_backend = get_vit_attn_backend(
-            head_size=head_dim, dtype=torch.get_default_dtype())
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-                check_upstream_fa_availability(torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        if (
+            self.attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
         self.out_hidden_size = config.hidden_size
         # Keep blocks for compatibility with other vision towers
-        num_layers = (config.num_hidden_layers if num_hidden_layers_override
-                      is None else num_hidden_layers_override)
-        self.blocks = nn.ModuleList([
-            DotsVisionBlock(config,
-                            quant_config=quant_config,
-                            prefix=f"{prefix}.blocks.{i}",
-                            use_data_parallel=use_data_parallel)
-            for i in range(num_layers)
-        ])
+        num_layers = (
+            config.num_hidden_layers
+            if num_hidden_layers_override is None
+            else num_hidden_layers_override
+        )
+        self.blocks = nn.ModuleList(
+            [
+                DotsVisionBlock(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{i}",
+                    use_data_parallel=use_data_parallel,
+                    attn_backend_override=attn_backend_override,
+                )
+                for i in range(num_layers)
+            ]
+        )
         if require_post_norm is None:
-            require_post_norm = (len(self.blocks) == config.num_hidden_layers)
+            require_post_norm = len(self.blocks) == config.num_hidden_layers
         if require_post_norm and self.config.post_norm:
-            self.post_trunk_norm = RMSNorm(config.embed_dim,
-                                           eps=config.rms_norm_eps)
+            self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
         else:
             self.post_trunk_norm = None
 
@@ -569,7 +628,7 @@ class DotsVisionTransformer(nn.Module):
     def device(self) -> torch.device:
         return self.patch_embed.patchifier.proj.weight.device
 
-    def get_pos_ids_by_grid(self, grid_thw):
+    def get_pos_ids_by_grid(self, grid_thw: list[list[int]]) -> list[torch.Tensor]:
         pos_ids = []
         for t, h, w in grid_thw:
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
@@ -591,55 +650,58 @@ class DotsVisionTransformer(nn.Module):
             )
             wpos_ids = wpos_ids.permute(0, 2, 1, 3)
             wpos_ids = wpos_ids.flatten()
-            pos_ids.append(
-                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
 
         return pos_ids
 
-    def rot_pos_emb(self, grid_thw):
+    def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor:
         pos_ids = self.get_pos_ids_by_grid(grid_thw)
         pos_ids = torch.cat(pos_ids, dim=0)
-        max_grid_size = grid_thw[:, 1:].max()
+        max_grid_size = max(max(h, w) for _, h, w in grid_thw)
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb
 
     def compute_attn_mask_seqlen(
-            self, cu_seqlens: torch.Tensor
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+        self, cu_seqlens: torch.Tensor
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
-        if self.attn_backend == _Backend.FLASH_ATTN:
+        if (
+            self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+            or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
+        ):
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
-    def forward(self, hidden_states: torch.Tensor,
-                grid_thw: list[list[int]]) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, grid_thw: list[list[int]]
+    ) -> torch.Tensor:
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
         # Convert grid_thw to tensor (always expecting list format now)
-        grid_thw = torch.tensor(grid_thw,
-                                device=hidden_states.device,
-                                dtype=torch.long)
+        grid_thw = torch.tensor(grid_thw, device=hidden_states.device, dtype=torch.long)
         hidden_states = hidden_states.to(self.dtype)
         hidden_states = self.patch_embed(hidden_states, grid_thw)
 
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
-
         cu_seqlens = torch.repeat_interleave(
-            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
-                dim=0,
-                dtype=grid_thw.dtype
-                if torch.jit.is_tracing() else torch.int32,
-            )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(
+            dim=0,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
 
         max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
         for blk in self.blocks:
-            hidden_states = blk(hidden_states,
-                                cu_seqlens=cu_seqlens,
-                                rotary_pos_emb=rotary_pos_emb,
-                                max_seqlen=max_seqlen,
-                                seqlens=seqlens)
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb=rotary_pos_emb,
+                max_seqlen=max_seqlen,
+                seqlens=seqlens,
+            )
 
         if self.post_trunk_norm is not None:
             hidden_states = self.post_trunk_norm(hidden_states)
@@ -653,8 +715,9 @@ class DotsVisionTransformer(nn.Module):
     info=DotsOCRProcessingInfo,
     dummy_inputs=DotsOCRDummyInputsBuilder,
 )
-class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
-                         SupportsLoRA):
+class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".attn.qkv_proj.": ".attn.qkv.",
@@ -682,7 +745,7 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|img|><|imgpad|><|endofimg|>"
 
@@ -698,11 +761,18 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
             self.config.vision_config = vision_config
         else:
             vision_config = self.config.vision_config
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
         self.vision_tower = DotsVisionTransformer(
             vision_config,
             quant_config=self.quant_config,
             prefix=maybe_prefix(prefix, "vision_tower"),
-            use_data_parallel=self.use_data_parallel)
+            use_data_parallel=self.use_data_parallel,
+            attn_backend_override=attn_backend_override,
+        )
         self.language_model: Qwen2ForCausalLM = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.config,
@@ -710,24 +780,13 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
             architectures=["Qwen2ForCausalLM"],
         )
 
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return torch.concat(list(mm_input))
-        else:
-            return torch.concat(mm_input)
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[DotsOCRImageInputs]:
+        self, **kwargs: object
+    ) -> DotsOCRImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -736,44 +795,30 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            return DotsOCRImagePixelInputs(type="pixel_values",
-                                           pixel_values=pixel_values,
-                                           image_grid_thw=image_grid_thw)
+            return DotsOCRImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-            return DotsOCRImageEmbeddingInputs(type="image_embeds",
-                                               image_embeds=image_embeds,
-                                               image_grid_thw=image_grid_thw)
+            return DotsOCRImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw,
+            )
 
     def _process_image_input(
-            self, image_input: DotsOCRImageInputs) -> tuple[torch.Tensor, ...]:
+        self, image_input: DotsOCRImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
         grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
-            image_embeds = image_input["image_embeds"].type(
-                self.vision_tower.dtype)
+            image_embeds = image_input["image_embeds"].type(self.vision_tower.dtype)
         else:
-            pixel_values = image_input["pixel_values"].type(
-                self.vision_tower.dtype)
+            pixel_values = image_input["pixel_values"].type(self.vision_tower.dtype)
 
             if self.use_data_parallel:
                 return run_dp_sharded_mrope_vision_model(
@@ -783,64 +828,47 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
                     rope_type="rope_3d",
                 )
             else:
-                image_embeds = self.vision_tower(
-                    pixel_values, grid_thw)[:, :self.config.hidden_size]
+                image_embeds = self.vision_tower(pixel_values, grid_thw_list)[
+                    :, : self.config.hidden_size
+                ]
 
         # Split concatenated embeddings for each image item.
         merge_size = self.vision_tower.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
+        sizes = (
+            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
+            // (merge_size * merge_size)
+        ).tolist()
 
         return image_embeds.split(sizes)
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_id,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
-        elif inputs_embeds is None and kwargs.get("pixel_values") is not None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is None:
-                inputs_embeds = None
-            else:
-                assert input_ids is not None
-                inputs_embeds = self.get_multimodal_embeddings(
-                    input_ids,
-                    image_input=image_input,
-                )
-                input_ids = None
+        elif inputs_embeds is None:
+            vision_embeddings = self.embed_multimodal(**kwargs)
+            inputs_embeds = self.embed_input_ids(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_id,
+            )
+            input_ids = None
 
         hidden_states = self.language_model(
             input_ids=input_ids,
@@ -854,11 +882,10 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
diff --git a/vllm/model_executor/models/ernie45.py b/vllm/model_executor/models/ernie45.py
index e7302dc5ecdd7bdedcd9a2da55c5b6b972393366..c1a4737e1f3264165d7a5aad070711c6f3104803 100644
--- a/vllm/model_executor/models/ernie45.py
+++ b/vllm/model_executor/models/ernie45.py
@@ -22,14 +22,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Erine model compatible with HuggingFace weights."""
+
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 
 from .utils import PPMissingLayer
 
 
+@support_torch_compile(
+    # set dynamic_arg_dims to support mrope
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
 class Ernie4_5ForCausalLM(LlamaForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         # Hack Llama model to fit HF format Ernie4.5 dense implementation
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index d262e9e9da50e29a07e2b8fc65355499de97b073..f2999968669f64ff755dc21a3e9344d485526094 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -22,9 +22,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only ErineMoE model compatible with HuggingFace weights."""
-from collections.abc import Iterable
+
+import typing
+from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -32,61 +34,80 @@ from transformers import PretrainedConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Ernie4_5_MoeMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
         use_bias: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=use_bias,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=use_bias,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -97,36 +118,77 @@ class Ernie4_5_MoeMLP(nn.Module):
 
 
 class Ernie4_5_MoeMoE(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        enable_eplb: bool = False,
     ):
         super().__init__()
 
         layer_idx = extract_layer_index(prefix)
         self.layer_idx = layer_idx
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.has_shared_experts = (getattr(config, "moe_num_shared_experts", 0)
-                                   > 0)
+
+        self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", None)
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.moe_num_experts
+        self.n_shared_experts: int = self.moe_num_shared_experts
+
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+        self.has_shared_experts = getattr(config, "moe_num_shared_experts", 0) > 0
 
         if self.tp_size > config.moe_num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.moe_num_experts}.")
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.moe_num_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+                f"the number of experts {config.moe_num_experts}."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.moe_num_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
 
         self.gate.e_score_correction_bias = nn.Parameter(
-            torch.empty(config.moe_num_experts))
+            torch.empty(config.moe_num_experts, dtype=torch.float32)
+        )
 
-        self.experts = FusedMoE(
+        if self.has_shared_experts:
+            intermediate_size = (
+                config.moe_intermediate_size * config.moe_num_shared_experts
+            )
+            self.shared_experts = Ernie4_5_MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.shared_experts",
+                reduce_results=False,
+            )
+        else:
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
             num_experts=config.moe_num_experts,
             top_k=config.moe_k,
             hidden_size=config.hidden_size,
@@ -135,60 +197,49 @@ class Ernie4_5_MoeMoE(nn.Module):
             renormalize=True,
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
-            e_score_correction_bias=self.gate.e_score_correction_bias)
-
-        if self.has_shared_experts:
-            intermediate_size = (config.moe_intermediate_size *
-                                 config.moe_num_shared_experts)
-            self.shared_experts = Ernie4_5_MoeMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                prefix=f"{prefix}.shared_experts",
-                reduce_results=self.experts.must_reduce_shared_expert_outputs(
-                ))
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         orig_shape = hidden_states.shape
         hidden_dim = hidden_states.shape[-1]
         hidden_states = hidden_states.view(-1, hidden_dim)
-        shared_output = None
-        if self.has_shared_experts:
-            shared_output = self.shared_experts(hidden_states)
 
-        router_logits, _ = self.gate(hidden_states)
+        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
 
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
 
-        if self.has_shared_experts and \
-              shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+        if self.has_shared_experts:
+            final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
+        else:
+            final_hidden_states = final_hidden_states[1]
 
         if self.tp_size > 1:
-            final_hidden_states = (
-                self.experts.maybe_all_reduce_tensor_model_parallel(
-                    final_hidden_states))
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
 
         return final_hidden_states.view(orig_shape)
 
 
 class Ernie4_5_MoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rope_theta: float = 500000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 131072,
         rms_norm_eps: float = 1e-05,
         qkv_bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -218,19 +269,23 @@ class Ernie4_5_MoeAttention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
-        self.qkv_proj = QKVParallelLinear(hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=qkv_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -240,20 +295,21 @@ class Ernie4_5_MoeAttention(nn.Module):
             is_neox_style=False,
             rope_scaling=rope_scaling,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
-
         qkv, _ = self.qkv_proj(hidden_states)
 
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -267,30 +323,29 @@ class Ernie4_5_MoeAttention(nn.Module):
 
 
 class Ernie4_5_MoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        enable_eplb: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 500000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          131072)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
         self.self_attn = Ernie4_5_MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
-            head_dim=getattr(config, 'head_dim', None),
+            head_dim=getattr(config, "head_dim", None),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             rms_norm_eps=config.rms_norm_eps,
-            qkv_bias=getattr(config, 'use_bias', False),
+            qkv_bias=getattr(config, "use_bias", False),
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
@@ -302,45 +357,51 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
         # MoE
         moe_num_experts = getattr(config, "moe_num_experts", 0)
         moe_layer_start_index = getattr(config, "moe_layer_start_index", 0)
-        moe_layer_end_index = getattr(config, "moe_layer_end_index",
-                                      config.num_hidden_layers - 1)
+        moe_layer_end_index = getattr(
+            config, "moe_layer_end_index", config.num_hidden_layers - 1
+        )
         moe_layer_interval = getattr(config, "moe_layer_interval", 1)
         use_moe = getattr(config, "use_moe", moe_num_experts > 0)
 
-        if (use_moe and ((layer_idx + 1) % moe_layer_interval == 0)
-                and layer_idx >= moe_layer_start_index
-                and layer_idx <= moe_layer_end_index):
-            self.mlp = Ernie4_5_MoeMoE(config=config,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.mlp")
+        if (
+            use_moe
+            and ((layer_idx + 1) % moe_layer_interval == 0)
+            and layer_idx >= moe_layer_start_index
+            and layer_idx <= moe_layer_end_index
+        ):
+            self.mlp = Ernie4_5_MoeMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
+            )
         else:
             self.mlp = Ernie4_5_MoeMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
-                use_bias=getattr(config, 'use_bias', False),
+                use_bias=getattr(config, "use_bias", False),
                 quant_config=quant_config,
-                prefix=f"{prefix}.mlp")
+                prefix=f"{prefix}.mlp",
+            )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
-
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.self_attn(
             positions=positions,
@@ -348,8 +409,7 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
         hidden_states = self.mlp(hidden_states)
 
@@ -358,7 +418,6 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
 
 @support_torch_compile
 class Ernie4_5_MoeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -369,22 +428,31 @@ class Ernie4_5_MoeModel(nn.Module):
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.config = config
+        parallel_config = vllm_config.parallel_config
+        eplb_config = parallel_config.eplb_config
+        enable_eplb = parallel_config.enable_eplb
+
+        self.num_redundant_experts = eplb_config.num_redundant_experts
 
         if get_pp_group().is_first_rank:
             self.embed_tokens = VocabParallelEmbedding(
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.embed_tokens")
+                prefix=f"{prefix}.embed_tokens",
+            )
         else:
             self.embed_tokens = PPMissingLayer()
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: Ernie4_5_MoeDecoderLayer(config=config,
-                                                    cache_config=cache_config,
-                                                    quant_config=quant_config,
-                                                    prefix=prefix),
+            lambda prefix: Ernie4_5_MoeDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                enable_eplb=enable_eplb,
+            ),
             prefix=f"{prefix}.layers",
         )
 
@@ -393,26 +461,25 @@ class Ernie4_5_MoeModel(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -423,27 +490,26 @@ class Ernie4_5_MoeModel(nn.Module):
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
 
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.moe_num_experts)
+            num_experts=self.config.moe_num_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -457,8 +523,7 @@ class Ernie4_5_MoeModel(nn.Module):
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            if self.config.tie_word_embeddings and name.endswith(
-                    "lm_head.weight"):
+            if self.config.tie_word_embeddings and name.endswith("lm_head.weight"):
                 continue
             # MTP will be supported soon.
             if "mtp" in name:
@@ -468,17 +533,18 @@ class Ernie4_5_MoeModel(nn.Module):
                 name = name.replace("moe_statics", "gate")
                 loaded_weight = loaded_weight.squeeze(0)
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
 
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -489,34 +555,58 @@ class Ernie4_5_MoeModel(nn.Module):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                is_expert_weight = False
                 for mapping in expert_params_mapping:
                     param_name, weight_name, expert_id, shard_id = mapping
 
                     if weight_name not in name:
                         continue
 
-                    name = name.replace(weight_name, param_name)
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+
+                    # Do not modify `name` since the loop may continue here
+                    # Instead, create a new variable
+                    name_mapped = name.replace(weight_name, param_name)
                     # Skip layers on other devices.
-                    if is_pp_missing_parameter(name, self):
+                    if is_pp_missing_parameter(name_mapped, self):
                         continue
 
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name_mapped.endswith(".bias") or name_mapped.endswith("_bias")
+                    ) and name_mapped not in params_dict:
                         continue
-                    param = params_dict[name]
-
-                    weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
-                    break
+                    param = params_dict[name_mapped]
+                    # We should ask the weight loader to return success or not
+                    # here since otherwise we may skip experts with other
+                    # available replicas.
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        name = name_mapped
+                        break
                 else:
+                    if is_expert_weight:
+                        # We've checked that this is an expert weight
+                        # However it's not mapped locally to this rank
+                        # So we simply skip it
+                        continue
+
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -527,14 +617,15 @@ class Ernie4_5_MoeModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -555,15 +646,17 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Ernie4_5_MoeModel(vllm_config=vllm_config,
-                                       prefix=maybe_prefix(prefix, "model"))
+        self.model = Ernie4_5_MoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         else:
             self.lm_head = PPMissingLayer()
 
@@ -571,35 +664,94 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+        self.expert_weights = []
+
+        # Set MoE hyperparameters
+        moe_layers_indices = [
+            i
+            for i in range(config.num_hidden_layers)
+            if (
+                i >= config.moe_layer_start_index
+                and i <= config.moe_layer_end_index
+                and (i + 1) % config.moe_layer_interval == 0
+            )
+        ]
+        self.num_moe_layers = len(moe_layers_indices)
+        self.num_expert_groups = 1
+
+        self.moe_layers: list[SharedFusedMoE] = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, Ernie4_5_MoeDecoderLayer)
+            if isinstance(layer.mlp, Ernie4_5_MoeMoE):
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None:
+            logger.warning("No Ernie4_5_MoeMoE layer found in model.layers.")
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, Ernie4_5_MoeMoE):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 74b358034ef3d47b07afdb50407067f59ded8a45..daa5bf03ea4a90936b6b2e484ad08a510d3f2abd 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -22,10 +22,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Erine VL model compatible with HuggingFace weights."""
+
+import itertools
 import math
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Any, Callable, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Literal
 
 import numpy as np
 import torch
@@ -34,34 +36,53 @@ import torch.nn.functional as F
 from einops import rearrange, repeat
 from transformers import BatchFeature
 
-from vllm.attention.layer import check_upstream_fa_availability
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import (
+    check_upstream_fa_availability,
+    maybe_get_vit_flash_attn_backend,
+)
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.platforms import _Backend, current_platform
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
@@ -75,15 +96,14 @@ def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
         return torch.cat((-x2, x1), dim=-1)
     else:
         x1, x2 = x[..., ::2], x[..., 1::2]
-        return rearrange(torch.stack((-x2, x1), dim=-1),
-                         "... d two -> ... (d two)",
-                         two=2)
+        return rearrange(
+            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+        )
 
 
-def apply_rotary_emb_torch(x: torch.Tensor,
-                           cos: torch.Tensor,
-                           sin: torch.Tensor,
-                           interleaved: bool = False) -> torch.Tensor:
+def apply_rotary_emb_torch(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+) -> torch.Tensor:
     """
     x: (batch_size, seqlen, nheads, headdim)
     cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
@@ -91,22 +111,21 @@ def apply_rotary_emb_torch(x: torch.Tensor,
     ro_dim = cos.shape[-1] * 2
     assert ro_dim <= x.shape[-1]
     cos = repeat(
-        cos,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     sin = repeat(
-        sin,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     return torch.cat(
         [
-            x[..., :ro_dim] * cos +
-            rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
         ],
         dim=-1,
     )
 
 
-def apply_rotary_pos_emb_vision(t: torch.Tensor,
-                                freqs: torch.Tensor) -> torch.Tensor:
+def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
     t_ = t.float()
     cos = freqs.cos()
     sin = freqs.sin()
@@ -120,14 +139,14 @@ def apply_rotary_pos_emb_vision(t: torch.Tensor,
 def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int):
     """All-gather the input tensor interleavely across model parallel group."""
     import torch.distributed as dist
+
     gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)]
-    dist.all_gather(gathered_tensors,
-                    local_tensor,
-                    group=parallel_state.get_tp_group().device_group)
+    dist.all_gather(
+        gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group
+    )
 
     gathered_tensors_split = [
-        torch.split(tensor, hidden_size // tp_size, -1)
-        for tensor in gathered_tensors
+        torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors
     ]
     ordered_tensors = [
         tensor for pair in zip(*gathered_tensors_split) for tensor in pair
@@ -144,17 +163,20 @@ class Ernie4_5_VisionAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         # Per attention head and per partition values.
         self.tp_size = parallel_state.get_tensor_model_parallel_world_size()
         self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
         self.hidden_size_per_attention_head = dist_utils.divide(
-            projection_size, num_heads)
+            projection_size, num_heads
+        )
         self.num_attention_heads_per_partition = dist_utils.divide(
-            num_heads, self.tp_size)
+            num_heads, self.tp_size
+        )
 
         self.qkv = QKVParallelLinear(
             hidden_size=embed_dim,
@@ -163,65 +185,81 @@ class Ernie4_5_VisionAttention(nn.Module):
             total_num_kv_heads=num_heads,
             bias=True,
             quant_config=quant_config,
-            prefix=f"{prefix}.qkv")
-        self.proj = RowParallelLinear(input_size=projection_size,
-                                      output_size=embed_dim,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.proj")
+            prefix=f"{prefix}.qkv",
+        )
+        self.proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+        )
 
         # Detect attention implementation.
         self.attn_backend = get_vit_attn_backend(
             head_size=self.hidden_size_per_attention_head,
-            dtype=torch.get_default_dtype())
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
 
         self.use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            self.use_upstream_fa = True
+
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
 
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
-                _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
             raise RuntimeError(
                 f"Ernie45-VL does not support {self.attn_backend} backend now."
             )
         self.is_flash_attn_backend = self.attn_backend in {
-            _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }
 
     def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
         # [s, b, 3 * head * head_dim]
         seq_len, bs, _ = qkv.shape
         if self.tp_size > 1:
-            qkv = all_gather_interleave(qkv, self.qkv.hidden_size,
-                                        self.tp_size)
+            qkv = all_gather_interleave(qkv, self.qkv.hidden_size, self.tp_size)
 
         # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
         q, k, v = qkv.chunk(3, dim=2)
 
         # 3 * [s, b, head * head_dim]
         if self.tp_size > 1:
-            splitter = partial(dist_utils.split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(
+                dist_utils.split_tensor_along_last_dim, num_partitions=self.tp_size
+            )
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
             v = splitter(v)[self.tp_rank]
 
         # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
-        new_shape = (seq_len, bs, self.num_attention_heads_per_partition,
-                     self.hidden_size_per_attention_head)
+        new_shape = (
+            seq_len,
+            bs,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
         q, k, v = (x.view(*new_shape) for x in (q, k, v))
         return q, k, v
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -230,40 +268,31 @@ class Ernie4_5_VisionAttention(nn.Module):
         q, k, v = self.split_qkv(x)
         batch_size = q.shape[1]
 
-        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
-                   for x in (q, k, v))
+        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v))
         if rotary_pos_emb is not None:
             qk_concat = torch.cat([q, k], dim=0)
             qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb)
             q, k = torch.chunk(qk_rotated, 2, dim=0)
 
         if self.is_flash_attn_backend:
-            # from vllm_flash_attn.flash_attn_interface import (
-            #   flash_attn_varlen_func)
-            if self.attn_backend == _Backend.ROCM_AITER_FA:
-                from aiter import flash_attn_varlen_func
-            else:
-                if self.use_upstream_fa:
-                    from flash_attn import flash_attn_varlen_func
-                else:
-                    from vllm.vllm_flash_attn import flash_attn_varlen_func
-
             q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
 
-            output = flash_attn_varlen_func(q,
-                                            k,
-                                            v,
-                                            cu_seqlens_q=cu_seqlens,
-                                            cu_seqlens_k=cu_seqlens,
-                                            max_seqlen_q=max_seqlen,
-                                            max_seqlen_k=max_seqlen,
-                                            dropout_p=0.0,
-                                            causal=False)
-
-            context_layer = rearrange(output,
-                                      "(b s) h d -> s b (h d)",
-                                      b=batch_size).contiguous()
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+            output = self.flash_attn_varlen_func(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+                dropout_p=0.0,
+                causal=False,
+            )
+
+            context_layer = rearrange(
+                output, "(b s) h d -> s b (h d)", b=batch_size
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
             outputs = []
             for i in range(1, len(cu_seqlens)):
@@ -272,54 +301,58 @@ class Ernie4_5_VisionAttention(nn.Module):
                 q_i = q[:, start_idx:end_idx]
                 k_i = k[:, start_idx:end_idx]
                 v_i = v[:, start_idx:end_idx]
-                q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
-                                 for x in [q_i, k_i, v_i])
-                output_i = F.scaled_dot_product_attention(q_i,
-                                                          k_i,
-                                                          v_i,
-                                                          dropout_p=0.0)
+                q_i, k_i, v_i = (
+                    rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
+                )
+                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
                 output_i = rearrange(output_i, "b h s d -> b s h d ")
                 outputs.append(output_i)
             context_layer = torch.cat(outputs, dim=1)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
-        elif self.attn_backend == _Backend.XFORMERS:
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             from xformers import ops as xops
             from xformers.ops.fmha.attn_bias import BlockDiagonalMask
 
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
+            attn_bias = BlockDiagonalMask.from_seqlens(
+                q_seqlen=seqlens, kv_seqlen=None, device=q.device
+            )
 
             context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
+                q, k, v, attn_bias=attn_bias, p=0, scale=None
+            )
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
 
         output, _ = self.proj(context_layer)
         return output
 
 
 class Ernie4_5_VisionMLP(nn.Module):
-
     def __init__(
         self,
         in_features: int,
         hidden_features: int,
         act_layer: type[nn.Module] = QuickGELU,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
-        self.fc1 = ColumnParallelLinear(in_features,
-                                        hidden_features,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
+        self.fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
         self.act = act_layer()
-        self.fc2 = RowParallelLinear(hidden_features,
-                                     in_features,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+        self.fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_parallel, _ = self.fc1(x)
@@ -329,16 +362,16 @@ class Ernie4_5_VisionMLP(nn.Module):
 
 
 class Ernie4_5_VisionBlock(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int,
         mlp_ratio: float,
         act_layer: type[nn.Module] = QuickGELU,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
 
@@ -348,27 +381,31 @@ class Ernie4_5_VisionBlock(nn.Module):
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
 
-        self.attn = Ernie4_5_VisionAttention(embed_dim=dim,
-                                             num_heads=num_heads,
-                                             projection_size=dim,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.attn")
+        self.attn = Ernie4_5_VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            attn_backend_override=attn_backend_override,
+        )
 
-        self.mlp = Ernie4_5_VisionMLP(dim,
-                                      mlp_hidden_dim,
-                                      act_layer=act_layer,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.mlp")
+        self.mlp = Ernie4_5_VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
 
     def forward(
-            self,
-            hidden_states: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
-
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states),
             cu_seqlens=cu_seqlens,
@@ -381,7 +418,6 @@ class Ernie4_5_VisionBlock(nn.Module):
 
 
 class Ernie4_5_VisionPatchEmbed(nn.Module):
-
     def __init__(
         self,
         patch_size: int = 14,
@@ -389,18 +425,16 @@ class Ernie4_5_VisionPatchEmbed(nn.Module):
         embed_dim: int = 1280,
         prefix="",
     ) -> None:
-
         super().__init__()
         self.patch_size = patch_size
         self.in_channels = in_channels
         self.embed_dim = embed_dim
 
-        self.proj = nn.Linear(in_channels * patch_size * patch_size,
-                              embed_dim,
-                              bias=False)
+        self.proj = nn.Linear(
+            in_channels * patch_size * patch_size, embed_dim, bias=False
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-
         target_dtype = self.proj.weight.dtype
         hidden_states = hidden_states.to(target_dtype)
         hidden_states = self.proj(hidden_states)
@@ -409,30 +443,29 @@ class Ernie4_5_VisionPatchEmbed(nn.Module):
 
 
 class Ernie4_5_VisionRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
-        self.inv_freq = 1.0 / theta**(
-            torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim)
+        self.inv_freq = 1.0 / theta ** (
+            torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim
+        )
 
     def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen,
-                           device=self.inv_freq.device,
-                           dtype=self.inv_freq.dtype)
+        seq = torch.arange(
+            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
         freqs = torch.outer(input=seq, vec2=self.inv_freq)
         return freqs
 
 
 class Ernie4_5_VisionTransformer(nn.Module):
-
     def __init__(
         self,
         vision_config,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
-
         super().__init__()
         patch_size = vision_config.patch_size
         spatial_merge_size = vision_config.spatial_merge_size
@@ -458,25 +491,36 @@ class Ernie4_5_VisionTransformer(nn.Module):
         head_dim = embed_dim // num_heads
         self.rotary_pos_emb = Ernie4_5_VisionRotaryEmbedding(head_dim // 2)
 
-        self.blocks = nn.ModuleList([
-            Ernie4_5_VisionBlock(dim=embed_dim,
-                                 num_heads=num_heads,
-                                 mlp_ratio=mlp_ratio,
-                                 norm_layer=norm_layer,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.blocks.{layer_idx}")
-            for layer_idx in range(depth)
-        ])
-
-        assert (hidden_size == embed_dim
-                ), "vit's config.hidden must be equal to config.embed_dim"
+        self.blocks = nn.ModuleList(
+            [
+                Ernie4_5_VisionBlock(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                    attn_backend_override=attn_backend_override,
+                )
+                for layer_idx in range(depth)
+            ]
+        )
+
+        assert hidden_size == embed_dim, (
+            "vit's config.hidden must be equal to config.embed_dim"
+        )
         self.ln = nn.LayerNorm(hidden_size, eps=1e-6)
 
         self.attn_backend = get_vit_attn_backend(
-            head_size=head_dim, dtype=torch.get_default_dtype())
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-        check_upstream_fa_availability(torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        if (
+            self.attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
 
     @property
     def dtype(self) -> torch.dtype:
@@ -491,20 +535,27 @@ class Ernie4_5_VisionTransformer(nn.Module):
         for t, h, w in grid_thw:
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-            hpos_ids = hpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            ).permute(0, 2, 1, 3).flatten()
-            wpos_ids = wpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            ).permute(0, 2, 1, 3).flatten()
-            pos_ids.append(
-                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
         pos_ids = torch.cat(pos_ids, dim=0)
         max_grid_size = grid_thw[:, 1:].max()
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
@@ -512,34 +563,36 @@ class Ernie4_5_VisionTransformer(nn.Module):
         return rotary_pos_emb
 
     def compute_attn_mask_seqlen(
-            self, cu_seqlens: torch.Tensor
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+        self, cu_seqlens: torch.Tensor
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
-        if self.attn_backend == _Backend.FLASH_ATTN:
+        if (
+            self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+            or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
+        ):
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                grid_thw: torch.Tensor,
-                num_pad=0) -> torch.Tensor:
-
+    def forward(
+        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, num_pad=0
+    ) -> torch.Tensor:
         hidden_states = self.patch_embed(hidden_states)
 
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
         rotary_pos_emb = rotary_pos_emb.to(hidden_states.device)
 
-        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
-                                             grid_thw[:, 0]).cumsum(
-                                                 dim=0, dtype=torch.int32)
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
 
+        zeros = cu_seqlens.new_zeros(1)
         if num_pad > 0:
-            cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0)
+            cu_seqlens = torch.cat([zeros, cu_seqlens, zeros])
             cu_seqlens[-1] = cu_seqlens[-2] + num_pad
         else:
-            cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+            cu_seqlens = torch.cat([zeros, cu_seqlens])
 
         # add batch size
         if hidden_states.ndim == 2:
@@ -570,8 +623,7 @@ class Ernie4_5_VisionTransformer(nn.Module):
 
         for name, loaded_weight in weights:
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -580,51 +632,53 @@ class Ernie4_5_VisionTransformer(nn.Module):
 # === Vision Inputs === #
 
 
-class Ernie4_5_VLImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    """Shape:
-    `(num_patches, num_channels * patch_size * patch_size)`
+class Ernie4_5_VLImagePixelInputs(TensorSchema):
     """
-
-    grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - np: The total number of patches over each image over each prompt in
+              the batch
+        - ni: Number of images
+        - cps: Number of channels * patch_size * patch_size
     """
 
+    type: Literal["pixel_values"]
 
-Ernie4_5_VLImageInputs = Ernie4_5_VLImagePixelInputs
+    pixel_values: Annotated[torch.Tensor, TensorShape("np", "cps")]
+    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
-class Ernie4_5_VLVideoPixelInputs(TypedDict):
-    type: Literal["pixel_values_videos"]
-    pixel_values_videos: torch.Tensor
-    """Shape:
-    `(num_patches,
-      num_channels * temporal_patch_size * patch_size * patch_size)`
-    """
+Ernie4_5_VLImageInputs = Ernie4_5_VLImagePixelInputs
 
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
 
-    This should be in `(grid_t, grid_h, grid_w)` format.
+class Ernie4_5_VLVideoPixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: The total number of patches over each image over each prompt in
+              the batch
+        - ni: Number of images
+        - cps: Number of channels * temporal_patch_size * patch_size *
+              patch_size
     """
 
+    type: Literal["pixel_values_videos"]
+    pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "cps")]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
+
 
-Ernie4_5_VLVideoInputs = Ernie4_5_VLImagePixelInputs
+Ernie4_5_VLVideoInputs = Ernie4_5_VLVideoPixelInputs
 
 # === Vision Processor === #
 
 
-def round_by_factor(number: Union[int, float], factor: int) -> int:
+def round_by_factor(number: int | float, factor: int) -> int:
     return round(number / factor) * factor
 
 
-def ceil_by_factor(number: Union[int, float], factor: int) -> int:
+def ceil_by_factor(number: int | float, factor: int) -> int:
     return math.ceil(number / factor) * factor
 
 
-def floor_by_factor(number: Union[int, float], factor: int) -> int:
+def floor_by_factor(number: int | float, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
@@ -665,14 +719,15 @@ def smart_resize(
 
 
 class VariableResolutionResamplerModel(nn.Module):
-
-    def __init__(self,
-                 in_dim,
-                 out_dim,
-                 spatial_conv_size,
-                 temporal_conv_size,
-                 config,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        spatial_conv_size,
+        temporal_conv_size,
+        config,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.in_dim = in_dim
         self.out_dim = out_dim
@@ -682,18 +737,21 @@ class VariableResolutionResamplerModel(nn.Module):
         self.use_temporal_conv = config.use_temporal_conv
 
         # compress 2d conv(picture) to 1d
-        self.spatial_dim = (self.in_dim * self.spatial_conv_size *
-                            self.spatial_conv_size)
+        self.spatial_dim = self.in_dim * self.spatial_conv_size * self.spatial_conv_size
         # compress 3d conv(video) to 1d
-        self.temporal_dim = (self.in_dim * self.spatial_conv_size *
-                             self.spatial_conv_size * self.temporal_conv_size)
+        self.temporal_dim = (
+            self.in_dim
+            * self.spatial_conv_size
+            * self.spatial_conv_size
+            * self.temporal_conv_size
+        )
 
         self.spatial_linear1 = ColumnParallelLinear(
             self.spatial_dim,
             self.spatial_dim,
             bias=True,
             gather_output=True,
-            quant_config=getattr(config, 'quant_config', None),
+            quant_config=getattr(config, "quant_config", None),
             prefix=f"{prefix}.spatial_linear1",
         )
 
@@ -704,7 +762,7 @@ class VariableResolutionResamplerModel(nn.Module):
             self.spatial_dim,
             bias=True,
             gather_output=True,
-            quant_config=getattr(config, 'quant_config', None),
+            quant_config=getattr(config, "quant_config", None),
             prefix=f"{prefix}.spatial_linear2",
         )
 
@@ -716,7 +774,7 @@ class VariableResolutionResamplerModel(nn.Module):
                 self.spatial_dim,
                 bias=True,
                 gather_output=True,
-                quant_config=getattr(config, 'quant_config', None),
+                quant_config=getattr(config, "quant_config", None),
                 prefix=f"{prefix}.temporal_linear1",
             )
 
@@ -727,7 +785,7 @@ class VariableResolutionResamplerModel(nn.Module):
                 self.spatial_dim,
                 bias=True,
                 gather_output=True,
-                quant_config=getattr(config, 'quant_config', None),
+                quant_config=getattr(config, "quant_config", None),
                 prefix=f"{prefix}.temporal_linear2",
             )
 
@@ -738,12 +796,13 @@ class VariableResolutionResamplerModel(nn.Module):
             self.out_dim,
             bias=True,
             gather_output=True,
-            quant_config=getattr(config, 'quant_config', None),
+            quant_config=getattr(config, "quant_config", None),
             prefix=f"{prefix}.mlp",
         )
 
-        self.after_norm = RMSNorm(hidden_size=out_dim,
-                                  eps=getattr(config, 'rms_norm_eps', 1e-6))
+        self.after_norm = RMSNorm(
+            hidden_size=out_dim, eps=getattr(config, "rms_norm_eps", 1e-6)
+        )
 
     def spatial_conv_reshape(self, x, spatial_conv_size):
         S, C = x.shape
@@ -751,7 +810,6 @@ class VariableResolutionResamplerModel(nn.Module):
         return x
 
     def forward(self, x, grid_thw):
-
         def fwd_spatial(x):
             x = self.spatial_conv_reshape(x, self.spatial_conv_size)
 
@@ -763,43 +821,48 @@ class VariableResolutionResamplerModel(nn.Module):
             return x
 
         def fwd_placeholder(x, grid_thw, to_tensor=False):
-
             grid_thw_cpu = grid_thw.cpu().numpy()
             grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:]
-            grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size**
-                                                      2)
+            grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size**2)
 
-            tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // (
-                self.spatial_conv_size**2)
-            batch_offset = np.empty(tokens_per_img_or_vid.size,
-                                    dtype=tokens_per_img_or_vid.dtype)
+            tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // (self.spatial_conv_size**2)
+            batch_offset = np.empty(
+                tokens_per_img_or_vid.size, dtype=tokens_per_img_or_vid.dtype
+            )
             batch_offset[0] = 0
             batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1]
 
             slice_offsets = []
             for temporoal_size, spatial_size, b_offset in zip(
-                    grid_t, grid_hw_after_conv, batch_offset):
+                grid_t, grid_hw_after_conv, batch_offset
+            ):
                 for temp_offset in range(0, temporoal_size, 2):
                     slice_offsets.append(
                         np.arange(
                             b_offset + (temp_offset) * spatial_size,
                             b_offset + (temp_offset + 1) * spatial_size,
-                        ))
-            slice_offsets = torch.tensor(np.concatenate(slice_offsets,
-                                                        axis=-1)).to(x.device)
+                        )
+                    )
+            slice_offsets = torch.tensor(np.concatenate(slice_offsets, axis=-1)).to(
+                x.device
+            )
 
             slice_offsets2 = []
             for temporoal_size, spatial_size, b_offset in zip(
-                    grid_t, grid_hw_after_conv, batch_offset):
-                for temp_offset in range(1 if temporoal_size > 1 else 0,
-                                         temporoal_size, 2):
+                grid_t, grid_hw_after_conv, batch_offset
+            ):
+                for temp_offset in range(
+                    1 if temporoal_size > 1 else 0, temporoal_size, 2
+                ):
                     slice_offsets2.append(
                         np.arange(
                             b_offset + (temp_offset) * spatial_size,
                             b_offset + (temp_offset + 1) * spatial_size,
-                        ))
-            slice_offsets2 = torch.tensor(
-                np.concatenate(slice_offsets2, axis=-1)).to(x.device)
+                        )
+                    )
+            slice_offsets2 = torch.tensor(np.concatenate(slice_offsets2, axis=-1)).to(
+                x.device
+            )
 
             x_timestep_1 = torch.index_select(x, dim=0, index=slice_offsets)
             x_timestep_2 = torch.index_select(x, dim=0, index=slice_offsets2)
@@ -825,9 +888,7 @@ class VariableResolutionResamplerModel(nn.Module):
         x = fwd_mlp(x)
         return x
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
 
@@ -835,15 +896,13 @@ class VariableResolutionResamplerModel(nn.Module):
             if name not in params_dict:
                 continue
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.model_config.hf_config
 
@@ -853,7 +912,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object):
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_mm_max_tokens_per_item(
@@ -872,7 +931,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: Optional[Any],
+        image_processor: Any | None,
     ) -> tuple[ImageSize, int]:
         if image_processor is None:
             image_processor = self.get_image_processor()
@@ -891,11 +950,9 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
                 min_pixels=image_processor.min_pixels,
                 max_pixels=image_processor.max_pixels,
             )
-            preprocessed_size = ImageSize(width=resized_width,
-                                          height=resized_height)
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
-            preprocessed_size = ImageSize(width=image_width,
-                                          height=image_height)
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
 
         grid_t = max(num_frames // temporal_conv_size, 1)
         grid_h = preprocessed_size.height // patch_size
@@ -911,7 +968,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: Optional[Any],
+        image_processor: Any | None,
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
@@ -926,7 +983,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Optional[Any],
+        image_processor: Any | None,
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
@@ -988,8 +1045,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         max_videos = mm_counts.get("video", 0)
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens)
         max_frames_per_video = max_total_frames // max(max_videos, 1)
 
         return max(max_frames_per_video, 2)
@@ -1004,15 +1060,12 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_num_frames_with_most_features(
-                seq_len, mm_counts),
+            num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
             image_processor=None,
         )
 
 
-class Ernie4_5VLMultiModalProcessor(
-        BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]):
-
+class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]):
     def _pixel_values_norm(
         self,
         pixel_values: torch.Tensor,
@@ -1021,29 +1074,33 @@ class Ernie4_5VLMultiModalProcessor(
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
         image_processor = self.info.get_image_processor(**mm_kwargs)
-        image_mean_tensor = torch.tensor(image_processor.image_mean,
-                                         dtype=torch.float32).reshape(
-                                             [1, 3, 1, 1])
-        image_std_tensor = torch.tensor(image_processor.image_std,
-                                        dtype=torch.float32).reshape(
-                                            [1, 3, 1, 1])
-        rescale_factor = torch.tensor(image_processor.rescale_factor,
-                                      dtype=torch.float32)
+        image_mean_tensor = torch.tensor(
+            image_processor.image_mean, dtype=torch.float32
+        ).reshape([1, 3, 1, 1])
+        image_std_tensor = torch.tensor(
+            image_processor.image_std, dtype=torch.float32
+        ).reshape([1, 3, 1, 1])
+        rescale_factor = torch.tensor(
+            image_processor.rescale_factor, dtype=torch.float32
+        )
         patch_size_squared = vision_config.patch_size**2
 
-        image_mean_tensor = (image_mean_tensor.squeeze(
-            [-2, -1]).repeat_interleave(patch_size_squared, -1))
-        image_std_tensor = (image_std_tensor.squeeze(
-            [-2, -1]).repeat_interleave(patch_size_squared, -1))
+        image_mean_tensor = image_mean_tensor.squeeze([-2, -1]).repeat_interleave(
+            patch_size_squared, -1
+        )
+        image_std_tensor = image_std_tensor.squeeze([-2, -1]).repeat_interleave(
+            patch_size_squared, -1
+        )
 
         if not image_mean_tensor.is_contiguous():
             image_mean_tensor = image_mean_tensor.contiguous()
         if not image_std_tensor.is_contiguous():
             image_std_tensor = image_std_tensor.contiguous()
 
-        pixel_values = (rescale_factor * pixel_values.to(torch.float32) -
-                        image_mean_tensor) / image_std_tensor
-        pixel_values = pixel_values.to(hf_config.torch_dtype)
+        pixel_values = (
+            rescale_factor * pixel_values.to(torch.float32) - image_mean_tensor
+        ) / image_std_tensor
+        pixel_values = pixel_values.to(hf_config.dtype)
         return pixel_values
 
     def _call_hf_processor(
@@ -1058,8 +1115,9 @@ class Ernie4_5VLMultiModalProcessor(
         if "images" not in mm_data and "videos" not in mm_data and prompt != "":
             tokenizer = self.info.get_tokenizer()
             prompt_ids = tokenizer.encode(prompt)
-            tokenizer_output = BatchFeature(dict(input_ids=[prompt_ids]),
-                                            tensor_type="pt")
+            tokenizer_output = BatchFeature(
+                dict(input_ids=[prompt_ids]), tensor_type="pt"
+            )
             return tokenizer_output
 
         if "images" not in mm_data:
@@ -1068,38 +1126,40 @@ class Ernie4_5VLMultiModalProcessor(
             mm_data["videos"] = []
         processor_output = self.info.ctx.call_hf_processor(
             self.info.get_hf_processor(**mm_kwargs),
-            dict(text=[prompt],
-                 images=mm_data["images"],
-                 videos=mm_data["videos"]),
+            dict(text=[prompt], images=mm_data["images"], videos=mm_data["videos"]),
             dict(**mm_kwargs, **tok_kwargs),
         )
 
         # Divide the processor_output into two modalities: image and video.
         if processor_output is not None:
-            pixel_values = processor_output['images']
+            pixel_values = processor_output["images"]
             if pixel_values is not None:
-                processor_output['images'] = self._pixel_values_norm(
-                    pixel_values, mm_kwargs)
+                processor_output["images"] = self._pixel_values_norm(
+                    pixel_values, mm_kwargs
+                )
             for key in list(processor_output.keys()):
                 if processor_output[key] is None:
                     del processor_output[key]
                     continue
                 if key == "grid_thw":
-                    grid_thw = processor_output['grid_thw']
-                    pixel_values_all = processor_output['images']
+                    grid_thw = processor_output["grid_thw"]
+                    pixel_values_all = processor_output["images"]
                     # Identify elements where the first
                     # dimension is greater than 1 and
                     # treat them as the video modality
                     mask = grid_thw[:, 0] > 1
                     processor_output["video_grid_thw"] = grid_thw[mask]
                     processor_output["image_grid_thw"] = grid_thw[~mask]
-                    image_patch_num = processor_output["image_grid_thw"].prod(
-                        dim=1).sum()
-                    processor_output[
-                        'pixel_values'] = pixel_values_all[:image_patch_num]
-                    processor_output['pixel_values_videos'] = pixel_values_all[
-                        image_patch_num:]
-                    del processor_output['images']
+                    image_patch_num = (
+                        processor_output["image_grid_thw"].prod(dim=1).sum()
+                    )
+                    processor_output["pixel_values"] = pixel_values_all[
+                        :image_patch_num
+                    ]
+                    processor_output["pixel_values_videos"] = pixel_values_all[
+                        image_patch_num:
+                    ]
+                    del processor_output["images"]
 
         return processor_output
 
@@ -1113,13 +1173,13 @@ class Ernie4_5VLMultiModalProcessor(
 
         before_placeholder = {
             "image": "<|image@placeholder|>",
-            "video": "<|video@placeholder|>"
+            "video": "<|video@placeholder|>",
         }
 
         after_placeholder = {
             # image and video have same placeholder
             "image": "<|IMAGE_PLACEHOLDER|>",
-            "video": "<|IMAGE_PLACEHOLDER|>"
+            "video": "<|IMAGE_PLACEHOLDER|>",
         }
 
         merge_length = hf_processor.spatial_conv_size**2
@@ -1129,8 +1189,11 @@ class Ernie4_5VLMultiModalProcessor(
             grid_thw = out_item[f"{modality}_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)
             if modality == "video":
-                num_tokens = int(grid_thw.prod(
-                )) // hf_processor.temporal_conv_size // merge_length
+                num_tokens = (
+                    int(grid_thw.prod())
+                    // hf_processor.temporal_conv_size
+                    // merge_length
+                )
             else:
                 num_tokens = int(grid_thw.prod()) // merge_length
             return after_placeholder[modality] * num_tokens
@@ -1139,9 +1202,9 @@ class Ernie4_5VLMultiModalProcessor(
             PromptReplacement(
                 modality=modality,
                 target=before_placeholder[modality],
-                replacement=partial(get_replacement_ernie45vl,
-                                    modality=modality),
-            ) for modality in ("image", "video")
+                replacement=partial(get_replacement_ernie45vl, modality=modality),
+            )
+            for modality in ("image", "video")
         ]
 
     def _get_mm_fields_config(
@@ -1149,7 +1212,6 @@ class Ernie4_5VLMultiModalProcessor(
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-
         image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
         image_grid_sizes = image_grid_thw.prod(-1)
 
@@ -1158,62 +1220,73 @@ class Ernie4_5VLMultiModalProcessor(
 
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_grid_sizes),
+                "image", image_grid_sizes
+            ),
             image_grid_thw=MultiModalFieldConfig.batched("image"),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes),
+                "video", video_grid_sizes
+            ),
             video_grid_thw=MultiModalFieldConfig.batched("video"),
         )
 
 
-class Ernie4_5_VLDummyInputsBuilder(
-        BaseDummyInputsBuilder[Ernie4_5_VLProcessingInfo]):
-
+class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
         prompt = ""
         for i in range(num_images):
-            prompt += (f"Picture {i+1}:"
-                       "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>")
+            prompt += (
+                f"Picture {i + 1}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
+            )
 
         for i in range(num_videos):
-            prompt += (f"Video {i+1}:"
-                       "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>")
+            prompt += f"Video {i + 1}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
         return prompt
 
     def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-            self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(width=target_width,
-                                   height=target_height,
-                                   num_frames=target_num_frames,
-                                   num_videos=num_videos)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=target_num_frames,
+                num_videos=num_videos,
+                overrides=video_overrides,
+            ),
         }
 
 
 @MULTIMODAL_REGISTRY.register_processor(
     Ernie4_5VLMultiModalProcessor,
     info=Ernie4_5_VLProcessingInfo,
-    dummy_inputs=Ernie4_5_VLDummyInputsBuilder)
-class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                             SupportsLoRA, SupportsPP):
+    dummy_inputs=Ernie4_5_VLDummyInputsBuilder,
+)
+class Ernie4_5_VLMoeForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "qkv_proj": [
@@ -1244,10 +1317,11 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
             "temporal_linear.0.": "temporal_linear1.",
             "temporal_linear.2.": "temporal_linear2.",
             "temporal_linear.3.": "temporal_norm.",
-        })
+        },
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
         if modality.startswith("video"):
@@ -1264,11 +1338,17 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.config = config
         self.multimodal_config = multimodal_config
 
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
         self.vision_model = Ernie4_5_VisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "vision_model"),
+            attn_backend_override=attn_backend_override,
         )
 
         self.language_model = Ernie4_5_VLMoeForCausalLM(
@@ -1282,16 +1362,35 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
             self.config.spatial_conv_size,
             self.config.temporal_conv_size,
             config=self.config,
-            prefix=maybe_prefix(prefix, "resampler_model"))
+            prefix=maybe_prefix(prefix, "resampler_model"),
+        )
 
         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
+        if getattr(self.config, "im_patch_id", None):
+            visual_token_ids = [
+                token_id
+                for token_id in [
+                    self.config.im_patch_id,
+                    getattr(self.config, "image_start_token_id", None),
+                    getattr(self.config, "image_end_token_id", None),
+                    getattr(self.config, "video_start_token_id", None),
+                    getattr(self.config, "video_end_token_id", None),
+                ]
+                if token_id is not None
+            ]
+            self._visual_token_ids_tensor_cache = torch.tensor(
+                visual_token_ids, dtype=torch.long
+            )
+        else:
+            self._visual_token_ids_tensor_cache = None
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """compute logits"""
         return self.language_model.compute_logits(hidden_states)
 
@@ -1305,7 +1404,8 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
             if grid_thw.numel() % 3 != 0:
                 raise ValueError(
                     f"grid_thw has {grid_thw.numel()} elements after filtering,"
-                    "which is not divisible by 3.")
+                    "which is not divisible by 3."
+                )
             grid_thw = grid_thw.reshape(-1, 3)
             # example: [[1,64,64],[2,80,80]] -> [[1,64,64],[1,80,80],[1,80,80]]
             grid_thw = F.pad(
@@ -1317,33 +1417,157 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         return image_features
 
     def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
-        if getattr(self.config, "im_patch_id", None) is not None:
-            self.visual_token_mask = (
-                input_ids == self.config.im_patch_id).reshape(-1, 1)
-        else:
+        """Set mask for visual tokens (image/video patches and delimiters)."""
+        if self._visual_token_ids_tensor_cache is None:
             self.visual_token_mask = None
+            return
+        # Create tensor on the correct device
+        visual_token_ids_tensor = self._visual_token_ids_tensor_cache.to(
+            device=input_ids.device,
+            dtype=input_ids.dtype,
+        )
 
-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
+        self.visual_token_mask = torch.isin(input_ids, visual_token_ids_tensor).reshape(
+            -1, 1
+        )
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        hf_config = self.config
+        image_token_id = hf_config.im_patch_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_conv_size = hf_config.spatial_conv_size
+        temporal_conv_size = hf_config.temporal_conv_size
+        llm_pos_ids_list: list = []
+
+        if image_grid_thw or video_grid_thw:
+            input_token_type: list[str] = []
+            video_check_flg = False
+            for token in input_tokens:
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if (token == image_token_id) and (video_check_flg is False):
+                    input_token_type.append("image")
+                elif (token == image_token_id) and (video_check_flg is True):
+                    input_token_type.append("video")
+                else:
+                    input_token_type.append("text")
+
+            input_type_group: list[tuple[str, int, int]] = []
+            for key, group_iter in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group_list = list(group_iter)
+                start_index = group_list[0][0]
+                end_index = group_list[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            video_frame_num = 1
+            mm_data_idx = 0
+            for modality_type, start_idx, end_idx in input_type_group:
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                if modality_type == "image":
+                    t, h, w = image_grid_thw[mm_data_idx]
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_conv_size,
+                        w // spatial_conv_size,
+                    )
+
+                    t_index = (
+                        torch.arange(llm_grid_t)
+                        .view(-1, 1)
+                        .expand(-1, llm_grid_h * llm_grid_w)
+                        .flatten()
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    mm_data_idx += 1
+
+                elif modality_type == "video":
+                    t, h, w = video_grid_thw[mm_data_idx]
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t // temporal_conv_size,
+                        h // spatial_conv_size,
+                        w // spatial_conv_size,
+                    )
+
+                    for t_idx in range(llm_grid_t):
+                        t_index = (
+                            torch.tensor(t_idx)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(1, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                    mm_data_idx += 1
+                    video_frame_num += 1
+
+                else:
+                    text_len = end_idx - start_idx
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    video_frame_num = 1
 
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return mm_input.reshape(-1, mm_input.shape[-1])
         else:
-            return torch.concat(mm_input)
+            text_len = len(input_tokens)
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return llm_positions, mrope_position_delta
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Ernie4_5_VLImageInputs]:
+        self, **kwargs: object
+    ) -> Ernie4_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
 
@@ -1351,21 +1575,15 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            return Ernie4_5_VLImagePixelInputs(type="pixel_values",
-                                               pixel_values=pixel_values,
-                                               image_grid_thw=image_grid_thw)
+            return Ernie4_5_VLImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[Ernie4_5_VLVideoInputs]:
+        self, **kwargs: object
+    ) -> Ernie4_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
 
@@ -1373,11 +1591,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos, "video pixel values")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Ernie4_5_VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -1385,16 +1598,15 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
     def _process_image_input(
-            self,
-            image_input: Ernie4_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
-
+        self, image_input: Ernie4_5_VLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
 
-        pixel_values = image_input["pixel_values"].type(
-            self.vision_model.dtype)
-        image_features = self._vision_forward(pixel_values=pixel_values,
-                                              grid_thw=grid_thw)
+        pixel_values = image_input["pixel_values"].type(self.vision_model.dtype)
+        image_features = self._vision_forward(
+            pixel_values=pixel_values, grid_thw=grid_thw
+        )
         image_embeds = self.resampler_model(image_features, grid_thw)
 
         merge_size = self.vision_model.spatial_merge_size
@@ -1403,21 +1615,25 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         return image_embeds.split(sizes.tolist())
 
     def _process_video_input(
-            self,
-            video_input: Ernie4_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
-
+        self, video_input: Ernie4_5_VLVideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
 
         pixel_values_videos = video_input["pixel_values_videos"].type(
-            self.vision_model.dtype)
-        video_features = self._vision_forward(pixel_values=pixel_values_videos,
-                                              grid_thw=grid_thw)
+            self.vision_model.dtype
+        )
+        video_features = self._vision_forward(
+            pixel_values=pixel_values_videos, grid_thw=grid_thw
+        )
         video_embeds = self.resampler_model(video_features, grid_thw)
 
         merge_size = self.vision_model.spatial_merge_size
-        sizes = (grid_thw.prod(-1) //
-                 self.config.temporal_conv_size) // merge_size // merge_size
+        sizes = (
+            (grid_thw.prod(-1) // self.config.temporal_conv_size)
+            // merge_size
+            // merge_size
+        )
 
         return video_embeds.split(sizes.tolist())
 
@@ -1427,20 +1643,20 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("pixel_values_videos",
-                             "video_embeds") and "videos" not in modalities:
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "videos" not in modalities
+            ):
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return None
@@ -1454,41 +1670,45 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is None:
-            return inputs_embeds
-
-        self._set_visual_token_mask(input_ids)
-        inputs_embeds = merge_multimodal_embeddings(input_ids, inputs_embeds,
-                                                    multimodal_embeddings,
-                                                    [self.config.im_patch_id])
-        return inputs_embeds
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
+            self._set_visual_token_mask(input_ids)
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
-
         forward_kwargs = {
             "input_ids": input_ids,
             "positions": positions,
@@ -1497,20 +1717,17 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         }
 
         if self.visual_token_mask is not None:
-
             if self.visual_token_mask.shape[0] != inputs_embeds.shape[0]:
-                padding_len = inputs_embeds.shape[
-                    0] - self.visual_token_mask.shape[0]
+                padding_len = inputs_embeds.shape[0] - self.visual_token_mask.shape[0]
                 # right pad False
                 pad = torch.zeros(
                     (padding_len, self.visual_token_mask.shape[1]),
                     dtype=self.visual_token_mask.dtype,
-                    device=self.visual_token_mask.device)
-                self.visual_token_mask = torch.cat(
-                    [self.visual_token_mask, pad], dim=0)
+                    device=self.visual_token_mask.device,
+                )
+                self.visual_token_mask = torch.cat([self.visual_token_mask, pad], dim=0)
 
-            forward_kwargs.update(
-                {"visual_token_mask": self.visual_token_mask})
+            forward_kwargs.update({"visual_token_mask": self.visual_token_mask})
             self.visual_token_mask = None
 
         hidden_states = self.language_model.model(
@@ -1520,8 +1737,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index f55016f7ccb36bf6fa3967d8c84103ae7f50221d..e8ef86f9b7f011abd4295df824ffac03358d53e3 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -22,64 +22,84 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Erine VL model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention
+
 # from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import (
-    Ernie4_5_VLRotaryEmbedding)
+    Ernie4_5_VLRotaryEmbedding,
+)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .ernie45_moe import Ernie4_5_MoeMLP
 from .interfaces import SupportsPP
-from .utils import (PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Ernie4_5_VLMoeMLP(Ernie4_5_MoeMLP):
-    pass
+    def __init__(self, shared_experts: torch.nn.Module | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self.shared_experts = shared_experts
 
+    def forward(self, x):
+        if self.shared_experts is not None:
+            return self.shared_experts(x) + super().forward(x)
+        else:
+            return super().forward(x)
 
-class Ernie4_5_VLMoeAttention(nn.Module):
 
+class Ernie4_5_VLMoeAttention(nn.Module):
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rope_theta: float = 500000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         freq_allocation: int = 20,
         max_position_embeddings: int = 131072,
         rms_norm_eps: float = 1e-05,
         qkv_bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -109,19 +129,23 @@ class Ernie4_5_VLMoeAttention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
-        self.qkv_proj = QKVParallelLinear(hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=qkv_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         t_rope = freq_allocation
         h_rope = (self.head_dim // 2 - freq_allocation) // 2
@@ -134,22 +158,24 @@ class Ernie4_5_VLMoeAttention(nn.Module):
             base=rope_theta,
             is_neox_style=False,
             dtype=torch.get_default_dtype(),
-            mrope_section=[h_rope, w_rope, t_rope])
+            mrope_section=[h_rope, w_rope, t_rope],
+        )
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
-
         qkv, _ = self.qkv_proj(hidden_states)
 
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -163,11 +189,10 @@ class Ernie4_5_VLMoeAttention(nn.Module):
 
 
 class Ernie4_5_VLMoeMoE(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -175,8 +200,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
         layer_idx = extract_layer_index(prefix)
         self.layer_idx = layer_idx
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.has_shared_experts = (getattr(config, "moe_num_shared_experts", 0)
-                                   > 0)
+        self.has_shared_experts = getattr(config, "moe_num_shared_experts", 0) > 0
         self.hidden_size = config.hidden_size
 
         moe_num_experts = config.moe_num_experts
@@ -185,34 +209,58 @@ class Ernie4_5_VLMoeMoE(nn.Module):
         if self.tp_size > max_moe_num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {moe_num_experts}.")
+                f"the number of experts {moe_num_experts}."
+            )
 
         moe_layer_start_index = config.moe_layer_start_index
         text_moe_layer_start_index = moe_layer_start_index[0]
         vision_moe_layer_start_index = moe_layer_start_index[1]
         moe_layer_end_index = config.moe_layer_end_index
         moe_layer_end_index = getattr(
-            config, "moe_layer_end_index",
-            [config.num_hidden_layers - 1, config.num_hidden_layers - 1])
+            config,
+            "moe_layer_end_index",
+            [config.num_hidden_layers - 1, config.num_hidden_layers - 1],
+        )
         text_moe_layer_end_index = moe_layer_end_index[0]
         vision_moe_layer_end_index = moe_layer_end_index[1]
 
         assert config.moe_num_experts[0] == config.moe_num_experts[1]
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty(2, config.moe_num_experts[0]))
+            torch.empty(2, config.moe_num_experts[0], dtype=torch.float32)
+        )
 
         assert text_moe_layer_start_index <= text_moe_layer_end_index
 
-        if layer_idx >= text_moe_layer_start_index and \
-            layer_idx <= text_moe_layer_end_index:
+        if self.has_shared_experts:
+            intermediate_size = (
+                config.moe_intermediate_size[0] * config.moe_num_shared_experts
+            )
+            self.shared_experts = Ernie4_5_VLMoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.shared_experts",
+                reduce_results=False,
+            )
+        else:
+            self.shared_experts = None
+
+        if (
+            layer_idx >= text_moe_layer_start_index
+            and layer_idx <= text_moe_layer_end_index
+        ):
             self.text_experts_gate = ReplicatedLinear(
                 config.hidden_size,
                 config.moe_num_experts[0],
                 bias=False,
+                params_dtype=torch.float32,
                 quant_config=quant_config,
-                prefix=f"{prefix}.text_experts_gate")
+                prefix=f"{prefix}.text_experts_gate",
+            )
 
-            self.text_experts = FusedMoE(
+            self.text_experts = SharedFusedMoE(
+                shared_experts=self.shared_experts,
                 num_experts=config.moe_num_experts[0],
                 top_k=config.moe_k,
                 hidden_size=config.hidden_size,
@@ -221,27 +269,35 @@ class Ernie4_5_VLMoeMoE(nn.Module):
                 renormalize=True,
                 quant_config=quant_config,
                 e_score_correction_bias=self.e_score_correction_bias[0],
-                prefix=f"{prefix}.text_experts")
+                prefix=f"{prefix}.text_experts",
+            )
         else:
             self.text_experts = Ernie4_5_VLMoeMLP(
+                shared_experts=self.shared_experts,
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
-                use_bias=getattr(config, 'use_bias', False),
+                use_bias=getattr(config, "use_bias", False),
                 quant_config=quant_config,
-                prefix=f"{prefix}.mlp")
+                prefix=f"{prefix}.mlp",
+            )
 
         assert vision_moe_layer_start_index <= vision_moe_layer_end_index
-        if layer_idx >= vision_moe_layer_start_index and \
-            layer_idx <= vision_moe_layer_end_index:
+        if (
+            layer_idx >= vision_moe_layer_start_index
+            and layer_idx <= vision_moe_layer_end_index
+        ):
             self.vision_experts_gate = ReplicatedLinear(
                 config.hidden_size,
                 config.moe_num_experts[1],
                 bias=False,
+                params_dtype=torch.float32,
                 quant_config=quant_config,
-                prefix=f"{prefix}.vision_experts_gate")
+                prefix=f"{prefix}.vision_experts_gate",
+            )
 
-            self.vision_experts = FusedMoE(
+            self.vision_experts = SharedFusedMoE(
+                shared_experts=self.shared_experts,
                 num_experts=config.moe_num_experts[1],
                 top_k=config.moe_k,
                 hidden_size=config.hidden_size,
@@ -250,27 +306,18 @@ class Ernie4_5_VLMoeMoE(nn.Module):
                 renormalize=True,
                 quant_config=quant_config,
                 e_score_correction_bias=self.e_score_correction_bias[1],
-                prefix=f"{prefix}.vision_experts")
+                prefix=f"{prefix}.vision_experts",
+            )
         else:
             self.vision_experts = Ernie4_5_VLMoeMLP(
+                shared_experts=self.shared_experts,
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
-                use_bias=getattr(config, 'use_bias', False),
-                quant_config=quant_config,
-                prefix=f"{prefix}.mlp")
-
-        if self.has_shared_experts:
-            intermediate_size = (config.moe_intermediate_size[0] *
-                                 config.moe_num_shared_experts)
-            self.shared_experts = Ernie4_5_VLMoeMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=intermediate_size,
-                hidden_act=config.hidden_act,
+                use_bias=getattr(config, "use_bias", False),
                 quant_config=quant_config,
-                prefix=f"{prefix}.shared_experts",
-                reduce_results=self.text_experts.
-                must_reduce_shared_expert_outputs())
+                prefix=f"{prefix}.mlp",
+            )
 
     def forward(
         self,
@@ -278,67 +325,90 @@ class Ernie4_5_VLMoeMoE(nn.Module):
         visual_token_mask: torch.Tensor,
         **kwargs: object,
     ) -> torch.Tensor:
-
         orig_shape = hidden_states.shape
         hidden_dim = hidden_states.shape[-1]
         hidden_states = hidden_states.view(-1, hidden_dim)
 
-        if self.has_shared_experts:
-            shared_output = self.shared_experts(hidden_states)
-
         if visual_token_mask is not None and visual_token_mask.all():
             # only vision modal input
-            router_logits, _ = self.vision_experts_gate(hidden_states)
+            router_logits, _ = self.vision_experts_gate(
+                hidden_states.to(dtype=torch.float32)
+            )
             final_hidden_states = self.vision_experts(
-                hidden_states=hidden_states, router_logits=router_logits)
+                hidden_states=hidden_states, router_logits=router_logits
+            )
         elif visual_token_mask is not None and visual_token_mask.any():
             # text and vision modals input
-            visual_token_mask = visual_token_mask.repeat(
-                1, self.hidden_size).bool()
+            visual_token_mask = visual_token_mask.repeat(1, self.hidden_size).bool()
             text_token_mask = ~visual_token_mask
-            final_hidden_states = torch.zeros_like(hidden_states)
+            final_experts_hidden_states = torch.zeros_like(hidden_states)
+            final_shared_ouput = (
+                torch.zeros_like(hidden_states) if self.has_shared_experts else None
+            )
 
             text_hidden_states = hidden_states[text_token_mask].reshape(
-                -1, self.hidden_size)
+                -1, self.hidden_size
+            )
             vision_hidden_states = hidden_states[visual_token_mask].reshape(
-                -1, self.hidden_size)
-
-            text_router_logits, _ = self.text_experts_gate(text_hidden_states)
-            final_hidden_states[text_token_mask] = self.text_experts(
-                hidden_states=text_hidden_states,
-                router_logits=text_router_logits).flatten()
+                -1, self.hidden_size
+            )
+
+            text_router_logits, _ = self.text_experts_gate(
+                text_hidden_states.to(dtype=torch.float32)
+            )
+            text_shared_ouput, text_experts_output = self.text_experts(
+                hidden_states=text_hidden_states, router_logits=text_router_logits
+            )
+            final_experts_hidden_states[text_token_mask] = text_experts_output.flatten()
+            if self.has_shared_experts:
+                final_shared_ouput[text_token_mask] = text_shared_ouput.flatten()
 
             vision_router_logits, _ = self.vision_experts_gate(
-                vision_hidden_states)
-            final_hidden_states[visual_token_mask] = self.vision_experts(
-                hidden_states=vision_hidden_states,
-                router_logits=vision_router_logits).flatten()
+                vision_hidden_states.to(dtype=torch.float32)
+            )
+            vision_shared_ouput, vision_experts_output = self.vision_experts(
+                hidden_states=vision_hidden_states, router_logits=vision_router_logits
+            )
+            final_experts_hidden_states[visual_token_mask] = (
+                vision_experts_output.flatten()
+            )
+            if self.has_shared_experts:
+                final_shared_ouput[visual_token_mask] = vision_shared_ouput.flatten()
+
+            final_hidden_states = (final_shared_ouput, final_experts_hidden_states)
         else:
             # only text modal input
-            text_router_logits, _ = self.text_experts_gate(hidden_states)
+            text_router_logits, _ = self.text_experts_gate(
+                hidden_states.to(dtype=torch.float32)
+            )
 
             final_hidden_states = self.text_experts(
-                hidden_states=hidden_states, router_logits=text_router_logits)
+                hidden_states=hidden_states, router_logits=text_router_logits
+            )
 
-        if self.has_shared_experts and \
-              shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+        if self.has_shared_experts:
+            # for shared_experts model
+            final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
+        else:
+            # for not shared_experts model
+            final_hidden_states = final_hidden_states[1]
 
         if self.tp_size > 1:
             final_hidden_states = (
                 self.text_experts.maybe_all_reduce_tensor_model_parallel(
-                    final_hidden_states))
+                    final_hidden_states
+                )
+            )
 
         return final_hidden_states.view(orig_shape)
 
 
 class Ernie4_5_VLMoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -346,20 +416,19 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 500000)
         rope_scaling = getattr(config, "rope_scaling", None)
         freq_allocation = getattr(config, "freq_allocation", 20)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          131072)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
 
         self.self_attn = Ernie4_5_VLMoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
-            head_dim=getattr(config, 'head_dim', None),
+            head_dim=getattr(config, "head_dim", None),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             freq_allocation=freq_allocation,
             max_position_embeddings=max_position_embeddings,
             rms_norm_eps=config.rms_norm_eps,
-            qkv_bias=getattr(config, 'use_bias', False),
+            qkv_bias=getattr(config, "use_bias", False),
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
@@ -372,8 +441,10 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
         moe_layer_start_index = config.moe_layer_start_index
         min_moe_layer_start_index = min(moe_layer_start_index)
         moe_layer_end_index = getattr(
-            config, "moe_layer_end_index",
-            [config.num_hidden_layers - 1, config.num_hidden_layers - 1])
+            config,
+            "moe_layer_end_index",
+            [config.num_hidden_layers - 1, config.num_hidden_layers - 1],
+        )
         max_moe_layer_end_index = max(moe_layer_end_index)
         assert min_moe_layer_start_index <= max_moe_layer_end_index
         moe_num_experts = config.moe_num_experts
@@ -381,42 +452,44 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
         moe_layer_interval = getattr(config, "moe_layer_interval", 1)
         use_moe = getattr(config, "use_moe", max_moe_num_experts > 0)
 
-        if (use_moe and ((layer_idx + 1) % moe_layer_interval == 0)
-                and layer_idx >= min_moe_layer_start_index
-                and layer_idx <= max_moe_layer_end_index):
-            self.mlp = Ernie4_5_VLMoeMoE(config=config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.mlp")
+        if (
+            use_moe
+            and ((layer_idx + 1) % moe_layer_interval == 0)
+            and layer_idx >= min_moe_layer_start_index
+            and layer_idx <= max_moe_layer_end_index
+        ):
+            self.mlp = Ernie4_5_VLMoeMoE(
+                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+            )
         else:
             self.mlp = Ernie4_5_VLMoeMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
-                use_bias=getattr(config, 'use_bias', False),
+                use_bias=getattr(config, "use_bias", False),
                 quant_config=quant_config,
-                prefix=f"{prefix}.mlp")
+                prefix=f"{prefix}.mlp",
+            )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-        visual_token_mask: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
+        visual_token_mask: torch.Tensor | None,
         **kwargs: object,
     ) -> torch.Tensor:
-
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.self_attn(
             positions=positions,
@@ -424,12 +497,10 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
         if isinstance(self.mlp, Ernie4_5_VLMoeMoE):
-            hidden_states = self.mlp(hidden_states, visual_token_mask,
-                                     **kwargs)
+            hidden_states = self.mlp(hidden_states, visual_token_mask, **kwargs)
         else:
             hidden_states = self.mlp(hidden_states)
 
@@ -447,7 +518,6 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
 #         "visual_token_mask": 0,
 #     })
 class Ernie4_5_VLMoeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -466,7 +536,8 @@ class Ernie4_5_VLMoeModel(nn.Module):
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.embed_tokens")
+                prefix=f"{prefix}.embed_tokens",
+            )
         else:
             self.embed_tokens = PPMissingLayer()
 
@@ -476,7 +547,8 @@ class Ernie4_5_VLMoeModel(nn.Module):
                 config=config,
                 cache_config=cache_config,
                 quant_config=quant_config,
-                prefix=prefix),
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
 
@@ -485,28 +557,27 @@ class Ernie4_5_VLMoeModel(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        visual_token_mask: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        visual_token_mask: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -514,14 +585,14 @@ class Ernie4_5_VLMoeModel(nn.Module):
             residual = intermediate_tensors["residual"]
 
         for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states, residual = layer(positions, hidden_states, residual,
-                                            visual_token_mask, **kwargs)
+            hidden_states, residual = layer(
+                positions, hidden_states, residual, visual_token_mask, **kwargs
+            )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
 
@@ -550,15 +621,17 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Ernie4_5_VLMoeModel(vllm_config=vllm_config,
-                                         prefix=maybe_prefix(prefix, "model"))
+        self.model = Ernie4_5_VLMoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         else:
             self.lm_head = PPMissingLayer()
 
@@ -566,32 +639,33 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds, **kwargs)
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -603,36 +677,35 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=max(self.config.moe_num_experts))
+            num_experts=max(self.config.moe_num_experts),
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if self.config.tie_word_embeddings and name.endswith(
-                    "lm_head.weight"):
+            if self.config.tie_word_embeddings and name.endswith("lm_head.weight"):
                 loaded_params.add("lm_head.weight")
                 continue
             # MTP will be supported soon.
-            if "mtp" in name or \
-               "vision_model" in name or \
-               "resampler_model" in name:
+            if "mtp" in name or "vision_model" in name or "resampler_model" in name:
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
 
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -647,14 +720,13 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
                 if "mlp.experts" in name:
                     moe_offset = int(name.split(".")[-3])
                     vision_expert_start_idx = self.config.moe_num_experts[0]
-                    is_text_expert = \
-                        moe_offset <= vision_expert_start_idx - 1
+                    is_text_expert = moe_offset <= vision_expert_start_idx - 1
                     if is_text_expert:
                         name = name.replace(".experts.", ".text_experts.")
                     else:
                         name = name.replace(
                             f".experts.{moe_offset}",
-                            f".vision_experts.{moe_offset-vision_expert_start_idx}"
+                            f".vision_experts.{moe_offset - vision_expert_start_idx}",
                         )
 
                 for mapping in expert_params_mapping:
@@ -665,8 +737,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
 
                     # Distinguish between vision experts and text experts
                     moe_offset = int(name.split(".")[-3])
-                    is_text_expert = \
-                        moe_offset <= self.config.moe_num_experts[0] - 1
+                    is_text_expert = moe_offset <= self.config.moe_num_experts[0] - 1
 
                     name = name.replace(weight_name, param_name)
                     if is_text_expert:
@@ -679,36 +750,40 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
                         continue
 
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
 
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Distinguish between vision expert gate
                     # and text expert gate
                     if name.endswith("mlp.gate.weight"):
-                        name = name.replace("gate.weight",
-                                            "text_experts_gate.weight")
+                        name = name.replace("gate.weight", "text_experts_gate.weight")
                         loaded_weight = loaded_weight.T
                     elif name.endswith("mlp.gate.weight_1"):
-                        name = name.replace("gate.weight_1",
-                                            "vision_experts_gate.weight")
+                        name = name.replace(
+                            "gate.weight_1", "vision_experts_gate.weight"
+                        )
                         loaded_weight = loaded_weight.T
 
                     if "e_score_correction_bias" in name:
                         name = name.replace(".moe_statics.", ".")
 
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -720,8 +795,9 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
 
                     param = params_dict[name]
 
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py
index 4c9fb64ec74d472e220b0cfc61f0e4cec5e86ea5..1b9abc3572a3b60b2afce236c42683a0593e2917 100644
--- a/vllm/model_executor/models/ernie_mtp.py
+++ b/vllm/model_executor/models/ernie_mtp.py
@@ -22,8 +22,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Ernie-MTP model."""
+
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -33,7 +33,9 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
@@ -43,7 +45,6 @@ from .utils import is_pp_missing_parameter, maybe_prefix
 
 
 class ErnieMultiTokenPredictorLayer(nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -52,13 +53,11 @@ class ErnieMultiTokenPredictorLayer(nn.Module):
         super().__init__()
         config = vllm_config.model_config.hf_config
 
-        self.mtp_emb_norm = RMSNorm(config.hidden_size,
-                                    eps=config.rms_norm_eps)
-        self.mtp_hidden_norm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.mtp_linear_proj = nn.Linear(config.hidden_size * 2,
-                                         config.hidden_size,
-                                         bias=False)
+        self.mtp_emb_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mtp_hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mtp_linear_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=False
+        )
         self.mtp_block = LlamaDecoderLayer(vllm_config, prefix)
 
     def forward(
@@ -76,18 +75,18 @@ class ErnieMultiTokenPredictorLayer(nn.Module):
         previous_hidden_states = self.mtp_hidden_norm(previous_hidden_states)
 
         hidden_states = self.mtp_linear_proj(
-            torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
+            torch.cat([inputs_embeds, previous_hidden_states], dim=-1)
+        )
 
-        hidden_states, residual = self.mtp_block(positions=positions,
-                                                 hidden_states=hidden_states,
-                                                 residual=None)
+        hidden_states, residual = self.mtp_block(
+            positions=positions, hidden_states=hidden_states, residual=None
+        )
         hidden_states = residual + hidden_states
 
         return hidden_states
 
 
 class ErnieMultiTokenPredictor(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -95,27 +94,33 @@ class ErnieMultiTokenPredictor(nn.Module):
         self.mtp_start_layer_idx = config.num_hidden_layers
         self.num_mtp_layers = config.num_nextn_predict_layers
         # to map the exact layer index from weights
-        self.layers = torch.nn.ModuleDict({
-            str(idx):
-            ErnieMultiTokenPredictorLayer(
-                vllm_config,
-                f"{prefix}.layers.{idx}",
-            )
-            for idx in range(self.mtp_start_layer_idx,
-                             self.mtp_start_layer_idx + self.num_mtp_layers)
-        })
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): ErnieMultiTokenPredictorLayer(
+                    vllm_config,
+                    f"{prefix}.layers.{idx}",
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
@@ -139,45 +144,48 @@ class ErnieMultiTokenPredictor(nn.Module):
 
 
 class ErnieMTP(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         self.config = vllm_config.model_config.hf_config
-        self.model = ErnieMultiTokenPredictor(vllm_config=vllm_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "model"))
-        self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                      self.config.hidden_size,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.model = ErnieMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
 
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         assert spec_step_idx == 0, "ernie_mtp only support predict one token"
-        hidden_states = self.model(input_ids, positions, hidden_states,
-                                   inputs_embeds, spec_step_idx)
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
-        return self.model.compute_logits(hidden_states, self.lm_head,
-                                         spec_step_idx)
+    ) -> torch.Tensor | None:
+        return self.model.compute_logits(hidden_states, self.lm_head, spec_step_idx)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -189,16 +197,14 @@ class ErnieMTP(nn.Module, SupportsPP):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-
-            if self.config.tie_word_embeddings and name.endswith(
-                    "lm_head.weight"):
+            if self.config.tie_word_embeddings and name.endswith("lm_head.weight"):
                 continue
             if "rotary_emb.inv_freq" in name:
                 continue
             if "mtp" in name:
                 name = self._rewrite_spec_layer_name(self.config, name)
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -210,12 +216,13 @@ class ErnieMTP(nn.Module, SupportsPP):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -227,8 +234,9 @@ class ErnieMTP(nn.Module, SupportsPP):
                 break
             else:
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -236,33 +244,36 @@ class ErnieMTP(nn.Module, SupportsPP):
 
                 # According to DeepSeek-V3 Technical Report, MTP modules
                 # shares embedding layer. We only load the first weights.
-                if "mtp_" not in name and ("embed_tokens" not in name
-                                           and "lm_head" not in name):
+                if "mtp_" not in name and (
+                    "embed_tokens" not in name and "lm_head" not in name
+                ):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
-    def _rewrite_spec_layer_name(self, config: PretrainedConfig,
-                                 name: str) -> str:
+    def _rewrite_spec_layer_name(self, config: PretrainedConfig, name: str) -> str:
         """
         Rewrite the weight name to match the format of the original model.
         """
         spec_layer_weight_names = [
-            "embed_tokens", "mtp_emb_norm", "mtp_hidden_norm",
-            "mtp_linear_proj"
+            "embed_tokens",
+            "mtp_emb_norm",
+            "mtp_hidden_norm",
+            "mtp_linear_proj",
         ]
         layer_idx = config.num_hidden_layers
         for weight_name in spec_layer_weight_names:
             if weight_name in name:
                 name = name.replace(
                     f"model.{weight_name}.0.",
-                    f"model.layers.{layer_idx}.{weight_name}.")
+                    f"model.layers.{layer_idx}.{weight_name}.",
+                )
                 return name
-        name = name.replace("model.mtp_block.0.",
-                            f"model.layers.{layer_idx}.mtp_block.")
+        name = name.replace(
+            "model.mtp_block.0.", f"model.layers.{layer_idx}.mtp_block."
+        )
         return name
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 5dafcd595e4a5ec995e2c13b49da9d04c322744a..6c56bfc433c7a4be45686114ac66a0cd09634e05 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -27,7 +27,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -39,32 +39,42 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class ExaoneGatedMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -84,8 +94,9 @@ class ExaoneGatedMLP(nn.Module):
             prefix=f"{prefix}.c_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -96,7 +107,6 @@ class ExaoneGatedMLP(nn.Module):
 
 
 class ExaoneAttention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -104,11 +114,11 @@ class ExaoneAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -191,7 +201,6 @@ class ExaoneAttention(nn.Module):
 
 
 class ExaoneBlockAttention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -199,11 +208,11 @@ class ExaoneBlockAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -233,12 +242,11 @@ class ExaoneBlockAttention(nn.Module):
 
 
 class ExaoneDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -246,21 +254,24 @@ class ExaoneDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         self.attn = ExaoneBlockAttention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -284,7 +295,7 @@ class ExaoneDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -305,27 +316,24 @@ class ExaoneDecoderLayer(nn.Module):
 
 @support_torch_compile
 class ExaoneModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
+
+        self.vocab_size = config.vocab_size
         self.wte = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.wte = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -341,30 +349,29 @@ class ExaoneModel(nn.Module):
             prefix=f"{prefix}.h",
         )
         if get_pp_group().is_last_rank:
-            self.ln_f = RMSNorm(config.hidden_size,
-                                eps=config.layer_norm_epsilon)
+            self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
         else:
             self.ln_f = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -379,16 +386,14 @@ class ExaoneModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.ln_f(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -402,19 +407,19 @@ class ExaoneModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -447,8 +452,7 @@ class ExaoneModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -478,10 +482,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
+
         self.quant_config = quant_config
 
         self.transformer = ExaoneModel(
@@ -489,17 +492,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             prefix=maybe_prefix(prefix, "model"),
         )
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -507,44 +502,44 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 self.lm_head.weight = self.transformer.wte.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.transformer(input_ids, positions,
-                                        intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
             # With tie_word_embeddings, we can skip lm_head.weight
             # The weight might appear unnecessarily in the files if the model is
             # processed with quantization, LoRA, fine-tuning, etc.
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index c78eedff6670013bb75fe2426c87bda141c6073f..b89e168ada20e3e2bd03514e28de6318dbf6f2c8 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -23,7 +23,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -35,33 +35,43 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class Exaone4GatedMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -81,8 +91,9 @@ class Exaone4GatedMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -93,7 +104,6 @@ class Exaone4GatedMLP(nn.Module):
 
 
 class Exaone4Attention(nn.Module):
-
     def __init__(
         self,
         config: Exaone4Config,
@@ -101,11 +111,11 @@ class Exaone4Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 1000000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -208,12 +218,11 @@ class Exaone4Attention(nn.Module):
 
 
 class Exaone4DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Exaone4Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -221,22 +230,25 @@ class Exaone4DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 1000000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
 
         self.self_attn = Exaone4Attention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -253,16 +265,18 @@ class Exaone4DecoderLayer(nn.Module):
             bias=getattr(config, "mlp_bias", False),
             prefix=f"{prefix}.mlp",
         )
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = RMSNorm(config.hidden_size,
-                                                  eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         residual = hidden_states
 
@@ -290,26 +304,23 @@ class Exaone4DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Exaone4Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+
+        self.vocab_size = config.vocab_size
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -329,25 +340,25 @@ class Exaone4Model(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -362,16 +373,14 @@ class Exaone4Model(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -385,19 +394,19 @@ class Exaone4Model(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -430,8 +439,7 @@ class Exaone4Model(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -461,10 +469,8 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
         self.quant_config = quant_config
 
         self.model = Exaone4Model(
@@ -472,17 +478,9 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             prefix=maybe_prefix(prefix, "model"),
         )
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -490,44 +488,44 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 self.lm_head.weight = self.model.embed_tokens.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
             # With tie_word_embeddings, we can skip lm_head.weight
             # The weight might appear unnecessarily in the files if the model is
             # processed with quantization, LoRA, fine-tuning, etc.
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py
index d78ee100b26df8cecbbeed7acba1decd18289688..ca0e7e64df53d78315a761159297c53e3e6c34ed 100644
--- a/vllm/model_executor/models/fairseq2_llama.py
+++ b/vllm/model_executor/models/fairseq2_llama.py
@@ -23,8 +23,10 @@ import torch
 from torch.nn import Parameter
 
 from vllm.config import VllmConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.linear import set_weight_attrs
 from vllm.model_executor.models.llama import LlamaForCausalLM
 
@@ -32,7 +34,6 @@ from .utils import AutoWeightsLoader, WeightsMapper
 
 
 class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         self.tp_rank = get_tensor_model_parallel_rank()
@@ -45,14 +46,12 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
             f"model.{self.tp_rank}.pt",
         ]
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         # fairseq2's serialization adds a wrapper to usual .pt state_dict's:
         # { "model_key": my_model_name, "my_model_name": state_dict }
         # which we first need to unpack
         weights_wrapped = dict(weights)
-        weights = weights_wrapped[
-            weights_wrapped["model_key"]].items()  # type: ignore
+        weights = weights_wrapped[weights_wrapped["model_key"]].items()  # type: ignore
 
         # remap keys
         fs2_to_vllm_mapper = WeightsMapper(
@@ -77,12 +76,14 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
 
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(
-            (self.reshape_fairseq2_weights(name, loaded_weight, params)
-             for name, loaded_weight in weights))
+            (
+                self.reshape_fairseq2_weights(name, loaded_weight, params)
+                for name, loaded_weight in weights
+            )
+        )
 
     def flag_sharded_weights(self, params: dict[str, Parameter]):
         """Sets the `is_sharded_weight` flag to True for all sharded weights"""
@@ -113,35 +114,34 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
                 attn_in //= self.tp_size
                 n_heads //= self.tp_size
             attn_out = self.config.hidden_size
-            return (w.view(n_heads, attn_in // n_heads // 2, 2,
-                           attn_out).transpose(1,
-                                               2).reshape(attn_in, attn_out))
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
 
         modules = name.split(".")
 
         # rotary embeds should be sliced
         if "k_proj" in modules:
-            loaded_weight = permute(loaded_weight,
-                                    self.config.num_key_value_heads)
+            loaded_weight = permute(loaded_weight, self.config.num_key_value_heads)
 
         elif "q_proj" in modules:
-            loaded_weight = permute(loaded_weight,
-                                    self.config.num_attention_heads)
+            loaded_weight = permute(loaded_weight, self.config.num_attention_heads)
 
         # We make the loaded weights compatible with both
         # full checkpoints and tp sharded checkpoints.
         # Embeddings are repeated to fit the vocab size.
-        # Other weights are flagged for the weight_loader calls.
+        # Other weights are flagged for the weight_loader calls.
         if any(emb in modules for emb in ["embed_tokens", "lm_head"]):
             # Embeddings are sharded on dim 0
             dim = 0
             # In fairseq2, vocab size has to be divisible by tp_size
             # so we don't worry about padding
-            if self.tp_size > 1 and loaded_weight.shape[
-                    dim] < self.config.vocab_size:
-                assert loaded_weight.shape[
-                    dim] * self.tp_size == self.config.vocab_size, \
-                        "vocab_size should be divisible by tp_size."
+            if self.tp_size > 1 and loaded_weight.shape[dim] < self.config.vocab_size:
+                assert (
+                    loaded_weight.shape[dim] * self.tp_size == self.config.vocab_size
+                ), "vocab_size should be divisible by tp_size."
                 repeats = [1] * len(loaded_weight.size())
                 repeats[dim] = self.tp_size
                 # repeat to match vocab size and to be easily 'narrow'able
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 0c50056d1c52757445ed22d8858d8ef06537cfc2..85acdff3d96b48b49a8ad24f239d66a01fc435e7 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -23,7 +23,7 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
+from typing import TypeAlias
 
 import torch
 from torch import nn
@@ -33,60 +33,70 @@ from transformers import FalconConfig as HF_FalconConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import RWConfig
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
-FalconConfig = Union[HF_FalconConfig, RWConfig]
+FalconConfig: TypeAlias = HF_FalconConfig | RWConfig
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
-    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
-    base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))),
-                        dtype=torch.float32)
+    closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=torch.float32
+    )
     powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
     slopes = torch.pow(base, powers)
 
     if closest_power_of_2 != total_num_heads:
         extra_base = torch.tensor(
-            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
-            dtype=torch.float32)
-        num_remaining_heads = min(closest_power_of_2,
-                                  total_num_heads - closest_power_of_2)
-        extra_powers = torch.arange(1,
-                                    1 + 2 * num_remaining_heads,
-                                    2,
-                                    dtype=torch.int32)
-        slopes = torch.cat(
-            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=torch.float32
+        )
+        num_remaining_heads = min(
+            closest_power_of_2, total_num_heads - closest_power_of_2
+        )
+        extra_powers = torch.arange(
+            1, 1 + 2 * num_remaining_heads, 2, dtype=torch.int32
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
 
     return slopes
 
 
 class FalconAttention(nn.Module):
-
     def __init__(
         self,
         config: FalconConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -127,65 +137,76 @@ class FalconAttention(nn.Module):
             bias=config.bias,
             skip_bias_add=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
 
         # Layer-wise attention scaling
         self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
-        self.reduce_row_parallel_results = not (config.new_decoder_architecture
-                                                or config.parallel_attn)
+        self.reduce_row_parallel_results = not (
+            config.new_decoder_architecture or config.parallel_attn
+        )
         self.dense = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             bias=config.bias,
             skip_bias_add=True,
             quant_config=quant_config,
-            reduce_results=self.reduce_row_parallel_results)
+            reduce_results=self.reduce_row_parallel_results,
+            prefix=f"{prefix}.dense",
+        )
 
         self.use_rotary = config.rotary
         self.use_alibi = config.alibi
         assert not (self.use_rotary and self.use_alibi), (
-            "Rotary and alibi are mutually exclusive.")
+            "Rotary and alibi are mutually exclusive."
+        )
 
         if self.use_rotary:
             rope_theta = getattr(config, "rope_theta", 10000)
-            max_position_embeddings = getattr(config,
-                                              "max_position_embeddings", 8192)
+            max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
             self.rotary_emb = get_rope(
                 self.head_dim,
                 rotary_dim=self.head_dim,
                 max_position=max_position_embeddings,
                 base=rope_theta,
             )
-            self.attn = Attention(self.num_heads,
-                                  self.head_dim,
-                                  self.inv_norm_factor,
-                                  num_kv_heads=self.num_kv_heads,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.attn")
+            self.attn = Attention(
+                self.num_heads,
+                self.head_dim,
+                self.inv_norm_factor,
+                num_kv_heads=self.num_kv_heads,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+            )
         elif self.use_alibi:
             tp_rank = get_tensor_model_parallel_rank()
             head_start = tp_rank * self.num_heads
             head_end = (tp_rank + 1) * self.num_heads
-            alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
-                            self.inv_norm_factor)
+            alibi_slopes = (
+                _get_alibi_slopes(self.total_num_heads) * self.inv_norm_factor
+            )
             alibi_slopes = alibi_slopes[head_start:head_end].tolist()
-            self.attn = Attention(self.num_heads,
-                                  self.head_dim,
-                                  self.inv_norm_factor,
-                                  num_kv_heads=self.num_kv_heads,
-                                  alibi_slopes=alibi_slopes,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.attn")
+            self.attn = Attention(
+                self.num_heads,
+                self.head_dim,
+                self.inv_norm_factor,
+                num_kv_heads=self.num_kv_heads,
+                alibi_slopes=alibi_slopes,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+            )
         else:
-            self.attn = Attention(self.num_heads,
-                                  self.head_dim,
-                                  scale=self.inv_norm_factor,
-                                  num_kv_heads=self.num_kv_heads,
-                                  cache_config=cache_config,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.attn")
+            self.attn = Attention(
+                self.num_heads,
+                self.head_dim,
+                scale=self.inv_norm_factor,
+                num_kv_heads=self.num_kv_heads,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+            )
 
     def forward(
         self,
@@ -204,30 +225,36 @@ class FalconAttention(nn.Module):
 
 
 class FalconMLP(nn.Module):
-
     def __init__(
         self,
         config: FalconConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
 
-        self.dense_h_to_4h = ColumnParallelLinear(hidden_size,
-                                                  4 * hidden_size,
-                                                  bias=config.bias,
-                                                  skip_bias_add=True,
-                                                  quant_config=quant_config)
+        self.dense_h_to_4h = ColumnParallelLinear(
+            hidden_size,
+            4 * hidden_size,
+            bias=config.bias,
+            skip_bias_add=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
+        )
         self.act = get_act_fn("gelu")
-        self.reduce_row_parallel_results = not (config.new_decoder_architecture
-                                                or config.parallel_attn)
+        self.reduce_row_parallel_results = not (
+            config.new_decoder_architecture or config.parallel_attn
+        )
         self.dense_4h_to_h = RowParallelLinear(
             4 * hidden_size,
             hidden_size,
             bias=config.bias,
             skip_bias_add=True,
             reduce_results=self.reduce_row_parallel_results,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # NOTE(zhuohan): Following huggingface, we do not fuse bias add here.
@@ -240,51 +267,47 @@ class FalconMLP(nn.Module):
 
 
 class FalconDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: FalconConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.self_attention = FalconAttention(
-            config,
-            cache_config,
-            quant_config,
-            prefix=f"{prefix}.self_attention")
-        self.mlp = FalconMLP(config, quant_config)
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attention"
+        )
+        self.mlp = FalconMLP(config, quant_config, prefix=f"{prefix}.mlp")
         self.config = config
 
-        if (not hasattr(config, "num_ln_in_parallel_attn")):
+        if not hasattr(config, "num_ln_in_parallel_attn"):
             config.num_ln_in_parallel_attn = None
 
-        if (config.num_ln_in_parallel_attn is None
-                and config.new_decoder_architecture):
+        if config.num_ln_in_parallel_attn is None and config.new_decoder_architecture:
             config.num_ln_in_parallel_attn = 2
 
         if not config.parallel_attn:
             self.post_attention_layernorm = LayerNorm(
-                hidden_size, eps=config.layer_norm_epsilon)
-            self.input_layernorm = LayerNorm(hidden_size,
-                                             eps=config.layer_norm_epsilon)
+                hidden_size, eps=config.layer_norm_epsilon
+            )
+            self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
         else:
             if config.num_ln_in_parallel_attn == 2:
                 # The layer norm before self-attention
-                self.ln_attn = LayerNorm(hidden_size,
-                                         eps=config.layer_norm_epsilon)
+                self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
                 # The layer norm before the MLP
-                self.ln_mlp = LayerNorm(hidden_size,
-                                        eps=config.layer_norm_epsilon)
+                self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
             else:
-                self.input_layernorm = LayerNorm(hidden_size,
-                                                 eps=config.layer_norm_epsilon)
+                self.input_layernorm = LayerNorm(
+                    hidden_size, eps=config.layer_norm_epsilon
+                )
 
-        self.reduce_row_parallel_results = not (config.new_decoder_architecture
-                                                or config.parallel_attn)
+        self.reduce_row_parallel_results = not (
+            config.new_decoder_architecture or config.parallel_attn
+        )
 
     def forward(
         self,
@@ -314,8 +337,11 @@ class FalconDecoderLayer(nn.Module):
                 residual += attention_output
                 mlp_layernorm_out = self.post_attention_layernorm(residual)
 
-        if (self.config.new_decoder_architecture and self.config.parallel_attn
-                and self.config.num_ln_in_parallel_attn == 1):
+        if (
+            self.config.new_decoder_architecture
+            and self.config.parallel_attn
+            and self.config.num_ln_in_parallel_attn == 1
+        ):
             mlp_layernorm_out = attention_layernorm_out
 
         # MLP.
@@ -340,7 +366,6 @@ class FalconDecoderLayer(nn.Module):
 
 @support_torch_compile
 class FalconModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -363,30 +388,32 @@ class FalconModel(nn.Module):
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
             lambda prefix: FalconDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.h")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.h",
+        )
 
         # Final Layer Norm
         self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.word_embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             hidden_states = intermediate_tensors["hidden_states"]
         for layer in islice(self.h, self.start_layer, self.end_layer):
@@ -396,8 +423,7 @@ class FalconModel(nn.Module):
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         total_num_heads = self.config.num_attention_heads
         if self.config.new_decoder_architecture:
             total_num_kv_heads = self.config.num_kv_heads
@@ -420,26 +446,34 @@ class FalconModel(nn.Module):
                 loaded_weight_shape = loaded_weight.shape
                 if output_dim is not None:
                     loaded_weight = loaded_weight.view(
-                        loaded_weight_shape[:output_dim] +
-                        (total_num_kv_heads, num_query_heads_per_kv_head + 2,
-                         -1) + loaded_weight_shape[output_dim + 1:])
+                        loaded_weight_shape[:output_dim]
+                        + (total_num_kv_heads, num_query_heads_per_kv_head + 2, -1)
+                        + loaded_weight_shape[output_dim + 1 :]
+                    )
                     wq = loaded_weight.narrow(
-                        output_dim + 1, 0,
-                        num_query_heads_per_kv_head).reshape(
-                            *loaded_weight_shape[:output_dim], -1,
-                            *loaded_weight_shape[output_dim + 1:])
+                        output_dim + 1, 0, num_query_heads_per_kv_head
+                    ).reshape(
+                        *loaded_weight_shape[:output_dim],
+                        -1,
+                        *loaded_weight_shape[output_dim + 1 :],
+                    )
                     wk = loaded_weight.narrow(
-                        output_dim + 1, num_query_heads_per_kv_head,
-                        1).reshape(*loaded_weight_shape[:output_dim], -1,
-                                   *loaded_weight_shape[output_dim + 1:])
+                        output_dim + 1, num_query_heads_per_kv_head, 1
+                    ).reshape(
+                        *loaded_weight_shape[:output_dim],
+                        -1,
+                        *loaded_weight_shape[output_dim + 1 :],
+                    )
                     wv = loaded_weight.narrow(
-                        output_dim + 1, num_query_heads_per_kv_head + 1,
-                        1).reshape(*loaded_weight_shape[:output_dim], -1,
-                                   *loaded_weight_shape[output_dim + 1:])
+                        output_dim + 1, num_query_heads_per_kv_head + 1, 1
+                    ).reshape(
+                        *loaded_weight_shape[:output_dim],
+                        -1,
+                        *loaded_weight_shape[output_dim + 1 :],
+                    )
                     loaded_weight = torch.cat([wq, wk, wv], dim=output_dim)
 
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -456,15 +490,17 @@ class FalconForCausalLM(nn.Module, SupportsPP):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = FalconModel(vllm_config=vllm_config,
-                                       prefix=maybe_prefix(
-                                           prefix, "transformer"))
+        self.transformer = FalconModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         # only Falcon-11B doesn't share lm_head weight with word embeddings
         # and previous Falcon model doesn't have tie_word_embeddings config
         # so we set tie_word_embeddings to True by default
-        self.tie_word_embeddings = (config.tie_word_embeddings
-                                    if config.tie_word_embeddings is not None
-                                    else True)
+        self.tie_word_embeddings = (
+            config.tie_word_embeddings
+            if config.tie_word_embeddings is not None
+            else True
+        )
         if self.tie_word_embeddings:
             self.lm_head = self.transformer.word_embeddings
         else:
@@ -476,34 +512,34 @@ class FalconForCausalLM(nn.Module, SupportsPP):
             )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.LongTensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index f382018e2222c6cfbffe37c5a256e4b7961a1c92..3653425b8e1ca9d38541af4a925fdc776252d3f2 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only FalconH1 model."""
+
 from collections.abc import Iterable
-from typing import Optional
+from itertools import islice
 
 import torch
 from torch import nn
@@ -15,33 +16,49 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
-from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+)
+from .utils import (
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class FalconH1MLP(nn.Module):
-
     def __init__(
         self,
         config: FalconH1Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -49,24 +66,28 @@ class FalconH1MLP(nn.Module):
             output_sizes=[config.intermediate_size] * 2,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             input_size=config.intermediate_size,
             output_size=config.hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         self.tp_size = get_tensor_model_parallel_world_size()
         self.intermediate_size = config.intermediate_size
         self.gate_multiplier, self.down_multiplier = config.mlp_multipliers
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
         x, _ = self.gate_up_proj(x)
-        x[:, :self.intermediate_size // self.tp_size] *= self.gate_multiplier
+        x[:, : self.intermediate_size // self.tp_size] *= self.gate_multiplier
         x = self.act_fn(x)
         x, _ = self.down_proj(x)
         x = x * self.down_multiplier
@@ -74,21 +95,23 @@ class FalconH1MLP(nn.Module):
 
 
 class FalconH1SSMDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: FalconH1Config,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
 
-        self.d_ssm = (int(config.mamba_expand * config.hidden_size)
-                      if config.mamba_d_ssm is None else config.mamba_d_ssm)
+        self.d_ssm = (
+            int(config.mamba_expand * config.hidden_size)
+            if config.mamba_d_ssm is None
+            else config.mamba_d_ssm
+        )
 
         self.mamba = MambaMixer2(
             hidden_size=config.hidden_size,
@@ -115,15 +138,15 @@ class FalconH1SSMDecoderLayer(nn.Module):
 
     def _init_mup_vector(self):
         """
-        Non learnable per-block scaling vector composed of element-wise 
-        multipliersapplied to each separate contiguous block of the output 
+        Non learnable per-block scaling vector composed of element-wise
+        multipliersapplied to each separate contiguous block of the output
         of the linear projection (in_proj) before further processing
         (gating, convolution, SSM):
 
             - Z block:  [0 : d_ssm]                      → zxbcdt_multipliers[0]
             - X block:  [d_ssm : 2 * d_ssm]              → zxbcdt_multipliers[1]
             - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  → zxbcdt_multipliers[2]
-            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S] 
+            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
                         → zxbcdt_multipliers[3]
             - dt block: [2 * d_ssm + 2 * G * S : end]    → zxbcdt_multipliers[4]
 
@@ -133,38 +156,38 @@ class FalconH1SSMDecoderLayer(nn.Module):
             - S:         SSM state size per group
             - All indices are divided by tp_size to support tensor parallelism
         """
-        vector_shape = (2 * self.d_ssm + 2 * self.groups_time_state_size +
-                        self.config.mamba_n_heads) // self.tp_size
+        vector_shape = (
+            2 * self.d_ssm + 2 * self.groups_time_state_size + self.config.mamba_n_heads
+        ) // self.tp_size
         mup_vector = torch.ones(1, vector_shape)
         # Z vector 0 -> d_ssm
-        mup_vector[:, :self.d_ssm //
-                   self.tp_size] *= self.zxbcdt_multipliers[0]
+        mup_vector[:, : self.d_ssm // self.tp_size] *= self.zxbcdt_multipliers[0]
         # X vector d_ssm -> 2 * d_ssm
-        mup_vector[:,
-                   (self.d_ssm //
-                    self.tp_size):(2 * self.d_ssm //
-                                   self.tp_size)] *= self.zxbcdt_multipliers[1]
+        mup_vector[
+            :, (self.d_ssm // self.tp_size) : (2 * self.d_ssm // self.tp_size)
+        ] *= self.zxbcdt_multipliers[1]
         # B vector 2 * d_ssm -> 2 * d_ssm + (n_group * d_state)
         mup_vector[
             :,
-            (2 * self.d_ssm) //
-            self.tp_size:(2 * self.d_ssm + self.groups_time_state_size) //
-            self.tp_size,
+            (2 * self.d_ssm) // self.tp_size : (
+                2 * self.d_ssm + self.groups_time_state_size
+            )
+            // self.tp_size,
         ] *= self.zxbcdt_multipliers[2]
         # C vector 2 * d_ssm + (n_group * d_state)
         # -> 2 * d_ssm + 2 * (n_group * d_state)
         mup_vector[
             :,
-            (2 * self.d_ssm + self.groups_time_state_size) //
-            self.tp_size:(2 * self.d_ssm + 2 * self.groups_time_state_size) //
-            self.tp_size,
+            (2 * self.d_ssm + self.groups_time_state_size) // self.tp_size : (
+                2 * self.d_ssm + 2 * self.groups_time_state_size
+            )
+            // self.tp_size,
         ] *= self.zxbcdt_multipliers[3]
         # dt vector 2 * d_ssm + 2 * (n_group * d_state)
         # -> 2 * d_ssm + 2 * (n_group * d_state) + n_heads
         mup_vector[
             :,
-            (2 * self.d_ssm + 2 * self.groups_time_state_size) //
-            self.tp_size:,
+            (2 * self.d_ssm + 2 * self.groups_time_state_size) // self.tp_size :,
         ] *= self.zxbcdt_multipliers[4]
 
         self.register_buffer("mup_vector", mup_vector, persistent=False)
@@ -172,7 +195,7 @@ class FalconH1SSMDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         output = torch.empty_like(hidden_states)
@@ -185,19 +208,17 @@ class FalconH1SSMDecoderLayer(nn.Module):
 
 
 class FalconH1AttentionDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: FalconH1Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         rope_theta = getattr(config, "rope_theta", 1e11)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.hidden_size = config.hidden_size
         tp_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = config.num_attention_heads
@@ -213,8 +234,11 @@ class FalconH1AttentionDecoderLayer(nn.Module):
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = (config.hidden_size // self.total_num_heads if getattr(
-            config, "head_dim", None) is None else config.head_dim)
+        self.head_dim = (
+            config.hidden_size // self.total_num_heads
+            if getattr(config, "head_dim", None) is None
+            else config.head_dim
+        )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -284,7 +308,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         hidden_states = self.self_attention(
@@ -309,9 +333,9 @@ class FalconH1ParallelHybrid(nn.Module):
         self,
         config: FalconH1Config,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -343,12 +367,10 @@ class FalconH1ParallelHybrid(nn.Module):
         self.attention_in_multiplier = config.attention_in_multiplier
         self.attn_out_multiplier = config.attention_out_multiplier
 
-        self.feed_forward = FalconH1MLP(config)
+        self.feed_forward = FalconH1MLP(config, prefix=f"{prefix}.feed_forward")
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -380,7 +402,8 @@ class FalconH1ParallelHybrid(nn.Module):
         # We assume both branches produce outputs of the same
         # dimensionality (config.hidden_size).
         hidden_states = (attn_hidden * self.attn_out_multiplier) + (
-            ssm_hidden * self.ssm_out_multiplier)
+            ssm_hidden * self.ssm_out_multiplier
+        )
         hidden_states = hidden_states + residual
 
         # feed-forward
@@ -394,26 +417,21 @@ class FalconH1ParallelHybrid(nn.Module):
 
 @support_torch_compile
 class FalconH1Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config: FalconH1Config = vllm_config.model_config.hf_config
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank:
 
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
             )
             self.embedding_multiplier = config.embedding_multiplier
         else:
@@ -433,53 +451,60 @@ class FalconH1Model(nn.Module):
             )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
         if get_pp_group().is_last_rank:
-            self.final_layernorm = RMSNorm(config.hidden_size,
-                                           eps=config.rms_norm_eps)
+            self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.final_layernorm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds * self.embedding_multiplier
             else:
-                hidden_states = (self.get_input_embeddings(input_ids) *
-                                 self.embedding_multiplier)
+                hidden_states = (
+                    self.embed_input_ids(input_ids) * self.embedding_multiplier
+                )
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
 
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(
                 positions=positions,
                 hidden_states=hidden_states,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                }
+            )
         hidden_states = self.final_layernorm(hidden_states)
         return hidden_states
 
 
-class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                          IsHybrid):
+class FalconH1ForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsMambaPrefixCaching,
+):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -496,7 +521,6 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -521,10 +545,11 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         parallel_config = vllm_config.parallel_config
         hf_config = vllm_config.model_config.hf_config
 
-        intermediate_size = (int(hf_config.mamba_expand *
-                                 hf_config.hidden_size)
-                             if hf_config.mamba_d_ssm is None else
-                             hf_config.mamba_d_ssm)
+        intermediate_size = (
+            int(hf_config.mamba_expand * hf_config.hidden_size)
+            if hf_config.mamba_d_ssm is None
+            else hf_config.mamba_d_ssm
+        )
 
         return MambaStateShapeCalculator.mamba2_state_shape(
             intermediate_size=intermediate_size,
@@ -540,44 +565,32 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
-        assert (not cache_config.enable_prefix_caching
-                ), "FalconH1 currently does not support prefix caching"
 
         self.quant_config = vllm_config.quant_config
 
         super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
-        self.model = FalconH1Model(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
+        self.model = FalconH1Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.tie_word_embeddings = config.tie_word_embeddings
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config else
-                    lora_config.lora_vocab_padding_size),
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
             self.lm_head_multiplier = config.lm_head_multiplier
             if self.tie_word_embeddings:
-                self.lm_head = self.lm_head.tie_weights(
-                    self.model.embed_tokens)
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
             # Used to track and store by the Mamba cache between steps.
 
             self.logits_processor = LogitsProcessor(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.vocab_size,
                 scale=config.lm_head_multiplier,
             )
@@ -585,20 +598,20 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
-
         hidden_states = self.model(
             input_ids,
             positions,
@@ -611,13 +624,12 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
 
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -664,8 +676,7 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
diff --git a/vllm/model_executor/models/flex_olmo.py b/vllm/model_executor/models/flex_olmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d0949a798a1fc3262abc60cd0f6d8f8779375d
--- /dev/null
+++ b/vllm/model_executor/models/flex_olmo.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only FlexOlmo model compatible with HuggingFace weights."""
+
+import torch
+from torch import nn
+
+from vllm.config import VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.models.olmoe import OlmoeAttention, OlmoeForCausalLM
+from vllm.transformers_utils.configs import FlexOlmoConfig
+
+logger = init_logger(__name__)
+
+
+class FlexOlmoAttention(OlmoeAttention):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        hf_config = vllm_config.model_config.hf_config
+        assert isinstance(hf_config, FlexOlmoConfig)
+
+        self.k_norm = RMSNorm(
+            self.total_num_kv_heads * self.head_dim, eps=hf_config.rms_norm_eps
+        )
+        self.q_norm = RMSNorm(
+            self.total_num_heads * self.head_dim, eps=hf_config.rms_norm_eps
+        )
+
+
+class FlexOlmoMoE(nn.Module):
+    """A tensor-parallel MoE implementation for FlexOlmo that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        hf_config = vllm_config.model_config.hf_config
+        assert isinstance(hf_config, FlexOlmoConfig)
+
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hf_config.hidden_size,
+            hf_config.num_experts,
+            bias=False,
+            return_bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        # Gate always runs at half / full precision for now.
+        self.experts = FusedMoE(
+            num_experts=hf_config.num_experts,
+            top_k=hf_config.num_experts_per_tok,
+            hidden_size=hf_config.hidden_size,
+            intermediate_size=hf_config.intermediate_size,
+            reduce_results=True,
+            renormalize=False,
+            quant_config=None,
+            tp_size=tp_size,
+            prefix=f"{prefix}.experts",
+        )
+
+        self.top_k = hf_config.num_experts_per_tok
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        # Warning: The experts mutate the hidden state input! This messes up
+        # basic things like the residual stream.
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states.detach().clone(),
+            router_logits=router_logits.float(),
+        )
+
+        return final_hidden_states.view(orig_shape)
+
+
+class FlexOlmoDecoderLayer(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        hf_config = vllm_config.model_config.hf_config
+        assert isinstance(hf_config, FlexOlmoConfig)
+
+        self.self_attn = FlexOlmoAttention(
+            vllm_config=vllm_config, prefix=f"{prefix}.self_attn"
+        )
+        self.post_attention_layernorm = RMSNorm(
+            hf_config.hidden_size, eps=hf_config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = RMSNorm(
+            hf_config.hidden_size, eps=hf_config.rms_norm_eps
+        )
+
+        self.mlp = FlexOlmoMoE(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.self_attn(positions, hidden_states)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states, None
+
+
+class FlexOlmoForCausalLM(OlmoeForCausalLM):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = FlexOlmoDecoderLayer,
+    ):
+        super().__init__(vllm_config=vllm_config, prefix=prefix, layer_type=layer_type)
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 53e9e6fe6e460eb5e2a48f369dd2ef46fb3622ce..269c36ab5b9c74325d38e4391677c4ad69db0a41 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -16,34 +16,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Fuyu model."""
+"""PyTorch Fuyu model."""
+
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor,
-                          FuyuProcessor)
+from transformers import BatchFeature, FuyuConfig, FuyuImageProcessor, FuyuProcessor
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix
 
 # Cannot find the following 2 numbers from hf config.
 _IMAGE_TOKEN_ID = 71011
@@ -60,22 +66,18 @@ class FuyuImagePatchInputs(TensorSchema):
 
     type: Literal["image_patches"] = "image_patches"
 
-    flat_data: Annotated[
-        torch.Tensor,
-        TensorShape("bnp", "fn"),
-    ]
+    image_patches_flat: Annotated[torch.Tensor, TensorShape("bnp", "fn")]
 
     patches_per_image: Annotated[list[int], TensorShape("bn")]
     """
     The number of total patches for each image in the batch.
     
     This is used to split the embeddings which has the first two dimensions
-    flattened just like `flat_data`.
+    flattened just like `image_patches_flat`.
     """
 
 
 class FuyuProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(FuyuConfig)
 
@@ -85,7 +87,7 @@ class FuyuProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_image_feature_grid_size(
@@ -127,12 +129,12 @@ class FuyuProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(self) -> ImageSize:
         image_processor = self.get_image_processor()
-        return ImageSize(width=image_processor.size["width"],
-                         height=image_processor.size["height"])
+        return ImageSize(
+            width=image_processor.size["width"], height=image_processor.size["height"]
+        )
 
 
 class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
 
@@ -140,21 +142,24 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -175,28 +180,11 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
             tok_kwargs=tok_kwargs,
         )
 
-        image_patches = processed_outputs.get("image_patches")
-        if image_patches is not None:
-            images = mm_data["images"]
-            assert isinstance(images, list)
-
-            # Original output: (1, num_images, Pn, Px * Py * C)
-            # New output: (num_images, Pn, Px * Py * C)
-            # image_patches is a list with shape:
-            # (1, num_images, Pn, Px * Py * C)
-            # before Transformers 4.53
-            if isinstance(image_patches, list):
-                assert len(image_patches) == 1
-                assert (isinstance(image_patches[0], torch.Tensor)
-                        and len(image_patches[0]) == len(images))
-                processed_outputs["image_patches"] = image_patches[0]
-            # image_patches is a tensor with shape:
-            # (num_images, Pn, Px * Py * C)
-            # after Transformers 4.53
-            elif isinstance(image_patches, torch.Tensor):
-                assert len(image_patches) == len(images)
-            else:
-                raise AssertionError("This line should be unreachable.")
+        image_patches = processed_outputs["image_patches"]
+        processed_outputs["image_patches"] = flatten_bn(image_patches)
+        processed_outputs["patches_per_image"] = torch.tensor(
+            [len(p) for p in image_patches]
+        )
 
         return processed_outputs
 
@@ -219,7 +207,14 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(image_patches=MultiModalFieldConfig.batched("image"))
+        patches_per_image = hf_inputs.get("patches_per_image", torch.empty(0))
+
+        return dict(
+            image_patches=MultiModalFieldConfig.flat_from_sizes(
+                "image", patches_per_image
+            ),
+            patches_per_image=MultiModalFieldConfig.batched("image"),
+        )
 
     def _get_prompt_updates(
         self,
@@ -243,8 +238,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
-            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                            [_NEWLINE_TOKEN_ID]) * nrows
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
             return PromptUpdateDetails.select_token_id(
                 image_tokens + [bos_token_id],
@@ -260,20 +254,24 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
-                                        info=FuyuProcessingInfo,
-                                        dummy_inputs=FuyuDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    FuyuMultiModalProcessor,
+    info=FuyuProcessingInfo,
+    dummy_inputs=FuyuDummyInputsBuilder,
+)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.": "vision_embed_tokens.",
             "model.language_model.": "language_model.model.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -302,81 +300,57 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             prefix=maybe_prefix(prefix, "language_model"),
         )
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
+        self, **kwargs: object
+    ) -> FuyuImagePatchInputs | None:
         image_patches = kwargs.pop("image_patches", None)
-        if image_patches is not None:
-            image_patches_flat = flatten_bn(image_patches)
-            flat_data = flatten_bn(image_patches_flat, concat=True)
-
-            return FuyuImagePatchInputs(
-                type="image_patches",
-                flat_data=flat_data,
-                patches_per_image=[x.size(0) for x in image_patches_flat],
-                resolve_bindings={"fn": self.image_feature_size},
-            )
+        patches_per_image = kwargs.pop("patches_per_image", None)
+
+        if image_patches is None:
+            return None
 
-        return None
+        return FuyuImagePatchInputs(
+            type="image_patches",
+            image_patches_flat=image_patches,
+            patches_per_image=patches_per_image,
+            resolve_bindings={"fn": self.image_feature_size},
+        )
 
     def _process_image_input(
-            self, image_input: FuyuImagePatchInputs) -> MultiModalEmbeddings:
-        image_patches_flat = image_input["flat_data"]
+        self, image_input: FuyuImagePatchInputs
+    ) -> MultiModalEmbeddings:
+        image_patches_flat = image_input["image_patches_flat"]
         patches_per_image = image_input["patches_per_image"]
 
         assert self.vision_embed_tokens is not None
-        vision_embeddings_flat, _ = self.vision_embed_tokens(
-            image_patches_flat)
+        vision_embeddings_flat, _ = self.vision_embed_tokens(image_patches_flat)
 
-        return vision_embeddings_flat.split(patches_per_image, dim=0)
+        return vision_embeddings_flat.split(patches_per_image.tolist(), dim=0)
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                _IMAGE_TOKEN_ID,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         hidden_states = self.language_model(
             input_ids=input_ids,
             positions=positions,
@@ -388,12 +362,12 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.language_model.logits_processor(
-            self.language_model.lm_head, hidden_states)
+            self.language_model.lm_head, hidden_states
+        )
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index c19425b6cb6d6fefa35460ed0025e26fd979c598..7aaae7c503b5870777c8825f64de06d2e261ea30 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -16,10 +16,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from functools import cache
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -32,29 +32,34 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 @cache
 def _get_gemma_act_fn(
-    hidden_act: Optional[str],
-    hidden_activation: Optional[str],
+    hidden_act: str | None,
+    hidden_activation: str | None,
 ) -> nn.Module:
     if hidden_activation is None:
         if hidden_act is not None:
@@ -66,26 +71,29 @@ def _get_gemma_act_fn(
                 "`%s`, edit the config JSON to set "
                 "`hidden_activation=%s` instead of `hidden_act`. "
                 "See https://github.com/huggingface/transformers/pull/29402 "
-                "for more details.", hidden_act, hidden_act)
+                "for more details.",
+                hidden_act,
+                hidden_act,
+            )
         return GeluAndMul(approximate="tanh")
     elif hidden_activation == "gelu_pytorch_tanh":
         return GeluAndMul(approximate="tanh")
     elif hidden_activation == "gelu":
         return GeluAndMul(approximate="none")
     else:
-        raise ValueError(f"Activation function {hidden_act} is not "
-                         "supported for Gemma models.")
+        raise ValueError(
+            f"Activation function {hidden_act} is not supported for Gemma models."
+        )
 
 
 class GemmaMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
-        hidden_act: Optional[str] = None,
-        hidden_activation: Optional[str] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        hidden_act: str | None = None,
+        hidden_activation: str | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -113,7 +121,6 @@ class GemmaMLP(nn.Module):
 
 
 class GemmaAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -122,8 +129,8 @@ class GemmaAttention(nn.Module):
         head_dim: int,
         max_position_embeddings: int = 8192,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -172,13 +179,15 @@ class GemmaAttention(nn.Module):
             base=self.rope_theta,
             is_neox_style=True,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -194,12 +203,11 @@ class GemmaAttention(nn.Module):
 
 
 class GemmaDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: GemmaConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -223,39 +231,36 @@ class GemmaDecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = GemmaRMSNorm(config.hidden_size,
-                                            eps=config.rms_norm_eps)
-        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                     eps=config.rms_norm_eps)
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class GemmaModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -272,8 +277,10 @@ class GemmaModel(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: GemmaDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
         self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         # Normalize the embedding by sqrt(hidden_size)
@@ -281,28 +288,26 @@ class GemmaModel(nn.Module):
         # data type such as bfloat16, not float32.
         # See https://github.com/huggingface/transformers/pull/29402
         normalizer = self.config.hidden_size**0.5
-        self.register_buffer("normalizer",
-                             torch.tensor(normalizer),
-                             persistent=False)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.register_buffer("normalizer", torch.tensor(normalizer), persistent=False)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             hidden_states *= self.normalizer
             residual = None
         else:
@@ -315,15 +320,13 @@ class GemmaModel(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -335,7 +338,7 @@ class GemmaModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, shard_name, shard_id) in stacked_params_mapping:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
                     continue
                 name = name.replace(shard_name, param_name)
@@ -355,8 +358,7 @@ class GemmaModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
@@ -380,46 +382,45 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         # currently all existing Gemma models have `tie_word_embeddings` enabled
         assert config.tie_word_embeddings
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = GemmaModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = GemmaModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 3f76e1e7d42a22e4fd9792f0eabdb18624e3712e..4d5d6cbb37c628089ca50a9d5af7c6a071d92b80 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -18,7 +18,6 @@
 # limitations under the License.
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -31,51 +30,65 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Gemma2MLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
         hidden_activation: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         if not (hidden_act == hidden_activation == "gelu_pytorch_tanh"):
             raise ValueError(
                 "Gemma2 uses `gelu_pytorch_tanh` as the hidden activation "
                 "function. Please set `hidden_act` and `hidden_activation` to "
-                "`gelu_pytorch_tanh`.")
+                "`gelu_pytorch_tanh`."
+            )
         self.act_fn = GeluAndMul(approximate="tanh")
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -86,19 +99,20 @@ class Gemma2MLP(nn.Module):
 
 
 class Gemma2Attention(nn.Module):
-
-    def __init__(self,
-                 config: Gemma2Config,
-                 hidden_size: int,
-                 num_heads: int,
-                 num_kv_heads: int,
-                 head_dim: int,
-                 max_position_embeddings: int,
-                 rope_theta: float,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 attn_logits_soft_cap: Optional[float] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: Gemma2Config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        max_position_embeddings: int,
+        rope_theta: float,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attn_logits_soft_cap: float | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.hidden_size = hidden_size
@@ -129,12 +143,14 @@ class Gemma2Attention(nn.Module):
             self.total_num_kv_heads,
             bias=config.attention_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=config.attention_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -148,15 +164,17 @@ class Gemma2Attention(nn.Module):
         is_sliding = config.layer_types[layer_idx] == "sliding_attention"
         sliding_window = config.sliding_window if is_sliding else None
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              logits_soft_cap=attn_logits_soft_cap,
-                              per_layer_sliding_window=sliding_window,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            logits_soft_cap=attn_logits_soft_cap,
+            per_layer_sliding_window=sliding_window,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -172,12 +190,11 @@ class Gemma2Attention(nn.Module):
 
 
 class Gemma2DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Gemma2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -202,28 +219,30 @@ class Gemma2DecoderLayer(nn.Module):
             hidden_act=config.hidden_act,
             hidden_activation=config.hidden_activation,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
         )
-        self.input_layernorm = GemmaRMSNorm(config.hidden_size,
-                                            eps=config.rms_norm_eps)
-        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                     eps=config.rms_norm_eps)
-        self.pre_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                      eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                       eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
@@ -231,7 +250,8 @@ class Gemma2DecoderLayer(nn.Module):
         hidden_states = self.post_attention_layernorm(hidden_states)
 
         hidden_states, residual = self.pre_feedforward_layernorm(
-            hidden_states, residual)
+            hidden_states, residual
+        )
         hidden_states = self.mlp(hidden_states)
         hidden_states = self.post_feedforward_layernorm(hidden_states)
         return hidden_states, residual
@@ -239,7 +259,6 @@ class Gemma2DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Gemma2Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -255,8 +274,10 @@ class Gemma2Model(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Gemma2DecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
         self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         # Normalize the embedding by sqrt(hidden_size)
@@ -264,28 +285,26 @@ class Gemma2Model(nn.Module):
         # data type such as bfloat16, not float32.
         # See https://github.com/huggingface/transformers/pull/29402
         normalizer = self.config.hidden_size**0.5
-        self.register_buffer("normalizer",
-                             torch.tensor(normalizer),
-                             persistent=False)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.register_buffer("normalizer", torch.tensor(normalizer), persistent=False)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             hidden_states *= self.normalizer
             residual = None
         else:
@@ -299,15 +318,13 @@ class Gemma2Model(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -319,17 +336,17 @@ class Gemma2Model(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache scales for compressed-tensors quantization
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 loaded_weight = loaded_weight[0]
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, shard_name, shard_id) in stacked_params_mapping:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
                     continue
                 name = name.replace(shard_name, param_name)
@@ -353,8 +370,7 @@ class Gemma2Model(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
@@ -377,46 +393,47 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        del lora_config  # Unused.
+
         super().__init__()
         self.config = config
         # currently all existing Gemma models have `tie_word_embeddings` enabled
         assert config.tie_word_embeddings
         self.quant_config = quant_config
-        self.model = Gemma2Model(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.model = Gemma2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.logits_processor = LogitsProcessor(
-            config.vocab_size, soft_cap=config.final_logit_softcapping)
+            config.vocab_size, soft_cap=config.final_logit_softcapping
+        )
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 77c0ef8cb91d249e69df8d08a09a9f1be13d261a..357e61a4e78bf596c81361b3e6c0b3cab4743b05 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -17,7 +17,6 @@
 # limitations under the License.
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -31,36 +30,42 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Gemma3MLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_activation: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -82,7 +87,8 @@ class Gemma3MLP(nn.Module):
             raise ValueError(
                 "Gemma3 uses `gelu_pytorch_tanh` as the hidden activation "
                 "function. Please set `hidden_act` and `hidden_activation` to "
-                "`gelu_pytorch_tanh`.")
+                "`gelu_pytorch_tanh`."
+            )
         self.act_fn = GeluAndMul(approximate="tanh")
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -93,18 +99,19 @@ class Gemma3MLP(nn.Module):
 
 
 class Gemma3Attention(nn.Module):
-
-    def __init__(self,
-                 config: Gemma3TextConfig,
-                 hidden_size: int,
-                 num_heads: int,
-                 num_kv_heads: int,
-                 head_dim: int,
-                 max_position_embeddings: int,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 attn_logits_soft_cap: Optional[float] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: Gemma3TextConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        max_position_embeddings: int,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attn_logits_soft_cap: float | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.hidden_size = hidden_size
@@ -174,19 +181,24 @@ class Gemma3Attention(nn.Module):
         else:
             attn_type = AttentionType.ENCODER_ONLY
 
-        attn_cls = (EncoderOnlyAttention
-                    if attn_type == AttentionType.ENCODER_ONLY else Attention)
+        attn_cls = (
+            EncoderOnlyAttention
+            if attn_type == AttentionType.ENCODER_ONLY
+            else Attention
+        )
 
-        self.attn = attn_cls(self.num_heads,
-                             self.head_dim,
-                             self.scaling,
-                             num_kv_heads=self.num_kv_heads,
-                             cache_config=cache_config,
-                             quant_config=quant_config,
-                             attn_type=attn_type,
-                             logits_soft_cap=attn_logits_soft_cap,
-                             per_layer_sliding_window=sliding_window,
-                             prefix=f"{prefix}.attn")
+        self.attn = attn_cls(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            attn_type=attn_type,
+            logits_soft_cap=attn_logits_soft_cap,
+            per_layer_sliding_window=sliding_window,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -225,11 +237,7 @@ class Gemma3Attention(nn.Module):
         # output is discarded and overwritten below. While this duplicates
         # computation, it maintains compatibility.
         # TODO(woosuk): Optimize by implementing custom attention kernels.
-        attn_output = self.naive_attn_with_masks(q,
-                                                 k,
-                                                 v,
-                                                 out=attn_output,
-                                                 **kwargs)
+        attn_output = self.naive_attn_with_masks(q, k, v, out=attn_output, **kwargs)
         output, _ = self.o_proj(attn_output)
         return output
 
@@ -283,12 +291,11 @@ class Gemma3Attention(nn.Module):
 
 
 class Gemma3DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Gemma3TextConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -313,28 +320,29 @@ class Gemma3DecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = GemmaRMSNorm(config.hidden_size,
-                                            eps=config.rms_norm_eps)
-        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                     eps=config.rms_norm_eps)
-        self.pre_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                      eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
-                                                       eps=config.rms_norm_eps)
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
@@ -343,7 +351,8 @@ class Gemma3DecoderLayer(nn.Module):
         hidden_states = self.post_attention_layernorm(hidden_states)
 
         hidden_states, residual = self.pre_feedforward_layernorm(
-            hidden_states, residual)
+            hidden_states, residual
+        )
         hidden_states = self.mlp(hidden_states)
         hidden_states = self.post_feedforward_layernorm(hidden_states)
         return hidden_states, residual
@@ -351,7 +360,6 @@ class Gemma3DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Gemma3Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -363,13 +371,16 @@ class Gemma3Model(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
             prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Gemma3DecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
         self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         # Normalize the embedding by sqrt(hidden_size)
@@ -377,31 +388,29 @@ class Gemma3Model(nn.Module):
         # data type such as bfloat16, not float32.
         # See https://github.com/huggingface/transformers/pull/29402
         normalizer = self.config.hidden_size**0.5
-        self.register_buffer("normalizer",
-                             torch.tensor(normalizer),
-                             persistent=False)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.register_buffer("normalizer", torch.tensor(normalizer), persistent=False)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         # NOTE(woosuk): Only apply the normalizer to the output of
         # vocab embedding. Don't apply it to the vision embedding.
         return self.embed_tokens(input_ids) * self.normalizer
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -415,15 +424,13 @@ class Gemma3Model(nn.Module):
                 **kwargs,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -435,33 +442,42 @@ class Gemma3Model(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            # Revert +1 during llama.cpp conversion
+            # see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400
+            if (
+                self.quant_config
+                and self.quant_config.get_name() == "gguf"
+                and name.endswith("norm.weight")
+            ):
+                loaded_weight -= 1
+
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache scales for compressed-tensors quantization
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 loaded_weight = loaded_weight[0]
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
 
             # Check if this is a scale parameter that needs remapping first
-            if name.endswith(
-                (".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
+            if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
                 # Try to remap the scale name first
                 remapped_name = maybe_remap_kv_scale_name(name, params_dict)
                 if remapped_name is not None and remapped_name in params_dict:
                     # Successfully remapped, use the remapped name
                     param = params_dict[remapped_name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
                     loaded_params.add(remapped_name)
                     continue
                 # If remapping failed, continue with normal processing
 
-            for (param_name, shard_name, shard_id) in stacked_params_mapping:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
                     continue
                 name = name.replace(shard_name, param_name)
@@ -485,8 +501,7 @@ class Gemma3Model(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
@@ -509,47 +524,48 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        del lora_config  # Unused.
+
         super().__init__()
         self.config = config
         # currently all existing Gemma models have `tie_word_embeddings` enabled
         assert config.tie_word_embeddings
         self.quant_config = quant_config
-        self.model = Gemma3Model(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.model = Gemma3Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.logits_processor = LogitsProcessor(
-            config.vocab_size, soft_cap=config.final_logit_softcapping)
+            config.vocab_size, soft_cap=config.final_logit_softcapping
+        )
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds, **kwargs)
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 0630ee07c347e4f8990d4e03ff342a388d60fe8a..8e2bbe8f7990c3e7d37355888615235ebe78e370 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -2,43 +2,58 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 from torch import nn
 from transformers import BatchFeature, Gemma3Config, Gemma3Processor
 from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-# yapf: disable
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalPromptUpdates,
-                                        MultiModalPromptUpdatesApplyResult,
-                                        PlaceholderFeaturesInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails,
-                                        replace_token_matches)
-# yapf: enable
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalPromptUpdates,
+    MultiModalPromptUpdatesApplyResult,
+    PlaceholderFeaturesInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+    replace_token_matches,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -53,6 +68,7 @@ class Gemma3ImagePixelInputs(TensorSchema):
         - w: Width of each patch
         - bn: Batch size * number of images
     """
+
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values: Annotated[torch.Tensor, TensorShape("p", 3, "h", "w")]
@@ -60,18 +76,25 @@ class Gemma3ImagePixelInputs(TensorSchema):
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
 
 
-Gemma3ImageInputs = Gemma3ImagePixelInputs
+class Gemma3ImageEmbeddingInputs(TensorSchema):
+    type: Literal["image_embeds"] = "image_embeds"
+    image_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("ni", "nf", "hs"),
+    ]
 
 
-class Gemma3ProcessingInfo(BaseProcessingInfo):
+Gemma3ImageInputs: TypeAlias = Gemma3ImagePixelInputs | Gemma3ImageEmbeddingInputs
 
+
+class Gemma3ProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Gemma3Config)
 
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(Gemma3Processor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def _resolve_image_kwargs(
@@ -101,33 +124,35 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Gemma3Processor],
+        processor: Gemma3Processor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
 
         images_kwargs = self._resolve_image_kwargs(
-            processor, {
-                "do_pan_and_scan", "pan_and_scan_min_crop_size",
+            processor,
+            {
+                "do_pan_and_scan",
+                "pan_and_scan_min_crop_size",
                 "pan_and_scan_max_num_crops",
-                "pan_and_scan_min_ratio_to_activate"
-            })
+                "pan_and_scan_min_ratio_to_activate",
+            },
+        )
 
         do_pan_and_scan = images_kwargs["do_pan_and_scan"]
-        pan_and_scan_min_crop_size = images_kwargs[
-            "pan_and_scan_min_crop_size"]
-        pan_and_scan_max_num_crops = images_kwargs[
-            "pan_and_scan_max_num_crops"]
+        pan_and_scan_min_crop_size = images_kwargs["pan_and_scan_min_crop_size"]
+        pan_and_scan_max_num_crops = images_kwargs["pan_and_scan_max_num_crops"]
         pan_and_scan_min_ratio_to_activate = images_kwargs[
-            "pan_and_scan_min_ratio_to_activate"]
+            "pan_and_scan_min_ratio_to_activate"
+        ]
 
         if not do_pan_and_scan:
             return 0
 
-        if envs.VLLM_USE_V1:
-            logger.warning_once(
-                "`do_pan_and_scan=True` has suboptimal results on V1 "
-                "because of the simplified attention pattern being used.")
+        logger.warning_once(
+            "`do_pan_and_scan=True` has suboptimal results on V1 "
+            "because of the simplified attention pattern being used."
+        )
 
         # Based on Gemma3ImageProcessor.pan_and_scan
         if image_width >= image_height:
@@ -166,20 +191,23 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
     def get_image_repl(
         self,
         *,
-        image_width: int,
-        image_height: int,
-        processor: Optional[Gemma3Processor],
+        image_width: int | None,
+        image_height: int | None,
+        num_crops: int | None = None,
+        processor: Gemma3Processor | None,
     ) -> PromptUpdateDetails[str]:
         if processor is None:
             processor = self.get_hf_processor()
 
         boi_token = processor.boi_token
 
-        num_crops = self.get_num_crops(
-            image_width=image_width,
-            image_height=image_height,
-            processor=processor,
-        )
+        if num_crops is None:
+            assert image_width is not None and image_height is not None
+            num_crops = self.get_num_crops(
+                image_width=image_width,
+                image_height=image_height,
+                processor=processor,
+            )
 
         if num_crops == 0:
             image_text = boi_token
@@ -187,10 +215,10 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
             crops_image_tokens = " ".join(boi_token for _ in range(num_crops))
             image_text = (
                 f"Here is the original image {boi_token} and here are some "
-                f"crops to help you see better {crops_image_tokens}")
+                f"crops to help you see better {crops_image_tokens}"
+            )
 
-        repl_full = image_text.replace(boi_token,
-                                       processor.full_image_sequence)
+        repl_full = image_text.replace(boi_token, processor.full_image_sequence)
 
         tokenizer = processor.tokenizer
         vocab = tokenizer.get_vocab()
@@ -203,7 +231,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Gemma3Processor],
+        processor: Gemma3Processor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -221,7 +249,8 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         processor = self.get_hf_processor()
 
         images_kwargs = self._resolve_image_kwargs(
-            processor, {"pan_and_scan_max_num_crops"})
+            processor, {"pan_and_scan_max_num_crops"}
+        )
         max_num_crops = images_kwargs["pan_and_scan_max_num_crops"]
 
         # Result in the max possible feature size (h:w = max_num_crops:1)
@@ -229,7 +258,6 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
 
 
 class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -242,22 +270,25 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -274,23 +305,25 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
 
         # HF processor pops the `num_crops` kwarg, which is needed by vLLM
         if (images := mm_data.get("images")) is not None:
-            parsed_images = (self._get_data_parser().parse_mm_data({
-                "image":
-                images
-            }).get_items("image", ImageProcessorItems))
+            parsed_images = (
+                self._get_data_parser()
+                .parse_mm_data({"image": images})
+                .get_items("image", ImageProcessorItems)
+            )
             image_sizes = [
-                parsed_images.get_image_size(i)
-                for i in range(len(parsed_images))
+                parsed_images.get_image_size(i) for i in range(len(parsed_images))
             ]
             hf_processor = self.info.get_hf_processor(**mm_kwargs)
 
             num_crops = [
-                self.info.get_num_crops(image_width=size.width,
-                                        image_height=size.height,
-                                        processor=hf_processor)
+                self.info.get_num_crops(
+                    image_width=size.width,
+                    image_height=size.height,
+                    processor=hf_processor,
+                )
                 for size in image_sizes
             ]
-            processed_outputs["num_crops"] = torch.tensor(num_crops)
+            processed_outputs["num_patches"] = torch.tensor(num_crops) + 1
 
         return processed_outputs
 
@@ -299,12 +332,12 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        num_crops = hf_inputs.get("num_crops", torch.empty(0))
+        num_patches = hf_inputs.get("num_patches", torch.empty(0))
 
         return dict(
-            pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_crops + 1),
-            num_crops=MultiModalFieldConfig.batched("image"),
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches),
+            num_patches=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
         )
 
     def _get_prompt_updates(
@@ -317,7 +350,19 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
         image_token = hf_processor.boi_token
 
         def get_replacement_gemma3(item_idx: int):
-            images = mm_items.get_items("image", ImageProcessorItems)
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
+
+            if isinstance(images, ImageEmbeddingItems):
+                # For image embedding inputs, only support no crops cases
+                # since it's not supported in hf processor anyway
+                return self.info.get_image_repl(
+                    image_width=None,
+                    image_height=None,
+                    num_crops=0,
+                    processor=hf_processor,
+                )
 
             image_size = images.get_image_size(item_idx)
             return self.info.get_image_repl(
@@ -339,8 +384,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
         prompt: list[int],
         mm_prompt_updates: MultiModalPromptUpdates,
     ) -> tuple[list[int], MultiModalPromptUpdatesApplyResult]:
-        token_ids, res = super()._apply_token_matches(prompt,
-                                                      mm_prompt_updates)
+        token_ids, res = super()._apply_token_matches(prompt, mm_prompt_updates)
 
         # "\n\n\n" and "\n\n\n\n" are single tokens
         # Since our replacement can insert "\n\n" next to "\n"
@@ -399,8 +443,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
             repl_token_ids.extend(repl_toks)
             repl_orig_idxs.extend(orig_idx for _ in range(len(repl_toks)))
 
-        repls = super()._find_mm_placeholders(repl_token_ids,
-                                              mm_prompt_updates)
+        repls = super()._find_mm_placeholders(repl_token_ids, mm_prompt_updates)
 
         return {
             modality: [
@@ -410,39 +453,43 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
                     start_idx=repl_orig_idxs[p.start_idx],
                     tokens=p.tokens,
                     is_embed=p.is_embed,
-                ) for p in placeholders
+                )
+                for p in placeholders
             ]
             for modality, placeholders in repls.items()
         }
 
 
 class Gemma3MultiModalProjector(nn.Module):
-
     def __init__(self, config: Gemma3Config):
         super().__init__()
 
         self.mm_input_projection_weight = nn.Parameter(
-            torch.zeros(config.vision_config.hidden_size,
-                        config.text_config.hidden_size))
+            torch.zeros(
+                config.vision_config.hidden_size, config.text_config.hidden_size
+            )
+        )
 
         self.mm_soft_emb_norm = GemmaRMSNorm(
-            config.vision_config.hidden_size,
-            eps=config.vision_config.layer_norm_eps)
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
 
-        self.patches_per_image = int(config.vision_config.image_size //
-                                     config.vision_config.patch_size)
+        self.patches_per_image = int(
+            config.vision_config.image_size // config.vision_config.patch_size
+        )
         self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
         self.kernel_size = self.patches_per_image // self.tokens_per_side
-        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size,
-                                     stride=self.kernel_size)
+        self.avg_pool = nn.AvgPool2d(
+            kernel_size=self.kernel_size, stride=self.kernel_size
+        )
 
     def forward(self, vision_outputs: torch.Tensor):
         batch_size, _, seq_length = vision_outputs.shape
 
         reshaped_vision_outputs = vision_outputs.transpose(1, 2)
         reshaped_vision_outputs = reshaped_vision_outputs.reshape(
-            batch_size, seq_length, self.patches_per_image,
-            self.patches_per_image)
+            batch_size, seq_length, self.patches_per_image, self.patches_per_image
+        )
         reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
 
         pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
@@ -452,15 +499,21 @@ class Gemma3MultiModalProjector(nn.Module):
         normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
 
         projected_vision_outputs = torch.matmul(
-            normed_vision_outputs, self.mm_input_projection_weight)
+            normed_vision_outputs, self.mm_input_projection_weight
+        )
         return projected_vision_outputs.type_as(vision_outputs)
 
 
-@MULTIMODAL_REGISTRY.register_processor(Gemma3MultiModalProcessor,
-                                        info=Gemma3ProcessingInfo,
-                                        dummy_inputs=Gemma3DummyInputsBuilder)
-class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
-                                     SupportsLoRA):
+@MULTIMODAL_REGISTRY.register_processor(
+    Gemma3MultiModalProcessor,
+    info=Gemma3ProcessingInfo,
+    dummy_inputs=Gemma3DummyInputsBuilder,
+)
+class Gemma3ForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
+):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -480,10 +533,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<start_of_image>"
 
@@ -498,10 +552,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         self.quant_config = quant_config
         self.multimodal_config = multimodal_config
 
-        self.vision_tower = SiglipVisionModel(config.vision_config,
-                                              quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "vision_tower"))
+        self.vision_tower = SiglipVisionModel(
+            config.vision_config,
+            quant_config,
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         self.multi_modal_projector = Gemma3MultiModalProjector(config)
 
         self.language_model = init_vllm_registered_model(
@@ -518,38 +573,32 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
             self.language_model.logits_processor.scale *= logit_scale
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     @property
     def dtype(self):
         return next(self.parameters()).dtype
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Gemma3ImageInputs]:
+        self, **kwargs: object
+    ) -> Gemma3ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
-        num_crops = kwargs.pop("num_crops", None)
+        num_patches = kwargs.pop("num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
-        assert image_embeds is None, "Gemma3 does not support image_embeds."
-        if pixel_values is None:
-            return None
-
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
-
-        if not isinstance(num_crops, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of num_crops. "
-                             f"Got type: {type(num_crops)}")
 
-        image_size = self.config.vision_config.image_size
-
-        return Gemma3ImagePixelInputs(
-            pixel_values=flatten_bn(pixel_values, concat=True),
-            num_patches=flatten_bn(num_crops, concat=True) + 1,
-            resolve_bindings={
-                "h": image_size,
-                "w": image_size
-            })
+        if pixel_values is not None:
+            image_size = self.config.vision_config.image_size
+            return Gemma3ImagePixelInputs(
+                pixel_values=pixel_values,
+                num_patches=num_patches,
+                resolve_bindings={"h": image_size, "w": image_size},
+            )
+        elif image_embeds is not None:
+            return Gemma3ImageEmbeddingInputs(
+                image_embeds=image_embeds,
+                type="image_embeds",
+            )
 
     def _image_pixels_to_features(
         self,
@@ -561,7 +610,9 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
     def _process_image_input(
         self,
         image_input: Gemma3ImageInputs,
-    ) -> list[torch.Tensor]:
+    ) -> torch.Tensor | list[torch.Tensor]:
+        if image_input["type"] == "image_embeds":
+            return image_input["image_embeds"]
         assert self.vision_tower is not None
 
         pixel_values = image_input["pixel_values"]
@@ -573,67 +624,36 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         )
         image_embeds = self.multi_modal_projector(image_features)
 
-        return [
-            e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
-        ]
+        return [e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())]
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
+    def forward(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-        return inputs_embeds
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object) -> IntermediateTensors:
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            if (vision_embeddings is not None) and len(vision_embeddings) != 0:
-                kwargs = self.prepare_attn_masks(
-                    input_ids,
-                    positions,
-                    mask_dtype=self.dtype,
-                    **kwargs,
-                )
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds,
-                                                  **kwargs)
+        hidden_states = self.language_model.model(
+            input_ids,
+            positions,
+            intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
 
         return hidden_states
 
@@ -681,7 +701,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
 
             # Consider the bidirectional attention between image tokens.
             img_mask = torch.zeros_like(global_attn_mask)
-            img_pos = (input_token_ids == self.config.image_token_index)
+            img_pos = input_token_ids == self.config.image_token_index
             img_mask[:, :, :, img_pos] += 1
             img_mask[:, :, img_pos, :] += 1
             global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
@@ -691,10 +711,10 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
             if sliding_window is not None:
                 # Create a local causal mask with sliding window (1024).
                 local_attn_mask = torch.ones_like(global_attn_mask)
-                local_attn_mask = torch.tril(local_attn_mask,
-                                             diagonal=-sliding_window)
-                local_attn_mask = torch.where(local_attn_mask == 0,
-                                              global_attn_mask, float("-inf"))
+                local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window)
+                local_attn_mask = torch.where(
+                    local_attn_mask == 0, global_attn_mask, float("-inf")
+                )
                 local_attn_masks.append(local_attn_mask)
         kwargs["global_attn_masks"] = global_attn_masks
         kwargs["local_attn_masks"] = local_attn_masks
@@ -703,11 +723,10 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -718,4 +737,5 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="multi_modal_projector",
-            tower_model="vision_tower")
+            tower_model="vision_tower",
+        )
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index 0b6bccb3349824c3dc676ce68491c63f84da8b2b..64443190f53edf63aa0c6f990c2e82818db48774 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -16,7 +16,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -28,28 +27,38 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
-                                                   GeluAndMul,
-                                                   GeluAndMulSparse)
+from vllm.model_executor.layers.activation import (
+    _ACTIVATION_REGISTRY,
+    GeluAndMul,
+    GeluAndMulSparse,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata
 
 from .interfaces import SupportsQuant
-from .utils import (AutoWeightsLoader, extract_layer_index,
-                    is_pp_missing_parameter, make_layers, maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -110,9 +119,11 @@ class Gemma3nAltUp(nn.Module):
             eps=rms_norm_eps,
         )
         self.router_input_scale = torch.tensor(
-            hidden_size**-1.0, dtype=self.modality_router.weight.dtype)
+            hidden_size**-1.0, dtype=self.modality_router.weight.dtype
+        )
         self.correct_output_scale = nn.Parameter(
-            torch.zeros(hidden_size, dtype=torch.float32))
+            torch.zeros(hidden_size, dtype=torch.float32)
+        )
 
     def _compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
         router_inputs = self.router_norm(x) * self.router_input_scale
@@ -120,15 +131,17 @@ class Gemma3nAltUp(nn.Module):
         return torch.tanh(routed.float()).type_as(x)
 
     def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
-        return (corrected.type_as(self.correct_output_scale) *
-                self.correct_output_scale).type_as(corrected)
+        return (
+            corrected.type_as(self.correct_output_scale) * self.correct_output_scale
+        ).type_as(corrected)
 
     def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # hidden:       [altup_num_inputs, num_tokens, hidden_size]
         # modalities:   [num_tokens, num_altup_inputs]
         # all_coefs:    [num_tokens, num_altup_inputs ** 2]
         modalities = self._compute_router_modalities(
-            hidden_states[self.altup_active_idx])
+            hidden_states[self.altup_active_idx]
+        )
         all_coefs = self.prediction_coefs(modalities)
 
         # Reshape and transpose the 2D matrix for the matmul.
@@ -146,8 +159,9 @@ class Gemma3nAltUp(nn.Module):
         predictions += hidden_states
         return predictions.contiguous()
 
-    def correct(self, predictions: torch.Tensor,
-                activated: torch.Tensor) -> torch.Tensor:
+    def correct(
+        self, predictions: torch.Tensor, activated: torch.Tensor
+    ) -> torch.Tensor:
         # predictions:  [altup_num_inputs, num_tokens, hidden_size]
         # activated:    [num_tokens, hidden_size]
         # modalities:   [num_tokens, altup_num_inputs]
@@ -181,7 +195,7 @@ class Gemma3nLaurelBlock(nn.Module):
         laurel_rank: int,
         rms_norm_eps: float,
         *,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str,
     ) -> None:
         super().__init__()
@@ -215,14 +229,13 @@ class Gemma3nLaurelBlock(nn.Module):
 
 
 class Gemma3nMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_activation: str,
         activation_sparsity: float = 0.0,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -244,12 +257,16 @@ class Gemma3nMLP(nn.Module):
             raise ValueError(
                 "Gemma3 uses `gelu_pytorch_tanh` as the hidden activation "
                 "function. Please set `hidden_act` and `hidden_activation` to "
-                "`gelu_pytorch_tanh`.")
+                "`gelu_pytorch_tanh`."
+            )
 
-        self.act_fn = GeluAndMulSparse(
-            activation_sparsity=activation_sparsity,
-            approximate="tanh") if activation_sparsity > 0.0 else GeluAndMul(
-                approximate="tanh")
+        self.act_fn = (
+            GeluAndMulSparse(
+                activation_sparsity=activation_sparsity, approximate="tanh"
+            )
+            if activation_sparsity > 0.0
+            else GeluAndMul(approximate="tanh")
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         gate_up, _ = self.gate_up_proj(x)
@@ -259,17 +276,18 @@ class Gemma3nMLP(nn.Module):
 
 
 class Gemma3nAttention(nn.Module):
-
-    def __init__(self,
-                 config: Gemma3nTextConfig,
-                 hidden_size: int,
-                 num_heads: int,
-                 num_kv_heads: int,
-                 head_dim: int,
-                 max_position_embeddings: int,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        max_position_embeddings: int,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.hidden_size = hidden_size
@@ -307,13 +325,11 @@ class Gemma3nAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.o_proj",
         )
-        self.q_norm = RMSNorm(hidden_size=self.head_dim,
-                              eps=config.rms_norm_eps)
-        self.k_norm = RMSNorm(hidden_size=self.head_dim,
-                              eps=config.rms_norm_eps)
-        self.v_norm = RMSNorm(hidden_size=self.head_dim,
-                              eps=config.rms_norm_eps,
-                              has_weight=False)
+        self.q_norm = RMSNorm(hidden_size=self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(hidden_size=self.head_dim, eps=config.rms_norm_eps)
+        self.v_norm = RMSNorm(
+            hidden_size=self.head_dim, eps=config.rms_norm_eps, has_weight=False
+        )
 
         layer_idx = extract_layer_index(prefix)
         is_sliding = config.layer_types[layer_idx] == "sliding_attention"
@@ -329,8 +345,9 @@ class Gemma3nAttention(nn.Module):
             rope_theta = config.rope_theta
             rope_scaling = config.rope_scaling
 
-        first_kv_shared_layer_idx = (config.num_hidden_layers -
-                                     config.num_kv_shared_layers)
+        first_kv_shared_layer_idx = (
+            config.num_hidden_layers - config.num_kv_shared_layers
+        )
         self.is_kv_shared = layer_idx >= first_kv_shared_layer_idx
 
         kv_sharing_target_layer_name = None
@@ -340,8 +357,27 @@ class Gemma3nAttention(nn.Module):
             offset = 2 if self.sliding_window is not None else 1
             kv_shared_layer_index = first_kv_shared_layer_idx - offset
             if kv_shared_layer_index >= 0:
+                # Different model wrappers expose layer parameters under
+                # different parent attributes.
+                # For example:
+                #   - Gemma3nForCausalLM → parameters live under "model.layers"
+                #   - Gemma3nForConditionalGeneration →
+                #     under "language_model.model.layers"
+                # This logic extracts the portion of the parameter name
+                # *before* ".layers."
+                # so downstream code can consistently reference the correct
+                # model root regardless of which wrapper class was used.
+                if ".layers." in prefix:
+                    param_name_before_layers = prefix.split(".layers.")[0]
+                else:
+                    raise ValueError(
+                        "Unexpected prefix format for Gemma3nAttention: "
+                        f"'{prefix}'. The prefix is expected to contain "
+                        "'.layers.' to correctly determine the KV sharing "
+                        "target layer."
+                    )
                 # Only the greater layer is required to specify sharing.
-                kv_sharing_target_layer_name = f"language_model.model.layers.{kv_shared_layer_index}.self_attn.attn"  # noqa: E501
+                kv_sharing_target_layer_name = f"{param_name_before_layers}.layers.{kv_shared_layer_index}.self_attn.attn"  # noqa: E501
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -361,7 +397,8 @@ class Gemma3nAttention(nn.Module):
             quant_config=quant_config,
             per_layer_sliding_window=self.sliding_window,
             kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            prefix=f"{prefix}.attn")
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -390,12 +427,11 @@ class Gemma3nAttention(nn.Module):
 
 
 class Gemma3nDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Gemma3nTextConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -426,12 +462,12 @@ class Gemma3nDecoderLayer(nn.Module):
         self.mlp = Gemma3nMLP(
             hidden_size=config.hidden_size,
             # NOTE: Matformer https://github.com/huggingface/transformers/blob/a52478253bbe522a420e88ea3940d4d98a935300/src/transformers/models/gemma3n/modular_gemma3n.py#L258 # noqa: E501
-            intermediate_size=config.intermediate_size[extract_layer_index(
-                prefix)],
+            intermediate_size=config.intermediate_size[extract_layer_index(prefix)],
             hidden_activation=config.hidden_activation,
             quant_config=quant_config,
             activation_sparsity=config.activation_sparsity_pattern[
-                extract_layer_index(prefix)],
+                extract_layer_index(prefix)
+            ],
             prefix=f"{prefix}.mlp",
         )
         self.laurel = Gemma3nLaurelBlock(
@@ -493,7 +529,6 @@ class Gemma3nDecoderLayer(nn.Module):
         per_layer_input: torch.Tensor,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-
         # ActUp (predict).
         predictions = self.altup.predict(hidden_states)
         active_prediction = predictions[self.altup_active_idx]
@@ -508,8 +543,7 @@ class Gemma3nDecoderLayer(nn.Module):
         )
         attn = self.post_attention_layernorm(attn)
         attn_gated = attn + active_prediction
-        attn_laurel = (attn_gated + laurel_output) / torch.sqrt(
-            torch.tensor(2.0))
+        attn_laurel = (attn_gated + laurel_output) / torch.sqrt(torch.tensor(2.0))
 
         # MLP.
         attn_norm = self.pre_feedforward_layernorm(attn_laurel)
@@ -518,8 +552,7 @@ class Gemma3nDecoderLayer(nn.Module):
         attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm
 
         # ActUp (connect).
-        corrected_predictions = self.altup.correct(predictions,
-                                                   attn_ffw_laurel_gated)
+        corrected_predictions = self.altup.correct(predictions, attn_ffw_laurel_gated)
         first_prediction = corrected_predictions[self.altup_active_idx]
         first_prediction = self.altup.scale_corrected_output(first_prediction)
 
@@ -537,8 +570,9 @@ class Gemma3nDecoderLayer(nn.Module):
 
 
 # This enables torch.compile if --kv-sharing-fast-prefill passed
-@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
-                       kv_sharing_fast_prefill)
+@support_torch_compile(
+    enable_if=lambda vllm_config: vllm_config.cache_config.kv_sharing_fast_prefill
+)
 class Gemma3nSelfDecoder(nn.Module):
     """
     Includes altup embedding and self decoder layers
@@ -595,39 +629,46 @@ class Gemma3nSelfDecoder(nn.Module):
             eps=config.rms_norm_eps,
         )
         self.per_layer_input_scale = torch.rsqrt(torch.tensor(2.0)).to(
-            self.embed_tokens.weight.dtype)
+            self.embed_tokens.weight.dtype
+        )
         self.per_layer_projection_scale = torch.tensor(
             config.hidden_size**0.5,
             dtype=self.embed_tokens.weight.dtype,
         )
-        self.altup_projections = nn.ModuleList([
-            ColumnParallelLinear(
-                config.hidden_size,
-                config.hidden_size,
-                bias=False,
-                gather_output=True,
-                return_bias=False,
-                quant_config=quant_config,
-                prefix=f"{prefix}.altup_projections.{idx-1}",
-            ) for idx in range(1, self.config.altup_num_inputs)
-        ])
-
-    def get_per_layer_input_embeddings(
-            self, input_ids: torch.Tensor) -> torch.Tensor:
+        self.altup_projections = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    config.hidden_size,
+                    config.hidden_size,
+                    bias=False,
+                    gather_output=True,
+                    return_bias=False,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.altup_projections.{idx - 1}",
+                )
+                for idx in range(1, self.config.altup_num_inputs)
+            ]
+        )
+
+    def get_per_layer_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         # Deal with the fact that vocab_size_per_layer_input < vocab_size
         # which causes us to have some out of vocab tokens by setting
         # those token ids to 0. This matches the HF implementation.
         per_layer_inputs_mask = torch.logical_and(
-            input_ids >= 0, input_ids < self.config.vocab_size_per_layer_input)
-        per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids,
-                                              torch.zeros_like(input_ids))
-        return self.embed_tokens_per_layer(
-            per_layer_inputs_tokens) * self.embed_scale_per_layer
+            input_ids >= 0, input_ids < self.config.vocab_size_per_layer_input
+        )
+        per_layer_inputs_tokens = torch.where(
+            per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)
+        )
+        return (
+            self.embed_tokens_per_layer(per_layer_inputs_tokens)
+            * self.embed_scale_per_layer
+        )
 
     def get_per_layer_inputs(
         self,
         hidden_states_0: torch.Tensor,
-        per_layer_inputs: Optional[torch.Tensor],
+        per_layer_inputs: torch.Tensor | None,
     ) -> torch.Tensor:
         per_layer_projection = self.per_layer_model_projection(hidden_states_0)
         per_layer_projection = per_layer_projection.reshape(
@@ -635,8 +676,7 @@ class Gemma3nSelfDecoder(nn.Module):
             self.config.num_hidden_layers,
             self.config.hidden_size_per_layer_input,
         )
-        per_layer_projection = self.per_layer_projection_norm(
-            per_layer_projection)
+        per_layer_projection = self.per_layer_projection_norm(per_layer_projection)
         if per_layer_inputs is not None:
             # Profiling run does not compute per_layer_inputs
             per_layer_inputs = per_layer_projection + per_layer_inputs
@@ -645,21 +685,19 @@ class Gemma3nSelfDecoder(nn.Module):
             per_layer_inputs = per_layer_projection
         return per_layer_inputs
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids) * self.embed_scale
 
     def altup_embed(self, hidden_states_0: torch.Tensor) -> torch.Tensor:
         # Altup embed.
         hidden_states = [hidden_states_0] * self.config.altup_num_inputs
-        target_magnitude = torch.mean(hidden_states_0**2, dim=-1,
-                                      keepdim=True)**0.5
+        target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5
         for i in range(1, self.config.altup_num_inputs):
             hidden_states[i] = self.altup_projections[i - 1](hidden_states[i])
-            new_magnitude = torch.mean(hidden_states[i]**2,
-                                       dim=-1,
-                                       keepdim=True)**0.5
-            hidden_states[i] *= target_magnitude / torch.maximum(
-                new_magnitude, EPS)
+            new_magnitude = (
+                torch.mean(hidden_states[i] ** 2, dim=-1, keepdim=True) ** 0.5
+            )
+            hidden_states[i] *= target_magnitude / torch.maximum(new_magnitude, EPS)
         hidden_states = torch.stack(hidden_states, dim=-1)
         return hidden_states
 
@@ -667,17 +705,18 @@ class Gemma3nSelfDecoder(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        per_layer_inputs: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
+        per_layer_inputs: torch.Tensor | None = None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
             hidden_states_0 = inputs_embeds
         else:
-            hidden_states_0 = self.get_input_embeddings(input_ids)
+            hidden_states_0 = self.embed_input_ids(input_ids)
 
         adjusted_per_layer_inputs = self.get_per_layer_inputs(
-            hidden_states_0, per_layer_inputs)
+            hidden_states_0, per_layer_inputs
+        )
         hidden_states = self.altup_embed(hidden_states_0)
 
         # [altnum_inputs, num_tokens, hidden_size]
@@ -700,8 +739,9 @@ class Gemma3nSelfDecoder(nn.Module):
 
 
 # This enables torch.compile if --kv-sharing-fast-prefill passed
-@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
-                       kv_sharing_fast_prefill)
+@support_torch_compile(
+    enable_if=lambda vllm_config: vllm_config.cache_config.kv_sharing_fast_prefill
+)
 class Gemma3nCrossDecoder(nn.Module):
     """
     Cross-decoder layers
@@ -743,10 +783,10 @@ class Gemma3nCrossDecoder(nn.Module):
 
 
 # This disables torch.compile if --kv-sharing-fast-prefill passed
-@support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
-                       cache_config.kv_sharing_fast_prefill)
+@support_torch_compile(
+    enable_if=lambda vllm_config: not vllm_config.cache_config.kv_sharing_fast_prefill
+)
 class Gemma3nTextModel(nn.Module, SupportsQuant):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -755,27 +795,33 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         self.config = config
         self.quant_config = quant_config
 
-        self.altup_unembed_projections = nn.ModuleList([
-            ColumnParallelLinear(
-                config.hidden_size,
-                config.hidden_size,
-                bias=False,
-                gather_output=True,
-                return_bias=False,
-                quant_config=quant_config,
-                prefix=f"{prefix}.altup_unembed_projections.{idx-1}",
-            ) for idx in range(1, self.config.altup_num_inputs)
-        ])
+        self.altup_unembed_projections = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    config.hidden_size,
+                    config.hidden_size,
+                    bias=False,
+                    gather_output=True,
+                    return_bias=False,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.altup_unembed_projections.{idx - 1}",
+                )
+                for idx in range(1, self.config.altup_num_inputs)
+            ]
+        )
 
         # Allocate config.num_kv_shared_layers layers for self-decoder
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Gemma3nDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
 
-        first_kv_shared_layer_idx = (config.num_hidden_layers -
-                                     config.num_kv_shared_layers)
+        first_kv_shared_layer_idx = (
+            config.num_hidden_layers - config.num_kv_shared_layers
+        )
 
         # NOTE(sarckk): importing this top level seems to cause issues
         # during running of tests.
@@ -810,18 +856,20 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
             # TODO(sarckk): Extract this functionality to interface
             max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
             device = next(self.parameters()).device
-            self.positions = torch.zeros(max_num_tokens,
-                                         dtype=torch.int64,
-                                         device=device)
+            self.positions = torch.zeros(
+                max_num_tokens, dtype=torch.int64, device=device
+            )
             self.hidden_states = torch.zeros(
-                (max_num_tokens, config.hidden_size,
-                 self.config.altup_num_inputs),
+                (max_num_tokens, config.hidden_size, self.config.altup_num_inputs),
                 dtype=self.embed_tokens.weight.dtype,
                 device=device,
             )
             self.per_layer_inputs = torch.zeros(
-                (max_num_tokens, self.config.num_hidden_layers,
-                 self.config.hidden_size_per_layer_input),
+                (
+                    max_num_tokens,
+                    self.config.num_hidden_layers,
+                    self.config.hidden_size_per_layer_input,
+                ),
                 dtype=self.embed_tokens.weight.dtype,
                 device=device,
             )
@@ -830,46 +878,44 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
     def embed_tokens(self):
         return self.self_decoder.embed_tokens
 
-    def get_per_layer_input_embeddings(
-            self, input_ids: torch.Tensor) -> torch.Tensor:
+    def get_per_layer_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.self_decoder.get_per_layer_input_embeddings(input_ids)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.self_decoder.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.self_decoder.embed_input_ids(input_ids)
 
     def fast_prefill_forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        per_layer_inputs: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
+        per_layer_inputs: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         logits_indices_padded, num_logits_indices = None, None
         attn_metadata = get_forward_context().attn_metadata
 
         # attn_metadata is None during dummy runs
-        if (self.fast_prefill_enabled and attn_metadata is not None):
+        if self.fast_prefill_enabled and attn_metadata is not None:
             assert isinstance(attn_metadata, dict)
             # Last layer is a KV sharing layer
             layer_attn_metadata = attn_metadata[
-                self.layers[-1].self_attn.attn.layer_name]
-            if (isinstance(layer_attn_metadata, KVSharingFastPrefillMetadata)):
-                logits_indices_padded = (
-                    layer_attn_metadata.logits_indices_padded)
+                self.layers[-1].self_attn.attn.layer_name
+            ]
+            if isinstance(layer_attn_metadata, KVSharingFastPrefillMetadata):
+                logits_indices_padded = layer_attn_metadata.logits_indices_padded
                 num_logits_indices = layer_attn_metadata.num_logits_indices
 
         # Copy inputs for cudagraph
         batch_size = positions.size(0)
         self.positions[:batch_size].copy_(positions)
-        self_decoder_hidden_states, per_layer_inputs_adjusted = \
-            self.self_decoder(
-                input_ids=input_ids,
-                positions=self.positions[:batch_size],
-                inputs_embeds=inputs_embeds,
-                per_layer_inputs=per_layer_inputs,
-                **kwargs,
-            )
+        self_decoder_hidden_states, per_layer_inputs_adjusted = self.self_decoder(
+            input_ids=input_ids,
+            positions=self.positions[:batch_size],
+            inputs_embeds=inputs_embeds,
+            per_layer_inputs=per_layer_inputs,
+            **kwargs,
+        )
 
         if logits_indices_padded is None:
             logits_indices_padded = torch.arange(
@@ -889,11 +935,14 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         # Copy inputs for cudagraph
         num_padded_logits_indices = logits_indices_padded.size(0)
         self.positions[:num_padded_logits_indices].copy_(
-            positions[logits_indices_padded])
+            positions[logits_indices_padded]
+        )
         self.hidden_states[:num_padded_logits_indices].copy_(
-            self_decoder_hidden_states[logits_indices_padded])
+            self_decoder_hidden_states[logits_indices_padded]
+        )
         self.per_layer_inputs[:num_padded_logits_indices].copy_(
-            per_layer_inputs_adjusted[logits_indices_padded])
+            per_layer_inputs_adjusted[logits_indices_padded]
+        )
         cross_decoder_hidden_states = self.cross_decoder(
             positions=self.positions[:num_padded_logits_indices],
             hidden_states=self.hidden_states[:num_padded_logits_indices],
@@ -905,7 +954,8 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
             assert num_logits_indices > 0
             # Merge cross-decoder and self-decoder hidden states
             hidden_states[logits_indices_padded[:num_logits_indices]] = (
-                cross_decoder_hidden_states[:num_logits_indices])
+                cross_decoder_hidden_states[:num_logits_indices]
+            )
         else:
             hidden_states = cross_decoder_hidden_states
 
@@ -915,8 +965,8 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        per_layer_inputs: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
+        per_layer_inputs: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states, per_layer_inputs = self.self_decoder(
@@ -939,30 +989,32 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
         # Altup unembed.
-        target_magnitude = torch.mean(hidden_states[..., 0]**2,
-                                      dim=-1,
-                                      keepdim=True)**0.5
+        target_magnitude = (
+            torch.mean(hidden_states[..., 0] ** 2, dim=-1, keepdim=True) ** 0.5
+        )
         for i in range(1, self.config.altup_num_inputs):
             hidden_states[..., i] = self.altup_unembed_projections[i - 1](
-                hidden_states[..., i])
-            new_magnitude = torch.mean(hidden_states[..., i]**2,
-                                       dim=-1,
-                                       keepdim=True)**0.5
+                hidden_states[..., i]
+            )
+            new_magnitude = (
+                torch.mean(hidden_states[..., i] ** 2, dim=-1, keepdim=True) ** 0.5
+            )
             hidden_states[..., i] *= target_magnitude / torch.maximum(
-                new_magnitude, EPS)
+                new_magnitude, EPS
+            )
         # [num_tokens,hidden_size, altup_num_inputs] -> [num_tokens,hidden_size]
         hidden_states = torch.mean(hidden_states, dim=-1)
         return hidden_states
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        per_layer_inputs: Optional[torch.Tensor] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: torch.Tensor | None = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if self.fast_prefill_enabled:
             hidden_states = self.fast_prefill_forward(
                 input_ids,
@@ -982,8 +1034,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         hidden_states = self.altup_unembed(hidden_states)
         return self.norm(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -997,22 +1048,24 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         for name, loaded_weight in weights:
             # decoder layer weights, altup_unembed_projections and rmsnorm
             # are initialized in text model, others are in self decoder
-            if (not name.startswith('layers')
-                    and not name.startswith('altup_unembed_projections')
-                    and not name.startswith('norm')):
+            if (
+                not name.startswith("layers")
+                and not name.startswith("altup_unembed_projections")
+                and not name.startswith("norm")
+            ):
                 name = f"self_decoder.{name}"
 
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache scales for compressed-tensors quantization
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 loaded_weight = loaded_weight[0]
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, shard_name, shard_id) in stacked_params_mapping:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
                     continue
                 # Avoid spurious match with ".up_proj".
@@ -1039,8 +1092,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
@@ -1062,30 +1114,30 @@ class Gemma3nForCausalLM(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
-        lora_config = vllm_config.lora_config
-        del lora_config  # Unused.
+
         super().__init__()
         self.config = config
         self.cache_config = vllm_config.cache_config
-        self.model = Gemma3nTextModel(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"))
+        self.model = Gemma3nTextModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.logits_processor = LogitsProcessor(
-            config.vocab_size, soft_cap=config.final_logit_softcapping)
+            config.vocab_size, soft_cap=config.final_logit_softcapping
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         *,
-        per_layer_inputs: Optional[torch.Tensor] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: torch.Tensor | None = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids,
             positions,
@@ -1099,15 +1151,15 @@ class Gemma3nForCausalLM(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self,
-                                   skip_substrs=([
-                                       "embed_audio.", "embed_vision.",
-                                       "audio_tower.", "vision_tower."
-                                   ]))
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_substrs=(
+                ["embed_audio.", "embed_vision.", "audio_tower.", "vision_tower."]
+            ),
+        )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 2acdba54a257dcb29baab2e0024fdba6400f6729..6ae76976eb46ca9117ad75d9bd0f07a878e321a0 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -1,52 +1,66 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Literal, Optional, TypedDict, Union, cast
+from typing import Annotated, Any, Literal, Optional, Union, cast
 
 import numpy as np
 import torch
-# yapf: disable
+
 from torch import nn
 from transformers import AutoModel, BatchFeature
-from transformers.models.gemma3n import (Gemma3nAudioConfig,
-                                         Gemma3nAudioFeatureExtractor,
-                                         Gemma3nConfig, Gemma3nProcessor,
-                                         Gemma3nTextConfig,
-                                         Gemma3nVisionConfig)
+from transformers.models.gemma3n import (
+    Gemma3nAudioConfig,
+    Gemma3nAudioFeatureExtractor,
+    Gemma3nConfig,
+    Gemma3nProcessor,
+    Gemma3nTextConfig,
+    Gemma3nVisionConfig,
+)
 from transformers.models.siglip import SiglipImageProcessorFast
 
 from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import RowParallelLinear
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalPromptUpdates,
-                                        MultiModalPromptUpdatesApplyResult,
-                                        PlaceholderFeaturesInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails,
-                                        replace_token_matches)
-# yapf: enable
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalPromptUpdates,
+    MultiModalPromptUpdatesApplyResult,
+    PlaceholderFeaturesInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+    replace_token_matches,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
-                         SupportsTranscription)
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -55,24 +69,36 @@ TOKENS_PER_IMAGE = 256
 TOKENS_PER_AUDIO = 188
 
 
-class Gemma3nImagePixelInputs(TypedDict):
-    pixel_values: torch.Tensor
-    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+class Gemma3nImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each patch
+        - w: Width of each patch
+    """
+
+    type: Literal["pixel_values"] = "pixel_values"
+    pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
 
-class Gemma3nAudioInputs(TypedDict):
-    input_features: Union[torch.Tensor, list[torch.Tensor]]
-    input_features_padded: torch.Tensor
-    """Shape: `(batch_size * num_audio, seq_length, num_features)`"""
-    input_features_mask: torch.Tensor
-    """Shape: `(batch_size * num_audio, seq_length)`"""
+class Gemma3nAudioInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of audios
+        - s: seq_length
+        - f: num_features
+    """
+
+    type: Literal["audio"] = "audio"
+    input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
+    input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
 
 
 Gemma3nImageInputs = Gemma3nImagePixelInputs
 
 
 class Gemma3nProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Gemma3nConfig)
 
@@ -83,9 +109,8 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
         return {"image": None, "audio": None}
 
     def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
-
+        self, seq_len: int, mm_counts: Mapping[str, int]
+    ) -> Optional[Mapping[str, int]]:
         return {"image": TOKENS_PER_IMAGE, "audio": TOKENS_PER_AUDIO}
 
     def get_image_repl(
@@ -97,7 +122,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
     ) -> str:
         """
         Get the replacement text for image tokens.
-        
+
         For Gemma3n, this should return the full_image_sequence which includes
         BOI token, repeated image tokens, and EOI token.
         """
@@ -105,7 +130,8 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
             processor = self.get_hf_processor()
 
         return PromptUpdateDetails.select_token_id(
-            processor.full_image_sequence, processor.image_token_id)
+            processor.full_image_sequence, processor.image_token_id
+        )
 
     def get_audio_repl(
         self,
@@ -114,7 +140,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
     ) -> str:
         """
         Get the replacement text for audio tokens.
-        
+
         For Gemma3n, this should return the full_audio_sequence which includes
         BOA token, repeated audio tokens, and EOA token.
         """
@@ -123,11 +149,11 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
 
         # Return the full audio sequence as defined by the processor
         return PromptUpdateDetails.select_token_id(
-            processor.full_audio_sequence, processor.audio_token_id)
+            processor.full_audio_sequence, processor.audio_token_id
+        )
 
 
 class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_audios = mm_counts.get("audio", 0)
@@ -142,29 +168,36 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_audios = mm_counts.get("audio", 0)
         processor = self.info.get_hf_processor()
-        audio_feature_extractor: Gemma3nAudioFeatureExtractor = processor.feature_extractor  # noqa: E501
+        audio_feature_extractor: Gemma3nAudioFeatureExtractor = (
+            processor.feature_extractor
+        )
         audio_len = audio_feature_extractor.fft_length
         image_processor: SiglipImageProcessorFast = processor.image_processor
         img_width = image_processor.size.get("width", 224)
         img_height = image_processor.size.get("height", 224)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=img_width,
-                                   height=img_height,
-                                   num_images=num_images),
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+            "image": self._get_dummy_images(
+                width=img_width,
+                height=img_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "audio": self._get_dummy_audios(
+                length=audio_len, num_audios=num_audios, overrides=audio_overrides
+            ),
         }
 
 
-class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
-                                 ):
-
+class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_hf_processor().feature_extractor
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
@@ -176,12 +209,11 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-
         # HF Transformers audio processor no longer accepts `audios` key.
         # We pop `audios` and replace it with `audio` key to suppress
         # the warning.
-        if 'audios' in mm_data:
-            mm_data['audio'] = mm_data.pop('audios')
+        if "audios" in mm_data:
+            mm_data["audio"] = mm_data.pop("audios")
         processed_outputs = super()._call_hf_processor(
             prompt,
             mm_data,
@@ -189,15 +221,17 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
             tok_kwargs,
         )
 
-        if 'input_features' in processed_outputs:
+        if "input_features" in processed_outputs:
             # Padding enables audio_tower to run in batched mode
-            processed_outputs["input_features_padded"] = \
-                processed_outputs["input_features"]
+            processed_outputs["input_features_padded"] = processed_outputs[
+                "input_features"
+            ]
 
             # Unpad features here since we need the output of each item to be
             # independent of other items for the cache to work correctly
             unpadded_features = [
-                f[mask] for f, mask in zip(
+                f[mask]
+                for f, mask in zip(
                     processed_outputs["input_features"],
                     processed_outputs["input_features_mask"],
                 )
@@ -210,12 +244,11 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-
         return dict(
             pixel_values=MultiModalFieldConfig.batched("image"),
-            input_features=MultiModalFieldConfig.batched("audio"),
             input_features_padded=MultiModalFieldConfig.batched("audio"),
-            input_features_mask=MultiModalFieldConfig.batched("audio"))
+            input_features_mask=MultiModalFieldConfig.batched("audio"),
+        )
 
     def _get_prompt_updates(
         self,
@@ -245,21 +278,25 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
                     modality="image",
                     target=image_token,
                     replacement=get_replacement_image,
-                ))
+                )
+            )
 
         # Handle audio tokens
         if "audio" in mm_items:
             audio_token = hf_processor.audio_token
 
             def get_replacement_audio(item_idx: int):
-                return self.info.get_audio_repl(processor=hf_processor, )
+                return self.info.get_audio_repl(
+                    processor=hf_processor,
+                )
 
             prompt_updates.append(
                 PromptReplacement(
                     modality="audio",
                     target=audio_token,
                     replacement=get_replacement_audio,
-                ))
+                )
+            )
 
         return prompt_updates
 
@@ -268,8 +305,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
         prompt: list[int],
         mm_prompt_updates: MultiModalPromptUpdates,
     ) -> tuple[list[int], MultiModalPromptUpdatesApplyResult]:
-        token_ids, res = super()._apply_token_matches(prompt,
-                                                      mm_prompt_updates)
+        token_ids, res = super()._apply_token_matches(prompt, mm_prompt_updates)
 
         # "\n\n\n" and "\n\n\n\n" are single tokens
         # Since our replacement can insert "\n\n" next to "\n"
@@ -328,8 +364,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
             repl_token_ids.extend(repl_toks)
             repl_orig_idxs.extend(orig_idx for _ in range(len(repl_toks)))
 
-        repls = super()._find_mm_placeholders(repl_token_ids,
-                                              mm_prompt_updates)
+        repls = super()._find_mm_placeholders(repl_token_ids, mm_prompt_updates)
 
         return {
             modality: [
@@ -339,14 +374,15 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
                     start_idx=repl_orig_idxs[p.start_idx],
                     tokens=p.tokens,
                     is_embed=p.is_embed,
-                ) for p in placeholders
+                )
+                for p in placeholders
             ]
             for modality, placeholders in repls.items()
         }
 
 
 class Gemma3nMultimodalEmbedder(nn.Module):
-    """Embeds token ids or soft tokens for multimodal content into language 
+    """Embeds token ids or soft tokens for multimodal content into language
     model space."""
 
     def __init__(
@@ -406,7 +442,8 @@ class Gemma3nMultimodalEmbedder(nn.Module):
         """  # noqa: E501
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
-                "You must specify exactly one of input_ids or inputs_embeds")
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
 
         if inputs_embeds is not None:
             emb_norm = self.soft_embedding_norm(inputs_embeds)
@@ -418,11 +455,15 @@ class Gemma3nMultimodalEmbedder(nn.Module):
         return self.embedding_post_projection_norm(emb_norm_proj)
 
 
-@MULTIMODAL_REGISTRY.register_processor(Gemma3nMultiModalProcessor,
-                                        info=Gemma3nProcessingInfo,
-                                        dummy_inputs=Gemma3nDummyInputsBuilder)
-class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsTranscription):
+@MULTIMODAL_REGISTRY.register_processor(
+    Gemma3nMultiModalProcessor,
+    info=Gemma3nProcessingInfo,
+    dummy_inputs=Gemma3nDummyInputsBuilder,
+)
+class Gemma3nForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsTranscription
+):
+    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
@@ -448,7 +489,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.multi_modal_projector.": "multi_modal_projector.",
             "lm_head.": "language_model.lm_head.",
             "model": "language_model.model",
-        })
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -462,10 +504,12 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         self.vision_tower = AutoModel.from_config(config=config.vision_config)
         self.audio_tower = AutoModel.from_config(config=config.audio_config)
-        self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config,
-                                                      config.text_config)
-        self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config,
-                                                     config.text_config)
+        self.embed_vision = Gemma3nMultimodalEmbedder(
+            config.vision_config, config.text_config
+        )
+        self.embed_audio = Gemma3nMultimodalEmbedder(
+            config.audio_config, config.text_config
+        )
 
         self.language_model: nn.Module = init_vllm_registered_model(
             vllm_config=vllm_config,
@@ -481,18 +525,12 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
             self.config.text_config.num_hidden_layers,
             self.config.text_config.hidden_size_per_layer_input,
             device=self.language_model.model.embed_tokens.weight.device,
-            dtype=self.language_model.model.embed_tokens.weight.dtype)
-
-    @property
-    def dtype(self):
-        return next(self.parameters()).dtype
-
-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-        # TODO check if there are any
-        return data
+            dtype=self.language_model.model.embed_tokens.weight.dtype,
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Gemma3nImageInputs]:
+        self, **kwargs: object
+    ) -> Optional[Gemma3nImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         # TODO is this the case?
@@ -500,34 +538,22 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         if pixel_values is None:
             return None
 
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
-
-        pixel_values = flatten_bn(pixel_values, concat=True)
-        pixel_values = pixel_values.contiguous()
-
-        return Gemma3nImagePixelInputs(
-            pixel_values=self._validate_pixel_values(pixel_values), )
+        return Gemma3nImagePixelInputs(pixel_values=pixel_values)
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[Gemma3nAudioInputs]:
-        input_features = kwargs.pop("input_features", None)
-        if input_features is None:
+        self, **kwargs: object
+    ) -> Optional[Gemma3nAudioInputs]:
+        input_features_padded = kwargs.pop("input_features_padded", None)
+        if input_features_padded is None:
             return None
 
         input_features_mask = kwargs.pop("input_features_mask", None)
         if input_features_mask is None:
             return None
 
-        input_features_padded = kwargs.pop("input_features_padded", None)
-        if input_features_padded is None:
-            return None
-
         return Gemma3nAudioInputs(
-            input_features=input_features,
-            input_features_mask=input_features_mask,
             input_features_padded=input_features_padded,
+            input_features_mask=input_features_mask,
         )
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -536,14 +562,20 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values", "image_embeds"
-                             ) and "image" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "image"] = self._parse_and_validate_image_input(**kwargs)
-            if input_key == "input_features" \
-                and "audio" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "audio"] = self._parse_and_validate_audio_input(**kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key == "input_features_padded"
+                and "audio" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(
+                    **kwargs
+                )
         return mm_input_by_modality
 
     def _process_image_input(
@@ -553,16 +585,20 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         assert self.vision_tower is not None
 
         pixel_values = image_input["pixel_values"]
-        vision_outputs = self.vision_tower(pixel_values=pixel_values,
-                                           do_pooling=False,
-                                           return_dict=True).last_hidden_state
+        vision_outputs = self.vision_tower(
+            pixel_values=pixel_values, do_pooling=False, return_dict=True
+        ).last_hidden_state
         # TODO try to avoid copy here
         # (batch, channels, height, width) to (batch, height * width, channels)
-        vision_outputs = vision_outputs.reshape(
-            vision_outputs.shape[0],
-            self.config.vision_config.hidden_size,
-            self.config.vision_soft_tokens_per_image,
-        ).permute(0, 2, 1).contiguous()
+        vision_outputs = (
+            vision_outputs.reshape(
+                vision_outputs.shape[0],
+                self.config.vision_config.hidden_size,
+                self.config.vision_soft_tokens_per_image,
+            )
+            .permute(0, 2, 1)
+            .contiguous()
+        )
         # Normalize and embed the soft tokens into language model space.
         vision_outputs *= self.config.vision_config.hidden_size**0.5
         # Return a list of embeddings instead of a batched tensor
@@ -576,8 +612,9 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Run on padded features to enable batching
         input_features = audio_input["input_features_padded"].squeeze(1)
         input_features_mask = audio_input["input_features_mask"].squeeze(1)
-        audio_outputs, audio_mask = self.audio_tower(input_features,
-                                                     ~input_features_mask)
+        audio_outputs, audio_mask = self.audio_tower(
+            input_features, ~input_features_mask
+        )
         audio_features = self.embed_audio(inputs_embeds=audio_outputs)
 
         # ruff: noqa
@@ -587,30 +624,29 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
         # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab.
         # TODO precompute and cache padding
-        audio_padding_toks = torch.tensor([[self.vocab_size - 1]],
-                                          dtype=torch.long,
-                                          device=audio_features.device)
+        audio_padding_toks = torch.tensor(
+            [[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device
+        )
         audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks)
-        audio_features = torch.where(audio_mask.unsqueeze(-1),
-                                     audio_padding_embs, audio_features)
+        audio_features = torch.where(
+            audio_mask.unsqueeze(-1), audio_padding_embs, audio_features
+        )
 
         audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
         extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len  # noqa: E501
         extra_padding_features = audio_padding_embs.expand(
-            audio_batch_size, extra_padding_tokens, audio_embed_dim)
+            audio_batch_size, extra_padding_tokens, audio_embed_dim
+        )
 
-        audio_features = torch.cat((audio_features, extra_padding_features),
-                                   dim=1)
+        audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
         # Return a list of embeddings instead of a batched tensor
         return audio_features.unbind(0)
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
-            **kwargs)
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if mm_input_by_modality is None:
             return []
 
@@ -628,48 +664,57 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
                 multimodal_embeddings.extend(audio_embeddings)
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        *,
+        is_multimodal: Optional[torch.Tensor] = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache
         # them here, as the model  forward has only access to the input_embeds.
         if input_ids is not None:
             per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings(
-                input_ids)
+                input_ids
+            )
             per_layer_inputs = per_layer_inputs.reshape(
-                -1, self.config.text_config.num_hidden_layers,
-                self.config.text_config.hidden_size_per_layer_input)
-            self.per_layer_embeddings[:per_layer_inputs.shape[0]].copy_(
-                per_layer_inputs)
-
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                # NOTE: this order of processing mm items is important
-                [self.config.image_token_id, self.config.audio_token_id])
-        return inputs_embeds
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object) -> IntermediateTensors:
+                -1,
+                self.config.text_config.num_hidden_layers,
+                self.config.text_config.hidden_size_per_layer_input,
+            )
+            self.per_layer_embeddings[: per_layer_inputs.shape[0]].copy_(
+                per_layer_inputs
+            )
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE (NickLucche) During profiling, `get_input_embeddings` is not
+        # NOTE (NickLucche) During profiling, `embed_input_ids` is not
         # called, hence we don't have input_ids to compute PLEs. We simply
         # select a chunk of pre-allocated PLEs. During normal execution,
-        # `get_input_embeddings` is called before forward, hence this slice
+        # `embed_input_ids` is called before forward, hence this slice
         # will contain PLEs computed from the actual input_ids.
-        per_layer_inputs = self.per_layer_embeddings[:inputs_embeds.shape[0]]
+        per_layer_inputs = self.per_layer_embeddings[: inputs_embeds.shape[0]]
 
         hidden_states = self.language_model.model(
             input_ids,
@@ -677,7 +722,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
             per_layer_inputs=per_layer_inputs,
             intermediate_tensors=intermediate_tensors,
             inputs_embeds=inputs_embeds,
-            **kwargs)
+            **kwargs,
+        )
 
         return hidden_states
 
@@ -687,8 +733,7 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
     ) -> Optional[torch.Tensor]:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -699,7 +744,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="multi_modal_projector",
-            tower_model="vision_tower")
+            tower_model="vision_tower",
+        )
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
@@ -711,16 +757,19 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
             raise ValueError(f"Unsupported modality: {modality}")
 
     @classmethod
-    def get_generation_prompt(cls, audio: np.ndarray,
-                              stt_config: SpeechToTextConfig,
-                              model_config: ModelConfig,
-                              language: Optional[str],
-                              task_type: Literal["transcribe", "translate"],
-                              request_prompt: str,
-                              to_language: Optional[str]) -> PromptType:
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+        language: Optional[str],
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: Optional[str],
+    ) -> PromptType:
         """
         Gemma3n supports "free-form" transcription.
-        We fix its prompt here to standardize transcriptions/translations 
+        We fix its prompt here to standardize transcriptions/translations
         requests.
         """
         # Transcribe this audio [into <>] | for transcription
@@ -749,8 +798,9 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         return cast(PromptType, prompts_dict)
 
     @classmethod
-    def get_speech_to_text_config(cls, model_config: ModelConfig,
-                                  task_type: str) -> SpeechToTextConfig:
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
         return SpeechToTextConfig(
             # Let's set this to 30 as suggested in the docs for now, although
             # the model is only limited by its context length.
diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py
index defa77b84e44153f330f367bf8d9e6b954ecfeee..a6991f8e43feffdf1b094e346c24523c7dc0a538 100644
--- a/vllm/model_executor/models/glm.py
+++ b/vllm/model_executor/models/glm.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only HF format GLM-4 model compatible with THUDM weights."""
+
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 
@@ -8,7 +9,6 @@ from .utils import PPMissingLayer
 
 
 class GlmForCausalLM(LlamaForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
         super().__init__(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index f49f21a40f8248a686e06c52f5f18ce1cdd934b5..faa0674a2e43dfbc047a51953d7774345317cdb2 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -22,8 +22,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GLM-4-0414 model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -34,8 +34,7 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -49,21 +48,22 @@ from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
 
 
 class Glm4Attention(nn.Module):
-
-    def __init__(self,
-                 config: Glm4Config,
-                 hidden_size: int,
-                 num_heads: int,
-                 num_kv_heads: int,
-                 max_position: int = 4096 * 32,
-                 head_dim: Optional[int] = None,
-                 qkv_bias: bool = False,
-                 rope_theta: float = 10000,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 rope_scaling: Optional[tuple] = None,
-                 prefix: str = "",
-                 attn_type: str = AttentionType.DECODER) -> None:
+    def __init__(
+        self,
+        config: Glm4Config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        head_dim: int | None = None,
+        qkv_bias: bool = False,
+        rope_theta: float = 10000,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -112,14 +112,16 @@ class Glm4Attention(nn.Module):
             partial_rotary_factor=partial_rotary_factor,
             is_neox_style=False,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn",
-                              attn_type=attn_type)
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            attn_type=attn_type,
+        )
 
     def forward(
         self,
@@ -135,11 +137,12 @@ class Glm4Attention(nn.Module):
 
 
 class Glm4DecoderLayer(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 config: Optional[Glm4Config] = None) -> None:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        config: Glm4Config | None = None,
+    ) -> None:
         super().__init__()
 
         config = config or vllm_config.model_config.hf_config
@@ -157,8 +160,8 @@ class Glm4DecoderLayer(nn.Module):
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
             rope_theta=rope_theta,
-            qkv_bias=getattr(config, 'attention_bias', False),
-            head_dim=getattr(config, 'head_dim', None),
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
             cache_config=cache_config,
             quant_config=quant_config,
             rope_scaling=rope_scaling,
@@ -172,28 +175,27 @@ class Glm4DecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-        self.post_self_attn_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-        self.post_mlp_layernorm = RMSNorm(config.hidden_size,
-                                          eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_self_attn_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
@@ -202,8 +204,7 @@ class Glm4DecoderLayer(nn.Module):
         hidden_states = self.post_self_attn_layernorm(hidden_states)
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         hidden_states = self.post_mlp_layernorm(hidden_states)
 
@@ -221,13 +222,13 @@ ALL_DECODER_LAYER_TYPES = {
         "positions": -1,
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
-    })
+    }
+)
 class Glm4Model(LlamaModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=Glm4DecoderLayer)
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, layer_type=Glm4DecoderLayer
+        )
 
 
 class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
@@ -247,58 +248,58 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = Glm4Model(vllm_config=vllm_config,
-                               prefix=maybe_prefix(prefix, "model"))
+        self.model = Glm4Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
             else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "lm_head"))
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "lm_head"),
+                )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index b088e0c0dd241628107cbd9d74548d6cdbfad81e..6953b805653b49fdffb7f02fa71fdece52306d6b 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -26,63 +26,86 @@
 # limitations under the License.
 """Inference-only GLM-4V model compatible with HuggingFace weights."""
 
+import itertools
 import math
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from packaging.version import Version
 from transformers import BatchFeature
-from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
 from transformers.models.glm4v.image_processing_glm4v import (
-    Glm4vImageProcessor, smart_resize)
-from transformers.models.glm4v.video_processing_glm4v import (
-    Glm4vVideoProcessor)
+    Glm4vImageProcessor,
+    smart_resize,
+)
+from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor
 from transformers.video_utils import VideoMetadata
 
-from vllm.attention.layer import check_upstream_fa_availability
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import (
+    check_upstream_fa_availability,
+    maybe_get_vit_flash_attn_backend,
+)
 from vllm.config import VllmConfig
-from vllm.distributed import (get_tensor_model_parallel_world_size,
-                              parallel_state)
+from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
+from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
+from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, VideoItem)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems, MultiModalDataParser
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from ..layers.activation import SiluAndMul
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .qwen2_vl import (_create_qwen2vl_field_factory,
-                       apply_rotary_pos_emb_vision)
-from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
-from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .qwen2_vl import _create_qwen2vl_field_factory, apply_rotary_pos_emb_vision
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from .vision import (
+    get_vit_attn_backend,
+    run_dp_sharded_mrope_vision_model,
+)
 
 logger = init_logger(__name__)
 
@@ -100,6 +123,7 @@ class Glm4vImagePixelInputs(TensorSchema):
         - ni: Number of images
         - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
     """
+
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values: Annotated[torch.Tensor, TensorShape("np", "cpp")]
@@ -114,13 +138,14 @@ class Glm4vImageEmbeddingInputs(TensorSchema):
         - n: Number of images
         - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
 
     image_embeds: Annotated[torch.Tensor, TensorShape("f", "h")]
     image_grid_thw: Annotated[torch.Tensor, TensorShape("n", 3)]
 
 
-Glm4vImageInputs = Union[Glm4vImagePixelInputs, Glm4vImageEmbeddingInputs]
+Glm4vImageInputs: TypeAlias = Glm4vImagePixelInputs | Glm4vImageEmbeddingInputs
 
 
 class Glm4vVideoPixelInputs(TensorSchema):
@@ -133,6 +158,7 @@ class Glm4vVideoPixelInputs(TensorSchema):
         - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
+
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
     pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctpp")]
@@ -148,25 +174,25 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
         - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
+
     type: Literal["video_embeds"] = "video_embeds"
 
     video_embeds: Annotated[torch.Tensor, TensorShape("p", "h")]
     video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
 
 
-Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs]
+Glm4vVideoInputs: TypeAlias = Glm4vVideoPixelInputs | Glm4vVideoEmbeddingInputs
 
 # ==== Vision Encoder ==== #
 
 
 class Glm4vVisionMLP(nn.Module):
-
     def __init__(
         self,
         in_features: int,
         hidden_features: int,
         bias: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -208,8 +234,7 @@ def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int):
     )
 
     gathered_tensors_split = [
-        torch.split(tensor, hidden_size // tp_size, -1)
-        for tensor in gathered_tensors
+        torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors
     ]
     ordered_tensors = [
         tensor for pair in zip(*gathered_tensors_split) for tensor in pair
@@ -219,26 +244,30 @@ def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int):
 
 
 class Glm4vVisionAttention(nn.Module):
-
     def __init__(
         self,
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         # Per attention head and per partition values.
-        self.tp_size = (1 if use_data_parallel else
-                        get_tensor_model_parallel_world_size())
-        self.tp_rank = (0 if use_data_parallel else
-                        parallel_state.get_tensor_model_parallel_rank())
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = (
+            0 if use_data_parallel else parallel_state.get_tensor_model_parallel_rank()
+        )
         self.hidden_size_per_attention_head = dist_utils.divide(
-            projection_size, num_heads)
+            projection_size, num_heads
+        )
         self.num_attention_heads_per_partition = dist_utils.divide(
-            num_heads, self.tp_size)
+            num_heads, self.tp_size
+        )
 
         self.qkv = QKVParallelLinear(
             hidden_size=embed_dim,
@@ -263,20 +292,33 @@ class Glm4vVisionAttention(nn.Module):
         # Detect attention implementation.
         self.attn_backend = get_vit_attn_backend(
             head_size=self.hidden_size_per_attention_head,
-            dtype=torch.get_default_dtype())
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
         self.use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            self.use_upstream_fa = True
+
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
 
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN,
-                _Backend.TORCH_SDPA,
-                _Backend.XFORMERS,
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
             raise RuntimeError(
-                f"GLM-4V does not support {self.attn_backend} backend now.")
+                f"GLM-4V does not support {self.attn_backend} backend now."
+            )
+
+        self.is_flash_attn_backend = self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }
 
     def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
         # [s, b, 3 * head * head_dim]
@@ -296,12 +338,12 @@ class Glm4vVisionAttention(nn.Module):
         return q, k, v
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -310,25 +352,17 @@ class Glm4vVisionAttention(nn.Module):
         q, k, v = self.split_qkv(x)
         batch_size = q.shape[1]
 
-        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
-                   for x in (q, k, v))
+        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v))
         if rotary_pos_emb is not None:
             # [2 * b, s, heads, head_dim]
             qk_concat = torch.cat([q, k], dim=0)
             qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb)
             q, k = torch.chunk(qk_rotated, 2, dim=0)
 
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            # from vllm_flash_attn.flash_attn_interface import (
-            #   flash_attn_varlen_func)
-            if self.use_upstream_fa:
-                from flash_attn import flash_attn_varlen_func
-            else:
-                from vllm.vllm_flash_attn import flash_attn_varlen_func
-
+        if self.is_flash_attn_backend:
             q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
 
-            output = flash_attn_varlen_func(
+            output = self.flash_attn_varlen_func(
                 q,
                 k,
                 v,
@@ -336,14 +370,14 @@ class Glm4vVisionAttention(nn.Module):
                 cu_seqlens_k=cu_seqlens,
                 max_seqlen_q=max_seqlen,
                 max_seqlen_k=max_seqlen,
-                dropout_p=0,
+                dropout_p=0.0,
                 causal=False,
             )
 
-            context_layer = rearrange(output,
-                                      "(b s) h d -> s b (h d)",
-                                      b=batch_size).contiguous()
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+            context_layer = rearrange(
+                output, "(b s) h d -> s b (h d)", b=batch_size
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
             outputs = []
             for i in range(1, len(cu_seqlens)):
@@ -352,45 +386,46 @@ class Glm4vVisionAttention(nn.Module):
                 q_i = q[:, start_idx:end_idx]
                 k_i = k[:, start_idx:end_idx]
                 v_i = v[:, start_idx:end_idx]
-                q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
-                                 for x in [q_i, k_i, v_i])
-                output_i = F.scaled_dot_product_attention(q_i,
-                                                          k_i,
-                                                          v_i,
-                                                          dropout_p=0.0)
+                q_i, k_i, v_i = (
+                    rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
+                )
+                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
                 output_i = rearrange(output_i, "b h s d -> b s h d ")
                 outputs.append(output_i)
             context_layer = torch.cat(outputs, dim=1)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
-        elif self.attn_backend == _Backend.XFORMERS:
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             from xformers import ops as xops
             from xformers.ops.fmha.attn_bias import BlockDiagonalMask
 
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
+            attn_bias = BlockDiagonalMask.from_seqlens(
+                q_seqlen=seqlens, kv_seqlen=None, device=q.device
+            )
 
             context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
+                q, k, v, attn_bias=attn_bias, p=0, scale=None
+            )
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
 
         output, _ = self.proj(context_layer)
         return output
 
 
 class Glm4vVisionBlock(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int,
         mlp_hidden_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         if norm_layer is None:
@@ -404,6 +439,7 @@ class Glm4vVisionBlock(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
             use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
         )
         self.mlp = Glm4vVisionMLP(
             dim,
@@ -415,12 +451,12 @@ class Glm4vVisionBlock(nn.Module):
         )
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x_attn = self.attn(
             self.norm1(x),
@@ -436,7 +472,6 @@ class Glm4vVisionBlock(nn.Module):
 
 
 class Glm4vVisionPatchEmbed(nn.Module):
-
     def __init__(
         self,
         patch_size: int = 14,
@@ -450,7 +485,7 @@ class Glm4vVisionPatchEmbed(nn.Module):
         self.hidden_size = hidden_size
 
         kernel_size = (temporal_patch_size, patch_size, patch_size)
-        self.proj = nn.Conv3d(
+        self.proj = Conv3dLayer(
             in_channels,
             hidden_size,
             kernel_size=kernel_size,
@@ -460,19 +495,17 @@ class Glm4vVisionPatchEmbed(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         L, C = x.shape
-        x = x.view(L, -1, self.temporal_patch_size, self.patch_size,
-                   self.patch_size)
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
         x = self.proj(x).view(L, self.hidden_size)
         return x
 
 
 class Glm4vPatchMerger(nn.Module):
-
     def __init__(
         self,
         d_model: int,
         context_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -518,7 +551,6 @@ class Glm4vPatchMerger(nn.Module):
 
 
 class Glm4vVisionEmbeddings(nn.Module):
-
     def __init__(self, config: Glm4vVisionConfig):
         super().__init__()
         self.config = config
@@ -526,18 +558,18 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.image_size = config.image_size
         self.patch_size = config.patch_size
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
-        self.position_embedding = nn.Embedding(self.num_positions,
-                                               self.embed_dim)
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
         self.register_buffer(
             "position_ids",
             torch.arange(self.num_positions).expand((1, -1)),
             persistent=False,
         )
 
-    def forward(self, embeddings, lengths, image_shapes, h_coords,
-                w_coords) -> torch.Tensor:
+    def forward(
+        self, embeddings, lengths, image_shapes, h_coords, w_coords
+    ) -> torch.Tensor:
         pos_embed_weight = self.position_embedding.weight
         hidden_size = pos_embed_weight.shape[1]
         total_seq = h_coords.shape[0]
@@ -548,29 +580,27 @@ class Glm4vVisionEmbeddings(nn.Module):
 
         # Handle empty sequence case
         if total_seq == 0:
-            adapted_pos_embed = torch.empty(0,
-                                            hidden_size,
-                                            device=device,
-                                            dtype=pos_embed_weight.dtype)
+            adapted_pos_embed = torch.empty(
+                0, hidden_size, device=device, dtype=pos_embed_weight.dtype
+            )
         else:
             # Convert inputs to tensors if needed
             if isinstance(lengths, list):
-                lengths = torch.tensor(lengths,
-                                       device=device,
-                                       dtype=torch.long)
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
             if not isinstance(image_shapes, torch.Tensor):
-                image_shapes = torch.tensor(image_shapes,
-                                            device=device,
-                                            dtype=torch.long)
+                image_shapes = torch.tensor(
+                    image_shapes, device=device, dtype=torch.long
+                )
 
             # Prepare 2D position embedding
             orig_size_sq = pos_embed_weight.shape[0]
             orig_size = int(orig_size_sq**0.5)
-            pos_embed_2d = (pos_embed_weight.view(
-                orig_size, orig_size,
-                hidden_size).permute(2, 0,
-                                     1).unsqueeze(0).to(device=device,
-                                                        dtype=torch.float32))
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                .to(device=device, dtype=torch.float32)
+            )
 
             # Calculate target dimensions for each patch
             # Add bounds checking for data parallel mode
@@ -583,23 +613,21 @@ class Glm4vVisionEmbeddings(nn.Module):
                 for i in range(len(lengths)):
                     # Cycle through available shapes
                     shape_idx = i % image_shapes.shape[0]
-                    target_h_list.append(image_shapes[shape_idx,
-                                                      1].repeat(lengths[i]))
-                    target_w_list.append(image_shapes[shape_idx,
-                                                      2].repeat(lengths[i]))
-                target_h = torch.cat(target_h_list).to(device=device,
-                                                       dtype=torch.float32)
-                target_w = torch.cat(target_w_list).to(device=device,
-                                                       dtype=torch.float32)
+                    target_h_list.append(image_shapes[shape_idx, 1].repeat(lengths[i]))
+                    target_w_list.append(image_shapes[shape_idx, 2].repeat(lengths[i]))
+                target_h = torch.cat(target_h_list).to(
+                    device=device, dtype=torch.float32
+                )
+                target_w = torch.cat(target_w_list).to(
+                    device=device, dtype=torch.float32
+                )
             else:
-                target_h = torch.cat([
-                    image_shapes[i, 1].repeat(lengths[i])
-                    for i in range(len(lengths))
-                ]).to(device=device, dtype=torch.float32)
-                target_w = torch.cat([
-                    image_shapes[i, 2].repeat(lengths[i])
-                    for i in range(len(lengths))
-                ]).to(device=device, dtype=torch.float32)
+                target_h = torch.cat(
+                    [image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]
+                ).to(device=device, dtype=torch.float32)
+                target_w = torch.cat(
+                    [image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]
+                ).to(device=device, dtype=torch.float32)
 
             # Normalize coordinates to [-1, 1] range for grid_sample
             h_coords = h_coords.to(device=device, dtype=torch.float32)
@@ -608,8 +636,7 @@ class Glm4vVisionEmbeddings(nn.Module):
             norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
 
             # Create sampling grid
-            grid = (torch.stack((norm_w, norm_h),
-                                dim=-1).unsqueeze(0).unsqueeze(2))
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
 
             # Perform bicubic interpolation
             interpolated_embed_fp32 = F.grid_sample(
@@ -622,9 +649,11 @@ class Glm4vVisionEmbeddings(nn.Module):
 
             # Reshape and convert back to original dtype
             adapted_pos_embed_fp32 = (
-                interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0))
-            adapted_pos_embed = adapted_pos_embed_fp32.to(
-                pos_embed_weight.dtype).to(embeddings.device)
+                interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)
+            )
+            adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(
+                embeddings.device
+            )
 
         # Add adapted position encoding to embeddings
         embeddings = embeddings + adapted_pos_embed
@@ -632,13 +661,11 @@ class Glm4vVisionEmbeddings(nn.Module):
 
 
 class Glm4vVisionRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
         self.dim = dim
         self.theta = theta
-        inv_freq = 1.0 / (theta
-                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self._seq_len_cached = 0
         self._freqs_cached = None
@@ -647,16 +674,22 @@ class Glm4vVisionRotaryEmbedding(nn.Module):
         if seqlen > self._seq_len_cached:
             seqlen *= 2
             self._seq_len_cached = seqlen
-            self.inv_freq = 1.0 / (self.theta**(torch.arange(
-                0,
-                self.dim,
-                2,
-                dtype=torch.float,
-                device=self.inv_freq.device,
-            ) / self.dim))
-            seq = torch.arange(seqlen,
-                               device=self.inv_freq.device,
-                               dtype=self.inv_freq.dtype)
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0,
+                        self.dim,
+                        2,
+                        dtype=torch.float,
+                        device=self.inv_freq.device,
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
             freqs = torch.outer(seq, self.inv_freq)
             self._freqs_cached = freqs
 
@@ -666,14 +699,14 @@ class Glm4vVisionRotaryEmbedding(nn.Module):
 
 
 class Glm4vVisionTransformer(nn.Module):
-
     def __init__(
         self,
         vision_config: Glm4vVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
 
@@ -699,17 +732,21 @@ class Glm4vVisionTransformer(nn.Module):
         norm_layer = partial(RMSNorm, eps=norm_eps)
         head_dim = self.hidden_size // self.num_heads
         self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2)
-        self.blocks = nn.ModuleList([
-            Glm4vVisionBlock(
-                dim=self.hidden_size,
-                num_heads=self.num_heads,
-                mlp_hidden_dim=vision_config.out_hidden_size,
-                norm_layer=norm_layer,
-                quant_config=quant_config,
-                prefix=f"{prefix}.blocks.{layer_idx}",
-                use_data_parallel=self.use_data_parallel,
-            ) for layer_idx in range(depth)
-        ])
+        self.blocks = nn.ModuleList(
+            [
+                Glm4vVisionBlock(
+                    dim=self.hidden_size,
+                    num_heads=self.num_heads,
+                    mlp_hidden_dim=vision_config.out_hidden_size,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                    use_data_parallel=self.use_data_parallel,
+                    attn_backend_override=attn_backend_override,
+                )
+                for layer_idx in range(depth)
+            ]
+        )
         self.merger = Glm4vPatchMerger(
             d_model=vision_config.out_hidden_size,
             context_dim=vision_config.intermediate_size,
@@ -720,22 +757,29 @@ class Glm4vVisionTransformer(nn.Module):
         )
         self.embeddings = Glm4vVisionEmbeddings(vision_config)
 
-        self.post_conv_layernorm = RMSNorm(vision_config.hidden_size,
-                                           eps=vision_config.rms_norm_eps)
+        self.post_conv_layernorm = RMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
         self.downsample = nn.Conv2d(
             in_channels=vision_config.hidden_size,
             out_channels=vision_config.out_hidden_size,
             kernel_size=vision_config.spatial_merge_size,
             stride=vision_config.spatial_merge_size,
         )
-        self.post_layernorm = RMSNorm(vision_config.hidden_size,
-                                      eps=vision_config.rms_norm_eps)
+        self.post_layernorm = RMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
 
         self.attn_backend = get_vit_attn_backend(
-            head_size=head_dim, dtype=torch.get_default_dtype())
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        if (
+            self.attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
 
     @property
     def dtype(self) -> torch.dtype:
@@ -750,20 +794,27 @@ class Glm4vVisionTransformer(nn.Module):
         for t, h, w in grid_thw:
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-            hpos_ids = (hpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            ).permute(0, 2, 1, 3).flatten())
-            wpos_ids = (wpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            ).permute(0, 2, 1, 3).flatten())
-            pos_ids.append(
-                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
         pos_ids = torch.cat(pos_ids, dim=0)
         max_grid_size = grid_thw[:, 1:].max()
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
@@ -773,10 +824,13 @@ class Glm4vVisionTransformer(nn.Module):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
-        if self.attn_backend == _Backend.FLASH_ATTN:
+        if (
+            self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+            or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
+        ):
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
         return max_seqlen, seqlens
 
@@ -796,15 +850,16 @@ class Glm4vVisionTransformer(nn.Module):
         # compute position embedding
         rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
         # compute cu_seqlens
-        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
-                                             grid_thw[:, 0]).cumsum(
-                                                 dim=0, dtype=torch.int32)
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
         cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
 
         # pre-compute seqlens for attn mask to reduce cuMemcpy operations
         max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
-        x = self.embeddings(x, seqlens, grid_thw, image_type_ids[:, 0],
-                            image_type_ids[:, 1])
+        x = self.embeddings(
+            x, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]
+        )
 
         # transformers
         x = x.unsqueeze(1)
@@ -820,16 +875,14 @@ class Glm4vVisionTransformer(nn.Module):
         # adapter
         x = self.post_layernorm(x)
 
-        x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size,
-                   x.shape[-1])
+        x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size, x.shape[-1])
         x = x.permute(0, 3, 1, 2)
         x = self.downsample(x).view(-1, self.out_hidden_size)
         x = self.merger(x)
 
         return x
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("attn.qkv.", "attn.q.", "q"),
@@ -853,22 +906,20 @@ class Glm4vVisionTransformer(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Glm4vProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config()
 
     def get_tokenizer(self):
         return self.ctx.tokenizer
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 
     def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
@@ -894,17 +945,16 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
         if do_resize:
             resized_height, resized_width = smart_resize(
                 num_frames=num_frames
-                if num_frames > temporal_patch_size else temporal_patch_size,
+                if num_frames > temporal_patch_size
+                else temporal_patch_size,
                 height=image_height,
                 width=image_width,
                 factor=patch_size * merge_size,
                 max_pixels=max_image_pixels,
             )
-            preprocessed_size = ImageSize(width=resized_width,
-                                          height=resized_height)
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
-            preprocessed_size = ImageSize(width=image_width,
-                                          height=image_height)
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
 
         # NOTE: Frames are padded to be divisible by `temporal_patch_size`
         # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294
@@ -920,8 +970,9 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
         return preprocessed_size, num_vision_tokens
 
     def get_image_size_with_most_features(self) -> ImageSize:
-        max_image_size, _ = self._get_vision_info(image_width=9999999,
-                                                  image_height=9999999)
+        max_image_size, _ = self._get_vision_info(
+            image_width=9999999, image_height=9999999
+        )
         return max_image_size
 
     def get_num_image_tokens(
@@ -988,22 +1039,22 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
         max_videos = mm_counts.get("video", 0)
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+        max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens)
+        max_frames_per_video = min(
+            max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO
+        )
 
         return max(max_frames_per_video, 1)
 
-    def _get_video_second_idx(self, metadata: dict[str, Any],
-                              total_frames: int) -> list[int]:
+    def _get_video_second_idx(
+        self, metadata: dict[str, Any], total_frames: int
+    ) -> list[int]:
         video_processor = self.get_video_processor()
 
         video_fps = metadata.get("fps", video_processor.fps)
         meta_frames = metadata.get("total_num_frames", total_frames)
         max_frame_idx = meta_frames - 1
-        duration = metadata.get("duration",
-                                round(max_frame_idx / video_fps) + 1)
+        duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1)
         do_sample_frames = metadata["do_sample_frames"]
         if not do_sample_frames:
             frame_indices = metadata["frames_indices"]
@@ -1014,18 +1065,17 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
                     min(
                         max_frame_idx,
                         int(math.ceil(i * video_fps / video_processor.fps)),
-                    ) for i in range(n)
+                    )
+                    for i in range(n)
                 ]
             else:
-                num_samples = int(video_processor.max_duration *
-                                  video_processor.fps)
+                num_samples = int(video_processor.max_duration * video_processor.fps)
                 if num_samples >= meta_frames:
                     frame_indices = list(range(meta_frames))
                 else:
-                    target_seconds = np.linspace(0,
-                                                 duration,
-                                                 num_samples,
-                                                 endpoint=True)
+                    target_seconds = np.linspace(
+                        0, duration, num_samples, endpoint=True
+                    )
                     frame_indices = [
                         min(max_frame_idx, int(math.ceil(t * video_fps)))
                         for t in target_seconds
@@ -1067,8 +1117,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
         assert isinstance(grid_thw, torch.Tensor)
         timestamps = self._get_video_second_idx(metadata, len(video_array))
         frames_idx_token = [
-            tokenizer.encode(str(i), add_special_tokens=False)
-            for i in timestamps
+            tokenizer.encode(str(i), add_special_tokens=False) for i in timestamps
         ]
         T, H, W = grid_thw
         num_tokens_per_frame = int(H * W) // merge_length
@@ -1076,8 +1125,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
         placeholder.append(bov_token_id)
         for frame_idx in frames_idx_token:
             placeholder.append(boi_token_id)
-            placeholder.extend([hf_processor.video_token_id] *
-                               num_tokens_per_frame)
+            placeholder.extend([hf_processor.video_token_id] * num_tokens_per_frame)
             placeholder.append(eoi_token_id)
             placeholder.extend(frame_idx)
         placeholder.append(eov_token_id)
@@ -1086,7 +1134,6 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
 
 
 class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1109,25 +1156,32 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = (
-            self.info.get_image_size_with_most_features())
+        target_width, target_height = self.info.get_image_size_with_most_features()
         target_num_frames = self.info.get_num_frames_with_most_features(
-            seq_len, mm_counts)
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
+                overrides=video_overrides,
             ),
         }
 
@@ -1138,7 +1192,37 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
         height: int,
         num_frames: int,
         num_videos: int,
+        overrides: VideoDummyOptions | None = None,
     ) -> list[VideoItem]:
+        if overrides:
+            if overrides.num_frames:
+                if overrides.num_frames > num_frames:
+                    logger.warning(
+                        "video.num_frames override (%d) exceeds model's "
+                        "maximum number of frames (%d), will be ignored",
+                        overrides.num_frames,
+                        num_frames,
+                    )
+                num_frames = min(num_frames, overrides.num_frames)
+            if overrides.width:
+                if overrides.width > width:
+                    logger.warning(
+                        "video.width override (%d) exceeds model's "
+                        "maximum width (%d), will be ignored",
+                        overrides.width,
+                        width,
+                    )
+                width = min(width, overrides.width)
+            if overrides.height:
+                if overrides.height > height:
+                    logger.warning(
+                        "video.height override (%d) exceeds model's "
+                        "maximum height (%d), will be ignored",
+                        overrides.height,
+                        height,
+                    )
+                height = min(height, overrides.height)
+
         video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
         video_items = []
         for i in range(num_videos):
@@ -1157,7 +1241,6 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
 
 
 class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
-
     def _get_data_parser(self) -> MultiModalDataParser:
         return MultiModalDataParser(video_needs_metadata=True)
 
@@ -1174,8 +1257,11 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         # GLM-4.1V use `image_token_id` as video placeholder, we need to
         # replace it with `video_token_id` for video processing. So we
         # separate video processing from image processing.
-        if ("videos" in mm_data and isinstance(mm_data["videos"], list)
-                and len(mm_data["videos"]) > 0):
+        if (
+            "videos" in mm_data
+            and isinstance(mm_data["videos"], list)
+            and len(mm_data["videos"]) > 0
+        ):
             video_grid_thw_lst = []
             pixel_values_videos_lst = []
             for item in mm_data.pop("videos", []):
@@ -1184,25 +1270,24 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                 # don't update mm_kwargs inplace
                 video_mm_kwargs = dict(**mm_kwargs)
                 video_mm_kwargs["do_sample_frames"] = metadata.get(
-                    "do_sample_frames", True)
+                    "do_sample_frames", True
+                )
 
                 video_mm_data = dict()
                 video_mm_data["videos"] = [[video_array]]
 
-                # backward compatibility for Transformers 4.55
                 unuse_metadata = ["do_sample_frames"]
-                if not hasattr(
-                        VideoMetadata,
-                        "frames_indices") and "frames_indices" in metadata:
-                    unuse_metadata.append("frames_indices")
-
-                video_mm_data["video_metadata"] = [[
-                    VideoMetadata(
-                        **{
-                            k: metadata[k]
-                            for k in metadata if k not in unuse_metadata
-                        })
-                ]]
+                video_mm_data["video_metadata"] = [
+                    [
+                        VideoMetadata(
+                            **{
+                                k: metadata[k]
+                                for k in metadata
+                                if k not in unuse_metadata
+                            }
+                        )
+                    ]
+                ]
 
                 video_outputs = super()._call_hf_processor(
                     prompt="<|begin_of_video|><|video|><|end_of_video|>",
@@ -1210,23 +1295,11 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                     mm_kwargs=video_mm_kwargs,
                     tok_kwargs=tok_kwargs,
                 )
-                if not video_mm_kwargs["do_sample_frames"] and Version(
-                        TRANSFORMERS_VERSION) < Version("4.56.0"):
-                    # Transformers v4.55 has incorrect timestamps issue for
-                    # skip sampling. We construct the placeholder manually to
-                    # get placeholders with correct timestamps.
-                    placeholder = self.info._construct_video_placeholder(
-                        video_array,
-                        metadata,
-                        video_outputs["video_grid_thw"].squeeze(0),
-                    )
-                    video_placeholder = processor.tokenizer.decode(placeholder)
-                else:
-                    input_ids = video_outputs.pop("input_ids")
-                    input_ids[input_ids == processor.image_token_id] = (
-                        processor.video_token_id)
-                    video_placeholder = processor.tokenizer.batch_decode(
-                        input_ids)[0]
+                input_ids = video_outputs.pop("input_ids")
+                input_ids[input_ids == processor.image_token_id] = (
+                    processor.video_token_id
+                )
+                video_placeholder = processor.tokenizer.batch_decode(input_ids)[0]
                 prompt = prompt.replace(
                     "<|begin_of_video|><|video|><|end_of_video|>",
                     video_placeholder,
@@ -1234,8 +1307,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                 )
 
                 video_grid_thw_lst.append(video_outputs["video_grid_thw"])
-                pixel_values_videos_lst.append(
-                    video_outputs["pixel_values_videos"])
+                pixel_values_videos_lst.append(video_outputs["pixel_values_videos"])
             video_outputs = dict(
                 pixel_values_videos=torch.cat(pixel_values_videos_lst),
                 video_grid_thw=torch.cat(video_grid_thw_lst),
@@ -1261,8 +1333,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         return _create_qwen2vl_field_factory(
-            self.info.get_hf_config().vision_config.spatial_merge_size)(
-                hf_inputs)
+            self.info.get_hf_config().vision_config.spatial_merge_size
+        )(hf_inputs)
 
     def _get_prompt_updates(
         self,
@@ -1271,8 +1343,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
 
         merge_length = image_processor.merge_size**2
 
@@ -1291,7 +1362,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
 
             video, metadata = mm_items["video"][item_idx]
             placeholder = self.info._construct_video_placeholder(
-                video, metadata, grid_thw)
+                video, metadata, grid_thw
+            )
             return PromptUpdateDetails.select_token_id(
                 placeholder,
                 embed_token_id=hf_processor.video_token_id,
@@ -1316,15 +1388,18 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
     info=Glm4vProcessingInfo,
     dummy_inputs=Glm4vDummyInputsBuilder,
 )
-class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                    SupportsLoRA, SupportsPP):
+class Glm4vForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
         ],
-        "gate_up_proj": ["gate_up_proj"]
+        "gate_up_proj": ["gate_up_proj"],
     }
 
     # To ensure correct weight loading and mapping.
@@ -1333,12 +1408,13 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             "lm_head.": "language_model.lm_head.",
             "model.language_model.": "language_model.model.",
             "model.visual.": "visual.",
-        })
+        }
+    )
 
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|begin_of_image|><|image|><|end_of_image|>"
         if modality.startswith("video"):
@@ -1356,12 +1432,18 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
 
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
         self.visual = Glm4vVisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-5),
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "visual"),
             use_data_parallel=self.use_data_parallel,
+            attn_backend_override=attn_backend_override,
         )
 
         if config.model_type == "glm4v":
@@ -1375,29 +1457,16 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             vllm_config=vllm_config,
             hf_config=config.text_config,
             prefix=maybe_prefix(prefix, "language_model"),
-            architectures=architectures)
+            architectures=architectures,
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
-
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(
-                f"Incorrect type of {name}. Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return mm_input.reshape(-1, mm_input.shape[-1])
-        else:
-            return torch.concat(mm_input)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Glm4vImageInputs]:
+        self, **kwargs: object
+    ) -> Glm4vImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1406,11 +1475,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
             return Glm4vImagePixelInputs(
                 type="pixel_values",
                 pixel_values=pixel_values,
@@ -1418,11 +1482,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
             return Glm4vImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
@@ -1430,7 +1489,8 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[Glm4vVideoInputs]:
+        self, **kwargs: object
+    ) -> Glm4vVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1439,11 +1499,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos, "video pixel values")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Glm4vVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -1451,11 +1506,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Glm4vVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,
@@ -1463,7 +1513,8 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
     def _process_image_input(
-            self, image_input: Glm4vImageInputs) -> tuple[torch.Tensor, ...]:
+        self, image_input: Glm4vImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
         grid_thw_list = grid_thw.tolist()
@@ -1473,20 +1524,21 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values,
-                                                         grid_thw.tolist(),
-                                                         rope_type="rope_3d")
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
+                )
             else:
-                image_embeds = self.visual(pixel_values,
-                                           grid_thw=grid_thw.tolist())
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw.tolist())
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
+        sizes = (
+            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
+            // (merge_size * merge_size)
+        ).tolist()
         return image_embeds.split(sizes)
 
     def _process_video_input(
-            self, video_input: Glm4vVideoInputs) -> tuple[torch.Tensor, ...]:
+        self, video_input: Glm4vVideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
         grid_thw_list = grid_thw.tolist()
@@ -1495,19 +1547,25 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"].type(
-                self.visual.dtype)
+                self.visual.dtype
+            )
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values_videos,
-                                                         grid_thw.tolist(),
-                                                         rope_type="rope_3d")
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual,
+                    pixel_values_videos,
+                    grid_thw.tolist(),
+                    rope_type="rope_3d",
+                )
             else:
-                video_embeds = self.visual(pixel_values_videos,
-                                           grid_thw=grid_thw.tolist())
+                video_embeds = self.visual(
+                    pixel_values_videos, grid_thw=grid_thw.tolist()
+                )
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
+        sizes = (
+            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
+            // (merge_size * merge_size)
+        ).tolist()
         return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -1516,23 +1574,27 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if (input_key in ("pixel_values", "image_embeds")
-                    and "image" not in mm_input_by_modality):
-                mm_input_by_modality["image"] = (
-                    self._parse_and_validate_image_input(**kwargs))
-            if (input_key in ("pixel_values_videos", "video_embeds")
-                    and "video" not in mm_input_by_modality):
-                mm_input_by_modality["video"] = (
-                    self._parse_and_validate_video_input(**kwargs))
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "video" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
+                    **kwargs
+                )
         return mm_input_by_modality
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
-            **kwargs)
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return None
 
@@ -1545,64 +1607,154 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def get_mrope_input_positions(
         self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if (multimodal_embeddings is not None
-                and len(multimodal_embeddings) != 0
-                and all(embed.numel() > 0 for embed in multimodal_embeddings)):
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                [self.config.image_token_id, self.config.video_token_id],
-            )
-        return inputs_embeds
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        hf_config = self.config
+        image_token_id = hf_config.image_token_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        llm_pos_ids_list: list = []
+
+        if image_grid_thw or video_grid_thw:
+            input_token_type: list[str] = []
+            video_check_flg = False
+            for token in input_tokens:
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if (token == image_token_id) and (video_check_flg is False):
+                    input_token_type.append("image")
+                elif (token == image_token_id) and (video_check_flg is True):
+                    input_token_type.append("video")
+                else:
+                    input_token_type.append("text")
+
+            input_type_group: list[tuple[str, int, int]] = []
+            for key, group_iter in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group_list = list(group_iter)
+                start_index = group_list[0][0]
+                end_index = group_list[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            video_frame_num = 1
+            mm_data_idx = 0
+            for modality_type, start_idx, end_idx in input_type_group:
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                if modality_type == "image":
+                    t, h, w = image_grid_thw[mm_data_idx]
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_merge_size,
+                        w // spatial_merge_size,
+                    )
 
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Glm4vImageInputs] = None,
-        video_input: Optional[Glm4vVideoInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
-            )
+                    t_index = (
+                        torch.arange(llm_grid_t)
+                        .view(-1, 1)
+                        .expand(-1, llm_grid_h * llm_grid_w)
+                        .flatten()
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    mm_data_idx += 1
 
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
-        return inputs_embeds
+                elif modality_type == "video":
+                    t, h, w = (
+                        video_frame_num,
+                        *image_grid_thw[mm_data_idx][1:],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_merge_size,
+                        w // spatial_merge_size,
+                    )
+
+                    for t_idx in range(llm_grid_t):
+                        t_index = (
+                            torch.tensor(t_idx)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(1, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                    mm_data_idx += 1
+                    video_frame_num += 1
+
+                else:
+                    text_len = end_idx - start_idx
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    video_frame_num = 1
+
+        else:
+            text_len = len(input_tokens)
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return llm_positions, mrope_position_delta
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for GLM-4V.
 
         Args:
@@ -1621,26 +1773,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-                input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
@@ -1652,11 +1784,10 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 947c6ce62f551961aa38f2411d32bbd8c244ce13..1422dbe9b3cd07144d744848fa8bd4f3cfb93944 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -21,11 +21,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GLM-4.5 model compatible with HuggingFace weights."""
+"""Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights."""
+
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -34,59 +35,76 @@ from transformers.models.glm4_moe import Glm4MoeConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
-from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Glm4MoeMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -97,11 +115,10 @@ class Glm4MoeMLP(nn.Module):
 
 
 class Glm4MoE(nn.Module):
-
     def __init__(
         self,
         config: Glm4MoeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -110,14 +127,16 @@ class Glm4MoE(nn.Module):
         self.routed_scaling_factor = config.routed_scaling_factor
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
 
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
         # NOTE In the transformers implementation, the gate isn't an nn.Linear,
         # so we cannot use ReplicatedLinear here.
         # See: https://github.com/huggingface/transformers/blob/v4.55.1/src/transformers/models/glm4_moe/modeling_glm4_moe.py#L260
@@ -128,7 +147,8 @@ class Glm4MoE(nn.Module):
             dtype=torch.float32,
         )
         self.gate.e_score_correction_bias = nn.Parameter(
-            torch.empty(config.n_routed_experts, dtype=torch.float32))
+            torch.empty(config.n_routed_experts, dtype=torch.float32)
+        )
 
         # Load balancing settings.
         vllm_config = get_current_vllm_config()
@@ -137,18 +157,16 @@ class Glm4MoE(nn.Module):
 
         self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
-        self.n_physical_experts = (self.n_logical_experts +
-                                   self.n_redundant_experts)
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
 
-        self.physical_expert_start = (self.ep_rank *
-                                      self.n_local_physical_experts)
-        self.physical_expert_end = (self.physical_expert_start +
-                                    self.n_local_physical_experts)
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
 
         if config.n_shared_experts is not None:
-            intermediate_size = (config.moe_intermediate_size *
-                                 config.n_shared_experts)
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
             self.shared_experts = Glm4MoeMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=intermediate_size,
@@ -157,45 +175,29 @@ class Glm4MoE(nn.Module):
                 reduce_results=False,
                 prefix=f"{prefix}.shared_experts",
             )
-            self.experts = SharedFusedMoE(
-                shared_experts=self.shared_experts,
-                num_experts=config.n_routed_experts,
-                top_k=config.num_experts_per_tok,
-                hidden_size=config.hidden_size,
-                intermediate_size=config.moe_intermediate_size,
-                reduce_results=False,
-                renormalize=config.norm_topk_prob,
-                quant_config=quant_config,
-                use_grouped_topk=True,
-                num_expert_group=config.n_group,
-                topk_group=config.topk_group,
-                prefix=f"{prefix}.experts",
-                scoring_func="sigmoid",
-                # we do scaling outside, set factor to 1.0 to avoid double mul
-                routed_scaling_factor=1.0,
-                e_score_correction_bias=self.gate.e_score_correction_bias,
-                enable_eplb=self.enable_eplb,
-                num_redundant_experts=self.n_redundant_experts,
-            )
         else:
-            self.experts = FusedMoE(
-                num_experts=config.n_routed_experts,
-                top_k=config.num_experts_per_tok,
-                hidden_size=config.hidden_size,
-                intermediate_size=config.moe_intermediate_size,
-                reduce_results=False,
-                renormalize=config.norm_topk_prob,
-                quant_config=quant_config,
-                use_grouped_topk=True,
-                num_expert_group=config.n_group,
-                topk_group=config.topk_group,
-                prefix=f"{prefix}.experts",
-                scoring_func="sigmoid",
-                # we do scaling outside, set factor to 1.0 to avoid double mul
-                routed_scaling_factor=1.0,
-                e_score_correction_bias=self.gate.e_score_correction_bias,
-                enable_eplb=self.enable_eplb,
-                num_redundant_experts=self.n_redundant_experts)
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            routed_scaling_factor=1.0,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
@@ -204,27 +206,27 @@ class Glm4MoE(nn.Module):
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states.to(dtype=torch.float32))
 
-        fused_moe_out = self.experts(hidden_states=hidden_states,
-                                     router_logits=router_logits)
+        fused_moe_out = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
 
         if self.shared_experts is not None:
             shared_output, final_hidden_states = fused_moe_out
             assert shared_output is not None
-            final_hidden_states = \
-                final_hidden_states * self.routed_scaling_factor\
-                    + shared_output
+            final_hidden_states = (
+                final_hidden_states * self.routed_scaling_factor + shared_output
+            )
         else:
             final_hidden_states = fused_moe_out * self.routed_scaling_factor
 
         if self.tp_size > 1:
-            final_hidden_states = (
-                self.experts.maybe_all_reduce_tensor_model_parallel(
-                    final_hidden_states))
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
 class Glm4MoeAttention(nn.Module):
-
     def __init__(
         self,
         config: Glm4MoeConfig,
@@ -232,14 +234,14 @@ class Glm4MoeAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 131072,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rms_norm_eps: float = 1e-05,
         qkv_bias: bool = False,
         use_qk_norm: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -266,19 +268,23 @@ class Glm4MoeAttention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
         self.use_qk_norm = use_qk_norm
 
-        self.qkv_proj = QKVParallelLinear(hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=qkv_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
         self.rotary_emb = get_rope(
@@ -311,10 +317,12 @@ class Glm4MoeAttention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         if self.use_qk_norm:
-            q = self.q_norm(q.reshape(-1, self.num_heads,
-                                      self.head_dim)).reshape(q.shape)
-            k = self.k_norm(k.reshape(-1, self.num_kv_heads,
-                                      self.head_dim)).reshape(k.shape)
+            q = self.q_norm(q.reshape(-1, self.num_heads, self.head_dim)).reshape(
+                q.shape
+            )
+            k = self.k_norm(k.reshape(-1, self.num_kv_heads, self.head_dim)).reshape(
+                k.shape
+            )
 
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
@@ -323,12 +331,11 @@ class Glm4MoeAttention(nn.Module):
 
 
 class Glm4MoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Glm4MoeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -336,11 +343,10 @@ class Glm4MoeDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          131072)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
-        layer_idx = int(prefix.split(sep='.')[-1])
+        layer_idx = int(prefix.split(sep=".")[-1])
         self.layer_idx = layer_idx
 
         self.self_attn = Glm4MoeAttention(
@@ -360,8 +366,10 @@ class Glm4MoeDecoderLayer(nn.Module):
             use_qk_norm=config.use_qk_norm,
         )
 
-        if (config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace):
+        if (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+        ):
             self.mlp = Glm4MoE(
                 config=config,
                 quant_config=quant_config,
@@ -369,34 +377,33 @@ class Glm4MoeDecoderLayer(nn.Module):
                 enable_eplb=enable_eplb,
             )
         else:
-            self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size,
-                                  intermediate_size=config.intermediate_size,
-                                  hidden_act=config.hidden_act,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.mlp")
-
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+            self.mlp = Glm4MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
         self.routed_scaling_factor = config.routed_scaling_factor
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
@@ -407,9 +414,9 @@ class Glm4MoeDecoderLayer(nn.Module):
         "positions": -1,
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
-    })
+    }
+)
 class Glm4MoeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -423,9 +430,8 @@ class Glm4MoeModel(nn.Module):
 
         if get_pp_group().is_first_rank:
             self.embed_tokens = VocabParallelEmbedding(
-                config.vocab_size,
-                config.hidden_size,
-                prefix=f"{prefix}.embed_tokens")
+                config.vocab_size, config.hidden_size, prefix=f"{prefix}.embed_tokens"
+            )
         else:
             self.embed_tokens = PPMissingLayer()
 
@@ -438,31 +444,32 @@ class Glm4MoeModel(nn.Module):
                 prefix=prefix,
                 enable_eplb=enable_eplb,
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -473,39 +480,38 @@ class Glm4MoeModel(nn.Module):
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
     def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-            "residual":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+                "residual": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
+            num_experts=self.config.n_routed_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -522,7 +528,7 @@ class Glm4MoeModel(nn.Module):
             spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
             if spec_layer is not None:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -532,7 +538,7 @@ class Glm4MoeModel(nn.Module):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
@@ -567,14 +573,17 @@ class Glm4MoeModel(nn.Module):
                     # We should ask the weight loader to return success or not
                     # here since otherwise we may skip experts with other
                     # available replicas.
-                    weight_loader = typing.cast(Callable[..., bool],
-                                                param.weight_loader)
-                    success = weight_loader(param,
-                                            loaded_weight,
-                                            name_mapped,
-                                            shard_id=shard_id,
-                                            expert_id=expert_id,
-                                            return_success=True)
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
                     if success:
                         name = name_mapped
                         break
@@ -598,15 +607,44 @@ class Glm4MoeModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
         return loaded_params
 
 
-class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+class Glm4MixtureOfExperts(MixtureOfExperts):
+    def extract_moe_parameters(self, example_moe: Glm4MoE | None) -> None:
+        if example_moe is None:
+            raise RuntimeError("No Glm4MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -627,27 +665,31 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Glm4MoeModel(vllm_config=vllm_config,
-                                  prefix=maybe_prefix(prefix, "model"))
+        self.model = Glm4MoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         else:
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
         self.expert_weights = []
 
         # Set MoE hyperparameters
-        self.num_moe_layers = (config.num_hidden_layers -
-                               config.first_k_dense_replace)
+        self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace
         self.num_expert_groups = config.n_group
 
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
+        self.moe_mlp_layers: list[Glm4MoE] = []
+
         example_moe = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -657,57 +699,34 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             if isinstance(layer.mlp, Glm4MoE):
                 # Pick last one layer since the first ones may be dense layers.
                 example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
                 self.moe_layers.append(layer.mlp.experts)
 
-        if example_moe is None:
-            raise RuntimeError("No Glm4MoE layer found in model.layers.")
+        self.extract_moe_parameters(example_moe)
 
-        self.num_logical_experts = example_moe.n_logical_experts
-        self.num_physical_experts = example_moe.n_physical_experts
-        self.num_local_physical_experts = example_moe.n_local_physical_experts
-        self.num_routed_experts = example_moe.n_routed_experts
-        self.num_shared_experts = example_moe.n_shared_experts
-        self.num_redundant_experts = example_moe.n_redundant_experts
-
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
@@ -715,13 +734,14 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         return self.model.get_expert_mapping()
 
 
-def get_spec_layer_idx_from_weight_name(config: Glm4MoeConfig,
-                                        weight_name: str) -> Optional[int]:
-    if hasattr(config,
-               "num_nextn_predict_layers") and (config.num_nextn_predict_layers
-                                                > 0):
+def get_spec_layer_idx_from_weight_name(
+    config: Glm4MoeConfig, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if f"layers.{layer_idx+i}." in weight_name:
+            if f"layers.{layer_idx + i}." in weight_name:
                 return layer_idx + i
     return None
diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py
index c572978e622061919016122ebea074ce24a93509..e34ae6c85a4f8f156ecbadc76e5969f715141aec 100644
--- a/vllm/model_executor/models/glm4_moe_mtp.py
+++ b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -24,71 +24,84 @@
 """Inference-only GLM-4.5 MTP model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name
+from .glm4_moe import (
+    Glm4MixtureOfExperts,
+    Glm4MoE,
+    Glm4MoeDecoderLayer,
+    get_spec_layer_idx_from_weight_name,
+)
 from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
 
 class SharedHead(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.head = ParallelLMHead(config.vocab_size,
-                                   config.hidden_size,
-                                   quant_config=quant_config)
+        self.head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "head"),
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return self.norm(hidden_states)
 
 
 class Glm4MoeMultiTokenPredictorLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
         prefix: str,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
     ) -> None:
         super().__init__()
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.eh_proj = nn.Linear(config.hidden_size * 2,
-                                 config.hidden_size,
-                                 bias=False)
-        self.shared_head = SharedHead(config=config, quant_config=quant_config)
-        self.mtp_block = Glm4MoeDecoderLayer(config=config,
-                                             cache_config=cache_config,
-                                             quant_config=quant_config,
-                                             prefix=prefix)
+        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+        self.shared_head = SharedHead(
+            config=config, prefix=prefix, quant_config=quant_config
+        )
+        self.enable_eplb = parallel_config.enable_eplb
+        self.mtp_block = Glm4MoeDecoderLayer(
+            config=config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+            enable_eplb=self.enable_eplb,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
         assert inputs_embeds is not None
@@ -98,51 +111,58 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
         previous_hidden_states = self.hnorm(previous_hidden_states)
 
         hidden_states = self.eh_proj(
-            torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
+            torch.cat([inputs_embeds, previous_hidden_states], dim=-1)
+        )
 
-        hidden_states, residual = self.mtp_block(positions=positions,
-                                                 hidden_states=hidden_states,
-                                                 residual=None)
+        hidden_states, residual = self.mtp_block(
+            positions=positions, hidden_states=hidden_states, residual=None
+        )
         hidden_states = residual + hidden_states
         return hidden_states
 
 
 class Glm4MoeMultiTokenPredictor(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         self.mtp_start_layer_idx = config.num_hidden_layers
         self.num_mtp_layers = config.num_nextn_predict_layers
         # to map the exact layer index from weights
-        self.layers = torch.nn.ModuleDict({
-            str(idx):
-            Glm4MoeMultiTokenPredictorLayer(
-                config,
-                f"{prefix}.layers.{idx}",
-                cache_config=vllm_config.cache_config,
-                quant_config=vllm_config.quant_config,
-            )
-            for idx in range(self.mtp_start_layer_idx,
-                             self.mtp_start_layer_idx + self.num_mtp_layers)
-        })
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): Glm4MoeMultiTokenPredictorLayer(
+                    config,
+                    f"{prefix}.layers.{idx}",
+                    cache_config=vllm_config.cache_config,
+                    quant_config=vllm_config.quant_config,
+                    parallel_config=vllm_config.parallel_config,
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        current_step_idx = spec_step_idx % self.num_mtp_layers
         return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
             input_ids,
             positions,
@@ -156,45 +176,66 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        current_step_idx = (spec_step_idx % self.num_mtp_layers)
-        mtp_layer = self.layers[str(self.mtp_start_layer_idx +
-                                    current_step_idx)]
-        logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       mtp_layer.shared_head(hidden_states))
+        current_step_idx = spec_step_idx % self.num_mtp_layers
+        mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)]
+        logits = self.logits_processor(
+            mtp_layer.shared_head.head, mtp_layer.shared_head(hidden_states)
+        )
         return logits
 
 
-class Glm4MoeMTP(nn.Module, SupportsPP):
-
+class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
-        self.model = Glm4MoeMultiTokenPredictor(vllm_config=vllm_config,
-                                                prefix=maybe_prefix(
-                                                    prefix, "model"))
+        self.model = Glm4MoeMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        self.expert_weights = []
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = self.config.num_nextn_predict_layers
+        self.num_expert_groups = self.config.n_group
+
+        self.moe_layers: list[FusedMoE] = []
+        self.moe_mlp_layers: list[Glm4MoE] = []
+        example_moe = None
+        for layer in self.model.layers.values():
+            assert isinstance(layer, Glm4MoeMultiTokenPredictorLayer)
+            layer = layer.mtp_block
+            assert isinstance(layer, Glm4MoeDecoderLayer)
+            if isinstance(layer.mlp, Glm4MoE):
+                example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
+                self.moe_layers.append(layer.mlp.experts)
+        self.extract_moe_parameters(example_moe)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, hidden_states,
-                                   inputs_embeds, spec_step_idx)
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.model.compute_logits(hidden_states, spec_step_idx)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -210,16 +251,23 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
+            num_experts=self.config.n_routed_experts,
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
-            if spec_layer is None:
-                continue
-            name = self._rewrite_spec_layer_name(spec_layer, name)
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            if name == "lm_head.weight":
+                spec_layer = self.model.mtp_start_layer_idx
+                name = f"model.layers.{spec_layer}.shared_head.head.weight"
+            elif name == "model.embed_tokens.weight":
+                spec_layer = self.model.mtp_start_layer_idx
+            else:
+                spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+                if spec_layer is None:
+                    continue
+                name = self._rewrite_spec_layer_name(spec_layer, name)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -229,7 +277,7 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
@@ -249,11 +297,13 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
 
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -262,13 +312,16 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
 
                     # According to DeepSeek-V3 Technical Report, MTP modules
                     # shares embedding layer. We only load the first weights.
-                    if (spec_layer != self.model.mtp_start_layer_idx
-                            and ".layers" not in name):
+                    if (
+                        spec_layer != self.model.mtp_start_layer_idx
+                        and ".layers" not in name
+                    ):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -280,7 +333,11 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
         and rename shared layer weights to be top level.
         """
         spec_layer_weight_names = [
-            "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head"
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
         ]
         shared_weight_names = ["embed_tokens"]
         spec_layer_weight = False
@@ -293,8 +350,9 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
                 break
         if not spec_layer_weight:
             # treat rest weights as weights for transformer layer block
-            name = name.replace(f"model.layers.{spec_layer}.",
-                                f"model.layers.{spec_layer}.mtp_block.")
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
         elif shared_weight:
             # treat shared weights as top level weights
             name = name.replace(f"model.layers.{spec_layer}.", "model.")
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index bf33575859aeae620228aac149007c65c9092068..1c18ea0745f2bb006cd85fd592ca88db94a6c969 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -4,9 +4,11 @@
 # Adapted from
 # https://github.com/zai-org/CogAgent
 """Inference-only CogAgent model compatible with THUDM weights."""
+
+import itertools
 from argparse import Namespace
 from collections.abc import Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -19,31 +21,45 @@ from transformers.tokenization_utils_base import TextInput
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import ChatGLMConfig
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .chatglm import ChatGLMBaseModel, ChatGLMModel
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import flatten_bn, merge_multimodal_embeddings
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
 
 
 class GLMVImagePixelInputs(TensorSchema):
@@ -54,21 +70,22 @@ class GLMVImagePixelInputs(TensorSchema):
         - h: Height of image
         - w: Width of image
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
 
 
 class EVA2CLIPPatchEmbedding(nn.Module):
-
     def __init__(self, config):
         super().__init__()
-        self.proj = nn.Conv2d(config.in_channels,
-                              config.hidden_size,
-                              kernel_size=config.patch_size,
-                              stride=config.patch_size)
+        self.proj = nn.Conv2d(
+            config.in_channels,
+            config.hidden_size,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+        )
         self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size))
-        self.position_embedding = nn.Embedding(config.num_positions,
-                                               config.hidden_size)
+        self.position_embedding = nn.Embedding(config.num_positions, config.hidden_size)
 
     def forward(self, images: torch.Tensor) -> torch.Tensor:
         """
@@ -80,8 +97,7 @@ class EVA2CLIPPatchEmbedding(nn.Module):
         torch.Tensor
             Transformed tensor with shape (B, L, D)
         """
-        images = images.to(device=self.proj.weight.device,
-                           dtype=self.proj.weight.dtype)
+        images = images.to(device=self.proj.weight.device, dtype=self.proj.weight.dtype)
         x = self.proj(images)
         x = x.flatten(2).transpose(1, 2)
         cls_token = self.cls_embedding.expand(x.shape[0], -1, -1)
@@ -91,12 +107,11 @@ class EVA2CLIPPatchEmbedding(nn.Module):
 
 
 class EVA2CLIPAttention(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = '',
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -119,8 +134,9 @@ class EVA2CLIPAttention(nn.Module):
             prefix=f"{prefix}.dense",
         )
 
-        self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim,
-                                       self.scale)
+        self.attn = MultiHeadAttention(
+            self.num_heads_per_rank, self.head_dim, self.scale
+        )
         self.output_dropout = torch.nn.Dropout(config.dropout_prob)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -134,12 +150,11 @@ class EVA2CLIPAttention(nn.Module):
 
 
 class EVA2CLIPMLP(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = '',
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.config = config
@@ -165,29 +180,27 @@ class EVA2CLIPMLP(nn.Module):
 
 
 class EVA2CLIPTransformerLayer(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = '',
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
-        self.input_layernorm = LayerNorm(config.hidden_size,
-                                         eps=config.layer_norm_eps)
-        self.attention = EVA2CLIPAttention(config,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.attention")
-        self.mlp = EVA2CLIPMLP(config,
-                               quant_config=quant_config,
-                               prefix=f"{prefix}.mlp")
-        self.post_attention_layernorm = LayerNorm(config.hidden_size,
-                                                  eps=config.layer_norm_eps)
+        self.input_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = EVA2CLIPAttention(
+            config, quant_config=quant_config, prefix=f"{prefix}.attention"
+        )
+        self.mlp = EVA2CLIPMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+        self.post_attention_layernorm = LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
 
     def forward(self, hidden_states):
         attention_input = hidden_states
-        attention_output = self.input_layernorm(
-            self.attention(attention_input))
+        attention_output = self.input_layernorm(self.attention(attention_input))
         hidden_states = attention_input + attention_output
         mlp_input = hidden_states
         mlp_output = self.post_attention_layernorm(self.mlp(mlp_input))
@@ -196,20 +209,23 @@ class EVA2CLIPTransformerLayer(nn.Module):
 
 
 class EVA2CLIPTransformer(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = '',
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
-        self.layers = nn.ModuleList([
-            EVA2CLIPTransformerLayer(config,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.layers.{layer_idx}")
-            for layer_idx in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                EVA2CLIPTransformerLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(self, hidden_states):
         for layer_module in self.layers:
@@ -218,13 +234,12 @@ class EVA2CLIPTransformer(nn.Module):
 
 
 class EVA2CLIPGLU(nn.Module):
-
     def __init__(
         self,
         config,
         in_features,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = '',
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         """
         The original implementation is the same as:
@@ -233,14 +248,14 @@ class EVA2CLIPGLU(nn.Module):
             config.hidden_size,
             config.ffn_hidden_size,
             bias=False,
-            quant_config=quant_config
+            quant_config=quant_config,
         )
 
         self.gate_proj = ColumnParallelLinear(
             config.hidden_size,
             config.ffn_hidden_size,
             bias=False,
-            quant_config=quant_config
+            quant_config=quant_config,
         )
         ```
         ```
@@ -255,7 +270,7 @@ class EVA2CLIPGLU(nn.Module):
             config.hidden_size,
             [config.ffn_hidden_size] * 2,
             bias=False,
-            quant_config=quant_config
+            quant_config=quant_config,
         )
         ```
         ```
@@ -263,27 +278,32 @@ class EVA2CLIPGLU(nn.Module):
         ```
         """
         super().__init__()
-        self.linear_proj = ReplicatedLinear(in_features,
-                                            config.hidden_size,
-                                            bias=False,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.linear_proj")
+        self.linear_proj = ReplicatedLinear(
+            in_features,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_proj",
+        )
         self.norm1 = nn.LayerNorm(config.hidden_size)
         self.act1 = nn.GELU()
         self.act2 = SiluAndMul()
 
         self.merged_proj = MergedColumnParallelLinear(
-            config.hidden_size, [config.ffn_hidden_size] * 2,
+            config.hidden_size,
+            [config.ffn_hidden_size] * 2,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.merged_proj")
+            prefix=f"{prefix}.merged_proj",
+        )
 
         self.dense_4h_to_h = RowParallelLinear(
             config.ffn_hidden_size,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.dense_4h_to_h")
+            prefix=f"{prefix}.dense_4h_to_h",
+        )
 
     def forward(self, x):
         x, _ = self.linear_proj(x)
@@ -295,27 +315,30 @@ class EVA2CLIPGLU(nn.Module):
 
 
 class EVA2CLIPModel(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = '',
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         vision_config = Namespace(**config.vision_config)
         self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config)
-        self.transformer = EVA2CLIPTransformer(vision_config,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.transformer")
-        self.linear_proj = EVA2CLIPGLU(config,
-                                       in_features=config.hidden_size,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.linear_proj")
-        self.conv = nn.Conv2d(in_channels=vision_config.hidden_size,
-                              out_channels=config.hidden_size,
-                              kernel_size=2,
-                              stride=2)
+        self.transformer = EVA2CLIPTransformer(
+            vision_config, quant_config=quant_config, prefix=f"{prefix}.transformer"
+        )
+        self.linear_proj = EVA2CLIPGLU(
+            config,
+            in_features=config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_proj",
+        )
+        self.conv = nn.Conv2d(
+            in_channels=vision_config.hidden_size,
+            out_channels=config.hidden_size,
+            kernel_size=2,
+            stride=2,
+        )
         self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
         self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
         self.scaling_factor = vision_config.scaling_factor
@@ -349,15 +372,14 @@ class EVA2CLIPModel(nn.Module):
 
 
 class GLM4VModel(ChatGLMModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
         quant_config = vllm_config.quant_config
 
-        self.vision = EVA2CLIPModel(self.config,
-                                    quant_config,
-                                    prefix=f"{prefix}.vision")
+        self.vision = EVA2CLIPModel(
+            self.config, quant_config, prefix=f"{prefix}.vision"
+        )
 
 
 class GLM4VProcessor:
@@ -379,23 +401,25 @@ class GLM4VProcessor:
         vision_config = config.vision_config
         image_size = vision_config["image_size"]
 
-        self.image_transform = transforms.Compose([
-            transforms.Resize(
-                (image_size, image_size),
-                interpolation=InterpolationMode.BICUBIC,
-            ),
-            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=(0.48145466, 0.4578275, 0.40821073),
-                std=(0.26862954, 0.26130258, 0.27577711),
-            ),
-        ])
+        self.image_transform = transforms.Compose(
+            [
+                transforms.Resize(
+                    (image_size, image_size),
+                    interpolation=InterpolationMode.BICUBIC,
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=(0.48145466, 0.4578275, 0.40821073),
+                    std=(0.26862954, 0.26130258, 0.27577711),
+                ),
+            ]
+        )
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -424,7 +448,6 @@ class GLM4VProcessor:
 
 
 class GLM4VProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(ChatGLMConfig)
 
@@ -436,7 +459,7 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
             **kwargs,
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
@@ -454,7 +477,6 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
 
 
 class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -466,6 +488,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
@@ -473,16 +496,19 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
         target_width = target_height = vision_config["image_size"]
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
-
     def _hf_processor_applies_updates(
         self,
         prompt_text: str,
@@ -526,16 +552,20 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
-                                        info=GLM4VProcessingInfo,
-                                        dummy_inputs=GLM4VDummyInputsBuilder)
-class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
-                       SupportsMultiModal):
+@MULTIMODAL_REGISTRY.register_processor(
+    GLM4VMultiModalProcessor,
+    info=GLM4VProcessingInfo,
+    dummy_inputs=GLM4VDummyInputsBuilder,
+)
+class GLM4VForCausalLM(
+    ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"],
-        "merged_proj": ["gate_proj", "dense_h_to_4h"]
+        "merged_proj": ["gate_proj", "dense_h_to_4h"],
     }
 
     def get_mm_mapping(self) -> MultiModelKeys:
@@ -545,10 +575,11 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
         return MultiModelKeys.from_string_field(
             language_model="transformer.encoder",
             connector="transformer.vision.linear_proj",
-            tower_model="transformer.vision.transformer")
+            tower_model="transformer.vision.transformer",
+        )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|begin_of_image|><|endoftext|><|end_of_image|>"
 
@@ -570,36 +601,164 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
         self.transformer: GLM4VModel
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
+        self, **kwargs: object
+    ) -> GLMVImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             expected_h = expected_w = self.config.vision_config["image_size"]
-            return GLMVImagePixelInputs(type="pixel_values",
-                                        data=flatten_bn(pixel_values,
-                                                        concat=True),
-                                        resolve_bindings={
-                                            "h": expected_h,
-                                            "w": expected_w
-                                        })
+            return GLMVImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                resolve_bindings={"h": expected_h, "w": expected_w},
+            )
 
         return None
 
-    def _process_image_input(
-            self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
-        pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
+    def _process_image_input(self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
+        pixel_values = image_input["data"].to(dtype=self.config.dtype)
 
         return self.transformer.vision(pixel_values)
 
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        hf_config = self.config
+        image_token_id = hf_config.image_token_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        llm_pos_ids_list: list = []
+
+        if image_grid_thw or video_grid_thw:
+            input_token_type: list[str] = []
+            video_check_flg = False
+            for token in input_tokens:
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if (token == image_token_id) and (video_check_flg is False):
+                    input_token_type.append("image")
+                elif (token == image_token_id) and (video_check_flg is True):
+                    input_token_type.append("video")
+                else:
+                    input_token_type.append("text")
+
+            input_type_group: list[tuple[str, int, int]] = []
+            for key, group_iter in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group_list = list(group_iter)
+                start_index = group_list[0][0]
+                end_index = group_list[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            video_frame_num = 1
+            mm_data_idx = 0
+            for modality_type, start_idx, end_idx in input_type_group:
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                if modality_type == "image":
+                    t, h, w = image_grid_thw[mm_data_idx]
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_merge_size,
+                        w // spatial_merge_size,
+                    )
+
+                    t_index = (
+                        torch.arange(llm_grid_t)
+                        .view(-1, 1)
+                        .expand(-1, llm_grid_h * llm_grid_w)
+                        .flatten()
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    mm_data_idx += 1
+
+                elif modality_type == "video":
+                    t, h, w = (
+                        video_frame_num,
+                        *image_grid_thw[mm_data_idx][1:],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_merge_size,
+                        w // spatial_merge_size,
+                    )
+
+                    for t_idx in range(llm_grid_t):
+                        t_index = (
+                            torch.tensor(t_idx)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(1, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                    mm_data_idx += 1
+                    video_frame_num += 1
+
+                else:
+                    text_len = end_idx - start_idx
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    video_frame_num = 1
+
+        else:
+            text_len = len(input_tokens)
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return llm_positions, mrope_position_delta
+
     def get_language_model(self) -> torch.nn.Module:
         return self.transformer
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    embed_input_ids = SupportsMultiModal.embed_input_ids
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
@@ -607,48 +766,19 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.transformer.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                multimodal_embeddings=multimodal_embeddings,
-                placeholder_token_id=[
-                    self.config.boi_token_id,
-                    self.config.pad_token_id,
-                    self.config.eoi_token_id,
-                ],
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 24274db148bd778bad603bcdb1a012caa1957b84..a5e8131c7fba96bf7131e37c3eeac53171d9c83b 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -19,9 +19,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-2 model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -31,39 +31,47 @@ from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed.parallel_state import (
-    get_pp_group, get_tensor_model_parallel_world_size)
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from ..layers.pooler import DispatchPooler, Pooler
-from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import SupportsCrossEncoding, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GPT2Attention(nn.Module):
-
     def __init__(
         self,
         config: GPT2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert total_num_heads % tensor_model_parallel_world_size == 0
         self.num_heads = total_num_heads // tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // total_num_heads
@@ -84,12 +92,14 @@ class GPT2Attention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scale=self.scale,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scale=self.scale,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -103,12 +113,11 @@ class GPT2Attention(nn.Module):
 
 
 class GPT2MLP(nn.Module):
-
     def __init__(
         self,
         intermediate_size: int,
         config: GPT2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -137,29 +146,23 @@ class GPT2MLP(nn.Module):
 
 
 class GPT2Block(nn.Module):
-
     def __init__(
         self,
         config: GPT2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
-                     hidden_size)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPT2Attention(config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.attn")
+        self.attn = GPT2Attention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPT2MLP(inner_dim,
-                           config,
-                           quant_config,
-                           prefix=f"{prefix}.mlp")
+        self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -181,7 +184,6 @@ class GPT2Block(nn.Module):
 
 @support_torch_compile
 class GPT2Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -194,34 +196,36 @@ class GPT2Model(nn.Module):
         assert not config.scale_attn_by_inverse_layer_idx
         assert not config.reorder_and_upcast_attn
         self.embed_dim = config.hidden_size
-        self.wte = VocabParallelEmbedding(config.vocab_size,
-                                          self.embed_dim,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.wte")
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            self.embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.wte",
+        )
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
-            lambda prefix: GPT2Block(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.h")
+            lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix),
+            prefix=f"{prefix}.h",
+        )
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.n_embd))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.n_embd
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor],
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings(input_ids)
+                inputs_embeds = self.embed_input_ids(input_ids)
             position_embeds = self.wpe(position_ids)
             hidden_states = inputs_embeds + position_embeds
         else:
@@ -237,8 +241,7 @@ class GPT2Model(nn.Module):
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -260,64 +263,65 @@ class GPT2Model(nn.Module):
                 if not name.endswith(".weight"):
                     continue
                 loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class GPT2LMHeadModel(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = GPT2Model(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(
-                                         prefix, "transformer"))
-        self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                      self.config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.lm_head")
+        self.transformer = GPT2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.lm_head",
+        )
         if self.config.tie_word_embeddings:
             self.lm_head = self.lm_head.tie_weights(self.transformer.wte)
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         weights = _add_transformer_prefix(weights)
         return loader.load_weights(weights)
 
 
-class GPT2ForSequenceClassification(nn.Module):
+class GPT2ForSequenceClassification(nn.Module, SupportsCrossEncoding):
     """GPT2 Model for sequence classification.
 
     This class expands GPT2Model with pooling and score functions - last token
@@ -334,22 +338,35 @@ class GPT2ForSequenceClassification(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        self.transformer = GPT2Model(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "gpt2"))
-        self.score = nn.Linear(config.n_embd,
-                               config.num_labels,
-                               bias=False,
-                               dtype=vllm_config.model_config.head_dtype)
+        self.transformer = GPT2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "gpt2")
+        )
+        self.score = nn.Linear(
+            config.n_embd,
+            config.num_labels,
+            bias=False,
+            dtype=vllm_config.model_config.head_dtype,
+        )
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            Pooler.for_classify(pooler_config, classifier=self.score),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.score
+                ),
+                "classify": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="classify"
+                ),
+                "score": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="score"
+                ),
+            }
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
@@ -359,22 +376,22 @@ class GPT2ForSequenceClassification(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(
             input_ids=input_ids,
             position_ids=positions,
             inputs_embeds=inputs_embeds,
-            intermediate_tensors=intermediate_tensors)
+            intermediate_tensors=intermediate_tensors,
+        )
         return hidden_states
 
 
 def _add_transformer_prefix(
-    weights: Iterable[tuple[str, torch.Tensor]]
+    weights: Iterable[tuple[str, torch.Tensor]],
 ) -> Iterable[tuple[str, torch.Tensor]]:
     for name, tensor in weights:
-        if not name.startswith('transformer.') and not name.startswith(
-                "lm_head"):
-            name = 'transformer.' + name
+        if not name.startswith("transformer.") and not name.startswith("lm_head"):
+            name = "transformer." + name
         yield name, tensor
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 162018450e7c05a142d9ef9d5cfa2e6eae726397..cdf038ba25c9221b9b0296cad6634efb442fa5ba 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -20,9 +20,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPTBigCode model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -33,39 +33,44 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GPTBigCodeAttention(nn.Module):
-
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         total_num_heads = config.num_attention_heads
-        self.tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        self.tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert total_num_heads % self.tensor_model_parallel_world_size == 0
-        self.num_heads = (total_num_heads //
-                          self.tensor_model_parallel_world_size)
+        self.num_heads = total_num_heads // self.tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // total_num_heads
         self.scale = self.head_dim**-0.5
 
@@ -94,13 +99,15 @@ class GPTBigCodeAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scale=self.scale,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scale=self.scale,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -110,7 +117,8 @@ class GPTBigCodeAttention(nn.Module):
         q, k, v = qkv.split(
             [
                 self.hidden_size // self.tensor_model_parallel_world_size,
-                self.kv_dim, self.kv_dim
+                self.kv_dim,
+                self.kv_dim,
             ],
             dim=-1,
         )
@@ -120,12 +128,11 @@ class GPTBigCodeAttention(nn.Module):
 
 
 class GPTBigMLP(nn.Module):
-
     def __init__(
         self,
         intermediate_size: int,
         config: GPTBigCodeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -154,29 +161,23 @@ class GPTBigMLP(nn.Module):
 
 
 class GPTBigCodeBlock(nn.Module):
-
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
-                     hidden_size)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPTBigCodeAttention(config,
-                                        cache_config,
-                                        quant_config,
-                                        prefix=f"{prefix}.attn")
+        self.attn = GPTBigCodeAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPTBigMLP(inner_dim,
-                             config,
-                             quant_config,
-                             prefix=f"{prefix}.mlp")
+        self.mlp = GPTBigMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -184,7 +185,9 @@ class GPTBigCodeBlock(nn.Module):
     ) -> torch.Tensor:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
-        attn_output = self.attn(hidden_states=hidden_states, )
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+        )
         # residual connection
         hidden_states = attn_output + residual
 
@@ -198,50 +201,48 @@ class GPTBigCodeBlock(nn.Module):
 
 @support_torch_compile
 class GPTBigCodeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         assert not config.add_cross_attention
 
         self.embed_dim = config.hidden_size
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.wte = VocabParallelEmbedding(self.vocab_size,
-                                          self.embed_dim,
-                                          org_num_embeddings=config.vocab_size)
+
+        self.vocab_size = config.vocab_size
+        self.wte = VocabParallelEmbedding(
+            self.vocab_size, self.embed_dim, org_num_embeddings=config.vocab_size
+        )
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
             lambda prefix: GPTBigCodeBlock(
-                config, cache_config, quant_config, prefix=prefix),
+                config, cache_config, quant_config, prefix=prefix
+            ),
             prefix=f"{prefix}.h",
         )
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.n_embd))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.n_embd
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings(input_ids)
+                inputs_embeds = self.embed_input_ids(input_ids)
             hidden_states = inputs_embeds + self.wpe(position_ids)
         else:
             hidden_states = intermediate_tensors["hidden_states"]
@@ -254,8 +255,7 @@ class GPTBigCodeModel(nn.Module):
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -266,13 +266,12 @@ class GPTBigCodeModel(nn.Module):
             if is_pp_missing_parameter(name, self):
                 continue
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method
             if "c_attn.input_scale" in name:
-                weight_loader(param, loaded_weight, 'q')
-                weight_loader(param, loaded_weight, 'k')
-                weight_loader(param, loaded_weight, 'v')
+                weight_loader(param, loaded_weight, "q")
+                weight_loader(param, loaded_weight, "k")
+                weight_loader(param, loaded_weight, "v")
             else:
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
@@ -286,54 +285,50 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.transformer = GPTBigCodeModel(vllm_config=vllm_config,
-                                           prefix=maybe_prefix(
-                                               prefix, "transformer"))
+        self.transformer = GPTBigCodeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         if self.config.tie_word_embeddings:
             self.lm_head = self.transformer.wte
         else:
             self.lm_head = ParallelLMHead(
                 self.transformer.vocab_size,
                 self.transformer.embed_dim,
-                org_num_embeddings=self.config.vocab_size,
-                prefix=maybe_prefix(prefix, "lm_head"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = None
         if self.config.tie_word_embeddings:
             skip_prefixes = ["lm_head."]
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 698387fab946c42cf243a2a18341684d0b7ee948..e416ecde0c1e0496644af114a991f00a525675ba 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -18,9 +18,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-J model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -31,31 +31,40 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GPTJAttention(nn.Module):
-
     def __init__(
         self,
         config: GPTJConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -69,12 +78,14 @@ class GPTJAttention(nn.Module):
             self.total_num_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.out_proj = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
 
         tp_world_size = get_tensor_model_parallel_world_size()
@@ -85,8 +96,7 @@ class GPTJAttention(nn.Module):
         assert getattr(config, "rotary", True)
         assert config.rotary_dim % 2 == 0
         rope_theta = getattr(config, "rope_theta", 10000)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             self.head_size,
             rotary_dim=config.rotary_dim,
@@ -94,12 +104,14 @@ class GPTJAttention(nn.Module):
             base=rope_theta,
             is_neox_style=False,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_size,
-                              scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_size,
+            scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -115,12 +127,12 @@ class GPTJAttention(nn.Module):
 
 
 class GPTJMLP(nn.Module):
-
     def __init__(
         self,
         intermediate_size: int,
         config: GPTJConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.n_embd
@@ -128,11 +140,13 @@ class GPTJMLP(nn.Module):
             hidden_size,
             intermediate_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc_in",
         )
         self.fc_out = RowParallelLinear(
             intermediate_size,
             hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc_out",
         )
         self.act = get_act_fn(config.activation_function)
 
@@ -144,23 +158,20 @@ class GPTJMLP(nn.Module):
 
 
 class GPTJBlock(nn.Module):
-
     def __init__(
         self,
         config: GPTJConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
-        inner_dim = (4 * config.n_embd
-                     if config.n_inner is None else config.n_inner)
+        inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner
         self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-        self.attn = GPTJAttention(config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.attn")
-        self.mlp = GPTJMLP(inner_dim, config, quant_config)
+        self.attn = GPTJAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
+        self.mlp = GPTJMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -180,7 +191,6 @@ class GPTJBlock(nn.Module):
 
 @support_torch_compile
 class GPTJModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -197,30 +207,29 @@ class GPTJModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.h = make_layers(
             config.n_layer,
-            lambda prefix: GPTJBlock(
-                config, cache_config, quant_config, prefix=prefix),
+            lambda prefix: GPTJBlock(config, cache_config, quant_config, prefix=prefix),
             prefix=f"{prefix}.h",
         )
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.n_embd))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.n_embd
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             hidden_states = intermediate_tensors["hidden_states"]
         for layer in islice(self.h, self.start_layer, self.end_layer):
@@ -230,8 +239,7 @@ class GPTJModel(nn.Module):
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -246,19 +254,20 @@ class GPTJModel(nn.Module):
             if "attn.bias" in name or "attn.masked_bias" in name:
                 continue
 
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -281,15 +290,13 @@ class GPTJModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class GPTJForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -297,9 +304,9 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
         self.config = config
         self.quant_config = quant_config
         assert not config.tie_word_embeddings
-        self.transformer = GPTJModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(
-                                         prefix, "transformer"))
+        self.transformer = GPTJModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         self.lm_head = ParallelLMHead(
             config.vocab_size,
             config.n_embd,
@@ -309,31 +316,31 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       self.lm_head.bias)
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states, self.lm_head.bias)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 7570aefb6e96e1917fdeb7866963f6ef5b5ee77d..af0c9209231cb8b84060943fa5620a4751933781 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -18,9 +18,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-NeoX model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -31,30 +31,37 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GPTNeoXAttention(nn.Module):
-
     def __init__(
         self,
         config: GPTNeoXConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -63,11 +70,9 @@ class GPTNeoXAttention(nn.Module):
         self.head_size = self.hidden_size // self.total_num_heads
         self.bias = getattr(config, "attention_bias", True)
 
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
 
         self.query_key_value = QKVParallelLinear(
             config.hidden_size,
@@ -75,31 +80,34 @@ class GPTNeoXAttention(nn.Module):
             self.total_num_heads,
             bias=self.bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
             bias=self.bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
         scaling = self.head_size**-0.5
         rotary_dim = int(self.head_size * config.rotary_pct)
         assert rotary_dim % 2 == 0
         rope_theta = getattr(config, "rope_theta", 10000)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             self.head_size,
             rotary_dim=rotary_dim,
             max_position=max_position_embeddings,
             base=rope_theta,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_size,
-                              scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_size,
+            scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -115,22 +123,24 @@ class GPTNeoXAttention(nn.Module):
 
 
 class GPTNeoXMLP(nn.Module):
-
     def __init__(
         self,
         config: GPTNeoXConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
         self.dense_4h_to_h = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -142,24 +152,24 @@ class GPTNeoXMLP(nn.Module):
 
 
 class GPTNeoXLayer(nn.Module):
-
     def __init__(
         self,
         config: GPTNeoXConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.use_parallel_residual = config.use_parallel_residual
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.layer_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=config.layer_norm_eps)
-        self.attention = GPTNeoXAttention(config,
-                                          cache_config,
-                                          quant_config,
-                                          prefix=f"{prefix}.attention")
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.attention = GPTNeoXAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attention"
+        )
         self.mlp = GPTNeoXMLP(config, quant_config)
 
     def forward(
@@ -192,7 +202,6 @@ class GPTNeoXLayer(nn.Module):
 
 @support_torch_compile
 class GPTNeoXModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -209,30 +218,32 @@ class GPTNeoXModel(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: GPTNeoXLayer(
-                config, cache_config, quant_config, prefix=prefix),
+                config, cache_config, quant_config, prefix=prefix
+            ),
             prefix=f"{prefix}.layers",
         )
-        self.final_layer_norm = nn.LayerNorm(config.hidden_size,
-                                             eps=config.layer_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.final_layer_norm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_in(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             hidden_states = intermediate_tensors["hidden_states"]
         for layer in islice(self.layers, self.start_layer, self.end_layer):
@@ -242,16 +253,17 @@ class GPTNeoXModel(nn.Module):
         hidden_states = self.final_layer_norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if ("attention.bias" in name or "attention.masked_bias" in name
-                    or "rotary_emb.inv_freq" in name):
+            if (
+                "attention.bias" in name
+                or "attention.masked_bias" in name
+                or "rotary_emb.inv_freq" in name
+            ):
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using OpenRLHF may include
                 # these tensors in the checkpoint. Skip them.
                 continue
@@ -269,62 +281,64 @@ class GPTNeoXModel(nn.Module):
                 if output_dim is not None:
                     loaded_weight_shape = loaded_weight.shape
                     loaded_weight = loaded_weight.view(
-                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
-                        loaded_weight_shape[output_dim + 1:])
-                    loaded_weight = loaded_weight.transpose(
-                        output_dim, output_dim + 1)
+                        loaded_weight_shape[:output_dim]
+                        + (num_heads, 3, -1)
+                        + loaded_weight_shape[output_dim + 1 :]
+                    )
+                    loaded_weight = loaded_weight.transpose(output_dim, output_dim + 1)
                     loaded_weight = loaded_weight.reshape(loaded_weight_shape)
 
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class GPTNeoXForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.gpt_neox = GPTNeoXModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "gpt_neox"))
+        self.gpt_neox = GPTNeoXModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "gpt_neox")
+        )
         self.embed_out = ParallelLMHead(
             config.vocab_size,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "embed_out"),
         )
         if self.config.tie_word_embeddings:
             self.embed_out.weight = self.gpt_neox.embed_in.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.gpt_neox.make_empty_intermediate_tensors)
+            self.gpt_neox.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.gpt_neox.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.gpt_neox.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.gpt_neox(input_ids, positions,
-                                      intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.gpt_neox(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.embed_out, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 47ba5084d6083d4d5560ca400de2430803ffe874..7df3b087ccb881f0025ab512e808b48828ad4f9c 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -11,38 +10,50 @@ from transformers import GptOssConfig
 from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.distributed import (
+    get_dp_group,
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.utils import rocm_unquantized_gemm
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
-from .interfaces import SupportsEagle3, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class OAIAttention(nn.Module):
-
     def __init__(
         self,
         config: GptOssConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -59,16 +70,13 @@ class OAIAttention(nn.Module):
             base=config.rope_theta,
             dtype=torch.float32,
             rope_scaling={
-                "rope_type":
-                "yarn",
-                "factor":
-                config.rope_scaling["factor"],
-                "original_max_position_embeddings":
-                config.rope_scaling["original_max_position_embeddings"],
-                "beta_fast":
-                config.rope_scaling["beta_fast"],
-                "beta_slow":
-                config.rope_scaling["beta_slow"],
+                "rope_type": "yarn",
+                "factor": config.rope_scaling["factor"],
+                "original_max_position_embeddings": config.rope_scaling[
+                    "original_max_position_embeddings"
+                ],
+                "beta_fast": config.rope_scaling["beta_fast"],
+                "beta_slow": config.rope_scaling["beta_slow"],
             },
             is_neox_style=True,
         )
@@ -76,15 +84,15 @@ class OAIAttention(nn.Module):
         tp_size = get_tensor_model_parallel_world_size()
 
         self.sinks = torch.nn.Parameter(
-            torch.empty(config.num_attention_heads // tp_size,
-                        requires_grad=False))
+            torch.empty(config.num_attention_heads // tp_size, requires_grad=False)
+        )
 
         self.q_size = self.num_attention_heads * self.head_dim // tp_size
         self.kv_size = self.num_key_value_heads * self.head_dim // tp_size
         self.scaling = self.head_dim**-0.5
         self.rope_theta = config.rope_theta
 
-        self.qkv = QKVParallelLinear(
+        self.qkv_proj = QKVParallelLinear(
             hidden_size=self.hidden_size,
             head_size=self.head_dim,
             total_num_heads=self.num_attention_heads,
@@ -104,8 +112,7 @@ class OAIAttention(nn.Module):
         self.num_local_key_value_heads = config.num_key_value_heads // tp_size
 
         # Only apply sliding window to every other layer
-        sliding_window = (config.sliding_window if self.layer_idx %
-                          2 == 0 else None)
+        sliding_window = config.sliding_window if self.layer_idx % 2 == 0 else None
         self.attn = Attention(
             self.num_local_attention_heads,
             self.head_dim,
@@ -119,9 +126,10 @@ class OAIAttention(nn.Module):
             sinks=self.sinks,
         )
 
-    def forward(self, hidden_states: torch.Tensor,
-                positions: torch.Tensor) -> torch.Tensor:
-        qkv, _ = self.qkv(hidden_states)
+    def forward(
+        self, hidden_states: torch.Tensor, positions: torch.Tensor
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
         v = v.contiguous()
@@ -131,7 +139,6 @@ class OAIAttention(nn.Module):
 
 
 class MLPBlock(torch.nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -148,30 +155,37 @@ class MLPBlock(torch.nn.Module):
 
         self.layer_idx = layer_idx
         self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
         self.experts_per_token = config.num_experts_per_tok
         self.world_size = dist.get_world_size() if dist.is_initialized() else 1
-        self.router = torch.nn.Linear(config.hidden_size,
-                                      config.num_local_experts)
+        self.router = torch.nn.Linear(config.hidden_size, config.num_local_experts)
         assert config.intermediate_size % self.world_size == 0
-        self.experts = FusedMoE(num_experts=config.num_local_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.intermediate_size,
-                                reduce_results=True,
-                                renormalize=True,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts",
-                                apply_router_weight_on_input=False,
-                                has_bias=True,
-                                activation="swigluoai",
-                                is_sequence_parallel=self.is_sequence_parallel)
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            reduce_results=True,
+            renormalize=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            apply_router_weight_on_input=False,
+            has_bias=True,
+            activation="swigluoai",
+            is_sequence_parallel=self.is_sequence_parallel,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         num_tokens = x.shape[0]
         if self.is_sequence_parallel:
             x = sequence_parallel_chunk(x)
 
-        g = self.router(x)
+        if current_platform.is_rocm():
+            g = rocm_unquantized_gemm(
+                self, x[:, : self.hidden_size], self.router.weight, self.router.bias
+            )
+        else:
+            g = self.router(x)
         x = self.experts(hidden_states=x, router_logits=g)
 
         if self.is_sequence_parallel:
@@ -181,10 +195,10 @@ class MLPBlock(torch.nn.Module):
 
 
 class TransformerBlock(torch.nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
+        quant_config: QuantizationConfig,
         prefix: str = "",
     ):
         super().__init__()
@@ -193,12 +207,13 @@ class TransformerBlock(torch.nn.Module):
         cache_config = vllm_config.cache_config
 
         self.layer_idx = extract_layer_index(prefix)
-        self.attn = OAIAttention(config,
-                                 prefix=f"{prefix}.attn",
-                                 cache_config=cache_config)
-        self.mlp = MLPBlock(vllm_config,
-                            self.layer_idx,
-                            prefix=f"{prefix}.mlp")
+        self.attn = OAIAttention(
+            config,
+            prefix=f"{prefix}.attn",
+            quant_config=quant_config,
+            cache_config=cache_config,
+        )
+        self.mlp = MLPBlock(vllm_config, self.layer_idx, prefix=f"{prefix}.mlp")
         self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
         self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
 
@@ -206,26 +221,24 @@ class TransformerBlock(torch.nn.Module):
         self,
         hidden_states: torch.Tensor,
         positions: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.attn(hidden_states, positions)
+
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         output = self.mlp(hidden_states)
         return output, residual
 
 
 @support_torch_compile
 class GptOssModel(nn.Module):
-
     def __init__(
         self,
         *,
@@ -234,6 +247,7 @@ class GptOssModel(nn.Module):
     ):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
+        self.quant_config = vllm_config.quant_config
         self.parallel_config = vllm_config.parallel_config
         self.config.hidden_size = self.config.hidden_size
         self.embedding = VocabParallelEmbedding(
@@ -245,30 +259,31 @@ class GptOssModel(nn.Module):
             lambda prefix: TransformerBlock(
                 vllm_config,
                 prefix=prefix,
+                quant_config=self.quant_config,
             ),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(self.config.hidden_size, eps=1e-5)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], self.config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], self.config.hidden_size
+        )
         self.aux_hidden_state_layers = tuple[int, ...]()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embedding(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 x = inputs_embeds
             else:
-                x = self.get_input_embeddings(input_ids)
+                x = self.embed_input_ids(input_ids)
 
             residual = None
         else:
@@ -280,14 +295,10 @@ class GptOssModel(nn.Module):
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
             if i in self.aux_hidden_state_layers:
-                aux_hidden_states.append(x if residual is None else x +
-                                         residual)
+                aux_hidden_states.append(x if residual is None else x + residual)
             x, residual = layer(x, positions, residual)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": x,
-                "residual": residual
-            })
+            return IntermediateTensors({"hidden_states": x, "residual": residual})
         x, _ = self.norm(x, residual)
 
         if len(aux_hidden_states) > 0:
@@ -310,46 +321,44 @@ class GptOssModel(nn.Module):
         use_ep = self.parallel_config.enable_expert_parallel
         num_experts = self.config.num_local_experts
 
-        tp_rank = get_tensor_model_parallel_rank()
-        tp_size = get_tensor_model_parallel_world_size()
+        # In MoE, we need to flatten the tensor parallel size across the data
+        # parallel size when EP is disabled.
+        tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp(
+            tp_size=get_tensor_model_parallel_world_size(),
+            dp_size=get_dp_group().world_size,
+            dp_rank=get_dp_group().rank_in_group,
+        )
 
         intermediate_size = self.config.intermediate_size
         intermediate_size_block = intermediate_size // mxfp4_block
-        per_rank_intermediate_size_block = cdiv(intermediate_size_block,
-                                                tp_size)
-        per_rank_intermediate_size = (per_rank_intermediate_size_block *
-                                      mxfp4_block)
+        per_rank_intermediate_size_block = cdiv(intermediate_size_block, tp_size)
+        per_rank_intermediate_size = per_rank_intermediate_size_block * mxfp4_block
 
         # Calculate common slicing bounds for current rank
         tp_rank_start = tp_rank * per_rank_intermediate_size
-        tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size,
-                          intermediate_size)
+        tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size)
 
         for name, weight in weights:
             # Skip layers on other devices.
             if is_pp_missing_parameter(name, self):
                 continue
 
-            # FIXME(woosuk): Remove this after testing.
-            weight = weight.cuda()
-
             if ".w13_weight_scale" in name:
                 # Handle MLP gate and up projection weights scale
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[:,
-                                           2 * tp_rank_start:2 * tp_rank_end,
-                                           ...]
+                    narrow_weight = weight[:, 2 * tp_rank_start : 2 * tp_rank_end, ...]
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param,
-                              narrow_weight,
-                              weight_name=name,
-                              shard_id=None,
-                              expert_id=None)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=name,
+                    shard_id=None,
+                    expert_id=None,
+                )
                 loaded_params.add(name)
                 continue
             elif ".w2_weight_scale" in name:
@@ -357,66 +366,68 @@ class GptOssModel(nn.Module):
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[..., tp_rank_start //
-                                           mxfp4_block:tp_rank_end //
-                                           mxfp4_block]
+                    narrow_weight = weight[
+                        ..., tp_rank_start // mxfp4_block : tp_rank_end // mxfp4_block
+                    ]
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param,
-                              narrow_weight,
-                              weight_name=name,
-                              shard_id=None,
-                              expert_id=None)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=name,
+                    shard_id=None,
+                    expert_id=None,
+                )
                 loaded_params.add(name)
                 continue
             elif ".w13_weight" in name:
                 # Handle MLP gate and up projection weights
                 # flat weight from (E, 2 * N, block_size, entry_per_block)
                 # to (E, 2 * N, -1), shouldn't trigger copy for contiguous
-                weight = weight.view(num_experts, 2 * intermediate_size,
-                                     -1).contiguous()
+                weight = weight.view(
+                    num_experts, 2 * intermediate_size, -1
+                ).contiguous()
 
                 # Extract gate and up projection parts
                 # since the weight is shuffled, we can slice directly
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[:,
-                                           2 * tp_rank_start:2 * tp_rank_end,
-                                           ...]
+                    narrow_weight = weight[:, 2 * tp_rank_start : 2 * tp_rank_end, ...]
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param,
-                              narrow_weight,
-                              weight_name=name,
-                              shard_id=None,
-                              expert_id=None)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=name,
+                    shard_id=None,
+                    expert_id=None,
+                )
                 loaded_params.add(name)
                 continue
             elif ".w2_weight" in name:
                 # Handle MLP down projection weights
                 # same flatten here, but since 2 mx4 value are packed in 1
                 # uint8, divide by 2
-                weight = weight.view(num_experts, -1,
-                                     intermediate_size // 2).contiguous()
+                weight = weight.view(
+                    num_experts, -1, intermediate_size // 2
+                ).contiguous()
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[...,
-                                           tp_rank_start // 2:tp_rank_end // 2]
+                    narrow_weight = weight[..., tp_rank_start // 2 : tp_rank_end // 2]
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param,
-                              narrow_weight,
-                              weight_name=name,
-                              shard_id=None,
-                              expert_id=None)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=name,
+                    shard_id=None,
+                    expert_id=None,
+                )
                 loaded_params.add(name)
                 continue
             elif ".w13_bias" in name:
@@ -425,35 +436,32 @@ class GptOssModel(nn.Module):
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[:,
-                                           2 * tp_rank_start:2 * tp_rank_end]
+                    narrow_weight = weight[:, 2 * tp_rank_start : 2 * tp_rank_end]
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param,
-                              narrow_weight,
-                              weight_name=name,
-                              shard_id=None,
-                              expert_id=None)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=name,
+                    shard_id=None,
+                    expert_id=None,
+                )
                 loaded_params.add(name)
                 continue
             elif ".w2_bias" in name:
                 # Handle MLP down projection bias
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if use_ep:
                     weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
                     # (only load on rank 0 to avoid duplication)
                     if tp_rank != 0:
                         weight.zero_()
-                weight_loader(param,
-                              weight,
-                              weight_name=name,
-                              shard_id=None,
-                              expert_id=None)
+                weight_loader(
+                    param, weight, weight_name=name, shard_id=None, expert_id=None
+                )
                 loaded_params.add(name)
                 continue
             elif "sinks" in name:
@@ -468,8 +476,7 @@ class GptOssModel(nn.Module):
                     continue
                 name = name.replace(weight_name, param_name)
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if weight_loader == default_weight_loader:
                     weight_loader(param, weight)
                 else:
@@ -480,16 +487,15 @@ class GptOssModel(nn.Module):
                 if name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, weight)
             loaded_params.add(name)
         return loaded_params
 
     def _load_weights_other(
         self,
-        ep_rank_start: int,
         ep_rank_end: int,
+        ep_rank_start: int,
         heads_per_rank: int,
         head_start: int,
         weights: Iterable[tuple[str, torch.Tensor]],
@@ -500,15 +506,19 @@ class GptOssModel(nn.Module):
 
         use_ep = self.parallel_config.enable_expert_parallel
 
-        tp_rank = get_tensor_model_parallel_rank()
-        tp_size = get_tensor_model_parallel_world_size()
+        # In MoE, we need to flatten the tensor parallel size across the data
+        # parallel size when EP is disabled.
+        tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp(
+            tp_size=get_tensor_model_parallel_world_size(),
+            dp_size=get_dp_group().world_size,
+            dp_rank=get_dp_group().rank_in_group,
+        )
 
         intermediate_size = self.config.intermediate_size
         per_rank_intermediate_size = cdiv(intermediate_size, tp_size)
         # Calculate common slicing bounds for current rank
         tp_rank_start = tp_rank * per_rank_intermediate_size
-        tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size,
-                          intermediate_size)
+        tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size)
 
         for name, weight in weights:
             # Skip layers on other devices.
@@ -521,8 +531,7 @@ class GptOssModel(nn.Module):
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[:, :,
-                                           2 * tp_rank_start:2 * tp_rank_end]
+                    narrow_weight = weight[:, :, 2 * tp_rank_start : 2 * tp_rank_end]
 
                 narrow_weight = narrow_weight.permute(0, 2, 1).contiguous()
                 param = params_dict[name]
@@ -548,8 +557,7 @@ class GptOssModel(nn.Module):
                 if use_ep:
                     narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
                 else:
-                    narrow_weight = weight[:,
-                                           2 * tp_rank_start:2 * tp_rank_end]
+                    narrow_weight = weight[:, 2 * tp_rank_start : 2 * tp_rank_end]
 
                 param = params_dict[name]
                 param.copy_(narrow_weight)
@@ -579,8 +587,7 @@ class GptOssModel(nn.Module):
                     continue
                 name = name.replace(weight_name, param_name)
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if weight_loader == default_weight_loader:
                     weight_loader(param, weight)
                 else:
@@ -591,19 +598,17 @@ class GptOssModel(nn.Module):
                 if name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, weight)
             loaded_params.add(name)
         return loaded_params
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            (".qkv", ".q_proj", "q"),
-            (".qkv", ".k_proj", "k"),
-            (".qkv", ".v_proj", "v"),
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
         ]
 
         tp_rank = get_tensor_model_parallel_rank()
@@ -620,20 +625,33 @@ class GptOssModel(nn.Module):
         ep_rank_start = ep_rank * experts_per_rank
         ep_rank_end = (ep_rank + 1) * experts_per_rank
 
-        quant_method = (self.config.quantization_config['quant_method'] if
-                        hasattr(self.config, "quantization_config") else None)
+        quant_method = (
+            self.config.quantization_config["quant_method"]
+            if hasattr(self.config, "quantization_config")
+            else None
+        )
         if quant_method == "mxfp4":
-            return self._load_weights_mxfp4(ep_rank_end, ep_rank_start,
-                                            heads_per_rank, head_start,
-                                            weights, stacked_params_mapping)
+            return self._load_weights_mxfp4(
+                ep_rank_end,
+                ep_rank_start,
+                heads_per_rank,
+                head_start,
+                weights,
+                stacked_params_mapping,
+            )
         else:
-            return self._load_weights_other(ep_rank_end, ep_rank_start,
-                                            heads_per_rank, head_start,
-                                            weights, stacked_params_mapping)
+            return self._load_weights_other(
+                ep_rank_start,
+                ep_rank_end,
+                heads_per_rank,
+                head_start,
+                weights,
+                stacked_params_mapping,
+            )
 
 
-class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3):
-    packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]}
+class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
@@ -641,17 +659,14 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3):
         },
         orig_to_new_suffix={
             ".embed_tokens.weight": ".embedding.weight",
-
             # MoE MXFP4 weights
             ".gate_up_proj_blocks": ".w13_weight",
             ".down_proj_blocks": ".w2_weight",
             ".gate_up_proj_scales": ".w13_weight_scale",
             ".down_proj_scales": ".w2_weight_scale",
-
             # MoE other weights
             ".gate_up_proj": ".w13_weight",
             ".down_proj": ".w2_weight",
-
             # MoE Bias
             ".gate_up_proj_bias": ".w13_bias",
             ".down_proj_bias": ".w2_bias",
@@ -678,7 +693,8 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3):
         )
         self.logits_processor = LogitsProcessor(self.config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         self.model.aux_hidden_state_layers = layers
@@ -687,26 +703,36 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3):
         num_layers = len(self.model.layers)
         return (2, num_layers // 2, num_layers - 3)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return self.model(input_ids, positions, intermediate_tensors,
-                          inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, weight scales, activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_local_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 2c619396e6c0cd0c0964b9e2126e75dcd32b43d0..c44b4021471ef7e21ef4c8dd73c71cabe6de65ae 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -23,9 +23,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only IBM Granite model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -37,32 +38,42 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_layers, maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GraniteMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -72,15 +83,19 @@ class GraniteMLP(nn.Module):
             output_sizes=[intermediate_size] * 2,
             bias=bias,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(input_size=intermediate_size,
-                                           output_size=hidden_size,
-                                           bias=bias,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -91,7 +106,6 @@ class GraniteMLP(nn.Module):
 
 
 class GraniteAttention(nn.Module):
-
     def __init__(
         self,
         config: GraniteConfig,
@@ -99,11 +113,11 @@ class GraniteAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -156,13 +170,15 @@ class GraniteAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -178,12 +194,11 @@ class GraniteAttention(nn.Module):
 
 
 class GraniteDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: GraniteConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -192,21 +207,24 @@ class GraniteDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         self.self_attn = GraniteAttention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -224,10 +242,10 @@ class GraniteDecoderLayer(nn.Module):
             bias=getattr(config, "mlp_bias", False),
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
@@ -252,7 +270,6 @@ class GraniteDecoderLayer(nn.Module):
 
 @support_torch_compile
 class GraniteModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -263,12 +280,16 @@ class GraniteModel(nn.Module):
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
         self.vocab_size = config.vocab_size + lora_vocab
         self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
@@ -276,38 +297,42 @@ class GraniteModel(nn.Module):
                 padding_size=DEFAULT_VOCAB_PADDING_SIZE
                 # We need bigger padding if using lora for kernel
                 # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
+                if not lora_config
+                else lora_config.lora_vocab_padding_size,
                 quant_config=quant_config,
             )
         else:
             self.embed_tokens = PPMissingLayer()
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: GraniteDecoderLayer(config=config,
-                                               cache_config=cache_config,
-                                               quant_config=quant_config,
-                                               prefix=prefix),
-            prefix=f"{prefix}.layers")
+            lambda prefix: GraniteDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
 
             hidden_states *= self.config.embedding_multiplier
         else:
@@ -318,15 +343,16 @@ class GraniteModel(nn.Module):
             hidden_states = layer(positions, hidden_states)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                }
+            )
 
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -338,18 +364,19 @@ class GraniteModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -378,8 +405,7 @@ class GraniteModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -415,8 +441,9 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self.lora_config = lora_config
         self.quant_config = quant_config
 
-        self.model = GraniteModel(vllm_config=vllm_config,
-                                  prefix=maybe_prefix(prefix, "model"))
+        self.model = GraniteModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if get_pp_group().is_last_rank:
             self.unpadded_vocab_size = config.vocab_size
             if lora_config:
@@ -428,7 +455,8 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 padding_size=DEFAULT_VOCAB_PADDING_SIZE
                 # We need bigger padding if using lora for kernel
                 # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
+                if not lora_config
+                else lora_config.lora_vocab_padding_size,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -439,48 +467,47 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             if hasattr(config, "logits_scaling"):
                 logit_scale /= config.logits_scaling
 
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    scale=logit_scale)
+            self.logits_processor = LogitsProcessor(
+                self.unpadded_vocab_size, config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
-    def compute_logits(self,
-                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         # With tie_word_embeddings, we can skip lm_head.weight
         # The weight might appear unnecessarily in the files if the model is
         # processed with quantization, LoRA, fine-tuning, etc.
-        skip_prefixes = (["lm_head."]
-                         if self.config.tie_word_embeddings else None)
+        skip_prefixes = ["lm_head."] if self.config.tie_word_embeddings else None
 
         loader = AutoWeightsLoader(
             self,
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index a5849184339b13b19db937bed044ac5399614f6b..1797adab8d146418061db737f706996662c1e836 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -23,44 +23,74 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only IBM Granite speech model."""
+
 import math
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Optional, Union
+from typing import Annotated, Literal, cast
 
+import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 
-from vllm.config import CacheConfig, VllmConfig
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.inputs.data import PromptType
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import cached_get_processor
+from vllm.transformers_utils.tokenizer import cached_get_tokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .blip2 import Blip2QFormerModel
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, embed_multimodal,
-                    init_vllm_registered_model, maybe_prefix)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsTranscription,
+)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
+
+# NOTE lang support is based on what is written here:
+# https://huggingface.co/ibm-granite/granite-speech-3.3-2b
+# Though this may vary from model to model, and also many langs
+# work pretty well with zero shot.
+ISO639_1_SUPPORTED_LANGS = {
+    "en": "English",
+    "fr": "French",
+    "de": "German",
+    "pt": "Portuguese",
+    "es": "Spanish",
+}
 
 
 ### Audio Input
 class GraniteSpeechAudioInputs(TensorSchema):
     """
     Audio input features for Granite Speech model.
-    
+
     Dimensions:
         - b: Batch size
         - fi: Number of input features from the Mel spectrogram.
@@ -79,8 +109,7 @@ class GraniteSpeechAudioInputs(TensorSchema):
 
 
 class GraniteSpeechMultiModalProcessingInfo(BaseProcessingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 1}
 
     # There is no limit to the maximum number of audio tokens that can be
@@ -96,8 +125,8 @@ class GraniteSpeechMultiModalProcessingInfo(BaseProcessingInfo):
 
 ### Input Processing  & Multimodal utils
 class GraniteSpeechMultiModalProcessor(
-        BaseMultiModalProcessor[GraniteSpeechMultiModalProcessingInfo]):
-
+    BaseMultiModalProcessor[GraniteSpeechMultiModalProcessingInfo]
+):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_hf_processor().audio_processor
         sampling_rate = feature_extractor.melspec_kwargs["sample_rate"]
@@ -133,7 +162,8 @@ class GraniteSpeechMultiModalProcessor(
             audio = audios.get(item_idx)
             audio_length = audio.shape[-1]
             num_projector_features = feature_extractor._get_num_audio_features(
-                [audio_length])[0]
+                [audio_length]
+            )[0]
             return [audio_token_id] * num_projector_features
 
         return [
@@ -169,28 +199,30 @@ class GraniteSpeechMultiModalProcessor(
             # Calculate the number of audio tokens per entry in the batch;
             # This is used to split the batch back out after padding.
             audio_token_index = self.info.get_hf_config().audio_token_index
-            processed_outputs["audio_embed_sizes"] = [
-                torch.sum(indices == audio_token_index).item()
-                for indices in processed_outputs["input_ids"]
-            ]
+            processed_outputs["audio_embed_sizes"] = (
+                processed_outputs["input_ids"] == audio_token_index
+            ).sum(-1)
 
         return processed_outputs
 
 
 class GraniteSpeechDummyInputsBuilder(
-        BaseDummyInputsBuilder[GraniteSpeechMultiModalProcessingInfo]):
-
+    BaseDummyInputsBuilder[GraniteSpeechMultiModalProcessingInfo]
+):
     def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio":
-            self._get_dummy_audios(
+            "audio": self._get_dummy_audios(
                 length=self.info.get_max_audio_len(),
                 num_audios=num_audios,
+                overrides=audio_overrides,
             )
         }
 
@@ -203,12 +235,11 @@ class GraniteSpeechDummyInputsBuilder(
 
 ### QFormer Projector
 class GraniteSpeechEncoderProjector(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
         cache_config: CacheConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -218,8 +249,8 @@ class GraniteSpeechEncoderProjector(nn.Module):
         self.num_queries = config.window_size // config.downsample_rate
 
         self.query = nn.Parameter(
-            torch.zeros(1, self.num_queries,
-                        config.projector_config.hidden_size))
+            torch.zeros(1, self.num_queries, config.projector_config.hidden_size)
+        )
 
         # NOTE - this is implemented generically in transformers,
         # but for now we create the QFormer model directly since
@@ -230,17 +261,16 @@ class GraniteSpeechEncoderProjector(nn.Module):
             cache_config=cache_config,
             prefix=f"{prefix}.qformer",
         )
-        self.linear = nn.Linear(config.projector_config.hidden_size,
-                                config.text_config.hidden_size)
+        self.linear = nn.Linear(
+            config.projector_config.hidden_size, config.text_config.hidden_size
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, seq_len, dim = hidden_states.size()
         nblocks = math.ceil(seq_len / self.window_size)
         pad = nblocks * self.window_size - seq_len
-        hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad),
-                                          "constant", 0)
-        hidden_states = hidden_states.view(batch_size * nblocks,
-                                           self.window_size, dim)
+        hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad), "constant", 0)
+        hidden_states = hidden_states.view(batch_size * nblocks, self.window_size, dim)
 
         last_hidden_state = self.qformer(
             query_embeds=self.query.data,
@@ -252,7 +282,8 @@ class GraniteSpeechEncoderProjector(nn.Module):
                 batch_size,
                 nblocks * self.window_size // self.downsample_rate,
                 -1,
-            ))
+            )
+        )
         return query_proj
 
 
@@ -262,10 +293,12 @@ class GraniteSpeechEncoderProjector(nn.Module):
 class GraniteSpeechConformerFeedForward(nn.Module):
     """Feedforward module for conformer encoder blocks."""
 
-    def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.pre_norm = nn.LayerNorm(config.hidden_dim)
 
@@ -311,16 +344,16 @@ class GraniteSpeechConformerAttention(nn.Module):
         self.to_q = nn.Linear(config.hidden_dim, inner_dim, bias=False)
         self.to_kv = nn.Linear(config.hidden_dim, inner_dim * 2, bias=False)
         self.to_out = nn.Linear(inner_dim, config.hidden_dim)
-        self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1,
-                                        self.dim_head)
+        self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head)
 
         if self.context_size <= 0 or self.context_size > self.max_pos_emb:
             raise ValueError(
                 "Context size is either less than 0 or exceeds the max_pos_emb"
             )
 
-    def forward(self, hidden_states: torch.Tensor,
-                attention_dists: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, attention_dists: torch.Tensor
+    ) -> torch.Tensor:
         hidden_states = self.pre_norm(hidden_states)
         bsz, num_features, _ = hidden_states.shape
 
@@ -329,47 +362,53 @@ class GraniteSpeechConformerAttention(nn.Module):
         if remainder > 0:
             # right padding to reach block size
             hidden_states = torch.nn.functional.pad(
-                hidden_states, (0, 0, 0, self.context_size - remainder))
+                hidden_states, (0, 0, 0, self.context_size - remainder)
+            )
 
         # NOTE: would be nice to try to use qkvparallellinear
         # here for this block attention implementation if possible
         query_states = self.to_q(hidden_states)
         key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1)
 
-        query_states = query_states.reshape(bsz, num_blocks, self.context_size,
-                                            self.num_heads,
-                                            -1).transpose(2, 3)
-        key_states = key_states.reshape(bsz, num_blocks, self.context_size,
-                                        self.num_heads, -1).transpose(2, 3)
-        value_states = value_states.reshape(bsz, num_blocks, self.context_size,
-                                            self.num_heads,
-                                            -1).transpose(2, 3)
+        query_states = query_states.reshape(
+            bsz, num_blocks, self.context_size, self.num_heads, -1
+        ).transpose(2, 3)
+        key_states = key_states.reshape(
+            bsz, num_blocks, self.context_size, self.num_heads, -1
+        ).transpose(2, 3)
+        value_states = value_states.reshape(
+            bsz, num_blocks, self.context_size, self.num_heads, -1
+        ).transpose(2, 3)
 
         # shaw's relative positional embedding
         dist = attention_dists.to(hidden_states.device)
         rel_pos_emb = self.rel_pos_emb(dist)
-        rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] +
-                                                list(rel_pos_emb.shape))
-        pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded,
-                             dim=-1) * self.scale
+        rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape))
+        pos_attn = (
+            torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1)
+            * self.scale
+        )
 
         if remainder > 0:
             # masked attention in the extended block
-            mask = torch.ones(self.context_size,
-                              self.context_size,
-                              dtype=bool,
-                              device=hidden_states.device)
+            mask = torch.ones(
+                self.context_size,
+                self.context_size,
+                dtype=bool,
+                device=hidden_states.device,
+            )
             mask[:remainder, :remainder] = 0
             mask_value = -torch.finfo(pos_attn.dtype).max
             pos_attn[:, -1, :].masked_fill_(mask, mask_value)
 
-        with torch.nn.attention.sdpa_kernel(
-                torch.nn.attention.SDPBackend.MATH):
-            out = F.scaled_dot_product_attention(query_states,
-                                                 key_states,
-                                                 value_states,
-                                                 attn_mask=pos_attn,
-                                                 scale=self.scale)
+        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
+            out = F.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=pos_attn,
+                scale=self.scale,
+            )
         out = out.transpose(2, 3).reshape(bsz, hidden_states.shape[1], -1)
         return self.to_out(out[:, :num_features, :])
 
@@ -377,22 +416,16 @@ class GraniteSpeechConformerAttention(nn.Module):
 class GraniteSpeechConformerDepthWiseConv1d(nn.Module):
     """Wrapper for padded 1D pointwise convolution."""
 
-    def __init__(self,
-                 chan_in: int,
-                 chan_out: int,
-                 kernel_size: int,
-                 prefix: str = ""):
+    def __init__(self, chan_in: int, chan_out: int, kernel_size: int, prefix: str = ""):
         super().__init__()
         # Padding for the 1D conv is symmetric or close (i.e., offset by one).
         pad = kernel_size // 2
         pad_offset = (kernel_size + 1) % 2
         self.padding = (pad, pad - pad_offset)
 
-        self.conv = nn.Conv1d(chan_in,
-                              chan_out,
-                              kernel_size,
-                              groups=chan_in,
-                              bias=False)
+        self.conv = nn.Conv1d(
+            chan_in, chan_out, kernel_size, groups=chan_in, bias=False
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = F.pad(hidden_states, self.padding)
@@ -437,21 +470,19 @@ class GraniteSpeechConformerBlock(nn.Module):
 
     def __init__(self, config: PretrainedConfig, prefix: str = ""):
         super().__init__()
-        self.ff1 = GraniteSpeechConformerFeedForward(config,
-                                                     prefix=f"{prefix}.ff1")
-        self.attn = GraniteSpeechConformerAttention(config,
-                                                    prefix=f"{prefix}.attn")
-        self.conv = GraniteSpeechConformerConvModule(config,
-                                                     prefix=f"{prefix}.conv")
-        self.ff2 = GraniteSpeechConformerFeedForward(config,
-                                                     prefix=f"{prefix}.ff2")
+        self.ff1 = GraniteSpeechConformerFeedForward(config, prefix=f"{prefix}.ff1")
+        self.attn = GraniteSpeechConformerAttention(config, prefix=f"{prefix}.attn")
+        self.conv = GraniteSpeechConformerConvModule(config, prefix=f"{prefix}.conv")
+        self.ff2 = GraniteSpeechConformerFeedForward(config, prefix=f"{prefix}.ff2")
         self.post_norm = nn.LayerNorm(config.hidden_dim)
 
-    def forward(self, hidden_states: torch.Tensor,
-                attention_dists: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, attention_dists: torch.Tensor
+    ) -> torch.Tensor:
         hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states
-        hidden_states = self.attn(
-            hidden_states, attention_dists=attention_dists) + hidden_states
+        hidden_states = (
+            self.attn(hidden_states, attention_dists=attention_dists) + hidden_states
+        )
         hidden_states = self.conv(hidden_states) + hidden_states
         hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states
         hidden_states = self.post_norm(hidden_states)
@@ -461,29 +492,33 @@ class GraniteSpeechConformerBlock(nn.Module):
 class GraniteSpeechCTCEncoder(nn.Module):
     """CTC Encoder comprising conformer blocks and additional linear layers."""
 
-    def __init__(self,
-                 config: PretrainedConfig,
-                 prefix: str,
-                 quant_config: Optional[QuantizationConfig] = None):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        quant_config: QuantizationConfig | None = None,
+    ):
         super().__init__()
         self.config = config
 
         # Precompute clamped relative positional encoding distances
         seq = torch.arange(config.context_size)
         relpos_dist = seq.view(-1, 1) - seq.view(1, -1)
-        self.attention_dists = torch.clamp(
-            relpos_dist, -config.context_size,
-            config.context_size) + config.max_pos_emb
-
-        self.input_linear = nn.Linear(config.input_dim,
-                                      config.hidden_dim,
-                                      bias=True)
-        self.layers = nn.ModuleList([
-            GraniteSpeechConformerBlock(
-                config,
-                prefix=f"{prefix}.layers.{idx}",
-            ) for idx in range(config.num_layers)
-        ])
+        self.attention_dists = (
+            torch.clamp(relpos_dist, -config.context_size, config.context_size)
+            + config.max_pos_emb
+        )
+
+        self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True)
+        self.layers = nn.ModuleList(
+            [
+                GraniteSpeechConformerBlock(
+                    config,
+                    prefix=f"{prefix}.layers.{idx}",
+                )
+                for idx in range(config.num_layers)
+            ]
+        )
 
         self.out = ColumnParallelLinear(
             input_size=config.hidden_dim,
@@ -506,8 +541,7 @@ class GraniteSpeechCTCEncoder(nn.Module):
     def forward(self, hidden_states: torch.Tensor):
         hidden_states = self.input_linear(hidden_states)
         for idx, layer in enumerate(self.layers, start=1):
-            hidden_states = layer(hidden_states,
-                                  attention_dists=self.attention_dists)
+            hidden_states = layer(hidden_states, attention_dists=self.attention_dists)
 
             if idx == self.num_layers // 2:
                 hidden_states_mid = hidden_states.clone()
@@ -521,13 +555,17 @@ class GraniteSpeechCTCEncoder(nn.Module):
 @MULTIMODAL_REGISTRY.register_processor(
     GraniteSpeechMultiModalProcessor,
     info=GraniteSpeechMultiModalProcessingInfo,
-    dummy_inputs=GraniteSpeechDummyInputsBuilder)
+    dummy_inputs=GraniteSpeechDummyInputsBuilder,
+)
 class GraniteSpeechForConditionalGeneration(
-        nn.Module,
-        SupportsMultiModal,
-        SupportsPP,
-        SupportsLoRA,
+    nn.Module,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsLoRA,
+    SupportsTranscription,
 ):
+    merge_by_field_config = True
+    supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
         "qkv_proj": [
@@ -542,7 +580,7 @@ class GraniteSpeechForConditionalGeneration(
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return "<|audio|>"
 
@@ -581,12 +619,13 @@ class GraniteSpeechForConditionalGeneration(
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_audio_input(
         self,
         **kwargs: object,
-    ) -> Optional[GraniteSpeechAudioInputs]:
+    ) -> GraniteSpeechAudioInputs | None:
         input_features = kwargs.pop("input_features", None)
         input_features_mask = kwargs.pop("input_features_mask", None)
         audio_embed_sizes = kwargs.pop("audio_embed_sizes", None)
@@ -599,17 +638,21 @@ class GraniteSpeechForConditionalGeneration(
         # from the processor, but we handle rebuilding it here since
         # vLLM generally processes everything independently + batches.
         if input_features_mask is None:
-            input_features_mask = self._build_input_features_mask(
-                audio_embed_sizes)
+            input_features_mask = self._build_input_features_mask(audio_embed_sizes)
 
         if not isinstance(input_features, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of audio input features. "
-                             f"Got type: {type(input_features)}")
+            raise ValueError(
+                "Incorrect type of audio input features. "
+                f"Got type: {type(input_features)}"
+            )
 
         if input_features_mask is not None and not isinstance(
-                input_features_mask, torch.Tensor):
-            raise ValueError("Incorrect type of audio input features mask. "
-                             f"Got type: {type(input_features_mask)}")
+            input_features_mask, torch.Tensor
+        ):
+            raise ValueError(
+                "Incorrect type of audio input features mask. "
+                f"Got type: {type(input_features_mask)}"
+            )
 
         if isinstance(input_features, torch.Tensor):
             # Granite speech currently only allows one audio token per instance
@@ -622,16 +665,17 @@ class GraniteSpeechForConditionalGeneration(
             if len(input_features.shape) != 3:
                 raise ValueError(
                     "Squeezed input features should be 3D but are of shape "
-                    f"{input_features.shape}")
-            input_features = input_features.to(
-                self.encoder.input_linear.weight.dtype)
+                    f"{input_features.shape}"
+                )
+            input_features = input_features.to(self.encoder.input_linear.weight.dtype)
 
         else:
             # Otherwise we have a list of tensors, which are almost certainly
             # differing in their respective numbers of audio features;
             # stack them into a 3D tensor of size [bsz, most_num_features, 160].
             input_features = self._pad_and_stack_input_features(
-                input_features, ).to(self.encoder.input_linear.weight.dtype)
+                input_features,
+            ).to(self.encoder.input_linear.weight.dtype)
 
         return GraniteSpeechAudioInputs(
             input_features=input_features,
@@ -703,7 +747,7 @@ class GraniteSpeechForConditionalGeneration(
         audio_input: GraniteSpeechAudioInputs,
     ) -> tuple[torch.Tensor]:
         """Compute the audio features to be merged into the LLM embeddings.
-        
+
         Args:
             audio_input: GraniteSpeechAudioInputs
                 Audio inputs object containing Mel features, an input features
@@ -720,7 +764,10 @@ class GraniteSpeechForConditionalGeneration(
         # Split variable length features into a tuple
         return torch.split(masked_embeds, audio_input["audio_embed_sizes"])
 
-    def get_multimodal_embeddings(
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(
         self,
         **kwargs: object,
     ) -> MultiModalEmbeddings:
@@ -728,54 +775,50 @@ class GraniteSpeechForConditionalGeneration(
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return []
-            return None
+
         audio_features = self._process_audio_input(audio_input)
         return audio_features
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        # Multi-modal token ID may exceed vocab size
+        handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
-        """Compute the merged LLM / audio embeddings."""
-        if multimodal_embeddings is None \
-            or len(multimodal_embeddings) == 0:
-            return self.language_model.get_input_embeddings(input_ids)
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
 
-        inputs_embeds = embed_multimodal(
+        return super().embed_input_ids(
             input_ids,
-            self.config.audio_token_index,
-            self.language_model.model.get_input_embeddings,
-            multimodal_embeddings,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
-        return inputs_embeds
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            audio_embeds = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids, audio_embeds)
-            input_ids = None
-
-        model_output = self.language_model(input_ids, positions,
-                                           intermediate_tensors, inputs_embeds)
+        model_output = self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(
@@ -792,3 +835,79 @@ class GraniteSpeechForConditionalGeneration(
             connector="projector",
             tower_model="encoder",
         )
+
+    ### Support for speech-to-text Transcription
+    @classmethod
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        model_config: ModelConfig,
+        stt_config: SpeechToTextConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
+        """Get the generation prompt to be used for transcription requests."""
+        # Audio placeholders don't use an index, so value doesn't matter
+        audio_tok = cls.get_placeholder_str("audio", 0)
+
+        if task_type == "translate":
+            full_lang_name_to = cls.supported_languages.get(to_language, to_language)
+            user_prompt = f"{audio_tok}translate the speech to {full_lang_name_to}"  # noqa: E501
+        elif task_type == "transcribe":
+            user_prompt = (
+                f"{audio_tok}can you transcribe the speech into a written format?"  # noqa: E501
+            )
+        else:
+            raise ValueError(f"Unsupported task type {task_type}")
+
+        tokenizer = cached_get_tokenizer(model_config.model)
+        chat = [dict(role="user", content=user_prompt)]
+        prompt = tokenizer.apply_chat_template(
+            chat,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        prompt_token_ids = tokenizer.encode(prompt)
+        prompt = {
+            "prompt_token_ids": prompt_token_ids,
+            "multi_modal_data": {"audio": audio},
+        }
+        return cast(PromptType, prompt)
+
+    # Adapted from https://github.com/huggingface/transformers/blob/v4.56.0/src/transformers/models/granite_speech/feature_extraction_granite_speech.py#L122 # noqa: E501
+    @classmethod
+    def get_num_audio_tokens(
+        cls,
+        audio_duration_s: float,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+    ) -> int | None:
+        """Get the number of audio tokens for an audio duration in sec."""
+        processor = cached_get_processor(model_config.model)
+        hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
+        proj_win_size = processor.audio_processor.projector_window_size
+        ds_rate = processor.audio_processor.projector_downsample_rate
+        effective_window_size = proj_win_size // ds_rate
+
+        raw_length = audio_duration_s * stt_config.sample_rate
+
+        # mel sequence length computation
+        mel_length = raw_length // hop_length + 1
+        # encoder frame takes two mel features
+        encoder_length = mel_length // 2
+        nblocks = math.ceil(encoder_length / proj_win_size)
+        # projector output length
+        return nblocks * effective_window_size
+
+    @classmethod
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
+        """Get the stt config for this model."""
+        # Default settings are reasonable for this model and we don't currently
+        # expose this information in the model configs, but this may change in
+        # the future
+        return SpeechToTextConfig()
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 76a5745a4f51b6bf558b7de31cf6068632b49146..5c6759ded06691f478a6ef87226eeb3c32d9d7f4 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -23,9 +23,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GraniteMoe model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -33,28 +34,34 @@ from torch import nn
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_layers,
-                    maybe_prefix)
+from .utils import AutoWeightsLoader, is_pp_missing_parameter, make_layers, maybe_prefix
 
 
 class GraniteMoeMoE(nn.Module):
@@ -65,39 +72,45 @@ class GraniteMoeMoE(nn.Module):
     across ranks.
     """
 
-    def __init__(self,
-                 num_experts: int,
-                 top_k: int,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 tp_size: Optional[int] = None,
-                 is_sequence_parallel=False,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        is_sequence_parallel=False,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = hidden_size
         self.is_sequence_parallel = is_sequence_parallel
 
         # Gate always runs at half / full precision for now.
-        self.gate = ReplicatedLinear(hidden_size,
-                                     num_experts,
-                                     bias=False,
-                                     params_dtype=params_dtype,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
-
-        self.experts = FusedMoE(num_experts=num_experts,
-                                top_k=top_k,
-                                hidden_size=hidden_size,
-                                intermediate_size=intermediate_size,
-                                params_dtype=params_dtype,
-                                reduce_results=True,
-                                renormalize=True,
-                                quant_config=quant_config,
-                                tp_size=tp_size,
-                                prefix=f"{prefix}.experts",
-                                is_sequence_parallel=self.is_sequence_parallel)
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            reduce_results=True,
+            renormalize=True,
+            quant_config=quant_config,
+            tp_size=tp_size,
+            prefix=f"{prefix}.experts",
+            is_sequence_parallel=self.is_sequence_parallel,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -113,7 +126,8 @@ class GraniteMoeMoE(nn.Module):
 
         if self.is_sequence_parallel:
             final_hidden_states = tensor_model_parallel_all_gather(
-                final_hidden_states, 0)
+                final_hidden_states, 0
+            )
             num_tokens = orig_shape[0]
             final_hidden_states = final_hidden_states[:num_tokens]
 
@@ -121,7 +135,6 @@ class GraniteMoeMoE(nn.Module):
 
 
 class GraniteMoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -129,10 +142,10 @@ class GraniteMoeAttention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        attention_multiplier: Optional[float] = None,
+        rope_scaling: dict[str, Any] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attention_multiplier: float | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -154,8 +167,11 @@ class GraniteMoeAttention(nn.Module):
         self.head_dim = hidden_size // self.total_num_heads
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
-        self.scaling = (attention_multiplier if attention_multiplier
-                        is not None else self.head_dim**-1)
+        self.scaling = (
+            attention_multiplier
+            if attention_multiplier is not None
+            else self.head_dim**-1
+        )
         self.rope_theta = rope_theta
 
         self.qkv_proj = QKVParallelLinear(
@@ -182,13 +198,15 @@ class GraniteMoeAttention(nn.Module):
             is_neox_style=True,
             rope_scaling=rope_scaling,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -204,7 +222,6 @@ class GraniteMoeAttention(nn.Module):
 
 
 class GraniteMoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -231,7 +248,8 @@ class GraniteMoeDecoderLayer(nn.Module):
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
-            attention_multiplier=config.attention_multiplier)
+            attention_multiplier=config.attention_multiplier,
+        )
         self.block_sparse_moe = GraniteMoeMoE(
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
@@ -239,12 +257,13 @@ class GraniteMoeDecoderLayer(nn.Module):
             intermediate_size=config.intermediate_size,
             quant_config=quant_config,
             is_sequence_parallel=parallel_config.use_sequence_parallel_moe,
-            prefix=f"{prefix}.block_sparse_moe")
+            prefix=f"{prefix}.block_sparse_moe",
+        )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
         self.residual_multiplier = config.residual_multiplier
 
@@ -271,50 +290,46 @@ class GraniteMoeDecoderLayer(nn.Module):
 
 @support_torch_compile
 class GraniteMoeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config  # Required by MixtralModel
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
         self.embedding_multiplier = config.embedding_multiplier
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: GraniteMoeDecoderLayer(vllm_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             hidden_states *= self.embedding_multiplier
         else:
             assert intermediate_tensors is not None
@@ -322,17 +337,18 @@ class GraniteMoeModel(nn.Module):
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(positions, hidden_states)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                }
+            )
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def _load_weights(self,
-                      weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+    def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         """
-        This function is copied from `MixtralModel.load_weights`, mainly to 
-        decouple from mixtral, avoiding impact on support like BNB  
+        This function is copied from `MixtralModel.load_weights`, mainly to
+        decouple from mixtral, avoiding impact on support like BNB
         quantization.
         """
         stacked_params_mapping = [
@@ -348,30 +364,33 @@ class GraniteMoeModel(nn.Module):
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
-            num_experts=self.config.num_local_experts)
+            num_experts=self.config.num_local_experts,
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -394,21 +413,25 @@ class GraniteMoeModel(nn.Module):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -419,40 +442,45 @@ class GraniteMoeModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         new_weights = {}
         for n, p in weights:
-            if n.endswith('.block_sparse_moe.input_linear.weight'):
+            if n.endswith(".block_sparse_moe.input_linear.weight"):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
-                        '.block_sparse_moe.input_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w1.weight")
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w1.weight",
+                    )
                     w3_name = n.replace(
-                        '.block_sparse_moe.input_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w3.weight")
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w3.weight",
+                    )
                     w1_param, w3_param = p[e].chunk(2, dim=0)
                     assert w1_name not in new_weights
                     assert w3_name not in new_weights
                     new_weights[w1_name] = w1_param
                     new_weights[w3_name] = w3_param
-            elif n.endswith('.block_sparse_moe.output_linear.weight'):
+            elif n.endswith(".block_sparse_moe.output_linear.weight"):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
-                        '.block_sparse_moe.output_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w2.weight")
+                        ".block_sparse_moe.output_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w2.weight",
+                    )
                     w2_param = p[e]
                     assert w2_name not in new_weights
                     new_weights[w2_name] = w2_param
-            elif n.endswith('.block_sparse_moe.router.layer.weight'):
-                gate_name = n.replace('.block_sparse_moe.router.layer.weight',
-                                      ".block_sparse_moe.gate.weight")
+            elif n.endswith(".block_sparse_moe.router.layer.weight"):
+                gate_name = n.replace(
+                    ".block_sparse_moe.router.layer.weight",
+                    ".block_sparse_moe.gate.weight",
+                )
                 assert gate_name not in new_weights
                 new_weights[gate_name] = p
             else:
@@ -482,69 +510,60 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
-        self.model = GraniteMoeModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = GraniteMoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         if config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size,
-                                                scale=1 /
-                                                self.config.logits_scaling)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size,
+            scale=1 / self.config.logits_scaling,
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
-    def compute_logits(self,
-                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index f5751fe47bb8be1edc8a0cc9e7d8d7e7f543f1dd..05177f1d1ac2cdcde43150773b2982ab46140858 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only GraniteMoeHybrid model."""
+
 # Added by the IBM Team, 2025
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -15,58 +15,73 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .granitemoe import GraniteMoeMoE
 from .granitemoeshared import GraniteMoeSharedMLP
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class GraniteMoeHybridMambaDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: GraniteMoeHybridConfig,
-                 layer_idx: int,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: GraniteMoeHybridConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
         self.residual_multiplier = config.residual_multiplier
 
-        self.mamba = MambaMixer2(hidden_size= config.hidden_size,
-                                ssm_state_size = config.mamba_d_state,
-                                conv_kernel_size = config.mamba_d_conv,
-                                intermediate_size = config.mamba_expand *\
-                                                    config.hidden_size,
-                                use_conv_bias = config.mamba_conv_bias,
-                                use_bias = config.mamba_proj_bias,
-                                n_groups=config.mamba_n_groups,
-                                num_heads=config.mamba_n_heads,
-                                head_dim=config.mamba_d_head,
-                                rms_norm_eps=config.rms_norm_eps,
-                                activation=config.hidden_act,
-                                model_config=model_config,
-                                cache_config=cache_config,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.mixer")
+        self.mamba = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.mamba_d_state,
+            conv_kernel_size=config.mamba_d_conv,
+            intermediate_size=config.mamba_expand * config.hidden_size,
+            use_conv_bias=config.mamba_conv_bias,
+            use_bias=config.mamba_proj_bias,
+            n_groups=config.mamba_n_groups,
+            num_heads=config.mamba_n_heads,
+            head_dim=config.mamba_d_head,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.hidden_act,
+            model_config=model_config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
 
         self.block_sparse_moe = None
         if getattr(config, "num_local_experts", 0) > 0:
@@ -76,25 +91,26 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.block_sparse_moe")
+                prefix=f"{prefix}.block_sparse_moe",
+            )
 
-        self.shared_mlp = None if \
-            getattr(config, 'shared_intermediate_size', 0) == 0 \
+        self.shared_mlp = (
+            None
+            if getattr(config, "shared_intermediate_size", 0) == 0
             else GraniteMoeSharedMLP(
-                config,
-                quant_config=quant_config,
-                prefix=f"{prefix}.shared_mlp"
+                config, quant_config=quant_config, prefix=f"{prefix}.shared_mlp"
             )
+        )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         residual = hidden_states
@@ -114,8 +130,7 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
             if self.block_sparse_moe is not None:
                 moe_hidden_states = hidden_states.clone()
                 moe_hidden_states = self.block_sparse_moe(moe_hidden_states)
-                hidden_states = moe_hidden_states + self.shared_mlp(
-                    hidden_states)
+                hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
                 del moe_hidden_states
             else:
                 hidden_states = self.shared_mlp(hidden_states)
@@ -125,14 +140,13 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
 
 
 class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: GraniteMoeHybridConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -143,7 +157,8 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
             config,
             cache_config=cache_config,
             quant_config=quant_config,
-            prefix=f"{prefix}.self_attn")
+            prefix=f"{prefix}.self_attn",
+        )
 
         self.block_sparse_moe = None
         if getattr(config, "num_local_experts", 0) > 0:
@@ -153,26 +168,27 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.block_sparse_moe")
+                prefix=f"{prefix}.block_sparse_moe",
+            )
 
-        self.shared_mlp = None if \
-            getattr(config, 'shared_intermediate_size', 0) == 0 \
+        self.shared_mlp = (
+            None
+            if getattr(config, "shared_intermediate_size", 0) == 0
             else GraniteMoeSharedMLP(
-                config,
-                quant_config=quant_config,
-                prefix=f"{prefix}.shared_mlp"
+                config, quant_config=quant_config, prefix=f"{prefix}.shared_mlp"
             )
+        )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -194,8 +210,7 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
             if self.block_sparse_moe is not None:
                 moe_hidden_states = hidden_states.clone()
                 moe_hidden_states = self.block_sparse_moe(moe_hidden_states)
-                hidden_states = moe_hidden_states + self.shared_mlp(
-                    hidden_states)
+                hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
                 del moe_hidden_states
             else:
                 hidden_states = self.shared_mlp(hidden_states)
@@ -205,13 +220,12 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
 
 
 class GraniteMoeHybridAttention(nn.Module):
-
     def __init__(
         self,
         config: GraniteMoeHybridConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -237,19 +251,23 @@ class GraniteMoeHybridAttention(nn.Module):
             assert tp_size % self.total_num_kv_heads == 0
         self.num_key_value_heads = max(1, self.total_num_kv_heads // tp_size)
 
-        self.qkv_proj = QKVParallelLinear(self.hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=self.attention_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=self.attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.o_proj = RowParallelLinear(self.hidden_size,
-                                        self.hidden_size,
-                                        bias=self.attention_bias,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=self.attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         if config.position_embedding_type == "rope":
             self.rotary_emb = get_rope(
@@ -257,34 +275,38 @@ class GraniteMoeHybridAttention(nn.Module):
                 rotary_dim=self.head_dim,
                 max_position=config.max_position_embeddings,
                 base=int(config.rope_theta),
-                rope_scaling=config.rope_scaling \
-                    if hasattr(config, "rope_scaling") \
-                    and config.rope_scaling is not None else None,
+                rope_scaling=config.rope_scaling
+                if hasattr(config, "rope_scaling") and config.rope_scaling is not None
+                else None,
                 is_neox_style=True,
             )
         else:
             self.rotary_emb = None
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.attention_multiplier,
-                              num_kv_heads=self.num_key_value_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.attention_multiplier,
+            num_kv_heads=self.num_key_value_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
-
         qkv, _ = self.qkv_proj(hidden_states)
-        query, key, value = qkv.split([
-            self.num_heads * self.head_dim, self.num_key_value_heads *
-            self.head_dim, self.num_key_value_heads * self.head_dim
-        ],
-                                      dim=-1)
+        query, key, value = qkv.split(
+            [
+                self.num_heads * self.head_dim,
+                self.num_key_value_heads * self.head_dim,
+                self.num_key_value_heads * self.head_dim,
+            ],
+            dim=-1,
+        )
 
         if self.rotary_emb is not None:
             query, key = self.rotary_emb(positions, query, key)
@@ -304,7 +326,6 @@ ALL_DECODER_LAYER_TYPES = {
 
 @support_torch_compile
 class GraniteMoeHybridModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -312,25 +333,21 @@ class GraniteMoeHybridModel(nn.Module):
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+        self.quant_config = quant_config
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
         self.embedding_multiplier = config.embedding_multiplier
 
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
-            layer_class = ALL_DECODER_LAYER_TYPES[
-                config.layer_types[layer_idx]]
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layer_types[layer_idx]]
             return layer_class(
                 config,
                 layer_idx,
@@ -341,34 +358,34 @@ class GraniteMoeHybridModel(nn.Module):
             )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
                 hidden_states = hidden_states * self.embedding_multiplier
             residual = None
         else:
             if intermediate_tensors is None:
-                raise RuntimeError('Intermediate tensors may not be None!')
+                raise RuntimeError("Intermediate tensors may not be None!")
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
@@ -376,21 +393,46 @@ class GraniteMoeHybridModel(nn.Module):
         for i, layer in enumerate(self.layers):
             if isinstance(layer, GraniteMoeHybridAttentionDecoderLayer):
                 num_attn += 1
-            hidden_states, residual = layer(positions=positions,
-                                            hidden_states=hidden_states,
-                                            residual=residual)
+            hidden_states, residual = layer(
+                positions=positions, hidden_states=hidden_states, residual=residual
+            )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        # layers.0.block_sparse_moe.expert_0.input_linear.input_scale
+        ckpt_gate_proj_name = "gate_proj"
+        ckpt_down_proj_name = "down_proj"
+        ckpt_up_proj_name = "up_proj"
+        num_experts = self.config.num_local_experts
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            (
+                "block_sparse_moe.experts.w13_"
+                if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
+                else "block_sparse_moe.experts.w2_",
+                f"block_sparse_moe.experts.{expert_id}.{weight_name}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -399,11 +441,11 @@ class GraniteMoeHybridModel(nn.Module):
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
 
         def _load(n, p):
             param = params_dict[n]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, p)
             loaded_params.add(n)
 
@@ -411,75 +453,128 @@ class GraniteMoeHybridModel(nn.Module):
             # Skip layers on other devices.
             if not is_pp_missing_parameter(n, self):
                 param = params_dict[n]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, p, shard_id)
                 loaded_params.add(n)
 
         def _load_expert(n, p, name, shard_id, expert_id):
             param = params_dict[n]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param,
-                          p,
-                          name,
-                          shard_id=shard_id,
-                          expert_id=expert_id)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, p, name, shard_id=shard_id, expert_id=expert_id)
             loaded_params.add(n)
 
+        def _load_quant_expert(name, loaded_weight):
+            for mapping in expert_params_mapping:
+                param_name, weight_name, expert_id, shard_id = mapping
+
+                if weight_name not in name:
+                    continue
+
+                name_mapped = name.replace(weight_name, param_name)
+
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name_mapped, self):
+                    continue
+
+                param = params_dict[name_mapped]
+                weight_loader = param.weight_loader
+                success = False
+
+                if weight_loader is not None:
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+
+                if success:
+                    return name_mapped
+            return None
+
         for n, p in weights:
             if "A_log" in n:
                 n = n.replace("A_log", "A")
 
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(n)
+            ):
+                # Loading kv cache quantization scales
+                loaded_weight = p
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                _load(scale_name, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            if _load_quant_expert(n, p):
+                continue
+
             # Logic analogous to: https://github.com/vllm-project/vllm/blob/f49e5aff11c986ed4d45202b1716c5d74786efa9/vllm/model_executor/models/granitemoeshared.py#L215
             # Mapping different experts' layout:
             #  from HF (input_linear, output_linear, router)
             #  to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate)
             # The renaming and parameter loading logic is the same for weight
             # and weight_scale tensors so we can reuse them without issues.
-            if (n.endswith('.block_sparse_moe.input_linear.weight') or
-                    n.endswith('.block_sparse_moe.input_linear.weight_scale')):
+            if n.endswith(".block_sparse_moe.input_linear.weight") or n.endswith(
+                ".block_sparse_moe.input_linear.weight_scale"
+            ):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
-                        '.block_sparse_moe.input_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w1.weight")
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w1.weight",
+                    )
                     w3_name = n.replace(
-                        '.block_sparse_moe.input_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w3.weight")
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w3.weight",
+                    )
                     w1_param, w3_param = p[e].chunk(2, dim=0)
-                    _load_expert(n.replace('.input_linear.', '.experts.w13_'),
-                                 w1_param,
-                                 w1_name,
-                                 shard_id='w1',
-                                 expert_id=e)
-                    _load_expert(n.replace('.input_linear.', '.experts.w13_'),
-                                 w3_param,
-                                 w3_name,
-                                 shard_id='w3',
-                                 expert_id=e)
-            elif (n.endswith('.block_sparse_moe.output_linear.weight') or
-                  n.endswith('.block_sparse_moe.output_linear.weight_scale')):
+                    _load_expert(
+                        n.replace(".input_linear.", ".experts.w13_"),
+                        w1_param,
+                        w1_name,
+                        shard_id="w1",
+                        expert_id=e,
+                    )
+                    _load_expert(
+                        n.replace(".input_linear.", ".experts.w13_"),
+                        w3_param,
+                        w3_name,
+                        shard_id="w3",
+                        expert_id=e,
+                    )
+            elif n.endswith(".block_sparse_moe.output_linear.weight") or n.endswith(
+                ".block_sparse_moe.output_linear.weight_scale"
+            ):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
-                        '.block_sparse_moe.output_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w2.weight")
+                        ".block_sparse_moe.output_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w2.weight",
+                    )
                     w2_param = p[e]
-                    _load_expert(n.replace('.output_linear.', '.experts.w2_'),
-                                 w2_param,
-                                 w2_name,
-                                 shard_id='w2',
-                                 expert_id=e)
-            elif n.endswith('.block_sparse_moe.router.layer.weight'):
-                gate_name = n.replace('.block_sparse_moe.router.layer.weight',
-                                      ".block_sparse_moe.gate.weight")
+                    _load_expert(
+                        n.replace(".output_linear.", ".experts.w2_"),
+                        w2_param,
+                        w2_name,
+                        shard_id="w2",
+                        expert_id=e,
+                    )
+            elif n.endswith(".block_sparse_moe.router.layer.weight"):
+                gate_name = n.replace(
+                    ".block_sparse_moe.router.layer.weight",
+                    ".block_sparse_moe.gate.weight",
+                )
                 _load(gate_name, p)
             else:
                 loaded = False
                 for param_name, weight_name, shard_id in stacked_params_mapping:
                     if weight_name in n:
-                        _load_shard(n.replace(weight_name, param_name),
-                                    p,
-                                    shard_id=shard_id)
+                        _load_shard(
+                            n.replace(weight_name, param_name), p, shard_id=shard_id
+                        )
                         loaded = True
                 if not loaded:
                     _load(n, p)
@@ -487,14 +582,24 @@ class GraniteMoeHybridModel(nn.Module):
         return loaded_params
 
 
-class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
-                                  SupportsPP, IsHybrid, SupportsQuant):
+class GraniteMoeHybridForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsQuant,
+    SupportsMambaPrefixCaching,
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
         ],
+        "conv1d": ["conv1d"],
+        "in_proj": ["in_proj"],
+        "input_linear": ["input_linear"],
     }
     embedding_modules = {
         "embed_tokens": "input_embeddings",
@@ -507,7 +612,6 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -549,66 +653,57 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
-        scheduler_config = vllm_config.scheduler_config
-        if cache_config.enable_prefix_caching:
-            raise RuntimeError(
-                "GraniteMoeHybrid currently does not support prefix caching")
 
+        scheduler_config = vllm_config.scheduler_config
         self.quant_config = vllm_config.quant_config
         self.config = config
         self.scheduler_config = scheduler_config
-        self.model = GraniteMoeHybridModel(vllm_config=vllm_config,
-                                           prefix=maybe_prefix(
-                                               prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = GraniteMoeHybridModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             quant_config=self.quant_config,
-            prefix=maybe_prefix(prefix, "lm_head"))
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size,
-                                                scale=1 /
-                                                self.config.logits_scaling)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size,
+            config.vocab_size,
+            scale=1 / self.config.logits_scaling,
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index b434822bff0a9513474b1dbcafab61565bb56ff1..926c539af33be72d2adee7cec0299880d2a88275 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -5,9 +5,9 @@
 The architecture is the same as granitemoe but with the addition of shared
 experts.
 """
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from torch import nn
@@ -18,13 +18,16 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.sequence import IntermediateTensors
 
 from .granitemoe import GraniteMoeAttention, GraniteMoeModel, GraniteMoeMoE
@@ -33,11 +36,10 @@ from .utils import AutoWeightsLoader, make_layers, maybe_prefix
 
 
 class GraniteMoeSharedMLP(nn.Module):
-
     def __init__(
         self,
         config: GraniteMoeSharedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -49,16 +51,20 @@ class GraniteMoeSharedMLP(nn.Module):
             output_sizes=[self.hidden_size] * 2,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.input_linear")
+            prefix=f"{prefix}.input_linear",
+        )
         self.output_linear = RowParallelLinear(
             self.hidden_size,
             self.input_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.output_linear")
+            prefix=f"{prefix}.output_linear",
+        )
         if config.hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -69,12 +75,11 @@ class GraniteMoeSharedMLP(nn.Module):
 
 
 class GraniteMoeSharedDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: GraniteMoeSharedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -92,26 +97,28 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
-            attention_multiplier=config.attention_multiplier)
+            attention_multiplier=config.attention_multiplier,
+        )
         self.block_sparse_moe = GraniteMoeMoE(
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
             quant_config=quant_config,
-            prefix=f"{prefix}.block_sparse_moe")
-        self.shared_mlp = None if \
-            getattr(config, 'shared_intermediate_size', 0) == 0 \
+            prefix=f"{prefix}.block_sparse_moe",
+        )
+        self.shared_mlp = (
+            None
+            if getattr(config, "shared_intermediate_size", 0) == 0
             else GraniteMoeSharedMLP(
-                config,
-                quant_config=quant_config,
-                prefix=f"{prefix}.shared_mlp"
+                config, quant_config=quant_config, prefix=f"{prefix}.shared_mlp"
             )
+        )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
         self.residual_multiplier = config.residual_multiplier
 
@@ -145,27 +152,22 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
 
 @support_torch_compile
 class GraniteMoeSharedModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config  # Required by MixtralModel
         self.padding_idx = config.pad_token_id
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
             quant_config=quant_config,
         )
         self.embedding_multiplier = config.embedding_multiplier
@@ -175,25 +177,26 @@ class GraniteMoeSharedModel(nn.Module):
             lambda prefix: GraniteMoeSharedDecoderLayer(
                 config, cache_config, quant_config=quant_config, prefix=prefix
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             hidden_states *= self.embedding_multiplier
         else:
             assert intermediate_tensors is not None
@@ -201,40 +204,46 @@ class GraniteMoeSharedModel(nn.Module):
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(positions, hidden_states)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                }
+            )
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         new_weights = {}
         for n, p in weights:
-            if n.endswith('.block_sparse_moe.input_linear.weight'):
+            if n.endswith(".block_sparse_moe.input_linear.weight"):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
-                        '.block_sparse_moe.input_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w1.weight")
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w1.weight",
+                    )
                     w3_name = n.replace(
-                        '.block_sparse_moe.input_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w3.weight")
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w3.weight",
+                    )
                     w1_param, w3_param = p[e].chunk(2, dim=0)
                     assert w1_name not in new_weights
                     assert w3_name not in new_weights
                     new_weights[w1_name] = w1_param
                     new_weights[w3_name] = w3_param
-            elif n.endswith('.block_sparse_moe.output_linear.weight'):
+            elif n.endswith(".block_sparse_moe.output_linear.weight"):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
-                        '.block_sparse_moe.output_linear.weight',
-                        f".block_sparse_moe.experts.{e}.w2.weight")
+                        ".block_sparse_moe.output_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w2.weight",
+                    )
                     w2_param = p[e]
                     assert w2_name not in new_weights
                     new_weights[w2_name] = w2_param
-            elif n.endswith('.block_sparse_moe.router.layer.weight'):
-                gate_name = n.replace('.block_sparse_moe.router.layer.weight',
-                                      ".block_sparse_moe.gate.weight")
+            elif n.endswith(".block_sparse_moe.router.layer.weight"):
+                gate_name = n.replace(
+                    ".block_sparse_moe.router.layer.weight",
+                    ".block_sparse_moe.gate.weight",
+                )
                 assert gate_name not in new_weights
                 new_weights[gate_name] = p
             else:
@@ -264,69 +273,61 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
-
-        self.model = GraniteMoeSharedModel(vllm_config=vllm_config,
-                                           prefix=maybe_prefix(
-                                               prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+
+        self.model = GraniteMoeSharedModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "lm_head"))
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size,
-                                                scale=1 /
-                                                self.config.logits_scaling)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size,
+            config.vocab_size,
+            scale=1 / self.config.logits_scaling,
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
-    def compute_logits(self,
-                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 639d8f620f9479359c90ac65d729dfb159063756..181c4ed2dca5aac3a9123e7673635ba572292da0 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Set
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -9,11 +8,15 @@ import torch.nn as nn
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
-                                               PoolerHead, PoolerNormalize,
-                                               PoolingParamsUpdate,
-                                               get_prompt_lens,
-                                               get_prompt_token_ids)
+from vllm.model_executor.layers.pooler import (
+    DispatchPooler,
+    Pooler,
+    PoolerHead,
+    PoolerNormalize,
+    PoolingParamsUpdate,
+    get_prompt_lens,
+    get_prompt_token_ids,
+)
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.tasks import PoolingTask
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
@@ -47,19 +50,18 @@ class GritLMMeanPool(nn.Module):
         def tokens_to_ids(tokens: list[str]) -> np.ndarray:
             return np.array([self.token_ids[token] for token in tokens])
 
-        self.user_pattern_ids = tokens_to_ids(
-            ["▁<", "|", "user", "|", ">", "<0x0A>"])
+        self.user_pattern_ids = tokens_to_ids(["▁<", "|", "user", "|", ">", "<0x0A>"])
         self.embed_newline_pattern_ids = tokens_to_ids(
-            ["<0x0A>", "<", "|", "embed", "|", ">", "<0x0A>"])
-        self.embed_pattern_ids = tokens_to_ids(
-            ["▁<", "|", "embed", "|", ">", "<0x0A>"])
+            ["<0x0A>", "<", "|", "embed", "|", ">", "<0x0A>"]
+        )
+        self.embed_pattern_ids = tokens_to_ids(["▁<", "|", "embed", "|", ">", "<0x0A>"])
 
     def _find_array(
         self,
         arr: np.ndarray,
         target: np.ndarray,
         start_idx: int = 0,
-        end_idx: Optional[int] = None,
+        end_idx: int | None = None,
     ) -> int:
         """
         Find the first occurrence of `target` in `arr` starting from
@@ -86,7 +88,7 @@ class GritLMMeanPool(nn.Module):
             end_idx = arr_len
 
         for i in range(start_idx, min(end_idx, arr_len - target_len + 1)):
-            if (arr[i:i + target_len] == target).all():
+            if (arr[i : i + target_len] == target).all():
                 return i
 
         return -1
@@ -105,31 +107,37 @@ class GritLMMeanPool(nn.Module):
 
         # Return no instruction in case of missing BOS token.
         if prompt_token_ids[0] != self.token_ids["<s>"]:
-            logger.warning("BOS token not found in prompt, "
-                           "thus using empty string for instruction. "
-                           "GritLM requires BOS token in prompt.")
+            logger.warning(
+                "BOS token not found in prompt, "
+                "thus using empty string for instruction. "
+                "GritLM requires BOS token in prompt."
+            )
             return instruction_len
 
         # If user pattern is found in the prompt, that means there should be
         # a newline token before the embed pattern.
         embed_pattern_ids = self.embed_pattern_ids
-        if self._find_array(prompt_token_ids,
-                            self.user_pattern_ids,
-                            start_idx=1,
-                            end_idx=2) == 1:
+        if (
+            self._find_array(
+                prompt_token_ids, self.user_pattern_ids, start_idx=1, end_idx=2
+            )
+            == 1
+        ):
             embed_pattern_ids = self.embed_newline_pattern_ids
 
         # Find the embed pattern in the prompt.
-        found_embed_pattern_idx = self._find_array(prompt_token_ids,
-                                                   embed_pattern_ids,
-                                                   start_idx=1)
+        found_embed_pattern_idx = self._find_array(
+            prompt_token_ids, embed_pattern_ids, start_idx=1
+        )
 
         if found_embed_pattern_idx != -1:
             instruction_len = found_embed_pattern_idx + len(embed_pattern_ids)
         else:
-            logger.warning("Query instruction not found in prompt, "
-                           "thus using BOS token as instruction instead. "
-                           "GritLM requires query instruction in prompt.")
+            logger.warning(
+                "Query instruction not found in prompt, "
+                "thus using BOS token as instruction instead. "
+                "GritLM requires query instruction in prompt."
+            )
             instruction_len = 1
 
         return instruction_len
@@ -140,59 +148,34 @@ class GritLMMeanPool(nn.Module):
     def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
         return PoolingParamsUpdate(requires_token_ids=True)
 
-    def forward_one(
-        self,
-        hidden_states: torch.Tensor,
-        prompt_len: Optional[torch.Tensor] = None,
-        instr_len: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        assert prompt_len is None or prompt_len == hidden_states.shape[0], \
-            "partial prefill not supported with MEAN pooling"
-
-        return hidden_states[instr_len:].mean(dim=0, dtype=torch.float32)
-
-    def forward_all(
-        self,
-        hidden_states: torch.Tensor,
-        prompt_lens: torch.Tensor,
-        instr_lens: torch.Tensor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
-        offset = 0
-        pooled_data = list[torch.Tensor]()
-
-        for prompt_len, instr_len in zip(prompt_lens, instr_lens):
-            pooled_data.append(hidden_states[offset + instr_len:offset +
-                                             prompt_len].mean(
-                                                 dim=0, dtype=torch.float32))
-            offset += prompt_len
-
-        return pooled_data
-
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
         instr_lens = torch.tensor(
             [
                 self._get_instruction_len(token_ids.cpu().numpy())
                 for token_ids in get_prompt_token_ids(pooling_metadata)
             ],
-            device=prompt_lens.device,
+            device="cpu",
         )
 
-        if isinstance(hidden_states, list):
-            return [
-                self.forward_one(h, prompt_len, instr_len) for h, prompt_len,
-                instr_len in zip(hidden_states, prompt_lens, instr_lens)
-            ]
+        offset = 0
+        pooled_data = list[torch.Tensor]()
+        for prompt_len, instr_len in zip(prompt_lens, instr_lens):
+            pooled_data.append(
+                hidden_states[offset + instr_len : offset + prompt_len].mean(
+                    dim=0, dtype=torch.float32
+                )
+            )
+            offset += prompt_len
 
-        return self.forward_all(hidden_states, prompt_lens, instr_lens)
+        return pooled_data
 
 
 class GritLMPooler(Pooler):
-
     def __init__(self, model_config: ModelConfig):
         super().__init__()
 
@@ -254,9 +237,9 @@ class GritLM(LlamaForCausalLM):
 
         pooler_config = vllm_config.model_config.pooler_config
         if pooler_config is not None:
-            self.pooler = DispatchPooler({
-                "encode":
-                Pooler.for_encode(pooler_config),
-                "embed":
-                GritLMPooler(vllm_config.model_config),
-            })
+            self.pooler = DispatchPooler(
+                {
+                    "token_embed": Pooler.for_token_embed(pooler_config),
+                    "embed": GritLMPooler(vllm_config.model_config),
+                }
+            )
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index 996e41fe84ff383ac5b1075e95048fe8fe08f596..9dc231863f74f25b3e05fa89be5b64e34a5cd36d 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -22,9 +22,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Grok1 model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -36,22 +36,32 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 # Default Grok1-specific constants, overridden by config values if present
 DEFAULT_ATTN_OUTPUT_MULTIPLIER = 0.08838834764831845
@@ -68,37 +78,43 @@ class Grok1MoE(nn.Module):
     across ranks.
     """
 
-    def __init__(self,
-                 num_experts: int,
-                 top_k: int,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 tp_size: Optional[int] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = hidden_size
 
         # Gate always runs at half / full precision for now.
-        self.gate = ReplicatedLinear(hidden_size,
-                                     num_experts,
-                                     bias=False,
-                                     params_dtype=params_dtype,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
-
-        self.experts = FusedMoE(num_experts=num_experts,
-                                top_k=top_k,
-                                hidden_size=hidden_size,
-                                intermediate_size=intermediate_size,
-                                params_dtype=params_dtype,
-                                reduce_results=True,
-                                renormalize=True,
-                                quant_config=quant_config,
-                                tp_size=tp_size,
-                                activation="gelu",
-                                prefix=f"{prefix}.experts")
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            reduce_results=True,
+            renormalize=True,
+            quant_config=quant_config,
+            tp_size=tp_size,
+            activation="gelu",
+            prefix=f"{prefix}.experts",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -112,18 +128,17 @@ class Grok1MoE(nn.Module):
 
 
 class Grok1Attention(nn.Module):
-
     def __init__(
-            self,
-            hidden_size: int,
-            num_heads: int,
-            num_kv_heads: int,
-            max_position: int = 4096 * 32,
-            rope_theta: float = 10000,
-            cache_config: Optional[CacheConfig] = None,
-            quant_config: Optional[QuantizationConfig] = None,
-            prefix: str = "",
-            config=None,  # Added config parameter
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        config=None,  # Added config parameter
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -172,19 +187,21 @@ class Grok1Attention(nn.Module):
             is_neox_style=True,
         )
 
-        attn_logits_soft_cap = max(
-            getattr(config, "attn_logit_softcapping", 30.0), 0.0)
+        attn_logits_soft_cap = max(getattr(config, "attn_logit_softcapping", 30.0), 0.0)
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              logits_soft_cap=attn_logits_soft_cap,
-                              prefix=f"{prefix}.attn")
-        self.attn_multiplier = getattr(self.config, "attn_output_multiplier",
-                                       1.0) if self.config else 1.0
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            logits_soft_cap=attn_logits_soft_cap,
+            prefix=f"{prefix}.attn",
+        )
+        self.attn_multiplier = (
+            getattr(self.config, "attn_output_multiplier", 1.0) if self.config else 1.0
+        )
 
     def forward(
         self,
@@ -201,12 +218,11 @@ class Grok1Attention(nn.Module):
 
 
 class Grok1DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -214,8 +230,7 @@ class Grok1DecoderLayer(nn.Module):
         # Check for fp8 quantization
         self.use_fp8 = False
         if quant_config is not None:
-            self.use_fp8 = getattr(quant_config, "is_fp8_w8a8",
-                                   lambda: False)()
+            self.use_fp8 = getattr(quant_config, "is_fp8_w8a8", lambda: False)()
             if not self.use_fp8 and hasattr(quant_config, "is_fp8"):
                 self.use_fp8 = quant_config.is_fp8
 
@@ -231,41 +246,39 @@ class Grok1DecoderLayer(nn.Module):
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
-            config=config)  # Pass config to Grok1Attention
+            config=config,
+        )  # Pass config to Grok1Attention
 
         # Grok1 uses "num_experts" in its config
         num_experts = getattr(config, "num_experts", 8)
         num_experts_per_tok = getattr(config, "num_experts_per_tok", 2)
 
-        self.moe_block = Grok1MoE(num_experts=num_experts,
-                                  top_k=num_experts_per_tok,
-                                  hidden_size=config.hidden_size,
-                                  intermediate_size=config.intermediate_size,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.moe_block")
-
-        self.pre_attn_norm = RMSNorm(config.hidden_size,
-                                     eps=config.rms_norm_eps)
-        self.post_attn_norm = RMSNorm(config.hidden_size,
-                                      eps=config.rms_norm_eps)
-        self.pre_moe_norm = RMSNorm(config.hidden_size,
-                                    eps=config.rms_norm_eps)
-        self.post_moe_norm = RMSNorm(config.hidden_size,
-                                     eps=config.rms_norm_eps)
+        self.moe_block = Grok1MoE(
+            num_experts=num_experts,
+            top_k=num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.moe_block",
+        )
+
+        self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.pre_attn_norm(hidden_states)
         else:
-            hidden_states, residual = self.pre_attn_norm(
-                hidden_states, residual)
+            hidden_states, residual = self.pre_attn_norm(hidden_states, residual)
 
         hidden_states = self.attn(
             positions=positions,
@@ -285,30 +298,26 @@ class Grok1DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Grok1Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
+
         self.embedding_multiplier_scale = getattr(
-            config, "embedding_multiplier_scale",
-            DEFAULT_EMBEDDING_MULTIPLIER_SCALE)
+            config, "embedding_multiplier_scale", DEFAULT_EMBEDDING_MULTIPLIER_SCALE
+        )
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
             quant_config=quant_config,
         )
 
@@ -317,14 +326,15 @@ class Grok1Model(nn.Module):
             lambda prefix: Grok1DecoderLayer(
                 config, cache_config, quant_config=quant_config, prefix=prefix
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
         hidden_states = hidden_states * self.embedding_multiplier_scale
         return hidden_states
@@ -333,14 +343,14 @@ class Grok1Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -351,10 +361,9 @@ class Grok1Model(nn.Module):
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
@@ -367,10 +376,10 @@ class Grok1Model(nn.Module):
             ckpt_gate_proj_name="linear",  # Grok1 specific
             ckpt_down_proj_name="linear_1",  # Grok1 specific
             ckpt_up_proj_name="linear_v",  # Grok1 specific
-            num_experts=num_experts)
+            num_experts=num_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -382,25 +391,27 @@ class Grok1Model(nn.Module):
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -423,21 +434,25 @@ class Grok1Model(nn.Module):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -453,8 +468,9 @@ class Grok1Model(nn.Module):
                         name = name.replace("scale", "weight")
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -476,24 +492,18 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
-        self.quant_config = quant_config
 
-        self.model = Grok1Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.quant_config = quant_config
 
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = Grok1Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
@@ -502,40 +512,41 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             self.lm_head.weight = self.model.embed_tokens.weight
 
         self.output_multiplier_scale = getattr(
-            config, "output_multiplier_scale", DEFAULT_OUTPUT_MULTIPLIER_SCALE)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size,
-                                                self.output_multiplier_scale)
+            config, "output_multiplier_scale", DEFAULT_OUTPUT_MULTIPLIER_SCALE
+        )
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size, scale=self.output_multiplier_scale
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         # Skip lm_head when tie_word_embeddings is True
-        skip_prefixes = (["lm_head"]
-                         if self.config.tie_word_embeddings else None)
+        skip_prefixes = ["lm_head"] if self.config.tie_word_embeddings else None
 
         loader = AutoWeightsLoader(
             self,
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index b42df3ad8650816edce0df38c844a4da6d5fd2b9..81c6b34bd6ce0fbdbc85a9ccd29ebac26e4d0557 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -9,7 +9,6 @@
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Mapping, Sequence
-from typing import Optional, Union
 
 import torch
 from PIL import Image
@@ -18,20 +17,33 @@ from transformers import PretrainedConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargsItems, MultiModalUUIDDict
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (MultiModalProcessingInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    MultiModalProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .intern_vit import InternVisionModel
-from .internvl import (IMG_CONTEXT, IMG_END, IMG_START,
-                       BaseInternVLDummyInputsBuilder,
-                       BaseInternVLMultiModalProcessor,
-                       BaseInternVLProcessingInfo, BaseInternVLProcessor,
-                       InternVLChatModel, build_transform,
-                       find_closest_aspect_ratio, get_internvl_target_ratios)
+from .internvl import (
+    IMG_CONTEXT,
+    IMG_END,
+    IMG_START,
+    BaseInternVLDummyInputsBuilder,
+    BaseInternVLMultiModalProcessor,
+    BaseInternVLProcessingInfo,
+    BaseInternVLProcessor,
+    InternVLChatModel,
+    build_transform,
+    find_closest_aspect_ratio,
+    get_internvl_target_ratios,
+)
 
 
 def resolve_h2ovl_min_max_num(
@@ -54,15 +66,17 @@ def get_h2ovl_target_ratios(
     min_num: int,
     max_num: int,
     *,
-    prior_aspect_ratio: Optional[tuple[int, int]],
+    prior_aspect_ratio: tuple[int, int] | None,
 ) -> list[tuple[int, int]]:
     target_ratios = get_internvl_target_ratios(min_num, max_num)
 
     # if prior_aspect_ratio is provided, filter the target ratios
     if prior_aspect_ratio is not None:
         target_ratios = [
-            ratio for ratio in target_ratios if prior_aspect_ratio[0] %
-            ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0
+            ratio
+            for ratio in target_ratios
+            if prior_aspect_ratio[0] % ratio[0] != 0
+            and prior_aspect_ratio[1] % ratio[1] != 0
         ]
 
     return target_ratios
@@ -155,7 +169,7 @@ def _preprocess_image(
     min_num: int,
     max_num: int,
     use_thumbnail: bool,
-    prior_aspect_ratio: Optional[tuple[int, int]],
+    prior_aspect_ratio: tuple[int, int] | None,
 ) -> tuple[torch.Tensor, tuple[int, int]]:
     target_ratios = get_h2ovl_target_ratios(
         min_num,
@@ -207,7 +221,8 @@ def image_to_pixel_values_h2ovl(
         )
         # combine pixel values
         pixel_values = torch.cat(
-            [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0)
+            [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0
+        )
 
     else:
         pixel_values, _ = _preprocess_image(
@@ -223,16 +238,15 @@ def image_to_pixel_values_h2ovl(
 
 
 class H2OVLProcessor(BaseInternVLProcessor):
-
     def __init__(
         self,
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_msac: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_msac: bool | None = None,
     ) -> None:
         super().__init__(
             config,
@@ -255,7 +269,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -265,19 +279,23 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def resolve_min_max_num(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> tuple[int, int]:
-        min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
-                             is None else min_dynamic_patch)
-        max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
-                             is None else max_dynamic_patch)
-        dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
-                              is None else dynamic_image_size)
-        use_thumbnail = (self.use_thumbnail
-                         if use_thumbnail is None else use_thumbnail)
+        min_dynamic_patch = (
+            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
+        )
+        max_dynamic_patch = (
+            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
+        )
+        dynamic_image_size = (
+            self.dynamic_image_size
+            if dynamic_image_size is None
+            else dynamic_image_size
+        )
+        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
 
         return resolve_h2ovl_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -289,12 +307,12 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def resolve_target_ratios(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
-        prior_aspect_ratio: Optional[tuple[int, int]] = None,
-        override_min_num: Optional[int] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+        prior_aspect_ratio: tuple[int, int] | None = None,
+        override_min_num: int | None = None,
     ) -> list[tuple[int, int]]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -316,9 +334,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
         *,
         image_width: int,
         image_height: int,
-        use_msac: Optional[bool] = None,
+        use_msac: bool | None = None,
     ) -> int:
-        use_msac = (self.use_msac if use_msac is None else use_msac)
+        use_msac = self.use_msac if use_msac is None else use_msac
 
         use_thumbnail = self.use_thumbnail
 
@@ -366,9 +384,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         use_msac = self.use_msac if len(images) == 1 else False
 
@@ -387,12 +405,12 @@ class H2OVLProcessor(BaseInternVLProcessor):
                 max_num=max_num,
                 use_thumbnail=self.use_thumbnail,
                 use_msac=use_msac,
-            ) for image in images
+            )
+            for image in images
         ]
 
 
 class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
         return self.ctx.init_processor(
             H2OVLProcessor,
@@ -406,8 +424,8 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[H2OVLProcessor],
-        use_msac: Optional[bool] = None,
+        processor: H2OVLProcessor | None,
+        use_msac: bool | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -419,9 +437,7 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
         )
 
 
-class H2OVLMultiModalProcessor(
-        BaseInternVLMultiModalProcessor[H2OVLProcessingInfo]):
-
+class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingInfo]):
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -446,7 +462,8 @@ class H2OVLMultiModalProcessor(
 
         def get_replacement_internvl(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
@@ -475,11 +492,11 @@ class H2OVLMultiModalProcessor(
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
@@ -506,13 +523,13 @@ class H2OVLMultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     H2OVLMultiModalProcessor,
     info=H2OVLProcessingInfo,
-    dummy_inputs=BaseInternVLDummyInputsBuilder)
+    dummy_inputs=BaseInternVLDummyInputsBuilder,
+)
 class H2OVLChatModel(InternVLChatModel):
-
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
@@ -520,8 +537,9 @@ class H2OVLChatModel(InternVLChatModel):
         if not is_mono:
             vision_feature_layer = config.select_layer
             if vision_feature_layer < 0:
-                num_hidden_layers = (config.vision_config.num_hidden_layers +
-                                     vision_feature_layer + 1)
+                num_hidden_layers = (
+                    config.vision_config.num_hidden_layers + vision_feature_layer + 1
+                )
             else:
                 num_hidden_layers = vision_feature_layer + 1
 
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 8a23a6b45bc702c4c9ab796eda3bef6a940fc77a..1eadcbe67ade38277223396e37bf64c740c83da3 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -23,9 +23,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only HunYuan model compatible with HuggingFace weights."""
+
 import typing
 from collections.abc import Callable, Iterable
-from typing import Any, Optional, Union
+from itertools import islice
+from typing import Any
 
 import regex as re
 import torch
@@ -35,30 +37,43 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
-from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_layers, maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_layers,
+    maybe_prefix,
+)
 
 
 def _is_moe(config: PretrainedConfig) -> bool:
@@ -81,13 +96,12 @@ def _get_cla_factor(config: PretrainedConfig) -> int:
 
 
 class HunYuanMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -109,8 +123,9 @@ class HunYuanMLP(nn.Module):
             reduce_results=reduce_results,
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -121,7 +136,6 @@ class HunYuanMLP(nn.Module):
 
 
 class HunYuanAttention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -129,11 +143,11 @@ class HunYuanAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         layer_id: int = -1,
     ) -> None:
@@ -205,16 +219,14 @@ class HunYuanAttention(nn.Module):
         )
 
         if self.use_qk_norm:
-            self.query_layernorm = RMSNorm(self.head_dim,
-                                           eps=config.rms_norm_eps)
-            self.key_layernorm = RMSNorm(self.head_dim,
-                                         eps=config.rms_norm_eps)
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_states: Optional[tuple[torch.Tensor]] = None,
+        kv_states: tuple[torch.Tensor] | None = None,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -222,9 +234,11 @@ class HunYuanAttention(nn.Module):
         ori_k = k
         if self.use_qk_norm:
             q = self.query_layernorm(
-                q.view(-1, self.num_heads, self.head_dim).contiguous())
+                q.view(-1, self.num_heads, self.head_dim).contiguous()
+            )
             k = self.key_layernorm(
-                k.view(-1, self.num_kv_heads, self.head_dim).contiguous())
+                k.view(-1, self.num_kv_heads, self.head_dim).contiguous()
+            )
 
         attn_output = self.attn(q, k, v)
         # For o_proj
@@ -234,7 +248,6 @@ class HunYuanAttention(nn.Module):
 
 
 class HunYuanCrossAttention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -242,11 +255,11 @@ class HunYuanCrossAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         layer_id: int = -1,
     ) -> None:
@@ -317,16 +330,14 @@ class HunYuanCrossAttention(nn.Module):
         )
 
         if self.use_qk_norm:
-            self.query_layernorm = RMSNorm(self.head_dim,
-                                           eps=config.rms_norm_eps)
-            self.key_layernorm = RMSNorm(self.head_dim,
-                                         eps=config.rms_norm_eps)
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_states: Optional[tuple[torch.Tensor]] = None,
+        kv_states: tuple[torch.Tensor] | None = None,
     ) -> torch.Tensor:
         assert kv_states is not None
         ori_k, v = kv_states  # use last layer kv,
@@ -336,9 +347,11 @@ class HunYuanCrossAttention(nn.Module):
         q, _ = self.rotary_emb(positions, q, k_tmp)
         if self.use_qk_norm:
             q = self.query_layernorm(
-                q.view(-1, self.num_heads, self.head_dim).contiguous())
+                q.view(-1, self.num_heads, self.head_dim).contiguous()
+            )
             k = self.key_layernorm(
-                k.view(-1, self.num_kv_heads, self.head_dim).contiguous())
+                k.view(-1, self.num_kv_heads, self.head_dim).contiguous()
+            )
 
         attn_output = self.attn(q, k, v)
         # For o_proj
@@ -348,11 +361,10 @@ class HunYuanCrossAttention(nn.Module):
 
 
 class HunYuanSparseMoeBlock(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         layer_id: int = -1,
         prefix: str = "",
         enable_eplb: bool = False,
@@ -361,14 +373,15 @@ class HunYuanSparseMoeBlock(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
+                f"the number of experts {config.num_experts}."
+            )
 
         # Get layer_id topk if config.moe_topk is a list
         if isinstance(config.moe_topk, list):
@@ -381,9 +394,11 @@ class HunYuanSparseMoeBlock(nn.Module):
         # If it is moe, moe_intermediate_size is preferred
         intermediate_size = config.intermediate_size
         if config.moe_intermediate_size is not None:
-            intermediate_size = (config.moe_intermediate_size if isinstance(
-                config.moe_intermediate_size, int) else
-                                 config.moe_intermediate_size[layer_id])
+            intermediate_size = (
+                config.moe_intermediate_size
+                if isinstance(config.moe_intermediate_size, int)
+                else config.moe_intermediate_size[layer_id]
+            )
 
         # Load balancing settings.
         vllm_config = get_current_vllm_config()
@@ -392,32 +407,20 @@ class HunYuanSparseMoeBlock(nn.Module):
 
         self.n_logical_experts = self.n_routed_experts
         self.n_redundant_experts = eplb_config.num_redundant_experts
-        self.n_physical_experts = (self.n_logical_experts +
-                                   self.n_redundant_experts)
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
-        self.physical_expert_start = (self.ep_rank *
-                                      self.n_local_physical_experts)
-        self.physical_expert_end = (self.physical_expert_start +
-                                    self.n_local_physical_experts)
-
-        self.experts = FusedMoE(
-            num_experts=self.n_routed_experts,
-            top_k=top_k,
-            hidden_size=config.hidden_size,
-            intermediate_size=intermediate_size,
-            reduce_results=False,
-            renormalize=top_k > 1,
-            quant_config=quant_config,
-            prefix=f"{prefix}.experts",
-            enable_eplb=self.enable_eplb,
-            num_redundant_experts=self.n_redundant_experts,
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
         )
 
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
         if config.use_mixed_mlp_moe > 0:
             # Get layer_id num_shared_expert if config.num_shared_expert is
             # a list.
@@ -438,35 +441,46 @@ class HunYuanSparseMoeBlock(nn.Module):
         else:
             self.shared_mlp = None
 
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_mlp,
+            num_experts=self.n_routed_experts,
+            top_k=top_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=False,
+            renormalize=top_k > 1,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+        )
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
         hidden_dim = hidden_states.shape[-1]
         hidden_states = hidden_states.view(-1, hidden_dim)
-        shared_output = None
-        if self.shared_mlp is not None:
-            shared_output = self.shared_mlp(hidden_states)
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
-        if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+        if self.shared_mlp is not None:
+            final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
+
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(orig_shape)
 
 
 class HunYuanDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         layer_id: int = -1,
         enable_eplb: bool = False,
@@ -475,30 +489,37 @@ class HunYuanDecoderLayer(nn.Module):
         assert layer_id >= 0
         self.layer_id = layer_id
         self.hidden_size = config.hidden_size
-        self.intermediate_size = (config.intermediate_size if isinstance(
-            config.intermediate_size, int) else
-                                  config.intermediate_size[layer_id])
+        self.intermediate_size = (
+            config.intermediate_size
+            if isinstance(config.intermediate_size, int)
+            else config.intermediate_size[layer_id]
+        )
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         cla_factor = _get_cla_factor(config)
-        attention_type = (AttentionType.ENCODER_DECODER
-                          if layer_id >= 0 and layer_id % cla_factor != 0 else
-                          AttentionType.DECODER)
+        attention_type = (
+            AttentionType.ENCODER_DECODER
+            if layer_id >= 0 and layer_id % cla_factor != 0
+            else AttentionType.DECODER
+        )
         if attention_type == AttentionType.DECODER:
             self.self_attn = HunYuanAttention(
                 config=config,
                 hidden_size=self.hidden_size,
                 num_heads=config.num_attention_heads,
-                num_kv_heads=getattr(config, "num_key_value_heads",
-                                     config.num_attention_heads),
+                num_kv_heads=getattr(
+                    config, "num_key_value_heads", config.num_attention_heads
+                ),
                 rope_theta=rope_theta,
                 rope_scaling=rope_scaling,
                 max_position_embeddings=max_position_embeddings,
@@ -513,8 +534,9 @@ class HunYuanDecoderLayer(nn.Module):
                 config=config,
                 hidden_size=self.hidden_size,
                 num_heads=config.num_attention_heads,
-                num_kv_heads=getattr(config, "num_key_value_heads",
-                                     config.num_attention_heads),
+                num_kv_heads=getattr(
+                    config, "num_key_value_heads", config.num_attention_heads
+                ),
                 rope_theta=rope_theta,
                 rope_scaling=rope_scaling,
                 max_position_embeddings=max_position_embeddings,
@@ -545,25 +567,24 @@ class HunYuanDecoderLayer(nn.Module):
                 prefix=f"{prefix}.mlp",
             )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-        kv_states: Optional[tuple[torch.Tensor]] = None,
+        residual: torch.Tensor | None,
+        kv_states: tuple[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states, ori_kv_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
@@ -571,22 +592,20 @@ class HunYuanDecoderLayer(nn.Module):
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual, ori_kv_states
 
 
 @support_torch_compile
 class HunYuanModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         eplb_config = vllm_config.parallel_config.eplb_config
         enable_eplb = vllm_config.parallel_config.enable_eplb
         self.num_redundant_experts = eplb_config.num_redundant_experts
@@ -594,16 +613,15 @@ class HunYuanModel(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -625,21 +643,21 @@ class HunYuanModel(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -648,8 +666,9 @@ class HunYuanModel(nn.Module):
 
         cla_factor = _get_cla_factor(self.config)
         prev_kv_states = None
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
+        for i, layer in enumerate(
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
             hidden_states, residual, kv_states = layer(
                 positions,
                 hidden_states,
@@ -657,25 +676,24 @@ class HunYuanModel(nn.Module):
                 prev_kv_states,
             )
 
-            if (getattr(self.config, "use_cla", False)
-                    and (i - self.start_layer) % cla_factor == 0):
+            if getattr(self.config, "use_cla", False) and i % cla_factor == 0:
                 prev_kv_states = kv_states
             else:
                 prev_kv_states = None
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
     def _split_qkv_weight(self, qkv: torch.Tensor):
         num_attention_heads = self.config.num_attention_heads
-        num_kv_heads = getattr(self.config, "num_key_value_heads",
-                               self.config.num_attention_heads)
+        num_kv_heads = getattr(
+            self.config, "num_key_value_heads", self.config.num_attention_heads
+        )
         num_key_value_groups = num_attention_heads // num_kv_heads
         hidden_size = self.config.hidden_size
 
@@ -686,8 +704,9 @@ class HunYuanModel(nn.Module):
         else:
             attention_head_dim = self.config.hidden_size // num_attention_heads
 
-        qkv = qkv.reshape(num_kv_heads, num_key_value_groups + 2,
-                          attention_head_dim, hidden_size)
+        qkv = qkv.reshape(
+            num_kv_heads, num_key_value_groups + 2, attention_head_dim, hidden_size
+        )
         q, k, v = torch.split(qkv, (num_key_value_groups, 1, 1), dim=1)
         q = q.reshape(-1, hidden_size)
         k = k.reshape(-1, hidden_size)
@@ -698,7 +717,7 @@ class HunYuanModel(nn.Module):
         if _is_moe(self.config):
             # Params for weights, fp8 weight scales, fp8 activation scales
             # (param_name, weight_name, expert_id, shard_id)
-            return FusedMoE.make_expert_params_mapping(
+            return SharedFusedMoE.make_expert_params_mapping(
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="up_proj",
@@ -720,16 +739,16 @@ class HunYuanModel(nn.Module):
         ]
 
         num_attention_heads = self.config.num_attention_heads
-        num_kv_heads = getattr(self.config, "num_key_value_heads",
-                               self.config.num_attention_heads)
+        num_kv_heads = getattr(
+            self.config, "num_key_value_heads", self.config.num_attention_heads
+        )
         split_params_mapping = [
             (".gate_up_proj", ".gate_and_up_proj", 2, [(1, 1), (0, 1)], None),
             (
                 ".qkv_proj",
                 ".qkv_proj",
                 num_attention_heads + num_kv_heads * 2,
-                [("q", num_attention_heads), ("k", num_kv_heads),
-                 ("v", num_kv_heads)],
+                [("q", num_attention_heads), ("k", num_kv_heads), ("v", num_kv_heads)],
                 self._split_qkv_weight,
             ),
         ]
@@ -744,8 +763,7 @@ class HunYuanModel(nn.Module):
                 name = name.replace("gate_proj_bias", "gate_proj.bias")
             if "up_proj_bias" in name:
                 name = name.replace("up_proj_bias", "up_proj.bias")
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
@@ -755,11 +773,11 @@ class HunYuanModel(nn.Module):
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
                 continue
             if self.quant_config is not None and (
-                    scale_name := self.quant_config.get_cache_scale(name)):
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache scales for compressed-tensors quantization
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 loaded_weight = loaded_weight[0]
                 weight_loader(param, loaded_weight)
                 continue
@@ -795,11 +813,11 @@ class HunYuanModel(nn.Module):
                 continue
 
             for (
-                    param_name,
-                    weight_name,
-                    den,
-                    split_param,
-                    func,
+                param_name,
+                weight_name,
+                den,
+                split_param,
+                func,
             ) in split_params_mapping:
                 if weight_name not in name:
                     continue
@@ -820,12 +838,11 @@ class HunYuanModel(nn.Module):
                 for shard_id, num in split_param:
                     new_offset = offset + num * units
                     if func:
-                        weight_loader(param,
-                                      func(loaded_weight)[offset:new_offset],
-                                      shard_id)
+                        weight_loader(
+                            param, func(loaded_weight)[offset:new_offset], shard_id
+                        )
                     else:
-                        weight_loader(param, loaded_weight[offset:new_offset],
-                                      shard_id)
+                        weight_loader(param, loaded_weight[offset:new_offset], shard_id)
                     offset = new_offset
 
                 break
@@ -851,8 +868,9 @@ class HunYuanModel(nn.Module):
                     # We should ask the weight loader to return success or not
                     # here since otherwise we may skip experts with other
                     # available replicas.
-                    weight_loader = typing.cast(Callable[..., bool],
-                                                param.weight_loader)
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
                     success = weight_loader(
                         param,
                         loaded_weight,
@@ -882,14 +900,15 @@ class HunYuanModel(nn.Module):
                         name = name.replace("wg.", "")
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
+class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -912,12 +931,9 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
 
         self.model = HunYuanModel(vllm_config=vllm_config, prefix="model")
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -925,16 +941,64 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
                 self.lm_head.weight = self.model.embed_tokens.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return model_output
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def make_empty_intermediate_tensors(
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+                "residual": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+
+class HunYuanMoEV1Base(HunyuanV1ModelBase, MixtureOfExperts):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
         # Set MoE hyperparameters
         self.expert_weights = []
         self.num_expert_groups = 1
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
         example_layer = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -955,22 +1019,6 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
         self.num_routed_experts = example_layer.n_routed_experts
         self.num_redundant_experts = example_layer.n_redundant_experts
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            self.expert_weights.append(layer.get_expert_weights())
-            # Register the expert weights.
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
@@ -979,8 +1027,7 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
         assert self.num_local_physical_experts == num_local_physical_experts
         self.num_physical_experts = num_physical_experts
         self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = (num_physical_experts -
-                                      self.num_logical_experts)
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
         for layer in self.model.layers:
             if isinstance(layer.mlp, HunYuanSparseMoeBlock):
                 moe = layer.mlp
@@ -989,54 +1036,18 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
                 moe.n_redundant_experts = self.num_redundant_experts
                 moe.experts.update_expert_map()
 
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
-        return model_output
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
-    def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-            "residual":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
-        )
-        return loader.load_weights(weights)
-
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()
 
 
-class HunYuanDenseV1ForCausalLM(HunYuanV1Base):
+class HunYuanDenseV1Base(HunyuanV1ModelBase):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+
+class HunYuanDenseV1ForCausalLM(HunYuanDenseV1Base):
     pass
 
 
-class HunYuanMoEV1ForCausalLM(HunYuanV1Base):
+class HunYuanMoEV1ForCausalLM(HunYuanMoEV1Base):
     pass
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 4d39ff9ae79ee437c5c275c9a69b4d783d3e7a55..db46353efde5c609bd10c1533042a8e86d6c1112 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -2,51 +2,52 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # copied from : https://github.com/huggingface/transformers
 import ast
-import sys
 from collections import defaultdict
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
-from itertools import chain
-from typing import Any, Literal, Optional, TypedDict, Union
+from itertools import accumulate
+from typing import Annotated, Any, Literal
 
 import numpy as np
-import PIL
-from einops import rearrange
-from PIL import Image
-
-if sys.version_info >= (3, 11):
-    import typing
-    Unpack = typing.Unpack
-else:
-    import typing_extensions
-    Unpack = typing_extensions.Unpack
-
 import torch
 import torch.nn as nn
+from einops import rearrange
 from timm.layers import LayerNorm, LayerNorm2d
 from timm.models.regnet import RegStage
 from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
 from transformers.modeling_utils import no_init_weights
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        InputProcessingContext,
-                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    InputProcessingContext,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    flatten_bn,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from .vision import get_vision_encoder_info
 
 EOT = "<|endofturn|>"
@@ -57,8 +58,8 @@ VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"
 # Based on combine_frames_into_images in
 # https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B/blob/main/processing_hyperclovax.py
 def get_num_combined_frames(
-        num_frames: int,
-        max_grid_shape: tuple[int, int] = (3, 3),
+    num_frames: int,
+    max_grid_shape: tuple[int, int] = (3, 3),
 ) -> int:
     max_num_grids = max_grid_shape[0] * max_grid_shape[1]
 
@@ -69,42 +70,58 @@ def get_num_combined_frames(
     return num_canvases + (leftover_frames > 0)
 
 
-class HCXVisionMultimodalPixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values_images: list[torch.Tensor]
+class HCXVisionImagePixelInputs(TensorSchema):
     """
-    Shape: `[(num_grids, num_channels, height, width), ...]` if anyres
-    
-    Note that `height` or `width` may be different per batch and image,
-    in which case the data is passed as a list instead of a batched tensor.
+    Dimensions:
+        - n: Number of images
+        - g: Number of grids
+        - c: Number of channels (3)
+        - h: Height
+        - w: Width
     """
-    image_sizes_images: list[tuple[Union[int, float]]]
-    """
-    Shape: `[(height, width), ...]`
-    """
-    vision_query_lengths_images: list[Union[int, float]]
-    pixel_values_videos: list[tuple[Union[int, float]]]
+
+    type: Literal["pixel_values"] = "pixel_values"
+    pixel_values_images: Annotated[
+        list[torch.Tensor], TensorShape("n", "g", 3, "h", "w", dynamic_dims={"g"})
+    ]
+    image_sizes_images: Annotated[torch.Tensor, TensorShape("n", 2)]
+
+
+HCXVisionImageInputs = HCXVisionImagePixelInputs
+
+
+class HCXVisionVideoPixelInputs(TensorSchema):
     """
-    Shape: `[(num_grids, num_channels, height, width), ...]` if anyres
+    Dimensions:
+        - n: Number of videos
+        - f: Number of frames
+        - g: Number of grids
+        - c: Number of channels (3)
+        - h: Height
+        - w: Width
     """
-    vision_query_lengths_videos: list[Union[int, float]]
 
+    type: Literal["pixel_values_videos"] = "pixel_values_videos"
+    pixel_values_videos: Annotated[
+        list[list[torch.Tensor]],
+        TensorShape("n", "f", "g", 3, "h", "w", dynamic_dims={"f", "g"}),
+    ]
 
-HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs]
 
+HCXVisionVideoInputs = HCXVisionVideoPixelInputs
 
-class HCXVisionProcessingInfo(BaseProcessingInfo):
 
+class HCXVisionProcessingInfo(BaseProcessingInfo):
     def get_vision_encoder_info(self):
         return get_vision_encoder_info(self.get_hf_config())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_num_image_tokens(
         self,
         *,
-        vision_query_length: Union[int, list[int]],
+        vision_query_length: int | list[int],
     ) -> int:
         if isinstance(vision_query_length, int):
             return vision_query_length
@@ -114,7 +131,7 @@ class HCXVisionProcessingInfo(BaseProcessingInfo):
     def get_num_video_tokens(
         self,
         *,
-        vision_query_length: Union[int, list[int]],
+        vision_query_length: int | list[int],
     ) -> int:
         if isinstance(vision_query_length, int):
             return vision_query_length
@@ -135,48 +152,49 @@ class HCXVisionProcessingInfo(BaseProcessingInfo):
         )
 
 
-class HCXVisionDummyInputsBuilder(
-        BaseDummyInputsBuilder[HCXVisionProcessingInfo]):
-
+class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo]):
     def get_dummy_text(
         self,
         mm_counts: Mapping[str, int],
     ) -> str:
         dummy_text = IMAGE_TOKEN * mm_counts.get(
-            "image", 0) + VIDEO_TOKEN * mm_counts.get("video", 0)
+            "image", 0
+        ) + VIDEO_TOKEN * mm_counts.get("video", 0)
         return dummy_text
 
     def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         target_num_frames = 32
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(
+            "image": self._get_dummy_images(
                 width=target_width,
                 height=target_height,
                 num_images=num_images,
+                overrides=image_overrides,
             ),
-            "video":
-            self._get_dummy_videos(
+            "video": self._get_dummy_videos(
                 width=target_width - 1,
                 height=target_height - 1,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
-            )
+                overrides=video_overrides,
+            ),
         }
 
 
-class HCXVisionMultiModalProcessor(
-        BaseMultiModalProcessor[HCXVisionProcessingInfo]):
-
+class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -184,27 +202,9 @@ class HCXVisionMultiModalProcessor(
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-
-        def replace_multimodal_token(
-            token_ids: torch.Tensor,
-            target_token: int,
-            repeats: list[int],
-        ):
-            output = list[int]()
-            _repeats_idx = 0
-            for token_id in token_ids:
-                if token_id == target_token:
-                    output += [token_id.item()] * repeats[_repeats_idx]
-                    _repeats_idx += 1
-                else:
-                    output += [token_id.item()]
-
-            return torch.tensor(output, device=token_ids.device)
-
         for video_idx, video_arr in enumerate(mm_data.get("videos", [])):
-            if video_arr.dtype == np.uint8:
-                continue
-            mm_data["videos"][video_idx] = video_arr.astype(np.uint8)
+            if video_arr.dtype != np.uint8:
+                mm_data["videos"][video_idx] = video_arr.astype(np.uint8)
 
         processed_outputs = self.info.ctx.call_hf_processor(
             hf_processor=self.info.get_hf_processor(**mm_kwargs),
@@ -216,20 +216,16 @@ class HCXVisionMultiModalProcessor(
         )  # text-only
 
         if len(mm_data) > 0:
-            # batchify input as a single item
-            images = mm_data.get("images", None)
-            batched_images = None if images is None else [images]
-
-            # list of video in single conversation
-            videos = mm_data.get("videos", None)
-            batched_videos = None if videos is None else [videos]
+            images = mm_data.get("images")
+            videos = mm_data.get("videos")
 
+            # batchify input as a single item
             _processed_outputs = self.info.ctx.call_hf_processor(
                 hf_processor=self.info.get_hf_processor(**mm_kwargs),
                 data=dict(
                     text=None,
-                    images=batched_images,
-                    videos=batched_videos,
+                    images=None if images is None else [images],
+                    videos=None if videos is None else [videos],
                 ),
             )  # mm-only
 
@@ -239,51 +235,48 @@ class HCXVisionMultiModalProcessor(
                     _processed_outputs[k] = v[0]
 
             if images:
-                tokenizer = self.info.get_tokenizer()
-                image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
-                processed_outputs["input_ids"] = torch.stack([
-                    replace_multimodal_token(
-                        token_ids=_input_ids,
-                        target_token=image_token_id,
-                        repeats=_processed_outputs[
-                            "vision_query_lengths_images"],
-                    ) for _input_ids in processed_outputs["input_ids"]
-                ],
-                                                             dim=0)
+                _processed_outputs["image_sizes_images"] = torch.tensor(
+                    _processed_outputs["image_sizes_images"]
+                )
+                _processed_outputs["vision_query_lengths_images"] = torch.tensor(
+                    _processed_outputs["vision_query_lengths_images"]
+                )
 
             if videos:
-                _num_per_videos = [
-                    get_num_combined_frames(len(video)) for video in videos
+                _idx_per_video = [
+                    0,
+                    *accumulate(
+                        get_num_combined_frames(len(video)) for video in videos
+                    ),
                 ]
                 _processed_outputs["pixel_values_videos"] = [
-                    _processed_outputs["pixel_values_videos"]
-                    [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])]
-                    for _i in range(len(videos))
+                    _processed_outputs["pixel_values_videos"][
+                        _idx_per_video[i] : _idx_per_video[i + 1]
+                    ]
+                    for i in range(len(videos))
                 ]
                 _processed_outputs["vision_query_lengths_videos"] = [
-                    _processed_outputs["vision_query_lengths_videos"]
-                    [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])]
-                    for _i in range(len(videos))
+                    torch.tensor(
+                        _processed_outputs["vision_query_lengths_videos"][
+                            _idx_per_video[i] : _idx_per_video[i + 1]
+                        ]
+                    )
+                    for i in range(len(videos))
                 ]
 
-                tokenizer = self.info.get_tokenizer()
-                video_token_id = tokenizer.convert_tokens_to_ids(VIDEO_TOKEN)
-                processed_outputs["input_ids"] = torch.stack([
-                    replace_multimodal_token(
-                        token_ids=_input_ids,
-                        target_token=video_token_id,
-                        repeats=[
-                            sum(lens) for lens in
-                            _processed_outputs["vision_query_lengths_videos"]
-                        ],
-                    ) for _input_ids in processed_outputs["input_ids"]
-                ],
-                                                             dim=0)
-
             processed_outputs.update(_processed_outputs)
 
         return processed_outputs
 
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        return False
+
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -304,13 +297,11 @@ class HCXVisionMultiModalProcessor(
             out_item = out_mm_kwargs[modality][item_idx]
 
             if modality == "image":
-                lens = out_item["vision_query_lengths_images"].data
-                num_tokens = self.info.get_num_image_tokens(
-                    vision_query_length=lens)
+                lens = out_item["vision_query_lengths_images"].data.tolist()
+                num_tokens = self.info.get_num_image_tokens(vision_query_length=lens)
             elif modality == "video":
-                lens = out_item["vision_query_lengths_videos"].data
-                num_tokens = self.info.get_num_video_tokens(
-                    vision_query_length=lens)
+                lens = out_item["vision_query_lengths_videos"].data.tolist()
+                num_tokens = self.info.get_num_video_tokens(vision_query_length=lens)
             else:
                 raise NotImplementedError(modality)
 
@@ -327,7 +318,8 @@ class HCXVisionMultiModalProcessor(
                     modality=modality,
                     out_mm_kwargs=out_mm_kwargs,
                 ),
-            ) for modality in ("image", "video")
+            )
+            for modality in ("image", "video")
         ]
 
     def _get_mm_fields_config(
@@ -336,31 +328,17 @@ class HCXVisionMultiModalProcessor(
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         return dict(
-            # image
             pixel_values_images=MultiModalFieldConfig.batched("image"),
             image_sizes_images=MultiModalFieldConfig.batched("image"),
             vision_query_lengths_images=MultiModalFieldConfig.batched("image"),
-            num_queries_vis_abstractors_images=MultiModalFieldConfig.batched(
-                "image"),
-            num_queries_vis_abstractors_slow_images=MultiModalFieldConfig.
-            batched("image"),
-            first_last_frames_slows_images=MultiModalFieldConfig.batched(
-                "image"),
-            # video
             pixel_values_videos=MultiModalFieldConfig.batched("video"),
-            image_sizes_videos=MultiModalFieldConfig.batched("video"),
             vision_query_lengths_videos=MultiModalFieldConfig.batched("video"),
-            num_queries_vis_abstractors_videos=MultiModalFieldConfig.batched(
-                "video"),
-            num_queries_vis_abstractors_slow_videos=MultiModalFieldConfig.
-            batched("video"),
-            first_last_frames_slows_videos=MultiModalFieldConfig.batched(
-                "video"),
         )
 
 
 def _build_hcxvision_hf_info(
-    ctx: InputProcessingContext, ) -> HCXVisionProcessingInfo:
+    ctx: InputProcessingContext,
+) -> HCXVisionProcessingInfo:
     return HCXVisionProcessingInfo(ctx)
 
 
@@ -368,7 +346,7 @@ def _build_hcxvision_hf_processor(
     info: HCXVisionProcessingInfo,
     dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     if isinstance(info, HCXVisionProcessingInfo):
         return HCXVisionMultiModalProcessor(
@@ -382,12 +360,12 @@ def _build_hcxvision_hf_processor(
 
 def init_vision_tower_for_hcxvision(
     vision_config,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    use_nth_layer: Optional[int] = None,
-    require_post_norm: Optional[bool] = None,
+    use_nth_layer: int | None = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
-) -> Union[CLIPVisionModel, SiglipVisionModel]:
+) -> CLIPVisionModel | SiglipVisionModel:
     num_hidden_layers = vision_config.num_hidden_layers
     if not isinstance(use_nth_layer, int):
         pass
@@ -418,7 +396,6 @@ def init_vision_tower_for_hcxvision(
 
 
 class HCXVisionMlp(nn.Module):
-
     def __init__(
         self,
         mm_projector_type,
@@ -440,8 +417,9 @@ class HCXVisionMlp(nn.Module):
             self.act = act_layer()
             self.fc2 = nn.Linear(2 * hidden_features, out_features)
         else:
-            raise NotImplementedError("{} is not implemented".format(
-                self.mm_projector_type))
+            raise NotImplementedError(
+                "{} is not implemented".format(self.mm_projector_type)
+            )
 
     def forward(self, x):
         x = self.fc1(x)
@@ -453,7 +431,7 @@ class HCXVisionMlp(nn.Module):
 class HCXVisionCAbstractor(nn.Module):
     """
     This module is based on C-Abstractor, whose license is under apache-2.0.
-    You can check the original code at 
+    You can check the original code at
     https://github.com/khanrc/honeybee/blob/main/honeybee/projectors/projectors.py
     and we made necessary modifications.
     """
@@ -475,7 +453,8 @@ class HCXVisionCAbstractor(nn.Module):
         # Positional embedding
         if pos_emb:
             self.pos_emb = torch.nn.Parameter(
-                torch.zeros(1, num_input_tokens, encoder_hidden_size))
+                torch.zeros(1, num_input_tokens, encoder_hidden_size)
+            )
             self.pos_emb.data.normal_(mean=0.0, std=0.02)
         else:
             self.pos_emb = None
@@ -486,15 +465,16 @@ class HCXVisionCAbstractor(nn.Module):
         else:
             self.prenorm = None
 
-        self.build_net(num_queries, encoder_hidden_size, hidden_size,
-                       output_hidden_size)
+        self.build_net(
+            num_queries, encoder_hidden_size, hidden_size, output_hidden_size
+        )
         self.dtype = next(self.parameters()).dtype
 
     def forward(
         self,
         x: torch.Tensor,
-        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
-        num_grids: Optional[list[int]] = None,
+        num_queries_vis_abstractors: list[list[int]] | None = None,
+        num_grids: list[int] | None = None,
     ) -> torch.Tensor:
         if self.prenorm is not None:
             x = self.prenorm(x)
@@ -513,8 +493,8 @@ class HCXVisionCAbstractor(nn.Module):
     def _forward(
         self,
         x: torch.Tensor,
-        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
-        num_grids: Optional[list[int]] = None,
+        num_queries_vis_abstractors: list[list[int]] | None = None,
+        num_grids: list[int] | None = None,
     ) -> torch.Tensor:
         # x: [B, L, dim]
         B, L, dim = x.shape
@@ -524,7 +504,8 @@ class HCXVisionCAbstractor(nn.Module):
         if num_queries_vis_abstractors is not None:
             assert num_grids is not None
             return self._forward_adaptive_num_query(
-                x, num_queries_vis_abstractors, num_grids)
+                x, num_queries_vis_abstractors, num_grids
+            )
 
         x = self.net(x)
         x = rearrange(x, "b d h w -> b (h w) d")
@@ -534,8 +515,8 @@ class HCXVisionCAbstractor(nn.Module):
     def _forward_adaptive_num_query(
         self,
         x: torch.Tensor,
-        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
-        num_grids: Optional[list[int]] = None,
+        num_queries_vis_abstractors: list[list[int]] | None = None,
+        num_grids: list[int] | None = None,
     ) -> list[torch.Tensor]:
         # self.net is consisted by 3 layers (s1, sampler, s2)
         assert len(self.net) == 3
@@ -545,7 +526,7 @@ class HCXVisionCAbstractor(nn.Module):
         for i, num_queries in enumerate(num_queries_vis_abstractors):
             hw = int(num_queries**0.5)
             sampler = nn.AdaptiveAvgPool2d((hw, hw))
-            out = sampler(x[num_grids[i]:num_grids[i + 1], :])
+            out = sampler(x[num_grids[i] : num_grids[i + 1], :])
             out = self.net[2](out)  # s2
 
             out = rearrange(out, "b d h w -> b (h w) d")
@@ -563,8 +544,9 @@ class HCXVisionCAbstractor(nn.Module):
         depth: int = 3,
         mlp_depth: int = 2,
     ):
-        assert (n_queries**0.5).is_integer(
-        ), f"n_queries must be square number. n_queries: {n_queries}"
+        assert (n_queries**0.5).is_integer(), (
+            f"n_queries must be square number. n_queries: {n_queries}"
+        )
         hw = int(n_queries**0.5)
 
         # RegBlock = ResBlock + SE
@@ -589,8 +571,7 @@ class HCXVisionCAbstractor(nn.Module):
         )
 
         self.net = nn.Sequential(s1, sampler, s2)
-        self.readout = self.build_mlp(mlp_depth, hidden_size,
-                                      output_hidden_size)
+        self.readout = self.build_mlp(mlp_depth, hidden_size, output_hidden_size)
 
     def build_mlp(
         self,
@@ -608,12 +589,14 @@ class HCXVisionCAbstractor(nn.Module):
 @MULTIMODAL_REGISTRY.register_processor(
     _build_hcxvision_hf_processor,
     info=_build_hcxvision_hf_info,
-    dummy_inputs=HCXVisionDummyInputsBuilder)
+    dummy_inputs=HCXVisionDummyInputsBuilder,
+)
 class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     def __init__(
@@ -621,7 +604,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         *,
         vllm_config: VllmConfig,
         prefix: str = "",
-        **kwargs: Optional[Any],
+        **kwargs: Any | None,
     ) -> None:
         super().__init__()
 
@@ -643,7 +626,8 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         ## possible_resolution should be matched with preprocessor_config.json
         config.possible_resolutions = self._init_possible_resolutions(
-            config, vision_config)
+            config, vision_config
+        )
 
         # init models & parameters
         with no_init_weights():  # weight will be loaded in from_pretrained
@@ -654,11 +638,11 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 require_post_norm=False,
                 prefix=maybe_prefix(prefix, "vision_model"),
             )
-        self.mm_projector = self._init_mm_projector(config, text_config,
-                                                    vision_config)
+        self.mm_projector = self._init_mm_projector(config, text_config, vision_config)
 
-        self.lm_head_vocab_size = getattr(text_config, "padded_vocab_size",
-                                          text_config.vocab_size)
+        self.lm_head_vocab_size = getattr(
+            text_config, "padded_vocab_size", text_config.vocab_size
+        )
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=text_config,
@@ -667,7 +651,8 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         if config.anyres:
             self.image_newline = nn.Parameter(
-                torch.empty(text_config.hidden_size, dtype=self.dtype))
+                torch.empty(text_config.hidden_size, dtype=self.dtype)
+            )
 
         self.config = config
         self.vision_config = vision_config
@@ -677,7 +662,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         # self.reduction = self._init_reduction_type(use_sum_loss)
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return IMAGE_TOKEN
         if modality.startswith("video"):
@@ -685,176 +670,165 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         raise ValueError("Only image or video modality is supported")
 
-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
+    def _parse_and_validate_image_input(
+        self,
+        **kwargs: object,
+    ) -> HCXVisionImageInputs | None:
+        pixel_values_images = kwargs.pop("pixel_values_images", None)
 
-    def get_multimodal_embeddings(
+        if pixel_values_images is None:
+            return None
+
+        image_sizes_images = kwargs.pop("image_sizes_images")
+
+        return HCXVisionImagePixelInputs(
+            pixel_values_images=pixel_values_images,
+            image_sizes_images=image_sizes_images,
+        )
+
+    def _parse_and_validate_video_input(
         self,
-        **kwargs: Unpack[HCXVisionMultimodalInputs],
-    ) -> Optional[MultiModalEmbeddings]:
-
-        multimodal_embeddings = list()
-        if kwargs.get("pixel_values_images") is not None:
-            for _pixel_values_images, _image_sizes_images in zip(
-                    kwargs["pixel_values_images"],
-                    kwargs["image_sizes_images"]):
-                _pixel_values_images = _pixel_values_images.unsqueeze(dim=0)
-                _image_sizes_images = _image_sizes_images.unsqueeze(dim=0)
-                _len_pixel_values_images = [
-                    len(pixel_value) for pixel_value in _pixel_values_images
-                ]
-                if isinstance(_image_sizes_images, torch.Tensor):
-                    _image_sizes_images = _image_sizes_images.detach().cpu(
-                    ).tolist()
-                _multimodal_embeddings_images = self.forward_images(
-                    pixel_values_images=_pixel_values_images,
-                    image_sizes_images=_image_sizes_images,
-                    len_pixel_values_images=_len_pixel_values_images,
-                )
-                _multimodal_embeddings_images = torch.cat(
-                    _multimodal_embeddings_images, dim=0)
-                multimodal_embeddings.append(_multimodal_embeddings_images)
-
-        if kwargs.get("pixel_values_videos") is not None:
-            for _pixel_values_videos, _vision_query_lengths_videos in zip(
-                    kwargs["pixel_values_videos"],
-                    kwargs["vision_query_lengths_videos"]):
-                _len_pixel_values_videos = [
-                    len(_vision_query_lengths)
-                    for _vision_query_lengths in _vision_query_lengths_videos
-                ]
-                _c, _w, _h = _pixel_values_videos.shape[-3:]
-                _pixel_values_videos = _pixel_values_videos.reshape(
-                    sum(_len_pixel_values_videos), -1, _c, _w,
-                    _h).unsqueeze(dim=0)
-                _multimodal_embeddings_videos = self.forward_videos(
-                    pixel_values_videos=_pixel_values_videos,
-                    len_pixel_values_videos=_len_pixel_values_videos,
-                )
-                _multimodal_embeddings_videos = torch.cat(
-                    _multimodal_embeddings_videos, dim=0)
-                multimodal_embeddings.append(_multimodal_embeddings_videos)
-        return multimodal_embeddings
+        **kwargs: object,
+    ) -> HCXVisionVideoInputs | None:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+
+        if pixel_values_videos is None:
+            return None
 
-    def get_input_embeddings(
+        return HCXVisionVideoPixelInputs(
+            pixel_values_videos=pixel_values_videos,
+        )
+
+    def _process_image_input(
         self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                placeholder_token_id=[
-                    self.config.image_token_id,
-                    self.config.video_token_id,
-                ],
-            )
+        image_input: HCXVisionImageInputs,
+    ) -> tuple[torch.Tensor, ...]:
+        return self.forward_images(
+            pixel_values_images=image_input["pixel_values_images"],
+            image_sizes_images=image_input["image_sizes_images"],
+        )
 
-        return inputs_embeds
+    def _process_video_input(
+        self,
+        video_input: HCXVisionVideoInputs,
+    ) -> tuple[torch.Tensor, ...]:
+        return self.forward_videos(
+            pixel_values_videos=video_input["pixel_values_videos"],
+        )
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key == "pixel_values_images" and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if input_key == "pixel_values_videos" and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
+
+        return modalities
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(
+        self,
+        **kwargs: object,
+    ) -> MultiModalEmbeddings:
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
+            return []
+
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_video_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
+
+        return multimodal_embeddings
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
-            input_ids = None
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
         return hidden_states
 
     def forward_images(
         self,
-        pixel_values_images: list[list[torch.FloatTensor]],
-        image_sizes_images: list[list[tuple[int, int]]],
-        len_pixel_values_images: list[int],
-    ) -> list[list[torch.Tensor]]:
-        if sum(len_pixel_values_images) == 0:
-            return None
-
-        concat_pixel_values_images = torch.cat(list(
-            chain(*pixel_values_images)),
-                                               dim=0)
+        pixel_values_images: list[torch.Tensor],
+        image_sizes_images: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        pixel_values_image_flat = flatten_bn(pixel_values_images, concat=True)
 
         visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1
-        image_forward_outs = self.vision_model(
-            concat_pixel_values_images)[:, visual_token_idx:]
+        image_forward_outs = self.vision_model(pixel_values_image_flat)[
+            :, visual_token_idx:
+        ]
 
-        image_forward_outs = image_forward_outs.to(
-            dtype=self.mm_projector.dtype)
+        image_forward_outs = image_forward_outs.to(dtype=self.mm_projector.dtype)
         image_forward_outs = self.mm_projector(image_forward_outs)  # b (h w) d
 
-        split_sizes = [
-            pixel_value.shape[0] for pixel_value in chain(*pixel_values_images)
-        ]
-        image_forward_outs = torch.split(image_forward_outs,
-                                         split_sizes,
-                                         dim=0)
+        split_sizes = [len(item) for item in pixel_values_images]
+        image_forward_outs = torch.split(image_forward_outs, split_sizes, dim=0)
 
         # newline for anyres postprocessing
         image_features = anyres_postprocessing(
             image_forward_outs=image_forward_outs,
-            image_sizes=[
-                image_size for image_sizes in image_sizes_images
-                for image_size in image_sizes
-            ],
-            num_queries_vis_abstractor=self.config.
-            num_queries_vis_abstractor_image,
+            image_sizes=image_sizes_images.tolist(),
+            num_queries_vis_abstractor=self.config.num_queries_vis_abstractor_image,
             unpad=self.config.unpad,
             patch_size=self.vision_config.patch_size,
             grid_size=self.vision_config.image_size,
             image_newline=self.image_newline,
             possible_resolutions=self.config.possible_resolutions,
         )
-        return image_features
+
+        return tuple(image_features)
 
     def forward_videos(
         self,
-        pixel_values_videos: list[list[torch.FloatTensor]],
-        len_pixel_values_videos: list[int],
-    ) -> list[torch.Tensor]:
-
-        len_video_grids = sum(len_pixel_values_videos)
-        if len_video_grids == 0:
-            return None
-
-        # Run Vision Model
-        concat_pixel_values_videos = torch.cat(list(
-            chain(*pixel_values_videos)),
-                                               dim=0)
+        pixel_values_videos: list[list[torch.Tensor]],
+    ) -> tuple[torch.Tensor, ...]:
+        pixel_values_videos_flat = flatten_bn(
+            [frame for frames in pixel_values_videos for frame in frames],
+            concat=True,
+        )
 
         visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1
-        video_forward_outs = self.vision_model(
-            concat_pixel_values_videos)[:, visual_token_idx:]
+        video_forward_outs = self.vision_model(pixel_values_videos_flat)[
+            :, visual_token_idx:
+        ]
 
-        video_forward_outs = video_forward_outs.to(
-            dtype=self.mm_projector.dtype)
+        video_forward_outs = video_forward_outs.to(dtype=self.mm_projector.dtype)
 
         # Run MM-Projector
         # len(num_grids) == len(num_queries_vis_abstractors) + 1
         grid_idx = 0
-        num_grids = [
-            grid_idx
-        ]  # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
-        num_queries_vis_abstractors = [
-        ]  # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
+        # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
+        num_grids = [grid_idx]
+        # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
+        num_queries_vis_abstractors = []
         len_total_frames = video_forward_outs.shape[0]
 
         if self.config.first_last_frames_slow:
@@ -862,22 +836,26 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             assert len_total_frames != 0
             if len_total_frames <= 2:
                 num_queries_vis_abstractors.append(
-                    self.config.num_queries_vis_abstractor_video_slow)
+                    self.config.num_queries_vis_abstractor_video_slow
+                )
                 grid_idx += len_total_frames
                 num_grids.append(grid_idx)
             else:
                 num_queries_vis_abstractors.append(
-                    self.config.num_queries_vis_abstractor_video_slow)
+                    self.config.num_queries_vis_abstractor_video_slow
+                )
                 grid_idx += 1
                 num_grids.append(grid_idx)
 
                 num_queries_vis_abstractors.append(
-                    self.config.num_queries_vis_abstractor_video_fast)
+                    self.config.num_queries_vis_abstractor_video_fast
+                )
                 grid_idx += len_total_frames - 2
                 num_grids.append(grid_idx)
 
                 num_queries_vis_abstractors.append(
-                    self.config.num_queries_vis_abstractor_video_slow)
+                    self.config.num_queries_vis_abstractor_video_slow
+                )
                 grid_idx += 1
                 num_grids.append(grid_idx)
         else:
@@ -886,17 +864,19 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 for pixel_values_frame in pixel_values_frames:
                     if len(pixel_values_frame) > 0:
                         num_queries_vis_abstractors.append(
-                            self.config.num_queries_vis_abstractor_video_slow)
+                            self.config.num_queries_vis_abstractor_video_slow
+                        )
                         grid_idx += 1
                         num_grids.append(grid_idx)
                         num_queries_vis_abstractors.append(
-                            self.config.num_queries_vis_abstractor_video_fast)
+                            self.config.num_queries_vis_abstractor_video_fast
+                        )
                         grid_idx = grid_idx + len(pixel_values_frame) - 1
                         num_grids.append(grid_idx)
 
-        video_forward_outs = self.mm_projector(video_forward_outs,
-                                               num_queries_vis_abstractors,
-                                               num_grids)
+        video_forward_outs = self.mm_projector(
+            video_forward_outs, num_queries_vis_abstractors, num_grids
+        )
 
         video_features = []  # what we want to return
         target_features = []
@@ -918,14 +898,19 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 target_group_size = 0
 
             elif video_group_size < target_group_size:
-                raise RuntimeError(
-                    f"{video_group_size=} < {target_group_size=}")
+                raise RuntimeError(f"{video_group_size=} < {target_group_size=}")
 
-        assert len(target_features
-                   ) == 0, f"target_features is not empty!! {target_features}"
+        assert len(target_features) == 0, (
+            f"target_features is not empty!! {target_features}"
+        )
         assert len(video_groups) == len(video_features)
 
-        return video_features
+        feats_per_video = [len(video) for video in pixel_values_videos]
+        idxs_per_video = [0, *accumulate(feats_per_video)]
+        return tuple(
+            torch.cat(video_features[idxs_per_video[i] : idxs_per_video[i + 1]])
+            for i in range(len(feats_per_video))
+        )
 
     def _prepare_multimodal_kwargs(self, **kwargs: object):
         output = defaultdict(list)
@@ -934,7 +919,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 continue  # if empty batch of empty sample
 
             new_k, is_video = k, False
-            if (not k.endswith("_images") and not k.endswith("_videos")):
+            if not k.endswith("_images") and not k.endswith("_videos"):
                 pass
             else:
                 new_k, is_video = k.split("_")[:-1], k.split("_")[-1]
@@ -961,7 +946,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(
@@ -987,10 +972,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                         if i * j <= config.max_num_grids:
                             possible_resolutions.append([i, j])
 
-                possible_resolutions = [[
-                    ys * vision_config.image_size,
-                    xs * vision_config.image_size
-                ] for ys, xs in possible_resolutions]
+                possible_resolutions = [
+                    [ys * vision_config.image_size, xs * vision_config.image_size]
+                    for ys, xs in possible_resolutions
+                ]
             return possible_resolutions
         else:
             return config.possible_resolutions
@@ -1003,14 +988,13 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     ):
         input_hidden_size = vision_config.hidden_size
         if config.mm_projector_type == "linear":
-            mm_projector = nn.Linear(input_hidden_size,
-                                     text_config.hidden_size)
+            mm_projector = nn.Linear(input_hidden_size, text_config.hidden_size)
             mm_projector.dtype = next(mm_projector.parameters()).dtype
         elif config.mm_projector_type == "cabstractor":
             mm_projector = HCXVisionCAbstractor(
                 num_queries=config.num_queries_vis_abstractor_image,
-                num_input_tokens=(vision_config.image_size //
-                                  vision_config.patch_size)**2,
+                num_input_tokens=(vision_config.image_size // vision_config.patch_size)
+                ** 2,
                 encoder_hidden_size=input_hidden_size,
                 hidden_size=input_hidden_size,
                 output_hidden_size=text_config.hidden_size,
@@ -1027,8 +1011,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         return mm_projector
 
 
-def unpad_image(tensor: torch.Tensor,
-                original_size: tuple[int, int]) -> torch.Tensor:
+def unpad_image(tensor: torch.Tensor, original_size: tuple[int, int]) -> torch.Tensor:
     original_width, original_height = original_size
     current_height, current_width = tensor.shape[1:]
 
@@ -1039,18 +1022,17 @@ def unpad_image(tensor: torch.Tensor,
         scale_factor = current_width / original_width
         new_height = int(original_height * scale_factor)
         padding = (current_height - new_height) // 2
-        unpadded_tensor = tensor[:, padding:current_height - padding, :]
+        unpadded_tensor = tensor[:, padding : current_height - padding, :]
     else:
         scale_factor = current_height / original_height
         new_width = int(original_width * scale_factor)
         padding = (current_width - new_width) // 2
-        unpadded_tensor = tensor[:, :, padding:current_width - padding]
+        unpadded_tensor = tensor[:, :, padding : current_width - padding]
 
     return unpadded_tensor
 
 
-def select_best_resolution(original_size: tuple,
-                           possible_resolutions: list) -> tuple:
+def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
     original_height, original_width = original_size
     best_fit = None
     max_effective_resolution = 0
@@ -1058,15 +1040,19 @@ def select_best_resolution(original_size: tuple,
 
     for height, width in possible_resolutions:
         scale = min(width / original_width, height / original_height)
-        downscaled_width, downscaled_height = int(original_width * scale), int(
-            original_height * scale)
-        effective_resolution = min(downscaled_width * downscaled_height,
-                                   original_width * original_height)
+        downscaled_width, downscaled_height = (
+            int(original_width * scale),
+            int(original_height * scale),
+        )
+        effective_resolution = min(
+            downscaled_width * downscaled_height, original_width * original_height
+        )
         wasted_resolution = (width * height) - effective_resolution
 
         if effective_resolution > max_effective_resolution or (
-                effective_resolution == max_effective_resolution
-                and wasted_resolution < min_wasted_resolution):
+            effective_resolution == max_effective_resolution
+            and wasted_resolution < min_wasted_resolution
+        ):
             max_effective_resolution = effective_resolution
             min_wasted_resolution = wasted_resolution
             best_fit = (height, width)
@@ -1076,15 +1062,19 @@ def select_best_resolution(original_size: tuple,
 
 def get_anyres_image_grid_shape(
     image_size: tuple[int, int],
-    grid_pinpoints: Union[str, list[tuple[int, int]]],
+    grid_pinpoints: str | list[tuple[int, int]],
     patch_size: int,
 ) -> tuple[int, int]:
-    possible_resolutions = grid_pinpoints if isinstance(
-        grid_pinpoints, list) else ast.literal_eval(grid_pinpoints)
+    possible_resolutions = (
+        grid_pinpoints
+        if isinstance(grid_pinpoints, list)
+        else ast.literal_eval(grid_pinpoints)
+    )
 
     original_width, original_height = image_size
-    height, width = select_best_resolution((original_height, original_width),
-                                           possible_resolutions)
+    height, width = select_best_resolution(
+        (original_height, original_width), possible_resolutions
+    )
     return width // patch_size, height // patch_size
 
 
@@ -1102,12 +1092,15 @@ def reshape_and_unpad_image_features(
     image_feature = image_feature[1:]
 
     assert height * width == base_image_feature.shape[0], (
-        f"{height=} * {width=} != {base_image_feature.shape[0]=}")
+        f"{height=} * {width=} != {base_image_feature.shape[0]=}"
+    )
 
     num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-        image_size, possible_resolutions, grid_size)
-    image_feature = image_feature.view(num_patch_height, num_patch_width,
-                                       height, width, -1)
+        image_size, possible_resolutions, grid_size
+    )
+    image_feature = image_feature.view(
+        num_patch_height, num_patch_width, height, width, -1
+    )
 
     if unpad:
         image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
@@ -1116,8 +1109,9 @@ def reshape_and_unpad_image_features(
         image_feature = torch.cat(
             (
                 image_feature,
-                image_newline[:, None, None].expand(
-                    *image_feature.shape[:-1], 1).to(image_feature.device),
+                image_newline[:, None, None]
+                .expand(*image_feature.shape[:-1], 1)
+                .to(image_feature.device),
             ),
             dim=-1,
         )
@@ -1131,20 +1125,21 @@ def reshape_and_unpad_image_features(
 
 
 def anyres_postprocessing(
-    image_forward_outs: list[torch.FloatTensor],
+    image_forward_outs: list[torch.Tensor],
     image_sizes: list[list[int]],
     possible_resolutions: list[tuple[int, int]],
     patch_size: int,
     grid_size: int,
-    image_newline: torch.FloatTensor,
+    image_newline: torch.Tensor,
     num_queries_vis_abstractor: int = -1,
     unpad: bool = False,
-) -> list[torch.FloatTensor]:
+) -> list[torch.Tensor]:
     height = width = grid_size // patch_size
 
     if num_queries_vis_abstractor > 0:
-        assert (num_queries_vis_abstractor**0.5
-                ).is_integer(), "n_queries must be square number"
+        assert (num_queries_vis_abstractor**0.5).is_integer(), (
+            "n_queries must be square number"
+        )
         height = width = int(num_queries_vis_abstractor**0.5)
 
     # post-processing (unpad, add newline)
@@ -1164,29 +1159,8 @@ def anyres_postprocessing(
         else:
             image_feature = image_feature[0]
             image_feature = torch.cat(
-                (image_feature, image_newline[None].to(image_feature.device)),
-                dim=0)
+                (image_feature, image_newline[None].to(image_feature.device)), dim=0
+            )
         new_image_features.append(image_feature)
-    image_features = new_image_features
-    return image_features
-
-
-def resize_image(
-    image: Union[np.ndarray, PIL.Image.Image],
-    max_side: int = 378,
-) -> np.ndarray:
-    image_arr = image
-    if isinstance(image, np.ndarray):
-        image = Image.fromarray(image)
-
-    width, height = image.size
-    cur_max_size = max(width, height)
-    if cur_max_size <= max_side:
-        return image_arr
-
-    scale = max_side / cur_max_size
-    width = int(width * scale)
-    height = int(height * scale)
-    image = image.resize((width, height), Image.LANCZOS)
-    image_arr = np.array(image)
-    return image_arr
+
+    return new_image_features
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 2f0c4240413be9e419bc874477e4dce82928abc2..727c8ec0397cae33f40f394092f7ae90238e1d71 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -19,19 +19,22 @@
 """PyTorch Idefics2 model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
 from transformers.models.idefics2.configuration_idefics2 import (
-    Idefics2Config, Idefics2VisionConfig)
+    Idefics2Config,
+    Idefics2VisionConfig,
+)
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
@@ -67,13 +70,14 @@ class Idefics2VisionEmbeddings(nn.Module):
         self.num_patches_per_side = self.image_size // self.patch_size
         self.num_patches = self.num_patches_per_side**2
         self.num_positions = self.num_patches
-        self.position_embedding = nn.Embedding(self.num_positions,
-                                               self.embed_dim)
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
 
-    def forward(self,
-                pixel_values: torch.FloatTensor,
-                patch_attention_mask: torch.BoolTensor,
-                tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor:
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: torch.IntTensor | None = None,
+    ) -> torch.Tensor:
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(target_dtype))
@@ -82,14 +86,14 @@ class Idefics2VisionEmbeddings(nn.Module):
             max_im_h // self.patch_size,
             max_im_w // self.patch_size,
         )
-        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0,
-                                  1 / self.num_patches_per_side)
-        position_ids = torch.full(size=(batch_size,
-                                        max_nb_patches_h * max_nb_patches_w),
-                                  fill_value=0)
+        boundaries = torch.arange(
+            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
+        )
+        position_ids = torch.full(
+            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
+        )
 
         for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
-
             if tgt_sizes is not None:
                 nb_patches_h = tgt_sizes[batch_idx][0]
                 nb_patches_w = tgt_sizes[batch_idx][1]
@@ -98,14 +102,15 @@ class Idefics2VisionEmbeddings(nn.Module):
                 nb_patches_w = p_attn_mask[0].sum()
             fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
             fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
-            bucket_coords_h = torch.bucketize(fractional_coords_h,
-                                              boundaries,
-                                              right=True)
-            bucket_coords_w = torch.bucketize(fractional_coords_w,
-                                              boundaries,
-                                              right=True)
-            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side +
-                       bucket_coords_w).flatten()
+            bucket_coords_h = torch.bucketize(
+                fractional_coords_h, boundaries, right=True
+            )
+            bucket_coords_w = torch.bucketize(
+                fractional_coords_w, boundaries, right=True
+            )
+            pos_ids = (
+                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
+            ).flatten()
             position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
         position_ids = position_ids.to(self.position_embedding.weight.device)
         embeddings += self.position_embedding(position_ids)
@@ -118,7 +123,7 @@ class Idefics2VisionAttention(nn.Module):
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -130,12 +135,12 @@ class Idefics2VisionAttention(nn.Module):
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"  # noqa: E501
-                f" {self.num_heads}).")
+                f" {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
         self.dropout = config.attention_dropout
 
-        tp_size = (1 if use_data_parallel else
-                   get_tensor_model_parallel_world_size())
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
         assert self.num_heads % tp_size == 0
         self.num_heads_per_partition = self.num_heads // tp_size
 
@@ -156,8 +161,9 @@ class Idefics2VisionAttention(nn.Module):
             disable_tp=use_data_parallel,
         )
         # Use unified MultiHeadAttention with Flash Attention support
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = MultiHeadAttention(
+            self.num_heads_per_partition, self.head_dim, self.scale
+        )
 
     def forward(
         self,
@@ -175,11 +181,10 @@ class Idefics2VisionAttention(nn.Module):
 
 
 class Idefics2VisionMLP(nn.Module):
-
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -211,11 +216,10 @@ class Idefics2VisionMLP(nn.Module):
 
 
 class Idefics2EncoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Idefics2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -225,15 +229,16 @@ class Idefics2EncoderLayer(nn.Module):
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
-            use_data_parallel=use_data_parallel)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
-        self.mlp = Idefics2VisionMLP(config,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.mlp",
-                                     use_data_parallel=use_data_parallel)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
+            use_data_parallel=use_data_parallel,
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Idefics2VisionMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
 
     def forward(
         self,
@@ -269,9 +274,9 @@ class Idefics2Encoder(nn.Module):
     def __init__(
         self,
         config: Idefics2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -284,13 +289,17 @@ class Idefics2Encoder(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layers = nn.ModuleList([
-            Idefics2EncoderLayer(config,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.layers.{layer_idx}",
-                                 use_data_parallel=use_data_parallel)
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Idefics2EncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -313,13 +322,12 @@ class Idefics2Encoder(nn.Module):
 
 
 class Idefics2VisionTransformer(nn.Module):
-
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         require_post_norm: bool = True,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -335,7 +343,8 @@ class Idefics2VisionTransformer(nn.Module):
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
-            use_data_parallel=use_data_parallel)
+            use_data_parallel=use_data_parallel,
+        )
 
         num_hidden_layers = config.num_hidden_layers
         if len(self.encoder.layers) > config.num_hidden_layers:
@@ -345,10 +354,14 @@ class Idefics2VisionTransformer(nn.Module):
             )
 
         self.require_post_norm = require_post_norm
-        self.post_layernorm = nn.LayerNorm(
-            embed_dim,
-            eps=config.layer_norm_eps,
-        ) if require_post_norm else nn.Identity()
+        self.post_layernorm = (
+            nn.LayerNorm(
+                embed_dim,
+                eps=config.layer_norm_eps,
+            )
+            if require_post_norm
+            else nn.Identity()
+        )
 
     def get_input_embeddings(self):
         return self.embeddings
@@ -356,8 +369,8 @@ class Idefics2VisionTransformer(nn.Module):
     def forward(
         self,
         pixel_values,
-        patch_attention_mask: Optional[torch.BoolTensor] = None,
-        tgt_sizes: Optional[torch.IntTensor] = None,
+        patch_attention_mask: torch.BoolTensor | None = None,
+        tgt_sizes: torch.IntTensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
@@ -365,15 +378,13 @@ class Idefics2VisionTransformer(nn.Module):
             tgt_sizes=tgt_sizes,
         )
         if self.use_data_parallel:
-            encoder_outputs = run_dp_sharded_vision_model(
-                hidden_states, self.encoder)
+            encoder_outputs = run_dp_sharded_vision_model(hidden_states, self.encoder)
         else:
             encoder_outputs = self.encoder(hidden_states)
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -390,8 +401,7 @@ class Idefics2VisionTransformer(nn.Module):
                 continue
 
             # post_layernorm is optional
-            if (name.startswith("post_layernorm.")
-                    and not self.require_post_norm):
+            if name.startswith("post_layernorm.") and not self.require_post_norm:
                 continue
 
             # omit layers when num_hidden_layers_override is set
@@ -410,8 +420,7 @@ class Idefics2VisionTransformer(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 79e130119ae83615a86898ea398870d62ee70133..9c5f9389e54bbad486d8f4a2e84e25d698b316ce 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -18,42 +18,49 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 from torch import nn
-from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
-                          Idefics3Processor)
+from transformers import (
+    BatchFeature,
+    Idefics3Config,
+    Idefics3ImageProcessor,
+    Idefics3Processor,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import ImageProcessorItems, ImageSize
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalDataItems, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
-# yapf: enable
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalDataItems,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-# yapf: disable
 from .idefics2_vision_model import (
-    Idefics2VisionTransformer as Idefics3VisionTransformer)
-# yapf: enable
+    Idefics2VisionTransformer as Idefics3VisionTransformer,
+)
 from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
 from .llama import LlamaModel
-from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, maybe_prefix
 
 
 class Idefics3ImagePixelInputs(TensorSchema):
@@ -65,9 +72,10 @@ class Idefics3ImagePixelInputs(TensorSchema):
         - h: Height
         - w: Width
     """
+
     type: Literal["pixel_values"]
     pixel_values: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
-    pixel_attention_mask: torch.Tensor
+    pixel_attention_mask: Annotated[torch.Tensor, TensorShape("bnp", "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
 
 
@@ -78,28 +86,30 @@ class Idefics3ImageEmbeddingInputs(TensorSchema):
         - f: Image feature size
         - h: Hidden size (must match the hidden size of language model backbone)
     """
+
     type: Literal["image_embeds"]
     data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
-ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
+ImageInputs: TypeAlias = Idefics3ImagePixelInputs | Idefics3ImageEmbeddingInputs
 
 
 class Idefics3ProcessingInfo(BaseProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object) -> Idefics3Processor:
         return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
-    def _resize_output_size(self,
-                            *,
-                            height: int,
-                            width: int,
-                            max_len: Optional[int] = None,
-                            min_len: int = 1,
-                            max_size: Optional[int] = None) -> tuple[int, int]:
+    def _resize_output_size(
+        self,
+        *,
+        height: int,
+        width: int,
+        max_len: int | None = None,
+        min_len: int = 1,
+        max_size: int | None = None,
+    ) -> tuple[int, int]:
         # Set default value for max_len if not provided
         max_len = max(height, width) if max_len is None else max_len
         aspect_ratio = width / height
@@ -135,18 +145,19 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
     ) -> tuple[int, int]:
         hf_processor = self.get_hf_processor()
         image_processor: Idefics3ImageProcessor = hf_processor.image_processor
-        max_image_size = image_processor.size['longest_edge']
+        max_image_size = image_processor.size["longest_edge"]
         if resolution_max_side > max_image_size:
             raise ValueError(
-                "`resolution_max_side` cannot be larger than `max_image_size`")
+                "`resolution_max_side` cannot be larger than `max_image_size`"
+            )
 
         height, width = image_height, image_width
 
         # Find the output size, when rescaling the longest edge to max_len and
         # preserving the aspect ratio
-        height, width = self._resize_output_size(height=height,
-                                                 width=width,
-                                                 max_len=resolution_max_side)
+        height, width = self._resize_output_size(
+            height=height, width=width, max_len=resolution_max_side
+        )
         return height, width
 
     def _get_image_feature_grid_size(
@@ -154,19 +165,20 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> tuple[int, int]:
         if processor is None:
             processor = self.get_hf_processor()
 
         image_processor: Idefics3ImageProcessor = processor.image_processor
 
-        max_image_size = image_processor.max_image_size['longest_edge']
-        size = image_processor.size['longest_edge']
+        max_image_size = image_processor.max_image_size["longest_edge"]
+        size = image_processor.size["longest_edge"]
         assert size % max_image_size == 0, (
             "`longest_edge` in image_processor's `size` must be divisible by "
             "`longest_edge` in `max_image_size`, this may be caused by "
-            "incorrect mm_kwargs override.")
+            "incorrect mm_kwargs override."
+        )
 
         resized_height, resized_width = self._get_resize_output_image_size(
             image_width=image_width,
@@ -185,7 +197,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> int:
         grid_w, grid_h = self._get_image_feature_grid_size(
             image_width=image_width,
@@ -196,8 +208,8 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         return grid_w * grid_h + 1
 
     def _get_image_token(
-            self,
-            processor: Optional[Idefics3Processor]) -> tuple[str, str, str]:
+        self, processor: Idefics3Processor | None
+    ) -> tuple[str, str, str]:
         if processor is None:
             processor = self.get_hf_processor()
 
@@ -211,13 +223,14 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> str:
         if processor is None:
             processor = self.get_hf_processor()
 
         image_token, fake_image_token, global_img_token = self._get_image_token(
-            processor)
+            processor
+        )
         image_seq_len = processor.image_seq_len
         grid_placeholder = "<row_{n_h}_col_{n_w}>"
 
@@ -236,26 +249,27 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         tiles_placeholder = list[str]()
         for i in range(grid_h):
             for j in range(grid_w):
-                placeholder_per_tile = tile_img_placeholder.format(n_h=i + 1,
-                                                                   n_w=j + 1)
+                placeholder_per_tile = tile_img_placeholder.format(n_h=i + 1, n_w=j + 1)
                 tiles_placeholder.append(placeholder_per_tile)
                 # Add line break if it is the last tile in the row
                 if j == grid_w - 1:
                     tiles_placeholder.append("\n")
 
-        return "".join([
-            *tiles_placeholder,
-            "\n",
-            global_img_placeholder,
-            fake_image_token,
-        ])
+        return "".join(
+            [
+                *tiles_placeholder,
+                "\n",
+                global_img_placeholder,
+                fake_image_token,
+            ]
+        )
 
     def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -278,9 +292,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         )
 
 
-class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
-                                 ):
-
+class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -293,23 +305,26 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         hf_processor = self.info.get_hf_processor()
         image_processor: Idefics3ImageProcessor = hf_processor.image_processor
-        longest_edge = image_processor.max_image_size['longest_edge']
+        longest_edge = image_processor.max_image_size["longest_edge"]
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=longest_edge,
-                                   height=longest_edge,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=longest_edge,
+                height=longest_edge,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class Idefics3MultiModalProcessor(
-        BaseMultiModalProcessor[Idefics3ProcessingInfo]):
-
+class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -330,9 +345,11 @@ class Idefics3MultiModalProcessor(
             tok_kwargs,
         )
 
-        parsed_images = (self._get_data_parser().parse_mm_data({
-            "image": images
-        }).get_items("image", ImageProcessorItems))
+        parsed_images = (
+            self._get_data_parser()
+            .parse_mm_data({"image": images})
+            .get_items("image", ImageProcessorItems)
+        )
         image_sizes = [
             parsed_images.get_image_size(i) for i in range(len(parsed_images))
         ]
@@ -343,7 +360,8 @@ class Idefics3MultiModalProcessor(
                 image_width=size.width,
                 image_height=size.height,
                 processor=hf_processor,
-            ) for size in image_sizes
+            )
+            for size in image_sizes
         ]
         processed_outputs["num_patches"] = torch.tensor(num_patches)
 
@@ -361,10 +379,10 @@ class Idefics3MultiModalProcessor(
         num_patches = hf_inputs.get("num_patches", torch.empty(0))
 
         return dict(
-            pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_patches),
+            pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches),
             pixel_attention_mask=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_patches),
+                "image", num_patches
+            ),
             image_embeds=MultiModalFieldConfig.batched("image"),
             num_patches=MultiModalFieldConfig.batched("image"),
         )
@@ -404,16 +422,14 @@ class Idefics3MultiModalProcessor(
 
 
 class Idefics3SimpleMLP(nn.Module):
-
     def __init__(
         self,
         config: Idefics3Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
-        input_size = config.vision_config.hidden_size * (config.scale_factor**
-                                                         2)
+        input_size = config.vision_config.hidden_size * (config.scale_factor**2)
         output_size = config.text_config.hidden_size
         self.proj = ReplicatedLinear(
             input_size,
@@ -429,11 +445,10 @@ class Idefics3SimpleMLP(nn.Module):
 
 
 class Idefics3Connector(nn.Module):
-
     def __init__(
         self,
         config: Idefics3Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -444,14 +459,11 @@ class Idefics3Connector(nn.Module):
             prefix=maybe_prefix(prefix, "modality_projection"),
         )
 
-    def pixel_shuffle(self,
-                      x: torch.Tensor,
-                      scale_factor: int = 2) -> torch.Tensor:
+    def pixel_shuffle(self, x: torch.Tensor, scale_factor: int = 2) -> torch.Tensor:
         bsz, seq, embed_dim = x.size()
         height = width = int(seq**0.5)
         x = x.view(bsz, height, width, embed_dim)
-        x = x.view(bsz, height, int(width / scale_factor),
-                   embed_dim * scale_factor)
+        x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor)
         x = x.permute(0, 2, 1, 3)
         x = x.reshape(
             bsz,
@@ -460,19 +472,16 @@ class Idefics3Connector(nn.Module):
             embed_dim * (scale_factor**2),
         )
         x = x.permute(0, 2, 1, 3)
-        x = x.reshape(bsz, int(seq / (scale_factor**2)),
-                      embed_dim * (scale_factor**2))
+        x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
         return x
 
     def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor:
-        image_hidden_states = self.pixel_shuffle(image_hidden_states,
-                                                 self.scale_factor)
+        image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
         image_hidden_states = self.modality_projection(image_hidden_states)
         return image_hidden_states
 
 
 class Idefics3Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -484,7 +493,8 @@ class Idefics3Model(nn.Module):
         self.vision_model = Idefics3VisionTransformer(
             config.vision_config,
             quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "vision_model"))
+            prefix=maybe_prefix(prefix, "vision_model"),
+        )
         self.connector = Idefics3Connector(
             config,
             quant_config,
@@ -496,8 +506,9 @@ class Idefics3Model(nn.Module):
         )
 
         self.image_seq_len = int(
-            ((config.vision_config.image_size //
-              config.vision_config.patch_size)**2) / (config.scale_factor**2))
+            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2)
+            / (config.scale_factor**2)
+        )
         self.image_token_id = self.config.image_token_id
 
     def image_pixels_to_features(
@@ -514,21 +525,21 @@ class Idefics3Model(nn.Module):
         # Remove padding images - padding images are full 0.
         nb_values_per_image = pixel_values.shape[1:].numel()
         real_images_inds = (pixel_values == 0.0).sum(
-            dim=(-1, -2, -3)) != nb_values_per_image
+            dim=(-1, -2, -3)
+        ) != nb_values_per_image
         pixel_values = pixel_values[real_images_inds].contiguous()
 
         # Handle the vision attention mask
         # Remove padding images from the mask
-        pixel_attention_mask = pixel_attention_mask[
-            real_images_inds].contiguous()
+        pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
 
         patch_size = self.config.vision_config.patch_size
-        patches_subgrid = pixel_attention_mask.unfold(dimension=1,
-                                                      size=patch_size,
-                                                      step=patch_size)
-        patches_subgrid = patches_subgrid.unfold(dimension=2,
-                                                 size=patch_size,
-                                                 step=patch_size)
+        patches_subgrid = pixel_attention_mask.unfold(
+            dimension=1, size=patch_size, step=patch_size
+        )
+        patches_subgrid = patches_subgrid.unfold(
+            dimension=2, size=patch_size, step=patch_size
+        )
         patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
 
         # Get sequence from the vision encoder
@@ -539,20 +550,16 @@ class Idefics3Model(nn.Module):
 
         return image_hidden_states
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-    ) -> torch.Tensor:
-        return self.text_model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.text_model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.text_model(
             input_ids,
             positions,
@@ -565,9 +572,11 @@ class Idefics3Model(nn.Module):
 @MULTIMODAL_REGISTRY.register_processor(
     Idefics3MultiModalProcessor,
     info=Idefics3ProcessingInfo,
-    dummy_inputs=Idefics3DummyInputsBuilder)
-class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                       SupportsLoRA):
+    dummy_inputs=Idefics3DummyInputsBuilder,
+)
+class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -581,7 +590,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -597,8 +606,9 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.config = config
         self.multimodal_config = multimodal_config
 
-        self.model = Idefics3Model(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
+        self.model = Idefics3Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.image_token_id = self.config.image_token_id
 
         self.lm_head = ParallelLMHead(
@@ -611,8 +621,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
             self.lm_head.weight = self.model.text_model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.text_config.vocab_size)
 
-    def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[ImageInputs]:
+    def _parse_and_validate_image_input(self, **kwargs: object) -> ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -620,47 +629,27 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return Idefics3ImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             pixel_attention_mask = kwargs.pop("pixel_attention_mask")
-            if not isinstance(pixel_attention_mask, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel_attention_mask. "
-                                 f"Got type: {type(pixel_attention_mask)}")
-
             num_patches = kwargs.pop("num_patches")
-            if not isinstance(num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of num_patches. "
-                                 f"Got type: {type(num_patches)}")
-
             expected_h = expected_w = self.config.vision_config.image_size
+
             return Idefics3ImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values, concat=True),
-                pixel_attention_mask=flatten_bn(pixel_attention_mask,
-                                                concat=True),
-                num_patches=flatten_bn(num_patches, concat=True),
-                resolve_bindings={
-                    "h": expected_h,
-                    "w": expected_w
-                },
+                pixel_values=pixel_values,
+                pixel_attention_mask=pixel_attention_mask,
+                num_patches=num_patches,
+                resolve_bindings={"h": expected_h, "w": expected_w},
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def _process_image_pixels(
-            self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor:
+    def _process_image_pixels(self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor:
         pixel_values = inputs["pixel_values"]
         pixel_attention_mask = inputs["pixel_attention_mask"]
 
@@ -672,7 +661,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def _process_image_input(
         self,
         image_input: ImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -680,60 +669,32 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         image_features = self.model.connector(image_features)
 
         num_patches = image_input["num_patches"]
-        return [
-            e.flatten(0, 1) for e in image_features.split(num_patches.tolist())
-        ]
+        return [e.flatten(0, 1) for e in image_features.split(num_patches.tolist())]
 
     def get_language_model(self) -> torch.nn.Module:
         return self.model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.model.text_model(input_ids,
-                                              positions,
-                                              intermediate_tensors,
-                                              inputs_embeds=inputs_embeds)
+        hidden_states = self.model.text_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
@@ -741,8 +702,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
@@ -753,4 +713,5 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         return MultiModelKeys.from_string_field(
             language_model="model.text_model",
             connector="model.connector",
-            tower_model="model.vision_model")
+            tower_model="model.vision_model",
+        )
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index f13e590cd243bdcaac44620f4eea65f0ecec2fef..dc4caf2f02f9d4ed0c28fadb3424ab4ef67adf1c 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,14 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Iterable, Mapping, MutableSequence
-from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
-                    Union, overload, runtime_checkable)
+from collections.abc import Callable, Iterable, Mapping, MutableSequence, Set
+from typing import (
+    TYPE_CHECKING,
+    ClassVar,
+    Literal,
+    Protocol,
+    TypeAlias,
+    overload,
+    runtime_checkable,
+)
 
 import numpy as np
 import torch
+import torch.nn as nn
 from torch import Tensor
-from transformers import PretrainedConfig
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
 
@@ -16,20 +23,25 @@ from vllm.config import ModelConfig, SpeechToTextConfig
 from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.utils import supports_kw
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.utils.func_utils import supports_kw
 
-from .interfaces_base import is_pooling_model
+from .interfaces_base import VllmModel, is_pooling_model
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.models.utils import WeightsMapper
+    from vllm.multimodal.inputs import MultiModalFeatureSpec
     from vllm.sequence import IntermediateTensors
+else:
+    VllmConfig = object
+    WeightsMapper = object
+    MultiModalFeatureSpec = object
+    IntermediateTensors = object
 
 logger = init_logger(__name__)
 
-MultiModalEmbeddings = Union[list[Tensor], Tensor, tuple[Tensor, ...]]
+MultiModalEmbeddings: TypeAlias = list[Tensor] | Tensor | tuple[Tensor, ...]
 """
 The output embeddings must be one of the following formats:
 
@@ -70,17 +82,21 @@ class SupportsMultiModal(Protocol):
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
     """
 
+    multimodal_cpu_fields: ClassVar[Set[str]] = frozenset()
+    """
+    A set indicating CPU-only multimodal fields.
+    """
+
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         """
         Get the placeholder text for the `i`th `modality` item in the prompt.
         """
         ...
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         """
-        Returns multimodal embeddings generated from multimodal kwargs 
+        Returns multimodal embeddings generated from multimodal kwargs
         to be merged with text embeddings.
 
         Note:
@@ -88,13 +104,19 @@ class SupportsMultiModal(Protocol):
             the appearances of their corresponding multimodal data item in the
             input prompt.
         """
-        ...
+        if hasattr(self, "get_multimodal_embeddings"):
+            logger.warning_once(
+                "`get_multimodal_embeddings` for vLLM models is deprecated and will be "
+                "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
+                "this method to `embed_multimodal`."
+            )
+            return self.get_multimodal_embeddings(**kwargs)
 
-    def get_language_model(self) -> torch.nn.Module:
+    def get_language_model(self) -> VllmModel:
         """
         Returns the underlying language model used for text generation.
 
-        This is typically the `torch.nn.Module` instance responsible for 
+        This is typically the `torch.nn.Module` instance responsible for
         processing the merged multimodal embeddings and producing hidden states
 
         Returns:
@@ -102,17 +124,83 @@ class SupportsMultiModal(Protocol):
         """
         ...
 
-    def get_input_embeddings(
+    @overload
+    def embed_input_ids(self, input_ids: Tensor) -> Tensor: ...
+
+    @overload
+    def embed_input_ids(
+        self,
+        input_ids: Tensor,
+        multimodal_embeddings: MultiModalEmbeddings,
+        *,
+        is_multimodal: torch.Tensor,
+        handle_oov_mm_token: bool = False,
+    ) -> Tensor: ...
+
+    def _embed_text_input_ids(
+        self,
+        input_ids: Tensor,
+        embed_input_ids: Callable[[Tensor], Tensor],
+        *,
+        is_multimodal: Tensor | None,
+        handle_oov_mm_token: bool,
+    ) -> Tensor:
+        if handle_oov_mm_token and is_multimodal is not None:
+            is_text = ~is_multimodal
+            text_embeds = embed_input_ids(input_ids[is_text])
+
+            return torch.empty(
+                (input_ids.shape[0], text_embeds.shape[1]),
+                dtype=text_embeds.dtype,
+                device=text_embeds.device,
+            ).masked_scatter_(is_text.unsqueeze_(-1), text_embeds)
+
+        return embed_input_ids(input_ids)
+
+    def embed_input_ids(
         self,
         input_ids: Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> Tensor:
         """
-        Returns the input embeddings merged from the text embeddings from 
-        input_ids and the multimodal embeddings generated from multimodal 
-        kwargs.
+        Apply token embeddings to `input_ids`.
+
+        If `multimodal_embeddings` is passed, scatter them into
+        `input_ids` according to the mask `is_multimodal`.
+
+        In case the multi-modal token IDs exceed the vocabulary size of
+        the language model, you can set `handle_oov_mm_token=False`
+        to avoid calling the language model's `embed_input_ids` method
+        on those tokens. Note however that doing so increases memory usage
+        as an additional buffer is needed to hold the input embeddings.
         """
-        ...
+        from .utils import _merge_multimodal_embeddings
+
+        inputs_embeds = self._embed_text_input_ids(
+            input_ids,
+            self.get_language_model().embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+
+        if is_multimodal is None:
+            raise ValueError(
+                "`embed_input_ids` now requires `is_multimodal` arg, "
+                "please update your model runner according to "
+                "https://github.com/vllm-project/vllm/pull/16229."
+            )
+
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
 
 
 @runtime_checkable
@@ -121,12 +209,15 @@ class SupportsMultiModalPruning(Protocol):
     embeddings and positions. Model may require custom positions for dynamic
     pruning of multimodal embeddings.
     """
+
     supports_multimodal_pruning: ClassVar[Literal[True]] = True
 
     def recompute_mrope_positions(
-            self, input_ids: list[int],
-            multimodal_embeddings: MultiModalEmbeddings,
-            mrope_positions: torch.LongTensor, num_computed_tokens: int
+        self,
+        input_ids: list[int],
+        multimodal_embeddings: MultiModalEmbeddings,
+        mrope_positions: torch.LongTensor,
+        num_computed_tokens: int,
     ) -> tuple[MultiModalEmbeddings, Tensor, int]:
         """
         Update part of input mrope positions (starting with
@@ -152,48 +243,40 @@ class SupportsMultiModalPruning(Protocol):
 
 
 @overload
-def supports_multimodal(
-        model: type[object]) -> TypeIs[type[SupportsMultiModal]]:
-    ...
+def supports_multimodal(model: type[object]) -> TypeIs[type[SupportsMultiModal]]: ...
 
 
 @overload
-def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]:
-    ...
+def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
 
 
 def supports_multimodal(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
     return getattr(model, "supports_multimodal", False)
 
 
-def supports_multimodal_raw_input_only(
-        model: Union[type[object], object]) -> bool:
+def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
     return getattr(model, "supports_multimodal_raw_input_only", False)
 
 
-def supports_multimodal_encoder_tp_data(
-        model: Union[type[object], object]) -> bool:
+def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
     return getattr(model, "supports_encoder_tp_data", False)
 
 
 @overload
 def supports_multimodal_pruning(
-        model: type[object]) -> TypeIs[type[SupportsMultiModalPruning]]:
-    ...
+    model: type[object],
+) -> TypeIs[type[SupportsMultiModalPruning]]: ...
 
 
 @overload
-def supports_multimodal_pruning(
-        model: object) -> TypeIs[SupportsMultiModalPruning]:
-    ...
+def supports_multimodal_pruning(model: object) -> TypeIs[SupportsMultiModalPruning]: ...
 
 
 def supports_multimodal_pruning(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsMultiModalPruning]],
-           TypeIs[SupportsMultiModalPruning]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMultiModalPruning]] | TypeIs[SupportsMultiModalPruning]:
     return getattr(model, "supports_multimodal_pruning", False)
 
 
@@ -211,10 +294,10 @@ class SupportsScoreTemplate(Protocol):
     """
 
     @classmethod
-    def get_score_template(cls, query: str, document: str) -> Optional[str]:
+    def get_score_template(cls, query: str, document: str) -> str | None:
         """
         Generate a full prompt by populating the score template with query and document content.
-        """ # noqa: E501
+        """  # noqa: E501
         ...
 
     @classmethod
@@ -227,18 +310,17 @@ class SupportsScoreTemplate(Protocol):
 
 @overload
 def supports_score_template(
-        model: type[object]) -> TypeIs[type[SupportsScoreTemplate]]:
-    ...
+    model: type[object],
+) -> TypeIs[type[SupportsScoreTemplate]]: ...
 
 
 @overload
-def supports_score_template(model: object) -> TypeIs[SupportsScoreTemplate]:
-    ...
+def supports_score_template(model: object) -> TypeIs[SupportsScoreTemplate]: ...
 
 
 def supports_score_template(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsScoreTemplate]], TypeIs[SupportsScoreTemplate]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsScoreTemplate]] | TypeIs[SupportsScoreTemplate]:
     return getattr(model, "supports_score_template", False)
 
 
@@ -258,7 +340,7 @@ class SupportsLoRA(Protocol):
     # are empty by default.
     embedding_modules: ClassVar[dict[str, str]] = {}
     embedding_padding_modules: ClassVar[list[str]] = []
-    packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}
+    packed_modules_mapping: dict[str, list[str]] = {}
 
 
 # We can't use runtime_checkable with ClassVar for issubclass checks
@@ -273,18 +355,16 @@ class _SupportsLoRAType(Protocol):
 
 
 @overload
-def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]:
-    ...
+def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]: ...
 
 
 @overload
-def supports_lora(model: object) -> TypeIs[SupportsLoRA]:
-    ...
+def supports_lora(model: object) -> TypeIs[SupportsLoRA]: ...
 
 
 def supports_lora(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsLoRA]] | TypeIs[SupportsLoRA]:
     result = _supports_lora(model)
 
     if not result:
@@ -293,8 +373,7 @@ def supports_lora(
             "embedding_modules",
             "embedding_padding_modules",
         )
-        missing_attrs = tuple(attr for attr in lora_attrs
-                              if not hasattr(model, attr))
+        missing_attrs = tuple(attr for attr in lora_attrs if not hasattr(model, attr))
 
         if getattr(model, "supports_lora", False):
             if missing_attrs:
@@ -308,12 +387,14 @@ def supports_lora(
             if not missing_attrs:
                 logger.warning(
                     "The model (%s) contains all LoRA-specific attributes, "
-                    "but does not set `supports_lora=True`.", model)
+                    "but does not set `supports_lora=True`.",
+                    model,
+                )
 
     return result
 
 
-def _supports_lora(model: Union[type[object], object]) -> bool:
+def _supports_lora(model: type[object] | object) -> bool:
     if isinstance(model, type):
         return isinstance(model, _SupportsLoRAType)
 
@@ -338,15 +419,15 @@ class SupportsPP(Protocol):
         batch_size: int,
         dtype: torch.dtype,
         device: torch.device,
-    ) -> "IntermediateTensors":
+    ) -> IntermediateTensors:
         """Called when PP rank > 0 for profiling purposes."""
         ...
 
     def forward(
         self,
         *,
-        intermediate_tensors: Optional["IntermediateTensors"],
-    ) -> Union[Tensor, "IntermediateTensors"]:
+        intermediate_tensors: IntermediateTensors | None,
+    ) -> IntermediateTensors | None:
         """
         Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
         PP rank > 0.
@@ -368,42 +449,39 @@ class _SupportsPPType(Protocol):
         batch_size: int,
         dtype: torch.dtype,
         device: torch.device,
-    ) -> "IntermediateTensors":
-        ...
+    ) -> IntermediateTensors: ...
 
     def forward(
         self,
         *,
-        intermediate_tensors: Optional["IntermediateTensors"],
-    ) -> Union[Tensor, "IntermediateTensors"]:
-        ...
+        intermediate_tensors: IntermediateTensors | None,
+    ) -> Tensor | IntermediateTensors: ...
 
 
 @overload
-def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]:
-    ...
+def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]: ...
 
 
 @overload
-def supports_pp(model: object) -> TypeIs[SupportsPP]:
-    ...
+def supports_pp(model: object) -> TypeIs[SupportsPP]: ...
 
 
 def supports_pp(
-    model: Union[type[object], object],
-) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]:
+    model: type[object] | object,
+) -> bool | TypeIs[type[SupportsPP]] | TypeIs[SupportsPP]:
     supports_attributes = _supports_pp_attributes(model)
     supports_inspect = _supports_pp_inspect(model)
 
     if supports_attributes and not supports_inspect:
         logger.warning(
             "The model (%s) sets `supports_pp=True`, but does not accept "
-            "`intermediate_tensors` in its `forward` method", model)
+            "`intermediate_tensors` in its `forward` method",
+            model,
+        )
 
     if not supports_attributes:
-        pp_attrs = ("make_empty_intermediate_tensors", )
-        missing_attrs = tuple(attr for attr in pp_attrs
-                              if not hasattr(model, attr))
+        pp_attrs = ("make_empty_intermediate_tensors",)
+        missing_attrs = tuple(attr for attr in pp_attrs if not hasattr(model, attr))
 
         if getattr(model, "supports_pp", False):
             if missing_attrs:
@@ -417,19 +495,21 @@ def supports_pp(
             if not missing_attrs:
                 logger.warning(
                     "The model (%s) contains all PP-specific attributes, "
-                    "but does not set `supports_pp=True`.", model)
+                    "but does not set `supports_pp=True`.",
+                    model,
+                )
 
     return supports_attributes and supports_inspect
 
 
-def _supports_pp_attributes(model: Union[type[object], object]) -> bool:
+def _supports_pp_attributes(model: type[object] | object) -> bool:
     if isinstance(model, type):
         return isinstance(model, _SupportsPPType)
 
     return isinstance(model, SupportsPP)
 
 
-def _supports_pp_inspect(model: Union[type[object], object]) -> bool:
+def _supports_pp_inspect(model: type[object] | object) -> bool:
     model_forward = getattr(model, "forward", None)
     if not callable(model_forward):
         return False
@@ -450,18 +530,16 @@ class HasInnerState(Protocol):
 
 
 @overload
-def has_inner_state(model: object) -> TypeIs[HasInnerState]:
-    ...
+def has_inner_state(model: object) -> TypeIs[HasInnerState]: ...
 
 
 @overload
-def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]:
-    ...
+def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: ...
 
 
 def has_inner_state(
-    model: Union[type[object], object]
-) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]:
+    model: type[object] | object,
+) -> TypeIs[type[HasInnerState]] | TypeIs[HasInnerState]:
     return getattr(model, "has_inner_state", False)
 
 
@@ -479,25 +557,23 @@ class IsAttentionFree(Protocol):
 
 
 @overload
-def is_attention_free(model: object) -> TypeIs[IsAttentionFree]:
-    ...
+def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: ...
 
 
 @overload
-def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]:
-    ...
+def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: ...
 
 
 def is_attention_free(
-    model: Union[type[object], object]
-) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]:
+    model: type[object] | object,
+) -> TypeIs[type[IsAttentionFree]] | TypeIs[IsAttentionFree]:
     return getattr(model, "is_attention_free", False)
 
 
 @runtime_checkable
 class IsHybrid(Protocol):
     """The interface required for all models like Jamba that have both
-    attention and mamba blocks, indicates that 
+    attention and mamba blocks, indicates that
     hf_config has 'layers_block_type'"""
 
     is_hybrid: ClassVar[Literal[True]] = True
@@ -509,7 +585,7 @@ class IsHybrid(Protocol):
     @classmethod
     def get_mamba_state_shape_from_config(
         cls,
-        vllm_config: "VllmConfig",
+        vllm_config: VllmConfig,
         use_v1: bool = True,
     ) -> tuple[tuple[int, int], tuple[int, int, int]]:
         """Calculate shapes for Mamba's convolutional and state caches.
@@ -527,18 +603,16 @@ class IsHybrid(Protocol):
 
 
 @overload
-def is_hybrid(model: object) -> TypeIs[IsHybrid]:
-    ...
+def is_hybrid(model: object) -> TypeIs[IsHybrid]: ...
 
 
 @overload
-def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]:
-    ...
+def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: ...
 
 
 def is_hybrid(
-    model: Union[type[object], object]
-) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]:
+    model: type[object] | object,
+) -> TypeIs[type[IsHybrid]] | TypeIs[IsHybrid]:
     return getattr(model, "is_hybrid", False)
 
 
@@ -580,6 +654,9 @@ class MixtureOfExperts(Protocol):
     num_redundant_experts: int
     """Number of redundant experts in this model."""
 
+    moe_layers: Iterable[nn.Module]
+    """List of MoE layers in this model."""
+
     def set_eplb_state(
         self,
         expert_load_view: Tensor,
@@ -588,7 +665,7 @@ class MixtureOfExperts(Protocol):
     ) -> None:
         """
         Register the EPLB state in the MoE model.
-        
+
         Since these are views of the actual EPLB state, any changes made by
         the EPLB algorithm are automatically reflected in the model's behavior
         without requiring additional method calls to set new states.
@@ -602,18 +679,27 @@ class MixtureOfExperts(Protocol):
             logical_to_physical_map: Mapping from logical to physical experts.
             logical_replica_count: Count of replicas for each logical expert.
         """
-        ...
+        for layer_idx, layer in enumerate(self.moe_layers):
+            # Register the expert weights.
+            self.expert_weights.append(layer.get_expert_weights())
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
 
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
         num_local_physical_experts: int,
-    ) -> None:
-        ...
+    ) -> None: ...
 
 
 def is_mixture_of_experts(model: object) -> TypeIs[MixtureOfExperts]:
-    return isinstance(model, MixtureOfExperts)
+    return (
+        isinstance(model, MixtureOfExperts) and getattr(model, "num_moe_layers", 0) > 0
+    )
 
 
 @runtime_checkable
@@ -622,21 +708,47 @@ class HasNoOps(Protocol):
 
 
 @overload
-def has_noops(model: object) -> TypeIs[HasNoOps]:
-    ...
+def has_noops(model: object) -> TypeIs[HasNoOps]: ...
 
 
 @overload
-def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]:
-    ...
+def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]: ...
 
 
 def has_noops(
-    model: Union[type[object], object]
-) -> Union[TypeIs[type[HasNoOps]], TypeIs[HasNoOps]]:
+    model: type[object] | object,
+) -> TypeIs[type[HasNoOps]] | TypeIs[HasNoOps]:
     return getattr(model, "has_noops", False)
 
 
+@runtime_checkable
+class SupportsMambaPrefixCaching(Protocol):
+    """The interface for models whose mamba layers support prefix caching.
+
+    This is currently experimental.
+    """
+
+    supports_mamba_prefix_caching: ClassVar[Literal[True]] = True
+
+
+@overload
+def supports_mamba_prefix_caching(
+    model: object,
+) -> TypeIs[SupportsMambaPrefixCaching]: ...
+
+
+@overload
+def supports_mamba_prefix_caching(
+    model: type[object],
+) -> TypeIs[type[SupportsMambaPrefixCaching]]: ...
+
+
+def supports_mamba_prefix_caching(
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMambaPrefixCaching]] | TypeIs[SupportsMambaPrefixCaching]:
+    return getattr(model, "supports_mamba_prefix_caching", False)
+
+
 @runtime_checkable
 class SupportsCrossEncoding(Protocol):
     """The interface required for all models that support cross encoding."""
@@ -646,33 +758,32 @@ class SupportsCrossEncoding(Protocol):
 
 @overload
 def supports_cross_encoding(
-        model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]:
-    ...
+    model: type[object],
+) -> TypeIs[type[SupportsCrossEncoding]]: ...
 
 
 @overload
-def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]:
-    ...
+def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: ...
 
 
 def _supports_cross_encoding(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsCrossEncoding]] | TypeIs[SupportsCrossEncoding]:
     return getattr(model, "supports_cross_encoding", False)
 
 
 def supports_cross_encoding(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsCrossEncoding]] | TypeIs[SupportsCrossEncoding]:
     return is_pooling_model(model) and _supports_cross_encoding(model)
 
 
 class SupportsQuant:
     """The interface required for all models that support quantization."""
 
-    hf_to_vllm_mapper: ClassVar[Optional["WeightsMapper"]] = None
-    packed_modules_mapping: ClassVar[Optional[dict[str, list[str]]]] = None
-    quant_config: Optional[QuantizationConfig] = None
+    hf_to_vllm_mapper: ClassVar[WeightsMapper | None] = None
+    packed_modules_mapping: ClassVar[dict[str, list[str]] | None] = None
+    quant_config: QuantizationConfig | None = None
 
     def __new__(cls, *args, **kwargs) -> Self:
         instance = super().__new__(cls)
@@ -680,7 +791,6 @@ class SupportsQuant:
         # find config passed in arguments
         quant_config = cls._find_quant_config(*args, **kwargs)
         if quant_config is not None:
-
             # attach config to model for general use
             instance.quant_config = quant_config
 
@@ -689,12 +799,13 @@ class SupportsQuant:
                 instance.quant_config.apply_vllm_mapper(hf_to_vllm_mapper)
             if instance.packed_modules_mapping is not None:
                 instance.quant_config.packed_modules_mapping.update(
-                    instance.packed_modules_mapping)
+                    instance.packed_modules_mapping
+                )
 
         return instance
 
     @staticmethod
-    def _find_quant_config(*args, **kwargs) -> Optional[QuantizationConfig]:
+    def _find_quant_config(*args, **kwargs) -> QuantizationConfig | None:
         """Find quant config passed through model constructor args"""
         from vllm.config import VllmConfig  # avoid circular import
 
@@ -712,6 +823,7 @@ class SupportsQuant:
 @runtime_checkable
 class SupportsTranscription(Protocol):
     """The interface required for all models that support transcription."""
+
     # Mapping from ISO639_1 language codes: language names
     supported_languages: ClassVar[Mapping[str, str]]
 
@@ -732,16 +844,20 @@ class SupportsTranscription(Protocol):
             raise ValueError(
                 f"{cls.__name__}.supported_languages contains invalid "
                 f"language codes: {sorted(invalid)}\n. "
-                f"Valid choices are: {sorted(LANGUAGES.keys())}")
+                f"Valid choices are: {sorted(LANGUAGES.keys())}"
+            )
 
     @classmethod
-    def get_generation_prompt(cls, audio: np.ndarray,
-                              stt_config: SpeechToTextConfig,
-                              model_config: ModelConfig,
-                              language: Optional[str],
-                              task_type: Literal["transcribe", "translate"],
-                              request_prompt: str,
-                              to_language: Optional[str]) -> PromptType:
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
         """Get the prompt for the ASR model.
         The model has control over the construction, as long as it
         returns a valid PromptType."""
@@ -750,17 +866,14 @@ class SupportsTranscription(Protocol):
     @classmethod
     def get_other_languages(cls) -> Mapping[str, str]:
         # other possible language codes from the whisper map
-        return {
-            k: v
-            for k, v in LANGUAGES.items() if k not in cls.supported_languages
-        }
+        return {k: v for k, v in LANGUAGES.items() if k not in cls.supported_languages}
 
     @classmethod
-    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+    def validate_language(cls, language: str | None) -> str | None:
         """
-        Ensure the language specified in the transcription request 
-        is a valid ISO 639-1 language code. If the request language is 
-        valid, but not natively supported by the model, trigger a 
+        Ensure the language specified in the transcription request
+        is a valid ISO 639-1 language code. If the request language is
+        valid, but not natively supported by the model, trigger a
         warning (but not an exception).
         """
         if language is None or language in cls.supported_languages:
@@ -777,22 +890,25 @@ class SupportsTranscription(Protocol):
         else:
             raise ValueError(
                 f"Unsupported language: {language!r}.  Must be one of "
-                f"{list(cls.supported_languages.keys())}.")
+                f"{list(cls.supported_languages.keys())}."
+            )
 
     @classmethod
     def get_speech_to_text_config(
-            cls, model_config: ModelConfig,
-            task_type: Literal["transcribe",
-                               "translate"]) -> SpeechToTextConfig:
+        cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
+    ) -> SpeechToTextConfig:
         """Get the speech to text config for the ASR model."""
         ...
 
     @classmethod
-    def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig,
-                             model_config: ModelConfig) -> Optional[int]:
+    def get_num_audio_tokens(
+        cls,
+        audio_duration_s: float,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+    ) -> int | None:
         """
-        Map from audio duration to number of audio tokens produced by the ASR 
+        Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
         This is used for estimating the amount of processing for this audio.
         """
@@ -801,52 +917,88 @@ class SupportsTranscription(Protocol):
 
 @overload
 def supports_transcription(
-        model: type[object]) -> TypeIs[type[SupportsTranscription]]:
-    ...
+    model: type[object],
+) -> TypeIs[type[SupportsTranscription]]: ...
 
 
 @overload
-def supports_transcription(model: object) -> TypeIs[SupportsTranscription]:
-    ...
+def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: ...
 
 
 def supports_transcription(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsTranscription]] | TypeIs[SupportsTranscription]:
     return getattr(model, "supports_transcription", False)
 
 
 @runtime_checkable
-class SupportsV0Only(Protocol):
-    """Models with this interface are not compatible with V1 vLLM."""
+class SupportsEagleBase(Protocol):
+    """Base interface for models that support EAGLE-based speculative decoding."""
+
+    has_own_lm_head: bool = False
+    """
+    A flag that indicates this model has trained its own lm_head.
+    """
+
+    has_own_embed_tokens: bool = False
+    """
+    A flag that indicates this model has trained its own input embeddings.
+    """
+
+
+@overload
+def supports_any_eagle(model: type[object]) -> TypeIs[type[SupportsEagleBase]]: ...
+
+
+@overload
+def supports_any_eagle(model: object) -> TypeIs[SupportsEagleBase]: ...
+
+
+def supports_any_eagle(
+    model: type[object] | object,
+) -> TypeIs[type[SupportsEagleBase]] | TypeIs[SupportsEagleBase]:
+    """Check if model supports any EAGLE variant (1, 2, or 3)."""
+    return supports_eagle(model) or supports_eagle3(model)
+
+
+@runtime_checkable
+class SupportsEagle(SupportsEagleBase, Protocol):
+    """The interface required for models that support
+    EAGLE-1 and EAGLE-2 speculative decoding."""
 
-    supports_v0_only: ClassVar[Literal[True]] = True
+    supports_eagle: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports EAGLE-1 and EAGLE-2 
+    speculative decoding.
+
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
 
 
 @overload
-def supports_v0_only(model: type[object]) -> TypeIs[type[SupportsV0Only]]:
-    ...
+def supports_eagle(model: type[object]) -> TypeIs[type[SupportsEagle]]: ...
 
 
 @overload
-def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]:
-    ...
+def supports_eagle(model: object) -> TypeIs[SupportsEagle]: ...
 
 
-def supports_v0_only(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]:
-    return getattr(model, "supports_v0_only", False)
+def supports_eagle(
+    model: type[object] | object,
+) -> TypeIs[type[SupportsEagle]] | TypeIs[SupportsEagle]:
+    return isinstance(model, SupportsEagle)
 
 
 @runtime_checkable
-class SupportsEagle3(Protocol):
-    """The interface required for models that support 
-    EAGLE3 speculative decoding."""
+class SupportsEagle3(SupportsEagleBase, Protocol):
+    """The interface required for models that support
+    EAGLE-3 speculative decoding."""
 
     supports_eagle3: ClassVar[Literal[True]] = True
     """
-    A flag that indicates this model supports EAGLE3 
+    A flag that indicates this model supports EAGLE-3 
     speculative decoding.
 
     Note:
@@ -857,8 +1009,8 @@ class SupportsEagle3(Protocol):
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         """
         Set which layers should output auxiliary
-        hidden states for EAGLE3.
-        
+        hidden states for EAGLE-3.
+
         Args:
             layers: Tuple of layer indices that should output auxiliary
                 hidden states.
@@ -868,8 +1020,8 @@ class SupportsEagle3(Protocol):
     def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
         """
         Get the layer indices that should output auxiliary hidden states
-        for EAGLE3.
-        
+        for EAGLE-3.
+
         Returns:
             Tuple of layer indices for auxiliary hidden state outputs.
         """
@@ -877,18 +1029,16 @@ class SupportsEagle3(Protocol):
 
 
 @overload
-def supports_eagle3(model: type[object]) -> TypeIs[type[SupportsEagle3]]:
-    ...
+def supports_eagle3(model: type[object]) -> TypeIs[type[SupportsEagle3]]: ...
 
 
 @overload
-def supports_eagle3(model: object) -> TypeIs[SupportsEagle3]:
-    ...
+def supports_eagle3(model: object) -> TypeIs[SupportsEagle3]: ...
 
 
 def supports_eagle3(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsEagle3]] | TypeIs[SupportsEagle3]:
     return isinstance(model, SupportsEagle3)
 
 
@@ -908,52 +1058,35 @@ class SupportsMRoPE(Protocol):
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]] = None,
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
+        mm_features: list["MultiModalFeatureSpec"],
     ) -> tuple[torch.Tensor, int]:
         """
         Get M-RoPE input positions and delta value for this specific model.
-        
+
         This method should be implemented by each model that supports M-RoPE
         to provide model-specific logic for computing input positions.
-        
+
         Args:
             input_tokens: List of input token IDs
-            hf_config: HuggingFace model configuration
-            image_grid_thw: Image grid dimensions (t, h, w)
-            video_grid_thw: Video grid dimensions (t, h, w)
-            second_per_grid_ts: Seconds per grid timestep for videos
-            context_len: Context length
-            seq_len: Sequence length
-            audio_feature_lengths: Audio feature lengths for multimodal models
-            use_audio_in_video: Whether to use audio in video for interleaving
-            
+            mm_features: Information about each multi-modal data item
+
         Returns:
-            Tuple of (llm_positions, mrope_position_delta)
-            - llm_positions: Tensor of shape [3, num_tokens]
-                with T/H/W positions
+            Tuple of `(llm_positions, mrope_position_delta)`
+            - llm_positions: Tensor of shape `[3, num_tokens]` with T/H/W positions
             - mrope_position_delta: Delta for position calculations
         """
         ...
 
 
 @overload
-def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]:
-    ...
+def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]: ...
 
 
 @overload
-def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]:
-    ...
+def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]: ...
 
 
 def supports_mrope(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMRoPE]] | TypeIs[SupportsMRoPE]:
     return isinstance(model, SupportsMRoPE)
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 8fdf70e35a2b8a30f84bbb964d1c55b346312778..4267b6c6598e270ab6fdc8fb48156edea8428c0b 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -1,14 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol,
-                    Union, overload, runtime_checkable)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Literal,
+    Protocol,
+    overload,
+    runtime_checkable,
+)
 
 import torch
 import torch.nn as nn
 from typing_extensions import TypeIs, TypeVar
 
 from vllm.logger import init_logger
-from vllm.utils import supports_kw
+from vllm.utils.func_utils import supports_kw
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -34,37 +41,55 @@ T_co = TypeVar("T_co", default=torch.Tensor, covariant=True)
 class VllmModel(Protocol[T_co]):
     """The interface required for all models in vLLM."""
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
-        ...
+    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: ...
 
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-    ) -> T_co:
-        ...
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Apply token embeddings to `input_ids`."""
+        if hasattr(self, "get_input_embeddings"):
+            logger.warning_once(
+                "`get_input_embeddings` for vLLM models is deprecated and will be "
+                "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
+                "this method to `embed_input_ids`."
+            )
+            return self.get_input_embeddings(input_ids)
 
+    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor) -> T_co: ...
 
-def _check_vllm_model_init(model: Union[type[object], object]) -> bool:
+
+def _check_vllm_model_init(model: type[object] | object) -> bool:
     model_init = model.__init__
     return supports_kw(model_init, "vllm_config")
 
 
-def _check_vllm_model_forward(model: Union[type[object], object]) -> bool:
+def _check_vllm_model_embed_input_ids(model: type[object] | object) -> bool:
+    model_embed_input_ids = getattr(model, "embed_input_ids", None)
+    if not callable(model_embed_input_ids):
+        model_get_input_embeddings = getattr(model, "get_input_embeddings", None)
+        if callable(model_get_input_embeddings):
+            logger.warning(
+                "`get_input_embeddings` for vLLM models is deprecated and will be "
+                "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
+                "this method to `embed_input_ids`."
+            )
+            model.embed_input_ids = model_get_input_embeddings
+        logger.warning(
+            "The model (%s) is missing the `embed_input_ids` method.",
+            model,
+        )
+        return False
+
+    return True
+
+
+def _check_vllm_model_forward(model: type[object] | object) -> bool:
     model_forward = getattr(model, "forward", None)
     if not callable(model_forward):
         return False
 
     vllm_kws = ("input_ids", "positions")
-    missing_kws = tuple(kw for kw in vllm_kws
-                        if not supports_kw(model_forward, kw))
+    missing_kws = tuple(kw for kw in vllm_kws if not supports_kw(model_forward, kw))
 
-    if missing_kws and (isinstance(model, type)
-                        and issubclass(model, nn.Module)):
+    if missing_kws and (isinstance(model, type) and issubclass(model, nn.Module)):
         logger.warning(
             "The model (%s) is missing "
             "vLLM-specific keywords from its `forward` method: %s",
@@ -76,19 +101,21 @@ def _check_vllm_model_forward(model: Union[type[object], object]) -> bool:
 
 
 @overload
-def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]:
-    ...
+def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]: ...
 
 
 @overload
-def is_vllm_model(model: object) -> TypeIs[VllmModel]:
-    ...
+def is_vllm_model(model: object) -> TypeIs[VllmModel]: ...
 
 
 def is_vllm_model(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]:
-    return _check_vllm_model_init(model) and _check_vllm_model_forward(model)
+    model: type[object] | object,
+) -> TypeIs[type[VllmModel]] | TypeIs[VllmModel]:
+    return (
+        _check_vllm_model_init(model)
+        and _check_vllm_model_embed_input_ids(model)
+        and _check_vllm_model_forward(model)
+    )
 
 
 @runtime_checkable
@@ -98,27 +125,24 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
     def compute_logits(
         self,
         hidden_states: T,
-    ) -> Optional[T]:
+    ) -> T | None:
         """Return `None` if TP rank > 0."""
         ...
 
 
 @overload
 def is_text_generation_model(
-        model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]:
-    ...
+    model: type[object],
+) -> TypeIs[type[VllmModelForTextGeneration]]: ...
 
 
 @overload
-def is_text_generation_model(
-        model: object) -> TypeIs[VllmModelForTextGeneration]:
-    ...
+def is_text_generation_model(model: object) -> TypeIs[VllmModelForTextGeneration]: ...
 
 
 def is_text_generation_model(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[VllmModelForTextGeneration]],
-           TypeIs[VllmModelForTextGeneration]]:
+    model: type[object] | object,
+) -> TypeIs[type[VllmModelForTextGeneration]] | TypeIs[VllmModelForTextGeneration]:
     if not is_vllm_model(model):
         return False
 
@@ -157,18 +181,16 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
 
 
 @overload
-def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]:
-    ...
+def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]: ...
 
 
 @overload
-def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]:
-    ...
+def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: ...
 
 
 def is_pooling_model(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]:
+    model: type[object] | object,
+) -> TypeIs[type[VllmModelForPooling]] | TypeIs[VllmModelForPooling]:
     if not is_vllm_model(model):
         return False
 
@@ -188,5 +210,5 @@ def default_pooling_type(pooling_type: str):
     return func
 
 
-def get_default_pooling_type(model: Union[type[object], object]) -> str:
+def get_default_pooling_type(model: type[object] | object) -> str:
     return getattr(model, "default_pooling_type", "LAST")
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 2c341d28397192a3b50a5d4112d2ca194ba086bd..03918127c6ae109c268e55302597b4df54bf70c7 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -9,7 +9,6 @@
 # --------------------------------------------------------
 from collections.abc import Iterable
 from functools import partial
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -17,28 +16,32 @@ import torch.nn.functional as F
 from transformers import PretrainedConfig
 
 from vllm.attention.layer import MultiHeadAttention
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from .vision import run_dp_sharded_vision_model
 
 NORM2FN = {
-    'rms_norm': RMSNorm,
-    'layer_norm': nn.LayerNorm,
+    "rms_norm": RMSNorm,
+    "layer_norm": nn.LayerNorm,
 }
 
 
 class InternVisionEmbeddings(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
@@ -48,28 +51,36 @@ class InternVisionEmbeddings(nn.Module):
 
         self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
 
-        self.patch_embedding = nn.Conv2d(in_channels=3,
-                                         out_channels=self.embed_dim,
-                                         kernel_size=self.patch_size,
-                                         stride=self.patch_size)
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
 
         self.position_embedding = nn.Parameter(
-            torch.randn(1, self.num_positions, self.embed_dim))
+            torch.randn(1, self.num_positions, self.embed_dim)
+        )
 
     def _get_pos_embed(self, pos_embed: torch.Tensor, H: int, W: int):
         target_dtype = pos_embed.dtype
-        pos_embed = pos_embed.float().reshape(
-            1, self.image_size // self.patch_size,
-            self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
-        pos_embed = F.interpolate(pos_embed,
-                                  size=(H, W),
-                                  mode='bicubic',
-                                  align_corners=False)
-        return pos_embed.reshape(1, -1, H * W).permute(0, 2,
-                                                       1).to(target_dtype)
+        pos_embed = (
+            pos_embed.float()
+            .reshape(
+                1,
+                self.image_size // self.patch_size,
+                self.image_size // self.patch_size,
+                -1,
+            )
+            .permute(0, 3, 1, 2)
+        )
+        pos_embed = F.interpolate(
+            pos_embed, size=(H, W), mode="bicubic", align_corners=False
+        )
+        return pos_embed.reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
 
     def _get_position_embedding(self, H: int, W: int) -> torch.Tensor:
         position_embedding = self.position_embedding
@@ -86,12 +97,12 @@ class InternVisionEmbeddings(nn.Module):
 
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(
-            target_dtype))  # shape = [*, channel, width, height]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(target_dtype)
+        )  # shape = [*, channel, width, height]
         batch_size, _, height, width = patch_embeds.shape
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-        class_embeds = self.class_embedding.expand(batch_size, 1,
-                                                   -1).to(target_dtype)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
         embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
         position_embedding = self._get_position_embedding(height, width)
         embeddings = embeddings + position_embedding.to(target_dtype)
@@ -99,7 +110,6 @@ class InternVisionEmbeddings(nn.Module):
 
 
 class InternVisionPatchModel(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
@@ -110,12 +120,11 @@ class InternVisionPatchModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
-            raise ValueError(
-                'You have to specify pixel_values or pixel_embeds')
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
 
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
@@ -123,8 +132,7 @@ class InternVisionPatchModel(nn.Module):
             if pixel_values.ndim == 4:
                 hidden_states = self.embeddings(pixel_values)
             else:
-                raise ValueError(
-                    f'wrong pixel_values size: {pixel_values.shape}')
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
 
         return hidden_states
 
@@ -135,7 +143,7 @@ class InternParallelAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
@@ -149,19 +157,21 @@ class InternParallelAttention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                f'embed_dim must be divisible by num_heads '
-                f'(got `embed_dim`: {self.embed_dim} and `num_heads`:'
-                f' {self.num_heads}).')
+                f"embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
 
-        self.tp_size = (1 if use_data_parallel else
-                        get_tensor_model_parallel_world_size())
-        self.tp_rank = (0 if use_data_parallel else
-                        get_tensor_model_parallel_rank())
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
 
         # Additional dummy heads are used to enable TP for common GPU counts.
         self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim
-        self.num_heads_per_partition = divide(num_dummy_heads + self.num_heads,
-                                              self.tp_size)
+        self.num_heads_per_partition = divide(
+            num_dummy_heads + self.num_heads, self.tp_size
+        )
 
         self.scale = self.head_dim**-0.5
         self.qkv = QKVParallelLinear(
@@ -177,12 +187,16 @@ class InternParallelAttention(nn.Module):
         self.qk_normalization = config.qk_normalization
 
         if self.qk_normalization:
-            self.q_norm = RMSNorm(self.dummy_dim,
-                                  eps=config.layer_norm_eps,
-                                  var_hidden_size=self.embed_dim)
-            self.k_norm = RMSNorm(self.dummy_dim,
-                                  eps=config.layer_norm_eps,
-                                  var_hidden_size=self.embed_dim)
+            self.q_norm = RMSNorm(
+                self.dummy_dim,
+                eps=config.layer_norm_eps,
+                var_hidden_size=self.embed_dim,
+            )
+            self.k_norm = RMSNorm(
+                self.dummy_dim,
+                eps=config.layer_norm_eps,
+                var_hidden_size=self.embed_dim,
+            )
 
         self.proj = RowParallelLinear(
             self.dummy_dim,
@@ -192,8 +206,9 @@ class InternParallelAttention(nn.Module):
             disable_tp=use_data_parallel,
         )
 
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = MultiHeadAttention(
+            self.num_heads_per_partition, self.head_dim, self.scale
+        )
 
     def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
         if self.tp_size > 1:
@@ -202,8 +217,7 @@ class InternParallelAttention(nn.Module):
         q = self.q_norm(q)
         k = self.k_norm(k)
         if self.tp_size > 1:
-            splitter = partial(split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
         return q, k
@@ -222,11 +236,10 @@ class InternParallelAttention(nn.Module):
 
 
 class InternMLP(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -234,18 +247,22 @@ class InternMLP(nn.Module):
 
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1",
-                                        disable_tp=use_data_parallel)
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2",
-                                     disable_tp=use_data_parallel)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -256,11 +273,10 @@ class InternMLP(nn.Module):
 
 
 class InternVisionEncoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
@@ -272,71 +288,70 @@ class InternVisionEncoderLayer(nn.Module):
         self.intermediate_size = config.intermediate_size
         self.norm_type = config.norm_type
 
-        self.attn = self._init_attn(config,
-                                    quant_config,
-                                    num_dummy_heads=num_dummy_heads,
-                                    prefix=f"{prefix}.attn",
-                                    use_data_parallel=use_data_parallel)
-
-        self.mlp = InternMLP(config,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.mlp",
-                             use_data_parallel=use_data_parallel)
-        self.norm1 = NORM2FN[self.norm_type](self.embed_dim,
-                                             eps=config.layer_norm_eps)
-        self.norm2 = NORM2FN[self.norm_type](self.embed_dim,
-                                             eps=config.layer_norm_eps)
-
-        self.ls1 = nn.Parameter(config.initializer_factor *
-                                torch.ones(self.embed_dim))
-        self.ls2 = nn.Parameter(config.initializer_factor *
-                                torch.ones(self.embed_dim))
+        self.attn = self._init_attn(
+            config,
+            quant_config,
+            num_dummy_heads=num_dummy_heads,
+            prefix=f"{prefix}.attn",
+            use_data_parallel=use_data_parallel,
+        )
+
+        self.mlp = InternMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+        self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+
+        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
 
     def _init_attn(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         num_dummy_heads: int,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
         # fallback to sdpa attention if tp unavailable
-        tp_size = (1 if use_data_parallel else
-                   get_tensor_model_parallel_world_size())
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
         num_heads = config.num_attention_heads
 
         # if the number of heads is not divisible by tp_size,
         # we also disable Attention's TP
-        use_data_parallel = (use_data_parallel
-                             or (num_heads + num_dummy_heads) % tp_size != 0)
-        return InternParallelAttention(config,
-                                       quant_config=quant_config,
-                                       num_dummy_heads=num_dummy_heads,
-                                       prefix=prefix,
-                                       use_data_parallel=use_data_parallel)
+        use_data_parallel = (
+            use_data_parallel or (num_heads + num_dummy_heads) % tp_size != 0
+        )
+        return InternParallelAttention(
+            config,
+            quant_config=quant_config,
+            num_dummy_heads=num_dummy_heads,
+            prefix=prefix,
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
     ):
-        hidden_states = hidden_states + self.attn(
-            self.norm1(hidden_states)) * self.ls1
+        hidden_states = hidden_states + self.attn(self.norm1(hidden_states)) * self.ls1
 
-        hidden_states = hidden_states + self.mlp(
-            self.norm2(hidden_states)) * self.ls2
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) * self.ls2
 
         return hidden_states
 
 
 class InternVisionEncoder(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -350,17 +365,20 @@ class InternVisionEncoder(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layers = nn.ModuleList([
-            InternVisionEncoderLayer(config,
-                                     quant_config,
-                                     num_dummy_heads=num_dummy_heads,
-                                     prefix=f"{prefix}.layers.{layer_idx}",
-                                     use_data_parallel=use_data_parallel)
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                InternVisionEncoderLayer(
+                    config,
+                    quant_config,
+                    num_dummy_heads=num_dummy_heads,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(self, inputs_embeds: torch.Tensor):
-
         hidden_states = inputs_embeds
         for encoder_layer in self.layers:
             hidden_states = encoder_layer(hidden_states)
@@ -369,7 +387,6 @@ class InternVisionEncoder(nn.Module):
 
 
 class InternVisionModel(nn.Module):
-
     packed_modules_mapping = {
         "qkv": ["qkv"],
     }
@@ -377,9 +394,9 @@ class InternVisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -404,12 +421,11 @@ class InternVisionModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
-            raise ValueError(
-                'You have to specify pixel_values or pixel_embeds')
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
 
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
@@ -417,25 +433,21 @@ class InternVisionModel(nn.Module):
             if pixel_values.ndim == 4:
                 hidden_states = self.embeddings(pixel_values)
             else:
-                raise ValueError(
-                    f'wrong pixel_values size: {pixel_values.shape}')
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
 
         if self.use_data_parallel:
-            encoder_outputs = run_dp_sharded_vision_model(
-                hidden_states, self.encoder)
+            encoder_outputs = run_dp_sharded_vision_model(hidden_states, self.encoder)
         else:
             encoder_outputs = self.encoder(inputs_embeds=hidden_states)
 
         return encoder_outputs
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 221ff08b43843034b00a2da5725cf3fcc6a28bba..60fbeb842dd4bb92bd4582a4bfcaf204578c450d 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -4,7 +4,7 @@
 from collections.abc import Iterable
 from functools import partial
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -13,39 +13,48 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .interfaces_base import default_pooling_type
-from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class InternLM2MLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -64,8 +73,9 @@ class InternLM2MLP(nn.Module):
             prefix=f"{prefix}.w2",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -76,17 +86,16 @@ class InternLM2MLP(nn.Module):
 
 
 class InternLM2Attention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -157,16 +166,16 @@ class InternLM2Attention(nn.Module):
             qkv = qkv[::3] + qkv[1::3] + qkv[2::3]
             qkv = torch.cat(qkv, dim=-1)
 
-        qkv = qkv.view(seq_len, self.total_num_kv_heads,
-                       self.key_value_groups + 2, self.head_dim)
+        qkv = qkv.view(
+            seq_len, self.total_num_kv_heads, self.key_value_groups + 2, self.head_dim
+        )
         q, k, v = torch.split(qkv, [self.key_value_groups, 1, 1], dim=-2)
         q = q.reshape(seq_len, self.q_size * self.tp_size)
         k = k.reshape(seq_len, self.kv_size * self.tp_size)
         v = v.reshape(seq_len, self.kv_size * self.tp_size)
 
         if self.tp_size > 1:
-            splitter = partial(split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
             v = splitter(v)[self.tp_rank]
@@ -186,20 +195,18 @@ class InternLM2Attention(nn.Module):
 
 
 class InternLMDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.attention = InternLM2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -218,23 +225,21 @@ class InternLMDecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.feed_forward",
         )
-        self.attention_norm = RMSNorm(config.hidden_size,
-                                      eps=config.rms_norm_eps)
+        self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.attention_norm(hidden_states)
         else:
-            hidden_states, residual = self.attention_norm(
-                hidden_states, residual)
+            hidden_states, residual = self.attention_norm(hidden_states, residual)
         hidden_states = self.attention(
             positions=positions,
             hidden_states=hidden_states,
@@ -248,13 +253,13 @@ class InternLMDecoderLayer(nn.Module):
 
 @support_torch_compile
 class InternLM2Model(nn.Module):
-
     def __init__(
-            self,
-            *,
-            vllm_config: VllmConfig,
-            prefix: str = "",
-            layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer):
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
@@ -270,28 +275,30 @@ class InternLM2Model(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: layer_type(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.tok_embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -300,10 +307,9 @@ class InternLM2Model(nn.Module):
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(positions, hidden_states, residual)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -314,55 +320,59 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         "gate_up_proj": ["w1", "w3"],
     }
 
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 model_type: type[InternLM2Model] = InternLM2Model):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        model_type: type[InternLM2Model] = InternLM2Model,
+    ):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        self.lora_config = lora_config
-
-        self.model = model_type(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self.output = ParallelLMHead(config.vocab_size,
-                                     config.hidden_size,
-                                     quant_config=quant_config,
-                                     prefix=maybe_prefix(prefix, "output"))
+
+        self.model = model_type(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.output = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "output"),
+        )
         if self.config.tie_word_embeddings:
             self.output.weight = self.model.tok_embeddings.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.output, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "w1", 0),
@@ -373,7 +383,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -393,8 +403,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -402,7 +411,6 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
 
 @default_pooling_type("ALL")
 class InternLM2ForRewardModel(InternLM2ForCausalLM):
-
     is_pooling_model = True
 
     def __init__(
@@ -412,9 +420,7 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
         prefix: str = "",
         model_type: type[InternLM2Model] = InternLM2Model,
     ):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         model_type=model_type)
+        super().__init__(vllm_config=vllm_config, prefix=prefix, model_type=model_type)
 
         for attr in ("output", "logits_processor"):
             delattr(self, attr)
@@ -422,29 +428,33 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
         config = vllm_config.model_config.hf_config
         self.head_dtype = vllm_config.model_config.head_dtype
 
-        self.v_head = RowParallelLinear(config.hidden_size,
-                                        1,
-                                        bias=False,
-                                        input_is_parallel=False,
-                                        params_dtype=self.head_dtype,
-                                        prefix=maybe_prefix(prefix, "v_head"),
-                                        return_bias=False)
+        self.v_head = RowParallelLinear(
+            config.hidden_size,
+            1,
+            bias=False,
+            input_is_parallel=False,
+            params_dtype=self.head_dtype,
+            prefix=maybe_prefix(prefix, "v_head"),
+            return_bias=False,
+        )
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
         self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)}, )
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         hidden_states = hidden_states.to(self.head_dtype)
         logits = self.v_head(hidden_states)
         return logits
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index d41ac2b70bc697d37f513c40ad7fc4032c32bce0..6dc081e34157b92bfc3097e849d0a3508f542281 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -12,27 +11,28 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.models.internlm2 import (InternLM2Attention,
-                                                  InternLM2ForCausalLM,
-                                                  InternLM2MLP, InternLM2Model)
+from vllm.model_executor.models.internlm2 import (
+    InternLM2Attention,
+    InternLM2ForCausalLM,
+    InternLM2MLP,
+    InternLM2Model,
+)
 from vllm.sequence import IntermediateTensors
 
 
 class InternLM2VEDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.attention = InternLM2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -58,24 +58,22 @@ class InternLM2VEDecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.feed_forward_ve",
         )
-        self.attention_norm = RMSNorm(config.hidden_size,
-                                      eps=config.rms_norm_eps)
+        self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-        visual_token_mask: Optional[torch.Tensor] = None,
+        residual: torch.Tensor | None,
+        visual_token_mask: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.attention_norm(hidden_states)
         else:
-            hidden_states, residual = self.attention_norm(
-                hidden_states, residual)
+            hidden_states, residual = self.attention_norm(hidden_states, residual)
         hidden_states = self.attention(
             positions=positions,
             hidden_states=hidden_states,
@@ -84,36 +82,34 @@ class InternLM2VEDecoderLayer(nn.Module):
         # Fully Connected
         hidden_states, residual = self.ffn_norm(hidden_states, residual)
         if visual_token_mask is not None and visual_token_mask.any():
-            visual_token_mask = visual_token_mask.repeat(
-                1, self.hidden_size).bool()
+            visual_token_mask = visual_token_mask.repeat(1, self.hidden_size).bool()
             text_token_mask = ~visual_token_mask
             hidden_states[visual_token_mask] = self.feed_forward_ve(
-                hidden_states[visual_token_mask].reshape(
-                    -1, self.hidden_size)).flatten()
+                hidden_states[visual_token_mask].reshape(-1, self.hidden_size)
+            ).flatten()
             if text_token_mask.any():
                 hidden_states[text_token_mask] = self.feed_forward(
-                    hidden_states[text_token_mask].reshape(
-                        -1, self.hidden_size)).flatten()
+                    hidden_states[text_token_mask].reshape(-1, self.hidden_size)
+                ).flatten()
         else:
             hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual
 
 
 class InternLM2VEModel(InternLM2Model):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=InternLM2VEDecoderLayer)
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, layer_type=InternLM2VEDecoderLayer
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        visual_token_mask: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        visual_token_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -132,17 +128,15 @@ class InternLM2VEModel(InternLM2Model):
                 visual_token_mask=visual_token_mask,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
 
 class InternLM2VEForCausalLM(InternLM2ForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         model_type=InternLM2VEModel)
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, model_type=InternLM2VEModel
+        )
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index 197d629b906fec0b004ccb9173280b086c10eeb6..c2195fd0cb88d4d65a85cc1db21fea6137e11699 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -7,7 +7,7 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import regex as re
 import torch
@@ -15,48 +15,69 @@ import torch.nn as nn
 from transformers import BatchFeature, InternVLProcessor, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
-    GotOcr2ImageProcessorFast)
+    GotOcr2ImageProcessorFast,
+)
 from transformers.models.internvl.video_processing_internvl import (
-    InternVLVideoProcessor)
+    InternVLVideoProcessor,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.interns1_vit import InternS1VisionModel
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processor import (
-    cached_video_processor_from_config)
+from vllm.transformers_utils.processor import cached_video_processor_from_config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 
 class InternS1MultiModalProjector(nn.Module):
-
     def __init__(self, config):
         super().__init__()
-        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size *
-                                       int(1 / config.downsample_ratio)**2)
+        self.layer_norm = nn.LayerNorm(
+            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2
+        )
         self.linear_1 = nn.Linear(
-            config.vision_config.hidden_size *
-            int(1 / config.downsample_ratio)**2,
-            config.text_config.hidden_size)
+            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2,
+            config.text_config.hidden_size,
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size,
-                                  config.text_config.hidden_size)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size
+        )
 
     def forward(self, image_features):
         hidden_states = self.layer_norm(image_features)
@@ -75,6 +96,7 @@ class InternS1ImagePixelInputs(TensorSchema):
         - w: Width
         - bn: Batch size * number of images
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
@@ -87,13 +109,12 @@ class InternS1ImageEmbeddingInputs(TensorSchema):
         - tifs: Total image feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("ni", "tifs", "hs")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("ni", "tifs", "hs")]
 
 
-InternS1ImageInputs = Union[InternS1ImagePixelInputs,
-                            InternS1ImageEmbeddingInputs]
+InternS1ImageInputs: TypeAlias = InternS1ImagePixelInputs | InternS1ImageEmbeddingInputs
 
 
 class InternS1VideoPixelInputs(TensorSchema):
@@ -105,6 +126,7 @@ class InternS1VideoPixelInputs(TensorSchema):
         - h: Height
         - w: Width
     """
+
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
     pixel_values: Annotated[torch.Tensor, TensorShape("bnv", 3, "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
@@ -117,13 +139,12 @@ class InternS1VideoEmbeddingInputs(TensorSchema):
         - tvfs: Total video feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["video_embeds"] = "video_embeds"
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("nv", "tvfs", "hs")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("nv", "tvfs", "hs")]
 
 
-InternS1VideoInputs = Union[InternS1VideoPixelInputs,
-                            InternS1VideoEmbeddingInputs]
+InternS1VideoInputs: TypeAlias = InternS1VideoPixelInputs | InternS1VideoEmbeddingInputs
 
 
 def resolve_interns1_min_max_num(
@@ -145,10 +166,13 @@ def get_interns1_target_ratios(
     min_num: int,
     max_num: int,
 ) -> list[tuple[int, int]]:
-    target_ratios = {(i, j)
-                     for n in range(min_num, max_num + 1)
-                     for i in range(1, n + 1)
-                     for j in range(1, n + 1) if min_num <= i * j <= max_num}
+    target_ratios = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if min_num <= i * j <= max_num
+    }
     return sorted(target_ratios, key=lambda x: x[0] * x[1])
 
 
@@ -160,10 +184,12 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         hf_processor.video_processor = cached_video_processor_from_config(
             self.ctx.model_config,
             processor_cls=InternVLVideoProcessor,
-            **kwargs)
+            size=hf_processor.image_processor.size,
+            **kwargs,
+        )
         return hf_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_num_image_tokens(
@@ -171,21 +197,22 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional['GotOcr2ImageProcessorFast'] = None,
+        processor: GotOcr2ImageProcessorFast | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor().image_processor
 
         if not isinstance(processor, GotOcr2ImageProcessorFast):
-            raise ValueError(f'GotOcr2ImageProcessorFast is expected but got '
-                             f'{type(processor)}')
+            raise ValueError(
+                f"GotOcr2ImageProcessorFast is expected but got {type(processor)}"
+            )
         num_image_patches = processor.get_number_of_image_patches(
-            image_height, image_width, images_kwargs=dict())
-        num_image_tokens = self.get_hf_processor(
-        ).image_seq_length * num_image_patches
+            image_height, image_width, images_kwargs=dict()
+        )
+        num_image_tokens = self.get_hf_processor().image_seq_length * num_image_patches
         return num_image_tokens
 
-    def resolve_target_ratios(self, use_thumbnail: Optional[bool] = None):
+    def resolve_target_ratios(self, use_thumbnail: bool | None = None):
         image_processor = self.get_hf_processor().image_processor
         min_dynamic_patch = image_processor.min_patches
         max_dynamic_patch = image_processor.max_patches
@@ -197,7 +224,8 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
             min_dynamic_patch,
             max_dynamic_patch,
             dynamic_image_size,
-            use_thumbnail=use_thumbnail)
+            use_thumbnail=use_thumbnail,
+        )
 
         return get_interns1_target_ratios(min_num, max_num)
 
@@ -219,11 +247,11 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
+                largest_feature_pinpoint = ImageSize(width=width, height=height)
 
-        assert not (largest_feature_size == 0 or largest_feature_pinpoint
-                    is None), ("Cannot have a largest feature size of 0!")
+        assert not (largest_feature_size == 0 or largest_feature_pinpoint is None), (
+            "Cannot have a largest feature size of 0!"
+        )
 
         return largest_feature_pinpoint
 
@@ -248,15 +276,13 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         processor = self.get_hf_processor()
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = (seq_len -
-                            max_image_tokens) // processor.image_seq_length
+        max_total_frames = (seq_len - max_image_tokens) // processor.image_seq_length
         max_frames_per_video = max_total_frames // max(max_videos, 1)
 
         return max(max_frames_per_video, 1)
 
 
-class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
-                                 ):
+class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]):
     """DummyInputsBuilder for InternS1-style models."""
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -271,33 +297,40 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-                self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
         config = self.info.get_hf_config()
         image_size_h, image_size_w = config.vision_config.image_size
 
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(width=image_size_w,
-                                   height=image_size_h,
-                                   num_frames=target_num_frames,
-                                   num_videos=num_videos),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
+                width=image_size_w,
+                height=image_size_h,
+                num_frames=target_num_frames,
+                num_videos=num_videos,
+                overrides=video_overrides,
+            ),
         }
 
 
-class InternS1MultiModalProcessor(
-        BaseMultiModalProcessor[InternS1ProcessingInfo]):
-    """ Basic image-only MultiModalProcessor for InternS1-style models."""
+class InternS1MultiModalProcessor(BaseMultiModalProcessor[InternS1ProcessingInfo]):
+    """Basic image-only MultiModalProcessor for InternS1-style models."""
 
     def _call_hf_processor(
         self,
@@ -305,7 +338,7 @@ class InternS1MultiModalProcessor(
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
-    ) -> Mapping[str, NestedTensors]:
+    ) -> BatchFeature:
         mm_data = dict(mm_data)
         videos = mm_data.pop("videos", [])
         images = mm_data.pop("images", [])
@@ -314,15 +347,14 @@ class InternS1MultiModalProcessor(
 
         hf_processor = self.info.get_hf_processor(**mm_kwargs)
         tokenizer = hf_processor.tokenizer
-        video_token_id = tokenizer.encode(hf_processor.video_token,
-                                          add_special_tokens=False)
+        video_token_id = tokenizer.encode(
+            hf_processor.video_token, add_special_tokens=False
+        )
         assert len(video_token_id) == 1
         video_token_id = video_token_id[0]
 
-        prompt = re.sub(hf_processor.image_token, "<image_placeholder>",
-                        prompt)
-        prompt = re.sub(hf_processor.video_token, "<video_placeholder>",
-                        prompt)
+        prompt = re.sub(hf_processor.image_token, "<image_placeholder>", prompt)
+        prompt = re.sub(hf_processor.video_token, "<video_placeholder>", prompt)
 
         image_outputs = {}
         if images:
@@ -334,16 +366,14 @@ class InternS1MultiModalProcessor(
                     mm_kwargs=mm_kwargs,
                     tok_kwargs=tok_kwargs,
                 )
-                image_pixel_values.append(
-                    processed_outputs.pop("pixel_values"))
+                image_pixel_values.append(processed_outputs.pop("pixel_values"))
 
                 input_ids = processed_outputs.pop("input_ids")
                 image_placeholder = tokenizer.batch_decode(input_ids)[0]
-                prompt = prompt.replace("<image_placeholder>",
-                                        image_placeholder, 1)
+                prompt = prompt.replace("<image_placeholder>", image_placeholder, 1)
 
             num_patches = [len(item) for item in image_pixel_values]
-            image_outputs: dict[str, NestedTensors] = {
+            image_outputs = {
                 "pixel_values": torch.concat(image_pixel_values),
                 "image_num_patches": torch.tensor(num_patches),
                 "image_token_id": torch.tensor(hf_processor.image_token_id),
@@ -359,43 +389,32 @@ class InternS1MultiModalProcessor(
                     mm_kwargs=mm_kwargs,
                     tok_kwargs=tok_kwargs,
                 )
-                video_pixel_values.append(
-                    processed_outputs.pop("pixel_values"))
+                video_pixel_values.append(processed_outputs.pop("pixel_values"))
 
                 input_ids = processed_outputs.pop("input_ids")
-                input_ids[input_ids ==
-                          hf_processor.image_token_id] = video_token_id
+                input_ids[input_ids == hf_processor.image_token_id] = video_token_id
 
                 video_placeholder = tokenizer.batch_decode(input_ids)[0]
-                prompt = prompt.replace("<video_placeholder>",
-                                        video_placeholder, 1)
+                prompt = prompt.replace("<video_placeholder>", video_placeholder, 1)
 
             num_frames = [len(item) for item in video_pixel_values]
-            video_outputs: dict[str, NestedTensors] = {
+            video_outputs = {
                 "pixel_values_videos": torch.concat(video_pixel_values),
                 "video_num_patches": torch.tensor(num_frames),
                 "video_token_id": torch.tensor(video_token_id),
             }
 
-        prompt = re.sub("<image_placeholder>", hf_processor.image_token,
-                        prompt)
-        prompt = re.sub("<video_placeholder>", hf_processor.video_token,
-                        prompt)
+        prompt = re.sub("<image_placeholder>", hf_processor.image_token, prompt)
+        prompt = re.sub("<video_placeholder>", hf_processor.video_token, prompt)
         text_outputs = tokenizer(prompt, **tok_kwargs, return_tensors="pt")
 
-        combined_outputs = dict(
-            **text_outputs,
-            **image_outputs,
-            **video_outputs,
-        )
-        return BatchFeature(combined_outputs)
+        return BatchFeature({**text_outputs, **image_outputs, **video_outputs})
 
     def _get_mm_fields_config(
         self,
-        hf_inputs: Mapping[str, NestedTensors],
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
         video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
         num_images = len(image_num_patches)
@@ -403,12 +422,14 @@ class InternS1MultiModalProcessor(
 
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_num_patches),
+                "image", image_num_patches
+            ),
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
             image_token_id=MultiModalFieldConfig.shared("image", num_images),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_num_patches),
+                "video", video_num_patches
+            ),
             video_num_patches=MultiModalFieldConfig.batched("video"),
             video_token_id=MultiModalFieldConfig.shared("video", num_videos),
         )
@@ -442,7 +463,8 @@ class InternS1MultiModalProcessor(
 
         def get_replacement_interns1_image(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
@@ -452,19 +474,16 @@ class InternS1MultiModalProcessor(
 
             repl_features = img_context_token * feature_size
             repl_full = start_image_token + repl_features + end_image_token
-            return PromptUpdateDetails.select_text(repl_full,
-                                                   img_context_token)
+            return PromptUpdateDetails.select_text(repl_full, img_context_token)
 
         def get_replacement_interns1_video(item_idx: int):
             num_patches = video_num_patches[item_idx]
             repl_features = video_token * hf_processor.image_seq_length
-            repl_features_with_sep = (start_image_token + repl_features +
-                                      end_image_token)
+            repl_features_with_sep = start_image_token + repl_features + end_image_token
             # num_patches is equal to num_frames
-            repl_full = '\n'.join([
-                f'Frame{i+1}: {repl_features_with_sep}'
-                for i in range(num_patches)
-            ])
+            repl_full = "\n".join(
+                [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
+            )
 
             return PromptUpdateDetails.select_text(repl_full, video_token)
 
@@ -485,9 +504,12 @@ class InternS1MultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     InternS1MultiModalProcessor,
     info=InternS1ProcessingInfo,
-    dummy_inputs=InternS1DummyInputsBuilder)
-class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                       SupportsPP, SupportsLoRA):
+    dummy_inputs=InternS1DummyInputsBuilder,
+)
+class InternS1ForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
+):
+    merge_by_field_config = True
 
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
@@ -496,14 +518,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.language_model.": "language_model.model.",
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         # transformers InternVLProcessor uses <IMG_CONTEXT> as the separator
         # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
         if modality.startswith("image"):
-            return '<IMG_CONTEXT>'
+            return "<IMG_CONTEXT>"
         if modality.startswith("video"):
             return "<video>"
 
@@ -522,7 +545,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         patch_size = config.vision_config.patch_size[0]
         self.patch_size = patch_size
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.downsample_ratio = config.downsample_ratio
 
         self.llm_arch_name = config.text_config.architectures[0]
@@ -545,12 +569,13 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         prefix: str,
     ):
@@ -562,7 +587,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
             prefix=prefix,
         )
 
-    def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
+    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
         return InternS1MultiModalProjector(config)
 
     def pixel_shuffle(self, x, scale_factor=0.5):
@@ -571,8 +596,12 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
         # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
         x = x.permute(0, 2, 1, 3).contiguous()
-        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
-                   int(c / (scale_factor * scale_factor)))
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
         x = x.permute(0, 2, 1, 3).contiguous()
         return x
 
@@ -580,18 +609,17 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         vit_embeds = self.vision_tower(pixel_values=pixel_values)
         vit_embeds = vit_embeds[:, 1:, :]
 
-        h = w = int(vit_embeds.shape[1]**0.5)
+        h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds,
-                                        scale_factor=self.downsample_ratio)
-        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
-                                        vit_embeds.shape[-1])
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
 
         vit_embeds = self.multi_modal_projector(vit_embeds)
         return vit_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[InternS1ImageInputs]:
+        self, **kwargs: object
+    ) -> InternS1ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -600,31 +628,19 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return InternS1ImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         image_token_id = kwargs["image_token_id"]
-        assert isinstance(image_token_id, torch.Tensor)
-        self.img_context_token_id = image_token_id.flatten().unique().item()
-
-        if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
+        if isinstance(image_token_id, torch.Tensor):
+            image_token_id = image_token_id.flatten().unique().item()
 
-            if not isinstance(image_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(image_num_patches)}")
-
-            pixel_values = flatten_bn(pixel_values, concat=True)
-            image_num_patches = flatten_bn(image_num_patches, concat=True)
+        assert isinstance(image_token_id, int)
+        self.img_context_token_id = image_token_id
 
+        if pixel_values is not None:
             h, w = self.config.vision_config.image_size
             return InternS1ImagePixelInputs(
                 type="pixel_values",
@@ -639,7 +655,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         raise AssertionError("This line should be unreachable.")
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[InternS1VideoPixelInputs]:
+        self, **kwargs: object
+    ) -> InternS1VideoInputs | None:
         pixel_values_flat_video = kwargs.pop("pixel_values_videos", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("video_embeds", None)
@@ -648,32 +665,19 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if video_embeds is not None:
-            if not isinstance(video_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of video embeddings. "
-                                 f"Got type: {type(video_embeds)}")
-
-            return InternS1ImageEmbeddingInputs(
+            return InternS1VideoEmbeddingInputs(
                 type="video_embeds",
-                data=flatten_bn(video_embeds),
+                data=video_embeds,
             )
 
         video_token_id = kwargs["video_token_id"]
-        assert isinstance(video_token_id, torch.Tensor)
-        self.video_context_token_id = video_token_id.flatten().unique().item()
-
-        if pixel_values_flat_video is not None:
-            if not isinstance(pixel_values_flat_video, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat_video)}")
+        if isinstance(video_token_id, torch.Tensor):
+            video_token_id = video_token_id.flatten().unique().item()
 
-            if not isinstance(video_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(video_num_patches)}")
-
-            pixel_values_flat_video = flatten_bn(pixel_values_flat_video,
-                                                 concat=True)
-            video_num_patches = flatten_bn(video_num_patches, concat=True)
+        assert isinstance(video_token_id, int)
+        self.video_context_token_id = video_token_id
 
+        if pixel_values_flat_video is not None:
             h, w = self.config.vision_config.image_size
             return InternS1VideoPixelInputs(
                 type="pixel_values_videos",
@@ -687,11 +691,14 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         raise AssertionError("This line should be unreachable.")
 
-    def _process_image_input(
+    def _process_vision_input(
         self,
-        image_input: Union[InternS1ImageInputs, InternS1VideoPixelInputs],
+        image_input: InternS1ImageInputs | InternS1VideoInputs,
     ) -> tuple[torch.Tensor, ...]:
-        if image_input["type"] == "image_embeds":
+        if (
+            image_input["type"] == "image_embeds"
+            or image_input["type"] == "video_embeds"
+        ):
             return image_input["data"]
 
         assert self.vision_tower is not None
@@ -702,14 +709,12 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return (image_embeds.view(-1,
-                                      self.config.text_config.hidden_size), )
+            return (image_embeds.view(-1, self.config.text_config.hidden_size),)
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
         feature_size = image_embeds.shape[1]
-        image_embeds = image_embeds.view(-1,
-                                         self.config.text_config.hidden_size)
+        image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
         image_feature_sizes = [
             num_patches * feature_size for num_patches in num_patches
         ]
@@ -721,14 +726,13 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in (
-                    "pixel_values_videos", ) and "videos" not in modalities:
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if input_key in ("pixel_values_videos",) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
@@ -738,9 +742,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
@@ -754,59 +756,49 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_vision_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
-                video_embeddings = self._process_image_input(video_input)
-                multimodal_embeddings += video_embeddings
+                video_embeddings = self._process_vision_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            context_token_ids = [
-                token_id for token_id in (self.img_context_token_id,
-                                          self.video_context_token_id)
-                if token_id is not None
-            ]
-            assert len(context_token_ids) >= 1
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
             self._set_visual_token_mask(input_ids)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                context_token_ids,
-            )
-        return inputs_embeds
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
-
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         forward_kwargs = {
             "input_ids": input_ids,
             "positions": positions,
@@ -820,11 +812,10 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -835,4 +826,5 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="multi_modal_projector",
-            tower_model="vision_tower")
+            tower_model="vision_tower",
+        )
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index eb6b685d03dc5c3f3e221b75aad6304a46e77cdf..507503d75046dbcc332afc92ee97a18f3af8a815 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -8,7 +8,6 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -18,48 +17,45 @@ from transformers.utils import torch_int
 from vllm.attention.layer import MultiHeadAttention
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 NORM2FN = {
-    'rms_norm': RMSNorm,
-    'layer_norm': nn.LayerNorm,
+    "rms_norm": RMSNorm,
+    "layer_norm": nn.LayerNorm,
 }
 
 
 class InternS1VisionPatchEmbeddings(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         image_size, patch_size = config.image_size, config.patch_size
         num_channels, hidden_size = config.num_channels, config.hidden_size
 
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
-                                                          patch_size[0])
-        patch_shape = (image_size[0] // patch_size[0],
-                       image_size[1] // patch_size[1])
+        num_patches = (image_size[1] // patch_size[1]) * (
+            image_size[0] // patch_size[0]
+        )
+        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
         self.image_size = image_size
         self.patch_size = patch_size
         self.num_channels = num_channels
         self.num_patches = num_patches
         self.patch_shape = patch_shape
 
-        self.projection = nn.Conv2d(num_channels,
-                                    hidden_size,
-                                    kernel_size=patch_size,
-                                    stride=patch_size)
+        self.projection = nn.Conv2d(
+            num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
+        )
 
     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         batch_size, num_channels, height, width = pixel_values.shape
         if num_channels != self.num_channels:
             raise ValueError(
                 "Make sure that the channel dimension of the pixel values "
-                "match with the one set in the configuration.")
+                "match with the one set in the configuration."
+            )
 
-        embeddings = self.projection(
-            pixel_values.to(self.projection.weight.dtype))
+        embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
         patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)
 
@@ -67,30 +63,32 @@ class InternS1VisionPatchEmbeddings(nn.Module):
 
 
 class InternS1VisionEmbeddings(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
         self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
         if config.use_mask_token:
-            self.mask_token = nn.Parameter(
-                torch.zeros(1, 1, config.hidden_size))
+            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
         else:
             self.mask_token = None
         self.patch_embeddings = InternS1VisionPatchEmbeddings(config)
         self.patch_size = config.patch_size
-        self.image_size = (config.image_size if isinstance(
-            config.image_size, Iterable) else
-                           (config.image_size, config.image_size))
+        self.image_size = (
+            config.image_size
+            if isinstance(config.image_size, Iterable)
+            else (config.image_size, config.image_size)
+        )
         num_patches = self.patch_embeddings.num_patches
         if config.use_absolute_position_embeddings:
             self.position_embeddings = nn.Parameter(
-                torch.zeros(1, num_patches + 1, config.hidden_size))
+                torch.zeros(1, num_patches + 1, config.hidden_size)
+            )
         else:
             self.position_embeddings = None
 
-    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int,
-                                 width: int) -> torch.Tensor:
+    def interpolate_pos_encoding(
+        self, embeddings: torch.Tensor, height: int, width: int
+    ) -> torch.Tensor:
         """
         This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
         images. This method is also adapted to support torch.jit tracing.
@@ -105,8 +103,11 @@ class InternS1VisionEmbeddings(nn.Module):
 
         # always interpolate when tracing to ensure the exported model
         # works for dynamic input shapes
-        if not torch.jit.is_tracing(
-        ) and num_patches == num_positions and height == width:
+        if (
+            not torch.jit.is_tracing()
+            and num_patches == num_positions
+            and height == width
+        ):
             return self.position_embeddings
 
         class_pos_embed = self.position_embeddings[:, :1]
@@ -118,8 +119,9 @@ class InternS1VisionEmbeddings(nn.Module):
         new_width = width // self.patch_size[1]
 
         sqrt_num_positions = torch_int(num_positions**0.5)
-        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions,
-                                                  sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
+        )
         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
 
         patch_pos_embed = nn.functional.interpolate(
@@ -136,11 +138,10 @@ class InternS1VisionEmbeddings(nn.Module):
     def forward(
         self,
         pixel_values: torch.Tensor,
-        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        bool_masked_pos: torch.BoolTensor | None = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height,
-                     patch_width) = self.patch_embeddings(pixel_values)
+        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()
 
         if bool_masked_pos is not None:
@@ -154,7 +155,8 @@ class InternS1VisionEmbeddings(nn.Module):
 
         if self.position_embeddings is not None:
             embeddings = embeddings + self.interpolate_pos_encoding(
-                embeddings, height, width)
+                embeddings, height, width
+            )
 
         return embeddings, (patch_height, patch_width)
 
@@ -176,51 +178,54 @@ class InternSdpaAttention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                f'embed_dim must be divisible by num_heads '
-                f'(got `embed_dim`: {self.embed_dim} and `num_heads`:'
-                f' {self.num_heads}).')
+                f"embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
 
         # Additional dummy heads are used to enable TP for common GPU counts.
         self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim
 
         self.scale = self.head_dim**-0.5
 
-        self.q_proj = nn.Linear(self.embed_dim,
-                                self.num_heads * self.head_dim,
-                                bias=config.attention_bias)
-        self.k_proj = nn.Linear(self.embed_dim,
-                                self.num_heads * self.head_dim,
-                                bias=config.attention_bias)
-        self.v_proj = nn.Linear(self.embed_dim,
-                                self.num_heads * self.head_dim,
-                                bias=config.attention_bias)
+        self.q_proj = nn.Linear(
+            self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias
+        )
 
         self.qk_normalization = config.use_qk_norm
         if self.qk_normalization:
-            self.q_norm = RMSNorm(self.dummy_dim,
-                                  eps=config.layer_norm_eps,
-                                  var_hidden_size=self.embed_dim)
-            self.k_norm = RMSNorm(self.dummy_dim,
-                                  eps=config.layer_norm_eps,
-                                  var_hidden_size=self.embed_dim)
+            self.q_norm = RMSNorm(
+                self.dummy_dim,
+                eps=config.layer_norm_eps,
+                var_hidden_size=self.embed_dim,
+            )
+            self.k_norm = RMSNorm(
+                self.dummy_dim,
+                eps=config.layer_norm_eps,
+                var_hidden_size=self.embed_dim,
+            )
 
         self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim)
 
         # Use unified MultiHeadAttention with automatic backend selection
-        self.attn = MultiHeadAttention(self.num_heads, self.head_dim,
-                                       self.scale)
+        self.attn = MultiHeadAttention(self.num_heads, self.head_dim, self.scale)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        B, N, C = x.shape
+        """x shape: (B, N, C)"""
 
         q = self.q_proj(x)
         k = self.k_proj(x)
         v = self.v_proj(x)
 
         if self.qk_normalization:
-            B_, N_, H_, D_ = q.shape
-            q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_)
-            k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_)
+            q = self.q_norm(q)
+            k = self.k_norm(k)
 
         # Use unified MultiHeadAttention with automatic backend selection
         x = self.attn(q, k, v)
@@ -230,27 +235,30 @@ class InternSdpaAttention(nn.Module):
 
 
 class InternS1VisionMLP(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -261,42 +269,45 @@ class InternS1VisionMLP(nn.Module):
 
 
 class InternS1VisionLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
-        self.attention = self._init_attn(config,
-                                         quant_config,
-                                         num_dummy_heads=num_dummy_heads,
-                                         prefix=f"{prefix}.attention")
+        self.attention = self._init_attn(
+            config,
+            quant_config,
+            num_dummy_heads=num_dummy_heads,
+            prefix=f"{prefix}.attention",
+        )
 
-        self.mlp = InternS1VisionMLP(config,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.mlp")
+        self.mlp = InternS1VisionMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
         self.layernorm_before = NORM2FN[config.norm_type](
-            config.hidden_size, eps=config.layer_norm_eps)
+            config.hidden_size, eps=config.layer_norm_eps
+        )
         self.layernorm_after = NORM2FN[config.norm_type](
-            config.hidden_size, eps=config.layer_norm_eps)
+            config.hidden_size, eps=config.layer_norm_eps
+        )
 
         init_values = config.layer_scale_init_value
-        self.lambda_1 = nn.Parameter(init_values *
-                                     torch.ones(config.hidden_size),
-                                     requires_grad=True)
-        self.lambda_2 = nn.Parameter(init_values *
-                                     torch.ones(config.hidden_size),
-                                     requires_grad=True)
+        self.lambda_1 = nn.Parameter(
+            init_values * torch.ones(config.hidden_size), requires_grad=True
+        )
+        self.lambda_2 = nn.Parameter(
+            init_values * torch.ones(config.hidden_size), requires_grad=True
+        )
 
     def _init_attn(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         num_dummy_heads: int,
         prefix: str = "",
@@ -307,23 +318,26 @@ class InternS1VisionLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
     ):
-        hidden_states = hidden_states + self.attention(
-            self.layernorm_before(hidden_states)) * self.lambda_1
+        hidden_states = (
+            hidden_states
+            + self.attention(self.layernorm_before(hidden_states)) * self.lambda_1
+        )
 
-        hidden_states = hidden_states + self.mlp(
-            self.layernorm_after(hidden_states)) * self.lambda_2
+        hidden_states = (
+            hidden_states
+            + self.mlp(self.layernorm_after(hidden_states)) * self.lambda_2
+        )
 
         return hidden_states
 
 
 class InternS1VisionEncoder(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ):
@@ -336,16 +350,19 @@ class InternS1VisionEncoder(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layer = nn.ModuleList([
-            InternS1VisionLayer(config,
-                                quant_config,
-                                num_dummy_heads=num_dummy_heads,
-                                prefix=f"{prefix}.layer.{layer_idx}")
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layer = nn.ModuleList(
+            [
+                InternS1VisionLayer(
+                    config,
+                    quant_config,
+                    num_dummy_heads=num_dummy_heads,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(self, inputs_embeds: torch.Tensor):
-
         hidden_states = inputs_embeds
         for encoder_layer in self.layer:
             hidden_states = encoder_layer(hidden_states)
@@ -354,13 +371,12 @@ class InternS1VisionEncoder(nn.Module):
 
 
 class InternS1VisionModel(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
@@ -375,21 +391,22 @@ class InternS1VisionModel(nn.Module):
             num_dummy_heads=num_dummy_heads,
             prefix=f"{prefix}.encoder",
         )
-        self.layernorm = (nn.Identity() if config.use_mean_pooling else
-                          nn.LayerNorm(config.hidden_size,
-                                       eps=config.layer_norm_eps))
+        self.layernorm = (
+            nn.Identity()
+            if config.use_mean_pooling
+            else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        )
 
     def get_input_embeddings(self):
         return self.embeddings.patch_embeddings
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
-            raise ValueError(
-                'You have to specify pixel_values or pixel_embeds')
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
 
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
@@ -397,22 +414,19 @@ class InternS1VisionModel(nn.Module):
             if pixel_values.ndim == 4:
                 hidden_states, _ = self.embeddings(pixel_values)
             else:
-                raise ValueError(
-                    f'wrong pixel_values size: {pixel_values.shape}')
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
 
         encoder_outputs = self.encoder(inputs_embeds=hidden_states)
         encoder_outputs = self.layernorm(encoder_outputs)
 
         return encoder_outputs
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index f4004e518e3ba767257814ee1a70c8f9dd3c6af4..ccbde115009d2c6ec47f8566ce0663fb26d755f7 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -10,44 +10,61 @@
 import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, TypeVar, Union
+from typing import Annotated, Any, Literal, TypeAlias, TypeVar
 
 import numpy.typing as npt
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
 from PIL import Image
-from transformers import BatchEncoding, PretrainedConfig, TensorType
+from transformers import BatchFeature, PretrainedConfig, TensorType
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
-from vllm.model_executor.models.intern_vit import (InternVisionModel,
-                                                   InternVisionPatchModel)
+from vllm.model_executor.models.intern_vit import (
+    InternVisionModel,
+    InternVisionPatchModel,
+)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import set_default_torch_num_threads
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.utils.torch_utils import set_default_torch_num_threads
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
-IMG_START = '<img>'
-IMG_END = '</img>'
-IMG_CONTEXT = '<IMG_CONTEXT>'
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<IMG_CONTEXT>"
 
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -62,6 +79,7 @@ class InternVLImagePixelInputs(TensorSchema):
         - h: Height of each image patch
         - w: Width of each image patch
     """
+
     type: Literal["pixel_values"]
     pixel_values_flat: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
@@ -74,13 +92,12 @@ class InternVLImageEmbeddingInputs(TensorSchema):
         - f: Total image feature size
         - h: Hidden size (must match the hidden size of language model backbone)
     """
+
     type: Literal["image_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("n", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-InternVLImageInputs = Union[InternVLImagePixelInputs,
-                            InternVLImageEmbeddingInputs]
+InternVLImageInputs: TypeAlias = InternVLImagePixelInputs | InternVLImageEmbeddingInputs
 
 
 class InternVLVideoPixelInputs(TensorSchema):
@@ -92,6 +109,7 @@ class InternVLVideoPixelInputs(TensorSchema):
         - h: Height of each video frame
         - w: Width of each video frame
     """
+
     type: Literal["pixel_values_videos"]
     pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
@@ -104,25 +122,27 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
         - f: Total video feature size
         - h: Hidden size (must match the hidden size of language model backbone)
     """
+
     type: Literal["video_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("n", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-InternVLVideoInputs = Union[InternVLVideoPixelInputs,
-                            InternVLVideoEmbeddingInputs]
+InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose([
-        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
-        T.Resize((input_size, input_size),
-                 interpolation=T.InterpolationMode.BICUBIC),
-        T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
-    ])
+    transform = T.Compose(
+        [
+            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
+            T.Resize(
+                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
+            ),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD),
+        ]
+    )
     # Image transformation operations (which include tensor computations
     # on the CPU) can occupy a substantial number of CPU cores, introducing
     # overhead due to CPU contention. This issue becomes particularly
@@ -147,7 +167,7 @@ def find_closest_aspect_ratio(
     height: int,
     image_size: int,
 ) -> tuple[int, int]:
-    best_ratio_diff = float('inf')
+    best_ratio_diff = float("inf")
     best_ratio = (1, 1)
     area = width * height
     for ratio in target_ratios:
@@ -182,10 +202,13 @@ def get_internvl_target_ratios(
     min_num: int,
     max_num: int,
 ) -> list[tuple[int, int]]:
-    target_ratios = {(i, j)
-                     for n in range(min_num, max_num + 1)
-                     for i in range(1, n + 1)
-                     for j in range(1, n + 1) if min_num <= i * j <= max_num}
+    target_ratios = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if min_num <= i * j <= max_num
+    }
     return sorted(target_ratios, key=lambda x: x[0] * x[1])
 
 
@@ -243,10 +266,12 @@ def dynamic_preprocess_internvl(
     resized_img = image.resize((target_width, target_height))
     processed_images = []
     for i in range(blocks):
-        box = ((i % (target_width // image_size)) * image_size,
-               (i // (target_width // image_size)) * image_size,
-               ((i % (target_width // image_size)) + 1) * image_size,
-               ((i // (target_width // image_size)) + 1) * image_size)
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
         # split the image
         split_img = resized_img.crop(box)
         processed_images.append(split_img)
@@ -324,9 +349,9 @@ class BaseInternVLProcessor(ABC):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> None:
         super().__init__()
 
@@ -349,7 +374,8 @@ class BaseInternVLProcessor(ABC):
         assert isinstance(dynamic_image_size, bool)
 
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.image_size = image_size
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
@@ -365,26 +391,30 @@ class BaseInternVLProcessor(ABC):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         raise NotImplementedError
 
     def resolve_min_max_num(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> tuple[int, int]:
-        min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
-                             is None else min_dynamic_patch)
-        max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
-                             is None else max_dynamic_patch)
-        dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
-                              is None else dynamic_image_size)
-        use_thumbnail = (self.use_thumbnail
-                         if use_thumbnail is None else use_thumbnail)
+        min_dynamic_patch = (
+            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
+        )
+        max_dynamic_patch = (
+            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
+        )
+        dynamic_image_size = (
+            self.dynamic_image_size
+            if dynamic_image_size is None
+            else dynamic_image_size
+        )
+        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
 
         return resolve_internvl_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -396,10 +426,10 @@ class BaseInternVLProcessor(ABC):
     def resolve_target_ratios(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> list[tuple[int, int]]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -433,9 +463,9 @@ class BaseInternVLProcessor(ABC):
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -451,16 +481,17 @@ class BaseInternVLProcessor(ABC):
                 min_num=min_num,
                 max_num=max_num,
                 use_thumbnail=self.use_thumbnail,
-            ) for image in images
+            )
+            for image in images
         ]
 
     def _preprocess_image(
         self,
         text: list[str],
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> tuple[list[str], dict[str, torch.Tensor]]:
         if len(images) == 0:
             image_inputs = {}
@@ -471,11 +502,11 @@ class BaseInternVLProcessor(ABC):
                 max_dynamic_patch=max_dynamic_patch,
                 dynamic_image_size=dynamic_image_size,
             )
-            image_inputs: dict[str, NestedTensors] = {
-                "pixel_values_flat":
-                torch.cat(pixel_values_lst),
-                "image_num_patches":
-                torch.tensor([len(item) for item in pixel_values_lst]),
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst]
+                ),
             }
 
             for pixel_values in pixel_values_lst:
@@ -483,11 +514,10 @@ class BaseInternVLProcessor(ABC):
                 feature_size = num_patches * self.num_image_token
 
                 image_repl = self.get_image_repl(feature_size, num_patches)
-                text = [t.replace('<image>', image_repl.full, 1) for t in text]
+                text = [t.replace("<image>", image_repl.full, 1) for t in text]
         return text, image_inputs
 
-    def _make_batch_input(self,
-                          input_item: Optional[Union[Any, list[Any]]] = None):
+    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
         if input_item is None:
             input_item = []
         if not isinstance(input_item, list):
@@ -496,13 +526,13 @@ class BaseInternVLProcessor(ABC):
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-    ) -> Mapping[str, NestedTensors]:
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+    ) -> BatchFeature:
         text, images = [self._make_batch_input(x) for x in (text, images)]
 
         text, image_inputs = self._preprocess_image(
@@ -515,10 +545,9 @@ class BaseInternVLProcessor(ABC):
 
         text_inputs = self.tokenizer(text)
 
-        return {
-            **BatchEncoding(text_inputs, tensor_type=return_tensors),
-            **image_inputs,
-        }
+        combined_outputs = {**text_inputs, **image_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
 
 
 class InternVLProcessor(BaseInternVLProcessor):
@@ -534,10 +563,10 @@ class InternVLProcessor(BaseInternVLProcessor):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        video_token: Optional[str] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        video_token: str | None = None,
     ) -> None:
         super().__init__(
             config=config,
@@ -554,7 +583,7 @@ class InternVLProcessor(BaseInternVLProcessor):
         return self.tokenizer.get_vocab()[IMG_CONTEXT]
 
     @property
-    def video_token_id(self) -> Optional[int]:
+    def video_token_id(self) -> int | None:
         if self.video_token is None:
             return None
         return self.tokenizer.get_vocab().get(self.video_token, None)
@@ -566,7 +595,7 @@ class InternVLProcessor(BaseInternVLProcessor):
     def _videos_to_pixel_values_lst(
         self,
         videos: list[npt.NDArray],
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=1,
@@ -582,14 +611,15 @@ class InternVLProcessor(BaseInternVLProcessor):
                 min_num=min_num,
                 max_num=max_num,
                 use_thumbnail=False,
-            ) for video in videos
+            )
+            for video in videos
         ]
 
     def _preprocess_video(
         self,
         text: list[str],
         videos: list[npt.NDArray],
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ):
         if len(videos) == 0 or not self.supports_video:
             video_inputs = {}
@@ -598,31 +628,32 @@ class InternVLProcessor(BaseInternVLProcessor):
                 videos,
                 dynamic_image_size=dynamic_image_size,
             )
-            video_inputs: dict[str, NestedTensors] = {
-                "pixel_values_flat_video":
-                torch.cat(pixel_values_lst_video),
-                "video_num_patches":
-                torch.tensor([len(item) for item in pixel_values_lst_video]),
+            video_inputs = {
+                "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
+                "video_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst_video]
+                ),
             }
 
             for pixel_values in pixel_values_lst_video:
                 num_patches = pixel_values.shape[0]
 
-                video_repl = self.get_video_repl(self.num_image_token,
-                                                 num_patches, self.video_token)
-                text = [t.replace('<video>', video_repl.full, 1) for t in text]
+                video_repl = self.get_video_repl(
+                    self.num_image_token, num_patches, self.video_token
+                )
+                text = [t.replace("<video>", video_repl.full, 1) for t in text]
         return text, video_inputs
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-    ) -> Mapping[str, NestedTensors]:
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        videos: npt.NDArray | list[npt.NDArray] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+    ) -> BatchFeature:
         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
         ]
@@ -643,16 +674,14 @@ class InternVLProcessor(BaseInternVLProcessor):
 
         text_inputs = self.tokenizer(text)
 
-        return {
-            **BatchEncoding(text_inputs, tensor_type=return_tensors),
-            **image_inputs,
-            **video_inputs,
-        }
+        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
 
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -662,15 +691,15 @@ class InternVLProcessor(BaseInternVLProcessor):
     def get_video_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int] = None,
+        num_patches: int | None = None,
         video_context_token: str = IMG_CONTEXT,
     ) -> PromptUpdateDetails[str]:
         repl_features = video_context_token * self.num_image_token
         repl_features_with_sep = IMG_START + repl_features + IMG_END
         # num_patches is equal to num_frames
-        repl_full = ''.join([
-            f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches)
-        ])
+        repl_full = "".join(
+            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
+        )
 
         return PromptUpdateDetails.select_text(repl_full, video_context_token)
 
@@ -682,7 +711,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -690,7 +719,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[BaseInternVLProcessor],
+        processor: BaseInternVLProcessor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -717,8 +746,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
+                largest_feature_pinpoint = ImageSize(width=width, height=height)
 
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
@@ -751,21 +779,25 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
-    """ Basic image-only MultiModalProcessor for InternVL-style models."""
+    """Basic image-only MultiModalProcessor for InternVL-style models."""
 
     def _call_hf_processor(
         self,
@@ -773,7 +805,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
-    ) -> Mapping[str, NestedTensors]:
+    ) -> BatchFeature:
         processed_outputs = super()._call_hf_processor(
             prompt=prompt,
             mm_data=mm_data,
@@ -793,7 +825,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
     def _get_mm_fields_config(
         self,
-        hf_inputs: Mapping[str, NestedTensors],
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
@@ -801,7 +833,8 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
         return dict(
             pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_num_patches),
+                "image", image_num_patches
+            ),
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
             image_token_id=MultiModalFieldConfig.shared("image", num_images),
@@ -829,7 +862,8 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
         def get_replacement_internvl(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
@@ -867,7 +901,7 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
         video_limit = {"video": None} if self.supports_video else {}
         return {**super().get_supported_mm_limits(), **video_limit}
 
-    def get_video_token(self) -> Optional[str]:
+    def get_video_token(self) -> str | None:
         text_model_type = self.get_hf_config().get_text_config().model_type
         video_token_map = {
             "qwen2": "<|video_pad|>",
@@ -888,8 +922,7 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
         processor = self.get_hf_processor()
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = (seq_len -
-                            max_image_tokens) // processor.num_image_token
+        max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
         max_frames_per_video = max_total_frames // max(max_videos, 1)
 
         return max(max_frames_per_video, 1)
@@ -905,7 +938,8 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
 
 
 class InternVLDummyInputsBuilder(
-        BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]):
+    BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
+):
     """InternVL DummyInputsBuilder extended for video support"""
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -917,21 +951,27 @@ class InternVLDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        dummy_image = super().get_dummy_mm_data(seq_len=seq_len,
-                                                mm_counts=mm_counts)
+        dummy_image = super().get_dummy_mm_data(
+            seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
+        )
         if self.info.supports_video:
             config = self.info.get_hf_config()
             image_size: int = config.vision_config.image_size
-            target_num_frames = \
-                self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+            target_num_frames = self.info.get_num_frames_with_most_features(
+                seq_len, mm_counts
+            )
             num_videos = mm_counts.get("video", 0)
+            video_overrides = mm_options.get("video") if mm_options else None
             dummy_video = {
-                "video":
-                self._get_dummy_videos(width=image_size,
-                                       height=image_size,
-                                       num_frames=target_num_frames,
-                                       num_videos=num_videos)
+                "video": self._get_dummy_videos(
+                    width=image_size,
+                    height=image_size,
+                    num_frames=target_num_frames,
+                    num_videos=num_videos,
+                    overrides=video_overrides,
+                )
             }
         else:
             dummy_video = {}
@@ -939,7 +979,8 @@ class InternVLDummyInputsBuilder(
 
 
 class InternVLMultiModalProcessor(
-        BaseInternVLMultiModalProcessor[InternVLProcessingInfo]):
+    BaseInternVLMultiModalProcessor[InternVLProcessingInfo]
+):
     """InternVL MultiModalProcessor extended for video support"""
 
     def _call_hf_processor(
@@ -948,33 +989,34 @@ class InternVLMultiModalProcessor(
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
-    ) -> Mapping[str, NestedTensors]:
-        processed_outputs = super()._call_hf_processor(prompt, mm_data,
-                                                       mm_kwargs, tok_kwargs)
+    ) -> BatchFeature:
+        processed_outputs = super()._call_hf_processor(
+            prompt, mm_data, mm_kwargs, tok_kwargs
+        )
 
         hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        if self.info.supports_video and (
-                video_token_id := hf_processor.video_token_id) is not None:
+        if (
+            self.info.supports_video
+            and (video_token_id := hf_processor.video_token_id) is not None
+        ):
             processed_outputs["video_token_id"] = torch.tensor(video_token_id)
         return processed_outputs
 
     def _get_mm_fields_config(
         self,
-        hf_inputs: Mapping[str, NestedTensors],
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        image_fields = super()._get_mm_fields_config(hf_inputs,
-                                                     hf_processor_mm_kwargs)
+        image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
         if self.info.supports_video:
-            video_num_patches = hf_inputs.get("video_num_patches",
-                                              torch.empty(0))
+            video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
             num_videos = len(video_num_patches)
             video_fields = dict(
                 pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes(
-                    "video", video_num_patches),
+                    "video", video_num_patches
+                ),
                 video_num_patches=MultiModalFieldConfig.batched("video"),
-                video_token_id=MultiModalFieldConfig.shared(
-                    "video", num_videos),
+                video_token_id=MultiModalFieldConfig.shared("video", num_videos),
             )
         else:
             video_fields = {}
@@ -1010,9 +1052,8 @@ class InternVLMultiModalProcessor(
                 assert isinstance(num_patches, int)
 
             return hf_processor.get_video_repl(
-                feature_size,
-                num_patches,
-                video_context_token=hf_processor.video_token)
+                feature_size, num_patches, video_context_token=hf_processor.video_token
+            )
 
         if self.info.supports_video:
             prompt_repl = [
@@ -1021,7 +1062,7 @@ class InternVLMultiModalProcessor(
                     modality="video",
                     target="<video>",
                     replacement=get_video_replacement_internvl,
-                )
+                ),
             ]
 
         return prompt_repl
@@ -1030,14 +1071,15 @@ class InternVLMultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     InternVLMultiModalProcessor,
     info=InternVLProcessingInfo,
-    dummy_inputs=InternVLDummyInputsBuilder)
-class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
-                        SupportsLoRA):
+    dummy_inputs=InternVLDummyInputsBuilder,
+)
+class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
 
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -1061,12 +1103,13 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.downsample_ratio = config.downsample_ratio
         self.ps_version = config.ps_version
 
         self.llm_arch_name = config.text_config.architectures[0]
-        self.is_mono = self.llm_arch_name == 'InternLM2VEForCausalLM'
+        self.is_mono = self.llm_arch_name == "InternLM2VEForCausalLM"
         self.vision_model = self._init_vision_model(
             config,
             quant_config=quant_config,
@@ -1087,24 +1130,26 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
-    def _patch_quant_config(self, config: PretrainedConfig,
-                            quant_config: QuantizationConfig):
+    def _patch_quant_config(
+        self, config: PretrainedConfig, quant_config: QuantizationConfig
+    ):
         # the awq models from OpenGVLab missing `modules_to_not_convert`
         # patch the quant_config to add `modules_to_not_convert` back
         if isinstance(quant_config, AWQConfig):
             text_config = config.text_config
-            llm_quant_config = getattr(text_config, "quantization_config",
-                                       None)
-            if (not quant_config.modules_to_not_convert) and \
-                (llm_quant_config is not None):
+            llm_quant_config = getattr(text_config, "quantization_config", None)
+            if (not quant_config.modules_to_not_convert) and (
+                llm_quant_config is not None
+            ):
                 quant_config.modules_to_not_convert.append("vision_model")
 
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
@@ -1112,8 +1157,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         if not is_mono:
             vision_feature_layer = config.select_layer
             if vision_feature_layer < 0:
-                num_hidden_layers = config.vision_config.num_hidden_layers \
-                    + vision_feature_layer + 1
+                num_hidden_layers = (
+                    config.vision_config.num_hidden_layers + vision_feature_layer + 1
+                )
             else:
                 num_hidden_layers = vision_feature_layer + 1
 
@@ -1122,18 +1168,20 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
                 quant_config=quant_config,
                 num_hidden_layers_override=num_hidden_layers,
                 prefix=prefix,
-                use_data_parallel=self.use_data_parallel)
+                use_data_parallel=self.use_data_parallel,
+            )
         else:
             return InternVisionPatchModel(config.vision_config)
 
-    def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
+    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
         vit_hidden_size = config.vision_config.hidden_size
         llm_hidden_size = config.text_config.hidden_size
 
         return nn.Sequential(
-            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2),
-            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2,
-                      llm_hidden_size),
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size
+            ),
             nn.GELU(),
             nn.Linear(llm_hidden_size, llm_hidden_size),
         )
@@ -1144,9 +1192,13 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
         # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
         x = x.permute(0, 2, 1, 3).contiguous()
-        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
-                   int(c / (scale_factor * scale_factor)))
-        if self.ps_version == 'v1':
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
             pass
         else:
             x = x.permute(0, 2, 1, 3).contiguous()
@@ -1156,17 +1208,16 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         vit_embeds = self.vision_model(pixel_values=pixel_values)
         vit_embeds = vit_embeds[:, 1:, :]
 
-        h = w = int(vit_embeds.shape[1]**0.5)
+        h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds,
-                                        scale_factor=self.downsample_ratio)
-        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
-                                        vit_embeds.shape[-1])
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[InternVLImageInputs]:
+        self, **kwargs: object
+    ) -> InternVLImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -1175,30 +1226,19 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
             return None
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return InternVLImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         image_token_id = kwargs["image_token_id"]
-        assert isinstance(image_token_id, torch.Tensor)
-        self.img_context_token_id = image_token_id.flatten().unique().item()
+        if isinstance(image_token_id, torch.Tensor):
+            image_token_id = image_token_id.flatten().unique().item()
 
-        if pixel_values_flat is not None:
-            if not isinstance(pixel_values_flat, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat)}")
+        assert isinstance(image_token_id, int)
+        self.img_context_token_id = image_token_id
 
-            if not isinstance(image_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(image_num_patches)}")
-
-            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
-            image_num_patches = flatten_bn(image_num_patches, concat=True)
+        if pixel_values_flat is not None:
             expected_h = expected_w = self.config.vision_config.image_size
             resolve_bindings = {"h": expected_h, "w": expected_w}
 
@@ -1212,7 +1252,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         raise AssertionError("This line should be unreachable.")
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[InternVLVideoPixelInputs]:
+        self, **kwargs: object
+    ) -> InternVLVideoPixelInputs | None:
         pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("image_embeds", None)
@@ -1223,25 +1264,17 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         if video_embeds is not None:
             return InternVLVideoEmbeddingInputs(
                 type="video_embeds",
-                data=flatten_bn(video_embeds),
+                data=video_embeds,
             )
 
         video_token_id = kwargs["video_token_id"]
-        assert isinstance(video_token_id, torch.Tensor)
-        self.video_context_token_id = video_token_id.flatten().unique().item()
-
-        if pixel_values_flat_video is not None:
-            if not isinstance(pixel_values_flat_video, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat_video)}")
+        if isinstance(video_token_id, torch.Tensor):
+            video_token_id = video_token_id.flatten().unique().item()
 
-            if not isinstance(video_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(video_num_patches)}")
+        assert isinstance(video_token_id, int)
+        self.video_context_token_id = video_token_id
 
-            pixel_values_flat_video = flatten_bn(pixel_values_flat_video,
-                                                 concat=True)
-            video_num_patches = flatten_bn(video_num_patches, concat=True)
+        if pixel_values_flat_video is not None:
             expected_h = expected_w = self.config.vision_config.image_size
             resolve_bindings = {"h": expected_h, "w": expected_w}
 
@@ -1254,11 +1287,14 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         raise AssertionError("This line should be unreachable.")
 
-    def _process_image_input(
+    def _process_vision_input(
         self,
-        image_input: Union[InternVLImageInputs, InternVLVideoPixelInputs],
+        image_input: InternVLImageInputs | InternVLVideoInputs,
     ) -> tuple[torch.Tensor, ...]:
-        if image_input["type"] == "image_embeds":
+        if (
+            image_input["type"] == "image_embeds"
+            or image_input["type"] == "video_embeds"
+        ):
             return image_input["data"]
 
         assert self.vision_model is not None
@@ -1269,14 +1305,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return (image_embeds.view(-1,
-                                      self.config.text_config.hidden_size), )
+            return (image_embeds.view(-1, self.config.text_config.hidden_size),)
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
         feature_size = image_embeds.shape[1]
-        image_embeds = image_embeds.view(-1,
-                                         self.config.text_config.hidden_size)
+        image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
         image_feature_sizes = [
             num_patches * feature_size for num_patches in num_patches
         ]
@@ -1288,31 +1322,29 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values_flat",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("pixel_values_flat_video",
-                             ) and "videos" not in modalities:
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values_flat", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if input_key in ("pixel_values_flat_video",) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
     def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
         if self.is_mono:
             assert self.img_context_token_id is not None
-            self.visual_token_mask = (
-                input_ids == self.img_context_token_id).reshape(-1, 1)
+            self.visual_token_mask = (input_ids == self.img_context_token_id).reshape(
+                -1, 1
+            )
         else:
             self.visual_token_mask = None
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
@@ -1326,59 +1358,49 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_vision_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
-                video_embeddings = self._process_image_input(video_input)
-                multimodal_embeddings += video_embeddings
+                video_embeddings = self._process_vision_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            context_token_ids = [
-                token_id for token_id in (self.img_context_token_id,
-                                          self.video_context_token_id)
-                if token_id is not None
-            ]
-            assert len(context_token_ids) >= 1
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
             self._set_visual_token_mask(input_ids)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                context_token_ids,
-            )
-        return inputs_embeds
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
-
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         forward_kwargs = {
             "input_ids": input_ids,
             "positions": positions,
@@ -1388,8 +1410,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         # Only required if the model is mono-architecture
         if self.visual_token_mask is not None:
-            forward_kwargs.update(
-                {"visual_token_mask": self.visual_token_mask})
+            forward_kwargs.update({"visual_token_mask": self.visual_token_mask})
             self.visual_token_mask = None
 
         hidden_states = self.language_model.model(**forward_kwargs)
@@ -1398,17 +1419,24 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B
         skip_prefixes = [
-            "action_embed", "temporal_embed", "track_embed",
-            "track_embed_decoder", "box_token", "cg_criterion", "cg_model",
-            "loc_encoder", "loc_decoder", "sam", "temporal_token",
-            "track_token"
+            "action_embed",
+            "temporal_embed",
+            "track_embed",
+            "track_embed_decoder",
+            "box_token",
+            "cg_criterion",
+            "cg_model",
+            "loc_encoder",
+            "loc_decoder",
+            "sam",
+            "temporal_token",
+            "track_token",
         ]
         loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
         return loader.load_weights(weights)
@@ -1420,4 +1448,5 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="mlp1",
-            tower_model="vision_model")
+            tower_model="vision_model",
+        )
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 0eb1578b43610c52bfb3c133f6bf25805e55af70..5549a1fc1cd302e8d757394dc575a804afe91834 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -24,7 +24,6 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -32,60 +31,68 @@ from torch import nn
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import JAISConfig
 
 from .interfaces import SupportsPP
-from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class SwiGLUActivation(nn.Module):
-
     def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
         return x1 * nn.functional.silu(x2)
 
 
 def _get_alibi_slopes(n):
-
     def get_slopes_power_of_2(n):
-        start = 2**(-(2**-(math.log2(n) - 3)))
+        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
         ratio = start
         return [start * ratio**i for i in range(n)]
 
     if math.log2(n).is_integer():
         return get_slopes_power_of_2(n)
     else:
-        closest_power_of_2 = 2**math.floor(math.log2(n))
-        return (get_slopes_power_of_2(closest_power_of_2) + _get_alibi_slopes(
-            2 * closest_power_of_2)[0::2][:n - closest_power_of_2])
+        closest_power_of_2 = 2 ** math.floor(math.log2(n))
+        return (
+            get_slopes_power_of_2(closest_power_of_2)
+            + _get_alibi_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+        )
 
 
 class JAISAttention(nn.Module):
-
     def __init__(
         self,
         config: JAISConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert total_num_heads % tensor_model_parallel_world_size == 0
         self.num_heads = total_num_heads // tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // total_num_heads
@@ -100,12 +107,14 @@ class JAISAttention(nn.Module):
             total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_attn",
         )
         self.c_proj = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
 
         tp_rank = get_tensor_model_parallel_rank()
@@ -113,13 +122,15 @@ class JAISAttention(nn.Module):
         head_end = (tp_rank + 1) * self.num_heads
         alibi_slopes = _get_alibi_slopes(total_num_heads)
         alibi_slopes = alibi_slopes[head_start:head_end]
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scale=self.scale,
-                              alibi_slopes=alibi_slopes,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scale=self.scale,
+            alibi_slopes=alibi_slopes,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -133,12 +144,12 @@ class JAISAttention(nn.Module):
 
 
 class JAISMLP(nn.Module):
-
     def __init__(
         self,
         intermediate_size: int,
         config: JAISConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -148,18 +159,25 @@ class JAISMLP(nn.Module):
             intermediate_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
+        )
+        self.c_fc2 = (
+            ColumnParallelLinear(
+                hidden_size,
+                intermediate_size,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.c_fc2",
+            )
+            if self.swiglu
+            else None
         )
-        self.c_fc2 = (ColumnParallelLinear(
-            hidden_size,
-            intermediate_size,
-            bias=True,
-            quant_config=quant_config,
-        ) if self.swiglu else None)
         self.c_proj = RowParallelLinear(
             intermediate_size,
             hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
 
         self.act = SwiGLUActivation()
@@ -168,33 +186,33 @@ class JAISMLP(nn.Module):
         if self.swiglu:
             hidden_states2, _ = self.c_fc2(hidden_states)
         hidden_states, _ = self.c_fc(hidden_states)
-        hidden_states = (self.act(hidden_states, hidden_states2)
-                         if self.swiglu else self.act(hidden_states))
+        hidden_states = (
+            self.act(hidden_states, hidden_states2)
+            if self.swiglu
+            else self.act(hidden_states)
+        )
         hidden_states, _ = self.c_proj(hidden_states)
         return hidden_states
 
 
 class JAISBlock(nn.Module):
-
     def __init__(
         self,
         config: JAISConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
-                     hidden_size)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = JAISAttention(config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.attn")
+        self.attn = JAISAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = JAISMLP(inner_dim, config, quant_config)
+        self.mlp = JAISMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -202,7 +220,9 @@ class JAISBlock(nn.Module):
     ) -> torch.Tensor:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
-        attn_output = self.attn(hidden_states=hidden_states, )
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+        )
         # residual connection
         hidden_states = attn_output + residual
 
@@ -216,7 +236,6 @@ class JAISBlock(nn.Module):
 
 @support_torch_compile
 class JAISModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -230,9 +249,11 @@ class JAISModel(nn.Module):
         assert not config.reorder_and_upcast_attn
         self.embed_dim = config.hidden_size
         self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
-        self.wpe = (nn.Embedding(config.max_position_embeddings,
-                                 self.embed_dim)
-                    if config.position_embedding_type != "alibi" else None)
+        self.wpe = (
+            nn.Embedding(config.max_position_embeddings, self.embed_dim)
+            if config.position_embedding_type != "alibi"
+            else None
+        )
         if hasattr(config, "embeddings_scale"):
             self.embeddings_scale = config.embeddings_scale
         else:
@@ -240,38 +261,41 @@ class JAISModel(nn.Module):
 
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
-            lambda prefix: JAISBlock(config=config,
-                                     cache_config=cache_config,
-                                     quant_config=quant_config,
-                                     prefix=prefix),
+            lambda prefix: JAISBlock(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.h",
         )
 
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.n_embd))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.n_embd
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[IntermediateTensors, torch.Tensor]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> IntermediateTensors | torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings(input_ids)
+                inputs_embeds = self.embed_input_ids(input_ids)
             if self.wpe is not None:
                 position_embeds = self.wpe(position_ids)
                 hidden_states = inputs_embeds + position_embeds
             else:
                 hidden_states = inputs_embeds
-            hidden_states *= torch.tensor(float(self.embeddings_scale),
-                                          dtype=hidden_states.dtype)
+            hidden_states *= torch.tensor(
+                float(self.embeddings_scale), dtype=hidden_states.dtype
+            )
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -287,56 +311,57 @@ class JAISModel(nn.Module):
 
 
 class JAISLMHeadModel(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = JAISModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(
-                                         prefix, "transformer"))
+        self.transformer = JAISModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         if self.config.tie_word_embeddings:
             self.lm_head = self.transformer.wte
         else:
-            self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                self.config.vocab_size,
+                self.config.hidden_size,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         if hasattr(config, "width_scale"):
             self.output_logits_scale = config.width_scale
         else:
-            self.output_logits_scale = (config.mup_output_alpha *
-                                        config.mup_width_scale)
-        self.logits_processor = LogitsProcessor(vocab_size=config.vocab_size,
-                                                scale=self.output_logits_scale)
+            self.output_logits_scale = config.mup_output_alpha * config.mup_width_scale
+        self.logits_processor = LogitsProcessor(
+            vocab_size=config.vocab_size, scale=self.output_logits_scale
+        )
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[IntermediateTensors, torch.Tensor]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> IntermediateTensors | torch.Tensor:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -366,8 +391,7 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
                 if not name.endswith(".weight"):
                     continue
                 loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index e8277e259bc5b210beeee51253f7bf37708de211..3a2c98c73dab4f42be934e2fee388da4ba2f2d41 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Jamba model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from torch import nn
@@ -16,37 +16,55 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaMLP as JambaMLP
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class JambaMoE(nn.Module):
-
-    def __init__(self,
-                 config: JambaConfig,
-                 num_experts: Optional[int] = None,
-                 top_k: Optional[int] = None,
-                 params_dtype: Optional[torch.dtype] = None,
-                 tp_size: Optional[int] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: JambaConfig,
+        num_experts: int | None = None,
+        top_k: int | None = None,
+        params_dtype: torch.dtype | None = None,
+        tp_size: int | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.num_total_experts = num_experts or config.num_experts
         self.top_k = top_k or config.num_experts_per_tok
@@ -54,23 +72,27 @@ class JambaMoE(nn.Module):
         self.intermediate_size = config.intermediate_size
 
         if self.num_total_experts > 1:
-            self.router = ReplicatedLinear(self.hidden_size,
-                                           self.num_total_experts,
-                                           bias=False,
-                                           quant_config=None,
-                                           params_dtype=params_dtype)
-
-        self.experts = FusedMoE(self.num_total_experts,
-                                self.top_k,
-                                self.hidden_size,
-                                self.intermediate_size,
-                                tp_size=tp_size,
-                                params_dtype=params_dtype,
-                                reduce_results=True,
-                                renormalize=False,
-                                use_grouped_topk=False,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts")
+            self.router = ReplicatedLinear(
+                self.hidden_size,
+                self.num_total_experts,
+                bias=False,
+                quant_config=None,
+                params_dtype=params_dtype,
+            )
+
+        self.experts = FusedMoE(
+            self.num_total_experts,
+            self.top_k,
+            self.hidden_size,
+            self.intermediate_size,
+            tp_size=tp_size,
+            params_dtype=params_dtype,
+            reduce_results=True,
+            renormalize=False,
+            use_grouped_topk=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         orig_shape = hidden_states.shape
@@ -79,43 +101,46 @@ class JambaMoE(nn.Module):
         if self.num_total_experts > 1:
             router_logits, _ = self.router(hidden_states)
         else:
-            router_logits = torch.ones((hidden_states.shape[0], 1),
-                                       device=hidden_states.device,
-                                       dtype=hidden_states.dtype)
+            router_logits = torch.ones(
+                (hidden_states.shape[0], 1),
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
         hidden_states = self.experts(hidden_states, router_logits)
         return hidden_states.view(orig_shape)
 
 
 class JambaMambaDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: JambaConfig,
-                 layer_idx: int,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 is_lora_enabled: Optional[bool] = False,
-                 prefix: str = "",
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        config: JambaConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        is_lora_enabled: bool | None = False,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
         super().__init__()
         self.config = config
         self.is_lora_enabled = is_lora_enabled
-        self.mamba = MambaMixer(hidden_size= config.hidden_size,
-                                ssm_state_size = config.mamba_d_state,
-                                conv_kernel_size = config.mamba_d_conv,
-                                intermediate_size = config.mamba_expand *\
-                                                    config.hidden_size,
-                                time_step_rank = config.mamba_dt_rank,
-                                use_conv_bias = config.mamba_conv_bias,
-                                use_bias = config.mamba_proj_bias,
-                                use_rms_norm=True,
-                                rms_norm_eps=config.rms_norm_eps,
-                                activation=config.hidden_act,
-                                is_lora_enabled = self.is_lora_enabled,
-                                model_config=model_config,
-                                cache_config=cache_config,
-                                prefix=f"{prefix}.mixer",
-                                )
+        self.mamba = MambaMixer(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.mamba_d_state,
+            conv_kernel_size=config.mamba_d_conv,
+            intermediate_size=config.mamba_expand * config.hidden_size,
+            time_step_rank=config.mamba_dt_rank,
+            use_conv_bias=config.mamba_conv_bias,
+            use_bias=config.mamba_proj_bias,
+            use_rms_norm=True,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.hidden_act,
+            is_lora_enabled=self.is_lora_enabled,
+            model_config=model_config,
+            cache_config=cache_config,
+            prefix=f"{prefix}.mixer",
+        )
 
         num_experts = config.layers_num_experts[layer_idx]
         if num_experts > 1:
@@ -132,23 +157,20 @@ class JambaMambaDecoderLayer(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.feed_forward",
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         output = torch.empty_like(hidden_states)
         self.mamba(hidden_states, output)
@@ -159,15 +181,16 @@ class JambaMambaDecoderLayer(nn.Module):
 
 
 class JambaAttentionDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: JambaConfig,
-                 layer_idx: int,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        config: JambaConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -196,11 +219,15 @@ class JambaAttentionDecoderLayer(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        config.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config)
 
         self.attn = Attention(
             self.num_heads,
@@ -226,10 +253,8 @@ class JambaAttentionDecoderLayer(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.feed_forward",
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def self_attention(
         self,
@@ -247,36 +272,33 @@ class JambaAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.self_attention(
             positions=positions,
             hidden_states=hidden_states,
         )
         # Fully Connected
-        hidden_states, residual = self.pre_ff_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.pre_ff_layernorm(hidden_states, residual)
         hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual
 
 
 ALL_DECODER_LAYER_TYPES = {
     "attention": JambaAttentionDecoderLayer,
-    "mamba": JambaMambaDecoderLayer
+    "mamba": JambaMambaDecoderLayer,
 }
 
 
 @support_torch_compile
 class JambaModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -284,58 +306,55 @@ class JambaModel(nn.Module):
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
         extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)}
 
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
-            layer_class = ALL_DECODER_LAYER_TYPES[
-                config.layers_block_type[layer_idx]]
-            return layer_class(config,
-                               layer_idx,
-                               model_config,
-                               cache_config,
-                               quant_config=quant_config,
-                               prefix=prefix,
-                               **extra_kwargs)
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[layer_idx]]
+            return layer_class(
+                config,
+                layer_idx,
+                model_config,
+                cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                **extra_kwargs,
+            )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-        self.final_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -343,15 +362,14 @@ class JambaModel(nn.Module):
             residual = intermediate_tensors["residual"]
 
         for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states, residual = layer(positions=positions,
-                                            hidden_states=hidden_states,
-                                            residual=residual)
+            hidden_states, residual = layer(
+                positions=positions, hidden_states=hidden_states, residual=residual
+            )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.final_layernorm(hidden_states, residual)
         return hidden_states
 
@@ -362,10 +380,10 @@ class JambaModel(nn.Module):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_experts)
+            num_experts=self.config.num_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -384,7 +402,7 @@ class JambaModel(nn.Module):
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                if 'experts' in name:
+                if "experts" in name:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
@@ -399,10 +417,10 @@ class JambaModel(nn.Module):
                 break
             else:
                 for (
-                        param_name,
-                        weight_name,
-                        expert_id,
-                        shard_id,
+                    param_name,
+                    weight_name,
+                    expert_id,
+                    shard_id,
                 ) in expert_params_mapping:
                     if weight_name not in name:
                         continue
@@ -412,11 +430,13 @@ class JambaModel(nn.Module):
                     name = name.replace(weight_name, param_name)
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -426,19 +446,25 @@ class JambaModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid):
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={
-        ".self_attn.": ".",
-        ".A_log": ".A"
-    }, )
+class JambaForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsMambaPrefixCaching,
+):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={".self_attn.": ".", ".A_log": ".A"},
+    )
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -458,56 +484,48 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "Jamba currently does not support prefix caching"
 
         super().__init__()
         self.config = config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.scheduler_config = scheduler_config
-        self.model = JambaModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = JambaModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
-        return self.mamba_cache.copy_inputs_before_cuda_graphs(
-            input_buffers, **kwargs)
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(input_buffers, **kwargs)
 
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
@@ -517,7 +535,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba1_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -543,12 +560,11 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -557,7 +573,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
 
 
 class JambaForSequenceClassification(JambaForCausalLM):
-
     is_pooling_model = True
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -565,7 +580,7 @@ class JambaForSequenceClassification(JambaForCausalLM):
 
         config = vllm_config.model_config.hf_config
         num_labels: int = config.num_labels
-        score_bias: bool = getattr(config, 'score_bias', False)
+        score_bias: bool = getattr(config, "score_bias", False)
 
         # TODO: The original reward weights have float32 accuracy data, we
         # would like to load them in fp32 to get that extra precision.
@@ -580,12 +595,16 @@ class JambaForSequenceClassification(JambaForCausalLM):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            Pooler.for_classify(
-                pooler_config,
-                classifier=self.score,
-            ),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.score
+                ),
+                "classify": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="classify"
+                ),
+                "score": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="score"
+                ),
+            }
+        )
diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py
index f8c2a1e507a74cfcaa1b64c73336d3e285f317aa..05a40837954d89341c8c596f1c9ff76b059d5d02 100644
--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -10,36 +9,34 @@ from transformers import BatchFeature
 from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs import TokensPrompt
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import (SupportsCrossEncoding, SupportsMultiModal,
-                         SupportsScoreTemplate)
-from .qwen2_vl import (Qwen2VLDummyInputsBuilder,
-                       Qwen2VLForConditionalGeneration,
-                       Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo)
+from .interfaces import SupportsCrossEncoding, SupportsMultiModal, SupportsScoreTemplate
+from .qwen2_vl import (
+    Qwen2VLDummyInputsBuilder,
+    Qwen2VLForConditionalGeneration,
+    Qwen2VLMultiModalProcessor,
+    Qwen2VLProcessingInfo,
+)
 from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 logger = init_logger(__name__)
 
 
 class JinaVLScorer(nn.Module):
-
     def __init__(self, model_config: "ModelConfig"):
         super().__init__()
         config = model_config.hf_config
         head_dtype = model_config.head_dtype
-        self.dense = ColumnParallelLinear(config.hidden_size,
-                                          config.hidden_size,
-                                          params_dtype=head_dtype,
-                                          bias=True)
-        self.out_proj = RowParallelLinear(config.hidden_size,
-                                          config.num_labels,
-                                          params_dtype=head_dtype,
-                                          bias=True)
+        self.dense = ColumnParallelLinear(
+            config.hidden_size, config.hidden_size, params_dtype=head_dtype, bias=True
+        )
+        self.out_proj = RowParallelLinear(
+            config.hidden_size, config.num_labels, params_dtype=head_dtype, bias=True
+        )
 
     def forward(self, x, **kwargs):
         x, _ = self.dense(x)
@@ -49,7 +46,6 @@ class JinaVLScorer(nn.Module):
 
 
 class JinaVLMultiModalProcessor(Qwen2VLMultiModalProcessor):
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -57,25 +53,26 @@ class JinaVLMultiModalProcessor(Qwen2VLMultiModalProcessor):
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-
         # NOTE: We should reverse the order of the mm_data because the
         # query prompt is placed after the document prompt in the score
         # template for JinaVLForRanking model, but in mm_data they are
         # stored in the opposite order (query first, then document).
         for _, value in mm_data.items():
             value.reverse()
-        return super()._call_hf_processor(prompt, mm_data, mm_kwargs,
-                                          tok_kwargs)
-
-
-@MULTIMODAL_REGISTRY.register_processor(JinaVLMultiModalProcessor,
-                                        info=Qwen2VLProcessingInfo,
-                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
-class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
-                                      SupportsCrossEncoding,
-                                      SupportsMultiModal,
-                                      SupportsScoreTemplate):
-
+        return super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    JinaVLMultiModalProcessor,
+    info=Qwen2VLProcessingInfo,
+    dummy_inputs=Qwen2VLDummyInputsBuilder,
+)
+class JinaVLForSequenceClassification(
+    Qwen2VLForConditionalGeneration,
+    SupportsCrossEncoding,
+    SupportsMultiModal,
+    SupportsScoreTemplate,
+):
     is_pooling_model = True
     weight_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -87,47 +84,53 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
             # mapping for original checkpoint
             "lm_head.": "language_model.lm_head.",
             "model.": "language_model.model.",
-        })
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=maybe_prefix(prefix, "qwen2_vl"))
+        super().__init__(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "qwen2_vl")
+        )
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
         self.score = JinaVLScorer(vllm_config.model_config)
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            Pooler.for_classify(pooler_config, classifier=self.score),
-            "score":
-            Pooler.for_classify(pooler_config, classifier=self.score),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.score
+                ),
+                "classify": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="classify"
+                ),
+                "score": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="score"
+                ),
+            }
+        )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
 
         raise ValueError("Only image modality is supported")
 
     @classmethod
-    def get_score_template(cls, query: str, document: str) -> Optional[str]:
+    def get_score_template(cls, query: str, document: str) -> str | None:
         return f"**Document**:\n{document}\n**Query**:\n{query}"
 
     @classmethod
     def post_process_tokens(cls, prompt: TokensPrompt) -> None:
-
         # add score target token at the end of prompt tokens
-        prompt['prompt_token_ids'].append(100)
+        prompt["prompt_token_ids"].append(100)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         hidden_states = super().forward(
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 3b6fdba225122beae883ff4833f68b1775df0dc8..1eb0eccc0411c424a5381932d373b58e2f7cbc24 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -4,7 +4,7 @@ import math
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Literal, Optional, TypeVar, Union
+from typing import Annotated, Any, Literal, TypeAlias, TypeVar
 
 import numpy as np
 import torch
@@ -13,44 +13,71 @@ from einops import rearrange
 from transformers import PretrainedConfig
 from transformers.activations import GELUActivation
 from transformers.feature_extraction_utils import BatchFeature
-from transformers.modeling_outputs import (BaseModelOutput,
-                                           BaseModelOutputWithPooling)
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from transformers.utils import torch_int
 
-from vllm.attention.layer import check_upstream_fa_availability
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import (
+    maybe_get_vit_flash_attn_backend,
+)
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
-from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, VideoItem)
-from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
-                                   ModalityDataItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    ImageItem,
+    ModalityData,
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import (
+    DictEmbeddingItems,
+    ImageSize,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.platforms import _Backend
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from .siglip import SiglipMLP
-from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, is_pp_missing_parameter,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    is_pp_missing_parameter,
+    maybe_prefix,
+)
 from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
@@ -82,8 +109,10 @@ def smart_resize(
         width = factor
 
     if max(height, width) / min(height, width) > 200:
-        raise ValueError("absolute aspect ratio must be smaller than 200, got "
-                         "{max(height, width) / min(height, width)}")
+        raise ValueError(
+            "absolute aspect ratio must be smaller than 200, got "
+            "{max(height, width) / min(height, width)}"
+        )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
     if h_bar * w_bar > max_pixels:
@@ -100,17 +129,17 @@ def smart_resize(
 class KeyeImagePixelInputs(TensorSchema):
     """
     Dimensions:
-        - b: Batch size
-        - np: Number of patches
+        - bnp: Batch size * Number of patches
         - c: Number of channels
         - ps: Patch size
         - ni: Number of images
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["pixel_values"]
     pixel_values: Annotated[
-        torch.Tensor,
-        TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})]
+        torch.Tensor, TensorShape("bnp", 3, "ps", "ps", dynamic_dims={"bnp"})
+    ]
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
@@ -118,33 +147,34 @@ class KeyeImageEmbeddingInputs(TensorSchema):
     """
     Dimensions:
         - nf: Number of image features
-        - hs: Hidden size (must match the hidden size of language model 
+        - hs: Hidden size (must match the hidden size of language model
           backbone)
         - ni: Number of images
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["image_embeds"]
     image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
-KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs]
+KeyeImageInputs: TypeAlias = KeyeImagePixelInputs | KeyeImageEmbeddingInputs
 
 
 class KeyeVideoPixelInputs(TensorSchema):
     """
     Dimensions:
-        - b: Batch size
-        - np: Number of patches
+        - bnp: Batch size * Number of patches
         - c: Number of channels
         - ps: Patch size
         - ni: Number of images
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["pixel_values_videos"]
     pixel_values_videos: Annotated[
-        torch.Tensor,
-        TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})]
+        torch.Tensor, TensorShape("bnp", 3, "ps", "ps", dynamic_dims={"bnp"})
+    ]
     video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
 
 
@@ -152,21 +182,21 @@ class KeyeVideoEmbeddingInputs(TensorSchema):
     """
     Dimensions:
         - nf: Number of video features
-        - hs: Hidden size (must match the hidden size of language model 
+        - hs: Hidden size (must match the hidden size of language model
           backbone)
         - nv: Number of videos
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["video_embeds"]
     video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
     video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
 
 
-KeyeVideoInputs = Union[KeyeVideoPixelInputs, KeyeVideoEmbeddingInputs]
+KeyeVideoInputs: TypeAlias = KeyeVideoPixelInputs | KeyeVideoEmbeddingInputs
 
 
 class KeyeVisionEmbeddings(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
@@ -182,12 +212,11 @@ class KeyeVisionEmbeddings(nn.Module):
             padding="valid",
         )
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.cache_position_embedding = dict()
         self.cache_position_count = dict()
-        self.position_embedding = nn.Embedding(self.num_positions,
-                                               self.embed_dim)
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
         self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
 
         self.register_buffer(
@@ -203,7 +232,6 @@ class KeyeVisionEmbeddings(nn.Module):
         width: int,
         is_after_patchify: bool = False,
     ) -> torch.Tensor:
-
         num_positions = self.position_embedding.weight.shape[0]
 
         patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
@@ -218,8 +246,9 @@ class KeyeVisionEmbeddings(nn.Module):
             new_width = width // self.patch_size
 
         sqrt_num_positions = torch_int(num_positions**0.5)
-        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions,
-                                                  sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
+        )
         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
 
         patch_pos_embed = nn.functional.interpolate(
@@ -232,11 +261,7 @@ class KeyeVisionEmbeddings(nn.Module):
         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
         return patch_pos_embed
 
-    def fetch_position_embedding_lfu_cache(self,
-                                           embeddings,
-                                           h,
-                                           w,
-                                           max_cache: int = 20):
+    def fetch_position_embedding_lfu_cache(self, embeddings, h, w, max_cache: int = 20):
         grid = (h, w)
         if grid in self.cache_position_embedding:
             self.cache_position_count[grid] += 1
@@ -250,8 +275,7 @@ class KeyeVisionEmbeddings(nn.Module):
             self.cache_position_count.pop(min_hit_grid)
             self.cache_position_embedding.pop(min_hit_grid)
 
-        position_embedding = self.interpolate_pos_encoding(
-            embeddings, h, w, True)
+        position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
         self.cache_position_count[grid] = 1
         self.cache_position_embedding[grid] = position_embedding
         return position_embedding
@@ -259,11 +283,9 @@ class KeyeVisionEmbeddings(nn.Module):
     def forward(
         self,
         pixel_values: torch.FloatTensor,
-        position_ids: Optional[torch.Tensor] = None,
-        image_grid_thw: Optional[list[Union[
-            tuple[int, int, int],
-            list[tuple[int, int, int]],
-        ]]] = None,
+        position_ids: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
         interpolate_pos_encoding=False,
     ) -> torch.Tensor:
         if pixel_values.dim() == 4:
@@ -282,8 +304,7 @@ class KeyeVisionEmbeddings(nn.Module):
             ) = pixel_values.shape
             target_dtype = self.patch_embedding.weight.dtype
             pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
-            patch_embeds = self.patch_embedding(
-                pixel_values.to(dtype=target_dtype))
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
             embeddings = patch_embeds.flatten(-2).squeeze(-1)
 
             if interpolate_pos_encoding and image_grid_thw is not None:
@@ -293,19 +314,23 @@ class KeyeVisionEmbeddings(nn.Module):
                     t, h, w = image_grid
                     end = start + t * h * w
                     image_embeddings = embeddings[start:end, :]
-                    position_embedding = (self.interpolate_pos_encoding(
-                        image_embeddings, h, w, True).squeeze(0).repeat(t, 1))
+                    position_embedding = (
+                        self.interpolate_pos_encoding(image_embeddings, h, w, True)
+                        .squeeze(0)
+                        .repeat(t, 1)
+                    )
                     image_embeddings = image_embeddings + position_embedding
                     tmp_embeddings.append(image_embeddings)
                     start = end
                 embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0)
             else:
-                embeddings = embeddings + self.packing_position_embedding(
-                    position_ids)
+                embeddings = embeddings + self.packing_position_embedding(position_ids)
             return embeddings
         else:
-            raise ValueError("Unsupported pixel_values dimension:"
-                             f" {pixel_values.dim()}. Expected 4 or 5.")
+            raise ValueError(
+                "Unsupported pixel_values dimension:"
+                f" {pixel_values.dim()}. Expected 4 or 5."
+            )
 
 
 def apply_rotary_pos_emb_flashatt(
@@ -317,7 +342,17 @@ def apply_rotary_pos_emb_flashatt(
     cos = cos.chunk(2, dim=-1)[0].contiguous()
     sin = sin.chunk(2, dim=-1)[0].contiguous()
 
-    from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+    if current_platform.is_cuda():
+        from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+    elif current_platform.is_rocm():
+        from flash_attn.ops.triton.rotary import apply_rotary as apply_rotary_emb
+    else:
+        # For other platforms, use PyTorch fallback
+        from vllm.model_executor.layers.rotary_embedding.common import (
+            apply_rotary_emb_torch,
+        )
+
+        apply_rotary_emb = partial(apply_rotary_emb_torch, is_neox_style=True)
 
     q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q)
     k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k)
@@ -331,8 +366,9 @@ class KeyeSiglipAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
@@ -372,26 +408,40 @@ class KeyeSiglipAttention(nn.Module):
 
         # Detect attention implementation.
         self.attn_backend = get_vit_attn_backend(
-            head_size=self.head_dim, dtype=torch.get_default_dtype())
+            head_size=self.head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
 
-        self.use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(
-                torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            self.use_upstream_fa = True
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                use_upstream_fa=False,
+                attn_backend_override=attn_backend_override,
+            )
+        )
 
-        if self.attn_backend not in {_Backend.FLASH_ATTN, _Backend.XFORMERS}:
+        if self.attn_backend not in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }:
             raise RuntimeError(
-                f"Keye-VL does not support {self.attn_backend} backend now.")
+                f"Keye-VL does not support {self.attn_backend} backend now."
+            )
+
+        self.is_flash_attn_backend = self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        rope_emb: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask: torch.Tensor | None = None,
+        output_attentions: bool | None = False,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        rope_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split(
@@ -417,8 +467,7 @@ class KeyeSiglipAttention(nn.Module):
             )
         else:
             if cu_seqlens is None:
-                raise ValueError(
-                    "cu_seqlens cannot be None when rope_emb is not None.")
+                raise ValueError("cu_seqlens cannot be None when rope_emb is not None.")
             cos, sin = rope_emb
             q = q.view(*q.shape[:-1], self.num_heads, self.head_dim)
             k = k.view(
@@ -433,15 +482,10 @@ class KeyeSiglipAttention(nn.Module):
                 self.head_dim,
             )
 
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            if self.use_upstream_fa:
-                from flash_attn import flash_attn_varlen_func
-            else:
-                from vllm.vllm_flash_attn import flash_attn_varlen_func
-
+        if self.is_flash_attn_backend:
             q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
 
-            output = flash_attn_varlen_func(
+            output = self.flash_attn_varlen_func(
                 q,
                 k,
                 v,
@@ -452,29 +496,26 @@ class KeyeSiglipAttention(nn.Module):
                 causal=False,
                 softmax_scale=self.scale,
             )
-            context_layer = rearrange(output,
-                                      "(b s) ... -> b s ...",
-                                      b=batch_size)
-        elif self.attn_backend == _Backend.XFORMERS:
+            context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             from xformers import ops as xops
             from xformers.ops.fmha.attn_bias import BlockDiagonalMask
 
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
+            attn_bias = BlockDiagonalMask.from_seqlens(
+                q_seqlen=seqlens, kv_seqlen=None, device=q.device
+            )
 
             context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
+                q, k, v, attn_bias=attn_bias, p=0, scale=None
+            )
 
-        context_layer = rearrange(context_layer,
-                                  "b s h d -> b s (h d)").contiguous()
+        context_layer = rearrange(context_layer, "b s h d -> b s (h d)").contiguous()
 
         output, _ = self.out_proj(context_layer)
         return output
 
 
 class SigLIPRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
         self.dim = dim
@@ -482,8 +523,9 @@ class SigLIPRotaryEmbedding(nn.Module):
         self.rope_init()
 
     def rope_init(self):
-        inv_freq = 1.0 / (self.theta**(
-            torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim))
+        inv_freq = 1.0 / (
+            self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
+        )
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
     def forward(self, seqlen: int) -> torch.Tensor:
@@ -497,24 +539,23 @@ class SigLIPRotaryEmbedding(nn.Module):
 
 
 class KeyeSiglipEncoderLayer(nn.Module):
-
     def __init__(
         self,
-        config: Union[PretrainedConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.embed_dim = config.hidden_size
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.self_attn = KeyeSiglipAttention(
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
+            attn_backend_override=attn_backend_override,
         )
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = SiglipMLP(
             config,
             quant_config=quant_config,
@@ -525,11 +566,10 @@ class KeyeSiglipEncoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
-        output_attentions: Optional[bool] = False,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        rope_emb: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        output_attentions: bool | None = False,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        rope_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> tuple[torch.FloatTensor]:
-
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
@@ -553,25 +593,29 @@ class KeyeSiglipEncoderLayer(nn.Module):
 
 
 class KeyeSiglipEncoder(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
         embed_dim = config.hidden_size
         num_heads = config.num_attention_heads
         head_dim = embed_dim // num_heads
-        self.layers = nn.ModuleList([
-            KeyeSiglipEncoderLayer(
-                config,
-                quant_config=quant_config,
-                prefix=f"{prefix}.layers.{layer_idx}",
-            ) for layer_idx in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                KeyeSiglipEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    attn_backend_override=attn_backend_override,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
         self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
 
     @staticmethod
@@ -587,18 +631,16 @@ class KeyeSiglipEncoder(nn.Module):
     def forward(
         self,
         inputs_embeds,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        image_grid_thw: Optional[list[Union[
-            tuple[int, int, int],
-            list[tuple[int, int, int]],
-        ]]] = None,
-        height_position_ids: Optional[torch.Tensor] = None,
-        width_position_ids: Optional[torch.Tensor] = None,
-        use_rope: Optional[bool] = False,
-        window_size: Optional[bool] = -1,
+        attention_mask: torch.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+        use_rope: bool | None = False,
+        window_size: bool | None = -1,
         vision_or_text: str = "vision",
     ) -> BaseModelOutput:
         device = inputs_embeds.device
@@ -610,8 +652,7 @@ class KeyeSiglipEncoder(nn.Module):
                 split_hids = list()
                 split_wids = list()
                 for t, h, w in flatten_image_grid_thw:
-                    image_pids = torch.arange(t * h * w,
-                                              device=device) % (h * w)
+                    image_pids = torch.arange(t * h * w, device=device) % (h * w)
                     sample_hids = image_pids // w
                     sample_wids = image_pids % w
                     split_hids.append(sample_hids)
@@ -647,12 +688,12 @@ class KeyeSiglipEncoder(nn.Module):
 
 
 class KeyeSiglipVisionTransformer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
@@ -663,34 +704,31 @@ class KeyeSiglipVisionTransformer(nn.Module):
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.encoder",
+            attn_backend_override=attn_backend_override,
         )
-        self.post_layernorm = nn.LayerNorm(embed_dim,
-                                           eps=config.layer_norm_eps)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
     def forward(
         self,
         pixel_values,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        interpolate_pos_encoding: Optional[bool] = False,
-        attention_mask: Optional[torch.Tensor] = None,
-        sample_indices: Optional[torch.Tensor] = None,
-        image_indices: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        height_position_ids: Optional[torch.Tensor] = None,
-        width_position_ids: Optional[torch.Tensor] = None,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        vision_return_embed_list: Optional[bool] = False,
-        image_grid_thw: Optional[list[Union[
-            tuple[int, int, int],
-            list[tuple[int, int, int]],
-        ]]] = None,
-        return_pooler_output: Optional[bool] = True,
-        use_rope: Optional[bool] = False,
-        window_size: Optional[bool] = -1,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        interpolate_pos_encoding: bool | None = False,
+        attention_mask: torch.Tensor | None = None,
+        sample_indices: torch.Tensor | None = None,
+        image_indices: torch.Tensor | None = None,
+        position_ids: torch.Tensor | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        padding_mask: torch.Tensor | None = None,
+        vision_return_embed_list: bool | None = False,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        return_pooler_output: bool | None = True,
+        use_rope: bool | None = False,
+        window_size: bool | None = -1,
     ) -> BaseModelOutputWithPooling:
-
         hidden_states = self.embeddings(
             pixel_values,
             interpolate_pos_encoding=interpolate_pos_encoding,
@@ -716,8 +754,10 @@ class KeyeSiglipVisionTransformer(nn.Module):
 
         sample_hidden_state = list()
         if cu_seqlens is None:
-            raise ValueError("cu_seqlens cannot be None for "
-                             "SiglipVisionTransformer output processing.")
+            raise ValueError(
+                "cu_seqlens cannot be None for "
+                "SiglipVisionTransformer output processing."
+            )
         for i in range(cu_seqlens.shape[0] - 1):
             start = cu_seqlens[i]
             end = cu_seqlens[i + 1]
@@ -734,8 +774,9 @@ class KeyeSiglipVisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
 
@@ -743,6 +784,7 @@ class KeyeSiglipVisionModel(nn.Module):
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.vision_model",
+            attn_backend_override=attn_backend_override,
         )
         self.quant_config = quant_config
 
@@ -760,22 +802,19 @@ class KeyeSiglipVisionModel(nn.Module):
     def forward(
         self,
         pixel_values,
-        sample_indices: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
+        sample_indices: torch.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
         interpolate_pos_encoding: bool = False,
-        position_ids: Optional[torch.Tensor] = None,
-        vision_return_embed_list: Optional[bool] = False,
-        image_grid_thw: Optional[list[Union[
-            tuple[int, int, int],
-            list[tuple[int, int, int]],
-        ]]] = None,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        return_pooler_output: Optional[bool] = True,
-        use_rope: Optional[bool] = False,
-        window_size: Optional[bool] = -1,
+        position_ids: torch.Tensor | None = None,
+        vision_return_embed_list: bool | None = False,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        return_pooler_output: bool | None = True,
+        use_rope: bool | None = False,
+        window_size: bool | None = -1,
     ) -> BaseModelOutputWithPooling:
-
         return self.vision_model(
             pixel_values=pixel_values,
             output_attentions=output_attentions,
@@ -791,8 +830,7 @@ class KeyeSiglipVisionModel(nn.Module):
             window_size=window_size,
         )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -808,22 +846,24 @@ class KeyeSiglipVisionModel(nn.Module):
             if "head.mlp" in name or "head.probe" in name:
                 continue
             if self.quant_config is not None and (
-                    scale_name := self.quant_config.get_cache_scale(name)):
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 param = params_dict[scale_name]
                 weight_loader = getattr(
                     param,
                     "weight_loader",
                     default_weight_loader,
                 )
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
             for (
-                    param_name,
-                    weight_name,
-                    shard_id,
+                param_name,
+                weight_name,
+                shard_id,
             ) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -856,12 +896,11 @@ class KeyeSiglipVisionModel(nn.Module):
 
 
 class Projector(nn.Module):
-
     def __init__(
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -869,12 +908,13 @@ class Projector(nn.Module):
         self.vision_config = vision_config
         self.merge_kernel_size = (2, 2)
 
-        self.hidden_size = (self.vision_config.hidden_size *
-                            self.merge_kernel_size[0] *
-                            self.merge_kernel_size[1])
+        self.hidden_size = (
+            self.vision_config.hidden_size
+            * self.merge_kernel_size[0]
+            * self.merge_kernel_size[1]
+        )
 
-        self.pre_norm = torch.nn.LayerNorm(self.vision_config.hidden_size,
-                                           eps=1e-05)
+        self.pre_norm = torch.nn.LayerNorm(self.vision_config.hidden_size, eps=1e-05)
         self.act = GELUActivation()
 
         self.linear_1 = ColumnParallelLinear(
@@ -894,14 +934,13 @@ class Projector(nn.Module):
 
     def forward(
         self,
-        image_features: Union[torch.Tensor, list[torch.Tensor]],
+        image_features: torch.Tensor | list[torch.Tensor],
         image_grid_thw: list[tuple[int, int, int]],
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         m1, m2 = self.merge_kernel_size
         if isinstance(image_features, (list, tuple)):
             processed_features = list()
-            for image_feature, image_grid in zip(image_features,
-                                                 image_grid_thw):
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
                 image_feature = self.pre_norm(image_feature)
                 t, h, w = image_grid
 
@@ -924,8 +963,7 @@ class Projector(nn.Module):
         dims = image_features.shape[:-1]
         dim = image_features.shape[-1]
         image_features = image_features.view(np.prod(dims), dim)
-        hidden_states = self.pre_norm(image_features).view(
-            -1, self.hidden_size)
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
         hidden_states = self.linear_1(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_2(hidden_states)
@@ -933,7 +971,9 @@ class Projector(nn.Module):
         return hidden_states.view(*dims, -1)
 
 
-def _keye_field_config(hf_inputs: Mapping[str, torch.Tensor], ):
+def _keye_field_config(
+    hf_inputs: Mapping[str, torch.Tensor],
+):
     image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
     image_grid_sizes = image_grid_thw.prod(-1)
 
@@ -941,24 +981,21 @@ def _keye_field_config(hf_inputs: Mapping[str, torch.Tensor], ):
     video_grid_sizes = video_grid_thw.prod(-1)
 
     return dict(
-        pixel_values=MultiModalFieldConfig.flat_from_sizes(
-            "image", image_grid_sizes),
-        image_embeds=MultiModalFieldConfig.flat_from_sizes(
-            "image", image_grid_sizes),
+        pixel_values=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
+        image_embeds=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
         image_grid_thw=MultiModalFieldConfig.batched("image"),
         pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-            "video", video_grid_sizes),
-        video_embeds=MultiModalFieldConfig.flat_from_sizes(
-            "video", video_grid_sizes),
+            "video", video_grid_sizes
+        ),
+        video_embeds=MultiModalFieldConfig.flat_from_sizes("video", video_grid_sizes),
         video_grid_thw=MultiModalFieldConfig.batched("video"),
     )
 
 
 class KeyeMultiModalDataParser(MultiModalDataParser):
-
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -975,7 +1012,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -992,17 +1029,18 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
 
 
 class KeyeProcessingInfo(BaseProcessingInfo):
-
     def get_max_image_size(self) -> int:
-        return 9999999  #_MAX_IMAGE_SIZE
+        return 9999999  # _MAX_IMAGE_SIZE
 
     def get_max_frame_per_video(self) -> int:
-        return 16  #_MAX_FRAMES_PER_VIDEO
+        return 16  # _MAX_FRAMES_PER_VIDEO
 
     def get_image_processor(self, **kwargs: object):
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(
+        self,
+    ) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_mm_max_tokens_per_item(
@@ -1041,11 +1079,9 @@ class KeyeProcessingInfo(BaseProcessingInfo):
                 min_pixels=image_processor.min_pixels,
                 max_pixels=image_processor.max_pixels,
             )
-            preprocessed_size = ImageSize(width=resized_width,
-                                          height=resized_height)
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
-            preprocessed_size = ImageSize(width=image_width,
-                                          height=image_height)
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
 
         padded_num_frames = num_frames + num_frames % temporal_patch_size
 
@@ -1088,7 +1124,9 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         )
         return num_video_tokens
 
-    def get_image_size_with_most_features(self, ) -> ImageSize:
+    def get_image_size_with_most_features(
+        self,
+    ) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
             image_width=self.get_max_image_size(),
             image_height=self.get_max_image_size(),
@@ -1132,8 +1170,7 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         max_videos = mm_config.get_limit_per_prompt("video")
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens)
         max_frames_per_video = min(
             max_total_frames // max(max_videos, 1),
             self.get_max_frame_per_video(),
@@ -1156,7 +1193,6 @@ _I = TypeVar("_I", bound=KeyeProcessingInfo)
 
 
 class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1171,40 +1207,40 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = (
-            self.info.get_image_size_with_most_features())
-        target_num_frames = self.info.get_num_frames_with_most_features(
-            seq_len)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(seq_len)
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
 
         mm_data = {
-            "image":
-            self._get_dummy_images(
+            "image": self._get_dummy_images(
                 width=target_width,
                 height=target_height,
                 num_images=num_images,
+                overrides=image_overrides,
             ),
-            "video":
-            self._get_dummy_videos(
+            "video": self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
+                overrides=video_overrides,
             ),
         }
 
         return mm_data
 
 
-class KeyeDummyInputsBuilder(KeyeBaseDummyInputsBuilder[KeyeProcessingInfo]):
-    ...
+class KeyeDummyInputsBuilder(KeyeBaseDummyInputsBuilder[KeyeProcessingInfo]): ...
 
 
 class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
-
     def _get_data_parser(self) -> MultiModalDataParser:
         return KeyeMultiModalDataParser()
 
@@ -1215,8 +1251,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()
 
@@ -1240,7 +1275,8 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
                 modality=modality,
                 target=[placeholder[modality]],
                 replacement=partial(get_replacement_keye, modality=modality),
-            ) for modality in ("image", "video")
+            )
+            for modality in ("image", "video")
         ]
 
     def _get_mm_fields_config(
@@ -1252,6 +1288,8 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
 
 
 class BaseKeyeModule(nn.Module):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -1264,13 +1302,15 @@ class BaseKeyeModule(nn.Module):
         ],
     }
 
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "lm_head.": "language_model.lm_head.",
-        "model.": "language_model.model.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1287,10 +1327,16 @@ class BaseKeyeModule(nn.Module):
         self.config = config
         self.multimodal_config = multimodal_config
 
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
         self.visual = KeyeSiglipVisionModel(
             config.vision_config,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "visual"),
+            attn_backend_override=attn_backend_override,
         )
 
         self.mlp_AR = self._build_projector(
@@ -1307,18 +1353,20 @@ class BaseKeyeModule(nn.Module):
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     @abstractmethod
-    def _build_projector(self,
-                         text_config: PretrainedConfig,
-                         vision_config: PretrainedConfig,
-                         quant_config: Optional[QuantizationConfig] = None,
-                         prefix: str = "") -> nn.Module:
+    def _build_projector(
+        self,
+        text_config: PretrainedConfig,
+        vision_config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
         raise ValueError("Need projector")
 
-    def _process_image_input(self,
-                             image_input: Any) -> tuple[torch.Tensor, ...]:
+    def _process_image_input(self, image_input: Any) -> tuple[torch.Tensor, ...]:
         siglip_position_ids = list()
         image_grid_hws = list()
         sample_indices = list()
@@ -1333,21 +1381,22 @@ class BaseKeyeModule(nn.Module):
             image_grid_hws.append(thw_tuple)
             image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
             siglip_position_ids.append(image_position_ids)
-            sample_indices.append(torch.full((numel, ), idx,
-                                             dtype=torch.int64))
+            sample_indices.append(torch.full((numel,), idx, dtype=torch.int64))
             cu_seqlens.append(cu_seqlens[-1] + numel)
 
         if image_input["type"] == "image_embeds":
             raise ValueError(
-                "Image embeddings are not supported for this processing path.")
+                "Image embeddings are not supported for this processing path."
+            )
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            siglip_position_ids = torch.concat(siglip_position_ids,
-                                               dim=0).to(pixel_values.device)
+            siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
+                pixel_values.device
+            )
             cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
-                pixel_values.device)
-            sample_indices = torch.concat(sample_indices,
-                                          dim=0).to(pixel_values.device)
+                pixel_values.device
+            )
+            sample_indices = torch.concat(sample_indices, dim=0).to(pixel_values.device)
 
             image_embeds = self.visual(
                 pixel_values=pixel_values,
@@ -1367,8 +1416,8 @@ class BaseKeyeModule(nn.Module):
         self,
         video_type: Literal["video_embeds", "pixel_values_videos"],
         video_grid_thw: list[torch.Tensor],
-        pixel_values_videos: Optional[torch.Tensor] = None
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        pixel_values_videos: torch.Tensor | None = None,
+    ) -> torch.Tensor | list[torch.Tensor]:
         siglip_position_ids = list()
         video_grid_hws = list()
         sample_indices = list()
@@ -1382,21 +1431,24 @@ class BaseKeyeModule(nn.Module):
             video_grid_hws.append(thw_tuple)
             video_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
             siglip_position_ids.append(video_position_ids)
-            sample_indices.append(torch.full((numel, ), idx,
-                                             dtype=torch.int64))
+            sample_indices.append(torch.full((numel,), idx, dtype=torch.int64))
             cu_seqlens.append(cu_seqlens[-1] + numel)
 
         if video_type == "video_embeds":
             raise ValueError(
-                "Video embeddings are not supported for this processing path.")
+                "Video embeddings are not supported for this processing path."
+            )
         else:
             pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
             siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
-                pixel_values_videos.device)
+                pixel_values_videos.device
+            )
             cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
-                pixel_values_videos.device)
-            sample_indices = torch.concat(sample_indices,
-                                          dim=0).to(pixel_values_videos.device)
+                pixel_values_videos.device
+            )
+            sample_indices = torch.concat(sample_indices, dim=0).to(
+                pixel_values_videos.device
+            )
 
             video_embeds = self.visual(
                 pixel_values=pixel_values_videos,
@@ -1416,23 +1468,23 @@ class BaseKeyeModule(nn.Module):
         modalities = {}
 
         for input_key in kwargs:
-            if (input_key in ("pixel_values", "image_embeds")
-                    and "images" not in modalities):
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if (input_key in ("pixel_values_videos", "video_embeds")
-                    and "videos" not in modalities):
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "videos" not in modalities
+            ):
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return None
@@ -1442,66 +1494,22 @@ class BaseKeyeModule(nn.Module):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                [
-                    self.config.image_token_id,
-                    self.config.video_token_id,
-                ],
-            )
-        return inputs_embeds
-
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Any] = None,
-        video_input: Optional[Any] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Keye-VL.
 
         Args:
@@ -1518,23 +1526,6 @@ class BaseKeyeModule(nn.Module):
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input,
-                )
-                input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
@@ -1547,11 +1538,10 @@ class BaseKeyeModule(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -1569,40 +1559,21 @@ class BaseKeyeModule(nn.Module):
     info=KeyeProcessingInfo,
     dummy_inputs=KeyeDummyInputsBuilder,
 )
-class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
-                                   SupportsLoRA, SupportsPP):
-
-    def _build_projector(self,
-                         text_config: PretrainedConfig,
-                         vision_config: PretrainedConfig,
-                         quant_config: Optional[QuantizationConfig] = None,
-                         prefix: str = "") -> nn.Module:
+class KeyeForConditionalGeneration(
+    BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    def _build_projector(
+        self,
+        text_config: PretrainedConfig,
+        vision_config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
         return Projector(text_config, vision_config, quant_config, prefix)
 
-    def _validate_and_reshape_mm_tensor(
-            self, mm_input: NestedTensors,
-            name: str) -> Union[torch.Tensor, list[torch.Tensor]]:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim == 5:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return mm_input.reshape(-1, mm_input.shape[-1])
-        elif is_list_of(mm_input, torch.Tensor):
-            if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2
-                                                          for p in mm_input):
-                return mm_input
-        return torch.concat(mm_input)
-
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[KeyeImageInputs]:
+        self, **kwargs: object
+    ) -> KeyeImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1611,11 +1582,6 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
             return KeyeImagePixelInputs(
                 type="pixel_values",
                 pixel_values=pixel_values,
@@ -1623,11 +1589,6 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
             return KeyeImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
@@ -1635,7 +1596,8 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[KeyeVideoInputs]:
+        self, **kwargs: object
+    ) -> KeyeVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1644,13 +1606,6 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos,
-                "video pixel values",
-            )
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return KeyeVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -1658,11 +1613,6 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return KeyeVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,
@@ -1670,11 +1620,142 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             )
 
     def _process_video_input(
-            self, video_input: KeyeVideoInputs) -> tuple[torch.Tensor, ...]:
+        self, video_input: KeyeVideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         video_type = video_input["type"]
         video_grid_thw = video_input["video_grid_thw"]
         pixel_values_videos = video_input.get("pixel_values_videos", None)
 
         return tuple(
-            self._process_video_embeds(video_type, video_grid_thw,
-                                       pixel_values_videos))
+            self._process_video_embeds(video_type, video_grid_thw, pixel_values_videos)
+        )
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
+            video_grid_thw = video_grid_thw[0]
+
+        def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
+            """
+            Split grid_thw along the t dimension.
+
+            Args:
+                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].
+
+            Returns:
+                List of [1, h, w] rows, repeated t times for each original row.
+            """
+
+            if isinstance(grid_thw, list):
+                grid_thw = torch.tensor(grid_thw, dtype=torch.long)
+
+            if grid_thw.numel() == 0:
+                return []
+
+            t, hw = grid_thw[:, 0], grid_thw[:, 1:]
+            ones = torch.ones_like(hw[:, :1])  # [N,1]
+            out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0)
+            return out.tolist()
+
+        video_grid_thw = split_thw(video_grid_thw)
+
+        hf_config = self.config
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        image_nums = len(image_grid_thw)
+        frame_nums = len(video_grid_thw)
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_frames = image_nums, frame_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + frame_nums):
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_frames > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+
+            if ed_image < ed_video:
+                t, h, w = image_grid_thw[image_index]
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = video_grid_thw[video_index]
+                video_index += 1
+                remain_frames -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+
+        return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
index 93a3bf5f98f7baee22768949b28d357f91dc6e6f..124e9c2afa21713c304dd8724ce76173b88eec6f 100644
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -3,7 +3,7 @@
 import itertools
 from collections.abc import Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -15,22 +15,37 @@ from transformers.feature_extraction_utils import BatchFeature
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
-from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalFieldConfig,
-                                    MultiModalKwargsItems, VideoItem)
-from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
-                                   MultiModalDataItems, MultiModalDataParser)
-from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    ImageItem,
+    ModalityData,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import (
+    DictEmbeddingItems,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
-from .keye import (BaseKeyeModule, BaseMultiModalProcessor,
-                   KeyeBaseDummyInputsBuilder, KeyeProcessingInfo)
+from .interfaces import SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP
+from .keye import (
+    BaseKeyeModule,
+    BaseMultiModalProcessor,
+    KeyeBaseDummyInputsBuilder,
+    KeyeProcessingInfo,
+)
 
 logger = init_logger(__name__)
 
@@ -58,8 +73,9 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
     return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0)
 
 
-def get_num_patches(grid_thw: torch.Tensor,
-                    num_frames: Union[list[int], torch.Tensor]) -> list[int]:
+def get_num_patches(
+    grid_thw: torch.Tensor, num_frames: list[int] | torch.Tensor
+) -> list[int]:
     """
     Return num_patches per video.
 
@@ -73,9 +89,13 @@ def get_num_patches(grid_thw: torch.Tensor,
 
     Examples:
         >>> # Suppose there are 2 videos with a total of 3 grids
-        >>> grid_thw = torch.tensor([[2, 2, 2],  # grid 0: 2*2*2=8 patches
-        ...                          [2, 2, 2],  # grid 1: 2*2*2=8 patches
-        ...                          [1, 1, 1]]) # grid 2: 1*1*1=1 patches
+        >>> grid_thw = torch.tensor(
+        ...     [
+        ...         [2, 2, 2],  # grid 0: 2*2*2=8 patches
+        ...         [2, 2, 2],  # grid 1: 2*2*2=8 patches
+        ...         [1, 1, 1],
+        ...     ]
+        ... )  # grid 2: 1*1*1=1 patches
         >>> num_frames = [2, 1]  # The first video contains 2 grids,
                                    the second contains 1 grid.
         >>> get_num_patches(grid_thw, num_frames)
@@ -90,28 +110,31 @@ def get_num_patches(grid_thw: torch.Tensor,
     num_grids_per_frame = grid_thw.prod(dim=1)
     start_idx_per_video = [0, *itertools.accumulate(num_frames)]
     num_patches = [
-        num_grids_per_frame[start_idx_per_video[i]:start_idx_per_video[i + 1]].
-        sum() for i in range(len(num_frames))
+        num_grids_per_frame[start_idx_per_video[i] : start_idx_per_video[i + 1]].sum()
+        for i in range(len(num_frames))
     ]
-    return torch.stack(num_patches) if num_patches else torch.zeros(
-        0, dtype=grid_thw.dtype, device=grid_thw.device)
+    return (
+        torch.stack(num_patches)
+        if num_patches
+        else torch.zeros(0, dtype=grid_thw.dtype, device=grid_thw.device)
+    )
 
 
 class KeyeVL1_5ImagePixelInputs(TensorSchema):
     """
     Dimensions:
-        - b: Batch size
-        - np: Number of patches
+        - bnp: Batch size * Number of patches
         - c: Number of channels
         - ps: Patch size
         - ni: Number of images
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["pixel_values"]
 
     pixel_values: Annotated[
-        torch.Tensor,
-        TensorShape("np", 3, "ps", "ps", dynamic_dims={"np"})]
+        torch.Tensor, TensorShape("bnp", 3, "ps", "ps", dynamic_dims={"bnp"})
+    ]
 
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
@@ -125,29 +148,31 @@ class KeyeVL1_5ImageEmbeddingInputs(TensorSchema):
         - ni: Number of images
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["image_embeds"]
     image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
-KeyeVL1_5ImageInputs = Union[KeyeVL1_5ImagePixelInputs,
-                             KeyeVL1_5ImageEmbeddingInputs]
+KeyeVL1_5ImageInputs: TypeAlias = (
+    KeyeVL1_5ImagePixelInputs | KeyeVL1_5ImageEmbeddingInputs
+)
 
 
 class KeyeVL1_5VideoPixelInputs(TensorSchema):
     """
     Dimensions:
-        - b: Batch size
-        - np: Number of patches
+        - bnp: Batch size * Number of patches
         - c: Number of channels
         - ps: Patch size
         - ni: Number of images
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["pixel_values_videos"]
     pixel_values_videos: Annotated[
-        torch.Tensor,
-        TensorShape("np", 3, "ps", "ps", dynamic_dims={"np"})]
+        torch.Tensor, TensorShape("bnp", 3, "ps", "ps", dynamic_dims={"bnp"})
+    ]
     video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
 
     num_frames: torch.Tensor
@@ -162,23 +187,24 @@ class KeyeVL1_5VideoEmbeddingInputs(TensorSchema):
         - nv: Number of videos
         - g: Grid dimensions (3 for t, h, w)
     """
+
     type: Literal["video_embeds"]
     video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
     video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
     num_frames: torch.Tensor
 
 
-KeyeVL1_5VideoInputs = Union[KeyeVL1_5VideoPixelInputs,
-                             KeyeVL1_5VideoEmbeddingInputs]
+KeyeVL1_5VideoInputs: TypeAlias = (
+    KeyeVL1_5VideoPixelInputs | KeyeVL1_5VideoEmbeddingInputs
+)
 
 
 class KeyeVL1_5Projector(nn.Module):
-
     def __init__(
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -186,9 +212,11 @@ class KeyeVL1_5Projector(nn.Module):
         self.vision_config = vision_config
         self.merge_kernel_size = (2, 2)
 
-        self.hidden_size = (self.vision_config.hidden_size *
-                            self.merge_kernel_size[0] *
-                            self.merge_kernel_size[1])
+        self.hidden_size = (
+            self.vision_config.hidden_size
+            * self.merge_kernel_size[0]
+            * self.merge_kernel_size[1]
+        )
 
         self.pre_norm = torch.nn.LayerNorm(self.hidden_size, eps=1e-05)
         self.act = GELUActivation()
@@ -210,15 +238,13 @@ class KeyeVL1_5Projector(nn.Module):
 
     def forward(
         self,
-        image_features: Union[torch.Tensor, tuple[torch.Tensor],
-                              list[torch.Tensor]],
+        image_features: torch.Tensor | tuple[torch.Tensor] | list[torch.Tensor],
         image_grid_thw: list[tuple[int, int, int]],
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         m1, m2 = self.merge_kernel_size
         if isinstance(image_features, (list, tuple)):
             processed_features = list()
-            for image_feature, image_grid in zip(image_features,
-                                                 image_grid_thw):
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
                 t, h, w = image_grid
                 image_feature = rearrange(
                     image_feature,
@@ -240,8 +266,7 @@ class KeyeVL1_5Projector(nn.Module):
         dims = image_features.shape[:-1]
         dim = image_features.shape[-1]
         image_features = image_features.view(np.prod(dims), dim)
-        hidden_states = self.pre_norm(image_features.view(
-            -1, self.hidden_size))
+        hidden_states = self.pre_norm(image_features.view(-1, self.hidden_size))
         hidden_states = self.linear_1(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_2(hidden_states)
@@ -250,24 +275,28 @@ class KeyeVL1_5Projector(nn.Module):
 
 
 class KeyeVL1_5ProcessingInfo(KeyeProcessingInfo):
-
     def get_max_frame_per_video(self) -> int:
         return 2048
 
-    def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(
+        self,
+    ) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 
 
-def _keye_field_config(hf_inputs: Mapping[str, torch.Tensor], ):
-    image_grid_thw = hf_inputs.get("image_grid_thw",
-                                   torch.empty((0, 3), dtype=torch.int64))
+def _keye_field_config(
+    hf_inputs: Mapping[str, torch.Tensor],
+):
+    image_grid_thw = hf_inputs.get(
+        "image_grid_thw", torch.empty((0, 3), dtype=torch.int64)
+    )
     image_grid_sizes = image_grid_thw.prod(-1)
 
-    video_grid_thw = hf_inputs.get("video_grid_thw",
-                                   torch.empty((0, 3), dtype=torch.int64))
+    video_grid_thw = hf_inputs.get(
+        "video_grid_thw", torch.empty((0, 3), dtype=torch.int64)
+    )
     video_grid_thw = split_thw(video_grid_thw)
-    num_frames = hf_inputs.get("num_frames",
-                               video_grid_thw[:, 0]).clone().tolist()
+    num_frames = hf_inputs.get("num_frames", video_grid_thw[:, 0]).clone().tolist()
 
     video_num_patches = get_num_patches(video_grid_thw, num_frames)
 
@@ -287,25 +316,23 @@ def _keye_field_config(hf_inputs: Mapping[str, torch.Tensor], ):
             else:
                 j += 1
     video_num_grids = torch.tensor(video_num_grids)
-    return dict(pixel_values=MultiModalFieldConfig.flat_from_sizes(
-        "image", image_grid_sizes),
-                image_embeds=MultiModalFieldConfig.flat_from_sizes(
-                    "image", image_grid_sizes),
-                image_grid_thw=MultiModalFieldConfig.batched("image"),
-                pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                    "video", video_num_patches),
-                video_embeds=MultiModalFieldConfig.flat_from_sizes(
-                    "video", video_num_patches),
-                video_grid_thw=MultiModalFieldConfig.flat_from_sizes(
-                    "video", video_num_grids),
-                num_frames=MultiModalFieldConfig.batched("video"))
+    return dict(
+        pixel_values=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
+        image_embeds=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
+        image_grid_thw=MultiModalFieldConfig.batched("image"),
+        pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
+            "video", video_num_patches
+        ),
+        video_embeds=MultiModalFieldConfig.flat_from_sizes("video", video_num_patches),
+        video_grid_thw=MultiModalFieldConfig.flat_from_sizes("video", video_num_grids),
+        num_frames=MultiModalFieldConfig.batched("video"),
+    )
 
 
 class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
-
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -322,7 +349,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -338,9 +365,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
         return super()._parse_video_data(data)
 
 
-class KeyeVL1_5MultiModalProcessor(
-        BaseMultiModalProcessor[KeyeVL1_5ProcessingInfo]):
-
+class KeyeVL1_5MultiModalProcessor(BaseMultiModalProcessor[KeyeVL1_5ProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         return KeyeVL1_5MultiModalDataParser()
 
@@ -351,8 +376,7 @@ class KeyeVL1_5MultiModalProcessor(
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()
         image_token_id = vocab[hf_processor.image_token]
@@ -361,44 +385,49 @@ class KeyeVL1_5MultiModalProcessor(
         merge_length = image_processor.merge_size**2
 
         out_mm_kwargs_data = out_mm_kwargs.get_data()
-        frame_types: list[torch.Tensor] = \
-            hf_processor_mm_kwargs.get("frame_types", None)
-        timestamps: list[torch.Tensor] = \
-            hf_processor_mm_kwargs.get("timestamps", None)
+        frame_types: list[torch.Tensor] = hf_processor_mm_kwargs.get(
+            "frame_types", None
+        )
+        timestamps: list[torch.Tensor] = hf_processor_mm_kwargs.get("timestamps", None)
         num_videos = mm_items.get_count("video", strict=False)
 
         if frame_types is None:
             frame_types = [None] * num_videos
-        assert len(frame_types) == num_videos, \
-            f"Number of frame_types={len(frame_types)} " \
+        assert len(frame_types) == num_videos, (
+            f"Number of frame_types={len(frame_types)} "
             f"doesn't equal to number of videos={num_videos}"
+        )
         if timestamps is None:
             timestamps = [None] * num_videos
-        assert len(timestamps) == num_videos, \
-            f"Number of timestamps={len(timestamps)} " \
+        assert len(timestamps) == num_videos, (
+            f"Number of timestamps={len(timestamps)} "
             f"doesn't equal to number of videos={num_videos}"
+        )
 
         video_grid_thw = out_mm_kwargs_data.get(
-            'video_grid_thw', torch.empty((0, 3), dtype=torch.int64))
+            "video_grid_thw", torch.empty((0, 3), dtype=torch.int64)
+        )
         num_frames = out_mm_kwargs_data.get(
-            'num_frames', torch.tensor([], dtype=torch.int64))
+            "num_frames", torch.tensor([], dtype=torch.int64)
+        )
 
-        assert len(num_frames) == num_videos, \
-            f"Size of num_frames={len(num_frames)} " \
+        assert len(num_frames) == num_videos, (
+            f"Size of num_frames={len(num_frames)} "
             f"doesn't equal to number of videos={num_videos}"
+        )
 
         video_grid_hws = split_thw(video_grid_thw)
         assert int(num_frames.sum().tolist()) == video_grid_hws.shape[0], (
             f"The first dimension of `video_grid_hws`={video_grid_hws.shape[0]}"
-            f"doesn't equal to num of frames.")
+            f"doesn't equal to num of frames."
+        )
 
-        cu_seqlens = torch.cumsum(torch.tensor([0] + num_frames.tolist()),
-                                  dim=-1)
+        cu_seqlens = torch.cumsum(torch.tensor([0] + num_frames.tolist()), dim=-1)
 
         def get_replacement_keye(item_idx: int, modality: str):
             """
             Args:
-                item_idx(int): The item index of modality to replace 
+                item_idx(int): The item index of modality to replace
                 modality(str): The modality
             """
             if modality == "image":
@@ -413,16 +442,15 @@ class KeyeVL1_5MultiModalProcessor(
                 video_timestamps = timestamps[item_idx]
                 video_frame_types = frame_types[item_idx]
                 grid_thw = video_grid_hws[
-                    cu_seqlens[item_idx]:cu_seqlens[item_idx + 1]]
+                    cu_seqlens[item_idx] : cu_seqlens[item_idx + 1]
+                ]
 
                 nframes = grid_thw.shape[0]
 
                 if video_timestamps is None:
                     video_timestamps = [""] * nframes
                 else:
-                    video_timestamps = [
-                        format(ts, ".1f") for ts in video_timestamps
-                    ]
+                    video_timestamps = [format(ts, ".1f") for ts in video_timestamps]
 
                 if video_frame_types is None:
                     video_frame_types = [0] * nframes
@@ -437,7 +465,8 @@ class KeyeVL1_5MultiModalProcessor(
                         placeholders.append(vocab[hf_processor.fast_end])
 
                 return PromptUpdateDetails.select_token_id(
-                    placeholders, embed_token_id=video_token_id)
+                    placeholders, embed_token_id=video_token_id
+                )
             else:
                 raise ValueError(f"Unsupported modality {modality}")
 
@@ -446,7 +475,8 @@ class KeyeVL1_5MultiModalProcessor(
                 modality=modality,
                 target=[placeholder[modality]],
                 replacement=partial(get_replacement_keye, modality=modality),
-            ) for modality in ("image", "video")
+            )
+            for modality in ("image", "video")
         ]
 
     def _get_mm_fields_config(
@@ -458,8 +488,8 @@ class KeyeVL1_5MultiModalProcessor(
 
 
 class KeyeVL1_5DummyInputsBuilder(
-        KeyeBaseDummyInputsBuilder[KeyeVL1_5ProcessingInfo]):
-    ...
+    KeyeBaseDummyInputsBuilder[KeyeVL1_5ProcessingInfo]
+): ...
 
 
 @MULTIMODAL_REGISTRY.register_processor(
@@ -467,42 +497,26 @@ class KeyeVL1_5DummyInputsBuilder(
     info=KeyeVL1_5ProcessingInfo,
     dummy_inputs=KeyeVL1_5DummyInputsBuilder,
 )
-class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
-                                        SupportsLoRA, SupportsPP):
-
-    def _build_projector(self,
-                         text_config: PretrainedConfig,
-                         vision_config: PretrainedConfig,
-                         quant_config: Optional[QuantizationConfig] = None,
-                         prefix: str = "") -> nn.Module:
-        return KeyeVL1_5Projector(text_config, vision_config, quant_config,
-                                  prefix)
+class KeyeVL1_5ForConditionalGeneration(
+    BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    def _build_projector(
+        self,
+        text_config: PretrainedConfig,
+        vision_config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return KeyeVL1_5Projector(text_config, vision_config, quant_config, prefix)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config: PretrainedConfig = vllm_config.model_config.hf_config
         self.merge_size = config.vision_config.spatial_merge_size
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
-    def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors,
-                                        expected_dim: int, name: str):
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == expected_dim:
-                return mm_input
-            elif mm_input.ndim == expected_dim + 1:
-                return mm_input.reshape(-1, *mm_input.shape[2:])
-            else:
-                raise ValueError(
-                    f"{name} should be {expected_dim}D or "
-                    f"batched {expected_dim}D tensor."
-                    f"Got ndim: {mm_input.ndim} (shape={mm_input.shape})")
-        else:
-            return torch.concat(mm_input)
-
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[KeyeVL1_5ImageInputs]:
+        self, **kwargs: object
+    ) -> KeyeVL1_5ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -511,11 +525,6 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, expected_dim=4, name="image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, expected_dim=2, name="image grid_thw")
-
             return KeyeVL1_5ImagePixelInputs(
                 type="pixel_values",
                 pixel_values=pixel_values,
@@ -523,11 +532,6 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, expected_dim=2, name="image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, expected_dim=2, name="image grid_thw")
-
             return KeyeVL1_5ImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
@@ -535,7 +539,8 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[KeyeVL1_5VideoInputs]:
+        self, **kwargs: object
+    ) -> KeyeVL1_5VideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -545,43 +550,31 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos,
-                expected_dim=4,
-                name="video pixel values",
-            )
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, expected_dim=2, name="video grid_thw")
-
-            num_frames = self._validate_and_reshape_mm_tensor(
-                num_frames, expected_dim=1, name="video num frames")
-
             return KeyeVL1_5VideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
                 video_grid_thw=video_grid_thw,
-                num_frames=num_frames)
+                num_frames=num_frames,
+            )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, expected_dim=2, name="video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, expected_dim=2, name="video grid_thw")
-
-            return KeyeVL1_5VideoEmbeddingInputs(type="video_embeds",
-                                                 video_embeds=video_embeds,
-                                                 video_grid_thw=video_grid_thw,
-                                                 num_frames=num_frames)
+            return KeyeVL1_5VideoEmbeddingInputs(
+                type="video_embeds",
+                video_embeds=video_embeds,
+                video_grid_thw=video_grid_thw,
+                num_frames=num_frames,
+            )
 
     def _process_video_input(
-            self,
-            video_input: KeyeVL1_5VideoInputs) -> tuple[torch.Tensor, ...]:
+        self, video_input: KeyeVL1_5VideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         video_type = video_input["type"]
         video_grid_thw = split_thw(video_input["video_grid_thw"])
         pixel_values_videos = video_input.get("pixel_values_videos", None)
 
-        video_embeds = self._process_video_embeds(video_type, video_grid_thw,
-                                                  pixel_values_videos)
+        video_embeds = self._process_video_embeds(
+            video_type, video_grid_thw, pixel_values_videos
+        )
         video_embeds = torch.concat(video_embeds, dim=0)
 
         num_frames = video_input["num_frames"].clone().tolist()
@@ -589,10 +582,11 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
         num_patches = get_num_patches(video_grid_thw, num_frames).tolist()
 
         patch_cu_seqlens = torch.cumsum(
-            torch.tensor([0] + num_patches).detach().clone(), dim=-1)
-        patch_cu_seqlens = torch.div(patch_cu_seqlens,
-                                     self.merge_size**2,
-                                     rounding_mode="floor")
+            torch.tensor([0] + num_patches).detach().clone(), dim=-1
+        )
+        patch_cu_seqlens = torch.div(
+            patch_cu_seqlens, self.merge_size**2, rounding_mode="floor"
+        )
 
         new_video_embeds = []
         for idx in range(patch_cu_seqlens.shape[0] - 1):
@@ -600,3 +594,133 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
             end = patch_cu_seqlens[idx + 1]
             new_video_embeds.append(video_embeds[start:end])
         return tuple(new_video_embeds)
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
+            video_grid_thw = video_grid_thw[0]
+
+        def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
+            """
+            Split grid_thw along the t dimension.
+
+            Args:
+                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].
+
+            Returns:
+                List of [1, h, w] rows, repeated t times for each original row.
+            """
+
+            if isinstance(grid_thw, list):
+                grid_thw = torch.tensor(grid_thw, dtype=torch.long)
+
+            if grid_thw.numel() == 0:
+                return []
+
+            t, hw = grid_thw[:, 0], grid_thw[:, 1:]
+            ones = torch.ones_like(hw[:, :1])  # [N,1]
+            out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0)
+            return out.tolist()
+
+        video_grid_thw = split_thw(video_grid_thw)
+
+        hf_config = self.config
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        image_nums = len(image_grid_thw)
+        frame_nums = len(video_grid_thw)
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_frames = image_nums, frame_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + frame_nums):
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_frames > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+
+            if ed_image < ed_video:
+                t, h, w = image_grid_thw[image_index]
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = video_grid_thw[video_index]
+                video_index += 1
+                remain_frames -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+
+        return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3675075a48f4dc145f9292e9d3975ad29fa0d18
--- /dev/null
+++ b/vllm/model_executor/models/kimi_linear.py
@@ -0,0 +1,663 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.kda import KimiDeltaAttention
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
+
+from .interfaces import HasInnerState, IsHybrid, MixtureOfExperts, SupportsPP
+from .utils import (
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class KimiMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class KimiMoE(nn.Module):
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        layer_idx: int = 0,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        intermediate_size = config.intermediate_size
+        moe_intermediate_size = config.moe_intermediate_size
+        num_experts = config.num_experts
+        moe_renormalize = config.moe_renormalize
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.num_shared_experts = config.num_shared_experts
+        self.layer_idx = layer_idx
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.gate.e_score_correction_bias = nn.Parameter(torch.empty(num_experts))
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=config.num_experts_per_token,
+            hidden_size=hidden_size,
+            intermediate_size=moe_intermediate_size,
+            reduce_results=False,
+            renormalize=moe_renormalize,
+            quant_config=quant_config,
+            use_grouped_topk=config.use_grouped_topk,
+            num_expert_group=config.num_expert_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func=config.moe_router_activation_func,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+        )
+
+        if self.num_shared_experts is not None:
+            intermediate_size = moe_intermediate_size * self.num_shared_experts
+            self.shared_experts = KimiMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_size)
+        if self.num_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = (
+            self.experts(hidden_states=hidden_states, router_logits=router_logits)
+            * self.routed_scaling_factor
+        )
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+
+class KimiMLAAttention(nn.Module):
+    """
+    Main reference: DeepseekV2 vllm Implementation
+    """
+
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        use_nope: bool = False,
+        rope_scaling: dict[str, Any] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.use_nope = use_nope
+        assert self.use_nope is True
+        assert self.q_lora_rank is None
+        assert rope_scaling is None
+        assert num_heads % tp_size == 0
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa",
+        )
+        self.q_proj = ColumnParallelLinear(
+            self.hidden_size,
+            self.num_heads * self.qk_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.q_proj",
+        )
+        self.kv_a_layernorm = RMSNorm(
+            self.kv_lora_rank,
+            eps=config.rms_norm_eps,
+        )
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        mla_modules = MLAModules(
+            kv_a_layernorm=self.kv_a_layernorm,
+            kv_b_proj=self.kv_b_proj,
+            rotary_emb=None,
+            o_proj=self.o_proj,
+            fused_qkv_a_proj=None,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
+            q_a_layernorm=None,
+            q_b_proj=None,
+            q_proj=self.q_proj,
+            indexer=None,
+            is_sparse=False,
+            topk_indices_buffer=None,
+        )
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
+            self.hidden_size,
+            self.num_local_heads,
+            self.scaling,
+            self.qk_nope_head_dim,
+            self.qk_rope_head_dim,
+            self.v_head_dim,
+            self.q_lora_rank,
+            self.kv_lora_rank,
+            mla_modules,
+            cache_config,
+            quant_config,
+            prefix,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+    ) -> None:
+        output[:] = self.mla_attn(positions, hidden_states)
+
+
+class KimiDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        layer_idx: int,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
+        model_config: ModelConfig | None = None,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.is_moe = config.is_moe
+
+        if config.is_kda_layer(layer_idx):
+            self.self_attn = KimiDeltaAttention(
+                layer_idx=layer_idx,
+                hidden_size=config.hidden_size,
+                quant_config=quant_config,
+                cache_config=cache_config,
+                model_config=config,
+                prefix=f"{prefix}.self_attn",
+            )
+        else:
+            self.self_attn = KimiMLAAttention(
+                layer_idx=layer_idx,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                quant_config=quant_config,
+                cache_config=cache_config,
+                model_config=model_config,
+                prefix=f"{prefix}.self_attn",
+                config=config,
+                qk_nope_head_dim=config.qk_nope_head_dim,
+                qk_rope_head_dim=config.qk_rope_head_dim,
+                v_head_dim=config.v_head_dim,
+                q_lora_rank=config.q_lora_rank,
+                kv_lora_rank=config.kv_lora_rank,
+                use_nope=config.mla_use_nope,
+            )
+
+        if (
+            self.is_moe
+            and config.num_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        ):
+            self.block_sparse_moe = KimiMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.block_sparse_moe",
+            )
+            self.mlp = self.block_sparse_moe
+        else:
+            self.mlp = KimiMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        attn_output = torch.empty_like(hidden_states)
+        self.self_attn(
+            hidden_states=hidden_states,
+            positions=positions,
+            output=attn_output,
+        )
+        hidden_states = attn_output
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+@support_torch_compile
+class KimiLinearModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_text_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        parallel_config = vllm_config.parallel_config
+        self.config = config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        extra_kwargs = {}
+
+        def get_layer(prefix: str):
+            layer_idx = int(prefix.rsplit(".", 1)[1])
+            return KimiDecoderLayer(
+                config,
+                layer_idx,
+                cache_config,
+                quant_config,
+                parallel_config,
+                model_config,
+                prefix,
+                **extra_kwargs,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            get_layer,
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        world_size = get_tensor_model_parallel_world_size()
+        assert config.num_attention_heads % world_size == 0, (
+            "num_attention_heads must be divisible by world_size"
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for _, layer in enumerate(self.layers[self.start_layer : self.end_layer]):
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class KimiLinearForCausalLM(
+    nn.Module, HasInnerState, SupportsPP, MixtureOfExperts, IsHybrid
+):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.model_config = vllm_config.model_config
+        self.vllm_config = vllm_config
+        self.config = self.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.quant_config = quant_config
+        self.model = KimiLinearModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                self.config.vocab_size,
+                self.config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        logit_scale = getattr(self.config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(
+            self.config.vocab_size, scale=logit_scale
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
+        )
+        return hidden_states
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]:
+        return MambaStateDtypeCalculator.kda_state_dtype(
+            vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype
+        )
+
+    @classmethod
+    def get_mamba_state_shape_from_config(
+        cls, vllm_config: "VllmConfig"
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        parallel_config = vllm_config.parallel_config
+        hf_config = vllm_config.model_config.hf_config
+        tp_size = parallel_config.tensor_parallel_size
+        num_spec = (
+            vllm_config.speculative_config.num_speculative_tokens
+            if vllm_config.speculative_config
+            else 0
+        )
+        return MambaStateShapeCalculator.kda_state_shape(
+            tp_size,
+            hf_config.linear_attn_config["num_heads"],
+            hf_config.linear_attn_config["head_dim"],
+            conv_kernel_size=hf_config.linear_attn_config["short_conv_kernel_size"],
+            num_spec=num_spec,
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.logits_processor(self.lm_head, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        if self.config.is_moe:
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="w1",
+                ckpt_down_proj_name="w2",
+                ckpt_up_proj_name="w3",
+                num_experts=self.config.num_experts,
+            )
+        else:
+            expert_params_mapping = []
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for args in weights:
+            name, loaded_weight = args[:2]
+            kwargs = args[2] if len(args) > 2 else {}
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for idx, (param_name, weight_name, expert_id, shard_id) in enumerate(
+                    expert_params_mapping
+                ):
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        expert_id=expert_id,
+                        shard_id=shard_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias")
+                        and name not in params_dict
+                        and not self.config.is_linear_attn
+                    ):  # noqa: E501
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, **kwargs)
+            loaded_params.add(name)
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: KimiLinearConfig, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+                return layer_idx + i
+    return None
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index 503627865c4a5076005f7f0658bf20309fc2608e..8167b82f323305a84475edd8cbd0edd26a956769 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -46,39 +46,50 @@ import copy
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import torch
 from torch import nn
-from transformers import BatchFeature
+from transformers import BatchFeature, DeepseekV2Config
 from transformers.activations import GELUActivation
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
+    ParallelLMHead,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model
-from vllm.model_executor.models.interfaces import (SupportsMultiModal,
-                                                   SupportsPP)
+from vllm.model_executor.models.interfaces import SupportsMultiModal, SupportsPP
 from vllm.model_executor.models.moonvit import MoonVitPretrainedModel
-from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix
@@ -93,33 +104,35 @@ class MaxImageTokenMeta:
 
 
 class KimiVLMultiModalProjector(nn.Module):
-
-    def __init__(self, config: KimiVLConfig, \
-                 use_data_parallel: bool = False, prefix: str = ""):
+    def __init__(
+        self, config: KimiVLConfig, use_data_parallel: bool = False, prefix: str = ""
+    ):
         super().__init__()
         self.use_data_parallel = use_data_parallel
 
-        self.hidden_size = (config.vision_config.hidden_size *
-                            config.vision_config.merge_kernel_size[0] *
-                            config.vision_config.merge_kernel_size[1])
-
-        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size,
-                                           eps=1e-5)
-        self.linear_1 = ReplicatedLinear(self.hidden_size,
-                                         self.hidden_size,
-                                         bias=True,
-                                         prefix=maybe_prefix(
-                                             prefix, "linear_1"))
-        self.linear_2 = ReplicatedLinear(self.hidden_size,
-                                         config.text_config.hidden_size,
-                                         bias=True,
-                                         prefix=maybe_prefix(
-                                             prefix, "linear_2"))
+        self.hidden_size = (
+            config.vision_config.hidden_size
+            * config.vision_config.merge_kernel_size[0]
+            * config.vision_config.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5)
+        self.linear_1 = ReplicatedLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            prefix=maybe_prefix(prefix, "linear_1"),
+        )
+        self.linear_2 = ReplicatedLinear(
+            self.hidden_size,
+            config.text_config.hidden_size,
+            bias=True,
+            prefix=maybe_prefix(prefix, "linear_2"),
+        )
         self.act = GELUActivation()
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.pre_norm(image_features).view(
-            -1, self.hidden_size)
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
         hidden_states, _ = self.linear_1(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states, _ = self.linear_2(hidden_states)
@@ -134,10 +147,11 @@ class KimiVLImagePixelInputs(TensorSchema):
         - ps: Patch size
         - ni: Number of images
     """
+
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("np", 3, "ps", "ps"),
     ]
 
@@ -150,11 +164,10 @@ KimiVLImageInputs = KimiVLImagePixelInputs
 
 
 class KimiVLProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(KimiVLConfig)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -169,25 +182,25 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
         in_token_limit = hf_processor.image_processor.in_token_limit
         height = image_height
         width = image_width
-        assert isinstance(height,
-                          int), f"height must be int, current height {height}"
-        assert isinstance(width,
-                          int), f"width must be int, current width {width}"
+        assert isinstance(height, int), f"height must be int, current height {height}"
+        assert isinstance(width, int), f"width must be int, current width {width}"
         assert kernel_size is not None, "kernel_size must be specified"
 
         if (width // patch_size) * (height // patch_size) > in_token_limit:
-            scale = math.sqrt(in_token_limit / ((width // patch_size) *
-                                                (height // patch_size)))
+            scale = math.sqrt(
+                in_token_limit / ((width // patch_size) * (height // patch_size))
+            )
             new_w, new_h = int(width * scale), int(height * scale)
             width, height = new_w, new_h
 
         kernel_height, kernel_width = kernel_size
 
-        pad_height = (kernel_height * patch_size - height %
-                      (kernel_height * patch_size)) % (kernel_height *
-                                                       patch_size)
-        pad_width = (kernel_width * patch_size - width %
-                     (kernel_width * patch_size)) % (kernel_width * patch_size)
+        pad_height = (
+            kernel_height * patch_size - height % (kernel_height * patch_size)
+        ) % (kernel_height * patch_size)
+        pad_width = (
+            kernel_width * patch_size - width % (kernel_width * patch_size)
+        ) % (kernel_width * patch_size)
 
         # Calculate new dimensions after padding and patching
         token_height = (height + pad_height) // (kernel_size[0] * patch_size)
@@ -200,7 +213,6 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
 
 
 class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -213,19 +225,23 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=MaxImageTokenMeta.width,
-                                   height=MaxImageTokenMeta.height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=MaxImageTokenMeta.width,
+                height=MaxImageTokenMeta.height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -238,7 +254,8 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
         # image_grid_hws is shapes for each subtensor in pixel_values
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_grid_sizes),
+                "image", image_grid_sizes
+            ),
             image_grid_hws=MultiModalFieldConfig.batched("image"),
         )
 
@@ -252,7 +269,8 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
 
         def get_replacement(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -274,16 +292,18 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(KimiVLMultiModalProcessor,
-                                        info=KimiVLProcessingInfo,
-                                        dummy_inputs=KimiVLDummyInputsBuilder)
-class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                     SupportsPP):
+@MULTIMODAL_REGISTRY.register_processor(
+    KimiVLMultiModalProcessor,
+    info=KimiVLProcessingInfo,
+    dummy_inputs=KimiVLDummyInputsBuilder,
+)
+class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
 
@@ -301,62 +321,49 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
         quant_config = vllm_config.quant_config
 
         assert isinstance(config.vision_config, MoonViTConfig)
-        self.use_data_parallel = model_config.multimodal_config.mm_encoder_tp_mode == "data"
+        self.use_data_parallel = (
+            model_config.multimodal_config.mm_encoder_tp_mode == "data"
+        )
         self.hidden_size = config.text_config.hidden_size
-        self.vision_tower = MoonVitPretrainedModel(config.vision_config,
-                                                   self.use_data_parallel,
-                                                   prefix=maybe_prefix(
-                                                       prefix, "vision_tower"))
+        self.vision_tower = MoonVitPretrainedModel(
+            config.vision_config,
+            self.use_data_parallel,
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
 
         self.multi_modal_projector = KimiVLMultiModalProjector(
             config=config,
             use_data_parallel=self.use_data_parallel,
-            prefix=maybe_prefix(prefix, "multi_modal_projector"))
+            prefix=maybe_prefix(prefix, "multi_modal_projector"),
+        )
 
         self.quant_config = quant_config
         sub_vllm_config = copy.deepcopy(vllm_config)
-        sub_vllm_config.model_config.hf_config = sub_vllm_config.model_config.hf_config.text_config
+        sub_vllm_config.model_config.hf_config = (
+            sub_vllm_config.model_config.hf_config.text_config
+        )
         self.language_model = DeepseekV2Model(
             vllm_config=sub_vllm_config,
             prefix=maybe_prefix(prefix, "language_model"),
         )
-        self.unpadded_vocab_size = config.text_config.vocab_size
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.text_config.hidden_size,
-                org_num_embeddings=self.config.text_config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
         else:
             self.lm_head = PPMissingLayer()
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size, logit_scale)
+        self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
         self.media_placeholder: int = self.config.media_placeholder_token_id
 
-    # ref: qwen2_vl.py
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return mm_input.reshape(-1, mm_input.shape[-1])
-        else:
-            return torch.concat(mm_input)
-
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[KimiVLImageInputs]:
+        self, **kwargs: object
+    ) -> KimiVLImageInputs | None:
         # image input type must be pixel values now
         pixel_values = kwargs.pop("pixel_values", None)
         image_grid_hws = kwargs.pop("image_grid_hws", None)
@@ -364,21 +371,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
         if pixel_values is None:
             return None
 
-        image_grid_hws = self._validate_and_reshape_mm_tensor(
-            image_grid_hws, "image grid hws")
-        # pixel_values may have complex shapes
-        num_channels = 3
-        patch_size = self.config.vision_config.patch_size
-        if isinstance(pixel_values, list):
-            pixel_values = torch.cat([
-                x.reshape(-1, num_channels, patch_size, patch_size)
-                for x in pixel_values
-            ])
-        else:
-            pixel_values = pixel_values.reshape(-1, num_channels, patch_size,
-                                                patch_size)
-        pixel_values = pixel_values.to(self.vision_tower.dtype)
-
         return KimiVLImagePixelInputs(
             type="pixel_values",
             pixel_values=pixel_values,
@@ -387,34 +379,32 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
     # perform vt on processored pixel_values
     @torch.inference_mode()
-    def _process_image_pixels(self,
-                              inputs: KimiVLImagePixelInputs) -> torch.Tensor:
+    def _process_image_pixels(self, inputs: KimiVLImagePixelInputs) -> torch.Tensor:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
         image_grid_hws = inputs["image_grid_hws"]
         if self.use_data_parallel:
-            return run_dp_sharded_mrope_vision_model(self.vision_tower,
-                                                     pixel_values,
-                                                     image_grid_hws.tolist(),
-                                                     rope_type="rope_2d")
+            return run_dp_sharded_mrope_vision_model(
+                self.vision_tower,
+                pixel_values,
+                image_grid_hws.tolist(),
+                rope_type="rope_2d",
+            )
         else:
             return self.vision_tower(pixel_values, image_grid_hws)
 
-    def _process_image_input(self,
-                             image_input: KimiVLImageInputs) -> torch.Tensor:
+    def _process_image_input(self, image_input: KimiVLImageInputs) -> torch.Tensor:
         assert image_input["type"] == "pixel_values"
         image_features = self._process_image_pixels(image_input)
         assert isinstance(image_features, (list, tuple))
         lengths = [x.shape[0] for x in image_features]
-        return self.multi_modal_projector(
-            torch.cat(image_features)).split(lengths)
+        return self.multi_modal_projector(torch.cat(image_features)).split(lengths)
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> Optional[NestedTensors]:
+    def embed_multimodal(self, **kwargs: object) -> NestedTensors | None:
         # Validate the multimodal input keyword arguments
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
@@ -424,54 +414,16 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[NestedTensors] = None,
-    ) -> torch.Tensor:
-
-        # `get_input_embeddings` should already be implemented for the language
-        # model as one of the requirements of basic vLLM model implementation.
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is not None and len(
-                multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                multimodal_embeddings=multimodal_embeddings,
-                placeholder_token_id=self.config.media_placeholder_token_id)
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is None:
-                inputs_embeds = None
-            else:
-                inputs_embeds = self.get_input_embeddings(input_ids)
-                image_embeds = self._process_image_input(image_input)
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids,
-                    inputs_embeds,
-                    image_embeds,
-                    placeholder_token_id=self.config.
-                    media_placeholder_token_id,
-                )
-                input_ids = None
 
         hidden_states = self.language_model(
             input_ids=input_ids,
@@ -482,8 +434,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       **kwargs) -> torch.Tensor:
+    def compute_logits(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states, **kwargs)
         return logits
 
@@ -499,7 +450,11 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
             (".gate_up_proj", ".gate_proj", 0),
             (".gate_up_proj", ".up_proj", 1),
         ]
-        if not config.use_mla:
+        use_mha = (
+            config.model_type == "deepseek"
+            or config.qk_nope_head_dim + config.qk_rope_head_dim == 0
+        )
+        if use_mha:
             stacked_params_mapping += [
                 (".qkv_proj", ".q_proj", "q"),
                 (".qkv_proj", ".k_proj", "k"),
@@ -512,7 +467,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="up_proj",
-                num_experts=config.n_routed_experts)
+                num_experts=config.n_routed_experts,
+            )
         else:
             expert_params_mapping = []
 
@@ -528,8 +484,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
             if spec_layer is not None:
                 continue  # skip spec decode layers for main model
 
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
@@ -543,8 +498,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
                     # not vision model for now.
                     use_default_weight_loading = True
             else:
-                for (param_name, weight_name,
-                     shard_id) in stacked_params_mapping:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
                     if weight_name not in name:
                         continue
                     # We have mlp.experts[0].gate_proj in the checkpoint.
@@ -553,7 +507,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
                     # name will be updated to mlp.experts[0].gate_up_proj, which
                     # will then be updated below in expert_params_mapping
                     # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                    if (("mlp.experts." in name) and name not in params_dict):
+                    if ("mlp.experts." in name) and name not in params_dict:
                         continue
                     name = name.replace(weight_name, param_name)
                     # Skip loading extra bias for GPTQ models.
@@ -568,8 +522,12 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
                     weight_loader(param, loaded_weight, shard_id, **kwargs)
                     break
                 else:
-                    for idx, (param_name, weight_name, expert_id,
-                              shard_id) in enumerate(expert_params_mapping):
+                    for idx, (
+                        param_name,
+                        weight_name,
+                        expert_id,
+                        shard_id,
+                    ) in enumerate(expert_params_mapping):
                         if weight_name not in name:
                             continue
                         name = name.replace(weight_name, param_name)
@@ -579,12 +537,14 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
                         param = params_dict[name]
                         weight_loader = param.weight_loader
-                        weight_loader(param,
-                                      loaded_weight,
-                                      name,
-                                      expert_id=expert_id,
-                                      shard_id=shard_id,
-                                      **kwargs)
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            expert_id=expert_id,
+                            shard_id=shard_id,
+                            **kwargs,
+                        )
                         break
                     else:
                         use_default_weight_loading = True
@@ -601,18 +561,18 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight, **kwargs)
 
 
-def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config,
-                                        weight_name: str) -> Optional[int]:
-    if hasattr(config,
-               "num_nextn_predict_layers") and (config.num_nextn_predict_layers
-                                                > 0):
+def get_spec_layer_idx_from_weight_name(
+    config: DeepseekV2Config, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"model.layers.{layer_idx+i}."):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
                 return layer_idx + i
     return None
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 53c36e4e52d817382311903efb91cb7d6ef4a45f..aeb25602f11a405a459e7ec5d1ac7ca9ecf95d96 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -14,38 +14,47 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.mamba.short_conv import ShortConv
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, SupportsQuant
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class Lfm2MLP(nn.Module):
-
     def __init__(
         self,
         dim: int,
         ff_dim: int,
         multiple_of: int,
         auto_adjust_ff_dim: bool,
-        ffn_dim_multiplier: Optional[float],
-        quant_config: Optional[QuantizationConfig] = None,
+        ffn_dim_multiplier: float | None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -61,14 +70,14 @@ class Lfm2MLP(nn.Module):
             output_sizes=[ff_dim] * 2,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj",
+            prefix=f"{prefix}.w1",
         )
         self.w2 = RowParallelLinear(
             input_size=ff_dim,
             output_size=dim,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.down_proj",
+            prefix=f"{prefix}.w2",
         )
         self.act_fn = SiluAndMul()
 
@@ -80,7 +89,6 @@ class Lfm2MLP(nn.Module):
 
 
 class Lfm2Attention(nn.Module):
-
     def __init__(
         self,
         config: Lfm2Config,
@@ -89,10 +97,10 @@ class Lfm2Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -177,14 +185,13 @@ class Lfm2Attention(nn.Module):
 
 
 class Lfm2AttentionDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Lfm2Config,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -195,11 +202,12 @@ class Lfm2AttentionDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
 
         self.self_attn = Lfm2Attention(
             config=config,
@@ -231,30 +239,27 @@ class Lfm2AttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.operator_norm(hidden_states)
         else:
-            hidden_states, residual = self.operator_norm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
+            hidden_states, residual = self.operator_norm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
         hidden_states, residual = self.ffn_norm(hidden_states, residual)
         return self.feed_forward(hidden_states), residual
 
 
 class Lfm2ShortConvDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Lfm2Config,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -283,15 +288,14 @@ class Lfm2ShortConvDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
             residual = hidden_states
             hidden_states = self.operator_norm(hidden_states)
         else:
-            hidden_states, residual = self.operator_norm(
-                hidden_states, residual)
+            hidden_states, residual = self.operator_norm(hidden_states, residual)
         output = torch.empty_like(hidden_states)
         self.conv(
             hidden_states,
@@ -304,7 +308,6 @@ class Lfm2ShortConvDecoderLayer(nn.Module):
 
 @support_torch_compile
 class Lfm2Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -312,24 +315,21 @@ class Lfm2Model(nn.Module):
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
-            self.vocab_size,
-            config.hidden_size,
-            org_num_embeddings=config.vocab_size)
+            self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size
+        )
 
         def get_layer(prefix: str):
             layer_idx = extract_layer_index(prefix)
             is_attn = self.config.layer_types[layer_idx] == "full_attention"
-            layer_class = (Lfm2AttentionDecoderLayer
-                           if is_attn else Lfm2ShortConvDecoderLayer)
+            layer_class = (
+                Lfm2AttentionDecoderLayer if is_attn else Lfm2ShortConvDecoderLayer
+            )
             return layer_class(
                 config,
                 layer_idx,
@@ -340,32 +340,32 @@ class Lfm2Model(nn.Module):
             )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
         if get_pp_group().is_last_rank:
-            self.embedding_norm = RMSNorm(config.hidden_size,
-                                          eps=config.norm_eps)
+            self.embedding_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
         else:
             self.embedding_norm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -379,15 +379,13 @@ class Lfm2Model(nn.Module):
                 residual=residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.embedding_norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             (".qkv_proj", ".q_proj", "q"),
             (".qkv_proj", ".k_proj", "k"),
@@ -398,7 +396,6 @@ class Lfm2Model(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -414,15 +411,15 @@ class Lfm2Model(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                      IsHybrid, SupportsQuant):
+class Lfm2ForCausalLM(
+    nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid, SupportsQuant
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -447,7 +444,6 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, ...]:
-
         return MambaStateDtypeCalculator.short_conv_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -458,7 +454,7 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[tuple[int, int]]:
-        """ Calculate shapes for LFM2's convolutional cache.
+        """Calculate shapes for LFM2's convolutional cache.
 
         Args:
             vllm_config: vLLM config
@@ -480,35 +476,21 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
-        scheduler_config = vllm_config.scheduler_config
-        assert (not cache_config.enable_prefix_caching
-                ), "Lfm2 currently does not support prefix caching"
+
+        assert not cache_config.enable_prefix_caching, (
+            "Lfm2 currently does not support prefix caching"
+        )
 
         super().__init__()
         self.config = config
-        self.vllm_config = vllm_config
-        self.scheduler_config = scheduler_config
-        self.model_config = vllm_config.model_config
-
-        self.model = Lfm2Model(vllm_config=vllm_config,
-                               prefix=maybe_prefix(prefix, "model"))
+        self.model = Lfm2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = self.config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
-
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config else
-                    lora_config.lora_vocab_padding_size),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -516,33 +498,35 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         else:
             self.lm_head = PPMissingLayer()
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b7b5564ee9898030c5a89f079ed11f12bd580fa
--- /dev/null
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -0,0 +1,762 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+from itertools import islice
+from typing import Any
+
+import torch
+import torch.nn as nn
+
+from vllm.attention import Attention
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.mamba.short_conv import ShortConv
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import Lfm2MoeConfig
+
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    MixtureOfExperts,
+    SupportsLoRA,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+
+class Lfm2MoeMlp(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        ff_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.w1 = MergedColumnParallelLinear(
+            input_size=dim,
+            output_sizes=[ff_dim] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.w1",
+        )
+        self.w2 = RowParallelLinear(
+            input_size=ff_dim,
+            output_size=dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.w2",
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.w1(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.w2(x)
+        return x
+
+
+class Lfm2MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.num_experts
+
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}."
+            )
+
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+
+        self.n_logical_experts = self.n_routed_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate",
+        )
+        if config.use_expert_bias:
+            self.gate.e_score_correction_bias = nn.Parameter(
+                torch.empty(self.n_routed_experts, dtype=torch.float32)
+            )
+        else:
+            self.gate.e_score_correction_bias = None
+
+        self.experts = FusedMoE(
+            num_experts=self.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,  # needed for softmax score func
+            num_expert_group=1,
+            topk_group=1,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            scoring_func="sigmoid",
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = (
+            self.experts(hidden_states=hidden_states, router_logits=router_logits)
+            * self.routed_scaling_factor
+        )
+
+        if self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(orig_shape)
+
+
+class Lfm2MoeAttention(nn.Module):
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_idx: int,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = hidden_size
+        self.num_kv_heads = num_kv_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=True,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.q_layernorm = RMSNorm(self.head_dim, eps=config.norm_eps)
+        self.k_layernorm = RMSNorm(self.head_dim, eps=config.norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        n_tokens, _ = hidden_states.shape
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = q.view(n_tokens, self.num_heads, self.head_dim).contiguous()
+        k = k.view(n_tokens, self.num_kv_heads, self.head_dim).contiguous()
+        q = self.q_layernorm(q)
+        k = self.k_layernorm(k)
+        q, k = self.rotary_emb(positions, q, k)
+        q = q.view(n_tokens, self.num_heads * self.head_dim)
+        k = k.view(n_tokens, self.num_kv_heads * self.head_dim)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class Lfm2MoeAttentionDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        super().__init__()
+        self.prefix = prefix
+        self.config = config
+        self.layer_idx = layer_idx
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        self.self_attn = Lfm2MoeAttention(
+            config=config,
+            layer_idx=layer_idx,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        if layer_idx < config.num_dense_layers:
+            self.feed_forward = Lfm2MoeMlp(
+                dim=config.hidden_size,
+                ff_dim=config.intermediate_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.feed_forward",
+            )
+        else:
+            self.feed_forward = Lfm2MoeSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.feed_forward",
+                enable_eplb=enable_eplb,
+            )
+
+        self.operator_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.operator_norm(hidden_states)
+        else:
+            hidden_states, residual = self.operator_norm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
+        hidden_states, residual = self.ffn_norm(hidden_states, residual)
+        return self.feed_forward(hidden_states), residual
+
+
+class Lfm2MoeShortConvDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.conv = ShortConv(
+            config=config,
+            dim=config.hidden_size,
+            layer_idx=layer_idx,
+            model_config=model_config,
+            cache_config=cache_config,
+            prefix=f"{prefix}.conv",
+        )
+
+        if layer_idx < config.num_dense_layers:
+            self.feed_forward = Lfm2MoeMlp(
+                dim=config.hidden_size,
+                ff_dim=config.intermediate_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.feed_forward",
+            )
+        else:
+            self.feed_forward = Lfm2MoeSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.feed_forward",
+                enable_eplb=enable_eplb,
+            )
+
+        self.operator_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.operator_norm(hidden_states)
+        else:
+            hidden_states, residual = self.operator_norm(hidden_states, residual)
+        output = torch.empty_like(hidden_states)
+        self.conv(
+            hidden_states,
+            output,
+        )
+        hidden_states, residual = self.ffn_norm(output, residual)
+        hidden_states = self.feed_forward(hidden_states)
+        return hidden_states, residual
+
+
+@support_torch_compile
+class Lfm2MoeModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        parallel_config = vllm_config.parallel_config
+        enable_eplb = parallel_config.enable_eplb
+        eplb_config = parallel_config.eplb_config
+        self.num_redundant_experts = eplb_config.num_redundant_experts
+
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size
+        )
+
+        def get_layer(prefix: str):
+            layer_idx = extract_layer_index(prefix)
+            is_attn = self.config.layer_types[layer_idx] == "full_attention"
+            layer_class = (
+                Lfm2MoeAttentionDecoderLayer
+                if is_attn
+                else Lfm2MoeShortConvDecoderLayer
+            )
+            return layer_class(
+                config,
+                layer_idx,
+                model_config,
+                cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                enable_eplb=enable_eplb,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+        if get_pp_group().is_last_rank:
+            self.embedding_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        else:
+            self.embedding_norm = PPMissingLayer()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.embedding_norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".w1", ".w1", 0),
+            (".w1", ".w3", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+        for name, loaded_weight in weights:
+            if "expert_bias" in name:
+                name = name.replace("expert_bias", "gate.e_score_correction_bias")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+
+                if ("feed_forward.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+
+                    if weight_name not in name:
+                        continue
+
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Lfm2MoeForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsQuant,
+    MixtureOfExperts,
+):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "w1": [
+            "w1",
+            "w3",
+        ],
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[torch.dtype, ...]:
+        return MambaStateDtypeCalculator.short_conv_state_dtype(
+            vllm_config.model_config.dtype,
+            vllm_config.cache_config.mamba_cache_dtype,
+        )
+
+    @classmethod
+    def get_mamba_state_shape_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[tuple[int, int]]:
+        """Calculate shapes for LFM2's convolutional cache.
+
+        Args:
+            vllm_config: vLLM config
+
+        Returns:
+            Tuple containing:
+            - conv_state_shape: Shape for convolutional state cache
+        """
+        parallel_config = vllm_config.parallel_config
+        hf_config = vllm_config.model_config.hf_config
+
+        return MambaStateShapeCalculator.short_conv_state_shape(
+            tp_world_size=parallel_config.tensor_parallel_size,
+            intermediate_size=hf_config.hidden_size,
+            conv_kernel=hf_config.conv_L_cache,
+        )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        cache_config = vllm_config.cache_config
+
+        assert not cache_config.enable_prefix_caching, (
+            "Lfm2Moe currently does not support prefix caching"
+        )
+
+        super().__init__()
+        self.config = config
+        self.model = Lfm2MoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+        # Set MoE hyperparameters
+        self.expert_weights = []
+
+        self.moe_layers = []
+        example_layer = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(
+                layer, (Lfm2MoeAttentionDecoderLayer, Lfm2MoeShortConvDecoderLayer)
+            )
+            if isinstance(layer.feed_forward, Lfm2MoeSparseMoeBlock):
+                example_layer = layer.feed_forward
+                self.moe_layers.append(layer.feed_forward.experts)
+
+        if example_layer is None:
+            raise RuntimeError(
+                "No Lfm2MoeSparseMoeBlock layer found in the model.layers."
+            )
+
+        self.num_moe_layers = len(self.moe_layers)
+        self.num_expert_groups = 1
+        self.num_shared_experts = 0
+        self.num_logical_experts = example_layer.n_logical_experts
+        self.num_physical_experts = example_layer.n_physical_experts
+        self.num_local_physical_experts = example_layer.n_local_physical_experts
+        self.num_routed_experts = example_layer.n_routed_experts
+        self.num_redundant_experts = example_layer.n_redundant_experts
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.feed_forward, Lfm2MoeSparseMoeBlock):
+                moe = layer.feed_forward
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
diff --git a/vllm/model_executor/models/lightonocr.py b/vllm/model_executor/models/lightonocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..9839e4f8f707e906b084b2cb41f6b05479b10fc8
--- /dev/null
+++ b/vllm/model_executor/models/lightonocr.py
@@ -0,0 +1,195 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable, Mapping, Sequence
+from typing import TypeVar
+
+import torch
+import torch.nn as nn
+from transformers import (
+    BatchFeature,
+    PixtralVisionConfig,
+)
+
+from vllm.config import VllmConfig
+from vllm.model_executor.models.mistral3 import (
+    Mistral3DummyInputsBuilder,
+    Mistral3ForConditionalGeneration,
+    Mistral3MultiModalProjector,
+    Mistral3ProcessingInfo,
+    _build_mistral3_info,
+    init_vision_tower_for_llava,
+)
+from vllm.model_executor.models.pixtral import PixtralHFEncoderInfo
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.cache import BaseMultiModalProcessorCache
+from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+
+_I = TypeVar("_I", bound=Mistral3ProcessingInfo)
+
+
+class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo]):
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        # NOTE: LightOnOCR does not use break/end tokens, so we remove them here.
+        input_ids = processed_outputs.get("input_ids")
+        if input_ids is not None:
+            processor = self.info.get_hf_processor()
+            tokenizer = self.info.get_tokenizer()
+            vocab = tokenizer.get_vocab()
+
+            break_id = vocab.get(processor.image_break_token)
+            end_id = vocab.get(processor.image_end_token)
+
+            # create mask to remove break/end tokens
+            keep_mask = ~torch.isin(
+                input_ids,
+                torch.tensor([break_id, end_id]),
+            )
+
+            processed_outputs["input_ids"] = input_ids[keep_mask].unsqueeze(0)
+            if "attention_mask" in processed_outputs:
+                processed_outputs["attention_mask"] = processed_outputs[
+                    "attention_mask"
+                ][keep_mask].unsqueeze(0)
+
+        # un-pad pixel_values per-image so caches remain independent.
+        pixel_values = processed_outputs.get("pixel_values")
+        if pixel_values is not None:
+            image_sizes = processed_outputs["image_sizes"]
+            assert len(pixel_values) == len(image_sizes)
+            processed_outputs["pixel_values"] = [
+                p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes)
+            ]
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_config = self.info.get_hf_config()
+        image_token_id = hf_config.image_token_index
+
+        assert isinstance(hf_config.vision_config, PixtralVisionConfig)
+        encoder_info = PixtralHFEncoderInfo(hf_config)
+
+        def replace(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            size = images.get_image_size(item_idx)
+            ncols, nrows = encoder_info.get_patch_grid_size(
+                image_width=size.width, image_height=size.height
+            )
+            # break/end tokens are not used in LightOnOCR
+            tokens = [image_token_id] * (ncols * nrows)
+            return PromptUpdateDetails.select_token_id(tokens, image_token_id)
+
+        return [
+            PromptReplacement(
+                modality="image", target=[image_token_id], replacement=replace
+            )
+        ]
+
+
+def _build_LightOnOCR_processor(
+    info: _I,
+    dummy_inputs: BaseDummyInputsBuilder[_I],
+    *,
+    cache: BaseMultiModalProcessorCache | None = None,
+):
+    assert isinstance(info, Mistral3ProcessingInfo)
+    return LightOnOCRMultiModalProcessor(info, dummy_inputs, cache=cache)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    _build_LightOnOCR_processor,
+    info=_build_mistral3_info,
+    dummy_inputs=Mistral3DummyInputsBuilder,
+)
+class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.vision_encoder.": "vision_tower.",
+            "model.vision_projection.": "multi_modal_projector.",
+            "lm_head.": "language_model.lm_head.",
+            "model.language_model.": "language_model.model.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        nn.Module.__init__(self)
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.vision_tower = init_vision_tower_for_llava(
+            config,
+            quant_config,
+            require_post_norm=False,
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
+
+        self.multi_modal_projector = Mistral3MultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            projector_hidden_act=config.projector_hidden_act,
+            spatial_merge_size=config.spatial_merge_size,
+            patch_size=config.vision_config.patch_size,
+            multimodal_projector_bias=config.multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "multi_modal_projector"),
+        )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index c7dd134ea47e9c49facc39cbdfb2063ea5fead82..0a3f37c30ab5f15ff9fdd176aa51b07f410c9896 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -23,9 +23,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -38,33 +39,44 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class LlamaMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -89,8 +101,9 @@ class LlamaMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -101,7 +114,6 @@ class LlamaMLP(nn.Module):
 
 
 class LlamaAttention(nn.Module):
-
     def __init__(
         self,
         config: LlamaConfig,
@@ -109,12 +121,12 @@ class LlamaAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -141,14 +153,21 @@ class LlamaAttention(nn.Module):
             head_dim = self.hidden_size // self.total_num_heads
         self.head_dim = head_dim
         # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
-                                             1)
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
+        llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
+        self.do_llama_4_scaling = llama_4_scaling_config is not None
+        if self.do_llama_4_scaling:
+            self.llama_4_scaling_original_max_position_embeddings = (
+                llama_4_scaling_config["original_max_position_embeddings"]
+            )
+            self.llama_4_scaling_beta = llama_4_scaling_config["beta"]
+
         self.qkv_proj = QKVParallelLinear(
             hidden_size=hidden_size,
             head_size=self.head_dim,
@@ -167,33 +186,36 @@ class LlamaAttention(nn.Module):
             prefix=f"{prefix}.o_proj",
         )
 
-        self._init_rotary_emb(config,
-                              rope_scaling=rope_scaling,
-                              quant_config=quant_config)
+        self._init_rotary_emb(
+            config, rope_scaling=rope_scaling, quant_config=quant_config
+        )
 
         sliding_window = None
         if layer_types := getattr(config, "layer_types", None):
             # Fix for Eagle3 compatibility:
             # for draft models, subtract target layer count
             # to get draft-relative layer index starting from 0
-            if hasattr(config, 'target_layer_count'):
+            if hasattr(config, "target_layer_count"):
                 # This is a draft model,
                 # adjust layer_idx to be relative to draft layers
                 effective_layer_idx = layer_idx - config.target_layer_count
             else:
                 # This is a target model, use layer_idx directly
                 effective_layer_idx = layer_idx
-            assert effective_layer_idx < len(layer_types), \
+            assert effective_layer_idx < len(layer_types), (
                 f"effective_layer_idx: {effective_layer_idx} \
                 is out of bounds for layer_types: {layer_types}"
+            )
 
-            is_sliding = layer_types[
-                effective_layer_idx] == "sliding_attention"
+            is_sliding = layer_types[effective_layer_idx] == "sliding_attention"
             if is_sliding:
                 sliding_window = config.sliding_window
 
-        attn_cls = (EncoderOnlyAttention
-                    if attn_type == AttentionType.ENCODER_ONLY else Attention)
+        attn_cls = (
+            EncoderOnlyAttention
+            if attn_type == AttentionType.ENCODER_ONLY
+            else Attention
+        )
 
         self.attn = attn_cls(
             self.num_heads,
@@ -207,6 +229,17 @@ class LlamaAttention(nn.Module):
             prefix=f"{prefix}.attn",
         )
 
+    def _get_llama_4_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
+        # Llama4 scaling
+        scaling = 1 + self.llama_4_scaling_beta * torch.log(
+            1
+            + torch.floor(
+                positions / self.llama_4_scaling_original_max_position_embeddings
+            )
+        )
+        # Broadcast over head_dim
+        return scaling.unsqueeze(-1)
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -215,13 +248,19 @@ class LlamaAttention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
+        if self.do_llama_4_scaling:
+            attn_scale = self._get_llama_4_attn_scale(positions)
+            q = (q * attn_scale).to(q.dtype)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
         return output
 
-    def _init_rotary_emb(self, config: LlamaConfig,
-                         rope_scaling: Optional[dict[str, Any]],
-                         quant_config: Optional[QuantizationConfig]) -> None:
+    def _init_rotary_emb(
+        self,
+        config: LlamaConfig,
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
+    ) -> None:
         is_neox_style = True
         is_gguf = quant_config and quant_config.get_name() == "gguf"
         if is_gguf and config.model_type == "llama":
@@ -239,33 +278,36 @@ class LlamaAttention(nn.Module):
 
 
 class LlamaDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 config: Optional[LlamaConfig] = None) -> None:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        config: LlamaConfig | None = None,
+    ) -> None:
         super().__init__()
 
         config = config or vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
+        quant_config = self.get_quant_config(vllm_config)
 
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         bias_o_proj = attention_bias
         # support internlm/internlm3-8b with qkv_bias
-        if hasattr(config, 'qkv_bias'):
+        if hasattr(config, "qkv_bias"):
             attention_bias = config.qkv_bias
 
         # By default, Llama uses causal attention as it is a decoder-only model.
@@ -281,8 +323,9 @@ class LlamaDecoderLayer(nn.Module):
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -301,42 +344,44 @@ class LlamaDecoderLayer(nn.Module):
             bias=getattr(config, "mlp_bias", False),
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
+    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
+        """Get quantization config for this layer. Override in subclasses."""
+        return vllm_config.quant_config
+
 
 @support_torch_compile
 class LlamaModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 layer_type: type[nn.Module] = LlamaDecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = LlamaDecoderLayer,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
@@ -345,12 +390,16 @@ class LlamaModel(nn.Module):
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
         self.vocab_size = config.vocab_size + lora_vocab
         self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
@@ -371,26 +420,25 @@ class LlamaModel(nn.Module):
 
         self.aux_hidden_state_layers = tuple[int, ...]()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor,
-                                                        list[torch.Tensor]]]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -399,16 +447,16 @@ class LlamaModel(nn.Module):
 
         aux_hidden_states = []
         for idx, layer in enumerate(
-                islice(self.layers, self.start_layer, self.end_layer)):
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
             if idx in self.aux_hidden_state_layers:
                 aux_hidden_states.append(hidden_states + residual)
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
 
@@ -416,8 +464,7 @@ class LlamaModel(nn.Module):
             return hidden_states, aux_hidden_states
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -431,19 +478,19 @@ class LlamaModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -476,23 +523,24 @@ class LlamaModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
+class LlamaForCausalLM(
+    nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
+):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     # LoRA specific attributes
     embedding_modules = {
         "embed_tokens": "input_embeddings",
-        "lm_head": "output_embeddings"
+        "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
 
@@ -522,11 +570,13 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         "norm": "model.norm",
     }
 
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 layer_type: type[nn.Module] = LlamaDecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = LlamaDecoderLayer,
+    ):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
@@ -534,9 +584,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self.config = config
         self.lora_config = lora_config
 
-        self.model = self._init_model(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"),
-                                      layer_type=layer_type)
+        self.model = self._init_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            layer_type=layer_type,
+        )
 
         if get_pp_group().is_last_rank:
             self.unpadded_vocab_size = config.vocab_size
@@ -550,71 +602,77 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
                     DEFAULT_VOCAB_PADDING_SIZE
                     # We need bigger padding if using lora for kernel
                     # compatibility
-                    if not lora_config else
-                    lora_config.lora_vocab_padding_size),
+                    if not lora_config
+                    else lora_config.lora_vocab_padding_size
+                ),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
             if config.tie_word_embeddings:
-                self.lm_head = self.lm_head.tie_weights(
-                    self.model.embed_tokens)
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                self.unpadded_vocab_size, config.vocab_size, logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         self.model.aux_hidden_state_layers = layers
 
     def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Override to return default layers for Llama
+
+        Note: The GPU model runner will override this with layers from
+        the speculative config if available, providing dynamic configuration.
+        """
         num_layers = len(self.model.layers)
         return (2, num_layers // 2, num_layers - 3)
 
-    def _init_model(self,
-                    vllm_config: VllmConfig,
-                    prefix: str = "",
-                    layer_type: type[nn.Module] = LlamaDecoderLayer):
-        return LlamaModel(vllm_config=vllm_config,
-                          prefix=prefix,
-                          layer_type=layer_type)
+    def _init_model(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = LlamaDecoderLayer,
+    ):
+        return LlamaModel(vllm_config=vllm_config, prefix=prefix, layer_type=layer_type)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(
             self.maybe_remap_mistral(name, loaded_weight)
-            for name, loaded_weight in weights)
+            for name, loaded_weight in weights
+        )
 
     # This function is used to remap the mistral format as
     # used by Mistral and Llama <=2
@@ -623,12 +681,14 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         name: str,
         loaded_weight: torch.Tensor,
     ) -> tuple[str, torch.Tensor]:
-
         def permute(w: torch.Tensor, n_heads: int, attn_out: int):
             attn_in = self.config.head_dim * n_heads
 
-            return w.view(n_heads, attn_in // n_heads // 2, 2,
-                          attn_out).transpose(1, 2).reshape(attn_in, attn_out)
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
 
         mapping = self.mistral_mapping
         modules = name.split(".")
@@ -637,29 +697,32 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         # If using quantized model in mistral format,
         # quantization scales (qscale_weight) also need to be sliced
         if "wk" in modules and modules[-1] == "weight":
-            loaded_weight = permute(loaded_weight,
-                                    self.config.num_key_value_heads,
-                                    self.config.hidden_size)
-        elif "wk" in modules and modules[
-                -1] == "qscale_weight" and loaded_weight.numel() > 1:
-            loaded_weight = permute(loaded_weight,
-                                    self.config.num_key_value_heads, 1)
+            loaded_weight = permute(
+                loaded_weight, self.config.num_key_value_heads, self.config.hidden_size
+            )
+        elif (
+            "wk" in modules
+            and modules[-1] == "qscale_weight"
+            and loaded_weight.numel() > 1
+        ):
+            loaded_weight = permute(loaded_weight, self.config.num_key_value_heads, 1)
         elif "wq" in modules and modules[-1] == "weight":
-            loaded_weight = permute(loaded_weight,
-                                    self.config.num_attention_heads,
-                                    self.config.hidden_size)
-        elif "wq" in modules and modules[
-                -1] == "qscale_weight" and loaded_weight.numel() > 1:
-            loaded_weight = permute(loaded_weight,
-                                    self.config.num_attention_heads, 1)
+            loaded_weight = permute(
+                loaded_weight, self.config.num_attention_heads, self.config.hidden_size
+            )
+        elif (
+            "wq" in modules
+            and modules[-1] == "qscale_weight"
+            and loaded_weight.numel() > 1
+        ):
+            loaded_weight = permute(loaded_weight, self.config.num_attention_heads, 1)
 
         num_modules = len(modules)
         for i in range(num_modules):
             item = modules[i]
             next_item = modules[i + 1] if i < num_modules - 1 else None
 
-            combined_item = (f"{item}.{next_item}"
-                             if next_item is not None else None)
+            combined_item = f"{item}.{next_item}" if next_item is not None else None
 
             if combined_item in mapping:
                 name = name.replace(combined_item, mapping[combined_item])
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 32d4f69c6bf18bcc98c98e308ac03fdde6647133..a7e0732ec71e2bf5551bfe223558762594761486 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -17,8 +17,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -28,27 +29,40 @@ from vllm.attention import Attention
 from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.distributed import (
+    get_ep_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.interfaces import MixtureOfExperts
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 
 from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel
-from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk,
-                    is_pp_missing_parameter)
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    fast_topk,
+    is_pp_missing_parameter,
+)
 
+logger = init_logger(__name__)
 
-class Llama4MoE(nn.Module):
 
+class Llama4MoE(nn.Module):
     @staticmethod
     def custom_routing_function(
         hidden_states: torch.Tensor,
@@ -71,13 +85,18 @@ class Llama4MoE(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.top_k = config.num_experts_per_tok
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
 
         intermediate_size_moe = config.intermediate_size
-        self.router = ReplicatedLinear(config.hidden_size,
-                                       config.num_local_experts,
-                                       bias=False,
-                                       quant_config=None,
-                                       prefix=f"{prefix}.router")
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.router",
+        )
 
         self.shared_expert = LlamaMLP(
             hidden_size=config.hidden_size,
@@ -90,6 +109,20 @@ class Llama4MoE(nn.Module):
             disable_tp=self.is_sequence_parallel,
         )
 
+        # Load balancing settings.
+        eplb_config = parallel_config.eplb_config if parallel_config else None
+        self.enable_eplb = parallel_config.enable_eplb if parallel_config else False
+        self.n_redundant_experts = (
+            eplb_config.num_redundant_experts if eplb_config else 0
+        )
+
+        self.n_routed_experts: int = config.num_local_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_shared_experts: int = 1
+        self.n_local_experts: int = config.num_local_experts
+        self.n_physical_experts = self.n_local_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
         self.experts = SharedFusedMoE(
             shared_experts=self.shared_expert,
             num_experts=config.num_local_experts,
@@ -103,6 +136,8 @@ class Llama4MoE(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
             is_sequence_parallel=self.is_sequence_parallel,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
         )
 
     def forward(self, hidden_states):
@@ -123,26 +158,28 @@ class Llama4MoE(nn.Module):
             experts_out = experts_out[:num_tokens]
         elif self.tp_size > 1:
             experts_out = self.experts.maybe_all_reduce_tensor_model_parallel(
-                experts_out)
+                experts_out
+            )
 
         return experts_out
 
 
 class Llama4Attention(nn.Module):
-
-    def __init__(self,
-                 config: Llama4TextConfig,
-                 hidden_size: int,
-                 num_heads: int,
-                 num_kv_heads: int,
-                 rope_theta: float = 10000,
-                 rope_scaling: Optional[dict[str, Any]] = None,
-                 max_position_embeddings: int = 8192,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 bias: bool = False,
-                 bias_o_proj: bool = False,
-                 cache_config: Optional[CacheConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.layer_idx = extract_layer_index(prefix)
         self.hidden_size = hidden_size
@@ -167,20 +204,23 @@ class Llama4Attention(nn.Module):
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
-        self.attn_temperature_tuning = self.nope and \
-            config.attn_temperature_tuning
+        self.attn_temperature_tuning = self.nope and config.attn_temperature_tuning
 
         self.floor_scale = getattr(config, "floor_scale", 8192.0)
         self.attn_scale = getattr(config, "attn_scale", 0.1)
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
         self.n_rep = self.num_heads // self.num_kv_heads
-        self.qk_norm = RMSNorm(
-            hidden_size=self.head_dim,
-            eps=config.rms_norm_eps,
-            has_weight=False,
-            dtype=torch.float32,
-        ) if self.use_qk_norm else None
+        self.qk_norm = (
+            RMSNorm(
+                hidden_size=self.head_dim,
+                eps=config.rms_norm_eps,
+                has_weight=False,
+                dtype=torch.float32,
+            )
+            if self.use_qk_norm
+            else None
+        )
         self.qkv_proj = QKVParallelLinear(
             hidden_size=hidden_size,
             head_size=self.head_dim,
@@ -203,18 +243,21 @@ class Llama4Attention(nn.Module):
         if is_gguf and config.model_type == "llama":
             is_neox_style = False
 
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=max_position_embeddings,
-            base=int(rope_theta),
-            rope_scaling=rope_scaling if rope_scaling != "default" else None,
-            is_neox_style=is_neox_style,
-        ) if not self.nope else None
+        self.rotary_emb = (
+            get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=int(rope_theta),
+                rope_scaling=rope_scaling if rope_scaling != "default" else None,
+                is_neox_style=is_neox_style,
+            )
+            if not self.nope
+            else None
+        )
 
         use_chunked_local_attn = not self.nope and config.attention_chunk_size
-        attn_cls = (ChunkedLocalAttention
-                    if use_chunked_local_attn else Attention)
+        attn_cls = ChunkedLocalAttention if use_chunked_local_attn else Attention
         self.attn = attn_cls(
             self.num_heads,
             self.head_dim,
@@ -223,9 +266,12 @@ class Llama4Attention(nn.Module):
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
-            **({
-                "attention_chunk_size": config.attention_chunk_size
-            } if use_chunked_local_attn else {}))
+            **(
+                {"attention_chunk_size": config.attention_chunk_size}
+                if use_chunked_local_attn
+                else {}
+            ),
+        )
 
     def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
         floor = torch.floor((positions + 1.0) / self.floor_scale)
@@ -270,11 +316,12 @@ class Llama4Attention(nn.Module):
 
 
 class Llama4DecoderLayer(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 config: Optional[Llama4TextConfig] = None) -> None:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        config: Llama4TextConfig | None = None,
+    ) -> None:
         super().__init__()
 
         config = config or vllm_config.model_config.hf_config
@@ -302,8 +349,10 @@ class Llama4DecoderLayer(nn.Module):
             cache_config=cache_config,
             prefix=f"{prefix}.self_attn",
         )
-        is_moe_layer = config.interleave_moe_layer_step > 0 and (
-            self.layer_idx + 1) % config.interleave_moe_layer_step == 0
+        is_moe_layer = (
+            config.interleave_moe_layer_step > 0
+            and (self.layer_idx + 1) % config.interleave_moe_layer_step == 0
+        )
         if is_moe_layer:
             self.feed_forward = Llama4MoE(
                 vllm_config=vllm_config,
@@ -318,46 +367,45 @@ class Llama4DecoderLayer(nn.Module):
                 bias=False,
                 prefix=f"{prefix}.feed_forward",
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class Llama4Model(LlamaModel):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 layer_type: type[Llama4DecoderLayer] = Llama4DecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[Llama4DecoderLayer] = Llama4DecoderLayer,
+    ):
         self.num_experts = vllm_config.model_config.hf_config.num_local_experts
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=layer_type)
+        self.n_redundant_experts = (
+            vllm_config.parallel_config.eplb_config.num_redundant_experts
+        )
+        super().__init__(vllm_config=vllm_config, prefix=prefix, layer_type=layer_type)
 
     def load_moe_expert_weights(
         self,
@@ -377,7 +425,7 @@ class Llama4Model(LlamaModel):
             params_dict: The dictionary of module parameters.
             loaded_params: The set of already loaded parameters.
             expert_params_mapping: The mapping of expert parameters. Must be
-                generated by FusedMoE.make_expert_params_mapping().
+                generated by SharedFusedMoE.make_expert_params_mapping().
             fused: Whether the expert weights are fused into a single weight
                 tensor or are separate weight tensors for each expert.
                 When fused is True, loaded_weight should have shape of:
@@ -408,9 +456,7 @@ class Llama4Model(LlamaModel):
 
         # Iterate over all the expert parameters and load the weights if we find
         # a match in weight name.
-        for (param_name, weight_name, expert_id,
-             shard_id) in expert_params_mapping:
-
+        for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
             # Get a view of the loaded_weight to avoid modifying the original
             # one across iterations.
             new_loaded_weight = loaded_weight
@@ -419,7 +465,7 @@ class Llama4Model(LlamaModel):
             # the expert index from the expected weight name.
             if fused:
                 # The string between e_str and proj_str is the expert index.
-                e_str, _, proj_str, _ = weight_name.split('.')
+                e_str, _, proj_str, _ = weight_name.split(".")
                 weight_name = f"{e_str}.{proj_str}"
                 param_name = f"{param_name}weight"
 
@@ -436,8 +482,9 @@ class Llama4Model(LlamaModel):
                 continue
 
             # Skip if the current weight is for the bias.
-            if ((name.endswith(".bias") or name.endswith("_bias"))
-                    and name not in params_dict):
+            if (
+                name.endswith(".bias") or name.endswith("_bias")
+            ) and name not in params_dict:
                 continue
 
             param = params_dict[full_param_name]
@@ -456,13 +503,14 @@ class Llama4Model(LlamaModel):
                 # starting expert index for the current EP rank and extract the
                 # corresponding expert weights.
                 layer_idx = extract_layer_index(name)
-                expert_map = self.layers[
-                    layer_idx].feed_forward.experts.expert_map
+                expert_map = self.layers[layer_idx].feed_forward.experts.expert_map
                 if expert_map is not None:
-                    local_expert_indices = (expert_map != -1) \
-                                            .nonzero() \
-                                            .flatten() \
-                                            .to(new_loaded_weight.device)
+                    local_expert_indices = (
+                        (expert_map != -1)
+                        .nonzero()
+                        .flatten()
+                        .to(new_loaded_weight.device)
+                    )
                     new_loaded_weight = new_loaded_weight[local_expert_indices]
                     expert_id = local_expert_indices[0].item()
             else:
@@ -471,19 +519,19 @@ class Llama4Model(LlamaModel):
 
             # Load the weight into the module parameter with corresponding
             # shard id and expert id.
-            weight_loader(param,
-                          new_loaded_weight,
-                          full_param_name,
-                          shard_id=shard_id,
-                          expert_id=expert_id)
-
+            weight_loader(
+                param,
+                new_loaded_weight,
+                full_param_name,
+                shard_id=shard_id,
+                expert_id=expert_id,
+            )
             loaded_params.add(full_param_name)
             expert_param_loaded = True
 
         return expert_param_loaded
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         # Name mapping from the parameter name to the shard name and
         # corresponding shard id.
         stacked_params_mapping = [
@@ -499,18 +547,21 @@ class Llama4Model(LlamaModel):
         fused_experts_params = False
         # Expert parameter mapping for the case where the expert weights are
         # not fused into a single weight tensor.
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.num_experts)
+            num_experts=self.num_experts,
+            num_redundant_experts=self.n_redundant_experts,
+        )
         # Expert parameter mapping for the case where the expert weights are
         # fused into a single weight tensor.
-        expert_params_mapping_fused = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_up_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="gate_up_proj",
-            num_experts=1)
+            num_experts=1,
+        )
         # All the module parameters.
         params_dict = dict(self.named_parameters())
         # The module parameters that have been loaded.
@@ -518,7 +569,6 @@ class Llama4Model(LlamaModel):
 
         # Iterate over all the weights and load them into module parameters.
         for name, loaded_weight in weights:
-
             # If the name contains "experts.gate_up_proj" or "experts.down_proj"
             # without the expert indices, it means the expert weights are fused
             # into a single weight tensor across all experts.
@@ -529,13 +579,14 @@ class Llama4Model(LlamaModel):
             # If kv cache quantization scales exist and the weight name
             # corresponds to one of the kv cache quantization scales, load
             # them.
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -552,8 +603,9 @@ class Llama4Model(LlamaModel):
 
                 # For ModelOpt checkpoints, we need to rename the self_attn
                 # weight/weight_scale names except for kv cache scales.
-                if not (name.endswith(
-                    (".k_scale", ".v_scale")) and "self_attn" in name):
+                if not (
+                    name.endswith((".k_scale", ".v_scale")) and "self_attn" in name
+                ):
                     name = name.replace(weight_name, param_name)
 
                 # Skip if the current weight corresponds to a parameter that
@@ -572,8 +624,7 @@ class Llama4Model(LlamaModel):
                 # Load the weight into the module parameter with corresponding
                 # shard id and exit the for loop and the else block.
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
 
                 if weight_loader == default_weight_loader:
                     weight_loader(param, loaded_weight)
@@ -587,12 +638,14 @@ class Llama4Model(LlamaModel):
             else:
                 # First, try to load MoE weights using load_moe_expert_weights.
                 # If successful, move on to next loaded weight.
-                if self.load_moe_expert_weights(name,
-                                                loaded_weight,
-                                                params_dict,
-                                                loaded_params,
-                                                expert_params_mapping,
-                                                fused=fused_experts_params):
+                if self.load_moe_expert_weights(
+                    name,
+                    loaded_weight,
+                    params_dict,
+                    loaded_params,
+                    expert_params_mapping,
+                    fused=fused_experts_params,
+                ):
                     continue
 
                 # Skip if the current weight corresponds to a parameter that
@@ -604,37 +657,40 @@ class Llama4Model(LlamaModel):
                 # per-expert patterns, i.e. one weight scale tensor for all
                 # experts.
                 scale_names = [
-                    "w13_input_scale", "w13_weight_scale", "w2_input_scale",
-                    "w2_weight_scale"
+                    "w13_input_scale",
+                    "w13_weight_scale",
+                    "w2_input_scale",
+                    "w2_weight_scale",
                 ]
-                if ("experts." in name and any(scale_name in name
-                                               for scale_name in scale_names)):
-
+                if "experts." in name and any(
+                    scale_name in name for scale_name in scale_names
+                ):
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
 
                     # If weight loader supports special moe loading, use it to
                     # avoid expensive runtime reflection
-                    if getattr(weight_loader, 'supports_moe_loading', False):
+                    if getattr(weight_loader, "supports_moe_loading", False):
                         # Map the weight name to the corresponding shard id.
                         shard_id = "w2" if "w2_" in name else "w1"
 
                         # Transpose if weight scales are FP8 block scales with
                         # three dimensions:
                         # [num_experts, hidden_in, hidden_out].
-                        if name.endswith("weight_scale") \
-                            and loaded_weight.dtype == torch.float8_e4m3fn \
-                            and loaded_weight.ndim == 3:
+                        if (
+                            name.endswith("weight_scale")
+                            and loaded_weight.dtype == torch.float8_e4m3fn
+                            and loaded_weight.ndim == 3
+                        ):
                             loaded_weight = loaded_weight.transpose(-1, -2)
 
                         # Load the weight into the module parameter with
                         # corresponding shard id and expert id.
-                        weight_loader(param,
-                                      loaded_weight,
-                                      name,
-                                      shard_id=shard_id,
-                                      expert_id=0)
+                        weight_loader(
+                            param, loaded_weight, name, shard_id=shard_id, expert_id=0
+                        )
 
                     else:
                         # Regular weight loader (handles both
@@ -646,8 +702,7 @@ class Llama4Model(LlamaModel):
 
                 # Handle normal (non-stacked, non-MoE) weights.
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
                 loaded_params.add(name)
 
@@ -655,8 +710,7 @@ class Llama4Model(LlamaModel):
         return loaded_params
 
 
-class Llama4ForCausalLM(LlamaForCausalLM):
-
+class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -667,30 +721,80 @@ class Llama4ForCausalLM(LlamaForCausalLM):
         gen_config = vllm_config.model_config.try_get_generation_config()
         gen_config.update(vllm_config.model_config.override_generation_config)
         # enable temperature tuning by default when max_model_len > 32K
-        default_attn_temperature_tuning = \
-            vllm_config.model_config.max_model_len > 32768
-        vllm_config.model_config.hf_config.attn_temperature_tuning \
-            = gen_config.get(
-                "attn_temperature_tuning", default_attn_temperature_tuning)
-
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=Llama4DecoderLayer)
-
-    def _init_model(self,
-                    vllm_config: VllmConfig,
-                    prefix: str = "",
-                    layer_type: type[Llama4DecoderLayer] = Llama4DecoderLayer):
-        return Llama4Model(vllm_config=vllm_config,
-                           prefix=prefix,
-                           layer_type=layer_type)
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+        default_attn_temperature_tuning = vllm_config.model_config.max_model_len > 32768
+        vllm_config.model_config.hf_config.attn_temperature_tuning = gen_config.get(
+            "attn_temperature_tuning", default_attn_temperature_tuning
+        )
+
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, layer_type=Llama4DecoderLayer
+        )
+        # Set MoE hyperparameters
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+
+        self.moe_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            assert isinstance(layer, Llama4DecoderLayer)
+            if isinstance(layer.feed_forward, Llama4MoE):
+                # Pick last one layer since the first ones may be dense layers.
+                example_moe = layer.feed_forward
+                self.moe_layers.append(layer.feed_forward.experts)
+
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("No Llama4MoE layer found in model.layers.")
+        else:
+            self.num_moe_layers = len(self.moe_layers)
+            self.num_expert_groups = 1
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.feed_forward, Llama4MoE):
+                moe = layer.feed_forward
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def _init_model(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[Llama4DecoderLayer] = Llama4DecoderLayer,
+    ):
+        return Llama4Model(
+            vllm_config=vllm_config, prefix=prefix, layer_type=layer_type
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         weights = [
             self.permute_qk_weight_for_rotary(name, loaded_weight)
@@ -703,10 +807,8 @@ class Llama4ForCausalLM(LlamaForCausalLM):
         name: str,
         loaded_weight: torch.Tensor,
     ) -> tuple[str, torch.Tensor]:
-
         # Helper function to permute the weight's channels
         def permute(w: torch.Tensor, n_heads: int, is_weight_scale: bool):
-
             # Calculate the expected shape of the weight.
             # Do not rely on w's shape, as it may be in another layout.
             attn_in = self.config.head_dim * n_heads
@@ -719,28 +821,39 @@ class Llama4ForCausalLM(LlamaForCausalLM):
 
             # If the weight is a weight scale, we need to divide attn_out by
             # block size, which is currently 16.
-            elif w.dtype == torch.float8_e4m3fn and is_weight_scale \
-                and w.shape[1] * 16 == attn_out:
+            elif (
+                w.dtype == torch.float8_e4m3fn
+                and is_weight_scale
+                and w.shape[1] * 16 == attn_out
+            ):
                 attn_out = attn_out // 16
 
-            return w.view(n_heads, attn_in // n_heads // 2, 2,
-                          attn_out).transpose(1, 2).reshape(attn_in, attn_out)
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
 
         modules = name.split(".")
 
         # Permute Q/K weights and weight block scales for rotary embedding
         is_weight = modules[-1] == "weight"
-        is_nvfp4_weight_scale = (modules[-1] == "weight_scale" and
-                                 loaded_weight.dtype == torch.float8_e4m3fn)
+        is_nvfp4_weight_scale = (
+            modules[-1] == "weight_scale" and loaded_weight.dtype == torch.float8_e4m3fn
+        )
 
         if is_weight or is_nvfp4_weight_scale:
-            if ("wk" in modules or "k_proj" in modules):
-                loaded_weight = permute(loaded_weight,
-                                        self.config.num_key_value_heads,
-                                        is_nvfp4_weight_scale)
-            elif ("wq" in modules or "q_proj" in modules):
-                loaded_weight = permute(loaded_weight,
-                                        self.config.num_attention_heads,
-                                        is_nvfp4_weight_scale)
+            if "wk" in modules or "k_proj" in modules:
+                loaded_weight = permute(
+                    loaded_weight,
+                    self.config.num_key_value_heads,
+                    is_nvfp4_weight_scale,
+                )
+            elif "wq" in modules or "q_proj" in modules:
+                loaded_weight = permute(
+                    loaded_weight,
+                    self.config.num_attention_heads,
+                    is_nvfp4_weight_scale,
+                )
 
         return name, loaded_weight
diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index 583ed15a4eeeca852a6110253c1ef48847d09e79..660c8f1bb522609c5306aa93c373d10a6686ca86 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -17,7 +17,6 @@
 # limitations under the License.
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -28,36 +27,31 @@ from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.torchao import TorchAOConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.llama4 import (Llama4DecoderLayer,
-                                               Llama4ForCausalLM)
+from vllm.model_executor.models.llama4 import Llama4DecoderLayer, Llama4ForCausalLM
 from vllm.model_executor.models.utils import extract_layer_index
-from vllm.multimodal.inputs import NestedTensors
 
-from .utils import AutoWeightsLoader, maybe_prefix, merge_multimodal_embeddings
+from .interfaces import SupportsMultiModal
+from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight
 
 logger = init_logger(__name__)
 
 
 @support_torch_compile
 class LlamaModel(nn.Module):
-
     def __init__(
         self,
         *,
         vllm_config: VllmConfig,
         prefix: str = "",
         start_layer_id: int = 0,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
-        self.config = (
-            vllm_config.speculative_config.draft_model_config.hf_config)
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         self.validate_and_update_config(start_layer_id, quant_config)
         self.vocab_size = self.config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
@@ -66,36 +60,41 @@ class LlamaModel(nn.Module):
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
 
-        self.layers = nn.ModuleList([
-            Llama4DecoderLayer(
-                vllm_config=vllm_config,
-                prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-                config=self.config,
-            ) for i in range(self.config.num_hidden_layers)
-        ])
-        self.fc = torch.nn.Linear(self.config.hidden_size * 2,
-                                  self.config.hidden_size,
-                                  bias=False)
-        self.norm = RMSNorm(self.config.hidden_size,
-                            eps=self.config.rms_norm_eps)
-
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-    ) -> torch.Tensor:
+        # Temporarily modify vllm_config.quant_config for draft model layers
+        original_quant_config = vllm_config.quant_config
+        vllm_config.quant_config = quant_config
+        try:
+            self.layers = nn.ModuleList(
+                [
+                    Llama4DecoderLayer(
+                        vllm_config=vllm_config,
+                        prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                        config=self.config,
+                    )
+                    for i in range(self.config.num_hidden_layers)
+                ]
+            )
+        finally:
+            # Restore original quant_config
+            vllm_config.quant_config = original_quant_config
+        self.fc = torch.nn.Linear(
+            self.config.hidden_size * 2, self.config.hidden_size, bias=False
+        )
+        self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is None:
-            inputs_embeds = self.get_input_embeddings(input_ids)
-        hidden_states = self.fc(
-            torch.cat((inputs_embeds, hidden_states), dim=-1))
+            inputs_embeds = self.embed_input_ids(input_ids)
+        hidden_states = self.fc(torch.cat((inputs_embeds, hidden_states), dim=-1))
         residual = None
         for layer in self.layers:
             hidden_states, residual = layer(
@@ -106,8 +105,7 @@ class LlamaModel(nn.Module):
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states, hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -130,88 +128,91 @@ class LlamaModel(nn.Module):
                 break
             else:
                 # if PP disabled then draft will share embed with target
-                if get_pp_group().world_size == 1 and \
-                    "embed_tokens." in name:
+                if get_pp_group().world_size == 1 and "embed_tokens." in name:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         for name in params_dict:
             # if PP disabled then draft will share embed with target
-            if get_pp_group().world_size == 1 and \
-                "embed_tokens." in name:
+            if get_pp_group().world_size == 1 and "embed_tokens." in name:
                 continue
             assert name in loaded_params, f"{name} is not loaded!"
         return loaded_params
 
     def validate_and_update_config(
-            self,
-            start_layer_id: int,
-            quant_config: Optional[QuantizationConfig] = None) -> None:
+        self, start_layer_id: int, quant_config: QuantizationConfig | None = None
+    ) -> None:
         # yoco and moe is not supported by draft model yet
         assert self.config.yoco_global_kv_layer is None
         assert self.config.yoco_local_kv_layer is None
         assert len(self.config.moe_layers) == 0
         # draft model layer index is increased by start_layer_id,
         # so we need to pad relevant configs accordingly
-        self.config.no_rope_layers = [
-            0
-        ] * start_layer_id + self.config.no_rope_layers
+        self.config.no_rope_layers = [0] * start_layer_id + self.config.no_rope_layers
         # currently only TorchAO quantization is supported
         if isinstance(quant_config, TorchAOConfig):
 
             def pad_layer_name(layer: str) -> str:
                 layer_index = extract_layer_index(layer)
-                return layer.replace(str(layer_index),
-                                     str(layer_index + start_layer_id))
+                return layer.replace(
+                    str(layer_index), str(layer_index + start_layer_id)
+                )
 
-            quant_config.torchao_config.module_fqn_to_config = {
+            torchao_config = quant_config.torchao_config
+            torchao_config.module_fqn_to_config = {
                 pad_layer_name(layer): quantization
-                for layer, quantization in
-                quant_config.torchao_config.module_fqn_to_config.items()
+                for layer, quantization in torchao_config.module_fqn_to_config.items()
             }
 
 
 class EagleLlama4ForCausalLM(Llama4ForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
-        self.config = (
-            vllm_config.speculative_config.draft_model_config.hf_config)
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         target_layer_num = vllm_config.model_config.get_num_layers(
-            vllm_config.parallel_config)
+            vllm_config.parallel_config
+        )
         # draft model quantization config may differ from target model
         quant_config = VllmConfig.get_quantization_config(
-            vllm_config.speculative_config.draft_model_config,
-            vllm_config.load_config)
-        self.model = LlamaModel(vllm_config=vllm_config,
-                                prefix="model",
-                                start_layer_id=target_layer_num,
-                                quant_config=quant_config)
+            vllm_config.speculative_config.draft_model_config, vllm_config.load_config
+        )
+        self.model = LlamaModel(
+            vllm_config=vllm_config,
+            prefix="model",
+            start_layer_id=target_layer_num,
+            quant_config=quant_config,
+        )
         logit_scale = getattr(self.config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.config.vocab_size,
-                                                scale=logit_scale)
+        self.logits_processor = LogitsProcessor(
+            self.config.vocab_size, scale=logit_scale
+        )
+
+        # Set MoE hyperparameters
+        self.set_moe_parameters()
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
+    embed_input_ids = SupportsMultiModal.embed_input_ids  # type: ignore
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.model(input_ids, positions, hidden_states, inputs_embeds)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> None:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None:
         def transform(inputs):
             name, loaded_weight = inputs
-            name, weight = self.permute_qk_weight_for_rotary(
-                name, loaded_weight)
+            name, weight = self.permute_qk_weight_for_rotary(name, loaded_weight)
             if "lm_head" not in name:
                 name = "model." + name
+            process_eagle_weight(self, name)
             return name, weight
 
         loader = AutoWeightsLoader(
@@ -220,20 +221,3 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
             skip_prefixes=(["lm_head."]),
         )
         loader.load_weights(map(transform, weights))
-
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[NestedTensors] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.model.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is not None:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-
-        return inputs_embeds
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 9a5b453ca0f3945f8714e298301c18f83568dc07..90ab5c50361b6534ffcfa3c6b9e95cb7e3488c4d 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -12,26 +11,33 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.llama import (LlamaDecoderLayer,
-                                              LlamaForCausalLM)
-
-from .utils import AutoWeightsLoader, maybe_prefix
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+
+from .utils import (
+    AutoWeightsLoader,
+    get_draft_quant_config,
+    maybe_prefix,
+    process_eagle_weight,
+)
 
 logger = init_logger(__name__)
 
 
 class LlamaDecoderLayer(LlamaDecoderLayer):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
         disable_input_layernorm: bool,
         prefix: str = "",
-        config: Optional[LlamaConfig] = None,
+        config: LlamaConfig | None = None,
     ) -> None:
         super().__init__(vllm_config, prefix=prefix, config=config)
 
@@ -41,10 +47,13 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
             del self.input_layernorm
             self.input_layernorm = nn.Identity()
 
+    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
+        """Use drafter's quantization config instead of verifier's."""
+        return get_draft_quant_config(vllm_config)
+
 
 @support_torch_compile
 class LlamaModel(nn.Module):
-
     def __init__(
         self,
         *,
@@ -53,27 +62,41 @@ class LlamaModel(nn.Module):
         start_layer_id: int = 0,
     ) -> None:
         super().__init__()
-        self.config = vllm_config. \
-            speculative_config.draft_model_config.hf_config
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
+        # Get drafter's quantization config
+        self.quant_config = get_draft_quant_config(vllm_config)
+
         self.embed_tokens = VocabParallelEmbedding(
             self.config.vocab_size,
             self.config.hidden_size,
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
 
-        self.layers = nn.ModuleList([
-            LlamaDecoderLayer(
-                vllm_config,
-                i == 0,
-                prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-                config=self.config,
-            ) for i in range(self.config.num_hidden_layers)
-        ])
-        self.fc = torch.nn.Linear(self.config.hidden_size * 2,
-                                  self.config.hidden_size,
-                                  bias=False)
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    vllm_config,
+                    i == 0,
+                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                    config=self.config,
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+        self.fc = ReplicatedLinear(
+            input_size=self.config.hidden_size * 2,
+            output_size=self.config.hidden_size,
+            bias=False,
+            params_dtype=vllm_config.model_config.dtype,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "fc"),
+            return_bias=False,
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
 
     def forward(
         self,
@@ -82,8 +105,7 @@ class LlamaModel(nn.Module):
         hidden_states: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         input_embeds = self.embed_tokens(input_ids)
-        hidden_states = self.fc(
-            torch.cat((input_embeds, hidden_states), dim=-1))
+        hidden_states = self.fc(torch.cat((input_embeds, hidden_states), dim=-1))
         residual = None
         for layer in self.layers:
             hidden_states, residual = layer(
@@ -94,8 +116,7 @@ class LlamaModel(nn.Module):
         hidden_states = hidden_states + residual
         return hidden_states, hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -107,6 +128,24 @@ class LlamaModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
+            # Handle kv cache quantization scales
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            # Remapping the name FP8 kv-scale
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -116,47 +155,47 @@ class LlamaModel(nn.Module):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-
                 # if PP disabled then draft will share embed with target
-                if get_pp_group().world_size == 1 and \
-                    "embed_tokens." in name:
+                if get_pp_group().world_size == 1 and "embed_tokens." in name:
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class EagleLlamaForCausalLM(LlamaForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
-        self.config = vllm_config. \
-            speculative_config.draft_model_config.hf_config
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         # Ensure draft_vocab_size is set
         # default to the base vocab size when absent
         if getattr(self.config, "draft_vocab_size", None) is None:
             base_vocab_size = getattr(self.config, "vocab_size", None)
             self.config.draft_vocab_size = base_vocab_size
         target_layer_num = vllm_config.model_config.get_num_layers(
-            vllm_config.parallel_config)
-        self.model = LlamaModel(vllm_config=vllm_config,
-                                prefix="model",
-                                start_layer_id=target_layer_num)
+            vllm_config.parallel_config
+        )
+        self.model = LlamaModel(
+            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+        )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.config.vocab_size,
-                                                scale=logit_scale)
+        self.logits_processor = LogitsProcessor(
+            self.config.vocab_size, scale=logit_scale
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
             raise NotImplementedError(
@@ -165,11 +204,11 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
         return self.model(input_ids, positions, hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         def transform(inputs):
             name, loaded_weight = inputs
             if "lm_head" not in name:
                 name = "model." + name
+            process_eagle_weight(self, name)
             return name, loaded_weight
 
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 1244d3f8c0be29a007c43c898f66543dd9fdb86b..75c671311b49159ea20874e461b8c25257670d44 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -12,33 +11,51 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import QKVParallelLinear
+from vllm.model_executor.layers.linear import QKVParallelLinear, ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.llama import (LlamaDecoderLayer,
-                                              LlamaForCausalLM)
-
-from .utils import AutoWeightsLoader, maybe_prefix
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import NestedTensors
+
+from .utils import (
+    AutoWeightsLoader,
+    get_draft_quant_config,
+    maybe_prefix,
+    process_eagle_weight,
+)
 
 logger = init_logger(__name__)
 
 
 class LlamaDecoderLayer(LlamaDecoderLayer):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 config: Optional[LlamaConfig] = None) -> None:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        config: LlamaConfig | None = None,
+        layer_idx: int = 0,
+    ) -> None:
         super().__init__(vllm_config, prefix=prefix, config=config)
 
         config = config or vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
+        quant_config = self.get_quant_config(vllm_config)
+
+        # First layer uses 2*hidden_size (embeds + hidden_states concatenated)
+        # Subsequent layers use hidden_size (only hidden_states, no embeds)
+        qkv_input_size = 2 * self.hidden_size if layer_idx == 0 else self.hidden_size
 
         # override qkv
         self.self_attn.qkv_proj = QKVParallelLinear(
-            2 * self.hidden_size,
+            qkv_input_size,
             self.self_attn.head_dim,
             self.self_attn.total_num_heads,
             self.self_attn.total_num_kv_heads,
@@ -48,22 +65,27 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         )
 
         self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
 
         if getattr(config, "norm_before_residual", False):
             self._residual_norm = self._norm_before_residual
         else:
             self._residual_norm = self._norm_after_residual
 
+    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
+        """Use drafter's quantization config instead of verifier's."""
+        return get_draft_quant_config(vllm_config)
+
     def _norm_before_residual(
-            self,
-            hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self, hidden_states: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         hidden_states = self.hidden_norm(hidden_states)
         residual = hidden_states
         return hidden_states, residual
 
     def _norm_after_residual(
-            self,
-            hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self, hidden_states: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         residual = hidden_states
         hidden_states = self.hidden_norm(hidden_states)
         return hidden_states, residual
@@ -73,23 +95,24 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         positions: torch.Tensor,
         embeds: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.layer_idx == 0:
+            # First layer: concatenate embeds with hidden_states
+            embeds = self.input_layernorm(embeds)
+            hidden_states, residual = self._residual_norm(hidden_states=hidden_states)
+            hidden_states = torch.cat([embeds, hidden_states], dim=-1)
+        else:
+            # Subsequent layers: process hidden_states and residuals only
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
-        embeds = self.input_layernorm(embeds)
-
-        hidden_states, residual = self._residual_norm(
-            hidden_states=hidden_states)
-
-        hidden_states = torch.cat([embeds, hidden_states], dim=-1)
         # Self Attention
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
         # Fully Connected
         hidden_states = self.mlp(hidden_states)
@@ -97,9 +120,16 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         return hidden_states, residual
 
 
-@support_torch_compile
+@support_torch_compile(
+    # torch.compile is disabled for multimodal EAGLE3 models due to constraint
+    # violations with dynamic shapes during tensor concatenation operations.
+    # See: https://github.com/vllm-project/vllm/pull/22872/files#r2362028132
+    # Non-multimodal EAGLE3 models can still use torch.compile safely.
+    enable_if=lambda vllm_config: not MULTIMODAL_REGISTRY.supports_multimodal_inputs(
+        vllm_config.model_config
+    ),
+)
 class LlamaModel(nn.Module):
-
     def __init__(
         self,
         *,
@@ -108,10 +138,12 @@ class LlamaModel(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
-        self.config = vllm_config. \
-            speculative_config.draft_model_config.hf_config
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
+        # Get drafter's quantization config
+        self.quant_config = get_draft_quant_config(vllm_config)
+
         current_vllm_config = get_current_vllm_config()
 
         self.embed_tokens = VocabParallelEmbedding(
@@ -120,48 +152,62 @@ class LlamaModel(nn.Module):
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
 
-        self.layers = nn.ModuleList([
-            LlamaDecoderLayer(
-                current_vllm_config,
-                prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"),
-                config=self.config,
-            )
-        ])
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    current_vllm_config,
+                    prefix=maybe_prefix(prefix, f"layers.{layer_idx + start_layer_id}"),
+                    config=self.config,
+                    layer_idx=layer_idx,
+                )
+                for layer_idx in range(self.config.num_hidden_layers)
+            ]
+        )
         if hasattr(self.config, "target_hidden_size"):
-            self.fc = torch.nn.Linear(self.config.target_hidden_size * 3,
-                                      self.config.hidden_size,
-                                      bias=False)
+            fc_input_size = self.config.target_hidden_size * 3
         else:
-            self.fc = torch.nn.Linear(self.config.hidden_size * 3,
-                                      self.config.hidden_size,
-                                      bias=False)
+            fc_input_size = self.config.hidden_size * 3
+        self.fc = ReplicatedLinear(
+            input_size=fc_input_size,
+            output_size=self.config.hidden_size,
+            bias=False,
+            params_dtype=vllm_config.model_config.dtype,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "fc"),
+            return_bias=False,
+        )
+
         self.norm = RMSNorm(
             self.config.hidden_size,
             eps=self.config.rms_norm_eps,
         )
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
+        input_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        input_embeds = self.embed_tokens(input_ids)
+        if input_embeds is None:
+            input_embeds = self.embed_input_ids(input_ids)
         assert hidden_states.shape[-1] == input_embeds.shape[-1]
 
         residual = None
-        hidden_states, residual = self.layers[0](
-            positions,
-            input_embeds,
-            hidden_states,
-            residual,
-        )
-
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions=positions,
+                embeds=input_embeds,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
         hidden_states, hidden_prenorm = self.norm(hidden_states, residual)
         return hidden_states, hidden_prenorm
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -173,8 +219,26 @@ class LlamaModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if 'midlayer.' in name:
-                name = name.replace('midlayer.', 'layers.0.')
+            if "midlayer." in name:
+                name = name.replace("midlayer.", "layers.0.")
+            # Handle kv cache quantization scales
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            # Remapping the name FP8 kv-scale
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -185,78 +249,84 @@ class LlamaModel(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Eagle3LlamaForCausalLM(LlamaForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
-        self.config = vllm_config. \
-            speculative_config.draft_model_config.hf_config
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
         # Ensure draft_vocab_size is set
         # default to the base vocab size when absent
         if getattr(self.config, "draft_vocab_size", None) is None:
             base_vocab_size = getattr(self.config, "vocab_size", None)
             self.config.draft_vocab_size = base_vocab_size
         target_layer_num = vllm_config.model_config.get_num_layers(
-            vllm_config.parallel_config)
+            vllm_config.parallel_config
+        )
 
         # Store target layer count in draft config for
         # proper layer_types indexing in draft models
         self.config.target_layer_count = target_layer_num
-        self.model = LlamaModel(vllm_config=vllm_config,
-                                prefix="model",
-                                start_layer_id=target_layer_num)
+        self.model = LlamaModel(
+            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+        )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
         self.lm_head = ParallelLMHead(
             self.config.draft_vocab_size,
             self.config.hidden_size,
-            org_num_embeddings=self.config.draft_vocab_size,
-            padding_size=(DEFAULT_VOCAB_PADDING_SIZE),
-            prefix=maybe_prefix(prefix, "lm_head"))
-        self.logits_processor = LogitsProcessor(self.config.draft_vocab_size,
-                                                scale=logit_scale)
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        self.logits_processor = LogitsProcessor(
+            self.config.draft_vocab_size, scale=logit_scale
+        )
         self.draft_id_to_target_id = nn.Parameter(
             torch.zeros(self.config.draft_vocab_size, dtype=torch.long),
             requires_grad=False,
         )
 
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: NestedTensors | None = None,
+        is_multimodal: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if inputs_embeds is not None:
-            raise NotImplementedError(
-                f"{type(self).__name__} does not support multimodal inputs yet."
-            )
-        return self.model(input_ids, positions, hidden_states)
+        return self.model(input_ids, positions, hidden_states, inputs_embeds)
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         if self.draft_id_to_target_id is None:
-            assert logits.shape[1] == self.config.vocab_size, \
-                "Expected logits to have shape " \
+            assert logits.shape[1] == self.config.vocab_size, (
+                "Expected logits to have shape "
                 f"(*, {self.config.vocab_size}), but got {logits.shape}"
+            )
             return logits
 
         base = torch.arange(self.config.draft_vocab_size, device=logits.device)
         targets = base + self.draft_id_to_target_id
-        logits_new = logits.new_full((
-            logits.shape[0],
-            self.config.vocab_size,
-        ), float('-inf'))
+        logits_new = logits.new_full(
+            (
+                logits.shape[0],
+                self.config.vocab_size,
+            ),
+            float("-inf"),
+        )
         logits_new[:, targets] = logits
         return logits_new
 
@@ -282,6 +352,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
             if "embed_tokens" in name:
                 includes_embed_tokens = True
             model_weights[name] = loaded_weight
+            process_eagle_weight(self, name)
 
         skip_substrs = []
         if not includes_draft_id_mapping:
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 4d8ed95b6cc8fc1b456d9ebfac39787a0ca19a3c..c1fb2d4f4af7d54058c922daded73de352661e8f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -3,47 +3,64 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar,
-                    Union)
+from typing import Annotated, Final, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
-                          PixtralVisionConfig, PretrainedConfig,
-                          SiglipVisionConfig)
+from transformers import (
+    BatchFeature,
+    CLIPVisionConfig,
+    LlavaConfig,
+    PixtralVisionConfig,
+    PretrainedConfig,
+    SiglipVisionConfig,
+)
 from transformers.models.llava import LlavaProcessor
 from transformers.models.pixtral import PixtralProcessor
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargsItems,
-                                    MultiModalUUIDDict)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        InputProcessingContext,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    InputProcessingContext,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
-from .vision import get_vision_encoder_info
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from .vision import get_num_selected_vision_tokens, get_vision_encoder_info
 
 
 class LlavaImagePixelInputs(TensorSchema):
@@ -53,10 +70,11 @@ class LlavaImagePixelInputs(TensorSchema):
         - c: Number of channels (3)
         - h: Height
         - w: Width
-    
+
     Note that `height` or `width` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
@@ -68,14 +86,16 @@ class PixtralHFImagePixelInputs(TensorSchema):
         - c: Number of channels
         - h: Height
         - w: Width
-    
+
     Note that `height` or `width` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
+
     type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral"
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"})]
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"}),
+    ]
 
 
 class LlavaImageEmbeddingInputs(TensorSchema):
@@ -85,36 +105,43 @@ class LlavaImageEmbeddingInputs(TensorSchema):
         - ifs: Image feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-LlavaImageInputs = Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs,
-                         LlavaImageEmbeddingInputs]
+LlavaImageInputs: TypeAlias = (
+    LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs
+)
 
 
 class LlavaMultiModalProjector(nn.Module):
-
-    def __init__(self,
-                 vision_hidden_size: int,
-                 text_hidden_size: int,
-                 projector_hidden_act: str,
-                 multimodal_projector_bias: bool,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        projector_hidden_act: str,
+        multimodal_projector_bias: bool,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
-        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
-                                             text_hidden_size,
-                                             bias=multimodal_projector_bias,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.linear_1")
+        self.linear_1 = ColumnParallelLinear(
+            vision_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_1",
+        )
         self.act = get_act_fn(projector_hidden_act)
-        self.linear_2 = RowParallelLinear(text_hidden_size,
-                                          text_hidden_size,
-                                          bias=multimodal_projector_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.linear_2")
+        self.linear_2 = RowParallelLinear(
+            text_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_2",
+        )
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.linear_1(image_features)
@@ -127,7 +154,7 @@ class LlavaLikeConfig(Protocol):
     vision_config: Final[PretrainedConfig]
     image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, list[int]]]
+    vision_feature_layer: Final[int | list[int]]
 
 
 class LlavaLikeProcessor(Protocol):
@@ -135,7 +162,6 @@ class LlavaLikeProcessor(Protocol):
 
 
 class BaseLlavaProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> LlavaLikeConfig:
         return self.ctx.get_hf_config(LlavaConfig)
 
@@ -146,22 +172,9 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
-    def _apply_feature_select_strategy(
-        self,
-        strategy: str,
-        encoder_num_image_tokens: int,
-    ) -> int:
-        if strategy == "default":
-            return encoder_num_image_tokens - 1
-        if strategy == "full":
-            return encoder_num_image_tokens
-
-        msg = f"Unexpected feature select strategy: {strategy!r}"
-        raise NotImplementedError(msg)
-
     def get_num_image_tokens(
         self,
         *,
@@ -171,12 +184,12 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
         hf_config = self.get_hf_config()
         vision_encoder_info = self.get_vision_encoder_info()
 
-        return self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
+        return get_num_selected_vision_tokens(
             vision_encoder_info.get_num_image_tokens(
                 image_width=image_width,
                 image_height=image_height,
             ),
+            hf_config.vision_feature_select_strategy,
         )
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -197,7 +210,6 @@ _I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
 
 
 class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -210,22 +222,25 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class LlavaProcessingInfo(BaseLlavaProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object):
         hf_processor = self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
         # In case patch_size is omitted from `processor_config.json`
@@ -237,7 +252,6 @@ class LlavaProcessingInfo(BaseLlavaProcessingInfo):
 
 
 class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
-
     # Copied from BaseMultiModalProcessor
     @abstractmethod
     def _get_mm_fields_config(
@@ -258,7 +272,8 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
         def get_replacement(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -280,9 +295,7 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
         ]
 
 
-class LlavaMultiModalProcessor(
-        BaseLlavaMultiModalProcessor[LlavaProcessingInfo]):
-
+class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor[LlavaProcessingInfo]):
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -295,14 +308,11 @@ class LlavaMultiModalProcessor(
 
 
 class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
 
 
-class PixtralHFMultiModalProcessor(
-        BaseMultiModalProcessor[PixtralHFProcessingInfo]):
-
+class PixtralHFMultiModalProcessor(BaseMultiModalProcessor[PixtralHFProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -382,7 +392,8 @@ class PixtralHFMultiModalProcessor(
 
 
 def _build_llava_or_pixtral_hf_info(
-    ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo:
+    ctx: InputProcessingContext,
+) -> BaseLlavaProcessingInfo:
     hf_config = ctx.get_hf_config(LlavaConfig)
 
     if isinstance(hf_config.vision_config, PixtralVisionConfig):
@@ -395,7 +406,7 @@ def _build_llava_or_pixtral_hf_processor(
     info: _I,
     dummy_inputs: BaseDummyInputsBuilder[_I],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     if isinstance(info, PixtralHFProcessingInfo):
         return PixtralHFMultiModalProcessor(
@@ -417,7 +428,7 @@ def _build_llava_or_pixtral_hf_processor(
 def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
     """Determine the number of hidden layers to initialize up to in the
     visual encoder.
-    
+
     Args:
         hf_config: Model config with vision feature layer(s).
     """
@@ -428,10 +439,10 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
         return _get_layer_index(feature_layers, num_hidden_layers)
     # If we have multiple feature layers, initialize up to the deepest one
     elif isinstance(feature_layers, (list, tuple)):
-        return max(
-            _get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
-    raise TypeError(f"vision_layer_feature type: {type(feature_layers)}"
-                    " is not supported")
+        return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
+    raise TypeError(
+        f"vision_layer_feature type: {type(feature_layers)} is not supported"
+    )
 
 
 def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
@@ -450,11 +461,11 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    require_post_norm: Optional[bool] = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
-) -> Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel]:
+) -> CLIPVisionModel | SiglipVisionModel | PixtralHFVisionModel:
     vision_config = hf_config.vision_config
 
     # Initialize the vision tower only up to the deepest required feature layer
@@ -489,14 +500,17 @@ def init_vision_tower_for_llava(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor,
-                                        info=_build_llava_or_pixtral_hf_info,
-                                        dummy_inputs=LlavaDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    _build_llava_or_pixtral_hf_processor,
+    info=_build_llava_or_pixtral_hf_info,
+    dummy_inputs=LlavaDummyInputsBuilder,
+)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     hf_to_vllm_mapper = WeightsMapper(
@@ -506,10 +520,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -527,11 +542,15 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
         # NOTE: These are special cases for Pixtral-12B in the HF-format
         # https://huggingface.co/mistral-community/pixtral-12b/blob/main/config.json  # noqa
-        if (config.text_config.architectures is None
-                and config.text_config.model_type == "mistral"):
+        if (
+            config.text_config.architectures is None
+            and config.text_config.model_type == "mistral"
+        ):
             config.text_config.architectures = ["MistralForCausalLM"]
-        if (config.projector_hidden_act is None
-                and config.vision_config.hidden_act == "gelu"):
+        if (
+            config.projector_hidden_act is None
+            and config.vision_config.hidden_act == "gelu"
+        ):
             config.projector_hidden_act = "gelu"
 
         # TODO: Optionally initializes this for supporting embeddings.
@@ -540,14 +559,16 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
                 config,
                 quant_config,
                 require_post_norm=False,
-                prefix=maybe_prefix(prefix, "vision_tower"))
+                prefix=maybe_prefix(prefix, "vision_tower"),
+            )
             self.multi_modal_projector = LlavaMultiModalProjector(
                 vision_hidden_size=config.vision_config.hidden_size,
                 text_hidden_size=config.text_config.hidden_size,
                 projector_hidden_act=config.projector_hidden_act,
                 multimodal_projector_bias=config.multimodal_projector_bias,
                 quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "multi_modal_projector"))
+                prefix=maybe_prefix(prefix, "multi_modal_projector"),
+            )
         else:
             self.vision_tower = None
             self.multi_modal_projector = None
@@ -559,10 +580,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[LlavaImageInputs]:
+        self, **kwargs: object
+    ) -> LlavaImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -570,74 +593,46 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             if self.config.vision_config.model_type == "pixtral":
                 return PixtralHFImagePixelInputs(
                     type="pixel_values_pixtral",
-                    pixel_values=flatten_bn(pixel_values),
+                    pixel_values=pixel_values,
                 )
 
             expected_h = expected_w = self.config.vision_config.image_size
             return LlavaImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values, concat=True),
-                resolve_bindings={
-                    "h": expected_h,
-                    "w": expected_w
-                },
+                pixel_values=pixel_values,
+                resolve_bindings={"h": expected_h, "w": expected_w},
             )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             if self.config.vision_config.model_type == "pixtral":
                 raise ValueError("Pixtral-HF does not support image_embeds.")
 
             return LlavaImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421  # noqa
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
-
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel,
-                            PixtralHFVisionModel],
-        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        vision_tower: CLIPVisionModel | SiglipVisionModel | PixtralHFVisionModel,
+        pixel_values: torch.Tensor | list[torch.Tensor],
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features: Union[torch.Tensor, tuple[torch.Tensor, ...]] = \
-            vision_tower(pixel_values)
-
-        def select_features(leaf: torch.Tensor):
-            return self._select_image_features(
-                leaf,
-                strategy=self.config.vision_feature_select_strategy,
-            )
-
-        return json_map_leaves(select_features, image_features)
+        return vision_tower(
+            pixel_values,
+            feature_select_strategy=self.config.vision_feature_select_strategy,
+        )
 
     def _process_image_pixels(
         self,
-        inputs: Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        inputs: LlavaImagePixelInputs | PixtralHFImagePixelInputs,
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -647,7 +642,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def _process_image_input(
         self,
         image_input: LlavaImageInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -657,9 +652,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         if isinstance(image_features, torch.Tensor):
             return self.multi_modal_projector(image_features)
 
-        feature_sizes = [
-            image_feature.shape[0] for image_feature in image_features
-        ]
+        feature_sizes = [image_feature.shape[0] for image_feature in image_features]
 
         image_embeds = self.multi_modal_projector(torch.cat(image_features))
         image_embeds = torch.split(image_embeds, feature_sizes)
@@ -668,38 +661,21 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LLaVA-1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -740,29 +716,19 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.vision_tower is None and self.multi_modal_projector is None:
             skip_prefixes.extend(["vision_tower.", "multi_modal_projector."])
@@ -772,7 +738,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
 
 class MantisProcessingInfo(LlavaProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object):
         hf_config = self.get_hf_config()
         vision_info = self.get_vision_encoder_info()
@@ -787,14 +752,13 @@ class MantisProcessingInfo(LlavaProcessingInfo):
 
 
 class MantisMultiModalProcessor(LlavaMultiModalProcessor):
-
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -805,11 +769,13 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
             image_height=-1,
         )
 
-        result = super().apply(prompt,
-                               mm_data,
-                               hf_processor_mm_kwargs,
-                               tokenization_kwargs,
-                               mm_uuids=mm_uuids)
+        result = super().apply(
+            prompt,
+            mm_data,
+            hf_processor_mm_kwargs,
+            tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
 
         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()
@@ -819,21 +785,26 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
         # We reimplement the functionality of MLlavaProcessor from
         # https://github.com/TIGER-AI-Lab/Mantis.git
         def get_replacement_mantis(item_idx: int):
-            return "".join([
-                f"(image {item_idx+1}: <Image>",  # 7 tokens
-                "<image>" * num_image_tokens,
-                "</Image>)",  # 3 tokens
-            ])
-
-        mantis_mm_repls = self._bind_and_group_updates([
-            PromptReplacement(
-                modality="image",
-                target=[image_token_id] * num_image_tokens,
-                replacement=get_replacement_mantis,
+            return "".join(
+                [
+                    f"(image {item_idx + 1}: <Image>",  # 7 tokens
+                    "<image>" * num_image_tokens,
+                    "</Image>)",  # 3 tokens
+                ]
             )
-        ], mm_item_counts)
 
-        prompt_ids, prompt, _ = self._apply_prompt_updates(
+        mantis_mm_repls = self._bind_and_group_updates(
+            [
+                PromptReplacement(
+                    modality="image",
+                    target=[image_token_id] * num_image_tokens,
+                    replacement=get_replacement_mantis,
+                )
+            ],
+            mm_item_counts,
+        )
+
+        prompt_ids, _ = self._apply_prompt_updates(
             result["prompt_token_ids"],
             mantis_mm_repls,
         )
@@ -853,7 +824,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
         return MultiModalInputs(
             type="multimodal",
-            prompt=prompt,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
             mm_hashes=mm_hashes,
@@ -863,8 +833,10 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
 # To use this model, please use
 # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
-@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor,
-                                        info=MantisProcessingInfo,
-                                        dummy_inputs=LlavaDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    MantisMultiModalProcessor,
+    info=MantisProcessingInfo,
+    dummy_inputs=LlavaDummyInputsBuilder,
+)
 class MantisForConditionalGeneration(LlavaForConditionalGeneration):
     pass
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c9133fde14552d689a200c37d60b2dc76245ceb2..98b1b46045c3dae3a62c46d267ddb1d8ffa150af 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -3,14 +3,15 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping
-from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar,
-                    Union)
+from typing import Annotated, Final, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 import torch.nn as nn
 from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor
 from transformers.models.llava_next.modeling_llava_next import (
-    get_anyres_image_grid_shape, unpad_image)
+    get_anyres_image_grid_shape,
+    unpad_image,
+)
 
 from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -21,12 +22,22 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo,
-                    LlavaDummyInputsBuilder, LlavaLikeConfig,
-                    LlavaMultiModalProjector, init_vision_tower_for_llava)
+from .llava import (
+    BaseLlavaMultiModalProcessor,
+    BaseLlavaProcessingInfo,
+    LlavaDummyInputsBuilder,
+    LlavaLikeConfig,
+    LlavaMultiModalProjector,
+    init_vision_tower_for_llava,
+)
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, embed_multimodal,
-                    flatten_bn, init_vllm_registered_model, maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from .vision import get_num_selected_vision_tokens
 
 
 class LlavaNextImagePixelInputs(TensorSchema):
@@ -37,16 +48,18 @@ class LlavaNextImagePixelInputs(TensorSchema):
         - c: Number of channels (3)
         - h: Height
         - w: Width
-    
+
     Note that `num_patches` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"})]
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"}),
+    ]
 
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
     # This should be in `(height, width)` format.
 
 
@@ -57,12 +70,14 @@ class LlavaNextImageEmbeddingInputs(TensorSchema):
         - ifs: Image feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
-                             LlavaNextImageEmbeddingInputs]
+LlavaNextImageInputs: TypeAlias = (
+    LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs
+)
 
 
 class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
@@ -70,7 +85,6 @@ class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
 
 
 class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
-
     def get_hf_config(self) -> LlavaNextLikeConfig:
         return self.ctx.get_hf_config(LlavaNextConfig)
 
@@ -95,12 +109,12 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
         hf_config = self.get_hf_config()
         vision_encoder_info = self.get_vision_encoder_info()
 
-        base_feature_size = self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
+        base_feature_size = get_num_selected_vision_tokens(
             vision_encoder_info.get_num_image_tokens(
                 image_width=image_width,
                 image_height=image_height,
             ),
+            hf_config.vision_feature_select_strategy,
         )
 
         num_patch_height, num_patch_width = get_anyres_image_grid_shape(
@@ -140,12 +154,14 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
 
         if aspect_ratio > current_aspect_ratio:
             new_height = int(
-                round(original_height * (current_width / original_width), 7))
+                round(original_height * (current_width / original_width), 7)
+            )
             padding = (current_height - new_height) // 2
             current_height = current_height - (2 * padding)
         else:
             new_width = int(
-                round(original_width * (current_height / original_height), 7))
+                round(original_width * (current_height / original_height), 7)
+            )
             padding = (current_width - new_width) // 2
             current_width = current_width - (2 * padding)
 
@@ -158,13 +174,13 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
         hf_config = self.get_hf_config()
 
         largest_feature_size, largest_feature_pinpoint = 0, None
-        for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self.get_num_image_tokens(image_width=width,
-                                                  image_height=height)
+        for height, width in hf_config.image_grid_pinpoints:
+            feat_size = self.get_num_image_tokens(
+                image_width=width, image_height=height
+            )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
+                largest_feature_pinpoint = ImageSize(width=width, height=height)
 
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
@@ -176,7 +192,6 @@ _I = TypeVar("_I", bound=LlavaNextProcessingInfo)
 
 
 class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):
-
     # Copied from BaseMultiModalProcessor
     @abstractmethod
     def _get_mm_fields_config(
@@ -188,8 +203,8 @@ class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):
 
 
 class LlavaNextMultiModalProcessor(
-        BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
-
+    BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]
+):
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -202,11 +217,13 @@ class LlavaNextMultiModalProcessor(
         )
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor,
-                                        info=LlavaNextProcessingInfo,
-                                        dummy_inputs=LlavaDummyInputsBuilder)
-class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP):
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaNextMultiModalProcessor,
+    info=LlavaNextProcessingInfo,
+    dummy_inputs=LlavaDummyInputsBuilder,
+)
+class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -216,10 +233,11 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.image_newline": "image_newline",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -235,16 +253,18 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Determine the layer up to which we will initialize the vision tower
         if isinstance(vision_feature_layer, int):
             vision_hidden_size = config.vision_config.hidden_size
-            self.feature_sample_layers = None
+            self.select_layers = None
         # Used for multimodal granite models to control encoder outputs
         elif isinstance(vision_feature_layer, (list, tuple)):
             vision_hidden_size = config.vision_config.hidden_size * len(
-                vision_feature_layer)
-            self.feature_sample_layers = vision_feature_layer
+                vision_feature_layer
+            )
+            self.select_layers = vision_feature_layer
         else:
             raise TypeError(
                 f"vision_layer_feature type: {type(vision_feature_layer)}"
-                " is not supported")
+                " is not supported"
+            )
 
         self.config = config
         self.multimodal_config = multimodal_config
@@ -254,14 +274,15 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             config,
             quant_config,
             require_post_norm=False,
-            prefix=maybe_prefix(prefix, "vision_tower"))
-        self.image_newline = nn.Parameter(
-            torch.empty(config.text_config.hidden_size))
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
+        self.image_newline = nn.Parameter(torch.empty(config.text_config.hidden_size))
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=vision_hidden_size,
             text_hidden_size=config.text_config.hidden_size,
             projector_hidden_act=config.projector_hidden_act,
-            multimodal_projector_bias=config.multimodal_projector_bias)
+            multimodal_projector_bias=config.multimodal_projector_bias,
+        )
 
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
@@ -270,10 +291,12 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[LlavaNextImageInputs]:
+        self, **kwargs: object
+    ) -> LlavaNextImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -282,78 +305,56 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            if not isinstance(image_sizes, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image sizes. "
-                                 f"Got type: {type(image_sizes)}")
-
             expected_h = expected_w = self.config.vision_config.image_size
             return LlavaNextImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values),
-                image_sizes=flatten_bn(image_sizes, concat=True),
+                pixel_values=pixel_values,
+                image_sizes=image_sizes,
                 resolve_bindings={
                     "h": expected_h,
                     "w": expected_w,
-                })
+                },
+            )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeds. "
-                                 f"Got type: {type(image_embeds)}")
-
             return LlavaNextImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421  # noqa
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
-
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(
-            pixel_values, feature_sample_layers=self.feature_sample_layers)
-
-        return self._select_image_features(
-            image_features,
-            strategy=self.config.vision_feature_select_strategy,
+        return vision_tower(
+            pixel_values,
+            select_layers=self.select_layers,
+            feature_select_strategy=self.config.vision_feature_select_strategy,
         )
 
     # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
-    def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
-                                      patch_embeddings: torch.Tensor, *,
-                                      strategy: str) -> torch.Tensor:
+    def _merge_image_patch_embeddings(
+        self, image_size: torch.Tensor, patch_embeddings: torch.Tensor, *, strategy: str
+    ) -> torch.Tensor:
         if strategy == "flat":
             return patch_embeddings.flatten(0, 1)
 
         if strategy.startswith("spatial"):
-            height = width = self.config.vision_config.image_size \
+            height = width = (
+                self.config.vision_config.image_size
                 // self.config.vision_config.patch_size
+            )
 
             base_patch_embeds = patch_embeddings[0]
             if height * width != base_patch_embeds.shape[0]:
                 raise ValueError(
-                    "The number of patches is not consistent with the "
-                    "image size.")
+                    "The number of patches is not consistent with the image size."
+                )
 
             if patch_embeddings.shape[0] > 1:
                 other_patch_embeds = patch_embeddings[1:]
@@ -370,37 +371,51 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
                 num_patches = num_patch_height * num_patch_width
 
                 # Image patches might be padded for batch processing
-                other_patch_embeds = other_patch_embeds[:num_patches] \
-                    .view(num_patch_height, num_patch_width, height, width, -1)
+                other_patch_embeds = other_patch_embeds[:num_patches].view(
+                    num_patch_height, num_patch_width, height, width, -1
+                )
 
                 if "unpad" in strategy:
-                    other_patch_embeds = other_patch_embeds \
-                        .permute(4, 0, 2, 1, 3).contiguous() \
-                        .flatten(1, 2).flatten(2, 3)
-                    other_patch_embeds = unpad_image(other_patch_embeds,
-                                                     (orig_height, orig_width))
-                    other_patch_embeds = torch.cat((
-                        other_patch_embeds,
-                        self.image_newline[:, None, None] \
-                            .expand(*other_patch_embeds.shape[:-1], 1) \
+                    other_patch_embeds = (
+                        other_patch_embeds.permute(4, 0, 2, 1, 3)
+                        .contiguous()
+                        .flatten(1, 2)
+                        .flatten(2, 3)
+                    )
+                    other_patch_embeds = unpad_image(
+                        other_patch_embeds, (orig_height, orig_width)
+                    )
+                    other_patch_embeds = torch.cat(
+                        (
+                            other_patch_embeds,
+                            self.image_newline[:, None, None]
+                            .expand(*other_patch_embeds.shape[:-1], 1)
                             .to(other_patch_embeds.device),
-                    ), dim=-1)
-                    other_patch_embeds = other_patch_embeds \
-                        .flatten(1, 2).transpose(0, 1)
+                        ),
+                        dim=-1,
+                    )
+                    other_patch_embeds = other_patch_embeds.flatten(1, 2).transpose(
+                        0, 1
+                    )
                 else:
-                    other_patch_embeds = other_patch_embeds \
-                        .permute(0, 2, 1, 3, 4).contiguous() \
+                    other_patch_embeds = (
+                        other_patch_embeds.permute(0, 2, 1, 3, 4)
+                        .contiguous()
                         .flatten(0, 3)
+                    )
 
                 merged_patch_embeddings = torch.cat(
-                    (base_patch_embeds, other_patch_embeds), dim=0)
+                    (base_patch_embeds, other_patch_embeds), dim=0
+                )
             else:
                 if "unpad" in strategy:
                     merged_patch_embeddings = torch.cat(
-                        (base_patch_embeds,
-                         self.image_newline[None] \
-                            .to(base_patch_embeds.device)
-                    ), dim=0)
+                        (
+                            base_patch_embeds,
+                            self.image_newline[None].to(base_patch_embeds.device),
+                        ),
+                        dim=0,
+                    )
                 else:
                     merged_patch_embeddings = base_patch_embeds
 
@@ -411,7 +426,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
     def _process_image_pixels(
         self,
         inputs: LlavaNextImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -420,25 +435,30 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             b, num_patches, c, h, w = pixel_values.shape
             stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
             stacked_image_features = self._image_pixels_to_features(
-                self.vision_tower, stacked_pixel_values)
+                self.vision_tower, stacked_pixel_values
+            )
             stacked_patch_embeddings = self.multi_modal_projector(
-                stacked_image_features)
+                stacked_image_features
+            )
 
             return stacked_patch_embeddings.view(
-                b, num_patches, *stacked_patch_embeddings.shape[1:])
+                b, num_patches, *stacked_patch_embeddings.shape[1:]
+            )
 
         num_patches_per_batch = [v.shape[0] for v in pixel_values]
         stacked_pixel_values = torch.cat(pixel_values)
         stacked_image_features = self._image_pixels_to_features(
-            self.vision_tower, stacked_pixel_values)
+            self.vision_tower, stacked_pixel_values
+        )
 
-        return torch.split(self.multi_modal_projector(stacked_image_features),
-                           num_patches_per_batch)
+        return torch.split(
+            self.multi_modal_projector(stacked_image_features), num_patches_per_batch
+        )
 
     def _process_image_input(
         self,
         image_input: LlavaNextImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return [image_input["data"]]
 
@@ -449,53 +469,55 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             batch_size = len(image_input["data"])
             vision_config = self.config.vision_config
             default_height = default_width = vision_config.image_size
-            image_sizes = torch.as_tensor([[default_height, default_width]
-                                           for _ in range(batch_size)])
+            image_sizes = torch.as_tensor(
+                [[default_height, default_width] for _ in range(batch_size)]
+            )
 
         return [
-            self._merge_image_patch_embeddings(image_sizes[i],
-                                               patch_features_batch,
-                                               strategy="spatial_unpad")
+            self._merge_image_patch_embeddings(
+                image_sizes[i], patch_features_batch, strategy="spatial_unpad"
+            )
             for i, patch_features_batch in enumerate(patch_embeddings)
         ]
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        # Multi-modal token ID may exceed vocab size
+        handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
 
-        if multimodal_embeddings is None \
-            or len(multimodal_embeddings) == 0:
-            return self.language_model.get_input_embeddings(input_ids)
-
-        inputs_embeds = embed_multimodal(
+        return super().embed_input_ids(
             input_ids,
-            self.config.image_token_index,
-            self.language_model.model.get_input_embeddings,
-            multimodal_embeddings,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
-        return inputs_embeds
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LlaVA-NeXT.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -545,27 +567,17 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 610fb188d57d2767bfdb048bfdaf5739a2967d17..902c598c226f0d8595557e2e65e33c135ae26dfe 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -3,61 +3,75 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, LlavaNextVideoConfig,
-                          LlavaNextVideoProcessor)
+from transformers import BatchFeature, LlavaNextVideoConfig, LlavaNextVideoProcessor
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageSize,
+    MultiModalDataItems,
+    VideoEmbeddingItems,
+    VideoProcessorItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .llava import init_vision_tower_for_llava
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from .vision import get_vision_encoder_info
 
 
 class LlavaNextVideoPixelInputs(TensorSchema):
-    """    
+    """
     Dimensions:
-        - bs: Batch size
-        - nv: Number of videos
-        - nf: Number of frames
-        - nc: Number of channels (3)
+        - bn: Batch size * number of videos
+        - f: Number of frames
+        - c: Number of channels (3)
         - h: Height of each frame
         - w: Width of each frame
 
-    Note that `num_frames` may be different for each batch, in which case
+    Note that `f` may be different for each batch, in which case
     the data is passed as a list instead of a batched tensor.
 
     Note that it only supports one video input for one batch.
     """
+
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("bs", "nv", "nf", 3, "h", "w")]
+    pixel_values_videos: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", "f", 3, "h", "w", dynamic_dims={"f"}),
+    ]
 
 
 class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(LlavaNextVideoConfig)
 
@@ -67,7 +81,7 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"video": 1}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -137,8 +151,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
 
 
 class LlavaNextVideoDummyInputsBuilder(
-        BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]):
-
+    BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]
+):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_videos = mm_counts.get("video", 0)
 
@@ -151,28 +165,31 @@ class LlavaNextVideoDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-            self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        video_overrides = mm_options.get("video") if mm_options else None
 
         return {
-            "video":
-            self._get_dummy_videos(
+            "video": self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
+                overrides=video_overrides,
             )
         }
 
 
 class LlavaNextVideoMultiModalProcessor(
-        BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]):
-
+    BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]
+):
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -191,7 +208,8 @@ class LlavaNextVideoMultiModalProcessor(
 
         def get_replacement(item_idx: int):
             videos = mm_items.get_items(
-                "video", (VideoEmbeddingItems, VideoProcessorItems))
+                "video", (VideoEmbeddingItems, VideoProcessorItems)
+            )
 
             if isinstance(videos, VideoEmbeddingItems):
                 num_video_tokens = videos.get_feature_size(item_idx)
@@ -216,7 +234,6 @@ class LlavaNextVideoMultiModalProcessor(
 
 # adopted from transformers modeling_llava_next_video.py
 class LlavaNextVideoPooler(nn.Module):
-
     def __init__(self, config: LlavaNextVideoConfig):
         super().__init__()
 
@@ -233,36 +250,41 @@ class LlavaNextVideoPooler(nn.Module):
         else:
             # TODO: Support Conv2d pooling layer, need to load weights
             raise ValueError(
-                f"Unknown pooling mode: {mode}. Expected [`average`, `max`]")
+                f"Unknown pooling mode: {mode}. Expected [`average`, `max`]"
+            )
 
     def forward(self, image_features: torch.Tensor):
         ori_width = int(
-            math.sqrt(image_features.shape[1] * self.image_size //
-                      self.image_size))
+            math.sqrt(image_features.shape[1] * self.image_size // self.image_size)
+        )
         ori_height = int(ori_width * self.image_size // self.image_size)
 
         batch_size, _, dim = image_features.shape
-        image_features_spatial = image_features \
-            .view(batch_size, ori_height, ori_height, dim) \
-            .permute(0, 3, 1, 2)
+        image_features_spatial = image_features.view(
+            batch_size, ori_height, ori_height, dim
+        ).permute(0, 3, 1, 2)
         image_features_spatial = self.pool(image_features_spatial)
 
         return image_features_spatial.flatten(2).transpose(1, 2).contiguous()
 
 
 class LlavaNextMultiModalProjector(nn.Module):
-
-    def __init__(self, vision_hidden_size: int, text_hidden_size: int,
-                 projector_hidden_act: str, multimodal_projector_bias: bool):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        projector_hidden_act: str,
+        multimodal_projector_bias: bool,
+    ):
         super().__init__()
 
-        self.linear_1 = nn.Linear(vision_hidden_size,
-                                  text_hidden_size,
-                                  bias=multimodal_projector_bias)
+        self.linear_1 = nn.Linear(
+            vision_hidden_size, text_hidden_size, bias=multimodal_projector_bias
+        )
         self.act = get_act_fn(projector_hidden_act)
-        self.linear_2 = nn.Linear(text_hidden_size,
-                                  text_hidden_size,
-                                  bias=multimodal_projector_bias)
+        self.linear_2 = nn.Linear(
+            text_hidden_size, text_hidden_size, bias=multimodal_projector_bias
+        )
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
@@ -276,8 +298,8 @@ class LlavaNextMultiModalProjector(nn.Module):
     info=LlavaNextVideoProcessingInfo,
     dummy_inputs=LlavaNextVideoDummyInputsBuilder,
 )
-class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                             SupportsPP):
+class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -287,10 +309,11 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.image_newline": "image_newline",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -312,13 +335,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
             config,
             quant_config,
             require_post_norm=False,
-            prefix=maybe_prefix(prefix, "vision_tower"))
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         self.vision_resampler = LlavaNextVideoPooler(config)
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
             projector_hidden_act=config.projector_hidden_act,
-            multimodal_projector_bias=config.multimodal_projector_bias)
+            multimodal_projector_bias=config.multimodal_projector_bias,
+        )
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
@@ -326,14 +351,16 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.model.make_empty_intermediate_tensors)
+            self.language_model.model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[LlavaNextVideoPixelInputs]:
+        self, **kwargs: object
+    ) -> LlavaNextVideoPixelInputs | None:
         """
         A legal video input should have the following dimensions:
         {
-            "pixel_values_videos" : 
+            "pixel_values_videos" :
                 list[b, Tensor(nb_frames, nb_channels, height, width)]
         }
         """
@@ -343,34 +370,25 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         expected_h = expected_w = self.config.vision_config.image_size
-        return LlavaNextVideoPixelInputs(type="pixel_values_videos",
-                                         data=pixel_values_videos,
-                                         resolve_bindings={
-                                             "h": expected_h,
-                                             "w": expected_w,
-                                         })
-
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+        return LlavaNextVideoPixelInputs(
+            type="pixel_values_videos",
+            pixel_values_videos=pixel_values_videos,
+            resolve_bindings={
+                "h": expected_h,
+                "w": expected_w,
+            },
+        )
 
     def _video_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values)
-        image_features = self._select_image_features(
-            image_features,
-            strategy=self.config.vision_feature_select_strategy,
+        image_features = vision_tower(
+            pixel_values,
+            feature_select_strategy=self.config.vision_feature_select_strategy,
         )
         image_features = self.vision_resampler(image_features)
         image_features = self.multi_modal_projector(image_features)
@@ -379,63 +397,46 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
     def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
         assert self.vision_tower is not None
 
-        video_pixels = inputs["data"]
+        video_pixels = inputs["pixel_values_videos"]
 
         if isinstance(video_pixels, torch.Tensor):
-            # TODO: support multiple videos per input
-            b, num_videos, num_frames, c, h, w = video_pixels.shape
-            assert (num_videos == 1)
-            stacked_pixels = video_pixels.view(b * num_videos * num_frames, c,
-                                               h, w)
+            bn, f, c, h, w = video_pixels.shape
+            stacked_pixels = video_pixels.view(bn * f, c, h, w)
             stacked_embeddings = self._video_pixels_to_features(
-                self.vision_tower, stacked_pixels)
-            embeds = stacked_embeddings.view(b, num_frames,
-                                             *stacked_embeddings.shape[1:])
+                self.vision_tower, stacked_pixels
+            )
+            embeds = stacked_embeddings.view(bn, f, *stacked_embeddings.shape[1:])
 
         elif is_list_of(video_pixels, torch.Tensor):
             frames_per_videos = [v.shape[0] for v in video_pixels]
             stacked_pixels = torch.cat(video_pixels, dim=0)
             stacked_embeddings = self._video_pixels_to_features(
-                self.vision_tower, stacked_pixels)
+                self.vision_tower, stacked_pixels
+            )
             embeds = torch.split(stacked_embeddings, frames_per_videos, dim=0)
         else:
-            raise ValueError(
-                f"Unsupported type of video input {type(video_pixels)}")
+            raise ValueError(f"Unsupported type of video input {type(video_pixels)}")
 
         return [e.flatten(0, 1) for e in embeds]
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         video_input = self._parse_and_validate_video_input(**kwargs)
         if video_input is None:
             return []
         vision_embeddings = self._process_video_pixels(video_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.video_token_index)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LlaVA-NeXT-Video.
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
@@ -445,29 +446,19 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
             # This model doesn't support images for now
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index cee9ddaf94cc4bbb1fcaf7450fd602be66fc1866..322bde94ff66d2716b939f29263f1f5688abfb23 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -3,22 +3,31 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Final, Literal, Optional, Protocol, Union
+from typing import Annotated, Final, Literal, Protocol, TypeAlias
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, LlavaOnevisionConfig,
-                          LlavaOnevisionProcessor)
+from transformers import BatchFeature, LlavaOnevisionConfig, LlavaOnevisionProcessor
 from transformers.models.llava_onevision.modeling_llava_onevision import (
-    get_anyres_image_grid_shape, unpad_image)
+    get_anyres_image_grid_shape,
+    unpad_image,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageSize,
+    MultiModalDataItems,
+    VideoEmbeddingItems,
+    VideoProcessorItems,
+)
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
@@ -26,12 +35,18 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava
-from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig,
-                         LlavaNextProcessingInfo)
+from .llava_next import (
+    BaseLlavaNextMultiModalProcessor,
+    LlavaNextLikeConfig,
+    LlavaNextProcessingInfo,
+)
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
@@ -46,14 +61,15 @@ class LlavaOnevisionVideoPixelInputs(TensorSchema):
         - h: Height
         - w: Width
 
-        Note that `num_videos` may be different for each batch, and 'num_frames'
+        Note that `f` may be different for each batch, and 'num_frames'
         may be different for each video, in which case the data is passed as a
         list instead of a batched tensor.
     """
+
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
     pixel_values_videos: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", 3, "h", "w", dynamic_dims={"f"}),
     ]
 
@@ -70,14 +86,15 @@ class LlavaOnevisionImagePixelInputs(TensorSchema):
         Note that `num_patches` may be different per batch and image,
         in which case the data is passed as a list instead of a batched tensor.
     """
+
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"}),
     ]
 
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
 
 
 class LlavaOnevisionImageEmbeddingInputs(TensorSchema):
@@ -87,6 +104,7 @@ class LlavaOnevisionImageEmbeddingInputs(TensorSchema):
         - ifs: Image feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
 
     data: Annotated[
@@ -95,11 +113,13 @@ class LlavaOnevisionImageEmbeddingInputs(TensorSchema):
     ]
 
 
-LlavaOnevisionImageInputs = Union[LlavaOnevisionImagePixelInputs,
-                                  LlavaOnevisionImageEmbeddingInputs]
+LlavaOnevisionImageInputs: TypeAlias = (
+    LlavaOnevisionImagePixelInputs | LlavaOnevisionImageEmbeddingInputs
+)
 
-LlavaOnevisionMultiInputs = Union[LlavaOnevisionImageInputs,
-                                  LlavaOnevisionVideoPixelInputs]
+LlavaOnevisionMultiInputs: TypeAlias = (
+    LlavaOnevisionImageInputs | LlavaOnevisionVideoPixelInputs
+)
 
 
 class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol):
@@ -107,14 +127,13 @@ class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol):
 
 
 class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
-
     def get_hf_config(self) -> LlavaOnevisionLikeConfig:
         return self.ctx.get_hf_config(LlavaOnevisionConfig)
 
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
@@ -136,12 +155,14 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
 
         if aspect_ratio > current_aspect_ratio:
             new_height = int(
-                round(original_height * (current_width / original_width), 7))
+                round(original_height * (current_width / original_width), 7)
+            )
             padding = (current_height - new_height) // 2
             current_height = current_height - (2 * padding)
         else:
             new_width = int(
-                round(original_width * (current_height / original_height), 7))
+                round(original_width * (current_height / original_height), 7)
+            )
             padding = (current_width - new_width) // 2
             current_width = current_width - (2 * padding)
 
@@ -218,8 +239,9 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
         max_videos = mm_counts.get("video", 0)
 
         max_total_frames = self._get_max_video_frames(seq_len)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+        max_frames_per_video = min(
+            max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO
+        )
 
         return max(max_frames_per_video, 1)
 
@@ -233,14 +255,13 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_num_frames_with_most_features(
-                seq_len, mm_counts),
+            num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
         )
 
 
 class LlavaOnevisionDummyInputsBuilder(
-        LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]):
-
+    LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]
+):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -255,34 +276,39 @@ class LlavaOnevisionDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-            self.info.get_num_frames_with_most_features(seq_len,
-                                                        mm_counts)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
-            )
+                overrides=video_overrides,
+            ),
         }
 
 
 class LlavaOnevisionMultiModalProcessor(
-        BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]):
-
+    BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]
+):
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -399,7 +425,8 @@ class LlavaOnevisionMultiModalProcessor(
 
         def get_video_replacement(item_idx: int):
             videos = mm_items.get_items(
-                "video", (VideoEmbeddingItems, VideoProcessorItems))
+                "video", (VideoEmbeddingItems, VideoProcessorItems)
+            )
 
             if isinstance(videos, VideoEmbeddingItems):
                 num_video_tokens = videos.get_feature_size(item_idx)
@@ -424,17 +451,20 @@ class LlavaOnevisionMultiModalProcessor(
 
 
 class LlavaOnevisionMultiModalProjector(nn.Module):
-
     def __init__(self, config: LlavaOnevisionConfig):
         super().__init__()
 
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size,
-                                  config.text_config.hidden_size,
-                                  bias=config.multimodal_projector_bias)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
         self.act = get_act_fn(config.projector_hidden_act)
-        self.linear_2 = nn.Linear(config.text_config.hidden_size,
-                                  config.text_config.hidden_size,
-                                  bias=config.multimodal_projector_bias)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
@@ -446,9 +476,10 @@ class LlavaOnevisionMultiModalProjector(nn.Module):
 @MULTIMODAL_REGISTRY.register_processor(
     LlavaOnevisionMultiModalProcessor,
     info=LlavaOnevisionProcessingInfo,
-    dummy_inputs=LlavaOnevisionDummyInputsBuilder)
-class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                             SupportsPP):
+    dummy_inputs=LlavaOnevisionDummyInputsBuilder,
+)
+class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -458,10 +489,11 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.image_newline": "image_newline",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -483,21 +515,23 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             config,
             quant_config,
             require_post_norm=False,
-            prefix=maybe_prefix(prefix, "vision_tower"))
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
             prefix=maybe_prefix(prefix, "language_model"),
         )
-        self.image_newline = nn.Parameter(
-            torch.empty(config.text_config.hidden_size))
+        self.image_newline = nn.Parameter(torch.empty(config.text_config.hidden_size))
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.model.make_empty_intermediate_tensors)
+            self.language_model.model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[LlavaOnevisionImageInputs]:
+        self, **kwargs: object
+    ) -> LlavaOnevisionImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -506,42 +540,31 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            if not isinstance(image_sizes, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image sizes. "
-                                 f"Got type: {type(image_sizes)}")
-
             return LlavaOnevisionImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values),
-                image_sizes=flatten_bn(image_sizes, concat=True),
+                pixel_values=pixel_values,
+                image_sizes=image_sizes,
                 resolve_bindings={
                     "h": self.config.vision_config.image_size,
-                    "w": self.config.vision_config.image_size
-                })
+                    "w": self.config.vision_config.image_size,
+                },
+            )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeds. "
-                                 f"Got type: {type(image_embeds)}")
-
             return LlavaOnevisionImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
     def _parse_and_validate_video_input(
-            self,
-            **kwargs: object) -> Optional[LlavaOnevisionVideoPixelInputs]:
+        self, **kwargs: object
+    ) -> LlavaOnevisionVideoPixelInputs | None:
         """
         A legal video input should have the following dimensions:
         {
-            "pixel_values_videos" : 
+            "pixel_values_videos" :
                 list[b, Tensor(nb_frames, nb_channels, height, width)]
         }
         """
@@ -549,17 +572,14 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         if pixel_values_videos is None:
             return None
 
-        if not isinstance(pixel_values_videos, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel_values_videos. "
-                             f"Got type: {type(pixel_values_videos)}")
-
         return LlavaOnevisionVideoPixelInputs(
             type="pixel_values_videos",
-            pixel_values_videos=flatten_bn(pixel_values_videos),
+            pixel_values_videos=pixel_values_videos,
             resolve_bindings={
                 "h": self.config.vision_config.image_size,
-                "w": self.config.vision_config.image_size
-            })
+                "w": self.config.vision_config.image_size,
+            },
+        )
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         mm_input_by_modality = {}
@@ -567,60 +587,59 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values", "image_embeds"
-                             ) and "image" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "image"] = self._parse_and_validate_image_input(**kwargs)
-            if input_key in ("pixel_values_videos", "video_embeds"
-                             ) and "video" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "video"] = self._parse_and_validate_video_input(**kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "video" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
+                    **kwargs
+                )
 
         return mm_input_by_modality
 
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
-
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values)
-        return self._select_image_features(
-            image_features,
-            strategy=self.config.vision_feature_select_strategy,
+        return vision_tower(
+            pixel_values,
+            feature_select_strategy=self.config.vision_feature_select_strategy,
         )
 
     # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
-    def _merge_image_patch_embeddings(self,
-                                      image_size: torch.Tensor,
-                                      patch_embeddings: torch.Tensor,
-                                      *,
-                                      image_newline=None,
-                                      vision_aspect_ratio="anyres_max_9",
-                                      strategy: str) -> torch.Tensor:
+    def _merge_image_patch_embeddings(
+        self,
+        image_size: torch.Tensor,
+        patch_embeddings: torch.Tensor,
+        *,
+        image_newline=None,
+        vision_aspect_ratio="anyres_max_9",
+        strategy: str,
+    ) -> torch.Tensor:
         if strategy == "flat":
             return patch_embeddings.flatten(0, 1)
 
         if strategy.startswith("spatial"):
-            height = width = self.config.vision_config.image_size \
+            height = width = (
+                self.config.vision_config.image_size
                 // self.config.vision_config.patch_size
+            )
 
             base_patch_embeds = patch_embeddings[0]
             if height * width != base_patch_embeds.shape[0]:
                 raise ValueError(
-                    "The number of patches is not consistent with the "
-                    "image size.")
+                    "The number of patches is not consistent with the image size."
+                )
 
             if patch_embeddings.shape[0] > 1:
                 other_patch_embeds = patch_embeddings[1:]
@@ -637,53 +656,66 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
                 num_patches = num_patch_height * num_patch_width
 
                 # Image patches might be padded for batch processing
-                other_patch_embeds = other_patch_embeds[:num_patches] \
-                    .view(num_patch_height, num_patch_width, height, width, -1)
+                other_patch_embeds = other_patch_embeds[:num_patches].view(
+                    num_patch_height, num_patch_width, height, width, -1
+                )
 
                 if "unpad" in strategy:
-                    other_patch_embeds = other_patch_embeds \
-                        .permute(4, 0, 2, 1, 3).contiguous() \
-                        .flatten(1, 2).flatten(2, 3)
-                    other_patch_embeds = unpad_image(other_patch_embeds,
-                                                     (orig_height, orig_width))
+                    other_patch_embeds = (
+                        other_patch_embeds.permute(4, 0, 2, 1, 3)
+                        .contiguous()
+                        .flatten(1, 2)
+                        .flatten(2, 3)
+                    )
+                    other_patch_embeds = unpad_image(
+                        other_patch_embeds, (orig_height, orig_width)
+                    )
                     max_num_patches = int(
-                        vision_aspect_ratio.removeprefix("anyres_max_"))
+                        vision_aspect_ratio.removeprefix("anyres_max_")
+                    )
                     channels, curr_height, curr_width = other_patch_embeds.shape
-                    ratio = math.sqrt(curr_height * curr_width /
-                                      (max_num_patches * height**2))
+                    ratio = math.sqrt(
+                        curr_height * curr_width / (max_num_patches * height**2)
+                    )
                     if ratio > 1.1:
                         other_patch_embeds = other_patch_embeds[None]
                         other_patch_embeds = nn.functional.interpolate(
-                            other_patch_embeds, [
-                                int(curr_height // ratio),
-                                int(curr_width // ratio)
-                            ],
-                            mode="bilinear")[0]
+                            other_patch_embeds,
+                            [int(curr_height // ratio), int(curr_width // ratio)],
+                            mode="bilinear",
+                        )[0]
                     if image_newline is not None:
                         other_patch_embeds = torch.cat(
                             (
                                 other_patch_embeds,
-                                image_newline[:, None, None] \
-                                .expand(*other_patch_embeds.shape[:-1], 1) \
+                                image_newline[:, None, None]
+                                .expand(*other_patch_embeds.shape[:-1], 1)
                                 .to(other_patch_embeds.device),
                             ),
-                        dim=-1)
-                    other_patch_embeds = other_patch_embeds \
-                        .flatten(1, 2).transpose(0, 1)
+                            dim=-1,
+                        )
+                    other_patch_embeds = other_patch_embeds.flatten(1, 2).transpose(
+                        0, 1
+                    )
                 else:
-                    other_patch_embeds = other_patch_embeds \
-                        .permute(0, 2, 1, 3, 4).contiguous() \
+                    other_patch_embeds = (
+                        other_patch_embeds.permute(0, 2, 1, 3, 4)
+                        .contiguous()
                         .flatten(0, 3)
+                    )
 
                 merged_patch_embeddings = torch.cat(
-                    (base_patch_embeds, other_patch_embeds), dim=0)
+                    (base_patch_embeds, other_patch_embeds), dim=0
+                )
             else:
                 if "unpad" in strategy:
                     merged_patch_embeddings = torch.cat(
-                        (base_patch_embeds,
-                         self.image_newline[None] \
-                            .to(base_patch_embeds.device)
-                    ), dim=0)
+                        (
+                            base_patch_embeds,
+                            self.image_newline[None].to(base_patch_embeds.device),
+                        ),
+                        dim=0,
+                    )
                 else:
                     merged_patch_embeddings = base_patch_embeds
 
@@ -694,7 +726,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
     def _process_image_pixels(
         self,
         inputs: LlavaOnevisionImagePixelInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -703,27 +735,33 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             b, num_patches, c, h, w = pixel_values.shape
             stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
             stacked_image_features = self._image_pixels_to_features(
-                self.vision_tower, stacked_pixel_values)
+                self.vision_tower, stacked_pixel_values
+            )
             stacked_patch_embeddings = self.multi_modal_projector(
-                stacked_image_features)
+                stacked_image_features
+            )
 
             return stacked_patch_embeddings.view(
-                b, num_patches, *stacked_patch_embeddings.shape[1:])
+                b, num_patches, *stacked_patch_embeddings.shape[1:]
+            )
 
         num_patches_per_batch = [v.shape[0] for v in pixel_values]
         stacked_pixel_values = torch.cat(pixel_values)
         stacked_image_features = self._image_pixels_to_features(
-            self.vision_tower, stacked_pixel_values)
+            self.vision_tower, stacked_pixel_values
+        )
 
         return [
-            self.multi_modal_projector(image_features) for image_features in
-            torch.split(stacked_image_features, num_patches_per_batch)
+            self.multi_modal_projector(image_features)
+            for image_features in torch.split(
+                stacked_image_features, num_patches_per_batch
+            )
         ]
 
     def _process_image_input(
         self,
         image_input: LlavaOnevisionImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return [image_input["data"]]
 
@@ -734,30 +772,30 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             batch_size = len(image_input["pixel_values"])
             vision_config = self.config.vision_config
             default_height = default_width = vision_config.image_size
-            image_sizes = torch.as_tensor([[default_height, default_width]
-                                           for _ in range(batch_size)])
+            image_sizes = torch.as_tensor(
+                [[default_height, default_width] for _ in range(batch_size)]
+            )
 
         return [
             self._merge_image_patch_embeddings(
                 image_sizes[i],
                 patch_features_batch,
                 image_newline=self.image_newline,
-                strategy="spatial_unpad")
+                strategy="spatial_unpad",
+            )
             for i, patch_features_batch in enumerate(patch_embeddings)
         ]
 
     def _video_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        video_features = vision_tower(pixel_values)
-        video_features = self._select_image_features(
-            video_features,
-            strategy=self.config.vision_feature_select_strategy,
+        video_features = vision_tower(
+            pixel_values,
+            feature_select_strategy=self.config.vision_feature_select_strategy,
         )
         video_features = self.multi_modal_projector(video_features)
         video_features = self.apply_pooling(video_features)
@@ -770,36 +808,39 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         if isinstance(video_pixels, torch.Tensor):
             total_videos, frames, c, h, w = video_pixels.shape
-            video_pixels_flat = video_pixels.view(total_videos * frames, c, h,
-                                                  w)
+            video_pixels_flat = video_pixels.view(total_videos * frames, c, h, w)
 
             embeddings_flat = self._video_pixels_to_features(
-                self.vision_tower, video_pixels_flat)
+                self.vision_tower, video_pixels_flat
+            )
 
             embeddings_flat = embeddings_flat.reshape(
-                total_videos, frames * embeddings_flat.shape[1], -1)
+                total_videos, frames * embeddings_flat.shape[1], -1
+            )
 
             image_newline = self.image_newline[None, None, :].expand(
-                total_videos, -1, -1)
+                total_videos, -1, -1
+            )
             return torch.cat((embeddings_flat, image_newline), dim=1)
 
         frames_per_video = [len(video) for video in video_pixels]
         video_pixels_flat = torch.cat(video_pixels)
 
         embeddings_flat = self._video_pixels_to_features(
-            self.vision_tower, video_pixels_flat)
+            self.vision_tower, video_pixels_flat
+        )
 
         image_newline = self.image_newline[None, None, :]
 
         return [
             torch.cat(
                 (
-                    embeds.reshape(1, num_frame * embeddings_flat.shape[1],
-                                   -1),
+                    embeds.reshape(1, num_frame * embeddings_flat.shape[1], -1),
                     image_newline,
                 ),
                 dim=1,
-            ) for num_frame, embeds in zip(
+            )
+            for num_frame, embeds in zip(
                 frames_per_video,
                 torch.split(embeddings_flat, frames_per_video),
             )
@@ -815,9 +856,9 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         # TODO support other pooling types config
         height, width = image_features.shape[2:]
         scaled_shape = [math.ceil(height / stride), math.ceil(width / stride)]
-        image_feature = nn.functional.interpolate(image_features,
-                                                  size=scaled_shape,
-                                                  mode='bilinear')
+        image_feature = nn.functional.interpolate(
+            image_features, size=scaled_shape, mode="bilinear"
+        )
         image_feature = image_feature.permute(0, 2, 3, 1)
         image_feature = image_feature.view(batch_frames, -1, dim)
         return image_feature
@@ -825,10 +866,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
-            **kwargs)
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return []
             return None
@@ -842,62 +881,22 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += tuple(vision_embeddings)
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_pixels(multimodal_input)
                 multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [self.config.image_token_index, self.config.video_token_index])
-        return inputs_embeds
-
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[LlavaOnevisionImagePixelInputs] = None,
-        video_input: Optional[LlavaOnevisionVideoPixelInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_index,
-            )
-
-        if video_input is not None:
-            video_embeds = self._process_video_pixels(video_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_index,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LlaVA-Onevision.
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
@@ -907,36 +906,18 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-                input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index 78e6e3d4b5352cd1bfb8cfe77c6948117e2cba3f..5de10e708683008ffbad2260a384156413820a21 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -32,9 +32,10 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 """Inference-only Flash model compatible with HuggingFace weights."""
+
 import typing
 from collections.abc import Callable, Iterable
-from typing import Optional, Union
+from itertools import islice
 
 import torch
 from torch import nn
@@ -47,29 +48,37 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.utils.int8_utils import (
-    block_dequant)
+from vllm.model_executor.layers.quantization.utils.int8_utils import block_dequant
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.deepseek_v2 import DeepseekV2MLAAttention
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class FlashConfig(PretrainedConfig):
     """Flash model configuration."""
+
     model_type = "longcat_flash"
     keys_to_ignore_at_inference = ["past_key_values"]
 
@@ -105,7 +114,7 @@ class FlashConfig(PretrainedConfig):
         attention_dropout=0.0,
         mla_scale_q_lora=False,
         mla_scale_kv_lora=False,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
         params_dtype="bfloat16",
         router_dtype="float32",
         router_bias=False,
@@ -121,7 +130,7 @@ class FlashConfig(PretrainedConfig):
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
             params_dtype=params_dtype,
             router_dtype=router_dtype,
             topk_method=topk_method,
@@ -132,8 +141,9 @@ class FlashConfig(PretrainedConfig):
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
-        self.num_hidden_layers = (num_hidden_layers if num_hidden_layers
-                                  is not None else num_layers)
+        self.num_hidden_layers = (
+            num_hidden_layers if num_hidden_layers is not None else num_layers
+        )
         self.num_attention_heads = num_attention_heads
         self.ep_size = ep_size
         self.kv_lora_rank = kv_lora_rank
@@ -162,8 +172,11 @@ class FlashConfig(PretrainedConfig):
         self.zero_expert_type = zero_expert_type
         self.routed_scaling_factor = routed_scaling_factor
         self.hidden_act = "silu"
-        self.intermediate_size = self.ffn_hidden_size if hasattr(
-            self, "ffn_hidden_size") else self.intermediate_size
+        self.intermediate_size = (
+            self.ffn_hidden_size
+            if hasattr(self, "ffn_hidden_size")
+            else self.intermediate_size
+        )
         if hasattr(self, "moe_intermediate_size"):
             self.moe_intermediate_size = self.moe_intermediate_size
         elif hasattr(self, "expert_ffn_hidden_size"):
@@ -180,7 +193,7 @@ class FlashMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -201,8 +214,9 @@ class FlashMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -216,15 +230,19 @@ class FlashMLP(nn.Module):
 
 
 class LongcatRouter(nn.Module):
-
-    def __init__(self,
-                 config,
-                 zero_expert_num=0,
-                 rounter_params_dtype=torch.bfloat16,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config,
+        zero_expert_num=0,
+        rounter_params_dtype=torch.bfloat16,
+        prefix: str = "",
+    ):
         super().__init__()
-        self.n_routed_experts = config.n_routed_experts if hasattr(
-            config, "n_routed_experts") else config.num_experts[0]
+        self.n_routed_experts = (
+            config.n_routed_experts
+            if hasattr(config, "n_routed_experts")
+            else config.num_experts[0]
+        )
         self.n_routed_experts = self.n_routed_experts + zero_expert_num
         self.classifier = ReplicatedLinear(
             config.hidden_size,
@@ -235,7 +253,8 @@ class LongcatRouter(nn.Module):
             prefix=f"{prefix}.classifier",
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.zeros((self.n_routed_experts), dtype=rounter_params_dtype))
+            torch.zeros((self.n_routed_experts), dtype=rounter_params_dtype)
+        )
 
     def forward(self, hidden_states):
         logits, _ = self.classifier(hidden_states)
@@ -243,7 +262,6 @@ class LongcatRouter(nn.Module):
 
 
 class LongcatMoe(nn.Module):
-
     def __init__(
         self,
         config: FlashConfig,
@@ -251,8 +269,8 @@ class LongcatMoe(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -271,7 +289,8 @@ class LongcatMoe(nn.Module):
             config=config,
             zero_expert_num=self.zero_expert_num,
             rounter_params_dtype=self.rounter_params_dtype,
-            prefix=f"{prefix}.gate")
+            prefix=f"{prefix}.gate",
+        )
 
         self.experts = FusedMoE(
             num_experts=num_experts,
@@ -291,14 +310,13 @@ class LongcatMoe(nn.Module):
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
-        router_logits = self.router(hidden_states.to(
-            self.rounter_params_dtype))
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        router_logits = self.router(hidden_states.to(self.rounter_params_dtype))
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
@@ -310,73 +328,82 @@ class FlashDecoderLayer(nn.Module):
         self,
         vllm_config: VllmConfig,
         config: FlashConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
         super().__init__()
-        self.layer_idx = int(prefix.split(sep='.')[-1])
+        self.layer_idx = int(prefix.split(sep=".")[-1])
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
+                config.original_max_position_embeddings
+            )
 
         # Dual attention structure
-        self.self_attn = nn.ModuleList([
-            DeepseekV2MLAAttention(
-                vllm_config=vllm_config,
-                config=config,
-                hidden_size=self.hidden_size,
-                num_heads=config.num_attention_heads,
-                qk_nope_head_dim=config.qk_nope_head_dim,
-                qk_rope_head_dim=config.qk_rope_head_dim,
-                v_head_dim=config.v_head_dim,
-                q_lora_rank=(config.q_lora_rank if hasattr(
-                    config, "q_lora_rank") else None),
-                kv_lora_rank=config.kv_lora_rank,
-                rope_theta=rope_theta,
-                rope_scaling=rope_scaling,
-                max_position_embeddings=max_position_embeddings,
-                cache_config=cache_config,
-                quant_config=None if "self_attn" in getattr(
-                    config, "disable_quant_module", []) else quant_config,
-                prefix=f"{prefix}.self_attn.{i}",
-            ) for i in range(2)
-        ])
-        self.input_layernorm = nn.ModuleList([
-            RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-            for i in range(2)
-        ])
-        self.post_attention_layernorm = nn.ModuleList([
-            RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-            for i in range(2)
-        ])
+        self.self_attn = nn.ModuleList(
+            [
+                DeepseekV2MLAAttention(
+                    vllm_config=vllm_config,
+                    config=config,
+                    hidden_size=self.hidden_size,
+                    num_heads=config.num_attention_heads,
+                    qk_nope_head_dim=config.qk_nope_head_dim,
+                    qk_rope_head_dim=config.qk_rope_head_dim,
+                    v_head_dim=config.v_head_dim,
+                    q_lora_rank=(
+                        config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+                    ),
+                    kv_lora_rank=config.kv_lora_rank,
+                    rope_theta=rope_theta,
+                    rope_scaling=rope_scaling,
+                    max_position_embeddings=max_position_embeddings,
+                    cache_config=cache_config,
+                    quant_config=None
+                    if "self_attn" in getattr(config, "disable_quant_module", [])
+                    else quant_config,
+                    prefix=f"{prefix}.self_attn.{i}",
+                )
+                for i in range(2)
+            ]
+        )
+        self.input_layernorm = nn.ModuleList(
+            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
+        )
+        self.post_attention_layernorm = nn.ModuleList(
+            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
+        )
 
         # Dual MLP structure
-        self.mlps = nn.ModuleList([
-            FlashMLP(
-                hidden_size=self.hidden_size,
-                intermediate_size=config.intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=None if "mlps" in getattr(
-                    config, "disable_quant_module", []) else quant_config,
-                prefix=f"{prefix}.mlps.{i}",
-            ) for i in range(2)
-        ])
+        self.mlps = nn.ModuleList(
+            [
+                FlashMLP(
+                    hidden_size=self.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=None
+                    if "mlps" in getattr(config, "disable_quant_module", [])
+                    else quant_config,
+                    prefix=f"{prefix}.mlps.{i}",
+                )
+                for i in range(2)
+            ]
+        )
 
         self.mlp = LongcatMoe(
             config=config,
-            num_experts=config.n_routed_experts if hasattr(
-                config, "n_routed_experts") else
-            config.num_experts[self.layer_idx],
+            num_experts=config.n_routed_experts
+            if hasattr(config, "n_routed_experts")
+            else config.num_experts[self.layer_idx],
             top_k=config.moe_topk
-            if hasattr(config, "moe_topk") else config.num_experts_per_tok,
+            if hasattr(config, "moe_topk")
+            else config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.moe_intermediate_size,
             quant_config=quant_config,
@@ -387,15 +414,13 @@ class FlashDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm[0](hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm[0](hidden_states,
-                                                              residual)
+            hidden_states, residual = self.input_layernorm[0](hidden_states, residual)
 
         hidden_states = self.self_attn[0](
             positions=positions,
@@ -403,7 +428,8 @@ class FlashDecoderLayer(nn.Module):
         )
 
         hidden_states, residual = self.post_attention_layernorm[0](
-            hidden_states, residual)
+            hidden_states, residual
+        )
 
         # moe
         hidden_states_copy = hidden_states.clone()
@@ -412,8 +438,7 @@ class FlashDecoderLayer(nn.Module):
         # first mlp
         hidden_states = self.mlps[0](hidden_states)
 
-        hidden_states, residual = self.input_layernorm[1](hidden_states,
-                                                          residual)
+        hidden_states, residual = self.input_layernorm[1](hidden_states, residual)
 
         # second_attn
         hidden_states = self.self_attn[1](
@@ -421,7 +446,8 @@ class FlashDecoderLayer(nn.Module):
             hidden_states=hidden_states,
         )
         hidden_states, residual = self.post_attention_layernorm[1](
-            hidden_states, residual)
+            hidden_states, residual
+        )
 
         # second_mlp
         hidden_states = self.mlps[1](hidden_states)
@@ -462,38 +488,38 @@ class FlashModel(nn.Module):
                 quant_config=quant_config,
                 prefix=prefix,
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
@@ -501,10 +527,9 @@ class FlashModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
@@ -529,48 +554,54 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        config.intermediate_size = config.ffn_hidden_size if hasattr(
-            config, "ffn_hidden_size") else config.intermediate_size
-        self.lora_config = lora_config
+        config.intermediate_size = (
+            config.ffn_hidden_size
+            if hasattr(config, "ffn_hidden_size")
+            else config.intermediate_size
+        )
+
         self.quant_config = quant_config
 
-        self.model = FlashModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = FlashModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -581,14 +612,12 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts if hasattr(
-                self.config, "n_routed_experts") else
-            self.config.num_experts[0],
+            num_experts=self.config.n_routed_experts
+            if hasattr(self.config, "n_routed_experts")
+            else self.config.num_experts[0],
         )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("fused_qkv_a_proj", "q_a_proj", 0),
             ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
@@ -610,8 +639,9 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if (name.endswith(".bias")
-                        or name.endswith("_bias")) and name not in params_dict:
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip mtp
                 if ".mtp." in name:
@@ -633,22 +663,25 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                     # Skip mtp
                     if ".mtp." in name_mapped:
                         continue
-                    if (name_mapped.endswith(".bias")
-                            or name_mapped.endswith("_bias")
-                        ) and name not in params_dict:
+                    if (
+                        name_mapped.endswith(".bias") or name_mapped.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name_mapped]
                     weight_loader = param.weight_loader
-                    weight_loader = typing.cast(Callable[..., bool],
-                                                param.weight_loader)
-                    success = weight_loader(param,
-                                            loaded_weight,
-                                            name_mapped,
-                                            shard_id=shard_id,
-                                            expert_id=expert_id,
-                                            return_success=True)
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
                     if success:
                         name = name_mapped
                         break
@@ -672,8 +705,9 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         for layer_id in range(self.config.num_hidden_layers):
@@ -681,35 +715,35 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 if isinstance(self.model.layers[layer_id], PPMissingLayer):
                     continue
                 self_attn = self.model.layers[layer_id].self_attn[i]
-                if hasattr(self.quant_config, "weight_block_size"
-                           ) and self_attn.kv_b_proj.weight.dtype in (
-                               torch.float8_e4m3fn,
-                               torch.float8_e4m3fnuz,
-                           ):
+                if hasattr(
+                    self.quant_config, "weight_block_size"
+                ) and self_attn.kv_b_proj.weight.dtype in (
+                    torch.float8_e4m3fn,
+                    torch.float8_e4m3fnuz,
+                ):
                     weight_block_size = self.quant_config.weight_block_size
                     if weight_block_size is not None:
                         assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                         dtype = torch.get_default_dtype()
-                        w = block_dequant(self_attn.kv_b_proj.weight,
-                                          self_attn.kv_b_proj.weight_scale_inv,
-                                          weight_block_size).to(dtype)
+                        w = block_dequant(
+                            self_attn.kv_b_proj.weight,
+                            self_attn.kv_b_proj.weight_scale_inv,
+                            weight_block_size,
+                        ).to(dtype)
                 else:
                     w = self_attn.kv_b_proj.weight
 
                 w_kc, w_vc = w.unflatten(
-                    0,
-                    (-1,
-                     self_attn.qk_nope_head_dim + self_attn.v_head_dim)).split(
-                         [self_attn.qk_nope_head_dim, self_attn.v_head_dim],
-                         dim=1)
-                self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(
-                    1, 2)
+                    0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+                ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+                self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
                 self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
                 if self.config.mla_scale_q_lora:
                     self_attn.q_a_layernorm.weight.data *= (
-                        self.config.hidden_size / self.config.q_lora_rank)**0.5
+                        self.config.hidden_size / self.config.q_lora_rank
+                    ) ** 0.5
                 if self.config.mla_scale_kv_lora:
                     self_attn.kv_a_layernorm.weight.data *= (
-                        self.config.hidden_size /
-                        self.config.kv_lora_rank)**0.5
+                        self.config.hidden_size / self.config.kv_lora_rank
+                    ) ** 0.5
         return loaded_params
diff --git a/vllm/model_executor/models/longcat_flash_mtp.py b/vllm/model_executor/models/longcat_flash_mtp.py
index eebc2ee1559796c4642159e3217ad4cd8920f286..e554d1e2de9274d1b19b6bbb076fa2374373baff 100644
--- a/vllm/model_executor/models/longcat_flash_mtp.py
+++ b/vllm/model_executor/models/longcat_flash_mtp.py
@@ -4,7 +4,6 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -15,10 +14,11 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.utils.int8_utils import (
-    block_dequant)
+from vllm.model_executor.layers.quantization.utils.int8_utils import block_dequant
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.longcat_flash import FlashConfig
 from vllm.sequence import IntermediateTensors
@@ -29,32 +29,32 @@ from .utils import maybe_prefix
 
 
 class LongCatMultiTokenPredictorLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
         prefix: str,
         vllm_config: VllmConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.eh_proj = ReplicatedLinear(2 * config.hidden_size,
-                                        config.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix="eh_proj")
+        self.eh_proj = ReplicatedLinear(
+            2 * config.hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix="eh_proj",
+        )
         self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix)
-        self.final_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
         assert inputs_embeds is not None
@@ -62,39 +62,43 @@ class LongCatMultiTokenPredictorLayer(nn.Module):
         previous_hidden_states = self.hnorm(previous_hidden_states)
 
         hidden_states, _ = self.eh_proj(
-            torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
+            torch.cat([inputs_embeds, previous_hidden_states], dim=-1)
+        )
 
-        hidden_states, residual = self.mtp_block(positions=positions,
-                                                 hidden_states=hidden_states,
-                                                 residual=None)
+        hidden_states, residual = self.mtp_block(
+            positions=positions, hidden_states=hidden_states, residual=None
+        )
         hidden_states, _ = self.final_layernorm(hidden_states, residual)
         return hidden_states
 
 
 class LongCatMultiTokenPredictor(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
-        vllm_config.model_config.hf_config.intermediate_size \
-                                                = config.intermediate_size
+        vllm_config.model_config.hf_config.intermediate_size = config.intermediate_size
         self.mtp_start_layer_idx = config.num_hidden_layers * 2
         self.num_mtp_layers = 1
-        self.layers = torch.nn.ModuleDict({
-            str(idx):
-            LongCatMultiTokenPredictorLayer(
-                config,
-                prefix=f"{prefix}.layers.{idx}",
-                vllm_config=vllm_config,
-                quant_config=quant_config,
-            )
-            for idx in range(self.mtp_start_layer_idx,
-                             self.mtp_start_layer_idx + self.num_mtp_layers)
-        })
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): LongCatMultiTokenPredictorLayer(
+                    config,
+                    prefix=f"{prefix}.layers.{idx}",
+                    vllm_config=vllm_config,
+                    quant_config=quant_config,
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
@@ -105,12 +109,12 @@ class LongCatMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        current_step_idx = spec_step_idx % self.num_mtp_layers
         return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
             input_ids,
             positions,
@@ -121,25 +125,27 @@ class LongCatMultiTokenPredictor(nn.Module):
 
 
 class LongCatFlashMTP(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         # LongCat MTP without MoE layers
         vllm_config.model_config.hf_config.n_routed_experts = None
-        self.config = FlashConfig(
-            **vllm_config.model_config.hf_config.__dict__)
-        self.quant_config = None if "mtp" in getattr(
-            self.config, "disable_quant_module",
-            []) else vllm_config.quant_config
+        self.config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
+        self.quant_config = (
+            None
+            if "mtp" in getattr(self.config, "disable_quant_module", [])
+            else vllm_config.quant_config
+        )
 
-        self.model = LongCatMultiTokenPredictor(vllm_config=vllm_config,
-                                                quant_config=self.quant_config,
-                                                prefix=maybe_prefix(
-                                                    prefix, "model"))
+        self.model = LongCatMultiTokenPredictor(
+            vllm_config=vllm_config,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
         self.lm_head = ParallelLMHead(
             self.config.vocab_size,
             self.config.hidden_size,
             quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
         )
         self.logits_processor = LogitsProcessor(self.config.vocab_size)
 
@@ -148,25 +154,24 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, hidden_states,
-                                   inputs_embeds, spec_step_idx)
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
-
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
@@ -175,53 +180,31 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
         ]
 
         new_to_old_names_mapping = {
-            "model.mtp.embed_tokens.weight":
-            "model.layers.0.embed_tokens.weight",
+            "model.mtp.embed_tokens.weight": "model.layers.0.embed_tokens.weight",
             "model.mtp.layers.0.eh_proj.weight": "eh_proj.weight",
-            "model.mtp.layers.0.eh_proj.weight_scale_inv":
-            "eh_proj.weight_scale_inv",
+            "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
             "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
             "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
-            "model.mtp.layers.0.input_layernorm.weight":
-            "model.layers.0.input_layernorm.weight",
-            "model.mtp.layers.0.post_attention_layernorm.weight":
-            "model.layers.0.post_attention_layernorm.weight",
-            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight":
-            "model.layers.0.self_attn.kv_a_layernorm.weight",
-            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight":
-            "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
-            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv":
-            "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.kv_b_proj.weight":
-            "model.layers.0.self_attn.kv_b_proj.weight",
-            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv":
-            "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.o_proj.weight":
-            "model.layers.0.self_attn.o_proj.weight",
-            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv":
-            "model.layers.0.self_attn.o_proj.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.q_a_layernorm.weight":
-            "model.layers.0.self_attn.q_a_layernorm.weight",
-            "model.mtp.layers.0.self_attn.q_a_proj.weight":
-            "model.layers.0.self_attn.q_a_proj.weight",
-            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv":
-            "model.layers.0.self_attn.q_a_proj.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.q_b_proj.weight":
-            "model.layers.0.self_attn.q_b_proj.weight",
-            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv":
-            "model.layers.0.self_attn.q_b_proj.weight_scale_inv",
-            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight":
-            "model.layers.0.mlp.down_proj.weight",
-            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv":
-            "model.layers.0.mlp.down_proj.weight_scale_inv",
-            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight":
-            "model.layers.0.mlp.gate_proj.weight",
-            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv":
-            "model.layers.0.mlp.gate_proj.weight_scale_inv",
-            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight":
-            "model.layers.0.mlp.up_proj.weight",
-            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv":
-            "model.layers.0.mlp.up_proj.weight_scale_inv",
+            "model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv",  # noqa: E501
             "model.mtp.norm.weight": "final_layernorm.weight",
         }
 
@@ -230,13 +213,13 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            spec_layer = self.get_spec_layer_idx_from_weight_name(
-                self.config, name)
+            spec_layer = self.get_spec_layer_idx_from_weight_name(self.config, name)
             if spec_layer is None:
                 continue
-            name = self._rewrite_spec_layer_name(spec_layer, name,
-                                                 new_to_old_names_mapping)
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            name = self._rewrite_spec_layer_name(
+                spec_layer, name, new_to_old_names_mapping
+            )
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -246,14 +229,13 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
 
                 # QKV fusion is optional, fall back to normal
                 # weight loading if it's not enabled
-                if ((param_name == "fused_qkv_a_proj")
-                        and name not in params_dict):
+                if (param_name == "fused_qkv_a_proj") and name not in params_dict:
                     continue
 
                 # Skip loading extra bias for GPTQ models.
@@ -271,48 +253,54 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
 
                 # According to DeepSeek-V3 Technical Report, MTP modules
                 # shares embedding layer. We only load the first weights.
-                if (spec_layer != self.model.mtp_start_layer_idx
-                        and ".layers" not in name):
+                if (
+                    spec_layer != self.model.mtp_start_layer_idx
+                    and ".layers" not in name
+                ):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         spec_layer_id = self.config.num_hidden_layers * 2
         self_attn = self.model.layers[str(spec_layer_id)].mtp_block.self_attn
         if hasattr(
-                self.quant_config,
-                "weight_block_size") and self_attn.kv_b_proj.weight.dtype in (
-                    torch.float8_e4m3fn,
-                    torch.float8_e4m3fnuz,
-                ):
+            self.quant_config, "weight_block_size"
+        ) and self_attn.kv_b_proj.weight.dtype in (
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ):
             weight_block_size = self.quant_config.weight_block_size
             if weight_block_size is not None:
                 dtype = torch.get_default_dtype()
-                w = block_dequant(self_attn.kv_b_proj.weight,
-                                  self_attn.kv_b_proj.weight_scale_inv,
-                                  weight_block_size).to(dtype)
+                w = block_dequant(
+                    self_attn.kv_b_proj.weight,
+                    self_attn.kv_b_proj.weight_scale_inv,
+                    weight_block_size,
+                ).to(dtype)
             else:
                 w = self_attn.kv_b_proj.weight
         else:
             w = self_attn.kv_b_proj.weight
         w_kc, w_vc = w.unflatten(
-            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)).split(
-                [self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+        ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
         self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
         self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
         if self.config.mla_scale_q_lora:
             self_attn.q_a_layernorm.weight.data *= (
-                self.config.hidden_size / self.config.q_lora_rank)**0.5
+                self.config.hidden_size / self.config.q_lora_rank
+            ) ** 0.5
         if self.config.mla_scale_kv_lora:
             self_attn.kv_a_layernorm.weight.data *= (
-                self.config.hidden_size / self.config.kv_lora_rank)**0.5
+                self.config.hidden_size / self.config.kv_lora_rank
+            ) ** 0.5
         return loaded_params
 
-    def _rewrite_spec_layer_name(self, spec_layer: int, name: str,
-                                 new_to_old_names_mapping: dict) -> str:
+    def _rewrite_spec_layer_name(
+        self, spec_layer: int, name: str, new_to_old_names_mapping: dict
+    ) -> str:
         """
         Rewrite the weight name to match the format of the original model.
         Add .mtp_block for modules in transformer layer block for spec layer
@@ -321,11 +309,18 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
         if name in new_to_old_names_mapping:
             name = new_to_old_names_mapping[name]
         spec_layer_weight_names = [
-            "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head"
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
         ]
-        if name.startswith("enorm") or name.startswith(
-                "hnorm") or name.startswith("eh_proj") or name.startswith(
-                    "final_layernorm"):
+        if (
+            name.startswith("enorm")
+            or name.startswith("hnorm")
+            or name.startswith("eh_proj")
+            or name.startswith("final_layernorm")
+        ):
             name = "model.layers." + str(spec_layer) + "." + name
         shared_weight_names = ["embed_tokens"]
         spec_layer_weight = False
@@ -338,15 +333,17 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
                 break
         if not spec_layer_weight:
             # treat rest weights as weights for transformer layer block
-            name = name.replace("model.layers.0.",
-                                f"model.layers.{spec_layer}.mtp_block.")
+            name = name.replace(
+                "model.layers.0.", f"model.layers.{spec_layer}.mtp_block."
+            )
         elif shared_weight:
             # treat shared weights as top level weights
             name = name.replace("model.layers.0.", "model.")
         return name
 
-    def get_spec_layer_idx_from_weight_name(self, config: PretrainedConfig,
-                                            weight_name: str) -> Optional[int]:
+    def get_spec_layer_idx_from_weight_name(
+        self, config: PretrainedConfig, weight_name: str
+    ) -> int | None:
         if "model.mtp" in weight_name:
             return config.num_hidden_layers * 2
         return None
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 5bd268291c7d926afc9b99346a363954a6db01ae..aa16640a9427646ac45bf1ef0eab64a0fab0fbf6 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch MAMBA model."""
+
 from collections.abc import Iterable
-from typing import Optional
+from itertools import islice
 
 import torch
 from torch import nn
@@ -15,59 +16,73 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState,
-                                                   IsAttentionFree, SupportsPP)
+from vllm.model_executor.models.interfaces import (
+    HasInnerState,
+    IsAttentionFree,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+)
 from vllm.sequence import IntermediateTensors
 
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 KVCache = tuple[torch.Tensor, torch.Tensor]
 
 
 class MambaDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: MambaConfig,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 is_lora_enabled: Optional[bool] = False,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: MambaConfig,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        is_lora_enabled: bool | None = False,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.is_falcon_mamba = config.model_type == "falcon_mamba"
         self.is_lora_enabled = is_lora_enabled
         mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None
-        self.mixer = MambaMixer(hidden_size=config.hidden_size,
-                                ssm_state_size=config.state_size,
-                                conv_kernel_size=config.conv_kernel,
-                                intermediate_size=config.intermediate_size,
-                                time_step_rank=config.time_step_rank,
-                                use_conv_bias=config.use_conv_bias,
-                                use_bias=config.use_bias,
-                                use_rms_norm=self.is_falcon_mamba,
-                                rms_norm_has_weight=not self.is_falcon_mamba,
-                                rms_norm_eps=mixer_rms_eps,
-                                activation=config.hidden_act,
-                                is_lora_enabled=self.is_lora_enabled,
-                                model_config=model_config,
-                                cache_config=cache_config,
-                                prefix=f"{prefix}.mixer")
+        self.mixer = MambaMixer(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.state_size,
+            conv_kernel_size=config.conv_kernel,
+            intermediate_size=config.intermediate_size,
+            time_step_rank=config.time_step_rank,
+            use_conv_bias=config.use_conv_bias,
+            use_bias=config.use_bias,
+            use_rms_norm=self.is_falcon_mamba,
+            rms_norm_has_weight=not self.is_falcon_mamba,
+            rms_norm_eps=mixer_rms_eps,
+            activation=config.hidden_act,
+            is_lora_enabled=self.is_lora_enabled,
+            model_config=model_config,
+            cache_config=cache_config,
+            prefix=f"{prefix}.mixer",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -83,7 +98,6 @@ class MambaDecoderLayer(nn.Module):
 
 @support_torch_compile
 class MambaModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -95,70 +109,66 @@ class MambaModel(nn.Module):
         is_lora_enabled = bool(lora_config)
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embeddings = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: MambaDecoderLayer(config,
-                                             model_config=model_config,
-                                             cache_config=cache_config,
-                                             quant_config=quant_config,
-                                             is_lora_enabled=is_lora_enabled,
-                                             prefix=prefix),
-            prefix=f"{prefix}.layers")
-
-        self.norm_f = RMSNorm(config.hidden_size,
-                              eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            lambda prefix: MambaDecoderLayer(
+                config,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                is_lora_enabled=is_lora_enabled,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
-            hidden_states, residual = layer(positions=positions,
-                                            hidden_states=hidden_states,
-                                            residual=residual)
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(
+                positions=positions, hidden_states=hidden_states, residual=residual
+            )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm_f(hidden_states, residual)
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -171,64 +181,57 @@ class MambaModel(nn.Module):
                 continue
 
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
-
+class MambaForCausalLM(
+    nn.Module, HasInnerState, IsAttentionFree, SupportsPP, SupportsMambaPrefixCaching
+):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         self.scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "Mamba does not support prefix caching"
 
         super().__init__()
         self.config = config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
-        self.backbone = MambaModel(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "backbone"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.backbone = MambaModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone")
+        )
+
         if config.tie_word_embeddings:
             self.lm_head = self.backbone.embeddings
         else:
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.backbone.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.backbone.get_input_embeddings(input_ids)
+            self.backbone.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.backbone.embed_input_ids(input_ids)
 
-        hidden_states = self.backbone(input_ids, positions,
-                                      intermediate_tensors, inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.backbone(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
@@ -237,7 +240,6 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba1_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -256,11 +258,11 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
             tp_world_size=parallel_config.tensor_parallel_size,
             intermediate_size=hf_config.intermediate_size,
             state_size=hf_config.state_size,
-            conv_kernel=hf_config.conv_kernel)
+            conv_kernel=hf_config.conv_kernel,
+        )
 
     def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
-        return self.mamba_cache.copy_inputs_before_cuda_graphs(
-            input_buffers, **kwargs)
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(input_buffers, **kwargs)
 
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
@@ -269,7 +271,6 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 97e9c5785e726d906099aa447b679b3402cb0e6c..fc17f98be19862e069332c25a1787aadadccc46a 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch MAMBA2 model."""
+
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -15,57 +15,70 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState,
-                                                   IsAttentionFree)
+from vllm.model_executor.models.interfaces import (
+    HasInnerState,
+    IsAttentionFree,
+    SupportsMambaPrefixCaching,
+)
 from vllm.sequence import IntermediateTensors
 
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 KVCache = tuple[torch.Tensor, torch.Tensor]
 
 
 class Mamba2DecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: MambaConfig,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: MambaConfig,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
-        self.mixer = MambaMixer2(hidden_size=config.hidden_size,
-                                 ssm_state_size=config.state_size,
-                                 conv_kernel_size=config.conv_kernel,
-                                 intermediate_size=getattr(
-                                     config, "intermediate_size",
-                                     config.expand * config.hidden_size),
-                                 use_conv_bias=config.use_conv_bias,
-                                 use_bias=config.use_bias,
-                                 n_groups=config.n_groups,
-                                 num_heads=config.num_heads,
-                                 head_dim=config.head_dim,
-                                 rms_norm_eps=config.layer_norm_epsilon,
-                                 activation=config.hidden_act,
-                                 model_config=model_config,
-                                 cache_config=cache_config,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.mixer")
+        self.mixer = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.state_size,
+            conv_kernel_size=config.conv_kernel,
+            intermediate_size=getattr(
+                config, "intermediate_size", config.expand * config.hidden_size
+            ),
+            use_conv_bias=config.use_conv_bias,
+            use_bias=config.use_bias,
+            n_groups=config.n_groups,
+            num_heads=config.num_heads,
+            head_dim=config.head_dim,
+            rms_norm_eps=config.layer_norm_epsilon,
+            activation=config.hidden_act,
+            model_config=model_config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -81,7 +94,6 @@ class Mamba2DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Mamba2Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -94,47 +106,46 @@ class Mamba2Model(nn.Module):
         assert not is_lora_enabled
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embeddings = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: Mamba2DecoderLayer(config,
-                                              model_config=model_config,
-                                              cache_config=cache_config,
-                                              quant_config=quant_config,
-                                              prefix=prefix),
-            prefix=f"{prefix}.layers")
-
-        self.norm_f = RMSNorm(config.hidden_size,
-                              eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            lambda prefix: Mamba2DecoderLayer(
+                config,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -142,22 +153,20 @@ class Mamba2Model(nn.Module):
             residual = intermediate_tensors["residual"]
 
         for i, layer in enumerate(self.layers):
-            hidden_states, residual = layer(positions=positions,
-                                            hidden_states=hidden_states,
-                                            residual=residual)
+            hidden_states, residual = layer(
+                positions=positions, hidden_states=hidden_states, residual=residual
+            )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm_f(hidden_states, residual)
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -171,21 +180,20 @@ class Mamba2Model(nn.Module):
                 continue
 
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
-
+class Mamba2ForCausalLM(
+    nn.Module, HasInnerState, IsAttentionFree, SupportsMambaPrefixCaching
+):
     @classmethod
     def get_mamba_state_dtype_from_config(
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -223,60 +231,51 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "Mamba does not support prefix caching"
 
         super().__init__()
         self.config = config
         self.vllm_config = vllm_config
         self.scheduler_config = scheduler_config
         self.model_config = vllm_config.model_config
-        self.backbone = Mamba2Model(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "backbone"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.backbone = Mamba2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone")
+        )
 
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         if config.tie_word_embeddings:
             self.lm_head = self.lm_head.tie_weights(self.backbone.embeddings)
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.backbone.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.backbone.get_input_embeddings(input_ids)
+            self.backbone.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.backbone.embed_input_ids(input_ids)
 
-        hidden_states = self.backbone(input_ids, positions,
-                                      intermediate_tensors, inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.backbone(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
     def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
-        return self.mamba_cache.copy_inputs_before_cuda_graphs(
-            input_buffers, **kwargs)
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(input_buffers, **kwargs)
 
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
@@ -285,7 +284,6 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
index 0ae59dc8dfc23cb75560cdd3261de14730eabcfa..fd7fc2c73f16ea94d56caad97f769968ea5ed987 100644
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -9,24 +9,27 @@ import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
+    ParallelLMHead,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from .utils import maybe_prefix
 
 
 class ResidualBlock(nn.Module):
-
-    def __init__(self, config: VllmConfig, hidden_size: int,
-                 num_layers: int) -> None:
+    def __init__(self, config: VllmConfig, hidden_size: int, num_layers: int) -> None:
         super().__init__()
 
-        self.layers = nn.ModuleList([
-            nn.Linear(hidden_size,
-                      hidden_size,
-                      bias=getattr(config, "medusa_fc_bias", False))
-            for _ in range(num_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                nn.Linear(
+                    hidden_size,
+                    hidden_size,
+                    bias=getattr(config, "medusa_fc_bias", False),
+                )
+                for _ in range(num_layers)
+            ]
+        )
         self.act = nn.SiLU()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -38,13 +41,13 @@ class ResidualBlock(nn.Module):
 class Medusa(nn.Module):
     """This class implements the Medusa draft model from the paper: https://arxiv.org/abs/2401.10774
     Reference implementation: https://github.com/FasterDecoding/Medusa
-    
+
     Differences from reference implementation:
     1. Currently this only supports generating proposals from top-1 tokens.
-    2. We have an optional token_map which reduces draft vocab to most 
-       frequently used tokens to give some additional speed-up by reducing 
-       sampling overhead. This is disabled unless the checkpoint file has 
-       explicit token_map tensor and config has an optional attribute 
+    2. We have an optional token_map which reduces draft vocab to most
+       frequently used tokens to give some additional speed-up by reducing
+       sampling overhead. This is disabled unless the checkpoint file has
+       explicit token_map tensor and config has an optional attribute
        truncated_vocab_size < vocab_size. To use this technique, one has to find
        the top-k most frequent tokens in target dataset and add that as a tensor
        in the draft checkpoint (using key token_map). Also, the draft config
@@ -54,41 +57,42 @@ class Medusa(nn.Module):
         config = vllm_config.speculative_config.draft_model_config.hf_config
         super().__init__()
         self.config = config
-        self.blocks = nn.ModuleList([
-            ResidualBlock(config=config,
-                          hidden_size=self.config.hidden_size,
-                          num_layers=self.config.num_hidden_layers)
-            for _ in range(self.config.num_heads)
-        ])
+        self.blocks = nn.ModuleList(
+            [
+                ResidualBlock(
+                    config=config,
+                    hidden_size=self.config.hidden_size,
+                    num_layers=self.config.num_hidden_layers,
+                )
+                for _ in range(self.config.num_heads)
+            ]
+        )
         self.orig_vocab_size = config.vocab_size
         self.truncated_vocab_size = config.truncated_vocab_size
-        self.unpadded_vocab_size = self.truncated_vocab_size
 
         if getattr(config, "original_lm_head", False):
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                self.truncated_vocab_size,
                 config.hidden_size,
-                org_num_embeddings=self.truncated_vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
-            self.lm_heads = [
-                self.lm_head for _ in range(self.config.num_heads)
-            ]
+            self.lm_heads = [self.lm_head for _ in range(self.config.num_heads)]
         else:
-            self.lm_heads = nn.ModuleList([
-                ParallelLMHead(
-                    self.unpadded_vocab_size,
-                    config.hidden_size,
-                    org_num_embeddings=self.truncated_vocab_size,
-                    padding_size=DEFAULT_VOCAB_PADDING_SIZE,
-                ) for _ in range(self.config.num_heads)
-            ])
+            self.lm_heads = nn.ModuleList(
+                [
+                    ParallelLMHead(
+                        config.vocab_size,
+                        config.hidden_size,
+                        prefix=maybe_prefix(prefix, f"lm_heads.{i}"),
+                    )
+                    for i in range(self.config.num_heads)
+                ]
+            )
 
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                self.truncated_vocab_size,
-                                                logit_scale)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size, self.truncated_vocab_size, logit_scale
+        )
 
         # Token map is a idx to token mapping to reduce the vocab size for
         # the draft model. Using smaller vocab size for draft, containing
@@ -119,17 +123,20 @@ class Medusa(nn.Module):
             if self.token_map is None:
                 logits_lst.append(_logits)
             else:
-                logits_lst.append(-torch.inf * torch.ones(
-                    size=(*_logits.shape[:-1], self.orig_vocab_size),
-                    device=_logits.device,
-                    dtype=_logits.dtype))
+                logits_lst.append(
+                    -torch.inf
+                    * torch.ones(
+                        size=(*_logits.shape[:-1], self.orig_vocab_size),
+                        device=_logits.device,
+                        dtype=_logits.dtype,
+                    )
+                )
 
                 logits_lst[-1][..., self.token_map] = _logits
 
         return logits_lst
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
 
@@ -140,30 +147,33 @@ class Medusa(nn.Module):
 
             if name == "token_map":
                 if self.truncated_vocab_size < self.orig_vocab_size:
-                    self.token_map = nn.Parameter(loaded_weight,
-                                                  requires_grad=False)
+                    self.token_map = nn.Parameter(loaded_weight, requires_grad=False)
             elif name in params_dict:
                 weights_map[name] = loaded_weight
-            elif (getattr(self.config, "original_lm_head", False)
-                  and name == "lm_heads.0.weight"):
+            elif (
+                getattr(self.config, "original_lm_head", False)
+                and name == "lm_heads.0.weight"
+            ):
                 weights_map["lm_head.weight"] = loaded_weight
 
         for name, loaded_weight in weights_map.items():
-            if "lm_head" in name and self.token_map is not None and\
-                loaded_weight.shape[0] > self.token_map.shape[0]:
-
+            if (
+                "lm_head" in name
+                and self.token_map is not None
+                and loaded_weight.shape[0] > self.token_map.shape[0]
+            ):
                 loaded_weight = loaded_weight[self.token_map]
 
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
         if self.token_map is not None:
             self.token_map.to(device=self.lm_heads[0].weight.device)
 
-        assert (self.truncated_vocab_size
-                == self.orig_vocab_size) or (self.token_map is not None)
+        assert (self.truncated_vocab_size == self.orig_vocab_size) or (
+            self.token_map is not None
+        )
 
         return loaded_params
diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py
index 82648ba668ca5265b23172008e2bc69a0d3c5b89..a84c99059cd9cc93912fca24cbd8ccb718c23864 100644
--- a/vllm/model_executor/models/midashenglm.py
+++ b/vllm/model_executor/models/midashenglm.py
@@ -22,48 +22,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiDashengLM model compatible with HuggingFace weights."""
+
 import collections
 import collections.abc
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Callable, Optional, TypedDict, Union, cast
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from typing import Annotated, Any, TypeAlias, cast
 
 import numpy as np
 import torch
 import torch.nn as nn
-import torchaudio.transforms as audio_transforms
+import torchaudio.functional as F
+from torch.nn.functional import scaled_dot_product_attention
 from transformers import BatchFeature
 
-from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.midashenglm import DashengConfig
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
-_Tuple2 = Union[int, tuple[int, int], Sequence[int]]
+_Tuple2: TypeAlias = int | tuple[int, int] | Sequence[int]
 
 
 def _resolve_tuple2(x: _Tuple2) -> tuple[int, int]:
     if isinstance(x, collections.abc.Sequence):
         assert len(x) == 2, (
-            f"Expected a sequence of length 2, got {x} with length {len(x)}")
+            f"Expected a sequence of length 2, got {x} with length {len(x)}"
+        )
         return cast(tuple[int, int], tuple(x))
     return (x, x)
 
@@ -80,12 +91,14 @@ def calculate_mel_frames_dasheng(
     if center:
         audio_length_samples = audio_length_samples + n_fft
 
-    return (int(1 + ((audio_length_samples - n_fft) / hop_size)) //
-            dasheng_subsampling // model_subsampling)
+    return (
+        int(1 + ((audio_length_samples - n_fft) / hop_size))
+        // dasheng_subsampling
+        // model_subsampling
+    )
 
 
 class AudioPatchEmbed(nn.Module):
-
     def __init__(
         self,
         input_size: _Tuple2 = 64,
@@ -93,7 +106,7 @@ class AudioPatchEmbed(nn.Module):
         patch_stride: _Tuple2 = 16,
         in_chans: int = 1,
         embed_dim: int = 768,
-        norm_layer: Optional[Callable] = None,
+        norm_layer: Callable | None = None,
         flatten: bool = False,
     ):
         super().__init__()
@@ -118,14 +131,14 @@ class AudioPatchEmbed(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         if self.flatten:
-            x = torch.permute(torch.flatten(
-                x, 2, 3), (0, 2, 1))  # rearrange(x, "b c f t -> b (f t) c")
+            x = torch.permute(
+                torch.flatten(x, 2, 3), (0, 2, 1)
+            )  # rearrange(x, "b c f t -> b (f t) c")
         x = self.norm(x)
         return x
 
 
 class LayerScale(nn.Module):
-
     def __init__(self, dim, init_values=1e-5, inplace=False):
         super().__init__()
         self.inplace = inplace
@@ -136,27 +149,30 @@ class LayerScale(nn.Module):
 
 
 class DashengMlp(nn.Module):
-
     def __init__(
         self,
         in_features: int,
-        hidden_features: Optional[int] = None,
-        out_features: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        hidden_features: int | None = None,
+        out_features: int | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
-        self.fc1 = ColumnParallelLinear(input_size=in_features,
-                                        output_size=hidden_features,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
+        self.fc1 = ColumnParallelLinear(
+            input_size=in_features,
+            output_size=hidden_features,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
         self.act = get_act_fn("gelu")
-        self.fc2 = RowParallelLinear(input_size=hidden_features,
-                                     output_size=out_features,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+        self.fc2 = RowParallelLinear(
+            input_size=hidden_features,
+            output_size=out_features,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x, _ = self.fc1(x)
@@ -166,14 +182,12 @@ class DashengMlp(nn.Module):
 
 
 class DashengAttention(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int = 8,
         qkv_bias: bool = False,
-        causal: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -206,46 +220,42 @@ class DashengAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.qkv",
         )
-        self.attn = MultiHeadAttention(
-            self.num_heads,
-            self.head_dim,
-            self.scale,
-            num_kv_heads=self.num_kv_heads,
-        )
         self.proj = RowParallelLinear(
             input_size=dim,
             output_size=dim,
             quant_config=quant_config,
             prefix=f"{prefix}.proj",
         )
-        self.causal = causal
 
-    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None):
         B, N, C = x.shape
 
-        qkv_out, _ = self.qkv(x)
-        q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size],
-                                dim=-1)
+        qkv, _ = self.qkv(x)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        qkv = qkv.permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
 
-        attn_out = self.attn(q, k, v)
-        C_local = attn_out.numel() // (B * N)  # C_local for parallel
-        attn_out = attn_out.view(B, N, C_local)
-
-        x, _ = self.proj(attn_out)
+        x = scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=mask[:, None, None, :] if mask is not None else None,
+        )
 
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x, _ = self.proj(x)
         return x
 
 
 class DashengBlock(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = False,
-        init_values: Optional[float] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        init_values: float | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -257,8 +267,9 @@ class DashengBlock(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
         )
-        self.ls1 = (LayerScale(dim, init_values=init_values)
-                    if init_values else nn.Identity())
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
 
         self.norm2 = nn.LayerNorm(dim, eps=1e-6)
         self.mlp = DashengMlp(
@@ -267,26 +278,79 @@ class DashengBlock(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.ls2 = (LayerScale(dim, init_values=init_values)
-                    if init_values else nn.Identity())
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
 
     # Kwargs usually has a mask parameter that is passed to Attention
     def forward(
         self,
         x: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
+        mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         x = x + self.ls1(self.attn(self.norm1(x), mask))
         x = x + self.ls2(self.mlp(self.norm2(x)))
         return x
 
 
-class DashengAudioTransformer(nn.Module):
+class DashengFrontend(nn.Module):
+    def __init__(self, config: DashengConfig):
+        super().__init__()
+        self.config = config
 
+        spectrogram_window = torch.hann_window(self.config.win_length)
+        self.register_buffer(
+            "spectrogram_window",
+            spectrogram_window,
+            persistent=False,
+        )
+        self.spectrogram_window: torch.Tensor
+
+        melscale_fbanks = F.melscale_fbanks(
+            n_freqs=self.config.n_fft // 2 + 1,
+            f_min=self.config.f_min,
+            f_max=self.config.f_max,
+            n_mels=self.config.n_mels,
+            sample_rate=self.config.sample_rate,
+        )
+        self.register_buffer("melscale_fbanks", melscale_fbanks, persistent=False)
+        self.melscale_fbanks: torch.Tensor
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        spectrogram = F.spectrogram(
+            waveform=waveform.to(torch.float32),
+            pad=0,
+            window=self.spectrogram_window,
+            n_fft=self.config.n_fft,
+            hop_length=self.config.hop_length,
+            win_length=self.config.win_length,
+            power=2,
+            normalized=False,
+            center=self.config.center,
+        )
+        mel_spectrogram = (spectrogram.mT @ self.melscale_fbanks.to(torch.float32)).mT
+        # x has shape [batch, freq, time].
+        # F.amplitude_to_DB accepts inputs shaped as:
+        #   - [freq, time]
+        #   - [channel, freq, time]
+        #   - [..., channel, freq, time]
+        # Here we insert a channel dimension of size 1 before calling it,
+        # then remove that extra dimension afterward.
+        log_mel_spectrogram = F.amplitude_to_DB(
+            mel_spectrogram.unsqueeze(1),
+            multiplier=10,
+            amin=1e-10,
+            db_multiplier=0,
+            top_db=120,
+        ).squeeze(1)
+        return log_mel_spectrogram.to(waveform.dtype)
+
+
+class DashengAudioTransformer(nn.Module):
     def __init__(
         self,
         config: DashengConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -294,7 +358,7 @@ class DashengAudioTransformer(nn.Module):
         self.target_length = config.target_length
         self.hop_length = config.hop_length
 
-        self._init_front_end(config)
+        self.front_end = DashengFrontend(config)
 
         self.init_bn = nn.BatchNorm2d(config.n_mels, momentum=0.01)
 
@@ -308,9 +372,11 @@ class DashengAudioTransformer(nn.Module):
         )
 
         self.time_pos_embed = nn.Parameter(
-            torch.empty(1, config.embed_dim, 1, self.patch_embed.grid_size[1]))
+            torch.empty(1, config.embed_dim, 1, self.patch_embed.grid_size[1])
+        )
         self.freq_pos_embed = nn.Parameter(
-            torch.empty(1, config.embed_dim, self.patch_embed.grid_size[0], 1))
+            torch.empty(1, config.embed_dim, self.patch_embed.grid_size[0], 1)
+        )
         self.blocks = nn.ModuleList(
             DashengBlock(
                 dim=config.embed_dim,
@@ -319,45 +385,25 @@ class DashengAudioTransformer(nn.Module):
                 qkv_bias=config.qkv_bias,
                 init_values=config.init_values,
                 quant_config=quant_config,
-                prefix=f"{prefix}.block{i}",
-            ) for i in range(config.depth))
-        self.norm = nn.LayerNorm(config.embed_dim, eps=1e-6)
-
-    def _init_front_end(self, config):
-        with set_default_torch_dtype(torch.float32):
-            self.front_end = nn.Sequential(
-                audio_transforms.MelSpectrogram(
-                    f_min=config.f_min,
-                    f_max=config.f_max,
-                    center=config.center,
-                    win_length=config.win_length,
-                    hop_length=config.hop_length,
-                    sample_rate=config.sample_rate,
-                    n_fft=config.n_fft,
-                    n_mels=config.n_mels,
-                ),
-                audio_transforms.AmplitudeToDB(top_db=120),
+                prefix=f"{prefix}.blocks.{i}",
             )
-
-            mel_spectrogram = self.front_end[0]
-            fb = mel_spectrogram.mel_scale.fb
-            win = mel_spectrogram.spectrogram.window
-            mel_spectrogram.mel_scale.fb = fb.to(torch.bfloat16).to(
-                torch.float32)
-            mel_spectrogram.spectrogram.window = win.to(torch.bfloat16).to(
-                torch.float32)
+            for i in range(config.depth)
+        )
+        self.norm = nn.LayerNorm(config.embed_dim, eps=1e-6)
 
     def forward_features(
         self,
         x: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
+        mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         t = x.shape[-1]
         x = x + self.time_pos_embed[:, :, :, :t]
-        x = (x + self.freq_pos_embed[:, :, :, :]
-             )  # Just to support __getitem__ in posembed
-        x = torch.permute(torch.flatten(x, 2, 3),
-                          (0, 2, 1))  # rearrange(x, "b c f t -> b (f t) c")
+        x = (
+            x + self.freq_pos_embed[:, :, :, :]
+        )  # Just to support __getitem__ in posembed
+        x = torch.permute(
+            torch.flatten(x, 2, 3), (0, 2, 1)
+        )  # rearrange(x, "b c f t -> b (f t) c")
         for block in self.blocks:
             x = block(x, mask)
         x = self.norm(x)
@@ -373,8 +419,8 @@ class DashengAudioTransformer(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        x_length: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        x_length: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         x = self.front_end(x)
         x = x.to(self.time_pos_embed.dtype)
         target_length_in_patches = self.target_length // 4
@@ -390,12 +436,12 @@ class DashengAudioTransformer(nn.Module):
 
         if x_length is not None:
             assert len(x_length) == len(x), (
-                "batchsizes of input x and x_length need to be same")
+                "batchsizes of input x and x_length need to be same"
+            )
             assert x_length.ndim == 1, "Lengths are of size (B,)"
             scaled_lengths = (x_length / (self.hop_length * 4)).long()
             mask = self._to_mask(max_length=t, lengths=scaled_lengths)
-            split_masks = mask.logical_not().split(target_length_in_patches,
-                                                   dim=-1)
+            split_masks = mask.split(target_length_in_patches, dim=-1)
         else:
             mask = None
             split_masks = [None] * len(input_splits)
@@ -412,14 +458,13 @@ class DashengAudioTransformer(nn.Module):
 
 
 class AudioProjectorSubsample(nn.Module):
-
     def __init__(
         self,
         in_dim: int,
         out_dim: int,
         downsample_rate=5,
-        dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -431,14 +476,16 @@ class AudioProjectorSubsample(nn.Module):
                 quant_config=quant_config,
                 prefix=f"{prefix}.net.0",
                 return_bias=False,
-            ), get_act_fn("gelu"),
+            ),
+            get_act_fn("gelu"),
             RowParallelLinear(
                 input_size=out_dim,
                 output_size=out_dim,
                 quant_config=quant_config,
                 prefix=f"{prefix}.net.2",
                 return_bias=False,
-            ))
+            ),
+        )
 
     def forward(self, x, mask=None):
         batch_size, seq_len, dim = x.shape
@@ -449,27 +496,32 @@ class AudioProjectorSubsample(nn.Module):
                 mask = mask[:, :-num_frames_to_discard]
         if mask is None:
             mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device)
-        x = x.reshape(batch_size, -1, self.k *
-                      dim)  # rearrange(x, "b (s k) d -> b s (k d)", k=self.k)
+        x = x.reshape(
+            batch_size, -1, self.k * dim
+        )  # rearrange(x, "b (s k) d -> b s (k d)", k=self.k)
         for layer in self.net:
             x = layer(x)
         mask = mask.reshape(
-            batch_size, -1,
-            self.k)  # rearrange(mask, "b (s k) -> b s k", k=self.k)
+            batch_size, -1, self.k
+        )  # rearrange(mask, "b (s k) -> b s k", k=self.k)
         mask = mask.any(dim=-1).long()
         return x, mask
 
 
 # === Audio Inputs === #
-class MiDashengLMAudioInputs(TypedDict):
-    input_values: torch.Tensor
-    """Shape: `(num_audios, num_sampling_points)`"""
-    audio_length: torch.Tensor
-    """Shape: `(num_audios, 1)`"""
+class MiDashengLMAudioInputs(TensorSchema):
+    """
 
+    Dimensions:
+        - bn: Batch size * number of audios
+        - p: Number of sampling points
+    """
 
-class MiDashengLMProcessingInfo(BaseProcessingInfo):
+    input_values: Annotated[torch.Tensor, TensorShape("n", "p")]
+    audio_length: Annotated[torch.Tensor, TensorShape("n")]
 
+
+class MiDashengLMProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config()
 
@@ -478,7 +530,7 @@ class MiDashengLMProcessingInfo(BaseProcessingInfo):
         feature_extractor = hf_processor.feature_extractor
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
     def get_min_audio_len(self):
@@ -488,9 +540,7 @@ class MiDashengLMProcessingInfo(BaseProcessingInfo):
         return 160000
 
 
-class MiDashengLMDummyInputsBuilder(
-        BaseDummyInputsBuilder[MiDashengLMProcessingInfo]):
-
+class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
 
@@ -506,19 +556,24 @@ class MiDashengLMDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio":
-            self._get_dummy_audios(length=self.info.get_max_audio_len(),
-                                   num_audios=num_audios)
+            "audio": self._get_dummy_audios(
+                length=self.info.get_max_audio_len(),
+                num_audios=num_audios,
+                overrides=audio_overrides,
+            )
         }
 
 
 class MiDashengLMMultiModalProcessor(
-        BaseMultiModalProcessor[MiDashengLMProcessingInfo]):
-
+    BaseMultiModalProcessor[MiDashengLMProcessingInfo]
+):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
@@ -535,10 +590,15 @@ class MiDashengLMMultiModalProcessor(
         # + Padding
         min_audio_len = self.info.get_min_audio_len()
         processed_audios = [
-            np.pad(audio, (0, min_audio_len - audio.shape[-1]),
-                   mode='constant',
-                   constant_values=0) if isinstance(audio, np.ndarray)
-            and audio.shape[-1] < min_audio_len else audio for audio in audios
+            np.pad(
+                audio,
+                (0, min_audio_len - audio.shape[-1]),
+                mode="constant",
+                constant_values=0,
+            )
+            if isinstance(audio, np.ndarray) and audio.shape[-1] < min_audio_len
+            else audio
+            for audio in audios
         ]
 
         if processed_audios:
@@ -549,7 +609,9 @@ class MiDashengLMMultiModalProcessor(
             prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
-        mm_kwargs = dict(**mm_kwargs, )
+        mm_kwargs = dict(
+            **mm_kwargs,
+        )
 
         return super()._call_hf_processor(
             prompt=prompt,
@@ -586,11 +648,13 @@ class MiDashengLMMultiModalProcessor(
         if audio_length is None:
             audio_output_lengths = []
         else:
-            audio_length_np = audio_length.cpu().numpy() if isinstance(
-                audio_length, torch.Tensor) else audio_length
+            audio_length_np = (
+                audio_length.cpu().numpy()
+                if isinstance(audio_length, torch.Tensor)
+                else audio_length
+            )
             audio_output_lengths = [
-                max(1, calculate_mel_frames_dasheng(
-                    int(length)))  # at least one frame
+                max(1, calculate_mel_frames_dasheng(int(length)))  # at least one frame
                 for length in audio_length_np
             ]
 
@@ -618,9 +682,22 @@ class MiDashengLMMultiModalProcessor(
     dummy_inputs=MiDashengLMDummyInputsBuilder,
 )
 class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return "<|audio_bos|><|AUDIO|><|audio_eos|>"
 
@@ -656,42 +733,23 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
 
         self.quant_config = quant_config
         self.make_empty_intermediate_tensors = (
-            self.decoder.make_empty_intermediate_tensors)
-
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            return mm_input.reshape(-1, *mm_input.shape[2:])
-
-        if name == "input_values":
-            max_length = max(tensor.shape[1] for tensor in mm_input)
-            padded_mm_input = [
-                torch.nn.functional.pad(tensor,
-                                        (0, max_length - tensor.shape[1]))
-                if tensor.shape[1] < max_length else tensor
-                for tensor in mm_input
-            ]
-            return torch.concat(padded_mm_input)
-
-        return torch.concat(mm_input)
+            self.decoder.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[MiDashengLMAudioInputs]:
+        self, **kwargs: object
+    ) -> MiDashengLMAudioInputs | None:
         input_values = kwargs.pop("input_values", None)
         audio_length = kwargs.pop("audio_length", None)
 
         if input_values is None:
             return None
-        input_values = self._validate_and_reshape_mm_tensor(
-            input_values, "input_values")
-        audio_length = self._validate_and_reshape_mm_tensor(
-            audio_length, "audio_length")
-        if not isinstance(input_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of audio input features. "
-                             f"Got type: {type(input_values)}")
+
+        if isinstance(input_values, list):
+            input_values = torch.nn.utils.rnn.pad_sequence(
+                input_values,
+                batch_first=True,
+            )
 
         return MiDashengLMAudioInputs(
             input_values=input_values,
@@ -699,94 +757,71 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
         )
 
     def _process_audio_input(
-            self, audio_input: MiDashengLMAudioInputs) -> torch.Tensor:
+        self,
+        audio_input: MiDashengLMAudioInputs,
+    ) -> tuple[torch.Tensor, ...]:
         # Process audio through encoder and projector
         input_values = audio_input["input_values"]
         audio_length = audio_input["audio_length"]
 
-        encoder_out, encoder_atts = self.audio_encoder(input_values,
-                                                       audio_length)
+        encoder_out, encoder_atts = self.audio_encoder(input_values, audio_length)
         audio_embeddings, _ = self.audio_projector(encoder_out, encoder_atts)
-        audio_embeddings = audio_embeddings.to(
-            audio_input["input_values"].dtype)
+        audio_embeddings = audio_embeddings.to(audio_input["input_values"].dtype)
         batch_size, max_audio_tokens, embed_dim = audio_embeddings.shape
 
-        audio_length_np = audio_length.cpu().numpy() if isinstance(
-            audio_length, torch.Tensor) else audio_length
         audio_output_lengths = [
-            max(1, calculate_mel_frames_dasheng(
-                int(length)))  # at least one frame
-            for length in audio_length_np
+            max(1, calculate_mel_frames_dasheng(int(length)))  # at least one frame
+            for length in audio_length.tolist()
         ]
-        audio_output_lengths = torch.tensor(audio_output_lengths).to(
-            audio_embeddings.device)
+        audio_output_lengths = torch.tensor(
+            audio_output_lengths,
+            device=audio_embeddings.device,
+        )
 
-        audio_feature_mask = (torch.arange(
-            max_audio_tokens,
-            device=audio_embeddings.device).unsqueeze(0).expand(
-                batch_size, max_audio_tokens)
-                              < audio_output_lengths.unsqueeze(1))
+        audio_feature_mask = torch.arange(
+            max_audio_tokens, device=audio_embeddings.device
+        ).unsqueeze(0).expand(
+            batch_size, max_audio_tokens
+        ) < audio_output_lengths.unsqueeze(1)
 
-        masked_audio_features = audio_embeddings[audio_feature_mask].view(
-            -1, embed_dim)
+        masked_audio_features = audio_embeddings[audio_feature_mask].view(-1, embed_dim)
 
-        return torch.split(masked_audio_features,
-                           audio_output_lengths.tolist())
+        return torch.split(masked_audio_features, audio_output_lengths.tolist())
 
     def get_language_model(self) -> torch.nn.Module:
         return self.decoder
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
 
         if audio_input is None:
             return []
         return self._process_audio_input(audio_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.decoder.get_input_embeddings(input_ids)
-        if multimodal_embeddings and len(multimodal_embeddings) > 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.audio_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
-        elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
-            input_ids = None
 
-        return self.decoder.model(input_ids,
-                                  positions,
-                                  intermediate_tensors,
-                                  inputs_embeds=inputs_embeds)
+        return self.decoder.model(
+            input_ids,
+            positions,
+            intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.decoder.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py
index d256c1f3eed7bec72c6212d2ce3661e17814e9f7..cd0a6190e95026a74385dbc2d9bdf0dab00851b4 100644
--- a/vllm/model_executor/models/mimo.py
+++ b/vllm/model_executor/models/mimo.py
@@ -25,9 +25,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiMo model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -39,7 +39,9 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
 from vllm.sequence import IntermediateTensors
 
@@ -54,21 +56,21 @@ logger = init_logger(__name__)
         "positions": -1,
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
-    })
+    }
+)
 class MiMoModel(Qwen2Model):
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -81,15 +83,13 @@ class MiMoModel(Qwen2Model):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states = hidden_states + residual
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -104,18 +104,19 @@ class MiMoModel(Qwen2Model):
                 continue
             if "rotary_emb.inv_freq" in name:
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -139,50 +140,49 @@ class MiMoModel(Qwen2Model):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
 
-        self.model = MiMoModel(vllm_config=vllm_config,
-                               prefix=maybe_prefix(prefix, "model"))
+        self.model = MiMoModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
             else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "lm_head"))
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "lm_head"),
+                )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         hidden_states = self.model.norm(hidden_states)
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py
index b4abe458e477164da176f4ddd04f3e591fe0828e..9905f65b74ca78cd0e852a5ae32863980334ef83 100644
--- a/vllm/model_executor/models/mimo_mtp.py
+++ b/vllm/model_executor/models/mimo_mtp.py
@@ -19,8 +19,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiMo-MTP model."""
+
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -31,7 +31,9 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen2 import Qwen2DecoderLayer
 from vllm.sequence import IntermediateTensors
@@ -40,30 +42,28 @@ from .utils import maybe_prefix
 
 
 class MiMoMultiTokenPredictorLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
         prefix: str,
         model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
 
-        self.token_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.hidden_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
-        self.input_proj = nn.Linear(config.hidden_size * 2,
-                                    config.hidden_size,
-                                    bias=False)
-        self.mtp_block = Qwen2DecoderLayer(config=config,
-                                           cache_config=cache_config,
-                                           quant_config=quant_config,
-                                           prefix=prefix)
-        self.final_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.token_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hidden_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=False
+        )
+        self.mtp_block = Qwen2DecoderLayer(
+            config=config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -79,17 +79,17 @@ class MiMoMultiTokenPredictorLayer(nn.Module):
         previous_hidden_states = self.hidden_layernorm(previous_hidden_states)
 
         hidden_states = self.input_proj(
-            torch.cat([previous_hidden_states, inputs_embeds], dim=-1))
+            torch.cat([previous_hidden_states, inputs_embeds], dim=-1)
+        )
 
-        hidden_states, residual = self.mtp_block(positions=positions,
-                                                 hidden_states=hidden_states,
-                                                 residual=None)
+        hidden_states, residual = self.mtp_block(
+            positions=positions, hidden_states=hidden_states, residual=None
+        )
         hidden_states = residual + hidden_states
         return self.final_layernorm(hidden_states)
 
 
 class MiMoMultiTokenPredictor(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -102,30 +102,35 @@ class MiMoMultiTokenPredictor(nn.Module):
             config.hidden_size,
         )
 
-        self.mtp_layers = torch.nn.ModuleDict({
-            str(idx):
-            MiMoMultiTokenPredictorLayer(
-                config,
-                f"{prefix}.layers.{idx}",
-                model_config=vllm_config.model_config,
-                cache_config=vllm_config.cache_config,
-                quant_config=vllm_config.quant_config,
-            )
-            for idx in range(self.mtp_start_layer_idx,
-                             self.mtp_start_layer_idx + self.num_mtp_layers)
-        })
+        self.mtp_layers = torch.nn.ModuleDict(
+            {
+                str(idx): MiMoMultiTokenPredictorLayer(
+                    config,
+                    f"{prefix}.layers.{idx}",
+                    model_config=vllm_config.model_config,
+                    cache_config=vllm_config.cache_config,
+                    quant_config=vllm_config.quant_config,
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
         return self.mtp_layers[str(self.mtp_start_layer_idx + spec_step_idx)](
@@ -147,41 +152,44 @@ class MiMoMultiTokenPredictor(nn.Module):
 
 
 class MiMoMTP(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
-        self.model = MiMoMultiTokenPredictor(vllm_config=vllm_config,
-                                             prefix=maybe_prefix(
-                                                 prefix, "model"))
-        self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                      self.config.hidden_size,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.model = MiMoMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         assert spec_step_idx == 0, "mimo_mtp only support predict one token now"
-        hidden_states = self.model(input_ids, positions, hidden_states,
-                                   inputs_embeds, spec_step_idx)
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
-        return self.model.compute_logits(hidden_states, self.lm_head,
-                                         spec_step_idx)
+    ) -> torch.Tensor | None:
+        return self.model.compute_logits(hidden_states, self.lm_head, spec_step_idx)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -193,12 +201,11 @@ class MiMoMTP(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-
             if "rotary_emb.inv_freq" in name:
                 continue
             name = self.map_model_name_to_mtp_param_name(name)
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -210,7 +217,7 @@ class MiMoMTP(nn.Module):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if (("mlp.experts." in name) and name not in params_dict):
+                if ("mlp.experts." in name) and name not in params_dict:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
@@ -225,12 +232,12 @@ class MiMoMTP(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                if "mtp_layers" not in name and ("embed_tokens" not in name
-                                                 and "lm_head" not in name):
+                if "mtp_layers" not in name and (
+                    "embed_tokens" not in name and "lm_head" not in name
+                ):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -247,8 +254,10 @@ class MiMoMTP(nn.Module):
             name = name.replace(match.group(), f"{match.group(1)}{new_num}.")
         # check for early turn
         name_without_prefix = [
-            "token_layernorm", "hidden_layernorm", "input_proj",
-            "final_layernorm"
+            "token_layernorm",
+            "hidden_layernorm",
+            "input_proj",
+            "final_layernorm",
         ]
         for sub_name in name_without_prefix:
             if sub_name in name:
@@ -266,7 +275,11 @@ class MiMoMTP(nn.Module):
         Add .mtp_block for modules in transformer layer block for spec layer
         """
         spec_layer_weight_names = [
-            "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head"
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
         ]
         spec_layer_weight = False
         for weight_name in spec_layer_weight_names:
@@ -275,6 +288,7 @@ class MiMoMTP(nn.Module):
                 break
         if not spec_layer_weight:
             # treat rest weights as weights for transformer layer block
-            name = name.replace(f"model.layers.{spec_layer}.",
-                                f"model.layers.{spec_layer}.mtp_block.")
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
         return name
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 55fe3e2ae3ae71025afc272c024f75ce1fa546c1..914b097fe199eb78982b1e3f4e177f3a3eda2592 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -23,10 +23,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM model compatible with HuggingFace weights."""
+
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -35,30 +36,41 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import FatreluAndMul, SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class MiniCPMMoE(nn.Module):
@@ -76,8 +88,8 @@ class MiniCPMMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        tp_size: int | None = None,
     ):
         super().__init__()
         self.tp_size = tp_size or get_tensor_model_parallel_world_size()
@@ -90,34 +102,53 @@ class MiniCPMMoE(nn.Module):
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
 
-        self.gate = ReplicatedLinear(self.hidden_size,
-                                     self.num_total_experts,
-                                     bias=False,
-                                     params_dtype=self.params_dtype,
-                                     quant_config=None)
+        self.gate = ReplicatedLinear(
+            self.hidden_size,
+            self.num_total_experts,
+            bias=False,
+            params_dtype=self.params_dtype,
+            quant_config=None,
+        )
 
         self.ws = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        2 * self.intermediate_size,
-                        self.hidden_size,
-                        device=current_platform.device_type,
-                        dtype=self.params_dtype))
+            torch.empty(
+                self.num_total_experts,
+                2 * self.intermediate_size,
+                self.hidden_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
         self.w2s = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        self.hidden_size,
-                        self.intermediate_size,
-                        device=current_platform.device_type,
-                        dtype=self.params_dtype))
-
-        set_weight_attrs(self.ws, {
-            "weight_loader": self.weight_loader,
-        })
-        set_weight_attrs(self.w2s, {
-            "weight_loader": self.weight_loader,
-        })
-
-    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str, expert_id: int):
+            torch.empty(
+                self.num_total_experts,
+                self.hidden_size,
+                self.intermediate_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
+
+        set_weight_attrs(
+            self.ws,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+        set_weight_attrs(
+            self.w2s,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+
+    def weight_loader(
+        self,
+        param: nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        expert_id: int,
+    ):
         tp_rank = get_tensor_model_parallel_rank()
         param_data = param.data
         shard_size = self.intermediate_size
@@ -125,8 +156,9 @@ class MiniCPMMoE(nn.Module):
         if weight_name.endswith("w1.weight"):
             param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
         if weight_name.endswith("w3.weight"):
-            param_data[expert_id,
-                       shard_size:2 * shard_size, :] = loaded_weight[shard, :]
+            param_data[expert_id, shard_size : 2 * shard_size, :] = loaded_weight[
+                shard, :
+            ]
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
 
@@ -136,51 +168,54 @@ class MiniCPMMoE(nn.Module):
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
 
-        topk_weights, topk_ids, _ = fused_topk(hidden_states,
-                                               router_logits,
-                                               self.top_k,
-                                               renormalize=True)
+        topk_weights, topk_ids, _ = fused_topk(
+            hidden_states, router_logits, self.top_k, renormalize=True
+        )
 
-        final_hidden_states = fused_experts(hidden_states,
-                                            self.ws,
-                                            self.w2s,
-                                            topk_weights,
-                                            topk_ids,
-                                            inplace=True)
+        final_hidden_states = fused_experts(
+            hidden_states, self.ws, self.w2s, topk_weights, topk_ids, inplace=True
+        )
 
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_size)
 
 
 class MiniCPMMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
         hidden_act_param: float,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act == "silu":
             self.act_fn = SiluAndMul()
         elif hidden_act == "fatrelu":
             self.act_fn = FatreluAndMul(threshold=hidden_act_param)
         else:
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu and fatrelu are supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu and fatrelu are supported for now."
+            )
 
     def forward(self, x):
         gate_up, _ = self.gate_up_proj(x)
@@ -190,17 +225,16 @@ class MiniCPMMLP(nn.Module):
 
 
 class MiniCPMAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -233,12 +267,14 @@ class MiniCPMAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -249,13 +285,15 @@ class MiniCPMAttention(nn.Module):
             rope_scaling=rope_scaling,
         )
 
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -274,12 +312,11 @@ class MiniCPMAttention(nn.Module):
 
 
 class MiniCPMDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -289,15 +326,15 @@ class MiniCPMDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         self.rope_theta = getattr(config, "rope_theta", 10000)
         self.rope_scaling = getattr(config, "rope_scaling", None)
-        self.max_position_embeddings = getattr(config,
-                                               "max_position_embeddings", 8192)
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.prefix = prefix
         self._init_attn_block()
         self._init_ffn_block()
 
     def _init_attn_block(self):
-        self.input_layernorm = RMSNorm(self.config.hidden_size,
-                                       eps=self.config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
         self.self_attn = MiniCPMAttention(
             hidden_size=self.hidden_size,
             num_heads=self.config.num_attention_heads,
@@ -311,15 +348,16 @@ class MiniCPMDecoderLayer(nn.Module):
         )
 
     def _init_ffn_block(self):
-        self.post_attention_layernorm = RMSNorm(self.config.hidden_size,
-                                                eps=self.config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
         self.num_experts = getattr(self.config, "num_experts", 0)
         if self.num_experts == 0:
             self.mlp = MiniCPMMLP(
                 hidden_size=self.hidden_size,
                 intermediate_size=self.config.intermediate_size,
                 hidden_act=self.config.hidden_act,
-                hidden_act_param=getattr(self.config, "hidden_act_param", 0.),
+                hidden_act_param=getattr(self.config, "hidden_act_param", 0.0),
                 quant_config=self.quant_config,
             )
         else:
@@ -327,13 +365,14 @@ class MiniCPMDecoderLayer(nn.Module):
                 num_experts=self.config.num_experts,
                 top_k=self.config.num_experts_per_tok,
                 hidden_size=self.config.hidden_size,
-                intermediate_size=self.config.intermediate_size)
+                intermediate_size=self.config.intermediate_size,
+            )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         residual = hidden_states
@@ -342,41 +381,39 @@ class MiniCPMDecoderLayer(nn.Module):
             positions=positions,
             hidden_states=hidden_states,
         )
-        hidden_states = residual + hidden_states * \
-            (self.config.scale_depth / math.sqrt(self.config.num_hidden_layers))
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
 
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states * \
-            (self.config.scale_depth / math.sqrt(self.config.num_hidden_layers))
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
 
         return hidden_states, None
 
 
 @support_torch_compile
 class MiniCPMModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.cache_config = cache_config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
+
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
         self.num_experts = getattr(self.config, "num_experts", 0)
         self._init_layers(prefix, config, cache_config, quant_config)
@@ -384,24 +421,26 @@ class MiniCPMModel(nn.Module):
 
         self.aux_hidden_state_layers = tuple[int, ...]()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], self.config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], self.config.hidden_size
+        )
 
     def _init_layers(
         self,
         prefix: str,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
+        cache_config: CacheConfig | None,
+        quant_config: QuantizationConfig | None,
     ):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: MiniCPMDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         embedding = self.embed_tokens(input_ids)
         return embedding * self.config.scale_emb
 
@@ -409,15 +448,14 @@ class MiniCPMModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor,
-                                                        list[torch.Tensor]]]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             hidden_states = intermediate_tensors["hidden_states"]
@@ -425,11 +463,12 @@ class MiniCPMModel(nn.Module):
 
         aux_hidden_states = []
         for idx, layer in enumerate(
-                islice(self.layers, self.start_layer, self.end_layer)):
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
             if idx in self.aux_hidden_state_layers:
                 aux_hidden_states.append(
-                    hidden_states +
-                    residual if residual is not None else hidden_states)
+                    hidden_states + residual if residual is not None else hidden_states
+                )
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
@@ -437,10 +476,9 @@ class MiniCPMModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states = self.norm(hidden_states)
 
@@ -448,8 +486,7 @@ class MiniCPMModel(nn.Module):
             return hidden_states, aux_hidden_states
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -460,8 +497,11 @@ class MiniCPMModel(nn.Module):
         ]
         expert_params_mapping = [
             # (param_name, weight_name, expert_id)
-            ("ws" if weight_name in ["w1", "w3"] else "w2s",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id)
+            (
+                "ws" if weight_name in ["w1", "w3"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            )
             for expert_id in range(self.num_experts)
             for weight_name in ["w1", "w2", "w3"]
         ]
@@ -471,12 +511,11 @@ class MiniCPMModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -498,10 +537,9 @@ class MiniCPMModel(nn.Module):
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  weight_name,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param, loaded_weight, weight_name, expert_id=expert_id
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -510,8 +548,9 @@ class MiniCPMModel(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -542,29 +581,23 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
+        parallel_config = vllm_config.parallel_config
 
         self.prefix = prefix
         self.vllm_config = vllm_config
         self.config = config
-        self.lora_config = lora_config
+
         self.cache_config = cache_config
         self.quant_config = quant_config
 
-        self.model = self._init_model(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"))
+        self.model = self._init_model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
-        unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.lm_head = ParallelLMHead(
-            unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
@@ -572,16 +605,18 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
             self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
         self.scale_width = self.config.hidden_size / self.config.dim_model_base
 
-        self.logits_processor = LogitsProcessor(unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+        if parallel_config.enable_eplb and getattr(config, "num_experts", 0) > 0:
+            raise NotImplementedError("EPLB is not supported for MiniCPM yet.")
 
     def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
         return MiniCPMModel(vllm_config=vllm_config, prefix=prefix)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         self.model.aux_hidden_state_layers = layers
@@ -594,12 +629,12 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor,
-                                                        list[torch.Tensor]]]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         if isinstance(model_output, tuple) and len(model_output) == 2:
             # Aux hidden states are present.
@@ -617,15 +652,13 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 92c13e81bf3e4a437af6c402c9da5c955b2f1919..d3b6966ee3a7ff980f2408e7c0334d5960dfe1c9 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -24,7 +24,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM3 model compatible with HuggingFace weights."""
-from typing import Any, Optional
+
+from typing import Any
 
 import torch
 from torch import nn
@@ -34,20 +35,23 @@ from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.models.minicpm import (MiniCPMDecoderLayer,
-                                                MiniCPMForCausalLM,
-                                                MiniCPMModel)
+from vllm.model_executor.models.minicpm import (
+    MiniCPMDecoderLayer,
+    MiniCPMForCausalLM,
+    MiniCPMModel,
+)
 
 from .utils import make_layers
 
 
 class MiniCPM3Attention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -59,10 +63,10 @@ class MiniCPM3Attention(nn.Module):
         q_lora_rank: int,
         kv_lora_rank: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -83,33 +87,41 @@ class MiniCPM3Attention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
-        self.q_a_proj = ReplicatedLinear(self.hidden_size,
-                                         self.q_lora_rank,
-                                         bias=False,
-                                         quant_config=quant_config)
+        self.q_a_proj = ReplicatedLinear(
+            self.hidden_size, self.q_lora_rank, bias=False, quant_config=quant_config
+        )
         self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
-        self.q_b_proj = ColumnParallelLinear(q_lora_rank,
-                                             self.num_heads * self.qk_head_dim,
-                                             bias=False,
-                                             quant_config=quant_config)
-
-        self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size,
-                                                   self.kv_lora_rank +
-                                                   self.qk_rope_head_dim,
-                                                   bias=False,
-                                                   quant_config=quant_config)
-        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
-                                      eps=config.rms_norm_eps)
+        self.q_b_proj = ColumnParallelLinear(
+            q_lora_rank,
+            self.num_heads * self.qk_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.q_b_proj",
+        )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa",
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
             self.kv_lora_rank,
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
         # O projection.
-        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
-                                        self.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config)
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         self.rotary_emb = get_rope(
             self.qk_rope_head_dim,
@@ -118,13 +130,15 @@ class MiniCPM3Attention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = Attention(self.num_local_heads,
-                              self.qk_head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_local_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_local_heads,
+            self.qk_head_dim,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -135,55 +149,52 @@ class MiniCPM3Attention(nn.Module):
         q = self.q_a_layernorm(q)
         q, _ = self.q_b_proj(q)
         q = q.view(-1, self.num_local_heads, self.qk_head_dim)
-        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
-                          dim=-1)
+        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
         latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
-        kv_a, _ = latent_cache.split(
-            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
         latent_cache = latent_cache.unsqueeze(1)
         kv_a = self.kv_a_layernorm(kv_a.contiguous())
         kv, _ = self.kv_b_proj(kv_a)
-        kv = kv.view(-1, self.num_local_heads,
-                     self.qk_nope_head_dim + self.v_head_dim)
+        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
 
-        k_pe = latent_cache[:, :, self.kv_lora_rank:]
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
 
         q_pe, k_pe = self.rotary_emb(
             positions,
             q_pe.reshape(-1, self.num_local_heads * self.qk_rope_head_dim),
-            k_pe.reshape(-1, self.qk_rope_head_dim))
+            k_pe.reshape(-1, self.qk_rope_head_dim),
+        )
         q_pe = q_pe.view(-1, self.num_local_heads, self.qk_rope_head_dim)
         k_pe = k_pe.view(-1, 1, self.qk_rope_head_dim)
 
-        q[..., self.qk_nope_head_dim:] = q_pe
+        q[..., self.qk_nope_head_dim :] = q_pe
 
         k = torch.empty_like(q)
 
-        k[..., :self.qk_nope_head_dim] = k_nope
-        k[..., self.qk_nope_head_dim:] = k_pe
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
 
         q = q.reshape(-1, self.num_local_heads * self.qk_head_dim)
         k = k.view(-1, self.num_local_heads * self.qk_head_dim)
         v = torch.nn.functional.pad(
-            v, [0, self.qk_head_dim - self.v_head_dim],
-            value=0).view(-1, self.num_local_heads * self.qk_head_dim)
+            v, [0, self.qk_head_dim - self.v_head_dim], value=0
+        ).view(-1, self.num_local_heads * self.qk_head_dim)
 
         attn_output = self.attn(q, k, v)
-        attn_output = attn_output.view(
-            -1, self.num_local_heads,
-            self.qk_head_dim)[..., :self.v_head_dim].reshape(
-                -1, self.num_local_heads * self.v_head_dim)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.qk_head_dim)[
+            ..., : self.v_head_dim
+        ].reshape(-1, self.num_local_heads * self.v_head_dim)
 
         output, _ = self.o_proj(attn_output)
         return output
 
 
 class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
-
     def _init_attn_block(self):
-        self.input_layernorm = RMSNorm(self.config.hidden_size,
-                                       eps=self.config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
         self.self_attn = MiniCPM3Attention(
             config=self.config,
             hidden_size=self.hidden_size,
@@ -203,19 +214,20 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
 
 
 class MiniCPM3Model(MiniCPMModel):
-
     def _init_layers(
         self,
         prefix: str,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
+        cache_config: CacheConfig | None,
+        quant_config: QuantizationConfig | None,
     ):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: MiniCPM3DecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
 
 
 class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index 2af0d546ce63dd819384596ed5418e8665bb1cd1..d0cdb70aa85745717d1d148add303dc7501f948e 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -23,9 +23,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only EagleMiniCPM model compatible with HuggingFace weights."""
+
 import math
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -37,25 +37,31 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsEagle, SupportsLoRA, SupportsPP
 from .minicpm import MiniCPMAttention as EagleMiniCPMAttention
 from .minicpm import MiniCPMMLP as EagleMiniCPMMLP
 from .minicpm import MiniCPMMoE as EagleMiniCPMMoE
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    maybe_prefix,
+    process_eagle_weight,
+)
 
 
 class EagleMiniCPMDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -65,15 +71,15 @@ class EagleMiniCPMDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         self.rope_theta = getattr(config, "rope_theta", 10000)
         self.rope_scaling = getattr(config, "rope_scaling", None)
-        self.max_position_embeddings = getattr(config,
-                                               "max_position_embeddings", 8192)
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.prefix = prefix
         self._init_attn_block()
         self._init_ffn_block()
 
     def _init_attn_block(self):
-        self.input_layernorm = RMSNorm(self.config.hidden_size,
-                                       eps=self.config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
         self.self_attn = EagleMiniCPMAttention(
             hidden_size=self.hidden_size,
             num_heads=self.config.num_attention_heads,
@@ -87,15 +93,16 @@ class EagleMiniCPMDecoderLayer(nn.Module):
         )
 
     def _init_ffn_block(self):
-        self.post_attention_layernorm = RMSNorm(self.config.hidden_size,
-                                                eps=self.config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
         self.num_experts = getattr(self.config, "num_experts", 0)
         if self.num_experts == 0:
             self.mlp = EagleMiniCPMMLP(
                 hidden_size=self.hidden_size,
                 intermediate_size=self.config.intermediate_size,
                 hidden_act=self.config.hidden_act,
-                hidden_act_param=getattr(self.config, "hidden_act_param", 0.),
+                hidden_act_param=getattr(self.config, "hidden_act_param", 0.0),
                 quant_config=self.quant_config,
             )
         else:
@@ -103,13 +110,14 @@ class EagleMiniCPMDecoderLayer(nn.Module):
                 num_experts=self.config.num_experts,
                 top_k=self.config.num_experts_per_tok,
                 hidden_size=self.config.hidden_size,
-                intermediate_size=self.config.intermediate_size)
+                intermediate_size=self.config.intermediate_size,
+            )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         residual = hidden_states
@@ -118,77 +126,75 @@ class EagleMiniCPMDecoderLayer(nn.Module):
             positions=positions,
             hidden_states=hidden_states,
         )
-        hidden_states = residual + hidden_states * \
-            (self.config.scale_depth / math.sqrt(self.config.mup_denominator))
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.mup_denominator)
+        )
 
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states * \
-            (self.config.scale_depth / math.sqrt(self.config.mup_denominator))
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.mup_denominator)
+        )
 
         return hidden_states, None
 
 
 @support_torch_compile
 class EagleMiniCPMModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 start_layer: int = 0):
+    def __init__(
+        self, *, vllm_config: VllmConfig, prefix: str = "", start_layer: int = 0
+    ):
         super().__init__()
 
         config = vllm_config.speculative_config.draft_model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.cache_config = cache_config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        self.fc = torch.nn.Linear(self.config.hidden_size * 2,
-                                  self.config.hidden_size,
-                                  bias=False)
+
+        self.vocab_size = config.vocab_size
+
+        self.fc = torch.nn.Linear(
+            self.config.hidden_size * 2, self.config.hidden_size, bias=False
+        )
         self.input_norm1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.input_norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
         self.num_experts = getattr(self.config, "num_experts", 0)
-        self._init_layers(prefix, config, cache_config, quant_config,
-                          start_layer)
+        self._init_layers(prefix, config, cache_config, quant_config, start_layer)
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], self.config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], self.config.hidden_size
+        )
 
     def _init_layers(
         self,
         prefix: str,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
+        cache_config: CacheConfig | None,
+        quant_config: QuantizationConfig | None,
         start_layer: int,
     ):
-        self.eagle_layers = nn.ModuleList([
-            EagleMiniCPMDecoderLayer(
-                config,
-                cache_config,
-                quant_config,
-                f"{prefix}.eagle_layers.{i + start_layer}",
-            ) for i in range(self.config.num_hidden_layers)
-        ])
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        self.eagle_layers = nn.ModuleList(
+            [
+                EagleMiniCPMDecoderLayer(
+                    config,
+                    cache_config,
+                    quant_config,
+                    f"{prefix}.eagle_layers.{i + start_layer}",
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         embedding = self.embed_tokens(input_ids)
         return embedding * self.config.scale_emb
 
@@ -197,13 +203,12 @@ class EagleMiniCPMModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        input_embeds = self.get_input_embeddings(input_ids)
+    ) -> torch.Tensor | IntermediateTensors:
+        input_embeds = self.embed_input_ids(input_ids)
         input_embeds = self.input_norm1(input_embeds)
         hidden_states = self.input_norm2(hidden_states)
 
-        hidden_states = self.fc(
-            torch.cat((input_embeds, hidden_states), dim=-1))
+        hidden_states = self.fc(torch.cat((input_embeds, hidden_states), dim=-1))
         residual = None
         for layer in self.eagle_layers:
             hidden_states, residual = layer(
@@ -214,8 +219,7 @@ class EagleMiniCPMModel(nn.Module):
 
         return hidden_states, hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -226,8 +230,11 @@ class EagleMiniCPMModel(nn.Module):
         ]
         expert_params_mapping = [
             # (param_name, weight_name, expert_id)
-            ("ws" if weight_name in ["w1", "w3"] else "w2s",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id)
+            (
+                "ws" if weight_name in ["w1", "w3"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            )
             for expert_id in range(self.num_experts)
             for weight_name in ["w1", "w2", "w3"]
         ]
@@ -237,12 +244,11 @@ class EagleMiniCPMModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -264,10 +270,9 @@ class EagleMiniCPMModel(nn.Module):
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  weight_name,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param, loaded_weight, weight_name, expert_id=expert_id
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -276,15 +281,16 @@ class EagleMiniCPMModel(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
 
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -309,33 +315,27 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         config = vllm_config.speculative_config.draft_model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.prefix = prefix
         self.vllm_config = vllm_config
         self.config = config
-        self.lora_config = lora_config
+
         self.cache_config = cache_config
         self.quant_config = quant_config
 
         target_layer_num = vllm_config.model_config.get_num_layers(
-            vllm_config.parallel_config)
+            vllm_config.parallel_config
+        )
 
-        self.model = self._init_model(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"),
-                                      start_layer=target_layer_num)
+        self.model = self._init_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer=target_layer_num,
+        )
 
-        unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.lm_head = ParallelLMHead(
-            unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
@@ -343,22 +343,20 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
         self.scale_width = self.config.hidden_size / self.config.dim_model_base
 
-        self.logits_processor = LogitsProcessor(unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def _init_model(self,
-                    *,
-                    vllm_config: VllmConfig,
-                    prefix: str = "",
-                    start_layer: int = 0):
-        return EagleMiniCPMModel(vllm_config=vllm_config,
-                                 prefix=prefix,
-                                 start_layer=start_layer)
+    def _init_model(
+        self, *, vllm_config: VllmConfig, prefix: str = "", start_layer: int = 0
+    ):
+        return EagleMiniCPMModel(
+            vllm_config=vllm_config, prefix=prefix, start_layer=start_layer
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
@@ -366,8 +364,7 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        hidden_states, hidden_states2 = self.model(input_ids, positions,
-                                                   hidden_states)
+        hidden_states, hidden_states2 = self.model(input_ids, positions, hidden_states)
         hidden_states = hidden_states / self.scale_width
         hidden_states2 = hidden_states2 / self.scale_width
         return hidden_states, hidden_states2
@@ -375,15 +372,18 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        def transform(inputs):
+            name, loaded_weight = inputs
+            process_eagle_weight(self, name)
+            return name, loaded_weight
+
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(map(transform, weights))
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index e5333fb652b131457c420a62e0fd178d8b2b9a1f..fa2feb0ba10b4e247cf58b91be49b8c5c859b868 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -23,37 +23,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 from torch import nn
 from transformers import BatchFeature
 from transformers.modeling_outputs import BaseModelOutputWithPast
-from transformers.models.whisper.modeling_whisper import (ACT2FN,
-                                                          WhisperAttention,
-                                                          WhisperConfig,
-                                                          WhisperEncoder)
+from transformers.models.whisper.modeling_whisper import (
+    ACT2FN,
+    WhisperAttention,
+    WhisperConfig,
+    WhisperEncoder,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    NestedTensors)
-from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
-                                   DictEmbeddingItems, ModalityData,
-                                   ModalityDataItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    AudioItem,
+    AudioProcessorItems,
+    DictEmbeddingItems,
+    ModalityData,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
-                       MiniCPMVDummyInputsBuilder,
-                       MiniCPMVMultiModalDataParser,
-                       MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo,
-                       _minicpmv_field_config)
-from .utils import (AutoWeightsLoader, cast_overflow_tensors, flatten_bn,
-                    maybe_prefix)
+from .minicpmv import (
+    _MAX_FRAMES_PER_VIDEO,
+    MiniCPMV2_6,
+    MiniCPMVDummyInputsBuilder,
+    MiniCPMVMultiModalDataParser,
+    MiniCPMVMultiModalProcessor,
+    MiniCPMVProcessingInfo,
+    _minicpmv_field_config,
+)
+from .utils import AutoWeightsLoader, cast_overflow_tensors, maybe_prefix
 
 CPU_DEVICE = torch.device("cpu")
 
@@ -67,10 +85,11 @@ class MiniCPMOAudioFeatureInputs(TensorSchema):
         - l: Length
         - s: Number of slices
     """
+
     type: Literal["audio_features"] = "audio_features"
 
     audio_features: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bns", "c", "l", dynamic_dims={"l"}),
     ]
     """
@@ -80,7 +99,7 @@ class MiniCPMOAudioFeatureInputs(TensorSchema):
     """
 
     audio_feature_lens: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "s"),
     ]
     """
@@ -95,36 +114,33 @@ class MiniCPMOAudioEmbeddingInputs(TensorSchema):
         - bn: Batch size * number of audios
         - s: Number of slices
         - h: Hidden size (must match language model backbone)
-    
+
     Length of each slice may vary, so pass it as a list.
     """
+
     type: Literal["audio_embeds"] = "audio_embeds"
 
     audio_embeds: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "s", "h", dynamic_dims={"s"}),
     ]
 
 
-MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs,
-                            MiniCPMOAudioEmbeddingInputs]
+MiniCPMOAudioInputs: TypeAlias = (
+    MiniCPMOAudioFeatureInputs | MiniCPMOAudioEmbeddingInputs
+)
 
 
 def _minicpmo_field_config(hf_inputs: Mapping[str, torch.Tensor]):
-    audio_features = hf_inputs.get("audio_features", torch.empty(0))
-    num_audios = len(audio_features)
-
     return dict(
         **_minicpmv_field_config(hf_inputs),
         audio_features=MultiModalFieldConfig.batched("audio"),
         audio_feature_lens=MultiModalFieldConfig.batched("audio"),
         audio_embeds=MultiModalFieldConfig.batched("audio"),
-        audio_token_id=MultiModalFieldConfig.shared("audio", num_audios),
     )
 
 
 class MiniCPMOAudioEmbeddingItems(DictEmbeddingItems):
-
     def __init__(
         self,
         data: Mapping[str, torch.Tensor],
@@ -142,11 +158,10 @@ class MiniCPMOAudioEmbeddingItems(DictEmbeddingItems):
 
 
 class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):
-
     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[AudioItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return MiniCPMOAudioEmbeddingItems(
                 data,
@@ -159,7 +174,7 @@ class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):
 class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
     audio_pattern = "(<audio>./</audio>)"
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {**super().get_supported_mm_limits(), "audio": None}
 
     def get_audio_placeholder(
@@ -214,18 +229,17 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
 
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_audio_tokens = self.get_max_audio_tokens() * max_audios
-        max_total_frames = self.get_max_video_frames(seq_len -
-                                                     max_image_tokens -
-                                                     max_audio_tokens)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+        max_total_frames = self.get_max_video_frames(
+            seq_len - max_image_tokens - max_audio_tokens
+        )
+        max_frames_per_video = min(
+            max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO
+        )
 
         return max(max_frames_per_video, 1)
 
 
-class MiniCPMODummyInputsBuilder(
-        MiniCPMVDummyInputsBuilder[MiniCPMOProcessingInfo]):
-
+class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
 
@@ -237,28 +251,33 @@ class MiniCPMODummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
-        audio_len = self.info.get_max_audio_chunks_with_most_features() * \
-            self.info.get_default_audio_sampling_rate()
+        audio_len = (
+            self.info.get_max_audio_chunks_with_most_features()
+            * self.info.get_default_audio_sampling_rate()
+        )
+
+        audio_overrides = mm_options.get("audio") if mm_options else None
 
         audio_mm_data = {
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+            "audio": self._get_dummy_audios(
+                length=audio_len, num_audios=num_audios, overrides=audio_overrides
+            )
         }
 
         return {
-            **super().get_dummy_mm_data(seq_len, mm_counts),
+            **super().get_dummy_mm_data(seq_len, mm_counts, mm_options),
             **audio_mm_data,
         }
 
 
-class MiniCPMOMultiModalProcessor(
-        MiniCPMVMultiModalProcessor[MiniCPMOProcessingInfo]):
-
+class MiniCPMOMultiModalProcessor(MiniCPMVMultiModalProcessor[MiniCPMOProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         return MiniCPMOMultiModalDataParser(
-            target_sr=self.info.get_default_audio_sampling_rate())
+            target_sr=self.info.get_default_audio_sampling_rate()
+        )
 
     def get_audio_prompt_texts(
         self,
@@ -281,10 +300,11 @@ class MiniCPMOMultiModalProcessor(
         if (audios := mm_data.get("audios")) is None:
             return {}
 
-        parsed_audios = (self._get_data_parser().parse_mm_data({
-            "audio": audios
-        }).get_items("audio",
-                     (MiniCPMOAudioEmbeddingItems, AudioProcessorItems)))
+        parsed_audios = (
+            self._get_data_parser()
+            .parse_mm_data({"audio": audios})
+            .get_items("audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems))
+        )
 
         if isinstance(parsed_audios, MiniCPMOAudioEmbeddingItems):
             audio_inputs = {}
@@ -292,9 +312,7 @@ class MiniCPMOMultiModalProcessor(
             audio_inputs = self._base_call_hf_processor(
                 prompts=[self.info.audio_pattern] * len(parsed_audios),
                 mm_data={"audios": [[audio] for audio in parsed_audios]},
-                mm_kwargs={
-                    **mm_kwargs, "chunk_input": True
-                },
+                mm_kwargs={**mm_kwargs, "chunk_input": True},
                 tok_kwargs=tok_kwargs,
                 out_keys={"audio_features", "audio_feature_lens"},
             )
@@ -302,17 +320,14 @@ class MiniCPMOMultiModalProcessor(
             # Avoid padding since we need the output for each audio to be
             # independent of other audios for the cache to work correctly
             unpadded_audio_features = [
-                feat[:, :feature_len] for feat, feature_len in zip(
+                feat[:, :feature_len]
+                for feat, feature_len in zip(
                     audio_inputs["audio_features"],
                     audio_inputs["audio_feature_lens"],
                 )
             ]
             audio_inputs["audio_features"] = unpadded_audio_features
 
-        tokenizer = self.info.get_tokenizer()
-        unk_token_id = tokenizer.get_vocab()["<unk>"]
-        audio_inputs["audio_token_id"] = torch.tensor(unk_token_id)
-
         return audio_inputs
 
     def process_mm_inputs(
@@ -342,12 +357,14 @@ class MiniCPMOMultiModalProcessor(
 
         def get_audio_replacement(item_idx: int):
             audios = mm_items.get_items(
-                "audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems))
+                "audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems)
+            )
 
             if isinstance(audios, MiniCPMOAudioEmbeddingItems):
                 single_audio_embeds = audios.get(item_idx)["audio_embeds"]
                 audio_len = self.info.get_audio_len_by_num_chunks(
-                    sum(map(len, single_audio_embeds)))
+                    sum(map(len, single_audio_embeds))
+                )
             else:
                 audio_len = audios.get_audio_length(item_idx)
 
@@ -358,9 +375,11 @@ class MiniCPMOMultiModalProcessor(
 
         return [
             *base_updates,
-            PromptReplacement(modality="audio",
-                              target=audio_placeholder,
-                              replacement=get_audio_replacement),
+            PromptReplacement(
+                modality="audio",
+                target=audio_placeholder,
+                replacement=get_audio_replacement,
+            ),
         ]
 
     def _get_mm_fields_config(
@@ -372,16 +391,11 @@ class MiniCPMOMultiModalProcessor(
 
 
 class MultiModalProjector(nn.Module):
-
     def __init__(self, in_dim: int, out_dim: int):
         super().__init__()
-        self.linear1 = nn.Linear(in_features=in_dim,
-                                 out_features=out_dim,
-                                 bias=True)
+        self.linear1 = nn.Linear(in_features=in_dim, out_features=out_dim, bias=True)
         self.relu = nn.ReLU()
-        self.linear2 = nn.Linear(in_features=out_dim,
-                                 out_features=out_dim,
-                                 bias=True)
+        self.linear2 = nn.Linear(in_features=out_dim, out_features=out_dim, bias=True)
 
     def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.relu(self.linear1(audio_features))
@@ -390,7 +404,6 @@ class MultiModalProjector(nn.Module):
 
 
 class MiniCPMWhisperEncoderLayer(nn.Module):
-
     def __init__(self, config: WhisperConfig, layer_idx: int):
         super().__init__()
         self.embed_dim = config.d_model
@@ -415,55 +428,55 @@ class MiniCPMWhisperEncoderLayer(nn.Module):
         attention_mask: torch.Tensor,
     ) -> torch.Tensor:
         residual = hidden_states
-        past_key_values = None
         hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states, attn_weights, past_key_values = self.self_attn(
+        hidden_states, _ = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
-            past_key_value=past_key_values,
         )
-        hidden_states = nn.functional.dropout(hidden_states,
-                                              p=self.dropout,
-                                              training=self.training)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
         hidden_states = residual + hidden_states
 
         residual = hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
         hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states,
-                                              p=self.activation_dropout,
-                                              training=self.training)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.activation_dropout, training=self.training
+        )
         hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states,
-                                              p=self.dropout,
-                                              training=self.training)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
         hidden_states = residual + hidden_states
 
         if hidden_states.dtype == torch.float16:
             hidden_states = cast_overflow_tensors(hidden_states)
 
-        outputs = (hidden_states, )
+        outputs = (hidden_states,)
 
         return outputs
 
 
 class MiniCPMWhisperEncoder(WhisperEncoder):
-
     def __init__(self, config: WhisperConfig):
         super().__init__(config)
-        self.layers = nn.ModuleList([
-            MiniCPMWhisperEncoderLayer(config, layer_idx=i)
-            for i in range(config.encoder_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                MiniCPMWhisperEncoderLayer(config, layer_idx=i)
+                for i in range(config.encoder_layers)
+            ]
+        )
 
     def forward(
         self,
         input_features: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor | None = None,
     ) -> BaseModelOutputWithPast:
         # Ignore copy
-        input_features = input_features.to(dtype=self.conv1.weight.dtype,
-                                           device=self.conv1.weight.device)
+        input_features = input_features.to(
+            dtype=self.conv1.weight.dtype, device=self.conv1.weight.device
+        )
 
         inputs_embeds = nn.functional.gelu(self.conv1(input_features))
         inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
@@ -472,17 +485,17 @@ class MiniCPMWhisperEncoder(WhisperEncoder):
 
         embed_pos = self.embed_positions.weight
 
-        embed_pos = embed_pos[:inputs_embeds.shape[1], :]
+        embed_pos = embed_pos[: inputs_embeds.shape[1], :]
 
         hidden_states = inputs_embeds + embed_pos
-        hidden_states = nn.functional.dropout(hidden_states,
-                                              p=self.dropout,
-                                              training=self.training)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
 
         encoder_states = ()
 
         for idx, encoder_layer in enumerate(self.layers):
-            encoder_states = encoder_states + (hidden_states, )
+            encoder_states = encoder_states + (hidden_states,)
             to_drop = False
             if self.training:
                 dropout_probability = torch.rand([])
@@ -501,7 +514,7 @@ class MiniCPMWhisperEncoder(WhisperEncoder):
                 hidden_states = layer_outputs[0]
 
         hidden_states = self.layer_norm(hidden_states)
-        encoder_states = encoder_states + (hidden_states, )
+        encoder_states = encoder_states + (hidden_states,)
 
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
@@ -512,7 +525,8 @@ class MiniCPMWhisperEncoder(WhisperEncoder):
 @MULTIMODAL_REGISTRY.register_processor(
     MiniCPMOMultiModalProcessor,
     info=MiniCPMOProcessingInfo,
-    dummy_inputs=MiniCPMODummyInputsBuilder)
+    dummy_inputs=MiniCPMODummyInputsBuilder,
+)
 class MiniCPMO(MiniCPMV2_6):
     packed_modules_mapping = {
         "qkv_proj": [
@@ -527,7 +541,7 @@ class MiniCPMO(MiniCPMV2_6):
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "(<image>./</image>)"
         if modality.startswith("video"):
@@ -539,26 +553,25 @@ class MiniCPMO(MiniCPMV2_6):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
-        self.apm = self.init_audio_module(vllm_config=vllm_config,
-                                          prefix=maybe_prefix(prefix, "apm"))
-
-        self.audio_token_id = None
+        self.apm = self.init_audio_module(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm")
+        )
 
     def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # Do not use parameters temporarily
         audio_config = self.config.audio_config
         model = MiniCPMWhisperEncoder(audio_config)
         audio_output_dim = int(audio_config.encoder_ffn_dim // 4)
-        self.audio_avg_pooler = \
-            nn.AvgPool1d(self.config.audio_pool_step,
-                         stride=self.config.audio_pool_step)
-        self.audio_projection_layer = \
-            MultiModalProjector(in_dim=audio_output_dim,out_dim=self.embed_dim)
+        self.audio_avg_pooler = nn.AvgPool1d(
+            self.config.audio_pool_step, stride=self.config.audio_pool_step
+        )
+        self.audio_projection_layer = MultiModalProjector(
+            in_dim=audio_output_dim, out_dim=self.embed_dim
+        )
         self.audio_encoder_layer = -1
         return model
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self, skip_prefixes=["tts"])
         return loader.load_weights(weights)
 
@@ -579,14 +592,13 @@ class MiniCPMO(MiniCPMV2_6):
             start_indices = torch.zeros_like(row_indices)
         else:
             # Compute start indices vectorially
-            start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks,
-                                              min=0)
+            start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks, min=0)
             start_indices = start_chunk_indices * chunk_size
         # Compute ending indices vectorially
         end_chunk_indices = chunk_indices + 1
-        end_indices = torch.clamp(end_chunk_indices * chunk_size +
-                                  num_lookhead,
-                                  max=size)
+        end_indices = torch.clamp(
+            end_chunk_indices * chunk_size + num_lookhead, max=size
+        )
         # Create column indices for broadcasting
         col_indices = torch.arange(size, device=device).unsqueeze(0)
         start_indices = start_indices.unsqueeze(1)
@@ -595,19 +607,18 @@ class MiniCPMO(MiniCPMV2_6):
         ret = (col_indices >= start_indices) & (col_indices < end_indices)
         return ret
 
-    def _get_feat_extract_output_lengths(self,
-                                         input_lengths: torch.LongTensor):
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
         input_lengths_after_cnn = (input_lengths - 1) // 2 + 1
         input_lengths_after_pooling = (
-            input_lengths_after_cnn -
-            self.config.audio_pool_step) // self.config.audio_pool_step + 1
-        input_lengths_after_pooling = input_lengths_after_pooling.to(
-            dtype=torch.int32)
+            input_lengths_after_cnn - self.config.audio_pool_step
+        ) // self.config.audio_pool_step + 1
+        input_lengths_after_pooling = input_lengths_after_pooling.to(dtype=torch.int32)
 
         return input_lengths_after_cnn, input_lengths_after_pooling
 
     def get_audio_hidden_states(
-            self, data: MiniCPMOAudioFeatureInputs) -> list[torch.Tensor]:
+        self, data: MiniCPMOAudioFeatureInputs
+    ) -> list[torch.Tensor]:
         chunk_length = self.config.audio_chunk_length
 
         # (bs, 80, frames) or [], multi audios need filled in advance
@@ -636,23 +647,26 @@ class MiniCPMO(MiniCPMV2_6):
         max_seq_len = (max_mel_seq_len - 1) // 2 + 1
 
         # Create a sequence tensor of shape (batch_size, max_seq_len)
-        seq_range = (torch.arange(
-            0,
-            max_seq_len,
-            dtype=audio_feature_lens.dtype,
-            device=audio_feature_lens.device).unsqueeze(0).expand(
-                batch_size, max_seq_len))
-        lengths_expand = audio_feature_lens.unsqueeze(1).expand(
-            batch_size, max_seq_len)
+        seq_range = (
+            torch.arange(
+                0,
+                max_seq_len,
+                dtype=audio_feature_lens.dtype,
+                device=audio_feature_lens.device,
+            )
+            .unsqueeze(0)
+            .expand(batch_size, max_seq_len)
+        )
+        lengths_expand = audio_feature_lens.unsqueeze(1).expand(batch_size, max_seq_len)
         # Create mask
         padding_mask = seq_range >= lengths_expand  # 1 for padded values
 
-        audio_attention_mask_ = padding_mask.view(
-            batch_size, 1, 1, max_seq_len).expand(batch_size, 1, max_seq_len,
-                                                  max_seq_len)
+        audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
+            batch_size, 1, max_seq_len, max_seq_len
+        )
         audio_attention_mask = audio_attention_mask_.to(
-            dtype=self.apm.conv1.weight.dtype,
-            device=self.apm.conv1.weight.device)
+            dtype=self.apm.conv1.weight.dtype, device=self.apm.conv1.weight.device
+        )
 
         if chunk_length > 0:
             chunk_num_frame = int(chunk_length * 50)
@@ -663,20 +677,22 @@ class MiniCPMO(MiniCPMV2_6):
                 device=audio_attention_mask_.device,
             )
             audio_attention_mask_ = torch.logical_or(
-                audio_attention_mask_, torch.logical_not(chunk_mask))
+                audio_attention_mask_, torch.logical_not(chunk_mask)
+            )
 
         audio_attention_mask[audio_attention_mask_] = float("-inf")
         audio_states = self.apm(
-            wavforms, attention_mask=audio_attention_mask).hidden_states[
-                self.audio_encoder_layer]
+            wavforms, attention_mask=audio_attention_mask
+        ).hidden_states[self.audio_encoder_layer]
         audio_embeds = self.audio_projection_layer(audio_states)
 
         audio_embeds = audio_embeds.transpose(1, 2)
         audio_embeds = self.audio_avg_pooler(audio_embeds)
         audio_embeds = audio_embeds.transpose(1, 2)
 
-        _, feature_lens_after_pooling = \
-            self._get_feat_extract_output_lengths(audio_feature_lens)
+        _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(
+            audio_feature_lens
+        )
 
         num_audio_tokens = feature_lens_after_pooling
 
@@ -686,7 +702,8 @@ class MiniCPMO(MiniCPMV2_6):
             target_audio_embeds_lst = list[torch.Tensor]()
             for _ in range(len(audio_feature_lens_raw[i])):
                 target_audio_embeds_lst.append(
-                    audio_embeds[idx, :num_audio_tokens[idx], :])
+                    audio_embeds[idx, : num_audio_tokens[idx], :]
+                )
                 idx += 1
 
             final_audio_embeds.append(torch.cat(target_audio_embeds_lst))
@@ -694,46 +711,26 @@ class MiniCPMO(MiniCPMV2_6):
         return final_audio_embeds
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[MiniCPMOAudioInputs]:
+        self, **kwargs: object
+    ) -> MiniCPMOAudioInputs | None:
         audio_features = kwargs.pop("audio_features", None)
         audio_embeds = kwargs.pop("audio_embeds", None)
 
         if audio_features is None and audio_embeds is None:
             return None
 
-        audio_token_id = kwargs.pop("audio_token_id")
-        if audio_token_id is not None:
-            assert isinstance(audio_token_id, torch.Tensor)
-            self.mm_token_ids.add(audio_token_id.flatten().unique().item())
-
         if audio_embeds is not None:
-            if not isinstance(audio_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio_embeds. "
-                                 f"Got type: {type(audio_embeds)}")
-
-            audio_embeds_flat = flatten_bn(audio_embeds)
-
             return MiniCPMOAudioEmbeddingInputs(
                 type="audio_embeds",
-                audio_embeds=audio_embeds_flat,
+                audio_embeds=audio_embeds,
             )
 
-        if not isinstance(audio_features, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of audio_features. "
-                             f"Got type: {type(audio_features)}")
-
         audio_feature_lens = kwargs.pop("audio_feature_lens")
-        if not isinstance(audio_feature_lens, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of audio_feature_lens. "
-                             f"Got type: {type(audio_feature_lens)}")
-
-        audio_features_flat = flatten_bn(audio_features)
-        audio_feature_lens_flat = flatten_bn(audio_feature_lens)
 
         return MiniCPMOAudioFeatureInputs(
             type="audio_features",
-            audio_features=audio_features_flat,
-            audio_feature_lens=audio_feature_lens_flat,
+            audio_features=audio_features,
+            audio_feature_lens=audio_feature_lens,
         )
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -742,17 +739,18 @@ class MiniCPMO(MiniCPMV2_6):
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("audio_features",
-                             "audio_embeds") and "audios" not in modalities:
-                modalities["audios"] = self._parse_and_validate_audio_input(
-                    **kwargs)
+            if (
+                input_key in ("audio_features", "audio_embeds")
+                and "audios" not in modalities
+            ):
+                modalities["audios"] = self._parse_and_validate_audio_input(**kwargs)
 
         return modalities
 
     def _process_audio_input(
         self,
         audio_input: MiniCPMOAudioInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if audio_input["type"] == "audio_embeds":
             return audio_input["audio_embeds"]
 
@@ -764,7 +762,7 @@ class MiniCPMO(MiniCPMV2_6):
         for modality in modalities:
             if modality == "audios":
                 audio_input = modalities["audios"]
-                audio_features = self._process_audio_input(audio_input)
-                multimodal_embeddings += tuple(audio_features)
+                audio_embeddings = self._process_audio_input(audio_input)
+                multimodal_embeddings += tuple(audio_embeddings)
 
         return multimodal_embeddings
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index a17c4f004d75c36427ef97cef36989c4a6b7c029..2ac97764dd3418f3fb91baa14636a9604aed9801 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -23,12 +23,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
+
 import math
 from collections import defaultdict
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
 from itertools import chain
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -39,40 +40,61 @@ from transformers import BatchFeature, PretrainedConfig
 from typing_extensions import TypeVar
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.awq import AWQConfig
-from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
-from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2,
-                                                  get_2d_sincos_pos_embed)
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.model_executor.layers.resampler import (
+    BaseResampler,
+    Resampler2,
+    get_2d_sincos_pos_embed,
+)
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
 from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
-                                   ImageProcessorItems, ImageSize,
-                                   ModalityData, ModalityDataItems,
-                                   MultiModalDataItems, MultiModalDataParser,
-                                   VideoItem, VideoProcessorItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails,
-                                        ResolvedPromptUpdate, _seq2text)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    DictEmbeddingItems,
+    ImageItem,
+    ImageProcessorItems,
+    ImageSize,
+    ModalityData,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+    VideoItem,
+    VideoProcessorItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+    ResolvedPromptUpdate,
+    _seq2text,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import flatten_2d_lists
+from vllm.utils.collection_utils import flatten_2d_lists
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 from .idefics2_vision_model import Idefics2VisionTransformer
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
@@ -90,7 +112,7 @@ class MiniCPMVImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values"] = "pixel_values"
 
-    # Note that the image size may vary, so we pass it as a list instead of a
+    # Note that the patch size may vary, so we pass it as a list instead of a
     # batched tensor.
     pixel_values: Annotated[
         list[torch.Tensor],
@@ -116,50 +138,53 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema):
 
     type: Literal["image_embeds"]
     image_embeds: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "ns", "hs"),
     ]
 
 
-MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs,
-                            MiniCPMVImageEmbeddingInputs]
+MiniCPMVImageInputs: TypeAlias = MiniCPMVImagePixelInputs | MiniCPMVImageEmbeddingInputs
 
 DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
 
 
 class Resampler2_5(BaseResampler):
-
-    def __init__(self,
-                 num_queries: int,
-                 embed_dim: int,
-                 num_heads: int,
-                 kv_dim: Optional[int] = None,
-                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-                 max_size: tuple[int, int] = (70, 70),
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
-        super().__init__(num_queries,
-                         embed_dim,
-                         num_heads,
-                         kv_dim,
-                         norm_layer,
-                         quant_config=quant_config,
-                         prefix=prefix)
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: int | None = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        max_size: tuple[int, int] = (70, 70),
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            num_queries,
+            embed_dim,
+            num_heads,
+            kv_dim,
+            norm_layer,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
 
         self.max_size = max_size
         self._set_2d_pos_cache(self.max_size)
 
-    def _set_2d_pos_cache(self,
-                          max_size: tuple[int, int],
-                          device: torch.types.Device = "cpu") -> None:
-        pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim,
-                                                max_size,
-                                                version=(2, 5))
+    def _set_2d_pos_cache(
+        self, max_size: tuple[int, int], device: torch.types.Device = "cpu"
+    ) -> None:
+        pos_embed_arr = get_2d_sincos_pos_embed(
+            self.embed_dim, max_size, version=(2, 5)
+        )
         pos_embed = torch.from_numpy(pos_embed_arr).float().to(device)
         self.register_buffer("pos_embed", pos_embed, persistent=False)
 
-    def _adjust_pos_cache(self, tgt_sizes: torch.Tensor,
-                          device: torch.types.Device) -> None:
+    def _adjust_pos_cache(
+        self, tgt_sizes: torch.Tensor, device: torch.types.Device
+    ) -> None:
         max_h = tgt_sizes[:, 0].max().item()
         max_w = tgt_sizes[:, 1].max().item()
         assert isinstance(max_h, int) and isinstance(max_w, int)
@@ -171,8 +196,7 @@ class Resampler2_5(BaseResampler):
             )
             self._set_2d_pos_cache(self.max_size, device)
 
-    def forward(self, x: torch.Tensor,
-                tgt_sizes: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, tgt_sizes: torch.Tensor) -> torch.Tensor:
         assert x.shape[0] == tgt_sizes.shape[0]
         bs = x.shape[0]
 
@@ -186,21 +210,20 @@ class Resampler2_5(BaseResampler):
         max_patch_len = patch_len.max().item()
         assert isinstance(max_patch_len, int)
 
-        key_padding_mask = torch.zeros((bs, max_patch_len),
-                                       dtype=torch.bool,
-                                       device=device)
+        key_padding_mask = torch.zeros(
+            (bs, max_patch_len), dtype=torch.bool, device=device
+        )
 
         pos_embed = []
         for i in range(bs):
             tgt_h, tgt_w = tgt_sizes[i].tolist()
-            pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape(
-                (tgt_h * tgt_w, -1)).to(dtype))  # patches * D
-            key_padding_mask[i, patch_len[i]:] = True
-        pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed,
-                                                    batch_first=True,
-                                                    padding_value=0.0).permute(
-                                                        1, 0,
-                                                        2)  # BLD => L * B * D
+            pos_embed.append(
+                self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)
+            )  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
+        pos_embed = torch.nn.utils.rnn.pad_sequence(
+            pos_embed, batch_first=True, padding_value=0.0
+        ).permute(1, 0, 2)  # BLD => L * B * D
         x, _ = self.kv_proj(x)  # B * L * D
         x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
 
@@ -221,33 +244,37 @@ class Resampler2_5(BaseResampler):
 
 
 class Resampler4_5(Resampler2_5):
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: int | None = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        max_size: tuple[int, int] = (70, 70),
+        max_temporal_size: int = 36000,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            num_queries,
+            embed_dim,
+            num_heads,
+            kv_dim,
+            norm_layer,
+            max_size,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
 
-    def __init__(self,
-                 num_queries: int,
-                 embed_dim: int,
-                 num_heads: int,
-                 kv_dim: Optional[int] = None,
-                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-                 max_size: tuple[int, int] = (70, 70),
-                 max_temporal_size: int = 36000,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
-        super().__init__(num_queries,
-                         embed_dim,
-                         num_heads,
-                         kv_dim,
-                         norm_layer,
-                         max_size,
-                         quant_config=quant_config,
-                         prefix=prefix)
-
-        trunc_normal_(self.query, std=.02)
+        trunc_normal_(self.query, std=0.02)
         self.max_temporal_size = max_temporal_size
         self._set_temporal_pos_cache(self.max_temporal_size)
         self.apply(self._init_weights)
 
-    def get_1d_sincos_pos_embed_from_temporal_size(self, embed_dim: int,
-                                                   pos: np.ndarray):
+    def get_1d_sincos_pos_embed_from_temporal_size(
+        self, embed_dim: int, pos: np.ndarray
+    ):
         """
         embed_dim: output dimension for each position
         pos: a list of positions to be encoded: size (M,)
@@ -255,11 +282,11 @@ class Resampler4_5(Resampler2_5):
         """
         assert embed_dim % 2 == 0
         omega = np.arange(embed_dim // 2, dtype=np.float32)
-        omega /= embed_dim / 2.
-        omega = 1. / 10000**omega  # (D/2,)
+        omega /= embed_dim / 2.0
+        omega = 1.0 / 10000**omega  # (D/2,)
 
         pos = pos.reshape(-1)  # (M,)
-        out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
 
         emb_sin = np.sin(out)  # (M, D/2)
         emb_cos = np.cos(out)  # (M, D/2)
@@ -267,25 +294,31 @@ class Resampler4_5(Resampler2_5):
         emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
         return emb
 
-    def _set_temporal_pos_cache(self,
-                                max_temporal_size: int,
-                                device: torch.types.Device = "cpu") -> None:
+    def _set_temporal_pos_cache(
+        self, max_temporal_size: int, device: torch.types.Device = "cpu"
+    ) -> None:
         temporal_size = np.arange(max_temporal_size, dtype=np.float32)
-        pos_embed = torch.from_numpy(
-            self.get_1d_sincos_pos_embed_from_temporal_size(
-                self.embed_dim, temporal_size)).float().to(device)
+        pos_embed = (
+            torch.from_numpy(
+                self.get_1d_sincos_pos_embed_from_temporal_size(
+                    self.embed_dim, temporal_size
+                )
+            )
+            .float()
+            .to(device)
+        )
         self.register_buffer("temporal_pos_embed", pos_embed, persistent=False)
 
-    def _adjust_temporal_pos_cache(self,
-                                   max_temporal_size: int,
-                                   device: torch.types.Device = "cpu"):
+    def _adjust_temporal_pos_cache(
+        self, max_temporal_size: int, device: torch.types.Device = "cpu"
+    ):
         if max_temporal_size > self.max_temporal_size:
             self.max_temporal_size = max_temporal_size
             self._set_temporal_pos_cache(self.max_temporal_size, device)
 
-    def _init_weights(self, m: Union[nn.Linear, nn.LayerNorm]):
+    def _init_weights(self, m: nn.Linear | nn.LayerNorm):
         if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
+            trunc_normal_(m.weight, std=0.02)
             if isinstance(m, nn.Linear) and m.bias is not None:
                 nn.init.constant_(m.bias, 0)
         elif isinstance(m, nn.LayerNorm):
@@ -297,7 +330,7 @@ class Resampler4_5(Resampler2_5):
         x: torch.Tensor,
         tgt_sizes: torch.Tensor,
         # temporal_ids for high refresh rate videos
-        temporal_ids=None
+        temporal_ids=None,
     ) -> torch.Tensor:
         assert x.shape[0] == tgt_sizes.shape[0]
         bs = x.shape[0]
@@ -323,9 +356,9 @@ class Resampler4_5(Resampler2_5):
         max_patch_len = patch_len.max().item()
         assert isinstance(max_patch_len, int)
 
-        key_padding_mask = torch.zeros((bs, max_patch_len),
-                                       dtype=torch.bool,
-                                       device=device)
+        key_padding_mask = torch.zeros(
+            (bs, max_patch_len), dtype=torch.bool, device=device
+        )
 
         x, _ = self.kv_proj(x)  # B * L * D
         x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
@@ -338,19 +371,21 @@ class Resampler4_5(Resampler2_5):
             if temporal_pos_emb:
                 if temporal_ids_flatten[i] == -1:
                     pos_embed_temporal.append(
-                        torch.zeros(self.embed_dim, dtype=dtype,
-                                    device=device))
+                        torch.zeros(self.embed_dim, dtype=dtype, device=device)
+                    )
                 else:
-                    pos_embed_temporal.append(self.temporal_pos_embed[
-                        temporal_ids_flatten[i]].to(dtype))  # D
+                    pos_embed_temporal.append(
+                        self.temporal_pos_embed[temporal_ids_flatten[i]].to(dtype)
+                    )  # D
 
-            pos_embed_2d.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape(
-                (tgt_h * tgt_w, -1)).to(dtype))  # patches * D
-            key_padding_mask[i, patch_len[i]:] = True
+            pos_embed_2d.append(
+                self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)
+            )  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
 
         pos_embed_2d = torch.nn.utils.rnn.pad_sequence(
-            pos_embed_2d, batch_first=True,
-            padding_value=0.0).permute(1, 0, 2)  # BLD => L * B * D
+            pos_embed_2d, batch_first=True, padding_value=0.0
+        ).permute(1, 0, 2)  # BLD => L * B * D
 
         k = x
         v = x + pos_embed_2d
@@ -366,26 +401,27 @@ class Resampler4_5(Resampler2_5):
                 end = start + len(tp)
                 # L * (end-start) * D -> (end-start) * L * D
                 # -> 1 * L*(end-start) * D
-                merge_k.append(k[:, start:end, :].permute(1, 0, 2).reshape(
-                    -1, self.embed_dim))
-                merge_v.append(v[:, start:end, :].permute(1, 0, 2).reshape(
-                    -1, self.embed_dim))
+                merge_k.append(
+                    k[:, start:end, :].permute(1, 0, 2).reshape(-1, self.embed_dim)
+                )
+                merge_v.append(
+                    v[:, start:end, :].permute(1, 0, 2).reshape(-1, self.embed_dim)
+                )
                 merge_key_padding_mask.append(
-                    key_padding_mask[start:end, :].reshape(-1, 1))
+                    key_padding_mask[start:end, :].reshape(-1, 1)
+                )
 
                 start = end
 
-            k = torch.nn.utils.rnn.pad_sequence(merge_k,
-                                                batch_first=True,
-                                                padding_value=0.0).permute(
-                                                    1, 0, 2)  # L*(end-start)
-            v = torch.nn.utils.rnn.pad_sequence(merge_v,
-                                                batch_first=True,
-                                                padding_value=0.0).permute(
-                                                    1, 0, 2)  # L*(end-start)
+            k = torch.nn.utils.rnn.pad_sequence(
+                merge_k, batch_first=True, padding_value=0.0
+            ).permute(1, 0, 2)  # L*(end-start)
+            v = torch.nn.utils.rnn.pad_sequence(
+                merge_v, batch_first=True, padding_value=0.0
+            ).permute(1, 0, 2)  # L*(end-start)
             key_padding_mask = torch.nn.utils.rnn.pad_sequence(
-                merge_key_padding_mask, batch_first=True,
-                padding_value=True).squeeze(-1)
+                merge_key_padding_mask, batch_first=True, padding_value=True
+            ).squeeze(-1)
 
         out = self.attn(
             self._repeat(q, bs),  # Q * B * D
@@ -415,12 +451,6 @@ def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]:
 
 
 def _minicpmv_field_config(hf_inputs: Mapping[str, torch.Tensor]):
-    pixel_values = hf_inputs.get("pixel_values", torch.empty(0))
-    num_images = len(pixel_values)
-
-    video_pixel_values = hf_inputs.get("video_pixel_values", torch.empty(0))
-    num_videos = len(video_pixel_values)
-
     return dict(
         pixel_values=MultiModalFieldConfig.batched("image"),
         image_sizes=MultiModalFieldConfig.batched("image"),
@@ -430,13 +460,10 @@ def _minicpmv_field_config(hf_inputs: Mapping[str, torch.Tensor]):
         video_image_sizes=MultiModalFieldConfig.batched("video"),
         video_tgt_sizes=MultiModalFieldConfig.batched("video"),
         video_embeds=MultiModalFieldConfig.batched("video"),
-        image_token_id=MultiModalFieldConfig.shared("image", num_images),
-        video_token_id=MultiModalFieldConfig.shared("video", num_videos),
     )
 
 
 class MiniCPMVImageEmbeddingItems(DictEmbeddingItems):
-
     def __init__(
         self,
         data: Mapping[str, torch.Tensor],
@@ -458,7 +485,6 @@ class MiniCPMVImageEmbeddingItems(DictEmbeddingItems):
 
 
 class MiniCPMVVideoEmbeddingItems(DictEmbeddingItems):
-
     def __init__(
         self,
         data: Mapping[str, torch.Tensor],
@@ -483,11 +509,10 @@ class MiniCPMVVideoEmbeddingItems(DictEmbeddingItems):
 
 
 class MiniCPMVMultiModalDataParser(MultiModalDataParser):
-
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return MiniCPMVImageEmbeddingItems(
                 data,
@@ -498,8 +523,8 @@ class MiniCPMVMultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return MiniCPMVVideoEmbeddingItems(
                 data,
@@ -535,7 +560,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
     def get_model_version(self):
         return get_version_by_config(self.get_hf_config())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         mm_limits = {"image": None}
         if self.get_model_version() in {(2, 6), (4, 0), (4, 5)}:
             mm_limits["video"] = None
@@ -547,7 +572,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         image_size: ImageSize,
         # For MiniCPM V/O 2.6
         image_idx: int = 0,
-        max_slice_nums: Optional[int] = None,
+        max_slice_nums: int | None = None,
         use_image_id: bool = True,
     ) -> str:
         image_processor = self.get_image_processor()
@@ -567,8 +592,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         self,
         image_size: ImageSize,
         # For MiniCPM V/O 2.6
-        max_slice_nums: Optional[int] = None,
-    ) -> Optional[tuple[int, int]]:
+        max_slice_nums: int | None = None,
+    ) -> tuple[int, int] | None:
         image_processor = self.get_image_processor()
         version = self.get_model_version()
 
@@ -586,7 +611,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
     def get_num_image_tokens(
         self,
         image_size: ImageSize,
-        max_slice_nums: Optional[int] = None,
+        max_slice_nums: int | None = None,
     ) -> int:
         image_processor = self.get_image_processor()
 
@@ -652,21 +677,18 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         max_videos = mm_counts.get("video", 0)
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self.get_max_video_frames(seq_len -
-                                                     max_image_tokens)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+        max_total_frames = self.get_max_video_frames(seq_len - max_image_tokens)
+        max_frames_per_video = min(
+            max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO
+        )
 
         return max(max_frames_per_video, 1)
 
 
-_I = TypeVar("_I",
-             bound=MiniCPMVProcessingInfo,
-             default=MiniCPMVProcessingInfo)
+_I = TypeVar("_I", bound=MiniCPMVProcessingInfo, default=MiniCPMVProcessingInfo)
 
 
 class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -680,51 +702,59 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        image_width, image_height = \
-            self.info.get_image_size_with_most_features()
-        video_width, video_height = \
-            self.info.get_video_frame_size_with_most_features()
-        num_video_frames = \
-            self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        image_width, image_height = self.info.get_image_size_with_most_features()
+        video_width, video_height = self.info.get_video_frame_size_with_most_features()
+        num_video_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=image_width,
-                                   height=image_height,
-                                   num_images=num_images),
+            "image": self._get_dummy_images(
+                width=image_width,
+                height=image_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
             "video": [
-                self._get_dummy_images(width=video_width,
-                                       height=video_height,
-                                       num_images=num_video_frames)
-            ] * num_videos,
+                self._get_dummy_images(
+                    width=video_width,
+                    height=video_height,
+                    num_images=num_video_frames,
+                    overrides=video_overrides,
+                )
+            ]
+            * num_videos,
         }
 
 
 class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
-
     def _get_data_parser(self) -> MultiModalDataParser:
         return MiniCPMVMultiModalDataParser()
 
-    def get_image_prompt_texts(self,
-                               image_size: ImageSize,
-                               image_idx: int = 0) -> str:
+    def get_image_prompt_texts(self, image_size: ImageSize, image_idx: int = 0) -> str:
         return self.info.get_slice_image_placeholder(
             image_size,
             image_idx=image_idx,
         )
 
-    def get_video_prompt_texts(self, image_size: ImageSize,
-                               num_frames: int) -> str:
-        return self.info.get_slice_image_placeholder(
-            image_size=image_size,
-            image_idx=0,
-            max_slice_nums=self.info.get_video_max_slice_num(),
-            use_image_id=False,
-        ) * num_frames
+    def get_video_prompt_texts(self, image_size: ImageSize, num_frames: int) -> str:
+        return (
+            self.info.get_slice_image_placeholder(
+                image_size=image_size,
+                image_idx=0,
+                max_slice_nums=self.info.get_video_max_slice_num(),
+                use_image_id=False,
+            )
+            * num_frames
+        )
 
     def process_images(
         self,
@@ -735,10 +765,11 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         if (images := mm_data.get("images")) is None:
             return {}
 
-        parsed_images = (self._get_data_parser().parse_mm_data({
-            "image": images
-        }).get_items("image",
-                     (MiniCPMVImageEmbeddingItems, ImageProcessorItems)))
+        parsed_images = (
+            self._get_data_parser()
+            .parse_mm_data({"image": images})
+            .get_items("image", (MiniCPMVImageEmbeddingItems, ImageProcessorItems))
+        )
 
         if isinstance(parsed_images, MiniCPMVImageEmbeddingItems):
             image_inputs = {}
@@ -751,10 +782,6 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
                 out_keys={"pixel_values", "image_sizes", "tgt_sizes"},
             )
 
-        tokenizer = self.info.get_tokenizer()
-        unk_token_id = tokenizer.get_vocab()["<unk>"]
-        image_inputs["image_token_id"] = torch.tensor(unk_token_id)
-
         return image_inputs
 
     def process_videos(
@@ -766,24 +793,23 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         if (videos := mm_data.get("videos")) is None:
             return {}
 
-        parsed_videos = (self._get_data_parser().parse_mm_data({
-            "video": videos
-        }).get_items("video",
-                     (MiniCPMVVideoEmbeddingItems, VideoProcessorItems)))
+        parsed_videos = (
+            self._get_data_parser()
+            .parse_mm_data({"video": videos})
+            .get_items("video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems))
+        )
 
         if isinstance(parsed_videos, MiniCPMVVideoEmbeddingItems):
             video_inputs = {}
         else:
             video_inputs = self._base_call_hf_processor(
                 prompts=[
-                    self.info.image_pattern * len(video)
-                    for video in parsed_videos
+                    self.info.image_pattern * len(video) for video in parsed_videos
                 ],
                 mm_data={"images": list(parsed_videos)},
                 mm_kwargs={
                     **mm_kwargs,
-                    "max_slice_nums":
-                    self.info.get_video_max_slice_num(),
+                    "max_slice_nums": self.info.get_video_max_slice_num(),
                 },
                 tok_kwargs=tok_kwargs,
                 out_keys={"pixel_values", "image_sizes", "tgt_sizes"},
@@ -791,10 +817,6 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
         video_inputs = {f"video_{k}": v for k, v in video_inputs.items()}
 
-        tokenizer = self.info.get_tokenizer()
-        unk_token_id = tokenizer.get_vocab()["<unk>"]
-        video_inputs["video_token_id"] = torch.tensor(unk_token_id)
-
         return video_inputs
 
     def process_mm_inputs(
@@ -831,10 +853,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
             for i, prompt in enumerate(prompts):
                 inputs_one = super()._call_hf_processor(
                     prompt=prompt,
-                    mm_data={
-                        k: v[i]
-                        for k, v in mm_data.items()
-                    },
+                    mm_data={k: v[i] for k, v in mm_data.items()},
                     mm_kwargs=mm_kwargs,
                     tok_kwargs=tok_kwargs,
                 )
@@ -857,10 +876,12 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         input_ids = torch.tensor([tokenizer.encode(prompt, **tok_kwargs)])
         mm_inputs = self.process_mm_inputs(mm_data, mm_kwargs, tok_kwargs)
 
-        return BatchFeature({
-            "input_ids": input_ids,
-            **mm_inputs,
-        })
+        return BatchFeature(
+            {
+                "input_ids": input_ids,
+                **mm_inputs,
+            }
+        )
 
     def _hf_processor_applies_updates(
         self,
@@ -877,22 +898,26 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-        placeholders = [("image", self.info.image_pattern),
-                        ("video", self.info.video_pattern)]
+        placeholders = [
+            ("image", self.info.image_pattern),
+            ("video", self.info.video_pattern),
+        ]
 
         # hard code for inconsistency of encode-decode image_pattern
         additional_placeholders = []
         tokenizer = self.info.get_tokenizer()
         for modality, pattern in placeholders:
             sub_pattern = tokenizer.decode(
-                tokenizer.encode(pattern, add_special_tokens=False))
+                tokenizer.encode(pattern, add_special_tokens=False)
+            )
             if sub_pattern != pattern:
                 additional_placeholders.append((modality, sub_pattern))
         placeholders += additional_placeholders
 
         def get_image_replacement(item_idx: int):
             images = mm_items.get_items(
-                "image", (MiniCPMVImageEmbeddingItems, ImageProcessorItems))
+                "image", (MiniCPMVImageEmbeddingItems, ImageProcessorItems)
+            )
 
             image_size = images.get_image_size(item_idx)
 
@@ -903,7 +928,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
         def get_video_replacement(item_idx: int):
             videos = mm_items.get_items(
-                "video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems))
+                "video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems)
+            )
 
             frame_size = videos.get_frame_size(item_idx)
             num_frames = videos.get_num_frames(item_idx)
@@ -919,9 +945,9 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         }
 
         return [
-            PromptReplacement(modality=modality,
-                              target=pattern,
-                              replacement=get_replacement[modality])
+            PromptReplacement(
+                modality=modality, target=pattern, replacement=get_replacement[modality]
+            )
             for modality, pattern in placeholders
         ]
 
@@ -958,7 +984,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
                         1,
                     ),
                     "<unk>",
-                ))
+                )
+            )
 
         return new_update
 
@@ -976,10 +1003,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     instantiated.
     """
 
+    merge_by_field_config = True
+
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "(<image>./</image>)"
         if modality.startswith("video"):
@@ -1001,69 +1030,50 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         self.multimodal_config = multimodal_config
 
         self.version = get_version_by_config(self.config)
-        self.llm = self.init_llm(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "llm"))
-        self.vpm = self.init_vision_module(config,
-                                           quant_config,
-                                           prefix=maybe_prefix(prefix, "vpm"))
-        self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else
-                           self.vpm.embeddings.embed_dim)
+        self.llm = self.init_llm(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "llm")
+        )
+        self.vpm = self.init_vision_module(
+            config, quant_config, prefix=maybe_prefix(prefix, "vpm")
+        )
+        self.vision_dim = (
+            self.vpm.embed_dim
+            if self.version == (2, 0)
+            else self.vpm.embeddings.embed_dim
+        )
         self.embed_dim = self.config.hidden_size
 
-        self.resampler = self.init_resampler(self.embed_dim,
-                                             self.vision_dim,
-                                             quant_config=quant_config,
-                                             prefix=maybe_prefix(
-                                                 prefix, "resampler"))
+        self.resampler = self.init_resampler(
+            self.embed_dim,
+            self.vision_dim,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "resampler"),
+        )
 
-        self.mm_token_ids = set[int]()
-        self.make_empty_intermediate_tensors = (
-            self.llm.make_empty_intermediate_tensors)
+        self.make_empty_intermediate_tensors = self.llm.make_empty_intermediate_tensors
 
     def _parse_and_validate_vision_input(
         self,
         modality: str,
         **kwargs: object,
-    ) -> Optional[MiniCPMVImageInputs]:
+    ) -> MiniCPMVImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is None and image_embeds is None:
             return None
 
-        image_token_id = kwargs.pop("image_token_id")
-        if image_token_id is not None:
-            assert isinstance(image_token_id, torch.Tensor)
-            self.mm_token_ids.add(image_token_id.flatten().unique().item())
-
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError(
-                    f"Incorrect type of image_embeds for {modality=}. "
-                    f"Got type: {type(image_embeds)}")
-
-            image_embeds_flat = flatten_bn(image_embeds)
-
             return MiniCPMVImageEmbeddingInputs(
                 type="image_embeds",
-                image_embeds=image_embeds_flat,
+                image_embeds=image_embeds,
             )
 
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError(
-                f"Incorrect type of pixel_values for {modality=}. "
-                f"Got type: {type(pixel_values)}")
-
         tgt_sizes = kwargs.pop("tgt_sizes")
-        if not isinstance(tgt_sizes, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of tgt_sizes for {modality=}. "
-                             f"Got type: {type(tgt_sizes)}")
-
-        num_slices = [[len(p) for p in ps] for ps in pixel_values]
-        num_slices_flat = flatten_bn(torch.tensor(num_slices))
 
-        pixel_values_flat = flatten_bn(flatten_2d_lists(pixel_values))
-        tgt_sizes_flat = flatten_bn(flatten_2d_lists(tgt_sizes), concat=True)
+        num_slices_flat = torch.tensor([len(ps) for ps in pixel_values])
+        pixel_values_flat = flatten_bn(pixel_values)
+        tgt_sizes_flat = flatten_bn(tgt_sizes, concat=True)
 
         return MiniCPMVImagePixelInputs(
             type="pixel_values",
@@ -1078,41 +1088,34 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values",
-                             "image_embeds") and "images" not in modalities:
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "images" not in modalities
+            ):
                 modalities["images"] = self._parse_and_validate_vision_input(
-                    "images", **kwargs)
-            if input_key in ("video_pixel_values",
-                             "video_embeds") and "videos" not in modalities:
-
-                def _image_key(video_key: str):
-                    if video_key == "video_token_id":
-                        return "image_token_id"
-
-                    return video_key.removeprefix("video_")
-
+                    "images", **kwargs
+                )
+            if (
+                input_key in ("video_pixel_values", "video_embeds")
+                and "videos" not in modalities
+            ):
                 modalities["videos"] = self._parse_and_validate_vision_input(
-                    "videos", **{
-                        _image_key(k): v
-                        for k, v in kwargs.items()
-                    })
+                    "videos", **{k.removeprefix("video_"): v for k, v in kwargs.items()}
+                )
 
         return modalities
 
     def _process_vision_input(
         self,
         image_input: MiniCPMVImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["image_embeds"]
 
         image_features_flat = self.get_vision_hidden_states(image_input)
 
         num_slices = image_input["num_slices"]
-        return [
-            e.flatten(0, 1)
-            for e in image_features_flat.split(num_slices.tolist())
-        ]
+        return [e.flatten(0, 1) for e in image_features_flat.split(num_slices.tolist())]
 
     def _process_multimodal_inputs(self, modalities: dict):
         # The result multimodal_embeddings is tuple of tensors, with each
@@ -1124,64 +1127,36 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                image_features = self._process_vision_input(image_input)
-                multimodal_embeddings += tuple(image_features)
+                image_embeddings = self._process_vision_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
-                video_features = self._process_vision_input(video_input)
-                multimodal_embeddings += tuple(video_features)
+                video_embeddings = self._process_vision_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
     def get_language_model(self) -> torch.nn.Module:
         return self.llm
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
 
         return self._process_multimodal_inputs(modalities)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            assert len(self.mm_token_ids) > 0
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                list(self.mm_token_ids),
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: Any,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         hidden_states = self.llm.model(
             input_ids=input_ids,
             positions=positions,
@@ -1193,11 +1168,10 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.llm.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
@@ -1205,9 +1179,9 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         """
         Get the module prefix in multimodal models
         """
-        return MultiModelKeys.from_string_field(language_model="llm",
-                                                connector="resampler",
-                                                tower_model="vpm")
+        return MultiModelKeys.from_string_field(
+            language_model="llm", connector="resampler", tower_model="vpm"
+        )
 
     def init_llm(
         self,
@@ -1219,25 +1193,25 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         raise NotImplementedError
 
-    def init_resampler(self,
-                       embed_dim: int,
-                       vision_dim: int,
-                       quant_config: Optional[QuantizationConfig] = None,
-                       prefix: str = "") -> nn.Module:
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
         raise NotImplementedError
 
-    def get_vision_hidden_states(
-            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
+    def get_vision_hidden_states(self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
         raise NotImplementedError
 
 
 class MiniCPMV2_0(MiniCPMVBaseModel):
-
     supports_encoder_tp_data = False
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -1254,7 +1228,7 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         # TODO: refactor vision model through timm wrapper from transformers
@@ -1274,8 +1248,10 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
 
         model = model.to(dtype=torch.get_default_dtype())
 
-        if (isinstance(model, timm.models.VisionTransformer)
-                and model.attn_pool is not None):
+        if (
+            isinstance(model, timm.models.VisionTransformer)
+            and model.attn_pool is not None
+        ):
             model.attn_pool = torch.nn.Identity()
 
         if self.config.drop_vision_last_layer:
@@ -1283,27 +1259,30 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
 
         return model
 
-    def init_resampler(self,
-                       embed_dim: int,
-                       vision_dim: int,
-                       quant_config: Optional[QuantizationConfig] = None,
-                       prefix: str = "") -> nn.Module:
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
-            resampler = Resampler2(embed_dim=embed_dim,
-                                   num_heads=embed_dim // 128,
-                                   grid_size=int(
-                                       math.sqrt(self.config.query_num)),
-                                   kv_dim=vision_dim,
-                                   adaptive=False,
-                                   do_post_projection=True,
-                                   quant_config=quant_config,
-                                   prefix=prefix)
-
-        return resampler.to(device=current_platform.device_type,
-                            dtype=torch.get_default_dtype())
-
-    def get_vision_hidden_states(
-            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
+            resampler = Resampler2(
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                grid_size=int(math.sqrt(self.config.query_num)),
+                kv_dim=vision_dim,
+                adaptive=False,
+                do_post_projection=True,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(
+            device=current_platform.device_type, dtype=torch.get_default_dtype()
+        )
+
+    def get_vision_hidden_states(self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
         pixel_values = data["pixel_values"]
 
         P_h, P_w = self.vpm.patch_embed.patch_size
@@ -1315,7 +1294,8 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
             H, W = pixel_value[0].shape[-2:]
             tgt_size = (math.ceil(H / P_h), math.ceil(W / P_w))
             vision_embedding = self.vpm.forward_features(
-                pixel_value.unsqueeze(0).type(dtype))
+                pixel_value.unsqueeze(0).type(dtype)
+            )
 
             if num_prefix_tokens > 0:
                 vision_embedding = vision_embedding[:, num_prefix_tokens:]
@@ -1351,7 +1331,7 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         model = Idefics2VisionTransformer(
@@ -1364,24 +1344,28 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
             model.encoder.layers = model.encoder.layers[:-1]
         return model
 
-    def init_resampler(self,
-                       embed_dim: int,
-                       vision_dim: int,
-                       quant_config: Optional[QuantizationConfig] = None,
-                       prefix: str = "") -> nn.Module:
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
-            resampler = Resampler2_5(num_queries=self.config.query_num,
-                                     embed_dim=embed_dim,
-                                     num_heads=embed_dim // 128,
-                                     kv_dim=vision_dim,
-                                     quant_config=quant_config,
-                                     prefix=prefix)
-
-        return resampler.to(device=current_platform.device_type,
-                            dtype=torch.get_default_dtype())
-
-    def get_vision_hidden_states(
-            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(
+            device=current_platform.device_type, dtype=torch.get_default_dtype()
+        )
+
+    def get_vision_hidden_states(self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
         pixel_values = data["pixel_values"]
         tgt_sizes = data["tgt_sizes"]
 
@@ -1391,9 +1375,7 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         device = pixel_values[0].device
         dtype = pixel_values[0].dtype
 
-        all_pixel_values = torch.zeros((B, 3, P, L),
-                                       dtype=dtype,
-                                       device=device)
+        all_pixel_values = torch.zeros((B, 3, P, L), dtype=dtype, device=device)
         for i, pixel_values_item in enumerate(pixel_values):
             L_item = pixel_values_item.shape[-1]
             all_pixel_values[i, ..., :L_item] = pixel_values_item
@@ -1402,9 +1384,7 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         max_patches = num_patches.max().item()
         assert isinstance(max_patches, int)
 
-        patch_attn_mask = torch.zeros((B, max_patches),
-                                      dtype=torch.bool,
-                                      device=device)
+        patch_attn_mask = torch.zeros((B, max_patches), dtype=torch.bool, device=device)
         for i, num_patches_item in enumerate(num_patches):
             patch_attn_mask[i, :num_patches_item] = True
 
@@ -1444,7 +1424,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         model = Idefics2VisionTransformer(
@@ -1457,25 +1437,29 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
             model.encoder.layers = model.encoder.layers[:-1]
         return model
 
-    def init_resampler(self,
-                       embed_dim: int,
-                       vision_dim: int,
-                       quant_config: Optional[QuantizationConfig] = None,
-                       prefix: str = "") -> nn.Module:
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
             # The resampler in 2.6 remains consistent with the one in 2.5.
-            resampler = Resampler2_5(num_queries=self.config.query_num,
-                                     embed_dim=embed_dim,
-                                     num_heads=embed_dim // 128,
-                                     kv_dim=vision_dim,
-                                     quant_config=quant_config,
-                                     prefix=prefix)
-
-        return resampler.to(device=current_platform.device_type,
-                            dtype=torch.get_default_dtype())
-
-    def get_vision_hidden_states(
-            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(
+            device=current_platform.device_type, dtype=torch.get_default_dtype()
+        )
+
+    def get_vision_hidden_states(self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
         pixel_values = data["pixel_values"]
         tgt_sizes = data["tgt_sizes"]
 
@@ -1485,9 +1469,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         device = pixel_values[0].device
         dtype = pixel_values[0].dtype
 
-        all_pixel_values = torch.zeros((B, 3, P, L),
-                                       dtype=dtype,
-                                       device=device)
+        all_pixel_values = torch.zeros((B, 3, P, L), dtype=dtype, device=device)
         for i, pixel_values_item in enumerate(pixel_values):
             L_item = pixel_values_item.shape[-1]
             all_pixel_values[i, ..., :L_item] = pixel_values_item
@@ -1496,9 +1478,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         max_patches = num_patches.max().item()
         assert isinstance(max_patches, int)
 
-        patch_attn_mask = torch.zeros((B, max_patches),
-                                      dtype=torch.bool,
-                                      device=device)
+        patch_attn_mask = torch.zeros((B, max_patches), dtype=torch.bool, device=device)
         for i, num_patches_item in enumerate(num_patches):
             patch_attn_mask[i, :num_patches_item] = True
 
@@ -1510,10 +1490,8 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
 
         return self.resampler(vision_embedding, tgt_sizes)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self,
-                                   skip_prefixes=["apm.", "audio", "tts"])
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self, skip_prefixes=["apm.", "audio", "tts"])
         return loader.load_weights(weights)
 
 
@@ -1534,11 +1512,6 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         assert self.version == (4, 0)
 
-    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
-        if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)):
-            return None
-        return quant_config
-
     def init_llm(
         self,
         vllm_config: VllmConfig,
@@ -1549,10 +1522,9 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
-        quant_config = self._maybe_ignore_quant_config(quant_config)
         model = Idefics2VisionTransformer(
             config.vision_config,
             quant_config=quant_config,
@@ -1567,24 +1539,25 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
-        quant_config = self._maybe_ignore_quant_config(quant_config)
         with set_default_torch_dtype(torch.float16):
             # The resampler in 4.0 remains consistent with the one in 2.5/2.6.
-            resampler = Resampler2_5(num_queries=self.config.query_num,
-                                     embed_dim=embed_dim,
-                                     num_heads=embed_dim // 128,
-                                     kv_dim=vision_dim,
-                                     quant_config=quant_config,
-                                     prefix=prefix)
-
-        return resampler.to(device=current_platform.device_type,
-                            dtype=torch.get_default_dtype())
-
-    def get_vision_hidden_states(
-            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(
+            device=current_platform.device_type, dtype=torch.get_default_dtype()
+        )
+
+    def get_vision_hidden_states(self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
         pixel_values = data["pixel_values"]
         tgt_sizes = data["tgt_sizes"]
 
@@ -1594,9 +1567,7 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
         device = pixel_values[0].device
         dtype = pixel_values[0].dtype
 
-        all_pixel_values = torch.zeros((B, 3, P, L),
-                                       dtype=dtype,
-                                       device=device)
+        all_pixel_values = torch.zeros((B, 3, P, L), dtype=dtype, device=device)
         for i, pixel_values_item in enumerate(pixel_values):
             L_item = pixel_values_item.shape[-1]
             all_pixel_values[i, ..., :L_item] = pixel_values_item
@@ -1605,9 +1576,7 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
         max_patches = num_patches.max().item()
         assert isinstance(max_patches, int)
 
-        patch_attn_mask = torch.zeros((B, max_patches),
-                                      dtype=torch.bool,
-                                      device=device)
+        patch_attn_mask = torch.zeros((B, max_patches), dtype=torch.bool, device=device)
         for i, num_patches_item in enumerate(num_patches):
             patch_attn_mask[i, :num_patches_item] = True
 
@@ -1619,10 +1588,8 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
 
         return self.resampler(vision_embedding, tgt_sizes)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self,
-                                   skip_prefixes=["apm.", "audio", "tts"])
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self, skip_prefixes=["apm.", "audio", "tts"])
         return loader.load_weights(weights)
 
 
@@ -1643,11 +1610,6 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         assert self.version == (4, 5)
 
-    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
-        if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)):
-            return None
-        return quant_config
-
     def init_llm(
         self,
         vllm_config: VllmConfig,
@@ -1658,10 +1620,9 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
-        quant_config = self._maybe_ignore_quant_config(quant_config)
         model = Idefics2VisionTransformer(
             config.vision_config,
             quant_config=quant_config,
@@ -1676,27 +1637,28 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
-        quant_config = self._maybe_ignore_quant_config(quant_config)
         with set_default_torch_dtype(torch.float16):
             # The resampler in 4.0 remains consistent with the one in 2.5/2.6.
-            resampler = Resampler4_5(num_queries=self.config.query_num,
-                                     embed_dim=embed_dim,
-                                     num_heads=embed_dim // 128,
-                                     kv_dim=vision_dim,
-                                     quant_config=quant_config,
-                                     prefix=prefix)
-
-        return resampler.to(device=current_platform.device_type,
-                            dtype=torch.get_default_dtype())
-
-    def get_vision_hidden_states(
-            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
+            resampler = Resampler4_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(
+            device=current_platform.device_type, dtype=torch.get_default_dtype()
+        )
+
+    def get_vision_hidden_states(self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
         pixel_values = data["pixel_values"]
         tgt_sizes = data["tgt_sizes"]
-        temporal_ids = data.get('temporal_ids', None)
+        temporal_ids = data.get("temporal_ids", None)
 
         B = len(pixel_values)
         P = pixel_values[0].shape[-2]
@@ -1704,11 +1666,10 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
         device = pixel_values[0].device
         dtype = pixel_values[0].dtype
 
-        all_pixel_values = torch.zeros((B, 3, P, L),
-                                       dtype=dtype,
-                                       device=device)
-        all_temporal_ids = None if temporal_ids is None else flatten_2d_lists(
-            temporal_ids)
+        all_pixel_values = torch.zeros((B, 3, P, L), dtype=dtype, device=device)
+        all_temporal_ids = (
+            None if temporal_ids is None else flatten_2d_lists(temporal_ids)
+        )
         for i, pixel_values_item in enumerate(pixel_values):
             L_item = pixel_values_item.shape[-1]
             all_pixel_values[i, ..., :L_item] = pixel_values_item
@@ -1717,9 +1678,7 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
         max_patches = num_patches.max().item()
         assert isinstance(max_patches, int)
 
-        patch_attn_mask = torch.zeros((B, max_patches),
-                                      dtype=torch.bool,
-                                      device=device)
+        patch_attn_mask = torch.zeros((B, max_patches), dtype=torch.bool, device=device)
         for i, num_patches_item in enumerate(num_patches):
             patch_attn_mask[i, :num_patches_item] = True
 
@@ -1731,10 +1690,8 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
 
         return self.resampler(vision_embedding, tgt_sizes, all_temporal_ids)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self,
-                                   skip_prefixes=["apm.", "audio", "tts"])
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self, skip_prefixes=["apm.", "audio", "tts"])
         return loader.load_weights(weights)
 
 
@@ -1750,7 +1707,8 @@ _SUPPORT_VERSION = {
 @MULTIMODAL_REGISTRY.register_processor(
     MiniCPMVMultiModalProcessor,
     info=MiniCPMVProcessingInfo,
-    dummy_inputs=MiniCPMVDummyInputsBuilder)
+    dummy_inputs=MiniCPMVDummyInputsBuilder,
+)
 class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
     """
     Different versions of MiniCPMV use different visual encoders and LLMs,
@@ -1772,9 +1730,12 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
         instance_cls = _SUPPORT_VERSION.get(version)
         if instance_cls is None:
             supported_versions = ", ".join(
-                [f"{v[0]}.{v[1]}" for v in sorted(_SUPPORT_VERSION.keys())])
-            raise ValueError(f"Currently, MiniCPMV only supports versions "
-                             f"{supported_versions}. Got version: {version}")
+                [f"{v[0]}.{v[1]}" for v in sorted(_SUPPORT_VERSION.keys())]
+            )
+            raise ValueError(
+                f"Currently, MiniCPMV only supports versions "
+                f"{supported_versions}. Got version: {version}"
+            )
 
         # quant_config references base class members,
         # so update values before init is called
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
new file mode 100644
index 0000000000000000000000000000000000000000..49d2f2d261969b049c56b9e9e12b384898c7a02e
--- /dev/null
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -0,0 +1,552 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The MiniMax AI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniMaxM2 model."""
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+
+class MiniMaxM2MoE(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_local_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_local_experts}."
+            )
+        self.use_routing_bias = getattr(config, "use_routing_bias", False)
+        if self.use_routing_bias:
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.num_local_experts, dtype=torch.float32)
+            )
+            self.e_score_correction_bias.weight_loader = (
+                MiniMaxM2MoE.ebias_weight_loader
+            )
+        else:
+            self.e_score_correction_bias = None
+
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            scoring_func=config.scoring_func,
+            use_grouped_topk=True,
+            num_expert_group=1,
+            topk_group=1,
+            e_score_correction_bias=self.e_score_correction_bias,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            reduce_results=False,
+            renormalize=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+    @staticmethod
+    def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None:
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight.to(torch.float32))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states.to(torch.float32))
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+        final_hidden_states = final_hidden_states
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class MiniMaxM2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rotary_dim: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        attn_window_size: int | None = None,
+        max_position_embeddings: int = 8192,
+        head_dim: int | None = None,
+        rms_norm_eps: float = 1e-06,
+        qkv_bias: bool = False,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            per_layer_sliding_window=attn_window_size,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        self.q_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_heads, eps=rms_norm_eps
+        )
+        self.k_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniMaxM2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        model_config: ModelConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
+            max_position_embeddings = max(
+                config.max_position_embeddings, config.max_model_len
+            )
+        # DecoderLayers are created with `make_layers` which passes the prefix
+        # with the layer's index.
+        layer_idx = int(prefix.split(sep=".")[-1])
+
+        self.layer_idx = layer_idx
+        self.self_attn = MiniMaxM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rotary_dim=config.rotary_dim,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        self.block_sparse_moe = MiniMaxM2MoE(
+            config=config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+
+        hidden_states = self.block_sparse_moe(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class MiniMaxM2Model(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=None,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: MiniMaxM2DecoderLayer(
+                config,
+                prefix,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = self.get_expert_mapping()
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class MiniMaxM2ForCausalLM(nn.Module, SupportsPP):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        if hasattr(vllm_config.model_config, "max_model_len"):
+            self.config.max_model_len = vllm_config.model_config.max_model_len
+        self.model = MiniMaxM2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=None
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: PretrainedConfig, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_mtp_modules):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+                return layer_idx + i
+    return None
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index cc9a959f633135e22ca1bfcfc64f591860251151..bf1ecc822756d636f18974fc2ce35484a541378d 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only MiniMaxText01 model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     pass
 
 import regex as re
 import torch
-import torch.distributed
 from torch import nn
 from transformers import MiniMaxConfig
 
@@ -18,26 +18,32 @@ from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed.parallel_state import (
-    get_pp_group, get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size)
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mamba.linear_attn import (
-    MiniMaxText01LinearAttention)
+from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01LinearAttention
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.sequence import IntermediateTensors
@@ -46,25 +52,22 @@ from .interfaces import HasInnerState, IsHybrid
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
 
 
-def replace_weight_name(name: str,
-                        key: str = None,
-                        to: str = None,
-                        count: int = None,
-                        prefix: str = None) -> str:
-    name = name.replace(key, to) if count is None else \
-        name.replace(key, to, count)
+def replace_weight_name(
+    name: str, key: str = None, to: str = None, count: int = None, prefix: str = None
+) -> str:
+    name = name.replace(key, to) if count is None else name.replace(key, to, count)
     return name
 
 
 def weight_loader_with_alias(alias: str):
-
     def wrapper(func: callable):
-
-        def inner_func(param: torch.Tensor,
-                       loaded_weight: torch.Tensor,
-                       *args,
-                       prefix: str = None,
-                       **kwargs):
+        def inner_func(
+            param: torch.Tensor,
+            loaded_weight: torch.Tensor,
+            *args,
+            prefix: str = None,
+            **kwargs,
+        ):
             value = func(param, loaded_weight, *args, **kwargs)
             return value
 
@@ -74,12 +77,11 @@ def weight_loader_with_alias(alias: str):
 
 
 class MiniMaxText01MLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         layer_idx: int = None,
         prefix: str = "mlp",
     ) -> None:
@@ -104,7 +106,6 @@ class MiniMaxText01MLP(nn.Module):
         return
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
@@ -112,16 +113,15 @@ class MiniMaxText01MLP(nn.Module):
 
 
 class MiniMaxText01MoE(nn.Module):
-
     def __init__(
         self,
         num_experts: int,
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
         layer_idx: int = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "moe",
     ) -> None:
         super().__init__()
@@ -163,8 +163,7 @@ class MiniMaxText01MoE(nn.Module):
         return
 
     @staticmethod
-    def gate_weight_loader(param: nn.Parameter,
-                           loaded_weight: torch.Tensor) -> None:
+    def gate_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None:
         assert param.size() == loaded_weight.size()
         param.data.copy_(loaded_weight.to(torch.float32))
         return
@@ -174,13 +173,13 @@ class MiniMaxText01MoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         router_logits_fp32, _ = self.gate(hidden_states.to(torch.float32))
         final_hidden_states = self.experts(
-            hidden_states, router_logits_fp32.to(hidden_states.dtype))
+            hidden_states, router_logits_fp32.to(hidden_states.dtype)
+        )
         final_hidden = final_hidden_states.view(num_tokens, hidden_size)
         return final_hidden
 
 
 class MiniMaxText01Attention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -190,10 +189,10 @@ class MiniMaxText01Attention(nn.Module):
         rotary_dim: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        sliding_window: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        sliding_window: int | None = None,
+        quant_config: QuantizationConfig | None = None,
         layer_idx: int = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "mha",
     ) -> None:
         super().__init__()
@@ -254,8 +253,13 @@ class MiniMaxText01Attention(nn.Module):
         )
         return
 
-    def forward(self, hidden_states: torch.Tensor, output: torch.Tensor,
-                positions: torch.Tensor, **kwargs) -> None:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        positions: torch.Tensor,
+        **kwargs,
+    ) -> None:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
@@ -264,16 +268,15 @@ class MiniMaxText01Attention(nn.Module):
 
 
 class MiniMaxText01DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: MiniMaxConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         expert_num: int = 1,
         layer_id: int = None,
-        linear_layer_id: Optional[int] = None,
+        linear_layer_id: int | None = None,
         prefix: str = "decoder",
     ) -> None:
         self._ilayer = layer_id
@@ -289,14 +292,17 @@ class MiniMaxText01DecoderLayer(nn.Module):
         head_dim = getattr(config, "head_dim", None)
         if head_dim is None:
             head_dim = config.hidden_size // config.num_attention_heads
-        if hasattr(config, "max_model_len") and isinstance(
-                config.max_model_len, int):
-            max_position_embeddings = min(config.max_position_embeddings,
-                                          config.max_model_len)
+        if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
+            max_position_embeddings = min(
+                config.max_position_embeddings, config.max_model_len
+            )
         if config.attention_type == 0:
             use_headxdim = True
-            hidden_inner = (head_dim * config.num_attention_heads
-                            if use_headxdim else config.hidden_size)
+            hidden_inner = (
+                head_dim * config.num_attention_heads
+                if use_headxdim
+                else config.hidden_size
+            )
             self.self_attn = MiniMaxText01LinearAttention(
                 hidden_size=self.hidden_size,
                 hidden_inner_size=hidden_inner,
@@ -310,14 +316,16 @@ class MiniMaxText01DecoderLayer(nn.Module):
                 quant_config=quant_config,
                 layer_idx=self._ilayer,
                 linear_layer_idx=linear_layer_id,
-                prefix=prefix)
+                prefix=prefix,
+            )
         elif config.attention_type == 1:
             self.self_attn = MiniMaxText01Attention(
                 hidden_size=self.hidden_size,
                 num_heads=config.num_attention_heads,
                 head_dim=head_dim,
                 rotary_dim=config.rotary_dim
-                if hasattr(config, "rotary_dim") else head_dim,
+                if hasattr(config, "rotary_dim")
+                else head_dim,
                 num_kv_heads=config.num_key_value_heads,
                 max_position=max_position_embeddings,
                 rope_theta=rope_theta,
@@ -325,10 +333,12 @@ class MiniMaxText01DecoderLayer(nn.Module):
                 quant_config=quant_config,
                 layer_idx=self._ilayer,
                 cache_config=cache_config,
-                prefix=prefix)
+                prefix=prefix,
+            )
         else:
             raise ValueError(
-                f"Unsupported attention type: {self.config.attention_type}")
+                f"Unsupported attention type: {self.config.attention_type}"
+            )
 
         if expert_num == 1:
             self.mlp = MiniMaxText01MLP(
@@ -336,7 +346,8 @@ class MiniMaxText01DecoderLayer(nn.Module):
                 intermediate_size=config.intermediate_size,
                 quant_config=quant_config,
                 layer_idx=self._ilayer,
-                prefix=prefix)
+                prefix=prefix,
+            )
         else:
             self.block_sparse_moe = MiniMaxText01MoE(
                 num_experts=expert_num,
@@ -345,39 +356,51 @@ class MiniMaxText01DecoderLayer(nn.Module):
                 intermediate_size=config.intermediate_size,
                 layer_idx=self._ilayer,
                 quant_config=quant_config,
-                prefix=prefix)
+                prefix=prefix,
+            )
 
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
         if config.attention_type == 0:
             self.layernorm_attention_alpha = getattr(
-                config, 'layernorm_linear_attention_alpha',
-                getattr(config, 'linear_attn_alpha_factor', 1))
+                config,
+                "layernorm_linear_attention_alpha",
+                getattr(config, "linear_attn_alpha_factor", 1),
+            )
             self.layernorm_attention_beta = getattr(
-                config, 'layernorm_linear_attention_beta',
-                getattr(config, 'linear_attn_beta_factor', 1))
+                config,
+                "layernorm_linear_attention_beta",
+                getattr(config, "linear_attn_beta_factor", 1),
+            )
         else:
             self.layernorm_attention_alpha = getattr(
-                config, 'layernorm_full_attention_alpha',
-                getattr(config, 'full_attn_alpha_factor', 1))
+                config,
+                "layernorm_full_attention_alpha",
+                getattr(config, "full_attn_alpha_factor", 1),
+            )
             self.layernorm_attention_beta = getattr(
-                config, 'layernorm_full_attention_beta',
-                getattr(config, 'full_attn_beta_factor', 1))
+                config,
+                "layernorm_full_attention_beta",
+                getattr(config, "full_attn_beta_factor", 1),
+            )
         self.layernorm_mlp_alpha = getattr(
-            config, 'layernorm_mlp_alpha',
-            getattr(config, 'mlp_alpha_factor', 1))
+            config, "layernorm_mlp_alpha", getattr(config, "mlp_alpha_factor", 1)
+        )
         self.layernorm_mlp_beta = getattr(
-            config, 'layernorm_mlp_beta', getattr(config, 'mlp_beta_factor',
-                                                  1))
-        self.postnorm = getattr(config, 'postnorm', False)
+            config, "layernorm_mlp_beta", getattr(config, "mlp_beta_factor", 1)
+        )
+        self.postnorm = getattr(config, "postnorm", False)
         self.shared_moe = False
 
-        shared_intermediate = getattr(config, 'shared_intermediate_size', 0)
+        shared_intermediate = getattr(config, "shared_intermediate_size", 0)
         if isinstance(shared_intermediate, list):
-            shared_intermediate = shared_intermediate[
-                layer_id] if layer_id < len(shared_intermediate) else 0
+            shared_intermediate = (
+                shared_intermediate[layer_id]
+                if layer_id < len(shared_intermediate)
+                else 0
+            )
         if shared_intermediate > 0:
             self.shared_moe = True
             self.shared_mlp = MiniMaxText01MLP(
@@ -385,7 +408,8 @@ class MiniMaxText01DecoderLayer(nn.Module):
                 intermediate_size=shared_intermediate,
                 quant_config=quant_config,
                 layer_idx=self._ilayer,
-                prefix=prefix)
+                prefix=prefix,
+            )
             self.coefficient = ReplicatedLinear(
                 self.hidden_size,
                 1,
@@ -393,20 +417,19 @@ class MiniMaxText01DecoderLayer(nn.Module):
                 quant_config=quant_config,
                 params_dtype=torch.float32,
             )
-            self.coefficient.weight.weight_loader = (
-                self.shared_moe_coefficient_loader)
-            self.shared_moe_mode = getattr(config, 'shared_moe_mode',
-                                           'softmax')
+            self.coefficient.weight.weight_loader = self.shared_moe_coefficient_loader
+            self.shared_moe_mode = getattr(config, "shared_moe_mode", "softmax")
         return
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                positions: torch.Tensor,
-                attn_metadata: AttentionMetadata,
-                residual: Optional[torch.Tensor],
-                is_warmup: bool = False,
-                **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
-
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: torch.Tensor | None,
+        is_warmup: bool = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         layernorm_input = hidden_states
         layernorm_output = self.input_layernorm(layernorm_input)
         residual = layernorm_output if self.postnorm else layernorm_input
@@ -418,8 +441,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
         )
 
         residual = residual * self.layernorm_attention_alpha
-        self_attention_output = (self_attention_output *
-                                 self.layernorm_attention_beta)
+        self_attention_output = self_attention_output * self.layernorm_attention_beta
 
         layernorm_input = residual + self_attention_output
         layernorm_output = self.post_attention_layernorm(layernorm_input)
@@ -433,19 +455,16 @@ class MiniMaxText01DecoderLayer(nn.Module):
             if self.shared_moe:
                 before_moe_dtype = layernorm_output.dtype
                 moe_hidden_fp32 = moe_hidden_states.to(torch.float32)
-                output_mlp = self.shared_mlp(layernorm_output).to(
-                    torch.float32)
+                output_mlp = self.shared_mlp(layernorm_output).to(torch.float32)
 
                 coef, _ = self.coefficient(layernorm_output.to(torch.float32))
 
-                if self.shared_moe_mode == 'softmax':
+                if self.shared_moe_mode == "softmax":
                     coef = torch.nn.functional.softmax(coef, dim=-1)
-                    hidden_states = moe_hidden_fp32 * (
-                        1 - coef) + output_mlp * coef
-                elif self.shared_moe_mode == 'sigmoid':
+                    hidden_states = moe_hidden_fp32 * (1 - coef) + output_mlp * coef
+                elif self.shared_moe_mode == "sigmoid":
                     coef = torch.nn.functional.sigmoid(coef)
-                    hidden_states = moe_hidden_fp32 * (
-                        1 - coef) + output_mlp * coef
+                    hidden_states = moe_hidden_fp32 * (1 - coef) + output_mlp * coef
 
                 hidden_states = hidden_states.to(before_moe_dtype)
             else:
@@ -459,8 +478,9 @@ class MiniMaxText01DecoderLayer(nn.Module):
         return hidden_states, None
 
     @staticmethod
-    def shared_moe_coefficient_loader(param: torch.Tensor,
-                                      loaded_weight: torch.Tensor) -> None:
+    def shared_moe_coefficient_loader(
+        param: torch.Tensor, loaded_weight: torch.Tensor
+    ) -> None:
         assert param.size() == loaded_weight.size()
 
         param.data.copy_(loaded_weight.to(torch.float32))
@@ -469,7 +489,6 @@ class MiniMaxText01DecoderLayer(nn.Module):
 
 @support_torch_compile
 class MiniMaxText01Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config: MiniMaxConfig = vllm_config.model_config.hf_config
@@ -482,8 +501,8 @@ class MiniMaxText01Model(nn.Module):
         self.vocab_size = config.vocab_size
 
         self.decoder_attention_types = getattr(
-            config, "attn_type_list", False) or getattr(
-                config, "decoder_attention_types", False)
+            config, "attn_type_list", False
+        ) or getattr(config, "decoder_attention_types", False)
         # The HF format uses "layer_types" instead of "attn_type_list"
         # where "linear_attention" is 0 and "full_attention" is 1
         if not self.decoder_attention_types and hasattr(config, "layer_types"):
@@ -511,50 +530,57 @@ class MiniMaxText01Model(nn.Module):
             self.embed_tokens = PPMissingLayer()
 
         def layer_fn(prefix):
-            layer_idx = int(prefix.split('.')[-1])
+            layer_idx = int(prefix.split(".")[-1])
             layer_config = config
-            layer_config.attention_type = self.decoder_attention_types[
-                layer_idx]
+            layer_config.attention_type = self.decoder_attention_types[layer_idx]
             layer_config.layer_idx = layer_idx
 
             decoder_kwargs = {
                 "quant_config": quant_config,
                 "layer_id": layer_idx,
                 "model_config": model_config,
-                "cache_config": cache_config
+                "cache_config": cache_config,
             }
 
             if layer_config.attention_type == 0:
                 decoder_kwargs["linear_layer_id"] = sum(
-                    1 for i in range(layer_idx)
-                    if self.decoder_attention_types[i] == 0)
+                    1 for i in range(layer_idx) if self.decoder_attention_types[i] == 0
+                )
             else:
                 decoder_kwargs["linear_layer_id"] = None
 
             if hasattr(config, "num_local_experts") and isinstance(
-                    config.num_local_experts, list):
-                decoder_kwargs["expert_num"] = config.num_local_experts[
-                    layer_idx]
+                config.num_local_experts, list
+            ):
+                decoder_kwargs["expert_num"] = config.num_local_experts[layer_idx]
             elif hasattr(config, "num_local_experts") and isinstance(
-                    config.num_local_experts, int):
+                config.num_local_experts, int
+            ):
                 decoder_kwargs["expert_num"] = config.num_local_experts
             else:
                 decoder_kwargs["expert_num"] = 1
 
-            return MiniMaxText01DecoderLayer(layer_config,
-                                             **decoder_kwargs,
-                                             prefix=prefix)
+            return MiniMaxText01DecoderLayer(
+                layer_config, **decoder_kwargs, prefix=prefix
+            )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, layer_fn, prefix=f"{prefix}.layers")
+            config.num_hidden_layers, layer_fn, prefix=f"{prefix}.layers"
+        )
 
-        linear_layer_nums = sum(1 for i in range(config.num_hidden_layers)
-                                if self.decoder_attention_types[i] == 0)
+        linear_layer_nums = sum(
+            1
+            for i in range(config.num_hidden_layers)
+            if self.decoder_attention_types[i] == 0
+        )
         max_slots_number = scheduler_config.max_num_seqs
-        self.cache_shape = (linear_layer_nums, max_slots_number,
-                            config.num_attention_heads //
-                            get_tensor_model_parallel_world_size(),
-                            config.head_dim, config.head_dim)
+        self.cache_shape = (
+            linear_layer_nums,
+            max_slots_number,
+            config.num_attention_heads // get_tensor_model_parallel_world_size(),
+            config.head_dim,
+            config.head_dim,
+        )
         _dummy = torch.zeros(1)
         self._dtype = _dummy.dtype
         del _dummy
@@ -569,12 +595,12 @@ class MiniMaxText01Model(nn.Module):
         self.embed_scale = 1.0
         return
 
-    def _clear_prefill_cache(self, attn_metadata,
-                             minimax_cache_tensors: torch.Tensor, **kwargs):
+    def _clear_prefill_cache(
+        self, attn_metadata, minimax_cache_tensors: torch.Tensor, **kwargs
+    ):
         seq_to_slot_maps = {}
         seq_id_map = sum(list(kwargs["request_ids_to_seq_ids"].values()), [])
-        for _, seq_to_slot_map in (
-                self.minimax_cache.cache_indices_mapping.items()):
+        for _, seq_to_slot_map in self.minimax_cache.cache_indices_mapping.items():
             seq_to_slot_maps.update(seq_to_slot_map)
 
         slots_to_clear = []
@@ -582,28 +608,29 @@ class MiniMaxText01Model(nn.Module):
             if _prefill_id >= len(seq_id_map):
                 break
             seq_id = seq_id_map[_prefill_id]
-            if attn_metadata.context_lens_tensor[
-                    _prefill_id] == 0 and seq_id in seq_to_slot_maps:
+            if (
+                attn_metadata.context_lens_tensor[_prefill_id] == 0
+                and seq_id in seq_to_slot_maps
+            ):
                 slots_to_clear.append(seq_to_slot_maps[seq_id])
 
         if slots_to_clear:
-            slots_tensor = torch.tensor(slots_to_clear,
-                                        device=minimax_cache_tensors.device,
-                                        dtype=torch.long)
+            slots_tensor = torch.tensor(
+                slots_to_clear, device=minimax_cache_tensors.device, dtype=torch.long
+            )
             minimax_cache_tensors[:, slots_tensor, ...] = 0
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-    ) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
-    def forward(self,
-                input_ids: Optional[torch.Tensor],
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs) -> Union[torch.Tensor, IntermediateTensors]:
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
         forward_context = get_forward_context()
         attn_metadata = forward_context.attn_metadata
 
@@ -626,10 +653,9 @@ class MiniMaxText01Model(nn.Module):
                 residual=residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         if residual is not None:
             hidden_states, _ = self.norm(hidden_states, residual)
         else:
@@ -639,68 +665,64 @@ class MiniMaxText01Model(nn.Module):
 
 
 class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
-
         super().__init__()
         config = vllm_config.model_config.hf_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
-        self.lora_config = lora_config
 
         if not hasattr(config, "sliding_window"):
             config.sliding_window = None
 
         self.CONCAT_FFN = True
 
-        self.unpadded_vocab_size = self.config.vocab_size
         if hasattr(vllm_config.model_config, "max_model_len"):
             self.config.max_model_len = vllm_config.model_config.max_model_len
-        self.model = MiniMaxText01Model(vllm_config=vllm_config,
-                                        prefix=maybe_prefix(prefix, "model"))
+        self.model = MiniMaxText01Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 self.config.hidden_size,
-                org_num_embeddings=self.config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
 
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    self.config.vocab_size)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, self.config.vocab_size
+            )
 
         else:
             self.lm_head = PPMissingLayer()
         self.lm_head.float()
         flash_layer_count = sum(
-            1 for attn_type in self.model.decoder_attention_types
-            if attn_type == 1)
+            1 for attn_type in self.model.decoder_attention_types if attn_type == 1
+        )
         self.kv_cache = [torch.tensor([]) for _ in range(flash_layer_count)]
         return
 
     def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
         return self.model.minimax_cache.copy_inputs_before_cuda_graphs(
-            input_buffers, **kwargs)
+            input_buffers, **kwargs
+        )
 
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
-        return self.model.minimax_cache.get_seqlen_agnostic_capture_inputs(
-            batch_size)
+        return self.model.minimax_cache.get_seqlen_agnostic_capture_inputs(batch_size)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-    def get_input_embeddings(
+    def forward(
         self,
         input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
     ) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds, **kwargs)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
+        )
 
         return hidden_states
 
@@ -710,21 +732,20 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         return logits
 
     def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-            "residual":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+                "residual": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
 
@@ -736,7 +757,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
 
         def is_linear_attn_layer(layer_idx: int) -> bool:
             if layer_idx is None or layer_idx >= len(
-                    self.model.decoder_attention_types):
+                self.model.decoder_attention_types
+            ):
                 return False
             return self.model.decoder_attention_types[layer_idx] == 0
 
@@ -744,39 +766,48 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
             return "block_sparse_moe" in name and not name.endswith(".bias")
 
         def get_expert_id(param_name):
-            pattern = r'model\.layers\.\d+\.block_sparse_moe\.experts\.(\d+)\.'
+            pattern = r"model\.layers\.\d+\.block_sparse_moe\.experts\.(\d+)\."
             match = re.search(pattern, param_name)
             if match:
                 return match.group(1)
             return None
 
-        def load_sparse_moe_weight(name: str, loaded_weight: torch.Tensor,
-                                   self) -> None:
+        def load_sparse_moe_weight(
+            name: str, loaded_weight: torch.Tensor, self
+        ) -> None:
             if isinstance(self.config.num_local_experts, list):
                 expert_params_mapping = [
-                    ("w13_weight"
-                     if weight_name in ["w1", "w3"] else "w2_weight",
-                     f"experts.{expert_id}.{weight_name}.weight", expert_id)
+                    (
+                        "w13_weight" if weight_name in ["w1", "w3"] else "w2_weight",
+                        f"experts.{expert_id}.{weight_name}.weight",
+                        expert_id,
+                    )
                     for expert_id in range(max(self.config.num_local_experts))
                     for weight_name in ["w1", "w2", "w3"]
                 ]
             else:
                 expert_params_mapping = [
-                    ("w13_scale" if weight_name in ["w1", "w3"] else
-                     "w2_scale", f"{expert_id}.{weight_name}.weight_scale",
-                     expert_id, weight_name)
+                    (
+                        "w13_scale" if weight_name in ["w1", "w3"] else "w2_scale",
+                        f"{expert_id}.{weight_name}.weight_scale",
+                        expert_id,
+                        weight_name,
+                    )
+                    for expert_id in range(self.config.num_local_experts)
+                    for weight_name in ["w1", "w2", "w3"]
+                ] + [
+                    (
+                        "w13_weight" if weight_name in ["w1", "w3"] else "w2_weight",
+                        f"{expert_id}.{weight_name}.weight",
+                        expert_id,
+                        weight_name,
+                    )
                     for expert_id in range(self.config.num_local_experts)
                     for weight_name in ["w1", "w2", "w3"]
-                ] + [("w13_weight" if weight_name in ["w1", "w3"] else
-                      "w2_weight", f"{expert_id}.{weight_name}.weight",
-                      expert_id, weight_name)
-                     for expert_id in range(self.config.num_local_experts)
-                     for weight_name in ["w1", "w2", "w3"]]
-            for (param_name, weight_name, expert_id,
-                 shard_id) in expert_params_mapping:
+                ]
+            for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
                 name_expert_id = get_expert_id(name)
-                if name_expert_id is not None and int(name_expert_id) != int(
-                        expert_id):
+                if name_expert_id is not None and int(name_expert_id) != int(expert_id):
                     continue
                 if weight_name not in name:
                     continue
@@ -786,19 +817,20 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader = weight_loader_with_alias(name)(weight_loader)
-                weight_loader(param,
-                              loaded_weight,
-                              weight_name,
-                              expert_id=expert_id,
-                              shard_id=shard_id)
+                weight_loader(
+                    param,
+                    loaded_weight,
+                    weight_name,
+                    expert_id=expert_id,
+                    shard_id=shard_id,
+                )
                 loaded_params.add(name)
                 break
             else:
                 if is_pp_missing_parameter(name, self):
                     return
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader = weight_loader_with_alias(name)(weight_loader)
                 weight_loader(param, loaded_weight)
                 loaded_params.add(name)
@@ -807,8 +839,9 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         def is_shared_mlp_weight(name: str) -> bool:
             return "shared_mlp" in name and not name.endswith(".bias")
 
-        def load_shared_mlp_weight(name: str, loaded_weight: torch.Tensor,
-                                   self) -> None:
+        def load_shared_mlp_weight(
+            name: str, loaded_weight: torch.Tensor, self
+        ) -> None:
             if not self.CONCAT_FFN:
                 if "gate_proj" in name:
                     name = name.replace("gate_proj", "w1", 1)
@@ -826,8 +859,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
             if is_pp_missing_parameter(name, self):
                 return
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader = weight_loader_with_alias(name)(weight_loader)
             if not self.CONCAT_FFN:
                 weight_loader(param, loaded_weight)
@@ -837,31 +869,31 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
                 elif "down_proj" in name:
                     weight_loader(param, loaded_weight)
                 else:
-                    raise AssertionError(
-                        "MLP weight not in [gate_up_proj, down_proj]")
+                    raise AssertionError("MLP weight not in [gate_up_proj, down_proj]")
             loaded_params.add(name)
             return
 
         def is_mha_weight(name: str) -> bool:
             return "self_attn" in name and not name.endswith(".bias")
 
-        def load_linear_attn_weight(name: str, loaded_weight: torch.Tensor,
-                                    self) -> None:
+        def load_linear_attn_weight(
+            name: str, loaded_weight: torch.Tensor, self
+        ) -> None:
             if is_pp_missing_parameter(name, self):
                 return
             param = params_dict[name]
 
             weight_loader = getattr(
-                param, "weight_loader",
-                MiniMaxText01LinearAttention.weight_direct_load)
+                param, "weight_loader", MiniMaxText01LinearAttention.weight_direct_load
+            )
             weight_loader = weight_loader_with_alias(name)(weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
             return
 
-        def load_flash_attn_weight(name: str, loaded_weight: torch.Tensor,
-                                   self) -> None:
-
+        def load_flash_attn_weight(
+            name: str, loaded_weight: torch.Tensor, self
+        ) -> None:
             flash_mha_params_mapping = [
                 ("qkv_proj", "q_proj", "q"),
                 ("qkv_proj", "k_proj", "k"),
@@ -869,16 +901,14 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
                 ("gate_up_proj", "gate_proj", 0),
                 ("gate_up_proj", "up_proj", 1),
             ]
-            for (param_name, weight_name,
-                 shard_id) in flash_mha_params_mapping:
+            for param_name, weight_name, shard_id in flash_mha_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
                 if is_pp_missing_parameter(name, self):
                     return
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader = weight_loader_with_alias(name)(weight_loader)
                 weight_loader(param, loaded_weight, shard_id)
                 loaded_params.add(name)
@@ -888,36 +918,32 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
                     return
                 param = params_dict[name]
 
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader = weight_loader_with_alias(name)(weight_loader)
                 weight_loader(param, loaded_weight)
                 loaded_params.add(name)
             return
 
         def is_layer_norm_weight(name: str) -> bool:
-            return "norm" in name and not name.endswith(
-                ".bias") and name in params_dict
+            return "norm" in name and not name.endswith(".bias") and name in params_dict
 
-        def load_layer_norm_weight(name: str, loaded_weight: torch.Tensor,
-                                   self) -> None:
+        def load_layer_norm_weight(
+            name: str, loaded_weight: torch.Tensor, self
+        ) -> None:
             if is_pp_missing_parameter(name, self):
                 return
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader = weight_loader_with_alias(name)(weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
             return
 
-        def load_basic_weight(name: str, loaded_weight: torch.Tensor,
-                              self) -> None:
+        def load_basic_weight(name: str, loaded_weight: torch.Tensor, self) -> None:
             if is_pp_missing_parameter(name, self):
                 return
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader = weight_loader_with_alias(name)(weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
@@ -926,7 +952,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         for name, loaded_weight in weights:
             weight_at_layer = which_layer(name)
             if weight_at_layer and weight_at_layer >= len(
-                    self.model.decoder_attention_types):
+                self.model.decoder_attention_types
+            ):
                 continue
 
             if is_layer_norm_weight(name):
@@ -956,7 +983,6 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.linear_attention_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index d81ac8c704e79eb12da362d28e9726721708f9ae..0939a72ba53ec9e5314ed2a04e3f1ff36ad2f7c9 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -1,34 +1,40 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
 from transformers import BatchFeature, PretrainedConfig
 from transformers.models.llava_next.modeling_llava_next import (
-    get_anyres_image_grid_shape, unpad_image)
+    get_anyres_image_grid_shape,
+    unpad_image,
+)
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig
 from vllm.sequence import IntermediateTensors
-from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .llava import (BaseLlavaMultiModalProcessor, LlavaDummyInputsBuilder,
-                    init_vision_tower_for_llava)
+from .llava import (
+    BaseLlavaMultiModalProcessor,
+    LlavaDummyInputsBuilder,
+    init_vision_tower_for_llava,
+)
 from .llava_next import LlavaNextProcessingInfo
 from .pixtral import PixtralHFVisionModel
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 
 class MiniMaxVL01ImagePixelInputs(TensorSchema):
@@ -43,12 +49,14 @@ class MiniMaxVL01ImagePixelInputs(TensorSchema):
     Note that `num_patches` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np", "h", "w"})]
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np", "h", "w"}),
+    ]
 
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
     # This should be in `(height, width)` format.
 
 
@@ -59,36 +67,43 @@ class MiniMaxVL01ImageEmbeddingInputs(TensorSchema):
         - ifs: Image feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-MiniMaxVL01ImageInputs = Union[MiniMaxVL01ImagePixelInputs,
-                               MiniMaxVL01ImageEmbeddingInputs]
+MiniMaxVL01ImageInputs: TypeAlias = (
+    MiniMaxVL01ImagePixelInputs | MiniMaxVL01ImageEmbeddingInputs
+)
 
 
 class MiniMaxVL01MultiModalProjector(nn.Module):
-
-    def __init__(self,
-                 vision_hidden_size: int,
-                 text_hidden_size: int,
-                 projector_hidden_act: str,
-                 multimodal_projector_bias: bool,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        projector_hidden_act: str,
+        multimodal_projector_bias: bool,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
-        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
-                                             text_hidden_size,
-                                             bias=multimodal_projector_bias,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.linear_1")
+        self.linear_1 = ColumnParallelLinear(
+            vision_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_1",
+        )
         self.act = get_act_fn(projector_hidden_act)
-        self.linear_2 = RowParallelLinear(text_hidden_size,
-                                          text_hidden_size,
-                                          bias=multimodal_projector_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.linear_2")
+        self.linear_2 = RowParallelLinear(
+            text_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_2",
+        )
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.linear_1(image_features)
@@ -102,25 +117,23 @@ class MiniMaxVL01DummyInputsBuilder(LlavaDummyInputsBuilder):
 
 
 class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo):
-
     def get_hf_config(self):  # Need to override the config type
         return self.ctx.get_hf_config(PretrainedConfig)
 
     def get_hf_processor(self, **kwargs: object):
         hf_processor = self.ctx.get_hf_processor(**kwargs)
         image_processor = hf_processor.image_processor
-        image_processor.anyres_preprocess = (
-            image_processor.anyres_for_vllm_preprocess)
+        image_processor.anyres_preprocess = image_processor.anyres_for_vllm_preprocess
 
         return hf_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
 
 class MiniMaxVL01MultiModalProcessor(
-        BaseLlavaMultiModalProcessor[MiniMaxVL01ProcessingInfo]):
-
+    BaseLlavaMultiModalProcessor[MiniMaxVL01ProcessingInfo]
+):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -163,17 +176,18 @@ class MiniMaxVL01MultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     MiniMaxVL01MultiModalProcessor,
     info=MiniMaxVL01ProcessingInfo,
-    dummy_inputs=MiniMaxVL01DummyInputsBuilder)
-class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                          SupportsPP):
+    dummy_inputs=MiniMaxVL01DummyInputsBuilder,
+)
+class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -194,16 +208,17 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
             config,
             quant_config,
             require_post_norm=False,
-            prefix=maybe_prefix(prefix, "vision_tower"))
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         self.multi_modal_projector = MiniMaxVL01MultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
             projector_hidden_act=config.projector_hidden_act,
             multimodal_projector_bias=True,
             quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "multi_modal_projector"))
-        self.image_newline = nn.Parameter(
-            torch.empty(config.text_config.hidden_size))
+            prefix=maybe_prefix(prefix, "multi_modal_projector"),
+        )
+        self.image_newline = nn.Parameter(torch.empty(config.text_config.hidden_size))
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
@@ -216,109 +231,78 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
             self.pad_token_id = self.config.pad_token_id
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-        return inputs_embeds
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
-
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel,
-                            PixtralHFVisionModel],
-        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        vision_tower: CLIPVisionModel | SiglipVisionModel | PixtralHFVisionModel,
+        pixel_values: torch.Tensor | list[torch.Tensor],
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features: tuple[torch.Tensor, ...] = \
-            tuple(vision_tower(p) for p in pixel_values)
-
-        def select_features(leaf: torch.Tensor):
-            return self._select_image_features(
-                leaf,
-                strategy=self.config.vision_feature_select_strategy,
-            )
-
-        return json_map_leaves(select_features, image_features)
+        feature_select_strategy = self.config.vision_feature_select_strategy
+        return tuple(
+            vision_tower(p, feature_select_strategy=feature_select_strategy)
+            for p in pixel_values
+        )
 
     # adapted from https://huggingface.co/MiniMaxAI/MiniMax-VL-01/blob/main/modeling_minimax_vl_01.py#L616-L631
-    def pack_image_features(self, image_features: list[torch.Tensor],
-                            image_sizes: torch.Tensor):
+    def pack_image_features(
+        self, image_features: list[torch.Tensor], image_sizes: torch.Tensor
+    ):
         new_image_features = []
         for image_idx, image_feature in enumerate(image_features):
             if image_feature.shape[0] > 1:
                 base_image_feature = image_feature[0]
                 image_feature = image_feature[1:]
-                height = width = (self.config.vision_config.image_size //
-                                  self.config.vision_config.patch_size)
+                height = width = (
+                    self.config.vision_config.image_size
+                    // self.config.vision_config.patch_size
+                )
                 if height * width != base_image_feature.shape[0]:
                     raise ValueError(
-                        "The number of patches is not consistent with "
-                        "the image size.")
+                        "The number of patches is not consistent with the image size."
+                    )
                 num_patch_height, num_patch_width = get_anyres_image_grid_shape(
                     image_sizes[image_idx],
                     self.config.image_grid_pinpoints,
                     self.config.vision_config.image_size,
                 )
 
-                image_feature = image_feature.view(num_patch_height,
-                                                   num_patch_width, height,
-                                                   width, -1)
-                image_feature = image_feature.permute(4, 0, 2, 1,
-                                                      3).contiguous()
+                image_feature = image_feature.view(
+                    num_patch_height, num_patch_width, height, width, -1
+                )
+                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                 image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-                image_feature = unpad_image(image_feature,
-                                            image_sizes[image_idx])
+                image_feature = unpad_image(image_feature, image_sizes[image_idx])
 
                 image_feature = torch.cat(
                     (
                         image_feature,
-                        self.image_newline[:, None, None].expand(
-                            *image_feature.shape[:-1], 1).to(
-                                image_feature.dtype),
+                        self.image_newline[:, None, None]
+                        .expand(*image_feature.shape[:-1], 1)
+                        .to(image_feature.dtype),
                     ),
                     dim=-1,
                 )
                 image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                image_feature = torch.cat((base_image_feature, image_feature),
-                                          dim=0)
+                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
             else:
                 image_feature = image_feature[0]
                 image_feature = torch.cat(
-                    (image_feature,
-                     self.image_newline[None].to(image_feature)),
-                    dim=0)
+                    (image_feature, self.image_newline[None].to(image_feature)), dim=0
+                )
             new_image_features.append(image_feature)
         return new_image_features
 
     def _process_image_pixels(
         self,
         inputs: MiniMaxVL01ImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -327,7 +311,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def _process_image_input(
         self,
         image_input: MiniMaxVL01ImageInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -337,9 +321,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
         if isinstance(image_features, torch.Tensor):
             return self.multi_modal_projector(image_features)
 
-        feature_sizes = [
-            image_feature.shape[0] for image_feature in image_features
-        ]
+        feature_sizes = [image_feature.shape[0] for image_feature in image_features]
 
         image_embeds = self.multi_modal_projector(torch.cat(image_features))
         image_embeds = torch.split(image_embeds, feature_sizes)
@@ -347,7 +329,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
         return self.pack_image_features(image_embeds, image_sizes)
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[MiniMaxVL01ImageInputs]:
+        self, **kwargs: object
+    ) -> MiniMaxVL01ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -356,34 +339,21 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None and image_sizes is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            if not isinstance(image_sizes, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image sizes. "
-                                 f"Got type: {type(image_sizes)}")
-
             return MiniMaxVL01ImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values),
-                image_sizes=flatten_bn(image_sizes, concat=True),
+                pixel_values=pixel_values,
+                image_sizes=image_sizes,
             )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return MiniMaxVL01ImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
@@ -394,33 +364,33 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            vision_embeddings = self.embed_multimodal(**kwargs)
+            inputs_embeds = self.embed_input_ids(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_index,
+            )
             input_ids = None
 
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index ba6da4403ae16d60a7172dcdb5d2c668b174dba2..1ddb470a0f93db8f9b8f53bed266146e4d541e8b 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -3,43 +3,58 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar,
-                    Union)
+from typing import Annotated, Final, Literal, Protocol, TypeVar
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, Mistral3Config, PixtralVisionConfig,
-                          PretrainedConfig)
+from transformers import (
+    BatchFeature,
+    Mistral3Config,
+    PixtralVisionConfig,
+    PretrainedConfig,
+)
 from transformers.models.pixtral import PixtralProcessor
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        InputProcessingContext,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    InputProcessingContext,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from .vision import get_vision_encoder_info
 
 
@@ -57,7 +72,7 @@ class Mistral3ImagePixelInputs(TensorSchema):
     # Note that `height` or `width` may be different per batch and image,
     # in which case the data is passed as a list instead of a batched tensor.
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}),
     ]
 
@@ -67,38 +82,43 @@ class Mistral3PatchMerger(nn.Module):
     Learned merging of spatial_merge_size ** 2 patches
     """
 
-    def __init__(self, vision_hidden_size: int, spatial_merge_size: int,
-                 patch_size: int):
+    def __init__(
+        self, vision_hidden_size: int, spatial_merge_size: int, patch_size: int
+    ):
         super().__init__()
 
         self.vision_hidden_size = vision_hidden_size
         self.spatial_merge_size = spatial_merge_size
         self.patch_size = patch_size
-        self.merging_layer = nn.Linear(vision_hidden_size *
-                                       self.spatial_merge_size**2,
-                                       vision_hidden_size,
-                                       bias=False)
+        self.merging_layer = nn.Linear(
+            vision_hidden_size * self.spatial_merge_size**2,
+            vision_hidden_size,
+            bias=False,
+        )
 
-    def forward(self, image_features: torch.Tensor,
-                image_sizes: torch.Tensor) -> torch.Tensor:
-        image_sizes = [(image_size[0] // self.patch_size,
-                        image_size[1] // self.patch_size)
-                       for image_size in image_sizes]
+    def forward(
+        self, image_features: torch.Tensor, image_sizes: torch.Tensor
+    ) -> torch.Tensor:
+        image_sizes = [
+            (image_size[0] // self.patch_size, image_size[1] // self.patch_size)
+            for image_size in image_sizes
+        ]
 
         tokens_per_image = [h * w for h, w in image_sizes]
         d = image_features.shape[-1]
 
         permuted_tensor = []
         for image_index, image_tokens in enumerate(
-                image_features.split(tokens_per_image)):
+            image_features.split(tokens_per_image)
+        ):
             # Reshape image_tokens into a 2D grid
             h, w = image_sizes[image_index]
-            image_grid = image_tokens.view(h, w, d).permute(2, 0,
-                                                            1).unsqueeze(0)
+            image_grid = image_tokens.view(h, w, d).permute(2, 0, 1).unsqueeze(0)
             grid = torch.nn.functional.unfold(
                 image_grid,
                 kernel_size=self.spatial_merge_size,
-                stride=self.spatial_merge_size)
+                stride=self.spatial_merge_size,
+            )
             grid = grid.view(d * self.spatial_merge_size**2, -1).t()
             permuted_tensor.append(grid)
 
@@ -108,38 +128,45 @@ class Mistral3PatchMerger(nn.Module):
 
 
 class Mistral3MultiModalProjector(nn.Module):
-
-    def __init__(self,
-                 vision_hidden_size: int,
-                 text_hidden_size: int,
-                 spatial_merge_size: int,
-                 patch_size: int,
-                 projector_hidden_act: str,
-                 multimodal_projector_bias: bool,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        spatial_merge_size: int,
+        patch_size: int,
+        projector_hidden_act: str,
+        multimodal_projector_bias: bool,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         self.norm = RMSNorm(vision_hidden_size, eps=1e-5)
         self.patch_merger = Mistral3PatchMerger(
             vision_hidden_size=vision_hidden_size,
             spatial_merge_size=spatial_merge_size,
-            patch_size=patch_size)
+            patch_size=patch_size,
+        )
 
-        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
-                                             text_hidden_size,
-                                             bias=multimodal_projector_bias,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.linear_1")
+        self.linear_1 = ColumnParallelLinear(
+            vision_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_1",
+        )
         self.act = get_act_fn(projector_hidden_act)
-        self.linear_2 = RowParallelLinear(text_hidden_size,
-                                          text_hidden_size,
-                                          bias=multimodal_projector_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.linear_2")
-
-    def forward(self, image_features: torch.Tensor,
-                image_sizes: torch.Tensor) -> torch.Tensor:
+        self.linear_2 = RowParallelLinear(
+            text_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_2",
+        )
+
+    def forward(
+        self, image_features: torch.Tensor, image_sizes: torch.Tensor
+    ) -> torch.Tensor:
         image_features = self.norm(image_features)
         image_features = self.patch_merger(image_features, image_sizes)
         hidden_states, _ = self.linear_1(image_features)
@@ -152,7 +179,7 @@ class LlavaLikeConfig(Protocol):
     vision_config: Final[PretrainedConfig]
     image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, list[int]]]
+    vision_feature_layer: Final[int | list[int]]
 
 
 class LlavaLikeProcessor(Protocol):
@@ -160,7 +187,6 @@ class LlavaLikeProcessor(Protocol):
 
 
 class BaseLlavaProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> LlavaLikeConfig:
         return self.ctx.get_hf_config(Mistral3Config)
 
@@ -171,7 +197,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -196,7 +222,6 @@ _I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
 
 
 class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -209,29 +234,30 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class Mistral3ProcessingInfo(BaseLlavaProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
 
 
-class Mistral3MultiModalProcessor(
-        BaseMultiModalProcessor[Mistral3ProcessingInfo]):
-
+class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -248,7 +274,6 @@ class Mistral3MultiModalProcessor(
 
         pixel_values = processed_outputs.get("pixel_values")
         if pixel_values is not None:
-
             # Avoid padding since we need the output for each image to be
             # independent of other images for the cache to work correctly
             image_sizes = processed_outputs["image_sizes"]
@@ -312,7 +337,8 @@ class Mistral3MultiModalProcessor(
 
 
 def _build_mistral3_info(
-    ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo:
+    ctx: InputProcessingContext,
+) -> BaseLlavaProcessingInfo:
     hf_config = ctx.get_hf_config(Mistral3Config)
     assert isinstance(hf_config.vision_config, PixtralVisionConfig)
     return Mistral3ProcessingInfo(ctx)
@@ -322,7 +348,7 @@ def _build_mistral3_processor(
     info: _I,
     dummy_inputs: BaseDummyInputsBuilder[_I],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     assert isinstance(info, Mistral3ProcessingInfo)
     return Mistral3MultiModalProcessor(
@@ -335,7 +361,7 @@ def _build_mistral3_processor(
 def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
     """Determine the number of hidden layers to initialize up to in the
     visual encoder.
-    
+
     Args:
         hf_config: Model config with vision feature layer(s).
     """
@@ -346,10 +372,10 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
         return _get_layer_index(feature_layers, num_hidden_layers)
     # If we have multiple feature layers, initialize up to the deepest one
     elif isinstance(feature_layers, (list, tuple)):
-        return max(
-            _get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
-    raise TypeError(f"vision_layer_feature type: {type(feature_layers)}"
-                    " is not supported")
+        return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
+    raise TypeError(
+        f"vision_layer_feature type: {type(feature_layers)} is not supported"
+    )
 
 
 def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
@@ -368,9 +394,9 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    require_post_norm: Optional[bool] = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
 ) -> PixtralHFVisionModel:
     vision_config = hf_config.vision_config
@@ -392,13 +418,16 @@ def init_vision_tower_for_llava(
 @MULTIMODAL_REGISTRY.register_processor(
     _build_mistral3_processor,
     info=_build_mistral3_info,
-    dummy_inputs=Mistral3DummyInputsBuilder)
-class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
-                                       SupportsMultiModal, SupportsPP):
+    dummy_inputs=Mistral3DummyInputsBuilder,
+)
+class Mistral3ForConditionalGeneration(
+    nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
+):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     hf_to_vllm_mapper = WeightsMapper(
@@ -408,10 +437,11 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -429,11 +459,15 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
 
         # NOTE: These are special cases for Pixtral-12B in the HF-format
         # https://huggingface.co/mistral-community/pixtral-12b/blob/main/config.json  # noqa
-        if (config.text_config.architectures is None
-                and config.text_config.model_type == "mistral"):
+        if (
+            config.text_config.architectures is None
+            and config.text_config.model_type == "mistral"
+        ):
             config.text_config.architectures = ["MistralForCausalLM"]
-        if (config.projector_hidden_act is None
-                and config.vision_config.hidden_act == "gelu"):
+        if (
+            config.projector_hidden_act is None
+            and config.vision_config.hidden_act == "gelu"
+        ):
             config.projector_hidden_act = "gelu"
 
         # TODO: Optionally initializes this for supporting embeddings.
@@ -442,7 +476,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
                 config,
                 quant_config,
                 require_post_norm=False,
-                prefix=maybe_prefix(prefix, "vision_tower"))
+                prefix=maybe_prefix(prefix, "vision_tower"),
+            )
             self.multi_modal_projector = Mistral3MultiModalProjector(
                 vision_hidden_size=config.vision_config.hidden_size,
                 text_hidden_size=config.text_config.hidden_size,
@@ -451,7 +486,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
                 patch_size=config.vision_config.patch_size,
                 multimodal_projector_bias=config.multimodal_projector_bias,
                 quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "multi_modal_projector"))
+                prefix=maybe_prefix(prefix, "multi_modal_projector"),
+            )
         else:
             self.vision_tower = None
             self.multi_modal_projector = None
@@ -463,35 +499,33 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Mistral3ImagePixelInputs]:
+        self, **kwargs: object
+    ) -> Mistral3ImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is None and image_embeds is None:
             return None
 
-        assert pixel_values is not None
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
-
         return Mistral3ImagePixelInputs(
             type="pixel_values_pixtral",
-            pixel_values=flatten_bn(pixel_values),
+            pixel_values=pixel_values,
         )
 
     def _process_image_input(
         self,
         image_input: Mistral3ImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
-        image_sizes = [(img.shape[-2], img.shape[-1])
-                       for img in image_input["pixel_values"]]
+        image_sizes = [
+            (img.shape[-2], img.shape[-1]) for img in image_input["pixel_values"]
+        ]
 
         image_features = self.vision_tower(image_input["pixel_values"])
 
@@ -503,19 +537,19 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
             for image_feature in image_features
         ]
 
-        image_embeds = self.multi_modal_projector(torch.cat(image_features),
-                                                  image_sizes)
+        image_embeds = self.multi_modal_projector(
+            torch.cat(image_features), image_sizes
+        )
         if len(feature_sizes) > 1:
             image_embeds = torch.split(image_embeds, feature_sizes)
         else:
-            image_embeds = (image_embeds, )
+            image_embeds = (image_embeds,)
         return image_embeds
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
@@ -524,30 +558,14 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
 
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Mistral3.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -588,29 +606,19 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.vision_tower is None and self.multi_modal_projector is None:
             skip_prefixes = ["vision_tower.", "multi_modal_projector."]
@@ -625,4 +633,5 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="multi_modal_projector",
-            tower_model="vision_tower")
+            tower_model="vision_tower",
+        )
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index bebf0b5adac52768cd02d61fd7b43bbfecf01607..d7a1cb82fb4fb5bfdb527bff11b9f28dcabd45c7 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -23,10 +23,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Mixtral model."""
+
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -35,26 +35,41 @@ from transformers import MixtralConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
-from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class MixtralMoE(nn.Module):
@@ -66,22 +81,24 @@ class MixtralMoE(nn.Module):
     across ranks.
     """
 
-    def __init__(self,
-                 num_experts: int,
-                 top_k: int,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 tp_size: Optional[int] = None,
-                 dp_size: Optional[int] = None,
-                 prefix: str = "",
-                 enable_eplb: bool = False):
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        dp_size: int | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
         super().__init__()
         self.hidden_size = hidden_size
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
 
         # Expert Parallelism Load balancing settings.
@@ -91,38 +108,40 @@ class MixtralMoE(nn.Module):
 
         self.n_routed_experts = num_experts
         self.n_logical_experts = num_experts
-        self.n_redundant_experts = (
-            parallel_config.eplb_config.num_redundant_experts)
-        self.n_physical_experts = (self.n_logical_experts +
-                                   self.n_redundant_experts)
+        self.n_redundant_experts = parallel_config.eplb_config.num_redundant_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
-        self.physical_expert_start = (self.ep_rank *
-                                      self.n_local_physical_experts)
-        self.physical_expert_end = (self.physical_expert_start +
-                                    self.n_local_physical_experts)
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
 
         # Gate always runs at half / full precision for now.
 
-        self.gate = ReplicatedLinear(hidden_size,
-                                     num_experts,
-                                     bias=False,
-                                     params_dtype=params_dtype,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
-
-        self.experts = FusedMoE(num_experts=num_experts,
-                                top_k=top_k,
-                                hidden_size=hidden_size,
-                                intermediate_size=intermediate_size,
-                                params_dtype=params_dtype,
-                                reduce_results=True,
-                                renormalize=True,
-                                quant_config=quant_config,
-                                tp_size=tp_size,
-                                dp_size=dp_size,
-                                prefix=f"{prefix}.experts",
-                                enable_eplb=self.enable_eplb,
-                                num_redundant_experts=self.n_redundant_experts)
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            reduce_results=True,
+            renormalize=True,
+            quant_config=quant_config,
+            tp_size=tp_size,
+            dp_size=dp_size,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -135,7 +154,6 @@ class MixtralMoE(nn.Module):
 
 
 class MixtralAttention(nn.Module):
-
     def __init__(
         self,
         config: MixtralConfig,
@@ -144,8 +162,8 @@ class MixtralAttention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -196,13 +214,15 @@ class MixtralAttention(nn.Module):
             base=int(self.rope_theta),
             is_neox_style=True,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -218,12 +238,11 @@ class MixtralAttention(nn.Module):
 
 
 class MixtralDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: MixtralConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -240,7 +259,8 @@ class MixtralDecoderLayer(nn.Module):
             rope_theta=rope_theta,
             cache_config=cache_config,
             quant_config=quant_config,
-            prefix=f"{prefix}.self_attn")
+            prefix=f"{prefix}.self_attn",
+        )
         self.block_sparse_moe = MixtralMoE(
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
@@ -248,40 +268,38 @@ class MixtralDecoderLayer(nn.Module):
             intermediate_size=config.intermediate_size,
             quant_config=quant_config,
             prefix=f"{prefix}.block_sparse_moe",
-            enable_eplb=enable_eplb)
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+            enable_eplb=enable_eplb,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.block_sparse_moe(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class MixtralModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -293,8 +311,11 @@ class MixtralModel(nn.Module):
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
         self.vocab_size = config.vocab_size + lora_vocab
         self.org_vocab_size = config.vocab_size
 
@@ -305,8 +326,7 @@ class MixtralModel(nn.Module):
         )
 
         self.enable_eplb = parallel_config.enable_eplb
-        self.num_redundant_experts = (
-            parallel_config.eplb_config.num_redundant_experts)
+        self.num_redundant_experts = parallel_config.eplb_config.num_redundant_experts
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -317,28 +337,29 @@ class MixtralModel(nn.Module):
                 prefix=prefix,
                 enable_eplb=self.enable_eplb,
             ),
-            prefix=f"{prefix}.layers")
+            prefix=f"{prefix}.layers",
+        )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -347,10 +368,9 @@ class MixtralModel(nn.Module):
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(positions, hidden_states, residual)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -362,10 +382,10 @@ class MixtralModel(nn.Module):
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
             num_experts=self.config.num_local_experts,
-            num_redundant_experts=self.num_redundant_experts)
+            num_redundant_experts=self.num_redundant_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -377,25 +397,27 @@ class MixtralModel(nn.Module):
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -424,20 +446,23 @@ class MixtralModel(nn.Module):
                     if is_pp_missing_parameter(name_mapped, self):
                         continue
 
-                    if ((name_mapped.endswith(".bias")
-                         or name_mapped.endswith("_bias"))
-                            and name_mapped not in params_dict):
+                    if (
+                        name_mapped.endswith(".bias") or name_mapped.endswith("_bias")
+                    ) and name_mapped not in params_dict:
                         continue
 
                     param = params_dict[name_mapped]
-                    weight_loader = typing.cast(Callable[..., bool],
-                                                param.weight_loader)
-                    success = weight_loader(param,
-                                            loaded_weight,
-                                            name_mapped,
-                                            shard_id=shard_id,
-                                            expert_id=expert_id,
-                                            return_success=True)
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
                     if success:
                         name = name_mapped
                         break
@@ -445,8 +470,9 @@ class MixtralModel(nn.Module):
                     if is_expert_weight:
                         continue
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -457,15 +483,15 @@ class MixtralModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
-                         MixtureOfExperts):
+class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
     fall_back_to_pt_during_load = False
 
     packed_modules_mapping = {
@@ -492,8 +518,9 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
         self.lora_config = lora_config
         self.quant_config = quant_config
 
-        self.model = MixtralModel(vllm_config=vllm_config,
-                                  prefix=maybe_prefix(prefix, "model"))
+        self.model = MixtralModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.unpadded_vocab_size = config.vocab_size
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
@@ -504,19 +531,22 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE
             # We need bigger padding if using lora for kernel
             # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
+            if not lora_config
+            else lora_config.lora_vocab_padding_size,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(
+            self.unpadded_vocab_size, config.vocab_size
+        )
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
         self.expert_weights = []
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
         example_moe = None
 
         for layer in self.model.layers:
@@ -524,7 +554,8 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
                 continue
             assert isinstance(layer, MixtralDecoderLayer)
             if hasattr(layer, "block_sparse_moe") and isinstance(
-                    layer.block_sparse_moe, MixtralMoE):
+                layer.block_sparse_moe, MixtralMoE
+            ):
                 example_moe = layer.block_sparse_moe
                 self.moe_layers.append(layer.block_sparse_moe.experts)
 
@@ -541,22 +572,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
         self.num_expert_groups = 1
         self.num_shared_experts = 0
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
@@ -565,40 +580,40 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
         assert self.num_local_physical_experts == num_local_physical_experts
         self.num_physical_experts = num_physical_experts
         self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = (num_physical_experts -
-                                      self.num_logical_experts)
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
         for layer in self.model.layers:
             if hasattr(layer, "block_sparse_moe") and isinstance(
-                    layer.block_sparse_moe, MixtralMoE):
+                layer.block_sparse_moe, MixtralMoE
+            ):
                 moe = layer.block_sparse_moe
                 moe.n_local_physical_experts = num_local_physical_experts
                 moe.n_physical_experts = num_physical_experts
                 moe.n_redundant_experts = self.num_redundant_experts
                 moe.experts.update_expert_map()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 56f0d2b6887112bccee77d5783aa602f466d3b6c..7600e8ce9f01e625595364de240a7d051331f935 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -19,7 +19,7 @@
 import math
 from collections.abc import Iterable, Mapping
 from itertools import tee
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -27,37 +27,58 @@ from transformers import BatchFeature, Llama4Config, Llama4VisionConfig
 from transformers.image_utils import SizeDict
 from transformers.models.llama4 import Llama4Processor
 from transformers.models.llama4.image_processing_llama4_fast import (
-    find_supported_resolutions, get_best_fit)
+    find_supported_resolutions,
+    get_best_fit,
+)
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.utils import initialize_model
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        InputProcessingContext,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    InputProcessingContext,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MixtureOfExperts,
+    MultiModalEmbeddings,
+    SupportsEagle3,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from .llama4 import Llama4ForCausalLM
-from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    maybe_prefix,
+)
 from .vision import run_dp_sharded_vision_model
 
 
@@ -72,16 +93,17 @@ class Llama4ImagePatchInputs(TensorSchema):
 
     type: Literal["pixel_values"] = "pixel_values"
 
-    flat_data: Annotated[torch.Tensor,
-                         TensorShape("total_num_chunks", "num_channels",
-                                     "image_size", "image_size")]
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("total_num_chunks", "num_channels", "image_size", "image_size"),
+    ]
 
     patches_per_image: Annotated[torch.Tensor, TensorShape("batch_size")]
     """
     The number of total patches for each image in the batch.
     
     This is used to split the embeddings which has the first two dimensions
-    flattened just like `flat_data`.
+    flattened just like `pixel_values`.
     """
 
     aspect_ratios: Annotated[torch.Tensor, TensorShape("batch_size", 2)]
@@ -93,7 +115,6 @@ class Llama4ImagePatchInputs(TensorSchema):
 
 
 class Llama4VisionMLP(nn.Module):
-
     def __init__(
         self,
         input_size: int,
@@ -101,7 +122,7 @@ class Llama4VisionMLP(nn.Module):
         output_size: int,
         bias: bool,
         output_activation: bool,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -135,11 +156,10 @@ class Llama4VisionMLP(nn.Module):
 
 
 class Llama4MultiModalProjector(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -165,9 +185,9 @@ def pixel_shuffle(input_tensor, shuffle_ratio):
     input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
     batch_size, height, width, channels = input_tensor.size()
 
-    reshaped_tensor = input_tensor.view(batch_size, height,
-                                        int(width * shuffle_ratio),
-                                        int(channels / shuffle_ratio))
+    reshaped_tensor = input_tensor.view(
+        batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio)
+    )
     reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
 
     reshaped_tensor = reshaped_tensor.view(
@@ -178,24 +198,23 @@ def pixel_shuffle(input_tensor, shuffle_ratio):
     )
     reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
 
-    output_tensor = reshaped_tensor.view(batch_size, -1,
-                                         reshaped_tensor.shape[-1])
+    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
     return output_tensor
 
 
 class Llama4VisionPixelShuffleMLP(nn.Module):
-
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
         super().__init__()
         self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
-        self.inner_dim = int(config.projector_input_dim //
-                             (self.pixel_shuffle_ratio**2))
+        self.inner_dim = int(
+            config.projector_input_dim // (self.pixel_shuffle_ratio**2)
+        )
         self.output_dim = config.projector_output_dim
         self.mlp = Llama4VisionMLP(
             input_size=config.intermediate_size,
@@ -209,24 +228,23 @@ class Llama4VisionPixelShuffleMLP(nn.Module):
         )
 
     def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
-        encoded_patches = pixel_shuffle(encoded_patches,
-                                        self.pixel_shuffle_ratio)
+        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
         return self.mlp(encoded_patches)
 
 
 class Llama4VisionAttention(nn.Module):
-
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
-        self.tp_size = (1 if use_data_parallel else
-                        get_tensor_model_parallel_world_size())
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // self.num_heads
@@ -237,8 +255,9 @@ class Llama4VisionAttention(nn.Module):
         self.attention_dropout = config.attention_dropout
         self.scaling = self.head_dim**-0.5
 
-        self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim,
-                                       self.scaling)
+        self.attn = MultiHeadAttention(
+            self.num_local_heads, self.head_dim, self.scaling
+        )
 
         if use_data_parallel:
             self.qkv_proj = ReplicatedLinear(
@@ -277,7 +296,7 @@ class Llama4VisionAttention(nn.Module):
             head_size=self.head_dim,
             rotary_dim=config.hidden_size // config.num_attention_heads // 2,
             # number of image patches
-            max_position=(config.image_size // config.patch_size)**2,
+            max_position=(config.image_size // config.patch_size) ** 2,
             base=config.rope_theta,
             rope_scaling={"rope_type": "mllama4"},
             is_neox_style=False,
@@ -308,11 +327,10 @@ class Llama4VisionAttention(nn.Module):
 
 
 class Llama4VisionEncoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -357,29 +375,31 @@ class Llama4VisionEncoderLayer(nn.Module):
         hidden_state = self.mlp(hidden_state)
         hidden_state = residual + hidden_state
 
-        outputs = (hidden_state, )
+        outputs = (hidden_state,)
         return outputs
 
 
 class Llama4VisionEncoder(nn.Module):
-
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
-        self.layers = nn.ModuleList([
-            Llama4VisionEncoderLayer(
-                config,
-                quant_config=quant_config,
-                prefix=f"{prefix}.layers.{layer_idx}",
-                use_data_parallel=use_data_parallel,
-            ) for layer_idx in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Llama4VisionEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -387,9 +407,9 @@ class Llama4VisionEncoder(nn.Module):
     ) -> torch.Tensor:
         r"""
         Args:
-            hidden_states: Input tensor of shape 
+            hidden_states: Input tensor of shape
                 (batch_size, sequence_length, hidden_size).
-                Hidden states from the model embeddings, representing 
+                Hidden states from the model embeddings, representing
                 the input tokens.
                 associated vectors than the model's internal embedding
                 lookup matrix.
@@ -403,11 +423,10 @@ class Llama4VisionEncoder(nn.Module):
 
 
 class Llama4UnfoldConvolution(nn.Module):
-
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -415,8 +434,7 @@ class Llama4UnfoldConvolution(nn.Module):
         kernel_size = config.patch_size
         if isinstance(kernel_size, int):
             kernel_size = (kernel_size, kernel_size)
-        self.unfold = torch.nn.Unfold(kernel_size=kernel_size,
-                                      stride=config.patch_size)
+        self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
         self.linear = ColumnParallelLinear(
             input_size=config.num_channels * kernel_size[0] * kernel_size[1],
             output_size=config.hidden_size,
@@ -435,11 +453,10 @@ class Llama4UnfoldConvolution(nn.Module):
 
 
 class Llama4VisionModel(nn.Module):
-
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -450,7 +467,7 @@ class Llama4VisionModel(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_channels = config.num_channels
 
-        self.num_patches = (self.image_size // self.patch_size)**2 + 1
+        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
         self.scale = config.hidden_size**-0.5
 
         self.patch_embedding = Llama4UnfoldConvolution(
@@ -460,10 +477,10 @@ class Llama4VisionModel(nn.Module):
             use_data_parallel=use_data_parallel,
         )
 
-        self.class_embedding = nn.Parameter(self.scale *
-                                            torch.randn(self.hidden_size))
+        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
         self.positional_embedding_vlm = nn.Parameter(
-            self.scale * torch.randn(self.num_patches, self.hidden_size))
+            self.scale * torch.randn(self.num_patches, self.hidden_size)
+        )
 
         # layer norms
         self.layernorm_pre = nn.LayerNorm(self.hidden_size, eps=1e-5)
@@ -492,8 +509,9 @@ class Llama4VisionModel(nn.Module):
         num_tiles, num_patches, hidden_dim = hidden_state.shape
 
         # Add cls token
-        class_embedding = self.class_embedding.expand(hidden_state.shape[0], 1,
-                                                      hidden_state.shape[-1])
+        class_embedding = self.class_embedding.expand(
+            hidden_state.shape[0], 1, hidden_state.shape[-1]
+        )
         hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
         num_patches += 1
 
@@ -505,7 +523,8 @@ class Llama4VisionModel(nn.Module):
             hidden_dim,
         )
         positional_embedding = self.positional_embedding_vlm.to(
-            dtype=hidden_state.dtype, device=hidden_state.device)
+            dtype=hidden_state.dtype, device=hidden_state.device
+        )
         hidden_state = hidden_state + positional_embedding
         hidden_state = self.layernorm_pre(hidden_state)
         hidden_state = hidden_state.view(num_tiles, -1, hidden_dim)
@@ -524,7 +543,6 @@ class Llama4VisionModel(nn.Module):
 
 
 class Mllama4ProcessingInfo(BaseProcessingInfo):
-
     def __init__(self, ctx: InputProcessingContext) -> None:
         super().__init__(ctx)
 
@@ -532,11 +550,11 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
         return self.ctx.get_hf_config(Llama4Config)
 
     def get_hf_processor(self, **kwargs: object) -> Llama4Processor:
-        return self.ctx.get_hf_processor(Llama4Processor,
-                                         use_fast=kwargs.pop("use_fast", True),
-                                         **kwargs)
+        return self.ctx.get_hf_processor(
+            Llama4Processor, use_fast=kwargs.pop("use_fast", True), **kwargs
+        )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         # Although vLLM can support more images from an infra capability
         # perspective, we do not recommend using >10 images in practice.
         return {"image": None}
@@ -546,13 +564,13 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
         image_size = vision_config.image_size
         patch_size = vision_config.patch_size
 
-        assert (
-            image_size %
-            patch_size == 0), f"chunk size {image_size} should be multiple of "
+        assert image_size % patch_size == 0, (
+            f"chunk size {image_size} should be multiple of "
+        )
         f"patch_size {patch_size}"
 
         ds_ratio = int(round(1.0 / (vision_config.pixel_shuffle_ratio**2)))
-        return (image_size // patch_size)**2 // ds_ratio
+        return (image_size // patch_size) ** 2 // ds_ratio
 
     def get_max_num_tiles(self) -> int:
         image_processor = self.get_hf_processor().image_processor
@@ -563,13 +581,10 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
         vision_config = self.get_hf_config().vision_config
         image_size = vision_config.image_size
         # Result in the max possible feature size (h:w = 16:1)
-        return ImageSize(height=self.get_max_num_tiles() * image_size,
-                         width=image_size)
-
+        return ImageSize(height=self.get_max_num_tiles() * image_size, width=image_size)
 
-class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
-                                 ):
 
+class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -593,15 +608,16 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
         vision_config = self.info.get_hf_config().vision_config
 
         if processed_outputs.get("pixel_values") is not None:
-            assert (
-                "images" in mm_data
-            ), "images expected to be in mm_data when pixel_values is present"
+            assert "images" in mm_data, (
+                "images expected to be in mm_data when pixel_values is present"
+            )
 
             images = mm_data["images"]
-            parsed_images = (self._get_data_parser().parse_mm_data({
-                "image":
-                images
-            }).get_items("image", ImageProcessorItems))
+            parsed_images = (
+                self._get_data_parser()
+                .parse_mm_data({"image": images})
+                .get_items("image", ImageProcessorItems)
+            )
 
             tile_size = vision_config.image_size
             possible_resolutions = find_supported_resolutions(
@@ -613,20 +629,20 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
                     (image.size[1], image.size[0]),
                     torch.tensor(possible_resolutions),
                     resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                ) for image in parsed_images
+                )
+                for image in parsed_images
             ]
             # TODO tile height/width do not necessarily need to match
-            aspect_ratios = [(image_size[0] // tile_size,
-                              image_size[1] // tile_size)
-                             for image_size in best_fit_sizes]
+            aspect_ratios = [
+                (image_size[0] // tile_size, image_size[1] // tile_size)
+                for image_size in best_fit_sizes
+            ]
             patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w
-                for (r_h, r_w) in aspect_ratios
+                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
             ]
 
             processed_outputs["aspect_ratios"] = torch.tensor(aspect_ratios)
-            processed_outputs["patches_per_image"] = torch.tensor(
-                patches_per_image)
+            processed_outputs["patches_per_image"] = torch.tensor(patches_per_image)
 
         return processed_outputs
 
@@ -638,7 +654,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
         patches_per_image = hf_inputs.get("patches_per_image", torch.empty(0))
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", patches_per_image),
+                "image", patches_per_image
+            ),
             patches_per_image=MultiModalFieldConfig.batched("image"),
             aspect_ratios=MultiModalFieldConfig.batched("image"),
         )
@@ -678,7 +695,6 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
 
 
 class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -691,17 +707,21 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        (target_width,
-         target_height) = self.info.get_image_size_with_most_features()
+        (target_width, target_height) = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 @MULTIMODAL_REGISTRY.register_processor(
@@ -709,9 +729,15 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
     info=Mllama4ProcessingInfo,
     dummy_inputs=Mllama4DummyInputsBuilder,
 )
-
-class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                     SupportsPP):
+class Llama4ForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsPP,
+    MixtureOfExperts,
+    SupportsEagle3,
+    SupportsLoRA,
+):
+    merge_by_field_config = True
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -720,7 +746,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|image|>"
 
@@ -744,60 +770,102 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
                 use_data_parallel=self.use_data_parallel,
             )
             self.multi_modal_projector = Llama4MultiModalProjector(
-                self.config,
-                None,
-                prefix=maybe_prefix(prefix, "multi_modal_projector"))
+                self.config, None, prefix=maybe_prefix(prefix, "multi_modal_projector")
+            )
         else:
             self.vision_model = None
             self.multi_modal_projector = None
         self.language_model = initialize_model(
-            vllm_config=vllm_config.with_hf_config(config.text_config,
-                                                   ["LlamaForCausalLM"]),
+            vllm_config=vllm_config.with_hf_config(
+                config.text_config, ["LlamaForCausalLM"]
+            ),
             prefix=maybe_prefix(prefix, "language_model"),
             model_class=Llama4ForCausalLM,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+        # Set MoE hyperparameters
+        self.num_expert_groups = 1
+        self.num_logical_experts = self.language_model.num_logical_experts
+        self.num_physical_experts = self.language_model.num_physical_experts
+        self.num_local_physical_experts = self.language_model.num_local_physical_experts
+        self.num_routed_experts = self.language_model.num_routed_experts
+        self.num_shared_experts = self.language_model.num_shared_experts
+        self.num_redundant_experts = self.language_model.num_redundant_experts
+        self.moe_layers = self.language_model.moe_layers
+        self.num_moe_layers = len(self.moe_layers)
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        """Set which layers should output auxiliary hidden states for EAGLE3."""
+        # Delegate to underlying language model (Llama4ForCausalLM)
+        assert hasattr(self.language_model, "set_aux_hidden_state_layers")
+        self.language_model.set_aux_hidden_state_layers(layers)
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Get the layer indices for auxiliary hidden state outputs.
+
+        Note: The GPU model runner will override this with layers from
+        the speculative config if available, providing dynamic configuration.
+        """
+        # Delegate to underlying language model (Llama4ForCausalLM)
+        assert hasattr(self.language_model, "get_eagle3_aux_hidden_state_layers")
+        return self.language_model.get_eagle3_aux_hidden_state_layers()
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ):
+        self.language_model.set_eplb_state(
+            expert_load_view, logical_to_physical_map, logical_replica_count
+        )
+        self.expert_weights = self.language_model.expert_weights
+
+    def update_physical_experts_metadata(
+        self, num_physical_experts: int, num_local_physical_experts: int
+    ):
+        self.language_model.update_physical_experts_metadata(
+            num_physical_experts, num_local_physical_experts
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]:
+        self, **kwargs: object
+    ) -> Llama4ImagePatchInputs | None:
         # num_images, 1, num_chunks, channel, image_size, image_size
         pixel_values = kwargs.pop("pixel_values", None)
         if pixel_values is None:
             return None
 
-        # num_images x num_chunks, channel, image_size, image_size
-        # TODO: confirm handling for variable lengths
-        flat_pixel_values = flatten_bn(pixel_values, concat=True)
-        patches_per_image = flatten_bn(kwargs.pop("patches_per_image"))
+        patches_per_image = kwargs.pop("patches_per_image")
         aspect_ratios = kwargs.pop("aspect_ratios")
-        if aspect_ratios.ndim == 3:
-            aspect_ratios = aspect_ratios.squeeze(1)
 
         return Llama4ImagePatchInputs(
             type="pixel_values",
-            flat_data=flat_pixel_values,
+            pixel_values=pixel_values,
             patches_per_image=patches_per_image,
             aspect_ratios=aspect_ratios,
         )
 
     def _process_image_input(
-            self, image_input: Llama4ImagePatchInputs) -> MultiModalEmbeddings:
-
+        self, image_input: Llama4ImagePatchInputs
+    ) -> MultiModalEmbeddings:
         assert self.vision_model and self.multi_modal_projector
-        flat_data = image_input["flat_data"]
+        pixel_values = image_input["pixel_values"]
         patches_per_image = image_input["patches_per_image"].tolist()
 
         # shard image input
         if self.use_data_parallel:
             vision_embeddings_flat = run_dp_sharded_vision_model(
-                flat_data, self.vision_model)
+                pixel_values, self.vision_model
+            )
         else:
-            vision_embeddings_flat = self.vision_model(flat_data)
+            vision_embeddings_flat = self.vision_model(pixel_values)
 
-        vision_embeddings_flat = self.multi_modal_projector(
-            vision_embeddings_flat)
+        vision_embeddings_flat = self.multi_modal_projector(vision_embeddings_flat)
 
         return [
             img.flatten(0, 1)
@@ -807,65 +875,39 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[NestedTensors] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is not None and len(
-                multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner,
-        # this condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        return self.language_model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        return self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def separate_weights(
         self,
         weights: Iterable[tuple[str, torch.Tensor]],
         prefix: str,
-    ) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[
-            str, torch.Tensor]]]:
+    ) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[str, torch.Tensor]]]:
         weights1, weights2 = tee(weights, 2)
 
         def get_prefix_weights() -> Iterable[tuple[str, torch.Tensor]]:
@@ -907,31 +949,33 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def _rename_weight_for_modelopt_checkpoint(self, name: str) -> str:
         """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM
         format."""
-        if name.startswith("model.") or name.startswith(
-                "language_model.model."):
-            renamed = name.replace("model.", "language_model.model.",
-                                   1) if name.startswith("model.") else name
+        if name.startswith("model.") or name.startswith("language_model.model."):
+            renamed = (
+                name.replace("model.", "language_model.model.", 1)
+                if name.startswith("model.")
+                else name
+            )
             # Handle expert scale parameters with flat naming
-            if "feed_forward.experts." in name and ("_input_scale" in name or
-                                                    "_weight_scale" in name):
+            if "feed_forward.experts." in name and (
+                "_input_scale" in name or "_weight_scale" in name
+            ):
                 # Map checkpoint naming to vLLM's expected naming
                 if "down_proj_input_scale" in renamed:
-                    return renamed.replace("down_proj_input_scale",
-                                           "w2_input_scale")
+                    return renamed.replace("down_proj_input_scale", "w2_input_scale")
                 elif "down_proj_weight_scale" in renamed:
-                    return renamed.replace("down_proj_weight_scale",
-                                           "w2_weight_scale")
+                    return renamed.replace("down_proj_weight_scale", "w2_weight_scale")
                 elif "gate_up_proj_input_scale" in renamed:
-                    return renamed.replace("gate_up_proj_input_scale",
-                                           "w13_input_scale")
+                    return renamed.replace(
+                        "gate_up_proj_input_scale", "w13_input_scale"
+                    )
                 elif "gate_up_proj_weight_scale" in renamed:
-                    return renamed.replace("gate_up_proj_weight_scale",
-                                           "w13_weight_scale")
+                    return renamed.replace(
+                        "gate_up_proj_weight_scale", "w13_weight_scale"
+                    )
                 return renamed
 
             # Handle attention scale parameters
-            elif "self_attn." in name and (".k_scale" in name
-                                           or ".v_scale" in name):
+            elif "self_attn." in name and (".k_scale" in name or ".v_scale" in name):
                 if ".k_proj.k_scale" in renamed:
                     return renamed.replace(".k_proj.k_scale", ".attn.k_scale")
                 elif ".v_proj.v_scale" in renamed:
@@ -942,8 +986,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
             return renamed
 
         elif name.startswith("lm_head.weight"):
-            return name.replace("lm_head.weight",
-                                "language_model.lm_head.weight")
+            return name.replace("lm_head.weight", "language_model.lm_head.weight")
 
         return name
 
@@ -966,7 +1009,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         return language_model_weights, other_weights
 
     def _handle_expert_scale_broadcasting(
-            self, weights: list[tuple[str, torch.Tensor]], params_dict: dict
+        self, weights: list[tuple[str, torch.Tensor]], params_dict: dict
     ) -> tuple[list[tuple[str, torch.Tensor]], set[str]]:
         """Handle expert scale parameters that need broadcasting.
 
@@ -979,12 +1022,18 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         for name, weight in weights:
             # Check if this is an expert scale parameter that needs broadcasting
-            if ("feed_forward.experts." in name and "scale" in name
-                    and ".shared_expert" not in name):
+            if (
+                "feed_forward.experts." in name
+                and "scale" in name
+                and ".shared_expert" not in name
+            ):
                 if name in params_dict:
                     param = params_dict[name]
-                    if (hasattr(param, 'data') and param.data.numel() > 1
-                            and weight.numel() == 1):
+                    if (
+                        hasattr(param, "data")
+                        and param.data.numel() > 1
+                        and weight.numel() == 1
+                    ):
                         # Broadcast single value to all experts
                         param.data.fill_(weight.item())
                         updated_params.add(name)
@@ -996,10 +1045,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return regular_weights, expert_scale_weights, updated_params
 
-    def _load_other_weights(self, other_weights: Iterable[tuple[str,
-                                                                torch.Tensor]],
-                            params_dict: dict,
-                            stacked_params_mapping: list) -> set[str]:
+    def _load_other_weights(
+        self,
+        other_weights: Iterable[tuple[str, torch.Tensor]],
+        params_dict: dict,
+        stacked_params_mapping: list,
+    ) -> set[str]:
         """Load non-language-model weights with stacking support."""
         updated_params = set()
 
@@ -1020,16 +1071,24 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
             else:
                 # Use regular weight loading
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
                 updated_params.add(name)
 
         return updated_params
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.text_config.num_local_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
 
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
@@ -1046,8 +1105,9 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         updated_params: set[str] = set()
 
         # Separate and rename weights
-        language_model_weights, other_weights = (
-            self._separate_and_rename_weights(weights))
+        language_model_weights, other_weights = self._separate_and_rename_weights(
+            weights
+        )
 
         # Skip loading vision model and projector if they're not initialized.
         if self.vision_model is None and self.multi_modal_projector is None:
@@ -1055,8 +1115,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         # Handle expert scale parameters
         regular_weights, expert_scale_weights, updated_params_from_experts = (
-            self._handle_expert_scale_broadcasting(language_model_weights,
-                                                   params_dict))
+            self._handle_expert_scale_broadcasting(language_model_weights, params_dict)
+        )
         updated_params.update(updated_params_from_experts)
 
         loader = AutoWeightsLoader(self)
@@ -1065,13 +1125,22 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         updated_params.update(loaded_language_model_params)
 
         if expert_scale_weights:
-            loaded_expert_scale_params = loader.load_weights(
-                expert_scale_weights)
+            loaded_expert_scale_params = loader.load_weights(expert_scale_weights)
             if loaded_expert_scale_params:
                 updated_params.update(loaded_expert_scale_params)
 
         updated_params.update(
-            self._load_other_weights(other_weights, params_dict,
-                                     stacked_params_mapping))
+            self._load_other_weights(other_weights, params_dict, stacked_params_mapping)
+        )
 
         return updated_params
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="multi_modal_projector.",
+            tower_model="vision_model.",
+        )
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index d057eb49a62d1ff8036deee6fab985d487e05197..48604d8e510311ec2fe8be1330f220af268d46c4 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -10,9 +10,13 @@ import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from .utils import maybe_prefix
+
 SQRT2 = 2**0.5
 
 
@@ -72,8 +76,7 @@ class MLPSpeculator(nn.Module):
         self.n_predict = config.n_predict
         self.vocab_size = config.vocab_size
         self.emb_dim = config.emb_dim
-        self.inner_dim = config.inner_dim if config.inner_dim != 0 \
-            else config.emb_dim
+        self.inner_dim = config.inner_dim if config.inner_dim != 0 else config.emb_dim
 
         self.max_speculative_tokens = config.num_lookahead_tokens
 
@@ -81,64 +84,92 @@ class MLPSpeculator(nn.Module):
         self.scale_input = config.scale_input
 
         if self.tie_weights:
-            assert (
-                self.n_predict > 1
-            ), "You cannot tie weights between stages when only 1 exists"
+            assert self.n_predict > 1, (
+                "You cannot tie weights between stages when only 1 exists"
+            )
             embedding = VocabParallelEmbedding(
-                config.vocab_size,
-                self.inner_dim,
-                org_num_embeddings=config.vocab_size)
+                config.vocab_size, self.inner_dim, org_num_embeddings=config.vocab_size
+            )
             self.emb = nn.ModuleList([embedding] * self.max_speculative_tokens)
 
             # the initial projection from the base model may
             # have a different size, so that stays separate.
             proj_first = nn.Linear(self.emb_dim, self.inner_dim, bias=False)
             proj_tied = nn.Linear(self.inner_dim, self.inner_dim, bias=False)
-            self.proj = nn.ModuleList([proj_first] + [proj_tied] *
-                                      (self.max_speculative_tokens - 1))
-
-            head = ParallelLMHead(self.vocab_size, self.inner_dim, bias=False)
-            self.head = nn.ModuleList([head] * self.max_speculative_tokens)
-
-            ln = MLPSpeculatorLayerNorm(self.inner_dim,
-                                        elementwise_scale_and_shift=True)
+            self.proj = nn.ModuleList(
+                [proj_first] + [proj_tied] * (self.max_speculative_tokens - 1)
+            )
+
+            self.head = nn.ModuleList(
+                [
+                    ParallelLMHead(
+                        self.vocab_size,
+                        self.inner_dim,
+                        bias=False,
+                        prefix=maybe_prefix(prefix, f"head.{i}"),
+                    )
+                    for i in range(self.max_speculative_tokens)
+                ]
+            )
+
+            ln = MLPSpeculatorLayerNorm(
+                self.inner_dim, elementwise_scale_and_shift=True
+            )
             self.ln = nn.ModuleList([ln] * self.max_speculative_tokens)
 
         else:
-            self.emb = nn.ModuleList([
-                VocabParallelEmbedding(config.vocab_size,
-                                       self.inner_dim,
-                                       org_num_embeddings=config.vocab_size)
-                for _ in range(self.max_speculative_tokens)
-            ])
-
-            self.proj = nn.ModuleList([
-                nn.Linear((self.emb_dim if i == 0 else self.inner_dim),
-                          self.inner_dim,
-                          bias=False)
-                for i in range(self.max_speculative_tokens)
-            ])
-
-            self.head = nn.ModuleList([
-                ParallelLMHead(self.vocab_size, self.inner_dim, bias=False)
-                for _ in range(self.max_speculative_tokens)
-            ])
-            self.ln = nn.ModuleList([
-                MLPSpeculatorLayerNorm(self.inner_dim,
-                                       elementwise_scale_and_shift=True)
-                for _ in range(self.max_speculative_tokens)
-            ])
+            self.emb = nn.ModuleList(
+                [
+                    VocabParallelEmbedding(
+                        config.vocab_size,
+                        self.inner_dim,
+                    )
+                    for _ in range(self.max_speculative_tokens)
+                ]
+            )
+
+            self.proj = nn.ModuleList(
+                [
+                    nn.Linear(
+                        (self.emb_dim if i == 0 else self.inner_dim),
+                        self.inner_dim,
+                        bias=False,
+                    )
+                    for i in range(self.max_speculative_tokens)
+                ]
+            )
+
+            self.head = nn.ModuleList(
+                [
+                    ParallelLMHead(
+                        self.vocab_size,
+                        self.inner_dim,
+                        bias=False,
+                        prefix=maybe_prefix(prefix, f"head.{i}"),
+                    )
+                    for i in range(self.max_speculative_tokens)
+                ]
+            )
+            self.ln = nn.ModuleList(
+                [
+                    MLPSpeculatorLayerNorm(
+                        self.inner_dim, elementwise_scale_and_shift=True
+                    )
+                    for _ in range(self.max_speculative_tokens)
+                ]
+            )
         if self.scale_input:
             self.ln0 = MLPSpeculatorLayerNorm(
-                self.emb_dim, elementwise_scale_and_shift=False)
+                self.emb_dim, elementwise_scale_and_shift=False
+            )
 
-        self.state_weight = 0.5**(0.5 / config.n_predict)
-        self.emb_weight = math.sqrt(
-            (1 - self.state_weight**2) * (self.inner_dim / 2))
+        self.state_weight = 0.5 ** (0.5 / config.n_predict)
+        self.emb_weight = math.sqrt((1 - self.state_weight**2) * (self.inner_dim / 2))
         self.activation = nn.GELU()
         self.config = config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
-                                                config.vocab_size, 1.0)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size, config.vocab_size, 1.0
+        )
 
     # NOTE(woosuk): This method is commented out because it is old code
     # using V0. We should either port it to V1 or remove it.
@@ -191,16 +222,14 @@ class MLPSpeculator(nn.Module):
 
     #     return next_tokens
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             name = name.replace("speculator.", "")
             param = params_dict.get(name)
             if param is not None:
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
                 loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 1d5da3139de9276c214c9d3b1c292c97968788ca..3a8a6c74d9d158950286361a67a80bd9e33080ef 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -1,26 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Set
-from typing import Optional, Union
 
 import torch
 from torch import nn
 from transformers import ModernBertConfig
+from transformers.activations import ACT2FN
 
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.pooler import (ClassifierPooler,
-                                               DispatchPooler, Pooler,
-                                               PoolingMethod,
-                                               PoolingParamsUpdate,
-                                               PoolingType)
+from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
+from vllm.model_executor.layers.pooler import (
+    ClassifierPooler,
+    DispatchPooler,
+    Pooler,
+    PoolingMethod,
+    PoolingParamsUpdate,
+    PoolingType,
+)
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import PoolingTask
@@ -28,25 +29,30 @@ from vllm.v1.pool.metadata import PoolingMetadata
 
 from .interfaces import SupportsCrossEncoding
 from .interfaces_base import default_pooling_type
-from .utils import WeightsMapper, maybe_prefix
+from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 
 class ModernBertEmbeddings(nn.Module):
-
     def __init__(self, config: ModernBertConfig):
-
         super().__init__()
         self.config = config
-        self.tok_embeddings = VocabParallelEmbedding(config.vocab_size,
-                                                     config.hidden_size)
-        self.norm = nn.LayerNorm(config.hidden_size,
-                                 eps=config.layer_norm_eps,
-                                 bias=config.norm_bias)
+        self.tok_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        eps = (
+            getattr(config, "norm_eps", None)
+            or getattr(config, "layer_norm_eps", None)
+            or 1e-5
+        )
+        self.norm = nn.LayerNorm(config.hidden_size, eps=eps, bias=config.norm_bias)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.tok_embeddings(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
             return self.norm(inputs_embeds)
@@ -57,24 +63,20 @@ class ModernBertEmbeddings(nn.Module):
 
 
 class ModernBertRotaryEmbedding(RotaryEmbedding):
-
-    def __init__(self, config: ModernBertConfig, head_size: int, dim: int,
-                 base: float):
+    def __init__(self, config: ModernBertConfig, head_size: int, dim: int, base: float):
         super().__init__(
             head_size=head_size,
             rotary_dim=dim,
             max_position_embeddings=config.max_position_embeddings,
             base=base,
             is_neox_style=True,
-            dtype=torch.float16)
+            dtype=torch.float16,
+        )
         self.config = config
 
 
 class ModernBertAttention(nn.Module):
-
-    def __init__(self,
-                 config: ModernBertConfig,
-                 layer_id: Optional[int] = None):
+    def __init__(self, config: ModernBertConfig, layer_id: int | None = None):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -96,24 +98,27 @@ class ModernBertAttention(nn.Module):
         sliding_window = None
         if layer_id % config.global_attn_every_n_layers != 0:
             sliding_window = config.local_attention // 2
-            rope_theta = config.local_rope_theta if config.local_rope_theta \
-                    is not None else config.global_rope_theta
+            rope_theta = (
+                config.local_rope_theta
+                if config.local_rope_theta is not None
+                else config.global_rope_theta
+            )
         else:
             rope_theta = config.global_rope_theta
 
-        self.rotary_emb = ModernBertRotaryEmbedding(config=config,
-                                                    head_size=self.head_dim,
-                                                    dim=self.head_dim,
-                                                    base=rope_theta)
+        self.rotary_emb = ModernBertRotaryEmbedding(
+            config=config, head_size=self.head_dim, dim=self.head_dim, base=rope_theta
+        )
         self.attn = EncoderOnlyAttention(
             self.num_heads,
             self.head_dim,
             self.scaling,
             prefix=f"{layer_id}.attn",
-            per_layer_sliding_window=sliding_window)
-        self.Wo = RowParallelLinear(config.hidden_size,
-                                    config.hidden_size,
-                                    bias=config.attention_bias)
+            per_layer_sliding_window=sliding_window,
+        )
+        self.Wo = RowParallelLinear(
+            config.hidden_size, config.hidden_size, bias=config.attention_bias
+        )
 
     def forward(
         self,
@@ -130,17 +135,16 @@ class ModernBertAttention(nn.Module):
 
 
 class ModernBertMLP(nn.Module):
-
     def __init__(self, config: ModernBertConfig):
         super().__init__()
         self.config = config
-        self.Wi = nn.Linear(config.hidden_size,
-                            int(config.intermediate_size) * 2,
-                            bias=config.mlp_bias)
+        self.Wi = nn.Linear(
+            config.hidden_size, int(config.intermediate_size) * 2, bias=config.mlp_bias
+        )
         self.act = nn.GELU()
-        self.Wo = RowParallelLinear(config.intermediate_size,
-                                    config.hidden_size,
-                                    bias=config.mlp_bias)
+        self.Wo = RowParallelLinear(
+            config.intermediate_size, config.hidden_size, bias=config.mlp_bias
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         input, gate = self.Wi(hidden_states).chunk(2, dim=-1)
@@ -148,23 +152,21 @@ class ModernBertMLP(nn.Module):
 
 
 class ModernBertLayer(nn.Module):
-
-    def __init__(self,
-                 config: ModernBertConfig,
-                 prefix: str = "",
-                 layer_id: Optional[int] = None):
+    def __init__(
+        self, config: ModernBertConfig, prefix: str = "", layer_id: int | None = None
+    ):
         super().__init__()
         self.config = config
         if layer_id == 0:
             self.attn_norm = nn.Identity()
         else:
-            self.attn_norm = nn.LayerNorm(config.hidden_size,
-                                          eps=config.norm_eps,
-                                          bias=config.norm_bias)
+            self.attn_norm = nn.LayerNorm(
+                config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
+            )
         self.attn = ModernBertAttention(config=config, layer_id=layer_id)
-        self.mlp_norm = nn.LayerNorm(config.hidden_size,
-                                     eps=config.norm_eps,
-                                     bias=config.norm_bias)
+        self.mlp_norm = nn.LayerNorm(
+            config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
+        )
         self.mlp = ModernBertMLP(config)
 
     def forward(
@@ -172,8 +174,9 @@ class ModernBertLayer(nn.Module):
         hidden_states: torch.Tensor,
         position_ids: torch.Tensor,
     ) -> torch.Tensor:
-        attn_outputs = self.attn(hidden_states=self.attn_norm(hidden_states),
-                                 position_ids=position_ids)
+        attn_outputs = self.attn(
+            hidden_states=self.attn_norm(hidden_states), position_ids=position_ids
+        )
         hidden_states = hidden_states + attn_outputs
         mlp_output = self.mlp(self.mlp_norm(hidden_states))
         hidden_states = hidden_states + mlp_output
@@ -181,14 +184,15 @@ class ModernBertLayer(nn.Module):
 
 
 class ModernBertEncoderLayer(nn.Module):
-
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        self.layers = nn.ModuleList([
-            ModernBertLayer(config=config, layer_id=layer_id)
-            for layer_id in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                ModernBertLayer(config=config, layer_id=layer_id)
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -204,7 +208,8 @@ class ModernBertEncoderLayer(nn.Module):
 @default_pooling_type("CLS")
 class ModernBertModel(nn.Module):
     hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={"layers.": "encoder_layer.layers."})
+        orig_to_new_prefix={"layers.": "encoder_layer.layers."}
+    )
 
     def __init__(
         self,
@@ -216,12 +221,14 @@ class ModernBertModel(nn.Module):
         self.config = config
         self.embeddings = ModernBertEmbeddings(config)
         self.encoder_layer = ModernBertEncoderLayer(vllm_config)
-        self.final_norm = nn.LayerNorm(config.hidden_size,
-                                       eps=config.norm_eps,
-                                       bias=config.norm_bias)
+        self.final_norm = nn.LayerNorm(
+            config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings.embed_input_ids(input_ids)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         weights = self.hf_to_vllm_mapper.apply(weights)
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -229,8 +236,7 @@ class ModernBertModel(nn.Module):
             if name.endswith(".bias") and name not in params_dict:
                 continue
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -239,14 +245,15 @@ class ModernBertModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
             hidden_states = inputs_embeds
         else:
-            hidden_states = self.embeddings(input_ids=input_ids,
-                                            inputs_embeds=inputs_embeds)
+            hidden_states = self.embeddings(
+                input_ids=input_ids, inputs_embeds=inputs_embeds
+            )
 
         outputs = self.encoder_layer(
             hidden_states=hidden_states,
@@ -257,18 +264,18 @@ class ModernBertModel(nn.Module):
 
 
 class ModernBertPooler(Pooler):
-
     def __init__(self, config: ModernBertConfig):
         super().__init__()
 
         pooling_type = PoolingType[config.classifier_pooling.upper()]
         self.pooling = PoolingMethod.from_pooling_type(pooling_type)
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size,
-                               config.classifier_bias)
+        self.dense = nn.Linear(
+            config.hidden_size, config.hidden_size, config.classifier_bias
+        )
         self.act = nn.GELU()
-        self.norm = nn.LayerNorm(config.hidden_size,
-                                 eps=config.norm_eps,
-                                 bias=config.norm_bias)
+        self.norm = nn.LayerNorm(
+            config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
+        )
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
         return self.pooling.get_supported_tasks()
@@ -282,9 +289,9 @@ class ModernBertPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         pooled_output = self.pooling(hidden_states, pooling_metadata)
 
         if isinstance(pooled_output, list):
@@ -297,50 +304,49 @@ class ModernBertPooler(Pooler):
 
 @default_pooling_type("CLS")
 class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
-
     is_pooling_model = True
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         self.config = config
-        self.model = ModernBertModel(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "modernbert"))
-        self.classifier = nn.Linear(config.hidden_size,
-                                    config.num_labels,
-                                    dtype=vllm_config.model_config.head_dtype)
+        self.model = ModernBertModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")
+        )
+        self.classifier = nn.Linear(
+            config.hidden_size,
+            config.num_labels,
+            dtype=vllm_config.model_config.head_dtype,
+        )
         self.pooling = ModernBertPooler(config)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            ClassifierPooler(
-                pooling=self.pooling,
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                    vllm_config.model_config),
-            ),
-            "score":
-            ClassifierPooler(
-                pooling=self.pooling,
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                    vllm_config.model_config),
-            ),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
+                "classify": ClassifierPooler(
+                    pooling=self.pooling, classifier=self.classifier, act_fn="classify"
+                ),
+                "score": ClassifierPooler(
+                    pooling=self.pooling, classifier=self.classifier, act_fn="score"
+                ),
+            }
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         self_weights = []
 
         def weight_filter():
             for name, weight in weights:
                 if name.startswith("model."):
-                    yield name[len("model."):], weight
+                    yield name[len("model.") :], weight
                 else:
                     self_weights.append((name, weight))
 
@@ -351,24 +357,94 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
         for name, loaded_weight in self_weights:
             if name.startswith("classifier"):
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             if name.startswith("head"):
-                param = params_dict["pooling." + name[len("head") + 1:]]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                param = params_dict["pooling." + name[len("head") + 1 :]]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
     def forward(
         self,
-        input_ids: Optional[torch.LongTensor],
+        input_ids: torch.LongTensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.model(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
             positions=positions,
         )
+
+
+class ModernBertPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.dense = nn.Linear(
+            config.hidden_size, config.hidden_size, bias=config.classifier_bias
+        )
+        self.act = ACT2FN[config.classifier_activation]
+        self.norm = nn.LayerNorm(
+            config.hidden_size,
+            eps=getattr(config, "norm_eps", 1e-5),
+            bias=getattr(config, "norm_bias", True),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.norm(self.act(self.dense(hidden_states)))
+
+
+@default_pooling_type("ALL")
+class ModernBertForTokenClassification(nn.Module):
+    is_pooling_model = True
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        self.head_dtype = vllm_config.model_config.head_dtype
+        self.num_labels = config.num_labels
+        self.model = ModernBertModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")
+        )
+        self.head = ModernBertPredictionHead(config)
+        self.classifier = nn.Linear(
+            config.hidden_size, config.num_labels, dtype=self.head_dtype
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config=pooler_config
+                ),
+            }
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        loader = AutoWeightsLoader(self, skip_prefixes=["drop"])
+        loaded_params = loader.load_weights(weights)
+        return loaded_params
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
+        hidden_states = self.head(hidden_states)
+        hidden_states = hidden_states.to(self.head_dtype)
+        return self.classifier(hidden_states)
diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py
index 11a2a384c165e574bfd60fe6d534062512e280bd..9e7d997bdb01d0477c864a436e19b71e40dd1afa 100644
--- a/vllm/model_executor/models/module_mapping.py
+++ b/vllm/model_executor/models/module_mapping.py
@@ -5,7 +5,6 @@
 #  https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py
 
 from dataclasses import dataclass, field
-from typing import Union
 
 
 @dataclass
@@ -54,19 +53,22 @@ class MultiModelKeys(ModelKeys):
     generator: list[str] = field(default_factory=list)
 
     @staticmethod
-    def from_string_field(language_model: Union[str, list[str]] = None,
-                          connector: Union[str, list[str]] = None,
-                          tower_model: Union[str, list[str]] = None,
-                          generator: Union[str, list[str]] = None,
-                          **kwargs) -> 'MultiModelKeys':
-
+    def from_string_field(
+        language_model: str | list[str] = None,
+        connector: str | list[str] = None,
+        tower_model: str | list[str] = None,
+        generator: str | list[str] = None,
+        **kwargs,
+    ) -> "MultiModelKeys":
         def to_list(value):
             if value is None:
                 return []
             return [value] if isinstance(value, str) else list(value)
 
-        return MultiModelKeys(language_model=to_list(language_model),
-                              connector=to_list(connector),
-                              tower_model=to_list(tower_model),
-                              generator=to_list(generator),
-                              **kwargs)
+        return MultiModelKeys(
+            language_model=to_list(language_model),
+            connector=to_list(connector),
+            tower_model=to_list(tower_model),
+            generator=to_list(generator),
+            **kwargs,
+        )
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 201bf83cac581af7c6f615299c334726f876e5f1..ab83a271e30a06e05ae619bc5db8633341e4dc77 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -6,15 +6,14 @@ from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass
 from functools import cached_property, partial
 from itertools import islice
-from typing import Annotated, Optional, Union
+from typing import Annotated
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from transformers import (BatchFeature, PretrainedConfig, ProcessorMixin,
-                          TensorType)
+from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
@@ -22,43 +21,65 @@ from vllm.attention import Attention
 from vllm.attention.layer import MultiHeadAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather)
-from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU,
-                                                   SiluAndMul)
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptIndexTargets,
-                                        PromptInsertion, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptIndexTargets,
+    PromptInsertion,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP, SupportsQuant)
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 # TODO: hard-coded for now. Consider making it configurable.
 VIT_LAYERS = [-2, -9]
@@ -75,22 +96,19 @@ class MolmoImageInputs(TensorSchema):
     """
     Dimensions:
         - bn: Batch size * number of images
-        - nc: Number of crops (dynamic)
+        - bnc: Batch size * number of images * number of crops (dynamic)
         - np: Number of patches
         - tp: Token sequence positions
         - pd: Patch dimension
     """
-    images: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                      TensorShape("bn", "nc", "np", "pd", dynamic_dims={"nc"})]
-    # Number of crops may vary per batch and image, so pass it as a list.
 
-    image_masks: Annotated[Optional[Union[torch.Tensor, list[torch.Tensor]]],
-                           TensorShape("bn", "nc", "np", dynamic_dims={"nc"})]
+    images: Annotated[torch.Tensor, TensorShape("bnc", "np", "pd")]
+
+    image_masks: Annotated[torch.Tensor | None, TensorShape("bnc", "np")]
+
+    image_input_idx: Annotated[torch.Tensor, TensorShape("bnc", "tp")]
+    """An index tensor that maps image features to their corresponding patch tokens."""
 
-    feat_is_patch: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "nc", "tp", dynamic_dims={"nc"})]
-    # A boolean mask indicating which image features correspond to patch tokens.
     num_crops: Annotated[torch.Tensor, TensorShape("bn")]
 
 
@@ -109,8 +127,7 @@ class VisionBackboneConfig:
     image_norm_eps: float = 1e-5
 
     def __post_init__(self):
-        self.image_default_input_size = tuple(
-            self.image_default_input_size)  # type: ignore[assignment]
+        self.image_default_input_size = tuple(self.image_default_input_size)  # type: ignore[assignment]
 
     @property
     def image_num_patch(self):
@@ -124,7 +141,7 @@ class ViTMLP(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.w1 = ColumnParallelLinear(
@@ -158,7 +175,7 @@ class MultiHeadDotProductAttention(nn.Module):
         config: VisionBackboneConfig,
         use_bias: bool = True,
         nlayers: int = 1,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
 
@@ -206,15 +223,13 @@ class MultiHeadDotProductAttention(nn.Module):
         )
 
         self.scale = self.head_dim**-0.5
-        self.attn = MultiHeadAttention(self.num_heads,
-                                       self.head_dim,
-                                       self.scale,
-                                       num_kv_heads=self.num_kv_heads)
-
-    def forward(self,
-                inputs_q: torch.Tensor,
-                inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self.attn = MultiHeadAttention(
+            self.num_heads, self.head_dim, self.scale, num_kv_heads=self.num_kv_heads
+        )
 
+    def forward(
+        self, inputs_q: torch.Tensor, inputs_kv: torch.Tensor | None = None
+    ) -> torch.Tensor:
         if inputs_kv is not None:
             inputs_k = inputs_kv
             inputs_v = inputs_kv
@@ -238,11 +253,10 @@ class ResidualAttentionBlock(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
-        self.attention = MultiHeadDotProductAttention(
-            config, quant_config=quant_config)
+        self.attention = MultiHeadDotProductAttention(config, quant_config=quant_config)
         self.feed_forward = ViTMLP(config, quant_config)
         self.attention_norm = nn.LayerNorm(
             config.image_emb_dim,
@@ -265,13 +279,15 @@ class BlockCollection(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
-        self.resblocks = nn.ModuleList([
-            ResidualAttentionBlock(config, quant_config)
-            for _ in range(config.image_num_layers)
-        ])
+        self.resblocks = nn.ModuleList(
+            [
+                ResidualAttentionBlock(config, quant_config)
+                for _ in range(config.image_num_layers)
+            ]
+        )
 
     def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
         hidden_states = []
@@ -291,24 +307,23 @@ class VisionTransformer(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         scale = config.image_emb_dim**-0.5
         self.patch_num = config.image_num_patch
-        self.class_embedding = nn.Parameter(
-            torch.randn(config.image_emb_dim) * scale)
+        self.class_embedding = nn.Parameter(torch.randn(config.image_emb_dim) * scale)
         self.num_prefix_tokens: int = NUM_PREFIX_TOKENS
         self.positional_embedding = nn.Parameter(
-            torch.randn(config.image_num_pos, config.image_emb_dim) * scale)
+            torch.randn(config.image_num_pos, config.image_emb_dim) * scale
+        )
         image_patch_size = config.image_patch_size
         self.patch_embedding = nn.Linear(
             image_patch_size * image_patch_size * 3,
             config.image_emb_dim,
             bias=False,
         )
-        self.pre_ln = nn.LayerNorm(config.image_emb_dim,
-                                   eps=config.image_norm_eps)
+        self.pre_ln = nn.LayerNorm(config.image_emb_dim, eps=config.image_norm_eps)
         self.transformer = BlockCollection(config, quant_config)
 
     def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
@@ -316,8 +331,12 @@ class VisionTransformer(nn.Module):
         pos_emb = self.positional_embedding[1:]
 
         pos_emb = pos_emb.reshape(
-            (int(math.sqrt(pos_emb.shape[0])),
-             int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1]))
+            (
+                int(math.sqrt(pos_emb.shape[0])),
+                int(math.sqrt(pos_emb.shape[0])),
+                pos_emb.shape[1],
+            )
+        )
 
         (patch_num_0, patch_num_1) = patch_num
 
@@ -334,13 +353,12 @@ class VisionTransformer(nn.Module):
             pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
 
         pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
-        x = x + torch.cat([cls_emb[None, :, :], pos_emb[None, :, :]],
-                          dim=1).to(x.dtype)
+        x = x + torch.cat([cls_emb[None, :, :], pos_emb[None, :, :]], dim=1).to(x.dtype)
         return x
 
-    def forward(self,
-                x: torch.Tensor,
-                patch_num: Optional[int] = None) -> list[torch.Tensor]:
+    def forward(
+        self, x: torch.Tensor, patch_num: int | None = None
+    ) -> list[torch.Tensor]:
         """
         : param x: (batch_size, num_patch, n_pixels)
         """
@@ -352,8 +370,8 @@ class VisionTransformer(nn.Module):
 
         # class embeddings and positional embeddings
         x = torch.cat(
-            [_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x],
-            dim=1)
+            [_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1
+        )
         x = self.add_pos_emb(x, patch_num)
 
         x = self.pre_ln(x)
@@ -368,8 +386,8 @@ class MolmoAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -381,8 +399,7 @@ class MolmoAttention(nn.Module):
         assert self.total_num_heads % self.tp_size == 0
 
         self.num_heads = self.total_num_heads // self.tp_size
-        self.total_num_kv_heads = config.num_key_value_heads \
-            or self.total_num_heads
+        self.total_num_kv_heads = config.num_key_value_heads or self.total_num_heads
         if self.total_num_kv_heads >= self.tp_size:
             assert self.total_num_kv_heads % self.tp_size == 0
         else:
@@ -405,15 +422,15 @@ class MolmoAttention(nn.Module):
             quant_config=quant_config,
         )
 
-        self.tp_rank: Optional[int] = None
-        self.k_norm: Optional[nn.Module] = None
-        self.q_norm: Optional[nn.Module] = None
+        self.tp_rank: int | None = None
+        self.k_norm: nn.Module | None = None
+        self.q_norm: nn.Module | None = None
         if config.attention_layer_norm:
             self.tp_rank = get_tensor_model_parallel_rank()
-            self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim,
-                                  eps=config.layer_norm_eps)
-            self.q_norm = RMSNorm(config.hidden_size,
-                                  eps=config.layer_norm_eps)
+            self.k_norm = RMSNorm(
+                self.total_num_kv_heads * self.head_dim, eps=config.layer_norm_eps
+            )
+            self.q_norm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
 
         # Rotary embeddings.
         self.rotary_emb = get_rope(
@@ -423,13 +440,15 @@ class MolmoAttention(nn.Module):
             base=self.rope_theta,
         )
         self.scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
         # Attention output projection.
         self.o_proj = RowParallelLinear(
@@ -439,16 +458,16 @@ class MolmoAttention(nn.Module):
             quant_config=quant_config,
         )
 
-    def _apply_qk_norm(self, q: torch.Tensor,
-                       k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.tp_size > 1:
             q = tensor_model_parallel_all_gather(q.contiguous())
             k = tensor_model_parallel_all_gather(k.contiguous())
         q = self.q_norm(q)
         k = self.k_norm(k)
         if self.tp_size > 1:
-            splitter = partial(split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
         return q, k
@@ -471,10 +490,12 @@ class MolmoAttention(nn.Module):
 class LanguageModelMLP(nn.Module):
     """Molmo's LLM mlp."""
 
-    def __init__(self,
-                 config: PretrainedConfig,
-                 input_dim: Optional[int] = None,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        input_dim: int | None = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size // 2
@@ -511,8 +532,8 @@ class ImageProjectorMLP(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        input_dim: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        input_dim: int | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -546,63 +567,58 @@ class ImageProjectorMLP(nn.Module):
 
 
 class MolmoDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         # Attention block.
-        self.self_attn = MolmoAttention(config,
-                                        cache_config,
-                                        quant_config,
-                                        prefix=f"{prefix}.self_attn")
+        self.self_attn = MolmoAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attn"
+        )
 
         # MLP block.
         self.mlp = LanguageModelMLP(config, quant_config=quant_config)
 
         # LayerNorm
         assert config.layer_norm_type == "rms"
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.layer_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.layer_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 class MolmoDecoderNormAfterLayer(MolmoDecoderLayer):
-
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         # Self Attention
         residual = hidden_states
         hidden_states = self.self_attn(
@@ -628,7 +644,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         self,
         config: PretrainedConfig,
         vision_config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.vit_layers = VIT_LAYERS
@@ -637,16 +653,14 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
             (self.image_num_patch[0] + 1) // POOLING_SIZE,
             (self.image_num_patch[1] + 1) // POOLING_SIZE,
         )
-        self.image_vit = VisionTransformer(vision_config,
-                                           quant_config=quant_config)
+        self.image_vit = VisionTransformer(vision_config, quant_config=quant_config)
         self.num_prefix_tokens = self.image_vit.num_prefix_tokens
-        assert self.num_prefix_tokens in {
-            0, 1
-        }, "Only 0 or 1 prefix tokens are supported"
+        assert self.num_prefix_tokens in {0, 1}, (
+            "Only 0 or 1 prefix tokens are supported"
+        )
         self.image_pooling_2d = MultiHeadDotProductAttention(
-            vision_config,
-            nlayers=len(self.vit_layers),
-            quant_config=quant_config)
+            vision_config, nlayers=len(self.vit_layers), quant_config=quant_config
+        )
         self.image_projector = ImageProjectorMLP(
             config,
             input_dim=vision_config.image_emb_dim,
@@ -670,8 +684,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         """
         B, T, N, D = images.shape
 
-        mask = ~torch.all(
-            images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)
+        mask = ~torch.all(images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)
 
         images = images.view(B * T, N, D)
         image_features = self.image_vit(images)
@@ -706,21 +719,22 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         assert image_masks is not None
         pad_embed = self.pad_embed[:, None, None, None, :]
         all_pad = image_masks == 0
-        partial_pad = torch.logical_and(
-            image_masks < 1,
-            torch.logical_not(all_pad)).to(dtype=torch.float32)
+        partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(
+            dtype=torch.float32
+        )
         all_pad = all_pad.to(dtype=torch.float32)
-        image_features = image_features + pad_embed[0] * torch.unsqueeze(
-            all_pad, -1)
+        image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
         image_features = image_features + pad_embed[1] * torch.unsqueeze(
-            partial_pad, -1)
+            partial_pad, -1
+        )
 
         image_features = image_features.to(og_dtype)
 
         image_features = image_features.reshape(
-            (batch_size, num_image) + self.image_num_patch + (-1, ), )
+            (batch_size, num_image) + self.image_num_patch + (-1,),
+        )
 
-        if (missing_w := self.image_num_patch[0] % POOLING_SIZE):
+        if missing_w := self.image_num_patch[0] % POOLING_SIZE:
             # Padding for image pooling (see below)
             image_features = F.pad(
                 image_features,
@@ -730,7 +744,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         # image pooling
         image_features = rearrange(
             image_features,
-            'b n (h dh) (w dw) c -> (b n h w) (dh dw) c',
+            "b n (h dh) (w dw) c -> (b n h w) (dh dw) c",
             dh=POOLING_SIZE,
             dw=POOLING_SIZE,
         )
@@ -746,8 +760,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         # image_features: (batch_size, num_image, num_patch, d_model)
         return image_features
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("merged_linear", "gate_proj", 0),
@@ -757,7 +770,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -776,8 +789,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -785,7 +797,6 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
 
 @support_torch_compile
 class MolmoModel(nn.Module, SupportsQuant):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -803,34 +814,33 @@ class MolmoModel(nn.Module, SupportsQuant):
             quant_config=quant_config,
         )
 
-        decoder_layer = MolmoDecoderNormAfterLayer if config.norm_after \
-            else MolmoDecoderLayer
+        decoder_layer = (
+            MolmoDecoderNormAfterLayer if config.norm_after else MolmoDecoderLayer
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: decoder_layer(
-                config, cache_config, quant_config, prefix=prefix),
+                config, cache_config, quant_config, prefix=prefix
+            ),
             prefix=f"{prefix}.layers",
         )
 
         assert config.layer_norm_type == "rms"
         self.norm = RMSNorm(config.hidden_size, config.layer_norm_eps)
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-    ) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -851,18 +861,16 @@ class MolmoModel(nn.Module, SupportsQuant):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         if residual is not None:
             hidden_states, _ = self.norm(hidden_states, residual)
         else:
             hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
 
@@ -873,8 +881,7 @@ class MolmoModel(nn.Module, SupportsQuant):
                 continue
 
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -941,8 +948,12 @@ def get_patches_grid_size(
 
 
 def get_candidate_tilings(max_num: int) -> list[tuple[int, int]]:
-    tilings = [(i, j) for i in range(1, max_num + 1)
-               for j in range(1, max_num + 1) if i * j <= max_num]
+    tilings = [
+        (i, j)
+        for i in range(1, max_num + 1)
+        for j in range(1, max_num + 1)
+        if i * j <= max_num
+    ]
     return sorted(tilings, key=lambda x: x[0] * x[1])
 
 
@@ -1043,7 +1054,7 @@ class MolmoProcessorWrapper:
         return image_token_length_h
 
     @property
-    def message_format(self) -> Optional[str]:
+    def message_format(self) -> str | None:
         return "role"
 
     @property
@@ -1124,13 +1135,14 @@ class MolmoProcessorWrapper:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> BatchFeature:
         outputs = self.processor.process(  # type: ignore
-            text, images, **kwargs)
+            text, images, **kwargs
+        )
 
         if images is None:
             images = []
@@ -1148,13 +1160,14 @@ class MolmoProcessorWrapper:
                 self.select_tiling(
                     image_width=image.size[0],
                     image_height=image.size[1],
-                ) for image in images
+                )
+                for image in images
             ]
             # For each image: tiling_h * tiling_w + extra
             num_crops = torch.tensor(tilings).prod(-1) + 1
             assert num_crops.sum() == len(feat_is_patch)
 
-            outputs["feat_is_patch"] = feat_is_patch
+            outputs["image_input_idx"] = image_input_idx
             outputs["num_crops"] = num_crops
             outputs["img_patch_id"] = self.image_patch_id
 
@@ -1162,12 +1175,11 @@ class MolmoProcessorWrapper:
 
 
 class MolmoProcessingInfo(BaseProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object) -> MolmoProcessorWrapper:
         processor = self.ctx.get_hf_processor(**kwargs)
         return MolmoProcessorWrapper(processor)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -1175,7 +1187,7 @@ class MolmoProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[MolmoProcessorWrapper],
+        processor: MolmoProcessorWrapper | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -1189,8 +1201,9 @@ class MolmoProcessingInfo(BaseProcessingInfo):
         image_token_length_w = processor.image_token_length_w
         image_token_length_h = processor.image_token_length_h
 
-        extra = image_token_length_w * image_token_length_h
-        joint = ((ncols + 1) // pooling_size) * ((nrows + 1) // pooling_size)
+        # Calculate total tokens: 2 for start/end + (w+1)*h for column separators
+        extra = 2 + (image_token_length_w + 1) * image_token_length_h
+        joint = 2 + ((ncols + 1) // pooling_size + 1) * ((nrows + 1) // pooling_size)
 
         return extra + joint
 
@@ -1211,8 +1224,7 @@ class MolmoProcessingInfo(BaseProcessingInfo):
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
+                largest_feature_pinpoint = ImageSize(width=width, height=height)
 
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
@@ -1221,7 +1233,6 @@ class MolmoProcessingInfo(BaseProcessingInfo):
 
 
 class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
 
@@ -1229,39 +1240,45 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
-
     def _apply_hf_processor_tokens_only(
         self,
         prompt_tokens: list[int],
     ) -> list[int]:
         processor = self.info.get_hf_processor()
 
-        # Apply the chat template to the tokens
+        # The chat template is already applied to the prompt tokens
+        # Use message_format="none" to avoid applying it again
+        # Prepend an empty space if `always_start_with_space` is True
         tokens = processor.processor.get_tokens_input(  # type: ignore
             self.info.get_tokenizer().decode(prompt_tokens),
-            message_format=processor.message_format,
+            message_format="none",
             always_start_with_space=processor.always_start_with_space,
         )
 
+        # Prepend a BOS token id to the tokens
         processed_data = self.info.ctx.call_hf_processor(
             processor,  # type: ignore
             dict(tokens=tokens),
         )
-        prompt_ids, = processed_data.pop("input_ids").tolist()
+        (prompt_ids,) = processed_data.pop("input_ids").tolist()
 
         return prompt_ids
 
@@ -1275,10 +1292,8 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
 
         return dict(
             images=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
-            image_masks=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_crops),
-            feat_is_patch=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_crops),
+            image_masks=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
+            image_input_idx=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
             num_crops=MultiModalFieldConfig.batched("image"),
             img_patch_id=MultiModalFieldConfig.shared("image", num_images),
         )
@@ -1301,8 +1316,7 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
         img_end_id = processor.im_end_id
 
         extra_row = [img_patch_id] * image_token_length_w + [img_col_id]
-        extra_joint = ([img_start_id] + extra_row * image_token_length_h +
-                       [img_end_id])
+        extra_joint = [img_start_id] + extra_row * image_token_length_h + [img_end_id]
 
         def get_insertion_molmo(item_idx: int):
             images = mm_items.get_items("image", ImageProcessorItems)
@@ -1313,10 +1327,12 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
                 image_height=image_size.height,
             )
 
-            joint_row = ([img_patch_id] * ((ncols + 1) // pooling_size) +
-                         [img_col_id])
-            joint = ([img_start_id] + joint_row *
-                     ((nrows + 1) // pooling_size) + [img_end_id])
+            joint_row = [img_patch_id] * ((ncols + 1) // pooling_size) + [img_col_id]
+            joint = (
+                [img_start_id]
+                + joint_row * ((nrows + 1) // pooling_size)
+                + [img_end_id]
+            )
 
             return PromptUpdateDetails.select_token_id(
                 extra_joint + joint,
@@ -1332,11 +1348,16 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(MolmoMultiModalProcessor,
-                                        info=MolmoProcessingInfo,
-                                        dummy_inputs=MolmoDummyInputsBuilder)
-class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
-                       SupportsQuant):
+@MULTIMODAL_REGISTRY.register_processor(
+    MolmoMultiModalProcessor,
+    info=MolmoProcessingInfo,
+    dummy_inputs=MolmoDummyInputsBuilder,
+)
+class MolmoForCausalLM(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsQuant
+):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             # vision backbone mapping
@@ -1368,11 +1389,11 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
     packed_modules_mapping = {
         "qkv_proj": ["qkv_proj"],
         "gate_up_proj": ["gate_up_proj"],  # language model
-        "merged_linear": ["gate_proj", "up_proj"]  # image_projector
+        "merged_linear": ["gate_proj", "up_proj"],  # image_projector
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -1383,16 +1404,15 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
         self.multimodal_config = multimodal_config
-        self.lora_config = lora_config
 
         vision_config = VisionBackboneConfig()
-        self.vision_backbone = MolmoVisionBackbone(config, vision_config,
-                                                   quant_config)
-        self.model = MolmoModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.vision_backbone = MolmoVisionBackbone(config, vision_config, quant_config)
+        self.model = MolmoModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.img_patch_id = None
 
         if self.config.weight_tying:
@@ -1405,39 +1425,37 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
 
-        self.logits_processor = LogitsProcessor(config.embedding_size
-                                                or config.vocab_size)
+        self.logits_processor = LogitsProcessor(
+            config.embedding_size or config.vocab_size
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
         self,
         **kwargs: object,
-    ) -> Optional[MolmoImageInputs]:
+    ) -> MolmoImageInputs | None:
         images = kwargs.pop("images", None)
         image_masks = kwargs.pop("image_masks", None)
-        feat_is_patch = kwargs.pop("feat_is_patch", None)
+        image_input_idx = kwargs.pop("image_input_idx", None)
         num_crops = kwargs.pop("num_crops", None)
 
         if images is None:
             return None
 
-        if not isinstance(num_crops, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of num_crops. "
-                             f"Got type: {type(num_crops)}")
-        num_crops = flatten_bn(num_crops, concat=True)
-
         img_patch_id = kwargs.pop("img_patch_id", None)
-        if not isinstance(img_patch_id, torch.Tensor):
-            raise ValueError("Incorrect type of img_patch_id. "
-                             f"Got type: {type(img_patch_id)}")
-        self.img_patch_id = img_patch_id.flatten().unique().item()
+        if isinstance(img_patch_id, torch.Tensor):
+            img_patch_id = img_patch_id.item()
+
+        assert isinstance(img_patch_id, int)
+        self.img_patch_id = img_patch_id
 
         return MolmoImageInputs(
             images=images,
             image_masks=image_masks,
-            feat_is_patch=feat_is_patch,
+            image_input_idx=image_input_idx,
             num_crops=num_crops,
         )
 
@@ -1447,82 +1465,53 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
     ) -> list[torch.Tensor]:
         images = image_input["images"]
         image_masks = image_input["image_masks"]
-        feat_is_patch = image_input["feat_is_patch"]
+        image_input_idx = image_input["image_input_idx"]
         num_crops = image_input["num_crops"]
 
         # Call the vision backbone on the whole batch at once
-        images_flat = flatten_bn(images, concat=True)
-        image_masks_flat = (None if image_masks is None else flatten_bn(
-            image_masks, concat=True))
-        feat_is_patch_flat = flatten_bn(feat_is_patch, concat=True)
-
-        image_features_flat = self.vision_backbone(
-            images=images_flat.unsqueeze(0),
-            image_masks=(None if image_masks_flat is None else
-                         image_masks_flat.unsqueeze(0)),
+        image_features = self.vision_backbone(
+            images=images.unsqueeze(0),
+            image_masks=None if image_masks is None else image_masks.unsqueeze(0),
         ).squeeze(0)
 
         # Only the features corresponding to patch tokens are relevant
-        return [
-            feats[f_is_patch] for feats, f_is_patch in zip(
-                image_features_flat.split(num_crops.tolist()),
-                feat_is_patch_flat.split(num_crops.tolist()),
-            )
-        ]
+        # Re-order the features using the image_input_idx tensor
+        results = []
+        num_crops_list = num_crops.tolist()
+        for feats, img_idx in zip(
+            image_features.split(num_crops_list),
+            image_input_idx.split(num_crops_list),
+        ):
+            is_valid = img_idx >= 0
+            valid_img_idx = img_idx[is_valid]
+            order = torch.argsort(valid_img_idx)
+            results.append(feats[is_valid][order])
+        return results
 
     def get_language_model(self) -> torch.nn.Module:
         return self.model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            assert self.img_patch_id is not None
-
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.img_patch_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.LongTensor,
         positions: torch.LongTensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
-
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.model(input_ids,
-                                   positions,
-                                   intermediate_tensors,
-                                   inputs_embeds=inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
@@ -1531,7 +1520,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         loader = AutoWeightsLoader(self)
         weights = _get_weights_with_merged_embedding(weights)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
@@ -1548,7 +1536,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
 
 
 def _get_weights_with_merged_embedding(
-    weights: Iterable[tuple[str, torch.Tensor]]
+    weights: Iterable[tuple[str, torch.Tensor]],
 ) -> Iterable[tuple[str, torch.Tensor]]:
     embedding_weights = {}
     for name, weight in weights:
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index caa00763fc3d45a140871058b5518f4511320b4d..8017c947bf9ad7f9b068a164099b2a0991c3ae6a 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -45,12 +45,11 @@
 from collections.abc import Sequence
 from copy import deepcopy
 from functools import cached_property
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers.activations import ACT2FN, PytorchGELUTanh
+from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import is_flash_attn_2_available
 
@@ -68,8 +67,8 @@ def multihead_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    q_cu_seqlens: Optional[torch.Tensor] = None,
-    k_cu_seqlens: Optional[torch.Tensor] = None,
+    q_cu_seqlens: torch.Tensor | None = None,
+    k_cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """Multi-head attention using flash attention 2.
 
@@ -91,10 +90,10 @@ def multihead_attention(
     """
     # Unified format legal check
     assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
-    assert q_cu_seqlens[-1] == q.shape[
-        0], "q_cu_seqlens must sum to q.shape[0]"
-    assert (k_cu_seqlens[-1] == k.shape[0] ==
-            v.shape[0]), "k_cu_seqlens must sum to k.shape[0]"
+    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
+    assert k_cu_seqlens[-1] == k.shape[0] == v.shape[0], (
+        "k_cu_seqlens must sum to k.shape[0]"
+    )
     assert q.dtype in [
         torch.bfloat16,
         torch.float16,
@@ -121,8 +120,8 @@ def sdpa_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    q_cu_seqlens: Optional[torch.Tensor] = None,
-    k_cu_seqlens: Optional[torch.Tensor] = None,
+    q_cu_seqlens: torch.Tensor | None = None,
+    k_cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """SDPA attention.
 
@@ -137,23 +136,19 @@ def sdpa_attention(
         k_cu_seqlens: Optional cumulative sequence lengths of k.
     """
     seq_length = q.shape[0]
-    attention_mask = torch.zeros([1, seq_length, seq_length],
-                                 device=q.device,
-                                 dtype=torch.bool)
+    attention_mask = torch.zeros(
+        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
+    )
     for i in range(1, len(q_cu_seqlens)):
         attention_mask[
             ...,
-            q_cu_seqlens[i - 1]:q_cu_seqlens[i],
-            q_cu_seqlens[i - 1]:q_cu_seqlens[i],
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
         ] = True
     q = q.transpose(0, 1)
     k = k.transpose(0, 1)
     v = v.transpose(0, 1)
-    attn_output = F.scaled_dot_product_attention(q,
-                                                 k,
-                                                 v,
-                                                 attention_mask,
-                                                 dropout_p=0.0)
+    attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
     attn_output = attn_output.transpose(0, 1)
     attn_output = attn_output.reshape(seq_length, -1)
     return attn_output
@@ -172,8 +167,9 @@ def _apply_rope_input_validation(x, freqs_cis):
     assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype
 
 
-def apply_rope(xq: torch.Tensor, xk: torch.Tensor,
-               freqs_cis: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+def apply_rope(
+    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Args: (The leading dimensions of all inputs should be the same)
         xq: query, tensor of shape (..., num_heads, head_dim)
@@ -189,20 +185,15 @@ def apply_rope(xq: torch.Tensor, xk: torch.Tensor,
     # ..., num_heads, head_dim/2
     xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
-    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(
-        -2)  # ..., num_heads, head_dim
-    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(
-        -2)  # ..., num_heads, head_dim
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
     return xq_out.type_as(xq), xk_out.type_as(xk)
 
 
 class Learnable2DInterpPosEmb(nn.Module):
-
-    def __init__(self,
-                 height: int,
-                 width: int,
-                 dim: int,
-                 interpolation_mode: str = "bicubic") -> None:
+    def __init__(
+        self, height: int, width: int, dim: int, interpolation_mode: str = "bicubic"
+    ) -> None:
         super().__init__()
         self.height = height
         self.width = width
@@ -224,39 +215,42 @@ class Learnable2DInterpPosEmb(nn.Module):
                         self.weight.permute((2, 0, 1)).unsqueeze(0),
                         size=shape,
                         mode=self.interpolation_mode,
-                    ).squeeze(0).permute((1, 2, 0)).flatten(end_dim=1))
+                    )
+                    .squeeze(0)
+                    .permute((1, 2, 0))
+                    .flatten(end_dim=1)
+                )
         out = x + torch.cat(pos_embs)
         return out
 
 
 class MoonVisionPatchEmbed(nn.Module):
-
     def __init__(
         self,
         out_dim: int,
         in_dim: int = 3,
-        patch_size: Union[int, tuple[int, int]] = (14, 14),
+        patch_size: int | tuple[int, int] = (14, 14),
         pos_emb_height: int = 14,
         pos_emb_width: int = 14,
     ):
         super().__init__()
-        assert isinstance(
-            patch_size,
-            (int, Sequence)), f"Invalid patch_size type: {type(patch_size)}"
+        assert isinstance(patch_size, (int, Sequence)), (
+            f"Invalid patch_size type: {type(patch_size)}"
+        )
         if isinstance(patch_size, int):
             patch_size = (patch_size, patch_size)
-        assert (len(patch_size) == 2
-                ), f"Expected patch_size to be a tuple of 2, got {patch_size}"
+        assert len(patch_size) == 2, (
+            f"Expected patch_size to be a tuple of 2, got {patch_size}"
+        )
         self.patch_size = patch_size
 
-        self.proj = nn.Conv2d(in_dim,
-                              out_dim,
-                              kernel_size=patch_size,
-                              stride=patch_size)
+        self.proj = nn.Conv2d(
+            in_dim, out_dim, kernel_size=patch_size, stride=patch_size
+        )
 
-        self.pos_emb = Learnable2DInterpPosEmb(height=pos_emb_height,
-                                               width=pos_emb_width,
-                                               dim=out_dim)
+        self.pos_emb = Learnable2DInterpPosEmb(
+            height=pos_emb_height, width=pos_emb_width, dim=out_dim
+        )
 
     def forward(self, x: torch.Tensor, grid_hw: torch.Tensor) -> torch.Tensor:
         """
@@ -295,12 +289,9 @@ class Rope2DPosEmb(nn.Module):
         device (str): the device to store the precomputed cis
     """
 
-    def __init__(self,
-                 dim: int,
-                 max_height: int,
-                 max_width: int,
-                 theta_base=10000,
-                 device="cuda"):
+    def __init__(
+        self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
+    ):
         super().__init__()
         self.dim = dim
         assert self.dim % 4 == 0, "dim must be divisible by 4"
@@ -325,18 +316,18 @@ class Rope2DPosEmb(nn.Module):
         flat_pos = torch.arange(0, N).float().to(self.device)
         x_pos = flat_pos % self.max_width
         y_pos = flat_pos // self.max_width
-        dim_range = (torch.arange(0, self.dim,
-                                  4)[:(self.dim // 4)].float().to(self.device)
-                     )  # C/4
-        freqs = 1.0 / (self.theta_base**(dim_range / self.dim))
+        dim_range = (
+            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(self.device)
+        )  # C/4
+        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
         x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
         y_freqs = torch.outer(y_pos, freqs).float()  # N, C/4
         x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)  # N, C/4
         y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)  # N, C/4
         # N, C/4, 2
         freqs_cis = torch.cat(
-            [x_cis.unsqueeze(dim=-1),
-             y_cis.unsqueeze(dim=-1)], dim=-1)
+            [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
+        )
         # max_height, max_width, C/2
         freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
         return freqs_cis
@@ -349,12 +340,13 @@ class Rope2DPosEmb(nn.Module):
             freqs_cis: tensor of shape (sum(t * height * width), dim//2)
         """
         shapes = grid_hws.tolist()
-        assert all(1 <= h <= self.max_height and 1 <= w <= self.max_width
-                   for h, w in shapes), (
-                       shapes,
-                       self.max_height,
-                       self.max_width,
-                   )
+        assert all(
+            1 <= h <= self.max_height and 1 <= w <= self.max_width for h, w in shapes
+        ), (
+            shapes,
+            self.max_height,
+            self.max_width,
+        )
         freqs_cis = torch.cat(
             [
                 self.precomputed_freqs_cis[:h, :w].reshape(-1, self.dim // 2)
@@ -364,8 +356,9 @@ class Rope2DPosEmb(nn.Module):
         )
         return freqs_cis
 
-    def get_freqs_cis_by_idx(self, pos_idx: torch.Tensor,
-                             pos_idx_mask: torch.Tensor) -> torch.Tensor:
+    def get_freqs_cis_by_idx(
+        self, pos_idx: torch.Tensor, pos_idx_mask: torch.Tensor
+    ) -> torch.Tensor:
         """
         Args:
             pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
@@ -374,16 +367,20 @@ class Rope2DPosEmb(nn.Module):
         Return:
             freqs_cis: tensor of shape (..., dim//2)
         """
-        assert (pos_idx.shape[:-1] == pos_idx_mask.shape
-                and pos_idx.shape[-1] == 2 and pos_idx.ndim
-                == pos_idx_mask.ndim + 1), (pos_idx.shape, pos_idx_mask.shape)
+        assert (
+            pos_idx.shape[:-1] == pos_idx_mask.shape
+            and pos_idx.shape[-1] == 2
+            and pos_idx.ndim == pos_idx_mask.ndim + 1
+        ), (pos_idx.shape, pos_idx_mask.shape)
         assert pos_idx_mask.dtype == torch.bool, pos_idx_mask.dtype
 
-        shp = pos_idx_mask.shape + (self.dim // 2, )  # ..., head_dim/2
-        freqs_cis = torch.ones(shp, dtype=torch.complex64,
-                               device=self.device)  # ..., head_dim/2
-        freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[pos_idx[
-            ..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]]
+        shp = pos_idx_mask.shape + (self.dim // 2,)  # ..., head_dim/2
+        freqs_cis = torch.ones(
+            shp, dtype=torch.complex64, device=self.device
+        )  # ..., head_dim/2
+        freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[
+            pos_idx[..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]
+        ]
         return freqs_cis
 
 
@@ -394,23 +391,23 @@ class MLP2(nn.Module):
         bias: whether to use bias in linear layer.
     """
 
-    def __init__(self,
-                 dims: list[int],
-                 activation,
-                 bias: bool = True,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        dims: list[int],
+        activation,
+        bias: bool = True,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         assert len(dims) == 3
         self.use_data_parallel = use_data_parallel
-        self.fc0 = ReplicatedLinear(dims[0],
-                                    dims[1],
-                                    bias=bias,
-                                    prefix=maybe_prefix(prefix, "fc0"))
-        self.fc1 = ReplicatedLinear(dims[1],
-                                    dims[2],
-                                    bias=bias,
-                                    prefix=maybe_prefix(prefix, "fc1"))
+        self.fc0 = ReplicatedLinear(
+            dims[0], dims[1], bias=bias, prefix=maybe_prefix(prefix, "fc0")
+        )
+        self.fc1 = ReplicatedLinear(
+            dims[1], dims[2], bias=bias, prefix=maybe_prefix(prefix, "fc1")
+        )
         self.activation = activation
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -421,7 +418,6 @@ class MLP2(nn.Module):
 
 
 class MoonVitEncoderLayer(nn.Module):
-
     def __init__(
         self,
         num_heads: int,
@@ -446,24 +442,24 @@ class MoonVitEncoderLayer(nn.Module):
         self.norm0 = nn.LayerNorm(hidden_dim)
         self.norm1 = nn.LayerNorm(hidden_dim)
         self.use_data_parallel = use_data_parallel
-        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim],
-                        activation,
-                        prefix=f"{prefix}.mlp",
-                        use_data_parallel=use_data_parallel)
-        self.wqkv = ReplicatedLinear(hidden_dim,
-                                     hidden_dim * 3,
-                                     bias=attn_bias,
-                                     prefix=f"{prefix}.wqkv")
-        self.wo = ReplicatedLinear(hidden_dim,
-                                   hidden_dim,
-                                   bias=attn_bias,
-                                   prefix=f"{prefix}.wo")
+        self.mlp = MLP2(
+            [hidden_dim, mlp_dim, hidden_dim],
+            activation,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+        self.wqkv = ReplicatedLinear(
+            hidden_dim, hidden_dim * 3, bias=attn_bias, prefix=f"{prefix}.wqkv"
+        )
+        self.wo = ReplicatedLinear(
+            hidden_dim, hidden_dim, bias=attn_bias, prefix=f"{prefix}.wo"
+        )
 
     def attention_qkvpacked(
         self,
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rope_freqs_cis: Optional[torch.Tensor] = None,
+        rope_freqs_cis: torch.Tensor | None = None,
     ):
         """
         Args:
@@ -484,11 +480,9 @@ class MoonVitEncoderLayer(nn.Module):
         xq, xk = apply_rope(xq, xk, rope_freqs_cis)
 
         attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation]
-        attn_out = attn_func(xq,
-                             xk,
-                             xv,
-                             q_cu_seqlens=cu_seqlens,
-                             k_cu_seqlens=cu_seqlens)
+        attn_out = attn_func(
+            xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens
+        )
         attn_out, _ = self.wo(attn_out)
         return attn_out
 
@@ -496,7 +490,7 @@ class MoonVitEncoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rope_freqs_cis: Union[torch.Tensor, None] = None,
+        rope_freqs_cis: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """
         Args:
@@ -507,9 +501,9 @@ class MoonVitEncoderLayer(nn.Module):
         """
         residual = hidden_states
         hidden_states = self.norm0(hidden_states)
-        attn_out = self.attention_qkvpacked(hidden_states,
-                                            cu_seqlens,
-                                            rope_freqs_cis=rope_freqs_cis)
+        attn_out = self.attention_qkvpacked(
+            hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+        )
         hidden_states = residual + attn_out
 
         residual = hidden_states
@@ -519,7 +513,6 @@ class MoonVitEncoderLayer(nn.Module):
 
 
 class MoonVitEncoder(nn.Module):
-
     def __init__(
         self,
         hidden_dim: int,
@@ -531,27 +524,37 @@ class MoonVitEncoder(nn.Module):
         super().__init__()
 
         self.rope_2d = Rope2DPosEmb(
-            block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512)
+            block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512
+        )
         self.blocks = nn.ModuleList(
-            [MoonVitEncoderLayer(use_data_parallel=use_data_parallel, \
-                                 prefix=f"{prefix}.blocks.{layer_idx}", \
-                                 **block_cfg) for layer_idx in range(num_layers)])
+            [
+                MoonVitEncoderLayer(
+                    use_data_parallel=use_data_parallel,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                    **block_cfg,
+                )
+                for layer_idx in range(num_layers)
+            ]
+        )
         self.final_layernorm = nn.LayerNorm(hidden_dim)
 
-    def forward(self, hidden_states: torch.Tensor,
-                grid_hw: torch.Tensor) -> torch.Tensor:
-        rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(
-            grid_hws=grid_hw)
+    def forward(
+        self, hidden_states: torch.Tensor, grid_hw: torch.Tensor
+    ) -> torch.Tensor:
+        rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(grid_hws=grid_hw)
 
         lengths = torch.cat(
-            (torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
-             (grid_hw[:, 0] * grid_hw[:, 1]).to(hidden_states.device)))
+            (
+                torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
+                (grid_hw[:, 0] * grid_hw[:, 1]).to(hidden_states.device),
+            )
+        )
         cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
 
         for _, block in enumerate(self.blocks):
-            hidden_states = block(hidden_states,
-                                  cu_seqlens,
-                                  rope_freqs_cis=rope_freqs_cis)
+            hidden_states = block(
+                hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+            )
 
         hidden_states = self.final_layernorm(hidden_states)
 
@@ -559,9 +562,9 @@ class MoonVitEncoder(nn.Module):
 
 
 def patch_merger(
-        x: torch.Tensor,
-        grid_hw: torch.Tensor,
-        merge_kernel_size: list[int, int] = (2, 2),
+    x: torch.Tensor,
+    grid_hw: torch.Tensor,
+    merge_kernel_size: list[int, int] = (2, 2),
 ) -> list[torch.Tensor]:
     d_model = x.size(-1)
 
@@ -570,15 +573,17 @@ def patch_merger(
     for x_shape in grid_hw.tolist():
         height, width = x_shape[0], x_shape[1]
         # Get the current sequence
-        seq = x[pre_sum:pre_sum + height * width]
+        seq = x[pre_sum : pre_sum + height * width]
         # Reshape along self.merge_kernel_size and concat to the last dimension
         kernel_height, kernel_width = merge_kernel_size
         new_height, new_width = height // kernel_height, width // kernel_width
-        reshaped_seq = seq.view(new_height, kernel_height, new_width,
-                                kernel_width, d_model)
+        reshaped_seq = seq.view(
+            new_height, kernel_height, new_width, kernel_width, d_model
+        )
         reshaped_seq = reshaped_seq.permute(0, 2, 1, 3, 4).contiguous()
-        padded_seq = reshaped_seq.view(new_height * new_width,
-                                       kernel_height * kernel_width, -1)
+        padded_seq = reshaped_seq.view(
+            new_height * new_width, kernel_height * kernel_width, -1
+        )
         outputs.append(padded_seq)
         pre_sum += height * width
 
@@ -586,7 +591,6 @@ def patch_merger(
 
 
 class MoonVitVLProjector(nn.Module):
-
     def __init__(
         self,
         in_channels: int,
@@ -596,13 +600,10 @@ class MoonVitVLProjector(nn.Module):
         out_dim: int = 4096,
     ):
         super().__init__()
-        self.hidden_size = in_channels * merge_kernel_size[
-            0] * merge_kernel_size[1]
+        self.hidden_size = in_channels * merge_kernel_size[0] * merge_kernel_size[1]
 
         self.pre_norm = nn.nn.LayerNorm(in_channels, eps=ln_eps)
-        self.linear_1 = nn.Linear(self.hidden_size,
-                                  self.hidden_size,
-                                  bias=True)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
         self.act = ACT2FN[hidden_act]
         self.linear_2 = nn.Linear(self.hidden_size, out_dim, bias=True)
 
@@ -621,12 +622,14 @@ class MoonVitPretrainedModel(PreTrainedModel):
     _supports_flash_attn_2 = True
     _supports_sdpa = True
 
-    def __init__(self,
-                 config: MoonViTConfig,
-                 use_data_parallel: bool = False,
-                 prefix: str = "",
-                 *inputs,
-                 **kwargs):
+    def __init__(
+        self,
+        config: MoonViTConfig,
+        use_data_parallel: bool = False,
+        prefix: str = "",
+        *inputs,
+        **kwargs,
+    ):
         super().__init__(config, *inputs, **kwargs)
         config = deepcopy(config)
         self.use_data_parallel = use_data_parallel
@@ -648,15 +651,16 @@ class MoonVitPretrainedModel(PreTrainedModel):
                 "num_heads": config.num_attention_heads,
                 "hidden_dim": config.hidden_size,
                 "mlp_dim": config.intermediate_size,
-                "activation": PytorchGELUTanh(),
+                "activation": ACT2FN["gelu_pytorch_tanh"],
                 "attn_bias": True,
                 "attn_implementation": config._attn_implementation,
             },
             prefix=f"{prefix}.encoder",
         )
 
-    def forward(self, pixel_values: torch.Tensor,
-                grid_hw: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, pixel_values: torch.Tensor, grid_hw: torch.Tensor
+    ) -> torch.Tensor:
         """
         Args:
             pixel_values (torch.Tensor): The input pixel values.
@@ -667,7 +671,7 @@ class MoonVitPretrainedModel(PreTrainedModel):
         """
         hidden_states = self.patch_embed(pixel_values, grid_hw)
         hidden_states = self.encoder(hidden_states, grid_hw)
-        hidden_states = patch_merger(hidden_states,
-                                     grid_hw,
-                                     merge_kernel_size=self.merge_kernel_size)
+        hidden_states = patch_merger(
+            hidden_states, grid_hw, merge_kernel_size=self.merge_kernel_size
+        )
         return hidden_states
diff --git a/vllm/model_executor/models/motif.py b/vllm/model_executor/models/motif.py
deleted file mode 100644
index 153f36dcf1f5509c61bba8fa128029ad9bf75dd9..0000000000000000000000000000000000000000
--- a/vllm/model_executor/models/motif.py
+++ /dev/null
@@ -1,345 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Adapted from
-# https://huggingface.co/Motif-Technologies/Motif-2.6B/blob/main/modeling_motif.py
-# Copyright (c) Alibaba Cloud.
-# LICENSE: https://huggingface.co/Motif-Technologies/Motif-2.6B/blob/main/LICENSE
-"""Inference-only Motif model compatible with HuggingFace weights."""
-import math
-from typing import Any, Optional
-
-import torch
-from torch import nn
-from transformers import PretrainedConfig
-
-from vllm.attention import Attention, AttentionType
-from vllm.attention.selector import _Backend
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.models.llama import LlamaForCausalLM
-
-from .adapters import as_seq_cls_model
-from .interfaces import SupportsV0Only
-from .utils import extract_layer_index
-
-
-class MotifMLP(nn.Module):
-    """MLP for the language component of the Motif model, which contains a
-    MergedColumnParallelLinear merging 2 outputs via PolyNorm activation."""
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str = "poly_norm",
-        quant_config: Optional[QuantizationConfig] = None,
-        bias: bool = False,
-        prefix: str = "",
-        reduce_results: bool = True,
-    ):
-        super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            input_size=hidden_size,
-            output_sizes=[intermediate_size] * 2,
-            bias=bias,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj",
-        )
-        self.down_proj = RowParallelLinear(
-            input_size=intermediate_size,
-            output_size=hidden_size,
-            bias=bias,
-            quant_config=quant_config,
-            reduce_results=reduce_results,
-            prefix=f"{prefix}.down_proj",
-        )
-        if hidden_act != "poly_norm":
-            raise NotImplementedError(f"Unsupported activation: {hidden_act}. "
-                                      "Only poly_norm is supported for now.")
-        self.act_fn = PolyNorm()
-        self.intermediate_size = intermediate_size
-        tp_size = get_tensor_model_parallel_world_size()
-        if hidden_act == "poly_norm" and tp_size > 1:
-            raise NotImplementedError(
-                "Tensor parallelism for poly_norm is not supported yet. "
-                "Support will be added in the future.")
-
-    def forward(self, x):
-        x, _ = self.gate_up_proj(x)
-        x = self.act_fn(
-            x[..., :self.intermediate_size]) * x[..., self.intermediate_size:]
-        x, _ = self.down_proj(x)
-        return x
-
-
-class MotifAttention(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
-        bias: bool = False,
-        bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        prefix: str = "",
-        attn_type: str = AttentionType.DECODER,
-    ) -> None:
-        super().__init__()
-        layer_idx = extract_layer_index(prefix)
-        self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.total_num_kv_heads = num_kv_heads
-        if self.total_num_kv_heads >= tp_size:
-            # Number of KV heads is greater than TP size, so we partition
-            # the KV heads across multiple tensor parallel GPUs.
-            assert self.total_num_kv_heads % tp_size == 0
-        else:
-            # Number of KV heads is less than TP size, so we replicate
-            # the KV heads across multiple tensor parallel GPUs.
-            assert tp_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
-        head_dim = getattr(config, "head_dim", None)
-        if head_dim is None:
-            head_dim = self.hidden_size // self.total_num_heads
-        self.head_dim = head_dim
-        # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
-                                             1)
-        self.q_size = self.num_heads * self.head_dim
-        self.kv_size = self.num_kv_heads * self.head_dim
-        self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        assert self.num_heads % 2 == 0, 'num_heads should be even'
-        assert self.num_kv_heads % 2 == 0, 'num_heads should be even'
-
-        self.qkv_proj = QKVParallelLinear(
-            hidden_size=hidden_size,
-            head_size=self.head_dim,
-            total_num_heads=self.total_num_heads,
-            total_num_kv_heads=self.total_num_kv_heads,
-            bias=bias,
-            quant_config=quant_config,
-            prefix=f"{prefix}.qkv_proj",
-        )
-
-        self.o_proj = RowParallelLinear(
-            input_size=self.total_num_heads * self.head_dim,
-            output_size=hidden_size,
-            bias=bias_o_proj,
-            quant_config=quant_config,
-            prefix=f"{prefix}.o_proj",
-        )
-
-        self._init_rotary_emb(config,
-                              rope_scaling=rope_scaling,
-                              quant_config=quant_config)
-        sliding_window = None
-
-        self.lambda_init = self.lambda_init_fn(layer_idx)
-        self.lambda_q1 = nn.Parameter(
-            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
-                                                                    std=0.1))
-        self.lambda_k1 = nn.Parameter(
-            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
-                                                                    std=0.1))
-        self.lambda_q2 = nn.Parameter(
-            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
-                                                                    std=0.1))
-        self.lambda_k2 = nn.Parameter(
-            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
-                                                                    std=0.1))
-        self.subln = RMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)
-
-        params = {
-            'differential_flash_attention_config': {
-                'lambda_init': self.lambda_init,
-                'lambda_q1': self.lambda_q1,
-                'lambda_k1': self.lambda_k1,
-                'lambda_q2': self.lambda_q2,
-                'lambda_k2': self.lambda_k2,
-                "subln": self.subln,
-            }
-        }
-
-        diff_attn_err_msg = (
-            'Set VLLM_ATTENTION_BACKEND="DIFFERENTIAL_FLASH_ATTN" '
-            'to enable Differential Flash Attention.')
-        try:
-            self.attn = Attention(
-                self.num_heads,
-                self.head_dim,
-                self.scaling,
-                num_kv_heads=self.num_kv_heads,
-                cache_config=cache_config,
-                quant_config=quant_config,
-                per_layer_sliding_window=sliding_window,
-                attn_type=attn_type,
-                prefix=f"{prefix}.attn",
-                **params,
-            )
-        except TypeError as e:
-            raise ValueError(diff_attn_err_msg) from e
-        assert (self.attn.backend == _Backend.DIFFERENTIAL_FLASH_ATTN
-                ), diff_attn_err_msg
-
-    def lambda_init_fn(self, depth):
-        return 0.8 - 0.6 * math.exp(-0.3 * (depth - 1))
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-    def _init_rotary_emb(self, config: PretrainedConfig,
-                         rope_scaling: Optional[dict[str, Any]],
-                         quant_config: Optional[QuantizationConfig]) -> None:
-        is_neox_style = True
-        is_gguf = quant_config and quant_config.get_name() == "gguf"
-        if is_gguf and config.model_type == "llama":
-            is_neox_style = False
-
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=self.max_position_embeddings,
-            base=self.rope_theta,
-            rope_scaling=rope_scaling,
-            is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
-        )
-
-
-class MotifDecoderLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
-            rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "use_bias", False)
-        bias_o_proj = attention_bias
-        if hasattr(config, 'qkv_bias'):
-            attention_bias = config.qkv_bias
-
-        # By default, Motif uses causal attention as it is a decoder-only model.
-        # You can override the HF config with `is_causal=False` to enable
-        # bidirectional attention, which is used in some embedding models
-        # (e.g. parasail-ai/GritLM-7B-vllm)
-        if getattr(config, "is_causal", True):
-            attn_type = AttentionType.DECODER
-        else:
-            attn_type = AttentionType.ENCODER_ONLY
-
-        self.self_attn = MotifAttention(
-            config=config,
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            quant_config=quant_config,
-            bias=attention_bias,
-            bias_o_proj=bias_o_proj,
-            cache_config=cache_config,
-            prefix=f"{prefix}.self_attn",
-            attn_type=attn_type,
-        )
-        self.mlp = MotifMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-            quant_config=quant_config,
-            bias=getattr(config, "use_bias", False),
-            prefix=f"{prefix}.mlp",
-        )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(positions=positions,
-                                       hidden_states=hidden_states)
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-        return hidden_states, residual
-
-
-# Motif model uses differential attention
-# Only supported in v0 (no chunked prefill support)
-class MotifForCausalLM(LlamaForCausalLM, SupportsV0Only):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 layer_type: type[nn.Module] = MotifDecoderLayer):
-
-        # Prefix caching and chunked prefill is not supported for this model.
-        assert not vllm_config.cache_config.enable_prefix_caching, \
-            "Motif currently does not support prefix caching"
-        assert not vllm_config.scheduler_config.chunked_prefill_enabled, \
-            "Motif currently does not support chunked prefill"
-
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=layer_type)
-
-
-MotifForSequenceClassification = as_seq_cls_model(MotifForCausalLM)
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 64d669e8ac3e136e09952f2301da336371ad189a..106ad971a321a78b7ad52deb97885a25909f0272 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -5,7 +5,6 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -14,30 +13,38 @@ from transformers import MptConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 def _get_alibi_slopes(
     total_num_heads: int,
     alibi_bias_max: int,
 ) -> torch.Tensor:
-    next_power_of_2 = 2**math.ceil(math.log2(total_num_heads))
+    next_power_of_2 = 2 ** math.ceil(math.log2(total_num_heads))
     m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32)
     m = m.mul(alibi_bias_max / next_power_of_2)
     slopes = 1.0 / torch.pow(2, m)
@@ -47,12 +54,11 @@ def _get_alibi_slopes(
 
 
 class MPTAttention(nn.Module):
-
     def __init__(
         self,
         config: MptConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -77,6 +83,7 @@ class MPTAttention(nn.Module):
             self.total_num_kv_heads,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.Wqkv",
         )
         if self.qk_ln:
             self.q_ln = nn.LayerNorm(self.d_model)
@@ -86,6 +93,7 @@ class MPTAttention(nn.Module):
             self.d_model,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
 
         tp_world_size = get_tensor_model_parallel_world_size()
@@ -107,20 +115,21 @@ class MPTAttention(nn.Module):
         tp_rank = get_tensor_model_parallel_rank()
         head_start = tp_rank * self.num_heads
         head_end = (tp_rank + 1) * self.num_heads
-        alibi_slopes = _get_alibi_slopes(self.total_num_heads,
-                                         self.alibi_bias_max)
+        alibi_slopes = _get_alibi_slopes(self.total_num_heads, self.alibi_bias_max)
         alibi_slopes = alibi_slopes[head_start:head_end].tolist()
 
         self.head_dim = self.d_model // self.total_num_heads
         scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scaling,
-                              alibi_slopes=alibi_slopes,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scaling,
+            alibi_slopes=alibi_slopes,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -141,11 +150,11 @@ class MPTAttention(nn.Module):
 
 
 class MPTMLP(nn.Module):
-
     def __init__(
         self,
         config: MptConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.d_model
@@ -156,6 +165,7 @@ class MPTMLP(nn.Module):
             intermediate_size,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
         )
         self.act = get_act_fn("gelu")
         self.down_proj = RowParallelLinear(
@@ -163,6 +173,7 @@ class MPTMLP(nn.Module):
             hidden_size,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -173,23 +184,21 @@ class MPTMLP(nn.Module):
 
 
 class MPTBlock(nn.Module):
-
     def __init__(
         self,
         config: MptConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.d_model
         self.norm_1 = nn.LayerNorm(hidden_size)
-        self.attn = MPTAttention(config,
-                                 cache_config,
-                                 quant_config,
-                                 prefix=f"{prefix}.attn")
+        self.attn = MPTAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.norm_2 = nn.LayerNorm(hidden_size)
-        self.ffn = MPTMLP(config, quant_config)
+        self.ffn = MPTMLP(config, quant_config, prefix=f"{prefix}.ffn")
 
     def forward(
         self,
@@ -210,7 +219,6 @@ class MPTBlock(nn.Module):
 
 @support_torch_compile
 class MPTModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -227,35 +235,34 @@ class MPTModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.blocks = make_layers(
             config.n_layers,
-            lambda prefix: MPTBlock(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.blocks")
+            lambda prefix: MPTBlock(config, cache_config, quant_config, prefix=prefix),
+            prefix=f"{prefix}.blocks",
+        )
         self.norm_f = nn.LayerNorm(config.d_model)
         if config.no_bias:
             for module in self.modules():
-                if hasattr(module, "bias") and isinstance(
-                        module.bias, nn.Parameter):
+                if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
                     # Remove the bias term in Linear and LayerNorm.
                     module.register_parameter("bias", None)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.d_model))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.d_model
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -267,8 +274,7 @@ class MPTModel(nn.Module):
         hidden_states = self.norm_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -278,15 +284,13 @@ class MPTModel(nn.Module):
             if is_pp_missing_parameter(name, self):
                 continue
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class MPTForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -295,35 +299,37 @@ class MPTForCausalLM(nn.Module, SupportsPP):
         assert config.tie_word_embeddings
         self.quant_config = quant_config
 
-        self.transformer = MPTModel(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "transformer"))
+        self.transformer = MPTModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
         self.lm_head = self.transformer.wte
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 2b68d40cf2c678741871ba92570e39b805401030..cb39c2ae482d2b3d04565e69c0d6821735c41637 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -11,47 +11,78 @@ import copy
 import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union
+from typing import Annotated, Any, Literal, TypeAlias, TypeVar
 
 import numpy.typing as npt
+import regex as re
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
 from PIL import Image
-from transformers import (BatchEncoding, BatchFeature, PretrainedConfig,
-                          TensorType)
+from transformers import BatchFeature, PretrainedConfig, TensorType
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                   MultiModalEmbeddings,
-                                                   SupportsMultiModal)
-from vllm.model_executor.models.internvl import (calculate_internvl_targets,
-                                                 get_internvl_target_ratios)
+from vllm.model_executor.models.interfaces import (
+    HasInnerState,
+    IsHybrid,
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsMultiModalPruning,
+)
+from vllm.model_executor.models.internvl import (
+    calculate_internvl_targets,
+    get_internvl_target_ratios,
+)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM
 from vllm.model_executor.models.radio import RadioModel
-from vllm.model_executor.models.utils import (flatten_bn,
-                                              init_vllm_registered_model,
-                                              maybe_prefix,
-                                              merge_multimodal_embeddings)
+from vllm.model_executor.models.utils import (
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, MultiModalKwargsItems,
-                                    NestedTensors)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.evs import (
+    compute_retained_tokens_count,
+    compute_retention_mask,
+)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+    _seq2tokens,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    cached_tokenizer_from_config,
+    encode_tokens,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
+from .utils import _merge_multimodal_embeddings
+
 # Configure PIL to handle large images without warnings
 # This prevents DecompressionBombWarning for legitimate large images
 Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
@@ -63,48 +94,58 @@ IMG_END = "</img>"
 IMG_CONTEXT = "<image>"
 
 # Profiling
-MAX_FRAMES = 16
+# MAX_FRAMES = 16
+DEFAULT_NUM_TILES = 12
 
 
-class NanoNemotronVLImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values_flat: torch.Tensor
+class NanoNemotronVLImagePixelInputs(TensorSchema):
     """
-    Shape:
-    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
+    Dimensions:
+        - bn: Batch size * number of images
+        - bnp: Batch size * number of images * (1 + num_patches)
+        - c: Number of channels (3)
+        - h: Height of each image patch
+        - w: Width of each image patch
     """
 
-    num_patches: torch.Tensor
-    """Shape: `(batch_size * num_images)`"""
-
+    type: Literal["pixel_values"]
+    pixel_values_flat: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
+    num_patches: Annotated[torch.Tensor, TensorShape("bn")]
 
-class NanoNemotronVLImageEmbeddinInputs(TypedDict):
-    type: Literal["image_embeds"]
-    data: Union[torch.Tensor, list[torch.Tensor]]
-    """ 
-    A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
-    or a list of tensors of shape `(total_image_feature_size, hidden_size)`
 
-    `hidden_size` must match the hidden size of language model backbone.
+class NanoNemotronVLImageEmbeddingInputs(TensorSchema):
     """
+    Dimensions:
+        - n: Number of images
+        - f: Total image feature size
+        - h: Hidden size (must match the hidden size of language model backbone)
+    """
+
+    type: Literal["image_embeds"]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-NanoNemotronVLImageInputs = Union[NanoNemotronVLImagePixelInputs,
-                                  NanoNemotronVLImageEmbeddinInputs]
+NanoNemotronVLImageInputs: TypeAlias = (
+    NanoNemotronVLImagePixelInputs | NanoNemotronVLImageEmbeddingInputs
+)
 
 
 class NanoNemotronVLVideoPixelInputs(TensorSchema):
     """
     Dimensions:
         - bvf: Batch size * number of videos * num_frames
-        - bn: Batch size * number of images
+        - bn: Batch size * number of videos
+        - f: Number of frames
         - c: Number of channels (3)
         - h: Height of each video frame
         - w: Width of each video frame
     """
+
     type: Literal["pixel_values_videos"]
     pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
+    frames_indices: Annotated[torch.Tensor, TensorShape("bvf")]
+    frame_duration_ms: Annotated[torch.Tensor, TensorShape("bn")]
 
 
 class NanoNemotronVLVideoEmbeddingInputs(TensorSchema):
@@ -114,21 +155,19 @@ class NanoNemotronVLVideoEmbeddingInputs(TensorSchema):
         - f: Total video feature size
         - h: Hidden size (must match the hidden size of language model backbone)
     """
+
     type: Literal["video_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("n", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-NanoNemotronVLVideoInputs = Union[NanoNemotronVLVideoPixelInputs,
-                                  NanoNemotronVLVideoEmbeddingInputs]
+NanoNemotronVLVideoInputs: TypeAlias = (
+    NanoNemotronVLVideoPixelInputs | NanoNemotronVLVideoEmbeddingInputs
+)
 
 
-def dynamic_preprocess(image,
-                       *,
-                       image_size=512,
-                       max_num_tiles=12,
-                       use_thumbnail=True,
-                       idx=0):
+def dynamic_preprocess(
+    image, *, image_size=512, max_num_tiles=12, use_thumbnail=True, idx=0
+):
     orig_width, orig_height = image.size
 
     target_ratios = get_internvl_target_ratios(1, max_num_tiles)
@@ -138,7 +177,8 @@ def dynamic_preprocess(image,
         orig_height=orig_height,
         target_ratios=target_ratios,
         image_size=image_size,
-        use_thumbnail=False)
+        use_thumbnail=False,
+    )
     # resize the image
     resized_img = image.resize((target_width, target_height))
     processed_images = []
@@ -158,12 +198,12 @@ def dynamic_preprocess(image,
         processed_images.append(thumbnail_img)
 
     processed_images = [
-        img.convert("RGB") if img.mode != "RGB" else img
-        for img in processed_images
+        img.convert("RGB") if img.mode != "RGB" else img for img in processed_images
     ]
     processed_images = [
-        T.Resize((image_size, image_size),
-                 interpolation=T.InterpolationMode.BICUBIC)(img)
+        T.Resize((image_size, image_size), interpolation=T.InterpolationMode.BICUBIC)(
+            img
+        )
         for img in processed_images
     ]
     processed_images = [T.ToTensor()(img) for img in processed_images]
@@ -197,6 +237,8 @@ def video_to_pixel_values(
     max_num_tiles: int = 1,
     use_thumbnail: bool,
 ) -> torch.Tensor:
+    assert max_num_tiles == 1, "Video modality always uses one tile"
+
     # Convert each frame to a single resized tile tensor consistent
     # with image path
     frames_tensors: list[torch.Tensor] = []
@@ -210,11 +252,26 @@ def video_to_pixel_values(
         )
         # dynamic_preprocess returns tensors already; take the single tile
         assert len(pil_frame) >= 1
-        frames_tensors.append(pil_frame[0])
+        frames_tensors.append(pil_frame[-1])
 
     return torch.stack(frames_tensors)
 
 
+def input_conditioner(x, norm_mean, norm_std):
+    return (x - norm_mean) / norm_std
+
+
+def calculate_timestamps(
+    indices: list[int] | torch.Tensor,
+    frame_duration_ms: int,
+):
+    if not isinstance(indices, list):
+        indices = indices.tolist()
+
+    timestamps = [int(i) * frame_duration_ms / 1000.0 for i in indices]
+    return timestamps
+
+
 class BaseNanoNemotronVLProcessor(ABC):
     """
     This model doesn't define its own HF processor,
@@ -224,18 +281,26 @@ class BaseNanoNemotronVLProcessor(ABC):
     https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
     """
 
-    def __init__(self, config: PretrainedConfig, tokenizer: AnyTokenizer,
-                 *args, **kwargs) -> None:
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *args,
+        max_num_tiles: int | None = None,
+        **kwargs,
+    ) -> None:
         super().__init__()
 
         self.config = config
         self.tokenizer = tokenizer
 
+        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
         image_size: int = config.force_image_size
         patch_size: int = config.patch_size
 
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.image_size = image_size
         self.use_thumbnail: bool = config.use_thumbnail
         self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
@@ -250,7 +315,7 @@ class BaseNanoNemotronVLProcessor(ABC):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         raise NotImplementedError
 
@@ -285,7 +350,8 @@ class BaseNanoNemotronVLProcessor(ABC):
                 max_num=max_num_tiles,
                 use_thumbnail=self.use_thumbnail,
                 idx=idx,
-            ) for idx, image in enumerate(images)
+            )
+            for idx, image in enumerate(images)
         ]
 
     def _preprocess_image(
@@ -297,24 +363,35 @@ class BaseNanoNemotronVLProcessor(ABC):
         if len(images) == 0:
             image_inputs = {}
         else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images, max_num_tiles)
-            image_inputs: dict[str, NestedTensors] = {
-                "pixel_values_flat":
-                torch.cat(pixel_values_lst),
-                "image_num_patches":
-                torch.tensor([len(item) for item in pixel_values_lst]),
+            pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
+            image_inputs = {
+                "pixel_values_flat": input_conditioner(
+                    torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
+                ),
+                "image_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst]
+                ),
             }
 
-            for pixel_values in pixel_values_lst:
+            assert len(text) == 1, (
+                "hf_processor is called on the output of get_dummy_text, "
+                "which should be a single string"
+            )
+            parts = [x for x in re.split(r"(<image>)", text[0]) if x]
+            assert parts.count("<image>") == len(pixel_values_lst), (
+                "the number of <image> tokens in the text should be the "
+                "same as the number of images"
+            )
+
+            for i, pixel_values in enumerate(pixel_values_lst):
                 num_patches = pixel_values.shape[0]
                 feature_size = num_patches * self.num_image_token
                 image_repl = self.get_image_repl(feature_size, num_patches)
-                text = [t.replace('<image>', image_repl.full, 1) for t in text]
+                parts[i] = parts[i].replace("<image>", image_repl.full)
+            text = ["".join(parts)]
         return text, image_inputs
 
-    def _make_batch_input(self,
-                          input_item: Optional[Union[Any, list[Any]]] = None):
+    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
         if input_item is None:
             input_item = []
         if not isinstance(input_item, list):
@@ -323,14 +400,14 @@ class BaseNanoNemotronVLProcessor(ABC):
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        max_num_tiles: Optional[int] = None,
-    ) -> Mapping[str, NestedTensors]:
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+        max_num_tiles: int | None = None,
+    ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles
 
         text, images = [self._make_batch_input(x) for x in (text, images)]
 
@@ -342,10 +419,9 @@ class BaseNanoNemotronVLProcessor(ABC):
 
         text_inputs = self.tokenizer(text, add_special_tokens=False)
 
-        return {
-            **BatchEncoding(text_inputs, tensor_type=return_tensors),
-            **image_inputs,
-        }
+        combined_outputs = {**text_inputs, **image_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
 
 
 class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
@@ -360,27 +436,43 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        video_token: Optional[str] = None,
+        max_num_tiles: int | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        video_token: str | None = None,
+        video_pruning_rate: float | None = None,
     ) -> None:
         super().__init__(
             config=config,
             tokenizer=tokenizer,
+            max_num_tiles=max_num_tiles,
             min_dynamic_patch=min_dynamic_patch,
             max_dynamic_patch=max_dynamic_patch,
             dynamic_image_size=dynamic_image_size,
         )
         # add extra video token for video processing
         self.video_token = video_token
+        self.video_pruning_rate = video_pruning_rate
+
+        # Pre-tokenize special tokens for video processing
+        # to avoid repeated tokenization
+        self._img_start_token_ids = encode_tokens(
+            tokenizer, IMG_START, add_special_tokens=False
+        )
+        self._img_end_token_ids = encode_tokens(
+            tokenizer, IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = encode_tokens(
+            tokenizer, IMG_CONTEXT, add_special_tokens=False
+        )
 
     @property
     def supports_video(self) -> bool:
         return self.video_token_id is not None
 
     @property
-    def video_token_id(self) -> Optional[int]:
+    def video_token_id(self) -> int | None:
         if self.video_token is None:
             return None
         return self.tokenizer.get_vocab().get(self.video_token, None)
@@ -393,61 +485,125 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         self,
         videos: list[npt.NDArray],
         max_num_tiles: int,
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
-
         return [
             video_to_pixel_values(
                 video,
                 input_size=self.image_size,
                 max_num_tiles=max_num_tiles,
                 use_thumbnail=self.use_thumbnail,
-            ) for video in videos
+            )
+            for video in videos
         ]
 
     def _preprocess_video(
         self,
         text: list[str],
-        videos: list[npt.NDArray],
+        videos: list[tuple[npt.NDArray, dict[str, Any]]],
         max_num_tiles: int,
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ):
         if len(videos) == 0 or not self.supports_video:
             video_inputs = {}
         else:
+            videos_lst = [v[0] for v in videos]
+            video_metadata_lst = [v[1] for v in videos]
             pixel_values_lst_video = self._videos_to_pixel_values_lst(
-                videos,
+                videos_lst,
                 max_num_tiles=max_num_tiles,
                 dynamic_image_size=dynamic_image_size,
             )
 
-            video_inputs: dict[str, NestedTensors] = {
-                "pixel_values_flat_video":
-                torch.cat(pixel_values_lst_video),
-                "video_num_patches":
-                torch.tensor([len(item) for item in pixel_values_lst_video]),
+            # We use frame duration in milliseconds (as integer) to ensure
+            # we have consistent timestamps calculation. At preprocessing
+            # fps parameter is given in fp32, while at inference it is bf16
+            # which leads to inaccurate timestamp calculation and causes
+            # timestamp values to differ.In rare cases this causes
+            # mismatching number of output tokens for tokenized  frame prefixes
+            frame_duration_ms_lst = [
+                int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
+            ]
+            frames_indices_lst = [
+                metadata["frames_indices"] for metadata in video_metadata_lst
+            ]
+
+            video_inputs = {
+                "pixel_values_flat_video": input_conditioner(
+                    torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
+                ),
+                "video_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst_video]
+                ),
+                "frames_indices": frames_indices_lst,
+                "frame_duration_ms": torch.tensor(frame_duration_ms_lst),
             }
 
-            for pixel_values in pixel_values_lst_video:
-                num_patches = pixel_values.shape[0]
+            image_size: int = self.config.force_image_size
+            patch_size: int = self.config.patch_size
+            downsample_ratio = self.config.downsample_ratio
+            tokens_in_single_frame = int(
+                (image_size * image_size // patch_size**2) * (downsample_ratio**2)
+            )
 
-                video_repl = self.get_video_repl(self.num_image_token,
-                                                 num_patches, self.video_token)
-                text = [t.replace('<video>', video_repl.full, 1) for t in text]
+            for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
+                pixel_values_lst_video,
+                video_metadata_lst,
+                frames_indices_lst,
+                frame_duration_ms_lst,
+            ):
+                num_frames = pixel_values.shape[0]
+
+                if (
+                    self.video_pruning_rate is not None
+                    and self.video_pruning_rate > 0.0
+                ):
+                    # Start of EVS-specific code
+                    num_tokens = compute_retained_tokens_count(
+                        tokens_per_frame=tokens_in_single_frame,
+                        num_frames=num_frames,
+                        q=self.video_pruning_rate,
+                    )
+
+                    # Here we just need placeholders that won't actually be replaced -
+                    # we just need to make sure the total number of tokens is correct
+                    # assign all tokens to the first frame
+                    tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
+
+                    # End of EVS-specific code
+                else:
+                    tokens_per_frame = [tokens_in_single_frame] * num_frames
+
+                video_repl = self.get_video_repl(
+                    tokens_per_frame=tokens_per_frame,
+                    frames_indices=frames_indices,
+                    frame_duration_ms=frame_duration_ms,
+                    tokenizer=self.tokenizer,
+                    img_start_token_ids=self._img_start_token_ids,
+                    img_end_token_ids=self._img_end_token_ids,
+                    img_context_token_ids=self._img_context_token_ids,
+                )
+
+                # video_repl.full is a list of token IDs
+                # Convert token IDs back to text for the HF processor flow
+                video_repl_text = self.tokenizer.decode(
+                    video_repl.full, skip_special_tokens=False
+                )
+                text = [t.replace("<video>", video_repl_text, 1) for t in text]
         return text, video_inputs
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        max_num_tiles: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-    ) -> Mapping[str, NestedTensors]:
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        videos: list[tuple[npt.NDArray, dict[str, Any]]] | None = None,
+        return_tensors: str | TensorType | None = None,
+        max_num_tiles: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles
 
         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
@@ -462,42 +618,102 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         text, video_inputs = self._preprocess_video(
             text=text,
             videos=videos,
-            max_num_tiles=max_num_tiles,
+            max_num_tiles=1,
             dynamic_image_size=dynamic_image_size,
         )
 
         text_inputs = self.tokenizer(text, add_special_tokens=False)
 
-        return BatchFeature({
-            **BatchEncoding(text_inputs, tensor_type=return_tensors),
-            **image_inputs,
-            **video_inputs,
-        })
+        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
 
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
 
         return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
 
+    @classmethod
     def get_video_repl(
-        self,
-        feature_size: int,
-        num_patches: Optional[int] = None,
-        video_context_token: str = IMG_CONTEXT,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = video_context_token * self.num_image_token
-        repl_features_with_sep = IMG_START + repl_features + IMG_END
-        # num_patches is equal to num_frames
-        repl_full = ''.join([
-            f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches)
-        ])
+        cls,
+        *,
+        tokens_per_frame: list[int],
+        frames_indices: list[int],
+        frame_duration_ms: int,
+        tokenizer: AnyTokenizer,
+        img_start_token_ids: list[int],
+        img_end_token_ids: list[int],
+        img_context_token_ids: list[int],
+    ) -> PromptUpdateDetails[list[int]]:
+        """
+        Build prompt replacement for a video.
+        The replacement returned is not actually used to replace the placeholder
+        tokens - it's just used to make sure we allocate the correct number
+        of tokens.
+        Actual replacement is done in embed_multimodal of
+        NemotronH_Nano_VL_V2
+        (specifically in _process_video_input -> _create_final_video_embeddings).
+        There, we create the final embeddings with text embeddings for indicator tokens
+        and video embeddings for video tokens.
+        This is a single function that handles all cases - non EVS, EVS dummy, EVS real.
+        The differentiation is done via tokens_per_frame parameter.
+        - non EVS case - constant value same value across all frames
+        - EVS dummy - Doesn't matter how tokens are distributed between frames - just
+                        make sure the total number of tokens is correct.
+        - EVS real (called from get_real_video_repl_for_evs) - different value per frame
+        Args:
+            tokens_per_frame (list[int]): number of tokens per frame
+            frames_indices (list[int]): frame indices
+            frame_duration_ms (int): duration of each frame in milliseconds
+            tokenizer (AnyTokenizer): tokenizer to use for tokenizing frame separators
+            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
+            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
+            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
+        """
+        # TODO: Add support of frame_duration_ms to be None
+        # At preprocessing step we should allow absent / metadata without
+        # frames_indices field.
+        timestamps_enabled = frame_duration_ms is not None
+
+        if timestamps_enabled:
+            timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
+
+            assert len(timestamps) == len(tokens_per_frame), (
+                "timestamps and tokens_per_frame must have the same length"
+            )
+            frame_separators = [
+                f"Frame {i + 1} sampled at {timestamp:.2f} seconds: "
+                for i, timestamp in enumerate(timestamps)
+            ]
+        else:
+            frame_separators = [
+                f"Frame {i + 1}: " for i, _ in enumerate(tokens_per_frame)
+            ]
+
+        # Tokenize frame separator independently
+        frame_separators_tokenized = [
+            _seq2tokens(tokenizer, sep) for sep in frame_separators
+        ]
+
+        # Tokenize each component independently to avoid tokenizer merging tokens
+        # across boundaries. This ensures consistent tokenization regardless of
+        # num_tokens_per_frame values.
+        all_token_ids = []
+        for i, num_tokens in enumerate(tokens_per_frame):
+            frame_sep_token_ids = frame_separators_tokenized[i]
+            all_token_ids.extend(frame_sep_token_ids)
+
+            # Add pre-tokenized special tokens
+            all_token_ids.extend(img_start_token_ids)
+            all_token_ids.extend(img_context_token_ids * num_tokens)
+            all_token_ids.extend(img_end_token_ids)
 
-        return PromptUpdateDetails.select_text(repl_full, video_context_token)
+        return PromptUpdateDetails.from_seq(all_token_ids)
 
 
 class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
@@ -510,7 +726,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
     ) -> BaseNanoNemotronVLProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -519,7 +735,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         max_num_tiles: int,
-        processor: Optional[BaseNanoNemotronVLProcessor],
+        processor: BaseNanoNemotronVLProcessor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -530,8 +746,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
             max_num_tiles=max_num_tiles,
         )
 
-    def get_image_size_with_most_features(self,
-                                          max_num_tiles: int) -> ImageSize:
+    def get_image_size_with_most_features(self, max_num_tiles: int) -> ImageSize:
         processor = self.get_hf_processor()
 
         base_size = processor.image_size
@@ -549,8 +764,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
+                largest_feature_pinpoint = ImageSize(width=width, height=height)
 
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
@@ -560,9 +774,10 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
     def get_max_image_tokens(self) -> int:
         processor = self.get_hf_processor()
         # Use default max_num_tiles for max tokens calculation
-        max_num_tiles = 12
+        max_num_tiles = processor.max_num_tiles
         target_width, target_height = self.get_image_size_with_most_features(
-            max_num_tiles)
+            max_num_tiles
+        )
 
         return self.get_num_image_tokens(
             image_width=target_width,
@@ -576,7 +791,7 @@ _I = TypeVar("_I", bound=BaseNanoNemotronVLProcessingInfo)
 
 
 class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
-    """ ProcessingInfo extended for video processing"""
+    """ProcessingInfo extended for video processing"""
 
     @property
     def supports_video(self):
@@ -586,9 +801,12 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
         video_limit = {"video": None} if self.supports_video else {}
         return {**super().get_supported_mm_limits(), **video_limit}
 
-    def get_video_token(self) -> Optional[str]:
+    def get_video_token(self) -> str | None:
         return IMG_CONTEXT
 
+    def get_video_pruning_rate(self) -> float | None:
+        return self.ctx.get_mm_config().video_pruning_rate
+
     def get_num_frames_with_most_features(
         self,
         seq_len: int,
@@ -600,11 +818,8 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
         processor = self.get_hf_processor()  # we get the CustomProcessor here
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = (seq_len -
-                            max_image_tokens) // processor.num_image_token
+        max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
         max_frames_per_video = max_total_frames // max(max_videos, 1)
-
-        max_frames_per_video = min(max_frames_per_video, MAX_FRAMES)
         return max(max_frames_per_video, 1)
 
     def get_hf_processor(self, **kwargs: object) -> NanoNemotronVLProcessor:
@@ -613,6 +828,7 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
             config=self.get_hf_config(),
             tokenizer=self.get_tokenizer(),
             video_token=self.get_video_token(),
+            video_pruning_rate=self.get_video_pruning_rate(),
             **kwargs,
         )
 
@@ -620,44 +836,19 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
 class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
     """Basic image-only MultiModalProcessor for InternVL-style models."""
 
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> Mapping[str, NestedTensors]:
-        processed_outputs = super()._call_hf_processor(
-            prompt=prompt,
-            mm_data=mm_data,
-            mm_kwargs=mm_kwargs,
-            tok_kwargs=tok_kwargs,
-        )
-
-        hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        image_token_id = hf_processor.image_token_id
-
-        # Since there may be extra tokens in the feature placeholders,
-        # we need to pass the image token ID to the model to select the
-        # tokens to merge from the vision encoder outputs
-        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
-
-        return processed_outputs
-
     def _get_mm_fields_config(
         self,
-        hf_inputs: Mapping[str, NestedTensors],
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
-        num_images = len(image_num_patches)
 
         return dict(
             pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_num_patches),
+                "image", image_num_patches
+            ),
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
-            image_token_id=MultiModalFieldConfig.shared("image", num_images),
         )
 
     def _get_prompt_updates(
@@ -680,14 +871,17 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
         def get_replacement_custom(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
             else:
                 image_size = images.get_image_size(item_idx)
                 # Extract max_num_tiles from kwargs, default to 12
-                max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12)
+                max_num_tiles = hf_processor_mm_kwargs.get(
+                    "max_num_tiles", hf_processor.max_num_tiles
+                )
                 feature_size = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
@@ -699,9 +893,9 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
             local_image_num_patches = image_num_patches
             if isinstance(local_image_num_patches, torch.Tensor):
                 local_image_num_patches = local_image_num_patches.tolist()
-            if isinstance(
-                    local_image_num_patches,
-                (list, tuple)) and item_idx < len(local_image_num_patches):
+            if isinstance(local_image_num_patches, (list, tuple)) and item_idx < len(
+                local_image_num_patches
+            ):
                 num_patches = int(local_image_num_patches[item_idx])
 
             return hf_processor.get_image_repl(feature_size, num_patches)
@@ -716,42 +910,30 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
 
 class NanoNemotronVLMultiModalProcessor(
-        NanoNemotronBaseVLMultiModalProcessor[NanoNemotronVLProcessingInfo]):
+    NanoNemotronBaseVLMultiModalProcessor[NanoNemotronVLProcessingInfo]
+):
     """MultiModalProcessor extended for video support"""
 
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> Mapping[str, NestedTensors]:
-        processed_outputs = super()._call_hf_processor(prompt, mm_data,
-                                                       mm_kwargs, tok_kwargs)
-
-        hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        if self.info.supports_video and (
-                video_token_id := hf_processor.video_token_id) is not None:
-            processed_outputs["video_token_id"] = torch.tensor(video_token_id)
-        return processed_outputs
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(video_needs_metadata=True)
 
     def _get_mm_fields_config(
         self,
-        hf_inputs: Mapping[str, NestedTensors],
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        image_fields = super()._get_mm_fields_config(hf_inputs,
-                                                     hf_processor_mm_kwargs)
+        image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
         if self.info.supports_video:
-            video_num_patches = hf_inputs.get("video_num_patches",
-                                              torch.empty(0))
-            num_videos = len(video_num_patches)
+            video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
+
             video_fields = dict(
                 pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes(
-                    "video", video_num_patches),
+                    "video", video_num_patches
+                ),
                 video_num_patches=MultiModalFieldConfig.batched("video"),
-                video_token_id=MultiModalFieldConfig.shared(
-                    "video", num_videos))
+                frames_indices=MultiModalFieldConfig.batched("video"),
+                frame_duration_ms=MultiModalFieldConfig.batched("video"),
+            )
         else:
             video_fields = {}
 
@@ -781,14 +963,38 @@ class NanoNemotronVLMultiModalProcessor(
 
         def get_video_replacement_internvl(item_idx: int):
             feature_size = hf_processor.num_image_token
+            video, metadata = mm_items["video"][item_idx]
             num_patches = video_num_patches[item_idx]
             if num_patches is not None:
                 assert isinstance(num_patches, int)
 
+            video_pruning_rate = self.info.ctx.get_mm_config().video_pruning_rate
+            if video_pruning_rate is not None and video_pruning_rate > 0.0:
+                # Start of EVS-specific code
+                num_tokens = compute_retained_tokens_count(
+                    tokens_per_frame=feature_size,
+                    num_frames=num_patches,
+                    q=video_pruning_rate,
+                )
+                # Here we just need placeholders that won't actually be replaced -
+                # we just need to make sure the total number of tokens is correct
+                # assign all tokens to the first frame
+                tokens_per_frame = [num_tokens] + [0] * (num_patches - 1)
+
+                # End of EVS-specific code
+            else:
+                tokens_per_frame = [feature_size] * num_patches
+
+            frame_duration_ms = int(1000 / metadata["fps"])
             return hf_processor.get_video_repl(
-                feature_size,
-                num_patches,
-                video_context_token=hf_processor.video_token)
+                tokens_per_frame=tokens_per_frame,
+                frames_indices=metadata["frames_indices"],
+                frame_duration_ms=frame_duration_ms,
+                tokenizer=hf_processor.tokenizer,
+                img_start_token_ids=hf_processor._img_start_token_ids,
+                img_end_token_ids=hf_processor._img_end_token_ids,
+                img_context_token_ids=hf_processor._img_context_token_ids,
+            )
 
         if self.info.supports_video:
             prompt_repl = [
@@ -797,7 +1003,7 @@ class NanoNemotronVLMultiModalProcessor(
                     modality="video",
                     target="<video>",
                     replacement=get_video_replacement_internvl,
-                )
+                ),
             ]
 
         return prompt_repl
@@ -815,23 +1021,30 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         # Use default max_num_tiles for dummy data generation
         max_num_tiles = 12
-        target_width, target_height = (
-            self.info.get_image_size_with_most_features(max_num_tiles))
+        target_width, target_height = self.info.get_image_size_with_most_features(
+            max_num_tiles
+        )
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class NanoNemotronVLDummyInputsBuilder(
-        NanoNemotronVLDummyInputsBuilder[NanoNemotronVLProcessingInfo]):
+    NanoNemotronVLDummyInputsBuilder[NanoNemotronVLProcessingInfo]
+):
     """DummyInputsBuilder extended for video support"""
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -839,25 +1052,62 @@ class NanoNemotronVLDummyInputsBuilder(
 
         return super().get_dummy_text(mm_counts) + "<video>" * num_videos
 
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+        overrides: VideoDummyOptions | None = None,
+    ) -> list[VideoItem]:
+        video = super()._get_dummy_videos(
+            width=width,
+            height=height,
+            num_frames=num_frames,
+            num_videos=1,
+            overrides=overrides,
+        )[0]
+        video_items = []
+        for _ in range(num_videos):
+            video_metadata = {
+                "total_num_frames": num_frames,
+                "fps": 2,
+                "duration": num_frames / 2.0,
+                "video_backend": "opencv_dynamic",
+                "frames_indices": [i for i in range(num_frames)],
+                "do_sample_frames": False,
+            }
+            video_item = (video.copy(), video_metadata)
+            video_items.append(video_item)
+
+        return video_items
+
     def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        dummy_image = super().get_dummy_mm_data(seq_len=seq_len,
-                                                mm_counts=mm_counts)
+        dummy_image = super().get_dummy_mm_data(
+            seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
+        )
         if self.info.supports_video:
             config = self.info.get_hf_config()
             image_size: int = config.force_image_size
-            target_num_frames = \
-                self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+            target_num_frames = self.info.get_num_frames_with_most_features(
+                seq_len, mm_counts
+            )
             num_videos = mm_counts.get("video", 0)
+            video_overrides = mm_options.get("video") if mm_options else None
             dummy_video = {
-                "video":
-                self._get_dummy_videos(width=image_size,
-                                       height=image_size,
-                                       num_frames=target_num_frames,
-                                       num_videos=num_videos)
+                "video": self._get_dummy_videos(
+                    width=image_size,
+                    height=image_size,
+                    num_frames=target_num_frames,
+                    num_videos=num_videos,
+                    overrides=video_overrides,
+                )
             }
         else:
             dummy_video = {}
@@ -869,11 +1119,13 @@ class NanoNemotronVLDummyInputsBuilder(
     info=NanoNemotronVLProcessingInfo,
     dummy_inputs=NanoNemotronVLDummyInputsBuilder,
 )
-class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
-                           SupportsMultiModal):
+class NemotronH_Nano_VL_V2(
+    nn.Module, HasInnerState, IsHybrid, SupportsMultiModal, SupportsMultiModalPruning
+):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -883,24 +1135,26 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-
+        multimodal_config = vllm_config.model_config.multimodal_config
         image_size = config.force_image_size
         patch_size = config.patch_size
         self.patch_size = patch_size
         self.template = config.template
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.downsample_ratio = config.downsample_ratio
         self.ps_version = config.ps_version
         self.image_tag_type = config.image_tag_type
-
+        self.video_pruning_rate = multimodal_config.video_pruning_rate
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
             prefix=maybe_prefix(prefix, "language_model"),
         )
         self.vision_model = self.get_vit_model_from_radio_config(config).to(
-            self.language_model.config.torch_dtype)
+            self.language_model.config.dtype
+        )
 
         # Construct the vision projection.
         vit_hidden_size = config.vit_hidden_size
@@ -908,24 +1162,35 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         llm_hidden_size = config.text_config.hidden_size
 
         self.mlp1 = nn.Sequential(
-            RMSNorm(hidden_size=vit_hidden_size *
-                    int(1 / self.downsample_ratio)**2,
-                    eps=1e-5),
+            RMSNorm(
+                hidden_size=vit_hidden_size * int(1 / self.downsample_ratio) ** 2,
+                eps=1e-5,
+            ),
             nn.Linear(
-                vit_hidden_size * int(1 / self.downsample_ratio)**2,
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2,
                 vision_projection_hidden_size,
                 bias=False,
             ),
             ReLUSquaredActivation(),
-            nn.Linear(vision_projection_hidden_size,
-                      llm_hidden_size,
-                      bias=False),
+            nn.Linear(vision_projection_hidden_size, llm_hidden_size, bias=False),
         )
-        self.mlp1 = self.mlp1.to(self.language_model.config.torch_dtype)
+        self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
 
-        self.img_context_token_id = None
-        self.video_context_token_id = None
         self.config = config
+        self.model_config = vllm_config.model_config
+
+        # Pre-tokenize special tokens for video processing
+        # to avoid repeated tokenization
+        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        self._img_start_token_ids = encode_tokens(
+            tokenizer, IMG_START, add_special_tokens=False
+        )
+        self._img_end_token_ids = encode_tokens(
+            tokenizer, IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = encode_tokens(
+            tokenizer, IMG_CONTEXT, add_special_tokens=False
+        )
 
     def pixel_shuffle(self, x, scale_factor=0.5):
         n, w, h, c = x.size()
@@ -957,19 +1222,33 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         return x
 
     def extract_feature(self, pixel_values):
-        vit_embeds = self.vision_model(pixel_values)
-        vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
-        h = w = int(vit_embeds.shape[1]**0.5)
-        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds,
-                                        scale_factor=self.downsample_ratio)
-        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
-                                        vit_embeds.shape[-1])
-        vit_embeds = self.mlp1(vit_embeds)
+        # Process images in a micro-batch of at most 128 frames per call
+        # This is done on purpose to ensure peak GPU ram usage of huge batch
+        # (namely for really long videos with EVS ON) won't cause any problems
+        # as we don't support chunked prefill for video media
+        micro_batch_size = 128
+        n = pixel_values.shape[0]
+        vit_embeds_list = []
+        for i in range(0, n, micro_batch_size):
+            vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
+            vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
+            h = w = int(vit_embeds.shape[1] ** 0.5)
+            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+            vit_embeds = self.pixel_shuffle(
+                vit_embeds, scale_factor=self.downsample_ratio
+            )
+            vit_embeds = vit_embeds.reshape(
+                vit_embeds.shape[0], -1, vit_embeds.shape[-1]
+            )
+            vit_embeds = self.mlp1(vit_embeds)
+            vit_embeds_list.append(vit_embeds)
+
+        vit_embeds = torch.cat(vit_embeds_list, dim=0)
         return vit_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[NanoNemotronVLImageInputs]:
+        self, **kwargs: object
+    ) -> NanoNemotronVLImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -978,31 +1257,12 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
             return None
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
-            return NanoNemotronVLImageEmbeddinInputs(
+            return NanoNemotronVLImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
-        image_token_id = kwargs["image_token_id"]
-        assert isinstance(image_token_id, torch.Tensor)
-        self.img_context_token_id = image_token_id.flatten().unique().item()
-
         if pixel_values_flat is not None:
-            if not isinstance(pixel_values_flat, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat)}")
-
-            if not isinstance(image_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(image_num_patches)}")
-
-            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
-            image_num_patches = flatten_bn(image_num_patches, concat=True)
-
             return NanoNemotronVLImagePixelInputs(
                 type="pixel_values",
                 pixel_values_flat=pixel_values_flat,
@@ -1012,7 +1272,8 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         raise AssertionError("This line should be unreachable.")
 
     def _process_image_input(
-            self, image_input: NanoNemotronVLImageInputs) -> torch.Tensor:
+        self, image_input: NanoNemotronVLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -1023,25 +1284,140 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return (image_embeds.view(-1,
-                                      self.config.text_config.hidden_size), )
+            return (image_embeds.view(-1, self.config.text_config.hidden_size),)
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
         feature_size = image_embeds.shape[1]
-        image_embeds = image_embeds.view(-1,
-                                         self.config.text_config.hidden_size)
+        image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
         image_feature_sizes = [
             num_patches * feature_size for num_patches in num_patches
         ]
         return image_embeds.split(image_feature_sizes)
 
+    def _process_video_input(
+        self, video_input: NanoNemotronVLVideoPixelInputs
+    ) -> tuple[torch.Tensor, ...]:
+        """Process video input and create final embeddings with video content
+        and indicator tokens."""
+        # Get video embeddings using the same processing as images
+        video_embeddings = self._process_image_input(video_input)
+
+        final_video_embeddings: tuple[torch.Tensor, ...] = ()
+
+        image_rows = image_cols = self.config.force_image_size
+        downsample_ratio = self.config.downsample_ratio
+        patch_size = self.config.patch_size
+        rows = int(image_rows * downsample_ratio // patch_size)
+        cols = int(image_cols * downsample_ratio // patch_size)
+        video_pruning_rate = self.video_pruning_rate
+        video_num_frames = video_input["num_patches"].tolist()
+        video_frames_indices = video_input["frames_indices"].split(video_num_frames)
+        # Calculate video feature dimensions (number of frames and
+        # their feature size (AKA tokens per frame))
+        # TODO: Maybe this can be optimized to avoid the loop?
+        for i, single_video_embeddings in enumerate(video_embeddings):
+            num_frames = video_num_frames[i]
+            frames_indices = video_frames_indices[i].tolist()
+            frame_duration_ms = video_input["frame_duration_ms"][i].item()
+            assert single_video_embeddings.shape[0] % num_frames == 0
+
+            if video_pruning_rate is not None and video_pruning_rate > 0.0:
+                # Start of EVS-specific code
+                retention_mask = compute_retention_mask(
+                    single_video_embeddings,
+                    video_size_thw=(num_frames, rows, cols),
+                    spatial_merge_size=1,
+                    q=video_pruning_rate,
+                )
+
+                # apply retention mask
+                single_video_embeddings = single_video_embeddings[retention_mask]
+
+                # calculate the actual number of retained tokens per frame
+                retention_mask_thw = retention_mask.reshape(num_frames, rows, cols)
+                num_tokens_per_frame = (
+                    retention_mask_thw.sum(dim=(1, 2)).long().tolist()
+                )
+                # End of EVS-specific code
+            else:
+                feature_size = single_video_embeddings.shape[0] // num_frames
+                num_tokens_per_frame = [feature_size] * num_frames
+
+            final_video_embeddings += (
+                self._create_final_video_embeddings(
+                    single_video_embeddings,
+                    num_tokens_per_frame,
+                    frames_indices,
+                    frame_duration_ms,
+                ),
+            )
+
+        return final_video_embeddings
+
+    def _create_final_video_embeddings(
+        self,
+        video_embeddings: torch.Tensor,
+        num_tokens_per_frame: list[int],
+        frames_indices: list[int],
+        frame_duration_ms: int,
+    ) -> torch.Tensor:
+        """Create final embeddings that combine video embeddings with
+        text embeddings of indicator tokens.
+
+        These final embeddings contain:
+        - Actual video embeddings in positions corresponding to video content
+        - Text embeddings for indicator tokens (<img>, </img>, and
+          frame separation text) in their respective positions
+
+        These embeddings will replace the placeholder embeddings to create
+        input_embeds for the LLM.
+        """
+        device = video_embeddings.device
+        tokenizer = cached_tokenizer_from_config(self.model_config)
+
+        # Generate video replacement token IDs using get_video_repl
+        # This tokenizes each frame separator independently, then uses pre-tokenized
+        # special tokens to ensure consistent tokenization regardless of
+        # num_tokens_per_frame values.
+        video_repl = NanoNemotronVLProcessor.get_video_repl(
+            tokens_per_frame=num_tokens_per_frame,
+            frames_indices=frames_indices,
+            frame_duration_ms=frame_duration_ms,
+            tokenizer=tokenizer,
+            img_start_token_ids=self._img_start_token_ids,
+            img_end_token_ids=self._img_end_token_ids,
+            img_context_token_ids=self._img_context_token_ids,
+        )
+
+        # video_repl.full is a list of token IDs
+        repl_token_ids = torch.tensor(video_repl.full, device=device)
+
+        # Get embedding token IDs for image context (use pre-tokenized version)
+        embed_token_ids = torch.tensor(self._img_context_token_ids, device=device)
+
+        # Create mask for video embedding positions
+        is_video_embed = torch.isin(repl_token_ids, embed_token_ids)
+
+        # Create final video embeddings, merging text embeddings for indicator
+        # tokens with video embeddings
+        text_embeddings = self.get_language_model().embed_input_ids(repl_token_ids)
+        final_video_embeddings = _merge_multimodal_embeddings(
+            inputs_embeds=text_embeddings,
+            multimodal_embeddings=video_embeddings,
+            is_multimodal=is_video_embed,
+        )
+
+        return final_video_embeddings
+
     def _parse_and_validate_video_input(
-            self,
-            **kwargs: object) -> Optional[NanoNemotronVLVideoPixelInputs]:
+        self, **kwargs: object
+    ) -> NanoNemotronVLVideoPixelInputs | None:
         pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("video_embeds", None)
+        frames_indices = kwargs.pop("frames_indices", None)
+        frame_duration_ms = kwargs.pop("frame_duration_ms", None)
 
         if pixel_values_flat_video is None and video_embeds is None:
             return None
@@ -1049,32 +1425,26 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         if video_embeds is not None:
             return NanoNemotronVLVideoEmbeddingInputs(
                 type="video_embeds",
-                data=flatten_bn(video_embeds),
+                data=video_embeds,
             )
 
-        video_token_id = kwargs["video_token_id"]
-        assert isinstance(video_token_id, torch.Tensor)
-        self.video_context_token_id = video_token_id.flatten().unique().item()
-
         if pixel_values_flat_video is not None:
-            if not isinstance(pixel_values_flat_video, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat_video)}")
-
-            if not isinstance(video_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(video_num_patches)}")
+            if torch.is_tensor(frames_indices):
+                frames_indices = frames_indices.flatten()
+            else:
+                frames_indices = torch.cat([f.flatten() for f in frames_indices], dim=0)
 
-            pixel_values_flat_video = flatten_bn(pixel_values_flat_video,
-                                                 concat=True)
-            video_num_patches = flatten_bn(video_num_patches, concat=True)
+            frame_duration_ms = frame_duration_ms.flatten()
             expected_h = expected_w = self.config.force_image_size
-            resolve_bindings = {"h": expected_h, "w": expected_w}
+            num_frames = video_num_patches[0].item()
+            resolve_bindings = {"h": expected_h, "w": expected_w, "f": num_frames}
 
             return NanoNemotronVLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_flat=pixel_values_flat_video,
                 num_patches=video_num_patches,
+                frames_indices=frames_indices,
+                frame_duration_ms=frame_duration_ms,
                 resolve_bindings=resolve_bindings,
             )
 
@@ -1085,19 +1455,17 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values_flat",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("pixel_values_flat_video",
-                             ) and "videos" not in modalities:
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values_flat", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if input_key in ("pixel_values_flat_video",) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         # Validate the multimodal input keyword arguments
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if modalities is None:
@@ -1112,39 +1480,15 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
-                video_embeddings = self._process_image_input(video_input)
-                multimodal_embeddings += video_embeddings
+                video_embeddings = self._process_video_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-        if (multimodal_embeddings is not None
-                and len(multimodal_embeddings) != 0):
-            context_token_ids = [
-                token_id for token_id in (self.img_context_token_id,
-                                          self.video_context_token_id)
-                if token_id is not None
-            ]
-            assert len(context_token_ids) >= 1
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                context_token_ids,
-            )
-
-        return inputs_embeds
-
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
@@ -1152,22 +1496,14 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         hidden_states = self.language_model(
             input_ids=input_ids,
             positions=positions,
@@ -1191,7 +1527,7 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
@@ -1222,16 +1558,13 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
                     default_weight_loader(param, w)
             elif is_vision_weights(name):
                 # Convert: vision_model.radio_model.* → radio_model.*
-                hf_key = name[len(
-                    "vision_model."):]  # Remove "vision_model." prefix
+                hf_key = name[len("vision_model.") :]  # Remove "vision_model." prefix
                 vision_weights.append((hf_key, w))
 
         self.language_model.load_weights(llm_weights)
         self.vision_model.load_weights(vision_weights)
 
-    def print_architecture(self,
-                           detailed: bool = True,
-                           save_to_file: str = None):
+    def print_architecture(self, detailed: bool = True, save_to_file: str = None):
         """
         Print model architecture with parameter names, shapes, and sizes.
 
@@ -1267,20 +1600,26 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
                 # Group parameters by main component
                 if name.startswith("language_model"):
                     param_groups["language_model"].append(
-                        (name, param.shape, param_size, param.dtype))
+                        (name, param.shape, param_size, param.dtype)
+                    )
                 elif name.startswith("vision_model"):
                     param_groups["vision_model"].append(
-                        (name, param.shape, param_size, param.dtype))
+                        (name, param.shape, param_size, param.dtype)
+                    )
                 elif name.startswith("mlp1"):
                     param_groups["mlp1"].append(
-                        (name, param.shape, param_size, param.dtype))
+                        (name, param.shape, param_size, param.dtype)
+                    )
                 else:
                     param_groups["other"].append(
-                        (name, param.shape, param_size, param.dtype))
+                        (name, param.shape, param_size, param.dtype)
+                    )
 
                 if detailed:
-                    print(f"{name:<70} | Shape: {str(param.shape):<25} | "
-                          f"Size: {param_size:>12,} | Dtype: {param.dtype}")
+                    print(
+                        f"{name:<70} | Shape: {str(param.shape):<25} | "
+                        f"Size: {param_size:>12,} | Dtype: {param.dtype}"
+                    )
 
             print("=" * 100)
             print("Summary by Component:")
@@ -1289,11 +1628,16 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
             for component, params in param_groups.items():
                 if params:  # Only show components that have parameters
                     component_total = sum(size for _, _, size, _ in params)
-                    percentage = ((component_total / total_params) *
-                                  100 if total_params > 0 else 0)
-                    print(f"{component:<20} | Parameters: {len(params):>4} | "
-                          f"Total Size: {component_total:>15,} | "
-                          f"{percentage:>6.2f}%")
+                    percentage = (
+                        (component_total / total_params) * 100
+                        if total_params > 0
+                        else 0
+                    )
+                    print(
+                        f"{component:<20} | Parameters: {len(params):>4} | "
+                        f"Total Size: {component_total:>15,} | "
+                        f"{percentage:>6.2f}%"
+                    )
 
             print("-" * 60)
             print(f"{'Total Parameters':<20} | {total_params:>15,}")
@@ -1349,10 +1693,9 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
         hf_config_vision = hf_config.vision_config
         model_name = hf_config_vision.args.get("model")
         if model_name is None:
-            raise ValueError(f'Unsupported vit model type: {model_name}')
+            raise ValueError(f"Unsupported vit model type: {model_name}")
 
-        preferred_resolution = getattr(hf_config_vision,
-                                       "preferred_resolution", None)
+        preferred_resolution = getattr(hf_config_vision, "preferred_resolution", None)
         image_size = preferred_resolution[0] if preferred_resolution else 224
         patch_size = getattr(hf_config_vision, "patch_size", 16)
 
@@ -1362,33 +1705,36 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid,
             patch_size=patch_size,
             norm_mean=hf_config.norm_mean,
             norm_std=hf_config.norm_std,
-            reg_tokens=(hf_config_vision.args.get("register_multiple")
-                        if hasattr(hf_config_vision, "args")
-                        and isinstance(hf_config_vision.args, dict) else None),
+            reg_tokens=(
+                hf_config_vision.args.get("register_multiple")
+                if hasattr(hf_config_vision, "args")
+                and isinstance(hf_config_vision.args, dict)
+                else None
+            ),
         )
 
         return RadioModel(config=radio_config)
 
     def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
         return self.language_model.mamba_cache.copy_inputs_before_cuda_graphs(
-            input_buffers, **kwargs)
+            input_buffers, **kwargs
+        )
 
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
-        return (self.language_model.mamba_cache.
-                get_seqlen_agnostic_capture_inputs(batch_size))
+        return self.language_model.mamba_cache.get_seqlen_agnostic_capture_inputs(
+            batch_size
+        )
 
     @classmethod
     def get_mamba_state_shape_from_config(cls, vllm_config: "VllmConfig"):
         text_config = vllm_config.model_config.hf_config.text_config
         temp_vllm_config = copy.deepcopy(vllm_config)
         temp_vllm_config.model_config.hf_config = text_config
-        return NemotronHForCausalLM.get_mamba_state_shape_from_config(
-            temp_vllm_config)
+        return NemotronHForCausalLM.get_mamba_state_shape_from_config(temp_vllm_config)
 
     @classmethod
     def get_mamba_state_dtype_from_config(cls, vllm_config: "VllmConfig"):
         text_config = vllm_config.model_config.hf_config.text_config
         temp_vllm_config = copy.deepcopy(vllm_config)
         temp_vllm_config.model_config.hf_config = text_config
-        return NemotronHForCausalLM.get_mamba_state_dtype_from_config(
-            temp_vllm_config)
+        return NemotronHForCausalLM.get_mamba_state_dtype_from_config(temp_vllm_config)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 6bb2f7392cb490eba17f3ab2213304f68bde9375..92dcf5ea5700807163e592ef386bd39ff14788d5 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -23,9 +23,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Nemotron model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -35,23 +36,34 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import NemotronConfig
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 # The architecture is pretty similar to Llama, with these changes:
 # - There is no gate_proj, just up_proj
@@ -65,58 +77,63 @@ def _cast_if_autocast_enabled(*args):
         return args
     else:
         return torch.amp.autocast_mode._cast(
-            args, device_type="cuda", dtype=torch.get_autocast_gpu_dtype())
+            args, device_type="cuda", dtype=torch.get_autocast_gpu_dtype()
+        )
 
 
 class NemotronLayerNorm1P(nn.LayerNorm):
-
-    def __init__(self,
-                 normalized_shape: Union[int, list[int], torch.Size],
-                 eps: float = 1e-5,
-                 elementwise_affine: bool = True,
-                 bias: bool = True,
-                 device=None,
-                 dtype=None):
-        super().__init__(normalized_shape, eps, elementwise_affine, bias,
-                         device, dtype)
+    def __init__(
+        self,
+        normalized_shape: int | list[int] | torch.Size,
+        eps: float = 1e-5,
+        elementwise_affine: bool = True,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(normalized_shape, eps, elementwise_affine, bias, device, dtype)
 
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
+        residual: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if residual is not None:
             x = x + residual
             residual = x
-        args = _cast_if_autocast_enabled(x, self.normalized_shape,
-                                         self.weight + 1, self.bias, self.eps)
+        args = _cast_if_autocast_enabled(
+            x, self.normalized_shape, self.weight + 1, self.bias, self.eps
+        )
         with torch.amp.autocast("cuda", enabled=False):
             x = torch.nn.functional.layer_norm(*args)
             return x if residual is None else (x, residual)
 
 
 class NemotronMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
         super().__init__()
-        self.up_proj = ColumnParallelLinear(input_size=hidden_size,
-                                            output_size=intermediate_size,
-                                            bias=bias,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.up_proj")
-        self.down_proj = RowParallelLinear(input_size=intermediate_size,
-                                           output_size=hidden_size,
-                                           bias=bias,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+        self.up_proj = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         self.act_fn = get_act_fn(hidden_act)
 
     def forward(self, x):
@@ -127,7 +144,6 @@ class NemotronMLP(nn.Module):
 
 
 class NemotronAttention(nn.Module):
-
     def __init__(
         self,
         config: NemotronConfig,
@@ -135,11 +151,11 @@ class NemotronAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -194,13 +210,15 @@ class NemotronAttention(nn.Module):
             rope_scaling=rope_scaling,
             partial_rotary_factor=self.partial_rotary_factor,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -216,12 +234,11 @@ class NemotronAttention(nn.Module):
 
 
 class NemotronDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: NemotronConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -229,21 +246,24 @@ class NemotronDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         self.self_attn = NemotronAttention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -260,93 +280,92 @@ class NemotronDecoderLayer(nn.Module):
             bias=getattr(config, "mlp_bias", False),
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = NemotronLayerNorm1P(config.hidden_size,
-                                                   eps=config.norm_eps)
+        self.input_layernorm = NemotronLayerNorm1P(
+            config.hidden_size, eps=config.norm_eps
+        )
         self.post_attention_layernorm = NemotronLayerNorm1P(
-            config.hidden_size, eps=config.norm_eps)
+            config.hidden_size, eps=config.norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class NemotronModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
             )
         else:
             self.embed_tokens = PPMissingLayer()
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: NemotronDecoderLayer(config=config,
-                                                cache_config=cache_config,
-                                                quant_config=quant_config,
-                                                prefix=prefix),
-            prefix=f"{prefix}.layers")
+            lambda prefix: NemotronDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
         if get_pp_group().is_last_rank:
-            self.norm = NemotronLayerNorm1P(config.hidden_size,
-                                            eps=config.norm_eps)
+            self.norm = NemotronLayerNorm1P(config.hidden_size, eps=config.norm_eps)
         else:
             self.norm = PPMissingLayer()
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -357,16 +376,14 @@ class NemotronModel(nn.Module):
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -376,18 +393,19 @@ class NemotronModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -416,8 +434,7 @@ class NemotronModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -443,27 +460,20 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         assert isinstance(config, NemotronConfig)
 
         self.config = config
-        self.lora_config = lora_config
+
         self.quant_config = quant_config
 
-        self.model = NemotronModel(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
+        self.model = NemotronModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -471,37 +481,38 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 self.lm_head.weight = self.model.embed_tokens.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 987920ecc3310efcf19ec386aa63501df63909c6..f7e0caf410e10da5ec7562afcbf0215645ea46ed 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -17,8 +17,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only NemotronH model."""
-from collections.abc import Iterable
-from typing import Optional
+
+import typing
+from collections.abc import Callable, Iterable
+from itertools import islice
 
 import torch
 from torch import nn
@@ -26,59 +28,76 @@ from torch import nn
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.config.parallel import ParallelConfig
+from vllm.distributed import get_ep_group, get_tensor_model_parallel_world_size
+from vllm.distributed.communication_op import tensor_model_parallel_all_gather
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE
+from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                   SupportsLoRA, SupportsPP,
-                                                   SupportsQuant)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.interfaces import (
+    HasInnerState,
+    IsHybrid,
+    MixtureOfExperts,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+    SupportsQuant,
+)
 from vllm.model_executor.models.utils import (
-    AutoWeightsLoader, WeightsMapper, make_empty_intermediate_tensors_factory,
-    make_layers, maybe_prefix)
+    AutoWeightsLoader,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+    sequence_parallel_chunk,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import NemotronHConfig
 
 
 class NemotronHMLP(nn.Module):
-
     def __init__(
         self,
         config: NemotronHConfig,
-        layer_idx: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        intermediate_size: int,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        reduce_results: bool = True,
+        is_sequence_parallel: bool = False,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
-        hybrid_override_pattern = config.hybrid_override_pattern
-        mlp_index = hybrid_override_pattern[:layer_idx + 1].count("-") - 1
-        if isinstance(config.intermediate_size, list):
-            if len(config.intermediate_size) == 1:
-                intermediate_size = config.intermediate_size[0]
-            else:
-                intermediate_size = config.intermediate_size[mlp_index]
-        else:
-            intermediate_size = config.intermediate_size
-
         self.up_proj = ColumnParallelLinear(
             input_size=config.hidden_size,
             output_size=intermediate_size,
             bias=bias,
             quant_config=quant_config,
+            disable_tp=is_sequence_parallel,
             prefix=f"{prefix}.up_proj",
         )
         self.down_proj = RowParallelLinear(
@@ -86,6 +105,8 @@ class NemotronHMLP(nn.Module):
             output_size=config.hidden_size,
             bias=bias,
             quant_config=quant_config,
+            reduce_results=reduce_results,
+            disable_tp=is_sequence_parallel,
             prefix=f"{prefix}.down_proj",
         )
         self.act_fn = ReLUSquaredActivation()
@@ -97,34 +118,168 @@ class NemotronHMLP(nn.Module):
         return x
 
 
-class NemotronHMLPDecoderLayer(nn.Module):
+class NemotronHMoE(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.n_routed_experts
+        self.n_shared_experts: int = config.n_shared_experts
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.gate.e_score_correction_bias = nn.Parameter(
+            torch.empty(config.n_routed_experts, dtype=torch.float32)
+        )
+        # Load balancing settings.
+        self.enable_eplb = parallel_config.enable_eplb
+
+        self.n_redundant_experts = parallel_config.eplb_config.num_redundant_experts  # noqa: E501
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        if config.n_shared_experts is None or config.n_shared_experts == 0:
+            self.shared_experts = None
+        else:
+            intermediate_size = (
+                config.moe_shared_expert_intermediate_size * config.n_shared_experts
+            )
+
+            self.shared_experts = NemotronHMLP(
+                config=config,
+                intermediate_size=intermediate_size,
+                quant_config=quant_config,
+                reduce_results=False,
+                is_sequence_parallel=self.is_sequence_parallel,
+                prefix=f"{prefix}.shared_experts",
+            )
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            activation=activation_without_mul(config.mlp_hidden_act),
+            is_act_and_mul=False,  # non-gated MoE
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+
+        fused_moe_out = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        shared_output, final_hidden_states = fused_moe_out
+
+        # Fix FP16 overflow
+        # See DeepseekV2DecoderLayer for more details.
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= 1.0 / self.routed_scaling_factor
+
+        if self.shared_experts is not None:
+            assert shared_output is not None
+            final_hidden_states += shared_output
 
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class NemotronHMLPDecoderLayer(nn.Module):
     def __init__(
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
 
+        hybrid_override_pattern = config.hybrid_override_pattern
+        mlp_index = hybrid_override_pattern[: layer_idx + 1].count("-") - 1
+        if isinstance(config.intermediate_size, list):
+            if len(config.intermediate_size) == 1:
+                intermediate_size = config.intermediate_size[0]
+            else:
+                intermediate_size = config.intermediate_size[mlp_index]
+        else:
+            intermediate_size = config.intermediate_size
+
         self.mixer = NemotronHMLP(
             config,
+            intermediate_size=intermediate_size,
             quant_config=quant_config,
             bias=config.mlp_bias,
             prefix=f"{prefix}.mixer",
-            layer_idx=layer_idx,
         )
 
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -137,15 +292,54 @@ class NemotronHMLPDecoderLayer(nn.Module):
         return hidden_states, residual
 
 
-class NemotronHMambaDecoderLayer(nn.Module):
+class NemotronHMoEDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.mixer = NemotronHMoE(
+            config,
+            quant_config=quant_config,
+            parallel_config=parallel_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states)
+        return hidden_states, residual
+
 
+class NemotronHMambaDecoderLayer(nn.Module):
     def __init__(
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -160,7 +354,7 @@ class NemotronHMambaDecoderLayer(nn.Module):
             n_groups=config.n_groups,
             num_heads=config.mamba_num_heads,
             head_dim=config.mamba_head_dim,
-            rms_norm_eps=config.rms_norm_eps,
+            rms_norm_eps=config.layer_norm_epsilon,
             activation=config.mamba_hidden_act,
             model_config=model_config,
             cache_config=cache_config,
@@ -168,12 +362,12 @@ class NemotronHMambaDecoderLayer(nn.Module):
             prefix=f"{prefix}.mixer",
         )
 
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -188,14 +382,13 @@ class NemotronHMambaDecoderLayer(nn.Module):
 
 
 class NemotronHAttention(nn.Module):
-
     def __init__(
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -261,14 +454,14 @@ class NemotronHAttention(nn.Module):
 
 
 class NemotronHAttentionDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -282,13 +475,13 @@ class NemotronHAttentionDecoderLayer(nn.Module):
             prefix=f"{prefix}.mixer",
         )
 
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -305,12 +498,12 @@ ALL_DECODER_LAYER_TYPES = {
     "M": NemotronHMambaDecoderLayer,
     "-": NemotronHMLPDecoderLayer,
     "*": NemotronHAttentionDecoderLayer,
+    "E": NemotronHMoEDecoderLayer,
 }
 
 
 @support_torch_compile
 class NemotronHModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -318,66 +511,65 @@ class NemotronHModel(nn.Module):
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+        parallel_config = vllm_config.parallel_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
+        self.has_moe = "E" in config.hybrid_override_pattern
+
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
             layer_class = ALL_DECODER_LAYER_TYPES[
-                config.hybrid_override_pattern[layer_idx]]
+                config.hybrid_override_pattern[layer_idx]
+            ]
             return layer_class(
-                config,
-                layer_idx,
-                model_config,
-                cache_config,
+                config=config,
+                layer_idx=layer_idx,
+                model_config=model_config,
+                cache_config=cache_config,
                 quant_config=quant_config,
+                parallel_config=parallel_config,
                 prefix=prefix,
             )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            len(config.hybrid_override_pattern),
-            get_layer,
-            prefix=f"{prefix}.layers")
-        self.make_empty_intmd_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states", "residual"], config.hidden_size)
+            len(config.hybrid_override_pattern), get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-        self.norm_f = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        residual = None
-        for i, layer in enumerate(self.layers):
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(
                 positions=positions,
                 hidden_states=hidden_states,
@@ -385,15 +577,13 @@ class NemotronHModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm_f(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -401,6 +591,22 @@ class NemotronHModel(nn.Module):
             ("qkv_proj", "v_proj", "v"),
         ]
 
+        if self.has_moe:
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                # - FusedMoe.w1 (aka gate_proj) should be up_proj since that's
+                #   what the activation is applied to
+                # - FusedMoe.w3 (aka up_proj) should be ignored since we're
+                #   using non-gated MoE
+                ckpt_gate_proj_name="up_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="",
+                num_experts=self.config.n_routed_experts,
+                num_redundant_experts=getattr(self, "num_redundant_experts", 0),
+            )
+        else:
+            expert_params_mapping = []
+
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -419,6 +625,9 @@ class NemotronHModel(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
 
+                if is_pp_missing_parameter(name, self):
+                    continue
+
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -426,23 +635,70 @@ class NemotronHModel(nn.Module):
 
             # load other params
             else:
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+
+                    # Do not modify `name` since the loop may continue here
+                    # Instead, create a new variable
+                    name_mapped = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name_mapped, self):
+                        continue
+                    param = params_dict[name_mapped]
+                    # We should ask the weight loader to return success or not
+                    # here since otherwise we may skip experts with other
+                    # available replicas.
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        name = name_mapped
+                        break
+                else:
+                    if is_expert_weight:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
 
             loaded_params.add(name)
         return loaded_params
 
 
-class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                           IsHybrid, SupportsQuant):
+class NemotronHForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsQuant,
+    MixtureOfExperts,
+    SupportsMambaPrefixCaching,
+):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={"backbone": "model"},
-        orig_to_new_substr={
-            "A_log": "A",
-            "embeddings": "embed_tokens"
-        },
+        orig_to_new_substr={"A_log": "A", "embeddings": "embed_tokens"},
     )
 
     packed_modules_mapping = {
@@ -465,7 +721,6 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -505,61 +760,93 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "NemotronH currently does not support prefix caching"
 
         self.quant_config = vllm_config.quant_config
 
         super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
-        self.model = NemotronHModel(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = NemotronHModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
-
-        self.make_empty_intmd_tensors = (self.model.make_empty_intmd_tensors)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+        # Set MoE hyperparameters
+        if self.model.has_moe:
+            self.expert_weights = []
+            self.num_expert_groups = config.n_group
+
+            self.moe_layers = []
+            example_moe = None
+            for layer in self.model.layers:
+                if isinstance(layer, NemotronHMoEDecoderLayer):
+                    # Pick last one layer since the first ones
+                    # may be dense layers.
+                    example_moe = layer.mixer
+                    self.moe_layers.append(layer.mixer.experts)
+
+            self.num_moe_layers = len(self.moe_layers)
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts  # noqa: E501
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer, NemotronHMoEDecoderLayer):
+                moe = layer.mixer
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index d474c8db41b2fc54455aa0b4cb6e12c684027d3d..b839206a3094dd816c25fe082e8cedae9e5c32f3 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -23,9 +23,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only deci model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -40,16 +41,25 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import HasNoOps, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
@@ -66,7 +76,6 @@ def _find_multiple(n: int, k: int) -> int:
 
 
 class DeciLMAttention(LlamaAttention):
-
     def __init__(
         self,
         config: LlamaConfig,
@@ -74,27 +83,43 @@ class DeciLMAttention(LlamaAttention):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
-        super().__init__(config, hidden_size, num_heads, num_kv_heads,
-                         rope_theta, rope_scaling, max_position_embeddings,
-                         quant_config, bias, bias_o_proj, cache_config, prefix,
-                         attn_type)
+        super().__init__(
+            config,
+            hidden_size,
+            num_heads,
+            num_kv_heads,
+            rope_theta,
+            rope_scaling,
+            max_position_embeddings,
+            quant_config,
+            bias,
+            bias_o_proj,
+            cache_config,
+            prefix,
+            attn_type,
+        )
 
-    def _init_rotary_emb(self, config, rope_scaling: Optional[dict[str, Any]],
-                         quant_config: Optional[QuantizationConfig]) -> None:
+    def _init_rotary_emb(
+        self,
+        config,
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
+    ) -> None:
         # Enables YARN for Mistral and LLaMA4 derivatives.
         is_neox_style = True
         if hasattr(config, "position_embedding_type"):
             is_neox_style = config.position_embedding_type not in [
-                "mistral_yarn", "rope_llama4"
+                "mistral_yarn",
+                "rope_llama4",
             ]
 
         self.rotary_emb = get_rope(
@@ -104,17 +129,17 @@ class DeciLMAttention(LlamaAttention):
             base=self.rope_theta,
             rope_scaling=rope_scaling,
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor)
+            partial_rotary_factor=self.partial_rotary_factor,
+        )
 
 
 class DeciLMDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: LlamaConfig,
         layer_idx: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -126,23 +151,26 @@ class DeciLMDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
+            config, "original_max_position_embeddings", None
+        ):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         bias_o_proj = attention_bias
         # support internlm/internlm3-8b with qkv_bias
         if hasattr(config, "qkv_bias"):
             attention_bias = config.qkv_bias
 
         if not self._is_no_op_attention:
-            num_kv_heads = (config.num_attention_heads //
-                            block_config.attention.n_heads_in_group)
+            num_kv_heads = (
+                config.num_attention_heads // block_config.attention.n_heads_in_group
+            )
             self.self_attn = DeciLMAttention(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -157,13 +185,13 @@ class DeciLMDecoderLayer(nn.Module):
                 cache_config=cache_config,
                 prefix=f"{prefix}.self_attn",
             )
-            self.input_layernorm = RMSNorm(config.hidden_size,
-                                           eps=config.rms_norm_eps)
+            self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         if not self._is_no_op_ffn:
             ffn_mult = block_config.ffn.ffn_mult
             intermediate_size = _ffn_mult_to_intermediate_size(
-                ffn_mult, config.hidden_size)
+                ffn_mult, config.hidden_size
+            )
 
             self.mlp = LlamaMLP(
                 hidden_size=self.hidden_size,
@@ -173,26 +201,26 @@ class DeciLMDecoderLayer(nn.Module):
                 bias=getattr(config, "mlp_bias", False),
                 prefix=f"{prefix}.mlp",
             )
-            self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                    eps=config.rms_norm_eps)
+            self.post_attention_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
 
         if self._is_no_op_attention:
             pass
         else:
-            if (residual is None):
+            if residual is None:
                 residual = hidden_states
                 hidden_states = self.input_layernorm(hidden_states)
             else:
-                hidden_states, residual = self.input_layernorm(
-                    hidden_states, residual)
+                hidden_states, residual = self.input_layernorm(hidden_states, residual)
             hidden_states = self.self_attn(
                 positions=positions,
                 hidden_states=hidden_states,
@@ -201,14 +229,14 @@ class DeciLMDecoderLayer(nn.Module):
         # Fully Connected
         if not self._is_no_op_ffn:
             hidden_states, residual = self.post_attention_layernorm(
-                hidden_states, residual)
+                hidden_states, residual
+            )
             hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class DeciModel(nn.Module):
-
     def __init__(
         self,
         *,
@@ -221,21 +249,19 @@ class DeciModel(nn.Module):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -261,25 +287,25 @@ class DeciModel(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -289,24 +315,20 @@ class DeciModel(nn.Module):
         kv_cache_index = 0
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             if not layer._is_no_op_attention:
-                hidden_states, residual = layer(positions, hidden_states,
-                                                residual)
+                hidden_states, residual = layer(positions, hidden_states, residual)
                 kv_cache_index += 1
             else:
-                hidden_states, residual = layer(positions, hidden_states,
-                                                residual)
+                hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -320,19 +342,19 @@ class DeciModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
             if self.quant_config is not None and (
-                    scale_name := self.quant_config.get_cache_scale(name)):
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -365,8 +387,7 @@ class DeciModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -409,73 +430,62 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
-        self.lora_config = lora_config
 
-        self.model = self._init_model(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"))
+        self.model = self._init_model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config else
-                    lora_config.lora_vocab_padding_size),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
             if config.tie_word_embeddings:
-                self.lm_head = self.lm_head.tie_weights(
-                    self.model.embed_tokens)
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
         return DeciModel(vllm_config=vllm_config, prefix=prefix)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 3abbff8c717d49fb611665c08b94c331ff88e865..5a1dda8aac2c14a4399ed2cf16f444cfc76d2df1 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -9,7 +9,6 @@
 # --------------------------------------------------------
 from abc import ABC
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -22,36 +21,45 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.models.internvl import (
-    BaseInternVLDummyInputsBuilder, BaseInternVLMultiModalProcessor,
-    BaseInternVLProcessingInfo, InternVLImageEmbeddingInputs,
-    InternVLImageInputs, InternVLImagePixelInputs, InternVLProcessor)
+    BaseInternVLDummyInputsBuilder,
+    BaseInternVLMultiModalProcessor,
+    BaseInternVLProcessingInfo,
+    InternVLImageEmbeddingInputs,
+    InternVLImageInputs,
+    InternVLImagePixelInputs,
+    InternVLProcessor,
+)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.inputs import NestedTensors
 from vllm.multimodal.processing import PromptUpdateDetails
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processor import (
-    cached_image_processor_from_config)
+from vllm.transformers_utils.processor import cached_image_processor_from_config
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
-IMG_START = '<img>'
-IMG_END = '</img>'
-IMG_CONTEXT = '<image>'
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<image>"
 
 
 def build_transform(input_size: int):
-    return T.Compose([
-        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
-        T.Resize((input_size, input_size),
-                 interpolation=T.InterpolationMode.BICUBIC),
-        T.ToTensor(),
-    ])
+    return T.Compose(
+        [
+            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
+            T.Resize(
+                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
+            ),
+            T.ToTensor(),
+        ]
+    )
 
 
 # adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
@@ -63,15 +71,16 @@ def find_closest_aspect_ratio(
     height: int,
     image_size: int,
 ) -> tuple[int, int]:
-    best_factor = float('-inf')
+    best_factor = float("-inf")
     best_ratio = (1, 1)
     area = width * height
 
     for rw, rh in target_ratios:
         target_aspect_ratio = rw / rh
         size_factor = min((rw * rh * image_size * image_size) / area, 0.6)
-        ratio_closeness = min(target_aspect_ratio / aspect_ratio,
-                              aspect_ratio / target_aspect_ratio)
+        ratio_closeness = min(
+            target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio
+        )
         factor = size_factor * ratio_closeness
 
         if factor > best_factor:
@@ -134,10 +143,12 @@ def dynamic_preprocess_nemotron_vl(
     resized_img = image.resize((target_width, target_height))
     processed_images = []
     for i in range(blocks):
-        box = ((i % (target_width // image_size)) * image_size,
-               (i // (target_width // image_size)) * image_size,
-               ((i % (target_width // image_size)) + 1) * image_size,
-               ((i // (target_width // image_size)) + 1) * image_size)
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
         # split the image
         split_img = resized_img.crop(box)
         processed_images.append(split_img)
@@ -155,10 +166,13 @@ def get_nemotron_vl_target_ratios(
     min_num: int,
     max_num: int,
 ) -> list[tuple[int, int]]:
-    target_ratios = {(i, j)
-                     for n in range(min_num, max_num + 1)
-                     for i in range(1, n + 1)
-                     for j in range(1, n + 1) if min_num <= i * j <= max_num}
+    target_ratios = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if min_num <= i * j <= max_num
+    }
     return sorted(target_ratios, key=lambda x: x[0] * x[1])
 
 
@@ -186,16 +200,15 @@ def image_to_pixel_values_nemotron_vl(
 
 
 class NemotronVLProcessor(InternVLProcessor):
-
     def __init__(
         self,
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         image_processor: BaseImageProcessorFast,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> None:
         ABC.__init__(self)
         self.config = config
@@ -217,7 +230,8 @@ class NemotronVLProcessor(InternVLProcessor):
         assert isinstance(dynamic_image_size, bool)
 
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.image_size = image_size
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
@@ -251,9 +265,9 @@ class NemotronVLProcessor(InternVLProcessor):
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -269,16 +283,17 @@ class NemotronVLProcessor(InternVLProcessor):
                 min_num=min_num,
                 max_num=max_num,
                 use_thumbnail=self.use_thumbnail,
-            ) for image in images
+            )
+            for image in images
         ]
 
     def _preprocess_image(
         self,
         text: list[str],
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> tuple[list[str], dict[str, torch.Tensor]]:
         if len(images) == 0:
             image_inputs = {}
@@ -289,11 +304,11 @@ class NemotronVLProcessor(InternVLProcessor):
                 max_dynamic_patch=max_dynamic_patch,
                 dynamic_image_size=dynamic_image_size,
             )
-            image_inputs: dict[str, NestedTensors] = {
-                "pixel_values_flat":
-                torch.cat(pixel_values_lst),
-                "image_num_patches":
-                torch.tensor([len(item) for item in pixel_values_lst]),
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst]
+                ),
             }
 
             for pixel_values in pixel_values_lst:
@@ -301,17 +316,16 @@ class NemotronVLProcessor(InternVLProcessor):
                 feature_size = num_patches * self.num_image_token
                 image_repl = self.get_image_repl(feature_size, num_patches)
                 NVL_IMAGE_CONTEXT = image_repl.full.replace(
-                    "<image>", "<NVL_IMG_CONTEXT>")
-                text = [
-                    t.replace('<image>', NVL_IMAGE_CONTEXT, 1) for t in text
-                ]
+                    "<image>", "<NVL_IMG_CONTEXT>"
+                )
+                text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
             text = [t.replace("<NVL_IMG_CONTEXT>", IMG_CONTEXT) for t in text]
         return text, image_inputs
 
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -341,12 +355,13 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
 @MULTIMODAL_REGISTRY.register_processor(
     BaseInternVLMultiModalProcessor[NemotronVLProcessingInfo],
     info=NemotronVLProcessingInfo,
-    dummy_inputs=BaseInternVLDummyInputsBuilder[NemotronVLProcessingInfo])
-class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
-                               SupportsLoRA):
+    dummy_inputs=BaseInternVLDummyInputsBuilder[NemotronVLProcessingInfo],
+)
+class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -367,7 +382,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.downsample_ratio = config.downsample_ratio
         self.ps_version = config.ps_version
 
@@ -390,41 +406,45 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
-    def _patch_quant_config(self, config: PretrainedConfig,
-                            quant_config: QuantizationConfig):
+    def _patch_quant_config(
+        self, config: PretrainedConfig, quant_config: QuantizationConfig
+    ):
         # the awq models from OpenGVLab missing `modules_to_not_convert`
         # patch the quant_config to add `modules_to_not_convert` back
         if isinstance(quant_config, AWQConfig):
             text_config = config.text_config
-            llm_quant_config = getattr(text_config, "quantization_config",
-                                       None)
-            if (not quant_config.modules_to_not_convert) and \
-                (llm_quant_config is not None):
+            llm_quant_config = getattr(text_config, "quantization_config", None)
+            if (not quant_config.modules_to_not_convert) and (
+                llm_quant_config is not None
+            ):
                 quant_config.modules_to_not_convert.append("vision_model")
 
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         prefix: str,
     ):
-        return AutoModel.from_config(config.vision_config,
-                                     trust_remote_code=True)
+        return AutoModel.from_config(config.vision_config, trust_remote_code=True)
 
-    def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
+    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
         vit_hidden_size = config.vit_hidden_size
         vision_projection_hidden_size = config.projector_hidden_size
         llm_hidden_size = config.text_config.hidden_size
 
         return nn.Sequential(
-            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2,
-                         bias=True),
-            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2,
-                      vision_projection_hidden_size,
-                      bias=True),
+            nn.LayerNorm(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2, bias=True
+            ),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2,
+                vision_projection_hidden_size,
+                bias=True,
+            ),
             nn.GELU(),
             nn.Linear(vision_projection_hidden_size, llm_hidden_size),
         )
@@ -435,9 +455,13 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
         # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
         x = x.permute(0, 2, 1, 3).contiguous()
-        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
-                   int(c / (scale_factor * scale_factor)))
-        if self.ps_version == 'v1':
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
             pass
         else:
             x = x.permute(0, 2, 1, 3).contiguous()
@@ -448,17 +472,16 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         vit_embeds = self.vision_model(x=pixel_values).features
         vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
 
-        h = w = int(vit_embeds.shape[1]**0.5)
+        h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds,
-                                        scale_factor=self.downsample_ratio)
-        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
-                                        vit_embeds.shape[-1])
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[InternVLImageInputs]:
+        self, **kwargs: object
+    ) -> InternVLImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -467,38 +490,26 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
             return None
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return InternVLImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         image_token_id = kwargs["image_token_id"]
-        assert isinstance(image_token_id, torch.Tensor)
-        self.img_context_token_id = image_token_id.flatten().unique().item()
-
-        if pixel_values_flat is not None:
-            if not isinstance(pixel_values_flat, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat)}")
-
-            if not isinstance(image_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(image_num_patches)}")
+        if isinstance(image_token_id, torch.Tensor):
+            image_token_id = image_token_id.flatten().unique().item()
 
-            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
-            image_num_patches = flatten_bn(image_num_patches, concat=True)
+        assert isinstance(image_token_id, int)
+        self.img_context_token_id = image_token_id
 
+        if pixel_values_flat is not None:
             return InternVLImagePixelInputs(
                 type="pixel_values",
                 pixel_values_flat=pixel_values_flat,
                 num_patches=image_num_patches,
                 resolve_bindings={
                     "h": self.config.force_image_size,
-                    "w": self.config.force_image_size
+                    "w": self.config.force_image_size,
                 },
             )
 
@@ -519,14 +530,12 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return (image_embeds.view(-1,
-                                      self.config.text_config.hidden_size), )
+            return (image_embeds.view(-1, self.config.text_config.hidden_size),)
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
         feature_size = image_embeds.shape[1]
-        image_embeds = image_embeds.view(-1,
-                                         self.config.text_config.hidden_size)
+        image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
         image_feature_sizes = [
             num_patches * feature_size for num_patches in num_patches
         ]
@@ -538,10 +547,11 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values_flat",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values_flat", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
 
         return modalities
 
@@ -551,9 +561,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
@@ -567,51 +575,45 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            context_token_ids = [self.img_context_token_id]
-            assert len(context_token_ids) >= 1
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
             self._set_visual_token_mask(input_ids)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                context_token_ids,
-            )
-        return inputs_embeds
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
-
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         forward_kwargs = {
             "input_ids": input_ids,
             "positions": positions,
@@ -621,8 +623,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         # Only required if the model is mono-architecture
         if self.visual_token_mask is not None:
-            forward_kwargs.update(
-                {"visual_token_mask": self.visual_token_mask})
+            forward_kwargs.update({"visual_token_mask": self.visual_token_mask})
             self.visual_token_mask = None
 
         hidden_states = self.language_model.model(**forward_kwargs)
@@ -631,11 +632,10 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         ## Ignore registered_buffers
         ## see https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/input_conditioner.py#L28 # noqa: E501
         skip_substrs = ["norm_mean", "norm_std"]
@@ -649,4 +649,5 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="mlp1",
-            tower_model="vision_model")
+            tower_model="vision_model",
+        )
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 3bbf4c67604c75d5b1f3fe37e462b32e88b6654b..73dd8dfd0f85dd059431dc1d14ced5ec6c858c55 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -8,31 +8,39 @@
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Mapping, Sequence
-from typing import Optional
 
 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 
 from .intern_vit import InternVisionModel
-from .internvl import (BaseInternVLDummyInputsBuilder,
-                       BaseInternVLMultiModalProcessor,
-                       BaseInternVLProcessingInfo, BaseInternVLProcessor,
-                       InternVLChatModel)
+from .internvl import (
+    BaseInternVLDummyInputsBuilder,
+    BaseInternVLMultiModalProcessor,
+    BaseInternVLProcessingInfo,
+    BaseInternVLProcessor,
+    InternVLChatModel,
+)
 
 IMG_PAD = "<|vision_pad|>"
 
 
 class NVLMProcessor(BaseInternVLProcessor):
-
     @property
     def image_token_id(self) -> int:
         return self.tokenizer.get_vocab()[IMG_PAD]
@@ -40,7 +48,7 @@ class NVLMProcessor(BaseInternVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         if num_patches is None:
             raise NotImplementedError("Embedding inputs are not supported")
@@ -50,8 +58,9 @@ class NVLMProcessor(BaseInternVLProcessor):
             tile_pos_identifiers += ["<tile_global_thumbnail>"]
 
         context_size = feature_size // num_patches
-        features = "".join(identifier + IMG_PAD * context_size
-                           for identifier in tile_pos_identifiers)
+        features = "".join(
+            identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
+        )
 
         # We include the start and end as well because "<Image><tile" is
         # tokenized as ["<Image", "><", "tile"], resulting in assertion error
@@ -62,7 +71,6 @@ class NVLMProcessor(BaseInternVLProcessor):
 
 
 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
         return self.ctx.init_processor(
             NVLMProcessor,
@@ -72,9 +80,7 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
         )
 
 
-class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]
-                             ):
-
+class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -86,22 +92,24 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class NVLMMultiModalProcessor(
-        BaseInternVLMultiModalProcessor[NVLMProcessingInfo]):
-
+class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo]):
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -124,7 +132,8 @@ class NVLMMultiModalProcessor(
 
         def get_replacement_nvlm(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
@@ -154,21 +163,24 @@ class NVLMMultiModalProcessor(
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(NVLMMultiModalProcessor,
-                                        info=NVLMProcessingInfo,
-                                        dummy_inputs=NVLMDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    NVLMMultiModalProcessor,
+    info=NVLMProcessingInfo,
+    dummy_inputs=NVLMDummyInputsBuilder,
+)
 class NVLM_D_Model(InternVLChatModel):
-
-    def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
+    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
         vit_hidden_size = config.vision_config.hidden_size
         llm_intermediate_size = config.text_config.intermediate_size
         llm_hidden_size = config.text_config.hidden_size
 
         return nn.Sequential(
-            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2),
-            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2,
-                      llm_intermediate_size,
-                      bias=False),
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2,
+                llm_intermediate_size,
+                bias=False,
+            ),
             nn.GELU(),
             nn.Linear(llm_intermediate_size, llm_hidden_size, bias=False),
         )
@@ -176,7 +188,7 @@ class NVLM_D_Model(InternVLChatModel):
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
@@ -184,8 +196,9 @@ class NVLM_D_Model(InternVLChatModel):
         if not is_mono:
             vision_feature_layer = config.select_layer
             if vision_feature_layer < 0:
-                num_hidden_layers = config.vision_config.num_hidden_layers \
-                    + vision_feature_layer + 1
+                num_hidden_layers = (
+                    config.vision_config.num_hidden_layers + vision_feature_layer + 1
+                )
             else:
                 num_hidden_layers = vision_feature_layer + 1
 
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 9fa8760073c154e5ae19cb7ae958d1979d9e6f18..487e3f671a45583afb2850846e72fda4f82c9a08 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -23,9 +23,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only OLMo model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -36,49 +36,55 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class OlmoAttention(nn.Module):
     """
     This is the attention block where the output is computed as
-    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """
 
     def __init__(
         self,
         config: OlmoConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = config.num_attention_heads
 
         assert self.hidden_size % self.total_num_heads == 0
         assert self.total_num_heads % tensor_model_parallel_world_size == 0
 
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // self.total_num_heads
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
@@ -102,12 +108,14 @@ class OlmoAttention(nn.Module):
             base=self.rope_theta,
         )
         self.scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scale=self.scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scale=self.scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
         # Attention output projection.
         self.o_proj = RowParallelLinear(
@@ -136,14 +144,14 @@ class OlmoAttention(nn.Module):
 class OlmoMLP(nn.Module):
     """
     This is the MLP block where the output is computed as
-    ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    `MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """
 
     def __init__(
         self,
         config: OlmoConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -185,38 +193,39 @@ class OlmoMLP(nn.Module):
 class OlmoDecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
-    computed as ``MLP(LN(x + Attention(LN(x))))``
+    computed as `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """
 
-    def __init__(self,
-                 config: OlmoConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: OlmoConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         # Attention block.
-        self.self_attn = OlmoAttention(config,
-                                       cache_config,
-                                       quant_config,
-                                       prefix=f"{prefix}.self_attn")
+        self.self_attn = OlmoAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attn"
+        )
 
         # MLP block.
         self.mlp = OlmoMLP(config, quant_config, prefix=f"{prefix}.mlp")
 
         # LayerNorm
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            elementwise_affine=False,
-                                            bias=False)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     elementwise_affine=False,
-                                                     bias=False)
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         # Attention block.
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -233,7 +242,6 @@ class OlmoDecoderLayer(nn.Module):
 
 @support_torch_compile
 class OlmoModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -243,30 +251,33 @@ class OlmoModel(nn.Module):
 
         self.config = config
 
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: OlmoDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
-        self.norm = nn.LayerNorm(config.hidden_size,
-                                 elementwise_affine=False,
-                                 bias=False)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         """
         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
         """
@@ -274,7 +285,7 @@ class OlmoModel(nn.Module):
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -291,8 +302,7 @@ class OlmoModel(nn.Module):
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -304,7 +314,7 @@ class OlmoModel(nn.Module):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -324,8 +334,7 @@ class OlmoModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -335,6 +344,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     """
     Extremely barebones HF model wrapper.
     """
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -352,33 +362,33 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
-        self.model = OlmoModel(vllm_config=vllm_config,
-                               prefix=maybe_prefix(prefix, "model"))
+        self.model = OlmoModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
-            self.unpadded_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -390,15 +400,15 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head.weight"]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(
+                ["lm_head.weight"] if self.config.tie_word_embeddings else None
+            ),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 2e0b1fb2a13f78c57672512a5ad84dbda6a8b724..045582c889ee4f52830841f6e48f8a00bc7e2ff0 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -27,7 +27,6 @@
 from collections.abc import Iterable
 from functools import partial
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -42,18 +41,27 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
 from vllm.model_executor.models.utils import (
-    AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter,
-    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Olmo3Config
 
@@ -61,7 +69,7 @@ from vllm.transformers_utils.configs import Olmo3Config
 class Olmo2Attention(nn.Module):
     """
     This is the attention block where the output is computed as
-    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """
 
@@ -78,8 +86,9 @@ class Olmo2Attention(nn.Module):
         assert self.total_num_heads % self.tp_size == 0
 
         self.num_heads = self.total_num_heads // self.tp_size
-        self.total_num_kv_heads = (self.config.num_key_value_heads
-                                   or self.total_num_heads)
+        self.total_num_kv_heads = (
+            self.config.num_key_value_heads or self.total_num_heads
+        )
         if self.total_num_kv_heads >= self.tp_size:
             assert self.total_num_kv_heads % self.tp_size == 0
         else:
@@ -108,15 +117,15 @@ class Olmo2Attention(nn.Module):
             self.total_num_kv_heads * self.head_dim,
             eps=self.config.rms_norm_eps,
         )
-        self.q_norm = RMSNorm(self.config.hidden_size,
-                              eps=self.config.rms_norm_eps)
+        self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
 
         self.scaling = self.head_dim**-0.5
 
         layer_idx = extract_layer_index(prefix)
         sliding_window = None
-        if ((layer_types := getattr(self.config, "layer_types", None))
-                is not None and layer_types[layer_idx] == "sliding_attention"):
+        if (
+            layer_types := getattr(self.config, "layer_types", None)
+        ) is not None and layer_types[layer_idx] == "sliding_attention":
             sliding_window = self.config.sliding_window
 
         self.attn = Attention(
@@ -132,8 +141,7 @@ class Olmo2Attention(nn.Module):
 
         # Rotary embeddings. Rope scaling is only applied on full attention
         # layers.
-        self.rope_scaling = (self.config.rope_scaling
-                             if sliding_window is None else None)
+        self.rope_scaling = self.config.rope_scaling if sliding_window is None else None
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
@@ -151,16 +159,16 @@ class Olmo2Attention(nn.Module):
             prefix=f"{prefix}.o_proj",
         )
 
-    def _apply_qk_norm(self, q: torch.Tensor,
-                       k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.tp_size > 1:
             q = tensor_model_parallel_all_gather(q.contiguous())
             k = tensor_model_parallel_all_gather(k.contiguous())
         q = self.q_norm(q)
         k = self.k_norm(k)
         if self.tp_size > 1:
-            splitter = partial(split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
         return q, k
@@ -182,7 +190,7 @@ class Olmo2Attention(nn.Module):
 class Olmo2MLP(nn.Module):
     """
     This is the MLP block where the output is computed as
-    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    `MLP(x)` in `LN(MLP(x + LN(Attention(x))))`
     (plus another skip connection).
     """
 
@@ -227,7 +235,7 @@ class Olmo2MLP(nn.Module):
 class Olmo2DecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
-    computed as ``MLP(LN(x + Attention(LN(x))))``
+    computed as `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """
 
@@ -236,18 +244,21 @@ class Olmo2DecoderLayer(nn.Module):
         config = vllm_config.model_config.hf_config
         assert isinstance(config, (Olmo2Config, Olmo3Config))
         # Attention block.
-        self.self_attn = Olmo2Attention(vllm_config=vllm_config,
-                                        prefix=f"{prefix}.self_attn")
+        self.self_attn = Olmo2Attention(
+            vllm_config=vllm_config, prefix=f"{prefix}.self_attn"
+        )
 
         # MLP block.
         self.mlp = Olmo2MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
 
         # LayerNorm
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
-        self.post_feedforward_layernorm = RMSNorm(config.hidden_size,
-                                                  eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
@@ -270,7 +281,6 @@ class Olmo2DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Olmo2Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
@@ -283,25 +293,27 @@ class Olmo2Model(nn.Module):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             self.config.num_hidden_layers,
-            lambda prefix: Olmo2DecoderLayer(vllm_config=vllm_config,
-                                             prefix=prefix),
+            lambda prefix: Olmo2DecoderLayer(vllm_config=vllm_config, prefix=prefix),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(
             self.config.hidden_size,
             eps=self.config.rms_norm_eps,
         )
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    self.config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], self.config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         """
         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
         """
@@ -331,8 +343,7 @@ class Olmo2Model(nn.Module):
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -363,8 +374,7 @@ class Olmo2Model(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -374,6 +384,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     """
     Extremely barebones HF model wrapper.
     """
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -391,30 +402,33 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         config = vllm_config.model_config.hf_config
         assert isinstance(config, (Olmo2Config, Olmo3Config))
         self.config = config
-        self.model = Olmo2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = Olmo2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
-            self.unpadded_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
                 config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=vllm_config.quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -426,14 +440,15 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head.weight"]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(
+                ["lm_head.weight"] if self.config.tie_word_embeddings else None
+            ),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 77ece544d4900a1cd57253235358b078ad5367ba..499eb05de76e45249fd4dc3a938ffb0d400b2382 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -13,40 +13,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only OLMoE model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from functools import partial
 from itertools import islice
-from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import OlmoeConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.config import VllmConfig
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -60,33 +70,36 @@ class OlmoeMoE(nn.Module):
     across ranks.
     """
 
-    def __init__(self,
-                 num_experts: int,
-                 top_k: int,
-                 hidden_size: int,
-                 intermediate_size: int,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 tp_size: Optional[int] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = hidden_size
 
         # Gate always runs at half / full precision for now.
-        self.gate = ReplicatedLinear(hidden_size,
-                                     num_experts,
-                                     bias=False,
-                                     quant_config=None)
-
-        self.experts = FusedMoE(num_experts=num_experts,
-                                top_k=top_k,
-                                hidden_size=hidden_size,
-                                intermediate_size=intermediate_size,
-                                reduce_results=True,
-                                renormalize=False,
-                                quant_config=quant_config,
-                                tp_size=tp_size,
-                                prefix=f"{prefix}.experts")
+        self.gate = ReplicatedLinear(
+            hidden_size, num_experts, bias=False, quant_config=None
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=True,
+            renormalize=False,
+            quant_config=quant_config,
+            tp_size=tp_size,
+            prefix=f"{prefix}.experts",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -95,27 +108,28 @@ class OlmoeMoE(nn.Module):
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
         return final_hidden_states.view(orig_shape)
 
 
 class OlmoeAttention(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        max_position_embeddings: int = 4096,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
-        self.hidden_size = hidden_size
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
+
+        num_heads = config.num_attention_heads
+        num_kv_heads = config.num_key_value_heads
+
         tp_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = num_heads
         assert self.total_num_heads % tp_size == 0
@@ -130,7 +144,7 @@ class OlmoeAttention(nn.Module):
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        self.head_dim = self.hidden_size // self.total_num_heads
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -138,23 +152,24 @@ class OlmoeAttention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
 
         self.qkv_proj = QKVParallelLinear(
-            hidden_size,
+            self.hidden_size,
             self.head_dim,
             self.total_num_heads,
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.tp_size = tp_size
         self.tp_rank = get_tensor_model_parallel_rank()
         self.q_norm = RMSNorm(self.total_num_heads * self.head_dim, eps=1e-5)
-        self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim,
-                              eps=1e-5)
+        self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim, eps=1e-5)
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
-            hidden_size,
+            self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -165,24 +180,26 @@ class OlmoeAttention(nn.Module):
             rope_scaling=rope_scaling,
             is_neox_style=True,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
-
-    def _apply_qk_norm(self, q: torch.Tensor,
-                       k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.tp_size > 1:
             q = tensor_model_parallel_all_gather(q.contiguous())
             k = tensor_model_parallel_all_gather(k.contiguous())
         q = self.q_norm(q)
         k = self.k_norm(k)
         if self.tp_size > 1:
-            splitter = partial(split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
         return q, k
@@ -202,30 +219,15 @@ class OlmoeAttention(nn.Module):
 
 
 class OlmoeDecoderLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: OlmoeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
         self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          4096)
 
         self.self_attn = OlmoeAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            cache_config=cache_config,
-            quant_config=quant_config,
+            vllm_config=vllm_config,
             prefix=f"{prefix}.self_attn",
         )
 
@@ -244,15 +246,14 @@ class OlmoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.self_attn(
             positions=positions,
@@ -260,21 +261,23 @@ class OlmoeDecoderLayer(nn.Module):
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class OlmoeModel(nn.Module):
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = OlmoeDecoderLayer,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
 
         self.vocab_size = config.vocab_size
         self.config = config
@@ -284,30 +287,30 @@ class OlmoeModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: OlmoeDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+            lambda prefix: layer_type(vllm_config=vllm_config, prefix=prefix),
+            prefix=f"{prefix}.layers",
+        )
         self.norm = RMSNorm(config.hidden_size, eps=1e-5)
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -322,12 +325,14 @@ class OlmoeModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
-        hidden_states, _ = self.norm(hidden_states, residual)
+        if residual is not None:
+            hidden_states, _ = self.norm(hidden_states, residual)
+        else:
+            hidden_states = self.norm(hidden_states)
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
@@ -337,24 +342,22 @@ class OlmoeModel(nn.Module):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_experts)
+            num_experts=self.config.num_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
             ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
         ]
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -391,11 +394,13 @@ class OlmoeModel(nn.Module):
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -407,7 +412,8 @@ class OlmoeModel(nn.Module):
                     # Remapping the name of FP8 kv-scale.
                     if name.endswith("kv_scale"):
                         remapped_kv_scale_name = name.replace(
-                            ".kv_scale", ".attn.kv_scale")
+                            ".kv_scale", ".attn.kv_scale"
+                        )
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
                                 "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
@@ -419,63 +425,72 @@ class OlmoeModel(nn.Module):
                             name = remapped_kv_scale_name
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class OlmoeForCausalLM(nn.Module, SupportsPP):
+class OlmoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }
 
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = OlmoeDecoderLayer,
+    ):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = OlmoeModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.model = OlmoeModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            layer_type=layer_type,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
new file mode 100644
index 0000000000000000000000000000000000000000..d13a745beffebfbfe084fa1064c1fc65c6529bba
--- /dev/null
+++ b/vllm/model_executor/models/openpangu.py
@@ -0,0 +1,1062 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+from collections.abc import Callable, Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention, AttentionType
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_gather,
+)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.interfaces import (
+    MixtureOfExperts,
+    SupportsLoRA,
+    SupportsPP,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+    sequence_parallel_chunk,
+)
+from vllm.sequence import IntermediateTensors
+
+
+def check_ffn_act_fn(act_fn: str):
+    if act_fn != "silu":
+        raise ValueError(
+            f"Unsupported activation: {act_fn}. Only silu is supported for now."
+        )
+
+
+class OpenPanguMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        reduce_results: bool = True,
+        is_sequence_parallel=False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            disable_tp=is_sequence_parallel,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            disable_tp=is_sequence_parallel,
+            prefix=f"{prefix}.down_proj",
+        )
+
+        check_ffn_act_fn(hidden_act)
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act_fn(self.gate_up_proj(x)[0]))[0]
+
+
+class OpenPanguMoE(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        parallel_config: ParallelConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tp_group().rank_in_group
+
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.n_routed_experts
+        self.n_shared_experts: int = config.n_shared_experts
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        check_ffn_act_fn(config.hidden_act)
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+        self.gate.e_score_correction_bias = None
+
+        # Load balancing settings.
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = OpenPanguMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                is_sequence_parallel=self.is_sequence_parallel,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=1,
+            topk_group=1,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            routed_scaling_factor=1.0,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        router_logits, _ = self.gate(hidden_states)
+
+        fused_moe_out = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        shared_output, final_hidden_states = fused_moe_out
+        if self.shared_experts is None:
+            assert shared_output is None
+
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= 1.0 / self.routed_scaling_factor
+
+        if self.shared_experts is not None:
+            assert shared_output is not None
+            final_hidden_states += shared_output
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class OpenPanguMLAAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.rope_theta = rope_theta
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        if num_heads % self.tp_size != 0:
+            raise ValueError(
+                f"num_heads {num_heads} is not divisible by tp_size {self.tp_size}."
+            )
+        self.num_local_heads = num_heads // self.tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.prefix = prefix
+
+        if self.q_lora_rank is not None:
+            self.fused_qkv_a_proj = MergedColumnParallelLinear(
+                self.hidden_size,
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fused_qkv_a_proj",
+                disable_tp=True,
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.kv_a_proj_with_mqa",
+            )
+
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # TODO: remove hard coding
+        rope_scaling = {
+            "beta_fast": 32,
+            "beta_slow": 1,
+            "factor": 1,
+            "mscale": 1.0,
+            "mscale_all_dim": 1.0,
+            "original_max_position_embeddings": max_position_embeddings,
+            "type": "yarn",
+            "rope_type": "deepseek_yarn",
+        }
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
+
+        mla_modules = MLAModules(
+            kv_a_layernorm=self.kv_a_layernorm,
+            kv_b_proj=self.kv_b_proj,
+            rotary_emb=self.rotary_emb,
+            o_proj=self.o_proj,
+            fused_qkv_a_proj=self.fused_qkv_a_proj
+            if self.q_lora_rank is not None
+            else None,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa
+            if self.q_lora_rank is None
+            else None,
+            q_a_layernorm=self.q_a_layernorm if self.q_lora_rank is not None else None,
+            q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
+            q_proj=self.q_proj if self.q_lora_rank is None else None,
+            indexer=None,
+            is_sparse=False,
+            topk_indices_buffer=None,
+        )
+
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
+            self.hidden_size,
+            self.num_local_heads,
+            self.scaling,
+            self.qk_nope_head_dim,
+            self.qk_rope_head_dim,
+            self.v_head_dim,
+            self.q_lora_rank,
+            self.kv_lora_rank,
+            mla_modules,
+            cache_config,
+            quant_config,
+            prefix,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.mla_attn(positions, hidden_states)
+
+
+class OpenPanguEmbeddedAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        if self.total_num_heads % tp_size != 0:
+            raise ValueError(
+                f"total_num_heads {self.total_num_heads} "
+                f"is not divisible by tp_size {tp_size}."
+            )
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads > tp_size and self.total_num_kv_heads % tp_size != 0:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel ranks.
+            raise ValueError(
+                "Number of KV heads is greater than TP size, "
+                f"but total_num_kv_heads {self.total_num_kv_heads} "
+                f"is not divisible by tp_size {tp_size}."
+            )
+        elif (
+            self.total_num_kv_heads < tp_size and tp_size % self.total_num_kv_heads != 0
+        ):
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel ranks.
+            raise ValueError(
+                f"Number of KV heads is less than TP size, but tp_size {tp_size} "
+                f"is not divisible by total_num_kv_heads {self.total_num_kv_heads}."
+            )
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        head_dim = getattr(config, "head_dim", None)
+        if head_dim is None:
+            head_dim = self.hidden_size // self.total_num_heads
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self._init_rotary_emb(
+            config, rope_scaling=rope_scaling, quant_config=quant_config
+        )
+
+        if hasattr(config, "interleaved_sliding_window"):
+            interleaved_sliding_window = config.interleaved_sliding_window
+            if isinstance(interleaved_sliding_window, int):
+                sliding_window = interleaved_sliding_window
+            elif isinstance(interleaved_sliding_window, list):
+                sw_idx = layer_idx % len(interleaved_sliding_window)
+                sliding_window = interleaved_sliding_window[sw_idx]
+            else:
+                raise ValueError(
+                    f"{type(interleaved_sliding_window)} "
+                    "for interleaved_sliding_window is not supported."
+                )
+        else:
+            sliding_window = None
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=sliding_window,
+            attn_type=attn_type,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def _init_rotary_emb(
+        self,
+        config: PretrainedConfig,
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
+    ) -> None:
+        is_neox_style = True
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "PanguEmbedded":
+            is_neox_style = False
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+        )
+
+
+class OpenPanguDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        vllm_config: VllmConfig,
+    ) -> None:
+        super().__init__()
+
+        if config is None:
+            config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        parallel_config = vllm_config.parallel_config
+
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        layer_idx = int(prefix.split(sep=".")[-1])
+        self.layer_idx = layer_idx
+
+        self.use_mla = (
+            hasattr(config, "qk_nope_head_dim")
+            and hasattr(config, "qk_rope_head_dim")
+            and hasattr(config, "v_head_dim")
+            and hasattr(config, "kv_lora_rank")
+        )
+        if self.use_mla:
+            self.self_attn = OpenPanguMLAAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                qk_nope_head_dim=config.qk_nope_head_dim,
+                qk_rope_head_dim=config.qk_rope_head_dim,
+                v_head_dim=config.v_head_dim,
+                q_lora_rank=(
+                    config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+                ),
+                kv_lora_rank=config.kv_lora_rank,
+                rope_theta=rope_theta,
+                max_position_embeddings=max_position_embeddings,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.self_attn",
+            )
+        else:
+            attention_bias = getattr(config, "attention_bias", False) or getattr(
+                config, "bias", False
+            )
+            bias_o_proj = attention_bias
+            if hasattr(config, "qkv_bias"):
+                attention_bias = config.qkv_bias
+            # By default, PanguEmbedded uses causal attention
+            # as it is a decoder-only model.
+            # You can override the HF config with `is_causal=False` to enable
+            # bidirectional attention, which is used in some embedding models
+            if getattr(config, "is_causal", True):
+                attn_type = AttentionType.DECODER
+            else:
+                attn_type = AttentionType.ENCODER_ONLY
+            self.self_attn = OpenPanguEmbeddedAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                num_kv_heads=getattr(
+                    config, "num_key_value_heads", config.num_attention_heads
+                ),
+                rope_theta=rope_theta,
+                rope_scaling=getattr(config, "rope_scaling", None),
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                bias=attention_bias,
+                bias_o_proj=bias_o_proj,
+                cache_config=cache_config,
+                prefix=f"{prefix}.self_attn",
+                attn_type=attn_type,
+            )
+
+        if (
+            getattr(config, "n_routed_experts", None) is not None
+            and layer_idx >= config.first_k_dense_replace
+        ):
+            self.mlp = OpenPanguMoE(
+                config=config,
+                parallel_config=parallel_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = OpenPanguMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                bias=getattr(config, "mlp_bias", False),
+                prefix=f"{prefix}.mlp",
+            )
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", None)
+        self.num_hidden_layers = config.num_hidden_layers
+        self.first_k_dense_replace = getattr(
+            config, "first_k_dense_replace", self.num_hidden_layers
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.tp_group = get_tp_group().device_group
+        self.sandwich_norm = getattr(config, "sandwich_norm", False)
+        if self.sandwich_norm:
+            self.pre_mlp_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+            self.post_mlp_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor:
+        if residual is None:
+            residual = hidden_states.clone()
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        if (
+            self.routed_scaling_factor is not None
+            and hidden_states.dtype == torch.float16
+        ):
+            # Fix FP16 overflow
+            # We scale both hidden_states and residual before
+            # rmsnorm, and rmsnorm result would not affect by scale.
+            hidden_states *= 1.0 / self.routed_scaling_factor
+            if self.layer_idx == 0:
+                # The residual is shared by all layers, we only scale it on
+                # first layer.
+                residual *= 1.0 / self.routed_scaling_factor
+
+        if self.sandwich_norm:
+            hidden_states = self.post_attention_layernorm(hidden_states)
+            hidden_states, residual = self.pre_mlp_layernorm(hidden_states, residual)
+        else:
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual
+            )
+
+        # Fully Connected
+        hidden_states = self.mlp(hidden_states)
+
+        if (
+            self.routed_scaling_factor is not None
+            and isinstance(self.mlp, OpenPanguMLP)
+            and hidden_states.dtype == torch.float16
+        ):
+            hidden_states *= 1.0 / self.routed_scaling_factor
+
+        if self.sandwich_norm:
+            hidden_states = self.post_mlp_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class OpenPanguModel(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.config = config
+        self.num_redundant_experts = eplb_config.num_redundant_experts
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: OpenPanguDecoderLayer(config, prefix, vllm_config),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_attn_mlp_weight(
+        self,
+        attn_mlp_replace_mapping: list[tuple[str, str, int]],
+        params_dict: dict[str, Any],
+        weight_name: str,
+        loaded_weight: torch.Tensor,
+        loaded_params: set[str],
+    ) -> bool:
+        for param_name, origin_name, shard_id in attn_mlp_replace_mapping:
+            if origin_name not in weight_name or (
+                ("mlp.experts." in weight_name) and weight_name not in params_dict
+            ):
+                continue
+            weight_name_mapped = weight_name.replace(origin_name, param_name)
+            if (
+                param_name == "fused_qkv_a_proj"
+                and weight_name_mapped not in params_dict
+            ):
+                continue
+            else:
+                weight_name = weight_name_mapped
+            if weight_name.endswith(".bias") and weight_name not in params_dict:
+                continue
+            if is_pp_missing_parameter(weight_name, self):
+                continue
+
+            param = params_dict[weight_name]
+            weight_loader = param.weight_loader
+            weight_loader(param, loaded_weight, shard_id)
+            loaded_params.add(weight_name)
+            return True
+        return False
+
+    def load_expert_weight(
+        self,
+        expert_merge_mapping: list[tuple[str, str, int, str]],
+        params_dict: dict[str, Any],
+        weight_name: str,
+        loaded_weight: torch.Tensor,
+        loaded_params: set[str],
+        flag_dict: dict[str, bool],
+    ) -> bool:
+        for mapping in expert_merge_mapping:
+            param_name, origin_name, expert_id, shard_id = mapping
+            if origin_name not in weight_name:
+                continue
+            flag_dict["is_expert_weight"] = True
+            weight_name_mapped = weight_name.replace(origin_name, param_name)
+            if is_pp_missing_parameter(weight_name_mapped, self):
+                continue
+            param = params_dict[weight_name_mapped]
+            weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
+            success = weight_loader(
+                param,
+                loaded_weight,
+                weight_name_mapped,
+                shard_id=shard_id,
+                expert_id=expert_id,
+                return_success=True,
+            )
+            if success:
+                weight_name = weight_name_mapped
+                loaded_params.add(weight_name_mapped)
+                return True
+        return False
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        attn_mlp_replace_mapping = [
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".fused_qkv_a_proj", ".q_a_proj", 0),
+            (".fused_qkv_a_proj", ".kv_a_proj_with_mqa", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        has_experts = hasattr(self.config, "n_routed_experts")
+        if has_experts:
+            expert_merge_mapping = SharedFusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.n_routed_experts,
+                num_redundant_experts=self.num_redundant_experts,
+            )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            if (
+                "layers" in name
+                and hasattr(self.config, "num_nextn_predict_layers")
+                and (self.config.num_nextn_predict_layers > 0)
+            ):
+                layer_idx = int(name.split("layers.")[-1].split(".")[0])
+                mtp_idx = layer_idx - self.config.num_hidden_layers
+                if mtp_idx >= 0 and mtp_idx < self.config.num_nextn_predict_layers:
+                    continue  # skip spec decode layers for main model
+
+            flag_dict = {"is_expert_weight": False}
+            if (
+                self.load_attn_mlp_weight(
+                    attn_mlp_replace_mapping,
+                    params_dict,
+                    name,
+                    loaded_weight,
+                    loaded_params,
+                )
+                or has_experts
+                and self.load_expert_weight(
+                    expert_merge_mapping,
+                    params_dict,
+                    name,
+                    loaded_weight,
+                    loaded_params,
+                    flag_dict,
+                )
+            ):
+                continue
+            else:
+                if flag_dict["is_expert_weight"]:
+                    continue
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+        return loaded_params
+
+
+class OpenPanguModelBase(nn.Module, SupportsPP, SupportsLoRA):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.model = OpenPanguModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if config.tie_word_embeddings:
+                self.lm_head.weight = self.model.embed_tokens.weight
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
+
+
+class OpenPanguMoEModel(OpenPanguModelBase, MixtureOfExperts):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        config = vllm_config.model_config.hf_config
+
+        # Set MoE hyperparameters
+        self.expert_weights = []
+        self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace
+        self.num_expert_groups = 1
+
+        self.moe_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, OpenPanguDecoderLayer)
+            if isinstance(layer.mlp, OpenPanguMoE):
+                # Pick last one layer since the first ones may be dense layers.
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None:
+            raise RuntimeError("No MOE layer found in model.layers.")
+
+        self.num_logical_experts = example_moe.n_logical_experts
+        self.num_physical_experts = example_moe.n_physical_experts
+        self.num_local_physical_experts = example_moe.n_local_physical_experts
+        self.n_routed_experts = example_moe.n_routed_experts
+        self.n_shared_experts = example_moe.n_shared_experts
+        self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, OpenPanguMoE):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+
+class OpenPanguEmbeddedModel(OpenPanguModelBase):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+
+class PanguEmbeddedForCausalLM(OpenPanguEmbeddedModel):
+    pass
+
+
+class PanguUltraMoEForCausalLM(OpenPanguMoEModel):
+    pass
diff --git a/vllm/model_executor/models/openpangu_mtp.py b/vllm/model_executor/models/openpangu_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..436b7f981b1f909365326d706079c9382580cbdf
--- /dev/null
+++ b/vllm/model_executor/models/openpangu_mtp.py
@@ -0,0 +1,265 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.deepseek_mtp import (
+    DeepSeekMultiTokenPredictor,
+    DeepSeekMultiTokenPredictorLayer,
+    SharedHead,
+)
+from vllm.model_executor.models.utils import maybe_prefix
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsPP
+from .openpangu import OpenPanguDecoderLayer
+
+
+class OpenPanguMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
+    def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
+        nn.Module.__init__(self)
+
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        self.config = config
+        quant_config = vllm_config.quant_config
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+        self.shared_head = SharedHead(
+            config=config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "shared_head"),
+        )
+        self.mtp_block = OpenPanguDecoderLayer(config, prefix, vllm_config)
+
+
+class OpenPanguMultiTokenPredictor(DeepSeekMultiTokenPredictor):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        nn.Module.__init__(self)
+        config = vllm_config.model_config.hf_config
+        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.num_mtp_layers = config.num_nextn_predict_layers
+        # to map the exact layer index from weights
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): OpenPanguMultiTokenPredictorLayer(
+                    vllm_config, f"{prefix}.layers.{idx}"
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+
+@support_torch_compile
+class OpenPanguMTP(nn.Module, SupportsPP):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.config = vllm_config.model_config.hf_config
+        self.model = OpenPanguMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            hidden_states,
+            inputs_embeds,
+            spec_step_idx,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor | None:
+        return self.model.compute_logits(hidden_states, spec_step_idx)
+
+    def get_spec_layer(self, name):
+        if (
+            "layers" in name
+            and hasattr(self.config, "num_nextn_predict_layers")
+            and self.config.num_nextn_predict_layers > 0
+        ):
+            layer_idx = int(name.split("layers.")[-1].split(".")[0])
+            mtp_idx = layer_idx - self.config.num_hidden_layers
+            if mtp_idx >= 0 and mtp_idx < self.config.num_nextn_predict_layers:
+                return layer_idx
+        return None
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+            ("fused_qkv_a_proj", "q_a_proj", 0),
+            ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            spec_layer = self.get_spec_layer(name)
+            if spec_layer is None:
+                continue
+
+            name = self._rewrite_spec_layer_name(spec_layer, name)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name_mapped = name.replace(weight_name, param_name)
+
+                # QKV fusion is optional, fall back to normal
+                # weight loading if it's not enabled
+                if (
+                    param_name == "fused_qkv_a_proj"
+                ) and name_mapped not in params_dict:
+                    continue
+                else:
+                    name = name_mapped
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if (
+                        spec_layer != self.model.mtp_start_layer_idx
+                        and ".layers" not in name
+                    ):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
+        """
+        Rewrite the weight name to match the format of the original model.
+        Add .mtp_block for modules in transformer layer block for spec layer
+        and rename shared layer weights to be top level.
+        """
+        spec_layer_weight_names = [
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
+        ]
+        shared_weight_names = ["embed_tokens"]
+        spec_layer_weight = False
+        shared_weight = False
+        for weight_name in spec_layer_weight_names:
+            if weight_name in name:
+                spec_layer_weight = True
+                if weight_name in shared_weight_names:
+                    shared_weight = True
+                break
+        if not spec_layer_weight:
+            # treat rest weights as weights for transformer layer block
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
+        elif shared_weight:
+            # treat shared weights as top level weights
+            name = name.replace(f"model.layers.{spec_layer}.", "model.")
+        return name
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 4c3ce9f61efb348d347ae19f316549bdc49e4f26..5df700d1a2e17f560c3733fd69dc42cccfc6925d 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -19,9 +19,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only OPT model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -32,25 +32,33 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class OPTLearnedPositionalEmbedding(nn.Embedding):
-
     def __init__(self, num_embeddings: int, embedding_dim: int):
         # OPT is set up so that if padding_idx is specified then offset the
         # embedding ids by 2 and adjust num_embeddings appropriately. Other
@@ -63,20 +71,18 @@ class OPTLearnedPositionalEmbedding(nn.Embedding):
 
 
 class OPTAttention(nn.Module):
-
     def __init__(
         self,
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.embed_dim = embed_dim
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         total_num_heads = num_heads
         assert num_heads % tensor_model_parallel_world_size == 0
         self.num_heads = total_num_heads // tensor_model_parallel_world_size
@@ -98,12 +104,14 @@ class OPTAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.out_proj",
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scale=self.scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scale=self.scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -117,12 +125,11 @@ class OPTAttention(nn.Module):
 
 
 class OPTDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: OPTConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -139,8 +146,8 @@ class OPTDecoderLayer(nn.Module):
         self.do_layer_norm_before = config.do_layer_norm_before
 
         self.self_attn_layer_norm = nn.LayerNorm(
-            self.embed_dim,
-            elementwise_affine=config.layer_norm_elementwise_affine)
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
+        )
         self.fc1 = ColumnParallelLinear(
             self.embed_dim,
             config.ffn_dim,
@@ -157,8 +164,8 @@ class OPTDecoderLayer(nn.Module):
             prefix=f"{prefix}.fc2",
         )
         self.final_layer_norm = nn.LayerNorm(
-            self.embed_dim,
-            elementwise_affine=config.layer_norm_elementwise_affine)
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
+        )
 
     def forward(
         self,
@@ -191,12 +198,11 @@ class OPTDecoderLayer(nn.Module):
 
 
 class OPTDecoder(nn.Module):
-
     def __init__(
         self,
         config: OPTConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -210,24 +216,29 @@ class OPTDecoder(nn.Module):
         )
         # Positional embeddings are replicated (not sharded).
         self.embed_positions = OPTLearnedPositionalEmbedding(
-            config.max_position_embeddings, config.hidden_size)
+            config.max_position_embeddings, config.hidden_size
+        )
 
         # Project out & in will be replicated if they exist.
         if config.word_embed_proj_dim != config.hidden_size:
-            self.project_out = ReplicatedLinear(config.hidden_size,
-                                                config.word_embed_proj_dim,
-                                                bias=False,
-                                                quant_config=quant_config,
-                                                prefix=f"{prefix}.project_out")
+            self.project_out = ReplicatedLinear(
+                config.hidden_size,
+                config.word_embed_proj_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.project_out",
+            )
         else:
             self.project_out = None
 
         if config.word_embed_proj_dim != config.hidden_size:
-            self.project_in = ReplicatedLinear(config.word_embed_proj_dim,
-                                               config.hidden_size,
-                                               bias=False,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.project_in")
+            self.project_in = ReplicatedLinear(
+                config.word_embed_proj_dim,
+                config.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.project_in",
+            )
         else:
             self.project_in = None
 
@@ -238,29 +249,32 @@ class OPTDecoder(nn.Module):
         if config.do_layer_norm_before and not config._remove_final_layer_norm:
             self.final_layer_norm = nn.LayerNorm(
                 config.hidden_size,
-                elementwise_affine=config.layer_norm_elementwise_affine)
+                elementwise_affine=config.layer_norm_elementwise_affine,
+            )
         else:
             self.final_layer_norm = None
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: OPTDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings(input_ids)
+                inputs_embeds = self.embed_input_ids(input_ids)
             pos_embeds = self.embed_positions(positions)
             if self.project_in is not None:
                 inputs_embeds, _ = self.project_in(inputs_embeds)
@@ -283,7 +297,6 @@ class OPTDecoder(nn.Module):
 
 @support_torch_compile
 class OPTModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -291,31 +304,28 @@ class OPTModel(nn.Module):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
-        self.decoder = OPTDecoder(config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.decoder")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.decoder = OPTDecoder(
+            config, cache_config, quant_config, prefix=f"{prefix}.decoder"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.decoder.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.decoder.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        return self.decoder(input_ids,
-                            positions,
-                            intermediate_tensors,
-                            inputs_embeds=inputs_embeds)
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        return self.decoder(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -325,7 +335,7 @@ class OPTModel(nn.Module):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -345,22 +355,22 @@ class OPTModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class OPTForCausalLM(nn.Module, SupportsPP):
+class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
     }
 
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "decoder.": "model.decoder.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "decoder.": "model.decoder.",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -368,45 +378,49 @@ class OPTForCausalLM(nn.Module, SupportsPP):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = OPTModel(vllm_config=vllm_config,
-                              prefix=maybe_prefix(prefix, "model"))
+        self.model = OPTModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         if self.config.tie_word_embeddings:
             self.lm_head = self.model.decoder.embed_tokens
         else:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.word_embed_proj_dim,
-                                          prefix=maybe_prefix(
-                                              prefix, "lm_head"))
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.word_embed_proj_dim,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head.weight"]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(
+                ["lm_head.weight"] if self.config.tie_word_embeddings else None
+            ),
         )
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 586fea343d6f9abaf53e07b174144e3b2168015d..859cd2cecf897ffcb1a819f753abea0975b35ca3 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -6,9 +6,10 @@
 # Copyright (c) OrionStar Inc.
 # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE
 """Inference-only Orion-14B model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -19,44 +20,59 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class OrionMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -67,17 +83,16 @@ class OrionMLP(nn.Module):
 
 
 class OrionAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -110,12 +125,14 @@ class OrionAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -125,13 +142,15 @@ class OrionAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -147,20 +166,18 @@ class OrionAttention(nn.Module):
 
 
 class OrionDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = OrionAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -177,12 +194,13 @@ class OrionDecoderLayer(nn.Module):
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
 
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.rms_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=config.rms_norm_eps)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
@@ -209,7 +227,6 @@ class OrionDecoderLayer(nn.Module):
 
 @support_torch_compile
 class OrionModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -226,43 +243,48 @@ class OrionModel(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: OrionDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
         self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory([
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            [
                 "hidden_states",
-            ], config.hidden_size))
+            ],
+            config.hidden_size,
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(positions, hidden_states)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                }
+            )
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -274,7 +296,7 @@ class OrionModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -294,55 +316,57 @@ class OrionModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class OrionForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = OrionModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.model = OrionModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db6c317c26a89097f6c2143ce640dd6f64e0210
--- /dev/null
+++ b/vllm/model_executor/models/ouro.py
@@ -0,0 +1,516 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Ouro model compatible with HuggingFace weights."""
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention, AttentionType
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsLoRA
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+
+class OuroMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class OuroAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.dual_chunk_attention_config = dual_chunk_attention_config
+
+        # Get total_ut_steps from config, default to 4 if not specified
+        total_ut_steps = getattr(config, "total_ut_steps", 4)
+
+        # Use total number of hidden layers instead of hardcoded 24
+        total_layers = config.num_hidden_layers
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = nn.ModuleList()
+        for ut_step in range(total_ut_steps):
+            base_layer_idx = extract_layer_index(prefix)
+            unique_layer_idx = ut_step * total_layers + base_layer_idx
+
+            unique_prefix = prefix.replace(
+                f"layers.{base_layer_idx}", f"layers.{unique_layer_idx}"
+            )
+
+            self.attn.append(
+                Attention(
+                    self.num_heads,
+                    self.head_dim,
+                    self.scaling,
+                    num_kv_heads=self.num_kv_heads,
+                    cache_config=cache_config,
+                    quant_config=quant_config,
+                    attn_type=attn_type,
+                    prefix=f"{unique_prefix}.attn",
+                    **{
+                        "layer_idx": unique_layer_idx,
+                        "dual_chunk_attention_config": dual_chunk_attention_config,
+                    }
+                    if dual_chunk_attention_config
+                    else {},
+                )
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        current_ut: int,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn[current_ut](q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OuroDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Requires transformers > 4.32.0
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
+        self.self_attn = OuroAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            rope_scaling=rope_scaling,
+            prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.mlp = OuroMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm_2 = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        current_ut: int,
+        residual: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions, hidden_states=hidden_states, current_ut=current_ut
+        )
+        hidden_states = self.input_layernorm_2(hidden_states)
+
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_attention_layernorm_2(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
+class OuroModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = OuroDecoderLayer,
+    ):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        # TODO (@robertgshaw2): see if this can be moved out
+        if cache_config.sliding_window is not None and hasattr(
+            config, "max_window_layers"
+        ):
+            assert config.max_window_layers == config.num_hidden_layers, (
+                "Sliding window for some but all layers is not supported. "
+                "This model uses sliding window but `max_window_layers` = {} "
+                "is less than `num_hidden_layers` = {}. Please open an issue "
+                "to discuss this feature.".format(
+                    config.max_window_layers,
+                    config.num_hidden_layers,
+                )
+            )
+
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
+        )
+
+        # Use the provided decoder layer type or default to OuroDecoderLayer
+        decoder_layer_type = decoder_layer_type or OuroDecoderLayer
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: decoder_layer_type(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.early_exit_gate = RowParallelLinear(config.hidden_size, 1, bias=True)
+
+        self.total_ut_steps = getattr(self.config, "total_ut_steps", 4)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+
+        for current_ut in range(self.total_ut_steps):
+            residual = None
+            for layer in self.layers[self.start_layer : self.end_layer]:
+                hidden_states, residual = layer(
+                    positions, hidden_states, current_ut, residual
+                )
+            hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name.endswith("scale"):
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if weight_loader == default_weight_loader:
+                    weight_loader(param, loaded_weight)
+                else:
+                    weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class OuroForCausalLM(nn.Module, SupportsLoRA):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+
+        self.quant_config = quant_config
+        self.model = OuroModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index bd525b6780e0d11f7db196af961b18e36252442e..a0fab820720fbd55493ce39f48f18d0203924bfc 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -16,10 +16,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Ovis model."""
+"""PyTorch Ovis model."""
+
 import math
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -28,27 +29,35 @@ from torch.nn.functional import gumbel_softmax, pad, softmax
 from transformers import BatchFeature, PretrainedConfig
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.linear import ReplicatedLinear
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.aimv2 import AIMv2Model
 from vllm.model_executor.models.siglip import SiglipVisionModel
-from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
-                                              init_vllm_registered_model,
-                                              maybe_prefix)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    flatten_bn,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.processors.ovis import OvisProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import merge_multimodal_embeddings
 
 # Cannot find the following number from hf config.
 IMAGE_TOKEN = "<image>"
@@ -75,11 +84,10 @@ def st_argmax(y_soft: torch.Tensor, dim: int):  # straight-through softmax
 
 
 class VisualTokenizer(torch.nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -93,17 +101,20 @@ class VisualTokenizer(torch.nn.Module):
         head_dim = config.vocab_size - len(IMAGE_INDICATOR_IDS)
         self.head = torch.nn.Sequential(
             ReplicatedLinear(
-                config.backbone_config.hidden_size * config.hidden_stride *
-                config.hidden_stride,
+                config.backbone_config.hidden_size
+                * config.hidden_stride
+                * config.hidden_stride,
                 head_dim,
                 bias=False,
                 return_bias=False,
-            ), torch.nn.LayerNorm(head_dim))
+            ),
+            torch.nn.LayerNorm(head_dim),
+        )
 
     def _init_backbone(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         model_type = config.backbone_config.model_type
@@ -121,8 +132,7 @@ class VisualTokenizer(torch.nn.Module):
                 quant_config=quant_config,
                 prefix=prefix,
             )
-        raise ValueError(
-            f"Unsupported visual tokenizer model_type: {model_type}")
+        raise ValueError(f"Unsupported visual tokenizer model_type: {model_type}")
 
     @property
     def dtype(self) -> torch.dtype:
@@ -133,16 +143,17 @@ class VisualTokenizer(torch.nn.Module):
         return next(self.head.parameters()).device
 
     def tokenize(self, logits: torch.Tensor) -> torch.Tensor:
-        if self.config.tokenize_function == 'softmax':
+        if self.config.tokenize_function == "softmax":
             tokens = softmax(logits, dim=-1)
-        elif self.config.tokenize_function == 'gumbel_argmax':
+        elif self.config.tokenize_function == "gumbel_argmax":
             tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True)
-        elif self.config.tokenize_function == 'st_argmax':
+        elif self.config.tokenize_function == "st_argmax":
             tokens = st_argmax(logits, dim=-1)
         else:
             raise ValueError(
-                'Invalid `max_type`, expected softmax or gumbel_argmax '
-                f'or st_argmax, but got {self.config.tokenize_function}')
+                "Invalid `max_type`, expected softmax or gumbel_argmax "
+                f"or st_argmax, but got {self.config.tokenize_function}"
+            )
         return tokens
 
     def encode(self, pixel_values: torch.Tensor) -> torch.Tensor:
@@ -155,29 +166,34 @@ class VisualTokenizer(torch.nn.Module):
         # e.g., for hidden_stride=2, this leads to a token length reduction:
         # 1024 -> 256 for aimv2
         if self.config.hidden_stride > 1:
-            # this `d` maybe different from the above `d``
+            # this `d` maybe different from the above `d`
             n, L, d = features.shape
             sqrt_l = int(L**0.5)
             assert sqrt_l**2 == L, (
-                "The token sequence length should be a perfect square.")
+                "The token sequence length should be a perfect square."
+            )
             features = features.reshape(n, sqrt_l, sqrt_l, d)
-            pl = (self.config.hidden_stride -
-                  (sqrt_l %
-                   self.config.hidden_stride)) % self.config.hidden_stride
+            pl = (
+                self.config.hidden_stride - (sqrt_l % self.config.hidden_stride)
+            ) % self.config.hidden_stride
             features = pad(features, (0, 0, 0, pl, 0, pl), "constant", 0)
             sqrt_l += pl
-            features = features.reshape(n, sqrt_l // self.config.hidden_stride,
-                                        self.config.hidden_stride,
-                                        sqrt_l // self.config.hidden_stride,
-                                        self.config.hidden_stride, d)
+            features = features.reshape(
+                n,
+                sqrt_l // self.config.hidden_stride,
+                self.config.hidden_stride,
+                sqrt_l // self.config.hidden_stride,
+                self.config.hidden_stride,
+                d,
+            )
             # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d]
             features = features.permute(0, 1, 3, 2, 4, 5)
             # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d]
             features = features.flatten(3)
             # [n, sqrt_l/hs*sqrt_l/hs, hs*hs*d]
             features = features.reshape(
-                n, -1,
-                self.config.hidden_stride * self.config.hidden_stride * d)
+                n, -1, self.config.hidden_stride * self.config.hidden_stride * d
+            )
 
         return features
 
@@ -201,29 +217,31 @@ class VisualTokenizer(torch.nn.Module):
 class OvisImagePatchInputs(TensorSchema):
     """
     Dimensions:
-        - batch_patches: Batch size * number of patches
-        - patch_size: patch_size_x * patch_size_y * num_channels
+        - bnp: Batch size * number of images * number of patches
+        - h: Height of each patch
+        - w: Width of each patch
         - patch_indicators: Batch size * (number of patches + 1)
-        - patches_per_image: List of number of total patches for each image
-          in the batch.
+        - bn: Batch size * number of images
     """
+
     type: Literal["image_patches"]
-    flat_data: Annotated[torch.Tensor,
-                         TensorShape("batch_patches", "patch_size")]
+    flat_data: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
     indicator_tokens: Annotated[torch.Tensor, TensorShape("patch_indicators")]
-    patches_per_image: Annotated[list[int],
-                                 TensorShape("num_patches_per_image")]
+    patches_per_image: Annotated[list[int], TensorShape("bn")]
     # This is used to restore the first two dimensions of `flat_data`.
 
 
 class VisualEmbedding(torch.nn.Embedding):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
     def forward(self, visual_tokens: Tensor) -> Tensor:
         if visual_tokens.dtype in [
-                torch.int8, torch.int16, torch.int32, torch.int64, torch.long
+            torch.int8,
+            torch.int16,
+            torch.int32,
+            torch.int64,
+            torch.long,
         ]:
             return super().forward(visual_tokens)
         return torch.matmul(visual_tokens, self.weight)
@@ -238,7 +256,6 @@ class VisualEmbedding(torch.nn.Embedding):
 
 
 class OvisProcessingInfo(BaseProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(
             OvisProcessor,
@@ -255,16 +272,17 @@ class OvisProcessingInfo(BaseProcessingInfo):
         patch_grid_length = math.ceil(image_size / patch_size)
         assert patch_grid_length % hidden_stride == 0, (
             f"patch_grid_length {patch_grid_length} is not divisible by "
-            f"hidden_stride {hidden_stride}")
+            f"hidden_stride {hidden_stride}"
+        )
         # minus 1 for presented image token
-        return (patch_grid_length // hidden_stride)**2 - 1
+        return (patch_grid_length // hidden_stride) ** 2 - 1
 
     def get_image_pad_token(self) -> str:
         hf_text_config = self.get_hf_config().get_text_config()
         text_model_type = hf_text_config.model_type
         return IMAGE_PAD_TOKEN_MAP.get(text_model_type)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -276,7 +294,6 @@ class OvisProcessingInfo(BaseProcessingInfo):
 
 
 class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         return IMAGE_TOKEN * num_images
@@ -285,29 +302,32 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
         }
         return mm_data
 
 
 class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
-
     def image_indicators_to_visual_tokens(
         self,
         image_indicators: list[int],
     ) -> list[int]:
         """
-        Filter image indicators placeholders and convert them to corresponding 
+        Filter image indicators placeholders and convert them to corresponding
         tokens in visual tokenizer.
         For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305]
         should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]
@@ -346,14 +366,13 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
             self.image_indicators_to_visual_tokens(indicator)
             for indicator in image_indicators
         ]
-        processed_outputs["indicator_tokens"] = indicator_tokens
+        processed_outputs["indicator_tokens"] = torch.tensor(indicator_tokens)
         return processed_outputs
 
     def _apply_hf_processor_tokens_only(
         self,
         prompt_tokens: list[int],
     ) -> list[int]:
-
         return prompt_tokens
 
     def _get_mm_fields_config(
@@ -361,9 +380,11 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(pixel_values=MultiModalFieldConfig.batched("image"),
-                    grids=MultiModalFieldConfig.batched("image"),
-                    indicator_tokens=MultiModalFieldConfig.batched("image"))
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            grids=MultiModalFieldConfig.batched("image"),
+            indicator_tokens=MultiModalFieldConfig.batched("image"),
+        )
 
     def _get_prompt_updates(
         self,
@@ -371,7 +392,6 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> list[PromptReplacement]:
-
         def get_replacement_ovis(item_idx: int):
             out_item = out_mm_kwargs["image"][item_idx]
             grid = out_item["grids"].data
@@ -388,13 +408,16 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(OvisMultiModalProcessor,
-                                        info=OvisProcessingInfo,
-                                        dummy_inputs=OvisDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    OvisMultiModalProcessor,
+    info=OvisProcessingInfo,
+    dummy_inputs=OvisDummyInputsBuilder,
+)
 class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -418,17 +441,19 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
         )
 
         self.vte = VisualEmbedding(
-            self.config.visual_tokenizer_config.vocab_size,
-            self.config.hidden_size)
+            self.config.visual_tokenizer_config.vocab_size, self.config.hidden_size
+        )
 
         text_model_type = self.config.get_text_config().model_type
         self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type]
 
         self.make_empty_intermediate_tensors = (
-            self.get_language_model().make_empty_intermediate_tensors)
+            self.get_language_model().make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[OvisImagePatchInputs]:
+        self, **kwargs: object
+    ) -> OvisImagePatchInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         indicator_tokens = kwargs.pop("indicator_tokens", None)
 
@@ -437,62 +462,59 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
 
         if pixel_values is not None and indicator_tokens is not None:
             if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
+                raise ValueError(
+                    f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
+                )
 
             if not isinstance(indicator_tokens, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of indicator_tokens. "
-                                 f"Got type: {type(pixel_values)}")
+                raise ValueError(
+                    "Incorrect type of indicator_tokens. "
+                    f"Got type: {type(pixel_values)}"
+                )
 
-            flat_data = flatten_bn(pixel_values, concat=True)
-            if flat_data.ndim >= 3:
-                flat_data = flat_data.flatten(start_dim=1)
             return OvisImagePatchInputs(
                 type="image_patches",
-                flat_data=flat_data,
-                patches_per_image=[
-                    x.shape[0] for x in flatten_bn(pixel_values)
-                ],
-                indicator_tokens=flatten_bn(flatten_bn(indicator_tokens),
-                                            concat=True),
+                flat_data=flatten_bn(pixel_values, concat=True),
+                patches_per_image=[x.shape[0] for x in pixel_values],
+                indicator_tokens=flatten_bn(indicator_tokens, concat=True),
             )
 
         raise AssertionError("This line should be unreachable.")
 
     def _process_image_input(
-            self, image_input: OvisImagePatchInputs) -> MultiModalEmbeddings:
+        self, image_input: OvisImagePatchInputs
+    ) -> MultiModalEmbeddings:
         image_patches_flat = image_input["flat_data"]
         patches_per_image = image_input["patches_per_image"]
         indicator_tokens = image_input["indicator_tokens"]
 
         indicator_per_image = list(
-            map(lambda x: x + 1 if x > 1 else x + 2, patches_per_image))
+            map(lambda x: x + 1 if x > 1 else x + 2, patches_per_image)
+        )
 
         target_dtype = self.visual_tokenizer.dtype
-        visual_tokens = self.visual_tokenizer(
-            image_patches_flat.to(target_dtype))
+        visual_tokens = self.visual_tokenizer(image_patches_flat.to(target_dtype))
         visual_embeds = self.vte(visual_tokens)  # 1:1 numeric eq.
 
         indicator_embeds = self.vte(indicator_tokens)
-        indicator_embeds_per_image = indicator_embeds.split(
-            indicator_per_image)
+        indicator_embeds_per_image = indicator_embeds.split(indicator_per_image)
 
         visual_embeds_per_image = visual_embeds.split(patches_per_image, dim=0)
         vision_embeddings = []
-        for indicator, visual in zip(indicator_embeds_per_image,
-                                     visual_embeds_per_image):
+        for indicator, visual in zip(
+            indicator_embeds_per_image, visual_embeds_per_image
+        ):
             vision_embeddings_per_image = []
             for i in range(visual.shape[0]):
                 vision_embeddings_per_image.append(
-                    torch.cat([indicator[i:i + 1], visual[i]], dim=0))
-            vision_embeddings_per_image.append(indicator[i + 1:])
-            vision_embeddings.append(
-                torch.cat(vision_embeddings_per_image, dim=0))
+                    torch.cat([indicator[i : i + 1], visual[i]], dim=0)
+                )
+            vision_embeddings_per_image.append(indicator[i + 1 :])
+            vision_embeddings.append(torch.cat(vision_embeddings_per_image, dim=0))
 
         return tuple(vision_embeddings)
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
@@ -501,38 +523,17 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
 
         return image_features
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.image_pad_token_id)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         # up until here we have an inputs_embeds 100% numerical identity
         # between the OG HF Transformers implementation and ours
         hidden_states = self.llm(
@@ -546,12 +547,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.llm.compute_logits(hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index f18e38ce154d2664fdb8065ec21bb9a208c8e13b..85f37cfea10b1585c47b411c14e55164885aa963 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -1,33 +1,44 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-""" PyTorch Ovis model."""
+"""PyTorch Ovis model."""
+
 from collections.abc import Iterable, Mapping
 from functools import partial
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
 from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
 
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.linear import ReplicatedLinear
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.model_executor.models.ovis import (OvisImagePatchInputs,
-                                             VisualEmbedding)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.models.ovis import VisualEmbedding
 from vllm.model_executor.models.siglip2navit import Siglip2NavitModel
-from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
-                                              init_vllm_registered_model,
-                                              maybe_prefix)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    flatten_bn,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 
@@ -49,34 +60,38 @@ IMAGE_PAD_TOKEN_ID_MAP = {
 }
 
 
-class OvisVideoPatchInputs(TypedDict):
-    type: Literal["video_patches"]
-    flat_data: torch.Tensor
+class Ovis2_5ImagePatchInputs(TensorSchema):
     """
-    Shape:
-    `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
+    Dimensions:
+        - bnp: Batch size * number of images * number of patches
+        - patch_size: patch_size_x * patch_size_y * num_channels
+        - patch_indicators: Batch size * (number of patches + 1)
+        - bn: Batch size * number of images
     """
 
-    indicator_tokens: torch.Tensor
-    """
-    Shape:
-    `(batch_size * (num_patches + 1))`
-    """
+    type: Literal["image_patches"]
+    flat_data: Annotated[torch.Tensor, TensorShape("bnp", "patch_size")]
+    indicator_tokens: Annotated[torch.Tensor, TensorShape("patch_indicators")]
+    patches_per_item: Annotated[list[int], TensorShape("bn")]
+    grids: Annotated[torch.Tensor, TensorShape("bn", 3)]
+    # This is used to restore the first two dimensions of `flat_data`.
+
 
-    patches_per_image: list[int]
+class Ovis2_5VideoPatchInputs(TensorSchema):
     """
-    List of number of total patches for each frame in the video.
-    This is used to restore the first two dimensions of `flat_data`.
+    Dimensions:
+        - bnp: Batch size * number of videos * number of patches
+        - patch_size: patch_size_x * patch_size_y * num_channels
+        - patch_indicators: Batch size * (number of patches + 1)
+        - bn: Batch size * number of videos
     """
 
-
-def _ovis2_5_field_config():
-    return dict(pixel_values=MultiModalFieldConfig.batched("image"),
-                grids=MultiModalFieldConfig.batched("image"),
-                indicator_tokens=MultiModalFieldConfig.batched("image"),
-                video_pixel_values=MultiModalFieldConfig.batched("video"),
-                video_indicator_tokens=MultiModalFieldConfig.batched("video"),
-                video_grids=MultiModalFieldConfig.batched("video"))
+    type: Literal["video_patches"]
+    flat_data: Annotated[torch.Tensor, TensorShape("bnp", "patch_size")]
+    indicator_tokens: Annotated[torch.Tensor, TensorShape("patch_indicators")]
+    patches_per_item: Annotated[list[int], TensorShape("bn")]
+    grids: Annotated[torch.Tensor, TensorShape("bn", 3)]
+    # This is used to restore the first two dimensions of `flat_data`.
 
 
 class VisualTokenizer(torch.nn.Module):
@@ -88,9 +103,10 @@ class VisualTokenizer(torch.nn.Module):
         self,
         config: PretrainedConfig,
         visual_vocab_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
@@ -99,6 +115,7 @@ class VisualTokenizer(torch.nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.vit",
             use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
         )
         # reserved tokens for INDICATOR_IDS
         head_dim = visual_vocab_size - len(INDICATOR_IDS)
@@ -108,23 +125,28 @@ class VisualTokenizer(torch.nn.Module):
                 head_dim,
                 bias=False,
                 return_bias=False,
-            ), torch.nn.LayerNorm(head_dim))
+            ),
+            torch.nn.LayerNorm(head_dim),
+        )
 
     def _init_backbone(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         model_type = config.model_type
         if model_type == "siglip2_navit":
-            return Siglip2NavitModel(config=config,
-                                     quant_config=quant_config,
-                                     prefix=prefix,
-                                     use_data_parallel=use_data_parallel)
-        raise ValueError(
-            f"Unsupported visual tokenizer model_type: {model_type}")
+            return Siglip2NavitModel(
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                use_data_parallel=use_data_parallel,
+                attn_backend_override=attn_backend_override,
+            )
+        raise ValueError(f"Unsupported visual tokenizer model_type: {model_type}")
 
     @property
     def dtype(self) -> torch.dtype:
@@ -135,22 +157,22 @@ class VisualTokenizer(torch.nn.Module):
         return next(self.head.parameters()).device
 
     def tokenize(self, logits: torch.Tensor) -> torch.Tensor:
-        tokens = torch.softmax(logits, dim=-1,
-                               dtype=torch.float32).to(logits.dtype)
+        tokens = torch.softmax(logits, dim=-1, dtype=torch.float32).to(logits.dtype)
         return tokens
 
-    def encode(self, pixel_values: torch.Tensor,
-               grid_thws: torch.Tensor) -> torch.Tensor:
+    def encode(
+        self, pixel_values: torch.Tensor, grid_thws: torch.Tensor
+    ) -> torch.Tensor:
         features = self.vit(pixel_values, grid_thws)
         # refer to qwen2.5-vl patchmerger
         seq_len, _ = features.shape
-        features = features.reshape(seq_len // (self.config.hidden_stride**2),
-                                    -1)
+        features = features.reshape(seq_len // (self.config.hidden_stride**2), -1)
 
         return features
 
-    def forward(self, pixel_values: torch.Tensor,
-                grid_thws: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, pixel_values: torch.Tensor, grid_thws: torch.Tensor
+    ) -> torch.Tensor:
         features = self.encode(pixel_values, grid_thws)
         logits = self.head(features)
         tokens = self.tokenize(logits)
@@ -167,7 +189,6 @@ class VisualTokenizer(torch.nn.Module):
 
 
 class Ovis2_5ProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config()
 
@@ -189,7 +210,7 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self) -> BaseImageProcessor:
         return self.get_hf_processor().image_processor  # type: ignore
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -220,8 +241,9 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
 
     def get_max_image_tokens(self) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
-        return self.get_num_image_tokens(image_width=target_width,
-                                         image_height=target_height)
+        return self.get_num_image_tokens(
+            image_width=target_width, image_height=target_height
+        )
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
@@ -247,8 +269,7 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
         max_images = mm_counts.get("image", 0)
         max_videos = mm_counts.get("video", 0)
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens)
         max_frames_per_video = max_total_frames // max(max_videos, 1)
         return max(max_frames_per_video, 1)
 
@@ -258,11 +279,11 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Optional[BaseImageProcessor],
+        image_processor: BaseImageProcessor | None,
     ) -> int:
-        num_video_tokens = self.get_num_image_tokens(image_width=image_width,
-                                                     image_height=image_height,
-                                                     num_frames=num_frames)
+        num_video_tokens = self.get_num_image_tokens(
+            image_width=image_width, image_height=image_height, num_frames=num_frames
+        )
         return num_video_tokens
 
     def get_max_video_tokens(
@@ -274,14 +295,12 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_num_frames_with_most_features(
-                seq_len, mm_counts),
+            num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
             image_processor=None,
         )
 
 
 class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -291,46 +310,52 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-            self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
+
         mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
-            )
+                overrides=video_overrides,
+            ),
         }
         return mm_data
 
 
-class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]
-                                 ):
-
+class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]):
     def visual_indicators_to_visual_tokens(
         self,
         visual_indicators: list[int],
     ) -> list[int]:
         """
-        Filter image indicators placeholders and convert them to corresponding 
+        Filter image indicators placeholders and convert them to corresponding
         tokens in visual tokenizer.
         """
         hf_config = self.info.get_hf_config()
         vte_vocab_size = hf_config.visual_vocab_size
         return [
             vte_vocab_size - len(INDICATOR_IDS) + abs(x + 300) - 1
-            for x in visual_indicators if x < -300
+            for x in visual_indicators
+            if x < -300
         ]
 
     def _call_hf_processor(
@@ -363,7 +388,7 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]
                 self.visual_indicators_to_visual_tokens(indicator)
                 for indicator in visual_indicators
             ]
-            processed_outputs["video_indicator_tokens"] = indicator_tokens
+            processed_outputs["video_indicator_tokens"] = torch.tensor(indicator_tokens)
         if "images" in mm_data:
             visual_indicators = [
                 hf_processor.construct_visual_indicators((1, 1, 1), False)
@@ -374,14 +399,13 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]
                 for indicator in visual_indicators
             ]
 
-            processed_outputs["indicator_tokens"] = indicator_tokens
+            processed_outputs["indicator_tokens"] = torch.tensor(indicator_tokens)
         return processed_outputs
 
     def _apply_hf_processor_tokens_only(
         self,
         prompt_tokens: list[int],
     ) -> list[int]:
-
         return prompt_tokens
 
     def _get_mm_fields_config(
@@ -389,7 +413,14 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        return _ovis2_5_field_config()
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            grids=MultiModalFieldConfig.batched("image"),
+            indicator_tokens=MultiModalFieldConfig.batched("image"),
+            video_pixel_values=MultiModalFieldConfig.batched("video"),
+            video_indicator_tokens=MultiModalFieldConfig.batched("video"),
+            video_grids=MultiModalFieldConfig.batched("video"),
+        )
 
     def _get_prompt_updates(
         self,
@@ -397,7 +428,6 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> list[PromptReplacement]:
-
         def get_replacement_ovis(item_idx, modality: str):
             if modality == "image":
                 out_item = out_mm_kwargs["image"][item_idx]
@@ -406,26 +436,33 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]
                 out_item = out_mm_kwargs["video"][item_idx]
                 grid = out_item["video_grids"].data
             hf_processor = self.info.get_hf_processor()
-            return hf_processor.construct_visual_placeholders(grid[0], )
+            return hf_processor.construct_visual_placeholders(
+                grid[0],
+            )
 
         return [
             PromptReplacement(
                 modality=modality,
                 target=IMAGE_TOKEN if modality == "image" else VIDEO_TOKEN,
                 replacement=partial(get_replacement_ovis, modality=modality),
-            ) for modality in ("image", "video")
+            )
+            for modality in ("image", "video")
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(Ovis2_5MultiModalProcessor,
-                                        info=Ovis2_5ProcessingInfo,
-                                        dummy_inputs=Ovis2_5DummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    Ovis2_5MultiModalProcessor,
+    info=Ovis2_5ProcessingInfo,
+    dummy_inputs=Ovis2_5DummyInputsBuilder,
+)
 class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
 
         self.config: PretrainedConfig = config
         self.llm = init_vllm_registered_model(
@@ -433,24 +470,31 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
             prefix=maybe_prefix(prefix, "llm"),
         )
 
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
         self.visual_tokenizer = VisualTokenizer(
             config=config.vit_config,
             visual_vocab_size=config.visual_vocab_size,
             quant_config=quant_config,
             prefix=f"{prefix}.visual_tokenizer",
+            attn_backend_override=attn_backend_override,
         )
 
-        self.vte = VisualEmbedding(config.visual_vocab_size,
-                                   config.hidden_size)
+        self.vte = VisualEmbedding(config.visual_vocab_size, config.hidden_size)
 
         text_model_type = self.config.get_text_config().model_type
         self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type]
 
         self.make_empty_intermediate_tensors = (
-            self.get_language_model().make_empty_intermediate_tensors)
+            self.get_language_model().make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[OvisImagePatchInputs]:
+        self, **kwargs: object
+    ) -> Ovis2_5ImagePatchInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         indicator_tokens = kwargs.pop("indicator_tokens", None)
         grids = kwargs.pop("grids", None)
@@ -459,29 +503,32 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
 
         if pixel_values is not None and indicator_tokens is not None:
             if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
+                raise ValueError(
+                    f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
+                )
 
             if not isinstance(indicator_tokens, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of indicator_tokens. "
-                                 f"Got type: {type(indicator_tokens)}")
+                raise ValueError(
+                    "Incorrect type of indicator_tokens. "
+                    f"Got type: {type(indicator_tokens)}"
+                )
 
-            return OvisImagePatchInputs(
+            return Ovis2_5ImagePatchInputs(
                 type="image_patches",
-                flat_data=flatten_bn(flatten_bn(pixel_values), concat=True),
-                patches_per_image=[
+                flat_data=flatten_bn(pixel_values, concat=True),
+                patches_per_item=[
                     x.shape[0] // (self.config.vit_config.hidden_stride**2)
-                    for x in flatten_bn(pixel_values)
+                    for x in pixel_values
                 ],
-                indicator_tokens=flatten_bn(flatten_bn(indicator_tokens),
-                                            concat=True),
-                grids=flatten_bn(flatten_bn(grids), concat=True),
+                indicator_tokens=flatten_bn(indicator_tokens, concat=True),
+                grids=flatten_bn(grids, concat=True),
             )
 
         raise AssertionError("This line should be unreachable.")
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[OvisImagePatchInputs]:
+        self, **kwargs: object
+    ) -> Ovis2_5VideoPatchInputs | None:
         pixel_values = kwargs.pop("video_pixel_values", None)
         indicator_tokens = kwargs.pop("video_indicator_tokens", None)
         grids = kwargs.pop("video_grids", None)
@@ -490,60 +537,64 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
 
         if pixel_values is not None and indicator_tokens is not None:
             if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
+                raise ValueError(
+                    f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
+                )
 
             if not isinstance(indicator_tokens, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of indicator_tokens. "
-                                 f"Got type: {type(indicator_tokens)}")
+                raise ValueError(
+                    "Incorrect type of indicator_tokens. "
+                    f"Got type: {type(indicator_tokens)}"
+                )
 
-            return OvisVideoPatchInputs(
+            return Ovis2_5VideoPatchInputs(
                 type="video_patches",
-                flat_data=flatten_bn(flatten_bn(pixel_values), concat=True),
-                patches_per_image=[
+                flat_data=flatten_bn(pixel_values, concat=True),
+                patches_per_item=[
                     x.shape[0] // (self.config.vit_config.hidden_stride**2)
-                    for x in flatten_bn(pixel_values)
+                    for x in pixel_values
                 ],
-                indicator_tokens=flatten_bn(flatten_bn(indicator_tokens),
-                                            concat=True),
-                grids=flatten_bn(flatten_bn(grids), concat=True),
+                indicator_tokens=flatten_bn(indicator_tokens, concat=True),
+                grids=flatten_bn(grids, concat=True),
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def _process_image_input(
-        self, image_input: Union[OvisImagePatchInputs, OvisVideoPatchInputs]
+    def _process_visual_input(
+        self, visual_input: Ovis2_5ImagePatchInputs | Ovis2_5VideoPatchInputs
     ) -> MultiModalEmbeddings:
-        image_patches_flat = image_input["flat_data"]
-        patches_per_image = image_input["patches_per_image"]
-        indicator_tokens = image_input["indicator_tokens"]
-        grid_thws = image_input["grids"]
+        image_patches_flat = visual_input["flat_data"]
+        patches_per_image = visual_input["patches_per_item"]
+        indicator_tokens = visual_input["indicator_tokens"]
+        grid_thws = visual_input["grids"]
 
         indicator_per_image = list(
-            map(lambda x: 2 if x > 1 else x + 2, patches_per_image))
+            map(lambda x: 2 if x > 1 else x + 2, patches_per_image)
+        )
 
         target_dtype = self.visual_tokenizer.dtype
         visual_tokens = self.visual_tokenizer(
-            image_patches_flat.to(target_dtype), grid_thws)
+            image_patches_flat.to(target_dtype), grid_thws
+        )
 
         visual_embeds = self.vte(visual_tokens)  # 1:1 numeric eq.
         indicator_embeds = self.vte(indicator_tokens)
 
         visual_embeds_per_image = visual_embeds.split(patches_per_image, dim=0)
-        indicator_embeds_per_image = indicator_embeds.split(
-            indicator_per_image)
+        indicator_embeds_per_image = indicator_embeds.split(indicator_per_image)
 
         vision_embeddings = []
-        for indicator, visual in zip(indicator_embeds_per_image,
-                                     visual_embeds_per_image):
+        for indicator, visual in zip(
+            indicator_embeds_per_image, visual_embeds_per_image
+        ):
             vision_embeddings_per_image = []
             visual = visual.unsqueeze(0)
             for i in range(visual.shape[0]):
                 vision_embeddings_per_image.append(
-                    torch.cat([indicator[i:i + 1], visual[i]], dim=0))
-            vision_embeddings_per_image.append(indicator[i + 1:])
-            vision_embeddings.append(
-                torch.cat(vision_embeddings_per_image, dim=0))
+                    torch.cat([indicator[i : i + 1], visual[i]], dim=0)
+                )
+            vision_embeddings_per_image.append(indicator[i + 1 :])
+            vision_embeddings.append(torch.cat(vision_embeddings_per_image, dim=0))
         return tuple(vision_embeddings)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -552,20 +603,21 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values", "indicator_tokens",
-                             "grids") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("video_pixel_values", "video_indicator_tokens",
-                             "video_grids") and "videos" not in modalities:
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values", "indicator_tokens", "grids")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if (
+                input_key
+                in ("video_pixel_values", "video_indicator_tokens", "video_grids")
+                and "videos" not in modalities
+            ):
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
@@ -576,46 +628,26 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_visual_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
-                video_embeddings = self._process_image_input(video_input)
-                multimodal_embeddings += video_embeddings
+                video_embeddings = self._process_visual_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            tmp = torch.concat(multimodal_embeddings, dim=0)
-            inputs_embeds[input_ids == self.image_pad_token_id] = tmp
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         # up until here we have a inputs_embeds 100% numerical identity
         # between the OG HF Transformers implementation and ours
         hidden_states = self.llm(
@@ -629,12 +661,11 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.llm.compute_logits(hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ef6470070d1813470ae838294b4d331ffb68f35
--- /dev/null
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -0,0 +1,1407 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from typing import Annotated, Literal
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import BatchFeature, PretrainedConfig
+from transformers.activations import GELUActivation
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPooling,
+)
+from transformers.utils import torch_int
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import (
+    check_upstream_fa_availability,
+    maybe_get_vit_flash_attn_backend,
+)
+from vllm.attention.ops.vit_attn_wrappers import (
+    vit_flash_attn_wrapper,
+    vit_xformers_attn_wrapper,
+)
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding.common import (
+    dispatch_rotary_emb_function,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+)
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .ernie45 import Ernie4_5ForCausalLM
+from .interfaces import MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    maybe_prefix,
+)
+from .vision import get_vit_attn_backend
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 28 * 28 * 130,
+    max_pixels: int = 28 * 28 * 1280,
+):
+    """Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+
+    """
+
+    if height < factor:
+        width = round((width * factor) / height)
+        height = factor
+
+    if width < factor:
+        height = round((height * factor) / width)
+        width = factor
+
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 200, "
+            f"got {max(height, width) / min(height, width)}"
+        )
+    h_bar = round(height / factor) * factor
+    w_bar = round(width / factor) * factor
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = math.floor(height / beta / factor) * factor
+        w_bar = math.floor(width / beta / factor) * factor
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+    return h_bar, w_bar
+
+
+def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+    if not interleaved:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+    x1, x2 = x[..., ::2], x[..., 1::2]
+    return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
+
+
+def apply_rotary_emb_torch(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+) -> torch.Tensor:
+    """
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1]
+    cos = repeat(
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    sin = repeat(
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    return torch.cat(
+        [
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
+        ],
+        dim=-1,
+    )
+
+
+def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    rotary_emb_function = dispatch_rotary_emb_function(default=apply_rotary_emb_torch)
+    t_ = t.float()
+    cos = freqs.cos()
+    sin = freqs.sin()
+    output = rotary_emb_function(t_, cos, sin).type_as(t)
+    return output
+
+
+class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(**kwargs)
+
+    def get_image_processor(self, **kwargs: object):
+        return self.get_hf_processor(**kwargs).image_processor
+
+    def get_supported_mm_limits(self):
+        return {"image": None}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        image_processor,
+    ) -> int:
+        if image_processor is None:
+            image_processor = self.get_image_processor()
+
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+        merge_size = vision_config.spatial_merge_size
+        resized_height, resized_width = smart_resize(
+            height=image_height,
+            width=image_width,
+            factor=patch_size * merge_size,
+            min_pixels=image_processor.min_pixels,
+            max_pixels=image_processor.max_pixels,
+        )
+        preprocessed_size = ImageSize(width=resized_width, height=resized_height)
+
+        grid_t = 1
+        grid_h = preprocessed_size.height // patch_size
+        grid_w = preprocessed_size.width // patch_size
+
+        num_patches = grid_t * grid_h * grid_w
+        num_image_tokens = num_patches // (merge_size**2)
+
+        return num_image_tokens
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
+
+        # See `smart_resize` for the calculation of the image size.
+        merge_size = hf_config.vision_config.spatial_merge_size
+        patch_size = hf_config.vision_config.patch_size
+        factor = merge_size * patch_size
+        max_num_tokens = self.get_image_processor().max_pixels // (factor**2)
+        # Find factors of max_num_tokens close to its square root
+        # to create a dummy image with a reasonable aspect ratio.
+        h_patches = int(math.sqrt(max_num_tokens))
+        max_num_tokens -= max_num_tokens % h_patches
+        w_patches = max_num_tokens // h_patches
+        return ImageSize(height=h_patches * factor, width=w_patches * factor)
+
+
+class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        max_image_size = self.info.get_image_size_with_most_features()
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=max_image_size.width,
+                height=max_image_size.height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
+        }
+
+
+class PaddleOCRVLMultiModalProcessor(
+    BaseMultiModalProcessor[PaddleOCRVLProcessingInfo]
+):
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            processed_outputs = self.info.ctx.call_hf_processor(
+                self.info.get_hf_processor(**mm_kwargs),
+                dict(text=prompt, **mm_data),
+                dict(**mm_kwargs, **tok_kwargs),
+            )
+            num_patches_per_image = processed_outputs["image_grid_thw"].prod(-1)
+            processed_outputs["pixel_values"] = processed_outputs["pixel_values"].split(
+                num_patches_per_image.tolist()
+            )
+        else:
+            tokenizer = self.info.get_tokenizer()
+            processed_outputs = tokenizer(
+                prompt, add_special_tokens=True, return_tensors="pt"
+            )
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_grid_thw=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
+        hf_config = self.info.get_hf_config()
+        image_token_id = hf_config.image_token_id
+
+        def get_replacement(item_idx: int, image_processor):
+            images = mm_items.get_items("image", ImageProcessorItems)
+
+            image_size = images.get_image_size(item_idx)
+            num_image_tokens = self.info.get_num_image_tokens(
+                image_width=image_size.width,
+                image_height=image_size.height,
+                image_processor=image_processor,
+            )
+
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=partial(get_replacement, image_processor=image_processor),
+            ),
+        ]
+
+
+class Projector(nn.Module):
+    def __init__(
+        self,
+        text_config: PretrainedConfig,
+        vision_config: PretrainedConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.merge_kernel_size = (2, 2)
+
+        self.hidden_size = (
+            self.vision_config.hidden_size
+            * self.merge_kernel_size[0]
+            * self.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(self.vision_config.hidden_size, eps=1e-05)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = GELUActivation()
+        self.linear_2 = nn.Linear(
+            self.hidden_size, self.text_config.hidden_size, bias=True
+        )
+
+    def forward(
+        self,
+        image_features: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        m1, m2 = self.merge_kernel_size
+        if isinstance(image_features, (list, tuple)):
+            processed_features = list()
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
+                image_feature = self.pre_norm(image_feature)
+                t, h, w = image_grid
+
+                image_feature = rearrange(
+                    image_feature,
+                    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+                    t=t,
+                    h=h // m1,
+                    p1=m1,
+                    w=w // m2,
+                    p2=m2,
+                )
+                hidden_states = self.linear_1(image_feature)
+                hidden_states = self.act(hidden_states)
+                hidden_states = self.linear_2(hidden_states)
+                processed_features.append(hidden_states)
+
+            return processed_features
+
+        dims = image_features.shape[:-1]
+        dim = image_features.shape[-1]
+        image_features = image_features.view(np.prod(dims), dim)
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states.view(*dims, -1)
+
+
+class PaddleOCRImagePixelInputs(TensorSchema):
+    type: Literal["pixel_values"]
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("bn", "p", 3, "patch_size", "patch_size", dynamic_dims={"p"}),
+    ]
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("bn", 3),
+    ]
+
+
+class SiglipVisionEmbeddings(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.cache_position_embedding = dict()
+        self.cache_position_count = dict()
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
+
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def interpolate_pos_encoding(
+        self,
+        embeddings: torch.Tensor,
+        height: int,
+        width: int,
+        is_after_patchify: bool = False,
+    ) -> torch.Tensor:
+        num_positions = self.position_embedding.weight.shape[0]
+
+        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+        dim = embeddings.shape[-1]
+
+        if is_after_patchify:
+            new_height = height
+            new_width = width
+        else:
+            new_height = height // self.patch_size
+            new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
+        )
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bilinear",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return patch_pos_embed
+
+    def fetch_position_embedding_lfu_cache(
+        self, embeddings: torch.Tensor, h: int, w: int, max_cache: int = 20
+    ):
+        grid = (h, w)
+        if grid in self.cache_position_embedding:
+            self.cache_position_count[grid] += 1
+            return self.cache_position_embedding[grid]
+
+        if len(self.cache_position_embedding) >= max_cache:
+            min_hit_grid = min(
+                self.cache_position_count,
+                key=self.cache_position_count.get,
+            )
+            self.cache_position_count.pop(min_hit_grid)
+            self.cache_position_embedding.pop(min_hit_grid)
+
+        position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
+        self.cache_position_count[grid] = 1
+        self.cache_position_embedding[grid] = position_embedding
+        return position_embedding
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        position_ids: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        interpolate_pos_encoding=False,
+    ) -> torch.Tensor:
+        if pixel_values.dim() == 4:
+            pixel_values = pixel_values.unsqueeze(0)
+        if pixel_values.dim() == 5:
+            if position_ids is None:
+                raise ValueError(
+                    "position_ids cannot be None when pixel_values.dim() is 5."
+                )
+            (
+                batch_size,
+                squence_len,
+                channel,
+                height,
+                width,
+            ) = pixel_values.shape
+            target_dtype = self.patch_embedding.weight.dtype
+            pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+            embeddings = patch_embeds.flatten(-2).squeeze(-1)
+
+            if interpolate_pos_encoding and image_grid_thw is not None:
+                start = 0
+                tmp_embeddings = list()
+                for image_grid in image_grid_thw:
+                    t, h, w = image_grid
+                    end = start + t * h * w
+                    image_embeddings = embeddings[start:end, :]
+                    position_embedding = (
+                        self.interpolate_pos_encoding(image_embeddings, h, w, True)
+                        .squeeze(0)
+                        .repeat(t, 1)
+                    )
+                    image_embeddings = image_embeddings + position_embedding
+                    tmp_embeddings.append(image_embeddings)
+                    start = end
+                embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0)
+            else:
+                embeddings = embeddings + self.packing_position_embedding(position_ids)
+            return embeddings
+        else:
+            raise ValueError(
+                "Unsupported pixel_values dimension:"
+                f" {pixel_values.dim()}. Expected 4 or 5."
+            )
+
+
+def all_gather_interleave(local_tensor: torch.Tensor, hidden_size: int, tp_size: int):
+    """All-gather the input tensor interleavely across model parallel group."""
+    import torch.distributed as dist
+
+    gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)]
+    dist.all_gather(
+        gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group
+    )
+
+    gathered_tensors_split = [
+        torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors
+    ]
+    ordered_tensors = [
+        tensor for pair in zip(*gathered_tensors_split) for tensor in pair
+    ]
+    result_tensor = torch.cat(ordered_tensors, dim=-1)
+    return result_tensor
+
+
+class SiglipAttention(nn.Module):
+    """SigLIP vision attention adapted from Qwen2.5-VisionAttention."""
+
+    def __init__(
+        self,
+        *,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA,
+        attn_backend_override: AttentionBackendEnum | None = None,
+        use_upstream_fa: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, self.tp_size
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=self.hidden_size_per_attention_head,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        self.attn_backend = attn_backend
+        self.use_upstream_fa = use_upstream_fa
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
+        self.is_flash_attn_backend = self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }
+
+    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        seq_len, bs, _ = qkv.shape
+        if self.tp_size > 1:
+            qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size)
+
+        q, k, v = qkv.chunk(3, dim=2)
+
+        if self.tp_size > 1:
+            splitter = partial(
+                dist_utils.split_tensor_along_last_dim, num_partitions=self.tp_size
+            )
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+            v = splitter(v)[self.tp_rank]
+
+        new_shape = (
+            seq_len,
+            bs,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
+        q, k, v = (x.view(*new_shape) for x in (q, k, v))
+        return q, k, v
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor | None,
+        max_seqlen: torch.Tensor | None,
+        seqlens: torch.Tensor | None,
+    ) -> torch.Tensor:
+        batch_size, _, _ = hidden_states.shape
+
+        x = rearrange(hidden_states, "b s d -> s b d")
+        x, _ = self.qkv_proj(x)
+        q, k, v = self.split_qkv(x)
+        q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v))
+
+        if rotary_pos_emb is not None:
+            qk_concat = torch.cat([q, k], dim=0)
+            qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb)
+            q, k = torch.chunk(qk_rotated, 2, dim=0)
+
+        if self.is_flash_attn_backend:
+            if max_seqlen is None:
+                raise ValueError("Flash attention backend requires max_seqlen.")
+            context_layer = vit_flash_attn_wrapper(
+                q,
+                k,
+                v,
+                cu_seqlens,
+                max_seqlen,
+                batch_size,
+                self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA,
+                self.use_upstream_fa,
+            )
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
+            outputs = []
+            for i in range(1, len(cu_seqlens)):
+                start_idx = cu_seqlens[i - 1]
+                end_idx = cu_seqlens[i]
+                q_i = q[:, start_idx:end_idx]
+                k_i = k[:, start_idx:end_idx]
+                v_i = v[:, start_idx:end_idx]
+                q_i, k_i, v_i = (
+                    rearrange(tensor, "b s h d -> b h s d")
+                    for tensor in (q_i, k_i, v_i)
+                )
+                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+                output_i = rearrange(output_i, "b h s d -> b s h d")
+                outputs.append(output_i)
+            context_layer = torch.cat(outputs, dim=1)
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
+            if seqlens is None:
+                raise ValueError("xFormers attention backend requires seqlens tensor.")
+            context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
+        else:
+            raise RuntimeError(
+                f"PaddleOCR-VL does not support {self.attn_backend} backend now."
+            )
+
+        output, _ = self.out_proj(context_layer)
+        output = rearrange(output, "s b d -> b s d")
+        return output
+
+
+class SigLIPRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.rope_init()
+
+    def rope_init(self):
+        inv_freq = 1.0 / (
+            self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(
+            seqlen,
+            device=self.inv_freq.device,
+            dtype=self.inv_freq.dtype,
+        )
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class SiglipMLP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        # Special handling for BNB and torchao quantization
+        if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
+            quantizable = True
+        else:
+            # For other quantization, we require the hidden size to be a
+            # multiple of 64
+            quantizable = (
+                config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
+            )
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config if quantizable else None,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config if quantizable else None,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class SiglipEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        *,
+        attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA,
+        attn_backend_override: AttentionBackendEnum | None = None,
+        use_upstream_fa: bool = False,
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = SiglipAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attn_backend=attn_backend,
+            attn_backend_override=attn_backend_override,
+            use_upstream_fa=use_upstream_fa,
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor | None,
+        max_seqlen: torch.Tensor | None,
+        seqlens: torch.Tensor | None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            max_seqlen=max_seqlen,
+            seqlens=seqlens,
+        )
+
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class SiglipEncoder(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
+    ):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        num_heads = config.num_attention_heads
+        head_dim = embed_dim // num_heads
+        self.attn_backend = get_vit_attn_backend(
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        self.use_upstream_fa = False
+        if self.attn_backend not in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        } and check_upstream_fa_availability(torch.get_default_dtype()):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
+            self.use_upstream_fa = True
+        if self.attn_backend not in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }:
+            raise RuntimeError(
+                f"PaddleOCR-VL does not support {self.attn_backend} backend now."
+            )
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    attn_backend=self.attn_backend,
+                    attn_backend_override=attn_backend_override,
+                    use_upstream_fa=self.use_upstream_fa,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
+
+    @staticmethod
+    def flatten_list(image_grid_thw):
+        tmp_image_grid_thw = list()
+        for image_grid in image_grid_thw:
+            if isinstance(image_grid, list):
+                tmp_image_grid_thw.extend(image_grid)
+            else:
+                tmp_image_grid_thw.append(image_grid)
+        return tmp_image_grid_thw
+
+    def forward(
+        self,
+        inputs_embeds,
+        cu_seqlens: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        device = inputs_embeds.device
+        hidden_states = inputs_embeds
+
+        flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+
+        if width_position_ids is None or height_position_ids is None:
+            split_hids = list()
+            split_wids = list()
+            for t, h, w in flatten_image_grid_thw:
+                image_pids = torch.arange(t * h * w, device=device) % (h * w)
+                sample_hids = image_pids // w
+                sample_wids = image_pids % w
+                split_hids.append(sample_hids)
+                split_wids.append(sample_wids)
+            width_position_ids = torch.concat(split_wids, dim=0)
+            height_position_ids = torch.concat(split_hids, dim=0)
+
+        pids = torch.stack(
+            [height_position_ids, width_position_ids],
+            dim=-1,
+        )
+        max_grid_size = pids.max() + 1
+        rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rope_emb_max_grid[pids].flatten(1)
+
+        if cu_seqlens is None:
+            raise ValueError("cu_seqlens cannot be None for SiglipEncoder.")
+        if not isinstance(cu_seqlens, torch.Tensor):
+            cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+        else:
+            cu_seqlens = cu_seqlens.to(device=device)
+
+        max_seqlen = None
+        seqlens = None
+        if self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }:
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
+            seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb=rotary_pos_emb,
+                max_seqlen=max_seqlen,
+                seqlens=seqlens,
+            )
+        return hidden_states
+
+
+class SiglipVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
+    ):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+        self.encoder = SiglipEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+            attn_backend_override=attn_backend_override,
+        )
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        interpolate_pos_encoding: bool | None = False,
+        position_ids: torch.Tensor | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+        cu_seqlens: torch.Tensor | None = None,
+        image_grid_thw: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(
+            pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            position_ids=position_ids,
+            image_grid_thw=image_grid_thw,
+        )
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            cu_seqlens=cu_seqlens,
+            image_grid_thw=image_grid_thw,
+            height_position_ids=height_position_ids,
+            width_position_ids=width_position_ids,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+        return last_hidden_state
+
+
+class SiglipVisionModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
+    ):
+        super().__init__()
+
+        self.vision_model = SiglipVisionTransformer(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.vision_model",
+            attn_backend_override=attn_backend_override,
+        )
+        self.quant_config = quant_config
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.vision_model.embeddings.patch_embedding.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.embeddings.patch_embedding.weight.device
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    def forward(
+        self,
+        pixel_values,
+        interpolate_pos_encoding: bool = False,
+        position_ids: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        cu_seqlens: torch.Tensor | None = None,
+    ) -> BaseModelOutputWithPooling:
+        return self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            position_ids=position_ids,
+            image_grid_thw=image_grid_thw,
+            cu_seqlens=cu_seqlens,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "head.attention" in name or "head.layernorm" in name:
+                continue
+            if "head.mlp" in name or "head.probe" in name:
+                continue
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                param = params_dict[scale_name]
+                weight_loader = getattr(
+                    param,
+                    "weight_loader",
+                    default_weight_loader,
+                )
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            for (
+                param_name,
+                weight_name,
+                shard_id,
+            ) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param,
+                    "weight_loader",
+                    default_weight_loader,
+                )
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    PaddleOCRVLMultiModalProcessor,
+    info=PaddleOCRVLProcessingInfo,
+    dummy_inputs=PaddleOCRVLDummyInputsBuilder,
+)
+class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE):
+    merge_by_field_config = True
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.": "language_model.model.",
+            "lm_head.": "language_model.lm_head.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
+
+        self.visual = SiglipVisionModel(
+            config=config.vision_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            attn_backend_override=attn_backend_override,
+        )
+        self.mlp_AR = Projector(config, config.vision_config)
+
+        self.language_model = Ernie4_5ForCausalLM(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        for layer in self.language_model.model.layers:
+            if not isinstance(layer, PPMissingLayer):
+                layer.self_attn.rotary_emb.is_neox_style = True
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+
+        hf_config = self.config
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id
+        ).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_videos > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = image_grid_thw[image_index]
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = video_grid_thw[video_index]
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                    * video_second_per_grid_t
+                    * tokens_per_second
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+
+        return llm_positions, mrope_position_delta
+
+    def get_language_model(self) -> nn.Module:
+        return self.language_model
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> PaddleOCRImagePixelInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None:
+            return None
+
+        return PaddleOCRImagePixelInputs(
+            type="pixel_values",
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        elif inputs_embeds is None:
+            vision_embeddings = self.embed_multimodal(**kwargs)
+            is_multimodal = kwargs.pop("is_multimodal", None)
+            handle_oov_mm_token = kwargs.pop("handle_oov_mm_token", False)
+            inputs_embeds = self.embed_input_ids(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=is_multimodal,
+                handle_oov_mm_token=handle_oov_mm_token,
+            )
+            input_ids = None
+
+        return self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+
+        raise ValueError("Only image modality is supported")
+
+    def encode_image(
+        self, pixel_values: torch.Tensor, image_grid_thw: torch.Tensor
+    ) -> torch.Tensor:
+        pixel_values = pixel_values.type(self.visual.dtype)
+        siglip_position_ids = list()
+        image_grid_hws = list()
+        cu_seqlens = [0]
+
+        thw_tuple = tuple(image_grid_thw.tolist())
+        numel = np.prod(thw_tuple)
+        image_grid_hws.append(thw_tuple)
+        image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
+        siglip_position_ids.append(image_position_ids)
+        cu_seqlens.append(cu_seqlens[-1] + numel)
+
+        siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
+            pixel_values.device
+        )
+        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(pixel_values.device)
+
+        vision_outputs = self.visual(
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_hws,
+            position_ids=siglip_position_ids,
+            interpolate_pos_encoding=True,
+            cu_seqlens=cu_seqlens,
+        )
+        return vision_outputs
+
+    def _process_image_input(
+        self, image_input: PaddleOCRImagePixelInputs
+    ) -> MultiModalEmbeddings:
+        pixel_values = image_input.pixel_values
+        image_grid_thw = image_input.image_grid_thw
+        vision_outputs = tuple(
+            self.encode_image(pixel, grid).squeeze(0)
+            for pixel, grid in zip(pixel_values, image_grid_thw)
+        )
+        image_embeds = self.mlp_AR(vision_outputs, image_grid_thw)
+        return image_embeds
+
+    def embed_multimodal(self, **kwargs) -> MultiModalEmbeddings:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return ()
+
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+        image_embeds = self._process_image_input(image_input)
+        multimodal_embeddings += tuple(image_embeds)
+
+        return multimodal_embeddings
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+        return autoloaded_weights
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index aef5102304614c59e8741055d43bce57b432fa75..ec5d0fa6226ddd4c2795f68c53802492a71d445d 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -1,33 +1,49 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 from torch import nn
 from transformers import BatchFeature, PaliGemmaConfig
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargsItems,
-                                    MultiModalUUIDDict)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptIndexTargets,
-                                        PromptInsertion, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptIndexTargets,
+    PromptInsertion,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    flatten_bn,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from .vision import get_vision_encoder_info
 
 logger = init_logger(__name__)
@@ -41,6 +57,7 @@ class PaliGemmaImagePixelInputs(TensorSchema):
         - h: Height
         - w: Width
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
@@ -52,16 +69,17 @@ class PaliGemmaImageEmbeddingInputs(TensorSchema):
         - ifs: Image feature size
         - hs: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-PaliGemmaImageInputs = Union[PaliGemmaImagePixelInputs,
-                             PaliGemmaImageEmbeddingInputs]
+PaliGemmaImageInputs: TypeAlias = (
+    PaliGemmaImagePixelInputs | PaliGemmaImageEmbeddingInputs
+)
 
 
 class PaliGemmaMultiModalProjector(nn.Module):
-
     def __init__(self, vision_hidden_size: int, projection_dim: int):
         super().__init__()
 
@@ -73,14 +91,13 @@ class PaliGemmaMultiModalProjector(nn.Module):
 
 
 class PaliGemmaProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(PaliGemmaConfig)
 
     def get_vision_encoder_info(self):
         return get_vision_encoder_info(self.get_hf_config())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(
@@ -97,9 +114,7 @@ class PaliGemmaProcessingInfo(BaseProcessingInfo):
         )
 
 
-class PaliGemmaDummyInputsBuilder(
-        BaseDummyInputsBuilder[PaliGemmaProcessingInfo]):
-
+class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
 
@@ -107,6 +122,7 @@ class PaliGemmaDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
@@ -114,17 +130,19 @@ class PaliGemmaDummyInputsBuilder(
 
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=max_image_size,
+                height=max_image_size,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class PaliGemmaMultiModalProcessor(
-        BaseMultiModalProcessor[PaliGemmaProcessingInfo]):
-
+class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
@@ -167,7 +185,8 @@ class PaliGemmaMultiModalProcessor(
 
         def get_insertion(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -192,24 +211,27 @@ class PaliGemmaMultiModalProcessor(
             PromptInsertion(
                 modality="image",
                 target=PromptIndexTargets.prefix(
-                    [bos_token_id] if tokenizer.add_bos_token else []),
+                    [bos_token_id] if tokenizer.add_bos_token else []
+                ),
                 insertion=get_insertion,
             )
         ]
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
-        mm_inputs = super().apply(prompt,
-                                  mm_data,
-                                  hf_processor_mm_kwargs,
-                                  tokenization_kwargs,
-                                  mm_uuids=mm_uuids)
+        mm_inputs = super().apply(
+            prompt,
+            mm_data,
+            hf_processor_mm_kwargs,
+            tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
         prompt_token_ids = mm_inputs["prompt_token_ids"]
 
         tokenizer = self.info.get_tokenizer()
@@ -220,7 +242,6 @@ class PaliGemmaMultiModalProcessor(
         if len(prompt_token_ids) and prompt_token_ids[-1] != newline_token_id:
             prompt_token_ids.append(newline_token_id)
             mm_inputs["prompt_token_ids"] = prompt_token_ids
-            mm_inputs["prompt"] += newline_prompt
 
         return mm_inputs
 
@@ -228,9 +249,9 @@ class PaliGemmaMultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     PaliGemmaMultiModalProcessor,
     info=PaliGemmaProcessingInfo,
-    dummy_inputs=PaliGemmaDummyInputsBuilder)
-class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP):
+    dummy_inputs=PaliGemmaDummyInputsBuilder,
+)
+class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -250,10 +271,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -267,13 +289,15 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.config = config
         self.multimodal_config = multimodal_config
 
-        self.vision_tower = SiglipVisionModel(config.vision_config,
-                                              quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "vision_tower"))
+        self.vision_tower = SiglipVisionModel(
+            config.vision_config,
+            quant_config,
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         self.multi_modal_projector = PaliGemmaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
-            projection_dim=config.vision_config.projection_dim)
+            projection_dim=config.vision_config.projection_dim,
+        )
 
         self.quant_config = quant_config
 
@@ -290,10 +314,12 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.language_model.logits_processor.scale *= logit_scale
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[PaliGemmaImageInputs]:
+        self, **kwargs: object
+    ) -> PaliGemmaImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -304,12 +330,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
             pixel_values = flatten_bn(pixel_values, concat=True)
 
             h = w = self.config.vision_config.image_size
-            return PaliGemmaImagePixelInputs(type="pixel_values",
-                                             data=pixel_values,
-                                             resolve_bindings={
-                                                 "h": h,
-                                                 "w": w
-                                             })
+            return PaliGemmaImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                resolve_bindings={"h": h, "w": w},
+            )
 
         if image_embeds is not None:
             image_embeds = flatten_bn(image_embeds, concat=True)
@@ -326,7 +351,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
         vision_tower: SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-
         target_dtype = vision_tower.get_input_embeddings().weight.dtype
         image_features = vision_tower(pixel_values.to(dtype=target_dtype))
 
@@ -336,7 +360,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
         self,
         image_input: PaliGemmaImageInputs,
     ) -> torch.Tensor:
-
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -352,8 +375,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
@@ -362,50 +384,29 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
         vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5)
         return vision_embeddings
 
-    def get_input_embeddings(
+    def forward(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.image_token_index)
-        return inputs_embeds
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object) -> IntermediateTensors:
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index 23fb7bb85215c10ba518ec556cdddfca4e1afb81..3bf6a1d9763d025e4b27312b47e79285710a68f5 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -22,9 +22,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only persimmon model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -35,35 +35,51 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class PersimmonMLP(nn.Module):
-
-    def __init__(self,
-                 config: PersimmonConfig,
-                 quant_config: Optional[QuantizationConfig] = None):
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
-        self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
-                                                  config.intermediate_size,
-                                                  quant_config=quant_config)
-        self.dense_4h_to_h = RowParallelLinear(config.intermediate_size,
-                                               config.hidden_size,
-                                               quant_config=quant_config)
+        self.dense_h_to_4h = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
+        )
+        self.dense_4h_to_h = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
+        )
         self.act = get_act_fn(config.hidden_act)
 
     def forward(self, hidden_states) -> torch.Tensor:
@@ -74,12 +90,13 @@ class PersimmonMLP(nn.Module):
 
 
 class PersimmonAttention(nn.Module):
-
-    def __init__(self,
-                 config: PersimmonConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.config = config
         tensor_parallel_world_size = get_tensor_model_parallel_world_size()
@@ -102,12 +119,14 @@ class PersimmonAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
         self.is_qk_layernorm = config.qk_layernorm
 
@@ -123,12 +142,14 @@ class PersimmonAttention(nn.Module):
             partial_rotary_factor=self.partial_rotary_factor,
         )
         self.scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scale=self.scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scale=self.scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
         # [seq_length, hidden_size] -> [seq_length, num_heads, head_dim]
@@ -167,23 +188,32 @@ class PersimmonAttention(nn.Module):
 
 
 class PersimmonDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: PersimmonConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = PersimmonAttention(config=config,
-                                            cache_config=cache_config,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.self_attn")
-        self.mlp = PersimmonMLP(config, quant_config=quant_config)
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.layer_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=config.layer_norm_eps)
+        self.self_attn = PersimmonAttention(
+            config=config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = PersimmonMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
 
     def forward(
         self,
@@ -214,7 +244,6 @@ class PersimmonDecoderLayer(nn.Module):
 
 @support_torch_compile
 class PersimmonModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -224,34 +253,38 @@ class PersimmonModel(nn.Module):
 
         self.vocab_size = config.vocab_size
         self.config = config
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: PersimmonDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
-        self.final_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.layer_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -262,8 +295,7 @@ class PersimmonModel(nn.Module):
         hidden_states = self.final_layernorm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
@@ -282,45 +314,48 @@ class PersimmonModel(nn.Module):
                 if output_dim is not None:
                     loaded_weight_shape = loaded_weight.shape
                     loaded_weight = loaded_weight.view(
-                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
-                        loaded_weight_shape[output_dim + 1:])
-                    loaded_weight = loaded_weight.transpose(
-                        output_dim, output_dim + 1)
+                        loaded_weight_shape[:output_dim]
+                        + (num_heads, 3, -1)
+                        + loaded_weight_shape[output_dim + 1 :]
+                    )
+                    loaded_weight = loaded_weight.transpose(output_dim, output_dim + 1)
                     loaded_weight = loaded_weight.reshape(loaded_weight_shape)
 
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class PersimmonForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         self.config = config
         self.vocab_size = config.vocab_size
-        self.model = PersimmonModel(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      bias=False,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.model = PersimmonModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            bias=False,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ):
         hidden_states = self.model(
             input_ids=input_ids,
@@ -333,11 +368,10 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 9cf288e85005755f3f0f50e0b50640ef09416dd8..8fee53c23fb4b9d85be14af4ef89d5d9a55a9772 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -37,9 +37,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """Inference-only Phi-1.5 model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -50,40 +50,47 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class PhiAttention(nn.Module):
-
-    def __init__(self,
-                 config: PhiConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PhiConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.total_num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.total_num_heads
 
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
 
         # pylint: disable=C0103
         self.qkv_proj = QKVParallelLinear(
@@ -92,36 +99,41 @@ class PhiAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.dense = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
 
         scaling = self.head_size**-0.5
-        rotary_dim = int(config.partial_rotary_factor *
-                         (config.hidden_size // config.num_attention_heads))
+        rotary_dim = int(
+            config.partial_rotary_factor
+            * (config.hidden_size // config.num_attention_heads)
+        )
         assert rotary_dim % 2 == 0
 
         # pylint: disable=C0301
         # Refer to:
         # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518
         rope_theta = getattr(config, "rope_theta", 10000.0)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          2048)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
         self.rotary_emb = get_rope(
             self.head_size,
             rotary_dim=rotary_dim,
             max_position=max_position_embeddings,
             base=rope_theta,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_size,
-                              scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_size,
+            scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -137,10 +149,12 @@ class PhiAttention(nn.Module):
 
 
 class PhiMLP(nn.Module):
-
-    def __init__(self,
-                 config: PhiConfig,
-                 quant_config: Optional[QuantizationConfig] = None):
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         n_inner = getattr(config, "n_inner", None)
@@ -150,11 +164,13 @@ class PhiMLP(nn.Module):
             config.hidden_size,
             n_inner,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
         )
         self.fc2 = RowParallelLinear(
             n_inner,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -166,20 +182,21 @@ class PhiMLP(nn.Module):
 
 
 class PhiLayer(nn.Module):
-
-    def __init__(self,
-                 config: PhiConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: PhiConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.layer_norm_eps)
-        self.self_attn = PhiAttention(config,
-                                      cache_config,
-                                      quant_config,
-                                      prefix=f"{prefix}.self_attn")
-        self.mlp = PhiMLP(config, quant_config)
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.self_attn = PhiAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attn"
+        )
+        self.mlp = PhiMLP(config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -199,7 +216,6 @@ class PhiLayer(nn.Module):
 
 @support_torch_compile
 class PhiModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -209,34 +225,36 @@ class PhiModel(nn.Module):
 
         self.config = config
         self.quant_config = quant_config
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: PhiLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
-        self.final_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.layer_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+            lambda prefix: PhiLayer(config, cache_config, quant_config, prefix=prefix),
+            prefix=f"{prefix}.layers",
+        )
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -250,13 +268,12 @@ class PhiModel(nn.Module):
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v")
+            ("qkv_proj", "v_proj", "v"),
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -265,7 +282,7 @@ class PhiModel(nn.Module):
             if "rotary_emb.inv_freq" in name:
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -287,8 +304,7 @@ class PhiModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -307,50 +323,52 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
         # lm_head use bias, cannot share word embeddings
         assert not config.tie_word_embeddings
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
 
-        self.model = PhiModel(vllm_config=vllm_config,
-                              prefix=maybe_prefix(prefix, "model"))
+        self.model = PhiModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      bias=True,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       self.lm_head.bias)
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states, self.lm_head.bias)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index f4e870c530309eb96012b78ed84e706abe4dfc1a..56c8755123d3dbcd84efe29dc8b0f2813ef39e83 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -8,7 +8,6 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 
 
 class Phi3ForCausalLM(LlamaForCausalLM):
-
     packed_modules_mapping = {
         "qkv_proj": [
             "qkv_proj",
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index a2b201fe4228d60c53a1fee46da1ebdc1010d4dc..384572217bc19a8c721ab875d36a44113c8d49ac 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -16,72 +16,93 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import regex as re
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig,
-                          ProcessorMixin)
+from transformers import (
+    BatchFeature,
+    CLIPVisionConfig,
+    PretrainedConfig,
+    ProcessorMixin,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalPromptUpdates,
-                                        PlaceholderFeaturesInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        ResolvedPromptUpdate)
-# yapf: enable
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalPromptUpdates,
+    PlaceholderFeaturesInfo,
+    PromptReplacement,
+    PromptUpdate,
+    ResolvedPromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
-                         SupportsQuant)
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    _merge_multimodal_embeddings,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 # Cannot find the following 2 numbers from hf config.
 _IMAGE_TOKEN_ID = 32044
 
-CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
-                                                     hidden_act="quick_gelu",
-                                                     hidden_size=1024,
-                                                     image_size=336,
-                                                     intermediate_size=4096,
-                                                     num_attention_heads=16,
-                                                     num_channels=3,
-                                                     num_hidden_layers=24,
-                                                     patch_size=14,
-                                                     projection_dim=768)
-
-
-def _init_img_processor(hf_config: PretrainedConfig,
-                        quant_config: Optional[QuantizationConfig],
-                        prefix: str = "") -> CLIPVisionModel:
+CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
+    dropout=0.0,
+    hidden_act="quick_gelu",
+    hidden_size=1024,
+    image_size=336,
+    intermediate_size=4096,
+    num_attention_heads=16,
+    num_channels=3,
+    num_hidden_layers=24,
+    patch_size=14,
+    projection_dim=768,
+)
+
+
+def _init_img_processor(
+    hf_config: PretrainedConfig,
+    quant_config: QuantizationConfig | None,
+    prefix: str = "",
+) -> CLIPVisionModel:
     clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
-    layer_idx = hf_config.img_processor.get('layer_idx', -2)
+    layer_idx = hf_config.img_processor.get("layer_idx", -2)
 
     # Initialize the CLIP only up to the required feature layer
     if layer_idx < 0:
-        num_hidden_layers = clip_config.num_hidden_layers + \
-            layer_idx + 1
+        num_hidden_layers = clip_config.num_hidden_layers + layer_idx + 1
     else:
         num_hidden_layers = layer_idx + 1
 
@@ -108,14 +129,15 @@ class Phi3VImagePixelInputs(TensorSchema):
     type: Literal["pixel_values", "image_embeds"] = "pixel_values"
 
     # Supports either a stacked tensor or a list of (p, 3, h, w) tensors
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "p", 3, "h", "w", dynamic_dims={"p"}
-                    ),  # 'p' may vary across items
+    pixel_values: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape(
+            "bn", "p", 3, "h", "w", dynamic_dims={"p"}
+        ),  # 'p' may vary across items
     ]
 
     # Stacked tensor with height and width for each image
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
 
 
 class Phi3VImageEmbeddingInputs(TensorSchema):
@@ -126,26 +148,25 @@ class Phi3VImageEmbeddingInputs(TensorSchema):
         - f: Image feature size (e.g., number of tokens per image)
         - h: Hidden size (must match language model backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", "h"),
     ]
 
 
-Phi3VImageInputs = Union[Phi3VImagePixelInputs, Phi3VImageEmbeddingInputs]
+Phi3VImageInputs: TypeAlias = Phi3VImagePixelInputs | Phi3VImageEmbeddingInputs
 
 
 class Phi3ImageEmbeddingBase(nn.Module):
-
     def __init__(self) -> None:
         super().__init__()
         self.layer_idx: int
         self.type_feature: str
         self.img_processor: CLIPVisionModel
 
-    def get_img_features(self,
-                         img_embeds: torch.FloatTensor) -> torch.FloatTensor:
+    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
         TYPE_FEATURE = self.type_feature
 
         # NOTE: we skip the step to select the vision feature layer since
@@ -166,52 +187,51 @@ class Phi3ImageEmbeddingBase(nn.Module):
 class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
     """Phi3 Image embedding with HD transform."""
 
-    def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig],
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
 
         # n_embed or hidden_size
-        hidden_size = config.n_embd if hasattr(
-            config, 'n_embd') else config.hidden_size
+        hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
 
         self.img_processor = _init_img_processor(
-            config, quant_config, prefix=f"{prefix}.img_processor")
+            config, quant_config, prefix=f"{prefix}.img_processor"
+        )
 
-        image_dim_out = config.img_processor['image_dim_out']
-        self.num_img_tokens = config.img_processor['num_img_tokens']
+        image_dim_out = config.img_processor["image_dim_out"]
+        self.num_img_tokens = config.img_processor["num_img_tokens"]
 
         self.image_dim_out = image_dim_out
 
         # global_gn and sub_gn for hd transform, serves as line separator
-        self.use_hd_transform = config.embd_layer.get('use_hd_transform',
-                                                      False)
+        self.use_hd_transform = config.embd_layer.get("use_hd_transform", False)
         self.with_learnable_separator = config.embd_layer.get(
-            'with_learnable_separator', False)
-        self.hd_transform_order = config.embd_layer.get(
-            'hd_transform_order', 'glb_sub')
+            "with_learnable_separator", False
+        )
+        self.hd_transform_order = config.embd_layer.get("hd_transform_order", "glb_sub")
         # with_hd_transform and with_learnable_separator should have same value
         assert self.use_hd_transform and self.with_learnable_separator
 
         # 1024 * 4, merge spatial to channel dimension
         self.glb_GN = nn.Parameter(torch.empty([1, 1, self.image_dim_out * 4]))
-        self.sub_GN = nn.Parameter(
-            torch.empty([1, 1, 1, self.image_dim_out * 4]))
+        self.sub_GN = nn.Parameter(torch.empty([1, 1, 1, self.image_dim_out * 4]))
 
         dim_projection = hidden_size
         depth = 2
         layers = [nn.Linear(image_dim_out * 4, dim_projection)]
         for _ in range(1, depth):
-            layers.extend(
-                [nn.GELU(),
-                 nn.Linear(dim_projection, dim_projection)])
+            layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
         self.img_projection = nn.Sequential(*layers)
 
-        self.type_feature = config.img_processor.get('type_feature', 'patch')
+        self.type_feature = config.img_processor.get("type_feature", "patch")
 
-    def forward(self, pixel_values: torch.FloatTensor,
-                image_sizes: torch.Tensor) -> torch.FloatTensor:
+    def forward(
+        self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor
+    ) -> torch.FloatTensor:
         """
         process image and return vision embeddings.
 
@@ -221,19 +241,19 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         num_images, num_crops, c, h, w = pixel_values.shape
         pixel_values = pixel_values.flatten(0, 1)
         img_features = self.get_img_features(pixel_values)
-        img_features = img_features.reshape(num_images, num_crops, -1,
-                                            self.image_dim_out)
-        image_features_proj = self.hd_feature_transform(
-            img_features, image_sizes)
+        img_features = img_features.reshape(
+            num_images, num_crops, -1, self.image_dim_out
+        )
+        image_features_proj = self.hd_feature_transform(img_features, image_sizes)
         return image_features_proj
 
     def hd_feature_transform(self, image_features, image_sizes):
         """
         image_features: (num_images, num_crops+1, 24*24, 1024)
         """
-        assert (
-            self.hd_transform_order == 'sub_glb'
-        ), f'hd_transform_order `{self.hd_transform_order}` not implemented'
+        assert self.hd_transform_order == "sub_glb", (
+            f"hd_transform_order `{self.hd_transform_order}` not implemented"
+        )
         if isinstance(self.img_projection, nn.Sequential):
             target_device = self.img_projection[0].bias.device
             target_dtype = self.img_projection[0].bias.dtype
@@ -241,13 +261,14 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
             target_device = self.img_projection.bias.device
             target_dtype = self.img_projection.bias.dtype
 
-        global_image_features = image_features[:,
-                                               0]  # (num_images, 24*24, 1024)
+        global_image_features = image_features[:, 0]  # (num_images, 24*24, 1024)
         # global feature can be viewed as a special HD case with num_crops 1x1
         global_image_features_hd = self.reshape_hd_patches_2x2merge(
-            global_image_features, 1, 1)
+            global_image_features, 1, 1
+        )
         global_image_features_hd_newline = self.add_image_newline(
-            global_image_features_hd)
+            global_image_features_hd
+        )
 
         batch_image_features_proj = []
         # need a for loop to process each image because of different image sizes
@@ -260,21 +281,27 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
 
             # NOTE: real num_crops is padded
             # (num_crops, 24*24, 1024)
-            sub_image_features = image_features[i, 1:1 + num_crops]
+            sub_image_features = image_features[i, 1 : 1 + num_crops]
             sub_image_features_hd = self.reshape_hd_patches_2x2merge(
-                sub_image_features, h_crop, w_crop)
+                sub_image_features, h_crop, w_crop
+            )
             sub_image_features_hd_newline = self.add_image_newline(
-                sub_image_features_hd)
+                sub_image_features_hd
+            )
 
             # [sub features, separator, global features]
-            image_embeddings = torch.cat([
-                sub_image_features_hd_newline.squeeze(
-                    0),  # (h_crop*12*(w_crop*12+1), 4096)
-                self.glb_GN.squeeze(0),
-                global_image_features_hd_newline[i],
-            ])
+            image_embeddings = torch.cat(
+                [
+                    sub_image_features_hd_newline.squeeze(
+                        0
+                    ),  # (h_crop*12*(w_crop*12+1), 4096)
+                    self.glb_GN.squeeze(0),
+                    global_image_features_hd_newline[i],
+                ]
+            )
             img_proj = self.img_projection(
-                image_embeddings.to(target_device, target_dtype))
+                image_embeddings.to(target_device, target_dtype)
+            )
             batch_image_features_proj.append(img_proj)
 
         return batch_image_features_proj
@@ -294,11 +321,13 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
             .reshape(N, H // 2, 2, H // 2, 2, C)  # N, 12, 2, 12, 2, 1024
             .permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
             .reshape(N, -1, 4 * C)  # N, 144, 4096
-            .reshape(num_images, h_crop, w_crop, H // 2, H // 2,
-                     -1)  # n_img, h_crop, w_crop, 12, 12, 4096
+            .reshape(
+                num_images, h_crop, w_crop, H // 2, H // 2, -1
+            )  # n_img, h_crop, w_crop, 12, 12, 4096
             .permute(0, 1, 3, 2, 4, 5)  # n_img, h_crop, 12, w_crop, 12, 4096
-            .reshape(num_images, h_crop * H // 2, w_crop * H // 2,
-                     4 * C)  # n_img, h_crop*12, w_crop*12, 4096
+            .reshape(
+                num_images, h_crop * H // 2, w_crop * H // 2, 4 * C
+            )  # n_img, h_crop*12, w_crop*12, 4096
         )
         return image_features_hd
 
@@ -309,17 +338,17 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         """
         num_images, h, w, hid_dim = image_features_hd.shape
         # add the newline token to the HD image feature patches
-        newline_embeddings = self.sub_GN.expand(num_images, h, -1,
-                                                -1)  # (n_img, h, 1, hid_dim)
+        newline_embeddings = self.sub_GN.expand(
+            num_images, h, -1, -1
+        )  # (n_img, h, 1, hid_dim)
         image_features_hd_newline = torch.cat(
-            [image_features_hd, newline_embeddings],
-            dim=2).reshape(num_images, -1, hid_dim)
+            [image_features_hd, newline_embeddings], dim=2
+        ).reshape(num_images, -1, hid_dim)
         return image_features_hd_newline
 
 
 class Phi3VProcessingInfo(BaseProcessingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -327,7 +356,7 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -343,7 +372,6 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
 
 
 class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -356,22 +384,25 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -418,7 +449,8 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
 
         def get_replacement_phi3v(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -461,7 +493,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
         self,
         token_ids: list[int],
         mm_prompt_updates: MultiModalPromptUpdates,
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
+    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
         # align to hf behavior when there are images
         if len(mm_prompt_updates):
             tokenizer = self.info.get_tokenizer()
@@ -482,8 +514,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
             # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/64f88b6/processing_phi3_v.py#L407
             pattern = r"<\|image_\d+\|>"
             prompt_chunks = [
-                tokenizer(chunk).input_ids
-                for chunk in re.split(pattern, text)
+                tokenizer(chunk).input_ids for chunk in re.split(pattern, text)
             ]
             image_tags = [
                 tokenizer(chunk, add_special_tokens=False).input_ids
@@ -492,18 +523,21 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
             if len(prompt_chunks) > len(image_tags):
                 image_tags.append([])
             token_ids = [
-                e for sublist in zip(prompt_chunks, image_tags)
-                for ele in sublist for e in ele
+                e
+                for sublist in zip(prompt_chunks, image_tags)
+                for ele in sublist
+                for e in ele
             ]
 
-        token_ids, text, placeholders = super()._apply_prompt_updates(
+        token_ids, placeholders = super()._apply_prompt_updates(
             token_ids=token_ids,
             mm_prompt_updates=mm_prompt_updates,
         )
 
         # Keep the behavior in line with HF processor
-        if text.startswith("<s> <|image|>"):
-            text = text.replace("<s> <|image|>", "<s><|image|>", 1)
+        if len(mm_prompt_updates) and (
+            token_ids[:2] == tokenizer.encode("<s> <|image|>", add_special_tokens=False)
+        ):
             token_ids = [token_ids[0], *token_ids[2:]]
             placeholders = {
                 modality: [
@@ -513,29 +547,34 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
                         start_idx=p.start_idx - 1,
                         tokens=p.tokens,
                         is_embed=p.is_embed,
-                    ) for p in ps
+                    )
+                    for p in ps
                 ]
                 for modality, ps in placeholders.items()
             }
 
-        return token_ids, text, placeholders
+        return token_ids, placeholders
 
 
-@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
-                                        info=Phi3VProcessingInfo,
-                                        dummy_inputs=Phi3VDummyInputsBuilder)
-class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
-                       SupportsQuant):
+@MULTIMODAL_REGISTRY.register_processor(
+    Phi3VMultiModalProcessor,
+    info=Phi3VProcessingInfo,
+    dummy_inputs=Phi3VDummyInputsBuilder,
+)
+class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.wte": "embed_tokens",
             "model.vision_embed_tokens.": "vision_embed_tokens.",
             "lm_head.": "language_model.lm_head.",
             "model.": "language_model.model.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return f"<|image_{i}|>"
 
@@ -552,7 +591,6 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
             quant_config=self.quant_config,
             prefix=maybe_prefix(prefix, "model.embed_tokens"),
         )
@@ -561,7 +599,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
         self.vision_embed_tokens = Phi3HDImageEmbedding(
             config,
             self.quant_config,
-            prefix=maybe_prefix(prefix, "model.vision_embed_tokens"))
+            prefix=maybe_prefix(prefix, "model.vision_embed_tokens"),
+        )
 
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
@@ -575,10 +614,12 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Phi3VImageInputs]:
+        self, **kwargs: object
+    ) -> Phi3VImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -589,17 +630,18 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
         if pixel_values is not None:
             return Phi3VImagePixelInputs(
                 type="pixel_values",
-                data=flatten_bn(pixel_values),
-                image_sizes=flatten_bn(image_sizes, concat=True),
+                pixel_values=pixel_values,
+                image_sizes=image_sizes,
                 resolve_bindings={
                     "h": CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size,
-                    "w": CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size
-                })
+                    "w": CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size,
+                },
+            )
 
         if image_embeds is not None:
             return Phi3VImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
@@ -608,87 +650,84 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
         self,
         image_input: Phi3VImageInputs,
     ) -> torch.Tensor:
-
         if image_input["type"] == "image_embeds":
-            image_data = image_input["data"]
-            if is_list_of(image_data, torch.Tensor):
-                # it's already a list of tensors
-                return image_data
-            if len(image_data.shape) == 3:
-                # 3D tensor
-                return list(torch.unbind(image_data, dim=0))
-            raise ValueError(
-                "We expect batched 2D tensors; "
-                "this can be either a list of 2D tensors or a single 3D tensor."
-            )
+            return image_input["data"]
 
         assert self.vision_embed_tokens is not None
-        image_embeds = self.vision_embed_tokens(image_input["data"],
-                                                image_input["image_sizes"])
+
+        image_embeds = self.vision_embed_tokens(
+            image_input["pixel_values"], image_input["image_sizes"]
+        )
 
         return image_embeds
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.embed_tokens(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.image_token_id)
-        return inputs_embeds
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object):
+        inputs_embeds = self._embed_text_input_ids(
+            input_ids,
+            self.embed_tokens,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+
+        if is_multimodal is None:
+            raise ValueError(
+                "`embed_input_ids` now requires `is_multimodal` arg, "
+                "please update your model runner according to "
+                "https://github.com/vllm-project/vllm/pull/16229."
+            )
+
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ):
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        autoloaded_weights = loader.load_weights(weights,
-                                                 mapper=self.hf_to_vllm_mapper)
+        autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
         # The HF config doesn't specify whether these are tied,
         # so we detect it this way
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
index d2a3a8cc04969534f585c2ad863c6d1803eaed0a..0f1230a55bae6f1cf9d7eaf625ad2eca0af29bfb 100644
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -2,61 +2,85 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import (BatchFeature, Phi4MultimodalAudioConfig,
-                          Phi4MultimodalConfig, Phi4MultimodalFeatureExtractor,
-                          Phi4MultimodalImageProcessorFast)
+from transformers import (
+    BatchFeature,
+    Phi4MultimodalAudioConfig,
+    Phi4MultimodalConfig,
+    Phi4MultimodalFeatureExtractor,
+    Phi4MultimodalImageProcessorFast,
+)
 from transformers import Phi4MultimodalProcessor as Phi4MMProcessor
 from transformers.models.phi4_multimodal.modeling_phi4_multimodal import (
-    Phi4MultimodalAudioConvModule, Phi4MultimodalAudioNemoConvSubsampling,
-    Phi4MultimodalAudioRelativeAttentionBias, adaptive_enc_mask, unfold_tensor)
+    Phi4MultimodalAudioConvModule,
+    Phi4MultimodalAudioNemoConvSubsampling,
+    Phi4MultimodalAudioRelativeAttentionBias,
+    adaptive_enc_mask,
+    unfold_tensor,
+)
 
 from vllm.config import VllmConfig
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
-                                   ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems, MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .idefics2_vision_model import Idefics2VisionTransformer
 from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
-
-# <|endoftext10|> (see vocab.json in hf model)
-_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
-# <|endoftext11|>
-_AUDIO_PLACEHOLDER_TOKEN_ID = 200011
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 _AUDIO_MAX_SOUNDFILE_SIZE = 241_000
 
 
-def _get_padding_size(orig_width: int, orig_height: int, target_height: int,
-                      target_width: int):
+def _get_padding_size(
+    orig_width: int, orig_height: int, target_height: int, target_width: int
+):
     ratio_width = target_width / orig_width
     ratio_height = target_height / orig_height
 
@@ -70,7 +94,6 @@ def _get_padding_size(orig_width: int, orig_height: int, target_height: int,
 
 
 class Phi4MMProjector(nn.Module):
-
     def __init__(self, input_size: int, hidden_size: int):
         super().__init__()
         self.up = ColumnParallelLinear(input_size, hidden_size)
@@ -94,41 +117,44 @@ class Phi4MMImageEmbedding(nn.Module):
         self.crop_size = config.vision_config.crop_size
         self.image_dim_out = config.vision_config.hidden_size
 
-        n_patches = (config.vision_config.image_size //
-                     config.vision_config.patch_size)
+        n_patches = config.vision_config.image_size // config.vision_config.patch_size
         if n_patches % 2 != 0:
             self.img_processor_padding = nn.ReflectionPad2d((0, 1, 0, 1))
             n_patches += 1
-        self.num_img_tokens = (n_patches // 2)**2
+        self.num_img_tokens = (n_patches // 2) ** 2
 
-        num_hidden_layers = (config.vision_config.num_hidden_layers +
-                             self.layer_idx +
-                             1 if self.layer_idx < 0 else self.layer_idx + 1)
+        num_hidden_layers = (
+            config.vision_config.num_hidden_layers + self.layer_idx + 1
+            if self.layer_idx < 0
+            else self.layer_idx + 1
+        )
         self.img_processor = Idefics2VisionTransformer(
             config.vision_config,
             require_post_norm=False,
-            num_hidden_layers_override=num_hidden_layers)
+            num_hidden_layers_override=num_hidden_layers,
+        )
         self.image_token_compression = nn.AvgPool2d(kernel_size=2, stride=2)
-        self.img_projection = Phi4MMProjector(self.image_dim_out,
-                                              config.hidden_size)
+        self.img_projection = Phi4MMProjector(self.image_dim_out, config.hidden_size)
         self.global_img_feature_extensor = nn.Parameter(
-            torch.zeros([1, 1, self.image_dim_out]))
+            torch.zeros([1, 1, self.image_dim_out])
+        )
         self.sub_img_feature_extensor = nn.Parameter(
-            torch.zeros([1, 1, 1, self.image_dim_out]))
+            torch.zeros([1, 1, 1, self.image_dim_out])
+        )
 
     def get_img_features(
         self,
         img_embeds: torch.FloatTensor,
-        attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
-        img_feature = self.img_processor(img_embeds,
-                                         patch_attention_mask=attention_mask)
+        img_feature = self.img_processor(
+            img_embeds, patch_attention_mask=attention_mask
+        )
 
         patch_feature = img_feature
         # reshape to 2D tensor
         width = int(math.sqrt(patch_feature.size(1)))
-        patch_feature = patch_feature.view(-1, width, width,
-                                           patch_feature.size(-1))
+        patch_feature = patch_feature.view(-1, width, width, patch_feature.size(-1))
         # convert to NCHW
         patch_feature = patch_feature.permute(0, 3, 1, 2)
         if getattr(self, "img_processor_padding", None) is not None:
@@ -137,19 +163,19 @@ class Phi4MMImageEmbedding(nn.Module):
         # convert to NHWC
         patch_feature = patch_feature.permute(0, 2, 3, 1)
         patch_feature = patch_feature.view(
-            -1,
-            patch_feature.size(1) * patch_feature.size(2),
-            patch_feature.size(-1))
+            -1, patch_feature.size(1) * patch_feature.size(2), patch_feature.size(-1)
+        )
         return patch_feature
 
     def forward(
         self,
         image_pixel_values: torch.FloatTensor,
-        image_sizes: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
+        image_sizes: torch.Tensor | None = None,
+        image_attention_mask: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         image_pixel_values = image_pixel_values.to(
-            self.img_processor.embeddings.patch_embedding.weight.dtype)
+            self.img_processor.embeddings.patch_embedding.weight.dtype
+        )
 
         target_device = self.img_projection.up.bias.device
         target_dtype = self.img_projection.up.bias.dtype
@@ -159,11 +185,13 @@ class Phi4MMImageEmbedding(nn.Module):
         img_features = self.get_img_features(
             image_pixel_values.flatten(0, 1),
             attention_mask=image_attention_mask.flatten(0, 1).to(
-                dtype=bool, device=target_device),
+                dtype=bool, device=target_device
+            ),
         )
         base_feat_size = int(np.sqrt(img_features.shape[1]))
-        img_features = img_features.view(batch_size, -1, base_feat_size**2,
-                                         self.image_dim_out)
+        img_features = img_features.view(
+            batch_size, -1, base_feat_size**2, self.image_dim_out
+        )
         image_sizes = image_sizes.view(-1, 2)
 
         output_imgs = []
@@ -174,58 +202,70 @@ class Phi4MMImageEmbedding(nn.Module):
             area_ratio = height_ratio * width_ratio
 
             global_img = img_features[idx, :1]
-            global_img = global_img.reshape(1, base_feat_size, base_feat_size,
-                                            self.image_dim_out).contiguous()
+            global_img = global_img.reshape(
+                1, base_feat_size, base_feat_size, self.image_dim_out
+            ).contiguous()
             temporary_extensor = self.sub_img_feature_extensor.repeat(
-                1, base_feat_size, 1, 1)
-            global_img = torch.cat([global_img, temporary_extensor],
-                                   dim=2).reshape(1, -1, self.image_dim_out)
+                1, base_feat_size, 1, 1
+            )
+            global_img = torch.cat([global_img, temporary_extensor], dim=2).reshape(
+                1, -1, self.image_dim_out
+            )
 
             sub_img = img_features[idx, 1:]
             sub_img = sub_img[:area_ratio]
-            sub_img = (sub_img.reshape(
-                height_ratio, width_ratio, base_feat_size, base_feat_size,
-                self.image_dim_out).transpose(1, 2).reshape(
-                    1, height_ratio * base_feat_size,
+            sub_img = (
+                sub_img.reshape(
+                    height_ratio,
+                    width_ratio,
+                    base_feat_size,
+                    base_feat_size,
+                    self.image_dim_out,
+                )
+                .transpose(1, 2)
+                .reshape(
+                    1,
+                    height_ratio * base_feat_size,
                     width_ratio * base_feat_size,
-                    self.image_dim_out).contiguous())
+                    self.image_dim_out,
+                )
+                .contiguous()
+            )
 
             if image_attention_mask is not None:
                 reshaped_image_attention_mask = (
-                    image_attention_mask[idx, 1:area_ratio + 1,
-                                         0::2, 0::2].reshape(
-                                             height_ratio, width_ratio,
-                                             base_feat_size,
-                                             base_feat_size).transpose(
-                                                 1, 2).reshape(
-                                                     1, height_ratio *
-                                                     base_feat_size,
-                                                     width_ratio *
-                                                     base_feat_size))
-                useful_height = int(
-                    reshaped_image_attention_mask[0, :, 0].sum().item())
-                useful_width = int(
-                    reshaped_image_attention_mask[0, 0, :].sum().item())
+                    image_attention_mask[idx, 1 : area_ratio + 1, 0::2, 0::2]
+                    .reshape(height_ratio, width_ratio, base_feat_size, base_feat_size)
+                    .transpose(1, 2)
+                    .reshape(
+                        1, height_ratio * base_feat_size, width_ratio * base_feat_size
+                    )
+                )
+                useful_height = int(reshaped_image_attention_mask[0, :, 0].sum().item())
+                useful_width = int(reshaped_image_attention_mask[0, 0, :].sum().item())
                 sub_img = sub_img[:, :useful_height, :useful_width]
                 temporary_extensor = self.sub_img_feature_extensor.repeat(
-                    1, useful_height, 1, 1)
+                    1, useful_height, 1, 1
+                )
             else:
                 temporary_extensor = self.sub_img_feature_extensor.repeat(
-                    1, height_ratio * base_feat_size, 1, 1)
+                    1, height_ratio * base_feat_size, 1, 1
+                )
 
-            sub_img = torch.cat([sub_img, temporary_extensor],
-                                dim=2).reshape(1, -1, self.image_dim_out)
+            sub_img = torch.cat([sub_img, temporary_extensor], dim=2).reshape(
+                1, -1, self.image_dim_out
+            )
 
             # Merge global and sub
             output_imgs.append(
                 torch.cat(
-                    [sub_img, self.global_img_feature_extensor, global_img],
-                    dim=1))
+                    [sub_img, self.global_img_feature_extensor, global_img], dim=1
+                )
+            )
 
         img_set_tensor = []
         for output_img in output_imgs:
-            output_img = output_img.to(device=target_device,
-                                       dtype=target_dtype)
+            output_img = output_img.to(device=target_device, dtype=target_dtype)
             img_feature_proj = self.img_projection(output_img)
             img_set_tensor.append(img_feature_proj.flatten(0, 1))
 
@@ -233,26 +273,29 @@ class Phi4MMImageEmbedding(nn.Module):
 
 
 class Phi4MultimodalAudioMLP(nn.Module):
-
     def __init__(
         self,
         config: Phi4MultimodalAudioConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.layer_norm = nn.LayerNorm(config.hidden_size)
         self.act_fn = MulAndSilu()
         self.gate_up_proj = MergedColumnParallelLinear(
-            config.hidden_size, [config.intermediate_size] * 2,
+            config.hidden_size,
+            [config.intermediate_size] * 2,
             bias=True,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(config.intermediate_size,
-                                           config.hidden_size,
-                                           bias=True,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.layer_norm(hidden_states)
@@ -263,11 +306,10 @@ class Phi4MultimodalAudioMLP(nn.Module):
 
 
 class Phi4MultimodalAudioAttention(nn.Module):
-
     def __init__(
         self,
         config: Phi4MultimodalAudioConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -279,7 +321,8 @@ class Phi4MultimodalAudioAttention(nn.Module):
             raise ValueError(
                 "embed_dim must be divisible by num_heads "
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads}).")
+                f" {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
 
         self.qkv_proj = QKVParallelLinear(
@@ -336,7 +379,6 @@ class Phi4MultimodalAudioAttention(nn.Module):
 
 
 class Phi4MultimodalAudioConformerEncoderLayer(nn.Module):
-
     def __init__(self, config: Phi4MultimodalAudioConfig):
         super().__init__()
 
@@ -355,11 +397,9 @@ class Phi4MultimodalAudioConformerEncoderLayer(nn.Module):
         residual = hidden_states + 0.5 * self.feed_forward_in(hidden_states)
         hidden_states = self.layer_norm_att(residual)
 
-        hidden_states = residual + self.self_attn(hidden_states,
-                                                  attention_mask)
+        hidden_states = residual + self.self_attn(hidden_states, attention_mask)
         hidden_states = hidden_states + self.conv(hidden_states)
-        hidden_states = hidden_states + 0.5 * self.feed_forward_out(
-            hidden_states)
+        hidden_states = hidden_states + 0.5 * self.feed_forward_out(hidden_states)
 
         out = self.layer_norm(hidden_states)
 
@@ -373,7 +413,7 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
     Typically used as a very first layer in a model.
 
     Args:
-        config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig) 
+        config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig)
             object containing model parameters.
     """
 
@@ -393,19 +433,21 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
 
 
 class Phi4MultimodalAudioModel(nn.Module):
-
     def __init__(self, config: Phi4MultimodalAudioConfig):
         super().__init__()
         self.config = config
 
         self.encoder_embedding = Phi4MMAudioMeanVarianceNormLayer(config)
         self.embed = Phi4MultimodalAudioNemoConvSubsampling(config)
-        self.relative_attention_bias_layer = (
-            Phi4MultimodalAudioRelativeAttentionBias(config))
-        self.encoders = nn.ModuleList([
-            Phi4MultimodalAudioConformerEncoderLayer(config)
-            for _ in range(config.num_blocks)
-        ])
+        self.relative_attention_bias_layer = Phi4MultimodalAudioRelativeAttentionBias(
+            config
+        )
+        self.encoders = nn.ModuleList(
+            [
+                Phi4MultimodalAudioConformerEncoderLayer(config)
+                for _ in range(config.num_blocks)
+            ]
+        )
 
     def _streaming_mask(
         self,
@@ -418,9 +460,11 @@ class Phi4MultimodalAudioModel(nn.Module):
         # S stores start index. if chunksize is 18, s is [0,18,36,....]
         chunk_start_idx = np.arange(0, seq_len, chunk_size)
 
-        enc_streaming_mask = (adaptive_enc_mask(
-            seq_len, chunk_start_idx,
-            left_window=left_chunk).unsqueeze(0).expand([batch_size, -1, -1]))
+        enc_streaming_mask = (
+            adaptive_enc_mask(seq_len, chunk_start_idx, left_window=left_chunk)
+            .unsqueeze(0)
+            .expand([batch_size, -1, -1])
+        )
         return enc_streaming_mask
 
     def forward_embeddings(
@@ -429,18 +473,18 @@ class Phi4MultimodalAudioModel(nn.Module):
         masks: torch.Tensor,
     ):
         """Forwarding the inputs through the top embedding layers"""
-        seq_len = math.ceil(hidden_states.shape[1] /
-                            self.config.time_reduction)
+        seq_len = math.ceil(hidden_states.shape[1] / self.config.time_reduction)
         if seq_len <= 0:
             raise ValueError(
                 f"Sequence length after time reduction is invalid: {seq_len}."
-                "Your input feature is too short.")
+                "Your input feature is too short."
+            )
 
         batch_size = hidden_states.shape[0]
 
-        enc_streaming_mask = self._streaming_mask(seq_len, batch_size,
-                                                  self.config.chunk_size,
-                                                  self.config.left_chunk)
+        enc_streaming_mask = self._streaming_mask(
+            seq_len, batch_size, self.config.chunk_size, self.config.left_chunk
+        )
         enc_streaming_mask = enc_streaming_mask.to(hidden_states.device)
 
         hidden_states, masks = self.embed(hidden_states, masks)
@@ -455,13 +499,14 @@ class Phi4MultimodalAudioModel(nn.Module):
 
         return hidden_states, hs_mask, masks
 
-    def calculate_hs_mask(self, hidden_states: torch.Tensor,
-                          device: torch.device, mask: torch.Tensor):
+    def calculate_hs_mask(
+        self, hidden_states: torch.Tensor, device: torch.device, mask: torch.Tensor
+    ):
         max_audio_length = hidden_states.shape[1]
         batch_size = hidden_states.shape[0]
-        enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size,
-                                                  self.config.chunk_size,
-                                                  self.config.left_chunk)
+        enc_streaming_mask = self._streaming_mask(
+            max_audio_length, batch_size, self.config.chunk_size, self.config.left_chunk
+        )
         enc_streaming_mask = enc_streaming_mask.to(device)
         if mask is None:
             return enc_streaming_mask
@@ -469,17 +514,15 @@ class Phi4MultimodalAudioModel(nn.Module):
         feature_lens = mask.sum(1)
         padding_length = feature_lens
         pad_mask = torch.arange(0, max_audio_length, device=device).expand(
-            padding_length.size(0), -1) < padding_length.unsqueeze(1)
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
         pad_mask = pad_mask.unsqueeze(1)
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                mask: Optional[torch.Tensor] = None):
+    def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor | None = None):
         hidden_states = self.encoder_embedding(hidden_states)
-        hidden_states, hs_mask, mask = self.forward_embeddings(
-            hidden_states, mask)
+        hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)
 
         unfolded = False
         bs, seq_len, _ = hidden_states.shape
@@ -495,9 +538,9 @@ class Phi4MultimodalAudioModel(nn.Module):
             else:
                 chunk_pad_size = 0
             if chunk_pad_size > 0:
-                hidden_states_pad = F.pad(hidden_states,
-                                          (0, 0, 0, chunk_pad_size),
-                                          "constant", 0)
+                hidden_states_pad = F.pad(
+                    hidden_states, (0, 0, 0, chunk_pad_size), "constant", 0
+                )
                 hidden_states = hidden_states_pad.to(hidden_states.device)
 
             hidden_states = unfold_tensor(hidden_states, max_seq_len)
@@ -505,24 +548,24 @@ class Phi4MultimodalAudioModel(nn.Module):
             if mask is not None:
                 # revise hs_mask here because the previous calculated hs_mask
                 # did not consider extra pad
-                subsampled_pad_mask = mask.squeeze(
-                    1)  # [bz, subsampled_unmask_seq_len]
+                subsampled_pad_mask = mask.squeeze(1)  # [bz, subsampled_unmask_seq_len]
                 extra_padded_subsamlped_pad_mask = F.pad(
-                    subsampled_pad_mask, (0, chunk_pad_size), "constant",
-                    False)  # extra padding to the pad mask
+                    subsampled_pad_mask, (0, chunk_pad_size), "constant", False
+                )  # extra padding to the pad mask
                 extra_padded_subsamlped_pad_mask = (
-                    extra_padded_subsamlped_pad_mask.unsqueeze(-1).float())
+                    extra_padded_subsamlped_pad_mask.unsqueeze(-1).float()
+                )
                 masks_unfold = unfold_tensor(
                     extra_padded_subsamlped_pad_mask, max_seq_len
                 )  # unfold the pad mask like we did to the input tensor
                 masks_unfold = masks_unfold.squeeze(
-                    -1).bool()  # unfold op does not support bool tensor
+                    -1
+                ).bool()  # unfold op does not support bool tensor
             hs_mask = self.calculate_hs_mask(
                 hidden_states, hidden_states.device, masks_unfold
             )  # calculate hs_mask based on the unfolded pad mask
 
-        relative_attention_bias = self.relative_attention_bias_layer(
-            hidden_states)
+        relative_attention_bias = self.relative_attention_bias_layer(hidden_states)
         attention_mask = hs_mask.unsqueeze(1) + relative_attention_bias
 
         for layer in self.encoders:
@@ -539,7 +582,6 @@ class Phi4MultimodalAudioModel(nn.Module):
 
 
 class Phi4MMAudioEmbedding(nn.Module):
-
     def __init__(self, config: Phi4MultimodalConfig):
         super().__init__()
         self.config = config
@@ -548,12 +590,11 @@ class Phi4MMAudioEmbedding(nn.Module):
         self.encoder = Phi4MultimodalAudioModel(config.audio_config)
 
         audio_config = config.audio_config
-        proj_input_size = (audio_config.hidden_size *
-                           audio_config.downsample_rate)
+        proj_input_size = audio_config.hidden_size * audio_config.downsample_rate
         self.vision_speech_projection = Phi4MMProjector(
-            proj_input_size, config.hidden_size)
-        self.speech_projection = Phi4MMProjector(proj_input_size,
-                                                 config.hidden_size)
+            proj_input_size, config.hidden_size
+        )
+        self.speech_projection = Phi4MMProjector(proj_input_size, config.hidden_size)
 
     def get_projection(
         self,
@@ -571,23 +612,23 @@ class Phi4MMAudioEmbedding(nn.Module):
         audio_attention_mask=None,
         audio_projection_mode="speech",
     ) -> torch.FloatTensor:
-
         audio_projection = self.get_projection(audio_projection_mode)
 
         target_device = audio_projection.up.bias.device
         target_dtype = audio_projection.up.bias.dtype
 
-        audio_input_features = audio_input_features.to(device=target_device,
-                                                       dtype=target_dtype)
+        audio_input_features = audio_input_features.to(
+            device=target_device, dtype=target_dtype
+        )
 
-        audio_encoder_hidden_states = self.encoder(audio_input_features,
-                                                   audio_attention_mask)
+        audio_encoder_hidden_states = self.encoder(
+            audio_input_features, audio_attention_mask
+        )
         audio_embeds = audio_projection(audio_encoder_hidden_states)
 
         return audio_embeds.flatten(0, 1)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -608,8 +649,7 @@ class Phi4MMAudioEmbedding(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -630,10 +670,11 @@ class Phi4MMImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values"]
 
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "p", 3, "h", "w", dynamic_dims={"p"}
-                    ),  # may be different per batch and image
+    pixel_values: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape(
+            "bn", "p", 3, "h", "w", dynamic_dims={"p"}
+        ),  # may be different per batch and image
     ]
 
     image_sizes: Annotated[
@@ -663,7 +704,7 @@ class Phi4MMImageEmbeddingInputs(TensorSchema):
     type: Literal["image_embeds"]
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", "h"),
     ]
 
@@ -678,8 +719,8 @@ class Phi4MMAudioFeatureInputs(TensorSchema):
 
     type: Literal["audio_features"]
 
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+    audio_features: Annotated[
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "t", 80, dynamic_dims={"t"}),
     ]
 
@@ -701,8 +742,8 @@ class Phi4MMAudioEmbeddingInputs(TensorSchema):
     ]
 
 
-Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs]
-Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
+Phi4MMImageInput: TypeAlias = Phi4MMImagePixelInputs | Phi4MMImageEmbeddingInputs
+Phi4MMAudioInputs: TypeAlias = Phi4MMAudioFeatureInputs | Phi4MMAudioEmbeddingInputs
 
 
 def cat_with_pad(tensors, dim, padding_value=0):
@@ -710,9 +751,9 @@ def cat_with_pad(tensors, dim, padding_value=0):
     cat along dim, while pad to max for all other dims
     """
     ndim = tensors[0].dim()
-    assert all(
-        t.dim() == ndim for t in
-        tensors[1:]), "All tensors must have the same number of dimensions"
+    assert all(t.dim() == ndim for t in tensors[1:]), (
+        "All tensors must have the same number of dimensions"
+    )
 
     out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
     out_size[dim] = sum(t.shape[dim] for t in tensors)
@@ -732,20 +773,18 @@ def cat_with_pad(tensors, dim, padding_value=0):
 
 
 class Phi4MMProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> Phi4MultimodalConfig:
         return self.ctx.get_hf_config(Phi4MultimodalConfig)
 
     def get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor:
         return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs)
 
-    def get_feature_extractor(
-            self, **kwargs: object) -> Phi4MultimodalFeatureExtractor:
+    def get_feature_extractor(self, **kwargs: object) -> Phi4MultimodalFeatureExtractor:
         return self.get_hf_processor(**kwargs).audio_processor
 
     def get_image_processor(
         self,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> Phi4MultimodalImageProcessorFast:
         if processor is None:
             processor = self.get_hf_processor()
@@ -753,11 +792,11 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_dynamic_hd(
         self,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> int:
         return self.get_image_processor(processor).dynamic_hd
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None}
 
     def _find_target_aspect_ratio(
@@ -774,9 +813,12 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
             aspect_ratio = orig_width / orig_height
 
             # calculate the existing image aspect ratio
-            target_ratios = set((i, j) for i in range(1, max_num + 1)
-                                for j in range(1, max_num + 1)
-                                if i * j <= max_num and i * j >= min_num)
+            target_ratios = set(
+                (i, j)
+                for i in range(1, max_num + 1)
+                for j in range(1, max_num + 1)
+                if i * j <= max_num and i * j >= min_num
+            )
             target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 
             # find the closest aspect ratio to the target
@@ -786,6 +828,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
                 target_ratios,
                 orig_width,
                 orig_height,
+                image_size,
             )
 
             # calculate the target width and height
@@ -808,49 +851,56 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
     ):
         """
         compute the number of tokens an image is expected to take up considering
-        the image encoder architecture and exclude output features containing 
+        the image encoder architecture and exclude output features containing
         only padding pixels
 
-        for siglip, vit_image_size=448, vit_patch_size=14, so output will be 
+        for siglip, vit_image_size=448, vit_patch_size=14, so output will be
         32x32 feature map
         NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
         """
         assert vit_image_size % vit_patch_size == 0, (
-            "vit_image_size must be divisible by vit_patch_size")
-        assert (vit_image_size // vit_patch_size %
-                token_compression_factor == 0), (
-                    "vit_image_size // vit_patch_size must be divisible by "
-                    "token_compression_factor")
+            "vit_image_size must be divisible by vit_patch_size"
+        )
+        assert vit_image_size // vit_patch_size % token_compression_factor == 0, (
+            "vit_image_size // vit_patch_size must be divisible by "
+            "token_compression_factor"
+        )
 
         target_aspect_ratio, target_height, target_width = (
-            self._find_target_aspect_ratio(orig_width,
-                                           orig_height,
-                                           vit_image_size,
-                                           dynamic_hd_size,
-                                           min_num=1))
+            self._find_target_aspect_ratio(
+                orig_width, orig_height, vit_image_size, dynamic_hd_size, min_num=1
+            )
+        )
         assert target_aspect_ratio[0] * vit_image_size == target_width, (
-            f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}")
+            f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
+        )
         assert target_aspect_ratio[1] * vit_image_size == target_height, (
-            f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}")
-        assert (target_height % vit_image_size == 0
-                and target_width % vit_image_size == 0)
+            f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
+        )
+        assert (
+            target_height % vit_image_size == 0 and target_width % vit_image_size == 0
+        )
 
         padding_height, padding_width = _get_padding_size(
-            orig_width, orig_height, target_height, target_width)
-        assert padding_width == 0 or padding_height == 0, \
+            orig_width, orig_height, target_height, target_width
+        )
+        assert padding_width == 0 or padding_height == 0, (
             "padding_width or padding_height must be 0"
+        )
 
         target_feat_width = target_width // vit_patch_size
         target_feat_height = target_height // vit_patch_size
         if padding_width >= vit_patch_size:
             assert padding_height == 0, "padding_height not 0"
             non_pad_feat_width = target_feat_width - math.floor(
-                padding_width / vit_patch_size)
+                padding_width / vit_patch_size
+            )
             non_pad_feat_height = target_feat_height
         elif padding_height >= vit_patch_size:
             assert padding_width == 0, "padding_width not 0"
             non_pad_feat_height = target_feat_height - math.floor(
-                padding_height / vit_patch_size)
+                padding_height / vit_patch_size
+            )
             non_pad_feat_width = target_feat_width
         else:
             # small padding shorter than a vit patch
@@ -867,22 +917,24 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         num_hd_patch_tokens = feat_width * feat_height
         num_hd_newline_tokens = feat_height
         vit_feature_size = vit_image_size // vit_patch_size
-        num_global_image_tokens = (vit_feature_size //
-                                   token_compression_factor)**2
+        num_global_image_tokens = (vit_feature_size // token_compression_factor) ** 2
         num_sep_tokens = 1
-        num_global_image_newline_tokens = \
-            vit_feature_size // token_compression_factor
-
-        return (num_global_image_tokens + num_sep_tokens +
-                num_hd_patch_tokens + num_hd_newline_tokens +
-                num_global_image_newline_tokens)
+        num_global_image_newline_tokens = vit_feature_size // token_compression_factor
+
+        return (
+            num_global_image_tokens
+            + num_sep_tokens
+            + num_hd_patch_tokens
+            + num_hd_newline_tokens
+            + num_global_image_newline_tokens
+        )
 
     def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> int:
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
@@ -905,7 +957,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(
         self,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> ImageSize:
         vit_image_size = self.get_hf_config().vision_config.image_size
 
@@ -970,7 +1022,6 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
 
 class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -985,28 +1036,34 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        audio_overrides = mm_options.get("audio") if mm_options else None
 
         mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "audio":
-            self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
-                                   num_audios=num_audios),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "audio": self._get_dummy_audios(
+                length=_AUDIO_MAX_SOUNDFILE_SIZE,
+                num_audios=num_audios,
+                overrides=audio_overrides,
+            ),
         }
 
         return mm_data
 
 
 class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
-
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
@@ -1025,29 +1082,29 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
 
         audio_data = mm_data.pop("audios", [])
         if audio_data:
-            mm_data['audio'] = audio_data
+            mm_data["audio"] = audio_data
 
-        processed_outputs = super()._call_hf_processor(prompt, mm_data,
-                                                       mm_kwargs, tok_kwargs)
+        processed_outputs = super()._call_hf_processor(
+            prompt, mm_data, mm_kwargs, tok_kwargs
+        )
 
         if "image_pixel_values" in processed_outputs:
             num_img_tokens = [
-                self.info.get_num_image_tokens(image_width=img_size[0],
-                                               image_height=img_size[1])
+                self.info.get_num_image_tokens(
+                    image_width=img_size[0], image_height=img_size[1]
+                )
                 for img_size in processed_outputs["image_sizes"]
             ]
             processed_outputs["num_img_tokens"] = num_img_tokens
 
         if audio_data:
-            audio_features = processed_outputs['audio_input_features']
+            audio_features = processed_outputs["audio_input_features"]
             sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate
             feature_sizes = [
-                self.info.get_audio_num_frames(len(audio), sr)
-                for audio in audio_data
+                self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data
             ]
-            processed_outputs['audio_input_features'] = [
-                audio_features[idx, :size]
-                for idx, size in enumerate(feature_sizes)
+            processed_outputs["audio_input_features"] = [
+                audio_features[idx, :size] for idx, size in enumerate(feature_sizes)
             ]
 
         return processed_outputs
@@ -1076,12 +1133,12 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
         audio_token_id: int = tokenizer.vocab[tokenizer.audio_token]
 
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        audio_processor = self.info.get_feature_extractor(
-            **hf_processor_mm_kwargs)
+        audio_processor = self.info.get_feature_extractor(**hf_processor_mm_kwargs)
 
         def get_image_replacement_phi4mm(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -1100,9 +1157,9 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
             # TODO(Isotr0py): support embedding inputs
             audio_len = audios.get_audio_length(item_idx)
             audio_frames = self.info.get_audio_num_frames(
-                audio_len, audio_processor.sampling_rate)
-            audio_embed_size = self.info._compute_audio_embed_size(
-                audio_frames)
+                audio_len, audio_processor.sampling_rate
+            )
+            audio_embed_size = self.info._compute_audio_embed_size(audio_frames)
 
             return [audio_token_id] * audio_embed_size
 
@@ -1129,6 +1186,9 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     """
     Implements the Phi-4-multimodal-instruct model in vLLM.
     """
+
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "qkv_proj",
@@ -1156,7 +1216,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|image|>"
         if modality.startswith("audio"):
@@ -1188,12 +1248,14 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[Phi4MMAudioInputs]:
+        self, **kwargs: object
+    ) -> Phi4MMAudioInputs | None:
         """
-        Parse and validate the audio input to the model.  This handles both 
+        Parse and validate the audio input to the model.  This handles both
         audio features and audio embeddings, but only the former is used for
         now.
 
@@ -1210,17 +1272,19 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             return None
 
         if audio_features is not None:
-            return Phi4MMAudioFeatureInputs(type="audio_features",
-                                            data=flatten_bn(audio_features))
+            return Phi4MMAudioFeatureInputs(
+                type="audio_features",
+                audio_features=audio_features,
+            )
 
         if audio_embeds is not None:
-            return Phi4MMAudioEmbeddingInputs(type="audio_embeds",
-                                              data=audio_embeds)
+            return Phi4MMAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds)
 
         raise AssertionError("This line should be unreachable.")
 
-    def _process_audio_input(self, audio_input: Phi4MMAudioInputs,
-                             audio_projection_mode: str) -> NestedTensors:
+    def _process_audio_input(
+        self, audio_input: Phi4MMAudioInputs, audio_projection_mode: str
+    ) -> NestedTensors:
         """
         Create the audio embeddings from the audio input, where the audio input
         is pairs of audio features and audio embed lengths.  The audio input is
@@ -1235,7 +1299,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         if audio_input["type"] == "audio_embeds":
             return audio_input["data"]
 
-        audio_features = audio_input["data"]
+        audio_features = audio_input["audio_features"]
         # (e.g. multiple examples) and the second dim is the multi-audio dim
         # (e.g. multiple audios in the same example)
 
@@ -1244,68 +1308,30 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             self.audio_embed(
                 features.unsqueeze(0).to(dtype),
                 audio_projection_mode=audio_projection_mode,
-            ) for features in audio_features
+            )
+            for features in audio_features
         ]
         return audio_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Phi4MMImagePixelInputs]:
-        image_pixel_values: NestedTensors = kwargs.get("image_pixel_values")
-        if image_pixel_values is None:
+        self, **kwargs: object
+    ) -> Phi4MMImagePixelInputs | None:
+        pixel_values = kwargs.get("image_pixel_values")
+        if pixel_values is None:
             return None
 
         image_sizes = kwargs.get("image_sizes")
         image_attention_mask = kwargs.get("image_attention_mask")
         num_img_tokens = kwargs.get("num_img_tokens")
-        assert image_sizes is not None and image_attention_mask is not None\
-              and num_img_tokens is not None, "Missing image inputs"
-
-        if is_list_of(image_pixel_values, torch.Tensor):
-            assert all(p.dim() == 5
-                       for p in image_pixel_values), "Incorrect image inputs"
-            # list len is batch_size.
-            # each tensor has dimension: num_img_per_example, num_hd_patches,
-            # channels, height, width.
-            # need to pad along num_hd_patches.
-            # mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w.
-            image_pixel_values = cat_with_pad(image_pixel_values, dim=0)
-        elif isinstance(image_pixel_values, torch.Tensor):
-            # dimension: batch_size, num_img_per_example, num_hd_patches,
-            # channels, height, width.
-            # we flatten first 2 dims to make it a single large batch for
-            # SigLIP Encoder.
-            assert image_pixel_values.dim() == 6, "Incorrect image inputs"
-            image_pixel_values = image_pixel_values.flatten(0, 1)
-        else:
-            raise ValueError("Incorrect image_pixel_values inputs")
-
-        if isinstance(image_attention_mask, list):
-            image_attention_mask = cat_with_pad(image_attention_mask, dim=0)
-        elif isinstance(image_attention_mask, torch.Tensor):
-            image_attention_mask = image_attention_mask.flatten(0, 1)
-        else:
-            raise ValueError("Incorrect image_attention_mask inputs")
-
-        if isinstance(image_sizes, list):
-            image_sizes = torch.cat(image_sizes, dim=0)
-        elif isinstance(image_sizes, torch.Tensor):
-            image_sizes = image_sizes.flatten(0, 1)
-        else:
-            raise ValueError("Incorrect image_sizes inputs")
-
-        if isinstance(num_img_tokens, list):
-            num_img_tokens = [
-                n for num_tensor in num_img_tokens
-                for n in num_tensor.tolist()
-            ]
-        elif isinstance(num_img_tokens, torch.Tensor):
-            num_img_tokens = num_img_tokens.flatten(0, 1).tolist()
-        else:
-            raise ValueError("Incorrect num_img_tokens inputs")
+        assert (
+            image_sizes is not None
+            and image_attention_mask is not None
+            and num_img_tokens is not None
+        ), "Missing image inputs"
 
         return Phi4MMImagePixelInputs(
             type="pixel_values",
-            data=image_pixel_values,
+            pixel_values=pixel_values,
             image_sizes=image_sizes,
             image_attention_mask=image_attention_mask,
             num_img_tokens=num_img_tokens,
@@ -1317,36 +1343,38 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("image_pixel_values",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("audio_input_features",
-                             "audio_embeds") and "audios" not in modalities:
-                modalities["audios"] = self._parse_and_validate_audio_input(
-                    **kwargs)
+            if (
+                input_key in ("image_pixel_values", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if (
+                input_key in ("audio_input_features", "audio_embeds")
+                and "audios" not in modalities
+            ):
+                modalities["audios"] = self._parse_and_validate_audio_input(**kwargs)
 
         return modalities
 
     def _process_image_input(
-            self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
+        self, image_input: Phi4MMImagePixelInputs
+    ) -> list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             dtype = next(self.image_embed.parameters()).dtype
-            pixel_values = image_input['data'].to(dtype)
-            image_sizes = image_input['image_sizes']
-            image_attention_mask = image_input['image_attention_mask']
-            image_embeds = self.image_embed(pixel_values, image_sizes,
-                                            image_attention_mask)
+            pixel_values = image_input["pixel_values"].to(dtype)
+            image_sizes = image_input["image_sizes"]
+            image_attention_mask = image_input["image_attention_mask"]
+            image_embeds = self.image_embed(
+                pixel_values, image_sizes, image_attention_mask
+            )
         return image_embeds
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
-            return None
+            return []
 
         # The result multimodal_embeddings is tuple of tensors, with each
         # tensor corresponding to a multimodal data item (image or video).
@@ -1354,90 +1382,34 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
 
         # NOTE: It is important to iterate over the keys in this dictionary
         # to preserve the order of the modalities.
-        audio_projection_mode = 'speech'
+        audio_projection_mode = "speech"
         for modality in modalities:
             # make sure process images first
             if modality == "images":
                 audio_projection_mode = "vision"
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += tuple(vision_embeddings)
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "audios":
                 audio_input = modalities["audios"]
                 audio_embeddings = self._process_audio_input(
-                    audio_input, audio_projection_mode=audio_projection_mode)
+                    audio_input, audio_projection_mode=audio_projection_mode
+                )
                 multimodal_embeddings += tuple(audio_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
-        return inputs_embeds
-
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Phi4MMImagePixelInputs] = None,
-        audio_input: Optional[Phi4MMAudioFeatureInputs] = None,
-    ) -> torch.Tensor:
-        audio_projection_mode = 'speech'
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID,
-            )
-            audio_projection_mode = 'vision'
-
-        if audio_input is not None:
-            audio_embeds = self._process_audio_input(
-                audio_input, audio_projection_mode=audio_projection_mode)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                audio_embeds,
-                placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            audio_input = self._parse_and_validate_audio_input(**kwargs)
-
-            if image_input is None and audio_input is None:
-                inputs_embeds = None
-            else:
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    audio_input=audio_input)
-                input_ids = None
-
         hidden_states = self.language_model(
             input_ids,
             positions,
@@ -1450,11 +1422,10 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -1465,8 +1436,9 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         return MultiModelKeys.from_string_field(
             language_model="language_model.",
             connector=[
-                "img_projection", "vision_speech_projection",
-                "speech_projection"
+                "img_projection",
+                "vision_speech_projection",
+                "speech_projection",
             ],
             tower_model=["image_embed", "audio_embed"],
         )
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 47b5ad55ab2d04bd8fb36bb63f1cc431ef340ef6..8425549a7bd20c15ef2d2d6e4a398cdb403d10ce 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -2,41 +2,59 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, PretrainedConfig, ProcessorMixin,
-                          SequenceFeatureExtractor, SiglipVisionConfig)
+from transformers import (
+    BatchFeature,
+    PretrainedConfig,
+    ProcessorMixin,
+    SequenceFeatureExtractor,
+    SiglipVisionConfig,
+)
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
+    ParallelLMHead,
+)
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
-                                   ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems, MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, ResolvedPromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    ResolvedPromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .idefics2_vision_model import Idefics2VisionTransformer
 from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
 from .phi4mm_audio import AudioEmbedding
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 # <|endoftext10|> (see vocab.json in hf model)
 _IMAGE_PLACEHOLDER_TOKEN_ID = 200010
@@ -47,16 +65,17 @@ _AUDIO_MAX_SOUNDFILE_SIZE = 241_000
 
 SIGLIP_NAME = "siglip-so400m-patch14-448"
 VISION_ENCODER_TO_PROCESSING_CONFIG = {
-    'siglip-so400m-patch14-448': {
-        'vit_image_size': 448,
-        'vit_patch_size': 14,
-        'token_compression_factor': 2,
+    "siglip-so400m-patch14-448": {
+        "vit_image_size": 448,
+        "vit_patch_size": 14,
+        "token_compression_factor": 2,
     },
 }
 
 
-def _get_padding_size(orig_width: int, orig_height: int, target_height: int,
-                      target_width: int):
+def _get_padding_size(
+    orig_width: int, orig_height: int, target_height: int, target_width: int
+):
     ratio_width = target_width / orig_width
     ratio_height = target_height / orig_height
 
@@ -82,8 +101,7 @@ def get_navit_vision_model(layer_idx: int = -1, **kwargs):
 
     model_config = SiglipVisionConfig(**vision_config, **kwargs)
     if layer_idx < 0:
-        num_hidden_layers = model_config.num_hidden_layers \
-            + layer_idx + 1
+        num_hidden_layers = model_config.num_hidden_layers + layer_idx + 1
     else:
         num_hidden_layers = layer_idx + 1
 
@@ -99,38 +117,38 @@ def get_navit_vision_model(layer_idx: int = -1, **kwargs):
 class Phi4MMImageEncoder(nn.Module):
     """Image embedding."""
 
-    def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig],
-                 prefix: str = "",
-                 model_dir: str = "") -> None:
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None,
+        prefix: str = "",
+        model_dir: str = "",
+    ) -> None:
         super().__init__()
 
         # n_embed or hidden_size
-        hidden_size = config.n_embd if hasattr(
-            config, 'n_embd') else config.hidden_size
+        hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
 
         # layer_idx to output the img features
         if isinstance(config.img_processor, dict):
-            self.layer_idx = config.img_processor.get('layer_idx', -2)
-            self.type_feature = config.img_processor.get(
-                'type_feature', 'patch')
+            self.layer_idx = config.img_processor.get("layer_idx", -2)
+            self.type_feature = config.img_processor.get("type_feature", "patch")
         else:
             self.layer_idx = -2
-            self.type_feature = 'patch'
+            self.type_feature = "patch"
 
         self.img_processor = get_navit_vision_model(layer_idx=self.layer_idx)
 
         pe_weight = self.img_processor.embeddings.position_embedding.weight
         L, D = pe_weight.size()
         H = int(math.sqrt(L))
-        assert H**2 == L, f'position embedding size {L} is not square'
+        assert H**2 == L, f"position embedding size {L} is not square"
         if H % 2 != 0:
             self.img_processor_padding = nn.ReflectionPad2d((0, 1, 0, 1))
             H += 1
         image_dim_out = D
         # ((448/14)//2)**2
-        self.num_img_tokens = (H // 2)**2
+        self.num_img_tokens = (H // 2) ** 2
         self.base_feat_height_target = H
 
         self.image_dim_out = image_dim_out
@@ -145,37 +163,35 @@ class Phi4MMImageEncoder(nn.Module):
         self.crop_size = 448
 
         # image token compression
-        self.image_token_compression_cls = 'avg_pool_2d'
+        self.image_token_compression_cls = "avg_pool_2d"
         self.image_token_compression = nn.AvgPool2d(kernel_size=2, stride=2)
         self.base_feat_height_reduction = 1
         self.base_feat_height_target = self.base_feat_height_target // 2
 
         # with_hd_transform and with_learnable_separator should have same value
-        assert self.use_hd_transform == self.with_learnable_separator, \
-        'use_hd_transform and with_learnable_separator should have same value'
-        assert self.use_hd_transform, \
-            'learnable separator is only for hd transform'
+        assert self.use_hd_transform == self.with_learnable_separator, (
+            "use_hd_transform and with_learnable_separator should have same value"
+        )
+        assert self.use_hd_transform, "learnable separator is only for hd transform"
         # 1024 * 4, merge spatial to channel dimension
         self.glb_GN = nn.Parameter(
-            torch.zeros([
-                1, 1, self.image_dim_out * self.base_feat_height_reduction**2
-            ]))
+            torch.zeros([1, 1, self.image_dim_out * self.base_feat_height_reduction**2])
+        )
         self.sub_GN = nn.Parameter(
-            torch.zeros([
-                1, 1, 1,
-                self.image_dim_out * self.base_feat_height_reduction**2
-            ]))
+            torch.zeros(
+                [1, 1, 1, self.image_dim_out * self.base_feat_height_reduction**2]
+            )
+        )
 
         dim_projection = hidden_size
         depth = 2
         layers = [
-            nn.Linear(image_dim_out * self.base_feat_height_reduction**2,
-                      dim_projection)
+            nn.Linear(
+                image_dim_out * self.base_feat_height_reduction**2, dim_projection
+            )
         ]
         for _ in range(1, depth):
-            layers.extend(
-                [nn.GELU(),
-                 nn.Linear(dim_projection, dim_projection)])
+            layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
         self.img_projection = nn.Sequential(*layers)
 
         self.vocab_size = config.vocab_size
@@ -183,24 +199,24 @@ class Phi4MMImageEncoder(nn.Module):
 
         self.use_out_place_operations = False
 
-    def get_img_features(self,
-                         img_embeds: torch.FloatTensor,
-                         attention_mask=None) -> torch.FloatTensor:
-
-        img_feature = self.img_processor(img_embeds,
-                                         patch_attention_mask=attention_mask)
+    def get_img_features(
+        self, img_embeds: torch.FloatTensor, attention_mask=None
+    ) -> torch.FloatTensor:
+        img_feature = self.img_processor(
+            img_embeds, patch_attention_mask=attention_mask
+        )
 
         if self.type_feature == "patch":
             patch_feature = img_feature
 
             use_token_compression = self.image_token_compression is not None
-            use_padding = getattr(self, 'img_processor_padding',
-                                  None) is not None
+            use_padding = getattr(self, "img_processor_padding", None) is not None
             if use_token_compression or use_padding:
                 # reshape to 2D tensor
                 width = int(math.sqrt(patch_feature.size(1)))
-                patch_feature = patch_feature.view(-1, width, width,
-                                                   patch_feature.size(-1))
+                patch_feature = patch_feature.view(
+                    -1, width, width, patch_feature.size(-1)
+                )
                 # convert to NCHW
                 patch_feature = patch_feature.permute(0, 3, 1, 2)
 
@@ -214,15 +230,19 @@ class Phi4MMImageEncoder(nn.Module):
                 patch_feature = patch_feature.view(
                     -1,
                     patch_feature.size(1) * patch_feature.size(2),
-                    patch_feature.size(-1))
+                    patch_feature.size(-1),
+                )
 
             return patch_feature
 
         raise NotImplementedError
 
-    def forward(self, pixel_values: torch.FloatTensor,
-                image_sizes: torch.Tensor,
-                image_attention_mask: torch.Tensor) -> list[torch.FloatTensor]:
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        image_attention_mask: torch.Tensor,
+    ) -> list[torch.FloatTensor]:
         """
         process image and return vision embeddings.
 
@@ -251,25 +271,27 @@ class Phi4MMImageEncoder(nn.Module):
 
         img_features = self.get_img_features(
             pixel_values,
-            image_attention_mask.type(torch.BoolTensor).flatten(
-                0, 1).to(target_device))
+            image_attention_mask.type(torch.BoolTensor).flatten(0, 1).to(target_device),
+        )
 
         base_feat_height_target = self.base_feat_height_target
         base_resolution = self.crop_size
         base_feat_height_reduction = self.base_feat_height_reduction
 
-        base_feat_height = base_feat_width = int(np.sqrt(
-            img_features.shape[1]))
-        assert base_feat_height == base_feat_height_target \
-            and base_feat_width == base_feat_height_target, \
-                (f"base_feat_height: {base_feat_height}, "
-                 f"base_feat_width: {base_feat_width}, "
-                 f"expect {base_feat_height_target} features for hd transform")
+        base_feat_height = base_feat_width = int(np.sqrt(img_features.shape[1]))
+        assert (
+            base_feat_height == base_feat_height_target
+            and base_feat_width == base_feat_height_target
+        ), (
+            f"base_feat_height: {base_feat_height}, "
+            f"base_feat_width: {base_feat_width}, "
+            f"expect {base_feat_height_target} features for hd transform"
+        )
 
         # bs x max_num_crops x (24x24) x C
-        img_features = img_features.view(bs, -1,
-                                         base_feat_height * base_feat_width,
-                                         self.image_dim_out)
+        img_features = img_features.view(
+            bs, -1, base_feat_height * base_feat_width, self.image_dim_out
+        )
         C = self.image_dim_out
         H = base_feat_height
 
@@ -288,22 +310,32 @@ class Phi4MMImageEncoder(nn.Module):
             global_img_feature = img_features[_bs, :1]
 
             # 1 x 12 x 12 x 4096
-            glb_img = global_img_feature.reshape(1, H, H, C).reshape(
-                1, H // base_feat_height_reduction, base_feat_height_reduction,
-                H // base_feat_height_reduction, base_feat_height_reduction,
-                C).contiguous().permute(0, 1, 3, 2, 4, 5).reshape(
-                    1, H // base_feat_height_reduction,
+            glb_img = (
+                global_img_feature.reshape(1, H, H, C)
+                .reshape(
+                    1,
                     H // base_feat_height_reduction,
-                    base_feat_height_reduction * base_feat_height_reduction *
-                    C).contiguous()
-            temp_glb_GN = self.sub_GN.repeat(1,
-                                             H // base_feat_height_reduction,
-                                             1, 1)
+                    base_feat_height_reduction,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    C,
+                )
+                .contiguous()
+                .permute(0, 1, 3, 2, 4, 5)
+                .reshape(
+                    1,
+                    H // base_feat_height_reduction,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction * base_feat_height_reduction * C,
+                )
+                .contiguous()
+            )
+            temp_glb_GN = self.sub_GN.repeat(1, H // base_feat_height_reduction, 1, 1)
 
             # 1 x 156 x 4096
             glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(
-                1, -1,
-                base_feat_height_reduction * base_feat_height_reduction * C)
+                1, -1, base_feat_height_reduction * base_feat_height_reduction * C
+            )
 
             # (max_num_crops-1) x (12x12) x C
             sub_img = img_features[_bs, 1:]
@@ -313,79 +345,106 @@ class Phi4MMImageEncoder(nn.Module):
 
             # (num_crops, 12, 2, 12, 2, 1024) ->
             # (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
-            sub_img = sub_img.reshape(B_, H, H, C).reshape(
-                B_, H // base_feat_height_reduction,
-                base_feat_height_reduction, H // base_feat_height_reduction,
-                base_feat_height_reduction,
-                C).contiguous().permute(0, 1, 3, 2, 4, 5).reshape(
-                    B_, -1, base_feat_height_reduction *
-                    base_feat_height_reduction * C).contiguous()
-            sub_img = sub_img.reshape(
-                1, h, w, base_feat_height // base_feat_height_reduction,
-                base_feat_width // base_feat_height_reduction,
-                -1).permute(0, 1, 3, 2, 4, 5).reshape(
-                    1, h * base_feat_height // base_feat_height_reduction,
+            sub_img = (
+                sub_img.reshape(B_, H, H, C)
+                .reshape(
+                    B_,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    C,
+                )
+                .contiguous()
+                .permute(0, 1, 3, 2, 4, 5)
+                .reshape(
+                    B_, -1, base_feat_height_reduction * base_feat_height_reduction * C
+                )
+                .contiguous()
+            )
+            sub_img = (
+                sub_img.reshape(
+                    1,
+                    h,
+                    w,
+                    base_feat_height // base_feat_height_reduction,
+                    base_feat_width // base_feat_height_reduction,
+                    -1,
+                )
+                .permute(0, 1, 3, 2, 4, 5)
+                .reshape(
+                    1,
+                    h * base_feat_height // base_feat_height_reduction,
                     w * base_feat_width // base_feat_height_reduction,
-                    base_feat_height_reduction * base_feat_height_reduction *
-                    C)
-
-            if image_attention_mask is not None and len(
-                    image_attention_mask) > 0:
-                reshaped_image_attention_mask = image_attention_mask[
-                    _bs, 1:B_ + 1, 0::2, 0::2].reshape(
-                        1, h, w,
+                    base_feat_height_reduction * base_feat_height_reduction * C,
+                )
+            )
+
+            if image_attention_mask is not None and len(image_attention_mask) > 0:
+                reshaped_image_attention_mask = (
+                    image_attention_mask[_bs, 1 : B_ + 1, 0::2, 0::2]
+                    .reshape(
+                        1,
+                        h,
+                        w,
                         base_feat_height // base_feat_height_reduction,
-                        base_feat_width // base_feat_height_reduction).permute(
-                            0, 1, 3, 2, 4).reshape(
-                                1, h * base_feat_height //
-                                base_feat_height_reduction, w *
-                                base_feat_width // base_feat_height_reduction)
-                useful_height = int(
-                    reshaped_image_attention_mask[0, :, 0].sum().item())
-                useful_width = int(
-                    reshaped_image_attention_mask[0, 0, :].sum().item())
+                        base_feat_width // base_feat_height_reduction,
+                    )
+                    .permute(0, 1, 3, 2, 4)
+                    .reshape(
+                        1,
+                        h * base_feat_height // base_feat_height_reduction,
+                        w * base_feat_width // base_feat_height_reduction,
+                    )
+                )
+                useful_height = int(reshaped_image_attention_mask[0, :, 0].sum().item())
+                useful_width = int(reshaped_image_attention_mask[0, 0, :].sum().item())
                 sub_img = sub_img[:, :useful_height, :useful_width]
                 temp_sub_GN = self.sub_GN.repeat(1, useful_height, 1, 1)
-                temp_len = int(
-                    image_attention_mask[_bs, :B_ + 1, 0::2, 0::2].sum().item(
-                    )) + (useful_height +
-                          1) + base_feat_height // base_feat_height_reduction
+                temp_len = (
+                    int(image_attention_mask[_bs, : B_ + 1, 0::2, 0::2].sum().item())
+                    + (useful_height + 1)
+                    + base_feat_height // base_feat_height_reduction
+                )
             else:
                 temp_sub_GN = self.sub_GN.repeat(
-                    1, h * base_feat_height // base_feat_height_reduction, 1,
-                    1)
-                temp_len = int((h * w + 1) * self.num_img_tokens + 1 +
-                               (h + 1) * base_feat_height //
-                               base_feat_height_reduction)
+                    1, h * base_feat_height // base_feat_height_reduction, 1, 1
+                )
+                temp_len = int(
+                    (h * w + 1) * self.num_img_tokens
+                    + 1
+                    + (h + 1) * base_feat_height // base_feat_height_reduction
+                )
 
             sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(
-                1, -1,
-                base_feat_height_reduction * base_feat_height_reduction * C)
+                1, -1, base_feat_height_reduction * base_feat_height_reduction * C
+            )
             # (1, num_img_tokens, 1024*4)
 
             # glb + sub
-            if self.hd_transform_order == 'glb_sub':
-                output_imgs.append(
-                    torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
-            elif self.hd_transform_order == 'sub_glb':
-                output_imgs.append(
-                    torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
+            if self.hd_transform_order == "glb_sub":
+                output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
+            elif self.hd_transform_order == "sub_glb":
+                output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
             else:
                 raise NotImplementedError(
                     f'hd_transform_order = {self.hd_transform_order}, "\
-                        "not implemented')
+                        "not implemented'
+                )
 
-            #temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
-            assert temp_len == output_imgs[-1].shape[
-                1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: "\
+            # temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
+            assert temp_len == output_imgs[-1].shape[1], (
+                f'temp_len: {temp_len}, output_imgs[-1].shape[1]: "\
                     "{output_imgs[-1].shape[1]}'
+            )
 
             output_len.append(temp_len)
 
         img_set_tensor = []
         for _output_img in output_imgs:
             img_feature_proj = self.img_projection(
-                _output_img.to(target_device).to(target_dtype))
+                _output_img.to(target_device).to(target_dtype)
+            )
             img_set_tensor.append(img_feature_proj.squeeze(0))
 
         return img_set_tensor
@@ -406,10 +465,11 @@ class Phi4MMImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values"]
 
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("bn", "p", 3, "h", "w", dynamic_dims={"p"}
-                    ),  # may be different per batch and image
+    pixel_values: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape(
+            "bn", "p", 3, "h", "w", dynamic_dims={"p"}
+        ),  # may be different per batch and image
     ]
 
     image_sizes: Annotated[
@@ -437,8 +497,8 @@ class Phi4MMAudioFeatureInputs(TensorSchema):
 
     type: Literal["audio_features"]
 
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+    audio_features: Annotated[
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "t", 80, dynamic_dims={"t"}),
     ]
 
@@ -451,6 +511,7 @@ class Phi4MMAudioEmbeddingInputs(TensorSchema):
         - f: Audio feature size
         - h: Hidden size (must match language model backbone)
     """
+
     type: Literal["audio_embeds"]
     data: Annotated[
         NestedTensors,
@@ -458,7 +519,7 @@ class Phi4MMAudioEmbeddingInputs(TensorSchema):
     ]
 
 
-Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
+Phi4MMAudioInputs: TypeAlias = Phi4MMAudioFeatureInputs | Phi4MMAudioEmbeddingInputs
 
 
 def cat_with_pad(tensors, dim, padding_value=0):
@@ -466,9 +527,9 @@ def cat_with_pad(tensors, dim, padding_value=0):
     cat along dim, while pad to max for all other dims
     """
     ndim = tensors[0].dim()
-    assert all(
-        t.dim() == ndim for t in
-        tensors[1:]), "All tensors must have the same number of dimensions"
+    assert all(t.dim() == ndim for t in tensors[1:]), (
+        "All tensors must have the same number of dimensions"
+    )
 
     out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
     out_size[dim] = sum(t.shape[dim] for t in tensors)
@@ -488,29 +549,27 @@ def cat_with_pad(tensors, dim, padding_value=0):
 
 
 class Phi4MMProcessingInfo(BaseProcessingInfo):
-
     @property
     def image_tokens(self) -> list[str]:
-        return [f"<|image_{i+1}|>" for i in range(100)]
+        return [f"<|image_{i + 1}|>" for i in range(100)]
 
     @property
     def audio_tokens(self) -> list[str]:
-        return [f"<|audio_{i+1}|>" for i in range(100)]
+        return [f"<|audio_{i + 1}|>" for i in range(100)]
 
     def get_dynamic_hd(
         self,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
         image_processor = processor.image_processor
         return image_processor.dynamic_hd
 
-    def get_feature_extractor(self,
-                              **kwargs: object) -> SequenceFeatureExtractor:
+    def get_feature_extractor(self, **kwargs: object) -> SequenceFeatureExtractor:
         return self.get_hf_processor(**kwargs).audio_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None}
 
     def _find_target_aspect_ratio(
@@ -527,9 +586,12 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
             aspect_ratio = orig_width / orig_height
 
             # calculate the existing image aspect ratio
-            target_ratios = set((i, j) for i in range(1, max_num + 1)
-                                for j in range(1, max_num + 1)
-                                if i * j <= max_num and i * j >= min_num)
+            target_ratios = set(
+                (i, j)
+                for i in range(1, max_num + 1)
+                for j in range(1, max_num + 1)
+                if i * j <= max_num and i * j >= min_num
+            )
             target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 
             # find the closest aspect ratio to the target
@@ -562,49 +624,56 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
     ):
         """
         compute the number of tokens an image is expected to take up considering
-        the image encoder architecture and exclude output features containing 
+        the image encoder architecture and exclude output features containing
         only padding pixels
 
-        for siglip, vit_image_size=448, vit_patch_size=14, so output will be 
+        for siglip, vit_image_size=448, vit_patch_size=14, so output will be
         32x32 feature map
         NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
         """
         assert vit_image_size % vit_patch_size == 0, (
-            "vit_image_size must be divisible by vit_patch_size")
-        assert (vit_image_size // vit_patch_size %
-                token_compression_factor == 0), (
-                    "vit_image_size // vit_patch_size must be divisible by "
-                    "token_compression_factor")
+            "vit_image_size must be divisible by vit_patch_size"
+        )
+        assert vit_image_size // vit_patch_size % token_compression_factor == 0, (
+            "vit_image_size // vit_patch_size must be divisible by "
+            "token_compression_factor"
+        )
 
         target_aspect_ratio, target_height, target_width = (
-            self._find_target_aspect_ratio(orig_width,
-                                           orig_height,
-                                           vit_image_size,
-                                           dynamic_hd_size,
-                                           min_num=1))
+            self._find_target_aspect_ratio(
+                orig_width, orig_height, vit_image_size, dynamic_hd_size, min_num=1
+            )
+        )
         assert target_aspect_ratio[0] * vit_image_size == target_width, (
-            f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}")
+            f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
+        )
         assert target_aspect_ratio[1] * vit_image_size == target_height, (
-            f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}")
-        assert (target_height % vit_image_size == 0
-                and target_width % vit_image_size == 0)
+            f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
+        )
+        assert (
+            target_height % vit_image_size == 0 and target_width % vit_image_size == 0
+        )
 
         padding_height, padding_width = _get_padding_size(
-            orig_width, orig_height, target_height, target_width)
-        assert padding_width == 0 or padding_height == 0, \
+            orig_width, orig_height, target_height, target_width
+        )
+        assert padding_width == 0 or padding_height == 0, (
             "padding_width or padding_height must be 0"
+        )
 
         target_feat_width = target_width // vit_patch_size
         target_feat_height = target_height // vit_patch_size
         if padding_width >= vit_patch_size:
             assert padding_height == 0, "padding_height not 0"
             non_pad_feat_width = target_feat_width - math.floor(
-                padding_width / vit_patch_size)
+                padding_width / vit_patch_size
+            )
             non_pad_feat_height = target_feat_height
         elif padding_height >= vit_patch_size:
             assert padding_width == 0, "padding_width not 0"
             non_pad_feat_height = target_feat_height - math.floor(
-                padding_height / vit_patch_size)
+                padding_height / vit_patch_size
+            )
             non_pad_feat_width = target_feat_width
         else:
             # small padding shorter than a vit patch
@@ -621,32 +690,33 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         num_hd_patch_tokens = feat_width * feat_height
         num_hd_newline_tokens = feat_height
         vit_feature_size = vit_image_size // vit_patch_size
-        num_global_image_tokens = (vit_feature_size //
-                                   token_compression_factor)**2
+        num_global_image_tokens = (vit_feature_size // token_compression_factor) ** 2
         num_sep_tokens = 1
-        num_global_image_newline_tokens = \
-            vit_feature_size // token_compression_factor
-
-        return (num_global_image_tokens + num_sep_tokens +
-                num_hd_patch_tokens + num_hd_newline_tokens +
-                num_global_image_newline_tokens)
+        num_global_image_newline_tokens = vit_feature_size // token_compression_factor
+
+        return (
+            num_global_image_tokens
+            + num_sep_tokens
+            + num_hd_patch_tokens
+            + num_hd_newline_tokens
+            + num_global_image_newline_tokens
+        )
 
     def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> int:
         hf_config = self.get_hf_config()
         vision_encoder_name = hf_config.img_processor
         if vision_encoder_name is None:
             vision_encoder_name = SIGLIP_NAME
-        prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[
-            vision_encoder_name]
-        vit_image_size = prepro_config['vit_image_size']
-        vit_patch_size = prepro_config['vit_patch_size']
-        token_compression_factor = prepro_config['token_compression_factor']
+        prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
+        vit_image_size = prepro_config["vit_image_size"]
+        vit_patch_size = prepro_config["vit_patch_size"]
+        token_compression_factor = prepro_config["token_compression_factor"]
 
         dynamic_hd_size = self.get_dynamic_hd(processor=processor)
 
@@ -663,15 +733,14 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(
         self,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> ImageSize:
         hf_config = self.get_hf_config()
         vision_encoder_name = hf_config.img_processor
         if vision_encoder_name is None:
             vision_encoder_name = SIGLIP_NAME
-        prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[
-            vision_encoder_name]
-        vit_image_size = prepro_config['vit_image_size']
+        prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
+        vit_image_size = prepro_config["vit_image_size"]
 
         max_side = vit_image_size * self.get_dynamic_hd(processor=processor)
         return ImageSize(height=max_side, width=vit_image_size)
@@ -717,8 +786,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         compression rate.
         """
         hf_config = self.get_hf_config()
-        compression_rate = hf_config.embd_layer['audio_embd_layer'][
-            'compression_rate']
+        compression_rate = hf_config.embd_layer["audio_embd_layer"]["compression_rate"]
         # NOTE: this is a hard-coded value but might be configurable
         # in the future
         qformer_compression_rate = 1
@@ -736,7 +804,6 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
 
 class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -750,32 +817,39 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        audio_overrides = mm_options.get("audio") if mm_options else None
 
         mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "audio":
-            self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
-                                   num_audios=num_audios),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "audio": self._get_dummy_audios(
+                length=_AUDIO_MAX_SOUNDFILE_SIZE,
+                num_audios=num_audios,
+                overrides=audio_overrides,
+            ),
         }
 
         return mm_data
 
 
 class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
-
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
-        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate,
-                                    audio_resample_method="scipy")
+        return MultiModalDataParser(
+            target_sr=feature_extractor.sampling_rate, audio_resample_method="scipy"
+        )
 
     def _call_hf_processor(
         self,
@@ -790,27 +864,27 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
         sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate
-        if (audio_data := mm_data.get("audios", [])):
-            mm_data['audios'] = [(data, sr) for data in audio_data]
+        if audio_data := mm_data.get("audios", []):
+            mm_data["audios"] = [(data, sr) for data in audio_data]
 
-        processed_outputs = super()._call_hf_processor(prompt, mm_data,
-                                                       mm_kwargs, tok_kwargs)
+        processed_outputs = super()._call_hf_processor(
+            prompt, mm_data, mm_kwargs, tok_kwargs
+        )
 
         num_img_tokens = [
-            self.info.get_num_image_tokens(image_width=img_size[0],
-                                           image_height=img_size[1])
+            self.info.get_num_image_tokens(
+                image_width=img_size[0], image_height=img_size[1]
+            )
             for img_size in processed_outputs["image_sizes"]
         ]
         processed_outputs["num_img_tokens"] = num_img_tokens
 
-        audio_features = processed_outputs['input_audio_embeds']
+        audio_features = processed_outputs["input_audio_embeds"]
         feature_sizes = [
-            self.info.get_audio_num_frames(len(audio), sr)
-            for audio in audio_data
+            self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data
         ]
-        processed_outputs['input_audio_embeds'] = [
-            audio_features[idx, :size]
-            for idx, size in enumerate(feature_sizes)
+        processed_outputs["input_audio_embeds"] = [
+            audio_features[idx, :size] for idx, size in enumerate(feature_sizes)
         ]
 
         return processed_outputs
@@ -836,13 +910,13 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
     ) -> Sequence[PromptUpdate]:
         image_tokens: list[str] = self.info.image_tokens  # type: ignore
         audio_tokens: list[str] = self.info.audio_tokens  # type: ignore
-        feature_extractor = self.info.get_feature_extractor(
-            **hf_processor_mm_kwargs)
+        feature_extractor = self.info.get_feature_extractor(**hf_processor_mm_kwargs)
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
         def get_image_replacement_phi4mm(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_image_tokens = images.get_feature_size(item_idx)
@@ -861,9 +935,9 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
             # TODO(Isotr0py): support embedding inputs
             audio_len = audios.get_audio_length(item_idx)
             audio_frames = self.info.get_audio_num_frames(
-                audio_len, feature_extractor.sampling_rate)
-            audio_embed_size = self.info._compute_audio_embed_size(
-                audio_frames)
+                audio_len, feature_extractor.sampling_rate
+            )
+            audio_embed_size = self.info._compute_audio_embed_size(audio_frames)
 
             return [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
 
@@ -909,6 +983,9 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     """
     Implements the Phi-4-multimodal-instruct model in vLLM.
     """
+
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "qkv_proj",
@@ -923,17 +1000,15 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             "base_layer.": "",
         },
         orig_to_new_prefix={
-            "model.embed_tokens_extend.audio_embed.audio_projection.vision.":
-            "embed_tokens_extend.audio_projection_for_vision.",
-            "model.embed_tokens_extend.audio_embed.audio_projection.speech.":
-            "embed_tokens_extend.audio_projection.",
+            "model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",  # noqa: E501
+            "model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",  # noqa: E501
             "model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
             "model.embed_tokens_extend.image_embed.": "vision_encoder.",
         },
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return f"<|image_{i}|>"
         if modality.startswith("audio"):
@@ -947,27 +1022,24 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         multimodal_config = vllm_config.model_config.multimodal_config
         assert multimodal_config, "multimodal_config is required"
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.multimodal_config = multimodal_config
         self.quant_config = quant_config
-        self.lora_config = lora_config
 
         # Tensor/Pipeline parallel not supported for now.
-        assert get_pp_group(
-        ).world_size == 1, "pipeline parallel is not supported"
+        assert get_pp_group().world_size == 1, "pipeline parallel is not supported"
 
         self.vision_encoder = Phi4MMImageEncoder(
             config,
             quant_config,
             prefix="model.vision_embed_tokens",
-            model_dir=config._name_or_path)
+            model_dir=config._name_or_path,
+        )
 
         if isinstance(config.embd_layer["audio_embd_layer"], dict):
             embedding_config = {
-                "embedding_cls":
-                config.embd_layer["audio_embd_layer"]["embedding_cls"],
+                "embedding_cls": config.embd_layer["audio_embd_layer"]["embedding_cls"],
                 **config.embd_layer["audio_embd_layer"],
             }
         else:
@@ -976,30 +1048,26 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             }
 
         self.embed_tokens_extend = AudioEmbedding(config, **embedding_config)
-        self.model = LlamaModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = LlamaModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         if config.tie_word_embeddings:
             self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size, logit_scale)
+        self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[Phi4MMAudioInputs]:
+        self, **kwargs: object
+    ) -> Phi4MMAudioInputs | None:
         """
-        Parse and validate the audio input to the model.  This handles both 
+        Parse and validate the audio input to the model.  This handles both
         audio features and audio embeddings, but only the former is used for
         now.
 
@@ -1016,17 +1084,19 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             return None
 
         if audio_features is not None:
-            return Phi4MMAudioFeatureInputs(type="audio_features",
-                                            data=flatten_bn(audio_features))
+            return Phi4MMAudioFeatureInputs(
+                type="audio_features",
+                audio_features=audio_features,
+            )
 
         if audio_embeds is not None:
-            return Phi4MMAudioEmbeddingInputs(type="audio_embeds",
-                                              data=audio_embeds)
+            return Phi4MMAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds)
 
         raise AssertionError("This line should be unreachable.")
 
-    def _process_audio_input(self, audio_input: Phi4MMAudioInputs,
-                             audio_projection_mode: str) -> NestedTensors:
+    def _process_audio_input(
+        self, audio_input: Phi4MMAudioInputs, audio_projection_mode: str
+    ) -> NestedTensors:
         """
         Create the audio embeddings from the audio input, where the audio input
         is pairs of audio features and audio embed lengths.  The audio input is
@@ -1041,7 +1111,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         if audio_input["type"] == "audio_embeds":
             return audio_input["data"]
 
-        audio_features = audio_input["data"]
+        audio_features = audio_input["audio_features"]
         # (e.g. multiple examples) and the second dim is the multi-audio dim
         # (e.g. multiple audios in the same example)
 
@@ -1050,68 +1120,30 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             self.embed_tokens_extend(
                 features.to(dtype),
                 audio_projection_mode=audio_projection_mode,
-            ) for features in audio_features
+            )
+            for features in audio_features
         ]
         return audio_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Phi4MMImagePixelInputs]:
-        input_image_embeds: NestedTensors = kwargs.get("input_image_embeds")
-        if input_image_embeds is None:
+        self, **kwargs: object
+    ) -> Phi4MMImagePixelInputs | None:
+        pixel_values = kwargs.get("input_image_embeds")
+        if pixel_values is None:
             return None
 
         image_sizes = kwargs.get("image_sizes")
         image_attention_mask = kwargs.get("image_attention_mask")
         num_img_tokens = kwargs.get("num_img_tokens")
-        assert image_sizes is not None and image_attention_mask is not None\
-              and num_img_tokens is not None, "Missing image inputs"
-
-        if is_list_of(input_image_embeds, torch.Tensor):
-            assert all(p.dim() == 5
-                       for p in input_image_embeds), "Incorrect image inputs"
-            # list len is batch_size.
-            # each tensor has dimension: num_img_per_example, num_hd_patches,
-            # channels, height, width.
-            # need to pad along num_hd_patches.
-            # mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w.
-            input_image_embeds = cat_with_pad(input_image_embeds, dim=0)
-        elif isinstance(input_image_embeds, torch.Tensor):
-            # dimension: batch_size, num_img_per_example, num_hd_patches,
-            # channels, height, width.
-            # we flatten first 2 dims to make it a single large batch for
-            # SigLIP Encoder.
-            assert input_image_embeds.dim() == 6, "Incorrect image inputs"
-            input_image_embeds = input_image_embeds.flatten(0, 1)
-        else:
-            raise ValueError("Incorrect input_image_embeds inputs")
-
-        if isinstance(image_attention_mask, list):
-            image_attention_mask = cat_with_pad(image_attention_mask, dim=0)
-        elif isinstance(image_attention_mask, torch.Tensor):
-            image_attention_mask = image_attention_mask.flatten(0, 1)
-        else:
-            raise ValueError("Incorrect image_attention_mask inputs")
-
-        if isinstance(image_sizes, list):
-            image_sizes = torch.cat(image_sizes, dim=0)
-        elif isinstance(image_sizes, torch.Tensor):
-            image_sizes = image_sizes.flatten(0, 1)
-        else:
-            raise ValueError("Incorrect image_sizes inputs")
-
-        if isinstance(num_img_tokens, list):
-            num_img_tokens = [
-                n for num_tensor in num_img_tokens
-                for n in num_tensor.tolist()
-            ]
-        elif isinstance(num_img_tokens, torch.Tensor):
-            num_img_tokens = num_img_tokens.flatten(0, 1).tolist()
-        else:
-            raise ValueError("Incorrect num_img_tokens inputs")
+        assert (
+            image_sizes is not None
+            and image_attention_mask is not None
+            and num_img_tokens is not None
+        ), "Missing image inputs"
 
         return Phi4MMImagePixelInputs(
             type="pixel_values",
-            data=input_image_embeds,
+            pixel_values=pixel_values,
             image_sizes=image_sizes,
             image_attention_mask=image_attention_mask,
             num_img_tokens=num_img_tokens,
@@ -1123,35 +1155,35 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("input_image_embeds",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("input_audio_embeds",
-                             "audio_embeds") and "audios" not in modalities:
-                modalities["audios"] = self._parse_and_validate_audio_input(
-                    **kwargs)
+            if (
+                input_key in ("input_image_embeds", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if (
+                input_key in ("input_audio_embeds", "audio_embeds")
+                and "audios" not in modalities
+            ):
+                modalities["audios"] = self._parse_and_validate_audio_input(**kwargs)
 
         return modalities
 
     def _process_image_input(
-            self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
-
+        self, image_input: Phi4MMImagePixelInputs
+    ) -> list[torch.Tensor]:
         dtype = next(self.vision_encoder.parameters()).dtype
-        pixel_values = image_input['data'].to(dtype)
-        image_sizes = image_input['image_sizes']
-        image_attention_mask = image_input['image_attention_mask']
-        image_embeds = self.vision_encoder(pixel_values, image_sizes,
-                                           image_attention_mask)
+        pixel_values = image_input["pixel_values"].to(dtype)
+        image_sizes = image_input["image_sizes"]
+        image_attention_mask = image_input["image_attention_mask"]
+        image_embeds = self.vision_encoder(
+            pixel_values, image_sizes, image_attention_mask
+        )
         return image_embeds
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
-            return None
 
         # The result multimodal_embeddings is tuple of tensors, with each
         # tensor corresponding to a multimodal data item (image or video).
@@ -1159,91 +1191,34 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
 
         # NOTE: It is important to iterate over the keys in this dictionary
         # to preserve the order of the modalities.
-        audio_projection_mode = 'speech'
+        audio_projection_mode = "speech"
         for modality in modalities:
             # make sure process images first
             if modality == "images":
                 audio_projection_mode = "vision"
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += tuple(vision_embeddings)
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "audios":
                 audio_input = modalities["audios"]
                 audio_embeddings = self._process_audio_input(
-                    audio_input, audio_projection_mode=audio_projection_mode)
+                    audio_input, audio_projection_mode=audio_projection_mode
+                )
                 multimodal_embeddings += tuple(audio_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.model.embed_tokens(input_ids)
-        if multimodal_embeddings is not None and len(
-                multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
-        return inputs_embeds
-
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Phi4MMImagePixelInputs] = None,
-        audio_input: Optional[Phi4MMAudioFeatureInputs] = None,
-    ) -> torch.Tensor:
-        audio_projection_mode = 'speech'
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID,
-            )
-            audio_projection_mode = 'vision'
-
-        if audio_input is not None:
-            audio_embeds = self._process_audio_input(
-                audio_input, audio_projection_mode=audio_projection_mode)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                audio_embeds,
-                placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            audio_input = self._parse_and_validate_audio_input(**kwargs)
-
-            if image_input is None and audio_input is None:
-                inputs_embeds = None
-            else:
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    audio_input=audio_input)
-                input_ids = None
-
         hidden_states = self.model(
             input_ids,
             positions,
@@ -1256,12 +1231,11 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> None:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None:
         loader = AutoWeightsLoader(self, skip_substrs=["lora"])
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index a1c452053ddd2918e1fe190c9f89dc9a26714661..493fdb465fbad4d40eb370fe8d90fe315f94caa9 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -7,22 +7,31 @@
 #!/usr/bin/env python3
 import abc
 import math
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import Tensor, nn
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    CheckpointWrapper)
-from torch.distributed.fsdp.fully_sharded_data_parallel import (
-    FullyShardedDataParallel)
+    CheckpointWrapper,
+)
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
 from transformers import PretrainedConfig
 
 from vllm.model_executor.models.phi4mm_utils import (
-    AbsolutePositionalEncoding, ConvModule, FeedForward, MeanVarianceNormLayer,
-    MultiHeadedAttention, MultiSequential, NemoConvSubsampling,
-    T5RelativeAttentionLogitBias, adaptive_enc_mask, get_offset, unfold_tensor)
+    AbsolutePositionalEncoding,
+    ConvModule,
+    FeedForward,
+    MeanVarianceNormLayer,
+    MultiHeadedAttention,
+    MultiSequential,
+    NemoConvSubsampling,
+    T5RelativeAttentionLogitBias,
+    adaptive_enc_mask,
+    get_offset,
+    unfold_tensor,
+)
 
 _AUDIO_PLACEHOLDER_TOKEN_ID = 200011  # <|endoftext11|>
 
@@ -40,9 +49,9 @@ class ConformerEncoderLayer(nn.Module):
             if > 0, ext_pw_out_channel is a dim channel size
              for the last pointwise conv after swish activation.
         depthwise_seperable_out_channel: int
-            if set different to 0, the number of 
+            if set different to 0, the number of
              depthwise_seperable_out_channel will be used as a
-             channel_out of the second conv1d layer. 
+             channel_out of the second conv1d layer.
              otherwise, it equals to 0, the second conv1d layer is skipped.
         depthwise_multiplier: int
             number of input_dim channels duplication. this value
@@ -119,10 +128,10 @@ class ConformerEncoderLayer(nn.Module):
              and allow the onnx conversion for inference.
               default False.
         use_pt_scaled_dot_product_attention: bool, optional
-            if set to True, use pytorch's scaled dot product attention 
+            if set to True, use pytorch's scaled dot product attention
             implementation in training.
         attn_group_sizes: int, optional
-            the number of groups to use for attention, default 1 
+            the number of groups to use for attention, default 1
             (Multi-Head Attention),
             1 = typical Multi-Head Attention,
             1 < attn_group_sizes < attention_heads = Grouped-Query Attention
@@ -173,8 +182,7 @@ class ConformerEncoderLayer(nn.Module):
             attention_inner_dim,
             attention_glu_type,
             bias_in_glu,
-            use_pt_scaled_dot_product_attention=
-            use_pt_scaled_dot_product_attention,
+            use_pt_scaled_dot_product_attention=use_pt_scaled_dot_product_attention,
             group_size=attn_group_sizes,
         )
         self.conv = ConvModule(
@@ -213,7 +221,7 @@ class ConformerEncoderLayer(nn.Module):
         pos_k: torch.Tensor,
         pos_v: torch.Tensor,
         mask: torch.Tensor,
-        relative_attention_bias: Optional[Tensor] = None,
+        relative_attention_bias: Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """ConformerEncoder forward.
 
@@ -296,7 +304,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
             (Q*K^T + B) implemented in cmb.basics.embedding.
             [T5/ALiBi]RelativeAttentionLogitBias
             usage: relative_attention_bias_args={"type": t5/alibi}
-            additional method-specific arguments can be provided (see 
+            additional method-specific arguments can be provided (see
             transformer_base.py)
         positional_dropout_rate: float, optional
             dropout rate after positional encoding. default 0.0
@@ -310,10 +318,10 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
             supraframe utts in batch.
             Default: none
         attention_group_size: int, optional
-            the number of groups to use for attention, default 1 
+            the number of groups to use for attention, default 1
             (Multi-Head Attention),
             1 = typical Multi-Head Attention,
-            1 < attention_group_size < attention_heads = Grouped-Query 
+            1 < attention_group_size < attention_heads = Grouped-Query
             Attention
             attention_group_size = attention_heads = Multi-Query Attention
     """
@@ -321,8 +329,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
     def __init__(
         self,
         input_size: int,
-        chunk_size: Union[int, list[int]],
-        left_chunk: Union[int, list[int]],
+        chunk_size: int | list[int],
+        left_chunk: int | list[int],
         attention_dim: int = 256,
         attention_heads: int = 4,
         input_layer: str = "nemo_conv",
@@ -331,13 +339,12 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         time_reduction: int = 4,
         dropout_rate: float = 0.0,
         padding_idx: int = -1,
-        relative_attention_bias_args: Optional[dict[str, Any]] = None,
+        relative_attention_bias_args: dict[str, Any] | None = None,
         positional_dropout_rate: float = 0.0,
-        nemo_conv_settings: Optional[dict[str, Any]] = None,
-        conv2d_extra_padding: Literal["feat", "feat_time", "none",
-                                      True] = "none",
+        nemo_conv_settings: dict[str, Any] | None = None,
+        conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
         attention_group_size: int = 1,
-        encoder_embedding_config: Optional[dict[str, Any]] = None,
+        encoder_embedding_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.input_size = input_size
@@ -366,70 +373,77 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
             if nemo_conv_settings:
                 default_nemo_conv_settings.update(nemo_conv_settings)
                 for i in ["subsampling_factor", "feat_in", "feat_out"]:
-                    assert (
-                        i not in nemo_conv_settings
-                    ), "{i} should be specified outside of the NeMo dictionary"
+                    assert i not in nemo_conv_settings, (
+                        "{i} should be specified outside of the NeMo dictionary"
+                    )
 
-            self.embed = NemoConvSubsampling(**default_nemo_conv_settings, )
+            self.embed = NemoConvSubsampling(
+                **default_nemo_conv_settings,
+            )
         else:
             raise ValueError("unknown input_layer: " + input_layer)
 
-        self.pos_emb = AbsolutePositionalEncoding(attention_dim,
-                                                  positional_dropout_rate)
+        self.pos_emb = AbsolutePositionalEncoding(
+            attention_dim, positional_dropout_rate
+        )
 
         self.relative_attention_bias_type = (
             relative_attention_bias_args.get("type")
-            if relative_attention_bias_args else None)
+            if relative_attention_bias_args
+            else None
+        )
         if self.relative_attention_bias_type == "t5":
-            assert (self.num_heads % self.attention_group_size == 0
-                    ), "attention_group_size must divide n_head"
+            assert self.num_heads % self.attention_group_size == 0, (
+                "attention_group_size must divide n_head"
+            )
             self.relative_attention_bias_layer = T5RelativeAttentionLogitBias(
                 self.num_heads // self.attention_group_size,
                 max_distance=relative_attention_bias_args.get(
-                    "t5_bias_max_distance", 1000),
-                symmetric=relative_attention_bias_args.get(
-                    "t5_bias_symmetric", False),
+                    "t5_bias_max_distance", 1000
+                ),
+                symmetric=relative_attention_bias_args.get("t5_bias_symmetric", False),
             )
         else:
             raise NotImplementedError
 
         self.encoder_embedding = MeanVarianceNormLayer(
-            self.encoder_embedding_config["input_size"])
+            self.encoder_embedding_config["input_size"]
+        )
 
     def compute_lens_change(
-            self,
-            feature_lens: Union[int,
-                                torch.Tensor]) -> Union[int, torch.Tensor]:
+        self, feature_lens: int | torch.Tensor
+    ) -> int | torch.Tensor:
         """feature_lens: int
         return updated feature lens.
 
-        This used to return a different lambda function for each case that 
-        computed the right thing.  That does not work within Torchscript. 
+        This used to return a different lambda function for each case that
+        computed the right thing.  That does not work within Torchscript.
         If you really need this to be faster, create nn.Module()-s for all
         the cases and return one of them.  Torchscript does support that.
         """
         if self.input_layer == "nemo_conv":
             # Handle the special causal case
             subsampling_causal_cond = self.nemo_conv_settings.get(
-                "subsampling", "dw_striding") in [
-                    "dw_striding",
-                    "striding",
-                    "striding_conv1d",
-                ]
+                "subsampling", "dw_striding"
+            ) in [
+                "dw_striding",
+                "striding",
+                "striding_conv1d",
+            ]
             is_causal = self.nemo_conv_settings.get("is_causal", False)
             if is_causal and subsampling_causal_cond:
-                lens_change = (torch.ceil(feature_lens /
-                                          self.time_reduction).long()
-                               if isinstance(feature_lens, Tensor) else
-                               math.ceil(feature_lens / self.time_reduction))
+                lens_change = (
+                    torch.ceil(feature_lens / self.time_reduction).long()
+                    if isinstance(feature_lens, Tensor)
+                    else math.ceil(feature_lens / self.time_reduction)
+                )
                 feature_lens_remainder = feature_lens % self.time_reduction
                 if isinstance(feature_lens, Tensor):
                     lens_change[feature_lens_remainder != 1] += 1
                 elif feature_lens_remainder != 1:
                     lens_change += 1
                 return lens_change
-            ceil_func = (math.ceil
-                         if isinstance(feature_lens, int) else torch.ceil)
+            ceil_func = math.ceil if isinstance(feature_lens, int) else torch.ceil
             return ceil_func(feature_lens / self.time_reduction)
 
     @abc.abstractmethod
@@ -437,10 +451,10 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         """Abstract forward method implementation."""
 
     def _chunk_size_selection(
-            self,
-            chunk_size: Optional[Union[int, list[int]]] = None,
-            left_chunk: Optional[Union[int,
-                                       list[int]]] = None) -> tuple[int, int]:
+        self,
+        chunk_size: int | list[int] | None = None,
+        left_chunk: int | list[int] | None = None,
+    ) -> tuple[int, int]:
         """If chunk size is a list, we will randomly select a chunk size."""
 
         if chunk_size is None:
@@ -450,15 +464,16 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         if isinstance(chunk_size, list):
             # Variable chunk size during training
             chunk_size_index = int(
-                torch.randint(low=0, high=len(chunk_size), size=(1, )))
+                torch.randint(low=0, high=len(chunk_size), size=(1,))
+            )
             chunk_size_train_eff = chunk_size[chunk_size_index]
             if not isinstance(left_chunk, list):
                 raise ValueError(
-                    "Since chunk_size is a list, left_chunk must be a list")
+                    "Since chunk_size is a list, left_chunk must be a list"
+                )
             if len(left_chunk) != len(chunk_size):
                 raise ValueError(
-                    "The length of left_chunk must be the same as length of "\
-                        "chunk_size."
+                    "The length of left_chunk must be the same as length of chunk_size."
                 )
             left_chunk_train_eff = left_chunk[chunk_size_index]
         else:
@@ -479,8 +494,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         return embed_class
 
     def _forward_embeddings_core(
-            self, input_tensor: torch.Tensor,
-            masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self, input_tensor: torch.Tensor, masks: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         embed_class = self._get_embed_class(self.embed)
         assert isinstance(embed_class, NemoConvSubsampling)
         input_tensor, masks = self.embed(input_tensor, masks)
@@ -488,41 +503,62 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 
     def _position_embedding(
         self, input_tensor: torch.Tensor
-    ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None]:
         pos_k = None
         pos_v = None
         if self.relative_attention_bias_layer is None:
             input_tensor = self.pos_emb(
-                input_tensor)  # default to add abs sinusoid embedding
+                input_tensor
+            )  # default to add abs sinusoid embedding
         return pos_k, pos_v
 
-    def _streaming_mask(self, seq_len: int, batch_size: int,
-                        chunk_size: Union[int, list[int]],
-                        left_chunk: Union[int, list[int]]) -> torch.Tensor:
-        chunk_size_train_eff, left_chunk_train_eff = \
-            self._chunk_size_selection(chunk_size, left_chunk)
+    def _streaming_mask(
+        self,
+        seq_len: int,
+        batch_size: int,
+        chunk_size: int | list[int],
+        left_chunk: int | list[int],
+    ) -> torch.Tensor:
+        chunk_size_train_eff, left_chunk_train_eff = self._chunk_size_selection(
+            chunk_size, left_chunk
+        )
 
         # Create mask matrix for streaming
         # S stores start index. if chunksize is 18, s is [0,18,36,....]
         chunk_start_idx = np.arange(0, seq_len, chunk_size_train_eff)
 
-        enc_streaming_mask = (adaptive_enc_mask(
-            seq_len, chunk_start_idx,
-            left_window=left_chunk_train_eff).unsqueeze(0).expand(
-                [batch_size, -1, -1]))
+        enc_streaming_mask = (
+            adaptive_enc_mask(
+                seq_len, chunk_start_idx, left_window=left_chunk_train_eff
+            )
+            .unsqueeze(0)
+            .expand([batch_size, -1, -1])
+        )
         return enc_streaming_mask
 
     def forward_embeddings(
         self,
         xs_pad: torch.Tensor,
         masks: torch.Tensor,
-        chunk_size_nc: Optional[Union[int, list[int]]] = None,
-        left_chunk_nc: Optional[Union[int, list[int]]] = None
-    ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor],
-                     Optional[torch.Tensor], torch.Tensor, torch.Tensor],
-               tuple[torch.Tensor, Optional[torch.Tensor],
-                     Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-                     torch.Tensor]]:
+        chunk_size_nc: int | list[int] | None = None,
+        left_chunk_nc: int | list[int] | None = None,
+    ) -> (
+        tuple[
+            torch.Tensor,
+            torch.Tensor | None,
+            torch.Tensor | None,
+            torch.Tensor,
+            torch.Tensor,
+        ]
+        | tuple[
+            torch.Tensor,
+            torch.Tensor | None,
+            torch.Tensor | None,
+            torch.Tensor,
+            torch.Tensor,
+            torch.Tensor,
+        ]
+    ):
         """Forwarding the inputs through the top embedding layers
 
         Args:
@@ -530,7 +566,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
                 input tensor
             masks: torch.Tensor
                 input mask
-            chunk_size_nc: (optional, default is None) chunk size for 
+            chunk_size_nc: (optional, default is None) chunk size for
                             non-causal layers
             left_chunk_nc: (optional, default is None) # of left chunks for
                             non-causal layers
@@ -543,21 +579,21 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
                 f"""The sequence length after time reduction is invalid: 
                 {seq_len}. Your input feature is too short. Consider 
                 filtering out the very short sentence from data 
-                loader""", )
+                loader""",
+            )
 
         batch_size = xs_pad.shape[0]
 
-        enc_streaming_mask = self._streaming_mask(seq_len, batch_size,
-                                                  self.chunk_size,
-                                                  self.left_chunk)
+        enc_streaming_mask = self._streaming_mask(
+            seq_len, batch_size, self.chunk_size, self.left_chunk
+        )
 
         if xs_pad.is_cuda:
             enc_streaming_mask = enc_streaming_mask.cuda()
             xs_pad = xs_pad.cuda()
 
         input_tensor = xs_pad
-        input_tensor, masks = self._forward_embeddings_core(
-            input_tensor, masks)
+        input_tensor, masks = self._forward_embeddings_core(input_tensor, masks)
 
         streaming_mask = enc_streaming_mask
         if streaming_mask is not None and masks is not None:
@@ -569,7 +605,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 
         if chunk_size_nc is not None:
             enc_streaming_mask_nc = self._streaming_mask(
-                seq_len, batch_size, chunk_size_nc, left_chunk_nc)
+                seq_len, batch_size, chunk_size_nc, left_chunk_nc
+            )
             if xs_pad.is_cuda:
                 enc_streaming_mask_nc = enc_streaming_mask_nc.cuda()
             if masks is not None:
@@ -622,8 +659,8 @@ class ConformerEncoder(TransformerEncoderBase):
             left_chunk = 6
             left_chunk = [12, 9, 6, 3]
         num_lang: int
-            This parameter is used to store the number of languages in the 
-            lang_dict, only used for multiseed/multilingual models. 
+            This parameter is used to store the number of languages in the
+            lang_dict, only used for multiseed/multilingual models.
             default None.
         attention_dim: int, optional
             attention dimension. default 256.
@@ -721,16 +758,16 @@ class ConformerEncoder(TransformerEncoderBase):
         extra_layer_output_idx: int
             the layer index to be exposed.
         relative_attention_bias_args: dict, optional
-            use more efficient scalar bias-based relative multihead attention 
+            use more efficient scalar bias-based relative multihead attention
             (Q*K^T + B) implemented in cmb.basics.embedding.
             [T5/ALiBi]RelativeAttentionLogitBias
             usage: relative_attention_bias_args={"type": t5/alibi}
-            additional method-specific arguments can be provided (see 
+            additional method-specific arguments can be provided (see
             transformer_base.py)
         time_reduction: int optional
             time reduction factor
             default 4
-        use_pt_scaled_dot_product_attention: whether to use pytorch scaled 
+        use_pt_scaled_dot_product_attention: whether to use pytorch scaled
             dot product attention in training.
             Default: False
         nemo_conv_settings: dict, optional
@@ -748,12 +785,12 @@ class ConformerEncoder(TransformerEncoderBase):
             Add extra padding in conv2d subsampling layers. Choices are
             (feat, feat_time, none, True)
             Default: none
-        replication_pad_for_subsample_embedding:  For batched-streaming 
+        replication_pad_for_subsample_embedding:  For batched-streaming
             decoding, use "replication" padding for the cache at start of
             utterance.
             Default: False
         attention_group_size: int, optional
-            the number of groups to use for attention, default 1 
+            the number of groups to use for attention, default 1
             (Multi-Head Attention),
             1 = typical Multi-Head Attention,
             1 < attention_group_size < attention_heads = Grouped-Query
@@ -766,9 +803,9 @@ class ConformerEncoder(TransformerEncoderBase):
     def __init__(  # pylint: disable-all
         self,
         input_size: int,
-        chunk_size: Union[int, list[int]],
-        left_chunk: Union[int, list[int]],
-        num_lang: Optional[int] = None,
+        chunk_size: int | list[int],
+        left_chunk: int | list[int],
+        num_lang: int | None = None,
         attention_dim: int = 256,
         attention_heads: int = 4,
         linear_units: int = 2048,
@@ -795,15 +832,14 @@ class ConformerEncoder(TransformerEncoderBase):
         extra_layer_output_idx: int = -1,
         extra_multi_layer_output_idxs: list[int] = [],  # noqa
         activation_checkpointing: str = "",
-        relative_attention_bias_args: Optional[dict[str, Any]] = None,
+        relative_attention_bias_args: dict[str, Any] | None = None,
         time_reduction: int = 4,
         use_pt_scaled_dot_product_attention: bool = False,
-        nemo_conv_settings: Optional[dict[str, Any]] = None,
-        conv2d_extra_padding: Literal["feat", "feat_time", "none",
-                                      True] = "none",
+        nemo_conv_settings: dict[str, Any] | None = None,
+        conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
         replication_pad_for_subsample_embedding: bool = False,
         attention_group_size: int = 1,
-        encoder_embedding_config: Optional[dict[str, Any]] = None,
+        encoder_embedding_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(
             input_size,
@@ -827,39 +863,43 @@ class ConformerEncoder(TransformerEncoderBase):
         self.num_lang = num_lang
         self.kernel_size = kernel_size
         self.replication_pad_for_subsample_embedding: bool = (
-            replication_pad_for_subsample_embedding)
-        assert (self.num_heads % attention_group_size == 0
-                ), "attention_group_size must divide n_head"
+            replication_pad_for_subsample_embedding
+        )
+        assert self.num_heads % attention_group_size == 0, (
+            "attention_group_size must divide n_head"
+        )
         self.num_heads_k = self.num_heads // attention_group_size
 
-        self.encoders = MultiSequential(*[
-            ConformerEncoderLayer(
-                d_model=attention_dim,
-                ext_pw_out_channel=ext_pw_out_channel,
-                depthwise_seperable_out_channel=depthwise_seperable_out_channel,
-                depthwise_multiplier=depthwise_multiplier,
-                n_head=attention_heads,
-                d_ffn=linear_units,
-                ext_pw_kernel_size=ext_pw_kernel_size,
-                kernel_size=kernel_size,
-                dropout_rate=dropout_rate,
-                causal=causal,
-                batch_norm=batch_norm,
-                activation=activation,
-                chunk_se=chunk_se,
-                chunk_size=chunk_size,
-                conv_activation=conv_activation,
-                conv_glu_type=conv_glu_type,
-                bias_in_glu=bias_in_glu,
-                linear_glu_in_convm=linear_glu_in_convm,
-                attention_glu_type=attention_glu_type,
-                activation_checkpointing=activation_checkpointing,
-                export=export,
-                use_pt_scaled_dot_product_attention=
-                use_pt_scaled_dot_product_attention,
-                attn_group_sizes=attention_group_size,
-            ) for _ in range(num_blocks)
-        ])
+        self.encoders = MultiSequential(
+            *[
+                ConformerEncoderLayer(
+                    d_model=attention_dim,
+                    ext_pw_out_channel=ext_pw_out_channel,
+                    depthwise_seperable_out_channel=depthwise_seperable_out_channel,
+                    depthwise_multiplier=depthwise_multiplier,
+                    n_head=attention_heads,
+                    d_ffn=linear_units,
+                    ext_pw_kernel_size=ext_pw_kernel_size,
+                    kernel_size=kernel_size,
+                    dropout_rate=dropout_rate,
+                    causal=causal,
+                    batch_norm=batch_norm,
+                    activation=activation,
+                    chunk_se=chunk_se,
+                    chunk_size=chunk_size,
+                    conv_activation=conv_activation,
+                    conv_glu_type=conv_glu_type,
+                    bias_in_glu=bias_in_glu,
+                    linear_glu_in_convm=linear_glu_in_convm,
+                    attention_glu_type=attention_glu_type,
+                    activation_checkpointing=activation_checkpointing,
+                    export=export,
+                    use_pt_scaled_dot_product_attention=use_pt_scaled_dot_product_attention,
+                    attn_group_sizes=attention_group_size,
+                )
+                for _ in range(num_blocks)
+            ]
+        )
         self.extra_layer_output_idx = extra_layer_output_idx
         self.extra_multi_layer_output_idxs = extra_multi_layer_output_idxs
         # Make a zeros scalar we can use in get_initial_state to determine
@@ -867,34 +907,36 @@ class ConformerEncoder(TransformerEncoderBase):
         self.register_buffer("dev_type", torch.zeros(()), persistent=False)
 
     def init_relative_attention_bias(
-            self, input_tensor: torch.Tensor) -> Optional[torch.Tensor]:
+        self, input_tensor: torch.Tensor
+    ) -> torch.Tensor | None:
         if self.relative_attention_bias_layer:
             return self.relative_attention_bias_layer(input_tensor)
 
-    def calculate_hs_mask(self, xs_pad: torch.Tensor, device: torch.device,
-                          mask: Optional[torch.Tensor]) -> torch.Tensor:
+    def calculate_hs_mask(
+        self, xs_pad: torch.Tensor, device: torch.device, mask: torch.Tensor | None
+    ) -> torch.Tensor:
         max_audio_length = xs_pad.shape[1]
         batch_size = xs_pad.shape[0]
-        enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size,
-                                                  self.chunk_size,
-                                                  self.left_chunk)
+        enc_streaming_mask = self._streaming_mask(
+            max_audio_length, batch_size, self.chunk_size, self.left_chunk
+        )
         enc_streaming_mask = enc_streaming_mask.to(device)
         if mask is None:
             return enc_streaming_mask
 
         feature_lens = mask.sum(1)
         padding_length = feature_lens
-        pad_mask = (torch.arange(0, max_audio_length,
-                                 device=device).expand(padding_length.size(0),
-                                                       -1)
-                    < padding_length.unsqueeze(1))
+        pad_mask = torch.arange(0, max_audio_length, device=device).expand(
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
         pad_mask = pad_mask.unsqueeze(1)
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask
 
     @torch.jit.ignore
-    def forward(self, xs_pad: torch.Tensor,
-                masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(
+        self, xs_pad: torch.Tensor, masks: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Conformer Forward function
 
         Args:
@@ -905,11 +947,12 @@ class ConformerEncoder(TransformerEncoderBase):
         """
         xs_pad = self.encoder_embedding(xs_pad)
         input_tensor, pos_k, pos_v, hs_mask, masks = self.forward_embeddings(
-            xs_pad, masks)
+            xs_pad, masks
+        )
 
         unfolded = False
         ori_bz, seq_len, D = input_tensor.shape
-        max_seq_len = 500  #maximum position for absolute positional encoding
+        max_seq_len = 500  # maximum position for absolute positional encoding
         if seq_len > max_seq_len:
             # audio sequence is longer than max_seq_len, unfold it into chunks
             # of max_seq_len
@@ -921,26 +964,29 @@ class ConformerEncoder(TransformerEncoderBase):
             else:
                 chunk_pad_size = 0
             if chunk_pad_size > 0:
-                input_tensor_pad = F.pad(input_tensor,
-                                         (0, 0, 0, chunk_pad_size), "constant",
-                                         0)
+                input_tensor_pad = F.pad(
+                    input_tensor, (0, 0, 0, chunk_pad_size), "constant", 0
+                )
                 input_tensor = input_tensor_pad.to(input_tensor.device)
             input_tensor = unfold_tensor(input_tensor, max_seq_len)
             if masks is not None:
                 # revise hs_mask here because the previous calculated hs_mask
                 # did not consider extra pad
                 subsampled_pad_mask = masks.squeeze(
-                    1)  # [bz, subsampled_unmask_seq_len]
+                    1
+                )  # [bz, subsampled_unmask_seq_len]
                 extra_padded_subsamlped_pad_mask = F.pad(
-                    subsampled_pad_mask, (0, chunk_pad_size), "constant",
-                    False)  # extra padding to the pad mask
-                extra_padded_subsamlped_pad_mask = \
+                    subsampled_pad_mask, (0, chunk_pad_size), "constant", False
+                )  # extra padding to the pad mask
+                extra_padded_subsamlped_pad_mask = (
                     extra_padded_subsamlped_pad_mask.unsqueeze(-1).float()
+                )
                 masks_unfold = unfold_tensor(
                     extra_padded_subsamlped_pad_mask, max_seq_len
                 )  # unfold the pad mask like we did to the input tensor
                 masks_unfold = masks_unfold.squeeze(
-                    -1).bool()  # unfold op does not support bool tensor
+                    -1
+                ).bool()  # unfold op does not support bool tensor
             else:
                 masks_unfold = None
             hs_mask = self.calculate_hs_mask(
@@ -949,15 +995,14 @@ class ConformerEncoder(TransformerEncoderBase):
 
         # layer_emb = None
 
-        relative_attention_bias = self.init_relative_attention_bias(
-            input_tensor)
+        relative_attention_bias = self.init_relative_attention_bias(input_tensor)
 
-        _simplified_path = (self.extra_layer_output_idx == -1
-                            and relative_attention_bias is None)
+        _simplified_path = (
+            self.extra_layer_output_idx == -1 and relative_attention_bias is None
+        )
 
         if _simplified_path:
-            input_tensor, *_ = self.encoders(input_tensor, pos_k, pos_v,
-                                             hs_mask)
+            input_tensor, *_ = self.encoders(input_tensor, pos_k, pos_v, hs_mask)
         else:
             for i, layer in enumerate(self.encoders):
                 input_tensor, _, _, _ = layer(
@@ -997,29 +1042,33 @@ class WindowQformer(nn.Module):
     ):
         super().__init__()
 
-        self.decoders = nn.ModuleList([
-            nn.TransformerDecoderLayer(
-                d_model=attention_dim,
-                nhead=attention_heads,
-                dim_feedforward=linear_units,
-                dropout=dropout_rate,
-                activation="relu",
-                batch_first=True,
-                norm_first=normalize_before,  # TODO need to verify
-            ) for _ in range(num_blocks)
-        ])
+        self.decoders = nn.ModuleList(
+            [
+                nn.TransformerDecoderLayer(
+                    d_model=attention_dim,
+                    nhead=attention_heads,
+                    dim_feedforward=linear_units,
+                    dropout=dropout_rate,
+                    activation="relu",
+                    batch_first=True,
+                    norm_first=normalize_before,  # TODO need to verify
+                )
+                for _ in range(num_blocks)
+            ]
+        )
 
         self.queries = nn.Parameter(torch.zeros(1, num_queries, attention_dim))
-        self.after_norm = (nn.LayerNorm(attention_dim, eps=1e-12)
-                           if normalize_before else None)
+        self.after_norm = (
+            nn.LayerNorm(attention_dim, eps=1e-12) if normalize_before else None
+        )
         self.window_size = window_size
 
     def forward(
-            self,
-            audio_embed: torch.Tensor,
-            mask: Optional[torch.Tensor],
-            embed_len: Optional[int] = None
-    ) -> tuple[torch.Tensor, Optional[int]]:
+        self,
+        audio_embed: torch.Tensor,
+        mask: torch.Tensor | None,
+        embed_len: int | None = None,
+    ) -> tuple[torch.Tensor, int | None]:
         """forward decoder"""
         # audio_embed: N x T x D => N x D x T
 
@@ -1027,8 +1076,9 @@ class WindowQformer(nn.Module):
         # audio_embed: N x D x 1 x T => N x DK x T'
         padding = audio_embed.shape[-1] % self.window_size
         if padding > 0:
-            audio_embed = F.pad(audio_embed, (0, self.window_size - padding),
-                                "constant", 0)
+            audio_embed = F.pad(
+                audio_embed, (0, self.window_size - padding), "constant", 0
+            )
 
         embed_chunk = F.unfold(
             audio_embed[..., None, :],
@@ -1045,10 +1095,7 @@ class WindowQformer(nn.Module):
         # NT' x 1 x D
         q = self.queries.expand(bsz * slen, -1, -1)
         for layer in self.decoders:
-            q = layer(tgt=q,
-                      memory=embed_chunk,
-                      tgt_mask=None,
-                      memory_mask=mask)
+            q = layer(tgt=q, memory=embed_chunk, tgt_mask=None, memory_mask=mask)
 
         if self.after_norm is not None:
             q = self.after_norm(q)
@@ -1068,8 +1115,7 @@ class AudioEmbedding(nn.Module):
         super().__init__()
         self.config = config
         # n_embed or hidden_size for text LM
-        hidden_size = (config.n_embd
-                       if hasattr(config, "n_embd") else config.hidden_size)
+        hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
 
         # self.wte = nn.Embedding(config.vocab_size, hidden_size)
 
@@ -1078,8 +1124,10 @@ class AudioEmbedding(nn.Module):
         )
         self.layer_idx = -2
 
-        if (isinstance(config.audio_processor, dict)
-                and config.audio_processor.get("name", None) == "cascades"):
+        if (
+            isinstance(config.audio_processor, dict)
+            and config.audio_processor.get("name", None) == "cascades"
+        ):
             encoder_config = config.audio_processor.get("config", None)
             assert encoder_config is not None
             self.encoder = ConformerEncoder(**encoder_config)
@@ -1089,13 +1137,11 @@ class AudioEmbedding(nn.Module):
         else:
             raise NotImplementedError("")
 
-        assert (audio_dim_out
-                is not None), "Remember to set values for audio_dim_out"
+        assert audio_dim_out is not None, "Remember to set values for audio_dim_out"
         self.audio_dim_out = audio_dim_out
         self.audio_dim_in = n_mels
 
-        self.freeze_audio_processor = kwargs.get("freeze_audio_processor",
-                                                 False)
+        self.freeze_audio_processor = kwargs.get("freeze_audio_processor", False)
 
         self.downsample_rate = kwargs.get("downsample_rate", 1)
 
@@ -1107,8 +1153,9 @@ class AudioEmbedding(nn.Module):
             self.qformer = None
 
         if kwargs.get("use_conv_downsample", False):
-            assert (self.qformer is None
-                    ), "don't support use qformer and conv downsample together"
+            assert self.qformer is None, (
+                "don't support use qformer and conv downsample together"
+            )
             nemo_conv_settings = kwargs.get("nemo_conv_settings", {})
             default_nemo_conv_settings = {
                 "subsampling": "dw_striding",
@@ -1124,11 +1171,13 @@ class AudioEmbedding(nn.Module):
             if nemo_conv_settings:
                 default_nemo_conv_settings.update(nemo_conv_settings)
                 for i in ["subsampling_factor", "feat_in", "feat_out"]:
-                    assert (
-                        i not in nemo_conv_settings
-                    ), "{i} should be specified outside of the NeMo dictionary"
+                    assert i not in nemo_conv_settings, (
+                        "{i} should be specified outside of the NeMo dictionary"
+                    )
 
-            self.conv_ds = NemoConvSubsampling(**default_nemo_conv_settings, )
+            self.conv_ds = NemoConvSubsampling(
+                **default_nemo_conv_settings,
+            )
         else:
             self.conv_ds = None
 
@@ -1140,30 +1189,26 @@ class AudioEmbedding(nn.Module):
             # (do not use image_projection and image_proj_norm)
             dim_projection = hidden_size
             depth = 2
-            self.linear_downsample_rate = (1 if (self.qformer or self.conv_ds)
-                                           else self.downsample_rate)
+            self.linear_downsample_rate = (
+                1 if (self.qformer or self.conv_ds) else self.downsample_rate
+            )
             layers = [
-                nn.Linear(audio_dim_out * self.linear_downsample_rate,
-                          dim_projection)
+                nn.Linear(audio_dim_out * self.linear_downsample_rate, dim_projection)
             ]
             for _ in range(1, depth):
-                layers.extend(
-                    [nn.GELU(),
-                     nn.Linear(dim_projection, dim_projection)])
+                layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
             self.audio_projection = nn.Sequential(*layers)
             # NOTE vision-speech tasks use a separate projection layer
             layers = [
-                nn.Linear(audio_dim_out * self.linear_downsample_rate,
-                          dim_projection)
+                nn.Linear(audio_dim_out * self.linear_downsample_rate, dim_projection)
             ]
             for _ in range(1, depth):
-                layers.extend(
-                    [nn.GELU(),
-                     nn.Linear(dim_projection, dim_projection)])
+                layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
             self.audio_projection_for_vision = nn.Sequential(*layers)
         else:
             raise NotImplementedError(
-                f"projection_cls = {projection_cls}, not implemented")
+                f"projection_cls = {projection_cls}, not implemented"
+            )
 
         # TODO: audio sequence compression - Qformer
         self.vocab_size = config.vocab_size
@@ -1179,7 +1224,7 @@ class AudioEmbedding(nn.Module):
     def get_audio_features(
         self,
         input_embeds: torch.Tensor,
-        audio_attention_mask: Optional[torch.Tensor] = None,
+        audio_attention_mask: torch.Tensor | None = None,
         audio_projection_mode: str = "speech",
     ) -> torch.Tensor:
         """
@@ -1188,11 +1233,9 @@ class AudioEmbedding(nn.Module):
         """
         if self.freeze_audio_processor:
             with torch.no_grad():
-                audio_features, masks = self.encoder(input_embeds,
-                                                     audio_attention_mask)
+                audio_features, masks = self.encoder(input_embeds, audio_attention_mask)
         else:
-            audio_features, masks = self.encoder(input_embeds,
-                                                 audio_attention_mask)
+            audio_features, masks = self.encoder(input_embeds, audio_attention_mask)
 
         if self.qformer is not None:
             audio_features, _ = self.qformer(audio_features, mask=None)
@@ -1221,14 +1264,13 @@ class AudioEmbedding(nn.Module):
                 feat_dim * self.linear_downsample_rate,
             )
 
-        if audio_projection_mode == 'speech':
+        if audio_projection_mode == "speech":
             audio_set_tensor = self.audio_projection(audio_features)
-        elif audio_projection_mode == 'vision':
+        elif audio_projection_mode == "vision":
             audio_set_tensor = self.audio_projection_for_vision(audio_features)
         else:
             raise ValueError(
-                f"audio_projection_mode = {audio_projection_mode} not "\
-                    "implemented"
+                f"audio_projection_mode = {audio_projection_mode} not implemented"
             )
 
         return audio_set_tensor
@@ -1236,13 +1278,13 @@ class AudioEmbedding(nn.Module):
     def forward(
         self,
         audio_features: torch.Tensor,
-        audio_attention_mask: Optional[torch.Tensor] = None,
+        audio_attention_mask: torch.Tensor | None = None,
         audio_projection_mode: str = "speech",
     ) -> torch.Tensor:
         """
         arguments:
             audio_features: audio features (T, D)
-        
+
         returns:
             audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
         """
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py
index 6fbfca619a42f74db015a360ca80dff26424c9ce..698435eb76c966c00116e48d2cf5decc1579e3d2 100644
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -6,7 +6,6 @@
 # but implemented by the Phi-Speech team
 #!/usr/bin/env python3
 import math
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -43,18 +42,17 @@ def get_activation(name: str = "relu") -> torch.nn.Module:
     return nn.Identity()
 
 
-def adaptive_enc_mask(x_len: int,
-                      chunk_start_idx: list[int],
-                      left_window: int = 0,
-                      right_window: int = 0) -> torch.Tensor:
+def adaptive_enc_mask(
+    x_len: int, chunk_start_idx: list[int], left_window: int = 0, right_window: int = 0
+) -> torch.Tensor:
     """
     The function is very important for Transformer Transducer Streaming mode
     Args:
         x_len: sequence length
-        chunk_start_idx: first idx of each chunk, such as [0,18,36,48]. 
+        chunk_start_idx: first idx of each chunk, such as [0,18,36,48].
         It also supports adaptive chunk size [0,10,15,45]
         left_window: how many left chunks can be seen
-        right_window: how many right chunks can be seen. It is used for 
+        right_window: how many right chunks can be seen. It is used for
         chunk overlap model.
         Returns:
             mask (torch.Tensor): a mask tensor for streaming model
@@ -67,21 +65,23 @@ def adaptive_enc_mask(x_len: int,
                     [False., True., True., False.],
                     [False., False., True., True.]])
     """
-    chunk_start_idx = torch.Tensor(chunk_start_idx).long(
-    )  # first idx of each chunk, such as [0,18,36,48].
+    chunk_start_idx = torch.Tensor(
+        chunk_start_idx
+    ).long()  # first idx of each chunk, such as [0,18,36,48].
     start_pad = torch.nn.functional.pad(
-        chunk_start_idx,
-        (1, 0))  # append 0 to the beginning, so it becomes [0, 0, 18, 36, 48]
+        chunk_start_idx, (1, 0)
+    )  # append 0 to the beginning, so it becomes [0, 0, 18, 36, 48]
     end_pad = torch.nn.functional.pad(
         chunk_start_idx, (0, 1), value=x_len
     )  # append x_len to the end, so it becomes [0,18,36,48, x_len]
-    seq_range = torch.arange(0,
-                             x_len).unsqueeze(-1)  # seq_range size: [x_len, 1]
-    idx = ((seq_range < end_pad) &
-           (seq_range >= start_pad)).nonzero()[:, 1]  # idx size: [x_len]
+    seq_range = torch.arange(0, x_len).unsqueeze(-1)  # seq_range size: [x_len, 1]
+    idx = ((seq_range < end_pad) & (seq_range >= start_pad)).nonzero()[
+        :, 1
+    ]  # idx size: [x_len]
     # boundary = end_pad[idx]  # boundary size: [x_len]
-    seq_range_expand = (torch.arange(0, x_len).unsqueeze(0).expand(x_len, -1)
-                        )  # seq_range_expand size [x_len, x_len]
+    seq_range_expand = (
+        torch.arange(0, x_len).unsqueeze(0).expand(x_len, -1)
+    )  # seq_range_expand size [x_len, x_len]
     idx_left = idx - left_window
     idx_left[idx_left < 0] = 0
     boundary_left = start_pad[idx_left]
@@ -231,18 +231,23 @@ class GLUPointWiseConv(nn.Module):
         x = self.ext_pw_conv_1d(x)
         if self.glu_type == "bilinear":
             if self.bias_in_glu:
-                x = (x[:, 0:self.output_dim, :] + self.b1) * (
-                    x[:, self.output_dim:self.output_dim * 2, :] + self.b2)
+                x = (x[:, 0 : self.output_dim, :] + self.b1) * (
+                    x[:, self.output_dim : self.output_dim * 2, :] + self.b2
+                )
             else:
-                x = (x[:, 0:self.output_dim, :]) * (
-                    x[:, self.output_dim:self.output_dim * 2, :])
+                x = (
+                    (x[:, 0 : self.output_dim, :])
+                    * (x[:, self.output_dim : self.output_dim * 2, :])
+                )
         else:
             if self.bias_in_glu:
-                x = (x[:, 0:self.output_dim, :] + self.b1) * self.glu_act(
-                    x[:, self.output_dim:self.output_dim * 2, :] + self.b2)
+                x = (x[:, 0 : self.output_dim, :] + self.b1) * self.glu_act(
+                    x[:, self.output_dim : self.output_dim * 2, :] + self.b2
+                )
             else:
-                x = (x[:, 0:self.output_dim, :]) * self.glu_act(
-                    x[:, self.output_dim:self.output_dim * 2, :])
+                x = (x[:, 0 : self.output_dim, :]) * self.glu_act(
+                    x[:, self.output_dim : self.output_dim * 2, :]
+                )
 
         x = x.permute([0, 2, 1])
         return x
@@ -257,7 +262,7 @@ class DepthWiseSeperableConv1d(nn.Module):
         input_dim: int
             input channel size.
         depthwise_seperable_out_channel: int
-            if set different to 0, the number of 
+            if set different to 0, the number of
              depthwise_seperable_out_channel will be used as a channel_out
              of the second conv1d layer.
              otherwise, it equals to 0, the second conv1d layer is skipped.
@@ -327,7 +332,7 @@ class ConvModule(nn.Module):
             if > 0, ext_pw_out_channel is a dim channel size
              for the last pointwise conv after swish activation.
         depthwise_seperable_out_channel: int
-            if set different to 0, the number of 
+            if set different to 0, the number of
              depthwise_seperable_out_channel
              will be used as a channel_out of the second conv1d layer.
              otherwise, it equal to 0, the second conv1d layer is skipped.
@@ -431,12 +436,10 @@ class ConvModule(nn.Module):
 
         if depthwise_seperable_out_channel != 0:
             if input_dim != depthwise_seperable_out_channel:
-                self.ln2 = nn.Linear(depthwise_seperable_out_channel,
-                                     input_dim)
+                self.ln2 = nn.Linear(depthwise_seperable_out_channel, input_dim)
         else:
             if depthwise_multiplier != 1:
-                self.ln2 = nn.Linear(input_dim * depthwise_multiplier,
-                                     input_dim)
+                self.ln2 = nn.Linear(input_dim * depthwise_multiplier, input_dim)
 
     def _add_ext_pw_layer(self) -> None:
         """
@@ -445,7 +448,8 @@ class ConvModule(nn.Module):
         of the conformer.
         """
         self.ln1 = self.glu = self.bn_layer = self.ext_pw_conv_1d = (
-            nn.Identity())  # jit hacks.
+            nn.Identity()
+        )  # jit hacks.
         self.squeeze_excitation = nn.Identity()  # jit.
         self.apply_ln1 = self.fix_len1 = False  # jit.
 
@@ -509,7 +513,7 @@ class ConvModule(nn.Module):
         if self.ext_pw_out_channel != 0:
             x = self.glu(x)
             if self.causal and self.ext_pw_kernel_size > 1:
-                x = x[:, :-(self.ext_pw_kernel_size - 1), :]
+                x = x[:, : -(self.ext_pw_kernel_size - 1), :]
             if self.apply_ln1:
                 x = self.ln1(x)
         else:
@@ -521,7 +525,7 @@ class ConvModule(nn.Module):
 
         x = self.dw_sep_conv_1d(x)
         if self.causal and self.kernel_size > 1:
-            x = x[:, :, :-(self.kernel_size - 1)]
+            x = x[:, :, : -(self.kernel_size - 1)]
         if hasattr(self, "ln2"):
             x = x.permute([0, 2, 1])
             x = self.ln2(x)
@@ -533,7 +537,7 @@ class ConvModule(nn.Module):
         if self.ext_pw_out_channel != 0:
             x = self.ext_pw_conv_1d(x)
             if self.fix_len1:
-                x = x[:, :, :-(self.ext_pw_kernel_size - 1)]
+                x = x[:, :, : -(self.ext_pw_kernel_size - 1)]
 
             if self.apply_ln1:
                 x = x.permute([0, 2, 1])
@@ -652,7 +656,7 @@ def _pre_hook(
 
     Note:
         We saved self.pe until v.0.5.2 but we have omitted it later.
-        Therefore, we remove the item "pe" from `state_dict` for backward 
+        Therefore, we remove the item "pe" from `state_dict` for backward
         compatibility.
 
     """
@@ -663,7 +667,7 @@ def _pre_hook(
 
 class T5RelativeAttentionLogitBias(nn.Module):
     """
-    This module implements the relative position bias described in Section 
+    This module implements the relative position bias described in Section
     2.1 of the T5 paper: https://arxiv.org/pdf/1910.10683.pdf
 
     The Huggingface implementation is used as a reference
@@ -671,18 +675,18 @@ class T5RelativeAttentionLogitBias(nn.Module):
     transformers/models/t5/modeling_t5.py#L435
 
     Modifies attention as Q*K^T + B, where B is a learned scalar bias based
-    on relative position of the query and key. It is HxNxN, where H is the 
+    on relative position of the query and key. It is HxNxN, where H is the
     number of heads, N is the sequence length.
 
     I've made these modifications to the original T5 bias:
-    - Skipping of the bucketing step. Original T5 bias converted rel 
-      position distances into logarithmically increasing buckets. This is 
+    - Skipping of the bucketing step. Original T5 bias converted rel
+      position distances into logarithmically increasing buckets. This is
       supposed to help with length generalization.
-    - I just directly use rel position index as bias values, as we don't 
-      need length generalization (40s max is good enough for ASR encoder), 
+    - I just directly use rel position index as bias values, as we don't
+      need length generalization (40s max is good enough for ASR encoder),
       and it keeps ONNX export simple.
-    - I've also extended it so that biases can be asymmetric, the default 
-      implementation treats L->R and R->L the same. Asymmetric was found to 
+    - I've also extended it so that biases can be asymmetric, the default
+      implementation treats L->R and R->L the same. Asymmetric was found to
       yield better results in my experiments.
 
     Args:
@@ -690,26 +694,28 @@ class T5RelativeAttentionLogitBias(nn.Module):
             Number of attention heads
         num_buckets: int
             Number of buckets to use for relative attention bias. This is the
-            size of the learnable bias parameter. Bucketing is not yet 
+            size of the learnable bias parameter. Bucketing is not yet
             supported, so this defaults to -1 which means no bucketing is
             used (max_distance determines size of bias param).
         max_distance: int
-            Maximum distance to use for relative attention bias. With 
-            num_buckets=-1, this directly controls the max size of the bias 
-            parameter. When num_buckets > 0 is supported, this will control 
-            the maximum distance for logarithmic bucketing after which all 
+            Maximum distance to use for relative attention bias. With
+            num_buckets=-1, this directly controls the max size of the bias
+            parameter. When num_buckets > 0 is supported, this will control
+            the maximum distance for logarithmic bucketing after which all
             positions are in the same bucket.
         symmetric: bool
             Whether to use symmetric or asymmetric biases. symmetric=False uses
-            2x number of bias params to distinguish L->R from R->L. This was 
+            2x number of bias params to distinguish L->R from R->L. This was
             found to be better for the encoder.
     """
 
-    def __init__(self,
-                 num_heads: int,
-                 num_buckets: int = -1,
-                 max_distance: int = 1000,
-                 symmetric: bool = False) -> None:
+    def __init__(
+        self,
+        num_heads: int,
+        num_buckets: int = -1,
+        max_distance: int = 1000,
+        symmetric: bool = False,
+    ) -> None:
         super().__init__()
         self.num_heads = num_heads
         self.num_buckets = num_buckets
@@ -720,7 +726,8 @@ class T5RelativeAttentionLogitBias(nn.Module):
             self.num_buckets = max_distance
         else:
             raise NotImplementedError(
-                "T5 attention bias with bucketed positions is not yet tested")
+                "T5 attention bias with bucketed positions is not yet tested"
+            )
         if not self.symmetric:
             self.num_buckets *= 2
         self.bias_values = nn.Embedding(self.num_buckets, self.num_heads)
@@ -728,19 +735,21 @@ class T5RelativeAttentionLogitBias(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         # instantiate bias compatible with shape of x
         maxpos = x.size(1)
-        context_position = torch.arange(maxpos,
-                                        device=x.device,
-                                        dtype=torch.long)[:, None]
-        memory_position = torch.arange(maxpos,
-                                       device=x.device,
-                                       dtype=torch.long)[None, :]
+        context_position = torch.arange(maxpos, device=x.device, dtype=torch.long)[
+            :, None
+        ]
+        memory_position = torch.arange(maxpos, device=x.device, dtype=torch.long)[
+            None, :
+        ]
         relative_position = memory_position - context_position
         # clipping to a maximum distance using ops that play well with ONNX
         # export
         relative_position = relative_position.masked_fill(
-            relative_position < -self.max_distance, -self.max_distance)
+            relative_position < -self.max_distance, -self.max_distance
+        )
         relative_position = relative_position.masked_fill(
-            relative_position > self.max_distance - 1, self.max_distance - 1)
+            relative_position > self.max_distance - 1, self.max_distance - 1
+        )
 
         # mapping from relative position to index in the bias parameter
         if self._skip_bucketing:
@@ -753,8 +762,7 @@ class T5RelativeAttentionLogitBias(nn.Module):
             bias_idx += self.num_buckets // 2
 
         t5_rel_att_bias = self.bias_values(bias_idx)  # [L, L, H]
-        t5_rel_att_bias = t5_rel_att_bias.permute(2, 0, 1).unsqueeze(
-            0)  # [1, H, L, L]
+        t5_rel_att_bias = t5_rel_att_bias.permute(2, 0, 1).unsqueeze(0)  # [1, H, L, L]
 
         return t5_rel_att_bias
 
@@ -766,11 +774,13 @@ class T5RelativeAttentionLogitBias(nn.Module):
         if not self.causal:
             self.num_buckets //= 2
             relative_buckets += (relative_position > 0).to(
-                torch.long) * self.num_buckets
+                torch.long
+            ) * self.num_buckets
             relative_position = torch.abs(relative_position)
         else:
-            relative_position = -torch.min(relative_position,
-                                           torch.zeros_like(relative_position))
+            relative_position = -torch.min(
+                relative_position, torch.zeros_like(relative_position)
+            )
         # now relative_position is in the range [0, inf)
 
         # half of the buckets are for exact increments in positions
@@ -780,16 +790,18 @@ class T5RelativeAttentionLogitBias(nn.Module):
         # The other half of the buckets are for logarithmically bigger bins in
         # positions up to max_distance
         relative_position_if_large = max_exact + (
-            torch.log(relative_position.float() / max_exact) /
-            math.log(self.max_distance / max_exact) *
-            (self.num_buckets - max_exact)).to(torch.long)
+            torch.log(relative_position.float() / max_exact)
+            / math.log(self.max_distance / max_exact)
+            * (self.num_buckets - max_exact)
+        ).to(torch.long)
         relative_position_if_large = torch.min(
             relative_position_if_large,
             torch.full_like(relative_position_if_large, self.num_buckets - 1),
         )
 
-        relative_buckets += torch.where(is_small, relative_position,
-                                        relative_position_if_large)
+        relative_buckets += torch.where(
+            is_small, relative_position, relative_position_if_large
+        )
         return relative_buckets
 
 
@@ -808,10 +820,7 @@ class AbsolutePositionalEncoding(nn.Module):
 
     """
 
-    def __init__(self,
-                 d_model: int,
-                 dropout_rate: float,
-                 max_len: int = 5000) -> None:
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000) -> None:
         """Construct an PositionalEncoding object."""
         super().__init__()
         self.d_model = d_model
@@ -834,8 +843,9 @@ class AbsolutePositionalEncoding(nn.Module):
         pe = torch.zeros(x.size(1), self.d_model)
         position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
         div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
-            -(math.log(10000.0) / self.d_model))
+            torch.arange(0, self.d_model, 2, dtype=torch.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
         pe = pe.unsqueeze(0)
@@ -852,7 +862,7 @@ class AbsolutePositionalEncoding(nn.Module):
 
         """
         self.extend_pe(x)
-        x = x * self.xscale + self.pe[:, :x.size(1)]
+        x = x * self.xscale + self.pe[:, : x.size(1)]
         return self.dropout(x)
 
 
@@ -889,14 +899,14 @@ class CausalConv1D(nn.Conv1d):
     locations on its right or left
     All arguments are the same as nn.Conv1d except padding.
 
-    If padding is set None, then paddings are set automatically to make it a 
+    If padding is set None, then paddings are set automatically to make it a
     causal convolution where each location would not see any steps on its right.
 
-    If padding is set as a list (size of 2), then padding[0] would be used as 
+    If padding is set as a list (size of 2), then padding[0] would be used as
     left padding and padding[1] as right padding.
     It would make it possible to control the number of steps to be accessible
     on the right and left.
-    This mode is not supported when stride > 1. padding[0]+padding[1] should 
+    This mode is not supported when stride > 1. padding[0]+padding[1] should
     be equal to (kernel_size - 1).
     """
 
@@ -906,7 +916,7 @@ class CausalConv1D(nn.Conv1d):
         out_channels: int,
         kernel_size: int,
         stride: int = 1,
-        padding: Union[str, int] = 0,
+        padding: str | int = 0,
         dilation: int = 1,
         groups: int = 1,
         bias: bool = True,
@@ -920,13 +930,15 @@ class CausalConv1D(nn.Conv1d):
             self._right_padding = stride - 1
         else:
             if stride != 1 and padding != kernel_size - 1:
-                raise ValueError(
-                    "No striding allowed for non-symmetric convolutions!")
+                raise ValueError("No striding allowed for non-symmetric convolutions!")
             if isinstance(padding, int):
                 self._left_padding = padding
                 self._right_padding = padding
-            elif (isinstance(padding, list) and len(padding) == 2
-                  and padding[0] + padding[1] == kernel_size - 1):
+            elif (
+                isinstance(padding, list)
+                and len(padding) == 2
+                and padding[0] + padding[1] == kernel_size - 1
+            ):
                 self._left_padding = padding[0]
                 self._right_padding = padding[1]
             else:
@@ -949,9 +961,8 @@ class CausalConv1D(nn.Conv1d):
         )
 
     def update_cache(
-            self,
-            x: Tensor,
-            cache: Optional[Tensor] = None) -> tuple[Tensor, Optional[Tensor]]:
+        self, x: Tensor, cache: Tensor | None = None
+    ) -> tuple[Tensor, Tensor | None]:
         if cache is None:
             new_x = F.pad(x, pad=(self._left_padding, self._right_padding))
             next_cache = cache
@@ -959,17 +970,15 @@ class CausalConv1D(nn.Conv1d):
             new_x = F.pad(x, pad=(0, self._right_padding))
             new_x = torch.cat([cache, new_x], dim=-1)
             if self.cache_drop_size > 0:
-                next_cache = new_x[:, :, :-self.cache_drop_size]
+                next_cache = new_x[:, :, : -self.cache_drop_size]
             else:
                 next_cache = new_x
-            next_cache = next_cache[:, :, -cache.size(-1):]
+            next_cache = next_cache[:, :, -cache.size(-1) :]
         return new_x, next_cache
 
     def forward(
-        self,
-        x: Tensor,
-        cache: Optional[Tensor] = None
-    ) -> Union[Tensor, tuple[Tensor, Optional[Tensor]]]:
+        self, x: Tensor, cache: Tensor | None = None
+    ) -> Tensor | tuple[Tensor, Tensor | None]:
         x, cache = self.update_cache(x, cache=cache)
         x = super().forward(x)
         if cache is None:
@@ -982,7 +991,7 @@ class CausalConv2D(nn.Conv2d):
     """
     A causal version of nn.Conv2d where each location in the 2D matrix would
     have no access to locations on its right or down
-    All arguments are the same as nn.Conv2d except padding which should be 
+    All arguments are the same as nn.Conv2d except padding which should be
     set as None
     """
 
@@ -992,7 +1001,7 @@ class CausalConv2D(nn.Conv2d):
         out_channels: int,
         kernel_size: int,
         stride: int = 1,
-        padding: Union[str, int] = 0,
+        padding: str | int = 0,
         dilation: int = 1,
         groups: int = 1,
         bias: bool = True,
@@ -1001,8 +1010,7 @@ class CausalConv2D(nn.Conv2d):
         dtype=None,
     ) -> None:
         if padding is not None:
-            raise ValueError(
-                "Argument padding should be set to None for CausalConv2D.")
+            raise ValueError("Argument padding should be set to None for CausalConv2D.")
         self._left_padding = kernel_size - 1
         self._right_padding = stride - 1
 
@@ -1038,17 +1046,17 @@ class NemoConvSubsampling(torch.nn.Module):
     (https://github.com/NVIDIA/NeMo/blob/b367413645d5c72db3c2c96e46e95a
     34501479cf/nemo/collections/asr/parts/submodules/subsampling.py)
 
-    Striding Subsampling: "Speech-Transformer: A No-Recurrence 
-    Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong 
+    Striding Subsampling: "Speech-Transformer: A No-Recurrence
+    Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong
     et al. (https://ieeexplore.ieee.org/document/8462506)
 
 
-    Compared with the EncoderConv2D (`input_layer: custom`), this is a 
+    Compared with the EncoderConv2D (`input_layer: custom`), this is a
     much simplified approach, and uses no LayerNorm and far fewer Conv2Ds.
     Moreover, depthwise convolutions are used to reduce FLOPs, but the first
       layer is kept as a regular convolution so as not to degrade accuracy.
 
-    `Striding` and `dw_striding` are the same except that the latter uses 
+    `Striding` and `dw_striding` are the same except that the latter uses
     depthwise convolutions after the first layer, whereas the former does not.
 
     Args:
@@ -1056,11 +1064,11 @@ class NemoConvSubsampling(torch.nn.Module):
         feat_in (int): size of the input features
         feat_out (int): size of the output features
         subsampling (str): The subsampling technique, choose from
-            {"striding", "dw-striding", "striding_conv1d", 
+            {"striding", "dw-striding", "striding_conv1d",
             "dw_striding_conv1d"}
-        conv_channels (int): Number of channels for the convolution layers, 
+        conv_channels (int): Number of channels for the convolution layers,
                             default is 256.
-        subsampling_conv_chunking_factor (int): Input chunking factor which 
+        subsampling_conv_chunking_factor (int): Input chunking factor which
             can be -1 (no chunking) 1 (auto) or a power of 2. Default is 1
         activation (Module): activation function, default is nn.ReLU()
         is_causal (bool): whether to use causal Conv1/2D, where each step will
@@ -1095,15 +1103,15 @@ class NemoConvSubsampling(torch.nn.Module):
             "striding_conv1d",
         )
 
-        if (subsampling_conv_chunking_factor != -1
-                and subsampling_conv_chunking_factor != 1
-                and subsampling_conv_chunking_factor % 2 != 0):
+        if (
+            subsampling_conv_chunking_factor != -1
+            and subsampling_conv_chunking_factor != 1
+            and subsampling_conv_chunking_factor % 2 != 0
+        ):
             raise ValueError(
-                "subsampling_conv_chunking_factor should be -1, 1, or a "\
-                    "power of 2"
+                "subsampling_conv_chunking_factor should be -1, 1, or a power of 2"
             )
-        self.subsampling_conv_chunking_factor = \
-            subsampling_conv_chunking_factor
+        self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
 
         in_channels = 1
         layers = []
@@ -1131,7 +1139,8 @@ class NemoConvSubsampling(torch.nn.Module):
                         kernel_size=self._kernel_size,
                         stride=self._stride,
                         padding=None,
-                    ))
+                    )
+                )
             else:
                 layers.append(
                     torch.nn.Conv2d(
@@ -1140,7 +1149,8 @@ class NemoConvSubsampling(torch.nn.Module):
                         kernel_size=self._kernel_size,
                         stride=self._stride,
                         padding=self._left_padding,
-                    ))
+                    )
+                )
             in_channels = conv_channels
             layers.append(activation)
 
@@ -1154,7 +1164,8 @@ class NemoConvSubsampling(torch.nn.Module):
                             stride=self._stride,
                             padding=None,
                             groups=in_channels,
-                        ))
+                        )
+                    )
                 else:
                     layers.append(
                         torch.nn.Conv2d(
@@ -1164,7 +1175,8 @@ class NemoConvSubsampling(torch.nn.Module):
                             stride=self._stride,
                             padding=self._left_padding,
                             groups=in_channels,
-                        ))
+                        )
+                    )
 
                 layers.append(
                     torch.nn.Conv2d(
@@ -1174,7 +1186,8 @@ class NemoConvSubsampling(torch.nn.Module):
                         stride=1,
                         padding=0,
                         groups=1,
-                    ))
+                    )
+                )
                 layers.append(activation)
                 in_channels = conv_channels
 
@@ -1201,7 +1214,8 @@ class NemoConvSubsampling(torch.nn.Module):
                             kernel_size=self._kernel_size,
                             stride=self._stride,
                             padding=None,
-                        ))
+                        )
+                    )
                 else:
                     layers.append(
                         torch.nn.Conv2d(
@@ -1210,7 +1224,8 @@ class NemoConvSubsampling(torch.nn.Module):
                             kernel_size=self._kernel_size,
                             stride=self._stride,
                             padding=self._left_padding,
-                        ))
+                        )
+                    )
                 layers.append(activation)
                 in_channels = conv_channels
 
@@ -1235,22 +1250,30 @@ class NemoConvSubsampling(torch.nn.Module):
                     layers.append(
                         CausalConv1D(
                             in_channels=in_channels,
-                            out_channels=(feat_out if self._sampling_num == i +
-                                          1 else conv_channels),
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 1
+                                else conv_channels
+                            ),
                             kernel_size=self._kernel_size,
                             stride=self._stride,
                             padding=None,
-                        ))
+                        )
+                    )
                 else:
                     layers.append(
                         torch.nn.Conv1d(
                             in_channels=in_channels,
-                            out_channels=(feat_out if self._sampling_num == i +
-                                          1 else conv_channels),
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 1
+                                else conv_channels
+                            ),
                             kernel_size=self._kernel_size,
                             stride=self._stride,
                             padding=self._left_padding,
-                        ))
+                        )
+                    )
                 layers.append(activation)
                 in_channels = conv_channels
 
@@ -1265,30 +1288,8 @@ class NemoConvSubsampling(torch.nn.Module):
             self._right_padding = (self._kernel_size - 1) // 2
 
             # Layer 1
-            layers.extend([
-                torch.nn.Conv1d(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    kernel_size=self._kernel_size,
-                    stride=self._stride,
-                    padding=self._left_padding,
-                    groups=in_channels,
-                ),
-                torch.nn.Conv1d(
-                    in_channels=in_channels,
-                    out_channels=(feat_out if self._sampling_num == 1 else
-                                  conv_channels),
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                    groups=1,
-                ),
-            ])
-            in_channels = conv_channels
-            layers.append(activation)
-
-            for i in range(self._sampling_num - 1):
-                layers.extend([
+            layers.extend(
+                [
                     torch.nn.Conv1d(
                         in_channels=in_channels,
                         out_channels=in_channels,
@@ -1299,14 +1300,44 @@ class NemoConvSubsampling(torch.nn.Module):
                     ),
                     torch.nn.Conv1d(
                         in_channels=in_channels,
-                        out_channels=(feat_out if self._sampling_num == i +
-                                      2 else conv_channels),
+                        out_channels=(
+                            feat_out if self._sampling_num == 1 else conv_channels
+                        ),
                         kernel_size=1,
                         stride=1,
                         padding=0,
                         groups=1,
                     ),
-                ])
+                ]
+            )
+            in_channels = conv_channels
+            layers.append(activation)
+
+            for i in range(self._sampling_num - 1):
+                layers.extend(
+                    [
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                            groups=in_channels,
+                        ),
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 2
+                                else conv_channels
+                            ),
+                            kernel_size=1,
+                            stride=1,
+                            padding=0,
+                            groups=1,
+                        ),
+                    ]
+                )
                 layers.append(activation)
                 in_channels = conv_channels
 
@@ -1323,8 +1354,7 @@ class NemoConvSubsampling(torch.nn.Module):
                 ceil_mode=self._ceil_mode,
                 repeat_num=self._sampling_num,
             )
-            self.out = torch.nn.Linear(conv_channels * int(out_length),
-                                       feat_out)
+            self.out = torch.nn.Linear(conv_channels * int(out_length), feat_out)
             self.conv2d_subsampling = True
         elif subsampling in ["striding_conv1d", "dw_striding_conv1d"]:
             self.out = None
@@ -1340,8 +1370,7 @@ class NemoConvSubsampling(torch.nn.Module):
     def get_streaming_cache_size(self) -> list[int]:
         return [0, self.subsampling_factor + 1]
 
-    def forward(self, x: Tensor,
-                mask: Optional[Tensor]) -> tuple[Tensor, Optional[Tensor]]:
+    def forward(self, x: Tensor, mask: Tensor | None) -> tuple[Tensor, Tensor | None]:
         """
         Forward method for NeMo subsampling.
 
@@ -1350,24 +1379,22 @@ class NemoConvSubsampling(torch.nn.Module):
             mask: input mask
 
         Returns:
-            x: Resulting tensor from subsampling (B, T // 
+            x: Resulting tensor from subsampling (B, T //
                 time_reduction_factor, feat_out)
-            pad_mask: tensor of padded hidden state sequences (B, 1, T // 
+            pad_mask: tensor of padded hidden state sequences (B, 1, T //
                 time_reduction_factor)
         """
         x = x.unsqueeze(1) if self.conv2d_subsampling else x.transpose(1, 2)
 
         # split inputs if chunking_factor is set
-        if (self.subsampling_conv_chunking_factor != -1
-                and self.conv2d_subsampling):
+        if self.subsampling_conv_chunking_factor != -1 and self.conv2d_subsampling:
             if self.subsampling_conv_chunking_factor == 1:
                 # if subsampling_conv_chunking_factor is 1, we split only
                 # if needed.
                 # avoiding a bug / feature limiting indexing of tensors
                 # to 2**31.
                 # see https://github.com/pytorch/pytorch/issues/80020
-                x_ceil = (2**31 / self._conv_channels * self._stride *
-                          self._stride)
+                x_ceil = 2**31 / self._conv_channels * self._stride * self._stride
                 need_to_split = torch.numel(x) > x_ceil
             else:
                 # if subsampling_conv_chunking_factor > 1 we always split
@@ -1403,7 +1430,8 @@ class NemoConvSubsampling(torch.nn.Module):
             feature_lens_remainder = feature_lens % self.subsampling_factor
             padding_length[feature_lens_remainder != 1] += 1
         pad_mask = torch.arange(0, max_audio_length, device=x.device).expand(
-            padding_length.size(0), -1) < padding_length.unsqueeze(1)
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
         return x, pad_mask.unsqueeze(1)
 
     def reset_parameters(self) -> None:
@@ -1412,27 +1440,22 @@ class NemoConvSubsampling(torch.nn.Module):
             with torch.no_grad():
                 # init conv
                 scale = 1.0 / self._kernel_size
-                dw_max = (self._kernel_size**2)**-0.5
+                dw_max = (self._kernel_size**2) ** -0.5
                 pw_max = self._conv_channels**-0.5
 
                 torch.nn.init.uniform_(self.conv[0].weight, -scale, scale)
                 torch.nn.init.uniform_(self.conv[0].bias, -scale, scale)
 
                 for idx in range(2, len(self.conv), 3):
-                    torch.nn.init.uniform_(self.conv[idx].weight, -dw_max,
-                                           dw_max)
-                    torch.nn.init.uniform_(self.conv[idx].bias, -dw_max,
-                                           dw_max)
-                    torch.nn.init.uniform_(self.conv[idx + 1].weight, -pw_max,
-                                           pw_max)
-                    torch.nn.init.uniform_(self.conv[idx + 1].bias, -pw_max,
-                                           pw_max)
+                    torch.nn.init.uniform_(self.conv[idx].weight, -dw_max, dw_max)
+                    torch.nn.init.uniform_(self.conv[idx].bias, -dw_max, dw_max)
+                    torch.nn.init.uniform_(self.conv[idx + 1].weight, -pw_max, pw_max)
+                    torch.nn.init.uniform_(self.conv[idx + 1].bias, -pw_max, pw_max)
 
                 # init fc (80 * 64 = 5120 from https://github.com/kssteven418/
                 # Squeezeformer/blob/13c97d6cf92f2844d2cb3142b4c5bfa9ad1a8951/
                 # src/models/conformer_encoder.py#L487
-                fc_scale = (self._feat_out * self._feat_in /
-                            self._sampling_num)**-0.5
+                fc_scale = (self._feat_out * self._feat_in / self._sampling_num) ** -0.5
                 torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale)
                 torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale)
 
@@ -1456,15 +1479,14 @@ class NemoConvSubsampling(torch.nn.Module):
             return x, False
 
         return (
-            torch.cat([
-                self.conv(chunk)
-                for chunk in torch.split(x, new_batch_size, 0)
-            ]),
+            torch.cat(
+                [self.conv(chunk) for chunk in torch.split(x, new_batch_size, 0)]
+            ),
             True,
         )
 
     def conv_split_by_channel(self, x: Tensor) -> Tensor:
-        """For dw convs, tries to split input by time, run conv and concat 
+        """For dw convs, tries to split input by time, run conv and concat
         results"""
         x = self.conv[0](x)  # full conv2D
         x = self.conv[1](x)  # activation
@@ -1489,22 +1511,21 @@ class NemoConvSubsampling(torch.nn.Module):
             if new_t == 0:
                 new_t = 1
 
-            x = self.channel_chunked_conv(self.conv[i * 3 + 2], new_c,
-                                          x)  # conv2D, depthwise
+            x = self.channel_chunked_conv(
+                self.conv[i * 3 + 2], new_c, x
+            )  # conv2D, depthwise
 
             # splitting pointwise convs by time
             x = torch.cat(
-                [
-                    self.conv[i * 3 + 3](chunk)
-                    for chunk in torch.split(x, new_t, 2)
-                ],
+                [self.conv[i * 3 + 3](chunk) for chunk in torch.split(x, new_t, 2)],
                 2,
             )  # conv2D, pointwise
             x = self.conv[i * 3 + 4](x)  # activation
         return x
 
-    def channel_chunked_conv(self, conv: torch.nn.Module, chunk_size: int,
-                             x: Tensor) -> Tensor:
+    def channel_chunked_conv(
+        self, conv: torch.nn.Module, chunk_size: int, x: Tensor
+    ) -> Tensor:
         """Performs channel chunked convolution"""
 
         ind = 0
@@ -1524,8 +1545,8 @@ class NemoConvSubsampling(torch.nn.Module):
                 )
                 ch_out = nn.functional.conv2d(
                     chunk,
-                    conv.weight[ind:ind + step, :, :, :],
-                    bias=conv.bias[ind:ind + step],
+                    conv.weight[ind : ind + step, :, :, :],
+                    bias=conv.bias[ind : ind + step],
                     stride=self._stride,
                     padding=0,
                     groups=step,
@@ -1533,8 +1554,8 @@ class NemoConvSubsampling(torch.nn.Module):
             else:
                 ch_out = nn.functional.conv2d(
                     chunk,
-                    conv.weight[ind:ind + step, :, :, :],
-                    bias=conv.bias[ind:ind + step],
+                    conv.weight[ind : ind + step, :, :, :],
+                    bias=conv.bias[ind : ind + step],
                     stride=self._stride,
                     padding=self._left_padding,
                     groups=step,
@@ -1545,30 +1566,33 @@ class NemoConvSubsampling(torch.nn.Module):
         return torch.cat(out_chunks, 1)
 
     def change_subsampling_conv_chunking_factor(
-            self, subsampling_conv_chunking_factor: int) -> None:
-        if (subsampling_conv_chunking_factor != -1
-                and subsampling_conv_chunking_factor != 1
-                and subsampling_conv_chunking_factor % 2 != 0):
+        self, subsampling_conv_chunking_factor: int
+    ) -> None:
+        if (
+            subsampling_conv_chunking_factor != -1
+            and subsampling_conv_chunking_factor != 1
+            and subsampling_conv_chunking_factor % 2 != 0
+        ):
             raise ValueError(
-                "subsampling_conv_chunking_factor should be -1, 1, or a "\
-                    "power of 2"
+                "subsampling_conv_chunking_factor should be -1, 1, or a power of 2"
             )
         self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
 
 
-def calc_length(lengths: Tensor,
-                all_paddings: int,
-                kernel_size: int,
-                stride: int,
-                ceil_mode: bool,
-                repeat_num: int = 1) -> Tensor:
+def calc_length(
+    lengths: Tensor,
+    all_paddings: int,
+    kernel_size: int,
+    stride: int,
+    ceil_mode: bool,
+    repeat_num: int = 1,
+) -> Tensor:
     """Calculates the output length of a Tensor passed through a convolution or
-      max pooling layer"""
+    max pooling layer"""
     add_pad: float = all_paddings - kernel_size
     one: float = 1.0
     for i in range(repeat_num):
-        lengths = (torch.div(lengths.to(dtype=torch.float) + add_pad, stride) +
-                   one)
+        lengths = torch.div(lengths.to(dtype=torch.float) + add_pad, stride) + one
         lengths = torch.ceil(lengths) if ceil_mode else torch.floor(lengths)
     return lengths.to(dtype=torch.int)
 
@@ -1588,10 +1612,10 @@ class AttModule(nn.Module):
     def forward(
         self,
         x: Tensor,
-        memory: Optional[Tensor] = None,
-        pos_emb: Optional[Tensor] = None,
-        att_mask: Optional[Tensor] = None,
-    ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+        memory: Tensor | None = None,
+        pos_emb: Tensor | None = None,
+        att_mask: Tensor | None = None,
+    ) -> tuple[Tensor, Tensor, Tensor | None, Tensor | None]:
         """AttModule forward
 
         Args:
@@ -1613,20 +1637,21 @@ class AttBlock(BlockBase, AttModule):
 
 def masked_softmax(
     scores: Tensor,
-    mask: Optional[Tensor],
+    mask: Tensor | None,
 ) -> Tensor:
     if mask is not None:
         mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
         scores = scores.masked_fill(mask, -torch.inf)
         attn = torch.softmax(scores, dim=-1).masked_fill(
-            mask, 0.0)  # (batch, head, time1, time2)
+            mask, 0.0
+        )  # (batch, head, time1, time2)
     else:
         attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
     return attn
 
 
 class MultiHeadedAttention(nn.Module):
-    """Multi-Head Attention layer with optional relative position embedding 
+    """Multi-Head Attention layer with optional relative position embedding
     and GLU.
 
     Args:
@@ -1642,12 +1667,12 @@ class MultiHeadedAttention(nn.Module):
             default: -1 (equal to n_feat).
         use_pt_scaled_dot_product_attention: bool, optional
             if set True, use pytorch scaled dot product attention in training.
-            NOTE: this will NOT be used in ONNX decoding due to a lack of 
-            support.  In that case, we use the original attention 
+            NOTE: this will NOT be used in ONNX decoding due to a lack of
+            support.  In that case, we use the original attention
             implementation, which shows no regression.
             default: False.
         n_value: int, optional
-            if set to values other than -1, use a different dimension for 
+            if set to values other than -1, use a different dimension for
             value. With the default value (i.e. -1), it is backward compatible.
         group_size: int, optional. must divide `n_head`
             if group_size > 1:       GQA
@@ -1692,11 +1717,10 @@ class MultiHeadedAttention(nn.Module):
         self.linear_v = nn.Linear(n_value, attention_inner_dim // group_size)
         self.linear_out = nn.Linear(attention_inner_dim // group_size, n_value)
 
-        self.attn = torch.jit.Attribute(None, Optional[Tensor])
+        self.attn = torch.jit.Attribute(None, Tensor | None)
         self.dropout = nn.Dropout(p=dropout_rate)
         self.dropout_rate = dropout_rate
-        self.use_pt_scaled_dot_product_attention = (
-            use_pt_scaled_dot_product_attention)
+        self.use_pt_scaled_dot_product_attention = use_pt_scaled_dot_product_attention
 
         if use_pt_scaled_dot_product_attention and group_size > 1:
             raise ValueError("Cannot use PT Scaled Attention with GQA")
@@ -1714,10 +1738,10 @@ class MultiHeadedAttention(nn.Module):
         query: Tensor,
         key: Tensor,
         value: Tensor,
-        pos_k: Optional[Tensor],
-        pos_v: Optional[Tensor],
-        mask: Optional[Tensor],
-        relative_attention_bias: Optional[Tensor] = None,
+        pos_k: Tensor | None,
+        pos_v: Tensor | None,
+        mask: Tensor | None,
+        relative_attention_bias: Tensor | None = None,
     ) -> Tensor:
         """Compute 'Scaled Dot Product Attention'.
 
@@ -1728,25 +1752,24 @@ class MultiHeadedAttention(nn.Module):
             pos_k: key tensor used for relative positional embedding.
             pos_v: value tensor used for relative positional embedding.
             mask: mask tensor (batch, time1, time2)
-            relative_attention_bias: bias added to attention logits w.r.t. 
+            relative_attention_bias: bias added to attention logits w.r.t.
                 relative positions
                 (1, n_head, time1, time2)
         """
         n_batch = query.size(0)
 
-        q = self.linear_q(query).view(n_batch, -1, self.h,
-                                      self.d_k)  # (b, t, d)
-        k = self.linear_k(key).view(n_batch, -1, self.h_k,
-                                    self.d_k)  # (b, t, d)
+        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)  # (b, t, d)
+        k = self.linear_k(key).view(n_batch, -1, self.h_k, self.d_k)  # (b, t, d)
         v = self.linear_v(value).view(n_batch, -1, self.h_k, self.d_k)
-        q = (q.transpose(1, 2) if self.use_pt_scaled_dot_product_attention
-             and not torch.jit.is_scripting() else q.transpose(1, 2) *
-             self.inv_sqrt_d_k)
+        q = (
+            q.transpose(1, 2)
+            if self.use_pt_scaled_dot_product_attention and not torch.jit.is_scripting()
+            else q.transpose(1, 2) * self.inv_sqrt_d_k
+        )
         k = k.transpose(1, 2)  # (batch, head_k, time2, d_k)
         v = v.transpose(1, 2)  # (batch, head_k, time2, d_k)
 
-        if (self.use_pt_scaled_dot_product_attention
-                and not torch.jit.is_scripting()):
+        if self.use_pt_scaled_dot_product_attention and not torch.jit.is_scripting():
             attn_mask = None
             if mask is not None:
                 mask = mask.unsqueeze(1)
@@ -1757,12 +1780,14 @@ class MultiHeadedAttention(nn.Module):
                 if mask.dtype != q.dtype:
                     attn_mask = attn_mask.to(q.dtype)
 
-            with torch.nn.attention.sdpa_kernel([
+            with torch.nn.attention.sdpa_kernel(
+                [
                     torch.nn.attention.SDPBackend.FLASH_ATTENTION,
                     torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
                     torch.nn.attention.SDPBackend.MATH,
                     torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
-            ]):
+                ]
+            ):
                 x = torch.nn.functional.scaled_dot_product_attention(
                     q,
                     k,
@@ -1780,14 +1805,17 @@ class MultiHeadedAttention(nn.Module):
                 if self.h != self.h_k:
                     B = torch.einsum("b g h t d, t s d -> b h t s", q, pos_k)
                 else:
-                    reshape_q = (q.contiguous().view(n_batch * self.h, -1,
-                                                     self.d_k).transpose(0, 1)
-                                 )  # (t1,nh,dk)
-                    B = torch.matmul(reshape_q,
-                                     pos_k.transpose(-2,
-                                                     -1))  # pos_k: (t1,dk,t2)
-                    B = B.transpose(0, 1).view(n_batch, self.h, pos_k.size(0),
-                                               pos_k.size(1))
+                    reshape_q = (
+                        q.contiguous()
+                        .view(n_batch * self.h, -1, self.d_k)
+                        .transpose(0, 1)
+                    )  # (t1,nh,dk)
+                    B = torch.matmul(
+                        reshape_q, pos_k.transpose(-2, -1)
+                    )  # pos_k: (t1,dk,t2)
+                    B = B.transpose(0, 1).view(
+                        n_batch, self.h, pos_k.size(0), pos_k.size(1)
+                    )
                 scores = A + B
             else:
                 scores = A
@@ -1800,20 +1828,24 @@ class MultiHeadedAttention(nn.Module):
             self.attn = attn
 
             p_attn = self.dropout(attn)
-            x = torch.matmul(p_attn.to(v.dtype),
-                             v)  # (batch, head, time1, d_k)
+            x = torch.matmul(p_attn.to(v.dtype), v)  # (batch, head, time1, d_k)
             if pos_v is not None:
-                reshape_attn = (p_attn.contiguous().view(
-                    n_batch * self.h, pos_v.size(0),
-                    pos_v.size(1)).transpose(0, 1))  # (t1, bh, t2)
-
-                attn_v = (torch.matmul(reshape_attn, pos_v).transpose(
-                    0, 1).contiguous().view(n_batch, self.h, pos_v.size(0),
-                                            self.d_k))
+                reshape_attn = (
+                    p_attn.contiguous()
+                    .view(n_batch * self.h, pos_v.size(0), pos_v.size(1))
+                    .transpose(0, 1)
+                )  # (t1, bh, t2)
+
+                attn_v = (
+                    torch.matmul(reshape_attn, pos_v)
+                    .transpose(0, 1)
+                    .contiguous()
+                    .view(n_batch, self.h, pos_v.size(0), self.d_k)
+                )
                 x = x + attn_v
-        x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
-                                                 self.h_k * self.d_k)
-             )  # (batch, time1, d_model)
+        x = (
+            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h_k * self.d_k)
+        )  # (batch, time1, d_model)
 
         return self.linear_out(x)  # (batch, time1, d_model)
 
@@ -1830,7 +1862,7 @@ class MultiSequential(torch.nn.Sequential):
 
 
 def get_offset(input_layer: str, time_reduction: int) -> int:
-    """Get an offset. We will use the offset for determining #frames of a 
+    """Get an offset. We will use the offset for determining #frames of a
     subsampled feature.
 
     Args:
@@ -1841,7 +1873,7 @@ def get_offset(input_layer: str, time_reduction: int) -> int:
     """
     if input_layer in ("conv2d", "nemo_conv") and time_reduction == 4:
         return 3
-    if input_layer in ("conv2d", ) and time_reduction == 6:
+    if input_layer in ("conv2d",) and time_reduction == 6:
         return 1
     if input_layer in ("conv2d", "nemo_conv") and time_reduction == 8:
         return 7
@@ -1850,8 +1882,8 @@ def get_offset(input_layer: str, time_reduction: int) -> int:
 
 def unfold_tensor(xs_pad: Tensor, max_seq_len: int) -> Tensor:
     """
-    For a given tensor with shape of (N, T, D), if sequence length T is 
-    longer than max_seq_len, this function unfold it to a 
+    For a given tensor with shape of (N, T, D), if sequence length T is
+    longer than max_seq_len, this function unfold it to a
     (NT', max_seq_len, D) where T' is T // max_seq_len.
     Args:
         xs_pad: input tensor with shape (N, T, D)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 3ce67ce37a7ab56b4d785679235bea6e3f4f99d0..92fd858b608bc40b362e24f5b11389a152ab9a02 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -23,9 +23,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only PhiMoE model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -36,27 +36,35 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class PhiMoEConfig(PretrainedConfig):
-
     model_type = "phimoe"
     keys_to_ignore_at_inference = ["past_key_values"]
 
@@ -129,7 +137,6 @@ class PhiMoEConfig(PretrainedConfig):
 
 
 class mp(torch.autograd.Function):
-
     @staticmethod
     def forward(
         ctx,
@@ -174,8 +181,9 @@ def sparsemixer(scores, jitter_eps=0.01):
         # compute mask for sparsity
         mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
         factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = ((mask_logits_threshold - scores) /
-                                 factor) > (2 * jitter_eps)
+        mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (
+            2 * jitter_eps
+        )
 
     # apply mask
     masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
@@ -196,24 +204,21 @@ def sparsemixer(scores, jitter_eps=0.01):
     )
     with torch.no_grad():
         # compute mask for sparsity
-        mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
-                                                           keepdim=True)
+        mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True)
         factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = ((mask_logits_threshold - scores) /
-                                 factor) > (2 * jitter_eps)
+        mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (
+            2 * jitter_eps
+        )
 
     # apply mask
-    masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,
-                                                  float("-inf"))
+    masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, float("-inf"))
     selected_experts_top2 = max_ind
     # compute scores for gradients
     masked_gates_top2 = torch.softmax(masked_gates_top2, dim=-1)
-    multiplier_top2 = masked_gates_top2.gather(dim=-1,
-                                               index=selected_experts_top2)
+    multiplier_top2 = masked_gates_top2.gather(dim=-1, index=selected_experts_top2)
 
     multiplier = torch.concat((multiplier, multiplier_top2), dim=-1)
-    selected_experts = torch.concat((selected_experts, selected_experts_top2),
-                                    dim=-1)
+    selected_experts = torch.concat((selected_experts, selected_experts_top2), dim=-1)
 
     return (
         multiplier,
@@ -227,8 +232,7 @@ def phimoe_routing_function(
     topk: int,
     renormalize: bool,
 ):
-    assert hidden_states.shape[0] == gating_output.shape[0], (
-        "Number of tokens mismatch")
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     assert topk == 2, "Only top-2 routing is supported"
     assert renormalize is False, "Renormalization is not supported"
 
@@ -251,9 +255,9 @@ class PhiMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -279,7 +283,8 @@ class PhiMoE(nn.Module):
             quant_config=quant_config,
             tp_size=tp_size,
             custom_routing_function=phimoe_routing_function,
-            prefix=f"{prefix}.experts")
+            prefix=f"{prefix}.experts",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -292,18 +297,17 @@ class PhiMoE(nn.Module):
 
 
 class PhiMoEAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[dict] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: dict | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -338,12 +342,14 @@ class PhiMoEAttention(nn.Module):
             self.total_num_kv_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -377,12 +383,11 @@ class PhiMoEAttention(nn.Module):
 
 
 class PhiMoEDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PhiMoEConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -394,8 +399,9 @@ class PhiMoEDecoderLayer(nn.Module):
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
-            head_dim=getattr(config, "head_dim",
-                             self.hidden_size // config.num_attention_heads),
+            head_dim=getattr(
+                config, "head_dim", self.hidden_size // config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             cache_config=cache_config,
             quant_config=quant_config,
@@ -410,18 +416,18 @@ class PhiMoEDecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.block_sparse_moe",
         )
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.rms_norm_eps,
-                                            elementwise_affine=True)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=config.rms_norm_eps,
-                                                     elementwise_affine=True)
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         residual = hidden_states
 
@@ -445,55 +451,52 @@ class PhiMoEDecoderLayer(nn.Module):
 
 @support_torch_compile
 class PhiMoEModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+        self.vocab_size = config.vocab_size
+
         self.config = config
         self.quant_config = quant_config
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: PhiMoEDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.layers")
-        self.norm = nn.LayerNorm(config.hidden_size,
-                                 eps=config.rms_norm_eps,
-                                 elementwise_affine=True)
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -508,10 +511,9 @@ class PhiMoEModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states = self.norm(hidden_states)
         return hidden_states
@@ -524,8 +526,7 @@ class PhiMoEModel(nn.Module):
             num_experts=self.config.num_local_experts,
         )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -537,14 +538,15 @@ class PhiMoEModel(nn.Module):
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -595,8 +597,9 @@ class PhiMoEModel(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -623,55 +626,48 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
-        self.lora_config = lora_config
+
         self.quant_config = vllm_config.quant_config
 
-        self.model = PhiMoEModel(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = PhiMoEModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=(
-                DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size),
             quant_config=None,
             bias=True,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 7b197844c8b637338b12fbd73028845b425dbf0a..8cb7d6a889da41b38018cfe1bc0f1b29e5a04114 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -5,59 +5,77 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass, fields
 from functools import cached_property
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
+from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
+from mistral_common.protocol.instruct.messages import UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.multimodal import ImageEncoder
 from PIL import Image
 from transformers import BatchFeature, PixtralVisionConfig, TensorType
 from transformers.image_utils import ImageInput
 from transformers.models.pixtral.image_processing_pixtral import (
-    _num_image_tokens as _get_pixtral_hf_num_image_tokens)
+    _num_image_tokens as _get_pixtral_hf_num_image_tokens,
+)
 from transformers.models.pixtral.modeling_pixtral import (
-    PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid)
+    PixtralRotaryEmbedding,
+    apply_rotary_pos_emb,
+    position_ids_in_meshgrid,
+)
 from transformers.tokenization_utils_base import TextInput
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalUUIDDict, NestedTensors)
-from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalProcessingInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalUUIDDict,
+    NestedTensors,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import (MistralTokenizer,
-                                               cached_tokenizer_from_config)
+from vllm.transformers_utils.tokenizer import (
+    MistralTokenizer,
+    cached_tokenizer_from_config,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
-from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
+from .utils import init_vllm_registered_model, maybe_prefix
+from .vision import (
+    VisionEncoderInfo,
+    VisionFeatureSelectStrategy,
+    resolve_visual_encoder_outputs,
+)
 
 try:
     from xformers import ops as xops
-    if (current_platform.is_cuda()
-            and current_platform.has_device_capability(100)):
+
+    if current_platform.is_cuda() and current_platform.has_device_capability(100):
         # Xformers FA is not compatible with B200
         USE_XFORMERS_OPS = False
     else:
@@ -75,13 +93,16 @@ class PixtralImagePixelInputs(TensorSchema):
         - c: Number of channels (3)
         - h: Height of each image
         - w: Width of each image
-    
+
     The result of stacking `ImageEncoding.tokens` from each prompt.
     """
+
     type: Literal["pixel_values"] = "pixel_values"
 
-    images: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                      TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"})]
+    images: Annotated[
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}),
+    ]
 
 
 class PixtralProcessorAdapter:
@@ -123,9 +144,9 @@ class PixtralProcessorAdapter:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> Mapping[str, NestedTensors]:
         if text is None:
@@ -149,7 +170,8 @@ class PixtralProcessorAdapter:
                 "Make sure to process your input via `mistral_common`'s "
                 "tokenizer or pass a chat completion request. "
                 "For more info, see: "
-                "https://github.com/vllm-project/vllm/issues/8411.")
+                "https://github.com/vllm-project/vllm/issues/8411."
+            )
 
         images_processed = list[torch.Tensor]()
         images_tokens = list[torch.Tensor]()
@@ -162,16 +184,15 @@ class PixtralProcessorAdapter:
             images_processed.append(image_processed)
             images_tokens.append(image_tokens)
 
-        return BatchFeature({
-            "input_ids":
-            torch.cat(images_tokens)[None].expand(len(text), -1),
-            "images":
-            images_processed,
-        })
+        return BatchFeature(
+            {
+                "input_ids": torch.cat(images_tokens)[None].expand(len(text), -1),
+                "images": images_processed,
+            }
+        )
 
 
 class PixtralProcessingInfo(BaseProcessingInfo):
-
     def get_tokenizer(self) -> MistralTokenizer:
         tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
         if not isinstance(tokenizer, MistralTokenizer):
@@ -182,12 +203,12 @@ class PixtralProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self) -> PixtralProcessorAdapter:
         return PixtralProcessorAdapter(self.get_tokenizer())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_vision_config(
         self,
-        processor: Optional[PixtralProcessorAdapter] = None,
+        processor: PixtralProcessorAdapter | None = None,
     ):
         if processor is None:
             processor = self.get_hf_processor()
@@ -202,13 +223,14 @@ class PixtralProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[PixtralProcessorAdapter] = None,
+        processor: PixtralProcessorAdapter | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
 
         ncols, nrows = processor.image_processor._image_to_num_tokens(
-            Image.new("RGB", (image_width, image_height)))
+            Image.new("RGB", (image_width, image_height))
+        )
 
         return ncols * nrows
 
@@ -220,7 +242,6 @@ class PixtralProcessingInfo(BaseProcessingInfo):
 
 
 class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
 
@@ -228,48 +249,57 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> ProcessorInputs:
         tokenizer = self.info.get_tokenizer()
 
         dummy_text = self.get_dummy_text(mm_counts)
-        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
+        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
         dummy_images = dummy_mm_data.get("image", [])
         tokenization_kwargs = {"truncation": False}
 
-        request = ChatCompletionRequest(messages=[
-            UserMessage(content=[
-                TextChunk(text=dummy_text),
-                *(ImageChunk(image=image) for image in dummy_images),
-            ]),
-        ])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(
+                    content=[
+                        TextChunk(text=dummy_text),
+                        *(ImageChunk(image=image) for image in dummy_images),
+                    ]
+                ),
+            ]
+        )
         res = tokenizer.mistral.encode_chat_completion(request)
         dummy_tokens = res.tokens
 
-        return ProcessorInputs(prompt=dummy_tokens,
-                               mm_data=dummy_mm_data,
-                               tokenization_kwargs=tokenization_kwargs)
-
+        return ProcessorInputs(
+            prompt=dummy_tokens,
+            mm_data=dummy_mm_data,
+            tokenization_kwargs=tokenization_kwargs,
+        )
 
-class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
-                                 ):
 
+class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]):
     def _get_mm_fields_config(
         self,
         hf_inputs: Mapping[str, NestedTensors],
@@ -294,7 +324,8 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
             image_size = images.get_image_size(item_idx)
 
             ncols, nrows = processor.image_processor._image_to_num_tokens(
-                Image.new("RGB", (image_size.width, image_size.height)))
+                Image.new("RGB", (image_size.width, image_size.height))
+            )
 
             tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
             tokens[-1] = image_end_id
@@ -311,11 +342,11 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
@@ -329,14 +360,16 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
         return prompt_ids, mm_info, True
 
 
-@MULTIMODAL_REGISTRY.register_processor(PixtralMultiModalProcessor,
-                                        info=PixtralProcessingInfo,
-                                        dummy_inputs=PixtralDummyInputsBuilder)
-class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsPP):
+@MULTIMODAL_REGISTRY.register_processor(
+    PixtralMultiModalProcessor,
+    info=PixtralProcessingInfo,
+    dummy_inputs=PixtralDummyInputsBuilder,
+)
+class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -368,8 +401,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.vision_encoder = VisionTransformer(self.vision_args)
 
         if self.vision_args.add_pre_mm_projector_layer_norm:
-            self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size,
-                                                 eps=1e-5)
+            self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size, eps=1e-5)
 
         if self.vision_args.mm_projector_id == PATCH_MERGE:
             self.patch_merger = PatchMerger(
@@ -379,20 +411,23 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
         self.vision_language_adapter = VisionLanguageAdapter(
-            self.vision_args, dim=config.text_config.hidden_size)
+            self.vision_args, dim=config.text_config.hidden_size
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[PixtralImagePixelInputs]:
+        self, **kwargs: object
+    ) -> PixtralImagePixelInputs | None:
         images = kwargs.pop("images", None)
         if images is None:
             return None
 
         return PixtralImagePixelInputs(
             type="pixel_values",
-            images=flatten_bn(images),
+            images=images,
         )
 
     def _process_image_input(
@@ -401,23 +436,24 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
     ) -> tuple[torch.Tensor, ...]:
         images = image_input["images"]
         image_features = self.vision_encoder(images)
-        feature_sizes = [
-            image_feature.shape[0] for image_feature in image_features
-        ]
+        feature_sizes = [image_feature.shape[0] for image_feature in image_features]
         image_features = torch.cat(image_features)
         if self.vision_args.add_pre_mm_projector_layer_norm:
             image_features = self.pre_mm_projector_norm(image_features)
         if self.vision_args.mm_projector_id == PATCH_MERGE:
             patch_size = self.vision_args.patch_size
             spatial_merge_size_square = self.vision_args.spatial_merge_size**2
-            img_patch_dims = [(img.shape[1] // patch_size,
-                               img.shape[2] // patch_size) for img in images]
+            img_patch_dims = [
+                (img.shape[1] // patch_size, img.shape[2] // patch_size)
+                for img in images
+            ]
             feature_sizes = [
                 feature_size // spatial_merge_size_square
                 for feature_size in feature_sizes
             ]
-            image_features = self.patch_merger(image_features,
-                                               image_sizes=img_patch_dims)
+            image_features = self.patch_merger(
+                image_features, image_sizes=img_patch_dims
+            )
         image_embeds = self.vision_language_adapter(image_features)
         image_embeds = torch.split(image_embeds, feature_sizes)
         return image_embeds
@@ -425,65 +461,38 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.vision_args.image_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for pixtral."""
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]):
             return weight[0].startswith("vision_encoder")
 
@@ -498,38 +507,42 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         # Get references to parameters for direct loading
         vision_encoder_dict = dict(self.vision_encoder.named_parameters())
-        patch_merger_dict = dict(self.patch_merger.named_parameters(
-        )) if self.vision_args.mm_projector_id == PATCH_MERGE else dict()
-        pre_mm_projector_norm_dict = dict(
-            self.pre_mm_projector_norm.named_parameters(
-            )) if self.vision_args.add_pre_mm_projector_layer_norm else dict()
-        vision_lang_adapter_dict = dict(
-            self.vision_language_adapter.named_parameters())
+        patch_merger_dict = (
+            dict(self.patch_merger.named_parameters())
+            if self.vision_args.mm_projector_id == PATCH_MERGE
+            else dict()
+        )
+        pre_mm_projector_norm_dict = (
+            dict(self.pre_mm_projector_norm.named_parameters())
+            if self.vision_args.add_pre_mm_projector_layer_norm
+            else dict()
+        )
+        vision_lang_adapter_dict = dict(self.vision_language_adapter.named_parameters())
 
         def llm_weights_generator():
             # Single pass over weights
             for name, w in weights:
                 if is_vision_encoder_weights((name, w)):
                     # Load vision encoder weights directly
-                    trimmed_name = '.'.join(name.split(".")[1:])
+                    trimmed_name = ".".join(name.split(".")[1:])
                     param = vision_encoder_dict[trimmed_name]
                     with torch.no_grad():
                         default_weight_loader(param, w)
                 elif is_patch_merger((name, w)):
                     # Load vision patch merger weights directly
-                    trimmed_name = '.'.join(name.split(".")[1:])
+                    trimmed_name = ".".join(name.split(".")[1:])
                     param = patch_merger_dict[trimmed_name]
                     with torch.no_grad():
                         default_weight_loader(param, w)
                 elif is_pre_mm_projector_norm((name, w)):
                     # Load vision pre_mm_projector_norm weights directly
-                    trimmed_name = '.'.join(name.split(".")[1:])
+                    trimmed_name = ".".join(name.split(".")[1:])
                     param = pre_mm_projector_norm_dict[trimmed_name]
                     with torch.no_grad():
                         default_weight_loader(param, w)
                 elif is_vision_lang_adapter_weights((name, w)):
                     # Load vision-language adapter weights directly
-                    trimmed_name = '.'.join(name.split(".")[1:])
+                    trimmed_name = ".".join(name.split(".")[1:])
                     param = vision_lang_adapter_dict[trimmed_name]
                     with torch.no_grad():
                         default_weight_loader(param, w)
@@ -560,8 +573,7 @@ class VisionEncoderArgs:
     mm_projector_id: str = ""
 
 
-def _reshape_for_broadcast(freqs_cis: torch.Tensor,
-                           x: torch.Tensor) -> torch.Tensor:
+def _reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
     """
     freqs_cis: complex - (seq_len, head_dim / 2)
     x: complex - (bsz, seq_len, head_dim / 2)
@@ -572,9 +584,7 @@ def _reshape_for_broadcast(freqs_cis: torch.Tensor,
         freqs_cis.shape,
         (x.shape[1], x.shape[-1]),
     )
-    shape = [
-        d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)
-    ]
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
     return freqs_cis.view(*shape)
 
 
@@ -589,7 +599,7 @@ def precompute_freqs_cis_2d(
         to be indexed by (height, width) position tuples
     """
     # (dim / 2) frequency bases
-    freqs = 1.0 / (theta**(torch.arange(0, dim, 2).float() / dim))
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
 
     h = torch.arange(height, device=freqs.device)
     w = torch.arange(width, device=freqs.device)
@@ -621,26 +631,18 @@ def apply_rotary_emb_vit(
 
 
 class FeedForward(nn.Module):
-
     def __init__(self, args: VisionEncoderArgs):
         super().__init__()
         assert args.intermediate_size is not None
-        self.w1 = nn.Linear(args.hidden_size,
-                            args.intermediate_size,
-                            bias=False)
-        self.w2 = nn.Linear(args.intermediate_size,
-                            args.hidden_size,
-                            bias=False)
-        self.w3 = nn.Linear(args.hidden_size,
-                            args.intermediate_size,
-                            bias=False)
+        self.w1 = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
+        self.w2 = nn.Linear(args.intermediate_size, args.hidden_size, bias=False)
+        self.w3 = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
 
 
 class Attention(nn.Module):
-
     def __init__(self, args: VisionEncoderArgs):
         super().__init__()
         self.args = args
@@ -674,10 +676,7 @@ class Attention(nn.Module):
             q = q.transpose(1, 2)
             k = k.transpose(1, 2)
             v = v.transpose(1, 2)
-            out = nn.functional.scaled_dot_product_attention(q,
-                                                             k,
-                                                             v,
-                                                             attn_mask=mask)
+            out = nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
             out = out.transpose(1, 2)
 
         out = out.reshape(batch, patches, self.n_heads * self.head_dim)
@@ -685,7 +684,6 @@ class Attention(nn.Module):
 
 
 class TransformerBlock(nn.Module):
-
     def __init__(self, args: VisionEncoderArgs):
         super().__init__()
         self.attention = Attention(args)
@@ -699,9 +697,9 @@ class TransformerBlock(nn.Module):
         mask: torch.Tensor,
         freqs_cis: torch.Tensor,
     ) -> torch.Tensor:
-        r = self.attention.forward(self.attention_norm(x),
-                                   mask=mask,
-                                   freqs_cis=freqs_cis)
+        r = self.attention.forward(
+            self.attention_norm(x), mask=mask, freqs_cis=freqs_cis
+        )
         h = x + r
         r = self.feed_forward.forward(self.ffn_norm(h))
         out = h + r
@@ -709,7 +707,6 @@ class TransformerBlock(nn.Module):
 
 
 class Transformer(nn.Module):
-
     def __init__(self, args: VisionEncoderArgs):
         super().__init__()
         self.layers = torch.nn.ModuleList()
@@ -720,29 +717,33 @@ class Transformer(nn.Module):
         self,
         x: torch.Tensor,
         mask: torch.Tensor,
-        freqs_cis: Optional[torch.Tensor],
+        freqs_cis: torch.Tensor | None,
     ) -> torch.Tensor:
         for layer in self.layers:
             x = layer(x, mask=mask, freqs_cis=freqs_cis)
         return x
 
 
-def position_meshgrid(patch_embeds_list: list[torch.Tensor], ) -> torch.Tensor:
-    positions = torch.cat([
-        torch.stack(
-            torch.meshgrid(
-                torch.arange(p.shape[-2]),
-                torch.arange(p.shape[-1]),
-                indexing="ij",
-            ),
-            dim=-1,
-        ).reshape(-1, 2) for p in patch_embeds_list
-    ])
+def position_meshgrid(
+    patch_embeds_list: list[torch.Tensor],
+) -> torch.Tensor:
+    positions = torch.cat(
+        [
+            torch.stack(
+                torch.meshgrid(
+                    torch.arange(p.shape[-2]),
+                    torch.arange(p.shape[-1]),
+                    indexing="ij",
+                ),
+                dim=-1,
+            ).reshape(-1, 2)
+            for p in patch_embeds_list
+        ]
+    )
     return positions
 
 
 class VisionTransformer(nn.Module):
-
     def __init__(self, args: VisionEncoderArgs):
         super().__init__()
         self.args = args
@@ -758,7 +759,7 @@ class VisionTransformer(nn.Module):
 
         head_dim = self.args.hidden_size // self.args.num_attention_heads
         assert head_dim % 2 == 0, "ROPE requires even head_dim"
-        self._freqs_cis: Optional[torch.Tensor] = None
+        self._freqs_cis: torch.Tensor | None = None
 
     @property
     def max_patches_per_side(self) -> int:
@@ -804,9 +805,7 @@ class VisionTransformer(nn.Module):
             self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images
         ]
 
-        patch_embeds = [
-            p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list
-        ]
+        patch_embeds = [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list]
         embed_sizes = [p.shape[1] for p in patch_embeds]
 
         # flatten to a single sequence
@@ -820,13 +819,16 @@ class VisionTransformer(nn.Module):
         # pass through Transformer with a block diagonal mask delimiting images
         if USE_XFORMERS_OPS:
             mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], )
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list],
+            )
         else:
             from transformers.models.pixtral.modeling_pixtral import (
-                generate_block_attention_mask)
+                generate_block_attention_mask,
+            )
+
             mask = generate_block_attention_mask(
-                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list],
-                patch_embeds)
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
+            )
         out = self.transformer(patch_embeds, mask=mask, freqs_cis=freqs_cis)
 
         # squeeze dim 0 and split into separate tensors for each image
@@ -834,7 +836,6 @@ class VisionTransformer(nn.Module):
 
 
 class VisionLanguageAdapter(nn.Module):
-
     def __init__(self, args: VisionEncoderArgs, dim: int):
         super().__init__()
         assert isinstance(args, VisionEncoderArgs)
@@ -874,8 +875,9 @@ class PatchMerger(nn.Module):
             bias=use_mlp_bias,
         )
 
-    def forward(self, x: torch.Tensor,
-                image_sizes: list[tuple[int, int]]) -> torch.Tensor:
+    def forward(
+        self, x: torch.Tensor, image_sizes: list[tuple[int, int]]
+    ) -> torch.Tensor:
         # image_sizes specified in tokens
         assert sum([h * w for h, w in image_sizes]) == len(x)
 
@@ -907,15 +909,14 @@ class PatchMerger(nn.Module):
         """
 
         sub_grids = get_sub_grids(
-            x=x,
-            image_sizes=image_sizes,
-            spatial_merge_size=self.spatial_merge_size
+            x=x, image_sizes=image_sizes, spatial_merge_size=self.spatial_merge_size
         )  # list of [d x sub_grid_size x sub_grid_size x n_patches]
         permuted_tensor: list[torch.Tensor] = []
         for grid in sub_grids:
             n_patches = grid.shape[-1]
-            permuted_tensor.append(grid.view(-1, n_patches).t(
-            ))  # n_patches x d * sub_grid_size * sub_grid_size
+            permuted_tensor.append(
+                grid.view(-1, n_patches).t()
+            )  # n_patches x d * sub_grid_size * sub_grid_size
         return torch.cat(
             permuted_tensor, dim=0
         )  # (N / spatial_merge_size ** 2, d * spatial_merge_size ** 2)
@@ -935,14 +936,15 @@ def get_sub_grids(
     for image_index, image_tokens in enumerate(x.split(tokens_per_image)):
         # Reshape image_tokens into a 2D grid
         h, w = image_sizes[image_index]
-        image_grid = image_tokens.view(h, w, d).permute(
-            2, 0, 1)[None, :, :, :]  # 1 x d x h x w
-        sub_grids = torch.nn.functional.unfold(image_grid,
-                                               kernel_size=sub_grid_size,
-                                               stride=sub_grid_size)
+        image_grid = image_tokens.view(h, w, d).permute(2, 0, 1)[
+            None, :, :, :
+        ]  # 1 x d x h x w
+        sub_grids = torch.nn.functional.unfold(
+            image_grid, kernel_size=sub_grid_size, stride=sub_grid_size
+        )
         sub_grids = sub_grids.view(
-            1, d, sub_grid_size, sub_grid_size,
-            -1)  # 1 x d x sub_grid_size x sub_grid_size x n_patches
+            1, d, sub_grid_size, sub_grid_size, -1
+        )  # 1 x d x sub_grid_size x sub_grid_size x n_patches
 
         all_img_sub_grids.append(sub_grids[0])
 
@@ -958,7 +960,6 @@ def get_sub_grids(
 
 
 class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
-
     def get_num_image_tokens(
         self,
         *,
@@ -1011,11 +1012,10 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
 
 
 class PixtralHFMLP(nn.Module):
-
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1027,12 +1027,15 @@ class PixtralHFMLP(nn.Module):
             output_sizes=[config.intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(input_size=config.intermediate_size,
-                                           output_size=config.hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
         self.act_and_mul = get_act_and_mul_fn(config.hidden_act)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -1043,11 +1046,10 @@ class PixtralHFMLP(nn.Module):
 
 
 class PixtralHFAttention(nn.Module):
-
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1082,7 +1084,7 @@ class PixtralHFAttention(nn.Module):
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
         position_embeddings: torch.Tensor,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         batch, patches, _ = hidden_states.size()
 
         qkv_states, _ = self.qkv_proj(hidden_states)
@@ -1099,40 +1101,37 @@ class PixtralHFAttention(nn.Module):
             # Transpose q and k back for attention
             q = q.transpose(1, 2).contiguous()
             k = k.transpose(1, 2).contiguous()
-            out = xops.memory_efficient_attention(q,
-                                                  k,
-                                                  v,
-                                                  attn_bias=attention_mask)
+            out = xops.memory_efficient_attention(q, k, v, attn_bias=attention_mask)
         else:
             v = v.transpose(1, 2)
             out = nn.functional.scaled_dot_product_attention(
-                q, k, v, attn_mask=attention_mask)
+                q, k, v, attn_mask=attention_mask
+            )
             out = out.transpose(1, 2)
 
-        out = out.view(batch, patches, self.n_heads * self.head_dim)
+        out = out.reshape(batch, patches, self.n_heads * self.head_dim)
         attn_output, _ = self.o_proj(out)
 
         return attn_output, None
 
 
 class PixtralHFTransformerBlock(nn.Module):
-
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
         self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5)
-        self.attention = PixtralHFAttention(config,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.attention")
-        self.feed_forward = PixtralHFMLP(config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.feed_forward")
+        self.attention = PixtralHFAttention(
+            config, quant_config=quant_config, prefix=f"{prefix}.attention"
+        )
+        self.feed_forward = PixtralHFMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
         self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5)
 
     def forward(
@@ -1141,9 +1140,11 @@ class PixtralHFTransformerBlock(nn.Module):
         attention_mask: torch.Tensor,
         position_embeddings: torch.Tensor,
     ) -> torch.Tensor:
-        r, _ = self.attention.forward(self.attention_norm(hidden_states),
-                                      attention_mask=attention_mask,
-                                      position_embeddings=position_embeddings)
+        r, _ = self.attention.forward(
+            self.attention_norm(hidden_states),
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+        )
         h = hidden_states + r
         r = self.feed_forward.forward(self.ffn_norm(h))
         out = h + r
@@ -1151,13 +1152,12 @@ class PixtralHFTransformerBlock(nn.Module):
 
 
 class PixtralHFTransformer(nn.Module):
-
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -1167,12 +1167,16 @@ class PixtralHFTransformer(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layers = nn.ModuleList([
-            PixtralHFTransformerBlock(config=config,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.layers.{layer_idx}")
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                PixtralHFTransformerBlock(
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -1195,14 +1199,13 @@ class PixtralHFTransformer(nn.Module):
 
 
 class PixtralHFVisionModel(nn.Module):
-
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -1229,7 +1232,8 @@ class PixtralHFVisionModel(nn.Module):
             raise ValueError(
                 f"The original encoder only has {num_hidden_layers} "
                 f"layers, but you requested {len(self.transformer.layers)} "
-                "layers.")
+                "layers."
+            )
 
         if require_post_norm is True:
             msg = "PixtralHFVisionModel does not have post-layernorm"
@@ -1237,13 +1241,14 @@ class PixtralHFVisionModel(nn.Module):
 
         self.dtype = next(self.parameters()).dtype
         self.device = next(self.parameters()).device
-        self.patch_positional_embedding = PixtralRotaryEmbedding(
-            config, self.device)
+        self.patch_positional_embedding = PixtralRotaryEmbedding(config, self.device)
 
     def forward(
         self,
         pixel_values: list[torch.Tensor],
-        feature_sample_layers: Optional[list[int]] = None,
+        *,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> tuple[torch.Tensor, ...]:
         """
         Args:
@@ -1251,7 +1256,7 @@ class PixtralHFVisionModel(nn.Module):
                 in pixel_values. This means it will be a list of tensors
                 because multiple requests batched can have multiple images,
                 each with their own shape potentially
-            feature_sample_layers: Layer indices whose features should be
+            select_layers: Layer indices whose features should be
                 concatenated and used as the visual encoder output. If none
                 are provided, the last layer is used.
 
@@ -1261,13 +1266,10 @@ class PixtralHFVisionModel(nn.Module):
         """
         # pass images through initial convolution independently
         patch_embeds_list = [
-            self.patch_conv(img.unsqueeze(0).to(self.dtype))
-            for img in pixel_values
+            self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in pixel_values
         ]
 
-        patch_embeds = [
-            p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list
-        ]
+        patch_embeds = [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list]
         embed_sizes = [p.shape[1] for p in patch_embeds]
 
         # flatten to a single sequence
@@ -1277,38 +1279,44 @@ class PixtralHFVisionModel(nn.Module):
         # positional embeddings
         position_ids = position_ids_in_meshgrid(
             patch_embeds_list,
-            max_width=self.config.image_size // self.config.patch_size).to(
-                self.device)
-        position_embedding = self.patch_positional_embedding(
-            patch_embeds, position_ids)
+            max_width=self.config.image_size // self.config.patch_size,
+        ).to(self.device)
+        position_embedding = self.patch_positional_embedding(patch_embeds, position_ids)
 
         if USE_XFORMERS_OPS:
             attention_mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], )
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list],
+            )
         else:
             from transformers.models.pixtral.modeling_pixtral import (
-                generate_block_attention_mask)
+                generate_block_attention_mask,
+            )
+
             attention_mask = generate_block_attention_mask(
-                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list],
-                patch_embeds)
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
+            )
 
-        return_all_hidden_states = feature_sample_layers is not None
         out = self.transformer(
             patch_embeds,
             attention_mask,
             position_embedding,
-            return_all_hidden_states=return_all_hidden_states)
+            return_all_hidden_states=select_layers is not None,
+        )
 
-        out = resolve_visual_encoder_outputs(out, feature_sample_layers, None,
-                                             self.config.num_hidden_layers)
+        out = resolve_visual_encoder_outputs(
+            out,
+            None,
+            select_layers=select_layers,
+            max_possible_layers=self.config.num_hidden_layers,
+            feature_select_strategy=feature_select_strategy,
+        )
 
         # squeeze dim 0 and split into separate tensors for each image
         return torch.split(out.squeeze(0), embed_sizes)
 
     # (TODO) Add prefix argument for filtering out weights to be loaded
     #        ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -1328,7 +1336,7 @@ class PixtralHFVisionModel(nn.Module):
                 if layer_idx >= layer_count:
                     continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -1338,8 +1346,7 @@ class PixtralHFVisionModel(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 03265b13de500460643d23142f58e13f42b5f749..0c87f5000ff45a14fb8930a467e2ce91de3dcfa2 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only PLaMo2 model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -22,34 +23,47 @@ from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
-from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_state_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from vllm.model_executor.layers.mamba.ops.mamba_ssm import selective_state_update
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
-    mamba_chunk_scan_combined_varlen)
+    mamba_chunk_scan_combined_varlen,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    composed_weight_loader, default_weight_loader, sharded_weight_loader)
-from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                   SupportsPP)
+    composed_weight_loader,
+    default_weight_loader,
+    sharded_weight_loader,
+)
+from vllm.model_executor.models.interfaces import HasInnerState, IsHybrid, SupportsPP
 from vllm.model_executor.models.utils import (
-    is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
-    make_layers, maybe_prefix)
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 
 
@@ -89,12 +103,7 @@ def is_mamba(config: Plamo2Config, i: int) -> bool:
 # transformers.models.mamba.modeling_mamba.MambaMixer
 @CustomOp.register(name="plamo2_mamba_mixer")
 class Plamo2MambaMixer(MambaBase, CustomOp):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 *,
-                 prefix: str = "",
-                 **kwargs) -> None:
+    def __init__(self, vllm_config: VllmConfig, *, prefix: str = "", **kwargs) -> None:
         super().__init__()
         self.config = vllm_config.model_config.hf_config
         self.cache_config = vllm_config.cache_config
@@ -103,8 +112,9 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         self.hidden_size = self.config.hidden_size
         self.ssm_state_size = self.config.mamba_d_state
         self.conv_kernel_size = self.config.mamba_d_conv
-        self.intermediate_size = (self.config.mamba_num_heads *
-                                  self.config.hidden_size_per_head)
+        self.intermediate_size = (
+            self.config.mamba_num_heads * self.config.hidden_size_per_head
+        )
         self.tp_size = get_tensor_model_parallel_world_size()
         self.head_dim = self.config.hidden_size_per_head
         self.num_heads = self.config.mamba_num_heads
@@ -155,17 +165,17 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
             torch.empty(
                 divide(self.num_heads, self.tp_size),
                 dtype=torch.float32,
-            ))
+            )
+        )
         self.D = nn.Parameter(torch.ones(divide(self.num_heads, self.tp_size)))
-        self.dt_bias = nn.Parameter(
-            torch.ones(divide(self.num_heads, self.tp_size)))
+        self.dt_bias = nn.Parameter(torch.ones(divide(self.num_heads, self.tp_size)))
 
         set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)})
         a_weight_loader = composed_weight_loader(
-            sharded_weight_loader(0), lambda x: -torch.exp(x.float()))
+            sharded_weight_loader(0), lambda x: -torch.exp(x.float())
+        )
         set_weight_attrs(self.A, {"weight_loader": a_weight_loader})
-        set_weight_attrs(self.dt_bias,
-                         {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
 
         self.out_proj = RowParallelLinear(
             self.intermediate_size,
@@ -179,12 +189,9 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         # The activation function is fixed to SiLU.
         self.activation = "silu"
 
-        self.dt_norm = RMSNorm(self.time_step_rank,
-                               eps=self.config.rms_norm_eps)
-        self.B_norm = RMSNorm(self.ssm_state_size,
-                              eps=self.config.rms_norm_eps)
-        self.C_norm = RMSNorm(self.ssm_state_size,
-                              eps=self.config.rms_norm_eps)
+        self.dt_norm = RMSNorm(self.time_step_rank, eps=self.config.rms_norm_eps)
+        self.B_norm = RMSNorm(self.ssm_state_size, eps=self.config.rms_norm_eps)
+        self.C_norm = RMSNorm(self.ssm_state_size, eps=self.config.rms_norm_eps)
 
         self.chunk_size = self.config.mamba_chunk_size
 
@@ -239,7 +246,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         output: torch.Tensor,
         **kwargs,
     ):
-
         forward_context = get_forward_context()
         # attn_metadata contains metadata necessary for the mamba2 triton
         # kernels to operate in continuous batching and in chunked prefill
@@ -260,22 +266,24 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
             prep_initial_states = attn_metadata.prep_initial_states
             chunk_size = attn_metadata.chunk_size
             seq_idx_p = attn_metadata.seq_idx_p
-            chunk_indices_p = attn_metadata.chunk_indices_p
-            chunk_offsets_p = attn_metadata.chunk_offsets_p
             query_start_loc_p = attn_metadata.query_start_loc_p
+            cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p
+            last_chunk_indices_p = attn_metadata.last_chunk_indices_p
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)
         gate, hidden_states = projected_states.chunk(2, dim=-1)
 
         # 2. Convolution sequence transformation
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
 
         if attn_metadata is None:
             # profile run
-            hidden_states = (hidden_states.transpose(0, 1).clone().transpose(
-                0, 1)).contiguous()
+            hidden_states = (
+                hidden_states.transpose(0, 1).clone().transpose(0, 1)
+            ).contiguous()
             output[:] = self.out_proj(hidden_states)
             return
 
@@ -294,9 +302,9 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
             [num_decodes, num_prefill_tokens],
             dim=0,
         )
-        gate_d, gate_p = torch.split(gate[:num_actual_tokens],
-                                     [num_decodes, num_prefill_tokens],
-                                     dim=0)
+        gate_d, gate_p = torch.split(
+            gate[:num_actual_tokens], [num_decodes, num_prefill_tokens], dim=0
+        )
         # Split along batch dimension
         state_indices_tensor_d, state_indices_tensor_p = torch.split(
             state_indices_tensor,
@@ -309,7 +317,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         preallocated_ssm_out = torch.empty(
             [
                 num_prefill_tokens + num_decodes,
-                (self.num_heads // self.tp_size) * self.head_dim
+                (self.num_heads // self.tp_size) * self.head_dim,
             ],
             dtype=hidden_states.dtype,
             device=hidden_states.device,
@@ -325,8 +333,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
             # 2. Convolution sequence transformation
             # - "cache_indices" updates the conv_state cache in positions
             #   pointed to by "state_indices_tensor"
-            x = hidden_states_p.transpose(
-                0, 1)  # this is the form that causal-conv see
+            x = hidden_states_p.transpose(0, 1)  # this is the form that causal-conv see
             hidden_states_p = causal_conv1d_fn(
                 x,
                 conv_weights,
@@ -336,7 +343,8 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
                 has_initial_state=has_initial_states_p,
                 cache_indices=state_indices_tensor_p,
                 metadata=attn_metadata,
-                query_start_loc=query_start_loc_p)
+                query_start_loc=query_start_loc_p,
+            )
             hidden_states_p = hidden_states_p.transpose(0, 1)
             hidden_states_p = hidden_states_p[:num_prefill_tokens]
             # In some instances, the following `bcdt_proj` op
@@ -352,30 +360,32 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
                 # making a copy of the states
                 initial_states = torch.where(
                     has_initial_states_p[:, None, None, None],
-                    ssm_state[state_indices_tensor_p], 0)
+                    ssm_state[state_indices_tensor_p],
+                    0,
+                )
 
             varlen_state = mamba_chunk_scan_combined_varlen(
-                hidden_states_p.view(num_prefill_tokens,
-                                     self.num_heads // self.tp_size,
-                                     self.head_dim),
+                hidden_states_p.view(
+                    num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim
+                ),
                 dt,
                 self.A,
                 B.view(num_prefill_tokens, 1, -1),
                 C.view(num_prefill_tokens, 1, -1),
                 chunk_size=chunk_size,
                 D=self.D,
-                z=gate_p.view(num_prefill_tokens,
-                              self.num_heads // self.tp_size, self.head_dim),
+                z=gate_p.view(
+                    num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim
+                ),
                 dt_bias=self.dt_bias,
                 seq_idx=seq_idx_p,
-                chunk_indices=chunk_indices_p,
-                chunk_offsets=chunk_offsets_p,
                 cu_seqlens=query_start_loc_p,
+                cu_chunk_seqlens=cu_chunk_seqlen_p,
+                last_chunk_indices=last_chunk_indices_p,
                 initial_states=initial_states,
                 dt_softplus=True,
                 dt_limit=(0.0, float("inf")),
-                out=preallocated_ssm_out_p.view(num_prefill_tokens, -1,
-                                                self.head_dim),
+                out=preallocated_ssm_out_p.view(num_prefill_tokens, -1, self.head_dim),
                 state_dtype=ssm_state.dtype,
             )
 
@@ -392,21 +402,23 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
                 conv_weights,
                 self.conv1d.bias,
                 self.activation,
-                conv_state_indices=state_indices_tensor_d)
+                conv_state_indices=state_indices_tensor_d,
+            )
 
             B, C, dt = self._project_ssm_parameters(hidden_states_d)
 
             # 3. State Space Model sequence transformation
-            A = self.A[:, None, ...][:, :,
-                                     None].expand(-1, self.head_dim,
-                                                  self.config.mamba_d_state)
+            A = self.A[:, None, ...][:, :, None].expand(
+                -1, self.head_dim, self.config.mamba_d_state
+            )
             dt = dt[:, :, None].expand(-1, -1, self.head_dim)
             dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
             D = self.D[:, None, ...].expand(-1, self.head_dim)
             B = B.unsqueeze(1)
             C = C.unsqueeze(1)
             hidden_states_d = hidden_states_d.view(
-                -1, self.num_heads // self.tp_size, self.head_dim)
+                -1, self.num_heads // self.tp_size, self.head_dim
+            )
 
             # - the hidden is reshaped into (bs, num_heads, head_dim)
             # - ssm_state's slots will be selected
@@ -425,8 +437,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
                 dt_bias=dt_bias,
                 dt_softplus=True,
                 state_batch_indices=state_indices_tensor_d,
-                out=preallocated_ssm_out_d.view(num_decodes, -1,
-                                                self.head_dim),
+                out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim),
             )
 
         # 4. Final linear projection
@@ -457,8 +468,8 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         return "mamba2"
 
     def get_attn_backend(self) -> type["AttentionBackend"]:
-        from vllm.v1.attention.backends.mamba2_attn import (
-            Mamba2AttentionBackend)
+        from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend
+
         return Mamba2AttentionBackend
 
 
@@ -489,11 +500,10 @@ direct_register_custom_op(
 
 
 class DenseMLP(nn.Module):
-
     def __init__(
         self,
         config: Plamo2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -508,12 +518,14 @@ class DenseMLP(nn.Module):
             return_bias=False,
         )
         self.act = SiluAndMul()
-        self.down_proj = RowParallelLinear(self.intermediate_size,
-                                           self.hidden_size,
-                                           bias=False,
-                                           prefix=f"{prefix}.down_proj",
-                                           quant_config=quant_config,
-                                           return_bias=False)
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            prefix=f"{prefix}.down_proj",
+            quant_config=quant_config,
+            return_bias=False,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         h = self.gate_up_proj(hidden_states)
@@ -522,12 +534,7 @@ class DenseMLP(nn.Module):
 
 
 class Plamo2AttentionMixer(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 **kwargs) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
@@ -559,21 +566,25 @@ class Plamo2AttentionMixer(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        config.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config)
 
-        self.rope_theta = config.rope_theta if hasattr(config,
-                                                       "rope_theta") else 10000
-        self.rope_scaling = config.rope_scaling if hasattr(
-            config, "rope_scaling") else None
+        self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000
+        self.rope_scaling = (
+            config.rope_scaling if hasattr(config, "rope_scaling") else None
+        )
         max_position = config.max_position_embeddings
         if hasattr(vllm_config.model_config, "max_model_len") and isinstance(
-                vllm_config.model_config.max_model_len, int):
-            max_position = min(max_position,
-                               vllm_config.model_config.max_model_len)
+            vllm_config.model_config.max_model_len, int
+        ):
+            max_position = min(max_position, vllm_config.model_config.max_model_len)
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -582,22 +593,24 @@ class Plamo2AttentionMixer(nn.Module):
             base=self.rope_theta,
             rope_scaling=self.rope_scaling,
         )
-        self.q_norm = RMSNorm(config.hidden_size_per_head,
-                              eps=config.rms_norm_eps)
+        self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps)
         self.q_norm.weight = torch.nn.Parameter(
-            torch.ones((self.num_heads, config.hidden_size_per_head)))
-        set_weight_attrs(self.q_norm.weight,
-                         {"weight_loader": sharded_weight_loader(0)})
-        self.k_norm = RMSNorm(config.hidden_size_per_head,
-                              eps=config.rms_norm_eps)
+            torch.ones((self.num_heads, config.hidden_size_per_head))
+        )
+        set_weight_attrs(
+            self.q_norm.weight, {"weight_loader": sharded_weight_loader(0)}
+        )
+        self.k_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps)
         self.k_norm.weight = torch.nn.Parameter(
-            torch.ones((self.num_kv_heads, config.hidden_size_per_head)))
+            torch.ones((self.num_kv_heads, config.hidden_size_per_head))
+        )
         # Tensor-parallelism shards the K norm weights to the tp ranks
         # in a head-wise manner. This approach does not work if there is only
         # a single KV head, as is the case for PLaMo 2-1B.
         if self.total_num_kv_heads != 1:
-            set_weight_attrs(self.k_norm.weight,
-                             {"weight_loader": sharded_weight_loader(0)})
+            set_weight_attrs(
+                self.k_norm.weight, {"weight_loader": sharded_weight_loader(0)}
+            )
 
         self.attn = Attention(
             self.num_heads,
@@ -631,49 +644,43 @@ class Plamo2AttentionMixer(nn.Module):
 
 
 class Plamo2DecoderLayer(nn.Module):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 layer_idx: int,
-                 prefix: str = "",
-                 **kwargs) -> None:
+    def __init__(
+        self, vllm_config: VllmConfig, layer_idx: int, prefix: str = "", **kwargs
+    ) -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
         self.is_mamba = is_mamba(config, layer_idx)
         if self.is_mamba:
-            self.mixer = Plamo2MambaMixer(vllm_config=vllm_config,
-                                          prefix=f"{prefix}.mixer")
+            self.mixer = Plamo2MambaMixer(
+                vllm_config=vllm_config, prefix=f"{prefix}.mixer"
+            )
         else:
-            self.mixer = Plamo2AttentionMixer(vllm_config=vllm_config,
-                                              prefix=f"{prefix}.mixer")
-
-        self.mlp = DenseMLP(config=config,
-                            quant_config=quant_config,
-                            prefix=f"{prefix}.mlp")
-        self.pre_mixer_norm = RMSNorm(config.hidden_size,
-                                      eps=config.rms_norm_eps)
-        self.post_mixer_norm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.pre_mlp_norm = RMSNorm(config.hidden_size,
-                                    eps=config.rms_norm_eps)
-        self.post_mlp_norm = RMSNorm(config.hidden_size,
-                                     eps=config.rms_norm_eps)
+            self.mixer = Plamo2AttentionMixer(
+                vllm_config=vllm_config, prefix=f"{prefix}.mixer"
+            )
+
+        self.mlp = DenseMLP(
+            config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+        self.pre_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
             residual = hidden_states
             hidden_states = self.pre_mixer_norm(hidden_states)
         else:
-            hidden_states, residual = self.pre_mixer_norm(
-                hidden_states, residual)
+            hidden_states, residual = self.pre_mixer_norm(hidden_states, residual)
 
         if self.is_mamba:
             # Plamo2MambaMixer writes output to this tensor
@@ -700,7 +707,6 @@ class Plamo2DecoderLayer(nn.Module):
 
 
 class Plamo2Decoder(torch.nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -708,19 +714,22 @@ class Plamo2Decoder(torch.nn.Module):
 
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
-            return Plamo2DecoderLayer(vllm_config=vllm_config,
-                                      layer_idx=layer_idx,
-                                      prefix=prefix,
-                                      **extra_kwargs)
+            return Plamo2DecoderLayer(
+                vllm_config=vllm_config,
+                layer_idx=layer_idx,
+                prefix=prefix,
+                **extra_kwargs,
+            )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(
@@ -733,7 +742,6 @@ class Plamo2Decoder(torch.nn.Module):
 
 @support_torch_compile
 class Plamo2Model(torch.nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -742,36 +750,33 @@ class Plamo2Model(torch.nn.Module):
         self.config = config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        self.org_vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
             prefix=f"{prefix}.embed_tokens",
         )
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
-        self.layers = Plamo2Decoder(vllm_config=vllm_config,
-                                    prefix=f"{prefix}.layers")
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+        self.layers = Plamo2Decoder(vllm_config=vllm_config, prefix=f"{prefix}.layers")
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -784,10 +789,9 @@ class Plamo2Model(torch.nn.Module):
             residual=residual,
         )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -816,38 +820,39 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
         # the case for PLaMo2, as indicated by the FIXME comment.
         self.config.head_dim = self.config.hidden_size_per_head
 
-        self.model = Plamo2Model(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.model = Plamo2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.vocab_size = self.config.vocab_size
-        self.unpadded_vocab_size = self.config.vocab_size
-        num_embeddings = ((self.vocab_size + 15) // 16) * 16
         self.lm_head = ParallelLMHead(
-            num_embeddings,
+            self.vocab_size,
             self.config.hidden_size,
-            org_num_embeddings=self.config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
             prefix=f"{prefix}.lm_head",
         )
         if self.config.tie_word_embeddings:
             self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
 
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                self.config.vocab_size)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size, self.config.vocab_size
+        )
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs):
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     @classmethod
@@ -855,7 +860,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -877,8 +881,7 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
         """
         parallel_config = vllm_config.parallel_config
         hf_config = vllm_config.model_config.hf_config
-        intermediate_size =\
-                hf_config.mamba_num_heads * hf_config.hidden_size_per_head
+        intermediate_size = hf_config.mamba_num_heads * hf_config.hidden_size_per_head
 
         return MambaStateShapeCalculator.mamba2_state_shape(
             intermediate_size=intermediate_size,
@@ -893,14 +896,13 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
-
             # Both tie_word_embeddings=True and lm_head.weight in the safetensor
             # at the same time causes dict key access error.
             if name == "lm_head.weight" and self.config.tie_word_embeddings:
@@ -932,10 +934,12 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
             # Also, in addition to the quantized weights,
             # the zero points and scales have to be reshaped as well.
             # Packing should not be affected by this.
-            if ".mixer.in_proj.weight" in name \
-                or "mixer.in_proj.qweight" in name \
-                or "mixer.in_proj.scales" in name \
-                or "mixer.in_proj.qzeros" in name:
+            if (
+                ".mixer.in_proj.weight" in name
+                or "mixer.in_proj.qweight" in name
+                or "mixer.in_proj.scales" in name
+                or "mixer.in_proj.qzeros" in name
+            ):
                 if "mixer.in_proj.weight" in name:
                     loaded_weight = loaded_weight.transpose(0, 1)
                 # for weight:
@@ -945,14 +949,14 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
                 # for scales and qzeros:
                 # loaded_weight.shape[0] == self.config.hidden_size // self.vllm_config.quant_config.group_size  # noqa
                 loaded_weight = loaded_weight.reshape(
-                    loaded_weight.shape[0], self.config.mamba_num_heads, -1)
-                gate_weight, hidden_states_weight = loaded_weight.chunk(2,
-                                                                        dim=-1)
+                    loaded_weight.shape[0], self.config.mamba_num_heads, -1
+                )
+                gate_weight, hidden_states_weight = loaded_weight.chunk(2, dim=-1)
                 gate_weight = gate_weight.reshape(loaded_weight.shape[0], -1)
                 hidden_states_weight = hidden_states_weight.reshape(
-                    loaded_weight.shape[0], -1)
-                loaded_weight = torch.cat([gate_weight, hidden_states_weight],
-                                          dim=-1)
+                    loaded_weight.shape[0], -1
+                )
+                loaded_weight = torch.cat([gate_weight, hidden_states_weight], dim=-1)
                 if "mixer.in_proj.weight" in name:
                     loaded_weight = loaded_weight.transpose(0, 1)
 
@@ -973,6 +977,5 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
                 continue
 
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index e0c08a6a88271262a7120984b28849520ec10067..50a125c3f59737586c3061d014df4272b2da1eb2 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -6,10 +6,11 @@
 # Copyright (c) Alibaba Cloud.
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
+
 import json
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -21,21 +22,28 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class QWenMLP(nn.Module):
@@ -47,20 +55,19 @@ class QWenMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str = "silu",
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config)
-        self.c_proj = RowParallelLinear(intermediate_size,
-                                        hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config)
+            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size, hidden_size, bias=False, quant_config=quant_config
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -71,26 +78,23 @@ class QWenMLP(nn.Module):
 
 
 class QWenAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         max_position_embeddings: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        rope_scaling: dict[str, Any] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = hidden_size
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
-        )
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = num_heads
         assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
         self.head_dim = hidden_size // self.total_num_heads
         self.c_attn = QKVParallelLinear(
             hidden_size,
@@ -98,12 +102,14 @@ class QWenAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_attn",
         )
         self.c_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
         self.scaling = self.head_dim**-0.5
 
@@ -114,12 +120,14 @@ class QWenAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -135,12 +143,11 @@ class QWenAttention(nn.Module):
 
 
 class QWenBlock(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -148,26 +155,28 @@ class QWenBlock(nn.Module):
 
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        self.attn = QWenAttention(config.hidden_size,
-                                  config.num_attention_heads,
-                                  config.max_position_embeddings,
-                                  rope_theta=rope_theta,
-                                  rope_scaling=rope_scaling,
-                                  cache_config=cache_config,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.attn")
+        self.attn = QWenAttention(
+            config.hidden_size,
+            config.num_attention_heads,
+            config.max_position_embeddings,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
         self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
-        self.mlp = QWenMLP(config.hidden_size,
-                           config.intermediate_size // 2,
-                           quant_config=quant_config)
+        self.mlp = QWenMLP(
+            config.hidden_size, config.intermediate_size // 2, quant_config=quant_config
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -188,7 +197,6 @@ class QWenBlock(nn.Module):
 
 @support_torch_compile
 class QWenModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -205,29 +213,29 @@ class QWenModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
-            lambda prefix: QWenBlock(
-                config, cache_config, quant_config, prefix=prefix),
-            prefix=f"{prefix}.h")
+            lambda prefix: QWenBlock(config, cache_config, quant_config, prefix=prefix),
+            prefix=f"{prefix}.h",
+        )
         self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -241,16 +249,14 @@ class QWenModel(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.ln_f(hidden_states, residual)
         return hidden_states
 
 
 class QWenBaseModel(nn.Module):
-
     def __init__(
         self,
         *,
@@ -265,28 +271,30 @@ class QWenBaseModel(nn.Module):
         self.config = config
         self.multimodal_config = multimodal_config
         self.quant_config = quant_config
-        self.transformer = transformer_type(vllm_config=vllm_config,
-                                            prefix=maybe_prefix(
-                                                prefix, "transformer"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        self.transformer = transformer_type(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.transformer.wte.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.transformer.make_empty_intermediate_tensors)
+            self.transformer.make_empty_intermediate_tensors
+        )
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "w2", 0),
@@ -297,7 +305,7 @@ class QWenBaseModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -319,8 +327,7 @@ class QWenBaseModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -338,14 +345,13 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         if hasattr(config, "visual"):
-            hf_overrides = {
-                "architectures": ["QwenVLForConditionalGeneration"]
-            }
+            hf_overrides = {"architectures": ["QwenVLForConditionalGeneration"]}
             raise RuntimeError(
                 "The configuration of this model indicates that it supports "
                 "vision inputs, but you instantiated the text-only version "
                 "of this model. Please use the vision model by setting "
-                f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
+                f"`--hf-overrides '{json.dumps(hf_overrides)}'`"
+            )
 
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
@@ -353,9 +359,10 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index c536b0f60c30d341097034f9851bec1cf01b8da2..1bbb969ce5aa3c651b4af8790857d9338ad15e8b 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -24,9 +24,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -39,34 +40,44 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import is_interleaved
 
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class Qwen2MLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -85,8 +96,9 @@ class Qwen2MLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -97,7 +109,6 @@ class Qwen2MLP(nn.Module):
 
 
 class Qwen2Attention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -105,12 +116,12 @@ class Qwen2Attention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -160,8 +171,11 @@ class Qwen2Attention(nn.Module):
             rope_scaling=rope_scaling,
             dual_chunk_attention_config=dual_chunk_attention_config,
         )
-        attn_cls = (EncoderOnlyAttention
-                    if attn_type == AttentionType.ENCODER_ONLY else Attention)
+        attn_cls = (
+            EncoderOnlyAttention
+            if attn_type == AttentionType.ENCODER_ONLY
+            else Attention
+        )
         self.attn = attn_cls(
             self.num_heads,
             self.head_dim,
@@ -174,7 +188,10 @@ class Qwen2Attention(nn.Module):
             **{
                 "layer_idx": extract_layer_index(prefix),
                 "dual_chunk_attention_config": dual_chunk_attention_config,
-            } if dual_chunk_attention_config else {})
+            }
+            if dual_chunk_attention_config
+            else {},
+        )
 
     def forward(
         self,
@@ -190,12 +207,11 @@ class Qwen2Attention(nn.Module):
 
 
 class Qwen2DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Qwen2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -203,9 +219,9 @@ class Qwen2DecoderLayer(nn.Module):
         # Requires transformers > 4.32.0
         rope_theta = getattr(config, "rope_theta", 1000000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        dual_chunk_attention_config = getattr(config,
-                                              "dual_chunk_attention_config",
-                                              None)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
 
         # By default, Qwen2 uses causal attention as it is a decoder-only model.
         # You can override the HF config with `is_causal=False` to enable
@@ -236,32 +252,30 @@ class Qwen2DecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
@@ -274,14 +288,16 @@ class Qwen2DecoderLayer(nn.Module):
         "positions": -1,
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
-    })
+    }
+)
 class Qwen2Model(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config.get_text_config()
@@ -297,14 +313,16 @@ class Qwen2Model(nn.Module):
                 "to discuss this feature.".format(
                     config.max_window_layers,
                     config.num_hidden_layers,
-                ))
+                )
+            )
 
         self.config = config
         self.quant_config = quant_config
         self.vocab_size = config.vocab_size
 
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 config.vocab_size,
                 config.hidden_size,
@@ -318,16 +336,18 @@ class Qwen2Model(nn.Module):
         decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: decoder_layer_type(config=config,
-                                              cache_config=cache_config,
-                                              quant_config=quant_config,
-                                              prefix=prefix),
+            lambda prefix: decoder_layer_type(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
@@ -335,21 +355,21 @@ class Qwen2Model(nn.Module):
 
         self.aux_hidden_state_layers = tuple[int, ...]()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -358,16 +378,16 @@ class Qwen2Model(nn.Module):
 
         aux_hidden_states = []
         for idx, layer in enumerate(
-                islice(self.layers, self.start_layer, self.end_layer)):
+            islice(self.layers, self.start_layer, self.end_layer)
+        ):
             if idx in self.aux_hidden_state_layers:
                 aux_hidden_states.append(hidden_states + residual)
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
 
@@ -376,8 +396,7 @@ class Qwen2Model(nn.Module):
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -391,18 +410,19 @@ class Qwen2Model(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -417,8 +437,7 @@ class Qwen2Model(nn.Module):
                     if name is None:
                         continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if weight_loader == default_weight_loader:
                     weight_loader(param, loaded_weight)
                 else:
@@ -435,8 +454,7 @@ class Qwen2Model(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -459,34 +477,35 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = Qwen2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
             else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "lm_head"))
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "lm_head"),
+                )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         self.model.aux_hidden_state_layers = layers
@@ -499,25 +518,24 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 5f27230c913b4bfa503426e787a3b7fc78bc1298..262ea771d9cdffaf1c1dfcf3af3a85885e7b0ae7 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -22,55 +22,91 @@
 # limitations under the License.
 """Inference-only Qwen2.5-Omni model (thinker part)."""
 
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from copy import copy
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
-    Qwen2_5OmniConfig, Qwen2_5OmniThinkerConfig)
+    Qwen2_5OmniConfig,
+    Qwen2_5OmniThinkerConfig,
+)
 from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
-    Qwen2_5OmniAudioEncoder)
+    Qwen2_5OmniAudioEncoder,
+)
 from transformers.models.qwen2_5_omni.processing_qwen2_5_omni import (
-    Qwen2_5OmniProcessor)
+    Qwen2_5OmniProcessor,
+)
 from transformers.models.whisper import WhisperFeatureExtractor
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2_5_vl import (
-    Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs,
-    Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs,
-    Qwen2_5_VLProcessingInfo, Qwen2_5_VLVideoEmbeddingInputs,
-    Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs)
+    Qwen2_5_VisionTransformer,
+    Qwen2_5_VLImageEmbeddingInputs,
+    Qwen2_5_VLImageInputs,
+    Qwen2_5_VLImagePixelInputs,
+    Qwen2_5_VLProcessingInfo,
+    Qwen2_5_VLVideoEmbeddingInputs,
+    Qwen2_5_VLVideoInputs,
+    Qwen2_5_VLVideoPixelInputs,
+)
 from vllm.model_executor.models.qwen2_audio import (
-    Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths)
+    Qwen2AudioProcessingInfo,
+    _get_feat_extract_output_lengths,
+)
 from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems,
-                                   ModalityDataItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalPromptUpdates,
-                                        PlaceholderFeaturesInfo,
-                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.inputs import (
+    ImageItem,
+    ModalityData,
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    DictEmbeddingItems,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    MultiModalPromptUpdates,
+    PlaceholderFeaturesInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
+from vllm.transformers_utils.tokenizer import encode_tokens
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+    split_list_into_ranges,
+)
+from .vision import get_llm_pos_ids_for_vision
 
 try:
     import flash_attn
@@ -88,90 +124,95 @@ class Qwen2_5OmniAudioFeatureInputs(TensorSchema):
         - msl: Maximum sequence length
         - tsl: Total sequence length
     """
+
     type: Literal["audio_features"]
     input_features: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("nmb", "tsl"),
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("nmb", "tsl", dynamic_dims={"tsl"}),
     ]
 
+    audio_feature_lengths: Annotated[torch.Tensor, TensorShape("na")]
+
     feature_attention_mask: Annotated[
-        torch.Tensor,
-        TensorShape("na", "msl"),
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("na", "msl", dynamic_dims={"msl"}),
     ]
 
 
 def create_qwen2_5_omni_thinker_field_factory(
-    spatial_merge_size: int
-) -> Callable[[Mapping[str, torch.Tensor]], Mapping[str,
-                                                    MultiModalFieldConfig]]:
-
-    def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str,
-                                                              torch.Tensor]):
-        audio_feature_lengths = hf_inputs.get("audio_feature_lengths",
-                                              torch.empty((0, )))
+    spatial_merge_size: int,
+) -> Callable[[Mapping[str, torch.Tensor]], Mapping[str, MultiModalFieldConfig]]:
+    def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]):
+        audio_feature_lengths = hf_inputs.get(
+            "audio_feature_lengths", torch.empty((0,))
+        )
 
         image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
         image_pixel_grid_sizes = image_grid_thw.prod(-1)
-        image_embed_grid_sizes = (image_pixel_grid_sizes //
-                                  spatial_merge_size // spatial_merge_size)
+        image_embed_grid_sizes = (
+            image_pixel_grid_sizes // spatial_merge_size // spatial_merge_size
+        )
 
         video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
         video_grid_sizes = video_grid_thw.prod(-1)
-        video_embed_grid_sizes = (video_grid_sizes // spatial_merge_size //
-                                  spatial_merge_size)
+        video_embed_grid_sizes = (
+            video_grid_sizes // spatial_merge_size // spatial_merge_size
+        )
 
         num_videos = len(video_grid_sizes)
 
         return dict(
             input_audio_features=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_feature_lengths, dim=1),
+                "audio", audio_feature_lengths, dim=1
+            ),
             feature_attention_mask=MultiModalFieldConfig.batched("audio"),
             audio_feature_lengths=MultiModalFieldConfig.batched("audio"),
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_pixel_grid_sizes),
+                "image", image_pixel_grid_sizes
+            ),
             image_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_embed_grid_sizes),
+                "image", image_embed_grid_sizes
+            ),
             image_grid_thw=MultiModalFieldConfig.batched("image"),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes),
+                "video", video_grid_sizes
+            ),
             video_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_embed_grid_sizes),
+                "video", video_embed_grid_sizes
+            ),
             video_grid_thw=MultiModalFieldConfig.batched("video"),
             second_per_grid_ts=MultiModalFieldConfig.batched("video"),
-            use_audio_in_video=MultiModalFieldConfig.shared(
-                "video", num_videos),
+            use_audio_in_video=MultiModalFieldConfig.shared("video", num_videos),
         )
 
     return _qwen2_5_omni_thinker_field_config
 
 
 class Qwen2_5OmniThinkerMultiModalDataParser(Qwen2VLMultiModalDataParser):
-
     def __init__(self, spatial_merge_size: int, *args, **kwargs):
         self._spatial_merge_size = spatial_merge_size
         super().__init__(self._spatial_merge_size, *args, **kwargs)
 
     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
                 modality="audio",
-                required_fields={
-                    "input_audio_features", "audio_feature_lengths"
-                },
+                required_fields={"input_audio_features", "audio_feature_lengths"},
                 fields_factory=create_qwen2_5_omni_thinker_field_factory(
-                    self._spatial_merge_size),
+                    self._spatial_merge_size
+                ),
             )
 
         return super()._parse_audio_data(data)
 
 
-class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo,
-                                       Qwen2_5_VLProcessingInfo):
-
+class Qwen2_5OmniThinkerProcessingInfo(
+    Qwen2AudioProcessingInfo, Qwen2_5_VLProcessingInfo
+):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2_5OmniConfig).thinker_config
 
@@ -188,13 +229,13 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo,
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
 
 class Qwen2_5OmniThinkerDummyInputsBuilder(
-        BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]):
-
+    BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]
+):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -206,13 +247,17 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
         image_token: str = hf_processor.image_token
         video_token: str = hf_processor.video_token
 
-        return (audio_token * num_audios + image_token * num_images +
-                video_token * num_videos)
+        return (
+            audio_token * num_audios
+            + image_token * num_images
+            + video_token * num_videos
+        )
 
     def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -220,42 +265,55 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
 
         feature_extractor = self.info.get_feature_extractor()
 
-        target_audio_length = min(
-            feature_extractor.chunk_length,
-            30,
-        ) * feature_extractor.sampling_rate
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-            self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        target_audio_length = (
+            min(
+                feature_extractor.chunk_length,
+                30,
+            )
+            * feature_extractor.sampling_rate
+        )
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
+        audio_overrides = mm_options.get("audio") if mm_options else None
 
         mm_data = {
-            "audio":
-            self._get_dummy_audios(length=target_audio_length,
-                                   num_audios=num_audios),
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(width=target_width,
-                                   height=target_height,
-                                   num_frames=target_num_frames,
-                                   num_videos=num_videos),
+            "audio": self._get_dummy_audios(
+                length=target_audio_length,
+                num_audios=num_audios,
+                overrides=audio_overrides,
+            ),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=target_num_frames,
+                num_videos=num_videos,
+                overrides=video_overrides,
+            ),
         }
 
         return mm_data
 
 
 class Qwen2_5OmniThinkerMultiModalProcessor(
-        BaseMultiModalProcessor[Qwen2_5OmniThinkerProcessingInfo]):
-
+    BaseMultiModalProcessor[Qwen2_5OmniThinkerProcessingInfo]
+):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
         return Qwen2_5OmniThinkerMultiModalDataParser(
-            spatial_merge_size=self.info.get_hf_config(
-            ).vision_config.spatial_merge_size,
-            target_sr=feature_extractor.sampling_rate)
+            spatial_merge_size=self.info.get_hf_config().vision_config.spatial_merge_size,
+            target_sr=feature_extractor.sampling_rate,
+        )
 
     def _call_hf_processor(
         self,
@@ -271,7 +329,9 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         if audios:
             # NOTE: Qwen2.5-Omni processor accept "audio"
             mm_data["audio"] = audios
-            mm_kwargs = dict(**mm_kwargs, )
+            mm_kwargs = dict(
+                **mm_kwargs,
+            )
 
         hf_inputs = super()._call_hf_processor(
             prompt=prompt,
@@ -280,17 +340,19 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             tok_kwargs=tok_kwargs,
         )
 
-        input_features = hf_inputs.pop('input_features', None)
-        feature_attention_mask = hf_inputs.get('feature_attention_mask', None)
-        if ('input_audio_features' not in hf_inputs
-                and input_features is not None):
+        input_features = hf_inputs.pop("input_features", None)
+        feature_attention_mask = hf_inputs.get("feature_attention_mask", None)
+        if "input_audio_features" not in hf_inputs and input_features is not None:
             if feature_attention_mask is not None:
-                input_features = input_features.permute(
-                    0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
-            hf_inputs['input_audio_features'] = input_features
-        if ('audio_feature_lengths' not in hf_inputs
-                and feature_attention_mask is not None):
-            hf_inputs['audio_feature_lengths'] = feature_attention_mask.sum(-1)
+                input_features = input_features.permute(0, 2, 1)[
+                    feature_attention_mask.bool()
+                ].permute(1, 0)
+            hf_inputs["input_audio_features"] = input_features
+        if (
+            "audio_feature_lengths" not in hf_inputs
+            and feature_attention_mask is not None
+        ):
+            hf_inputs["audio_feature_lengths"] = feature_attention_mask.sum(-1)
 
         video_second_per_grid = hf_inputs.get("video_second_per_grid", None)
         if video_second_per_grid is not None:
@@ -307,8 +369,8 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         return create_qwen2_5_omni_thinker_field_factory(
-            self.info.get_hf_config().vision_config.spatial_merge_size)(
-                hf_inputs)
+            self.info.get_hf_config().vision_config.spatial_merge_size
+        )(hf_inputs)
 
     def _maybe_apply_prompt_updates(
         self,
@@ -317,16 +379,22 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         mm_kwargs: MultiModalKwargsItems,
         mm_prompt_updates: MultiModalPromptUpdates,
         is_update_applied: bool,
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
+    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
         """
         Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`.
         """
         mm_item_counts = mm_items.get_all_counts()
         self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
-
-        use_audio_in_video = (all(
-            item["use_audio_in_video"].data
-            for item in mm_kwargs["video"]) if "video" in mm_kwargs else False)
+        self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
+
+        use_audio_in_video = False
+        if "video" in mm_kwargs:
+            video_items = [item for item in mm_kwargs["video"] if item is not None]
+            # only check video items (if there are any)
+            if video_items:
+                use_audio_in_video = all(
+                    item["use_audio_in_video"].data for item in video_items
+                )
 
         if is_update_applied:
             mm_placeholders = self._find_mm_placeholders(
@@ -336,28 +404,73 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             self._validate_mm_placeholders(
                 mm_placeholders,
                 mm_item_counts,
-                use_audio_in_video=use_audio_in_video)
-
-            tokenizer = self.info.get_tokenizer()
-            prompt = decode_tokens(tokenizer, prompt_ids)
+                use_audio_in_video=use_audio_in_video,
+            )
         else:
-            (
-                prompt_ids,
-                prompt,
-                mm_placeholders,
-            ) = self._apply_prompt_updates(
+            prompt_ids, mm_placeholders = self._apply_prompt_updates(
                 prompt_ids,
                 mm_prompt_updates,
             )
             self._validate_mm_placeholders(
                 mm_placeholders,
                 mm_item_counts,
-                use_audio_in_video=use_audio_in_video)
+                use_audio_in_video=use_audio_in_video,
+            )
 
-        tokenizer = self.info.get_tokenizer()
-        prompt = decode_tokens(tokenizer, prompt_ids)
+        return prompt_ids, mm_placeholders
+
+    @classmethod
+    def omni_get_updates_use_audio_in_video(
+        cls,
+        thinker_config: PretrainedConfig,
+        audio_len: int,
+        video_grid_thw: list[int] | torch.Tensor,
+        video_second_per_grid_t: float,
+    ) -> list[int]:
+        """Get video prompt updates when `use_audio_in_video` is True.
+
+        In this case, audio and vision update ids will be split into
+        chunks and interleaved (details in `_omni_get_input_positions_tensor`).
+
+        <|video_bos|><|VIDEO|><|video_eos|> =>
+        <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|>
+        """
+
+        audio_token_id = thinker_config.audio_token_index
+        video_token_id = thinker_config.video_token_index
+        audio_start_token_id = thinker_config.audio_start_token_id
+        audio_end_token_id = thinker_config.audio_end_token_id
+        seconds_per_chunk = thinker_config.seconds_per_chunk
+        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(
+            thinker_config.vision_config, "tokens_per_second", 25
+        )
 
-        return prompt_ids, prompt, mm_placeholders
+        grid_t = video_grid_thw[0]
+        grid_h = video_grid_thw[1]
+        grid_w = video_grid_thw[2]
+        t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
+        t_index = (
+            torch.arange(grid_t) * video_second_per_grid_t * tokens_per_second
+        ).long()
+        t_index_split_chunk = split_list_into_ranges(t_index, t_ntoken_per_chunk)
+
+        updates = [audio_start_token_id]
+        added_audio_len = 0
+        for t_chunk in t_index_split_chunk:
+            vision_ntoken_per_chunk = (
+                len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
+            )
+            updates.extend([video_token_id] * vision_ntoken_per_chunk)
+
+            audio_chunk_size = min(t_ntoken_per_chunk, audio_len - added_audio_len)
+            updates.extend(audio_chunk_size * [audio_token_id])
+            added_audio_len += audio_chunk_size
+        if added_audio_len < audio_len:
+            updates.extend((audio_len - added_audio_len) * [audio_token_id])
+        updates.extend([audio_end_token_id])
+
+        return updates
 
     def _get_prompt_updates(
         self,
@@ -367,8 +480,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         vocab = tokenizer.get_vocab()
 
         audio_token = processor.audio_token
@@ -385,12 +497,14 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             audio_output_lengths = []
         elif audio_feature_lengths is not None:
             _, audio_output_lens = _get_feat_extract_output_lengths(
-                audio_feature_lengths)
+                audio_feature_lengths
+            )
             audio_output_lengths = audio_output_lens.tolist()
         elif feature_attention_mask is not None:
             assert isinstance(feature_attention_mask, torch.Tensor)
             _, audio_output_lens = _get_feat_extract_output_lengths(
-                feature_attention_mask.sum(-1))
+                feature_attention_mask.sum(-1)
+            )
             audio_output_lengths = audio_output_lens.tolist()
 
         # number of audios read from video.
@@ -405,7 +519,8 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
                 audio = audios.get(item_idx)
                 raise ValueError(
                     f"The audio {audio} (len={len(audio)}) is too short "
-                    "to be represented inside the model")
+                    "to be represented inside the model"
+                )
 
             return [audio_token_id] * num_features
 
@@ -417,27 +532,26 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             token_id = image_token_id if modality == "image" else video_token_id
             return [token_id] * (int(grid_thw.prod()) // merge_length)
 
-        use_audio_in_video = hf_processor_mm_kwargs.get(
-            "use_audio_in_video", False)
+        use_audio_in_video = hf_processor_mm_kwargs.get("use_audio_in_video", False)
         thinker_config = self.info.get_hf_config()
 
         def get_replacement_qwen2_use_audio_in_video(item_idx: int):
             nonlocal audio_in_video_item_idx
 
-            audio_num_features = audio_output_lengths[audio_in_video_item_idx +
-                                                      item_idx]
+            audio_num_features = audio_output_lengths[
+                audio_in_video_item_idx + item_idx
+            ]
             video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
 
             audio_in_video_item_idx += 1
 
-            second_per_grid_ts = hf_processor_mm_kwargs.get(
-                "second_per_grid_ts", None)
+            second_per_grid_ts = hf_processor_mm_kwargs.get("second_per_grid_ts", None)
             if second_per_grid_ts:
                 video_second_per_grid_t = second_per_grid_ts[item_idx]
             else:
                 video_second_per_grid_t = 1.0
 
-            return MRotaryEmbedding.omni_get_updates_use_audio_in_video(
+            return self.omni_get_updates_use_audio_in_video(
                 thinker_config=thinker_config,
                 audio_len=audio_num_features,
                 video_grid_thw=video_grid_thw,
@@ -445,8 +559,10 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             )
 
         video_replacement_fn = (
-            get_replacement_qwen2_use_audio_in_video if use_audio_in_video else
-            partial(get_replacement_qwen2_vision, modality="video"))
+            get_replacement_qwen2_use_audio_in_video
+            if use_audio_in_video
+            else partial(get_replacement_qwen2_vision, modality="video")
+        )
 
         return [
             PromptReplacement(
@@ -457,8 +573,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             PromptReplacement(
                 modality="image",
                 target=image_token,
-                replacement=partial(get_replacement_qwen2_vision,
-                                    modality="image"),
+                replacement=partial(get_replacement_qwen2_vision, modality="image"),
             ),
             PromptReplacement(
                 modality="video",
@@ -469,7 +584,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
 
     def _apply_hf_processor_main(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
@@ -511,8 +626,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         """
         mm_counts = mm_items.get_all_counts()
 
-        use_audio_in_video = hf_processor_mm_kwargs.get(
-            "use_audio_in_video", False)
+        use_audio_in_video = hf_processor_mm_kwargs.get("use_audio_in_video", False)
         if use_audio_in_video and "video" in mm_counts:
             assert "audio" in mm_counts
             mm_counts["audio"] -= mm_counts["video"]
@@ -541,46 +655,26 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
 
 
 class Qwen2_5OmniConditionalGenerationMixin:
-
-    def _validate_and_reshape_mm_tensor(self,
-                                        mm_input: object,
-                                        name: str,
-                                        dim: int = 0) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if dim == 0:
-                return mm_input.reshape(-1, *mm_input.shape[2:])
-            return torch.concat(list(mm_input), dim=dim)
-        else:
-            return torch.concat(mm_input, dim=dim)
-
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[Qwen2_5OmniAudioFeatureInputs]:
-        input_audio_features = kwargs.pop('input_audio_features', None)
-        audio_feature_lengths = kwargs.pop('audio_feature_lengths', None)
-        feature_attention_mask = kwargs.pop('feature_attention_mask', None)
+        self, **kwargs: object
+    ) -> Qwen2_5OmniAudioFeatureInputs | None:
+        input_audio_features = kwargs.pop("input_audio_features", None)
+        audio_feature_lengths = kwargs.pop("audio_feature_lengths", None)
+        feature_attention_mask = kwargs.pop("feature_attention_mask", None)
         if input_audio_features is None:
             return None
-        input_audio_features = self._validate_and_reshape_mm_tensor(
-            input_audio_features, 'input_audio_features', dim=1)
-        if feature_attention_mask is not None:
-            feature_attention_mask = self._validate_and_reshape_mm_tensor(
-                feature_attention_mask, 'feature_attention_mask')
-        if not isinstance(input_audio_features, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of audio input features. "
-                             f"Got type: {type(input_audio_features)}")
+
         return Qwen2_5OmniAudioFeatureInputs(
             type="audio_features",
             input_features=input_audio_features,
             audio_feature_lengths=audio_feature_lengths,
-            feature_attention_mask=feature_attention_mask)
+            feature_attention_mask=feature_attention_mask,
+        )
 
     def _parse_and_validate_image_input(
         self,
         **kwargs: dict[str, Any],
-    ) -> Optional[Qwen2_5_VLImageInputs]:
+    ) -> Qwen2_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -589,37 +683,23 @@ class Qwen2_5OmniConditionalGenerationMixin:
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            return Qwen2_5_VLImagePixelInputs(type="pixel_values",
-                                              pixel_values=pixel_values,
-                                              image_grid_thw=image_grid_thw)
+            return Qwen2_5_VLImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
             return Qwen2_5_VLImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
-                image_grid_thw=image_grid_thw)
+                image_grid_thw=image_grid_thw,
+            )
 
     def _parse_and_validate_video_input(
         self,
         **kwargs: dict[str, Any],
-    ) -> Optional[Qwen2_5_VLVideoInputs]:
+    ) -> Qwen2_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -628,11 +708,6 @@ class Qwen2_5OmniConditionalGenerationMixin:
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos, "video pixel values")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Qwen2_5_VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -640,54 +715,40 @@ class Qwen2_5OmniConditionalGenerationMixin:
             )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             if not isinstance(video_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of video embeddings. "
-                                 f"Got type: {type(video_embeds)}")
+                raise ValueError(
+                    "Incorrect type of video embeddings. "
+                    f"Got type: {type(video_embeds)}"
+                )
             return Qwen2_5_VLVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,
-                video_grid_thw=video_grid_thw)
+                video_grid_thw=video_grid_thw,
+            )
 
     def _process_audio_input(
         self,
         audio_input: Qwen2_5OmniAudioFeatureInputs,
-        audio_hashes: list[str] = None,
-        cached_audio_features: torch.Tensor = None,
+        audio_hashes: list[str] | None = None,
+        cached_audio_features: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
-        if input_features.ndim == 3:
-            assert input_features.shape[0] == 1
-            input_features = input_features.squeeze(0)
-        if audio_feature_lengths.ndim == 2:
-            assert audio_feature_lengths.shape[
-                0] == 1 or audio_feature_lengths.shape[1] == 1
-            if audio_feature_lengths.shape[0] == 1:
-                audio_feature_lengths = audio_feature_lengths.squeeze(0)
-            else:
-                audio_feature_lengths = audio_feature_lengths.squeeze(1)
 
         audio_feat_lengths, audio_output_lengths = (
-            self.audio_tower._get_feat_extract_output_lengths(
-                audio_feature_lengths))
+            self.audio_tower._get_feat_extract_output_lengths(audio_feature_lengths)
+        )
 
         audio_outputs = self.audio_tower(
             input_features.to(self.audio_tower.dtype),
             feature_lens=audio_feature_lengths,
             aftercnn_lens=audio_feat_lengths,
         )
-        return audio_outputs.last_hidden_state.split(
-            audio_output_lengths.tolist())
+        return audio_outputs.last_hidden_state.split(audio_output_lengths.tolist())
 
     def _process_image_input(
-            self,
-            image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
+        self, image_input: Qwen2_5_VLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["image_embeds"].type(self.visual.dtype)
 
@@ -695,7 +756,8 @@ class Qwen2_5OmniConditionalGenerationMixin:
         assert grid_thw.ndim == 2
 
         pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-        image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+        with set_forward_context(None, self.vllm_config):
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
         sizes = grid_thw.prod(-1) // merge_size // merge_size
@@ -703,19 +765,20 @@ class Qwen2_5OmniConditionalGenerationMixin:
         return image_embeds.split(sizes.tolist())
 
     def _process_video_input(
-            self,
-            video_input: Qwen2_5_VLVideoInputs,
-            video_hashes: list[str] = None,
-            cached_video_embeds: torch.Tensor = None) -> torch.Tensor:
+        self,
+        video_input: Qwen2_5_VLVideoInputs,
+        video_hashes: list[str] = None,
+        cached_video_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
         if video_input["type"] == "video_embeds":
             return video_input["video_embeds"].type(self.visual.dtype)
 
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
 
-        pixel_values_videos = video_input["pixel_values_videos"].type(
-            self.visual.dtype)
-        video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+        pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype)
+        with set_forward_context(None, self.vllm_config):
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
         sizes = grid_thw.prod(-1) // merge_size // merge_size
@@ -729,14 +792,22 @@ class Qwen2_5OmniConditionalGenerationMixin:
     dummy_inputs=Qwen2_5OmniThinkerDummyInputsBuilder,
 )
 class Qwen2_5OmniThinkerForConditionalGeneration(
-        nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
-        Qwen2_5OmniConditionalGenerationMixin):
+    nn.Module,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsLoRA,
+    SupportsMRoPE,
+    Qwen2_5OmniConditionalGenerationMixin,
+):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
             "thinker.model.": "language_model.model.",
             "thinker.": "",
-        })
+        }
+    )
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -755,7 +826,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|IMAGE|><|vision_end|>"
         if modality.startswith("video"):
@@ -767,8 +838,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+        self.vllm_config = vllm_config
         thinker_config: Qwen2_5OmniThinkerConfig = (
-            vllm_config.model_config.hf_config.thinker_config)
+            vllm_config.model_config.hf_config.thinker_config
+        )
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = thinker_config
@@ -784,20 +857,20 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             logger.warning(
                 "flash_attn is not available, the model may not yield the "
                 "exactly same result as the transformers implementation "
-                "in the audio tower part.")
+                "in the audio tower part."
+            )
 
         if multimodal_config.get_limit_per_prompt("audio"):
-            self.audio_tower = Qwen2_5OmniAudioEncoder(
-                thinker_config.audio_config)
+            self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config)
         else:
             self.audio_tower = None
 
         if multimodal_config.get_limit_per_prompt(
-                "image") or multimodal_config.get_limit_per_prompt("video"):
+            "image"
+        ) or multimodal_config.get_limit_per_prompt("video"):
             self.visual = Qwen2_5_VisionTransformer(
                 vision_config=thinker_config.vision_config,
-                norm_eps=getattr(thinker_config.text_config, "rms_norm_eps",
-                                 1e-6),
+                norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "visual"),
             )
@@ -813,7 +886,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         mm_input_by_modality = {}
@@ -821,28 +895,245 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values", "image_embeds"
-                             ) and "image" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "image"] = self._parse_and_validate_image_input(**kwargs)
-            if input_key in ("pixel_values_videos", "video_embeds"
-                             ) and "video" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "video"] = self._parse_and_validate_video_input(**kwargs)
-            if input_key in ("input_audio_features"
-                             ) and "audio" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "audio"] = self._parse_and_validate_audio_input(**kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "video" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("input_audio_features")
+                and "audio" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(
+                    **kwargs
+                )
         return mm_input_by_modality
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        """
+        Example:
+
+            (V_i are vision position ids, A_i are audio position ids)
+
+            |V_1 ...    V_n|A_1 ...   A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
+            |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...
+        """
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
+
+        # TODO(fyabc): refactor and share more code with
+        #  _vl_get_input_positions_tensor.
+
+        thinker_config = self.config
+        audio_token_id = thinker_config.audio_token_index
+        image_token_id = thinker_config.image_token_index
+        video_token_id = thinker_config.video_token_index
+        audio_start_token_id = thinker_config.audio_start_token_id
+        audio_end_token_id = thinker_config.audio_end_token_id
+        vision_start_token_id = thinker_config.vision_start_token_id
+        vision_end_token_id = thinker_config.vision_end_token_id
+        seconds_per_chunk = thinker_config.seconds_per_chunk
+        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(
+            thinker_config.vision_config, "tokens_per_second", 25
+        )
+
+        src_item = input_tokens
+        audio_seqlens = audio_feature_lengths
+        if not second_per_grid_ts:
+            second_per_grid_ts = [1] * video_grid_thw.shape[0]
+        audio_idx = 0
+        video_idx = 0
+        image_idx = 0
+        new_src_item: list[int] = []
+        llm_pos_ids_list: list[torch.Tensor] = []
+
+        idx = 0
+        while idx < len(src_item):
+            new_src_item_len = len(new_src_item)
+            start_idx = (
+                llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            )
+            if src_item[idx] not in [audio_token_id, video_token_id, image_token_id]:
+                if use_audio_in_video and idx > 0:
+                    if (
+                        src_item[idx] == vision_end_token_id
+                        and src_item[idx - 1] == audio_end_token_id
+                    ):
+                        # processing the <|audio_eos|> before <|vision_eos|>
+                        start_idx -= 1
+                    elif (
+                        src_item[idx] == audio_start_token_id
+                        and src_item[idx - 1] == vision_start_token_id
+                    ):
+                        # processing the <|audio_bos|> after <|vision_eos|>
+                        start_idx -= 1
+                new_src_item.append(src_item[idx])
+                llm_pos_ids = torch.tensor([start_idx], dtype=torch.long).expand(3, -1)
+                llm_pos_ids_list.append(llm_pos_ids)
+            elif src_item[idx] == audio_token_id:
+                assert audio_seqlens is not None
+                audio_seqlen = audio_seqlens[audio_idx]
+                place_num = ((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1
+                new_src_item.extend([audio_token_id] * place_num)
+                llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx
+                llm_pos_ids_list.append(llm_pos_ids)
+                audio_idx += 1
+            elif src_item[idx] == image_token_id:
+                grid_t = image_grid_thw[image_idx][0]
+                grid_hs = image_grid_thw[:, 1]
+                grid_ws = image_grid_thw[:, 2]
+                t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long()
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    start_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                vision_seqlen = image_grid_thw[image_idx].prod() // (
+                    spatial_merge_size**2
+                )
+                new_src_item.extend([image_token_id] * vision_seqlen)
+                image_idx += 1
+            elif src_item[idx] == video_token_id and not use_audio_in_video:
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    torch.arange(grid_t)
+                    * second_per_grid_ts[video_idx]
+                    * tokens_per_second
+                ).long()
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    start_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                vision_seqlen = video_grid_thw[video_idx].prod() // (
+                    spatial_merge_size**2
+                )
+                new_src_item.extend([video_token_id] * vision_seqlen)
+                video_idx += 1
+            else:
+                # read audio from video
+                assert audio_seqlens is not None
+                audio_seqlen = audio_seqlens[audio_idx]
+                vision_seqlen = video_grid_thw[video_idx].prod() // (
+                    spatial_merge_size**2
+                )
+                grid_t = video_grid_thw[video_idx][0]
+                grid_h = video_grid_thw[video_idx][1]
+                grid_w = video_grid_thw[video_idx][2]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
+                t_index = (
+                    torch.arange(grid_t)
+                    * second_per_grid_ts[video_idx]
+                    * tokens_per_second
+                ).long()
+                t_index_split_chunk = split_list_into_ranges(
+                    t_index, t_ntoken_per_chunk
+                )
+                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2
+                pure_audio_len = place_num - 2
+                added_audio_len = 0
+                audio_llm_pos_ids_list: list[torch.Tensor] = []
+                for t_chunk in t_index_split_chunk:
+                    vision_ntoken_per_chunk = (
+                        len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
+                    )
+                    new_src_item.extend([video_token_id] * vision_ntoken_per_chunk)
+                    vision_llm_pos_ids_list = get_llm_pos_ids_for_vision(
+                        start_idx,
+                        video_idx,
+                        spatial_merge_size,
+                        t_chunk,
+                        grid_hs,
+                        grid_ws,
+                    ).split(1, dim=1)
+                    llm_pos_ids_list.extend(vision_llm_pos_ids_list)
+                    new_src_item.extend(
+                        min(t_ntoken_per_chunk, pure_audio_len - added_audio_len)
+                        * [audio_token_id]
+                    )
+                    audio_start_idx = (
+                        start_idx
+                        if len(audio_llm_pos_ids_list) == 0
+                        else audio_llm_pos_ids_list[-1][0].item() + 1
+                    )
+                    if min(t_ntoken_per_chunk, pure_audio_len - added_audio_len) > 0:
+                        audio_llm_pos_ids_list = (
+                            torch.arange(
+                                min(
+                                    t_ntoken_per_chunk, pure_audio_len - added_audio_len
+                                )
+                            ).expand(3, -1)
+                            + audio_start_idx
+                        ).split(1, dim=1)
+                    else:
+                        audio_llm_pos_ids_list = []
+                    added_audio_len += min(
+                        t_ntoken_per_chunk, pure_audio_len - added_audio_len
+                    )
+                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
+                if added_audio_len < pure_audio_len:
+                    new_src_item.extend(
+                        (pure_audio_len - added_audio_len) * [audio_token_id]
+                    )
+                    audio_llm_pos_ids_list = (
+                        torch.arange(pure_audio_len - added_audio_len).expand(3, -1)
+                        + llm_pos_ids_list[-1].max()
+                        + 1
+                    ).split(1, dim=1)
+                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
+                audio_idx += 1
+                video_idx += 1
+            # move to the next token
+            idx += len(new_src_item) - new_src_item_len
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1)
+        mrope_position_delta = (
+            torch.cat(llm_pos_ids_list, dim=1).max() + 1 - len(src_item)
+        )
+
+        return llm_positions, mrope_position_delta
 
-        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
-            **kwargs)
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return []
 
@@ -855,37 +1146,38 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
             if modality == "audio":
                 audio_embeddings = self._process_audio_input(multimodal_input)
-                multimodal_embeddings += audio_embeddings
+                multimodal_embeddings += tuple(audio_embeddings)
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    # TODO (ywang96): support overlapping modality embeddings so that
+    # `use_audio_in_video` will work on V1.
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-
-            # TODO (ywang96): support overlapping modality embeddings so that
-            # `use_audio_in_video` will work on V1.
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings, [
-                    self.config.image_token_index,
-                    self.config.video_token_index,
-                    self.config.audio_token_index
-                ])
-        return inputs_embeds
-
-    def get_multimodal_embeddings_v0(
-            self, **kwargs: object) -> Optional[NestedTensors]:
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+    def embed_multimodal_v0(self, **kwargs: object) -> NestedTensors | None:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         image_input = self._parse_and_validate_image_input(**kwargs)
         video_input = self._parse_and_validate_video_input(**kwargs)
@@ -906,59 +1198,29 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             multimodal_embeddings.append((video_embeds, "video"))
         return multimodal_embeddings
 
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[NestedTensors] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
-            return inputs_embeds
-
-        for embeddings, modality in multimodal_embeddings:
-            if modality == "audio":
-                placeholder_token_id = self.config.audio_token_index
-            if modality == "image":
-                placeholder_token_id = self.config.image_token_index
-            if modality == "video":
-                placeholder_token_id = self.config.video_token_index
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, embeddings, placeholder_token_id)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings_v0(**kwargs)
-            inputs_embeds = self.get_input_embeddings_v0(
-                input_ids, multimodal_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = ["talker.", "token2wav."]
         if self.audio_tower is None:
             skip_prefixes.extend(["audio_tower."])
@@ -969,8 +1231,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             self,
             skip_prefixes=skip_prefixes,
         )
-        loaded_weights = loader.load_weights(weights,
-                                             mapper=self.hf_to_vllm_mapper)
+        loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
         return loaded_weights
 
@@ -981,4 +1242,5 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="merger.",
-            tower_model=["visual.", "audio_tower."])
+            tower_model=["visual.", "audio_tower."],
+        )
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 79f4203018cf199d3cd9c8e533e4c161cf2753d4..7047cdcb4a217cb1735086dbf9de25cd7e717dba 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -25,59 +25,93 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
-from collections.abc import Iterable, Mapping, Sequence
+
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import lru_cache, partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
+import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange
 from transformers import BatchFeature
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
 from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
-    Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
-
-from vllm.attention.layer import check_upstream_fa_availability
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import maybe_get_vit_flash_attn_backend
+from vllm.attention.ops.vit_attn_wrappers import (
+    vit_flash_attn_wrapper,
+    vit_torch_sdpa_wrapper,
+    vit_xformers_attn_wrapper,
+)
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
+from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
-# yapf: disable
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
-# yapf: enable
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.vision import should_torch_compile_mm_vit
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.evs import (compute_mrope_for_media,
-                                 compute_retained_tokens_count,
-                                 compute_retention_mask,
-                                 recompute_mrope_positions)
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.evs import (
+    compute_mrope_for_media,
+    compute_retained_tokens_count,
+    compute_retention_mask,
+    recompute_mrope_positions,
+)
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
-from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsMultiModalPruning,
-                         SupportsPP, SupportsQuant)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsEagle3,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsMultiModalPruning,
+    SupportsPP,
+    SupportsQuant,
+)
 from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder
-from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo,
-                       apply_rotary_pos_emb_vision)
-from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
-from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
+from .qwen2_vl import (
+    Qwen2VLMultiModalProcessor,
+    Qwen2VLProcessingInfo,
+    apply_rotary_pos_emb_vision,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    cast_overflow_tensors,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from .vision import (
+    get_vit_attn_backend,
+    run_dp_sharded_mrope_vision_model,
+)
 
 logger = init_logger(__name__)
 
@@ -95,8 +129,9 @@ class Qwen2_5_VLImagePixelInputs(TensorSchema):
         - pixel_values shape: (num_patches, num_channels * patch_size *
           patch_size)
         - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
-          formatnum_channels * patch_size * patch_size
+          format.
     """
+
     type: Literal["pixel_values"]
 
     pixel_values: Annotated[
@@ -125,6 +160,7 @@ class Qwen2_5_VLImageEmbeddingInputs(TensorSchema):
         - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
           format
     """
+
     type: Literal["image_embeds"]
 
     image_embeds: Annotated[
@@ -138,8 +174,9 @@ class Qwen2_5_VLImageEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs,
-                              Qwen2_5_VLImageEmbeddingInputs]
+Qwen2_5_VLImageInputs: TypeAlias = (
+    Qwen2_5_VLImagePixelInputs | Qwen2_5_VLImageEmbeddingInputs
+)
 
 
 class Qwen2_5_VLVideoPixelInputs(TensorSchema):
@@ -159,6 +196,7 @@ class Qwen2_5_VLVideoPixelInputs(TensorSchema):
           grid along the temporal dimension in the 3D position IDs. Returned
           when `videos` is not `None`.
     """
+
     type: Literal["pixel_values_videos"]
 
     pixel_values_videos: Annotated[
@@ -172,7 +210,7 @@ class Qwen2_5_VLVideoPixelInputs(TensorSchema):
     ]
 
     second_per_grid_ts: Annotated[
-        Optional[torch.Tensor],
+        torch.Tensor | None,
         TensorShape("nv"),
     ]
 
@@ -192,6 +230,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
         - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
           format
     """
+
     type: Literal["video_embeds"]
 
     video_embeds: Annotated[
@@ -205,22 +244,24 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs,
-                              Qwen2_5_VLVideoEmbeddingInputs]
+Qwen2_5_VLVideoInputs: TypeAlias = (
+    Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
+)
 
 # === Vision Encoder === #
 
 
 class Qwen2_5_VisionMLP(nn.Module):
-
-    def __init__(self,
-                 in_features: int,
-                 hidden_features: int,
-                 bias: bool = False,
-                 act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        bias: bool = False,
+        act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
             input_size=in_features,
@@ -228,14 +269,17 @@ class Qwen2_5_VisionMLP(nn.Module):
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
-            disable_tp=use_data_parallel)
-
-        self.down_proj = RowParallelLinear(hidden_features,
-                                           in_features,
-                                           bias=bias,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj",
-                                           disable_tp=use_data_parallel)
+            disable_tp=use_data_parallel,
+        )
+
+        self.down_proj = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+            disable_tp=use_data_parallel,
+        )
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
@@ -245,47 +289,33 @@ class Qwen2_5_VisionMLP(nn.Module):
         return x_down
 
 
-def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int):
-    """All-gather the input tensor interleavely across model parallel group."""
-    import torch.distributed as dist
-    gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)]
-    dist.all_gather(gathered_tensors,
-                    local_tensor,
-                    group=parallel_state.get_tp_group().device_group)
-
-    gathered_tensors_split = [
-        torch.split(tensor, hidden_size // tp_size, -1)
-        for tensor in gathered_tensors
-    ]
-    ordered_tensors = [
-        tensor for pair in zip(*gathered_tensors_split) for tensor in pair
-    ]
-    result_tensor = torch.cat(ordered_tensors, dim=-1)
-    return result_tensor
-
-
 class Qwen2_5_VisionAttention(nn.Module):
-
     def __init__(
         self,
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
-        attn_backend: _Backend = _Backend.TORCH_SDPA,
+        attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA,
         use_upstream_fa: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         # Per attention head and per partition values.
-        self.tp_size = (1 if use_data_parallel else
-                        parallel_state.get_tensor_model_parallel_world_size())
+        self.tp_size = (
+            1
+            if use_data_parallel
+            else parallel_state.get_tensor_model_parallel_world_size()
+        )
         self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
         self.hidden_size_per_attention_head = dist_utils.divide(
-            projection_size, num_heads)
+            projection_size, num_heads
+        )
         self.num_attention_heads_per_partition = dist_utils.divide(
-            num_heads, self.tp_size)
+            num_heads, self.tp_size
+        )
 
         self.qkv = QKVParallelLinear(
             hidden_size=embed_dim,
@@ -295,141 +325,135 @@ class Qwen2_5_VisionAttention(nn.Module):
             bias=True,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv",
-            disable_tp=use_data_parallel)
+            disable_tp=use_data_parallel,
+        )
 
-        self.proj = RowParallelLinear(input_size=projection_size,
-                                      output_size=embed_dim,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.proj",
-                                      disable_tp=use_data_parallel)
+        self.proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
+        )
         self.attn_backend = attn_backend
         self.use_upstream_fa = use_upstream_fa
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
+        # On ROCm with FLASH_ATTN backend, upstream flash_attn is used
+        from vllm.platforms import current_platform
+
+        if (
+            current_platform.is_rocm()
+            and self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+        ):
+            self.use_upstream_fa = True
+        if current_platform.is_xpu():
+            self.use_upstream_fa = False
         self.is_flash_attn_backend = self.attn_backend in {
-            _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }
 
-    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
-        # [s, b, 3 * head * head_dim]
-        seq_len, bs, _ = qkv.shape
-        if self.tp_size > 1:
-            qkv = all_gather_interleave(qkv, self.qkv.hidden_size,
-                                        self.tp_size)
-
-        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
-        q, k, v = qkv.chunk(3, dim=2)
-
-        # 3 * [s, b, head * head_dim]
-        if self.tp_size > 1:
-            splitter = partial(dist_utils.split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
-            q = splitter(q)[self.tp_rank]
-            k = splitter(k)[self.tp_rank]
-            v = splitter(v)[self.tp_rank]
-
-        # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
-        new_shape = (seq_len, bs, self.num_attention_heads_per_partition,
-                     self.hidden_size_per_attention_head)
-        q, k, v = (x.view(*new_shape) for x in (q, k, v))
-        return q, k, v
-
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: torch.Tensor,  # Only used for Flash Attention
+        seqlens: torch.Tensor,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
+        seq_len, batch_size, _ = x.shape
 
-        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim]
-        q, k, v = self.split_qkv(x)
-        batch_size = q.shape[1]
+        qkv = einops.rearrange(
+            x,
+            "s b (three head head_dim) -> b s three head head_dim",
+            three=3,
+            head=self.num_attention_heads_per_partition,
+        )
 
-        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
-                   for x in (q, k, v))
         if rotary_pos_emb is not None:
-            # [2 * b, s, heads, head_dim]
-            qk_concat = torch.cat([q, k], dim=0)
-            qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb)
-            q, k = torch.chunk(qk_rotated, 2, dim=0)
+            qk, v = qkv[:, :, :2], qkv[:, :, 2]
+
+            qk_reshaped = einops.rearrange(
+                qk, "b s two head head_dim -> (two b) s head head_dim", two=2
+            )
+            qk_rotated = apply_rotary_pos_emb_vision(qk_reshaped, rotary_pos_emb)
+            qk_rotated = qk_rotated.view(
+                2,
+                batch_size,
+                seq_len,
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            )
+            q, k = qk_rotated.unbind(dim=0)
+        else:
+            q, k, v = qkv.unbind(dim=2)
 
         if self.is_flash_attn_backend:
-            if self.attn_backend == _Backend.ROCM_AITER_FA:
-                from aiter import flash_attn_varlen_func
-            else:
-                if self.use_upstream_fa:
-                    from flash_attn import flash_attn_varlen_func
-                else:
-                    from vllm.vllm_flash_attn import flash_attn_varlen_func
-
-            q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
-
-            output = flash_attn_varlen_func(q,
-                                            k,
-                                            v,
-                                            cu_seqlens_q=cu_seqlens,
-                                            cu_seqlens_k=cu_seqlens,
-                                            max_seqlen_q=max_seqlen,
-                                            max_seqlen_k=max_seqlen,
-                                            dropout_p=0.0,
-                                            causal=False)
-
-            context_layer = rearrange(output,
-                                      "(b s) h d -> s b (h d)",
-                                      b=batch_size).contiguous()
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+            context_layer = vit_flash_attn_wrapper(
+                q,
+                k,
+                v,
+                cu_seqlens,
+                max_seqlen,
+                batch_size,
+                self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA,
+                self.use_upstream_fa,
+            )
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
-            outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
-                q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
-                                 for x in [q_i, k_i, v_i])
-                output_i = F.scaled_dot_product_attention(q_i,
-                                                          k_i,
-                                                          v_i,
-                                                          dropout_p=0.0)
-                output_i = rearrange(output_i, "b h s d -> b s h d ")
-                outputs.append(output_i)
-            context_layer = torch.cat(outputs, dim=1)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
-        elif self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-            from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
-
-            context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
+            from vllm.platforms import current_platform
+
+            # Never remove the next contiguous logic
+            # Without it, hallucinations occur with the backend
+            if current_platform.is_rocm():
+                q = q.contiguous()
+                k = k.contiguous()
+                v = v.contiguous()
+            context_layer = vit_torch_sdpa_wrapper(
+                q,
+                k,
+                v,
+                cu_seqlens,
+            )
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
+            context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
 
         output, _ = self.proj(context_layer)
         return output
 
 
+@support_torch_compile(
+    dynamic_arg_dims={
+        "x": 0,
+        "cu_seqlens": 0,
+        "rotary_pos_emb": 0,
+        "seqlens": 0,
+    },
+    mark_unbacked_dims={"seqlens": 0},
+    enable_if=should_torch_compile_mm_vit,
+)
 class Qwen2_5_VisionBlock(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int,
         mlp_hidden_dim: int,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
-        attn_backend: _Backend = _Backend.TORCH_SDPA,
+        attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA,
         use_upstream_fa: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         if norm_layer is None:
@@ -444,35 +468,46 @@ class Qwen2_5_VisionBlock(nn.Module):
             prefix=f"{prefix}.attn",
             use_data_parallel=use_data_parallel,
             attn_backend=attn_backend,
-            use_upstream_fa=use_upstream_fa)
-        self.mlp = Qwen2_5_VisionMLP(dim,
-                                     mlp_hidden_dim,
-                                     act_fn=act_fn,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.mlp",
-                                     use_data_parallel=use_data_parallel)
+            use_upstream_fa=use_upstream_fa,
+            attn_backend_override=attn_backend_override,
+        )
+        self.mlp = Qwen2_5_VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_fn=act_fn,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: torch.Tensor,  # Only used for Flash Attention
+        seqlens: torch.Tensor,  # Only used for xFormers
     ) -> torch.Tensor:
-        x_attn = self.attn(self.norm1(x),
-                           cu_seqlens=cu_seqlens,
-                           rotary_pos_emb=rotary_pos_emb,
-                           max_seqlen=max_seqlen,
-                           seqlens=seqlens)
+        x_attn = self.attn(
+            self.norm1(x),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            max_seqlen=max_seqlen,
+            seqlens=seqlens,
+        )
         x_fused_norm, residual = self.norm2(x, residual=x_attn)
         x = residual + self.mlp(x_fused_norm)
         return x
 
 
+@support_torch_compile(
+    dynamic_arg_dims={
+        "x": 0,
+    },
+    enable_if=should_torch_compile_mm_vit,
+)
 class Qwen2_5_VisionPatchEmbed(nn.Module):
-
     def __init__(
         self,
         patch_size: int = 14,
@@ -486,29 +521,35 @@ class Qwen2_5_VisionPatchEmbed(nn.Module):
         self.hidden_size = hidden_size
 
         kernel_size = (temporal_patch_size, patch_size, patch_size)
-        self.proj = nn.Conv3d(in_channels,
-                              hidden_size,
-                              kernel_size=kernel_size,
-                              stride=kernel_size,
-                              bias=False)
+        self.proj = Conv3dLayer(
+            in_channels,
+            hidden_size,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=False,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         L, C = x.shape
-        x = x.view(L, -1, self.temporal_patch_size, self.patch_size,
-                   self.patch_size)
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
         x = self.proj(x).view(L, self.hidden_size)
         return x
 
 
+@support_torch_compile(
+    dynamic_arg_dims={
+        "x": 0,
+    },
+    enable_if=should_torch_compile_mm_vit,
+)
 class Qwen2_5_VisionPatchMerger(nn.Module):
-
     def __init__(
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -548,13 +589,13 @@ class Qwen2_5_VisionPatchMerger(nn.Module):
 
 
 class Qwen2_5_VisionRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
         self.dim = dim
         self.theta = theta
-        inv_freq = 1.0 / (theta**(
-            torch.arange(0, dim, 2, dtype=torch.float, device='cpu') / dim))
+        inv_freq = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, dtype=torch.float, device="cpu") / dim)
+        )
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self._seq_len_cached = 0
         self._freqs_cached = None
@@ -563,12 +604,18 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
         if seqlen > self._seq_len_cached:
             seqlen *= 2
             self._seq_len_cached = seqlen
-            self.inv_freq = 1.0 / (self.theta**(torch.arange(
-                0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device)
-                                                / self.dim))
-            seq = torch.arange(seqlen,
-                               device=self.inv_freq.device,
-                               dtype=self.inv_freq.dtype)
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
             freqs = torch.outer(seq, self.inv_freq)
             self._freqs_cached = freqs
 
@@ -578,14 +625,14 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
 
 
 class Qwen2_5_VisionTransformer(nn.Module):
-
     def __init__(
         self,
         vision_config: Qwen2_5_VLVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
 
@@ -604,13 +651,18 @@ class Qwen2_5_VisionTransformer(nn.Module):
         self.spatial_merge_size = vision_config.spatial_merge_size
         self.fullatt_block_indexes = vision_config.fullatt_block_indexes
         self.spatial_merge_unit = self.spatial_merge_size**2
-
-        self.patch_embed = Qwen2_5_VisionPatchEmbed(
-            patch_size=patch_size,
-            temporal_patch_size=temporal_patch_size,
-            in_channels=in_channels,
-            hidden_size=self.hidden_size,
-        )
+        # TODO[@lucaskabela]: Investigate fixing this usage
+        # see https://github.com/vllm-project/vllm/issues/27044
+        # DO NOT MOVE THIS IMPORT
+        from vllm.compilation.backends import set_model_tag
+
+        with set_model_tag("Qwen2_5_VisionPatchEmbed"):
+            self.patch_embed = Qwen2_5_VisionPatchEmbed(
+                patch_size=patch_size,
+                temporal_patch_size=temporal_patch_size,
+                in_channels=in_channels,
+                hidden_size=self.hidden_size,
+            )
 
         norm_layer = partial(RMSNorm, eps=norm_eps)
         head_dim = self.hidden_size // self.num_heads
@@ -618,43 +670,58 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
         use_upstream_fa = False
         self.attn_backend = get_vit_attn_backend(
-            head_size=head_dim, dtype=torch.get_default_dtype())
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(
-                torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            use_upstream_fa = True
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
 
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
-                _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
             raise RuntimeError(
                 f"Qwen2.5-VL does not support {self.attn_backend} backend now."
             )
+        with set_model_tag("Qwen2_5_VisionBlock"):
+            self.blocks = nn.ModuleList(
+                [
+                    Qwen2_5_VisionBlock(
+                        dim=self.hidden_size,
+                        num_heads=self.num_heads,
+                        mlp_hidden_dim=vision_config.intermediate_size,
+                        act_fn=get_act_and_mul_fn(vision_config.hidden_act),
+                        norm_layer=norm_layer,
+                        quant_config=quant_config,
+                        prefix=f"{prefix}.blocks.{layer_idx}",
+                        use_data_parallel=use_data_parallel,
+                        attn_backend=self.attn_backend,
+                        use_upstream_fa=use_upstream_fa,
+                        attn_backend_override=attn_backend_override,
+                    )
+                    for layer_idx in range(depth)
+                ]
+            )
 
-        self.blocks = nn.ModuleList([
-            Qwen2_5_VisionBlock(
-                dim=self.hidden_size,
-                num_heads=self.num_heads,
-                mlp_hidden_dim=vision_config.intermediate_size,
-                act_fn=get_act_and_mul_fn(vision_config.hidden_act),
+        with set_model_tag("Qwen2_5_VisionPatchMerger"):
+            self.merger = Qwen2_5_VisionPatchMerger(
+                d_model=vision_config.out_hidden_size,
+                context_dim=self.hidden_size,
                 norm_layer=norm_layer,
+                spatial_merge_size=self.spatial_merge_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.blocks.{layer_idx}",
+                prefix=f"{prefix}.merger",
                 use_data_parallel=use_data_parallel,
-                attn_backend=self.attn_backend,
-                use_upstream_fa=use_upstream_fa) for layer_idx in range(depth)
-        ])
-        self.merger = Qwen2_5_VisionPatchMerger(
-            d_model=vision_config.out_hidden_size,
-            context_dim=self.hidden_size,
-            norm_layer=norm_layer,
-            spatial_merge_size=self.spatial_merge_size,
-            quant_config=quant_config,
-            prefix=f"{prefix}.merger",
-            use_data_parallel=use_data_parallel,
-        )
+            )
 
     @property
     def dtype(self) -> torch.dtype:
@@ -667,48 +734,66 @@ class Qwen2_5_VisionTransformer(nn.Module):
     def rotary_pos_emb_thw(self, t, h, w):
         hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
         wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-        hpos_ids = hpos_ids.reshape(
-            h // self.spatial_merge_size,
-            self.spatial_merge_size,
-            w // self.spatial_merge_size,
-            self.spatial_merge_size,
-        ).permute(0, 2, 1, 3).flatten()
-        wpos_ids = wpos_ids.reshape(
-            h // self.spatial_merge_size,
-            self.spatial_merge_size,
-            w // self.spatial_merge_size,
-            self.spatial_merge_size,
-        ).permute(0, 2, 1, 3).flatten()
+        hpos_ids = (
+            hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            .permute(0, 2, 1, 3)
+            .flatten()
+        )
+        wpos_ids = (
+            wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            .permute(0, 2, 1, 3)
+            .flatten()
+        )
         pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
         max_size = max(h, w)
         rotary_pos_emb_full = self.rotary_pos_emb(max_size)
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         rotary_pos_emb = rotary_pos_emb.reshape(
             rotary_pos_emb.shape[0] // self.spatial_merge_unit,
-            self.spatial_merge_unit, -1)
+            self.spatial_merge_unit,
+            -1,
+        )
 
         return rotary_pos_emb
 
     def get_window_index_thw(self, grid_t, grid_h, grid_w):
-        vit_merger_window_size = (self.window_size //
-                                  self.spatial_merge_size // self.patch_size)
+        vit_merger_window_size = (
+            self.window_size // self.spatial_merge_size // self.patch_size
+        )
 
         llm_grid_h = grid_h // self.spatial_merge_size
         llm_grid_w = grid_w // self.spatial_merge_size
         index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
-            grid_t, llm_grid_h, llm_grid_w)
+            grid_t, llm_grid_h, llm_grid_w
+        )
         pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
         pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
         num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
         num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
-        index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100)
-        index_padded = index_padded.reshape(grid_t, num_windows_h,
-                                            vit_merger_window_size,
-                                            num_windows_w,
-                                            vit_merger_window_size)
+        index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+        index_padded = index_padded.reshape(
+            grid_t,
+            num_windows_h,
+            vit_merger_window_size,
+            num_windows_w,
+            vit_merger_window_size,
+        )
         index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
-            grid_t, num_windows_h * num_windows_w, vit_merger_window_size,
-            vit_merger_window_size)
+            grid_t,
+            num_windows_h * num_windows_w,
+            vit_merger_window_size,
+            vit_merger_window_size,
+        )
         seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
         index_padded = index_padded.reshape(-1)
         index_new = index_padded[index_padded != -100]
@@ -720,35 +805,40 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
     @lru_cache(maxsize=1024)  # noqa: B019
     def get_rope_by_thw(self, t, h, w):
-        window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw(
-            t, h, w)
+        window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw(t, h, w)
         rotary_pos_emb_thw = self.rotary_pos_emb_thw(t, h, w)
         rotary_pos_emb_thw = rotary_pos_emb_thw[window_index_thw, :, :]
         rotary_pos_emb_thw = rotary_pos_emb_thw.flatten(start_dim=0, end_dim=1)
         cu_seqlens_thw = torch.repeat_interleave(
-            torch.tensor([h * w], dtype=torch.int32), t)
-        return (rotary_pos_emb_thw, window_index_thw, cu_seqlens_window_thw,
-                cu_seqlens_thw)
+            torch.tensor([h * w], dtype=torch.int32), t
+        )
+        return (
+            rotary_pos_emb_thw,
+            window_index_thw,
+            cu_seqlens_window_thw,
+            cu_seqlens_thw,
+        )
 
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
-        max_seqlen, seqlens = None, None
-        if (self.attn_backend == _Backend.FLASH_ATTN
-                or self.attn_backend == _Backend.ROCM_AITER_FA):
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        max_seqlen = torch.zeros([], device=cu_seqlens.device)
+        seqlens = torch.zeros(1, device=cu_seqlens.device)
+        if self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }:
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
+            seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
         return max_seqlen, seqlens
 
     @staticmethod
     def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
         # building the inverse permutation in O(n) time
         inv = torch.empty_like(perm, pin_memory=is_pin_memory_available())
-        inv[perm] = torch.arange(perm.numel(),
-                                 device=perm.device,
-                                 dtype=perm.dtype)
+        inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
         return inv
 
     def forward(
@@ -781,10 +871,9 @@ class Qwen2_5_VisionTransformer(nn.Module):
             ) = self.get_rope_by_thw(t, h, w)
 
             window_index.append(window_index_thw + window_index_id)
-            window_index_id += (t * llm_h * llm_w)
+            window_index_id += t * llm_h * llm_w
 
-            cu_seqlens_window_thw = (cu_seqlens_window_thw +
-                                     cu_window_seqlens_last)
+            cu_seqlens_window_thw = cu_seqlens_window_thw + cu_window_seqlens_last
             cu_window_seqlens_last = cu_seqlens_window_thw[-1]
             cu_window_seqlens.append(cu_seqlens_window_thw)
 
@@ -804,23 +893,22 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
         # transformers
         # pre-compute seqlens for window/full attn to reduce cuMemcpy operations
-        max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(
-            cu_seqlens)
+        max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(cu_seqlens)
         max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
-            cu_window_seqlens)
+            cu_window_seqlens
+        )
 
         cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True)
-        cu_window_seqlens = cu_window_seqlens.to(device=self.device,
-                                                 non_blocking=True)
-        rotary_pos_emb = rotary_pos_emb.to(device=self.device,
-                                           non_blocking=True)
-        window_index = window_index.to(device=hidden_states.device,
-                                       non_blocking=True)
-        reverse_indices = reverse_indices.to(device=hidden_states.device,
-                                             non_blocking=True)
+        cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True)
+        rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True)
+        window_index = window_index.to(device=hidden_states.device, non_blocking=True)
+        reverse_indices = reverse_indices.to(
+            device=hidden_states.device, non_blocking=True
+        )
 
         hidden_states = hidden_states.reshape(
-            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
         hidden_states = hidden_states[window_index, :, :]
         hidden_states = hidden_states.reshape(seq_len, -1)
 
@@ -854,8 +942,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("attn.qkv.", "attn.q.", "q"),
@@ -868,7 +955,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -878,15 +965,13 @@ class Qwen2_5_VisionTransformer(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2_5_VLConfig)
 
@@ -899,7 +984,6 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
 
 
 class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -917,8 +1001,7 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
         out_mm_kwargs: MultiModalKwargs,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()
 
@@ -937,13 +1020,19 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
             num_tokens = int(grid_thw.prod()) // merge_length
 
             # EVS-specific code
-            video_pruning_rate = self.info.ctx.get_mm_config(
-            ).video_pruning_rate
-            if (modality == "video" and video_pruning_rate is not None
-                    and video_pruning_rate > 0.0):
+            video_pruning_rate = self.info.ctx.get_mm_config().video_pruning_rate
+            if (
+                modality == "video"
+                and video_pruning_rate is not None
+                and video_pruning_rate > 0.0
+            ):
+                T, H, W = map(int, grid_thw)
+                tokens_per_frame = (H // image_processor.merge_size) * (
+                    W // image_processor.merge_size
+                )
                 num_tokens = compute_retained_tokens_count(
-                    grid_thw,
-                    image_processor.merge_size,
+                    tokens_per_frame,
+                    T,
                     video_pruning_rate,
                 )
             # End of EVS-specific code
@@ -954,20 +1043,29 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
             PromptReplacement(
                 modality=modality,
                 target=[placeholder[modality]],
-                replacement=partial(get_replacement_qwen2vl,
-                                    modality=modality),
-            ) for modality in ("image", "video")
+                replacement=partial(get_replacement_qwen2vl, modality=modality),
+            )
+            for modality in ("image", "video")
         ]
 
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2_5_VLMultiModalProcessor,
     info=Qwen2_5_VLProcessingInfo,
-    dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
-class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                         SupportsLoRA, SupportsPP,
-                                         SupportsQuant,
-                                         SupportsMultiModalPruning):
+    dummy_inputs=Qwen2_5_VLDummyInputsBuilder,
+)
+class Qwen2_5_VLForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsLoRA,
+    SupportsPP,
+    SupportsQuant,
+    SupportsEagle3,
+    SupportsMultiModalPruning,
+    SupportsMRoPE,
+):
+    merge_by_field_config = True
+    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
@@ -983,12 +1081,129 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             # mapping for original checkpoint
             "lm_head.": "language_model.lm_head.",
             "model.": "language_model.model.",
-        })
+        }
+    )
 
     supports_encoder_tp_data = True
 
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+
+        hf_config = self.config
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id
+        ).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_videos > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = image_grid_thw[image_index]
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = video_grid_thw[video_index]
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                    * video_second_per_grid_t
+                    * tokens_per_second
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+
+        return llm_positions, mrope_position_delta
+
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1003,19 +1218,28 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
         self.config = config
+        self.vllm_config = vllm_config
         self.multimodal_config = multimodal_config
         self.video_pruning_rate = multimodal_config.video_pruning_rate
         self.is_multimodal_pruning_enabled = (
-            multimodal_config.is_multimodal_pruning_enabled())
+            multimodal_config.is_multimodal_pruning_enabled()
+        )
 
-        if multimodal_config.get_limit_per_prompt("image") or \
-            multimodal_config.get_limit_per_prompt("video"):
+        if multimodal_config.get_limit_per_prompt(
+            "image"
+        ) or multimodal_config.get_limit_per_prompt("video"):
+            attn_backend_override = (
+                multimodal_config.mm_encoder_attn_backend
+                if multimodal_config is not None
+                else None
+            )
             self.visual = Qwen2_5_VisionTransformer(
-                config.vision_config,
+                vision_config=config.vision_config,
                 norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                 quant_config=self.quant_config,
                 prefix=maybe_prefix(prefix, "visual"),
                 use_data_parallel=self.use_data_parallel,
+                attn_backend_override=attn_backend_override,
             )
         else:
             self.visual = None
@@ -1027,26 +1251,19 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
-
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return mm_input.reshape(-1, mm_input.shape[-1])
-        else:
-            return torch.concat(mm_input)
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.language_model.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        num_layers = len(self.language_model.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]:
+        self, **kwargs: object
+    ) -> Qwen2_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1055,28 +1272,22 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            return Qwen2_5_VLImagePixelInputs(type="pixel_values",
-                                              pixel_values=pixel_values,
-                                              image_grid_thw=image_grid_thw)
+            return Qwen2_5_VLImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
             return Qwen2_5_VLImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
-                image_grid_thw=image_grid_thw)
+                image_grid_thw=image_grid_thw,
+            )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]:
+        self, **kwargs: object
+    ) -> Qwen2_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1086,12 +1297,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos, "video pixel values")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-            if second_per_grid_ts is not None and second_per_grid_ts.ndim == 2:
-                second_per_grid_ts = second_per_grid_ts.squeeze(-1)
             return Qwen2_5_VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -1100,20 +1305,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Qwen2_5_VLVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,
-                video_grid_thw=video_grid_thw)
+                video_grid_thw=video_grid_thw,
+            )
 
     def _process_image_input(
-            self,
-            image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
-
+        self, image_input: Qwen2_5_VLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
         grid_thw_list = grid_thw.tolist()
@@ -1122,27 +1322,24 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"]
-
-            if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
-            else:
-                image_embeds = self.visual(pixel_values,
-                                           grid_thw=grid_thw_list)
+            with set_forward_context(None, self.vllm_config):
+                if self.use_data_parallel:
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                    )
+                else:
+                    image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each image item.
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _postprocess_image_embeds_evs(
-            self, image_embeds_split: tuple[torch.Tensor, ...],
-            image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
+        self,
+        image_embeds_split: tuple[torch.Tensor, ...],
+        image_input: Qwen2_5_VLImageInputs,
+    ) -> tuple[torch.Tensor, ...]:
         """
         Append mrope positions for each for images.
         This is necessary to recover correct mrope
@@ -1163,17 +1360,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         grid_thw_list = grid_thw.tolist()
         image_embeds_out = []
         for emb, size in zip(image_embeds_split, grid_thw_list):
-            positions = compute_mrope_for_media(size,
-                                                merge_size).to(emb.device)
+            positions = compute_mrope_for_media(size, merge_size).to(emb.device)
             emb = torch.cat([emb, positions], dim=1)
             image_embeds_out.append(emb)
         image_embeds_split = image_embeds_out
         return tuple(image_embeds_split)
 
     def _process_video_input(
-            self,
-            video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
-
+        self, video_input: Qwen2_5_VLVideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
         grid_thw_list = grid_thw.tolist()
@@ -1182,26 +1377,29 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
-            if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values_videos,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
-            else:
-                video_embeds = self.visual(pixel_values_videos,
-                                           grid_thw=grid_thw_list)
+            with set_forward_context(None, self.vllm_config):
+                if self.use_data_parallel:
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual,
+                        pixel_values_videos,
+                        grid_thw_list,
+                        rope_type="rope_3d",
+                    )
+                else:
+                    video_embeds = self.visual(
+                        pixel_values_videos, grid_thw=grid_thw_list
+                    )
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _postprocess_video_embeds_evs(
-            self, video_embeds_split: tuple[torch.Tensor, ...],
-            video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
+        self,
+        video_embeds_split: tuple[torch.Tensor, ...],
+        video_input: Qwen2_5_VLVideoInputs,
+    ) -> tuple[torch.Tensor, ...]:
         """
         Prunes video embeddings via Efficient Video Sampling (EVS)
         and then appends mrope positions for each retained embeddings
@@ -1226,9 +1424,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         tokens_per_second = self.config.vision_config.tokens_per_second
 
         video_embeds_out = []
-        for emb, size, video_second_per_grid_t in zip(video_embeds_split,
-                                                      grid_thw_list,
-                                                      second_per_grid_ts):
+        for emb, size, video_second_per_grid_t in zip(
+            video_embeds_split, grid_thw_list, second_per_grid_ts
+        ):
             # For each video, we compute retention mask using EVS
             retention_mask = compute_retention_mask(
                 emb,
@@ -1280,20 +1478,19 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         vision_start_token_id = self.config.vision_start_token_id
 
         # Device
-        device = (multimodal_embeddings[0].device
-                  if len(multimodal_embeddings) else mrope_positions.device)
+        device = (
+            multimodal_embeddings[0].device
+            if len(multimodal_embeddings)
+            else mrope_positions.device
+        )
 
         # Tensors
-        input_ids_t = torch.as_tensor(input_ids,
-                                      device=device,
-                                      dtype=torch.long)
+        input_ids_t = torch.as_tensor(input_ids, device=device, dtype=torch.long)
 
-        # fmt: off
-        mm_embeddings_out = [mm[:, :-4] for mm in
-                             multimodal_embeddings]
-        mm_embeddings_pos = [mm[:, -4:].permute(1, 0).long() for mm in
-                             multimodal_embeddings]
-        # fmt: in
+        mm_embeddings_out = [mm[:, :-4] for mm in multimodal_embeddings]
+        mm_embeddings_pos = [
+            mm[:, -4:].permute(1, 0).long() for mm in multimodal_embeddings
+        ]
 
         positions, mrope_positions_delta = recompute_mrope_positions(
             input_ids_t,
@@ -1313,24 +1510,27 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values", "image_embeds"
-                             ) and "image" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "image"] = self._parse_and_validate_image_input(**kwargs)
-            if input_key in ("pixel_values_videos", "video_embeds"
-                             ) and "video" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "video"] = self._parse_and_validate_video_input(**kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "video" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
+                    **kwargs
+                )
         return mm_input_by_modality
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
-        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
-            **kwargs)
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return []
 
@@ -1343,76 +1543,29 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
+                image_embeddings = self._process_image_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
-                    vision_embeddings = self._postprocess_image_embeds_evs(
-                        vision_embeddings, multimodal_input
+                    image_embeddings = self._postprocess_image_embeds_evs(
+                        image_embeddings, multimodal_input
                     )
-                multimodal_embeddings += vision_embeddings
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
                     )
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [self.config.image_token_id, self.config.video_token_id])
-        return inputs_embeds
-
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Qwen2_5_VLImageInputs] = None,
-        video_input: Optional[Qwen2_5_VLVideoInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            if self.is_multimodal_pruning_enabled:
-                image_embeds = self._postprocess_image_embeds_evs(
-                    image_embeds, image_input
-                )
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            if self.is_multimodal_pruning_enabled:
-                video_embeds = self._postprocess_video_embeds_evs(
-                    video_embeds, video_input
-                )
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Qwen2.5-VL.
 
         Args:
@@ -1427,26 +1580,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-                input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
@@ -1458,12 +1591,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.visual is None:
             skip_prefixes.extend(["visual."])
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 762ab42e5929e1323c6cb27d064ca4e22ad22599..7e883a393aa8d4817018497eb73dbe43c18e0a52 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -22,35 +22,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
 from transformers import BatchFeature
-from transformers.models.qwen2_audio import (Qwen2AudioConfig,
-                                             Qwen2AudioEncoder,
-                                             Qwen2AudioProcessor)
+from transformers.models.qwen2_audio import (
+    Qwen2AudioConfig,
+    Qwen2AudioEncoder,
+    Qwen2AudioProcessor,
+)
 from transformers.models.whisper import WhisperFeatureExtractor
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (AudioItem, ModalityData,
-                                    MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems,
-                                   ModalityDataItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    AudioItem,
+    ModalityData,
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    DictEmbeddingItems,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
 
 # # === Audio Inputs === #
@@ -60,9 +75,10 @@ class Qwen2AudioFeatureInputs(TensorSchema):
         - na: Number of audios
         - nmb: Number of mel bins
     """
+
     type: Literal["audio_features"]
     input_features: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("na", "nmb", 3000),
     ]
 
@@ -80,6 +96,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
         - hs: Hidden size (must match the hidden size of language model
           backbone)
     """
+
     type: Literal["audio_embeds"] = "audio_embeds"
 
     audio_embeds: Annotated[
@@ -88,13 +105,12 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2AudioInputs = Union[Qwen2AudioFeatureInputs, Qwen2AudioEmbeddingInputs]
+Qwen2AudioInputs: TypeAlias = Qwen2AudioFeatureInputs | Qwen2AudioEmbeddingInputs
 
 # === Audio Encoder === #
 
 
 class Qwen2AudioMultiModalProjector(nn.Module):
-
     def __init__(self, audio_hidden_size: int, text_hidden_size: int):
         super().__init__()
         self.linear = nn.Linear(audio_hidden_size, text_hidden_size, bias=True)
@@ -112,27 +128,23 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
 
 
 class Qwen2AudioProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2AudioConfig)
 
     def get_hf_processor(self, **kwargs: object) -> Qwen2AudioProcessor:
         return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)
 
-    def get_feature_extractor(self,
-                              **kwargs: object) -> WhisperFeatureExtractor:
+    def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor:
         hf_processor = self.get_hf_processor(**kwargs)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
 
-class Qwen2AudioDummyInputsBuilder(
-        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
-
+class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
 
@@ -145,6 +157,7 @@ class Qwen2AudioDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         feature_extractor = self.info.get_feature_extractor()
 
@@ -152,9 +165,12 @@ class Qwen2AudioDummyInputsBuilder(
         audio_len = feature_extractor.chunk_length * sampling_rate
         num_audios = mm_counts.get("audio", 0)
 
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+            "audio": self._get_dummy_audios(
+                length=audio_len, num_audios=num_audios, overrides=audio_overrides
+            )
         }
 
 
@@ -167,11 +183,10 @@ def _qwen2audio_field_config(hf_inputs: Mapping[str, torch.Tensor]):
 
 
 class Qwen2AudioMultiModalDataParser(MultiModalDataParser):
-
     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[AudioItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
@@ -183,13 +198,10 @@ class Qwen2AudioMultiModalDataParser(MultiModalDataParser):
         return super()._parse_audio_data(data)
 
 
-class Qwen2AudioMultiModalProcessor(
-        BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
-
+class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
-        return Qwen2AudioMultiModalDataParser(
-            target_sr=feature_extractor.sampling_rate)
+        return Qwen2AudioMultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
         self,
@@ -237,17 +249,14 @@ class Qwen2AudioMultiModalProcessor(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()
 
         # Use getattr with default to be compatible with transformers<4.48
         audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
-        audio_bos_token = getattr(processor, "audio_bos_token",
-                                  "<|audio_bos|>")
-        audio_eos_token = getattr(processor, "audio_eos_token",
-                                  "<|audio_eos|>")
+        audio_bos_token = getattr(processor, "audio_bos_token", "<|audio_bos|>")
+        audio_eos_token = getattr(processor, "audio_eos_token", "<|audio_eos|>")
 
         audio_token_id = vocab[audio_token]
         audio_bos_id = vocab[audio_bos_token]
@@ -260,26 +269,27 @@ class Qwen2AudioMultiModalProcessor(
         else:
             assert isinstance(feature_attention_mask, torch.Tensor)
             _, audio_output_lens = _get_feat_extract_output_lengths(
-                feature_attention_mask.sum(-1))
+                feature_attention_mask.sum(-1)
+            )
 
             audio_output_lengths = audio_output_lens.tolist()
 
         def get_replacement_qwen2_audio(item_idx: int):
-
             if audio_output_lengths:
                 num_features = audio_output_lengths[item_idx]
             else:
                 audio_embeds = out_mm_data["audio_embeds"][item_idx]
-                assert len(audio_embeds.shape
-                           ) == 2, "audio_embeds must be a 2D tensor"
+                assert len(audio_embeds.shape) == 2, "audio_embeds must be a 2D tensor"
                 num_features = audio_embeds.shape[0]
 
             if num_features == 0:
                 audios = mm_items.get_items("audio", AudioProcessorItems)
                 audio_len = audios.get_audio_length(item_idx)
 
-                raise ValueError(f"The audio (len={audio_len}) is too short "
-                                 "to be represented inside the model")
+                raise ValueError(
+                    f"The audio (len={audio_len}) is too short "
+                    "to be represented inside the model"
+                )
 
             audio_tokens = [audio_token_id] * num_features
 
@@ -300,12 +310,13 @@ class Qwen2AudioMultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2AudioMultiModalProcessor,
     info=Qwen2AudioProcessingInfo,
-    dummy_inputs=Qwen2AudioDummyInputsBuilder)
-class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                         SupportsPP):
+    dummy_inputs=Qwen2AudioDummyInputsBuilder,
+)
+class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"
 
@@ -321,7 +332,8 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         self.audio_tower = Qwen2AudioEncoder(config.audio_config)
         self.multi_modal_projector = Qwen2AudioMultiModalProjector(
-            config.audio_config.d_model, config.text_config.hidden_size)
+            config.audio_config.d_model, config.text_config.hidden_size
+        )
 
         self.quant_config = quant_config
 
@@ -333,51 +345,36 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
-
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            return mm_input.reshape(-1, *mm_input.shape[2:])
-        else:
-            return torch.concat(mm_input)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[Qwen2AudioInputs]:
-        input_features = kwargs.pop('input_features', None)
-        audio_embeds = kwargs.pop('audio_embeds', None)
-        feature_attention_mask = kwargs.pop('feature_attention_mask', None)
+        self, **kwargs: object
+    ) -> Qwen2AudioInputs | None:
+        input_features = kwargs.pop("input_features", None)
+        audio_embeds = kwargs.pop("audio_embeds", None)
+        feature_attention_mask = kwargs.pop("feature_attention_mask", None)
 
         if input_features is None and audio_embeds is None:
             return None
 
         if audio_embeds is not None:
-            if not isinstance(audio_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio embeds. "
-                                 f"Got type: {type(audio_embeds)}")
-            audio_embeds = self._validate_and_reshape_mm_tensor(
-                audio_embeds, "audio_embeds")
-            return Qwen2AudioEmbeddingInputs(type="audio_embeds",
-                                             audio_embeds=audio_embeds)
+            return Qwen2AudioEmbeddingInputs(
+                type="audio_embeds", audio_embeds=audio_embeds
+            )
 
         if input_features is not None:
-            input_features = self._validate_and_reshape_mm_tensor(
-                input_features, 'input_features')
-            feature_attention_mask = self._validate_and_reshape_mm_tensor(
-                feature_attention_mask, 'feature_attention_mask')
             return Qwen2AudioFeatureInputs(
                 type="audio_features",
                 input_features=input_features,
-                feature_attention_mask=feature_attention_mask)
+                feature_attention_mask=feature_attention_mask,
+            )
 
         raise AssertionError("This line should be unreachable.")
 
     def _process_audio_input(
         self, audio_input: Qwen2AudioInputs
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if audio_input["type"] == "audio_embeds":
             audio_embeds = audio_input["audio_embeds"]
             return tuple(audio_embeds)
@@ -387,103 +384,90 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         audio_feat_lengths, audio_output_lengths = (
             self.audio_tower._get_feat_extract_output_lengths(
-                feature_attention_mask.sum(-1)))
+                feature_attention_mask.sum(-1)
+            )
+        )
 
         batch_size, _, max_mel_seq_len = input_features.shape
         max_seq_len = (max_mel_seq_len - 2) // 2 + 1
         # Create a sequence tensor of shape (batch_size, max_seq_len)
-        seq_range = (torch.arange(
-            0,
-            max_seq_len,
-            dtype=audio_feat_lengths.dtype,
-            device=audio_feat_lengths.device).unsqueeze(0).expand(
-                batch_size, max_seq_len))
+        seq_range = (
+            torch.arange(
+                0,
+                max_seq_len,
+                dtype=audio_feat_lengths.dtype,
+                device=audio_feat_lengths.device,
+            )
+            .unsqueeze(0)
+            .expand(batch_size, max_seq_len)
+        )
         lengths_expand = audio_feat_lengths.unsqueeze(-1).expand(
-            batch_size, max_seq_len)
+            batch_size, max_seq_len
+        )
         # Create mask
         padding_mask = seq_range >= lengths_expand
 
-        audio_attention_mask_ = padding_mask.view(
-            batch_size, 1, 1, max_seq_len).expand(batch_size, 1, max_seq_len,
-                                                  max_seq_len)
+        audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
+            batch_size, 1, max_seq_len, max_seq_len
+        )
         audio_attention_mask = audio_attention_mask_.to(
             dtype=self.audio_tower.conv1.weight.dtype,
-            device=self.audio_tower.conv1.weight.device)
+            device=self.audio_tower.conv1.weight.device,
+        )
         audio_attention_mask[audio_attention_mask_] = float("-inf")
 
-        audio_outputs = self.audio_tower(input_features,
-                                         attention_mask=audio_attention_mask)
+        audio_outputs = self.audio_tower(
+            input_features, attention_mask=audio_attention_mask
+        )
         selected_audio_feature = audio_outputs.last_hidden_state
         audio_features = self.multi_modal_projector(selected_audio_feature)
         num_audios, max_audio_tokens, embed_dim = audio_features.shape
         audio_output_lengths = audio_output_lengths.unsqueeze(1)
-        audio_features_mask = torch.arange(max_audio_tokens).expand(
-            num_audios, max_audio_tokens).to(
-                audio_output_lengths.device) < audio_output_lengths
-        masked_audio_features = audio_features[audio_features_mask].view(
-            -1, embed_dim)
+        audio_features_mask = (
+            torch.arange(max_audio_tokens)
+            .expand(num_audios, max_audio_tokens)
+            .to(audio_output_lengths.device)
+            < audio_output_lengths
+        )
+        masked_audio_features = audio_features[audio_features_mask].view(-1, embed_dim)
 
         # Split to tuple of embeddings for individual audio input.
-        return torch.split(masked_audio_features,
-                           audio_output_lengths.flatten().tolist())
+        return torch.split(
+            masked_audio_features, audio_output_lengths.flatten().tolist()
+        )
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return []
         masked_audio_features = self._process_audio_input(audio_input)
         return masked_audio_features
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.audio_token_index)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 6a9acaf2c3fe03064873e6c9063ccde13b9d595e..2ff0d19df238ce3e473c7813be180035be979f79 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -24,9 +24,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2MoE model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -39,70 +40,87 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Qwen2MoeMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
+        expert_gate: torch.nn.Linear | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
+        self.expert_gate = expert_gate
 
     def forward(self, x):
         gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
+        out = self.act_fn(gate_up)
+        out, _ = self.down_proj(out)
 
+        if self.expert_gate is not None:
+            out = F.sigmoid(self.expert_gate(x)) * out
+
+        return out
 
-class Qwen2MoeSparseMoeBlock(nn.Module):
 
+class Qwen2MoeSparseMoeBlock(nn.Module):
     def __init__(
         self,
         config: Qwen2MoeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -111,77 +129,78 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
-
-        self.experts = FusedMoE(num_experts=config.num_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts")
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
+
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=config.shared_expert_intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
-                reduce_results=self.experts.must_reduce_shared_expert_outputs(
-                ),
+                reduce_results=False,
+                expert_gate=self.shared_expert_gate,
                 prefix=f"{prefix}.shared_expert",
             )
         else:
             self.shared_expert = None
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size,
-                                                  1,
-                                                  bias=False)
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_expert,
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
         hidden_dim = hidden_states.shape[-1]
         hidden_states = hidden_states.view(-1, hidden_dim)
-        shared_output = None
-        if self.shared_expert is not None:
-            shared_output = self.shared_expert(hidden_states)
-            if self.shared_expert_gate is not None:
-                shared_output = F.sigmoid(
-                    self.shared_expert_gate(hidden_states)) * shared_output
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
-        if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+        if self.shared_expert is not None:
+            final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
         if self.tp_size > 1:
             final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
-                final_hidden_states)
+                final_hidden_states
+            )
 
         return final_hidden_states.view(orig_shape)
 
 
 class Qwen2MoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -207,19 +226,23 @@ class Qwen2MoeAttention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
         self.dual_chunk_attention_config = dual_chunk_attention_config
 
-        self.qkv_proj = QKVParallelLinear(hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=True,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -240,7 +263,10 @@ class Qwen2MoeAttention(nn.Module):
             **{
                 "layer_idx": extract_layer_index(prefix),
                 "dual_chunk_attention_config": dual_chunk_attention_config,
-            } if dual_chunk_attention_config else {})
+            }
+            if dual_chunk_attention_config
+            else {},
+        )
 
     def forward(
         self,
@@ -256,23 +282,21 @@ class Qwen2MoeAttention(nn.Module):
 
 
 class Qwen2MoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Qwen2MoeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        dual_chunk_attention_config = getattr(config,
-                                              "dual_chunk_attention_config",
-                                              None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -289,53 +313,53 @@ class Qwen2MoeDecoderLayer(nn.Module):
         # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
         # `mlp_only_layers` in the config.
         layer_idx = extract_layer_index(prefix)
-        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
-                           config.mlp_only_layers)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
         if (layer_idx not in mlp_only_layers) and (
-                config.num_experts > 0 and
-            (layer_idx + 1) % config.decoder_sparse_step == 0):
-            self.mlp = Qwen2MoeSparseMoeBlock(config=config,
-                                              quant_config=quant_config,
-                                              prefix=f"{prefix}.mlp")
+            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
+        ):
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+            )
         else:
-            self.mlp = Qwen2MoeMLP(hidden_size=config.hidden_size,
-                                   intermediate_size=config.intermediate_size,
-                                   hidden_act=config.hidden_act,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp")
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class Qwen2MoeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -352,32 +376,34 @@ class Qwen2MoeModel(nn.Module):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: Qwen2MoeDecoderLayer(config=config,
-                                                cache_config=cache_config,
-                                                quant_config=quant_config,
-                                                prefix=prefix),
+            lambda prefix: Qwen2MoeDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -386,24 +412,23 @@ class Qwen2MoeModel(nn.Module):
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(positions, hidden_states, residual)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_experts)
+            num_experts=self.config.num_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -417,7 +442,7 @@ class Qwen2MoeModel(nn.Module):
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -431,8 +456,9 @@ class Qwen2MoeModel(nn.Module):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if ((name.endswith(".bias") or name.endswith("_bias"))
-                        and name not in params_dict):
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -455,21 +481,25 @@ class Qwen2MoeModel(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -477,7 +507,8 @@ class Qwen2MoeModel(nn.Module):
                     # Remapping the name of FP8 kv-scale.
                     if name.endswith("kv_scale"):
                         remapped_kv_scale_name = name.replace(
-                            ".kv_scale", ".attn.kv_scale")
+                            ".kv_scale", ".attn.kv_scale"
+                        )
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
                                 "Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.",  #  noqa: E501
@@ -488,26 +519,22 @@ class Qwen2MoeModel(nn.Module):
                         else:
                             name = remapped_kv_scale_name
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
-
     fall_back_to_pt_during_load = False
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -516,41 +543,52 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Qwen2MoeModel(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        # Only perform the following mapping when Qwen2MoeMLP exists
+        if (
+            getattr(config, "mlp_only_layers", [])
+            or config.shared_expert_intermediate_size > 0
+        ):
+            self.packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
+
+        self.model = Qwen2MoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 2bd9d2b52628a587245cfe9ea469d36d864d642a..eac46e0f8b05561798efe172d56e883e4bfc7eba 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -6,15 +6,14 @@
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 """Inference-only Qwen2-RM model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
 
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.sequence import IntermediateTensors
 
@@ -25,7 +24,6 @@ from .utils import AutoWeightsLoader, maybe_prefix
 
 
 class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
-
     is_pooling_model = True
     pooler: Pooler
 
@@ -45,58 +43,60 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = Qwen2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.head_dtype = vllm_config.model_config.head_dtype
 
         self.score = nn.Sequential(
-            ColumnParallelLinear(config.hidden_size,
-                                 config.hidden_size,
-                                 quant_config=quant_config,
-                                 params_dtype=self.head_dtype,
-                                 return_bias=False),
+            ColumnParallelLinear(
+                config.hidden_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                params_dtype=self.head_dtype,
+                return_bias=False,
+            ),
             nn.ReLU(),
-            RowParallelLinear(config.hidden_size,
-                              config.num_labels,
-                              params_dtype=self.head_dtype,
-                              quant_config=quant_config,
-                              return_bias=False),
+            RowParallelLinear(
+                config.hidden_size,
+                config.num_labels,
+                params_dtype=self.head_dtype,
+                quant_config=quant_config,
+                return_bias=False,
+            ),
         )
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         hidden_states = hidden_states.to(self.head_dtype)
         logits = self.score(hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self,
-                                   ignore_unexpected_prefixes=["lm_head."])
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."])
         return loader.load_weights(weights)
 
 
 @default_pooling_type("ALL")
 class Qwen2ForRewardModel(Qwen2RewardBaseModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         vllm_config.model_config.hf_config.num_labels = 1
         super().__init__(vllm_config=vllm_config, prefix=prefix)
@@ -105,12 +105,12 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel):
         assert pooler_config is not None
 
         self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)}, )
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
+        )
 
 
 @default_pooling_type("STEP")
 class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         vllm_config.model_config.hf_config.num_labels = 2
         super().__init__(vllm_config=vllm_config, prefix=prefix)
@@ -119,4 +119,5 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
         assert pooler_config is not None
 
         self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)})
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
+        )
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 5fbf8935405569ce6d13e765444b2d2ef1da1160..5d21e249fc4ccfc88d7ad22875cef682df68235c 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -24,57 +24,91 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
-from collections.abc import Iterable, Mapping, Sequence
+
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import AutoConfig, BatchFeature, PretrainedConfig
-from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
-                                          Qwen2VLProcessor)
+from transformers import BatchFeature
+from transformers.models.qwen2_vl import Qwen2VLImageProcessor, Qwen2VLProcessor
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
-    Qwen2VLConfig, Qwen2VLVisionConfig)
+    Qwen2VLConfig,
+    Qwen2VLVisionConfig,
+)
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
-from transformers.models.qwen2_vl.video_processing_qwen2_vl import (
-    Qwen2VLVideoProcessor)
+from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
 
-from vllm.attention.layer import check_upstream_fa_availability
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import (
+    check_upstream_fa_availability,
+    maybe_get_vit_flash_attn_backend,
+)
 from vllm.config import VllmConfig
-from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.conv import Conv3dLayer
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding.common import (
+    dispatch_rotary_emb_function,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, VideoItem)
-from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
-                                   ModalityDataItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.inputs import (
+    ImageItem,
+    ModalityData,
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import (
+    DictEmbeddingItems,
+    ImageSize,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
-from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from .vision import (
+    get_vit_attn_backend,
+    run_dp_sharded_mrope_vision_model,
+)
 
 logger = init_logger(__name__)
 
@@ -91,13 +125,14 @@ class Qwen2VLImagePixelInputs(TensorSchema):
               the batch
         - ni: Number of images
         - cps: Number of channels * patch_size * patch_size
-    
+
     Historical context:
-        - pixel_values shape: (num_patches, num_channels * patch_size * 
+        - pixel_values shape: (num_patches, num_channels * patch_size *
           patch_size)
         - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
           format
     """
+
     type: Literal["pixel_values"]
 
     pixel_values: Annotated[
@@ -117,7 +152,7 @@ class Qwen2VLImageEmbeddingInputs(TensorSchema):
         - nf: Number of image features
         - hs: Hidden size
         - ni: Number of images
-    
+
     Historical context:
         - image_embeds shape: (num_image_features, hidden_size)
         - num_image_features varies based on the number and resolution of the
@@ -126,6 +161,7 @@ class Qwen2VLImageEmbeddingInputs(TensorSchema):
         - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
           format
     """
+
     type: Literal["image_embeds"]
 
     image_embeds: Annotated[
@@ -139,8 +175,7 @@ class Qwen2VLImageEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2VLImageInputs = Union[Qwen2VLImagePixelInputs,
-                           Qwen2VLImageEmbeddingInputs]
+Qwen2VLImageInputs: TypeAlias = Qwen2VLImagePixelInputs | Qwen2VLImageEmbeddingInputs
 
 
 class Qwen2VLVideoPixelInputs(TensorSchema):
@@ -148,16 +183,17 @@ class Qwen2VLVideoPixelInputs(TensorSchema):
     Dimensions:
         - np: The total number of patches over each video over each prompt in
               the batch
-        - ctps: Number of channels * temporal_patch_size * patch_size * 
+        - ctps: Number of channels * temporal_patch_size * patch_size *
           patch_size
         - nv: Number of videos
-    
+
     Historical context:
-        - pixel_values_videos shape: (num_patches, num_channels * 
+        - pixel_values_videos shape: (num_patches, num_channels *
           temporal_patch_size * patch_size * patch_size)
         - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
           format
     """
+
     type: Literal["pixel_values_videos"]
 
     pixel_values_videos: Annotated[
@@ -177,7 +213,7 @@ class Qwen2VLVideoEmbeddingInputs(TensorSchema):
         - nf: Number of video features
         - hs: Hidden size
         - nv: Number of videos
-    
+
     Historical context:
         - video_embeds shape: (num_video_features, hidden_size)
         - num_video_features varies based on the number and resolution of the
@@ -186,6 +222,7 @@ class Qwen2VLVideoEmbeddingInputs(TensorSchema):
         - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
           format
     """
+
     type: Literal["video_embeds"]
 
     video_embeds: Annotated[
@@ -199,35 +236,37 @@ class Qwen2VLVideoEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs,
-                           Qwen2VLVideoEmbeddingInputs]
+Qwen2VLVideoInputs: TypeAlias = Qwen2VLVideoPixelInputs | Qwen2VLVideoEmbeddingInputs
 
 # === Vision Encoder === #
 
 
 class Qwen2VisionMLP(nn.Module):
-
     def __init__(
         self,
         in_features: int,
         hidden_features: int,
         act_layer: type[nn.Module] = QuickGELU,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
         super().__init__()
-        self.fc1 = ColumnParallelLinear(in_features,
-                                        hidden_features,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1",
-                                        disable_tp=use_data_parallel)
+        self.fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
+        )
         self.act = act_layer()
-        self.fc2 = RowParallelLinear(hidden_features,
-                                     in_features,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2",
-                                     disable_tp=use_data_parallel)
+        self.fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_parallel, _ = self.fc1(x)
@@ -242,15 +281,14 @@ def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
         return torch.cat((-x2, x1), dim=-1)
     else:
         x1, x2 = x[..., ::2], x[..., 1::2]
-        return rearrange(torch.stack((-x2, x1), dim=-1),
-                         "... d two -> ... (d two)",
-                         two=2)
+        return rearrange(
+            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+        )
 
 
-def apply_rotary_emb_torch(x: torch.Tensor,
-                           cos: torch.Tensor,
-                           sin: torch.Tensor,
-                           interleaved: bool = False) -> torch.Tensor:
+def apply_rotary_emb_torch(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+) -> torch.Tensor:
     """
     x: (batch_size, seqlen, nheads, headdim)
     cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
@@ -258,117 +296,126 @@ def apply_rotary_emb_torch(x: torch.Tensor,
     ro_dim = cos.shape[-1] * 2
     assert ro_dim <= x.shape[-1]
     cos = repeat(
-        cos,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     sin = repeat(
-        sin,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     return torch.cat(
         [
-            x[..., :ro_dim] * cos +
-            rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
         ],
         dim=-1,
     )
 
 
-def apply_rotary_pos_emb_vision(t: torch.Tensor,
-                                freqs: torch.Tensor) -> torch.Tensor:
+def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    rotary_emb_function = dispatch_rotary_emb_function(default=apply_rotary_emb_torch)
     t_ = t.float()
     cos = freqs.cos()
     sin = freqs.sin()
-    apply_rotary_emb = apply_rotary_emb_torch
-    if current_platform.is_cuda():
-        from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
-    output = apply_rotary_emb(t_, cos, sin).type_as(t)
+    output = rotary_emb_function(t_, cos, sin).type_as(t)
     return output
 
 
 class Qwen2VisionAttention(nn.Module):
-
     def __init__(
         self,
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         # Per attention head and per partition values.
-        self.tp_size = (1 if use_data_parallel else
-                        parallel_state.get_tensor_model_parallel_world_size())
+        self.tp_size = (
+            1
+            if use_data_parallel
+            else parallel_state.get_tensor_model_parallel_world_size()
+        )
         self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
         self.hidden_size_per_attention_head = dist_utils.divide(
-            projection_size, num_heads)
+            projection_size, num_heads
+        )
         self.num_attention_heads_per_partition = dist_utils.divide(
-            num_heads, self.tp_size)
-
-        self.qkv = ColumnParallelLinear(input_size=embed_dim,
-                                        output_size=3 * projection_size,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.qkv",
-                                        disable_tp=use_data_parallel)
-        self.proj = RowParallelLinear(input_size=projection_size,
-                                      output_size=embed_dim,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.proj",
-                                      disable_tp=use_data_parallel)
+            num_heads, self.tp_size
+        )
+
+        self.qkv = ColumnParallelLinear(
+            input_size=embed_dim,
+            output_size=3 * projection_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+            disable_tp=use_data_parallel,
+        )
+        self.proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
+        )
 
         # Detect attention implementation.
         self.attn_backend = get_vit_attn_backend(
             head_size=self.hidden_size_per_attention_head,
-            dtype=torch.get_default_dtype())
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
         self.use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(
-                torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            self.use_upstream_fa = True
+
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
 
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
-                _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
             raise RuntimeError(
-                f"Qwen2-VL does not support {self.attn_backend} backend now.")
+                f"Qwen2-VL does not support {self.attn_backend} backend now."
+            )
+
         self.is_flash_attn_backend = self.attn_backend in {
-            _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }
 
     def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
         # [s, b, 3 * head * head_dim]
         seq_len, bs, _ = qkv.shape
-        if self.tp_size > 1:
-            qkv = tensor_model_parallel_all_gather(qkv)
 
         # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
         q, k, v = qkv.chunk(3, dim=2)
 
-        # 3 * [s, b, head * head_dim]
-        if self.tp_size > 1:
-            splitter = partial(dist_utils.split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
-            q = splitter(q)[self.tp_rank]
-            k = splitter(k)[self.tp_rank]
-            v = splitter(v)[self.tp_rank]
-
         # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
-        new_shape = (seq_len, bs, self.num_attention_heads_per_partition,
-                     self.hidden_size_per_attention_head)
+        new_shape = (
+            seq_len,
+            bs,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
         q, k, v = (x.view(*new_shape) for x in (q, k, v))
         return q, k, v
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
-
         # [s, b, c] --> [s, b, 3 * head * head_dim]
         x, _ = self.qkv(x)
 
@@ -376,8 +423,7 @@ class Qwen2VisionAttention(nn.Module):
         q, k, v = self.split_qkv(x)
         batch_size = q.shape[1]
 
-        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
-                   for x in (q, k, v))
+        q, k, v = (rearrange(x, "s b ... -> b s ...") for x in (q, k, v))
         if rotary_pos_emb is not None:
             # [2 * b, s, heads, head_dim]
             qk_concat = torch.cat([q, k], dim=0)
@@ -385,31 +431,31 @@ class Qwen2VisionAttention(nn.Module):
             q, k = torch.chunk(qk_rotated, 2, dim=0)
 
         if self.is_flash_attn_backend:
-            if self.attn_backend == _Backend.ROCM_AITER_FA:
-                from aiter import flash_attn_varlen_func
-            else:
-                if self.use_upstream_fa:
-                    from flash_attn import flash_attn_varlen_func
-                else:
-                    from vllm.vllm_flash_attn import flash_attn_varlen_func
-
             q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
 
-            output = flash_attn_varlen_func(q,
-                                            k,
-                                            v,
-                                            cu_seqlens_q=cu_seqlens,
-                                            cu_seqlens_k=cu_seqlens,
-                                            max_seqlen_q=max_seqlen,
-                                            max_seqlen_k=max_seqlen,
-                                            dropout_p=0.0,
-                                            causal=False)
-
-            context_layer = rearrange(output,
-                                      "(b s) h d -> s b (h d)",
-                                      b=batch_size).contiguous()
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+            output = self.flash_attn_varlen_func(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+                dropout_p=0.0,
+                causal=False,
+            )
+
+            context_layer = rearrange(
+                output, "(b s) h d -> s b (h d)", b=batch_size
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
+            from vllm.platforms import current_platform
+
+            if current_platform.is_rocm():
+                q = q.contiguous()
+                k = k.contiguous()
+                v = v.contiguous()
             outputs = []
             for i in range(1, len(cu_seqlens)):
                 start_idx = cu_seqlens[i - 1]
@@ -417,46 +463,47 @@ class Qwen2VisionAttention(nn.Module):
                 q_i = q[:, start_idx:end_idx]
                 k_i = k[:, start_idx:end_idx]
                 v_i = v[:, start_idx:end_idx]
-                q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
-                                 for x in [q_i, k_i, v_i])
-                output_i = F.scaled_dot_product_attention(q_i,
-                                                          k_i,
-                                                          v_i,
-                                                          dropout_p=0.0)
+                q_i, k_i, v_i = (
+                    rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
+                )
+                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
                 output_i = rearrange(output_i, "b h s d -> b s h d ")
                 outputs.append(output_i)
             context_layer = torch.cat(outputs, dim=1)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
-        elif self.attn_backend == _Backend.XFORMERS:
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             from xformers import ops as xops
             from xformers.ops.fmha.attn_bias import BlockDiagonalMask
 
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
+            attn_bias = BlockDiagonalMask.from_seqlens(
+                q_seqlen=seqlens, kv_seqlen=None, device=q.device
+            )
 
             context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
+                q, k, v, attn_bias=attn_bias, p=0, scale=None
+            )
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
 
         output, _ = self.proj(context_layer)
         return output
 
 
 class Qwen2VisionBlock(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int,
         mlp_ratio: float,
         act_layer: type[nn.Module] = QuickGELU,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         if norm_layer is None:
@@ -465,26 +512,31 @@ class Qwen2VisionBlock(nn.Module):
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
 
-        self.attn = Qwen2VisionAttention(embed_dim=dim,
-                                         num_heads=num_heads,
-                                         projection_size=dim,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.attn",
-                                         use_data_parallel=use_data_parallel)
-        self.mlp = Qwen2VisionMLP(dim,
-                                  mlp_hidden_dim,
-                                  act_layer=act_layer,
-                                  quant_config=quant_config,
-                                  prefix=f"{prefix}.mlp",
-                                  use_data_parallel=use_data_parallel)
+        self.attn = Qwen2VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
+        )
+        self.mlp = Qwen2VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x = x + self.attn(
             self.norm1(x),
@@ -499,7 +551,6 @@ class Qwen2VisionBlock(nn.Module):
 
 
 class Qwen2VisionPatchEmbed(nn.Module):
-
     def __init__(
         self,
         patch_size: int = 14,
@@ -513,29 +564,29 @@ class Qwen2VisionPatchEmbed(nn.Module):
         self.embed_dim = embed_dim
 
         kernel_size = (temporal_patch_size, patch_size, patch_size)
-        self.proj = nn.Conv3d(in_channels,
-                              embed_dim,
-                              kernel_size=kernel_size,
-                              stride=kernel_size,
-                              bias=False)
+        self.proj = Conv3dLayer(
+            in_channels,
+            embed_dim,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=False,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         L, C = x.shape
-        x = x.view(L, -1, self.temporal_patch_size, self.patch_size,
-                   self.patch_size)
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
         x = self.proj(x).view(L, self.embed_dim)
         return x
 
 
 class Qwen2VisionPatchMerger(nn.Module):
-
     def __init__(
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -544,21 +595,27 @@ class Qwen2VisionPatchMerger(nn.Module):
         if norm_layer is None:
             norm_layer = partial(nn.LayerNorm, eps=1e-6)
         self.ln_q = norm_layer(context_dim)
-        self.mlp = nn.ModuleList([
-            ColumnParallelLinear(self.hidden_size,
-                                 self.hidden_size,
-                                 bias=True,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.mlp.0",
-                                 disable_tp=use_data_parallel),
-            nn.GELU(),
-            RowParallelLinear(self.hidden_size,
-                              d_model,
-                              bias=True,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.mlp.2",
-                              disable_tp=use_data_parallel),
-        ])
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.mlp.0",
+                    disable_tp=use_data_parallel,
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    d_model,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.mlp.2",
+                    disable_tp=use_data_parallel,
+                ),
+            ]
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.ln_q(x)
@@ -572,13 +629,11 @@ class Qwen2VisionPatchMerger(nn.Module):
 
 
 class Qwen2VisionRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
         self.dim = dim
         self.theta = theta
-        inv_freq = 1.0 / (theta
-                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self._seq_len_cached = 0
         self._freqs_cached = None
@@ -587,12 +642,18 @@ class Qwen2VisionRotaryEmbedding(nn.Module):
         if seqlen > self._seq_len_cached:
             seqlen *= 2
             self._seq_len_cached = seqlen
-            self.inv_freq = 1.0 / (self.theta**(torch.arange(
-                0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device)
-                                                / self.dim))
-            seq = torch.arange(seqlen,
-                               device=self.inv_freq.device,
-                               dtype=self.inv_freq.dtype)
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
             freqs = torch.outer(seq, self.inv_freq)
             self._freqs_cached = freqs
 
@@ -602,14 +663,14 @@ class Qwen2VisionRotaryEmbedding(nn.Module):
 
 
 class Qwen2VisionTransformer(nn.Module):
-
     def __init__(
         self,
         vision_config: Qwen2VLVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
 
@@ -641,16 +702,21 @@ class Qwen2VisionTransformer(nn.Module):
         head_dim = embed_dim // num_heads
         self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
 
-        self.blocks = nn.ModuleList([
-            Qwen2VisionBlock(dim=embed_dim,
-                             num_heads=num_heads,
-                             mlp_ratio=mlp_ratio,
-                             norm_layer=norm_layer,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.blocks.{layer_idx}",
-                             use_data_parallel=use_data_parallel)
-            for layer_idx in range(depth)
-        ])
+        self.blocks = nn.ModuleList(
+            [
+                Qwen2VisionBlock(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                    attn_backend_override=attn_backend_override,
+                )
+                for layer_idx in range(depth)
+            ]
+        )
         self.merger = Qwen2VisionPatchMerger(
             d_model=hidden_size,
             context_dim=embed_dim,
@@ -660,11 +726,15 @@ class Qwen2VisionTransformer(nn.Module):
             use_data_parallel=use_data_parallel,
         )
         self.attn_backend = get_vit_attn_backend(
-            head_size=head_dim, dtype=torch.get_default_dtype())
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(
-                torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        if (
+            self.attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
 
     @property
     def dtype(self) -> torch.dtype:
@@ -680,20 +750,27 @@ class Qwen2VisionTransformer(nn.Module):
         for t, h, w in grid_thw:
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-            hpos_ids = hpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            ).permute(0, 2, 1, 3).flatten()
-            wpos_ids = wpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            ).permute(0, 2, 1, 3).flatten()
-            pos_ids.append(
-                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
             max_grid_size = max(max_grid_size, h, w)
         pos_ids = torch.cat(pos_ids, dim=0)
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
@@ -701,34 +778,42 @@ class Qwen2VisionTransformer(nn.Module):
         return rotary_pos_emb
 
     def compute_attn_mask_seqlen(
-            self, cu_seqlens: torch.Tensor
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+        self, cu_seqlens: torch.Tensor
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
-        if (self.attn_backend == _Backend.FLASH_ATTN
-                or self.attn_backend == _Backend.ROCM_AITER_FA):
+        if self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }:
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
     def forward(
         self,
         x: torch.Tensor,
-        grid_thw: list[list[int]],
+        grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
         # patchify
         x = x.to(device=self.device, dtype=self.dtype)
         x = self.patch_embed(x)
 
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
+
         # compute position embedding
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw_list)
 
         # compute cu_seqlens
-        grid_thw_ = torch.tensor(grid_thw)
-        cu_seqlens = torch.repeat_interleave(grid_thw_[:, 1] * grid_thw_[:, 2],
-                                             grid_thw_[:, 0]).cumsum(
-                                                 dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
         # transformers
         x = x.unsqueeze(1)
@@ -749,8 +834,7 @@ class Qwen2VisionTransformer(nn.Module):
 
         return x
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -761,7 +845,7 @@ class Qwen2VisionTransformer(nn.Module):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -772,41 +856,45 @@ class Qwen2VisionTransformer(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 def _create_qwen2vl_field_factory(
-    spatial_merge_size: int
+    spatial_merge_size: int,
 ) -> Callable[
     [Mapping[str, torch.Tensor]],
-        Mapping[str, MultiModalFieldConfig],
+    Mapping[str, MultiModalFieldConfig],
 ]:
-
     def _qwen2vl_field_config(hf_inputs: Mapping[str, torch.Tensor]):
         image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
         image_pixel_grid_sizes = image_grid_thw.prod(-1)
-        image_embed_grid_sizes = (image_pixel_grid_sizes //
-                                  spatial_merge_size // spatial_merge_size)
+        image_embed_grid_sizes = (
+            image_pixel_grid_sizes // spatial_merge_size // spatial_merge_size
+        )
 
         video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
         video_grid_sizes = video_grid_thw.prod(-1)
-        video_embed_grid_sizes = (video_grid_sizes // spatial_merge_size //
-                                  spatial_merge_size)
+        video_embed_grid_sizes = (
+            video_grid_sizes // spatial_merge_size // spatial_merge_size
+        )
 
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_pixel_grid_sizes),
+                "image", image_pixel_grid_sizes
+            ),
             image_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_embed_grid_sizes),
+                "image", image_embed_grid_sizes
+            ),
             image_grid_thw=MultiModalFieldConfig.batched("image"),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes),
+                "video", video_grid_sizes
+            ),
             video_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_embed_grid_sizes),
+                "video", video_embed_grid_sizes
+            ),
             video_grid_thw=MultiModalFieldConfig.batched("video"),
         )
 
@@ -814,44 +902,40 @@ def _create_qwen2vl_field_factory(
 
 
 class Qwen2VLMultiModalDataParser(MultiModalDataParser):
-
     def __init__(self, spatial_merge_size: int, *args, **kwargs):
         self._spatial_merge_size = spatial_merge_size
         super().__init__(*args, **kwargs)
 
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
                 modality="image",
                 required_fields={"image_embeds", "image_grid_thw"},
-                fields_factory=_create_qwen2vl_field_factory(
-                    self._spatial_merge_size),
+                fields_factory=_create_qwen2vl_field_factory(self._spatial_merge_size),
             )
 
         return super()._parse_image_data(data)
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
                 modality="video",
                 required_fields={"video_embeds", "video_grid_thw"},
-                fields_factory=_create_qwen2vl_field_factory(
-                    self._spatial_merge_size),
+                fields_factory=_create_qwen2vl_field_factory(self._spatial_merge_size),
             )
 
         return super()._parse_video_data(data)
 
 
 class Qwen2VLProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2VLConfig)
 
@@ -865,7 +949,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_mm_max_tokens_per_item(
@@ -884,7 +968,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: Optional[Qwen2VLImageProcessor],
+        image_processor: Qwen2VLImageProcessor | None,
     ) -> tuple[ImageSize, int]:
         if image_processor is None:
             image_processor = self.get_image_processor()
@@ -903,11 +987,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
                 min_pixels=image_processor.min_pixels,
                 max_pixels=image_processor.max_pixels,
             )
-            preprocessed_size = ImageSize(width=resized_width,
-                                          height=resized_height)
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
-            preprocessed_size = ImageSize(width=image_width,
-                                          height=image_height)
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
 
         # NOTE: Frames are padded to be divisible by `temporal_patch_size`
         # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294
@@ -927,7 +1009,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: Optional[Qwen2VLImageProcessor],
+        image_processor: Qwen2VLImageProcessor | None,
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
@@ -943,7 +1025,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Optional[Qwen2VLImageProcessor],
+        image_processor: Qwen2VLImageProcessor | None,
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
@@ -971,9 +1053,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
             image_processor=None,
         )
 
-    def _get_max_video_frames(self,
-                              max_tokens: int,
-                              start_num_frames: int = 1) -> int:
+    def _get_max_video_frames(self, max_tokens: int, start_num_frames: int = 1) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = start_num_frames
@@ -1003,8 +1083,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         max_videos = mm_counts.get("video", 0)
 
         max_total_frames = self._get_max_video_frames(seq_len)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   max_frames_per_video)
+        max_frames_per_video = min(
+            max_total_frames // max(max_videos, 1), max_frames_per_video
+        )
 
         return max(max_frames_per_video, 1)
 
@@ -1018,14 +1099,12 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_num_frames_with_most_features(
-                seq_len, mm_counts),
+            num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
             image_processor=None,
         )
 
 
 class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1040,36 +1119,41 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        target_num_frames = \
-            self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(
+            seq_len, mm_counts
+        )
+
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
-            )
+                overrides=video_overrides,
+            ),
         }
 
 
-class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
-                                 ):
-
+class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         return Qwen2VLMultiModalDataParser(
-            self.info.get_hf_config().vision_config.spatial_merge_size)
+            self.info.get_hf_config().vision_config.spatial_merge_size
+        )
 
     def _get_prompt_updates(
         self,
@@ -1078,8 +1162,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()
 
@@ -1102,9 +1185,9 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
             PromptReplacement(
                 modality=modality,
                 target=[placeholder[modality]],
-                replacement=partial(get_replacement_qwen2vl,
-                                    modality=modality),
-            ) for modality in ("image", "video")
+                replacement=partial(get_replacement_qwen2vl, modality=modality),
+            )
+            for modality in ("image", "video")
         ]
 
     def _get_mm_fields_config(
@@ -1113,15 +1196,20 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         return _create_qwen2vl_field_factory(
-            self.info.get_hf_config().vision_config.spatial_merge_size)(
-                hf_inputs)
+            self.info.get_hf_config().vision_config.spatial_merge_size
+        )(hf_inputs)
 
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor,
-                                        info=Qwen2VLProcessingInfo,
-                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
-class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsLoRA, SupportsPP, SupportsMRoPE):
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen2VLMultiModalProcessor,
+    info=Qwen2VLProcessingInfo,
+    dummy_inputs=Qwen2VLDummyInputsBuilder,
+)
+class Qwen2VLForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    merge_by_field_config = True
+    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
@@ -1132,40 +1220,35 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             # mapping for original checkpoint
             "lm_head.": "language_model.lm_head.",
             "model.": "language_model.model.",
-        })
+        }
+    )
 
     supports_encoder_tp_data = True
 
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]] = None,
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get M-RoPE input positions for Qwen2-VL model."""
-        if image_grid_thw is None:
-            image_grid_thw = []
-        if video_grid_thw is None:
-            video_grid_thw = []
-        if second_per_grid_ts is None:
-            second_per_grid_ts = []
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(hf_config.vision_config,
-                                    "tokens_per_second", 1.0)
+        tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)
 
         input_tokens_tensor = torch.tensor(input_tokens)
         vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id).squeeze(1)
+            input_tokens_tensor == vision_start_token_id
+        ).squeeze(1)
         vision_tokens = input_tokens_tensor[vision_start_indices + 1]
         image_nums = (vision_tokens == image_token_id).sum()
         video_nums = (vision_tokens == video_token_id).sum()
@@ -1192,20 +1275,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             else:
                 ed_video = len(input_tokens) + 1
             if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
+                t, h, w = image_grid_thw[image_index]
                 image_index += 1
                 remain_images -= 1
                 ed = ed_image
             else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
+                t, h, w = video_grid_thw[video_index]
                 video_second_per_grid_t = 1.0
                 if second_per_grid_ts:
                     video_second_per_grid_t = second_per_grid_ts[video_index]
@@ -1213,43 +1288,61 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                 remain_videos -= 1
                 ed = ed_video
 
-            llm_grid_t, llm_grid_h, llm_grid_w = \
-                t, h // spatial_merge_size, w // spatial_merge_size
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
             text_len = ed - st
 
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
             llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
 
-            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
-                -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
-                       tokens_per_second).long().flatten()
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                    * video_second_per_grid_t
+                    * tokens_per_second
+                )
+                .long()
+                .flatten()
+            )
 
-            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
-                llm_grid_t, -1, llm_grid_w).flatten()
-            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
-                llm_grid_t, llm_grid_h, -1).flatten()
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
             llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
             st = ed + llm_grid_t * llm_grid_h * llm_grid_w
 
         if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
-                llm_pos_ids_list) > 0 else 0
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
             text_len = len(input_tokens) - st
             llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 -
-                                len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
 
         return llm_positions, mrope_position_delta
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1267,14 +1360,21 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.config = config
         self.multimodal_config = multimodal_config
 
-        if multimodal_config.get_limit_per_prompt("image") or \
-            multimodal_config.get_limit_per_prompt("video"):
+        if multimodal_config.get_limit_per_prompt(
+            "image"
+        ) or multimodal_config.get_limit_per_prompt("video"):
+            attn_backend_override = (
+                multimodal_config.mm_encoder_attn_backend
+                if multimodal_config is not None
+                else None
+            )
             self.visual = Qwen2VisionTransformer(
                 config.vision_config,
                 norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "visual"),
                 use_data_parallel=self.use_data_parallel,
+                attn_backend_override=attn_backend_override,
             )
         else:
             self.visual = None
@@ -1286,26 +1386,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
-
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return mm_input.reshape(-1, mm_input.shape[-1])
-        else:
-            return torch.concat(mm_input)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Qwen2VLImageInputs]:
+        self, **kwargs: object
+    ) -> Qwen2VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1314,27 +1400,22 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            return Qwen2VLImagePixelInputs(type="pixel_values",
-                                           pixel_values=pixel_values,
-                                           image_grid_thw=image_grid_thw)
+            return Qwen2VLImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            return Qwen2VLImageEmbeddingInputs(type="image_embeds",
-                                               image_embeds=image_embeds,
-                                               image_grid_thw=image_grid_thw)
+            return Qwen2VLImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw,
+            )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]:
+        self, **kwargs: object
+    ) -> Qwen2VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1343,11 +1424,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos, "video pixel values")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Qwen2VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -1355,21 +1431,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
-            return Qwen2VLVideoEmbeddingInputs(type="video_embeds",
-                                               video_embeds=video_embeds,
-                                               video_grid_thw=video_grid_thw)
+            return Qwen2VLVideoEmbeddingInputs(
+                type="video_embeds",
+                video_embeds=video_embeds,
+                video_grid_thw=video_grid_thw,
+            )
 
     def _process_image_input(
-            self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]:
-
+        self, image_input: Qwen2VLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"]
@@ -1377,46 +1449,38 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             pixel_values = image_input["pixel_values"]
 
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
+                )
             else:
-                image_embeds = self.visual(pixel_values,
-                                           grid_thw=grid_thw_list)
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _process_video_input(
-            self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]:
-
+        self, video_input: Qwen2VLVideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"]
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values_videos,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
+                grid_thw_list = grid_thw.tolist()
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
+                )
             else:
-                video_embeds = self.visual(pixel_values_videos,
-                                           grid_thw=grid_thw_list)
+                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -1425,23 +1489,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key in ("pixel_values",
-                             "image_embeds") and "images" not in modalities:
-                modalities["images"] = self._parse_and_validate_image_input(
-                    **kwargs)
-            if input_key in ("pixel_values_videos",
-                             "video_embeds") and "videos" not in modalities:
-                modalities["videos"] = self._parse_and_validate_video_input(
-                    **kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "images" not in modalities
+            ):
+                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "videos" not in modalities
+            ):
+                modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
 
         return modalities
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
-
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
@@ -1455,62 +1519,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [self.config.image_token_id, self.config.video_token_id])
-        return inputs_embeds
-
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Qwen2VLImagePixelInputs] = None,
-        video_input: Optional[Qwen2VLVideoPixelInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Qwen2-VL.
 
         Args:
@@ -1528,26 +1553,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-                input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
@@ -1559,12 +1564,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.visual is None:
             skip_prefixes.extend(["visual."])
@@ -1587,17 +1590,16 @@ class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
 
 
 class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
-
     def __init__(
         self,
-        size: Optional[dict[str, int]] = None,
+        size: dict[str, int] | None = None,
         **kwargs,
     ) -> None:
         if size is not None and "min_pixels" in size and "max_pixels" in size:
             # Remap if Tarsier2-specific format is provided
             remapped_size = {
                 "shortest_edge": size["min_pixels"],
-                "longest_edge": size["max_pixels"]
+                "longest_edge": size["max_pixels"],
             }
             super().__init__(size=remapped_size, **kwargs)
         else:
@@ -1605,7 +1607,6 @@ class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
 
 
 class Tarsier2Processor(Qwen2VLProcessor):
-
     def __init__(
         self,
         vision_config: dict,
@@ -1618,16 +1619,14 @@ class Tarsier2Processor(Qwen2VLProcessor):
             tokenizer=tokenizer,
             video_processor=Qwen2VLVideoProcessor(**vision_config),
             chat_template=None,
-            **kwargs)
+            **kwargs,
+        )
 
 
 class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
-
     def get_hf_config(self) -> Qwen2VLConfig:
         model_path = self.ctx.model_config.model
-        original_config = AutoConfig.from_pretrained(model_path)
-        config_dict = original_config.to_dict()
-        correct_config = Qwen2VLConfig.from_dict(config_dict)
+        correct_config = Qwen2VLConfig.from_pretrained(model_path)
 
         return correct_config
 
@@ -1639,17 +1638,20 @@ class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
         )
 
     def get_image_processor(self) -> Tarsier2ImageProcessor:
-        return Tarsier2ImageProcessor(
-            **self.ctx.get_hf_image_processor_config())
+        return Tarsier2ImageProcessor(**self.ctx.get_hf_image_processor_config())
 
 
-@MULTIMODAL_REGISTRY.register_processor(Tarsier2MultiModalProcessor,
-                                        info=Tarsier2ProcessingInfo,
-                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    Tarsier2MultiModalProcessor,
+    info=Tarsier2ProcessingInfo,
+    dummy_inputs=Qwen2VLDummyInputsBuilder,
+)
 class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "vision_tower.": "visual.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "vision_tower.": "visual.",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # Tarsier2 uses llava as model_type, which will create a Qwen2VLConfig
@@ -1660,9 +1662,7 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
         vllm_config.model_config.hf_config = qwen2vl_config
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.visual is None:
             skip_prefixes.extend(["visual."])
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index ae72fd30c399351aa5f98885d219180c104b903a..8d7f22a33fe6ce9ba6004f54a9d5eb19c67f4401 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -22,8 +22,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen3 model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -35,8 +36,7 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -46,30 +46,28 @@ from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .qwen2 import Qwen2MLP as Qwen3MLP
 from .qwen2 import Qwen2Model
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    maybe_prefix)
+from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix
 
 logger = init_logger(__name__)
 
 
 class Qwen3Attention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         max_position: int = 4096 * 32,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rms_norm_eps: float = 1e-06,
         qkv_bias: bool = False,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -131,7 +129,9 @@ class Qwen3Attention(nn.Module):
             **{
                 "layer_idx": extract_layer_index(prefix),
                 "dual_chunk_attention_config": dual_chunk_attention_config,
-            } if dual_chunk_attention_config else {},
+            }
+            if dual_chunk_attention_config
+            else {},
         )
         self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
         self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
@@ -144,12 +144,10 @@ class Qwen3Attention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         # Add qk-norm
-        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
-                           self.head_dim)
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
         q_by_head = self.q_norm(q_by_head)
         q = q_by_head.view(q.shape)
-        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
-                           self.head_dim)
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
         k_by_head = self.k_norm(k_by_head)
         k = k_by_head.view(k.shape)
         q, k = self.rotary_emb(positions, q, k)
@@ -159,12 +157,11 @@ class Qwen3Attention(nn.Module):
 
 
 class Qwen3DecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Qwen3Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -172,9 +169,9 @@ class Qwen3DecoderLayer(nn.Module):
         # Requires transformers > 4.32.0
         rope_theta = getattr(config, "rope_theta", 1000000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        dual_chunk_attention_config = getattr(config,
-                                              "dual_chunk_attention_config",
-                                              None)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
 
         # By default, Qwen3 uses causal attention as it is a decoder-only model.
         # You can override the HF config with `is_causal=False` to enable
@@ -192,8 +189,8 @@ class Qwen3DecoderLayer(nn.Module):
             num_kv_heads=config.num_key_value_heads,
             rope_theta=rope_theta,
             rms_norm_eps=config.rms_norm_eps,
-            qkv_bias=getattr(config, 'attention_bias', False),
-            head_dim=getattr(config, 'head_dim', None),
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
             cache_config=cache_config,
             quant_config=quant_config,
             rope_scaling=rope_scaling,
@@ -208,32 +205,30 @@ class Qwen3DecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
@@ -251,13 +246,13 @@ ALL_DECODER_LAYER_TYPES = {
         "positions": -1,
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
-    })
+    }
+)
 class Qwen3Model(Qwen2Model):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         decoder_layer_type=Qwen3DecoderLayer)
+        super().__init__(
+            vllm_config=vllm_config, prefix=prefix, decoder_layer_type=Qwen3DecoderLayer
+        )
 
 
 class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
@@ -277,31 +272,32 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = Qwen3Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+        self.model = Qwen3Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
             else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "lm_head"))
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "lm_head"),
+                )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         self.model.aux_hidden_state_layers = layers
@@ -310,32 +306,31 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         num_layers = len(self.model.layers)
         return (2, num_layers // 2, num_layers - 3)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 61f1abad72b69dc96bdc4cc83348443cc644c1dc..8d00df1c67ed4c34e7d8ec5e92a27703d3ab245c 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -22,10 +22,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen3MoE model compatible with HuggingFace weights."""
+
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -33,62 +34,81 @@ from torch import nn
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
-from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import MixtureOfExperts, SupportsEagle3, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class Qwen3MoeMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -99,7 +119,6 @@ class Qwen3MoeMLP(nn.Module):
 
 
 class Qwen3MoeSparseMoeBlock(nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -114,7 +133,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
@@ -123,7 +142,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
+                f"the number of experts {config.num_experts}."
+            )
 
         # Load balancing settings.
         vllm_config = get_current_vllm_config()
@@ -132,36 +152,41 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
 
         self.n_logical_experts = self.n_routed_experts
         self.n_redundant_experts = eplb_config.num_redundant_experts
-        self.n_physical_experts = (self.n_logical_experts +
-                                   self.n_redundant_experts)
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
 
-        self.physical_expert_start = (self.ep_rank *
-                                      self.n_local_physical_experts)
-        self.physical_expert_end = (self.physical_expert_start +
-                                    self.n_local_physical_experts)
-
-        self.experts = FusedMoE(num_experts=self.n_routed_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=True,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts",
-                                enable_eplb=self.enable_eplb,
-                                num_redundant_experts=self.n_redundant_experts,
-                                is_sequence_parallel=self.is_sequence_parallel)
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.gate")
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        self.experts = FusedMoE(
+            num_experts=self.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=True,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            routing_method_type=RoutingMethodType.Renormalize,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        assert hidden_states.dim(
-        ) <= 2, "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs"
+        assert hidden_states.dim() <= 2, (
+            "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs"
+        )
         is_input_1d = hidden_states.dim() == 1
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
@@ -171,8 +196,15 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
 
         if self.is_sequence_parallel:
             final_hidden_states = tensor_model_parallel_all_gather(
@@ -180,27 +212,25 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             final_hidden_states = final_hidden_states[:num_tokens]
 
         # return to 1d if input is 1d
-        return final_hidden_states.squeeze(0) if is_input_1d else \
-            final_hidden_states
+        return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states
 
 
 class Qwen3MoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rms_norm_eps: float = 1e-06,
         qkv_bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -226,19 +256,23 @@ class Qwen3MoeAttention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
         self.dual_chunk_attention_config = dual_chunk_attention_config
 
-        self.qkv_proj = QKVParallelLinear(hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_kv_heads,
-                                          bias=qkv_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
 
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -259,7 +293,9 @@ class Qwen3MoeAttention(nn.Module):
             **{
                 "layer_idx": extract_layer_index(prefix),
                 "dual_chunk_attention_config": dual_chunk_attention_config,
-            } if dual_chunk_attention_config else {},
+            }
+            if dual_chunk_attention_config
+            else {},
         )
 
         self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
@@ -273,13 +309,11 @@ class Qwen3MoeAttention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         # Add qk-norm
-        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
-                           self.head_dim)
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
         q_by_head = self.q_norm(q_by_head)
         q = q_by_head.view(q.shape)
 
-        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
-                           self.head_dim)
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
         k_by_head = self.k_norm(k_by_head)
         k = k_by_head.view(k.shape)
         q, k = self.rotary_emb(positions, q, k)
@@ -289,7 +323,6 @@ class Qwen3MoeAttention(nn.Module):
 
 
 class Qwen3MoeDecoderLayer(nn.Module):
-
     def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
 
@@ -300,11 +333,10 @@ class Qwen3MoeDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        dual_chunk_attention_config = getattr(config,
-                                              "dual_chunk_attention_config",
-                                              None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
         self.self_attn = Qwen3MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -313,8 +345,8 @@ class Qwen3MoeDecoderLayer(nn.Module):
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             rms_norm_eps=config.rms_norm_eps,
-            qkv_bias=getattr(config, 'attention_bias', False),
-            head_dim=getattr(config, 'head_dim', None),
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
@@ -323,52 +355,53 @@ class Qwen3MoeDecoderLayer(nn.Module):
 
         # `mlp_only_layers` in the config.
         layer_idx = extract_layer_index(prefix)
-        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
-                           config.mlp_only_layers)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
         if (layer_idx not in mlp_only_layers) and (
-                config.num_experts > 0 and
-            (layer_idx + 1) % config.decoder_sparse_step == 0):
-            self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                              prefix=f"{prefix}.mlp")
+            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
+        ):
+            self.mlp = Qwen3MoeSparseMoeBlock(
+                vllm_config=vllm_config, prefix=f"{prefix}.mlp"
+            )
         else:
-            self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
-                                   intermediate_size=config.intermediate_size,
-                                   hidden_act=config.hidden_act,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp")
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+            self.mlp = Qwen3MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class Qwen3MoeModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -385,46 +418,63 @@ class Qwen3MoeModel(nn.Module):
             config.vocab_size,
             config.hidden_size,
             quant_config=quant_config,
-            prefix=f"{prefix}.embed_tokens")
+            prefix=f"{prefix}.embed_tokens",
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config,
-                                                prefix=prefix),
+            lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+        # Track layers for auxiliary hidden state outputs (EAGLE3)
+        self.aux_hidden_state_layers: tuple[int, ...] = ()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer in islice(self.layers, self.start_layer, self.end_layer):
+
+        aux_hidden_states = []
+        for layer_idx, layer in enumerate(
+            islice(self.layers, self.start_layer, self.end_layer),
+            start=self.start_layer,
+        ):
+            # Collect auxiliary hidden states if specified
+            if layer_idx in self.aux_hidden_state_layers:
+                aux_hidden_state = (
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+                aux_hidden_states.append(aux_hidden_state)
             hidden_states, residual = layer(positions, hidden_states, residual)
+
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
+
+        # Return auxiliary hidden states if collected
+        if len(aux_hidden_states) > 0:
+            return hidden_states, aux_hidden_states
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
@@ -435,10 +485,10 @@ class Qwen3MoeModel(nn.Module):
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
             num_experts=self.config.num_experts,
-            num_redundant_experts=self.num_redundant_experts)
+            num_redundant_experts=self.num_redundant_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -449,15 +499,24 @@ class Qwen3MoeModel(nn.Module):
         ]
 
         # Skip loading extra parameters for GPTQ/modelopt models.
-        ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale",
-                           ".v_scale", "_v_scale", ".weight_scale",
-                           "_weight_scale", ".input_scale", "_input_scale")
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -487,8 +546,7 @@ class Qwen3MoeModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if weight_loader == default_weight_loader:
                     weight_loader(param, loaded_weight)
                 else:
@@ -513,23 +571,27 @@ class Qwen3MoeModel(nn.Module):
                         continue
 
                     # Skip loading extra parameters for GPTQ/modelopt models.
-                    if name_mapped.endswith(
-                            ignore_suffixes
-                    ) and name_mapped not in params_dict:
+                    if (
+                        name_mapped.endswith(ignore_suffixes)
+                        and name_mapped not in params_dict
+                    ):
                         continue
 
                     param = params_dict[name_mapped]
                     # We should ask the weight loader to return success or not
                     # here since otherwise we may skip experts with other
                     # available replicas.
-                    weight_loader = typing.cast(Callable[..., bool],
-                                                param.weight_loader)
-                    success = weight_loader(param,
-                                            loaded_weight,
-                                            name_mapped,
-                                            shard_id=shard_id,
-                                            expert_id=expert_id,
-                                            return_success=True)
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
                     if success:
                         name = name_mapped
                         break
@@ -541,8 +603,7 @@ class Qwen3MoeModel(nn.Module):
                         continue
 
                     # Skip loading extra parameters for GPTQ/modelopt models.
-                    if name.endswith(
-                            ignore_suffixes) and name not in params_dict:
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -550,7 +611,8 @@ class Qwen3MoeModel(nn.Module):
                     # Remapping the name of FP8 kv-scale.
                     if name.endswith("kv_scale"):
                         remapped_kv_scale_name = name.replace(
-                            ".kv_scale", ".attn.kv_scale")
+                            ".kv_scale", ".attn.kv_scale"
+                        )
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
                                 "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
@@ -561,25 +623,23 @@ class Qwen3MoeModel(nn.Module):
                         else:
                             name = remapped_kv_scale_name
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
-                          MixtureOfExperts):
+class Qwen3MoeForCausalLM(
+    nn.Module, SupportsPP, SupportsLoRA, SupportsEagle3, MixtureOfExperts
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }
 
     fall_back_to_pt_during_load = False
@@ -590,22 +650,29 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Qwen3MoeModel(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
+        # Only perform the following mapping when Qwen3MoeMLP exists
+        if getattr(config, "mlp_only_layers", []):
+            self.packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
+        self.model = Qwen3MoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
         # Set MoE hyperparameters
         self.expert_weights = []
 
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
         example_layer = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -628,22 +695,6 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         self.num_routed_experts = example_layer.n_routed_experts
         self.num_redundant_experts = example_layer.n_redundant_experts
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
@@ -652,8 +703,7 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         assert self.num_local_physical_experts == num_local_physical_experts
         self.num_physical_experts = num_physical_experts
         self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = (num_physical_experts -
-                                      self.num_logical_experts)
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
         for layer in self.model.layers:
             if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
                 moe = layer.mlp
@@ -662,29 +712,36 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
                 moe.n_redundant_experts = self.num_redundant_experts
                 moe.experts.update_expert_map()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 14d19874a51e0f8bdb73767bf3e09f7ad90251f8..61cd9ac244ba0ac443d01895b8bb162c14c4f1d0 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -1,52 +1,71 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Qwen3Next model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
-import torch.nn.functional as F
 from einops import rearrange
 from torch import nn
 from transformers.activations import ACT2FN
 
 from vllm.attention import Attention, AttentionBackend, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
-                         VllmConfig, get_current_vllm_config)
-from vllm.distributed import (divide, get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.config import (
+    CacheConfig,
+    ModelConfig,
+    SpeculativeConfig,
+    VllmConfig,
+    get_current_vllm_config,
+)
+from vllm.distributed import (
+    divide,
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fla.ops import (
-    RMSNormGated, chunk_gated_delta_rule, fused_recurrent_gated_delta_rule)
-from vllm.model_executor.layers.fused_moe import FusedMoE
-# yapf conflicts with isort for this block
-# yapf: disable
+    chunk_gated_delta_rule,
+    fused_recurrent_gated_delta_rule,
+)
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import (
-    GemmaRMSNorm as Qwen3NextRMSNorm)
-# yapf: enable
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+    GemmaRMSNorm as Qwen3NextRMSNorm,
+)
+from vllm.model_executor.layers.layernorm import RMSNormGated
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.abstract import MambaBase
-from vllm.model_executor.layers.mamba.mamba_mixer2 import (
-    mamba_v2_sharded_weight_loader)
+from vllm.model_executor.layers.mamba.mamba_mixer2 import mamba_v2_sharded_weight_loader
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, sharded_weight_loader)
+    default_weight_loader,
+    sharded_weight_loader,
+)
 from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.model_executor.utils import set_weight_attrs
@@ -54,15 +73,25 @@ from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Qwen3NextConfig
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op
+from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
 
-from .interfaces import (HasInnerState, IsHybrid, MixtureOfExperts,
-                         SupportsLoRA, SupportsPP)
-from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
-                    is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    MixtureOfExperts,
+    SupportsLoRA,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -70,7 +99,6 @@ KVCache = tuple[torch.Tensor, torch.Tensor]
 
 
 class Qwen3NextSparseMoeBlock(nn.Module):
-
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -81,7 +109,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
@@ -90,7 +118,8 @@ class Qwen3NextSparseMoeBlock(nn.Module):
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
+                f"the number of experts {config.num_experts}."
+            )
 
         # Load balancing settings.
         vllm_config = get_current_vllm_config()
@@ -99,32 +128,23 @@ class Qwen3NextSparseMoeBlock(nn.Module):
 
         self.n_logical_experts = self.n_routed_experts
         self.n_redundant_experts = eplb_config.num_redundant_experts
-        self.n_physical_experts = (self.n_logical_experts +
-                                   self.n_redundant_experts)
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
 
-        self.physical_expert_start = (self.ep_rank *
-                                      self.n_local_physical_experts)
-        self.physical_expert_end = (self.physical_expert_start +
-                                    self.n_local_physical_experts)
-
-        self.experts = FusedMoE(num_experts=self.n_routed_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts",
-                                enable_eplb=self.enable_eplb,
-                                num_redundant_experts=self.n_redundant_experts,
-                                is_sequence_parallel=self.is_sequence_parallel)
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.gate")
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
 
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen3NextMLP(
@@ -132,15 +152,29 @@ class Qwen3NextSparseMoeBlock(nn.Module):
                 intermediate_size=config.shared_expert_intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
-                reduce_results=self.experts.must_reduce_shared_expert_outputs(
-                ),
+                reduce_results=False,
+                expert_gate=self.shared_expert_gate,
                 prefix=f"{prefix}.shared_expert",
             )
         else:
             self.shared_expert = None
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size,
-                                                  1,
-                                                  bias=False)
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_expert,
+            gate=self.gate,
+            num_experts=self.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            routing_method_type=RoutingMethodType.Renormalize,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
@@ -151,58 +185,67 @@ class Qwen3NextSparseMoeBlock(nn.Module):
         if self.is_sequence_parallel:
             hidden_states = sequence_parallel_chunk(hidden_states)
 
-        shared_output = None
-        if self.shared_expert is not None:
-            shared_output = self.shared_expert(hidden_states)
-            if self.shared_expert_gate is not None:
-                shared_output = F.sigmoid(
-                    self.shared_expert_gate(hidden_states)) * shared_output
-
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        if self.experts.is_internal_router:
+            # In this case, the gate/router runs inside the FusedMoE class
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
+        else:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
 
-        if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+        if self.shared_expert is not None:
+            final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
 
         if self.is_sequence_parallel:
             final_hidden_states = tensor_model_parallel_all_gather(
-                final_hidden_states, 0)
+                final_hidden_states, 0
+            )
             final_hidden_states = final_hidden_states[:num_tokens]
         elif self.tp_size > 1:
             final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
-                final_hidden_states)
+                final_hidden_states
+            )
 
         return final_hidden_states.view(orig_shape)
 
 
 class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
-
     @property
     def mamba_type(self) -> str:
         return "linear_attention"
 
     def get_attn_backend(self) -> type["AttentionBackend"]:
         from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend
+
         return GDNAttentionBackend
 
     def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]:
         return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
-            self.model_config.dtype, self.cache_config.mamba_cache_dtype)
+            self.model_config.dtype, self.cache_config.mamba_cache_dtype
+        )
 
     def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
         return MambaStateShapeCalculator.gated_delta_net_state_shape(
-            self.tp_size, self.num_k_heads, self.num_v_heads, self.head_k_dim,
-            self.head_v_dim, self.conv_kernel_size, self.num_spec)
+            self.tp_size,
+            self.num_k_heads,
+            self.num_v_heads,
+            self.head_k_dim,
+            self.head_v_dim,
+            self.conv_kernel_size,
+            self.num_spec,
+        )
 
     def __init__(
         self,
         config: Qwen3NextConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        speculative_config: Optional[SpeculativeConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        speculative_config: SpeculativeConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -228,8 +271,11 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         self.cache_config = cache_config
         self.quant_config = quant_config
         self.speculative_config = speculative_config
-        self.num_spec = (self.speculative_config.num_speculative_tokens
-                         if self.speculative_config else 0)
+        self.num_spec = (
+            self.speculative_config.num_speculative_tokens
+            if self.speculative_config
+            else 0
+        )
 
         # QKV
         self.conv_dim = self.key_dim * 2 + self.value_dim
@@ -265,31 +311,35 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
 
         delattr(self.conv1d.weight, "weight_loader")
         set_weight_attrs(
-            self.conv1d.weight, {
-                "weight_loader":
-                mamba_v2_sharded_weight_loader([
-                    query_key_settings,
-                    query_key_settings,
-                    value_settings,
-                ], self.tp_size, self.tp_rank)
-            })
+            self.conv1d.weight,
+            {
+                "weight_loader": mamba_v2_sharded_weight_loader(
+                    [
+                        query_key_settings,
+                        query_key_settings,
+                        value_settings,
+                    ],
+                    self.tp_size,
+                    self.tp_rank,
+                )
+            },
+        )
 
         # selective projection used to make dt, B and C input dependant
 
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
         self.dt_bias = nn.Parameter(
-            torch.ones(self.num_v_heads // self.tp_size), )
+            torch.ones(self.num_v_heads // self.tp_size),
+        )
         self.A_log = nn.Parameter(
             torch.empty(
                 divide(self.num_v_heads, self.tp_size),
-                dtype=torch.float32,
-            ))
+            )
+        )
 
-        set_weight_attrs(self.A_log,
-                         {"weight_loader": sharded_weight_loader(0)})
-        set_weight_attrs(self.dt_bias,
-                         {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
 
         self.norm = RMSNormGated(
             self.head_v_dim,
@@ -297,15 +347,17 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             group_size=None,
             norm_before_gate=True,
             device=current_platform.current_device(),
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
 
-        self.out_proj = RowParallelLinear(self.value_dim,
-                                          self.hidden_size,
-                                          bias=False,
-                                          input_is_parallel=True,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.out_proj")
+        self.out_proj = RowParallelLinear(
+            self.value_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
 
         compilation_config = get_current_vllm_config().compilation_config
         if prefix in compilation_config.static_forward_context:
@@ -322,9 +374,13 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         """
         new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
             self.num_k_heads // self.tp_size,
-            (self.head_k_dim + self.head_k_dim +
-             (self.head_v_dim + self.head_v_dim) * self.num_v_heads //
-             self.num_k_heads),
+            (
+                self.head_k_dim
+                + self.head_k_dim
+                + (self.head_v_dim + self.head_v_dim)
+                * self.num_v_heads
+                // self.num_k_heads
+            ),
         )
         new_tensor_shape_ba = mixed_qkvz.size()[:-1] + (
             self.num_k_heads // self.tp_size,
@@ -342,15 +398,13 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         ]
         split_arg_list_ba = [
             self.num_v_heads // self.num_k_heads,
-            self.num_v_heads // self.num_k_heads
+            self.num_v_heads // self.num_k_heads,
         ]
 
         # [b, sq, ng, (hn + hn + np/ng * hn + np/ng + np/ng)]
         # --> [b, sq, ng, hn], [b, sq, ng, hn], [b, sq, ng, np/ng * hn],
         #  [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng], [b, sq, ng, np/ng]
-        (query, key, value, z) = torch.split(mixed_qkvz,
-                                             split_arg_list_qkvz,
-                                             dim=2)
+        (query, key, value, z) = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=2)
         (b, a) = torch.split(mixed_ba, split_arg_list_ba, dim=2)
 
         # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
@@ -374,27 +428,79 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             dim=-1,
         )
         query, key = map(
-            lambda x: rearrange(x, 'l (h d) -> 1 l h d', d=self.head_k_dim),
-            (query, key))
-        value = rearrange(value, 'l (h d) -> 1 l h d', d=self.head_v_dim)
-        return query, key, value
+            lambda x: rearrange(x, "l (h d) -> 1 l h d", d=self.head_k_dim),
+            (query, key),
+        )
+        value = rearrange(value, "l (h d) -> 1 l h d", d=self.head_v_dim)
+        return query.contiguous(), key.contiguous(), value.contiguous()
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
     ):
-        return torch.ops.vllm.gdn_attention(
-            hidden_states,
-            output,
+        """
+        Forward pass with three parts:
+        1. Input projection
+        2. Core attention (custom op)
+        3. Output projection
+        """
+        num_tokens = hidden_states.size(0)
+
+        # ============================================================
+        # Part 1: Input Projection
+        # ============================================================
+        projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+        projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        query, key, value, z, b, a = self.fix_query_key_value_ordering(
+            projected_states_qkvz, projected_states_ba
+        )
+        query, key, value = map(
+            lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value)
+        )
+        mixed_qkv = torch.cat((query, key, value), dim=-1)
+
+        # ============================================================
+        # Part 2: Core Attention (Custom Op)
+        # ============================================================
+        # Note: we should not use torch.empty here like other attention backends,
+        # see discussions in https://github.com/vllm-project/vllm/pull/28182
+        core_attn_out = torch.zeros(
+            (num_tokens, self.num_v_heads // self.tp_size, self.head_v_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        torch.ops.vllm.gdn_attention_core(
+            mixed_qkv,
+            b,
+            a,
+            core_attn_out,
             self.prefix,
         )
 
-    def _forward(
+        # ============================================================
+        # Part 3: Output Projection
+        # ============================================================
+        z_shape_og = z.shape
+        # Reshape input data into 2D tensor
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
+        output[:num_tokens], _ = self.out_proj(core_attn_out)
+
+    def _forward_core(
         self,
-        hidden_states: torch.Tensor,
-        output: torch.Tensor,
+        mixed_qkv: torch.Tensor,
+        b: torch.Tensor,
+        a: torch.Tensor,
+        core_attn_out: torch.Tensor,
     ):
+        """
+        Core attention computation (called by custom op).
+        """
         forward_context = get_forward_context()
         attn_metadata: AttentionMetadata = forward_context.attn_metadata
 
@@ -409,7 +515,8 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         spec_query_start_loc = attn_metadata.spec_query_start_loc
         non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
         spec_sequence_masks = attn_metadata.spec_sequence_masks
-        spec_token_masks = attn_metadata.spec_token_masks
+        spec_token_indx = attn_metadata.spec_token_indx
+        non_spec_token_indx = attn_metadata.non_spec_token_indx
         spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
         non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
         self_kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -417,37 +524,28 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         ssm_state = self_kv_cache[1]
         num_actual_tokens = attn_metadata.num_actual_tokens
         num_accepted_tokens = attn_metadata.num_accepted_tokens
-        if spec_token_masks is not None:
-            spec_token_masks = spec_token_masks[:num_actual_tokens]
-
-        # 1. Set up dimensions for reshapes later
-        projected_states_qkvz, _ = self.in_proj_qkvz(
-            hidden_states[:num_actual_tokens])
-        projected_states_ba, _ = self.in_proj_ba(
-            hidden_states[:num_actual_tokens])
-        query, key, value, z, b, a = self.fix_query_key_value_ordering(
-            projected_states_qkvz, projected_states_ba)
-        query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'),
-                                (query, key, value))
-        mixed_qkv = torch.cat((query, key, value), dim=-1)
 
-        # 2. Convolution sequence transformation
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
+        mixed_qkv = mixed_qkv[:num_actual_tokens]
+        b = b[:num_actual_tokens]
+        a = a[:num_actual_tokens]
+
+        # 1. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
 
         if spec_sequence_masks is not None:
-            if (attn_metadata.num_prefills == 0
-                    and attn_metadata.num_decodes == 0):
+            if attn_metadata.num_prefills == 0 and attn_metadata.num_decodes == 0:
                 mixed_qkv_spec = mixed_qkv
                 mixed_qkv_non_spec = None
             else:
-                mixed_qkv_spec = mixed_qkv[spec_token_masks]
-                mixed_qkv_non_spec = mixed_qkv[~spec_token_masks]
+                mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
+                mixed_qkv_non_spec = mixed_qkv.index_select(0, non_spec_token_indx)
         else:
             mixed_qkv_spec = None
             mixed_qkv_non_spec = mixed_qkv
 
-        # 2.1: process the mutli-query part
+        # 1.1: Process the multi-query part
         if spec_sequence_masks is not None:
             mixed_qkv_spec = causal_conv1d_update(
                 mixed_qkv_spec,
@@ -455,15 +553,16 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
                 conv_weights,
                 self.conv1d.bias,
                 self.activation,
-                conv_state_indices=spec_state_indices_tensor[:, 0]
-                [:attn_metadata.num_spec_decodes],
+                conv_state_indices=spec_state_indices_tensor[:, 0][
+                    : attn_metadata.num_spec_decodes
+                ],
                 num_accepted_tokens=num_accepted_tokens,
                 query_start_loc=spec_query_start_loc,
                 max_query_len=spec_state_indices_tensor.size(-1),
                 validate_data=False,
             )
 
-        # 2.2: process the remaining part
+        # 1.2: Process the remaining part
         if attn_metadata.num_prefills > 0:
             mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1)
             # - "cache_indices" updates the conv_state cache in positions
@@ -486,67 +585,61 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
                 conv_weights,
                 self.conv1d.bias,
                 self.activation,
-                conv_state_indices=non_spec_state_indices_tensor[:attn_metadata
-                                                                 .num_decodes],
+                conv_state_indices=non_spec_state_indices_tensor[
+                    : attn_metadata.num_actual_tokens
+                ],
                 validate_data=True,
             )
         else:
             mixed_qkv_non_spec = None
 
-        query_spec, key_spec, value_spec = self.rearrange_mixed_qkv(
-            mixed_qkv_spec)
+        query_spec, key_spec, value_spec = self.rearrange_mixed_qkv(mixed_qkv_spec)
         query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv(
-            mixed_qkv_non_spec)
+            mixed_qkv_non_spec
+        )
 
-        beta = b.sigmoid()
-        # g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
-        g = fused_gdn_gating(self.A_log, a, self.dt_bias)
-        g, beta = map(lambda x: rearrange(x, 'l d -> 1 l d'), (g, beta))
+        g, beta = fused_gdn_gating(self.A_log, a, b, self.dt_bias)
 
         if spec_sequence_masks is not None:
-            if (attn_metadata.num_prefills == 0
-                    and attn_metadata.num_decodes == 0):
+            if attn_metadata.num_prefills == 0 and attn_metadata.num_decodes == 0:
                 g_spec = g
                 beta_spec = beta
                 g_non_spec = None
                 beta_non_spec = None
             else:
-                g_spec = g[:, spec_token_masks]
-                beta_spec = beta[:, spec_token_masks]
-                g_non_spec = g[:, ~spec_token_masks]
-                beta_non_spec = beta[:, ~spec_token_masks]
+                g_spec = g.index_select(1, spec_token_indx)
+                beta_spec = beta.index_select(1, spec_token_indx)
+                g_non_spec = g.index_select(1, non_spec_token_indx)
+                beta_non_spec = beta.index_select(1, non_spec_token_indx)
         else:
             g_spec = None
             beta_spec = None
             g_non_spec = g
             beta_non_spec = beta
 
-        # 3. Recurrent attention
+        # 2. Recurrent attention
 
-        # 3.1: process the mutlti-query part
+        # 2.1: Process the multi-query part
         if spec_sequence_masks is not None:
-            core_attn_out_spec, last_recurrent_state = (
-                fused_recurrent_gated_delta_rule(
-                    q=query_spec,
-                    k=key_spec,
-                    v=value_spec,
-                    g=g_spec,
-                    beta=beta_spec,
-                    initial_state=ssm_state,
-                    inplace_final_state=True,
-                    cu_seqlens=spec_query_start_loc[:attn_metadata.
-                                                    num_spec_decodes + 1],
-                    ssm_state_indices=spec_state_indices_tensor,
-                    num_accepted_tokens=num_accepted_tokens,
-                    use_qk_l2norm_in_kernel=True,
-                ))
+            core_attn_out_spec, last_recurrent_state = fused_recurrent_gated_delta_rule(
+                q=query_spec,
+                k=key_spec,
+                v=value_spec,
+                g=g_spec,
+                beta=beta_spec,
+                initial_state=ssm_state,
+                inplace_final_state=True,
+                cu_seqlens=spec_query_start_loc[: attn_metadata.num_spec_decodes + 1],
+                ssm_state_indices=spec_state_indices_tensor,
+                num_accepted_tokens=num_accepted_tokens,
+                use_qk_l2norm_in_kernel=True,
+            )
         else:
             core_attn_out_spec, last_recurrent_state = None, None
 
-        # 3.2: process the remaining part
+        # 2.2: Process the remaining part
         if attn_metadata.num_prefills > 0:
-            initial_state = ssm_state[
-                non_spec_state_indices_tensor].contiguous()
+            initial_state = ssm_state[non_spec_state_indices_tensor].contiguous()
             initial_state[~has_initial_state, ...] = 0
             (
                 core_attn_out_non_spec,
@@ -565,7 +658,8 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             )
             # Init cache
             ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to(
-                ssm_state.dtype)
+                ssm_state.dtype
+            )
         elif attn_metadata.num_decodes > 0:
             core_attn_out_non_spec, last_recurrent_state = (
                 fused_recurrent_gated_delta_rule(
@@ -576,48 +670,39 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
                     beta=beta_non_spec,
                     initial_state=ssm_state,
                     inplace_final_state=True,
-                    cu_seqlens=non_spec_query_start_loc[:attn_metadata.
-                                                        num_decodes + 1],
+                    cu_seqlens=non_spec_query_start_loc[
+                        : attn_metadata.num_decodes + 1
+                    ],
                     ssm_state_indices=non_spec_state_indices_tensor,
                     use_qk_l2norm_in_kernel=True,
-                ))
+                )
+            )
         else:
             core_attn_out_non_spec, last_recurrent_state = None, None
 
-        # Merge core attention output
-        if (spec_sequence_masks is not None
-                and core_attn_out_non_spec is not None):
-            core_attn_out = torch.empty(
+        # 3. Merge core attention output
+        if spec_sequence_masks is not None and core_attn_out_non_spec is not None:
+            merged_out = torch.empty(
                 (1, num_actual_tokens, *core_attn_out_spec.shape[2:]),
                 dtype=core_attn_out_non_spec.dtype,
                 device=core_attn_out_non_spec.device,
             )
-            core_attn_out[:, spec_token_masks] = core_attn_out_spec
-            core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec
+            merged_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
+            merged_out.index_copy_(1, non_spec_token_indx, core_attn_out_non_spec)
+            core_attn_out[:num_actual_tokens] = merged_out.squeeze(0)
         elif spec_sequence_masks is not None:
-            core_attn_out = core_attn_out_spec
+            core_attn_out[:num_actual_tokens] = core_attn_out_spec.squeeze(0)
         else:
-            core_attn_out = core_attn_out_non_spec
-
-        z_shape_og = z.shape
-        # reshape input data into 2D tensor
-        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
-        z = z.reshape(-1, z.shape[-1])
-        core_attn_out = self.norm(core_attn_out, z)
-        core_attn_out = core_attn_out.reshape(z_shape_og)
-        core_attn_out = rearrange(core_attn_out, '... h d -> ... (h d)')
-
-        output[:num_actual_tokens], _ = self.out_proj(core_attn_out)
+            core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0)
 
 
 class Qwen3NextAttention(nn.Module):
-
     def __init__(
         self,
         config: Qwen3NextConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -642,7 +727,8 @@ class Qwen3NextAttention(nn.Module):
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
         self.dual_chunk_attention_config = getattr(
-            config, "dual_chunk_attention_config", None)
+            config, "dual_chunk_attention_config", None
+        )
         self.attn_output_gate = getattr(config, "attn_output_gate", True)
 
         self.qkv_proj = QKVParallelLinear(
@@ -683,9 +769,10 @@ class Qwen3NextAttention(nn.Module):
             prefix=f"{prefix}.attn",
             **{
                 "layer_idx": extract_layer_index(prefix),
-                "dual_chunk_attention_config":
-                self.dual_chunk_attention_config,
-            } if self.dual_chunk_attention_config else {},
+                "dual_chunk_attention_config": self.dual_chunk_attention_config,
+            }
+            if self.dual_chunk_attention_config
+            else {},
         )
 
         self.q_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
@@ -701,20 +788,22 @@ class Qwen3NextAttention(nn.Module):
 
         if self.attn_output_gate:
             q_gate, k, v = qkv.split(
-                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1)
+                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1
+            )
             orig_shape = q_gate.shape[:-1]
             q_gate = q_gate.view(*orig_shape, self.num_heads, -1)
             q, gate = torch.chunk(q_gate, 2, dim=-1)
             q = q.reshape(*orig_shape, -1)
             gate = gate.reshape(*orig_shape, -1)
         else:
-            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
-                                dim=-1)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
         q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)).view(
-            -1, self.num_heads * self.head_dim)
+            -1, self.num_heads * self.head_dim
+        )
         k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)).view(
-            -1, self.num_kv_heads * self.head_dim)
+            -1, self.num_kv_heads * self.head_dim
+        )
 
         q, k = self.rotary_emb(positions, q, k)
 
@@ -728,7 +817,6 @@ class Qwen3NextAttention(nn.Module):
 
 
 class Qwen3NextDecoderLayer(nn.Module):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -753,23 +841,26 @@ class Qwen3NextDecoderLayer(nn.Module):
                 cache_config=cache_config,
                 quant_config=quant_config,
                 speculative_config=speculative_config,
-                prefix=f'{prefix}.linear_attn')
+                prefix=f"{prefix}.linear_attn",
+            )
         elif self.layer_type == "full_attention":
             self.self_attn = Qwen3NextAttention(
                 config,
                 model_config=model_config,
                 cache_config=cache_config,
                 quant_config=quant_config,
-                prefix=f'{prefix}.self_attn',
+                prefix=f"{prefix}.self_attn",
             )
         else:
             raise ValueError(f"Invalid layer_type {self.layer_type}")
 
-        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
-                           config.mlp_only_layers)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
         if (self.layer_idx not in mlp_only_layers) and (
-                config.num_experts > 0 and
-            (self.layer_idx + 1) % config.decoder_sparse_step == 0):
+            config.num_experts > 0
+            and (self.layer_idx + 1) % config.decoder_sparse_step == 0
+        ):
             self.mlp = Qwen3NextSparseMoeBlock(
                 vllm_config=vllm_config,
                 prefix=f"{prefix}.mlp",
@@ -782,10 +873,12 @@ class Qwen3NextDecoderLayer(nn.Module):
                 quant_config=quant_config,
             )
 
-        self.input_layernorm = Qwen3NextRMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = Qwen3NextRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
         self.post_attention_layernorm = Qwen3NextRMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps)
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
         self.layer_scale = getattr(config, "layer_scale", False)
         if self.layer_scale:
@@ -794,20 +887,22 @@ class Qwen3NextDecoderLayer(nn.Module):
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.torch_dtype,
-                ), )
+                    dtype=config.dtype,
+                ),
+            )
             self.ffn_layer_scale = torch.nn.Parameter(
                 torch.zeros(
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.torch_dtype,
-                ), )
+                    dtype=config.dtype,
+                ),
+            )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         positions: torch.Tensor = None,
         **kwargs: object,
     ):
@@ -815,8 +910,7 @@ class Qwen3NextDecoderLayer(nn.Module):
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         self_attention_output = torch.empty_like(hidden_states)
         if self.layer_type == "linear_attention":
@@ -837,51 +931,51 @@ class Qwen3NextDecoderLayer(nn.Module):
         if self.layer_scale:
             if len(hidden_states.shape) == 2:
                 hidden_states = hidden_states * (
-                    self.attn_layer_scale.to(hidden_states.dtype)[0] + 1)
+                    self.attn_layer_scale.to(hidden_states.dtype)[0] + 1
+                )
             else:
                 hidden_states = hidden_states * (
-                    self.attn_layer_scale.to(hidden_states.dtype) + 1)
+                    self.attn_layer_scale.to(hidden_states.dtype) + 1
+                )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
 
         if self.layer_scale:
             if len(hidden_states.shape) == 2:
                 hidden_states = hidden_states * (
-                    self.ffn_layer_scale.to(hidden_states.dtype)[0] + 1)
+                    self.ffn_layer_scale.to(hidden_states.dtype)[0] + 1
+                )
             else:
-                assert len(hidden_states.shape) == len(
-                    self.ffn_layer_scale.shape
-                ), f'shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}'  # noqa: E501
+                assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
+                    f"shape must be the same {len(hidden_states.shape)}, "
+                    f"{len(self.ffn_layer_scale.shape)}"
+                )
                 hidden_states = hidden_states * (
-                    self.ffn_layer_scale.to(hidden_states.dtype) + 1)
+                    self.ffn_layer_scale.to(hidden_states.dtype) + 1
+                )
 
         return hidden_states, residual
 
 
 @support_torch_compile
 class Qwen3NextModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config: Qwen3NextConfig = vllm_config.model_config.hf_config
         parallel_config = vllm_config.parallel_config
-        lora_config = vllm_config.lora_config
         eplb_config = parallel_config.eplb_config
         self.num_redundant_experts = eplb_config.num_redundant_experts
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
+
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
         def get_layer(prefix: str):
@@ -892,32 +986,32 @@ class Qwen3NextModel(nn.Module):
             )
 
         self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
         if get_pp_group().is_last_rank:
-            self.norm = Qwen3NextRMSNorm(config.hidden_size,
-                                         eps=config.rms_norm_eps)
+            self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -932,25 +1026,24 @@ class Qwen3NextModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
             num_experts=self.config.num_experts,
-            num_redundant_experts=self.num_redundant_experts)
+            num_redundant_experts=self.num_redundant_experts,
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -1001,16 +1094,19 @@ class Qwen3NextModel(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -1019,15 +1115,66 @@ class Qwen3NextModel(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                           MixtureOfExperts, IsHybrid):
+class QwenNextMixtureOfExperts(MixtureOfExperts):
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+
+        self.moe_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, Qwen3NextDecoderLayer) and isinstance(
+                layer.mlp, Qwen3NextSparseMoeBlock
+            ):
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+            if example_moe is None:
+                raise RuntimeError("No Qwen3Next layer found in the model.layers.")
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = len(self.moe_layers)
+        self.num_expert_groups = 1
+        self.num_shared_experts = 0
+        self.num_logical_experts = example_moe.n_logical_experts
+        self.num_physical_experts = example_moe.n_physical_experts
+        self.num_local_physical_experts = example_moe.n_local_physical_experts
+        self.num_routed_experts = example_moe.n_routed_experts
+        self.num_redundant_experts = example_moe.n_redundant_experts
+
+
+class Qwen3NextForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    QwenNextMixtureOfExperts,
+    IsHybrid,
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -1042,107 +1189,47 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
+        assert not cache_config.enable_prefix_caching, (
             "Qwen3Next currently does not support prefix caching"
+        )
         self.quant_config = vllm_config.quant_config
 
         super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
-        self.model = Qwen3NextModel(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = Qwen3NextModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
-            prefix=maybe_prefix(prefix, "lm_head"))
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
         # Set MoE hyperparameters
-        self.expert_weights = []
-
-        self.moe_layers: list[FusedMoE] = []
-        example_layer = None
-        for layer in self.model.layers:
-            if isinstance(layer, PPMissingLayer):
-                continue
-
-            assert isinstance(layer, Qwen3NextDecoderLayer)
-            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
-                example_layer = layer.mlp
-                self.moe_layers.append(layer.mlp.experts)
-
-        if example_layer is None:
-            raise RuntimeError("No Qwen3Next layer found in the model.layers.")
-
-        self.num_moe_layers = len(self.moe_layers)
-        self.num_expert_groups = 1
-        self.num_shared_experts = 0
-        self.num_logical_experts = example_layer.n_logical_experts
-        self.num_physical_experts = example_layer.n_physical_experts
-        self.num_local_physical_experts = example_layer.n_local_physical_experts
-        self.num_routed_experts = example_layer.n_routed_experts
-        self.num_redundant_experts = example_layer.n_redundant_experts
-
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
+        self.set_moe_parameters()
 
-    def update_physical_experts_metadata(
-        self,
-        num_physical_experts: int,
-        num_local_physical_experts: int,
-    ) -> None:
-        assert self.num_local_physical_experts == num_local_physical_experts
-        self.num_physical_experts = num_physical_experts
-        self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = (num_physical_experts -
-                                      self.num_logical_experts)
-        for layer in self.model.layers:
-            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
-                moe = layer.mlp
-                moe.n_local_physical_experts = num_local_physical_experts
-                moe.n_physical_experts = num_physical_experts
-                moe.n_redundant_experts = self.num_redundant_experts
-                moe.experts.update_expert_map()
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
 
         return hidden_states
 
@@ -1152,32 +1239,38 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
         return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
-            vllm_config.model_config.dtype,
-            vllm_config.cache_config.mamba_cache_dtype)
+            vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype
+        )
 
     @classmethod
     def get_mamba_state_shape_from_config(
-            cls, vllm_config: "VllmConfig"
+        cls, vllm_config: "VllmConfig"
     ) -> tuple[tuple[int, int], tuple[int, int]]:
         parallel_config = vllm_config.parallel_config
         hf_config = vllm_config.model_config.hf_config
         tp_size = parallel_config.tensor_parallel_size
-        num_spec = (vllm_config.speculative_config.num_speculative_tokens
-                    if vllm_config.speculative_config else 0)
+        num_spec = (
+            vllm_config.speculative_config.num_speculative_tokens
+            if vllm_config.speculative_config
+            else 0
+        )
         return MambaStateShapeCalculator.gated_delta_net_state_shape(
-            tp_size, hf_config.linear_num_key_heads,
-            hf_config.linear_num_value_heads, hf_config.linear_key_head_dim,
-            hf_config.linear_value_head_dim, hf_config.linear_conv_kernel_dim,
-            num_spec)
+            tp_size,
+            hf_config.linear_num_key_heads,
+            hf_config.linear_num_value_heads,
+            hf_config.linear_key_head_dim,
+            hf_config.linear_value_head_dim,
+            hf_config.linear_conv_kernel_dim,
+            num_spec,
+        )
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.logits_processor(self.lm_head, hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=["mtp."],
@@ -1188,38 +1281,54 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         return self.model.get_expert_mapping()
 
 
-def gdn_attention(
-    hidden_states: torch.Tensor,
-    output: torch.Tensor,
+def gdn_attention_core(
+    mixed_qkv: torch.Tensor,
+    b: torch.Tensor,
+    a: torch.Tensor,
+    core_attn_out: torch.Tensor,
     layer_name: str,
 ) -> None:
+    """
+    Custom op for the core attention computation.
+    Only handles the convolution + recurrent attention part.
+    Input/output projections are handled outside this op.
+    """
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    self._forward(hidden_states=hidden_states, output=output)
+    self._forward_core(
+        mixed_qkv=mixed_qkv,
+        b=b,
+        a=a,
+        core_attn_out=core_attn_out,
+    )
 
 
-def gdn_attention_fake(
-    hidden_states: torch.Tensor,
-    output: torch.Tensor,
+def gdn_attention_core_fake(
+    mixed_qkv: torch.Tensor,
+    b: torch.Tensor,
+    a: torch.Tensor,
+    core_attn_out: torch.Tensor,
     layer_name: str,
 ) -> None:
+    """Fake implementation for torch.compile."""
     return
 
 
 direct_register_custom_op(
-    op_name="gdn_attention",
-    op_func=gdn_attention,
-    mutates_args=["output"],
-    fake_impl=gdn_attention_fake,
+    op_name="gdn_attention_core",
+    op_func=gdn_attention_core,
+    mutates_args=["core_attn_out"],
+    fake_impl=gdn_attention_core_fake,
 )
 
 
-# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
 @triton.jit
 def fused_gdn_gating_kernel(
     g,
+    beta_output,
     A_log,
     a,
+    b,
     dt_bias,
     seq_len,
     NUM_HEADS: tl.constexpr,
@@ -1233,34 +1342,53 @@ def fused_gdn_gating_kernel(
     mask = head_off < NUM_HEADS
     blk_A_log = tl.load(A_log + head_off, mask=mask)
     blk_a = tl.load(a + off, mask=mask)
+    blk_b = tl.load(b + off, mask=mask)
     blk_bias = tl.load(dt_bias + head_off, mask=mask)
     # If the model is loaded in fp16, without the .float() here, A might be -inf
     x = blk_a.to(tl.float32) + blk_bias.to(tl.float32)
-    softplus_x = tl.where(beta * x <= threshold,
-                          (1 / beta) * tl.log(1 + tl.exp(beta * x)), x)
+    softplus_x = tl.where(
+        beta * x <= threshold, (1 / beta) * tl.log(1 + tl.exp(beta * x)), x
+    )
     blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x
     tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask)
+    # compute beta_output = sigmoid(b)
+    blk_beta_output = tl.sigmoid(blk_b.to(tl.float32))
+    tl.store(
+        beta_output + off, blk_beta_output.to(beta_output.dtype.element_ty), mask=mask
+    )
 
 
 def fused_gdn_gating(
     A_log: torch.Tensor,
     a: torch.Tensor,
+    b: torch.Tensor,
     dt_bias: torch.Tensor,
     beta: float = 1.0,
     threshold: float = 20.0,
-) -> torch.Tensor:
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Fused computation of g and beta for Gated Delta Net.
+    g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+    beta_output = b.sigmoid()
+    TODO maybe use torch.compile to replace this triton kernel
+    """
     batch, num_heads = a.shape
     seq_len = 1
     grid = (batch, seq_len, triton.cdiv(num_heads, 8))
-    g = torch.empty_like(a, dtype=torch.float32)
-    fused_gdn_gating_kernel[grid](g,
-                                  A_log,
-                                  a,
-                                  dt_bias,
-                                  seq_len,
-                                  num_heads,
-                                  beta,
-                                  threshold,
-                                  8,
-                                  num_warps=1)
-    return g
+    g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device)
+    beta_output = torch.empty(1, batch, num_heads, dtype=b.dtype, device=b.device)
+    fused_gdn_gating_kernel[grid](
+        g,
+        beta_output,
+        A_log,
+        a,
+        b,
+        dt_bias,
+        seq_len,
+        num_heads,
+        beta,
+        threshold,
+        8,
+        num_warps=1,
+    )
+    return g, beta_output
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
index e950699a0c499e7eda6fd13014b662f8d62b0fc1..83694caa52480a884ae63f26e2b3048cead1758c 100644
--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Qwen3Next MTP model."""
+
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -15,16 +15,25 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.qwen3_next import (Qwen3NextDecoderLayer,
-                                                   Qwen3NextRMSNorm)
+from vllm.model_executor.models.qwen3_next import (
+    Qwen3NextDecoderLayer,
+    Qwen3NextRMSNorm,
+    QwenNextMixtureOfExperts,
+)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Qwen3NextConfig
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -33,20 +42,17 @@ KVCache = tuple[torch.Tensor, torch.Tensor]
 
 @support_torch_compile
 class Qwen3NextMultiTokenPredictor(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         model_config = vllm_config.model_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         config: Qwen3NextConfig = model_config.hf_config
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         self.mtp_start_layer_idx = config.num_hidden_layers
         self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1)
@@ -54,36 +60,40 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
-        self.fc = ColumnParallelLinear(self.config.hidden_size * 2,
-                                       self.config.hidden_size,
-                                       gather_output=True,
-                                       bias=False,
-                                       return_bias=False,
-                                       quant_config=quant_config,
-                                       prefix=f'{prefix}.fc')
+        self.fc = ColumnParallelLinear(
+            self.config.hidden_size * 2,
+            self.config.hidden_size,
+            gather_output=True,
+            bias=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc",
+        )
 
         self.layers = torch.nn.ModuleList(
             Qwen3NextDecoderLayer(
                 vllm_config,
                 layer_type="full_attention",
-                prefix=f'{prefix}.layers.{idx}',
-            ) for idx in range(self.num_mtp_layers))
+                prefix=f"{prefix}.layers.{idx}",
+            )
+            for idx in range(self.num_mtp_layers)
+        )
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-        self.norm = Qwen3NextRMSNorm(config.hidden_size,
-                                     eps=config.rms_norm_eps)
-        self.pre_fc_norm_hidden = Qwen3NextRMSNorm(config.hidden_size,
-                                                   eps=config.rms_norm_eps)
-        self.pre_fc_norm_embedding = Qwen3NextRMSNorm(config.hidden_size,
-                                                      eps=config.rms_norm_eps)
+        self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_fc_norm_hidden = Qwen3NextRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_fc_norm_embedding = Qwen3NextRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
@@ -91,13 +101,13 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings(input_ids)
+                inputs_embeds = self.embed_input_ids(input_ids)
             assert hidden_states.shape[-1] == inputs_embeds.shape[-1]
             inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds)
             hidden_states = self.pre_fc_norm_hidden(hidden_states)
@@ -109,7 +119,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        current_step_idx = spec_step_idx % self.num_mtp_layers
         hidden_states, residual = self.layers[current_step_idx](
             positions=positions,
             hidden_states=hidden_states,
@@ -117,16 +127,14 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
         )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -142,7 +150,8 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_experts)
+            num_experts=self.config.num_experts,
+        )
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -180,16 +189,19 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
                     if is_pp_missing_parameter(name, self):
                         continue
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -199,74 +211,77 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 @support_torch_compile
-class Qwen3NextMTP(nn.Module, SupportsPP):
+class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
             "k_proj",
             "v_proj",
         ],
-        "gate_up_proj": ["up_proj", "down_proj"]
+        "gate_up_proj": ["up_proj", "down_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         cache_config = vllm_config.cache_config
-        assert not cache_config.enable_prefix_caching, \
+        assert not cache_config.enable_prefix_caching, (
             "Qwen3NextMTP currently does not support prefix caching"
+        )
 
         self.quant_config = vllm_config.quant_config
 
         super().__init__()
         self.config = config
-        self.model = Qwen3NextMultiTokenPredictor(vllm_config=vllm_config,
-                                                  prefix=maybe_prefix(
-                                                      prefix, "mtp"))
-        self.unpadded_vocab_size = config.vocab_size
-        self.lm_head = ParallelLMHead(self.unpadded_vocab_size,
-                                      config.hidden_size,
-                                      org_num_embeddings=config.vocab_size,
-                                      padding_size=DEFAULT_VOCAB_PADDING_SIZE,
-                                      prefix=maybe_prefix(prefix, "lm_head"))
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.model = Qwen3NextMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "mtp")
+        )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+        self.set_moe_parameters()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
-        hidden_states = self.model(input_ids, positions, hidden_states,
-                                   intermediate_tensors, inputs_embeds)
+        hidden_states = self.model(
+            input_ids, positions, hidden_states, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.logits_processor(self.lm_head, hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         shared_weight_names = ["embed_tokens", "lm_head"]
 
         def remap_weight_names(weights):
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
new file mode 100755
index 0000000000000000000000000000000000000000..40b80ce2387c87a92baf2704cc4ad6a68e575a1d
--- /dev/null
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -0,0 +1,1721 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen3-Omni-Moe model (thinker part)."""
+
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from functools import partial
+from typing import Any
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging.version import Version
+from transformers import PretrainedConfig
+from transformers import __version__ as TRANSFORMERS_VERSION
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import (
+    Qwen3OmniMoeConfig,
+    Qwen3OmniMoeThinkerConfig,
+)
+from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
+    Qwen3OmniMoeAudioEncoder,
+)
+from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
+    Qwen3OmniMoeProcessor,
+)
+from transformers.models.whisper import WhisperFeatureExtractor
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import check_upstream_fa_availability
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.distributed import get_pp_group
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
+from vllm.model_executor.layers.conv import Conv3dLayer
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
+from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    MultiModalPromptUpdates,
+    PlaceholderFeaturesInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .qwen2_5_omni_thinker import (
+    Qwen2_5OmniAudioFeatureInputs,
+    Qwen2_5OmniConditionalGenerationMixin,
+    Qwen2_5OmniThinkerDummyInputsBuilder,
+    Qwen2_5OmniThinkerMultiModalProcessor,
+    Qwen2_5OmniThinkerProcessingInfo,
+)
+from .qwen2_5_vl import (
+    Qwen2_5_VisionAttention,
+    Qwen2_5_VisionRotaryEmbedding,
+    Qwen2_5_VLProcessingInfo,
+)
+from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    _merge_multimodal_embeddings,
+    maybe_prefix,
+)
+from .vision import (
+    get_llm_pos_ids_for_vision,
+    get_vit_attn_backend,
+)
+
+try:
+    import flash_attn
+except (ImportError, ModuleNotFoundError):
+    flash_attn = None
+
+logger = init_logger(__name__)
+
+
+def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
+    input_lengths_leave = input_lengths % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = (
+        ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+    )
+    return feat_lengths, output_lengths
+
+
+class Qwen3_VisionPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_channels: int = 3,
+        hidden_size: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.hidden_size = hidden_size
+
+        kernel_size = (temporal_patch_size, patch_size, patch_size)
+        self.proj = Conv3dLayer(
+            in_channels,
+            hidden_size,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=True,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        L, C = x.shape
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
+        x = self.proj(x).view(L, self.hidden_size)
+        return x
+
+
+class Qwen3_VisionMLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        bias: bool = False,
+        act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.linear_fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            bias=bias,
+            quant_config=quant_config,
+            return_bias=False,
+            prefix=f"{prefix}.linear_fc1",
+        )
+        self.linear_fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            return_bias=False,
+            prefix=f"{prefix}.linear_fc2",
+        )
+        self.act_fn = act_fn
+
+    def forward(self, x: torch.Tensor):
+        mlp_output = self.linear_fc2(self.act_fn(self.linear_fc1(x)))
+        return mlp_output
+
+
+class Qwen3_VisionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_hidden_dim: int,
+        act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        self.attn = Qwen2_5_VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.mlp = Qwen3_VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_fn=act_fn,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: torch.Tensor,  # Only used for Flash Attention
+        seqlens: torch.Tensor,  # Only used for xFormers
+    ) -> torch.Tensor:
+        x = x + self.attn(
+            self.norm1(x),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            max_seqlen=max_seqlen,
+            seqlens=seqlens,
+        )
+
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Qwen3_VisionPatchMerger(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        spatial_merge_size: int = 2,
+        use_postshuffle_norm: bool = False,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+
+        self.use_postshuffle_norm = use_postshuffle_norm
+        if self.use_postshuffle_norm:
+            context_dim = self.hidden_size
+
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.use_postshuffle_norm = use_postshuffle_norm
+        self.ln_q = norm_layer(
+            self.hidden_size if use_postshuffle_norm else context_dim
+        )
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.mlp.0",
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    d_model,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.mlp.2",
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.use_postshuffle_norm:
+            x = self.ln_q(x.view(-1, self.hidden_size))
+        else:
+            x = self.ln_q(x).view(-1, self.hidden_size)
+
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen3Omni_VisionTransformer(nn.Module):
+    def __init__(
+        self,
+        vision_config,
+        norm_eps: float = 1e-6,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: AttentionBackendEnum | None = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+        self.image_size = vision_config.image_size
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.spatial_merge_unit = self.spatial_merge_size**2
+        self.temporal_patch_size = vision_config.temporal_patch_size
+        self.num_grid_per_side = self.image_size // self.patch_size
+        self.apply_vit_abs_pos_embed = vision_config.apply_vit_abs_pos_embed
+        self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes
+
+        self.patch_embed = Qwen3_VisionPatchEmbed(
+            patch_size=self.patch_size,
+            temporal_patch_size=self.temporal_patch_size,
+            in_channels=vision_config.in_channels,
+            hidden_size=self.hidden_size,
+        )
+
+        # vit pos embeding, TODO: spatial_patch_size vs patch_size
+        if self.apply_vit_abs_pos_embed:
+            self.pos_embed = nn.Embedding(self.num_grid_per_side**2, self.hidden_size)
+        else:
+            self.pos_embed = nn.Parameter(
+                torch.empty([1, self.num_grid_per_side**2, self.hidden_size])
+            )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList(
+            [
+                Qwen3_VisionBlock(
+                    dim=self.hidden_size,
+                    num_heads=self.num_heads,
+                    mlp_hidden_dim=vision_config.intermediate_size,
+                    act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                )
+                for layer_idx in range(vision_config.depth)
+            ]
+        )
+        self.merger = Qwen3_VisionPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=self.hidden_size,
+            norm_layer=norm_layer,
+            spatial_merge_size=self.spatial_merge_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.merger",
+        )
+        if self.deepstack_visual_indexes is not None:
+            self.merger_list = nn.ModuleList(
+                [
+                    Qwen3_VisionPatchMerger(
+                        d_model=vision_config.out_hidden_size,
+                        context_dim=self.hidden_size,
+                        spatial_merge_size=self.spatial_merge_size,
+                        use_postshuffle_norm=True,
+                        norm_layer=norm_layer,
+                        quant_config=quant_config,
+                        prefix=f"{prefix}.merger_list.{layer_idx}",
+                    )
+                    for layer_idx in range(len(self.deepstack_visual_indexes))
+                ]
+            )
+
+        self.attn_backend = get_vit_attn_backend(
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        if (
+            self.attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
+        num_grid_per_side = self.num_grid_per_side
+        m_size = self.spatial_merge_size
+        hidden_dim = self.pos_embed.embedding_dim
+
+        outputs = []
+        for t, h, w in grid_thw:
+            h_idxs = torch.linspace(
+                0, num_grid_per_side - 1, h, dtype=torch.float32, device=self.device
+            )
+            w_idxs = torch.linspace(
+                0, num_grid_per_side - 1, w, dtype=torch.float32, device=self.device
+            )
+
+            h_floor = h_idxs.to(torch.long)
+            w_floor = w_idxs.to(torch.long)
+            h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1)
+            w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1)
+
+            dh = h_idxs - h_floor
+            dw = w_idxs - w_floor
+
+            # Create meshgrid view for all h, w vars
+            dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing="ij")
+            h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing="ij")
+            h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing="ij")
+            h_floor_grid_idx = h_floor_grid * num_grid_per_side
+            h_ceil_grid_idx = h_ceil_grid * num_grid_per_side
+
+            # original computation of weights
+            # w00 = (1 - dh_grid) * (1 - dw_grid)
+            # w01 = (1 - dh_grid) * dw_grid
+            # w10 = dh_grid * (1 - dw_grid)
+            # w11 = dh_grid * dw_grid
+            # we reuse w11 here to avoid duplicate
+            # dh_grid * dw_grid computation
+            w11 = dh_grid * dw_grid
+            w10 = dh_grid - w11
+            w01 = dw_grid - w11
+            w00 = 1 - dh_grid - dw_grid + w11
+
+            idx00 = h_floor_grid_idx + w_floor_grid
+            idx01 = h_floor_grid_idx + w_ceil_grid
+            idx10 = h_ceil_grid_idx + w_floor_grid
+            idx11 = h_ceil_grid_idx + w_ceil_grid
+
+            indices = torch.stack([idx00, idx01, idx10, idx11], dim=0).reshape(4, -1)
+            weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1)
+            weights = weights.to(dtype=self.dtype, device=self.device)
+
+            embeds = self.pos_embed(indices)
+            weighted_embeds = embeds * weights
+            p0, p1, p2, p3 = weighted_embeds.unbind(dim=0)
+            combined = p0 + p1 + p2 + p3
+
+            combined = combined.view(h * w, hidden_dim)
+            repeated = combined.unsqueeze(0).expand(t, -1, -1).contiguous()
+            repeated = repeated.view(
+                t, h // m_size, m_size, w // m_size, m_size, hidden_dim
+            )
+            repeated = repeated.permute(0, 1, 3, 2, 4, 5).reshape(-1, hidden_dim)
+            outputs.append(repeated)
+
+        return torch.cat(outputs, dim=0)
+
+    def compute_attn_mask_seqlen(
+        self,
+        cu_seqlens: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        max_seqlen = torch.zeros([], device=cu_seqlens.device)
+        seqlens = torch.zeros(1, device=cu_seqlens.device)
+        if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
+            seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
+        return max_seqlen, seqlens
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: list[list[int]],
+    ) -> torch.Tensor:
+        hidden_states = x.to(device=self.device, dtype=self.dtype)
+        hidden_states = self.patch_embed(hidden_states)
+
+        if self.apply_vit_abs_pos_embed:
+            pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
+            hidden_states = hidden_states + pos_embeds
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(
+            dim=0,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+        hidden_states = hidden_states.unsqueeze(1)
+        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device)
+        max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+
+        hidden_states_list = []
+        deepstack_visual_indexes = self.deepstack_visual_indexes
+
+        for layer_num, blk in enumerate(self.blocks):
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb=rotary_pos_emb,
+                max_seqlen=max_seqlen,
+                seqlens=seqlens,
+            )
+            if (
+                deepstack_visual_indexes is not None
+                and layer_num in deepstack_visual_indexes
+            ):
+                hidden_states_list.append(hidden_states)
+
+        hidden_states = self.merger(hidden_states)
+
+        # processing deepstack
+        if deepstack_visual_indexes is not None:
+            processed_hidden_states_list = [hidden_states]
+            for idx, x in enumerate(hidden_states_list):
+                x = self.merger_list[idx](x)
+                processed_hidden_states_list.append(x)
+            # we cat the original visual features and deepstack features
+            # along the feature dim
+            hidden_states = torch.cat(
+                processed_hidden_states_list, dim=1
+            )  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("attn.qkv.", "attn.q.", "q"),
+            ("attn.qkv.", "attn.k.", "k"),
+            ("attn.qkv.", "attn.v.", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+        "deepstack_input_embeds": 0,
+    }
+)
+class Qwen3MoeLLMModel(Qwen3MoeModel):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        self.deepstack_multiscale_layer_start = 1
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        deepstack_input_embeds: IntermediateTensors | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        for layer_idx, layer in enumerate(
+            self.layers[self.start_layer : self.end_layer]
+        ):
+            layer_idx = layer_idx + self.start_layer
+
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+
+            if deepstack_input_embeds is not None and layer_idx in range(
+                0, len(deepstack_input_embeds)
+            ):
+                hidden_states = (
+                    hidden_states
+                    + deepstack_input_embeds[f"deepstack_input_embeds_{layer_idx}"]
+                )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super(Qwen3MoeForCausalLM, self).__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3MoeLLMModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, quant_config=quant_config
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+
+class Qwen3OmniMoeThinkerProcessingInfo(
+    Qwen2AudioProcessingInfo, Qwen2_5_VLProcessingInfo
+):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(Qwen3OmniMoeConfig).thinker_config
+
+    def get_hf_processor(self, **kwargs: object) -> Qwen3OmniMoeProcessor:
+        processor = self.ctx.get_hf_processor(
+            Qwen3OmniMoeProcessor,
+            use_fast=kwargs.pop("use_fast", True),
+            **kwargs,
+        )
+        if not hasattr(processor, "audio_token"):
+            processor.audio_token = "<|audio_pad|>"
+        if not hasattr(processor, "image_token"):
+            processor.image_token = "<|image_pad|>"
+        if not hasattr(processor, "video_token"):
+            processor.video_token = "<|video_pad|>"
+        return processor
+
+    def get_feature_extractor(self, **kwargs: object):
+        hf_processor = self.get_hf_processor(**kwargs)
+        feature_extractor = hf_processor.feature_extractor  # type: ignore
+        assert isinstance(feature_extractor, WhisperFeatureExtractor)
+        return feature_extractor
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"audio": None, "image": None, "video": None}
+
+
+Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder
+
+
+class Qwen3OmniMoeThinkerMultiModalProcessor(
+    Qwen2_5OmniThinkerMultiModalProcessor,
+):
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        mm_data = dict(mm_data)
+        audios = mm_data.pop("audios", [])
+
+        def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray:
+            length = x.shape[-1]
+            if length % hop_length != 0:
+                pad_length = hop_length - (length % hop_length)
+                x = np.pad(x, (0, pad_length), mode="constant", constant_values=0)
+            return x
+
+        # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
+        feature_extractor = self.info.get_feature_extractor()
+        hop_length = feature_extractor.hop_length
+        if audios:
+            # NOTE: Qwen3-Omni processor accept "audio"
+            # To make sure the cache works with padding=True, we pre-padded
+            # the audio to multiple of hop_length.
+            mm_data["audio"] = [
+                pad_to_hop_length(audio, hop_length)
+                if isinstance(audio, np.ndarray)
+                else (pad_to_hop_length(audio[0], hop_length), audio[1])
+                for audio in audios
+            ]
+
+            # TODO(Isotr0py): Remove this patch after upstream fix PR
+            # released and Transformers version update:
+            # https://github.com/huggingface/transformers/pull/41473
+            mm_kwargs = dict(mm_kwargs)
+            tok_kwargs = dict(tok_kwargs)
+            if Version(TRANSFORMERS_VERSION) < Version("4.58.0"):
+                # move truncation to audio_kwargs level to avoid conflict
+                # with tok_kwargs
+                mm_kwargs["audio_kwargs"] = {
+                    "truncation": mm_kwargs.pop("truncation", False)
+                }
+                mm_kwargs["text_kwargs"] = {
+                    "truncation": tok_kwargs.pop("truncation", False)
+                }
+
+        hf_inputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        if (
+            "audio_feature_lengths" in hf_inputs
+            and "feature_attention_mask" in hf_inputs
+            and (audios := mm_data.get("audio", []))
+        ):
+            audio_num_frames = []
+            for _, audio in enumerate(audios):
+                audio_length = len(audio[0]) if isinstance(audio, tuple) else len(audio)
+                num_frame = (
+                    (audio_length // hop_length)
+                    if audio_length % hop_length == 0
+                    else (audio_length // hop_length - 1)
+                )
+                if mm_kwargs.get("truncation", False):
+                    num_frame = min(
+                        num_frame, feature_extractor.n_samples // hop_length
+                    )
+                audio_num_frames.append(num_frame)
+            hf_inputs["feature_attention_mask"] = [
+                torch.ones(num_frame) for num_frame in audio_num_frames
+            ]
+            hf_inputs["audio_feature_lengths"] = torch.tensor(audio_num_frames)
+        return hf_inputs
+
+    def _maybe_apply_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        prompt_ids: list[int],
+        mm_kwargs: MultiModalKwargsItems,
+        mm_prompt_updates: MultiModalPromptUpdates,
+        is_update_applied: bool,
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
+        """
+        Qwen3-Omni reimplements this function to handle `use_audio_in_video`.
+        """
+        mm_item_counts = mm_items.get_all_counts()
+        self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
+
+        use_audio_in_video = False
+        if "video" in mm_kwargs:
+            for item in mm_kwargs["video"]:
+                if item and item["use_audio_in_video"].data:
+                    use_audio_in_video = True
+                else:
+                    use_audio_in_video = False
+
+        if use_audio_in_video and "video" in mm_item_counts:
+            assert "audio" in mm_item_counts
+            mm_item_counts["audio"] -= mm_item_counts["video"]
+
+        # Special case with `use_audio_in_video=True`
+        if use_audio_in_video:
+            if is_update_applied:
+                prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
+            (
+                prompt_ids,
+                mm_placeholders,
+            ) = self._apply_prompt_updates(
+                prompt_ids,
+                mm_prompt_updates,
+            )
+            self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
+        # normal case with `use_audio_in_video=False`
+        elif is_update_applied:
+            mm_placeholders = self._find_mm_placeholders(
+                prompt_ids,
+                mm_prompt_updates,
+            )
+            self._validate_mm_placeholders(
+                mm_placeholders,
+                mm_item_counts,
+            )
+        else:
+            prompt_ids, mm_placeholders = self._apply_prompt_updates(
+                prompt_ids,
+                mm_prompt_updates,
+            )
+            self._validate_mm_placeholders(
+                mm_placeholders,
+                mm_item_counts,
+            )
+
+        return prompt_ids, mm_placeholders
+
+    def get_updates_use_audio_in_video(
+        self,
+        thinker_config: PretrainedConfig,
+        audio_len: int,
+        video_grid_thw: list[int] | torch.Tensor,
+        video_second_per_grid_t: float,
+    ) -> list[int]:
+        shift = 0
+        audio_token_id = thinker_config.audio_token_id
+        video_token_id = thinker_config.video_token_id
+        audio_start_token_id = thinker_config.audio_start_token_id
+        audio_end_token_id = thinker_config.audio_end_token_id
+        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
+        position_id_per_seconds = thinker_config.position_id_per_seconds
+        audio_token_indices = np.arange(next(iter([audio_len])))
+        curr_video_grid_thw = next(iter([video_grid_thw]))
+        height = curr_video_grid_thw[1] // spatial_merge_size
+        width = curr_video_grid_thw[2] // spatial_merge_size
+        video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
+        video_token_indices = np.broadcast_to(
+            video_token_indices, (video_token_indices.shape[0], height, width)
+        ).reshape(-1)
+        video_token_indices = (
+            (video_token_indices + shift)
+            * next(iter([video_second_per_grid_t]))
+            * position_id_per_seconds
+        )
+        video_data_index, audio_data_index = 0, 0
+        updates = [audio_start_token_id]
+        while video_data_index < len(video_token_indices) and audio_data_index < len(
+            audio_token_indices
+        ):
+            if (
+                video_token_indices[video_data_index]
+                <= audio_token_indices[audio_data_index]
+            ):
+                updates += [video_token_id]
+                video_data_index += 1
+            else:
+                updates += [audio_token_id]
+                audio_data_index += 1
+        if video_data_index < len(video_token_indices):
+            updates += [video_token_id] * (len(video_token_indices) - video_data_index)
+        if audio_data_index < len(audio_token_indices):
+            updates += [audio_token_id] * (len(audio_token_indices) - audio_data_index)
+        updates += [audio_end_token_id]
+        return updates
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, Any],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
+        vocab = tokenizer.get_vocab()
+
+        audio_token = processor.audio_token
+        image_token = processor.image_token
+        video_token = processor.video_token
+        audio_token_id = vocab[audio_token]
+        image_token_id = vocab[image_token]
+        video_token_id = vocab[video_token]
+
+        out_mm_data = out_mm_kwargs.get_data()
+        audio_feature_lengths = out_mm_data.get("audio_feature_lengths")
+        feature_attention_mask = out_mm_data.get("feature_attention_mask")
+        if audio_feature_lengths is None and feature_attention_mask is None:
+            audio_output_lengths = []
+        elif audio_feature_lengths is not None:
+            _, audio_output_lens = _get_feat_extract_output_lengths(
+                audio_feature_lengths
+            )
+            audio_output_lengths = audio_output_lens.tolist()
+        elif feature_attention_mask is not None:
+            assert isinstance(feature_attention_mask, torch.Tensor)
+            _, audio_output_lens = _get_feat_extract_output_lengths(
+                feature_attention_mask.sum(-1)
+            )
+            audio_output_lengths = audio_output_lens.tolist()
+
+        # number of audios read from video.
+        audio_in_video_item_idx = 0
+        audio_item_idx = 0
+
+        def get_replacement_qwen2_audio(item_idx: int):
+            nonlocal audio_item_idx
+            item_idx += audio_in_video_item_idx
+
+            audio_item_idx += 1
+
+            num_features = audio_output_lengths[item_idx]
+            if num_features == 0:
+                audios = mm_items.get_items("audio", AudioProcessorItems)
+                audio = audios.get(item_idx)
+                raise ValueError(
+                    f"The audio {audio} (len={len(audio)}) is too short "
+                    "to be represented inside the model"
+                )
+
+            return [audio_token_id] * num_features
+
+        def get_replacement_qwen2_vision(item_idx: int, modality: str):
+            grid_thw = out_mm_data[f"{modality}_grid_thw"][item_idx]
+            assert isinstance(grid_thw, torch.Tensor)
+            merge_length = image_processor.merge_size**2
+
+            token_id = image_token_id if modality == "image" else video_token_id
+            return [token_id] * (int(grid_thw.prod()) // merge_length)
+
+        use_audio_in_video = hf_processor_mm_kwargs.get("use_audio_in_video", False)
+        thinker_config = self.info.get_hf_config()
+
+        def get_replacement_qwen2_use_audio_in_video(item_idx: int):
+            nonlocal audio_in_video_item_idx
+            audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
+            video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
+
+            audio_in_video_item_idx += 1
+
+            second_per_grid_ts = hf_processor_mm_kwargs.get("second_per_grid_ts", None)
+            if second_per_grid_ts:
+                video_second_per_grid_t = second_per_grid_ts[item_idx]
+            else:
+                video_second_per_grid_t = 1.0
+
+            return self.get_updates_use_audio_in_video(
+                thinker_config=thinker_config,
+                audio_len=audio_num_features,
+                video_grid_thw=video_grid_thw,
+                video_second_per_grid_t=video_second_per_grid_t,
+            )
+
+        video_replacement_fn = (
+            get_replacement_qwen2_use_audio_in_video
+            if use_audio_in_video
+            else partial(get_replacement_qwen2_vision, modality="video")
+        )
+
+        return [
+            PromptReplacement(
+                modality="audio",
+                target=audio_token,
+                replacement=get_replacement_qwen2_audio,
+            ),
+            PromptReplacement(
+                modality="image",
+                target=image_token,
+                replacement=partial(get_replacement_qwen2_vision, modality="image"),
+            ),
+            PromptReplacement(
+                modality="video",
+                target=video_token,
+                replacement=video_replacement_fn,
+            ),
+        ]
+
+    def _validate_mm_placeholders(
+        self,
+        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+        mm_item_counts: Mapping[str, int],
+    ) -> None:
+        BaseMultiModalProcessor[
+            Qwen2_5OmniThinkerProcessingInfo
+        ]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts)
+
+    def _get_raw_input_ids(
+        self,
+        token_ids: list[int],
+        use_audio_in_video: bool = False,
+    ) -> list[int]:
+        tokenizer = self.info.get_tokenizer()
+        vision_bos_token = tokenizer.encode(tokenizer.vision_bos_token)[0]
+        vision_eos_token = tokenizer.encode(tokenizer.vision_eos_token)[0]
+        audio_bos_token = tokenizer.encode(tokenizer.audio_bos_token)[0]
+        audio_eos_token = tokenizer.encode(tokenizer.audio_eos_token)[0]
+        audio_token = tokenizer.encode("<|audio_pad|>")[0]
+        image_token = tokenizer.encode("<|image_pad|>")[0]
+        video_token = tokenizer.encode("<|video_pad|>")[0]
+
+        result = token_ids[:]
+        if use_audio_in_video:
+            while True:
+                start = None
+                for i in range(len(result) - 1):
+                    if result[i : i + 2] == [vision_bos_token, audio_bos_token]:
+                        start = i
+                        break
+                if start is not None:
+                    end = None
+                    for i in range(start + 2, len(result) - 1):
+                        if result[i : i + 2] == [audio_eos_token, vision_eos_token]:
+                            end = i
+                            break
+                    if end is not None:
+                        result = (
+                            result[:start]
+                            + [vision_bos_token, video_token, vision_eos_token]
+                            + result[end + 2 :]
+                        )
+                else:
+                    break
+
+        for mm_token in [audio_token, image_token, video_token]:
+            compressed = []
+            for x in result:
+                if x != mm_token or (not compressed or compressed[-1] != mm_token):
+                    compressed.append(x)
+            result = compressed
+
+        return result
+
+
+class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMixin):
+    def _process_audio_input(
+        self,
+        audio_input: Qwen2_5OmniAudioFeatureInputs,
+        audio_hashes: list[str] | None = None,
+        cached_audio_features: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        input_features = audio_input["input_features"]
+        audio_feature_lengths = audio_input["audio_feature_lengths"]
+
+        audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
+            audio_feature_lengths
+        )
+
+        audio_outputs = self.audio_tower(
+            input_features.to(self.audio_tower.dtype),
+            feature_lens=audio_feature_lengths,
+            aftercnn_lens=audio_feat_lengths,
+        )
+        audio_features = audio_outputs.last_hidden_state
+        return audio_features.split(audio_output_lengths.tolist())
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen3OmniMoeThinkerMultiModalProcessor,
+    info=Qwen3OmniMoeThinkerProcessingInfo,
+    dummy_inputs=Qwen3OmniMoeThinkerDummyInputsBuilder,
+)
+class Qwen3OmniMoeThinkerForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsMRoPE,
+    Qwen3OmniMoeConditionalGenerationMixin,
+):
+    merge_by_field_config = True
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "thinker.lm_head.": "language_model.lm_head.",
+            "thinker.model.": "language_model.model.",
+            "thinker.": "",
+        }
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<|vision_start|><|image_pad|><|vision_end|>"
+        if modality.startswith("video"):
+            return "<|vision_start|><|video_pad|><|vision_end|>"
+        if modality.startswith("audio"):
+            return "<|audio_start|><|audio_pad|><|audio_end|>"
+
+        raise ValueError("Only image, video or audio modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.vllm_config = vllm_config  # needed for torch compile forward context
+        thinker_config: Qwen3OmniMoeThinkerConfig = (
+            vllm_config.model_config.hf_config.thinker_config
+        )
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+        self.config = thinker_config
+        self.multimodal_config = multimodal_config
+
+        # force "use_flash_attention_2=True" to audio tower to align
+        # the results.
+        if flash_attn is not None:
+            audio_config = thinker_config.audio_config
+            audio_config._attn_implementation_autoset = True
+            audio_config._attn_implementation = "flash_attention_2"
+        else:
+            logger.warning(
+                "flash_attn is not available, the model may not yield the "
+                "exactly same result as the transformers implementation "
+                "in the audio tower part."
+            )
+
+        self.audio_tower = Qwen3OmniMoeAudioEncoder(thinker_config.audio_config)
+
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
+        self.visual = Qwen3Omni_VisionTransformer(
+            vision_config=thinker_config.vision_config,
+            norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            attn_backend_override=attn_backend_override,
+        )
+        self.quant_config = quant_config
+
+        self.language_model = Qwen3MoeLLMForCausalLM(
+            vllm_config=vllm_config.with_hf_config(
+                thinker_config.text_config, architectures=["Qwen3MoeForCausalLM"]
+            ),
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+        self.use_deepstack = hasattr(
+            thinker_config.vision_config, "deepstack_visual_indexes"
+        )
+        self.deepstack_num_level = (
+            len(thinker_config.vision_config.deepstack_visual_indexes)
+            if self.use_deepstack
+            else 0
+        )
+        # register buffer for deepstack
+        self.deepstack_input_embeds = (
+            [
+                torch.zeros(
+                    vllm_config.scheduler_config.max_num_batched_tokens,
+                    thinker_config.text_config.hidden_size,
+                )
+                for _ in range(self.deepstack_num_level)
+            ]
+            if self.use_deepstack
+            else None
+        )
+        self.visual_dim = thinker_config.vision_config.out_hidden_size
+        self.multiscale_dim = self.visual_dim * self.deepstack_num_level
+
+    def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors:
+        # get deepstack_input_embeds from buffer, and clear the buffer
+        return IntermediateTensors(
+            {
+                f"deepstack_input_embeds_{idx}": self.deepstack_input_embeds[idx][
+                    :num_tokens
+                ]
+                for idx in range(self.deepstack_num_level)
+            }
+        )
+
+    def _set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> None:
+        # set deepstack_input_embeds to buffer
+        num_tokens = deepstack_input_embeds.size(1)
+        if num_tokens > self.deepstack_input_embeds[0].size(0):
+            self.deepstack_input_embeds = [
+                torch.zeros(
+                    num_tokens,
+                    self.config.text_config.hidden_size,
+                    device=self.deepstack_input_embeds[0].device,
+                    dtype=self.deepstack_input_embeds[0].dtype,
+                )
+                for _ in range(self.deepstack_num_level)
+            ]
+        for idx in range(self.deepstack_num_level):
+            self.deepstack_input_embeds[idx][:num_tokens].copy_(
+                deepstack_input_embeds[idx]
+            )
+
+    def _clear_deepstack_input_embeds(self, num_tokens: int) -> None:
+        # clear deepstack_input_embeds in buffer
+        if num_tokens > 0:
+            for idx in range(self.deepstack_num_level):
+                self.deepstack_input_embeds[idx][:num_tokens].zero_()
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        mm_input_by_modality = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "video" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("input_audio_features")
+                and "audio" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(
+                    **kwargs
+                )
+        return mm_input_by_modality
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not mm_input_by_modality:
+            return []
+
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in mm_input_by_modality:
+            multimodal_input = mm_input_by_modality[modality]
+            if modality == "image":
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
+            if modality == "video":
+                video_embeddings = self._process_video_input(multimodal_input)
+                multimodal_embeddings += tuple(video_embeddings)
+            if modality == "audio":
+                audio_embeddings = self._process_audio_input(multimodal_input)
+                multimodal_embeddings += tuple(audio_embeddings)
+        return multimodal_embeddings
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
+    ) -> torch.Tensor:
+        inputs_embeds = self._embed_text_input_ids(
+            input_ids,
+            self.language_model.embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+
+        deepstack_input_embeds = None
+        # TODO (ywang96): support overlapping modalitiy embeddings so that
+        # `use_audio_in_video` will work on V1.
+        # split the feat dim to obtain multi-scale visual feature
+        has_vision_embeddings = [
+            embeddings.shape[-1] != self.config.text_config.hidden_size
+            for embeddings in multimodal_embeddings
+        ]
+        if self.visual.deepstack_visual_indexes is not None and any(
+            has_vision_embeddings
+        ):
+            multiscale_len = len(self.visual.deepstack_visual_indexes)
+            multimodal_embeddings_multiscale = []
+            is_vision = torch.zeros_like(is_multimodal)
+            mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0]
+            mm_position_idx = 0
+            for index, embeddings in enumerate(multimodal_embeddings):
+                num_tokens = embeddings.shape[0]
+                current_positions = mm_positions[
+                    mm_position_idx : mm_position_idx + num_tokens
+                ]
+
+                # Vision embeddings
+                if embeddings.shape[-1] != self.config.text_config.hidden_size:
+                    visual_dim = embeddings.shape[-1] // (multiscale_len + 1)
+                    multi_dim = visual_dim * multiscale_len
+                    embeddings_main, embeddings_multiscale = torch.split(
+                        embeddings, [visual_dim, multi_dim], dim=-1
+                    )
+                    multimodal_embeddings[index] = embeddings_main
+                    multimodal_embeddings_multiscale.append(embeddings_multiscale)
+                    is_vision[current_positions] = True
+
+                # Audio embeddings
+                else:
+                    is_vision[current_positions] = False
+
+                mm_position_idx += num_tokens
+
+            deepstack_input_embeds = inputs_embeds.new_zeros(
+                inputs_embeds.size(0), multiscale_len * inputs_embeds.size(1)
+            )
+            deepstack_input_embeds = _merge_multimodal_embeddings(
+                inputs_embeds=deepstack_input_embeds,
+                multimodal_embeddings=multimodal_embeddings_multiscale,
+                is_multimodal=is_vision,
+            )
+            deepstack_input_embeds = (
+                deepstack_input_embeds.view(
+                    inputs_embeds.shape[0], multiscale_len, visual_dim
+                )
+                .permute(1, 0, 2)
+                .contiguous()
+            )
+            self._set_deepstack_input_embeds(deepstack_input_embeds)
+
+        inputs_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
+
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        if (
+            self.use_deepstack
+            and inputs_embeds is not None
+            and get_pp_group().is_first_rank
+        ):
+            deepstack_input_embeds = self._get_deepstack_input_embeds(
+                inputs_embeds.size(0)
+            )
+        else:
+            deepstack_input_embeds = None
+
+        hidden_states = self.language_model.model(
+            input_ids,
+            positions,
+            intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            # args for deepstack
+            deepstack_input_embeds=deepstack_input_embeds,
+        )
+
+        if inputs_embeds is not None and get_pp_group().is_first_rank:
+            self._clear_deepstack_input_embeds(inputs_embeds.size(0))
+
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=["talker.", "code2wav."],
+        )
+        loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+        return loaded_weights
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
+
+        input_ids = torch.tensor(input_tokens)
+        if input_ids is None or input_ids.ndim != 1:
+            raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids")
+
+        seq_len = input_ids.shape[0]
+
+        if isinstance(audio_feature_lengths, list):
+            audio_feature_lengths = torch.tensor(
+                audio_feature_lengths, dtype=torch.long
+            )
+
+        if not len(second_per_grid_ts) and len(video_grid_thw):
+            second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
+        else:
+            second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
+
+        config = self.config
+        spatial_merge_size = config.vision_config.spatial_merge_size
+        image_token_id = config.image_token_id
+        video_token_id = config.video_token_id
+        audio_token_id = config.audio_token_id
+        vision_start_token_id = config.vision_start_token_id
+        audio_start_token_id = config.audio_start_token_id
+        position_id_per_seconds = config.position_id_per_seconds
+
+        vision_start_indices = torch.argwhere(
+            input_ids == vision_start_token_id
+        ).squeeze(1)
+        if vision_start_indices.numel() > 0:
+            vision_tokens = input_ids[vision_start_indices + 1]
+        else:
+            vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype)
+        audio_nums = torch.sum(input_ids == audio_start_token_id)
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (
+            (vision_tokens == audio_start_token_id).sum()
+            if use_audio_in_video
+            else (vision_tokens == video_token_id).sum()
+        )
+
+        llm_pos_ids_list: list[torch.Tensor] = []
+        st = 0
+        image_idx = 0
+        video_idx = 0
+        audio_idx = 0
+        remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums  # noqa: E501
+        multimodal_nums = (
+            image_nums + audio_nums
+            if use_audio_in_video
+            else image_nums + video_nums + audio_nums
+        )  # noqa: E501
+
+        for _ in range(multimodal_nums):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            if (image_token_id in input_tokens or video_token_id in input_tokens) and (
+                remain_videos > 0 or remain_images > 0
+            ):
+                ed_vision_start = input_tokens.index(vision_start_token_id, st)
+            else:
+                ed_vision_start = len(input_tokens) + 1
+            if audio_token_id in input_tokens and remain_audios > 0:
+                ed_audio_start = input_tokens.index(audio_start_token_id, st)
+            else:
+                ed_audio_start = len(input_tokens) + 1
+            min_ed = min(ed_vision_start, ed_audio_start)
+
+            if min_ed == ed_audio_start:
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(
+                    audio_feature_lengths[audio_idx]
+                )
+                llm_pos_ids = (
+                    torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + audio_len + eos_len
+                audio_idx += 1
+                remain_audios -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == image_token_id
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = image_grid_thw[image_idx][0]
+                grid_hs = image_grid_thw[:, 1]
+                grid_ws = image_grid_thw[:, 2]
+                t_index = torch.arange(grid_t) * position_id_per_seconds
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + image_len + eos_len
+                image_idx += 1
+                remain_images -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == video_token_id
+                and not use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    torch.arange(grid_t)
+                    * float(second_per_grids[video_idx].item())
+                    * position_id_per_seconds
+                )
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + video_len + eos_len
+                video_idx += 1
+                remain_videos -= 1
+            elif (
+                min_ed == ed_vision_start
+                and ed_vision_start + 1 == ed_audio_start
+                and use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                bos_block = (
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(bos_block)
+                llm_pos_ids_list.append(bos_block)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(
+                    audio_feature_lengths[audio_idx]
+                )
+                audio_llm_pos_ids = (
+                    torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    torch.arange(grid_t)
+                    * float(second_per_grids[video_idx].item())
+                    * position_id_per_seconds
+                )
+                video_llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_data_index, audio_data_index = 0, 0
+                while (
+                    video_data_index < video_llm_pos_ids.shape[-1]
+                    and audio_data_index < audio_llm_pos_ids.shape[-1]
+                ):
+                    if (
+                        video_llm_pos_ids[0][video_data_index]
+                        <= audio_llm_pos_ids[0][audio_data_index]
+                    ):
+                        llm_pos_ids_list.append(
+                            video_llm_pos_ids[
+                                :, video_data_index : video_data_index + 1
+                            ]
+                        )
+                        video_data_index += 1
+                    else:
+                        llm_pos_ids_list.append(
+                            audio_llm_pos_ids[
+                                :, audio_data_index : audio_data_index + 1
+                            ]
+                        )
+                        audio_data_index += 1
+                if video_data_index < video_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(
+                        video_llm_pos_ids[
+                            :, video_data_index : video_llm_pos_ids.shape[-1]
+                        ]
+                    )
+                if audio_data_index < audio_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(
+                        audio_llm_pos_ids[
+                            :, audio_data_index : audio_llm_pos_ids.shape[-1]
+                        ]
+                    )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                eos_block = (
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(eos_block)
+                llm_pos_ids_list.append(eos_block)
+                st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2  # noqa: E501
+                audio_idx += 1
+                video_idx += 1
+                remain_videos -= 1
+                remain_audios -= 1
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        if llm_positions.shape[1] != seq_len:
+            raise RuntimeError("Position ids length mismatch with input ids length")
+
+        mrope_position_delta = llm_positions.max() + 1 - seq_len
+        return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 52d0123e8c8441af43a3774de5361c4ae9a178c2..7f0c9372991d1d02d9e99d78cdaffd6a78487492 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -23,9 +23,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen3VL model compatible with HuggingFace weights."""
-from collections.abc import Iterable, Mapping, Sequence
-from functools import partial
-from typing import Any, Callable, Optional, Union
+
+from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
+from functools import lru_cache, partial
+from itertools import islice
+from typing import Any
 
 import numpy as np
 import torch
@@ -34,56 +36,86 @@ import torch.nn.functional as F
 from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
-    smart_resize as image_smart_resize)
-from transformers.models.qwen3_vl import (Qwen3VLProcessor,
-                                          Qwen3VLVideoProcessor)
+    smart_resize as image_smart_resize,
+)
+from transformers.models.qwen3_vl import Qwen3VLProcessor, Qwen3VLVideoProcessor
 from transformers.models.qwen3_vl.configuration_qwen3_vl import (
-    Qwen3VLConfig, Qwen3VLVisionConfig)
+    Qwen3VLConfig,
+    Qwen3VLVisionConfig,
+)
 from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
-    smart_resize as video_smart_resize)
+    smart_resize as video_smart_resize,
+)
 from transformers.video_utils import VideoMetadata
 
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import check_upstream_fa_availability
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.conv import Conv3dLayer
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItem,
-                                    MultiModalKwargsItems, VideoItem)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems, MultiModalDataParser
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import is_list_of
-
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .qwen2_5_vl import (Qwen2_5_VisionAttention,
-                         Qwen2_5_VisionRotaryEmbedding,
-                         Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs,
-                         Qwen2_5_VLImagePixelInputs,
-                         Qwen2_5_VLVideoEmbeddingInputs, Qwen2_5_VLVideoInputs,
-                         Qwen2_5_VLVideoPixelInputs)
+from vllm.utils.collection_utils import is_list_of
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .qwen2_5_vl import (
+    Qwen2_5_VisionAttention,
+    Qwen2_5_VisionRotaryEmbedding,
+    Qwen2_5_VLImageEmbeddingInputs,
+    Qwen2_5_VLImageInputs,
+    Qwen2_5_VLImagePixelInputs,
+    Qwen2_5_VLVideoEmbeddingInputs,
+    Qwen2_5_VLVideoInputs,
+    Qwen2_5_VLVideoPixelInputs,
+)
 from .qwen2_vl import Qwen2VLProcessingInfo
 from .qwen3 import Qwen3ForCausalLM, Qwen3Model
-from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    maybe_prefix, merge_multimodal_embeddings)
-from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    WeightsMapper,
+    _merge_multimodal_embeddings,
+    maybe_prefix,
+)
+from .vision import (
+    get_vit_attn_backend,
+    run_dp_sharded_mrope_vision_model,
+)
 
 logger = init_logger(__name__)
 
@@ -92,7 +124,6 @@ _MAX_FRAMES_PER_VIDEO = 24576
 
 
 class Qwen3_VisionPatchEmbed(nn.Module):
-
     def __init__(
         self,
         patch_size: int = 14,
@@ -106,45 +137,51 @@ class Qwen3_VisionPatchEmbed(nn.Module):
         self.hidden_size = hidden_size
 
         kernel_size = (temporal_patch_size, patch_size, patch_size)
-        self.proj = nn.Conv3d(in_channels,
-                              hidden_size,
-                              kernel_size=kernel_size,
-                              stride=kernel_size,
-                              bias=True)
+        self.proj = Conv3dLayer(
+            in_channels,
+            hidden_size,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=True,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         L, C = x.shape
-        x = x.view(L, -1, self.temporal_patch_size, self.patch_size,
-                   self.patch_size)
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
         x = self.proj(x).view(L, self.hidden_size)
         return x
 
 
 class Qwen3_VisionMLP(nn.Module):
-
-    def __init__(self,
-                 in_features: int,
-                 hidden_features: int,
-                 bias: bool = False,
-                 act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        bias: bool = False,
+        act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
-        self.linear_fc1 = ColumnParallelLinear(in_features,
-                                               hidden_features,
-                                               bias=bias,
-                                               quant_config=quant_config,
-                                               return_bias=False,
-                                               prefix=f"{prefix}.linear_fc1",
-                                               disable_tp=use_data_parallel)
-        self.linear_fc2 = RowParallelLinear(hidden_features,
-                                            in_features,
-                                            bias=bias,
-                                            quant_config=quant_config,
-                                            return_bias=False,
-                                            prefix=f"{prefix}.linear_fc2",
-                                            disable_tp=use_data_parallel)
+        self.linear_fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            bias=bias,
+            quant_config=quant_config,
+            return_bias=False,
+            prefix=f"{prefix}.linear_fc1",
+            disable_tp=use_data_parallel,
+        )
+        self.linear_fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            return_bias=False,
+            prefix=f"{prefix}.linear_fc2",
+            disable_tp=use_data_parallel,
+        )
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
@@ -153,18 +190,17 @@ class Qwen3_VisionMLP(nn.Module):
 
 
 class Qwen3_VisionBlock(nn.Module):
-
     def __init__(
         self,
         dim: int,
         num_heads: int,
         mlp_hidden_dim: int,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
-        attn_backend: _Backend = _Backend.TORCH_SDPA,
+        attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA,
         use_upstream_fa: bool = False,
     ) -> None:
         super().__init__()
@@ -180,43 +216,47 @@ class Qwen3_VisionBlock(nn.Module):
             prefix=f"{prefix}.attn",
             use_data_parallel=use_data_parallel,
             attn_backend=attn_backend,
-            use_upstream_fa=use_upstream_fa)
-        self.mlp = Qwen3_VisionMLP(dim,
-                                   mlp_hidden_dim,
-                                   act_fn=act_fn,
-                                   bias=True,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp",
-                                   use_data_parallel=use_data_parallel)
+            use_upstream_fa=use_upstream_fa,
+        )
+        self.mlp = Qwen3_VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_fn=act_fn,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
-            self,
-            x: torch.Tensor,
-            cu_seqlens: torch.Tensor,
-            rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+        max_seqlen: torch.Tensor,  # Only used for Flash Attention
+        seqlens: torch.Tensor,  # Only used for xFormers
     ) -> torch.Tensor:
-        x = x + self.attn(self.norm1(x),
-                          cu_seqlens=cu_seqlens,
-                          rotary_pos_emb=rotary_pos_emb,
-                          max_seqlen=max_seqlen,
-                          seqlens=seqlens)
+        x = x + self.attn(
+            self.norm1(x),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            max_seqlen=max_seqlen,
+            seqlens=seqlens,
+        )
 
         x = x + self.mlp(self.norm2(x))
         return x
 
 
 class Qwen3_VisionPatchMerger(nn.Module):
-
     def __init__(
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
         use_postshuffle_norm: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -230,19 +270,23 @@ class Qwen3_VisionPatchMerger(nn.Module):
         if norm_layer is None:
             norm_layer = partial(nn.LayerNorm, eps=1e-6)
         self.norm = norm_layer(context_dim)
-        self.linear_fc1 = ColumnParallelLinear(self.hidden_size,
-                                               self.hidden_size,
-                                               bias=True,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.linear_fc1",
-                                               disable_tp=use_data_parallel)
+        self.linear_fc1 = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_fc1",
+            disable_tp=use_data_parallel,
+        )
         self.act_fn = nn.GELU()
-        self.linear_fc2 = RowParallelLinear(self.hidden_size,
-                                            d_model,
-                                            bias=True,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.linear_fc2",
-                                            disable_tp=use_data_parallel)
+        self.linear_fc2 = RowParallelLinear(
+            self.hidden_size,
+            d_model,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_fc2",
+            disable_tp=use_data_parallel,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.use_postshuffle_norm:
@@ -257,14 +301,14 @@ class Qwen3_VisionPatchMerger(nn.Module):
 
 
 class Qwen3_VisionTransformer(nn.Module):
-
     def __init__(
         self,
         vision_config: Qwen3VLVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = vision_config.hidden_size
@@ -280,8 +324,9 @@ class Qwen3_VisionTransformer(nn.Module):
 
         # NOTE: This is used for creating empty tensor for all_gather for
         # DP ViT. Here out_hidden_size is enlarged due to deepstack
-        self.out_hidden_size = (vision_config.out_hidden_size *
-                                (1 + len(self.deepstack_visual_indexes)))
+        self.out_hidden_size = vision_config.out_hidden_size * (
+            1 + len(self.deepstack_visual_indexes)
+        )
 
         self.patch_embed = Qwen3_VisionPatchEmbed(
             patch_size=self.patch_size,
@@ -290,8 +335,7 @@ class Qwen3_VisionTransformer(nn.Module):
             hidden_size=self.hidden_size,
         )
 
-        self.pos_embed = nn.Embedding(self.num_position_embeddings,
-                                      self.hidden_size)
+        self.pos_embed = nn.Embedding(self.num_position_embeddings, self.hidden_size)
 
         norm_layer = partial(nn.LayerNorm, eps=norm_eps)
         head_dim = self.hidden_size // self.num_heads
@@ -307,49 +351,62 @@ class Qwen3_VisionTransformer(nn.Module):
             use_data_parallel=use_data_parallel,
         )
 
-        self.deepstack_merger_list = nn.ModuleList([
-            Qwen3_VisionPatchMerger(
-                d_model=vision_config.out_hidden_size,
-                context_dim=self.hidden_size,
-                spatial_merge_size=self.spatial_merge_size,
-                use_postshuffle_norm=True,
-                norm_layer=norm_layer,
-                quant_config=quant_config,
-                prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
-                use_data_parallel=use_data_parallel)
-            for layer_idx in range(len(self.deepstack_visual_indexes))
-        ])
+        self.deepstack_merger_list = nn.ModuleList(
+            [
+                Qwen3_VisionPatchMerger(
+                    d_model=vision_config.out_hidden_size,
+                    context_dim=self.hidden_size,
+                    spatial_merge_size=self.spatial_merge_size,
+                    use_postshuffle_norm=True,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(len(self.deepstack_visual_indexes))
+            ]
+        )
 
         self.attn_backend = get_vit_attn_backend(
-            head_size=head_dim, dtype=torch.get_default_dtype())
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
         use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(
-                torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
+        if (
+            self.attn_backend != AttentionBackendEnum.FLASH_ATTN
+            and self.attn_backend != AttentionBackendEnum.ROCM_AITER_FA
+            and check_upstream_fa_availability(torch.get_default_dtype())
+        ):
+            self.attn_backend = AttentionBackendEnum.FLASH_ATTN
             use_upstream_fa = True
 
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
-                _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.XFORMERS,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
             raise RuntimeError(
-                f"Qwen3-VL does not support {self.attn_backend} backend now.")
-
-        self.blocks = nn.ModuleList([
-            Qwen3_VisionBlock(
-                dim=self.hidden_size,
-                num_heads=self.num_heads,
-                mlp_hidden_dim=vision_config.intermediate_size,
-                act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
-                norm_layer=norm_layer,
-                quant_config=quant_config,
-                prefix=f"{prefix}.blocks.{layer_idx}",
-                use_data_parallel=use_data_parallel,
-                attn_backend=self.attn_backend,
-                use_upstream_fa=use_upstream_fa)
-            for layer_idx in range(vision_config.depth)
-        ])
+                f"Qwen3-VL does not support {self.attn_backend} backend now."
+            )
+        self.blocks = nn.ModuleList(
+            [
+                Qwen3_VisionBlock(
+                    dim=self.hidden_size,
+                    num_heads=self.num_heads,
+                    mlp_hidden_dim=vision_config.intermediate_size,
+                    act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                    attn_backend=self.attn_backend,
+                    use_upstream_fa=use_upstream_fa,
+                )
+                for layer_idx in range(vision_config.depth)
+            ]
+        )
 
     @property
     def dtype(self) -> torch.dtype:
@@ -359,61 +416,59 @@ class Qwen3_VisionTransformer(nn.Module):
     def device(self) -> torch.device:
         return self.patch_embed.proj.weight.device
 
-    def rot_pos_emb(self, grid_thw):
-        pos_ids = []
-        # Support both Tensor and list inputs for DP path
-        if isinstance(grid_thw, list):
-            grid_list = grid_thw
-            max_grid_size = max(max(h, w) for _, h, w in grid_list)
-        else:
-            grid_list = grid_thw.tolist()
-            max_grid_size = int(grid_thw[:, 1:].max().item())
-        for t, h, w in grid_list:
-            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
-            hpos_ids = hpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            )
-            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
-            hpos_ids = hpos_ids.flatten()
-
-            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-            wpos_ids = wpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            )
-            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
-            wpos_ids = wpos_ids.flatten()
-            pos_ids.append(
-                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+    @staticmethod
+    @lru_cache(maxsize=1024)
+    def rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor:
+        hpos_ids = np.broadcast_to(np.arange(h).reshape(h, 1), (h, w))
+        h_div = h // spatial_merge_size
+        w_div = w // spatial_merge_size
+        hpos_ids = hpos_ids.reshape(
+            h_div,
+            spatial_merge_size,
+            w_div,
+            spatial_merge_size,
+        )
+        hpos_ids = hpos_ids.transpose(0, 2, 1, 3)
+        hpos_ids = hpos_ids.flatten()
+
+        wpos_ids = np.broadcast_to(np.arange(w).reshape(1, w), (h, w))
+        wpos_ids = wpos_ids.reshape(
+            h_div,
+            spatial_merge_size,
+            w_div,
+            spatial_merge_size,
+        )
+        wpos_ids = wpos_ids.transpose(0, 2, 1, 3)
+        wpos_ids = wpos_ids.flatten()
+
+        return torch.from_numpy(np.stack([hpos_ids, wpos_ids], axis=-1))
+
+    def rot_pos_emb(self, grid_thw: list[list[int]]):
+        max_grid_size = max(max(h, w) for _, h, w in grid_thw)
+        pos_ids = [
+            self.rot_pos_ids(h, w, self.spatial_merge_size)
+            if t == 1
+            else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1)
+            for t, h, w in grid_thw
+        ]
         pos_ids = torch.cat(pos_ids, dim=0)
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb
 
-    def fast_pos_embed_interpolate(self,
-                                   grid_thw: list[list[int]]) -> torch.Tensor:
-
+    def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
         num_grid_per_side = self.num_grid_per_side
         m_size = self.spatial_merge_size
         hidden_dim = self.pos_embed.embedding_dim
 
         outputs = []
         for t, h, w in grid_thw:
-            h_idxs = torch.linspace(0,
-                                    num_grid_per_side - 1,
-                                    h,
-                                    dtype=torch.float32,
-                                    device=self.device)
-            w_idxs = torch.linspace(0,
-                                    num_grid_per_side - 1,
-                                    w,
-                                    dtype=torch.float32,
-                                    device=self.device)
+            h_idxs = torch.linspace(
+                0, num_grid_per_side - 1, h, dtype=torch.float32, device=self.device
+            )
+            w_idxs = torch.linspace(
+                0, num_grid_per_side - 1, w, dtype=torch.float32, device=self.device
+            )
 
             h_floor = h_idxs.to(torch.long)
             w_floor = w_idxs.to(torch.long)
@@ -424,15 +479,9 @@ class Qwen3_VisionTransformer(nn.Module):
             dw = w_idxs - w_floor
 
             # Create meshgrid view for all h, w vars
-            dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing='ij')
-            h_floor_grid, w_floor_grid = torch.meshgrid(h_floor,
-                                                        w_floor,
-                                                        indexing='ij')
-            h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil,
-                                                      w_ceil,
-                                                      indexing='ij')
-            h_floor_grid_idx = h_floor_grid * num_grid_per_side
-            h_ceil_grid_idx = h_ceil_grid * num_grid_per_side
+            dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing="ij")
+            h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing="ij")
+            h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing="ij")
 
             # original computation of weights
             # w00 = (1 - dh_grid) * (1 - dw_grid)
@@ -444,30 +493,25 @@ class Qwen3_VisionTransformer(nn.Module):
             w11 = dh_grid * dw_grid
             w10 = dh_grid - w11
             w01 = dw_grid - w11
-            w00 = 1 - dh_grid - dw_grid + w11
+            w00 = 1 - dh_grid - w01
 
-            idx00 = h_floor_grid_idx + w_floor_grid
-            idx01 = h_floor_grid_idx + w_ceil_grid
-            idx10 = h_ceil_grid_idx + w_floor_grid
-            idx11 = h_ceil_grid_idx + w_ceil_grid
+            h_grid = torch.stack([h_floor_grid, h_floor_grid, h_ceil_grid, h_ceil_grid])
+            w_grid = torch.stack([w_floor_grid, w_ceil_grid, w_floor_grid, w_ceil_grid])
+            h_grid_idx = h_grid * num_grid_per_side
 
-            indices = torch.stack([idx00, idx01, idx10, idx11],
-                                  dim=0).reshape(4, -1)
-            weights = torch.stack([w00, w01, w10, w11],
-                                  dim=0).reshape(4, -1, 1)
-            weights = weights.to(dtype=self.dtype, device=self.device)
+            indices = (h_grid_idx + w_grid).reshape(4, -1)
+            weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1)
+            weights = weights.to(dtype=self.dtype)
 
             embeds = self.pos_embed(indices)
-            weighted_embeds = embeds * weights
-            p0, p1, p2, p3 = weighted_embeds.unbind(dim=0)
-            combined = p0 + p1 + p2 + p3
-
-            combined = combined.view(h * w, hidden_dim)
-            repeated = combined.unsqueeze(0).expand(t, -1, -1).contiguous()
-            repeated = repeated.view(t, h // m_size, m_size, w // m_size,
-                                     m_size, hidden_dim)
-            repeated = repeated.permute(0, 1, 3, 2, 4,
-                                        5).reshape(-1, hidden_dim)
+            embeds *= weights
+            combined = embeds.sum(dim=0)
+
+            combined = combined.reshape(
+                h // m_size, m_size, w // m_size, m_size, hidden_dim
+            )
+            combined = combined.permute(0, 2, 1, 3, 4).reshape(1, -1, hidden_dim)
+            repeated = combined.expand(t, -1, -1).reshape(-1, hidden_dim)
             outputs.append(repeated)
 
         return torch.cat(outputs, dim=0)
@@ -475,64 +519,68 @@ class Qwen3_VisionTransformer(nn.Module):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
-        max_seqlen, seqlens = None, None
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        max_seqlen = torch.zeros([], device=cu_seqlens.device)
+        seqlens = torch.zeros(1, device=cu_seqlens.device)
+        if (
+            self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+            or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
+        ):
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        elif self.attn_backend == AttentionBackendEnum.XFORMERS:
+            seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
         return max_seqlen, seqlens
 
     def forward(
         self,
         x: torch.Tensor,
-        grid_thw: list[list[int]],
+        grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
-        hidden_states = x.to(device=self.device, dtype=self.dtype)
+        hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True)
         hidden_states = self.patch_embed(hidden_states)
 
-        pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
-        hidden_states = hidden_states + pos_embeds
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
 
-        grid_thw_tensor = torch.tensor(grid_thw,
-                                       device=self.device,
-                                       dtype=torch.int32)
+        pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list)
+        hidden_states = hidden_states + pos_embeds
+        rotary_pos_emb = self.rot_pos_emb(grid_thw_list)
+        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True)
 
         cu_seqlens = torch.repeat_interleave(
-            grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2],
-            grid_thw_tensor[:, 0]).cumsum(
-                dim=0,
-                dtype=grid_thw_tensor.dtype
-                if torch.jit.is_tracing() else torch.int32,
-            )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
 
         hidden_states = hidden_states.unsqueeze(1)
-        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device)
         max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
         deepstack_feature_lists = []
         for layer_num, blk in enumerate(self.blocks):
-            hidden_states = blk(hidden_states,
-                                cu_seqlens=cu_seqlens,
-                                rotary_pos_emb=rotary_pos_emb,
-                                max_seqlen=max_seqlen,
-                                seqlens=seqlens)
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb=rotary_pos_emb,
+                max_seqlen=max_seqlen,
+                seqlens=seqlens,
+            )
             if layer_num in self.deepstack_visual_indexes:
-                deepstack_merger_idx = self.deepstack_visual_indexes.index(
-                    layer_num)
-                deepstack_feature = self.deepstack_merger_list[
-                    deepstack_merger_idx](hidden_states)
+                deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
+                deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx](
+                    hidden_states
+                )
                 deepstack_feature_lists.append(deepstack_feature)
         hidden_states = self.merger(hidden_states)
         hidden_states = torch.cat(
-            [hidden_states] + deepstack_feature_lists,
-            dim=1)  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
+            [hidden_states] + deepstack_feature_lists, dim=1
+        )  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("attn.qkv.", "attn.q.", "q"),
@@ -543,7 +591,7 @@ class Qwen3_VisionTransformer(nn.Module):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -554,15 +602,13 @@ class Qwen3_VisionTransformer(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen3VLConfig)
 
@@ -576,8 +622,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
     def get_tokenizer(self):
         return self.ctx.tokenizer
 
-    def get_image_processor(self,
-                            **kwargs: object) -> Qwen2VLImageProcessorFast:
+    def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessorFast:
         return self.get_hf_processor(**kwargs).image_processor
 
     def get_video_processor(self, **kwargs: object) -> Qwen3VLVideoProcessor:
@@ -590,8 +635,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         image_height: int,
         num_frames: int = 2,
         do_resize: bool = True,
-        image_processor: Optional[Union[Qwen2VLImageProcessorFast,
-                                        Qwen3VLVideoProcessor]],
+        image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor | None,
     ) -> tuple[ImageSize, int]:
         if image_processor is None and num_frames > 1:
             image_processor = self.get_video_processor()
@@ -611,7 +655,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
                 smart_resize = video_smart_resize
                 extra_kwargs = {
                     "num_frames": num_frames,
-                    "temporal_factor": temporal_patch_size
+                    "temporal_factor": temporal_patch_size,
                 }
             else:
                 smart_resize = image_smart_resize
@@ -624,11 +668,9 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
                 max_pixels=image_processor.size["longest_edge"],
                 **extra_kwargs,
             )
-            preprocessed_size = ImageSize(width=resized_width,
-                                          height=resized_height)
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
-            preprocessed_size = ImageSize(width=image_width,
-                                          height=image_height)
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
 
         padded_num_frames = num_frames + num_frames % temporal_patch_size
 
@@ -641,11 +683,10 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
 
         return preprocessed_size, num_vision_tokens
 
-    def _get_max_video_frames(self,
-                              max_tokens: int,
-                              start_num_frames: int = 2) -> int:
-        return super()._get_max_video_frames(max_tokens,
-                                             start_num_frames=start_num_frames)
+    def _get_max_video_frames(self, max_tokens: int, start_num_frames: int = 2) -> int:
+        return super()._get_max_video_frames(
+            max_tokens, start_num_frames=start_num_frames
+        )
 
     def get_num_frames_with_most_features(
         self,
@@ -653,7 +694,8 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         mm_counts: Mapping[str, int],
     ) -> int:
         return super().get_num_frames_with_most_features(
-            seq_len, mm_counts, max_frames_per_video=_MAX_FRAMES_PER_VIDEO)
+            seq_len, mm_counts, max_frames_per_video=_MAX_FRAMES_PER_VIDEO
+        )
 
     def get_max_video_tokens(
         self,
@@ -664,8 +706,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         video_soft_tokens = self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_num_frames_with_most_features(
-                seq_len, mm_counts),
+            num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
             image_processor=None,
         )
 
@@ -674,25 +715,28 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         formatted_video_soft_tokens = video_soft_tokens * 12.5
         return int(formatted_video_soft_tokens)
 
-    def _calculate_timestamps(self, indices: list[int] | torch.Tensor,
-                              video_fps: float, merge_size: int):
+    def _calculate_timestamps(
+        self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int
+    ):
         if not isinstance(indices, list):
             indices = indices.tolist()
         if len(indices) % merge_size != 0:
             # don't update metadata's frames_indices directly
-            indices = indices + [indices[-1]
-                                 ] * (merge_size - len(indices) % merge_size)
+            indices = indices + [indices[-1]] * (merge_size - len(indices) % merge_size)
         timestamps = [idx / video_fps for idx in indices]
-        timestamps = [(timestamps[i] + timestamps[i + merge_size - 1]) / 2
-                      for i in range(0, len(timestamps), merge_size)]
+        timestamps = [
+            (timestamps[i] + timestamps[i + merge_size - 1]) / 2
+            for i in range(0, len(timestamps), merge_size)
+        ]
         return timestamps
 
     def _get_video_second_idx(
-            self,
-            metadata: dict[str, Any],
-            out_item: MultiModalKwargsItem,
-            do_sample_frames: Optional[bool] = None,
-            sampled_fps: Optional[float] = None) -> list[int]:
+        self,
+        metadata: dict[str, Any],
+        out_item: MultiModalKwargsItem,
+        do_sample_frames: bool | None = None,
+        sampled_fps: float | None = None,
+    ) -> list[int]:
         video_processor = self.get_video_processor()
         merge_size = video_processor.merge_size
         indices = metadata["frames_indices"]
@@ -708,20 +752,27 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         if do_sample_frames:
             # here video_fps is the fps of the sampled video, and
             # metadata["fps"] refers to the fps of the original video.
-            video_fps = sampled_fps if sampled_fps else video_processor.fps
+            sampled_fps = sampled_fps if sampled_fps else video_processor.fps
             total_num_frames = metadata["total_num_frames"]
-            num_frames = int(total_num_frames / metadata["fps"] * video_fps)
+            num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
             num_frames = min(
-                min(max(num_frames, video_processor.min_frames),
-                    video_processor.max_frames), total_num_frames)
-            indices = np.linspace(0, total_num_frames - 1,
-                                  num_frames).round().astype(int).tolist()
+                min(
+                    max(num_frames, video_processor.min_frames),
+                    video_processor.max_frames,
+                ),
+                total_num_frames,
+            )
+            indices = (
+                np.linspace(0, total_num_frames - 1, num_frames)
+                .round()
+                .astype(int)
+                .tolist()
+            )
         timestamps = self._calculate_timestamps(indices, video_fps, merge_size)
         return timestamps
 
 
 class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -735,29 +786,80 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
+        image_overrides = mm_options.get("image") if mm_options else None
+        video_overrides = mm_options.get("video") if mm_options else None
 
-        target_width, target_height = (
-            self.info.get_image_size_with_most_features())
+        target_width, target_height = self.info.get_image_size_with_most_features()
         target_num_frames = self.info.get_num_frames_with_most_features(
-            seq_len, mm_counts)
+            seq_len, mm_counts
+        )
+
+        if video_overrides:
+            assert isinstance(video_overrides, VideoDummyOptions)
+            num_frames_override = video_overrides.num_frames
+            if num_frames_override:
+                if num_frames_override > target_num_frames:
+                    logger.warning(
+                        "video.num_frames override (%d) exceeds model's "
+                        "maximum number of frames (%d), will be ignored",
+                        num_frames_override,
+                        target_num_frames,
+                    )
+                if num_frames_override < 2:
+                    logger.warning(
+                        "video.num_frames override (%d) cannot be less "
+                        "than 2, will be ignored",
+                        num_frames_override,
+                    )
+                target_num_frames = min(target_num_frames, num_frames_override)
+        target_num_frames = max(target_num_frames, 2)
+
         target_video_size, _ = self.info._get_vision_info(
             image_width=target_width,
             image_height=target_height,
             num_frames=target_num_frames,
             image_processor=self.info.get_video_processor(),
         )
+        # NOTE: we need to do this check here since Qwen3-VL resizes video
+        # frames depending on how many frames there are.
+        width, height = target_video_size.width, target_video_size.height
+        if video_overrides:
+            assert isinstance(video_overrides, VideoDummyOptions)
+            width_override = video_overrides.width
+            if width_override:
+                if width_override > width:
+                    logger.warning(
+                        "video.width override (%d) exceeds model's "
+                        "maximum width (%d), will be ignored",
+                        width_override,
+                        width,
+                    )
+                width = min(width, width_override)
+            height_override = video_overrides.height
+            if height_override:
+                if height_override > height:
+                    logger.warning(
+                        "video.height override (%d) exceeds model's "
+                        "maximum height (%d), will be ignored",
+                        height_override,
+                        height,
+                    )
+                height = min(height, height_override)
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
-                width=target_video_size.width,
-                height=target_video_size.height,
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+            "video": self._get_dummy_videos(
+                width=width,
+                height=height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
             ),
@@ -771,7 +873,6 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
         num_frames: int,
         num_videos: int,
     ) -> list[VideoItem]:
-        num_frames = max(num_frames, 2)
         video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
         video_items = []
         for i in range(num_videos):
@@ -787,22 +888,8 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
             video_items.append(video_item)
         return video_items
 
-    def get_dummy_processor_inputs(self, seq_len, mm_counts):
-        processor_inputs = super().get_dummy_processor_inputs(
-            seq_len, mm_counts)
-        # HACK(Isotr0py): We set do_resize to False here to reuse Qwen2-VL's
-        # profiling logic, which will be problematic for configurable mm
-        # profiling.
-        # TODO(Isotr0py): Switch to the implementation in
-        # https://github.com/vllm-project/vllm/pull/25557
-        # after supporting configurable mm profiling.
-        processor_inputs.hf_processor_mm_kwargs = {"do_resize": False}
-        return processor_inputs
-
-
-class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
-                                 ):
 
+class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         return MultiModalDataParser(video_needs_metadata=True)
 
@@ -817,13 +904,12 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
         processor = self.info.get_hf_processor(**mm_kwargs)
 
         # Separate video processing from image processing. Because the videos
-        # are processed into serval image patches
-        if ("videos" in mm_data and isinstance(mm_data["videos"], list)
-                and len(mm_data["videos"]) > 0):
+        # are processed into several image patches
+        if videos := mm_data.pop("videos", []):
             video_grid_thw_lst = []
             pixel_values_videos_lst = []
 
-            for item_idx, item in enumerate(mm_data.pop("videos", [])):
+            for item in videos:
                 video_array, metadata = item
 
                 # NOTE: @JJJYmmm new attr metadata.frames_indices indicates
@@ -838,12 +924,12 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
                     # qwen_vl_utils already has "do_sample_frames" in
                     # mm_kwargs, don't overwrite it.
                     video_mm_kwargs["do_sample_frames"] = metadata.get(
-                        "do_sample_frames", False)
+                        "do_sample_frames", False
+                    )
 
-                metadata = VideoMetadata(**{
-                    k: metadata[k]
-                    for k in metadata if k != "do_sample_frames"
-                })
+                metadata = VideoMetadata(
+                    **{k: metadata[k] for k in metadata if k != "do_sample_frames"}
+                )
 
                 video_mm_data = dict()
                 video_mm_data["videos"] = [[video_array]]
@@ -856,8 +942,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
                     tok_kwargs=tok_kwargs,
                 )
                 input_ids = video_outputs.pop("input_ids")
-                video_placeholder = processor.tokenizer.batch_decode(
-                    input_ids)[0]
+                video_placeholder = processor.tokenizer.batch_decode(input_ids)[0]
                 prompt = prompt.replace(
                     "<|vision_start|><|video_pad|><|vision_end|>",
                     video_placeholder,
@@ -865,8 +950,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
                 )
 
                 video_grid_thw_lst.append(video_outputs["video_grid_thw"])
-                pixel_values_videos_lst.append(
-                    video_outputs["pixel_values_videos"])
+                pixel_values_videos_lst.append(video_outputs["pixel_values_videos"])
             video_outputs = dict(
                 pixel_values_videos=torch.cat(pixel_values_videos_lst),
                 video_grid_thw=torch.cat(video_grid_thw_lst),
@@ -899,14 +983,18 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
 
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_grid_sizes),
+                "image", image_grid_sizes
+            ),
             image_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_grid_sizes),
+                "image", image_grid_sizes
+            ),
             image_grid_thw=MultiModalFieldConfig.batched("image"),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes),
+                "video", video_grid_sizes
+            ),
             video_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes),
+                "video", video_grid_sizes
+            ),
             video_grid_thw=MultiModalFieldConfig.batched("video"),
         )
 
@@ -917,8 +1005,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self.info.get_image_processor(
-            **hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
         hf_config = self.info.get_hf_config()
 
@@ -947,26 +1034,28 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
             if is_list_of(sampled_fps, float):
                 sampled_fps = sampled_fps[item_idx]
             timestamps = self.info._get_video_second_idx(
-                metadata, out_item, do_sample_frames, sampled_fps)
+                metadata, out_item, do_sample_frames, sampled_fps
+            )
 
             assert len(timestamps) == grid_thw[0], (
                 f"The timestamps length({len(timestamps)}) should be equal "
-                f"video length ({grid_thw[0]}).")
+                f"video length ({grid_thw[0]})."
+            )
 
             frames_idx_token = [
-                tokenizer.encode(f"<{curr_time:.1f} seconds>",
-                                 add_special_tokens=False)
+                tokenizer.encode(f"<{curr_time:.1f} seconds>", add_special_tokens=False)
                 for curr_time in timestamps
             ]
             num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length
             placeholder = []
             for frame_idx in frames_idx_token:
                 placeholder.extend(frame_idx)
-                placeholder.extend([vision_start_token_id] +
-                                   [video_token_id] * num_tokens_per_frame +
-                                   [vision_end_token_id])
-            return PromptUpdateDetails.select_token_id(placeholder,
-                                                       video_token_id)
+                placeholder.extend(
+                    [vision_start_token_id]
+                    + [video_token_id] * num_tokens_per_frame
+                    + [vision_end_token_id]
+                )
+            return PromptUpdateDetails.select_token_id(placeholder, video_token_id)
 
         return [
             PromptReplacement(
@@ -974,7 +1063,6 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
                 target=hf_processor.image_token,
                 replacement=get_image_replacement_qwen3vl,
             ),
-
             # NOTE: We match string on purpose since searching sequence of
             # token ids takes more time.
             PromptReplacement(
@@ -994,98 +1082,108 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
         # the same shape as input_embeds
-        "deepstack_input_embeds": 0
-    })
+        "deepstack_input_embeds": 0,
+    }
+)
 class Qwen3LLMModel(Qwen3Model):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         if not get_pp_group().is_first_rank:
             assert self.start_layer >= len(
-                vllm_config.model_config.hf_config.vision_config.
-                deepstack_visual_indexes), (
-                    "start_layer should be greater than or equal to "
-                    "len(deepstack_visual_indexes)")
+                vllm_config.model_config.hf_config.vision_config.deepstack_visual_indexes
+            ), (
+                "start_layer should be greater than or equal to "
+                "len(deepstack_visual_indexes)"
+            )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         # args for deepstack
-        deepstack_input_embeds: Optional[IntermediateTensors] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        deepstack_input_embeds: IntermediateTensors | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer_idx, layer in enumerate(
-                self.layers[self.start_layer:self.end_layer]):
-            layer_idx = layer_idx + self.start_layer
-
+        for layer_idx, layer in islice(
+            enumerate(self.layers), self.start_layer, self.end_layer
+        ):
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
                 residual,
             )
 
-            if deepstack_input_embeds is not None and \
-                    layer_idx in range(0, len(deepstack_input_embeds)):
-                hidden_states = hidden_states + deepstack_input_embeds[
-                    f"deepstack_input_embeds_{layer_idx}"]
+            if deepstack_input_embeds is not None and layer_idx in range(
+                0, len(deepstack_input_embeds)
+            ):
+                hidden_states = (
+                    hidden_states
+                    + deepstack_input_embeds[f"deepstack_input_embeds_{layer_idx}"]
+                )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
 
 class Qwen3LLMForCausalLM(Qwen3ForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super(Qwen3ForCausalLM, self).__init__()
         config = vllm_config.model_config.hf_config.text_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = Qwen3LLMModel(vllm_config=vllm_config, prefix=prefix)
+        self.model = Qwen3LLMModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
             else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix="lm_head")
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix="lm_head",
+                )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+
 
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen3VLMultiModalProcessor,
+    info=Qwen3VLProcessingInfo,
+    dummy_inputs=Qwen3VLDummyInputsBuilder,
+)
+class Qwen3VLForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
+    merge_by_field_config = True
+    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
-                                        info=Qwen3VLProcessingInfo,
-                                        dummy_inputs=Qwen3VLDummyInputsBuilder)
-class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -1106,10 +1204,11 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.visual.": "visual.",
             "lm_head.": "language_model.lm_head.",
             "model.language_model.": "language_model.model.",
-        })
+        }
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1126,37 +1225,46 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.config = config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
-        if not multimodal_config.get_limit_per_prompt("image") and \
-            not multimodal_config.get_limit_per_prompt("video"):
+        if not multimodal_config.get_limit_per_prompt(
+            "image"
+        ) and not multimodal_config.get_limit_per_prompt("video"):
             self.visual = None
         else:
+            attn_backend_override = (
+                multimodal_config.mm_encoder_attn_backend
+                if multimodal_config is not None
+                else None
+            )
             self.visual = Qwen3_VisionTransformer(
                 config.vision_config,
                 norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "visual"),
                 use_data_parallel=self.use_data_parallel,
+                attn_backend_override=attn_backend_override,
             )
 
-        self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config,
-                                                  prefix=maybe_prefix(
-                                                      prefix,
-                                                      "language_model"))
+        self.language_model = Qwen3LLMForCausalLM(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model")
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
-        self.use_deepstack = hasattr(config.vision_config,
-                                     'deepstack_visual_indexes')
-        self.deepstack_num_level = len(
-            config.vision_config.deepstack_visual_indexes
-        ) if self.use_deepstack else 0
+        self.use_deepstack = hasattr(config.vision_config, "deepstack_visual_indexes")
+        self.deepstack_num_level = (
+            len(config.vision_config.deepstack_visual_indexes)
+            if self.use_deepstack
+            else 0
+        )
         # register buffer for deepstack
         if self.use_deepstack and self.visual is not None:
             self.deepstack_input_embeds = [
                 torch.zeros(
                     vllm_config.scheduler_config.max_num_batched_tokens,
-                    config.text_config.hidden_size)
+                    config.text_config.hidden_size,
+                )
                 for _ in range(self.deepstack_num_level)
             ]
         else:
@@ -1164,30 +1272,34 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.visual_dim = config.vision_config.out_hidden_size
         self.multiscale_dim = self.visual_dim * self.deepstack_num_level
 
-    def _get_deepstack_input_embeds(self,
-                                    num_tokens: int) -> IntermediateTensors:
+    def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors:
         # get deepstack_input_embeds from buffer, and clear the buffer
-        return IntermediateTensors({
-            f"deepstack_input_embeds_{idx}":
-            self.deepstack_input_embeds[idx][:num_tokens]
-            for idx in range(self.deepstack_num_level)
-        })
-
-    def _set_deepstack_input_embeds(
-            self, deepstack_input_embeds: torch.Tensor) -> None:
+        return IntermediateTensors(
+            {
+                f"deepstack_input_embeds_{idx}": self.deepstack_input_embeds[idx][
+                    :num_tokens
+                ]
+                for idx in range(self.deepstack_num_level)
+            }
+        )
+
+    def _set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> None:
         # set deepstack_input_embeds to buffer
         num_tokens = deepstack_input_embeds.size(1)
         if num_tokens > self.deepstack_input_embeds[0].size(0):
             self.deepstack_input_embeds = [
-                torch.zeros(num_tokens,
-                            self.config.text_config.hidden_size,
-                            device=self.deepstack_input_embeds[0].device,
-                            dtype=self.deepstack_input_embeds[0].dtype)
+                torch.zeros(
+                    num_tokens,
+                    self.config.text_config.hidden_size,
+                    device=self.deepstack_input_embeds[0].device,
+                    dtype=self.deepstack_input_embeds[0].dtype,
+                )
                 for _ in range(self.deepstack_num_level)
             ]
         for idx in range(self.deepstack_num_level):
             self.deepstack_input_embeds[idx][:num_tokens].copy_(
-                deepstack_input_embeds[idx])
+                deepstack_input_embeds[idx]
+            )
 
     def _clear_deepstack_input_embeds(self, num_tokens: int) -> None:
         # clear deepstack_input_embeds in buffer
@@ -1195,24 +1307,9 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             for idx in range(self.deepstack_num_level):
                 self.deepstack_input_embeds[idx][:num_tokens].zero_()
 
-    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return torch.concat(list(mm_input))
-        else:
-            return torch.concat(mm_input)
-
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]:
+        self, **kwargs: object
+    ) -> Qwen2_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1221,35 +1318,22 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            return Qwen2_5_VLImagePixelInputs(type="pixel_values",
-                                              pixel_values=pixel_values,
-                                              image_grid_thw=image_grid_thw)
+            return Qwen2_5_VLImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
 
         if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
             return Qwen2_5_VLImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
-                image_grid_thw=image_grid_thw)
+                image_grid_thw=image_grid_thw,
+            )
 
     def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]:
+        self, **kwargs: object
+    ) -> Qwen2_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1259,11 +1343,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos, "video pixel values")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
             return Qwen2_5_VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -1272,97 +1351,136 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             )
 
         if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
-            if not isinstance(video_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of video embeddings. "
-                                 f"Got type: {type(video_embeds)}")
             return Qwen2_5_VLVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,
-                video_grid_thw=video_grid_thw)
+                video_grid_thw=video_grid_thw,
+            )
 
     def _process_image_input(
-            self,
-            image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
-
+        self, image_input: Qwen2_5_VLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
+                )
             else:
-                image_embeds = self.visual(pixel_values,
-                                           grid_thw=grid_thw_list)
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each image item.
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _process_video_input(
-            self,
-            video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
-
+        self, video_input: Qwen2_5_VLVideoInputs
+    ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"].type(
-                self.visual.dtype)
+                self.visual.dtype
+            )
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values_videos,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
+                grid_thw_list = grid_thw.tolist()
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
+                )
             else:
-                video_embeds = self.visual(pixel_values_videos,
-                                           grid_thw=grid_thw_list)
+                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each video item.
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
         merge_size = self.visual.spatial_merge_size
-        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
-                 (merge_size * merge_size)).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         mm_input_by_modality = {}
         for input_key in kwargs:
-            if input_key in ("pixel_values", "image_embeds"
-                             ) and "image" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "image"] = self._parse_and_validate_image_input(**kwargs)
-            if input_key in ("pixel_values_videos", "video_embeds"
-                             ) and "video" not in mm_input_by_modality:
-                mm_input_by_modality[
-                    "video"] = self._parse_and_validate_video_input(**kwargs)
+            if (
+                input_key in ("pixel_values", "image_embeds")
+                and "image" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
+                    **kwargs
+                )
+            if (
+                input_key in ("pixel_values_videos", "video_embeds")
+                and "video" not in mm_input_by_modality
+            ):
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
+                    **kwargs
+                )
         return mm_input_by_modality
 
+    def iter_mm_grid_hw(
+        self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec]
+    ) -> Iterator[tuple[int, int, int]]:
+        video_token_id = self.config.video_token_id
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset):
+            offset = mm_feature.mm_position.offset
+            if mm_feature.modality == "image":
+                t, h, w = mm_feature.data["image_grid_thw"].data.tolist()
+                assert t == 1, f"Image must have 1 frame, got {t}"
+                yield offset, h // spatial_merge_size, w // spatial_merge_size
+            elif mm_feature.modality == "video":
+                t, h, w = mm_feature.data["video_grid_thw"].data.tolist()
+                llm_grid_h = h // spatial_merge_size
+                llm_grid_w = w // spatial_merge_size
+                for _ in range(t):
+                    offset = input_tokens.index(video_token_id, offset)
+                    yield offset, llm_grid_h, llm_grid_w
+                    offset += llm_grid_h * llm_grid_w
+            else:
+                raise ValueError(f"Unsupported modality: {mm_feature.modality}")
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        llm_pos_ids_list = []
+        st = 0
+        for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw(
+            input_tokens, mm_features
+        ):
+            text_len = offset - st
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
+            )
+
+            grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1)
+            llm_pos_ids_list.append(grid_indices + text_len + st_idx)
+            st = offset + llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
+            )
+
+        llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return torch.from_numpy(llm_positions), mrope_position_delta
+
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
-        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
-            **kwargs)
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return None
 
@@ -1375,150 +1493,110 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
     def _compute_deepstack_embeds(
-            self, input_ids: torch.Tensor, inputs_embeds: torch.Tensor,
-            multimodal_embeddings: MultiModalEmbeddings) -> torch.Tensor:
-        visual_lens = [
-            x.shape[0] if isinstance(x, torch.Tensor) else len(x)
-            for x in multimodal_embeddings
-        ]
+        self,
+        inputs_embeds: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings,
+        is_multimodal: torch.Tensor,
+    ) -> tuple[torch.Tensor, MultiModalEmbeddings]:
+        visual_lens = [len(x) for x in multimodal_embeddings]
         multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0)
 
-        multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split(  # noqa:E501
-            multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim],
-            dim=-1)
+        (
+            multimodal_embeddings_main,
+            multimodal_embeddings_multiscale,
+        ) = torch.split(
+            multimodal_embeddings_cat,
+            [self.visual_dim, self.multiscale_dim],
+            dim=-1,
+        )
 
-        multimodal_embeddings = torch.split(multimodal_embeddings_main,
-                                            visual_lens,
-                                            dim=0)
+        multimodal_embeddings = torch.split(
+            multimodal_embeddings_main, visual_lens, dim=0
+        )
         multimodal_embeddings_multiscale = torch.split(
-            multimodal_embeddings_multiscale, visual_lens, dim=0)
+            multimodal_embeddings_multiscale, visual_lens, dim=0
+        )
 
         deepstack_input_embeds = inputs_embeds.new_zeros(
-            inputs_embeds.size(0),
-            self.deepstack_num_level * inputs_embeds.size(1))
+            inputs_embeds.size(0), self.deepstack_num_level * inputs_embeds.size(1)
+        )
 
-        deepstack_input_embeds = merge_multimodal_embeddings(
-            input_ids,
-            deepstack_input_embeds,
-            multimodal_embeddings_multiscale,
-            placeholder_token_id=[
-                self.config.image_token_id, self.config.video_token_id
-            ],
+        deepstack_input_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=deepstack_input_embeds,
+            multimodal_embeddings=multimodal_embeddings_multiscale,
+            is_multimodal=is_multimodal,
         )
         deepstack_input_embeds = deepstack_input_embeds.view(
-            inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim)
+            inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim
+        )
         deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2)
+
         return deepstack_input_embeds, multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        deepstack_input_embeds = None
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            if self.use_deepstack:
-                deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds(  # noqa:E501
-                    input_ids, inputs_embeds, multimodal_embeddings)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [self.config.image_token_id, self.config.video_token_id])
-
-        if self.use_deepstack:
-            if deepstack_input_embeds is None:
-                deepstack_input_embeds = torch.zeros_like(
-                    inputs_embeds).unsqueeze(0).repeat(
-                        self.deepstack_num_level, 1, 1).contiguous()
-            self._set_deepstack_input_embeds(deepstack_input_embeds)
+        inputs_embeds = self._embed_text_input_ids(
+            input_ids,
+            self.language_model.embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
-        return inputs_embeds
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
 
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Qwen2_5_VLImageInputs] = None,
-        video_input: Optional[Qwen2_5_VLVideoInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
+        if is_multimodal is None:
+            raise ValueError(
+                "`embed_input_ids` now requires `is_multimodal` arg, "
+                "please update your model runner according to "
+                "https://github.com/vllm-project/vllm/pull/16229."
+            )
 
         if self.use_deepstack:
-            visual_dim = inputs_embeds.shape[-1]
-            deepstack_input_embeds = None
-            if image_input is not None or video_input is not None:
-                deepstack_input_embeds = torch.zeros_like(
-                    inputs_embeds).unsqueeze(1).repeat(
-                        1, self.deepstack_num_level, 1).flatten(1)
-
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            if self.use_deepstack:
-                image_embeds = torch.cat(image_embeds)
-
-                image_embeds, image_embeds_multiscale = image_embeds.split(
-                    [visual_dim, visual_dim * self.deepstack_num_level],
-                    dim=-1)
-
-                deepstack_input_embeds = merge_multimodal_embeddings(
-                    input_ids,
-                    deepstack_input_embeds,
-                    image_embeds_multiscale,
-                    placeholder_token_id=self.config.image_token_id,
-                )
-
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
+            (
+                deepstack_input_embeds,
+                multimodal_embeddings,
+            ) = self._compute_deepstack_embeds(
+                inputs_embeds=inputs_embeds,
+                multimodal_embeddings=multimodal_embeddings,
+                is_multimodal=is_multimodal,
             )
+        else:
+            deepstack_input_embeds = None
 
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            if self.use_deepstack:
-                video_embeds = torch.cat(video_embeds)
-
-                video_embeds, video_embeds_multiscale = video_embeds.split(
-                    [visual_dim, visual_dim * self.deepstack_num_level],
-                    dim=-1)
-
-                deepstack_input_embeds = merge_multimodal_embeddings(
-                    input_ids,
-                    deepstack_input_embeds,
-                    video_embeds_multiscale,
-                    placeholder_token_id=self.config.video_token_id,
-                )
-
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
+        inputs_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
 
-        if self.use_deepstack and deepstack_input_embeds is not None:
-            deepstack_input_embeds = deepstack_input_embeds.view(
-                inputs_embeds.shape[0], self.deepstack_num_level,
-                visual_dim).permute(1, 0, 2).contiguous()
+        if deepstack_input_embeds is not None:
             self._set_deepstack_input_embeds(deepstack_input_embeds)
+
         return inputs_embeds
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Qwen3VL.
 
         Args:
@@ -1546,30 +1624,14 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-                input_ids = None
-
-        if self.use_deepstack and inputs_embeds is not None and get_pp_group(
-        ).is_first_rank:
+        if (
+            self.use_deepstack
+            and inputs_embeds is not None
+            and get_pp_group().is_first_rank
+        ):
             deepstack_input_embeds = self._get_deepstack_input_embeds(
-                inputs_embeds.size(0))
+                inputs_embeds.size(0)
+            )
         else:
             deepstack_input_embeds = None
 
@@ -1590,12 +1652,10 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.visual is None:
             skip_prefixes.extend(["visual."])
@@ -1608,6 +1668,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         """
         return MultiModelKeys.from_string_field(
             language_model="language_model",
-            connector="model.visual.merger",
-            tower_model="model.visual.",
+            connector="visual.merger",
+            tower_model="visual.",
         )
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index 1ed053eb2e96cddb4f14890aab6cda7862cc9e22..5c3205faf9c2f26711a8d369a90df04ba280bcd4 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -23,13 +23,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen3-VL-MoE model compatible with HuggingFace weights."""
+
 import typing
-from collections.abc import Iterable
-from typing import Callable, Optional, Union
+from collections.abc import Callable, Iterable
+from itertools import islice
 
 import torch
-from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import (
-    Qwen3VLMoeConfig)
+from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
@@ -38,21 +38,26 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
 
 from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel
-from .qwen3_vl import (Qwen3_VisionTransformer, Qwen3VLDummyInputsBuilder,
-                       Qwen3VLForConditionalGeneration,
-                       Qwen3VLMultiModalProcessor, Qwen3VLProcessingInfo)
+from .qwen3_vl import (
+    Qwen3_VisionTransformer,
+    Qwen3VLDummyInputsBuilder,
+    Qwen3VLForConditionalGeneration,
+    Qwen3VLMultiModalProcessor,
+    Qwen3VLProcessingInfo,
+)
 from .utils import is_pp_missing_parameter, maybe_prefix
 
 logger = init_logger(__name__)
 
 
 class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen3VLMoeConfig)
 
@@ -66,81 +71,89 @@ class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo):
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
         # the same shape as input_embeds
-        "deepstack_input_embeds": 0
-    })
+        "deepstack_input_embeds": 0,
+    }
+)
 class Qwen3MoeLLMModel(Qwen3MoeModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         if not get_pp_group().is_first_rank:
             assert self.start_layer >= len(
-                vllm_config.model_config.hf_config.vision_config.
-                deepstack_visual_indexes), (
-                    "start_layer should be greater than or equal to "
-                    "len(deepstack_visual_indexes)")
+                vllm_config.model_config.hf_config.vision_config.deepstack_visual_indexes
+            ), (
+                "start_layer should be greater than or equal to "
+                "len(deepstack_visual_indexes)"
+            )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        deepstack_input_embeds: Optional[IntermediateTensors] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        deepstack_input_embeds: IntermediateTensors | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer_idx, layer in enumerate(
-                self.layers[self.start_layer:self.end_layer]):
-            layer_idx = layer_idx + self.start_layer
-
+        for layer_idx, layer in islice(
+            enumerate(self.layers), self.start_layer, self.end_layer
+        ):
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
                 residual,
             )
 
-            if deepstack_input_embeds is not None and \
-                    layer_idx in range(0, len(deepstack_input_embeds)):
-                hidden_states = hidden_states + deepstack_input_embeds[
-                    f"deepstack_input_embeds_{layer_idx}"]
+            if deepstack_input_embeds is not None and layer_idx in range(
+                0, len(deepstack_input_embeds)
+            ):
+                hidden_states = (
+                    hidden_states
+                    + deepstack_input_embeds[f"deepstack_input_embeds_{layer_idx}"]
+                )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_fused_expert_weights(self, name: str, params_dict: dict,
-                                  loaded_weight: torch.Tensor, shard_id: str,
-                                  num_experts: int) -> bool:
+    def load_fused_expert_weights(
+        self,
+        name: str,
+        params_dict: dict,
+        loaded_weight: torch.Tensor,
+        shard_id: str,
+        num_experts: int,
+    ) -> bool:
         param = params_dict[name]
         weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
         loaded_local_expert = False
         for expert_id in range(num_experts):
             curr_expert_weight = loaded_weight[expert_id]
-            success = weight_loader(param,
-                                    curr_expert_weight,
-                                    name,
-                                    shard_id,
-                                    expert_id,
-                                    return_success=True)
+            success = weight_loader(
+                param,
+                curr_expert_weight,
+                name,
+                shard_id,
+                expert_id,
+                return_success=True,
+            )
             if success:
                 loaded_local_expert = True
 
         return loaded_local_expert
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -150,9 +163,18 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
             ("gate_up_proj", "up_proj", 1),
         ]
         # Skip loading extra parameters for GPTQ/modelopt models.
-        ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale",
-                           ".v_scale", "_v_scale", ".weight_scale",
-                           "_weight_scale", ".input_scale", "_input_scale")
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
@@ -163,9 +185,8 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
         ]
         num_experts = self.config.num_experts
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if ("experts.gate_up_proj" in name
-                        or "experts.down_proj" in name):
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
                     is_fused_expert = True
                     expert_params_mapping = fused_expert_params_mapping
 
@@ -195,8 +216,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                 if name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 if weight_loader == default_weight_loader:
                     weight_loader(param, loaded_weight)
                 else:
@@ -215,40 +235,55 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                     if is_pp_missing_parameter(name_mapped, self):
                         continue
                     if is_fused_expert:
-                        loaded_weight = loaded_weight.transpose(-1,
-                                                                -2)  # no bias
+                        loaded_weight = loaded_weight.transpose(-1, -2)  # no bias
                         if "experts.gate_up_proj" in name:
                             loaded_weight = loaded_weight.chunk(2, dim=-2)
                             success_w1 = self.load_fused_expert_weights(
-                                name_mapped, params_dict, loaded_weight[0],
-                                "w1", num_experts)
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[0],
+                                "w1",
+                                num_experts,
+                            )
                             success_w3 = self.load_fused_expert_weights(
-                                name_mapped, params_dict, loaded_weight[1],
-                                "w3", num_experts)
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[1],
+                                "w3",
+                                num_experts,
+                            )
                             success = success_w1 and success_w3
                         else:
                             # down_proj
                             success = self.load_fused_expert_weights(
-                                name_mapped, params_dict, loaded_weight,
-                                shard_id, num_experts)
+                                name_mapped,
+                                params_dict,
+                                loaded_weight,
+                                shard_id,
+                                num_experts,
+                            )
                     else:
                         # Skip loading extra parameters for GPTQ/modelopt models
-                        if name_mapped.endswith(
-                                ignore_suffixes
-                        ) and name_mapped not in params_dict:
+                        if (
+                            name_mapped.endswith(ignore_suffixes)
+                            and name_mapped not in params_dict
+                        ):
                             continue
                         param = params_dict[name_mapped]
                         # We should ask the weight loader to return success or
                         # not here since otherwise we may skip experts with
                         # other available replicas.
-                        weight_loader = typing.cast(Callable[..., bool],
-                                                    param.weight_loader)
-                        success = weight_loader(param,
-                                                loaded_weight,
-                                                name_mapped,
-                                                shard_id=shard_id,
-                                                expert_id=expert_id,
-                                                return_success=True)
+                        weight_loader = typing.cast(
+                            Callable[..., bool], param.weight_loader
+                        )
+                        success = weight_loader(
+                            param,
+                            loaded_weight,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                            return_success=True,
+                        )
                     if success:
                         name = name_mapped
                         break
@@ -259,8 +294,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                         # So we simply skip it
                         continue
                     # Skip loading extra parameters for GPTQ/modelopt models.
-                    if name.endswith(
-                            ignore_suffixes) and name not in params_dict:
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
@@ -268,7 +302,8 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                     # Remapping the name of FP8 kv-scale.
                     if name.endswith("kv_scale"):
                         remapped_kv_scale_name = name.replace(
-                            ".kv_scale", ".attn.kv_scale")
+                            ".kv_scale", ".attn.kv_scale"
+                        )
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
                                 "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
@@ -279,35 +314,49 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                         else:
                             name = remapped_kv_scale_name
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super(Qwen3MoeForCausalLM, self).__init__()
         self.config = vllm_config.model_config.hf_config.text_config
         self.quant_config = vllm_config.quant_config
-        self.model = Qwen3MoeLLMModel(vllm_config=vllm_config,
-                                      prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                      self.config.hidden_size,
-                                      quant_config=self.quant_config)
+        self.model = Qwen3MoeLLMModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(self.config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
-                                        info=Qwen3VLMoeProcessingInfo,
-                                        dummy_inputs=Qwen3VLDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen3VLMultiModalProcessor,
+    info=Qwen3VLMoeProcessingInfo,
+    dummy_inputs=Qwen3VLDummyInputsBuilder,
+)
 class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super(Qwen3VLForConditionalGeneration, self).__init__()
@@ -319,8 +368,9 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
 
-        if not multimodal_config.get_limit_per_prompt("image") and \
-            not multimodal_config.get_limit_per_prompt("video"):
+        if not multimodal_config.get_limit_per_prompt(
+            "image"
+        ) and not multimodal_config.get_limit_per_prompt("video"):
             self.visual = None
         else:
             self.visual = Qwen3_VisionTransformer(
@@ -331,25 +381,32 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
                 use_data_parallel=self.use_data_parallel,
             )
 
-        self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config,
-                                                     prefix=maybe_prefix(
-                                                         prefix,
-                                                         "language_model"))
+        self.language_model = Qwen3MoeLLMForCausalLM(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model")
+        )
+        # Whether to include the gate_up_proj mapping is determined by
+        # the language model.
+        self.packed_modules_mapping = (
+            self.packed_modules_mapping | self.language_model.packed_modules_mapping
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
-        self.use_deepstack = hasattr(config.vision_config,
-                                     'deepstack_visual_indexes')
-        self.deepstack_num_level = len(
-            config.vision_config.deepstack_visual_indexes
-        ) if self.use_deepstack else 0
+        self.use_deepstack = hasattr(config.vision_config, "deepstack_visual_indexes")
+        self.deepstack_num_level = (
+            len(config.vision_config.deepstack_visual_indexes)
+            if self.use_deepstack
+            else 0
+        )
         # register buffer for deepstack
         if self.use_deepstack and self.visual is not None:
             self.deepstack_input_embeds = [
                 torch.zeros(
                     vllm_config.scheduler_config.max_num_batched_tokens,
-                    config.text_config.hidden_size)
+                    config.text_config.hidden_size,
+                )
                 for _ in range(self.deepstack_num_level)
             ]
         else:
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 90200f319464b606ef8abeec62a54e1e52aac555..6a259cade9cf17d980978bd9313727746e372f33 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -9,43 +9,55 @@
 import copy
 import math
 import unicodedata
-from collections.abc import Collection, Mapping, Sequence, Set
+from collections.abc import Callable, Collection, Mapping, Sequence, Set
 from functools import lru_cache, partial
-from typing import Annotated, Callable, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import regex as re
 import torch
 from torch import nn
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
-                          TensorType)
+from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from .qwen import QWenBaseModel, QWenModel
-from .utils import flatten_bn, merge_multimodal_embeddings
 
 
 class QwenImagePixelInputs(TensorSchema):
@@ -55,11 +67,12 @@ class QwenImagePixelInputs(TensorSchema):
         - c: Number of channels (3)
         - h: Height
         - w: Width
-    
+
     Note that image_size is the value in the vision config to which we resize
     the image to in the normalization transform. Currently multi-image support
     can only be leveraged by passing image embeddings directly.
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
@@ -70,15 +83,16 @@ class QwenImageEmbeddingInputs(TensorSchema):
         - bn: Batch size * number of images
         - ifs: Image feature size (256)
         - hs: Hidden size
-    
+
     `hidden_size` must match the hidden size of the language model backbone
     and is stored in the visual config of the model if we have one.
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")]
 
 
-QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs]
+QwenImageInputs: TypeAlias = QwenImagePixelInputs | QwenImageEmbeddingInputs
 
 
 class VisualAttention(nn.Module):
@@ -92,15 +106,14 @@ class VisualAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
-        kdim: Optional[int] = None,
-        vdim: Optional[int] = None,
+        kdim: int | None = None,
+        vdim: int | None = None,
     ):
         super().__init__()
         self.embed_dim = embed_dim
         self.kdim = kdim if kdim is not None else embed_dim
         self.vdim = vdim if vdim is not None else embed_dim
-        self._qkv_same_embed_dim = self.kdim == embed_dim \
-            and self.vdim == embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
 
         self.num_heads = num_heads
 
@@ -111,8 +124,9 @@ class VisualAttention(nn.Module):
         self.hidden_size_per_partition = embed_dim
 
         # Strided linear layer.
-        assert self._qkv_same_embed_dim, \
-                'Visual Attention implementation only supports self-attention'
+        assert self._qkv_same_embed_dim, (
+            "Visual Attention implementation only supports self-attention"
+        )
         self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim)
         self.out_proj = ReplicatedLinear(embed_dim, embed_dim)
         self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
@@ -120,57 +134,70 @@ class VisualAttention(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # query/key/value: [sq, b, h]
         sq, b, _ = x.size()
         mixed_x_layer, _ = self.in_proj(x)
 
         # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
-        new_tensor_shape = mixed_x_layer.size()[:-1] + \
-            (self.num_attention_heads_per_partition,
-             3 * self.hidden_size_per_attention_head)
+        new_tensor_shape = mixed_x_layer.size()[:-1] + (
+            self.num_attention_heads_per_partition,
+            3 * self.hidden_size_per_attention_head,
+        )
         mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
 
         # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
         query_layer, key_layer, value_layer = mixed_x_layer.split(
-            self.hidden_size_per_attention_head, dim=-1)
+            self.hidden_size_per_attention_head, dim=-1
+        )
 
         # [sq, b, np, hn] -> [sq, b * np, hn]
         query_layer = query_layer.view(
-            sq, b * self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head).transpose(0, 1)
+            sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        ).transpose(0, 1)
         # [sk, b, np, hn] -> [sk, b * np, hn]
         key_layer = key_layer.view(
-            sq, b * self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head).transpose(0, 1)
+            sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        ).transpose(0, 1)
 
         q_scaled = query_layer / self.norm_factor
         if attn_mask is not None:
-            attention_probs = torch.baddbmm(attn_mask, q_scaled,
-                                            key_layer.transpose(-2, -1))
+            attention_probs = torch.baddbmm(
+                attn_mask, q_scaled, key_layer.transpose(-2, -1)
+            )
         else:
             attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
         attention_probs = attention_probs.softmax(dim=-1)
 
         value_layer = value_layer.view(
-            sq, b * self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head).transpose(0, 1)
+            sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        ).transpose(0, 1)
 
         # matmul: [b * np, sq, hn]
         context_layer = torch.bmm(attention_probs, value_layer)
 
         # change view [b, np, sq, hn]
         context_layer = context_layer.view(
-            b, self.num_attention_heads_per_partition, sq,
-            self.hidden_size_per_attention_head)
+            b,
+            self.num_attention_heads_per_partition,
+            sq,
+            self.hidden_size_per_attention_head,
+        )
 
         # [b, np, sq, hn] --> [sq, b, np, hn]
         context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
 
         # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.hidden_size_per_partition,
+        )
         context_layer = context_layer.view(*new_context_layer_shape)
 
         output, _ = self.out_proj(context_layer)
@@ -185,13 +212,12 @@ class QwenVLMLP(nn.Module):
         self,
         hidden_size: int,
         intermediate_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
-        self.c_fc = ColumnParallelLinear(hidden_size,
-                                         intermediate_size,
-                                         bias=True,
-                                         quant_config=quant_config)
+        self.c_fc = ColumnParallelLinear(
+            hidden_size, intermediate_size, bias=True, quant_config=quant_config
+        )
         self.act_fn = get_act_fn("gelu")
         self.c_proj = RowParallelLinear(
             intermediate_size,
@@ -208,14 +234,13 @@ class QwenVLMLP(nn.Module):
 
 
 class VisualAttentionBlock(nn.Module):
-
     def __init__(
         self,
         d_model: int,
         n_head: int,
         mlp_ratio: float = 4.0,
         norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
 
@@ -232,7 +257,7 @@ class VisualAttentionBlock(nn.Module):
     def attention(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None
         return self.attn(x, attn_mask=attn_mask)
@@ -240,7 +265,7 @@ class VisualAttentionBlock(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         x = x + self.attention(self.ln_1(x), attn_mask=attn_mask)
         x = x + self.mlp(self.ln_2(x))
@@ -248,7 +273,6 @@ class VisualAttentionBlock(nn.Module):
 
 
 class TransformerBlock(nn.Module):
-
     def __init__(
         self,
         width: int,
@@ -256,20 +280,24 @@ class TransformerBlock(nn.Module):
         heads: int,
         mlp_ratio: float = 4.0,
         norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.width = width
         self.layers = layers
 
-        self.resblocks = nn.ModuleList([
-            VisualAttentionBlock(width,
-                                 heads,
-                                 mlp_ratio,
-                                 norm_layer=norm_layer,
-                                 quant_config=quant_config)
-            for _ in range(layers)
-        ])
+        self.resblocks = nn.ModuleList(
+            [
+                VisualAttentionBlock(
+                    width,
+                    heads,
+                    mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                )
+                for _ in range(layers)
+            ]
+        )
 
     def get_cast_dtype(self) -> torch.dtype:
         return self.resblocks[0].mlp.c_fc.weight.dtype
@@ -277,54 +305,57 @@ class TransformerBlock(nn.Module):
     def get_cast_device(self) -> torch.device:
         return self.resblocks[0].mlp.c_fc.weight.device
 
-    def forward(self,
-                x: torch.Tensor,
-                attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(
+        self, x: torch.Tensor, attn_mask: torch.Tensor | None = None
+    ) -> torch.Tensor:
         for r in self.resblocks:
             x = r(x, attn_mask=attn_mask)
         return x
 
 
 class VisionTransformer(nn.Module):
-
-    def __init__(self,
-                 image_size: int,
-                 patch_size: int,
-                 width: int,
-                 layers: int,
-                 heads: int,
-                 mlp_ratio: float,
-                 n_queries: int = 256,
-                 output_dim: int = 512,
-                 image_start_id: int = 151857,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        image_size: int,
+        patch_size: int,
+        width: int,
+        layers: int,
+        heads: int,
+        mlp_ratio: float,
+        n_queries: int = 256,
+        output_dim: int = 512,
+        image_start_id: int = 151857,
+        quant_config: QuantizationConfig | None = None,
+        **kwargs,
+    ):
         super().__init__()
         image_height, image_width = self.image_size = (image_size, image_size)
         patch_height, patch_width = self.patch_size = (patch_size, patch_size)
-        self.grid_size = (image_height // patch_height,
-                          image_width // patch_width)
+        self.grid_size = (image_height // patch_height, image_width // patch_width)
         self.output_dim = output_dim
-        self.conv1 = nn.Conv2d(in_channels=3,
-                               out_channels=width,
-                               kernel_size=patch_size,
-                               stride=patch_size,
-                               bias=False)
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False,
+        )
 
         # class embeddings and positional embeddings
         scale = width**-0.5
-        self.positional_embedding = nn.Parameter(scale *
-                                                 torch.randn(256, width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
 
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
 
         self.ln_pre = norm_layer(width)
-        self.transformer = TransformerBlock(width,
-                                            layers,
-                                            heads,
-                                            mlp_ratio,
-                                            norm_layer=norm_layer,
-                                            quant_config=quant_config)
+        self.transformer = TransformerBlock(
+            width,
+            layers,
+            heads,
+            mlp_ratio,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+        )
 
         self.attn_pool = Resampler2(
             grid_size=int(math.sqrt(n_queries)),
@@ -341,7 +372,8 @@ class VisionTransformer(nn.Module):
 
         self.ln_post = norm_layer(output_dim)
         self.proj = nn.Parameter(
-            (output_dim**-0.5) * torch.randn(output_dim, output_dim))
+            (output_dim**-0.5) * torch.randn(output_dim, output_dim)
+        )
 
         self.image_start_id = image_start_id
         self.image_end_id = image_start_id + 1
@@ -355,12 +387,10 @@ class VisionTransformer(nn.Module):
 
         # to patches
         x = self.conv1(x)  # shape = [*, width, grid, grid]
-        x = x.reshape(x.shape[0], x.shape[1],
-                      -1)  # shape = [*, width, grid ** 2]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
         x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
 
-        x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(
-            x.size(1))))
+        x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(x.size(1))))
 
         x = self.ln_pre(x)
 
@@ -376,20 +406,19 @@ class VisionTransformer(nn.Module):
 
 
 class QwenVLModel(QWenModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        self.visual = VisionTransformer(**config.visual,
-                                        quant_config=quant_config)
+        self.visual = VisionTransformer(**config.visual, quant_config=quant_config)
 
 
 @lru_cache(maxsize=1)
 def _get_tokenizer_without_image_pad(
-        tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
+    tokenizer: PreTrainedTokenizer,
+) -> PreTrainedTokenizer:
     """
     The logic of adding image pad tokens should only be applied in
     [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
@@ -401,18 +430,18 @@ def _get_tokenizer_without_image_pad(
     new_tokenizer = copy.deepcopy(tokenizer)
 
     class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
-
         def tokenize(
             self,
             text: str,
-            allowed_special: Union[Set[str], str] = "all",
-            disallowed_special: Union[Collection[str], str] = (),
+            allowed_special: Set[str] | str = "all",
+            disallowed_special: Collection[str] | str = (),
             **kwargs,
-        ) -> list[Union[bytes, str]]:
+        ) -> list[bytes | str]:
             text = unicodedata.normalize("NFC", text)
 
             return [
-                self.decoder[t] for t in self.tokenizer.encode(
+                self.decoder[t]
+                for t in self.tokenizer.encode(
                     text,
                     allowed_special=allowed_special,
                     disallowed_special=disallowed_special,
@@ -421,9 +450,9 @@ def _get_tokenizer_without_image_pad(
 
         def _decode(
             self,
-            token_ids: Union[int, list[int]],
+            token_ids: int | list[int],
             skip_special_tokens: bool = False,
-            errors: Optional[str] = None,
+            errors: str | None = None,
             **kwargs,
         ) -> str:
             if isinstance(token_ids, int):
@@ -434,8 +463,7 @@ def _get_tokenizer_without_image_pad(
                 errors=errors or self.errors,
             )
 
-    TokenizerWithoutImagePad.__name__ = \
-        f"{tokenizer.__class__.__name__}WithoutImagePad"
+    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
 
     new_tokenizer.__class__ = TokenizerWithoutImagePad
     return new_tokenizer
@@ -466,17 +494,19 @@ class QwenVLProcessor:
         vision_config = config.visual
         image_size = vision_config["image_size"]
 
-        self.image_transform = transforms.Compose([
-            transforms.Resize(
-                (image_size, image_size),
-                interpolation=InterpolationMode.BICUBIC,
-            ),
-            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=(0.48145466, 0.4578275, 0.40821073),
-                std=(0.26862954, 0.26130258, 0.27577711),
-            ),
-        ])
+        self.image_transform = transforms.Compose(
+            [
+                transforms.Resize(
+                    (image_size, image_size),
+                    interpolation=InterpolationMode.BICUBIC,
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=(0.48145466, 0.4578275, 0.40821073),
+                    std=(0.26862954, 0.26130258, 0.27577711),
+                ),
+            ]
+        )
 
     @property
     def image_start_tag(self) -> str:
@@ -492,9 +522,9 @@ class QwenVLProcessor:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -523,7 +553,6 @@ class QwenVLProcessor:
 
 
 class QwenVLProcessingInfo(BaseProcessingInfo):
-
     def get_tokenizer(self) -> PreTrainedTokenizer:
         tokenizer = self.ctx.tokenizer
         assert isinstance(tokenizer, PreTrainedTokenizer)
@@ -538,7 +567,7 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
             **kwargs,
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(self) -> int:
@@ -552,7 +581,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
 
 
 class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -560,13 +588,15 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
         img_start = hf_processor.image_start_tag
         img_end = hf_processor.image_end_tag
 
-        return "".join(f"Picture {i}: {img_start}{img_end}\n"
-                       for i in range(1, num_images + 1))
+        return "".join(
+            f"Picture {i}: {img_start}{img_end}\n" for i in range(1, num_images + 1)
+        )
 
     def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.visual
@@ -574,16 +604,19 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
         target_width = target_height = vision_config["image_size"]
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
 class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -639,8 +672,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         tokenizer = self.info.get_tokenizer()
-        special_tokens: dict[str,
-                             int] = tokenizer.special_tokens  # type: ignore
+        special_tokens: dict[str, int] = tokenizer.special_tokens  # type: ignore
 
         processor = self.info.get_hf_processor()
         img_start_id = special_tokens[processor.image_start_tag]
@@ -662,11 +694,16 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(QwenVLMultiModalProcessor,
-                                        info=QwenVLProcessingInfo,
-                                        dummy_inputs=QwenVLDummyInputsBuilder)
-class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
-                                     SupportsMultiModal):
+@MULTIMODAL_REGISTRY.register_processor(
+    QwenVLMultiModalProcessor,
+    info=QwenVLProcessingInfo,
+    dummy_inputs=QwenVLDummyInputsBuilder,
+)
+class QwenVLForConditionalGeneration(
+    QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
+):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "c_attn": ["c_attn"],
         "gate_up_proj": [
@@ -682,10 +719,11 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
         return MultiModelKeys.from_string_field(
             language_model="transformer.h",
             connector="transformer.visual.attn_pool",
-            tower_model="transformer.visual.transformer")
+            tower_model="transformer.visual.transformer",
+        )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return f"Picture {i}: <img></img>"
 
@@ -707,38 +745,30 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
         self.transformer: QwenVLModel
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[QwenImageInputs]:
+        self, **kwargs: object
+    ) -> QwenImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             expected_h = expected_w = self.config.visual["image_size"]
             resolve_bindings = {"h": expected_h, "w": expected_w}
 
             return QwenImagePixelInputs(
                 type="pixel_values",
-                data=flatten_bn(pixel_values, concat=True),
+                data=pixel_values,
                 resolve_bindings=resolve_bindings,
             )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return QwenImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         return None
 
-    def _process_image_input(self,
-                             image_input: QwenImageInputs) -> torch.Tensor:
+    def _process_image_input(self, image_input: QwenImageInputs) -> torch.Tensor:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -747,8 +777,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
     def get_language_model(self) -> torch.nn.Module:
         return self.transformer
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
@@ -756,40 +785,18 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.transformer.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.transformer.visual.image_pad_id)
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py
index 9cbf844ae9f868350d40a4b1f79b73aebb2186b8..6a42564ac70a7c0b466f8697313b0b5a45606238 100644
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -11,7 +11,7 @@
 import math
 from collections.abc import Iterable
 from itertools import repeat
-from typing import Optional, Union
+from typing import TypeAlias
 
 import torch
 import torch.nn as nn
@@ -23,12 +23,11 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.intern_vit import InternVisionEncoder
 
-input_dim_t = Union[int, tuple[int, int]]
-norm_t = Union[tuple[float, float, float], torch.Tensor]
+input_dim_t: TypeAlias = int | tuple[int, int]
+norm_t: TypeAlias = tuple[float, float, float] | torch.Tensor
 
 
 def _ntuple(n):
-
     def parse(x):
         if isinstance(x, Iterable) and not isinstance(x, str):
             return tuple(x)
@@ -44,42 +43,14 @@ to_4tuple = _ntuple(4)
 to_ntuple = _ntuple
 
 
-class InputConditioner(nn.Module):
-
-    def __init__(
-        self,
-        input_scale: float,
-        norm_mean: norm_t,
-        norm_std: norm_t,
-        dtype: torch.dtype = None,
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-
-        self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale)
-        self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale)
-
-    def forward(self, x: torch.Tensor):
-        y = (x - self.norm_mean) / self.norm_std
-        if self.dtype is not None:
-            y = y.to(self.dtype)
-        return y
-
-
-def _to_tensor(v: norm_t):
-    return torch.as_tensor(v, dtype=torch.float32).view(-1, 1, 1)
-
-
 class ClsToken(nn.Module):
-
     def __init__(
         self,
         ndim: int,
         num_tokens: int = 1,
         enabled: bool = True,
-        register_multiple: Optional[int] = None,
-        num_registers: Optional[int] = None,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
     ):
         super().__init__()
 
@@ -91,12 +62,14 @@ class ClsToken(nn.Module):
             if num_registers:
                 self.num_registers = num_registers
             elif register_multiple:
-                self.num_registers = register_multiple - (num_tokens %
-                                                          register_multiple)
+                self.num_registers = register_multiple - (
+                    num_tokens % register_multiple
+                )
 
             scale = ndim**-0.5
             self.token = nn.Parameter(
-                torch.randn(num_tokens + self.num_registers, ndim) * scale)
+                torch.randn(num_tokens + self.num_registers, ndim) * scale
+            )
 
         else:
             self.token = None
@@ -108,16 +81,18 @@ class ClsToken(nn.Module):
             return x
 
         token = self.token.unsqueeze(0).expand(x.shape[0], -1, -1)
-        x = torch.cat([
-            token,
-            x,
-        ], dim=1)
+        x = torch.cat(
+            [
+                token,
+                x,
+            ],
+            dim=1,
+        )
 
         return x
 
 
 class ViTPatchGenerator(nn.Module):
-
     def __init__(
         self,
         #  config: PretrainedConfig,
@@ -127,12 +102,12 @@ class ViTPatchGenerator(nn.Module):
         abs_pos: bool = True,
         normalize_patches: bool = False,
         cls_token: bool = False,
-        max_input_dims: Optional[input_dim_t] = None,
+        max_input_dims: input_dim_t | None = None,
         pos_dropout: float = 0.0,
         return_pos_enc: bool = False,
         num_cls_tokens: int = 1,
-        register_multiple: Optional[int] = None,
-        num_registers: Optional[int] = None,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
         patch_bias: bool = False,
         device=None,
         dtype=None,
@@ -147,8 +122,8 @@ class ViTPatchGenerator(nn.Module):
             max_input_dims = (max_input_dims, max_input_dims)
 
         max_input_dims = tuple(
-            int(math.ceil(d / patch_size) * patch_size)
-            for d in max_input_dims)
+            int(math.ceil(d / patch_size) * patch_size) for d in max_input_dims
+        )
 
         self.cpe_mode = max_input_dims != input_dims
         self.pos_dropout = pos_dropout
@@ -167,15 +142,15 @@ class ViTPatchGenerator(nn.Module):
         self.max_input_dims = max_input_dims
 
         self.im_to_patches = Im2Patches(patch_size)
-        self.embedder = ViTPatchLinear(patch_size,
-                                       embed_dim,
-                                       bias=patch_bias,
-                                       **factory)
+        self.embedder = ViTPatchLinear(
+            patch_size, embed_dim, bias=patch_bias, **factory
+        )
 
         if abs_pos:
             scale = embed_dim**-0.5
             self.pos_embed = nn.Parameter(
-                torch.randn(1, self.num_patches, embed_dim, **factory) * scale)
+                torch.randn(1, self.num_patches, embed_dim, **factory) * scale
+            )
 
         self.cls_token = ClsToken(
             embed_dim,
@@ -185,8 +160,9 @@ class ViTPatchGenerator(nn.Module):
             num_registers=num_registers,
         )
 
-        self.patch_normalizer = nn.LayerNorm(
-            embed_dim) if normalize_patches else nn.Identity()
+        self.patch_normalizer = (
+            nn.LayerNorm(embed_dim) if normalize_patches else nn.Identity()
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         patches = self.embed_patches(x)
@@ -221,42 +197,48 @@ class ViTPatchGenerator(nn.Module):
         if src_embed.shape != targ_embed.shape:
             src_size = int(math.sqrt(src_embed.shape[1]))
 
-            assert src_size**2 == src_embed.shape[
-                1], 'Unable to interpolate non-square embedding'
-
-            src_embed = rearrange(src_embed,
-                                  'b (h w) c -> b c h w',
-                                  h=src_size,
-                                  w=src_size)
-            src_embed = F.interpolate(src_embed,
-                                      size=(self.num_rows, self.num_cols),
-                                      mode='bicubic',
-                                      align_corners=True,
-                                      antialias=False)
-            src_embed = rearrange(src_embed, 'b c h w -> b (h w) c')
+            assert src_size**2 == src_embed.shape[1], (
+                "Unable to interpolate non-square embedding"
+            )
+
+            src_embed = rearrange(
+                src_embed, "b (h w) c -> b c h w", h=src_size, w=src_size
+            )
+            src_embed = F.interpolate(
+                src_embed,
+                size=(self.num_rows, self.num_cols),
+                mode="bicubic",
+                align_corners=True,
+                antialias=False,
+            )
+            src_embed = rearrange(src_embed, "b c h w -> b (h w) c")
         targ_embed.data.copy_(src_embed)
 
-    def _load_projection(self, src_proj_weight: torch.Tensor,
-                         targ_proj_weight: torch.Tensor):
+    def _load_projection(
+        self, src_proj_weight: torch.Tensor, targ_proj_weight: torch.Tensor
+    ):
         if src_proj_weight.shape != targ_proj_weight.shape:
             src_patch_size = int(math.sqrt(src_proj_weight.shape[1] // 3))
 
-            assert (src_patch_size**2) * 3 == src_proj_weight.shape[
-                1], 'Unable to interpolate non-square patch size'
-
-            src_proj_weight = rearrange(src_proj_weight,
-                                        'b (c h w) -> b c h w',
-                                        c=3,
-                                        h=src_patch_size,
-                                        w=src_patch_size)
-            src_proj_weight = F.interpolate(src_proj_weight,
-                                            size=(self.patch_size,
-                                                  self.patch_size),
-                                            mode='bicubic',
-                                            align_corners=True,
-                                            antialias=False)
-            src_proj_weight = rearrange(src_proj_weight,
-                                        'b c h w -> b (c h w)')
+            assert (src_patch_size**2) * 3 == src_proj_weight.shape[1], (
+                "Unable to interpolate non-square patch size"
+            )
+
+            src_proj_weight = rearrange(
+                src_proj_weight,
+                "b (c h w) -> b c h w",
+                c=3,
+                h=src_patch_size,
+                w=src_patch_size,
+            )
+            src_proj_weight = F.interpolate(
+                src_proj_weight,
+                size=(self.patch_size, self.patch_size),
+                mode="bicubic",
+                align_corners=True,
+                antialias=False,
+            )
+            src_proj_weight = rearrange(src_proj_weight, "b c h w -> b (c h w)")
         targ_proj_weight.data.copy_(src_proj_weight)
 
     def embed_patches(self, x: torch.Tensor) -> torch.Tensor:
@@ -267,8 +249,8 @@ class ViTPatchGenerator(nn.Module):
     def apply_pos_enc(
         self,
         patches: torch.Tensor,
-        patch_idxs: Optional[torch.Tensor] = None,
-        input_size: Optional[tuple[int, int]] = None,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
     ) -> torch.Tensor:
         if not self.abs_pos:
             return patches
@@ -276,11 +258,12 @@ class ViTPatchGenerator(nn.Module):
         pos_enc = self.get_pos_enc(patches.shape[0], patch_idxs, input_size)
 
         if self.training and self.pos_dropout > 0:
-            keeps = torch.rand(patches.shape[0],
-                               1,
-                               1,
-                               dtype=pos_enc.dtype,
-                               device=pos_enc.device) > self.pos_dropout
+            keeps = (
+                torch.rand(
+                    patches.shape[0], 1, 1, dtype=pos_enc.dtype, device=pos_enc.device
+                )
+                > self.pos_dropout
+            )
             pos_enc_drop = torch.where(keeps, pos_enc, 0)
         else:
             pos_enc_drop = pos_enc
@@ -290,8 +273,8 @@ class ViTPatchGenerator(nn.Module):
     def get_pos_enc(
         self,
         batch_size: int,
-        patch_idxs: Optional[torch.Tensor] = None,
-        input_size: Optional[tuple[int, int]] = None,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
     ) -> torch.Tensor:
         if input_size is None:
             input_dims = self.input_dims
@@ -303,56 +286,58 @@ class ViTPatchGenerator(nn.Module):
         if patch_idxs is None:
             return pos_embed
 
-        exp_patch_idxs = patch_idxs.unsqueeze(-1).expand(
-            -1, -1, pos_embed.shape[-1])
+        exp_patch_idxs = patch_idxs.unsqueeze(-1).expand(-1, -1, pos_embed.shape[-1])
 
-        pos_embed = torch.gather(pos_embed.expand(patch_idxs.shape[0], -1, -1),
-                                 dim=1,
-                                 index=exp_patch_idxs)
+        pos_embed = torch.gather(
+            pos_embed.expand(patch_idxs.shape[0], -1, -1), dim=1, index=exp_patch_idxs
+        )
         return pos_embed
 
-    def _get_pos_embeddings(self, batch_size: int, input_dims: tuple[int,
-                                                                     int]):
+    def _get_pos_embeddings(self, batch_size: int, input_dims: tuple[int, int]):
         if (self.num_rows, self.num_cols) == input_dims:
             return self.pos_embed
 
-        pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols,
-                                           -1).permute(0, 3, 1, 2)
+        pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols, -1).permute(
+            0, 3, 1, 2
+        )
 
         def window_select(pos_embed):
             if input_dims[0] < pos_embed.shape[-2]:
-                pos_embed = pos_embed[..., :input_dims[0], :]
+                pos_embed = pos_embed[..., : input_dims[0], :]
             if input_dims[1] < pos_embed.shape[-1]:
-                pos_embed = pos_embed[..., :, :input_dims[1]]
+                pos_embed = pos_embed[..., :, : input_dims[1]]
             return pos_embed
 
         if self.cpe_mode:
             if self.training:
                 min_scale = math.sqrt(0.1)
-                scale = torch.rand(batch_size, 1, 1, device=pos_embed.device
-                                   ) * (1 - min_scale) + min_scale
+                scale = (
+                    torch.rand(batch_size, 1, 1, device=pos_embed.device)
+                    * (1 - min_scale)
+                    + min_scale
+                )
                 aspect_min = math.log(3 / 4)
                 aspect_max = -aspect_min
                 aspect = torch.exp(
-                    torch.rand(batch_size, 1, 1, device=pos_embed.device) *
-                    (aspect_max - aspect_min) + aspect_min)
+                    torch.rand(batch_size, 1, 1, device=pos_embed.device)
+                    * (aspect_max - aspect_min)
+                    + aspect_min
+                )
 
                 scale_x = scale * aspect
                 scale_y = scale * (1 / aspect)
                 scale_xy = torch.stack([scale_x, scale_y], dim=-1).clamp_(0, 1)
 
-                pos_xy = torch.rand(
-                    batch_size, 1, 1, 2,
-                    device=pos_embed.device) * (1 - scale_xy)
+                pos_xy = torch.rand(batch_size, 1, 1, 2, device=pos_embed.device) * (
+                    1 - scale_xy
+                )
 
                 lin_x = torch.linspace(
-                    0, 1, steps=input_dims[1],
-                    device=pos_embed.device)[None, None].expand(
-                        batch_size, input_dims[0], -1)
+                    0, 1, steps=input_dims[1], device=pos_embed.device
+                )[None, None].expand(batch_size, input_dims[0], -1)
                 lin_y = torch.linspace(
-                    0, 1, steps=input_dims[0],
-                    device=pos_embed.device)[None, :, None].expand(
-                        batch_size, -1, input_dims[1])
+                    0, 1, steps=input_dims[0], device=pos_embed.device
+                )[None, :, None].expand(batch_size, -1, input_dims[1])
 
                 lin_xy = torch.stack([lin_x, lin_y], dim=-1)
 
@@ -364,26 +349,27 @@ class ViTPatchGenerator(nn.Module):
                 pos_embed = F.grid_sample(
                     pos_embed.float().expand(batch_size, -1, -1, -1),
                     grid=grid_xy,
-                    mode='bilinear',
-                    padding_mode='zeros',
+                    mode="bilinear",
+                    padding_mode="zeros",
                     align_corners=True,
                 ).to(pos_embed.dtype)
             else:
                 max_dim = max(input_dims)
-                pos_embed = F.interpolate(pos_embed.float(),
-                                          size=(max_dim, max_dim),
-                                          align_corners=True,
-                                          mode='bilinear').to(pos_embed.dtype)
+                pos_embed = F.interpolate(
+                    pos_embed.float(),
+                    size=(max_dim, max_dim),
+                    align_corners=True,
+                    mode="bilinear",
+                ).to(pos_embed.dtype)
 
                 pos_embed = window_select(pos_embed)
         else:
             pos_embed = window_select(pos_embed)
 
         if pos_embed.shape[-2:] != input_dims:
-            pos_embed = F.interpolate(pos_embed.float(),
-                                      size=input_dims,
-                                      align_corners=True,
-                                      mode='bilinear').to(pos_embed.dtype)
+            pos_embed = F.interpolate(
+                pos_embed.float(), size=input_dims, align_corners=True, mode="bilinear"
+            ).to(pos_embed.dtype)
 
         pos_embed = pos_embed.flatten(2).permute(0, 2, 1)
 
@@ -391,7 +377,6 @@ class ViTPatchGenerator(nn.Module):
 
 
 class Im2Patches(nn.Module):
-
     def __init__(self, patch_size: int):
         super().__init__()
         self.patch_size = patch_size
@@ -406,7 +391,7 @@ class Im2Patches(nn.Module):
         px = x.shape[-1] // self.patch_size
         patches = rearrange(
             x,
-            'b c (py yy) (px xx) -> b (py px) (c yy xx)',
+            "b c (py yy) (px xx) -> b (py px) (c yy xx)",
             py=py,
             yy=self.patch_size,
             px=px,
@@ -416,12 +401,7 @@ class Im2Patches(nn.Module):
 
 
 class ViTPatchLinear(nn.Linear):
-
-    def __init__(self,
-                 patch_size: int,
-                 embed_dim: int,
-                 bias: bool = False,
-                 **factory):
+    def __init__(self, patch_size: int, embed_dim: int, bias: bool = False, **factory):
         super().__init__(3 * (patch_size**2), embed_dim, bias=bias, **factory)
         self.patch_size = patch_size
 
@@ -434,9 +414,9 @@ class RadioInternVisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
@@ -444,16 +424,19 @@ class RadioInternVisionModel(nn.Module):
 
         self.config = config
         self.img_size, self.grid_size, self.num_patches = self._init_img_size(
-            to_2tuple(config.patch_size), config.image_size)
+            to_2tuple(config.patch_size), config.image_size
+        )
         max_img_size = int(
-            round(config.max_img_size / config.patch_size) * config.patch_size)
+            round(config.max_img_size / config.patch_size) * config.patch_size
+        )
         self.patch_generator = ViTPatchGenerator(
             config.patch_size,
             config.hidden_size,
             input_dims=self.img_size,
             max_input_dims=max_img_size,
             cls_token=True,
-            register_multiple=config.reg_tokens)
+            register_multiple=config.reg_tokens,
+        )
 
         self.encoder = InternVisionEncoder(
             config=config,
@@ -463,8 +446,7 @@ class RadioInternVisionModel(nn.Module):
             prefix=f"{prefix}.encoder",
         )
 
-    def _init_img_size(self, patch_size, img_size: Union[int, tuple[int,
-                                                                    int]]):
+    def _init_img_size(self, patch_size, img_size: int | tuple[int, int]):
         if img_size is None:
             return None, None, None
         img_size = to_2tuple(img_size)
@@ -490,34 +472,29 @@ class RadioModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
         super().__init__()
 
         self.config = config
-        self.input_conditioner = InputConditioner(
-            input_scale=1.0,
-            norm_mean=config.norm_mean,
-            norm_std=config.norm_std,
-        )
         self.model = RadioInternVisionModel(
             config=config,
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             num_dummy_heads=num_dummy_heads,
-            prefix=prefix)
+            prefix=prefix,
+        )
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
-        x = self.input_conditioner(pixel_values)
-        y = self.model(x)
+        y = self.model(pixel_values)
         return self._extract_final(y)
 
     def load_weights(self, weights) -> set[str]:
@@ -534,11 +511,15 @@ class RadioModel(nn.Module):
                 # Skip non-radio weights
                 continue
 
-            sub = name[len("radio_model."):]  # drop "radio_model." prefix
+            sub = name[len("radio_model.") :]  # drop "radio_model." prefix
 
             # Skip buffers not used in vLLM
             if sub in {"summary_idxs"}:
                 continue
+            if sub.startswith("input_conditioner."):
+                # we normalize in the input processor,
+                # based on norm and std values from the config
+                continue
 
             vllm_key = None
             if sub.startswith("model.patch_generator."):
@@ -553,15 +534,13 @@ class RadioModel(nn.Module):
                     layer_idx = parts[2]
                     suffix = ".".join(parts[3:])
                     # Skip layer-scale entries that vLLM doesn't use
-                    if suffix in {"ls1", "ls2"} or suffix.startswith(
-                        ("ls1.", "ls2.")):
+                    if suffix in {"ls1", "ls2"} or suffix.startswith(("ls1.", "ls2.")):
                         continue
                     vllm_key = f"model.encoder.layers.{layer_idx}.{suffix}"
 
             if vllm_key and vllm_key in params_dict:
                 param = params_dict[vllm_key]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, weight)
                 loaded_params.add(vllm_key)
 
@@ -571,6 +550,6 @@ class RadioModel(nn.Module):
         # Remove CLS + REGISTERS tokens
         patch_gen = getattr(self.model, "patch_generator", None)
         if patch_gen is not None:
-            all_feat = y[:, patch_gen.num_skip:]
+            all_feat = y[:, patch_gen.num_skip :]
 
         return all_feat
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index e1fc7e11a2e69f5484cdcbf3278f848227344ddd..a2de597c87d886e5b9c04208404bcffe18cb76ec 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -4,6 +4,7 @@
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
 """
+
 import hashlib
 import importlib
 import json
@@ -13,45 +14,54 @@ import subprocess
 import sys
 import tempfile
 from abc import ABC, abstractmethod
-from collections.abc import Set
+from collections.abc import Callable, Set
 from dataclasses import asdict, dataclass, field
 from functools import lru_cache
 from pathlib import Path
-from typing import Callable, Optional, TypeVar, Union
+from typing import TypeVar
 
 import torch.nn as nn
 import transformers
 
 from vllm import envs
-from vllm.config import (ModelConfig, iter_architecture_defaults,
-                         try_match_architecture_defaults)
+from vllm.config import (
+    ModelConfig,
+    iter_architecture_defaults,
+    try_match_architecture_defaults,
+)
 from vllm.logger import init_logger
 from vllm.logging_utils import logtime
-from vllm.transformers_utils.dynamic_module import (
-    try_get_class_from_dynamic_module)
-
-from .interfaces import (has_inner_state, has_noops, is_attention_free,
-                         is_hybrid, supports_cross_encoding,
-                         supports_multimodal,
-                         supports_multimodal_encoder_tp_data,
-                         supports_multimodal_raw_input_only, supports_pp,
-                         supports_transcription, supports_v0_only)
-from .interfaces_base import (get_default_pooling_type, is_pooling_model,
-                              is_text_generation_model)
+from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
+
+from .interfaces import (
+    has_inner_state,
+    has_noops,
+    is_attention_free,
+    is_hybrid,
+    supports_cross_encoding,
+    supports_mamba_prefix_caching,
+    supports_multimodal,
+    supports_multimodal_encoder_tp_data,
+    supports_multimodal_raw_input_only,
+    supports_pp,
+    supports_transcription,
+)
+from .interfaces_base import (
+    get_default_pooling_type,
+    is_pooling_model,
+    is_text_generation_model,
+)
 
 logger = init_logger(__name__)
 
-# yapf: disable
 _TEXT_GENERATION_MODELS = {
     # [Decoder-only]
+    "AfmoeForCausalLM": ("afmoe", "AfmoeForCausalLM"),
     "ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
     "AquilaModel": ("llama", "LlamaForCausalLM"),
     "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
     "ArceeForCausalLM": ("arcee", "ArceeForCausalLM"),
     "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
-    "MiniMaxForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
-    "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
-    "MiniMaxM1ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
     # baichuan-7b, upper case 'C' in the class name
     "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),
     # baichuan-13b, lower case 'c' in the class name
@@ -67,7 +77,7 @@ _TEXT_GENERATION_MODELS = {
     "CwmForCausalLM": ("llama", "LlamaForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
-    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
+    "DeepseekForCausalLM": ("deepseek_v2", "DeepseekForCausalLM"),
     "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
     "DeepseekV3ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),
     "DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),
@@ -76,8 +86,11 @@ _TEXT_GENERATION_MODELS = {
     "Ernie4_5_MoeForCausalLM": ("ernie45_moe", "Ernie4_5_MoeForCausalLM"),
     "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
     "Exaone4ForCausalLM": ("exaone4", "Exaone4ForCausalLM"),
-    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"),
+    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
+    "FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
+    "FlexOlmoForCausalLM": ("flex_olmo", "FlexOlmoForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
     "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"),
@@ -93,8 +106,8 @@ _TEXT_GENERATION_MODELS = {
     "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
     "GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
     "GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
-    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),   # noqa: E501
-    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),   # noqa: E501
+    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),  # noqa: E501
+    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),  # noqa: E501
     "GritLM": ("gritlm", "GritLM"),
     "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
     "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
@@ -106,21 +119,24 @@ _TEXT_GENERATION_MODELS = {
     "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
+    "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"),  # noqa: E501
     "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
+    "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
-    "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
-    "FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
     "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
     "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
     "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
+    "MiniMaxForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
+    "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
+    "MiniMaxM1ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
+    "MiniMaxM2ForCausalLM": ("minimax_m2", "MiniMaxM2ForCausalLM"),
     "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
     "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
-    "MotifForCausalLM": ("motif", "MotifForCausalLM"),
     # transformers's mpt class has lower case
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
@@ -133,6 +149,9 @@ _TEXT_GENERATION_MODELS = {
     "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
+    "OuroForCausalLM": ("ouro", "OuroForCausalLM"),
+    "PanguEmbeddedForCausalLM": ("openpangu", "PanguEmbeddedForCausalLM"),
+    "PanguUltraMoEForCausalLM": ("openpangu", "PanguUltraMoEForCausalLM"),
     "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
@@ -159,6 +178,7 @@ _TEXT_GENERATION_MODELS = {
 _EMBEDDING_MODELS = {
     # [Text-only]
     "BertModel": ("bert", "BertEmbeddingModel"),
+    "BertSpladeSparseEmbeddingModel": ("bert", "BertSpladeSparseEmbeddingModel"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
     "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
     "Gemma3TextModel": ("gemma3", "Gemma3Model"),
@@ -172,7 +192,8 @@ _EMBEDDING_MODELS = {
     "LlamaModel": ("llama", "LlamaForCausalLM"),
     **{
         # Multiple models share the same architecture, so we include them all
-        k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
+        k: (mod, arch)
+        for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
         if arch == "LlamaForCausalLM"
     },
     "MistralModel": ("llama", "LlamaForCausalLM"),
@@ -188,9 +209,14 @@ _EMBEDDING_MODELS = {
     "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
     "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
     # [Multimodal]
-    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
+    "CLIPModel": ("clip", "CLIPEmbeddingModel"),
+    "LlavaNextForConditionalGeneration": (
+        "llava_next",
+        "LlavaNextForConditionalGeneration",
+    ),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
     # Technically Terratorch models work on images, both in
     # input and output. I am adding it here because it piggy-backs on embedding
     # models for the time being.
@@ -201,79 +227,168 @@ _EMBEDDING_MODELS = {
 _CROSS_ENCODER_MODELS = {
     "BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
     "BertForTokenClassification": ("bert", "BertForTokenClassification"),
-    "GteNewForSequenceClassification": ("bert_with_rope",
-                                        "GteNewForSequenceClassification"),
-    "ModernBertForSequenceClassification": ("modernbert",
-                                            "ModernBertForSequenceClassification"),
-    "RobertaForSequenceClassification": ("roberta",
-                                         "RobertaForSequenceClassification"),
-    "XLMRobertaForSequenceClassification": ("roberta",
-                                            "RobertaForSequenceClassification"),
+    "GteNewForSequenceClassification": (
+        "bert_with_rope",
+        "GteNewForSequenceClassification",
+    ),
+    "ModernBertForSequenceClassification": (
+        "modernbert",
+        "ModernBertForSequenceClassification",
+    ),
+    "ModernBertForTokenClassification": (
+        "modernbert",
+        "ModernBertForTokenClassification",
+    ),
+    "RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
+    "XLMRobertaForSequenceClassification": (
+        "roberta",
+        "RobertaForSequenceClassification",
+    ),
     # [Auto-converted (see adapters.py)]
-    "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
+    "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"),  # noqa: E501,
 }
 
 _MULTIMODAL_MODELS = {
     # [Decoder-only]
     "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
-    "AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"),  # noqa: E501
+    "AyaVisionForConditionalGeneration": (
+        "aya_vision",
+        "AyaVisionForConditionalGeneration",
+    ),
+    "BeeForConditionalGeneration": ("bee", "BeeForConditionalGeneration"),
     "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
-    "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"),  # noqa: E501
-    "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"),  # noqa: E501
+    "ChameleonForConditionalGeneration": (
+        "chameleon",
+        "ChameleonForConditionalGeneration",
+    ),
+    "Cohere2VisionForConditionalGeneration": (
+        "cohere2_vision",
+        "Cohere2VisionForConditionalGeneration",
+    ),
     "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
+    "DeepseekOCRForCausalLM": ("deepseek_ocr", "DeepseekOCRForCausalLM"),
     "DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
-    "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"),  # noqa: E501
+    "Ernie4_5_VLMoeForConditionalGeneration": (
+        "ernie45_vl",
+        "Ernie4_5_VLMoeForConditionalGeneration",
+    ),
     "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
     "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),  # noqa: E501
-    "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"),    # noqa: E501
+    "Gemma3nForConditionalGeneration": (
+        "gemma3n_mm",
+        "Gemma3nForConditionalGeneration",
+    ),
     "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
     "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
     "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),  # noqa: E501
-    "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"),  # noqa: E501
+    "GraniteSpeechForConditionalGeneration": (
+        "granite_speech",
+        "GraniteSpeechForConditionalGeneration",
+    ),
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
-    "InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
-    "InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
-    "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
-    "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"),  # noqa: E501
+    "InternS1ForConditionalGeneration": (
+        "interns1",
+        "InternS1ForConditionalGeneration",
+    ),
+    "InternVLForConditionalGeneration": (
+        "interns1",
+        "InternS1ForConditionalGeneration",
+    ),
+    "Idefics3ForConditionalGeneration": (
+        "idefics3",
+        "Idefics3ForConditionalGeneration",
+    ),
+    "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),  # noqa: E501
     "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
-    "KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
+    "KeyeVL1_5ForConditionalGeneration": (
+        "keye_vl1_5",
+        "KeyeVL1_5ForConditionalGeneration",
+    ),
     "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
     "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
+    "LightOnOCRForConditionalGeneration": (
+        "lightonocr",
+        "LightOnOCRForConditionalGeneration",
+    ),
     "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
     "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
     "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
-    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
-    "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501
-    "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),  # noqa: E501
+    "LlavaNextForConditionalGeneration": (
+        "llava_next",
+        "LlavaNextForConditionalGeneration",
+    ),
+    "LlavaNextVideoForConditionalGeneration": (
+        "llava_next_video",
+        "LlavaNextVideoForConditionalGeneration",
+    ),
+    "LlavaOnevisionForConditionalGeneration": (
+        "llava_onevision",
+        "LlavaOnevisionForConditionalGeneration",
+    ),
     "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
     "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
-    "MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"),  # noqa: E501
+    "MiniMaxVL01ForConditionalGeneration": (
+        "minimax_vl_01",
+        "MiniMaxVL01ForConditionalGeneration",
+    ),
     "MiniCPMO": ("minicpmo", "MiniCPMO"),
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
-    "Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"),  # noqa: E501
+    "Mistral3ForConditionalGeneration": (
+        "mistral3",
+        "Mistral3ForConditionalGeneration",
+    ),
     "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
     "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
     "Ovis": ("ovis", "Ovis"),
     "Ovis2_5": ("ovis2_5", "Ovis2_5"),
-    "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"),  # noqa: E501
+    "PaddleOCRVLForConditionalGeneration": (
+        "paddleocr_vl",
+        "PaddleOCRVLForConditionalGeneration",
+    ),
+    "PaliGemmaForConditionalGeneration": (
+        "paligemma",
+        "PaliGemmaForConditionalGeneration",
+    ),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"),  # noqa: E501
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
     "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
-    "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
-    "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
-    "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
-    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": (
+        "qwen2_5_vl",
+        "Qwen2_5_VLForConditionalGeneration",
+    ),
+    "Qwen2AudioForConditionalGeneration": (
+        "qwen2_audio",
+        "Qwen2AudioForConditionalGeneration",
+    ),
+    "Qwen2_5OmniModel": (
+        "qwen2_5_omni_thinker",
+        "Qwen2_5OmniThinkerForConditionalGeneration",
+    ),
+    "Qwen2_5OmniForConditionalGeneration": (
+        "qwen2_5_omni_thinker",
+        "Qwen2_5OmniThinkerForConditionalGeneration",
+    ),
+    "Qwen3OmniMoeForConditionalGeneration": (
+        "qwen3_omni_moe_thinker",
+        "Qwen3OmniMoeThinkerForConditionalGeneration",
+    ),
     "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),  # noqa: E501
-    "Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"),  # noqa: E501
+    "Qwen3VLMoeForConditionalGeneration": (
+        "qwen3_vl_moe",
+        "Qwen3VLMoeForConditionalGeneration",
+    ),
     "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
     "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),  # noqa: E501
     "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
-    "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
+    "Tarsier2ForConditionalGeneration": (
+        "qwen2_vl",
+        "Tarsier2ForConditionalGeneration",
+    ),
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
     # [Encoder-decoder]
@@ -287,12 +402,14 @@ _SPECULATIVE_DECODING_MODELS = {
     "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"),
     "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
+    "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
     "LongCatFlashMTPModel": ("longcat_flash_mtp", "LongCatFlashMTP"),
     "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
     "MedusaModel": ("medusa", "Medusa"),
+    "OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"),
     "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),
     # Temporarily disabled.
     # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
@@ -303,15 +420,46 @@ _TRANSFORMERS_SUPPORTED_MODELS = {
     # Text generation models
     "SmolLM3ForCausalLM": ("transformers", "TransformersForCausalLM"),
     # Multimodal models
-    "Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"),  # noqa: E501
+    "Emu3ForConditionalGeneration": (
+        "transformers",
+        "TransformersMultiModalForCausalLM",
+    ),
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
-    "TransformersModel": ("transformers", "TransformersModel"),
+    # Text generation models
     "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
-    "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
+    "TransformersMoEForCausalLM": ("transformers", "TransformersMoEForCausalLM"),
+    # Multimodal models
+    "TransformersMultiModalForCausalLM": (
+        "transformers",
+        "TransformersMultiModalForCausalLM",
+    ),
+    "TransformersMultiModalMoEForCausalLM": (
+        "transformers",
+        "TransformersMultiModalMoEForCausalLM",
+    ),
+    # Embedding models
+    "TransformersEmbeddingModel": ("transformers", "TransformersEmbeddingModel"),
+    "TransformersMoEEmbeddingModel": ("transformers", "TransformersMoEEmbeddingModel"),
+    "TransformersMultiModalEmbeddingModel": (
+        "transformers",
+        "TransformersMultiModalEmbeddingModel",
+    ),
+    # Sequence classification models
+    "TransformersForSequenceClassification": (
+        "transformers",
+        "TransformersForSequenceClassification",
+    ),
+    "TransformersMoEForSequenceClassification": (
+        "transformers",
+        "TransformersMoEForSequenceClassification",
+    ),
+    "TransformersMultiModalForSequenceClassification": (
+        "transformers",
+        "TransformersMultiModalForSequenceClassification",
+    ),
 }
-# yapf: enable
 
 _VLLM_MODELS = {
     **_TEXT_GENERATION_MODELS,
@@ -327,12 +475,12 @@ _VLLM_MODELS = {
 # can modify  this variable to alter the args if needed. e.g.
 # when we use par format to pack things together, sys.executable
 # might not be the target we want to run.
-_SUBPROCESS_COMMAND = [
-    sys.executable, "-m", "vllm.model_executor.models.registry"
-]
+_SUBPROCESS_COMMAND = [sys.executable, "-m", "vllm.model_executor.models.registry"]
 
 _PREVIOUSLY_SUPPORTED_MODELS = {
+    "MotifForCausalLM": "0.10.2",
     "Phi3SmallForCausalLM": "0.9.2",
+    "Phi4FlashForCausalLM": "0.10.2",
     # encoder-decoder models except whisper
     # have been removed for V0 deprecation.
     "BartModel": "0.10.2",
@@ -359,9 +507,9 @@ class _ModelInfo:
     is_attention_free: bool
     is_hybrid: bool
     has_noops: bool
+    supports_mamba_prefix_caching: bool
     supports_transcription: bool
     supports_transcription_only: bool
-    supports_v0_only: bool
 
     @staticmethod
     def from_model_cls(model: type[nn.Module]) -> "_ModelInfo":
@@ -372,24 +520,26 @@ class _ModelInfo:
             default_pooling_type=get_default_pooling_type(model),
             supports_cross_encoding=supports_cross_encoding(model),
             supports_multimodal=supports_multimodal(model),
-            supports_multimodal_raw_input_only=
-            supports_multimodal_raw_input_only(model),
-            supports_multimodal_encoder_tp_data=
-            supports_multimodal_encoder_tp_data(model),
+            supports_multimodal_raw_input_only=supports_multimodal_raw_input_only(
+                model
+            ),
+            supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data(
+                model
+            ),
             supports_pp=supports_pp(model),
             has_inner_state=has_inner_state(model),
             is_attention_free=is_attention_free(model),
             is_hybrid=is_hybrid(model),
+            supports_mamba_prefix_caching=supports_mamba_prefix_caching(model),
             supports_transcription=supports_transcription(model),
-            supports_transcription_only=(supports_transcription(model) and
-                                         model.supports_transcription_only),
-            supports_v0_only=supports_v0_only(model),
+            supports_transcription_only=(
+                supports_transcription(model) and model.supports_transcription_only
+            ),
             has_noops=has_noops(model),
         )
 
 
 class _BaseRegisteredModel(ABC):
-
     @abstractmethod
     def inspect_model_cls(self) -> _ModelInfo:
         raise NotImplementedError
@@ -427,6 +577,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
     """
     Represents a model that has not been imported in the main process.
     """
+
     module_name: str
     class_name: str
 
@@ -438,38 +589,42 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
         cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-")
         return f"{cls_name}.json"
 
-    def _load_modelinfo_from_cache(self,
-                                   module_hash: str) -> _ModelInfo | None:
+    def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None:
         try:
             try:
-                modelinfo_path = self._get_cache_dir(
-                ) / self._get_cache_filename()
+                modelinfo_path = self._get_cache_dir() / self._get_cache_filename()
                 with open(modelinfo_path, encoding="utf-8") as file:
                     mi_dict = json.load(file)
             except FileNotFoundError:
-                logger.debug(("Cached model info file "
-                              "for class %s.%s not found"), self.module_name,
-                             self.class_name)
+                logger.debug(
+                    "Cached model info file for class %s.%s not found",
+                    self.module_name,
+                    self.class_name,
+                )
                 return None
 
             if mi_dict["hash"] != module_hash:
-                logger.debug(("Cached model info file "
-                              "for class %s.%s is stale"), self.module_name,
-                             self.class_name)
+                logger.debug(
+                    "Cached model info file for class %s.%s is stale",
+                    self.module_name,
+                    self.class_name,
+                )
                 return None
 
             # file not changed, use cached _ModelInfo properties
             return _ModelInfo(**mi_dict["modelinfo"])
         except Exception:
-            logger.exception(("Cached model info "
-                              "for class %s.%s error. "), self.module_name,
-                             self.class_name)
+            logger.debug(
+                "Cached model info for class %s.%s error. ",
+                self.module_name,
+                self.class_name,
+            )
             return None
 
-    def _save_modelinfo_to_cache(self, mi: _ModelInfo,
-                                 module_hash: str) -> None:
+    def _save_modelinfo_to_cache(self, mi: _ModelInfo, module_hash: str) -> None:
         """save dictionary json file to cache"""
         from vllm.model_executor.model_loader.weight_utils import atomic_writer
+
         try:
             modelinfo_dict = {
                 "hash": module_hash,
@@ -478,38 +633,42 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
             cache_dir = self._get_cache_dir()
             cache_dir.mkdir(parents=True, exist_ok=True)
             modelinfo_path = cache_dir / self._get_cache_filename()
-            with atomic_writer(modelinfo_path, encoding='utf-8') as f:
+            with atomic_writer(modelinfo_path, encoding="utf-8") as f:
                 json.dump(modelinfo_dict, f, indent=2)
         except Exception:
             logger.exception("Error saving model info cache.")
 
     @logtime(logger=logger, msg="Registry inspect model class")
     def inspect_model_cls(self) -> _ModelInfo:
-        model_path = Path(
-            __file__).parent / f"{self.module_name.split('.')[-1]}.py"
+        model_path = Path(__file__).parent / f"{self.module_name.split('.')[-1]}.py"
         module_hash = None
 
         if model_path.exists():
             with open(model_path, "rb") as f:
-                module_hash = hashlib.md5(f.read()).hexdigest()
+                module_hash = hashlib.md5(f.read(), usedforsecurity=False).hexdigest()
 
             mi = self._load_modelinfo_from_cache(module_hash)
             if mi is not None:
-                logger.debug(("Loaded model info "
-                              "for class %s.%s from cache"), self.module_name,
-                             self.class_name)
+                logger.debug(
+                    "Loaded model info for class %s.%s from cache",
+                    self.module_name,
+                    self.class_name,
+                )
                 return mi
             else:
-                logger.debug(("Cache model info "
-                              "for class %s.%s miss. "
-                              "Loading model instead."), self.module_name,
-                             self.class_name)
+                logger.debug(
+                    "Cache model info for class %s.%s miss. Loading model instead.",
+                    self.module_name,
+                    self.class_name,
+                )
 
         # Performed in another process to avoid initializing CUDA
         mi = _run_in_subprocess(
-            lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
-        logger.debug("Loaded model info for class %s.%s", self.module_name,
-                     self.class_name)
+            lambda: _ModelInfo.from_model_cls(self.load_model_cls())
+        )
+        logger.debug(
+            "Loaded model info for class %s.%s", self.module_name, self.class_name
+        )
 
         # save cache file
         if module_hash is not None:
@@ -526,14 +685,14 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
 def _try_load_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
-) -> Optional[type[nn.Module]]:
+) -> type[nn.Module] | None:
     from vllm.platforms import current_platform
+
     current_platform.verify_model_arch(model_arch)
     try:
         return model.load_model_cls()
     except Exception:
-        logger.exception("Error in loading model architecture '%s'",
-                         model_arch)
+        logger.exception("Error in loading model architecture '%s'", model_arch)
         return None
 
 
@@ -541,12 +700,11 @@ def _try_load_model_cls(
 def _try_inspect_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
-) -> Optional[_ModelInfo]:
+) -> _ModelInfo | None:
     try:
         return model.inspect_model_cls()
     except Exception:
-        logger.exception("Error in inspecting model architecture '%s'",
-                         model_arch)
+        logger.exception("Error in inspecting model architecture '%s'", model_arch)
         return None
 
 
@@ -561,7 +719,7 @@ class _ModelRegistry:
     def register_model(
         self,
         model_arch: str,
-        model_cls: Union[type[nn.Module], str],
+        model_cls: type[nn.Module] | str,
     ) -> None:
         """
         Register an external model to be used in vLLM.
@@ -581,8 +739,10 @@ class _ModelRegistry:
         if model_arch in self.models:
             logger.warning(
                 "Model architecture %s is already registered, and will be "
-                "overwritten by the new model class %s.", model_arch,
-                model_cls)
+                "overwritten by the new model class %s.",
+                model_arch,
+                model_cls,
+            )
 
         if isinstance(model_cls, str):
             split_str = model_cls.split(":")
@@ -594,8 +754,10 @@ class _ModelRegistry:
         elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module):
             model = _RegisteredModel.from_model_cls(model_cls)
         else:
-            msg = ("`model_cls` should be a string or PyTorch model class, "
-                   f"not a {type(model_arch)}")
+            msg = (
+                "`model_cls` should be a string or PyTorch model class, "
+                f"not a {type(model_arch)}"
+            )
             raise TypeError(msg)
 
         self.models[model_arch] = model
@@ -606,7 +768,8 @@ class _ModelRegistry:
         if any(arch in all_supported_archs for arch in architectures):
             raise ValueError(
                 f"Model architectures {architectures} failed "
-                "to be inspected. Please check the logs for more details.")
+                "to be inspected. Please check the logs for more details."
+            )
 
         for arch in architectures:
             if arch in _PREVIOUSLY_SUPPORTED_MODELS:
@@ -616,20 +779,21 @@ class _ModelRegistry:
                     f"Model architecture {arch} was supported in vLLM until "
                     f"v{previous_version}, and is not supported anymore. "
                     "Please use an older version of vLLM if you want to "
-                    "use this model architecture.")
+                    "use this model architecture."
+                )
 
         raise ValueError(
             f"Model architectures {architectures} are not supported for now. "
-            f"Supported architectures: {all_supported_archs}")
+            f"Supported architectures: {all_supported_archs}"
+        )
 
-    def _try_load_model_cls(self,
-                            model_arch: str) -> Optional[type[nn.Module]]:
+    def _try_load_model_cls(self, model_arch: str) -> type[nn.Module] | None:
         if model_arch not in self.models:
             return None
 
         return _try_load_model_cls(model_arch, self.models[model_arch])
 
-    def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
+    def _try_inspect_model_cls(self, model_arch: str) -> _ModelInfo | None:
         if model_arch not in self.models:
             return None
 
@@ -639,12 +803,13 @@ class _ModelRegistry:
         self,
         architecture: str,
         model_config: ModelConfig,
-    ) -> Optional[str]:
+    ) -> str | None:
         if architecture in _TRANSFORMERS_BACKEND_MODELS:
             return architecture
 
-        auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
-                                           None) or dict()
+        auto_map: dict[str, str] = (
+            getattr(model_config.hf_config, "auto_map", None) or dict()
+        )
 
         # Make sure that config class is always initialized before model class,
         # otherwise the model class won't be able to access the config class,
@@ -686,7 +851,8 @@ class _ModelRegistry:
                     "registered model in the Transformers library (only "
                     "relevant if the model is meant to be in Transformers) "
                     "and 'AutoModel' is not present in the model config's "
-                    "'auto_map' (relevant if the model is custom).")
+                    "'auto_map' (relevant if the model is custom)."
+                )
 
         if not model_module.is_backend_compatible():
             if model_config.model_impl != "transformers":
@@ -694,7 +860,8 @@ class _ModelRegistry:
 
             raise ValueError(
                 f"The Transformers implementation of {architecture!r} "
-                "is not compatible with vLLM.")
+                "is not compatible with vLLM."
+            )
 
         return model_config._get_transformers_backend_cls()
 
@@ -726,7 +893,7 @@ class _ModelRegistry:
 
     def inspect_model_cls(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> tuple[_ModelInfo, str]:
         if isinstance(architectures, str):
@@ -736,8 +903,7 @@ class _ModelRegistry:
 
         # Require transformers impl
         if model_config.model_impl == "transformers":
-            arch = self._try_resolve_transformers(architectures[0],
-                                                  model_config)
+            arch = self._try_resolve_transformers(architectures[0], model_config)
             if arch is not None:
                 model_info = self._try_inspect_model_cls(arch)
                 if model_info is not None:
@@ -747,11 +913,12 @@ class _ModelRegistry:
             return (model_info, "Terratorch")
 
         # Fallback to transformers impl (after resolving convert_type)
-        if (all(arch not in self.models for arch in architectures)
-                and model_config.model_impl == "auto"
-                and getattr(model_config, "convert_type", "none") == "none"):
-            arch = self._try_resolve_transformers(architectures[0],
-                                                  model_config)
+        if (
+            all(arch not in self.models for arch in architectures)
+            and model_config.model_impl == "auto"
+            and getattr(model_config, "convert_type", "none") == "none"
+        ):
+            arch = self._try_resolve_transformers(architectures[0], model_config)
             if arch is not None:
                 model_info = self._try_inspect_model_cls(arch)
                 if model_info is not None:
@@ -764,10 +931,11 @@ class _ModelRegistry:
                 return (model_info, arch)
 
         # Fallback to transformers impl (before resolving runner_type)
-        if (all(arch not in self.models for arch in architectures)
-                and model_config.model_impl == "auto"):
-            arch = self._try_resolve_transformers(architectures[0],
-                                                  model_config)
+        if (
+            all(arch not in self.models for arch in architectures)
+            and model_config.model_impl == "auto"
+        ):
+            arch = self._try_resolve_transformers(architectures[0], model_config)
             if arch is not None:
                 model_info = self._try_inspect_model_cls(arch)
                 if model_info is not None:
@@ -777,7 +945,7 @@ class _ModelRegistry:
 
     def resolve_model_cls(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> tuple[type[nn.Module], str]:
         if isinstance(architectures, str):
@@ -787,8 +955,7 @@ class _ModelRegistry:
 
         # Require transformers impl
         if model_config.model_impl == "transformers":
-            arch = self._try_resolve_transformers(architectures[0],
-                                                  model_config)
+            arch = self._try_resolve_transformers(architectures[0], model_config)
             if arch is not None:
                 model_cls = self._try_load_model_cls(arch)
                 if model_cls is not None:
@@ -800,11 +967,12 @@ class _ModelRegistry:
                 return (model_cls, arch)
 
         # Fallback to transformers impl (after resolving convert_type)
-        if (all(arch not in self.models for arch in architectures)
-                and model_config.model_impl == "auto"
-                and getattr(model_config, "convert_type", "none") == "none"):
-            arch = self._try_resolve_transformers(architectures[0],
-                                                  model_config)
+        if (
+            all(arch not in self.models for arch in architectures)
+            and model_config.model_impl == "auto"
+            and getattr(model_config, "convert_type", "none") == "none"
+        ):
+            arch = self._try_resolve_transformers(architectures[0], model_config)
             if arch is not None:
                 model_cls = self._try_load_model_cls(arch)
                 if model_cls is not None:
@@ -817,10 +985,11 @@ class _ModelRegistry:
                 return (model_cls, arch)
 
         # Fallback to transformers impl (before resolving runner_type)
-        if (all(arch not in self.models for arch in architectures)
-                and model_config.model_impl == "auto"):
-            arch = self._try_resolve_transformers(architectures[0],
-                                                  model_config)
+        if (
+            all(arch not in self.models for arch in architectures)
+            and model_config.model_impl == "auto"
+        ):
+            arch = self._try_resolve_transformers(architectures[0], model_config)
             if arch is not None:
                 model_cls = self._try_load_model_cls(arch)
                 if model_cls is not None:
@@ -830,7 +999,7 @@ class _ModelRegistry:
 
     def is_text_generation_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -838,7 +1007,7 @@ class _ModelRegistry:
 
     def is_pooling_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -846,7 +1015,7 @@ class _ModelRegistry:
 
     def is_cross_encoder_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -854,7 +1023,7 @@ class _ModelRegistry:
 
     def is_multimodal_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -862,7 +1031,7 @@ class _ModelRegistry:
 
     def is_multimodal_raw_input_only_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -870,7 +1039,7 @@ class _ModelRegistry:
 
     def is_pp_supported_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -878,7 +1047,7 @@ class _ModelRegistry:
 
     def model_has_inner_state(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -886,7 +1055,7 @@ class _ModelRegistry:
 
     def is_attention_free_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -894,7 +1063,7 @@ class _ModelRegistry:
 
     def is_hybrid_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -902,7 +1071,7 @@ class _ModelRegistry:
 
     def is_noops_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -910,7 +1079,7 @@ class _ModelRegistry:
 
     def is_transcription_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -918,29 +1087,22 @@ class _ModelRegistry:
 
     def is_transcription_only_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
         return model_cls.supports_transcription_only
 
-    def is_v1_compatible(
-        self,
-        architectures: Union[str, list[str]],
-        model_config: ModelConfig,
-    ) -> bool:
-        model_cls, _ = self.inspect_model_cls(architectures, model_config)
-        return not model_cls.supports_v0_only
 
-
-ModelRegistry = _ModelRegistry({
-    model_arch:
-    _LazyRegisteredModel(
-        module_name=f"vllm.model_executor.models.{mod_relname}",
-        class_name=cls_name,
-    )
-    for model_arch, (mod_relname, cls_name) in _VLLM_MODELS.items()
-})
+ModelRegistry = _ModelRegistry(
+    {
+        model_arch: _LazyRegisteredModel(
+            module_name=f"vllm.model_executor.models.{mod_relname}",
+            class_name=cls_name,
+        )
+        for model_arch, (mod_relname, cls_name) in _VLLM_MODELS.items()
+    }
+)
 
 _T = TypeVar("_T")
 
@@ -953,21 +1115,23 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
 
         # `cloudpickle` allows pickling lambda functions directly
         import cloudpickle
+
         input_bytes = cloudpickle.dumps((fn, output_filepath))
 
         # cannot use `sys.executable __file__` here because the script
         # contains relative imports
-        returned = subprocess.run(_SUBPROCESS_COMMAND,
-                                  input=input_bytes,
-                                  capture_output=True)
+        returned = subprocess.run(
+            _SUBPROCESS_COMMAND, input=input_bytes, capture_output=True
+        )
 
         # check if the subprocess is successful
         try:
             returned.check_returncode()
         except Exception as e:
             # wrap raised exception to provide more information
-            raise RuntimeError(f"Error raised in subprocess:\n"
-                               f"{returned.stderr.decode()}") from e
+            raise RuntimeError(
+                f"Error raised in subprocess:\n{returned.stderr.decode()}"
+            ) from e
 
         with open(output_filepath, "rb") as f:
             return pickle.load(f)
@@ -976,6 +1140,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
 def _run() -> None:
     # Setup plugins
     from vllm.plugins import load_general_plugins
+
     load_general_plugins()
 
     fn, output_file = pickle.loads(sys.stdin.buffer.read())
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index ba405be41687662da5c56e225e37d7267d1155a5..31cc645099141197e8a621232c524aaa1702798e 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -2,23 +2,31 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
 from transformers import RobertaConfig
 
 from vllm.config import ModelConfig, VllmConfig
-from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool,
-                                               DispatchPooler, Pooler)
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
-from vllm.model_executor.models.bert import (TOKEN_TYPE_SHIFT,
-                                             BertEmbeddingModel, BertModel,
-                                             _decode_token_type_ids,
-                                             _encode_token_type_ids)
-from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
-                                              maybe_prefix)
+from vllm.model_executor.layers.pooler import (
+    ClassifierPooler,
+    CLSPool,
+    DispatchPooler,
+    Pooler,
+)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.models.bert import (
+    TOKEN_TYPE_SHIFT,
+    BertEmbeddingModel,
+    BertModel,
+    _decode_token_type_ids,
+    _encode_token_type_ids,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    maybe_prefix,
+)
 from vllm.sequence import IntermediateTensors
 
 from .bert_with_rope import BertWithRope, JinaRobertaModel
@@ -27,21 +35,23 @@ from .interfaces_base import default_pooling_type
 
 
 class RobertaEmbedding(nn.Module):
-
     def __init__(self, config: RobertaConfig):
         super().__init__()
         self.size = config.hidden_size
-        self.word_embeddings = VocabParallelEmbedding(config.vocab_size,
-                                                      config.hidden_size)
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
         self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
-                                                config.hidden_size,
-                                                padding_idx=self.padding_idx)
-
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
-                                                  config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.token_type_embeddings = nn.Embedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.register_buffer(
             "position_ids",
             torch.arange(config.max_position_embeddings).unsqueeze(0),
@@ -49,18 +59,21 @@ class RobertaEmbedding(nn.Module):
 
         self.position_embedding_type = config.position_embedding_type
         if self.position_embedding_type != "absolute":
-            raise ValueError("Only 'absolute' position_embedding_type" +
-                             " is supported")
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         token_type_ids = _decode_token_type_ids(input_ids)
 
-        inputs_embeds = self.word_embeddings(input_ids)
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
         position_embeddings = self.position_embeddings(position_ids)
 
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
@@ -77,12 +90,10 @@ class RobertaClassificationHead(nn.Module):
         super().__init__()
         config = model_config.hf_config
         head_dtype = model_config.head_dtype
-        self.dense = nn.Linear(config.hidden_size,
-                               config.hidden_size,
-                               dtype=head_dtype)
-        self.out_proj = nn.Linear(config.hidden_size,
-                                  config.num_labels,
-                                  dtype=head_dtype)
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size, dtype=head_dtype)
+        self.out_proj = nn.Linear(
+            config.hidden_size, config.num_labels, dtype=head_dtype
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # CLSPool has already been applied in `pooling`
@@ -94,15 +105,7 @@ class RobertaClassificationHead(nn.Module):
 
 @default_pooling_type("CLS")
 class RobertaEmbeddingModel(BertEmbeddingModel):
-    """A model that uses Roberta to provide embedding functionalities.
-
-   This class encapsulates the BertModel and provides an interface for
-   embedding operations and customized pooling functions.
-
-   Attributes:
-       model: An instance of BertModel used for forward operations.
-       _pooler: An instance of Pooler used for pooling operations.
-   """
+    """A model that uses Roberta to provide embedding functionalities."""
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
@@ -112,37 +115,38 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
-
         # Fix Roberta positions here outside of the CUDA graph.
         # Because we need the to extract the sequences from
         # input_ids the control flow is data dependent.
-        replace_roberta_positions(input_ids=input_ids,
-                                  position_ids=positions,
-                                  padding_idx=self.padding_idx)
-
-        return self.model(input_ids=input_ids,
-                          positions=positions,
-                          inputs_embeds=inputs_embeds,
-                          intermediate_tensors=intermediate_tensors)
-
-    def _build_model(self,
-                     vllm_config: VllmConfig,
-                     prefix: str = "") -> Union[BertModel, BertWithRope]:
-        if (vllm_config.model_config.hf_config.position_embedding_type ==
-                "rotary"):
+        replace_roberta_positions(
+            input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
+        )
+
+        return self.model(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
+
+    def _build_model(
+        self, vllm_config: VllmConfig, prefix: str = ""
+    ) -> BertModel | BertWithRope:
+        if vllm_config.model_config.hf_config.position_embedding_type == "rotary":
             return JinaRobertaModel(vllm_config=vllm_config, prefix=prefix)
         else:
-            return BertModel(vllm_config=vllm_config,
-                             prefix=prefix,
-                             embedding_class=RobertaEmbedding)
+            return BertModel(
+                vllm_config=vllm_config, prefix=prefix, embedding_class=RobertaEmbedding
+            )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         weights_list = list(weights)
         has_roberta_prefix = any(
-            name.startswith("roberta.") for name, _ in weights_list)
+            name.startswith("roberta.") for name, _ in weights_list
+        )
         if has_roberta_prefix:
             # For models with the `roberta.` prefix e.g.
             # `FacebookAI/roberta-base`
@@ -160,26 +164,27 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
 class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
     """A model that uses Roberta to provide embedding functionalities.
 
-   This class encapsulates the BertModel and provides an interface for
-   embedding operations and customized pooling functions.
+    This class encapsulates the BertModel and provides an interface for
+    embedding operations and customized pooling functions.
 
-   Attributes:
-       roberta: An instance of BertModel used for forward operations.
-       _pooler: An instance of Pooler used for pooling operations.
-   """
+    Attributes:
+        roberta: An instance of BertModel used for forward operations.
+        _pooler: An instance of Pooler used for pooling operations.
+    """
 
     is_pooling_model = True
     jina_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
-            'emb_ln': "embeddings.LayerNorm",
-            'layers': "layer",
-            'mixer.Wqkv': "attention.self.qkv_proj",
-            'mixer.out_proj': "attention.output.dense",
-            'norm1': "attention.output.LayerNorm",
-            'mlp.fc1': "intermediate.dense",
-            'mlp.fc2': "output.dense",
-            'norm2': "output.LayerNorm",
-        })
+            "emb_ln": "embeddings.LayerNorm",
+            "layers": "layer",
+            "mixer.Wqkv": "attention.self.qkv_proj",
+            "mixer.out_proj": "attention.output.dense",
+            "norm1": "attention.output.LayerNorm",
+            "mlp.fc1": "intermediate.dense",
+            "mlp.fc2": "output.dense",
+            "norm2": "output.LayerNorm",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -187,61 +192,63 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
         self.padding_idx: int = vllm_config.model_config.hf_config.pad_token_id
 
         self.num_labels = config.num_labels
-        self.roberta = BertModel(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "bert"),
-                                 embedding_class=RobertaEmbedding)
+        self.roberta = BertModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "bert"),
+            embedding_class=RobertaEmbedding,
+        )
         self.classifier = RobertaClassificationHead(vllm_config.model_config)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({
-            "encode":
-            Pooler.for_encode(pooler_config),
-            "classify":
-            ClassifierPooler(
-                pooling=CLSPool(),
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                    vllm_config.model_config),
-            ),
-            "score":
-            ClassifierPooler(
-                pooling=CLSPool(),
-                classifier=self.classifier,
-                act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                    vllm_config.model_config),
-            ),
-        })
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config=pooler_config, classifier=self.classifier
+                ),
+                "classify": ClassifierPooler(
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="classify"
+                ),
+                "score": ClassifierPooler(
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="score"
+                ),
+            }
+        )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.jina_to_vllm_mapper)
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.roberta.embed_input_ids(input_ids)
+
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        replace_roberta_positions(input_ids=input_ids,
-                                  position_ids=positions,
-                                  padding_idx=self.padding_idx)
+        replace_roberta_positions(
+            input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
+        )
         if token_type_ids is not None:
             assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
             assert input_ids is not None
             _encode_token_type_ids(input_ids, token_type_ids)
-        return self.roberta(input_ids=input_ids,
-                            positions=positions,
-                            inputs_embeds=inputs_embeds,
-                            intermediate_tensors=intermediate_tensors)
+        return self.roberta(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
 
 
-def replace_roberta_positions(input_ids: torch.Tensor,
-                              position_ids: torch.Tensor,
-                              padding_idx: int) -> None:
+def replace_roberta_positions(
+    input_ids: torch.Tensor, position_ids: torch.Tensor, padding_idx: int
+) -> None:
     # Replace position ids because in RoBERTa models
     # they have to start at padding_idx + 1 and ignore
     # existing padding tokens
diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py
index efdb010046634a4e8a83afc04660863214a7db28..92352febe87ec2613e91e67baa57294716b7e4a0 100644
--- a/vllm/model_executor/models/rvl.py
+++ b/vllm/model_executor/models/rvl.py
@@ -8,17 +8,20 @@ import torch.nn as nn
 from transformers.activations import GELUActivation
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict
 
-from .llava_next import (LlavaDummyInputsBuilder, LlavaNextMultiModalProcessor,
-                         LlavaNextProcessingInfo)
+from .llava_next import (
+    LlavaDummyInputsBuilder,
+    LlavaNextMultiModalProcessor,
+    LlavaNextProcessingInfo,
+)
 from .llava_onevision import LlavaOnevisionForConditionalGeneration
 from .utils import WeightsMapper
 
 
 class RVLProcessingInfo(LlavaNextProcessingInfo):
-
     def get_hf_config(self):
         return self.ctx.get_hf_config()
 
@@ -27,7 +30,6 @@ class RVLProcessingInfo(LlavaNextProcessingInfo):
 
 
 class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         image_token = "<image>"
@@ -38,26 +40,28 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = (
-            self.info.get_image_size_with_most_features())
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
         }
 
 
 class RVLMultiModalProjector(nn.Module):
-
     def __init__(self, config):
         super().__init__()
-        self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size,
-                                     eps=1e-06)
+        self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=1e-06)
         self.linear_1 = nn.Linear(
             config.vision_config.hidden_size,
             config.text_config.hidden_size,
@@ -85,7 +89,6 @@ class RVLMultiModalProjector(nn.Module):
     dummy_inputs=RVLDummyInputsBuilder,
 )
 class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration):
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers
@@ -95,7 +98,8 @@ class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration):
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.image_newline": "image_newline",
             "lm_head.": "language_model.lm_head.",
-        })
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index a217c820fedf0da8ec407f6c070a10a56f560532..bf211d28f1844e47c7d03581ee4af3b3f2b17fd2 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -22,9 +22,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only SeedOss model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -37,34 +37,44 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class SeedOssMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -83,8 +93,9 @@ class SeedOssMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -95,7 +106,6 @@ class SeedOssMLP(nn.Module):
 
 
 class SeedOssAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -104,9 +114,9 @@ class SeedOssAttention(nn.Module):
         head_dim: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -181,12 +191,11 @@ class SeedOssAttention(nn.Module):
 
 
 class SeedOssDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: SeedOssConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -224,32 +233,30 @@ class SeedOssDecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
@@ -260,14 +267,16 @@ class SeedOssDecoderLayer(nn.Module):
         "positions": -1,
         "intermediate_tensors": 0,
         "inputs_embeds": 0,
-    })
+    }
+)
 class SeedOssModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 decoder_layer_type: type[nn.Module] = SeedOssDecoderLayer):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = SeedOssDecoderLayer,
+    ):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
@@ -275,8 +284,9 @@ class SeedOssModel(nn.Module):
         quant_config = vllm_config.quant_config
 
         # TODO (@robertgshaw2): see if this can be moved out
-        if (cache_config.sliding_window is not None
-                and hasattr(config, "max_window_layers")):
+        if cache_config.sliding_window is not None and hasattr(
+            config, "max_window_layers"
+        ):
             assert config.max_window_layers == config.num_hidden_layers, (
                 "Sliding window for some but all layers is not supported. "
                 "This model uses sliding window but `max_window_layers` = {} "
@@ -284,14 +294,16 @@ class SeedOssModel(nn.Module):
                 "to discuss this feature.".format(
                     config.max_window_layers,
                     config.num_hidden_layers,
-                ))
+                )
+            )
 
         self.config = config
         self.quant_config = quant_config
         self.vocab_size = config.vocab_size
 
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 config.vocab_size,
                 config.hidden_size,
@@ -305,36 +317,38 @@ class SeedOssModel(nn.Module):
         decoder_layer_type = decoder_layer_type or SeedOssDecoderLayer
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: decoder_layer_type(config=config,
-                                              cache_config=cache_config,
-                                              quant_config=quant_config,
-                                              prefix=prefix),
+            lambda prefix: decoder_layer_type(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
         if get_pp_group().is_last_rank:
             self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.norm = PPMissingLayer()
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -347,15 +361,13 @@ class SeedOssModel(nn.Module):
                 residual,
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -369,18 +381,19 @@ class SeedOssModel(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -404,8 +417,7 @@ class SeedOssModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -428,58 +440,58 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
-        self.lora_config = lora_config
 
         self.quant_config = quant_config
-        self.model = SeedOssModel(vllm_config=vllm_config,
-                                  prefix=maybe_prefix(prefix, "model"))
+        self.model = SeedOssModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
             else:
-                self.lm_head = ParallelLMHead(config.vocab_size,
-                                              config.hidden_size,
-                                              quant_config=quant_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "lm_head"))
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "lm_head"),
+                )
         else:
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index eb49d6d2c33505a81bd85b217252a5c8fd282c9e..b175dd60cf6503971501e813570ce807d17b9aed 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -4,37 +4,267 @@
 within a vision language model."""
 
 import math
-from collections.abc import Iterable
-from typing import Optional, Union
+from collections.abc import Iterable, Mapping
+from functools import cached_property
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
-from transformers import SiglipVisionConfig
+from transformers import (
+    BatchFeature,
+    SiglipConfig,
+    SiglipProcessor,
+    SiglipTextConfig,
+    SiglipVisionConfig,
+)
 
 from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+)
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptIndexTargets,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
+from .interfaces_base import default_pooling_type
+from .utils import AutoWeightsLoader, maybe_prefix
+from .vision import (
+    VisionEncoderInfo,
+    VisionFeatureSelectStrategy,
+    VisionFeatureSelectStrategyStr,
+    get_num_selected_vision_tokens,
+    resolve_visual_encoder_outputs,
+)
+
+
+class SiglipImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each image
+        - w: Width of each image
+    """
+
+    type: Literal["pixel_values"]
+    data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
+
+
+_POOLING_TYPE_TO_STRATEGY: dict[str, VisionFeatureSelectStrategyStr] = {
+    "MEAN": "full",
+    "ALL": "full",
+    "CLS": "class",
+}
+
+
+def _get_vision_feature_select_strategy(
+    pooling_type: str,
+) -> VisionFeatureSelectStrategyStr:
+    try:
+        return _POOLING_TYPE_TO_STRATEGY[pooling_type]
+    except KeyError:
+        raise ValueError(
+            f"No feature selection strategy is defined for "
+            f"pooling_type: {pooling_type!r}"
+        ) from None
+
+
+class SiglipProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(SiglipConfig)
+
+    def get_vision_encoder_info(self):
+        return SiglipEncoderInfo(self.get_hf_config())
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(SiglipProcessor, **kwargs)
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": 1}
 
-from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        vision_encoder_info = self.get_vision_encoder_info()
 
+        pooler_config = self.ctx.model_config.pooler_config
+        assert pooler_config is not None
 
-class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
+        return get_num_selected_vision_tokens(
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+            _get_vision_feature_select_strategy(pooler_config.pooling_type),
+        )
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width, image_height=target_height
+        )
+
+
+class SiglipDummyInputsBuilder(BaseDummyInputsBuilder[SiglipProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
+        }
+
+
+class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
+    @cached_property
+    def image_token_id(self) -> int:
+        tokenizer = self.info.get_tokenizer()
+        dummy_token_id = next(
+            token_id
+            for token_id in range(tokenizer.vocab_size)
+            if token_id not in tokenizer.all_special_ids
+        )
+
+        return dummy_token_id
+
+    def apply(
+        self,
+        prompt: str | list[int],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        *,
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> MultiModalInputs:
+        if prompt and mm_data:
+            raise ValueError(
+                "Siglip accepts text-only or image-only inputs, not both! "
+                "Image-only inputs means passing an image with an empty text "
+                "prompt."
+            )
+
+        if mm_data:
+            # For multi-modal data, the prompt after processing should
+            # only contain the image token
+            tokenization_kwargs = {
+                **(tokenization_kwargs or {}),
+                "add_special_tokens": False,
+            }
+
+        return super().apply(
+            prompt=prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
+
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        return False
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values=MultiModalFieldConfig.batched("image"))
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> list[PromptUpdate]:
+        image_token_id = self.image_token_id
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+
+            num_image_tokens = self.info.get_num_image_tokens(
+                image_width=image_size.width, image_height=image_size.height
+            )
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=PromptIndexTargets.start(),
+                replacement=get_replacement,
+            ),
+        ]
 
+
+class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
     def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> int:
-        return self.get_patch_grid_length()**2
+        return self.get_patch_grid_length() ** 2
 
     def get_image_size(self) -> int:
         return self.vision_config.image_size
@@ -49,7 +279,6 @@ class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
 class SiglipVisionEmbeddings(nn.Module):
-
     def __init__(self, config: SiglipVisionConfig):
         super().__init__()
         self.config = config
@@ -65,19 +294,20 @@ class SiglipVisionEmbeddings(nn.Module):
             padding="valid",
         )
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = VocabParallelEmbedding(
-            self.num_positions, self.embed_dim)
+            self.num_positions, self.embed_dim
+        )
         self.register_buffer(
             "position_ids",
-            torch.arange(self.num_positions, dtype=torch.int64).expand(
-                (1, -1)),
+            torch.arange(self.num_positions, dtype=torch.int64).expand((1, -1)),
             persistent=False,
         )
 
-    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int,
-                                 width: int) -> torch.Tensor:
+    def interpolate_pos_encoding(
+        self, embeddings: torch.Tensor, height: int, width: int
+    ) -> torch.Tensor:
         """
         This method is an adapted method for SigLIP (due to SigLIP not having
         class embedding unlike other ViTs) that allows the model to interpolate
@@ -102,8 +332,8 @@ class SiglipVisionEmbeddings(nn.Module):
         height, width = height + 0.1, width + 0.1
 
         patch_pos_embed = position_embeddings.reshape(
-            1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)),
-            dim)
+            1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
+        )
         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
         patch_pos_embed = nn.functional.interpolate(
             patch_pos_embed,
@@ -114,38 +344,43 @@ class SiglipVisionEmbeddings(nn.Module):
             mode="bicubic",
             align_corners=False,
         )
-        if (int(height) != patch_pos_embed.shape[-2]
-                or int(width) != patch_pos_embed.shape[-1]):
-            raise ValueError("Width or height does not match with "
-                             "the interpolated position embeddings")
+        if (
+            int(height) != patch_pos_embed.shape[-2]
+            or int(width) != patch_pos_embed.shape[-1]
+        ):
+            raise ValueError(
+                "Width or height does not match with "
+                "the interpolated position embeddings"
+            )
 
         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
         return patch_pos_embed
 
-    def forward(self,
-                pixel_values: torch.Tensor,
-                interpolate_pos_encoding: bool = False) -> torch.Tensor:
+    def forward(
+        self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False
+    ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(
-            dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
         if interpolate_pos_encoding:
-            embeddings += self.interpolate_pos_encoding(
-                embeddings, height, width)
+            embeddings += self.interpolate_pos_encoding(embeddings, height, width)
         else:
             embeddings += self.position_embedding(self.position_ids)
         return embeddings
 
 
 class SiglipAttention(nn.Module):
-
     def __init__(
         self,
-        config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: SiglipVisionConfig | SiglipTextConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
         prefix: str = "",
+        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -154,9 +389,11 @@ class SiglipAttention(nn.Module):
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
-            raise ValueError(f"embed_dim must be divisible by num_heads (got "
-                             "`embed_dim`: {self.embed_dim} and `num_heads`:"
-                             f" {self.num_heads}).")
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got "
+                "`embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
 
         self.scale = self.head_dim**-0.5
         self.dropout = config.attention_dropout
@@ -178,17 +415,20 @@ class SiglipAttention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = attn_cls(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, None]:
         """Input shape: Batch x Time x Channel"""
         qkv_states, _ = self.qkv_proj(hidden_states)
         query_states, key_states, value_states = qkv_states.chunk(3, dim=-1)
-
         out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.out_proj(out)
 
@@ -196,11 +436,10 @@ class SiglipAttention(nn.Module):
 
 
 class SiglipMLP(nn.Module):
-
     def __init__(
         self,
-        config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: SiglipVisionConfig | SiglipTextConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -208,15 +447,14 @@ class SiglipMLP(nn.Module):
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
         # Special handling for BNB and torchao quantization
-        if quant_config and quant_config.get_name() in [
-                "bitsandbytes", "torchao"
-        ]:
+        if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
             quantizable = True
         else:
             # For other quantization, we require the hidden size to be a
             # multiple of 64
-            quantizable = (config.hidden_size % 64 == 0
-                           and config.intermediate_size % 64 == 0)
+            quantizable = (
+                config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
+            )
         self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
@@ -238,12 +476,13 @@ class SiglipMLP(nn.Module):
 
 
 class SiglipEncoderLayer(nn.Module):
-
     def __init__(
         self,
-        config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: SiglipVisionConfig | SiglipTextConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
         prefix: str = "",
+        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -253,16 +492,15 @@ class SiglipEncoderLayer(nn.Module):
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
+            attn_cls=attn_cls,
         )
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = SiglipMLP(
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
         )
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
 
     def forward(
         self,
@@ -283,13 +521,14 @@ class SiglipEncoderLayer(nn.Module):
 
 
 class SiglipEncoder(nn.Module):
-
     def __init__(
         self,
-        config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        num_hidden_layers_override: Optional[int] = None,
+        config: SiglipVisionConfig | SiglipTextConfig,
+        quant_config: QuantizationConfig | None = None,
+        num_hidden_layers_override: int | None = None,
+        *,
         prefix: str = "",
+        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -300,18 +539,23 @@ class SiglipEncoder(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layers = nn.ModuleList([
-            SiglipEncoderLayer(config,
-                               quant_config=quant_config,
-                               prefix=f"{prefix}.layers.{layer_idx}")
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    attn_cls=attn_cls,
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
         inputs_embeds: torch.Tensor,
         return_all_hidden_states: bool,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         hidden_states_pool = [inputs_embeds]
         hidden_states = inputs_embeds
 
@@ -326,13 +570,84 @@ class SiglipEncoder(nn.Module):
         return hidden_states
 
 
+class SiglipTextTransformer(nn.Module):
+    def __init__(
+        self,
+        config: SiglipTextConfig,
+        quant_config: QuantizationConfig | None = None,
+        *,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipTextEmbeddings(config)
+
+        self.encoder = SiglipEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+            attn_cls=EncoderOnlyAttention,
+        )
+
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.head = nn.Linear(embed_dim, config.projection_size)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings.token_embedding(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(input_ids, position_ids, inputs_embeds)
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states, return_all_hidden_states=False
+        )
+
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        return last_hidden_state
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
 class SiglipMultiheadAttentionPoolingHead(nn.Module):
     """Multihead Attention Pooling."""
 
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -340,16 +655,17 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
         # TODO(ChristopherCho): Implement vLLM version of MultiheadAttention
         self.attention = torch.nn.MultiheadAttention(
-            config.hidden_size, config.num_attention_heads, batch_first=True)
-        self.layernorm = nn.LayerNorm(config.hidden_size,
-                                      eps=config.layer_norm_eps)
-        self.mlp = SiglipMLP(config=config,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.mlp")
+            config.hidden_size, config.num_attention_heads, batch_first=True
+        )
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(
+            config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
 
     def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
-        batch_size = hidden_state.shape[0]
-        probe = self.probe.repeat(batch_size, 1, 1)
+        batch_size = hidden_state.size(0)
+
+        probe = self.probe.expand(batch_size, -1, -1)
 
         hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
 
@@ -358,18 +674,19 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         hidden_state = self.mlp(hidden_state)
         hidden_state += residual
 
-        return hidden_state[:, 0]
+        pooled = hidden_state[:, 0]
 
+        return pooled.unsqueeze(1)
 
-class SiglipVisionTransformer(nn.Module):
 
+class SiglipVisionTransformer(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -384,6 +701,7 @@ class SiglipVisionTransformer(nn.Module):
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
+            attn_cls=MultiHeadAttention,
         )
 
         num_hidden_layers = config.num_hidden_layers
@@ -398,13 +716,13 @@ class SiglipVisionTransformer(nn.Module):
             require_post_norm = len(self.encoder.layers) == num_hidden_layers
 
         if require_post_norm:
-            self.post_layernorm = nn.LayerNorm(embed_dim,
-                                               eps=config.layer_norm_eps)
+            self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         else:
             self.post_layernorm = None
 
-        self.use_head = (True if not hasattr(config, "vision_use_head") else
-                         config.vision_use_head)
+        self.use_head = (
+            True if not hasattr(config, "vision_use_head") else config.vision_use_head
+        )
         if self.use_head:
             self.head = SiglipMultiheadAttentionPoolingHead(
                 config=config,
@@ -412,38 +730,87 @@ class SiglipVisionTransformer(nn.Module):
                 prefix=f"{prefix}.head",
             )
 
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
     def forward(
         self,
         pixel_values: torch.Tensor,
-        interpolate_pos_encoding: bool = True,
-        feature_sample_layers: Optional[list[int]] = None,
+        *,
+        interpolate_pos_encoding: bool = False,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
-
         hidden_states = self.embeddings(
             pixel_values,
             interpolate_pos_encoding=interpolate_pos_encoding,
         )
-
-        return_all_hidden_states = feature_sample_layers is not None
-
         # Produces either the last layer output or all of the hidden states,
-        # depending on if we have feature_sample_layers or not
+        # depending on if we have select_layers or not
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
-            return_all_hidden_states=return_all_hidden_states,
+            return_all_hidden_states=select_layers is not None,
         )
 
-        # Handle post-norm (if applicable) and stacks feature layers if needed
-        encoder_outputs = resolve_visual_encoder_outputs(
-            encoder_outputs, feature_sample_layers, self.post_layernorm,
-            self.config.num_hidden_layers)
+        if self.post_layernorm is not None:
+            encoder_outputs = self.post_layernorm(encoder_outputs)
+
+        if self.use_head:
+            encoder_outputs = self.head(encoder_outputs)
 
-        # TODO: add this back when pooled_output is used in inference.
-        # if self.use_head:
-        # pooled_output = self.head(encoder_outputs)
+        # stacks feature layers if needed
+        encoder_outputs = resolve_visual_encoder_outputs(
+            encoder_outputs,
+            None,
+            select_layers=select_layers,
+            max_possible_layers=self.config.num_hidden_layers,
+            feature_select_strategy=feature_select_strategy,
+        )
 
         return encoder_outputs
 
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        layer_count = len(self.encoder.layers)
+
+        for name, loaded_weight in weights:
+            # post_layernorm is not needed in SiglipVisionTransformer
+            if name.startswith("post_layernorm") and self.post_layernorm is None:
+                continue
+
+            # omit layers when num_hidden_layers_override is set
+            if name.startswith("encoder.layers"):
+                layer_idx = int(name.split(".")[2])
+                if layer_idx >= layer_count:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class SiglipVisionModel(nn.Module):
     config_class = SiglipVisionConfig
@@ -452,10 +819,10 @@ class SiglipVisionModel(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -471,20 +838,29 @@ class SiglipVisionModel(nn.Module):
     def get_input_embeddings(self) -> nn.Module:
         return self.vision_model.embeddings.patch_embedding
 
+    @property
+    def dtype(self):
+        return self.vision_model.dtype
+
+    @property
+    def device(self):
+        return self.vision_model.device
+
     def forward(
         self,
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
-        feature_sample_layers: Optional[list[int]] = None,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
         return self.vision_model(
             pixel_values=pixel_values,
             interpolate_pos_encoding=interpolate_pos_encoding,
-            feature_sample_layers=feature_sample_layers,
+            select_layers=select_layers,
+            feature_select_strategy=feature_select_strategy,
         )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -497,8 +873,10 @@ class SiglipVisionModel(nn.Module):
 
         for name, loaded_weight in weights:
             # post_layernorm is optional in SiglipVisionModel
-            if (name.startswith("vision_model.post_layernorm")
-                    and self.vision_model.post_layernorm is None):
+            if (
+                name.startswith("vision_model.post_layernorm")
+                and self.vision_model.post_layernorm is None
+            ):
                 continue
 
             # omit layers when num_hidden_layers_override is set
@@ -508,21 +886,21 @@ class SiglipVisionModel(nn.Module):
                     continue
 
             # Check if this is a scale parameter that needs remapping first
-            if name.endswith(
-                (".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
+            if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
                 # Try to remap the scale name first
                 remapped_name = maybe_remap_kv_scale_name(name, params_dict)
                 if remapped_name is not None and remapped_name in params_dict:
                     # Successfully remapped, use the remapped name
                     param = params_dict[remapped_name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
                     loaded_params.add(remapped_name)
                     continue
                 # If remapping failed, continue with normal processing
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -533,8 +911,264 @@ class SiglipVisionModel(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
+
+
+# Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200
+class SiglipTextEmbeddings(nn.Module):
+    def __init__(self, config: SiglipTextConfig):
+        super().__init__()
+        self.config = config
+
+        self.token_embedding = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+
+        self.position_embedding = VocabParallelEmbedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+
+        self.register_buffer(
+            "position_ids",
+            torch.arange(config.max_position_embeddings).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+        return embeddings
+
+
+# Assume EOS token corresponds to CLS token in text model
+@default_pooling_type("CLS")
+@MULTIMODAL_REGISTRY.register_processor(
+    SiglipMultiModalProcessor,
+    info=SiglipProcessingInfo,
+    dummy_inputs=SiglipDummyInputsBuilder,
+)
+class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
+    is_pooling_model = True
+
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
+    merge_by_field_config = True
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return None
+
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: SiglipConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        if hasattr(config, "num_labels"):
+            config.num_labels = 0
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = SiglipTextTransformer(
+            text_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "text_model"),
+        )
+        self.vision_model = SiglipVisionTransformer(
+            vision_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "vision_model"),
+        )
+
+        self.text_projection_size = text_config.projection_size
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        self.pooler_config = pooler_config
+
+        self.pooler = DispatchPooler(
+            {
+                "token_embed": Pooler.for_token_embed(pooler_config),
+                "embed": Pooler.for_embed(pooler_config),
+            }
+        )
+
+        self._is_text_input = True
+
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor | None,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        last_hidden_state = self.text_model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+        )
+        text_features = self.text_model.head(last_hidden_state)
+
+        # SigLIP uses reversed position_ids;
+        # flip sequences to move EOS token to first position
+        text_features = self._flip_sequences_by_position_ids(
+            text_features, position_ids
+        )
+
+        return text_features
+
+    def _flip_sequences_by_position_ids(
+        self,
+        features: torch.Tensor,
+        position_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        """Flip sequences so EOS token moves to first position for CLS pooling.
+
+        SigLIP position_ids are reversed within each sequence. This method detects
+        sequence boundaries and flips each sequence individually.
+        """
+        if len(features) == 1:
+            return features
+
+        # Detect sequence boundaries where position_ids decrease
+        position_diffs = position_ids[1:] - position_ids[:-1]
+        boundary_mask = position_diffs <= 0
+
+        boundary_indices = torch.cat(
+            [
+                torch.tensor([0], device=features.device),
+                torch.where(boundary_mask)[0] + 1,
+                torch.tensor([len(features)], device=features.device),
+            ]
+        )
+
+        # For each sequence [start, end), position i flips to: start + end - 1 - i
+        lengths = boundary_indices[1:] - boundary_indices[:-1]
+        starts = boundary_indices[:-1]
+        ends = boundary_indices[1:]
+
+        # Assign sequence ID to each element
+        sequence_ids = torch.arange(
+            len(lengths), device=features.device
+        ).repeat_interleave(lengths)
+
+        # Calculate flipped indices for all positions at once
+        current_positions = torch.arange(len(features), device=features.device)
+        flip_indices = starts[sequence_ids] + ends[sequence_ids] - 1 - current_positions
+
+        return features[flip_indices]
+
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
+    ) -> torch.Tensor:
+        if feature_select_strategy is None:
+            feature_select_strategy = _get_vision_feature_select_strategy(
+                self.pooler_config.pooling_type
+            )
+
+        pooled_output = self.vision_model(
+            pixel_values=pixel_values,
+            select_layers=None,
+            feature_select_strategy=feature_select_strategy,
+        )
+
+        return pooled_output
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> SiglipImagePixelInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        if pixel_values is None:
+            return None
+
+        expected_h = expected_w = self.config.vision_config.image_size
+        return SiglipImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            resolve_bindings={"h": expected_h, "w": expected_w},
+        )
+
+    def _process_image_inputs(self, inputs: SiglipImagePixelInputs) -> torch.Tensor:
+        pixel_values = inputs["data"]
+
+        return self.get_image_features(pixel_values)
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.text_model
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
+    ) -> torch.Tensor:
+        self._is_text_input = (
+            multimodal_embeddings is None or len(multimodal_embeddings) == 0
+        )
+
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return []
+
+        vision_embeddings = self._process_image_inputs(image_input)
+        return vision_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        if intermediate_tensors is not None:
+            raise RuntimeError("PP is not supported for this model")
+
+        # Multimodal inputs (image embeddings)
+        if not self._is_text_input:
+            return inputs_embeds
+
+        return self.get_text_features(input_ids, positions, inputs_embeds)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        loader = AutoWeightsLoader(
+            self,
+            skip_substrs=[".position_ids"],
+            ignore_unexpected_prefixes=["logit_scale.", "logit_bias."],
+        )
+
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index 7d90d3a7ef1288b442470af3540a09c797feb7ad..29dd164ad37fd2e13e9003d143ea46933e8c67c4 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -4,7 +4,6 @@
 within a vision language model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from einops import rearrange, repeat
@@ -13,38 +12,39 @@ from torch.nn import functional as F
 from transformers import Siglip2VisionConfig
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.attention.layer import check_upstream_fa_availability
-from vllm.config import QuantizationConfig
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import maybe_get_vit_flash_attn_backend
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearBase, QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    LinearBase,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.platforms import _Backend
+from vllm.platforms import current_platform
 
 from .vision import get_vit_attn_backend
 
 
 class VisionRotaryEmbedding(nn.Module):
-
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
-        inv_freq = 1.0 / (theta
-                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
     def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen,
-                           device=self.inv_freq.device,
-                           dtype=self.inv_freq.dtype)
+        seq = torch.arange(
+            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
         freqs = torch.outer(seq, self.inv_freq)
         return freqs
 
 
 class Siglip2VisionEmbeddings(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
@@ -58,15 +58,13 @@ class Siglip2VisionEmbeddings(nn.Module):
         # siglip2 naflex
         if self.num_patches > 0:
             self.patch_embedding = ReplicatedLinear(
-                input_size=config.num_channels * self.patch_size *
-                self.patch_size,
+                input_size=config.num_channels * self.patch_size * self.patch_size,
                 output_size=self.embed_dim,
                 return_bias=False,
             )
             if self.preserve_original_pe:
                 self.position_embedding_size = int(self.num_patches**0.5)
-                self.position_embedding = nn.Embedding(self.num_patches,
-                                                       self.embed_dim)
+                self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
 
         else:
             self.patch_embedding = nn.Conv2d(
@@ -77,15 +75,15 @@ class Siglip2VisionEmbeddings(nn.Module):
                 padding="valid",
             )
             if self.preserve_original_pe:
-                self.num_patches = (self.image_size // self.patch_size)**2
-                self.position_embedding_size = (self.image_size //
-                                                self.patch_size)
-                self.position_embedding = nn.Embedding(self.num_patches,
-                                                       self.embed_dim)
-
-    def forward(self,
-                pixel_values: torch.FloatTensor,
-                grid_thws: Optional[torch.LongTensor] = None) -> torch.Tensor:
+                self.num_patches = (self.image_size // self.patch_size) ** 2
+                self.position_embedding_size = self.image_size // self.patch_size
+                self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        grid_thws: torch.LongTensor | None = None,
+    ) -> torch.Tensor:
         """
         Args:
             pixel_values (`torch.FloatTensor`):
@@ -100,36 +98,48 @@ class Siglip2VisionEmbeddings(nn.Module):
         # Apply patch embeddings to already patchified pixel values
         target_dtype = self.patch_embedding.weight.dtype
         if isinstance(self.patch_embedding, LinearBase):
-            patch_embeds = self.patch_embedding(
-                pixel_values.to(dtype=target_dtype))
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
         elif isinstance(self.patch_embedding, nn.Conv2d):
             pixel_values = pixel_values.view(
-                -1, self.config.num_channels * self.config.temporal_patch_size,
-                self.patch_size, self.patch_size)
-            patch_embeds = self.patch_embedding(
-                pixel_values.to(dtype=target_dtype))
+                -1,
+                self.config.num_channels * self.config.temporal_patch_size,
+                self.patch_size,
+                self.patch_size,
+            )
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
             patch_embeds = patch_embeds.reshape(-1, self.embed_dim)
 
         if self.preserve_original_pe:
             assert grid_thws is not None
             pos_embed_new = torch.zeros_like(patch_embeds)
-            positional_embeddings = self.position_embedding.weight.reshape(
-                self.position_embedding_size, self.position_embedding_size,
-                -1).unsqueeze(0).permute(0, 3, 1, 2)
+            positional_embeddings = (
+                self.position_embedding.weight.reshape(
+                    self.position_embedding_size, self.position_embedding_size, -1
+                )
+                .unsqueeze(0)
+                .permute(0, 3, 1, 2)
+            )
             cnt = 0
             for t, h, w in grid_thws:
                 volume = t * h * w
-                pe = F.interpolate(positional_embeddings,
-                                   size=(h, w),
-                                   mode='bicubic',
-                                   align_corners=False)
+                pe = F.interpolate(
+                    positional_embeddings,
+                    size=(h, w),
+                    mode="bicubic",
+                    align_corners=False,
+                )
                 pe = pe.permute(0, 2, 3, 1).reshape(1, h * w, -1)
                 pe = pe[0].repeat(t, 1)
-                pe = pe.reshape(t, h // self.hidden_stride, self.hidden_stride,
-                                w // self.hidden_stride, self.hidden_stride,
-                                -1)
+                pe = pe.reshape(
+                    t,
+                    h // self.hidden_stride,
+                    self.hidden_stride,
+                    w // self.hidden_stride,
+                    self.hidden_stride,
+                    -1,
+                )
                 pe = pe.permute(0, 1, 3, 2, 4, 5).reshape(volume, -1)
-                pos_embed_new[cnt:cnt + volume] = pe
+                pos_embed_new[cnt : cnt + volume] = pe
                 cnt += volume
             patch_embeds = patch_embeds + pos_embed_new
 
@@ -143,9 +153,9 @@ def rotate_half(x, interleaved=False):
         return torch.cat((-x2, x1), dim=-1)
     else:
         x1, x2 = x[..., ::2], x[..., 1::2]
-        return rearrange(torch.stack((-x2, x1), dim=-1),
-                         "... d two -> ... (d two)",
-                         two=2)
+        return rearrange(
+            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+        )
 
 
 def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
@@ -156,15 +166,15 @@ def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
     ro_dim = cos.shape[-1] * 2
     assert ro_dim <= x.shape[-1]
     cos = repeat(
-        cos,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     sin = repeat(
-        sin,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     return torch.cat(
         [
-            x[..., :ro_dim] * cos +
-            rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
         ],
         dim=-1,
     )
@@ -179,15 +189,14 @@ def apply_rotary_pos_emb(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     cos = cos.chunk(2, dim=-1)[0].contiguous()
     sin = sin.chunk(2, dim=-1)[0].contiguous()
-    if is_flash_attn_backend:
+    if is_flash_attn_backend and not current_platform.is_xpu():
         from flash_attn.layers.rotary import apply_rotary_emb
+
         apply_rotary_emb_func = apply_rotary_emb
     else:
         apply_rotary_emb_func = apply_rotary_emb_torch
-    q_embed = apply_rotary_emb_func(q.float(), cos.float(),
-                                    sin.float()).type_as(q)
-    k_embed = apply_rotary_emb_func(k.float(), cos.float(),
-                                    sin.float()).type_as(k)
+    q_embed = apply_rotary_emb_func(q.float(), cos.float(), sin.float()).type_as(q)
+    k_embed = apply_rotary_emb_func(k.float(), cos.float(), sin.float()).type_as(k)
     return q_embed, k_embed
 
 
@@ -197,9 +206,10 @@ class Siglip2Attention(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
@@ -210,7 +220,8 @@ class Siglip2Attention(nn.Module):
             raise ValueError(
                 f"embed_dim must be divisible by num_heads "
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads}).")
+                f" {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
         self.dropout = config.attention_dropout
         self.is_causal = False
@@ -231,37 +242,45 @@ class Siglip2Attention(nn.Module):
             prefix=f"{prefix}.out_proj",
         )
 
-        self.tp_size = (1 if use_data_parallel else
-                        get_tensor_model_parallel_world_size())
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
         self.use_rope = config.use_rope
 
         # Detect attention implementation.
         self.attn_backend = get_vit_attn_backend(
-            head_size=self.head_dim, dtype=torch.get_default_dtype())
+            head_size=self.head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
         self.use_upstream_fa = False
-        if self.attn_backend != _Backend.FLASH_ATTN and \
-            check_upstream_fa_availability(
-                torch.get_default_dtype()):
-            self.attn_backend = _Backend.FLASH_ATTN
-            self.use_upstream_fa = True
+
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
 
         if self.attn_backend not in {
-                _Backend.FLASH_ATTN, _Backend.TORCH_SDPA,
-                _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }:
-            self.attn_backend = _Backend.TORCH_SDPA
+            self.attn_backend = AttentionBackendEnum.TORCH_SDPA
         self.is_flash_attn_backend = self.attn_backend in {
-            _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
         }
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        position_embeddings: Optional[tuple[torch.Tensor,
-                                            torch.Tensor]] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """Input shape: Batch x Time x Channel"""
 
         seq_length, embed_dim = hidden_states.shape
@@ -269,34 +288,34 @@ class Siglip2Attention(nn.Module):
         qkv_states, _ = self.qkv_proj(hidden_states)
         queries, keys, values = qkv_states.chunk(3, dim=-1)
 
-        queries = queries.view(seq_length, self.num_heads_per_partition,
-                               self.head_dim)
-        keys = keys.view(seq_length, self.num_heads_per_partition,
-                         self.head_dim)
-        values = values.view(seq_length, self.num_heads_per_partition,
-                             self.head_dim)
+        queries = queries.view(seq_length, self.num_heads_per_partition, self.head_dim)
+        keys = keys.view(seq_length, self.num_heads_per_partition, self.head_dim)
+        values = values.view(seq_length, self.num_heads_per_partition, self.head_dim)
 
         if self.use_rope:
             cos, sin = position_embeddings
-            queries, keys = apply_rotary_pos_emb(queries.unsqueeze(0),
-                                                 keys.unsqueeze(0), cos, sin,
-                                                 self.is_flash_attn_backend)
+            queries, keys = apply_rotary_pos_emb(
+                queries.unsqueeze(0),
+                keys.unsqueeze(0),
+                cos,
+                sin,
+                self.is_flash_attn_backend,
+            )
             queries = queries.squeeze(0)
             keys = keys.squeeze(0)
 
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
         if self.is_flash_attn_backend:
-            if self.attn_backend == _Backend.ROCM_AITER_FA:
-                from aiter import flash_attn_varlen_func
-            else:
-                if self.use_upstream_fa:
-                    from flash_attn import flash_attn_varlen_func
-                else:
-                    from vllm.vllm_flash_attn import flash_attn_varlen_func
-            attn_output = flash_attn_varlen_func(
-                queries, keys, values, cu_seqlens, cu_seqlens, max_seqlen,
-                max_seqlen).reshape(seq_length, -1)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+            attn_output = self.flash_attn_varlen_func(
+                queries,
+                keys,
+                values,
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+            ).reshape(seq_length, -1)
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
             batch_size = cu_seqlens.shape[0] - 1
             outputs = []
@@ -314,13 +333,9 @@ class Siglip2Attention(nn.Module):
                 # (1, num_heads, seq_len, head_dim)
                 q_i, k_i, v_i = [x.transpose(1, 2) for x in (q_i, k_i, v_i)]
 
-                output_i = F.scaled_dot_product_attention(q_i,
-                                                          k_i,
-                                                          v_i,
-                                                          dropout_p=0.0)
+                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
                 # (1, num_heads, seq_len, head_dim) -> (seq_len, embed_dim)
-                output_i = output_i.transpose(1, 2).reshape(
-                    end_idx - start_idx, -1)
+                output_i = output_i.transpose(1, 2).reshape(end_idx - start_idx, -1)
                 outputs.append(output_i)
 
             attn_output = torch.cat(outputs, dim=0)
@@ -329,11 +344,10 @@ class Siglip2Attention(nn.Module):
 
 
 class Siglip2MLP(nn.Module):
-
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -363,31 +377,38 @@ class Siglip2MLP(nn.Module):
 
 
 class Siglip2EncoderLayer(nn.Module):
-
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.embed_dim = config.hidden_size
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
-        self.self_attn = Siglip2Attention(config,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.self_attn",
-                                          use_data_parallel=use_data_parallel)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
-        self.mlp = Siglip2MLP(config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.mlp",
-                              use_data_parallel=use_data_parallel)
-
-    def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
-                position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]:
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = Siglip2Attention(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Siglip2MLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        position_embeddings: torch.Tensor,
+    ) -> tuple[torch.FloatTensor]:
         """
         Args:
             hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
@@ -397,9 +418,11 @@ class Siglip2EncoderLayer(nn.Module):
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(hidden_states=hidden_states,
-                                       cu_seqlens=cu_seqlens,
-                                       position_embeddings=position_embeddings)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=position_embeddings,
+        )
         hidden_states = residual + hidden_states
 
         residual = hidden_states
@@ -411,7 +434,7 @@ class Siglip2EncoderLayer(nn.Module):
 
 class Siglip2Encoder(nn.Module):
     """
-    Transformer encoder consisting of `config.num_hidden_layers` 
+    Transformer encoder consisting of `config.num_hidden_layers`
     self attention layers. Each layer is a [`Siglip2EncoderLayer`].
 
     Args:
@@ -421,22 +444,29 @@ class Siglip2Encoder(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
-        self.layers = nn.ModuleList([
-            Siglip2EncoderLayer(config,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.layers.{idx}",
-                                use_data_parallel=use_data_parallel)
-            for idx in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Siglip2EncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{idx}",
+                    use_data_parallel=use_data_parallel,
+                    attn_backend_override=attn_backend_override,
+                )
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
 
         self.rotary_pos_emb = VisionRotaryEmbedding(
-            config.hidden_size // config.num_attention_heads // 2)
+            config.hidden_size // config.num_attention_heads // 2
+        )
         self.patch_size = config.patch_size
         self.hidden_stride = config.hidden_stride
         self.window_size = config.window_size
@@ -445,7 +475,7 @@ class Siglip2Encoder(nn.Module):
             self.fullatt_block_indexes = None
         else:
             self.fullatt_block_indexes = [
-                int(i) for i in config.fullatt_block_indexes.split('|')
+                int(i) for i in config.fullatt_block_indexes.split("|")
             ]
 
     # copied from qwen2.5_vl
@@ -471,8 +501,7 @@ class Siglip2Encoder(nn.Module):
             )
             wpos_ids = wpos_ids.permute(0, 2, 1, 3)
             wpos_ids = wpos_ids.flatten()
-            pos_ids.append(
-                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
         pos_ids = torch.cat(pos_ids, dim=0)
         max_grid_size = grid_thw[:, 1:].max()
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
@@ -484,8 +513,9 @@ class Siglip2Encoder(nn.Module):
         cu_window_seqlens: list = [0]
         window_index_id = 0
         # patch (after merge) number in each window
-        vit_merger_window_size = (self.window_size // self.hidden_stride //
-                                  self.patch_size)
+        vit_merger_window_size = (
+            self.window_size // self.hidden_stride // self.patch_size
+        )
 
         for grid_t, grid_h, grid_w in grid_thw:
             llm_grid_h, llm_grid_w = (
@@ -493,7 +523,8 @@ class Siglip2Encoder(nn.Module):
                 grid_w // self.hidden_stride,
             )
             index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
-                grid_t, llm_grid_h, llm_grid_w)
+                grid_t, llm_grid_h, llm_grid_w
+            )
             pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
             pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
             num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
@@ -516,8 +547,9 @@ class Siglip2Encoder(nn.Module):
             index_padded = index_padded.reshape(-1)
             index_new = index_padded[index_padded != -100]
             window_index.append(index_new + window_index_id)
-            cu_seqlens_tmp = seqlens.cumsum(
-                0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            cu_seqlens_tmp = (
+                seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            )
             cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
             window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
         window_index = torch.cat(window_index, dim=0)
@@ -531,10 +563,10 @@ class Siglip2Encoder(nn.Module):
     ) -> torch.Tensor:
         r"""
         Args:
-            inputs_embeds: Input tensor of shape 
+            inputs_embeds: Input tensor of shape
                 (batch_size, sequence_length, hidden_size).
                 Embedded representation of the input tokens.
-            grid_thws: Grid tensor of shape (num_patches, 3) 
+            grid_thws: Grid tensor of shape (num_patches, 3)
                 containing grid dimensions.
                 Whether or not to return a [`~utils.ModelOutput`] instead of
                 a plain tuple.
@@ -550,11 +582,13 @@ class Siglip2Encoder(nn.Module):
 
         seq_len, _ = inputs_embeds.size()
         inputs_embeds = inputs_embeds.reshape(
-            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
         inputs_embeds = inputs_embeds[window_index, :, :]
         inputs_embeds = inputs_embeds.reshape(seq_len, -1)
         rotary_pos_emb = rotary_pos_emb.reshape(
-            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
         rotary_pos_emb = rotary_pos_emb[window_index, :, :]
         rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
@@ -572,47 +606,48 @@ class Siglip2Encoder(nn.Module):
             # for more information
             dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
         )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
 
         reverse_indices = torch.argsort(window_index)
 
         hidden_states = inputs_embeds
         for index, block in enumerate(self.layers):
-            if (not self.fullatt_block_indexes
-                    or index in self.fullatt_block_indexes):
+            if not self.fullatt_block_indexes or index in self.fullatt_block_indexes:
                 cu_seqlens_tmp = cu_seqlens
             else:
                 cu_seqlens_tmp = cu_window_seqlens
-            hidden_states = block(hidden_states, cu_seqlens_tmp,
-                                  position_embeddings)
+            hidden_states = block(hidden_states, cu_seqlens_tmp, position_embeddings)
 
         hidden_states = hidden_states.reshape(
-            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
         hidden_states = hidden_states[reverse_indices, :].reshape(seq_len, -1)
 
         return hidden_states
 
 
 class Siglip2VisionTransformer(nn.Module):
-
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
         self.config = config
         embed_dim = config.hidden_size
 
         self.embeddings = Siglip2VisionEmbeddings(config)
-        self.encoder = Siglip2Encoder(config,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.encoder",
-                                      use_data_parallel=use_data_parallel)
-        self.post_layernorm = nn.LayerNorm(embed_dim,
-                                           eps=config.layer_norm_eps)
+        self.encoder = Siglip2Encoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+            use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
+        )
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
     def forward(
         self,
@@ -633,13 +668,13 @@ class Siglip2VisionTransformer(nn.Module):
 
 
 class Siglip2NavitModel(torch.nn.Module):
-
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
+        attn_backend_override: AttentionBackendEnum | None = None,
     ):
         super().__init__()
 
@@ -647,7 +682,9 @@ class Siglip2NavitModel(torch.nn.Module):
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.vision_model",
-            use_data_parallel=use_data_parallel)
+            use_data_parallel=use_data_parallel,
+            attn_backend_override=attn_backend_override,
+        )
 
     def forward(
         self,
@@ -659,8 +696,7 @@ class Siglip2NavitModel(torch.nn.Module):
             grid_thws=grid_thws,
         )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -671,7 +707,7 @@ class Siglip2NavitModel(torch.nn.Module):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -682,8 +718,7 @@ class Siglip2NavitModel(torch.nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 893ce4497c319d116b14cf10ecb3fa3f0c2919d8..d825eb3a1c1341f6ce2eefec165dc4b5dccd48f8 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -8,41 +8,54 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
 from PIL import Image
-from transformers import BatchEncoding, PretrainedConfig, TensorType
+from transformers import BatchFeature, PretrainedConfig, TensorType
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
-from vllm.model_executor.models.intern_vit import (InternVisionModel,
-                                                   InternVisionPatchModel)
+from vllm.model_executor.models.intern_vit import (
+    InternVisionModel,
+    InternVisionPatchModel,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
-IMG_START = '<img>'
-IMG_END = '</img>'
-IMG_CONTEXT = '<IMG_CONTEXT>'
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<IMG_CONTEXT>"
 
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -57,6 +70,7 @@ class SkyworkR1VImagePixelInputs(TensorSchema):
         - w: Width
         - bn: Batch size * number of images
     """
+
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values_flat: Annotated[
@@ -75,31 +89,36 @@ class SkyworkR1VImageEmbeddingInputs(TensorSchema):
     Dimensions:
         - ni: Number of images
         - ifs: Image feature size
-        - hs: Hidden size (must match the hidden size of language model 
+        - hs: Hidden size (must match the hidden size of language model
           backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("ni", "ifs", "hs"),
     ]
 
 
-SkyworkR1VImageInputs = Union[SkyworkR1VImagePixelInputs,
-                              SkyworkR1VImageEmbeddingInputs]
+SkyworkR1VImageInputs: TypeAlias = (
+    SkyworkR1VImagePixelInputs | SkyworkR1VImageEmbeddingInputs
+)
 
 
 # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
 def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose([
-        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
-        T.Resize((input_size, input_size),
-                 interpolation=T.InterpolationMode.BICUBIC),
-        T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
-    ])
+    return T.Compose(
+        [
+            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
+            T.Resize(
+                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
+            ),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD),
+        ]
+    )
 
 
 # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
@@ -111,7 +130,7 @@ def find_closest_aspect_ratio(
     height: int,
     image_size: int,
 ) -> tuple[int, int]:
-    best_ratio_diff = float('inf')
+    best_ratio_diff = float("inf")
     best_ratio = (1, 1)
     area = width * height
     for ratio in target_ratios:
@@ -146,10 +165,13 @@ def get_skyworkr1v_target_ratios(
     min_num: int,
     max_num: int,
 ) -> list[tuple[int, int]]:
-    target_ratios = {(i, j)
-                     for n in range(min_num, max_num + 1)
-                     for i in range(1, n + 1)
-                     for j in range(1, n + 1) if min_num <= i * j <= max_num}
+    target_ratios = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if min_num <= i * j <= max_num
+    }
     return sorted(target_ratios, key=lambda x: x[0] * x[1])
 
 
@@ -206,10 +228,12 @@ def dynamic_preprocess_skyworkr1v(
     resized_img = image.resize((target_width, target_height))
     processed_images = []
     for i in range(blocks):
-        box = ((i % (target_width // image_size)) * image_size,
-               (i // (target_width // image_size)) * image_size,
-               ((i % (target_width // image_size)) + 1) * image_size,
-               ((i // (target_width // image_size)) + 1) * image_size)
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
         # split the image
         split_img = resized_img.crop(box)
         processed_images.append(split_img)
@@ -260,9 +284,9 @@ class SkyworkR1VProcessor:
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> None:
         super().__init__()
 
@@ -285,7 +309,8 @@ class SkyworkR1VProcessor:
         assert isinstance(dynamic_image_size, bool)
 
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.image_size = image_size
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
@@ -299,7 +324,7 @@ class SkyworkR1VProcessor:
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -309,19 +334,23 @@ class SkyworkR1VProcessor:
     def resolve_min_max_num(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> tuple[int, int]:
-        min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
-                             is None else min_dynamic_patch)
-        max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
-                             is None else max_dynamic_patch)
-        dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
-                              is None else dynamic_image_size)
-        use_thumbnail = (self.use_thumbnail
-                         if use_thumbnail is None else use_thumbnail)
+        min_dynamic_patch = (
+            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
+        )
+        max_dynamic_patch = (
+            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
+        )
+        dynamic_image_size = (
+            self.dynamic_image_size
+            if dynamic_image_size is None
+            else dynamic_image_size
+        )
+        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
 
         return resolve_skyworkr1v_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -333,10 +362,10 @@ class SkyworkR1VProcessor:
     def resolve_target_ratios(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> list[tuple[int, int]]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -370,9 +399,9 @@ class SkyworkR1VProcessor:
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -388,18 +417,19 @@ class SkyworkR1VProcessor:
                 min_num=min_num,
                 max_num=max_num,
                 use_thumbnail=self.use_thumbnail,
-            ) for image in images
+            )
+            for image in images
         ]
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-    ) -> Mapping[str, NestedTensors]:
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+    ) -> BatchFeature:
         if text is None:
             text = []
         if not isinstance(text, list):
@@ -418,11 +448,11 @@ class SkyworkR1VProcessor:
                 max_dynamic_patch=max_dynamic_patch,
                 dynamic_image_size=dynamic_image_size,
             )
-            image_inputs: dict[str, NestedTensors] = {
-                "pixel_values_flat":
-                torch.cat(pixel_values_lst),
-                "image_num_patches":
-                torch.tensor([len(item) for item in pixel_values_lst]),
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst]
+                ),
             }
 
             for pixel_values in pixel_values_lst:
@@ -431,18 +461,16 @@ class SkyworkR1VProcessor:
 
                 image_repl = self.get_image_repl(feature_size, num_patches)
 
-                text = [t.replace('<image>', image_repl.full, 1) for t in text]
+                text = [t.replace("<image>", image_repl.full, 1) for t in text]
 
         text_inputs = self.tokenizer(text)
 
-        return {
-            **BatchEncoding(text_inputs, tensor_type=return_tensors),
-            **image_inputs,
-        }
+        combined_outputs = {**text_inputs, **image_inputs}
 
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
 
-class SkyworkR1VProcessingInfo(BaseProcessingInfo):
 
+class SkyworkR1VProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
         return self.ctx.init_processor(
             SkyworkR1VProcessor,
@@ -451,7 +479,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
             **kwargs,
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -459,7 +487,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[SkyworkR1VProcessor],
+        processor: SkyworkR1VProcessor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -486,8 +514,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
+                largest_feature_pinpoint = ImageSize(width=width, height=height)
 
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
@@ -495,9 +522,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
         return largest_feature_pinpoint
 
 
-class SkyworkR1VDummyInputsBuilder(
-        BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]):
-
+class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
 
@@ -507,29 +532,31 @@ class SkyworkR1VDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class SkyworkR1VMultiModalProcessor(
-        BaseMultiModalProcessor[SkyworkR1VProcessingInfo]):
-
+class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessingInfo]):
     def _call_hf_processor(
         self,
         prompt: str,
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
-    ) -> Mapping[str, NestedTensors]:
+    ) -> BatchFeature:
         processed_outputs = super()._call_hf_processor(
             prompt=prompt,
             mm_data=mm_data,
@@ -549,7 +576,7 @@ class SkyworkR1VMultiModalProcessor(
 
     def _get_mm_fields_config(
         self,
-        hf_inputs: Mapping[str, NestedTensors],
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
@@ -557,7 +584,8 @@ class SkyworkR1VMultiModalProcessor(
 
         return dict(
             pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_num_patches),
+                "image", image_num_patches
+            ),
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
             image_token_id=MultiModalFieldConfig.shared("image", num_images),
@@ -585,7 +613,8 @@ class SkyworkR1VMultiModalProcessor(
 
         def get_replacement_skyworkr1v(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
@@ -615,11 +644,13 @@ class SkyworkR1VMultiModalProcessor(
 @MULTIMODAL_REGISTRY.register_processor(
     SkyworkR1VMultiModalProcessor,
     info=SkyworkR1VProcessingInfo,
-    dummy_inputs=SkyworkR1VDummyInputsBuilder)
+    dummy_inputs=SkyworkR1VDummyInputsBuilder,
+)
 class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -640,12 +671,13 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
         self.num_image_token = int(
-            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
         self.downsample_ratio = config.downsample_ratio
         self.ps_version = config.ps_version
 
         self.llm_arch_name = config.text_config.architectures[0]
-        self.is_mono = self.llm_arch_name == 'SkyworkLM2VEForCausalLM'
+        self.is_mono = self.llm_arch_name == "SkyworkLM2VEForCausalLM"
         self.vision_model = self._init_vision_model(
             config,
             quant_config=quant_config,
@@ -659,29 +691,33 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
             prefix=maybe_prefix(prefix, "language_model"),
         )
 
-        self.mlp1 = self._init_mlp1(config)
+        self.mlp1 = self._init_mlp1(
+            config, quant_config, prefix=maybe_prefix(prefix, "mlp1")
+        )
 
         self.img_context_token_id = None
         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
-    def _patch_quant_config(self, config: PretrainedConfig,
-                            quant_config: QuantizationConfig):
+    def _patch_quant_config(
+        self, config: PretrainedConfig, quant_config: QuantizationConfig
+    ):
         # the awq models from OpenGVLab missing `modules_to_not_convert`
         # patch the quant_config to add `modules_to_not_convert` back
         if isinstance(quant_config, AWQConfig):
             text_config = config.text_config
-            llm_quant_config = getattr(text_config, "quantization_config",
-                                       None)
-            if (not quant_config.modules_to_not_convert) and \
-                (llm_quant_config is not None):
+            llm_quant_config = getattr(text_config, "quantization_config", None)
+            if (not quant_config.modules_to_not_convert) and (
+                llm_quant_config is not None
+            ):
                 quant_config.modules_to_not_convert.append("vision_model")
 
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
@@ -689,8 +725,9 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         if not is_mono:
             vision_feature_layer = config.select_layer
             if vision_feature_layer < 0:
-                num_hidden_layers = config.vision_config.num_hidden_layers \
-                    + vision_feature_layer + 1
+                num_hidden_layers = (
+                    config.vision_config.num_hidden_layers + vision_feature_layer + 1
+                )
             else:
                 num_hidden_layers = vision_feature_layer + 1
 
@@ -703,20 +740,32 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         else:
             return InternVisionPatchModel(config.vision_config)
 
-    def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
+    def _init_mlp1(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig,
+        prefix: str = "",
+    ) -> nn.Module:
         vit_hidden_size = config.vision_config.hidden_size
         llm_hidden_size = config.text_config.hidden_size
 
         return nn.Sequential(
-            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2),
-            ReplicatedLinear(vit_hidden_size *
-                             int(1 / self.downsample_ratio)**2,
-                             llm_hidden_size,
-                             return_bias=False),
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            ReplicatedLinear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2,
+                llm_hidden_size,
+                return_bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.1",
+            ),
             nn.GELU(),
-            ReplicatedLinear(llm_hidden_size,
-                             llm_hidden_size,
-                             return_bias=False),
+            ReplicatedLinear(
+                llm_hidden_size,
+                llm_hidden_size,
+                return_bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.3",
+            ),
         )
 
     def pixel_shuffle(self, x, scale_factor=0.5):
@@ -725,9 +774,13 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
         # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
         x = x.permute(0, 2, 1, 3).contiguous()
-        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
-                   int(c / (scale_factor * scale_factor)))
-        if self.ps_version == 'v1':
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
             pass
         else:
             x = x.permute(0, 2, 1, 3).contiguous()
@@ -737,17 +790,16 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         vit_embeds = self.vision_model(pixel_values=pixel_values)
         vit_embeds = vit_embeds[:, 1:, :]
 
-        h = w = int(vit_embeds.shape[1]**0.5)
+        h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds,
-                                        scale_factor=self.downsample_ratio)
-        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
-                                        vit_embeds.shape[-1])
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[SkyworkR1VImageInputs]:
+        self, **kwargs: object
+    ) -> SkyworkR1VImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -756,31 +808,19 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
             return None
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return SkyworkR1VImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         image_token_id = kwargs["image_token_id"]
-        assert isinstance(image_token_id, torch.Tensor)
-        self.img_context_token_id = image_token_id.flatten().unique().item()
-
-        if pixel_values_flat is not None:
-            if not isinstance(pixel_values_flat, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values_flat)}")
+        if isinstance(image_token_id, torch.Tensor):
+            image_token_id = image_token_id.flatten().unique().item()
 
-            if not isinstance(image_num_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(image_num_patches)}")
-
-            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
-            image_num_patches = flatten_bn(image_num_patches, concat=True)
+        assert isinstance(image_token_id, int)
+        self.img_context_token_id = image_token_id
 
+        if pixel_values_flat is not None:
             return SkyworkR1VImagePixelInputs(
                 type="pixel_values",
                 pixel_values_flat=pixel_values_flat,
@@ -788,14 +828,15 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
                 resolve_bindings={
                     "h": self.config.vision_config.image_size,
                     "w": self.config.vision_config.image_size,
-                })
+                },
+            )
 
         raise AssertionError("This line should be unreachable.")
 
     def _process_image_input(
         self,
         image_input: SkyworkR1VImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -807,14 +848,14 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return image_embeds.view(
-                -1, self.config.text_config.hidden_size).unsqueeze(0)
+            return image_embeds.view(-1, self.config.text_config.hidden_size).unsqueeze(
+                0
+            )
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
         feature_size = image_embeds.shape[1]
-        image_embeds = image_embeds.view(-1,
-                                         self.config.text_config.hidden_size)
+        image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
         image_feature_sizes = [
             num_patches * feature_size for num_patches in num_patches
         ]
@@ -822,61 +863,56 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
         if self.is_mono:
-            self.visual_token_mask = (
-                input_ids == self.img_context_token_id).reshape(-1, 1)
+            self.visual_token_mask = (input_ids == self.img_context_token_id).reshape(
+                -1, 1
+            )
         else:
             self.visual_token_mask = None
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
 
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            assert self.img_context_token_id is not None
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
             self._set_visual_token_mask(input_ids)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.img_context_token_id,
-            )
-        return inputs_embeds
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
-
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
-            input_ids = None
-
         forward_kwargs = {
             "input_ids": input_ids,
             "positions": positions,
@@ -886,8 +922,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
 
         # Only required if the model is mono-architecture
         if self.visual_token_mask is not None:
-            forward_kwargs.update(
-                {"visual_token_mask": self.visual_token_mask})
+            forward_kwargs.update({"visual_token_mask": self.visual_token_mask})
             self.visual_token_mask = None
 
         hidden_states = self.language_model.model(**forward_kwargs)
@@ -896,16 +931,23 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = [
-            "action_embed", "temporal_embed", "track_embed",
-            "track_embed_decoder", "box_token", "cg_criterion", "cg_model",
-            "loc_encoder", "loc_decoder", "sam", "temporal_token",
-            "track_token"
+            "action_embed",
+            "temporal_embed",
+            "track_embed",
+            "track_embed_decoder",
+            "box_token",
+            "cg_criterion",
+            "cg_model",
+            "loc_encoder",
+            "loc_decoder",
+            "sam",
+            "temporal_token",
+            "track_token",
         ]
         loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py
index 2adfad67152b3dfb6310cdbc84ce28c4a3db118c..e8b805297d963e010625a3569c379e8a0637bbc5 100644
--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -1,29 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from transformers import SmolVLMProcessor
 
 from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-# yapf: disable
 from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
-from .idefics3 import Idefics3ForConditionalGeneration
+from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
 from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
-from .idefics3 import Idefics3ProcessingInfo
-
-# yapf: enable
 
 
 class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
         return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs)
 
-    def _get_image_token(
-            self, processor: Optional[SmolVLMProcessor]) -> tuple[str, str]:
+    def _get_image_token(self, processor: SmolVLMProcessor | None) -> tuple[str, str]:
         if processor is None:
             processor = self.get_hf_processor()
         image_token = processor.image_token
@@ -32,11 +25,12 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
         return image_token, fake_image_token, global_image_token
 
 
-@MULTIMODAL_REGISTRY.register_processor(SmolVLMMultiModalProcessor,
-                                        info=SmolVLMProcessingInfo,
-                                        dummy_inputs=SmolVLMDummyInputsBuilder)
+@MULTIMODAL_REGISTRY.register_processor(
+    SmolVLMMultiModalProcessor,
+    info=SmolVLMProcessingInfo,
+    dummy_inputs=SmolVLMDummyInputsBuilder,
+)
 class SmolVLMForConditionalGeneration(Idefics3ForConditionalGeneration):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(
             vllm_config=vllm_config,
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index c774171b9dcd2f799dfa795542deef162f026d46..4ec855f7944466cdf87b432386d97cbfe37deee6 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -25,7 +25,7 @@
 """Inference-only Solar model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -37,32 +37,42 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class SolarMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -82,8 +92,9 @@ class SolarMLP(nn.Module):
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -94,7 +105,6 @@ class SolarMLP(nn.Module):
 
 
 class SolarAttention(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -102,11 +112,11 @@ class SolarAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -183,12 +193,11 @@ class SolarAttention(nn.Module):
 
 
 class SolarDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -197,21 +206,24 @@ class SolarDecoderLayer(nn.Module):
         rope_scaling = getattr(config, "rope_scaling", None)
 
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
-            rope_scaling["original_max_position_embeddings"] \
-                = config.original_max_position_embeddings
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False)
+            config, "bias", False
+        )
         self.self_attn = SolarAttention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(config, "num_key_value_heads",
-                                 config.num_attention_heads),
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -228,59 +240,54 @@ class SolarDecoderLayer(nn.Module):
             bias=getattr(config, "mlp_bias", False),
             prefix=f"{prefix}.mlp",
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 @support_torch_compile
 class SolarModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
             )
         else:
             self.embed_tokens = PPMissingLayer()
@@ -299,25 +306,25 @@ class SolarModel(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -328,8 +335,7 @@ class SolarModel(nn.Module):
         bskcn_h_2 = None
         bskcn_r_1 = None
         bskcn_r_2 = None
-        bskcn_tv = (self.config.bskcn_tv[0]
-                    if self.training else self.config.bskcn_tv[1])
+        bskcn_tv = self.config.bskcn_tv[0] if self.training else self.config.bskcn_tv[1]
 
         for i in range(self.start_layer, self.end_layer):
             if i in self.config.bskcn_1:
@@ -339,12 +345,10 @@ class SolarModel(nn.Module):
                 bskcn_h_2 = hidden_states.clone()
                 bskcn_r_2 = residual.clone()
             if i in self.config.bskcn_3:
-                hidden_states = bskcn_h_1 * bskcn_tv + hidden_states * (
-                    1 - bskcn_tv)
+                hidden_states = bskcn_h_1 * bskcn_tv + hidden_states * (1 - bskcn_tv)
                 residual = bskcn_r_1 * bskcn_tv + residual * (1 - bskcn_tv)
             if i in self.config.bskcn_4:
-                hidden_states = bskcn_h_2 * bskcn_tv + hidden_states * (
-                    1 - bskcn_tv)
+                hidden_states = bskcn_h_2 * bskcn_tv + hidden_states * (1 - bskcn_tv)
                 residual = bskcn_r_2 * bskcn_tv + residual * (1 - bskcn_tv)
             layer = self.layers[i]
             hidden_states, residual = layer(
@@ -354,16 +358,14 @@ class SolarModel(nn.Module):
             )
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", "q"),
@@ -375,14 +377,15 @@ class SolarModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if (self.quant_config is not None and
-                (scale_name := self.quant_config.get_cache_scale(name))):
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
                 # Loading kv cache quantization scales
                 param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
-                                 loaded_weight[0])
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
@@ -415,8 +418,7 @@ class SolarModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -446,9 +448,9 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
-        self.lora_config = lora_config
+
         self.quant_config = quant_config
 
         self.model = SolarModel(
@@ -456,17 +458,9 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             prefix=maybe_prefix(prefix, "model"),
         )
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -474,31 +468,35 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 self.lm_head.weight = self.model.embed_tokens.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                config.vocab_size, scale=logit_scale
+            )
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        model_output = self.model(input_ids, positions, intermediate_tensors,
-                                  inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return model_output
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index e4dfe8d5a9a3bbc83c4661e4f72e4396243101ff..06eb7201c1a897c0c1c7954e6c7ece4a3ea16691 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -21,9 +21,9 @@
 # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
 """Inference-only StabeLM (https://github.com/Stability-AI/StableLM)
 model compatible with HuggingFace weights."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -33,43 +33,56 @@ from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class StablelmMLP(nn.Module):
-
-    def __init__(self,
-                 config: StableLmConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: StableLmConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
         self.gate_up_proj = MergedColumnParallelLinear(
-            config.hidden_size, [config.intermediate_size] * 2,
+            config.hidden_size,
+            [config.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(config.intermediate_size,
-                                           config.hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.down_proj",
+        )
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -80,12 +93,13 @@ class StablelmMLP(nn.Module):
 
 
 class StablelmAttention(nn.Module):
-
-    def __init__(self,
-                 config: StableLmConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: StableLmConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -102,33 +116,39 @@ class StablelmAttention(nn.Module):
             # Number of KV heads is less than TP size, so we replicate
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_key_value_heads == 0
-        self.num_key_value_heads = max(
-            1, self.total_num_key_value_heads // tp_size)
+        self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
         self.head_dim = self.hidden_size // self.total_num_heads
         self.max_position_embeddings = config.max_position_embeddings
         self.partial_rotary_factor = getattr(
-            config, "rope_pct", getattr(config, "partial_rotary_factor", 1))
+            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
+        )
         self.scaling = self.head_dim**-0.5
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_key_value_heads * self.head_dim
         self.qkv_bias = getattr(config, "use_qkv_bias", False)
         if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
-            raise ValueError(f"hidden_size must be divisible by num_heads "
-                             f"(got `hidden_size`: {self.hidden_size}"
-                             f" and `num_heads`: {self.num_heads}).")
-
-        self.qkv_proj = QKVParallelLinear(self.hidden_size,
-                                          self.head_dim,
-                                          self.total_num_heads,
-                                          self.total_num_key_value_heads,
-                                          self.qkv_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.qkv_proj")
-        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
-                                        self.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.o_proj")
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads "
+                f"(got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_key_value_heads,
+            self.qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
@@ -136,13 +156,15 @@ class StablelmAttention(nn.Module):
             base=self.config.rope_theta,
             partial_rotary_factor=self.partial_rotary_factor,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_key_value_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_key_value_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -158,25 +180,21 @@ class StablelmAttention(nn.Module):
 
 
 class StablelmDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: StableLmConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
-        self.self_attn = StablelmAttention(config,
-                                           cache_config,
-                                           quant_config,
-                                           prefix=f"{prefix}.self_attn")
+        self.self_attn = StablelmAttention(
+            config, cache_config, quant_config, prefix=f"{prefix}.self_attn"
+        )
         self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp")
-        norm_eps = getattr(config, "norm_eps",
-                           getattr(config, "layer_norm_eps", 1e-05))
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
 
     def forward(
         self,
@@ -202,7 +220,6 @@ class StablelmDecoderLayer(nn.Module):
 
 
 class StableLMEpochModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -219,31 +236,31 @@ class StableLMEpochModel(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: StablelmDecoderLayer(
-                config, cache_config, quant_config, prefix=prefix),
+                config, cache_config, quant_config, prefix=prefix
+            ),
             prefix=f"{prefix}.layers",
         )
-        norm_eps = getattr(config, "norm_eps",
-                           getattr(config, "layer_norm_eps", 1e-05))
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
         self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -254,8 +271,7 @@ class StableLMEpochModel(nn.Module):
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -267,7 +283,7 @@ class StableLMEpochModel(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -287,55 +303,57 @@ class StableLMEpochModel(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class StablelmForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = StableLMEpochModel(vllm_config=vllm_config,
-                                        prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.lm_head")
+        self.model = StableLMEpochModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.lm_head",
+        )
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 7f379ab95a03da54ba97dbc97ff191afde2be363..0f2942acd50067e679158efb344d030167a08ee2 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -19,10 +19,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Starcoder2 model."""
+"""PyTorch Starcoder2 model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -33,31 +33,42 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    AutoWeightsLoader,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 
 class Starcoder2Attention(nn.Module):
-
-    def __init__(self,
-                 config: Starcoder2Config,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.config = config
 
@@ -107,13 +118,15 @@ class Starcoder2Attention(nn.Module):
             base=int(self.rope_theta),
             is_neox_style=True,
         )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
 
     def forward(
         self,
@@ -129,11 +142,12 @@ class Starcoder2Attention(nn.Module):
 
 
 class Starcoder2MLP(nn.Module):
-
-    def __init__(self,
-                 config: Starcoder2Config,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.c_fc = ColumnParallelLinear(
             config.hidden_size,
@@ -159,25 +173,28 @@ class Starcoder2MLP(nn.Module):
 
 
 class Starcoder2DecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: Starcoder2Config,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = Starcoder2Attention(config,
-                                             cache_config,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.self_attn")
-        self.mlp = Starcoder2MLP(config,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.mlp")
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.norm_epsilon)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=config.norm_epsilon)
+        self.self_attn = Starcoder2Attention(
+            config,
+            cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = Starcoder2MLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.norm_epsilon
+        )
 
     def forward(
         self,
@@ -204,7 +221,6 @@ class Starcoder2DecoderLayer(nn.Module):
 
 @support_torch_compile
 class Starcoder2Model(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -219,7 +235,8 @@ class Starcoder2Model(nn.Module):
             config.vocab_size,
             config.hidden_size,
             quant_config=quant_config,
-            prefix=f"{prefix}.embed_tokens")
+            prefix=f"{prefix}.embed_tokens",
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Starcoder2DecoderLayer(
@@ -228,25 +245,25 @@ class Starcoder2Model(nn.Module):
             prefix=f"{prefix}.layers",
         )
         self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
@@ -257,8 +274,7 @@ class Starcoder2Model(nn.Module):
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -269,7 +285,7 @@ class Starcoder2Model(nn.Module):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -286,69 +302,66 @@ class Starcoder2Model(nn.Module):
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class Starcoder2ForCausalLM(nn.Module, SupportsPP):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
-        self.model = Starcoder2Model(vllm_config=vllm_config,
-                                     prefix=maybe_prefix(prefix, "model"))
+        self.model = Starcoder2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
         self.vocab_size = config.vocab_size
-        self.unpadded_vocab_size = config.vocab_size
+
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
-            self.unpadded_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config=quant_config,
                 prefix=f"{prefix}.lm_head",
             )
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
+            self.model.make_empty_intermediate_tensors
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
             # Models trained using ColossalAI may include these tensors in
             # the checkpoint. Skip them.
-            skip_prefixes=(["lm_head.weight"]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(
+                ["lm_head.weight"] if self.config.tie_word_embeddings else None
+            ),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py
index 0cce0c78f8dc66d97b64d242ff16e0574ddac5e0..4fff356b29e28c755410c29098619fff927dc213 100644
--- a/vllm/model_executor/models/step3_text.py
+++ b/vllm/model_executor/models/step3_text.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Jurassic model."""
+
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -11,61 +12,76 @@ from torch import nn
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
-from vllm.distributed import (get_pp_group,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+from .utils import (
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
 
 class FusedMoEBlock(nn.Module):
-
-    def __init__(self,
-                 config: ModelConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: ModelConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
 
         if self.tp_size > config.moe_num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.moe_num_experts}.")
-
-        self.experts = FusedMoE(num_experts=config.moe_num_experts,
-                                top_k=config.moe_top_k,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_expert_weight,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts")
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.moe_num_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+                f"the number of experts {config.moe_num_experts}."
+            )
+
+        self.experts = FusedMoE(
+            num_experts=config.moe_num_experts,
+            top_k=config.moe_top_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_expert_weight,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.moe_num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         orig_shape = hidden_states.shape
@@ -74,39 +90,43 @@ class FusedMoEBlock(nn.Module):
 
         router_logits, _ = self.gate(hidden_states)
 
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(orig_shape)
 
 
 class Step3TextMLP(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.down_proj",
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
         self.hidden_size = hidden_size
 
@@ -118,7 +138,6 @@ class Step3TextMLP(nn.Module):
 
 
 class Step3TextAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -126,12 +145,12 @@ class Step3TextAttention(nn.Module):
         num_kv_heads: int,
         norm_eps: float,
         rope_theta: int,
-        share_q_dim: Optional[int] = None,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        share_q_dim: int | None = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embedding: int = 8192,
         head_dim: int = 256,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -143,8 +162,9 @@ class Step3TextAttention(nn.Module):
         self.num_heads = self.total_num_heads // tp_size
 
         if num_kv_heads != 1:
-            raise ValueError(f"Step3TextAttention num_kv_heads must be 1, "
-                             f"but got {num_kv_heads}.")
+            raise ValueError(
+                f"Step3TextAttention num_kv_heads must be 1, but got {num_kv_heads}."
+            )
         self.num_kv_heads = num_kv_heads
 
         self.head_dim = head_dim
@@ -174,21 +194,26 @@ class Step3TextAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.wq",
         )
-        self.rotary_emb = get_rope(self.head_dim,
-                                   rotary_dim=self.head_dim,
-                                   max_position=max_position_embedding,
-                                   base=rope_theta,
-                                   rope_scaling=rope_scaling)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embedding,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
         scaling = self.head_dim**-0.5
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              scaling,
-                              self.num_kv_heads,
-                              cache_config=cache_config,
-                              prefix=f"{prefix}.attn")
-
-    def forward(self, positions: torch.Tensor,
-                hidden_states: torch.Tensor) -> torch.Tensor:
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            scaling,
+            self.num_kv_heads,
+            cache_config=cache_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self, positions: torch.Tensor, hidden_states: torch.Tensor
+    ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q = self.inter_norm(q)
@@ -200,12 +225,13 @@ class Step3TextAttention(nn.Module):
 
 
 class Step3TextDecoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: ModelConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: ModelConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         config = config.hf_config
         self.hidden_size = config.hidden_size
@@ -223,59 +249,61 @@ class Step3TextDecoderLayer(nn.Module):
             share_q_dim=config.share_q_dim,
             rope_theta=config.rope_theta,
             rope_scaling=rope_scaling,
-            prefix=f"{prefix}.self_attn")
+            prefix=f"{prefix}.self_attn",
+        )
 
         layer_idx = int(prefix.split("layers.")[1].split(".")[0])
         moe_layers_enum = getattr(config, "moe_layers_enum", None)
         if moe_layers_enum is not None:
-            moe_layers_idx = [
-                int(i) for i in moe_layers_enum.strip().split(',')
-            ]
+            moe_layers_idx = [int(i) for i in moe_layers_enum.strip().split(",")]
         else:
             # Default to 1dense.
             moe_layers_idx = [i for i in range(1, config.num_hidden_layers)]
 
         if layer_idx in moe_layers_idx:
-            self.moe = FusedMoEBlock(config=config,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.moe")
+            self.moe = FusedMoEBlock(
+                config=config, quant_config=quant_config, prefix=f"{prefix}.moe"
+            )
             self.share_expert = Step3TextMLP(
                 hidden_size=self.hidden_size,
                 intermediate_size=config.share_expert_dim,
                 hidden_act="silu",
                 quant_config=quant_config,
-                prefix=f"{prefix}.share_expert")
+                prefix=f"{prefix}.share_expert",
+            )
             self.use_moe = True
         else:
-            self.mlp = Step3TextMLP(hidden_size=config.hidden_size,
-                                    intermediate_size=config.intermediate_size,
-                                    hidden_act="silu",
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.mlp")
+            self.mlp = Step3TextMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act="silu",
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
             self.use_moe = False
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
-            self, positions: torch.Tensor, hidden_states: torch.Tensor,
-            residual: Optional[torch.Tensor]
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
         if self.use_moe:
             share_output = self.share_expert(hidden_states)
@@ -289,7 +317,6 @@ class Step3TextDecoderLayer(nn.Module):
 
 @support_torch_compile
 class Step3TextModel(nn.Module):
-
     def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -298,8 +325,9 @@ class Step3TextModel(nn.Module):
         self.vocab_size = config.vocab_size
         self.config = config
 
-        if get_pp_group().is_first_rank or (config.tie_word_embeddings
-                                            and get_pp_group().is_last_rank):
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
@@ -309,11 +337,12 @@ class Step3TextModel(nn.Module):
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: Step3TextDecoderLayer(config=vllm_config.
-                                                 model_config,
-                                                 cache_config=cache_config,
-                                                 quant_config=quant_config,
-                                                 prefix=prefix),
+            lambda prefix: Step3TextDecoderLayer(
+                config=vllm_config.model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
             prefix=f"{prefix}.layers",
         )
         if get_pp_group().is_last_rank:
@@ -321,25 +350,25 @@ class Step3TextModel(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.hidden_size
+        )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_input_ids(input_ids)
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -350,17 +379,18 @@ class Step3TextModel(nn.Module):
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual,
-            })
+            return IntermediateTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
 
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
 
 class Step3TextForCausalLM(nn.Module, SupportsPP):
-
     def __init__(
         self,
         *,
@@ -369,60 +399,71 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
     ):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
         self.vllm_config = vllm_config
 
         self.model = Step3TextModel(vllm_config=vllm_config, prefix=prefix)
 
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                if not lora_config else lora_config.lora_vocab_padding_size,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    config.vocab_size)
+            self.logits_processor = LogitsProcessor(config.vocab_size)
         else:
             self.lm_head = PPMissingLayer()
 
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[IntermediateTensors] = None,
-                inputs_embeds: Optional[torch.Tensor] = None):
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         qkv_params_mapping = [
             # (param_name, shard_name, relative_start_idx, relative_end_idx)
-            (".qkv_proj", ".q_proj", 0, self.config.share_q_dim /
-             (self.config.share_q_dim + self.config.head_dim * 2)),
-            (".qkv_proj", ".k_proj", self.config.share_q_dim /
-             (self.config.share_q_dim + self.config.head_dim * 2),
-             (self.config.share_q_dim + self.config.head_dim) /
-             (self.config.share_q_dim + self.config.head_dim * 2)),
-            (".qkv_proj", ".v_proj",
-             (self.config.share_q_dim + self.config.head_dim) /
-             (self.config.share_q_dim + self.config.head_dim * 2),
-             (self.config.share_q_dim + self.config.head_dim * 2) /
-             (self.config.share_q_dim + self.config.head_dim * 2)),
+            (
+                ".qkv_proj",
+                ".q_proj",
+                0,
+                self.config.share_q_dim
+                / (self.config.share_q_dim + self.config.head_dim * 2),
+            ),
+            (
+                ".qkv_proj",
+                ".k_proj",
+                self.config.share_q_dim
+                / (self.config.share_q_dim + self.config.head_dim * 2),
+                (self.config.share_q_dim + self.config.head_dim)
+                / (self.config.share_q_dim + self.config.head_dim * 2),
+            ),
+            (
+                ".qkv_proj",
+                ".v_proj",
+                (self.config.share_q_dim + self.config.head_dim)
+                / (self.config.share_q_dim + self.config.head_dim * 2),
+                (self.config.share_q_dim + self.config.head_dim * 2)
+                / (self.config.share_q_dim + self.config.head_dim * 2),
+            ),
         ]
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
@@ -435,20 +476,19 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
         expert_params_mapping = [
             (".moe.experts.w13_weight", ".moe.gate_proj.weight", "w1"),
             (".moe.experts.w13_weight", ".moe.up_proj.weight", "w3"),
-            (".moe.experts.w2_weight", ".moe.down_proj.weight", "w2")
+            (".moe.experts.w2_weight", ".moe.down_proj.weight", "w2"),
         ]
 
-        disable_moe_stacked_params = [
-            data[1] for data in expert_params_mapping
-        ]
+        disable_moe_stacked_params = [data[1] for data in expert_params_mapping]
 
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                if any(disable_moe_stacked_param in name
-                       for disable_moe_stacked_param in
-                       disable_moe_stacked_params):
+                if any(
+                    disable_moe_stacked_param in name
+                    for disable_moe_stacked_param in disable_moe_stacked_params
+                ):
                     continue
                 name = name.replace(weight_name, param_name)
                 if is_pp_missing_parameter(name, self):
@@ -468,23 +508,30 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
                     if is_pp_missing_parameter(name, self):
                         continue
                     # Skip loading extra bias for GPTQ models.
-                    if ((name.endswith(".bias") or name.endswith("_bias"))
-                            and name not in params_dict):
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     for expert_id in range(loaded_weight.shape[0]):
                         loaded_weight_expert = loaded_weight[expert_id]
-                        weight_loader(param,
-                                      loaded_weight_expert,
-                                      name,
-                                      shard_id=shard_id,
-                                      expert_id=expert_id)
+                        weight_loader(
+                            param,
+                            loaded_weight_expert,
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
                     loaded_params.add(name)
                     break
                 else:
-                    for (param_name, weight_name, start_idx,
-                         end_idx) in qkv_params_mapping:
+                    for (
+                        param_name,
+                        weight_name,
+                        start_idx,
+                        end_idx,
+                    ) in qkv_params_mapping:
                         if weight_name not in name:
                             continue
                         name = name.replace(weight_name, param_name)
@@ -494,8 +541,9 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
                         dim = param.shape[param.output_dim]
                         begin_idx = int(start_idx * dim)
                         end_idx = int(end_idx * dim)
-                        param_slice = param.narrow(param.output_dim, begin_idx,
-                                                   end_idx - begin_idx)
+                        param_slice = param.narrow(
+                            param.output_dim, begin_idx, end_idx - begin_idx
+                        )
                         param_slice.copy_(loaded_weight)
                         loaded_params.add(name)
                         break
@@ -503,8 +551,9 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
                         if is_pp_missing_parameter(name, self):
                             continue
                         param = params_dict[name]
-                        weight_loader = getattr(param, "weight_loader",
-                                                default_weight_loader)
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
                         weight_loader(param, loaded_weight)
                         loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 5f6ad58850439a9441c08d98fee713c121760cbf..5d16be1eb3128961648f31cb2cc5f9fb24b94f04 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -4,7 +4,7 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from itertools import product
 from math import ceil, sqrt
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -17,45 +17,78 @@ from transformers import BatchFeature, PretrainedConfig, TensorType
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Step3VisionEncoderConfig
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from .vision import run_dp_sharded_vision_model
 
 
-class Step3VLImagePixelInputs(TypedDict):
+class Step3VLImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height
+        - w: Width
+        - bnp: Batch size * number of images * number of patches
+        - hp: Height of patch
+        - wp: Width of patch
+    """
+
     type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    patch_pixel_values: Optional[torch.Tensor]
-    num_patches: list[int]
+    pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
+    patch_pixel_values: Annotated[
+        torch.Tensor | None, TensorShape("bnp", 3, "hp", "wp")
+    ]
+    num_patches: Annotated[torch.Tensor, TensorShape("bn")]
+
 
+class Step3VLImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - f: Image feature size
+        - h: Hidden size (must match the hidden size of language model backbone)
+    """
 
-class Step3VLImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds"]
-    image_embeds: torch.Tensor
+    type: Literal["image_embeds"] = "image_embeds"
+    data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
-Step3VLImageInputs = Union[Step3VLImagePixelInputs,
-                           Step3VLImageEmbeddingInputs]
+Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs
 
 ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
 
@@ -63,31 +96,42 @@ MAX_IMAGE_SIZE: int = 3024
 
 
 class Step3VisionProcessor:
-
     def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
         mean = [0.48145466, 0.4578275, 0.40821073]
         std = [0.26862954, 0.26130258, 0.27577711]
         patch_size = patch_size if patch_size is not None else size
 
-        self.transform = transforms.Compose([
-            transforms.ToTensor(),
-            transforms.Normalize(mean, std),
-            transforms.Resize(
-                (size, size),
-                interpolation=InterpolationMode.BICUBIC if interpolation_mode
-                == "bicubic" else InterpolationMode.BILINEAR,
-                antialias=True),
-        ])
-
-        self.patch_transform = transforms.Compose([
-            transforms.ToTensor(),
-            transforms.Normalize(mean, std),
-            transforms.Resize(
-                (patch_size, patch_size),
-                interpolation=InterpolationMode.BICUBIC if interpolation_mode
-                == "bicubic" else InterpolationMode.BILINEAR,
-                antialias=True),
-        ]) if patch_size is not None else None
+        self.transform = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize(mean, std),
+                transforms.Resize(
+                    (size, size),
+                    interpolation=InterpolationMode.BICUBIC
+                    if interpolation_mode == "bicubic"
+                    else InterpolationMode.BILINEAR,
+                    antialias=True,
+                ),
+            ]
+        )
+
+        self.patch_transform = (
+            transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Normalize(mean, std),
+                    transforms.Resize(
+                        (patch_size, patch_size),
+                        interpolation=InterpolationMode.BICUBIC
+                        if interpolation_mode == "bicubic"
+                        else InterpolationMode.BILINEAR,
+                        antialias=True,
+                    ),
+                ]
+            )
+            if patch_size is not None
+            else None
+        )
 
     def __call__(self, image, is_patch=False):
         if is_patch:
@@ -97,7 +141,6 @@ class Step3VisionProcessor:
 
 
 class ImagePatcher:
-
     def determine_window_size(self, long: int, short: int) -> int:
         if long <= 728:
             return short if long / short > 1.5 else 0
@@ -118,14 +161,12 @@ class ImagePatcher:
             size_w, size_h = size
             step_w, step_h = step
 
-            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w +
-                                                   1)
+            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
             x_start = [step_w * i for i in range(x_num)]
             if len(x_start) > 1 and x_start[-1] + size_w > width:
                 x_start[-1] = width - size_w
 
-            y_num = 1 if height <= size_h else ceil((height - size_h) /
-                                                    step_h + 1)
+            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
             y_start = [step_h * i for i in range(y_num)]
             if len(y_start) > 1 and y_start[-1] + size_h > height:
                 y_start[-1] = height - size_h
@@ -135,8 +176,10 @@ class ImagePatcher:
             windows.append(np.concatenate([start, start + size], axis=1))
         windows = np.concatenate(windows, axis=0)
 
-        return [(int(box[0]), int(box[1]), int(box[2] - box[0]),
-                 int(box[3] - box[1])) for box in windows], (x_num, y_num)
+        return [
+            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+            for box in windows
+        ], (x_num, y_num)
 
     def square_pad(self, img: Image.Image) -> Image.Image:
         w, h = img.size
@@ -147,25 +190,27 @@ class ImagePatcher:
         padded.paste(img, (0, 0))
         return padded
 
-    def get_image_size_for_padding(self, img_width: int,
-                                   img_height: int) -> tuple[int, int]:
+    def get_image_size_for_padding(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
         ratio = img_width / img_height
         if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
             new_size = max(img_height, img_width)
             return new_size, new_size
         return img_width, img_height
 
-    def get_image_size_for_preprocess(self, img_width: int,
-                                      img_height: int) -> tuple[int, int]:
-
+    def get_image_size_for_preprocess(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
         if max(img_height, img_width) > MAX_IMAGE_SIZE:
             scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
             img_width = int(img_width * scale_factor)
             img_height = int(img_height * scale_factor)
         return img_width, img_height
 
-    def get_image_size_for_crop(self, img_width: int, img_height: int,
-                                window_size: int):
+    def get_image_size_for_crop(
+        self, img_width: int, img_height: int, window_size: int
+    ):
         w_ratio = img_width / window_size
         h_ratio = img_height / window_size
 
@@ -187,22 +232,26 @@ class ImagePatcher:
         target = img.crop((j, i, j + tw, i + th))
         return target
 
-    def get_num_patches(self, img_width: int,
-                        img_height: int) -> tuple[int, int]:
-        img_width, img_height = self.get_image_size_for_padding(
-            img_width, img_height)
+    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
         img_width, img_height = self.get_image_size_for_preprocess(
-            img_width, img_height)
-        window_size = self.determine_window_size(max(img_height, img_width),
-                                                 min(img_height, img_width))
+            img_width, img_height
+        )
+        window_size = self.determine_window_size(
+            max(img_height, img_width), min(img_height, img_width)
+        )
         if window_size == 0:
             return 0, 0
         else:
             img_width, img_height = self.get_image_size_for_crop(
-                img_width, img_height, window_size)
+                img_width, img_height, window_size
+            )
             center_list, (x_num, y_num) = self.slide_window(
-                img_width, img_height, [(window_size, window_size)],
-                [(window_size, window_size)])
+                img_width,
+                img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
             full_rows = (len(center_list) - 1) // x_num + 1
             if len(center_list) > 0 and len(center_list) % x_num == 0:
                 full_rows -= 1
@@ -213,39 +262,44 @@ class ImagePatcher:
     ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
         img_width, img_height = img.size
         new_img_width, new_img_height = self.get_image_size_for_padding(
-            img_width, img_height)
+            img_width, img_height
+        )
         if new_img_width != img_width or new_img_height != img_height:
             img = self.square_pad(img)
             img_width, img_height = img.size
 
         new_img_width, new_img_height = self.get_image_size_for_preprocess(
-            img_width, img_height)
-        img = img.resize((new_img_width, new_img_height),
-                         Image.Resampling.BILINEAR)
+            img_width, img_height
+        )
+        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
         window_size = self.determine_window_size(
-            max(new_img_height, new_img_width),
-            min(new_img_height, new_img_width))
+            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+        )
 
         if window_size == 0:
             return img, [], None
         else:
             new_img_width, new_img_height = self.get_image_size_for_crop(
-                new_img_width, new_img_height, window_size)
+                new_img_width, new_img_height, window_size
+            )
             if (new_img_width, new_img_height) != (img_width, img_height):
-                img_for_crop = img.resize((new_img_width, new_img_height),
-                                          Image.Resampling.BILINEAR)
+                img_for_crop = img.resize(
+                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                )
             else:
                 img_for_crop = img
 
             patches = []
             newlines = []
             center_list, (x_num, y_num) = self.slide_window(
-                new_img_width, new_img_height, [(window_size, window_size)],
-                [(window_size, window_size)])
+                new_img_width,
+                new_img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
             for patch_id, center_lf_point in enumerate(center_list):
                 x, y, patch_w, patch_h = center_lf_point
-                big_patch = self.patch_crop(img_for_crop, y, x, patch_h,
-                                            patch_w)
+                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
                 patches.append(big_patch)
                 if (patch_id + 1) % x_num == 0:
                     newlines.append(patch_id)
@@ -253,12 +307,16 @@ class ImagePatcher:
             if newlines and newlines[-1] == len(patches) - 1:
                 newlines.pop()
 
-            return img, patches, [i in newlines for i in range(len(patches))
-                                  ] if len(patches) > 0 else None
+            return (
+                img,
+                patches,
+                [i in newlines for i in range(len(patches))]
+                if len(patches) > 0
+                else None,
+            )
 
 
 class Step3VLProcessor:
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -271,17 +329,15 @@ class Step3VLProcessor:
 
         self.image_size = 728
         self.patch_size = 504
-        self.image_preprocessor = Step3VisionProcessor(self.image_size,
-                                                       "bilinear",
-                                                       self.patch_size)
+        self.image_preprocessor = Step3VisionProcessor(
+            self.image_size, "bilinear", self.patch_size
+        )
 
         self.num_image_feature_size = 169
         self.num_patch_feature_size = 81
         self.image_token = "<im_patch>"
-        self.image_feature_placeholder = (self.image_token *
-                                          self.num_image_feature_size)
-        self.patch_feature_placeholder = (self.image_token *
-                                          self.num_patch_feature_size)
+        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
+        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
 
         self.patcher = ImagePatcher()
 
@@ -290,15 +346,16 @@ class Step3VLProcessor:
         return self.tokenizer.get_vocab()[self.image_token]
 
     def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
-        num_patches, num_newlines = self.patcher.get_num_patches(
-            img_width, img_height)
+        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
 
-        return num_patches * (
-            self.num_patch_feature_size +
-            2) + self.num_image_feature_size + 2 + num_newlines
+        return (
+            num_patches * (self.num_patch_feature_size + 2)
+            + self.num_image_feature_size
+            + 2
+            + num_newlines
+        )
 
-    def _split_images(self,
-                      images: list[Image.Image]) -> list[ImageWithPatches]:
+    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
         result = []
         for img in images:
             result.append(self.patcher(img))
@@ -325,13 +382,15 @@ class Step3VLProcessor:
             assert len(patch_newline_mask) == num_patches
             text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
             token_ids.extend(
-                [self.tokenizer.convert_tokens_to_ids("<patch_start>")] +
-                [self.image_token_id] * self.num_patch_feature_size +
-                [self.tokenizer.convert_tokens_to_ids("<patch_end>")])
+                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+                + [self.image_token_id] * self.num_patch_feature_size
+                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
+            )
             if patch_newline_mask and patch_newline_mask[i]:
                 text += "<patch_newline>"
                 token_ids.append(
-                    self.tokenizer.convert_tokens_to_ids("<patch_newline>"))
+                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
+                )
         return text, token_ids
 
     def _get_image_repl(
@@ -339,30 +398,30 @@ class Step3VLProcessor:
         num_images: int,
     ) -> tuple[str, list[int]]:
         text = f"<im_start>{self.image_feature_placeholder}<im_end>"
-        token_ids = [
-            self.tokenizer.convert_tokens_to_ids("<im_start>")
-        ] + [self.image_token_id] * self.num_image_feature_size + [
-            self.tokenizer.convert_tokens_to_ids("<im_end>")
-        ]
+        token_ids = (
+            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
+            + [self.image_token_id] * self.num_image_feature_size
+            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
+        )
         return text * num_images, token_ids * num_images
 
     def _get_image_repl_features(
         self,
         num_images: int,
         num_patches: int,
-        patch_new_line_idx: Optional[list[bool]],
+        patch_new_line_idx: list[bool] | None,
     ) -> tuple[str, list[int]]:
         if num_patches > 0:
             patch_repl, patch_repl_ids = self._get_patch_repl(
-                num_patches, patch_new_line_idx)
+                num_patches, patch_new_line_idx
+            )
         else:
             patch_repl = ""
             patch_repl_ids = []
         image_repl, image_repl_ids = self._get_image_repl(num_images)
         return patch_repl + image_repl, patch_repl_ids + image_repl_ids
 
-    def replace_placeholder(self, text: str, placeholder: str,
-                            repls: list[str]) -> str:
+    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
         parts = text.split(placeholder)
 
         if len(parts) - 1 != len(repls):
@@ -379,9 +438,9 @@ class Step3VLProcessor:
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -404,17 +463,17 @@ class Step3VLProcessor:
             image_repl_ids_lst = []
             num_patches = []
             for raw_img, img_patches, patch_newline_mask in splitted_images_data:  # noqa: E501
-                pixel_values_lst.extend(
-                    self._convert_images_to_pixel_values([raw_img]))
+                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
 
                 if len(img_patches) > 0:
                     patch_pixel_values_lst.extend(
-                        self._convert_images_to_pixel_values(img_patches,
-                                                             is_patch=True))
+                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
+                    )
                 num_patches.append(len(img_patches))
 
                 image_repl_str, image_repl_ids = self._get_image_repl_features(
-                    1, len(img_patches), patch_newline_mask)
+                    1, len(img_patches), patch_newline_mask
+                )
                 image_repl_str_lst.append(image_repl_str)
                 image_repl_ids_lst.extend(image_repl_ids)
 
@@ -426,15 +485,15 @@ class Step3VLProcessor:
                 "num_patches": num_patches,
             }
             if patch_pixel_values_lst:
-                image_inputs["patch_pixel_values"] = torch.cat(
-                    patch_pixel_values_lst)
+                image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst)
             if patch_newline_mask_lst:
                 image_inputs["patch_newline_mask"] = torch.tensor(
-                    patch_newline_mask_lst, dtype=torch.bool)
+                    patch_newline_mask_lst, dtype=torch.bool
+                )
 
             text = [
-                self.replace_placeholder(t, self.image_token,
-                                         image_repl_str_lst) for t in text
+                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
+                for t in text
             ]
             text_inputs = self.tokenizer(text)
 
@@ -448,21 +507,21 @@ class Step3VLProcessor:
 
 
 class Step3VLProcessingInfo(BaseProcessingInfo):
-
     def get_hf_processor(self) -> Step3VLProcessor:
         return Step3VLProcessor(
             self.get_hf_config(),
             self.get_tokenizer(),
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_max_image_tokens(self) -> int:
         hf_processor = self.get_hf_processor()
         return hf_processor.get_num_image_tokens(
             self.get_image_size_with_most_features().width,
-            self.get_image_size_with_most_features().height)
+            self.get_image_size_with_most_features().height,
+        )
 
     def get_mm_max_tokens_per_item(
         self,
@@ -476,19 +535,19 @@ class Step3VLProcessingInfo(BaseProcessingInfo):
 
     def get_num_mm_tokens(self, mm_data: MultiModalDataDict) -> int:
         if len(mm_data) != 1 or "image" not in mm_data:
-            raise ValueError(
-                "mm_data could only contain one key 'image' for steo1o")
+            raise ValueError("mm_data could only contain one key 'image' for steo1o")
 
         image_data = mm_data["image"]
         if not isinstance(image_data, (list, tuple)):
             image_data = [image_data]
 
-        return sum(self.get_hf_processor().get_num_image_tokens(
-            img.width, img.height) for img in image_data)
+        return sum(
+            self.get_hf_processor().get_num_image_tokens(img.width, img.height)
+            for img in image_data
+        )
 
 
 class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
         return "<im_patch>" * num_images
@@ -497,22 +556,24 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 
 
-class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]
-                                 ):
-
+class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]):
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -528,10 +589,10 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]
             if num_patches > 0:
                 patch_newline_mask = out_item["patch_newline_mask"].data
                 image_repl_ids = hf_processor._get_image_repl_features(
-                    1, num_patches, patch_newline_mask.tolist())[1]
+                    1, num_patches, patch_newline_mask.tolist()
+                )[1]
             else:
-                image_repl_ids = hf_processor._get_image_repl_features(
-                    1, 0, None)[1]
+                image_repl_ids = hf_processor._get_image_repl_features(1, 0, None)[1]
             return PromptUpdateDetails.select_token_id(
                 seq=image_repl_ids,
                 embed_token_id=image_placeholder_token_id,
@@ -555,10 +616,12 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]
         return dict(
             pixel_values=MultiModalFieldConfig.batched("image"),
             patch_pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_patches),
+                "image", num_patches
+            ),
             num_patches=MultiModalFieldConfig.batched("image"),
             patch_newline_mask=MultiModalFieldConfig.flat_from_sizes(
-                "image", num_patches),
+                "image", num_patches
+            ),
         )
 
 
@@ -572,29 +635,29 @@ def get_abs_pos(abs_pos, tgt_size):
     dtype = abs_pos.dtype
 
     if src_size != tgt_size:
-        old_pos_embed = old_pos_embed.view(1, src_size, src_size,
-                                           dim).permute(0, 3, 1,
-                                                        2).contiguous()
+        old_pos_embed = (
+            old_pos_embed.view(1, src_size, src_size, dim)
+            .permute(0, 3, 1, 2)
+            .contiguous()
+        )
         old_pos_embed = old_pos_embed.to(torch.float32)
         new_pos_embed = F.interpolate(
             old_pos_embed,
             size=(tgt_size, tgt_size),
-            mode='bicubic',
+            mode="bicubic",
             antialias=True,
             align_corners=False,
         ).to(dtype)
         new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
         new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)
         vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)
-        vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1,
-                                                 dim)
+        vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)
         return vision_pos_embed
     else:
         return abs_pos
 
 
 class Step3VisionEmbeddings(nn.Module):
-
     def __init__(self, config: Step3VisionEncoderConfig):
         super().__init__()
         self.config = config
@@ -612,43 +675,51 @@ class Step3VisionEmbeddings(nn.Module):
             bias=True,
         )
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.pad_tp_size = 4  # hard code for padding
         # To load the pretrained weights, we still use P+1 as the seqlen
-        self.position_embedding = torch.nn.Embedding(self.num_patches + 1,
-                                                     self.embed_dim)
-        self.register_buffer("position_ids",
-                             torch.arange(self.num_patches + 1).expand(
-                                 (1, -1)),
-                             persistent=False)
+        self.position_embedding = torch.nn.Embedding(
+            self.num_patches + 1, self.embed_dim
+        )
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_patches + 1).expand((1, -1)),
+            persistent=False,
+        )
 
     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         patch_embeds = self.patch_embedding(
-            pixel_values)  # shape = [*, width, grid, grid]
+            pixel_values
+        )  # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
 
         # pad
         class_embeds = self.class_embedding.expand(batch_size, 1, -1)
         embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
         embeddings = embeddings + get_abs_pos(
-            self.position_embedding(self.position_ids), patch_embeds.size(1))
-        embeddings = torch.cat([
-            embeddings[:, 0, :].unsqueeze(1).repeat(1, self.pad_tp_size - 1,
-                                                    1), embeddings
-        ],
-                               dim=1)
+            self.position_embedding(self.position_ids), patch_embeds.size(1)
+        )
+        embeddings = torch.cat(
+            [
+                embeddings[:, 0, :].unsqueeze(1).repeat(1, self.pad_tp_size - 1, 1),
+                embeddings,
+            ],
+            dim=1,
+        )
         return embeddings
 
 
 class Step3VisionAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self,
-                 config,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         self.config = config
         self.embed_dim = config.hidden_size
@@ -657,8 +728,7 @@ class Step3VisionAttention(nn.Module):
 
         self.scale = self.head_dim**-0.5
 
-        tp_size = (1 if use_data_parallel else
-                   get_tensor_model_parallel_world_size())
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
 
@@ -673,16 +743,17 @@ class Step3VisionAttention(nn.Module):
             prefix=f"{prefix}.qkv_proj",
             disable_tp=use_data_parallel,
         )
-        self.out_proj = RowParallelLinear(self.embed_dim,
-                                          self.embed_dim,
-                                          bias=True,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.out_proj",
-                                          disable_tp=use_data_parallel)
+        self.out_proj = RowParallelLinear(
+            self.embed_dim,
+            self.embed_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+            disable_tp=use_data_parallel,
+        )
 
         # Use unified MultiHeadAttention with automatic backend selection
-        self.attn = MultiHeadAttention(self.num_heads, self.head_dim,
-                                       self.scale)
+        self.attn = MultiHeadAttention(self.num_heads, self.head_dim, self.scale)
 
     def forward(
         self,
@@ -704,27 +775,32 @@ class Step3VisionAttention(nn.Module):
 
 
 class Step3VisionMLP(nn.Module):
-
-    def __init__(self,
-                 config,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1",
-                                        disable_tp=use_data_parallel)
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2",
-                                     disable_tp=use_data_parallel)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -734,12 +810,13 @@ class Step3VisionMLP(nn.Module):
 
 
 class Step3VisionEncoderLayer(nn.Module):
-
-    def __init__(self,
-                 config: Step3VisionEncoderConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        config: Step3VisionEncoderConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         self.use_data_parallel = use_data_parallel
         self.embed_dim = config.hidden_size
@@ -747,44 +824,48 @@ class Step3VisionEncoderLayer(nn.Module):
             config,
             quant_config,
             prefix=f"{prefix}.self_attn",
-            use_data_parallel=self.use_data_parallel)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
-        self.mlp = Step3VisionMLP(config,
-                                  quant_config,
-                                  prefix=f"{prefix}.mlp",
-                                  use_data_parallel=self.use_data_parallel)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim,
-                                        eps=config.layer_norm_eps)
+            use_data_parallel=self.use_data_parallel,
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Step3VisionMLP(
+            config,
+            quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=self.use_data_parallel,
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
     ) -> torch.FloatTensor:
-        hidden_states = hidden_states + self.layer_norm1(
-            self.self_attn(hidden_states))
-        hidden_states = hidden_states + self.layer_norm2(
-            self.mlp(hidden_states))
+        hidden_states = hidden_states + self.layer_norm1(self.self_attn(hidden_states))
+        hidden_states = hidden_states + self.layer_norm2(self.mlp(hidden_states))
         return hidden_states
 
 
 class Step3VisionEncoder(nn.Module):
-
-    def __init__(self,
-                 config: Step3VisionEncoderConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        config: Step3VisionEncoderConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         self.config = config
         self.use_data_parallel = use_data_parallel
-        self.layers = nn.ModuleList([
-            Step3VisionEncoderLayer(config,
-                                    quant_config,
-                                    prefix=f"{prefix}.layers.{i}",
-                                    use_data_parallel=self.use_data_parallel)
-            for i in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Step3VisionEncoderLayer(
+                    config,
+                    quant_config,
+                    prefix=f"{prefix}.layers.{i}",
+                    use_data_parallel=self.use_data_parallel,
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
 
     def forward(
         self,
@@ -797,12 +878,13 @@ class Step3VisionEncoder(nn.Module):
 
 
 class Step3VisionTransformer(nn.Module):
-
-    def __init__(self,
-                 config: Step3VisionEncoderConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 use_data_parallel: bool = False):
+    def __init__(
+        self,
+        config: Step3VisionEncoderConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         self.config = config
         self.use_data_parallel = use_data_parallel
@@ -812,7 +894,8 @@ class Step3VisionTransformer(nn.Module):
             config,
             quant_config,
             prefix=f"{prefix}.transformer",
-            use_data_parallel=self.use_data_parallel)
+            use_data_parallel=self.use_data_parallel,
+        )
 
     def forward(
         self,
@@ -820,28 +903,31 @@ class Step3VisionTransformer(nn.Module):
     ):
         hidden_states = self.embeddings(pixel_values)
         if self.use_data_parallel:
-            hidden_states = run_dp_sharded_vision_model(
-                hidden_states, self.transformer)
+            hidden_states = run_dp_sharded_vision_model(hidden_states, self.transformer)
         else:
             hidden_states = self.transformer(inputs_embeds=hidden_states)
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(Step3VLMultiModalProcessor,
-                                        info=Step3VLProcessingInfo,
-                                        dummy_inputs=Step3VLDummyInputsBuilder)
-class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsPP):
+@MULTIMODAL_REGISTRY.register_processor(
+    Step3VLMultiModalProcessor,
+    info=Step3VLProcessingInfo,
+    dummy_inputs=Step3VLDummyInputsBuilder,
+)
+class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
 
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "model.": "language_model.model.",
-        "lm_head.": "language_model.lm_head.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.": "language_model.model.",
+            "lm_head.": "language_model.lm_head.",
+        }
+    )
 
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<im_patch>"
 
@@ -862,12 +948,14 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                 config.vision_config,
                 None,
                 prefix=maybe_prefix(prefix, "vision_model"),
-                use_data_parallel=self.use_data_parallel)
+                use_data_parallel=self.use_data_parallel,
+            )
             self.vit_downsampler = nn.Conv2d(
                 config.vision_config.hidden_size,
                 config.vision_config.output_hidden_size,
                 kernel_size=2,
-                stride=config.understand_projector_stride)
+                stride=config.understand_projector_stride,
+            )
             self.vit_downsampler2 = nn.Conv2d(
                 config.vision_config.output_hidden_size,
                 config.vision_config.output_hidden_size * 2,
@@ -889,10 +977,12 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
-            prefix=maybe_prefix(prefix, "language_model"))
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     @property
     def device(self):
@@ -903,7 +993,8 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         return next(self.parameters()).dtype
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Step3VLImageInputs]:
+        self, **kwargs: object
+    ) -> Step3VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         patch_pixel_values = kwargs.pop("patch_pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
@@ -913,42 +1004,24 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            pixel_values = flatten_bn(pixel_values, concat=True)
-            if pixel_values.dim() >= 3:
-                pixel_values = pixel_values.view(-1, *pixel_values.shape[-3:])
-            if patch_pixel_values is not None:
-                patch_pixel_values = flatten_bn(patch_pixel_values,
-                                                concat=True)
-                patch_pixel_values = patch_pixel_values.view(
-                    -1, *patch_pixel_values.shape[-3:])
-                # Handle empty patch_pixel_values by setting to None
-                if patch_pixel_values.shape[0] == 0:
-                    patch_pixel_values = None
-            num_patches = flatten_bn(num_patches, concat=True).tolist()
-
             return Step3VLImagePixelInputs(
                 type="pixel_values",
-                pixel_values=pixel_values.to(self.dtype).to(self.device),
-                patch_pixel_values=patch_pixel_values.to(self.dtype).to(
-                    self.device) if patch_pixel_values is not None else None,
+                pixel_values=pixel_values.to(self.dtype),
+                patch_pixel_values=patch_pixel_values.to(self.dtype)
+                if patch_pixel_values is not None
+                else None,
                 num_patches=num_patches,
             )
 
         if image_embeds is not None:
-            if image_embeds.dim() == 2 or image_embeds.dim() >= 3:
-                image_embeds = image_embeds.view(-1, image_embeds.shape[-1])
-            else:
-                raise ValueError(
-                    f"Unexpected shape for image_embeds: {image_embeds.shape}")
-
             return Step3VLImageEmbeddingInputs(
                 type="image_embeds",
-                image_embeds=image_embeds.to(self.dtype).to(self.device),
+                image_embeds=image_embeds.to(self.dtype),
             )
-        return None
 
-    def _process_image_features(self,
-                                image_features: torch.Tensor) -> torch.Tensor:
+        raise AssertionError("This line should be unreachable.")
+
+    def _process_image_features(self, image_features: torch.Tensor) -> torch.Tensor:
         B, P = image_features.shape[:2]
         HW = int(sqrt(P))
         image_features = image_features.permute(0, 2, 1).view(B, -1, HW, HW)
@@ -959,26 +1032,29 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         image_features = self.vit_large_projector(image_features)
         return image_features
 
-    def _get_vision_model_output(self,
-                                 input_tensor: torch.Tensor) -> torch.Tensor:
+    def _get_vision_model_output(self, input_tensor: torch.Tensor) -> torch.Tensor:
         return self.vision_model(input_tensor)[:, 4:]
 
     def _process_image_input(
-            self, image_input: Step3VLImageInputs) -> tuple[torch.Tensor, ...]:
-
+        self, image_input: Step3VLImageInputs
+    ) -> tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             image_features = image_input["image_embeds"]
         else:
-            image_features = self._get_vision_model_output(
-                image_input["pixel_values"])
-            patch_image_features = self._get_vision_model_output(
-                image_input["patch_pixel_values"]
-            ) if image_input["patch_pixel_values"] is not None else None
+            image_features = self._get_vision_model_output(image_input["pixel_values"])
+            patch_image_features = (
+                self._get_vision_model_output(image_input["patch_pixel_values"])
+                if image_input["patch_pixel_values"] is not None
+                else None
+            )
             num_patches = image_input["num_patches"]
 
         image_features = self._process_image_features(image_features)
-        patch_image_features = self._process_image_features(
-            patch_image_features) if patch_image_features is not None else None
+        patch_image_features = (
+            self._process_image_features(patch_image_features)
+            if patch_image_features is not None
+            else None
+        )
 
         merged_image_features = []
         cur_patch_idx = 0
@@ -986,87 +1062,87 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             cur_feature = []
             if num_patch > 0:
                 patch_slice = patch_image_features[
-                    cur_patch_idx:cur_patch_idx + num_patch]
+                    cur_patch_idx : cur_patch_idx + num_patch
+                ]
                 cur_feature.append(patch_slice.view(-1, patch_slice.shape[-1]))
-            cur_feature.append(image_features[i].view(
-                -1, image_features.shape[-1]))
+            cur_feature.append(image_features[i].view(-1, image_features.shape[-1]))
             cur_patch_idx += num_patch
             merged_image_features.append(
-                torch.cat(cur_feature) if len(cur_feature) >
-                1 else cur_feature[0])
+                torch.cat(cur_feature) if len(cur_feature) > 1 else cur_feature[0]
+            )
         return merged_image_features
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def embed_multimodal(self, **kwargs) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
-            return None
+            return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        # Multi-modal token ID may exceed vocab size
+        handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
-        if multimodal_embeddings is None:
-            inputs_embeds = self.language_model.model.get_input_embeddings(
-                input_ids)
-        else:
-            is_text = input_ids != self.config.image_token_id
-            text_ids = input_ids[is_text]
-            text_embeds = self.language_model.model.get_input_embeddings(
-                text_ids)
-            inputs_embeds = torch.empty(input_ids.shape[0],
-                                        text_embeds.shape[-1],
-                                        dtype=text_embeds.dtype,
-                                        device=text_embeds.device)
-            inputs_embeds[is_text] = text_embeds
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.image_token_id)
-        return inputs_embeds
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            # always pass the input via `inputs_embeds`
-            # to make sure the computation graph is consistent
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            vision_embeddings = self.embed_multimodal(**kwargs)
+            inputs_embeds = self.embed_input_ids(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_id,
+            )
             input_ids = None
 
-        hidden_states = self.language_model(input_ids,
-                                            positions,
-                                            intermediate_tensors,
-                                            inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-
         skip_prefixes = []
         if self.vision_model is None and self.vit_large_projector is None:
             skip_prefixes = [
-                "vision_model.", "vit_downsampler.", "vit_downsampler2.",
-                "vit_large_projector."
+                "vision_model.",
+                "vit_downsampler.",
+                "vit_downsampler2.",
+                "vit_large_projector.",
             ]
 
         loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
-        loaded_weights = loader.load_weights(weights,
-                                             mapper=self.hf_to_vllm_mapper)
+        loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
         return loaded_weights
diff --git a/vllm/model_executor/models/swin.py b/vllm/model_executor/models/swin.py
index 30b441f5b4df00f84c4e4cd897e8d922d0d4ff59..a74fd80c06d8c9cf18ac36cf76b5f15e8e1c80b8 100644
--- a/vllm/model_executor/models/swin.py
+++ b/vllm/model_executor/models/swin.py
@@ -2,68 +2,72 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
 from transformers import SwinConfig
-from transformers.models.swin.modeling_swin import SwinEmbeddings
+from transformers.models.swin.modeling_swin import SwinEmbeddings, SwinPatchMerging
 from transformers.models.swin.modeling_swin import SwinLayer as HFSwinLayer
-from transformers.models.swin.modeling_swin import SwinPatchMerging
 from transformers.pytorch_utils import meshgrid
 
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 
 class SwinSelfAttention(nn.Module):
-
     def __init__(
         self,
         config: SwinConfig,
         dim: int,
         num_heads: int,
         window_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         if dim % num_heads != 0:
             raise ValueError(
                 f"The hidden size ({dim}) is not a multiple of the number of "
-                f"attention heads ({num_heads})")
+                f"attention heads ({num_heads})"
+            )
 
         self.num_attention_heads = num_heads
         self.attention_head_size = int(dim / num_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.window_size = (window_size if isinstance(window_size, Iterable)
-                            else (window_size, window_size))
+        self.window_size = (
+            window_size
+            if isinstance(window_size, Iterable)
+            else (window_size, window_size)
+        )
         self.scale = self.attention_head_size**-0.5
 
         self.relative_position_bias_table = nn.Parameter(
             torch.zeros(
-                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1),
-                num_heads))
+                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads
+            )
+        )
 
         # get pair-wise relative position index for each token inside the window
         coords_h = torch.arange(self.window_size[0])
         coords_w = torch.arange(self.window_size[1])
         coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
         coords_flatten = torch.flatten(coords, 1)
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:,
-                                                                      None, :]
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
         relative_coords = relative_coords.permute(1, 2, 0).contiguous()
         relative_coords[:, :, 0] += self.window_size[0] - 1
         relative_coords[:, :, 1] += self.window_size[1] - 1
         relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
         relative_position_index = relative_coords.sum(-1)
 
-        self.relative_position_index = nn.Parameter(relative_position_index,
-                                                    requires_grad=False)
+        self.relative_position_index = nn.Parameter(
+            relative_position_index, requires_grad=False
+        )
 
         self.qkv = QKVParallelLinear(
             hidden_size=dim,
@@ -75,27 +79,31 @@ class SwinSelfAttention(nn.Module):
         )
 
     def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
-                                       self.attention_head_size)
+        new_x_shape = x.size()[:-1] + (
+            self.num_attention_heads,
+            self.attention_head_size,
+        )
         x = x.view(new_x_shape)
         return x.permute(0, 2, 1, 3)
 
     def _get_rel_pos_bias(self) -> torch.Tensor:
         relative_position_bias = self.relative_position_bias_table[
-            self.relative_position_index.view(-1)]
+            self.relative_position_index.view(-1)
+        ]
         relative_position_bias = relative_position_bias.view(
             self.window_size[0] * self.window_size[1],
-            self.window_size[0] * self.window_size[1], -1)
-        relative_position_bias = relative_position_bias.permute(
-            2, 0, 1).contiguous()
+            self.window_size[0] * self.window_size[1],
+            -1,
+        )
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
         return relative_position_bias.unsqueeze(0)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
+        attention_mask: torch.FloatTensor | None = None,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
     ) -> tuple[torch.Tensor, ...]:
         batch_size, dim, num_channels = hidden_states.shape
 
@@ -110,43 +118,43 @@ class SwinSelfAttention(nn.Module):
         if attention_mask is not None:
             mask_shape = attention_mask.shape[0]
             attention_mask_expanded = attention_mask.view(
-                1, mask_shape, 1, dim,
-                dim).expand(batch_size // mask_shape, mask_shape,
-                            self.num_attention_heads, dim, dim)
-            attention_scores = attention_scores + \
-            attention_mask_expanded.unsqueeze(
-                1).unsqueeze(0)
-            attention_scores = attention_scores.view(-1,
-                                                     self.num_attention_heads,
-                                                     dim, dim)
+                1, mask_shape, 1, dim, dim
+            ).expand(
+                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
+            )
+            attention_scores = attention_scores + attention_mask_expanded.unsqueeze(
+                1
+            ).unsqueeze(0)
+            attention_scores = attention_scores.view(
+                -1, self.num_attention_heads, dim, dim
+            )
 
         context_layer = torch.nn.functional.scaled_dot_product_attention(
             query_layer,
             key_layer,
             value_layer,
             attn_mask=attention_scores,
-            dropout_p=0.,
+            dropout_p=0.0,
         )
         attention_probs = None
 
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        new_context_layer_shape = context_layer.size()[:-2] + (
-            self.all_head_size, )
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(new_context_layer_shape)
 
-        outputs = (context_layer,
-                   attention_probs) if output_attentions else (context_layer, )
+        outputs = (
+            (context_layer, attention_probs) if output_attentions else (context_layer,)
+        )
 
         return outputs
 
 
 class SwinSelfOutput(nn.Module):
-
     def __init__(
         self,
         config: SwinConfig,
         dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -157,61 +165,68 @@ class SwinSelfOutput(nn.Module):
             prefix=f"{prefix}.dense",
         )
 
-    def forward(self, hidden_states: torch.Tensor,
-                input_tensor: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
         hidden_states, _ = self.dense(hidden_states)
 
         return hidden_states
 
 
 class SwinAttention(nn.Module):
-
-    def __init__(self,
-                 config: SwinConfig,
-                 dim: int,
-                 num_heads: int,
-                 window_size: int,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: SwinConfig,
+        dim: int,
+        num_heads: int,
+        window_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
-        self.self = SwinSelfAttention(config,
-                                      dim,
-                                      num_heads,
-                                      window_size,
-                                      quant_config=quant_config,
-                                      prefix=f"{prefix}.self")
-        self.output = SwinSelfOutput(config,
-                                     dim,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.output")
+        self.self = SwinSelfAttention(
+            config,
+            dim,
+            num_heads,
+            window_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self",
+        )
+        self.output = SwinSelfOutput(
+            config, dim, quant_config=quant_config, prefix=f"{prefix}.output"
+        )
         self.pruned_heads = set()
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
+        attention_mask: torch.FloatTensor | None = None,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
     ) -> tuple[torch.Tensor]:
-        self_outputs = self.self(hidden_states, attention_mask, head_mask,
-                                 output_attentions)
+        self_outputs = self.self(
+            hidden_states, attention_mask, head_mask, output_attentions
+        )
         attention_output = self.output(self_outputs[0], hidden_states)
-        outputs = (attention_output, ) + self_outputs[1:]
+        outputs = (attention_output,) + self_outputs[1:]
         return outputs
 
 
 class SwinIntermediate(nn.Module):
-
-    def __init__(self,
-                 config: SwinConfig,
-                 dim: int,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: SwinConfig,
+        dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
-        self.dense = ColumnParallelLinear(dim,
-                                          int(config.mlp_ratio * dim),
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.dense")
+        self.dense = ColumnParallelLinear(
+            dim,
+            int(config.mlp_ratio * dim),
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
         self.intermediate_act_fn = get_act_fn(config.hidden_act)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -221,17 +236,20 @@ class SwinIntermediate(nn.Module):
 
 
 class SwinOutput(nn.Module):
-
-    def __init__(self,
-                 config: SwinConfig,
-                 dim: int,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: SwinConfig,
+        dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
-        self.dense = RowParallelLinear(int(config.mlp_ratio * dim),
-                                       dim,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.dense")
+        self.dense = RowParallelLinear(
+            int(config.mlp_ratio * dim),
+            dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.dense(hidden_states)
@@ -239,7 +257,6 @@ class SwinOutput(nn.Module):
 
 
 class SwinLayer(HFSwinLayer):
-
     def __init__(
         self,
         config: SwinConfig,
@@ -248,7 +265,7 @@ class SwinLayer(HFSwinLayer):
         num_heads: int,
         drop_path_rate: float = 0.0,
         shift_size: int = 0,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
@@ -260,24 +277,23 @@ class SwinLayer(HFSwinLayer):
             shift_size=shift_size,
         )
 
-        self.attention = SwinAttention(config,
-                                       dim,
-                                       num_heads,
-                                       window_size=self.window_size,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.attention")
-        self.intermediate = SwinIntermediate(config,
-                                             dim,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.intermediate")
-        self.output = SwinOutput(config,
-                                 dim,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.output")
+        self.attention = SwinAttention(
+            config,
+            dim,
+            num_heads,
+            window_size=self.window_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attention",
+        )
+        self.intermediate = SwinIntermediate(
+            config, dim, quant_config=quant_config, prefix=f"{prefix}.intermediate"
+        )
+        self.output = SwinOutput(
+            config, dim, quant_config=quant_config, prefix=f"{prefix}.output"
+        )
 
 
 class SwinStage(nn.Module):
-
     def __init__(
         self,
         config: SwinConfig,
@@ -286,31 +302,34 @@ class SwinStage(nn.Module):
         depth: int,
         num_heads: int,
         drop_path: list[float],
-        downsample: Optional[SwinPatchMerging] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        downsample: SwinPatchMerging | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.dim = dim
-        self.blocks = nn.ModuleList([
-            SwinLayer(config=config,
-                      dim=dim,
-                      input_resolution=input_resolution,
-                      num_heads=num_heads,
-                      drop_path_rate=drop_path[layer_idx],
-                      shift_size=0 if
-                      (layer_idx % 2 == 0) else config.window_size // 2,
-                      quant_config=quant_config,
-                      prefix=f"{prefix}.blocks.{layer_idx}")
-            for layer_idx in range(depth)
-        ])
+        self.blocks = nn.ModuleList(
+            [
+                SwinLayer(
+                    config=config,
+                    dim=dim,
+                    input_resolution=input_resolution,
+                    num_heads=num_heads,
+                    drop_path_rate=drop_path[layer_idx],
+                    shift_size=0 if (layer_idx % 2 == 0) else config.window_size // 2,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{layer_idx}",
+                )
+                for layer_idx in range(depth)
+            ]
+        )
 
         # patch merging layer
         if downsample is not None:
-            self.downsample = downsample(input_resolution,
-                                         dim=dim,
-                                         norm_layer=nn.LayerNorm)
+            self.downsample = downsample(
+                input_resolution, dim=dim, norm_layer=nn.LayerNorm
+            )
         else:
             self.downsample = None
 
@@ -320,33 +339,39 @@ class SwinStage(nn.Module):
         self,
         hidden_states: torch.Tensor,
         input_dimensions: tuple[int, int],
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
-        always_partition: Optional[bool] = False,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
+        always_partition: bool | None = False,
     ) -> tuple[torch.Tensor]:
         height, width = input_dimensions
         for i, layer_module in enumerate(self.blocks):
             layer_head_mask = head_mask[i] if head_mask is not None else None
 
-            layer_outputs = layer_module(hidden_states, input_dimensions,
-                                         layer_head_mask, output_attentions,
-                                         always_partition)
+            layer_outputs = layer_module(
+                hidden_states,
+                input_dimensions,
+                layer_head_mask,
+                output_attentions,
+                always_partition,
+            )
 
             hidden_states = layer_outputs[0]
 
         hidden_states_before_downsampling = hidden_states
         if self.downsample is not None:
-            height_downsampled, width_downsampled = (height + 1) // 2, (width +
-                                                                        1) // 2
-            output_dimensions = (height, width, height_downsampled,
-                                 width_downsampled)
-            hidden_states = self.downsample(hidden_states_before_downsampling,
-                                            input_dimensions)
+            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
+            output_dimensions = (height, width, height_downsampled, width_downsampled)
+            hidden_states = self.downsample(
+                hidden_states_before_downsampling, input_dimensions
+            )
         else:
             output_dimensions = (height, width, height, width)
 
-        stage_outputs = (hidden_states, hidden_states_before_downsampling,
-                         output_dimensions)
+        stage_outputs = (
+            hidden_states,
+            hidden_states_before_downsampling,
+            output_dimensions,
+        )
 
         if output_attentions:
             stage_outputs += layer_outputs[1:]
@@ -354,51 +379,66 @@ class SwinStage(nn.Module):
 
 
 class SwinEncoder(nn.Module):
-
     def __init__(
         self,
         config: SwinConfig,
         grid_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.num_layers = len(config.depths)
         self.config = config
         dpr = [
-            x.item() for x in torch.linspace(
-                0, config.drop_path_rate, sum(config.depths), device="cpu")
+            x.item()
+            for x in torch.linspace(
+                0, config.drop_path_rate, sum(config.depths), device="cpu"
+            )
         ]
-        self.layers = nn.ModuleList([
-            SwinStage(config=config,
-                      dim=int(config.embed_dim * 2**layer_idx),
-                      input_resolution=(grid_size[0] // (2**layer_idx),
-                                        grid_size[1] // (2**layer_idx)),
-                      depth=config.depths[layer_idx],
-                      num_heads=config.num_heads[layer_idx],
-                      drop_path=dpr[sum(config.depths[:layer_idx]
-                                        ):sum(config.depths[:layer_idx + 1])],
-                      downsample=SwinPatchMerging if
-                      (layer_idx < self.num_layers - 1) else None,
-                      quant_config=quant_config,
-                      prefix=f"{prefix}.layers.{layer_idx}")
-            for layer_idx in range(self.num_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                SwinStage(
+                    config=config,
+                    dim=int(config.embed_dim * 2**layer_idx),
+                    input_resolution=(
+                        grid_size[0] // (2**layer_idx),
+                        grid_size[1] // (2**layer_idx),
+                    ),
+                    depth=config.depths[layer_idx],
+                    num_heads=config.num_heads[layer_idx],
+                    drop_path=dpr[
+                        sum(config.depths[:layer_idx]) : sum(
+                            config.depths[: layer_idx + 1]
+                        )
+                    ],
+                    downsample=SwinPatchMerging
+                    if (layer_idx < self.num_layers - 1)
+                    else None,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(self.num_layers)
+            ]
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         input_dimensions: tuple[int, int],
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
-        always_partition: Optional[bool] = False,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
+        always_partition: bool | None = False,
     ) -> tuple[torch.Tensor]:
         for i, layer_module in enumerate(self.layers):
             layer_head_mask = head_mask[i] if head_mask is not None else None
 
-            layer_outputs = layer_module(hidden_states, input_dimensions,
-                                         layer_head_mask, output_attentions,
-                                         always_partition)
+            layer_outputs = layer_module(
+                hidden_states,
+                input_dimensions,
+                layer_head_mask,
+                output_attentions,
+                always_partition,
+            )
 
             hidden_states = layer_outputs[0]
             output_dimensions = layer_outputs[2]
@@ -414,25 +454,27 @@ class SwinModel(nn.Module):
     def __init__(
         self,
         config: SwinConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.num_layers = len(config.depths)
-        self.num_features = int(config.embed_dim * 2**(self.num_layers - 1))
+        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
 
         self.embeddings = SwinEmbeddings(config)
-        self.encoder = SwinEncoder(config,
-                                   self.embeddings.patch_grid,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.encoder")
+        self.encoder = SwinEncoder(
+            config,
+            self.embeddings.patch_grid,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+        )
 
     def forward(
         self,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
+        pixel_values: torch.FloatTensor | None = None,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = None,
     ) -> tuple[torch.Tensor]:
         embedding_output, input_dimensions = self.embeddings(pixel_values)
 
@@ -445,8 +487,7 @@ class SwinModel(nn.Module):
 
         return encoder_outputs
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             ("qkv", "query", "q"),
             ("qkv", "key", "k"),
@@ -456,8 +497,7 @@ class SwinModel(nn.Module):
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -468,8 +508,7 @@ class SwinModel(nn.Module):
                 break
             else:
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 3660efdc079aa91f344f107a963ce4654b3fe725..4d310712f303e8f9939446eb0e3c28221a442481 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -3,14 +3,17 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar,
-                    Union)
+from typing import Annotated, Final, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 import torch.nn as nn
-from transformers import BatchFeature, CLIPVisionConfig
+from transformers import (
+    BatchFeature,
+    CLIPVisionConfig,
+    PretrainedConfig,
+    SiglipVisionConfig,
+)
 from transformers import LlavaConfig as HfLlavaConfig
-from transformers import PretrainedConfig, SiglipVisionConfig
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
 from transformers.models.llava import LlavaProcessor
 from transformers.processing_utils import ProcessingKwargs, Unpack
@@ -18,30 +21,38 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize, MultiModalDataItems)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        InputProcessingContext,
-                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.parse import (
+    ImageEmbeddingItems,
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    InputProcessingContext,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
-from .vision import VisionEncoderInfo, get_vision_encoder_info
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
+from .vision import (
+    VisionEncoderInfo,
+    get_num_selected_vision_tokens,
+    get_vision_encoder_info,
+)
 
 
 class TarsierImagePixelInputs(TensorSchema):
@@ -52,6 +63,7 @@ class TarsierImagePixelInputs(TensorSchema):
         - h: Height
         - w: Width
     """
+
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
@@ -64,12 +76,12 @@ class TarsierImageEmbeddingInputs(TensorSchema):
         - hs: Hidden size (must match the hidden size of language model
           backbone)
     """
+
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-TarsierImageInputs = Union[TarsierImagePixelInputs,
-                           TarsierImageEmbeddingInputs]
+TarsierImageInputs: TypeAlias = TarsierImagePixelInputs | TarsierImageEmbeddingInputs
 
 
 class TarsierHfConfig(Protocol):  # Based on the Tarsier's LlavaConfig
@@ -77,7 +89,7 @@ class TarsierHfConfig(Protocol):  # Based on the Tarsier's LlavaConfig
     text_config: Final[PretrainedConfig]  # Added from Tarsier's LlavaConfig
     image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, list[int]]]
+    vision_feature_layer: Final[int | list[int]]
     projector_hidden_act: Final[str]
     image_newline_idx: Final[int]
     image_new_idx: Final[int]
@@ -94,19 +106,19 @@ class TarsierProcessorKwargs(ProcessingKwargs, total=False):
 
 
 class TarsierProcessor(LlavaProcessor):
-
     def __call__(
         self,
         images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput],
-                    list[PreTokenizedInput]] = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
         audio=None,
         videos=None,
         **kwargs: Unpack[TarsierProcessorKwargs],
     ) -> BatchFeature:
         if images is None and text is None:
-            raise ValueError(
-                "You have to specify at least one of `images` or `text`.")
+            raise ValueError("You have to specify at least one of `images` or `text`.")
 
         output_kwargs = self._merge_kwargs(
             TarsierProcessorKwargs,
@@ -115,15 +127,17 @@ class TarsierProcessor(LlavaProcessor):
         )
         if images is not None:
             image_inputs = self.image_processor(
-                images, **output_kwargs["images_kwargs"])
+                images, **output_kwargs["images_kwargs"]
+            )
         else:
             image_inputs = {}
 
         if isinstance(text, str):
             text = [text]
         elif not isinstance(text, list) and not isinstance(text[0], str):
-            raise ValueError("Invalid input text. Please provide a string,"
-                             " or a list of strings")
+            raise ValueError(
+                "Invalid input text. Please provide a string, or a list of strings"
+            )
 
         # try to expand inputs in processing if we have the necessary parts
         prompt_strings = text
@@ -131,51 +145,55 @@ class TarsierProcessor(LlavaProcessor):
             # Replace the image token with the expanded image token sequence
             pixel_values = image_inputs["pixel_values"]
             height, width = get_image_size(to_numpy_array(pixel_values[0]))
-            num_image_tokens = (height // self.patch_size) * (
-                width // self.patch_size +
-                1) + self.num_additional_image_tokens + 1
+            num_image_tokens = (
+                (height // self.patch_size) * (width // self.patch_size + 1)
+                + self.num_additional_image_tokens
+                + 1
+            )
             if self.vision_feature_select_strategy == "default":
                 num_image_tokens -= 1
 
             prompt_strings = []
             for sample in text:
-                sample = sample.replace(self.image_token,
-                                        self.image_token * num_image_tokens)
+                sample = sample.replace(
+                    self.image_token, self.image_token * num_image_tokens
+                )
                 prompt_strings.append(sample)
 
-        return_tensors = output_kwargs["text_kwargs"].pop(
-            "return_tensors", None)
-        text_inputs = self.tokenizer(prompt_strings,
-                                     **output_kwargs["text_kwargs"])
-        return BatchFeature(data={
-            **text_inputs,
-            **image_inputs
-        },
-                            tensor_type=return_tensors)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+        return BatchFeature(
+            data={**text_inputs, **image_inputs}, tensor_type=return_tensors
+        )
 
 
 class TarsierMultiModalProjector(nn.Module):
-
-    def __init__(self,
-                 vision_hidden_size: int,
-                 text_hidden_size: int,
-                 projector_hidden_act: str,
-                 multimodal_projector_bias: bool,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        projector_hidden_act: str,
+        multimodal_projector_bias: bool,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
-        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
-                                             text_hidden_size,
-                                             bias=multimodal_projector_bias,
-                                             quant_config=quant_config,
-                                             prefix=f"{prefix}.linear_1")
+        self.linear_1 = ColumnParallelLinear(
+            vision_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_1",
+        )
         self.act = get_act_fn(projector_hidden_act)
-        self.linear_2 = RowParallelLinear(text_hidden_size,
-                                          text_hidden_size,
-                                          bias=multimodal_projector_bias,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.linear_2")
+        self.linear_2 = RowParallelLinear(
+            text_hidden_size,
+            text_hidden_size,
+            bias=multimodal_projector_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_2",
+        )
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.linear_1(image_features)
@@ -185,7 +203,6 @@ class TarsierMultiModalProjector(nn.Module):
 
 
 class TarsierProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> TarsierHfConfig:
         return self.ctx.get_hf_config(HfLlavaConfig)
 
@@ -199,21 +216,9 @@ class TarsierProcessingInfo(BaseProcessingInfo):
 
         return self.ctx.get_hf_processor(TarsierProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
-    def _apply_feature_select_strategy(
-        self,
-        strategy: str,
-        encoder_num_image_tokens: int,
-    ) -> int:
-        if strategy == "default":
-            return encoder_num_image_tokens - 1
-        if strategy == "full":
-            return encoder_num_image_tokens
-        msg = f"Unexpected feature select strategy: {strategy!r}"
-        raise NotImplementedError(msg)
-
     def get_num_image_tokens(
         self,
         *,
@@ -222,29 +227,27 @@ class TarsierProcessingInfo(BaseProcessingInfo):
     ) -> int:
         hf_config = self.get_hf_config()
         vision_encoder_info = self.get_vision_encoder_info()
-        num_projected_patches = self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
+        num_projected_patches = get_num_selected_vision_tokens(
             vision_encoder_info.get_num_image_tokens(
                 image_width=image_width,
                 image_height=image_height,
             ),
+            hf_config.vision_feature_select_strategy,
         )
         if num_projected_patches <= 0:
             default_size = self.get_image_size_with_most_features()
-            num_projected_patches_default = self._apply_feature_select_strategy(
-                hf_config.vision_feature_select_strategy,
+            num_projected_patches_default = get_num_selected_vision_tokens(
                 vision_encoder_info.get_num_image_tokens(
                     image_width=default_size.width,
                     image_height=default_size.height,
                 ),
+                hf_config.vision_feature_select_strategy,
             )
             if num_projected_patches_default <= 0:
-                raise ValueError(
-                    "Could not determine a valid number of image patches.")
+                raise ValueError("Could not determine a valid number of image patches.")
             num_projected_patches = num_projected_patches_default
         num_height_patches = int(math.sqrt(num_projected_patches))
-        total_image_tokens_for_llm = num_projected_patches \
-            + num_height_patches + 1
+        total_image_tokens_for_llm = num_projected_patches + num_height_patches + 1
         return total_image_tokens_for_llm
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -270,12 +273,10 @@ _I_Tarsier = TypeVar("_I_Tarsier", bound=TarsierProcessingInfo)
 
 
 class TarsierDummyInputsBuilder(LlavaDummyInputsBuilder[_I_Tarsier]):
-
     pass
 
 
 class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -297,14 +298,14 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
 
         def get_replacement(item_idx: int):
             images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems))
+                "image", (ImageEmbeddingItems, ImageProcessorItems)
+            )
 
             if isinstance(images, ImageEmbeddingItems):
                 num_projected_patches = images.get_feature_size(item_idx)
                 # This assumes num_projected_patches is a perfect square
                 num_height_patches = int(math.sqrt(num_projected_patches))
-                num_final_image_tokens = num_projected_patches \
-                + num_height_patches + 1
+                num_final_image_tokens = num_projected_patches + num_height_patches + 1
             else:
                 image_size = images.get_image_size(item_idx)
                 num_final_image_tokens = self.info.get_num_image_tokens(
@@ -323,8 +324,7 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
         ]
 
 
-def _build_tarsier_hf_info(
-        ctx: InputProcessingContext) -> TarsierProcessingInfo:
+def _build_tarsier_hf_info(ctx: InputProcessingContext) -> TarsierProcessingInfo:
     return TarsierProcessingInfo(ctx)
 
 
@@ -332,7 +332,7 @@ def _build_tarsier_hf_processor(
     info: _I_Tarsier,
     dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     if isinstance(info, TarsierProcessingInfo):
         return TarsierMultiModalProcessor(
@@ -345,32 +345,33 @@ def _build_tarsier_hf_processor(
 
 def init_vision_tower_for_tarsier(
     hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    require_post_norm: Optional[bool] = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
-) -> Union[CLIPVisionModel, SiglipVisionModel]:
+) -> CLIPVisionModel | SiglipVisionModel:
     vision_config = hf_config.vision_config
 
     feature_layers = hf_config.vision_feature_layer
     base_num_hidden_layers = vision_config.num_hidden_layers
 
-    def _get_layer_index(feature_layer_index: int,
-                         num_hidden_layers_total: int) -> int:
+    def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) -> int:
         if feature_layer_index < 0:
             return num_hidden_layers_total + feature_layer_index + 1
         return feature_layer_index
 
     if isinstance(feature_layers, int):
-        num_hidden_layers_to_init = _get_layer_index(feature_layers,
-                                                     base_num_hidden_layers)
+        num_hidden_layers_to_init = _get_layer_index(
+            feature_layers, base_num_hidden_layers
+        )
     elif isinstance(feature_layers, (list, tuple)):
         num_hidden_layers_to_init = max(
-            _get_layer_index(idx, base_num_hidden_layers)
-            for idx in feature_layers)
+            _get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
+        )
     else:
-        raise TypeError(f"vision_layer_feature type: {type(feature_layers)}"
-                        " is not supported")
+        raise TypeError(
+            f"vision_layer_feature type: {type(feature_layers)} is not supported"
+        )
 
     if isinstance(vision_config, CLIPVisionConfig):
         return CLIPVisionModel(
@@ -393,18 +394,21 @@ def init_vision_tower_for_tarsier(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_processor(_build_tarsier_hf_processor,
-                                        info=_build_tarsier_hf_info,
-                                        dummy_inputs=TarsierDummyInputsBuilder)
-class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsPP):
+@MULTIMODAL_REGISTRY.register_processor(
+    _build_tarsier_hf_processor,
+    info=_build_tarsier_hf_info,
+    dummy_inputs=TarsierDummyInputsBuilder,
+)
+class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -419,7 +423,8 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
             config,
             quant_config,
             require_post_norm=False,
-            prefix=maybe_prefix(prefix, "vision_tower"))
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
         projector_bias = getattr(config, "multimodal_projector_bias", True)
 
         self.multi_modal_projector = TarsierMultiModalProjector(
@@ -428,27 +433,31 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
             projector_hidden_act=config.projector_hidden_act,
             multimodal_projector_bias=projector_bias,
             quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "multi_modal_projector"))
+            prefix=maybe_prefix(prefix, "multi_modal_projector"),
+        )
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
-            hf_config=config.
-            text_config,  # Use text_config from Tarsier's main config
+            hf_config=config.text_config,  # Use text_config from Tarsier's main config
             prefix=maybe_prefix(prefix, "language_model"),
         )
-        self.register_buffer('image_newline_idx_tensor',
-                             torch.tensor([config.image_newline_idx],
-                                          dtype=torch.long),
-                             persistent=False)
-        self.register_buffer('image_new_idx_tensor',
-                             torch.tensor([config.image_new_idx],
-                                          dtype=torch.long),
-                             persistent=False)
+        self.register_buffer(
+            "image_newline_idx_tensor",
+            torch.tensor([config.image_newline_idx], dtype=torch.long),
+            persistent=False,
+        )
+        self.register_buffer(
+            "image_new_idx_tensor",
+            torch.tensor([config.image_new_idx], dtype=torch.long),
+            persistent=False,
+        )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[TarsierImageInputs]:
+        self, **kwargs: object
+    ) -> TarsierImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -456,69 +465,49 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             return TarsierImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values, concat=True),
+                pixel_values=pixel_values,
             )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
             return TarsierImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
 
-    def _select_image_features(self, image_features: torch.Tensor, *,
-                               strategy: str) -> torch.Tensor:
-        if strategy == "default":
-            return image_features[:, 1:]
-        elif strategy == "full":
-            return image_features
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
-
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
-        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
+        pixel_values: torch.Tensor | list[torch.Tensor],
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         # From vLLM LLaVA, vision tower output handling
-        image_hidden_states: Union[torch.Tensor, tuple[torch.Tensor, ...]] = \
-            vision_tower(pixel_values)
-
-        def select_features_fn(leaf: torch.Tensor):
-            return self._select_image_features(
-                leaf,
-                strategy=self.config.vision_feature_select_strategy,
-            )
-
-        return json_map_leaves(select_features_fn, image_hidden_states)
+        return vision_tower(
+            pixel_values,
+            feature_select_strategy=self.config.vision_feature_select_strategy,
+        )
 
     def _add_tarsier_split_tokens(
-            self, projected_image_features: torch.Tensor) -> torch.Tensor:
+        self, projected_image_features: torch.Tensor
+    ) -> torch.Tensor:
         """
         Implements Tarsier's `add_split_tokens` logic.
         """
-        num_images, num_projected_patches, embed_dim = \
-            projected_image_features.shape
+        num_images, num_projected_patches, embed_dim = projected_image_features.shape
         num_height_patches = int(math.sqrt(num_projected_patches))
         num_width_patches = num_projected_patches // num_height_patches
         device = projected_image_features.device
         embedding_layer = self.language_model.model.embed_tokens
         image_newline_emb = embedding_layer(
-            self.image_newline_idx_tensor.to(device)).squeeze(0)
-        image_new_emb = embedding_layer(
-            self.image_new_idx_tensor.to(device)).squeeze(0)
+            self.image_newline_idx_tensor.to(device)
+        ).squeeze(0)
+        image_new_emb = embedding_layer(self.image_new_idx_tensor.to(device)).squeeze(0)
         try:
             current_image_features_grid = projected_image_features.view(
-                num_images, num_height_patches, num_width_patches, embed_dim)
+                num_images, num_height_patches, num_width_patches, embed_dim
+            )
         except RuntimeError as e:
             raise RuntimeError(
                 "Cannot reshape projected_image_features"
@@ -528,112 +517,103 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
                 "Ensure num_projected_patches is compatible"
                 " with a grid structure. "
                 f"num_projected_patches={num_projected_patches}, "
-                f"derived num_height_patches={num_height_patches}. ") from e
+                f"derived num_height_patches={num_height_patches}. "
+            ) from e
 
         image_newline_expanded = image_newline_emb.expand(
-            (num_images, num_height_patches, 1, embed_dim))
+            (num_images, num_height_patches, 1, embed_dim)
+        )
         features_with_newlines = torch.cat(
             [current_image_features_grid, image_newline_expanded],
-            dim=2  # Concatenate along width dim
+            dim=2,  # Concatenate along width dim
         )
-        new_num_patches_after_newline = num_projected_patches \
-            + num_height_patches
+        new_num_patches_after_newline = num_projected_patches + num_height_patches
         features_with_newlines_flat = features_with_newlines.view(
-            num_images, new_num_patches_after_newline, embed_dim)
+            num_images, new_num_patches_after_newline, embed_dim
+        )
         image_new_expanded = image_new_emb.expand((num_images, 1, embed_dim))
         final_image_features = torch.cat(
             [features_with_newlines_flat, image_new_expanded],
-            dim=1  # Concatenate along patch sequence dim
+            dim=1,  # Concatenate along patch sequence dim
         )
         return final_image_features
 
     def _process_image_pixels(
         self,
         inputs: TarsierImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
         pixel_values = inputs["pixel_values"]
         image_features_selected = self._image_pixels_to_features(
-            self.vision_tower, pixel_values)  # type: ignore
+            self.vision_tower, pixel_values
+        )  # type: ignore
         if isinstance(image_features_selected, torch.Tensor):
-            projected_features = self.multi_modal_projector(
-                image_features_selected)
+            projected_features = self.multi_modal_projector(image_features_selected)
             final_features = self._add_tarsier_split_tokens(projected_features)
             return final_features
         else:
             raise TypeError(
                 f"_image_pixels_to_features type:"
-                f" {type(image_features_selected)} is not supported")
+                f" {type(image_features_selected)} is not supported"
+            )
 
     def _process_image_input(
         self,
         image_input: TarsierImageInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             projected_features = image_input["data"]
             if isinstance(projected_features, torch.Tensor):
                 return self._add_tarsier_split_tokens(projected_features)
             else:
-                raise ValueError("Incorrect type of image_embeds. "
-                                 f"Got type: {type(projected_features)}. ")
+                raise ValueError(
+                    "Incorrect type of image_embeds. "
+                    f"Got type: {type(projected_features)}. "
+                )
         assert self.vision_tower is not None
         return self._process_image_pixels(image_input)
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         return self._process_image_input(image_input)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_index,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            vision_embeddings = self.embed_multimodal(**kwargs)
+            inputs_embeds = self.embed_input_ids(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_index,
+            )
             input_ids = None
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
             intermediate_tensors=intermediate_tensors,
-            inputs_embeds=inputs_embeds)
+            inputs_embeds=inputs_embeds,
+        )
         return hidden_states
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 49a7677151a948a29f1978d74ac75d6f2f0765bd..113581d55ff56c6398336e6186a5710a2674b0d0 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -30,12 +30,15 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaForCausalLM, LlamaModel
 
 from .llama import LlamaDecoderLayer
-from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    is_pp_missing_parameter)
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    WeightsMapper,
+    is_pp_missing_parameter,
+)
 
 
 class TeleChat2Model(LlamaModel):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         hf_config = vllm_config.model_config.hf_config
 
@@ -43,7 +46,7 @@ class TeleChat2Model(LlamaModel):
             "num_hidden_layers": "n_layer",
             "num_attention_heads": "n_head",
             "intermediate_size": "ffn_hidden_size",
-            "rms_norm_eps": "layer_norm_epsilon"
+            "rms_norm_eps": "layer_norm_epsilon",
         }
         vllm_config.model_config.hf_config.hidden_act = "silu"
 
@@ -62,11 +65,10 @@ class TeleChat2Model(LlamaModel):
                 layer.mlp.gate_up_proj.bias = None
                 layer.mlp.gate_up_proj.skip_bias_add = True
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
-            ('gate_up_proj', 'gate_proj', 0),
-            ('gate_up_proj', 'up_proj', 1),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -78,9 +80,10 @@ class TeleChat2Model(LlamaModel):
                 v_weight = []
                 for i in range(total_num_heads):
                     start = i * head_dim * 2
-                    k_weight.append(loaded_weight[start:start + head_dim, :])
-                    v_weight.append(loaded_weight[start + head_dim:start +
-                                                  2 * head_dim:])
+                    k_weight.append(loaded_weight[start : start + head_dim, :])
+                    v_weight.append(
+                        loaded_weight[start + head_dim : start + 2 * head_dim :]
+                    )
                 k_weight = torch.cat(k_weight, dim=0)
                 v_weight = torch.cat(v_weight, dim=0)
                 name = name.replace("key_value", "qkv_proj")
@@ -112,15 +115,15 @@ class TeleChat2Model(LlamaModel):
                     if is_pp_missing_parameter(name, self):
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class TeleChat2ForCausalLM(LlamaForCausalLM):
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "transformer.": "model.",
@@ -134,18 +137,17 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
         },
     )
 
-    def _init_model(self,
-                    vllm_config: VllmConfig,
-                    prefix: str = "",
-                    layer_type: type[nn.Module] = LlamaDecoderLayer):
+    def _init_model(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: type[nn.Module] = LlamaDecoderLayer,
+    ):
         return TeleChat2Model(vllm_config=vllm_config, prefix=prefix)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]
-                           if self.config.tie_word_embeddings else None),
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py
index 3666f7011a9979082c8a8fc4d28c472439b01b17..8a0bec9dff848a7a3d9957ff71c2b3b6c25be397 100644
--- a/vllm/model_executor/models/teleflm.py
+++ b/vllm/model_executor/models/teleflm.py
@@ -28,12 +28,14 @@ import torch.nn as nn
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.models.llama import (LlamaDecoderLayer,
-                                              LlamaForCausalLM, LlamaModel)
+from vllm.model_executor.models.llama import (
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+)
 
 
 class TeleFLMModel(LlamaModel):
-
     def __init__(
         self,
         *,
@@ -41,9 +43,7 @@ class TeleFLMModel(LlamaModel):
         prefix: str = "",
         layer_type: type[nn.Module] = LlamaDecoderLayer,
     ):
-        super().__init__(vllm_config=vllm_config,
-                         prefix=prefix,
-                         layer_type=layer_type)
+        super().__init__(vllm_config=vllm_config, prefix=prefix, layer_type=layer_type)
         """
         This implementation is based on the µScaling paper presented at  
         the ICLR 2025 Workshop:  
@@ -57,7 +57,7 @@ class TeleFLMModel(LlamaModel):
         if self.use_mup:
             self.input_mult = self.config.input_mult
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         embedding = self.embed_tokens(input_ids)
         if self.use_mup:
             embedding = embedding * self.input_mult
@@ -65,7 +65,6 @@ class TeleFLMModel(LlamaModel):
 
 
 class TeleFLMForCausalLM(LlamaForCausalLM):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         # mup
@@ -74,6 +73,6 @@ class TeleFLMForCausalLM(LlamaForCausalLM):
             self.mup_scale_factor = self.config.mup_scale_factor
             self.output_mult = self.config.output_mult / self.mup_scale_factor
             logit_scale = self.output_mult
-            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                    self.config.vocab_size,
-                                                    logit_scale)
+            self.logits_processor = LogitsProcessor(
+                self.unpadded_vocab_size, self.config.vocab_size, logit_scale
+            )
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index b9dfa8e9b6f51ff52300ff75c5e8949a6abc43c3..19052c8d49e44510564cee8010a563519d5d5a2f 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -18,36 +18,56 @@
 """Wrapper around `Terratorch` models"""
 
 from collections import OrderedDict
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from typing import Any
 
 import torch
 import torch.nn as nn
-from terratorch.vllm import (DummyDataGenerator, InferenceRunner,
-                             InputDefinition, InputTypeEnum)
+from terratorch.vllm import (
+    DummyDataGenerator,
+    InferenceRunner,
+    InputDefinition,
+    InputTypeEnum,
+)
 from transformers import BatchFeature
 
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.logger import init_logger
+from vllm.model_executor.layers.pooler import DispatchPooler, DummyPooler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import AutoWeightsLoader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
-from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargsItems,
-                                    MultiModalUUIDDict, PlaceholderRange)
-from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
-                                   MultiModalDataItems, MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptUpdate)
+from vllm.multimodal.inputs import (
+    ImageItem,
+    ModalityData,
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+    PlaceholderRange,
+)
+from vllm.multimodal.parse import (
+    DictEmbeddingItems,
+    ModalityDataItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import (IsAttentionFree, MultiModalEmbeddings,
-                         SupportsMultiModal)
+from .interfaces import IsAttentionFree, MultiModalEmbeddings, SupportsMultiModal
 from .interfaces_base import default_pooling_type
 
+logger = init_logger(__name__)
+
 
 def _terratorch_field_names(pretrained_cfg: dict):
     input_definition = InputDefinition(**pretrained_cfg["input"])
@@ -55,12 +75,11 @@ def _terratorch_field_names(pretrained_cfg: dict):
 
 
 def _terratorch_field_factory(
-    pretrained_cfg: dict
+    pretrained_cfg: dict,
 ) -> Callable[
     [Mapping[str, torch.Tensor]],
-        Mapping[str, MultiModalFieldConfig],
+    Mapping[str, MultiModalFieldConfig],
 ]:
-
     def _terratorch_field_config(hf_inputs: Mapping[str, torch.Tensor]):
         input_definition = InputDefinition(**pretrained_cfg["input"])
         fields = {}
@@ -68,27 +87,25 @@ def _terratorch_field_factory(
             if input.type == InputTypeEnum.tensor:
                 fields[input_name] = "image"
 
-        mm_fields_config = {}
-        for field_name, field_modality in fields.items():
-            mm_fields_config[field_name] = MultiModalFieldConfig.shared(
-                batch_size=1, modality=field_modality)
-        return mm_fields_config
+        return {
+            field_name: MultiModalFieldConfig.batched(modality=field_modality)
+            for field_name, field_modality in fields.items()
+        }
 
     return _terratorch_field_config
 
 
 class TerratorchProcessingInfo(BaseProcessingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
 
 class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
-
     def __init__(self, info: TerratorchProcessingInfo):
         super().__init__(info)
         self.dummy_data_generator = DummyDataGenerator(
-            self.info.get_hf_config().to_dict()["pretrained_cfg"])
+            self.info.get_hf_config().to_dict()["pretrained_cfg"]
+        )
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
@@ -97,24 +114,31 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         # Dummy data is generated based on the 'input' section
         # defined in the HF configuration file
+
+        if mm_options:
+            logger.warning(
+                "Configurable multimodal profiling "
+                "options are not supported for Terratorch. "
+                "They are ignored for now."
+            )
+
         return self.dummy_data_generator.get_dummy_mm_data()
 
 
 class TerratorchMultiModalDataParser(MultiModalDataParser):
-
     def __init__(self, pretrained_cfg: dict, *args, **kwargs):
         self._pretrained_cfg = pretrained_cfg
         super().__init__(*args, **kwargs)
 
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
-
             terratorch_fields = _terratorch_field_names(self._pretrained_cfg)
 
             return DictEmbeddingItems(
@@ -128,20 +152,18 @@ class TerratorchMultiModalDataParser(MultiModalDataParser):
 
 
 class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
-
     def __init__(
-            self,
-            info: TerratorchProcessingInfo,
-            dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]",
-            *,
-            cache: Optional[MultiModalProcessorOnlyCache] = None) -> None:
-
+        self,
+        info: TerratorchProcessingInfo,
+        dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]",
+        *,
+        cache: MultiModalProcessorOnlyCache | None = None,
+    ) -> None:
         self.pretrained_cfg = info.get_hf_config().to_dict()["pretrained_cfg"]
         super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache)
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        return TerratorchMultiModalDataParser(
-            pretrained_cfg=self.pretrained_cfg)
+        return TerratorchMultiModalDataParser(pretrained_cfg=self.pretrained_cfg)
 
     def _get_mm_fields_config(
         self,
@@ -160,37 +182,37 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         if "image" in mm_data:
             image_data = mm_data["image"]
+            image_data = {k: v.unsqueeze(0) for k, v in image_data.items()}
         else:
             image_data = mm_data
-            mm_data = {"image": mm_data}
+            image_data = {k: v.unsqueeze(0) for k, v in image_data.items()}
+
+        mm_data = {"image": image_data}
 
         mm_items = self._to_mm_items(mm_data)
         tokenization_kwargs = tokenization_kwargs or {}
-        mm_hashes = self._hash_mm_items(mm_items,
-                                        hf_processor_mm_kwargs,
-                                        tokenization_kwargs,
-                                        mm_uuids=mm_uuids)
+        mm_hashes = self._hash_mm_items(
+            mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
+        )
         mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
 
         mm_processed_data = BatchFeature(image_data)
 
         mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
             mm_processed_data,
-            self._get_mm_fields_config(mm_processed_data,
-                                       hf_processor_mm_kwargs),
+            self._get_mm_fields_config(mm_processed_data, hf_processor_mm_kwargs),
         )
 
         return MultiModalInputs(
             type="multimodal",
-            prompt=prompt,
             prompt_token_ids=[1],
             mm_kwargs=mm_kwargs,
             mm_hashes=mm_hashes,
@@ -205,11 +227,12 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
     dummy_inputs=TerratorchInputBuilder,
 )
 class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
+    merge_by_field_config = True
     supports_multimodal_raw_input_only = True
     is_pooling_model = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -226,13 +249,15 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)}, )
+        self.pooler = DispatchPooler({"plugin": DummyPooler()})
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         # We do not really use any input tokens and therefore no embeddings
         # to be calculated. However, due to the mandatory token ids in
@@ -242,18 +267,17 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         model_output = self.inference_runner.forward(**kwargs)
 
         return model_output.output
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_list = []
         model_buffers = dict(self.named_buffers())
         loaded_buffers = []
@@ -276,8 +300,9 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
                             if "_timm_module." in name:
                                 name = name.replace("_timm_module.", "")
                             buffer = model_buffers[name]
-                            weight_loader = getattr(buffer, "weight_loader",
-                                                    default_weight_loader)
+                            weight_loader = getattr(
+                                buffer, "weight_loader", default_weight_loader
+                            )
                             weight_loader(buffer, weight)
                             loaded_buffers.append(name)
                         else:
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
deleted file mode 100644
index 19dd242f16eb62bbfd400757b823cc62270ef0ae..0000000000000000000000000000000000000000
--- a/vllm/model_executor/models/transformers.py
+++ /dev/null
@@ -1,948 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Copyright 2024 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Wrapper around `transformers` models"""
-from collections.abc import Iterable, Mapping
-from contextlib import contextmanager
-from pathlib import Path
-from typing import Literal, Optional, Union
-
-import regex as re
-import torch
-from torch import nn
-from transformers import (AutoModel, BatchFeature, PretrainedConfig,
-                          PreTrainedModel)
-from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
-
-from vllm.attention import Attention, AttentionType
-from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, VllmConfig)
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.distributed.utils import get_pp_indices
-from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalUUIDDict,
-                                    PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
-
-from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP,
-                         SupportsQuant)
-from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    flatten_bn, make_empty_intermediate_tensors_factory,
-                    maybe_prefix)
-
-logger = init_logger(__name__)
-
-
-def get_feature_request_tip(
-    model: str,
-    trust_remote_code: bool,
-) -> str:
-    hf_url = f"a discussion at https://huggingface.co/{model}/discussions/new"
-    gh_url = "an issue at https://github.com/huggingface/transformers/issues/new/choose"
-    url = hf_url if trust_remote_code else gh_url
-    prefix = f"Please open {url} to request support for this feature. "
-    if Path(model).exists():
-        prefix = ""
-    doc_url = "https://docs.vllm.ai/en/latest/models/supported_models.html#writing-custom-models"
-    tip = f"See {doc_url} for instructions on how to add support yourself."
-    return f"{prefix}{tip}"
-
-
-def vllm_flash_attention_forward(
-        # Transformers args
-        module: torch.nn.Module,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        attention_mask: torch.Tensor,
-        # Transformers kwargs
-        scaling: Optional[float] = None,
-        # vLLM kwargs
-        attention_instances: Optional[dict[Attention]] = None,
-        **kwargs):
-    self_attn = attention_instances[module.layer_idx]
-    if scaling is not None:
-        self_attn.impl.scale = float(scaling)
-    hidden = query.shape[-2]
-    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
-    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
-    return self_attn.forward(query, key, value), None
-
-
-ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
-
-
-def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
-    logger.debug("%s: %s -> %s", name, old_module, new_module)
-
-
-def can_enable_torch_compile(vllm_config: VllmConfig) -> bool:
-    """
-    Callable to be passed to `@support_torch_compile`'s `enable_if` argument.
-
-    Defaults to `True` but is disabled in the following situations:
-
-    - The model uses dynamic rope scaling.
-    """
-    enable = True
-    text_config = vllm_config.model_config.hf_config.get_text_config()
-    # Dynamic rope scaling is not compatible with torch.compile
-    rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {}
-    if rope_scaling.get("rope_type") == "dynamic":
-        enable = False
-    return enable
-
-
-def replace_linear_class(
-    linear: nn.Linear,
-    style: Literal["colwise", "rowwise"],
-    quant_config: QuantizationConfig,
-    *,
-    prefix: str = "",
-) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
-    """
-    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
-
-    Args:
-        linear (nn.Linear): `nn.Linear` to be replaced.
-        style (str): Tensor parallel style of the new linear, e.g. "colwise".
-        quant_config (QuantConfig): Quantization config for the new linear.
-    Returns:
-        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
-    """
-
-    if not isinstance(style, str):
-        raise ValueError(
-            f"Unsupported parallel style type {type(style)}, expected str")
-
-    vllm_linear_cls, vllm_linear_kwargs = {
-        "colwise": (ColumnParallelLinear, {}),
-        "colwise_rep": (ColumnParallelLinear, {
-            "gather_output": True
-        }),
-        "rowwise": (RowParallelLinear, {}),
-        "rowwise_rep": (RowParallelLinear, {
-            "input_is_parallel": False
-        }),
-        "replicate": (ReplicatedLinear, {}),
-    }.get(style, (ReplicatedLinear, {}))
-
-    return vllm_linear_cls(
-        input_size=linear.in_features,
-        output_size=linear.out_features,
-        bias=linear.bias is not None,
-        quant_config=quant_config,
-        prefix=prefix,
-        return_bias=False,
-        **vllm_linear_kwargs,
-    )
-
-
-# Copied from `accelerate`
-@contextmanager
-def init_on_device_without_buffers(device: torch.device):
-    """
-    A context manager under which models are initialized with all
-    parameters on the specified device. However buffers are not
-    initialized on specified device.
-
-    Args:
-        device (`torch.device`):
-            Device to initialize all parameters on.
-    """
-
-    old_register_parameter = nn.Module.register_parameter
-
-    def register_empty_parameter(module, name, param):
-        old_register_parameter(module, name, param)
-        if param is not None:
-            param_cls = type(module._parameters[name])
-            kwargs = module._parameters[name].__dict__
-            kwargs["requires_grad"] = param.requires_grad
-            module._parameters[name] = param_cls(
-                module._parameters[name].to(device), **kwargs)
-
-    tensor_constructors_to_patch = {}
-
-    def patch_tensor_constructor(fn):
-
-        def wrapper(*args, **kwargs):
-            kwargs["device"] = device
-            return fn(*args, **kwargs)
-
-        return wrapper
-
-    try:
-        nn.Module.register_parameter = register_empty_parameter
-        for torch_function_name in tensor_constructors_to_patch:
-            setattr(
-                torch, torch_function_name,
-                patch_tensor_constructor(getattr(torch, torch_function_name)))
-        yield
-    finally:
-        nn.Module.register_parameter = old_register_parameter
-        for torch_function_name, old_torch_function in (
-                tensor_constructors_to_patch.items()):
-            setattr(torch, torch_function_name, old_torch_function)
-
-
-class MultiModalProcessingInfo(BaseProcessingInfo):
-
-    def get_hf_config(self):
-        return self.ctx.model_config.hf_config
-
-    def get_supported_mm_limits(self):
-        return {"image": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
-        return {"image": self.get_max_image_tokens()}
-
-    def get_max_image_tokens(self) -> int:
-        width, height = self.get_max_image_size()
-        processor = self.get_hf_processor()
-        multimodal_config = self.ctx.model_config.multimodal_config
-        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
-        mm_tokens = processor._get_num_multimodal_tokens(
-            image_sizes=([height, width], ), **mm_processor_kwargs)
-        image_tokens = mm_tokens["num_image_tokens"][0]
-        return image_tokens
-
-    def get_max_image_size(self):
-        return 10_000, 10_000  # hardcode for arbitrary very large size
-
-
-class MultiModalDummyInputsBuilder(
-        BaseDummyInputsBuilder[MultiModalProcessingInfo]):
-
-    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-        num_images = mm_counts.get("image", 0)
-
-        processor = self.info.get_hf_processor()
-        if "gemma3" in processor.__class__.__name__.lower():
-            image_token = processor.boi_token
-        else:
-            image_token = getattr(processor, "image_token", "")
-        return image_token * num_images
-
-    def get_dummy_mm_data(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> MultiModalDataDict:
-        num_images = mm_counts.get("image", 0)
-
-        target_width, target_height = self.info.get_max_image_size()
-
-        return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-        }
-
-
-class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
-
-    def _get_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargsItems,
-    ):
-        """
-        Given the original multi-modal items for this modality
-        and HF-processed data, output the updates to perform.
-
-        The information returned by this method is used to update token inputs
-        which bypass the HF processor. It is also used to update the output of
-        HF processor if the HF process does not apply prompt updates to text
-        inputs.
-
-        Moreover, this information is critical to determine the token positions
-        in order to construct  :class:`~vllm-multimodal.input.PlaceholderRange`
-        for each multi-modal item.
-        """
-        return None
-
-    def _get_mm_fields_config(
-        self,
-        hf_inputs,
-        hf_processor_mm_kwargs,
-        num_image_patches: torch.Tensor = None,
-    ):
-        # HF Processors always return a mask but vLLM doesn't need it
-        hf_inputs.pop("attention_mask", None)
-        mm_fields = {
-            key: MultiModalFieldConfig.flat_from_sizes("image",
-                                                       num_image_patches)
-            for key in hf_inputs
-        }
-        mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
-            "image", num_image_patches)
-        mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")
-        return mm_fields
-
-    def _apply_hf_processor_text_mm(
-        self,
-        prompt_text: str,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Mapping[str, object],
-    ) -> tuple[list[int], BatchFeature, bool]:
-        """
-        Apply the HF processor on the prompt text and multi-modal data
-        together.
-
-        In addition, return whether prompt replacements have been applied.
-        """
-        processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
-        processor_data["return_mm_token_type_ids"] = True
-
-        processed_data = self._call_hf_processor(
-            prompt=prompt_text,
-            mm_data=processor_data,
-            mm_kwargs=hf_processor_mm_kwargs,
-            tok_kwargs=tokenization_kwargs,
-        )
-        processed_data.update(passthrough_data)
-
-        prompt_ids, = processed_data.pop("input_ids").tolist()
-        mm_token_type_ids = processed_data.pop(
-            "mm_token_type_ids"
-        ) if "mm_token_type_ids" in processed_data else processed_data.pop(
-            "token_type_ids")  # for gemma3 only
-
-        return prompt_ids, processed_data, mm_token_type_ids
-
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
-    ) -> MultiModalInputs:
-        """
-        Process multi-modal inputs to be used in vLLM.
-
-        Apply HF Processor on prompt text and multi-modal data together,
-        outputting token IDs and processed tensors.
-        """
-        if tokenization_kwargs is None:
-            tokenization_kwargs = {}
-
-        mm_items = self._to_mm_items(mm_data)
-        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        if not isinstance(prompt, str):
-            # the prompt is the tokenized ids which is not supported
-            # by the hf_processor, which is why we would need to decode the ids
-            # into string
-            prompt = hf_processor.decode(prompt)
-
-        (prompt_ids, processed_data,
-         mm_token_type_ids) = self._apply_hf_processor_text_mm(
-             prompt_text=prompt,
-             mm_items=mm_items,
-             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
-             tokenization_kwargs=tokenization_kwargs,
-         )
-
-        # HF processor will return `mm_token_type_ids` from which
-        # we can infer mm_placeholders. Until then hardcode to make code run
-        # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
-        mm_positions = torch.where(mm_token_type_ids == 1)[1]
-        images = mm_items.get_items("image", ImageProcessorItems)
-        multimodal_config = self.info.ctx.model_config.multimodal_config
-        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
-        image_sizes = []
-        for item_idx in range(len(images)):
-            image_size = images.get_image_size(item_idx)
-            image_sizes.append((image_size.height, image_size.width))
-
-        mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
-            image_sizes=image_sizes, **mm_processor_kwargs)
-
-        mm_placeholders = {}
-        split_sizes = mm_tokens_per_modality["num_image_tokens"]
-        if split_sizes:
-            chunked_mm_positions = torch.split(mm_positions, split_sizes)
-            mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()]
-            chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
-            ranges = [
-                PlaceholderRange(
-                    offset=positions[0].item(),
-                    length=positions.shape[0],
-                    is_embed=(mm_tokens == hf_processor.image_token_id).bool())
-                for positions, mm_tokens in zip(chunked_mm_positions,
-                                                chunked_mm_tokens)
-            ]
-            mm_placeholders = {"image": ranges}
-
-        num_image_patches = torch.tensor(
-            mm_tokens_per_modality["num_image_patches"]
-        ) if "num_image_patches" in mm_tokens_per_modality else None
-        processed_data['num_image_patches'] = num_image_patches
-        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
-            processed_data,
-            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
-                                       num_image_patches),
-        )
-
-        # Use overrides if provided; fallback to data-dependent hashing.
-        mm_hashes = self._hash_mm_items(mm_items,
-                                        hf_processor_mm_kwargs,
-                                        tokenization_kwargs,
-                                        mm_uuids=mm_uuids)
-
-        return MultiModalInputs(
-            type="multimodal",
-            prompt=prompt,
-            prompt_token_ids=prompt_ids,
-            mm_kwargs=mm_kwargs,
-            mm_hashes=mm_hashes,
-            mm_placeholders=mm_placeholders,
-        )
-
-
-class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
-    embedding_padding_modules = ["lm_head"]
-    embedding_modules = ["embed_tokens"
-                         ]  # TODO transformers will have a util to get it
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        logger.info("Using Transformers backend.")
-
-        self.config: PretrainedConfig = vllm_config.model_config.hf_config
-        self.text_config: PretrainedConfig = self.config.get_text_config()
-        self.cache_config: CacheConfig = vllm_config.cache_config
-        self.device_config: DeviceConfig = vllm_config.device_config
-        self.model_config: ModelConfig = vllm_config.model_config
-        self.parallel_config: ParallelConfig = vllm_config.parallel_config
-        self.quant_config: QuantizationConfig = vllm_config.quant_config
-
-        self.pp_group = get_pp_group()
-        self.pp_size = self.pp_group.world_size
-        self.pp_rank = self.pp_group.rank_in_group
-        self.tp_size = get_tensor_model_parallel_world_size()
-
-        # Weights to skip in `self.load_weights`
-        self.skip_prefixes: list[str] = []
-        self.skip_substrs: list[str] = []
-
-        # Set correct attn and init on "meta" to delay allocating GPU tensors
-        # TODO: @raushan, use the public `model.set_attn_implementation()`
-        # method once its checks are fixed in Transformers.
-        self.text_config._attn_implementation = "vllm"
-        with init_on_device_without_buffers("meta"):
-            self.model: PreTrainedModel = AutoModel.from_config(
-                self.config,
-                torch_dtype=self.model_config.dtype,
-                trust_remote_code=self.model_config.trust_remote_code,
-            )
-
-        self.pipeline_parallel()
-        self.tensor_parallel()
-
-        # Input embeddings
-        if not isinstance(self.model.get_input_embeddings(), PPMissingLayer):
-            self.model.set_input_embeddings(
-                VocabParallelEmbedding(
-                    self.text_config.vocab_size,
-                    self.text_config.hidden_size,
-                    org_num_embeddings=self.text_config.vocab_size,
-                    quant_config=self.quant_config,
-                ))
-
-        # Attention layers
-        self.attention_instances = self.create_attention_instances()
-
-        # Initialize any parameters that have not had their modules replaced
-        self.init_parameters(self.model)
-
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states"], self.text_config.hidden_size))
-
-    def pipeline_parallel(self):
-        """
-        Apply the model's pipeline parallelization plan.
-        """
-        if self.pp_size <= 1:
-            return
-
-        if not self.model.supports_pp_plan:
-            tip = get_feature_request_tip(self.model_config.model,
-                                          self.model_config.trust_remote_code)
-            raise ValueError(
-                f"{type(self.model)} does not support pipeline parallel. {tip}"
-            )
-
-        module_lists = []
-        module_list_idx = None
-        pp_plan = list(self.model._pp_plan.keys())
-        for i, name in enumerate(pp_plan):
-            if isinstance(getattr(self.model, name), nn.ModuleList):
-                module_lists.append(name)
-                module_list_idx = i
-
-        if len(module_lists) > 1:
-            raise ValueError(
-                "Pipeline parallel of models with multiple `ModuleList`s "
-                "in the base model are not supported yet!")
-        if module_list_idx is None:
-            raise ValueError(
-                f"Could not find `ModuleList` in {type(self.model)}")
-
-        # Layers before module list
-        for name in pp_plan[:module_list_idx]:
-            if self.pp_group.is_first_rank or (
-                    self.text_config.tie_word_embeddings
-                    and self.pp_group.is_last_rank):
-                continue
-            setattr(self.model, name, PPMissingLayer())
-
-        # Module list
-        start_layer, end_layer = get_pp_indices(
-            self.text_config.num_hidden_layers, self.pp_rank, self.pp_size)
-        layers_name = pp_plan[module_list_idx]
-        layers = getattr(self.model, layers_name)
-        for i in range(len(layers)):
-            if start_layer <= i and i < end_layer:
-                continue
-            layers[i] = PPMissingLayer()
-
-        # Layers after module list
-        for name in pp_plan[module_list_idx + 1:]:
-            # Modules that should be on last rank
-            if not self.pp_group.is_last_rank:
-                setattr(self.model, name, PPMissingLayer())
-
-    def tensor_parallel(self):
-        """
-        Apply the model's tensor parallelization plan.
-        Currently only supports linear layers.
-        """
-        # Look for tp plans in all of the PreTrainedModels found in self.model
-        is_pretrained_model = lambda m: isinstance(m, PreTrainedModel)
-        supports_tp_plan = lambda m: m.config.base_model_tp_plan is not None
-        pretrained_models = filter(is_pretrained_model, self.model.modules())
-        models_with_tp_plan = filter(supports_tp_plan, pretrained_models)
-
-        if not any(models_with_tp_plan) and self.tp_size > 1:
-            tip = get_feature_request_tip(self.model_config.model,
-                                          self.model_config.trust_remote_code)
-            raise ValueError(
-                f"{type(self.model)} does not support tensor parallel. {tip}")
-
-        def _tensor_parallel(module: nn.Module,
-                             prefix: str = "",
-                             tp_plan=None):
-            tp_plan = tp_plan or {}
-
-            # If the current module is a PreTrainedModel, set the tp_plan for
-            # all of its children
-            if isinstance(module, PreTrainedModel):
-                tp_plan = module.config.base_model_tp_plan or {}
-                tp_plan = {
-                    maybe_prefix(prefix, k): v
-                    for k, v in tp_plan.items()
-                }
-
-            # Some weight loaders expect linear layers to inherit from vLLM's
-            # LinearBase class, so we set a default style which causes any
-            # unspecified linear layers to be replaced with ReplicatedLinear
-            for child_name, child_module in module.named_children():
-                qual_name = maybe_prefix(prefix, child_name)
-                if isinstance(child_module, nn.Linear):
-                    generator = (p for p in tp_plan if re.match(p, qual_name))
-                    pattern = next(generator, None)
-                    style = tp_plan.get(pattern, "replicate")
-                    new_module = replace_linear_class(child_module,
-                                                      style,
-                                                      self.quant_config,
-                                                      prefix=qual_name)
-                    setattr(module, child_name, new_module)
-                    log_replacement(qual_name, child_module, new_module)
-                else:
-                    _tensor_parallel(child_module,
-                                     prefix=qual_name,
-                                     tp_plan=tp_plan)
-
-        _tensor_parallel(self.model)
-
-    def create_attention_instances(
-        self,
-        attn_type: AttentionType = AttentionType.DECODER
-    ) -> dict[int, Attention]:
-        """
-        Create `Attention` instances to inform KV cache allocation.
-        """
-        num_heads = self.model_config.get_num_attention_heads(
-            self.parallel_config)
-        head_size = self.model_config.get_head_size()
-        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
-        start, end = get_pp_indices(self.text_config.num_hidden_layers,
-                                    self.pp_rank, self.pp_size)
-
-        attention_instances = {}
-        for i in range(start, end):
-            # Handle interleaved sliding window attention
-            per_layer_sliding_window = None
-            if (hasattr(self.config, "layer_types")
-                    and self.config.layer_types[i] == "sliding_attention"):
-                per_layer_sliding_window = self.config.sliding_window
-
-            attention_instances[i] = Attention(
-                num_heads=num_heads,
-                head_size=head_size,
-                # NOTE: We use Llama scale as default, if it's set by
-                # Transformers, it's updated in vllm_flash_attention_forward
-                scale=head_size**-0.5,
-                num_kv_heads=num_kv_heads,
-                cache_config=self.cache_config,
-                quant_config=self.quant_config,
-                per_layer_sliding_window=per_layer_sliding_window,
-                prefix=f"{i}.attn",
-                attn_type=attn_type)
-        return attention_instances
-
-    def init_parameters(self, module: nn.Module):
-        """
-        If a `parameter` is on the `meta` device, then its parent
-        `module` is the original module created by:
-
-        ```python
-        with torch.device("meta"):
-            self.model: PreTrainedModel = AutoModel.from_config(...)
-        ```
-        """
-        for name, param in module.named_parameters(recurse=False):
-            if param.device == torch.device("meta"):
-                new_param = nn.Parameter(
-                    torch.empty_like(param.data,
-                                     dtype=self.model_config.dtype,
-                                     device=self.device_config.device))
-                setattr(module, name, new_param)
-        for child in module.children():
-            self.init_parameters(child)
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor],
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        if not get_pp_group().is_first_rank:
-            assert intermediate_tensors is not None
-            input_ids = None
-            inputs_embeds = intermediate_tensors["hidden_states"]
-
-        if input_ids is not None:
-            input_ids = input_ids[None, ...]
-        if inputs_embeds is not None:
-            inputs_embeds = inputs_embeds[None, ...]
-
-        if self.model_config.uses_mrope:
-            position_ids = positions[:, None]
-        else:
-            position_ids = positions[None, ...]
-
-        hidden_states = self.model(
-            input_ids=input_ids,
-            inputs_embeds=inputs_embeds,
-            use_cache=False,
-            position_ids=position_ids,
-            attention_instances=self.attention_instances,
-            return_dict=False)[0][0, ...]  # we remove batch dimension for now
-
-        if not get_pp_group().is_last_rank:
-            return IntermediateTensors({"hidden_states": hidden_states})
-
-        return hidden_states
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=self.skip_prefixes,
-            skip_substrs=self.skip_substrs,
-        )
-        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
-
-
-@support_torch_compile(enable_if=can_enable_torch_compile)
-class TransformersModel(TransformersBase):
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={
-            # Handle BERT-like models
-            "bert": "model",
-            # Add `model.` prefix for base model checkpoints
-            "": "model.",
-            # Remove `model.` prefix if it was already there
-            "model.model.": "model.",
-            # Pooling adapters will be adjacent to `model`
-            "model.pooler": "pooler",
-            "model.score": "score",
-            # Classifier adapter's classifier layer is renamed to score
-            "model.classifier": "score",
-        },
-        orig_to_new_suffix={
-            # Replace legacy suffixes used for norms
-            ".gamma": ".weight",
-            ".beta": ".bias",
-        })
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config, prefix=prefix)
-
-        # After creating a pooling model, `pooler` will be duplicated.
-        # The one inside `model` comes from the Transformers modelling code.
-        # The one after `model` is an adapter from vLLM.
-        # We want to use the adapter so we nullify the original pooler.
-        if getattr(self.model, "pooler", None) is not None:
-            self.skip_prefixes.append("pooler.")
-            self.model.pooler = torch.nn.Identity()
-
-        # Some encoder models have the position_ids buffer in the checkpoint.
-        # vLLM will always pass position_ids as an argument, so we skip loading
-        # the buffer if it exists
-        self.skip_substrs.append("position_ids")
-
-        # Some encoder models have the bias of the final classifier layer
-        # in the checkpoint. vLLM does not use this bias, so we skip loading
-        # it if it exists
-        self.skip_substrs.append("score.bias")
-
-    def create_attention_instances(
-            self, attn_type: AttentionType = AttentionType.DECODER):
-        # TODO(hmellor): Better way to detect encoder models
-        # In encoder models, the attention layers will have `is_causal=False`
-        is_encoder = lambda m: not getattr(m, "is_causal", True)
-        # vLLM does not support encoder-decoder models, so if any encoder layer
-        # is found, we assume the whole model is an encoder model
-        if any(is_encoder(m) for m in self.model.modules()):
-            attn_type = AttentionType.ENCODER_ONLY
-
-        # Check minimum transformers version for encoder models support
-        if attn_type == AttentionType.ENCODER_ONLY:
-            import transformers
-            from packaging.version import Version
-            installed = Version(transformers.__version__)
-            required = Version("4.57.0.dev0")
-            if installed < required:
-                raise ValueError(
-                    "Encoder models with the Transformers backend require "
-                    f"transformers>={required}, but got {installed}")
-
-        return super().create_attention_instances(attn_type)
-
-
-@support_torch_compile(enable_if=can_enable_torch_compile)
-class TransformersForCausalLM(TransformersBase):
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config, prefix=prefix)
-
-        # Tell `TransformersBase.load_weights` to skip
-        # `lm_head` if the model has tied word embeddings
-        if self.text_config.tie_word_embeddings:
-            self.skip_prefixes.append("lm_head.")
-
-        if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = self.text_config.vocab_size
-            self.lm_head = ParallelLMHead(
-                self.text_config.vocab_size,
-                self.text_config.hidden_size,
-                quant_config=self.quant_config,
-                prefix=maybe_prefix(prefix, "lm_head"),
-            )
-            if self.text_config.tie_word_embeddings:
-                self.lm_head = self.lm_head.tie_weights(
-                    self.model.get_input_embeddings())
-
-            logit_scale = getattr(self.text_config, "logit_scale", 1.0)
-            self.logits_processor = LogitsProcessor(
-                self.unpadded_vocab_size, self.text_config.vocab_size,
-                logit_scale)
-        else:
-            self.lm_head = PPMissingLayer()
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
-
-def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor:
-    """Flatten until a list of tensors can be concatenated then do concat"""
-
-    def _can_concat(x: list[torch.Tensor]):
-        return len(set(map(lambda _x: _x.shape[1:], x))) == 1
-
-    if _can_concat(x):
-        return torch.concat(x)
-    return flatten_and_concat(flatten_bn(x))
-
-
-@MULTIMODAL_REGISTRY.register_processor(
-    MultiModalProcessor,
-    info=MultiModalProcessingInfo,
-    dummy_inputs=MultiModalDummyInputsBuilder)
-@support_torch_compile(
-    # set `positions` to last dim to support Qwen-mrope
-    dynamic_arg_dims={
-        "input_ids": 0,
-        "positions": -1,
-        "intermediate_tensors": 0,
-        "inputs_embeds": 0,
-    },
-    enable_if=can_enable_torch_compile)
-class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal):
-    # Backwards compatibility for prev released models. State dicts back then
-    # had different formats and cannot be loaded with `AutoModel` mapping as is
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={
-            "language_model.model": "model.language_model",
-            "text_model.model": "model.text_model",
-            "vision_tower": "model.vision_tower",
-            "vqmodel": "model.vqmodel",
-            "visual": "model.visual",
-            "vision_model": "model.vision_model",
-            "vision_embed_tokens": "model.vision_embed_tokens",
-            "image_newline": "model.image_newline",
-            "multi_modal_projector": "model.multi_modal_projector",
-            "text_model.lm_head": "lm_head",
-            "language_model.lm_head": "lm_head",
-            # Qwen models used "model" as the name for the language model.
-            # Therefore, we must map each of submodule explicitly to avoid
-            # conflicts with newer models that use "model.language_model".
-            "model.embed_tokens": "model.language_model.embed_tokens",
-            "model.layers": "model.language_model.layers",
-            "model.norm": "model.language_model.norm",
-        })
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config, prefix=prefix)
-
-        self.dtype = vllm_config.model_config.dtype
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor],
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        if inputs_embeds is None:
-            multimodal_embeds = self.get_multimodal_embeddings(**kwargs)
-            if multimodal_embeds is not None:
-                inputs_embeds = self.get_input_embeddings(
-                    input_ids, multimodal_embeds)
-                input_ids = None
-
-        model_output = super().forward(input_ids, positions,
-                                       intermediate_tensors, inputs_embeds)
-        return model_output
-
-    def get_multimodal_embeddings(self, **kwargs):
-        pixel_values = kwargs.pop("pixel_values", None)
-        pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
-            "image_patches", None)
-        image_embeds = kwargs.pop("image_embeds", None)
-
-        if image_embeds is not None:
-            return image_embeds
-
-        if pixel_values is None and image_embeds is None:
-            return None
-
-        num_image_patches = kwargs.pop("num_image_patches")
-        if pixel_values is not None:
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = flatten_bn(pixel_values).to(self.dtype)
-            elif is_list_of(pixel_values, torch.Tensor):
-                pixel_values = flatten_and_concat(pixel_values).to(self.dtype)
-            else:
-                raise ValueError(
-                    f"Unsupported pixel_values type {type(pixel_values)}. "
-                    "Expected `torch.Tensor` or list of `torch.Tensor`.")
-
-            if isinstance(num_image_patches, list):
-                num_image_patches = torch.cat(num_image_patches)
-
-            vision_embeddings = self.model.get_image_features(
-                pixel_values,
-                **{
-                    k: v.flatten(0, 1)
-                    for k, v in kwargs.items()
-                },
-            )
-
-            if isinstance(vision_embeddings, torch.Tensor):
-                if vision_embeddings.ndim == 2:
-                    vision_embeddings = vision_embeddings.unsqueeze(0)
-
-                # Embeddings have to be 2D tensors of length `num_images`
-                # but transformers returns concat tensors if each patch
-                # is of different size. We split it back to make vLLM happy
-                vision_embeddings = torch.split(
-                    vision_embeddings,
-                    num_image_patches.flatten().tolist())
-                vision_embeddings = [
-                    embed.flatten(start_dim=0, end_dim=-2)
-                    for embed in vision_embeddings
-                ]
-
-            return vision_embeddings
-
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings=None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.model.get_input_embeddings()(input_ids)
-        if (multimodal_embeddings is not None
-                and len(multimodal_embeddings) != 0):
-            mask = (input_ids == self.config.image_token_id)
-            mask = mask.unsqueeze(-1).expand_as(inputs_embeds)
-            multimodal_embeddings = torch.cat(multimodal_embeddings)
-
-            inputs_embeds = inputs_embeds.masked_scatter(
-                mask, multimodal_embeddings)
-        return inputs_embeds
diff --git a/vllm/model_executor/models/transformers/__init__.py b/vllm/model_executor/models/transformers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..93cd8ff507669934e8858d39a4c950a2ececcdca
--- /dev/null
+++ b/vllm/model_executor/models/transformers/__init__.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper around `transformers` models"""
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.model_executor.models.transformers.base import Base
+from vllm.model_executor.models.transformers.causal import CausalMixin
+from vllm.model_executor.models.transformers.legacy import LegacyMixin
+from vllm.model_executor.models.transformers.moe import MoEMixin
+from vllm.model_executor.models.transformers.multimodal import (
+    DYNAMIC_ARG_DIMS,
+    MultiModalDummyInputsBuilder,
+    MultiModalMixin,
+    MultiModalProcessingInfo,
+    MultiModalProcessor,
+)
+from vllm.model_executor.models.transformers.pooling import (
+    EmbeddingMixin,
+    SequenceClassificationMixin,
+)
+from vllm.model_executor.models.transformers.utils import can_enable_torch_compile
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+
+# Text only models
+@support_torch_compile(enable_if=can_enable_torch_compile)
+class TransformersForCausalLM(CausalMixin, Base): ...
+
+
+@support_torch_compile(enable_if=can_enable_torch_compile)
+class TransformersMoEForCausalLM(MoEMixin, CausalMixin, Base): ...
+
+
+# Multimodal models
+@MULTIMODAL_REGISTRY.register_processor(
+    MultiModalProcessor,
+    info=MultiModalProcessingInfo,
+    dummy_inputs=MultiModalDummyInputsBuilder,
+)
+@support_torch_compile(
+    dynamic_arg_dims=DYNAMIC_ARG_DIMS, enable_if=can_enable_torch_compile
+)
+class TransformersMultiModalForCausalLM(MultiModalMixin, CausalMixin, Base): ...
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MultiModalProcessor,
+    info=MultiModalProcessingInfo,
+    dummy_inputs=MultiModalDummyInputsBuilder,
+)
+@support_torch_compile(
+    dynamic_arg_dims=DYNAMIC_ARG_DIMS, enable_if=can_enable_torch_compile
+)
+class TransformersMultiModalMoEForCausalLM(
+    MoEMixin, MultiModalMixin, CausalMixin, Base
+): ...
+
+
+# Embedding models
+@support_torch_compile(enable_if=can_enable_torch_compile)
+class TransformersEmbeddingModel(EmbeddingMixin, LegacyMixin, Base): ...
+
+
+@support_torch_compile(enable_if=can_enable_torch_compile)
+class TransformersMoEEmbeddingModel(EmbeddingMixin, MoEMixin, Base): ...
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MultiModalProcessor,
+    info=MultiModalProcessingInfo,
+    dummy_inputs=MultiModalDummyInputsBuilder,
+)
+@support_torch_compile(
+    dynamic_arg_dims=DYNAMIC_ARG_DIMS, enable_if=can_enable_torch_compile
+)
+class TransformersMultiModalEmbeddingModel(EmbeddingMixin, MultiModalMixin, Base): ...
+
+
+# Sequence classification models
+@support_torch_compile(enable_if=can_enable_torch_compile)
+class TransformersForSequenceClassification(
+    SequenceClassificationMixin, LegacyMixin, Base
+): ...
+
+
+@support_torch_compile(enable_if=can_enable_torch_compile)
+class TransformersMoEForSequenceClassification(
+    SequenceClassificationMixin, MoEMixin, Base
+): ...
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MultiModalProcessor,
+    info=MultiModalProcessingInfo,
+    dummy_inputs=MultiModalDummyInputsBuilder,
+)
+@support_torch_compile(
+    dynamic_arg_dims=DYNAMIC_ARG_DIMS, enable_if=can_enable_torch_compile
+)
+class TransformersMultiModalForSequenceClassification(
+    SequenceClassificationMixin, MultiModalMixin, Base
+): ...
+
+
+def __getattr__(name: str):
+    """Handle imports of non-existent classes with a helpful error message."""
+    if name not in globals():
+        raise AttributeError(
+            "The Transformers modeling backend does not currently have a class to "
+            f"handle the requested model type: {name}. Please open an issue at "
+            "https://github.com/vllm-project/vllm/issues/new"
+        )
+    return globals()[name]
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4ba4758bcc460f1823bbb6d0c3e0dceca4aee28
--- /dev/null
+++ b/vllm/model_executor/models/transformers/base.py
@@ -0,0 +1,464 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend base class."""
+
+from collections.abc import Iterable
+from typing import TYPE_CHECKING
+
+import regex as re
+import torch
+import transformers
+from packaging.version import Version
+from torch import nn
+from transformers import AutoModel
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+from vllm.attention import Attention, AttentionType
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
+from vllm.config.utils import getattr_iter
+from vllm.distributed import get_pp_group, get_tp_group
+from vllm.distributed.utils import get_pp_indices
+from vllm.logger import init_logger
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.models.interfaces import (
+    SupportsLoRA,
+    SupportsPP,
+    SupportsQuant,
+)
+from vllm.model_executor.models.interfaces_base import VllmModel
+from vllm.model_executor.models.transformers.utils import (
+    get_feature_request_tip,
+    init_on_device_without_buffers,
+    log_replacement,
+    replace_linear_class,
+    replace_rms_norm_class,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    WeightsMapper,
+    make_empty_intermediate_tensors_factory,
+    maybe_prefix,
+)
+from vllm.sequence import IntermediateTensors
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from vllm.config import VllmConfig
+else:
+    PreTrainedModel = object
+
+logger = init_logger(__name__)
+
+
+def vllm_flash_attention_forward(
+    # Transformers args
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor,
+    # Transformers kwargs
+    scaling: float | None = None,
+    # vLLM kwargs
+    attention_instances: dict[int, Attention] | None = None,
+    **kwargs,
+):
+    self_attn = attention_instances[module.layer_idx]
+    if scaling is not None:
+        self_attn.impl.scale = float(scaling)
+    hidden = query.shape[-2]
+    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
+    return self_attn.forward(query, key, value), None
+
+
+ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
+
+
+class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
+    embedding_padding_modules = ["lm_head"]
+    embedding_modules = ["embed_tokens"]  # TODO transformers will have a util to get it
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # Add `model.` prefix for base model checkpoints,
+            # handling the case where it is already present
+            "": "model.",
+            "model.model.": "model.",
+            # Heads will be adjacent to `model` (pooling included because of adapters)
+            "model.lm_head.": "lm_head.",
+            "model.score.": "classifier.",
+            "model.classifier.": "classifier.",
+        }
+    )
+
+    def __init_subclass__(cls, *args, **kwargs):
+        """Merge hf_to_vllm_mapper in MRO from most specific to least specific."""
+        super().__init_subclass__(*args, **kwargs)
+        hf_to_vllm_mapper = WeightsMapper()
+        for base in cls.__mro__:
+            if base_hf_to_vllm_mapper := getattr(base, "hf_to_vllm_mapper", None):
+                hf_to_vllm_mapper |= base_hf_to_vllm_mapper
+        cls.hf_to_vllm_mapper = hf_to_vllm_mapper
+
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        super().__init__()
+        logger.info("Using Transformers modeling backend.")
+
+        self.config = vllm_config.model_config.hf_config
+        self.text_config = self.config.get_text_config()
+        self.cache_config = vllm_config.cache_config
+        self.device_config = vllm_config.device_config
+        self.model_config = vllm_config.model_config
+        self.parallel_config = vllm_config.parallel_config
+        self.quant_config = vllm_config.quant_config
+
+        self.pp_group = get_pp_group()
+        self.tp_group = get_tp_group()
+
+        # Weights to skip in `self.load_weights`
+        self.skip_prefixes: list[str] = []
+        """Skip loading weights whose qualname starts with these prefixes."""
+        self.skip_substrs: list[str] = []
+        """Skip loading weights whose qualname contains these substrings."""
+        self.ignore_unexpected_prefixes: list[str] = []
+        """Ignore unexpected weights whose qualname starts with these prefixes.
+        """
+        self.ignore_unexpected_suffixes: list[str] = []
+        """Ignore unexpected weights whose qualname ends with these suffixes."""
+
+        if self.quant_config:
+            quant_method_name = self.quant_config.get_name()
+            # Check for unsupported quantization methods.
+            if quant_method_name == "mxfp4":
+                raise NotImplementedError(
+                    "Transformers modeling backend does "
+                    "not support MXFP4 quantization yet."
+                )
+            # Skip loading extra bias for GPTQ models.
+            if "gptq" in quant_method_name:
+                self.ignore_unexpected_suffixes.append(".bias")
+
+        # Set correct attn and init on "meta" to delay allocating GPU tensors
+        self.text_config._attn_implementation = "vllm"
+        with init_on_device_without_buffers("meta"):
+            self.model: PreTrainedModel = AutoModel.from_config(
+                self.config,
+                dtype=self.model_config.dtype,
+                trust_remote_code=self.model_config.trust_remote_code,
+            )
+
+        # Remove layers not on this pipeline parallel rank
+        self.pipeline_parallel()
+        # Substitute remaining layers with vLLM's layers as needed
+        self.recursive_replace()
+        # Create attention instances for KV cache allocation
+        self.attention_instances = self.create_attention_instances()
+
+        # Input embeddings
+        input_embeddings = self.model.get_input_embeddings()
+        if not isinstance(input_embeddings, PPMissingLayer):
+            # Some models scale embeddings inside the input embedding layer
+            self.embed_scale = getattr(input_embeddings, "embed_scale", None)
+            names = ("embedding_size", "hidden_size")
+            embedding_dim = getattr_iter(self.text_config, names, None)
+            assert embedding_dim is not None
+            self.model.set_input_embeddings(
+                VocabParallelEmbedding(
+                    self.text_config.vocab_size,
+                    embedding_dim=embedding_dim,
+                    org_num_embeddings=self.text_config.vocab_size,
+                    quant_config=self.quant_config,
+                )
+            )
+
+        # Initialize any parameters that have not had their modules replaced
+        self.init_parameters(self.model)
+
+        # Pipeline parallel intermediate tensors
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], self.text_config.hidden_size
+        )
+
+    def pipeline_parallel(self):
+        """
+        Apply the model's pipeline parallelization plan.
+        """
+        if self.pp_group.world_size <= 1:
+            return
+
+        if not self.model.supports_pp_plan:
+            tip = get_feature_request_tip(
+                self.model_config.model, self.model_config.trust_remote_code
+            )
+            raise ValueError(
+                f"{type(self.model)} does not support pipeline parallel. {tip}"
+            )
+
+        module_lists = []
+        module_list_idx = None
+        pp_plan = list(self.model._pp_plan.keys())
+        for i, name in enumerate(pp_plan):
+            if isinstance(getattr(self.model, name), nn.ModuleList):
+                module_lists.append(name)
+                module_list_idx = i
+
+        if len(module_lists) > 1:
+            raise ValueError(
+                "Pipeline parallel of models with multiple `ModuleList`s "
+                "in the base model are not supported yet!"
+            )
+        if module_list_idx is None:
+            raise ValueError(f"Could not find `ModuleList` in {type(self.model)}")
+
+        # Layers before module list
+        for name in pp_plan[:module_list_idx]:
+            if self.pp_group.is_first_rank or (
+                self.text_config.tie_word_embeddings and self.pp_group.is_last_rank
+            ):
+                continue
+            setattr(self.model, name, PPMissingLayer())
+
+        # Module list
+        start_layer, end_layer = get_pp_indices(
+            self.text_config.num_hidden_layers,
+            self.pp_group.rank_in_group,
+            self.pp_group.world_size,
+        )
+        layers_name = pp_plan[module_list_idx]
+        layers = getattr(self.model, layers_name)
+        for i in range(len(layers)):
+            if start_layer <= i and i < end_layer:
+                continue
+            layers[i] = PPMissingLayer()
+
+        # Layers after module list
+        for name in pp_plan[module_list_idx + 1 :]:
+            # Modules that should be on last rank
+            if not self.pp_group.is_last_rank:
+                setattr(self.model, name, PPMissingLayer())
+
+    def recursive_replace(self):
+        """Recursively replace modules in the model as needed.
+
+        Currently, this replaces:
+
+        - `nn.Linear` with vLLM's tensor parallel linear classes
+        - `*RMSNorm` with vLLM's `RMSNorm`
+        """
+        tp_plan = self.model.tp_plan
+
+        if not tp_plan and self.tp_group.world_size > 1:
+            tip = get_feature_request_tip(
+                self.model_config.model, self.model_config.trust_remote_code
+            )
+            raise ValueError(
+                f"{type(self.model)} does not support tensor parallel. {tip}"
+            )
+
+        # Prefix the patterns because we always start from `self.model`
+        tp_plan = {maybe_prefix("model", k): v for k, v in tp_plan.items()}
+
+        def _recursive_replace(module: nn.Module, prefix: str):
+            for child_name, child_module in module.named_children():
+                new_module = child_module
+                qual_name = maybe_prefix(prefix, child_name)
+                if isinstance(child_module, nn.Linear):
+                    generator = (p for p in tp_plan if re.match(p, qual_name))
+                    pattern = next(generator, None)
+                    # Some weight loaders expect all linear layers to inherit
+                    # LinearBase, so we set a default style which causes any
+                    # unspecified layers to be replaced with ReplicatedLinear
+                    style = tp_plan.get(pattern, "replicate")
+                    new_module = replace_linear_class(
+                        child_module, style, self.quant_config, prefix=qual_name
+                    )
+                elif child_module.__class__.__name__.endswith("RMSNorm"):
+                    new_module = replace_rms_norm_class(
+                        child_module, self.text_config.hidden_size
+                    )
+                else:
+                    _recursive_replace(child_module, prefix=qual_name)
+
+                if new_module is not child_module:
+                    setattr(module, child_name, new_module)
+                    log_replacement(qual_name, child_module, new_module)
+
+        _recursive_replace(self.model, prefix="model")
+
+    def create_attention_instances(self) -> dict[int, Attention]:
+        """
+        Create `Attention` instances to inform KV cache allocation.
+        """
+        text_config = self.text_config
+
+        num_heads = self.model_config.get_num_attention_heads(self.parallel_config)
+        head_size = self.model_config.get_head_size()
+        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
+        logits_soft_cap = getattr(text_config, "attn_logit_softcapping", None)
+
+        # In encoder models, the attention layers will have `is_causal=False`
+        is_encoder = lambda module: not getattr(module, "is_causal", True)
+        has_encoder = lambda model: any(is_encoder(m) for m in model.modules())
+        is_multimodal = lambda config: config != config.get_text_config()
+        # vLLM does not support encoder-decoder models, so if any encoder layer is
+        # found in a text only model, we assume the whole model is an encoder model
+        if has_encoder(self.model) and not is_multimodal(self.config):
+            self.check_version("5.0.0.dev0", "encoder models support")
+            attn_type = AttentionType.ENCODER_ONLY
+        else:
+            attn_type = AttentionType.DECODER
+
+        pp_rank = self.pp_group.rank_in_group
+        pp_size = self.pp_group.world_size
+        start, end = get_pp_indices(text_config.num_hidden_layers, pp_rank, pp_size)
+
+        attention_instances = {}
+        for i in range(start, end):
+            # Handle interleaved sliding window attention
+            per_layer_sliding_window = None
+            if (
+                hasattr(self.config, "layer_types")
+                and self.config.layer_types[i] == "sliding_attention"
+            ):
+                per_layer_sliding_window = self.config.sliding_window
+
+            attn_cls = (
+                EncoderOnlyAttention
+                if attn_type == AttentionType.ENCODER_ONLY
+                else Attention
+            )
+            attention_instances[i] = attn_cls(
+                num_heads=num_heads,
+                head_size=head_size,
+                # NOTE: We use Llama scale as default, if it's set by
+                # Transformers, it's updated in vllm_flash_attention_forward
+                scale=head_size**-0.5,
+                num_kv_heads=num_kv_heads,
+                cache_config=self.cache_config,
+                quant_config=self.quant_config,
+                logits_soft_cap=logits_soft_cap,
+                per_layer_sliding_window=per_layer_sliding_window,
+                prefix=f"{i}.attn",
+                attn_type=attn_type,
+            )
+        return attention_instances
+
+    def init_parameters(self, module: nn.Module, dtype: torch.dtype | None = None):
+        """
+        If a `parameter` is on the `meta` device, then its parent
+        `module` is the original module created by:
+
+        ```python
+        with torch.device("meta"):
+            self.model: "PreTrainedModel" = AutoModel.from_config(...)
+        ```
+        """
+
+        def _init_parameters(module: nn.Module, dtype: torch.dtype | None):
+            for name, param in module.named_parameters(recurse=False):
+                if param.device == torch.device("meta"):
+                    new_param = nn.Parameter(
+                        torch.empty_like(
+                            param.data,
+                            dtype=dtype or self.model_config.dtype,
+                            device=self.device_config.device,
+                        )
+                    )
+                    setattr(module, name, new_param)
+            for child in module.children():
+                _init_parameters(child, dtype)
+
+        _init_parameters(module, dtype)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        inputs_embeds = self.model.get_input_embeddings()(input_ids)
+        if self.embed_scale is not None:
+            inputs_embeds *= self.embed_scale
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        if not self.pp_group.is_first_rank:
+            assert intermediate_tensors is not None
+            input_ids = None
+            inputs_embeds = intermediate_tensors["hidden_states"]
+
+        if input_ids is not None:
+            input_ids = input_ids[None, ...]
+        if inputs_embeds is not None:
+            inputs_embeds = inputs_embeds[None, ...]
+
+        # If the model scales embeddings inside the input embedding layer we must
+        # ensure they are scaled here since VocabParallelEmbedding will not do it
+        if (
+            self.embed_scale is not None
+            and input_ids is not None
+            and inputs_embeds is None
+        ):
+            inputs_embeds = self.embed_input_ids(input_ids)
+            input_ids = None
+
+        if self.model_config.uses_mrope:
+            position_ids = positions[:, None]
+        else:
+            position_ids = positions[None, ...]
+
+        hidden_states = self.model(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=False,
+            position_ids=position_ids,
+            attention_instances=self.attention_instances,
+            return_dict=False,
+            **kwargs,
+        )[0][0, ...]  # we remove batch dimension for now
+
+        if not self.pp_group.is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
+
+        return hidden_states
+
+    def load_weights(
+        self,
+        weights: Iterable[tuple[str, torch.Tensor]],
+    ) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=self.skip_prefixes,
+            skip_substrs=self.skip_substrs,
+            ignore_unexpected_prefixes=self.ignore_unexpected_prefixes,
+            ignore_unexpected_suffixes=self.ignore_unexpected_suffixes,
+        )
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    @staticmethod
+    def check_version(min_version: str, feature: str):
+        installed = Version(transformers.__version__)
+        required = Version(min_version)
+        if installed < required:
+            raise ImportError(
+                f"Transformers modeling backend requires transformers>={required} "
+                f"for {feature}, but got {installed}"
+            )
diff --git a/vllm/model_executor/models/transformers/causal.py b/vllm/model_executor/models/transformers/causal.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2865ed0c7ff5302077cea08a940ddf9ee51c1d3
--- /dev/null
+++ b/vllm/model_executor/models/transformers/causal.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend mixin for causal language models."""
+
+from typing import TYPE_CHECKING
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.models.interfaces_base import VllmModelForTextGeneration
+from vllm.model_executor.models.utils import PPMissingLayer, maybe_prefix
+
+if TYPE_CHECKING:
+    import torch
+
+    from vllm.config import VllmConfig
+
+
+class CausalMixin(VllmModelForTextGeneration):
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        # Skip VllmModelForTextGeneration.__init__ and call the next class in MRO
+        super(VllmModelForTextGeneration, self).__init__(
+            vllm_config=vllm_config, prefix=prefix
+        )
+
+        # Tell `Base.load_weights` to skip
+        # `lm_head` if the model has tied word embeddings
+        if self.text_config.tie_word_embeddings:
+            self.skip_prefixes.append("lm_head.")
+
+        if self.pp_group.is_last_rank:
+            self.lm_head = ParallelLMHead(
+                self.text_config.vocab_size,
+                self.text_config.hidden_size,
+                quant_config=self.quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if self.text_config.tie_word_embeddings:
+                self.lm_head = self.lm_head.tie_weights(
+                    self.model.get_input_embeddings()
+                )
+
+            logit_scale = getattr(self.text_config, "logit_scale", 1.0)
+            self.logits_processor = LogitsProcessor(
+                self.text_config.vocab_size, scale=logit_scale
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+    def compute_logits(self, hidden_states: "torch.Tensor") -> "torch.Tensor | None":
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
diff --git a/vllm/model_executor/models/transformers/legacy.py b/vllm/model_executor/models/transformers/legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..aca630be56154a22c35257db1296fe35d8c66305
--- /dev/null
+++ b/vllm/model_executor/models/transformers/legacy.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend mixin for legacy models."""
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from vllm.model_executor.models.utils import WeightsMapper
+from vllm.sequence import IntermediateTensors
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+
+class LegacyMixin:
+    hf_to_vllm_mapper = WeightsMapper(
+        # These are applied in order, so the order matters!
+        orig_to_new_prefix={
+            # Handle BERT-like models
+            "roberta": "model",
+            "bert": "model",
+        },
+        orig_to_new_suffix={
+            # Replace legacy suffixes used for norms
+            ".gamma": ".weight",
+            ".beta": ".bias",
+        },
+    )
+
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        # Skip unsupported/unwanted output embeddings layers
+        self.skip_prefixes.extend(
+            [
+                "model.lm_head.",
+                "model.predictions.",
+                "model.qa_outputs.",
+                "model.embeddings_project.",
+                "model.discriminator_predictions.",
+            ]
+        )
+
+        # Some encoder models have the position_ids buffer in the checkpoint.
+        # vLLM will always pass position_ids as an argument, so we skip loading
+        # the buffer if it exists
+        self.skip_substrs.append("position_ids")
+
+        # Some encoder models have the bias of the final classifier layer
+        # in the checkpoint. vLLM does not use this bias, so we skip loading
+        # it if it exists
+        self.skip_substrs.append("score.bias")
+
+        # roberta-like models an extra padding in positions.
+        # FIXME(Isotr0py): This is quite hacky for roberta edge case,
+        # we should find a better way to handle this.
+        self.is_roberta = "roberta" in self.text_config.model_type
+        self.padding_idx = self.text_config.pad_token_id
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if self.is_roberta:
+            # RoBERTa-specific positions padding
+            positions += self.padding_idx + 1
+        return super().forward(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..4973014c3d4ed377f41b2dc56efb4c7b54e0614b
--- /dev/null
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -0,0 +1,318 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend mixin for Mixture of Experts (MoE) models."""
+
+from typing import TYPE_CHECKING, Any
+
+import torch
+import torch.nn as nn
+
+from vllm.config.utils import getattr_iter
+from vllm.distributed import get_dp_group, get_ep_group
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.models.interfaces import MixtureOfExperts
+from vllm.model_executor.models.utils import maybe_prefix
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import direct_register_custom_op
+
+from .utils import log_replacement
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+
+@CustomOp.register("transformers_fused_moe")
+class TransformersFusedMoE(FusedMoE):
+    """Custom FusedMoE for the Transformers modeling backend."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._topk_ids: torch.Tensor = None
+
+        def custom_routing_function(hidden_states, gating_output, topk, renormalize):
+            """Return `topk_weights` from `gating_output` and the
+            `topk_ids` we stored in the layer earlier."""
+            topk_weights = gating_output
+            topk_ids = self._topk_ids
+            # Handle all gather in expert parallel
+            if topk_ids.size(0) != hidden_states.size(0):
+                dp_metadata = get_forward_context().dp_metadata
+                sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+                is_sp = self.is_sequence_parallel
+                dist_group = get_ep_group() if is_sp else get_dp_group()
+                assert sizes[dist_group.rank_in_group] == topk_ids.shape[0]
+                (topk_ids,) = dist_group.all_gatherv([topk_ids], 0, sizes)
+            return topk_weights, topk_ids
+
+        self.custom_routing_function = custom_routing_function
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        """In Transformers `experts.forward` will have this signature.
+
+        We discard any extra kwargs because we cannot use them here."""
+        return torch.ops.vllm.transformers_moe_forward(
+            hidden_states,
+            topk_ids.to(torch.int32),
+            topk_weights.to(torch.float32),
+            self.layer_name,
+        )
+
+
+def transformers_moe_forward(
+    hidden_states: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    """Store the `topk_ids` in the layer and call the actual forward."""
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    self._topk_ids = topk_ids
+    # Clone hidden_states because it will be mutated in-place in FusedMoE
+    return self.forward_impl(hidden_states.clone(), topk_weights)
+
+
+def transformers_moe_forward_fake(
+    hidden_states: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="transformers_moe_forward",
+    op_func=transformers_moe_forward,
+    mutates_args=["hidden_states"],
+    fake_impl=transformers_moe_forward_fake,
+    dispatch_key=current_platform.dispatch_key,
+    tags=(torch.Tag.needs_fixed_stride_order,),
+)
+
+
+class MoEMixin(MixtureOfExperts):
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        self.check_version("5.0.0.dev0", "MoE models support")
+        # Skip MixtureOfExperts.__init__ and call the next class in MRO
+        super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ):
+        for moe_layer_idx, mlp_layer in enumerate(self.mlp_moe_layers):
+            mlp_layer.experts.set_eplb_state(
+                moe_layer_idx=moe_layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ):
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for mlp in self.mlp_moe_layers:
+            mlp.n_local_physical_experts = num_local_physical_experts
+            mlp.n_physical_experts = num_physical_experts
+            mlp.n_redundant_experts = self.num_redundant_experts
+            mlp.experts.update_expert_map()
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """
+        Params for weights, fp8 weight scales, fp8 activation scales
+        (param_name, weight_name, expert_id, shard_id)
+        """
+        ckpt_names = [
+            # (ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name)
+            ("gate_proj", "down_proj", "up_proj"),  # Most common MoE style
+            ("w1", "w2", "w3"),  # Granite, Mixtral, Phi MoE style
+            ("linear", "linear_1", "linear_v"),  # Grok1 style
+        ]
+        num_experts = self.model_config.get_num_experts()
+        num_redundant_experts = self.parallel_config.eplb_config.num_redundant_experts
+        expert_mapping = []
+        for gate_proj, down_proj, up_proj in ckpt_names:
+            expert_mapping.extend(
+                FusedMoE.make_expert_params_mapping(
+                    ckpt_gate_proj_name=gate_proj,
+                    ckpt_down_proj_name=down_proj,
+                    ckpt_up_proj_name=up_proj,
+                    num_experts=num_experts,
+                    num_redundant_experts=num_redundant_experts,
+                )
+            )
+        return expert_mapping
+
+    def recursive_replace(self):
+        """Initialize the MoE layers."""
+        text_config = self.text_config
+
+        # Positional arguments
+        num_experts = self.model_config.get_num_experts()
+        top_k = getattr_iter(text_config, ["num_experts_per_tok", "top_k"], None)
+        assert top_k is not None
+        hidden_size = text_config.hidden_size
+        intermediate_size = getattr_iter(
+            text_config, ["moe_intermediate_size", "intermediate_size"], None
+        )
+        assert intermediate_size is not None
+
+        # If there are shared experts, the results are
+        # reduced after mlp.forward() not inside FusedMoE
+        num_shared_experts = getattr_iter(
+            text_config,
+            [
+                "n_shared_experts",  # DeepSeek, Docs, GLM
+                "moe_num_shared_experts",  # Aria, Ernie
+            ],
+            0,
+        )
+        reduce_results = num_shared_experts == 0
+
+        def add_all_reduce(mlp: nn.Module):
+            """Adds an all-reduce to the output of `mlp.forward()`."""
+
+            class MLPWithAllReduce(mlp.__class__):
+                def forward(self, *args, **kwargs):
+                    output = super().forward(*args, **kwargs)
+                    return self.experts.maybe_all_reduce_tensor_model_parallel(output)
+
+            mlp.__class__ = MLPWithAllReduce
+
+        # Unused kwargs since we use custom_routing_function:
+        # - `scoring_func` and `e_score_correction_bias` only used for grouped
+        #    topk routing inside vLLM and are non-trivial to infer
+        #    and hard code `use_grouped_topk=False`
+        # - `renormalize` passed anyway because it's easy to infer
+        # - `num_expert_group` and `topk_group` used for inferring expert
+        #    placement strategy in FusedMoE
+        # - `apply_router_weight_on_input` is already applied in Transformers
+        renormalize = getattr(text_config, "norm_topk_prob", top_k > 1)
+        num_expert_group = getattr(text_config, "n_group", None)
+        topk_group = getattr(text_config, "topk_group", None)
+
+        # MoE activation function
+        activation = "silu"
+        wrapped_arch = self.config.architectures[0].lower()
+        if "gptoss" in wrapped_arch:
+            activation = "swigluoai"
+        elif "grok1" in wrapped_arch:
+            activation = "gelu"
+
+        # Expert mapping for `AutoWeightsLoader`
+        expert_mapping = self.get_expert_mapping()
+
+        # Expert parallel load balancing kwargs
+        enable_eplb = self.parallel_config.enable_eplb
+        num_redundant_experts = self.parallel_config.eplb_config.num_redundant_experts
+
+        # MixtureOfExperts mixin settings
+        ep_size = get_ep_group().world_size
+
+        self.mlp_moe_layers = []  # Used for MixtureOfExperts methods
+        self.moe_layers = []
+        self.expert_weights = []
+        self.num_moe_layers = 0
+        self.num_expert_groups = 1 if num_expert_group is None else num_expert_group
+        self.num_logical_experts = num_experts
+        self.num_physical_experts = num_experts + num_redundant_experts
+        self.num_local_physical_experts = self.num_physical_experts // ep_size
+        self.num_routed_experts = num_experts
+        self.num_shared_experts = num_shared_experts
+        self.num_redundant_experts = num_redundant_experts
+
+        # Recursively fuse MoE layers
+        def _recursive_replace(module: nn.Module, prefix: str):
+            for child_name, child_module in module.named_children():
+                qual_name = maybe_prefix(prefix, child_name)
+                if child_name == "experts" and isinstance(child_module, nn.ModuleList):
+                    # Alias for readability
+                    mlp = module
+                    experts = child_module
+                    # Do the experts have biases
+                    has_bias = False
+                    for experts_param_name, _ in experts.named_parameters():
+                        if "bias" in experts_param_name:
+                            has_bias = True
+                            break
+                    # Double check there are no shared experts
+                    nonlocal reduce_results
+                    if reduce_results:
+                        for mlp_param_name, _ in mlp.named_parameters():
+                            if "shared_expert" in mlp_param_name:
+                                reduce_results = False
+                                # If the config does not specify num_shared_experts, but
+                                # the model has shared experts, we assume there is one.
+                                self.num_shared_experts = 1
+                                break
+                    # Replace experts module with FusedMoE
+                    fused_experts = TransformersFusedMoE(
+                        num_experts=num_experts,
+                        top_k=top_k,
+                        hidden_size=hidden_size,
+                        intermediate_size=intermediate_size,
+                        reduce_results=reduce_results,
+                        renormalize=renormalize,
+                        # Hard coded because topk happens in Transformers
+                        use_grouped_topk=False,
+                        num_expert_group=num_expert_group,
+                        topk_group=topk_group,
+                        quant_config=self.quant_config,
+                        prefix=qual_name,
+                        activation=activation,
+                        enable_eplb=enable_eplb,
+                        num_redundant_experts=num_redundant_experts,
+                        has_bias=has_bias,
+                        expert_mapping=expert_mapping,
+                    )
+                    mlp.experts = fused_experts
+                    log_replacement(qual_name, experts, fused_experts)
+                    # Update MixtureOfExperts mixin state
+                    self.mlp_moe_layers.append(mlp)
+                    self.moe_layers.append(fused_experts)
+                    self.expert_weights.append(fused_experts.get_expert_weights())
+                    self.num_moe_layers += 1
+                    # If results are not all-reduced in FusedMoE, ensure they
+                    # are all-reduced at the end of mlp.forward() if tensor
+                    # parallel or expert parallel is enabled
+                    if not reduce_results and (
+                        fused_experts.tp_size > 1 or fused_experts.ep_size > 1
+                    ):
+                        add_all_reduce(mlp)
+                else:
+                    _recursive_replace(child_module, prefix=qual_name)
+
+        _recursive_replace(self.model, prefix="model")
+        # Continue with the replacement of layers in Base
+        super().recursive_replace()
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccf6053719871f8c73c0d1bdd3276e025bcce868
--- /dev/null
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -0,0 +1,411 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend mixin for multi-modal models."""
+
+from collections.abc import Mapping
+from typing import TYPE_CHECKING
+
+import torch
+
+from vllm.config.utils import getattr_iter
+from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal
+from vllm.model_executor.models.utils import WeightsMapper
+from vllm.multimodal import MultiModalKwargsItems
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalUUIDDict,
+    PlaceholderRange,
+)
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
+from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+if TYPE_CHECKING:
+    from transformers import BatchFeature
+
+    from vllm.config import VllmConfig
+    from vllm.config.multimodal import BaseDummyOptions
+
+DYNAMIC_ARG_DIMS = {
+    "input_ids": 0,
+    # set `positions` to last dim to support Qwen-mrope
+    "positions": -1,
+    "intermediate_tensors": 0,
+    "inputs_embeds": 0,
+}
+
+
+class MultiModalProcessingInfo(BaseProcessingInfo):
+    def get_supported_mm_limits(self):
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
+        return {"image": self.get_max_image_tokens()}
+
+    def get_max_image_tokens(self) -> int:
+        width, height = self.get_max_image_size()
+        processor = self.get_hf_processor()
+        multimodal_config = self.ctx.model_config.multimodal_config
+        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
+        mm_tokens = processor._get_num_multimodal_tokens(
+            image_sizes=([height, width],), **mm_processor_kwargs
+        )
+        image_tokens = mm_tokens["num_image_tokens"][0]
+        return image_tokens
+
+    def get_max_image_size(self):
+        return 10_000, 10_000  # hardcode for arbitrary very large size
+
+
+class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        if "gemma3" in processor.__class__.__name__.lower():
+            image_token = processor.boi_token
+        else:
+            image_token = getattr(processor, "image_token", "")
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, "BaseDummyOptions"] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_max_image_size()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+        }
+
+
+class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ):
+        """
+        Given the original multi-modal items for this modality
+        and HF-processed data, output the updates to perform.
+
+        The information returned by this method is used to update token inputs
+        which bypass the HF processor. It is also used to update the output of
+        HF processor if the HF process does not apply prompt updates to text
+        inputs.
+
+        Moreover, this information is critical to determine the token positions
+        in order to construct  :class:`~vllm-multimodal.input.PlaceholderRange`
+        for each multi-modal item.
+        """
+        return None
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: "BatchFeature",
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        # HF Processors always return a mask but vLLM doesn't need it
+        hf_inputs.pop("attention_mask", None)
+        num_image_patches = hf_inputs.get("num_image_patches")
+        mm_fields = {
+            key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
+            for key in hf_inputs
+        }
+        mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
+            "image", num_image_patches
+        )
+
+        # Keep these as batched, as they always have batch size as first dim
+        mm_fields["image_grid_thw"] = MultiModalFieldConfig.batched("image")
+        mm_fields["video_grid_thw"] = MultiModalFieldConfig.batched("image")
+        mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")
+        return mm_fields
+
+    def _get_hf_mm_data(
+        self,
+        mm_items: MultiModalDataItems,
+    ) -> tuple[Mapping[str, object], Mapping[str, object]]:
+        """
+        In contrast to the base class, this method always adds
+        `return_mm_token_type_ids` to the processor data
+        """
+        processor_data, passthrough_data = super()._get_hf_mm_data(mm_items)
+        processor_data["return_mm_token_type_ids"] = True
+        return processor_data, passthrough_data
+
+    def apply(
+        self,
+        prompt: str | list[int],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> MultiModalInputs:
+        """
+        Process multi-modal inputs to be used in vLLM.
+
+        Apply HF Processor on prompt text and multi-modal data together,
+        outputting token IDs and processed tensors.
+        """
+        if tokenization_kwargs is None:
+            tokenization_kwargs = {}
+
+        mm_items = self._to_mm_items(mm_data)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        if not isinstance(prompt, str):
+            # the prompt is the tokenized ids which is not supported
+            # by the hf_processor, which is why we would need to decode the ids
+            # into string
+            prompt = hf_processor.decode(prompt)
+
+        # Bypass cached processor and always apply to the full set of mm inputs
+        # NOTE: we can't just set caching=False because base class method
+        # transforms outputs to `MultiModalKwargs` which is not going to
+        # work for Transformers. We have a lot of logic tied to
+        # `mm_tokens_per_modality` below
+        prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
+            prompt_text=prompt,
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
+        )
+
+        # For gemma3 we check `token_type_ids` as the key
+        token_type_key = (
+            "mm_token_type_ids"
+            if "mm_token_type_ids" in processed_data
+            else "token_type_ids"
+        )
+        mm_token_type_ids = processed_data.pop(token_type_key)
+
+        # We can infer vLLM style placeholder from token type ids, if we split
+        # it for each input `mm_data`.
+        mm_positions = torch.where(mm_token_type_ids == 1)[1]
+        images = mm_items.get_items("image", ImageProcessorItems)
+        multimodal_config = self.info.ctx.model_config.multimodal_config
+        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
+        image_sizes = []
+        for item_idx in range(len(images)):
+            image_size = images.get_image_size(item_idx)
+            image_sizes.append((image_size.height, image_size.width))
+
+        mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
+            image_sizes=image_sizes, **mm_processor_kwargs
+        )
+
+        mm_placeholders = {}
+        split_sizes = mm_tokens_per_modality["num_image_tokens"]
+        if split_sizes:
+            chunked_mm_positions = torch.split(mm_positions, split_sizes)
+            mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()]
+            chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
+            ranges = [
+                PlaceholderRange(
+                    offset=positions[0].item(),
+                    length=positions.shape[0],
+                    is_embed=(mm_tokens == hf_processor.image_token_id).bool(),
+                )
+                for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens)
+            ]
+            mm_placeholders = {"image": ranges}
+
+        processed_data["num_image_patches"] = torch.tensor(
+            mm_tokens_per_modality["num_image_patches"]
+        )
+        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
+            processed_data,
+            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
+        )
+
+        # Use overrides if provided; fallback to data-dependent hashing.
+        mm_hashes = self._hash_mm_items(
+            mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
+        )
+
+        return MultiModalInputs(
+            type="multimodal",
+            prompt_token_ids=prompt_ids,
+            mm_kwargs=mm_kwargs,
+            mm_hashes=mm_hashes,
+            mm_placeholders=mm_placeholders,
+        )
+
+
+class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
+    supports_multimodal_raw_input_only = True
+    merge_by_field_config = True
+    # Backwards compatibility for prev released models. State dicts back then
+    # had different formats and cannot be loaded with `AutoModel` mapping as is
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "language_model.model": "model.language_model",
+            "text_model.model": "model.text_model",
+            "vision_tower": "model.vision_tower",
+            "vqmodel": "model.vqmodel",
+            "visual": "model.visual",
+            "vision_model": "model.vision_model",
+            "vision_embed_tokens": "model.vision_embed_tokens",
+            "image_newline": "model.image_newline",
+            "multi_modal_projector": "model.multi_modal_projector",
+            "text_model.lm_head": "lm_head",
+            "language_model.lm_head": "lm_head",
+            # Qwen models used "model" as the name for the language model.
+            # Therefore, we must map each of submodule explicitly to avoid
+            # conflicts with newer models that use "model.language_model".
+            "model.embed_tokens": "model.language_model.embed_tokens",
+            "model.layers": "model.language_model.layers",
+            "model.norm": "model.language_model.norm",
+        }
+    )
+
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        # Skip SupportsMRoPE.__init__ and call the next class in MRO
+        super(SupportsMRoPE, self).__init__(vllm_config=vllm_config, prefix=prefix)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        # Gemma3 and PaliGemma needs `token_type_ids` to work correctly
+        # Other models will not have `token_type_ids` in kwargs
+        kwargs = {k: v for k, v in kwargs.items() if k == "token_type_ids"}
+        model_output = super().forward(
+            input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
+        )
+        return model_output
+
+    def get_language_model(self) -> torch.nn.Module:
+        """Transformers modeling backend multimodal classes do not contain a separate
+        vLLM language model class. Therefore, in order to return a language model vLLM
+        class, we use a wrapper to give `self` the same interface as a text model."""
+
+        # Exclude self and object
+        bases = self.__class__.mro()[1:-1]
+        # Keep only classes defined in `vllm.model_executor.models.transformers`
+        bases = [b for b in bases if ".transformers." in b.__module__]
+        # Exclude MultiModalMixin itself
+        bases = [b for b in bases if b is not MultiModalMixin]
+
+        class LanguageModel(*bases):
+            def __init__(self, multimodal_model):
+                # Don't call super().__init__() to avoid re-initialization
+                self.__dict__.update(multimodal_model.__dict__)
+
+            model = getattr_iter(self.model, ("language_model", "text_model"), None)
+
+        return LanguageModel(self)
+
+    def embed_multimodal(self, **kwargs):
+        pixel_values: torch.Tensor | None = kwargs.pop("pixel_values", None)
+        image_embeds: torch.Tensor | None = kwargs.pop("image_embeds", None)
+        # Model might use `image_patches` instead of `pixel_values`
+        if pixel_values is None:
+            pixel_values = kwargs.pop("image_patches", None)
+
+        if image_embeds is not None:
+            return image_embeds
+
+        if pixel_values is None:
+            return None
+
+        num_image_patches = kwargs.pop("num_image_patches")
+        kwargs.pop("token_type_ids", None)  # used only in `forward`
+        if pixel_values is not None:
+            vision_embeddings = self.model.get_image_features(pixel_values, **kwargs)
+
+            if isinstance(vision_embeddings, torch.Tensor):
+                if vision_embeddings.ndim == 2:
+                    vision_embeddings = vision_embeddings.unsqueeze(0)
+
+                # Embeddings have to be 2D tensors of length `num_images`
+                # but transformers returns concat tensors if each patch
+                # is of different size. We split it back to make vLLM happy
+                vision_embeddings = torch.split(
+                    vision_embeddings, num_image_patches.flatten().tolist()
+                )
+                vision_embeddings = [
+                    embed.flatten(start_dim=0, end_dim=-2)
+                    for embed in vision_embeddings
+                ]
+
+            return vision_embeddings
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        if any(
+            v
+            for k, v in kwargs.items()
+            if k not in {"image_grid_thw", "video_grid_thw"}
+        ):
+            raise NotImplementedError(
+                "Transformers modeling backend only supports images."
+            )
+
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
+
+        mrope_positions, mrope_position_delta = self.model.get_rope_index(
+            input_ids=torch.tensor(input_tokens).unsqueeze(0),
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+        )
+
+        mrope_positions = mrope_positions[:, 0]
+        mrope_position_delta = mrope_position_delta[0].item()
+
+        return mrope_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/transformers/pooling.py b/vllm/model_executor/models/transformers/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c2a74bccb6a96e7e105a61d21a7f2da0ca2ed0f
--- /dev/null
+++ b/vllm/model_executor/models/transformers/pooling.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend mixins for pooling models."""
+
+from typing import TYPE_CHECKING
+
+import torch
+from transformers import AutoModelForSequenceClassification
+
+from vllm.config.utils import getattr_iter
+from vllm.model_executor.layers.pooler import (
+    ClassifierPooler,
+    CLSPool,
+    DispatchPooler,
+    Pooler,
+)
+from vllm.model_executor.models.interfaces import SupportsCrossEncoding
+from vllm.model_executor.models.interfaces_base import VllmModelForPooling
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+
+class EmbeddingMixin(VllmModelForPooling):
+    default_pooling_type = "CLS"
+
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        # Skip VllmModelForPooling.__init__ and call the next class in MRO
+        super(VllmModelForPooling, self).__init__(
+            vllm_config=vllm_config, prefix=prefix
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        self.pooler = DispatchPooler(
+            {
+                "token_embed": Pooler.for_token_embed(pooler_config),
+                "embed": Pooler.for_embed(pooler_config),
+            }
+        )
+
+
+class SequenceClassificationMixin(SupportsCrossEncoding, VllmModelForPooling):
+    default_pooling_type = "CLS"
+
+    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
+        # Skip VllmModelForPooling.__init__ and call the next class in MRO
+        super(VllmModelForPooling, self).__init__(
+            vllm_config=vllm_config, prefix=prefix
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        # Certain information about the the model and classifier can only be
+        # inferred from the `ForSequenceClassification` class. Therefore, we
+        # instantiate it on the "meta" device to avoid allocating GPU memory.
+        with torch.device("meta"):
+            seq_cls_model = AutoModelForSequenceClassification.from_config(
+                self.config,
+                dtype=self.model_config.dtype,
+                trust_remote_code=self.model_config.trust_remote_code,
+            )
+
+        # When used for sequence classification, some models have their
+        # pooling layers removed. Make sure this is reflected in vLLM.
+        for module in seq_cls_model.modules():
+            if hasattr(module, "pooler") and module.pooler is None:
+                self.model.pooler = None
+                break
+
+        # Unlike `lm_head`, `classifier` is not always `nn.Linear`.
+        self.classifier = getattr_iter(seq_cls_model, ["classifier", "score"], None)
+        if self.classifier is None:
+            raise ValueError(
+                "Could not find `classifier` or `score` layer in the "
+                "`AutoModelForSequenceClassification` instance."
+            )
+        self.init_parameters(self.classifier, dtype=self.model_config.head_dtype)
+
+        class ClassifierWithReshape(self.classifier.__class__):
+            """CLSPool has already been applied in `pooling`.
+            Add dim to match expected input shape of `classifier.forward`."""
+
+            def forward(self, *args, **kwargs):
+                if len(args) > 0:
+                    args = (args[0].unsqueeze(1), *args[1:])
+                return super().forward(*args, **kwargs)
+
+        self.classifier.__class__ = ClassifierWithReshape
+
+        self.pooler = DispatchPooler(
+            {
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
+                "classify": ClassifierPooler(
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="classify"
+                ),
+                "score": ClassifierPooler(
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="score"
+                ),
+            }
+        )
diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..517eb54d53ac677112cee65c3ec4f6cb419a4390
--- /dev/null
+++ b/vllm/model_executor/models/transformers/utils.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers modeling backend utilities."""
+
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+import torch
+from torch import nn
+
+from vllm.config.utils import getattr_iter
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.model_executor.layers.quantization import QuantizationConfig
+
+
+logger = init_logger(__name__)
+
+
+# Copied from `accelerate`
+@contextmanager
+def init_on_device_without_buffers(device: torch.device):
+    """
+    A context manager under which models are initialized with all
+    parameters on the specified device. However buffers are not
+    initialized on specified device.
+
+    Args:
+        device (`torch.device`):
+            Device to initialize all parameters on.
+    """
+
+    old_register_parameter = nn.Module.register_parameter
+
+    def register_empty_parameter(module, name, param):
+        old_register_parameter(module, name, param)
+        if param is not None:
+            param_cls = type(module._parameters[name])
+            kwargs = module._parameters[name].__dict__
+            kwargs["requires_grad"] = param.requires_grad
+            module._parameters[name] = param_cls(
+                module._parameters[name].to(device), **kwargs
+            )
+
+    tensor_constructors_to_patch = {}
+
+    def patch_tensor_constructor(fn):
+        def wrapper(*args, **kwargs):
+            kwargs["device"] = device
+            return fn(*args, **kwargs)
+
+        return wrapper
+
+    try:
+        nn.Module.register_parameter = register_empty_parameter
+        for torch_function_name in tensor_constructors_to_patch:
+            setattr(
+                torch,
+                torch_function_name,
+                patch_tensor_constructor(getattr(torch, torch_function_name)),
+            )
+        yield
+    finally:
+        nn.Module.register_parameter = old_register_parameter
+        for (
+            torch_function_name,
+            old_torch_function,
+        ) in tensor_constructors_to_patch.items():
+            setattr(torch, torch_function_name, old_torch_function)
+
+
+Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"]
+
+
+def replace_linear_class(
+    linear: nn.Linear,
+    style: Style = "replicate",
+    quant_config: "QuantizationConfig | None" = None,
+    *,
+    prefix: str = "",
+) -> ColumnParallelLinear | RowParallelLinear | ReplicatedLinear:
+    """
+    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
+
+    Args:
+        linear: `nn.Linear` to be replaced.
+        style: Tensor parallel style of the new linear, e.g. "colwise".
+        quant_config: Quantization config for the new linear.
+    Returns:
+        The new linear.
+    """
+
+    if not isinstance(style, str):
+        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
+
+    vllm_linear_cls, vllm_linear_kwargs = {
+        "colwise": (ColumnParallelLinear, {}),
+        "colwise_rep": (ColumnParallelLinear, {"gather_output": True}),
+        "rowwise": (RowParallelLinear, {}),
+        "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}),
+        "replicate": (ReplicatedLinear, {}),
+    }.get(style, (ReplicatedLinear, {}))
+
+    return vllm_linear_cls(
+        input_size=linear.in_features,
+        output_size=linear.out_features,
+        bias=linear.bias is not None,
+        quant_config=quant_config,
+        prefix=prefix,
+        return_bias=False,
+        **vllm_linear_kwargs,
+    )
+
+
+def replace_rms_norm_class(rms_norm: nn.Module, hidden_size: int) -> RMSNorm:
+    """Replace a Transformers RMSNorm with vLLM's RMSNorm.
+
+    This method assumes:
+    - Weight is stored as `weight`.
+    - Epsilon is stored as `eps` or `variance_epsilon`.
+    - `with_scale` indicates whether the layer has a weight (Gemma3n only).
+    - `var_hidden_size` is only ever used for Intern vision encoder in vLLM
+    and Transformers doesn't appear to have the same concept.
+    """
+    eps = getattr_iter(rms_norm, ("eps", "variance_epsilon"), 1e-6)
+    kwargs = {"hidden_size": hidden_size, "eps": eps}
+    # Update hidden size if weight is available
+    weight_meta = getattr(rms_norm, "weight", None)
+    if weight_meta is not None:
+        kwargs["hidden_size"] = weight_meta.size(0)
+    # Check if weight is all zeros, which indicates GemmaRMSNorm
+    # We must create a new instance because rms_norm is on meta
+    try:
+        with torch.device("cpu"):
+            weight_test = getattr(rms_norm.__class__(1), "weight", None)
+    except Exception:
+        logger.warning(
+            "Failed to determine if RMSNorm weight is centered on zero or one. "
+            "Defaulting to one."
+        )
+        weight_test = None
+    if weight_test is not None and torch.all(weight_test == 0):
+        return GemmaRMSNorm(**kwargs)
+    # Otherwise assume it's a regular RMSNorm
+    kwargs["has_weight"] = getattr(rms_norm, "with_scale", True)
+    if weight_meta is not None:
+        kwargs["dtype"] = weight_meta.dtype
+    else:
+        # No weight, fall back to weightless RMSNorm
+        kwargs["has_weight"] = False
+    return RMSNorm(**kwargs)
+
+
+def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
+    logger.debug("%s: %s -> %s", name, old_module, new_module)
+
+
+def get_feature_request_tip(
+    model: str,
+    trust_remote_code: bool,
+) -> str:
+    hf_url = f"a discussion at https://huggingface.co/{model}/discussions/new"
+    gh_url = "an issue at https://github.com/huggingface/transformers/issues/new/choose"
+    url = hf_url if trust_remote_code else gh_url
+    prefix = f"Please open {url} to request support for this feature. "
+    if Path(model).exists():
+        prefix = ""
+    doc_url = "https://docs.vllm.ai/en/latest/models/supported_models.html#writing-custom-models"
+    tip = f"See {doc_url} for instructions on how to add support yourself."
+    return f"{prefix}{tip}"
+
+
+def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool:
+    """
+    Callable to be passed to `@support_torch_compile`'s `enable_if` argument.
+
+    Defaults to `True` but is disabled in the following situations:
+
+    - The model uses dynamic rope scaling.
+    """
+    text_config = vllm_config.model_config.hf_config.get_text_config()
+    # Dynamic rope scaling is not compatible with torch.compile
+    rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {}
+    return rope_scaling.get("rope_type") != "dynamic"
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 12ae9487ad9dc2c473f245d7f879a4a23061a094..bb0f6bd036f1474e8664cb5ed4a60f24f4623ee3 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -3,8 +3,9 @@
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
+
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 from torch import nn
@@ -14,27 +15,43 @@ from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 
 from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.model_loader import DefaultModelLoader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
 from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    flatten_bn,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
 _MAX_ENCODER_BATCH_SIZE = 16
@@ -48,16 +65,20 @@ class UltravoxAudioFeatureInputs(TensorSchema):
     - t: Time frames (M)
     - nmb: Number of mel bins
     """
+
     type: Literal["audio_features"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor],
-                          list[list[torch.Tensor]]],
-                    TensorShape("b", "n", "nmb", "t", dynamic_dims={"n"})]
-    lens: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("b", "n", dynamic_dims={"n"})]
-    """Length of the audio frames. Used for attention mask in WhisperEncoder."""
-    token_len: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                         TensorShape("b", "n", dynamic_dims={"n"})]
-    """Length of the audio tokens. Used for flattening the audio features."""
+    data: Annotated[
+        torch.Tensor | list[torch.Tensor] | list[list[torch.Tensor]],
+        TensorShape("bn", "nmb", "t"),
+    ]
+    lens: Annotated[torch.Tensor, TensorShape("bn")]
+    """
+    Length of the audio frames per chunk. Used for attention mask in WhisperEncoder.
+    """
+    token_len: Annotated[torch.Tensor, TensorShape("bn")]
+    """Length of the audio tokens per chunk. Used for flattening the audio features."""
+    num_chunks: Annotated[torch.Tensor, TensorShape("n")]
+    """Number of chunks per audio. Used for flattening the audio features."""
 
 
 class UltravoxAudioEmbeddingInputs(TensorSchema):
@@ -68,17 +89,19 @@ class UltravoxAudioEmbeddingInputs(TensorSchema):
     - afs: audio feature size
     - hs: hidden size
     """
+
     type: Literal["audio_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                    TensorShape("b", "na", "afs", "hs")]
+    data: Annotated[
+        torch.Tensor | list[torch.Tensor], TensorShape("b", "na", "afs", "hs")
+    ]
 
 
-UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs,
-                            UltravoxAudioEmbeddingInputs]
+UltravoxAudioInputs: TypeAlias = (
+    UltravoxAudioFeatureInputs | UltravoxAudioEmbeddingInputs
+)
 
 
 class UltravoxProcessingInfo(BaseProcessingInfo):
-
     def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
         config = self.ctx.model_config.hf_config
         hf_processor = self.ctx.get_hf_processor(**kwargs)
@@ -91,21 +114,18 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
 
         return hf_processor
 
-    def get_feature_extractor(self,
-                              **kwargs: object) -> WhisperFeatureExtractor:
+    def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor:
         hf_processor = self.get_hf_processor(**kwargs)
         audio_processor = hf_processor.audio_processor  # type: ignore
         feature_extractor = audio_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
 
-class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
-                                 ):
-
+class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
 
@@ -115,23 +135,26 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         feature_extractor = self.info.get_feature_extractor()
 
         sampling_rate = feature_extractor.sampling_rate
-        audio_len = (feature_extractor.chunk_length * sampling_rate *
-                     _MAX_ENCODER_BATCH_SIZE)
+        audio_len = (
+            feature_extractor.chunk_length * sampling_rate * _MAX_ENCODER_BATCH_SIZE
+        )
         num_audios = mm_counts.get("audio", 0)
 
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+            "audio": self._get_dummy_audios(
+                length=audio_len, num_audios=num_audios, overrides=audio_overrides
+            )
         }
 
 
-class UltravoxMultiModalProcessor(
-        BaseMultiModalProcessor[UltravoxProcessingInfo]):
-
+class UltravoxMultiModalProcessor(BaseMultiModalProcessor[UltravoxProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
@@ -146,7 +169,8 @@ class UltravoxMultiModalProcessor(
         # Text-only input not supported in composite processor
         if not mm_data.get("audios", []):
             prompt_ids = self.info.get_tokenizer().encode(
-                prompt, add_special_tokens=False)
+                prompt, add_special_tokens=False
+            )
             prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
@@ -173,7 +197,7 @@ class UltravoxMultiModalProcessor(
             mm_kwargs=mm_kwargs,
             tok_kwargs=tok_kwargs,
         )
-        output['audio_features'] = output.pop('audio_values')
+        output["audio_features"] = output.pop("audio_values")
 
         return output
 
@@ -182,17 +206,14 @@ class UltravoxMultiModalProcessor(
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        num_chunks = hf_inputs.get('audio_num_chunks', torch.zeros(0))
+        num_chunks = hf_inputs.get("audio_num_chunks", torch.zeros(0))
         return dict(
             # to handle longer than 30s audio, each audio might be split
             # into multiple chunks as such, their batch dimension can be
             # higher than the number of audio samples
-            audio_features=MultiModalFieldConfig.flat_from_sizes(
-                "audio", num_chunks),
-            audio_token_len=MultiModalFieldConfig.flat_from_sizes(
-                "audio", num_chunks),
-            audio_lens=MultiModalFieldConfig.flat_from_sizes(
-                "audio", num_chunks),
+            audio_features=MultiModalFieldConfig.flat_from_sizes("audio", num_chunks),
+            audio_token_len=MultiModalFieldConfig.flat_from_sizes("audio", num_chunks),
+            audio_lens=MultiModalFieldConfig.flat_from_sizes("audio", num_chunks),
             # num_chunks can convert audio_chunked to audio batch dimension
             audio_num_chunks=MultiModalFieldConfig.batched("audio"),
             audio_embeds=MultiModalFieldConfig.batched("audio"),
@@ -213,11 +234,12 @@ class UltravoxMultiModalProcessor(
         # belonging to the i-th audio.
         out_mm_data = out_mm_kwargs.get_data()
         num_chunks = out_mm_data.get("audio_num_chunks", torch.zeros(0))
-        chunks_start_idx: torch.Tensor = torch.cumsum(num_chunks,
-                                                      dim=0,
-                                                      dtype=torch.int32)
+        chunks_start_idx: torch.Tensor = torch.cumsum(
+            num_chunks, dim=0, dtype=torch.int32
+        )
         chunks_start_idx = torch.cat(
-            [torch.tensor([0], dtype=torch.int32), chunks_start_idx])
+            [torch.tensor([0], dtype=torch.int32), chunks_start_idx]
+        )
 
         def get_replacement_ultravox(item_idx: int):
             start = chunks_start_idx[item_idx]
@@ -246,17 +268,16 @@ class StackAudioFrames(nn.Module):
 
     def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
         B, T, C = audio_embeds.shape
-        T_pad = (T + self.stack_factor -
-                 1) // self.stack_factor * self.stack_factor
+        T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
         audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T))
         B, T, C = audio_embeds.shape
-        audio_embeds = audio_embeds.view(B, T // self.stack_factor,
-                                         C * self.stack_factor)
+        audio_embeds = audio_embeds.view(
+            B, T // self.stack_factor, C * self.stack_factor
+        )
         return audio_embeds
 
 
 class UltravoxProjector(nn.Module):
-
     def __init__(self, config: UltravoxConfig):
         super().__init__()
         self.hidden_dim = config.hidden_size
@@ -320,12 +341,15 @@ class ModifiedWhisperEncoder(WhisperEncoder):
 
     @property
     def max_context_length(self):
-        return (self.config.max_source_positions * self.conv1.stride[0] *
-                self.conv2.stride[0])
+        return (
+            self.config.max_source_positions
+            * self.conv1.stride[0]
+            * self.conv2.stride[0]
+        )
 
-    def get_attention_mask_by_audio_len(self,
-                                        audio_lens: Optional[torch.Tensor],
-                                        hidden_states: torch.Tensor):
+    def get_attention_mask_by_audio_len(
+        self, audio_lens: torch.Tensor | None, hidden_states: torch.Tensor
+    ):
         """
         Create attention mask based on audio lengths to mask out padding tokens
         For each sample in batch:
@@ -341,9 +365,9 @@ class ModifiedWhisperEncoder(WhisperEncoder):
 
         audio_feature_len = self._get_feat_extract_output_lengths(audio_lens)
         max_seq_len = hidden_states.shape[1]
-        attention_mask = torch.arange(max_seq_len,
-                                      device=hidden_states.device)[None, :].lt(
-                                          audio_feature_len.view(-1, 1))
+        attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[
+            None, :
+        ].lt(audio_feature_len.view(-1, 1))
         attention_mask = self.get_extended_attention_mask(
             attention_mask,
             None,
@@ -354,7 +378,7 @@ class ModifiedWhisperEncoder(WhisperEncoder):
     def forward(
         self,
         input_features: torch.Tensor,
-        audio_lens: Optional[torch.Tensor] = None,
+        audio_lens: torch.Tensor | None = None,
     ):
         expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
@@ -362,21 +386,21 @@ class ModifiedWhisperEncoder(WhisperEncoder):
                 f"Whisper expects the mel input features to be of length "
                 f"{expected_seq_length} or less, but found "
                 f"{input_features.shape[-1]}. Make sure to pad the input mel "
-                f"features to {expected_seq_length}.")
+                f"features to {expected_seq_length}."
+            )
 
         inputs_embeds = nn.functional.gelu(self.conv1(input_features))
         inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
 
         inputs_embeds = inputs_embeds.permute(0, 2, 1)
-        embed_pos = self.embed_positions.weight[:inputs_embeds.size(-2)]
+        embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)]
 
         hidden_states = inputs_embeds + embed_pos
-        hidden_states = nn.functional.dropout(hidden_states,
-                                              p=self.dropout,
-                                              training=self.training)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
 
-        attention_mask = self.get_attention_mask_by_audio_len(
-            audio_lens, hidden_states)
+        attention_mask = self.get_attention_mask_by_audio_len(audio_lens, hidden_states)
 
         for encoder_layer in self.layers:
             layer_outputs = encoder_layer(
@@ -394,19 +418,22 @@ class ModifiedWhisperEncoder(WhisperEncoder):
 @MULTIMODAL_REGISTRY.register_processor(
     UltravoxMultiModalProcessor,
     info=UltravoxProcessingInfo,
-    dummy_inputs=UltravoxDummyInputsBuilder)
+    dummy_inputs=UltravoxDummyInputsBuilder,
+)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
+        orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}
+    )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return "<|audio|>"
 
@@ -430,7 +457,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
                     model_or_path=config.audio_model_id,
                     revision=None,
                     prefix="audio_tower.",
-                ))
+                )
+            )
         self.multi_modal_projector = UltravoxProjector(config)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
@@ -441,12 +469,16 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
             # this prefix is not for initialization, but for loading weights
             # note the trailing dot
             self.secondary_weights.append(
-                DefaultModelLoader.Source(model_or_path=config.text_model_id,
-                                          revision=None,
-                                          prefix="language_model."))
+                DefaultModelLoader.Source(
+                    model_or_path=config.text_model_id,
+                    revision=None,
+                    prefix="language_model.",
+                )
+            )
 
         self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors)
+            self.language_model.make_empty_intermediate_tensors
+        )
 
     def get_mm_mapping(self) -> MultiModelKeys:
         """
@@ -459,8 +491,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         )
 
     def _audio_features_to_embeddings(
-            self, input_features: torch.Tensor,
-            audio_lens: torch.Tensor) -> torch.Tensor:
+        self, input_features: torch.Tensor, audio_lens: torch.Tensor
+    ) -> torch.Tensor:
         audio_features = input_features.to(self.audio_tower.dtype)
         batch_size = audio_features.size(0)
         audio_embeddings = []
@@ -469,8 +501,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         for start in range(0, batch_size, _MAX_ENCODER_BATCH_SIZE):
             end = min(start + _MAX_ENCODER_BATCH_SIZE, batch_size)
             # Process through audio tower
-            batch_features = self.audio_tower(audio_features[start:end],
-                                              audio_lens[start:end])
+            batch_features = self.audio_tower(
+                audio_features[start:end], audio_lens[start:end]
+            )
             batch_features = batch_features.to(self.audio_tower.dtype)
 
             # Process through projector
@@ -482,31 +515,35 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         return audio_embeddings
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[UltravoxAudioInputs]:
+        self, **kwargs: object
+    ) -> UltravoxAudioInputs | None:
         audio_features = kwargs.pop("audio_features", None)
         audio_embeds = kwargs.pop("audio_embeds", None)
         audio_lens = kwargs.pop("audio_lens", None)
         audio_token_len = kwargs.pop("audio_token_len", None)
+        audio_num_chunks = kwargs.pop("audio_num_chunks", None)
 
         if audio_features is None and audio_embeds is None:
             return None
 
         if audio_features is not None:
-            return UltravoxAudioFeatureInputs(type="audio_features",
-                                              data=audio_features,
-                                              lens=audio_lens,
-                                              token_len=audio_token_len)
+            return UltravoxAudioFeatureInputs(
+                type="audio_features",
+                data=audio_features,
+                lens=audio_lens,
+                token_len=audio_token_len,
+                num_chunks=audio_num_chunks,
+            )
 
         if audio_embeds is not None:
-            return UltravoxAudioEmbeddingInputs(type="audio_embeds",
-                                                data=audio_embeds)
+            return UltravoxAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds)
 
         raise AssertionError("This line should be unreachable.")
 
     def _process_audio_input(
         self,
         audio_input: UltravoxAudioInputs,
-    ) -> Union[NestedTensors, tuple[torch.Tensor, ...]]:
+    ) -> NestedTensors | tuple[torch.Tensor, ...]:
         if audio_input["type"] == "audio_embeds":
             return audio_input["data"]
 
@@ -514,12 +551,10 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
         audio_features = pad_and_concat_to_dim3(audio_input["data"])
 
-        # [B1, B2] -> [B1+B2]
-        audio_lens = flatten_bn(audio_input['lens'], concat=True)
-        audio_token_len = flatten_bn(audio_input['token_len'], concat=True)
+        audio_lens = audio_input["lens"]
+        audio_token_len = audio_input["token_len"]
 
-        embeddings = self._audio_features_to_embeddings(
-            audio_features, audio_lens)
+        embeddings = self._audio_features_to_embeddings(audio_features, audio_lens)
 
         # We should flatten and concatenate embeddings based on token lengths
         # For example, with token_len = [4, 2, 3], flattened_embeddings will be
@@ -528,53 +563,57 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         # Create a mask of valid indices based on token lengths
         max_len = embeddings.shape[1]
         indices = torch.arange(max_len, device=embeddings.device).expand(
-            embeddings.shape[0], -1)
+            embeddings.shape[0], -1
+        )
         mask = indices < audio_token_len[:, None]
         # Apply mask and flatten
         flattened_embeddings = embeddings[mask]
 
         # Return one tensor per input audio
         embed_lens = [
-            token_len_item.sum().item()
-            for token_len_item in audio_input['token_len']
+            chunk_lens.sum().item()
+            for chunk_lens in audio_token_len.split(audio_input["num_chunks"].tolist())
         ]
         return flattened_embeddings.split(embed_lens)
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return []
         audio_embeddings = self._process_audio_input(audio_input)
         return audio_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        # Multi-modal token ID may exceed vocab size
+        handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
-        # The audio token index is not included in the embedding table
-        # We need to remove it before embedding lookup
-        safe_input_ids = input_ids.clone()
-        safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
-        inputs_embeds = self.language_model.get_input_embeddings(
-            safe_input_ids)
-        if multimodal_embeddings is not None and len(
-                multimodal_embeddings) > 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.audio_token_index)
-        return inputs_embeds
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                intermediate_tensors: Optional[torch.Tensor] = None,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs) -> Union[torch.Tensor, IntermediateTensors]:
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().embed_input_ids(input_ids)
+
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: torch.Tensor | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Ultravox
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -596,38 +635,25 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
-            input_ids = None
-
         language_model = self.language_model
         if hasattr(language_model, "language_model"):
             language_model = language_model.language_model
 
-        hidden_states = language_model.model(input_ids,
-                                             positions,
-                                             intermediate_tensors,
-                                             inputs_embeds=inputs_embeds)
+        hidden_states = language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-
-        loader = AutoWeightsLoader(self,
-                                   ignore_unexpected_prefixes=["audio_tower."])
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."])
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 def pad_and_concat_to_dim3(
-    features: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
+    features: torch.Tensor | list[torch.Tensor] | list[list[torch.Tensor]],
 ) -> torch.Tensor:
     """
     Pad and concatenate a list of tensors.
@@ -641,6 +667,7 @@ def pad_and_concat_to_dim3(
         if features.ndim > 3:
             # Flatten [B, N, 80, M] -> [B * N, 80, M]
             features = flatten_bn(features)
+
         return features
 
     features = [pad_and_concat_to_dim3(f) for f in features]
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 91d43306717dcec5fa2d77a165c0f5b579491bfd..7e8cc8071886816ba9adbca19fff2f8a85acbba7 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -4,28 +4,40 @@
 import itertools
 from collections.abc import Iterable, Mapping
 from dataclasses import dataclass, field
-from typing import Any, Callable, Literal, Optional, Protocol, Union, overload
+from typing import Any, Literal, Protocol, overload
 
 import torch
 import torch.nn as nn
 from torch.func import functional_call
 from transformers import PretrainedConfig
+from typing_extensions import deprecated
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import supports_any_eagle
 from vllm.multimodal import NestedTensors
 from vllm.sequence import IntermediateTensors
-from vllm.utils import (cdiv, direct_register_custom_op,
-                        get_cuda_view_from_cpu_tensor, is_pin_memory_available,
-                        is_uva_available)
+from vllm.utils.math_utils import cdiv
+from vllm.utils.platform_utils import (
+    is_pin_memory_available,
+    is_uva_available,
+)
+from vllm.utils.torch_utils import (
+    direct_register_custom_op,
+    get_cuda_view_from_cpu_tensor,
+)
 
 logger = init_logger(__name__)
 
-WeightsMapping = Mapping[str, Optional[str]]
+WeightsMapping = Mapping[str, str | None]
 """If a key maps to a value of `None`, the corresponding weight is ignored."""
 
 
@@ -37,7 +49,15 @@ class WeightsMapper:
     orig_to_new_prefix: WeightsMapping = field(default_factory=dict)
     orig_to_new_suffix: WeightsMapping = field(default_factory=dict)
 
-    def _map_name(self, key: str) -> Optional[str]:
+    def __or__(self, other: "WeightsMapper") -> "WeightsMapper":
+        """Combine two `WeightsMapper`s by merging their mappings."""
+        return WeightsMapper(
+            orig_to_new_substr={**self.orig_to_new_substr, **other.orig_to_new_substr},
+            orig_to_new_prefix={**self.orig_to_new_prefix, **other.orig_to_new_prefix},
+            orig_to_new_suffix={**self.orig_to_new_suffix, **other.orig_to_new_suffix},
+        )
+
+    def _map_name(self, key: str) -> str | None:
         for substr, new_key in self.orig_to_new_substr.items():
             if substr in key:
                 if new_key is None:
@@ -64,12 +84,16 @@ class WeightsMapper:
     def apply(
         self, weights: Iterable[tuple[str, torch.Tensor]]
     ) -> Iterable[tuple[str, torch.Tensor]]:
-        return ((out_name, data) for name, data in weights
-                if (out_name := self._map_name(name)) is not None)
+        return (
+            (out_name, data)
+            for name, data in weights
+            if (out_name := self._map_name(name)) is not None
+        )
 
     def apply_list(self, values: list[str]) -> list[str]:
         return [
-            out_name for name in values
+            out_name
+            for name in values
             if (out_name := self._map_name(name)) is not None
         ]
 
@@ -88,18 +112,19 @@ class AutoWeightsLoader:
     the weights only once.
 
     The weight loading logic for individual modules can be overridden
-    by defining a ``load_weights`` method.
+    by defining a `load_weights` method.
 
     Similarly, the weight loading logic for individual parameters can be
-    overridden by defining a ``weight_loader`` method.
+    overridden by defining a `weight_loader` method.
 
     Detailed weight loading information can be viewed by setting the
-    environment variable ``VLLM_LOGGING_LEVEL=DEBUG``.
+    environment variable `VLLM_LOGGING_LEVEL=DEBUG`.
     """
 
-    # Models trained using early version ColossalAI
-    # may include these tensors in checkpoint. Skip them.
+    # Models trained using early version ColossalAI or quantized by
+    # GPTQModel may include these tensors in checkpoint. Skip them.
     ROTARY_EMBEDS_UNUSED_WEIGHTS = [
+        "rotary_pos_emb.inv_freq",
         "rotary_emb.inv_freq",
         "rotary_emb.cos_cached",
         "rotary_emb.sin_cached",
@@ -109,9 +134,10 @@ class AutoWeightsLoader:
         self,
         module: nn.Module,
         *,
-        skip_prefixes: Optional[list[str]] = None,
-        skip_substrs: Optional[list[str]] = None,
-        ignore_unexpected_prefixes: Optional[list[str]] = None,
+        skip_prefixes: list[str] | None = None,
+        skip_substrs: list[str] | None = None,
+        ignore_unexpected_prefixes: list[str] | None = None,
+        ignore_unexpected_suffixes: list[str] | None = None,
     ) -> None:
         super().__init__()
 
@@ -119,6 +145,7 @@ class AutoWeightsLoader:
         self.skip_prefixes = skip_prefixes or []
         self.skip_substrs = skip_substrs or []
         self.ignore_unexpected_prefixes = ignore_unexpected_prefixes or []
+        self.ignore_unexpected_suffixes = ignore_unexpected_suffixes or []
         # update default skip_substrs
         self.skip_substrs += self.ROTARY_EMBEDS_UNUSED_WEIGHTS
 
@@ -126,17 +153,20 @@ class AutoWeightsLoader:
         self,
         weights: Iterable[tuple[str, torch.Tensor]],
     ) -> Iterable[tuple[str, Iterable[tuple[str, torch.Tensor]]]]:
-        weights_by_parts = ((weight_name.split(".", 1), weight_data)
-                            for weight_name, weight_data in weights)
+        weights_by_parts = (
+            (weight_name.split(".", 1), weight_data)
+            for weight_name, weight_data in weights
+        )
 
-        for prefix, group in itertools.groupby(weights_by_parts,
-                                               key=lambda x: x[0][0]):
+        for prefix, group in itertools.groupby(weights_by_parts, key=lambda x: x[0][0]):
             yield (
                 prefix,
                 # Because maxsplit=1 in weight_name.split(...),
                 # the length of `parts` must either be 1 or 2
-                (("" if len(parts) == 1 else parts[1], weights_data)
-                 for parts, weights_data in group),
+                (
+                    ("" if len(parts) == 1 else parts[1], weights_data)
+                    for parts, weights_data in group
+                ),
             )
 
     def _get_qualname(self, prefix: str, rest: str) -> str:
@@ -148,12 +178,14 @@ class AutoWeightsLoader:
         return ".".join((prefix, rest))
 
     def _can_skip(self, qualname: str) -> bool:
-        return (any(qualname.startswith(p) for p in self.skip_prefixes)
-                or any(substr in qualname for substr in self.skip_substrs))
+        return any(qualname.startswith(p) for p in self.skip_prefixes) or any(
+            substr in qualname for substr in self.skip_substrs
+        )
 
     def _can_ignore_unexpected(self, qualname: str) -> bool:
-        return any(
-            qualname.startswith(p) for p in self.ignore_unexpected_prefixes)
+        iup = (qualname.startswith(p) for p in self.ignore_unexpected_prefixes)
+        ius = (qualname.endswith(s) for s in self.ignore_unexpected_suffixes)
+        return any(iup) or any(ius)
 
     def _load_param(
         self,
@@ -177,24 +209,26 @@ class AutoWeightsLoader:
 
                 raise ValueError(
                     f"Attempted to load nested weight '{weight_qualname}' "
-                    f"into a single parameter '{base_prefix}'")
+                    f"into a single parameter '{base_prefix}'"
+                )
 
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, weight_data)
 
-            logger.debug("Loaded weight %s with shape %s", weight_qualname,
-                         param.shape)
+            logger.debug("Loaded weight %s with shape %s", weight_qualname, param.shape)
 
             yield weight_qualname
 
-    def _add_loadable_non_param_tensors(self, module: nn.Module,
-                                        child_params: dict[str, torch.Tensor]):
+    def _add_loadable_non_param_tensors(
+        self, module: nn.Module, child_params: dict[str, torch.Tensor]
+    ):
         """
         Add tensor names that are not in the model params that may be in the
         safetensors, e.g., batch normalization stats.
         """
-        if isinstance(module, (
+        if isinstance(
+            module,
+            (
                 nn.BatchNorm1d,
                 nn.BatchNorm2d,
                 nn.BatchNorm3d,
@@ -202,10 +236,10 @@ class AutoWeightsLoader:
                 nn.LazyBatchNorm2d,
                 nn.LazyBatchNorm3d,
                 nn.SyncBatchNorm,
-        )):
+            ),
+        ):
             module_state_dict = module.state_dict()
-            for stat_name in ("running_mean", "running_var",
-                              "num_batches_tracked"):
+            for stat_name in ("running_mean", "running_var", "num_batches_tracked"):
                 child_params[stat_name] = module_state_dict[stat_name]
 
     def _load_module(
@@ -225,8 +259,8 @@ class AutoWeightsLoader:
                 loaded_params = module_load_weights(weights)
                 if loaded_params is None:
                     logger.warning(
-                        "Unable to collect loaded parameters "
-                        "for module %s", module)
+                        "Unable to collect loaded parameters for module %s", module
+                    )
                 else:
                     yield from map(
                         lambda x: self._get_qualname(base_prefix, x),
@@ -249,17 +283,18 @@ class AutoWeightsLoader:
 
                     continue
 
-                yield from self._load_module(prefix,
-                                             child_modules[child_prefix],
-                                             child_weights)
+                yield from self._load_module(
+                    prefix, child_modules[child_prefix], child_weights
+                )
             elif child_prefix in child_params:
                 if self._can_skip(prefix):
                     logger.debug("Skipping param %s", prefix)
 
                     continue
 
-                yield from self._load_param(prefix, child_params[child_prefix],
-                                            child_weights)
+                yield from self._load_param(
+                    prefix, child_params[child_prefix], child_weights
+                )
             else:
                 can_skip_module = self._can_skip(prefix + ".")
                 can_skip_param = self._can_skip(prefix)
@@ -275,21 +310,24 @@ class AutoWeightsLoader:
 
                     continue
 
-                msg = (f"There is no module or parameter named '{prefix}' "
-                       f"in {type(self.module).__name__}")
+                msg = (
+                    f"There is no module or parameter named '{prefix}' "
+                    f"in {type(self.module).__name__}"
+                )
                 raise ValueError(msg)
 
     def load_weights(
         self,
         weights: Iterable[tuple[str, torch.Tensor]],
         *,
-        mapper: Optional[WeightsMapper] = None,
+        mapper: WeightsMapper | None = None,
     ) -> set[str]:
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name
-        weights = ((name, weight) for name, weight in weights
-                   if not self._can_skip(name))
+        weights = (
+            (name, weight) for name, weight in weights if not self._can_skip(name)
+        )
 
         autoloaded_weights = set(self._load_module("", self.module, weights))
         return autoloaded_weights
@@ -299,8 +337,8 @@ def init_vllm_registered_model(
     vllm_config: VllmConfig,
     *,
     prefix: str = "",
-    hf_config: Optional[PretrainedConfig] = None,
-    architectures: Optional[list[str]] = None,
+    hf_config: PretrainedConfig | None = None,
+    architectures: list[str] | None = None,
 ) -> nn.Module:
     """
     Helper function to initialize an inner model registered to vLLM,
@@ -313,49 +351,44 @@ def init_vllm_registered_model(
         hf_config = vllm_config.model_config.hf_config
 
     if hf_config is not None:
-        vllm_config = vllm_config.with_hf_config(hf_config,
-                                                 architectures=architectures)
+        vllm_config = vllm_config.with_hf_config(hf_config, architectures=architectures)
 
     return initialize_model(vllm_config=vllm_config, prefix=prefix)
 
 
 @overload
-def flatten_bn(x: torch.Tensor) -> torch.Tensor:
-    ...
+def flatten_bn(x: torch.Tensor) -> torch.Tensor: ...
 
 
 @overload
-def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]:
-    ...
+def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: ...
 
 
 @overload
 def flatten_bn(
-    x: Union[list[torch.Tensor], torch.Tensor],
+    x: list[torch.Tensor] | torch.Tensor,
     *,
     concat: Literal[True],
-) -> torch.Tensor:
-    ...
+) -> torch.Tensor: ...
 
 
 @overload
 def flatten_bn(
-    x: Union[list[torch.Tensor], torch.Tensor],
+    x: list[torch.Tensor] | torch.Tensor,
     *,
     concat: bool = False,
-) -> Union[list[torch.Tensor], torch.Tensor]:
-    ...
+) -> list[torch.Tensor] | torch.Tensor: ...
 
 
 def flatten_bn(
-    x: Union[list[torch.Tensor], torch.Tensor],
+    x: list[torch.Tensor] | torch.Tensor,
     *,
     concat: bool = False,
-) -> Union[list[torch.Tensor], torch.Tensor]:
+) -> list[torch.Tensor] | torch.Tensor:
     """
-    Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.
+    Flatten the `B` and `N` dimensions of batched multimodal inputs.
 
-    The input tensor should have shape ``(B, N, ...)```.
+    The input tensor should have shape `(B, N, ...)`.
     """
     if isinstance(x, torch.Tensor):
         return x.flatten(0, 1)
@@ -388,95 +421,82 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
     if isinstance(embeddings, torch.Tensor):
         return " x ".join([str(dim) for dim in embeddings.shape[:-1]])
 
-    return " + ".join(
-        _embedding_count_expression(inner) for inner in embeddings)
+    return " + ".join(_embedding_count_expression(inner) for inner in embeddings)
+
+
+def split_list_into_ranges(lst: torch.Tensor, interval: int) -> list[list[int]]:
+    ranges: list[list[int]] = [[] for _ in range((max(lst) // interval) + 1)]
+    for num in lst:
+        index = num // interval
+        ranges[index].append(num)
+    return ranges
 
 
 def _merge_multimodal_embeddings(
     inputs_embeds: torch.Tensor,
-    is_multimodal: torch.Tensor,
     multimodal_embeddings: NestedTensors,
+    is_multimodal: torch.Tensor,
 ) -> torch.Tensor:
     """
-    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
-    positions in ``inputs_embeds`` corresponding to placeholder tokens in
-    ``input_ids``.
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
+    positions in `inputs_embeds` corresponding to placeholder tokens in
+    `input_ids`.
 
     Note:
-        This updates ``inputs_embeds`` in place.
+        This updates `inputs_embeds` in place.
     """
-    flattened = _flatten_embeddings(multimodal_embeddings)
+    if len(multimodal_embeddings) == 0:
+        return inputs_embeds
+
+    mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
+    input_dtype = inputs_embeds.dtype
+
     try:
-        # This is equivalent to: inputs_embeds[is_multimodal] = flattened.
-        inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1),
-                                      flattened.to(dtype=inputs_embeds.dtype))
+        # For debugging
+        # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype)
+
+        # NOTE: This can avoid D2H sync (#22105), but fails to
+        # raise an error if is_multimodal.sum() < len(mm_embeds_flat)
+        inputs_embeds.masked_scatter_(
+            is_multimodal.unsqueeze(-1), mm_embeds_flat.to(dtype=input_dtype)
+        )
     except RuntimeError as e:
+        num_actual_tokens = len(mm_embeds_flat)
         num_expected_tokens = is_multimodal.sum().item()
-        assert isinstance(num_expected_tokens, int)
 
-        if flattened.shape[0] != num_expected_tokens:
+        if num_actual_tokens != num_expected_tokens:
             expr = _embedding_count_expression(multimodal_embeddings)
+
             raise ValueError(
-                f"Attempted to assign {expr} = {flattened.shape[0]} "
+                f"Attempted to assign {expr} = {num_actual_tokens} "
                 f"multimodal tokens to {num_expected_tokens} placeholders"
             ) from e
-        else:
-            raise ValueError("Error during masked scatter operation") from e
-
-    return inputs_embeds
-
-
-def embed_multimodal(
-    input_ids: torch.Tensor,
-    multimodal_token_id: int,
-    get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
-    multimodal_embeds: NestedTensors,
-) -> torch.Tensor:
-    """
-    Embed token IDs and multimodal inputs and combine their embeddings.
 
-    ``multimodal_token_id`` is used to determine whether a token ID should
-    be embedded using ``get_text_embeds`` or ``get_multimodal_embeds``.
+        raise ValueError("Error during masked scatter operation") from e
 
-    Compared to ``merge_multimodal_embeddings`, this avoids running
-    ``get_text_embeds`` on ``input_ids[input_ids == multimodal_token_id]``
-    which causes issues when the placeholder token ID exceeds the
-    vocabulary size of the language model.
-    """
-    is_multimodal = input_ids == multimodal_token_id
-    is_text = ~is_multimodal
-
-    text_embeds = get_text_embeds(input_ids[is_text])
-    merged_embeds = torch.empty(
-        (input_ids.shape[0], text_embeds.shape[1]),
-        dtype=text_embeds.dtype,
-        device=text_embeds.device,
-    )
-
-    merged_embeds[is_text] = text_embeds
-
-    return _merge_multimodal_embeddings(
-        merged_embeds,
-        is_multimodal,
-        multimodal_embeds,
-    )
+    return inputs_embeds
 
 
+@deprecated(
+    "`merge_multimodal_embeddings` has been replaced with "
+    "`SupportsMultiModal.embed_input_ids` and will be "
+    "removed in v0.12."
+)
 def merge_multimodal_embeddings(
     input_ids: torch.Tensor,
     inputs_embeds: torch.Tensor,
     multimodal_embeddings: NestedTensors,
-    placeholder_token_id: Union[int, list[int]],
+    placeholder_token_id: int | list[int],
 ) -> torch.Tensor:
     """
-    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
-    positions in ``inputs_embeds`` corresponding to placeholder tokens in
-    ``input_ids``.
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
+    positions in `inputs_embeds` corresponding to placeholder tokens in
+    `input_ids`.
 
-    ``placeholder_token_id`` can be a list of token ids (e.g, token ids
+    `placeholder_token_id` can be a list of token ids (e.g, token ids
     of img_start, img_break, and img_end tokens) when needed: This means
-    the order of these tokens in the ``input_ids`` MUST MATCH the order of
-    their embeddings in ``multimodal_embeddings`` since we need to
+    the order of these tokens in the `input_ids` MUST MATCH the order of
+    their embeddings in `multimodal_embeddings` since we need to
     slice-merge instead of individually scattering.
 
     For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
@@ -491,30 +511,34 @@ def merge_multimodal_embeddings(
     input_ids for a correct embedding merge.
 
     Note:
-        This updates ``inputs_embeds`` in place.
+        This updates `inputs_embeds` in place.
     """
     if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(
-            placeholder_token_id,
-            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
-                                                     non_blocking=True)
-        return _merge_multimodal_embeddings(
-            inputs_embeds,
-            torch.isin(input_ids, placeholder_token_id),
-            multimodal_embeddings,
-        )
+        is_multimodal = isin_list(input_ids, placeholder_token_id)
+    else:
+        is_multimodal = input_ids == placeholder_token_id
 
     return _merge_multimodal_embeddings(
         inputs_embeds,
-        (input_ids == placeholder_token_id),
-        multimodal_embeddings,
+        multimodal_embeddings=multimodal_embeddings,
+        is_multimodal=is_multimodal,
     )
 
 
-class LayerFn(Protocol):
+def isin_list(
+    elements: torch.Tensor,
+    test_elements_list: list[int],
+) -> torch.Tensor:
+    test_elements = torch.tensor(
+        test_elements_list,
+        pin_memory=is_pin_memory_available(),
+    ).to(device=elements.device, non_blocking=True)
+
+    return torch.isin(elements, test_elements)
 
-    def __call__(self, prefix: str) -> torch.nn.Module:
-        ...
+
+class LayerFn(Protocol):
+    def __call__(self, prefix: str) -> torch.nn.Module: ...
 
 
 class PPMissingLayer(torch.nn.Identity):
@@ -556,12 +580,8 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
     pin_memory = is_pin_memory_available()
     uva_available = is_uva_available()
 
-    if envs.VLLM_USE_V1:
-        assert uva_available, ("V1 CPU offloading requires"
-                               " uva (pin memory) support")
-        uva_offloading = True
-    else:
-        uva_offloading = False
+    assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
+    uva_offloading = True
 
     # offload parameters to CPU
     # use pin_memory if possible, which helps cudagraph capture speed
@@ -573,12 +593,14 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
             break
 
         # `torch.empty_like` does not support `pin_memory` argument
-        cpu_data = torch.empty_strided(size=p.data.size(),
-                                       stride=p.data.stride(),
-                                       dtype=p.data.dtype,
-                                       layout=p.data.layout,
-                                       device='cpu',
-                                       pin_memory=pin_memory)
+        cpu_data = torch.empty_strided(
+            size=p.data.size(),
+            stride=p.data.stride(),
+            dtype=p.data.dtype,
+            layout=p.data.layout,
+            device="cpu",
+            pin_memory=pin_memory,
+        )
         cpu_data.copy_(p.data)
         if not uva_offloading:
             p.data = cpu_data
@@ -600,10 +622,7 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
                 k: v.to(device, non_blocking=True)
                 for k, v in module.state_dict().items()
             }
-            output = functional_call(module,
-                                     device_state,
-                                     args=args,
-                                     kwargs=kwargs)
+            output = functional_call(module, device_state, args=args, kwargs=kwargs)
             module.forward = forward
             return output
 
@@ -622,14 +641,18 @@ def make_layers(
     """
     from vllm.distributed.parallel_state import get_pp_group
     from vllm.distributed.utils import get_pp_indices
-    start_layer, end_layer = get_pp_indices(num_hidden_layers,
-                                            get_pp_group().rank_in_group,
-                                            get_pp_group().world_size)
+
+    start_layer, end_layer = get_pp_indices(
+        num_hidden_layers, get_pp_group().rank_in_group, get_pp_group().world_size
+    )
     modules = torch.nn.ModuleList(
-        [PPMissingLayer() for _ in range(start_layer)] + [
+        [PPMissingLayer() for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
             for idx in range(start_layer, end_layer)
-        ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
+        ]
+        + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]
+    )
     return start_layer, end_layer, modules
 
 
@@ -649,7 +672,7 @@ def get_pp_missing_layer_names(model: torch.nn.Module) -> list[str]:
             # NOTE: the trailing dot is used to match the prefix of the layer.
             # without the dot, we could match a layer that is not missing,
             # e.g., 'encoder.layer.1' would match 'encoder.layer.11'
-            missing_layer_names.append(name + '.')
+            missing_layer_names.append(name + ".")
     _model_to_pp_missing_layer_names[model_id] = missing_layer_names
 
     return missing_layer_names
@@ -662,21 +685,22 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool:
 
     return any(
         name.startswith(missing_layer_name)
-        for missing_layer_name in get_pp_missing_layer_names(model))
+        for missing_layer_name in get_pp_missing_layer_names(model)
+    )
 
 
 def make_empty_intermediate_tensors_factory(keys: list[str], hidden_size: int):
-
     def make_empty_intermediate_tensors(
         batch_size: int,
         dtype: torch.dtype,
         device: torch.device,
     ) -> IntermediateTensors:
-        return IntermediateTensors({
-            key:
-            torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
-            for key in keys
-        })
+        return IntermediateTensors(
+            {
+                key: torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
+                for key in keys
+            }
+        )
 
     return make_empty_intermediate_tensors
 
@@ -694,6 +718,30 @@ def maybe_prefix(prefix: str, name: str) -> str:
     return name if not prefix else f"{prefix}.{name}"
 
 
+def get_draft_quant_config(
+    vllm_config: VllmConfig,
+) -> QuantizationConfig | None:
+    """Get quantization config for Draft models.
+
+    Draft models should use their own quantization config instead of the verifier/target
+    model's config. This helper retrieves the draft model's quantization config.
+
+    Args:
+        vllm_config: The vLLM configuration object.
+
+    Returns:
+        The draft model's config if available, None otherwise.
+    """
+    draft_model_config = vllm_config.speculative_config.draft_model_config
+    draft_load_config = vllm_config.load_config
+
+    return (
+        VllmConfig.get_quantization_config(draft_model_config, draft_load_config)
+        if draft_model_config
+        else None
+    )
+
+
 def extract_layer_index(layer_name: str, num_attn_module: int = 1) -> int:
     """
     Extract the layer index from the module name.
@@ -711,15 +759,20 @@ def extract_layer_index(layer_name: str, num_attn_module: int = 1) -> int:
         except ValueError:
             continue
     if num_attn_module == 1 or "attn" not in layer_name:
-        assert len(int_vals) == 1, (f"layer name {layer_name} should"
-                                    " only contain one integer")
+        assert len(int_vals) == 1, (
+            f"layer name {layer_name} should only contain one integer"
+        )
 
         return int_vals[0]
     else:
-        assert len(int_vals) <= 2, (f"layer name {layer_name} should"
-                                    " contain most two integers")
-        layer_index = int_vals[0] * num_attn_module + int_vals[1] if len(
-            int_vals) == 2 else int_vals[0]
+        assert len(int_vals) <= 2, (
+            f"layer name {layer_name} should contain most two integers"
+        )
+        layer_index = (
+            int_vals[0] * num_attn_module + int_vals[1]
+            if len(int_vals) == 2
+            else int_vals[0]
+        )
         return layer_index
 
 
@@ -733,19 +786,20 @@ def cast_overflow_tensors(
     return tensors
 
 
-def fast_topk(values: torch.Tensor, topk: int,
-              dim: int) -> tuple[torch.Tensor, torch.Tensor]:
+def fast_topk(
+    values: torch.Tensor, topk: int, dim: int
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Optimized topk implementation that uses torch.max for k=1 case.
-    
+
     This function provides better performance for the common case of k=1
     by using torch.max instead of the more general torch.topk.
-    
+
     Args:
         values: Input tensor to find top-k values from
         topk: Number of top values to return (k). Must be > 0.
         dim: Dimension along which to compute topk
-        
+
     Returns:
         Tuple of (values, indices) where values are the top-k values
         and indices are their corresponding indices in the input tensor
@@ -757,14 +811,6 @@ def fast_topk(values: torch.Tensor, topk: int,
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
 
-
-def get_model_hidden_size(hf_config: PretrainedConfig) -> int:
-    if hasattr(hf_config, "hidden_size"):
-        return hf_config.hidden_size
-    text_config = hf_config.get_text_config()
-    return text_config.hidden_size
-
-
 # Chunk x along the num_tokens axis for sequence parallelism
 # NOTE: This is wrapped in a torch custom op to work around the following issue:
 # The output tensor can have a sequence length 0 at small input sequence lengths
@@ -804,5 +850,27 @@ direct_register_custom_op(
     op_name="sequence_parallel_chunk_impl",
     op_func=sequence_parallel_chunk_impl,
     fake_impl=sequence_parallel_chunk_impl_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(torch.Tag.needs_fixed_stride_order,),
 )
+
+
+def process_eagle_weight(
+    model: nn.Module,
+    name: str,
+) -> None:
+    """
+    Update EAGLE model flags based on loaded weight name.
+    This should be called during weight loading to detect if a model
+    has its own lm_head or embed_tokens weight.
+    Args:
+        model: The model instance (must support EAGLE)
+        name: The name of the weight to process
+    """
+    if not supports_any_eagle(model):
+        return
+
+    # To prevent overriding with target model's layers
+    if "lm_head" in name:
+        model.has_own_lm_head = True
+    if "embed_tokens" in name:
+        model.has_own_embed_tokens = True
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 08ad8fbeb4246e18a1f9f5ae0ad9a408187f25fb..e5d70eb7bc2fc9183e7e74f4fbf3f4adc471c03d 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -4,25 +4,33 @@
 import itertools
 import math
 from abc import ABC, abstractmethod
-from typing import Final, Generic, Literal, Optional, Protocol, TypeVar, Union
+from collections.abc import Callable
+from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 from transformers import PretrainedConfig
 
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_gather)
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config import VllmConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
 from vllm.logger import init_logger
-from vllm.platforms import _Backend, current_platform
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
 _C = TypeVar("_C", bound=PretrainedConfig)
 
 
-class VisionEncoderInfo(ABC, Generic[_C]):
+class _RootConfig(Protocol[_C]):
+    vision_config: _C
+
 
-    def __init__(self, hf_config: _C) -> None:
+class VisionEncoderInfo(ABC, Generic[_C]):
+    def __init__(self, hf_config: _RootConfig[_C]) -> None:
         super().__init__()
 
         self.hf_config = hf_config
@@ -54,8 +62,7 @@ class VisionLanguageConfig(Protocol):
     vision_config: Final[PretrainedConfig]
 
 
-def get_vision_encoder_info(
-        hf_config: VisionLanguageConfig) -> VisionEncoderInfo:
+def get_vision_encoder_info(hf_config: VisionLanguageConfig) -> VisionEncoderInfo:
     # Avoid circular imports
     from .clip import CLIPEncoderInfo, CLIPVisionConfig
     from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig
@@ -72,25 +79,88 @@ def get_vision_encoder_info(
     raise NotImplementedError(msg)
 
 
-def get_vit_attn_backend(head_size: int, dtype: torch.dtype) -> _Backend:
+def get_vit_attn_backend(
+    head_size: int,
+    dtype: torch.dtype,
+    *,
+    attn_backend_override: AttentionBackendEnum | None = None,
+) -> AttentionBackendEnum:
     """
     Get the available attention backend for Vision Transformer.
     """
+    if attn_backend_override is not None:
+        return attn_backend_override
+
     # Lazy import to avoid circular dependency
     from vllm.attention.selector import get_env_variable_attn_backend
 
-    selected_backend: Optional[_Backend] = get_env_variable_attn_backend()
+    selected_backend: AttentionBackendEnum | None = get_env_variable_attn_backend()
     if selected_backend is not None:
         return selected_backend
 
     return current_platform.get_vit_attn_backend(head_size, dtype)
 
 
+def should_torch_compile_mm_vit(vllm_config: VllmConfig) -> bool:
+    """Callable to be passed to `@support_torch_compile`'s `enable_if` argument."""
+    return vllm_config.compilation_config.compile_mm_encoder
+
+
+VisionFeatureSelectStrategyStr = Literal["class", "default", "full"]
+
+VisionFeatureSelectStrategy: TypeAlias = (
+    VisionFeatureSelectStrategyStr | Callable[[torch.Tensor], torch.Tensor]
+)
+
+
+def _get_vision_feature_selector(
+    strategy: VisionFeatureSelectStrategy | str,
+) -> Callable[[torch.Tensor], torch.Tensor]:
+    if callable(strategy):
+        return strategy
+
+    # https://github.com/huggingface/transformers/blob/cd74917ffc3e8f84e4a886052c5ab32b7ac623cc/src/transformers/models/clip/modeling_clip.py#L762
+    if strategy == "class":
+        return lambda feats: feats[:, :1, :]
+
+    # https://github.com/huggingface/transformers/blob/4a02bc7004285bdb12cc033e87ad2578ce2fa900/src/transformers/models/llava/modeling_llava.py#L196
+    if strategy == "default":
+        return lambda feats: feats[:, 1:, :]
+
+    if strategy == "full":
+        return lambda feats: feats
+
+    raise ValueError(f"Unexpected feature select strategy: {strategy!r}")
+
+
+def get_num_selected_vision_tokens(
+    num_vision_tokens: int,
+    strategy: VisionFeatureSelectStrategy | str,
+) -> int:
+    if callable(strategy):
+        dummy_features = torch.empty(1, num_vision_tokens, 64)  # [B, L, D]
+        dummy_selected_features = strategy(dummy_features)
+        return dummy_selected_features.shape[1]
+
+    if strategy == "class":
+        return 1
+
+    if strategy == "default":
+        return num_vision_tokens - 1
+
+    if strategy == "full":
+        return num_vision_tokens
+
+    raise ValueError(f"Unexpected feature select strategy: {strategy!r}")
+
+
 def resolve_visual_encoder_outputs(
-    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
-    feature_sample_layers: Optional[list[int]],
-    post_layer_norm: Optional[torch.nn.LayerNorm],
-    max_possible_layers: int,
+    encoder_outputs: torch.Tensor | list[torch.Tensor],
+    post_layer_norm: torch.nn.LayerNorm | None,
+    *,
+    select_layers: list[int] | None = None,
+    max_possible_layers: int | None = None,
+    feature_select_strategy: VisionFeatureSelectStrategy | None = None,
 ) -> torch.Tensor:
     """Given the outputs a visual encoder module that may correspond to the
     output of the last layer, or a list of hidden states to be stacked,
@@ -98,17 +168,34 @@ def resolve_visual_encoder_outputs(
 
     Args:
         encoder_outputs: Output of encoder's last layer or all hidden states.
-        feature_sample_layers: Optional layer indices to grab from the encoder
-            outputs; if provided, encoder outputs must be a list.
         post_layer_norm: Post norm to apply to the output of the encoder.
+        select_layers: Optional layer indices to grab from the encoder
+            outputs; if provided, encoder outputs must be a list.
         max_possible_layers: Total layers in the fully loaded visual encoder.
-
+        feature_select_strategy: Defines how to select the hidden states
+            from each layer.
     """
-    if feature_sample_layers is None:
+    if select_layers is None:
+        if not isinstance(encoder_outputs, torch.Tensor):
+            raise ValueError(
+                "Expected only a single encoder output when "
+                "`select_layers` is not provided"
+            )
+
+        if feature_select_strategy is not None:
+            select_features = _get_vision_feature_selector(feature_select_strategy)
+            encoder_outputs = select_features(encoder_outputs)
+
         if post_layer_norm is not None:
             return post_layer_norm(encoder_outputs)
+
         return encoder_outputs
 
+    if max_possible_layers is None:
+        raise ValueError(
+            "`max_possible_layers` must be provided alongside `select_layers`"
+        )
+
     # Get the hidden states corresponding to the layer indices.
     # Negative values are relative to the full visual encoder,
     # so offset them depending on how many layers were loaded.
@@ -119,20 +206,27 @@ def resolve_visual_encoder_outputs(
     offset = max_possible_layers - num_loaded_layers
     hs_pool = [
         encoder_outputs[layer_idx]
-        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
-        for layer_idx in feature_sample_layers
+        if layer_idx >= 0
+        else encoder_outputs[layer_idx + offset]
+        for layer_idx in select_layers
     ]
 
+    if feature_select_strategy is not None:
+        select_features = _get_vision_feature_selector(feature_select_strategy)
+        hs_pool = [select_features(hs) for hs in hs_pool]
+
     # Apply post-norm on the final hidden state if we are using it
-    uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
+    uses_last_layer = select_layers[-1] in (max_possible_layers - 1, -1)
     if post_layer_norm is not None and uses_last_layer:
-        hs_pool[-1] = post_layer_norm(encoder_outputs)
+        hs_pool[-1] = post_layer_norm(hs_pool[-1])
+
     return torch.cat(hs_pool, dim=-1)
 
 
-def run_dp_sharded_vision_model(image_input: torch.Tensor,
-                                vision_model: torch.nn.Module) -> torch.Tensor:
-    """Run a vision model with data parallelism (DP) sharding. The function 
+def run_dp_sharded_vision_model(
+    image_input: torch.Tensor, vision_model: torch.nn.Module
+) -> torch.Tensor:
+    """Run a vision model with data parallelism (DP) sharding. The function
     will shard the input image tensor on the first dimension and run the vision
     model
 
@@ -147,18 +241,17 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor,
     mp_world_size = get_tensor_model_parallel_world_size()
     num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size
     num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks
-    pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks)
+    pad = (0,) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks)
     image_input_padded = torch.nn.functional.pad(image_input, pad)
     rank = get_tensor_model_parallel_rank()
-    image_input_per_rank = image_input_padded[rank *
-                                              num_chunks_per_rank:(rank + 1) *
-                                              num_chunks_per_rank, ...]
+    image_input_per_rank = image_input_padded[
+        rank * num_chunks_per_rank : (rank + 1) * num_chunks_per_rank, ...
+    ]
 
     vision_embeddings = vision_model(image_input_per_rank)
     # Ensure tensor is contiguous before all_gather
     vision_embeddings = vision_embeddings.contiguous()
-    vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings,
-                                                         dim=0)
+    vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings, dim=0)
     vision_embeddings = vision_embeddings[:num_chunks, ...]
     return vision_embeddings
 
@@ -168,27 +261,27 @@ def get_load_balance_assignment(
     num_gpus: int = 2,
 ) -> tuple[list[int], list[int], list[int]]:
     """
-    Generate load balancing assignment and metadata 
+    Generate load balancing assignment and metadata
     for distributing data across GPUs.
     The load is determined by the total image sizes,
     not the number of images.
-    
+
     Args:
         sizes: The size of each image
         num_gpus: Number of GPUs to balance across
-    
+
     Returns:
-        shuffle_indices: 
+        shuffle_indices:
             Indices to reorder data for balanced loading
-        gpu_sample_counts: 
+        gpu_sample_counts:
             Number of samples assigned to each GPU
-        grouped_sizes_per_gpu: 
+        grouped_sizes_per_gpu:
             Total size assigned to each GPU
-    
+
     Example:
         ```
         sizes = [1000, 100, 200, 50]
-        num_gpus=2
+        num_gpus = 2
         ```
 
     """
@@ -206,9 +299,9 @@ def get_load_balance_assignment(
     # Sort indices by size (largest first for better load balancing)
     # sizes = [1000, 100, 200, 50]
     # large_to_small_indices = [0, 2, 1, 3]
-    large_to_small_indices = sorted(range(n_samples),
-                                    key=lambda i: sizes[i],
-                                    reverse=True)
+    large_to_small_indices = sorted(
+        range(n_samples), key=lambda i: sizes[i], reverse=True
+    )
 
     for idx in large_to_small_indices:
         # Find GPU with minimum current load (by total size)
@@ -239,11 +332,11 @@ def run_dp_sharded_mrope_vision_model(
     *,
     rope_type: Literal["rope_3d", "rope_2d"],
 ) -> tuple[torch.Tensor, ...]:
-    """Run a vision model with data parallelism (DP) sharding. 
-    The function will shard the input image tensor on the 
+    """Run a vision model with data parallelism (DP) sharding.
+    The function will shard the input image tensor on the
     first dimension and run the vision model.
     This function is used to run the vision model with mrope.
-    
+
     Args:
         vision_model (torch.nn.Module): Vision model.
         pixel_values (torch.Tensor): Image/Video input tensor.
@@ -261,7 +354,7 @@ def run_dp_sharded_mrope_vision_model(
         vision_model.spatial_merge_size = 2
         pixel_values.shape = (1350, channel)
         grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]]
-        tp_size=2
+        tp_size = 2
         ```
 
     """
@@ -280,51 +373,57 @@ def run_dp_sharded_mrope_vision_model(
     # image_to_tp_rank = [0, 2, 1, 3]
     # gpu_sample_counts = [1, 3]
     # grouped_pixel_values_len = [1000, 350]
-    (image_to_tp_rank, gpu_sample_counts,
-     grouped_pixel_values_len) = get_load_balance_assignment(
-         patches_per_image, tp_size)
+    (image_to_tp_rank, gpu_sample_counts, grouped_pixel_values_len) = (
+        get_load_balance_assignment(patches_per_image, tp_size)
+    )
 
     # cu_gpu_sample_counts = [0, 1, 4]
     cum_gpu_sample_counts = [0, *itertools.accumulate(gpu_sample_counts)]
 
     # GPU_0 image_idxs_local = [0]
     # GPU_1 image_idxs_local = [2, 1, 3]
-    image_idxs_local = image_to_tp_rank[cum_gpu_sample_counts[tp_rank_local]:
-                                        cum_gpu_sample_counts[tp_rank_local +
-                                                              1]]
+    image_idxs_local = image_to_tp_rank[
+        cum_gpu_sample_counts[tp_rank_local] : cum_gpu_sample_counts[tp_rank_local + 1]
+    ]
 
     # Get the pixel values for the local images based on the image_idxs_local
     if len(image_idxs_local) > 0:
-        pixel_values_local = torch.cat([
-            pixel_values[cum_patches_per_image[i]:cum_patches_per_image[i + 1]]
-            for i in image_idxs_local
-        ])
+        pixel_values_local = torch.cat(
+            [
+                pixel_values[cum_patches_per_image[i] : cum_patches_per_image[i + 1]]
+                for i in image_idxs_local
+            ]
+        )
     else:
         # Handle case where this rank has no images
-        pixel_values_local = torch.empty((0, pixel_values.shape[1]),
-                                         device=pixel_values.device,
-                                         dtype=pixel_values.dtype)
+        pixel_values_local = torch.empty(
+            (0, pixel_values.shape[1]),
+            device=pixel_values.device,
+            dtype=pixel_values.dtype,
+        )
     # embed_dim_reduction_factor = 2 * 2
     if rope_type == "rope_2d":
-        embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] *
-                                      vision_model.merge_kernel_size[1])
+        embed_dim_reduction_factor = (
+            vision_model.merge_kernel_size[0] * vision_model.merge_kernel_size[1]
+        )
     else:
-        embed_dim_reduction_factor = (vision_model.spatial_merge_size *
-                                      vision_model.spatial_merge_size)
+        embed_dim_reduction_factor = (
+            vision_model.spatial_merge_size * vision_model.spatial_merge_size
+        )
 
     # Find the max length across all ranks
     # The output embedding of every DP rank has to be
     # padded to this length for tensor_model_parallel_all_gather
     # to work
-    max_len_per_rank = max(
-        grouped_pixel_values_len) // embed_dim_reduction_factor
+    max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor
     local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
 
     # Run the vision model on the local pixel_values_local
     if rope_type == "rope_2d":
         if pixel_values_local.shape[0] > 0:
             image_embeds_local = vision_model(
-                pixel_values_local, torch.tensor(local_grid_thw_list))
+                pixel_values_local, torch.tensor(local_grid_thw_list)
+            )
             if isinstance(image_embeds_local, list):
                 image_embeds_local = torch.cat(image_embeds_local, dim=0)
         else:
@@ -332,16 +431,18 @@ def run_dp_sharded_mrope_vision_model(
             image_embeds_local = torch.empty(
                 (0, embed_dim_reduction_factor, out_dim),
                 device=pixel_values.device,
-                dtype=pixel_values.dtype)
+                dtype=pixel_values.dtype,
+            )
     else:
         if pixel_values_local.shape[0] > 0:
-            image_embeds_local = vision_model(pixel_values_local,
-                                              local_grid_thw_list)
+            image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list)
         else:
             # Handle empty case
-            image_embeds_local = torch.empty((0, vision_model.out_hidden_size),
-                                             device=pixel_values.device,
-                                             dtype=pixel_values.dtype)
+            image_embeds_local = torch.empty(
+                (0, vision_model.out_hidden_size),
+                device=pixel_values.device,
+                dtype=pixel_values.dtype,
+            )
 
     # Pad the output based on max_len_per_rank
     # for tensor_model_parallel_all_gather to work
@@ -349,33 +450,40 @@ def run_dp_sharded_mrope_vision_model(
     if current_len < max_len_per_rank:
         padding_size = max_len_per_rank - current_len
         if rope_type == "rope_2d":
-            padding = torch.empty((padding_size, image_embeds_local.shape[1],
-                                   image_embeds_local.shape[2]),
-                                  dtype=image_embeds_local.dtype,
-                                  device=image_embeds_local.device)
+            padding = torch.empty(
+                (
+                    padding_size,
+                    image_embeds_local.shape[1],
+                    image_embeds_local.shape[2],
+                ),
+                dtype=image_embeds_local.dtype,
+                device=image_embeds_local.device,
+            )
         else:
-            padding = torch.empty((padding_size, image_embeds_local.shape[1]),
-                                  dtype=image_embeds_local.dtype,
-                                  device=image_embeds_local.device)
-        image_embeds_local_padded = torch.cat([image_embeds_local, padding],
-                                              dim=0)
+            padding = torch.empty(
+                (padding_size, image_embeds_local.shape[1]),
+                dtype=image_embeds_local.dtype,
+                device=image_embeds_local.device,
+            )
+        image_embeds_local_padded = torch.cat([image_embeds_local, padding], dim=0)
     else:
         image_embeds_local_padded = image_embeds_local
 
     # Do all_gather to collect embeddings from all ranks
-    gathered_embeds = tensor_model_parallel_all_gather(
-        image_embeds_local_padded, dim=0)
+    gathered_embeds = tensor_model_parallel_all_gather(image_embeds_local_padded, dim=0)
 
     # Remove padding and reconstruct per-rank embeddings
     rank_embeddings = list[torch.Tensor]()
     for rank in range(tp_size):
         start_idx = rank * max_len_per_rank
-        end_idx = start_idx + (grouped_pixel_values_len[rank] //
-                               embed_dim_reduction_factor)
+        end_idx = start_idx + (
+            grouped_pixel_values_len[rank] // embed_dim_reduction_factor
+        )
         rank_embeddings.append(gathered_embeds[start_idx:end_idx])
 
-    patches_per_output_image = [(patch_size // embed_dim_reduction_factor)
-                                for patch_size in patches_per_image]
+    patches_per_output_image = [
+        (patch_size // embed_dim_reduction_factor) for patch_size in patches_per_image
+    ]
 
     # Reconstruct embeddings in the original order
     original_order_embeddings = [None] * len(grid_thw_list)
@@ -386,7 +494,7 @@ def run_dp_sharded_mrope_vision_model(
             # Get images assigned to this rank in shuffled order
             # GPU_0 = image_idxs_local  [0]
             # GPU_1 = image_idxs_local  [2, 1, 3]
-            rank_images = image_to_tp_rank[current_idx:current_idx + count]
+            rank_images = image_to_tp_rank[current_idx : current_idx + count]
 
             rank_embed = rank_embeddings[rank]
             # Split rank embeddings back to individual images
@@ -394,11 +502,51 @@ def run_dp_sharded_mrope_vision_model(
             for img_idx in rank_images:
                 img_patches = patches_per_output_image[img_idx]
                 original_order_embeddings[img_idx] = rank_embed[
-                    embed_start:embed_start + img_patches]
+                    embed_start : embed_start + img_patches
+                ]
                 embed_start += img_patches
             current_idx += count
-    out_embeddings = tuple(embed for embed in original_order_embeddings
-                           if embed is not None)
-    assert len(out_embeddings) == len(
-        original_order_embeddings), "Found unassigned embeddings"
+    out_embeddings = tuple(
+        embed for embed in original_order_embeddings if embed is not None
+    )
+    assert len(out_embeddings) == len(original_order_embeddings), (
+        "Found unassigned embeddings"
+    )
     return out_embeddings
+
+
+def get_llm_pos_ids_for_vision(
+    start_idx: int,
+    vision_idx: int,
+    spatial_merge_size: int,
+    t_index: list[int],
+    grid_hs: torch.Tensor,
+    grid_ws: torch.Tensor,
+) -> torch.Tensor:
+    llm_pos_ids_list = []
+    llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
+    llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
+    h_index = (
+        torch.arange(llm_grid_h)
+        .view(1, -1, 1)
+        .expand(len(t_index), -1, llm_grid_w)
+        .flatten()
+    )
+    w_index = (
+        torch.arange(llm_grid_w)
+        .view(1, 1, -1)
+        .expand(len(t_index), llm_grid_h, -1)
+        .flatten()
+    )
+    t_index_tensor = (
+        torch.Tensor(t_index)
+        .to(llm_grid_h.device)
+        .view(-1, 1)
+        .expand(-1, llm_grid_h * llm_grid_w)
+        .long()
+        .flatten()
+    )
+    _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index])
+    llm_pos_ids_list.append(_llm_pos_ids + start_idx)
+    llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1)
+    return llm_pos_ids
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index b33e8d09c4be13291f4abbb71a15538dfb302d77..18ad8851fccdaa2a594b33a01dc216b06c660e76 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -5,15 +5,15 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
 from math import ceil
-from typing import Literal, Optional, Union, cast
+from typing import Literal, cast
 
 import numpy as np
 import regex as re
 import torch
 import torch.nn as nn
 from mistral_common.audio import mel_filter_bank
-from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
-                                                       TextChunk, UserMessage)
+from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
+from mistral_common.protocol.instruct.messages import UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.protocol.transcription.request import TranscriptionRequest
 from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
@@ -21,34 +21,43 @@ from transformers import BatchFeature, TensorType, WhisperConfig
 from transformers.tokenization_utils_base import TextInput
 
 from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models import SupportsPP
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-# yapf: disable
 from vllm.model_executor.models.whisper import WhisperEncoder
-# yapf: enable
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, MultiModalUUIDDict,
-                                    NestedTensors)
-from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalProcessingInfo,
-                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    MultiModalUUIDDict,
+    NestedTensors,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import (MistralTokenizer,
-                                               cached_tokenizer_from_config)
+from vllm.transformers_utils.tokenizer import (
+    MistralTokenizer,
+    cached_tokenizer_from_config,
+)
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsTranscription)
-from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
+from .utils import init_vllm_registered_model, maybe_prefix
 
 logger = init_logger(__name__)
 
@@ -110,14 +119,15 @@ class VoxtralProcessorAdapter:
         audio_length: int,
     ) -> int:
         pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames(
-            audio_length, self.sampling_rate)
+            audio_length, self.sampling_rate
+        )
         return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate))
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        audios: Optional[Union[np.ndarray, list[np.ndarray]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        audios: np.ndarray | list[np.ndarray] | None = None,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> Mapping[str, NestedTensors]:
         if text is None:
@@ -140,7 +150,8 @@ class VoxtralProcessorAdapter:
                 "Make sure to process your input via `mistral_common`'s "
                 "tokenizer or pass a chat completion request. "
                 "For more info, see: "
-                "https://github.com/vllm-project/vllm/issues/8411.")
+                "https://github.com/vllm-project/vllm/issues/8411."
+            )
 
         audios_tokens = list[torch.Tensor]()
         audios_processed = list[torch.Tensor]()
@@ -151,23 +162,22 @@ class VoxtralProcessorAdapter:
             # pad if necessary
             audio = self._audio_processor.pad(audio, self.sampling_rate)
 
-            audio_tokens = [
-                self.begin_audio_token_id
-            ] + [self.audio_token_id] * self.get_num_audio_tokens(len(audio))
+            audio_tokens = [self.begin_audio_token_id] + [
+                self.audio_token_id
+            ] * self.get_num_audio_tokens(len(audio))
 
             audios_tokens.append(torch.tensor(audio_tokens))
             audios_processed.append(torch.tensor(audio))
 
-        return BatchFeature({
-            "input_ids":
-            torch.cat(audios_tokens)[None].expand(len(text), -1),
-            "audio_arrays":
-            audios_processed,
-        })
+        return BatchFeature(
+            {
+                "input_ids": torch.cat(audios_tokens)[None].expand(len(text), -1),
+                "audio_arrays": audios_processed,
+            }
+        )
 
 
 class VoxtralProcessingInfo(BaseProcessingInfo):
-
     def get_tokenizer(self) -> MistralTokenizer:
         tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
         if not isinstance(tokenizer, MistralTokenizer):
@@ -178,7 +188,7 @@ class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self) -> VoxtralProcessorAdapter:
         return VoxtralProcessorAdapter(self.get_tokenizer())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 5}  # Performance tends to degrade after 5
 
     def get_mm_max_tokens_per_item(
@@ -194,11 +204,11 @@ class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_max_audio_array_len(self) -> int:
         processor = self.get_hf_processor()
         return self.get_max_audio_tokens() * int(
-            processor.sampling_rate // processor.frame_rate)
+            processor.sampling_rate // processor.frame_rate
+        )
 
 
 class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
 
@@ -206,25 +216,30 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
         target_length = self.info.get_max_audio_array_len()
 
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio":
-            self._get_dummy_audios(length=target_length, num_audios=num_audios)
+            "audio": self._get_dummy_audios(
+                length=target_length, num_audios=num_audios, overrides=audio_overrides
+            )
         }
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> ProcessorInputs:
         tokenizer = self.info.get_tokenizer()
 
         dummy_text = self.get_dummy_text(mm_counts)
-        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
+        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
         dummy_audios = dummy_mm_data.get("audio", [])
 
         audio_chunks: list[AudioChunk] = []
@@ -238,9 +253,11 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
             chunk = AudioChunk(input_audio=RawAudio.from_audio(audio_item))
             audio_chunks.append(chunk)
 
-        request = ChatCompletionRequest(messages=[
-            UserMessage(content=[TextChunk(text=dummy_text), *audio_chunks]),
-        ])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(content=[TextChunk(text=dummy_text), *audio_chunks]),
+            ]
+        )
         res = tokenizer.mistral.encode_chat_completion(request)
         dummy_tokens = res.tokens
         # whixtral tokenizer adds padding to the audio
@@ -250,9 +267,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
         return ProcessorInputs(prompt=dummy_tokens, mm_data=dummy_mm_data)
 
 
-class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
-                                 ):
-
+class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]):
     def _get_mm_fields_config(
         self,
         hf_inputs: Mapping[str, NestedTensors],
@@ -288,11 +303,11 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
@@ -310,17 +325,21 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
         return MultiModalDataParser(target_sr=sampling_rate)
 
 
-@MULTIMODAL_REGISTRY.register_processor(VoxtralMultiModalProcessor,
-                                        info=VoxtralProcessingInfo,
-                                        dummy_inputs=VoxtralDummyInputsBuilder)
-class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsPP, SupportsLoRA,
-                                      SupportsTranscription):
+@MULTIMODAL_REGISTRY.register_processor(
+    VoxtralMultiModalProcessor,
+    info=VoxtralProcessingInfo,
+    dummy_inputs=VoxtralDummyInputsBuilder,
+)
+class VoxtralForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
+):
+    merge_by_field_config = True
+
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -331,7 +350,8 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         # match the vLLM model names
         if hasattr(vllm_config, "quant_config"):
             vllm_config.quant_config = self.maybe_update_quant_config(
-                vllm_config.quant_config)
+                vllm_config.quant_config
+            )
 
         config = vllm_config.model_config.hf_config
         self.config = config
@@ -366,32 +386,22 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            audio_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      audio_embeddings)
-            input_ids = None
-
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
+        )
 
         return hidden_states
 
-    def get_multimodal_embeddings(
+    def embed_multimodal(
         self, **kwargs
-    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...],
-               None]:
+    ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None:
         audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
         if audio_inputs is None:
             return None
@@ -402,50 +412,37 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
             seq_len, dim = audio_embedding.shape
             # Pad such that seq_len is divisible by downsample_factor
             target_seq_len = self.downsample_factor * math.ceil(
-                seq_len / self.downsample_factor)
+                seq_len / self.downsample_factor
+            )
             audio_embedding = torch.nn.functional.pad(
                 audio_embedding,
                 (0, 0, 0, target_seq_len - seq_len),
             )
             audio_embeddings[i] = audio_embedding.reshape(
-                target_seq_len // self.downsample_factor,
-                dim * self.downsample_factor)
+                target_seq_len // self.downsample_factor, dim * self.downsample_factor
+            )
 
         # Concat, project and resplit
         audio_embeddings_packed = torch.cat(audio_embeddings, dim=0)
-        audio_embeddings_packed = self.audio_language_adapter(
-            audio_embeddings_packed)
-        audio_embeddings = torch.split(audio_embeddings_packed,
-                                       [a.shape[0] for a in audio_embeddings],
-                                       dim=0)
+        audio_embeddings_packed = self.audio_language_adapter(audio_embeddings_packed)
+        audio_embeddings = torch.split(
+            audio_embeddings_packed, [a.shape[0] for a in audio_embeddings], dim=0
+        )
 
         return audio_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        audio_encoder = self.tokenizer.instruct.audio_encoder
-        audio_tok_id = audio_encoder.audio_token
-
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings, audio_tok_id)
-        return inputs_embeds
-
     def _parse_and_validate_audio_arrays(
-            self, **kwargs: object) -> Union[list[torch.Tensor], None]:
+        self, **kwargs: object
+    ) -> list[torch.Tensor] | None:
         audio_arrays = kwargs.pop("audio_arrays", None)
         if audio_arrays is None:
             return None
 
         if not isinstance(audio_arrays, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of audio_arrays. "
-                             f"Got type: {type(audio_arrays)}")
+            raise ValueError(
+                f"Incorrect type of audio_arrays. Got type: {type(audio_arrays)}"
+            )
 
-        audio_arrays = flatten_bn(audio_arrays)
         if isinstance(audio_arrays, torch.Tensor):
             audio_arrays = list(audio_arrays.unbind(0))
         return audio_arrays
@@ -453,12 +450,13 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     @classmethod
-    def get_speech_to_text_config(cls, model_config: ModelConfig,
-                                  task_type: str) -> SpeechToTextConfig:
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
         tokenizer = cached_tokenizer_from_config(model_config)
         audio_config = tokenizer.instruct.audio_encoder.audio_config
         max_audio_clip_s = audio_config.chunk_length_s
@@ -472,19 +470,23 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
 
     @classmethod
     # for speech-to-text transcription
-    def get_generation_prompt(cls, audio: np.ndarray,
-                              model_config: ModelConfig,
-                              stt_config: SpeechToTextConfig,
-                              language: Optional[str],
-                              task_type: Literal["transcribe", "translate"],
-                              request_prompt: str,
-                              to_language: Optional[str]) -> PromptType:
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        model_config: ModelConfig,
+        stt_config: SpeechToTextConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
         tokenizer = cached_tokenizer_from_config(model_config)
-        audio = Audio(audio, int(stt_config.sample_rate),
-                      format="wav")  # lossless
-        req = TranscriptionRequest(model=model_config.model,
-                                   audio=RawAudio.from_audio(audio),
-                                   language=language)
+        audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
+        req = TranscriptionRequest(
+            model=model_config.model,
+            audio=RawAudio.from_audio(audio),
+            language=language,
+        )
 
         tokenized = tokenizer.instruct.encode_transcription(req)
         audio = (tokenized.audios[0].audio_array, stt_config.sample_rate)
@@ -493,35 +495,44 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         return cast(PromptType, prompts_dict)
 
     @classmethod
-    def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig,
-                             model_config: ModelConfig) -> Optional[int]:
+    def get_num_audio_tokens(
+        cls,
+        audio_duration_s: float,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+    ) -> int | None:
         """
-        Map from audio duration to number of audio tokens produced by the ASR 
+        Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
         This is used for estimating the amount of processing for this audio.
         """
         tokenizer = cached_tokenizer_from_config(model_config)
         adapter = VoxtralProcessorAdapter(tokenizer)
         return adapter.get_num_audio_tokens(
-            int(audio_duration_s * stt_config.sample_rate))
+            int(audio_duration_s * stt_config.sample_rate)
+        )
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        # fmt: off
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         remapping_rules = [
             (r"mm_whisper_embeddings\.(.*)", r"\1"),
             (r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
-            (r"audio_language_adapter\.0\.weight", r"audio_language_adapter.w_in.weight"),  # noqa: E501
-            (r"audio_language_adapter\.2\.weight", r"audio_language_adapter.w_out.weight"),  # noqa: E501
+            (
+                r"audio_language_adapter\.0\.weight",
+                r"audio_language_adapter.w_in.weight",
+            ),
+            (
+                r"audio_language_adapter\.2\.weight",
+                r"audio_language_adapter.w_out.weight",
+            ),
         ]
-        # fmt: on
 
         audio_params = dict(
-            nn.ModuleDict({
-                "audio_language_adapter":
-                self.audio_language_adapter,
-            }).named_parameters())
+            nn.ModuleDict(
+                {
+                    "audio_language_adapter": self.audio_language_adapter,
+                }
+            ).named_parameters()
+        )
 
         loaded_weights = set()
 
@@ -529,10 +540,12 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
             nonlocal loaded_weights
             for name, w in weights:
                 is_encoder = (
-                    name.startswith("mm_whisper_embeddings") and
-                    not name.startswith("mm_whisper_embeddings.tok_embeddings")
+                    name.startswith("mm_whisper_embeddings")
+                    and not name.startswith("mm_whisper_embeddings.tok_embeddings")
                     and not name.startswith(
-                        "mm_whisper_embeddings.audio_language_projection"))
+                        "mm_whisper_embeddings.audio_language_projection"
+                    )
+                )
 
                 for pattern, repl in remapping_rules:
                     if re.fullmatch(pattern, name):
@@ -563,7 +576,8 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         return loaded_weights
 
     def maybe_update_quant_config(
-            self, quant_config: QuantizationConfig) -> QuantizationConfig:
+        self, quant_config: QuantizationConfig
+    ) -> QuantizationConfig:
         """
         Update quant config to so that ignored module and target module names
         match the vLLM model names.
@@ -572,32 +586,54 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         """
         remapping_rules = [
             (r"output", r"language_model.lm_head"),
-            (r"layers\.(\d+)\.attention\.wo",
-             r"language_model.model.layers.\1.self_attn.out_proj"),
-            (r"layers\.(\d+)\.attention\.w(.*)",
-             r"language_model.model.layers.\1.self_attn.\2_proj"),
-            (r"layers\.(\d+)\.feed_forward\.w1",
-             r"language_model.model.layers.\1.mlp.gate_proj"),
-            (r"layers\.(\d+)\.feed_forward\.w2",
-             r"language_model.model.layers.\1.mlp.down_proj"),
-            (r"layers\.(\d+)\.feed_forward\.w3",
-             r"language_model.model.layers.\1.mlp.up_proj"),
-            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)",
-             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj"
-             ),
-            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo",
-             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj"
-             ),
-            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)",
-             r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2"),
-            (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0",
-             r"whisper_encoder.whisper_encoder.conv1"),
-            (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.1",
-             r"whisper_encoder.whisper_encoder.conv2"),
-            (r"mm_whisper_embeddings\.audio_language_projection\.0",
-             r"audio_language_adapter.w_in"),
-            (r"mm_whisper_embeddings\.audio_language_projection\.2",
-             r"audio_language_adapter.w_out"),
+            (
+                r"layers\.(\d+)\.attention\.wo",
+                r"language_model.model.layers.\1.self_attn.out_proj",
+            ),
+            (
+                r"layers\.(\d+)\.attention\.w(.*)",
+                r"language_model.model.layers.\1.self_attn.\2_proj",
+            ),
+            (
+                r"layers\.(\d+)\.feed_forward\.w1",
+                r"language_model.model.layers.\1.mlp.gate_proj",
+            ),
+            (
+                r"layers\.(\d+)\.feed_forward\.w2",
+                r"language_model.model.layers.\1.mlp.down_proj",
+            ),
+            (
+                r"layers\.(\d+)\.feed_forward\.w3",
+                r"language_model.model.layers.\1.mlp.up_proj",
+            ),
+            (
+                r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)",
+                r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj",
+            ),
+            (
+                r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo",
+                r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj",
+            ),
+            (
+                r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)",
+                r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2",
+            ),
+            (
+                r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0",
+                r"whisper_encoder.whisper_encoder.conv1",
+            ),
+            (
+                r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.1",
+                r"whisper_encoder.whisper_encoder.conv2",
+            ),
+            (
+                r"mm_whisper_embeddings\.audio_language_projection\.0",
+                r"audio_language_adapter.w_in",
+            ),
+            (
+                r"mm_whisper_embeddings\.audio_language_projection\.2",
+                r"audio_language_adapter.w_out",
+            ),
         ]
 
         # Update ignore list
@@ -630,7 +666,6 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
 
 
 class AudioLanguageAdapter(nn.Module):
-
     def __init__(self, hidden_size: int, dim: int) -> None:
         super().__init__()
         self.w_in = nn.Linear(hidden_size, dim, bias=False)
@@ -644,19 +679,44 @@ class AudioLanguageAdapter(nn.Module):
 class VoxtralEncoderModel(nn.Module):
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
 
-    # fmt: off
     mistral_remapping = [
-        (r"whisper_encoder\.conv_layers\.0\.(weight|bias)", r"whisper_encoder.conv1.\1"), # noqa: E501
-        (r"whisper_encoder\.conv_layers\.1\.(weight|bias)", r"whisper_encoder.conv2.\1"), # noqa: E501
-        (r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn.\2_proj.\3"), # noqa: E501
-        (r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.wo\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn.out_proj.\2"), # noqa: E501
-        (r"whisper_encoder\.transformer\.layers\.(\d+)\.attention_norm\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn_layer_norm.\2"), # noqa: E501
-        (r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w1\.(weight|bias)", r"whisper_encoder.layers.\1.mlp.fc1.\2"), # noqa: E501
-        (r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w2\.(weight|bias)", r"whisper_encoder.layers.\1.mlp.fc2.\2"), # noqa: E501
-        (r"whisper_encoder\.transformer\.layers\.(\d+)\.ffn_norm\.(weight|bias)", r"whisper_encoder.layers.\1.final_layer_norm.\2"), # noqa: E501
-        (r"whisper_encoder\.transformer\.norm\.(weight|bias)", r"whisper_encoder.layer_norm.\1"), # noqa: E501
+        (
+            r"whisper_encoder\.conv_layers\.0\.(weight|bias)",
+            r"whisper_encoder.conv1.\1",
+        ),
+        (
+            r"whisper_encoder\.conv_layers\.1\.(weight|bias)",
+            r"whisper_encoder.conv2.\1",
+        ),
+        (
+            r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)",  # noqa: E501
+            r"whisper_encoder.layers.\1.self_attn.\2_proj.\3",
+        ),
+        (
+            r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.wo\.(weight|bias)",  # noqa: E501
+            r"whisper_encoder.layers.\1.self_attn.out_proj.\2",
+        ),
+        (
+            r"whisper_encoder\.transformer\.layers\.(\d+)\.attention_norm\.(weight|bias)",  # noqa: E501
+            r"whisper_encoder.layers.\1.self_attn_layer_norm.\2",
+        ),
+        (
+            r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w1\.(weight|bias)",  # noqa: E501
+            r"whisper_encoder.layers.\1.mlp.fc1.\2",
+        ),
+        (
+            r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w2\.(weight|bias)",  # noqa: E501
+            r"whisper_encoder.layers.\1.mlp.fc2.\2",
+        ),
+        (
+            r"whisper_encoder\.transformer\.layers\.(\d+)\.ffn_norm\.(weight|bias)",
+            r"whisper_encoder.layers.\1.final_layer_norm.\2",
+        ),
+        (
+            r"whisper_encoder\.transformer\.norm\.(weight|bias)",
+            r"whisper_encoder.layer_norm.\1",
+        ),
     ]
-    # fmt: on
 
     def __init__(
         self,
@@ -667,10 +727,11 @@ class VoxtralEncoderModel(nn.Module):
         super().__init__()
         self.config = cast(WhisperConfig, vllm_config.model_config.hf_config)
         self.dtype: torch.dtype = vllm_config.model_config.dtype
-        self.whisper_encoder = WhisperEncoder(vllm_config=vllm_config,
-                                              prefix=maybe_prefix(
-                                                  prefix, "whisper_encoder"),
-                                              init_in_fp32=True)
+        self.whisper_encoder = WhisperEncoder(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "whisper_encoder"),
+            init_in_fp32=True,
+        )
         mel_filters = mel_filter_bank(
             num_frequency_bins=1 + self.config.window_size // 2,
             num_mel_bins=self.config.num_mel_bins,
@@ -685,8 +746,7 @@ class VoxtralEncoderModel(nn.Module):
         audio_waveforms: torch.Tensor,
     ) -> torch.Tensor:
         input_dtype = audio_waveforms.dtype
-        window = torch.hann_window(self.config.window_size).to(
-            audio_waveforms.device)
+        window = torch.hann_window(self.config.window_size).to(audio_waveforms.device)
         stft = torch.stft(
             audio_waveforms,
             self.config.window_size,
@@ -694,7 +754,7 @@ class VoxtralEncoderModel(nn.Module):
             window=window,
             return_complex=True,
         )
-        magnitudes = stft[..., :-1].abs()**2
+        magnitudes = stft[..., :-1].abs() ** 2
         mel_spec = self.mel_filters.T @ magnitudes
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
         log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
@@ -703,8 +763,9 @@ class VoxtralEncoderModel(nn.Module):
 
     @property
     def downsample_factor(self) -> int:
-        return self.whisper_encoder.conv1.stride[
-            0] * self.whisper_encoder.conv2.stride[0]
+        return (
+            self.whisper_encoder.conv1.stride[0] * self.whisper_encoder.conv2.stride[0]
+        )
 
     @property
     def chunk_size(self) -> int:
@@ -732,14 +793,13 @@ class VoxtralEncoderModel(nn.Module):
         return torch.stack(chunked_features), chunks_per_example
 
     def forward(
-        self, input_features: Union[torch.Tensor, list[torch.Tensor]]
+        self, input_features: torch.Tensor | list[torch.Tensor]
     ) -> list[torch.Tensor]:
         if not isinstance(input_features, list):
             input_features = [input_features]
 
         # Split long inputs into chunks
-        input_embeds, chunks_per_example = (
-            self.prepare_inputs_for_conv(input_features))
+        input_embeds, chunks_per_example = self.prepare_inputs_for_conv(input_features)
 
         # [total_num_chunks, ceil(chunk_size / downsample_factor), hidden_size]
         out = self.whisper_encoder([input_embeds])
@@ -748,7 +808,7 @@ class VoxtralEncoderModel(nn.Module):
         chunk_idx = 0
         results = []
         for n_chunks in chunks_per_example:
-            result = out[chunk_idx:chunk_idx + n_chunks].flatten(0, 1)
+            result = out[chunk_idx : chunk_idx + n_chunks].flatten(0, 1)
             results.append(result)
             chunk_idx += n_chunks
 
@@ -768,7 +828,7 @@ class VoxtralEncoderModel(nn.Module):
             if re.fullmatch(pattern, name):
                 name = re.sub(pattern, repl, name)
 
-        for (param_name, weight_name, shard_id) in stacked_params_mapping:
+        for param_name, weight_name, shard_id in stacked_params_mapping:
             if weight_name not in name:
                 continue
             name = name.replace(weight_name, param_name)
@@ -779,8 +839,7 @@ class VoxtralEncoderModel(nn.Module):
             break
         else:
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
 
         return name
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index de3e4f0592a62ec77ead8519ce2d05e50764f6ae..91a10b95a08c0a2523ec0799bdf6475200d5129e 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -4,48 +4,63 @@
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from contextlib import nullcontext
-from typing import Annotated, Literal, Optional, Union, cast
+from typing import Annotated, Literal, cast
 
 import numpy as np
 import torch
 from torch import nn
-from transformers import (BatchFeature, WhisperConfig, WhisperFeatureExtractor,
-                          WhisperProcessor)
+from transformers import (
+    BatchFeature,
+    WhisperConfig,
+    WhisperFeatureExtractor,
+)
 from transformers.models.whisper.modeling_whisper import sinusoids
 
 from vllm.attention import Attention, AttentionType
 from vllm.attention.layer import MultiHeadAttention
 from vllm.attention.layers.cross_attention import CrossAttention
-from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig,
-                         VllmConfig)
+from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
-from vllm.multimodal.processing import (BaseProcessingInfo,
-                                        EncDecMultiModalProcessor,
-                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.processing import (
+    BaseProcessingInfo,
+    EncDecMultiModalProcessor,
+    PromptReplacement,
+    PromptUpdate,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.utils.torch_utils import set_default_torch_dtype
 
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
-                         SupportsTranscription)
-from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors,
-                    make_layers)
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    cast_overflow_tensors,
+    make_layers,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -108,7 +123,7 @@ ISO639_1_SUPPORTED_LANGS = {
     "uk": "Ukrainian",
     "ur": "Urdu",
     "vi": "Vietnamese",
-    "cy": "Welsh"
+    "cy": "Welsh",
 }
 
 
@@ -120,8 +135,10 @@ class WhisperAudioInputs(TensorSchema):
         - t: Time frames (M)
     """
 
-    input_features: Annotated[Optional[NestedTensors],
-                              TensorShape("b", "nmb", "t")]
+    input_features: Annotated[
+        list[torch.Tensor] | None,
+        TensorShape("b", "nmb", "t"),
+    ]
 
 
 class WhisperEncoderAttention(MultiHeadAttention):
@@ -153,7 +170,6 @@ class WhisperEncoderAttention(MultiHeadAttention):
 
 
 class WhisperPositionalEmbedding(nn.Embedding):
-
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__(num_positions, embedding_dim)
 
@@ -162,15 +178,14 @@ class WhisperPositionalEmbedding(nn.Embedding):
 
 
 class WhisperAttention(nn.Module):
-
     def __init__(
         self,
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
         attn_type: AttentionType = AttentionType.DECODER,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -196,7 +211,8 @@ class WhisperAttention(nn.Module):
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: "
-                f"{self.embed_dim} and `num_heads`: {num_heads}).")
+                f"{self.embed_dim} and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim**-0.5
 
         self._init_qkv(embed_dim, bias, quant_config, prefix=prefix)
@@ -241,7 +257,7 @@ class WhisperAttention(nn.Module):
         self,
         embed_dim: int,
         bias: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         self.qkv_proj = QKVParallelLinear(
@@ -269,14 +285,13 @@ class WhisperAttention(nn.Module):
 
 
 class WhisperCrossAttention(WhisperAttention):
-
     def __init__(
         self,
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__(
@@ -293,7 +308,7 @@ class WhisperCrossAttention(WhisperAttention):
         self,
         embed_dim: int,
         bias: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         self.q_proj = ColumnParallelLinear(
@@ -316,7 +331,7 @@ class WhisperCrossAttention(WhisperAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor],
+        encoder_hidden_states: torch.Tensor | None,
     ):
         q, _ = self.q_proj(hidden_states)
 
@@ -336,13 +351,12 @@ class WhisperCrossAttention(WhisperAttention):
 
 
 class WhisperMLP(nn.Module):
-
     def __init__(
         self,
         embed_dim: int,
         ffn_dim: int,
         act_fn: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -369,7 +383,6 @@ class WhisperMLP(nn.Module):
 
 
 class WhisperEncoderLayer(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -414,7 +427,6 @@ class WhisperEncoderLayer(nn.Module):
 
 
 class WhisperDecoderLayer(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -450,7 +462,7 @@ class WhisperDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor],
+        encoder_hidden_states: torch.Tensor | None,
     ):
         residual = hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -474,58 +486,49 @@ class WhisperDecoderLayer(nn.Module):
 
 
 class WhisperEncoder(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 init_in_fp32: bool = False):
+    def __init__(
+        self, *, vllm_config: VllmConfig, prefix: str = "", init_in_fp32: bool = False
+    ):
         super().__init__()
         config = vllm_config.model_config.hf_config
         embed_dim = config.d_model
         self.num_mel_bins = config.num_mel_bins
         self.max_source_positions = config.max_source_positions
-        self.embed_scale = (math.sqrt(embed_dim)
-                            if config.scale_embedding else 1.0)
-
-        self.conv1 = nn.Conv1d(self.num_mel_bins,
-                               embed_dim,
-                               kernel_size=3,
-                               padding=1)
-        self.conv2 = nn.Conv1d(embed_dim,
-                               embed_dim,
-                               kernel_size=3,
-                               stride=2,
-                               padding=1)
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.encoder_layers,
-            lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config,
-                                               prefix=f"{prefix}.layers"),
+            lambda prefix: WhisperEncoderLayer(
+                vllm_config=vllm_config, prefix=f"{prefix}.layers"
+            ),
             prefix=f"{prefix}.layers",
         )
         self.layer_norm = nn.LayerNorm(config.d_model)
 
-        maybe_fp32_init_ctx = set_default_torch_dtype(
-            torch.float32) if init_in_fp32 else nullcontext()
+        maybe_fp32_init_ctx = (
+            set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext()
+        )
 
         with (
-                torch.no_grad(),
-                maybe_fp32_init_ctx,
+            torch.no_grad(),
+            maybe_fp32_init_ctx,
         ):
-            self.embed_positions = nn.Embedding(self.max_source_positions,
-                                                embed_dim)
+            self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
             self.embed_positions.weight.copy_(
-                sinusoids(*self.embed_positions.weight.shape))
+                sinusoids(*self.embed_positions.weight.shape)
+            )
 
-    def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]):
+    def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
         hidden_states = []
         for features in input_features:
             embeds = nn.functional.gelu(self.conv1(features))
             embeds = nn.functional.gelu(self.conv2(embeds))
             embeds = embeds.transpose(-1, -2)
-            embeds = (embeds +
-                      self.embed_positions.weight[:embeds.size(-2), :]).to(
-                          embeds.dtype)
+            embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to(
+                embeds.dtype
+            )
             hidden_states.append(embeds)
         hidden_states = torch.cat(hidden_states)
 
@@ -537,7 +540,6 @@ class WhisperEncoder(nn.Module):
 
 
 class WhisperDecoder(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -545,17 +547,19 @@ class WhisperDecoder(nn.Module):
         self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_target_positions
         self.max_source_positions = config.max_source_positions
-        self.embed_scale = (math.sqrt(config.d_model)
-                            if config.scale_embedding else 1.0)
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model,
-                                         self.padding_idx)
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.d_model, self.padding_idx
+        )
         self.embed_positions = WhisperPositionalEmbedding(
-            self.max_target_positions, config.d_model)
+            self.max_target_positions, config.d_model
+        )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.decoder_layers,
-            lambda prefix: WhisperDecoderLayer(vllm_config=vllm_config,
-                                               prefix=f"{prefix}.layers"),
+            lambda prefix: WhisperDecoderLayer(
+                vllm_config=vllm_config, prefix=f"{prefix}.layers"
+            ),
             prefix=f"{prefix}.layers",
         )
         self.layer_norm = nn.LayerNorm(config.d_model)
@@ -564,9 +568,9 @@ class WhisperDecoder(nn.Module):
         self,
         input_ids,
         positions: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor],
+        encoder_hidden_states: torch.Tensor | None,
     ):
-        inputs_embeds = self.get_input_embeddings(input_ids)
+        inputs_embeds = self.embed_input_ids(input_ids)
         positions = self.embed_positions(positions)
         hidden_states = inputs_embeds + positions
 
@@ -579,26 +583,24 @@ class WhisperDecoder(nn.Module):
         hidden_states = self.layer_norm(hidden_states)
         return hidden_states
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-    ) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
 
 class WhisperModel(nn.Module):
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        self.encoder = WhisperEncoder(vllm_config=vllm_config,
-                                      prefix=f"{prefix}.encoder")
-        self.decoder = WhisperDecoder(vllm_config=vllm_config,
-                                      prefix=f"{prefix}.decoder")
+        self.encoder = WhisperEncoder(
+            vllm_config=vllm_config, prefix=f"{prefix}.encoder"
+        )
+        self.decoder = WhisperDecoder(
+            vllm_config=vllm_config, prefix=f"{prefix}.decoder"
+        )
 
     def forward(
         self,
-        input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]],
-        input_ids: Optional[torch.Tensor],
+        input_features: torch.Tensor | list[torch.Tensor] | None,
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
     ) -> torch.Tensor:
         encoder_outputs = self.get_encoder_outputs(input_features)
@@ -611,14 +613,13 @@ class WhisperModel(nn.Module):
 
     def get_encoder_outputs(
         self,
-        input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]],
-    ) -> Optional[torch.Tensor]:
+        input_features: torch.Tensor | list[torch.Tensor] | None,
+    ) -> torch.Tensor | None:
         if input_features is None:
             return None
         return self.encoder(input_features)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
@@ -648,33 +649,20 @@ class WhisperModel(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
 class WhisperProcessingInfo(BaseProcessingInfo):
-
     def get_hf_config(self) -> WhisperConfig:
         return self.ctx.get_hf_config(WhisperConfig)
 
-    def get_hf_processor(self, **kwargs: object) -> WhisperProcessor:
-        # HACK: Transformers 4.53.2 has issue with whisper tokenizer to
-        # initialize processor. We use a monkeypatch to fix it here.
-        # See: https://github.com/vllm-project/vllm/issues/20224
-        processor_class = WhisperProcessor
-        tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast")
-        if processor_class.tokenizer_class != tokenizer_class:
-            processor_class.tokenizer_class = tokenizer_class
-        return self.ctx.get_hf_processor(processor_class, **kwargs)
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 1}
 
-    def get_feature_extractor(self,
-                              **kwargs: object) -> WhisperFeatureExtractor:
+    def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor:
         hf_processor = self.get_hf_processor(**kwargs)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
@@ -685,7 +673,6 @@ class WhisperProcessingInfo(BaseProcessingInfo):
 
 
 class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
 
@@ -695,6 +682,7 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         feature_extractor = self.info.get_feature_extractor()
 
@@ -702,15 +690,16 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
         audio_len = feature_extractor.chunk_length * sampling_rate
         num_audios = mm_counts.get("audio", 0)
 
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+            "audio": self._get_dummy_audios(
+                length=audio_len, num_audios=num_audios, overrides=audio_overrides
+            )
         }
 
 
-class WhisperMultiModalProcessor(
-        EncDecMultiModalProcessor[WhisperProcessingInfo]):
-
+class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
@@ -721,9 +710,9 @@ class WhisperMultiModalProcessor(
 
     def create_encoder_prompt(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-    ) -> Union[str, list[int]]:
+    ) -> str | list[int]:
         # Strictly speaking, whisper encoder only accept audio features.
         # We create a dummy encoder prompt here which will be padded to
         # num_audio_tokens. So that we can create dummy data from this
@@ -777,11 +766,15 @@ class WhisperMultiModalProcessor(
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(WhisperMultiModalProcessor,
-                                        info=WhisperProcessingInfo,
-                                        dummy_inputs=WhisperDummyInputsBuilder)
-class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
-                                      SupportsMultiModal):
+@MULTIMODAL_REGISTRY.register_processor(
+    WhisperMultiModalProcessor,
+    info=WhisperProcessingInfo,
+    dummy_inputs=WhisperDummyInputsBuilder,
+)
+class WhisperForConditionalGeneration(
+    nn.Module, SupportsTranscription, SupportsMultiModal
+):
+    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
@@ -791,17 +784,16 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
         "encoder_attn.kv_proj": ["encoder_attn.k_proj", "encoder_attn.v_proj"],
     }
 
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={
-        ".fc1.": ".mlp.fc1.",
-        ".fc2.": ".mlp.fc2."
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}
+    )
 
     # Whisper only supports audio-conditioned generation.
     supports_transcription_only = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     @classmethod
-    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+    def validate_language(cls, language: str | None) -> str | None:
         if language is None:
             # TODO language should be optional and can be guessed.
             # For now we default to en. See
@@ -809,23 +801,26 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
             logger.warning(
                 "Defaulting to language='en'. If you wish to transcribe "
                 "audio in a different language, pass the `language` field "
-                "in the TranscriptionRequest.")
+                "in the TranscriptionRequest."
+            )
             language = "en"
         return super().validate_language(language)
 
     @classmethod
     def get_generation_prompt(
-            cls,
-            audio: np.ndarray,
-            model_config: ModelConfig,  # not needed here
-            stt_config: SpeechToTextConfig,
-            language: Optional[str],
-            task_type: Literal["transcribe", "translate"],
-            request_prompt: str,
-            to_language: Optional[str]) -> PromptType:
+        cls,
+        audio: np.ndarray,
+        model_config: ModelConfig,  # not needed here
+        stt_config: SpeechToTextConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
         if language is None:
             raise ValueError(
-                "Language must be specified when creating the Whisper prompt")
+                "Language must be specified when creating the Whisper prompt"
+            )
         prompt = {
             "encoder_prompt": {
                 # Whisper does not support encoder prompt.
@@ -834,23 +829,25 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
                     "audio": (audio, stt_config.sample_rate),
                 },
             },
-            "decoder_prompt":
-            ((f"<|prev|>{request_prompt}" if request_prompt else "") +
-             f"<|startoftranscript|><|{language}|>" +
-             f"<|{task_type}|><|notimestamps|>")
+            "decoder_prompt": (
+                (f"<|prev|>{request_prompt}" if request_prompt else "")
+                + f"<|startoftranscript|><|{language}|>"
+                + f"<|{task_type}|><|notimestamps|>"
+            ),
         }
         return cast(PromptType, prompt)
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return None
 
         raise ValueError("Only audio modality is supported")
 
     @classmethod
-    def get_speech_to_text_config(cls, model_config: ModelConfig,
-                                  task_type: str) -> SpeechToTextConfig:
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
         processor = cached_get_processor(model_config.model)
 
         return SpeechToTextConfig(
@@ -859,9 +856,12 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
         )
 
     @classmethod
-    def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig,
-                             model_config: ModelConfig) -> Optional[int]:
+    def get_num_audio_tokens(
+        cls,
+        audio_duration_s: float,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+    ) -> int | None:
         processor = cached_get_processor(model_config.model)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
@@ -869,8 +869,7 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
         # prompts directly at least not to Whisper.
         # One indicator of the encoder amount of processing
         # is the log-mel spectogram length.
-        return math.ceil(audio_duration_s * stt_config.sample_rate /
-                         hop_length)
+        return math.ceil(audio_duration_s * stt_config.sample_rate / hop_length)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -880,15 +879,16 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
         self.dtype = vllm_config.model_config.dtype
 
         self.model = WhisperModel(vllm_config=vllm_config, prefix=prefix)
-        self.unpadded_vocab_size = config.vocab_size
-        self.proj_out = ParallelLMHead(config.vocab_size,
-                                       config.d_model,
-                                       quant_config=quant_config)
-        self.proj_out = self.proj_out.tie_weights(
-            self.model.decoder.embed_tokens)
+
+        self.proj_out = ParallelLMHead(
+            config.vocab_size,
+            config.d_model,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "proj_out"),
+        )
+        self.proj_out = self.proj_out.tie_weights(self.model.decoder.embed_tokens)
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size, logit_scale)
+        self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
 
     def forward(
         self,
@@ -907,31 +907,28 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
     def get_language_model(self) -> torch.nn.Module:
         return self.model.decoder
 
-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         # Required as part of SupportsMultiModal interface.
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         return [self.model.get_encoder_outputs(audio_input["input_features"])]
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[NestedTensors] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         # This method just returns the decoder sequence embeddings since
         # Whisper does not have encoder text tokens.
-        return self.model.decoder.get_input_embeddings(input_ids)
+        return self.model.decoder.embed_input_ids(input_ids)
 
-    def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> WhisperAudioInputs:
+    def _parse_and_validate_audio_input(self, **kwargs: object) -> WhisperAudioInputs:
         input_features = kwargs.pop("input_features", None)
 
         if input_features is not None:
-            if not isinstance(input_features, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio features. "
-                                 f"Got type: {type(input_features)}")
-            input_features = torch.cat(
-                [feat.to(self.dtype) for feat in input_features])
+            input_features = json_map_leaves(lambda x: x.to(self.dtype), input_features)
 
         return WhisperAudioInputs(input_features=input_features)
 
@@ -939,8 +936,7 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
         logits = self.logits_processor(self.proj_out, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."])
 
         # add fake zeros bias for k_proj to state_dict
@@ -949,7 +945,7 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
 
 
 def _create_fake_bias_for_k_proj(
-    weights: Iterable[tuple[str, torch.Tensor]]
+    weights: Iterable[tuple[str, torch.Tensor]],
 ) -> Iterable[tuple[str, torch.Tensor]]:
     """
     Create full zeros bias for k_proj weight in self-attn and x-attn layers.
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index 1d68320bd9b293d50ca188f0174072a027b88d63..64e6979c8fcfbfc697f1c4e49e0cbe15fdb6f49d 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -2,14 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch Zamba2 model implementation for vLLM.
 
-This module implements the Zamba2 architecture from 
-https://arxiv.org/abs/2411.15242, which combines Mamba and Transformer 
-architectures in a hybrid model optimized for efficient sequence modeling. The 
+This module implements the Zamba2 architecture from
+https://arxiv.org/abs/2411.15242, which combines Mamba and Transformer
+architectures in a hybrid model optimized for efficient sequence modeling. The
 model alternates between state space model layers and attention-based layers.
 """
+
 from collections.abc import Iterable
 from itertools import cycle
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -21,29 +22,35 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.model_executor.layers.mamba.mamba_utils import (
-    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid
+from .interfaces import HasInnerState, IsHybrid, SupportsMambaPrefixCaching
 from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 
 class Zamba2LoRA(nn.Module):
     """LoRA layer for the Zamba2 model.
-    
+
     Implements a LoRA layer that is used in shared attention and gated MLP
     blocks.
     """
@@ -52,12 +59,12 @@ class Zamba2LoRA(nn.Module):
         self,
         input_dim: int,
         rank: int,
-        output_dim: Union[int, list[int]],
-        quant_config: Optional[QuantizationConfig] = None,
+        output_dim: int | list[int],
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         """Initialize the attention layer.
-        
+
         Args:
             input_dim: input dimension
             rank: LoRA rank
@@ -66,20 +73,20 @@ class Zamba2LoRA(nn.Module):
         """
         super().__init__()
 
-        self.A = ColumnParallelLinear(input_dim,
-                                      rank,
-                                      bias=False,
-                                      quant_config=quant_config,
-                                      gather_output=True)
+        self.A = ColumnParallelLinear(
+            input_dim,
+            rank,
+            bias=False,
+            quant_config=quant_config,
+            gather_output=True,
+            prefix=f"{prefix}.A",
+        )
 
         if isinstance(output_dim, list):
             B_class = MergedColumnParallelLinear
         else:
             B_class = ColumnParallelLinear
-        self.B = B_class(rank,
-                         output_dim,
-                         bias=False,
-                         quant_config=quant_config)
+        self.B = B_class(rank, output_dim, bias=False, quant_config=quant_config)
 
     def forward(
         self,
@@ -92,8 +99,8 @@ class Zamba2LoRA(nn.Module):
 
 class Zamba2Attention(nn.Module):
     """Multi-head attention mechanism for the Zamba2 model.
-    
-    Implements attention with parallel computation, QKV projections, optional 
+
+    Implements attention with parallel computation, QKV projections, optional
     adapters and rotary position embeddings. The attention is computed across
     distributed blocks for efficient processing.
     """
@@ -103,12 +110,12 @@ class Zamba2Attention(nn.Module):
         config: Zamba2Config,
         bare_block_idx: int,
         num_hybrid_layers: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the attention layer.
-        
+
         Args:
             config: The Zamba2 model configuration
             bare_block_idx: Index of the bare attention block
@@ -129,15 +136,17 @@ class Zamba2Attention(nn.Module):
         self.num_attention_heads = config.num_attention_heads // tp_size
         self.attention_head_dim = config.attention_head_dim
         self.qkv_size = self.attention_hidden_size // tp_size
-        self.scale = (self.attention_head_dim / 2)**-0.5
+        self.scale = (self.attention_head_dim / 2) ** -0.5
 
-        if (self.attention_head_dim *
-                self.total_num_attention_heads) != self.attention_hidden_size:
+        if (
+            self.attention_head_dim * self.total_num_attention_heads
+        ) != self.attention_hidden_size:
             raise ValueError(
                 f"attention_hidden_size must be divisible by"
                 f" num_attention_heads"
                 f" (got `attention_hidden_size`: {self.attention_hidden_size}"
-                f" and `num_heads`: {self.num_attention_heads}).")
+                f" and `num_heads`: {self.num_attention_heads})."
+            )
 
         self.qkv_proj = QKVParallelLinear(
             self.attention_hidden_size,
@@ -145,11 +154,15 @@ class Zamba2Attention(nn.Module):
             self.total_num_attention_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.attention_hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
-        self.o_proj = RowParallelLinear(self.attention_hidden_size,
-                                        config.hidden_size,
-                                        bias=False,
-                                        quant_config=quant_config)
 
         # Even though in Zamba2 weights are shared between attention layers, KV
         # cache is unique for every attention layer. Hence, we need to define
@@ -158,8 +171,11 @@ class Zamba2Attention(nn.Module):
 
         # Initialize attention blocks with proper indexing
         self.dpa_list = nn.ModuleList([])
-        j = bare_block_idx * (self.num_hybrid_layers + config.num_mem_blocks -
-                              1) // config.num_mem_blocks
+        j = (
+            bare_block_idx
+            * (self.num_hybrid_layers + config.num_mem_blocks - 1)
+            // config.num_mem_blocks
+        )
         for block_idx in range(self.num_hybrid_layers):
             if block_idx % config.num_mem_blocks == bare_block_idx:
                 dpa = Attention(
@@ -187,18 +203,21 @@ class Zamba2Attention(nn.Module):
                         config.adapter_rank,
                         self.attention_hidden_size,
                         quant_config=quant_config,
+                        prefix=f"{prefix}.linear_q_adapter",
                     )
                     linear_k_adapter = Zamba2LoRA(
                         self.attention_hidden_size,
                         config.adapter_rank,
                         self.attention_hidden_size,
                         quant_config=quant_config,
+                        prefix=f"{prefix}.linear_k_adapter",
                     )
                     linear_v_adapter = Zamba2LoRA(
                         self.attention_hidden_size,
                         config.adapter_rank,
                         self.attention_hidden_size,
                         quant_config=quant_config,
+                        prefix=f"{prefix}.linear_v_adapter",
                     )
                 else:
                     linear_q_adapter = nn.Identity()
@@ -226,18 +245,17 @@ class Zamba2Attention(nn.Module):
         position_ids: torch.Tensor,
     ) -> torch.Tensor:
         """Forward pass through the attention layer.
-        
+
         Args:
             hidden_states: Input tensor [batch_size, seq_len, hidden_size]
             position_ids: Position IDs for positional embeddings
             block_idx: Current shared transformer block index
-            
+
         Returns:
             Output tensor [batch_size, seq_len, hidden_size]
         """
         qkv, _ = self.qkv_proj(hidden_states)
-        query_states, key_states, value_states = qkv.split([self.qkv_size] * 3,
-                                                           dim=-1)
+        query_states, key_states, value_states = qkv.split([self.qkv_size] * 3, dim=-1)
 
         if self.config.use_shared_attention_adapter:
             # Apply adapter transformations to Q, K, V if enabled
@@ -257,9 +275,9 @@ class Zamba2Attention(nn.Module):
             value_states = value_states + v_lora_output
 
         if self.config.use_mem_rope:
-            query_states, key_states = self.rotary_emb(position_ids,
-                                                       query_states,
-                                                       key_states)
+            query_states, key_states = self.rotary_emb(
+                position_ids, query_states, key_states
+            )
 
         y = self.dpa_list[block_idx](query_states, key_states, value_states)
         y, _ = self.o_proj(y)
@@ -268,9 +286,9 @@ class Zamba2Attention(nn.Module):
 
 class Zamba2MLP(nn.Module):
     """Feed-forward MLP layer for the Zamba2 model.
-    
-    Implements a gated feed-forward network that projects inputs to a larger 
-    intermediate size, applies GELU activation with gating, then projects back 
+
+    Implements a gated feed-forward network that projects inputs to a larger
+    intermediate size, applies GELU activation with gating, then projects back
     to the original size. Includes optional adapter layers for model adaptation.
     """
 
@@ -279,11 +297,11 @@ class Zamba2MLP(nn.Module):
         config: Zamba2Config,
         bare_block_idx: int,
         num_hybrid_layers: dict[int, int],
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the MLP layer.
-        
+
         Args:
             config: The Zamba2 model configuration
             bare_block_idx: Index of the bare block in the model
@@ -302,17 +320,24 @@ class Zamba2MLP(nn.Module):
             self.hidden_size,
             2 * [self.intermediate_size],  # 2x for gate and input projections
             bias=self.config.add_bias_linear,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
 
-        self.down_proj = RowParallelLinear(self.intermediate_size,
-                                           self.hidden_size,
-                                           bias=self.config.add_bias_linear,
-                                           quant_config=quant_config)
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=self.config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
 
         # Only allow GELU activations
         if config.hidden_act != "gelu":
-            raise ValueError(f"Only GELU activation is supported "
-                             f"(got `hidden_act`: {config.hidden_act})")
+            raise ValueError(
+                f"Only GELU activation is supported "
+                f"(got `hidden_act`: {config.hidden_act})"
+            )
         self.act_fn = GeluAndMul()
 
         # Initialize adapter layers
@@ -329,14 +354,13 @@ class Zamba2MLP(nn.Module):
                 gate_up_proj_adapter = nn.Identity()
             self.gate_up_proj_adapter_list.append(gate_up_proj_adapter)
 
-    def forward(self, hidden_states: torch.Tensor,
-                block_idx: int) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, block_idx: int) -> torch.Tensor:
         """Forward pass through the MLP layer.
-        
+
         Args:
             hidden_states: Input tensor [batch_size, seq_len, hidden_size]
             block_idx: Current shared transformer block index
-            
+
         Returns:
             Output tensor [batch_size, seq_len, hidden_size] after applying
             gated feed-forward transformation
@@ -360,7 +384,7 @@ class Zamba2MLP(nn.Module):
 
 class Zamba2AttentionDecoderLayer(nn.Module):
     """Single decoder layer combining attention and feed-forward networks.
-    
+
     This layer implements a standard transformer block with:
     - Input layer normalization
     - Multi-head self-attention
@@ -373,12 +397,12 @@ class Zamba2AttentionDecoderLayer(nn.Module):
         config: Zamba2Config,
         bare_block_idx: int,
         num_hybrid_layers: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the decoder layer.
-        
+
         Args:
             config: The Zamba2 model configuration
             bare_block_idx: Index of the bare block
@@ -405,15 +429,14 @@ class Zamba2AttentionDecoderLayer(nn.Module):
             bare_block_idx=bare_block_idx,
             num_hybrid_layers=num_hybrid_layers,
             quant_config=quant_config,
+            prefix=f"{prefix}.feed_forward",
         )
 
         # Initialize layer normalizations
         # Input normalization operates on concatenated states
-        self.input_layernorm = RMSNorm(2 * config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(2 * config.hidden_size, eps=config.rms_norm_eps)
         # Pre-FF normalization operates on attention output
-        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
-                                        eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -423,14 +446,14 @@ class Zamba2AttentionDecoderLayer(nn.Module):
         positions: torch.Tensor,
     ) -> torch.Tensor:
         """Forward pass through the decoder layer.
-        
+
         Args:
             hidden_states: Input tensor from previous layer
-            original_hidden_states: Original input tensor for residual 
+            original_hidden_states: Original input tensor for residual
                 connection
             block_idx: Current shared transformer block index
             positions: IDs for positional embeddings
-            
+
         Returns:
             Transformed hidden states after attention and feed-forward
         """
@@ -440,7 +463,8 @@ class Zamba2AttentionDecoderLayer(nn.Module):
         # The concatenated tensor is then used as input of the pre-attention
         # RMSNorm (see fig. 2 in https://arxiv.org/pdf/2405.16712).
         hidden_states = torch.concatenate(
-            [hidden_states, original_hidden_states], dim=-1)
+            [hidden_states, original_hidden_states], dim=-1
+        )
 
         # Layer norm before attention
         hidden_states = self.input_layernorm(hidden_states)
@@ -463,20 +487,22 @@ class Zamba2AttentionDecoderLayer(nn.Module):
 
 class Zamba2MambaDecoderLayer(nn.Module):
     """Single Mamba decoder layer with normalization.
-    
-    This implements a  Mamba block. It includes input normalization 
-    and can process sequences using either chunked or full 
+
+    This implements a  Mamba block. It includes input normalization
+    and can process sequences using either chunked or full
     computation depending on configuration.
     """
 
-    def __init__(self,
-                 config: Zamba2Config,
-                 model_config: Optional[ModelConfig] = None,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+    def __init__(
+        self,
+        config: Zamba2Config,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
         """Initialize the Mamba decoder layer.
-        
+
         Args:
             config: The Zamba2 model configuration
             quant_config: Configuration for model quantization
@@ -485,43 +511,43 @@ class Zamba2MambaDecoderLayer(nn.Module):
 
         # Initialize Mamba mixer with expanded intermediate size
         intermediate_size = config.mamba_expand * config.hidden_size
-        self.mamba = MambaMixer2(hidden_size=config.hidden_size,
-                                 ssm_state_size=config.mamba_d_state,
-                                 conv_kernel_size=config.mamba_d_conv,
-                                 intermediate_size=intermediate_size,
-                                 use_conv_bias=config.use_conv_bias,
-                                 use_bias=config.add_bias_linear,
-                                 n_groups=config.mamba_ngroups,
-                                 num_heads=config.n_mamba_heads,
-                                 head_dim=intermediate_size //
-                                 config.n_mamba_heads,
-                                 rms_norm_eps=config.rms_norm_eps,
-                                 activation="silu",
-                                 model_config=model_config,
-                                 cache_config=cache_config,
-                                 quant_config=quant_config,
-                                 prefix=f"{prefix}.mixer")
+        self.mamba = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.mamba_d_state,
+            conv_kernel_size=config.mamba_d_conv,
+            intermediate_size=intermediate_size,
+            use_conv_bias=config.use_conv_bias,
+            use_bias=config.add_bias_linear,
+            n_groups=config.mamba_ngroups,
+            num_heads=config.n_mamba_heads,
+            head_dim=intermediate_size // config.n_mamba_heads,
+            rms_norm_eps=config.rms_norm_eps,
+            activation="silu",
+            model_config=model_config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
 
         # Input normalization
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        transformer_hidden_states: Optional[torch.Tensor] = None,
-        positions: Optional[torch.Tensor] = None,
-        original_hidden_states: Optional[torch.Tensor] = None,
+        transformer_hidden_states: torch.Tensor | None = None,
+        positions: torch.Tensor | None = None,
+        original_hidden_states: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass through the Mamba decoder layer.
-        
+
         Args:
             hidden_states: Input tensor [batch_size, seq_len, hidden_size]
             transformer_hidden_states: Optional output from transformer path
                 Added to input if provided (used in hybrid architecture)
             positions: Optional position IDs (unused in Mamba)
             original_hidden_states: Optional original inputs (unused in Mamba)
-            
+
         Returns:
             Transformed hidden states with residual connection applied
         """
@@ -555,7 +581,7 @@ class Zamba2MambaDecoderLayer(nn.Module):
 
 class Zamba2HybridLayer(nn.Module):
     """Hybrid layer combining Transformer and Mamba architectures.
-    
+
     This layer implements the hybrid architecture described in the Zamba paper,
     where a shared transformer pathway processes input in parallel with a Mamba
     pathway. The transformer output is projected and added to the Mamba input
@@ -567,28 +593,33 @@ class Zamba2HybridLayer(nn.Module):
         shared_transformer: Zamba2AttentionDecoderLayer,
         config: Zamba2Config,
         block_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the hybrid layer.
-        
+
         Args:
             shared_transformer: Transformer decoder layer for attention pathway
         """
         super().__init__()
         self.block_idx = block_idx
         self.shared_transformer = shared_transformer
-        self.linear = ReplicatedLinear(config.hidden_size,
-                                       config.hidden_size,
-                                       bias=False,
-                                       quant_config=quant_config)
-        self.mamba_decoder = Zamba2MambaDecoderLayer(config,
-                                                     model_config=model_config,
-                                                     cache_config=cache_config,
-                                                     quant_config=quant_config,
-                                                     prefix=prefix)
+        self.linear = ReplicatedLinear(
+            config.hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear",
+        )
+        self.mamba_decoder = Zamba2MambaDecoderLayer(
+            config,
+            model_config=model_config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
 
     def forward(
         self,
@@ -597,19 +628,19 @@ class Zamba2HybridLayer(nn.Module):
         positions: torch.Tensor,
     ) -> torch.Tensor:
         """Forward pass through the hybrid layer.
-        
+
         Processes input through parallel transformer and Mamba paths:
         1. Transformer path processes input with attention
         2. Transformer output is projected to match hidden size
         3. Projected output is added to Mamba path input
         4. Final output combines both paths' representations
-        
+
         Args:
             hidden_states: Input tensor [batch_size, seq_len, hidden_size]
-            original_hidden_states: Original input for transformer residual 
+            original_hidden_states: Original input for transformer residual
                 connection
             positions: Position IDs for positional embeddings
-            
+
         Returns:
             Output tensor combining transformer and Mamba representations
         """
@@ -636,16 +667,16 @@ class Zamba2HybridLayer(nn.Module):
 @support_torch_compile
 class Zamba2Model(nn.Module):
     """Core Zamba2 model combining transformer and Mamba architectures.
-    
-    The model processes input through a sequence of hybrid and Mamba-only 
+
+    The model processes input through a sequence of hybrid and Mamba-only
     layers, using token embeddings and final layer normalization.
     """
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         """Initialize the Zamba2 model.
-        
+
         Args:
-            vllm_config: Configuration object containing model, cache, 
+            vllm_config: Configuration object containing model, cache,
                 quantization and LoRA settings
             prefix: Optional prefix for parameter names in state dict
         """
@@ -660,16 +691,13 @@ class Zamba2Model(nn.Module):
         assert not is_lora_enabled
 
         self.config = config
-        lora_vocab = ((lora_config.lora_extra_vocab_size *
-                       (lora_config.max_loras or 1)) if lora_config else 0)
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
 
         # Initialize token embeddings
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )
 
         # Map hybrid layer indices to block indices
@@ -679,15 +707,19 @@ class Zamba2Model(nn.Module):
         }
 
         # Create cyclic iterator of transformer blocks
-        blocks = cycle([
-            Zamba2AttentionDecoderLayer(config,
-                                        bare_block_idx=idx,
-                                        num_hybrid_layers=len(layer2block_map),
-                                        cache_config=cache_config,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}")
-            for idx in range(config.num_mem_blocks)
-        ])
+        blocks = cycle(
+            [
+                Zamba2AttentionDecoderLayer(
+                    config,
+                    bare_block_idx=idx,
+                    num_hybrid_layers=len(layer2block_map),
+                    cache_config=cache_config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}",
+                )
+                for idx in range(config.num_mem_blocks)
+            ]
+        )
 
         # Initialize layers according to block type configuration
         layers = []
@@ -699,32 +731,37 @@ class Zamba2Model(nn.Module):
                 block = next(blocks)
                 block_idx = layer2block_map[layer_idx]
                 layers.append(
-                    Zamba2HybridLayer(block,
-                                      config,
-                                      block_idx,
-                                      model_config=model_config,
-                                      cache_config=cache_config,
-                                      quant_config=quant_config,
-                                      prefix=prefix))
+                    Zamba2HybridLayer(
+                        block,
+                        config,
+                        block_idx,
+                        model_config=model_config,
+                        cache_config=cache_config,
+                        quant_config=quant_config,
+                        prefix=prefix,
+                    )
+                )
             else:
                 layers.append(
-                    Zamba2MambaDecoderLayer(config,
-                                            model_config=model_config,
-                                            cache_config=cache_config,
-                                            quant_config=quant_config,
-                                            prefix=prefix))
+                    Zamba2MambaDecoderLayer(
+                        config,
+                        model_config=model_config,
+                        cache_config=cache_config,
+                        quant_config=quant_config,
+                        prefix=prefix,
+                    )
+                )
         self.layers = nn.ModuleList(layers)
 
         # Final layer normalization
-        self.final_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         """Convert input token IDs to embeddings.
-        
+
         Args:
             input_ids: Tensor of input token IDs
-            
+
         Returns:
             Embedded representation of the input tokens
         """
@@ -734,22 +771,22 @@ class Zamba2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         """Forward pass through the model.
-        
+
         Args:
             input_ids: Input token IDs
             positions: Position IDs for embeddings
             inputs_embeds: Optional pre-computed input embeddings
-            
+
         Returns:
-            Either final hidden states or intermediate tensors for pipeline 
+            Either final hidden states or intermediate tensors for pipeline
             parallelism
         """
         # Handle pipeline parallelism for first rank
         if inputs_embeds is None:
-            inputs_embeds = self.get_input_embeddings(input_ids)
+            inputs_embeds = self.embed_input_ids(input_ids)
         hidden_states = inputs_embeds
 
         # Process through layers
@@ -765,8 +802,7 @@ class Zamba2Model(nn.Module):
         hidden_states = self.final_layernorm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -780,8 +816,7 @@ class Zamba2Model(nn.Module):
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in chkpt_weight_name:
                     continue
-                chkpt_weight_name = chkpt_weight_name.replace(
-                    weight_name, param_name)
+                chkpt_weight_name = chkpt_weight_name.replace(weight_name, param_name)
                 param = params_dict[chkpt_weight_name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -790,35 +825,36 @@ class Zamba2Model(nn.Module):
                 if chkpt_weight_name not in params_dict:
                     continue
                 param = params_dict[chkpt_weight_name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(chkpt_weight_name)
         return loaded_params
 
 
-class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
+class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixCaching):
     """Zamba2 model with causal language modeling head.
-    
+
     This class wraps the core Zamba2 model and adds:
     - A language modeling head for next token prediction
     - Mamba state caching functionality
     - Support for model parallelism and quantization
     - Sampling capabilities for text generation
     """
+
     # To ensure correct weight loading and mapping.
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={
-        "A_log": "A",
-        "0.weight": "A.weight",
-        "1.weight": "B.weight",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "A_log": "A",
+            "0.weight": "A.weight",
+            "1.weight": "B.weight",
+        }
+    )
 
     @classmethod
     def get_mamba_state_dtype_from_config(
         cls,
         vllm_config: "VllmConfig",
     ) -> tuple[torch.dtype, torch.dtype]:
-
         return MambaStateDtypeCalculator.mamba2_state_dtype(
             vllm_config.model_config.dtype,
             vllm_config.cache_config.mamba_cache_dtype,
@@ -857,76 +893,67 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         """Initialize the Zamba2 model for causal language modeling.
-        
+
         Args:
             vllm_config: Configuration containing model, cache, quantization,
                         LoRA and scheduler settings
             prefix: Optional prefix for parameter names
-        
+
         Raises:
-            AssertionError: If prefix caching is enabled 
+            AssertionError: If prefix caching is enabled
                 (not supported by Mamba)
         """
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "Mamba does not support prefix caching"
 
         super().__init__()
         self.config = config
         self.vllm_config = vllm_config
         self.scheduler_config = scheduler_config
         self.model_config = vllm_config.model_config
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
 
         # Initialize core model
-        self.model = Zamba2Model(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
+        self.model = Zamba2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
 
         # Initialize language modeling head
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         # Tie weights with input embeddings if using same dimensions
         self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
 
         # Initialize logits processing and sampling
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         """Convert input token IDs to embeddings.
         Args:
             input_ids: Tensor of input token IDs
         Returns:
             Embedded representation of the input tokens
         """
-        return self.model.get_input_embeddings(input_ids)
+        return self.model.embed_input_ids(input_ids)
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: Any) -> torch.Tensor:
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: Any,
+    ) -> torch.Tensor:
         """Forward pass through the model.
-        
+
         Args:
             input_ids: Input token IDs
             positions: Position IDs for embeddings
             inputs_embeds: Optional pre-computed input embeddings
             **kwargs: Additional arguments passed to cache manager
-            
+
         Returns:
             Output hidden states
         """
@@ -942,7 +969,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """Compute logits for next token prediction.
 
         Args:
@@ -954,7 +981,6 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 66add98dab44365c79b8806095f3364d7722cb91..d3a91feab64d97f2fdfd4a71bb980eeb6204124c 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -1,22 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Hashable
+from collections.abc import Callable, Hashable
 from fractions import Fraction
-from typing import Callable, Optional, Union
 from weakref import WeakValueDictionary
 
 import torch
 from torch.nn import Parameter
 
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 
 __all__ = [
-    "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter",
-    "ModelWeightParameter", "ChannelQuantScaleParameter",
-    "GroupQuantScaleParameter", "PackedColumnParameter", "RowvLLMParameter"
+    "BasevLLMParameter",
+    "PackedvLLMParameter",
+    "PerTensorScaleParameter",
+    "ModelWeightParameter",
+    "ChannelQuantScaleParameter",
+    "GroupQuantScaleParameter",
+    "PackedColumnParameter",
+    "RowvLLMParameter",
 ]
 
 logger = init_logger(__name__)
@@ -29,8 +35,7 @@ class BasevLLMParameter(Parameter):
     into the parameter when the provided weight loader is called.
     """
 
-    def __new__(cls, data: Optional[torch.Tensor], **kwargs):
-
+    def __new__(cls, data: torch.Tensor | None, **kwargs):
         return super().__new__(cls, data=data, requires_grad=False)
 
     def __init__(self, data: torch.Tensor, weight_loader: Callable):
@@ -52,9 +57,9 @@ class BasevLLMParameter(Parameter):
         # This sometimes causes OOM errors during model loading. To avoid this,
         # we sync the param tensor after its weight loader is called.
         from vllm.platforms import current_platform
+
         if current_platform.use_sync_weight_loader():
-            weight_loader = current_platform.make_synced_weight_loader(
-                weight_loader)
+            weight_loader = current_platform.make_synced_weight_loader(weight_loader)
 
         self._weight_loader = weight_loader
         self.tp_rank = get_tensor_model_parallel_rank()
@@ -65,10 +70,11 @@ class BasevLLMParameter(Parameter):
         # NOTE(@ksayers) some models such as mamba_mixer2 override the
         # weight loader to support custom loading. In the future, model-specific
         # weight loading should be implemented via Model.load_weights. In the
-        # meantime, support deleting and overriding `weight_loader`` attribute
+        # meantime, support deleting and overriding `weight_loader` attribute
         if self._weight_loader is None:
-            raise AttributeError(f"{self.__class__.__name__} weight_loader "
-                                 "attribute has been deleted")
+            raise AttributeError(
+                f"{self.__class__.__name__} weight_loader attribute has been deleted"
+            )
         return self._weight_loader
 
     @weight_loader.setter
@@ -82,11 +88,12 @@ class BasevLLMParameter(Parameter):
     def _is_1d_and_scalar(self, loaded_weight: torch.Tensor):
         cond1 = self.data.ndim == 1 and self.data.numel() == 1
         cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1
-        return (cond1 and cond2)
+        return cond1 and cond2
 
     def _assert_and_load(self, loaded_weight: torch.Tensor):
-        assert (self.data.shape == loaded_weight.shape
-                or self._is_1d_and_scalar(loaded_weight))
+        assert self.data.shape == loaded_weight.shape or self._is_1d_and_scalar(
+            loaded_weight
+        )
         self.data.copy_(loaded_weight)
 
     def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
@@ -101,7 +108,7 @@ class BasevLLMParameter(Parameter):
     def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
         self._assert_and_load(loaded_weight)
 
-    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+    def _shard_id_as_int(self, shard_id: str | int) -> int:
         if isinstance(shard_id, int):
             return shard_id
 
@@ -121,11 +128,11 @@ class BasevLLMParameter(Parameter):
 
 class _ColumnvLLMParameter(BasevLLMParameter):
     """
-    Private class defining weight loading functionality 
+    Private class defining weight loading functionality
     (load_merged_column_weight, load_qkv_weight)
     for parameters being loaded into linear layers with column
     parallelism. This includes QKV and MLP layers which are
-    not already fused on disk. Requires an output dimension 
+    not already fused on disk. Requires an output dimension
     to be defined. Called within the weight loader of
     each of the column parallel linear layers.
     """
@@ -140,57 +147,55 @@ class _ColumnvLLMParameter(BasevLLMParameter):
 
     def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
         shard_size = self.data.shape[self.output_dim]
-        loaded_weight = loaded_weight.narrow(self.output_dim,
-                                             self.tp_rank * shard_size,
-                                             shard_size)
+        loaded_weight = loaded_weight.narrow(
+            self.output_dim, self.tp_rank * shard_size, shard_size
+        )
         assert self.data.shape == loaded_weight.shape
         self.data.copy_(loaded_weight)
 
     def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
-
         shard_offset = kwargs.get("shard_offset")
         shard_size = kwargs.get("shard_size")
 
         # TODO: move these to PackedColumnParameter and PackedvLLMParameter
-        if isinstance(
-                self,
-            (PackedColumnParameter,
-             PackedvLLMParameter)) and self.packed_dim == self.output_dim:
+        if (
+            isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+            and self.packed_dim == self.output_dim
+        ):
             shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
-                shard_offset=shard_offset, shard_size=shard_size)
+                shard_offset=shard_offset, shard_size=shard_size
+            )
 
         param_data = self.data
 
-        param_data = param_data.narrow(self.output_dim, shard_offset,
-                                       shard_size)
-        loaded_weight = loaded_weight.narrow(self.output_dim,
-                                             self.tp_rank * shard_size,
-                                             shard_size)
+        param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+        loaded_weight = loaded_weight.narrow(
+            self.output_dim, self.tp_rank * shard_size, shard_size
+        )
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
     def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
-
         shard_offset = kwargs.get("shard_offset")
         shard_size = kwargs.get("shard_size")
         shard_id = kwargs.get("shard_id")
         num_heads = kwargs.get("num_heads")
 
         # TODO: move these to PackedColumnParameter and PackedvLLMParameter
-        if isinstance(
-                self,
-            (PackedColumnParameter,
-             PackedvLLMParameter)) and self.output_dim == self.packed_dim:
+        if (
+            isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+            and self.output_dim == self.packed_dim
+        ):
             shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
-                shard_offset=shard_offset, shard_size=shard_size)
+                shard_offset=shard_offset, shard_size=shard_size
+            )
 
         param_data = self.data
-        shard_id = (self.tp_rank if shard_id == "q" else self.tp_rank //
-                    num_heads)
-        param_data = param_data.narrow(self.output_dim, shard_offset,
-                                       shard_size)
-        loaded_weight = loaded_weight.narrow(self.output_dim,
-                                             shard_id * shard_size, shard_size)
+        shard_id = self.tp_rank if shard_id == "q" else self.tp_rank // num_heads
+        param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+        loaded_weight = loaded_weight.narrow(
+            self.output_dim, shard_id * shard_size, shard_size
+        )
 
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
@@ -214,9 +219,9 @@ class RowvLLMParameter(BasevLLMParameter):
 
     def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
         shard_size = self.data.shape[self.input_dim]
-        loaded_weight = loaded_weight.narrow(self.input_dim,
-                                             self.tp_rank * shard_size,
-                                             shard_size)
+        loaded_weight = loaded_weight.narrow(
+            self.input_dim, self.tp_rank * shard_size, shard_size
+        )
 
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
@@ -230,6 +235,7 @@ class ModelWeightParameter(_ColumnvLLMParameter, RowvLLMParameter):
     Parameter class for linear layer weights. Uses both column and
     row parallelism.
     """
+
     pass
 
 
@@ -238,6 +244,7 @@ class GroupQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
     Parameter class for weight scales loaded for weights with
     grouped quantization. Uses both column and row parallelism.
     """
+
     pass
 
 
@@ -246,6 +253,7 @@ class ChannelQuantScaleParameter(_ColumnvLLMParameter):
     Parameter class for weight scales loaded for weights with
     channel-wise quantization. Equivalent to _ColumnvLLMParameter.
     """
+
     pass
 
 
@@ -256,11 +264,11 @@ class PerTensorScaleParameter(BasevLLMParameter):
     layers (e.g. for QKV, there are 3 scales loaded from disk).
     This is relevant to weights with per-tensor quantization.
     Adds functionality to map the scalers to a shard during
-    weight loading. 
+    weight loading.
 
-    Note: additional parameter manipulation may be handled 
-    for each quantization config specifically, within 
-    process_weights_after_loading 
+    Note: additional parameter manipulation may be handled
+    for each quantization config specifically, within
+    process_weights_after_loading
     """
 
     def __init__(self, **kwargs):
@@ -280,10 +288,11 @@ class PerTensorScaleParameter(BasevLLMParameter):
     def load_column_parallel_weight(self, *args, **kwargs):
         super().load_row_parallel_weight(*args, **kwargs)
 
-    def _load_into_shard_id(self, loaded_weight: torch.Tensor,
-                            shard_id: Union[str, int], **kwargs):
+    def _load_into_shard_id(
+        self, loaded_weight: torch.Tensor, shard_id: str | int, **kwargs
+    ):
         """
-        Slice the parameter data based on the shard id for 
+        Slice the parameter data based on the shard id for
         loading.
         """
 
@@ -308,12 +317,14 @@ class PackedColumnParameter(_ColumnvLLMParameter):
     for more details on the packed properties.
     """
 
-    def __init__(self,
-                 packed_factor: Union[int, Fraction],
-                 packed_dim: int,
-                 marlin_tile_size: Optional[int] = None,
-                 bitblas_tile_size: Optional[int] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        packed_factor: int | Fraction,
+        packed_dim: int,
+        marlin_tile_size: int | None = None,
+        bitblas_tile_size: int | None = None,
+        **kwargs,
+    ):
         self._packed_factor = packed_factor
         self._packed_dim = packed_dim
         self._marlin_tile_size = marlin_tile_size
@@ -342,7 +353,8 @@ class PackedColumnParameter(_ColumnvLLMParameter):
             shard_offset=shard_offset,
             packed_factor=self.packed_factor,
             marlin_tile_size=self.marlin_tile_size,
-            bitblas_tile_size=self.bitblas_tile_size)
+            bitblas_tile_size=self.bitblas_tile_size,
+        )
 
 
 class PackedvLLMParameter(ModelWeightParameter):
@@ -351,17 +363,19 @@ class PackedvLLMParameter(ModelWeightParameter):
     Example: GPTQ Marlin weights are int4 or int8, packed into int32.
     Extends the ModelWeightParameter to take in the
     packed factor, the packed dimension, and optionally, marlin
-    tile size for marlin kernels. Adjusts the shard_size and 
+    tile size for marlin kernels. Adjusts the shard_size and
     shard_offset for fused linear layers model weight loading
     by accounting for packing and optionally, marlin tile size.
     """
 
-    def __init__(self,
-                 packed_factor: Union[int, Fraction],
-                 packed_dim: int,
-                 marlin_tile_size: Optional[int] = None,
-                 bitblas_tile_size: Optional[int] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        packed_factor: int | Fraction,
+        packed_dim: int,
+        marlin_tile_size: int | None = None,
+        bitblas_tile_size: int | None = None,
+        **kwargs,
+    ):
         self._packed_factor = packed_factor
         self._packed_dim = packed_dim
         self._marlin_tile_size = marlin_tile_size
@@ -390,7 +404,8 @@ class PackedvLLMParameter(ModelWeightParameter):
             shard_offset=shard_offset,
             packed_factor=self.packed_factor,
             marlin_tile_size=self.marlin_tile_size,
-            bitblas_tile_size=self.bitblas_tile_size)
+            bitblas_tile_size=self.bitblas_tile_size,
+        )
 
 
 class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
@@ -410,6 +425,7 @@ class SharedWeightParameter(BasevLLMParameter):
     `MergedColumnParallelLinear`, the transform weights must stay separate
     tensors in order to allow for tensor memory sharing between layers.
     """
+
     # global registry for sharing tensors based on passed `data_key`
     # this dict holds weaksrefs to avoid memory leak after model cleanup
     tensors_registry: WeakValueDictionary = WeakValueDictionary()
@@ -420,14 +436,13 @@ class SharedWeightParameter(BasevLLMParameter):
     local_tensors: set[torch.Tensor]
 
     # dictionary mapping partition indices to associated parameters
-    partitions: dict[int, Union[ModelWeightParameter, Parameter]]
+    partitions: dict[int, ModelWeightParameter | Parameter]
 
     def __new__(cls, **kwargs):
         return super().__new__(cls, data=None, **kwargs)
 
     def __init__(self, input_dim: int = 1, output_dim: int = 0, **kwargs):
-        weight_loader: Callable = kwargs.get(
-            "weight_loader")  # type: ignore[assignment]
+        weight_loader: Callable = kwargs.get("weight_loader")  # type: ignore[assignment]
         super().__init__(data=None, weight_loader=weight_loader)
 
         self.local_tensors = set()
@@ -435,12 +450,14 @@ class SharedWeightParameter(BasevLLMParameter):
         self.kwargs = {
             "input_dim": input_dim,
             "output_dim": output_dim,
-            "weight_loader": self._fake_weight_loader
+            "weight_loader": self._fake_weight_loader,
         }
 
         if self.tp_size > 1:
-            raise NotImplementedError(f"{self.__class__.__name__} does not "
-                                      "currently support tensor parallelism")
+            raise NotImplementedError(
+                f"{self.__class__.__name__} does not "
+                "currently support tensor parallelism"
+            )
 
     def add_partition(self, index: int, data_key: Hashable, *args, **kwargs):
         """
@@ -460,8 +477,7 @@ class SharedWeightParameter(BasevLLMParameter):
             data = self.tensors_registry[data_key]
 
         # create associated model parameter
-        self.partitions[index] = ModelWeightParameter(
-            data=data, **self.kwargs)  # type: ignore[arg-type]
+        self.partitions[index] = ModelWeightParameter(data=data, **self.kwargs)  # type: ignore[arg-type]
 
         # hold local reference, since ModelWeightParameter does not
         # see https://github.com/pytorch/pytorch/issues/75932
@@ -471,8 +487,7 @@ class SharedWeightParameter(BasevLLMParameter):
         assert len(self.partitions) == 1 and 0 in self.partitions
         partition = self.partitions[0]
 
-        ModelWeightParameter.load_column_parallel_weight(
-            partition, loaded_weight)
+        ModelWeightParameter.load_column_parallel_weight(partition, loaded_weight)
 
     def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
         assert len(self.partitions) == 1 and 0 in self.partitions
@@ -490,10 +505,8 @@ class SharedWeightParameter(BasevLLMParameter):
         shard_offset = self.tp_rank * shard_size
 
         ModelWeightParameter.load_merged_column_weight(
-            partition,
-            loaded_weight,
-            shard_offset=shard_offset,
-            shard_size=shard_size)
+            partition, loaded_weight, shard_offset=shard_offset, shard_size=shard_size
+        )
 
     def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
         partition_id = self._shard_id_as_int(kwargs.pop("shard_id"))
@@ -517,33 +530,42 @@ class SharedWeightParameter(BasevLLMParameter):
     def process_weights_after_loading(self):
         for key in self.partitions:
             self.partitions[key] = torch.nn.Parameter(
-                data=self.partitions[key].data, requires_grad=False)
+                data=self.partitions[key].data, requires_grad=False
+            )
 
     @property
     def data(self):
-        raise ValueError("Accessing `data` of a "
-                         "`PartitionedModelWeightParameter` is not allowed. "
-                         "Instead, use `get_partition` to get the weight of "
-                         "the particular partition you want to access")
+        raise ValueError(
+            "Accessing `data` of a "
+            "`PartitionedModelWeightParameter` is not allowed. "
+            "Instead, use `get_partition` to get the weight of "
+            "the particular partition you want to access"
+        )
 
-    def _fake_weight_loader(self, param: BasevLLMParameter,
-                            loaded_weight: torch.Tensor,
-                            loaded_weight_shard_id: Optional[Union[str, int]]):
-        raise ValueError("When loading partition weights of "
-                         f"{self.__class__.__name__}, use methods provided by "
-                         f"{self.__class__.__name__}, not partition loader")
+    def _fake_weight_loader(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_weight_shard_id: str | int | None,
+    ):
+        raise ValueError(
+            "When loading partition weights of "
+            f"{self.__class__.__name__}, use methods provided by "
+            f"{self.__class__.__name__}, not partition loader"
+        )
 
 
-def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
-                          output_dim: int, **kwargs) -> BasevLLMParameter:
+def permute_param_layout_(
+    param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs
+) -> BasevLLMParameter:
     """
-    Permute a parameter's layout to the specified input and output dimensions, 
+    Permute a parameter's layout to the specified input and output dimensions,
     useful for forcing the parameter into a known layout, for example, if I need
-    a packed (quantized) weight matrix to be in the layout 
+    a packed (quantized) weight matrix to be in the layout
         {input_dim = 0, output_dim = 1, packed_dim = 0}
     then I can call:
         permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
-    to ensure x is in the correct layout (permuting it to the correct layout if 
+    to ensure x is in the correct layout (permuting it to the correct layout if
     required, asserting if it cannot get it to the correct layout)
     """
 
@@ -551,35 +573,34 @@ def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
     curr_output_dim = getattr(param, "output_dim", None)
 
     if curr_input_dim is None or curr_output_dim is None:
-        assert param.data.dim() == 2,\
-            "permute_param_layout_ only supports 2D parameters when either "\
+        assert param.data.dim() == 2, (
+            "permute_param_layout_ only supports 2D parameters when either "
             "input_dim or output_dim is not set"
+        )
 
     # if one of the dimensions is not set, set it to the opposite of the other
     #  we can only do this since we asserted the parameter is 2D above
     if curr_input_dim is None:
-        assert curr_output_dim is not None,\
-            "either input or output dim must be set"
+        assert curr_output_dim is not None, "either input or output dim must be set"
         curr_input_dim = (curr_output_dim + 1) % 2
     if curr_output_dim is None:
-        assert curr_input_dim is not None,\
-            "either input or output dim must be set"
+        assert curr_input_dim is not None, "either input or output dim must be set"
         curr_output_dim = (curr_input_dim + 1) % 2
 
     # create permutation from the current layout to the layout with
     # self.input_dim at input_dim and self.output_dim at output_dim preserving
     # other dimensions
     perm = [
-        i for i in range(param.data.dim())
-        if i not in [curr_input_dim, curr_output_dim]
+        i for i in range(param.data.dim()) if i not in [curr_input_dim, curr_output_dim]
     ]
     perm.insert(input_dim, curr_input_dim)
     perm.insert(output_dim, curr_output_dim)
 
     if "packed_dim" in kwargs:
-        assert hasattr(param, "packed_dim") and\
-            param.packed_dim == perm[kwargs["packed_dim"]],\
-            "permute_param_layout_ currently doesn't support repacking"
+        assert (
+            hasattr(param, "packed_dim")
+            and param.packed_dim == perm[kwargs["packed_dim"]]
+        ), "permute_param_layout_ currently doesn't support repacking"
 
     param.data = param.data.permute(*perm)
     if hasattr(param, "_input_dim"):
@@ -592,29 +613,30 @@ def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
     return param
 
 
-def _adjust_shard_indexes_for_marlin(shard_size, shard_offset,
-                                     marlin_tile_size):
+def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
 
 
-def _adjust_shard_indexes_for_bitblas(shard_size, shard_offset,
-                                      bitblas_tile_size):
+def _adjust_shard_indexes_for_bitblas(shard_size, shard_offset, bitblas_tile_size):
     return shard_size // bitblas_tile_size, shard_offset // bitblas_tile_size
 
 
-def _adjust_shard_indexes_for_packing(shard_size, shard_offset, packed_factor,
-                                      marlin_tile_size, bitblas_tile_size):
+def _adjust_shard_indexes_for_packing(
+    shard_size, shard_offset, packed_factor, marlin_tile_size, bitblas_tile_size
+):
     shard_size = shard_size // packed_factor
     shard_offset = shard_offset // packed_factor
     if marlin_tile_size is not None:
         return _adjust_shard_indexes_for_marlin(
             shard_size=shard_size,
             shard_offset=shard_offset,
-            marlin_tile_size=marlin_tile_size)
+            marlin_tile_size=marlin_tile_size,
+        )
     elif bitblas_tile_size is not None:
         return _adjust_shard_indexes_for_bitblas(
             shard_size=shard_size,
             shard_offset=shard_offset,
-            bitblas_tile_size=bitblas_tile_size)
+            bitblas_tile_size=bitblas_tile_size,
+        )
 
     return shard_size, shard_offset
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 543918418953bc40aa6bb684457d26284f4286e1..759b809433b14bf3605cc78d8ea9f78ffa863bee 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -3,10 +3,12 @@
 """Utils for model executor."""
 
 import copy
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
 
 def set_random_seed(seed: int) -> None:
     from vllm.platforms import current_platform
@@ -16,7 +18,7 @@ def set_random_seed(seed: int) -> None:
 
 def set_weight_attrs(
     weight: torch.Tensor,
-    weight_attrs: Optional[dict[str, Any]],
+    weight_attrs: dict[str, Any] | None,
 ):
     """Set attributes on a weight tensor.
 
@@ -30,8 +32,7 @@ def set_weight_attrs(
     if weight_attrs is None:
         return
     for key, value in weight_attrs.items():
-        assert not hasattr(
-            weight, key), f"Overwriting existing tensor attribute: {key}"
+        assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
 
         # NOTE(woosuk): During weight loading, we often do something like:
         # narrowed_tensor = param.data.narrow(0, offset, len)
@@ -44,8 +45,7 @@ def set_weight_attrs(
         # TODO(woosuk): Remove this hack once we have a better solution.
         from vllm.platforms import current_platform
 
-        if current_platform.use_sync_weight_loader(
-        ) and key == "weight_loader":
+        if current_platform.use_sync_weight_loader() and key == "weight_loader":
             value = current_platform.make_synced_weight_loader(value)
         setattr(weight, key, value)
 
@@ -63,18 +63,19 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
         child_map = getattr(child, "packed_modules_mapping", None)
         child_map = copy.deepcopy(child_map) if child_map is not None else {}
 
-        if any((k in parent_map and parent_map[k] != v)
-               for k, v in child_map.items()):
+        if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()):
             raise ValueError(
                 f"Can't update {type(model).__name__}'s packed_modules_mapping "
-                f"safely because of conflicts from {type(child).__name__}.")
+                f"safely because of conflicts from {type(child).__name__}."
+            )
         else:
             parent_map.update(child_map)
     return parent_map
 
 
 def get_moe_expert_mapping(
-    model: torch.nn.Module, ) -> list[tuple[str, str, int, str]]:
+    model: torch.nn.Module,
+) -> list[tuple[str, str, int, str]]:
     if parent_map := getattr(model, "get_expert_mapping", None):
         return parent_map()
     else:
@@ -84,3 +85,10 @@ def get_moe_expert_mapping(
             if child_map is not None:
                 return child_map()
         return []
+
+
+def maybe_disable_graph_partition(current_backend: str) -> dict[str, bool]:
+    if current_backend == "inductor" and is_torch_equal_or_newer("2.9.0.dev"):
+        return {"graph_partition": False}
+    else:
+        return {}
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index e495f9ee44724552e39e6b085cd162bfa15238c8..e0c584df8760bc9a9508b89e61ce8728d1ba15d5 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -12,20 +12,73 @@ from tqdm import tqdm
 import vllm.envs as envs
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
-from vllm.model_executor.layers.fused_moe.deep_gemm_utils import (
-    compute_aligned_M, deep_gemm_block_shape)
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-    TritonOrDeepGemmExperts)
+    TritonOrDeepGemmExperts,
+)
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
-from vllm.utils.deep_gemm import fp8_gemm_nt, m_grouped_fp8_gemm_nt_contiguous
+from vllm.utils.deep_gemm import (
+    fp8_gemm_nt,
+    get_mk_alignment_for_contiguous_layout,
+    m_grouped_fp8_gemm_nt_contiguous,
+)
+
+
+def _generate_optimal_warmup_m_values(
+    max_tokens: int, n: int, device: torch.device
+) -> list[int]:
+    """
+    Generate M values that cover all possible DeepGEMM kernel configurations.
+    Reference: https://github.com/deepseek-ai/DeepGEMM/blob/79f48ee15a82dd5fad5cd9beaa393c1f755e6b55/csrc/jit_kernels/heuristics/common.hpp
+
+    Args:
+        max_tokens: Maximum number of tokens to warmup for
+        n: The actual N dimension from the weight tensor
+        device: The torch device to get properties from.
+    """
+
+    def ceil_div(a: int, b: int) -> int:
+        return (a + b - 1) // b
+
+    # DeepGEMM's possible block sizes
+    block_ms = [64, 128, 256]
+    block_ns = list(range(16, min(257, n + 1), 16))
+    num_sms = torch.cuda.get_device_properties(device).multi_processor_count
+
+    m_values = set()
+
+    # Always include small cases
+    m_values.update([1, 2, 4] + [i for i in range(8, 65, 8)])
+
+    # Collect M values where different wave patterns occur
+    for block_m in block_ms:
+        for block_n in block_ns:
+            if block_n > n:
+                continue
+
+            # Add key M boundaries for this block combination
+            for wave in range(1, 11):  # Up to 10 waves
+                # M where this block config transitions to next wave
+                target_blocks = wave * num_sms
+                m = target_blocks * block_m // ceil_div(n, block_n)
+                if 1 <= m <= max_tokens:
+                    m_values.add(m)
+
+            # Add block_m boundaries
+            for multiple in range(1, max_tokens // block_m + 1):
+                m = multiple * block_m
+                if m <= max_tokens:
+                    m_values.add(m)
+
+    return sorted(m_values)
 
 
 def _extract_data_from_linear_base_module(
-        m: torch.nn.Module) -> tuple[torch.Tensor, torch.Tensor, list[int]]:
+    m: torch.nn.Module,
+) -> tuple[torch.Tensor, torch.Tensor, list[int]]:
     """
     Extract weights, weight scales and quantization block sizes from the given
     LinearBase module.
@@ -46,18 +99,24 @@ def _extract_data_from_linear_base_module(
 
 
 def _extract_data_from_fused_moe_module(
-    m: torch.nn.Module
+    m: torch.nn.Module,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int]:
     """
     Extract weights, weight scales and num_topk from FusedMoE module.
     """
     assert isinstance(m, FusedMoE)
     w13 = m.w13_weight
-    w13_s = m.w13_weight_scale_inv if hasattr(
-        m, "w13_weight_scale_inv") else m.w13_weight_scale
+    w13_s = (
+        m.w13_weight_scale_inv
+        if hasattr(m, "w13_weight_scale_inv")
+        else m.w13_weight_scale
+    )
     w2 = m.w2_weight
-    w2_s = m.w2_weight_scale_inv if hasattr(
-        m, "w2_weight_scale_inv") else m.w2_weight_scale
+    w2_s = (
+        m.w2_weight_scale_inv
+        if hasattr(m, "w2_weight_scale_inv")
+        else m.w2_weight_scale
+    )
     num_topk = m.top_k
 
     assert isinstance(w13, torch.Tensor)
@@ -71,67 +130,86 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
     """
     Return True if the input module/layer could be processed with DeepGEMM.
     """
-    block_size = deep_gemm_block_shape()[0]
-    if not (isinstance(module, LinearBase)
-            and isinstance(module.quant_method, Fp8LinearMethod)
-            and module.quant_method.block_quant):
+    block_size = get_mk_alignment_for_contiguous_layout()[0]
+    if not (
+        isinstance(module, LinearBase)
+        and isinstance(module.quant_method, Fp8LinearMethod)
+        and module.quant_method.block_quant
+    ):
         return False
 
     w, _, block_sizes = _extract_data_from_linear_base_module(module)
-    return (block_sizes == deep_gemm_block_shape() and w.ndim == 2
-            and w.shape[0] % block_size == 0 and w.shape[1] % block_size == 0)
+    return (
+        block_sizes == get_mk_alignment_for_contiguous_layout()
+        and w.ndim == 2
+        and w.shape[0] % block_size == 0
+        and w.shape[1] % block_size == 0
+    )
 
 
 def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
+    if not (envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM):
+        return False
+
     if not isinstance(module, FusedMoE):
         return False
 
     moe_quant_config = module.quant_method.get_fused_moe_quant_config(module)
 
-    if (moe_quant_config is None
-            or moe_quant_config.quant_dtype != torch.float8_e4m3fn
-            or moe_quant_config.block_shape != deep_gemm_block_shape()):
+    if (
+        moe_quant_config is None
+        or moe_quant_config.quant_dtype != torch.float8_e4m3fn
+        or moe_quant_config.block_shape != get_mk_alignment_for_contiguous_layout()
+    ):
         return False
 
-    if not isinstance(module.quant_method.fused_experts,
-                      FusedMoEModularKernel):
-        # fused_experts could invoke deep_gemm_moe_fp8
+    if not isinstance(module.quant_method, FusedMoEModularMethod):
+        # modular kernels could invoke deep_gemm_moe_fp8
         return True
 
     mk: FusedMoEModularKernel = module.quant_method.fused_experts
     # Further check if the ModularKernel implementation uses the DeepGemmExperts
-    return isinstance(mk.fused_experts,
-                      (DeepGemmExperts, TritonOrDeepGemmExperts))
+    return isinstance(mk.fused_experts, (DeepGemmExperts, TritonOrDeepGemmExperts))
 
 
 FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set()
 
 
-def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor,
-                                 max_tokens: int):
+def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens: int):
     if w.size() in FP8_GEMM_NT_WARMUP_CACHE:
         return
 
     n, k = w.size()
-    block_m = deep_gemm_block_shape()[0]
+    block_m = get_mk_alignment_for_contiguous_layout()[0]
 
     device = w.device
-    a1q = torch.empty((max_tokens, k),
-                      device=device,
-                      dtype=torch.float8_e4m3fn)
-    a1q_scales = torch.empty((max_tokens, k // block_m),
-                             device=device,
-                             dtype=torch.float32)
+    a1q = torch.empty((max_tokens, k), device=device, dtype=torch.float8_e4m3fn)
+    a1q_scales = torch.empty(
+        (max_tokens, k // block_m), device=device, dtype=torch.float32
+    )
     out = torch.empty((max_tokens, n), device=device, dtype=torch.bfloat16)
 
-    pbar = tqdm(total=max_tokens,
-                desc=f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()})")
-    num_tokens = max_tokens
-    while num_tokens > 0:
-        fp8_gemm_nt((a1q[:num_tokens], a1q_scales[:num_tokens]), (w, ws),
-                    out[:num_tokens])
+    # Use optimal M values only if VLLM_DEEP_GEMM_WARMUP is set to "relax".
+    # Otherwise warmup all token sizes to avoid JIT compilation in hotpath
+    if envs.VLLM_DEEP_GEMM_WARMUP == "relax":
+        m_values = _generate_optimal_warmup_m_values(max_tokens, n, device)
+        desc = f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()}) [relaxed]"
+    else:
+        assert envs.VLLM_DEEP_GEMM_WARMUP == "full", (
+            "Expected "
+            'VLLM_DEEP_GEMM_WARMUP env to be set to "full" but got '
+            f"{envs.VLLM_DEEP_GEMM_WARMUP}"
+        )
+        m_values = list(range(1, max_tokens + 1))
+        desc = f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()}) [all tokens]"
+
+    pbar = tqdm(total=len(m_values), desc=desc)
+
+    for num_tokens in m_values:
+        fp8_gemm_nt(
+            (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, ws), out[:num_tokens]
+        )
         pbar.update(1)
-        num_tokens -= 1
 
     FP8_GEMM_NT_WARMUP_CACHE.add(w.size())
 
@@ -140,16 +218,22 @@ GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set()
 
 
 def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
-        w1: torch.Tensor, w2: torch.Tensor, w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor, num_topk: int, max_tokens: int):
-    if (w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
-            and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE):
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    num_topk: int,
+    max_tokens: int,
+):
+    if (
+        w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
+        and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
+    ):
         return
 
-    assert w1.size(0) == w2.size(0), (
-        "w1 and w2 must have the same number of experts")
+    assert w1.size(0) == w2.size(0), "w1 and w2 must have the same number of experts"
 
-    block_m = deep_gemm_block_shape()[0]
+    block_m = get_mk_alignment_for_contiguous_layout()[0]
     num_experts = w1.size(0)
     device = w1.device
 
@@ -159,41 +243,41 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
 
     # This is the maximum GroupedGemm M size that we expect to run
     # the grouped_gemm with.
-    MAX_M = compute_aligned_M(max_tokens,
-                              num_topk,
-                              num_experts,
-                              block_m,
-                              expert_tokens_meta=None)
+    MAX_M = compute_aligned_M(
+        max_tokens, num_topk, num_experts, block_m, expert_tokens_meta=None
+    )
     # Distribute expert-ids evenly.
     MAX_BLOCKS = MAX_M // block_m
-    expert_ids_block = torch.randint(low=0,
-                                     high=num_experts,
-                                     size=(MAX_BLOCKS, ),
-                                     device=device,
-                                     dtype=torch.int32)
+    expert_ids_block = torch.randint(
+        low=0, high=num_experts, size=(MAX_BLOCKS,), device=device, dtype=torch.int32
+    )
     expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
 
     def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
-
         _, n, k = w.size()
         a1q = torch.empty((MAX_M, k), device=device, dtype=torch.float8_e4m3fn)
-        a1q_scales = torch.empty((MAX_M, k // block_m),
-                                 device=device,
-                                 dtype=torch.float32)
+        a1q_scales = torch.empty(
+            (MAX_M, k // block_m), device=device, dtype=torch.float32
+        )
         out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
 
+        # Generate M values in block_m increments (already optimized for MoE)
+        m_values = list(range(block_m, MAX_M + 1, block_m))
+
         pbar = tqdm(
-            total=MAX_BLOCKS,
-            desc=
-            f"DeepGemm(m_grouped_fp8_gemm_nt_contiguous) warmup (W={w.size()})"
+            total=len(m_values),
+            desc=f"DeepGemm(m_grouped_fp8_gemm_nt_contiguous) warmup (W={w.size()}) "
+            f"[{len(m_values)} values, block_m={block_m}]",
         )
-        num_tokens = MAX_M
-        while num_tokens > 0:
+
+        for num_tokens in m_values:
             m_grouped_fp8_gemm_nt_contiguous(
-                (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale),
-                out[:num_tokens], expert_ids[:num_tokens])
+                (a1q[:num_tokens], a1q_scales[:num_tokens]),
+                (w, w_scale),
+                out[:num_tokens],
+                expert_ids[:num_tokens],
+            )
             pbar.update(1)
-            num_tokens = num_tokens - block_m
 
     for w, ws in [(w1, w1_scale), (w2, w2_scale)]:
         if w.size() not in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE:
@@ -202,27 +286,27 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
 
 
 def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int):
-    dg_modules = [
-        m for m in model.modules() if _fp8_linear_may_use_deep_gemm(m)
-    ]
+    dg_modules = [m for m in model.modules() if _fp8_linear_may_use_deep_gemm(m)]
 
     for dgm in dg_modules:
         w, ws, _ = _extract_data_from_linear_base_module(dgm)
         _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens)
 
 
-def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module,
-                                                   max_tokens: int):
+def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
+    model: torch.nn.Module, max_tokens: int
+):
     dg_modules = [
-        m for m in model.modules()
-        if _fused_moe_grouped_gemm_may_use_deep_gemm(m)
+        m for m in model.modules() if _fused_moe_grouped_gemm_may_use_deep_gemm(m)
     ]
 
     for dgm in dg_modules:
-        w13, w13_scale, w2, w2_scale, num_topk = (
-            _extract_data_from_fused_moe_module(dgm))
+        w13, w13_scale, w2, w2_scale, num_topk = _extract_data_from_fused_moe_module(
+            dgm
+        )
         _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
-            w13, w2, w13_scale, w2_scale, num_topk, max_tokens)
+            w13, w2, w13_scale, w2_scale, num_topk, max_tokens
+        )
 
 
 def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int):
diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 3f99340c290628f358add8c2550aba49ca498b94..95f5982bc8c7b18a6b3e092b2bbd5d22c504b24b 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -5,6 +5,7 @@ Warmup kernels used during model execution.
 This is useful specifically for JIT'ed kernels as we don't want JIT'ing to
 happen during model execution.
 """
+
 from typing import TYPE_CHECKING
 
 import torch
@@ -25,9 +26,11 @@ logger = init_logger(__name__)
 
 def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
-    do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
-                           and is_deep_gemm_supported()
-                           and not envs.VLLM_SKIP_DEEP_GEMM_WARMUP)
+    do_deep_gemm_warmup = (
+        envs.VLLM_USE_DEEP_GEMM
+        and is_deep_gemm_supported()
+        and envs.VLLM_DEEP_GEMM_WARMUP != "skip"
+    )
     if do_deep_gemm_warmup:
         model = worker.get_model()
         max_tokens = worker.scheduler_config.max_num_batched_tokens
@@ -46,9 +49,19 @@ def kernel_warmup(worker: "Worker"):
         except NotImplementedError:
             return False
 
-    if not worker.model_runner.is_pooling_model and all(
+    # NOTE: we add check for empty attn_groups to avoid errors when
+    # deploying models such as E instances and encoder-only models.
+    # As for those models, worker.model_runner.attn_groups is empty.
+    # This change is made during EPD feature development.
+    if (
+        not worker.model_runner.is_pooling_model
+        and worker.model_runner.attn_groups
+        and all(
             _is_flashinfer_backend(group.backend)
-            for groups in worker.model_runner.attn_groups for group in groups):
+            for groups in worker.model_runner.attn_groups
+            for group in groups
+        )
+    ):
         logger.info("Warming up FlashInfer attention.")
         # Warmup with mixed batch containing both prefill and decode tokens
         # This is to warm up both prefill and decode attention kernels
@@ -78,6 +91,8 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
         # So we only need to run with the max number of tokens.
-        runner._dummy_run(runner.scheduler_config.max_num_batched_tokens,
-                          skip_eplb=True,
-                          is_profile=True)
+        runner._dummy_run(
+            runner.scheduler_config.max_num_batched_tokens,
+            skip_eplb=True,
+            is_profile=True,
+        )
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 8ea79078465e60634e568a1935f117e8dd606930..b7cbb3bbc67e78f808f22d27a5de60bde5000f9d 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,10 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from .hasher import MultiModalHasher
-from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
-                     MultiModalDataDict, MultiModalKwargs,
-                     MultiModalKwargsItems, MultiModalPlaceholderDict,
-                     MultiModalUUIDDict, NestedTensors)
+from .inputs import (
+    BatchedTensorInputs,
+    ModalityData,
+    MultiModalDataBuiltins,
+    MultiModalDataDict,
+    MultiModalKwargs,
+    MultiModalKwargsItems,
+    MultiModalPlaceholderDict,
+    MultiModalUUIDDict,
+    NestedTensors,
+)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index d7e9d402a1f97934770435b4dd5faec932737029..53052ddc6343c41777e4508eed13dc5c6a1b06e4 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -3,12 +3,12 @@
 import base64
 from io import BytesIO
 from pathlib import Path
-from typing import Literal, Optional
+from typing import Literal
 
 import numpy as np
 import numpy.typing as npt
 
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 from .base import MediaIO
 
@@ -53,7 +53,7 @@ class AudioResampler:
 
     def __init__(
         self,
-        target_sr: Optional[float] = None,
+        target_sr: float | None = None,
         method: Literal["librosa", "scipy"] = "librosa",
     ):
         self.target_sr = target_sr
@@ -66,23 +66,25 @@ class AudioResampler:
         orig_sr: float,
     ) -> npt.NDArray[np.floating]:
         if self.target_sr is None:
-            raise RuntimeError("Audio resampling is not supported when "
-                               "`target_sr` is not provided")
+            raise RuntimeError(
+                "Audio resampling is not supported when `target_sr` is not provided"
+            )
         if self.method == "librosa":
-            return resample_audio_librosa(audio,
-                                          orig_sr=orig_sr,
-                                          target_sr=self.target_sr)
+            return resample_audio_librosa(
+                audio, orig_sr=orig_sr, target_sr=self.target_sr
+            )
         elif self.method == "scipy":
-            return resample_audio_scipy(audio,
-                                        orig_sr=orig_sr,
-                                        target_sr=self.target_sr)
+            return resample_audio_scipy(
+                audio, orig_sr=orig_sr, target_sr=self.target_sr
+            )
         else:
-            raise ValueError(f"Invalid resampling method: {self.method}. "
-                             "Supported methods are 'librosa' and 'scipy'.")
+            raise ValueError(
+                f"Invalid resampling method: {self.method}. "
+                "Supported methods are 'librosa' and 'scipy'."
+            )
 
 
 class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
-
     def __init__(self, **kwargs) -> None:
         super().__init__()
 
@@ -113,4 +115,4 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
             soundfile.write(buffer, audio, sr, format="WAV")
             data = buffer.getvalue()
 
-        return base64.b64encode(data).decode('utf-8')
+        return base64.b64encode(data).decode("utf-8")
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index faffddd57199de9986866e10bd92216511a6dba1..fef118a93c6cbda428094a8ea41fce225a1e20e6 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -9,7 +9,6 @@ _T = TypeVar("_T")
 
 
 class MediaIO(ABC, Generic[_T]):
-
     @abstractmethod
     def load_bytes(self, data: bytes) -> _T:
         raise NotImplementedError
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 642ec3fd7e3f7af6bc0acdf81bb3b3af99b49104..c1531cbfdc31dc6d030ccb5dced5e7264afa7bf2 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -5,23 +5,31 @@ import sys
 from abc import ABC, abstractmethod
 from collections.abc import Mapping, Sequence
 from multiprocessing.synchronize import Lock as LockType
-from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Generic, TypeAlias, TypeVar, cast
 
 import torch
-from typing_extensions import TypeAlias, override
+from typing_extensions import override
 
+import vllm.envs as envs
 from vllm.distributed.device_communicators.shm_object_storage import (
-    MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer)
-from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME
+    MsgpackSerde,
+    SingleWriterShmObjectStorage,
+    SingleWriterShmRingBuffer,
+)
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, LRUCache, MiB_bytes
-from vllm.utils.jsontree import (json_count_leaves, json_map_leaves,
-                                 json_reduce_leaves)
-
-from .inputs import (MultiModalBatchedField, MultiModalFeatureSpec,
-                     MultiModalFieldElem, MultiModalKwargs,
-                     MultiModalKwargsItem, MultiModalKwargsItems,
-                     NestedTensors)
+from vllm.utils.cache import CacheInfo, LRUCache
+from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves
+from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
+
+from .inputs import (
+    MultiModalBatchedField,
+    MultiModalFeatureSpec,
+    MultiModalFieldElem,
+    MultiModalKwargs,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    NestedTensors,
+)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -77,20 +85,19 @@ class MultiModalProcessorCacheItemMetadata:
         self.prompt_updates = prompt_updates
 
 
-MultiModalCacheValue = Union[
-    MultiModalProcessorCacheItem,
-    MultiModalProcessorCacheItemMetadata,
-    MultiModalKwargsItems,
-    MultiModalKwargsItem,
-    MultiModalKwargs,
-    Mapping[str, NestedTensors],
-]
+MultiModalCacheValue: TypeAlias = (
+    MultiModalProcessorCacheItem
+    | MultiModalProcessorCacheItemMetadata
+    | MultiModalKwargsItems
+    | MultiModalKwargsItem
+    | MultiModalKwargs
+    | Mapping[str, NestedTensors]
+)
 
 _V = TypeVar("_V", bound=MultiModalCacheValue)
 
 
 class MultiModalCache:
-
     @classmethod
     def get_leaf_size(cls, leaf: object) -> int:
         if isinstance(leaf, MultiModalProcessorCacheItem):
@@ -99,8 +106,15 @@ class MultiModalCache:
             return leaf.item_size
 
         # These are not subclasses of dict
-        if isinstance(leaf, (MultiModalKwargs, MultiModalKwargsItems,
-                             MultiModalKwargsItem, MultiModalFieldElem)):
+        if isinstance(
+            leaf,
+            (
+                MultiModalKwargs,
+                MultiModalKwargsItems,
+                MultiModalKwargsItem,
+                MultiModalFieldElem,
+            ),
+        ):
             return cls.get_item_size(leaf.data)  # type: ignore
 
         # sys.getsizeof doesn't work for tensors
@@ -116,8 +130,9 @@ class MultiModalCache:
         *,
         debug: bool = False,
     ) -> int:
-        size = json_reduce_leaves(operator.add,
-                                  json_map_leaves(cls.get_leaf_size, value))
+        size = json_reduce_leaves(
+            operator.add, json_map_leaves(cls.get_leaf_size, value)
+        )
 
         if debug:
             leaf_count = json_count_leaves(value)
@@ -241,17 +256,19 @@ class BaseMultiModalCache(ABC, Generic[_I, _O]):
         raise NotImplementedError
 
 
-MultiModalProcessorCacheInItem: TypeAlias = \
-    Optional[tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]]]
+MultiModalProcessorCacheInItem: TypeAlias = (
+    tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]] | None
+)
 
 
-MultiModalProcessorCacheOutItem: TypeAlias = \
-    tuple[Optional[MultiModalKwargsItem], Sequence["ResolvedPromptUpdate"]]
+MultiModalProcessorCacheOutItem: TypeAlias = tuple[
+    MultiModalKwargsItem | None, Sequence["ResolvedPromptUpdate"]
+]
 
 
 class BaseMultiModalProcessorCache(
-        BaseMultiModalCache[MultiModalProcessorCacheInItem,
-                            MultiModalProcessorCacheOutItem]):
+    BaseMultiModalCache[MultiModalProcessorCacheInItem, MultiModalProcessorCacheOutItem]
+):
     """The required interface for caches on P0."""
 
     @abstractmethod
@@ -285,6 +302,16 @@ class BaseMultiModalProcessorCache(
         """
         return [self.is_cached_item(mm_hash) for mm_hash in mm_hashes]
 
+    @abstractmethod
+    def make_stats(self, *, delta: bool = False) -> CacheInfo:
+        """
+        Get (and reset) the multi-modal cache stats.
+
+        Returns:
+            The current multi-modal caching stats.
+        """
+        raise NotImplementedError
+
 
 class MultiModalProcessorOnlyCache(BaseMultiModalProcessorCache):
     """
@@ -330,6 +357,10 @@ class MultiModalProcessorOnlyCache(BaseMultiModalProcessorCache):
     def clear_cache(self) -> None:
         self._cache.clear()
 
+    @override
+    def make_stats(self, *, delta: bool = False) -> CacheInfo:
+        return self._cache.stat(delta=delta)
+
 
 class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache):
     """
@@ -380,6 +411,10 @@ class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache):
     def clear_cache(self) -> None:
         self._cache.clear()
 
+    @override
+    def make_stats(self, *, delta: bool = False) -> CacheInfo:
+        return self._cache.stat(delta=delta)
+
 
 class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
     """
@@ -401,19 +436,31 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
 
         ring_buffer = SingleWriterShmRingBuffer(
             data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes),
-            name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
+            name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
             create=True,  # sender is the writer
         )
         self._shm_cache = SingleWriterShmObjectStorage(
-            max_object_size=mm_config.mm_shm_cache_max_object_size_mb *
-            MiB_bytes,
+            max_object_size=mm_config.mm_shm_cache_max_object_size_mb * MiB_bytes,
             n_readers=self.world_size,
             ring_buffer=ring_buffer,
             serde_class=MsgpackSerde,
         )
         # cache (prompt_updates, modality) for P0 only
-        self._p0_cache: dict[str, tuple[Sequence[ResolvedPromptUpdate],
-                                        str]] = {}
+        self._p0_cache: dict[str, tuple[Sequence[ResolvedPromptUpdate], str]] = {}
+
+        self._hits = 0
+        self._total = 0
+        self._last_info = CacheInfo(hits=0, total=0)
+
+    def _stat(self, *, delta: bool = False) -> CacheInfo:
+        info = CacheInfo(hits=self._hits, total=self._total)
+
+        if delta:
+            info_delta = info - self._last_info
+            self._last_info = info
+            info = info_delta
+
+        return info
 
     @override
     def is_cached_item(self, mm_hash: str) -> bool:
@@ -425,30 +472,33 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
         mm_item: MultiModalProcessorCacheInItem,
         mm_hash: str,
     ) -> MultiModalProcessorCacheOutItem:
-
         if self._shm_cache.is_cached(mm_hash):
+            self._hits += 1
+            self._total += 1
+
             address, monotonic_id = self._shm_cache.get_cached(mm_hash)
             prompt_updates, modality = self._p0_cache[mm_hash]
-            return self.address_as_item(address, monotonic_id,
-                                        modality), prompt_updates
+            return self.address_as_item(address, monotonic_id, modality), prompt_updates
 
         assert mm_item is not None, f"Expected a cached item for {mm_hash=}"
 
+        self._total += 1
+
         try:
             address, monotonic_id = self._shm_cache.put(mm_hash, mm_item[0])
             # Try to remove dangling items if p0 cache is too large.
             if len(self._p0_cache) >= 2 * len(self._shm_cache.key_index):
                 self.remove_dangling_items()
             self._p0_cache[mm_hash] = mm_item[1], mm_item[0].modality
-            address_item = self.address_as_item(address, monotonic_id,
-                                                mm_item[0].modality)
+            address_item = self.address_as_item(
+                address, monotonic_id, mm_item[0].modality
+            )
             return address_item, mm_item[1]
         except (ValueError, MemoryError) as e:
             # put may fail if the object is too large or
             # the cache is full.
             # In this case we log the error and keep the original mm_input.
-            logger.debug("Failed to cache mm_input with hash %s: %s", mm_hash,
-                         e)
+            logger.debug("Failed to cache mm_input with hash %s: %s", mm_hash, e)
             return mm_item
 
     @override
@@ -456,6 +506,14 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
         self._shm_cache.clear()
         self._p0_cache.clear()
 
+        self._hits = 0
+        self._total = 0
+        self._last_info = CacheInfo(hits=0, total=0)
+
+    @override
+    def make_stats(self, *, delta: bool = False) -> CacheInfo:
+        return self._stat(delta=delta)
+
     def remove_dangling_items(self) -> None:
         """Remove items that are no longer in the shared memory cache."""
         cached_hashes = self._shm_cache.key_index.keys()
@@ -463,8 +521,9 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
         for mm_hash in dangling_hashes:
             del self._p0_cache[mm_hash]
 
-    def address_as_item(self, address: int, monotonic_id: int,
-                        modality: str) -> MultiModalKwargsItem:
+    def address_as_item(
+        self, address: int, monotonic_id: int, modality: str
+    ) -> MultiModalKwargsItem:
         addr_elem = MultiModalFieldElem(
             modality=modality,
             key="address",
@@ -494,9 +553,10 @@ def _enable_processor_cache(
 
 def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool:
     parallel_config = vllm_config.parallel_config
-    supports_ipc_cache = ((parallel_config._api_process_count == 1
-                           and parallel_config.data_parallel_size == 1)
-                          or parallel_config.data_parallel_external_lb)
+    supports_ipc_cache = (
+        parallel_config._api_process_count == 1
+        and parallel_config.data_parallel_size == 1
+    ) or parallel_config.data_parallel_external_lb
 
     return supports_ipc_cache
 
@@ -515,7 +575,7 @@ def _enable_mm_input_shm_cache(vllm_config: "VllmConfig") -> bool:
 def processor_cache_from_config(
     vllm_config: "VllmConfig",
     mm_registry: "MultiModalRegistry",
-) -> Optional[BaseMultiModalProcessorCache]:
+) -> BaseMultiModalProcessorCache | None:
     """Return a `BaseMultiModalProcessorCache`, if enabled."""
     model_config = vllm_config.model_config
 
@@ -542,8 +602,8 @@ def processor_only_cache_from_config(
 
 
 class BaseMultiModalReceiverCache(
-        BaseMultiModalCache[Optional[MultiModalKwargsItem],
-                            MultiModalKwargsItem]):
+    BaseMultiModalCache[MultiModalKwargsItem | None, MultiModalKwargsItem]
+):
     """The required interface for caches on P1."""
 
     def get_and_update_features(
@@ -552,8 +612,7 @@ class BaseMultiModalReceiverCache(
     ) -> list["MultiModalFeatureSpec"]:
         """Update multimodal features with cached encoder outputs."""
         for feature in mm_features:
-            feature.data = self.get_and_update_item(feature.data,
-                                                    feature.identifier)
+            feature.data = self.get_and_update_item(feature.data, feature.identifier)
         return mm_features
 
 
@@ -581,7 +640,7 @@ class MultiModalReceiverCache(BaseMultiModalReceiverCache):
     @override
     def get_and_update_item(
         self,
-        mm_item: Optional[MultiModalKwargsItem],
+        mm_item: MultiModalKwargsItem | None,
         mm_hash: str,
     ) -> MultiModalKwargsItem:
         if (cached_item := self._cache.get(mm_hash)) is not None:
@@ -619,12 +678,11 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):
 
         ring_buffer = SingleWriterShmRingBuffer(
             data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes),
-            name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
+            name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
             create=False,  # Server is a reader
         )
         self._shm_cache = SingleWriterShmObjectStorage(
-            max_object_size=mm_config.mm_shm_cache_max_object_size_mb *
-            MiB_bytes,
+            max_object_size=mm_config.mm_shm_cache_max_object_size_mb * MiB_bytes,
             n_readers=self.world_size,
             ring_buffer=ring_buffer,
             serde_class=MsgpackSerde,
@@ -634,7 +692,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):
     @override
     def get_and_update_item(
         self,
-        mm_item: Optional[MultiModalKwargsItem],
+        mm_item: MultiModalKwargsItem | None,
         mm_hash: str,
     ) -> MultiModalKwargsItem:
         assert mm_item is not None, f"Expected an address item for {mm_hash=}"
@@ -653,7 +711,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):
 def engine_receiver_cache_from_config(
     vllm_config: "VllmConfig",
     mm_registry: "MultiModalRegistry",
-) -> Optional[BaseMultiModalReceiverCache]:
+) -> BaseMultiModalReceiverCache | None:
     """
     This is used in the engine process.
     Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
@@ -677,7 +735,7 @@ def worker_receiver_cache_from_config(
     vllm_config: "VllmConfig",
     mm_registry: "MultiModalRegistry",
     shared_worker_lock: LockType,
-) -> Optional[BaseMultiModalReceiverCache]:
+) -> BaseMultiModalReceiverCache | None:
     """
     This is used in the worker process.
     Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py
index 056f3d90596816cb5c47c6da57efce4c93421af6..4a288d2d238c29881528133a0ab190bf267da4da 100644
--- a/vllm/multimodal/evs.py
+++ b/vllm/multimodal/evs.py
@@ -13,30 +13,31 @@ import typing
 import torch
 
 
-def compute_retained_tokens_count(video_size_thw: torch.LongTensor,
-                                  spatial_merge_size: int, q: float) -> int:
+def compute_retained_tokens_count(
+    tokens_per_frame: int, num_frames: int, q: float
+) -> int:
     """
     Compute the number of retained tokens for a given video.
     Method ensures that we retain all the tokens from the first frame
     regardless of the pruning rate.
 
     Args:
-        video_size_thw: The size of the video in the format of (T, H, W).
-        spatial_merge_size: The size of the spatial merge.
+        tokens_per_frame: The number of tokens per frame.
+        num_frames: The total number of frames.
         q: The pruning rate.
 
     Returns:
         The number of retained tokens.
     """
-    T, H, W = map(int, video_size_thw)
-    min_num_tokens = (H // spatial_merge_size) * (W // spatial_merge_size)
-    evs_num_tokens = int(T * min_num_tokens * (1 - q))
+    total_tokens = tokens_per_frame * num_frames
+    evs_num_tokens = int(total_tokens * (1 - q))
+    min_num_tokens = tokens_per_frame
     return max(min_num_tokens, evs_num_tokens)
 
 
 def compute_retention_mask(
     video_embeds: torch.Tensor,
-    video_size_thw: torch.LongTensor,
+    video_size_thw: torch.LongTensor | tuple[int, int, int],
     spatial_merge_size: int,
     q: float,
 ) -> torch.Tensor:
@@ -55,7 +56,7 @@ def compute_retention_mask(
         `torch.Tensor`: The retention mask for the video embeddings of
             `(T * H * W // spatial_merge_size ^ 2)` shape.
     """
-    T, H, W = video_size_thw
+    T, H, W = map(int, video_size_thw)
 
     # Use reshape instead of einops to avoid graph breaks
     video_embeds = video_embeds.reshape(
@@ -64,25 +65,23 @@ def compute_retention_mask(
         W // spatial_merge_size,
         video_embeds.size(-1),
     )
-
+    tokens_per_frame = (H // spatial_merge_size) * (W // spatial_merge_size)
     # Core EVS
-    similarity = torch.nn.functional.cosine_similarity(video_embeds[1:, ...],
-                                                       video_embeds[:-1, ...],
-                                                       dim=-1)
+    similarity = torch.nn.functional.cosine_similarity(
+        video_embeds[1:, ...], video_embeds[:-1, ...], dim=-1
+    )
     dissimilarity = 1 - similarity
 
     # Always ensure we include all tokens from the first frame
     dissimilarity = torch.cat(
-        [255 * torch.ones_like(video_embeds[:1, :, :, 0]), dissimilarity],
-        dim=0)
+        [255 * torch.ones_like(video_embeds[:1, :, :, 0]), dissimilarity], dim=0
+    )
 
     dissimilarity_flat = dissimilarity.view(-1)
-    order = torch.argsort(dissimilarity_flat,
-                          dim=-1,
-                          descending=True,
-                          stable=True)
-    retain_num_tokens = compute_retained_tokens_count(video_size_thw,
-                                                      spatial_merge_size, q)
+    order = torch.argsort(dissimilarity_flat, dim=-1, descending=True, stable=True)
+    retain_num_tokens = compute_retained_tokens_count(
+        tokens_per_frame=tokens_per_frame, num_frames=T, q=q
+    )
     topk_indices = order[:retain_num_tokens]
 
     retention_mask = torch.zeros_like(dissimilarity_flat, dtype=torch.bool)
@@ -119,18 +118,34 @@ def compute_mrope_for_media(
     llm_grid_h = video_size_thw[1] // spatial_merge_size
     llm_grid_w = video_size_thw[2] // spatial_merge_size
 
-    t_index = ((torch.arange(llm_grid_t).view(-1, 1).expand(
-        -1, llm_grid_h * llm_grid_w).mul(
-            tokens_per_second * video_second_per_grid)).long().flatten())
-    h_index = (torch.arange(llm_grid_h).view(1, -1,
-                                             1).expand(llm_grid_t, -1,
-                                                       llm_grid_w).flatten())
-    w_index = (torch.arange(llm_grid_w).view(1, 1, -1).expand(
-        llm_grid_t, llm_grid_h, -1).flatten())
-    llm_grid_w = (torch.tensor([llm_grid_w
-                                ]).view(1, 1,
-                                        1).expand(llm_grid_t, llm_grid_h,
-                                                  llm_grid_w).flatten())
+    t_index = (
+        (
+            torch.arange(llm_grid_t)
+            .view(-1, 1)
+            .expand(-1, llm_grid_h * llm_grid_w)
+            .mul(tokens_per_second * video_second_per_grid)
+        )
+        .long()
+        .flatten()
+    )
+    h_index = (
+        torch.arange(llm_grid_h)
+        .view(1, -1, 1)
+        .expand(llm_grid_t, -1, llm_grid_w)
+        .flatten()
+    )
+    w_index = (
+        torch.arange(llm_grid_w)
+        .view(1, 1, -1)
+        .expand(llm_grid_t, llm_grid_h, -1)
+        .flatten()
+    )
+    llm_grid_w = (
+        torch.tensor([llm_grid_w])
+        .view(1, 1, 1)
+        .expand(llm_grid_t, llm_grid_h, llm_grid_w)
+        .flatten()
+    )
 
     positions = torch.stack([t_index, h_index, w_index, llm_grid_w], dim=1)
     return positions
@@ -183,7 +198,8 @@ def recompute_mrope_positions(
 
     # Tensors
     positions: torch.LongTensor = typing.cast(
-        torch.LongTensor, mrope_positions.clone())  # (3, N)
+        torch.LongTensor, mrope_positions.clone()
+    )  # (3, N)
     N = input_ids.numel()
 
     image_mask = input_ids.eq(image_token_id)
@@ -193,8 +209,7 @@ def recompute_mrope_positions(
 
     # Early exit: no media in this chunk
     if len(multimodal_positions) == 0:
-        delta = (int((positions.max().item() + 1) -
-                     N) if positions.numel() else -N)
+        delta = int((positions.max().item() + 1) - N) if positions.numel() else -N
         return positions, delta
 
     total_mm_tokens = torch.count_nonzero(media_mask)
@@ -203,12 +218,12 @@ def recompute_mrope_positions(
     # Early exit: we've updated positions for all media tokens
     # (and consequently - for all remaining text tokens)
     if seen_mm_tokens == total_mm_tokens:
-        delta = (int((positions.max().item() + 1) -
-                     N) if positions.numel() else -N)
+        delta = int((positions.max().item() + 1) - N) if positions.numel() else -N
         return positions, delta
 
-    vision_start_indices = (input_ids == vision_start_token_id).nonzero(
-        as_tuple=True)[0]
+    vision_start_indices = (input_ids == vision_start_token_id).nonzero(as_tuple=True)[
+        0
+    ]
 
     for mm_pos in multimodal_positions:
         # Each mm_pos can be a complete embedding for single media
@@ -218,8 +233,9 @@ def recompute_mrope_positions(
         # - Current prefill chunk has no vision start indexes at all
         # - Vision start token appeared in previous prefill round
         # - Regular case
-        seen_vision_start_indices = vision_start_indices[vision_start_indices <
-                                                         num_computed_tokens]
+        seen_vision_start_indices = vision_start_indices[
+            vision_start_indices < num_computed_tokens
+        ]
 
         if len(seen_vision_start_indices):
             # If we have encountered some vision start indexes,
@@ -228,19 +244,23 @@ def recompute_mrope_positions(
             # | TTTTTTTTTSVVVVVVVVVV|VVVVVVTTTTTTTTTTTTTTTT|
             last_vision_start_token = seen_vision_start_indices[-1]
             seem_mm_tokens_before_last_vision_start = torch.count_nonzero(
-                media_mask[:last_vision_start_token])
+                media_mask[:last_vision_start_token]
+            )
             in_the_middle_of_media = (
-                seen_mm_tokens > seem_mm_tokens_before_last_vision_start)
+                seen_mm_tokens > seem_mm_tokens_before_last_vision_start
+            )
 
             if in_the_middle_of_media:
-                mm_embeddings_seen = (seen_mm_tokens -
-                                      seem_mm_tokens_before_last_vision_start)
+                mm_embeddings_seen = (
+                    seen_mm_tokens - seem_mm_tokens_before_last_vision_start
+                )
                 global_mm_start = last_vision_start_token
             else:
                 # We have completed previous mm_embedding part and
                 # ready to start a new one
                 next_vision_start_token = vision_start_indices[
-                    vision_start_indices >= num_computed_tokens][0]
+                    vision_start_indices >= num_computed_tokens
+                ][0]
                 mm_embeddings_seen = 0
                 global_mm_start = next_vision_start_token
 
@@ -248,7 +268,8 @@ def recompute_mrope_positions(
             # If there were no vision start indexes so far,
             # let's find first vision start index
             next_vision_start_token = vision_start_indices[
-                vision_start_indices >= num_computed_tokens][0]
+                vision_start_indices >= num_computed_tokens
+            ][0]
 
             mm_embeddings_seen = 0
             global_mm_start = next_vision_start_token
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index df6c531d876ad47101971d85988c1d232f72462d..d0dcbb25fcce8911805c37824142c8643a04f1ba 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -4,7 +4,6 @@
 import pickle
 import uuid
 from collections.abc import Iterable
-from typing import Union
 
 import numpy as np
 import torch
@@ -17,23 +16,23 @@ logger = init_logger(__name__)
 
 
 class MultiModalHasher:
-
     @classmethod
-    def serialize_item(cls, obj: object) -> Iterable[Union[bytes, memoryview]]:
+    def serialize_item(cls, obj: object) -> Iterable[bytes | memoryview]:
         # Simple cases
         if isinstance(obj, (bytes, memoryview)):
-            return (obj, )
+            return (obj,)
         if isinstance(obj, str):
-            return (obj.encode("utf-8"), )
+            return (obj.encode("utf-8"),)
         if isinstance(obj, (int, float)):
-            return (np.array(obj).tobytes(), )
+            return (np.array(obj).tobytes(),)
 
         if isinstance(obj, Image.Image):
             exif = obj.getexif()
             if Image.ExifTags.Base.ImageID in exif and isinstance(
-                    exif[Image.ExifTags.Base.ImageID], uuid.UUID):
+                exif[Image.ExifTags.Base.ImageID], uuid.UUID
+            ):
                 # If the image has exif ImageID tag, use that
-                return (exif[Image.ExifTags.Base.ImageID].bytes, )
+                return (exif[Image.ExifTags.Base.ImageID].bytes,)
             data = {"mode": obj.mode, "data": np.asarray(obj)}
             if obj.palette is not None:
                 data["palette"] = obj.palette.palette
@@ -49,37 +48,42 @@ class MultiModalHasher:
             # Workaround: View the tensor as a contiguous 1D array of bytes
             if tensor_dtype == torch.bfloat16:
                 tensor_obj = tensor_obj.contiguous()
-                tensor_obj = tensor_obj.view(
-                    (tensor_obj.numel(), )).view(torch.uint8)
+                tensor_obj = tensor_obj.view((tensor_obj.numel(),)).view(torch.uint8)
 
                 return cls.iter_item_to_bytes(
-                    "tensor", {
+                    "tensor",
+                    {
                         "original_dtype": str(tensor_dtype),
                         "original_shape": tuple(tensor_shape),
                         "data": tensor_obj.numpy(),
-                    })
+                    },
+                )
             return cls.iter_item_to_bytes("tensor", tensor_obj.numpy())
         if isinstance(obj, np.ndarray):
             # If the array is non-contiguous, we need to copy it first
-            arr_data = obj.view(
-                np.uint8).data if obj.flags.c_contiguous else obj.tobytes()
-            return cls.iter_item_to_bytes("ndarray", {
-                "dtype": obj.dtype.str,
-                "shape": obj.shape,
-                "data": arr_data,
-            })
+            arr_data = (
+                obj.view(np.uint8).data if obj.flags.c_contiguous else obj.tobytes()
+            )
+            return cls.iter_item_to_bytes(
+                "ndarray",
+                {
+                    "dtype": obj.dtype.str,
+                    "shape": obj.shape,
+                    "data": arr_data,
+                },
+            )
         logger.warning(
-            "No serialization method found for %s. "
-            "Falling back to pickle.", type(obj))
+            "No serialization method found for %s. Falling back to pickle.", type(obj)
+        )
 
-        return (pickle.dumps(obj), )
+        return (pickle.dumps(obj),)
 
     @classmethod
     def iter_item_to_bytes(
         cls,
         key: str,
         obj: object,
-    ) -> Iterable[Union[bytes, memoryview]]:
+    ) -> Iterable[bytes | memoryview]:
         # Recursive cases
         if isinstance(obj, (list, tuple)):
             for i, elem in enumerate(obj):
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 1006c1ce4b2417791cf9299ede5b593e2db3dd47..21e8bef97a787232f4e06c34f36c33d5c42b0bef 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -3,7 +3,6 @@
 
 from io import BytesIO
 from pathlib import Path
-from typing import Union
 
 import pybase64
 import torch
@@ -12,9 +11,9 @@ from PIL import Image
 from .base import MediaIO
 
 
-def rescale_image_size(image: Image.Image,
-                       size_factor: float,
-                       transpose: int = -1) -> Image.Image:
+def rescale_image_size(
+    image: Image.Image, size_factor: float, transpose: int = -1
+) -> Image.Image:
     """Rescale the dimensions of an image by a constant factor."""
     new_width = int(image.width * size_factor)
     new_height = int(image.height * size_factor)
@@ -26,7 +25,7 @@ def rescale_image_size(image: Image.Image,
 
 def rgba_to_rgb(
     image: Image.Image,
-    background_color: Union[tuple[int, int, int], list[int]] = (255, 255, 255)
+    background_color: tuple[int, int, int] | list[int] = (255, 255, 255),
 ) -> Image.Image:
     """Convert an RGBA image to RGB with filled background color."""
     assert image.mode == "RGBA"
@@ -45,7 +44,6 @@ def convert_image_mode(image: Image.Image, to_mode: str):
 
 
 class ImageMediaIO(MediaIO[Image.Image]):
-
     def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
         super().__init__()
 
@@ -59,18 +57,21 @@ class ImageMediaIO(MediaIO[Image.Image]):
 
         # Extract RGBA background color from kwargs if provided
         # Default to white background for backward compatibility
-        rgba_bg = kwargs.get('rgba_background_color', (255, 255, 255))
+        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
         # Convert list to tuple for consistency
         if isinstance(rgba_bg, list):
             rgba_bg = tuple(rgba_bg)
 
         # Validate rgba_background_color format
-        if not (isinstance(rgba_bg, tuple) and len(rgba_bg) == 3
-                and all(isinstance(c, int) and 0 <= c <= 255
-                        for c in rgba_bg)):
+        if not (
+            isinstance(rgba_bg, tuple)
+            and len(rgba_bg) == 3
+            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
+        ):
             raise ValueError(
                 "rgba_background_color must be a list or tuple of 3 integers "
-                "in the range [0, 255].")
+                "in the range [0, 255]."
+            )
         self.rgba_background_color = rgba_bg
 
     def _convert_image_mode(self, image: Image.Image) -> Image.Image:
@@ -108,11 +109,10 @@ class ImageMediaIO(MediaIO[Image.Image]):
             image.save(buffer, image_format)
             data = buffer.getvalue()
 
-        return pybase64.b64encode(data).decode('utf-8')
+        return pybase64.b64encode(data).decode("utf-8")
 
 
 class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
-
     def __init__(self) -> None:
         super().__init__()
 
@@ -127,4 +127,4 @@ class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
         return torch.load(filepath, weights_only=True)
 
     def encode_base64(self, media: torch.Tensor) -> str:
-        return pybase64.b64encode(media.numpy()).decode('utf-8')
+        return pybase64.b64encode(media.numpy()).decode("utf-8")
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 3539517ed45ee9c016650941d60dcc4f3d17f0f9..7518a023c5f50cd81f02ef4f1a71b7f4aeffe3d2 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -7,13 +7,23 @@ from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from functools import partial
 from itertools import accumulate
-from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, Union,
-                    cast, final)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    Optional,
+    TypeAlias,
+    TypedDict,
+    Union,
+    cast,
+    final,
+)
 
 import numpy as np
-from typing_extensions import NotRequired, TypeAlias, TypeVar, deprecated
+from typing_extensions import NotRequired, TypeVar, deprecated
 
-from vllm.utils import LazyLoader, full_groupby, is_list_of
+from vllm.utils.collection_utils import full_groupby, is_list_of
+from vllm.utils.import_utils import LazyLoader
 from vllm.utils.jsontree import json_map_leaves
 
 if TYPE_CHECKING:
@@ -35,8 +45,9 @@ A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
 """
 
-HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor",
-                               list[np.ndarray], list["torch.Tensor"]]
+HfVideoItem: TypeAlias = Union[
+    list["Image"], np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]
+]
 """
 A `transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.
@@ -58,8 +69,9 @@ which are treated as image embeddings;
 these are directly passed to the model without HF processing.
 """
 
-VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor",
-                             tuple[HfVideoItem, dict[str, Any]]]
+VideoItem: TypeAlias = Union[
+    HfVideoItem, "torch.Tensor", tuple[HfVideoItem, dict[str, Any]]
+]
 """
 A `transformers.video_utils.VideoInput` representing a single video item. 
 This can be passed to a HuggingFace `VideoProcessor` 
@@ -70,8 +82,7 @@ which are treated as video embeddings;
 these are directly passed to the model without HF processing.
 """
 
-AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
-                             "torch.Tensor"]
+AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], "torch.Tensor"]
 """
 Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
@@ -85,7 +96,7 @@ which are treated as audio embeddings;
 these are directly passed to the model without HF processing.
 """
 
-ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None]
+ModalityData: TypeAlias = _T | list[_T | None] | None
 """
 Either a single data item, or a list of data items. Can only be None if UUID
 is provided.
@@ -117,7 +128,7 @@ The built-in modalities are defined by
 [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins].
 """
 
-MultiModalUUIDDict: TypeAlias = Mapping[str, Union[list[Optional[str]], str]]
+MultiModalUUIDDict: TypeAlias = Mapping[str, list[str | None] | str]
 """
 A dictionary containing user-provided UUIDs for items in each modality.
 If a UUID for an item is not provided, its entry will be `None` and
@@ -177,8 +188,12 @@ class PlaceholderRange:
         return nested_tensors_equal(self.is_embed, other.is_embed)
 
 
-NestedTensors: TypeAlias = Union[list["NestedTensors"], list["torch.Tensor"],
-                                 "torch.Tensor", tuple["torch.Tensor", ...]]
+NestedTensors: TypeAlias = Union[
+    list["NestedTensors"],
+    list["torch.Tensor"],
+    "torch.Tensor",
+    tuple["torch.Tensor", ...],
+]
 """
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
@@ -193,11 +208,13 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
         return isinstance(a, torch.Tensor) and torch.equal(b, a)
 
     if isinstance(a, list):
-        return (isinstance(b, list)
-                and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b)))
+        return isinstance(b, list) and all(
+            nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b)
+        )
     if isinstance(b, list):
-        return (isinstance(a, list)
-                and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a)))
+        return isinstance(a, list) and all(
+            nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a)
+        )
 
     # Both a and b are scalars
     return a == b
@@ -214,7 +231,7 @@ A dictionary containing nested tensors which have been batched via
 class MultiModalFeatureSpec:
     """
     Represents a single multimodal input with its processed data and metadata.
-    
+
     Used by the V1 engine to track multimodal data through processing and
     caching. A request containing multiple multimodal items will have one
     MultiModalFeatureSpec per item.
@@ -232,6 +249,19 @@ class MultiModalFeatureSpec:
     mm_position: PlaceholderRange
     """e.g., PlaceholderRange(offset=2, length=336)"""
 
+    @staticmethod
+    def gather_kwargs(features: list["MultiModalFeatureSpec"], keys: set[str]):
+        kwargs = defaultdict[str, list[NestedTensors]](list)
+
+        for f in features:
+            item = f.data
+            if item is not None:
+                for k in keys:
+                    if k in item:
+                        kwargs[k].append(item[k].data)
+
+        return dict(kwargs)
+
 
 @dataclass
 class MultiModalFieldElem:
@@ -280,9 +310,11 @@ class MultiModalFieldElem:
         else:
             data_equal = nested_tensors_equal(self.data, other.data)
 
-        return ((self.modality, self.key) == (other.modality, other.key)
-                and data_equal
-                and type(self.field) == type(other.field))  # noqa: E721
+        return (
+            (self.modality, self.key) == (other.modality, other.key)
+            and data_equal
+            and type(self.field) is type(other.field)
+        )  # noqa: E721
 
 
 @dataclass(frozen=True)
@@ -385,10 +417,12 @@ class MultiModalBatchedField(BaseMultiModalField):
                 return batch[0].unsqueeze(0).contiguous()
             first_shape = batch[0].shape
             if all(elem.shape == first_shape for elem in batch):
-                out = torch.empty((len(batch), *batch[0].shape),
-                                  dtype=batch[0].dtype,
-                                  device=batch[0].device,
-                                  pin_memory=pin_memory)
+                out = torch.empty(
+                    (len(batch), *batch[0].shape),
+                    dtype=batch[0].dtype,
+                    device=batch[0].device,
+                    pin_memory=pin_memory,
+                )
                 return torch.stack(batch, out=out)
 
         return batch
@@ -401,7 +435,8 @@ class MultiModalFlatField(BaseMultiModalField):
         [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
         [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
     """
-    slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
+
+    slices: Sequence[slice] | Sequence[Sequence[slice]]
     dim: int = 0
 
     def build_elems(
@@ -412,8 +447,9 @@ class MultiModalFlatField(BaseMultiModalField):
     ) -> Sequence[MultiModalFieldElem]:
         field_factory = self._field_factory(modality=modality, key=key)
         if not is_list_of(self.slices, slice, check="all"):
-            assert isinstance(data, torch.Tensor), \
+            assert isinstance(data, torch.Tensor), (
                 "torch.Tensor is required for multiple slices"
+            )
         return [field_factory(data[cast(slice, s)]) for s in self.slices]
 
     def _reduce_data(
@@ -433,17 +469,19 @@ class MultiModalFlatField(BaseMultiModalField):
             dim = self.dim + (self.dim < 0) * len(batch[0].shape)
 
             def _shape_before_after(tensor: torch.Tensor):
-                return tensor.shape[:dim], tensor.shape[dim + 1:]
+                return tensor.shape[:dim], tensor.shape[dim + 1 :]
 
             first_shape = _shape_before_after(batch[0])
 
             if all(_shape_before_after(elem) == first_shape for elem in batch):
                 shape_before, shape_after = first_shape
                 shape_concat = sum(item.shape[dim] for item in batch)
-                out = torch.empty((*shape_before, shape_concat, *shape_after),
-                                  dtype=batch[0].dtype,
-                                  device=batch[0].device,
-                                  pin_memory=pin_memory)
+                out = torch.empty(
+                    (*shape_before, shape_concat, *shape_after),
+                    dtype=batch[0].dtype,
+                    device=batch[0].device,
+                    pin_memory=pin_memory,
+                )
                 return torch.concat(batch, dim=self.dim, out=out)
 
         assert self.dim == 0, "dim == 0 is required for nested list"
@@ -456,6 +494,7 @@ class MultiModalSharedField(BaseMultiModalField):
     Info:
         [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
     """
+
     batch_size: int
 
     def build_elems(
@@ -477,7 +516,6 @@ class MultiModalSharedField(BaseMultiModalField):
 
 
 class MultiModalFieldConfig:
-
     @staticmethod
     def batched(modality: str):
         """
@@ -508,9 +546,11 @@ class MultiModalFieldConfig:
         )
 
     @staticmethod
-    def flat(modality: str,
-             slices: Union[Sequence[slice], Sequence[Sequence[slice]]],
-             dim: int = 0):
+    def flat(
+        modality: str,
+        slices: Sequence[slice] | Sequence[Sequence[slice]],
+        dim: int = 0,
+    ):
         """
         Defines a field where an element in the batch is obtained by
         slicing along the first dimension of the underlying data.
@@ -561,9 +601,7 @@ class MultiModalFieldConfig:
         )
 
     @staticmethod
-    def flat_from_sizes(modality: str,
-                        size_per_item: "torch.Tensor",
-                        dim: int = 0):
+    def flat_from_sizes(modality: str, size_per_item: "torch.Tensor", dim: int = 0):
         """
         Defines a field where an element in the batch is obtained by
         slicing along the first dimension of the underlying data.
@@ -609,13 +647,17 @@ class MultiModalFieldConfig:
         """
 
         if size_per_item.ndim != 1:
-            raise ValueError("size_per_item should be a 1-D tensor, "
-                             f"but found shape: {size_per_item.shape}")
+            raise ValueError(
+                "size_per_item should be a 1-D tensor, "
+                f"but found shape: {size_per_item.shape}"
+            )
 
         slice_idxs = [0, *accumulate(size_per_item)]
-        slices = [(slice(None, None, None), ) * dim +
-                  (slice(slice_idxs[i], slice_idxs[i + 1]), )
-                  for i in range(len(size_per_item))]
+        slices = [
+            (slice(None, None, None),) * dim
+            + (slice(slice_idxs[i], slice_idxs[i + 1]),)
+            for i in range(len(size_per_item))
+        ]
 
         return MultiModalFieldConfig.flat(modality, slices, dim=dim)
 
@@ -659,6 +701,9 @@ class MultiModalFieldConfig:
         self.field = field
         self.modality = modality
 
+    def __repr__(self) -> str:
+        return f"MultiModalFieldConfig(field={self.field}, modality={self.modality})"
+
     def build_elems(
         self,
         key: str,
@@ -708,7 +753,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
 _I = TypeVar(
     "_I",
     MultiModalKwargsItem,
-    Optional[MultiModalKwargsItem],
+    MultiModalKwargsItem | None,
     default=MultiModalKwargsItem,
 )
 
@@ -745,7 +790,8 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
             if len(set(batch_sizes.values())) > 1:
                 raise ValueError(
                     f"Cannot merge different batch sizes for {modality=}! "
-                    f"Found: {batch_sizes=}")
+                    f"Found: {batch_sizes=}"
+                )
 
             batch_size = next(iter(batch_sizes.values()))
             for item_idx in range(batch_size):
@@ -761,8 +807,10 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
 
     def __getitem__(self, modality: str) -> Sequence[_I]:
         if modality not in self:
-            raise KeyError(f"Modality {modality!r} not found. "
-                           f"Available modalities: {set(self.keys())}")
+            raise KeyError(
+                f"Modality {modality!r} not found. "
+                f"Available modalities: {set(self.keys())}"
+            )
 
         return super().__getitem__(modality)  # type: ignore[return-value]
 
@@ -770,8 +818,7 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
         for modality, items in self.items():
             for i, item in enumerate(items):
                 if item is None:
-                    raise RuntimeError(
-                        f"Found empty mm_items[{modality}][{i}]")
+                    raise RuntimeError(f"Found empty mm_items[{modality}][{i}]")
 
         return self  # type: ignore[return-value]
 
@@ -780,23 +827,25 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
         for modality, items in self.items():
             for i, item in enumerate(items):
                 if item is None:
-                    raise RuntimeError("Cannot build data from empty "
-                                       f"mm_items[{modality}][{i}]")
+                    raise RuntimeError(
+                        f"Cannot build data from empty mm_items[{modality}][{i}]"
+                    )
 
                 for key, elem in item.items():
                     elems_by_key[key].append(elem)
 
-        return MultiModalKwargs({
-            key:
-            elems[0].field.reduce_data(elems, pin_memory=pin_memory)
-            for key, elems in elems_by_key.items()
-        })
+        return MultiModalKwargs(
+            {
+                key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
+                for key, elems in elems_by_key.items()
+            }
+        )
 
 
-MultiModalKwargsOptionalItems: TypeAlias = Union[
-    MultiModalKwargsItems[MultiModalKwargsItem],
-    MultiModalKwargsItems[Optional[MultiModalKwargsItem]],
-]
+MultiModalKwargsOptionalItems: TypeAlias = (
+    MultiModalKwargsItems[MultiModalKwargsItem]
+    | MultiModalKwargsItems[MultiModalKwargsItem | None]
+)
 
 
 class MultiModalKwargs(UserDict[str, NestedTensors]):
@@ -806,33 +855,36 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
     """
 
     @staticmethod
-    @deprecated("`MultiModalKwargs.from_hf_inputs` is deprecated and "
-                "will be removed in v0.13. "
-                "Please use `MultiModalKwargsItems.from_hf_inputs` and "
-                "access the tensor data using `.get_data()`.")
+    @deprecated(
+        "`MultiModalKwargs.from_hf_inputs` is deprecated and "
+        "will be removed in v0.13. "
+        "Please use `MultiModalKwargsItems.from_hf_inputs` and "
+        "access the tensor data using `.get_data()`."
+    )
     def from_hf_inputs(
         hf_inputs: "BatchFeature",
         config_by_key: Mapping[str, MultiModalFieldConfig],
     ):
-        return MultiModalKwargsItems.from_hf_inputs(hf_inputs, config_by_key) \
-            .get_data()
+        return MultiModalKwargsItems.from_hf_inputs(hf_inputs, config_by_key).get_data()
 
     @staticmethod
-    @deprecated("`MultiModalKwargs.from_items` is deprecated and "
-                "will be removed in v0.13. "
-                "Please use `MultiModalKwargsItems.from_seq` and "
-                "access the tensor data using `.get_data()`.")
+    @deprecated(
+        "`MultiModalKwargs.from_items` is deprecated and "
+        "will be removed in v0.13. "
+        "Please use `MultiModalKwargsItems.from_seq` and "
+        "access the tensor data using `.get_data()`."
+    )
     def from_items(
         items: Sequence[MultiModalKwargsItem],
         *,
         pin_memory: bool = False,
     ):
-        return MultiModalKwargsItems.from_seq(items) \
-            .get_data(pin_memory=pin_memory)
+        return MultiModalKwargsItems.from_seq(items).get_data(pin_memory=pin_memory)
 
     @staticmethod
-    def _try_stack(nested_tensors: NestedTensors,
-                   pin_memory: bool = False) -> NestedTensors:
+    def _try_stack(
+        nested_tensors: NestedTensors, pin_memory: bool = False
+    ) -> NestedTensors:
         """
         Stack the inner dimensions that have the same shape in
         a nested list of tensors.
@@ -849,9 +901,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         if isinstance(nested_tensors, (int, float)):
             return torch.tensor(nested_tensors)
 
-        stacked = [
-            MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors
-        ]
+        stacked = [MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors]
         if not is_list_of(stacked, torch.Tensor, check="all"):
             # Only tensors (not lists) can be stacked.
             return stacked
@@ -867,16 +917,19 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
             # The tensors have incompatible shapes and can't be stacked.
             return tensors_
 
-        outputs = torch.empty(len(tensors_),
-                              *tensors_[0].shape,
-                              dtype=tensors_[0].dtype,
-                              device=tensors_[0].device,
-                              pin_memory=pin_memory)
+        outputs = torch.empty(
+            len(tensors_),
+            *tensors_[0].shape,
+            dtype=tensors_[0].dtype,
+            device=tensors_[0].device,
+            pin_memory=pin_memory,
+        )
         return torch.stack(tensors_, out=outputs)
 
     @staticmethod
-    def batch(inputs_list: list["MultiModalKwargs"],
-              pin_memory: bool = False) -> BatchedTensorInputs:
+    def batch(
+        inputs_list: list["MultiModalKwargs"], pin_memory: bool = False
+    ) -> BatchedTensorInputs:
         """
         Batch multiple inputs together into a dictionary.
 
@@ -915,8 +968,10 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
 
     def __getitem__(self, key: str):
         if key not in self:
-            raise KeyError(f"Keyword argument {key!r} not found. "
-                           f"Available keys: {set(self.keys())}")
+            raise KeyError(
+                f"Keyword argument {key!r} not found. "
+                f"Available keys: {set(self.keys())}"
+            )
 
         return super().__getitem__(key)
 
@@ -949,9 +1004,6 @@ class MultiModalInputs(TypedDict):
     type: Literal["multimodal"]
     """The type of inputs."""
 
-    prompt: str
-    """The processed prompt text."""
-
     prompt_token_ids: list[int]
     """The processed token IDs which includes placeholder tokens."""
 
@@ -980,8 +1032,5 @@ class MultiModalEncDecInputs(MultiModalInputs):
     ready to be passed to vLLM internals.
     """
 
-    encoder_prompt: str
-    """The processed encoder prompt text."""
-
     encoder_prompt_token_ids: list[int]
     """The processed token IDs of the encoder prompt."""
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 493dd3560a5162ab02535f5a1c7c6494aa4c6efc..810f29072a0fe57bb313e8d5f06db5a07bdedd99 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -4,19 +4,37 @@
 from abc import ABC, abstractmethod
 from collections import UserDict
 from collections.abc import Callable, Iterator, Mapping, Sequence
-from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional,
-                    TypeVar, Union)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generic,
+    Literal,
+    NamedTuple,
+    TypeAlias,
+    TypeGuard,
+    TypeVar,
+)
 
 import numpy as np
 import torch
-from typing_extensions import TypeAlias, TypeGuard, assert_never
+from typing_extensions import assert_never
 
-from vllm.utils import LazyLoader, is_list_of
+from vllm.utils.collection_utils import is_list_of
+from vllm.utils.import_utils import LazyLoader
 
 from .audio import AudioResampler
-from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
-                     ImageItem, ModalityData, MultiModalDataDict,
-                     MultiModalFieldConfig, MultiModalKwargsItems, VideoItem)
+from .inputs import (
+    AudioItem,
+    HfAudioItem,
+    HfImageItem,
+    HfVideoItem,
+    ImageItem,
+    ModalityData,
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    VideoItem,
+)
 
 _T = TypeVar("_T")
 _I = TypeVar("_I")
@@ -40,8 +58,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
         self.modality = modality
 
     def __repr__(self) -> str:
-        return (f"{type(self).__name__}(modality={self.modality!r}, "
-                f"len={len(self)})")
+        return f"{type(self).__name__}(modality={self.modality!r}, len={len(self)})"
 
     def __len__(self) -> int:
         return self.get_count()
@@ -51,8 +68,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
 
     if TYPE_CHECKING:
         # Auto-generated
-        def __iter__(self) -> Iterator[_I]:
-            ...
+        def __iter__(self) -> Iterator[_I]: ...
 
     @abstractmethod
     def get_count(self) -> int:
@@ -95,8 +111,9 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
         return {}
 
 
-class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
-                                       torch.Tensor]):
+class EmbeddingItems(
+    ModalityDataItems[torch.Tensor | list[torch.Tensor], torch.Tensor]
+):
     """
     Base class for data items that are expressed as a batched embedding tensor,
     or a list of embedding tensors (one per item).
@@ -118,8 +135,9 @@ class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
         return len(self.get(item_idx))
 
 
-class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
-                                           Mapping[str, torch.Tensor]]):
+class DictEmbeddingItems(
+    ModalityDataItems[Mapping[str, torch.Tensor], Mapping[str, torch.Tensor]]
+):
     """
     Base class for data items that are expressed as a dictionary of tensors.
 
@@ -143,8 +161,10 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
         missing_required_data_keys = required_fields - data.keys()
         if missing_required_data_keys:
             data_keys = set(data.keys())
-            msg = (f"The data should contain the fields: {required_fields}, "
-                   f"but only found the following keys: {data_keys}")
+            msg = (
+                f"The data should contain the fields: {required_fields}, "
+                f"but only found the following keys: {data_keys}"
+            )
             raise ValueError(msg)
 
         fields_config = fields_factory(data)
@@ -176,8 +196,7 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
 
 
 class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
-
-    def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None:
+    def __init__(self, data: Sequence[HfAudioItem] | None) -> None:
         if data is None:
             data = [None]
         super().__init__(data, "audio")
@@ -188,8 +207,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
 
 
 class AudioEmbeddingItems(EmbeddingItems):
-
-    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
+    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
         super().__init__(data, "audio")
 
 
@@ -199,8 +217,7 @@ class ImageSize(NamedTuple):
 
 
 class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
-
-    def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None:
+    def __init__(self, data: Sequence[HfImageItem] | None) -> None:
         if data is None:
             data = [None]
         super().__init__(data, "image")
@@ -218,18 +235,15 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
 
 
 class ImageEmbeddingItems(EmbeddingItems):
-
-    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
+    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
         super().__init__(data, "image")
 
 
 class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
-
     def __init__(
         self,
-        data: Optional[Sequence[HfVideoItem]],
-        metadata: Optional[Union[dict[str, Any],
-                                 list[Optional[dict[str, Any]]]]] = None,
+        data: Sequence[HfVideoItem] | None,
+        metadata: dict[str, Any] | list[dict[str, Any] | None] | None = None,
     ) -> None:
         if data is None:
             data = [None]
@@ -252,8 +266,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
 
 
 class VideoEmbeddingItems(EmbeddingItems):
-
-    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
+    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
         super().__init__(data, "video")
 
 
@@ -276,8 +289,10 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
         if modality not in self:
             if strict:
                 available_modalities = set(self.keys())
-                raise KeyError(f"Modality {modality!r} not found. "
-                               f"Available modalities: {available_modalities}")
+                raise KeyError(
+                    f"Modality {modality!r} not found. "
+                    f"Available modalities: {available_modalities}"
+                )
 
             return 0
 
@@ -290,7 +305,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     def get_items(
         self,
         modality: str,
-        typ: Union[type[_D], tuple[type[_D], ...]],
+        typ: type[_D] | tuple[type[_D], ...],
     ) -> _D:
         """
         Get the data items belonging to a modality,
@@ -298,20 +313,25 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
         """
         if modality not in self:
             available_modalities = set(self.keys())
-            raise KeyError(f"Modality {modality!r} not found. "
-                           f"Available modalities: {available_modalities}")
+            raise KeyError(
+                f"Modality {modality!r} not found. "
+                f"Available modalities: {available_modalities}"
+            )
 
         items = self[modality]
         if not isinstance(items, typ):
-            raise TypeError(f"Invalid type of data items for {modality=}. "
-                            f"Expected type: {typ}, but "
-                            f"found type: {type(items)}")
+            raise TypeError(
+                f"Invalid type of data items for {modality=}. "
+                f"Expected type: {typ}, but "
+                f"found type: {type(items)}"
+            )
 
         return items  # type: ignore[return-value]
 
 
-ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
-                                         Optional[ModalityDataItems[Any, Any]]]
+ModalityDataParser: TypeAlias = Callable[
+    [ModalityData[Any]], ModalityDataItems[Any, Any] | None
+]
 
 
 class MultiModalDataParser:
@@ -327,7 +347,7 @@ class MultiModalDataParser:
     def __init__(
         self,
         *,
-        target_sr: Optional[float] = None,
+        target_sr: float | None = None,
         audio_resample_method: Literal["librosa", "scipy"] = "librosa",
         video_needs_metadata: bool = False,
     ) -> None:
@@ -339,13 +359,14 @@ class MultiModalDataParser:
         )
         self.video_needs_metadata = video_needs_metadata
 
-    def _is_embeddings(
-            self, data: object
-    ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
+    @classmethod
+    def is_embeddings(
+        cls, data: object
+    ) -> TypeGuard[torch.Tensor | list[torch.Tensor]]:
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
-            return data[0].ndim == 2
+            return data[0].ndim == 2  # type: ignore[index]
 
         return False
 
@@ -360,7 +381,7 @@ class MultiModalDataParser:
     def _get_audio_with_sr(
         self,
         audio: AudioItem,
-    ) -> tuple[np.ndarray, Optional[float]]:
+    ) -> tuple[np.ndarray, float | None]:
         if isinstance(audio, tuple):
             return audio
         if isinstance(audio, list):
@@ -375,7 +396,7 @@ class MultiModalDataParser:
     def _get_video_with_metadata(
         self,
         video: VideoItem,
-    ) -> tuple[np.ndarray, Optional[dict[str, Any]]]:
+    ) -> tuple[np.ndarray, dict[str, Any] | None]:
         if isinstance(video, tuple):
             return video
         if isinstance(video, list):
@@ -390,27 +411,31 @@ class MultiModalDataParser:
     def _parse_audio_data(
         self,
         data: ModalityData[AudioItem],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
             return AudioProcessorItems(None)
 
         # also check single audio item with sampling rate
-        if self._is_empty(data) or (isinstance(data, tuple)
-                                    and self._is_empty(data[0])):
+        if self._is_empty(data) or (
+            isinstance(data, tuple) and self._is_empty(data[0])
+        ):
             return None
 
-        if self._is_embeddings(data):
+        if self.is_embeddings(data):
             return AudioEmbeddingItems(data)
 
-        if (is_list_of(data, float)
-                or isinstance(data,
-                              (np.ndarray, torch.Tensor)) and data.ndim == 1
-                or isinstance(data, tuple)):
+        data_items: list[AudioItem]
+        if (
+            is_list_of(data, float)
+            or isinstance(data, (np.ndarray, torch.Tensor))
+            and data.ndim == 1
+            or isinstance(data, tuple)
+        ):
             data_items = [data]
         elif isinstance(data, (np.ndarray, torch.Tensor)):
             data_items = [elem for elem in data]
         else:
-            data_items = data
+            data_items = data  # type: ignore[assignment]
 
         new_audios = list[np.ndarray]()
         for data_item in data_items:
@@ -418,8 +443,7 @@ class MultiModalDataParser:
             if orig_sr is None:
                 new_audio = audio
             else:
-                new_audio = self.audio_resampler.resample(audio,
-                                                          orig_sr=orig_sr)
+                new_audio = self.audio_resampler.resample(audio, orig_sr=orig_sr)
 
             new_audios.append(new_audio)
 
@@ -428,19 +452,21 @@ class MultiModalDataParser:
     def _parse_image_data(
         self,
         data: ModalityData[ImageItem],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
             return ImageProcessorItems(None)
 
         if self._is_empty(data):
             return None
 
-        if self._is_embeddings(data):
+        if self.is_embeddings(data):
             return ImageEmbeddingItems(data)
 
-        if (isinstance(data, PILImage.Image)
-                or isinstance(data,
-                              (np.ndarray, torch.Tensor)) and data.ndim == 3):
+        if (
+            isinstance(data, PILImage.Image)
+            or isinstance(data, (np.ndarray, torch.Tensor))
+            and data.ndim == 3
+        ):
             data_items = [data]
         elif isinstance(data, (np.ndarray, torch.Tensor)):
             data_items = [elem for elem in data]
@@ -452,32 +478,40 @@ class MultiModalDataParser:
     def _parse_video_data(
         self,
         data: ModalityData[VideoItem],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
             return VideoProcessorItems(None)
 
         if self._is_empty(data):
             return None
 
-        if self._is_embeddings(data):
+        if self.is_embeddings(data):
             return VideoEmbeddingItems(data)
 
-        if (is_list_of(data, PILImage.Image)
-                or isinstance(data,
-                              (np.ndarray, torch.Tensor)) and data.ndim == 4):
+        data_items: list[VideoItem]
+        if (
+            is_list_of(data, PILImage.Image)
+            or isinstance(data, (np.ndarray, torch.Tensor))
+            and data.ndim == 4
+        ):
             data_items = [data]
         elif isinstance(data, (np.ndarray, torch.Tensor)):
             data_items = [elem for elem in data]
         elif isinstance(data, tuple) and len(data) == 2:
             data_items = [data]
         else:
-            data_items = data
+            data_items = data  # type: ignore[assignment]
 
-        new_videos = list[tuple[np.ndarray, Optional[dict[str, Any]]]]()
-        metadata_lst: list[Optional[dict[str, Any]]] = []
+        new_videos = list[tuple[np.ndarray, dict[str, Any] | None]]()
+        metadata_lst: list[dict[str, Any] | None] = []
         for data_item in data_items:
             video, metadata = self._get_video_with_metadata(data_item)
             if self.video_needs_metadata:
+                if metadata is None:
+                    raise ValueError(
+                        "Video metadata is required but not found in mm input. "
+                        "Please check your video input in `multi_modal_data`"
+                    )
                 new_videos.append((video, metadata))
                 metadata_lst.append(metadata)
             else:
@@ -495,8 +529,7 @@ class MultiModalDataParser:
             "video": self._parse_video_data,
         }
 
-    def parse_mm_data(self,
-                      mm_data: MultiModalDataDict) -> MultiModalDataItems:
+    def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems:
         subparsers = self._get_subparsers()
 
         mm_items = MultiModalDataItems()
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index ce671479b1ae7429c13d6d5567df572d20248f44..85a03efd5bb9b48551b3b8f8d7ed6d2d8f6125db 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -3,13 +3,20 @@
 import time
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping,
-                             Sequence)
+from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field, replace
 from enum import Enum
 from functools import lru_cache
-from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional,
-                    Protocol, Union, cast, overload)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generic,
+    NamedTuple,
+    Protocol,
+    TypeAlias,
+    cast,
+    overload,
+)
 
 import regex as re
 import torch
@@ -17,20 +24,29 @@ from typing_extensions import TypeVar, assert_never
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
-                                               encode_tokens)
-from vllm.utils import (flatten_2d_lists, full_groupby,
-                        get_allowed_kwarg_only_overrides)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
+from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 from vllm.utils.jsontree import JSONTree, json_map_leaves
 
 from .hasher import MultiModalHasher
-from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
-                     MultiModalFieldConfig, MultiModalInputs,
-                     MultiModalKwargsItem, MultiModalKwargsItems,
-                     MultiModalKwargsOptionalItems, MultiModalUUIDDict,
-                     PlaceholderRange)
-from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems,
-                    MultiModalDataParser)
+from .inputs import (
+    MultiModalDataDict,
+    MultiModalEncDecInputs,
+    MultiModalFieldConfig,
+    MultiModalInputs,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalKwargsOptionalItems,
+    MultiModalUUIDDict,
+    PlaceholderRange,
+)
+from .parse import (
+    DictEmbeddingItems,
+    EmbeddingItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
 
 if TYPE_CHECKING:
     from transformers.configuration_utils import PretrainedConfig
@@ -41,12 +57,20 @@ if TYPE_CHECKING:
 
     from .cache import BaseMultiModalProcessorCache
     from .profiling import BaseDummyInputsBuilder
+else:
+    PretrainedConfig = object
+    BatchFeature = object
+    ProcessorMixin = object
+
+    ModelConfig = object
+
+    BaseMultiModalProcessorCache = object
 
 logger = init_logger(__name__)
 
 _S = TypeVar("_S", str, list[int])
 
-PromptSeq = Union[str, list[int]]
+PromptSeq: TypeAlias = str | list[int]
 """A token sequence (list of token IDs) or text."""
 
 
@@ -55,11 +79,9 @@ def _cached_encode(
     tokenizer: AnyTokenizer,
     text: str,
     *,
-    add_special_tokens: Optional[bool] = None,
+    add_special_tokens: bool | None = None,
 ) -> list[int]:
-    return encode_tokens(tokenizer,
-                         text,
-                         add_special_tokens=add_special_tokens)
+    return encode_tokens(tokenizer, text, add_special_tokens=add_special_tokens)
 
 
 @lru_cache(maxsize=2048)
@@ -67,11 +89,11 @@ def _cached_decode(
     tokenizer: AnyTokenizer,
     token_ids: tuple[int, ...],
     *,
-    skip_special_tokens: Optional[bool] = None,
+    skip_special_tokens: bool | None = None,
 ) -> str:
-    return decode_tokens(tokenizer,
-                         list(token_ids),
-                         skip_special_tokens=skip_special_tokens)
+    return decode_tokens(
+        tokenizer, list(token_ids), skip_special_tokens=skip_special_tokens
+    )
 
 
 def _seq2text(tokenizer: AnyTokenizer, seq: PromptSeq) -> str:
@@ -89,24 +111,22 @@ def _seq2tokens(tokenizer: AnyTokenizer, seq: PromptSeq) -> list[int]:
 
 
 class _GetMatchIndex(Protocol):
-
     def __call__(
         self,
         tokenizer: AnyTokenizer,
         prompt: PromptSeq,
         start_idx: int = 0,
-    ) -> Optional[int]:
-        ...
+    ) -> int | None: ...
 
 
 @dataclass
 class PromptIndex:
     """Resolves to an index in the prompt."""
+
     get_match_index: _GetMatchIndex
 
 
 class PromptIndexTargets:
-
     @staticmethod
     def start() -> PromptIndex:
         """
@@ -126,7 +146,7 @@ class PromptIndexTargets:
             tokenizer: AnyTokenizer,
             prompt: PromptSeq,
             start_idx: int = 0,
-        ) -> Optional[int]:
+        ) -> int | None:
             if start_idx != 0:
                 return None
 
@@ -139,9 +159,7 @@ class PromptIndexTargets:
             else:
                 if isinstance(prefix, str):
                     # Make both `list[int]`
-                    prefix = encode_tokens(tokenizer,
-                                           prefix,
-                                           add_special_tokens=False)
+                    prefix = encode_tokens(tokenizer, prefix, add_special_tokens=False)
 
             match_idx = len(prefix)
             return match_idx if prompt[:match_idx] == prefix else None
@@ -158,12 +176,12 @@ class PromptIndexTargets:
         return PromptIndex(lambda tokenizer, prompt, start_idx=0: len(prompt))
 
 
-UpdateTarget = Union[PromptSeq, PromptIndex]
+UpdateTarget: TypeAlias = PromptSeq | PromptIndex
 """
 The token sequence or text to update.
 """
 
-PromptUpdateTarget = Union[Callable[[int], UpdateTarget], UpdateTarget]
+PromptUpdateTarget: TypeAlias = Callable[[int], UpdateTarget] | UpdateTarget
 """
 Given the index of the processed item within
 [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
@@ -181,8 +199,7 @@ class PromptUpdateDetails(Generic[_S]):
     full: _S
     """The full content."""
 
-    is_embed: Optional[Callable[[AnyTokenizer, PromptSeq],
-                                torch.Tensor]] = None
+    is_embed: Callable[[AnyTokenizer, PromptSeq], torch.Tensor] | None = None
     """
     Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full],
     return a boolean mask of shape `(len(full),)` indicating which positions
@@ -191,7 +208,7 @@ class PromptUpdateDetails(Generic[_S]):
     `None` (default) means to assign embeddings to all positions of `full`.
 
     The embeddings are obtained by calling
-    [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings].
+    [`SupportsMultiModal.embed_multimodal`][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal].
     """
 
     @staticmethod
@@ -203,7 +220,6 @@ class PromptUpdateDetails(Generic[_S]):
         seq: _S,
         embed_text: str,
     ) -> "PromptUpdateDetails[_S]":
-
         def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor:
             embed_token_ids = encode_tokens(tokenizer, embed_text)
             token_ids = _seq2tokens(tokenizer, full)
@@ -220,7 +236,6 @@ class PromptUpdateDetails(Generic[_S]):
         seq: _S,
         embed_token_id: int,
     ) -> "PromptUpdateDetails[_S]":
-
         def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor:
             token_ids = _seq2tokens(tokenizer, full)
 
@@ -229,7 +244,7 @@ class PromptUpdateDetails(Generic[_S]):
         return PromptUpdateDetails(full=seq, is_embed=is_embed)
 
 
-PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
+PromptUpdateInfo: TypeAlias = PromptSeq | PromptUpdateDetails
 """
 The token sequence or text that are part of the update.
 
@@ -238,8 +253,7 @@ use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to
 specify which part.
 """
 
-PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
-                            PromptUpdateInfo]
+PromptUpdateContent: TypeAlias = Callable[[int], PromptUpdateInfo] | PromptUpdateInfo
 """
 Given the index of the processed item within
 [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
@@ -318,8 +332,8 @@ class PromptInsertion(PromptUpdate):
 
     Example:
 
-    For each image, insert a number of ``<image>`` feature placeholders
-    equal to the feature size of the vision encoder after the ``<s>`` token:
+    For each image, insert a number of `<image>` feature placeholders
+    equal to the feature size of the vision encoder after the `<s>` token:
 
     ```python
     PromptInsertion(
@@ -339,7 +353,7 @@ class PromptInsertion(PromptUpdate):
     )
     ```
 
-    Insert these tokens after a prefix ``Images:``:
+    Insert these tokens after a prefix `Images:`:
 
     ```python
     PromptInsertion(
@@ -387,8 +401,8 @@ class PromptReplacement(PromptUpdate):
 
     Example:
 
-    For each image, replace one ``<image>`` input placeholder in the prompt
-    with a number of ``<image>`` feature placeholders
+    For each image, replace one `<image>` input placeholder in the prompt
+    with a number of `<image>` feature placeholders
     equal to the feature size of the vision encoder:
 
     ```python
@@ -399,8 +413,8 @@ class PromptReplacement(PromptUpdate):
     )
     ```
 
-    As above, but further pad the feature placeholders with ``<image_bos>``
-    and `<image_eos>``, which are not supposed to be passed to the vision
+    As above, but further pad the feature placeholders with `<image_bos>`
+    and `<image_eos>`, which are not supposed to be passed to the vision
     encoder:
 
     ```python
@@ -408,11 +422,13 @@ class PromptReplacement(PromptUpdate):
         modality="image",
         target="<image>",
         replacement=PromptUpdateDetails(
-            full="".join([
-                "<image_bos>",
-                "<image>" * image_feature_size,
-                "<image_eos>",
-            ]),
+            full="".join(
+                [
+                    "<image_bos>",
+                    "<image>" * image_feature_size,
+                    "<image_eos>",
+                ]
+            ),
             features="<image>" * image_feature_size,
         ),
     )
@@ -426,8 +442,9 @@ class PromptReplacement(PromptUpdate):
         modality="image",
         target=[image_token_id],
         replacement=PromptUpdateDetails(
-            full=([image_bos_id] + [image_token_id] * image_feature_size
-                    + [image_eos_id]),
+            full=(
+                [image_bos_id] + [image_token_id] * image_feature_size + [image_eos_id]
+            ),
             features=[image_token_id] * image_feature_size,
         ),
     )
@@ -459,18 +476,19 @@ class _HasModalityAttr(Protocol):
 
 
 class _HasModalityProp(Protocol):
-
     @property
-    def modality(self) -> str:
-        ...
+    def modality(self) -> str: ...
 
 
-_M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
+_M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
 
 
 def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
-    """Convenience function to apply [`full_groupby`][vllm.utils.full_groupby]
-    based on modality."""
+    """
+    Convenience function to apply
+    [`full_groupby`][vllm.utils.collection_utils.full_groupby]
+    based on modality.
+    """
     return full_groupby(values, key=lambda x: x.modality)
 
 
@@ -520,9 +538,7 @@ class ResolvedPromptUpdate:
 
         target_token_ids = _seq2tokens(tokenizer, target)
 
-        for match in iter_token_matches(prompt,
-                                        target_token_ids,
-                                        start_idx=start_idx):
+        for match in iter_token_matches(prompt, target_token_ids, start_idx=start_idx):
             yield PromptTargetMatch(match.start_idx, match.end_idx)
 
     def iter_text_matches(
@@ -544,22 +560,19 @@ class ResolvedPromptUpdate:
 
         target_text = _seq2text(tokenizer, target)
 
-        for match in re.finditer(re.escape(target_text), prompt,
-                                 pos=start_idx):
+        for match in re.finditer(re.escape(target_text), prompt, pos=start_idx):
             yield PromptTargetMatch(match.start(), match.end())
 
     def iter_matches(
         self,
-        prompt: Union[list[int], str],
+        prompt: list[int] | str,
         tokenizer: AnyTokenizer,
         *,
         start_idx: int = 0,
     ) -> Generator[PromptTargetMatch]:
         """Yield each instance of `self.target` found in `prompt`."""
         if isinstance(prompt, str):
-            return self.iter_text_matches(prompt,
-                                          tokenizer,
-                                          start_idx=start_idx)
+            return self.iter_text_matches(prompt, tokenizer, start_idx=start_idx)
 
         return self.iter_token_matches(prompt, tokenizer, start_idx=start_idx)
 
@@ -640,7 +653,7 @@ class PlaceholderFeaturesInfo:
     item_idx: int
     start_idx: int
     tokens: list[int]
-    is_embed: Optional[torch.Tensor]
+    is_embed: torch.Tensor | None
 
     @property
     def length(self) -> int:
@@ -666,8 +679,8 @@ def _find_matches(
     *,
     prev_end_idx: int = 0,
     current_result: "MultiModalPromptUpdatesApplyResult",
-) -> tuple[Optional[UpdateMode], list[_MatchToApply]]:
-    mode: Optional[UpdateMode] = None
+) -> tuple[UpdateMode | None, list[_MatchToApply]]:
+    mode: UpdateMode | None = None
     mm_matches = dict[tuple[str, int], tuple[PromptTargetMatch, int]]()
 
     for modality, modality_updates in mm_prompt_updates.items():
@@ -680,9 +693,9 @@ def _find_matches(
                     break  # Already found a match for this item
 
                 for match in update.iter_matches(
-                        prompt,
-                        tokenizer,
-                        start_idx=prev_end_idx,
+                    prompt,
+                    tokenizer,
+                    start_idx=prev_end_idx,
                 ):
                     # All matches should share the same mode
                     if mode is None:
@@ -721,10 +734,9 @@ def _apply_matches(
 ) -> tuple[list[_S], "MultiModalPromptUpdatesApplyResult"]:
     prompt_len = len(prompt)
 
-    out_seqs = list[Union[str, list[int]]]()
+    out_seqs = list[str | list[int]]()
     out_result: MultiModalPromptUpdatesApplyResult = {
-        m: [None] * len(items)
-        for m, items in mm_prompt_updates.items()
+        m: [None] * len(items) for m, items in mm_prompt_updates.items()
     }
 
     start_idx = prev_end_idx = 0
@@ -743,8 +755,7 @@ def _apply_matches(
             for (modality, item_idx), (match, update_idx) in matches_to_apply:
                 found = True
 
-                matched_update = mm_prompt_updates[modality][item_idx][
-                    update_idx]
+                matched_update = mm_prompt_updates[modality][item_idx][update_idx]
                 matched_content = matched_update.content.full
 
                 if mode == UpdateMode.INSERT:
@@ -756,9 +767,10 @@ def _apply_matches(
 
                 out_seqs.append(prompt[prev_end_idx:end_idx_to_insert])
                 out_seqs.append(
-                    _seq2text(tokenizer, matched_content
-                              ) if isinstance(prompt, str) else _seq2tokens(
-                                  tokenizer, matched_content))
+                    _seq2text(tokenizer, matched_content)
+                    if isinstance(prompt, str)
+                    else _seq2tokens(tokenizer, matched_content)
+                )
                 out_result[modality][item_idx] = update_idx
 
                 # Exclude overlapping matches
@@ -784,8 +796,7 @@ def apply_token_matches(
     the same placeholder tokens. In that case, the modality that
     appears earlier in `mm_prompt_updates` takes priority.
     """
-    token_id_seqs, result = _apply_matches(prompt, mm_prompt_updates,
-                                           tokenizer)
+    token_id_seqs, result = _apply_matches(prompt, mm_prompt_updates, tokenizer)
 
     return flatten_2d_lists(token_id_seqs), result
 
@@ -847,8 +858,7 @@ def _iter_placeholders(
                 if prompt[start_idx:end_idx_full] == content_tokens_full:
                     content_is_embed = content.is_embed
                     if content_is_embed is not None:
-                        content_is_embed = content_is_embed(
-                            tokenizer, content.full)
+                        content_is_embed = content_is_embed(tokenizer, content.full)
 
                     yield PlaceholderFeaturesInfo(
                         modality=modality,
@@ -881,8 +891,8 @@ def find_mm_placeholders(
 
 
 _T = TypeVar("_T")
-_C = TypeVar("_C", bound="PretrainedConfig", default="PretrainedConfig")
-_P = TypeVar("_P", bound="ProcessorMixin", default="ProcessorMixin")
+_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
+_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
 
 
 @dataclass(frozen=True)
@@ -892,27 +902,25 @@ class InputProcessingContext:
     modify the inputs.
     """
 
-    model_config: "ModelConfig"
+    model_config: ModelConfig
     """The configuration of the model."""
 
     tokenizer: AnyTokenizer
     """The tokenizer used to tokenize the inputs."""
 
     @overload
-    def get_hf_config(self, /) -> "PretrainedConfig":
-        ...
+    def get_hf_config(self, /) -> PretrainedConfig: ...
 
     @overload
     def get_hf_config(
         self,
-        typ: Union[type[_C], tuple[type[_C], ...]],
+        typ: type[_C] | tuple[type[_C], ...],
         /,
-    ) -> _C:
-        ...
+    ) -> _C: ...
 
     def get_hf_config(
         self,
-        typ: Optional[Union[type[Any], tuple[type[Any], ...]]] = None,
+        typ: type[Any] | tuple[type[Any], ...] | None = None,
         /,
     ) -> Any:
         """
@@ -930,9 +938,11 @@ class InputProcessingContext:
 
         hf_config = self.model_config.hf_config
         if not isinstance(hf_config, typ):
-            raise TypeError("Invalid type of HuggingFace config. "
-                            f"Expected type: {typ}, but "
-                            f"found type: {type(hf_config)}")
+            raise TypeError(
+                "Invalid type of HuggingFace config. "
+                f"Expected type: {typ}, but "
+                f"found type: {type(hf_config)}"
+            )
 
         return hf_config
 
@@ -956,21 +966,19 @@ class InputProcessingContext:
         return mm_config
 
     @overload
-    def get_hf_processor(self, /, **kwargs: object) -> "ProcessorMixin":
-        ...
+    def get_hf_processor(self, /, **kwargs: object) -> ProcessorMixin: ...
 
     @overload
     def get_hf_processor(
         self,
-        typ: Union[type[_P], tuple[type[_P], ...]],
+        typ: type[_P] | tuple[type[_P], ...],
         /,
         **kwargs: object,
-    ) -> _P:
-        ...
+    ) -> _P: ...
 
     def get_hf_processor(
         self,
-        typ: Optional[Union[type[Any], tuple[type[Any], ...]]] = None,
+        typ: type[Any] | tuple[type[Any], ...] | None = None,
         /,
         **kwargs: object,
     ) -> Any:
@@ -1017,7 +1025,6 @@ class InputProcessingContext:
         self,
         output: JSONTree,
     ) -> JSONTree:
-
         def _postprocess_one(x: object):
             if isinstance(x, torch.Tensor):  # noqa: SIM102
                 # This mimics the behavior of transformers.BatchFeature
@@ -1030,13 +1037,13 @@ class InputProcessingContext:
 
     def call_hf_processor(
         self,
-        hf_processor: "ProcessorMixin",
+        hf_processor: ProcessorMixin,
         data: Mapping[str, object],
         kwargs: Mapping[str, object] = {},
         *,
         num_tries: int = 1,
         max_tries: int = 5,
-    ) -> Union["BatchFeature", JSONTree]:
+    ) -> BatchFeature | JSONTree:
         """
         Call `hf_processor` on the prompt `data`
         (text, image, audio...) with configurable options `kwargs`.
@@ -1054,17 +1061,21 @@ class InputProcessingContext:
         )
 
         try:
-            output = hf_processor(**data,
-                                  **allowed_kwargs,
-                                  return_tensors="pt")
+            output = hf_processor(**data, **allowed_kwargs, return_tensors="pt")
         except Exception as exc:
             # See https://github.com/huggingface/tokenizers/issues/537
-            if (isinstance(exc, RuntimeError) and exc
-                    and exc.args[0] == "Already borrowed"
-                    and num_tries < max_tries):
+            if (
+                isinstance(exc, RuntimeError)
+                and exc
+                and exc.args[0] == "Already borrowed"
+                and num_tries < max_tries
+            ):
                 logger.warning(
                     "Failed to acquire tokenizer in current thread. "
-                    "Retrying (%d/%d)...", num_tries, max_tries)
+                    "Retrying (%d/%d)...",
+                    num_tries,
+                    max_tries,
+                )
                 time.sleep(0.5)
                 return self.call_hf_processor(
                     hf_processor,
@@ -1074,8 +1085,10 @@ class InputProcessingContext:
                     max_tries=max_tries,
                 )
 
-            msg = (f"Failed to apply {type(hf_processor).__name__} "
-                   f"on data={data} with kwargs={allowed_kwargs}")
+            msg = (
+                f"Failed to apply {type(hf_processor).__name__} "
+                f"on data={data} with kwargs={allowed_kwargs}"
+            )
 
             raise ValueError(msg) from exc
 
@@ -1111,10 +1124,10 @@ class BaseProcessingInfo:
     def get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def get_hf_config(self) -> "PretrainedConfig":
+    def get_hf_config(self) -> PretrainedConfig:
         return self.ctx.get_hf_config()
 
-    def get_hf_processor(self, **kwargs: object) -> "ProcessorMixin":
+    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
         """
         Subclasses can override this method to handle
         specific kwargs from model config or user inputs.
@@ -1122,7 +1135,7 @@ class BaseProcessingInfo:
         return self.ctx.get_hf_processor(**kwargs)
 
     @abstractmethod
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         """
         Return the maximum supported number of items for each modality.
 
@@ -1142,8 +1155,11 @@ class BaseProcessingInfo:
         for modality, supported_limit in supported_mm_limits.items():
             user_limit = mm_config.get_limit_per_prompt(modality)
 
-            allowed_limits[modality] = (user_limit if supported_limit is None
-                                        else min(user_limit, supported_limit))
+            allowed_limits[modality] = (
+                user_limit
+                if supported_limit is None
+                else min(user_limit, supported_limit)
+            )
 
         return allowed_limits
 
@@ -1151,10 +1167,10 @@ class BaseProcessingInfo:
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> Optional[Mapping[str, int]]:
+    ) -> Mapping[str, int] | None:
         """
         Return the maximum number of tokens per item of for each modality.
-        
+
         When `None` (the default) is returned, vLLM will generate dummy inputs
         (images/videos) at maximum possible sizes and process them to determine
         the maximum token count per modality.
@@ -1165,7 +1181,7 @@ class BaseProcessingInfo:
         counts, avoiding the need for dummy input generation and processing.
 
         Note:
-            The maximum number of tokens per item of each modality returned 
+            The maximum number of tokens per item of each modality returned
             from this function should respect the model's maximum sequence
             length and the maximum number of items of each modality allowed,
             and agree with dummy inputs (images/videos) at maximum possible
@@ -1188,7 +1204,7 @@ A collection of prompt updates with a similar structure as
 [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
 """
 
-MultiModalPromptUpdatesApplyResult = Mapping[str, list[Optional[int]]]
+MultiModalPromptUpdatesApplyResult = Mapping[str, list[int | None]]
 """
 For an item `MultiModalPromptUpdates[k][i]`,
 `MultiModalPromptUpdatesApplyResult[k][i]` represents the index of the
@@ -1215,7 +1231,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         info: _I,
         dummy_inputs: "BaseDummyInputsBuilder[_I]",
         *,
-        cache: Optional["BaseMultiModalProcessorCache"] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
@@ -1243,12 +1259,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
-        return self.apply(prompt,
-                          mm_data,
-                          hf_processor_mm_kwargs,
-                          mm_uuids=mm_uuids)
+        return self.apply(prompt, mm_data, hf_processor_mm_kwargs, mm_uuids=mm_uuids)
 
     def _get_data_parser(self) -> MultiModalDataParser:
         """
@@ -1276,8 +1289,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         limit = min(supported_limit, allowed_limit)
 
         if num_items > limit:
-            msg = (f"At most {limit} {modality}(s) may be provided in "
-                   "one prompt.")
+            msg = f"At most {limit} {modality}(s) may be provided in one prompt."
 
             if num_items <= supported_limit:
                 msg += " Set `--limit-mm-per-prompt` to increase this limit."
@@ -1296,6 +1308,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
         """
         mm_items = self.data_parser.parse_mm_data(mm_data)
+
+        mm_config = self.info.ctx.model_config.get_multimodal_config()
+        if not mm_config.enable_mm_embeds:
+            for modality, items in mm_items.items():
+                if isinstance(items, (EmbeddingItems, DictEmbeddingItems)):
+                    raise ValueError(
+                        f"You must set `--enable-mm-embeds` to input "
+                        f"`{modality}_embeds`"
+                    )
+
         for modality, items in mm_items.items():
             self.validate_num_items(modality, len(items))
 
@@ -1304,7 +1326,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
     @abstractmethod
     def _get_mm_fields_config(
         self,
-        hf_inputs: "BatchFeature",
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         """Given the HF-processed data, output the metadata of each field."""
@@ -1339,8 +1361,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_item_counts: Mapping[str, int],
     ) -> MultiModalPromptUpdates:
         return {
-            modality: [[update.resolve(item_idx) for update in updates]
-                       for item_idx in range(mm_item_counts.get(modality, 0))]
+            modality: [
+                [update.resolve(item_idx) for update in updates]
+                for item_idx in range(mm_item_counts.get(modality, 0))
+            ]
             for modality, updates in full_groupby_modality(prompt_updates)
         }
 
@@ -1385,8 +1409,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
     ) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
         tokenizer = self.info.get_tokenizer()
 
-        return find_mm_placeholders(new_token_ids, mm_prompt_updates,
-                                    tokenizer)
+        return find_mm_placeholders(new_token_ids, mm_prompt_updates, tokenizer)
 
     def _get_hf_mm_data(
         self,
@@ -1409,7 +1432,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
-    ) -> "BatchFeature":
+    ) -> BatchFeature:
         """
         Call the HF processor on the prompt text and
         associated multi-modal data.
@@ -1436,7 +1459,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         """
         return not any(
             isinstance(items, (EmbeddingItems, DictEmbeddingItems))
-            for items in mm_items.values())
+            for items in mm_items.values()
+        )
 
     def _apply_hf_processor_text_mm(
         self,
@@ -1444,7 +1468,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-    ) -> tuple[list[int], "BatchFeature", bool]:
+    ) -> tuple[list[int], BatchFeature, bool]:
         """
         Apply the HF processor on the prompt text and multi-modal data
         together.
@@ -1461,7 +1485,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         )
         processed_data.update(passthrough_data)
 
-        prompt_ids, = processed_data.pop("input_ids").tolist()
+        (prompt_ids,) = processed_data.pop("input_ids").tolist()
 
         is_update_applied = self._hf_processor_applies_updates(
             prompt_text=prompt_text,
@@ -1515,7 +1539,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-    ) -> "BatchFeature":
+    ) -> BatchFeature:
         """
         Apply the HF processor on the multi-modal data only.
 
@@ -1537,13 +1561,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _apply_hf_processor_main(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
         enable_hf_prompt_update: bool,
-    ) -> tuple[list[int], "BatchFeature", bool]:
+    ) -> tuple[list[int], BatchFeature, bool]:
         """
         Apply the HF processor on the prompt text and multi-modal data.
 
@@ -1564,8 +1588,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                     tokenization_kwargs=tokenization_kwargs,
                 )
 
-            prompt_ids = self._apply_hf_processor_text_only(
-                prompt, tokenization_kwargs)
+            prompt_ids = self._apply_hf_processor_text_only(prompt, tokenization_kwargs)
         else:
             prompt_ids = self._apply_hf_processor_tokens_only(prompt)
 
@@ -1583,7 +1606,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalHashes:
         """Create MM hashes to be returned.
 
@@ -1611,10 +1634,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                     # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs`
                     # are provided. This is because the processed multimodal
                     # inputs can be different depending on the processor kwargs.
-                    if item_uuid is None or \
-                        hf_processor_mm_kwargs or \
-                        tokenization_kwargs:
-
+                    if (
+                        item_uuid is None
+                        or hf_processor_mm_kwargs
+                        or tokenization_kwargs
+                    ):
                         # NOTE: use provided hash string to hash with kwargs
                         # if available for better performance.
                         item = item_uuid if item_uuid is not None else item
@@ -1623,16 +1647,20 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                                 model_id=model_id,
                                 **{modality: item},
                                 **hf_processor_mm_kwargs,
-                                **tokenization_kwargs))
+                                **tokenization_kwargs,
+                            )
+                        )
                     else:
                         computed.append(item_uuid)
                 hashes[modality] = computed
             else:
                 hashes[modality] = [
-                    MultiModalHasher.hash_kwargs(model_id=model_id,
-                                                 **{modality: item},
-                                                 **hf_processor_mm_kwargs,
-                                                 **tokenization_kwargs)
+                    MultiModalHasher.hash_kwargs(
+                        model_id=model_id,
+                        **{modality: item},
+                        **hf_processor_mm_kwargs,
+                        **tokenization_kwargs,
+                    )
                     for item in items
                 ]
 
@@ -1640,18 +1668,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _get_cache_missing_items(
         self,
-        cache: "BaseMultiModalProcessorCache",
+        cache: BaseMultiModalProcessorCache,
         mm_data_items: MultiModalDataItems,
         mm_hashes: MultiModalHashes,
     ) -> MultiModalDataItems:
         mm_is_cached = {
-            modality: cache.is_cached(hashes)
-            for modality, hashes in mm_hashes.items()
+            modality: cache.is_cached(hashes) for modality, hashes in mm_hashes.items()
         }
 
         mm_missing_idxs = {
             modality: [
-                idx for idx, item_is_cached in enumerate(items_is_cached)
+                idx
+                for idx, item_is_cached in enumerate(items_is_cached)
                 if not item_is_cached
             ]
             for modality, items_is_cached in mm_is_cached.items()
@@ -1664,7 +1692,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 if data is None:
                     raise ValueError(
                         f"Cache miss for {modality} at index {idx} "
-                        f"but data is not provided.")
+                        f"but data is not provided."
+                    )
                 else:
                     missing_modality_data.append(data)
             mm_missing_data[modality] = missing_modality_data
@@ -1684,7 +1713,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _merge_mm_kwargs(
         self,
-        cache: "BaseMultiModalProcessorCache",
+        cache: BaseMultiModalProcessorCache,
         mm_hashes: MultiModalHashes,
         mm_missing_kwargs: MultiModalKwargsItems,
         mm_missing_prompt_updates: MultiModalPromptUpdates,
@@ -1692,23 +1721,21 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         # Need to calculate this at the beginning to avoid skipping cache logic
         # for subsequently repeated items in the same modality
         mm_is_cached = {
-            modality: cache.is_cached(hashes)
-            for modality, hashes in mm_hashes.items()
+            modality: cache.is_cached(hashes) for modality, hashes in mm_hashes.items()
         }
 
         mm_missing_next_idx = defaultdict[str, int](lambda: 0)
 
-        merged_kwargs = defaultdict[str,
-                                    list[Optional[MultiModalKwargsItem]]](list)
-        merged_prompt_updates = defaultdict[
-            str, list[Sequence[ResolvedPromptUpdate]]](list)
+        merged_kwargs = defaultdict[str, list[MultiModalKwargsItem | None]](list)
+        merged_prompt_updates = defaultdict[str, list[Sequence[ResolvedPromptUpdate]]](
+            list
+        )
         for modality, hashes in mm_hashes.items():
             missing_kwargs = mm_missing_kwargs.get(modality, [])
-            missing_prompt_updates = mm_missing_prompt_updates.get(
-                modality, [])
+            missing_prompt_updates = mm_missing_prompt_updates.get(modality, [])
 
             for item_idx, item_hash in enumerate(hashes):
-                kwargs: Optional[MultiModalKwargsItem]
+                kwargs: MultiModalKwargsItem | None
                 if not mm_is_cached[modality][item_idx]:
                     missing_next_idx = mm_missing_next_idx[modality]
                     kwargs = missing_kwargs[missing_next_idx]
@@ -1723,10 +1750,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 kwargs, updates = cache.get_and_update_item(item, item_hash)
 
                 merged_kwargs[modality].append(kwargs)
-                merged_prompt_updates[modality].append([
-                    self._recompute_cached_prompt_update(update, item_idx)
-                    for update in updates
-                ])
+                merged_prompt_updates[modality].append(
+                    [
+                        self._recompute_cached_prompt_update(update, item_idx)
+                        for update in updates
+                    ]
+                )
 
         mm_kwargs = MultiModalKwargsItems(merged_kwargs)
         mm_prompt_updates = dict(merged_prompt_updates)
@@ -1735,12 +1764,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         (
             prompt_ids,
@@ -1756,15 +1785,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
         mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
             mm_processed_data,
-            self._get_mm_fields_config(mm_processed_data,
-                                       hf_processor_mm_kwargs),
+            self._get_mm_fields_config(mm_processed_data, hf_processor_mm_kwargs),
         )
 
         # Use overrides if provided; fallback to data-dependent hashing.
-        mm_hashes = self._hash_mm_items(mm_data_items,
-                                        hf_processor_mm_kwargs,
-                                        tokenization_kwargs,
-                                        mm_uuids=mm_uuids)
+        mm_hashes = self._hash_mm_items(
+            mm_data_items,
+            hf_processor_mm_kwargs,
+            tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
 
         mm_prompt_updates = self._get_mm_prompt_updates(
             mm_data_items,
@@ -1782,12 +1812,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         """
         Apply the HF processor on the full prompt text,
@@ -1805,10 +1835,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 mm_uuids=mm_uuids,
             )
 
-        mm_hashes = self._hash_mm_items(mm_data_items,
-                                        hf_processor_mm_kwargs,
-                                        tokenization_kwargs,
-                                        mm_uuids=mm_uuids)
+        mm_hashes = self._hash_mm_items(
+            mm_data_items,
+            hf_processor_mm_kwargs,
+            tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
 
         mm_missing_data_items = self._get_cache_missing_items(
             cache=cache,
@@ -1833,8 +1865,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
         mm_missing_kwargs = MultiModalKwargsItems.from_hf_inputs(
             mm_missing_processed_data,
-            self._get_mm_fields_config(mm_missing_processed_data,
-                                       hf_processor_mm_kwargs),
+            self._get_mm_fields_config(
+                mm_missing_processed_data, hf_processor_mm_kwargs
+            ),
         )
 
         mm_missing_prompt_updates = self._get_mm_prompt_updates(
@@ -1878,7 +1911,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         token_ids: list[int],
         mm_prompt_updates: MultiModalPromptUpdates,
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
+    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
         tokenizer = self.info.get_tokenizer()
 
         new_token_ids, match_result = self._apply_token_matches(
@@ -1896,11 +1929,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         # Since it is inefficient to search for all possible tokenizations
         # of the search text in the prompt, we instead perform string-based
         # updates on the decoded token IDs, then encode them back.
-        if all(
-                all(update_idx is not None for update_idx in update_idxs)
-                for update_idxs in match_result.values()):
-            new_text = decode_tokens(tokenizer, new_token_ids)
-        else:
+        if not all(
+            all(update_idx is not None for update_idx in update_idxs)
+            for update_idxs in match_result.values()
+        ):
             new_text, match_result = self._apply_text_matches(
                 decode_tokens(tokenizer, token_ids),
                 mm_prompt_updates,
@@ -1912,23 +1944,24 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 add_special_tokens=False,
             )
 
-        matched_updates = defaultdict[
-            str, list[Sequence[ResolvedPromptUpdate]]](list)
+        matched_updates = defaultdict[str, list[Sequence[ResolvedPromptUpdate]]](list)
         for modality, update_idxs in match_result.items():
             for item_idx, update_idx in enumerate(update_idxs):
                 assert update_idx is not None, (
                     "Failed to apply prompt replacement for "
-                    f"mm_items[{modality!r}][{item_idx}]")
+                    f"mm_items[{modality!r}][{item_idx}]"
+                )
 
                 matched_updates[modality].append(
-                    [mm_prompt_updates[modality][item_idx][update_idx]])
+                    [mm_prompt_updates[modality][item_idx][update_idx]]
+                )
 
         placeholders = self._find_mm_placeholders(
             new_token_ids,
             dict(matched_updates),
         )
 
-        return new_token_ids, new_text, placeholders
+        return new_token_ids, placeholders
 
     def _validate_mm_kwargs(
         self,
@@ -1946,20 +1979,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                     "There is likely a problem with your "
                     "implementation of merged multi-modal processor for this "
                     "model (usually arising from an inconsistency between "
-                    "`_call_hf_processor` and `_get_mm_fields_config`).")
+                    "`_call_hf_processor` and `_get_mm_fields_config`)."
+                )
 
-    def _validate_mm_placeholders(
+    def _validate_mm_updates(
         self,
-        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+        mm_updates: MultiModalPromptUpdates,
         mm_item_counts: Mapping[str, int],
     ) -> None:
         for modality, item_count in mm_item_counts.items():
-            placeholders = mm_placeholders.get(modality, [])
+            placeholders = mm_updates.get(modality, [])
 
             if len(placeholders) != item_count:
-                # NOTE: If you are a model developer, this can also arise from
-                # an inconsistency between `_call_hf_processor` and
-                # `_get_mm_fields_config` implementations
                 raise RuntimeError(
                     f"Expected there to be {item_count} prompt updates "
                     f"corresponding to {item_count} {modality} items, but "
@@ -1967,7 +1998,25 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                     "This is likely because you forgot to include input "
                     "placeholder tokens (e.g., `<image>`, `<|image_pad|>`) "
                     "in the prompt. If the model has a chat template, make "
-                    "sure you have applied it before calling `LLM.generate`.")
+                    "sure you have applied it before calling `LLM.generate`."
+                )
+
+    def _validate_mm_placeholders(
+        self,
+        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+        mm_item_counts: Mapping[str, int],
+    ) -> None:
+        for modality, item_count in mm_item_counts.items():
+            placeholders = mm_placeholders.get(modality, [])
+
+            if len(placeholders) != item_count:
+                raise RuntimeError(
+                    f"Expected there to be {item_count} prompt placeholders "
+                    f"corresponding to {item_count} {modality} items, but "
+                    f"instead found {len(placeholders)} prompt placeholders! "
+                    "Make sure the implementation of `_call_hf_processor` and "
+                    "`_get_mm_fields_config` are consistent with each other."
+                )
 
     def _maybe_apply_prompt_updates(
         self,
@@ -1976,9 +2025,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_kwargs: MultiModalKwargsOptionalItems,
         mm_prompt_updates: MultiModalPromptUpdates,
         is_update_applied: bool,
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
+    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
         mm_item_counts = mm_items.get_all_counts()
         self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
+        self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
 
         if is_update_applied:
             mm_placeholders = self._find_mm_placeholders(
@@ -1986,30 +2036,23 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 mm_prompt_updates,
             )
             self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
-
-            tokenizer = self.info.get_tokenizer()
-            prompt = decode_tokens(tokenizer, prompt_ids)
         else:
-            (
-                prompt_ids,
-                prompt,
-                mm_placeholders,
-            ) = self._apply_prompt_updates(
+            prompt_ids, mm_placeholders = self._apply_prompt_updates(
                 prompt_ids,
                 mm_prompt_updates,
             )
             self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
 
-        return prompt_ids, prompt, mm_placeholders
+        return prompt_ids, mm_placeholders
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -2042,7 +2085,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         )
 
         # NOTE: tokenization_kwargs are not required to init processor
-        prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates(
+        prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
             mm_items=mm_items,
             prompt_ids=prompt_ids,
             mm_kwargs=mm_info.kwargs,
@@ -2057,7 +2100,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
         return MultiModalInputs(
             type="multimodal",
-            prompt=prompt,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_info.kwargs,
             mm_hashes=mm_info.hashes,
@@ -2066,13 +2108,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
 
 class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
-
     @abstractmethod
     def create_encoder_prompt(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-    ) -> Union[str, list[int]]:
+    ) -> str | list[int]:
         """
         Create input prompt for the encoder. HF processor will be applied on
         this prompt during profiling and generation.
@@ -2085,45 +2126,42 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
     def create_decoder_prompt(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-    ) -> Union[str, list[int]]:
+    ) -> str | list[int]:
         """Create input prompt for the decoder."""
         return prompt
 
     def _get_enc_dec_inputs(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         encoder_inputs: MultiModalInputs,
     ):
         tokenizer = self.info.get_tokenizer()
         decoder_prompt_raw = self.create_decoder_prompt(prompt, mm_data)
         if isinstance(decoder_prompt_raw, str):
-            decoder_prompt = decoder_prompt_raw
-            decoder_prompt_ids = encode_tokens(tokenizer,
-                                               decoder_prompt_raw,
-                                               add_special_tokens=False)
+            decoder_prompt_ids = encode_tokens(
+                tokenizer, decoder_prompt_raw, add_special_tokens=False
+            )
         else:
-            decoder_prompt = decode_tokens(tokenizer, decoder_prompt_raw)
             decoder_prompt_ids = decoder_prompt_raw
 
         mm_inputs = MultiModalEncDecInputs(
-            encoder_prompt=encoder_inputs["prompt"],
             encoder_prompt_token_ids=encoder_inputs["prompt_token_ids"],
-            **encoder_inputs)
-        mm_inputs["prompt"] = decoder_prompt
+            **encoder_inputs,
+        )
         mm_inputs["prompt_token_ids"] = decoder_prompt_ids
         return mm_inputs
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 26c5d188964c4337abfe5af9de031839bd6a5ffa..cb70041e9744f1ffab1037fad3f47a3662dc2ee3 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -3,20 +3,32 @@
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast
+from typing import Generic, NamedTuple, TypeVar, cast
 
 import numpy as np
 import numpy.typing as npt
 from PIL import Image
 
-import vllm.envs as envs
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
 from vllm.logger import init_logger
 
-from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
-                     MultiModalInputs, MultiModalKwargsItems,
-                     MultiModalPlaceholderDict)
-from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
-                         EncDecMultiModalProcessor)
+from .inputs import (
+    MultiModalDataDict,
+    MultiModalEncDecInputs,
+    MultiModalInputs,
+    MultiModalKwargsItems,
+    MultiModalPlaceholderDict,
+)
+from .processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    EncDecMultiModalProcessor,
+)
 
 logger = init_logger(__name__)
 
@@ -27,7 +39,8 @@ class ProcessorInputs:
     Represents the keyword arguments to
     [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
     """
-    prompt: Union[str, list[int]]
+
+    prompt: str | list[int]
     mm_data: MultiModalDataDict
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
     tokenization_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -73,10 +86,19 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         """
         Build the multimodal input which, after processing, results in
         the maximum possible number of placeholder tokens.
+
+        Args:
+            seq_len: Sequence length
+            mm_counts: Count of items per modality
+            mm_options: Configurable options per modality (optional).
+                       If None, use model defaults for backward compatibility.
+                       If provided, models can use these to customize dummy
+                       data generation.
         """
         raise NotImplementedError
 
@@ -84,28 +106,49 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> ProcessorInputs:
         """
         Build the input which, after processing, results in
         the maximum possible number of placeholder tokens.
+
+        Args:
+            seq_len: Sequence length
+            mm_counts: Count of items per modality
+            mm_options: Configurable options per modality (optional)
         """
         dummy_text = self.get_dummy_text(mm_counts)
-        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
+
+        # Use the unified function for both legacy and configurable cases
+        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
+
         tokenization_kwargs = {"truncation": False}
 
-        return ProcessorInputs(prompt=dummy_text,
-                               mm_data=dummy_mm_data,
-                               tokenization_kwargs=tokenization_kwargs)
+        return ProcessorInputs(
+            prompt=dummy_text,
+            mm_data=dummy_mm_data,
+            tokenization_kwargs=tokenization_kwargs,
+        )
 
     def _get_dummy_audios(
         self,
         *,
         length: int,
         num_audios: int,
+        overrides: AudioDummyOptions | None = None,
     ) -> list[npt.NDArray]:
         if num_audios == 0:
             return []
-        audio = np.zeros((length, ))
+        if overrides and overrides.length:
+            if overrides.length > length:
+                logger.warning(
+                    "audio.length override (%d) exceeds model's "
+                    "maximum length (%d), will be ignored",
+                    overrides.length,
+                    length,
+                )
+            length = min(length, overrides.length)
+        audio = np.zeros((length,))
         return [audio] * num_audios
 
     def _get_dummy_images(
@@ -114,9 +157,29 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         width: int,
         height: int,
         num_images: int,
+        overrides: ImageDummyOptions | None = None,
     ) -> list[Image.Image]:
         if num_images == 0:
             return []
+        if overrides:
+            if overrides.width:
+                if overrides.width > width:
+                    logger.warning(
+                        "image.width override (%d) exceeds model's "
+                        "maximum width (%d), will be ignored",
+                        overrides.width,
+                        width,
+                    )
+                width = min(width, overrides.width)
+            if overrides.height:
+                if overrides.height > height:
+                    logger.warning(
+                        "image.height override (%d) exceeds model's "
+                        "maximum height (%d), will be ignored",
+                        overrides.height,
+                        height,
+                    )
+                height = min(height, overrides.height)
         image = Image.new("RGB", (width, height), color=255)
         return [image] * num_images
 
@@ -127,10 +190,39 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         height: int,
         num_frames: int,
         num_videos: int,
+        overrides: VideoDummyOptions | None = None,
     ) -> list[npt.NDArray]:
         if num_videos == 0:
             return []
-        video = np.full((num_frames, width, height, 3), 255)
+        if overrides:
+            if overrides.num_frames:
+                if overrides.num_frames > num_frames:
+                    logger.warning(
+                        "video.num_frames override (%d) exceeds model's "
+                        "maximum number of frames (%d), will be ignored",
+                        overrides.num_frames,
+                        num_frames,
+                    )
+                num_frames = min(num_frames, overrides.num_frames)
+            if overrides.width:
+                if overrides.width > width:
+                    logger.warning(
+                        "video.width override (%d) exceeds model's "
+                        "maximum width (%d), will be ignored",
+                        overrides.width,
+                        width,
+                    )
+                width = min(width, overrides.width)
+            if overrides.height:
+                if overrides.height > height:
+                    logger.warning(
+                        "video.height override (%d) exceeds model's "
+                        "maximum height (%d), will be ignored",
+                        overrides.height,
+                        height,
+                    )
+                height = min(height, overrides.height)
+        video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
         return [video] * num_videos
 
 
@@ -161,14 +253,16 @@ class MultiModalProfiler(Generic[_I]):
     def _get_dummy_mm_inputs(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalInputs:
         if mm_counts is None:
             mm_counts = self.get_mm_limits()
 
         factory = self.dummy_inputs
         processor_inputs = factory.get_dummy_processor_inputs(
-            seq_len, mm_counts)
+            seq_len, mm_counts, mm_options
+        )
 
         return self.processor.apply(
             prompt=processor_inputs.prompt,
@@ -185,18 +279,20 @@ class MultiModalProfiler(Generic[_I]):
         placeholders_by_modality = mm_inputs["mm_placeholders"]
 
         return {
-            modality:
-            sum(item.get_num_embeds() if mm_embeddings_only else item.length
-                for item in placeholders)
+            modality: sum(
+                item.get_num_embeds() if mm_embeddings_only else item.length
+                for item in placeholders
+            )
             for modality, placeholders in placeholders_by_modality.items()
         }
 
     def get_encoder_dummy_data(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> DummyEncoderData:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options)
         mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)
 
         # For encoder-decoder models, use encoder prompt token ids instead of
@@ -209,27 +305,16 @@ class MultiModalProfiler(Generic[_I]):
         if processor.pad_dummy_encoder_prompt:
             num_tokens_to_pad = max(total_len, seq_len) - total_len
             encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
-        # NOTE: Whisper allows total_len > seq_len.
-        elif total_len > seq_len and not envs.VLLM_USE_V1:
-            # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning_once(
-                "The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
-                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
-                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
-                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
-                seq_len,
-                total_len,
-                str(self._get_mm_num_tokens(mm_inputs)),
-            )
 
         return DummyEncoderData(encoder_prompt_token_ids)
 
     def get_decoder_dummy_data(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> DummyDecoderData:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options)
 
         prompt_token_ids = mm_inputs["prompt_token_ids"]
         total_len = len(prompt_token_ids)
@@ -246,7 +331,7 @@ class MultiModalProfiler(Generic[_I]):
     def _get_mm_max_tokens(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
         mm_embeddings_only: bool = True,
     ) -> Mapping[str, int]:
         if mm_counts is None:
@@ -257,28 +342,28 @@ class MultiModalProfiler(Generic[_I]):
             mm_counts=mm_counts,
         )
         if max_tokens_per_item is not None:
-            return max_tokens_per_item
+            return {
+                modality: max_tokens
+                for modality, max_tokens in max_tokens_per_item.items()
+                if mm_counts.get(modality, 0) > 0
+            }
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
-        return self._get_mm_num_tokens(mm_inputs,
-                                       mm_embeddings_only=mm_embeddings_only)
+        return self._get_mm_num_tokens(mm_inputs, mm_embeddings_only=mm_embeddings_only)
 
     def get_mm_max_contiguous_tokens(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
-    ):
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int]:
         """
         Returns the maximum length of the multimodal (image placeholders+text)
         tokens, including any break/text tokens in-between image embeddings.
 
         `<im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>`
         Returns 9, even when the number of image embeddings is 6.
-        
+
         This is important to take into account when profiling and
         initializing the encoder cache size.
         """
-
-        return self._get_mm_max_tokens(seq_len,
-                                       mm_counts,
-                                       mm_embeddings_only=False)
+        return self._get_mm_max_tokens(seq_len, mm_counts, mm_embeddings_only=False)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 2bbc0078ad13a029e928c684688b23f132f2bb86..8f9276e8464072b893d88dc006ea2ca645fcbda4 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -2,20 +2,27 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
+from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
 
 import torch.nn as nn
 
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
-                                               cached_tokenizer_from_config)
-from vllm.utils import ClassRegistry
+from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
+from vllm.utils.collection_utils import ClassRegistry
 
 from .cache import BaseMultiModalProcessorCache
-from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
-                         InputProcessingContext)
-from .profiling import (BaseDummyInputsBuilder, DummyDecoderData,
-                        DummyEncoderData, MultiModalProfiler)
+from .processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    InputProcessingContext,
+)
+from .profiling import (
+    BaseDummyInputsBuilder,
+    DummyDecoderData,
+    DummyEncoderData,
+    MultiModalProfiler,
+)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -37,8 +44,7 @@ class ProcessingInfoFactory(Protocol[_I_co]):
     def __call__(
         self,
         ctx: InputProcessingContext,
-    ) -> _I_co:
-        ...
+    ) -> _I_co: ...
 
 
 class DummyInputsBuilderFactory(Protocol[_I]):  # type: ignore[misc]
@@ -48,11 +54,10 @@ class DummyInputsBuilderFactory(Protocol[_I]):  # type: ignore[misc]
     instance from the context.
     """
 
-    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
-        ...
+    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: ...
 
 
-class MultiModalProcessorFactory(Protocol[_I]):
+class MultiModalProcessorFactory(Protocol[_I]):  # type: ignore[misc]
     """
     Constructs a
     [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
@@ -64,9 +69,8 @@ class MultiModalProcessorFactory(Protocol[_I]):
         info: _I,
         dummy_inputs: BaseDummyInputsBuilder[_I],
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
-    ) -> BaseMultiModalProcessor[_I]:
-        ...
+        cache: BaseMultiModalProcessorCache | None = None,
+    ) -> BaseMultiModalProcessor[_I]: ...
 
 
 @dataclass(frozen=True)
@@ -79,7 +83,7 @@ class _ProcessorFactories(Generic[_I]):
         self,
         ctx: InputProcessingContext,
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ):
         info = self.info(ctx)
         dummy_inputs_builder = self.dummy_inputs(info)
@@ -92,14 +96,34 @@ class MultiModalRegistry:
     """
 
     def __init__(self) -> None:
-        self._processor_factories = ClassRegistry[nn.Module,
-                                                  _ProcessorFactories]()
+        self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]()
+
+    def _extract_mm_options(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, BaseDummyOptions] | None:
+        """
+        Extract multimodal dummy options from model config.
+
+        Returns None if no configurable options are found, otherwise returns
+        a mapping of modality names to their dummy options.
+        """
+        if not model_config.multimodal_config:
+            return None
+
+        mm_options = {
+            m: opt
+            for m in model_config.multimodal_config.limit_per_prompt
+            if (opt := model_config.multimodal_config.get_dummy_options(m)) is not None
+        }
+
+        return mm_options if len(mm_options) > 0 else None
 
     def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool:
         """
         Checks if the model supports multimodal inputs.
-        Returns True if the model is multimodal with any non-zero supported 
-        modalities, otherwise returns False, effectively running in 
+        Returns True if the model is multimodal with any non-zero supported
+        modalities, otherwise returns False, effectively running in
         text-only mode.
         """
         if not model_config.is_multimodal_model:
@@ -112,11 +136,13 @@ class MultiModalRegistry:
 
         # Check if all supported modalities have limit == 0
         if all(
-                mm_config.get_limit_per_prompt(modality) == 0
-                for modality in supported_modalities):
+            mm_config.get_limit_per_prompt(modality) == 0
+            for modality in supported_modalities
+        ):
             logger.info_once(
                 "All limits of multimodal modalities supported by the model "
-                "are set to 0, running in text-only mode.")
+                "are set to 0, running in text-only mode."
+            )
             return False
 
         return True
@@ -125,7 +151,8 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
+        profiler_limits: Mapping[str, int] | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of tokens per data item from each modality based
@@ -135,51 +162,23 @@ class MultiModalRegistry:
             return {}
 
         processor = self.create_processor(model_config, cache=cache)
-        profiler = MultiModalProfiler(processor)
+        profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         seq_len = model_config.max_model_len
-        mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache)
+        profiler_limits = (
+            profiler.get_mm_limits() if profiler_limits is None else profiler_limits
+        )
 
         return profiler.get_mm_max_contiguous_tokens(
             seq_len,
-            {
-                modality: 1
-                for modality, limit in mm_limits.items() if limit > 0
-            },
+            {modality: 1 for modality, limit in profiler_limits.items() if limit > 0},
         )
 
-    def get_max_tokens_per_item_by_nonzero_modality(
-        self,
-        model_config: "ModelConfig",
-        *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
-    ) -> Mapping[str, int]:
-        """
-        Get the maximum number of tokens per data item from each modality based
-        on underlying model configuration, excluding modalities that user
-        explicitly disabled via `limit_mm_per_prompt`.
-
-        Note:
-            This is currently directly used only in V1 for profiling the memory
-            usage of a model.
-        """
-        mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache)
-        max_tokens_per_item = self.get_max_tokens_per_item_by_modality(
-            model_config,
-            cache=cache,
-        )
-
-        return {
-            key: max_tokens_per_mm_item
-            for key, max_tokens_per_mm_item in max_tokens_per_item.items()
-            if mm_limits[key] > 0
-        }
-
     def get_mm_limits_per_prompt(
         self,
         model_config: "ModelConfig",
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of multi-modal input instances for each modality
@@ -189,7 +188,7 @@ class MultiModalRegistry:
             return {}
 
         processor = self.create_processor(model_config, cache=cache)
-        profiler = MultiModalProfiler(processor)
+        profiler: MultiModalProfiler = MultiModalProfiler(processor)
         return profiler.get_mm_limits()
 
     def register_processor(
@@ -212,7 +211,9 @@ class MultiModalRegistry:
                 logger.warning(
                     "Model class %s already has a multi-modal processor "
                     "registered to %s. It is overwritten by the new one.",
-                    model_cls, self)
+                    model_cls,
+                    self,
+                )
 
             self._processor_factories[model_cls] = _ProcessorFactories(
                 info=info,
@@ -234,7 +235,7 @@ class MultiModalRegistry:
     def _create_processing_ctx(
         self,
         model_config: "ModelConfig",
-        tokenizer: Optional[AnyTokenizer] = None,
+        tokenizer: AnyTokenizer | None = None,
     ) -> InputProcessingContext:
         if tokenizer is None and not model_config.skip_tokenizer_init:
             tokenizer = cached_tokenizer_from_config(model_config)
@@ -244,7 +245,7 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        tokenizer: Optional[AnyTokenizer] = None,
+        tokenizer: AnyTokenizer | None = None,
     ) -> BaseProcessingInfo:
         model_cls = self._get_model_cls(model_config)
         factories = self._processor_factories[model_cls]
@@ -255,8 +256,8 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        tokenizer: Optional[AnyTokenizer] = None,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        tokenizer: AnyTokenizer | None = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
@@ -275,25 +276,32 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> DummyDecoderData:
         """
         Create dummy data for profiling the memory usage of a model.
 
-        The model is identified by ``model_config``.
+        The model is identified by `model_config`.
         """
         processor = self.create_processor(model_config, cache=cache)
-        profiler = MultiModalProfiler(processor)
-        dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts)
+        profiler: MultiModalProfiler = MultiModalProfiler(processor)
+
+        # Extract configurable options from multimodal config.
+        # Only include modalities that use advanced option types so legacy
+        # count-only behavior remains unchanged.
+        mm_options = self._extract_mm_options(model_config)
+
+        dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)
 
         # Having more tokens is over-conservative but otherwise fine
         token_ids = dummy_data.prompt_token_ids
         if len(token_ids) < seq_len:
             raise AssertionError(
                 f"Expected at least {seq_len} dummy tokens for profiling, "
-                f"but found {len(token_ids)} tokens instead.")
+                f"but found {len(token_ids)} tokens instead."
+            )
 
         return dummy_data
 
@@ -301,18 +309,24 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> DummyEncoderData:
         """
         Create dummy data for profiling the memory usage of a model.
 
-        The model is identified by ``model_config``.
+        The model is identified by `model_config`.
         """
         processor = self.create_processor(model_config, cache=cache)
-        profiler = MultiModalProfiler(processor)
-        dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts)
+        profiler: MultiModalProfiler = MultiModalProfiler(processor)
+
+        # Extract configurable options from multimodal config.
+        # Only include modalities that use advanced option types so legacy
+        # count-only behavior remains unchanged.
+        mm_options = self._extract_mm_options(model_config)
+
+        dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)
 
         # Having more tokens is over-conservative but otherwise fine
         token_ids = dummy_data.prompt_token_ids
@@ -331,15 +345,16 @@ class MultiModalRegistry:
         """
         if not model_config.is_encoder_decoder:
             return 0
-        max_tokens = self.\
-            get_max_tokens_per_item_by_nonzero_modality(model_config)
+        max_tokens = self.get_max_tokens_per_item_by_modality(model_config)
         if not max_tokens:
             # TODO - this function assumes encoder-decoder models are
             # multimodal. This will need to change when adding support for more
             # than whisper.
             return 0
-        assert len(max_tokens) == 1, "Encoder-decoder models are expected \
+        assert len(max_tokens) == 1, (
+            "Encoder-decoder models are expected \
             to implement the multimodal interface with at most one modality."
+        )
 
         first_modality = next(iter(max_tokens))
         return max_tokens[first_modality]
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index bab12fd1681a3ede8b7d8e25b7d7d28efb23d3bf..3f55c46ca334ddc8e59e389b6a35502429c0a14d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -3,11 +3,11 @@
 
 import asyncio
 import atexit
-from collections.abc import Iterable
+from collections.abc import Iterable, Set
 from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar
 from urllib.parse import ParseResult, urlparse
 from urllib.request import url2pathname
 
@@ -15,48 +15,56 @@ import numpy as np
 import numpy.typing as npt
 import torch
 from PIL import Image, UnidentifiedImageError
-from typing_extensions import deprecated
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
+from vllm.logger import init_logger
 from vllm.utils.jsontree import json_map_leaves
+from vllm.utils.registry import ExtensionManager
 
 from .audio import AudioMediaIO
 from .base import MediaIO
 from .image import ImageEmbeddingMediaIO, ImageMediaIO
 from .video import VideoMediaIO
 
-_M = TypeVar("_M")
-
 if TYPE_CHECKING:
-    from .inputs import (BatchedTensorInputs, MultiModalKwargsItem,
-                         MultiModalKwargsItems, MultiModalPlaceholderDict)
+    from .inputs import (
+        BatchedTensorInputs,
+        MultiModalKwargsItem,
+        MultiModalPlaceholderDict,
+    )
 else:
     BatchedTensorInputs = Any
     MultiModalKwargsItem = Any
-    MultiModalKwargsItems = Any
     MultiModalPlaceholderDict = Any
 
+logger = init_logger(__name__)
+
 global_thread_pool = ThreadPoolExecutor(
-    max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT)
+    max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT
+)
 atexit.register(global_thread_pool.shutdown)
 
+_M = TypeVar("_M")
+
+MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
 
-class MediaConnector:
 
+@MEDIA_CONNECTOR_REGISTRY.register("http")
+class MediaConnector:
     def __init__(
         self,
-        media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
         connection: HTTPConnection = global_http_connection,
         *,
         allowed_local_media_path: str = "",
-        allowed_media_domains: Optional[list[str]] = None,
+        allowed_media_domains: list[str] | None = None,
     ) -> None:
         """
         Args:
-            media_io_kwargs: Additional args passed to process media 
-                             inputs, keyed by modalities. For example, 
-                             to set num_frames for video, set 
+            media_io_kwargs: Additional args passed to process media
+                             inputs, keyed by modalities. For example,
+                             to set num_frames for video, set
                              `--media-io-kwargs '{"video":{"num_frames":40}}'`
             connection: HTTP connection client to download media contents.
             allowed_local_media_path: A local directory to load media files
@@ -64,8 +72,9 @@ class MediaConnector:
         """
         super().__init__()
 
-        self.media_io_kwargs: dict[str, dict[
-            str, Any]] = media_io_kwargs if media_io_kwargs else {}
+        self.media_io_kwargs: dict[str, dict[str, Any]] = (
+            media_io_kwargs if media_io_kwargs else {}
+        )
         self.connection = connection
 
         if allowed_local_media_path:
@@ -74,11 +83,13 @@ class MediaConnector:
             if not allowed_local_media_path_.exists():
                 raise ValueError(
                     "Invalid `--allowed-local-media-path`: The path "
-                    f"{allowed_local_media_path_} does not exist.")
+                    f"{allowed_local_media_path_} does not exist."
+                )
             if not allowed_local_media_path_.is_dir():
                 raise ValueError(
                     "Invalid `--allowed-local-media-path`: The path "
-                    f"{allowed_local_media_path_} must be a directory.")
+                    f"{allowed_local_media_path_} must be a directory."
+                )
         else:
             allowed_local_media_path_ = None
 
@@ -108,31 +119,36 @@ class MediaConnector:
     ) -> _M:  # type: ignore[type-var]
         allowed_local_media_path = self.allowed_local_media_path
         if allowed_local_media_path is None:
-            raise RuntimeError("Cannot load local files without "
-                               "`--allowed-local-media-path`.")
+            raise RuntimeError(
+                "Cannot load local files without `--allowed-local-media-path`."
+            )
 
         filepath = Path(url2pathname(url_spec.path))
         if allowed_local_media_path not in filepath.resolve().parents:
             raise ValueError(
                 f"The file path {filepath} must be a subpath "
-                f"of `--allowed-local-media-path` {allowed_local_media_path}.")
+                f"of `--allowed-local-media-path` {allowed_local_media_path}."
+            )
 
         return media_io.load_file(filepath)
 
     def _assert_url_in_allowed_media_domains(self, url_spec) -> None:
-        if self.allowed_media_domains and url_spec.hostname not in \
-            self.allowed_media_domains:
+        if (
+            self.allowed_media_domains
+            and url_spec.hostname not in self.allowed_media_domains
+        ):
             raise ValueError(
                 f"The URL must be from one of the allowed domains: "
                 f"{self.allowed_media_domains}. Input URL domain: "
-                f"{url_spec.hostname}")
+                f"{url_spec.hostname}"
+            )
 
     def load_from_url(
         self,
         url: str,
         media_io: MediaIO[_M],
         *,
-        fetch_timeout: Optional[int] = None,
+        fetch_timeout: int | None = None,
     ) -> _M:  # type: ignore[type-var]
         url_spec = urlparse(url)
 
@@ -162,7 +178,7 @@ class MediaConnector:
         url: str,
         media_io: MediaIO[_M],
         *,
-        fetch_timeout: Optional[int] = None,
+        fetch_timeout: int | None = None,
     ) -> _M:
         url_spec = urlparse(url)
         loop = asyncio.get_running_loop()
@@ -176,20 +192,19 @@ class MediaConnector:
                 timeout=fetch_timeout,
                 allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
             )
-            future = loop.run_in_executor(global_thread_pool,
-                                          media_io.load_bytes, data)
+            future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
             return await future
 
         if url_spec.scheme == "data":
-            future = loop.run_in_executor(global_thread_pool,
-                                          self._load_data_url, url_spec,
-                                          media_io)
+            future = loop.run_in_executor(
+                global_thread_pool, self._load_data_url, url_spec, media_io
+            )
             return await future
 
         if url_spec.scheme == "file":
-            future = loop.run_in_executor(global_thread_pool,
-                                          self._load_file_url, url_spec,
-                                          media_io)
+            future = loop.run_in_executor(
+                global_thread_pool, self._load_file_url, url_spec, media_io
+            )
             return await future
         msg = "The URL must be either a HTTP, data or file URL."
         raise ValueError(msg)
@@ -197,7 +212,7 @@ class MediaConnector:
     def fetch_audio(
         self,
         audio_url: str,
-    ) -> tuple[np.ndarray, Union[int, float]]:
+    ) -> tuple[np.ndarray, int | float]:
         """
         Load audio from a URL.
         """
@@ -212,7 +227,7 @@ class MediaConnector:
     async def fetch_audio_async(
         self,
         audio_url: str,
-    ) -> tuple[np.ndarray, Union[int, float]]:
+    ) -> tuple[np.ndarray, int | float]:
         """
         Asynchronously fetch audio from a URL.
         """
@@ -235,8 +250,9 @@ class MediaConnector:
 
         By default, the image is converted into RGB format.
         """
-        image_io = ImageMediaIO(image_mode=image_mode,
-                                **self.media_io_kwargs.get("image", {}))
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
 
         try:
             return self.load_from_url(
@@ -259,8 +275,9 @@ class MediaConnector:
 
         By default, the image is converted into RGB format.
         """
-        image_io = ImageMediaIO(image_mode=image_mode,
-                                **self.media_io_kwargs.get("image", {}))
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
 
         try:
             return await self.load_from_url_async(
@@ -281,10 +298,10 @@ class MediaConnector:
         """
         Load video from an HTTP or base64 data URL.
         """
-        image_io = ImageMediaIO(image_mode=image_mode,
-                                **self.media_io_kwargs.get("image", {}))
-        video_io = VideoMediaIO(image_io,
-                                **self.media_io_kwargs.get("video", {}))
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
+        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
 
         return self.load_from_url(
             video_url,
@@ -303,10 +320,10 @@ class MediaConnector:
 
         By default, the image is converted into RGB format.
         """
-        image_io = ImageMediaIO(image_mode=image_mode,
-                                **self.media_io_kwargs.get("image", {}))
-        video_io = VideoMediaIO(image_io,
-                                **self.media_io_kwargs.get("video", {}))
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
+        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
 
         return await self.load_from_url_async(
             video_url,
@@ -357,7 +374,8 @@ def encode_video_base64(frames: npt.NDArray) -> str:
 
 
 def argsort_mm_positions(
-        mm_positions: MultiModalPlaceholderDict) -> list[tuple[str, int]]:
+    mm_positions: MultiModalPlaceholderDict,
+) -> list[tuple[str, int]]:
     """
     Given a `MultiModalPlaceholderDict`, output a sequence of keys to
     sort the dictionary by `offset` (starting index in the input sequence)
@@ -367,48 +385,24 @@ def argsort_mm_positions(
         A list of `(modality, idx)`, which can be used to access an item
         by `mm_positions[modality][idx]`.
     """
-    flat_items = ((modality, idx, item)
-                  for modality, items in mm_positions.items()
-                  for idx, item in enumerate(items))
+    flat_items = (
+        (modality, idx, item)
+        for modality, items in mm_positions.items()
+        for idx, item in enumerate(items)
+    )
 
     sorted_flat_items = sorted(flat_items, key=lambda x: x[2].offset)
 
     return [(modality, idx) for modality, idx, _ in sorted_flat_items]
 
 
-# Temporary back-compatibility for plugins that define model runner
-@deprecated("`group_mm_inputs_by_modality` is superseded by "
-            "`group_mm_kwargs_by_modality` and will be removed in v0.13. "
-            "Please use `group_mm_kwargs_by_modality` instead.")
-def group_mm_inputs_by_modality(
-    mm_inputs: list[MultiModalKwargsItems]
-) -> list[list[MultiModalKwargsItems]]:
-    if not mm_inputs:
-        return []
-
-    def modality_group_func(
-            mm_input: MultiModalKwargsItems) -> Union[str, int]:
-        # If the input has multiple modalities, return an id as the unique key
-        # for the mm_input input.
-        if len(mm_input) > 1:
-            return id(mm_input)
-
-        elif len(mm_input) == 1:
-            return next(iter(mm_input.keys()))
-
-        raise AssertionError("This line should be unreachable.")
-
-    return [
-        list(group) for _, group in groupby(mm_inputs, key=modality_group_func)
-    ]
-
-
 def group_mm_kwargs_by_modality(
     mm_kwargs: list[MultiModalKwargsItem],
     *,
     device: torch.types.Device = None,
     pin_memory: bool = False,
-    merge_by_field_config: bool = False,
+    merge_by_field_config: bool | None = None,
+    multimodal_cpu_fields: Set[str] = frozenset(),
 ) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
     """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
     modality together into the same `MultiModalKwargs` instance.
@@ -421,25 +415,46 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
+    if merge_by_field_config is None:
+        raise RuntimeError(
+            "`group_mm_kwargs_by_modality` now requires "
+            "`merge_by_field_config` arg, please update your model runner "
+            "according to https://github.com/vllm-project/vllm/pull/25676."
+        )
+    if merge_by_field_config is False:
+        logger.warning_once(
+            "The legacy code for batching multi-modal kwargs is deprecated and "
+            "will be removed in v0.12. Please update your model with "
+            "`merge_by_field_config=True` to use the new code defined by "
+            "`MultiModalFieldConfig`. You can refer to "
+            "https://github.com/vllm-project/vllm/issues/26149 "
+            "for some examples on how to do this."
+        )
+
     from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems
 
     for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
         items_lst = list(items)
 
-        # TODO: Enable `merge_by_field_config` for all models
-        # to avoid creating an extra batch dimension (except for fields
-        # that are meant to be stacked anyway).
-        # We will also need to update each model to remove `flatten_bn`.
         if merge_by_field_config:
             mm_kwargs_group: BatchedTensorInputs = dict(
                 MultiModalKwargsItems.from_seq(items_lst).get_data(
-                    pin_memory=pin_memory))
+                    pin_memory=pin_memory
+                )
+            )
 
             if device is not None:
-                mm_kwargs_group = json_map_leaves(
-                    lambda x: x.to(device=device),
-                    mm_kwargs_group,
-                )
+                mm_kwargs_group = {
+                    k: json_map_leaves(
+                        lambda x: x.to(device=device, non_blocking=True)
+                        if isinstance(x, torch.Tensor)
+                        else x,
+                        v,
+                    )
+                    if k not in multimodal_cpu_fields
+                    else v
+                    for k, v in mm_kwargs_group.items()
+                }
         else:
             mm_kwargs_group = MultiModalKwargs.as_kwargs(
                 MultiModalKwargs.batch(
@@ -457,47 +472,41 @@ def group_mm_kwargs_by_modality(
 
 def fetch_audio(
     audio_url: str,
-    audio_io_kwargs: Optional[dict[str, Any]] = None,
-) -> tuple[np.ndarray, Union[int, float]]:
+    audio_io_kwargs: dict[str, Any] | None = None,
+) -> tuple[np.ndarray, int | float]:
     """
     Args:
         audio_url: URL of the audio file to fetch.
         audio_io_kwargs: Additional kwargs passed to handle audio IO.
     """
-    media_io_kwargs = None if not audio_io_kwargs else {
-        "audio": audio_io_kwargs
-    }
+    media_io_kwargs = None if not audio_io_kwargs else {"audio": audio_io_kwargs}
     media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
     return media_connector.fetch_audio(audio_url)
 
 
 def fetch_image(
     image_url: str,
-    image_io_kwargs: Optional[dict[str, Any]] = None,
+    image_io_kwargs: dict[str, Any] | None = None,
 ) -> Image.Image:
     """
     Args:
         image_url: URL of the image file to fetch.
         image_io_kwargs: Additional kwargs passed to handle image IO.
     """
-    media_io_kwargs = None if not image_io_kwargs else {
-        "image": image_io_kwargs
-    }
+    media_io_kwargs = None if not image_io_kwargs else {"image": image_io_kwargs}
     media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
     return media_connector.fetch_image(image_url)
 
 
 def fetch_video(
     video_url: str,
-    video_io_kwargs: Optional[dict[str, Any]] = None,
+    video_io_kwargs: dict[str, Any] | None = None,
 ) -> tuple[npt.NDArray, dict[str, Any]]:
     """
     Args:
         video_url: URL of the video file to fetch.
         video_io_kwargs: Additional kwargs passed to handle video IO.
     """
-    media_io_kwargs = None if not video_io_kwargs else {
-        "video": video_io_kwargs
-    }
+    media_io_kwargs = None if not video_io_kwargs else {"video": video_io_kwargs}
     media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
     return media_connector.fetch_video(video_url)
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 6981f2ce56239a330b93bee64812a8866b916bdb..369c5e6cb4d108979dd93ae70b8a81a331ce963e 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,23 +6,28 @@ from abc import abstractmethod
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
 from PIL import Image
 
 from vllm import envs
+from vllm.logger import init_logger
+from vllm.utils.registry import ExtensionManager
 
 from .base import MediaIO
 from .image import ImageMediaIO
 
+logger = init_logger(__name__)
+
 
 def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
     num_frames, _, _, channels = frames.shape
     new_height, new_width = size
-    resized_frames = np.empty((num_frames, new_height, new_width, channels),
-                              dtype=frames.dtype)
+    resized_frames = np.empty(
+        (num_frames, new_height, new_width, channels), dtype=frames.dtype
+    )
     # lazy import cv2 to avoid bothering users who only use text models
     import cv2
 
@@ -40,8 +45,7 @@ def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
     return resize_video(frames, (new_height, new_width))
 
 
-def sample_frames_from_video(frames: npt.NDArray,
-                             num_frames: int) -> npt.NDArray:
+def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArray:
     total_frames = frames.shape[0]
     if num_frames == -1:
         return frames
@@ -52,42 +56,19 @@ def sample_frames_from_video(frames: npt.NDArray,
 
 
 class VideoLoader:
-
     @classmethod
     @abstractmethod
-    def load_bytes(cls,
-                   data: bytes,
-                   num_frames: int = -1,
-                   **kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
+    def load_bytes(
+        cls, data: bytes, num_frames: int = -1, **kwargs
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
         raise NotImplementedError
 
 
-class VideoLoaderRegistry:
-
-    def __init__(self) -> None:
-        self.name2class: dict[str, type] = {}
-
-    def register(self, name: str):
-
-        def wrap(cls_to_register):
-            self.name2class[name] = cls_to_register
-            return cls_to_register
-
-        return wrap
-
-    @staticmethod
-    def load(cls_name: str) -> VideoLoader:
-        cls = VIDEO_LOADER_REGISTRY.name2class.get(cls_name)
-        assert cls is not None, f"VideoLoader class {cls_name} not found"
-        return cls()
-
-
-VIDEO_LOADER_REGISTRY = VideoLoaderRegistry()
+VIDEO_LOADER_REGISTRY = ExtensionManager()
 
 
 @VIDEO_LOADER_REGISTRY.register("opencv")
 class OpenCVVideoBackend(VideoLoader):
-
     def get_cv2_video_api(self):
         import cv2.videoio_registry as vr
 
@@ -108,6 +89,7 @@ class OpenCVVideoBackend(VideoLoader):
         cls,
         data: bytes,
         num_frames: int = -1,
+        fps: int = -1,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         import cv2
@@ -121,16 +103,21 @@ class OpenCVVideoBackend(VideoLoader):
         original_fps = cap.get(cv2.CAP_PROP_FPS)
         duration = total_frames_num / original_fps if original_fps > 0 else 0
 
-        # resample video to target num_frames
-        full_read = num_frames == -1 or total_frames_num < num_frames
-        if full_read:
-            num_frames = total_frames_num
-            frame_idx = list(range(0, num_frames))
+        # resample video to target num_frames and fps
+        # - the minimum of the two will be used
+        num_frames_to_sample = total_frames_num
+        if num_frames > 0:
+            num_frames_to_sample = min(num_frames, total_frames_num)
+        if fps > 0:
+            num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
+        num_frames_to_sample = max(1, num_frames_to_sample)  # at least one sample
+
+        if num_frames_to_sample == total_frames_num:
+            frame_idx = list(range(0, num_frames_to_sample))
         else:
-            uniform_sampled_frames = np.linspace(0,
-                                                 total_frames_num - 1,
-                                                 num_frames,
-                                                 dtype=int)
+            uniform_sampled_frames = np.linspace(
+                0, total_frames_num - 1, num_frames_to_sample, dtype=int
+            )
             frame_idx = uniform_sampled_frames.tolist()
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -138,7 +125,7 @@ class OpenCVVideoBackend(VideoLoader):
         frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
 
         i = 0
-        for idx in range(total_frames_num):
+        for idx in range(max(frame_idx) + 1):
             ok = cap.grab()
             if not ok:
                 break
@@ -148,21 +135,23 @@ class OpenCVVideoBackend(VideoLoader):
                     frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     i += 1
 
-        assert i == num_frames, (f"Expected reading {num_frames} frames, "
-                                 f"but only loaded {i} frames from video.")
+        assert i == num_frames_to_sample, (
+            f"Expected reading {num_frames_to_sample} frames, "
+            f"but only loaded {i} frames from video."
+        )
 
         # Use transformers transformers.video_utils.VideoMetadata format
         # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
         # can cause incorrect timestamp calculation without num_frames=-1.
         metadata = {
-            "total_num_frames": num_frames,
-            "fps": num_frames / duration,
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
             "duration": duration,
             "video_backend": "opencv",
-            "frames_indices": list(range(num_frames)),
+            "frames_indices": list(frame_idx),
             # extra field used to control hf processor's video
             # sampling behavior
-            "do_sample_frames": num_frames == total_frames_num,
+            "do_sample_frames": num_frames_to_sample == total_frames_num,
         }
 
         return frames, metadata
@@ -170,7 +159,6 @@ class OpenCVVideoBackend(VideoLoader):
 
 @VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
 class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
-
     @classmethod
     def load_bytes(
         cls,
@@ -197,31 +185,31 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
 
         # Refer to:
         # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
-        frame_indices: Union[range, list[int]]
+        frame_indices: range | list[int]
         if duration <= max_duration:
             n = int(math.floor(duration * fps))
-            frame_indices = sorted({
-                min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
-                for i in range(n)
-            })
+            frame_indices = sorted(
+                {
+                    min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
+                    for i in range(n)
+                }
+            )
         else:
             num_samples = int(max_duration * fps)
             if num_samples >= total_frames_num:
                 frame_indices = range(total_frames_num)
             else:
-                target_seconds = np.linspace(0,
-                                             duration,
-                                             num_samples,
-                                             endpoint=True)
-                frame_indices = sorted({
-                    min(max_frame_idx, int(math.ceil(t * original_fps)))
-                    for t in target_seconds
-                })
+                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
+                frame_indices = sorted(
+                    {
+                        min(max_frame_idx, int(math.ceil(t * original_fps)))
+                        for t in target_seconds
+                    }
+                )
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_indices), height, width, 3),
-                          dtype=np.uint8)
+        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
 
         i = 0
         for idx in range(total_frames_num):
@@ -236,7 +224,8 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
 
         assert i == len(frame_indices), (
             f"Expected reading {len(frame_indices)} frames, "
-            f"but only loaded {i} frames from video.")
+            f"but only loaded {i} frames from video."
+        )
 
         # Use transformers transformers.video_utils.VideoMetadata format
         metadata = {
@@ -252,7 +241,6 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
 
 
 class VideoMediaIO(MediaIO[npt.NDArray]):
-
     def __init__(
         self,
         image_io: ImageMediaIO,
@@ -273,22 +261,22 @@ class VideoMediaIO(MediaIO[npt.NDArray]):
         self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)
 
     def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
-        return self.video_loader.load_bytes(data,
-                                            num_frames=self.num_frames,
-                                            **self.kwargs)
+        return self.video_loader.load_bytes(
+            data, num_frames=self.num_frames, **self.kwargs
+        )
 
-    def load_base64(self, media_type: str,
-                    data: str) -> tuple[npt.NDArray, dict[str, Any]]:
+    def load_base64(
+        self, media_type: str, data: str
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
         if media_type.lower() == "video/jpeg":
             load_frame = partial(
                 self.image_io.load_base64,
                 "image/jpeg",
             )
 
-            return np.stack([
-                np.asarray(load_frame(frame_data))
-                for frame_data in data.split(",")
-            ]), {}
+            return np.stack(
+                [np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
+            ), {}
 
         return self.load_bytes(base64.b64decode(data))
 
@@ -312,8 +300,7 @@ class VideoMediaIO(MediaIO[npt.NDArray]):
                 image_format=video_format,
             )
 
-            return ",".join(
-                encode_frame(Image.fromarray(frame)) for frame in video)
+            return ",".join(encode_frame(Image.fromarray(frame)) for frame in video)
 
         msg = "Only JPEG format is supported for now."
         raise NotImplementedError(msg)
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 4d8206bb2d830bc6888f096bf349581cfc4c5bea..cdfe06f1c7faecbd16a2c96e7341054fece3d799 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -4,7 +4,7 @@
 from collections.abc import MutableSequence
 from collections.abc import Sequence as GenericSequence
 from dataclasses import dataclass
-from typing import Any, Generic, Optional, Union
+from typing import Any, Generic
 
 import torch
 from typing_extensions import TypeVar
@@ -14,6 +14,7 @@ from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
 from vllm.sequence import RequestMetrics
+from vllm.v1.metrics.stats import RequestStateStats
 
 logger = init_logger(__name__)
 
@@ -40,23 +41,25 @@ class CompletionOutput:
     index: int
     text: str
     token_ids: GenericSequence[int]
-    cumulative_logprob: Optional[float]
-    logprobs: Optional[SampleLogprobs]
-    finish_reason: Optional[str] = None
-    stop_reason: Union[int, str, None] = None
-    lora_request: Optional[LoRARequest] = None
+    cumulative_logprob: float | None
+    logprobs: SampleLogprobs | None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+    lora_request: LoRARequest | None = None
 
     def finished(self) -> bool:
         return self.finish_reason is not None
 
     def __repr__(self) -> str:
-        return (f"CompletionOutput(index={self.index}, "
-                f"text={self.text!r}, "
-                f"token_ids={self.token_ids}, "
-                f"cumulative_logprob={self.cumulative_logprob}, "
-                f"logprobs={self.logprobs}, "
-                f"finish_reason={self.finish_reason}, "
-                f"stop_reason={self.stop_reason})")
+        return (
+            f"CompletionOutput(index={self.index}, "
+            f"text={self.text!r}, "
+            f"token_ids={self.token_ids}, "
+            f"cumulative_logprob={self.cumulative_logprob}, "
+            f"logprobs={self.logprobs}, "
+            f"finish_reason={self.finish_reason}, "
+            f"stop_reason={self.stop_reason})"
+        )
 
 
 @dataclass
@@ -66,14 +69,16 @@ class PoolingOutput:
     Args:
         data: The extracted hidden states.
     """
+
     data: torch.Tensor
 
     def __repr__(self) -> str:
-        return (f"PoolingOutput(data={self.data})")
+        return f"PoolingOutput(data={self.data})"
 
     def __eq__(self, other: object) -> bool:
-        return (isinstance(other, self.__class__) and bool(
-            (self.data == other.data).all()))
+        return isinstance(other, self.__class__) and bool(
+            (self.data == other.data).all()
+        )
 
 
 class RequestOutput:
@@ -103,26 +108,27 @@ class RequestOutput:
     def __init__(
         self,
         request_id: str,
-        prompt: Optional[str],
-        prompt_token_ids: Optional[list[int]],
-        prompt_logprobs: Optional[PromptLogprobs],
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_logprobs: PromptLogprobs | None,
         outputs: list[CompletionOutput],
         finished: bool,
-        metrics: Optional[RequestMetrics] = None,
-        lora_request: Optional[LoRARequest] = None,
-        encoder_prompt: Optional[str] = None,
-        encoder_prompt_token_ids: Optional[list[int]] = None,
-        num_cached_tokens: Optional[int] = None,
+        metrics: RequestMetrics | RequestStateStats | None = None,
+        lora_request: LoRARequest | None = None,
+        encoder_prompt: str | None = None,
+        encoder_prompt_token_ids: list[int] | None = None,
+        num_cached_tokens: int | None = None,
         *,
-        multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
-        kv_transfer_params: Optional[dict[str, Any]] = None,
+        multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
+        kv_transfer_params: dict[str, Any] | None = None,
         # Forward compatibility, code that uses args added in new release can
         # still run with older versions of vLLM without breaking.
         **kwargs: Any,
     ) -> None:
         if kwargs:
-            logger.warning_once("RequestOutput: Ignoring extra arguments: %s",
-                                str(kwargs))
+            logger.warning_once(
+                "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
+            )
         self.request_id = request_id
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
@@ -149,16 +155,15 @@ class RequestOutput:
                     if aggregate:
                         # Merge outputs with same index
                         completion.text += next_completion.text
-                        if not isinstance(completion.token_ids,
-                                          MutableSequence):
+                        if not isinstance(completion.token_ids, MutableSequence):
                             completion.token_ids = list(completion.token_ids)
                         completion.token_ids.extend(next_completion.token_ids)
                         if next_completion.logprobs:
                             assert completion.logprobs is not None
-                            completion.logprobs.extend(
-                                next_completion.logprobs)
+                            completion.logprobs.extend(next_completion.logprobs)
                         completion.cumulative_logprob = (
-                            next_completion.cumulative_logprob)
+                            next_completion.cumulative_logprob
+                        )
                         completion.finish_reason = next_completion.finish_reason
                         completion.stop_reason = next_completion.stop_reason
                     else:
@@ -169,18 +174,20 @@ class RequestOutput:
                 self.outputs.append(next_completion)
 
     def __repr__(self) -> str:
-        return (f"RequestOutput(request_id={self.request_id}, "
-                f"prompt={self.prompt!r}, "
-                f"prompt_token_ids={self.prompt_token_ids}, "
-                f"encoder_prompt={self.encoder_prompt!r}, "
-                f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
-                f"prompt_logprobs={self.prompt_logprobs}, "
-                f"outputs={self.outputs}, "
-                f"finished={self.finished}, "
-                f"metrics={self.metrics}, "
-                f"lora_request={self.lora_request}, "
-                f"num_cached_tokens={self.num_cached_tokens}, "
-                f"multi_modal_placeholders={self.multi_modal_placeholders})")
+        return (
+            f"RequestOutput(request_id={self.request_id}, "
+            f"prompt={self.prompt!r}, "
+            f"prompt_token_ids={self.prompt_token_ids}, "
+            f"encoder_prompt={self.encoder_prompt!r}, "
+            f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
+            f"prompt_logprobs={self.prompt_logprobs}, "
+            f"outputs={self.outputs}, "
+            f"finished={self.finished}, "
+            f"metrics={self.metrics}, "
+            f"lora_request={self.lora_request}, "
+            f"num_cached_tokens={self.num_cached_tokens}, "
+            f"multi_modal_placeholders={self.multi_modal_placeholders})"
+        )
 
 
 _O = TypeVar("_O", default=PoolingOutput)
@@ -194,21 +201,32 @@ class PoolingRequestOutput(Generic[_O]):
         request_id (str): A unique identifier for the pooling request.
         outputs (PoolingOutput): The pooling results for the given input.
         prompt_token_ids (list[int]): A list of token IDs used in the prompt.
+        num_cached_tokens: The number of tokens with prefix cache hit.
         finished (bool): A flag indicating whether the pooling is completed.
     """
 
-    def __init__(self, request_id: str, outputs: _O,
-                 prompt_token_ids: list[int], finished: bool):
+    def __init__(
+        self,
+        request_id: str,
+        outputs: _O,
+        prompt_token_ids: list[int],
+        num_cached_tokens: int,
+        finished: bool,
+    ):
         self.request_id = request_id
         self.prompt_token_ids = prompt_token_ids
+        self.num_cached_tokens = num_cached_tokens
         self.finished = finished
         self.outputs = outputs
 
     def __repr__(self):
-        return (f"{type(self).__name__}(request_id={self.request_id!r}, "
-                f"outputs={self.outputs!r}, "
-                f"prompt_token_ids={self.prompt_token_ids}, "
-                f"finished={self.finished})")
+        return (
+            f"{type(self).__name__}(request_id={self.request_id!r}, "
+            f"outputs={self.outputs!r}, "
+            f"prompt_token_ids={self.prompt_token_ids}, "
+            f"num_cached_tokens={self.num_cached_tokens}, "
+            f"finished={self.finished})"
+        )
 
 
 @dataclass
@@ -219,6 +237,7 @@ class EmbeddingOutput:
         embedding: The embedding vector, which is a list of floats.
             Its length depends on the hidden dimension of the model.
     """
+
     embedding: list[float]
 
     @staticmethod
@@ -238,13 +257,13 @@ class EmbeddingOutput:
 
 
 class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]):
-
     @staticmethod
     def from_base(request_output: PoolingRequestOutput):
         return EmbeddingRequestOutput(
             request_id=request_output.request_id,
             outputs=EmbeddingOutput.from_base(request_output.outputs),
             prompt_token_ids=request_output.prompt_token_ids,
+            num_cached_tokens=request_output.num_cached_tokens,
             finished=request_output.finished,
         )
 
@@ -257,6 +276,7 @@ class ClassificationOutput:
         probs: The probability vector, which is a list of floats.
             Its length depends on the number of classes.
     """
+
     probs: list[float]
 
     @staticmethod
@@ -277,13 +297,13 @@ class ClassificationOutput:
 
 
 class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]):
-
     @staticmethod
     def from_base(request_output: PoolingRequestOutput):
         return ClassificationRequestOutput(
             request_id=request_output.request_id,
             outputs=ClassificationOutput.from_base(request_output.outputs),
             prompt_token_ids=request_output.prompt_token_ids,
+            num_cached_tokens=request_output.num_cached_tokens,
             finished=request_output.finished,
         )
 
@@ -295,6 +315,7 @@ class ScoringOutput:
     Args:
         score: The similarity score, which is a scalar value.
     """
+
     score: float
 
     @staticmethod
@@ -313,12 +334,12 @@ class ScoringOutput:
 
 
 class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):
-
     @staticmethod
     def from_base(request_output: PoolingRequestOutput):
         return ScoringRequestOutput(
             request_id=request_output.request_id,
             outputs=ScoringOutput.from_base(request_output.outputs),
             prompt_token_ids=request_output.prompt_token_ids,
+            num_cached_tokens=request_output.num_cached_tokens,
             finished=request_output.finished,
         )
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 9b64817da648c67fc2e55d8a1fa553b3f60d9b4d..a45ca988200d2c399990d7573f760e27497a79c9 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -3,13 +3,13 @@
 import logging
 import traceback
 from itertools import chain
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm import envs
-from vllm.plugins import load_plugins_by_group
-from vllm.utils import resolve_obj_by_qualname, supports_xccl
+from vllm.plugins import PLATFORM_PLUGINS_GROUP, load_plugins_by_group
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.torch_utils import supports_xccl
 
-from .interface import _Backend  # noqa: F401
 from .interface import CpuArchEnum, Platform, PlatformEnum
 
 logger = logging.getLogger(__name__)
@@ -20,23 +20,25 @@ def vllm_version_matches_substr(substr: str) -> bool:
     Check to see if the vLLM version matches a substring.
     """
     from importlib.metadata import PackageNotFoundError, version
+
     try:
         vllm_version = version("vllm")
     except PackageNotFoundError as e:
         logger.warning(
             "The vLLM package was not found, so its version could not be "
-            "inspected. This may cause platform detection to fail.")
+            "inspected. This may cause platform detection to fail."
+        )
         raise e
     return substr in vllm_version
 
 
-def tpu_platform_plugin() -> Optional[str]:
+def tpu_platform_plugin() -> str | None:
     logger.debug("Checking if TPU platform is available.")
 
     # Check for Pathways TPU proxy
     if envs.VLLM_TPU_USING_PATHWAYS:
         logger.debug("Confirmed TPU platform is available via Pathways proxy.")
-        return "tpu_commons.platforms.tpu_jax.TpuPlatform"
+        return "tpu_inference.platforms.tpu_platform.TpuPlatform"
 
     # Check for libtpu installation
     try:
@@ -46,6 +48,7 @@ def tpu_platform_plugin() -> Optional[str]:
         # has TPUs.
 
         import libtpu  # noqa: F401
+
         logger.debug("Confirmed TPU platform is available.")
         return "vllm.platforms.tpu.TpuPlatform"
     except Exception as e:
@@ -53,11 +56,12 @@ def tpu_platform_plugin() -> Optional[str]:
         return None
 
 
-def cuda_platform_plugin() -> Optional[str]:
+def cuda_platform_plugin() -> str | None:
     is_cuda = False
     logger.debug("Checking if CUDA platform is available.")
     try:
-        from vllm.utils import import_pynvml
+        from vllm.utils.import_utils import import_pynvml
+
         pynvml = import_pynvml()
         pynvml.nvmlInit()
         try:
@@ -66,21 +70,22 @@ def cuda_platform_plugin() -> Optional[str]:
             # we need to check if vllm is built with cpu too.
             # Otherwise, vllm will always activate cuda plugin
             # on a GPU machine, even if in a cpu build.
-            is_cuda = (pynvml.nvmlDeviceGetCount() > 0
-                       and not vllm_version_matches_substr("cpu"))
+            is_cuda = (
+                pynvml.nvmlDeviceGetCount() > 0
+                and not vllm_version_matches_substr("cpu")
+            )
             if pynvml.nvmlDeviceGetCount() <= 0:
-                logger.debug(
-                    "CUDA platform is not available because no GPU is found.")
+                logger.debug("CUDA platform is not available because no GPU is found.")
             if vllm_version_matches_substr("cpu"):
-                logger.debug("CUDA platform is not available because"
-                             " vLLM is built with CPU.")
+                logger.debug(
+                    "CUDA platform is not available because vLLM is built with CPU."
+                )
             if is_cuda:
                 logger.debug("Confirmed CUDA platform is available.")
         finally:
             pynvml.nvmlShutdown()
     except Exception as e:
-        logger.debug("Exception happens when checking CUDA platform: %s",
-                     str(e))
+        logger.debug("Exception happens when checking CUDA platform: %s", str(e))
         if "nvml" not in e.__class__.__name__.lower():
             # If the error is not related to NVML, re-raise it.
             raise e
@@ -89,8 +94,9 @@ def cuda_platform_plugin() -> Optional[str]:
         import os
 
         def cuda_is_jetson() -> bool:
-            return os.path.isfile("/etc/nv_tegra_release") \
-                or os.path.exists("/sys/class/tegra-firmware")
+            return os.path.isfile("/etc/nv_tegra_release") or os.path.exists(
+                "/sys/class/tegra-firmware"
+            )
 
         if cuda_is_jetson():
             logger.debug("Confirmed CUDA platform is available on Jetson.")
@@ -101,19 +107,19 @@ def cuda_platform_plugin() -> Optional[str]:
     return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
 
 
-def rocm_platform_plugin() -> Optional[str]:
+def rocm_platform_plugin() -> str | None:
     is_rocm = False
     logger.debug("Checking if ROCm platform is available.")
     try:
         import amdsmi
+
         amdsmi.amdsmi_init()
         try:
             if len(amdsmi.amdsmi_get_processor_handles()) > 0:
                 is_rocm = True
                 logger.debug("Confirmed ROCm platform is available.")
             else:
-                logger.debug("ROCm platform is not available because"
-                             " no GPU is found.")
+                logger.debug("ROCm platform is not available because no GPU is found.")
         finally:
             amdsmi.amdsmi_shut_down()
     except Exception as e:
@@ -122,25 +128,26 @@ def rocm_platform_plugin() -> Optional[str]:
     return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
 
 
-def xpu_platform_plugin() -> Optional[str]:
+def xpu_platform_plugin() -> str | None:
     is_xpu = False
     logger.debug("Checking if XPU platform is available.")
     try:
         # installed IPEX if the machine has XPUs.
         import intel_extension_for_pytorch  # noqa: F401
         import torch
+
         if supports_xccl():
             dist_backend = "xccl"
         else:
             dist_backend = "ccl"
             import oneccl_bindings_for_pytorch  # noqa: F401
 
-        if hasattr(torch, 'xpu') and torch.xpu.is_available():
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
             is_xpu = True
             from vllm.platforms.xpu import XPUPlatform
+
             XPUPlatform.dist_backend = dist_backend
-            logger.debug("Confirmed %s backend is available.",
-                         XPUPlatform.dist_backend)
+            logger.debug("Confirmed %s backend is available.", XPUPlatform.dist_backend)
             logger.debug("Confirmed XPU platform is available.")
     except Exception as e:
         logger.debug("XPU platform is not available because: %s", str(e))
@@ -148,20 +155,23 @@ def xpu_platform_plugin() -> Optional[str]:
     return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
 
 
-def cpu_platform_plugin() -> Optional[str]:
+def cpu_platform_plugin() -> str | None:
     is_cpu = False
     logger.debug("Checking if CPU platform is available.")
     try:
         is_cpu = vllm_version_matches_substr("cpu")
         if is_cpu:
-            logger.debug("Confirmed CPU platform is available because"
-                         " vLLM is built with CPU.")
+            logger.debug(
+                "Confirmed CPU platform is available because vLLM is built with CPU."
+            )
         if not is_cpu:
             import sys
+
             is_cpu = sys.platform.startswith("darwin")
             if is_cpu:
-                logger.debug("Confirmed CPU platform is available"
-                             " because the machine is MacOS.")
+                logger.debug(
+                    "Confirmed CPU platform is available because the machine is MacOS."
+                )
 
     except Exception as e:
         logger.debug("CPU platform is not available because: %s", str(e))
@@ -170,21 +180,20 @@ def cpu_platform_plugin() -> Optional[str]:
 
 
 builtin_platform_plugins = {
-    'tpu': tpu_platform_plugin,
-    'cuda': cuda_platform_plugin,
-    'rocm': rocm_platform_plugin,
-    'xpu': xpu_platform_plugin,
-    'cpu': cpu_platform_plugin,
+    "tpu": tpu_platform_plugin,
+    "cuda": cuda_platform_plugin,
+    "rocm": rocm_platform_plugin,
+    "xpu": xpu_platform_plugin,
+    "cpu": cpu_platform_plugin,
 }
 
 
 def resolve_current_platform_cls_qualname() -> str:
-    platform_plugins = load_plugins_by_group('vllm.platform_plugins')
+    platform_plugins = load_plugins_by_group(PLATFORM_PLUGINS_GROUP)
 
     activated_plugins = []
 
-    for name, func in chain(builtin_platform_plugins.items(),
-                            platform_plugins.items()):
+    for name, func in chain(builtin_platform_plugins.items(), platform_plugins.items()):
         try:
             assert callable(func)
             platform_cls_qualname = func()
@@ -194,43 +203,43 @@ def resolve_current_platform_cls_qualname() -> str:
             pass
 
     activated_builtin_plugins = list(
-        set(activated_plugins) & set(builtin_platform_plugins.keys()))
-    activated_oot_plugins = list(
-        set(activated_plugins) & set(platform_plugins.keys()))
+        set(activated_plugins) & set(builtin_platform_plugins.keys())
+    )
+    activated_oot_plugins = list(set(activated_plugins) & set(platform_plugins.keys()))
 
     if len(activated_oot_plugins) >= 2:
         raise RuntimeError(
             "Only one platform plugin can be activated, but got: "
-            f"{activated_oot_plugins}")
+            f"{activated_oot_plugins}"
+        )
     elif len(activated_oot_plugins) == 1:
         platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
-        logger.info("Platform plugin %s is activated",
-                    activated_oot_plugins[0])
+        logger.info("Platform plugin %s is activated", activated_oot_plugins[0])
     elif len(activated_builtin_plugins) >= 2:
         raise RuntimeError(
             "Only one platform plugin can be activated, but got: "
-            f"{activated_builtin_plugins}")
+            f"{activated_builtin_plugins}"
+        )
     elif len(activated_builtin_plugins) == 1:
-        platform_cls_qualname = builtin_platform_plugins[
-            activated_builtin_plugins[0]]()
-        logger.info("Automatically detected platform %s.",
-                    activated_builtin_plugins[0])
+        platform_cls_qualname = builtin_platform_plugins[activated_builtin_plugins[0]]()
+        logger.debug(
+            "Automatically detected platform %s.", activated_builtin_plugins[0]
+        )
     else:
         platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
-        logger.info(
-            "No platform detected, vLLM is running on UnspecifiedPlatform")
+        logger.debug("No platform detected, vLLM is running on UnspecifiedPlatform")
     return platform_cls_qualname
 
 
 _current_platform = None
-_init_trace: str = ''
+_init_trace: str = ""
 
 if TYPE_CHECKING:
     current_platform: Platform
 
 
 def __getattr__(name: str):
-    if name == 'current_platform':
+    if name == "current_platform":
         # lazy init current_platform.
         # 1. out-of-tree platform plugins need `from vllm.platforms import
         #    Platform` so that they can inherit `Platform` class. Therefore,
@@ -245,19 +254,24 @@ def __getattr__(name: str):
         global _current_platform
         if _current_platform is None:
             platform_cls_qualname = resolve_current_platform_cls_qualname()
-            _current_platform = resolve_obj_by_qualname(
-                platform_cls_qualname)()
+            _current_platform = resolve_obj_by_qualname(platform_cls_qualname)()
             global _init_trace
             _init_trace = "".join(traceback.format_stack())
         return _current_platform
     elif name in globals():
         return globals()[name]
     else:
-        raise AttributeError(
-            f"No attribute named '{name}' exists in {__name__}.")
+        raise AttributeError(f"No attribute named '{name}' exists in {__name__}.")
+
+
+def __setattr__(name: str, value):
+    if name == "current_platform":
+        global _current_platform
+        _current_platform = value
+    elif name in globals():
+        globals()[name] = value
+    else:
+        raise AttributeError(f"No attribute named '{name}' exists in {__name__}.")
 
 
-__all__ = [
-    'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
-    "_init_trace"
-]
+__all__ = ["Platform", "PlatformEnum", "current_platform", "CpuArchEnum", "_init_trace"]
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index e858c33048291657afb884143c1389d91fa611e3..ed655912d39643bde4bd38ba902a520236898890 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,34 +1,37 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import glob
 import json
 import os
 import platform
 import subprocess
 import sys
 from dataclasses import dataclass
-from importlib.util import find_spec
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
+import regex as re
 import torch
 
+from vllm import envs
 from vllm.logger import init_logger
-from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
-from .interface import CpuArchEnum, Platform, PlatformEnum, _Backend
+from .interface import CpuArchEnum, Platform, PlatformEnum
 
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
+    from vllm.attention.backends.registry import AttentionBackendEnum
     from vllm.config import VllmConfig
 else:
+    AttentionBackendEnum = None
     VllmConfig = None
 
 
 def get_max_threads(pid=0):
-    if hasattr(os, 'sched_getaffinity'):
+    if hasattr(os, "sched_getaffinity"):
         return len(os.sched_getaffinity(pid))
-    elif platform.system() == 'Darwin':
+    elif platform.system() == "Darwin":
         return os.cpu_count()
     else:
         raise NotImplementedError("Unsupported OS")
@@ -58,7 +61,8 @@ class LogicalCPUInfo:
             return LogicalCPUInfo(
                 id=LogicalCPUInfo._int(id),
                 physical_core=LogicalCPUInfo._int(physical_core),
-                numa_node=LogicalCPUInfo._int(numa_node))
+                numa_node=LogicalCPUInfo._int(numa_node),
+            )
         else:
             return obj_dict
 
@@ -75,13 +79,42 @@ class CpuPlatform(Platform):
     def supported_dtypes(self) -> list[torch.dtype]:
         if self.get_cpu_architecture() == CpuArchEnum.POWERPC:
             return [torch.bfloat16, torch.float32]
-        elif (self.get_cpu_architecture() == CpuArchEnum.ARM
-              and sys.platform.startswith("darwin")):
-            if (subprocess.check_output(
-                ["sysctl -n hw.optional.arm.FEAT_BF16"],
-                    shell=True).strip() == b"1"):
+        elif self.get_cpu_architecture() == CpuArchEnum.ARM and sys.platform.startswith(
+            "darwin"
+        ):
+            if (
+                subprocess.check_output(
+                    ["sysctl -n hw.optional.arm.FEAT_BF16"], shell=True
+                ).strip()
+                == b"1"
+            ):
                 return [torch.bfloat16, torch.float16, torch.float32]
             return [torch.float16, torch.float32]
+        elif self.get_cpu_architecture() == CpuArchEnum.RISCV:
+            # Workaround for Issue #25655: RISC-V scheduler bug with float16
+            #
+            # Background:
+            # - RISC-V currently uses scalar code path
+            # - There is a latent bug in the vLLM scheduler that provides
+            # invalid
+            #   physical_block_idx values under certain conditions
+            # - This bug causes segmentation faults when using float16
+            # dtype on RISC-V
+            # - Testing shows that forcing float32 successfully bypasses
+            # this issue
+            #
+            # Technical details:
+            # - The bug manifests as out-of-bounds physical_block_idx in
+            # block_tables
+            # - Only occurs on RISC-V hardware
+            # tested on Sophgo SG2044
+            # - Does not reproduce on x86 or other architectures
+            # - Root cause is in Python-level scheduling logic,
+            # not C++ kernels
+            #
+            # This is a temporary workaround until the scheduler bug is fixed.
+            # See: https://github.com/vllm-project/vllm/issues/25655
+            return [torch.float32]
         # x86/aarch64 CPU has supported both bf16 and fp16 natively.
         return [torch.bfloat16, torch.float16, torch.float32]
 
@@ -90,34 +123,39 @@ class CpuPlatform(Platform):
         return "cpu"
 
     @classmethod
-    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
-                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool, use_mla: bool,
-                             has_sink: bool, use_sparse: bool) -> str:
-        if selected_backend and selected_backend != _Backend.TORCH_SDPA:
+    def get_attn_backend_cls(
+        cls,
+        selected_backend: "AttentionBackendEnum",
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: str | None,
+        block_size: int,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        attn_type: str | None = None,
+    ) -> str:
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
+        if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
             logger.info("Cannot use %s backend on CPU.", selected_backend)
         if use_mla:
             raise NotImplementedError("MLA is not supported on CPU.")
         if use_sparse:
-            raise NotImplementedError(
-                "Sparse Attention is not supported on CPU.")
-        logger.info("Using Torch SDPA backend.")
-        if use_v1:
-            return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
-        else:
-            return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+            raise NotImplementedError("Sparse Attention is not supported on CPU.")
+        return AttentionBackendEnum.CPU_ATTN.get_path()
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        import vllm.envs as envs
-        from vllm.utils import GiB_bytes
+        from vllm.utils.mem_constants import GiB_bytes
 
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
         if kv_cache_space is None:
             kv_cache_space = 4 * GiB_bytes  # type: ignore
             logger.warning_once(
                 "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
-                "for CPU backend is not set, using 4 by default.")
+                "for CPU backend is not set, using 4 by default."
+            )
         else:
             kv_cache_space *= GiB_bytes
 
@@ -143,62 +181,61 @@ class CpuPlatform(Platform):
 
         cache_config = vllm_config.cache_config
 
-        ipex_available = find_spec("intel_extension_for_pytorch") is not None
+        if cache_config.block_size is None:
+            cache_config.block_size = 128
 
-        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 128 if ipex_available else 16
+        if cache_config.block_size % 32 != 0:
+            logger.warning(
+                "CPU backend prefers block_size is multiples of 32, "
+                "otherwise the performance is not optimized."
+            )
 
-        if not ipex_available and cache_config.block_size != 16:
+        scheduler_config = vllm_config.scheduler_config
+        if (
+            scheduler_config.enable_chunked_prefill
+            or cache_config.enable_prefix_caching
+        ) and cache_config.cache_dtype != "auto":
             raise RuntimeError(
-                f"--block-size={cache_config.block_size} requires"
-                " intel_extension_for_pytorch")
+                "Chunked-prefill and prefix-cache on the CPU "
+                "backend is not compatible with FP8 KV cache."
+            )
 
-        scheduler_config = vllm_config.scheduler_config
-        if ((scheduler_config.chunked_prefill_enabled
-             or cache_config.enable_prefix_caching)
-                and cache_config.cache_dtype != "auto"):
-            raise RuntimeError("Chunked-prefill and prefix-cache on the CPU "
-                               "backend is not compatible with FP8 KV cache.")
-
-        if cache_config.cache_dtype == "fp8_e4m3":
-            cache_config.cache_dtype = "fp8_e5m2"
+        if cache_config.cache_dtype != "auto":
             logger.warning(
-                "CPU backend doesn't support fp8_e4m3 KV cache type, "
-                "cast to fp8_e5m2.")
+                "CPU backend doesn't support KV cache quantization fallback to auto."
+            )
+            cache_config.cache_dtype = "auto"
 
-        if (cache_config.cache_dtype != "auto" and model_config is not None
-                and model_config.dtype == torch.half):
-            logger.warning("FP8 KV cache on the CPU backend only does not"
-                           " support fp16 for now, cast to bf16.")
-            model_config.dtype = torch.bfloat16
-
-        cache_config.cpu_kvcache_space_bytes = \
-            CpuPlatform.get_device_total_memory()
+        cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory()
 
         parallel_config = vllm_config.parallel_config
-        if (parallel_config.world_size > 1
-                and parallel_config.distributed_executor_backend is not None
-                and parallel_config.distributed_executor_backend != "mp"):
-            logger.warning(("%s is not supported on CPU, fallback to mp "
-                            "distributed executor backend."),
-                           parallel_config.distributed_executor_backend)
+        if (
+            parallel_config.world_size > 1
+            and parallel_config.distributed_executor_backend is not None
+            and parallel_config.distributed_executor_backend != "mp"
+        ):
+            logger.warning(
+                (
+                    "%s is not supported on CPU, fallback to mp "
+                    "distributed executor backend."
+                ),
+                parallel_config.distributed_executor_backend,
+            )
             parallel_config.distributed_executor_backend = "mp"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
         # Disable DBO
         if parallel_config.enable_dbo:
-            logger.warning(
-                "Dual-Batch Overlap is not supported on CPU, disabled.")
+            logger.warning("Dual-Batch Overlap is not supported on CPU, disabled.")
             parallel_config.enable_dbo = False
 
         # Note: workaround for v1 gpu_model_runner
-        from vllm.config import CompilationLevel
+        from vllm.config import CompilationMode
+
         vllm_config.compilation_config.cudagraph_capture_sizes = []
 
         compilation_config = vllm_config.compilation_config
-        if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
-                == CompilationLevel.PIECEWISE):
-
+        if vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
             # Note: vLLM V1 is using PIECEWISE level compilation, which will
             # take time to compile kernels just-in-time with the inductor
             # backend. For CPU CI tests, most of them are executed fast and
@@ -211,23 +248,19 @@ class CpuPlatform(Platform):
             else:
                 backend = "inductor"
 
-            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
             compilation_config.backend = backend
-            compilation_config.inductor_compile_config.update({
-                "dce":
-                True,
-                "size_asserts":
-                False,
-                "nan_asserts":
-                False,
-                "epilogue_fusion":
-                True,
-            })
-            if compilation_config.use_inductor:
-                compilation_config.custom_ops = ["none"]
+            compilation_config.inductor_compile_config.update(
+                {
+                    "dce": True,
+                    "size_asserts": False,
+                    "nan_asserts": False,
+                    "epilogue_fusion": True,
+                }
+            )
 
         if vllm_config.lora_config is not None:
-            compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.mode = CompilationMode.NONE
 
         assert vllm_config.device_config.device_type == "cpu"
 
@@ -238,65 +271,104 @@ class CpuPlatform(Platform):
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
         # Note: to avoid the error 'nthreads cannot be larger than environment
-        #  variable "NUMEXPR_MAX_THREADS" (64)'.
+        # variable "NUMEXPR_MAX_THREADS" (64)'.
         os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
 
-        # Set default threads num for OpenMP parallel
-        os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
+        if envs.VLLM_CPU_OMP_THREADS_BIND != "nobind":
+            # Set default threads num for OpenMP parallel
+            os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
+        else:
+            # In this case, setting the OpenMP configuration via
+            # OMP_NUM_THREADS is up to the user.
+            logger.info("Disabling binding processes to CPU cores...")
 
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # Disable multi-stream for shared experts as no Stream on CPU
+        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
+
         # Intel OpenMP setting
-        ld_prealod_str = os.getenv("LD_PRELOAD", "")
-        if "libiomp5.so" in ld_prealod_str:
+        ld_preload_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_preload_str:
             # The time(milliseconds) that a thread should wait after
             # completing the execution of a parallel region, before sleeping.
-            os.environ['KMP_BLOCKTIME'] = "1"
+            os.environ["KMP_BLOCKTIME"] = "1"
             # Prevents the CPU to run into low performance state
-            os.environ['KMP_TPAUSE'] = "0"
+            os.environ["KMP_TPAUSE"] = "0"
             # Provides fine granularity parallelism
-            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
-            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
-            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+            os.environ["KMP_FORKJOIN_BARRIER_PATTERN"] = "dist,dist"
+            os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
+            os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
+
+        if (
+            platform.system() == "Linux"
+            and Platform.get_cpu_architecture()
+            in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
+            and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
+        ):
+            # We need to LD_PRELOAD PyTorch's libgomp, otherwise only
+            # one core will be properly utilized when we thread-bind
+            # See: https://github.com/vllm-project/vllm/issues/27369
+            # TODO: Remove once:
+            # https://github.com/pytorch/pytorch/issues/166087 is fixed
+
+            # We need to find the location of PyTorch's libgomp
+            torch_pkg = os.path.dirname(torch.__file__)
+            site_root = os.path.dirname(torch_pkg)
+            torch_libs = os.path.join(site_root, "torch.libs")
+            pytorch_libgomp_so_candidates = glob.glob(
+                os.path.join(torch_libs, "libgomp-*.so*")
+            )
+            if pytorch_libgomp_so_candidates:
+                pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
+                if ld_preload_str:
+                    ld_preload_str += ":"
+                ld_preload_str += pytorch_libgomp_so
+                os.environ["LD_PRELOAD"] = ld_preload_str
 
         # To hint IPEX uses shared memory based AllReduce
         os.environ["LOCAL_WORLD_SIZE"] = str(
-            vllm_config.parallel_config.tensor_parallel_size)
+            vllm_config.parallel_config.tensor_parallel_size
+        )
 
         if model_config is not None and model_config.use_mla:
             logger.info(
                 "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled.")
+                "prefill and prefix caching to be disabled."
+            )
             vllm_config.scheduler_config.enable_chunked_prefill = False
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
             vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                vllm_config.model_config.max_model_len,
+                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
+            )
 
     @classmethod
-    def get_allowed_cpu_core_node_list(
-            cls) -> tuple[list[int], list[LogicalCPUInfo]]:
+    def get_allowed_cpu_core_node_list(cls) -> tuple[list[int], list[LogicalCPUInfo]]:
         assert platform.system() == "Linux"
 
         # Init LogicalCPUInfo from lscpu
-        lscpu_output = subprocess.check_output("lscpu -J -e=CPU,CORE,NODE",
-                                               shell=True,
-                                               text=True)
+        lscpu_output = subprocess.check_output(
+            "lscpu -J -e=CPU,CORE,NODE", shell=True, text=True
+        )
+        lscpu_output = re.sub(r'"node":\s*-\s*(,|\n)', r'"node": 0\1', lscpu_output)
         logical_cpu_list: list[LogicalCPUInfo] = json.loads(
-            lscpu_output, object_hook=LogicalCPUInfo.json_decoder)['cpus']
+            lscpu_output, object_hook=LogicalCPUInfo.json_decoder
+        )["cpus"]
 
         # Filter CPUs with invalid attributes
         logical_cpu_list = [
-            x for x in logical_cpu_list
+            x
+            for x in logical_cpu_list
             if -1 not in (x.id, x.physical_core, x.numa_node)
         ]
 
         # Filter allowed CPUs
-        allowed_cpu_id_list = os.sched_getaffinity(0)
-        logical_cpu_list = [
-            x for x in logical_cpu_list if x.id in allowed_cpu_id_list
-        ]
+        if hasattr(os, "sched_getaffinity"):
+            allowed_cpu_id_list = os.sched_getaffinity(0)
+        else:
+            raise NotImplementedError("Unsupported OS")
+        logical_cpu_list = [x for x in logical_cpu_list if x.id in allowed_cpu_id_list]
 
         # Get allowed NUMA nodes
         allowed_numa_nodes = set()
@@ -305,8 +377,8 @@ class CpuPlatform(Platform):
         allowed_numa_nodes_list = sorted(allowed_numa_nodes)
 
         env_key = CpuPlatform.device_control_env_var
-        if (env_key in os.environ and os.environ[env_key] != ""):
-            visible_nodes = [int(s) for s in os.environ[env_key].split(',')]
+        if env_key in os.environ and os.environ[env_key] != "":
+            visible_nodes = [int(s) for s in os.environ[env_key].split(",")]
             allowed_numa_nodes_list = [
                 x for x in visible_nodes if x in allowed_cpu_id_list
             ]
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 83cf2b8b273a1d5600173d181f2e895fe02cce93..2e4dd8bb808b42adec8c64db69ae45f2b8f7276c 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -5,25 +5,30 @@ pynvml. However, it should not initialize cuda context.
 """
 
 import os
-from datetime import timedelta
+from collections.abc import Callable
 from functools import cache, wraps
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, TypeVar
 
 import torch
-from torch.distributed import PrefixStore, ProcessGroup
-from torch.distributed.distributed_c10d import is_nccl_available
 from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
 import vllm._C  # noqa
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless, import_pynvml
+from vllm.utils.import_utils import import_pynvml
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
-from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from .interface import DeviceCapability, Platform, PlatformEnum
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.config import VllmConfig
+    from vllm.config.cache import CacheDType
+else:
+    AttentionBackendEnum = None
+    VllmConfig = None
+    CacheDType = None
 
 logger = init_logger(__name__)
 
@@ -37,8 +42,50 @@ pynvml = import_pynvml()
 torch.backends.cuda.enable_cudnn_sdp(False)
 
 
-def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+@cache
+def _get_backend_priorities(
+    use_mla: bool,
+    device_capability: DeviceCapability,
+) -> list[AttentionBackendEnum]:
+    """Get backend priorities with lazy import to avoid circular dependency."""
+    from vllm.attention.backends.registry import AttentionBackendEnum
+
+    if use_mla:
+        if device_capability.major == 10:
+            return [
+                AttentionBackendEnum.CUTLASS_MLA,
+                AttentionBackendEnum.FLASHINFER_MLA,
+                AttentionBackendEnum.FLASH_ATTN_MLA,
+                AttentionBackendEnum.FLASHMLA,
+                AttentionBackendEnum.TRITON_MLA,
+                AttentionBackendEnum.FLASHMLA_SPARSE,
+            ]
+        else:
+            return [
+                AttentionBackendEnum.FLASH_ATTN_MLA,
+                AttentionBackendEnum.FLASHMLA,
+                AttentionBackendEnum.FLASHINFER_MLA,
+                AttentionBackendEnum.TRITON_MLA,
+                AttentionBackendEnum.FLASHMLA_SPARSE,
+            ]
+    else:
+        if device_capability.major == 10:
+            return [
+                AttentionBackendEnum.FLASHINFER,
+                AttentionBackendEnum.FLASH_ATTN,
+                AttentionBackendEnum.TRITON_ATTN,
+                AttentionBackendEnum.FLEX_ATTENTION,
+            ]
+        else:
+            return [
+                AttentionBackendEnum.FLASH_ATTN,
+                AttentionBackendEnum.FLASHINFER,
+                AttentionBackendEnum.TRITON_ATTN,
+                AttentionBackendEnum.FLEX_ATTENTION,
+            ]
 
+
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
     @wraps(fn)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
         pynvml.nvmlInit()
@@ -83,9 +130,7 @@ class CudaPlatformBase(Platform):
         _ = torch.zeros(1, device=device)
 
     @classmethod
-    def get_device_capability(cls,
-                              device_id: int = 0
-                              ) -> Optional[DeviceCapability]:
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         raise NotImplementedError
 
     @classmethod
@@ -110,17 +155,7 @@ class CudaPlatformBase(Platform):
         model_config = vllm_config.model_config
 
         if parallel_config.worker_cls == "auto":
-            if vllm_config.speculative_config:
-                if not envs.VLLM_USE_V1:
-                    raise NotImplementedError(
-                        "Speculative decoding is not supported on vLLM V0.")
-                parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-            else:
-                if envs.VLLM_USE_V1:
-                    parallel_config.worker_cls = \
-                        "vllm.v1.worker.gpu_worker.Worker"
-                else:
-                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
 
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
@@ -128,9 +163,16 @@ class CudaPlatformBase(Platform):
 
         # TODO(lucas): handle this more gracefully
         # Note: model_config may be None during testing
-        if model_config is not None and model_config.use_mla:
-            use_sparse = hasattr(vllm_config.model_config.hf_config,
-                                 "index_topk")
+        # Note: block_size is initialized in
+        # HybridAttentionMambaModelConfig.verify_and_update_config
+        # for models with both attention and mamba,
+        # and doesn't need to be reinitialized here
+        if (
+            model_config is not None
+            and model_config.use_mla
+            and cache_config.block_size is not None
+        ):
+            use_sparse = hasattr(vllm_config.model_config.hf_config, "index_topk")
             # If `VLLM_ATTENTION_BACKEND` is not set and we are using MLA,
             # then we default to FlashMLA backend for non-blackwell GPUs,
             # else we default to CutlassMLA. For each case, we force the
@@ -153,43 +195,51 @@ class CudaPlatformBase(Platform):
                     use_flashmla = True
             else:
                 # Forced case
-                use_flashmla = (envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
-                use_cutlass_mla = (
-                    envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA")
-                use_flashinfer_mla = (
-                    envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA")
-
-            from vllm.attention.ops.flashmla import is_flashmla_supported
-            if use_flashmla and is_flashmla_supported()[0] \
-                and cache_config.block_size != 64:
+                use_flashmla = envs.VLLM_ATTENTION_BACKEND == "FLASHMLA"
+                use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA"
+                use_flashinfer_mla = envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA"
+
+            from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+
+            if (
+                use_flashmla
+                and is_flashmla_dense_supported()[0]
+                and cache_config.block_size % 64 != 0
+            ):
                 cache_config.block_size = 64
-                logger.info(
-                    "Forcing kv cache block size to 64 for FlashMLA backend.")
+                logger.info("Forcing kv cache block size to 64 for FlashMLA backend.")
 
-            if use_cutlass_mla and cache_config.block_size != 128:
+            if use_cutlass_mla and cache_config.block_size % 128 != 0:
                 cache_config.block_size = 128
-                logger.info("Forcing kv cache block size to 128 for "
-                            "CUTLASS_MLA backend.")
+                logger.info(
+                    "Forcing kv cache block size to 128 for CUTLASS_MLA backend."
+                )
 
-            if use_flashinfer_mla and cache_config.block_size not in [32, 64]:
+            if (
+                use_flashinfer_mla
+                and cache_config.block_size != 32
+                and cache_config.block_size % 64 != 0
+            ):
                 cache_config.block_size = 64
                 logger.info(
-                    "Forcing kv cache block size to 64 for FlashInferMLA "
-                    "backend.")
+                    "Forcing kv cache block size to 64 for FlashInferMLA backend."
+                )
 
             # TODO(Chen): remove this hacky code
             if use_sparse and cache_config.block_size != 64:
                 cache_config.block_size = 64
                 logger.info(
-                    "Forcing kv cache block size to 64 for FlashMLASparse "
-                    "backend.")
+                    "Forcing kv cache block size to 64 for FlashMLASparse backend."
+                )
         # lazy import to avoid circular import
         from vllm.config import CUDAGraphMode
 
         compilation_config = vllm_config.compilation_config
-        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
-                and parallel_config.data_parallel_size > 1
-                and compilation_config.cudagraph_mode != CUDAGraphMode.NONE):
+        if (
+            parallel_config.all2all_backend == "deepep_high_throughput"
+            and parallel_config.data_parallel_size > 1
+            and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
             # TODO: Piecewise Cuda graph might be enabled
             # if torch compile cache key issue fixed
             # See https://github.com/vllm-project/vllm/pull/25093
@@ -198,198 +248,188 @@ class CudaPlatformBase(Platform):
                 "kernels are optimized for prefill and are incompatible with "
                 "CUDA Graphs. "
                 "In order to use CUDA Graphs for decode-optimized workloads, "
-                "set VLLM_ALL2ALL_BACKEND to another option, such as "
-                "deepep_low_latency, pplx, or allgather_reducescatter.")
+                "use --all2all-backend with another option, such as "
+                "deepep_low_latency, pplx, or allgather_reducescatter."
+            )
             compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
     @classmethod
-    def get_current_memory_usage(cls,
-                                 device: Optional[torch.types.Device] = None
-                                 ) -> float:
+    def get_current_memory_usage(
+        cls, device: torch.types.Device | None = None
+    ) -> float:
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
         return torch.cuda.max_memory_allocated(device)
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int,
-                             dtype: torch.dtype) -> _Backend:
+    def get_vit_attn_backend(
+        cls, head_size: int, dtype: torch.dtype
+    ) -> "AttentionBackendEnum":
+        from vllm.attention.backends.registry import AttentionBackendEnum
 
         # For Blackwell GPUs, force TORCH_SDPA for now.
         # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501
         if cls.has_device_capability(100):
-            return _Backend.TORCH_SDPA
+            return AttentionBackendEnum.TORCH_SDPA
 
         if dtype not in (torch.float16, torch.bfloat16):
-            return _Backend.XFORMERS
+            return AttentionBackendEnum.XFORMERS
 
         if cls.has_device_capability(80):
-            FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
-            from vllm.attention.selector import is_attn_backend_supported
-            is_default_fa_supported = is_attn_backend_supported(
-                FLASH_ATTN_V1, head_size, dtype, allow_import_error=False)
-            if is_default_fa_supported:
-                return _Backend.FLASH_ATTN
+            backend_class = AttentionBackendEnum.FLASH_ATTN.get_class()
+            if backend_class.supports_head_size(
+                head_size
+            ) and backend_class.supports_dtype(dtype):
+                return AttentionBackendEnum.FLASH_ATTN
             else:
-                # Fallback to XFORMERS
-                return _Backend.XFORMERS
+                return AttentionBackendEnum.XFORMERS
         else:
             # Fallback for Volta/Turing GPUs or FA not supported
-            return _Backend.XFORMERS
-
-    @classmethod
-    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1, use_mla,
-                             has_sink, use_sparse) -> str:
-        if use_mla:
-            if not use_v1:
-                raise RuntimeError(
-                    "MLA attention backends require the V1 engine. "
-                    "Set VLLM_USE_V1=1 to enable them.")
-
-            from vllm.attention.ops.flashmla import is_flashmla_supported
-            from vllm.attention.utils.fa_utils import flash_attn_supports_mla
-
-            if use_sparse:
-                logger.info_once("Using Sparse MLA backend on V1 engine.")
-                return ("vllm.v1.attention.backends.mla.flashmla_sparse."
-                        "FlashMLASparseBackend")
-
-            use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or (
-                selected_backend is None and cls.is_device_capability(100)
-                and block_size == 128)
-            use_flashinfermla = selected_backend == _Backend.FLASHINFER_MLA or (
-                selected_backend is None and cls.is_device_capability(100)
-                and block_size in [32, 64])
-            use_flashmla = selected_backend == _Backend.FLASHMLA or (
-                selected_backend is None and is_flashmla_supported()[0])
-            use_flashattn = selected_backend == _Backend.FLASH_ATTN_MLA or (
-                selected_backend is None and flash_attn_supports_mla())
-            use_triton = selected_backend == _Backend.TRITON_MLA or (
-                selected_backend is None)
-
-            if use_cutlassmla:
-                logger.info_once("Using Cutlass MLA backend on V1 engine.")
-                return ("vllm.v1.attention.backends.mla."
-                        "cutlass_mla.CutlassMLABackend")
-            if use_flashinfermla:
-                from vllm.v1.attention.backends.utils import (
-                    set_kv_cache_layout)
-                set_kv_cache_layout("HND")
-                logger.info_once("Using FlashInfer MLA backend on V1 engine.")
-                return ("vllm.v1.attention.backends.mla."
-                        "flashinfer_mla.FlashInferMLABackend")
-            if use_flashmla:
-                if block_size != 64:
-                    logger.warning(
-                        "FlashMLA backend is not supported for block size %d"
-                        " (currently only supports block size 64).",
-                        block_size)
-                else:
-                    logger.info_once("Using FlashMLA backend on V1 engine.")
-                    return ("vllm.v1.attention.backends.mla."
-                            "flashmla.FlashMLABackend")
-            if use_flashattn:
-                logger.info_once(
-                    "Using FlashAttention MLA backend on V1 engine.")
-                return ("vllm.v1.attention.backends.mla."
-                        "flashattn_mla.FlashAttnMLABackend")
-            if use_triton:
-                logger.info_once("Using Triton MLA backend on V1 engine.")
-                return ("vllm.v1.attention.backends.mla."
-                        "triton_mla.TritonMLABackend")
-        if use_v1:
-            FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"  # noqa: E501
-            FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
-            TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
-            FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
-            TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"  # noqa: E501
-            XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"  # noqa: E501
-
-            use_fp8_kv_cache = (kv_cache_dtype is not None
-                                and kv_cache_dtype.startswith("fp8"))
-
-            if selected_backend == _Backend.FLASHINFER:
-                logger.info_once("Using FlashInfer backend on V1 engine.")
-                if cls.has_device_capability(100):
-                    from vllm.v1.attention.backends.utils import (
-                        set_kv_cache_layout)
-                    set_kv_cache_layout("HND")
-                return FLASHINFER_V1
-            elif selected_backend == _Backend.FLEX_ATTENTION:
-                logger.info_once("Using FlexAttention backend on V1 engine.")
-                return FLEX_ATTENTION_V1
-            elif selected_backend == _Backend.TRITON_ATTN:
-                logger.info_once("Using Triton backend on V1 engine.")
-                return TRITON_ATTN
-            elif selected_backend == _Backend.FLASH_ATTN:
-                logger.info_once("Using Flash Attention backend on V1 engine.")
-                return FLASH_ATTN_V1
-            elif selected_backend == _Backend.TREE_ATTN:
-                logger.info_once("Using Tree Attention backend on V1 engine.")
-                return TREE_ATTN_V1
-            elif selected_backend == _Backend.XFORMERS:
-                logger.info_once("Using XFormers backend on V1 engine.")
-                return XFORMERS_V1
-
-            from vllm.attention.selector import is_attn_backend_supported
-
-            # Default backends for V1 engine
-            # Prefer FlashInfer for Blackwell GPUs if installed
-            if cls.is_device_capability(100):
-                if is_default_backend_supported := is_attn_backend_supported(
-                        FLASHINFER_V1, head_size, dtype):
-                    from vllm.v1.attention.backends.utils import (
-                        set_kv_cache_layout)
-
-                    logger.info_once(
-                        "Using FlashInfer backend with HND KV cache layout on "
-                        "V1 engine by default for Blackwell (SM 10.0) GPUs.")
-                    set_kv_cache_layout("HND")
-
-                    return FLASHINFER_V1
-
-                if not is_default_backend_supported.can_import:
-                    logger.warning_once(
-                        "FlashInfer failed to import for V1 engine on "
-                        "Blackwell (SM 10.0) GPUs; it is recommended to "
-                        "install FlashInfer for better performance.")
-
-            # FlashAttention is the default for SM 8.0+ GPUs
-            if cls.has_device_capability(80):
-                if (has_sink or
-                        use_fp8_kv_cache) and not cls.is_device_capability(90):
-                    logger.info_once("Using Triton backend on V1 engine.")
-                    return TRITON_ATTN
-                elif is_default_backend_supported := is_attn_backend_supported(
-                        FLASH_ATTN_V1, head_size, dtype,
-                        allow_import_error=False):
-                    logger.info_once("Using Flash Attention backend on "
-                                     "V1 engine.")
-                    return FLASH_ATTN_V1
-
-            # FlexAttention is the default for older GPUs
-            else:
-                logger.info_once("Using FlexAttention backend on V1 engine.")
-                return FLEX_ATTENTION_V1
+            return AttentionBackendEnum.XFORMERS
 
-            assert not is_default_backend_supported
+    @classmethod
+    def get_valid_backends(
+        cls,
+        head_size,
+        dtype,
+        kv_cache_dtype,
+        block_size,
+        use_mla,
+        has_sink,
+        use_sparse,
+        device_capability,
+        attn_type,
+    ) -> tuple[
+        list[tuple["AttentionBackendEnum", int]],
+        dict["AttentionBackendEnum", list[str]],
+    ]:
+        valid_backends_priorities = []
+        invalid_reasons = {}
+
+        backend_priorities = _get_backend_priorities(use_mla, device_capability)
+        for priority, backend in enumerate(backend_priorities):
+            try:
+                backend_class = backend.get_class()
+                invalid_reasons_i = backend_class.validate_configuration(
+                    head_size,
+                    dtype,
+                    kv_cache_dtype,
+                    block_size,
+                    use_mla,
+                    has_sink,
+                    use_sparse,
+                    device_capability,
+                    attn_type,
+                )
+            except ImportError:
+                invalid_reasons_i = ["ImportError"]
+            if invalid_reasons_i:
+                invalid_reasons[backend] = invalid_reasons_i
+            else:
+                valid_backends_priorities.append((backend, priority))
 
-            use_flex_attention_reason = {}
-            if not is_default_backend_supported.head_size:
-                use_flex_attention_reason["head_size"] = head_size
-            if not is_default_backend_supported.dtype:
-                use_flex_attention_reason["dtype"] = dtype
+        return valid_backends_priorities, invalid_reasons
 
-            logger.info_once(
-                "Using FlexAttention backend for %s on V1 engine.",
-                ", ".join(f"{k}={v}"
-                          for k, v in use_flex_attention_reason.items()),
+    @classmethod
+    def get_attn_backend_cls(
+        cls,
+        selected_backend: "AttentionBackendEnum",
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: "CacheDType | None",
+        block_size: int | None,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        attn_type: str | None = None,
+    ) -> str:
+        from vllm.attention import AttentionType
+
+        if attn_type is None:
+            attn_type = AttentionType.DECODER
+
+        device_capability = cls.get_device_capability()
+        assert device_capability is not None
+
+        # First try checking just the selected backend, if there is one.
+        if selected_backend is not None:
+            try:
+                backend_class = selected_backend.get_class()
+                invalid_reasons = backend_class.validate_configuration(
+                    head_size,
+                    dtype,
+                    kv_cache_dtype,
+                    None,
+                    use_mla,
+                    has_sink,
+                    use_sparse,
+                    device_capability,
+                    attn_type,
+                )
+            except ImportError:
+                invalid_reasons = ["ImportError"]
+            if invalid_reasons:
+                raise ValueError(
+                    f"Selected backend {selected_backend} is not valid for "
+                    f"this configuration. Reason: {invalid_reasons}"
+                )
+            else:
+                logger.info("Using %s backend.", selected_backend)
+                return selected_backend.get_path()
+
+        # No selected backend or the selected backend is invalid,
+        # so we try finding a valid backend.
+        valid_backends_priorities, invalid_reasons = cls.get_valid_backends(
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            None,
+            use_mla,
+            has_sink,
+            use_sparse,
+            device_capability,
+            attn_type,
+        )
+        reasons_str = (
+            "{"
+            + ", ".join(
+                f"{backend.name}: [{', '.join(reasons)}]"
+                for backend, reasons in invalid_reasons.items()
             )
-            return FLEX_ATTENTION_V1
+            + "}"
+        )
+        config_str = (
+            f"head_size: {head_size}, dtype: {dtype}, "
+            f"kv_cache_dtype: {kv_cache_dtype}, block_size: {block_size}, "
+            f"use_mla: {use_mla}, has_sink: {has_sink}, use_sparse: {use_sparse}"
+        )
+        logger.debug_once(
+            f"Some attention backends are not valid for {cls.device_name} with "
+            f"{config_str}. Reasons: {reasons_str}."
+        )
+        if len(valid_backends_priorities) == 0:
+            raise ValueError(
+                f"No valid attention backend found for {cls.device_name} "
+                f"with {config_str}. Reasons: {reasons_str}."
+            )
+
+        # We have found some valid backends. Select the one with the
+        # highest priority.
+        logger.info(
+            "Valid backends: %s", [b[0].name for b in valid_backends_priorities]
+        )
+        sorted_indices = sorted(
+            range(len(valid_backends_priorities)),
+            key=lambda i: valid_backends_priorities[i][1],
+        )
+        selected_index = sorted_indices[0]
+        selected_backend = valid_backends_priorities[selected_index][0]
+        logger.info(
+            "Using %s backend.",
+            selected_backend.name,
+        )
 
-        raise RuntimeError(
-            "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-            "to select a supported backend.")
+        return selected_backend.get_path()
 
     @classmethod
     def get_punica_wrapper(cls) -> str:
@@ -397,7 +437,9 @@ class CudaPlatformBase(Platform):
 
     @classmethod
     def get_device_communicator_cls(cls) -> str:
-        return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"  # noqa
+        return (
+            "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"  # noqa
+        )
 
     @classmethod
     def supports_fp8(cls) -> bool:
@@ -415,87 +457,13 @@ class CudaPlatformBase(Platform):
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
 
-    @classmethod
-    def stateless_init_device_torch_dist_pg(
-        cls,
-        backend: str,
-        prefix_store: PrefixStore,
-        group_rank: int,
-        group_size: int,
-        timeout: timedelta,
-    ) -> ProcessGroup:
-        assert is_nccl_available()
-        pg: ProcessGroup = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-        )
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-        pg._set_default_backend(backend_type)
-        backend_class._set_sequence_number_for_group()
-
-        pg._register_backend(device, backend_type, backend_class)
-        return pg
-
     @classmethod
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
 
     @classmethod
-    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
-                                    model_config: "ModelConfig") -> bool:
-        fp8_attention = kv_cache_dtype.startswith("fp8")
-        attention_backend = envs.VLLM_ATTENTION_BACKEND
-
-        supported = False
-        if model_config is not None and model_config.use_mla:
-            # Default to CutlassMLA for blackwell,
-            # FlashMLA otherwise
-            if attention_backend is None:
-                if cls.is_device_capability(100):
-                    attention_backend = "CUTLASS_MLA"
-                else:
-                    attention_backend = "FLASHMLA"
-
-            # Only FlashMLA and CUTLASS_MLA support fp8
-            if attention_backend in [
-                    "FLASHMLA", "CUTLASS_MLA", "FLASHINFER_MLA"
-            ]:
-                supported = True
-            else:
-                supported = (not fp8_attention)
-        else:
-            # Default to FlashAttention
-            if attention_backend is None:
-                attention_backend = "FLASH_ATTN"
-
-            # All Blackwell backends support fp8
-            if cls.is_device_capability(100):
-                supported = True
-            elif attention_backend == "FLASH_ATTN":
-                if fp8_attention:
-                    from vllm.attention.utils.fa_utils import (
-                        flash_attn_supports_fp8)
-                    supported = flash_attn_supports_fp8()
-                else:
-                    supported = True
-            elif attention_backend == "FLASHINFER":
-                supported = True
-            elif attention_backend == "TRITON_ATTN":
-                supported = cls.supports_fp8()
-        return supported
-
-    @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             if not cls.has_device_capability(80):
                 capability = cls.get_device_capability()
                 gpu_name = cls.get_device_name()
@@ -511,7 +479,32 @@ class CudaPlatformBase(Platform):
                     "with compute capability of at least 8.0. "
                     f"Your {gpu_name} GPU {compute_str}. "
                     "You can use float16 instead by explicitly setting the "
-                    "`dtype` flag in CLI, for example: --dtype=half.")
+                    "`dtype` flag in CLI, for example: --dtype=half."
+                )
+
+    @classmethod
+    def insert_blocks_to_device(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from src_cache to dst_cache on GPU."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)
+
+    @classmethod
+    def swap_out_blocks_to_host(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from GPU to host (CPU)."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.cpu()
 
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:
@@ -527,13 +520,10 @@ class CudaPlatformBase(Platform):
 # all the related functions work on real physical device ids.
 # the major benefit of using NVML is that it will not initialize CUDA
 class NvmlCudaPlatform(CudaPlatformBase):
-
     @classmethod
     @cache
     @with_nvml_context
-    def get_device_capability(cls,
-                              device_id: int = 0
-                              ) -> Optional[DeviceCapability]:
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         try:
             physical_device_id = cls.device_id_to_physical_device_id(device_id)
             handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
@@ -546,7 +536,7 @@ class NvmlCudaPlatform(CudaPlatformBase):
     @with_nvml_context
     def has_device_capability(
         cls,
-        capability: Union[tuple[int, int], int],
+        capability: tuple[int, int] | int,
         device_id: int = 0,
     ) -> bool:
         try:
@@ -580,9 +570,7 @@ class NvmlCudaPlatform(CudaPlatformBase):
         """
         query if the set of gpus are fully connected by nvlink (1 hop)
         """
-        handles = [
-            pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
-        ]
+        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids]
         for i, handle in enumerate(handles):
             for j, peer_handle in enumerate(handles):
                 if i < j:
@@ -597,7 +585,8 @@ class NvmlCudaPlatform(CudaPlatformBase):
                     except pynvml.NVMLError:
                         logger.exception(
                             "NVLink detection failed. This is normal if"
-                            " your machine has no NVLink equipped.")
+                            " your machine has no NVLink equipped."
+                        )
                         return False
         return True
 
@@ -611,11 +600,11 @@ class NvmlCudaPlatform(CudaPlatformBase):
     def log_warnings(cls):
         device_ids: int = pynvml.nvmlDeviceGetCount()
         if device_ids > 1:
-            device_names = [
-                cls._get_physical_device_name(i) for i in range(device_ids)
-            ]
-            if (len(set(device_names)) > 1
-                    and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
+            device_names = [cls._get_physical_device_name(i) for i in range(device_ids)]
+            if (
+                len(set(device_names)) > 1
+                and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"
+            ):
                 logger.warning(
                     "Detected different devices in the system: %s. Please"
                     " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
@@ -625,7 +614,6 @@ class NvmlCudaPlatform(CudaPlatformBase):
 
 
 class NonNvmlCudaPlatform(CudaPlatformBase):
-
     @classmethod
     @cache
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
@@ -645,7 +633,8 @@ class NonNvmlCudaPlatform(CudaPlatformBase):
     def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
         logger.exception(
             "NVLink detection not possible, as context support was"
-            " not found. Assuming no NVLink available.")
+            " not found. Assuming no NVLink available."
+        )
         return False
 
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 1691ad62650bc90008be27246bfbb820b4755c3b..0471c20429b1d0f26a3121d6ef531c5250b66858 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,65 +1,38 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
 import enum
 import os
 import platform
 import random
 import sys
 from datetime import timedelta
-from platform import uname
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, NamedTuple
 
 import numpy as np
 import torch
-from torch.distributed import PrefixStore, ProcessGroup
 
-from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, VllmConfig
-    from vllm.lora.request import LoRARequest
+    from torch.distributed import PrefixStore, ProcessGroup
+
+    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.config import VllmConfig
+    from vllm.config.cache import CacheDType
+    from vllm.inputs import ProcessorInputs, PromptType
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 else:
-    ModelConfig = None
-    VllmConfig = None
-    LoRARequest = None
-    PoolingParams = None
-    SamplingParams = None
-    FlexibleArgumentParser = None
+    FlexibleArgumentParser = object
 
 logger = init_logger(__name__)
 
 
 def in_wsl() -> bool:
     # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
-
-
-class _Backend(enum.Enum):
-    FLASH_ATTN = enum.auto()
-    TRITON_ATTN = enum.auto()
-    XFORMERS = enum.auto()
-    ROCM_FLASH = enum.auto()
-    ROCM_AITER_MLA = enum.auto()  # Supported by V1
-    ROCM_AITER_FA = enum.auto()  # used for ViT attn backend
-    TORCH_SDPA = enum.auto()
-    FLASHINFER = enum.auto()
-    FLASHINFER_MLA = enum.auto()
-    TRITON_MLA = enum.auto()  # Supported by V1
-    CUTLASS_MLA = enum.auto()
-    FLASHMLA = enum.auto()  # Supported by V1
-    FLASH_ATTN_MLA = enum.auto()  # Supported by V1
-    PALLAS = enum.auto()
-    IPEX = enum.auto()
-    DUAL_CHUNK_FLASH_ATTN = enum.auto()
-    DIFFERENTIAL_FLASH_ATTN = enum.auto()
-    NO_ATTENTION = enum.auto()
-    FLEX_ATTENTION = enum.auto()
-    TREE_ATTN = enum.auto()
-    ROCM_ATTN = enum.auto()
+    return "microsoft" in " ".join(platform.uname()).lower()
 
 
 class PlatformEnum(enum.Enum):
@@ -86,6 +59,31 @@ class DeviceCapability(NamedTuple):
     major: int
     minor: int
 
+    def __lt__(self, other: Any) -> bool:
+        if not isinstance(other, DeviceCapability):
+            return NotImplemented
+        return (self.major, self.minor) < (other.major, other.minor)
+
+    def __le__(self, other: Any) -> bool:
+        if not isinstance(other, DeviceCapability):
+            return NotImplemented
+        return (self.major, self.minor) <= (other.major, other.minor)
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, DeviceCapability):
+            return NotImplemented
+        return (self.major, self.minor) == (other.major, other.minor)
+
+    def __ge__(self, other: Any) -> bool:
+        if not isinstance(other, DeviceCapability):
+            return NotImplemented
+        return (self.major, self.minor) >= (other.major, other.minor)
+
+    def __gt__(self, other: Any) -> bool:
+        if not isinstance(other, DeviceCapability):
+            return NotImplemented
+        return (self.major, self.minor) > (other.major, other.minor)
+
     def as_version_str(self) -> str:
         return f"{self.major}.{self.minor}"
 
@@ -134,7 +132,7 @@ class Platform:
 
     additional_env_vars: list[str] = []
 
-    _global_graph_pool: Optional[Any] = None
+    _global_graph_pool: Any | None = None
 
     @property
     def supported_dtypes(self) -> list[torch.dtype]:
@@ -162,6 +160,9 @@ class Platform:
     def is_out_of_tree(self) -> bool:
         return self._enum == PlatformEnum.OOT
 
+    def is_unspecified(self) -> bool:
+        return self._enum == PlatformEnum.UNSPECIFIED
+
     def get_max_output_tokens(self, prompt_len: int) -> int:
         return sys.maxsize
 
@@ -170,15 +171,21 @@ class Platform:
         return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
 
     def is_sleep_mode_available(self) -> bool:
-        return self._enum == PlatformEnum.CUDA
+        # TODO: Actually only mi3xx has the sleep mode support now
+        # for ROCm, but currently we don't have a way to detect the
+        # exact GPU model statelessly here. So we return True for
+        # all ROCm platforms for now.
+        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
 
     @classmethod
     def device_id_to_physical_device_id(cls, device_id: int):
         # Treat empty device control env var as unset. This is a valid
         # configuration in Ray setups where the engine is launched in
         # a CPU-only placement group located on a GPU node.
-        if cls.device_control_env_var in os.environ and os.environ[
-                cls.device_control_env_var] != "":
+        if (
+            cls.device_control_env_var in os.environ
+            and os.environ[cls.device_control_env_var] != ""
+        ):
             device_ids = os.environ[cls.device_control_env_var].split(",")
             physical_device_id = device_ids[device_id]
             return int(physical_device_id)
@@ -186,15 +193,37 @@ class Platform:
             return device_id
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int,
-                             dtype: torch.dtype) -> _Backend:
-        return _Backend.TORCH_SDPA
+    def import_kernels(cls) -> None:
+        """Import any platform-specific C kernels."""
+        try:
+            import vllm._C  # noqa: F401
+        except ImportError as e:
+            logger.warning("Failed to import from vllm._C: %r", e)
+        with contextlib.suppress(ImportError):
+            import vllm._moe_C  # noqa: F401
+
+    @classmethod
+    def get_vit_attn_backend(
+        cls, head_size: int, dtype: torch.dtype
+    ) -> "AttentionBackendEnum":
+        # Import AttentionBackendEnum here to avoid circular import.
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
+        return AttentionBackendEnum.TORCH_SDPA
 
     @classmethod
-    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
-                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool, use_mla: bool,
-                             has_sink: bool, use_sparse: bool) -> str:
+    def get_attn_backend_cls(
+        cls,
+        selected_backend: "AttentionBackendEnum",
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: "CacheDType | None",
+        block_size: int,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        attn_type: str | None = None,
+    ) -> str:
         """Get the attention backend class of a device."""
         return ""
 
@@ -202,14 +231,14 @@ class Platform:
     def get_device_capability(
         cls,
         device_id: int = 0,
-    ) -> Optional[DeviceCapability]:
+    ) -> DeviceCapability | None:
         """Stateless version of [torch.cuda.get_device_capability][]."""
         return None
 
     @classmethod
     def has_device_capability(
         cls,
-        capability: Union[tuple[int, int], int],
+        capability: tuple[int, int] | int,
         device_id: int = 0,
     ) -> bool:
         """
@@ -233,7 +262,7 @@ class Platform:
     @classmethod
     def is_device_capability(
         cls,
-        capability: Union[tuple[int, int], int],
+        capability: tuple[int, int] | int,
         device_id: int = 0,
     ) -> bool:
         """
@@ -280,7 +309,7 @@ class Platform:
         return torch.inference_mode(mode=True)
 
     @classmethod
-    def seed_everything(cls, seed: Optional[int] = None) -> None:
+    def seed_everything(cls, seed: int | None = None) -> None:
         """
         Set the seed of each random module.
         `torch.manual_seed` will set seed on all devices.
@@ -300,9 +329,9 @@ class Platform:
         raise NotImplementedError
 
     @classmethod
-    def pre_register_and_update(cls,
-                                parser: Optional[FlexibleArgumentParser] = None
-                                ) -> None:
+    def pre_register_and_update(
+        cls, parser: FlexibleArgumentParser | None = None
+    ) -> None:
         """
         Do some pre-registration or update action for the current platform.
 
@@ -316,7 +345,7 @@ class Platform:
         pass
 
     @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
         Check and update the configuration for the current platform.
 
@@ -345,11 +374,10 @@ class Platform:
         """
         Verify whether the quantization is supported by the current platform.
         """
-        if cls.supported_quantization and \
-            quant not in cls.supported_quantization:
+        if cls.supported_quantization and quant not in cls.supported_quantization:
             raise ValueError(
-                f"{quant} quantization is currently not supported in "
-                f"{cls.device_name}.")
+                f"{quant} quantization is currently not supported in {cls.device_name}."
+            )
 
     @classmethod
     def get_cpu_architecture(cls) -> CpuArchEnum:
@@ -378,15 +406,17 @@ class Platform:
         if in_wsl():
             # Pinning memory in WSL is not supported.
             # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-            logger.warning("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
+            logger.warning(
+                "Using 'pin_memory=False' as WSL is detected. "
+                "This may slow down the performance."
+            )
             return False
         return True
 
     @classmethod
-    def get_current_memory_usage(cls,
-                                 device: Optional[torch.types.Device] = None
-                                 ) -> float:
+    def get_current_memory_usage(
+        cls, device: torch.types.Device | None = None
+    ) -> float:
         """
         Return the memory usage in bytes.
         """
@@ -469,13 +499,7 @@ class Platform:
         """
         Whether to use allgather in LogitsProcessor to gather the logits.
         """
-        import vllm.envs as envs
-        from vllm.config import get_current_vllm_config
-
-        parallel_config = get_current_vllm_config().parallel_config
-        return (envs.VLLM_USE_V1
-                or parallel_config.distributed_executor_backend
-                == "external_launcher")
+        return True
 
     @classmethod
     def use_custom_allreduce(cls) -> bool:
@@ -495,9 +519,9 @@ class Platform:
     @classmethod
     def validate_request(
         cls,
-        prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        processed_inputs: ProcessorInputs,
+        prompt: "PromptType",
+        params: "SamplingParams | PoolingParams",
+        processed_inputs: "ProcessorInputs",
     ) -> None:
         """Raises if this request is unsupported on this platform"""
 
@@ -506,8 +530,11 @@ class Platform:
         if device is not None and hasattr(device, key):
             return getattr(device, key)
         else:
-            logger.warning("Current platform %s does not have '%s'" \
-            " attribute.", self.device_type, key)
+            logger.warning(
+                "Current platform %s does not have '%s' attribute.",
+                self.device_type,
+                key,
+            )
             return None
 
     def get_global_graph_pool(self) -> Any:
@@ -519,13 +546,6 @@ class Platform:
             cls._global_graph_pool = self.graph_pool_handle()
         return cls._global_graph_pool
 
-    @classmethod
-    def get_cu_count(cls, device_id: int = 0) -> int:
-        """
-        Returns the total number of compute units (CU) on single GPU.
-        """
-        raise NotImplementedError
-
     @classmethod
     def get_static_graph_wrapper_cls(cls) -> str:
         """
@@ -537,26 +557,18 @@ class Platform:
     def stateless_init_device_torch_dist_pg(
         cls,
         backend: str,
-        prefix_store: PrefixStore,
+        prefix_store: "PrefixStore",
         group_rank: int,
         group_size: int,
         timeout: timedelta,
-    ) -> ProcessGroup:
+    ) -> "ProcessGroup":
         """
         Init platform-specific torch distributed process group.
         """
-        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
-
-    @classmethod
-    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
-                                    model_config: "ModelConfig") -> bool:
-        """
-        Returns if the kv_cache_dtype is supported by the current platform.
-        """
-        return False
+        raise NotImplementedError
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
         """
         Check if the dtype is supported by the current platform.
         """
@@ -602,18 +614,25 @@ class Platform:
     @classmethod
     def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
         """
-        Returns a mapping from device_type to a tuple of supported 
+        Returns a mapping from device_type to a tuple of supported
         kv_buffer_device for nixl.
         """
         return {}
 
     @classmethod
-    def get_nixl_memory_type(cls) -> Optional[str]:
+    def get_nixl_memory_type(cls) -> str | None:
         """
         Returns the nixl memory type for the current platform.
         """
         return None
 
+    @classmethod
+    def check_max_model_len(cls, max_model_len: int) -> int:
+        """
+        Check max_model_len for the current platform.
+        """
+        return max_model_len
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index db5e8b13c7a8555bcd77200e43da357c3fba3c4c..232ec7e3cb944fc7f7b70d9908313c6ca55b920e 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -2,29 +2,34 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from datetime import timedelta
 from functools import cache, lru_cache, wraps
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
-from torch.distributed import PrefixStore, ProcessGroup
-from torch.distributed.distributed_c10d import is_nccl_available
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils.torch_utils import cuda_device_count_stateless
 
-from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from .interface import DeviceCapability, Platform, PlatformEnum
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.config import VllmConfig
+else:
+    AttentionBackendEnum = None
 
 logger = init_logger(__name__)
 
 try:
-    from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
-                        amdsmi_get_processor_handles, amdsmi_init,
-                        amdsmi_shut_down, amdsmi_topo_get_link_type)
+    from amdsmi import (
+        AmdSmiException,
+        amdsmi_get_gpu_asic_info,
+        amdsmi_get_processor_handles,
+        amdsmi_init,
+        amdsmi_shut_down,
+        amdsmi_topo_get_link_type,
+    )
 except ImportError as e:
     logger.warning("Failed to import from amdsmi with %r", e)
 
@@ -44,36 +49,21 @@ _ROCM_UNSUPPORTED_MODELS: list[str] = []
 
 # Models partially supported by ROCm.
 # Architecture -> Reason.
-_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
-                    "Triton flash attention. For half-precision SWA support, "
-                    "please use CK flash attention by setting "
-                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
-_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {
-    "Qwen2ForCausalLM":
-    _ROCM_SWA_REASON,
-    "MistralForCausalLM":
-    _ROCM_SWA_REASON,
-    "MixtralForCausalLM":
-    _ROCM_SWA_REASON,
-    "PaliGemmaForConditionalGeneration":
-    ("ROCm flash attention does not yet "
-     "fully support 32-bit precision on PaliGemma"),
-    "Phi3VForCausalLM":
-    ("ROCm Triton flash attention may run into compilation errors due to "
-     "excessive use of shared memory. If this happens, disable Triton FA "
-     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
-}
+_ROCM_SWA_REASON = ()
+_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {}
 _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
     "0x74a0": "AMD_Instinct_MI300A",
     "0x74a1": "AMD_Instinct_MI300X",
     "0x74b5": "AMD_Instinct_MI300X",  # MI300X VF
+    "0x74a2": "AMD_Instinct_MI308X",
     "0x74a5": "AMD_Instinct_MI325X",
     "0x74b9": "AMD_Instinct_MI325X",  # MI325X VF
     "0x74a9": "AMD_Instinct_MI300X_HF",
     "0x74bd": "AMD_Instinct_MI300X_HF",
+    "0x744c": "AMD_Radeon_RX7900XTX",
 }
 
-# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
+# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
 # if "HIP_VISIBLE_DEVICES" in os.environ:
 #     val = os.environ["HIP_VISIBLE_DEVICES"]
 #     if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
@@ -88,7 +78,6 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
 
 
 def with_amdsmi_context(fn):
-
     @wraps(fn)
     def wrapper(*args, **kwargs):
         amdsmi_init()
@@ -126,15 +115,17 @@ def on_gfx950() -> bool:
 
 @cache
 def use_rocm_custom_paged_attention(
-        qtype: torch.dtype,
-        head_size: int,
-        block_size: int,
-        gqa_ratio: int,
-        max_seq_len: int,
-        sliding_window: int,
-        kv_cache_dtype: str,
-        alibi_slopes: Optional[torch.Tensor] = None,
-        sinks: Optional[torch.Tensor] = None) -> bool:
+    qtype: torch.dtype,
+    head_size: int,
+    block_size: int,
+    gqa_ratio: int,
+    max_seq_len: int,
+    sliding_window: int,
+    kv_cache_dtype: str,
+    alibi_slopes: torch.Tensor | None = None,
+    sinks: torch.Tensor | None = None,
+) -> bool:
+    from vllm._aiter_ops import rocm_aiter_ops
 
     # GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
     # ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
@@ -143,26 +134,32 @@ def use_rocm_custom_paged_attention(
     # custom paged attn always supported on V0. On V1, requires sliding window
     # disabled due to observed numerical discrepancy.
     # if ON_GFX9:
-    #     return ((not envs.VLLM_USE_V1 or sliding_window == 0
-    #              or sliding_window == (-1, -1))
-    #             and (qtype == torch.half or qtype == torch.bfloat16)
-    #             and (head_size == 64 or head_size == 128)
-    #             and (block_size == 16 or block_size == 32)
-    #             and (gqa_ratio >= 1 and gqa_ratio <= 16)
-    #             and max_seq_len <= 128 * 1024
-    #             and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
-    #             and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
-    #                      and envs.VLLM_ROCM_USE_AITER) and sinks is None)
+    #     return (
+    #         (sliding_window == 0 or sliding_window == (-1, -1))
+    #         and (qtype == torch.half or qtype == torch.bfloat16)
+    #         and (head_size == 64 or head_size == 128)
+    #         and (block_size == 16 or block_size == 32)
+    #         and (gqa_ratio >= 1 and gqa_ratio <= 16)
+    #         and max_seq_len <= 128 * 1024
+    #         and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
+    #         and not (rocm_aiter_ops.is_pa_attn_enabled())
+    #         and sinks is None
+    #     )
 
     # else:
-    #     return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0
-    #                                 or sliding_window == (-1, -1))
-    #             and (qtype == torch.half or qtype == torch.bfloat16)
-    #             and head_size == 128 and block_size == 16
-    #             and (gqa_ratio >= 3 and gqa_ratio <= 16)
-    #             and max_seq_len <= 128 * 1024 and alibi_slopes is None
-    #             and kv_cache_dtype == "auto"
-    #             and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN and sinks is None)
+    #     return (
+    #         ON_GFX11_GFX12
+    #         and (sliding_window == 0 or sliding_window == (-1, -1))
+    #         and (qtype == torch.half or qtype == torch.bfloat16)
+    #         and head_size == 128
+    #         and block_size == 16
+    #         and (gqa_ratio >= 3 and gqa_ratio <= 16)
+    #         and max_seq_len <= 128 * 1024
+    #         and alibi_slopes is None
+    #         and kv_cache_dtype == "auto"
+    #         and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN
+    #         and sinks is None
+    #     )
     return False
 
 
@@ -177,103 +174,107 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
-        "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4", "torchao"
+        "awq",
+        "gptq",
+        "fp8",
+        "compressed-tensors",
+        "fbgemm_fp8",
+        "gguf",
+        "quark",
+        "ptpc_fp8",
+        "mxfp4",
+        "petit_nvfp4",
+        "torchao",
     ]
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int,
-                             dtype: torch.dtype) -> _Backend:
-        if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA
-                and on_gfx9()):
+    def get_vit_attn_backend(
+        cls, head_size: int, dtype: torch.dtype
+    ) -> AttentionBackendEnum:
+        from importlib.util import find_spec
+
+        from vllm._aiter_ops import rocm_aiter_ops
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
+        if rocm_aiter_ops.is_mha_enabled():
             # Note: AITER FA is only supported for Qwen-VL models.
             # TODO: Add support for other VL models in their model class.
-            return _Backend.ROCM_AITER_FA
-        if on_gfx9():
-            return _Backend.FLASH_ATTN
-        return _Backend.TORCH_SDPA
+            return AttentionBackendEnum.ROCM_AITER_FA
+
+        if on_gfx9() and find_spec("flash_attn") is not None:
+            return AttentionBackendEnum.FLASH_ATTN
+
+        return AttentionBackendEnum.TORCH_SDPA
 
     @classmethod
-    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1, use_mla,
-                             has_sink, use_sparse) -> str:
+    def get_attn_backend_cls(
+        cls,
+        selected_backend,
+        head_size,
+        dtype,
+        kv_cache_dtype,
+        block_size,
+        use_mla,
+        has_sink,
+        use_sparse,
+        attn_type: str | None = None,
+    ) -> str:
+        from vllm._aiter_ops import rocm_aiter_ops
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
         if use_sparse:
-            raise NotImplementedError(
-                "Sparse Attention is not supported on ROCm.")
+            raise NotImplementedError("Sparse Attention is not supported on ROCm.")
+
         if use_mla:
-            if not use_v1:
-                raise RuntimeError(
-                    "MLA attention backends require the V1 engine. "
-                    "Set VLLM_USE_V1=1 to enable them.")
-
-            # from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
-            #     is_aiter_mla_enabled)
-            if envs.VLLM_USE_FLASH_MLA:
-                from vllm.attention.ops.flashmla import is_flashmla_supported
-                use_flashmla = selected_backend == _Backend.FLASHMLA or (
-                    selected_backend is None and is_flashmla_supported()[0])
-                
-                if use_flashmla:
-                    if block_size != 64:
-                        logger.warning(
-                            "FlashMLA backend is not supported for block size %d"
-                            " (currently only supports block size 64).",
-                            block_size)
-                    else:
-                        logger.info_once("Using FlashMLA backend on V1 engine.")
-                        return ("vllm.v1.attention.backends.mla."
-                                "flashmla.FlashMLABackend")
-            else:
-                if selected_backend is None:
-                    # selected_backend = (_Backend.ROCM_AITER_MLA if
-                    #                     is_aiter_mla_enabled() or block_size == 1
-                    #                     else _Backend.TRITON_MLA)
-                    selected_backend = _Backend.TRITON_MLA
-
-                if selected_backend == _Backend.TRITON_MLA:
-                    if block_size != 1:
-                        logger.info_once("Using Triton MLA backend on V1 engine.")
-                        return ("vllm.v1.attention.backends.mla."
-                                "triton_mla.TritonMLABackend")
-                    raise ValueError(
-                        f" The selected backend, {selected_backend.name},"
-                        f"does not support block size {block_size}.")
-                # if selected_backend == _Backend.ROCM_AITER_MLA:
-                #     if block_size == 1:
-                #         logger.info("Using AITER MLA backend on V1 engine.")
-                #         return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
-                #     raise ValueError(
-                #         f" The selected backend, {selected_backend.name},"
-                #         f"does not support block size {block_size}."
-                #         "(currently only supports block size 1)")
-            
+            if selected_backend is None:
+                selected_backend = (
+                    # AttentionBackendEnum.ROCM_AITER_MLA
+                    # if rocm_aiter_ops.is_mla_enabled() or block_size == 1
+                    # else AttentionBackendEnum.TRITON_MLA
+                    AttentionBackendEnum.TRITON_MLA
+                )
+
+            if selected_backend == AttentionBackendEnum.TRITON_MLA:
+                if block_size != 1:
+                    logger.info_once("Using Triton MLA backend.")
+                    return AttentionBackendEnum.TRITON_MLA.get_path()
                 raise ValueError(
                     f" The selected backend, {selected_backend.name},"
-                    f"is not MLA type while requested for MLA backend.")
-
-        if envs.VLLM_USE_V1:
-            if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA \
-                and on_gfx9():
-                logger.info("Using Flash Attention backend on V1 engine.")
-                return ("vllm.v1.attention.backends."
-                        "rocm_aiter_fa.AiterFlashAttentionBackend")
-            elif (envs.VLLM_ROCM_USE_AITER and
-                envs.VLLM_USE_AITER_UNIFIED_ATTENTION) or \
-                    envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION or \
-                        selected_backend == _Backend.ROCM_ATTN:
-                # rocm specific backend, with aiter and/or
-                #   triton prefix-prefill
-                logger.info("Using Rocm/Aiter Attention backend on V1 engine.")
-                return ("vllm.v1.attention.backends."
-                        "rocm_attn.RocmAttentionBackend")
-            else:
-                # default case, using triton unified attention
-                logger.info("Using Triton Attention backend on V1 engine.")
-                return ("vllm.v1.attention.backends."
-                        "triton_attn.TritonAttentionBackend")
-        raise RuntimeError(
-            "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-            "to select a supported backend.")
+                    f"does not support block size {block_size}."
+                )
+            # if selected_backend == AttentionBackendEnum.ROCM_AITER_MLA:
+            #     logger.info("Using AITER MLA backend.")
+            #     return AttentionBackendEnum.ROCM_AITER_MLA.get_path()
+
+            raise ValueError(
+                f" The selected backend, {selected_backend.name},"
+                f"is not MLA type while requested for MLA backend."
+            )
+
+        if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
+            logger.info("Using FlexAttention backend.")
+            return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
+        # if (
+        #     rocm_aiter_ops.is_mha_enabled()
+        # ) or selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
+        #     logger.info("Using Aiter Flash Attention backend.")
+        #     return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+        # if (
+        #     rocm_aiter_ops.is_triton_unified_attn_enabled()
+        # ) or selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
+        #     logger.info("Using Aiter Unified Attention backend.")
+        #     return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
+        if (
+            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
+            or selected_backend == AttentionBackendEnum.ROCM_ATTN
+        ):
+            # rocm specific backend, with aiter and/or
+            #   triton prefix-prefill
+            logger.info("Using Rocm Attention backend.")
+            return AttentionBackendEnum.ROCM_ATTN.get_path()
+        # default case, using triton unified attention
+        logger.info("Using Triton Attention backend.")
+        return AttentionBackendEnum.TRITON_ATTN.get_path()
 
     @classmethod
     def set_device(cls, device: torch.device) -> None:
@@ -284,9 +285,7 @@ class RocmPlatform(Platform):
 
     @classmethod
     @lru_cache(maxsize=8)
-    def get_device_capability(cls,
-                              device_id: int = 0
-                              ) -> Optional[DeviceCapability]:
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         major, minor = torch.cuda.get_device_capability(device_id)
         return DeviceCapability(major=major, minor=minor)
 
@@ -296,21 +295,17 @@ class RocmPlatform(Platform):
         """
         Query if the set of gpus are fully connected by xgmi (1 hop)
         """
-        handles = [
-            amdsmi_get_processor_handles()[i] for i in physical_device_ids
-        ]
+        handles = [amdsmi_get_processor_handles()[i] for i in physical_device_ids]
         for i, handle in enumerate(handles):
             for j, peer_handle in enumerate(handles):
                 if i < j:
                     try:
-                        link_type = amdsmi_topo_get_link_type(
-                            handle, peer_handle)
+                        link_type = amdsmi_topo_get_link_type(handle, peer_handle)
                         # type is 2 for XGMI
                         if link_type["hops"] != 1 or link_type["type"] != 2:
                             return False
                     except AmdSmiException as error:
-                        logger.error("AMD 1 hop XGMI detection failed.",
-                                     exc_info=error)
+                        logger.error("AMD 1 hop XGMI detection failed.", exc_info=error)
                         return False
         return True
 
@@ -333,6 +328,7 @@ class RocmPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        from vllm._aiter_ops import rocm_aiter_ops
         from vllm.config.compilation import CUDAGraphMode
 
         cache_config = vllm_config.cache_config
@@ -340,41 +336,35 @@ class RocmPlatform(Platform):
         parallel_config = vllm_config.parallel_config
         is_eager_execution = compilation_config == CUDAGraphMode.NONE
 
-        use_v1 = envs.VLLM_USE_V1
-        use_aiter_rms_norm = envs.VLLM_ROCM_USE_AITER and \
-             envs.VLLM_ROCM_USE_AITER_RMSNORM
+        use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
 
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
         if parallel_config.worker_cls == "auto":
-            if vllm_config.speculative_config:
-                if not use_v1:
-                    raise NotImplementedError(
-                        "Speculative decoding is not supported on vLLM V0.")
-                parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-            else:
-                if use_v1:
-                    parallel_config.worker_cls = \
-                        "vllm.v1.worker.gpu_worker.Worker"
-                else:
-                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
         #  Aiter rms norm perform best when CUDA Graph capture is enabled.
-        if (use_v1 and use_aiter_rms_norm and not is_eager_execution
-                and "-rms_norm" not in compilation_config.custom_ops):
+        if (
+            use_aiter_rms_norm
+            and not is_eager_execution
+            and "-rms_norm" not in compilation_config.custom_ops
+        ):
             compilation_config.custom_ops.append("+rms_norm")
 
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS:
-            raise ValueError(f"Model architecture '{model_arch}' is not "
-                             "supported by ROCm for now.")
+            raise ValueError(
+                f"Model architecture '{model_arch}' is not supported by ROCm for now."
+            )
 
         if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
             msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
             logger.warning(
-                "Model architecture '%s' is partially "
-                "supported by ROCm: %s", model_arch, msg)
+                "Model architecture '%s' is partially supported by ROCm: %s",
+                model_arch,
+                msg,
+            )
 
     @classmethod
     def verify_quantization(cls, quant: str) -> None:
@@ -382,25 +372,27 @@ class RocmPlatform(Platform):
         if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ:
             logger.warning(
                 "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
-                " is not set, enabling VLLM_USE_TRITON_AWQ.")
-        envs.VLLM_USE_TRITON_AWQ = True
+                " is not set, enabling VLLM_USE_TRITON_AWQ."
+            )
+        os.environ["VLLM_USE_TRITON_AWQ"] = "1"
 
     @classmethod
     def get_punica_wrapper(cls) -> str:
         return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
 
     @classmethod
-    def get_current_memory_usage(cls,
-                                 device: Optional[torch.types.Device] = None
-                                 ) -> float:
+    def get_current_memory_usage(
+        cls, device: torch.types.Device | None = None
+    ) -> float:
         torch.cuda.reset_peak_memory_stats(device)
-        # return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(
-        #     device)[0]
+        # return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(device)[0]
         return torch.cuda.max_memory_allocated(device)
 
     @classmethod
     def get_device_communicator_cls(cls) -> str:
-        return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"  # noqa
+        return (
+            "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"  # noqa
+        )
 
     @classmethod
     def supports_mx(cls) -> bool:
@@ -410,12 +402,12 @@ class RocmPlatform(Platform):
     @classmethod
     def supports_fp8(cls) -> bool:
         gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
-        return any(gfx in gcn_arch for gfx in ['gfx94', 'gfx95', 'gfx12'])
+        return any(gfx in gcn_arch for gfx in ["gfx94", "gfx95", "gfx12"])
 
     @classmethod
     def is_fp8_fnuz(cls) -> bool:
         # only device 0 is checked, this assumes MI300 platforms are homogeneous
-        return 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
 
     @classmethod
     def fp8_dtype(cls) -> torch.dtype:
@@ -428,68 +420,28 @@ class RocmPlatform(Platform):
     def use_custom_allreduce(cls) -> bool:
         # We only enable custom allreduce for MI300 series
         gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
-        supported_archs = ['gfx94', 'gfx95']
+        supported_archs = ["gfx94", "gfx95"]
         return any(gfx in gcn_arch for gfx in supported_archs)
 
     @classmethod
     def opaque_attention_op(cls) -> bool:
         return True
 
-    @classmethod
-    def get_cu_count(cls, device_id: int = 0) -> int:
-        return torch.cuda.get_device_properties(
-            device_id).multi_processor_count
-
     @classmethod
     def is_navi(cls) -> bool:
-        return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName
+        return "gfx1" in torch.cuda.get_device_properties(0).gcnArchName
 
     @classmethod
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
 
-    @classmethod
-    def stateless_init_device_torch_dist_pg(
-        cls,
-        backend: str,
-        prefix_store: PrefixStore,
-        group_rank: int,
-        group_size: int,
-        timeout: timedelta,
-    ) -> ProcessGroup:
-        assert is_nccl_available()
-        pg: ProcessGroup = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-        )
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-        pg._set_default_backend(backend_type)
-        backend_class._set_sequence_number_for_group()
-
-        pg._register_backend(device, backend_type, backend_class)
-        return pg
-
     @classmethod
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
 
     @classmethod
-    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
-                                    model_config: "ModelConfig") -> bool:
-        return True
-
-    @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             if not cls.has_device_capability(80):
                 capability = cls.get_device_capability()
                 gpu_name = cls.get_device_name()
@@ -505,7 +457,8 @@ class RocmPlatform(Platform):
                     "with compute capability of at least 8.0. "
                     f"Your {gpu_name} GPU {compute_str}. "
                     "You can use float16 instead by explicitly setting the "
-                    "`dtype` flag in CLI, for example: --dtype=half.")
+                    "`dtype` flag in CLI, for example: --dtype=half."
+                )
 
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 92f8496bedb7c5946c2e24e23d2dcf88d5b0f15e..0544296a69111e64f2d7fe869935b4997ae7ebe7 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union, cast
+import contextlib
+from typing import TYPE_CHECKING, cast
 
 import torch
 from tpu_info import device
@@ -9,23 +10,29 @@ from tpu_info import device
 import vllm.envs as envs
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
-from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
-from .interface import Platform, PlatformEnum, _Backend
+from .interface import Platform, PlatformEnum
 
 if TYPE_CHECKING:
-    from vllm.config import BlockSize, ModelConfig, VllmConfig
+    from typing import TypeAlias
+
+    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.config import VllmConfig
+    from vllm.config.cache import BlockSize
     from vllm.pooling_params import PoolingParams
+    from vllm.sampling_params import SamplingParams
+
+    ParamsType: TypeAlias = SamplingParams | PoolingParams
 else:
     BlockSize = None
-    ModelConfig = None
     VllmConfig = None
     PoolingParams = None
+    AttentionBackendEnum = None
+    ParamsType = None
 
 logger = init_logger(__name__)
 
-USE_TPU_COMMONS = False
+USE_TPU_INFERENCE = False
 
 
 class TpuPlatform(Platform):
@@ -38,31 +45,38 @@ class TpuPlatform(Platform):
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"
     simple_compile_backend: str = "openxla"
 
-    supported_quantization: list[str] = [
-        "fp8", "tpu_int8", "compressed-tensors"
-    ]
+    supported_quantization: list[str] = ["fp8", "tpu_int8", "compressed-tensors"]
 
-    additional_env_vars: list[str] = [
-        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
-    ]
+    additional_env_vars: list[str] = ["TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"]
 
     @classmethod
-    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
-                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool, use_mla: bool,
-                             has_sink, use_sparse) -> str:
+    def import_kernels(cls) -> None:
+        # Do not import vllm._C
+        with contextlib.suppress(ImportError):
+            import vllm._moe_C  # noqa: F401
+
+    @classmethod
+    def get_attn_backend_cls(
+        cls,
+        selected_backend: "AttentionBackendEnum",
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: str | None,
+        block_size: int,
+        use_mla: bool,
+        has_sink,
+        use_sparse,
+        attn_type: str | None = None,
+    ) -> str:
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
         if use_sparse:
-            raise NotImplementedError(
-                "Sparse Attention is not supported on TPU.")
-        if selected_backend != _Backend.PALLAS:
+            raise NotImplementedError("Sparse Attention is not supported on TPU.")
+        if selected_backend != AttentionBackendEnum.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
 
-        if use_v1:
-            logger.info("Using Pallas V1 backend.")
-            return "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
-        else:
-            logger.info("Using Pallas backend.")
-            return "vllm.attention.backends.pallas.PallasAttentionBackend"
+        logger.info("Using Pallas V1 backend.")
+        return AttentionBackendEnum.PALLAS.get_path()
 
     @classmethod
     def set_device(cls, device: torch.device) -> None:
@@ -102,7 +116,7 @@ class TpuPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        from vllm.config import CompilationLevel, CUDAGraphMode
+        from vllm.config import CompilationMode, CUDAGraphMode
 
         cache_config = vllm_config.cache_config
         # For v0, the default block size is 16.
@@ -110,38 +124,46 @@ class TpuPlatform(Platform):
             cache_config.block_size = cast(BlockSize, 16)
         compilation_config = vllm_config.compilation_config
 
-        # TPU only supports DYNAMO_ONCE compilation level
-        if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
-            logger.info("[TPU] Forcing DYNAMO_ONCE compilation level, and "
-                        "disabling cudagraph.")
-            compilation_config.level = CompilationLevel.DYNAMO_ONCE
-
-        if compilation_config.cudagraph_mode is None or \
-                compilation_config.cudagraph_mode.max_cudagraph_mode() \
-                    != CUDAGraphMode.NONE:
-            logger.info("[TPU] CUDA graph is not supported on TPU, "
-                        "disabling cudagraphs.")
+        # TPU only supports DYNAMO_TRACE_ONCE compilation mode
+        if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE:
+            logger.info(
+                "[TPU] Forcing DYNAMO_TRACE_ONCE compilation mode, and\
+                disabling cudagraph."
+            )
+            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
+
+        if (
+            compilation_config.cudagraph_mode is None
+            or compilation_config.cudagraph_mode.max_cudagraph_mode()
+            != CUDAGraphMode.NONE
+        ):
+            logger.info(
+                "[TPU] CUDA graph is not supported on TPU, disabling cudagraphs."
+            )
             compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
         if compilation_config.backend == "":
             compilation_config.backend = "openxla"
 
-        assert vllm_config.speculative_config is None, \
+        assert vllm_config.speculative_config is None, (
             "TPU does not support speculative decoding"
+        )
 
         model_config = vllm_config.model_config
-        if model_config is not None and model_config.dtype in (torch.float16,
-                                                               torch.float32):
+        if model_config is not None and model_config.dtype in (
+            torch.float16,
+            torch.float32,
+        ):
             logger.warning(
                 "The TPU backend currently does not support %s. "
-                "Using bfloat16 instead.", model_config.dtype)
+                "Using bfloat16 instead.",
+                model_config.dtype,
+            )
             model_config.dtype = torch.bfloat16
 
-        if envs.VLLM_USE_V1:
-            from vllm.v1.attention.backends.pallas import (
-                PallasAttentionBackend)
-            cache_config.block_size = PallasAttentionBackend.get_page_size(
-                vllm_config)  # type: ignore[assignment]
+        from vllm.v1.attention.backends.pallas import PallasAttentionBackend
+
+        cache_config.block_size = PallasAttentionBackend.get_page_size(vllm_config)  # type: ignore[assignment]
 
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
@@ -149,24 +171,30 @@ class TpuPlatform(Platform):
             parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker"
 
         assert not vllm_config.speculative_config, (
-            "Speculative decoding is not yet supported for TPU backend")
+            "Speculative decoding is not yet supported for TPU backend"
+        )
 
-        if scheduler_config.is_multimodal_model and not \
-                scheduler_config.disable_chunked_mm_input:
-            logger.warning("TPU does not support running Multimodal models"\
-            " without setting `--disable_chunked_mm_input`. " \
-            "Forcing --disable_chunked_mm_input.")
+        if (
+            scheduler_config.is_multimodal_model
+            and not scheduler_config.disable_chunked_mm_input
+        ):
+            logger.warning(
+                "TPU does not support running Multimodal models"
+                " without setting `--disable_chunked_mm_input`. "
+                "Forcing --disable_chunked_mm_input."
+            )
             scheduler_config.disable_chunked_mm_input = True
 
         if model_config and model_config.use_mla:
             logger.info(
                 "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled.")
+                "prefill and prefix caching to be disabled."
+            )
             vllm_config.scheduler_config.enable_chunked_prefill = False
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
             vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                vllm_config.model_config.max_model_len,
+                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
+            )
 
     @classmethod
     def is_pin_memory_available(cls):
@@ -177,30 +205,21 @@ class TpuPlatform(Platform):
     def get_device_communicator_cls(cls) -> str:
         return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa
 
-    @classmethod
-    def use_all_gather(cls) -> bool:
-        return True
-
     @classmethod
     def validate_request(
         cls,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: ParamsType,
         processed_inputs: ProcessorInputs,
     ) -> None:
         """Raises if this request is unsupported on this platform"""
-        if isinstance(params, SamplingParams):
-            if params.guided_decoding is not None and not envs.VLLM_USE_V1:
-                raise ValueError("Structured output is not supported on "
-                                 f"{cls.device_name} V0.")
-            if params.sampling_type == SamplingType.RANDOM_SEED:
-                raise ValueError(
-                    "Torch XLA does not support per-request seed.")
+        from vllm.sampling_params import SamplingParams, SamplingType
 
-    @classmethod
-    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
-                                    model_config: "ModelConfig") -> bool:
-        return True
+        if (
+            isinstance(params, SamplingParams)
+            and params.sampling_type == SamplingType.RANDOM_SEED
+        ):
+            raise ValueError("Torch XLA does not support per-request seed.")
 
     @classmethod
     @torch.compile(backend="openxla")
@@ -212,8 +231,7 @@ class TpuPlatform(Platform):
         dst_block_indices: torch.Tensor,
     ) -> None:
         torch.ops.xla.dynamo_set_buffer_donor_(dst_cache, True)
-        dst_cache[dst_block_indices] = src_cache[src_block_indices].to(
-            dst_cache.device)
+        dst_cache[dst_block_indices] = src_cache[src_block_indices].to(dst_cache.device)
 
     @classmethod
     @torch.compile(backend="openxla")
@@ -224,7 +242,7 @@ class TpuPlatform(Platform):
         src_block_indices: torch.Tensor,
         dst_block_indices: torch.Tensor,
     ) -> None:
-        """ tpu blocks to cpu blocks"""
+        """tpu blocks to cpu blocks"""
         torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True)
         dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
 
@@ -232,11 +250,28 @@ class TpuPlatform(Platform):
     def use_sync_weight_loader(cls) -> bool:
         return True
 
+    @classmethod
+    def check_max_model_len(cls, max_model_len: int) -> int:
+        """
+        Check max_model_len for the current platform.
+        """
+        logger.warning(
+            "--max-model-len is not specified, "
+            "it's currently using model's default length %d, "
+            "which might be too large."
+            "Please input with --max-model-len based on your "
+            "request input length and output length, to avoid "
+            "unnecessary degradation.",
+            max_model_len,
+        )
+        return max_model_len
+
 
 try:
-    from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform
-    TpuPlatform = TpuCommonsPlatform  # type: ignore
-    USE_TPU_COMMONS = True
+    from tpu_inference.platforms import TpuPlatform as TpuInferencePlatform
+
+    TpuPlatform = TpuInferencePlatform  # type: ignore
+    USE_TPU_INFERENCE = True
 except ImportError:
-    logger.info("tpu_commons not found, using vLLM's TpuPlatform")
+    logger.info("tpu_inference not found, using vLLM's TpuPlatform")
     pass
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 10bd730c79112d30f3eed4aa468826f602a46199..4bb8732fde4cd8cdf67c7cebb4ad92a289909514 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -1,22 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import contextlib
 import os
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
-from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from .interface import DeviceCapability, Platform, PlatformEnum
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.config import VllmConfig
 else:
-    ModelConfig = None
     VllmConfig = None
+    AttentionBackendEnum = None
 
 logger = init_logger(__name__)
 
@@ -33,44 +34,50 @@ class XPUPlatform(Platform):
     device_control_env_var: str = "ZE_AFFINITY_MASK"
 
     @classmethod
-    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
-                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool, use_mla: bool,
-                             has_sink: bool, use_sparse) -> str:
+    def import_kernels(cls) -> None:
+        # Do not import vllm._C
+        with contextlib.suppress(ImportError):
+            import vllm._moe_C  # noqa: F401
+
+    @classmethod
+    def get_attn_backend_cls(
+        cls,
+        selected_backend: "AttentionBackendEnum",
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: str | None,
+        block_size: int,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse,
+        attn_type: str | None = None,
+    ) -> str:
+        from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+        set_kv_cache_layout("NHD")
+        logger.info(
+            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
+            "only NHD layout is supported by XPU attention kernels."
+        )
+
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
         if use_sparse:
-            raise NotImplementedError(
-                "Sparse Attention is not supported on XPU.")
-        use_v1 = envs.VLLM_USE_V1
-        if not use_v1:
-            raise ValueError("XPU backend only supports V1.")
-        TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
-        FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
-        if selected_backend == _Backend.TRITON_ATTN:
-            logger.info_once("Using Triton backend on V1 engine.")
-            return TRITON_ATTN
-        elif selected_backend == _Backend.FLASH_ATTN:
-            logger.info_once("Using Flash Attention backend on V1 engine.")
-            return FLASH_ATTN
+            raise NotImplementedError("Sparse Attention is not supported on XPU.")
+        if selected_backend == AttentionBackendEnum.TRITON_ATTN:
+            logger.info_once("Using Triton backend.")
+            return AttentionBackendEnum.TRITON_ATTN.get_path()
+        elif selected_backend == AttentionBackendEnum.FLASH_ATTN:
+            logger.info_once("Using Flash Attention backend.")
+            return AttentionBackendEnum.FLASH_ATTN.get_path()
         elif selected_backend:
             raise ValueError(
                 f"Invalid attention backend for {cls.device_name}, "
-                f"with use_v1: {use_v1} use_mla: {use_mla}")
+                f"with use_mla: {use_mla}"
+            )
 
-        logger.info("Using Flash Attention backend on V1 engine.")
-        return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
-
-    @classmethod
-    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
-                                    model_config: "ModelConfig") -> bool:
-        """
-        Check if the kv_cache_dtype is supported.
-        XPU only support fp8 kv cache with triton backend.
-        """
-        if envs.is_set("VLLM_ATTENTION_BACKEND") and \
-            envs.VLLM_ATTENTION_BACKEND == "TRITON_ATTN":
-            return kv_cache_dtype in ["fp8_e4m3", "fp8_e5m2", "fp8"]
-
-        return False
+        logger.info("Using Flash Attention backend.")
+        return AttentionBackendEnum.FLASH_ATTN.get_path()
 
     @classmethod
     def set_device(cls, device: torch.device) -> None:
@@ -83,7 +90,7 @@ class XPUPlatform(Platform):
     def get_device_capability(
         cls,
         device_id: int = 0,
-    ) -> Optional[DeviceCapability]:
+    ) -> DeviceCapability | None:
         # capacity format differs from cuda's and will cause unexpected
         # failure, so use None directly
         return None
@@ -94,13 +101,25 @@ class XPUPlatform(Platform):
 
     @classmethod
     def get_punica_wrapper(cls) -> str:
-        return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
+        xpu_use_triton_kernel = os.getenv("XPU_USE_TRITON_KERNEL", "0") == "1"
+        if not xpu_use_triton_kernel:
+            return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
+        else:
+            return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         device_props = torch.xpu.get_device_properties(device_id)
         return device_props.total_memory
 
+    @classmethod
+    def get_vit_attn_backend(
+        cls, head_size: int, dtype: torch.dtype
+    ) -> "AttentionBackendEnum":
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
+        return AttentionBackendEnum.FLASH_ATTN
+
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
@@ -117,24 +136,24 @@ class XPUPlatform(Platform):
                 cache_config.block_size = 16
 
         # lazy import to avoid circular import
-        from vllm.config import CompilationLevel, CUDAGraphMode
+        from vllm.config import CompilationMode, CUDAGraphMode
+
         compilation_config = vllm_config.compilation_config
         if compilation_config.compile_sizes is None:
             compilation_config.compile_sizes = []
 
-        assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, \
+        assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, (
             "CUDA graph mode should be NONE on XPU"
+        )
 
         if vllm_config.lora_config is not None:
-            compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.mode = CompilationMode.NONE
 
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
-        if envs.VLLM_USE_V1:
-            parallel_config.worker_cls =\
-                "vllm.v1.worker.xpu_worker.XPUWorker"
-        else:
-            parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+        parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"
+        if vllm_config.kv_transfer_config is not None:
+            vllm_config.kv_transfer_config.enable_permute_local_kv = True
 
         if parallel_config.distributed_executor_backend is None:
             if parallel_config.world_size > 1:
@@ -143,36 +162,35 @@ class XPUPlatform(Platform):
                 parallel_config.distributed_executor_backend = "uni"
         elif parallel_config.distributed_executor_backend == "mp":
             # FIXME(kunshang):
-            # spawn needs calling `if __name__ == '__main__':``
+            # spawn needs calling `if __name__ == '__main__':`
             # fork is not supported for xpu start new process.
             if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
                 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
                 logger.warning(
-                    "Please use spawn as start method if you want to use mp.")
-        elif (parallel_config.distributed_executor_backend != "ray"
-              and parallel_config.distributed_executor_backend != "uni"
-              and parallel_config.distributed_executor_backend
-              != "external_launcher"):
+                    "Please use spawn as start method if you want to use mp."
+                )
+        elif (
+            parallel_config.distributed_executor_backend != "ray"
+            and parallel_config.distributed_executor_backend != "uni"
+            and parallel_config.distributed_executor_backend != "external_launcher"
+        ):
             logger.warning(
                 "%s is not supported on XPU, fallback to ray distributed"
                 " executor backend.",
-                parallel_config.distributed_executor_backend)
+                parallel_config.distributed_executor_backend,
+            )
             parallel_config.distributed_executor_backend = "ray"
 
         if model_config and model_config.use_mla:
             logger.info(
                 "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled.")
+                "prefill and prefix caching to be disabled."
+            )
             vllm_config.scheduler_config.enable_chunked_prefill = False
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
             vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
-        from vllm.v1.attention.backends.utils import set_kv_cache_layout
-
-        set_kv_cache_layout("NHD")
-        logger.info("Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
-                    "only NHD layout is supported by XPU attention kernels.")
+                vllm_config.model_config.max_model_len,
+                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
+            )
 
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:
@@ -187,9 +205,9 @@ class XPUPlatform(Platform):
         return True
 
     @classmethod
-    def get_current_memory_usage(cls,
-                                 device: Optional[torch.types.Device] = None
-                                 ) -> float:
+    def get_current_memory_usage(
+        cls, device: torch.types.Device | None = None
+    ) -> float:
         torch.xpu.reset_peak_memory_stats(device)
         return torch.xpu.max_memory_allocated(device)
 
@@ -211,15 +229,16 @@ class XPUPlatform(Platform):
         return torch.xpu.device_count()
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             device_name = cls.get_device_name().lower()
             # client gpu a770
             if device_name.count("a770") > 0:
                 raise ValueError(
                     "Intel Arc A770 have bfloat16 accuracy known issue. "
                     "You can use float16 instead by explicitly setting the "
-                    "`dtype` flag in CLI, for example: --dtype=half.")
+                    "`dtype` flag in CLI, for example: --dtype=half."
+                )
 
     @classmethod
     def opaque_attention_op(cls) -> bool:
@@ -235,6 +254,10 @@ class XPUPlatform(Platform):
     ) -> None:
         """Copy blocks from src_cache to dst_cache on XPU."""
         _src_cache = src_cache[:, src_block_indices]
+        if _src_cache.shape[2:] != dst_cache.shape[2:]:
+            # To support TP_ratio, HOST KV might be initiated with HND
+            # while XPU device KV is with NHD
+            _src_cache = _src_cache.permute(0, 1, 3, 2, 4)
         dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)
 
     @classmethod
@@ -247,4 +270,8 @@ class XPUPlatform(Platform):
     ) -> None:
         """Copy blocks from XPU to host (CPU)."""
         _src_cache = src_cache[:, src_block_indices]
+        if _src_cache.shape[2:] != dst_cache.shape[2:]:
+            # XPU device KV is with NHD while HOST KV
+            # might be initiated with HND for TP_ratio support
+            _src_cache = _src_cache.permute(0, 1, 3, 2, 4)
         dst_cache[:, dst_block_indices] = _src_cache.cpu()
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 1a1760df82c03476b947f7d7d2f129be61fbe2cc..0d8988f27959fd0ae4052baf9266f999035181bf 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -2,24 +2,28 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import vllm.envs as envs
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_PLUGINS_GROUP = 'vllm.general_plugins'
+# Default plugins group will be loaded in all processes(process0, engine core
+# process and worker processes)
+DEFAULT_PLUGINS_GROUP = "vllm.general_plugins"
+# IO processor plugins group will be loaded in process0 only
+IO_PROCESSOR_PLUGINS_GROUP = "vllm.io_processor_plugins"
+# Platform plugins group will be loaded in all processes when
+# `vllm.platforms.current_platform` is called and the value not initialized,
+PLATFORM_PLUGINS_GROUP = "vllm.platform_plugins"
 
 # make sure one process only loads plugins once
 plugins_loaded = False
 
 
 def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
-    import sys
-    if sys.version_info < (3, 10):
-        from importlib_metadata import entry_points
-    else:
-        from importlib.metadata import entry_points
+    from importlib.metadata import entry_points
 
     allowed_plugins = envs.VLLM_PLUGINS
 
@@ -29,7 +33,7 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
         return {}
 
     # Check if the only discovered plugin is the default one
-    is_default_group = (group == DEFAULT_PLUGINS_GROUP)
+    is_default_group = group == DEFAULT_PLUGINS_GROUP
     # Use INFO for non-default groups and DEBUG for the default group
     log_level = logger.debug if is_default_group else logger.info
 
@@ -38,8 +42,10 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
         log_level("- %s -> %s", plugin.name, plugin.value)
 
     if allowed_plugins is None:
-        log_level("All plugins in this group will be loaded. "
-                  "Set `VLLM_PLUGINS` to control which plugins to load.")
+        log_level(
+            "All plugins in this group will be loaded. "
+            "Set `VLLM_PLUGINS` to control which plugins to load."
+        )
 
     plugins = dict[str, Callable[[], Any]]()
     for plugin in discovered_plugins:
diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py
index 3b17211b1b8352fb4e0a1b942a5480b88f8e2f8c..b3a3b548781e199b35603785d8c9118279db2846 100644
--- a/vllm/plugins/io_processors/__init__.py
+++ b/vllm/plugins/io_processors/__init__.py
@@ -1,22 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import logging
-from typing import Optional
 
 from vllm.config import VllmConfig
-from vllm.plugins import load_plugins_by_group
+from vllm.plugins import IO_PROCESSOR_PLUGINS_GROUP, load_plugins_by_group
 from vllm.plugins.io_processors.interface import IOProcessor
-from vllm.utils import resolve_obj_by_qualname
+from vllm.utils.import_utils import resolve_obj_by_qualname
 
 logger = logging.getLogger(__name__)
 
 
 def get_io_processor(
-        vllm_config: VllmConfig,
-        plugin_from_init: Optional[str] = None) -> IOProcessor | None:
+    vllm_config: VllmConfig, plugin_from_init: str | None = None
+) -> IOProcessor | None:
     # Input.Output processors are loaded as plugins under the
     # 'vllm.io_processor_plugins' group. Similar to platform
     # plugins, these plugins register a function that returns the class
@@ -39,8 +36,9 @@ def get_io_processor(
     logger.debug("IOProcessor plugin to be loaded %s", model_plugin)
 
     # Load all installed plugin in the group
-    multimodal_data_processor_plugins = \
-        load_plugins_by_group('vllm.io_processor_plugins')
+    multimodal_data_processor_plugins = load_plugins_by_group(
+        IO_PROCESSOR_PLUGINS_GROUP
+    )
 
     loadable_plugins = {}
     for name, func in multimodal_data_processor_plugins.items():
@@ -54,14 +52,16 @@ def get_io_processor(
 
     num_available_plugins = len(loadable_plugins.keys())
     if num_available_plugins == 0:
-        raise ValueError("No IOProcessor plugins installed"
-                         f" but one is required ({model_plugin}).")
+        raise ValueError(
+            f"No IOProcessor plugins installed but one is required ({model_plugin})."
+        )
 
     if model_plugin not in loadable_plugins:
         raise ValueError(
             f"The model requires the '{model_plugin}' IO Processor plugin "
             "but it is not installed. "
-            f"Available plugins: {list(loadable_plugins.keys())}")
+            f"Available plugins: {list(loadable_plugins.keys())}"
+        )
 
     activated_plugin_cls = loadable_plugins[model_plugin]
 
diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py
index 62b224cac5e53073e818b35bf4f670c3af148bc8..e0488e48614d9ad57e517c9e3a10de317f0b74b5 100644
--- a/vllm/plugins/io_processors/interface.py
+++ b/vllm/plugins/io_processors/interface.py
@@ -3,19 +3,20 @@
 
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Sequence
-from typing import Any, Generic, Optional, TypeVar, Union
+from typing import Any, Generic, TypeVar
 
 from vllm.config import VllmConfig
 from vllm.entrypoints.openai.protocol import IOProcessorResponse
 from vllm.inputs.data import PromptType
 from vllm.outputs import PoolingRequestOutput
+from vllm.pooling_params import PoolingParams
+from vllm.sampling_params import SamplingParams
 
-IOProcessorInput = TypeVar('IOProcessorInput')
-IOProcessorOutput = TypeVar('IOProcessorOutput')
+IOProcessorInput = TypeVar("IOProcessorInput")
+IOProcessorOutput = TypeVar("IOProcessorOutput")
 
 
 class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
-
     def __init__(self, vllm_config: VllmConfig):
         self.vllm_config = vllm_config
 
@@ -23,37 +24,40 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     def pre_process(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         raise NotImplementedError
 
     async def pre_process_async(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         return self.pre_process(prompt, request_id, **kwargs)
 
     @abstractmethod
-    def post_process(self,
-                     model_output: Sequence[PoolingRequestOutput],
-                     request_id: Optional[str] = None,
-                     **kwargs) -> IOProcessorOutput:
+    def post_process(
+        self,
+        model_output: Sequence[PoolingRequestOutput],
+        request_id: str | None = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
         raise NotImplementedError
 
     async def post_process_async(
         self,
         model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
     ) -> IOProcessorOutput:
         # We cannot guarantee outputs are returned in the same order they were
         # fed to vLLM.
         # Let's sort them by id before post_processing
-        sorted_output = sorted([(i, item) async for i, item in model_output],
-                               key=lambda output: output[0])
+        sorted_output = sorted(
+            [(i, item) async for i, item in model_output], key=lambda output: output[0]
+        )
         collected_output = [output[1] for output in sorted_output]
         return self.post_process(collected_output, request_id, **kwargs)
 
@@ -61,7 +65,13 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     def parse_request(self, request: Any) -> IOProcessorInput:
         raise NotImplementedError
 
+    def validate_or_generate_params(
+        self, params: SamplingParams | PoolingParams | None = None
+    ) -> SamplingParams | PoolingParams:
+        return params or PoolingParams()
+
     @abstractmethod
     def output_to_response(
-            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        self, plugin_output: IOProcessorOutput
+    ) -> IOProcessorResponse:
         raise NotImplementedError
diff --git a/vllm/plugins/lora_resolvers/README.md b/vllm/plugins/lora_resolvers/README.md
deleted file mode 100644
index 48f27dddea07ed4f23abc331dd1b9e7c6147ab0d..0000000000000000000000000000000000000000
--- a/vllm/plugins/lora_resolvers/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# LoRA Resolver Plugins
-
-This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters
-via the LoRAResolver plugin framework.
-
-Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins
-to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins.
-
-## lora_filesystem_resolver
-
-This LoRA Resolver is installed with vLLM by default.
-To use, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request
-for a LoRA adapter `foobar` it doesn't currently recognize, it will look in that local directory
-for a subdirectory `foobar` containing a LoRA adapter. If such an adapter exists, it will
-load that adapter, and then service the request as normal. That adapter will then be available
-for future requests as normal.
diff --git a/vllm/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py
index b999d07a6eb74f05b11ae12d88c03e238fbd6437..8d94a673e86238d488bc1cf41615e9f898e091d8 100644
--- a/vllm/plugins/lora_resolvers/filesystem_resolver.py
+++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
-from typing import Optional
 
 import vllm.envs as envs
 from vllm.lora.request import LoRARequest
@@ -10,25 +9,29 @@ from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
 
 
 class FilesystemResolver(LoRAResolver):
-
     def __init__(self, lora_cache_dir: str):
         self.lora_cache_dir = lora_cache_dir
 
-    async def resolve_lora(self, base_model_name: str,
-                           lora_name: str) -> Optional[LoRARequest]:
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> LoRARequest | None:
         lora_path = os.path.join(self.lora_cache_dir, lora_name)
         if os.path.exists(lora_path):
-            adapter_config_path = os.path.join(self.lora_cache_dir, lora_name,
-                                               "adapter_config.json")
+            adapter_config_path = os.path.join(
+                self.lora_cache_dir, lora_name, "adapter_config.json"
+            )
             if os.path.exists(adapter_config_path):
                 with open(adapter_config_path) as file:
                     adapter_config = json.load(file)
-                if adapter_config["peft_type"] == "LORA" and adapter_config[
-                        "base_model_name_or_path"] == base_model_name:
-                    lora_request = LoRARequest(lora_name=lora_name,
-                                               lora_int_id=abs(
-                                                   hash(lora_name)),
-                                               lora_path=lora_path)
+                if (
+                    adapter_config["peft_type"] == "LORA"
+                    and adapter_config["base_model_name_or_path"] == base_model_name
+                ):
+                    lora_request = LoRARequest(
+                        lora_name=lora_name,
+                        lora_int_id=abs(hash(lora_name)),
+                        lora_path=lora_path,
+                    )
                     return lora_request
         return None
 
@@ -38,13 +41,12 @@ def register_filesystem_resolver():
 
     lora_cache_dir = envs.VLLM_LORA_RESOLVER_CACHE_DIR
     if lora_cache_dir:
-        if not os.path.exists(lora_cache_dir) or not os.path.isdir(
-                lora_cache_dir):
+        if not os.path.exists(lora_cache_dir) or not os.path.isdir(lora_cache_dir):
             raise ValueError(
                 "VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory \
-                for Filesystem Resolver plugin to function")
+                for Filesystem Resolver plugin to function"
+            )
         fs_resolver = FilesystemResolver(lora_cache_dir)
-        LoRAResolverRegistry.register_resolver("Filesystem Resolver",
-                                               fs_resolver)
+        LoRAResolverRegistry.register_resolver("Filesystem Resolver", fs_resolver)
 
     return
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index a6313367457a413ddc5b7959975c3db03d394ce7..5c3dfa8ac9cbcb27355c15674812bebd5b916902 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -2,110 +2,112 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Annotated, Any, Optional
+from typing import Annotated, Any, Optional
 
 import msgspec
 
+from vllm.config import ModelConfig, PoolerConfig
+from vllm.config.pooler import get_use_activation
 from vllm.sampling_params import RequestOutputKind
 from vllm.tasks import PoolingTask
 
-if TYPE_CHECKING:
-    from vllm.config import ModelConfig
-
 
 class PoolingParams(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        array_like=True):  # type: ignore[call-arg]
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    array_like=True,
+):  # type: ignore[call-arg]
     """API parameters for pooling models.
 
     Attributes:
         truncate_prompt_tokens: Controls prompt truncation.
             Set to -1 to use the model's default truncation size.
             Set to k to keep only the last k tokens (left truncation).
-            Set to None to disable truncation.         
-        normalize: Whether to normalize the embeddings outputs.
+            Set to None to disable truncation.
         dimensions: Reduce the dimensions of embeddings
             if model support matryoshka representation.
-        activation: Whether to apply activation function to
+        normalize: Whether to normalize the embeddings outputs.
+        softmax: softmax will be deprecated, please use use_activation instead.
+        activation: activation will be deprecated, please use use_activation instead.
+        use_activation: Whether to apply activation function to
             the classification outputs.
-        softmax: Whether to apply softmax to the reward outputs.
     """
 
     # --8<-- [start:common-pooling-params]
-    truncate_prompt_tokens: Optional[Annotated[int,
-                                               msgspec.Meta(ge=-1)]] = None
+    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
     # --8<-- [end:common-pooling-params]
 
     ## for embeddings models
     # --8<-- [start:embedding-pooling-params]
-    dimensions: Optional[int] = None
-    normalize: Optional[bool] = None
+    dimensions: int | None = None
+    normalize: bool | None = None
     # --8<-- [end:embedding-pooling-params]
 
     ## for classification, scoring and rerank
     # --8<-- [start:classification-pooling-params]
-    activation: Optional[bool] = None
+    softmax: bool | None = None
+    activation: bool | None = None
+    use_activation: bool | None = None
     # --8<-- [end:classification-pooling-params]
 
-    ## for reward models
-    softmax: Optional[bool] = None
-    step_tag_id: Optional[int] = None
-    returned_token_ids: Optional[list[int]] = None
-
-    task: Optional[PoolingTask] = None
-    """Internal use only."""
+    ## for step pooling models
+    step_tag_id: int | None = None
+    returned_token_ids: list[int] | None = None
 
+    ## Internal use only
+    task: PoolingTask | None = None
     requires_token_ids: bool = False
-    """Internal use only."""
-
-    extra_kwargs: Optional[dict[str, Any]] = None
-    """Internal use only."""
-
+    skip_reading_prefix_cache: bool = None
+    extra_kwargs: dict[str, Any] | None = None
     output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
 
     @property
     def all_parameters(self) -> list[str]:
-        return [
-            "dimensions", "normalize", "activation", "softmax", "step_tag_id",
-            "returned_token_ids"
-        ]
+        return ["dimensions", "normalize", "use_activation"]
 
     @property
     def valid_parameters(self):
         return {
             "embed": ["dimensions", "normalize"],
-            "classify": ["activation"],
-            "score": ["activation"],
-            "encode": ["softmax", "step_tag_id", "returned_token_ids"],
+            "classify": ["use_activation"],
+            "score": ["use_activation"],
+            "token_embed": ["dimensions", "normalize"],
+            "token_classify": ["use_activation"],
         }
 
     def clone(self) -> "PoolingParams":
         """Returns a deep copy of the PoolingParams instance."""
         return deepcopy(self)
 
-    def verify(self,
-               task: PoolingTask,
-               model_config: Optional["ModelConfig"] = None) -> None:
-
+    def verify(
+        self, task: PoolingTask, model_config: Optional["ModelConfig"] = None
+    ) -> None:
         if self.task is None:
             self.task = task
         elif self.task != task:
             msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
             raise ValueError(msg)
 
+        # raise deprecated warning for softmax and activation
+        self.use_activation = get_use_activation(self)
+
+        # plugin task uses io_processor.parse_request to verify inputs,
+        # skipping PoolingParams verify
+        if self.task == "plugin":
+            if self.skip_reading_prefix_cache is None:
+                self.skip_reading_prefix_cache = True
+            return
+
         # NOTE: Task validation needs to done against the model instance,
         # which is not available in model config. So, it's not included
         # in this method
-
         self._merge_default_parameters(model_config)
         self._set_default_parameters(model_config)
         self._verify_valid_parameters()
 
-    def _merge_default_parameters(self,
-                                  model_config: Optional["ModelConfig"] = None
-                                  ) -> None:
-
+    def _merge_default_parameters(
+        self, model_config: Optional["ModelConfig"] = None
+    ) -> None:
         if model_config is None:
             return
 
@@ -123,8 +125,43 @@ class PoolingParams(
             if getattr(self, k, None) is None:
                 setattr(self, k, getattr(pooler_config, k))
 
+        if self.skip_reading_prefix_cache is None:
+            # If prefix caching is enabled,
+            # the output of all pooling may less than n_prompt_tokens,
+            # we need to skip reading cache at this request.
+            if self.task in ["token_embed", "token_classify"]:
+                self.skip_reading_prefix_cache = True
+            else:
+                self.skip_reading_prefix_cache = False
+
+        self._verify_step_pooling(pooler_config, valid_parameters)
+
+    def _verify_step_pooling(
+        self, pooler_config: "PoolerConfig", valid_parameters: list[str]
+    ):
+        step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
+        if pooler_config.pooling_type != "STEP":
+            invalid_parameters = []
+            for k in step_pooling_parameters:
+                if getattr(self, k, None) is not None:
+                    invalid_parameters.append(k)
+
+            if invalid_parameters:
+                raise ValueError(
+                    f"Task {self.task} only supports {valid_parameters} "
+                    f"parameters, does not support "
+                    f"{invalid_parameters} parameters"
+                )
+        else:
+            for k in step_pooling_parameters:
+                if getattr(pooler_config, k, None) is None:
+                    continue
+
+                if getattr(self, k, None) is None:
+                    setattr(self, k, getattr(pooler_config, k))
+
     def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
-        if self.task == "embed":
+        if self.task in ["embed", "token_embed"]:
             if self.normalize is None:
                 self.normalize = True
 
@@ -132,8 +169,8 @@ class PoolingParams(
                 if not model_config.is_matryoshka:
                     raise ValueError(
                         f'Model "{model_config.served_model_name}" does not '
-                        f'support matryoshka representation, '
-                        f'changing output dimensions will lead to poor results.'
+                        f"support matryoshka representation, "
+                        f"changing output dimensions will lead to poor results."
                     )
 
                 mds = model_config.matryoshka_dimensions
@@ -141,19 +178,16 @@ class PoolingParams(
                     if self.dimensions not in mds:
                         raise ValueError(
                             f'Model "{model_config.served_model_name}" '
-                            f'only supports {str(mds)} matryoshka dimensions, '
-                            f'use other output dimensions will '
-                            f'lead to poor results.')
+                            f"only supports {str(mds)} matryoshka dimensions, "
+                            f"use other output dimensions will "
+                            f"lead to poor results."
+                        )
                 elif self.dimensions < 1:
                     raise ValueError("Dimensions must be greater than 0")
 
-        elif self.task in ["classify", "score"]:
-            if self.activation is None:
-                self.activation = True
-
-        elif self.task == "encode":
-            if self.softmax is None:
-                self.softmax = True
+        elif self.task in ["classify", "score", "token_classify"]:
+            if self.use_activation is None:
+                self.use_activation = True
         else:
             raise ValueError(f"Unknown pooling task: {self.task}")
 
@@ -172,20 +206,23 @@ class PoolingParams(
             raise ValueError(
                 f"Task {self.task} only supports {valid_parameters} "
                 f"parameters, does not support "
-                f"{invalid_parameters} parameters")
+                f"{invalid_parameters} parameters"
+            )
 
     def __repr__(self) -> str:
-        return (f"PoolingParams("
-                f"task={self.task}, "
-                f"normalize={self.normalize}, "
-                f"dimensions={self.dimensions}, "
-                f"activation={self.activation}, "
-                f"softmax={self.softmax}, "
-                f"step_tag_id={self.step_tag_id}, "
-                f"returned_token_ids={self.returned_token_ids}, "
-                f"requires_token_ids={self.requires_token_ids}, "
-                f"extra_kwargs={self.extra_kwargs})")
+        return (
+            f"PoolingParams("
+            f"task={self.task}, "
+            f"normalize={self.normalize}, "
+            f"dimensions={self.dimensions}, "
+            f"use_activation={self.use_activation}, "
+            f"step_tag_id={self.step_tag_id}, "
+            f"returned_token_ids={self.returned_token_ids}, "
+            f"requires_token_ids={self.requires_token_ids}, "
+            f"extra_kwargs={self.extra_kwargs})"
+        )
 
     def __post_init__(self) -> None:
-        assert self.output_kind == RequestOutputKind.FINAL_ONLY,\
+        assert self.output_kind == RequestOutputKind.FINAL_ONLY, (
             "For pooling output_kind has to be FINAL_ONLY"
+        )
diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..58c668953161509c74c5651d1012416c529f4572
--- /dev/null
+++ b/vllm/profiler/gpu_profiler.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class CudaProfilerWrapper:
+    def __init__(self) -> None:
+        self._profiler_running = False
+        # Note: lazy import to avoid dependency issues if CUDA is not available.
+        import torch.cuda.profiler as cuda_profiler
+
+        self._cuda_profiler = cuda_profiler
+
+    def start(self) -> None:
+        try:
+            self._cuda_profiler.start()
+            self._profiler_running = True
+            logger.info_once("Started CUDA profiler")
+        except Exception as e:
+            logger.warning_once("Failed to start CUDA profiler: %s", e)
+
+    def stop(self) -> None:
+        if self._profiler_running:
+            try:
+                self._cuda_profiler.stop()
+                logger.info_once("Stopped CUDA profiler")
+            except Exception as e:
+                logger.warning_once("Failed to stop CUDA profiler: %s", e)
+            finally:
+                self._profiler_running = False
+
+    def shutdown(self) -> None:
+        """Ensure profiler is stopped when shutting down."""
+        self.stop()
diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index 41136f738c286174d0608393a3817b426b0b0da1..829b63d8a79d0d0f4e62ba6b8749e0a8b8c519ed 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -3,30 +3,41 @@
 
 import copy
 from collections import defaultdict
+from collections.abc import Callable
 from dataclasses import asdict, dataclass, field
-from typing import Any, Callable, Optional, TypeAlias, Union
+from typing import Any, Optional, TypeAlias
 
-import pandas as pd
 from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
 from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent
 from torch.autograd.profiler import FunctionEvent
 from torch.profiler import ProfilerActivity, profile
 
-from vllm.profiler.utils import (TablePrinter, event_has_module,
-                                 event_is_torch_op, event_module_repr,
-                                 event_torch_op_stack_trace, indent_string)
+from vllm.profiler.utils import (
+    TablePrinter,
+    event_has_module,
+    event_is_torch_op,
+    event_module_repr,
+    event_torch_op_stack_trace,
+    indent_string,
+)
+from vllm.utils.import_utils import PlaceholderModule
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
 
 
 @dataclass
 class _ModuleTreeNode:
     event: _ProfilerEvent
-    parent: Optional['_ModuleTreeNode'] = None
-    children: list['_ModuleTreeNode'] = field(default_factory=list)
+    parent: Optional["_ModuleTreeNode"] = None
+    children: list["_ModuleTreeNode"] = field(default_factory=list)
     trace: str = ""
 
     @property
     def is_leaf(self):
-        return (self.event.children is None or len(self.event.children) == 0)
+        return self.event.children is None or len(self.event.children) == 0
 
     @property
     def is_torch_op(self):
@@ -34,8 +45,10 @@ class _ModuleTreeNode:
 
     @property
     def is_cuda(self):
-        return (self.event.tag == _EventType.Kineto
-                and self.event.typed[1].device_type == DeviceType.CUDA)
+        return (
+            self.event.tag == _EventType.Kineto
+            and self.event.typed[1].device_type == DeviceType.CUDA
+        )
 
 
 @dataclass
@@ -55,28 +68,27 @@ class ModelStatsEntry:
     trace: str
 
 
-StatsEntry: TypeAlias = Union[ModelStatsEntry, SummaryStatsEntry]
+StatsEntry: TypeAlias = ModelStatsEntry | SummaryStatsEntry
 
 
 @dataclass
 class _StatsTreeNode:
     entry: StatsEntry
     children: list[StatsEntry]
-    parent: Optional[StatsEntry]
+    parent: StatsEntry | None
 
 
 @dataclass
 class LayerwiseProfileResults(profile):
     _kineto_results: _ProfilerResult
-    _kineto_event_correlation_map: dict[int,
-                                        list[_KinetoEvent]] = field(init=False)
+    _kineto_event_correlation_map: dict[int, list[_KinetoEvent]] = field(init=False)
     _event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False)
     _module_tree: list[_ModuleTreeNode] = field(init=False)
     _model_stats_tree: list[_StatsTreeNode] = field(init=False)
     _summary_stats_tree: list[_StatsTreeNode] = field(init=False)
 
     # profile metadata
-    num_running_seqs: Optional[int] = None
+    num_running_seqs: int | None = None
 
     def __post_init__(self):
         self._build_correlation_map()
@@ -84,11 +96,9 @@ class LayerwiseProfileResults(profile):
         self._build_stats_trees()
 
     def print_model_table(self, column_widths: dict[str, int] = None):
-        _column_widths = dict(name=60,
-                              cpu_time_us=12,
-                              cuda_time_us=12,
-                              pct_cuda_time=12,
-                              trace=60)
+        _column_widths = dict(
+            name=60, cpu_time_us=12, cuda_time_us=12, pct_cuda_time=12, trace=60
+        )
         if column_widths:
             _column_widths.update(**column_widths)
         filtered_model_table = [
@@ -99,78 +109,76 @@ class LayerwiseProfileResults(profile):
         TablePrinter(ModelStatsEntry, _column_widths).print_table(
             self._indent_row_names_based_on_depth(
                 filtered_model_table,
-                indent_style=lambda indent: "|" + "-" * indent + " "))
+                indent_style=lambda indent: "|" + "-" * indent + " ",
+            )
+        )
 
     def print_summary_table(self, column_widths: dict[str, int] = None):
-        _column_widths = dict(name=80,
-                              cuda_time_us=12,
-                              pct_cuda_time=12,
-                              invocations=15)
+        _column_widths = dict(
+            name=80, cuda_time_us=12, pct_cuda_time=12, invocations=15
+        )
         if column_widths:
             _column_widths.update(**column_widths)
-        filtered_summary_table = [(depth, row)
-                                  for depth, row in self._flatten_stats_tree(
-                                      self._summary_stats_tree)
-                                  if row.cuda_time_us > 0]
+        filtered_summary_table = [
+            (depth, row)
+            for depth, row in self._flatten_stats_tree(self._summary_stats_tree)
+            if row.cuda_time_us > 0
+        ]
         TablePrinter(SummaryStatsEntry, _column_widths).print_table(
             self._indent_row_names_based_on_depth(
                 filtered_summary_table,
-                indent_style=lambda indent: "|" + "-" * indent + " "))
+                indent_style=lambda indent: "|" + "-" * indent + " ",
+            )
+        )
 
     def export_model_stats_table_csv(self, filename: str):
-        df = pd.DataFrame([
-            asdict(row)
-            for _, row in self._flatten_stats_tree(self._model_stats_tree)
-        ])
+        df = pd.DataFrame(
+            [asdict(row) for _, row in self._flatten_stats_tree(self._model_stats_tree)]
+        )
         df.to_csv(filename)
 
     def export_summary_stats_table_csv(self, filename: str):
-        df = pd.DataFrame([
-            asdict(row)
-            for _, row in self._flatten_stats_tree(self._summary_stats_tree)
-        ])
+        df = pd.DataFrame(
+            [
+                asdict(row)
+                for _, row in self._flatten_stats_tree(self._summary_stats_tree)
+            ]
+        )
         df.to_csv(filename)
 
     def convert_stats_to_dict(self) -> dict[str, Any]:
         return {
-            "metadata": {
-                "num_running_seqs": self.num_running_seqs
-            },
-            "summary_stats":
-            self._convert_stats_tree_to_dict(self._summary_stats_tree),
-            "model_stats":
-            self._convert_stats_tree_to_dict(self._model_stats_tree)
+            "metadata": {"num_running_seqs": self.num_running_seqs},
+            "summary_stats": self._convert_stats_tree_to_dict(self._summary_stats_tree),
+            "model_stats": self._convert_stats_tree_to_dict(self._model_stats_tree),
         }
 
     @staticmethod
-    def _indent_row_names_based_on_depth(depths_rows: list[tuple[int,
-                                                                 StatsEntry]],
-                                         indent_style: Union[Callable[[int],
-                                                                      str],
-                                                             str] = " "):
+    def _indent_row_names_based_on_depth(
+        depths_rows: list[tuple[int, StatsEntry]],
+        indent_style: Callable[[int], str] | str = " ",
+    ):
         indented_rows = []
         for depth, row in depths_rows:
             if row.cuda_time_us == 0:
                 continue
             indented_row = copy.deepcopy(row)
-            indented_row.name = indent_string(indented_row.name, depth,
-                                              indent_style)
+            indented_row.name = indent_string(indented_row.name, depth, indent_style)
             indented_rows.append(indented_row)
         return indented_rows
 
     def _build_correlation_map(self):
         self._kineto_event_correlation_map = defaultdict(list)
         for event in self._kineto_results.events():
-            self._kineto_event_correlation_map[event.correlation_id()].append(
-                event)
+            self._kineto_event_correlation_map[event.correlation_id()].append(event)
 
     def _build_module_tree(self):
         self._module_tree = []
         event_tree = self._kineto_results.experimental_event_tree()
 
-        def _df_traversal(event: _ProfilerEvent,
-                          curr_node: Optional[_ModuleTreeNode] = None):
-
+        def _df_traversal(
+            event: _ProfilerEvent, curr_node: _ModuleTreeNode | None = None
+        ):
             # For the tensor parallel case for now only look at task 1
             if event.start_tid != 1:
                 return
@@ -183,13 +191,15 @@ class LayerwiseProfileResults(profile):
                     self._module_tree.append(node)
                 curr_node = node
 
-            is_leaf = (event.children is None or len(event.children) == 0)
+            is_leaf = event.children is None or len(event.children) == 0
             if is_leaf and curr_node:
                 node = _ModuleTreeNode(
                     event=event,
                     parent=curr_node,
                     trace=event_torch_op_stack_trace(
-                        event, until=lambda x: event_has_module(x)))
+                        event, until=lambda x: event_has_module(x)
+                    ),
+                )
                 curr_node.children.append(node)
                 curr_node = node
 
@@ -203,31 +213,31 @@ class LayerwiseProfileResults(profile):
         if node.event.tag != _EventType.Kineto:
             return None
         correlated_kineto_events = self._kineto_event_correlation_map.get(
-            node.event.correlation_id, [])
-        iterator = (x for x in correlated_kineto_events
-                    if x.device_type() == DeviceType.CUDA
-                    and x.name() == node.event.name)
+            node.event.correlation_id, []
+        )
+        iterator = (
+            x
+            for x in correlated_kineto_events
+            if x.device_type() == DeviceType.CUDA and x.name() == node.event.name
+        )
         return next(iterator, None)
 
     def _cumulative_cuda_time(self, node: _ModuleTreeNode):
-        'Return cuda time in microseconds'
+        "Return cuda time in microseconds"
 
         def _cumulative_cuda_time_recursive(node: _ModuleTreeNode):
-            if node.is_leaf and (gpu_kineto_event :=
-                                 self._get_kineto_gpu_event(node)):
+            if node.is_leaf and (gpu_kineto_event := self._get_kineto_gpu_event(node)):
                 return gpu_kineto_event.duration_ns() / 1000.0
             else:
                 cumulative_cuda_time = 0
                 for child in node.children:
-                    cumulative_cuda_time += _cumulative_cuda_time_recursive(
-                        child)
+                    cumulative_cuda_time += _cumulative_cuda_time_recursive(child)
                 return cumulative_cuda_time
 
         return _cumulative_cuda_time_recursive(node)
 
     def _total_cuda_time(self):
-        return sum(
-            [self._cumulative_cuda_time(root) for root in self._module_tree])
+        return sum([self._cumulative_cuda_time(root) for root in self._module_tree])
 
     def _build_stats_trees(self):
         summary_dict: dict[str, _StatsTreeNode] = {}
@@ -238,39 +248,43 @@ class LayerwiseProfileResults(profile):
 
         def build_summary_stats_tree_df(
             node: _ModuleTreeNode,
-            parent: Optional[_StatsTreeNode] = None,
-            summary_trace: tuple[str] = ()):
-
+            parent: _StatsTreeNode | None = None,
+            summary_trace: tuple[str] = (),
+        ):
             if event_has_module(node.event):
                 name = event_module_repr(node.event)
                 cuda_time_us = self._cumulative_cuda_time(node)
-            elif (gpu_kineto_event := self._get_kineto_gpu_event(node)):
+            elif gpu_kineto_event := self._get_kineto_gpu_event(node):
                 name = gpu_kineto_event.name()
                 cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
             else:
                 return None
 
-            summary_trace = summary_trace + (name, )
+            summary_trace = summary_trace + (name,)
             if summary_trace in summary_dict:
                 entry = summary_dict[summary_trace].entry
                 entry.cuda_time_us += cuda_time_us
                 entry.invocations += 1
                 entry.pct_cuda_time = pct_cuda_time(entry.cuda_time_us)
             else:
-                new_node = _StatsTreeNode(entry=SummaryStatsEntry(
-                    name=name,
-                    cuda_time_us=cuda_time_us,
-                    pct_cuda_time=pct_cuda_time(cuda_time_us),
-                    invocations=1),
-                                          children=[],
-                                          parent=parent)
+                new_node = _StatsTreeNode(
+                    entry=SummaryStatsEntry(
+                        name=name,
+                        cuda_time_us=cuda_time_us,
+                        pct_cuda_time=pct_cuda_time(cuda_time_us),
+                        invocations=1,
+                    ),
+                    children=[],
+                    parent=parent,
+                )
                 if parent:
                     parent.children.append(new_node)
                 summary_dict[summary_trace] = new_node
 
             for child in node.children:
-                build_summary_stats_tree_df(child, summary_dict[summary_trace],
-                                            summary_trace)
+                build_summary_stats_tree_df(
+                    child, summary_dict[summary_trace], summary_trace
+                )
 
             return summary_dict[summary_trace]
 
@@ -278,14 +292,17 @@ class LayerwiseProfileResults(profile):
         for root in self._module_tree:
             self._summary_stats_tree.append(build_summary_stats_tree_df(root))
 
-        def build_model_stats_tree_df(node: _ModuleTreeNode,
-                                      parent: Optional[_StatsTreeNode] = None):
-            if event_has_module(node.event, ):
+        def build_model_stats_tree_df(
+            node: _ModuleTreeNode, parent: _StatsTreeNode | None = None
+        ):
+            if event_has_module(
+                node.event,
+            ):
                 name = event_module_repr(node.event)
                 cuda_time_us = self._cumulative_cuda_time(node)
                 cpu_time_us = node.event.duration_time_ns / 1000
                 trace = ""
-            elif (gpu_kineto_event := self._get_kineto_gpu_event(node)):
+            elif gpu_kineto_event := self._get_kineto_gpu_event(node):
                 name = gpu_kineto_event.name()
                 cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
                 cpu_time_us = 0
@@ -293,14 +310,17 @@ class LayerwiseProfileResults(profile):
             else:
                 return None
 
-            new_node = _StatsTreeNode(entry=ModelStatsEntry(
-                name=name,
-                cpu_time_us=cpu_time_us,
-                cuda_time_us=cuda_time_us,
-                pct_cuda_time=pct_cuda_time(cuda_time_us),
-                trace=trace),
-                                      parent=parent,
-                                      children=[])
+            new_node = _StatsTreeNode(
+                entry=ModelStatsEntry(
+                    name=name,
+                    cpu_time_us=cpu_time_us,
+                    cuda_time_us=cuda_time_us,
+                    pct_cuda_time=pct_cuda_time(cuda_time_us),
+                    trace=trace,
+                ),
+                parent=parent,
+                children=[],
+            )
             if parent:
                 parent.children.append(new_node)
 
@@ -314,7 +334,8 @@ class LayerwiseProfileResults(profile):
             self._model_stats_tree.append(build_model_stats_tree_df(root))
 
     def _flatten_stats_tree(
-            self, tree: list[_StatsTreeNode]) -> list[tuple[int, StatsEntry]]:
+        self, tree: list[_StatsTreeNode]
+    ) -> list[tuple[int, StatsEntry]]:
         entries: list[tuple[int, StatsEntry]] = []
 
         def df_traversal(node: _StatsTreeNode, depth=0):
@@ -327,15 +348,11 @@ class LayerwiseProfileResults(profile):
 
         return entries
 
-    def _convert_stats_tree_to_dict(self,
-                                    tree: list[_StatsTreeNode]) -> list[dict]:
+    def _convert_stats_tree_to_dict(self, tree: list[_StatsTreeNode]) -> list[dict]:
         root_dicts: list[dict] = []
 
         def df_traversal(node: _StatsTreeNode, curr_json_list: list[dict]):
-            curr_json_list.append({
-                "entry": asdict(node.entry),
-                "children": []
-            })
+            curr_json_list.append({"entry": asdict(node.entry), "children": []})
             for child in node.children:
                 df_traversal(child, curr_json_list[-1]["children"])
 
@@ -346,8 +363,7 @@ class LayerwiseProfileResults(profile):
 
 
 class layerwise_profile(profile):
-
-    def __init__(self, num_running_seqs: Optional[int] = None):
+    def __init__(self, num_running_seqs: int | None = None):
         """
         layerwise profile constructor.
 
@@ -361,7 +377,8 @@ class layerwise_profile(profile):
             record_shapes=True,
             with_stack=True,
             with_modules=True,
-            experimental_config=_ExperimentalConfig(verbose=True))
+            experimental_config=_ExperimentalConfig(verbose=True),
+        )
 
         self.num_running_seqs = num_running_seqs
 
@@ -371,5 +388,5 @@ class layerwise_profile(profile):
     def __exit__(self, exc_type, exc_val, exc_tb):
         super().__exit__(exc_type, exc_val, exc_tb)
         self.results = LayerwiseProfileResults(
-            self.profiler.kineto_results,
-            num_running_seqs=self.num_running_seqs)
+            self.profiler.kineto_results, num_running_seqs=self.num_running_seqs
+        )
diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py
index 9f0f56a15fd5362255599fa053aeb35a18aa13ca..c95f9f4ac9779613917b9a2d91a1ce27b3e7d4bd 100644
--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from typing import Callable, Union
+from collections.abc import Callable
 
 from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata
 
@@ -30,9 +30,9 @@ def trim_string_back(string, width):
 
 
 class TablePrinter:
-
-    def __init__(self, row_cls: type[dataclasses.dataclass],
-                 column_widths: dict[str, int]):
+    def __init__(
+        self, row_cls: type[dataclasses.dataclass], column_widths: dict[str, int]
+    ):
         self.row_cls = row_cls
         self.fieldnames = [x.name for x in dataclasses.fields(row_cls)]
         self.column_widths = column_widths
@@ -46,16 +46,18 @@ class TablePrinter:
 
     def _print_header(self):
         for i, f in enumerate(self.fieldnames):
-            last = (i == len(self.fieldnames) - 1)
+            last = i == len(self.fieldnames) - 1
             col_width = self.column_widths[f]
-            print(trim_string_back(f, col_width).ljust(col_width),
-                  end=" | " if not last else "\n")
+            print(
+                trim_string_back(f, col_width).ljust(col_width),
+                end=" | " if not last else "\n",
+            )
 
     def _print_row(self, row):
         assert isinstance(row, self.row_cls)
 
         for i, f in enumerate(self.fieldnames):
-            last = (i == len(self.fieldnames) - 1)
+            last = i == len(self.fieldnames) - 1
             col_width = self.column_widths[f]
             val = getattr(row, f)
 
@@ -75,9 +77,9 @@ class TablePrinter:
         print("=" * (total_col_width + 3 * (len(self.column_widths) - 1)))
 
 
-def indent_string(string: str,
-                  indent: int,
-                  indent_style: Union[Callable[[int], str], str] = " ") -> str:
+def indent_string(
+    string: str, indent: int, indent_style: Callable[[int], str] | str = " "
+) -> str:
     if indent:
         if isinstance(indent_style, str):
             return indent_style * indent + string
@@ -111,15 +113,14 @@ def event_arg_repr(arg) -> str:
     elif isinstance(arg, tuple):
         return f"({', '.join([event_arg_repr(x) for x in arg])})"
     else:
-        assert isinstance(arg,
-                          _TensorMetadata), f"Unsupported type: {type(arg)}"
-        sizes_str = ', '.join([str(x) for x in arg.sizes])
+        assert isinstance(arg, _TensorMetadata), f"Unsupported type: {type(arg)}"
+        sizes_str = ", ".join([str(x) for x in arg.sizes])
         return f"{str(arg.dtype).replace('torch.', '')}[{sizes_str}]"
 
 
 def event_torch_op_repr(event: _ProfilerEvent) -> str:
     assert event.tag == _EventType.TorchOp
-    args_str = ', '.join([event_arg_repr(x) for x in event.typed[1].inputs])
+    args_str = ", ".join([event_arg_repr(x) for x in event.typed[1].inputs])
     return f"{event.name}({args_str})".replace("aten::", "")
 
 
@@ -127,15 +128,17 @@ def event_module_repr(event: _ProfilerEvent) -> str:
     assert event_has_module(event)
     module = event.typed[1].module
     if module.parameters and len(module.parameters) > 0:
-        args_str = ', '.join(
-            [f'{x[0]}={event_arg_repr(x[1])}' for x in module.parameters])
+        args_str = ", ".join(
+            [f"{x[0]}={event_arg_repr(x[1])}" for x in module.parameters]
+        )
         return f"{module.cls_name}({args_str})"
     else:
         return module.cls_name
 
 
-def event_torch_op_stack_trace(curr_event: _ProfilerEvent,
-                               until: Callable[[_ProfilerEvent], bool]) -> str:
+def event_torch_op_stack_trace(
+    curr_event: _ProfilerEvent, until: Callable[[_ProfilerEvent], bool]
+) -> str:
     trace = ""
     curr_event = curr_event.parent
     while curr_event and not until(curr_event):
diff --git a/vllm/ray/lazy_utils.py b/vllm/ray/lazy_utils.py
index bb3535579cfdf86e6804556e3dc3295fded0a37d..64b5f51571a35ae49e765edadcc28ff87d4b8774 100644
--- a/vllm/ray/lazy_utils.py
+++ b/vllm/ray/lazy_utils.py
@@ -6,6 +6,7 @@ def is_ray_initialized():
     """Check if Ray is initialized."""
     try:
         import ray
+
         return ray.is_initialized()
     except ImportError:
         return False
@@ -16,7 +17,10 @@ def is_in_ray_actor():
 
     try:
         import ray
-        return (ray.is_initialized()
-                and ray.get_runtime_context().get_actor_id() is not None)
+
+        return (
+            ray.is_initialized()
+            and ray.get_runtime_context().get_actor_id() is not None
+        )
     except ImportError:
         return False
diff --git a/vllm/ray/ray_env.py b/vllm/ray/ray_env.py
index f6a994bb3c2262702c568ca474a2ac2ca972c996..85623cfe5ff57cfd037f861aa8b868ab3e88c41c 100644
--- a/vllm/ray/ray_env.py
+++ b/vllm/ray/ray_env.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
-from typing import Optional
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -14,7 +13,8 @@ CONFIG_HOME = envs.VLLM_CONFIG_ROOT
 # This file contains a list of env vars that should not be copied
 # from the driver to the Ray workers.
 RAY_NON_CARRY_OVER_ENV_VARS_FILE = os.path.join(
-    CONFIG_HOME, "ray_non_carry_over_env_vars.json")
+    CONFIG_HOME, "ray_non_carry_over_env_vars.json"
+)
 
 try:
     if os.path.exists(RAY_NON_CARRY_OVER_ENV_VARS_FILE):
@@ -25,13 +25,16 @@ try:
 except json.JSONDecodeError:
     logger.warning(
         "Failed to parse %s. Using an empty set for non-carry-over env vars.",
-        RAY_NON_CARRY_OVER_ENV_VARS_FILE)
+        RAY_NON_CARRY_OVER_ENV_VARS_FILE,
+    )
     RAY_NON_CARRY_OVER_ENV_VARS = set()
 
 
-def get_env_vars_to_copy(exclude_vars: Optional[set[str]] = None,
-                         additional_vars: Optional[set[str]] = None,
-                         destination: Optional[str] = None) -> set[str]:
+def get_env_vars_to_copy(
+    exclude_vars: set[str] | None = None,
+    additional_vars: set[str] | None = None,
+    destination: str | None = None,
+) -> set[str]:
     """
     Get the environment variables to copy to downstream Ray actors.
 
@@ -60,13 +63,17 @@ def get_env_vars_to_copy(exclude_vars: Optional[set[str]] = None,
 
     to_destination = " to " + destination if destination is not None else ""
 
-    logger.info("RAY_NON_CARRY_OVER_ENV_VARS from config: %s",
-                RAY_NON_CARRY_OVER_ENV_VARS)
-    logger.info("Copying the following environment variables%s: %s",
-                to_destination,
-                [v for v in env_vars_to_copy if v in os.environ])
     logger.info(
-        "If certain env vars should NOT be copied, add them to "
-        "%s file", RAY_NON_CARRY_OVER_ENV_VARS_FILE)
+        "RAY_NON_CARRY_OVER_ENV_VARS from config: %s", RAY_NON_CARRY_OVER_ENV_VARS
+    )
+    logger.info(
+        "Copying the following environment variables%s: %s",
+        to_destination,
+        [v for v in env_vars_to_copy if v in os.environ],
+    )
+    logger.info(
+        "If certain env vars should NOT be copied, add them to %s file",
+        RAY_NON_CARRY_OVER_ENV_VARS_FILE,
+    )
 
     return env_vars_to_copy
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 3c8a9c6ae0d338af7f377f6e95f8b71ec6ea803a..36e58dba6b497d08853b3904ad160c4d15b2c8b1 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -1,29 +1,92 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
-from .basic_parsers import BaseThinkingReasoningParser
-from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
-from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
-from .gptoss_reasoning_parser import GptOssReasoningParser
-from .granite_reasoning_parser import GraniteReasoningParser
-from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
-from .mistral_reasoning_parser import MistralReasoningParser
-from .qwen3_reasoning_parser import Qwen3ReasoningParser
-from .seedoss_reasoning_parser import SeedOSSReasoningParser
-from .step3_reasoning_parser import Step3ReasoningParser
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 
 __all__ = [
     "ReasoningParser",
-    "BaseThinkingReasoningParser",
     "ReasoningParserManager",
-    "DeepSeekR1ReasoningParser",
-    "GraniteReasoningParser",
-    "HunyuanA13BReasoningParser",
-    "Qwen3ReasoningParser",
-    "Glm4MoeModelReasoningParser",
-    "MistralReasoningParser",
-    "Step3ReasoningParser",
-    "GptOssReasoningParser",
-    "SeedOSSReasoningParser",
 ]
+"""
+Register a lazy module mapping.
+
+Example:
+    ReasoningParserManager.register_lazy_module(
+        name="qwen3",
+        module_path="vllm.reasoning.qwen3_reasoning_parser",
+        class_name="Qwen3ReasoningParser",
+    )
+"""
+
+
+_REASONING_PARSERS_TO_REGISTER = {
+    "deepseek_r1": (  # name
+        "deepseek_r1_reasoning_parser",  # filename
+        "DeepSeekR1ReasoningParser",  # class_name
+    ),
+    "deepseek_v3": (
+        "deepseek_v3_reasoning_parser",
+        "DeepSeekV3ReasoningParser",
+    ),
+    "ernie45": (
+        "ernie45_reasoning_parser",
+        "Ernie45ReasoningParser",
+    ),
+    "glm45": (
+        "glm4_moe_reasoning_parser",
+        "Glm4MoeModelReasoningParser",
+    ),
+    "openai_gptoss": (
+        "gptoss_reasoning_parser",
+        "GptOssReasoningParser",
+    ),
+    "granite": (
+        "granite_reasoning_parser",
+        "GraniteReasoningParser",
+    ),
+    "hunyuan_a13b": (
+        "hunyuan_a13b_reasoning_parser",
+        "HunyuanA13BReasoningParser",
+    ),
+    "kimi_k2": (
+        "deepseek_r1_reasoning_parser",
+        "DeepSeekR1ReasoningParser",
+    ),
+    "minimax_m2": (
+        "minimax_m2_reasoning_parser",
+        "MiniMaxM2ReasoningParser",
+    ),
+    "minimax_m2_append_think": (
+        "minimax_m2_reasoning_parser",
+        "MiniMaxM2AppendThinkReasoningParser",
+    ),
+    "mistral": (
+        "mistral_reasoning_parser",
+        "MistralReasoningParser",
+    ),
+    "olmo3": (
+        "olmo3_reasoning_parser",
+        "Olmo3ReasoningParser",
+    ),
+    "qwen3": (
+        "qwen3_reasoning_parser",
+        "Qwen3ReasoningParser",
+    ),
+    "seed_oss": (
+        "seedoss_reasoning_parser",
+        "SeedOSSReasoningParser",
+    ),
+    "step3": (
+        "step3_reasoning_parser",
+        "Step3ReasoningParser",
+    ),
+}
+
+
+def register_lazy_reasoning_parsers():
+    for name, (file_name, class_name) in _REASONING_PARSERS_TO_REGISTER.items():
+        module_path = f"vllm.reasoning.{file_name}"
+        ReasoningParserManager.register_lazy_module(name, module_path, class_name)
+
+
+register_lazy_reasoning_parsers()
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 320009c2611e5e4604692bc4cc9ba2815d900e98..d26e4ffc9c16329d9a6927643dd6b44fba582723 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,21 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
+import importlib
 import os
 from abc import abstractmethod
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Callable, Union
+from typing import TYPE_CHECKING, Any
 
+from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
-from vllm.utils import import_from_path, is_list_of
+from vllm.utils.collection_utils import is_list_of
+from vllm.utils.import_utils import import_from_path
 
 if TYPE_CHECKING:
-    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                                  DeltaMessage,
-                                                  ResponsesRequest)
+    from vllm.entrypoints.openai.protocol import (
+        ChatCompletionRequest,
+        DeltaMessage,
+        ResponsesRequest,
+    )
     from vllm.transformers_utils.tokenizer import AnyTokenizer
 else:
     ChatCompletionRequest = Any
@@ -73,10 +76,10 @@ class ReasoningParser:
         """
 
     @abstractmethod
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
-        request: Union[ChatCompletionRequest, ResponsesRequest],
+        request: ChatCompletionRequest | ResponsesRequest,
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from a complete model-generated string.
@@ -97,7 +100,7 @@ class ReasoningParser:
         """
 
     @abstractmethod
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -105,7 +108,7 @@ class ReasoningParser:
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Instance method that should be implemented for extracting reasoning
         from an incomplete response; for use when handling reasoning calls and
@@ -114,51 +117,130 @@ class ReasoningParser:
         previously been parsed and extracted (see constructor)
         """
 
+    def prepare_structured_tag(
+        self,
+        original_tag: str | None,
+        tool_server: ToolServer | None,
+    ) -> str:
+        """
+        Instance method that is implemented for preparing the structured tag
+        Otherwise, None is returned
+        """
+        return None
+
 
 class ReasoningParserManager:
-    reasoning_parsers: dict[str, type] = {}
+    """
+    Central registry for ReasoningParser implementations.
+
+    Supports two registration modes:
+      - Eager registration via `register_module`
+      - Lazy registration via `register_lazy_module`
+
+    Each reasoning parser must inherit from `ReasoningParser`.
+    """
+
+    reasoning_parsers: dict[str, type[ReasoningParser]] = {}
+    lazy_parsers: dict[str, tuple[str, str]] = {}  # name -> (module_path, class_name)
 
     @classmethod
-    def get_reasoning_parser(cls, name: str | None) -> type[ReasoningParser]:
+    def get_reasoning_parser(cls, name: str) -> type[ReasoningParser]:
         """
-        Get reasoning parser by name which is registered by `register_module`.
+        Retrieve a registered or lazily registered ReasoningParser class.
 
-        Raise a KeyError exception if the name is not registered.
+        If the parser is lazily registered, it will be imported and cached
+        on first access.
+
+        Raises:
+            KeyError: if no parser is found under the given name.
         """
         if name in cls.reasoning_parsers:
             return cls.reasoning_parsers[name]
 
-        raise KeyError(
-            f"reasoning helper: '{name}' not found in reasoning_parsers")
+        if name in cls.lazy_parsers:
+            return cls._load_lazy_parser(name)
+
+        raise KeyError(f"Reasoning parser '{name}' not found.")
+
+    @classmethod
+    def list_registered(cls) -> list[str]:
+        """Return names of all eagerly and lazily registered reasoning parsers."""
+        return sorted(set(cls.reasoning_parsers.keys()) | set(cls.lazy_parsers.keys()))
+
+    @classmethod
+    def _load_lazy_parser(cls, name: str) -> type[ReasoningParser]:
+        """Import and register a lazily loaded reasoning parser."""
+        module_path, class_name = cls.lazy_parsers[name]
+        try:
+            mod = importlib.import_module(module_path)
+            parser_cls = getattr(mod, class_name)
+            if not issubclass(parser_cls, ReasoningParser):
+                raise TypeError(
+                    f"{class_name} in {module_path} is not a ReasoningParser subclass."
+                )
+
+            cls.reasoning_parsers[name] = parser_cls  # cache
+            return parser_cls
+        except Exception as e:
+            logger.exception(
+                "Failed to import lazy reasoning parser '%s' from %s: %s",
+                name,
+                module_path,
+                e,
+            )
+            raise
 
     @classmethod
     def _register_module(
         cls,
-        module: type,
-        module_name: Union[str, list[str]] | None = None,
+        module: type[ReasoningParser],
+        module_name: str | list[str] | None = None,
         force: bool = True,
     ) -> None:
+        """Register a ReasoningParser class immediately."""
         if not issubclass(module, ReasoningParser):
-            raise TypeError("module must be subclass of ReasoningParser, "
-                            f"but got {type(module)}")
+            raise TypeError(
+                f"module must be subclass of ReasoningParser, but got {type(module)}"
+            )
+
         if module_name is None:
-            module_name = module.__name__
-        if isinstance(module_name, str):
-            module_name = [module_name]
-        for name in module_name:
+            module_names = [module.__name__]
+        elif isinstance(module_name, str):
+            module_names = [module_name]
+        elif is_list_of(module_name, str):
+            module_names = module_name
+        else:
+            raise TypeError("module_name must be str, list[str], or None.")
+
+        for name in module_names:
             if not force and name in cls.reasoning_parsers:
-                existed_module = cls.reasoning_parsers[name]
-                raise KeyError(f"{name} is already registered "
-                               f"at {existed_module.__module__}")
+                existed = cls.reasoning_parsers[name]
+                raise KeyError(f"{name} is already registered at {existed.__module__}")
             cls.reasoning_parsers[name] = module
 
+    @classmethod
+    def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None:
+        """
+        Register a lazy module mapping for delayed import.
+
+        Example:
+            ReasoningParserManager.register_lazy_module(
+                name="qwen3",
+                module_path="vllm.reasoning.parsers.qwen3_reasoning_parser",
+                class_name="Qwen3ReasoningParser",
+            )
+        """
+        cls.lazy_parsers[name] = (module_path, class_name)
+
     @classmethod
     def register_module(
         cls,
-        name: Union[str, list[str]] | None = None,
+        name: str | list[str] | None = None,
         force: bool = True,
-        module: Union[type, None] = None,
-    ) -> Union[type, Callable]:
+        module: type[ReasoningParser] | None = None,
+    ) -> (
+        type[ReasoningParser] | Callable[[type[ReasoningParser]], type[ReasoningParser]]
+    ):
         """
         Register module with the given name or name list. it can be used as a
         decoder(with module as None) or normal function(with module as not
@@ -167,24 +249,29 @@ class ReasoningParserManager:
         if not isinstance(force, bool):
             raise TypeError(f"force must be a boolean, but got {type(force)}")
 
-        # raise the error ahead of time
-        if not (name is None or isinstance(name, str)
-                or is_list_of(name, str)):
-            raise TypeError(
-                "name must be None, an instance of str, or a sequence of str, "
-                f"but got {type(name)}")
-
-        # use it as a normal method: x.register_module(module=SomeClass)
+        # Immediate registration (explicit call)
         if module is not None:
             cls._register_module(module=module, module_name=name, force=force)
             return module
 
-        # use it as a decorator: @x.register_module()
-        def _register(module):
-            cls._register_module(module=module, module_name=name, force=force)
-            return module
+        # Decorator usage
+        def _decorator(obj: type[ReasoningParser]) -> type[ReasoningParser]:
+            module_path = obj.__module__
+            class_name = obj.__name__
+
+            if isinstance(name, str):
+                names = [name]
+            elif is_list_of(name, str):
+                names = name
+            else:
+                names = [class_name]
+
+            for n in names:
+                cls.lazy_parsers[n] = (module_path, class_name)
+
+            return obj
 
-        return _register
+        return _decorator
 
     @classmethod
     def import_reasoning_parser(cls, plugin_path: str) -> None:
@@ -197,6 +284,7 @@ class ReasoningParserManager:
         try:
             import_from_path(module_name, plugin_path)
         except Exception:
-            logger.exception("Failed to load module '%s' from %s.",
-                             module_name, plugin_path)
+            logger.exception(
+                "Failed to load module '%s' from %s.", module_name, plugin_path
+            )
             return
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index cea4b8601ae70b32b2f8555a9fd5bcff40f26545..02689477327260044ae3803bfab9c1c34e085a40 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -3,22 +3,30 @@
 
 from abc import abstractmethod
 from collections.abc import Sequence
-from typing import Optional, Union
+from typing import TYPE_CHECKING, Any
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage, ResponsesRequest)
+from vllm.entrypoints.openai.protocol import DeltaMessage
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.protocol import (
+        ChatCompletionRequest,
+        ResponsesRequest,
+    )
+else:
+    ChatCompletionRequest = Any
+    ResponsesRequest = Any
+
 
 class BaseThinkingReasoningParser(ReasoningParser):
     """
     Base class for reasoning parsers that use thinking tokens.
-    
+
     This class provides common functionality for parsers that use start and end
     tokens to delimit reasoning content (
         e.g., <think>...</think>, <seed:think>...</seed:think>).
-    
+
     Subclasses must implement the start and end tokens via abstract
     properties.
     """
@@ -41,21 +49,23 @@ class BaseThinkingReasoningParser(ReasoningParser):
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ReasoningParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
         if not self.start_token or not self.end_token:
-            raise ValueError(
-                "start_token and end_token must be defined in subclasses")
+            raise ValueError("start_token and end_token must be defined in subclasses")
 
         self.start_token_id = self.vocab.get(self.start_token)
         self.end_token_id = self.vocab.get(self.end_token)
         if self.start_token_id is None or self.end_token_id is None:
             raise RuntimeError(
                 f"{self.__class__.__name__} reasoning parser could not locate "
-                "think start/end tokens in the tokenizer!")
+                "think start/end tokens in the tokenizer!"
+            )
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        return self.end_token_id in input_ids
+        end_token_id = self.end_token_id
+        return any(input_id == end_token_id for input_id in reversed(input_ids))
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
@@ -64,9 +74,9 @@ class BaseThinkingReasoningParser(ReasoningParser):
         if self.end_token_id not in input_ids[:-1]:
             return []
         else:
-            return input_ids[input_ids.index(self.end_token_id) + 1:]
+            return input_ids[input_ids.index(self.end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -74,16 +84,16 @@ class BaseThinkingReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract reasoning content from a delta message.
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         """
         # Skip single special tokens
-        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
-                self.start_token_id, self.end_token_id
-        ]):
+        if len(delta_token_ids) == 1 and (
+            delta_token_ids[0] in [self.start_token_id, self.end_token_id]
+        ):
             return None
 
         # Check if start token is present in previous or delta.
@@ -93,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser):
                 # start token in previous, end token in delta,
                 # extract reasoning content
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.end_token):]
+                reasoning = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                 )
             elif self.end_token_id in previous_token_ids:
                 # start token in previous, end token in previous,
@@ -106,51 +115,48 @@ class BaseThinkingReasoningParser(ReasoningParser):
             else:
                 # start token in previous, no end token in previous or delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         elif self.start_token_id in delta_token_ids:
             if self.end_token_id in delta_token_ids:
                 # start token in delta, end token in delta,
                 # extract reasoning content
                 start_index = delta_text.find(self.start_token)
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[start_index +
-                                               len(self.start_token):end_index]
-                content = delta_text[end_index + len(self.end_token):]
+                reasoning = delta_text[start_index + len(self.start_token) : end_index]
+                content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                 )
             else:
                 # start token in delta, no end token in delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         else:
             # not find thinking start token
             return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
-        self, model_output: str, request: Union[ChatCompletionRequest,
-                                                ResponsesRequest]
-    ) -> tuple[Optional[str], Optional[str]]:
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
-        
+
         This is the base implementation that works for most models.
         Subclasses can override this method for specific behavior.
         """
         # Check if the start token is present in the model output, remove it
         # if it is present.
         model_output_parts = model_output.partition(self.start_token)
-        model_output = model_output_parts[2] if model_output_parts[
-            1] else model_output_parts[0]
+        model_output = (
+            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
+        )
 
         # For models that may not generate start token,
         # assume the reasoning content is always at the start.
         if self.end_token not in model_output:
             return model_output, None
         else:
-            reasoning_content, _, content = model_output.partition(
-                self.end_token)
+            reasoning, _, content = model_output.partition(self.end_token)
             # If generation stops right after end-of-think, return null content
             final_content = content or None
-            return reasoning_content, final_content
+            return reasoning, final_content
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index 76d2959e1c9a412a6e3c886cfffdae5ac0811ee5..a91c8ceeb6255e8548ef584e36dd8afbd166b247 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -2,14 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Union
 
 from vllm.entrypoints.openai.protocol import DeltaMessage
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
-@ReasoningParserManager.register_module("deepseek_r1")
 class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for DeepSeek R1 model.
@@ -28,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -36,8 +33,8 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
-        ret = super().extract_reasoning_content_streaming(
+    ) -> DeltaMessage | None:
+        ret = super().extract_reasoning_streaming(
             previous_text,
             current_text,
             delta_text,
@@ -45,16 +42,19 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
             current_token_ids,
             delta_token_ids,
         )
-        if (ret is not None and self.start_token_id not in previous_token_ids
-                and self.start_token_id not in delta_token_ids):
+        if (
+            ret is not None
+            and self.start_token_id not in previous_token_ids
+            and self.start_token_id not in delta_token_ids
+        ):
             if self.end_token_id in delta_token_ids:
                 # end token in delta with more tokens,
                 # extract reasoning content and content
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.end_token):]
+                reasoning = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content=content if content else None,
                 )
             elif self.end_token_id in previous_token_ids:
@@ -62,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                 return DeltaMessage(content=delta_text)
             else:
                 # no end token in previous or delta, reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
 
         return ret
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..afdf73262aca09ec8282ae386375ffb1ab01027b
--- /dev/null
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+
+from .identity_reasoning_parser import IdentityReasoningParser
+
+logger = init_logger(__name__)
+
+
+class DeepSeekV3ReasoningParser(ReasoningParser):
+    """
+    V3 parser that delegates to either DeepSeekR1ReasoningParser or
+    IdentityReasoningParser based on `thinking` and `separate_reasoning`.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
+        thinking = bool(chat_kwargs.pop("thinking", False))
+
+        if thinking:
+            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
+        else:
+            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        return self._parser.is_reasoning_end(input_ids)
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        return self._parser.extract_content_ids(input_ids)
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        return self._parser.extract_reasoning(model_output, request)
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        return self._parser.extract_reasoning_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+        )
diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cdbf14858ec22e5bbc741e1802133d6fe410521
--- /dev/null
+++ b/vllm/reasoning/ernie45_reasoning_parser.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+
+logger = init_logger(__name__)
+
+
+class Ernie45ReasoningParser(BaseThinkingReasoningParser):
+    """
+    Reasoning parser for Ernie45 thinking model.
+    The Ernie45 thinking model ouput format is
+        abc\n</think>\n\n<response>\ndef\n</response>\n
+    or  abc\n</think>\ndef
+    """
+
+    response_start_token: str = "<response>"
+    response_end_token: str = "</response>"
+    newline_token: str = "<0x0A>"
+
+    @property
+    def start_token(self) -> str:
+        """The token that starts reasoning content."""
+        return "<think>"
+
+    @property
+    def end_token(self) -> str:
+        """The token that ends reasoning content."""
+        return "</think>"
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction."
+            )
+
+        self.start_token_id = self.vocab.get(self.start_token)
+        self.end_token_id = self.vocab.get(self.end_token)
+        self.response_start_token_id = self.vocab.get(self.response_start_token)
+        self.response_end_token_id = self.vocab.get(self.response_end_token)
+        self.newline_token_id = self.vocab.get(self.newline_token)
+
+        self.parser_token_ids = [self.end_token_id, self.response_end_token_id]
+
+        if self.start_token_id is None or self.end_token_id is None:
+            raise RuntimeError(
+                "Ernie45 reasoning parser could not locate think start/end "
+                "tokens in the tokenizer!"
+            )
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+        The Ernie45 thinking model ouput format is
+            abc\n</think>\n\n<response>\ndef\n</response>\n
+        or  abc\n</think>\ndef
+        - 'abc' goes to reasoning
+        - 'def' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (
+            delta_token_ids[0]
+            in [
+                self.start_token_id,
+                self.end_token_id,
+                self.response_start_token_id,
+                self.response_end_token_id,
+            ]
+        ):
+            return None
+
+        # No <think> in previous or delta, also need to check for </think>.
+        # Because the model may have generated </think> without <think>
+        if self.end_token_id in delta_token_ids:
+            # </think> in delta with more tokens,
+            # extract reasoning content and content
+            think_end_index = delta_text.find(self.end_token)
+            reasoning = delta_text[:think_end_index]
+            content = delta_text[think_end_index + len(self.end_token) :]
+            content = content.lstrip("\n")
+            response_start_idx = content.find(self.response_start_token)
+            response_end_idx = content.rfind(self.response_end_token)
+            if response_start_idx != -1:
+                content = content[response_start_idx + len(self.response_start_token) :]
+            if response_end_idx != -1:
+                content = content[:response_end_idx]
+            return DeltaMessage(
+                reasoning=reasoning,
+                content=content if content else None,
+            )
+        elif self.end_token_id in previous_token_ids:
+            # </think> in previous, thinking content ends
+            content = delta_text
+            if self.response_start_token_id in delta_token_ids:
+                content = content.lstrip("\n")
+                response_start_idx = content.find(self.response_start_token)
+                content = content[response_start_idx + len(self.response_start_token) :]
+                # if have </response>, remove it
+                response_end_idx = content.rfind(self.response_end_token)
+                if response_end_idx != -1:
+                    content = content[:response_end_idx]
+            elif self.response_end_token_id in delta_token_ids:
+                response_end_idx = content.rfind(self.response_end_token)
+                content = content[:response_end_idx]
+            # remove \n after </think>  or </response>
+            if previous_token_ids[-1] in self.parser_token_ids and (
+                len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
+            ):
+                content = content.lstrip("\n")
+            # remove \n after </think>\n
+            if (
+                len(previous_token_ids) > 1
+                and previous_token_ids[-2] == self.end_token_id
+            ) and (
+                len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
+            ):
+                content = content.lstrip("\n")
+
+            return DeltaMessage(content=content if content else None)
+        else:
+            # no </think> in previous or delta, reasoning content continues
+            return DeltaMessage(reasoning=delta_text)
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        """
+        Extract reasoning content from the model output.
+        The Ernie45 thinking model ouput format is
+            abc\n</think>\n\n\n<response>\ndef\n</response>\n
+        or  abc\n</think>\ndef
+        - 'abc' goes to reasoning
+        - 'def' goes to content
+        Returns:
+            tuple[Optional[str], Optional[str]]: reasoning content and content
+        """
+        reasoning, content = super().extract_reasoning(model_output, request)
+        if content:
+            start_idx = content.find(self.response_start_token)
+            end_idx = content.rfind(self.response_end_token)
+            # Simultaneously existing and in the correct order
+            if start_idx != -1 and end_idx != -1 and start_idx < end_idx:
+                content = content[start_idx + len(self.response_start_token) : end_idx]
+        final_content = content or None
+
+        return reasoning, final_content
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
index 11e828a7039fa1ce8522ddcb7d2498d6940beb8a..1871adcd43210bf659c5ae2293c91dc4b6af0c73 100644
--- a/vllm/reasoning/glm4_moe_reasoning_parser.py
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -2,19 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("glm45")
 class Glm4MoeModelReasoningParser(ReasoningParser):
     """
     Reasoning parser for the Glm4MoeModel model.
@@ -30,22 +27,39 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         super().__init__(tokenizer, *args, **kwargs)
         self.think_start_token = "<think>"
         self.think_end_token = "</think>"
+        self.assistant_token = "<|assistant|>"
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ReasoningParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
         self.think_start_token_id = self.vocab.get(self.think_start_token)
         self.think_end_token_id = self.vocab.get(self.think_end_token)
-        if (self.think_start_token_id is None
-                or self.think_end_token_id is None):
+        self.assistant_token_id = self.vocab.get(self.assistant_token)
+        if (
+            self.think_start_token_id is None
+            or self.think_end_token_id is None
+            or self.assistant_token_id is None
+        ):
             raise RuntimeError(
                 "Glm4MoeModel reasoning parser could not locate "
-                "think start/end tokens in the tokenizer!")
+                "think start/end or assistant tokens in the tokenizer!"
+            )
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        return self.think_end_token_id in input_ids
+        """
+        GLM's chat template has <think></think> tokens after every
+        <|assistant|> token. Thus, we need to check if </think> is
+        after the most recent <|assistant|> token (if present).
+        """
+        for token_id in input_ids[::-1]:
+            if token_id == self.think_end_token_id:
+                return True
+            elif token_id == self.assistant_token_id:
+                return False
+        return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
@@ -54,9 +68,9 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         if self.think_end_token_id not in input_ids[:-1]:
             return []
         else:
-            return input_ids[input_ids.index(self.think_end_token_id) + 1:]
+            return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -64,19 +78,19 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract reasoning content from a delta message.
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
         """
         # Skip single special tokens
-        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
-                self.think_start_token_id, self.think_end_token_id
-        ]):
+        if len(delta_token_ids) == 1 and (
+            delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]
+        ):
             return None
 
         if self.think_start_token_id in previous_token_ids:
@@ -84,10 +98,12 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
                 # <think> in previous, </think> in delta,
                 # extract reasoning content
                 end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.think_end_token):]
-                return DeltaMessage(reasoning_content=reasoning_content,
-                                    content=content if content else None)
+                reasoning = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token) :]
+                return DeltaMessage(
+                    reasoning=reasoning,
+                    content=content if content else None,
+                )
             elif self.think_end_token_id in previous_token_ids:
                 # <think> in previous, </think> in previous,
                 # reasoning content continues
@@ -95,34 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
             else:
                 # <think> in previous, no </think> in previous or delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         elif self.think_start_token_id in delta_token_ids:
             if self.think_end_token_id in delta_token_ids:
                 # <think> in delta, </think> in delta, extract reasoning content
                 start_index = delta_text.find(self.think_start_token)
                 end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[start_index +
-                                               len(self.think_start_token
-                                                   ):end_index]
-                content = delta_text[end_index + len(self.think_end_token):]
-                return DeltaMessage(reasoning_content=reasoning_content,
-                                    content=content if content else None)
+                reasoning = delta_text[
+                    start_index + len(self.think_start_token) : end_index
+                ]
+                content = delta_text[end_index + len(self.think_end_token) :]
+                return DeltaMessage(
+                    reasoning=reasoning,
+                    content=content if content else None,
+                )
             else:
                 # <think> in delta, no </think> in delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         else:
             # thinking is disabled, just content
             return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
-            self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
 
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
 
         Returns:
@@ -130,22 +148,24 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         """
 
         # Check if the model output contains the <think> and </think> tokens.
-        if (self.think_start_token not in model_output
-                or self.think_end_token not in model_output):
+        if (
+            self.think_start_token not in model_output
+            or self.think_end_token not in model_output
+        ):
             return None, model_output
         # Check if the <think> is present in the model output, remove it
         # if it is present.
         model_output_parts = model_output.partition(self.think_start_token)
-        model_output = model_output_parts[2] if model_output_parts[
-            1] else model_output_parts[0]
+        model_output = (
+            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
+        )
         # Check if the model output contains the </think> tokens.
         # If the end token is not found, return the model output as is.
         if self.think_end_token not in model_output:
             return None, model_output
 
         # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(
-            self.think_end_token)
+        reasoning, _, content = model_output.partition(self.think_end_token)
 
         final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index b0988d5d2618aa9b468a77146a372f779b4ee879..0c1b54d0bd359e0157561f6d409dc37b79b67d6a 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -1,21 +1,62 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import json
 from collections.abc import Sequence
-from typing import Optional, Union
 
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.harmony_utils import parse_chat_output
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
+no_func_reaonsing_tag = {
+    "type": "structural_tag",
+    "format": {
+        "type": "triggered_tags",
+        "tags": [
+            {
+                "begin": "<|channel|>analysis<|message|>",
+                "content": {"type": "any_text"},
+                "end": "<|end|>",
+            }
+        ],
+        "triggers": ["<|channel|>analysis"],
+        "stop_after_first": False,
+    },
+}
+
+
+def from_builtin_tool_to_tag(tool: str) -> list[dict]:
+    tag = [
+        {
+            "begin": f"<|channel|>commentary to={tool}",
+            "content": {"type": "any_text"},
+            "end": "<|end|>",
+        },
+        {
+            "begin": f"<|channel|>analysis to={tool}",
+            "content": {"type": "any_text"},
+            "end": "<|end|>",
+        },
+    ]
+    return tag
+
+
+def tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list: list[str]) -> dict:
+    import copy
+
+    new_tag = copy.deepcopy(no_func_reaonsing_tag)
+    new_tag["format"]["triggers"].append("<|channel|>commentary to=")
+
+    for tool in builtin_tool_list:
+        new_tag["format"]["tags"].extend(from_builtin_tool_to_tag(tool))
+    return new_tag
+
 
-@ReasoningParserManager.register_module("openai_gptoss")
 class GptOssReasoningParser(ReasoningParser):
     """
     Reasoning parser for GptOss model.
@@ -26,17 +67,35 @@ class GptOssReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
-        self.reasoning_end_token_ids = self.model_tokenizer.encode(
-            "<|start|>assistant<|channel|>final<|message|>")
+        # The model can output some special tokens between "final" and "<|message|>"
+        # So we need to look for both sequences to determine the end of reasoning.
+        self.reasoning_end_token_ids_prefix = self.model_tokenizer.encode(
+            "<|channel|>final"
+        )
+        self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>")
+        self.reasoning_max_num_between_tokens = 20
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        end_token_ids = self.reasoning_end_token_ids
-        assert len(end_token_ids) > 0, "reasoning_end_token_ids is empty"
+        end_token_ids_prefix = self.reasoning_end_token_ids_prefix
+        end_token_ids_suffix = self.reasoning_end_token_ids_suffix
+        assert len(end_token_ids_prefix) > 0, "reasoning_end_token_ids_prefix is empty"
+        assert len(end_token_ids_suffix) > 0, "reasoning_end_token_ids_suffix is empty"
         # Check if the end sequence is present in the input_ids.
         # We search from the end of input_ids to find the last match.
-        for i in range(len(input_ids) - len(end_token_ids), -1, -1):
-            if input_ids[i:i + len(end_token_ids)] == end_token_ids:
-                return True
+        for i in range(len(input_ids) - len(end_token_ids_prefix), -1, -1):
+            if input_ids[i : i + len(end_token_ids_prefix)] == end_token_ids_prefix:
+                # We have found the prefix, now we look for the suffix after the prefix.
+                suffix_start = i + len(end_token_ids_prefix)
+                for j in range(
+                    suffix_start, len(input_ids) - len(end_token_ids_suffix) + 1
+                ):
+                    if j - suffix_start >= self.reasoning_max_num_between_tokens:
+                        break
+                    if (
+                        input_ids[j : j + len(end_token_ids_suffix)]
+                        == end_token_ids_suffix
+                    ):
+                        return True
         return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
@@ -45,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser):
             return []
         return self.model_tokenizer.encode(content)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -53,35 +112,62 @@ class GptOssReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
-        prev_reasoning, prev_content, _ = parse_chat_output(
-            list(previous_token_ids))
-        cur_reasoning, cur_content, _ = parse_chat_output(
-            list(current_token_ids))
+    ) -> DeltaMessage | None:
+        prev_reasoning, prev_content, _ = parse_chat_output(list(previous_token_ids))
+        cur_reasoning, cur_content, _ = parse_chat_output(list(current_token_ids))
         reasoning_delta = None
         content_delta = None
         if cur_reasoning is not None:
             prev_r = prev_reasoning or ""
             if cur_reasoning.startswith(prev_r):
-                reasoning_delta = cur_reasoning[len(prev_r):] or None
+                reasoning_delta = cur_reasoning[len(prev_r) :] or None
             else:
                 reasoning_delta = cur_reasoning
         if cur_content is not None:
             prev_c = prev_content or ""
             if cur_content.startswith(prev_c):
-                content_delta = cur_content[len(prev_c):] or None
+                content_delta = cur_content[len(prev_c) :] or None
             else:
                 content_delta = cur_content
         if reasoning_delta is None and content_delta is None:
             return None
-        return DeltaMessage(reasoning_content=reasoning_delta,
-                            content=content_delta)
+        return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest,
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         raise NotImplementedError(
             "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used."  # noqa: E501
         )
+
+    # This function prepares the structural tag to format reasoning output
+    def prepare_structured_tag(
+        self, original_tag: str | None, tool_server: ToolServer | None
+    ) -> str:
+        if original_tag is None:
+            if tool_server is None:
+                return json.dumps(no_func_reaonsing_tag)
+            else:
+                builtin_tool_list: list[str] = []
+                if tool_server.has_tool("browser"):
+                    builtin_tool_list.append("browser")
+                if tool_server.has_tool("python"):
+                    builtin_tool_list.append("python")
+                if tool_server.has_tool("container"):
+                    builtin_tool_list.append("container")
+
+                if len(builtin_tool_list) > 0:
+                    logger.info("Builtin_tool_list: %s", builtin_tool_list)
+                    func_tag = json.dumps(
+                        tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list)
+                    )
+                else:
+                    logger.info("Builtin_tool_list is empty")
+                    func_tag = json.dumps(no_func_reaonsing_tag)
+
+                return func_tag
+        else:
+            # There is potential risk for appending the tag to the original tag
+            return original_tag
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index b76170f39f10927eebc3f0d5304b5137ab77572f..484045d66a3c9247aef0e36e6b439a06c8ec59a0 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -2,20 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("granite")
 class GraniteReasoningParser(ReasoningParser):
     """
     Reasoning parser for IBM Granite.
@@ -34,15 +31,14 @@ class GraniteReasoningParser(ReasoningParser):
         self.response_start_expr = r"(?:Here's|Here is) my response:"
 
         self.reasoning_regex = re.compile(
-            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)",
-            re.DOTALL)
+            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
+        )
 
         self.valid_think_starts = [
-            "Here's my thought process:", "Here is my thought process:"
-        ]
-        self.valid_response_starts = [
-            "Here's my response:", "Here is my response:"
+            "Here's my thought process:",
+            "Here is my thought process:",
         ]
+        self.valid_response_starts = ["Here's my response:", "Here is my response:"]
 
         # Substrings to match for sequence boundaries on raw text
         self.seq_boundary_end = ":"
@@ -50,11 +46,12 @@ class GraniteReasoningParser(ReasoningParser):
 
         # The longest any thinking / start of response message can be
         self.longest_think_start = max(
-            len(think_start) for think_start in self.valid_think_starts)
+            len(think_start) for think_start in self.valid_think_starts
+        )
 
-    def extract_reasoning_content(
-            self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -70,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser):
         re_match = self.reasoning_regex.findall(model_output)
         if not re_match:
             return None, model_output
-        reasoning_content, response_content = re_match[0]
+        reasoning, response_content = re_match[0]
         if not response_content:
-            return reasoning_content, None
-        return reasoning_content, response_content
+            return reasoning, None
+        return reasoning, response_content
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -83,7 +80,7 @@ class GraniteReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Extract the reasoning content / content emitted by granite models;
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -110,26 +107,27 @@ class GraniteReasoningParser(ReasoningParser):
             Union[DeltaMessage, None]
                 DeltaMessage with either reasoning content or content, or None.
         """
-        reasoning_content, resp_seq_len, content = self._get_content_sections(
-            current_text)
+        reasoning, resp_seq_len, content = self._get_content_sections(current_text)
         # Either we haven't finished the start of the reasoning sequence,
         # or the model is generating something unexpected.
-        if not reasoning_content:
+        if not reasoning:
             delta_message = self._get_delta_message_with_no_reasoning_bounds(
-                current_text, delta_text)
+                current_text, delta_text
+            )
         # We have a start of reasoning message, but have not yet finished
         # the start of response sequence.
         elif not content:
             delta_message = self._get_delta_message_with_no_response_bounds(
-                current_text, reasoning_content, delta_text)
+                current_text, reasoning, delta_text
+            )
         # We've finished both the start of reasoning and start of response seq.
         else:
             # This should never happen since we matched on the response
             assert resp_seq_len is not None
             delta_message = self._get_delta_message_with_both_bounds(
-                delta_text, reasoning_content, content, current_text,
-                resp_seq_len)
-        if not delta_message.content and not delta_message.reasoning_content:
+                delta_text, reasoning, content, current_text, resp_seq_len
+            )
+        if not delta_message.content and not delta_message.reasoning:
             return None
         return delta_message
 
@@ -139,26 +137,27 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             text (str): Text to check for leading substr.
-        
+
         Returns:
             bool: True if any of the possible reasoning start seqs match.
         """
         return any(
-            think_start.startswith(text)
-            for think_start in self.valid_think_starts)
+            think_start.startswith(text) for think_start in self.valid_think_starts
+        )
 
     def _is_response_start_substr(self, text: str) -> bool:
         """Check if a text matches one of the possible start response seqs.
 
         Args:
             text (str): Text to check for leading substr.
-        
+
         Returns:
             bool: True if any of the possible response start seqs match.
         """
         return any(
             response_start.startswith(text)
-            for response_start in self.valid_response_starts)
+            for response_start in self.valid_response_starts
+        )
 
     def _get_delta_message_with_no_reasoning_bounds(
         self,
@@ -177,28 +176,27 @@ class GraniteReasoningParser(ReasoningParser):
         """
         prev_longest_length = len(current_text) - len(delta_text)
         is_substr = self._is_reasoning_start_substr(current_text)
-        was_substr = self._is_reasoning_start_substr(
-            current_text[:prev_longest_length])
+        was_substr = self._is_reasoning_start_substr(current_text[:prev_longest_length])
 
         # Check if we just generated something NOT in the special token seq;
         # if so, add everything that we previously skipped with this delta
         # message and append everything to content in the future.
         if was_substr and not is_substr:
             return DeltaMessage(
-                reasoning_content=None,
+                reasoning=None,
                 content=current_text,
             )
         if is_substr:
             # Might still be in the special token sequence; return nothing
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
         # Otherwise the sequence has already been broken and we already
         # corrected; just return the delta text as normal content.
-        return DeltaMessage(reasoning_content=None, content=delta_text)
+        return DeltaMessage(reasoning=None, content=delta_text)
 
     def _get_delta_message_with_no_response_bounds(
         self,
         current_text: str,
-        reasoning_content: str,
+        reasoning: str,
         delta_text: str,
     ) -> DeltaMessage:
         """Parse the delta message when the current text has both reasoning
@@ -208,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             current_text (str): The full previous + delta text.
-            reasoning_content (str): reasoning content from current_text.
+            reasoning (str): reasoning content from current_text.
             delta_text (str): Text to consider and parse content from.
 
         Returns:
@@ -220,13 +218,14 @@ class GraniteReasoningParser(ReasoningParser):
         # content and fully parse it out; we should not pass the : back.
         ends_with_start_response_seq = any(
             current_text.endswith(response_start)
-            for response_start in self.valid_response_starts)
-        if reasoning_content is None or ends_with_start_response_seq:
-            return DeltaMessage(reasoning_content=None, content=None)
+            for response_start in self.valid_response_starts
+        )
+        if reasoning is None or ends_with_start_response_seq:
+            return DeltaMessage(reasoning=None, content=None)
 
         # Consider previous / current text only within context of the reasoning
-        previous_text = reasoning_content[:-len(delta_text)]
-        current_text = reasoning_content
+        previous_text = reasoning[: -len(delta_text)]
+        current_text = reasoning
 
         # We need to be careful about adding unfinished response sequences;
         # Find the place at which we MIGHT be starting a response sequence
@@ -234,42 +233,48 @@ class GraniteReasoningParser(ReasoningParser):
         delta_idx = delta_text.rfind(self.seq_boundary_start)
 
         # Check the state of potential start of response substring matches.
-        prev_was_substr = self._is_response_start_substr(
-            previous_text[prev_idx:]) if prev_idx >= 0 else False
-        delta_continues_substr = self._is_response_start_substr(
-            current_text[prev_idx:]) if prev_idx >= 0 else False
-        delta_new_substr = self._is_response_start_substr(
-            delta_text[delta_idx:]) if delta_idx >= 0 else False
+        prev_was_substr = (
+            self._is_response_start_substr(previous_text[prev_idx:])
+            if prev_idx >= 0
+            else False
+        )
+        delta_continues_substr = (
+            self._is_response_start_substr(current_text[prev_idx:])
+            if prev_idx >= 0
+            else False
+        )
+        delta_new_substr = (
+            self._is_response_start_substr(delta_text[delta_idx:])
+            if delta_idx >= 0
+            else False
+        )
 
         # Delta only contains potential continued response sequence text.
         if delta_continues_substr:
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
 
         if not prev_was_substr:
             # Delta may be starting a new response seq but has other text too.
             if delta_new_substr:
-                return DeltaMessage(reasoning_content=delta_text[:delta_idx],
-                                    content=None)
+                return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
             # Normal case for most reasoning text (no potential special seqs).
-            return DeltaMessage(reasoning_content=delta_text, content=None)
+            return DeltaMessage(reasoning=delta_text, content=None)
         # The substring that previously seemed to be a potential response
         # seq wasn't one; we need to add the content to the delta message,
         # and also slice off the potential response sequence
         elif delta_new_substr:
-            reasoning_content = previous_text[
-                prev_idx:] + delta_text[:delta_idx]
-            return DeltaMessage(reasoning_content=reasoning_content,
-                                content=None)
+            reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
+            return DeltaMessage(reasoning=reasoning, content=None)
         # No new substring yet, and we broke our old one; take the whole delta
         return DeltaMessage(
-            reasoning_content=previous_text[prev_idx:] + delta_text,
+            reasoning=previous_text[prev_idx:] + delta_text,
             content=None,
         )
 
     def _get_delta_message_with_both_bounds(
         self,
         delta_text: str,
-        reasoning_content: str,
+        reasoning: str,
         response_content: str,
         current_text: str,
         response_seq_len: int,
@@ -279,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             delta_text: Text to consider and parse content from.
-            reasoning_content: reasoning content from current_text.
+            reasoning: reasoning content from current_text.
             response_content: response content from current_text.
             current_text: The full previous + delta text.
             response_seq_len: Len of the complete response sequence used.
@@ -288,32 +293,30 @@ class GraniteReasoningParser(ReasoningParser):
             DeltaMessage: Message containing the parsed content.
         """
         # Always have content; take length to the end
-        delta_content = delta_text[-len(response_content):]
-        reasoning_end_idx = len(delta_text) - (len(response_content) +
-                                               response_seq_len)
+        delta_content = delta_text[-len(response_content) :]
+        reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)
 
         if reasoning_end_idx < 0:
-            delta_reasoning_content = None
+            delta_reasoning = None
         else:
             # Get the starting offset
-            start_reasoning_content_idx = len(
-                reasoning_content) + response_seq_len + len(
-                    response_content) - 1
+            start_reasoning_idx = (
+                len(reasoning) + response_seq_len + len(response_content) - 1
+            )
             delta_offset = len(current_text) - len(delta_text)
-            start_offset = start_reasoning_content_idx - delta_offset
+            start_offset = start_reasoning_idx - delta_offset
             if start_offset < 0:
                 start_offset = 0
-            delta_reasoning_content = delta_text[
-                start_offset:reasoning_end_idx]
+            delta_reasoning = delta_text[start_offset:reasoning_end_idx]
 
         return DeltaMessage(
-            reasoning_content=delta_reasoning_content,
+            reasoning=delta_reasoning,
             content=delta_content,
         )
 
     def _get_content_sections(
         self, current_text: str
-    ) -> tuple[Optional[str], Optional[int], Optional[str]]:
+    ) -> tuple[str | None, int | None, str | None]:
         """Parse the text to extract the reasoning content / content
         if we have them.
 
@@ -326,38 +329,35 @@ class GraniteReasoningParser(ReasoningParser):
             (if there is one) and the non-reasoning content.
         """
         current_chunk_start = 0
-        start_reasoning_content = None
+        start_reasoning = None
         parsed_content = False
         delimiter_idxs = [
-            idx for idx, char in enumerate(current_text)
+            idx
+            for idx, char in enumerate(current_text)
             if char == self.seq_boundary_end
         ]
 
         for current_chunk_end in delimiter_idxs:
             current_chunk = current_text[current_chunk_start:current_chunk_end]
             # Check to see if the start of reasoning seq if complete
-            if start_reasoning_content is None:
+            if start_reasoning is None:
                 for think_start in self.valid_think_starts:
                     if current_chunk == think_start[:-1]:
-                        start_reasoning_content = current_chunk_end + 1
+                        start_reasoning = current_chunk_end + 1
                         current_chunk_start = current_chunk_end + 1
                         break
 
             # Check to see if the start of response seq if complete
             elif not parsed_content:
                 for response_start in self.valid_response_starts:
-                    if current_chunk[-len(response_start) +
-                                     1:] == response_start[:-1]:
+                    if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
                         # Mark end of reasoning and start response content
                         # after the start of response sequence.
-                        end_reasoning_content = current_chunk_end - len(
-                            response_start)
-                        reasoning_content = current_text[
-                            start_reasoning_content:end_reasoning_content]
-                        response_content = current_text[current_chunk_end + 1:]
-                        return reasoning_content, len(
-                            response_start), response_content
-
-        if start_reasoning_content and not parsed_content:
-            return current_text[start_reasoning_content:], None, None
+                        end_reasoning = current_chunk_end - len(response_start)
+                        reasoning = current_text[start_reasoning:end_reasoning]
+                        response_content = current_text[current_chunk_end + 1 :]
+                        return reasoning, len(response_start), response_content
+
+        if start_reasoning and not parsed_content:
+            return current_text[start_reasoning:], None, None
         return None, None, None
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index 6e3b056d6b62921eb28f828285f66f5dc4b7ed16..f297454f57ec97b57d1d7609516ac332e2ae8b7c 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -2,36 +2,33 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("hunyuan_a13b")
 class HunyuanA13BReasoningParser(ReasoningParser):
     """
     Reasoning parser for Hunyuan A13B Model
 
     HunyuanReasoningParser
 
-    This class implements a reasoning parser specifically designed 
-    for the Hunyuan A13B Model. It is responsible for parsing and 
-    extracting structured reasoning and answer segments from model 
+    This class implements a reasoning parser specifically designed
+    for the Hunyuan A13B Model. It is responsible for parsing and
+    extracting structured reasoning and answer segments from model
     outputs that follow a specific pattern.
 
     Key Features:
         - For non-stream output , Recognizes and extracts reasoning ("think")
          and answer ("answer") sections from text using regular expressions.
         - For stream process, it requires a token id sequences to change the
-          reasoning state and other state so it maintains internal state to 
+          reasoning state and other state so it maintains internal state to
           manage parsing across multiple token.
 
 
@@ -50,20 +47,19 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
         self.full_match_reasoning_regex = re.compile(
             rf"(?:{self.think_start_expr}(.*?){self.response_start_expr})?(.*?){self.response_end_expr}",
-            re.DOTALL)
+            re.DOTALL,
+        )
 
         self.half_match_reasoning_regex = re.compile(
-            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)",
-            re.DOTALL)
+            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
+        )
 
         self.think_start_ids = [14023, 771, 397]
         self.think_start_ids_fast = [14023, 771, 1363]
         self.response_start_ids = [198, 524, 27963, 397, 27, 9399, 397]
         self.response_start_ids_fast = [524, 27963, 397, 27, 9399, 397]
         self.response_end_ids = [198, 524, 9399, 29]
-        self.fast_think_ids = [
-            14023, 771, 1363, 524, 27963, 397, 27, 9399, 397
-        ]
+        self.fast_think_ids = [14023, 771, 1363, 524, 27963, 397, 27, 9399, 397]
 
         # when state change, send out all the buffered text in last state
         self.buffered_text = []
@@ -90,9 +86,9 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         # this id is not part of content, so just return [] here.
         return []
 
-    def extract_reasoning_content(
-            self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -108,33 +104,33 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
         re_match = self.full_match_reasoning_regex.findall(model_output)
         if re_match:
-            reasoning_content, response_content = re_match[0]
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            reasoning, response_content = re_match[0]
+            if len(reasoning) == 0:
+                reasoning = None
             if len(response_content) == 0:
                 response_content = None
-            return reasoning_content, response_content
+            return reasoning, response_content
 
         fallback_regex = self.half_match_reasoning_regex
         fallback_match = fallback_regex.findall(model_output)
         if fallback_match:
-            reasoning_content, response_content = fallback_match[0]
+            reasoning, response_content = fallback_match[0]
 
             if response_content.endswith(self.response_end_expr):
-                response_content = response_content[:-len(self.
-                                                          response_end_expr)]
+                response_content = response_content[: -len(self.response_end_expr)]
 
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            if len(reasoning) == 0:
+                reasoning = None
             if len(response_content) == 0:
                 response_content = None
 
-            return reasoning_content, response_content
+            return reasoning, response_content
 
         return None, model_output
 
-    def _is_strict_increasing_subsequence(self, subsequence: Sequence[int],
-                                          sequence: Sequence[int]) -> bool:
+    def _is_strict_increasing_subsequence(
+        self, subsequence: Sequence[int], sequence: Sequence[int]
+    ) -> bool:
         if not subsequence:
             return False
 
@@ -144,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
                 sub_idx += 1
         return sub_idx == len(subsequence)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -152,34 +148,34 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Extract content using token ID sequence state machine"""
         # Define sequences
         think_start_sequence = self.think_start_ids
         response_start_sequence = self.response_start_ids
         response_end_sequence = self.response_end_ids
 
-        assert (len(delta_token_ids) == 1)
+        assert len(delta_token_ids) == 1
         # Process each token in the delta
         token = delta_token_ids[0]
 
         def check_token_with_sequence(token):
             if self.current_state == "idle" or self.current_state == "think":
-                return (token == self.expected_sequence[self.sequence_index]
-                         or token ==  \
-                         self.expected_sequence_side[self.sequence_index])
+                return (
+                    token == self.expected_sequence[self.sequence_index]
+                    or token == self.expected_sequence_side[self.sequence_index]
+                )
             else:
                 return token == self.expected_sequence[self.sequence_index]
 
         def check_last_token(token):
             if self.current_state == "idle" or self.current_state == "think":
                 # only return true if it's judge using a side sequence.
-                if (self.sequence_index - 1 < len(self.expected_sequence_side)
-                        and token
-                        == self.expected_sequence_side[self.sequence_index -
-                                                       1]):
-                    return self.sequence_index == len(
-                        self.expected_sequence_side)
+                if (
+                    self.sequence_index - 1 < len(self.expected_sequence_side)
+                    and token == self.expected_sequence_side[self.sequence_index - 1]
+                ):
+                    return self.sequence_index == len(self.expected_sequence_side)
                 else:
                     return self.sequence_index == len(self.expected_sequence)
             else:
@@ -227,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
                 # Return content based on current state
                 if self.current_state == "think":
-                    return DeltaMessage(reasoning_content=buffered_content,
-                                        content=None)
+                    return DeltaMessage(reasoning=buffered_content, content=None)
                 else:
-                    return DeltaMessage(reasoning_content=None,
-                                        content=buffered_content)
+                    return DeltaMessage(reasoning=None, content=buffered_content)
             else:
                 # No buffered content, send normally
                 if self.current_state == "think":
-                    return DeltaMessage(reasoning_content=delta_text,
-                                        content=None)
+                    return DeltaMessage(reasoning=delta_text, content=None)
                 else:
-                    return DeltaMessage(reasoning_content=None,
-                                        content=delta_text)
+                    return DeltaMessage(reasoning=None, content=delta_text)
 
         # If no content to send in this delta
         return None
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..e92f8add0391aabb1e828cbbfd5c473c355944e9
--- /dev/null
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser
+
+logger = init_logger(__name__)
+
+
+class IdentityReasoningParser(ReasoningParser):
+    """
+    Identity reasoning parser.
+
+    This parser does not attempt to parse or strip out reasoning tokens.
+    It treats the entire model output as content and ignores reasoning.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction."
+            )
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        # Always return True, since we never treat reasoning specially
+        return True
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        # Identity: return all tokens as content
+        return input_ids
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        # Just wrap delta_text as content, ignore reasoning
+        if delta_text:
+            return DeltaMessage(content=delta_text)
+        return None
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        # No reasoning separation: return None for reasoning,
+        # and full model_output as content
+        return None, model_output
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f5f2f88caf731d81f2004be60894a9d1966747
--- /dev/null
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ResponsesRequest,
+)
+from vllm.logger import init_logger
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
+    """
+    Reasoning parser for MiniMax M2 model.
+    """
+
+    @property
+    def start_token(self) -> str:
+        """The token that starts reasoning content."""
+        return "<think>"
+
+    @property
+    def end_token(self) -> str:
+        """The token that ends reasoning content."""
+        return "</think>"
+
+
+class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for MiniMax M2 model.
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+        self.end_token_id = self.vocab.get("</think>")
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        end_token_id = self.end_token_id
+        return any(input_id == end_token_id for input_id in reversed(input_ids))
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        return input_ids
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        if len(previous_token_ids) == 0:
+            delta_text = "<think>" + delta_text
+        return DeltaMessage(content=delta_text)
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
+        return None, "<think>" + model_output
diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py
index ceda96ca6a6dce5d61125f8296c326b9181777eb..af6d179bf6d019a7d241044471dcc17e7b519907 100644
--- a/vllm/reasoning/mistral_reasoning_parser.py
+++ b/vllm/reasoning/mistral_reasoning_parser.py
@@ -4,15 +4,13 @@
 from functools import cached_property
 
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
-from vllm.reasoning.deepseek_r1_reasoning_parser import (
-    DeepSeekR1ReasoningParser)
+from vllm.reasoning import ReasoningParser
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("mistral")
 class MistralReasoningParser(DeepSeekR1ReasoningParser):
     """
     Reasoning parser for Mistral models.
@@ -23,34 +21,35 @@ class MistralReasoningParser(DeepSeekR1ReasoningParser):
 
     def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs):
         if not isinstance(tokenizer, MistralTokenizer):
-            raise ValueError(
-                "The tokenizer must be an instance of MistralTokenizer.")
+            raise ValueError("The tokenizer must be an instance of MistralTokenizer.")
 
         ReasoningParser.__init__(self, tokenizer, *args, **kwargs)
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ReasoningParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
-        self.start_token_id = tokenizer.tokenizer.get_control_token(
-            self.start_token)
-        self.end_token_id = tokenizer.tokenizer.get_control_token(
-            self.end_token)
+        self.start_token_id = tokenizer.tokenizer.get_control_token(self.start_token)
+        self.end_token_id = tokenizer.tokenizer.get_control_token(self.end_token)
 
         if self.start_token_id is None or self.end_token_id is None:
             raise RuntimeError(
                 "Mistral reasoning parser could not locate think start/end "
-                "tokens in the tokenizer!")
+                "tokens in the tokenizer!"
+            )
 
     @cached_property
     def start_token(self) -> str:
         """The token that starts reasoning content."""
         from mistral_common.tokens.tokenizers.base import SpecialTokens
+
         return SpecialTokens.begin_think
 
     @cached_property
     def end_token(self) -> str:
         """The token that ends reasoning content."""
         from mistral_common.tokens.tokenizers.base import SpecialTokens
+
         return SpecialTokens.end_think
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..7149f8c4123b3b968fa6bea535fca805b37e09de
--- /dev/null
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -0,0 +1,302 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import dataclasses as dt
+import enum
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
+
+import regex as re
+
+if TYPE_CHECKING:
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ResponsesRequest,
+)
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser
+
+logger = init_logger(__name__)
+
+
+class Olmo3ReasoningState(enum.Enum):
+    REASONING = 1
+    CONTENT = 2
+
+
+@dt.dataclass(frozen=True)
+class Indices:
+    start: int
+    end: int
+
+    def __len__(self):
+        return self.end - self.start
+
+
+def string_overlap(a: str, b: str) -> tuple[Indices | None, Indices | None]:
+    """
+    Find the longest overlap where the end of string a matches the start
+    of string b.
+
+    Args:
+        a: First string
+        b: Second string
+
+    Returns:
+        Tuple of IndicesTuples representing the overlapping portions in each
+        string, or a tuple of None if no overlap exists
+    """
+
+    # swap so a is always the shorter string
+    a, b, swap = (a, b, False) if len(a) < len(b) else (b, a, True)
+
+    # first check: is a fully contained in b?
+    if a in b:
+        ind_a = Indices(0, len(a))
+        ind_b = Indices(b.index(a), b.index(a) + len(a))
+        return (ind_b, ind_a) if swap else (ind_a, ind_b)
+
+    # second check: does the end of a overlap with the
+    #               beginning of b?
+    for i in range(len(a) - 1, 0, -1):
+        if a[-i:] == b[:i]:
+            ind_a = Indices(len(a) - i, len(a))
+            ind_b = Indices(0, i)
+            return (ind_b, ind_a) if swap else (ind_a, ind_b)
+
+    # third check: does the beginning of a overlap with
+    #              the end of b?
+    for i in range(len(a) - 1, 0, -1):
+        if b[-i:] == a[:i]:
+            ind_a = Indices(0, i)
+            ind_b = Indices(len(b) - i, len(b))
+            return (ind_b, ind_a) if swap else (ind_a, ind_b)
+
+    return None, None
+
+
+@dt.dataclass
+class Olmo3ReasoningBuffer:
+    think_start: str = "<think>"
+    think_end: str = "</think>"
+    buffer: str = ""
+
+    # we start in reasoning state to support cases where we hardcode
+    # <think> as the start of the reasoning block.
+    # In those cases, the only token we will see is </think>, which
+    # is when we switch to content state.
+    state: Olmo3ReasoningState = Olmo3ReasoningState.REASONING
+
+    def process_buffer(self) -> DeltaMessage | None:
+        start_think_idx = self.buffer.find(self.think_start)
+
+        if start_think_idx >= 0:
+            self.state = Olmo3ReasoningState.REASONING
+            pretext, self.buffer = (
+                self.buffer[:start_think_idx],
+                self.buffer[start_think_idx + len(self.think_start) :],
+            )
+            if start_think_idx > 0:
+                # this covers the case there's content before
+                # the start of the reasoning block
+                return DeltaMessage(content=pretext)
+
+        end_think_idx = self.buffer.rfind(self.think_end)
+
+        if end_think_idx >= 0:
+            self.state = Olmo3ReasoningState.CONTENT
+            pretext, self.buffer = (
+                self.buffer[:end_think_idx],
+                self.buffer[end_think_idx + len(self.think_end) :],
+            )
+            if end_think_idx > 0:
+                # this covers the case there's content before
+                # the end of the reasoning block
+                return DeltaMessage(reasoning=pretext)
+
+        if self.state == Olmo3ReasoningState.REASONING:
+            # we are inside reasoning block, return and empty
+            # the text buffer
+            (
+                text_buffer,
+                self.buffer,
+            ) = self.buffer, ""
+            return DeltaMessage(reasoning=text_buffer)
+
+        if self.state == Olmo3ReasoningState.CONTENT:
+            # we are outside reasoning block, return and empty
+            # the text buffer
+            (
+                text_buffer,
+                self.buffer,
+            ) = self.buffer, ""
+            return DeltaMessage(content=text_buffer)
+
+        # nothing to return unless we are in reasoning or content state
+        return None
+
+    def __len__(self):
+        # is the length of the text buffer
+        return len(self.buffer)
+
+    def add_text(self, delta_text: str) -> DeltaMessage | None:
+        # we start by adding the delta text to the buffer
+        self.buffer += delta_text
+
+        # setting this to empty before starting
+        delta_message: DeltaMessage | None = None
+
+        # we start by computing the overlap between the delta_text
+        # and start/end of think tokens.
+        _, overlap_think_start = string_overlap(delta_text, self.think_start)
+        _, overlap_think_end = string_overlap(delta_text, self.think_end)
+
+        partial_overlap_start = overlap_think_start is not None and len(
+            overlap_think_start
+        ) < len(self.think_start)
+        partial_overlap_end = overlap_think_end is not None and len(
+            overlap_think_end
+        ) < len(self.think_end)
+
+        if (
+            partial_overlap_start
+            and self.think_start in self.buffer
+            and not partial_overlap_end
+        ):
+            # we can only process the buffer if partial overlap
+            # is the last part of think token (thus causing
+            # text_buffer to contain the start of think token)
+            # and there are no partial overlaps with end think
+            delta_message = self.process_buffer()
+
+        elif partial_overlap_end and self.think_end in self.buffer:
+            # same as before (partial overlap only allowed)
+            # if the buffer contains the end think token,
+            # but we don't have to check for partial overlap
+            # with start think token because they are handled
+            # by the previous condition
+            delta_message = self.process_buffer()
+
+        elif partial_overlap_start or partial_overlap_end:
+            # in general, if there are overlaps, we don't
+            # process the buffer because we want to wait until
+            # the think token is fully completed.
+            return None
+        else:
+            # we process the buffer as normal
+            delta_message = self.process_buffer()
+
+        return delta_message
+
+
+class Olmo3ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for Olmo 3 model
+
+    Olmo3ReasoningParser
+
+    This class implements a reasoning parser specifically designed for the
+    Olmo 3 family of models. Olmo 3 models do not use special tokens to
+    indicate reasoning; rather, reasoning trace is wrapped in `<think>` and
+    `</think>`, which are tokenized using standard vocabulary entries.
+    Because of this, the parser operates in string space, accumulating the
+    characters in a buffer until it sees `<think>` or `</think>`. tokens
+    to switch modes.
+
+    Key Features:
+        - For non-stream output, Recognizes and extracts reasoning (text
+          bracketed by `<think>` and `</think>`) and content (everything
+          after the first `</think>`).
+        - For stream process, it uses a buffer to accumulate delta text,
+          and output progressive delta messages as soon as thinking starts
+          or ends.
+        - For reliability, some Olmo 3 models may hardcode the first
+          `<think>` token is the input text (similar to Deepseek R1,
+          or reasoning-only Qwen models). To support such variants, the
+          parser can optionally work in cases where the first `<think>`
+          token is missing from generation.
+    """
+
+    def __init__(self, tokenizer: "AnyTokenizer", *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        self.think_start = r"<think>"
+        self.think_end = r"</think>"
+
+        # notice that the first think is optional; this allows template to
+        # work in cases when we hardcode a <think> at the beginning of the
+        # reasoning template.
+        reasoning_expr = (
+            rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
+            + rf"{self.think_end}(?P<content>.*)$"
+        )
+        self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)
+
+        self.buffer = Olmo3ReasoningBuffer(
+            think_start=self.think_start, think_end=self.think_end
+        )
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        text = self.model_tokenizer.decode(input_ids)
+        return self.think_end in text
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        # for Olmo 3 streaming reason parsing, the stream parse
+        # will call first, and the same token will be called in
+        # is_reasoning_end and extract_content_ids
+        # this id is not part of content, so just return [] here.
+        return []
+
+    def extract_reasoning(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest | ResponsesRequest,
+    ) -> tuple[str | None, str | None]:
+        """Extract the reasoning content & content sections, respectively.
+        If the sequence doesn't match what we expect, i.e., the model generates
+        something else, all content is considered non-reasoning content.
+
+        Args:
+            model_output (str): Output of the model to be parsed.
+            request (ChatCompletionRequest | ResponsesRequest): Request being
+                processed.
+
+        Returns:
+            tuple[Optional[str], Optional[str]]: Tuple pair containing the
+            reasoning content and non-reasoning content.
+        """
+
+        re_match = self.reasoning_regex.match(model_output)
+        if re_match:
+            reasoning = re_match.group("reasoning") or None
+            content = re_match.group("content") or None
+            return reasoning, content
+
+        # no reasoning content
+        return None, model_output
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """Extract content using token ID sequence state machine"""
+
+        delta_message = self.buffer.add_text(delta_text)
+        if delta_message is None and self.buffer.think_end in self.buffer.buffer:
+            # this is a bit hacky, but, because of how the buffer is
+            # constructed, if the last delta_text contains characters that
+            # marks the end of thinking tokens, then messages in the buffer
+            # would never be processed because we get no other turn. To get
+            # around that, we check if the text buffer contains the end of
+            # thinking tokens, and, if so, we reprocess the buffer again.
+            delta_message = self.buffer.process_buffer()
+
+        return delta_message
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 3e3c7f32796bde67bae377c608b9e360f33dda96..ef7762bf0af5970368862096f09cbc6a36058445 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -1,15 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              ResponsesRequest)
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
-@ReasoningParserManager.register_module("qwen3")
 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for the Qwen3 model.
@@ -31,18 +27,17 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def extract_reasoning_content(
-        self, model_output: str, request: Union[ChatCompletionRequest,
-                                                ResponsesRequest]
-    ) -> tuple[Optional[str], Optional[str]]:
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
-        
+
         Qwen3 has stricter requirements - it needs both start and end tokens
         to be present, unlike other models that work with just the end token.
 
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
 
         Returns:
@@ -50,15 +45,15 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         """
 
         # Check if the model output contains both <think> and </think> tokens.
-        if (self.start_token not in model_output
-                or self.end_token not in model_output):
+        if self.start_token not in model_output or self.end_token not in model_output:
             return None, model_output
 
         # Check if the <think> is present in the model output, remove it
         # if it is present.
         model_output_parts = model_output.partition(self.start_token)
-        model_output = model_output_parts[2] if model_output_parts[
-            1] else model_output_parts[0]
+        model_output = (
+            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
+        )
 
         # Check if the model output contains the </think> tokens.
         # If the end token is not found, return the model output as is.
@@ -66,7 +61,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
             return None, model_output
 
         # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.end_token)
+        reasoning, _, content = model_output.partition(self.end_token)
 
         final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/seedoss_reasoning_parser.py b/vllm/reasoning/seedoss_reasoning_parser.py
index 5f4bbbf1557eb1a961304650535d097bfa69dea3..d3d4d8ec0749ec05813e24e81671ffc4b0c31243 100644
--- a/vllm/reasoning/seedoss_reasoning_parser.py
+++ b/vllm/reasoning/seedoss_reasoning_parser.py
@@ -1,19 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
+
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
-@ReasoningParserManager.register_module("seed_oss")
 class SeedOSSReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for SeedOSS model.
 
-    The SeedOSS model uses <seed:think>...</seed:think> tokens to 
-    denote reasoning content text. This parser extracts 
+    The SeedOSS model uses <seed:think>...</seed:think> tokens to
+    denote reasoning content text. This parser extracts
     the reasoning content from the model output.
-    Similar to DeepSeek R1, it supports cases 
+    Similar to DeepSeek R1, it supports cases
     where the model doesn't generate the start token.
     """
 
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index 6e5deb52d3451e7dac7c812ce697a28ef826e301..f635758a92c0dd6ac0637021b47f8e96de18458c 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -2,25 +2,22 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("step3")
 class Step3ReasoningParser(ReasoningParser):
     """
     Reasoning parser for Step3 model.
 
-    The Step3 model uses </think> token to denote the end of reasoning 
+    The Step3 model uses </think> token to denote the end of reasoning
     text. This parser extracts all content before </think> as reasoning content.
     """
 
@@ -28,21 +25,22 @@ class Step3ReasoningParser(ReasoningParser):
         super().__init__(tokenizer, *args, **kwargs)
         self.think_end_token = "</think>"
 
-        self.reasoning_regex = re.compile(rf"(.*?){self.think_end_token}",
-                                          re.DOTALL)
+        self.reasoning_regex = re.compile(rf"(.*?){self.think_end_token}", re.DOTALL)
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ReasoningParser "
-                "constructor during construction.")
+                "constructor during construction."
+            )
 
         self.think_end_token_id = self.vocab.get(self.think_end_token)
         if self.think_end_token_id is None:
             raise RuntimeError(
                 "Step3 reasoning parser could not locate think end "
-                "token in the tokenizer!")
+                "token in the tokenizer!"
+            )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -50,38 +48,38 @@ class Step3ReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract reasoning content from a delta message.
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         For text "abc</think>xyz":
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
         """
         # Skip single special token
-        if len(delta_token_ids
-               ) == 1 and delta_token_ids[0] == self.think_end_token_id:
+        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id:
             return None
 
         if self.think_end_token_id in delta_token_ids:
             # </think> in delta, extract reasoning content and remaining content
             end_index = delta_text.find(self.think_end_token)
-            reasoning_content = delta_text[:end_index]
-            content = delta_text[end_index + len(self.think_end_token):]
-            return DeltaMessage(reasoning_content=reasoning_content,
-                                content=content if content else None)
+            reasoning = delta_text[:end_index]
+            content = delta_text[end_index + len(self.think_end_token) :]
+            return DeltaMessage(
+                reasoning=reasoning,
+                content=content if content else None,
+            )
         elif self.think_end_token_id in previous_token_ids:
             # </think> already seen in previous text, everything is content
             return DeltaMessage(content=delta_text)
         else:
             # No </think> seen yet, everything is reasoning
-            return DeltaMessage(reasoning_content=delta_text)
-
-    def extract_reasoning_content(
-            self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+            return DeltaMessage(reasoning=delta_text)
 
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
         # Check if the model output contains the </think> token
         if self.think_end_token not in model_output:
             # If no </think> token, everything is reasoning content
@@ -89,15 +87,15 @@ class Step3ReasoningParser(ReasoningParser):
         else:
             # Find the first occurrence of </think>
             end_index = model_output.find(self.think_end_token)
-            reasoning_content = model_output[:end_index]
+            reasoning = model_output[:end_index]
 
             # Content after </think> token
-            content = model_output[end_index + len(self.think_end_token):]
+            content = model_output[end_index + len(self.think_end_token) :]
 
             if len(content) == 0:
                 content = None
 
-            return reasoning_content, content
+            return reasoning, content
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
@@ -106,4 +104,4 @@ class Step3ReasoningParser(ReasoningParser):
         if self.think_end_token_id not in input_ids[:-1]:
             return []
         else:
-            return input_ids[input_ids.index(self.think_end_token_id) + 1:]
+            return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index f424682f9dfa09cb42d6c484d52cbed4c7fec622..901d66163452746090e98b8111eb3aaddf379b05 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampling parameters for text generation."""
+
 import copy
 import warnings
 from dataclasses import field
 from enum import Enum, IntEnum
 from functools import cached_property
-from typing import Annotated, Any, Optional, Union
+from typing import Annotated, Any
 
 import msgspec
 from pydantic.dataclasses import dataclass
@@ -14,6 +15,7 @@ from pydantic.dataclasses import dataclass
 from vllm.logger import init_logger
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.v1.serial_utils import PydanticMsgspecMixin
 
 logger = init_logger(__name__)
 
@@ -31,45 +33,83 @@ class SamplingType(IntEnum):
 @dataclass
 class StructuredOutputsParams:
     # One of these fields will be used to build a logit processor.
-    json: Optional[Union[str, dict]] = None
-    regex: Optional[str] = None
-    choice: Optional[list[str]] = None
-    grammar: Optional[str] = None
-    json_object: Optional[bool] = None
+    json: str | dict | None = None
+    regex: str | None = None
+    choice: list[str] | None = None
+    grammar: str | None = None
+    json_object: bool | None = None
     # These are other options that can be set.
     disable_fallback: bool = False
     disable_any_whitespace: bool = False
     disable_additional_properties: bool = False
-    whitespace_pattern: Optional[str] = None
-    structural_tag: Optional[str] = None
+    whitespace_pattern: str | None = None
+    structural_tag: str | None = None
 
-    _backend: Optional[str] = field(default=None, init=False)
+    _backend: str | None = field(default=None, init=False)
     """CAUTION: Should only be set by Processor._validate_structured_output"""
     _backend_was_auto: bool = field(default=False, init=False)
     """CAUTION: Should only be set by Processor._validate_structured_output"""
 
     def __post_init__(self):
         """Validate that some fields are mutually exclusive."""
-        count = sum([
-            self.json is not None, self.regex is not None, self.choice
-            is not None, self.grammar is not None, self.json_object is not None
-        ])
+        count = sum(
+            [
+                self.json is not None,
+                self.regex is not None,
+                self.choice is not None,
+                self.grammar is not None,
+                self.json_object is not None,
+                self.structural_tag is not None,
+            ]
+        )
         if count > 1:
             raise ValueError(
                 "You can only use one kind of structured outputs constraint "
-                f"but multiple are specified: {self.__dict__}")
+                f"but multiple are specified: {self.__dict__}"
+            )
+
+    def all_constraints_none(self) -> bool:
+        """
+        Returns True if all structured-output constraint fields are None.
+        """
+        return all(
+            getattr(self, field) is None
+            for field in (
+                "json",
+                "regex",
+                "choice",
+                "grammar",
+                "json_object",
+                "structural_tag",
+            )
+        )
+
+    def all_non_structural_tag_constraints_none(self) -> bool:
+        """
+        Returns True if all structured-output constraint fields are None.
+        """
+        return all(
+            getattr(self, field) is None
+            for field in (
+                "json",
+                "regex",
+                "choice",
+                "grammar",
+                "json_object",
+            )
+        )
 
 
 @dataclass
 class GuidedDecodingParams(StructuredOutputsParams):
-
     def __post_init__(self):
         warnings.warn(
             "GuidedDecodingParams is deprecated. This will be removed in "
             "v0.12.0 or v1.0.0, which ever is soonest. Please use "
             "StructuredOutputsParams instead.",
             DeprecationWarning,
-            stacklevel=2)
+            stacklevel=2,
+        )
         return super().__post_init__()
 
 
@@ -83,10 +123,12 @@ class RequestOutputKind(Enum):
 
 
 class SamplingParams(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        # required for @cached_property.
-        dict=True):  # type: ignore[call-arg]
+    PydanticMsgspecMixin,
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    # required for @cached_property.
+    dict=True,
+):  # type: ignore[call-arg]
     """Sampling parameters for text generation.
 
     Overall, we follow the sampling parameters from the OpenAI text completion
@@ -102,12 +144,12 @@ class SamplingParams(
         are generated and streamed cumulatively per request. To see all `n`
         outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
         in `SamplingParams`."""
-    best_of: Optional[int] = None
+    best_of: int | None = None
     """Number of output sequences that are generated from the prompt. From
     these `best_of` sequences, the top `n` sequences are returned. `best_of`
     must be greater than or equal to `n`. By default, `best_of` is set to `n`.
     Warning, this is only supported in V0."""
-    _real_n: Optional[int] = None
+    _real_n: int | None = None
     presence_penalty: float = 0.0
     """Penalizes new tokens based on whether they appear in the generated text
     so far. Values > 0 encourage the model to use new tokens, while values < 0
@@ -134,24 +176,24 @@ class SamplingParams(
     """Represents the minimum probability for a token to be considered,
     relative to the probability of the most likely token. Must be in [0, 1].
     Set to 0 to disable this."""
-    seed: Optional[int] = None
+    seed: int | None = None
     """Random seed to use for the generation."""
-    stop: Optional[Union[str, list[str]]] = None
+    stop: str | list[str] | None = None
     """String(s) that stop the generation when they are generated. The returned
     output will not contain the stop strings."""
-    stop_token_ids: Optional[list[int]] = None
+    stop_token_ids: list[int] | None = None
     """Token IDs that stop the generation when they are generated. The returned
     output will contain the stop tokens unless the stop tokens are special
     tokens."""
     ignore_eos: bool = False
     """Whether to ignore the EOS token and continue generating
     tokens after the EOS token is generated."""
-    max_tokens: Optional[int] = 16
+    max_tokens: int | None = 16
     """Maximum number of tokens to generate per output sequence."""
     min_tokens: int = 0
     """Minimum number of tokens to generate per output sequence before EOS or
     `stop_token_ids` can be generated"""
-    logprobs: Optional[int] = None
+    logprobs: int | None = None
     """Number of log probabilities to return per output token. When set to
     `None`, no probability is returned. If set to a non-`None` value, the
     result includes the log probabilities of the specified number of most
@@ -159,7 +201,7 @@ class SamplingParams(
     follows the OpenAI API: The API will always return the log probability of
     the sampled token, so there may be up to `logprobs+1` elements in the
     response. When set to -1, return all `vocab_size` log probabilities."""
-    prompt_logprobs: Optional[int] = None
+    prompt_logprobs: int | None = None
     """Number of log probabilities to return per prompt token.
     When set to -1, return all `vocab_size` log probabilities."""
     # NOTE: This parameter is only exposed at the engine level for now.
@@ -171,15 +213,14 @@ class SamplingParams(
     """Whether to skip special tokens in the output."""
     spaces_between_special_tokens: bool = True
     """Whether to add spaces between special tokens in the output."""
-    # Optional[list[LogitsProcessor]] type. We use Any here because
-    # Optional[list[LogitsProcessor]] type is not supported by msgspec.
-    logits_processors: Optional[Any] = None
+    # `list[LogitsProcessor] | None` type. We use Any here because
+    # `list[LogitsProcessor] | None` type is not supported by msgspec.
+    logits_processors: Any | None = None
     """Functions that modify logits based on previously generated tokens, and
     optionally prompt tokens as a first argument."""
     include_stop_str_in_output: bool = False
     """Whether to include the stop strings in output text."""
-    truncate_prompt_tokens: Optional[Annotated[int,
-                                               msgspec.Meta(ge=-1)]] = None
+    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
     """If set to -1, will use the truncation size supported by the model. If
     set to an integer k, will use only the last k tokens from the prompt
     (i.e., left truncation). If set to `None`, truncation is disabled."""
@@ -191,62 +232,62 @@ class SamplingParams(
     _all_stop_token_ids: set[int] = msgspec.field(default_factory=set)
 
     # Fields used to construct logits processors
-    structured_outputs: Optional[StructuredOutputsParams] = None
+    structured_outputs: StructuredOutputsParams | None = None
     """Parameters for configuring structured outputs."""
-    guided_decoding: Optional[GuidedDecodingParams] = None
+    guided_decoding: GuidedDecodingParams | None = None
     """Deprecated alias for structured_outputs."""
-    logit_bias: Optional[dict[int, float]] = None
+    logit_bias: dict[int, float] | None = None
     """If provided, the engine will construct a logits processor that applies
     these logit biases."""
-    allowed_token_ids: Optional[list[int]] = None
+    allowed_token_ids: list[int] | None = None
     """If provided, the engine will construct a logits processor which only
     retains scores for the given token ids."""
-    extra_args: Optional[dict[str, Any]] = None
+    extra_args: dict[str, Any] | None = None
     """Arbitrary additional args, that can be used by custom sampling
     implementations, plugins, etc. Not used by any in-tree sampling
     implementations."""
 
     # Fields used for bad words
-    bad_words: Optional[list[str]] = None
+    bad_words: list[str] | None = None
     """Words that are not allowed to be generated. More precisely, only the
     last token of a corresponding token sequence is not allowed when the next
     generated token can complete the sequence."""
-    _bad_words_token_ids: Optional[list[list[int]]] = None
+    _bad_words_token_ids: list[list[int]] | None = None
+
+    skip_reading_prefix_cache: bool = None
 
     @staticmethod
     def from_optional(
-        n: Optional[int] = 1,
-        best_of: Optional[int] = None,
-        presence_penalty: Optional[float] = 0.0,
-        frequency_penalty: Optional[float] = 0.0,
-        repetition_penalty: Optional[float] = 1.0,
-        temperature: Optional[float] = 1.0,
-        top_p: Optional[float] = 1.0,
+        n: int | None = 1,
+        best_of: int | None = None,
+        presence_penalty: float | None = 0.0,
+        frequency_penalty: float | None = 0.0,
+        repetition_penalty: float | None = 1.0,
+        temperature: float | None = 1.0,
+        top_p: float | None = 1.0,
         top_k: int = 0,
         min_p: float = 0.0,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, list[str]]] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        bad_words: Optional[list[str]] = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stop_token_ids: list[int] | None = None,
+        bad_words: list[str] | None = None,
         include_stop_str_in_output: bool = False,
         ignore_eos: bool = False,
-        max_tokens: Optional[int] = 16,
+        max_tokens: int | None = 16,
         min_tokens: int = 0,
-        logprobs: Optional[int] = None,
-        prompt_logprobs: Optional[int] = None,
+        logprobs: int | None = None,
+        prompt_logprobs: int | None = None,
         detokenize: bool = True,
         skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
-        logits_processors: Optional[list[LogitsProcessor]] = None,
-        truncate_prompt_tokens: Optional[Annotated[int,
-                                                   msgspec.Meta(
-                                                       ge=-1)]] = None,
+        logits_processors: list[LogitsProcessor] | None = None,
+        truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
         output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
-        structured_outputs: Optional[StructuredOutputsParams] = None,
-        guided_decoding: Optional[GuidedDecodingParams] = None,
-        logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
-        allowed_token_ids: Optional[list[int]] = None,
-        extra_args: Optional[dict[str, Any]] = None,
+        structured_outputs: StructuredOutputsParams | None = None,
+        guided_decoding: GuidedDecodingParams | None = None,
+        logit_bias: dict[int, float] | dict[str, float] | None = None,
+        allowed_token_ids: list[int] | None = None,
+        extra_args: dict[str, Any] | None = None,
     ) -> "SamplingParams":
         if logit_bias is not None:
             # Convert token_id to integer
@@ -261,19 +302,19 @@ class SamplingParams(
                 "v0.12.0 or v1.0.0, which ever is soonest. Please use "
                 "structured_outputs instead.",
                 DeprecationWarning,
-                stacklevel=2)
+                stacklevel=2,
+            )
             structured_outputs = guided_decoding
             guided_decoding = None
 
         return SamplingParams(
             n=1 if n is None else n,
             best_of=best_of,
-            presence_penalty=0.0
-            if presence_penalty is None else presence_penalty,
-            frequency_penalty=0.0
-            if frequency_penalty is None else frequency_penalty,
+            presence_penalty=0.0 if presence_penalty is None else presence_penalty,
+            frequency_penalty=0.0 if frequency_penalty is None else frequency_penalty,
             repetition_penalty=1.0
-            if repetition_penalty is None else repetition_penalty,
+            if repetition_penalty is None
+            else repetition_penalty,
             temperature=1.0 if temperature is None else temperature,
             top_p=1.0 if top_p is None else top_p,
             top_k=top_k,
@@ -301,17 +342,18 @@ class SamplingParams(
         )
 
     def __post_init__(self) -> None:
-        # how we deal with `best_of``:
-        # if `best_of`` is not set, we default to `n`;
-        # if `best_of`` is set, we set `n`` to `best_of`,
-        # and set `_real_n`` to the original `n`.
+        # how we deal with `best_of`:
+        # if `best_of` is not set, we default to `n`;
+        # if `best_of` is set, we set `n` to `best_of`,
+        # and set `_real_n` to the original `n`.
         # when we return the result, we will check
         # if we need to return `n` or `_real_n` results
         if self.best_of:
             if self.best_of < self.n:
                 raise ValueError(
                     f"best_of must be greater than or equal to n, "
-                    f"got n={self.n} and best_of={self.best_of}.")
+                    f"got n={self.n} and best_of={self.best_of}."
+                )
             if not self._real_n:
                 self._real_n = self.n
                 self.n = self.best_of
@@ -320,7 +362,10 @@ class SamplingParams(
             logger.warning(
                 "temperature %s is less than %s, which may cause numerical "
                 "errors nan or inf in tensors. We have maxed it out to %s.",
-                self.temperature, _MAX_TEMP, _MAX_TEMP)
+                self.temperature,
+                _MAX_TEMP,
+                _MAX_TEMP,
+            )
             self.temperature = max(self.temperature, _MAX_TEMP)
 
         if self.seed == -1:
@@ -366,101 +411,122 @@ class SamplingParams(
                 "v0.12.0 or v1.0.0, which ever is soonest. Please use "
                 "structured_outputs instead.",
                 DeprecationWarning,
-                stacklevel=2)
+                stacklevel=2,
+            )
             self.structured_outputs = self.guided_decoding
             self.guided_decoding = None
 
+        if self.skip_reading_prefix_cache is None:
+            # If prefix caching is enabled,
+            # the output of prompt logprobs may less than n_prompt_tokens,
+            # we need to skip reading cache at this request.
+            self.skip_reading_prefix_cache = self.prompt_logprobs is not None
+
     def _verify_args(self) -> None:
         if not isinstance(self.n, int):
-            raise ValueError(f"n must be an int, but is of "
-                             f"type {type(self.n)}")
+            raise ValueError(f"n must be an int, but is of type {type(self.n)}")
         if self.n < 1:
             raise ValueError(f"n must be at least 1, got {self.n}.")
         if self.best_of is not None:
             if not isinstance(self.best_of, int):
                 raise ValueError(
-                    f"best_of must be an integer, got {type(self.best_of)}")
+                    f"best_of must be an integer, got {type(self.best_of)}"
+                )
             if self.best_of < 1:
-                raise ValueError(
-                    f"best_of must be at least 1, got {self.best_of}")
+                raise ValueError(f"best_of must be at least 1, got {self.best_of}")
             if self.best_of < self.n:
                 raise ValueError(
                     f"best_of must be greater than or equal to n, "
-                    f"got n={self.n} and best_of={self.best_of}.")
+                    f"got n={self.n} and best_of={self.best_of}."
+                )
         if not -2.0 <= self.presence_penalty <= 2.0:
-            raise ValueError("presence_penalty must be in [-2, 2], got "
-                             f"{self.presence_penalty}.")
+            raise ValueError(
+                f"presence_penalty must be in [-2, 2], got {self.presence_penalty}."
+            )
         if not -2.0 <= self.frequency_penalty <= 2.0:
-            raise ValueError("frequency_penalty must be in [-2, 2], got "
-                             f"{self.frequency_penalty}.")
+            raise ValueError(
+                f"frequency_penalty must be in [-2, 2], got {self.frequency_penalty}."
+            )
         if self.repetition_penalty <= 0.0:
             raise ValueError(
                 "repetition_penalty must be greater than zero, got "
-                f"{self.repetition_penalty}.")
+                f"{self.repetition_penalty}."
+            )
         if self.temperature < 0.0:
             raise ValueError(
-                f"temperature must be non-negative, got {self.temperature}.")
+                f"temperature must be non-negative, got {self.temperature}."
+            )
         if not 0.0 < self.top_p <= 1.0:
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
         # quietly accept -1 as disabled, but prefer 0
         if self.top_k < -1:
-            raise ValueError(f"top_k must be 0 (disable), or at least 1, "
-                             f"got {self.top_k}.")
+            raise ValueError(
+                f"top_k must be 0 (disable), or at least 1, got {self.top_k}."
+            )
         if not isinstance(self.top_k, int):
             raise TypeError(
-                f"top_k must be an integer, got {type(self.top_k).__name__}")
+                f"top_k must be an integer, got {type(self.top_k).__name__}"
+            )
         if not 0.0 <= self.min_p <= 1.0:
-            raise ValueError("min_p must be in [0, 1], got "
-                             f"{self.min_p}.")
+            raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
         if self.max_tokens is not None and self.max_tokens < 1:
-            raise ValueError(
-                f"max_tokens must be at least 1, got {self.max_tokens}.")
+            raise ValueError(f"max_tokens must be at least 1, got {self.max_tokens}.")
         if self.min_tokens < 0:
-            raise ValueError(f"min_tokens must be greater than or equal to 0, "
-                             f"got {self.min_tokens}.")
+            raise ValueError(
+                f"min_tokens must be greater than or equal to 0, got {self.min_tokens}."
+            )
         if self.max_tokens is not None and self.min_tokens > self.max_tokens:
             raise ValueError(
                 f"min_tokens must be less than or equal to "
-                f"max_tokens={self.max_tokens}, got {self.min_tokens}.")
-        if (self.logprobs is not None and self.logprobs != -1
-                and self.logprobs < 0):
+                f"max_tokens={self.max_tokens}, got {self.min_tokens}."
+            )
+        if self.logprobs is not None and self.logprobs != -1 and self.logprobs < 0:
             raise ValueError(
-                f"logprobs must be non-negative or -1, got {self.logprobs}.")
-        if (self.prompt_logprobs is not None and self.prompt_logprobs != -1
-                and self.prompt_logprobs < 0):
+                f"logprobs must be non-negative or -1, got {self.logprobs}."
+            )
+        if (
+            self.prompt_logprobs is not None
+            and self.prompt_logprobs != -1
+            and self.prompt_logprobs < 0
+        ):
             raise ValueError(
                 f"prompt_logprobs must be non-negative or -1, got "
-                f"{self.prompt_logprobs}.")
-        if (self.truncate_prompt_tokens is not None
-                and (self.truncate_prompt_tokens == 0
-                     or self.truncate_prompt_tokens < -1)):
+                f"{self.prompt_logprobs}."
+            )
+        if self.truncate_prompt_tokens is not None and (
+            self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1
+        ):
             raise ValueError(
                 f"truncate_prompt_tokens must be an integer >= 1 or -1, "
-                f"got {self.truncate_prompt_tokens}")
+                f"got {self.truncate_prompt_tokens}"
+            )
         assert isinstance(self.stop_token_ids, list)
         if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
-            raise ValueError(f"stop_token_ids must contain only integers, "
-                             f"got {self.stop_token_ids}.")
+            raise ValueError(
+                f"stop_token_ids must contain only integers, got {self.stop_token_ids}."
+            )
         assert isinstance(self.stop, list)
         if any(not stop_str for stop_str in self.stop):
             raise ValueError("stop cannot contain an empty string.")
         if self.stop and not self.detokenize:
             raise ValueError(
                 "stop strings are only supported when detokenize is True. "
-                "Set detokenize=True to use stop.")
+                "Set detokenize=True to use stop."
+            )
         if self.best_of != self._real_n and self.output_kind == (
-                RequestOutputKind.DELTA):
+            RequestOutputKind.DELTA
+        ):
             raise ValueError("best_of must equal n to use output_kind=DELTA")
 
     def _verify_greedy_sampling(self) -> None:
         if self.n > 1:
-            raise ValueError("n must be 1 when using greedy sampling, "
-                             f"got {self.n}.")
+            raise ValueError(f"n must be 1 when using greedy sampling, got {self.n}.")
 
     def update_from_generation_config(
-            self,
-            generation_config: dict[str, Any],
-            model_eos_token_id: Optional[int] = None) -> None:
+        self,
+        generation_config: dict[str, Any],
+        model_eos_token_id: int | None = None,
+    ) -> None:
         """Update if there are non-default values from generation_config"""
 
         if model_eos_token_id is not None:
@@ -494,30 +560,33 @@ class SamplingParams(
             for add_prefix_space in [False, True]:
                 prefix = " " if add_prefix_space else ""
                 prompt = prefix + bad_word.lstrip()
-                prompt_token_ids = tokenizer.encode(text=prompt,
-                                                    add_special_tokens=False)
+                prompt_token_ids = tokenizer.encode(
+                    text=prompt, add_special_tokens=False
+                )
 
                 # If no space at the beginning
                 # or if prefix space produces a new word token
                 if (not add_prefix_space) or (
-                        add_prefix_space and prompt_token_ids[0]
-                        != self._bad_words_token_ids[-1][0]
-                        and len(prompt_token_ids) == len(
-                            self._bad_words_token_ids[-1])):
+                    add_prefix_space
+                    and prompt_token_ids[0] != self._bad_words_token_ids[-1][0]
+                    and len(prompt_token_ids) == len(self._bad_words_token_ids[-1])
+                ):
                     self._bad_words_token_ids.append(prompt_token_ids)
 
         invalid_token_ids = [
-            token_id for bad_words_token_ids in self._bad_words_token_ids
+            token_id
+            for bad_words_token_ids in self._bad_words_token_ids
             for token_id in bad_words_token_ids
             if token_id < 0 or token_id > tokenizer.max_token_id
         ]
         if len(invalid_token_ids) > 0:
             raise ValueError(
-                f"The model vocabulary size is {tokenizer.max_token_id+1},"
+                f"The model vocabulary size is {tokenizer.max_token_id + 1},"
                 f" but the following tokens"
                 f" were specified as bad: {invalid_token_ids}."
                 f" All token id values should be integers satisfying:"
-                f" 0 <= token_id <= {tokenizer.max_token_id}.")
+                f" 0 <= token_id <= {tokenizer.max_token_id}."
+            )
 
     @cached_property
     def sampling_type(self) -> SamplingType:
@@ -532,7 +601,7 @@ class SamplingParams(
         return self._all_stop_token_ids
 
     @property
-    def bad_words_token_ids(self) -> Optional[list[list[int]]]:
+    def bad_words_token_ids(self) -> list[list[int]] | None:
         # For internal use only. Backward compatibility not guaranteed
         return self._bad_words_token_ids
 
@@ -545,10 +614,14 @@ class SamplingParams(
         See https://github.com/vllm-project/vllm/issues/3087
         """
 
-        logit_processor_refs = None if self.logits_processors is None else {
-            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
-            for lp in self.logits_processors
-        }
+        logit_processor_refs = (
+            None
+            if self.logits_processors is None
+            else {
+                id(lp): lp.clone() if hasattr(lp, "clone") else lp
+                for lp in self.logits_processors
+            }
+        )
         return copy.deepcopy(self, memo=logit_processor_refs)
 
     def __repr__(self) -> str:
@@ -576,15 +649,18 @@ class SamplingParams(
             f"{self.spaces_between_special_tokens}, "
             f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
             f"structured_outputs={self.structured_outputs}, "
-            f"extra_args={self.extra_args})")
+            f"extra_args={self.extra_args})"
+        )
 
 
 class BeamSearchParams(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        # required for @cached_property.
-        dict=True):  # type: ignore[call-arg]
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    # required for @cached_property.
+    dict=True,
+):  # type: ignore[call-arg]
     """Beam search parameters for text generation."""
+
     beam_width: int
     max_tokens: int
     ignore_eos: bool = False
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index 055f28914ad597c2a8561be63107783d9fdff145..05760f3f829988d4640955b6d19ff2549b1d28cd 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -5,7 +5,6 @@ import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Union
 
 _SCALAR_TYPES_ID_MAP = {}
 
@@ -70,20 +69,19 @@ class ScalarType:
     """
 
     def _floating_point_max_int(self) -> int:
-        assert (
-            self.mantissa <= 52 and self.exponent <= 11
-        ), f"Cannot represent max/min as a double for type {self.__str__()}"
+        assert self.mantissa <= 52 and self.exponent <= 11, (
+            f"Cannot represent max/min as a double for type {self.__str__()}"
+        )
 
         max_mantissa = (1 << self.mantissa) - 1
         if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN:
             max_mantissa = max_mantissa - 1
 
         max_exponent = (1 << self.exponent) - 2
-        if (self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN
-                or self.nan_repr == NanRepr.NONE):
-            assert (
-                self.exponent < 11
-            ), f"Cannot represent max/min as a double for type {self.__str__()}"
+        if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN or self.nan_repr == NanRepr.NONE:
+            assert self.exponent < 11, (
+                f"Cannot represent max/min as a double for type {self.__str__()}"
+            )
             max_exponent = max_exponent + 1
 
         # adjust the exponent to match that of a double
@@ -96,38 +94,39 @@ class ScalarType:
         exponent_bias = (1 << (self.exponent - 1)) - 1
         exponent_bias_double = (1 << 10) - 1  # double e = 11
 
-        max_exponent_double = (max_exponent - exponent_bias +
-                               exponent_bias_double)
+        max_exponent_double = max_exponent - exponent_bias + exponent_bias_double
 
         # shift the mantissa and exponent into the proper positions for an
         # IEEE double and bitwise-or them together.
-        return (max_mantissa <<
-                (52 - self.mantissa)) | (max_exponent_double << 52)
+        return (max_mantissa << (52 - self.mantissa)) | (max_exponent_double << 52)
 
     def _floating_point_max(self) -> float:
         double_raw = self._floating_point_max_int()
-        return struct.unpack('!d', struct.pack('!Q', double_raw))[0]
+        return struct.unpack("!d", struct.pack("!Q", double_raw))[0]
 
-    def _raw_max(self) -> Union[int, float]:
+    def _raw_max(self) -> int | float:
         if self.is_floating_point():
             return self._floating_point_max()
         else:
-            assert (self.size_bits < 64 or self.size_bits == 64
-                    and self.is_signed()), "Cannot represent max as an int"
+            assert self.size_bits < 64 or self.size_bits == 64 and self.is_signed(), (
+                "Cannot represent max as an int"
+            )
             return (1 << self.mantissa) - 1
 
-    def _raw_min(self) -> Union[int, float]:
+    def _raw_min(self) -> int | float:
         if self.is_floating_point():
-            assert self.is_signed(
-            ), "We currently assume all floating point types are signed"
+            assert self.is_signed(), (
+                "We currently assume all floating point types are signed"
+            )
             sign_bit_double = 1 << 63
 
             max_raw = self._floating_point_max_int()
             min_raw = max_raw | sign_bit_double
-            return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
+            return struct.unpack("!d", struct.pack("!Q", min_raw))[0]
         else:
-            assert (not self.is_signed() or self.size_bits
-                    <= 64), "Cannot represent min as a int64_t"
+            assert not self.is_signed() or self.size_bits <= 64, (
+                "Cannot represent min as a int64_t"
+            )
 
             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -158,8 +157,7 @@ class ScalarType:
         or_and_advance(self._finite_values_only, 1)
         or_and_advance(self.nan_repr.value, 8)
 
-        assert offset <= 64, \
-            f"ScalarType fields too big {offset} to fit into an int64"
+        assert offset <= 64, f"ScalarType fields too big {offset} to fit into an int64"
 
         _SCALAR_TYPES_ID_MAP[val] = self
 
@@ -169,14 +167,14 @@ class ScalarType:
     def size_bits(self) -> int:
         return self.exponent + self.mantissa + int(self.signed)
 
-    def min(self) -> Union[int, float]:
+    def min(self) -> int | float:
         """
         Min representable value for this scalar type.
         (accounting for bias if there is one)
         """
         return self._raw_min() - self.bias
 
-    def max(self) -> Union[int, float]:
+    def max(self) -> int | float:
         """
         Max representable value for this scalar type.
         (accounting for bias if there is one)
@@ -215,8 +213,7 @@ class ScalarType:
         If the type is a floating point type that follows IEEE 754
         conventions
         """
-        return self.nan_repr == NanRepr.IEEE_754.value and \
-            not self._finite_values_only
+        return self.nan_repr == NanRepr.IEEE_754.value and not self._finite_values_only
 
     def __str__(self) -> str:
         """
@@ -232,8 +229,14 @@ class ScalarType:
           - if bias is not present it means its zero
         """
         if self.is_floating_point():
-            ret = "float" + str(self.size_bits) + "_e" + str(
-                self.exponent) + "m" + str(self.mantissa)
+            ret = (
+                "float"
+                + str(self.size_bits)
+                + "_e"
+                + str(self.exponent)
+                + "m"
+                + str(self.mantissa)
+            )
 
             if not self.is_ieee_754():
                 if self._finite_values_only:
@@ -261,41 +264,43 @@ class ScalarType:
     #
 
     @classmethod
-    def int_(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
+    def int_(cls, size_bits: int, bias: int | None) -> "ScalarType":
         "Create a signed integer scalar type (size_bits includes sign-bit)."
         ret = cls(0, size_bits - 1, True, bias if bias else 0)
         ret.id  # noqa B018: make sure the id is cached
         return ret
 
     @classmethod
-    def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
+    def uint(cls, size_bits: int, bias: int | None) -> "ScalarType":
         """Create an unsigned integer scalar type."""
         ret = cls(0, size_bits, False, bias if bias else 0)
         ret.id  # noqa B018: make sure the id is cached
         return ret
 
     @classmethod
-    def float_IEEE754(cls, exponent: int, mantissa: int) -> 'ScalarType':
+    def float_IEEE754(cls, exponent: int, mantissa: int) -> "ScalarType":
         """
         Create a standard floating point type
         (i.e. follows IEEE 754 conventions).
         """
-        assert (mantissa > 0 and exponent > 0)
+        assert mantissa > 0 and exponent > 0
         ret = cls(exponent, mantissa, True, 0)
         ret.id  # noqa B018: make sure the id is cached
         return ret
 
     @classmethod
-    def float_(cls, exponent: int, mantissa: int, finite_values_only: bool,
-               nan_repr: NanRepr) -> 'ScalarType':
+    def float_(
+        cls, exponent: int, mantissa: int, finite_values_only: bool, nan_repr: NanRepr
+    ) -> "ScalarType":
         """
         Create a non-standard floating point type
         (i.e. does not follow IEEE 754 conventions).
         """
-        assert (mantissa > 0 and exponent > 0)
-        assert (nan_repr != NanRepr.IEEE_754), (
+        assert mantissa > 0 and exponent > 0
+        assert nan_repr != NanRepr.IEEE_754, (
             "use `float_IEEE754` constructor for floating point types that "
-            "follow IEEE 754 conventions")
+            "follow IEEE 754 conventions"
+        )
         ret = cls(exponent, mantissa, True, 0, finite_values_only, nan_repr)
         ret.id  # noqa B018: make sure the id is cached
         return ret
@@ -303,8 +308,7 @@ class ScalarType:
     @classmethod
     def from_id(cls, scalar_type_id: int):
         if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
-            raise ValueError(
-                f"scalar_type_id {scalar_type_id} doesn't exists.")
+            raise ValueError(f"scalar_type_id {scalar_type_id} doesn't exists.")
         return _SCALAR_TYPES_ID_MAP[scalar_type_id]
 
 
@@ -327,14 +331,16 @@ class scalar_types:
     uint8 = ScalarType.uint(8, None)
     float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN)
     float8_e5m2 = ScalarType.float_IEEE754(5, 2)
-    float8_e8m0fnu = ScalarType(8, 0, False, 0, True,
-                                NanRepr.EXTD_RANGE_MAX_MIN)
+    float8_e8m0fnu = ScalarType(8, 0, False, 0, True, NanRepr.EXTD_RANGE_MAX_MIN)
     float16_e8m7 = ScalarType.float_IEEE754(8, 7)
     float16_e5m10 = ScalarType.float_IEEE754(5, 10)
 
     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
+    # and https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)
 
+    float6_e2m3f = ScalarType.float_(2, 3, True, NanRepr.NONE)
+
     # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
     float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
 
diff --git a/vllm/scripts.py b/vllm/scripts.py
index 7a7fdccf0a32bf06baa24cde1e5b8d70d8b962a9..f158860726beb167c206b9773e2f8461abfb2087 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -10,6 +10,8 @@ logger = init_logger(__name__)
 # Backwards compatibility for the move from vllm.scripts to
 # vllm.entrypoints.cli.main
 def main():
-    logger.warning("vllm.scripts.main() is deprecated. Please re-install "
-                   "vllm or use vllm.entrypoints.cli.main.main() instead.")
+    logger.warning(
+        "vllm.scripts.main() is deprecated. Please re-install "
+        "vllm or use vllm.entrypoints.cli.main.main() instead."
+    )
     vllm_main()
diff --git a/vllm/sequence.py b/vllm/sequence.py
index e5f23d47a660265bc9b7ef4cb731caae00d6c907..6d20ca9aac225d6ecc9365cb9e817d44aac4dc13 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1,15 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sequence and its related classes."""
+
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
-import msgspec
 import torch
 
 if TYPE_CHECKING:
-    from vllm.v1.worker.kv_connector_model_runner_mixin import (
-        KVConnectorOutput)
+    from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput
 else:
     KVConnectorOutput = Any
 
@@ -36,15 +35,16 @@ class RequestMetrics:
                             will include model forward, block/sync across
                             workers, cpu-gpu sync time and sampling time.
     """
+
     arrival_time: float
     last_token_time: float
-    first_scheduled_time: Optional[float]
-    first_token_time: Optional[float]
-    time_in_queue: Optional[float]
-    finished_time: Optional[float] = None
-    scheduler_time: Optional[float] = None
-    model_forward_time: Optional[float] = None
-    model_execute_time: Optional[float] = None
+    first_scheduled_time: float | None
+    first_token_time: float | None
+    time_in_queue: float | None
+    finished_time: float | None = None
+    scheduler_time: float | None = None
+    model_forward_time: float | None = None
+    model_execute_time: float | None = None
 
 
 # cannot use msgspec.Struct here because Dynamo does not support it
@@ -53,21 +53,26 @@ class IntermediateTensors:
     """For all pipeline stages except the last, we need to return the hidden
     states and residuals to be sent to the next stage. This data structure
     contains the hidden states and residuals for a request.
-    
+
     Each stage also needs to handle its own kv_connector_output.
     """
 
     tensors: dict[str, torch.Tensor]
-    kv_connector_output: Optional[KVConnectorOutput]
+    kv_connector_output: KVConnectorOutput | None
 
-    def __init__(self, tensors):
+    def __init__(
+        self,
+        tensors: dict[str, torch.Tensor],
+        kv_connector_output: KVConnectorOutput | None = None,
+    ) -> None:
         # manually define this function, so that
         # Dynamo knows `IntermediateTensors()` comes from this file.
         # Otherwise, dataclass will generate this function by evaluating
         # a string, and we will lose the information about the source file.
         self.tensors = tensors
+        self.kv_connector_output = kv_connector_output
 
-    def __getitem__(self, key: Union[str, slice]):
+    def __getitem__(self, key: str | slice):
         if isinstance(key, str):
             return self.tensors[key]
         elif isinstance(key, slice):
@@ -87,17 +92,7 @@ class IntermediateTensors:
             return False
         if self.tensors.keys() != other.tensors.keys():
             return False
-        return all(
-            torch.equal(self.tensors[k], other.tensors[k])
-            for k in self.tensors)
+        return all(torch.equal(self.tensors[k], other.tensors[k]) for k in self.tensors)
 
     def __repr__(self) -> str:
         return f"IntermediateTensors(tensors={self.tensors})"
-
-
-class ExecuteModelRequest(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True):  # type: ignore[call-arg]
-    # Placeholder. Remove.
-    pass
diff --git a/vllm/tasks.py b/vllm/tasks.py
index 85c5c6e436205c0fe732497e700c6af2102dde21..b02cde74c12a9763341c9dfab2940c8528647669 100644
--- a/vllm/tasks.py
+++ b/vllm/tasks.py
@@ -5,7 +5,9 @@ from typing import Literal, get_args
 GenerationTask = Literal["generate", "transcription"]
 GENERATION_TASKS = get_args(GenerationTask)
 
-PoolingTask = Literal["encode", "embed", "classify", "score"]
+PoolingTask = Literal[
+    "embed", "classify", "score", "token_embed", "token_classify", "plugin"
+]
 POOLING_TASKS = get_args(PoolingTask)
 
 SupportedTask = Literal[GenerationTask, PoolingTask]
diff --git a/vllm/test_utils.py b/vllm/test_utils.py
deleted file mode 100644
index 91dcc2fd84e1732663bf99b67e322c397bf0d772..0000000000000000000000000000000000000000
--- a/vllm/test_utils.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-MODELS_ON_S3 = [
-    "adept/fuyu-8b",
-    "ai21labs/AI21-Jamba-1.5-Mini",
-    "ai21labs/Jamba-tiny-random",
-    "ai21labs/Jamba-tiny-reward-dev",
-    "allenai/Molmo-7B-D-0924",
-    "allenai/OLMo-1B-hf",
-    "allenai/OLMoE-1B-7B-0924-Instruct",
-    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
-    "AMead10/Llama-3.2-1B-Instruct-AWQ",
-    "hmellor/Ilama-3.2-1B",
-    "BAAI/bge-base-en-v1.5",
-    "BAAI/bge-multilingual-gemma2",
-    "BAAI/bge-reranker-v2-m3",
-    "bigcode/starcoder2-3b",
-    "cross-encoder/ms-marco-MiniLM-L-6-v2",
-    "cross-encoder/quora-roberta-base",
-    "deepseek-ai/deepseek-vl2-tiny",
-    "distilbert/distilgpt2",
-    "facebook/bart-base",
-    "facebook/bart-large-cnn",
-    # "fixie-ai/ultravox-v0_5-llama-3_2-1b",
-    "google/gemma-1.1-2b-it",
-    "google/gemma-2-2b-it",
-    "google/paligemma-3b-pt-224",
-    "h2oai/h2ovl-mississippi-800m",
-    "HuggingFaceM4/Idefics3-8B-Llama3",
-    "internlm/internlm2-1_8b-reward",
-    "intfloat/e5-mistral-7b-instruct",
-    "intfloat/multilingual-e5-small",
-    "jason9693/Qwen2.5-1.5B-apeach",
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-    "llava-hf/llava-v1.6-mistral-7b-hf",
-    "llava-hf/LLaVA-NeXT-Video-7B-hf",
-    # "meta-llama/Llama-2-7b-hf",
-    "meta-llama/Llama-3.2-1B",
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "meta-llama/Meta-Llama-3-8B",
-    "microsoft/phi-2",
-    "microsoft/Phi-3-mini-4k-instruct",
-    "microsoft/Phi-3-small-8k-instruct",
-    "microsoft/Phi-3-vision-128k-instruct",
-    "microsoft/Phi-3.5-MoE-instruct",
-    "microsoft/Phi-3.5-vision-instruct",
-    # "mistralai/Mistral-7B-Instruct-v0.1",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "mistralai/Pixtral-12B-2409",
-    "mistral-community/Mixtral-8x22B-v0.1-AWQ",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
-    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
-    "nvidia/NVLM-D-72B",
-    "openai-community/gpt2",
-    # "openai/whisper-large-v3",
-    "openbmb/MiniCPM-o-2_6",
-    "openbmb/MiniCPM-V-2_6",
-    "OpenGVLab/InternVL2-1B",
-    "parasail-ai/GritLM-7B-vllm",
-    "Qwen/Qwen1.5-MoE-A2.7B-Chat",
-    "Qwen/Qwen2-7B-Instruct",
-    "Qwen/Qwen2-Audio-7B-Instruct",
-    "Qwen/Qwen2-VL-2B-Instruct",
-    "Qwen/Qwen2.5-1.5B-Instruct",
-    "Qwen/Qwen2.5-Math-PRM-7B",
-    "Qwen/Qwen2.5-Math-RM-72B",
-    "Qwen/Qwen2.5-VL-3B-Instruct",
-    "royokong/e5-v",
-    "sentence-transformers/all-roberta-large-v1",
-    "sentence-transformers/stsb-roberta-base-v2",
-    "allenai/OLMo-2-0425-1B",
-    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
-    "ssmits/Qwen2-7B-Instruct-embed-base",
-    "stabilityai/stablelm-3b-4e1t",
-    "stabilityai/stablelm-zephyr-3b",
-    "state-spaces/mamba-130m-hf",
-    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
-    "zai-org/glm-4v-9b",
-    "TIGER-Lab/Mantis-8B-siglip-llama3",
-    "TIGER-Lab/VLM2Vec-Full",
-    "tiiuae/falcon-40b",
-    "tiiuae/falcon-mamba-7b-instruct",
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "upstage/solar-pro-preview-instruct",
-]
-
-MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 7537e9901a044d469eb3bdb979890e2020e0a7ae..01bbebf35cfc1d43748e367aab01d7aff773f0fd 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -3,26 +3,28 @@
 
 import os
 from collections.abc import Mapping
-from typing import Optional
 
 from vllm.logger import init_logger
-from vllm.utils import run_once
+from vllm.utils.func_utils import run_once
 
 TRACE_HEADERS = ["traceparent", "tracestate"]
 
 logger = init_logger(__name__)
 
 _is_otel_imported = False
-otel_import_error_traceback: Optional[str] = None
+otel_import_error_traceback: str | None = None
 try:
     from opentelemetry.context.context import Context
     from opentelemetry.sdk.environment_variables import (
-        OTEL_EXPORTER_OTLP_TRACES_PROTOCOL)
+        OTEL_EXPORTER_OTLP_TRACES_PROTOCOL,
+    )
     from opentelemetry.sdk.trace import TracerProvider
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
     from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider
     from opentelemetry.trace.propagation.tracecontext import (
-        TraceContextTextMapPropagator)
+        TraceContextTextMapPropagator,
+    )
+
     _is_otel_imported = True
 except ImportError:
     # Capture and format traceback to provide detailed context for the import
@@ -30,6 +32,7 @@ except ImportError:
     # memory leaks.
     # See https://github.com/vllm-project/vllm/pull/7266#discussion_r1707395458
     import traceback
+
     otel_import_error_traceback = traceback.format_exc()
 
     class Context:  # type: ignore
@@ -49,13 +52,15 @@ def is_otel_available() -> bool:
     return _is_otel_imported
 
 
-def init_tracer(instrumenting_module_name: str,
-                otlp_traces_endpoint: str) -> Optional[Tracer]:
+def init_tracer(
+    instrumenting_module_name: str, otlp_traces_endpoint: str
+) -> Tracer | None:
     if not is_otel_available():
         raise ValueError(
             "OpenTelemetry is not available. Unable to initialize "
             "a tracer. Ensure OpenTelemetry packages are installed. "
-            f"Original error:\n{otel_import_error_traceback}")
+            f"Original error:\n{otel_import_error_traceback}"
+        )
     trace_provider = TracerProvider()
 
     span_exporter = get_span_exporter(otlp_traces_endpoint)
@@ -70,19 +75,19 @@ def get_span_exporter(endpoint):
     protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc")
     if protocol == "grpc":
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
-            OTLPSpanExporter)
+            OTLPSpanExporter,
+        )
     elif protocol == "http/protobuf":
         from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
-            OTLPSpanExporter)  # type: ignore
+            OTLPSpanExporter,  # type: ignore
+        )
     else:
-        raise ValueError(
-            f"Unsupported OTLP protocol '{protocol}' is configured")
+        raise ValueError(f"Unsupported OTLP protocol '{protocol}' is configured")
 
     return OTLPSpanExporter(endpoint=endpoint)
 
 
-def extract_trace_context(
-        headers: Optional[Mapping[str, str]]) -> Optional[Context]:
+def extract_trace_context(headers: Mapping[str, str] | None) -> Context | None:
     if is_otel_available():
         headers = headers or {}
         return TraceContextTextMapPropagator().extract(headers)
@@ -91,7 +96,6 @@ def extract_trace_context(
 
 
 def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
-
     return {h: headers[h] for h in TRACE_HEADERS if h in headers}
 
 
@@ -113,17 +117,13 @@ class SpanAttributes:
     GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
     GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
     # Time taken in the forward pass for this across all workers
-    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
-        "gen_ai.latency.time_in_model_forward")
+    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
     # Time taken in the model execute function. This will include model
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
-    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
-        "gen_ai.latency.time_in_model_execute")
-    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = \
-        "gen_ai.latency.time_in_model_prefill"
+    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
+    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = "gen_ai.latency.time_in_model_prefill"
     GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
-    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = \
-        "gen_ai.latency.time_in_model_inference"
+    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = "gen_ai.latency.time_in_model_inference"
 
 
 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
@@ -132,5 +132,4 @@ def contains_trace_headers(headers: Mapping[str, str]) -> bool:
 
 @run_once
 def log_tracing_disabled_warning() -> None:
-    logger.warning(
-        "Received a request with trace context but tracing is disabled")
+    logger.warning("Received a request with trace context but tracing is disabled")
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
index 6d4231baca50b390c22fbd6cd037a65ff9797138..649df9a4f0222a240a9c90282919467811739be7 100644
--- a/vllm/transformers_utils/__init__.py
+++ b/vllm/transformers_utils/__init__.py
@@ -10,10 +10,11 @@ if envs.VLLM_USE_MODELSCOPE:
         from packaging import version
 
         # patch_hub begins from modelscope>=1.18.1
-        if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
+        if version.parse(modelscope.__version__) <= version.parse("1.18.0"):
             raise ImportError(
-                'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
-                'install by `pip install modelscope -U`')
+                "Using vLLM with ModelScope needs modelscope>=1.18.1, please "
+                "install by `pip install modelscope -U`"
+            )
         from modelscope.utils.hf_util import patch_hub
 
         # Patch hub to download models from modelscope to speed up.
@@ -21,4 +22,5 @@ if envs.VLLM_USE_MODELSCOPE:
     except ImportError as err:
         raise ImportError(
             "Please install modelscope>=1.18.1 via "
-            "`pip install modelscope>=1.18.1` to use ModelScope.") from err
+            "`pip install modelscope>=1.18.1` to use ModelScope."
+        ) from err
diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py
index 3a97f2c056181eaad26d6e824e0aaf6a448117ff..fe84b6c152eef50bbcdbeadf176c6e62b558a1c6 100644
--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import TypeAlias
 
 from vllm.logger import init_logger
 
@@ -9,19 +10,17 @@ logger = init_logger(__file__)
 
 CHAT_TEMPLATES_DIR = Path(__file__).parent
 
-ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]
+ChatTemplatePath: TypeAlias = Path | Callable[[str], Path | None]
 
 
-def _get_qwen_chat_template_fallback(
-        tokenizer_name_or_path: str) -> Optional[Path]:
+def _get_qwen_chat_template_fallback(tokenizer_name_or_path: str) -> Path | None:
     if tokenizer_name_or_path.endswith("-Chat"):
         return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
 
     return CHAT_TEMPLATES_DIR / "template_basic.jinja"
 
 
-def _get_minicpmv_chat_template_fallback(
-        tokenizer_name_or_path: str) -> Optional[Path]:
+def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Path | None:
     # MiniCPM-V-4.5 version uses a dedicated template
     if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path:
         return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja"
@@ -30,17 +29,19 @@ def _get_minicpmv_chat_template_fallback(
     return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
 
 
-# yapf: disable
 _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
     "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
     "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "deepseek_ocr": CHAT_TEMPLATES_DIR / "template_deepseek_ocr.jinja",
     "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
     "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
     "minicpmv": _get_minicpmv_chat_template_fallback,
     "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
     "qwen": _get_qwen_chat_template_fallback,
+    "siglip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "siglip2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
 }
-# yapf: enable
 
 
 def register_chat_template_fallback_path(
@@ -50,8 +51,10 @@ def register_chat_template_fallback_path(
     if model_type in _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK:
         logger.warning(
             "Model type %s already has a chat template registered. "
-            "It will be overwritten by the new chat template %s.", model_type,
-            chat_template)
+            "It will be overwritten by the new chat template %s.",
+            model_type,
+            chat_template,
+        )
 
     _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK[model_type] = chat_template
 
@@ -59,7 +62,7 @@ def register_chat_template_fallback_path(
 def get_chat_template_fallback_path(
     model_type: str,
     tokenizer_name_or_path: str,
-) -> Optional[Path]:
+) -> Path | None:
     chat_template = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK.get(model_type)
     if callable(chat_template):
         chat_template = chat_template(tokenizer_name_or_path)
diff --git a/vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja b/vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..287abe3586425dc7168ed933cb1058a7455d737d
--- /dev/null
+++ b/vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja
@@ -0,0 +1,14 @@
+{%- if messages[0]['role'] == 'system' -%}
+    {%- set system_message = messages[0]['content'] -%}
+    {%- set messages = messages[1:] -%}
+{%- else -%}
+    {% set system_message = '' -%}
+{%- endif -%}
+
+{{ bos_token + system_message }}
+{%- for message in messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {%- endif -%}
+    {{ message['content'] }}
+{%- endfor -%}
diff --git a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
index 661ebd1cf5c17d2828c8a3eed86d1764d5fe2143..c73ae96f0c1d548c35e807b89d049fee13f51d55 100644
--- a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
+++ b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
@@ -30,18 +30,18 @@
         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
     {%- elif message.role == "assistant" %}
         {%- set content = message.content %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
-            {%- set reasoning_content = message.reasoning_content %}
+        {%- set reasoning = '' %}
+        {%- if message.reasoning is defined and message.reasoning is not none %}
+            {%- set reasoning = message.reasoning %}
         {%- else %}
             {%- if '</think>' in message.content %}
                 {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
-                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set reasoning = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
             {%- endif %}
         {%- endif %}
         {%- if loop.index0 > ns.last_query_index %}
-            {%- if loop.last or (not loop.last and reasoning_content) %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- if loop.last or (not loop.last and reasoning) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
             {%- else %}
                 {{- '<|im_start|>' + message.role + '\n' + content }}
             {%- endif %}
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index c3384aec82b1efb9e58a80f5f61355695946fffe..49250e071eab287d60994ad562a86e957e9169a9 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -4,32 +4,42 @@
 import json
 import os
 import time
+from collections.abc import Callable
 from dataclasses import asdict
 from functools import cache, partial
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, TypeVar, Union
+from typing import Any, Literal, TypeVar
 
 import huggingface_hub
-from huggingface_hub import get_safetensors_metadata, hf_hub_download
+from huggingface_hub import (
+    get_safetensors_metadata,
+    hf_hub_download,
+    try_to_load_from_cache,
+)
 from huggingface_hub import list_repo_files as hf_list_repo_files
-from huggingface_hub import try_to_load_from_cache
-from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
-                                   LocalEntryNotFoundError,
-                                   RepositoryNotFoundError,
-                                   RevisionNotFoundError)
-from transformers import GenerationConfig, PretrainedConfig
-from transformers.models.auto.image_processing_auto import (
-    get_image_processor_config)
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    HfHubHTTPError,
+    LocalEntryNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+)
+from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
+from transformers.models.auto.image_processing_auto import get_image_processor_config
 from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_MAPPING_NAMES,
+)
 from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
 from vllm.logger import init_logger
 from vllm.transformers_utils.config_parser_base import ConfigParserBase
-from vllm.transformers_utils.utils import (check_gguf_file,
-                                           parse_safetensors_file_metadata)
+from vllm.transformers_utils.utils import (
+    check_gguf_file,
+    parse_safetensors_file_metadata,
+)
 
 if envs.VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
@@ -41,35 +51,39 @@ MISTRAL_CONFIG_NAME = "params.json"
 logger = init_logger(__name__)
 
 
-def _get_hf_token() -> Optional[str]:
+def _get_hf_token() -> str | None:
     """
     Get the HuggingFace token from environment variable.
 
-    Returns None if the token is not set, is an empty string, 
+    Returns None if the token is not set, is an empty string,
     or contains only whitespace.
     This follows the same pattern as huggingface_hub library which
     treats empty string tokens as None to avoid authentication errors.
     """
-    token = os.getenv('HF_TOKEN')
+    token = os.getenv("HF_TOKEN")
     if token and token.strip():
         return token
     return None
 
 
 class LazyConfigDict(dict):
-
     def __getitem__(self, key):
+        if isinstance(value := super().__getitem__(key), type):
+            return value
+
         import vllm.transformers_utils.configs as configs
-        return getattr(configs, super().__getitem__(key))
+
+        return getattr(configs, value)
 
 
 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
+    afmoe="AfmoeConfig",
     chatglm="ChatGLMConfig",
     deepseek_vl_v2="DeepseekVLV2Config",
-    deepseek_v3="DeepseekV3Config",
-    deepseek_v32="DeepseekV3Config",
+    deepseek_v32=DeepseekV3Config,
+    flex_olmo="FlexOlmoConfig",
+    kimi_linear="KimiLinearConfig",
     kimi_vl="KimiVLConfig",
-    Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config",
     RefinedWeb="RWConfig",  # For tiiuae/falcon-40b(-instruct)
     RefinedWebModel="RWConfig",  # For tiiuae/falcon-7b(-instruct)
     jais="JAISConfig",
@@ -84,30 +98,30 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     ultravox="UltravoxConfig",
     step3_vl="Step3VLConfig",
     step3_text="Step3TextConfig",
-    qwen3_next="Qwen3NextConfig")
+    qwen3_next="Qwen3NextConfig",
+    lfm2_moe="Lfm2MoeConfig",
+)
 
 _CONFIG_ATTRS_MAPPING: dict[str, str] = {
     "llm_config": "text_config",
 }
 
 _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
-    "internvl_chat": {
-        "has_no_defaults_at_init": True
-    },
-    "NVLM_D": {
-        "has_no_defaults_at_init": True
-    },
+    "internvl_chat": {"has_no_defaults_at_init": True},
+    "Llama_Nemotron_Nano_VL": {"attn_implementation": "eager"},
+    "NVLM_D": {"has_no_defaults_at_init": True},
 }
 
 
 class HFConfigParser(ConfigParserBase):
-
-    def parse(self,
-              model: Union[str, Path],
-              trust_remote_code: bool,
-              revision: Optional[str] = None,
-              code_revision: Optional[str] = None,
-              **kwargs) -> tuple[dict, PretrainedConfig]:
+    def parse(
+        self,
+        model: str | Path,
+        trust_remote_code: bool,
+        revision: str | None = None,
+        code_revision: str | None = None,
+        **kwargs,
+    ) -> tuple[dict, PretrainedConfig]:
         kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
         config_dict, _ = PretrainedConfig.get_config_dict(
             model,
@@ -119,8 +133,11 @@ class HFConfigParser(ConfigParserBase):
         # Use custom model class if it's in our registry
         model_type = config_dict.get("model_type")
         if model_type is None:
-            model_type = "speculators" if config_dict.get(
-                "speculators_config") is not None else model_type
+            model_type = (
+                "speculators"
+                if config_dict.get("speculators_config") is not None
+                else model_type
+            )
 
         if model_type in _CONFIG_REGISTRY:
             config_class = _CONFIG_REGISTRY[model_type]
@@ -133,8 +150,7 @@ class HFConfigParser(ConfigParserBase):
             )
         else:
             try:
-                kwargs = _maybe_update_auto_config_kwargs(
-                    kwargs, model_type=model_type)
+                kwargs = _maybe_update_auto_config_kwargs(kwargs, model_type=model_type)
                 config = AutoConfig.from_pretrained(
                     model,
                     trust_remote_code=trust_remote_code,
@@ -144,15 +160,17 @@ class HFConfigParser(ConfigParserBase):
                     **kwargs,
                 )
             except ValueError as e:
-                if (not trust_remote_code
-                        and "requires you to execute the configuration file"
-                        in str(e)):
+                if (
+                    not trust_remote_code
+                    and "requires you to execute the configuration file" in str(e)
+                ):
                     err_msg = (
                         "Failed to load the model config. If the model "
                         "is a custom model not yet available in the "
                         "HuggingFace transformers library, consider setting "
                         "`trust_remote_code=True` in LLM or using the "
-                        "`--trust-remote-code` flag in the CLI.")
+                        "`--trust-remote-code` flag in the CLI."
+                    )
                     raise RuntimeError(err_msg) from e
                 else:
                     raise e
@@ -161,20 +179,23 @@ class HFConfigParser(ConfigParserBase):
 
 
 class MistralConfigParser(ConfigParserBase):
-
-    def parse(self,
-              model: Union[str, Path],
-              trust_remote_code: bool,
-              revision: Optional[str] = None,
-              code_revision: Optional[str] = None,
-              **kwargs) -> tuple[dict, PretrainedConfig]:
+    def parse(
+        self,
+        model: str | Path,
+        trust_remote_code: bool,
+        revision: str | None = None,
+        code_revision: str | None = None,
+        **kwargs,
+    ) -> tuple[dict, PretrainedConfig]:
         # This function loads a params.json config which
         # should be used when loading models in mistral format
         config_dict = _download_mistral_config_file(model, revision)
-        if (max_position_embeddings :=
-                config_dict.get("max_position_embeddings")) is None:
+        if (
+            max_position_embeddings := config_dict.get("max_position_embeddings")
+        ) is None:
             max_position_embeddings = _maybe_retrieve_max_pos_from_hf(
-                model, revision, **kwargs)
+                model, revision, **kwargs
+            )
             config_dict["max_position_embeddings"] = max_position_embeddings
 
         from vllm.transformers_utils.configs.mistral import adapt_config_dict
@@ -183,8 +204,9 @@ class MistralConfigParser(ConfigParserBase):
 
         # Mistral configs may define sliding_window as list[int]. Convert it
         # to int and add the layer_types list[str] to make it HF compatible
-        if ((sliding_window := getattr(config, "sliding_window", None))
-                and isinstance(sliding_window, list)):
+        if (sliding_window := getattr(config, "sliding_window", None)) and isinstance(
+            sliding_window, list
+        ):
             pattern_repeats = config.num_hidden_layers // len(sliding_window)
             layer_types = sliding_window * pattern_repeats
             config.layer_types = [
@@ -216,44 +238,51 @@ def get_config_parser(config_format: str) -> ConfigParserBase:
 
 
 def register_config_parser(config_format: str):
-
     """Register a customized vllm config parser.
-    When a config format is not supported by vllm, you can register a customized
-   config parser to support it.
-    Args:
-        config_format (str): The config parser format name.
-    Examples:
-
-        >>> from vllm.transformers_utils.config import (get_config_parser,
-                                                        register_config_parser)
-        >>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
-        >>>
-        >>> @register_config_parser("custom_config_parser")
-        ... class CustomConfigParser(ConfigParserBase):
-        ...     def parse(self,
-        ...            model: Union[str, Path],
-        ...            trust_remote_code: bool,
-        ...            revision: Optional[str] = None,
-        ...            code_revision: Optional[str] = None,
-        ...           **kwargs) -> tuple[dict, PretrainedConfig]:
-        ...        raise NotImplementedError
-        >>>
-        >>> type(get_config_parser("custom_config_parser"))
-        <class 'CustomConfigParser'>
+     When a config format is not supported by vllm, you can register a customized
+    config parser to support it.
+     Args:
+         config_format (str): The config parser format name.
+     Examples:
+
+         >>> from vllm.transformers_utils.config import (get_config_parser,
+                                                         register_config_parser)
+         >>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
+         >>>
+         >>> @register_config_parser("custom_config_parser")
+         ... class CustomConfigParser(ConfigParserBase):
+         ...     def parse(
+         ...         self,
+         ...         model: Union[str, Path],
+         ...         trust_remote_code: bool,
+         ...         revision: str | None = None,
+         ...         code_revision: str | None = None,
+         ...         **kwargs,
+         ...     ) -> tuple[dict, PretrainedConfig]:
+         ...         raise NotImplementedError
+         >>>
+         >>> type(get_config_parser("custom_config_parser"))
+         <class 'CustomConfigParser'>
     """  # noqa: E501
 
     def _wrapper(config_parser_cls):
         if config_format in _CONFIG_FORMAT_TO_CONFIG_PARSER:
             logger.warning(
                 "Config format `%s` is already registered, and will be "
-                "overwritten by the new parser class `%s`.", config_format,
-                config_parser_cls)
+                "overwritten by the new parser class `%s`.",
+                config_format,
+                config_parser_cls,
+            )
         if not issubclass(config_parser_cls, ConfigParserBase):
-            raise ValueError("The config parser must be a subclass of "
-                             "`ConfigParserBase`.")
+            raise ValueError(
+                "The config parser must be a subclass of `ConfigParserBase`."
+            )
         _CONFIG_FORMAT_TO_CONFIG_PARSER[config_format] = config_parser_cls
-        logger.info("Registered config parser `%s` with config format `%s`",
-                    config_parser_cls, config_format)
+        logger.info(
+            "Registered config parser `%s` with config format `%s`",
+            config_parser_cls,
+            config_format,
+        )
         return config_parser_cls
 
     return _wrapper
@@ -275,8 +304,9 @@ def with_retry(
             if attempt == max_retries - 1:
                 logger.error("%s: %s", log_msg, e)
                 raise
-            logger.error("%s: %s, retrying %d of %d", log_msg, e, attempt + 1,
-                         max_retries)
+            logger.error(
+                "%s: %s, retrying %d of %d", log_msg, e, attempt + 1, max_retries
+            )
             time.sleep(retry_delay)
             retry_delay *= 2
 
@@ -288,32 +318,31 @@ def with_retry(
 def list_repo_files(
     repo_id: str,
     *,
-    revision: Optional[str] = None,
-    repo_type: Optional[str] = None,
-    token: Union[str, bool, None] = None,
+    revision: str | None = None,
+    repo_type: str | None = None,
+    token: str | bool | None = None,
 ) -> list[str]:
-
     def lookup_files() -> list[str]:
         # directly list files if model is local
         if (local_path := Path(repo_id)).exists():
             return [
                 str(file.relative_to(local_path))
-                for file in local_path.rglob('*') if file.is_file()
+                for file in local_path.rglob("*")
+                if file.is_file()
             ]
         # if model is remote, use hf_hub api to list files
         try:
             if envs.VLLM_USE_MODELSCOPE:
-                from vllm.transformers_utils.utils import (
-                    modelscope_list_repo_files)
-                return modelscope_list_repo_files(repo_id,
-                                                  revision=revision,
-                                                  token=os.getenv(
-                                                      "MODELSCOPE_API_TOKEN",
-                                                      None))
-            return hf_list_repo_files(repo_id,
-                                      revision=revision,
-                                      repo_type=repo_type,
-                                      token=token)
+                from vllm.transformers_utils.utils import modelscope_list_repo_files
+
+                return modelscope_list_repo_files(
+                    repo_id,
+                    revision=revision,
+                    token=os.getenv("MODELSCOPE_API_TOKEN", None),
+                )
+            return hf_list_repo_files(
+                repo_id, revision=revision, repo_type=repo_type, token=token
+            )
         except huggingface_hub.errors.OfflineModeIsEnabled:
             # Don't raise in offline mode,
             # all we know is that we don't have this
@@ -327,27 +356,27 @@ def file_exists(
     repo_id: str,
     file_name: str,
     *,
-    repo_type: Optional[str] = None,
-    revision: Optional[str] = None,
-    token: Union[str, bool, None] = None,
+    repo_type: str | None = None,
+    revision: str | None = None,
+    token: str | bool | None = None,
 ) -> bool:
-    file_list = list_repo_files(repo_id,
-                                repo_type=repo_type,
-                                revision=revision,
-                                token=token)
+    file_list = list_repo_files(
+        repo_id, repo_type=repo_type, revision=revision, token=token
+    )
     return file_name in file_list
 
 
 # In offline mode the result can be a false negative
-def file_or_path_exists(model: Union[str, Path], config_name: str,
-                        revision: Optional[str]) -> bool:
+def file_or_path_exists(
+    model: str | Path, config_name: str, revision: str | None
+) -> bool:
     if (local_path := Path(model)).exists():
         return (local_path / config_name).is_file()
 
     # Offline mode support: Check if config file is cached already
-    cached_filepath = try_to_load_from_cache(repo_id=model,
-                                             filename=config_name,
-                                             revision=revision)
+    cached_filepath = try_to_load_from_cache(
+        repo_id=model, filename=config_name, revision=revision
+    )
     if isinstance(cached_filepath, str):
         # The config file exists in cache- we can continue trying to load
         return True
@@ -356,10 +385,9 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
     # hf_hub. This will fail in offline mode.
 
     # Call HF to check if the file exists
-    return file_exists(str(model),
-                       config_name,
-                       revision=revision,
-                       token=_get_hf_token())
+    return file_exists(
+        str(model), config_name, revision=revision, token=_get_hf_token()
+    )
 
 
 def patch_rope_scaling(config: PretrainedConfig) -> None:
@@ -381,7 +409,8 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
             raise ValueError(
                 f"Found conflicts between 'rope_type={rope_type}' (modern "
                 f"field) and 'type={rope_type_legacy}' (legacy field). "
-                "You should only specify one of them.")
+                "You should only specify one of them."
+            )
 
     if "rope_type" not in rope_scaling and "type" in rope_scaling:
         rope_scaling["rope_type"] = rope_scaling["type"]
@@ -409,8 +438,11 @@ def _uses_mrope(config: PretrainedConfig) -> bool:
 
 def uses_mrope(config: PretrainedConfig) -> bool:
     """Detect if the model with this config uses M-ROPE."""
-    return _uses_mrope(config) or _uses_mrope(
-        config.get_text_config()) or thinker_uses_mrope(config)
+    return (
+        _uses_mrope(config)
+        or _uses_mrope(config.get_text_config())
+        or thinker_uses_mrope(config)
+    )
 
 
 def thinker_uses_mrope(config: PretrainedConfig) -> bool:
@@ -432,8 +464,7 @@ def is_encoder_decoder(config: PretrainedConfig) -> bool:
     def _is_encoder_decoder(config: PretrainedConfig) -> bool:
         return getattr(config, "is_encoder_decoder", False)
 
-    return (_is_encoder_decoder(config)
-            or _is_encoder_decoder(config.get_text_config()))
+    return _is_encoder_decoder(config) or _is_encoder_decoder(config.get_text_config())
 
 
 def is_interleaved(config: PretrainedConfig) -> bool:
@@ -442,8 +473,7 @@ def is_interleaved(config: PretrainedConfig) -> bool:
     """
     text_config = config.get_text_config()
     if layer_types := getattr(text_config, "layer_types", None):
-        interleaved_types = {"full_attention", "sliding_attention"}
-        return interleaved_types.issubset(layer_types)
+        return len(set(layer_types)) > 1
     return False
 
 
@@ -462,8 +492,7 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
         if hasattr(config, old_attr):
             if not hasattr(config, new_attr):
                 config.update({new_attr: getattr(config, old_attr)})
-            logger.debug("Remapped config attribute '%s' to '%s'", old_attr,
-                         new_attr)
+            logger.debug("Remapped config attribute '%s' to '%s'", old_attr, new_attr)
     return config
 
 
@@ -471,10 +500,10 @@ def maybe_override_with_speculators(
     model: str,
     tokenizer: str,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-    vllm_speculative_config: Optional[dict[str, Any]] = None,
+    revision: str | None = None,
+    vllm_speculative_config: dict[str, Any] | None = None,
     **kwargs,
-) -> tuple[str, str, Optional[dict[str, Any]]]:
+) -> tuple[str, str, dict[str, Any] | None]:
     """
     Resolve model configuration when speculators are detected.
 
@@ -512,31 +541,30 @@ def maybe_override_with_speculators(
         return model, tokenizer, vllm_speculative_config
 
     # Speculators format detected - process overrides
-    from vllm.transformers_utils.configs.speculators.base import (
-        SpeculatorsConfig)
+    from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
 
-    vllm_speculative_config = SpeculatorsConfig.extract_vllm_speculative_config(
-        config_dict=config_dict)
+    speculative_config = SpeculatorsConfig.extract_vllm_speculative_config(
+        config_dict=config_dict
+    )
 
     # Set the draft model to the speculators model
-    vllm_speculative_config["model"] = model
+    speculative_config["model"] = model
 
     # Override model and tokenizer with the verifier model from config
     verifier_model = speculators_config["verifier"]["name_or_path"]
     model = tokenizer = verifier_model
 
-    return model, tokenizer, vllm_speculative_config
+    return model, tokenizer, speculative_config
 
 
 def get_config(
-    model: Union[str, Path],
+    model: str | Path,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-    code_revision: Optional[str] = None,
-    config_format: Union[str, ConfigFormat] = "auto",
-    hf_overrides_kw: Optional[dict[str, Any]] = None,
-    hf_overrides_fn: Optional[Callable[[PretrainedConfig],
-                                       PretrainedConfig]] = None,
+    revision: str | None = None,
+    code_revision: str | None = None,
+    config_format: str | ConfigFormat = "auto",
+    hf_overrides_kw: dict[str, Any] | None = None,
+    hf_overrides_fn: Callable[[PretrainedConfig], PretrainedConfig] | None = None,
     **kwargs,
 ) -> PretrainedConfig:
     # Separate model folder from file path for GGUF models
@@ -548,12 +576,9 @@ def get_config(
 
     if config_format == "auto":
         try:
-            if is_gguf or file_or_path_exists(
-                    model, HF_CONFIG_NAME, revision=revision):
+            if is_gguf or file_or_path_exists(model, HF_CONFIG_NAME, revision=revision):
                 config_format = "hf"
-            elif file_or_path_exists(model,
-                                     MISTRAL_CONFIG_NAME,
-                                     revision=revision):
+            elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision):
                 config_format = "mistral"
             else:
                 raise ValueError(
@@ -561,7 +586,8 @@ def get_config(
                     "With config_format 'auto', ensure your model has either "
                     "config.json (HF format) or params.json (Mistral format). "
                     "Otherwise please specify your_custom_config_format "
-                    "in engine args for customized config parser.")
+                    "in engine args for customized config parser."
+                )
 
         except Exception as e:
             error_message = (
@@ -576,7 +602,8 @@ def get_config(
                 "'params.json'.\n"
                 "3. For GGUF: pass the local path of the GGUF checkpoint.\n"
                 "   Loading GGUF from a remote repo directly is not yet "
-                "supported.\n").format(model=model)
+                "supported.\n"
+            ).format(model=model)
 
             raise ValueError(error_message) from e
 
@@ -591,40 +618,57 @@ def get_config(
     # Special architecture mapping check for GGUF models
     if is_gguf:
         if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
-            raise RuntimeError(
-                f"Can't get gguf config for {config.model_type}.")
+            raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
         model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
         config.update({"architectures": [model_type]})
 
+    # Architecture mapping for models without explicit architectures field
+    if not config.architectures:
+        if config.model_type not in MODEL_MAPPING_NAMES:
+            logger.warning(
+                "Model config does not have a top-level 'architectures' field: "
+                "expecting `hf_overrides={'architectures': ['...']}` to be passed "
+                "in engine args."
+            )
+        else:
+            model_type = MODEL_MAPPING_NAMES[config.model_type]
+            config.update({"architectures": [model_type]})
+
     # ModelOpt 0.31.0 and after saves the quantization config in the model
     # config file.
     quantization_config = config_dict.get("quantization_config", None)
 
     # ModelOpt 0.29.0 and before saves the quantization config in a separate
     # "hf_quant_config.json" in the same directory as the model config file.
-    if quantization_config is None \
-        and file_or_path_exists(model, "hf_quant_config.json", revision):
-        quantization_config = get_hf_file_to_dict("hf_quant_config.json",
-                                                  model, revision)
+    if quantization_config is None and file_or_path_exists(
+        model, "hf_quant_config.json", revision
+    ):
+        quantization_config = get_hf_file_to_dict(
+            "hf_quant_config.json", model, revision
+        )
 
     if quantization_config is not None:
         config.quantization_config = quantization_config
-        # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
+        # auto-enable DeepGEMM UE8M0 if model config requests it
         scale_fmt = quantization_config.get("scale_fmt", None)
-        if scale_fmt in ("ue8m0", ):
-            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
-                os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
+        if scale_fmt in ("ue8m0",):
+            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
+                os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
                 logger.info_once(
-                    ("Detected quantization_config.scale_fmt=%s; "
-                     "enabling Hopper UE8M0."),
+                    (
+                        "Detected quantization_config.scale_fmt=%s; "
+                        "enabling UE8M0 for DeepGEMM."
+                    ),
                     scale_fmt,
                 )
-            elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
+            elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
                 logger.warning_once(
-                    ("Model config requests UE8M0 "
-                     "(quantization_config.scale_fmt=%s), but "
-                     "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
-                     "Hopper UE8M0 disabled."),
+                    (
+                        "Model config requests UE8M0 "
+                        "(quantization_config.scale_fmt=%s), but "
+                        "VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
+                        "UE8M0 for DeepGEMM disabled."
+                    ),
                     scale_fmt,
                 )
 
@@ -643,17 +687,17 @@ def get_config(
     return config
 
 
-def try_get_local_file(model: Union[str, Path],
-                       file_name: str,
-                       revision: Optional[str] = 'main') -> Optional[Path]:
+def try_get_local_file(
+    model: str | Path, file_name: str, revision: str | None = "main"
+) -> Path | None:
     file_path = Path(model) / file_name
     if file_path.is_file():
         return file_path
     else:
         try:
-            cached_filepath = try_to_load_from_cache(repo_id=model,
-                                                     filename=file_name,
-                                                     revision=revision)
+            cached_filepath = try_to_load_from_cache(
+                repo_id=model, filename=file_name, revision=revision
+            )
             if isinstance(cached_filepath, str):
                 return Path(cached_filepath)
         except ValueError:
@@ -661,9 +705,9 @@ def try_get_local_file(model: Union[str, Path],
     return None
 
 
-def get_hf_file_to_dict(file_name: str,
-                        model: Union[str, Path],
-                        revision: Optional[str] = 'main'):
+def get_hf_file_to_dict(
+    file_name: str, model: str | Path, revision: str | None = "main"
+):
     """
     Downloads a file from the Hugging Face Hub and returns
     its contents as a dictionary.
@@ -678,25 +722,27 @@ def get_hf_file_to_dict(file_name: str,
     the contents of the downloaded file.
     """
 
-    file_path = try_get_local_file(model=model,
-                                   file_name=file_name,
-                                   revision=revision)
+    file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
 
     if file_path is None:
         try:
             hf_hub_file = hf_hub_download(model, file_name, revision=revision)
         except huggingface_hub.errors.OfflineModeIsEnabled:
             return None
-        except (RepositoryNotFoundError, RevisionNotFoundError,
-                EntryNotFoundError, LocalEntryNotFoundError) as e:
+        except (
+            RepositoryNotFoundError,
+            RevisionNotFoundError,
+            EntryNotFoundError,
+            LocalEntryNotFoundError,
+        ) as e:
             logger.debug("File or repository not found in hf_hub_download", e)
             return None
         except HfHubHTTPError as e:
             logger.warning(
-                "Cannot connect to Hugging Face Hub. Skipping file "
-                "download for '%s':",
+                "Cannot connect to Hugging Face Hub. Skipping file download for '%s':",
                 file_name,
-                exc_info=e)
+                exc_info=e,
+            )
             return None
         file_path = Path(hf_hub_file)
 
@@ -708,8 +754,7 @@ def get_hf_file_to_dict(file_name: str,
 
 
 @cache
-def get_pooling_config(model: str,
-                       revision: Optional[str] = 'main') -> Optional[dict]:
+def get_pooling_config(model: str, revision: str | None = "main") -> dict | None:
     """
     This function gets the pooling and normalize
     config from the model - only applies to
@@ -717,20 +762,20 @@ def get_pooling_config(model: str,
 
     Args:
         model: The name of the Hugging Face model.
-        revision: The specific version of the model to use. 
+        revision: The specific version of the model to use.
             Defaults to 'main'.
 
     Returns:
-        A dictionary containing the pooling type and whether 
+        A dictionary containing the pooling type and whether
             normalization is used, or None if no pooling configuration is found.
     """
 
     modules_file_name = "modules.json"
 
     modules_dict = None
-    if file_or_path_exists(model=model,
-                           config_name=modules_file_name,
-                           revision=revision):
+    if file_or_path_exists(
+        model=model, config_name=modules_file_name, revision=revision
+    ):
         modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
 
     if modules_dict is None:
@@ -738,20 +783,31 @@ def get_pooling_config(model: str,
 
     logger.info("Found sentence-transformers modules configuration.")
 
-    pooling = next((item for item in modules_dict
-                    if item["type"] == "sentence_transformers.models.Pooling"),
-                   None)
+    pooling = next(
+        (
+            item
+            for item in modules_dict
+            if item["type"] == "sentence_transformers.models.Pooling"
+        ),
+        None,
+    )
     normalize = bool(
-        next((item for item in modules_dict
-              if item["type"] == "sentence_transformers.models.Normalize"),
-             False))
+        next(
+            (
+                item
+                for item in modules_dict
+                if item["type"] == "sentence_transformers.models.Normalize"
+            ),
+            False,
+        )
+    )
 
     if pooling:
-
         pooling_file_name = "{}/config.json".format(pooling["path"])
         pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
         pooling_type_name = next(
-            (item for item, val in pooling_dict.items() if val is True), None)
+            (item for item, val in pooling_dict.items() if val is True), None
+        )
 
         if pooling_type_name is not None:
             pooling_type_name = get_pooling_config_name(pooling_type_name)
@@ -762,7 +818,7 @@ def get_pooling_config(model: str,
     return None
 
 
-def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
+def get_pooling_config_name(pooling_name: str) -> str | None:
     if "pooling_mode_" in pooling_name:
         pooling_name = pooling_name.replace("pooling_mode_", "")
 
@@ -772,20 +828,19 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
     if "lasttoken" in pooling_name:
         pooling_name = "last"
 
-    supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
+    supported_pooling_types = ["LAST", "ALL", "CLS", "STEP", "MEAN"]
     pooling_type_name = pooling_name.upper()
 
     if pooling_type_name in supported_pooling_types:
         return pooling_type_name
 
-    raise NotImplementedError(
-        f"Pooling type {pooling_type_name} not supported")
+    raise NotImplementedError(f"Pooling type {pooling_type_name} not supported")
 
 
 @cache
-def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
-                                              revision: Optional[str] = 'main'
-                                              ):
+def get_sentence_transformer_tokenizer_config(
+    model: str | Path, revision: str | None = "main"
+):
     """
     Returns the tokenization configuration dictionary for a
     given Sentence Transformer BERT model.
@@ -812,9 +867,10 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
     encoder_dict = None
 
     for config_file in sentence_transformer_config_files:
-        if try_get_local_file(model=model,
-                              file_name=config_file,
-                              revision=revision) is not None:
+        if (
+            try_get_local_file(model=model, file_name=config_file, revision=revision)
+            is not None
+        ):
             encoder_dict = get_hf_file_to_dict(config_file, model, revision)
             if encoder_dict:
                 break
@@ -822,16 +878,15 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
     if not encoder_dict and not Path(model).is_absolute():
         try:
             # If model is on HuggingfaceHub, get the repo files
-            repo_files = list_repo_files(model,
-                                         revision=revision,
-                                         token=_get_hf_token())
+            repo_files = list_repo_files(
+                model, revision=revision, token=_get_hf_token()
+            )
         except Exception:
             repo_files = []
 
         for config_name in sentence_transformer_config_files:
             if config_name in repo_files:
-                encoder_dict = get_hf_file_to_dict(config_name, model,
-                                                   revision)
+                encoder_dict = get_hf_file_to_dict(config_name, model, revision)
                 if encoder_dict:
                     break
 
@@ -848,34 +903,39 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
 def maybe_register_config_serialize_by_value() -> None:
     """Try to register HF model configuration class to serialize by value
 
-        If trust_remote_code is set, and the model's config file specifies an
-        `AutoConfig` class, then the config class is typically an instance of
-        a custom class imported from the HF modules cache.
+    If trust_remote_code is set, and the model's config file specifies an
+    `AutoConfig` class, then the config class is typically an instance of
+    a custom class imported from the HF modules cache.
 
-        Examples:
-
-        >>> from transformers import AutoConfig
-        >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
-        >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
-        >>> import transformers_modules # error, not initialized
-        >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
-        >>> import transformers_modules # success, initialized
-        >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
-
-        In the DeepSeek example, the config class is an instance of a custom
-        class that is not serializable by default. This class will not be
-        importable in spawned workers, and won't exist at all on
-        other nodes, which breaks serialization of the config.
-
-        In this function we tell the cloudpickle serialization library to pass
-        instances of these generated classes by value instead of by reference,
-        i.e. the class definition is serialized along with its data so that the
-        class module does not need to be importable on the receiving end.
+    Examples:
 
-        See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
-    """ # noqa
+    >>> from transformers import AutoConfig
+    >>> klass = AutoConfig.from_pretrained(
+    ...     "meta-llama/Meta-Llama-3-8B", trust_remote_code=True
+    ... )
+    >>> klass.__class__  # transformers.models.llama.configuration_llama.LlamaConfig
+    >>> import transformers_modules  # error, not initialized
+    >>> klass = AutoConfig.from_pretrained(
+    ...     "deepseek-ai/DeepSeek-V2.5", trust_remote_code=True
+    ... )
+    >>> import transformers_modules  # success, initialized
+    >>> klass.__class__  # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
+
+    In the DeepSeek example, the config class is an instance of a custom
+    class that is not serializable by default. This class will not be
+    importable in spawned workers, and won't exist at all on
+    other nodes, which breaks serialization of the config.
+
+    In this function we tell the cloudpickle serialization library to pass
+    instances of these generated classes by value instead of by reference,
+    i.e. the class definition is serialized along with its data so that the
+    class module does not need to be importable on the receiving end.
+
+    See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
+    """  # noqa
     try:
         import transformers_modules
+
         transformers_modules_available = True
     except ImportError:
         transformers_modules_available = False
@@ -892,7 +952,7 @@ def maybe_register_config_serialize_by_value() -> None:
         # serialization of VllmConfig objects that may contain custom configs
         # from transformers_modules
         def _reduce_config(config: VllmConfig):
-            return (pickle.loads, (cloudpickle.dumps(config), ))
+            return (pickle.loads, (cloudpickle.dumps(config),))
 
         multiprocessing.reducer.register(VllmConfig, _reduce_config)
 
@@ -901,7 +961,8 @@ def maybe_register_config_serialize_by_value() -> None:
             cloudpickle.register_pickle_by_value(transformers_modules)
 
             # ray vendors its own version of cloudpickle
-            from vllm.executor.ray_utils import ray
+            from vllm.v1.executor.ray_utils import ray
+
             if ray:
                 ray.cloudpickle.register_pickle_by_value(transformers_modules)
 
@@ -911,13 +972,14 @@ def maybe_register_config_serialize_by_value() -> None:
             " trust_remote_code with by-value serialization. This may"
             " lead to a later error. If remote code is not needed"
             " remove `--trust-remote-code`",
-            exc_info=e)
+            exc_info=e,
+        )
 
 
 def get_hf_image_processor_config(
-    model: Union[str, Path],
-    hf_token: Optional[Union[bool, str]] = None,
-    revision: Optional[str] = None,
+    model: str | Path,
+    hf_token: bool | str | None = None,
+    revision: str | None = None,
     **kwargs,
 ) -> dict[str, Any]:
     # ModelScope does not provide an interface for image_processor
@@ -926,10 +988,9 @@ def get_hf_image_processor_config(
     # Separate model folder from file path for GGUF models
     if check_gguf_file(model):
         model = Path(model).parent
-    return get_image_processor_config(model,
-                                      token=hf_token,
-                                      revision=revision,
-                                      **kwargs)
+    return get_image_processor_config(
+        model, token=hf_token, revision=revision, **kwargs
+    )
 
 
 def get_hf_text_config(config: PretrainedConfig):
@@ -950,8 +1011,9 @@ def get_hf_text_config(config: PretrainedConfig):
 def try_get_generation_config(
     model: str,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-) -> Optional[GenerationConfig]:
+    revision: str | None = None,
+    config_format: str | ConfigFormat = "auto",
+) -> GenerationConfig | None:
     try:
         return GenerationConfig.from_pretrained(
             model,
@@ -963,6 +1025,7 @@ def try_get_generation_config(
                 model,
                 trust_remote_code=trust_remote_code,
                 revision=revision,
+                config_format=config_format,
             )
             return GenerationConfig.from_model_config(config)
         except OSError:  # Not found
@@ -972,7 +1035,7 @@ def try_get_generation_config(
 def try_get_safetensors_metadata(
     model: str,
     *,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ):
     get_safetensors_metadata_partial = partial(
         get_safetensors_metadata,
@@ -982,17 +1045,18 @@ def try_get_safetensors_metadata(
     )
 
     try:
-        return with_retry(get_safetensors_metadata_partial,
-                          "Error retrieving safetensors")
+        return with_retry(
+            get_safetensors_metadata_partial, "Error retrieving safetensors"
+        )
     except Exception:
         return None
 
 
 def try_get_tokenizer_config(
-    pretrained_model_name_or_path: Union[str, os.PathLike],
+    pretrained_model_name_or_path: str | os.PathLike,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-) -> Optional[dict[str, Any]]:
+    revision: str | None = None,
+) -> dict[str, Any] | None:
     try:
         return get_tokenizer_config(
             pretrained_model_name_or_path,
@@ -1003,10 +1067,44 @@ def try_get_tokenizer_config(
         return None
 
 
+@cache
+def try_get_dense_modules(
+    model: str | Path,
+    revision: str | None = None,
+) -> list[dict[str, Any]] | None:
+    try:
+        modules = get_hf_file_to_dict("modules.json", model, revision)
+        if not modules:
+            return None
+
+        if isinstance(modules, dict):
+            modules = modules.get("modules", [])
+
+        dense_modules = [
+            m for m in modules if m.get("type") == "sentence_transformers.models.Dense"
+        ]
+        if not dense_modules:
+            return None
+
+        layer_configs = []
+        for module in dense_modules:
+            folder = module.get("path", "")
+
+            config_path = f"{folder}/config.json" if folder else "config.json"
+            layer_config = get_hf_file_to_dict(config_path, model, revision)
+            if not layer_config:
+                continue
+            layer_config["folder"] = folder
+            layer_configs.append(layer_config)
+        return layer_configs
+    except Exception:
+        return None
+
+
 def get_safetensors_params_metadata(
     model: str,
     *,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ) -> dict[str, Any]:
     """
     Get the safetensors metadata for remote model repository.
@@ -1016,9 +1114,9 @@ def get_safetensors_params_metadata(
         safetensors_to_check = model_path.glob("*.safetensors")
         full_metadata = {
             param_name: info
-            for file_path in safetensors_to_check if file_path.is_file()
-            for param_name, info in parse_safetensors_file_metadata(
-                file_path).items()
+            for file_path in safetensors_to_check
+            if file_path.is_file()
+            for param_name, info in parse_safetensors_file_metadata(file_path).items()
         }
     else:
         repo_mt = try_get_safetensors_metadata(model, revision=revision)
@@ -1038,7 +1136,8 @@ def _download_mistral_config_file(model, revision) -> dict:
         raise ValueError(
             f"Failed to load mistral '{config_file_name}' config for model "
             f"{model}. Please check if the model is a mistral-format model "
-            f"and if the config file exists.")
+            f"and if the config file exists."
+        )
     assert isinstance(config_dict, dict)
     return config_dict
 
@@ -1047,10 +1146,12 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
     max_position_embeddings = 128_000
     try:
         trust_remote_code_val = kwargs.get("trust_remote_code", False)
-        hf_config = get_config(model=model,
-                               trust_remote_code=trust_remote_code_val,
-                               revision=revision,
-                               config_format="hf")
+        hf_config = get_config(
+            model=model,
+            trust_remote_code=trust_remote_code_val,
+            revision=revision,
+            config_format="hf",
+        )
         if hf_value := hf_config.get_text_config().max_position_embeddings:
             max_position_embeddings = hf_value
     except Exception as e:
@@ -1058,12 +1159,13 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
             "The params.json file is missing 'max_position_embeddings'"
             " and could not get a value from the HF config."
             " Defaulting to 128000",
-            exc_info=e)
+            exc_info=e,
+        )
 
     return max_position_embeddings
 
 
-def get_model_path(model: Union[str, Path], revision: Optional[str] = None):
+def get_model_path(model: str | Path, revision: str | None = None):
     if os.path.exists(model):
         return model
     assert huggingface_hub.constants.HF_HUB_OFFLINE
@@ -1074,29 +1176,28 @@ def get_model_path(model: Union[str, Path], revision: Optional[str] = None):
 
     if envs.VLLM_USE_MODELSCOPE:
         from modelscope.hub.snapshot_download import snapshot_download
+
         return snapshot_download(model_id=model, **common_kwargs)
 
     from huggingface_hub import snapshot_download
+
     return snapshot_download(repo_id=model, **common_kwargs)
 
 
-def get_hf_file_bytes(file_name: str,
-                      model: Union[str, Path],
-                      revision: Optional[str] = 'main') -> Optional[bytes]:
+def get_hf_file_bytes(
+    file_name: str, model: str | Path, revision: str | None = "main"
+) -> bytes | None:
     """Get file contents from HuggingFace repository as bytes."""
-    file_path = try_get_local_file(model=model,
-                                   file_name=file_name,
-                                   revision=revision)
+    file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
 
     if file_path is None:
-        hf_hub_file = hf_hub_download(model,
-                                      file_name,
-                                      revision=revision,
-                                      token=_get_hf_token())
+        hf_hub_file = hf_hub_download(
+            model, file_name, revision=revision, token=_get_hf_token()
+        )
         file_path = Path(hf_hub_file)
 
     if file_path is not None and file_path.is_file():
-        with open(file_path, 'rb') as file:
+        with open(file_path, "rb") as file:
             return file.read()
 
     return None
diff --git a/vllm/transformers_utils/config_parser_base.py b/vllm/transformers_utils/config_parser_base.py
index c27177f74d4ba8032d6c0ad41a1afa39377d281f..79d47ff5604202ff6954c320fcdac1693fbb0e62 100644
--- a/vllm/transformers_utils/config_parser_base.py
+++ b/vllm/transformers_utils/config_parser_base.py
@@ -3,18 +3,18 @@
 
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Optional, Union
 
 from transformers import PretrainedConfig
 
 
 class ConfigParserBase(ABC):
-
     @abstractmethod
-    def parse(self,
-              model: Union[str, Path],
-              trust_remote_code: bool,
-              revision: Optional[str] = None,
-              code_revision: Optional[str] = None,
-              **kwargs) -> tuple[dict, PretrainedConfig]:
+    def parse(
+        self,
+        model: str | Path,
+        trust_remote_code: bool,
+        revision: str | None = None,
+        code_revision: str | None = None,
+        **kwargs,
+    ) -> tuple[dict, PretrainedConfig]:
         raise NotImplementedError
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 1b33b5e70e0bfdd75320fb65787c56caf06c1cc5..df74fb8c1b56193de6233c478b6b99d4fc980bf2 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -7,50 +7,59 @@ Model configs may be defined in this directory for the following reasons:
 - There is a need to override the existing config to support vLLM.
 """
 
+from vllm.transformers_utils.configs.afmoe import AfmoeConfig
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
+
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
+from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
+from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
 from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
+from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
 from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
 from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
-from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
 from vllm.transformers_utils.configs.olmo3 import Olmo3Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
 from vllm.transformers_utils.configs.radio import RadioConfig
 from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
-from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
-                                                      Step3VisionEncoderConfig,
-                                                      Step3VLConfig)
+from vllm.transformers_utils.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
+    "AfmoeConfig",
     "ChatGLMConfig",
     "DeepseekVLV2Config",
     "DeepseekV3Config",
     "DotsOCRConfig",
     "EAGLEConfig",
+    "FlexOlmoConfig",
     "RWConfig",
     "JAISConfig",
+    "Lfm2MoeConfig",
     "MedusaConfig",
     "MiDashengLMConfig",
     "MLPSpeculatorConfig",
     "MoonViTConfig",
+    "KimiLinearConfig",
     "KimiVLConfig",
     "NemotronConfig",
     "NemotronHConfig",
-    "Nemotron_Nano_VL_Config",
     "Olmo3Config",
     "OvisConfig",
     "RadioConfig",
diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b634fd037a3380b12a756880d31ed8d4540c7f1
--- /dev/null
+++ b/vllm/transformers_utils/configs/afmoe.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class AfmoeConfig(PretrainedConfig):
+    model_type = "afmoe"
+
+    def __init__(
+        self,
+        vocab_size: int = 200_192,
+        hidden_size: int = 2048,
+        intermediate_size: int = 6144,
+        moe_intermediate_size: int = 1408,
+        num_hidden_layers: int = 32,
+        num_dense_layers: int = 1,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int | None = None,
+        head_dim: int = 128,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 131072,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-5,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        rope_theta: float = 10000.0,
+        rope_scaling: dict | None = None,
+        num_experts: int = 64,
+        num_experts_per_tok: int = 6,
+        num_shared_experts: int = 2,
+        num_expert_groups: int = 1,
+        num_limited_groups: int = 1,
+        score_func: str = "sigmoid",
+        route_norm: bool = True,
+        route_scale: float = 1.0,
+        global_attn_every_n_layers: int = 4,
+        sliding_window: int = 2048,
+        layer_types: list[str] | None = None,
+        attention_dropout: float = 0.0,
+        mup_enabled: bool = False,
+        n_group: int = 1,
+        topk_group: int = 1,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_dense_layers = num_dense_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads or num_attention_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_shared_experts = num_shared_experts
+        self.num_expert_groups = num_expert_groups
+        self.num_limited_groups = num_limited_groups
+        self.score_func = score_func
+        self.route_norm = route_norm
+        self.route_scale = route_scale
+
+        self.global_attn_every_n_layers = global_attn_every_n_layers
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        self.attention_dropout = attention_dropout
+
+        self.mup_enabled = mup_enabled
+        self.n_group = n_group
+        self.topk_group = topk_group
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["AfmoeConfig"]
diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py
index a789b93b5edff0f079707c515c3a9e2f2c90e42c..1707e15285c899197f216f6594130a0f72cdfe61 100644
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # Copied from
 # https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
-""" Arctic model configuration"""
+"""Arctic model configuration"""
 
 from dataclasses import asdict, dataclass
 from typing import Any
diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py
index 176d2b8f63fe46ff97e8f4b75a5d54829ffd022c..1d795b55c8bc75d0b8bbebe345a95fd341c5e498 100644
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -13,33 +13,35 @@ class ChatGLMConfig(PretrainedConfig):
         "n_head_kv": "multi_query_group_num",
     }
 
-    def __init__(self,
-                 num_layers=28,
-                 padded_vocab_size=65024,
-                 hidden_size=4096,
-                 ffn_hidden_size=13696,
-                 kv_channels=128,
-                 num_attention_heads=32,
-                 seq_length=2048,
-                 hidden_dropout=0.0,
-                 attention_dropout=0.0,
-                 layernorm_epsilon=1e-5,
-                 rmsnorm=True,
-                 apply_residual_connection_post_layernorm=False,
-                 post_layer_norm=True,
-                 add_bias_linear=False,
-                 add_qkv_bias=False,
-                 interleaved_qkv=False,
-                 bias_dropout_fusion=True,
-                 multi_query_attention=False,
-                 multi_query_group_num=1,
-                 apply_query_key_layer_scaling=True,
-                 attention_softmax_in_fp32=True,
-                 fp32_residual_connection=False,
-                 quantization_bit=0,
-                 pre_seq_len=None,
-                 prefix_projection=False,
-                 **kwargs):
+    def __init__(
+        self,
+        num_layers=28,
+        padded_vocab_size=65024,
+        hidden_size=4096,
+        ffn_hidden_size=13696,
+        kv_channels=128,
+        num_attention_heads=32,
+        seq_length=2048,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        layernorm_epsilon=1e-5,
+        rmsnorm=True,
+        apply_residual_connection_post_layernorm=False,
+        post_layer_norm=True,
+        add_bias_linear=False,
+        add_qkv_bias=False,
+        interleaved_qkv=False,
+        bias_dropout_fusion=True,
+        multi_query_attention=False,
+        multi_query_group_num=1,
+        apply_query_key_layer_scaling=True,
+        attention_softmax_in_fp32=True,
+        fp32_residual_connection=False,
+        quantization_bit=0,
+        pre_seq_len=None,
+        prefix_projection=False,
+        **kwargs,
+    ):
         self.num_layers = num_layers
         self.vocab_size = padded_vocab_size
         self.padded_vocab_size = padded_vocab_size
@@ -55,7 +57,8 @@ class ChatGLMConfig(PretrainedConfig):
         self.layernorm_epsilon = layernorm_epsilon
         self.rmsnorm = rmsnorm
         self.apply_residual_connection_post_layernorm = (
-            apply_residual_connection_post_layernorm)
+            apply_residual_connection_post_layernorm
+        )
         self.post_layer_norm = post_layer_norm
         self.add_bias_linear = add_bias_linear
         self.add_qkv_bias = add_qkv_bias
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index 957d638318410e36fa3684549be09aaf86455a20..8b02a4ddd4bc7a5a11c41490ff129a3204c046b5 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -3,7 +3,7 @@
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
 
-from transformers.configuration_utils import PretrainedConfig
+from transformers import DeepseekV2Config, PretrainedConfig
 
 
 class VisionEncoderConfig(PretrainedConfig):
@@ -25,20 +25,22 @@ class VisionEncoderConfig(PretrainedConfig):
     deterministic: bool = False
     num_recomputing_layers: int = 0
 
-    def __init__(self,
-                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
-                 image_size: int = 384,
-                 patch_size: int = 16,
-                 width: int = 1024,
-                 layers: int = 24,
-                 heads: int = 16,
-                 mlp_ratio: int = 4,
-                 global_pool: str = "map",
-                 ignore_head: bool = True,
-                 class_token: bool = False,
-                 num_classes: int = 0,
-                 use_checkpoint: bool = False,
-                 **kwargs):
+    def __init__(
+        self,
+        model_name: str = "vit_so400m_patch14_siglip_384.webli",
+        image_size: int = 384,
+        patch_size: int = 16,
+        width: int = 1024,
+        layers: int = 24,
+        heads: int = 16,
+        mlp_ratio: int = 4,
+        global_pool: str = "map",
+        ignore_head: bool = True,
+        class_token: bool = False,
+        num_classes: int = 0,
+        use_checkpoint: bool = False,
+        **kwargs,
+    ):
         self.model_name = model_name
         self.image_size = image_size
         self.patch_size = patch_size
@@ -65,14 +67,16 @@ class MlpProjectorConfig(PretrainedConfig):
     downsample_ratio: int = 2
     token_pooling: bool = False
 
-    def __init__(self,
-                 projector_type: str = "downsample_mlp_gelu",
-                 input_dim: int = 1152,
-                 n_embed: int = 2048,
-                 depth: int = 2,
-                 mlp_ratio: int = 1,
-                 downsample_ratio: int = 2,
-                 **kwargs):
+    def __init__(
+        self,
+        projector_type: str = "downsample_mlp_gelu",
+        input_dim: int = 1152,
+        n_embed: int = 2048,
+        depth: int = 2,
+        mlp_ratio: int = 1,
+        downsample_ratio: int = 2,
+        **kwargs,
+    ):
         self.projector_type = projector_type
         self.input_dim = input_dim
         self.n_embed = n_embed
@@ -83,107 +87,6 @@ class MlpProjectorConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class DeepseekV2Config(PretrainedConfig):
-
-    model_type = "deepseek_v2"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=102400,
-        hidden_size=4096,
-        intermediate_size=11008,
-        moe_intermediate_size=1407,
-        num_hidden_layers=30,
-        num_attention_heads=32,
-        num_key_value_heads=32,
-        n_shared_experts=None,
-        n_routed_experts=None,
-        ep_size=1,
-        routed_scaling_factor=1.0,
-        kv_lora_rank=512,
-        q_lora_rank=1536,
-        qk_rope_head_dim=64,
-        v_head_dim=128,
-        qk_nope_head_dim=128,
-        topk_method='gready',
-        n_group=None,
-        topk_group=None,
-        num_experts_per_tok=None,
-        moe_layer_freq=1,
-        first_k_dense_replace=0,
-        norm_topk_prob=False,
-        scoring_func='softmax',
-        aux_loss_alpha=0.001,
-        seq_aux=True,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=100000,
-        eos_token_id=100001,
-        pretraining_tp=1,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        use_mla=True,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.ep_size = ep_size
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.topk_method = topk_method
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        self.aux_loss_alpha = aux_loss_alpha
-        self.seq_aux = seq_aux
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = float(rms_norm_eps)
-        self.pretraining_tp = pretraining_tp
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.use_mla = use_mla
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-
 class DeepseekVLV2Config(PretrainedConfig):
     model_type = "deepseek_vl_v2"
     vision_config: VisionEncoderConfig
@@ -191,14 +94,15 @@ class DeepseekVLV2Config(PretrainedConfig):
 
     tile_tag: str = "2D"
     global_view_pos: str = "head"
-    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )
-
-    def __init__(self,
-                 tile_tag: str = "tile_tag",
-                 global_view_pos: str = "head",
-                 candidate_resolutions: tuple[tuple[int,
-                                                    int]] = ((384, 384), ),
-                 **kwargs):
+    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)
+
+    def __init__(
+        self,
+        tile_tag: str = "tile_tag",
+        global_view_pos: str = "head",
+        candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
+        **kwargs,
+    ):
         super().__init__(**kwargs)
 
         vision_config = kwargs.get("vision_config", {})
@@ -214,3 +118,9 @@ class DeepseekVLV2Config(PretrainedConfig):
         self.global_view_pos = global_view_pos
         self.candidate_resolutions = candidate_resolutions
         self.vocab_size = self.text_config.vocab_size
+
+        # update model_type for OCR model
+        if "DeepseekOCRForCausalLM" in (
+            self.architectures or kwargs.get("architectures", [])
+        ):
+            self.model_type = "deepseek_ocr"
diff --git a/vllm/transformers_utils/configs/dotsocr.py b/vllm/transformers_utils/configs/dotsocr.py
index 6bb3c12d9c7eb83ba7478258d215077f8fc8e389..1e42cb2fd8594a27bbd7091c69a18f834fdc47fe 100644
--- a/vllm/transformers_utils/configs/dotsocr.py
+++ b/vllm/transformers_utils/configs/dotsocr.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.qwen2 import Qwen2Config
@@ -53,12 +53,14 @@ class DotsVisionConfig(PretrainedConfig):
 class DotsOCRConfig(Qwen2Config):
     model_type = "dots_ocr"
 
-    def __init__(self,
-                 image_token_id=151665,
-                 video_token_id=151656,
-                 vision_config: Optional[dict] = None,
-                 *args,
-                 **kwargs):
+    def __init__(
+        self,
+        image_token_id=151665,
+        video_token_id=151656,
+        vision_config: dict | None = None,
+        *args,
+        **kwargs,
+    ):
         super().__init__(*args, **kwargs)
         self.image_token_id = image_token_id
         self.video_token_id = video_token_id
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 444ed70de3d0ce60c4c01b518af17041b535f778..f5dc9ddfbc5758182e53733673519fc951f9a350 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -2,37 +2,28 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional, Union
 
-from transformers import AutoConfig, PretrainedConfig
-
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
+from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
 
 
 class EAGLEConfig(PretrainedConfig):
     model_type = "eagle"
 
-    def __init__(self,
-                 model: Union[PretrainedConfig, dict, None] = None,
-                 truncated_vocab_size: Optional[int] = None,
-                 method: Optional[str] = 'eagle',
-                 **kwargs):
-
-        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
+    def __init__(
+        self,
+        model: PretrainedConfig | dict | None = None,
+        truncated_vocab_size: int | None = None,
+        method: str | None = "eagle",
+        **kwargs,
+    ):
+        model_config: PretrainedConfig | DeepseekV2Config | None
         if isinstance(model, dict):
-            archs = model.get("architectures", [])
-            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
-            if any(target_arch in archs for target_arch in target_archs):
-                # AutoConfig does not support DeepSeek MoE models yet
-                model_config = DeepseekV2Config(**model)
-            else:
-                model_config = AutoConfig.for_model(**model)
+            model_config = AutoConfig.for_model(**model)
         else:
             model_config = model
 
         for k, v in kwargs.items():
-            if k != "architectures" and k != "model_type" and hasattr(
-                    model_config, k):
+            if k != "architectures" and k != "model_type" and hasattr(model_config, k):
                 setattr(model_config, k, v)
 
         self.model = model_config
@@ -40,31 +31,39 @@ class EAGLEConfig(PretrainedConfig):
         if self.model is None:
             self.truncated_vocab_size = None
         else:
-            self.truncated_vocab_size = self.model.vocab_size if \
-                truncated_vocab_size is None else truncated_vocab_size
+            self.truncated_vocab_size = (
+                self.model.vocab_size
+                if truncated_vocab_size is None
+                else truncated_vocab_size
+            )
 
         # Eagle model name should follow naming convention of
         # LlamaForCausalLM -> EagleLlamaForCausalLM
         # LlamaForCausalLM -> Eagle3LlamaForCausalLM
         # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
         if method == "eagle":
-            assert self.model is not None, \
+            assert self.model is not None, (
                 "model should not be None when method is eagle"
+            )
             kwargs["architectures"] = [
-                f"Eagle{arch}" if not arch.startswith("Eagle") \
-                    else arch for arch in self.model.architectures
+                f"Eagle{arch}" if not arch.startswith("Eagle") else arch
+                for arch in self.model.architectures
             ]
 
         elif method == "eagle3":
-            assert self.model is not None, \
+            assert self.model is not None, (
                 "model should not be None when method is eagle3"
+            )
             kwargs["architectures"] = [
-                arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
-                else f"Eagle3{arch}" for arch in self.model.architectures
+                arch
+                if arch.startswith("Eagle3") or arch.endswith("Eagle3")
+                else f"Eagle3{arch}"
+                for arch in self.model.architectures
             ]
         else:
-            raise ValueError(f"Invalid method {method}. "
-                             "Supported methods are eagle and eagle3.")
+            raise ValueError(
+                f"Invalid method {method}. Supported methods are eagle and eagle3."
+            )
 
         super().__init__(**kwargs)
 
@@ -76,9 +75,10 @@ class EAGLEConfig(PretrainedConfig):
     @classmethod
     def from_pretrained(
         cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
+        pretrained_model_name_or_path: str | os.PathLike,
         **kwargs,
     ) -> "EAGLEConfig":
         config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs)
+            pretrained_model_name_or_path, **kwargs
+        )
         return cls.from_dict(config_dict, **kwargs)
diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py
index 2f5400463d91adf86cd9767a5fba17c825fee8ac..c646d241d4eb0e710b7598c0db4655498c84c56a 100644
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@@ -19,6 +19,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Falcon configuration"""
+
 from transformers.configuration_utils import PretrainedConfig
 
 
@@ -77,9 +78,7 @@ class RWConfig(PretrainedConfig):
             # Hack for falcon-40b
             self.new_decoder_architecture = True
 
-        super().__init__(bos_token_id=bos_token_id,
-                         eos_token_id=eos_token_id,
-                         **kwargs)
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
 
     @property
     def head_dim(self):
diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f4d446288bb910b09f8d65105acf1dbf1c0ed
--- /dev/null
+++ b/vllm/transformers_utils/configs/flex_olmo.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class FlexOlmoConfig(PretrainedConfig):
+    model_type = "flex_olmo"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=100352,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        use_cache=True,
+        pad_token_id=100277,
+        bos_token_id=None,
+        eos_token_id=100257,
+        tie_word_embeddings=False,
+        rope_theta=500000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_experts_per_tok=5,
+        num_experts=7,
+        output_router_logits=False,
+        router_aux_loss_coef=0.01,
+        norm_topk_prob=False,
+        **kwargs,
+    ):
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["FlexOlmoForCausalLM"]
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.norm_topk_prob = norm_topk_prob
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index 3f50638f16b537520f6a6466440162a07b076cca..6b581bf1877556a524791e037bbeeb9f00035afe 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -75,7 +75,7 @@ class JAISConfig(PretrainedConfig):
             Whether or not the model should return the last key/values
             attentions (not used by all models).
         scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
-            Whether to additionally scale attention weights 
+            Whether to additionally scale attention weights
             by `1 / layer_idx + 1`.
         reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
             Whether to scale keys (K) prior to computing attention
@@ -209,29 +209,35 @@ class JAISConfig(PretrainedConfig):
         if self.alibi_scaling is None:
             return
 
-        if (not isinstance(self.alibi_scaling, dict)
-                or len(self.alibi_scaling) != 2):
+        if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
             raise ValueError(
                 "`alibi_scaling` must be a dictionary with two fields, "
                 "`type` and `factor` or `type` and `train_seq_len`, "
-                f"got {self.alibi_scaling}")
+                f"got {self.alibi_scaling}"
+            )
         alibi_scaling_type = self.alibi_scaling.get("type", None)
         alibi_scaling_factor = self.alibi_scaling.get("factor", None)
         alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
         if alibi_scaling_type is None or alibi_scaling_type != "linear":
-            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
-                             f"got {alibi_scaling_type}")
-        if (alibi_scaling_factor is not None
-                and not isinstance(alibi_scaling_factor, float)
-                or (alibi_scaling_factor is not None
-                    and alibi_scaling_factor <= 1.0)):
+            raise ValueError(
+                f"`alibi_scaling`'s type field must be 'linear', "
+                f"got {alibi_scaling_type}"
+            )
+        if (
+            alibi_scaling_factor is not None
+            and not isinstance(alibi_scaling_factor, float)
+            or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
+        ):
             raise ValueError(
                 f"`alibi_scaling`'s factor field must be a float > 1.0, "
-                f"got {alibi_scaling_factor}")
-        if (alibi_dynamic_scaling is not None
-                and not isinstance(alibi_dynamic_scaling, int)
-                or (alibi_dynamic_scaling is not None
-                    and alibi_dynamic_scaling <= 1)):
+                f"got {alibi_scaling_factor}"
+            )
+        if (
+            alibi_dynamic_scaling is not None
+            and not isinstance(alibi_dynamic_scaling, int)
+            or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
+        ):
             raise ValueError(
                 f"`alibi_scaling`'s `train_seq_len` field must be an "
-                f"integer > 1, got {alibi_dynamic_scaling}")
+                f"integer > 1, got {alibi_dynamic_scaling}"
+            )
diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..65ddf48c5249b24ce83cace3dedd36e6c6b93fd9
--- /dev/null
+++ b/vllm/transformers_utils/configs/kimi_linear.py
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class KimiLinearConfig(PretrainedConfig):
+    model_type = "kimi_linear"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        model_type="kimi_linear",
+        vocab_size=163840,
+        hidden_size=4096,
+        head_dim=None,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        tie_word_embeddings=False,
+        moe_intermediate_size: int | None = None,
+        moe_renormalize: bool = True,
+        moe_router_activation_func: str = "sigmoid",
+        num_experts: int | None = None,
+        num_experts_per_token: int | None = None,
+        num_shared_experts: int = 0,
+        routed_scaling_factor: float = 1.0,
+        first_k_dense_replace: int = 0,
+        moe_layer_freq: int = 1,
+        use_grouped_topk: bool = True,
+        num_expert_group: int = 1,
+        topk_group: int = 1,
+        q_lora_rank: int | None = None,
+        kv_lora_rank: int | None = None,
+        qk_nope_head_dim: int | None = None,
+        qk_rope_head_dim: int | None = None,
+        v_head_dim: int | None = None,
+        mla_use_nope: bool | None = False,
+        num_nextn_predict_layers: int = 0,
+        linear_attn_config: dict | None = None,
+        **kwargs,
+    ):
+        self.model_type = model_type
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.head_dim = (
+            head_dim if head_dim is not None else hidden_size // num_attention_heads
+        )
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.mla_use_nope = mla_use_nope
+        # moe config
+        self.num_experts = num_experts
+        self.num_experts_per_token = num_experts_per_token
+        self.moe_renormalize = moe_renormalize
+        self.num_shared_experts = num_shared_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.moe_router_activation_func = moe_router_activation_func
+        assert self.moe_router_activation_func in ("softmax", "sigmoid")
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.moe_layer_freq = moe_layer_freq
+        self.use_grouped_topk = use_grouped_topk
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+
+        if linear_attn_config is not None:
+            assert linear_attn_config["kda_layers"] is not None
+            assert linear_attn_config["full_attn_layers"] is not None
+        self.linear_attn_config = linear_attn_config
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def is_mla(self):
+        return (
+            self.q_lora_rank is not None
+            or self.kv_lora_rank is not None
+            or self.qk_nope_head_dim is not None
+            or self.qk_rope_head_dim is not None
+            or self.v_head_dim is not None
+            or self.mla_use_nope is True
+        )
+
+    @property
+    def is_moe(self):
+        return self.num_experts is not None
+
+    @property
+    def is_linear_attn(self) -> bool:
+        return not (
+            self.linear_attn_config is None
+            or (
+                isinstance(self.linear_attn_config, dict)
+                and self.linear_attn_config["kda_layers"] is not None
+                and len(self.linear_attn_config["kda_layers"]) == 0
+            )
+        )
+
+    def is_kda_layer(self, layer_idx: int):
+        return (
+            self.linear_attn_config is not None
+            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
+        )
diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py
index ae8dac0f381d658879c82b4fcaad3f97ca46a049..6d992464cbe81f8e33df94dd9b38a2141f5a4749 100644
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -1,24 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
-from typing import Optional, Union
 
+from transformers import DeepseekV2Config
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 
 
 class KimiVLConfig(PretrainedConfig):
     model_type = "kimi_vl"
 
-    def __init__(self,
-                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
-                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
-                 ignore_index: int = -100,
-                 media_placeholder_token_id: int = 163605,
-                 pad_token_id: int = 0,
-                 **kwargs):
+    def __init__(
+        self,
+        vision_config: dict | MoonViTConfig | None = None,
+        text_config: dict | DeepseekV2Config | None = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs,
+    ):
         if vision_config is None:
             vision_config = MoonViTConfig()
         elif isinstance(vision_config, dict):
diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..37c038e12db80436ae2606846a4032fbd1c469eb
--- /dev/null
+++ b/vllm/transformers_utils/configs/lfm2_moe.py
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Lfm2MoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LFM2-8B-A1B model.
+    e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Lfm2Model`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 7168):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 1792):
+            Intermediate size of the routed expert.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the conv layers.
+        conv_L_cache (`int`, *optional*, defaults to 3):
+            L_cache dim in the conv layers.
+        num_dense_layers (`int`, *optional*, defaults to 2):
+            Number of dense Lfm2MoeMLP layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 32):
+            Number of routed experts.
+        use_expert_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use the expert bias on the routing weights.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts in MoE models.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        layer_types (`Optional`, *optional*):
+            Type of each layers.
+
+    ```python
+    >>> from transformers import Lfm2MoeModel, Lfm2MoeConfig
+
+    >>> # Initializing a LFM2 Moe model
+    >>> configuration = Lfm2MoeConfig()
+
+    >>> # Initializing a model from the LFM2-8B-A1B style configuration
+    >>> model = Lfm2MoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""  # noqa: E501
+
+    model_type = "lfm2_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        hidden_size: int = 2048,
+        intermediate_size: int = 7168,
+        moe_intermediate_size: int = 1792,
+        num_hidden_layers: int = 32,
+        pad_token_id: int = 0,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = True,
+        rope_theta: float = 1000000.0,
+        max_position_embeddings: int = 128_000,
+        use_cache: bool = True,
+        norm_eps: float = 0.00001,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        conv_bias: bool = False,
+        conv_L_cache: int = 3,
+        num_dense_layers: int = 2,
+        num_experts_per_tok: int = 4,
+        num_experts: int = 32,
+        use_expert_bias: bool = True,
+        routed_scaling_factor: float = 1.0,
+        norm_topk_prob: bool = True,
+        layer_types: list[str] | None = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.norm_eps = norm_eps
+
+        # attn operator config
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        # custom operator config
+        self.conv_bias = conv_bias
+        self.conv_L_cache = conv_L_cache
+
+        # moe config
+        self.num_dense_layers = num_dense_layers
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.use_expert_bias = use_expert_bias
+        self.routed_scaling_factor = routed_scaling_factor
+        self.norm_topk_prob = norm_topk_prob
+        self.layer_types = layer_types
+
+        tie_word_embeddings = kwargs.get(
+            "tie_embedding", tie_word_embeddings
+        )  # to fit original config keys
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Lfm2MoeConfig"]
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py
index 9ba52956a8e8efc80a59c90f7f66a07234da690b..bfa0f30e8961f5ac5f8198ad6055ad70f653562d 100644
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional, Union
 
 from transformers import PretrainedConfig
 
@@ -10,16 +9,17 @@ from transformers import PretrainedConfig
 class MedusaConfig(PretrainedConfig):
     model_type = "medusa"
 
-    def __init__(self,
-                 hidden_size: int = 4096,
-                 vocab_size: int = 32001,
-                 num_heads: int = 5,
-                 num_hidden_layers: int = 1,
-                 max_paths: int = 64,
-                 topk: int = 10,
-                 truncated_vocab_size: Optional[int] = None,
-                 **kwargs):
-
+    def __init__(
+        self,
+        hidden_size: int = 4096,
+        vocab_size: int = 32001,
+        num_heads: int = 5,
+        num_hidden_layers: int = 1,
+        max_paths: int = 64,
+        topk: int = 10,
+        truncated_vocab_size: int | None = None,
+        **kwargs,
+    ):
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
         self.num_heads = num_heads
@@ -27,8 +27,9 @@ class MedusaConfig(PretrainedConfig):
         self.max_paths = max_paths
         self.topk = topk
         self.max_seq_len = int(2**20)
-        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
-            else truncated_vocab_size
+        self.truncated_vocab_size = (
+            vocab_size if truncated_vocab_size is None else truncated_vocab_size
+        )
         if "architectures" not in kwargs:
             kwargs["architectures"] = ["MedusaModel"]
 
@@ -37,16 +38,17 @@ class MedusaConfig(PretrainedConfig):
     @classmethod
     def from_pretrained(
         cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
+        pretrained_model_name_or_path: str | os.PathLike,
         **kwargs,
     ) -> "MedusaConfig":
         config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs)
+            pretrained_model_name_or_path, **kwargs
+        )
         for k in list(config_dict.keys()):
-            if 'num' in k:
-                if 'heads' in k:
+            if "num" in k:
+                if "heads" in k:
                     config_dict["num_heads"] = config_dict.pop(k)
-                elif 'layers' in k:
+                elif "layers" in k:
                     config_dict["num_hidden_layers"] = config_dict.pop(k)
         return cls.from_dict(config_dict, **kwargs)
 
diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py
index 1c23202e23c8e534c78319ce12c9a45da731be2a..e49bd26b2b00c2fa609eed7a1e4e74a011ddf2e2 100644
--- a/vllm/transformers_utils/configs/midashenglm.py
+++ b/vllm/transformers_utils/configs/midashenglm.py
@@ -21,11 +21,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional, Union
 
 from transformers import PretrainedConfig
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
-    Qwen2_5OmniTextConfig)
+    Qwen2_5OmniTextConfig,
+)
 
 
 class DashengConfig(PretrainedConfig):
@@ -35,15 +35,15 @@ class DashengConfig(PretrainedConfig):
         self,
         embed_dim: int = 768,
         outputdim: int = 527,
-        patch_size: Union[int, tuple[int, int]] = 16,
-        patch_stride: Union[int, tuple[int, int]] = 16,
+        patch_size: int | tuple[int, int] = 16,
+        patch_stride: int | tuple[int, int] = 16,
         input_channels: int = 1,
         target_length: int = 1012,
         depth: int = 12,
         num_heads: int = 12,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = True,
-        init_values: Optional[float] = None,
+        init_values: float | None = None,
         drop_rate: float = 0.0,
         attn_drop_rate: float = 0.0,
         f_min: float = 0.0,
@@ -85,17 +85,19 @@ class MiDashengLMConfig(PretrainedConfig):
 
     def __init__(
         self,
-        audio_encoder_config: Optional[dict] = None,
+        audio_encoder_config: dict | None = None,
         subsample_factor: int = 5,
-        text_config: Optional[dict] = None,
-        audio_token_id: Optional[int] = None,
+        text_config: dict | None = None,
+        audio_token_id: int | None = None,
         **kwargs,
     ):
-        self.audio_encoder_config = DashengConfig(
-            **(audio_encoder_config or {}))
+        self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
         self.subsample_factor = subsample_factor
-        self.text_config = (Qwen2_5OmniTextConfig(
-            **text_config) if text_config else Qwen2_5OmniTextConfig())
+        self.text_config = (
+            Qwen2_5OmniTextConfig(**text_config)
+            if text_config
+            else Qwen2_5OmniTextConfig()
+        )
         self.text_config.rope_scaling = None  # uses_mrope is false
         self.audio_token_id = audio_token_id
         super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 5d9206e188322f0626020b5953cbfde24b9e4040..c6f04febe37e10d2b8ffd6949c9c28044de32d1d 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -9,8 +9,7 @@ from vllm.logger import init_logger
 logger = init_logger(__name__)
 
 
-def adapt_config_dict(config_dict: dict[str, Any],
-                      **kwargs) -> PretrainedConfig:
+def adapt_config_dict(config_dict: dict[str, Any], **kwargs) -> PretrainedConfig:
     config_dict.update(kwargs)
     config_dict = _remap_general_mistral_args(config_dict)
 
@@ -25,15 +24,28 @@ def adapt_config_dict(config_dict: dict[str, Any],
     if bool(config_dict.get("yarn")):
         config_dict = _remap_mistral_yarn_args(config_dict)
 
-    is_vision = ((config_dict.get("multimodal")
-                  or {}).get("vision_encoder_args")
-                 or config_dict.get("vision_encoder"))
+    if bool(config_dict.get("llama_4_scaling")):
+        llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
+        assert all(
+            [
+                key in config_dict["llama_4_scaling"]
+                for key in llama_4_scaling_config_keys
+            ]
+        ), (
+            "llama_4_scaling config should define the keys: "
+            f"{','.join(llama_4_scaling_config_keys)}"
+        )
+
+    is_vision = (config_dict.get("multimodal") or {}).get(
+        "vision_encoder_args"
+    ) or config_dict.get("vision_encoder")
     is_audio = bool(
-        ((config_dict.get("multimodal") or {}).get("whisper_model_args")
-         or {}).get("encoder_args"))
+        ((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
+            "encoder_args"
+        )
+    )
 
-    assert not (is_vision and is_audio), \
-        "Vision and audio are mutually exclusive"
+    assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
 
     if is_vision:
         config_dict = _remap_mistral_vision_args(config_dict)
@@ -66,19 +78,24 @@ def _remap_mistral_vision_args(config: dict) -> dict:
 
 
 def _remap_mistral_yarn_args(config: dict) -> dict:
-    # Direct remaps: yarn.X -> rope_scaling.Y
-    # Source keys are from mistral.model.args.YarnArgs
-    _map = {
+    yarn_config_map = {
+        "factor": "factor",
+        "original_max_position_embeddings": "original_max_position_embeddings",
         "beta": "beta_fast",
         "alpha": "beta_slow",
+        "apply_scale": "apply_yarn_scaling",
     }
     yarn_config = config.get("yarn") or {}
-    renamed_yarn_config = {_map.get(k, k): v for k, v in yarn_config.items()}
     config["rope_scaling"] = {
         "rope_type": "yarn",
-        "mscale_all_dim": 1,  # We hardcoded this to 1
-        **renamed_yarn_config
+        "mscale_all_dim": 1,
     }
+    for old_name, new_name in yarn_config_map.items():
+        if old_name in yarn_config:
+            config["rope_scaling"][new_name] = yarn_config.pop(old_name)
+
+    assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
+
     return config
 
 
@@ -105,8 +122,7 @@ def _remap_general_mistral_args(config: dict) -> dict:
         if key in config:
             config[new_key] = config.pop(key)
 
-    for new_key, (key,
-                  default_value) in top_level_mapping_with_default.items():
+    for new_key, (key, default_value) in top_level_mapping_with_default.items():
         config[new_key] = config.pop(key, default_value)
 
     return config
@@ -116,16 +132,12 @@ def _remap_mistral_quantization_args(config: dict) -> dict:
     quantization = config.get("quantization", {})
     if quantization.get("qformat_weight") == "fp8_e4m3":
         # This maps to the FP8 static per-tensor quantization scheme
-        quantization_config = {
-            "quant_method": "fp8",
-            "activation_scheme": "static"
-        }
+        quantization_config = {"quant_method": "fp8", "activation_scheme": "static"}
     elif quantization.get("quant_method") == "compressed-tensors":
         # Pass through the quantization config to compressed-tensors
         quantization_config = quantization
     else:
-        raise ValueError(
-            f"Found unknown quantization='{quantization}' in config")
+        raise ValueError(f"Found unknown quantization='{quantization}' in config")
 
     config["quantization_config"] = quantization_config
 
@@ -139,13 +151,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
 
     quant_config = config.get("quantization_config")
     config = {
-        "model_type":
-        "whixtral",
+        "model_type": "whixtral",
         "architectures": ["VoxtralForConditionalGeneration"],
-        "text_config":
-        PretrainedConfig.from_dict(config),
-        "audio_config":
-        WhisperConfig(
+        "text_config": PretrainedConfig.from_dict(config),
+        "audio_config": WhisperConfig(
             num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
             window_size=encoder_args["audio_encoding_args"]["window_size"],
             sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
@@ -158,7 +167,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
             vocab_size=encoder_args["vocab_size"],
             max_source_positions=encoder_args["max_source_positions"],
             is_encoder_decoder=False,  # Override WhisperConfig default
-        )
+        ),
     }
     if quant_config:
         config["quantization_config"] = quant_config
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index 2fa284e5c9e8f26ca90627057e9e316b7d6ae8ac..75745f227f48293362f4695bf72d34856cf65d41 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from transformers import PretrainedConfig
 
@@ -13,16 +12,18 @@ class MLPSpeculatorConfig(PretrainedConfig):
         "hidden_size": "emb_dim",
     }
 
-    def __init__(self,
-                 vocab_size: int = 32000,
-                 emb_dim: int = 4096,
-                 inner_dim: int = 0,
-                 n_predict: int = 3,
-                 top_k_tokens_per_head: Optional[list[int]] = None,
-                 n_candidates: int = 5,
-                 tie_weights: bool = False,
-                 scale_input: bool = False,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size: int = 32000,
+        emb_dim: int = 4096,
+        inner_dim: int = 0,
+        n_predict: int = 3,
+        top_k_tokens_per_head: list[int] | None = None,
+        n_candidates: int = 5,
+        tie_weights: bool = False,
+        scale_input: bool = False,
+        **kwargs,
+    ):
         """
         Initialize an MLPSpeculatorConfig
 
diff --git a/vllm/transformers_utils/configs/moonvit.py b/vllm/transformers_utils/configs/moonvit.py
index a6f712f3d60059875980e02acaf99a365985f92a..6e9b2897f4cc7c9ce90549ebfe7b0811de55f5ae 100644
--- a/vllm/transformers_utils/configs/moonvit.py
+++ b/vllm/transformers_utils/configs/moonvit.py
@@ -8,16 +8,16 @@ class MoonViTConfig(PretrainedConfig):
     model_type = "moonvit"
 
     def __init__(
-            self,
-            patch_size: int = 14,
-            init_pos_emb_height: int = 64,
-            init_pos_emb_width: int = 64,
-            num_attention_heads: int = 16,
-            num_hidden_layers: int = 27,
-            hidden_size: int = 1152,
-            intermediate_size: int = 4304,
-            merge_kernel_size: tuple[int, int] = (2, 2),
-            **kwargs,
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.patch_size = patch_size
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 090fefa14203e8625e42fdc26fcdc589f910f348..60eed549561fb7787a06c8f7b9c230b3a2e09c78 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -62,7 +62,7 @@ class NemotronConfig(PretrainedConfig):
             (MQA) otherwise GQA is used. When converting a multi-head
             checkpoint to a GQA checkpoint, each group key and value
             head should be constructed by meanpooling all the original
-            heads within that group. For more details checkout 
+            heads within that group. For more details checkout
             [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
             is not specified, will default to `num_attention_heads`.
         hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
@@ -147,8 +147,9 @@ class NemotronConfig(PretrainedConfig):
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         head_dim = head_dim or kwargs.get("kv_channels")
-        self.head_dim = head_dim if head_dim is not None else (
-            hidden_size // num_attention_heads)
+        self.head_dim = (
+            head_dim if head_dim is not None else (hidden_size // num_attention_heads)
+        )
 
         # for backward compatibility
         if num_key_value_heads is None:
@@ -162,8 +163,11 @@ class NemotronConfig(PretrainedConfig):
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         # for backward compatibility
-        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
-            "rope_percentage") or partial_rotary_factor
+        partial_rotary_factor = (
+            kwargs.get("rope_percent")
+            or kwargs.get("rope_percentage")
+            or partial_rotary_factor
+        )
         self.partial_rotary_factor = partial_rotary_factor
         self._rope_scaling_validation()
         self.attention_bias = attention_bias
@@ -185,21 +189,24 @@ class NemotronConfig(PretrainedConfig):
         if self.rope_scaling is None:
             return
 
-        if not isinstance(self.rope_scaling, dict) or len(
-                self.rope_scaling) != 2:
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
                 "`rope_scaling` must be a dictionary with two fields, "
-                f"`type` and `factor`, got {self.rope_scaling}")
+                f"`type` and `factor`, got {self.rope_scaling}"
+            )
         rope_scaling_type = self.rope_scaling.get("type", None)
         rope_scaling_factor = self.rope_scaling.get("factor", None)
-        if rope_scaling_type is None or rope_scaling_type not in [
-                "linear", "dynamic"
-        ]:
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
             raise ValueError(
                 "`rope_scaling`'s type field must be one of ['linear', "
-                f"'dynamic'], got {rope_scaling_type}")
-        if rope_scaling_factor is None or not isinstance(
-                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+                f"'dynamic'], got {rope_scaling_type}"
+            )
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, float)
+            or rope_scaling_factor <= 1.0
+        ):
             raise ValueError(
                 "`rope_scaling`'s factor field must be a float > 1, got "
-                f"{rope_scaling_factor}")
\ No newline at end of file
+                f"{rope_scaling_factor}"
+            )
diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py
index 581bed5716c1c3370f638092a2ff5ab395813ca4..68c40002098c8868cf21ca65ab3abf4f1bdf2406 100644
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -185,6 +185,15 @@ class NemotronHConfig(PretrainedConfig):
         mamba_proj_bias=False,
         mamba_chunk_size=256,
         rescale_prenorm_residual=True,
+        n_routed_experts=8,
+        n_shared_experts=1,
+        moe_intermediate_size=7688,
+        moe_shared_expert_intermediate_size=7688,
+        num_experts_per_tok=2,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        norm_topk_prob=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -203,11 +212,11 @@ class NemotronHConfig(PretrainedConfig):
         # Validate hybrid_override_pattern
         # M: Mamba2, *: Attention, -: MLP
         assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
-            "hybrid_override_pattern must have same length as "
-            "num_hidden_layers")
+            "hybrid_override_pattern must have same length as num_hidden_layers"
+        )
         assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
-            "hybrid_override_pattern must only contain characters "
-            "'M', '*', or '-'")
+            "hybrid_override_pattern must only contain characters 'M', '*', or '-'"
+        )
 
         # for backward compatibility
         if num_key_value_heads is None:
@@ -241,6 +250,15 @@ class NemotronHConfig(PretrainedConfig):
         self.mamba_proj_bias = mamba_proj_bias
         self.chunk_size = mamba_chunk_size
         self.rescale_prenorm_residual = rescale_prenorm_residual
+        self.n_routed_experts = n_routed_experts
+        self.n_shared_experts = n_shared_experts
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size  # noqa: E501
+        self.num_experts_per_tok = num_experts_per_tok
+        self.routed_scaling_factor = routed_scaling_factor
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.norm_topk_prob = norm_topk_prob
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -253,7 +271,12 @@ class NemotronHConfig(PretrainedConfig):
     @property
     def layers_block_type(self):
         return [
-            "mamba" if self.hybrid_override_pattern[i] == "M" else
-            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
+            "mamba"
+            if self.hybrid_override_pattern[i] == "M"
+            else "attention"
+            if self.hybrid_override_pattern[i] == "*"
+            else "mlp"
+            if self.hybrid_override_pattern[i] == "-"
+            else "moe"
             for i in range(self.num_hidden_layers)
         ]
diff --git a/vllm/transformers_utils/configs/nemotron_vl.py b/vllm/transformers_utils/configs/nemotron_vl.py
deleted file mode 100644
index 6a642f26b82a2804dd4439bfb44ceed078a5434f..0000000000000000000000000000000000000000
--- a/vllm/transformers_utils/configs/nemotron_vl.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# yapf: disable
-# ruff: noqa: E501
-# Adapted from
-# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
-# --------------------------------------------------------
-# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
-#     LICENSE is in incl_licenses directory.
-# --------------------------------------------------------
-
-from transformers import LlamaConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
-
-
-class Nemotron_Nano_VL_Config(PretrainedConfig):
-    model_type = 'Llama_Nemotron_Nano_VL'
-    is_composition = True
-
-    def __init__(
-        self,
-        vision_config=None,
-        llm_config=None,
-        force_image_size=None,
-        downsample_ratio=0.5,
-        template=None,
-        ps_version='v1',
-        image_tag_type="internvl",
-        projector_hidden_size=4096,
-        vit_hidden_size=1280,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        if vision_config is not None:
-            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
-            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
-            self.vision_config = vision_auto_config(**vision_config)
-        else:
-            self.vision_config = PretrainedConfig()
-
-        if llm_config is None:
-            self.text_config = LlamaConfig()
-        else:
-            self.text_config = LlamaConfig(**llm_config)
-
-        # Assign configuration values
-        self.force_image_size = force_image_size
-        self.downsample_ratio = downsample_ratio
-        self.template = template  # TODO move out of here and into the tokenizer
-        self.ps_version = ps_version  # Pixel shuffle version
-        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
-        self.projector_hidden_size = projector_hidden_size
-        self.vit_hidden_size = vit_hidden_size
diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py
index 874507db43a7f40c7030a001d13641d518bcb3a0..f5a9a7cd36bdb7966c7803ff45f29ef45f74a792 100644
--- a/vllm/transformers_utils/configs/olmo3.py
+++ b/vllm/transformers_utils/configs/olmo3.py
@@ -5,7 +5,6 @@ from transformers.configuration_utils import PretrainedConfig
 
 
 class Olmo3Config(PretrainedConfig):
-
     model_type = "olmo3"
     keys_to_ignore_at_inference = ["past_key_values"]
 
diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py
index 550f5e15dbcc21ef3180d6f1e045a11a01b0eaf4..294b4c9037aafc50a6dd070949a5aeb3e9d109e4 100644
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# yapf: disable
 # ruff: noqa: E501
 # adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
 # and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
 # Ovis Config with AimV2 config registration removed for Transformers compatibility
-from typing import Any, Optional, Union
+from typing import Any
 
 from transformers import AutoConfig, PretrainedConfig
 
@@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
 #                     Visual Tokenizer Configuration
 # ----------------------------------------------------------------------
 class BaseVisualTokenizerConfig(PretrainedConfig):
-
-    def __init__(self,
-                 vocab_size=16384,
-                 tokenize_function="softmax",
-                 tau=1.0,
-                 depths=None,
-                 drop_cls_token=False,
-                 backbone_config: Optional[Union[PretrainedConfig,
-                                                 dict]] = None,
-                 hidden_stride: int = 1,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=16384,
+        tokenize_function="softmax",
+        tau=1.0,
+        depths=None,
+        drop_cls_token=False,
+        backbone_config: PretrainedConfig | dict | None = None,
+        hidden_stride: int = 1,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.tokenize_function = tokenize_function
         self.tau = tau
         if isinstance(depths, str):
-            depths = [int(x) for x in depths.split('|')]
+            depths = [int(x) for x in depths.split("|")]
         self.depths = depths
         self.backbone_kwargs = dict[str, Any]()
         self.drop_cls_token = drop_cls_token
         if backbone_config is not None:
-            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
+            assert isinstance(backbone_config, (PretrainedConfig, dict)), (
                 f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
+            )
             if not isinstance(backbone_config, PretrainedConfig):
-                model_type = backbone_config['model_type']
+                model_type = backbone_config["model_type"]
                 if model_type != "aimv2":
-                    backbone_config.pop('model_type')
-                    backbone_config = AutoConfig.for_model(model_type, **backbone_config)
+                    backbone_config.pop("model_type")
+                    backbone_config = AutoConfig.for_model(
+                        model_type, **backbone_config
+                    )
                 else:
                     backbone_config = AIMv2Config(**backbone_config)
         self.backbone_config = backbone_config
@@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
             self.drop_cls_token = False
         if self.depths:
             assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
 
 
 class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
@@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
             self.drop_cls_token = False
         if self.depths:
             assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
 
 
 AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
@@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
 class OvisConfig(PretrainedConfig):
     model_type = "ovis"
 
-    def __init__(self,
-                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
-                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
-                                                         dict]] = None,
-                 multimodal_max_length=8192,
-                 hidden_size=None,
-                 conversation_formatter_class=None,
-                 llm_attn_implementation=None,
-                 disable_tie_weight=False,
-                 **kwargs):
+    def __init__(
+        self,
+        llm_config: PretrainedConfig | dict | None = None,
+        visual_tokenizer_config: PretrainedConfig | dict | None = None,
+        multimodal_max_length=8192,
+        hidden_size=None,
+        conversation_formatter_class=None,
+        llm_attn_implementation=None,
+        disable_tie_weight=False,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         if llm_config is not None:
-            assert isinstance(llm_config, (PretrainedConfig, dict)), \
+            assert isinstance(llm_config, (PretrainedConfig, dict)), (
                 f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
+            )
             if not isinstance(llm_config, PretrainedConfig):
-                model_type = llm_config['model_type']
-                llm_config.pop('model_type')
+                model_type = llm_config["model_type"]
+                llm_config.pop("model_type")
                 llm_config = AutoConfig.for_model(model_type, **llm_config)
 
         # map llm_config to text_config
         self.text_config = llm_config
         if visual_tokenizer_config is not None:
-            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
+            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
                 f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
+            )
             if not isinstance(visual_tokenizer_config, PretrainedConfig):
-                model_type = visual_tokenizer_config['model_type']
-                visual_tokenizer_config.pop('model_type')
+                model_type = visual_tokenizer_config["model_type"]
+                visual_tokenizer_config.pop("model_type")
                 visual_tokenizer_config = AutoConfig.for_model(
-                    model_type, **visual_tokenizer_config)
+                    model_type, **visual_tokenizer_config
+                )
 
         self.visual_tokenizer_config = visual_tokenizer_config
         self.multimodal_max_length = multimodal_max_length
diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py
index c7af26acd1b9f82e36ae5033c149e6e368ea41c1..21750bde2f87834fdabf4036ecd43f7de3f65ade 100644
--- a/vllm/transformers_utils/configs/qwen3_next.py
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -16,8 +16,7 @@
 # limitations under the License.
 """Qwen3-Next model configuration"""
 
-from transformers.configuration_utils import (PretrainedConfig,
-                                              layer_type_validation)
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
 from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py
index e1d96294d6ade7ec880ae6c5b36758a84961fd0d..2b6544fb273c2b62ef9488b0adf28268fd73fb54 100644
--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Radio vision model configuration"""
 
-from typing import Optional, Union
-
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
@@ -60,9 +58,9 @@ class RadioConfig(PretrainedConfig):
         initializer_factor: float = 1.0,
         hidden_act: str = "gelu",
         max_img_size: int = 2048,
-        norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
-        norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
-        reg_tokens: Optional[int] = None,
+        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
+        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
+        reg_tokens: int | None = None,
         **kwargs,
     ):
         self.model_name = model_name
@@ -81,11 +79,11 @@ class RadioConfig(PretrainedConfig):
         self.initializer_factor = initializer_factor
         self.hidden_act = hidden_act
         self.max_img_size = max_img_size
-        self.norm_mean = list(norm_mean) if isinstance(norm_mean,
-                                                       (tuple,
-                                                        list)) else norm_mean
-        self.norm_std = list(norm_std) if isinstance(norm_std,
-                                                     (tuple,
-                                                      list)) else norm_std
+        self.norm_mean = (
+            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
+        )
+        self.norm_std = (
+            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
+        )
         self.reg_tokens = reg_tokens
         super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py
index efc87b6bcf26fea4474e5ea31e0899ab73ad7420..88bce3d4f79e98f7b8d845bed2549b0967f86bec 100644
--- a/vllm/transformers_utils/configs/speculators/algos.py
+++ b/vllm/transformers_utils/configs/speculators/algos.py
@@ -5,7 +5,6 @@ SUPPORTED_SPECULATORS_TYPES = {}
 
 
 def register_speculator(name):
-
     def decorator(fn):
         SUPPORTED_SPECULATORS_TYPES[name] = fn
         return fn
@@ -17,16 +16,23 @@ def register_speculator(name):
 def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
     """
     Apply Eagle-3 specific configuration transformations.
-    
+
     Eagle-3 specific fields:
     - draft_vocab_size: Size of the draft model's vocabulary
     - target_hidden_size: Hidden size of the target model
     - norm_before_residual: Whether to apply norm before residual connection
+    - eagle_aux_hidden_state_layer_ids: List of layer indices from the base
+        model to use as auxiliary inputs for the Eagle3 drafter. These layers
+        provide intermediate hidden states that help the drafter make better
+        predictions. This is the standard field used in Eagle3 checkpoints.
     """
 
     vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
     if config_dict.get("target_hidden_size") is not None:
         vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
-    vllm_config["norm_before_residual"] = config_dict.get(
-        "norm_before_residual", True)
+    vllm_config["norm_before_residual"] = config_dict.get("norm_before_residual", True)
     vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
+    if config_dict.get("eagle_aux_hidden_state_layer_ids"):
+        vllm_config["eagle_aux_hidden_state_layer_ids"] = config_dict[
+            "eagle_aux_hidden_state_layer_ids"
+        ]
diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py
index 53128b4eecb03e7fd37d307e8f50b27447fa8d66..bf3a5d4131927d9de88ee2003bd839bc85d1ae85 100644
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
-from typing import Any, Union
+from typing import Any
 
 from transformers import PretrainedConfig
 
 from vllm.transformers_utils.configs.speculators.algos import (
-    SUPPORTED_SPECULATORS_TYPES)
+    SUPPORTED_SPECULATORS_TYPES,
+)
 
 __all__ = ["SpeculatorsConfig"]
 
@@ -17,31 +18,31 @@ class SpeculatorsConfig(PretrainedConfig):
     @classmethod
     def from_pretrained(
         cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
+        pretrained_model_name_or_path: str | os.PathLike,
         **kwargs,
     ) -> "SpeculatorsConfig":
         """Load speculators Eagle config and convert to vLLM format."""
-        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
-                                             **kwargs)
+        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         vllm_config = cls.extract_vllm_speculative_config(config_dict)
         return cls(**vllm_config)
 
     @classmethod
     def extract_vllm_speculative_config(
-            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
+        cls, config_dict: dict[str, Any]
+    ) -> dict[str, Any]:
         speculators_model_type = config_dict.get("speculators_model_type")
         if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
             raise ValueError(
                 f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
-                "Please ensure you're loading a speculators-format model.")
+                "Please ensure you're loading a speculators-format model."
+            )
 
         # validate fields
         # TODO: @dsikka - use speculators pydantic model to validate
         cls.validate_speculators_config(config_dict=config_dict)
         # Convert from speculators config -> format that can be ingested by vLLM
-        vllm_config = cls.build_vllm_speculative_config(
-            config_dict=config_dict)
+        vllm_config = cls.build_vllm_speculative_config(config_dict=config_dict)
         # Apply anything specific to the supported algorithm
         algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
         algo_updater(config_dict=config_dict, vllm_config=vllm_config)
@@ -64,11 +65,13 @@ class SpeculatorsConfig(PretrainedConfig):
 
         if not isinstance(config_dict["transformer_layer_config"], dict):
             raise TypeError(
-                "'transformer_layer_config' must be a dictionary if provided")
+                "'transformer_layer_config' must be a dictionary if provided"
+            )
 
     @classmethod
     def build_vllm_speculative_config(
-            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
+        cls, config_dict: dict[str, Any]
+    ) -> dict[str, Any]:
         """
         Build vLLM-compatible speculative configuration from speculators format.
 
@@ -94,14 +97,14 @@ class SpeculatorsConfig(PretrainedConfig):
 
         if num_speculative_tokens is None:
             raise ValueError(
-                "Missing 'speculative_tokens' in proposal method. "
-                f"Got: {first_method}")
+                f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
+            )
 
         # Build base vLLM speculative configuration
         vllm_config = {
             "method": config_dict.get("speculators_model_type"),
             "num_speculative_tokens": num_speculative_tokens,
-            "target_model": spec_config.get("verifier")["name_or_path"]
+            "target_model": spec_config.get("verifier")["name_or_path"],
         }
 
         # Merge transformer layer configuration if present
diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py
index fe3c72de69d2809eaefd07af83893de9b86c9812..637b82d88e265d9c9682ec38145f58d7fb888024 100644
--- a/vllm/transformers_utils/configs/step3_vl.py
+++ b/vllm/transformers_utils/configs/step3_vl.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any
 
 from transformers.configuration_utils import PretrainedConfig
 
@@ -53,19 +53,70 @@ class Step3TextConfig(PretrainedConfig):
         moe_num_experts: int = 48,
         moe_top_k: int = 3,
         rope_theta: float = 500000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embedding: int = 65536,
         share_expert_dim: int = 5120,
         share_q_dim: int = 2048,
         head_dim: int = 256,
         norm_expert_weight: bool = False,
-        moe_layers_enum: tuple[int,
-                               ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-                                       15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
-                                       25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
-                                       35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
-                                       45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                                       55, 56, 57, 58, 59),
+        moe_layers_enum: tuple[int, ...] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
         **kwargs,
     ) -> None:
         self.hidden_size = hidden_size
@@ -96,8 +147,8 @@ class Step3VLConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
-        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        vision_config: dict | Step3VisionEncoderConfig | None = None,
+        text_config: dict | Step3TextConfig | None = None,
         understand_projector_stride: int = 1,
         projector_bias: bool = True,
         image_token_id: int = 128001,
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index aaf31d84d0c1a2c3c4bfa408c1094e007d71ded9..fc0360a9ecb4eadd35f8c137627174369dab97af 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
-from typing import Any, Optional
+from typing import Any
 
 import transformers
 
@@ -42,6 +42,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
             projector or at the end. Versions v0.4.1 and below
             use `False`, but v0.5 and above use `True`.
     """
+
     wrapped_model_config: transformers.PretrainedConfig
     model_type = "ultravox"
     audio_token = "<|audio|>"
@@ -49,10 +50,10 @@ class UltravoxConfig(transformers.PretrainedConfig):
 
     def __init__(
         self,
-        audio_config: Optional[dict[str, Any]] = None,
-        text_config: Optional[dict[str, Any]] = None,
-        audio_model_id: Optional[str] = None,
-        text_model_id: Optional[str] = None,
+        audio_config: dict[str, Any] | None = None,
+        text_config: dict[str, Any] | None = None,
+        audio_model_id: str | None = None,
+        text_model_id: str | None = None,
         ignore_index: int = -100,
         audio_token_index: int = 32000,
         hidden_size: int = 4096,
@@ -76,15 +77,17 @@ class UltravoxConfig(transformers.PretrainedConfig):
         if text_model_id is None:
             text_config = text_config or {}
             self.wrapped_model_config = transformers.CONFIG_MAPPING[
-                text_config.get("model_type", "llama")](**text_config)
+                text_config.get("model_type", "llama")
+            ](**text_config)
 
         # N.B. May set the audio_config below.
         self.audio_model_id = audio_model_id
         if audio_model_id is None:
             self.audio_model_id = None
             audio_config = audio_config or {}
-            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
-                "model_type", "whisper")](**audio_config)
+            self.audio_config = transformers.CONFIG_MAPPING[
+                audio_config.get("model_type", "whisper")
+            ](**audio_config)
 
         super().__init__(**kwargs)
 
@@ -99,8 +102,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         if key == "text_model_id" and value is not None:
             from vllm.transformers_utils.config import get_config
 
-            self.wrapped_model_config = get_config(value,
-                                                   trust_remote_code=False)
+            self.wrapped_model_config = get_config(value, trust_remote_code=False)
         elif key == "audio_model_id" and value is not None:
             from vllm.transformers_utils.config import get_config
 
diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py
index 101f31d39cc1f160ad60880cb2d91d218b0a0d28..560526bfd823e72645fbcf550ce27a14151f7b86 100644
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from .tokenizer import AnyTokenizer
 
 
-def _replace_none_with_empty(tokens: list[Optional[str]]):
+def _replace_none_with_empty(tokens: list[str | None]):
     for i, token in enumerate(tokens):
         if token is None:
             tokens[i] = ""
@@ -30,8 +29,9 @@ def _convert_tokens_to_string_with_added_encoders(
     current_sub_text: list[str] = []
     convert_tokens_to_string = tokenizer.convert_tokens_to_string
     added_vocab_set = set(tokenizer.get_added_vocab())
-    all_special_tokens = set(
-        tokenizer.all_special_tokens) if skip_special_tokens else ()
+    all_special_tokens = (
+        set(tokenizer.all_special_tokens) if skip_special_tokens else ()
+    )
 
     for token in output_tokens:
         # Use precomputed set for skip-special check
@@ -70,11 +70,11 @@ def convert_prompt_ids_to_tokens(
     # We do not need to convert the whole prompt to tokens.
     # Offset a little more in case we have special tokens.
     new_tokens = tokenizer.convert_ids_to_tokens(
-        prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
-        skip_special_tokens=skip_special_tokens)
+        prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2 :],
+        skip_special_tokens=skip_special_tokens,
+    )
     read_offset = len(new_tokens)
-    prefix_offset = max(
-        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
+    prefix_offset = max(read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
     # This is required to guard against out-of-vocab prompt token ids
     _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
     return new_tokens, prefix_offset, read_offset
@@ -92,7 +92,7 @@ def convert_ids_list_to_tokens(
 
     Returns:
       Python list of token string representations
-    
+
     """
     token_str_lst = []
     for token_id in token_ids:
@@ -110,7 +110,7 @@ def convert_ids_list_to_tokens(
 def detokenize_incrementally(
     tokenizer: AnyTokenizer,
     all_input_ids: list[int],
-    prev_tokens: Optional[list[str]],
+    prev_tokens: list[str] | None,
     prefix_offset: int,
     read_offset: int,
     skip_special_tokens: bool = False,
@@ -144,18 +144,17 @@ def detokenize_incrementally(
     # This is the first iteration for this sequence
     is_first_iter = prev_tokens is None
     if is_first_iter:
-        (prev_tokens, prefix_offset,
-         read_offset) = convert_prompt_ids_to_tokens(
-             tokenizer,
-             all_input_ids[:-1],
-             skip_special_tokens=skip_special_tokens)
+        (prev_tokens, prefix_offset, read_offset) = convert_prompt_ids_to_tokens(
+            tokenizer, all_input_ids[:-1], skip_special_tokens=skip_special_tokens
+        )
     assert prev_tokens is not None
 
     # If the new token id is out of bounds, return an empty string.
     if 0 <= new_token_id < len(tokenizer):
         # Put new_token_id in a list so skip_special_tokens is respected
         new_tokens = tokenizer.convert_ids_to_tokens(
-            [new_token_id], skip_special_tokens=skip_special_tokens)
+            [new_token_id], skip_special_tokens=skip_special_tokens
+        )
         if isinstance(new_tokens, str):
             new_tokens = [new_tokens]
     else:
@@ -171,9 +170,9 @@ def detokenize_incrementally(
     # surrounding ids.
     if tokenizer.is_fast or not tokenizer.get_added_vocab():
         prefix_text = tokenizer.convert_tokens_to_string(
-            output_tokens[prefix_offset:read_offset])
-        new_text = tokenizer.convert_tokens_to_string(
-            output_tokens[prefix_offset:])
+            output_tokens[prefix_offset:read_offset]
+        )
+        new_text = tokenizer.convert_tokens_to_string(output_tokens[prefix_offset:])
     else:
         prefix_text = _convert_tokens_to_string_with_added_encoders(
             tokenizer,
@@ -195,5 +194,5 @@ def detokenize_incrementally(
         # by the model
         return new_tokens, "", prefix_offset, read_offset
 
-    new_text = new_text[len(prefix_text):]
+    new_text = new_text[len(prefix_text) :]
     return new_tokens, new_text, read_offset, len(output_tokens)
diff --git a/vllm/transformers_utils/dynamic_module.py b/vllm/transformers_utils/dynamic_module.py
index 3c273ad41da00b5d50a6f924acfd3853aaec1c82..24ead83785f7192c3385b870788a8516d8d19a54 100644
--- a/vllm/transformers_utils/dynamic_module.py
+++ b/vllm/transformers_utils/dynamic_module.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
-from typing import Optional, Union
 
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
@@ -14,18 +13,18 @@ logger = init_logger(__name__)
 def try_get_class_from_dynamic_module(
     class_reference: str,
     pretrained_model_name_or_path: str,
-    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    cache_dir: str | os.PathLike | None = None,
     force_download: bool = False,
-    resume_download: Optional[bool] = None,
-    proxies: Optional[dict[str, str]] = None,
-    token: Optional[Union[bool, str]] = None,
-    revision: Optional[str] = None,
+    resume_download: bool | None = None,
+    proxies: dict[str, str] | None = None,
+    token: bool | str | None = None,
+    revision: str | None = None,
     local_files_only: bool = False,
-    repo_type: Optional[str] = None,
-    code_revision: Optional[str] = None,
+    repo_type: str | None = None,
+    code_revision: str | None = None,
     warn_on_fail: bool = True,
     **kwargs,
-) -> Optional[type]:
+) -> type | None:
     """
     As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
     but ignoring any errors.
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 51bcce6c10e26c0fe8fc33f06e834fc7f2669f56..b3469c1b18f2dfb8bb3853b37cf4b1b24f132e03 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,18 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
+import inspect
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
-
-from transformers import (AutoFeatureExtractor, AutoImageProcessor,
-                          AutoProcessor, AutoVideoProcessor)
+from typing import TYPE_CHECKING, Any, cast, get_args, get_type_hints
+
+from transformers import (
+    AutoFeatureExtractor,
+    AutoImageProcessor,
+    AutoProcessor,
+    AutoVideoProcessor,
+)
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
 from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar
 
-from vllm.utils import get_allowed_kwarg_only_overrides
+from vllm.transformers_utils.utils import convert_model_repo_to_path
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -41,7 +48,7 @@ class HashableList(list):
         return hash(tuple(self))
 
 
-def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
+def _get_processor_factory_fn(processor_cls: type | tuple[type, ...]):
     if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
         return AutoProcessor.from_pretrained
     if hasattr(processor_cls, "from_pretrained"):
@@ -50,9 +57,26 @@ def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
     return processor_cls
 
 
+@lru_cache
+def _collect_dynamic_keys_from_processing_kwargs(kwargs_cls: type) -> set[str]:
+    dynamic_kwargs: set[str] = set()
+    if kwargs_cls is None:
+        return dynamic_kwargs
+    # get kwargs annotations in processor
+    # merge text_kwargs / images_kwargs / videos_kwargs / audio_kwargs
+    kwargs_type_annotations = get_type_hints(kwargs_cls)
+    for kw_type in ("text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"):
+        if kw_type in kwargs_type_annotations:
+            kw_annotations = get_type_hints(kwargs_type_annotations[kw_type])
+            for kw_name in kw_annotations:
+                dynamic_kwargs.add(kw_name)
+    dynamic_kwargs |= {"text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"}
+    return dynamic_kwargs
+
+
 def _merge_mm_kwargs(
     model_config: "ModelConfig",
-    processor_cls: Union[type, tuple[type, ...]],
+    processor_cls: type | tuple[type, ...],
     /,
     **kwargs,
 ):
@@ -66,7 +90,6 @@ def _merge_mm_kwargs(
         requires_kw_only=False,
         allow_var_kwargs=True,
     )
-
     # NOTE: Pythonic dict is not hashable and will raise unhashable type
     # error when calling `cached_get_processor`, therefore we need to
     # wrap it to a hashable dict.
@@ -82,16 +105,16 @@ def _merge_mm_kwargs(
 def get_processor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
-    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
     """Load a processor for the given model name via HuggingFace."""
     if revision is None:
         revision = "main"
-
     try:
+        processor_name = convert_model_repo_to_path(processor_name)
         if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
             processor = AutoProcessor.from_pretrained(
                 processor_name,
@@ -121,15 +144,18 @@ def get_processor(
                 "a custom processor not yet available in the HuggingFace "
                 "transformers library, consider setting "
                 "`trust_remote_code=True` in LLM or using the "
-                "`--trust-remote-code` flag in the CLI.")
+                "`--trust-remote-code` flag in the CLI."
+            )
             raise RuntimeError(err_msg) from e
         else:
             raise e
 
     if not isinstance(processor, processor_cls):
-        raise TypeError("Invalid type of HuggingFace processor. "
-                        f"Expected type: {processor_cls}, but "
-                        f"found type: {type(processor)}")
+        raise TypeError(
+            "Invalid type of HuggingFace processor. "
+            f"Expected type: {processor_cls}, but "
+            f"found type: {type(processor)}"
+        )
 
     return processor
 
@@ -137,12 +163,80 @@ def get_processor(
 cached_get_processor = lru_cache(get_processor)
 
 
+@lru_cache
+def get_processor_kwargs_from_processor(processor: _P) -> set[str]:
+    try:
+        # get kwargs annotations in processor
+        call_kwargs = inspect.signature(type(processor).__call__).parameters.get(
+            "kwargs"
+        )
+        call_kwargs_annotations = call_kwargs.annotation if call_kwargs else None
+        # if the processor has explicit kwargs annotation, use it
+        if call_kwargs_annotations not in (None, inspect._empty):
+            # get_type_hints will parse all type annotations at runtime,
+            # and if an annotation refers to a type or
+            # name that hasn’t been imported or defined, it will raise an error.
+            # So we use __annotations__ to get the raw annotations directly.
+            return _collect_dynamic_keys_from_processing_kwargs(
+                get_args(call_kwargs_annotations)[0]
+            )
+        # otherwise, try to get from ProcessingKwargs
+        else:
+            module_name = type(processor).__module__
+            mod = importlib.import_module(module_name)
+            # find *ProcessingKwargs in the module
+            processor_kwargs: set[str] = set()
+            for name, obj in vars(mod).items():
+                if name.endswith("ProcessingKwargs"):
+                    processor_kwargs = (
+                        processor_kwargs
+                        | _collect_dynamic_keys_from_processing_kwargs(obj)
+                    )
+            return processor_kwargs
+    except Exception:
+        return set()
+
+
+def cached_get_processor_without_dynamic_kwargs(
+    processor_name: str,
+    *args: Any,
+    revision: str | None = None,
+    trust_remote_code: bool = False,
+    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
+    **kwargs: Any,
+) -> _P:
+    # Step 1: use default kwargs to get a temporary processor instance
+    processor = cached_get_processor(
+        processor_name,
+        revision=revision,
+        trust_remote_code=trust_remote_code,
+        processor_cls=processor_cls,  # type: ignore[arg-type]
+    )
+
+    # Step 2: use temporary processor collect dynamic keys
+    dynamic_keys = get_processor_kwargs_from_processor(processor)
+
+    # Step 3: use dynamic_keys filter kwargs
+    filtered_kwargs = {k: v for k, v in kwargs.items() if k not in dynamic_keys}
+
+    # Step 4: use filtered kwargs to get final processor instance
+    final_processor = cached_get_processor(
+        processor_name,
+        revision=revision,
+        trust_remote_code=trust_remote_code,
+        processor_cls=processor_cls,  # type: ignore[arg-type]
+        **filtered_kwargs,
+    )
+
+    return final_processor
+
+
 def cached_processor_from_config(
     model_config: "ModelConfig",
-    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
-    return cached_get_processor(
+    return cached_get_processor_without_dynamic_kwargs(
         model_config.model,
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
@@ -154,19 +248,21 @@ def cached_processor_from_config(
 def get_feature_extractor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
     **kwargs: Any,
 ):
-    """Load an audio feature extractor for the given model name 
+    """Load an audio feature extractor for the given model name
     via HuggingFace."""
     try:
+        processor_name = convert_model_repo_to_path(processor_name)
         feature_extractor = AutoFeatureExtractor.from_pretrained(
             processor_name,
             *args,
             revision=revision,
             trust_remote_code=trust_remote_code,
-            **kwargs)
+            **kwargs,
+        )
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
@@ -177,7 +273,8 @@ def get_feature_extractor(
                 "extractor is a custom extractor not yet available in the "
                 "HuggingFace transformers library, consider setting "
                 "`trust_remote_code=True` in LLM or using the "
-                "`--trust-remote-code` flag in the CLI.")
+                "`--trust-remote-code` flag in the CLI."
+            )
             raise RuntimeError(err_msg) from e
         else:
             raise e
@@ -202,18 +299,20 @@ def cached_feature_extractor_from_config(
 def get_image_processor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
     **kwargs: Any,
 ):
     """Load an image processor for the given model name via HuggingFace."""
     try:
+        processor_name = convert_model_repo_to_path(processor_name)
         processor = AutoImageProcessor.from_pretrained(
             processor_name,
             *args,
             revision=revision,
             trust_remote_code=trust_remote_code,
-            **kwargs)
+            **kwargs,
+        )
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
@@ -224,7 +323,8 @@ def get_image_processor(
                 "a custom processor not yet available in the HuggingFace "
                 "transformers library, consider setting "
                 "`trust_remote_code=True` in LLM or using the "
-                "`--trust-remote-code` flag in the CLI.")
+                "`--trust-remote-code` flag in the CLI."
+            )
             raise RuntimeError(err_msg) from e
         else:
             raise e
@@ -250,20 +350,22 @@ def cached_image_processor_from_config(
 def get_video_processor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
-    processor_cls_overrides: Optional[type[_V]] = None,
+    processor_cls_overrides: type[_V] | None = None,
     **kwargs: Any,
 ):
     """Load a video processor for the given model name via HuggingFace."""
     try:
+        processor_name = convert_model_repo_to_path(processor_name)
         processor_cls = processor_cls_overrides or AutoVideoProcessor
         processor = processor_cls.from_pretrained(
             processor_name,
             *args,
             revision=revision,
             trust_remote_code=trust_remote_code,
-            **kwargs)
+            **kwargs,
+        )
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
@@ -274,7 +376,8 @@ def get_video_processor(
                 "a custom processor not yet available in the HuggingFace "
                 "transformers library, consider setting "
                 "`trust_remote_code=True` in LLM or using the "
-                "`--trust-remote-code` flag in the CLI.")
+                "`--trust-remote-code` flag in the CLI."
+            )
             raise RuntimeError(err_msg) from e
         else:
             raise e
@@ -287,7 +390,7 @@ cached_get_video_processor = lru_cache(get_video_processor)
 
 def cached_video_processor_from_config(
     model_config: "ModelConfig",
-    processor_cls: Optional[type[_V]] = None,
+    processor_cls: type[_V] | None = None,
     **kwargs: Any,
 ):
     return cached_get_video_processor(
diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py
index 8a1ad226d99f0ba33a0d8b90140fc1c43833928d..76b6d3dc9c99abba4d83d4c390a6a1c4e7832ec5 100644
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -8,8 +8,7 @@ reasons:
 - There is a need to override the existing processor to support vLLM.
 """
 
-from vllm.transformers_utils.processors.deepseek_vl2 import (
-    DeepseekVLV2Processor)
+from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
 from vllm.transformers_utils.processors.ovis import OvisProcessor
 from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
 
diff --git a/vllm/transformers_utils/processors/deepseek_ocr.py b/vllm/transformers_utils/processors/deepseek_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb7aa0c174867614c9b5185cd30785f29d47d1d3
--- /dev/null
+++ b/vllm/transformers_utils/processors/deepseek_ocr.py
@@ -0,0 +1,438 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# adapted from https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/image_process.py
+import math
+
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageOps
+from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+# TODO(Isotr0py): change modes for variants
+# see: https://github.com/deepseek-ai/DeepSeek-OCR/blob/8cf003d38821fa1b19c73da3bd1b0dc262ea8136/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py#L1-L6
+# Tiny: base_size = 512, image_size = 512, crop_mode = False
+# Small: base_size = 640, image_size = 640, crop_mode = False
+# Base: base_size = 1024, image_size = 1024, crop_mode = False
+# Large: base_size = 1280, image_size = 1280, crop_mode = False
+# Gundam: base_size = 1024, image_size = 640, crop_mode = True
+BASE_SIZE = 1024
+IMAGE_SIZE = 640
+CROP_MODE = True
+
+# TODO(Isotr0py): Expose as mm_kwargs
+MIN_CROPS = 2
+MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def calculate_aspect_ratios(
+    min_num: int = MIN_CROPS, max_num: int = MAX_CROPS
+) -> list[tuple[int, int]]:
+    target_ratios: set[tuple[int, int]] = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    sorted_target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+    return sorted_target_ratios
+
+
+def count_tiles(
+    orig_width,
+    orig_height,
+    min_num=MIN_CROPS,
+    max_num=MAX_CROPS,
+    image_size=640,
+    use_thumbnail=False,
+):
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = calculate_aspect_ratios(min_num, max_num)
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    return target_aspect_ratio
+
+
+def dynamic_preprocess(
+    image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = calculate_aspect_ratios(min_num, max_num)
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images, target_aspect_ratio
+
+
+class ImageTransform:
+    def __init__(
+        self,
+        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekOCRProcessor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        patch_size: int = 16,
+        downsample_ratio: int = 4,
+        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+        self.image_size = IMAGE_SIZE
+        self.base_size = BASE_SIZE
+        self.patch_size = 16
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = 4
+
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
+
+        self.tokenizer = tokenizer
+        self.tokenizer.padding_side = "left"  # must set this，padding side with make a difference in batch inference # noqa: E501
+
+        # add the pad_token as special token to use 'tokenizer.pad_token'
+        # and 'tokenizer.pad_token_id'
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
+
+        # add image token
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: list[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str,
+        images: list[Image.Image],
+        crop_mode: bool = CROP_MODE,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            crop_mode (bool): if True, then crop the image;
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert prompt is not None and images is not None, (
+            "prompt and images must be used at the same time."
+        )
+
+        sft_format = prompt
+
+        (
+            input_ids,
+            pixel_values,
+            images_crop,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+            _,
+        ) = self.tokenize_with_images(
+            conversation=sft_format,
+            images=images,
+            bos=True,
+            eos=True,
+            cropping=crop_mode,
+        )
+
+        prepare = BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_crop=images_crop,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str,
+        images: list[Image.Image],
+        crop_mode: bool = CROP_MODE,
+        **kwargs,
+    ):
+        prepare = self.process_one(
+            prompt=prompt,
+            images=images,
+            crop_mode=crop_mode,
+        )
+
+        return prepare
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: list[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
+            [],
+            [],
+            [],
+            [],
+        )
+        image_shapes = []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            image_shapes.append(image.size)
+
+            images_crop_raw = []
+            if image.size[0] <= 640 and image.size[1] <= 640:
+                crop_ratio = [1, 1]
+            elif cropping:
+                images_crop_raw, crop_ratio = dynamic_preprocess(
+                    image, image_size=IMAGE_SIZE
+                )
+            else:
+                crop_ratio = [1, 1]
+
+            if self.image_size <= 640 and not cropping:
+                image = image.resize((self.image_size, self.image_size))
+
+            global_view = ImageOps.pad(
+                image,
+                (self.base_size, self.base_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            images_list.append(self.image_transform(global_view))
+
+            num_width_tiles, num_height_tiles = crop_ratio
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            if num_width_tiles > 1 or num_height_tiles > 1:
+                for cropped_image in images_crop_raw:
+                    images_crop_list.append(self.image_transform(cropped_image))
+
+            num_queries = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
+            num_queries_base = math.ceil(
+                (self.base_size // self.patch_size) / self.downsample_ratio
+            )
+
+            tokenized_image = (
+                [self.image_token_id] * num_queries_base + [self.image_token_id]
+            ) * num_queries_base
+            tokenized_image += [self.image_token_id]
+            if num_width_tiles > 1 or num_height_tiles > 1:
+                local_row = [self.image_token_id] * (num_queries * num_width_tiles + 1)
+                tokenized_image += local_row * (num_queries * num_height_tiles)
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            num_image_tokens.append(len(tokenized_image))
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} "
+            f"is not equal to images_seq_mask's length {len(images_seq_mask)}."
+        )
+
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, "
+            f"input_ids' length {len(masked_tokenized_str)}, "
+            f"images_seq_mask's length {len(images_seq_mask)}, are not equal."
+        )
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
+        input_ids[input_ids < 0] = self.pad_id
+
+        # Remove the ending eos token
+        assert input_ids[-1] == self.eos_id
+        input_ids = input_ids[:-1]
+        target_ids = target_ids[:-1]
+        images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((0, 3, self.base_size, self.base_size))
+            images_spatial_crop = torch.zeros((0, 2), dtype=torch.long)
+            images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+            if images_crop_list:
+                images_crop = torch.stack(images_crop_list, dim=0)
+            else:
+                images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
+
+        input_ids = input_ids.unsqueeze(0)
+
+        return (
+            input_ids,
+            pixel_values,
+            images_crop,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+            image_shapes,
+        )
+
+
+AutoProcessor.register("DeepseekOCRProcessor", DeepseekOCRProcessor)
diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py
index d1d117b4e2cf46b5b5d669634e4417c61e3817dd..5ef258b9be29822bb17ff4958af334aef5ebeb11 100644
--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
@@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin
 
 
 class ImageTransform:
-
-    def __init__(self,
-                 mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 std: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 normalize: bool = True):
+    def __init__(
+        self,
+        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
         self.mean = mean
         self.std = std
         self.normalize = normalize
@@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
         ignore_id: int = -100,
         **kwargs,
     ):
-
         self.candidate_resolutions = candidate_resolutions
         self.image_size = candidate_resolutions[0][0]
         self.patch_size = patch_size
@@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
         self.normalize = normalize
         self.downsample_ratio = downsample_ratio
 
-        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
         self.tokenizer = tokenizer
-        self.tokenizer.padding_side = 'left'  # must set this，padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"  # must set this，padding side with make a difference in batch inference
 
         # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
         if tokenizer.pad_token is None:
-            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
 
         # add image token
         image_token_id = self.tokenizer.vocab.get(image_token)
@@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
 
         # add five special tokens for grounding-related tasks
         # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
-        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
         special_tokens_dict = {"additional_special_tokens": special_tokens}
         self.tokenizer.add_special_tokens(special_tokens_dict)
 
@@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):
 
         for width, height in self.candidate_resolutions:
             scale = min(width / original_width, height / original_height)
-            downscaled_width, downscaled_height = int(
-                original_width * scale), int(original_height * scale)
-            effective_resolution = min(downscaled_width * downscaled_height,
-                                       original_width * original_height)
+            downscaled_width, downscaled_height = (
+                int(original_width * scale),
+                int(original_height * scale),
+            )
+            effective_resolution = min(
+                downscaled_width * downscaled_height, original_width * original_height
+            )
             wasted_resolution = (width * height) - effective_resolution
 
             if effective_resolution > max_effective_resolution or (
-                    effective_resolution == max_effective_resolution
-                    and wasted_resolution < min_wasted_resolution):
+                effective_resolution == max_effective_resolution
+                and wasted_resolution < min_wasted_resolution
+            ):
                 max_effective_resolution = effective_resolution
                 min_wasted_resolution = wasted_resolution
                 best_fit = (width, height)
@@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
                 - num_image_tokens (list[int]): the number of image tokens
         """
 
-        assert (prompt is not None and images is not None
-                ), "prompt and images must be used at the same time."
+        assert prompt is not None and images is not None, (
+            "prompt and images must be used at the same time."
+        )
 
         sft_format = prompt
-        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
-            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        ) = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
+        )
         masked_tokenized_str = []
         for token_index in tokenized_str:
             if token_index != self.image_token_id:
@@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
             else:
                 masked_tokenized_str.append(self.ignore_id)
 
-        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
-            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
-             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )
 
         input_ids = torch.LongTensor(tokenized_str)
         target_ids = torch.LongTensor(masked_tokenized_str)
         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
 
         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
-        target_ids[(input_ids < 0) |
-                   (input_ids == self.image_token_id)] = self.ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
         input_ids[input_ids < 0] = self.pad_id
 
         if inference_mode:
@@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
                 best_width, best_height = self.image_size, self.image_size
 
             """process the global view"""
-            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
-                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             images_list.append(self.image_transform(global_view))
 
             """process the local views"""
-            local_view = ImageOps.pad(image, (best_width, best_height),
-                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             for i in range(0, best_height, self.image_size):
                 for j in range(0, best_width, self.image_size):
                     images_list.append(
-                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )
 
             """record height / width crop num"""
-            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
             images_spatial_crop.append([num_width_tiles, num_height_tiles])
 
             """add image tokens"""
-            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
             # global views tokens h * (w + 1), 1 is for line separator
             tokenized_image = [self.image_token_id] * h * (w + 1)
             # add a separator between global and local views
             tokenized_image += [self.image_token_id]
             # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
-            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )
 
             tokenized_str += tokenized_image
             images_seq_mask += [True] * len(tokenized_image)
@@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
             tokenized_str = tokenized_str + [self.eos_id]
             images_seq_mask = images_seq_mask + [False]
 
-        assert len(tokenized_str) == len(
-            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        )
 
-        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+        return (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        )
 
 
 AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index 0077a7a8ce6562b2c9dc4de951df9aa16ee34170..252f833993652d52b4e2855b9b1402fbb866295f 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
@@ -24,35 +23,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from functools import cached_property
-from typing import Union
 
 import PIL
 import torch
 from transformers import AutoProcessor, BatchFeature
 from transformers.image_utils import ImageInput
-from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
-                                           Unpack)
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
 from vllm.multimodal.image import convert_image_mode
 
-__all__ = ['OvisProcessor']
+__all__ = ["OvisProcessor"]
 IGNORE_ID = -100
 
-class OvisProcessorKwargs(ProcessingKwargs, total=False):   # type: ignore[call-arg]
+
+class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
     _defaults = {
         "text_kwargs": {
             "padding": False,
         },
         "images_kwargs": {
-            'max_partition':9,
-            'covering_threshold':0.9,
-            'convert_to_rgb':True,
-        'return_tensors':'pt'},
+            "max_partition": 9,
+            "covering_threshold": 0.9,
+            "convert_to_rgb": True,
+            "return_tensors": "pt",
+        },
     }
 
 
-
 class OvisProcessor(ProcessorMixin):
     r"""
     Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
@@ -98,14 +96,17 @@ class OvisProcessor(ProcessorMixin):
             "image_col_sep": -303,
             "image_row_sep": -304,
             "image_end": -305,
-            'image_pad': image_pad_token_id,
+            "image_pad": image_pad_token_id,
         }
         return extra_special_tokens
 
     def __call__(
         self,
         images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
         **kwargs: Unpack[OvisProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -170,7 +171,6 @@ class OvisProcessor(ProcessorMixin):
 
         # Process text input
         if text is not None:
-
             if not isinstance(text, list):
                 text = [text]
 
@@ -179,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
             replaced_ids_list = []
             idx = 0
             for ids_tensor in tokenized_batched_text:
-                if image_token_id in ids_tensor and "image_placeholders" in image_features:
+                if (
+                    image_token_id in ids_tensor
+                    and "image_placeholders" in image_features
+                ):
                     if idx < len(image_features["image_placeholders"]):
                         # Converts in list for ease of use
                         ids_list = ids_tensor.tolist()
@@ -189,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
                         # replace placeholders
                         for i, token_id in enumerate(ids_list):
                             if token_id == image_token_id:
-                                placeholder_ids = image_features["image_placeholders"][idx]
+                                placeholder_ids = image_features["image_placeholders"][
+                                    idx
+                                ]
                                 new_ids.extend(placeholder_ids)
                                 idx += 1
                             else:
@@ -199,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
                         ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                     else:
                         raise RuntimeError(
-                            'Mismatch between the images you provided and the number of placeholder present in the text')
+                            "Mismatch between the images you provided and the number of placeholder present in the text"
+                        )
 
                 replaced_ids_list.append(ids_tensor)
 
@@ -218,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
             # Add image features if present
             if image_features:
                 output["pixel_values"] = processed_images
-                output['grids'] = grids
+                output["grids"] = grids
 
             return output
 
@@ -228,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
     def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
         batch_token_ids = []
         for text in text_list:
-            text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
-                           text.split(self.image_token)]
+            text_chunks = [
+                self.tokenizer(chunk, add_special_tokens=False).input_ids
+                for chunk in text.split(self.image_token)
+            ]
             token_ids = []
             num_chuck = len(text_chunks)
             for i, chunk in enumerate(text_chunks):
@@ -241,50 +249,60 @@ class OvisProcessor(ProcessorMixin):
 
     def get_image_size(self):
         size = self.image_processor.size
-        if 'shortest_edge' in size:
-            width = height = size['shortest_edge']
+        if "shortest_edge" in size:
+            width = height = size["shortest_edge"]
         elif "height" in size and "width" in size:
-            width = size['width']
-            height = size['height']
+            width = size["width"]
+            height = size["height"]
         else:
-            raise ValueError( "Can't parse image size from image_processor config.")
+            raise ValueError("Can't parse image size from image_processor config.")
         return height, width
 
     def get_token_value(self, tok):
         return self.extra_special_tokens[tok]
 
     def construct_image_indicators(self, grid):
-        image_placeholders = [self.get_token_value('image_start'),
-                              self.get_token_value('image_atom'),
-                              self.get_token_value('image_prefix')]
+        image_placeholders = [
+            self.get_token_value("image_start"),
+            self.get_token_value("image_atom"),
+            self.get_token_value("image_prefix"),
+        ]
         if grid[0] * grid[1] > 1:
             for r in range(grid[0]):
                 for c in range(grid[1]):
-                    image_placeholders.append(self.get_token_value('image_atom') )
+                    image_placeholders.append(self.get_token_value("image_atom"))
                     if c < grid[1] - 1:
-                        image_placeholders.append(self.get_token_value('image_col_sep'))
+                        image_placeholders.append(self.get_token_value("image_col_sep"))
                 if r < grid[0] - 1:
-                    image_placeholders.append(self.get_token_value('image_row_sep'))
-        image_placeholders.append(self.get_token_value('image_end'))
+                    image_placeholders.append(self.get_token_value("image_row_sep"))
+        image_placeholders.append(self.get_token_value("image_end"))
         return image_placeholders
 
     def construct_image_placeholders(self, grid):
-
         image_placeholders = self.construct_image_indicators(grid)
 
-        image_atom_token_id = self.get_token_value('image_atom')
+        image_atom_token_id = self.get_token_value("image_atom")
         # Extract the padding token ID from tokenizer
-        image_padding_token_id = self.get_token_value('image_pad')
+        image_padding_token_id = self.get_token_value("image_pad")
 
         # Create a new list with padding tokens inserted
         padded_placeholder_tokens = []
         for token in image_placeholders:
             padded_placeholder_tokens.append(image_padding_token_id)
             if token == image_atom_token_id:
-                padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
+                padded_placeholder_tokens.extend(
+                    [image_padding_token_id] * self.image_segment_len
+                )
         return padded_placeholder_tokens
 
-    def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
+    def preprocess_image(
+        self,
+        image: PIL.Image.Image,
+        max_partition,
+        covering_threshold,
+        convert_to_rgb,
+        return_tensors,
+    ):
         def _preprocess(img: PIL.Image.Image, side):
             # first resize and preprocess
             w, h = img.size
@@ -297,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
                 new_height = side
                 new_width = int(w / h * new_height)
             new_size = dict(height=new_height, width=new_width)
-            pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
+            pixel_values = self.image_processor.preprocess(
+                img, size=new_size, return_tensors=return_tensors
+            )["pixel_values"]
 
             # then pad to square
-            square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
+            square_values = torch.zeros(
+                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
+            )
             new_height, new_width = pixel_values.shape[2:]
             if new_height == new_width:
                 square_values[:, :, :, :] = pixel_values
             elif new_height > new_width:
                 from_index = (side - new_width) // 2
-                square_values[:, :, :, from_index:from_index + new_width] = pixel_values
+                square_values[:, :, :, from_index : from_index + new_width] = (
+                    pixel_values
+                )
             else:
                 from_index = (side - new_height) // 2
-                square_values[:, :, from_index:from_index + new_height, :] = pixel_values
+                square_values[:, :, from_index : from_index + new_height, :] = (
+                    pixel_values
+                )
 
             return square_values
 
@@ -351,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
             good_grids = []
             for grid in candidate_grids:
                 partition = _partition(img, grid)
-                covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
+                covering_ratio = (
+                    sum([_covering_area(*p, side) for p in partition]) / img_area
+                )
                 assert covering_ratio <= 1.0
                 all_grids.append((grid, covering_ratio))
                 if covering_ratio > covering_threshold:
@@ -359,18 +387,19 @@ class OvisProcessor(ProcessorMixin):
 
             if len(good_grids) > 0:
                 # pick the good partition with minimum #sub_images and break the tie using covering_ratio
-                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
+                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
+                    0
+                ]
             else:
                 # pick the partition with maximum covering_ratio and break the tie using #sub_images
                 return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
 
         if convert_to_rgb:
-            image = convert_image_mode(image, 'RGB')
-
+            image = convert_image_mode(image, "RGB")
 
         sides = self.get_image_size()
         if sides[0] != sides[1]:
-            raise ValueError('get_image_size() returns non-square size')
+            raise ValueError("get_image_size() returns non-square size")
         side = sides[0]
         grid = _get_best_grid(image, side)
         partition = _partition(image, grid)
@@ -379,7 +408,7 @@ class OvisProcessor(ProcessorMixin):
             crops.insert(0, image)
         pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
         image_placeholders = self.construct_image_placeholders(grid)
-        return pixel_values, image_placeholders, grid
+        return torch.tensor(pixel_values), image_placeholders, torch.tensor(grid)
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -406,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
             `list[str]`: The decoded text.
         """
         return self.tokenizer.batch_decode(
-            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            generated_outputs,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
         )
 
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
-        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        names_from_processor = list(
+            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
+        )
         return names_from_processor + ["second_per_grid_ts"]
 
 
diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py
index 282e9cb2116e0a67306032b03dca1e6c11d3a37b..4c084fdccabc96cec7dbab05b204deab87e62b27 100644
--- a/vllm/transformers_utils/processors/ovis2_5.py
+++ b/vllm/transformers_utils/processors/ovis2_5.py
@@ -2,40 +2,37 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from functools import cached_property
-from typing import Optional, Union
 
 import numpy as np
 import PIL
 import torch
 from transformers import AutoProcessor, BatchFeature
 from transformers.image_utils import ImageInput
-from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
-                                           Unpack)
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
-__all__ = ['Ovis2_5Processor']
+__all__ = ["Ovis2_5Processor"]
 IMAGE_TOKEN = "<image>"
 VIDEO_TOKEN = "<video>"
 MIN_PIXELS = 448 * 448
 MAX_PIXELS = 1792 * 1792
 
 
-class Ovis2_5ProcessorKwargs(ProcessingKwargs,
-                             total=False):  # type: ignore[call-arg]
+class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
     _defaults = {
         "text_kwargs": {
             "padding": False,
         },
         "images_kwargs": {
-            'convert_to_rgb': True,
-            'min_pixels': MIN_PIXELS,
-            'max_pixels': MAX_PIXELS,
+            "convert_to_rgb": True,
+            "min_pixels": MIN_PIXELS,
+            "max_pixels": MAX_PIXELS,
         },
         "videos_kwargs": {
-            'convert_to_rgb': True,
-            'min_pixels': MIN_PIXELS,
-            'max_pixels': MAX_PIXELS,
-        }
+            "convert_to_rgb": True,
+            "min_pixels": MIN_PIXELS,
+            "max_pixels": MAX_PIXELS,
+        },
     }
 
 
@@ -43,8 +40,8 @@ class Ovis2_5Processor(ProcessorMixin):
     r"""
     Constructs an Ovis processor which wraps an Ovis image processor
     and a Qwen2 tokenizer into a single processor.
-    [`OvisProcessor`] offers all the functionalities of 
-    [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. 
+    [`OvisProcessor`] offers all the functionalities of
+    [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
     See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
     for more information.
     Args:
@@ -81,9 +78,7 @@ class Ovis2_5Processor(ProcessorMixin):
         self.patch_size = patch_size
         self.hidden_stride = hidden_stride
         self.temporal_patch_size = temporal_patch_size
-        super().__init__(image_processor,
-                         tokenizer,
-                         chat_template=chat_template)
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     @cached_property
     def extra_special_tokens(self):
@@ -96,16 +91,18 @@ class Ovis2_5Processor(ProcessorMixin):
             "image_end": -302,
             "video_start": -303,
             "video_end": -304,
-            'image_pad': image_pad_token_id,
+            "image_pad": image_pad_token_id,
         }
         return extra_special_tokens
 
     def __call__(
         self,
         images: ImageInput = None,
-        videos: Union[np.ndarray, list[ImageInput]] = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput],
-                    list[PreTokenizedInput]] = None,
+        videos: np.ndarray | list[ImageInput] = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
         **kwargs: Unpack[Ovis2_5ProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -148,9 +145,9 @@ class Ovis2_5Processor(ProcessorMixin):
                 [`BatchFeature`]: A [`BatchFeature`] with the following fields:
                 - **input_ids** -- list of token ids to be fed to a model.
                   Returned when `text` is not `None`.
-                - **attention_mask** -- list of indices specifying which tokens 
+                - **attention_mask** -- list of indices specifying which tokens
                   should be attended to by the model (when
-                  `return_attention_mask=True` or if *"attention_mask"* 
+                  `return_attention_mask=True` or if *"attention_mask"*
                   is in `self.model_input_names` and if `text` is not `None`).
                 - **pixel_values** -- Pixel values to be fed to a model.
                   Returned when `images` is not `None`.
@@ -177,9 +174,9 @@ class Ovis2_5Processor(ProcessorMixin):
             grids = []
             # Process each image
             for image in images if isinstance(images, list) else [images]:
-                pixel_values, image_placeholders, grid = (
-                    self.preprocess_multidata(
-                        images=image, **output_kwargs["images_kwargs"]))
+                pixel_values, image_placeholders, grid = self.preprocess_multidata(
+                    images=image, **output_kwargs["images_kwargs"]
+                )
                 processed_images.append(pixel_values)
                 image_placeholders_list.append(image_placeholders)
                 grids.append(grid)
@@ -196,16 +193,15 @@ class Ovis2_5Processor(ProcessorMixin):
             grids = []
             # Process each video
             for video in videos if isinstance(videos, list) else [videos]:
-                pixel_values, video_placeholders, grid = (
-                    self.preprocess_multidata(
-                        video=video, **output_kwargs["videos_kwargs"]))
+                pixel_values, video_placeholders, grid = self.preprocess_multidata(
+                    video=video, **output_kwargs["videos_kwargs"]
+                )
                 processed_videos.append(pixel_values)
                 videos_placeholders_list.append(video_placeholders)
                 grids.append(grid)
             # assign all processed videos
             if processed_videos:
-                visual_features[
-                    "video_placeholders"] = videos_placeholders_list
+                visual_features["video_placeholders"] = videos_placeholders_list
             output["video_pixel_values"] = processed_videos
             output["video_grids"] = grids
 
@@ -220,14 +216,16 @@ class Ovis2_5Processor(ProcessorMixin):
             image_idx = 0
             video_idx = 0
             for ids_tensor in tokenized_batched_text:
-                has_image_tokens = (image_token_id in ids_tensor
-                                    and "image_placeholders" in visual_features
-                                    and image_idx < len(
-                                        visual_features["image_placeholders"]))
-                has_video_tokens = (video_token_id in ids_tensor
-                                    and "video_placeholders" in visual_features
-                                    and video_idx < len(
-                                        visual_features["video_placeholders"]))
+                has_image_tokens = (
+                    image_token_id in ids_tensor
+                    and "image_placeholders" in visual_features
+                    and image_idx < len(visual_features["image_placeholders"])
+                )
+                has_video_tokens = (
+                    video_token_id in ids_tensor
+                    and "video_placeholders" in visual_features
+                    and video_idx < len(visual_features["video_placeholders"])
+                )
                 if has_image_tokens or has_video_tokens:
                     # Convert to list for easier manipulation
                     ids_list = ids_tensor.tolist()
@@ -237,13 +235,13 @@ class Ovis2_5Processor(ProcessorMixin):
                     for token_id in ids_list:
                         if token_id == image_token_id:
                             new_ids.extend(
-                                visual_features["image_placeholders"]
-                                [image_idx])
+                                visual_features["image_placeholders"][image_idx]
+                            )
                             image_idx += 1
                         elif token_id == video_token_id:
                             new_ids.extend(
-                                visual_features["video_placeholders"]
-                                [video_idx])
+                                visual_features["video_placeholders"][video_idx]
+                            )
                             video_idx += 1
                         else:
                             new_ids.append(token_id)
@@ -260,8 +258,7 @@ class Ovis2_5Processor(ProcessorMixin):
         # If only images were provided
         return BatchFeature(data=visual_features)
 
-    def _tokenize_with_visual_symbol(self,
-                                     text_list: list[str]) -> torch.LongTensor:
+    def _tokenize_with_visual_symbol(self, text_list: list[str]) -> torch.LongTensor:
         batch_token_ids = []
         for text in text_list:
             token_ids = []
@@ -288,21 +285,24 @@ class Ovis2_5Processor(ProcessorMixin):
         return torch.tensor(batch_token_ids, dtype=torch.long)
 
     # Copied from qwen2_vl
-    def smart_resize(self,
-                     height: int,
-                     width: int,
-                     factor: int = 28,
-                     min_pixels: int = MIN_PIXELS,
-                     max_pixels: int = MAX_PIXELS):
+    def smart_resize(
+        self,
+        height: int,
+        width: int,
+        factor: int = 28,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+    ):
         """Rescales the image so that the following conditions are met:
         1. Both dimensions (height and width) are divisible by 'factor'.
-        2. The total number of pixels is within the range 
+        2. The total number of pixels is within the range
             ['min_pixels', 'max_pixels'].
         3. The aspect ratio of the image is maintained as closely as possible.
         """
         if height < factor or width < factor:
-            print(f"height:{height} or width:{width} must be "
-                  f"larger than factor:{factor}")
+            print(
+                f"height:{height} or width:{width} must be larger than factor:{factor}"
+            )
             if height < width:
                 width = round(factor / height * width)
                 height = factor
@@ -311,8 +311,10 @@ class Ovis2_5Processor(ProcessorMixin):
                 width = factor
 
         elif max(height, width) / min(height, width) > 200:
-            print(f"absolute aspect ratio must be smaller than 200, "
-                  f"got {max(height, width) / min(height, width)}")
+            print(
+                f"absolute aspect ratio must be smaller than 200, "
+                f"got {max(height, width) / min(height, width)}"
+            )
             if height > width:
                 height = 200 * width
             else:
@@ -335,29 +337,27 @@ class Ovis2_5Processor(ProcessorMixin):
 
     def construct_visual_indicators(self, grid, is_video: bool = False):
         if is_video:
-            start_token = self.get_token_value('video_start')
-            end_token = self.get_token_value('video_end')
+            start_token = self.get_token_value("video_start")
+            end_token = self.get_token_value("video_end")
         else:
-            start_token = self.get_token_value('image_start')
-            end_token = self.get_token_value('image_end')
+            start_token = self.get_token_value("image_start")
+            end_token = self.get_token_value("image_end")
 
-        image_placeholders = [start_token, self.get_token_value('visual_atom')]
+        image_placeholders = [start_token, self.get_token_value("visual_atom")]
         if grid[0] * grid[1] > 1:
             for r in range(grid[0]):
                 for c in range(grid[1]):
-                    image_placeholders.append(
-                        self.get_token_value('visual_atom'))
+                    image_placeholders.append(self.get_token_value("visual_atom"))
 
         image_placeholders.append(end_token)
         return image_placeholders
 
     def construct_visual_placeholders(self, grid, is_video: bool = False):
-        visual_placeholders = self.construct_visual_indicators((1, 1),
-                                                               is_video)
+        visual_placeholders = self.construct_visual_indicators((1, 1), is_video)
 
-        image_atom_token_id = self.get_token_value('visual_atom')
+        image_atom_token_id = self.get_token_value("visual_atom")
         # Extract the padding token ID from tokenizer
-        image_padding_token_id = self.get_token_value('image_pad')
+        image_padding_token_id = self.get_token_value("image_pad")
 
         num_image_atoms = grid[0] * grid[1] * grid[2]
         num_image_atoms //= self.hidden_stride**2
@@ -367,20 +367,21 @@ class Ovis2_5Processor(ProcessorMixin):
         padded_placeholder_tokens = []
         for token in visual_placeholders:
             if token == image_atom_token_id:
-                padded_placeholder_tokens.extend([image_padding_token_id] *
-                                                 num_image_atoms)
+                padded_placeholder_tokens.extend(
+                    [image_padding_token_id] * num_image_atoms
+                )
             else:
                 padded_placeholder_tokens.append(image_padding_token_id)
         return padded_placeholder_tokens
 
     def preprocess_multidata(
         self,
-        images: Optional[Union[PIL.Image.Image, list[PIL.Image.Image]]] = None,
-        video: Optional[Union[list[PIL.Image.Image], np.ndarray]] = None,
-        convert_to_rgb: Optional[bool] = True,
+        images: PIL.Image.Image | list[PIL.Image.Image] | None = None,
+        video: list[PIL.Image.Image] | np.ndarray | None = None,
+        convert_to_rgb: bool | None = True,
         min_pixels: int = MIN_PIXELS,
         max_pixels: int = MAX_PIXELS,
-        return_tensors: Optional[str] = 'pt',
+        return_tensors: str | None = "pt",
     ):
         is_video = False
         if images is not None:
@@ -396,11 +397,14 @@ class Ovis2_5Processor(ProcessorMixin):
                     images.append(image)
             elif isinstance(video, list):
                 images = video
-        min_pixels = min(max_pixels if max_pixels is not None else MAX_PIXELS,
-                         min_pixels if min_pixels is not None else MIN_PIXELS)
+        else:
+            raise ValueError("Either images or video should be provided.")
+        min_pixels = min(
+            max_pixels if max_pixels is not None else MAX_PIXELS,
+            min_pixels if min_pixels is not None else MIN_PIXELS,
+        )
         images = [
-            image.convert("RGB")
-            if convert_to_rgb and image.mode != 'RGB' else image
+            image.convert("RGB") if convert_to_rgb and image.mode != "RGB" else image
             for image in images
         ]
 
@@ -417,14 +421,16 @@ class Ovis2_5Processor(ProcessorMixin):
             )
             new_size = dict(height=resized_height, width=resized_width)
             image_pt = self.image_processor.preprocess(
-                image, size=new_size, return_tensors="np")['pixel_values'][0]
+                image, size=new_size, return_tensors="np"
+            )["pixel_values"][0]
 
             processed_images.append(image_pt)
 
         patches = np.array(processed_images)
         if patches.shape[0] % self.temporal_patch_size != 0:
-            num_to_pad = self.temporal_patch_size - (patches.shape[0] %
-                                                     self.temporal_patch_size)
+            num_to_pad = self.temporal_patch_size - (
+                patches.shape[0] % self.temporal_patch_size
+            )
             repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
             patches = np.concatenate([patches, repeats], axis=0)
         channel = patches.shape[1]
@@ -445,14 +451,18 @@ class Ovis2_5Processor(ProcessorMixin):
         )
         patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
         flatten_patches = patches.reshape(
-            grid_t * grid_h * grid_w, channel * self.temporal_patch_size *
-            self.patch_size * self.patch_size)
+            grid_t * grid_h * grid_w,
+            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
+        )
 
         visual_placeholders = self.construct_visual_placeholders(
-            [grid_t, grid_h, grid_w], is_video)
-        return torch.tensor(
-            flatten_patches), visual_placeholders, torch.tensor(
-                [[grid_t, grid_h, grid_w]])
+            [grid_t, grid_h, grid_w], is_video
+        )
+        return (
+            torch.tensor(flatten_patches),
+            visual_placeholders,
+            torch.tensor([[grid_t, grid_h, grid_w]]),
+        )
 
 
 AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)
diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py
index 355fd60e8da1b5490bc23a6eba2dbfea6c5685ae..eac4294bb59cdc28f9081dadb9134c97fd78a9da 100644
--- a/vllm/transformers_utils/runai_utils.py
+++ b/vllm/transformers_utils/runai_utils.py
@@ -5,16 +5,15 @@ import hashlib
 import os
 import shutil
 import signal
-from typing import Optional
 
 from vllm import envs
 from vllm.assets.base import get_cache_dir
 from vllm.logger import init_logger
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 logger = init_logger(__name__)
 
-SUPPORTED_SCHEMES = ['s3://', 'gs://']
+SUPPORTED_SCHEMES = ["s3://", "gs://"]
 
 try:
     from runai_model_streamer import list_safetensors as runai_list_safetensors
@@ -22,11 +21,9 @@ try:
 except (ImportError, OSError):
     # see https://github.com/run-ai/runai-model-streamer/issues/26
     # OSError will be raised on arm64 platform
-    runai_model_streamer = PlaceholderModule(
-        "runai_model_streamer")  # type: ignore[assignment]
+    runai_model_streamer = PlaceholderModule("runai_model_streamer")  # type: ignore[assignment]
     runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
-    runai_list_safetensors = runai_model_streamer.placeholder_attr(
-        "list_safetensors")
+    runai_list_safetensors = runai_model_streamer.placeholder_attr("list_safetensors")
 
 
 def list_safetensors(path: str = "") -> list[str]:
@@ -65,8 +62,10 @@ class ObjectStorageModel:
                 signal.signal(sig, self._close_by_signal(existing_handler))
 
         dir_name = os.path.join(
-            get_cache_dir(), "model_streamer",
-            hashlib.sha256(str(url).encode()).hexdigest()[:8])
+            get_cache_dir(),
+            "model_streamer",
+            hashlib.sha256(str(url).encode()).hexdigest()[:8],
+        )
         if os.path.exists(dir_name):
             shutil.rmtree(dir_name)
         os.makedirs(dir_name)
@@ -78,7 +77,6 @@ class ObjectStorageModel:
             shutil.rmtree(self.dir)
 
     def _close_by_signal(self, existing_handler=None):
-
         def new_handler(signum, frame):
             self._close()
             if existing_handler:
@@ -86,10 +84,12 @@ class ObjectStorageModel:
 
         return new_handler
 
-    def pull_files(self,
-                   model_path: str = "",
-                   allow_pattern: Optional[list[str]] = None,
-                   ignore_pattern: Optional[list[str]] = None) -> None:
+    def pull_files(
+        self,
+        model_path: str = "",
+        allow_pattern: list[str] | None = None,
+        ignore_pattern: list[str] | None = None,
+    ) -> None:
         """
         Pull files from object storage into the temporary directory.
 
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index b848898ff6dad9a01cf9cb3a9443f516074d6ac3..a5a3af6538b810bf32d79dcad78f9c147de0249b 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -4,7 +4,7 @@
 import fnmatch
 from typing import TYPE_CHECKING, Optional
 
-from vllm.utils import PlaceholderModule
+from vllm.utils.import_utils import PlaceholderModule
 
 if TYPE_CHECKING:
     from botocore.client import BaseClient
@@ -17,21 +17,25 @@ except ImportError:
 
 def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
     return [
-        path for path in paths if any(
-            fnmatch.fnmatch(path, pattern) for pattern in patterns)
+        path
+        for path in paths
+        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
     ]
 
 
 def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
     return [
-        path for path in paths
+        path
+        for path in paths
         if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
     ]
 
 
-def glob(s3: Optional["BaseClient"] = None,
-         path: str = "",
-         allow_pattern: Optional[list[str]] = None) -> list[str]:
+def glob(
+    s3: Optional["BaseClient"] = None,
+    path: str = "",
+    allow_pattern: list[str] | None = None,
+) -> list[str]:
     """
     List full file names from S3 path and filter by allow pattern.
 
@@ -47,17 +51,15 @@ def glob(s3: Optional["BaseClient"] = None,
         s3 = boto3.client("s3")
     if not path.endswith("/"):
         path = path + "/"
-    bucket_name, _, paths = list_files(s3,
-                                       path=path,
-                                       allow_pattern=allow_pattern)
+    bucket_name, _, paths = list_files(s3, path=path, allow_pattern=allow_pattern)
     return [f"s3://{bucket_name}/{path}" for path in paths]
 
 
 def list_files(
-        s3: "BaseClient",
-        path: str,
-        allow_pattern: Optional[list[str]] = None,
-        ignore_pattern: Optional[list[str]] = None
+    s3: "BaseClient",
+    path: str,
+    allow_pattern: list[str] | None = None,
+    ignore_pattern: list[str] | None = None,
 ) -> tuple[str, str, list[str]]:
     """
     List files from S3 path and filter by pattern.
@@ -71,17 +73,17 @@ def list_files(
     Returns:
         tuple[str, str, list[str]]: A tuple where:
             - The first element is the bucket name
-            - The second element is string represent the bucket 
+            - The second element is string represent the bucket
               and the prefix as a dir like string
-            - The third element is a list of files allowed or 
+            - The third element is a list of files allowed or
               disallowed by pattern
     """
-    parts = path.removeprefix('s3://').split('/')
-    prefix = '/'.join(parts[1:])
+    parts = path.removeprefix("s3://").split("/")
+    prefix = "/".join(parts[1:])
     bucket_name = parts[0]
 
     objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
-    paths = [obj['Key'] for obj in objects.get('Contents', [])]
+    paths = [obj["Key"] for obj in objects.get("Contents", [])]
 
     paths = _filter_ignore(paths, ["*/"])
     if allow_pattern is not None:
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 9aaac66817394115c6573e400823496c5a5efc5b..a393568909d27ff9ee9770522eddd3b2ec54e656 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -7,40 +7,35 @@ import os
 import warnings
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 import huggingface_hub
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
-                          PreTrainedTokenizerFast)
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 from typing_extensions import assert_never
 
 from vllm import envs
 from vllm.logger import init_logger
-from vllm.transformers_utils.config import (
-    get_sentence_transformer_tokenizer_config)
+from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import check_gguf_file
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
-    from vllm.lora.request import LoRARequest
     from vllm.transformers_utils.tokenizer_base import TokenizerBase
 else:
     ModelConfig = Any
-    LoRARequest = Any
     TokenizerBase = Any
 
 logger = init_logger(__name__)
 
-AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
-                     TokenizerBase]
+AnyTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast | TokenizerBase
 
 
 def decode_tokens(
     tokenizer: AnyTokenizer,
     token_ids: list[int],
     *,
-    skip_special_tokens: Optional[bool] = None,
+    skip_special_tokens: bool | None = None,
 ) -> str:
     """
     Backend-agnostic equivalent of HF's
@@ -50,8 +45,7 @@ def decode_tokens(
     settings.
     """
     if skip_special_tokens is not None:
-        return tokenizer.decode(token_ids,
-                                skip_special_tokens=skip_special_tokens)
+        return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
 
     return tokenizer.decode(token_ids)
 
@@ -60,9 +54,9 @@ def encode_tokens(
     tokenizer: AnyTokenizer,
     text: str,
     *,
-    truncation: Optional[bool] = None,
-    max_length: Optional[int] = None,
-    add_special_tokens: Optional[bool] = None,
+    truncation: bool | None = None,
+    max_length: int | None = None,
+    add_special_tokens: bool | None = None,
 ) -> list[int]:
     """
     Backend-agnostic equivalent of HF's
@@ -95,8 +89,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
 
     tokenizer_all_special_ids = tokenizer.all_special_ids
     tokenizer_all_special_tokens = tokenizer.all_special_tokens
-    tokenizer_all_special_tokens_extended = (
-        tokenizer.all_special_tokens_extended)
+    tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
     tokenizer_vocab = tokenizer.get_vocab()
     tokenizer_len = len(tokenizer)
 
@@ -110,7 +103,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
             max_token_id = max(max_token_id, tokenizer.vocab_size)
 
     class CachedTokenizer(tokenizer.__class__):  # type: ignore
-
         @property
         def all_special_ids(self) -> list[int]:
             return tokenizer_all_special_ids
@@ -134,7 +126,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
             return tokenizer_len
 
         def __reduce__(self):
-            return get_cached_tokenizer, (tokenizer, )
+            return get_cached_tokenizer, (tokenizer,)
 
     CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
 
@@ -143,16 +135,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
 
 
 def get_tokenizer(
-    tokenizer_name: Union[str, Path],
+    tokenizer_name: str | Path,
     *args,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
-    revision: Optional[str] = None,
-    download_dir: Optional[str] = None,
+    revision: str | None = None,
+    download_dir: str | None = None,
     **kwargs,
 ) -> AnyTokenizer:
-    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
-    """
+    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
     if envs.VLLM_USE_MODELSCOPE:
         # download model from ModelScope hub,
         # lazy import so that modelscope is not required for normal use.
@@ -173,13 +164,13 @@ def get_tokenizer(
                     revision=revision,
                     local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                     # Ignore weights - we only need the tokenizer.
-                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+                )
                 tokenizer_name = tokenizer_path
 
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
-            raise ValueError(
-                "Cannot use the fast tokenizer in slow tokenizer mode.")
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
         kwargs["use_fast"] = False
 
     if "truncation_side" not in kwargs:
@@ -195,23 +186,28 @@ def get_tokenizer(
     is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
     if is_from_mistral_org and tokenizer_mode != "mistral":
         warnings.warn(
-            'It is strongly recommended to run mistral models with '
+            "It is strongly recommended to run mistral models with "
             '`--tokenizer-mode "mistral"` to ensure correct '
-            'encoding and decoding.',
+            "encoding and decoding.",
             FutureWarning,
-            stacklevel=2)
+            stacklevel=2,
+        )
 
     tokenizer: AnyTokenizer
     if tokenizer_mode == "mistral":
-        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
-                                                     revision=revision)
+        tokenizer = MistralTokenizer.from_pretrained(
+            str(tokenizer_name), revision=revision
+        )
     elif tokenizer_mode == "custom":
         from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
-        tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
-                                                    *args,
-                                                    revision=revision,
-                                                    download_dir=download_dir,
-                                                    **kwargs)
+
+        tokenizer = TokenizerRegistry.get_tokenizer(
+            str(tokenizer_name),
+            *args,
+            revision=revision,
+            download_dir=download_dir,
+            **kwargs,
+        )
     else:
         try:
             tokenizer = AutoTokenizer.from_pretrained(
@@ -226,13 +222,16 @@ def get_tokenizer(
             # currently being imported,
             # suggest using the --trust-remote-code flag.
             if not trust_remote_code and (
-                    "does not exist or is not currently imported." in str(e)
-                    or "requires you to execute the tokenizer file" in str(e)):
-                err_msg = ("Failed to load the tokenizer. If the tokenizer "
-                           "is a custom tokenizer not yet available in the "
-                           "HuggingFace transformers library, consider "
-                           "setting `trust_remote_code=True` in LLM or using "
-                           "the `--trust-remote-code` flag in the CLI.")
+                "does not exist or is not currently imported." in str(e)
+                or "requires you to execute the tokenizer file" in str(e)
+            ):
+                err_msg = (
+                    "Failed to load the tokenizer. If the tokenizer "
+                    "is a custom tokenizer not yet available in the "
+                    "HuggingFace transformers library, consider "
+                    "setting `trust_remote_code=True` in LLM or using "
+                    "the `--trust-remote-code` flag in the CLI."
+                )
                 raise RuntimeError(err_msg) from e
             else:
                 raise e
@@ -240,19 +239,21 @@ def get_tokenizer(
         # The special_tokens in tokenizer should also be
         # controlled by do_lower_case in encoder_config
         encoder_config = get_sentence_transformer_tokenizer_config(
-            tokenizer_name, revision)
+            tokenizer_name, revision
+        )
         if isinstance(encoder_config, dict) and encoder_config.get(
-                "do_lower_case", False):
+            "do_lower_case", False
+        ):
             special_tokens_map = {
-                k: v.lower()
-                for k, v in tokenizer.special_tokens_map.items()
+                k: v.lower() for k, v in tokenizer.special_tokens_map.items()
             }
             tokenizer.add_special_tokens(special_tokens_map)
 
         if not isinstance(tokenizer, PreTrainedTokenizerFast):
             logger.warning(
                 "Using a slow tokenizer. This might cause a significant "
-                "slowdown. Consider using a fast tokenizer instead.")
+                "slowdown. Consider using a fast tokenizer instead."
+            )
         tokenizer = get_cached_tokenizer(tokenizer)
 
     return tokenizer
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
index b1f84a023fc315a3fb739f753a983dea5c17df12..7421eb5348082c2713a26e838fcc0468d991de85 100644
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -3,14 +3,13 @@
 
 import importlib
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
 
 class TokenizerBase(ABC):
-
     @property
     @abstractmethod
     def all_special_tokens_extended(self) -> list[str]:
@@ -72,11 +71,11 @@ class TokenizerBase(ABC):
     @abstractmethod
     def __call__(
         self,
-        text: Union[str, list[str], list[int]],
-        text_pair: Optional[str] = None,
+        text: str | list[str] | list[int],
+        text_pair: str | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ):
         raise NotImplementedError()
 
@@ -93,23 +92,27 @@ class TokenizerBase(ABC):
         self,
         text: str,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ) -> list[int]:
         raise NotImplementedError()
 
     @abstractmethod
-    def encode(self,
-               text: str,
-               truncation: Optional[bool] = None,
-               max_length: Optional[int] = None,
-               add_special_tokens: Optional[bool] = None) -> list[int]:
+    def encode(
+        self,
+        text: str,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool | None = None,
+    ) -> list[int]:
         raise NotImplementedError()
 
     @abstractmethod
-    def apply_chat_template(self,
-                            messages: list["ChatCompletionMessageParam"],
-                            tools: Optional[list[dict[str, Any]]] = None,
-                            **kwargs) -> list[int]:
+    def apply_chat_template(
+        self,
+        messages: list["ChatCompletionMessageParam"],
+        tools: list[dict[str, Any]] | None = None,
+        **kwargs,
+    ) -> list[int]:
         raise NotImplementedError()
 
     @abstractmethod
@@ -117,9 +120,7 @@ class TokenizerBase(ABC):
         raise NotImplementedError()
 
     @abstractmethod
-    def decode(self,
-               ids: Union[list[int], int],
-               skip_special_tokens: bool = True) -> str:
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
         raise NotImplementedError()
 
     @abstractmethod
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
index 941156c4bf50e8cb795dcfdbf8088e7b4c8a3394..b63cb26af46dd7ee6262c881ff37908ac85ace81 100644
--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ b/vllm/transformers_utils/tokenizers/__init__.py
@@ -1,10 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
-                      truncate_tool_call_ids, validate_request_params)
+from .mistral import (
+    MistralTokenizer,
+    maybe_serialize_tool_calls,
+    truncate_tool_call_ids,
+    validate_request_params,
+)
 
 __all__ = [
-    "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
-    "validate_request_params"
+    "MistralTokenizer",
+    "maybe_serialize_tool_calls",
+    "truncate_tool_call_ids",
+    "validate_request_params",
 ]
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index d8a8d19391cd0bd4025a64288cfa8b82f1f4c6e4..39198a1f3d8156cace57d0e43da12a603a3315e7 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,33 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
-
-import huggingface_hub
-import regex as re
-from huggingface_hub import HfApi, hf_hub_download
-from transformers.tokenization_utils_base import BatchEncoding
+from typing import TYPE_CHECKING, Any, cast
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_base import TokenizerBase
-from vllm.utils import is_list_of
 
 if TYPE_CHECKING:
-    # make sure `mistral_common` is lazy imported,
-    # so that users who only use non-mistral models
-    # will not be bothered by the dependency.
-    from mistral_common.protocol.instruct.request import ChatCompletionRequest
-    from mistral_common.tokens.tokenizers.mistral import (
-        MistralTokenizer as PublicMistralTokenizer)
+    from mistral_common.protocol.instruct.request import (
+        ChatCompletionRequest as MistralChatCompletionRequest,
+    )
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+    from transformers.tokenization_mistral_common import (
+        MistralCommonTokenizer as TransformersMistralTokenizer,
+    )
 
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+    from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 
 logger = init_logger(__name__)
 
 
-def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
+def maybe_serialize_tool_calls(request: "MistralChatCompletionRequest"):
     # SEE: https://github.com/vllm-project/vllm/pull/9951
     # Credits go to: @gcalmettes
     # NOTE: There is currently a bug in pydantic where attributes
@@ -51,7 +45,7 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
     #   - https://github.com/pydantic/pydantic/issues/9541
     # TODO: remove when pydantic v2.11 is released
     for i, message in enumerate(request.messages):
-        if message.get("role") == 'assistant':
+        if message.get("role") == "assistant":
             tool_calls_validator = message.get("tool_calls", ().__iter__())
             validated_tool_calls = []
             while True:
@@ -64,10 +58,10 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
             request.messages[i]["tool_calls"] = validated_tool_calls
 
 
-def truncate_tool_call_ids(request: "ChatCompletionRequest"):
+def truncate_tool_call_ids(request: "MistralChatCompletionRequest"):
     """Truncates tool call IDs for Mistral's ID requirements."""
     for i, message in enumerate(request.messages):
-        if message.get("role") == 'assistant':
+        if message.get("role") == "assistant":
             tool_calls = message.get("tool_calls", [])
             for tool_call in tool_calls:
                 if len(tool_call["id"]) > 9:
@@ -94,210 +88,183 @@ def truncate_tool_call_ids(request: "ChatCompletionRequest"):
                 request.messages[i]["tool_call_id"] = tool_call_id
 
 
-def validate_request_params(request: "ChatCompletionRequest"):
-    if (request.skip_special_tokens is not None
-            and not request.skip_special_tokens):
-        raise ValueError("skip_special_tokens=False is not supported "
-                         "for Mistral tokenizers.")
-
-
-def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
-    repo_cache = os.path.join(
-        huggingface_hub.constants.HF_HUB_CACHE,
-        huggingface_hub.constants.REPO_ID_SEPARATOR.join(
-            ["models", *repo_id.split("/")]))
-
-    if revision is None:
-        revision_file = os.path.join(repo_cache, "refs", "main")
-        if os.path.isfile(revision_file):
-            with open(revision_file) as file:
-                revision = file.read()
-
-    if revision:
-        revision_dir = os.path.join(repo_cache, "snapshots", revision)
-        if os.path.isdir(revision_dir):
-            return os.listdir(revision_dir)
-
-    return []
-
-
-def find_tokenizer_file(files: list[str]):
-    file_pattern = re.compile(
-        r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$")
-
-    matched_files = [file for file in files if file_pattern.match(file)]
-    if len(matched_files) > 1:
-        raise OSError(
-            f"Found {len(matched_files)} files matching the "
-            f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral "
-            f"tokenizer is present in {files}.")
-    elif len(matched_files) == 0:
-        raise OSError(
-            f"Found {len(matched_files)} files matching the "
-            f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
-            f"tokenizer is present in {files}.")
-
-    return matched_files[0]
-
-
-def _aggregate_content(content: list) -> list[dict[str, Any]]:
-    aggregated_content: list[dict[str, Any]] = []
-    for chunk in content:
-        if chunk.get("type"
-                     ) == "text" and aggregated_content and aggregated_content[
-                         -1].get("type") == "text":
-            aggregated_content[-1]["text"] += "\n\n" + chunk.get("text")
-        else:
-            aggregated_content.append(chunk)
-    if len(aggregated_content) == 1 and aggregated_content[0].get(
-            "type") == "text":
-        content = aggregated_content[0]["text"]
-    return content
+def _prepare_apply_chat_template_tools_and_messages(
+    messages: list["ChatCompletionMessageParam"],
+    tools: list[dict[str, Any]] | None = None,
+    continue_final_message: bool = False,
+    add_generation_prompt: bool = False,
+) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
+    if add_generation_prompt and continue_final_message:
+        raise ValueError(
+            "Cannot set both `add_generation_prompt` and "
+            "`continue_final_message` to True."
+        )
 
-
-def make_mistral_chat_completion_request(
-        messages: list["ChatCompletionMessageParam"],
-        tools: Optional[list[dict[str,
-                                  Any]]] = None) -> "ChatCompletionRequest":
     last_message = cast(dict[str, Any], messages[-1])
-    if last_message["role"] == "assistant":
-        last_message["prefix"] = True
+    # add_generation_prompt is directly handled by the tokenizer but we
+    # check if the user is trying to use it with a final assistant message
+    # which is probably not what they want.
+    # If add_generation_prompt is False, we don't need to check anything.
+    if add_generation_prompt and last_message["role"] == "assistant":
+        raise ValueError(
+            "Cannot set `add_generation_prompt` to True when "
+            "the last message is from the assistant. Consider "
+            "using `continue_final_message` instead."
+        )
+    if continue_final_message and last_message["role"] != "assistant":
+        raise ValueError(
+            "Cannot set `continue_final_message` to True when "
+            "the last message is not from the assistant."
+        )
 
     # mistral-common requires AssistantMessage content to be string [1].
     #
     # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
     for message in messages:
-        # Remove reasoning_content as unsupported by Mistral
-        _ = message.pop("reasoning_content", None)  # type: ignore
-
-        # Convert list text content to string
-        if message.get("role") in ("assistant", "tool"):
-            content: Any = message.get("content")
-            if isinstance(content, list):
-                content = _aggregate_content(content)
-            message["content"] = content
+        # Remove reasoning as unsupported by Mistral
+        _ = message.pop("reasoning", None)  # type: ignore
 
     # The Mistral client, in comparison to the OpenAI client, requires the
     # "parameters" dict and the "description" string to be present
     # even if they are empty.
     if tools:
         for function in [
-                tool["function"] for tool in tools
-                if tool["type"] == "function"
+            tool["function"] for tool in tools if tool["type"] == "function"
         ]:
             if function.get("parameters") is None:
                 function["parameters"] = {}
             if function.get("description") is None:
                 function["description"] = ""
 
-    from mistral_common.protocol.instruct.request import ChatCompletionRequest
-    return ChatCompletionRequest(messages=messages,
-                                 tools=tools)  # type: ignore[type-var]
+    return messages, tools
 
 
-class MistralTokenizer(TokenizerBase):
+def validate_request_params(request: "ChatCompletionRequest"):
+    if request.chat_template is not None or request.chat_template_kwargs is not None:
+        raise ValueError("chat_template is not supported for Mistral tokenizers.")
 
-    def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
-        self.mistral = tokenizer
-        self.instruct = tokenizer.instruct_tokenizer
-        _mistral_version_str = self.instruct.tokenizer.version.value
-        self.version: int = int(_mistral_version_str.split("v")[-1])
 
-        tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
-        from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
-        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+
+    assert isinstance(tokenizer, Tekkenizer), type(tokenizer)
+
+    t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t
+    shift = tokenizer.num_special_tokens
+    try:
+        return shift + tokenizer._tekken_token2id_nospecial[t_bytes]
+    except KeyError:
+        t_str = t_bytes.decode("utf-8")
+        if t_str in tokenizer._special_tokens_reverse_vocab:
+            return tokenizer._special_tokens_reverse_vocab[t_str]
+        logger.warning(
+            "Failed to convert token %s to id, replacing with <unk>", t_bytes
+        )
+        return tokenizer.unk_id
 
-        self.is_tekken = isinstance(tokenizer_, Tekkenizer)
+
+class MistralTokenizer(TokenizerBase):
+    def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
+        from mistral_common.protocol.instruct.validator import ValidationMode
         from mistral_common.tokens.tokenizers.sentencepiece import (
-            SentencePieceTokenizer)
-        self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
-        self._special_token_policy = (SpecialTokenPolicy.IGNORE
-                                      if self.is_tekken else None)
+            SentencePieceTokenizer,
+        )
+        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+
+        self.transformers_tokenizer = tokenizer
+        self.mistral = tokenizer.tokenizer
+        self.instruct = self.mistral.instruct_tokenizer
+        self.tokenizer = self.instruct.tokenizer
+
+        mode = self.mistral._chat_completion_request_validator._mode
+        if mode != ValidationMode.test:
+            raise ValueError(
+                "Mistral tokenizer must be in test mode. Make sure to "
+                "set `mode='ValidationMode.test'` when creating the "
+                "Mistral tokenizer."
+            )
+
+        _mistral_version_str = str(self.tokenizer.version.value)
+        self.version: int = int(_mistral_version_str.split("v")[-1])
+
+        self.is_tekken = isinstance(self.tokenizer, Tekkenizer)
+        self.is_spm = isinstance(self.tokenizer, SentencePieceTokenizer)
         if not (self.is_tekken or self.is_spm):
-            raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
+            raise TypeError(f"Unsupported tokenizer: {type(self.tokenizer)}")
 
-        self._vocab = tokenizer_.vocab()
-        # Convert to a dict[str, int] to match protocol, but this is a lossy
-        # conversion. There may be multiple token ids that decode to the same
-        # string due to partial UTF-8 byte sequences being converted to �
+        # Reverse order to ensure that the lowest token id is kept.
         self._vocab_dict = {
-            token: idx
-            for idx, token in enumerate(self._vocab)
+            self.convert_ids_to_tokens([i], skip_special_tokens=False)[0]: i
+            for i in range(self.vocab_size - 1, -1, -1)
         }
-        self.tokenizer = tokenizer_
+        # Sort the dict for convenience
+        self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1]))
+
+        # Cache special tokens for faster access.
+        self._special_token_ids = self._get_special_token_ids()
+        self._special_token_ids_set = set(self._special_token_ids)
+        self._special_tokens = self._get_special_tokens(self._special_token_ids)
+        self._special_tokens_set = set(self._special_tokens)
+
+        # Vocab sorted by token id.
+        self._vocab = self.tokenizer._vocab
         self._max_token_id = self.vocab_size - 1
 
     @classmethod
-    def from_pretrained(cls,
-                        path_or_repo_id: str,
-                        *,
-                        revision: Optional[str] = None) -> "MistralTokenizer":
-        if not Path(path_or_repo_id).exists():
-            assert len(path_or_repo_id.split("/")) == 2, (
-                "You have either provided a non-existent path: "
-                "{path_or_repo_id} or an invalid HF Hub repo id.")
-            tokenizer_file = cls._download_mistral_tokenizer_from_hf(
-                path_or_repo_id, revision)
-        elif Path(path_or_repo_id).is_dir():
-            tokenizer_file_name = find_tokenizer_file(
-                os.listdir(path_or_repo_id))
-            tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
+    def from_pretrained(
+        cls, path_or_repo_id: str, *, revision: str | None = None
+    ) -> "MistralTokenizer":
+        from mistral_common.protocol.instruct.validator import ValidationMode
+        from transformers.tokenization_mistral_common import (
+            MistralCommonTokenizer as TransformersMistralTokenizer,
+        )
+
+        str_revision = "main" if revision is None else revision
+        return cls(
+            TransformersMistralTokenizer.from_pretrained(
+                path_or_repo_id, revision=str_revision, mode=ValidationMode.test
+            )
+        )
+
+    def _get_special_token_ids(self) -> list[int]:
+        from mistral_common.tokens.tokenizers.sentencepiece import (
+            SentencePieceTokenizer,
+        )
+        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+
+        if self.is_tekken:
+            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
+            special_ids = {t["rank"] for t in self.tokenizer._all_special_tokens}
+        elif self.is_spm:
+            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
+                self.tokenizer
+            )
+            special_ids = self.tokenizer._control_tokens
         else:
-            assert Path(
-                path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
-            tokenizer_file = str(Path(path_or_repo_id))
-
-        from mistral_common.tokens.tokenizers.mistral import (
-            MistralTokenizer as PublicMistralTokenizer)
-        mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
-        return cls(mistral_tokenizer)
-
-    @staticmethod
-    def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
-                                            revision: Optional[str]) -> str:
-        try:
-            hf_api = HfApi()
-            files = hf_api.list_repo_files(repo_id=tokenizer_name,
-                                           revision=revision)
-        except ConnectionError as exc:
-            files = list_local_repo_files(repo_id=tokenizer_name,
-                                          revision=revision)
-
-            if len(files) == 0:
-                raise exc
-
-        filename = find_tokenizer_file(files)
-
-        tokenizer_file = hf_hub_download(tokenizer_name,
-                                         filename=filename,
-                                         revision=revision)
-        return tokenizer_file
+            raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}")
+        return sorted(special_ids)
+
+    def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
+        from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
+
+        return [
+            self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
+            for i in all_special_ids
+        ]
 
     # the following attributes are set to fit vLLM's design and are used
     # by the structured output backends.
     @property
     def all_special_tokens_extended(self) -> list[str]:
-        from mistral_common.tokens.tokenizers.base import SpecialTokens
-
-        # tekken defines its own extended special tokens list
-        if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
-            special_tokens = self.tokenizer.SPECIAL_TOKENS
-        else:
-            special_tokens = list(SpecialTokens)
-        return [
-            s.value if isinstance(s, SpecialTokens) else s
-            for s in special_tokens
-        ]
+        return self.all_special_tokens
 
     @property
     def all_special_tokens(self) -> list[str]:
-        return self.all_special_tokens_extended
+        return self._special_tokens
 
     @property
     def all_special_ids(self) -> list[int]:
-        return [
-            self.all_special_tokens.index(t) for t in self.all_special_tokens
-        ]
+        return self._special_token_ids
 
     @property
     def bos_token_id(self) -> int:
@@ -313,7 +280,7 @@ class MistralTokenizer(TokenizerBase):
 
     @property
     def pad_token(self) -> str:
-        raise NotImplementedError()
+        return self.transformers_tokenizer.pad_token
 
     @property
     def is_fast(self) -> bool:
@@ -321,7 +288,7 @@ class MistralTokenizer(TokenizerBase):
 
     @property
     def vocab_size(self) -> int:
-        return len(self._vocab)
+        return self.transformers_tokenizer.vocab_size
 
     @property
     def max_token_id(self) -> int:
@@ -331,36 +298,33 @@ class MistralTokenizer(TokenizerBase):
     def truncation_side(self) -> str:
         raise NotImplementedError()
 
+    def _is_special_token_id(self, token_id: int) -> bool:
+        return token_id in self._special_token_ids_set
+
     def __len__(self) -> int:
         return self.vocab_size
 
     def __call__(
         self,
-        text: Union[str, list[str], list[int]],
-        text_pair: Optional[str] = None,
+        text: str | list[str] | list[int],
+        text_pair: str | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ):
-        input_ids: Union[list[int], list[list[int]]]
-        # For list[str], original prompt text
-        if is_list_of(text, str):
-            input_ids_: list[list[int]] = []
-            for p in text:
-                each_input_ids = self.encode_one(p, truncation, max_length)
-                input_ids_.append(each_input_ids)
-            input_ids = input_ids_
-        # For list[int], apply chat template output, already tokens.
-        elif is_list_of(text, int):
-            input_ids = text
-        # For str, single prompt text
-        else:
-            input_ids = self.encode_one(text, truncation, max_length)
-        return BatchEncoding({"input_ids": input_ids})
+        return self.transformers_tokenizer(
+            text=text,
+            text_pair=text_pair,
+            add_special_tokens=add_special_tokens,
+            truncation=truncation,
+            max_length=max_length,
+        )
+
+    @property
+    def vocab(self) -> list[str]:
+        return self._vocab
 
     def get_vocab(self) -> dict[str, int]:
-        # NB: the dictionary form of the vocabulary collapses token ids that map
-        # to the same string but have different bytes
         return self._vocab_dict
 
     def get_added_vocab(self) -> dict[str, int]:
@@ -371,84 +335,111 @@ class MistralTokenizer(TokenizerBase):
         self,
         text: str,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ) -> list[int]:
         # Mistral Tokenizers should not add special tokens
-        input_ids = self.encode(text)
-
-        if truncation:
-            input_ids = input_ids[:max_length]
-        return input_ids
-
-    def encode(self,
-               text: str,
-               truncation: Optional[bool] = None,
-               max_length: Optional[int] = None,
-               add_special_tokens: Optional[bool] = None) -> list[int]:
-        # `encode` should only be used for prompt completion
-        # it should never be used for chat_completion.
-        # For chat completion use `apply_chat_template`
-        if add_special_tokens is not None:
-            return self.tokenizer.encode(text,
-                                         bos=add_special_tokens,
-                                         eos=add_special_tokens)
-        else:
-            return self.tokenizer.encode(text, bos=True, eos=False)
+        return self.transformers_tokenizer.encode(
+            text, add_special_tokens=False, truncation=truncation, max_length=max_length
+        )
+
+    def encode(
+        self,
+        text: str,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool | None = None,
+    ) -> list[int]:
+        encoded = self.tokenizer.encode(
+            text, bos=add_special_tokens is not False, eos=False
+        )
 
-    def apply_chat_template(self,
-                            messages: list["ChatCompletionMessageParam"],
-                            tools: Optional[list[dict[str, Any]]] = None,
-                            **kwargs) -> list[int]:
+        if truncation is not False and max_length is not None:
+            return encoded[:max_length]
+        else:
+            return encoded
 
-        request = make_mistral_chat_completion_request(messages, tools)
-        encoded = self.mistral.encode_chat_completion(request)
+    def apply_chat_template(
+        self,
+        messages: list["ChatCompletionMessageParam"],
+        tools: list[dict[str, Any]] | None = None,
+        **kwargs,
+    ) -> list[int]:
+        add_generation_prompt = kwargs.pop("add_generation_prompt", False)
+        continue_final_message = kwargs.get("continue_final_message", False)
+        padding = kwargs.get("padding", False)
+        truncation = kwargs.get("truncation", False)
+        max_length = kwargs.get("max_length")
+
+        messages, tools = _prepare_apply_chat_template_tools_and_messages(
+            messages, tools, continue_final_message, add_generation_prompt
+        )
+
+        return self.transformers_tokenizer.apply_chat_template(
+            conversation=messages,
+            tools=tools,
+            continue_final_message=continue_final_message,
+            tokenize=True,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=None,
+            return_dict=False,
+        )
+
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
+        if isinstance(ids, int):
+            ids = [ids]
 
-        # encode-decode to get clean prompt
-        return encoded.tokens
+        return self.transformers_tokenizer.decode(
+            ids, skip_special_tokens=skip_special_tokens
+        )
 
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
-        from mistral_common.tokens.tokenizers.base import SpecialTokens
+        from mistral_common.tokens.tokenizers.base import (
+            SpecialTokenPolicy,
+            SpecialTokens,
+        )
+        from mistral_common.tokens.tokenizers.sentencepiece import (
+            SentencePieceTokenizer,
+        )
+        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
+
+        to_decode_special_tokens = {SpecialTokens.tool_calls}
         if self.is_tekken:
+            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
             tokens = [
-                t for t in tokens
-                if (t is SpecialTokens.tool_calls
-                    or t not in self.tokenizer._all_special_tokens)
+                t
+                for t in tokens
+                if (t in to_decode_special_tokens or t not in self._special_tokens_set)
             ]
 
             if any(isinstance(t, bytes) for t in tokens):
                 # we need to encode and decode all tokens again
-                shift = self.tokenizer.num_special_tokens
-
-                def _token_to_id(t: str):
-                    t_bytes = t.encode("utf-8") \
-                        if not isinstance(t, bytes) else t
-                    try:
-                        return shift + \
-                            self.tokenizer._tekken_token2id_nospecial[t_bytes]
-                    except KeyError:
-                        logger.warning(
-                            "Failed to convert token %s to id,"
-                            " replacing with <unk>", t_bytes)
-                        return self.tokenizer.unk_id
-
-                ids = [_token_to_id(t) for t in tokens]
-                decoded = self.tokenizer.decode(ids,
-                                                self._special_token_policy)
+                ids = [_tekken_token_to_id(self.tokenizer, t) for t in tokens]
+                # We filtered unwanted special tokens before
+                # so we can decode the rest.
+                decoded = self.tokenizer.decode(ids, SpecialTokenPolicy.KEEP)
             else:
                 decoded = "".join(tokens)
         else:
             # make sure certain special tokens like Tool calls are
             # not decoded
-            special_tokens = {SpecialTokens.tool_calls}
+            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
+                self.tokenizer
+            )
+
             regular_tokens: list[str] = []
-            decoded_list = []
+            decoded_list: list[str] = []
+            decoded = ""
 
             for token in tokens:
-                if token in special_tokens:
+                if token in to_decode_special_tokens:
                     if regular_tokens:
                         decoded_list.append(
-                            self.tokenizer.decode(regular_tokens,
-                                                  self._special_token_policy))
+                            self.tokenizer.decode(
+                                regular_tokens, SpecialTokenPolicy.IGNORE
+                            )
+                        )
                         regular_tokens = []
                     decoded_list.append(token)
                 else:
@@ -456,66 +447,56 @@ class MistralTokenizer(TokenizerBase):
 
             if regular_tokens:
                 decoded_list.append(
-                    self.tokenizer.decode(regular_tokens,
-                                          self._special_token_policy))
-
-            decoded = ''.join(decoded_list)
+                    self.tokenizer.decode(regular_tokens, SpecialTokenPolicy.IGNORE)
+                )
+            decoded = "".join(decoded_list)
 
         return decoded
 
-    def decode(self,
-               ids: Union[list[int], int],
-               skip_special_tokens: bool = True) -> str:
-        assert (
-            skip_special_tokens
-        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
-
-        if isinstance(ids, int):
-            ids = [ids]
-        return self.tokenizer.decode(ids, self._special_token_policy)
-
     def convert_ids_to_tokens(
         self,
         ids: list[int],
         skip_special_tokens: bool = True,
     ) -> list[str]:
-        from mistral_common.tokens.tokenizers.base import SpecialTokens
-        from mistral_common.tokens.tokenizers.instruct import (
-            InstructTokenizerV13)
-
-        # TODO(Patrick) - potentially allow special tokens to not be skipped
-        assert (
-            skip_special_tokens
-        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+        from mistral_common.tokens.tokenizers.base import (
+            SpecialTokenPolicy,
+            SpecialTokens,
+        )
+        from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13
 
-        assert self.is_tekken or self.is_spm, type(self.tokenizer)
+        if not skip_special_tokens:
+            return [self.tokenizer.id_to_piece(token_id) for token_id in ids]
 
-        if self.is_tekken:
-            # skip special tokens except tool call and think tokens
-            non_skip_special_tokens = {
-                self.tokenizer.get_control_token(SpecialTokens.tool_calls)
-            }
-            if isinstance(self.instruct, InstructTokenizerV13):
-                if self.instruct.BEGIN_THINK:
-                    non_skip_special_tokens.add(self.instruct.BEGIN_THINK)
-                if self.instruct.END_THINK:
-                    non_skip_special_tokens.add(self.instruct.END_THINK)
-            ids = [
-                i for i in ids if i > self.tokenizer.num_special_tokens
-                or i in non_skip_special_tokens
-            ]
+        non_skip_special_tokens_ids = {
+            self.tokenizer.get_control_token(SpecialTokens.tool_calls),
+        }
+        if isinstance(self.instruct, InstructTokenizerV13):
+            if self.instruct.BEGIN_THINK:
+                non_skip_special_tokens_ids.add(self.instruct.BEGIN_THINK)
+            if self.instruct.END_THINK:
+                non_skip_special_tokens_ids.add(self.instruct.END_THINK)
+
+        ids_kept = [
+            i
+            for i in ids
+            if i in non_skip_special_tokens_ids or not self._is_special_token_id(i)
+        ]
 
-        tokens = [self.tokenizer.id_to_piece(id) for id in ids]
+        # We filtered unwanted special tokens so we can decode the rest.
+        tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids_kept]
 
         if any("�" in t for t in tokens) and self.is_tekken:
             # if a decoded token contains the replacement character, then the
             # token has an incomplete UTF-8 character so we must use bytes
             # See: https://github.com/vllm-project/vllm/pull/8640
             #      https://github.com/vllm-project/vllm/pull/9625
-            # if underlying tokenizeir is sentencepiece, we just add "�"
+            # if underlying tokenizer is sentencepiece, we just add "�".
+            # We filtered unwanted special tokens so we can decode the rest.
             tokens = [
-                self.tokenizer.id_to_byte_piece(id, self._special_token_policy)
-                for id in ids
+                self.tokenizer.id_to_byte_piece(token_id, SpecialTokenPolicy.KEEP)
+                if token_id not in self._special_token_ids_set
+                else self.tokenizer.decode([token_id], SpecialTokenPolicy.KEEP)
+                for token_id in ids_kept
             ]
 
         return tokens
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 2aaad8f949d0f34e682824cff85d878db5320155..1ae42ba622dc4702dec63cd09c9ce5d62e909f96 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -2,23 +2,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
+import os
 import struct
 from functools import cache
 from os import PathLike
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any
 
-from vllm.envs import VLLM_MODEL_REDIRECT_PATH
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
 def is_s3(model_or_path: str) -> bool:
-    return model_or_path.lower().startswith('s3://')
+    return model_or_path.lower().startswith("s3://")
 
 
-def check_gguf_file(model: Union[str, PathLike]) -> bool:
+def is_gcs(model_or_path: str) -> bool:
+    return model_or_path.lower().startswith("gs://")
+
+
+def is_cloud_storage(model_or_path: str) -> bool:
+    return is_s3(model_or_path) or is_gcs(model_or_path)
+
+
+def check_gguf_file(model: str | PathLike) -> bool:
     """Check if the file is a GGUF model."""
     model = Path(model)
     if not model.is_file():
@@ -38,23 +47,26 @@ def check_gguf_file(model: Union[str, PathLike]) -> bool:
 
 def modelscope_list_repo_files(
     repo_id: str,
-    revision: Optional[str] = None,
-    token: Union[str, bool, None] = None,
+    revision: str | None = None,
+    token: str | bool | None = None,
 ) -> list[str]:
     """List files in a modelscope repo."""
     from modelscope.hub.api import HubApi
+
     api = HubApi()
     api.login(token)
     # same as huggingface_hub.list_repo_files
     files = [
-        file['Path'] for file in api.get_model_files(
-            model_id=repo_id, revision=revision, recursive=True)
-        if file['Type'] == 'blob'
+        file["Path"]
+        for file in api.get_model_files(
+            model_id=repo_id, revision=revision, recursive=True
+        )
+        if file["Type"] == "blob"
     ]
     return files
 
 
-def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
+def _maybe_json_dict(path: str | PathLike) -> dict[str, str]:
     with open(path) as f:
         try:
             return json.loads(f.read())
@@ -62,7 +74,7 @@ def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
             return dict[str, str]()
 
 
-def _maybe_space_split_dict(path: Union[str, PathLike]) -> dict[str, str]:
+def _maybe_space_split_dict(path: str | PathLike) -> dict[str, str]:
     parsed_dict = dict[str, str]()
     with open(path) as f:
         for line in f.readlines():
@@ -83,7 +95,7 @@ def maybe_model_redirect(model: str) -> str:
     :return: maybe redirect to a local folder
     """
 
-    model_redirect_path = VLLM_MODEL_REDIRECT_PATH
+    model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH
 
     if not model_redirect_path:
         return model
@@ -91,18 +103,28 @@ def maybe_model_redirect(model: str) -> str:
     if not Path(model_redirect_path).exists():
         return model
 
-    redirect_dict = (_maybe_json_dict(model_redirect_path)
-                     or _maybe_space_split_dict(model_redirect_path))
-    if (redirect_model := redirect_dict.get(model)):
+    redirect_dict = _maybe_json_dict(model_redirect_path) or _maybe_space_split_dict(
+        model_redirect_path
+    )
+    if redirect_model := redirect_dict.get(model):
         logger.info("model redirect: [ %s ] -> [ %s ]", model, redirect_model)
         return redirect_model
 
     return model
 
 
-def parse_safetensors_file_metadata(
-        path: Union[str, PathLike]) -> dict[str, Any]:
+def parse_safetensors_file_metadata(path: str | PathLike) -> dict[str, Any]:
     with open(path, "rb") as f:
-        length_of_metadata = struct.unpack('<Q', f.read(8))[0]
-        metadata = json.loads(f.read(length_of_metadata).decode('utf-8'))
+        length_of_metadata = struct.unpack("<Q", f.read(8))[0]
+        metadata = json.loads(f.read(length_of_metadata).decode("utf-8"))
         return metadata
+
+
+def convert_model_repo_to_path(model_repo: str) -> str:
+    """When VLLM_USE_MODELSCOPE is True convert a model
+    repository string to a Path str."""
+    if not envs.VLLM_USE_MODELSCOPE or Path(model_repo).exists():
+        return model_repo
+    from modelscope.utils.file_utils import get_model_cache_root
+
+    return os.path.join(get_model_cache_root(), model_repo)
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 828536e6408b15fc1e1df363eaf5d12b0b83e013..a475d0fa406bf529c9efa2d12499191fc981f4e5 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -1,8 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder,
-                                         TritonPlaceholder)
+from vllm.triton_utils.importing import (
+    HAS_TRITON,
+    TritonLanguagePlaceholder,
+    TritonPlaceholder,
+)
 
 if HAS_TRITON:
     import triton
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index 95076a9a7c8fc96307cd300946dc7b96a45f40b0..f05bc555bfdc37f420e9ecfcd63e3fd6ae380090 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -21,15 +21,15 @@ if HAS_TRITON:
         # an is_active method.
         # The `x.driver and` check adds a small layer of safety.
         active_drivers = [
-            x.driver for x in backends.values()
-            if x.driver and x.driver.is_active()
+            x.driver for x in backends.values() if x.driver and x.driver.is_active()
         ]
 
         # Check if we're in a distributed environment where CUDA_VISIBLE_DEVICES
         # might be temporarily empty (e.g., Ray sets it to "" during actor init)
         cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
-        is_distributed_env = (cuda_visible_devices is not None
-                              and len(cuda_visible_devices.strip()) == 0)
+        is_distributed_env = (
+            cuda_visible_devices is not None and len(cuda_visible_devices.strip()) == 0
+        )
 
         # Apply lenient driver check for distributed environments
         if is_distributed_env and len(active_drivers) == 0:
@@ -37,35 +37,41 @@ if HAS_TRITON:
             # active later when CUDA context is properly initialized
             logger.debug(
                 "Triton found 0 active drivers in distributed environment. "
-                "This is expected during initialization.")
+                "This is expected during initialization."
+            )
         elif not is_distributed_env and len(active_drivers) != 1:
             # Strict check for non-distributed environments
             logger.info(
                 "Triton is installed but %d active driver(s) found "
                 "(expected 1). Disabling Triton to prevent runtime errors.",
-                len(active_drivers))
+                len(active_drivers),
+            )
             HAS_TRITON = False
     except ImportError:
         # This can occur if Triton is partially installed or triton.backends
         # is missing.
         logger.warning(
             "Triton is installed, but `triton.backends` could not be imported. "
-            "Disabling Triton.")
+            "Disabling Triton."
+        )
         HAS_TRITON = False
     except Exception as e:
         # Catch any other unexpected errors during the check.
         logger.warning(
             "An unexpected error occurred while checking Triton active drivers:"
-            " %s. Disabling Triton.", e)
+            " %s. Disabling Triton.",
+            e,
+        )
         HAS_TRITON = False
 
 if not HAS_TRITON:
-    logger.info("Triton not installed or not compatible; certain GPU-related"
-                " functions will not be available.")
+    logger.info(
+        "Triton not installed or not compatible; certain GPU-related"
+        " functions will not be available."
+    )
 
 
 class TritonPlaceholder(types.ModuleType):
-
     def __init__(self):
         super().__init__("triton")
         self.__version__ = "3.4.0"
@@ -76,7 +82,6 @@ class TritonPlaceholder(types.ModuleType):
         self.language = TritonLanguagePlaceholder()
 
     def _dummy_decorator(self, name):
-
         def decorator(*args, **kwargs):
             if args and callable(args[0]):
                 return args[0]
@@ -86,7 +91,6 @@ class TritonPlaceholder(types.ModuleType):
 
 
 class TritonLanguagePlaceholder(types.ModuleType):
-
     def __init__(self):
         super().__init__("triton.language")
         self.constexpr = None
@@ -94,3 +98,6 @@ class TritonLanguagePlaceholder(types.ModuleType):
         self.int64 = None
         self.int32 = None
         self.tensor = None
+        self.exp = None
+        self.log = None
+        self.log2 = None
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 92245498de65728824cf19430f749c96ec4ba132..69226763aafe6c22086c80595af692154e0b7de8 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -10,7 +10,7 @@ import time
 from enum import Enum
 from pathlib import Path
 from threading import Thread
-from typing import Any, Optional, Union
+from typing import Any
 from uuid import uuid4
 
 import cpuinfo
@@ -21,7 +21,8 @@ import torch
 import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties
+from vllm.utils.platform_utils import cuda_get_device_properties
+from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -32,21 +33,19 @@ _USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
 _USAGE_STATS_ENABLED = None
 _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
 
-_GLOBAL_RUNTIME_DATA = dict[str, Union[str, int, bool]]()
+_GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]()
 
 _USAGE_ENV_VARS_TO_COLLECT = [
     "VLLM_USE_MODELSCOPE",
-    "VLLM_USE_TRITON_FLASH_ATTN",
     "VLLM_ATTENTION_BACKEND",
     "VLLM_USE_FLASHINFER_SAMPLER",
     "VLLM_PP_LAYER_PARTITION",
     "VLLM_USE_TRITON_AWQ",
-    "VLLM_USE_V1",
     "VLLM_ENABLE_V1_MULTIPROCESSING",
 ]
 
 
-def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None:
+def set_runtime_usage_data(key: str, value: str | int | bool) -> None:
     """Set global usage data that will be sent with every usage heartbeat."""
     _GLOBAL_RUNTIME_DATA[key] = value
 
@@ -68,8 +67,7 @@ def is_usage_stats_enabled():
         no_usage_stats = envs.VLLM_NO_USAGE_STATS
         do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
 
-        _USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats
-                                    or do_not_track_file)
+        _USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats or do_not_track_file)
     return _USAGE_STATS_ENABLED
 
 
@@ -80,9 +78,11 @@ def _get_current_timestamp_ns() -> int:
 def _detect_cloud_provider() -> str:
     # Try detecting through vendor file
     vendor_files = [
-        "/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
+        "/sys/class/dmi/id/product_version",
+        "/sys/class/dmi/id/bios_vendor",
         "/sys/class/dmi/id/product_name",
-        "/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
+        "/sys/class/dmi/id/chassis_asset_tag",
+        "/sys/class/dmi/id/sys_vendor",
     ]
     # Mapping of identifiable strings to cloud providers
     cloud_identifiers = {
@@ -130,62 +130,96 @@ class UsageMessage:
         self.uuid = str(uuid4())
 
         # Environment Information
-        self.provider: Optional[str] = None
-        self.num_cpu: Optional[int] = None
-        self.cpu_type: Optional[str] = None
-        self.cpu_family_model_stepping: Optional[str] = None
-        self.total_memory: Optional[int] = None
-        self.architecture: Optional[str] = None
-        self.platform: Optional[str] = None
-        self.cuda_runtime: Optional[str] = None
-        self.gpu_count: Optional[int] = None
-        self.gpu_type: Optional[str] = None
-        self.gpu_memory_per_device: Optional[int] = None
-        self.env_var_json: Optional[str] = None
+        self.provider: str | None = None
+        self.num_cpu: int | None = None
+        self.cpu_type: str | None = None
+        self.cpu_family_model_stepping: str | None = None
+        self.total_memory: int | None = None
+        self.architecture: str | None = None
+        self.platform: str | None = None
+        self.cuda_runtime: str | None = None
+        self.gpu_count: int | None = None
+        self.gpu_type: str | None = None
+        self.gpu_memory_per_device: int | None = None
+        self.env_var_json: str | None = None
 
         # vLLM Information
-        self.model_architecture: Optional[str] = None
-        self.vllm_version: Optional[str] = None
-        self.context: Optional[str] = None
+        self.model_architecture: str | None = None
+        self.vllm_version: str | None = None
+        self.context: str | None = None
 
         # Metadata
-        self.log_time: Optional[int] = None
-        self.source: Optional[str] = None
-
-    def report_usage(self,
-                     model_architecture: str,
-                     usage_context: UsageContext,
-                     extra_kvs: Optional[dict[str, Any]] = None) -> None:
-        t = Thread(target=self._report_usage_worker,
-                   args=(model_architecture, usage_context, extra_kvs or {}),
-                   daemon=True)
+        self.log_time: int | None = None
+        self.source: str | None = None
+
+    def report_usage(
+        self,
+        model_architecture: str,
+        usage_context: UsageContext,
+        extra_kvs: dict[str, Any] | None = None,
+    ) -> None:
+        t = Thread(
+            target=self._report_usage_worker,
+            args=(model_architecture, usage_context, extra_kvs or {}),
+            daemon=True,
+        )
         t.start()
 
-    def _report_usage_worker(self, model_architecture: str,
-                             usage_context: UsageContext,
-                             extra_kvs: dict[str, Any]) -> None:
+    def _report_usage_worker(
+        self,
+        model_architecture: str,
+        usage_context: UsageContext,
+        extra_kvs: dict[str, Any],
+    ) -> None:
         self._report_usage_once(model_architecture, usage_context, extra_kvs)
         self._report_continuous_usage()
 
-    def _report_usage_once(self, model_architecture: str,
-                           usage_context: UsageContext,
-                           extra_kvs: dict[str, Any]) -> None:
+    def _report_tpu_inference_usage(self) -> bool:
+        try:
+            from tpu_inference import tpu_info, utils
+
+            self.gpu_count = tpu_info.get_num_chips()
+            self.gpu_type = tpu_info.get_tpu_type()
+            self.gpu_memory_per_device = utils.get_device_hbm_limit()
+            self.cuda_runtime = "tpu_inference"
+            return True
+        except Exception:
+            return False
+
+    def _report_torch_xla_usage(self) -> bool:
+        try:
+            import torch_xla
+
+            self.gpu_count = torch_xla.runtime.world_size()
+            self.gpu_type = torch_xla.tpu.get_tpu_type()
+            self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info()[
+                "bytes_limit"
+            ]
+            self.cuda_runtime = "torch_xla"
+            return True
+        except Exception:
+            return False
+
+    def _report_usage_once(
+        self,
+        model_architecture: str,
+        usage_context: UsageContext,
+        extra_kvs: dict[str, Any],
+    ) -> None:
         # Platform information
         from vllm.platforms import current_platform
+
         if current_platform.is_cuda_alike():
             self.gpu_count = cuda_device_count_stateless()
-            self.gpu_type, self.gpu_memory_per_device = (
-                cuda_get_device_properties(0, ("name", "total_memory")))
+            self.gpu_type, self.gpu_memory_per_device = cuda_get_device_properties(
+                0, ("name", "total_memory")
+            )
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
-        if current_platform.is_tpu():
-            try:
-                import torch_xla
-                self.gpu_count = torch_xla.runtime.world_size()
-                self.gpu_type = torch_xla.tpu.get_tpu_type()
-                self.gpu_memory_per_device = (
-                    torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
-            except Exception:
+        if current_platform.is_tpu():  # noqa: SIM102
+            if (not self._report_tpu_inference_usage()) and (
+                not self._report_torch_xla_usage()
+            ):
                 logger.exception("Failed to collect TPU information")
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
@@ -195,11 +229,13 @@ class UsageMessage:
         info = cpuinfo.get_cpu_info()
         self.num_cpu = info.get("count", None)
         self.cpu_type = info.get("brand_raw", "")
-        self.cpu_family_model_stepping = ",".join([
-            str(info.get("family", "")),
-            str(info.get("model", "")),
-            str(info.get("stepping", ""))
-        ])
+        self.cpu_family_model_stepping = ",".join(
+            [
+                str(info.get("family", "")),
+                str(info.get("model", "")),
+                str(info.get("stepping", "")),
+            ]
+        )
 
         # vLLM information
         self.context = usage_context.value
@@ -207,10 +243,9 @@ class UsageMessage:
         self.model_architecture = model_architecture
 
         # Environment variables
-        self.env_var_json = json.dumps({
-            env_var: getattr(envs, env_var)
-            for env_var in _USAGE_ENV_VARS_TO_COLLECT
-        })
+        self.env_var_json = json.dumps(
+            {env_var: getattr(envs, env_var) for env_var in _USAGE_ENV_VARS_TO_COLLECT}
+        )
 
         # Metadata
         self.log_time = _get_current_timestamp_ns()
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 4365c262bb32b76773c1cdd74574da8aa55681a7..3ef44e7703204aaa3552dc1ee0781cfa2b5566c9 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1,93 +1,43 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
-import asyncio
-import concurrent
-import contextlib
-import datetime
-import enum
-import gc
-import getpass
-import hashlib
-import importlib
-import importlib.metadata
-import importlib.util
-import inspect
-import ipaddress
-import json
-import multiprocessing
-import os
-import pickle
-import signal
-import socket
-import subprocess
-import sys
-import tempfile
-import textwrap
-import threading
-import time
-import traceback
-import types
 import uuid
 import warnings
-import weakref
-from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser,
-                      ArgumentTypeError, RawDescriptionHelpFormatter,
-                      _ArgumentGroup)
-from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
-from collections import UserDict, defaultdict
-from collections.abc import (AsyncGenerator, Awaitable, Collection, Generator,
-                             Hashable, Iterable, Iterator, KeysView, Mapping,
-                             Sequence)
-from concurrent.futures import ThreadPoolExecutor
-from concurrent.futures.process import ProcessPoolExecutor
-from dataclasses import dataclass, field
-from functools import cache, lru_cache, partial, wraps
-from pathlib import Path
-from types import MappingProxyType
-from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple,
-                    Optional, TextIO, TypeVar, Union, cast, overload)
-from urllib.parse import urlparse
-from uuid import uuid4
+from typing import Any
 
-import cachetools
-import cbor2
-import cloudpickle
-import numpy as np
-import numpy.typing as npt
-import psutil
-import regex as re
-import setproctitle
 import torch
-import torch.types
-import yaml
-import zmq
-import zmq.asyncio
-from packaging import version
-from packaging.version import Version
-from torch.library import Library
-from transformers.tokenization_utils_base import BatchEncoding
-from typing_extensions import Never, ParamSpec, TypeIs, assert_never
 
-import vllm.envs as envs
-from vllm.logger import enable_trace_function_call, init_logger
-from vllm.ray.lazy_utils import is_in_ray_actor
+from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    from argparse import Namespace
+_DEPRECATED_MAPPINGS = {
+    "cprofile": "profiling",
+    "cprofile_context": "profiling",
+    # Used by lm-eval
+    "get_open_port": "network_utils",
+}
 
-    from vllm.config import ModelConfig, VllmConfig
-    from vllm.sequence import IntermediateTensors
 
-logger = init_logger(__name__)
+def __getattr__(name: str) -> Any:  # noqa: D401 - short deprecation docstring
+    """Module-level getattr to handle deprecated utilities."""
+    if name in _DEPRECATED_MAPPINGS:
+        submodule_name = _DEPRECATED_MAPPINGS[name]
+        warnings.warn(
+            f"vllm.utils.{name} is deprecated and will be removed in a future version. "
+            f"Use vllm.utils.{submodule_name}.{name} instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        module = __import__(f"vllm.utils.{submodule_name}", fromlist=[submodule_name])
+        return getattr(module, name)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
-# This value is chosen to have a balance between ITL and TTFT. Note it is
-# not optimized for throughput.
-DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
-POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
-MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
+
+def __dir__() -> list[str]:
+    # expose deprecated names in dir() for better UX/tab-completion
+    return sorted(list(globals().keys()) + list(_DEPRECATED_MAPPINGS.keys()))
+
+
+logger = init_logger(__name__)
 
 # Constants related to forcing the attention backend selection
 
@@ -99,3469 +49,34 @@ STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
 # Possible string values of STR_BACKEND_ENV_VAR
 # register, corresponding to possible backends
 STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
-STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA"
 STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
 STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
 STR_INVALID_VAL: str = "INVALID"
 
-MB_bytes = 1_000_000
-"""The number of bytes in one megabyte (MB)."""
-
-MiB_bytes = 1 << 20
-"""The number of bytes in one mebibyte (MiB)."""
-
-GB_bytes = 1_000_000_000
-"""The number of bytes in one gigabyte (GB)."""
-
-GiB_bytes = 1 << 30
-"""The number of bytes in one gibibyte (GiB)."""
-
-# ANSI color codes
-CYAN = '\033[1;36m'
-RESET = '\033[0;0m'
-
-STR_DTYPE_TO_TORCH_DTYPE = {
-    "float32": torch.float32,
-    "half": torch.half,
-    "bfloat16": torch.bfloat16,
-    "float": torch.float,
-    "fp8": torch.uint8,
-    "fp8_e4m3": torch.uint8,
-    "fp8_e5m2": torch.uint8,
-    "int8": torch.int8,
-    "fp8_inc": torch.float8_e4m3fn,
-    "fp8_ds_mla": torch.uint8,
-}
-
-TORCH_DTYPE_TO_NUMPY_DTYPE = {
-    torch.float16: np.float16,
-    torch.float32: np.float32,
-    torch.float64: np.float64,
-    torch.uint8: np.uint8,
-    torch.int32: np.int32,
-    torch.int64: np.int64,
-}
-
-
-@contextlib.contextmanager
-def set_default_torch_num_threads(num_threads: int):
-    """Sets the default number of threads for PyTorch to the given value."""
-    old_num_threads = torch.get_num_threads()
-    torch.set_num_threads(num_threads)
-    yield
-    torch.set_num_threads(old_num_threads)
-
-
-P = ParamSpec('P')
-T = TypeVar("T")
-U = TypeVar("U")
-
-_K = TypeVar("_K", bound=Hashable)
-_V = TypeVar("_V")
-_T = TypeVar("_T")
-
-
-class _Sentinel:
-    ...
-
-
-ALL_PINNED_SENTINEL = _Sentinel()
-
-
-class Device(enum.Enum):
-    GPU = enum.auto()
-    CPU = enum.auto()
-
-
-class LayerBlockType(enum.Enum):
-    attention = "attention"
-    mamba = "mamba"
-
-
-class Counter:
-
-    def __init__(self, start: int = 0) -> None:
-        self.counter = start
-
-    def __next__(self) -> int:
-        i = self.counter
-        self.counter += 1
-        return i
-
-    def reset(self) -> None:
-        self.counter = 0
-
-
-class _MappingOrderCacheView(UserDict[_K, _V]):
-
-    def __init__(self, data: Mapping[_K, _V], ordered_keys: Mapping[_K, None]):
-        super().__init__(data)
-        self.ordered_keys = ordered_keys
-
-    def __iter__(self) -> Iterator[_K]:
-        return iter(self.ordered_keys)
-
-    def keys(self) -> KeysView[_K]:
-        return KeysView(self.ordered_keys)
-
-
-class CacheInfo(NamedTuple):
-    hits: int
-    total: int
-
-    @property
-    def hit_ratio(self) -> float:
-        if self.total == 0:
-            return 0
-
-        return self.hits / self.total
-
-    def __sub__(self, other: CacheInfo):
-        return CacheInfo(
-            hits=self.hits - other.hits,
-            total=self.total - other.total,
-        )
-
-
-class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
-
-    def __init__(self,
-                 capacity: float,
-                 getsizeof: Optional[Callable[[_V], float]] = None):
-        super().__init__(capacity, getsizeof)
-
-        self.pinned_items = set[_K]()
-
-        self._hits = 0
-        self._total = 0
-        self._last_info = CacheInfo(hits=0, total=0)
-
-    def __getitem__(self, key: _K, *, update_info: bool = True) -> _V:
-        value = super().__getitem__(key)
-
-        if update_info:
-            self._hits += 1
-            self._total += 1
-
-        return value
-
-    def __delitem__(self, key: _K) -> None:
-        run_on_remove = key in self
-        value = self.__getitem__(key,
-                                 update_info=False)  # type: ignore[call-arg]
-        super().__delitem__(key)
-        if key in self.pinned_items:
-            # Todo: add warning to inform that del pinned item
-            self._unpin(key)
-        if run_on_remove:
-            self._on_remove(key, value)
-
-    @property
-    def cache(self) -> Mapping[_K, _V]:
-        """Return the internal cache dictionary in order (read-only)."""
-        return _MappingOrderCacheView(
-            self._Cache__data,  # type: ignore
-            self.order)
-
-    @property
-    def order(self) -> Mapping[_K, None]:
-        """Return the internal order dictionary (read-only)."""
-        return MappingProxyType(self._LRUCache__order)  # type: ignore
-
-    @property
-    def capacity(self) -> float:
-        return self.maxsize
-
-    @property
-    def usage(self) -> float:
-        if self.maxsize == 0:
-            return 0
-
-        return self.currsize / self.maxsize
-
-    def stat(self, *, delta: bool = False) -> CacheInfo:
-        """
-        Gets the cumulative number of hits and queries against this cache.
-
-        If `delta=True`, instead gets these statistics
-        since the last call that also passed `delta=True`.
-        """
-        info = CacheInfo(hits=self._hits, total=self._total)
-
-        if delta:
-            info_delta = info - self._last_info
-            self._last_info = info
-            info = info_delta
-
-        return info
-
-    def touch(self, key: _K) -> None:
-        try:
-            self._LRUCache__order.move_to_end(key)  # type: ignore
-        except KeyError:
-            self._LRUCache__order[key] = None  # type: ignore
-
-    @overload
-    def get(self, key: _K, /) -> Optional[_V]:
-        ...
-
-    @overload
-    def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]:
-        ...
-
-    def get(self,
-            key: _K,
-            /,
-            default: Optional[Union[_V,
-                                    _T]] = None) -> Optional[Union[_V, _T]]:
-        value: Optional[Union[_V, _T]]
-        if key in self:
-            value = self.__getitem__(
-                key, update_info=False)  # type: ignore[call-arg]
-
-            self._hits += 1
-        else:
-            value = default
-
-        self._total += 1
-        return value
-
-    @overload
-    def pop(self, key: _K) -> _V:
-        ...
-
-    @overload
-    def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]:
-        ...
-
-    def pop(self,
-            key: _K,
-            default: Optional[Union[_V,
-                                    _T]] = None) -> Optional[Union[_V, _T]]:
-        value: Optional[Union[_V, _T]]
-        if key not in self:
-            return default
-
-        value = self.__getitem__(key,
-                                 update_info=False)  # type: ignore[call-arg]
-        self.__delitem__(key)
-        return value
-
-    def put(self, key: _K, value: _V) -> None:
-        self.__setitem__(key, value)
-
-    def pin(self, key: _K) -> None:
-        """
-        Pins a key in the cache preventing it from being
-        evicted in the LRU order.
-        """
-        if key not in self:
-            raise ValueError(f"Cannot pin key: {key} not in cache.")
-        self.pinned_items.add(key)
-
-    def _unpin(self, key: _K) -> None:
-        """
-        Unpins a key in the cache allowing it to be
-        evicted in the LRU order.
-        """
-        self.pinned_items.remove(key)
-
-    def _on_remove(self, key: _K, value: Optional[_V]) -> None:
-        pass
-
-    def remove_oldest(self, *, remove_pinned: bool = False) -> None:
-        if len(self) == 0:
-            return
-
-        self.popitem(remove_pinned=remove_pinned)
-
-    def _remove_old_if_needed(self) -> None:
-        while self.currsize > self.capacity:
-            self.remove_oldest()
-
-    def popitem(self, remove_pinned: bool = False):
-        """Remove and return the `(key, value)` pair least recently used."""
-        if not remove_pinned:
-            # pop the oldest item in the cache that is not pinned
-            lru_key = next(
-                (key for key in self.order if key not in self.pinned_items),
-                ALL_PINNED_SENTINEL)
-            if lru_key is ALL_PINNED_SENTINEL:
-                raise RuntimeError("All items are pinned, "
-                                   "cannot remove oldest from the cache.")
-        else:
-            lru_key = next(iter(self.order))
-        value = self.pop(cast(_K, lru_key))
-        return (lru_key, value)
-
-    def clear(self) -> None:
-        while len(self) > 0:
-            self.remove_oldest(remove_pinned=True)
-
-        self._hits = 0
-        self._total = 0
-        self._last_info = CacheInfo(hits=0, total=0)
-
-
-class PyObjectCache:
-    """Used to cache python objects to avoid object allocations
-    across scheduler iterations.
-    """
-
-    def __init__(self, obj_builder):
-        self._obj_builder = obj_builder
-        self._index = 0
-
-        self._obj_cache = []
-        for _ in range(128):
-            self._obj_cache.append(self._obj_builder())
-
-    def _grow_cache(self):
-        # Double the size of the cache
-        num_objs = len(self._obj_cache)
-        for _ in range(num_objs):
-            self._obj_cache.append(self._obj_builder())
-
-    def get_object(self):
-        """Returns a pre-allocated cached object. If there is not enough
-        objects, then the cache size will double.
-        """
-        if self._index >= len(self._obj_cache):
-            self._grow_cache()
-            assert self._index < len(self._obj_cache)
-
-        obj = self._obj_cache[self._index]
-        self._index += 1
-
-        return obj
-
-    def reset(self):
-        """Makes all cached-objects available for the next scheduler iteration.
-        """
-        self._index = 0
-
-
-@cache
-def get_max_shared_memory_bytes(gpu: int = 0) -> int:
-    """Returns the maximum shared memory per thread block in bytes."""
-    from vllm import _custom_ops as ops
-    max_shared_mem = (
-        ops.get_max_shared_memory_per_block_device_attribute(gpu))
-    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
-    # will fail
-    assert max_shared_mem > 0, "max_shared_mem can not be zero"
-    return int(max_shared_mem)
-
-
-def get_cpu_memory() -> int:
-    """Returns the total CPU memory of the node in bytes."""
-    return psutil.virtual_memory().total
-
 
 def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-class AsyncMicrobatchTokenizer:
-    """Asynchronous tokenizer with micro-batching.
-
-    Pulls pending encode/decode requests from a queue and batches them
-    up to reduce overhead. A single-thread ThreadPoolExecutor is used
-    so the event loop stays responsive.
-    """
-
-    def __init__(
-        self,
-        tokenizer,
-        max_batch_size: int = 32,
-        batch_wait_timeout_s: float = 0.002,
-    ) -> None:
-        self.tokenizer = tokenizer
-        self.max_batch_size = max_batch_size
-        self.batch_wait_timeout_s = batch_wait_timeout_s
-
-        self._loop = asyncio.get_running_loop()
-        self._queues: dict[tuple,
-                           asyncio.Queue[Union[tuple[str, dict,
-                                                     asyncio.Future],
-                                               tuple[list[int],
-                                                     asyncio.Future]]]] = {}
-        self._batcher_tasks: list[asyncio.Task] = []
-
-        # Single-thread executor for blocking tokenizer calls.
-        self._executor = ThreadPoolExecutor(max_workers=1)
-
-    # === Public async API ===
-    async def __call__(self, prompt, **kwargs):
-        result_future: asyncio.Future = self._loop.create_future()
-        key = self._queue_key("encode", kwargs)
-        queue = self._get_queue(self._loop, key)
-        await queue.put((prompt, kwargs, result_future))
-        return await result_future
-
-    async def decode(self, token_ids, **kwargs):
-        result_future: asyncio.Future = self._loop.create_future()
-        key = self._queue_key("decode", kwargs)
-        queue = self._get_queue(self._loop, key)
-        await queue.put((token_ids, result_future))
-        return await result_future
-
-    # === Internal helpers ===
-    def _get_queue(
-        self, loop: asyncio.AbstractEventLoop, key: tuple
-    ) -> asyncio.Queue[Union[tuple[str, dict, asyncio.Future], tuple[
-            list[int], asyncio.Future]]]:
-        """Get the request queue for the given operation key, creating a new
-        queue and batcher task if needed."""
-        queue = self._queues.get(key)
-        if queue is None:
-            self._queues[key] = queue = asyncio.Queue()
-            if key[0] == "encode":
-                can_batch = key[1] != "other"
-                coro = self._batch_encode_loop(queue, can_batch)
-            else:
-                assert key[0] == "decode", \
-                    f"Unknown operation type: {key[0]}."
-                coro = self._batch_decode_loop(queue)
-            self._batcher_tasks.append(loop.create_task(coro))
-        return queue
-
-    async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool):
-        """Batch incoming encode requests for efficiency."""
-        while True:
-            prompt, kwargs, result_future = await queue.get()
-            prompts = [prompt]
-            kwargs_list = [kwargs]
-            result_futures = [result_future]
-            deadline = self._loop.time() + self.batch_wait_timeout_s
-
-            while len(prompts) < self.max_batch_size:
-                timeout = deadline - self._loop.time()
-                if timeout <= 0:
-                    break
-                try:
-                    prompt, kwargs, result_future = await asyncio.wait_for(
-                        queue.get(), timeout)
-                    prompts.append(prompt)
-                    result_futures.append(result_future)
-                    if not can_batch:
-                        kwargs_list.append(kwargs)
-                except asyncio.TimeoutError:
-                    break
-
-            try:
-                # If every request uses identical kwargs we can run a single
-                # batched tokenizer call for a big speed-up.
-                if can_batch and len(prompts) > 1:
-                    batch_encode_fn = partial(self.tokenizer, prompts,
-                                              **kwargs)
-                    results = await self._loop.run_in_executor(
-                        self._executor, batch_encode_fn)
-
-                    for i, fut in enumerate(result_futures):
-                        if not fut.done():
-                            data = {k: v[i] for k, v in results.items()}
-                            fut.set_result(BatchEncoding(data))
-                else:
-                    encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [
-                        self.tokenizer(p, **kw)
-                        for p, kw in zip(prompts, kwargs)
-                    ]
-                    results = await self._loop.run_in_executor(
-                        self._executor, encode_fn)
-
-                    for fut, res in zip(result_futures, results):
-                        if not fut.done():
-                            fut.set_result(res)
-            except Exception as e:
-                for fut in result_futures:
-                    if not fut.done():
-                        fut.set_exception(e)
-
-    async def _batch_decode_loop(self, queue: asyncio.Queue):
-        """Batch incoming decode requests for efficiency."""
-        while True:
-            token_ids, result_future = await queue.get()
-            token_ids_list = [token_ids]
-            result_futures = [result_future]
-            deadline = self._loop.time() + self.batch_wait_timeout_s
-
-            while len(token_ids_list) < self.max_batch_size:
-                timeout = deadline - self._loop.time()
-                if timeout <= 0:
-                    break
-                try:
-                    token_ids, result_future = await asyncio.wait_for(
-                        queue.get(), timeout)
-                    token_ids_list.append(token_ids)
-                    result_futures.append(result_future)
-                except asyncio.TimeoutError:
-                    break
-
-            try:
-                # Perform a single batched decode call for all requests
-                results = await self._loop.run_in_executor(
-                    self._executor, self.tokenizer.batch_decode,
-                    token_ids_list)
-                for fut, res in zip(result_futures, results):
-                    if not fut.done():
-                        fut.set_result(res)
-            except Exception as e:
-                for fut in result_futures:
-                    if not fut.done():
-                        fut.set_exception(e)
-
-    def _queue_key(self, op: str, kwargs: dict) -> tuple:
-        """
-        Return a normalized key describing operation + kwargs.
-
-        - `add_special_tokens`: {True/False}
-        - `truncation`: {True/False}
-          - If `truncation` is False (`max_length` is None),
-            returns a key for a can_batch queue.
-          - If `truncation` is True and `max_length` is None or equals
-            `tokenizer.model_max_length`, returns a key for a can_batch queue.
-          - Otherwise, returns a key for a cannot_batch queue.
-
-        Examples:
-          - Decode: ("decode",)
-          - Encode typical:
-            ("encode", add_special_tokens, bool_truncation, max_length_label)
-          - Fallback: ("encode", "other")
-        """
-
-        if op == "decode":
-            return ("decode", )
-
-        add_special_tokens = kwargs.get("add_special_tokens", True)
-        truncation = kwargs.get("truncation", False)
-        max_length = kwargs.get("max_length")
-
-        if not truncation:
-            return "encode", add_special_tokens, False, None
-
-        model_max = getattr(self.tokenizer, "model_max_length", None)
-        if max_length is None or (model_max is not None
-                                  and max_length == model_max):
-            return "encode", add_special_tokens, True, "model_max"
-
-        return "encode", "other"
-
-    def __del__(self):
-        if ((tasks := getattr(self, "_batcher_tasks", None))
-                and (loop := getattr(self, "_loop", None))
-                and not loop.is_closed()):
-
-            def cancel_tasks():
-                for task in tasks:
-                    task.cancel()
-
-            loop.call_soon_threadsafe(cancel_tasks)
-
-
-def cancel_task_threadsafe(task: Task):
-    if task and not task.done():
-        run_in_loop(task.get_loop(), task.cancel)
-
-
-def close_sockets(sockets: Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]):
-    for sock in sockets:
-        if sock is not None:
-            sock.close(linger=0)
-
-
-def run_in_loop(loop: AbstractEventLoop, function: Callable, *args):
-    if in_loop(loop):
-        function(*args)
-    elif not loop.is_closed():
-        loop.call_soon_threadsafe(function, *args)
-
-
-def in_loop(event_loop: AbstractEventLoop) -> bool:
-    try:
-        return asyncio.get_running_loop() == event_loop
-    except RuntimeError:
-        return False
-
-
-def make_async(
-    func: Callable[P, T],
-    executor: Optional[concurrent.futures.Executor] = None
-) -> Callable[P, Awaitable[T]]:
-    """Take a blocking function, and run it on in an executor thread.
-
-    This function prevents the blocking function from blocking the
-    asyncio event loop.
-    The code in this function needs to be thread safe.
-    """
-
-    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future:
-        loop = asyncio.get_event_loop()
-        p_func = partial(func, *args, **kwargs)
-        return loop.run_in_executor(executor=executor, func=p_func)
-
-    return _async_wrapper
-
-
-def _next_task(iterator: AsyncGenerator[T, None],
-               loop: AbstractEventLoop) -> Task:
-    # Can use anext() in python >= 3.10
-    return loop.create_task(iterator.__anext__())  # type: ignore[arg-type]
-
-
-async def merge_async_iterators(
-    *iterators: AsyncGenerator[T,
-                               None], ) -> AsyncGenerator[tuple[int, T], None]:
-    """Merge multiple asynchronous iterators into a single iterator.
-
-    This method handle the case where some iterators finish before others.
-    When it yields, it yields a tuple (i, item) where i is the index of the
-    iterator that yields the item.
-    """
-    if len(iterators) == 1:
-        # Fast-path single iterator case.
-        async for item in iterators[0]:
-            yield 0, item
-        return
-
-    loop = asyncio.get_running_loop()
-
-    awaits = {_next_task(pair[1], loop): pair for pair in enumerate(iterators)}
-    try:
-        while awaits:
-            done, _ = await asyncio.wait(awaits.keys(),
-                                         return_when=FIRST_COMPLETED)
-            for d in done:
-                pair = awaits.pop(d)
-                try:
-                    item = await d
-                    i, it = pair
-                    awaits[_next_task(it, loop)] = pair
-                    yield i, item
-                except StopAsyncIteration:
-                    pass
-    finally:
-        # Cancel any remaining iterators
-        for f, (_, it) in awaits.items():
-            with contextlib.suppress(BaseException):
-                f.cancel()
-                await it.aclose()
-
-
-async def collect_from_async_generator(
-        iterator: AsyncGenerator[T, None]) -> list[T]:
-    """Collect all items from an async generator into a list."""
-    items = []
-    async for item in iterator:
-        items.append(item)
-    return items
-
-
-def get_ip() -> str:
-    host_ip = envs.VLLM_HOST_IP
-    if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ:
-        logger.warning(
-            "The environment variable HOST_IP is deprecated and ignored, as"
-            " it is often used by Docker and other software to"
-            " interact with the container's network stack. Please "
-            "use VLLM_HOST_IP instead to set the IP address for vLLM processes"
-            " to communicate with each other.")
-    if host_ip:
-        return host_ip
-
-    # IP is not set, try to get it from the network interface
-
-    # try ipv4
-    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    try:
-        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
-        return s.getsockname()[0]
-    except Exception:
-        pass
-
-    # try ipv6
-    try:
-        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
-        # Google's public DNS server, see
-        # https://developers.google.com/speed/public-dns/docs/using#addresses
-        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
-        return s.getsockname()[0]
-    except Exception:
-        pass
-
-    warnings.warn(
-        "Failed to get the IP address, using 0.0.0.0 by default."
-        "The value can be set by the environment variable"
-        " VLLM_HOST_IP or HOST_IP.",
-        stacklevel=2)
-    return "0.0.0.0"
-
-
-def test_loopback_bind(address, family):
-    try:
-        s = socket.socket(family, socket.SOCK_DGRAM)
-        s.bind((address, 0))  # Port 0 = auto assign
-        s.close()
-        return True
-    except OSError:
-        return False
-
-
-def get_loopback_ip() -> str:
-    loopback_ip = envs.VLLM_LOOPBACK_IP
-    if loopback_ip:
-        return loopback_ip
-
-    # VLLM_LOOPBACK_IP is not set, try to get it based on network interface
-
-    if test_loopback_bind("127.0.0.1", socket.AF_INET):
-        return "127.0.0.1"
-    elif test_loopback_bind("::1", socket.AF_INET6):
-        return "::1"
-    else:
-        raise RuntimeError(
-            "Neither 127.0.0.1 nor ::1 are bound to a local interface. "
-            "Set the VLLM_LOOPBACK_IP environment variable explicitly.")
-
-
-def is_valid_ipv6_address(address: str) -> bool:
-    try:
-        ipaddress.IPv6Address(address)
-        return True
-    except ValueError:
-        return False
-
-
-def split_host_port(host_port: str) -> tuple[str, int]:
-    # ipv6
-    if host_port.startswith('['):
-        host, port = host_port.rsplit(']', 1)
-        host = host[1:]
-        port = port.split(':')[1]
-        return host, int(port)
-    else:
-        host, port = host_port.split(':')
-        return host, int(port)
-
-
-def join_host_port(host: str, port: int) -> str:
-    if is_valid_ipv6_address(host):
-        return f"[{host}]:{port}"
-    else:
-        return f"{host}:{port}"
-
-
-def get_distributed_init_method(ip: str, port: int) -> str:
-    return get_tcp_uri(ip, port)
-
-
-def get_tcp_uri(ip: str, port: int) -> str:
-    if is_valid_ipv6_address(ip):
-        return f"tcp://[{ip}]:{port}"
-    else:
-        return f"tcp://{ip}:{port}"
-
-
-def get_open_zmq_ipc_path() -> str:
-    base_rpc_path = envs.VLLM_RPC_BASE_PATH
-    return f"ipc://{base_rpc_path}/{uuid4()}"
-
-
-def get_open_zmq_inproc_path() -> str:
-    return f"inproc://{uuid4()}"
-
-
-def get_open_port() -> int:
-    """
-    Get an open port for the vLLM process to listen on.
-    An edge case to handle, is when we run data parallel,
-    we need to avoid ports that are potentially used by
-    the data parallel master process.
-    Right now we reserve 10 ports for the data parallel master
-    process. Currently it uses 2 ports.
+def length_from_prompt_token_ids_or_embeds(
+    prompt_token_ids: list[int] | None,
+    prompt_embeds: torch.Tensor | None,
+) -> int:
+    """Calculate the request length (in number of tokens) give either
+    prompt_token_ids or prompt_embeds.
     """
-    if "VLLM_DP_MASTER_PORT" in os.environ:
-        dp_master_port = envs.VLLM_DP_MASTER_PORT
-        reserved_port_range = range(dp_master_port, dp_master_port + 10)
-        while True:
-            candidate_port = _get_open_port()
-            if candidate_port not in reserved_port_range:
-                return candidate_port
-    return _get_open_port()
-
-
-def get_open_ports_list(count: int = 5) -> list[int]:
-    """Get a list of open ports."""
-    ports = set[int]()
-    while len(ports) < count:
-        ports.add(get_open_port())
-    return list(ports)
-
-
-def _get_open_port() -> int:
-    port = envs.VLLM_PORT
-    if port is not None:
-        while True:
-            try:
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                    s.bind(("", port))
-                    return port
-            except OSError:
-                port += 1  # Increment port number if already in use
-                logger.info("Port %d is already in use, trying port %d",
-                            port - 1, port)
-    # try ipv4
-    try:
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-            s.bind(("", 0))
-            return s.getsockname()[1]
-    except OSError:
-        # try ipv6
-        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
-            s.bind(("", 0))
-            return s.getsockname()[1]
-
-
-def find_process_using_port(port: int) -> Optional[psutil.Process]:
-    # TODO: We can not check for running processes with network
-    # port on macOS. Therefore, we can not have a full graceful shutdown
-    # of vLLM. For now, let's not look for processes in this case.
-    # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
-    if sys.platform.startswith("darwin"):
-        return None
+    prompt_token_len = None if prompt_token_ids is None else len(prompt_token_ids)
+    prompt_embeds_len = None if prompt_embeds is None else len(prompt_embeds)
 
-    our_pid = os.getpid()
-    for conn in psutil.net_connections():
-        if conn.laddr.port == port and (conn.pid is not None
-                                        and conn.pid != our_pid):
-            try:
-                return psutil.Process(conn.pid)
-            except psutil.NoSuchProcess:
-                return None
-    return None
-
-
-def update_environment_variables(envs: dict[str, str]):
-    for k, v in envs.items():
-        if k in os.environ and os.environ[k] != v:
-            logger.warning(
-                "Overwriting environment variable %s "
-                "from '%s' to '%s'", k, os.environ[k], v)
-        os.environ[k] = v
-
-
-def chunk_list(lst: list[T], chunk_size: int):
-    """Yield successive chunk_size chunks from lst."""
-    for i in range(0, len(lst), chunk_size):
-        yield lst[i:i + chunk_size]
-
-
-def cdiv(a: int, b: int) -> int:
-    """Ceiling division."""
-    return -(a // -b)
-
-
-def next_power_of_2(n) -> int:
-    """The next power of 2 (inclusive)"""
-    if n < 1:
-        return 1
-    return 1 << (n - 1).bit_length()
-
-
-def prev_power_of_2(n: int) -> int:
-    """The previous power of 2 (inclusive)"""
-    if n <= 0:
-        return 0
-    return 1 << (n.bit_length() - 1)
-
-
-def round_up(x: int, y: int) -> int:
-    return ((x + y - 1) // y) * y
-
-
-def round_down(x: int, y: int) -> int:
-    return (x // y) * y
-
-
-def _generate_random_fp8(
-    tensor: torch.Tensor,
-    low: float,
-    high: float,
-) -> None:
-    # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type,
-    # it may occur Inf or NaN if we directly use torch.randint
-    # to generate random data for fp8 data.
-    # For example, s.11111.00 in fp8e5m2 format represents Inf.
-    #     | E4M3        | E5M2
-    # -----|-------------|-------------------
-    # Inf | N/A         | s.11111.00
-    # NaN | s.1111.111  | s.11111.{01,10,11}
-    from vllm import _custom_ops as ops
-    tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
-    tensor_tmp.uniform_(low, high)
-    ops.convert_fp8(tensor, tensor_tmp)
-    del tensor_tmp
-
-
-def get_kv_cache_torch_dtype(
-        cache_dtype: Optional[Union[str, torch.dtype]],
-        model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype:
-    if isinstance(cache_dtype, str):
-        if cache_dtype == "auto":
-            if isinstance(model_dtype,
-                          str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
-                torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
-            elif isinstance(model_dtype, torch.dtype):
-                torch_dtype = model_dtype
-            else:
-                raise ValueError(f"Invalid model dtype: {model_dtype}")
-        elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
-            torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
-        else:
-            raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
-    elif isinstance(cache_dtype, torch.dtype):
-        torch_dtype = cache_dtype
+    if prompt_token_len is None:
+        if prompt_embeds_len is None:
+            raise ValueError("Neither prompt_token_ids nor prompt_embeds were defined.")
+        return prompt_embeds_len
     else:
-        raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
-    return torch_dtype
-
-
-def create_kv_caches_with_random_flash(
-    num_blocks: int,
-    block_size: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    cache_dtype: Optional[Union[str, torch.dtype]],
-    model_dtype: Optional[Union[str, torch.dtype]] = None,
-    seed: Optional[int] = None,
-    device: Optional[str] = "cuda",
-    cache_layout: Optional[str] = "NHD",
-) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
-    from vllm.platforms import current_platform
-    current_platform.seed_everything(seed)
-
-    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
-    generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
-    assert cache_layout in ("NHD", "HND")
-    stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2,
-                                                                  4)
-
-    kv_cache_allocation_shape = tuple(generic_kv_cache_shape[i]
-                                      for i in stride_order)
-    scale = head_size**-0.5
-
-    key_caches: list[torch.Tensor] = []
-    value_caches: list[torch.Tensor] = []
-
-    for _ in range(num_layers):
-        key_value_cache = torch.empty(size=kv_cache_allocation_shape,
-                                      dtype=torch_dtype,
-                                      device=device).permute(*stride_order)
-        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
-            key_value_cache.uniform_(-scale, scale)
-        elif cache_dtype == 'fp8':
-            _generate_random_fp8(key_value_cache, -scale, scale)
-        else:
-            raise ValueError(
-                f"Does not support key cache of type {cache_dtype}")
-        key_caches.append(key_value_cache[:, 0])
-        value_caches.append(key_value_cache[:, 1])
-    return key_caches, value_caches
-
-
-def create_kv_caches_with_random(
-    num_blocks: int,
-    block_size: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    cache_dtype: Optional[Union[str, torch.dtype]],
-    model_dtype: Optional[Union[str, torch.dtype]] = None,
-    seed: Optional[int] = None,
-    device: Optional[str] = "cuda",
-) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
-    if cache_dtype == "fp8" and head_size % 16:
-        raise ValueError(
-            f"Does not support key cache of type fp8 with head_size {head_size}"
-        )
-    from vllm.platforms import current_platform
-    current_platform.seed_everything(seed)
-
-    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
-
-    scale = head_size**-0.5
-    x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
-    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
-    key_caches: list[torch.Tensor] = []
-    for _ in range(num_layers):
-        key_cache = torch.empty(size=key_cache_shape,
-                                dtype=torch_dtype,
-                                device=device)
-        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
-            key_cache.uniform_(-scale, scale)
-        elif cache_dtype == 'fp8':
-            _generate_random_fp8(key_cache, -scale, scale)
-        else:
-            raise ValueError(
-                f"Does not support key cache of type {cache_dtype}")
-        key_caches.append(key_cache)
-
-    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_caches: list[torch.Tensor] = []
-    for _ in range(num_layers):
-        value_cache = torch.empty(size=value_cache_shape,
-                                  dtype=torch_dtype,
-                                  device=device)
-        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
-            value_cache.uniform_(-scale, scale)
-        elif cache_dtype == 'fp8':
-            _generate_random_fp8(value_cache, -scale, scale)
-        else:
-            raise ValueError(
-                f"Does not support value cache of type {cache_dtype}")
-        value_caches.append(value_cache)
-    return key_caches, value_caches
-
-
-@cache
-def is_pin_memory_available() -> bool:
-    from vllm.platforms import current_platform
-    return current_platform.is_pin_memory_available()
-
-
-@cache
-def is_uva_available() -> bool:
-    """Check if Unified Virtual Addressing (UVA) is available."""
-    # UVA requires pinned memory.
-    # TODO: Add more requirements for UVA if needed.
-    return is_pin_memory_available()
-
-
-class DeviceMemoryProfiler:
-
-    def __init__(self, device: Optional[torch.types.Device] = None):
-        self.device = device
-
-    def current_memory_usage(self) -> float:
-        # Return the memory usage in bytes.
-        from vllm.platforms import current_platform
-        gc.collect()
-        return current_platform.get_current_memory_usage(self.device)
-
-    def __enter__(self):
-        self.initial_memory = self.current_memory_usage()
-        # This allows us to call methods of the context manager if needed
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.final_memory = self.current_memory_usage()
-        self.consumed_memory = self.final_memory - self.initial_memory
-
-        # Force garbage collection
-        gc.collect()
-
-
-def make_ndarray_with_pad(
-    x: list[list[T]],
-    pad: T,
-    dtype: npt.DTypeLike,
-    *,
-    max_len: Optional[int] = None,
-) -> npt.NDArray:
-    """
-    Make a padded array from 2D inputs.
-
-    The padding is applied to the end of each inner list until it reaches
-    `max_len`.
-    """
-    if max_len is None:
-        # Unlike for most functions, map is faster than a genexpr over `len`
-        max_len = max(map(len, x), default=0)
-
-    padded_x = np.full((len(x), max_len), pad, dtype=dtype)
-    for ind, blocktb in enumerate(x):
-        assert len(blocktb) <= max_len
-        padded_x[ind, :len(blocktb)] = blocktb
-
-    return padded_x
-
-
-def make_tensor_with_pad(
-    x: list[list[T]],
-    pad: T,
-    dtype: torch.dtype,
-    *,
-    max_len: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    pin_memory: bool = False,
-) -> torch.Tensor:
-    """
-    Make a padded tensor from 2D inputs.
-
-    The padding is applied to the end of each inner list until it reaches
-    `max_len`.
-    """
-    np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
-    padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len)
-
-    tensor = torch.from_numpy(padded_x).to(device)
-    if pin_memory:
-        tensor = tensor.pin_memory()
-
-    return tensor
-
-
-def async_tensor_h2d(
-    data: list,
-    dtype: torch.dtype,
-    target_device: Union[str, torch.device],
-    pin_memory: bool,
-) -> torch.Tensor:
-    """Asynchronously create a tensor and copy it from host to device."""
-    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
-    return t.to(device=target_device, non_blocking=True)
-
-
-def get_dtype_size(dtype: torch.dtype) -> int:
-    """Get the size of the data type in bytes."""
-    return torch.tensor([], dtype=dtype).element_size()
-
-
-# bool = 0, int = 1, float = 2, complex = 3
-def _get_precision_level(dtype: torch.dtype) -> int:
-    # NOTE: Complex dtypes return `is_floating_point=False`
-    return ((dtype != torch.bool) + dtype.is_floating_point +
-            dtype.is_complex * 2)
-
-
-def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype):
-    """
-    Test whether it is lossless to cast a tensor from
-    `src_dtype` to `tgt_dtype`.
-    """
-    if src_dtype == tgt_dtype:
-        return True
-
-    src_level = _get_precision_level(src_dtype)
-    tgt_level = _get_precision_level(tgt_dtype)
-
-    if src_level < tgt_level:
-        return True
-    if src_level > tgt_level:
-        return False
-
-    # Compare integral types
-    if not src_dtype.is_floating_point and not src_dtype.is_complex:
-        src_info = torch.iinfo(src_dtype)
-        tgt_info = torch.iinfo(tgt_dtype)
-        return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
-
-    # Compare floating-point types
-    src_info = torch.finfo(src_dtype)
-    tgt_info = torch.finfo(tgt_dtype)
-    return (src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
-            and src_info.resolution >= tgt_info.resolution)
-
-
-def common_broadcastable_dtype(dtypes: Collection[torch.dtype]):
-    """
-    Get the common `dtype` where all of the other `dtypes` can be
-    cast to it without losing any information.
-    """
-    return max(
-        dtypes,
-        key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes),
-    )
-
-
-def as_list(maybe_list: Iterable[T]) -> list[T]:
-    """Convert iterable to list, unless it's already a list."""
-    return maybe_list if isinstance(maybe_list, list) else list(maybe_list)
-
-
-def as_iter(obj: Union[T, Iterable[T]]) -> Iterable[T]:
-    if isinstance(obj, str) or not isinstance(obj, Iterable):
-        return [obj]  # type: ignore[list-item]
-    return obj
-
-
-# `collections` helpers
-def is_list_of(
-    value: object,
-    typ: Union[type[T], tuple[type[T], ...]],
-    *,
-    check: Literal["first", "all"] = "first",
-) -> TypeIs[list[T]]:
-    if not isinstance(value, list):
-        return False
-
-    if check == "first":
-        return len(value) == 0 or isinstance(value[0], typ)
-    elif check == "all":
-        return all(isinstance(v, typ) for v in value)
-
-    assert_never(check)
-
-
-def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
-    """Flatten a list of lists to a single list."""
-    return [item for sublist in lists for item in sublist]
-
-
-def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
-    """
-    Unlike [`itertools.groupby`][], groups are not broken by
-    non-contiguous data.
-    """
-    groups = defaultdict[_K, list[_V]](list)
-
-    for value in values:
-        groups[key(value)].append(value)
-
-    return groups.items()
-
-
-# TODO: This function can be removed if transformer_modules classes are
-# serialized by value when communicating between processes
-def init_cached_hf_modules() -> None:
-    """
-    Lazy initialization of the Hugging Face modules.
-    """
-    from transformers.dynamic_module_utils import init_hf_modules
-    init_hf_modules()
-
-
-@cache
-def find_library(lib_name: str) -> str:
-    """
-    Find the library file in the system.
-    `lib_name` is full filename, with both prefix and suffix.
-    This function resolves `lib_name` to the full path of the library.
-    """
-    # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
-    # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
-    # `/sbin/ldconfig` should exist in all Linux systems.
-    # `/sbin/ldconfig` searches the library in the system
-    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
-    # each line looks like the following:
-    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
-    locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
-    # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = envs.LD_LIBRARY_PATH
-    if not locs and env_ld_library_path:
-        locs = [
-            os.path.join(dir, lib_name)
-            for dir in env_ld_library_path.split(":")
-            if os.path.exists(os.path.join(dir, lib_name))
-        ]
-    if not locs:
-        raise ValueError(f"Cannot find {lib_name} in the system.")
-    return locs[0]
-
-
-def find_nccl_library() -> str:
-    """
-    We either use the library file specified by the `VLLM_NCCL_SO_PATH`
-    environment variable, or we find the library file brought by PyTorch.
-    After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
-    found by `ctypes` automatically.
-    """
-    so_file = envs.VLLM_NCCL_SO_PATH
-
-    # manually load the nccl library
-    if so_file:
-        logger.info(
-            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s",
-            so_file)
-    else:
-        if torch.version.cuda is not None:
-            so_file = "libnccl.so.2"
-        elif torch.version.hip is not None:
-            so_file = "librccl.so.1"
-        else:
-            raise ValueError("NCCL only supports CUDA and ROCm backends.")
-        logger.info("Found nccl from library %s", so_file)
-    return so_file
-
-
-def find_nccl_include_paths() -> Optional[list[str]]:
-    """
-    We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH`
-    environment variable, or we find the library file brought by 
-    nvidia-nccl-cuXX. load_inline by default uses 
-    torch.utils.cpp_extension.include_paths
-    """
-    paths: list[str] = []
-    inc = envs.VLLM_NCCL_INCLUDE_PATH
-    if inc and os.path.isdir(inc):
-        paths.append(inc)
-
-    try:
-        import importlib.util
-        spec = importlib.util.find_spec("nvidia.nccl")
-        if spec and getattr(spec, "submodule_search_locations", None):
-            for loc in spec.submodule_search_locations:
-                inc_dir = os.path.join(loc, "include")
-                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
-                    paths.append(inc_dir)
-    except Exception:
-        pass
-
-    seen = set()
-    out: list[str] = []
-    for p in paths:
-        if p and p not in seen:
-            out.append(p)
-            seen.add(p)
-    return out or None
-
-
-prev_set_stream = torch.cuda.set_stream
-
-_current_stream_tls = threading.local()
-
-
-def _patched_set_stream(stream: torch.cuda.Stream) -> None:
-    _current_stream_tls.value = stream
-    prev_set_stream(stream)
-
-
-torch.cuda.set_stream = _patched_set_stream
-
-
-class _StreamPlaceholder:
-
-    def __init__(self):
-        self.synchronize = lambda: None
-
-
-def current_stream() -> torch.cuda.Stream:
-    """
-    replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
-    it turns out that `torch.cuda.current_stream()` is quite expensive,
-    as it will construct a new stream object at each call.
-    here we patch `torch.cuda.set_stream` to keep track of the current stream
-    directly, so that we can avoid calling `torch.cuda.current_stream()`.
-
-    the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
-    from C/C++ code.
-    """
-    from vllm.platforms import current_platform
-    if not hasattr(_current_stream_tls,
-                   "value") or _current_stream_tls.value is None:
-        # when this function is called before any stream is set,
-        # we return the default stream.
-        # On ROCm using the default 0 stream in combination with RCCL
-        # is hurting performance. Therefore creating a dedicated stream
-        # per process
-        # if current_platform.is_rocm():
-        #     # torch.cuda.set_stream here is the alias of _pathed_set_stream
-        #     torch.cuda.set_stream(torch.cuda.Stream())
-        if current_platform.is_cpu():
-            _current_stream_tls.value = _StreamPlaceholder()
-        else:
-            current_stream = current_platform.current_stream
-            if current_stream is not None:
-                _current_stream_tls.value = current_stream()
-            else:
-                raise ValueError(
-                    "Fail to set current stream, current platform "
-                    "may not support current_stream with torch API")
-    return _current_stream_tls.value
-
-
-def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
-    """Set up function tracing for the current thread,
-    if enabled via the VLLM_TRACE_FUNCTION environment variable
-    """
-
-    if envs.VLLM_TRACE_FUNCTION:
-        tmp_dir = tempfile.gettempdir()
-        # add username to tmp_dir to avoid permission issues
-        tmp_dir = os.path.join(tmp_dir, getpass.getuser())
-        filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
-                    f"_thread_{threading.get_ident()}_"
-                    f"at_{datetime.datetime.now()}.log").replace(" ", "_")
-        log_path = os.path.join(tmp_dir, "vllm",
-                                f"vllm-instance-{vllm_config.instance_id}",
-                                filename)
-        os.makedirs(os.path.dirname(log_path), exist_ok=True)
-        enable_trace_function_call(log_path)
-
-
-# `functools` helpers
-def identity(value: T, **kwargs) -> T:
-    """Returns the first provided value."""
-    return value
-
-
-F = TypeVar('F', bound=Callable[..., Any])
-
-
-def deprecate_args(
-    start_index: int,
-    is_deprecated: Union[bool, Callable[[], bool]] = True,
-    additional_message: Optional[str] = None,
-) -> Callable[[F], F]:
-    if not callable(is_deprecated):
-        is_deprecated = partial(identity, is_deprecated)
-
-    def wrapper(fn: F) -> F:
-
-        params = inspect.signature(fn).parameters
-        pos_types = (
-            inspect.Parameter.POSITIONAL_ONLY,
-            inspect.Parameter.POSITIONAL_OR_KEYWORD,
-        )
-        pos_kws = [
-            kw for kw, param in params.items() if param.kind in pos_types
-        ]
-
-        @wraps(fn)
-        def inner(*args, **kwargs):
-            if is_deprecated():
-                deprecated_args = pos_kws[start_index:len(args)]
-                if deprecated_args:
-                    msg = (
-                        f"The positional arguments {deprecated_args} are "
-                        "deprecated and will be removed in a future update.")
-                    if additional_message is not None:
-                        msg += f" {additional_message}"
-
-                    warnings.warn(
-                        DeprecationWarning(msg),
-                        stacklevel=3,  # The inner function takes up one level
-                    )
-
-            return fn(*args, **kwargs)
-
-        return inner  # type: ignore
-
-    return wrapper
-
-
-def deprecate_kwargs(
-    *kws: str,
-    is_deprecated: Union[bool, Callable[[], bool]] = True,
-    additional_message: Optional[str] = None,
-) -> Callable[[F], F]:
-    deprecated_kws = set(kws)
-
-    if not callable(is_deprecated):
-        is_deprecated = partial(identity, is_deprecated)
-
-    def wrapper(fn: F) -> F:
-
-        @wraps(fn)
-        def inner(*args, **kwargs):
-            if is_deprecated():
-                deprecated_kwargs = kwargs.keys() & deprecated_kws
-                if deprecated_kwargs:
-                    msg = (
-                        f"The keyword arguments {deprecated_kwargs} are "
-                        "deprecated and will be removed in a future update.")
-                    if additional_message is not None:
-                        msg += f" {additional_message}"
-
-                    warnings.warn(
-                        DeprecationWarning(msg),
-                        stacklevel=3,  # The inner function takes up one level
-                    )
-
-            return fn(*args, **kwargs)
-
-        return inner  # type: ignore
-
-    return wrapper
-
-
-@lru_cache(maxsize=8)
-def _cuda_device_count_stateless(
-        cuda_visible_devices: Optional[str] = None) -> int:
-    # Note: cuda_visible_devices is not used, but we keep it as an argument for
-    # LRU Cache purposes.
-
-    # Code below is based on
-    # https://github.com/pytorch/pytorch/blob/
-    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
-    # torch/cuda/__init__.py#L831C1-L831C17
-    import torch.cuda
-    import torch.version
-
-    from vllm.platforms import current_platform
-    if not torch.cuda._is_compiled():
-        return 0
-    if current_platform.is_rocm():
-        # ROCm uses amdsmi instead of nvml for stateless device count
-        # This requires a sufficiently modern version of Torch 2.4.0
-        raw_count = torch.cuda._device_count_amdsmi() if (hasattr(
-            torch.cuda, "_device_count_amdsmi")) else -1
-    else:
-        raw_count = torch.cuda._device_count_nvml()
-    r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
-    return r
-
-
-def cuda_device_count_stateless() -> int:
-    """Get number of CUDA devices, caching based on the value of
-    CUDA_VISIBLE_DEVICES at the time of call.
-
-    This should be used instead of torch.cuda.device_count()
-    unless CUDA_VISIBLE_DEVICES has already been set to the desired
-    value."""
-
-    # This can be removed and simply replaced with torch.cuda.get_device_count
-    # after https://github.com/pytorch/pytorch/pull/122815 is released.
-    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
-
-
-def cuda_is_initialized() -> bool:
-    """Check if CUDA is initialized."""
-    if not torch.cuda._is_compiled():
-        return False
-    return torch.cuda.is_initialized()
-
-
-def xpu_is_initialized() -> bool:
-    """Check if XPU is initialized."""
-    if not torch.xpu._is_compiled():
-        return False
-    return torch.xpu.is_initialized()
-
-
-def cuda_get_device_properties(device,
-                               names: Sequence[str],
-                               init_cuda=False) -> tuple[Any, ...]:
-    """Get specified CUDA device property values without initializing CUDA in
-    the current process."""
-    if init_cuda or cuda_is_initialized():
-        props = torch.cuda.get_device_properties(device)
-        return tuple(getattr(props, name) for name in names)
-
-    # Run in subprocess to avoid initializing CUDA as a side effect.
-    mp_ctx = multiprocessing.get_context("fork")
-    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
-        return executor.submit(cuda_get_device_properties, device, names,
-                               True).result()
-
-
-def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]:
-    """Make an instance method that weakly references
-    its associated instance and no-ops once that
-    instance is collected."""
-    ref = weakref.ref(bound_method.__self__)  # type: ignore[attr-defined]
-    unbound = bound_method.__func__  # type: ignore[attr-defined]
-
-    def weak_bound(*args, **kwargs) -> None:
-        if inst := ref():
-            unbound(inst, *args, **kwargs)
-
-    return weak_bound
-
-
-def run_once(f: Callable[P, None]) -> Callable[P, None]:
-
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
-        if wrapper.has_run:  # type: ignore[attr-defined]
-            return
-
-        with wrapper.lock:  # type: ignore[attr-defined]
-            if not wrapper.has_run:  # type: ignore[attr-defined]
-                wrapper.has_run = True  # type: ignore[attr-defined]
-                return f(*args, **kwargs)
-
-    wrapper.has_run = False  # type: ignore[attr-defined]
-    wrapper.lock = threading.Lock()  # type: ignore[attr-defined]
-    return wrapper
-
-
-class StoreBoolean(Action):
-
-    def __call__(self, parser, namespace, values, option_string=None):
-        if values.lower() == "true":
-            setattr(namespace, self.dest, True)
-        elif values.lower() == "false":
-            setattr(namespace, self.dest, False)
-        else:
-            raise ValueError(f"Invalid boolean value: {values}. "
-                             "Expected 'true' or 'false'.")
-
-
-class SortedHelpFormatter(ArgumentDefaultsHelpFormatter,
-                          RawDescriptionHelpFormatter):
-    """SortedHelpFormatter that sorts arguments by their option strings."""
-
-    def _split_lines(self, text, width):
-        """
-        1. Sentences split across lines have their single newlines removed.
-        2. Paragraphs and explicit newlines are split into separate lines.
-        3. Each line is wrapped to the specified width (width of terminal).
-        """
-        # The patterns also include whitespace after the newline
-        single_newline = re.compile(r"(?<!\n)\n(?!\n)\s*")
-        multiple_newlines = re.compile(r"\n{2,}\s*")
-        text = single_newline.sub(' ', text)
-        lines = re.split(multiple_newlines, text)
-        return sum([textwrap.wrap(line, width) for line in lines], [])
-
-    def add_arguments(self, actions):
-        actions = sorted(actions, key=lambda x: x.option_strings)
-        super().add_arguments(actions)
-
-
-class FlexibleArgumentParser(ArgumentParser):
-    """ArgumentParser that allows both underscore and dash in names."""
-
-    _deprecated: set[Action] = set()
-    _json_tip: str = (
-        "When passing JSON CLI arguments, the following sets of arguments "
-        "are equivalent:\n"
-        '   --json-arg \'{"key1": "value1", "key2": {"key3": "value2"}}\'\n'
-        "   --json-arg.key1 value1 --json-arg.key2.key3 value2\n\n"
-        "Additionally, list elements can be passed individually using +:\n"
-        '   --json-arg \'{"key4": ["value3", "value4", "value5"]}\'\n'
-        "   --json-arg.key4+ value3 --json-arg.key4+=\'value4,value5\'\n\n")
-    _search_keyword: Optional[str] = None
-
-    def __init__(self, *args, **kwargs):
-        # Set the default "formatter_class" to SortedHelpFormatter
-        if "formatter_class" not in kwargs:
-            kwargs["formatter_class"] = SortedHelpFormatter
-        # Pop kwarg "add_json_tip" to control whether to add the JSON tip
-        self.add_json_tip = kwargs.pop("add_json_tip", True)
-        super().__init__(*args, **kwargs)
-
-    if sys.version_info < (3, 13):
-        # Enable the deprecated kwarg for Python 3.12 and below
-
-        def parse_known_args(self, args=None, namespace=None):
-            if args is not None and "--disable-log-requests" in args:
-                # Special case warning because the warning below won't trigger
-                # if –-disable-log-requests because its value is default.
-                logger.warning_once(
-                    "argument '--disable-log-requests' is deprecated and "
-                    "replaced with '--enable-log-requests'. This will be "
-                    "removed in v0.12.0.")
-            namespace, args = super().parse_known_args(args, namespace)
-            for action in FlexibleArgumentParser._deprecated:
-                if (hasattr(namespace, dest := action.dest)
-                        and getattr(namespace, dest) != action.default):
-                    logger.warning_once("argument '%s' is deprecated", dest)
-            return namespace, args
-
-        def add_argument(self, *args, **kwargs):
-            deprecated = kwargs.pop("deprecated", False)
-            action = super().add_argument(*args, **kwargs)
-            if deprecated:
-                FlexibleArgumentParser._deprecated.add(action)
-            return action
-
-        class _FlexibleArgumentGroup(_ArgumentGroup):
-
-            def add_argument(self, *args, **kwargs):
-                deprecated = kwargs.pop("deprecated", False)
-                action = super().add_argument(*args, **kwargs)
-                if deprecated:
-                    FlexibleArgumentParser._deprecated.add(action)
-                return action
-
-        def add_argument_group(self, *args, **kwargs):
-            group = self._FlexibleArgumentGroup(self, *args, **kwargs)
-            self._action_groups.append(group)
-            return group
-
-    def format_help(self):
-        # Only use custom help formatting for bottom level parsers
-        if self._subparsers is not None:
-            return super().format_help()
-
-        formatter = self._get_formatter()
-
-        # Handle keyword search of the args
-        if (search_keyword := self._search_keyword) is not None:
-            # Normalise the search keyword
-            search_keyword = search_keyword.lower().replace("_", "-")
-            # Return full help if searching for 'all'
-            if search_keyword == 'all':
-                self.epilog = self._json_tip
-                return super().format_help()
-
-            # Return group help if searching for a group title
-            for group in self._action_groups:
-                if group.title and group.title.lower() == search_keyword:
-                    formatter.start_section(group.title)
-                    formatter.add_text(group.description)
-                    formatter.add_arguments(group._group_actions)
-                    formatter.end_section()
-                    formatter.add_text(self._json_tip)
-                    return formatter.format_help()
-
-            # Return matched args if searching for an arg name
-            matched_actions = []
-            for group in self._action_groups:
-                for action in group._group_actions:
-                    # search option name
-                    if any(search_keyword in opt.lower()
-                           for opt in action.option_strings):
-                        matched_actions.append(action)
-            if matched_actions:
-                formatter.start_section(
-                    f"Arguments matching '{search_keyword}'")
-                formatter.add_arguments(matched_actions)
-                formatter.end_section()
-                formatter.add_text(self._json_tip)
-                return formatter.format_help()
-
-            # No match found
-            formatter.add_text(
-                f"No group or arguments matching '{search_keyword}'.\n"
-                "Use '--help' to see available groups or "
-                "'--help=all' to see all available parameters.")
-            return formatter.format_help()
-
-        # usage
-        formatter.add_usage(self.usage, self._actions,
-                            self._mutually_exclusive_groups)
-
-        # description
-        formatter.add_text(self.description)
-
-        # positionals, optionals and user-defined groups
-        formatter.start_section("Config Groups")
-        config_groups = ""
-        for group in self._action_groups:
-            if not group._group_actions:
-                continue
-            title = group.title
-            description = group.description or ""
-            config_groups += f"{title: <24}{description}\n"
-        formatter.add_text(config_groups)
-        formatter.end_section()
-
-        # epilog
-        formatter.add_text(self.epilog)
-
-        # determine help from format above
-        return formatter.format_help()
-
-    def parse_args(  # type: ignore[override]
-        self,
-        args: list[str] | None = None,
-        namespace: Namespace | None = None,
-    ):
-        if args is None:
-            args = sys.argv[1:]
-
-        # Check for --model in command line arguments first
-        if args and args[0] == "serve":
-            model_in_cli_args = any(arg == '--model' for arg in args)
-
-            if model_in_cli_args:
-                raise ValueError(
-                    "With `vllm serve`, you should provide the model as a "
-                    "positional argument or in a config file instead of via "
-                    "the `--model` option.")
-
-        if '--config' in args:
-            args = self._pull_args_from_config(args)
-
-        def repl(match: re.Match) -> str:
-            """Replaces underscores with dashes in the matched string."""
-            return match.group(0).replace("_", "-")
-
-        # Everything between the first -- and the first .
-        pattern = re.compile(r"(?<=--)[^\.]*")
-
-        # Convert underscores to dashes and vice versa in argument names
-        processed_args = list[str]()
-        for i, arg in enumerate(args):
-            if arg.startswith("--help="):
-                FlexibleArgumentParser._search_keyword = arg.split(
-                    '=', 1)[-1].lower()
-                processed_args.append("--help")
-            elif arg.startswith('--'):
-                if '=' in arg:
-                    key, value = arg.split('=', 1)
-                    key = pattern.sub(repl, key, count=1)
-                    processed_args.append(f'{key}={value}')
-                else:
-                    key = pattern.sub(repl, arg, count=1)
-                    processed_args.append(key)
-            elif arg.startswith('-O') and arg != '-O' and arg[2] != '.':
-                # allow -O flag to be used without space, e.g. -O3 or -Odecode
-                # -O.<...> handled later
-                # also handle -O=<level> here
-                level = arg[3:] if arg[2] == '=' else arg[2:]
-                processed_args.append(f'-O.level={level}')
-            elif arg == '-O' and i + 1 < len(args) and args[i + 1] in {
-                    "0", "1", "2", "3"
-            }:
-                # Convert -O <n> to -O.level <n>
-                processed_args.append('-O.level')
-            else:
-                processed_args.append(arg)
-
-        def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]:
-            """Creates a nested dictionary from a list of keys and a value.
-
-            For example, `keys = ["a", "b", "c"]` and `value = 1` will create:
-            `{"a": {"b": {"c": 1}}}`
-            """
-            nested_dict: Any = value
-            for key in reversed(keys):
-                nested_dict = {key: nested_dict}
-            return nested_dict
-
-        def recursive_dict_update(
-            original: dict[str, Any],
-            update: dict[str, Any],
-        ) -> set[str]:
-            """Recursively updates a dictionary with another dictionary.
-            Returns a set of duplicate keys that were overwritten.
-            """
-            duplicates = set[str]()
-            for k, v in update.items():
-                if isinstance(v, dict) and isinstance(original.get(k), dict):
-                    nested_duplicates = recursive_dict_update(original[k], v)
-                    duplicates |= {f"{k}.{d}" for d in nested_duplicates}
-                elif isinstance(v, list) and isinstance(original.get(k), list):
-                    original[k] += v
-                else:
-                    if k in original:
-                        duplicates.add(k)
-                    original[k] = v
-            return duplicates
-
-        delete = set[int]()
-        dict_args = defaultdict[str, dict[str, Any]](dict)
-        duplicates = set[str]()
-        for i, processed_arg in enumerate(processed_args):
-            if i in delete:  # skip if value from previous arg
-                continue
-
-            if processed_arg.startswith("-") and "." in processed_arg:
-                if "=" in processed_arg:
-                    processed_arg, value_str = processed_arg.split("=", 1)
-                    if "." not in processed_arg:
-                        # False positive, '.' was only in the value
-                        continue
-                else:
-                    value_str = processed_args[i + 1]
-                    delete.add(i + 1)
-
-                if processed_arg.endswith("+"):
-                    processed_arg = processed_arg[:-1]
-                    value_str = json.dumps(list(value_str.split(",")))
-
-                key, *keys = processed_arg.split(".")
-                try:
-                    value = json.loads(value_str)
-                except json.decoder.JSONDecodeError:
-                    value = value_str
-
-                # Merge all values with the same key into a single dict
-                arg_dict = create_nested_dict(keys, value)
-                arg_duplicates = recursive_dict_update(dict_args[key],
-                                                       arg_dict)
-                duplicates |= {f'{key}.{d}' for d in arg_duplicates}
-                delete.add(i)
-        # Filter out the dict args we set to None
-        processed_args = [
-            a for i, a in enumerate(processed_args) if i not in delete
-        ]
-        if duplicates:
-            logger.warning("Found duplicate keys %s", ", ".join(duplicates))
-
-        # Add the dict args back as if they were originally passed as JSON
-        for dict_arg, dict_value in dict_args.items():
-            processed_args.append(dict_arg)
-            processed_args.append(json.dumps(dict_value))
-
-        return super().parse_args(processed_args, namespace)
-
-    def check_port(self, value):
-        try:
-            value = int(value)
-        except ValueError:
-            msg = "Port must be an integer"
-            raise ArgumentTypeError(msg) from None
-
-        if not (1024 <= value <= 65535):
-            raise ArgumentTypeError("Port must be between 1024 and 65535")
-
-        return value
-
-    def _pull_args_from_config(self, args: list[str]) -> list[str]:
-        """Method to pull arguments specified in the config file
-        into the command-line args variable.
-
-        The arguments in config file will be inserted between
-        the argument list.
-
-        example:
-        ```yaml
-            port: 12323
-            tensor-parallel-size: 4
-        ```
-        ```python
-        $: vllm {serve,chat,complete} "facebook/opt-12B" \
-            --config config.yaml -tp 2
-        $: args = [
-            "serve,chat,complete",
-            "facebook/opt-12B",
-            '--config', 'config.yaml',
-            '-tp', '2'
-        ]
-        $: args = [
-            "serve,chat,complete",
-            "facebook/opt-12B",
-            '--port', '12323',
-            '--tensor-parallel-size', '4',
-            '-tp', '2'
-            ]
-        ```
-
-        Please note how the config args are inserted after the sub command.
-        this way the order of priorities is maintained when these are args
-        parsed by super().
-        """
-        assert args.count(
-            '--config') <= 1, "More than one config file specified!"
-
-        index = args.index('--config')
-        if index == len(args) - 1:
-            raise ValueError("No config file specified! \
-                             Please check your command-line arguments.")
-
-        file_path = args[index + 1]
-
-        config_args = self.load_config_file(file_path)
-
-        # 0th index might be the sub command {serve,chat,complete,...}
-        # optionally followed by model_tag (only for serve)
-        # followed by config args
-        # followed by rest of cli args.
-        # maintaining this order will enforce the precedence
-        # of cli > config > defaults
-        if args[0].startswith('-'):
-            # No sub command (e.g., api_server entry point)
-            args = config_args + args[0:index] + args[index + 2:]
-        elif args[0] == "serve":
-            model_in_cli = len(args) > 1 and not args[1].startswith('-')
-            model_in_config = any(arg == '--model' for arg in config_args)
-
-            if not model_in_cli and not model_in_config:
-                raise ValueError(
-                    "No model specified! Please specify model either "
-                    "as a positional argument or in a config file.")
-
-            if model_in_cli:
-                # Model specified as positional arg, keep CLI version
-                args = [args[0]] + [
-                    args[1]
-                ] + config_args + args[2:index] + args[index + 2:]
-            else:
-                # No model in CLI, use config if available
-                args = [args[0]
-                        ] + config_args + args[1:index] + args[index + 2:]
-        else:
-            args = [args[0]] + config_args + args[1:index] + args[index + 2:]
-
-        return args
-
-    def load_config_file(self, file_path: str) -> list[str]:
-        """Loads a yaml file and returns the key value pairs as a
-        flattened list with argparse like pattern
-        ```yaml
-            port: 12323
-            tensor-parallel-size: 4
-        ```
-        returns:
-            processed_args: list[str] = [
-                '--port': '12323',
-                '--tensor-parallel-size': '4'
-            ]
-        """
-        extension: str = file_path.split('.')[-1]
-        if extension not in ('yaml', 'yml'):
-            raise ValueError(
-                "Config file must be of a yaml/yml type.\
-                              %s supplied", extension)
-
-        # only expecting a flat dictionary of atomic types
-        processed_args: list[str] = []
-
-        config: dict[str, Union[int, str]] = {}
-        try:
-            with open(file_path) as config_file:
-                config = yaml.safe_load(config_file)
-        except Exception as ex:
-            logger.error(
-                "Unable to read the config file at %s. \
-                Make sure path is correct", file_path)
-            raise ex
-
-        store_boolean_arguments = [
-            action.dest for action in self._actions
-            if isinstance(action, StoreBoolean)
-        ]
-
-        for key, value in config.items():
-            if isinstance(value, bool) and key not in store_boolean_arguments:
-                if value:
-                    processed_args.append('--' + key)
-            elif isinstance(value, list):
-                if value:
-                    processed_args.append('--' + key)
-                    for item in value:
-                        processed_args.append(str(item))
-            else:
-                processed_args.append('--' + key)
-                processed_args.append(str(value))
-
-        return processed_args
-
-
-async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
-                              **kwargs):
-    """Utility function to run async task in a lock"""
-    async with lock:
-        return await task(*args, **kwargs)
-
-
-@lru_cache
-def supports_kw(
-    callable: Callable[..., object],
-    kw_name: str,
-    *,
-    requires_kw_only: bool = False,
-    allow_var_kwargs: bool = True,
-) -> bool:
-    """Check if a keyword is a valid kwarg for a callable; if requires_kw_only
-    disallows kwargs names that can also be positional arguments.
-    """
-    params = inspect.signature(callable).parameters
-    if not params:
-        return False
-
-    param_val = params.get(kw_name)
-
-    # Types where the it may be valid, i.e., explicitly defined & nonvariadic
-    passable_kw_types = set((inspect.Parameter.POSITIONAL_ONLY,
-                             inspect.Parameter.POSITIONAL_OR_KEYWORD,
-                             inspect.Parameter.KEYWORD_ONLY))
-
-    if param_val:
-        is_sig_param = param_val.kind in passable_kw_types
-        # We want kwargs only, but this is passable as a positional arg
-        if (requires_kw_only and is_sig_param
-                and param_val.kind != inspect.Parameter.KEYWORD_ONLY):
-            return False
-        if ((requires_kw_only
-             and param_val.kind == inspect.Parameter.KEYWORD_ONLY)
-                or (not requires_kw_only and is_sig_param)):
-            return True
-
-    # If we're okay with var-kwargs, it's supported as long as
-    # the kw_name isn't something like *args, **kwargs
-    if allow_var_kwargs:
-        # Get the last param; type is ignored here because params is a proxy
-        # mapping, but it wraps an ordered dict, and they appear in order.
-        # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters
-        last_param = params[next(reversed(params))]  # type: ignore
-        return (last_param.kind == inspect.Parameter.VAR_KEYWORD
-                and last_param.name != kw_name)
-
-    return False
-
-
-def get_allowed_kwarg_only_overrides(
-    callable: Callable[..., object],
-    overrides: Optional[Mapping[str, object]],
-    *,
-    requires_kw_only: bool = True,
-    allow_var_kwargs: bool = False,
-) -> dict[str, Any]:
-    """
-    Given a callable which has one or more keyword only params and a dict
-    mapping param names to values, drop values that can be not be kwarg
-    expanded to overwrite one or more keyword-only args. This is used in a
-    few places to handle custom processor overrides for multimodal models,
-    e.g., for profiling when processor options provided by the user
-    may affect the number of mm tokens per instance.
-
-    Args:
-        callable: Callable which takes 0 or more keyword only arguments.
-                  If None is provided, all overrides names are allowed.
-        overrides: Potential overrides to be used when invoking the callable.
-        allow_var_kwargs: Allows overrides that are expandable for var kwargs.
-
-    Returns:
-        Dictionary containing the kwargs to be leveraged which may be used
-        to overwrite one or more keyword only arguments when invoking the
-        callable.
-    """
-    if not overrides:
-        return {}
-
-    # Drop any mm_processor_kwargs provided by the user that
-    # are not kwargs, unless it can fit it var_kwargs param
-    filtered_overrides = {
-        kwarg_name: val
-        for kwarg_name, val in overrides.items()
-        if supports_kw(callable,
-                       kwarg_name,
-                       requires_kw_only=requires_kw_only,
-                       allow_var_kwargs=allow_var_kwargs)
-    }
-
-    # If anything is dropped, log a warning
-    dropped_keys = overrides.keys() - filtered_overrides.keys()
-    if dropped_keys:
-        if requires_kw_only:
-            logger.warning(
-                "The following intended overrides are not keyword-only args "
-                "and will be dropped: %s", dropped_keys)
-        else:
-            logger.warning(
-                "The following intended overrides are not keyword args "
-                "and will be dropped: %s", dropped_keys)
-
-    return filtered_overrides
-
-
-# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
-# In particular, the FakeScalarType is not supported for earlier versions of
-# PyTorch which breaks dynamo for any ops registered using ScalarType.
-def supports_dynamo() -> bool:
-    base_torch_version = Version(Version(torch.__version__).base_version)
-    return base_torch_version >= Version("2.4.0")
-
-
-# Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform
-def supports_xccl() -> bool:
-    return is_torch_equal_or_newer(
-        "2.8.0.dev") and torch.distributed.is_xccl_available()
-
-
-# Some backends use pytorch version < 2.4.0 which doesn't
-# support `torch.library.custom_op`.
-def supports_custom_op() -> bool:
-    return hasattr(torch.library, "custom_op")
-
-
-class AtomicCounter:
-    """An atomic, thread-safe counter"""
-
-    def __init__(self, initial=0):
-        """Initialize a new atomic counter to given initial value"""
-        self._value = initial
-        self._lock = threading.Lock()
-
-    def inc(self, num=1):
-        """Atomically increment the counter by num and return the new value"""
-        with self._lock:
-            self._value += num
-            return self._value
-
-    def dec(self, num=1):
-        """Atomically decrement the counter by num and return the new value"""
-        with self._lock:
-            self._value -= num
-            return self._value
-
-    @property
-    def value(self):
-        return self._value
-
-
-# Adapted from: https://stackoverflow.com/a/47212782/5082708
-class LazyDict(Mapping[str, T], Generic[T]):
-
-    def __init__(self, factory: dict[str, Callable[[], T]]):
-        self._factory = factory
-        self._dict: dict[str, T] = {}
-
-    def __getitem__(self, key: str) -> T:
-        if key not in self._dict:
-            if key not in self._factory:
-                raise KeyError(key)
-            self._dict[key] = self._factory[key]()
-        return self._dict[key]
-
-    def __setitem__(self, key: str, value: Callable[[], T]):
-        self._factory[key] = value
-
-    def __iter__(self):
-        return iter(self._factory)
-
-    def __len__(self):
-        return len(self._factory)
-
-
-class ClassRegistry(UserDict[type[T], _V]):
-
-    def __getitem__(self, key: type[T]) -> _V:
-        for cls in key.mro():
-            if cls in self.data:
-                return self.data[cls]
-
-        raise KeyError(key)
-
-    def __contains__(self, key: object) -> bool:
-        return self.contains(key)
-
-    def contains(self, key: object, *, strict: bool = False) -> bool:
-        if not isinstance(key, type):
-            return False
-
-        if strict:
-            return key in self.data
-
-        return any(cls in self.data for cls in key.mro())
-
-
-def weak_ref_tensor(tensor: Any) -> Any:
-    """
-    Create a weak reference to a tensor.
-    The new tensor will share the same data as the original tensor,
-    but will not keep the original tensor alive.
-    """
-    if isinstance(tensor, torch.Tensor):
-        return torch.ops._C.weak_ref_tensor(tensor)
-    else:
-        return tensor
-
-
-def weak_ref_tensors(
-    tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor],
-                   IntermediateTensors]
-) -> Union[torch.Tensor, list[Any], tuple[Any], Any]:
-    """
-    Convenience function to create weak references to tensors,
-    for single tensor, list of tensors or tuple of tensors.
-    """
-    if isinstance(tensors, torch.Tensor):
-        return weak_ref_tensor(tensors)
-    if isinstance(tensors, list):
-        return [weak_ref_tensor(t) for t in tensors]
-    if isinstance(tensors, tuple):
-        return tuple(weak_ref_tensor(t) for t in tensors)
-
-    # For IntermediateTensors used in pipeline parallelism
-    from vllm.sequence import IntermediateTensors
-    if isinstance(tensors, IntermediateTensors):
-        ret = IntermediateTensors({
-            key: weak_ref_tensor(val)
-            for key, val in tensors.tensors.items()
-        })
-        return ret
-    raise ValueError("Invalid type for tensors")
-
-
-def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
-    """
-    Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
-    """
-    assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
-    return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
-
-
-def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
-    """
-    Import a Python file according to its file path.
-
-    Based on the official recipe:
-    https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
-    """
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None:
-        raise ModuleNotFoundError(f"No module named '{module_name}'")
-
-    assert spec.loader is not None
-
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)
-    return module
-
-
-@cache
-def get_vllm_optional_dependencies():
-    metadata = importlib.metadata.metadata("vllm")
-    requirements = metadata.get_all("Requires-Dist", [])
-    extras = metadata.get_all("Provides-Extra", [])
-
-    return {
-        extra: [
-            re.split(r";|>=|<=|==", req)[0] for req in requirements
-            if req.endswith(f'extra == "{extra}"')
-        ]
-        for extra in extras
-    }
-
-
-class _PlaceholderBase:
-    """
-    Disallows downstream usage of placeholder modules.
-
-    We need to explicitly override each dunder method because
-    [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__]
-    is not called when they are accessed.
-
-    Info:
-        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
-    """
-
-    def __getattr__(self, key: str) -> Never:
-        """
-        The main class should implement this to throw an error
-        for attribute accesses representing downstream usage.
-        """
-        raise NotImplementedError
-
-    # [Basic customization]
-
-    def __lt__(self, other: object):
-        return self.__getattr__("__lt__")
-
-    def __le__(self, other: object):
-        return self.__getattr__("__le__")
-
-    def __eq__(self, other: object):
-        return self.__getattr__("__eq__")
-
-    def __ne__(self, other: object):
-        return self.__getattr__("__ne__")
-
-    def __gt__(self, other: object):
-        return self.__getattr__("__gt__")
-
-    def __ge__(self, other: object):
-        return self.__getattr__("__ge__")
-
-    def __hash__(self):
-        return self.__getattr__("__hash__")
-
-    def __bool__(self):
-        return self.__getattr__("__bool__")
-
-    # [Callable objects]
-
-    def __call__(self, *args: object, **kwargs: object):
-        return self.__getattr__("__call__")
-
-    # [Container types]
-
-    def __len__(self):
-        return self.__getattr__("__len__")
-
-    def __getitem__(self, key: object):
-        return self.__getattr__("__getitem__")
-
-    def __setitem__(self, key: object, value: object):
-        return self.__getattr__("__setitem__")
-
-    def __delitem__(self, key: object):
-        return self.__getattr__("__delitem__")
-
-    # __missing__ is optional according to __getitem__ specification,
-    # so it is skipped
-
-    # __iter__ and __reversed__ have a default implementation
-    # based on __len__ and __getitem__, so they are skipped.
-
-    # [Numeric Types]
-
-    def __add__(self, other: object):
-        return self.__getattr__("__add__")
-
-    def __sub__(self, other: object):
-        return self.__getattr__("__sub__")
-
-    def __mul__(self, other: object):
-        return self.__getattr__("__mul__")
-
-    def __matmul__(self, other: object):
-        return self.__getattr__("__matmul__")
-
-    def __truediv__(self, other: object):
-        return self.__getattr__("__truediv__")
-
-    def __floordiv__(self, other: object):
-        return self.__getattr__("__floordiv__")
-
-    def __mod__(self, other: object):
-        return self.__getattr__("__mod__")
-
-    def __divmod__(self, other: object):
-        return self.__getattr__("__divmod__")
-
-    def __pow__(self, other: object, modulo: object = ...):
-        return self.__getattr__("__pow__")
-
-    def __lshift__(self, other: object):
-        return self.__getattr__("__lshift__")
-
-    def __rshift__(self, other: object):
-        return self.__getattr__("__rshift__")
-
-    def __and__(self, other: object):
-        return self.__getattr__("__and__")
-
-    def __xor__(self, other: object):
-        return self.__getattr__("__xor__")
-
-    def __or__(self, other: object):
-        return self.__getattr__("__or__")
-
-    # r* and i* methods have lower priority than
-    # the methods for left operand so they are skipped
-
-    def __neg__(self):
-        return self.__getattr__("__neg__")
-
-    def __pos__(self):
-        return self.__getattr__("__pos__")
-
-    def __abs__(self):
-        return self.__getattr__("__abs__")
-
-    def __invert__(self):
-        return self.__getattr__("__invert__")
-
-    # __complex__, __int__ and __float__ have a default implementation
-    # based on __index__, so they are skipped.
-
-    def __index__(self):
-        return self.__getattr__("__index__")
-
-    def __round__(self, ndigits: object = ...):
-        return self.__getattr__("__round__")
-
-    def __trunc__(self):
-        return self.__getattr__("__trunc__")
-
-    def __floor__(self):
-        return self.__getattr__("__floor__")
-
-    def __ceil__(self):
-        return self.__getattr__("__ceil__")
-
-    # [Context managers]
-
-    def __enter__(self):
-        return self.__getattr__("__enter__")
-
-    def __exit__(self, *args: object, **kwargs: object):
-        return self.__getattr__("__exit__")
-
-
-class PlaceholderModule(_PlaceholderBase):
-    """
-    A placeholder object to use when a module does not exist.
-
-    This enables more informative errors when trying to access attributes
-    of a module that does not exist.
-    """
-
-    def __init__(self, name: str) -> None:
-        super().__init__()
-
-        # Apply name mangling to avoid conflicting with module attributes
-        self.__name = name
-
-    def placeholder_attr(self, attr_path: str):
-        return _PlaceholderModuleAttr(self, attr_path)
-
-    def __getattr__(self, key: str):
-        name = self.__name
-
-        try:
-            importlib.import_module(name)
-        except ImportError as exc:
-            for extra, names in get_vllm_optional_dependencies().items():
-                if name in names:
-                    msg = f"Please install vllm[{extra}] for {extra} support"
-                    raise ImportError(msg) from exc
-
-            raise exc
-
-        raise AssertionError("PlaceholderModule should not be used "
-                             "when the original module can be imported")
-
-
-class _PlaceholderModuleAttr(_PlaceholderBase):
-
-    def __init__(self, module: PlaceholderModule, attr_path: str) -> None:
-        super().__init__()
-
-        # Apply name mangling to avoid conflicting with module attributes
-        self.__module = module
-        self.__attr_path = attr_path
-
-    def placeholder_attr(self, attr_path: str):
-        return _PlaceholderModuleAttr(self.__module,
-                                      f"{self.__attr_path}.{attr_path}")
-
-    def __getattr__(self, key: str):
-        getattr(self.__module, f"{self.__attr_path}.{key}")
-
-        raise AssertionError("PlaceholderModule should not be used "
-                             "when the original module can be imported")
-
-
-# create a library to hold the custom op
-vllm_lib = Library("vllm", "FRAGMENT")  # noqa
-
-
-def direct_register_custom_op(
-        op_name: str,
-        op_func: Callable,
-        mutates_args: Optional[list[str]] = None,
-        fake_impl: Optional[Callable] = None,
-        target_lib: Optional[Library] = None,
-        dispatch_key: Optional[str] = None,
-        tags: tuple[torch.Tag, ...] = (),
-):
-    """
-    `torch.library.custom_op` can have significant overhead because it
-    needs to consider complicated dispatching logic. This function
-    directly registers a custom op and dispatches it to the CUDA backend.
-    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
-    for more details.
-
-    By default, the custom op is registered to the vLLM library. If you
-    want to register it to a different library, you can pass the library
-    object to the `target_lib` argument.
-
-    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
-    library object. If you want to bind the operator to a different library,
-    make sure the library object is alive when the operator is used.
-    """
-    if not supports_custom_op():
-        from vllm.platforms import current_platform
-        assert not current_platform.is_cuda_alike(), (
-            "cuda platform needs torch>=2.4 to support custom op, "
-            "chances are you are using an old version of pytorch "
-            "or a custom build of pytorch. It is recommended to "
-            "use vLLM in a fresh new environment and let it install "
-            "the required dependencies.")
-        return
-
-    if mutates_args is None:
-        mutates_args = []
-
-    if dispatch_key is None:
-        from vllm.platforms import current_platform
-        dispatch_key = current_platform.dispatch_key
-
-    import torch.library
-    if hasattr(torch.library, "infer_schema"):
-        schema_str = torch.library.infer_schema(op_func,
-                                                mutates_args=mutates_args)
-    else:
-        # for pytorch 2.4
-        import torch._custom_op.impl
-        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
-    my_lib = target_lib or vllm_lib
-    my_lib.define(op_name + schema_str, tags=tags)
-    my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
-    if fake_impl is not None:
-        my_lib._register_fake(op_name, fake_impl)
-
-
-def resolve_obj_by_qualname(qualname: str) -> Any:
-    """
-    Resolve an object by its fully-qualified class name.
-    """
-    module_name, obj_name = qualname.rsplit(".", 1)
-    module = importlib.import_module(module_name)
-    return getattr(module, obj_name)
-
-
-def kill_process_tree(pid: int):
-    """
-    Kills all descendant processes of the given pid by sending SIGKILL.
-
-    Args:
-        pid (int): Process ID of the parent process
-    """
-    try:
-        parent = psutil.Process(pid)
-    except psutil.NoSuchProcess:
-        return
-
-    # Get all children recursively
-    children = parent.children(recursive=True)
-
-    # Send SIGKILL to all children first
-    for child in children:
-        with contextlib.suppress(ProcessLookupError):
-            os.kill(child.pid, signal.SIGKILL)
-
-    # Finally kill the parent
-    with contextlib.suppress(ProcessLookupError):
-        os.kill(pid, signal.SIGKILL)
-
-
-@dataclass
-class MemorySnapshot:
-    """Memory snapshot."""
-    torch_peak: int = 0
-    free_memory: int = 0
-    total_memory: int = 0
-    cuda_memory: int = 0
-    torch_memory: int = 0
-    non_torch_memory: int = 0
-    timestamp: float = 0.0
-    auto_measure: bool = True
-
-    def __post_init__(self):
-        if self.auto_measure:
-            self.measure()
-
-    def measure(self):
-        # we measure the torch peak memory usage via allocated_bytes,
-        # rather than `torch.cuda.memory_reserved()` .
-        # After `torch.cuda.reset_peak_memory_stats()`,
-        # `torch.cuda.memory_reserved()` will keep growing, and only shrink
-        # when we call `torch.cuda.empty_cache()` or OOM happens.
-        self.torch_peak = torch.cuda.memory_stats().get(
-            "allocated_bytes.all.peak", 0)
-
-        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
-        self.cuda_memory = self.total_memory - self.free_memory
-
-        # torch.cuda.memory_reserved() is how many bytes
-        # PyTorch gets from cuda (by calling cudaMalloc, etc.)
-        # this is used to measure the non-torch memory usage
-        self.torch_memory = torch.cuda.memory_reserved()
-
-        self.non_torch_memory = self.cuda_memory - self.torch_memory
-        self.timestamp = time.time()
-
-    def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
-        return MemorySnapshot(
-            torch_peak=self.torch_peak - other.torch_peak,
-            free_memory=self.free_memory - other.free_memory,
-            total_memory=self.total_memory - other.total_memory,
-            cuda_memory=self.cuda_memory - other.cuda_memory,
-            torch_memory=self.torch_memory - other.torch_memory,
-            non_torch_memory=self.non_torch_memory - other.non_torch_memory,
-            timestamp=self.timestamp - other.timestamp,
-            auto_measure=False,
-        )
-
-
-@dataclass
-class MemoryProfilingResult:
-    """Memory profiling result. All numbers are in bytes.
-    """
-    non_kv_cache_memory: int = 0
-    torch_peak_increase: int = 0
-    non_torch_increase: int = 0
-    weights_memory: float = 0
-    before_create: MemorySnapshot = field(default_factory=MemorySnapshot)
-    before_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
-    after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
-    profile_time: float = 0.0
-
-    def __repr__(self) -> str:
-        return (f"Memory profiling takes {self.profile_time:.2f} seconds. "
-                f"Total non KV cache memory: "
-                f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
-                f"torch peak memory increase: "
-                f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
-                f"non-torch forward increase memory: "
-                f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
-                f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.")
-
-
-@contextlib.contextmanager
-def memory_profiling(
-        baseline_snapshot: MemorySnapshot,
-        weights_memory: int) -> Generator[MemoryProfilingResult, None, None]:
-    """Memory profiling context manager.
-    baseline_snapshot: the memory snapshot before the current vLLM instance.
-    weights_memory: memory used by PyTorch when loading the model weights.
-        Note that, before loading the model weights, we also initialize the device
-        and distributed environment, which may consume some memory. This part is not
-        included in the weights_memory because PyTorch does not control it.
-
-    The memory in one GPU can be classified into 3 categories:
-    1. memory used by anything other than the current vLLM instance.
-    2. memory used by torch in the current vLLM instance.
-    3. memory used in the current vLLM instance, but not by torch.
-
-    A quantitive example:
-
-    Before creating the current vLLM instance:
-        category 1: 1 GiB
-        category 2: 0 GiB
-        category 3: 0 GiB
-
-    After creating the current vLLM instance and loading the model,
-    (i.e. before profiling):
-        category 1: 1 GiB
-        category 2: 2 GiB (model weights take 2 GiB)
-        category 3: 0.5 GiB (memory used by NCCL)
-
-    During profiling (peak):
-        category 1: 1 GiB
-        category 2: 4 GiB (peak activation tensors take 2 GiB)
-        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)
-
-    After profiling:
-        category 1: 1 GiB
-        category 2: 3 GiB (after garbage-collecting activation tensors)
-        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)
-
-    In this case, non-kv cache takes 5 GiB in total, including:
-    a. 2 GiB used by the model weights (category 2)
-    b. 2 GiB reserved for the peak activation tensors (category 2)
-    c. 1 GiB used by non-torch components (category 3)
-
-    The memory used for loading weights (a.) is directly given from the argument `weights_memory`.
-
-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).
-
-    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.).
-    """  # noqa
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-
-    result = MemoryProfilingResult()
-
-    result.before_create = baseline_snapshot
-    # the part of memory used for holding the model weights
-    result.weights_memory = weights_memory
-
-    result.before_profile.measure()
-
-    yield result
-
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    result.after_profile.measure()
-
-    diff_profile = result.after_profile - result.before_profile
-    diff_from_create = result.after_profile - result.before_create
-    result.torch_peak_increase = diff_profile.torch_peak
-    result.non_torch_increase = diff_from_create.non_torch_memory
-    result.profile_time = diff_profile.timestamp
-
-    non_torch_memory = result.non_torch_increase
-    peak_activation_memory = result.torch_peak_increase
-    result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory  # noqa
-
-
-# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
-def set_ulimit(target_soft_limit=65535):
-    if sys.platform.startswith('win'):
-        logger.info("Windows detected, skipping ulimit adjustment.")
-        return
-
-    import resource
-    resource_type = resource.RLIMIT_NOFILE
-    current_soft, current_hard = resource.getrlimit(resource_type)
-
-    if current_soft < target_soft_limit:
-        try:
-            resource.setrlimit(resource_type,
-                               (target_soft_limit, current_hard))
-        except ValueError as e:
-            logger.warning(
-                "Found ulimit of %s and failed to automatically increase "
-                "with error %s. This can cause fd limit errors like "
-                "`OSError: [Errno 24] Too many open files`. Consider "
-                "increasing with ulimit -n", current_soft, e)
-
-
-# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501
-def get_exception_traceback():
-    etype, value, tb = sys.exc_info()
-    err_str = "".join(traceback.format_exception(etype, value, tb))
-    return err_str
-
-
-def split_zmq_path(path: str) -> tuple[str, str, str]:
-    """Split a zmq path into its parts."""
-    parsed = urlparse(path)
-    if not parsed.scheme:
-        raise ValueError(f"Invalid zmq path: {path}")
-
-    scheme = parsed.scheme
-    host = parsed.hostname or ""
-    port = str(parsed.port or "")
-
-    if scheme == "tcp" and not all((host, port)):
-        # The host and port fields are required for tcp
-        raise ValueError(f"Invalid zmq path: {path}")
-
-    if scheme != "tcp" and port:
-        # port only makes sense with tcp
-        raise ValueError(f"Invalid zmq path: {path}")
-
-    return scheme, host, port
-
-
-def make_zmq_path(scheme: str, host: str, port: Optional[int] = None) -> str:
-    """Make a ZMQ path from its parts.
-
-    Args:
-        scheme: The ZMQ transport scheme (e.g. tcp, ipc, inproc).
-        host: The host - can be an IPv4 address, IPv6 address, or hostname.
-        port: Optional port number, only used for TCP sockets.
-
-    Returns:
-        A properly formatted ZMQ path string.
-    """
-    if port is None:
-        return f"{scheme}://{host}"
-    if is_valid_ipv6_address(host):
-        return f"{scheme}://[{host}]:{port}"
-    return f"{scheme}://{host}:{port}"
-
-
-# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
-def make_zmq_socket(
-    ctx: Union[zmq.asyncio.Context, zmq.Context],  # type: ignore[name-defined]
-    path: str,
-    socket_type: Any,
-    bind: Optional[bool] = None,
-    identity: Optional[bytes] = None,
-    linger: Optional[int] = None,
-) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
-    """Make a ZMQ socket with the proper bind/connect semantics."""
-
-    mem = psutil.virtual_memory()
-    socket = ctx.socket(socket_type)
-
-    # Calculate buffer size based on system memory
-    total_mem = mem.total / 1024**3
-    available_mem = mem.available / 1024**3
-    # For systems with substantial memory (>32GB total, >16GB available):
-    # - Set a large 0.5GB buffer to improve throughput
-    # For systems with less memory:
-    # - Use system default (-1) to avoid excessive memory consumption
-    if total_mem > 32 and available_mem > 16:
-        buf_size = int(0.5 * 1024**3)  # 0.5GB in bytes
-    else:
-        buf_size = -1  # Use system default buffer size
-
-    if bind is None:
-        bind = socket_type not in (zmq.PUSH, zmq.SUB, zmq.XSUB)
-
-    if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER):
-        socket.setsockopt(zmq.RCVHWM, 0)
-        socket.setsockopt(zmq.RCVBUF, buf_size)
-
-    if socket_type in (zmq.PUSH, zmq.DEALER, zmq.ROUTER):
-        socket.setsockopt(zmq.SNDHWM, 0)
-        socket.setsockopt(zmq.SNDBUF, buf_size)
-
-    if identity is not None:
-        socket.setsockopt(zmq.IDENTITY, identity)
-
-    if linger is not None:
-        socket.setsockopt(zmq.LINGER, linger)
-
-    if socket_type == zmq.XPUB:
-        socket.setsockopt(zmq.XPUB_VERBOSE, True)
-
-    # Determine if the path is a TCP socket with an IPv6 address.
-    # Enable IPv6 on the zmq socket if so.
-    scheme, host, _ = split_zmq_path(path)
-    if scheme == "tcp" and is_valid_ipv6_address(host):
-        socket.setsockopt(zmq.IPV6, 1)
-
-    if bind:
-        socket.bind(path)
-    else:
-        socket.connect(path)
-
-    return socket
-
-
-@contextlib.contextmanager
-def zmq_socket_ctx(
-    path: str,
-    socket_type: Any,
-    bind: Optional[bool] = None,
-    linger: int = 0,
-    identity: Optional[bytes] = None,
-) -> Iterator[zmq.Socket]:
-    """Context manager for a ZMQ socket"""
-
-    ctx = zmq.Context()  # type: ignore[attr-defined]
-    try:
-        yield make_zmq_socket(ctx,
-                              path,
-                              socket_type,
-                              bind=bind,
-                              identity=identity)
-
-    except KeyboardInterrupt:
-        logger.debug("Got Keyboard Interrupt.")
-
-    finally:
-        ctx.destroy(linger=linger)
-
-
-def _maybe_force_spawn():
-    """Check if we need to force the use of the `spawn` multiprocessing start
-    method.
-    """
-    if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
-        return
-
-    reasons = []
-    if is_in_ray_actor():
-        # even if we choose to spawn, we need to pass the ray address
-        # to the subprocess so that it knows how to connect to the ray cluster.
-        # env vars are inherited by subprocesses, even if we use spawn.
-        import ray
-        os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
-        reasons.append("In a Ray actor and can only be spawned")
-
-    if cuda_is_initialized():
-        reasons.append("CUDA is initialized")
-    elif xpu_is_initialized():
-        reasons.append("XPU is initialized")
-
-    if reasons:
-        logger.warning(
-            "We must use the `spawn` multiprocessing start method. "
-            "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
-            "See https://docs.vllm.ai/en/latest/usage/"
-            "troubleshooting.html#python-multiprocessing "
-            "for more information. Reasons: %s", "; ".join(reasons))
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-
-def get_mp_context():
-    """Get a multiprocessing context with a particular method (spawn or fork).
-    By default we follow the value of the VLLM_WORKER_MULTIPROC_METHOD to
-    determine the multiprocessing method (default is fork). However, under
-    certain conditions, we may enforce spawn and override the value of
-    VLLM_WORKER_MULTIPROC_METHOD.
-    """
-    _maybe_force_spawn()
-    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
-    return multiprocessing.get_context(mp_method)
-
-
-def bind_kv_cache(
-    ctx: dict[str, Any],
-    kv_cache: list[list[torch.Tensor]],  # [virtual_engine][layer_index]
-    shared_kv_cache_layers: Optional[dict[str, str]] = None
-) -> None:
-    # Bind the kv_cache tensor to Attention modules, similar to
-    # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)]
-    # Special things handled here:
-    # 1. Some models have non-attention layers, e.g., Jamba
-    # 2. Pipeline parallelism, each rank only has a subset of layers
-    # 3. Encoder attention has no kv cache
-    # 4. Encoder-decoder models, encoder-decoder attention and decoder-only
-    #    attention of the same layer (e.g., bart's decoder.layers.1.self_attn
-    #    and decoder.layers.1.encoder_attn) is mapped to the same kv cache
-    #    tensor
-    # 5. Some models have attention layers that share kv cache with previous
-    #    layers, this is specified through shared_kv_cache_layers
-    if shared_kv_cache_layers is None:
-        shared_kv_cache_layers = {}
-    from vllm.attention import AttentionType
-    from vllm.model_executor.models.utils import extract_layer_index
-    layer_need_kv_cache = [
-        layer_name for layer_name in ctx
-        if (hasattr(ctx[layer_name], 'attn_type') and ctx[layer_name].attn_type
-            in (AttentionType.DECODER, AttentionType.ENCODER_DECODER)) \
-                and ctx[layer_name].kv_sharing_target_layer_name is None
-    ]
-    layer_index_sorted = sorted(
-        set(
-            extract_layer_index(layer_name)
-            for layer_name in layer_need_kv_cache))
-    for layer_name in layer_need_kv_cache:
-        kv_cache_idx = layer_index_sorted.index(
-            extract_layer_index(layer_name))
-        forward_ctx = ctx[layer_name]
-        assert len(forward_ctx.kv_cache) == len(kv_cache)
-        for ve, ve_kv_cache in enumerate(kv_cache):
-            forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
-    if shared_kv_cache_layers is not None:
-        for layer_name, target_layer_name in shared_kv_cache_layers.items():
-            assert extract_layer_index(target_layer_name) < \
-               extract_layer_index(layer_name), \
-                   "v0 doesn't support interleaving kv sharing"
-            ctx[layer_name].kv_cache = ctx[target_layer_name].kv_cache
-
-
-def run_method(obj: Any, method: Union[str, bytes, Callable], args: tuple[Any],
-               kwargs: dict[str, Any]) -> Any:
-    """
-    Run a method of an object with the given arguments and keyword arguments.
-    If the method is string, it will be converted to a method using getattr.
-    If the method is serialized bytes and will be deserialized using
-    cloudpickle.
-    If the method is a callable, it will be called directly.
-    """
-    if isinstance(method, bytes):
-        func = partial(cloudpickle.loads(method), obj)
-    elif isinstance(method, str):
-        try:
-            func = getattr(obj, method)
-        except AttributeError:
-            raise NotImplementedError(f"Method {method!r} is not"
-                                      " implemented.") from None
-    else:
-        func = partial(method, obj)  # type: ignore
-    return func(*args, **kwargs)
-
-
-def import_pynvml():
-    """
-    Historical comments:
-
-    libnvml.so is the library behind nvidia-smi, and
-    pynvml is a Python wrapper around it. We use it to get GPU
-    status without initializing CUDA context in the current process.
-    Historically, there are two packages that provide pynvml:
-    - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official
-        wrapper. It is a dependency of vLLM, and is installed when users
-        install vLLM. It provides a Python module named `pynvml`.
-    - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper.
-        Prior to version 12.0, it also provides a Python module `pynvml`,
-        and therefore conflicts with the official one. What's worse,
-        the module is a Python package, and has higher priority than
-        the official one which is a standalone Python file.
-        This causes errors when both of them are installed.
-        Starting from version 12.0, it migrates to a new module
-        named `pynvml_utils` to avoid the conflict.
-    It is so confusing that many packages in the community use the
-    unofficial one by mistake, and we have to handle this case.
-    For example, `nvcr.io/nvidia/pytorch:24.12-py3` uses the unofficial
-    one, and it will cause errors, see the issue
-    https://github.com/vllm-project/vllm/issues/12847 for example.
-    After all the troubles, we decide to copy the official `pynvml`
-    module to our codebase, and use it directly.
-    """
-    import vllm.third_party.pynvml as pynvml
-    return pynvml
-
-
-def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
-    """
-    A replacement for `abc.ABC`.
-    When we use `abc.ABC`, subclasses will fail to instantiate
-    if they do not implement all abstract methods.
-    Here, we only require `raise NotImplementedError` in the
-    base class, and log a warning if the method is not implemented
-    in the subclass.
-    """
-
-    original_init = cls.__init__
-
-    def find_unimplemented_methods(self: object):
-        unimplemented_methods = []
-        for attr_name in dir(self):
-            # bypass inner method
-            if attr_name.startswith('_'):
-                continue
-
-            try:
-                attr = getattr(self, attr_name)
-                # get the func of callable method
-                if callable(attr):
-                    attr_func = attr.__func__
-            except AttributeError:
-                continue
-            src = inspect.getsource(attr_func)
-            if "NotImplementedError" in src:
-                unimplemented_methods.append(attr_name)
-        if unimplemented_methods:
-            method_names = ','.join(unimplemented_methods)
-            msg = (f"Methods {method_names} not implemented in {self}")
-            logger.debug(msg)
-
-    @wraps(original_init)
-    def wrapped_init(self, *args, **kwargs) -> None:
-        original_init(self, *args, **kwargs)
-        find_unimplemented_methods(self)
-
-    type.__setattr__(cls, '__init__', wrapped_init)
-    return cls
-
-
-class LazyLoader(types.ModuleType):
-    """
-    LazyLoader module borrowed from Tensorflow
-    https://github.com/tensorflow/tensorflow/blob/main/tensorflow/python/util/lazy_loader.py
-    with an addition of "module caching".
-
-    Lazily import a module, mainly to avoid pulling in large dependencies.
-    Modules such as `xgrammar` might do additional side effects, so we
-    only want to use this when it is needed, delaying all eager effects
-    """
-
-    def __init__(
-        self,
-        local_name: str,
-        parent_module_globals: dict[str, Any],
-        name: str,
-    ):
-        self._local_name = local_name
-        self._parent_module_globals = parent_module_globals
-        self._module: types.ModuleType | None = None
-
-        super().__init__(str(name))
-
-    def _load(self) -> types.ModuleType:
-        # Import the target module and insert it into the parent's namespace
-        try:
-            module = importlib.import_module(self.__name__)
-            self._parent_module_globals[self._local_name] = module
-            # The additional add to sys.modules
-            # ensures library is actually loaded.
-            sys.modules[self._local_name] = module
-        except ModuleNotFoundError as err:
-            raise err from None
-
-        # Update this object's dict so that if someone keeps a
-        # reference to the LazyLoader, lookups are efficient
-        # (__getattr__ is only called on lookups that fail).
-        self.__dict__.update(module.__dict__)
-        return module
-
-    def __getattr__(self, item: Any) -> Any:
-        if self._module is None:
-            self._module = self._load()
-        return getattr(self._module, item)
-
-    def __dir__(self) -> list[str]:
-        if self._module is None:
-            self._module = self._load()
-        return dir(self._module)
-
-
-def swap_dict_values(obj: dict[_K, _V], key1: _K, key2: _K) -> None:
-    """
-    Helper function to swap values for two keys
-    """
-    v1 = obj.get(key1)
-    v2 = obj.get(key2)
-    if v1 is not None:
-        obj[key2] = v1
-    else:
-        obj.pop(key2, None)
-    if v2 is not None:
-        obj[key1] = v2
-    else:
-        obj.pop(key1, None)
-
-
-@contextlib.contextmanager
-def cprofile_context(save_file: Optional[str] = None):
-    """Run a cprofile
-
-    Args:
-        save_file: path to save the profile result. "1" or
-            None will result in printing to stdout.
-    """
-    import cProfile
-
-    prof = cProfile.Profile()
-    prof.enable()
-
-    try:
-        yield
-    finally:
-        prof.disable()
-        if save_file and save_file != "1":
-            prof.dump_stats(save_file)
-        else:
-            prof.print_stats(sort="cumtime")
-
-
-def cprofile(save_file: Optional[str] = None, enabled: bool = True):
-    """Decorator to profile a Python method using cProfile.
-
-    Args:
-        save_file: Path to save the profile result.
-            If "1", None, or "", results will be printed to stdout.
-        enabled: Set to false to turn this into a no-op
-    """
-
-    def decorator(func: Callable):
-
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            if not enabled:
-                # If profiling is disabled, just call the function directly.
-                return func(*args, **kwargs)
-
-            with cprofile_context(save_file):
-                return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorator
-
-
-# Only relevant for models using ALiBi (e.g, MPT)
-def check_use_alibi(model_config: ModelConfig) -> bool:
-    cfg = model_config.hf_text_config
-    return (getattr(cfg, "alibi", False)  # Falcon
-            or ("BloomForCausalLM" in getattr(model_config.hf_config,
-                                              "architectures", []))  # Bloom
-            or getattr(cfg, "position_encoding_type", "") ==
-            "alibi"  # codellm_1b_alibi
-            or (hasattr(cfg, "attn_config")  # MPT
-                and ((isinstance(cfg.attn_config, dict)
-                      and cfg.attn_config.get("alibi", False)) or
-                     (not isinstance(cfg.attn_config, dict)
-                      and getattr(cfg.attn_config, "alibi", False)))))
-
-
-def sha256(input: Any) -> bytes:
-    """Hash any picklable Python object using SHA-256.
-
-    The input is serialized using pickle before hashing, which allows
-    arbitrary Python objects to be used. Note that this function does
-    not use a hash seed—if you need one, prepend it explicitly to the input.
-
-    Args:
-        input: Any picklable Python object.
-
-    Returns:
-        Bytes representing the SHA-256 hash of the serialized input.
-    """
-    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    return hashlib.sha256(input_bytes).digest()
-
-
-def sha256_cbor(input: Any) -> bytes:
-    """
-    Hash objects using CBOR serialization and SHA-256.
-
-    This option is useful for non-Python-dependent serialization and hashing.
-
-    Args:
-        input: Object to be serialized and hashed. Supported types include
-            basic Python types and complex structures like lists, tuples, and
-            dictionaries.
-            Custom classes must implement CBOR serialization methods.
-
-    Returns:
-        Bytes representing the SHA-256 hash of the CBOR serialized input.
-    """
-    input_bytes = cbor2.dumps(input, canonical=True)
-    return hashlib.sha256(input_bytes).digest()
-
-
-def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
-    """Get a hash function by name, or raise an error if
-    the function is not found.
-    Args:
-        hash_fn_name: Name of the hash function.
-    Returns:
-        A hash function.
-    """
-    if hash_fn_name == "sha256":
-        return sha256
-    if hash_fn_name == "sha256_cbor":
-        return sha256_cbor
-
-    raise ValueError(f"Unsupported hash function: {hash_fn_name}")
-
-
-def is_torch_equal_or_newer(target: str) -> bool:
-    """Check if the installed torch version is >= the target version.
-
-    Args:
-        target: a version string, like "2.6.0".
-
-    Returns:
-        Whether the condition meets.
-    """
-    try:
-        return _is_torch_equal_or_newer(str(torch.__version__), target)
-    except Exception:
-        # Fallback to PKG-INFO to load the package info, needed by the doc gen.
-        return Version(importlib.metadata.version('torch')) >= Version(target)
-
-
-# Helper function used in testing.
-def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
-    torch_version = version.parse(torch_version)
-    return torch_version >= version.parse(target)
-
-
-@cache
-def _has_module(module_name: str) -> bool:
-    """Return True if *module_name* can be found in the current environment.
-
-    The result is cached so that subsequent queries for the same module incur
-    no additional overhead.
-    """
-    return importlib.util.find_spec(module_name) is not None
-
-
-def has_pplx() -> bool:
-    """Whether the optional `pplx_kernels` package is available."""
-
-    return _has_module("pplx_kernels")
-
-
-def has_deep_ep() -> bool:
-    """Whether the optional `deep_ep` package is available."""
-
-    return _has_module("deep_ep")
-
-
-def has_deep_gemm() -> bool:
-    """Whether the optional `deep_gemm` package is available."""
-
-    return _has_module("deep_gemm")
-
-
-def has_triton_kernels() -> bool:
-    """Whether the optional `triton_kernels` package is available."""
-
-    return _has_module("triton_kernels")
-
-
-def has_tilelang() -> bool:
-    """Whether the optional `tilelang` package is available."""
-
-    return _has_module("tilelang")
-
-
-def set_process_title(name: str,
-                      suffix: str = "",
-                      prefix: str = envs.VLLM_PROCESS_NAME_PREFIX) -> None:
-    """
-    Set the current process title to a specific name with an
-    optional suffix.
-
-    Args:
-        name: The title to assign to the current process.
-        suffix: An optional suffix to append to the base name.
-        prefix: A prefix to prepend to the front separated by `::`.
-    """
-    if suffix:
-        name = f"{name}_{suffix}"
-    setproctitle.setproctitle(f"{prefix}::{name}")
-
-
-def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
-    """Prepend each output line with process-specific prefix"""
-
-    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
-    file_write = file.write
-
-    def write_with_prefix(s: str):
-        if not s:
-            return
-        if file.start_new_line:  # type: ignore[attr-defined]
-            file_write(prefix)
-        idx = 0
-        while (next_idx := s.find('\n', idx)) != -1:
-            next_idx += 1
-            file_write(s[idx:next_idx])
-            if next_idx == len(s):
-                file.start_new_line = True  # type: ignore[attr-defined]
-                return
-            file_write(prefix)
-            idx = next_idx
-        file_write(s[idx:])
-        file.start_new_line = False  # type: ignore[attr-defined]
-
-    file.start_new_line = True  # type: ignore[attr-defined]
-    file.write = write_with_prefix  # type: ignore[method-assign]
-
-
-def decorate_logs(process_name: Optional[str] = None) -> None:
-    """
-    Adds a process-specific prefix to each line of output written to stdout and
-    stderr.
-
-    This function is intended to be called before initializing the api_server,
-    engine_core, or worker classes, so that all subsequent output from the
-    process is prefixed with the process name and PID. This helps distinguish
-    log output from different processes in multi-process environments.
-
-    Args:
-        process_name: Optional; the name of the process to use in the prefix.
-            If not provided, the current process name from the multiprocessing
-            context is used.
-    """
-    if process_name is None:
-        process_name = get_mp_context().current_process().name
-    pid = os.getpid()
-    _add_prefix(sys.stdout, process_name, pid)
-    _add_prefix(sys.stderr, process_name, pid)
-
-
-def length_from_prompt_token_ids_or_embeds(
-    prompt_token_ids: Optional[list[int]],
-    prompt_embeds: Optional[torch.Tensor],
-) -> int:
-    """Calculate the request length (in number of tokens) give either
-    prompt_token_ids or prompt_embeds.
-    """
-    prompt_token_len = None if prompt_token_ids is None else len(
-        prompt_token_ids)
-    prompt_embeds_len = \
-        None if prompt_embeds is None else len(prompt_embeds)
-
-    if prompt_token_len is None:
-        if prompt_embeds_len is None:
-            raise ValueError(
-                "Neither prompt_token_ids nor prompt_embeds were defined.")
-        return prompt_embeds_len
-    else:
-        if (prompt_embeds_len is not None
-                and prompt_embeds_len != prompt_token_len):
+        if prompt_embeds_len is not None and prompt_embeds_len != prompt_token_len:
             raise ValueError(
                 "Prompt token ids and prompt embeds had different lengths"
                 f" prompt_token_ids={prompt_token_len}"
-                f" prompt_embeds={prompt_embeds_len}")
+                f" prompt_embeds={prompt_embeds_len}"
+            )
         return prompt_token_len
-
-
-@contextlib.contextmanager
-def set_env_var(key, value):
-    old = os.environ.get(key)
-    os.environ[key] = value
-    try:
-        yield
-    finally:
-        if old is None:
-            del os.environ[key]
-        else:
-            os.environ[key] = old
-
-
-def unique_filepath(fn: Callable[[int], Path]) -> Path:
-    """
-    unique_filepath returns a unique path by trying
-    to include an integer in increasing order.
-
-    fn should be a callable that returns a path that
-    includes the passed int at a fixed location.
-
-    Note: This function has a TOCTOU race condition.
-    Caller should use atomic operations (e.g., open with 'x' mode)
-    when creating the file to ensure thread safety.
-    """
-    i = 0
-    while True:
-        p = fn(i)
-        if not p.exists():
-            return p
-        i += 1
diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d105a3685b37c2fae74d74aaa92bc521f7e5756
--- /dev/null
+++ b/vllm/utils/argparse_utils.py
@@ -0,0 +1,487 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Argument parsing utilities for vLLM."""
+
+import json
+import sys
+import textwrap
+from argparse import (
+    Action,
+    ArgumentDefaultsHelpFormatter,
+    ArgumentParser,
+    ArgumentTypeError,
+    Namespace,
+    RawDescriptionHelpFormatter,
+    _ArgumentGroup,
+)
+from collections import defaultdict
+from typing import Any
+
+import regex as re
+import yaml
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
+    """SortedHelpFormatter that sorts arguments by their option strings."""
+
+    def _split_lines(self, text, width):
+        """
+        1. Sentences split across lines have their single newlines removed.
+        2. Paragraphs and explicit newlines are split into separate lines.
+        3. Each line is wrapped to the specified width (width of terminal).
+        """
+        # The patterns also include whitespace after the newline
+        single_newline = re.compile(r"(?<!\n)\n(?!\n)\s*")
+        multiple_newlines = re.compile(r"\n{2,}\s*")
+        text = single_newline.sub(" ", text)
+        lines = re.split(multiple_newlines, text)
+        return sum([textwrap.wrap(line, width) for line in lines], [])
+
+    def add_arguments(self, actions):
+        actions = sorted(actions, key=lambda x: x.option_strings)
+        super().add_arguments(actions)
+
+
+class FlexibleArgumentParser(ArgumentParser):
+    """ArgumentParser that allows both underscore and dash in names."""
+
+    _deprecated: set[Action] = set()
+    _json_tip: str = (
+        "When passing JSON CLI arguments, the following sets of arguments "
+        "are equivalent:\n"
+        '   --json-arg \'{"key1": "value1", "key2": {"key3": "value2"}}\'\n'
+        "   --json-arg.key1 value1 --json-arg.key2.key3 value2\n\n"
+        "Additionally, list elements can be passed individually using +:\n"
+        '   --json-arg \'{"key4": ["value3", "value4", "value5"]}\'\n'
+        "   --json-arg.key4+ value3 --json-arg.key4+='value4,value5'\n\n"
+    )
+    _search_keyword: str | None = None
+
+    def __init__(self, *args, **kwargs):
+        # Set the default "formatter_class" to SortedHelpFormatter
+        if "formatter_class" not in kwargs:
+            kwargs["formatter_class"] = SortedHelpFormatter
+        # Pop kwarg "add_json_tip" to control whether to add the JSON tip
+        self.add_json_tip = kwargs.pop("add_json_tip", True)
+        super().__init__(*args, **kwargs)
+
+    if sys.version_info < (3, 13):
+        # Enable the deprecated kwarg for Python 3.12 and below
+
+        def parse_known_args(self, args=None, namespace=None):
+            if args is not None and "--disable-log-requests" in args:
+                # Special case warning because the warning below won't trigger
+                # if –-disable-log-requests because its value is default.
+                logger.warning_once(
+                    "argument '--disable-log-requests' is deprecated and "
+                    "replaced with '--enable-log-requests'. This will be "
+                    "removed in v0.12.0."
+                )
+            namespace, args = super().parse_known_args(args, namespace)
+            for action in FlexibleArgumentParser._deprecated:
+                if (
+                    hasattr(namespace, dest := action.dest)
+                    and getattr(namespace, dest) != action.default
+                ):
+                    logger.warning_once("argument '%s' is deprecated", dest)
+            return namespace, args
+
+        def add_argument(self, *args, **kwargs):
+            deprecated = kwargs.pop("deprecated", False)
+            action = super().add_argument(*args, **kwargs)
+            if deprecated:
+                FlexibleArgumentParser._deprecated.add(action)
+            return action
+
+        class _FlexibleArgumentGroup(_ArgumentGroup):
+            def add_argument(self, *args, **kwargs):
+                deprecated = kwargs.pop("deprecated", False)
+                action = super().add_argument(*args, **kwargs)
+                if deprecated:
+                    FlexibleArgumentParser._deprecated.add(action)
+                return action
+
+        def add_argument_group(self, *args, **kwargs):
+            group = self._FlexibleArgumentGroup(self, *args, **kwargs)
+            self._action_groups.append(group)
+            return group
+
+    def format_help(self):
+        # Only use custom help formatting for bottom level parsers
+        if self._subparsers is not None:
+            return super().format_help()
+
+        formatter = self._get_formatter()
+
+        # Handle keyword search of the args
+        if (search_keyword := self._search_keyword) is not None:
+            # Normalise the search keyword
+            search_keyword = search_keyword.lower().replace("_", "-")
+            # Return full help if searching for 'all'
+            if search_keyword == "all":
+                self.epilog = self._json_tip
+                return super().format_help()
+
+            # Return group help if searching for a group title
+            for group in self._action_groups:
+                if group.title and group.title.lower() == search_keyword:
+                    formatter.start_section(group.title)
+                    formatter.add_text(group.description)
+                    formatter.add_arguments(group._group_actions)
+                    formatter.end_section()
+                    formatter.add_text(self._json_tip)
+                    return formatter.format_help()
+
+            # Return matched args if searching for an arg name
+            matched_actions = []
+            for group in self._action_groups:
+                for action in group._group_actions:
+                    # search option name
+                    if any(
+                        search_keyword in opt.lower() for opt in action.option_strings
+                    ):
+                        matched_actions.append(action)
+            if matched_actions:
+                formatter.start_section(f"Arguments matching '{search_keyword}'")
+                formatter.add_arguments(matched_actions)
+                formatter.end_section()
+                formatter.add_text(self._json_tip)
+                return formatter.format_help()
+
+            # No match found
+            formatter.add_text(
+                f"No group or arguments matching '{search_keyword}'.\n"
+                "Use '--help' to see available groups or "
+                "'--help=all' to see all available parameters."
+            )
+            return formatter.format_help()
+
+        # usage
+        formatter.add_usage(self.usage, self._actions, self._mutually_exclusive_groups)
+
+        # description
+        formatter.add_text(self.description)
+
+        # positionals, optionals and user-defined groups
+        formatter.start_section("Config Groups")
+        config_groups = ""
+        for group in self._action_groups:
+            if not group._group_actions:
+                continue
+            title = group.title
+            description = group.description or ""
+            config_groups += f"{title: <24}{description}\n"
+        formatter.add_text(config_groups)
+        formatter.end_section()
+
+        # epilog
+        formatter.add_text(self.epilog)
+
+        # determine help from format above
+        return formatter.format_help()
+
+    def parse_args(  # type: ignore[override]
+        self,
+        args: list[str] | None = None,
+        namespace: Namespace | None = None,
+    ):
+        if args is None:
+            args = sys.argv[1:]
+
+        # Check for --model in command line arguments first
+        if args and args[0] == "serve":
+            try:
+                model_idx = next(
+                    i
+                    for i, arg in enumerate(args)
+                    if arg == "--model" or arg.startswith("--model=")
+                )
+                logger.warning(
+                    "With `vllm serve`, you should provide the model as a "
+                    "positional argument or in a config file instead of via "
+                    "the `--model` option. "
+                    "The `--model` option will be removed in v0.13."
+                )
+
+                if args[model_idx] == "--model":
+                    model_tag = args[model_idx + 1]
+                    rest_start_idx = model_idx + 2
+                else:
+                    model_tag = args[model_idx].removeprefix("--model=")
+                    rest_start_idx = model_idx + 1
+
+                # Move <model> to the front, e,g:
+                # [Before]
+                # vllm serve -tp 2 --model <model> --enforce-eager --port 8001
+                # [After]
+                # vllm serve <model> -tp 2 --enforce-eager --port 8001
+                args = [
+                    "serve",
+                    model_tag,
+                    *args[1:model_idx],
+                    *args[rest_start_idx:],
+                ]
+            except StopIteration:
+                pass
+
+        if "--config" in args:
+            args = self._pull_args_from_config(args)
+
+        def repl(match: re.Match) -> str:
+            """Replaces underscores with dashes in the matched string."""
+            return match.group(0).replace("_", "-")
+
+        # Everything between the first -- and the first .
+        pattern = re.compile(r"(?<=--)[^\.]*")
+
+        # Convert underscores to dashes and vice versa in argument names
+        processed_args = list[str]()
+        for i, arg in enumerate(args):
+            if arg.startswith("--help="):
+                FlexibleArgumentParser._search_keyword = arg.split("=", 1)[-1].lower()
+                processed_args.append("--help")
+            elif arg.startswith("--"):
+                if "=" in arg:
+                    key, value = arg.split("=", 1)
+                    key = pattern.sub(repl, key, count=1)
+                    processed_args.append(f"{key}={value}")
+                else:
+                    key = pattern.sub(repl, arg, count=1)
+                    processed_args.append(key)
+            elif arg.startswith("-O") and arg != "-O" and arg[2] != ".":
+                # allow -O flag to be used without space, e.g. -O3 or -Odecode
+                # -O.<...> handled later
+                # also handle -O=<mode> here
+                mode = arg[3:] if arg[2] == "=" else arg[2:]
+                processed_args.append(f"-O.mode={mode}")
+            elif (
+                arg == "-O"
+                and i + 1 < len(args)
+                and args[i + 1] in {"0", "1", "2", "3"}
+            ):
+                # Convert -O <n> to -O.mode <n>
+                processed_args.append("-O.mode")
+            else:
+                processed_args.append(arg)
+
+        def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]:
+            """Creates a nested dictionary from a list of keys and a value.
+
+            For example, `keys = ["a", "b", "c"]` and `value = 1` will create:
+            `{"a": {"b": {"c": 1}}}`
+            """
+            nested_dict: Any = value
+            for key in reversed(keys):
+                nested_dict = {key: nested_dict}
+            return nested_dict
+
+        def recursive_dict_update(
+            original: dict[str, Any],
+            update: dict[str, Any],
+        ) -> set[str]:
+            """Recursively updates a dictionary with another dictionary.
+            Returns a set of duplicate keys that were overwritten.
+            """
+            duplicates = set[str]()
+            for k, v in update.items():
+                if isinstance(v, dict) and isinstance(original.get(k), dict):
+                    nested_duplicates = recursive_dict_update(original[k], v)
+                    duplicates |= {f"{k}.{d}" for d in nested_duplicates}
+                elif isinstance(v, list) and isinstance(original.get(k), list):
+                    original[k] += v
+                else:
+                    if k in original:
+                        duplicates.add(k)
+                    original[k] = v
+            return duplicates
+
+        delete = set[int]()
+        dict_args = defaultdict[str, dict[str, Any]](dict)
+        duplicates = set[str]()
+        for i, processed_arg in enumerate(processed_args):
+            if i in delete:  # skip if value from previous arg
+                continue
+
+            if processed_arg.startswith("-") and "." in processed_arg:
+                if "=" in processed_arg:
+                    processed_arg, value_str = processed_arg.split("=", 1)
+                    if "." not in processed_arg:
+                        # False positive, '.' was only in the value
+                        continue
+                else:
+                    value_str = processed_args[i + 1]
+                    delete.add(i + 1)
+
+                if processed_arg.endswith("+"):
+                    processed_arg = processed_arg[:-1]
+                    value_str = json.dumps(list(value_str.split(",")))
+
+                key, *keys = processed_arg.split(".")
+                try:
+                    value = json.loads(value_str)
+                except json.decoder.JSONDecodeError:
+                    value = value_str
+
+                # Merge all values with the same key into a single dict
+                arg_dict = create_nested_dict(keys, value)
+                arg_duplicates = recursive_dict_update(dict_args[key], arg_dict)
+                duplicates |= {f"{key}.{d}" for d in arg_duplicates}
+                delete.add(i)
+        # Filter out the dict args we set to None
+        processed_args = [a for i, a in enumerate(processed_args) if i not in delete]
+        if duplicates:
+            logger.warning("Found duplicate keys %s", ", ".join(duplicates))
+
+        # Add the dict args back as if they were originally passed as JSON
+        for dict_arg, dict_value in dict_args.items():
+            processed_args.append(dict_arg)
+            processed_args.append(json.dumps(dict_value))
+
+        return super().parse_args(processed_args, namespace)
+
+    def check_port(self, value):
+        try:
+            value = int(value)
+        except ValueError:
+            msg = "Port must be an integer"
+            raise ArgumentTypeError(msg) from None
+
+        if not (1024 <= value <= 65535):
+            raise ArgumentTypeError("Port must be between 1024 and 65535")
+
+        return value
+
+    def _pull_args_from_config(self, args: list[str]) -> list[str]:
+        """Method to pull arguments specified in the config file
+        into the command-line args variable.
+
+        The arguments in config file will be inserted between
+        the argument list.
+
+        example:
+        ```yaml
+            port: 12323
+            tensor-parallel-size: 4
+        ```
+        ```python
+        $: vllm {serve,chat,complete} "facebook/opt-12B" \
+            --config config.yaml -tp 2
+        $: args = [
+            "serve,chat,complete",
+            "facebook/opt-12B",
+            '--config', 'config.yaml',
+            '-tp', '2'
+        ]
+        $: args = [
+            "serve,chat,complete",
+            "facebook/opt-12B",
+            '--port', '12323',
+            '--tensor-parallel-size', '4',
+            '-tp', '2'
+            ]
+        ```
+
+        Please note how the config args are inserted after the sub command.
+        this way the order of priorities is maintained when these are args
+        parsed by super().
+        """
+        assert args.count("--config") <= 1, "More than one config file specified!"
+
+        index = args.index("--config")
+        if index == len(args) - 1:
+            raise ValueError(
+                "No config file specified! \
+                             Please check your command-line arguments."
+            )
+
+        file_path = args[index + 1]
+
+        config_args = self.load_config_file(file_path)
+
+        # 0th index might be the sub command {serve,chat,complete,...}
+        # optionally followed by model_tag (only for serve)
+        # followed by config args
+        # followed by rest of cli args.
+        # maintaining this order will enforce the precedence
+        # of cli > config > defaults
+        if args[0].startswith("-"):
+            # No sub command (e.g., api_server entry point)
+            args = config_args + args[0:index] + args[index + 2 :]
+        elif args[0] == "serve":
+            model_in_cli = len(args) > 1 and not args[1].startswith("-")
+            model_in_config = any(arg == "--model" for arg in config_args)
+
+            if not model_in_cli and not model_in_config:
+                raise ValueError(
+                    "No model specified! Please specify model either "
+                    "as a positional argument or in a config file."
+                )
+
+            if model_in_cli:
+                # Model specified as positional arg, keep CLI version
+                args = (
+                    [args[0]]
+                    + [args[1]]
+                    + config_args
+                    + args[2:index]
+                    + args[index + 2 :]
+                )
+            else:
+                # No model in CLI, use config if available
+                args = [args[0]] + config_args + args[1:index] + args[index + 2 :]
+        else:
+            args = [args[0]] + config_args + args[1:index] + args[index + 2 :]
+
+        return args
+
+    def load_config_file(self, file_path: str) -> list[str]:
+        """Loads a yaml file and returns the key value pairs as a
+        flattened list with argparse like pattern
+        ```yaml
+            port: 12323
+            tensor-parallel-size: 4
+        ```
+        returns:
+            processed_args: list[str] = [
+                '--port': '12323',
+                '--tensor-parallel-size': '4'
+            ]
+        """
+        extension: str = file_path.split(".")[-1]
+        if extension not in ("yaml", "yml"):
+            raise ValueError(
+                f"Config file must be of a yaml/yml type. {extension} supplied"
+            )
+
+        # only expecting a flat dictionary of atomic types
+        processed_args: list[str] = []
+
+        config: dict[str, int | str] = {}
+        try:
+            with open(file_path) as config_file:
+                config = yaml.safe_load(config_file)
+        except Exception as ex:
+            logger.error(
+                "Unable to read the config file at %s. Check path correctness",
+                file_path,
+            )
+            raise ex
+
+        for key, value in config.items():
+            if isinstance(value, bool):
+                if value:
+                    processed_args.append("--" + key)
+            elif isinstance(value, list):
+                if value:
+                    processed_args.append("--" + key)
+                    for item in value:
+                        processed_args.append(str(item))
+            else:
+                processed_args.append("--" + key)
+                processed_args.append(str(value))
+
+        return processed_args
diff --git a/vllm/utils/async_utils.py b/vllm/utils/async_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6c24e1ceeee79ed30e66c08c893856dae047f25
--- /dev/null
+++ b/vllm/utils/async_utils.py
@@ -0,0 +1,303 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Contains helpers related to asynchronous code.
+
+This is similar in concept to the `asyncio` module.
+"""
+
+import asyncio
+import contextlib
+from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task
+from collections.abc import AsyncGenerator, Awaitable, Callable
+from concurrent.futures import Executor, ThreadPoolExecutor
+from functools import partial
+from typing import TypeVar
+
+from transformers.tokenization_utils_base import BatchEncoding
+from typing_extensions import ParamSpec
+
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
+class AsyncMicrobatchTokenizer:
+    """Asynchronous tokenizer with micro-batching.
+
+    Pulls pending encode/decode requests from a queue and batches them
+    up to reduce overhead. A single-thread ThreadPoolExecutor is used
+    so the event loop stays responsive.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        max_batch_size: int = 32,
+        batch_wait_timeout_s: float = 0.002,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.max_batch_size = max_batch_size
+        self.batch_wait_timeout_s = batch_wait_timeout_s
+
+        self._loop = asyncio.get_running_loop()
+        self._queues: dict[
+            tuple,
+            asyncio.Queue[tuple[str, dict, Future] | tuple[list[int], Future]],
+        ] = {}
+        self._batcher_tasks: list[Task] = []
+
+        # Single-thread executor for blocking tokenizer calls.
+        self._executor = ThreadPoolExecutor(max_workers=1)
+
+    # === Public async API ===
+    async def __call__(self, prompt, **kwargs):
+        result_future: Future = self._loop.create_future()
+        key = self._queue_key("encode", kwargs)
+        queue = self._get_queue(self._loop, key)
+        await queue.put((prompt, kwargs, result_future))
+        return await result_future
+
+    async def decode(self, token_ids, **kwargs):
+        result_future: Future = self._loop.create_future()
+        key = self._queue_key("decode", kwargs)
+        queue = self._get_queue(self._loop, key)
+        await queue.put((token_ids, result_future))
+        return await result_future
+
+    # === Internal helpers ===
+    def _get_queue(
+        self, loop: asyncio.AbstractEventLoop, key: tuple
+    ) -> asyncio.Queue[tuple[str, dict, Future] | tuple[list[int], Future]]:
+        """Get the request queue for the given operation key, creating a new
+        queue and batcher task if needed."""
+        queue = self._queues.get(key)
+        if queue is None:
+            self._queues[key] = queue = asyncio.Queue()
+            if key[0] == "encode":
+                can_batch = key[1] != "other"
+                coro = self._batch_encode_loop(queue, can_batch)
+            else:
+                assert key[0] == "decode", f"Unknown operation type: {key[0]}."
+                coro = self._batch_decode_loop(queue)
+            self._batcher_tasks.append(loop.create_task(coro))
+        return queue
+
+    async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool):
+        """Batch incoming encode requests for efficiency."""
+        while True:
+            prompt, kwargs, result_future = await queue.get()
+            prompts = [prompt]
+            kwargs_list = [kwargs]
+            result_futures = [result_future]
+            deadline = self._loop.time() + self.batch_wait_timeout_s
+
+            while len(prompts) < self.max_batch_size:
+                timeout = deadline - self._loop.time()
+                if timeout <= 0:
+                    break
+                try:
+                    prompt, kwargs, result_future = await asyncio.wait_for(
+                        queue.get(), timeout
+                    )
+                    prompts.append(prompt)
+                    result_futures.append(result_future)
+                    if not can_batch:
+                        kwargs_list.append(kwargs)
+                except asyncio.TimeoutError:
+                    break
+
+            try:
+                # If every request uses identical kwargs we can run a single
+                # batched tokenizer call for a big speed-up.
+                if can_batch and len(prompts) > 1:
+                    batch_encode_fn = partial(self.tokenizer, prompts, **kwargs)
+                    results = await self._loop.run_in_executor(
+                        self._executor, batch_encode_fn
+                    )
+
+                    for i, fut in enumerate(result_futures):
+                        if not fut.done():
+                            data = {k: v[i] for k, v in results.items()}
+                            fut.set_result(BatchEncoding(data))
+                else:
+                    encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [
+                        self.tokenizer(p, **kw) for p, kw in zip(prompts, kwargs)
+                    ]
+                    results = await self._loop.run_in_executor(
+                        self._executor, encode_fn
+                    )
+
+                    for fut, res in zip(result_futures, results):
+                        if not fut.done():
+                            fut.set_result(res)
+            except Exception as e:
+                for fut in result_futures:
+                    if not fut.done():
+                        fut.set_exception(e)
+
+    async def _batch_decode_loop(self, queue: asyncio.Queue):
+        """Batch incoming decode requests for efficiency."""
+        while True:
+            token_ids, result_future = await queue.get()
+            token_ids_list = [token_ids]
+            result_futures = [result_future]
+            deadline = self._loop.time() + self.batch_wait_timeout_s
+
+            while len(token_ids_list) < self.max_batch_size:
+                timeout = deadline - self._loop.time()
+                if timeout <= 0:
+                    break
+                try:
+                    token_ids, result_future = await asyncio.wait_for(
+                        queue.get(), timeout
+                    )
+                    token_ids_list.append(token_ids)
+                    result_futures.append(result_future)
+                except asyncio.TimeoutError:
+                    break
+
+            try:
+                # Perform a single batched decode call for all requests
+                results = await self._loop.run_in_executor(
+                    self._executor, self.tokenizer.batch_decode, token_ids_list
+                )
+                for fut, res in zip(result_futures, results):
+                    if not fut.done():
+                        fut.set_result(res)
+            except Exception as e:
+                for fut in result_futures:
+                    if not fut.done():
+                        fut.set_exception(e)
+
+    def _queue_key(self, op: str, kwargs: dict) -> tuple:
+        """
+        Return a normalized key describing operation + kwargs.
+
+        - `add_special_tokens`: {True/False}
+        - `truncation`: {True/False}
+          - If `truncation` is False (`max_length` is None),
+            returns a key for a can_batch queue.
+          - If `truncation` is True and `max_length` is None or equals
+            `tokenizer.model_max_length`, returns a key for a can_batch queue.
+          - Otherwise, returns a key for a cannot_batch queue.
+
+        Examples:
+          - Decode: ("decode",)
+          - Encode typical:
+            ("encode", add_special_tokens, bool_truncation, max_length_label)
+          - Fallback: ("encode", "other")
+        """
+
+        if op == "decode":
+            return ("decode",)
+
+        add_special_tokens = kwargs.get("add_special_tokens", True)
+        truncation = kwargs.get("truncation", False)
+        max_length = kwargs.get("max_length")
+
+        if not truncation:
+            return "encode", add_special_tokens, False, None
+
+        model_max = getattr(self.tokenizer, "model_max_length", None)
+        if max_length is None or (model_max is not None and max_length == model_max):
+            return "encode", add_special_tokens, True, "model_max"
+
+        return "encode", "other"
+
+    def __del__(self):
+        if (
+            (tasks := getattr(self, "_batcher_tasks", None))
+            and (loop := getattr(self, "_loop", None))
+            and not loop.is_closed()
+        ):
+
+            def cancel_tasks():
+                for task in tasks:
+                    task.cancel()
+
+            loop.call_soon_threadsafe(cancel_tasks)
+
+
+def cancel_task_threadsafe(task: Task):
+    if task and not task.done():
+        run_in_loop(task.get_loop(), task.cancel)
+
+
+def make_async(
+    func: Callable[P, T],
+    executor: Executor | None = None,
+) -> Callable[P, Awaitable[T]]:
+    """
+    Take a blocking function, and run it on in an executor thread.
+
+    This function prevents the blocking function from blocking the
+    asyncio event loop.
+    The code in this function needs to be thread safe.
+    """
+
+    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Future[T]:
+        loop = asyncio.get_event_loop()
+        p_func = partial(func, *args, **kwargs)
+        return loop.run_in_executor(executor=executor, func=p_func)
+
+    return _async_wrapper
+
+
+def run_in_loop(loop: AbstractEventLoop, function: Callable, *args):
+    if in_loop(loop):
+        function(*args)
+    elif not loop.is_closed():
+        loop.call_soon_threadsafe(function, *args)
+
+
+def in_loop(event_loop: AbstractEventLoop) -> bool:
+    try:
+        return asyncio.get_running_loop() == event_loop
+    except RuntimeError:
+        return False
+
+
+async def merge_async_iterators(
+    *iterators: AsyncGenerator[T, None],
+) -> AsyncGenerator[tuple[int, T], None]:
+    """Merge multiple asynchronous iterators into a single iterator.
+
+    This method handle the case where some iterators finish before others.
+    When it yields, it yields a tuple (i, item) where i is the index of the
+    iterator that yields the item.
+    """
+    if len(iterators) == 1:
+        # Fast-path single iterator case.
+        async for item in iterators[0]:
+            yield 0, item
+        return
+
+    loop = asyncio.get_running_loop()
+
+    awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)}
+    try:
+        while awaits:
+            done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED)
+            for d in done:
+                pair = awaits.pop(d)
+                try:
+                    item = await d
+                    i, it = pair
+                    awaits[loop.create_task(anext(it))] = pair
+                    yield i, item
+                except StopAsyncIteration:
+                    pass
+    finally:
+        # Cancel any remaining iterators
+        for f, (_, it) in awaits.items():
+            with contextlib.suppress(BaseException):
+                f.cancel()
+                await it.aclose()
+
+
+async def collect_from_async_generator(iterator: AsyncGenerator[T, None]) -> list[T]:
+    """Collect all items from an async generator into a list."""
+    items = []
+    async for item in iterator:
+        items.append(item)
+    return items
diff --git a/vllm/utils/cache.py b/vllm/utils/cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..4338983f906014494bac6fef661f901ab5cf3756
--- /dev/null
+++ b/vllm/utils/cache.py
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import UserDict
+from collections.abc import Callable, Hashable, Iterator, KeysView, Mapping
+from types import MappingProxyType
+from typing import NamedTuple, TypeVar, cast, overload
+
+import cachetools
+
+_K = TypeVar("_K", bound=Hashable)
+_V = TypeVar("_V")
+_T = TypeVar("_T")
+
+
+class _Sentinel: ...
+
+
+ALL_PINNED_SENTINEL = _Sentinel()
+
+
+class _MappingOrderCacheView(UserDict[_K, _V]):
+    def __init__(self, data: Mapping[_K, _V], ordered_keys: Mapping[_K, None]):
+        super().__init__(data)
+        self.ordered_keys = ordered_keys
+
+    def __iter__(self) -> Iterator[_K]:
+        return iter(self.ordered_keys)
+
+    def keys(self) -> KeysView[_K]:
+        return KeysView(self.ordered_keys)
+
+
+class CacheInfo(NamedTuple):
+    hits: int
+    total: int
+
+    @property
+    def hit_ratio(self) -> float:
+        if self.total == 0:
+            return 0
+
+        return self.hits / self.total
+
+    def __sub__(self, other: "CacheInfo"):
+        return CacheInfo(
+            hits=self.hits - other.hits,
+            total=self.total - other.total,
+        )
+
+
+class LRUCache(cachetools.LRUCache[_K, _V]):
+    def __init__(self, capacity: float, getsizeof: Callable[[_V], float] | None = None):
+        super().__init__(capacity, getsizeof)
+
+        self.pinned_items = set[_K]()
+
+        self._hits = 0
+        self._total = 0
+        self._last_info = CacheInfo(hits=0, total=0)
+
+    def __getitem__(self, key: _K, *, update_info: bool = True) -> _V:
+        value = super().__getitem__(key)
+
+        if update_info:
+            self._hits += 1
+            self._total += 1
+
+        return value
+
+    def __delitem__(self, key: _K) -> None:
+        run_on_remove = key in self
+        value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
+        super().__delitem__(key)
+        if key in self.pinned_items:
+            # Todo: add warning to inform that del pinned item
+            self._unpin(key)
+        if run_on_remove:
+            self._on_remove(key, value)
+
+    @property
+    def cache(self) -> Mapping[_K, _V]:
+        """Return the internal cache dictionary in order (read-only)."""
+        return _MappingOrderCacheView(
+            self._Cache__data,  # type: ignore
+            self.order,
+        )
+
+    @property
+    def order(self) -> Mapping[_K, None]:
+        """Return the internal order dictionary (read-only)."""
+        return MappingProxyType(self._LRUCache__order)  # type: ignore
+
+    @property
+    def capacity(self) -> float:
+        return self.maxsize
+
+    @property
+    def usage(self) -> float:
+        if self.maxsize == 0:
+            return 0
+
+        return self.currsize / self.maxsize
+
+    def stat(self, *, delta: bool = False) -> CacheInfo:
+        """
+        Gets the cumulative number of hits and queries against this cache.
+
+        If `delta=True`, instead gets these statistics
+        since the last call that also passed `delta=True`.
+        """
+        info = CacheInfo(hits=self._hits, total=self._total)
+
+        if delta:
+            info_delta = info - self._last_info
+            self._last_info = info
+            info = info_delta
+
+        return info
+
+    def touch(self, key: _K) -> None:
+        try:
+            self._LRUCache__order.move_to_end(key)  # type: ignore
+        except KeyError:
+            self._LRUCache__order[key] = None  # type: ignore
+
+    @overload
+    def get(self, key: _K, /) -> _V | None: ...
+
+    @overload
+    def get(self, key: _K, /, default: _V | _T) -> _V | _T: ...
+
+    def get(self, key: _K, /, default: _V | _T | None = None) -> _V | _T | None:
+        value: _V | _T | None
+        if key in self:
+            value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
+
+            self._hits += 1
+        else:
+            value = default
+
+        self._total += 1
+        return value
+
+    @overload
+    def pop(self, key: _K) -> _V: ...
+
+    @overload
+    def pop(self, key: _K, default: _V | _T) -> _V | _T: ...
+
+    def pop(self, key: _K, default: _V | _T | None = None) -> _V | _T | None:
+        value: _V | _T | None
+        if key not in self:
+            return default
+
+        value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
+        self.__delitem__(key)
+        return value
+
+    def put(self, key: _K, value: _V) -> None:
+        self.__setitem__(key, value)
+
+    def pin(self, key: _K) -> None:
+        """
+        Pins a key in the cache preventing it from being
+        evicted in the LRU order.
+        """
+        if key not in self:
+            raise ValueError(f"Cannot pin key: {key} not in cache.")
+        self.pinned_items.add(key)
+
+    def _unpin(self, key: _K) -> None:
+        """
+        Unpins a key in the cache allowing it to be
+        evicted in the LRU order.
+        """
+        self.pinned_items.remove(key)
+
+    def _on_remove(self, key: _K, value: _V | None) -> None:
+        pass
+
+    def remove_oldest(self, *, remove_pinned: bool = False) -> None:
+        if len(self) == 0:
+            return
+
+        self.popitem(remove_pinned=remove_pinned)
+
+    def _remove_old_if_needed(self) -> None:
+        while self.currsize > self.capacity:
+            self.remove_oldest()
+
+    def popitem(self, remove_pinned: bool = False):
+        """Remove and return the `(key, value)` pair least recently used."""
+        if not remove_pinned:
+            # pop the oldest item in the cache that is not pinned
+            lru_key = next(
+                (key for key in self.order if key not in self.pinned_items),
+                ALL_PINNED_SENTINEL,
+            )
+            if lru_key is ALL_PINNED_SENTINEL:
+                raise RuntimeError(
+                    "All items are pinned, cannot remove oldest from the cache."
+                )
+        else:
+            lru_key = next(iter(self.order))
+        value = self.pop(cast(_K, lru_key))
+        return (lru_key, value)
+
+    def clear(self) -> None:
+        while len(self) > 0:
+            self.remove_oldest(remove_pinned=True)
+
+        self._hits = 0
+        self._total = 0
+        self._last_info = CacheInfo(hits=0, total=0)
diff --git a/vllm/utils/collection_utils.py b/vllm/utils/collection_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..57271311828cdef08c7a0efc0757307e7e184c3e
--- /dev/null
+++ b/vllm/utils/collection_utils.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Contains helpers that are applied to collections.
+
+This is similar in concept to the `collections` module.
+"""
+
+from collections import UserDict, defaultdict
+from collections.abc import Callable, Generator, Hashable, Iterable, Mapping
+from typing import Generic, Literal, TypeVar
+
+from typing_extensions import TypeIs, assert_never
+
+T = TypeVar("T")
+U = TypeVar("U")
+
+_K = TypeVar("_K", bound=Hashable)
+_V = TypeVar("_V")
+
+
+class ClassRegistry(UserDict[type[T], _V]):
+    """
+    A registry that acts like a dictionary but searches for other classes
+    in the MRO if the original class is not found.
+    """
+
+    def __getitem__(self, key: type[T]) -> _V:
+        for cls in key.mro():
+            if cls in self.data:
+                return self.data[cls]
+
+        raise KeyError(key)
+
+    def __contains__(self, key: object) -> bool:
+        return self.contains(key)
+
+    def contains(self, key: object, *, strict: bool = False) -> bool:
+        if not isinstance(key, type):
+            return False
+
+        if strict:
+            return key in self.data
+
+        return any(cls in self.data for cls in key.mro())
+
+
+class LazyDict(Mapping[str, T], Generic[T]):
+    """
+    Evaluates dictionary items only when they are accessed.
+
+    Adapted from: https://stackoverflow.com/a/47212782/5082708
+    """
+
+    def __init__(self, factory: dict[str, Callable[[], T]]):
+        self._factory = factory
+        self._dict: dict[str, T] = {}
+
+    def __getitem__(self, key: str) -> T:
+        if key not in self._dict:
+            if key not in self._factory:
+                raise KeyError(key)
+            self._dict[key] = self._factory[key]()
+        return self._dict[key]
+
+    def __setitem__(self, key: str, value: Callable[[], T]):
+        self._factory[key] = value
+
+    def __iter__(self):
+        return iter(self._factory)
+
+    def __len__(self):
+        return len(self._factory)
+
+
+def as_list(maybe_list: Iterable[T]) -> list[T]:
+    """Convert iterable to list, unless it's already a list."""
+    return maybe_list if isinstance(maybe_list, list) else list(maybe_list)
+
+
+def as_iter(obj: T | Iterable[T]) -> Iterable[T]:
+    if isinstance(obj, str) or not isinstance(obj, Iterable):
+        return [obj]  # type: ignore[list-item]
+    return obj
+
+
+def is_list_of(
+    value: object,
+    typ: type[T] | tuple[type[T], ...],
+    *,
+    check: Literal["first", "all"] = "first",
+) -> TypeIs[list[T]]:
+    if not isinstance(value, list):
+        return False
+
+    if check == "first":
+        return len(value) == 0 or isinstance(value[0], typ)
+    elif check == "all":
+        return all(isinstance(v, typ) for v in value)
+
+    assert_never(check)
+
+
+def chunk_list(lst: list[T], chunk_size: int) -> Generator[list[T]]:
+    """Yield successive chunk_size chunks from lst."""
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i : i + chunk_size]
+
+
+def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
+    """Flatten a list of lists to a single list."""
+    return [item for sublist in lists for item in sublist]
+
+
+def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
+    """
+    Unlike [`itertools.groupby`][], groups are not broken by
+    non-contiguous data.
+    """
+    groups = defaultdict[_K, list[_V]](list)
+
+    for value in values:
+        groups[key(value)].append(value)
+
+    return groups.items()
+
+
+def swap_dict_values(obj: dict[_K, _V], key1: _K, key2: _K) -> None:
+    """Swap values between two keys."""
+    v1 = obj.get(key1)
+    v2 = obj.get(key2)
+    if v1 is not None:
+        obj[key2] = v1
+    else:
+        obj.pop(key2, None)
+    if v2 is not None:
+        obj[key1] = v2
+    else:
+        obj.pop(key1, None)
diff --git a/vllm/utils/counter.py b/vllm/utils/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2dce32e97e13d092051da2cfda4013968755fe4
--- /dev/null
+++ b/vllm/utils/counter.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import threading
+
+
+class Counter:
+    def __init__(self, start: int = 0) -> None:
+        super().__init__()
+
+        self.counter = start
+
+    def __next__(self) -> int:
+        i = self.counter
+        self.counter += 1
+        return i
+
+    def reset(self) -> None:
+        self.counter = 0
+
+
+class AtomicCounter:
+    """An atomic, thread-safe counter"""
+
+    def __init__(self, initial: int = 0) -> None:
+        """Initialize a new atomic counter to given initial value"""
+        super().__init__()
+
+        self._value = initial
+        self._lock = threading.Lock()
+
+    @property
+    def value(self) -> int:
+        return self._value
+
+    def inc(self, num: int = 1) -> int:
+        """Atomically increment the counter by num and return the new value"""
+        with self._lock:
+            self._value += num
+            return self._value
+
+    def dec(self, num: int = 1) -> int:
+        """Atomically decrement the counter by num and return the new value"""
+        with self._lock:
+            self._value -= num
+            return self._value
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 0e3bdaec829e56637f3bd236ef7e6a3486bd08fd..b61ebb9a998dcf5a92fc6045349ede7107553a3b 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -4,40 +4,66 @@
 
 Users of vLLM should always import **only** these wrappers.
 """
-from __future__ import annotations
 
 import functools
 import importlib
 import os
-from typing import Any, Callable, NoReturn
+from collections.abc import Callable
+from enum import Enum
+from typing import Any, NoReturn
 
 import torch
 
 import vllm.envs as envs
 from vllm.logger import logger
 from vllm.platforms import current_platform
-from vllm.utils import cdiv, has_deep_gemm
+from vllm.utils.import_utils import has_deep_gemm
+from vllm.utils.math_utils import cdiv
+
+
+class DeepGemmQuantScaleFMT(Enum):
+    # Float32 scales in Float32 tensor
+    FLOAT32 = 0
+    # Compute float32 scales and ceil the scales to UE8M0.
+    # Keep the scales in Float32 tensor.
+    FLOAT32_CEIL_UE8M0 = 1
+    # Compute float32 scales and ceil the scales to UE8M0.
+    # Pack the scales into a int32 tensor where each int32
+    # element contains 4 scale values.
+    UE8M0 = 2
+
+    @staticmethod
+    def from_oracle() -> "DeepGemmQuantScaleFMT":
+        if not is_deep_gemm_e8m0_used():
+            return DeepGemmQuantScaleFMT.FLOAT32
+        return (
+            DeepGemmQuantScaleFMT.UE8M0
+            if current_platform.is_device_capability(100)
+            else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
+        )
 
 
 @functools.cache
 def is_deep_gemm_supported() -> bool:
-    """Return ``True`` if DeepGEMM is supported on the current platform.
+    """Return `True` if DeepGEMM is supported on the current platform.
     Currently, only Hopper and Blackwell GPUs are supported.
     """
     is_supported_arch = current_platform.is_cuda() and (
         current_platform.is_device_capability(90)
-        or current_platform.is_device_capability(100))
+        or current_platform.is_device_capability(100)
+    )
     return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
 
 
 @functools.cache
 def is_deep_gemm_e8m0_used() -> bool:
-    """Return ``True`` if vLLM is configured to use DeepGEMM "
+    """Return `True` if vLLM is configured to use DeepGEMM "
     "E8M0 scale on a Hopper or Blackwell-class GPU.
     """
     if not is_deep_gemm_supported():
         logger.debug_once(
-            "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.")
+            "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system."
+        )
         return False
 
     _lazy_init()
@@ -46,14 +72,8 @@ def is_deep_gemm_e8m0_used() -> bool:
         logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found")
         return False
 
-    if current_platform.is_device_capability(100) and \
-            envs.VLLM_USE_DEEP_GEMM_E8M0:
-        logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.")
-        return True
-
-    if current_platform.is_device_capability(90) and \
-            envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
-        logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.")
+    if envs.VLLM_USE_DEEP_GEMM_E8M0:
+        logger.info_once("DeepGEMM E8M0 enabled on current platform.")
         return True
 
     logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
@@ -63,8 +83,9 @@ def is_deep_gemm_e8m0_used() -> bool:
 def _missing(*_: Any, **__: Any) -> NoReturn:
     """Placeholder for unavailable DeepGEMM backend."""
     raise RuntimeError(
-        "DeepGEMM backend is not available. Please install the `deep_gemm` "
-        "package to enable FP8 kernels.")
+        "DeepGEMM backend is not available or outdated. Please install or "
+        "update the `deep_gemm` to a newer version to enable FP8 kernels."
+    )
 
 
 _fp8_gemm_nt_impl: Callable[..., Any] | None = None
@@ -74,6 +95,8 @@ _fp8_mqa_logits_impl: Callable[..., Any] | None = None
 _fp8_paged_mqa_logits_impl: Callable[..., Any] | None = None
 _get_paged_mqa_logits_metadata_impl: Callable[..., Any] | None = None
 _get_mn_major_tma_aligned_tensor_impl: Callable[..., Any] | None = None
+_get_mk_alignment_for_contiguous_layout_impl: Callable[..., Any] | None = None
+_transform_sf_into_required_layout_impl: Callable[..., Any] | None = None
 
 
 def _lazy_init() -> None:
@@ -82,23 +105,30 @@ def _lazy_init() -> None:
     global _fp8_mqa_logits_impl, _fp8_paged_mqa_logits_impl
     global _get_paged_mqa_logits_metadata_impl
     global _get_mn_major_tma_aligned_tensor_impl
-
+    global _get_mk_alignment_for_contiguous_layout_impl
+    global _transform_sf_into_required_layout_impl
     # fast path
-    if (_fp8_gemm_nt_impl is not None or _grouped_impl is not None
-            or _grouped_masked_impl is not None
-            or _fp8_mqa_logits_impl is not None
-            or _fp8_paged_mqa_logits_impl is not None
-            or _get_paged_mqa_logits_metadata_impl is not None):
+    if (
+        _fp8_gemm_nt_impl is not None
+        or _grouped_impl is not None
+        or _grouped_masked_impl is not None
+        or _fp8_mqa_logits_impl is not None
+        or _fp8_paged_mqa_logits_impl is not None
+        or _get_paged_mqa_logits_metadata_impl is not None
+        or _get_mk_alignment_for_contiguous_layout_impl is not None
+        or _transform_sf_into_required_layout_impl is not None
+    ):
         return
 
     if not has_deep_gemm():
         return
 
     # Set up deep_gemm cache path
-    DEEP_GEMM_JIT_CACHE_ENV_NAME = 'DG_JIT_CACHE_DIR'
+    DEEP_GEMM_JIT_CACHE_ENV_NAME = "DG_JIT_CACHE_DIR"
     if not os.environ.get(DEEP_GEMM_JIT_CACHE_ENV_NAME, None):
         os.environ[DEEP_GEMM_JIT_CACHE_ENV_NAME] = os.path.join(
-            envs.VLLM_CACHE_ROOT, "deep_gemm")
+            envs.VLLM_CACHE_ROOT, "deep_gemm"
+        )
 
     _dg = importlib.import_module("deep_gemm")
 
@@ -108,9 +138,32 @@ def _lazy_init() -> None:
     _fp8_mqa_logits_impl = getattr(_dg, "fp8_mqa_logits", None)
     _fp8_paged_mqa_logits_impl = getattr(_dg, "fp8_paged_mqa_logits", None)
     _get_paged_mqa_logits_metadata_impl = getattr(
-        _dg, "get_paged_mqa_logits_metadata", None)
+        _dg, "get_paged_mqa_logits_metadata", None
+    )
     _get_mn_major_tma_aligned_tensor_impl = getattr(
-        _dg, "get_mn_major_tma_aligned_tensor", None)
+        _dg, "get_mn_major_tma_aligned_tensor", None
+    )
+    _get_mk_alignment_for_contiguous_layout_impl = getattr(
+        _dg, "get_mk_alignment_for_contiguous_layout", None
+    )
+    _transform_sf_into_required_layout_impl = getattr(
+        _dg, "transform_sf_into_required_layout", None
+    )
+
+
+def get_num_sms() -> int:
+    _lazy_init()
+    _dg = importlib.import_module("deep_gemm")
+    return int(_dg.get_num_sms())
+
+
+@functools.cache
+def get_mk_alignment_for_contiguous_layout() -> list[int]:
+    _lazy_init()
+    if _get_mk_alignment_for_contiguous_layout_impl is None:
+        return _missing()
+    mk_align_size = _get_mk_alignment_for_contiguous_layout_impl()
+    return [mk_align_size, mk_align_size]
 
 
 def get_num_sms() -> int:
@@ -131,18 +184,21 @@ def fp8_gemm_nt(*args, **kwargs):
     _lazy_init()
     if _fp8_gemm_nt_impl is None:
         return _missing(*args, **kwargs)
-    return _fp8_gemm_nt_impl(*args,
-                             disable_ue8m0_cast=not is_deep_gemm_e8m0_used(),
-                             **kwargs)
+    if "is_deep_gemm_e8m0_used" in kwargs:
+        use_ue8m0 = kwargs["is_deep_gemm_e8m0_used"]
+        del kwargs["is_deep_gemm_e8m0_used"]
+    else:
+        use_ue8m0 = is_deep_gemm_e8m0_used()
+    return _fp8_gemm_nt_impl(*args, disable_ue8m0_cast=not use_ue8m0, **kwargs)
 
 
 def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs):
     _lazy_init()
     if _grouped_impl is None:
         return _missing(*args, **kwargs)
-    return _grouped_impl(*args,
-                         disable_ue8m0_cast=not is_deep_gemm_e8m0_used(),
-                         **kwargs)
+    return _grouped_impl(
+        *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs
+    )
 
 
 def fp8_m_grouped_gemm_nt_masked(*args, **kwargs):
@@ -150,7 +206,113 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs):
     if _grouped_masked_impl is None:
         return _missing(*args, **kwargs)
     return _grouped_masked_impl(
-        *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs)
+        *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs
+    )
+
+
+def transform_sf_into_required_layout(*args, **kwargs):
+    _lazy_init()
+    if _transform_sf_into_required_layout_impl is None:
+        return _missing(*args, **kwargs)
+    return _transform_sf_into_required_layout_impl(
+        *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs
+    )
+
+
+def fp8_mqa_logits(
+    q: torch.Tensor,
+    kv: tuple[torch.Tensor, torch.Tensor],
+    weights: torch.Tensor,
+    cu_seqlen_ks: torch.Tensor,
+    cu_seqlen_ke: torch.Tensor,
+) -> torch.Tensor:
+    """Compute FP8 MQA logits for a single sequence without KV paging.
+
+    Args:
+        q: Query tensor of shape [M, H, D]. Casted to
+            `torch.float8_e4m3fn` by caller.
+        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
+            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or
+            [N, 1]) with dtype `torch.float32`.
+        weights: weights of shape [M, H], dtype `torch.float32`.
+        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
+            shape [M], dtype int32.
+        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
+            shape [M], dtype int32.
+
+    Returns:
+        Logits tensor of shape [M, N], dtype `torch.float32`.
+    """
+    _lazy_init()
+    if _fp8_mqa_logits_impl is None:
+        return _missing()
+    return _fp8_mqa_logits_impl(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)
+
+
+def get_paged_mqa_logits_metadata(
+    context_lens: torch.Tensor, block_size: int, num_sms: int
+) -> torch.Tensor:
+    """Build scheduling metadata for paged MQA logits.
+
+    Args:
+        context_lens: Tensor of shape [B], dtype int32; effective context length
+            per batch element.
+        block_size: KV-cache block size in tokens (e.g., 64).
+        num_sms: Number of SMs available. 132 for Hopper
+
+    Returns:
+        Backend-specific tensor consumed by `fp8_paged_mqa_logits` to
+        schedule work across SMs.
+    """
+    _lazy_init()
+    if _get_paged_mqa_logits_metadata_impl is None:
+        return _missing()
+    return _get_paged_mqa_logits_metadata_impl(context_lens, block_size, num_sms)
+
+
+def fp8_paged_mqa_logits(
+    q_fp8: torch.Tensor,
+    kv_cache_fp8: torch.Tensor,
+    weights: torch.Tensor,
+    context_lens: torch.Tensor,
+    block_tables: torch.Tensor,
+    schedule_metadata: torch.Tensor,
+    max_model_len: int,
+) -> torch.Tensor:
+    """Compute FP8 MQA logits using paged KV-cache.
+
+    Args:
+        q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to
+            `torch.float8_e4m3fn` by caller.
+        kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
+            [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
+            4 bytes per (block,pos) store the `float` dequant scale.
+        weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
+        context_lens: Tensor of shape [B], dtype int32; effective context length
+            for each batch element.
+        block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
+            block indices to physical blocks in the paged cache.
+        schedule_metadata: Returned by `get_paged_mqa_logits_metadata`;
+            used to distribute work across SMs.
+        max_model_len: Maximum sequence length used to size the logits output.
+
+    Returns:
+        Logits tensor of shape [B * next_n, max_model_len], dtype
+        `torch.float32`.
+    """
+    _lazy_init()
+    if _fp8_paged_mqa_logits_impl is None:
+        return _missing()
+    return _fp8_paged_mqa_logits_impl(
+        q_fp8,
+        kv_cache_fp8,
+        weights,
+        context_lens,
+        block_tables,
+        schedule_metadata,
+        max_model_len,
+        clean_logits=True,
+    )
 
 
 def fp8_mqa_logits(
@@ -261,15 +423,14 @@ DEFAULT_BLOCK_SIZE = [128, 128]
 # Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38
 @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def per_block_cast_to_fp8(
-        x: torch.Tensor,
-        block_size: list[int] = DEFAULT_BLOCK_SIZE,
-        use_ue8m0: bool = False) -> tuple[torch.Tensor, torch.Tensor]:
+    x: torch.Tensor, block_size: list[int] = DEFAULT_BLOCK_SIZE, use_ue8m0: bool = False
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert x.dim() == 2
     m, n = x.shape
     block_m, block_n = block_size
-    x_padded = torch.zeros((_align(m, block_m), _align(n, block_n)),
-                           dtype=x.dtype,
-                           device=x.device)
+    x_padded = torch.zeros(
+        (_align(m, block_m), _align(n, block_n)), dtype=x.dtype, device=x.device
+    )
     x_padded[:m, :n] = x
     x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
@@ -277,16 +438,17 @@ def per_block_cast_to_fp8(
     sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf
     x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
-        x_view.size(0), x_view.size(2))
+        x_view.size(0), x_view.size(2)
+    )
 
 
 def calc_diff(x: torch.Tensor, y: torch.Tensor):
     """Return a global difference metric for unit tests.
 
     DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
-    error, causing ``torch.testing.assert_close`` to fail.  Instead of checking
+    error, causing `torch.testing.assert_close` to fail.  Instead of checking
     every element, we compute a cosine-style similarity over the whole tensor
-    and report ``1 - sim``.  Once kernel accuracy improves this helper can be
+    and report `1 - sim`.  Once kernel accuracy improves this helper can be
     removed.
     """
 
@@ -296,10 +458,19 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
     return 1 - sim
 
 
-def should_use_deepgemm_for_fp8_linear(output_dtype: torch.dtype,
-                                       weight: torch.Tensor):
-    return (is_deep_gemm_supported() and output_dtype == torch.bfloat16
-            and weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0)
+def should_use_deepgemm_for_fp8_linear(
+    output_dtype: torch.dtype,
+    weight: torch.Tensor,
+    supports_deep_gemm: bool | None = None,
+):
+    if supports_deep_gemm is None:
+        supports_deep_gemm = is_deep_gemm_supported()
+    return (
+        supports_deep_gemm
+        and output_dtype == torch.bfloat16
+        and weight.shape[0] % 128 == 0
+        and weight.shape[1] % 128 == 0
+    )
 
 
 __all__ = [
@@ -316,4 +487,5 @@ __all__ = [
     "get_num_sms",
     "should_use_deepgemm_for_fp8_linear",
     "get_col_major_tma_aligned_tensor",
+    "get_mk_alignment_for_contiguous_layout",
 ]
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 734cd938792a95f5d04598b2d2c8036bae445730..1209d64901bf52f482488a7a1465324936a9d406 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -4,20 +4,24 @@
 
 Users of vLLM should always import **only** these wrappers.
 """
-from __future__ import annotations
 
 import contextlib
 import functools
 import importlib
 import importlib.util
 import os
-from typing import Any, Callable, NoReturn, Optional
+import shutil
+from collections.abc import Callable
+from typing import Any, NoReturn
 
 import requests
 import torch
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -31,12 +35,34 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get(
 )
 
 
+@functools.cache
+def has_flashinfer_cubin() -> bool:
+    """Return `True` if flashinfer-cubin package is available."""
+    if envs.VLLM_HAS_FLASHINFER_CUBIN:
+        return True
+    if importlib.util.find_spec("flashinfer_cubin") is not None:
+        return True
+    logger.debug_once("flashinfer-cubin package was not found")
+    return False
+
+
 @functools.cache
 def has_flashinfer() -> bool:
-    """Return ``True`` if FlashInfer is available."""
+    """Return `True` if flashinfer-python package is available."""
     # Use find_spec to check if the module exists without importing it
     # This avoids potential CUDA initialization side effects
-    return importlib.util.find_spec("flashinfer") is not None
+    if importlib.util.find_spec("flashinfer") is None:
+        logger.debug_once("FlashInfer unavailable since package was not found")
+        return False
+    # When not using flashinfer cubin,
+    # Also check if nvcc is available since it's required to JIT compile flashinfer
+    if not has_flashinfer_cubin() and shutil.which("nvcc") is None:
+        logger.debug_once(
+            "FlashInfer unavailable since nvcc was not found "
+            "and not using pre-downloaded cubins"
+        )
+        return False
+    return True
 
 
 def _missing(*_: Any, **__: Any) -> NoReturn:
@@ -44,7 +70,8 @@ def _missing(*_: Any, **__: Any) -> NoReturn:
     raise RuntimeError(
         "FlashInfer backend is not available. Please install the package "
         "to enable FlashInfer kernels: "
-        "https://github.com/flashinfer-ai/flashinfer")
+        "https://github.com/flashinfer-ai/flashinfer"
+    )
 
 
 def _get_submodule(module_name: str) -> Any | None:
@@ -56,9 +83,9 @@ def _get_submodule(module_name: str) -> Any | None:
 
 
 # General lazy import wrapper
-def _lazy_import_wrapper(module_name: str,
-                         attr_name: str,
-                         fallback_fn: Callable[..., Any] = _missing):
+def _lazy_import_wrapper(
+    module_name: str, attr_name: str, fallback_fn: Callable[..., Any] = _missing
+):
     """Create a lazy import wrapper for a specific function."""
 
     @functools.cache
@@ -79,34 +106,39 @@ def _lazy_import_wrapper(module_name: str,
 
 # Create lazy wrappers for each function
 flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper(
-    "flashinfer.fused_moe", "trtllm_fp8_block_scale_moe")
+    "flashinfer.fused_moe", "trtllm_fp8_block_scale_moe"
+)
 flashinfer_trtllm_fp8_per_tensor_scale_moe = _lazy_import_wrapper(
-    "flashinfer.fused_moe", "trtllm_fp8_per_tensor_scale_moe")
-flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe",
-                                                    "cutlass_fused_moe")
-fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
+    "flashinfer.fused_moe", "trtllm_fp8_per_tensor_scale_moe"
+)
+flashinfer_cutlass_fused_moe = _lazy_import_wrapper(
+    "flashinfer.fused_moe", "cutlass_fused_moe"
+)
+flashinfer_fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
 nvfp4_block_scale_interleave = _lazy_import_wrapper(
-    "flashinfer", "nvfp4_block_scale_interleave")
+    "flashinfer", "nvfp4_block_scale_interleave"
+)
 trtllm_fp4_block_scale_moe = _lazy_import_wrapper(
-    "flashinfer", "trtllm_fp4_block_scale_moe")
+    "flashinfer", "trtllm_fp4_block_scale_moe"
+)
 
 # Special case for autotune since it returns a context manager
 autotune = _lazy_import_wrapper(
     "flashinfer.autotuner",
     "autotune",
-    fallback_fn=lambda *args, **kwargs: contextlib.nullcontext())
+    fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
+)
 
 
 @functools.cache
 def has_flashinfer_comm() -> bool:
-    """Return ``True`` if FlashInfer comm module is available."""
-    return has_flashinfer() and importlib.util.find_spec(
-        "flashinfer.comm") is not None
+    """Return `True` if FlashInfer comm module is available."""
+    return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None
 
 
 @functools.cache
 def has_flashinfer_all2all() -> bool:
-    """Return ``True`` if FlashInfer mnnvl all2all is available."""
+    """Return `True` if FlashInfer mnnvl all2all is available."""
     if not has_flashinfer_comm():
         return False
 
@@ -127,14 +159,16 @@ def has_flashinfer_all2all() -> bool:
 
 @functools.cache
 def has_flashinfer_moe() -> bool:
-    """Return ``True`` if FlashInfer MoE module is available."""
-    return has_flashinfer() and importlib.util.find_spec(
-        "flashinfer.fused_moe") is not None
+    """Return `True` if FlashInfer MoE module is available."""
+    return (
+        has_flashinfer()
+        and importlib.util.find_spec("flashinfer.fused_moe") is not None
+    )
 
 
 @functools.cache
 def has_flashinfer_cutlass_fused_moe() -> bool:
-    """Return ``True`` if FlashInfer CUTLASS fused MoE is available."""
+    """Return `True` if FlashInfer CUTLASS fused MoE is available."""
     if not has_flashinfer_moe():
         return False
 
@@ -155,14 +189,13 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
 
 @functools.cache
 def has_nvidia_artifactory() -> bool:
-    """Return ``True`` if NVIDIA's artifactory is accessible.
+    """Return `True` if NVIDIA's artifactory is accessible.
 
     This checks connectivity to the kernel inference library artifactory
     which is required for downloading certain cubin kernels like TRTLLM FHMA.
     """
-    # Since FLASHINFER_CUBIN_DIR defines the pre-downloaded cubins path, when
-    # it's true, we could assume the cubins are available.
-    if envs.VLLM_HAS_FLASHINFER_CUBIN:
+    # If we have pre-downloaded cubins, we can assume the cubins are available.
+    if has_flashinfer_cubin():
         return True
 
     try:
@@ -174,7 +207,8 @@ def has_nvidia_artifactory() -> bool:
         else:
             logger.warning_once(
                 "NVIDIA artifactory returned failed status code: %d",
-                response.status_code)
+                response.status_code,
+            )
         return accessible
     except Exception as e:
         logger.warning_once("Failed to connect to NVIDIA artifactory: %s", e)
@@ -184,33 +218,38 @@ def has_nvidia_artifactory() -> bool:
 @functools.cache
 def supports_trtllm_attention() -> bool:
     """
-    TRTLLM attention is supported if the platform is SM100 and
-    NVIDIA artifactory is accessible
+    TRTLLM attention is supported if the platform is SM100,
+    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
     """
+    # Batch-invariant mode disables TRTLLM attention
+    if vllm_is_batch_invariant():
+        return False
+
     # Requires SM100 and NVIDIA artifactory to be accessible to download cubins
-    return current_platform.is_device_capability(
-        100) and has_nvidia_artifactory()
+    return current_platform.is_device_capability(100) and has_nvidia_artifactory()
 
 
 @functools.cache
-def _force_use_trtllm_attention(env_value: Optional[bool]) -> Optional[bool]:
+def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:
     """Cache the env value for VLLM_USE_TRTLLM_ATTENTION"""
     if env_value is not None:
         logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value)
     return env_value
 
 
-def force_use_trtllm_attention() -> Optional[bool]:
+def force_use_trtllm_attention() -> bool | None:
     """
-    Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set,
-    return ``True`` if TRTLLM attention is forced to be used,
-    return ``False`` if TRTLLM attention is forced to be not used.
+    Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
+    return `True` if TRTLLM attention is forced to be used,
+    return `False` if TRTLLM attention is forced to be not used.
     """
     return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
 
 
 def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool:
     """Check if the current configuration supports TRTLLM attention."""
+    if force_use_trtllm_attention() is False:
+        return False
     has_trtllm = supports_trtllm_attention()
     return has_trtllm and (num_qo_heads % num_kv_heads == 0)
 
@@ -220,25 +259,35 @@ def use_trtllm_attention(
     num_kv_heads: int,
     num_tokens: int,
     max_seq_len: int,
+    dcp_world_size: int,
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
-    """Return ``True`` if TRTLLM attention is used."""
+    """Return `True` if TRTLLM attention is used."""
     force_use_trtllm = force_use_trtllm_attention()
 
     # Environment variable is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
         return False
 
+    # Decode context parallel is not supported
+    if dcp_world_size > 1:
+        logger.warning_once(
+            "Trtllm does not support returning LSE and as a result "
+            "does not support DCP, reverting to FlashInfer"
+        )
+        return False
+
     # The platform is not supported
     if not supports_trtllm_attention():
         if force_use_trtllm:
             logger.warning_once(
                 "TRTLLM attention is not supported on this platform, "
-                "but VLLM_USE_TRTLLM_ATTENTION is set to 1")
+                "but VLLM_USE_TRTLLM_ATTENTION is set to 1"
+            )
         return False
 
     # The combination of query and key heads is not supported
@@ -252,37 +301,36 @@ def use_trtllm_attention(
 
     if has_spec and not is_prefill:
         # Speculative decoding requires TRTLLM attention for decodes
-        logger.info_once(
-            "Using TRTLLM attention (enabled for speculative decoding).")
+        logger.info_once("Using TRTLLM attention (enabled for speculative decoding).")
         return True
 
     # Must use TRTLLM attention if query is FP8 quantized
     if q_dtype == current_platform.fp8_dtype():
-        if has_sinks:
-            raise RuntimeError(
-                "TRTLLM FP8-qkv kernel is not supported for attention sinks. "
-                "Use kv_cache_dtype=auto for now.")
         logger.info_once("Using TRTLLM attention (query is quantized).")
         return True
 
     # If sinks are being used, we must use TRTLLM attention as it's
     # the only backend that supports them
     if has_sinks:
-        logger.info_once(
-            "Using TRTLLM attention (required for attention sinks).")
+        logger.info_once("Using TRTLLM attention (required for attention sinks).")
         return True
 
     if force_use_trtllm is None:
         # Environment variable not set - use auto-detection
-        use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072
-                      and kv_cache_dtype == "auto")
-        if use_trtllm:
-            logger.warning_once("Using TRTLLM attention (auto-detected).")
+        if is_prefill:
+            # Prefill auto-detection
+            use_trtllm = kv_cache_dtype == "auto"
+            if use_trtllm:
+                logger.warning_once("Using TRTLLM prefill attention (auto-detected).")
+        else:
+            # Decode auto-detection
+            use_trtllm = num_tokens <= 256 and kv_cache_dtype == "auto"
+            if use_trtllm:
+                logger.warning_once("Using TRTLLM decode attention (auto-detected).")
         return use_trtllm
 
     # Environment variable is set to 1 - respect it
-    logger.info_once(
-        "Using TRTLLM attention (VLLM_USE_TRTLLM_ATTENTION is set to 1)")
+    logger.info_once("Using TRTLLM attention (VLLM_USE_TRTLLM_ATTENTION is set to 1)")
     return True
 
 
@@ -303,16 +351,14 @@ if has_flashinfer():
         backend: str,
     ) -> torch.Tensor:
         from flashinfer import mm_fp4 as flashinfer_mm_fp4_
-        return flashinfer_mm_fp4_(A,
-                                  B,
-                                  A_scale,
-                                  B_scale,
-                                  g_scale,
-                                  dtype,
-                                  block_size=16,
-                                  backend=backend)
-
-    @torch.library.register_fake("vllm::flashinfer_mm_fp4", )
+
+        return flashinfer_mm_fp4_(
+            A, B, A_scale, B_scale, g_scale, dtype, block_size=16, backend=backend
+        )
+
+    @torch.library.register_fake(
+        "vllm::flashinfer_mm_fp4",
+    )
     def flashinfer_mm_fp4_fake(
         A: torch.Tensor,
         B: torch.Tensor,
@@ -322,10 +368,7 @@ if has_flashinfer():
         dtype: torch.dtype,
         backend: str,
     ) -> torch.Tensor:
-        return torch.empty(A.shape[0],
-                           B.shape[1],
-                           dtype=dtype,
-                           device=A.device)
+        return torch.empty(A.shape[0], B.shape[1], dtype=dtype, device=A.device)
 
     @torch.library.custom_op(
         "vllm::bmm_fp8",
@@ -341,9 +384,12 @@ if has_flashinfer():
         backend: str,
     ) -> torch.Tensor:
         from flashinfer import bmm_fp8 as bmm_fp8_
+
         return bmm_fp8_(A, B, A_scale, B_scale, dtype, None, backend)
 
-    @torch.library.register_fake("vllm::bmm_fp8", )
+    @torch.library.register_fake(
+        "vllm::bmm_fp8",
+    )
     def bmm_fp8_fake(
         A: torch.Tensor,
         B: torch.Tensor,
@@ -352,24 +398,24 @@ if has_flashinfer():
         dtype: torch.dtype,
         backend: str,
     ) -> torch.Tensor:
-        return torch.empty(A.shape[0],
-                           A.shape[1],
-                           B.shape[2],
-                           dtype=dtype,
-                           device=A.device)
-
-
-def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
-                             block_scale_a: torch.Tensor,
-                             block_scale_b: torch.Tensor, alpha: torch.Tensor,
-                             out_dtype: torch.dtype,
-                             backend: str) -> torch.Tensor:
+        return torch.empty(
+            A.shape[0], A.shape[1], B.shape[2], dtype=dtype, device=A.device
+        )
+
+
+def flashinfer_scaled_fp4_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    block_scale_a: torch.Tensor,
+    block_scale_b: torch.Tensor,
+    alpha: torch.Tensor,
+    out_dtype: torch.dtype,
+    backend: str,
+) -> torch.Tensor:
     assert a.ndim == 2 and b.ndim == 2
     assert block_scale_a.ndim == 2 and block_scale_b.ndim == 2
     assert a.stride(-1) == 1 and b.stride(-1) == 1
     assert a.shape[1] == b.shape[1]
-    assert block_scale_a.shape[1] == a.shape[1] // 8
-    assert block_scale_b.shape[1] == b.shape[1] // 8
 
     if backend == "cutlass":
         block_scale_a = block_scale_a.view(torch.uint8)
@@ -387,12 +433,13 @@ def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
 
 
 def flashinfer_scaled_fp8_mm(
-        a: torch.Tensor,
-        b: torch.Tensor,
-        scale_a: torch.Tensor,
-        scale_b: torch.Tensor,
-        out_dtype: torch.dtype,
-        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: torch.dtype,
+    bias: torch.Tensor | None = None,
+) -> torch.Tensor:
     assert a.ndim == 2 and b.ndim == 2
     assert a.shape[1] == b.shape[0]
     assert scale_a.numel() == 1 and scale_b.numel() == 1
@@ -425,7 +472,7 @@ __all__ = [
     "has_flashinfer",
     "flashinfer_trtllm_fp8_block_scale_moe",
     "flashinfer_cutlass_fused_moe",
-    "fp4_quantize",
+    "flashinfer_fp4_quantize",
     "nvfp4_block_scale_interleave",
     "trtllm_fp4_block_scale_moe",
     "autotune",
diff --git a/vllm/utils/func_utils.py b/vllm/utils/func_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c061a0dad5525ec9197d9153f7b3f8490c8c310d
--- /dev/null
+++ b/vllm/utils/func_utils.py
@@ -0,0 +1,236 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Contains helpers that are applied to functions.
+
+This is similar in concept to the `functools` module.
+"""
+
+import inspect
+import threading
+import warnings
+from collections.abc import Callable, Mapping
+from functools import lru_cache, partial, wraps
+from typing import Any, TypeVar
+
+from typing_extensions import ParamSpec
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+P = ParamSpec("P")
+T = TypeVar("T")
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def identity(value: T, **kwargs) -> T:
+    """Returns the first provided value."""
+    return value
+
+
+def run_once(f: Callable[P, None]) -> Callable[P, None]:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
+        if wrapper.has_run:  # type: ignore[attr-defined]
+            return
+
+        with wrapper.lock:  # type: ignore[attr-defined]
+            if not wrapper.has_run:  # type: ignore[attr-defined]
+                wrapper.has_run = True  # type: ignore[attr-defined]
+                return f(*args, **kwargs)
+
+    wrapper.has_run = False  # type: ignore[attr-defined]
+    wrapper.lock = threading.Lock()  # type: ignore[attr-defined]
+    return wrapper
+
+
+def deprecate_args(
+    start_index: int,
+    is_deprecated: bool | Callable[[], bool] = True,
+    additional_message: str | None = None,
+) -> Callable[[F], F]:
+    if not callable(is_deprecated):
+        is_deprecated = partial(identity, is_deprecated)
+
+    def wrapper(fn: F) -> F:
+        params = inspect.signature(fn).parameters
+        pos_types = (
+            inspect.Parameter.POSITIONAL_ONLY,
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        )
+        pos_kws = [kw for kw, param in params.items() if param.kind in pos_types]
+
+        @wraps(fn)
+        def inner(*args, **kwargs):
+            if is_deprecated():
+                deprecated_args = pos_kws[start_index : len(args)]
+                if deprecated_args:
+                    msg = (
+                        f"The positional arguments {deprecated_args} are "
+                        "deprecated and will be removed in a future update."
+                    )
+                    if additional_message is not None:
+                        msg += f" {additional_message}"
+
+                    warnings.warn(
+                        DeprecationWarning(msg),
+                        stacklevel=3,  # The inner function takes up one level
+                    )
+
+            return fn(*args, **kwargs)
+
+        return inner  # type: ignore
+
+    return wrapper
+
+
+def deprecate_kwargs(
+    *kws: str,
+    is_deprecated: bool | Callable[[], bool] = True,
+    additional_message: str | None = None,
+) -> Callable[[F], F]:
+    deprecated_kws = set(kws)
+
+    if not callable(is_deprecated):
+        is_deprecated = partial(identity, is_deprecated)
+
+    def wrapper(fn: F) -> F:
+        @wraps(fn)
+        def inner(*args, **kwargs):
+            if is_deprecated():
+                deprecated_kwargs = kwargs.keys() & deprecated_kws
+                if deprecated_kwargs:
+                    msg = (
+                        f"The keyword arguments {deprecated_kwargs} are "
+                        "deprecated and will be removed in a future update."
+                    )
+                    if additional_message is not None:
+                        msg += f" {additional_message}"
+
+                    warnings.warn(
+                        DeprecationWarning(msg),
+                        stacklevel=3,  # The inner function takes up one level
+                    )
+
+            return fn(*args, **kwargs)
+
+        return inner  # type: ignore
+
+    return wrapper
+
+
+@lru_cache
+def supports_kw(
+    callable: Callable[..., object],
+    kw_name: str,
+    *,
+    requires_kw_only: bool = False,
+    allow_var_kwargs: bool = True,
+) -> bool:
+    """Check if a keyword is a valid kwarg for a callable; if requires_kw_only
+    disallows kwargs names that can also be positional arguments.
+    """
+    params = inspect.signature(callable).parameters
+    if not params:
+        return False
+
+    param_val = params.get(kw_name)
+
+    # Types where the it may be valid, i.e., explicitly defined & nonvariadic
+    passable_kw_types = set(
+        (
+            inspect.Parameter.POSITIONAL_ONLY,
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+            inspect.Parameter.KEYWORD_ONLY,
+        )
+    )
+
+    if param_val:
+        is_sig_param = param_val.kind in passable_kw_types
+        # We want kwargs only, but this is passable as a positional arg
+        if (
+            requires_kw_only
+            and is_sig_param
+            and param_val.kind != inspect.Parameter.KEYWORD_ONLY
+        ):
+            return False
+        if (requires_kw_only and param_val.kind == inspect.Parameter.KEYWORD_ONLY) or (
+            not requires_kw_only and is_sig_param
+        ):
+            return True
+
+    # If we're okay with var-kwargs, it's supported as long as
+    # the kw_name isn't something like *args, **kwargs
+    if allow_var_kwargs:
+        # Get the last param; type is ignored here because params is a proxy
+        # mapping, but it wraps an ordered dict, and they appear in order.
+        # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters
+        last_param = params[next(reversed(params))]  # type: ignore
+        return (
+            last_param.kind == inspect.Parameter.VAR_KEYWORD
+            and last_param.name != kw_name
+        )
+
+    return False
+
+
+def get_allowed_kwarg_only_overrides(
+    callable: Callable[..., object],
+    overrides: Mapping[str, object] | None,
+    *,
+    requires_kw_only: bool = True,
+    allow_var_kwargs: bool = False,
+) -> dict[str, Any]:
+    """
+    Given a callable which has one or more keyword only params and a dict
+    mapping param names to values, drop values that can be not be kwarg
+    expanded to overwrite one or more keyword-only args. This is used in a
+    few places to handle custom processor overrides for multimodal models,
+    e.g., for profiling when processor options provided by the user
+    may affect the number of mm tokens per instance.
+
+    Args:
+        callable: Callable which takes 0 or more keyword only arguments.
+                  If None is provided, all overrides names are allowed.
+        overrides: Potential overrides to be used when invoking the callable.
+        allow_var_kwargs: Allows overrides that are expandable for var kwargs.
+
+    Returns:
+        Dictionary containing the kwargs to be leveraged which may be used
+        to overwrite one or more keyword only arguments when invoking the
+        callable.
+    """
+    if not overrides:
+        return {}
+
+    # Drop any mm_processor_kwargs provided by the user that
+    # are not kwargs, unless it can fit it var_kwargs param
+    filtered_overrides = {
+        kwarg_name: val
+        for kwarg_name, val in overrides.items()
+        if supports_kw(
+            callable,
+            kwarg_name,
+            requires_kw_only=requires_kw_only,
+            allow_var_kwargs=allow_var_kwargs,
+        )
+    }
+
+    # If anything is dropped, log a warning
+    dropped_keys = overrides.keys() - filtered_overrides.keys()
+    if dropped_keys:
+        if requires_kw_only:
+            logger.warning(
+                "The following intended overrides are not keyword-only args "
+                "and will be dropped: %s",
+                dropped_keys,
+            )
+        else:
+            logger.warning(
+                "The following intended overrides are not keyword args "
+                "and will be dropped: %s",
+                dropped_keys,
+            )
+
+    return filtered_overrides
diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..160ac9ac263a983f57e4d0f5f2b154081eee89ad
--- /dev/null
+++ b/vllm/utils/gc_utils.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+import json
+import time
+from collections import Counter
+from contextlib import suppress
+from typing import Any
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class GCDebugConfig:
+    """
+    Config for GC Debugger.
+    - 0: disable GC debugger
+    - 1: enable GC debugger with gc.collect elpased times
+    - '{"top_objects":5}': enable GC debugger with top 5 collected objects
+    """
+
+    def __init__(self, gc_debug_conf: str | None = None) -> None:
+        self.enabled: bool = False
+        self.top_objects: int = -1
+
+        if not gc_debug_conf or gc_debug_conf == "0":
+            pass
+        elif gc_debug_conf == "1":
+            self.enabled = True
+        else:
+            try:
+                json_conf = json.loads(gc_debug_conf)
+                self.enabled = True
+                self.top_objects = json_conf.get("top_objects", -1)
+            except Exception:
+                self.enabled = False
+                logger.error("Failed to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG)
+        logger.debug("GC Debug Config. %s", str(self))
+
+    def __repr__(self) -> str:
+        return f"enabled:{self.enabled},top_objects:{self.top_objects}"
+
+
+class GCDebugger:
+    """
+    Debugger for GC which logs helpful information for GC understanding.
+    To enable, you should call maybe_attach_gc_debug_callback in the process.
+    """
+
+    def __init__(self, config: GCDebugConfig) -> None:
+        self.config = config
+        # Start time in micro second of this GC cycle
+        self.start_time_ns: int = time.monotonic_ns()
+        # If config.top_objects is positive,
+        # compute top collected objects by object types
+        self.gc_top_collected_objects: str = ""
+
+    def handle(self, phase: str, info: dict[str, int]) -> None:
+        """
+        Handles a GC event (e.g. GC start or GC finish)
+        """
+        generation = info.get("generation")
+        if generation is None:
+            return
+        if phase == "start":
+            # Before GC started, record GC start time
+            # and top collected objects
+            self.start_time_ns = time.monotonic_ns()
+            self.gc_top_collected_objects = _compute_top_gc_collected_objects(
+                gc.get_objects(generation), self.config.top_objects
+            )
+        elif phase == "stop":
+            # After GC finished, Record GC elapsed time and
+            # optionally top collected objects
+            elpased_ms = (time.monotonic_ns() - self.start_time_ns) / 1e6
+            logger.info(
+                "GC took %.3fms to complete. "
+                "Collected %s objects in GC generation %d.%s",
+                elpased_ms,
+                str(info.get("collected", "?")),
+                generation,
+                (
+                    f" Top collected objects: \n{self.gc_top_collected_objects}"
+                    if self.gc_top_collected_objects
+                    else ""
+                ),
+            )
+
+
+def freeze_gc_heap() -> None:
+    """
+    Freeze all objects tracked by the garbage collector. It should be invoked
+    after server init / warmup, to reduce GC overhead from static objects
+    during serving time.
+    """
+    # Ensure all static objects are pushed down to the oldest generation for
+    # freeze
+    gc.collect(0)
+    gc.collect(1)
+    gc.collect(2)
+    # Freeze all GC tracked objects
+    gc.freeze()
+
+
+def maybe_attach_gc_debug_callback() -> None:
+    """
+    Attached a callback for GC debug when VLLM_GC_DEBUG is enabled.
+    """
+    config = GCDebugConfig(envs.VLLM_GC_DEBUG)
+    if config.enabled:
+        debugger: GCDebugger = GCDebugger(config)
+
+        def gc_callback(phase: str, info: dict[str, int]) -> None:
+            debugger.handle(phase, info)
+
+        gc.callbacks.append(gc_callback)
+
+
+def _compute_detailed_type(o: Any) -> str:
+    """
+    Detailed object type.
+
+    TODO(Jialin): Further enhance the detailed type with element types for
+    easier debugging. We tried but occasionally it would run into signals
+    which kills the engine.
+    """
+    size_str: str = ""
+    # Object doesn't support len() - this can happen with type objects
+    # or other objects that don't implement __len__ properly
+    with suppress(Exception):
+        size_str = f"(size:{len(o)})"
+    return f"{str(type(o))}{size_str}"
+
+
+def _compute_top_gc_collected_objects(objects: list[Any], top: int) -> str:
+    """
+    Group collected objects by types.
+    """
+    if top <= 0:
+        return ""
+    object_types = [_compute_detailed_type(o) for o in objects]
+    return "\n".join(
+        f"{count:>5}:{object_type}"
+        for object_type, count in Counter(object_types).most_common(top)
+    )
diff --git a/vllm/utils/hashing.py b/vllm/utils/hashing.py
new file mode 100644
index 0000000000000000000000000000000000000000..49f4f13d115f3a09207b423a6c6690f2a50308ad
--- /dev/null
+++ b/vllm/utils/hashing.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import hashlib
+import pickle
+from collections.abc import Callable
+from typing import Any
+
+import cbor2
+
+
+def sha256(input: Any) -> bytes:
+    """Hash any picklable Python object using SHA-256.
+
+    The input is serialized using pickle before hashing, which allows
+    arbitrary Python objects to be used. Note that this function does
+    not use a hash seed—if you need one, prepend it explicitly to the input.
+
+    Args:
+        input: Any picklable Python object.
+
+    Returns:
+        Bytes representing the SHA-256 hash of the serialized input.
+    """
+    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
+    return hashlib.sha256(input_bytes).digest()
+
+
+def sha256_cbor(input: Any) -> bytes:
+    """Hash objects using CBOR serialization and SHA-256.
+
+    This option is useful for non-Python-dependent serialization and hashing.
+
+    Args:
+        input: Object to be serialized and hashed. Supported types include
+            basic Python types and complex structures like lists, tuples, and
+            dictionaries.
+            Custom classes must implement CBOR serialization methods.
+
+    Returns:
+        Bytes representing the SHA-256 hash of the CBOR serialized input.
+    """
+    input_bytes = cbor2.dumps(input, canonical=True)
+    return hashlib.sha256(input_bytes).digest()
+
+
+def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
+    """Get a hash function by name, or raise an error if the function is not found.
+
+    Args:
+        hash_fn_name: Name of the hash function.
+
+    Returns:
+        A hash function.
+    """
+    if hash_fn_name == "sha256":
+        return sha256
+    if hash_fn_name == "sha256_cbor":
+        return sha256_cbor
+
+    raise ValueError(f"Unsupported hash function: {hash_fn_name}")
diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f01d2c7a6a33d779586313b7ac4a1ccec79545f4
--- /dev/null
+++ b/vllm/utils/import_utils.py
@@ -0,0 +1,411 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Contains helpers related to importing modules.
+
+This is similar in concept to the `importlib` module.
+"""
+
+import importlib.metadata
+import importlib.util
+import os
+import sys
+from functools import cache
+from types import ModuleType
+from typing import Any
+
+import regex as re
+from typing_extensions import Never
+
+
+# TODO: This function can be removed if transformer_modules classes are
+# serialized by value when communicating between processes
+def init_cached_hf_modules() -> None:
+    """
+    Lazy initialization of the Hugging Face modules.
+    """
+    from transformers.dynamic_module_utils import init_hf_modules
+
+    init_hf_modules()
+
+
+def import_pynvml():
+    """
+    Historical comments:
+
+    libnvml.so is the library behind nvidia-smi, and
+    pynvml is a Python wrapper around it. We use it to get GPU
+    status without initializing CUDA context in the current process.
+    Historically, there are two packages that provide pynvml:
+    - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official
+        wrapper. It is a dependency of vLLM, and is installed when users
+        install vLLM. It provides a Python module named `pynvml`.
+    - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper.
+        Prior to version 12.0, it also provides a Python module `pynvml`,
+        and therefore conflicts with the official one. What's worse,
+        the module is a Python package, and has higher priority than
+        the official one which is a standalone Python file.
+        This causes errors when both of them are installed.
+        Starting from version 12.0, it migrates to a new module
+        named `pynvml_utils` to avoid the conflict.
+    It is so confusing that many packages in the community use the
+    unofficial one by mistake, and we have to handle this case.
+    For example, `nvcr.io/nvidia/pytorch:24.12-py3` uses the unofficial
+    one, and it will cause errors, see the issue
+    https://github.com/vllm-project/vllm/issues/12847 for example.
+    After all the troubles, we decide to copy the official `pynvml`
+    module to our codebase, and use it directly.
+    """
+    import vllm.third_party.pynvml as pynvml
+
+    return pynvml
+
+
+def import_from_path(module_name: str, file_path: str | os.PathLike):
+    """
+    Import a Python file according to its file path.
+
+    Based on the official recipe:
+    https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
+    """
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ModuleNotFoundError(f"No module named {module_name!r}")
+
+    assert spec.loader is not None
+
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def resolve_obj_by_qualname(qualname: str) -> Any:
+    """
+    Resolve an object by its fully-qualified class name.
+    """
+    module_name, obj_name = qualname.rsplit(".", 1)
+    module = importlib.import_module(module_name)
+    return getattr(module, obj_name)
+
+
+@cache
+def get_vllm_optional_dependencies():
+    metadata = importlib.metadata.metadata("vllm")
+    requirements = metadata.get_all("Requires-Dist", [])
+    extras = metadata.get_all("Provides-Extra", [])
+
+    return {
+        extra: [
+            re.split(r";|>=|<=|==", req)[0]
+            for req in requirements
+            if req.endswith(f'extra == "{extra}"')
+        ]
+        for extra in extras
+    }
+
+
+class _PlaceholderBase:
+    """
+    Disallows downstream usage of placeholder modules.
+
+    We need to explicitly override each dunder method because
+    [`__getattr__`][vllm.utils.import_utils._PlaceholderBase.__getattr__]
+    is not called when they are accessed.
+
+    Info:
+        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
+    """
+
+    def __getattr__(self, key: str) -> Never:
+        """
+        The main class should implement this to throw an error
+        for attribute accesses representing downstream usage.
+        """
+        raise NotImplementedError
+
+    # [Basic customization]
+
+    def __lt__(self, other: object):
+        return self.__getattr__("__lt__")
+
+    def __le__(self, other: object):
+        return self.__getattr__("__le__")
+
+    def __eq__(self, other: object):
+        return self.__getattr__("__eq__")
+
+    def __ne__(self, other: object):
+        return self.__getattr__("__ne__")
+
+    def __gt__(self, other: object):
+        return self.__getattr__("__gt__")
+
+    def __ge__(self, other: object):
+        return self.__getattr__("__ge__")
+
+    def __hash__(self):
+        return self.__getattr__("__hash__")
+
+    def __bool__(self):
+        return self.__getattr__("__bool__")
+
+    # [Callable objects]
+
+    def __call__(self, *args: object, **kwargs: object):
+        return self.__getattr__("__call__")
+
+    # [Container types]
+
+    def __len__(self):
+        return self.__getattr__("__len__")
+
+    def __getitem__(self, key: object):
+        return self.__getattr__("__getitem__")
+
+    def __setitem__(self, key: object, value: object):
+        return self.__getattr__("__setitem__")
+
+    def __delitem__(self, key: object):
+        return self.__getattr__("__delitem__")
+
+    # __missing__ is optional according to __getitem__ specification,
+    # so it is skipped
+
+    # __iter__ and __reversed__ have a default implementation
+    # based on __len__ and __getitem__, so they are skipped.
+
+    # [Numeric Types]
+
+    def __add__(self, other: object):
+        return self.__getattr__("__add__")
+
+    def __sub__(self, other: object):
+        return self.__getattr__("__sub__")
+
+    def __mul__(self, other: object):
+        return self.__getattr__("__mul__")
+
+    def __matmul__(self, other: object):
+        return self.__getattr__("__matmul__")
+
+    def __truediv__(self, other: object):
+        return self.__getattr__("__truediv__")
+
+    def __floordiv__(self, other: object):
+        return self.__getattr__("__floordiv__")
+
+    def __mod__(self, other: object):
+        return self.__getattr__("__mod__")
+
+    def __divmod__(self, other: object):
+        return self.__getattr__("__divmod__")
+
+    def __pow__(self, other: object, modulo: object = ...):
+        return self.__getattr__("__pow__")
+
+    def __lshift__(self, other: object):
+        return self.__getattr__("__lshift__")
+
+    def __rshift__(self, other: object):
+        return self.__getattr__("__rshift__")
+
+    def __and__(self, other: object):
+        return self.__getattr__("__and__")
+
+    def __xor__(self, other: object):
+        return self.__getattr__("__xor__")
+
+    def __or__(self, other: object):
+        return self.__getattr__("__or__")
+
+    # r* and i* methods have lower priority than
+    # the methods for left operand so they are skipped
+
+    def __neg__(self):
+        return self.__getattr__("__neg__")
+
+    def __pos__(self):
+        return self.__getattr__("__pos__")
+
+    def __abs__(self):
+        return self.__getattr__("__abs__")
+
+    def __invert__(self):
+        return self.__getattr__("__invert__")
+
+    # __complex__, __int__ and __float__ have a default implementation
+    # based on __index__, so they are skipped.
+
+    def __index__(self):
+        return self.__getattr__("__index__")
+
+    def __round__(self, ndigits: object = ...):
+        return self.__getattr__("__round__")
+
+    def __trunc__(self):
+        return self.__getattr__("__trunc__")
+
+    def __floor__(self):
+        return self.__getattr__("__floor__")
+
+    def __ceil__(self):
+        return self.__getattr__("__ceil__")
+
+    # [Context managers]
+
+    def __enter__(self):
+        return self.__getattr__("__enter__")
+
+    def __exit__(self, *args: object, **kwargs: object):
+        return self.__getattr__("__exit__")
+
+
+class PlaceholderModule(_PlaceholderBase):
+    """
+    A placeholder object to use when a module does not exist.
+
+    This enables more informative errors when trying to access attributes
+    of a module that does not exist.
+    """
+
+    def __init__(self, name: str) -> None:
+        super().__init__()
+
+        # Apply name mangling to avoid conflicting with module attributes
+        self.__name = name
+
+    def placeholder_attr(self, attr_path: str):
+        return _PlaceholderModuleAttr(self, attr_path)
+
+    def __getattr__(self, key: str) -> Never:
+        name = self.__name
+
+        try:
+            importlib.import_module(name)
+        except ImportError as exc:
+            for extra, names in get_vllm_optional_dependencies().items():
+                if name in names:
+                    msg = f"Please install vllm[{extra}] for {extra} support"
+                    raise ImportError(msg) from exc
+
+            raise exc
+
+        raise AssertionError(
+            "PlaceholderModule should not be used "
+            "when the original module can be imported"
+        )
+
+
+class _PlaceholderModuleAttr(_PlaceholderBase):
+    def __init__(self, module: PlaceholderModule, attr_path: str) -> None:
+        super().__init__()
+
+        # Apply name mangling to avoid conflicting with module attributes
+        self.__module = module
+        self.__attr_path = attr_path
+
+    def placeholder_attr(self, attr_path: str):
+        return _PlaceholderModuleAttr(self.__module, f"{self.__attr_path}.{attr_path}")
+
+    def __getattr__(self, key: str) -> Never:
+        getattr(self.__module, f"{self.__attr_path}.{key}")
+
+        raise AssertionError(
+            "PlaceholderModule should not be used "
+            "when the original module can be imported"
+        )
+
+
+class LazyLoader(ModuleType):
+    """
+    `LazyLoader` module borrowed from [Tensorflow]
+    (https://github.com/tensorflow/tensorflow/blob/main/tensorflow/python/util/lazy_loader.py)
+    with an addition of "module caching".
+
+    Lazily import a module, mainly to avoid pulling in large dependencies.
+    Modules such as `xgrammar` might do additional side effects, so we
+    only want to use this when it is needed, delaying all eager effects.
+    """
+
+    def __init__(
+        self,
+        local_name: str,
+        parent_module_globals: dict[str, Any],
+        name: str,
+    ):
+        self._local_name = local_name
+        self._parent_module_globals = parent_module_globals
+        self._module: ModuleType | None = None
+
+        super().__init__(str(name))
+
+    def _load(self) -> ModuleType:
+        # Import the target module and insert it into the parent's namespace
+        try:
+            module = importlib.import_module(self.__name__)
+            self._parent_module_globals[self._local_name] = module
+            # The additional add to sys.modules
+            # ensures library is actually loaded.
+            sys.modules[self._local_name] = module
+        except ModuleNotFoundError as err:
+            raise err from None
+
+        # Update this object's dict so that if someone keeps a
+        # reference to the LazyLoader, lookups are efficient
+        # (__getattr__ is only called on lookups that fail).
+        self.__dict__.update(module.__dict__)
+        return module
+
+    def __getattr__(self, item: Any) -> Any:
+        if self._module is None:
+            self._module = self._load()
+        return getattr(self._module, item)
+
+    def __dir__(self) -> list[str]:
+        if self._module is None:
+            self._module = self._load()
+        return dir(self._module)
+
+
+# Optional dependency detection utilities
+@cache
+def _has_module(module_name: str) -> bool:
+    """Return True if *module_name* can be found in the current environment.
+
+    The result is cached so that subsequent queries for the same module incur
+    no additional overhead.
+    """
+    return importlib.util.find_spec(module_name) is not None
+
+
+def has_pplx() -> bool:
+    """Whether the optional `pplx_kernels` package is available."""
+    return _has_module("pplx_kernels")
+
+
+def has_deep_ep() -> bool:
+    """Whether the optional `deep_ep` package is available."""
+    return _has_module("deep_ep")
+
+
+def has_deep_gemm() -> bool:
+    """Whether the optional `deep_gemm` package is available."""
+    return _has_module("deep_gemm")
+
+
+def has_triton_kernels() -> bool:
+    """Whether the optional `triton_kernels` package is available."""
+    return _has_module("triton_kernels")
+
+
+def has_tilelang() -> bool:
+    """Whether the optional `tilelang` package is available."""
+    return _has_module("tilelang")
+
+
+def has_arctic_inference() -> bool:
+    """Whether the optional `arctic_inference` package is available."""
+
+    return _has_module("arctic_inference")
diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py
index 7eb58b5f5cf84a4fd7d0920463a422eac193653f..cde9aa6ff901c7699a66ae8ba8ae2b913b00d41a 100644
--- a/vllm/utils/jsontree.py
+++ b/vllm/utils/jsontree.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helper functions to work with nested JSON structures."""
 
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from functools import reduce
-from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast, overload
+from typing import TYPE_CHECKING, TypeAlias, TypeVar, cast, overload
 
 if TYPE_CHECKING:
     import torch
@@ -14,23 +14,20 @@ if TYPE_CHECKING:
 _T = TypeVar("_T")
 _U = TypeVar("_U")
 
-JSONTree = Union[
-    dict[str, "JSONTree[_T]"],
-    list["JSONTree[_T]"],
-    tuple["JSONTree[_T]", ...],
-    _T,
-]
+JSONTree: TypeAlias = (
+    dict[str, "JSONTree[_T]"] | list["JSONTree[_T]"] | tuple["JSONTree[_T]", ...] | _T
+)
 """A nested JSON structure where the leaves need not be JSON-serializable."""
 
-_JSONTree = Union[
-    dict[str, "JSONTree[_T]"],
-    list["JSONTree[_T]"],
-    tuple["JSONTree[_T]", ...],
-    dict[str, _T],
-    list[_T],
-    tuple[_T, ...],
-    _T,
-]
+_JSONTree: TypeAlias = (
+    dict[str, "JSONTree[_T]"]
+    | list["JSONTree[_T]"]
+    | tuple["JSONTree[_T]", ...]
+    | dict[str, _T]
+    | list[_T]
+    | tuple[_T, ...]
+    | _T
+)
 """
 Same as `JSONTree` but with additional `Union` members to satisfy overloads.
 """
@@ -52,46 +49,41 @@ def json_iter_leaves(value: JSONTree[_T]) -> Iterable[_T]:
 def json_map_leaves(
     func: Callable[["torch.Tensor"], "torch.Tensor"],
     value: "BatchedTensorInputs",
-) -> "BatchedTensorInputs":
-    ...
+) -> "BatchedTensorInputs": ...
 
 
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union[_T, dict[str, _T]],
-) -> Union[_U, dict[str, _U]]:
-    ...
+    value: _T | dict[str, _T],
+) -> _U | dict[str, _U]: ...
 
 
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union[_T, list[_T]],
-) -> Union[_U, list[_U]]:
-    ...
+    value: _T | list[_T],
+) -> _U | list[_U]: ...
 
 
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union[_T, tuple[_T, ...]],
-) -> Union[_U, tuple[_U, ...]]:
-    ...
+    value: _T | tuple[_T, ...],
+) -> _U | tuple[_U, ...]: ...
 
 
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
     value: JSONTree[_T],
-) -> JSONTree[_U]:
-    ...
+) -> JSONTree[_U]: ...
 
 
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union["BatchedTensorInputs", _JSONTree[_T]],
-) -> Union["BatchedTensorInputs", _JSONTree[_U]]:
+    value: "BatchedTensorInputs" | _JSONTree[_T],
+) -> "BatchedTensorInputs" | _JSONTree[_U]:
     """Apply a function to each leaf in a nested JSON structure."""
     if isinstance(value, dict):
         return {
@@ -109,28 +101,25 @@ def json_map_leaves(
 @overload
 def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
-    value: Union[_T, dict[str, _T]],
+    value: _T | dict[str, _T],
     /,
-) -> _T:
-    ...
+) -> _T: ...
 
 
 @overload
 def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
-    value: Union[_T, list[_T]],
+    value: _T | list[_T],
     /,
-) -> _T:
-    ...
+) -> _T: ...
 
 
 @overload
 def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
-    value: Union[_T, tuple[_T, ...]],
+    value: _T | tuple[_T, ...],
     /,
-) -> _T:
-    ...
+) -> _T: ...
 
 
 @overload
@@ -138,8 +127,7 @@ def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
     value: JSONTree[_T],
     /,
-) -> _T:
-    ...
+) -> _T: ...
 
 
 @overload
@@ -148,16 +136,15 @@ def json_reduce_leaves(
     value: JSONTree[_T],
     initial: _U,
     /,
-) -> _U:
-    ...
+) -> _U: ...
 
 
 def json_reduce_leaves(
-        func: Callable[..., Union[_T, _U]],
-        value: _JSONTree[_T],
-        initial: _U = cast(_U, ...),  # noqa: B008
-        /,
-) -> Union[_T, _U]:
+    func: Callable[..., _T | _U],
+    value: _JSONTree[_T],
+    initial: _U = cast(_U, ...),  # noqa: B008
+    /,
+) -> _T | _U:
     """
     Apply a function of two arguments cumulatively to each leaf in a
     nested JSON structure, from left to right, so as to reduce the
diff --git a/vllm/utils/math_utils.py b/vllm/utils/math_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdfa5fd2cbcbd64d79fa4d1e6f2d2aaf3beffec9
--- /dev/null
+++ b/vllm/utils/math_utils.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Math utility functions for vLLM."""
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def next_power_of_2(n: int) -> int:
+    """The next power of 2 (inclusive)"""
+    if n < 1:
+        return 1
+    return 1 << (n - 1).bit_length()
+
+
+def prev_power_of_2(n: int) -> int:
+    """The previous power of 2 (inclusive)"""
+    if n <= 0:
+        return 0
+    return 1 << (n.bit_length() - 1)
+
+
+def round_up(x: int, y: int) -> int:
+    """Round up x to the nearest multiple of y."""
+    return ((x + y - 1) // y) * y
+
+
+def round_down(x: int, y: int) -> int:
+    """Round down x to the nearest multiple of y."""
+    return (x // y) * y
diff --git a/vllm/utils/mem_constants.py b/vllm/utils/mem_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..62b725fbb0f268fa582a54a4d70ed70cc6cb9a03
--- /dev/null
+++ b/vllm/utils/mem_constants.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+MB_bytes = 1_000_000
+"""The number of bytes in one megabyte (MB)."""
+
+MiB_bytes = 1 << 20
+"""The number of bytes in one mebibyte (MiB)."""
+
+GB_bytes = 1_000_000_000
+"""The number of bytes in one gigabyte (GB)."""
+
+GiB_bytes = 1 << 30
+"""The number of bytes in one gibibyte (GiB)."""
diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6a6757bed3bfb589f24e00809e6885f1ff7b06e
--- /dev/null
+++ b/vllm/utils/mem_utils.py
@@ -0,0 +1,232 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import gc
+import time
+from collections.abc import Generator
+from dataclasses import dataclass, field
+from functools import cache
+
+import psutil
+import torch
+import torch.types
+
+from .mem_constants import GiB_bytes
+
+
+@cache
+def get_max_shared_memory_bytes(gpu: int = 0) -> int:
+    """Returns the maximum shared memory per thread block in bytes."""
+    from vllm import _custom_ops as ops
+
+    max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu)
+    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
+    # will fail
+    assert max_shared_mem > 0, "max_shared_mem can not be zero"
+    return int(max_shared_mem)
+
+
+def get_cpu_memory() -> int:
+    """Returns the total CPU memory of the node in bytes."""
+    return psutil.virtual_memory().total
+
+
+class DeviceMemoryProfiler:
+    def __init__(self, device: torch.types.Device | None = None):
+        self.device = device
+
+    def current_memory_usage(self) -> float:
+        # Return the memory usage in bytes.
+        from vllm.platforms import current_platform
+
+        gc.collect()
+        return current_platform.get_current_memory_usage(self.device)
+
+    def __enter__(self):
+        self.initial_memory = self.current_memory_usage()
+        # This allows us to call methods of the context manager if needed
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.final_memory = self.current_memory_usage()
+        self.consumed_memory = self.final_memory - self.initial_memory
+
+        # Force garbage collection
+        gc.collect()
+
+
+@dataclass
+class MemorySnapshot:
+    """Memory snapshot."""
+
+    torch_peak: int = 0
+    free_memory: int = 0
+    total_memory: int = 0
+    cuda_memory: int = 0
+    torch_memory: int = 0
+    non_torch_memory: int = 0
+    timestamp: float = 0.0
+    auto_measure: bool = True
+
+    def __post_init__(self):
+        if self.auto_measure:
+            self.measure()
+
+    def measure(self):
+        from vllm.platforms import current_platform
+
+        # we measure the torch peak memory usage via allocated_bytes,
+        # rather than `torch.cuda.memory_reserved()` .
+        # After `torch.cuda.reset_peak_memory_stats()`,
+        # `torch.cuda.memory_reserved()` will keep growing, and only shrink
+        # when we call `torch.cuda.empty_cache()` or OOM happens.
+        self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
+
+        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
+        if (
+            current_platform.is_cuda()
+            and current_platform.get_device_capability() in shared_sysmem_device_mem_sms
+        ):
+            # On UMA (Orin, Thor and Spark) platform,
+            # where both CPU and GPU rely on system memory,
+            # the cudaMemGetInfo function shows the amount of free system memory
+            # rather than what’s actually available.
+            # In the case,
+            # torch.cuda.mem_get_info() only reports "free" memory,
+            # which can be lower than what is actually
+            # available due to not including cache memory.
+            # There’s also a comprehensive reference page
+            # that explains how you can compute the proper value yourself.
+            # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
+            self.free_memory = psutil.virtual_memory().available
+
+        self.cuda_memory = self.total_memory - self.free_memory
+
+        # torch.cuda.memory_reserved() is how many bytes
+        # PyTorch gets from cuda (by calling cudaMalloc, etc.)
+        # this is used to measure the non-torch memory usage
+        self.torch_memory = torch.cuda.memory_reserved()
+
+        self.non_torch_memory = self.cuda_memory - self.torch_memory
+        self.timestamp = time.time()
+
+    def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
+        return MemorySnapshot(
+            torch_peak=self.torch_peak - other.torch_peak,
+            free_memory=self.free_memory - other.free_memory,
+            total_memory=self.total_memory - other.total_memory,
+            cuda_memory=self.cuda_memory - other.cuda_memory,
+            torch_memory=self.torch_memory - other.torch_memory,
+            non_torch_memory=self.non_torch_memory - other.non_torch_memory,
+            timestamp=self.timestamp - other.timestamp,
+            auto_measure=False,
+        )
+
+
+@dataclass
+class MemoryProfilingResult:
+    """Memory profiling result. All numbers are in bytes."""
+
+    non_kv_cache_memory: int = 0
+    torch_peak_increase: int = 0
+    non_torch_increase: int = 0
+    weights_memory: float = 0
+    before_create: MemorySnapshot = field(default_factory=MemorySnapshot)
+    before_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
+    after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
+    profile_time: float = 0.0
+
+    def __repr__(self) -> str:
+        return (
+            f"Memory profiling takes {self.profile_time:.2f} seconds. "
+            f"Total non KV cache memory: "
+            f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
+            f"torch peak memory increase: "
+            f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
+            f"non-torch forward increase memory: "
+            f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
+            f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
+        )
+
+
+@contextlib.contextmanager
+def memory_profiling(
+    baseline_snapshot: MemorySnapshot, weights_memory: int
+) -> Generator[MemoryProfilingResult, None, None]:
+    """Memory profiling context manager.
+    baseline_snapshot: the memory snapshot before the current vLLM instance.
+    weights_memory: memory used by PyTorch when loading the model weights.
+        Note that, before loading the model weights, we also initialize the device
+        and distributed environment, which may consume some memory. This part is not
+        included in the weights_memory because PyTorch does not control it.
+
+    The memory in one GPU can be classified into 3 categories:
+    1. memory used by anything other than the current vLLM instance.
+    2. memory used by torch in the current vLLM instance.
+    3. memory used in the current vLLM instance, but not by torch.
+
+    A quantitive example:
+
+    Before creating the current vLLM instance:
+        category 1: 1 GiB
+        category 2: 0 GiB
+        category 3: 0 GiB
+
+    After creating the current vLLM instance and loading the model,
+    (i.e. before profiling):
+        category 1: 1 GiB
+        category 2: 2 GiB (model weights take 2 GiB)
+        category 3: 0.5 GiB (memory used by NCCL)
+
+    During profiling (peak):
+        category 1: 1 GiB
+        category 2: 4 GiB (peak activation tensors take 2 GiB)
+        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)
+
+    After profiling:
+        category 1: 1 GiB
+        category 2: 3 GiB (after garbage-collecting activation tensors)
+        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)
+
+    In this case, non-kv cache takes 5 GiB in total, including:
+    a. 2 GiB used by the model weights (category 2)
+    b. 2 GiB reserved for the peak activation tensors (category 2)
+    c. 1 GiB used by non-torch components (category 3)
+
+    The memory used for loading weights (a.) is directly given from the argument `weights_memory`.
+
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).
+
+    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.).
+    """  # noqa
+    gc.collect()
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+
+    result = MemoryProfilingResult()
+
+    result.before_create = baseline_snapshot
+    # the part of memory used for holding the model weights
+    result.weights_memory = weights_memory
+
+    result.before_profile.measure()
+
+    yield result
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    result.after_profile.measure()
+
+    diff_profile = result.after_profile - result.before_profile
+    diff_from_create = result.after_profile - result.before_create
+    result.torch_peak_increase = diff_profile.torch_peak
+    result.non_torch_increase = diff_from_create.non_torch_memory
+    result.profile_time = diff_profile.timestamp
+
+    non_torch_memory = result.non_torch_increase
+    peak_activation_memory = result.torch_peak_increase
+    result.non_kv_cache_memory = (
+        non_torch_memory + peak_activation_memory + result.weights_memory
+    )  # noqa
diff --git a/vllm/utils/nccl.py b/vllm/utils/nccl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1459fcbd246a1f4af2a43085b5c796f8ad723ab
--- /dev/null
+++ b/vllm/utils/nccl.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import importlib
+import os
+
+import torch
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def find_nccl_library() -> str:
+    """Return NCCL/RCCL shared library name to load.
+
+    Uses `VLLM_NCCL_SO_PATH` if set; otherwise chooses by torch backend.
+    """
+    so_file = envs.VLLM_NCCL_SO_PATH
+    if so_file:
+        logger.info(
+            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file
+        )
+    else:
+        if torch.version.cuda is not None:
+            so_file = "libnccl.so.2"
+        elif torch.version.hip is not None:
+            so_file = "librccl.so.1"
+        else:
+            raise ValueError("NCCL only supports CUDA and ROCm backends.")
+        logger.debug_once("Found nccl from library %s", so_file)
+    return so_file
+
+
+def find_nccl_include_paths() -> list[str] | None:
+    """Return possible include paths containing `nccl.h`.
+
+    Considers `VLLM_NCCL_INCLUDE_PATH` and the `nvidia-nccl-cuXX` package.
+    """
+    paths: list[str] = []
+    inc = envs.VLLM_NCCL_INCLUDE_PATH
+    if inc and os.path.isdir(inc):
+        paths.append(inc)
+
+    try:
+        spec = importlib.util.find_spec("nvidia.nccl")
+        if spec and getattr(spec, "submodule_search_locations", None):
+            for loc in spec.submodule_search_locations:
+                inc_dir = os.path.join(loc, "include")
+                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
+                    paths.append(inc_dir)
+    except Exception as e:
+        logger.debug("Failed to find nccl include path from nvidia.nccl package: %s", e)
+
+    seen: set[str] = set()
+    out: list[str] = []
+    for p in paths:
+        if p and p not in seen:
+            out.append(p)
+            seen.add(p)
+    return out or None
diff --git a/vllm/utils/network_utils.py b/vllm/utils/network_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a68e48ba5e7a320b9847caa014684871a5cf802
--- /dev/null
+++ b/vllm/utils/network_utils.py
@@ -0,0 +1,331 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import ipaddress
+import os
+import socket
+import sys
+import warnings
+from collections.abc import (
+    Iterator,
+    Sequence,
+)
+from typing import Any
+from urllib.parse import urlparse
+from uuid import uuid4
+
+import psutil
+import zmq
+import zmq.asyncio
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def close_sockets(sockets: Sequence[zmq.Socket | zmq.asyncio.Socket]):
+    for sock in sockets:
+        if sock is not None:
+            sock.close(linger=0)
+
+
+def get_ip() -> str:
+    host_ip = envs.VLLM_HOST_IP
+    if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ:
+        logger.warning(
+            "The environment variable HOST_IP is deprecated and ignored, as"
+            " it is often used by Docker and other software to"
+            " interact with the container's network stack. Please "
+            "use VLLM_HOST_IP instead to set the IP address for vLLM processes"
+            " to communicate with each other."
+        )
+    if host_ip:
+        return host_ip
+
+    # IP is not set, try to get it from the network interface
+
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+            return s.getsockname()[0]
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
+        with socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) as s:
+            # Google's public DNS server, see
+            # https://developers.google.com/speed/public-dns/docs/using#addresses
+            s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+            return s.getsockname()[0]
+    except Exception:
+        pass
+
+    warnings.warn(
+        "Failed to get the IP address, using 0.0.0.0 by default."
+        "The value can be set by the environment variable"
+        " VLLM_HOST_IP or HOST_IP.",
+        stacklevel=2,
+    )
+    return "0.0.0.0"
+
+
+def test_loopback_bind(address, family):
+    try:
+        s = socket.socket(family, socket.SOCK_DGRAM)
+        s.bind((address, 0))  # Port 0 = auto assign
+        s.close()
+        return True
+    except OSError:
+        return False
+
+
+def get_loopback_ip() -> str:
+    loopback_ip = envs.VLLM_LOOPBACK_IP
+    if loopback_ip:
+        return loopback_ip
+
+    # VLLM_LOOPBACK_IP is not set, try to get it based on network interface
+
+    if test_loopback_bind("127.0.0.1", socket.AF_INET):
+        return "127.0.0.1"
+    elif test_loopback_bind("::1", socket.AF_INET6):
+        return "::1"
+    else:
+        raise RuntimeError(
+            "Neither 127.0.0.1 nor ::1 are bound to a local interface. "
+            "Set the VLLM_LOOPBACK_IP environment variable explicitly."
+        )
+
+
+def is_valid_ipv6_address(address: str) -> bool:
+    try:
+        ipaddress.IPv6Address(address)
+        return True
+    except ValueError:
+        return False
+
+
+def split_host_port(host_port: str) -> tuple[str, int]:
+    # ipv6
+    if host_port.startswith("["):
+        host, port = host_port.rsplit("]", 1)
+        host = host[1:]
+        port = port.split(":")[1]
+        return host, int(port)
+    else:
+        host, port = host_port.split(":")
+        return host, int(port)
+
+
+def join_host_port(host: str, port: int) -> str:
+    if is_valid_ipv6_address(host):
+        return f"[{host}]:{port}"
+    else:
+        return f"{host}:{port}"
+
+
+def get_distributed_init_method(ip: str, port: int) -> str:
+    return get_tcp_uri(ip, port)
+
+
+def get_tcp_uri(ip: str, port: int) -> str:
+    if is_valid_ipv6_address(ip):
+        return f"tcp://[{ip}]:{port}"
+    else:
+        return f"tcp://{ip}:{port}"
+
+
+def get_open_zmq_ipc_path() -> str:
+    base_rpc_path = envs.VLLM_RPC_BASE_PATH
+    return f"ipc://{base_rpc_path}/{uuid4()}"
+
+
+def get_open_zmq_inproc_path() -> str:
+    return f"inproc://{uuid4()}"
+
+
+def get_open_port() -> int:
+    """
+    Get an open port for the vLLM process to listen on.
+    An edge case to handle, is when we run data parallel,
+    we need to avoid ports that are potentially used by
+    the data parallel master process.
+    Right now we reserve 10 ports for the data parallel master
+    process. Currently it uses 2 ports.
+    """
+    if "VLLM_DP_MASTER_PORT" in os.environ:
+        dp_master_port = envs.VLLM_DP_MASTER_PORT
+        reserved_port_range = range(dp_master_port, dp_master_port + 10)
+        while True:
+            candidate_port = _get_open_port()
+            if candidate_port not in reserved_port_range:
+                return candidate_port
+    return _get_open_port()
+
+
+def get_open_ports_list(count: int = 5) -> list[int]:
+    """Get a list of open ports."""
+    ports = set[int]()
+    while len(ports) < count:
+        ports.add(get_open_port())
+    return list(ports)
+
+
+def _get_open_port() -> int:
+    port = envs.VLLM_PORT
+    if port is not None:
+        while True:
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("", port))
+                    return port
+            except OSError:
+                port += 1  # Increment port number if already in use
+                logger.info("Port %d is already in use, trying port %d", port - 1, port)
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def find_process_using_port(port: int) -> psutil.Process | None:
+    # TODO: We can not check for running processes with network
+    # port on macOS. Therefore, we can not have a full graceful shutdown
+    # of vLLM. For now, let's not look for processes in this case.
+    # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
+    if sys.platform.startswith("darwin"):
+        return None
+
+    our_pid = os.getpid()
+    for conn in psutil.net_connections():
+        if conn.laddr.port == port and (conn.pid is not None and conn.pid != our_pid):
+            try:
+                return psutil.Process(conn.pid)
+            except psutil.NoSuchProcess:
+                return None
+    return None
+
+
+def split_zmq_path(path: str) -> tuple[str, str, str]:
+    """Split a zmq path into its parts."""
+    parsed = urlparse(path)
+    if not parsed.scheme:
+        raise ValueError(f"Invalid zmq path: {path}")
+
+    scheme = parsed.scheme
+    host = parsed.hostname or ""
+    port = str(parsed.port or "")
+
+    if scheme == "tcp" and not all((host, port)):
+        # The host and port fields are required for tcp
+        raise ValueError(f"Invalid zmq path: {path}")
+
+    if scheme != "tcp" and port:
+        # port only makes sense with tcp
+        raise ValueError(f"Invalid zmq path: {path}")
+
+    return scheme, host, port
+
+
+def make_zmq_path(scheme: str, host: str, port: int | None = None) -> str:
+    """Make a ZMQ path from its parts.
+
+    Args:
+        scheme: The ZMQ transport scheme (e.g. tcp, ipc, inproc).
+        host: The host - can be an IPv4 address, IPv6 address, or hostname.
+        port: Optional port number, only used for TCP sockets.
+
+    Returns:
+        A properly formatted ZMQ path string.
+    """
+    if port is None:
+        return f"{scheme}://{host}"
+    if is_valid_ipv6_address(host):
+        return f"{scheme}://[{host}]:{port}"
+    return f"{scheme}://{host}:{port}"
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
+def make_zmq_socket(
+    ctx: zmq.asyncio.Context | zmq.Context,  # type: ignore[name-defined]
+    path: str,
+    socket_type: Any,
+    bind: bool | None = None,
+    identity: bytes | None = None,
+    linger: int | None = None,
+) -> zmq.Socket | zmq.asyncio.Socket:  # type: ignore[name-defined]
+    """Make a ZMQ socket with the proper bind/connect semantics."""
+
+    mem = psutil.virtual_memory()
+    socket = ctx.socket(socket_type)
+
+    # Calculate buffer size based on system memory
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    # For systems with substantial memory (>32GB total, >16GB available):
+    # - Set a large 0.5GB buffer to improve throughput
+    # For systems with less memory:
+    # - Use system default (-1) to avoid excessive memory consumption
+    buf_size = int(0.5 * 1024**3) if total_mem > 32 and available_mem > 16 else -1
+
+    if bind is None:
+        bind = socket_type not in (zmq.PUSH, zmq.SUB, zmq.XSUB)
+
+    if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER):
+        socket.setsockopt(zmq.RCVHWM, 0)
+        socket.setsockopt(zmq.RCVBUF, buf_size)
+
+    if socket_type in (zmq.PUSH, zmq.DEALER, zmq.ROUTER):
+        socket.setsockopt(zmq.SNDHWM, 0)
+        socket.setsockopt(zmq.SNDBUF, buf_size)
+
+    if identity is not None:
+        socket.setsockopt(zmq.IDENTITY, identity)
+
+    if linger is not None:
+        socket.setsockopt(zmq.LINGER, linger)
+
+    if socket_type == zmq.XPUB:
+        socket.setsockopt(zmq.XPUB_VERBOSE, True)
+
+    # Determine if the path is a TCP socket with an IPv6 address.
+    # Enable IPv6 on the zmq socket if so.
+    scheme, host, _ = split_zmq_path(path)
+    if scheme == "tcp" and is_valid_ipv6_address(host):
+        socket.setsockopt(zmq.IPV6, 1)
+
+    if bind:
+        socket.bind(path)
+    else:
+        socket.connect(path)
+
+    return socket
+
+
+@contextlib.contextmanager
+def zmq_socket_ctx(
+    path: str,
+    socket_type: Any,
+    bind: bool | None = None,
+    linger: int = 0,
+    identity: bytes | None = None,
+) -> Iterator[zmq.Socket]:
+    """Context manager for a ZMQ socket"""
+
+    ctx = zmq.Context()  # type: ignore[attr-defined]
+    try:
+        yield make_zmq_socket(ctx, path, socket_type, bind=bind, identity=identity)
+    except KeyboardInterrupt:
+        logger.debug("Got Keyboard Interrupt.")
+
+    finally:
+        ctx.destroy(linger=linger)
diff --git a/vllm/utils/platform_utils.py b/vllm/utils/platform_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..433c6734e8a928329741acb59d5f38ffef3ba6df
--- /dev/null
+++ b/vllm/utils/platform_utils.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+from collections.abc import Sequence
+from concurrent.futures.process import ProcessPoolExecutor
+from functools import cache
+from typing import Any
+
+import torch
+
+
+def cuda_is_initialized() -> bool:
+    """Check if CUDA is initialized."""
+    if not torch.cuda._is_compiled():
+        return False
+    return torch.cuda.is_initialized()
+
+
+def xpu_is_initialized() -> bool:
+    """Check if XPU is initialized."""
+    if not torch.xpu._is_compiled():
+        return False
+    return torch.xpu.is_initialized()
+
+
+def get_cu_count(device_id: int = 0) -> int:
+    """Returns the total number of compute units (CU) on single GPU."""
+    return torch.cuda.get_device_properties(device_id).multi_processor_count
+
+
+def cuda_get_device_properties(
+    device, names: Sequence[str], init_cuda=False
+) -> tuple[Any, ...]:
+    """Get specified CUDA device property values without initializing CUDA in
+    the current process."""
+    if init_cuda or cuda_is_initialized():
+        props = torch.cuda.get_device_properties(device)
+        return tuple(getattr(props, name) for name in names)
+
+    # Run in subprocess to avoid initializing CUDA as a side effect.
+    mp_ctx = multiprocessing.get_context("fork")
+    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
+        return executor.submit(cuda_get_device_properties, device, names, True).result()
+
+
+@cache
+def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
+
+    return current_platform.is_pin_memory_available()
+
+
+@cache
+def is_uva_available() -> bool:
+    """Check if Unified Virtual Addressing (UVA) is available."""
+    # UVA requires pinned memory.
+    # TODO: Add more requirements for UVA if needed.
+    return is_pin_memory_available()
diff --git a/vllm/utils/profiling.py b/vllm/utils/profiling.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6691069395776e370050a46e43ba3f42decff50
--- /dev/null
+++ b/vllm/utils/profiling.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import contextlib
+from collections.abc import Callable
+from functools import wraps
+from typing import Any
+
+
+@contextlib.contextmanager
+def cprofile_context(save_file: str | None = None):
+    """Run a cprofile
+
+    Args:
+        save_file: path to save the profile result. "1" or
+            None will result in printing to stdout.
+    """
+    import cProfile
+
+    prof = cProfile.Profile()
+    prof.enable()
+
+    try:
+        yield
+    finally:
+        prof.disable()
+        if save_file and save_file != "1":
+            prof.dump_stats(save_file)
+        else:
+            prof.print_stats(sort="cumtime")
+
+
+def cprofile(save_file: str | None = None, enabled: bool = True):
+    """Decorator to profile a Python method using cProfile.
+
+    Args:
+        save_file: Path to save the profile result.
+            If "1", None, or "", results will be printed to stdout.
+        enabled: Set to false to turn this into a no-op
+    """
+
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args: Any, **kwargs: Any):
+            if not enabled:
+                # If profiling is disabled, just call the function directly.
+                return func(*args, **kwargs)
+
+            with cprofile_context(save_file):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac9b859159eadba3ecd7db902c3a081fee9fdf43
--- /dev/null
+++ b/vllm/utils/registry.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+
+class ExtensionManager:
+    """
+    A registry for managing pluggable extension classes.
+
+    This class provides a simple mechanism to register and instantiate
+    extension classes by name. It is commonly used to implement plugin
+    systems where different implementations can be swapped at runtime.
+
+    Examples:
+        Basic usage with a registry instance:
+
+        >>> FOO_REGISTRY = ExtensionManager()
+        >>> @FOO_REGISTRY.register("my_foo_impl")
+        ... class MyFooImpl(Foo):
+        ...     def __init__(self, value):
+        ...         self.value = value
+        >>> foo_impl = FOO_REGISTRY.load("my_foo_impl", value=123)
+
+    """
+
+    def __init__(self) -> None:
+        """
+        Initialize an empty extension registry.
+        """
+        self.name2class: dict[str, type] = {}
+
+    def register(self, name: str):
+        """
+        Decorator to register a class with the given name.
+        """
+
+        def wrap(cls_to_register):
+            self.name2class[name] = cls_to_register
+            return cls_to_register
+
+        return wrap
+
+    def load(self, cls_name: str, *args, **kwargs) -> Any:
+        """
+        Instantiate and return a registered extension class by name.
+        """
+        cls = self.name2class.get(cls_name)
+        assert cls is not None, f"Extension class {cls_name} not found"
+        return cls(*args, **kwargs)
diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b89fa6ce4db66e0c429a5429628f2f41197f535d
--- /dev/null
+++ b/vllm/utils/serial_utils.py
@@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+import sys
+from dataclasses import dataclass
+from typing import Literal
+
+import numpy as np
+import torch
+from typing_extensions import assert_never
+
+from vllm import PoolingRequestOutput
+
+sys_byteorder = sys.byteorder
+
+
+EMBED_DTYPE_TO_TORCH_DTYPE = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    # I'm not sure if other platforms' CPUs support the fp8 data format.
+    # EMBED_DTYPE only uses the fp8 data representation,
+    # does not use fp8 computation, and only occurs on the CPU.
+    # Apologize for any possible break.
+    "fp8_e4m3": torch.float8_e4m3fn,
+    "fp8_e5m2": torch.float8_e5m2,
+}
+
+
+EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    # numpy does not support bfloat16 and fp8
+    "bfloat16": torch.float16,
+    "fp8_e4m3": torch.uint8,
+    "fp8_e5m2": torch.uint8,
+}
+
+EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW = {
+    "float32": np.float32,
+    "float16": np.float16,
+    # numpy does not support bfloat16 and fp8
+    "bfloat16": np.float16,
+    "fp8_e4m3": np.uint8,
+    "fp8_e5m2": np.uint8,
+}
+
+ENDIANNESS = ["native", "big", "little"]
+
+EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
+Endianness = Literal["native", "big", "little"]
+EncodingFormat = Literal["float", "base64", "bytes"]
+
+
+def tensor2binary(
+    tensor: torch.Tensor, embed_dtype: EmbedDType, endianness: Endianness
+) -> bytes:
+    assert isinstance(tensor, torch.Tensor)
+    assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE
+    assert endianness in ENDIANNESS
+
+    torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype]
+    torch_view_dtype = EMBED_DTYPE_TO_TORCH_DTYPE_VIEW[embed_dtype]
+
+    np_array = (
+        tensor.to(torch_dtype).flatten().contiguous().view(torch_view_dtype).numpy()
+    )
+
+    if endianness != "native" and endianness != sys_byteorder:
+        np_array = np_array.byteswap()
+
+    return np_array.tobytes()
+
+
+def binary2tensor(
+    binary: bytes,
+    shape: tuple[int, ...],
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+) -> torch.Tensor:
+    assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE
+    assert embed_dtype in EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW
+    assert endianness in ENDIANNESS
+
+    torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype]
+    np_dtype = EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW[embed_dtype]
+
+    np_array = np.frombuffer(binary, dtype=np_dtype).reshape(shape)
+
+    if endianness != "native" and endianness != sys_byteorder:
+        np_array = np_array.byteswap()
+
+    return torch.from_numpy(np_array).view(torch_dtype)
+
+
+def encode_pooling_output(
+    output: PoolingRequestOutput,
+    encoding_format: EncodingFormat,
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+) -> list[float] | str | bytes:
+    if encoding_format == "float":
+        return output.outputs.data.tolist()
+    elif encoding_format == "base64":
+        embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness)
+        return base64.b64encode(embedding_bytes).decode("utf-8")
+    elif encoding_format == "bytes":
+        return tensor2binary(output.outputs.data, embed_dtype, endianness)
+    assert_never(encoding_format)
+
+
+@dataclass
+class MetadataItem:
+    index: int
+    embed_dtype: EmbedDType
+    endianness: Endianness
+    start: int
+    end: int
+    shape: tuple[int, ...]
+
+
+def encode_pooling_bytes(
+    pooling_outputs: list[PoolingRequestOutput],
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+):
+    num_prompt_tokens = 0
+    items: list[dict[str, MetadataItem]] = []
+    body = []
+    offset = 0
+    for idx, output in enumerate(pooling_outputs):
+        binary = tensor2binary(
+            tensor=output.outputs.data,
+            embed_dtype=embed_dtype,
+            endianness=endianness,
+        )
+        size = len(binary)
+
+        item = {
+            "index": idx,
+            "embed_dtype": embed_dtype,
+            "endianness": endianness,
+            "start": offset,
+            "end": offset + size,
+            "shape": output.outputs.data.shape,
+        }
+
+        body.append(binary)
+        items.append(item)
+        prompt_token_ids = output.prompt_token_ids
+        num_prompt_tokens += len(prompt_token_ids)
+        offset += size
+
+    usage = {
+        "prompt_tokens": num_prompt_tokens,
+        "total_tokens": num_prompt_tokens,
+    }
+    return body, items, usage
+
+
+def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.Tensor]:
+    items.sort(key=lambda x: x.index)
+
+    tensor_list: list[torch.Tensor] = []
+    for item in items:
+        binary = body[item.start : item.end]
+        tensor = binary2tensor(binary, item.shape, item.embed_dtype, item.endianness)
+        tensor_list.append(tensor)
+    return tensor_list
diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5968884e232a46eef2a5475907098a011c11f24a
--- /dev/null
+++ b/vllm/utils/system_utils.py
@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import contextlib
+import multiprocessing
+import os
+import signal
+import sys
+from collections.abc import Callable, Iterator
+from pathlib import Path
+from typing import TextIO
+
+import psutil
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.ray.lazy_utils import is_in_ray_actor
+
+from .platform_utils import cuda_is_initialized, xpu_is_initialized
+
+logger = init_logger(__name__)
+
+CYAN = "\033[1;36m"
+RESET = "\033[0;0m"
+
+
+# Environment variable utilities
+
+
+def update_environment_variables(envs_dict: dict[str, str]):
+    """Update multiple environment variables with logging."""
+    for k, v in envs_dict.items():
+        if k in os.environ and os.environ[k] != v:
+            logger.warning(
+                "Overwriting environment variable %s from '%s' to '%s'",
+                k,
+                os.environ[k],
+                v,
+            )
+        os.environ[k] = v
+
+
+@contextlib.contextmanager
+def set_env_var(key: str, value: str) -> Iterator[None]:
+    """Temporarily set an environment variable."""
+    old = os.environ.get(key)
+    os.environ[key] = value
+    try:
+        yield
+    finally:
+        if old is None:
+            os.environ.pop(key, None)
+        else:
+            os.environ[key] = old
+
+
+# File path utilities
+
+
+def unique_filepath(fn: Callable[[int], Path]) -> Path:
+    """Generate a unique file path by trying incrementing integers.
+
+    Note: This function has a TOCTOU race condition.
+    Caller should use atomic operations (e.g., open with 'x' mode)
+    when creating the file to ensure thread safety.
+    """
+    i = 0
+    while True:
+        p = fn(i)
+        if not p.exists():
+            return p
+        i += 1
+
+
+# Process management utilities
+
+
+def _maybe_force_spawn():
+    """Check if we need to force the use of the `spawn` multiprocessing start
+    method.
+    """
+    if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
+        return
+
+    reasons = []
+    if is_in_ray_actor():
+        # even if we choose to spawn, we need to pass the ray address
+        # to the subprocess so that it knows how to connect to the ray cluster.
+        # env vars are inherited by subprocesses, even if we use spawn.
+        import ray
+
+        os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
+        reasons.append("In a Ray actor and can only be spawned")
+
+    if cuda_is_initialized():
+        reasons.append("CUDA is initialized")
+    elif xpu_is_initialized():
+        reasons.append("XPU is initialized")
+
+    if reasons:
+        logger.warning(
+            "We must use the `spawn` multiprocessing start method. "
+            "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+            "See https://docs.vllm.ai/en/latest/usage/"
+            "troubleshooting.html#python-multiprocessing "
+            "for more information. Reasons: %s",
+            "; ".join(reasons),
+        )
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def get_mp_context():
+    """Get a multiprocessing context with a particular method (spawn or fork).
+    By default we follow the value of the VLLM_WORKER_MULTIPROC_METHOD to
+    determine the multiprocessing method (default is fork). However, under
+    certain conditions, we may enforce spawn and override the value of
+    VLLM_WORKER_MULTIPROC_METHOD.
+    """
+    _maybe_force_spawn()
+    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
+    return multiprocessing.get_context(mp_method)
+
+
+def set_process_title(
+    name: str,
+    suffix: str = "",
+    prefix: str = envs.VLLM_PROCESS_NAME_PREFIX,
+) -> None:
+    """Set the current process title with optional suffix."""
+    try:
+        import setproctitle
+    except ImportError:
+        return
+
+    if suffix:
+        name = f"{name}_{suffix}"
+
+    setproctitle.setproctitle(f"{prefix}::{name}")
+
+
+def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
+    """Add colored prefix to file output for log decoration."""
+    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
+    file_write = file.write
+
+    def write_with_prefix(s: str):
+        if not s:
+            return
+        if file.start_new_line:  # type: ignore[attr-defined]
+            file_write(prefix)
+        idx = 0
+        while (next_idx := s.find("\n", idx)) != -1:
+            next_idx += 1
+            file_write(s[idx:next_idx])
+            if next_idx == len(s):
+                file.start_new_line = True  # type: ignore[attr-defined]
+                return
+            file_write(prefix)
+            idx = next_idx
+        file_write(s[idx:])
+        file.start_new_line = False  # type: ignore[attr-defined]
+
+    file.start_new_line = True  # type: ignore[attr-defined]
+    file.write = write_with_prefix  # type: ignore[method-assign]
+
+
+def decorate_logs(process_name: str | None = None) -> None:
+    """Decorate stdout/stderr with process name and PID prefix."""
+    if process_name is None:
+        process_name = get_mp_context().current_process().name
+
+    pid = os.getpid()
+    _add_prefix(sys.stdout, process_name, pid)
+    _add_prefix(sys.stderr, process_name, pid)
+
+
+def kill_process_tree(pid: int):
+    """
+    Kills all descendant processes of the given pid by sending SIGKILL.
+
+    Args:
+        pid (int): Process ID of the parent process
+    """
+    try:
+        parent = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return
+
+    # Get all children recursively
+    children = parent.children(recursive=True)
+
+    # Send SIGKILL to all children first
+    for child in children:
+        with contextlib.suppress(ProcessLookupError):
+            os.kill(child.pid, signal.SIGKILL)
+
+    # Finally kill the parent
+    with contextlib.suppress(ProcessLookupError):
+        os.kill(pid, signal.SIGKILL)
+
+
+# Resource utilities
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630
+def set_ulimit(target_soft_limit: int = 65535):
+    if sys.platform.startswith("win"):
+        logger.info("Windows detected, skipping ulimit adjustment.")
+        return
+
+    import resource
+
+    resource_type = resource.RLIMIT_NOFILE
+    current_soft, current_hard = resource.getrlimit(resource_type)
+
+    if current_soft < target_soft_limit:
+        try:
+            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+        except ValueError as e:
+            logger.warning(
+                "Found ulimit of %s and failed to automatically increase "
+                "with error %s. This can cause fd limit errors like "
+                "`OSError: [Errno 24] Too many open files`. Consider "
+                "increasing with ulimit -n",
+                current_soft,
+                e,
+            )
diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py
index d75dbcd5401b28c6b1cd02ddb1c37200be204ace..526dfd38bac46fe91184b31af04a6e2ceef003a2 100644
--- a/vllm/utils/tensor_schema.py
+++ b/vllm/utils/tensor_schema.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import (Annotated, Any, Optional, Union, get_args, get_origin,
-                    get_type_hints)
+from types import UnionType
+from typing import Annotated, Any, Union, get_args, get_origin, get_type_hints
 
 import torch
 
@@ -11,19 +11,18 @@ logger = init_logger(__name__)
 
 
 class TensorShape:
-
     def __init__(
         self,
-        *dims: Union[int, str],
-        dynamic_dims: Optional[set[str]] = None,
+        *dims: int | str,
+        dynamic_dims: set[str] | None = None,
     ) -> None:
         super().__init__()
 
         self.dims = dims
         self.dynamic_dims = dynamic_dims if dynamic_dims else set()
 
-    def resolve(self, **bindings: int) -> tuple[Union[int, str], ...]:
-        resolved = list[Union[int, str]]()
+    def resolve(self, **bindings: int) -> tuple[int | str, ...]:
+        resolved = list[int | str]()
         for dim in self.dims:
             if isinstance(dim, str) and dim in bindings:
                 resolved.append(bindings[dim])
@@ -37,8 +36,7 @@ class TensorShape:
         for dim in self.dims:
             if isinstance(dim, str):
                 if dim in self.dynamic_dims:
-                    dim_strs.append(
-                        f"{dim}*")  # Mark dynamic dimensions with *
+                    dim_strs.append(f"{dim}*")  # Mark dynamic dimensions with *
                 else:
                     dim_strs.append(dim)
             else:
@@ -47,12 +45,11 @@ class TensorShape:
 
 
 class TensorSchema:
-
     def __init__(
         self,
         *,
         validate: bool = True,
-        resolve_bindings: Optional[dict[str, int]] = None,
+        resolve_bindings: dict[str, int] | None = None,
         **kwargs: Any,
     ) -> None:
         super().__init__()
@@ -75,7 +72,7 @@ class TensorSchema:
         self,
         actual: tuple[int, ...],
         reference: tuple[int, ...],
-        expected_shape: tuple[Union[int, str], ...],
+        expected_shape: tuple[int | str, ...],
         dynamic_dims: set[str],
     ) -> bool:
         if len(actual) != len(reference) or len(actual) > len(expected_shape):
@@ -94,39 +91,71 @@ class TensorSchema:
                 return False
         return True
 
-    def _validate_nested_tensors(
+    def _fmt_indexer(self, idxs: tuple[int, ...]) -> str:
+        if not idxs:
+            return ""
+
+        return str(list(idxs))
+
+    def _validate_field(
         self,
-        value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]],
+        value: object,
         field_name: str,
-        expected_shape: tuple[Union[int, str], ...],
+        expected_shape: tuple[int | str, ...],
         dynamic_dims: set[str],
+        leading_idxs: tuple[int, ...] = (),
     ) -> tuple[int, ...]:
-        """Validate a list/tuple of tensors and return the actual shape."""
+        """Validate a field and return the actual shape."""
+        if isinstance(value, (int, float)):
+            return ()  # Scalar
+        if isinstance(value, torch.Tensor):
+            return value.shape
+
+        if not isinstance(value, (list, tuple)):
+            raise TypeError(
+                f"{field_name}{self._fmt_indexer(leading_idxs)} is not "
+                f"one of the expected types: int, float, Tensor, list, tuple. "
+                f"Got: {type(value)}"
+            )
+
+        if len(value) == 0:
+            raise ValueError(
+                f"{field_name}{self._fmt_indexer(leading_idxs)} is an empty sequence"
+            )
+
         # Ensure all tensors in the list have the same
         # shape, besides dynamic dimensions
-        first = value[0]
         for i, v in enumerate(value):
-            if not isinstance(v, torch.Tensor):
-                raise ValueError(f"{field_name}[{i}] is not a "
-                                 f"torch.Tensor")
-            if not self._match_shape_with_dynamic(
-                    v.shape,
-                    first.shape,
-                    expected_shape,
-                    dynamic_dims,
+            shape = self._validate_field(
+                v,
+                field_name,
+                expected_shape[1:],
+                dynamic_dims,
+                leading_idxs=leading_idxs + (i,),
+            )
+
+            if i == 0:
+                first_shape = shape
+            elif not self._match_shape_with_dynamic(
+                shape,
+                first_shape,
+                expected_shape,
+                dynamic_dims,
             ):
-                raise ValueError(f"{field_name} contains inconsistent "
-                                 f"shapes: {first.shape} vs {v.shape} "
-                                 f"at index {i}")
+                raise ValueError(
+                    f"{field_name}{self._fmt_indexer(leading_idxs)} "
+                    f"contains inconsistent shapes: {first_shape} "
+                    f"(index 0) vs {shape} (index {i})"
+                )
 
         # Treat the list as a stacked tensor:
         # shape = (len(list), *tensor.shape)
-        return (len(value), ) + first.shape
+        return (len(value),) + first_shape
 
     def _validate_tensor_shape_expected(
         self,
         actual_shape: tuple[int, ...],
-        expected_shape: tuple[Union[int, str], ...],
+        expected_shape: tuple[int | str, ...],
         field_name: str,
         shape_env: dict[str, int],
         dynamic_dims: set[str],
@@ -134,27 +163,38 @@ class TensorSchema:
         """Validate that the actual tensor shape matches the expected shape."""
 
         if len(actual_shape) != len(expected_shape):
-            raise ValueError(f"{field_name} has rank {len(actual_shape)} "
-                             f"but expected {len(expected_shape)}")
+            raise ValueError(
+                f"{field_name} has rank {len(actual_shape)} "
+                f"but expected {len(expected_shape)}. "
+                f"Expected shape: {expected_shape}, "
+                f"but got {actual_shape}"
+            )
 
         for i, dim in enumerate(expected_shape):
             if dim in dynamic_dims:
                 continue
             elif isinstance(dim, int):
                 if actual_shape[i] != dim:
-                    raise ValueError(f"{field_name} dim[{i}] expected "
-                                     f"{dim}, got {actual_shape[i]}")
+                    raise ValueError(
+                        f"{field_name} dim[{i}] expected "
+                        f"{dim}, got {actual_shape[i]}. "
+                        f"Expected shape: {expected_shape}, "
+                        f"but got {actual_shape}"
+                    )
             elif isinstance(dim, str):
                 if dim in shape_env:
                     if actual_shape[i] != shape_env[dim]:
-                        raise ValueError(f"{field_name} dim[{i}] expected "
-                                         f"'{dim}'={shape_env[dim]}, got "
-                                         f"{actual_shape[i]}")
+                        raise ValueError(
+                            f"{field_name} dim[{i}] expected "
+                            f"'{dim}'={shape_env[dim]}, got "
+                            f"{actual_shape[i]}"
+                        )
                 else:
                     shape_env[dim] = actual_shape[i]
             else:
-                raise TypeError(f"{field_name} dim[{i}] has unsupported "
-                                f"type: {type(dim)}")
+                raise TypeError(
+                    f"{field_name} dim[{i}] has unsupported type: {type(dim)}"
+                )
 
     def validate(self) -> None:
         type_hints = get_type_hints(self.__class__, include_extras=True)
@@ -162,8 +202,7 @@ class TensorSchema:
 
         for field_name, field_type in type_hints.items():
             # Check if field is missing
-            if (not hasattr(self, field_name)
-                    or getattr(self, field_name) is None):
+            if not hasattr(self, field_name) or getattr(self, field_name) is None:
                 # Check if field is marked as optional
                 actual_type = field_type
                 if get_origin(field_type) is Annotated:
@@ -171,7 +210,8 @@ class TensorSchema:
                     actual_type = args[0]
 
                 # Check arg was provided as Union
-                if get_origin(actual_type) is Union:
+                if get_origin(actual_type) in {Union, UnionType}:
+                    # Union for Union[X, Y] and UnionType for X | Y
                     args = get_args(actual_type)
                     # Skip validation when Union contains None
                     if type(None) in args:
@@ -187,40 +227,20 @@ class TensorSchema:
                 for arg in args:
                     if isinstance(arg, TensorShape):
                         expected_shape = arg.resolve(**self._resolve_bindings)
-                        if isinstance(value, (list, tuple)):
-                            # list/tuple of Tensors → shape = (len(value), ...)
-                            if value and isinstance(value[0], torch.Tensor):
-                                actual_shape = self._validate_nested_tensors(
-                                    value, field_name, expected_shape,
-                                    arg.dynamic_dims)
-                            elif value:
-                                # list/tuple of scalars → shape = (len(value),)
-                                actual_shape = (len(value), )
-                            else:
-                                raise ValueError(
-                                    f"{field_name} is an empty list")
-
-                        # Tensor → shape = tensor.shape
-                        elif isinstance(value, torch.Tensor):
-                            actual_shape = value.shape
-
-                        # Otherwise, it's an unsupported type
-                        else:
-                            type_names = []
-                            for arg in args:
-                                if hasattr(arg, "__name__"):
-                                    type_names.append(str(arg.__name__))
-                                else:
-                                    type_names.append(str(arg))
-
-                            expected_types = ", ".join(type_names)
-                            raise ValueError(
-                                f"{field_name} is not one of the expected "
-                                f"types: {expected_types}")
+                        actual_shape = self._validate_field(
+                            value,
+                            field_name,
+                            expected_shape,
+                            arg.dynamic_dims,
+                        )
 
                         self._validate_tensor_shape_expected(
-                            actual_shape, expected_shape, field_name,
-                            shape_env, arg.dynamic_dims)
+                            actual_shape,
+                            expected_shape,
+                            field_name,
+                            shape_env,
+                            arg.dynamic_dims,
+                        )
 
     def print_shapes(self) -> None:
         """Print TensorShape annotations for debugging."""
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c094e14cff72387639f8d056b9c22f4398fcccc
--- /dev/null
+++ b/vllm/utils/torch_utils.py
@@ -0,0 +1,657 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import importlib.metadata
+import os
+import threading
+from collections.abc import Callable, Collection
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, TypeVar
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from packaging import version
+from packaging.version import Version
+from torch.library import Library
+
+import vllm.envs as envs
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.sequence import IntermediateTensors
+else:
+    ModelConfig = object
+    IntermediateTensors = object
+
+
+STR_DTYPE_TO_TORCH_DTYPE = {
+    "float32": torch.float32,
+    "half": torch.half,
+    "bfloat16": torch.bfloat16,
+    "float": torch.float,
+    "fp8": torch.uint8,
+    "fp8_e4m3": torch.uint8,
+    "fp8_e5m2": torch.uint8,
+    "int8": torch.int8,
+    "fp8_inc": torch.float8_e4m3fn,
+    "fp8_ds_mla": torch.uint8,
+}
+
+TORCH_DTYPE_TO_NUMPY_DTYPE = {
+    torch.float16: np.float16,
+    torch.float32: np.float32,
+    torch.float64: np.float64,
+    torch.uint8: np.uint8,
+    torch.int32: np.int32,
+    torch.int64: np.int64,
+}
+
+
+T = TypeVar("T")
+
+
+@contextlib.contextmanager
+def set_default_torch_dtype(dtype: torch.dtype):
+    """Sets the default torch dtype to the given dtype."""
+    old_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    yield
+    torch.set_default_dtype(old_dtype)
+
+
+@contextlib.contextmanager
+def set_default_torch_num_threads(num_threads: int):
+    """Sets the default number of threads for PyTorch to the given value."""
+    old_num_threads = torch.get_num_threads()
+    torch.set_num_threads(num_threads)
+    yield
+    torch.set_num_threads(old_num_threads)
+
+
+@contextlib.contextmanager
+def guard_cuda_initialization():
+    """Avoid unexpected CUDA initialization."""
+    from vllm.platforms import current_platform
+
+    if not current_platform.is_cuda():
+        yield
+        return
+
+    had_key = "CUDA_VISIBLE_DEVICES" in os.environ
+    old_value = os.environ.get("CUDA_VISIBLE_DEVICES")
+    os.environ["CUDA_VISIBLE_DEVICES"] = ""
+    try:
+        yield
+    except Exception as e:
+        if "No CUDA GPUs are available" in str(e):
+            err_msg = "CUDA initialization is blocked."
+        else:
+            err_msg = str(e)
+        raise RuntimeError(err_msg) from e
+    finally:
+        if had_key:
+            os.environ["CUDA_VISIBLE_DEVICES"] = old_value
+        else:
+            os.environ.pop("CUDA_VISIBLE_DEVICES")
+
+
+def get_dtype_size(dtype: torch.dtype) -> int:
+    """Get the size of the data type in bytes."""
+    return torch.tensor([], dtype=dtype).element_size()
+
+
+# bool = 0, int = 1, float = 2, complex = 3
+def _get_precision_level(dtype: torch.dtype) -> int:
+    # NOTE: Complex dtypes return `is_floating_point=False`
+    return (dtype != torch.bool) + dtype.is_floating_point + dtype.is_complex * 2
+
+
+def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype):
+    """
+    Test whether it is lossless to cast a tensor from
+    `src_dtype` to `tgt_dtype`.
+    """
+    if src_dtype == tgt_dtype:
+        return True
+
+    src_level = _get_precision_level(src_dtype)
+    tgt_level = _get_precision_level(tgt_dtype)
+
+    if src_level < tgt_level:
+        return True
+    if src_level > tgt_level:
+        return False
+
+    # Compare integral types
+    if not src_dtype.is_floating_point and not src_dtype.is_complex:
+        src_info = torch.iinfo(src_dtype)
+        tgt_info = torch.iinfo(tgt_dtype)
+        return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
+
+    # Compare floating-point types
+    src_info = torch.finfo(src_dtype)
+    tgt_info = torch.finfo(tgt_dtype)
+    return (
+        src_info.min >= tgt_info.min
+        and src_info.max <= tgt_info.max
+        and src_info.resolution >= tgt_info.resolution
+    )
+
+
+def common_broadcastable_dtype(dtypes: Collection[torch.dtype]):
+    """
+    Get the common `dtype` where all of the other `dtypes` can be
+    cast to it without losing any information.
+    """
+    return max(
+        dtypes,
+        key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes),
+    )
+
+
+def _generate_random_fp8(
+    tensor: torch.Tensor,
+    low: float,
+    high: float,
+) -> None:
+    # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type,
+    # it may occur Inf or NaN if we directly use torch.randint
+    # to generate random data for fp8 data.
+    # For example, s.11111.00 in fp8e5m2 format represents Inf.
+    #     | E4M3        | E5M2
+    # -----|-------------|-------------------
+    # Inf | N/A         | s.11111.00
+    # NaN | s.1111.111  | s.11111.{01,10,11}
+    from vllm import _custom_ops as ops
+
+    tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
+    tensor_tmp.uniform_(low, high)
+    ops.convert_fp8(tensor, tensor_tmp)
+    del tensor_tmp
+
+
+def get_kv_cache_torch_dtype(
+    cache_dtype: str | torch.dtype | None,
+    model_dtype: str | torch.dtype | None = None,
+) -> torch.dtype:
+    if isinstance(cache_dtype, str):
+        if cache_dtype == "auto":
+            if isinstance(model_dtype, str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
+                torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
+            elif isinstance(model_dtype, torch.dtype):
+                torch_dtype = model_dtype
+            else:
+                raise ValueError(f"Invalid model dtype: {model_dtype}")
+        elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
+            torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
+        else:
+            raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+    elif isinstance(cache_dtype, torch.dtype):
+        torch_dtype = cache_dtype
+    else:
+        raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+    return torch_dtype
+
+
+def kv_cache_dtype_str_to_dtype(
+    kv_cache_dtype: str, model_config: ModelConfig
+) -> torch.dtype:
+    if kv_cache_dtype == "auto":
+        # Model config may not be specified for unit tests, default to float16
+        return model_config.dtype if model_config else torch.half
+    return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
+
+
+def create_kv_caches_with_random_flash(
+    num_blocks: int,
+    block_size: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    cache_dtype: str | torch.dtype | None,
+    model_dtype: str | torch.dtype | None = None,
+    seed: int | None = None,
+    device: str | None = "cuda",
+    cache_layout: str | None = "NHD",
+) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+    from vllm.platforms import current_platform
+
+    current_platform.seed_everything(seed)
+
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+    generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
+    assert cache_layout in ("NHD", "HND")
+    stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, 4)
+
+    kv_cache_allocation_shape = tuple(generic_kv_cache_shape[i] for i in stride_order)
+    scale = head_size**-0.5
+
+    key_caches: list[torch.Tensor] = []
+    value_caches: list[torch.Tensor] = []
+
+    for _ in range(num_layers):
+        key_value_cache = torch.empty(
+            size=kv_cache_allocation_shape, dtype=dtype, device=device
+        ).permute(*stride_order)
+        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
+            key_value_cache.uniform_(-scale, scale)
+        elif cache_dtype == "fp8":
+            _generate_random_fp8(key_value_cache, -scale, scale)
+        else:
+            raise ValueError(f"Does not support key cache of type {cache_dtype}")
+        key_caches.append(key_value_cache[:, 0])
+        value_caches.append(key_value_cache[:, 1])
+    return key_caches, value_caches
+
+
+def create_kv_caches_with_random(
+    num_blocks: int,
+    block_size: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    cache_dtype: str | torch.dtype | None,
+    model_dtype: str | torch.dtype | None = None,
+    seed: int | None = None,
+    device: str | None = "cuda",
+) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+    if cache_dtype == "fp8" and head_size % 16:
+        raise ValueError(
+            f"Does not support key cache of type fp8 with head_size {head_size}"
+        )
+    from vllm.platforms import current_platform
+
+    current_platform.seed_everything(seed)
+
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+
+    scale = head_size**-0.5
+    x = 16 // torch.tensor([], dtype=dtype).element_size()
+    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
+    key_caches: list[torch.Tensor] = []
+    for _ in range(num_layers):
+        key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device=device)
+        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
+            key_cache.uniform_(-scale, scale)
+        elif cache_dtype == "fp8":
+            _generate_random_fp8(key_cache, -scale, scale)
+        else:
+            raise ValueError(f"Does not support key cache of type {cache_dtype}")
+        key_caches.append(key_cache)
+
+    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
+    value_caches: list[torch.Tensor] = []
+    for _ in range(num_layers):
+        value_cache = torch.empty(size=value_cache_shape, dtype=dtype, device=device)
+        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
+            value_cache.uniform_(-scale, scale)
+        elif cache_dtype == "fp8":
+            _generate_random_fp8(value_cache, -scale, scale)
+        else:
+            raise ValueError(f"Does not support value cache of type {cache_dtype}")
+        value_caches.append(value_cache)
+    return key_caches, value_caches
+
+
+def async_tensor_h2d(
+    data: list,
+    dtype: torch.dtype,
+    target_device: str | torch.device,
+    pin_memory: bool,
+) -> torch.Tensor:
+    """Asynchronously create a tensor and copy it from host to device."""
+    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
+    return t.to(device=target_device, non_blocking=True)
+
+
+def make_ndarray_with_pad(
+    x: list[list[T]],
+    pad: T,
+    dtype: npt.DTypeLike,
+    *,
+    max_len: int | None = None,
+) -> npt.NDArray:
+    """
+    Make a padded array from 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`.
+    """
+    if max_len is None:
+        # Unlike for most functions, map is faster than a genexpr over `len`
+        max_len = max(map(len, x), default=0)
+
+    padded_x = np.full((len(x), max_len), pad, dtype=dtype)
+    for ind, blocktb in enumerate(x):
+        assert len(blocktb) <= max_len
+        padded_x[ind, : len(blocktb)] = blocktb
+
+    return padded_x
+
+
+def make_tensor_with_pad(
+    x: list[list[T]],
+    pad: T,
+    dtype: torch.dtype,
+    *,
+    max_len: int | None = None,
+    device: str | torch.device | None = None,
+    pin_memory: bool = False,
+) -> torch.Tensor:
+    """
+    Make a padded tensor from 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`.
+    """
+    np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
+    padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len)
+
+    tensor = torch.from_numpy(padded_x).to(device)
+    if pin_memory:
+        tensor = tensor.pin_memory()
+
+    return tensor
+
+
+prev_set_stream = torch.cuda.set_stream
+
+_current_stream_tls = threading.local()
+
+
+def _patched_set_stream(stream: torch.cuda.Stream) -> None:
+    _current_stream_tls.value = stream
+    prev_set_stream(stream)
+
+
+torch.cuda.set_stream = _patched_set_stream
+
+
+class _StreamPlaceholder:
+    def __init__(self):
+        self.synchronize = lambda: None
+
+
+def current_stream() -> torch.cuda.Stream:
+    """
+    replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
+    it turns out that `torch.cuda.current_stream()` is quite expensive,
+    as it will construct a new stream object at each call.
+    here we patch `torch.cuda.set_stream` to keep track of the current stream
+    directly, so that we can avoid calling `torch.cuda.current_stream()`.
+
+    the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
+    from C/C++ code.
+    """
+    from vllm.platforms import current_platform
+
+    if not hasattr(_current_stream_tls, "value") or _current_stream_tls.value is None:
+        # when this function is called before any stream is set,
+        # we return the default stream.
+        # On ROCm using the default 0 stream in combination with RCCL
+        # is hurting performance. Therefore creating a dedicated stream
+        # per process
+        if current_platform.is_rocm():
+            # torch.cuda.set_stream here is the alias of _pathed_set_stream
+            torch.cuda.set_stream(torch.cuda.Stream())
+        elif current_platform.is_cpu():
+            _current_stream_tls.value = _StreamPlaceholder()
+        else:
+            current_stream = current_platform.current_stream
+            if current_stream is not None:
+                _current_stream_tls.value = current_stream()
+            else:
+                raise ValueError(
+                    "Fail to set current stream, current platform "
+                    "may not support current_stream with torch API"
+                )
+    return _current_stream_tls.value
+
+
+# Global auxilary stream for running operations in background streams.
+# We have single global auxilary stream to avoid an explosion of streams
+# for every layer (and make profiling look sane).
+#
+# aux_stream() is currently used for:
+#   - MoE shared_expert overlap with router
+_aux_stream: torch.cuda.Stream | None = None
+
+
+def aux_stream() -> torch.cuda.Stream | None:
+    """
+    Ensures aux_stream is initialized only once
+    """
+    global _aux_stream
+
+    from vllm.platforms import current_platform
+
+    # TODO: validate this works properly on ROCm platform.
+    if _aux_stream is None and current_platform.is_cuda():
+        _aux_stream = torch.cuda.Stream()
+
+    return _aux_stream
+
+
+@lru_cache(maxsize=8)
+def _cuda_device_count_stateless(cuda_visible_devices: str | None = None) -> int:
+    # Note: cuda_visible_devices is not used, but we keep it as an argument for
+    # LRU Cache purposes.
+
+    # Code below is based on
+    # https://github.com/pytorch/pytorch/blob/
+    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
+    # torch/cuda/__init__.py#L831C1-L831C17
+    import torch.cuda
+    import torch.version
+
+    from vllm.platforms import current_platform
+
+    if not torch.cuda._is_compiled():
+        return 0
+    if current_platform.is_rocm():
+        # ROCm uses amdsmi instead of nvml for stateless device count
+        # This requires a sufficiently modern version of Torch 2.4.0
+        raw_count = (
+            torch.cuda._device_count_amdsmi()
+            if (hasattr(torch.cuda, "_device_count_amdsmi"))
+            else -1
+        )
+    else:
+        raw_count = torch.cuda._device_count_nvml()
+    r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
+    return r
+
+
+def cuda_device_count_stateless() -> int:
+    """Get number of CUDA devices, caching based on the value of
+    CUDA_VISIBLE_DEVICES at the time of call.
+
+    This should be used instead of torch.cuda.device_count()
+    unless CUDA_VISIBLE_DEVICES has already been set to the desired
+    value."""
+
+    # This can be removed and simply replaced with torch.cuda.get_device_count
+    # after https://github.com/pytorch/pytorch/pull/122815 is released.
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
+
+
+def weak_ref_tensor(tensor: Any) -> Any:
+    """
+    Create a weak reference to a tensor.
+    The new tensor will share the same data as the original tensor,
+    but will not keep the original tensor alive.
+    """
+    if isinstance(tensor, torch.Tensor):
+        return torch.ops._C.weak_ref_tensor(tensor)
+    else:
+        return tensor
+
+
+def weak_ref_tensors(
+    tensors: torch.Tensor
+    | list[torch.Tensor]
+    | tuple[torch.Tensor]
+    | IntermediateTensors,
+) -> torch.Tensor | list[Any] | tuple[Any] | Any:
+    """
+    Convenience function to create weak references to tensors,
+    for single tensor, list of tensors or tuple of tensors.
+    """
+    if isinstance(tensors, torch.Tensor):
+        return weak_ref_tensor(tensors)
+    if isinstance(tensors, list):
+        return [weak_ref_tensor(t) for t in tensors]
+    if isinstance(tensors, tuple):
+        return tuple(weak_ref_tensor(t) for t in tensors)
+
+    # For IntermediateTensors used in pipeline parallelism
+    from vllm.sequence import IntermediateTensors
+
+    if isinstance(tensors, IntermediateTensors):
+        ret = IntermediateTensors(
+            {key: weak_ref_tensor(val) for key, val in tensors.tensors.items()}
+        )
+        return ret
+    raise ValueError("Invalid type for tensors")
+
+
+def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
+    """
+    assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
+    return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
+
+
+# Helper function used in testing.
+def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
+    torch_version = version.parse(torch_version)
+    return torch_version >= version.parse(target)
+
+
+def is_torch_equal_or_newer(target: str) -> bool:
+    """Check if the installed torch version is >= the target version.
+
+    Args:
+        target: a version string, like "2.6.0".
+
+    Returns:
+        Whether the condition meets.
+    """
+    try:
+        return _is_torch_equal_or_newer(str(torch.__version__), target)
+    except Exception:
+        # Fallback to PKG-INFO to load the package info, needed by the doc gen.
+        return Version(importlib.metadata.version("torch")) >= Version(target)
+
+
+def _is_torch_equal(target: str) -> bool:
+    assert target.count(".") == 2
+    torch_version = str(torch.__version__)
+    torch_version = version.parse(torch_version)
+    # torch version is like "2.6.0.dev20240101" or "2.6.0.dev20240101+cpu"
+    # or "2.6.0+cu128" but never "2.6.0.1"
+    return (
+        torch_version >= version.parse(target)
+        and version.parse(target + ".1") > torch_version
+    )
+
+
+def is_torch_equal(target: str) -> bool:
+    """Check if the installed torch version is == the target version.
+
+    Args:
+        target: a version string, like "2.6.0".
+
+    Returns:
+        Whether the condition meets.
+    """
+    try:
+        return _is_torch_equal(target)
+    except Exception:
+        return Version(importlib.metadata.version("torch")) == Version(target)
+
+
+# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
+# In particular, the FakeScalarType is not supported for earlier versions of
+# PyTorch which breaks dynamo for any ops registered using ScalarType.
+def supports_dynamo() -> bool:
+    return is_torch_equal_or_newer("2.4.0")
+
+
+# Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform
+def supports_xccl() -> bool:
+    return (
+        is_torch_equal_or_newer("2.8.0.dev") and torch.distributed.is_xccl_available()
+    )
+
+
+# Some backends use pytorch version < 2.4.0 which doesn't
+# support `torch.library.custom_op`.
+def supports_custom_op() -> bool:
+    return hasattr(torch.library, "custom_op")
+
+
+# create a library to hold the custom op
+vllm_lib = Library("vllm", "FRAGMENT")  # noqa
+
+
+def direct_register_custom_op(
+    op_name: str,
+    op_func: Callable,
+    mutates_args: list[str] | None = None,
+    fake_impl: Callable | None = None,
+    target_lib: Library | None = None,
+    dispatch_key: str | None = None,
+    tags: tuple[torch.Tag, ...] = (),
+):
+    """
+    `torch.library.custom_op` can have significant overhead because it
+    needs to consider complicated dispatching logic. This function
+    directly registers a custom op and dispatches it to the CUDA backend.
+    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+    for more details.
+
+    By default, the custom op is registered to the vLLM library. If you
+    want to register it to a different library, you can pass the library
+    object to the `target_lib` argument.
+
+    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+    library object. If you want to bind the operator to a different library,
+    make sure the library object is alive when the operator is used.
+    """
+    if not supports_custom_op():
+        from vllm.platforms import current_platform
+
+        assert not current_platform.is_cuda_alike(), (
+            "cuda platform needs torch>=2.4 to support custom op, "
+            "chances are you are using an old version of pytorch "
+            "or a custom build of pytorch. It is recommended to "
+            "use vLLM in a fresh new environment and let it install "
+            "the required dependencies."
+        )
+        return
+
+    if mutates_args is None:
+        mutates_args = []
+
+    if dispatch_key is None:
+        from vllm.platforms import current_platform
+
+        dispatch_key = current_platform.dispatch_key
+
+    import torch.library
+
+    if hasattr(torch.library, "infer_schema"):
+        schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+    else:
+        # for pytorch 2.4
+        import torch._custom_op.impl
+
+        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
+    my_lib = target_lib or vllm_lib
+    my_lib.define(op_name + schema_str, tags=tags)
+    my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
+    if fake_impl is not None:
+        my_lib._register_fake(op_name, fake_impl)
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 6ca0c63f6b599d5e936ae668b29f8e8990268015..f1254352c058533ef24d7a5714a61e4f313a24c7 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -1,72 +1,71 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
+from typing import ClassVar
 
-import numpy as np
 import torch
-from torch.nn.functional import scaled_dot_product_attention
 
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
+from vllm import _custom_ops as ops
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionLayer,
+    AttentionType,
+    is_quantized_kv_cache,
+)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
-from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.v1.attention.backends.utils import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.v1.worker.gpu_input_batch import InputBatch
-
-try:
-    import intel_extension_for_pytorch.llm.modules as ipex_modules
-    _use_ipex = True
-# AttributeError is to handle a bug in ipex
-# https://github.com/intel/intel-extension-for-pytorch/pull/813
-except (ImportError, AttributeError):
-    _use_ipex = False
-
-from vllm import _custom_ops as ops
 
 logger = init_logger(__name__)
 
+_CPU_ARCH_PREFER_MIXED_BATCH = (CpuArchEnum.X86,)
+
 
-class TorchSDPABackend(AttentionBackend):
-    accept_output_buffer: bool = False
+class CPUAttentionBackend(AttentionBackend):
+    accept_output_buffer: bool = True
+    supported_dtypes: ClassVar[list[torch.dtype]] = [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ]
 
     @classmethod
     def get_supported_dtypes(cls) -> list[torch.dtype]:
         return [torch.float16, torch.bfloat16, torch.float32]
 
     @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        attn_impl = _get_paged_attn_impl()
-        is_valid, supported_head_sizes = attn_impl.validate_head_size(
-            head_size)
-        if not is_valid:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
+    def get_supported_head_sizes(cls) -> list[int]:
+        return [32, 64, 96, 128, 160, 192, 224, 256]
 
     @staticmethod
     def get_name() -> str:
-        return "TORCH_SDPA"
+        return "CPU_ATTN"
 
-    @staticmethod
-    def get_impl_cls() -> type["TorchSDPABackendImpl"]:
-        return TorchSDPABackendImpl
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """CPU attention supports decoder and encoder-only attention."""
+        from vllm.attention import AttentionType
+
+        return attn_type in (
+            AttentionType.DECODER,
+            AttentionType.ENCODER,
+            AttentionType.ENCODER_ONLY,
+        )
 
     @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return TorchSDPAMetadata
+    def get_impl_cls() -> type["CPUAttentionBackendImpl"]:
+        return CPUAttentionBackendImpl
 
     @staticmethod
-    def get_builder_cls() -> type["TorchSDPAMetadataBuilderV1"]:
-        return TorchSDPAMetadataBuilderV1
+    def get_builder_cls() -> type["CPUAttentionMetadataBuilder"]:
+        return CPUAttentionMetadataBuilder
 
     @staticmethod
     def get_kv_cache_shape(
@@ -76,8 +75,7 @@ class TorchSDPABackend(AttentionBackend):
         head_size: int,
         cache_dtype_str: str = "auto",
     ) -> tuple[int, ...]:
-        return _get_paged_attn_impl().get_kv_cache_shape(
-            num_blocks, block_size, num_kv_heads, head_size)
+        return 2, num_blocks, num_kv_heads, block_size, head_size
 
     @staticmethod
     def use_cascade_attention(*args, **kwargs) -> bool:
@@ -85,401 +83,187 @@ class TorchSDPABackend(AttentionBackend):
 
 
 @dataclass
-class TorchSDPAMetadata(AttentionMetadata):
-    """Attention metadata for prefill and decode batched together."""
-    # Total number of prefill requests.
-    num_prefills: int
-    # Number of prefill tokens.
-    num_prefill_tokens: int
-    # Number of decode tokens. Note that it is equivalent to the number of
-    # decode requests.
-    num_decode_tokens: int
-    # (num_tokens,). The indices of the token slots that input tokens will be
-    # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
-    # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
-    # in block 0, and 1st slot in block 1, respectively.
+class CPUAttentionMetadata:
+    isa: str
+    num_actual_tokens: int  # Number of tokens excluding padding.
+    max_query_len: int
+    query_start_loc: torch.Tensor
+    max_seq_len: int
+    seq_lens: torch.Tensor
+    block_table: torch.Tensor
     slot_mapping: torch.Tensor
-    """Metadata for PagedAttention."""
-    # (batch_size,). The length of sequences (entire tokens seen so far) per
-    # sequence.
-    seq_lens_tensor: Optional[torch.Tensor]
-    # Maximum sequence length in the batch. 0 if it is prefill-only batch.
-    max_decode_seq_len: int
-    # (batch_size, max_blocks_per_seq).
-    # Block addresses per sequence. (Seq id -> list of physical block)
-    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
-    # in the kv cache. Each block can contain up to block_size tokens.
-    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
-    # captured.
-    block_tables: Optional[torch.Tensor]
-    """Metadata for TorchSDPABackend.
-    """
-    # Currently, input sequences can only contain all prompts
-    # or all decoding. True if all sequences are prompts.
-    chunked_prefill: bool
-    seq_lens: Optional[list[int]] = None  # For non-chunked prefill
-
-    # For chunked prefill only
-    max_query_len: Optional[int] = None
-    max_kv_len: Optional[int] = None
-    prefill_query_start_loc: Optional[torch.Tensor] = None
-    kv_start_loc: Optional[torch.Tensor] = None
-    prefill_block_tables: Optional[torch.Tensor] = None
-
-    # For V1 logits index only
-    query_start_loc: Optional[torch.Tensor] = None
-
-    # Begin encoder attn & enc/dec cross-attn fields...
-    # Encoder sequence lengths representation
-    encoder_seq_lens: Optional[list[int]] = None
-    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
-
-    # Maximum sequence length among encoder sequences
-    max_encoder_seq_len: Optional[int] = None
-
-    # Number of tokens input to encoder
-    num_encoder_tokens: Optional[int] = None
-
-    # Cross-attention memory-mapping data structures: slot mapping
-    # and block tables
-    cross_slot_mapping: Optional[torch.Tensor] = None
-    cross_block_tables: Optional[torch.Tensor] = None
-
-    def __post_init__(self):
-        # Set during the execution of the first attention op.
-        # It is a list because it is needed to set per prompt
-        # when alibi slopes is used. It is because of the limitation
-        # from xformer API.
-        # will not appear in the __repr__ and __init__
-        self.attn_bias: Optional[list[torch.Tensor]] = None
-        self.encoder_attn_bias: Optional[list[torch.Tensor]] = None
-        self.cross_attn_bias: Optional[list[torch.Tensor]] = None
-
-    @property
-    def is_all_encoder_attn_metadata_set(self):
-        '''
-        All attention metadata required for encoder attention is set.
-        '''
-        return ((self.encoder_seq_lens is not None)
-                and (self.encoder_seq_lens_tensor is not None)
-                and (self.max_encoder_seq_len is not None))
-
-    @property
-    def is_all_cross_attn_metadata_set(self):
-        '''
-        All attention metadata required for enc/dec cross-attention is set.
-
-        Superset of encoder attention required metadata.
-        '''
-        return (self.is_all_encoder_attn_metadata_set
-                and (self.cross_slot_mapping is not None)
-                and (self.cross_block_tables is not None))
-
-    @property
-    def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]:
-        if self.num_prefill_tokens == 0:
-            return None
-        return self
-
-    @property
-    def decode_metadata(self) -> Optional["TorchSDPAMetadata"]:
-        if self.num_decode_tokens == 0:
-            return None
-        return self
-
-    def get_seq_lens(
-        self,
-        attn_type: str,
-    ):
-        '''
-        Extract appropriate sequence lengths from attention metadata
-        according to attention type.
-
-        Arguments:
-
-        * attn_metadata: Attention metadata structure associated with attention
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-
-        Returns:
-        * Appropriate sequence lengths tensor for query
-        * Appropriate sequence lengths tensor for key & value
-        '''
-
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            seq_lens_q = self.seq_lens
-            seq_lens_kv = self.seq_lens
-        elif attn_type == AttentionType.ENCODER:
-            seq_lens_q = self.encoder_seq_lens
-            seq_lens_kv = self.encoder_seq_lens
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            seq_lens_q = self.seq_lens
-            seq_lens_kv = self.encoder_seq_lens
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-        return seq_lens_q, seq_lens_kv
-
-    def get_attn_bias(
-        self,
-        attn_type: str,
-    ) -> Optional[list[torch.Tensor]]:
-        '''
-        Extract appropriate attention bias from attention metadata
-        according to attention type.
+    scheduler_metadata: torch.Tensor | None
+    causal: bool = True
 
-        Arguments:
+    # can be removed after deprecate sdpa
+    use_sdpa_prefill: bool = False
+    num_decode_tokens: int = 0
+    sdpa_attn_masks: list[torch.Tensor | None] | None = None
+    sdpa_start_loc: torch.Tensor | None = None
 
-        * attn_metadata: Attention metadata structure associated with attention
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
 
-        Returns:
-        * Appropriate attention bias value given the attention type
-        '''
-
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            return self.attn_bias
-        elif attn_type == AttentionType.ENCODER:
-            return self.encoder_attn_bias
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            return self.cross_attn_bias
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-
-    def set_attn_bias(
+class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]):
+    def __init__(
         self,
-        attn_bias: list[torch.Tensor],
-        attn_type: str,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
     ) -> None:
-        '''
-        Update appropriate attention bias field of attention metadata,
-        according to attention type.
-
-        Arguments:
-
-        * attn_metadata: Attention metadata structure associated with attention
-        * attn_bias: The desired attention bias value
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-        '''
-
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            self.attn_bias = attn_bias
-        elif attn_type == AttentionType.ENCODER:
-            self.encoder_attn_bias = attn_bias
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            self.cross_attn_bias = attn_bias
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-
-    def get_seq_len_block_table_args(
-        self,
-        attn_type: str,
-    ) -> tuple:
-        '''
-        The particular choice of sequence-length- and block-table-related
-        attributes which should be extracted from attn_metadata is dependent
-        on the type of attention operation.
-
-        Decoder attn -> select entirely decoder self-attention-related fields
-        Encoder/decoder cross-attn -> select encoder sequence lengths &
-                                    cross-attn block-tables fields
-        Encoder attn -> select encoder sequence lengths fields & no block tables
-
-        Arguments:
-
-        * attn_metadata: Attention metadata structure associated with attention
-        * is_prompt: True if prefill, False otherwise
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-
-        Returns:
-
-        * Appropriate sequence-lengths tensor
-        * Appropriate max sequence-length scalar
-        * Appropriate block tables (or None)
-        '''
-
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            # Decoder self-attention
-            # Choose max_seq_len based on whether we are in prompt_run
-            return (self.seq_lens_tensor, self.max_decode_seq_len,
-                    self.block_tables)
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            # Enc/dec cross-attention KVs match encoder sequence length;
-            # cross-attention utilizes special "cross" block tables
-            return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len,
-                    self.cross_block_tables)
-        elif attn_type == AttentionType.ENCODER:
-            # No block tables associated with encoder attention
-            return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len,
-                    None)
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-
-
-class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]):
-
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device) -> None:
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
-        self.scheduler_config = vllm_config.scheduler_config
+        self.use_sdpa_prefill = False
+        reorder_batch_threshold = None
+        if current_platform.get_cpu_architecture() not in _CPU_ARCH_PREFER_MIXED_BATCH:
+            # in this case, decode seqs are reordered to the front of prefill seqs
+            # to split decode and prefill. Then use SDPA for prefill and
+            # cpu_attention_with_kv_cache for decode
+            reorder_batch_threshold = 1
+            self.use_sdpa_prefill = True
+
+        self._init_reorder_batch_threshold(reorder_batch_threshold, False)
 
-        # For reorder
-        self.reorder_prompt_req_index_list = np.empty(
-            vllm_config.scheduler_config.max_num_seqs, dtype=np.int64)
-        self.reorder_decode_req_index_list = np.empty(
-            vllm_config.scheduler_config.max_num_seqs, dtype=np.int64)
-        self.num_prompt_req: int = 0
+        self.kv_cache_spec = kv_cache_spec
+        self.vllm_config = vllm_config
 
-        self.seq_start_loc_cpu = torch.zeros(
-            vllm_config.scheduler_config.max_num_seqs + 1,
-            dtype=torch.int32,
-            device="cpu",
+        parallel_config = vllm_config.parallel_config
+        self.num_kv_heads = vllm_config.model_config.get_num_kv_heads(parallel_config)
+        self.num_heads = vllm_config.model_config.get_num_attention_heads(
+            parallel_config
         )
-        self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
-
-    def reorder_batch(self, input_batch: InputBatch,
-                      scheduler_output: SchedulerOutput) -> bool:
-        prompt_list_idx = 0
-        decode_list_idx = 0
-        for req_index in range(input_batch.num_reqs):
-            if input_batch.num_computed_tokens_cpu[
-                    req_index] < input_batch.num_prompt_tokens[req_index]:
-                # prompt stage
-                self.reorder_prompt_req_index_list[prompt_list_idx] = req_index
-                prompt_list_idx += 1
-            else:
-                # decode stage
-                self.reorder_decode_req_index_list[decode_list_idx] = req_index
-                decode_list_idx += 1
-        assert decode_list_idx + prompt_list_idx == input_batch.num_reqs
-
-        # Update prompt requests number
-        self.num_prompt_req = prompt_list_idx
-
-        reorder_req_num = 0
-        for req_index in range(decode_list_idx):
-            if self.reorder_decode_req_index_list[req_index] < prompt_list_idx:
-                reorder_req_num += 1
-            else:
-                break
-
-        if reorder_req_num == 0:
-            return False
-
-        reorder_prompt_list = (
-            self.reorder_prompt_req_index_list[:prompt_list_idx]
-            [-reorder_req_num:])
-        reorder_decode_list = (
-            self.reorder_decode_req_index_list[:decode_list_idx]
-            [:reorder_req_num])
-        assert reorder_decode_list.size == reorder_prompt_list.size
-
-        for idx in range(reorder_req_num):
-            prompt_req_index = reorder_prompt_list[idx].item()
-            decode_req_index = reorder_decode_list[idx].item()
-            input_batch.swap_states(prompt_req_index, decode_req_index)
-
-        return True
-
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> TorchSDPAMetadata:
+        self.head_dim = kv_cache_spec.head_size
+        self.dtype = vllm_config.model_config.dtype
+        self.window_size = getattr(kv_cache_spec, "sliding_window", -1)
+        if self.window_size is None:
+            self.window_size = -1
+        self.block_size = vllm_config.cache_config.block_size
+        self.isa = _get_attn_isa(self.dtype, self.block_size)
+
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> CPUAttentionMetadata:
         num_reqs = common_attn_metadata.num_reqs
+        num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
-
-        seq_lens_cpu = common_attn_metadata.seq_lens_cpu
-        seq_lens_np = seq_lens_cpu.numpy()
-        num_prompt_req = self.num_prompt_req
-        max_prefill_seq_len = seq_lens_np[:num_prompt_req].max().item(
-        ) if num_prompt_req > 0 else 0
-        max_decode_seq_len = seq_lens_np[num_prompt_req:num_reqs].max().item(
-        ) if num_prompt_req < num_reqs else 0
-        self.seq_start_loc_np[0] = 0
-        np.cumsum(seq_lens_np, out=self.seq_start_loc_np[1:num_reqs + 1])
-
-        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
-        num_prefill_tokens = int(query_start_loc_cpu[num_prompt_req].item())
-        num_decode_tokens = int(query_start_loc_cpu[num_reqs].item() -
-                                num_prefill_tokens)
-
-        slot_mapping = common_attn_metadata.slot_mapping.long()
+        max_seq_len = common_attn_metadata.max_seq_len
+        query_start_loc = common_attn_metadata.query_start_loc
+        seq_lens = common_attn_metadata.seq_lens
         block_table_tensor = common_attn_metadata.block_table_tensor
+        slot_mapping = common_attn_metadata.slot_mapping
+        causal = common_attn_metadata.causal
+
+        sdpa_start_loc = query_start_loc
+        num_decode_tokens = 0
+        if self.use_sdpa_prefill and causal:
+            # Decoder, need reorder and truncate
+            assert self.reorder_batch_threshold
+            (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens) = (
+                split_decodes_and_prefills(
+                    common_attn_metadata,
+                    decode_threshold=self.reorder_batch_threshold,
+                    require_uniform=True,
+                )
+            )
+            num_reqs = num_decodes
+            sdpa_start_loc = sdpa_start_loc[num_decodes:] - num_decode_tokens
+            seq_lens = seq_lens[:num_decodes]
+            query_start_loc = query_start_loc[: num_decodes + 1]
+            block_table_tensor = block_table_tensor[:num_decodes]
+
+        sheduler_metadata = None
+        if causal:
+            # for decode batch, use the custom kernel
+            sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
+                num_reqs=num_reqs,
+                num_heads=self.num_heads,
+                num_kv_heads=self.num_kv_heads,
+                head_dim=self.head_dim,
+                seq_lens=seq_lens,
+                dtype=self.dtype,
+                query_start_loc=query_start_loc,
+                causal=causal,
+                sliding_window_size=self.window_size,
+                isa=self.isa,
+                enable_kv_split=True,
+            )
 
-        attn_metadata = TorchSDPAMetadata(
-            num_prefills=num_prompt_req,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            slot_mapping=slot_mapping,
-            # to ensure inference when chunked_prefill is disabled
-            seq_lens=seq_lens_cpu.tolist(),
-            seq_lens_tensor=seq_lens_cpu[num_prompt_req:num_reqs],  # decode
-            max_decode_seq_len=max_decode_seq_len,  # decode
-            block_tables=block_table_tensor[num_prompt_req:num_reqs],  # decode
-            chunked_prefill=self.scheduler_config.chunked_prefill_enabled,
+        attn_metadata = CPUAttentionMetadata(
+            isa=self.isa,
+            num_actual_tokens=num_actual_tokens,
             max_query_len=max_query_len,
-            max_kv_len=max_prefill_seq_len,
-            prefill_query_start_loc=query_start_loc_cpu[:num_prompt_req +
-                                                        1],  # prefill
-            kv_start_loc=self.seq_start_loc_cpu[:num_prompt_req +
-                                                1],  # prefill
-            prefill_block_tables=block_table_tensor[:
-                                                    num_prompt_req],  # prefill
-            query_start_loc=query_start_loc_cpu[:num_reqs +
-                                                1],  # for logits index
+            query_start_loc=query_start_loc,
+            max_seq_len=max_seq_len,
+            seq_lens=seq_lens,
+            block_table=block_table_tensor,
+            slot_mapping=slot_mapping,
+            scheduler_metadata=sheduler_metadata,
+            causal=causal,
+            use_sdpa_prefill=self.use_sdpa_prefill,
+            num_decode_tokens=num_decode_tokens,
+            sdpa_start_loc=sdpa_start_loc,
         )
 
         return attn_metadata
 
 
-class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
-
+class CPUAttentionBackendImpl(AttentionImpl):
     def __init__(
         self,
         num_heads: int,
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
-        if logits_soft_cap is not None:
-            logger.warning_once("Torch SPDA does not support logits soft cap. "
-                                "Outputs may be slightly off.")
-        self.paged_attn_impl = _get_paged_attn_impl()
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
+        if logits_soft_cap is not None and attn_type in (
+            AttentionType.ENCODER,
+            AttentionType.ENCODER_ONLY,
+        ):
+            logger.warning_once(
+                "CPU_ATTN does not support logits softcap for"
+                " ENCODER and ENCODER_ONLY, outputs may be slightly off"
+            )
+        if logits_soft_cap is None:
+            logits_soft_cap = 0
+        self.logits_soft_cap = logits_soft_cap
+
         self.num_kv_heads = num_kv_heads
         if alibi_slopes is not None:
             alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
         self.alibi_slopes = alibi_slopes
-        self.sliding_window = sliding_window
+        if sliding_window is None:
+            self.sliding_window = (-1, -1)
+        elif attn_type == AttentionType.ENCODER_ONLY:
+            self.sliding_window = (sliding_window - 1, sliding_window - 1)
+        else:
+            self.sliding_window = (sliding_window - 1, 0)
         self.kv_cache_dtype = kv_cache_dtype
-
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.need_mask = (self.alibi_slopes is not None
-                          or self.sliding_window is not None)
 
-        if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex:
-            raise NotImplementedError(
-                "Torch SDPA backend FP8 KV cache requires "
-                "intel_extension_for_pytorch support.")
+        if is_quantized_kv_cache(kv_cache_dtype):
+            raise NotImplementedError("FP8 KV cache is unsupported in CPU_ATTN")
         self.attn_type = attn_type
 
+        self.sinks = sinks
+        if self.sinks is not None:
+            assert self.sinks.shape[0] == num_heads, (
+                "Sinks must have the same number of heads as the number of "
+                "heads in the layer"
+            )
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -487,182 +271,130 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: TorchSDPAMetadata,  # type: ignore
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        attn_metadata: CPUAttentionMetadata | None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """Forward pass with torch SDPA and PagedAttention.
+        """Forward pass for CPU attention backend.
 
         Args:
-            query: shape = [num_tokens, num_heads * head_size]
-            key: shape = [num_tokens, num_kv_heads * head_size]
-            value: shape = [num_tokens, num_kv_heads * head_size]
+            query: shape = [num_tokens, num_heads, head_size]
+            key: shape = [num_tokens, num_kv_heads, head_size]
+            value: shape = [num_tokens, num_kv_heads, head_size]
             kv_cache: shape =
-                [2, num_blocks, block_size * num_kv_heads * head_size]
-                NOTE: kv_cache will be an empty tensor with shape [0]
-                for profiling run.
+                [2, num_blocks, num_kv_heads, block_size, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        assert output is not None, "Output tensor must be provided."
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
                 "fused output quantization is not yet supported"
-                " for TorchSDPABackendImpl")
+                " for CPUAttentionBackendImpl"
+            )
 
         # For warming-up
         if attn_metadata is None:
-            return query
-
-        attn_type = self.attn_type
-        if (attn_type == AttentionType.ENCODER
-                and (not attn_metadata.is_all_encoder_attn_metadata_set)):
-            raise AttributeError("Encoder attention requires setting "
-                                 "encoder metadata attributes.")
-        elif (attn_type == AttentionType.ENCODER_DECODER
-              and (not attn_metadata.is_all_cross_attn_metadata_set)):
-            raise AttributeError("Encoder/decoder cross-attention "
-                                 "requires setting cross-attention "
-                                 "metadata attributes.")
-
-        # Reshape the query, key, and value tensors.
-        query = query.view(-1, self.num_heads, self.head_size)
-        if key is not None:
-            assert value is not None
-            key = key.view(-1, self.num_kv_heads, self.head_size)
-            value = value.view(-1, self.num_kv_heads, self.head_size)
-        else:
-            assert value is None
-
-        if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0):
-            # KV-cache during decoder-self- or
-            # encoder-decoder-cross-attention, but not
-            # during encoder attention.
-            #
-            # Even if there are no new key/value pairs to cache,
-            # we still need to break out key_cache and value_cache
-            # i.e. for later use by paged attention
-            key_cache, value_cache = self.paged_attn_impl.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size)
-
-            if (key is not None) and (value is not None):
-                if attn_type == AttentionType.ENCODER_DECODER:
-                    # Update cross-attention KV cache (prefill-only)
-                    # During cross-attention decode, key & value will be None,
-                    # preventing this IF-statement branch from running
-                    updated_slot_mapping = attn_metadata.cross_slot_mapping
-                else:
-                    # Update self-attention KV cache (prefill/decode)
-                    updated_slot_mapping = attn_metadata.slot_mapping
-
-                self.paged_attn_impl.write_to_paged_cache(
-                    key, value, key_cache, value_cache, updated_slot_mapping,
-                    self.kv_cache_dtype, layer._k_scale, layer._v_scale)
-
-        if attn_type != AttentionType.ENCODER:
-            # Decoder self-attention supports chunked prefill.
-            # Encoder/decoder cross-attention requires no chunked
-            # prefill (100% prefill or 100% decode tokens, no mix)
-            num_prefill_tokens = attn_metadata.num_prefill_tokens
-            num_decode_tokens = attn_metadata.num_decode_tokens
-        else:
-            # Encoder attention - chunked prefill is not applicable;
-            # derive token-count from query shape & and treat them
-            # as 100% prefill tokens
-            assert attn_metadata.num_encoder_tokens is not None
-            num_prefill_tokens = attn_metadata.num_encoder_tokens
-            num_decode_tokens = 0
-
-        if attn_type == AttentionType.DECODER:
-            # Only enforce this shape-constraint for decoder
-            # self-attention
-            assert key.shape[0] == num_prefill_tokens + num_decode_tokens
-            assert value.shape[0] == num_prefill_tokens + num_decode_tokens
-
-        output = torch.empty_like(query)
-        if prefill_meta := attn_metadata.prefill_metadata:
-            if not prefill_meta.prefill_metadata.chunked_prefill:  # type: ignore
-                assert attn_metadata.seq_lens is not None
-                self._run_sdpa_forward(output,
-                                       query,
-                                       key,
-                                       value,
-                                       prefill_meta,
-                                       attn_type=attn_type)
-            else:
-                # prefix-enabled attention
-                assert not self.need_mask
-                import intel_extension_for_pytorch.llm.modules as ipex_modules
-                output = torch.empty_like(query)
-                ipex_modules.PagedAttention.flash_attn_varlen_func(
-                    output[:prefill_meta.num_prefill_tokens, :, :],
-                    query[:prefill_meta.num_prefill_tokens, :, :],
-                    key_cache,
-                    value_cache,
-                    prefill_meta.prefill_query_start_loc,
-                    prefill_meta.kv_start_loc,
-                    prefill_meta.max_query_len,
-                    prefill_meta.max_kv_len,
-                    self.scale,
-                    True,
-                    prefill_meta.prefill_block_tables,
-                    self.alibi_slopes,
-                )
+            return output
+
+        num_actual_tokens = attn_metadata.num_actual_tokens
+
+        # Handle encoder attention differently - no KV cache needed
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            return self._run_sdpa_forward(
+                query[:num_actual_tokens],
+                key[:num_actual_tokens],
+                value[:num_actual_tokens],
+                output[:num_actual_tokens],
+                attn_metadata,
+                self.attn_type,
+            )
 
-        if decode_meta := attn_metadata.decode_metadata:
-            assert attn_type != AttentionType.ENCODER_ONLY, (
-                "Encoder-only models should not have decode metadata.")
-            # Decoding run.
-            (
-                seq_lens_arg,
-                max_seq_len_arg,
-                block_tables_arg,
-            ) = decode_meta.get_seq_len_block_table_args(attn_type)
-
-            self.paged_attn_impl.forward_decode(
-                output[attn_metadata.num_prefill_tokens:, :, :],
-                query[attn_metadata.num_prefill_tokens:, :, :],
+        # For decoder and cross-attention, use KV cache, size are
+        # [num_blocks, num_kv_heads, block_size, head_size]
+        key_cache, value_cache = kv_cache.unbind(0)
+
+        # key and value may be None in the case of cross attention. They are
+        # calculated once based on the output from the encoder and then cached
+        # in KV cache.
+        if (
+            self.kv_sharing_target_layer_name is None
+            and key is not None
+            and value is not None
+        ):
+            ops.cpu_attn_reshape_and_cache(
+                key,
+                value,
                 key_cache,
                 value_cache,
-                block_tables_arg,
-                seq_lens_arg,
-                max_seq_len_arg,
-                self.kv_cache_dtype,
-                self.num_kv_heads,
-                self.scale,
-                self.alibi_slopes,
-                layer._k_scale,
-                layer._v_scale,
+                attn_metadata.slot_mapping,
+                attn_metadata.isa,
+            )
+
+        if attn_metadata.use_sdpa_prefill:
+            assert self.sinks is None, "Attention sink is unsupported in SDPA prefill"
+            num_decode_tokens = attn_metadata.num_decode_tokens
+            self._run_sdpa_forward(
+                query[num_decode_tokens:num_actual_tokens],
+                key[num_decode_tokens:num_actual_tokens],
+                value[num_decode_tokens:num_actual_tokens],
+                output[num_decode_tokens:num_actual_tokens],
+                attn_metadata,
+                self.attn_type,
+            )
+            num_actual_tokens = num_decode_tokens
+
+        if num_actual_tokens > 0:
+            ops.cpu_attention_with_kv_cache(
+                query=query[:num_actual_tokens],
+                key_cache=key_cache,
+                value_cache=value_cache,
+                output=output[:num_actual_tokens],  # type: ignore
+                query_start_loc=attn_metadata.query_start_loc,
+                seq_lens=attn_metadata.seq_lens,
+                scale=self.scale,
+                causal=attn_metadata.causal,
+                alibi_slopes=self.alibi_slopes,  # type: ignore
+                sliding_window=self.sliding_window,
+                block_table=attn_metadata.block_table,
+                softcap=self.logits_soft_cap,
+                scheduler_metadata=attn_metadata.scheduler_metadata,
+                s_aux=self.sinks,
             )
 
-        # Reshape the output tensor.
-        return output.view(-1, self.num_heads * self.head_size)
+        return output
 
     def _run_sdpa_forward(
         self,
-        output: torch.Tensor,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        attn_metadata: TorchSDPAMetadata,
-        attn_type: str = AttentionType.DECODER,
-    ) -> None:
-        attn_masks = attn_metadata.get_attn_bias(attn_type)
+        output: torch.Tensor,
+        attn_metadata: CPUAttentionMetadata,
+        attn_type: str,
+    ) -> torch.Tensor:
+        attn_masks = attn_metadata.sdpa_attn_masks
         if attn_masks is None:
             if self.alibi_slopes is not None:
                 attn_masks = _make_alibi_bias(
-                    self.alibi_slopes, query.dtype,
-                    attn_metadata.seq_lens)  # type: ignore
-            elif self.sliding_window is not None:
+                    self.alibi_slopes,
+                    query.dtype,
+                    attn_metadata.sdpa_start_loc,
+                )
+            elif self.sliding_window[0] != -1 or self.sliding_window[1] != -1:
                 assert attn_metadata.seq_lens is not None
                 attn_masks = _make_sliding_window_bias(
-                    attn_metadata.seq_lens, self.sliding_window,
-                    query.dtype)  # type: ignore
+                    attn_metadata.sdpa_start_loc,
+                    self.sliding_window[0],
+                    self.sliding_window[1],
+                    query.dtype,
+                )
             else:
-                seq_lens, _ = attn_metadata.get_seq_lens(attn_type)
-                attn_masks = [None] * len(seq_lens)
-            attn_metadata.set_attn_bias(attn_masks, attn_type)
+                attn_masks = [None] * (attn_metadata.sdpa_start_loc.size(0) - 1)  # type: ignore
+            attn_metadata.sdpa_attn_masks = attn_masks
 
         query = query.movedim(0, query.dim() - 2)
         key = key.movedim(0, key.dim() - 2)
@@ -672,34 +404,41 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
             key = key.repeat_interleave(self.num_queries_per_kv, dim=-3)
             value = value.repeat_interleave(self.num_queries_per_kv, dim=-3)
 
-        causal_attn = (attn_type == AttentionType.DECODER)
-
-        seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type)
-        start_q, start_kv = 0, 0
-        for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv,
-                                               attn_masks):
-            end_q = start_q + seq_len_q
-            end_kv = start_kv + seq_len_kv
-            sub_out = scaled_dot_product_attention(
-                query[None, :, start_q:end_q, :],
-                key[None, :, start_kv:end_kv, :],
-                value[None, :, start_kv:end_kv, :],
-                attn_mask=mask,
-                dropout_p=0.0,
-                is_causal=causal_attn and mask is None,
-                scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0)
+        causal_attn = attn_type == AttentionType.DECODER
+
+        sdpa_start_loc = attn_metadata.sdpa_start_loc.numpy()  # type: ignore
+        for i in range(len(attn_masks)):
+            mask = attn_masks[i]
+            start_q = sdpa_start_loc[i]
+            end_q = sdpa_start_loc[i + 1]
+            sub_out = (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query[None, :, start_q:end_q, :],
+                    key[None, :, start_q:end_q, :],
+                    value[None, :, start_q:end_q, :],
+                    attn_mask=mask,
+                    dropout_p=0.0,
+                    is_causal=causal_attn and mask is None,
+                    scale=self.scale,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
             output[start_q:end_q, :, :] = sub_out
-            start_q, start_kv = end_q, end_kv
+        return output
 
 
 def _make_alibi_bias(
     alibi_slopes: torch.Tensor,
     dtype: torch.dtype,
-    seq_lens: list[int],
+    sdpa_start_loc: torch.Tensor,
 ) -> list[torch.Tensor]:
     attn_biases: list[torch.Tensor] = []
-    for seq_len in seq_lens:
-        bias = torch.arange(seq_len, dtype=dtype)
+    seq_num = sdpa_start_loc.size(0) - 1
+    sdpa_start_loc = sdpa_start_loc.numpy()  # type: ignore
+    for i in range(seq_num):
+        seq_len = sdpa_start_loc[i + 1] - sdpa_start_loc[i]
+        bias = torch.arange(seq_len, dtype=dtype)  # type: ignore
         # NOTE(zhuohan): HF uses
         #     `bias = bias[None, :].repeat(seq_len, 1)`
         # here. We find that both biases give the same results, but
@@ -710,210 +449,48 @@ def _make_alibi_bias(
         num_heads = alibi_slopes.shape[0]
         bias = bias[None, :].repeat((num_heads, 1, 1))
         bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0)
-        inf_mask = torch.empty(
-            (1, seq_len, seq_len),
-            dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1)
+        inf_mask = (
+            torch.empty((1, seq_len, seq_len), dtype=bias.dtype)  # type: ignore
+            .fill_(-torch.inf)
+            .triu_(diagonal=1)
+        )
         attn_biases.append((bias + inf_mask).to(dtype))
 
     return attn_biases
 
 
 def _make_sliding_window_bias(
-    seq_lens: list[int],
-    window_size: Optional[int],
+    sdpa_start_loc: torch.Tensor,
+    left_window_size: int,
+    right_window_size: int,
     dtype: torch.dtype,
 ) -> list[torch.Tensor]:
     attn_biases: list[torch.Tensor] = []
-    for seq_len in seq_lens:
-        tensor = torch.full(
-            (1, seq_len, seq_len),
-            dtype=dtype,
+    seq_num = sdpa_start_loc.size(0) - 1
+    sdpa_start_loc = sdpa_start_loc.numpy()  # type: ignore
+    for i in range(seq_num):
+        seq_len = sdpa_start_loc[i + 1] - sdpa_start_loc[i]
+        mask = torch.full(  # type: ignore
+            (1, seq_len, seq_len),  # type: ignore
             fill_value=1,
+            dtype=dtype,
         )
-        shift = 0
-        mask = torch.tril(tensor, diagonal=shift).to(dtype)  # type: ignore
-        if window_size is not None:
-            mask = torch.triu(mask, diagonal=shift - window_size + 1)
+
+        if right_window_size != -1:
+            mask = torch.tril(mask, diagonal=right_window_size)
+        if left_window_size != -1:
+            mask = torch.triu(mask, diagonal=-left_window_size)
         mask = torch.log(mask)
-        attn_biases.append(mask.to(dtype))
+        attn_biases.append(mask)
 
     return attn_biases
 
 
-class _PagedAttention:
-
-    @staticmethod
-    def validate_head_size(head_size: int) -> tuple[bool, list[int]]:
-        SUPPORT_HS = [32, 64, 80, 96, 112, 128, 192, 256]
-        return head_size in SUPPORT_HS, SUPPORT_HS
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-        *args,
-    ) -> tuple[int, ...]:
-        return 2, num_blocks, block_size * num_kv_heads * head_size
-
-    @staticmethod
-    def split_kv_cache(
-        kv_cache: torch.Tensor,
-        num_kv_heads: int,
-        head_size: int,
-        *args,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        x = 16 // kv_cache.element_size()
-        num_blocks = kv_cache.shape[1]
-
-        key_cache = kv_cache[0]
-        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
-                                   -1, x)
-        value_cache = kv_cache[1]
-        value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
-        return key_cache, value_cache
-
-    @staticmethod
-    def write_to_paged_cache(
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache_dtype: str,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        ops.reshape_and_cache(
-            key,
-            value,
-            key_cache,
-            value_cache,
-            slot_mapping.flatten(),
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-
-    @staticmethod
-    def forward_decode(
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        block_tables: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        kv_cache_dtype: str,
-        num_kv_heads: int,
-        scale: float,
-        alibi_slopes: Optional[torch.Tensor],
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        tp_rank: int = 0
-        blocksparse_local_blocks: int = 0
-        blocksparse_vert_stride: int = 0
-        blocksparse_block_size: int = 64
-        blocksparse_head_sliding_step: int = 0
-        block_size = value_cache.shape[3]
-
-        ops.paged_attention_v1(
-            output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            block_tables,
-            context_lens,
-            block_size,
-            max_context_len,
-            alibi_slopes,
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-            tp_rank,
-            blocksparse_local_blocks,
-            blocksparse_vert_stride,
-            blocksparse_block_size,
-            blocksparse_head_sliding_step,
-        )
-
-
-class _IPEXPagedAttention(_PagedAttention):
-
-    @staticmethod
-    def validate_head_size(head_size: int) -> tuple[bool, list[int]]:
-        return True, []
-
-    @staticmethod
-    def split_kv_cache(
-        kv_cache: torch.Tensor,
-        num_kv_heads: int,
-        head_size: int,
-        *args,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        num_blocks = kv_cache.shape[1]
-
-        key_cache = kv_cache[0]
-        key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size)
-        value_cache = kv_cache[1]
-        value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size)
-        return key_cache, value_cache
-
-    @staticmethod
-    def write_to_paged_cache(
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache_dtype: str,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        ipex_modules.PagedAttention.reshape_and_cache(
-            key, value, key_cache, value_cache,
-            slot_mapping.flatten().int())
-
-    @staticmethod
-    def forward_decode(
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        block_tables: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        kv_cache_dtype: str,
-        num_kv_heads: int,
-        scale: float,
-        alibi_slopes: Optional[torch.Tensor],
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        block_size = value_cache.shape[2]
-        head_mapping = torch.arange(
-            0,
-            num_kv_heads,
-            device="cpu",
-            dtype=torch.int32,
-        ).view(num_kv_heads,
-               1).repeat_interleave(query.size(1) // num_kv_heads).flatten()
-        ipex_modules.PagedAttention.single_query_cached_kv_attention(
-            output, query.contiguous(), key_cache, value_cache, head_mapping,
-            scale, block_tables, context_lens, block_size, max_context_len,
-            alibi_slopes)
-
-
-def _get_paged_attn_impl():
-    if _use_ipex:
-        return _IPEXPagedAttention
+def _get_attn_isa(dtype: torch.dtype, block_size: int) -> str:
+    supports_amx = torch._C._cpu._is_amx_tile_supported()
+    if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0:
+        return "amx"
+    elif block_size % 32 == 0:
+        return "vec"
     else:
-        return _PagedAttention
+        return "vec16"
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index f0770f744146a917f730ab3f79e25d253578a4a3..a5d4435000d4dbaa88dbe5c0c7d1af8b8adbfbfa 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -1,75 +1,87 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
+
 from dataclasses import dataclass
-from typing import Optional
+from typing import ClassVar
 
 import numpy as np
 import torch
 
 from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+    is_quantized_kv_cache,
+)
 from vllm.attention.layer import Attention
+from vllm.attention.ops.common import cp_lse_ag_out_rs
 from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
-                                           get_flash_attn_version,
-                                           is_flash_attn_varlen_func_available)
+from vllm.attention.utils.fa_utils import (
+    flash_attn_supports_fp8,
+    get_flash_attn_version,
+    is_flash_attn_varlen_func_available,
+)
 
 if is_flash_attn_varlen_func_available():
-    from vllm.attention.utils.fa_utils import (flash_attn_varlen_func,
-                                               get_scheduler_metadata,
-                                               reshape_and_cache_flash)
-
+    from vllm.attention.utils.fa_utils import (
+        flash_attn_supports_sinks,
+        flash_attn_varlen_func,
+        get_scheduler_metadata,
+        reshape_and_cache_flash,
+    )
 from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.config.cache import CacheDType
+from vllm.distributed.parallel_state import get_dcp_group
 from vllm.logger import init_logger
-from vllm.utils import cdiv
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              get_kv_cache_layout)
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms.interface import DeviceCapability
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    get_dcp_local_seq_lens,
+    get_kv_cache_layout,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
 
 
 class FlashAttentionBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
-    supports_quant_query_input: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
-
-    @classmethod
-    def get_supported_head_sizes(cls) -> list[int]:
-        return [32, 64, 96, 128, 160, 192, 224, 256]
-
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    # NOTE(tdoublep): while in principle, FA supports
+    # MultipleOf(16), these are the block sizes that do not
+    # suffer from the NaN propagation problem described here:
+    # https://github.com/Dao-AILab/flash-attention/issues/1974
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64]
 
     @staticmethod
     def get_name() -> str:
         return "FLASH_ATTN"
 
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """FlashAttention supports all attention types."""
+        from vllm.attention import AttentionType
+
+        return attn_type in (
+            AttentionType.DECODER,
+            AttentionType.ENCODER,
+            AttentionType.ENCODER_ONLY,
+            AttentionType.ENCODER_DECODER,
+        )
+
     @staticmethod
     def get_impl_cls() -> type["FlashAttentionImpl"]:
         return FlashAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return FlashAttentionMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]:
         return FlashAttentionMetadataBuilder
@@ -106,6 +118,44 @@ class FlashAttentionBackend(AttentionBackend):
         else:
             raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
 
+    @classmethod
+    def get_supported_head_sizes(cls) -> list[int]:
+        return [32, 64, 96, 128, 160, 192, 224, 256]
+
+    @classmethod
+    def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool:
+        if kv_cache_dtype is None:
+            return True
+        if kv_cache_dtype.startswith("fp8"):
+            return flash_attn_supports_fp8()
+        return kv_cache_dtype in ["auto"]
+
+    @classmethod
+    def supports_sink(cls) -> bool:
+        if not is_flash_attn_varlen_func_available():
+            return False
+        return flash_attn_supports_sinks()
+
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability >= DeviceCapability(8, 0)
+
+    @classmethod
+    def supports_combination(
+        cls,
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: CacheDType | None,
+        block_size: int,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        device_capability: DeviceCapability,
+    ) -> str | None:
+        if has_sink and device_capability < DeviceCapability(9, 0):
+            return "sink not supported on compute capability < 9.0"
+        return None
+
 
 @dataclass
 class FlashAttentionMetadata:
@@ -128,22 +178,27 @@ class FlashAttentionMetadata:
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
+
+    # For GQA DCP
+    max_dcp_context_kv_len: int | None = None
+    dcp_context_kv_lens: torch.Tensor | None = None
 
     # Optional aot scheduling
-    scheduler_metadata: Optional[torch.Tensor] = None
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
+    prefix_scheduler_metadata: torch.Tensor | None = None
     max_num_splits: int = 0
 
     causal: bool = True
 
 
 def _get_sliding_window_configs(
-        vllm_config: VllmConfig) -> set[Optional[tuple[int, int]]]:
+    vllm_config: VllmConfig,
+) -> set[tuple[int, int] | None]:
     """Get the set of all sliding window configs used in the model."""
-    sliding_window_configs: set[Optional[tuple[int, int]]] = set()
+    sliding_window_configs: set[tuple[int, int] | None] = set()
     layers = get_layers_from_vllm_config(vllm_config, Attention)
     for layer in layers.values():
         assert isinstance(layer.impl, FlashAttentionImpl)
@@ -151,8 +206,7 @@ def _get_sliding_window_configs(
     return sliding_window_configs
 
 
-class FlashAttentionMetadataBuilder(
-        AttentionMetadataBuilder[FlashAttentionMetadata]):
+class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetadata]):
     # FA3:
     # Supports full cudagraphs for all cases.
     #
@@ -171,11 +225,19 @@ class FlashAttentionMetadataBuilder(
     # to FULL_AND_PIECEWISE.
     # TODO(luka, lucas): audit FA2 as part of:
     #  https://github.com/vllm-project/vllm/issues/22945
-    cudagraph_support = AttentionCGSupport.ALWAYS \
-        if get_flash_attn_version() == 3 else AttentionCGSupport.UNIFORM_BATCH
+    _cudagraph_support = (
+        AttentionCGSupport.ALWAYS
+        if get_flash_attn_version() == 3
+        else AttentionCGSupport.UNIFORM_BATCH
+    )
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.model_config = vllm_config.model_config
         self.parallel_config = vllm_config.parallel_config
@@ -183,28 +245,36 @@ class FlashAttentionMetadataBuilder(
         self.compilation_config = vllm_config.compilation_config
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
-            self.parallel_config)
-        self.num_heads_kv = self.model_config.get_num_kv_heads(
-            self.parallel_config)
+            self.parallel_config
+        )
+        self.num_heads_kv = self.model_config.get_num_kv_heads(self.parallel_config)
         self.kv_cache_dtype = kv_cache_spec.dtype
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
 
         self.max_num_splits = 0  # No upper bound on the number of splits.
-        self.aot_schedule = (get_flash_attn_version() == 3)
+        self.aot_schedule = get_flash_attn_version() == 3
+
+        try:
+            from vllm.distributed.parallel_state import get_dcp_group
+
+            self.dcp_world_size = get_dcp_group().world_size
+            self.dcp_rank = get_dcp_group().rank_in_group
+        except AssertionError:
+            # DCP might not be initialized in testing
+            self.dcp_world_size = 1
+            self.dcp_rank = 0
 
-        self.use_full_cuda_graph = \
+        self.dcp_kv_cache_interleave_size = (
+            self.parallel_config.dcp_kv_cache_interleave_size
+        )
+
+        self.use_full_cuda_graph = (
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
-        self.max_cudagraph_size = self.compilation_config.max_capture_size
+        )
+        self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for "
-                    "full cuda graph.")
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
@@ -213,19 +283,20 @@ class FlashAttentionMetadataBuilder(
             # When using cuda graph, we need to set the upper bound of the
             # number of splits so that large enough intermediate buffers are
             # pre-allocated during capture.
-            self.max_num_splits = (
-                envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
+            self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
 
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
-        self.aot_sliding_window: Optional[tuple[int, int]] = None
+        self.aot_sliding_window: tuple[int, int] | None = None
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> FlashAttentionMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> FlashAttentionMetadata:
         """
-        fast_build disables AOT scheduling, used when there will be few 
+        fast_build disables AOT scheduling, used when there will be few
         iterations i.e. spec-decode
         """
         num_reqs = common_attn_metadata.num_reqs
@@ -249,8 +320,7 @@ class FlashAttentionMetadataBuilder(
             # build() call so the layers are constructed (cannot populate)
             # in __init__.
             if aot_schedule:
-                sliding_window_configs = _get_sliding_window_configs(
-                    self.vllm_config)
+                sliding_window_configs = _get_sliding_window_configs(self.vllm_config)
                 if len(sliding_window_configs) == 1:
                     sliding_window_config = sliding_window_configs.pop()
                     if sliding_window_config is not None:
@@ -260,20 +330,24 @@ class FlashAttentionMetadataBuilder(
                     aot_schedule = False
 
         max_num_splits = 0  # 0 means use FA3's heuristics, not CG compatible
-        if self.use_full_cuda_graph and \
-            num_actual_tokens <= self.max_cudagraph_size:
+        if self.use_full_cuda_graph and num_actual_tokens <= self.max_cudagraph_size:
             # NOTE(woosuk): Setting num_splits > 1 may increase the memory
             # usage, because the intermediate buffers of size [num_splits,
             # num_heads, num_tokens, head_size] are allocated. Therefore,
             # we only set num_splits when using cuda graphs.
             max_num_splits = self.max_num_splits
 
-        def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
-                     max_seq_len, causal):
+        if vllm_is_batch_invariant():
+            max_num_splits = 1
+
+        def schedule(
+            batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal
+        ):
             cache_dtype = self.cache_config.cache_dtype
             if cache_dtype.startswith("fp8"):
                 qkv_dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
-                    cache_dtype)
+                    cache_dtype
+                )
             else:
                 qkv_dtype = self.kv_cache_dtype
             if aot_schedule:
@@ -281,7 +355,7 @@ class FlashAttentionMetadataBuilder(
                     batch_size=batch_size,
                     max_seqlen_q=max_query_len,
                     max_seqlen_k=max_seq_len,
-                    num_heads_q=self.num_heads_q,
+                    num_heads_q=self.num_heads_q * self.dcp_world_size,
                     num_heads_kv=self.num_heads_kv,
                     headdim=self.headdim,
                     cache_seqlens=seqlens,
@@ -295,41 +369,73 @@ class FlashAttentionMetadataBuilder(
             return None
 
         use_cascade = common_prefix_len > 0
+        max_dcp_context_kv_len = 0
+        dcp_context_kv_lens = None
+
+        cu_prefix_query_lens = None
+        prefix_kv_lens = None
+        suffix_kv_lens = None
+        prefix_scheduler_metadata = None
+
+        if self.dcp_world_size > 1:
+            query_kv_lens_cpu = (
+                common_attn_metadata.query_start_loc_cpu[1:]
+                - common_attn_metadata.query_start_loc_cpu[:-1]
+            )
+            dcp_context_kv_lens_cpu = seq_lens_cpu - query_kv_lens_cpu
 
-        if use_cascade:
-            cu_prefix_query_lens = torch.tensor([0, num_actual_tokens],
-                                                dtype=torch.int32,
-                                                device=self.device)
-            prefix_kv_lens = torch.tensor([common_prefix_len],
-                                          dtype=torch.int32,
-                                          device=self.device)
+            dcp_context_kv_lens_cpu = get_dcp_local_seq_lens(
+                dcp_context_kv_lens_cpu,
+                self.dcp_world_size,
+                self.dcp_rank,
+                self.dcp_kv_cache_interleave_size,
+            )
+            dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device)
+            max_dcp_context_kv_len = dcp_context_kv_lens.max().item()
+
+            scheduler_metadata = schedule(
+                batch_size=num_reqs,
+                cu_query_lens=query_start_loc,
+                max_query_len=max_query_len,
+                seqlens=dcp_context_kv_lens,
+                max_seq_len=max_dcp_context_kv_len,
+                causal=False,
+            )
+        elif use_cascade:
+            cu_prefix_query_lens = torch.tensor(
+                [0, num_actual_tokens], dtype=torch.int32, device=self.device
+            )
+            prefix_kv_lens = torch.tensor(
+                [common_prefix_len], dtype=torch.int32, device=self.device
+            )
             suffix_kv_lens = (seq_lens_cpu[:num_reqs] - common_prefix_len).to(
-                self.device, non_blocking=True)
+                self.device, non_blocking=True
+            )
             prefix_scheduler_metadata = schedule(
                 batch_size=1,
                 cu_query_lens=cu_prefix_query_lens,
                 max_query_len=num_actual_tokens,
                 seqlens=prefix_kv_lens,
                 max_seq_len=common_prefix_len,
-                causal=False)
-            scheduler_metadata = schedule(batch_size=num_reqs,
-                                          cu_query_lens=query_start_loc,
-                                          max_query_len=max_query_len,
-                                          seqlens=suffix_kv_lens,
-                                          max_seq_len=max_seq_len -
-                                          common_prefix_len,
-                                          causal=True)
+                causal=False,
+            )
+            scheduler_metadata = schedule(
+                batch_size=num_reqs,
+                cu_query_lens=query_start_loc,
+                max_query_len=max_query_len,
+                seqlens=suffix_kv_lens,
+                max_seq_len=max_seq_len - common_prefix_len,
+                causal=True,
+            )
         else:
-            cu_prefix_query_lens = None
-            prefix_kv_lens = None
-            suffix_kv_lens = None
-            prefix_scheduler_metadata = None
-            scheduler_metadata = schedule(batch_size=num_reqs,
-                                          cu_query_lens=query_start_loc,
-                                          max_query_len=max_query_len,
-                                          seqlens=seq_lens,
-                                          max_seq_len=max_seq_len,
-                                          causal=causal)
+            scheduler_metadata = schedule(
+                batch_size=num_reqs,
+                cu_query_lens=query_start_loc,
+                max_query_len=max_query_len,
+                seqlens=seq_lens,
+                max_seq_len=max_seq_len,
+                causal=causal,
+            )
         # For FA3 + full cudagraph
         if self.use_full_cuda_graph and scheduler_metadata is not None:
             n = scheduler_metadata.shape[0]
@@ -349,6 +455,8 @@ class FlashAttentionMetadataBuilder(
             seq_lens=seq_lens,
             block_table=block_table_tensor,
             slot_mapping=slot_mapping,
+            max_dcp_context_kv_len=max_dcp_context_kv_len,
+            dcp_context_kv_lens=dcp_context_kv_lens,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,
             scheduler_metadata=scheduler_metadata,
@@ -357,7 +465,8 @@ class FlashAttentionMetadataBuilder(
             suffix_kv_lens=suffix_kv_lens,
             prefix_scheduler_metadata=prefix_scheduler_metadata,
             max_num_splits=max_num_splits,
-            causal=causal)
+            causal=causal,
+        )
         return attn_metadata
 
     def use_cascade_attention(self, *args, **kwargs) -> bool:
@@ -365,6 +474,7 @@ class FlashAttentionMetadataBuilder(
 
 
 class FlashAttentionImpl(AttentionImpl):
+    can_return_lse_for_decode: bool = True
 
     def __init__(
         self,
@@ -372,13 +482,13 @@ class FlashAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: str | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -402,22 +512,28 @@ class FlashAttentionImpl(AttentionImpl):
 
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        FlashAttentionBackend.validate_head_size(head_size)
-
         self.attn_type = attn_type
         self.vllm_flash_attn_version = get_flash_attn_version()
-        if is_quantized_kv_cache(self.kv_cache_dtype) \
-            and not flash_attn_supports_fp8():
+        # Cache the batch invariant result for use in forward passes
+        self.batch_invariant_enabled = vllm_is_batch_invariant()
+
+        if is_quantized_kv_cache(self.kv_cache_dtype) and not flash_attn_supports_fp8():
             raise NotImplementedError(
-                "FlashAttention does not support fp8 kv-cache on this device.")
+                "FlashAttention does not support fp8 kv-cache on this device."
+            )
 
         self.sinks = sinks
         if self.sinks is not None:
-            assert self.vllm_flash_attn_version == 3, (
-                "Sinks are only supported in FlashAttention 3")
+            assert flash_attn_supports_sinks(), (
+                "Sinks are only supported in FlashAttention 3"
+            )
             assert self.sinks.shape[0] == num_heads, (
                 "Sinks must have the same number of heads as the number of "
-                "heads in the layer")
+                "heads in the layer"
+            )
+
+    def supports_quant_query_input(self) -> bool:
+        return True
 
     def forward(
         self,
@@ -427,9 +543,9 @@ class FlashAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
@@ -450,12 +566,12 @@ class FlashAttentionImpl(AttentionImpl):
 
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for FlashAttentionImpl")
+                "fused output quantization is not yet supported for FlashAttentionImpl"
+            )
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         attn_type = self.attn_type
 
@@ -474,11 +590,14 @@ class FlashAttentionImpl(AttentionImpl):
         if attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
             # For encoder attention,
             # we use direct Q, K, V tensors without caching
-            return self._forward_encoder_attention(query[:num_actual_tokens],
-                                                   key[:num_actual_tokens],
-                                                   value[:num_actual_tokens],
-                                                   output[:num_actual_tokens],
-                                                   attn_metadata, layer)
+            return self._forward_encoder_attention(
+                query[:num_actual_tokens],
+                key[:num_actual_tokens],
+                value[:num_actual_tokens],
+                output[:num_actual_tokens],
+                attn_metadata,
+                layer,
+            )
 
         # For decoder and cross-attention, use KV cache as before
         key_cache, value_cache = kv_cache.unbind(0)
@@ -486,8 +605,11 @@ class FlashAttentionImpl(AttentionImpl):
         # key and value may be None in the case of cross attention. They are
         # calculated once based on the output from the encoder and then cached
         # in KV cache.
-        if (self.kv_sharing_target_layer_name is None and key is not None
-                and value is not None):
+        if (
+            self.kv_sharing_target_layer_name is None
+            and key is not None
+            and value is not None
+        ):
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
             # NOTE(woosuk): Here, key and value are padded while slot_mapping is
@@ -509,7 +631,8 @@ class FlashAttentionImpl(AttentionImpl):
         if self.kv_cache_dtype.startswith("fp8"):
             # queries are quantized in the attention layer
             dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
-                self.kv_cache_dtype)
+                self.kv_cache_dtype
+            )
             key_cache = key_cache.view(dtype)
             value_cache = value_cache.view(dtype)
 
@@ -523,30 +646,45 @@ class FlashAttentionImpl(AttentionImpl):
 
             descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads)
 
-            flash_attn_varlen_func(
-                q=query[:num_actual_tokens],
-                k=key_cache,
-                v=value_cache,
-                out=output[:num_actual_tokens],
-                cu_seqlens_q=cu_seqlens_q,
-                max_seqlen_q=max_seqlen_q,
-                seqused_k=seqused_k,
-                max_seqlen_k=max_seqlen_k,
-                softmax_scale=self.scale,
-                causal=attn_metadata.causal,
-                alibi_slopes=self.alibi_slopes,
-                window_size=self.sliding_window,
-                block_table=block_table,
-                softcap=self.logits_soft_cap,
-                scheduler_metadata=scheduler_metadata,
-                fa_version=self.vllm_flash_attn_version,
-                q_descale=layer._q_scale.expand(descale_shape),
-                k_descale=layer._k_scale.expand(descale_shape),
-                v_descale=layer._v_scale.expand(descale_shape),
-                num_splits=attn_metadata.max_num_splits,
-                s_aux=self.sinks,
-            )
-            return output
+            if self.dcp_world_size > 1:
+                self._forward_with_dcp(
+                    query[:num_actual_tokens],
+                    key[:num_actual_tokens],
+                    value[:num_actual_tokens],
+                    key_cache,
+                    value_cache,
+                    output[:num_actual_tokens],
+                    attn_metadata,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
+                )
+                return output
+            else:
+                flash_attn_varlen_func(
+                    q=query[:num_actual_tokens],
+                    k=key_cache,
+                    v=value_cache,
+                    out=output[:num_actual_tokens],
+                    cu_seqlens_q=cu_seqlens_q,
+                    max_seqlen_q=max_seqlen_q,
+                    seqused_k=seqused_k,
+                    max_seqlen_k=max_seqlen_k,
+                    softmax_scale=self.scale,
+                    causal=attn_metadata.causal,
+                    alibi_slopes=self.alibi_slopes,
+                    window_size=self.sliding_window,
+                    block_table=block_table,
+                    softcap=self.logits_soft_cap,
+                    scheduler_metadata=scheduler_metadata,
+                    fa_version=self.vllm_flash_attn_version,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
+                    num_splits=attn_metadata.max_num_splits,
+                    s_aux=self.sinks,
+                )
+                return output
 
         # Cascade attention (rare case).
         cascade_attention(
@@ -566,15 +704,97 @@ class FlashAttentionImpl(AttentionImpl):
             logits_soft_cap=self.logits_soft_cap,
             block_table=attn_metadata.block_table,
             common_prefix_len=attn_metadata.common_prefix_len,
+            max_num_splits=attn_metadata.max_num_splits,
             fa_version=self.vllm_flash_attn_version,
             prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata,
             suffix_scheduler_metadata=attn_metadata.scheduler_metadata,
             q_descale=layer._q_scale,
             k_descale=layer._k_scale,
             v_descale=layer._v_scale,
+            s_aux=self.sinks,
         )
         return output
 
+    def _forward_with_dcp(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        output: torch.Tensor,
+        attn_metadata: FlashAttentionMetadata,
+        q_descale: torch.Tensor | None = None,
+        k_descale: torch.Tensor | None = None,
+        v_descale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        cu_seqlens_q = attn_metadata.query_start_loc
+        max_seqlen_q = attn_metadata.max_query_len
+        block_table = attn_metadata.block_table
+
+        query = query.contiguous()
+        query_across_dcp = get_dcp_group().all_gather(query, dim=1)
+        context_attn_out, context_lse = flash_attn_varlen_func(
+            q=query_across_dcp,
+            k=key_cache,
+            v=value_cache,
+            out=None,
+            cu_seqlens_q=cu_seqlens_q,
+            max_seqlen_q=max_seqlen_q,
+            seqused_k=attn_metadata.dcp_context_kv_lens,
+            max_seqlen_k=attn_metadata.max_dcp_context_kv_len,
+            softmax_scale=self.scale,
+            causal=False,
+            alibi_slopes=self.alibi_slopes,
+            window_size=self.sliding_window,
+            block_table=block_table,
+            softcap=self.logits_soft_cap,
+            return_softmax_lse=True,
+            scheduler_metadata=attn_metadata.scheduler_metadata,
+            fa_version=self.vllm_flash_attn_version,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+        )
+        # FA returns LSE in shape [ H, B ] but cp_lse_ag_out_rs wants [ B, H ]
+        context_attn_out_cor, context_lse_cor = cp_lse_ag_out_rs(
+            context_attn_out,
+            context_lse.transpose(0, 1),
+            get_dcp_group(),
+            return_lse=True,
+        )
+        context_lse_cor = context_lse_cor.transpose(0, 1).contiguous()
+
+        query_attn_out, query_lse = flash_attn_varlen_func(
+            q=query,
+            k=key,
+            v=value,
+            out=None,
+            cu_seqlens_q=cu_seqlens_q,
+            max_seqlen_q=max_seqlen_q,
+            cu_seqlens_k=cu_seqlens_q,
+            max_seqlen_k=max_seqlen_q,
+            softmax_scale=self.scale,
+            causal=attn_metadata.causal,
+            alibi_slopes=self.alibi_slopes,
+            window_size=self.sliding_window,
+            softcap=self.logits_soft_cap,
+            return_softmax_lse=True,
+            fa_version=self.vllm_flash_attn_version,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+        )
+        assert context_attn_out_cor.shape == query_attn_out.shape
+        assert context_lse_cor.shape == query_lse.shape
+        merge_attn_states(
+            output,
+            context_attn_out_cor,
+            context_lse_cor,
+            query_attn_out,
+            query_lse,
+        )
+
     def _forward_encoder_attention(
         self,
         query: torch.Tensor,
@@ -597,7 +817,8 @@ class FlashAttentionImpl(AttentionImpl):
         # For encoder attention, process FP8 quantization if needed
         if self.kv_cache_dtype.startswith("fp8"):
             raise NotImplementedError(
-                "quantization is not supported for encoder attention")
+                "quantization is not supported for encoder attention"
+            )
 
         # Use encoder-specific metadata for sequence information
         cu_seqlens_q = attn_metadata.query_start_loc
@@ -607,7 +828,8 @@ class FlashAttentionImpl(AttentionImpl):
 
         descale_shape = (
             cu_seqlens_q.shape[0] - 1,  # type: ignore[union-attr]
-            self.num_kv_heads)
+            self.num_kv_heads,
+        )
 
         # Call flash attention directly on Q, K, V tensors
         flash_attn_varlen_func(
@@ -628,6 +850,7 @@ class FlashAttentionImpl(AttentionImpl):
             q_descale=layer._q_scale.expand(descale_shape),
             k_descale=layer._k_scale.expand(descale_shape),
             v_descale=layer._v_scale.expand(descale_shape),
+            num_splits=1 if self.batch_invariant_enabled else 0,
         )
 
         return output
@@ -642,6 +865,7 @@ def use_cascade_attention(
     use_sliding_window: bool,
     use_local_attention: bool,
     num_sms: int,
+    dcp_world_size: int,
 ) -> bool:
     """Decide whether to use cascade attention.
 
@@ -663,6 +887,9 @@ def use_cascade_attention(
     num_reqs = len(query_lens)
     if num_reqs < 8:
         return False
+    # disable cascade attention for DCP
+    if dcp_world_size > 1:
+        return False
 
     # Heuristics to decide whether using cascade attention is beneficial.
     # 1. When FlashDecoding is not used for normal attention, cascade attention
@@ -670,8 +897,12 @@ def use_cascade_attention(
     num_queries_per_kv = num_query_heads // num_kv_heads
     # The criteria for using FlashDecoding can be found in the following link:
     # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535
-    use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window
-                          and not use_alibi and np.all(query_lens == 1))
+    use_flash_decoding = (
+        num_queries_per_kv > 1
+        and not use_sliding_window
+        and not use_alibi
+        and np.all(query_lens == 1)
+    )
     if not use_flash_decoding:
         # Use cascade attention.
         return True
@@ -693,8 +924,9 @@ def use_cascade_attention(
     cascade_waves = cdiv(cascade_ctas, num_sms)
     cascade_time = cascade_waves * num_prefix_tiles
 
-    flash_decoding_ctas = (num_reqs * num_kv_heads *
-                           cdiv(num_queries_per_kv, q_tile_size))
+    flash_decoding_ctas = (
+        num_reqs * num_kv_heads * cdiv(num_queries_per_kv, q_tile_size)
+    )
     flash_decoding_ctas *= num_prefix_tiles
     flash_decoding_time = cdiv(flash_decoding_ctas, num_sms)
 
@@ -714,22 +946,25 @@ def cascade_attention(
     suffix_kv_lens: torch.Tensor,
     max_kv_len: int,
     softmax_scale: float,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     sliding_window: tuple[int, int],
     logits_soft_cap: float,
     block_table: torch.Tensor,
     common_prefix_len: int,
+    max_num_splits: int,
     fa_version: int,
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None,
-    suffix_scheduler_metadata: Optional[torch.Tensor] = None,
-    q_descale: Optional[torch.Tensor] = None,
-    k_descale: Optional[torch.Tensor] = None,
-    v_descale: Optional[torch.Tensor] = None,
+    prefix_scheduler_metadata: torch.Tensor | None = None,
+    suffix_scheduler_metadata: torch.Tensor | None = None,
+    q_descale: torch.Tensor | None = None,
+    k_descale: torch.Tensor | None = None,
+    v_descale: torch.Tensor | None = None,
+    s_aux: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    assert alibi_slopes is None, ("Cascade attention does not support ALiBi.")
+    assert alibi_slopes is None, "Cascade attention does not support ALiBi."
     # TODO: Support sliding window.
     assert sliding_window == (-1, -1), (
-        "Cascade attention does not support sliding window.")
+        "Cascade attention does not support sliding window."
+    )
 
     num_tokens = query.shape[0]
     block_size = key_cache.shape[-3]
@@ -755,12 +990,13 @@ def cascade_attention(
         return_softmax_lse=True,
         scheduler_metadata=prefix_scheduler_metadata,
         fa_version=fa_version,
-        q_descale=q_descale.expand(descale_shape)
-        if q_descale is not None else None,
-        k_descale=k_descale.expand(descale_shape)
-        if k_descale is not None else None,
-        v_descale=v_descale.expand(descale_shape)
-        if v_descale is not None else None,
+        q_descale=q_descale.expand(descale_shape) if q_descale is not None else None,
+        k_descale=k_descale.expand(descale_shape) if k_descale is not None else None,
+        v_descale=v_descale.expand(descale_shape) if v_descale is not None else None,
+        # s_aux is incorporated into prefix_lse inside the GPU kernel,
+        # enabling its effect during the final attention merge.
+        s_aux=s_aux,
+        num_splits=1 if vllm_is_batch_invariant() else max_num_splits,
     )
 
     descale_shape = (cu_query_lens.shape[0] - 1, key_cache.shape[-2])
@@ -782,14 +1018,11 @@ def cascade_attention(
         return_softmax_lse=True,
         scheduler_metadata=suffix_scheduler_metadata,
         fa_version=fa_version,
-        q_descale=q_descale.expand(descale_shape)
-        if q_descale is not None else None,
-        k_descale=k_descale.expand(descale_shape)
-        if k_descale is not None else None,
-        v_descale=v_descale.expand(descale_shape)
-        if v_descale is not None else None,
+        q_descale=q_descale.expand(descale_shape) if q_descale is not None else None,
+        k_descale=k_descale.expand(descale_shape) if k_descale is not None else None,
+        v_descale=v_descale.expand(descale_shape) if v_descale is not None else None,
+        num_splits=1 if vllm_is_batch_invariant() else max_num_splits,
     )
 
     # Merge prefix and suffix outputs, and store the result in output.
-    merge_attn_states(output, prefix_output, prefix_lse, suffix_output,
-                      suffix_lse)
+    merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse)
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
old mode 100755
new mode 100644
index 15a252734d4dd9d2bf80113b400bc80a50cff724..4da1637d96eb6ebafdb7d5209f6544a563a44365
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1,47 +1,68 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashInfer."""
-from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import numpy as np
 import torch
-from flashinfer import (BatchDecodeWithPagedKVCacheWrapper,
-                        BatchPrefillWithPagedKVCacheWrapper,
-                        MultiLevelCascadeAttentionWrapper)
+from flashinfer import (
+    BatchDecodeWithPagedKVCacheWrapper,
+    BatchPrefillWithPagedKVCacheWrapper,
+    BatchPrefillWithRaggedKVCacheWrapper,
+    MultiLevelCascadeAttentionWrapper,
+)
 from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache
 from flashinfer.prefill import trtllm_batch_context_with_kv_cache
 from flashinfer.utils import FP4Tensor
-
-from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionType)
+from typing_extensions import override
+
+from vllm import envs
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
+from vllm.attention.ops.common import cp_lse_ag_out_rs
+from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.config.cache import CacheDType
+from vllm.distributed.parallel_state import get_dcp_group
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym, kNvfp4Quant)
+    QuantKey,
+    kFp8StaticTensorSym,
+    kNvfp4Quant,
+)
 from vllm.platforms import current_platform
+from vllm.platforms.interface import DeviceCapability
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv, is_pin_memory_available
-from vllm.utils.flashinfer import (can_use_trtllm_attention,
-                                   flashinfer_disable_q_quantization,
-                                   supports_trtllm_attention,
-                                   use_trtllm_attention)
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              get_kv_cache_layout,
-                                              get_per_layer_parameters,
-                                              infer_global_hyperparameters,
-                                              split_decodes_and_prefills)
-# yapf: enable
+from vllm.utils.flashinfer import (
+    can_use_trtllm_attention,
+    flashinfer_disable_q_quantization,
+    use_trtllm_attention,
+)
+from vllm.utils.math_utils import cdiv
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    KVCacheLayoutType,
+    get_dcp_local_seq_lens,
+    get_kv_cache_layout,
+    get_per_layer_parameters,
+    infer_global_hyperparameters,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
-FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
+FLASHINFER_WORKSPACE_BUFFER_SIZE_BATCH_INVARIANT = 2048 * 1024 * 1024
 
 FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
@@ -55,7 +76,8 @@ def _get_trtllm_gen_workspace_buffer():
     global trtllm_gen_workspace_buffer
     if trtllm_gen_workspace_buffer is None:
         trtllm_gen_workspace_buffer = torch.zeros(
-            FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device='cuda')
+            envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device="cuda"
+        )
     return trtllm_gen_workspace_buffer
 
 
@@ -72,9 +94,9 @@ def _trtllm_prefill_attn_kvfp8_dequant(
 ):
     batch_idx = tl.program_id(0).to(tl.int64)
     mock_block_table_idx = tl.program_id(1).to(tl.int64)
-    orig_page_num = tl.load(block_tables_prefill_ptr +
-                            batch_idx * block_table_stride +
-                            mock_block_table_idx).to(tl.int64)
+    orig_page_num = tl.load(
+        block_tables_prefill_ptr + batch_idx * block_table_stride + mock_block_table_idx
+    ).to(tl.int64)
     if orig_page_num <= 0:
         return
     dequant_dtype = mock_kv_cache_ptr.dtype.element_ty
@@ -84,20 +106,24 @@ def _trtllm_prefill_attn_kvfp8_dequant(
     offset = orig_page_num * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)
     fp8_vals = tl.load(kv_cache_ptr + offset)
     dequantized_vals = fp8_vals.to(tl.float32) * k_scale_val
-    mock_cache_offset = (batch_idx * block_table_stride + mock_block_table_idx
-                         + 1) * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)
+    mock_cache_offset = (
+        batch_idx * block_table_stride + mock_block_table_idx + 1
+    ) * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)
     dequantized_vals = dequantized_vals.to(dequant_dtype)
     tl.store(mock_kv_cache_ptr + mock_cache_offset, dequantized_vals)
 
     # Dequantize V
     v_scale_val = tl.load(v_scale_ptr)
-    offset = (orig_page_num * KV_CACHE_STRIDE + K_CACHE_STRIDE +
-              tl.arange(0, K_CACHE_STRIDE))
+    offset = (
+        orig_page_num * KV_CACHE_STRIDE + K_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)
+    )
     fp8_vals = tl.load(kv_cache_ptr + offset)
     dequantized_vals = fp8_vals.to(tl.float32) * v_scale_val
     mock_cache_offset = (
-        (batch_idx * block_table_stride + mock_block_table_idx + 1) *
-        KV_CACHE_STRIDE + K_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE))
+        (batch_idx * block_table_stride + mock_block_table_idx + 1) * KV_CACHE_STRIDE
+        + K_CACHE_STRIDE
+        + tl.arange(0, K_CACHE_STRIDE)
+    )
     dequantized_vals = dequantized_vals.to(dequant_dtype)
     tl.store(mock_kv_cache_ptr + mock_cache_offset, dequantized_vals)
 
@@ -117,9 +143,7 @@ def trtllm_prefill_attn_kvfp8_dequant(
     kv_cache_stride = k_cache_stride * s[1]
     new_s = (batch_size * num_of_page_per_token + 1, s[1], s[2], s[3], s[4])
     # mock kv cache contains just the pages needed by this prefill
-    mock_kv_cache = torch.empty(new_s,
-                                dtype=dequant_dtype,
-                                device=kv_cache.device)
+    mock_kv_cache = torch.empty(new_s, dtype=dequant_dtype, device=kv_cache.device)
     # we simply sequentially index the pages needed by this prefill
     mock_block_table = torch.arange(
         start=1,
@@ -141,43 +165,137 @@ def trtllm_prefill_attn_kvfp8_dequant(
     return mock_kv_cache, mock_block_table
 
 
-class FlashInferBackend(AttentionBackend):
-    accept_output_buffer: bool = True
+class BatchDCPPrefillWrapper:
+    def __init__(
+        self,
+        workspace_buffer: torch.Tensor | None = None,
+    ):
+        self._context = BatchPrefillWithPagedKVCacheWrapper(
+            workspace_buffer, get_kv_cache_layout()
+        )
+        self._new_tokens = BatchPrefillWithRaggedKVCacheWrapper(
+            workspace_buffer, get_kv_cache_layout()
+        )
 
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
+    def plan(
+        self,
+        qo_indptr_cpu: torch.Tensor,
+        paged_kv_indptr_cpu: torch.Tensor,
+        paged_kv_indices: torch.Tensor,
+        paged_kv_last_page_len_cpu: torch.Tensor,
+        prefill_start: int,
+        page_size: int,
+        num_qo_heads: int,
+        dcp_world_size: int,
+        num_kv_heads: int,
+        head_dim: int,
+        sm_scale: float,
+        window_left: int,
+        logits_soft_cap: float | None,
+        q_data_type: torch.dtype,
+        kv_cache_dtype: torch.dtype,
+        prefill_fixed_split_size: int,
+        disable_split_kv: bool,
+    ):
+        """Plan the prefill operation with given parameters."""
+        self._context.plan(
+            qo_indptr_cpu,
+            paged_kv_indptr_cpu,
+            paged_kv_indices,
+            paged_kv_last_page_len_cpu[prefill_start:],
+            num_qo_heads * dcp_world_size,
+            num_kv_heads,
+            head_dim,
+            page_size,
+            causal=False,  # This is context run
+            sm_scale=sm_scale,
+            window_left=window_left,
+            logits_soft_cap=logits_soft_cap,
+            q_data_type=q_data_type,
+            kv_data_type=kv_cache_dtype,
+            fixed_split_size=prefill_fixed_split_size,
+            disable_split_kv=disable_split_kv,
+        )
+        self._new_tokens.plan(
+            qo_indptr=qo_indptr_cpu,
+            kv_indptr=qo_indptr_cpu,
+            num_qo_heads=num_qo_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim_qk=head_dim,
+            head_dim_vo=head_dim,
+            causal=True,  # This is newtokens run
+            sm_scale=sm_scale,
+            window_left=window_left,
+            logits_soft_cap=logits_soft_cap,
+            q_data_type=q_data_type,
+        )
 
-    @classmethod
-    def get_supported_head_sizes(cls) -> list[int]:
-        # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
-        return [64, 128, 256]
+    def run(
+        self,
+        layer: torch.nn.Module,
+        prefill_query: torch.Tensor,
+        kv_cache_permute: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        out: torch.Tensor,
+    ):
+        prefill_query_across_dcp = get_dcp_group().all_gather(
+            prefill_query.contiguous(), dim=1
+        )
+        output_context_tmp, lse_context_tmp = self._context.run(
+            prefill_query_across_dcp,
+            kv_cache_permute,
+            k_scale=layer._k_scale_float,
+            v_scale=layer._v_scale_float,
+            return_lse=True,
+        )
+        output_context, lse_context = cp_lse_ag_out_rs(
+            output_context_tmp, lse_context_tmp, get_dcp_group(), return_lse=True
+        )
+        lse_context = lse_context.transpose(0, 1).contiguous()
+
+        output_query, lse_query = self._new_tokens.run(
+            prefill_query,
+            key,
+            value,
+            return_lse=True,
+        )
+        lse_query = lse_query.transpose(0, 1).contiguous()
+
+        merge_attn_states(
+            out,
+            output_context,
+            lse_context,
+            output_query,
+            lse_query,
+        )
+        return out
 
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
+
+class FlashInferBackend(AttentionBackend):
+    accept_output_buffer: bool = True
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    # Note: Not sure for all platforms,
+    # but on Blackwell, only support a page size of
+    # 16, 32, 64
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+        "fp8_e5m2",
+    ]
 
     @staticmethod
     def get_name() -> str:
         return "FLASHINFER"
 
     @staticmethod
-    def get_impl_cls() -> type[FlashInferImpl]:
+    def get_impl_cls() -> type["FlashInferImpl"]:
         return FlashInferImpl
 
     @staticmethod
-    def get_metadata_cls() -> type[FlashInferMetadata]:
-        return FlashInferMetadata
-
-    @staticmethod
-    def get_builder_cls() -> type[FlashInferMetadataBuilder]:
+    def get_builder_cls() -> type["FlashInferMetadataBuilder"]:
         return FlashInferMetadataBuilder
 
     @staticmethod
@@ -212,6 +330,41 @@ class FlashInferBackend(AttentionBackend):
         else:
             raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
 
+    @classmethod
+    def get_supported_head_sizes(cls) -> list[int]:
+        # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
+        return [64, 128, 256]
+
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability >= DeviceCapability(7, 5) and capability <= DeviceCapability(
+            12, 1
+        )
+
+    @classmethod
+    def supports_sink(cls) -> bool:
+        """FlashInfer supports sinks when TRTLLM attention is available (SM100)."""
+        from vllm.utils.flashinfer import (
+            force_use_trtllm_attention,
+            supports_trtllm_attention,
+        )
+
+        # Respect explicit disable flag (e.g., VLLM_USE_TRTLLM_ATTENTION=0)
+        if force_use_trtllm_attention() is False:
+            return False
+
+        # Check if TRTLLM is supported on this platform
+        return supports_trtllm_attention()
+
+    @classmethod
+    def get_required_kv_cache_layout(cls) -> KVCacheLayoutType | None:
+        from vllm.platforms import current_platform
+
+        capability = current_platform.get_device_capability()
+        if capability is not None and capability.major == 10:
+            return "HND"
+        return None
+
 
 @dataclass
 class FlashInferMetadata:
@@ -240,56 +393,96 @@ class FlashInferMetadata:
     # For cascade attention (CPU for planning).
     use_cascade: bool
 
-    prefill_wrapper: Optional[BatchPrefillWithPagedKVCacheWrapper] = None
-    decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None
-    cascade_wrapper: Optional[MultiLevelCascadeAttentionWrapper] = None
+    prefill_wrapper: (
+        BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
+    ) = None
+    decode_wrapper: BatchDecodeWithPagedKVCacheWrapper | None = None
+    cascade_wrapper: MultiLevelCascadeAttentionWrapper | None = None
 
-    qo_indptr_gpu: Optional[torch.Tensor] = None
-    paged_kv_indptr_gpu: Optional[torch.Tensor] = None
+    qo_indptr_gpu: torch.Tensor | None = None
+    paged_kv_indptr_gpu: torch.Tensor | None = None
 
 
 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
-
     reorder_batch_threshold: int = 1
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
         self._workspace_buffer = None
-        self._prefill_wrapper = None  # Wrapper for prefill/append
+        self._prefill_wrapper: (
+            BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
+        ) = None  # Wrapper for prefill/append
         self._decode_wrapper = None  # Wrapper for decode (general shape)
 
+        if vllm_is_batch_invariant():
+            self.decode_fixed_split_size = 2048
+            self.prefill_fixed_split_size = 4096
+            self.disable_split_kv = True
+        else:
+            self.decode_fixed_split_size = -1
+            self.prefill_fixed_split_size = -1
+            self.disable_split_kv = False
+
         self.compilation_config = vllm_config.compilation_config
-        max_num_pages_per_req = cdiv(self.model_config.max_model_len,
-                                     self.kv_cache_spec.block_size)
+        max_num_pages_per_req = cdiv(
+            self.model_config.max_model_len, self.kv_cache_spec.block_size
+        )
         max_num_reqs = vllm_config.scheduler_config.max_num_seqs
         max_num_pages = max_num_reqs * max_num_pages_per_req
-        self.enable_cuda_graph = (self.compilation_config.cudagraph_mode.\
-            decode_mode() == CUDAGraphMode.FULL)
+        speculative_config = vllm_config.speculative_config
+        num_spec_tokens = (
+            speculative_config.num_speculative_tokens
+            if speculative_config is not None
+            else 0
+        )
+        self.enable_cuda_graph = (
+            self.compilation_config.cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+        )
         if self.enable_cuda_graph:
             # For full cudagraph capture, one `decode_wrapper` for each batch
             # size is needed for FlashInfer.
             self._decode_wrappers_cudagraph: dict[
-                int, BatchDecodeWithPagedKVCacheWrapper] = {}
+                int, BatchDecodeWithPagedKVCacheWrapper
+            ] = {}
             self._decode_cudagraph_max_bs = min(
-                max_num_reqs, self.compilation_config.max_capture_size)
+                (1 + num_spec_tokens) * max_num_reqs,
+                self.compilation_config.max_cudagraph_capture_size,
+            )
+
+        try:
+            self.dcp_world_size = get_dcp_group().world_size
+            self.dcp_rank = get_dcp_group().rank_in_group
+            self.dcp_kv_cache_interleave_size = (
+                vllm_config.parallel_config.dcp_kv_cache_interleave_size
+            )
+        except AssertionError:
+            # DCP might not be initialized in testing
+            self.dcp_world_size = 1
+            self.dcp_rank = 0
+            self.dcp_kv_cache_interleave_size = 1
+
+        self.num_qo_heads = (
+            self.model_config.get_num_attention_heads(self.vllm_config.parallel_config)
+            * self.dcp_world_size
+        )
 
-        self.num_qo_heads = self.model_config.get_num_attention_heads(
-            self.vllm_config.parallel_config)
         self.num_kv_heads = self.kv_cache_spec.num_kv_heads
         self.head_dim = self.kv_cache_spec.head_size
-        FlashInferBackend.validate_head_size(self.head_dim)
         self.page_size = self.kv_cache_spec.block_size
 
         self.cache_dtype = self.cache_config.cache_dtype
         if self.cache_dtype.startswith("fp8"):
-            self.kv_cache_dtype = (
-                FlashInferBackend.get_fp8_dtype_for_flashinfer(
-                    self.cache_dtype))
+            self.kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
+                self.cache_dtype
+            )
         else:
             assert self.kv_cache_spec.dtype == self.model_config.dtype
             self.kv_cache_dtype = self.kv_cache_spec.dtype
@@ -298,91 +491,126 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         # VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION is set to 1. Otherwise, try to
         # use fp8 q if kv cache is fp8, and will fall back to model dtype
         # if TRTLLM attention kernel is not used when building attn metadata
-        if supports_trtllm_attention() and \
-            not flashinfer_disable_q_quantization():
+        can_use_trtllm = can_use_trtllm_attention(self.num_qo_heads, self.num_kv_heads)
+        if can_use_trtllm and not flashinfer_disable_q_quantization():
             self.q_data_type = self.kv_cache_dtype
         else:
             self.q_data_type = self.model_config.dtype
 
-        supports_spec_as_decode = \
-            can_use_trtllm_attention(self.num_qo_heads, self.num_kv_heads)
-        self._init_reorder_batch_threshold(1, supports_spec_as_decode)
+        # Prefer TRTLLM attention for decoding in all cases.
+        # This allows us to use AttentionCGSupport.UNIFORM_BATCH mode.
+        self.use_trtllm_decode_attention = can_use_trtllm
+        self._init_reorder_batch_threshold(1, supports_spec_as_decode=can_use_trtllm)
 
         self._cascade_wrapper = None  # Wrapper for cascade attention
 
         # Global hyperparameters shared by all attention layers
         # TODO: discard this for trtllm-gen backend
         self.global_hyperparameters = infer_global_hyperparameters(
-            get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl))
+            get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)
+        )
         self.sm_scale = self.global_hyperparameters.sm_scale
         self.window_left = self.global_hyperparameters.window_left
         self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
         self.has_sinks = self.global_hyperparameters.has_sinks
-        if self.has_sinks and not supports_trtllm_attention():
+        if self.has_sinks and not can_use_trtllm:
             raise NotImplementedError(
                 "FlashInfer backend currently does not support attention "
                 "sinks, please use trtllm on blackwell or flash attention on "
-                "earlier GPUs.")
+                "earlier GPUs."
+            )
         # Preparing persistent buffers (device-side)
-        self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
-                                           dtype=torch.int32,
-                                           device=self.device)
+        self.paged_kv_indptr = torch.zeros(
+            max_num_reqs + 1, dtype=torch.int32, device=self.device
+        )
         self.paged_kv_indices = torch.zeros(
             max_num_pages,  # max num pages possible
             dtype=torch.int32,
-            device=self.device)
-        self.paged_kv_last_page_len = torch.zeros(max_num_reqs,
-                                                  dtype=torch.int32,
-                                                  device=self.device)
+            device=self.device,
+        )
+        self.paged_kv_last_page_len = torch.zeros(
+            max_num_reqs, dtype=torch.int32, device=self.device
+        )
         # host-side buffer
         pin_memory = is_pin_memory_available()
-        self.paged_kv_indptr_cpu = torch.zeros(max_num_reqs + 1,
-                                               dtype=torch.int32,
-                                               device="cpu",
-                                               pin_memory=pin_memory)
+        self.paged_kv_indptr_cpu = torch.zeros(
+            max_num_reqs + 1, dtype=torch.int32, device="cpu", pin_memory=pin_memory
+        )
         self.paged_kv_indptr_np = self.paged_kv_indptr_cpu.numpy()
         self.paged_kv_indptr_buffer = torch.zeros_like(
-            self.paged_kv_indptr_cpu, pin_memory=pin_memory)
-        self.paged_kv_indices_cpu = torch.zeros(max_num_pages,
-                                                dtype=torch.int32,
-                                                device="cpu",
-                                                pin_memory=pin_memory)
-        self.paged_kv_last_page_len_cpu = torch.zeros(max_num_reqs,
-                                                      dtype=torch.int32,
-                                                      device="cpu",
-                                                      pin_memory=pin_memory)
-        self.paged_kv_last_page_len_np = (
-            self.paged_kv_last_page_len_cpu.numpy())
+            self.paged_kv_indptr_cpu, pin_memory=pin_memory
+        )
+        self.paged_kv_indices_cpu = torch.zeros(
+            max_num_pages, dtype=torch.int32, device="cpu", pin_memory=pin_memory
+        )
+        self.paged_kv_last_page_len_cpu = torch.zeros(
+            max_num_reqs, dtype=torch.int32, device="cpu", pin_memory=pin_memory
+        )
+        self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()
+
+        if self.head_dim == 256 and current_platform.is_device_capability(100):
+            # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
+            # head size 256 and block size 16 is not supported on blackwell.
+            assert kv_cache_spec.block_size != 16, (
+                "There is a bug in FlashInfer "
+                "block_size 16 head size 256 support. Please avoid this combination by "
+                "passing --block-size 32 or --block-size 64."
+            )
+
+    @classmethod
+    @override
+    def get_cudagraph_support(
+        cls: type["FlashInferMetadataBuilder"],
+        vllm_config: VllmConfig,
+        kv_cache_spec: AttentionSpec,
+    ) -> AttentionCGSupport:
+        has_trtllm_support = can_use_trtllm_attention(
+            num_qo_heads=vllm_config.model_config.get_num_attention_heads(
+                vllm_config.parallel_config
+            ),
+            num_kv_heads=kv_cache_spec.num_kv_heads,
+        )
+        if has_trtllm_support:
+            return AttentionCGSupport.UNIFORM_BATCH
+        else:
+            return AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
 
     def _get_workspace_buffer(self):
         if self._workspace_buffer is None:
+            buffer_size = envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE
+            if vllm_is_batch_invariant():
+                buffer_size = FLASHINFER_WORKSPACE_BUFFER_SIZE_BATCH_INVARIANT
             self._workspace_buffer = torch.zeros(
-                FLASHINFER_WORKSPACE_BUFFER_SIZE,
-                dtype=torch.uint8,
-                device=self.device)
+                buffer_size, dtype=torch.uint8, device=self.device
+            )
         return self._workspace_buffer
 
-    def _get_prefill_wrapper(self):
+    def _get_prefill_wrapper(
+        self,
+    ) -> BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper:
         if self._prefill_wrapper is None:
-            self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
-                self._get_workspace_buffer(), get_kv_cache_layout())
+            if self.dcp_world_size > 1:
+                self._prefill_wrapper = BatchDCPPrefillWrapper(
+                    workspace_buffer=self._get_workspace_buffer(),
+                )
+            else:
+                self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
+                    self._get_workspace_buffer(), get_kv_cache_layout()
+                )
+        assert self._prefill_wrapper is not None
         return self._prefill_wrapper
 
-    def _get_decode_wrapper(self,
-                            batch_size: int,
-                            use_cudagraph: bool = False):
+    def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False):
         if use_cudagraph:
-            decode_wrapper = self._decode_wrappers_cudagraph.get(
-                batch_size, None)
+            decode_wrapper = self._decode_wrappers_cudagraph.get(batch_size, None)
         else:
             decode_wrapper = self._decode_wrapper
 
         if decode_wrapper is None:
             if use_cudagraph:
-                paged_kv_indptr = self.paged_kv_indptr[:batch_size + 1]
+                paged_kv_indptr = self.paged_kv_indptr[: batch_size + 1]
                 paged_kv_indices = self.paged_kv_indices
-                paged_kv_last_page_len = self.paged_kv_last_page_len[:
-                                                                     batch_size]
+                paged_kv_last_page_len = self.paged_kv_last_page_len[:batch_size]
             else:
                 paged_kv_indptr = None
                 paged_kv_indices = None
@@ -411,28 +639,54 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
     def _get_cascade_wrapper(self):
         if self._cascade_wrapper is None:
             self._cascade_wrapper = MultiLevelCascadeAttentionWrapper(
-                2, self._get_workspace_buffer(), get_kv_cache_layout())
+                2, self._get_workspace_buffer(), get_kv_cache_layout()
+            )
         return self._cascade_wrapper
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> FlashInferMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> FlashInferMetadata:
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=self.reorder_batch_threshold,
-                                       require_uniform=True)
+        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold,
+                require_uniform=True,
+            )
+        )
 
         page_size = self.page_size
         max_q_len = common_attn_metadata.max_query_len
         max_seq_len = common_attn_metadata.max_seq_len
         seq_lens = common_attn_metadata.seq_lens
         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
-        seq_lens_np = seq_lens_cpu.numpy()
         block_table_tensor = common_attn_metadata.block_table_tensor
+        qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu
+
+        if self.dcp_world_size > 1:
+            if num_prefills > 0:
+                qo_indptr_prefill_cpu = (
+                    qo_indptr_cpu[num_decodes:] - qo_indptr_cpu[num_decodes]
+                )
+                query_lens_prefill_cpu = (
+                    qo_indptr_prefill_cpu[1:] - qo_indptr_prefill_cpu[:-1]
+                )
+                seq_lens_cpu[num_decodes:] = (
+                    seq_lens_cpu[num_decodes:] - query_lens_prefill_cpu
+                )
 
+            seq_lens_cpu = get_dcp_local_seq_lens(
+                seq_lens_cpu,
+                self.dcp_world_size,
+                self.dcp_rank,
+                self.dcp_kv_cache_interleave_size,
+            )
+
+        seq_lens_np = seq_lens_cpu.numpy()
         num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size
 
         use_cascade = common_prefix_len > 0
@@ -442,17 +696,16 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             num_common_kv_blocks = common_prefix_len // page_size
 
             # Create CPU versions directly for cascade (no GPU versions needed)
-            shared_qo_indptr_cpu = torch.tensor([0, num_actual_tokens],
-                                                dtype=torch.int32,
-                                                device='cpu')
-            shared_kv_page_indptr_cpu = torch.tensor([0, num_common_kv_blocks],
-                                                     dtype=torch.int32,
-                                                     device='cpu')
-            shared_kv_page_indices_cpu = block_table_tensor[
-                0, :num_common_kv_blocks]
-            shared_kv_last_page_len_cpu = torch.tensor([page_size],
-                                                       dtype=torch.int32,
-                                                       device='cpu')
+            shared_qo_indptr_cpu = torch.tensor(
+                [0, num_actual_tokens], dtype=torch.int32, device="cpu"
+            )
+            shared_kv_page_indptr_cpu = torch.tensor(
+                [0, num_common_kv_blocks], dtype=torch.int32, device="cpu"
+            )
+            shared_kv_page_indices_cpu = block_table_tensor[0, :num_common_kv_blocks]
+            shared_kv_last_page_len_cpu = torch.tensor(
+                [page_size], dtype=torch.int32, device="cpu"
+            )
 
             # Remove the blocks of the shared prefix from all requests.
             block_table_tensor = block_table_tensor[:, num_common_kv_blocks:]
@@ -467,22 +720,23 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         np.cumsum(
             num_blocks_np,
             dtype=np.int32,
-            out=self.paged_kv_indptr_np[1:num_reqs + 1],
+            out=self.paged_kv_indptr_np[1 : num_reqs + 1],
         )
         # NOTE(woosuk): Because self.paged_kv_indptr_cpu can be modified
         # after this line (e.g., for cuda graphs), we need to copy the data to
         # self.paged_kv_indptr_buffer to avoid race condition.
-        self.paged_kv_indptr_buffer[:num_reqs +
-                                    1] = (self.paged_kv_indptr_cpu[:num_reqs +
-                                                                   1])
-        paged_kv_indptr = self.paged_kv_indptr[:num_reqs + 1]
-        paged_kv_indptr.copy_(self.paged_kv_indptr_buffer[:num_reqs + 1],
-                              non_blocking=True)
+        self.paged_kv_indptr_buffer[: num_reqs + 1] = self.paged_kv_indptr_cpu[
+            : num_reqs + 1
+        ]
+        paged_kv_indptr = self.paged_kv_indptr[: num_reqs + 1]
+        paged_kv_indptr.copy_(
+            self.paged_kv_indptr_buffer[: num_reqs + 1], non_blocking=True
+        )
 
         # write self.paged_kv_indices inplace
         num_actual_pages = self.paged_kv_indptr_np[num_reqs]
         paged_kv_indices = self.paged_kv_indices[:num_actual_pages]
-        _copy_page_indices_kernel[(num_reqs, )](
+        _copy_page_indices_kernel[(num_reqs,)](
             paged_kv_indices,
             block_table_tensor,
             block_table_tensor.stride(0),
@@ -493,39 +747,51 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         # write self.paged_kv_last_page_len_cpu inplace
         paged_kv_last_page_len_np = seq_lens_np % page_size
         self.paged_kv_last_page_len_np[:num_reqs] = np.where(
-            paged_kv_last_page_len_np == 0,
+            (paged_kv_last_page_len_np == 0) & (seq_lens_np != 0),
             page_size,
             paged_kv_last_page_len_np,
         )
 
         uses_spec_reorder = self.reorder_batch_threshold > 1
-        prefill_use_trtllm = use_trtllm_attention(self.num_qo_heads,
-                                                  self.num_kv_heads,
-                                                  num_prefill_tokens,
-                                                  max_seq_len,
-                                                  self.cache_dtype,
-                                                  self.q_data_type,
-                                                  is_prefill=True,
-                                                  has_sinks=self.has_sinks,
-                                                  has_spec=uses_spec_reorder)
-        decode_use_trtllm = use_trtllm_attention(self.num_qo_heads,
-                                                 self.num_kv_heads,
-                                                 num_decode_tokens,
-                                                 max_seq_len,
-                                                 self.cache_dtype,
-                                                 self.q_data_type,
-                                                 is_prefill=False,
-                                                 has_sinks=self.has_sinks,
-                                                 has_spec=uses_spec_reorder)
-        if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
-            raise NotImplementedError(
-                "FlashInfer backend currently does not support attention "
-                "sinks, please use trtllm on blackwell or flash attention on "
-                "earlier GPUs.")
+        prefill_use_trtllm = use_trtllm_attention(
+            self.num_qo_heads,
+            self.num_kv_heads,
+            num_prefill_tokens,
+            max_seq_len,
+            self.dcp_world_size,
+            self.cache_dtype,
+            self.q_data_type,
+            is_prefill=True,
+            has_sinks=self.has_sinks,
+            has_spec=uses_spec_reorder,
+        )
+        decode_use_trtllm = (
+            self.use_trtllm_decode_attention and self.dcp_world_size <= 1
+        )
 
-        # If TRTLLM attention is not used, the q quantization is not supported.
-        # Fall back to use model dtype.
         if not (prefill_use_trtllm and decode_use_trtllm):
+            if self.has_sinks:
+                raise NotImplementedError(
+                    "FlashInfer backend currently does not support attention "
+                    "sinks, please use trtllm on blackwell or flash attention "
+                    "on earlier GPUs."
+                )
+
+            if not self.global_hyperparameters.has_same_window_lefts:
+                raise ValueError(
+                    "Window left is not the same for all layers. "
+                    "One potential fix is to set disable_sliding_window=True"
+                )
+
+            assert self.global_hyperparameters.has_same_all_params, (
+                "FlashInfer backend currently only supports models in which "
+                "all layers share the same values for the following "
+                "hyperparameters: `window_left`, `logits_soft_cap`, "
+                "`sm_scale`."
+            )
+
+            # The q quantization is not supported for non-trtllm attention,
+            # fall back to model dtype.
             self.q_data_type = self.model_config.dtype
 
         attn_metadata = FlashInferMetadata(
@@ -546,8 +812,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             use_cascade=use_cascade,
         )
 
-        qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu
-        paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[:1 + num_reqs]
+        paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[: 1 + num_reqs]
         paged_kv_last_page_len_cpu = self.paged_kv_last_page_len_cpu[:num_reqs]
 
         if attn_metadata.use_cascade:
@@ -570,25 +835,24 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             )
         else:
             # Regular attention (common case).
-            # Decodes are at the front and prefills are at the back,
-            # according to reorder_batch()
+            # Decodes are at the front and prefills are at the back.
             num_prefills = attn_metadata.num_prefills
             num_decodes = attn_metadata.num_decodes
             if num_prefills > 0:
                 # Decodes are first so prefills start after the last decode
                 prefill_start = num_decodes
                 attn_metadata.prefill_wrapper = self._get_prefill_wrapper()
-                assert qo_indptr_cpu[prefill_start:].shape[
-                    0] == num_prefills + 1
-                assert paged_kv_indptr_cpu[prefill_start:].shape[
-                    0] == num_prefills + 1
-                assert paged_kv_last_page_len_cpu[prefill_start:].shape[
-                    0] == num_prefills
+                assert qo_indptr_cpu[prefill_start:].shape[0] == num_prefills + 1
+                assert paged_kv_indptr_cpu[prefill_start:].shape[0] == num_prefills + 1
+                assert (
+                    paged_kv_last_page_len_cpu[prefill_start:].shape[0] == num_prefills
+                )
                 # Since prefill_wrapper.run() will be called with
                 # query[num_decode_tokens:] we need to adjust the qo_indptr
                 # to be relative to the start of the prefill queries.
-                qo_indptr_cpu = qo_indptr_cpu[prefill_start:] - qo_indptr_cpu[
-                    prefill_start]
+                qo_indptr_cpu = (
+                    qo_indptr_cpu[prefill_start:] - qo_indptr_cpu[prefill_start]
+                )
                 paged_kv_indptr_cpu = paged_kv_indptr_cpu[prefill_start:]
 
                 # Recompute max_q_len for the slice of requests we are using
@@ -596,68 +860,105 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 # we have a non-uniform batch with some short decodes offloaded
                 # to the prefill pathway
                 query_lens_prefill = qo_indptr_cpu[1:] - qo_indptr_cpu[:-1]
-                attn_metadata.max_q_len_prefill = \
-                    int(query_lens_prefill.max().item())
+                attn_metadata.max_q_len_prefill = int(query_lens_prefill.max().item())
 
                 if not attn_metadata.prefill_use_trtllm:
-                    attn_metadata.prefill_wrapper.plan(
-                        qo_indptr_cpu,
-                        paged_kv_indptr_cpu,
-                        paged_kv_indices,
-                        paged_kv_last_page_len_cpu[prefill_start:],
-                        self.num_qo_heads,
-                        self.num_kv_heads,
-                        self.head_dim,
-                        self.page_size,
-                        causal=True,
-                        sm_scale=self.sm_scale,
-                        window_left=self.window_left,
-                        logits_soft_cap=self.logits_soft_cap,
-                        q_data_type=self.q_data_type,
-                        kv_data_type=self.kv_cache_dtype,
-                    )
+                    if self.dcp_world_size > 1:
+                        assert isinstance(
+                            attn_metadata.prefill_wrapper, BatchDCPPrefillWrapper
+                        )
+                        attn_metadata.prefill_wrapper.plan(
+                            qo_indptr_cpu=qo_indptr_cpu,
+                            paged_kv_indptr_cpu=paged_kv_indptr_cpu,
+                            paged_kv_indices=paged_kv_indices,
+                            paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu,
+                            prefill_start=prefill_start,
+                            page_size=self.page_size,
+                            num_qo_heads=self.num_qo_heads,
+                            dcp_world_size=self.dcp_world_size,
+                            num_kv_heads=self.num_kv_heads,
+                            head_dim=self.head_dim,
+                            sm_scale=self.sm_scale,
+                            window_left=self.window_left,
+                            logits_soft_cap=self.logits_soft_cap,
+                            q_data_type=self.q_data_type,
+                            kv_cache_dtype=self.kv_cache_dtype,
+                            prefill_fixed_split_size=self.prefill_fixed_split_size,
+                            disable_split_kv=self.disable_split_kv,
+                        )
+                    else:
+                        assert isinstance(
+                            attn_metadata.prefill_wrapper,
+                            BatchPrefillWithPagedKVCacheWrapper,
+                        )
+                        attn_metadata.prefill_wrapper.plan(
+                            qo_indptr_cpu,
+                            paged_kv_indptr_cpu,
+                            paged_kv_indices,
+                            paged_kv_last_page_len_cpu[prefill_start:],
+                            self.num_qo_heads,
+                            self.num_kv_heads,
+                            self.head_dim,
+                            self.page_size,
+                            causal=True,
+                            sm_scale=self.sm_scale,
+                            window_left=self.window_left,
+                            logits_soft_cap=self.logits_soft_cap,
+                            q_data_type=self.q_data_type,
+                            kv_data_type=self.kv_cache_dtype,
+                            fixed_split_size=self.prefill_fixed_split_size,
+                            disable_split_kv=self.disable_split_kv,
+                        )
                 else:
                     attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(
-                        self.device, non_blocking=True)
+                        self.device, non_blocking=True
+                    )
                     attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
-                        self.device, non_blocking=True)
+                        self.device, non_blocking=True
+                    )
 
             if num_decodes > 0:
                 pure_decode = num_prefills == 0
                 # possible required padding for cudagraph replay
-                use_cudagraph = (self.enable_cuda_graph and pure_decode and
-                                 num_decodes <= self._decode_cudagraph_max_bs)
+                use_cudagraph = (
+                    self.enable_cuda_graph
+                    and pure_decode
+                    and num_decode_tokens <= self._decode_cudagraph_max_bs
+                )
                 if use_cudagraph:
-                    num_input_tokens = (
-                        self.vllm_config.pad_for_cudagraph(num_decode_tokens))
+                    num_input_tokens = self.vllm_config.pad_for_cudagraph(
+                        num_decode_tokens
+                    )
                     # Carefully fulfill the padding region with reasonable value
                     # on cpu.
                     # Make sure paged_kv_indptr_cpu is not decreasing
-                    self.paged_kv_indptr_cpu[1 + num_decodes:1 +
-                                             num_input_tokens].fill_(
-                                                 paged_kv_indptr_cpu[-1])
+                    self.paged_kv_indptr_cpu[
+                        1 + num_decodes : 1 + num_input_tokens
+                    ].fill_(paged_kv_indptr_cpu[-1])
                     # Fill the remaining paged_kv_last_page_len_cpu with 1.
                     # This is because flashinfer treats 0 as a full page
                     # instead of empty.
-                    self.paged_kv_last_page_len_cpu[
-                        num_decodes:num_input_tokens].fill_(1)
+                    self.paged_kv_last_page_len_cpu[num_decodes:num_input_tokens].fill_(
+                        1
+                    )
 
                 else:
                     num_input_tokens = num_decode_tokens
 
                 attn_metadata.decode_wrapper = self._get_decode_wrapper(
-                    num_input_tokens, use_cudagraph)
+                    num_input_tokens, use_cudagraph
+                )
                 if not attn_metadata.decode_use_trtllm:
                     # Use the persistent buffer with padding length,
                     # instead of the same address but chunked version
                     # in atten_metadata when using cudagraph.
                     fast_plan_decode(
                         attn_metadata.decode_wrapper,
-                        self.paged_kv_indptr_cpu[:num_input_tokens + 1],
+                        self.paged_kv_indptr_cpu[: num_input_tokens + 1],
                         paged_kv_indices,
                         self.paged_kv_last_page_len_cpu[:num_input_tokens],
                         seq_lens_cpu[:num_input_tokens],
-                        self.num_qo_heads,
+                        self.num_qo_heads * self.dcp_world_size,
                         self.num_kv_heads,
                         self.head_dim,
                         self.page_size,
@@ -668,6 +969,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                         logits_soft_cap=self.logits_soft_cap,
                         q_data_type=self.q_data_type,
                         kv_data_type=self.kv_cache_dtype,
+                        fixed_split_size=self.decode_fixed_split_size,
+                        disable_split_kv=self.disable_split_kv,
                     )
         return attn_metadata
 
@@ -682,6 +985,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
 
 class FlashInferImpl(AttentionImpl):
+    can_return_lse_for_decode: bool = True
 
     def __init__(
         self,
@@ -689,13 +993,13 @@ class FlashInferImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -708,8 +1012,9 @@ class FlashInferImpl(AttentionImpl):
             self.sliding_window = (-1, -1)
         else:
             self.sliding_window = (sliding_window - 1, 0)
-        self.window_left = (self.sliding_window[0]
-                            if self.sliding_window is not None else -1)
+        self.window_left = (
+            self.sliding_window[0] if self.sliding_window is not None else -1
+        )
         self.kv_cache_dtype = kv_cache_dtype
         self.logits_soft_cap = logits_soft_cap
         self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
@@ -717,30 +1022,45 @@ class FlashInferImpl(AttentionImpl):
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashInferImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "FlashInferImpl"
+            )
 
-        self.sinks: Optional[torch.Tensor] = None
+        self.sinks: torch.Tensor | None = None
         if sinks is not None:
             if sinks.shape[0] != num_heads:
                 raise ValueError(
                     "Sinks must have the same number of heads as the number of "
                     f"heads in the layer. Expected {num_heads}, but got "
-                    f"{sinks.shape[0]}.")
+                    f"{sinks.shape[0]}."
+                )
             self.sinks = sinks
 
-        self.support_trtllm_attn = (supports_trtllm_attention()
-                                    and num_heads % num_kv_heads == 0)
-        self.bmm1_scale: Optional[float] = None
-        self.bmm2_scale: Optional[float] = None
-        self.o_sf_scale: Optional[float] = None
+        self.support_trtllm_attn = can_use_trtllm_attention(num_heads, num_kv_heads)
+        self.bmm1_scale: float | None = None
+        self.bmm2_scale: float | None = None
+        self.o_sf_scale: float | None = None
 
     def fused_output_quant_supported(self, quant_key: QuantKey):
-        return (self.support_trtllm_attn
-                and self.kv_cache_dtype.startswith("fp8")
-                and quant_key in (kFp8StaticTensorSym, kNvfp4Quant))
+        return (
+            self.support_trtllm_attn
+            and self.kv_cache_dtype.startswith("fp8")
+            and quant_key in (kFp8StaticTensorSym, kNvfp4Quant)
+        )
+
+    def supports_quant_query_input(self) -> bool:
+        if flashinfer_disable_q_quantization():
+            return False
+
+        return self.support_trtllm_attn
+
+    # FlashInfer requires attention sinks to be float32
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        if self.sinks is not None and self.sinks.dtype != torch.float32:
+            self.sinks = self.sinks.to(torch.float32)
 
     def forward(
         self,
@@ -750,9 +1070,9 @@ class FlashInferImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashInferMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FlashInfer.
 
@@ -771,31 +1091,41 @@ class FlashInferImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
+
+        # Ensure query dtype matches the expected dtype from attention metadata
+        assert attn_metadata.q_data_type == query.dtype, (
+            f"Query dtype mismatch: expected {attn_metadata.q_data_type}, "
+            f"got {query.dtype}"
+        )
 
         if self.bmm1_scale is None:
-            self.bmm1_scale = (layer._q_scale_float * layer._k_scale_float *
-                               self.scale)
+            self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale
 
         if self.bmm2_scale is None:
             self.bmm2_scale = layer._v_scale_float
 
         # The attn+quant fusion happens when output_scale is provided.
         if output_scale is None:
-            assert output_block_scale is None, "output_block_scale "\
-                "is not supported when fusion has not happened"
+            assert output_block_scale is None, (
+                "output_block_scale is not supported when fusion has not happened"
+            )
         else:
-            assert attn_metadata.q_data_type == FP8_DTYPE, \
+            assert attn_metadata.q_data_type == FP8_DTYPE, (
                 "Query must be FP8 when attn+quant fusion happened."
-            assert (attn_metadata.prefill_use_trtllm and
-                    attn_metadata.decode_use_trtllm), "Must use TRT-LLM attn"
+            )
+            assert (
+                attn_metadata.prefill_use_trtllm and attn_metadata.decode_use_trtllm
+            ), "Must use TRT-LLM attn"
 
             if output.dtype == FP8_DTYPE:
-                assert output_block_scale is None, \
+                assert output_block_scale is None, (
                     "output_block_scale should not be provided for fp8 output"
+                )
             elif output.dtype == FP4_DTYPE:
-                assert output_block_scale is not None, \
+                assert output_block_scale is not None, (
                     "output_block_scale is required for nvfp4 output"
+                )
             else:
                 raise ValueError(f"Unsupported output dtype: {output.dtype}")
 
@@ -809,15 +1139,6 @@ class FlashInferImpl(AttentionImpl):
                 elif output.dtype == FP4_DTYPE:
                     self.o_sf_scale = layer._o_scale_float
 
-        # Insert FP8 quant for query
-        if attn_metadata.q_data_type == FP8_DTYPE:
-            num_tokens, num_heads, head_size = query.shape
-            query, _ = ops.scaled_fp8_quant(
-                query.reshape(
-                    (num_tokens, num_heads * head_size)).contiguous(),
-                layer._q_scale)
-            query = query.reshape((num_tokens, num_heads, head_size))
-
         # IMPORTANT!
         # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
         # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
@@ -852,11 +1173,14 @@ class FlashInferImpl(AttentionImpl):
             # to process the cache when the kv_cache_dtype is fp8
             if self.kv_cache_dtype.startswith("fp8"):
                 torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
-                    self.kv_cache_dtype)
+                    self.kv_cache_dtype
+                )
                 kv_cache = kv_cache.view(torch_dtype)
 
         # Inputs and outputs may be padded for CUDA graphs
         query = query[:num_actual_tokens]
+        key = key[:num_actual_tokens]
+        value = value[:num_actual_tokens]
         output_padded = output
         output = output[:num_actual_tokens]
 
@@ -875,8 +1199,7 @@ class FlashInferImpl(AttentionImpl):
         stride_order = FlashInferBackend.get_kv_cache_stride_order()
         kv_cache_permute = kv_cache.permute(*stride_order)
         # Regular attention (common case).
-        # Decodes are at the front and prefills are at the back,
-        # according to reorder_batch()
+        # Decodes are at the front and prefills are at the back.
         if num_prefill_tokens > 0:
             prefill_wrapper = attn_metadata.prefill_wrapper
             prefill_query = query[num_decode_tokens:]
@@ -884,24 +1207,51 @@ class FlashInferImpl(AttentionImpl):
             assert prefill_wrapper is not None
 
             if not attn_metadata.prefill_use_trtllm:
-                assert prefill_wrapper._causal
-                assert prefill_wrapper._window_left == self.window_left
-                assert prefill_wrapper._logits_soft_cap == (
-                    self.logits_soft_cap or 0.0)
-                assert prefill_wrapper._sm_scale == self.scale
-                prefill_wrapper.run(
-                    prefill_query,
-                    kv_cache_permute,
-                    k_scale=layer._k_scale_float,
-                    v_scale=layer._v_scale_float,
-                    out=output[num_decode_tokens:],
-                )
+                if self.dcp_world_size > 1:
+                    assert isinstance(prefill_wrapper, BatchDCPPrefillWrapper)
+                    assert prefill_wrapper._context._window_left == self.window_left
+                    assert prefill_wrapper._context._logits_soft_cap == (
+                        self.logits_soft_cap or 0.0
+                    )
+                    assert prefill_wrapper._context._sm_scale == self.scale
+                    assert not prefill_wrapper._context._causal
+                    assert prefill_wrapper._new_tokens._window_left == self.window_left
+                    assert prefill_wrapper._new_tokens._logits_soft_cap == (
+                        self.logits_soft_cap or 0.0
+                    )
+                    assert prefill_wrapper._new_tokens._sm_scale == self.scale
+                    assert prefill_wrapper._new_tokens._causal
+
+                    prefill_wrapper.run(
+                        layer,
+                        prefill_query,
+                        kv_cache_permute,
+                        key[num_decode_tokens:],
+                        value[num_decode_tokens:],
+                        out=output[num_decode_tokens:],
+                    )
+                else:
+                    assert isinstance(
+                        prefill_wrapper, BatchPrefillWithPagedKVCacheWrapper
+                    )
+                    assert prefill_wrapper._window_left == self.window_left
+                    assert prefill_wrapper._logits_soft_cap == (
+                        self.logits_soft_cap or 0.0
+                    )
+                    assert prefill_wrapper._sm_scale == self.scale
+                    assert prefill_wrapper._causal
+                    prefill_wrapper.run(
+                        prefill_query,
+                        kv_cache_permute,
+                        k_scale=layer._k_scale_float,
+                        v_scale=layer._v_scale_float,
+                        out=output[num_decode_tokens:],
+                    )
             else:
                 # prefill_query may be non-contiguous
                 prefill_query = prefill_query.contiguous()
                 workspace_buffer = _get_trtllm_gen_workspace_buffer()
-                block_tables_prefill = attn_metadata.block_table_tensor[
-                    num_decodes:]
+                block_tables_prefill = attn_metadata.block_table_tensor[num_decodes:]
                 seq_lens_prefill = attn_metadata.seq_lens[num_decodes:]
 
                 # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND
@@ -914,28 +1264,31 @@ class FlashInferImpl(AttentionImpl):
 
                 if output.dtype == FP4_DTYPE:
                     assert self.o_sf_scale is not None
-                    out = FP4Tensor(data=output[num_decode_tokens:],
-                                    scale=output_block_scale,
-                                    scale_start_index=num_decode_tokens,
-                                    original_shape=prefill_query.shape)
+                    out = FP4Tensor(
+                        data=output[num_decode_tokens:],
+                        scale=output_block_scale,
+                        scale_start_index=num_decode_tokens,
+                        original_shape=prefill_query.shape,
+                    )
                 else:
                     assert self.o_sf_scale is None
                     out = output[num_decode_tokens:]
 
-                if attn_metadata.q_data_type != FP8_DTYPE \
-                    and self.kv_cache_dtype.startswith("fp8"):
+                if (
+                    attn_metadata.q_data_type != FP8_DTYPE
+                    and self.kv_cache_dtype.startswith("fp8")
+                ):
                     # TRTLLM prefill attention does not support BF16 Q
                     # and fp8 kv cache. So to enable prefill attention
                     # with fp8 kv cache, we can construct a mock block
                     # and mock kv cache with BF16 KV involved in the prefill
-                    mock_kv_cache, mock_block_table = (
-                        trtllm_prefill_attn_kvfp8_dequant(
-                            kv_cache_permute,
-                            block_tables_prefill,
-                            layer._k_scale,
-                            layer._v_scale,
-                            attn_metadata.q_data_type,
-                        ))
+                    mock_kv_cache, mock_block_table = trtllm_prefill_attn_kvfp8_dequant(
+                        kv_cache_permute,
+                        block_tables_prefill,
+                        layer._k_scale,
+                        layer._v_scale,
+                        attn_metadata.q_data_type,
+                    )
                 else:
                     mock_kv_cache = kv_cache_permute
                     mock_block_table = block_tables_prefill
@@ -967,22 +1320,46 @@ class FlashInferImpl(AttentionImpl):
 
             if not attn_metadata.decode_use_trtllm:
                 assert decode_wrapper._window_left == self.window_left
-                assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap
-                                                           or 0.0)
+                assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0)
                 assert decode_wrapper._sm_scale == self.scale
-                decode_wrapper.run(
-                    decode_query,
-                    kv_cache_permute,
-                    k_scale=layer._k_scale_float,
-                    v_scale=layer._v_scale_float,
-                    out=output[:num_decode_tokens],
-                )
+
+                if self.dcp_world_size > 1:
+                    decode_query = get_dcp_group().all_gather(
+                        decode_query.contiguous(), dim=-2
+                    )
+                    output_tmp = torch.empty_like(decode_query)
+                    lse = torch.empty(
+                        (decode_query.size(0), decode_query.size(1)),
+                        dtype=torch.float32,
+                        device=decode_query.device,
+                    )
+                    decode_wrapper.run(
+                        decode_query,
+                        kv_cache_permute,
+                        k_scale=layer._k_scale_float,
+                        v_scale=layer._v_scale_float,
+                        out=output_tmp,
+                        lse=lse,
+                        return_lse=True,
+                    )
+                    output[:num_decode_tokens] = cp_lse_ag_out_rs(
+                        output_tmp, lse, get_dcp_group()
+                    )
+                else:
+                    decode_wrapper.run(
+                        decode_query,
+                        kv_cache_permute,
+                        k_scale=layer._k_scale_float,
+                        v_scale=layer._v_scale_float,
+                        out=output[:num_decode_tokens],
+                    )
             else:
                 # decode_query may be non-contiguous
                 decode_query = decode_query.contiguous()
                 workspace_buffer = _get_trtllm_gen_workspace_buffer()
-                block_tables_decode = attn_metadata.\
-                    block_table_tensor[:num_decode_tokens]
+                block_tables_decode = attn_metadata.block_table_tensor[
+                    :num_decode_tokens
+                ]
                 seq_lens_decode = attn_metadata.seq_lens[:num_decode_tokens]
 
                 # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND
@@ -995,10 +1372,12 @@ class FlashInferImpl(AttentionImpl):
 
                 if output.dtype == FP4_DTYPE:
                     assert self.o_sf_scale is not None
-                    out = FP4Tensor(data=output[:num_decode_tokens],
-                                    scale=output_block_scale,
-                                    scale_start_index=0,
-                                    original_shape=decode_query.shape)
+                    out = FP4Tensor(
+                        data=output[:num_decode_tokens],
+                        scale=output_block_scale,
+                        scale_start_index=0,
+                        original_shape=decode_query.shape,
+                    )
                 else:
                     assert self.o_sf_scale is None
                     out = output[:num_decode_tokens]
@@ -1008,8 +1387,7 @@ class FlashInferImpl(AttentionImpl):
                     # attention to be initialized with q_len = 0
                     q_len_per_req = 1
                 else:
-                    q_len_per_req = \
-                        num_decode_tokens // attn_metadata.num_decodes
+                    q_len_per_req = num_decode_tokens // attn_metadata.num_decodes
 
                 trtllm_batch_decode_with_kv_cache(
                     query=decode_query,
@@ -1024,7 +1402,8 @@ class FlashInferImpl(AttentionImpl):
                     sinks=self.sinks,
                     o_sf_scale=self.o_sf_scale,
                     out=out,
-                    q_len_per_req=q_len_per_req)
+                    q_len_per_req=q_len_per_req,
+                )
         return output_padded
 
 
@@ -1040,14 +1419,16 @@ def fast_plan_decode(
     page_size: int,
     pos_encoding_mode: str = "NONE",
     window_left: int = -1,
-    logits_soft_cap: Optional[float] = None,
-    q_data_type: Optional[Union[str, torch.dtype]] = "float16",
-    kv_data_type: Optional[Union[str, torch.dtype]] = None,
-    data_type: Optional[Union[str, torch.dtype]] = None,
-    sm_scale: Optional[float] = None,
-    rope_scale: Optional[float] = None,
-    rope_theta: Optional[float] = None,
+    logits_soft_cap: float | None = None,
+    q_data_type: str | torch.dtype | None = "float16",
+    kv_data_type: str | torch.dtype | None = None,
+    data_type: str | torch.dtype | None = None,
+    sm_scale: float | None = None,
+    rope_scale: float | None = None,
+    rope_theta: float | None = None,
     non_blocking: bool = True,
+    fixed_split_size: int = -1,
+    disable_split_kv: bool = False,
 ) -> None:
     """
     A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for
@@ -1065,8 +1446,7 @@ def fast_plan_decode(
     # Warm up with the original plan if it is first call, and always run the
     # original plan if we run for dynamic shape. For fixed shape (cudagraph),
     # this warm up is to generate the _cached_module for the decode wrapper.
-    if not self.is_cuda_graph_enabled or \
-        getattr(self, "vllm_first_call", True):
+    if not self.is_cuda_graph_enabled or getattr(self, "vllm_first_call", True):
         self.plan(
             indptr_cpu,
             indices,
@@ -1085,6 +1465,10 @@ def fast_plan_decode(
             rope_scale,
             rope_theta,
             non_blocking,
+            None,  # block_tables
+            None,  # seq_lens
+            fixed_split_size,
+            disable_split_kv,
         )
         self.vllm_first_call = False
         return
@@ -1106,31 +1490,33 @@ def fast_plan_decode(
 
     if kv_data_type is None:
         kv_data_type = q_data_type
-    q_data_type = getattr(torch, q_data_type) if isinstance(
-        q_data_type, str) else q_data_type
-    kv_data_type = getattr(torch, kv_data_type) if isinstance(
-        kv_data_type, str) else kv_data_type
+    q_data_type = (
+        getattr(torch, q_data_type) if isinstance(q_data_type, str) else q_data_type
+    )
+    kv_data_type = (
+        getattr(torch, kv_data_type) if isinstance(kv_data_type, str) else kv_data_type
+    )
 
     if batch_size != self._fixed_batch_size:
         raise ValueError(
             "The batch size should be fixed in cudagraph mode, the runtime "
             "batch size {} mismatches the batch size set during "
-            "initialization {}".format(batch_size, self._fixed_batch_size))
+            "initialization {}".format(batch_size, self._fixed_batch_size)
+        )
     if len(indices) > len(self._paged_kv_indices_buf):
         raise ValueError(
-            "The size of indices should be less than or equal to the "
-            "allocated buffer")
+            "The size of indices should be less than or equal to the allocated buffer"
+        )
 
     # host-to-device copy for the indptr buffer
     self._paged_kv_indptr_buf.copy_(indptr_cpu, non_blocking=True)
     # host-to-device copy for the last_page_len buffer
-    self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu,
-                                           non_blocking=True)
+    self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu, non_blocking=True)
 
     qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
 
     try:
-        # Make sure we pass exactly 15 arguments for tensor core version
+        # Make sure we pass exactly 18 arguments for tensor core version
         self._plan_info = self._cached_module.plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
@@ -1147,6 +1533,9 @@ def fast_plan_decode(
             head_dim,
             head_dim,
             False,  # causal
+            window_left,
+            fixed_split_size,
+            disable_split_kv,
         )
     except Exception as e:
         raise RuntimeError(f"Error in tensor core plan: {e}") from e
@@ -1176,6 +1565,8 @@ def _copy_page_indices_kernel(
     offset = tl.arange(0, BLOCK_SIZE)
     for i in tl.range(0, num_blocks, BLOCK_SIZE):
         block_ids = tl.load(row_ptr + i + offset, mask=i + offset < num_blocks)
-        tl.store(page_indices + start_idx + i + offset,
-                 block_ids,
-                 mask=i + offset < num_blocks)
+        tl.store(
+            page_indices + start_idx + i + offset,
+            block_ids,
+            mask=i + offset < num_blocks,
+        )
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index ff634d35a0aac1505a2f26385c54f4676314fd6e..7768827d26dc3696a5521072c38f075bdbd74f51 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -2,36 +2,47 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlexAttention."""
 
+import math
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional, Union
+from typing import ClassVar
 
 import torch
 import torch._dynamo.decorators
 import torch.nn.functional as F
-from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature,
-                                               _score_mod_signature, and_masks,
-                                               create_block_mask,
-                                               flex_attention)
-
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
+from torch.nn.attention.flex_attention import (
+    BlockMask,
+    _mask_mod_signature,
+    _score_mod_signature,
+    and_masks,
+    create_block_mask,
+    flex_attention,
+)
+
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    is_quantized_kv_cache,
+)
 from vllm.config import VllmConfig
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.utils import cdiv, is_torch_equal_or_newer
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.utils import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
 
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
-create_block_mask_compiled = torch.compile(create_block_mask,
-                                           fullgraph=True,
-                                           mode="reduce-overhead")
+create_block_mask_compiled = torch.compile(
+    create_block_mask, fullgraph=True, mode="reduce-overhead"
+)
 flex_attention_compiled = torch.compile(flex_attention, fullgraph=True)
 
 
@@ -39,7 +50,8 @@ def _offsets_to_doc_ids_tensor(offsets: torch.Tensor) -> torch.Tensor:
     device = offsets.device
     counts = offsets[1:] - offsets[:-1]
     return torch.repeat_interleave(
-        torch.arange(len(counts), device=device, dtype=torch.int32), counts)
+        torch.arange(len(counts), device=device, dtype=torch.int32), counts
+    )
 
 
 def pad_to_multiple(x: torch.Tensor, multiple: int, dim: int):
@@ -61,27 +73,28 @@ def pad_to_multiple(x: torch.Tensor, multiple: int, dim: int):
 
 class FlexAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16, torch.float32]
-
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        return  # FlexAttention supports any head size
+    supported_dtypes: ClassVar[list[torch.dtype]] = [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto"]
 
     @staticmethod
     def get_name() -> str:
         return "FLEX_ATTENTION"
 
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """FlexAttention supports both decoder and encoder-only attention."""
+        from vllm.attention import AttentionType
+
+        return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
+
     @staticmethod
     def get_impl_cls() -> type["FlexAttentionImpl"]:
         return FlexAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return FlexAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -100,11 +113,18 @@ class FlexAttentionBackend(AttentionBackend):
     def use_cascade_attention(*args, **kwargs) -> bool:
         return False
 
+    @classmethod
+    def get_supported_head_sizes(cls) -> list[int]:
+        return []
 
-#@torch.compile(fullgraph=True, mode="reduce-overhead")
-def physical_to_logical_mapping(block_table: torch.Tensor,
-                                seq_lens: torch.Tensor, block_size: int,
-                                total_blocks: int) -> torch.Tensor:
+
+# @torch.compile(fullgraph=True, mode="reduce-overhead")
+def physical_to_logical_mapping(
+    block_table: torch.Tensor,
+    seq_lens: torch.Tensor,
+    block_size: int,
+    total_blocks: int,
+) -> torch.Tensor:
     """
     Creates an inverse mapping from physical block locations to logical indices.
 
@@ -174,35 +194,37 @@ def physical_to_logical_mapping(block_table: torch.Tensor,
     max_reqs, max_num_blocks = block_table.shape
     device = block_table.device
 
-    physical_to_logical = torch.full((max_reqs, total_blocks),
-                                     -1,
-                                     dtype=torch.long,
-                                     device=device)
+    physical_to_logical = torch.full(
+        (max_reqs, total_blocks), -1, dtype=torch.long, device=device
+    )
 
     # Only process valid blocks to avoid garbage values
     num_blocks_per_seq = cdiv(seq_lens, block_size)
-    mask = torch.arange(max_num_blocks,
-                        device=device)[None, :] < num_blocks_per_seq[:, None]
+    mask = (
+        torch.arange(max_num_blocks, device=device)[None, :]
+        < num_blocks_per_seq[:, None]
+    )
 
     valid_block_table = torch.where(mask, block_table, 0)
     valid_logical_indices = torch.where(
-        mask,
-        torch.arange(max_num_blocks, device=device)[None, :], 0)
+        mask, torch.arange(max_num_blocks, device=device)[None, :], 0
+    )
 
-    physical_to_logical.scatter_(-1, valid_block_table.to(torch.int64),
-                                 valid_logical_indices)
+    physical_to_logical.scatter_(
+        -1, valid_block_table.to(torch.int64), valid_logical_indices
+    )
     # NB - Seems like block 0 is always empty so we reset it manually
     physical_to_logical[:, 0] = -1
     return physical_to_logical
 
 
 def unique_static_unsorted(
-        x: torch.Tensor,
-        *,
-        M: int,  # maximum positive value (0 is “skip me”)
-        dim: int = -1,  # axis along which to deduplicate
-        ignored_val: int = 0,  # value to ignore
-        pad_val: int = -1,  # sentinel for unused slots
+    x: torch.Tensor,
+    *,
+    M: int,  # maximum positive value (0 is “skip me”)
+    dim: int = -1,  # axis along which to deduplicate
+    ignored_val: int = 0,  # value to ignore
+    pad_val: int = -1,  # sentinel for unused slots
 ) -> torch.Tensor:
     """
     - Keeps the first occurrence of each non-zero value while preserving order,
@@ -234,8 +256,7 @@ def unique_static_unsorted(
     first_idx.scatter_reduce_(1, x_flat, idx, reduce="amin")
 
     # ── keep mask: first occurrence *and* value ≠ 0 ─────────────────────
-    keep = (x_flat != ignored_val) & (idx == first_idx.gather(1, x_flat)
-                                      )  # [B, N]
+    keep = (x_flat != ignored_val) & (idx == first_idx.gather(1, x_flat))  # [B, N]
 
     # ── left-pack uniques into a fresh tensor ───────────────────────────
     dest_pos = torch.cumsum(keep.to(torch.long), dim=1) - 1  # where to go
@@ -249,8 +270,9 @@ def unique_static_unsorted(
     return packed
 
 
-def causal_mask_mod(b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor,
-                    kv_idx: torch.Tensor):
+def causal_mask_mod(
+    b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor, kv_idx: torch.Tensor
+):
     return q_idx >= kv_idx
 
 
@@ -267,9 +289,9 @@ class FlexAttentionMetadata:
 
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Block info
     total_cache_tokens: int
@@ -285,15 +307,15 @@ class FlexAttentionMetadata:
 
     # Flex Metadata
     num_blocks = 0
-    block_mask: Optional[BlockMask] = None
-    score_mod: Optional[_score_mod_signature] = None
+    block_mask: BlockMask | None = None
+    score_mod: _score_mod_signature | None = None
     logical_mask_mod: _mask_mod_signature = causal_mask_mod
-    doc_ids: Optional[torch.Tensor] = None
+    doc_ids: torch.Tensor | None = None
     direct_build: bool = True
     q_block_size: int = 16
     kv_block_size: int = 16
-    transformed_score_mod: Optional[_score_mod_signature] = None
-    sliding_window: Optional[int] = None
+    transformed_score_mod: _score_mod_signature | None = None
+    sliding_window: int | None = None
 
     def _convert_physical_to_logical(
         self,
@@ -315,8 +337,7 @@ class FlexAttentionMetadata:
         physical_kv_block = physical_kv_idx // self.block_size
         physical_kv_offset = physical_kv_idx % self.block_size
         logical_block_idx = self.physical_to_logical[q_req, physical_kv_block]
-        logical_kv_idx = (logical_block_idx * self.block_size +
-                          physical_kv_offset)
+        logical_kv_idx = logical_block_idx * self.block_size + physical_kv_offset
 
         # Determine valid kv indices
         live_block = logical_block_idx >= 0
@@ -350,9 +371,9 @@ class FlexAttentionMetadata:
             q_idx: torch.Tensor,
             physical_kv_idx: torch.Tensor,
         ) -> torch.Tensor:
-            (is_valid, logical_q_idx,
-             logical_kv_idx) = self._convert_physical_to_logical(
-                 self.doc_ids, q_idx, physical_kv_idx)
+            (is_valid, logical_q_idx, logical_kv_idx) = (
+                self._convert_physical_to_logical(self.doc_ids, q_idx, physical_kv_idx)
+            )
             # Apply mask modification only for valid indices
             return torch.where(
                 is_valid,
@@ -390,11 +411,11 @@ class FlexAttentionMetadata:
         """
 
         if self.sliding_window is None:
-            raise ValueError(
-                "sliding_window must be set for sliding window attention")
+            raise ValueError("sliding_window must be set for sliding window attention")
 
-        def sliding_window_mask_mod(b: torch.Tensor, h: torch.Tensor,
-                                    q_idx: torch.Tensor, kv_idx: torch.Tensor):
+        def sliding_window_mask_mod(
+            b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor, kv_idx: torch.Tensor
+        ):
             return torch.abs(q_idx - kv_idx) < self.sliding_window
 
         def final_mask_mod(
@@ -403,9 +424,9 @@ class FlexAttentionMetadata:
             q_idx: torch.Tensor,
             physical_kv_idx: torch.Tensor,
         ) -> torch.Tensor:
-            (is_valid, logical_q_idx,
-             logical_kv_idx) = self._convert_physical_to_logical(
-                 self.doc_ids, q_idx, physical_kv_idx)
+            (is_valid, logical_q_idx, logical_kv_idx) = (
+                self._convert_physical_to_logical(self.doc_ids, q_idx, physical_kv_idx)
+            )
             return torch.where(
                 is_valid,
                 sliding_window_mask_mod(b, h, logical_q_idx, logical_kv_idx),
@@ -429,7 +450,7 @@ class FlexAttentionMetadata:
             mask_mod = and_masks(mask_mod, sliding_window_mask_mod)
         return mask_mod
 
-    def get_transformed_score_mod(self) -> Optional[_score_mod_signature]:
+    def get_transformed_score_mod(self) -> _score_mod_signature | None:
         """Creates the transformed score_mod function for FlexAttention.
 
         This function wraps the user's score_mod to handle physical-to-logical
@@ -449,18 +470,19 @@ class FlexAttentionMetadata:
             q_idx: torch.Tensor,
             physical_kv_idx: torch.Tensor,
         ) -> torch.Tensor:
-            (is_valid, logical_q_idx,
-             logical_kv_idx) = self._convert_physical_to_logical(
-                 request_lookup, q_idx, physical_kv_idx)
+            (is_valid, logical_q_idx, logical_kv_idx) = (
+                self._convert_physical_to_logical(
+                    request_lookup, q_idx, physical_kv_idx
+                )
+            )
 
             return torch.where(
                 is_valid,
-                user_score_mod(score,
-                               b,
-                               h,
-                               logical_q_idx,
-                               logical_kv_idx,
-                               physical_q=q_idx), -float('inf'))
+                user_score_mod(
+                    score, b, h, logical_q_idx, logical_kv_idx, physical_q=q_idx
+                ),
+                -float("inf"),
+            )
 
         return transformed_score_mod
 
@@ -491,18 +513,22 @@ class FlexAttentionMetadata:
                 f"FlexAttention currently requires the cache block size "
                 f"({self.block_size}) to be equal to the kv_block_size "
                 f"({self.kv_block_size}). Please check your model's "
-                f"configuration.")
+                f"configuration."
+            )
 
         used_pages = self.block_table[
-            self.doc_ids, :cdiv(self.max_seq_len, self.block_size)]
-        used_pages_padded = pad_to_multiple(used_pages,
-                                            multiple=self.q_block_size,
-                                            dim=0)
+            self.doc_ids, : cdiv(self.max_seq_len, self.block_size)
+        ]
+        used_pages_padded = pad_to_multiple(
+            used_pages, multiple=self.q_block_size, dim=0
+        )
         used_pages_padded = used_pages_padded.reshape(
-            used_pages_padded.shape[0] // self.q_block_size, -1)
+            used_pages_padded.shape[0] // self.q_block_size, -1
+        )
         used_pages_padded = used_pages_padded // page_to_block_ratio
-        kv_indices = unique_static_unsorted((used_pages_padded.long()),
-                                            M=self.num_blocks).to(torch.int32)
+        kv_indices = unique_static_unsorted(
+            (used_pages_padded.long()), M=self.num_blocks
+        ).to(torch.int32)
 
         kv_num_blocks = (kv_indices >= 0).sum(dim=-1).to(torch.int32)
         block_mask_kwargs = {
@@ -522,8 +548,7 @@ class FlexAttentionMetadata:
 
     def build_block_mask(self) -> BlockMask:
         mask_mod = self.get_mask_mod()
-        kv_len = (self.total_cache_tokens
-                  if self.causal else self.num_actual_tokens)
+        kv_len = self.total_cache_tokens if self.causal else self.num_actual_tokens
         return create_block_mask_compiled(
             mask_mod,
             None,
@@ -553,11 +578,14 @@ class FlexAttentionMetadata:
             self.block_mask = self.build_block_mask()
 
 
-class FlexAttentionMetadataBuilder(
-        AttentionMetadataBuilder[FlexAttentionMetadata]):
-
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadata]):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
         self.model_config = vllm_config.model_config
@@ -565,26 +593,23 @@ class FlexAttentionMetadataBuilder(
         self.cache_config = vllm_config.cache_config
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
-            self.parallel_config)
-        self.num_heads_kv = self.model_config.get_num_kv_heads(
-            self.parallel_config)
+            self.parallel_config
+        )
+        self.num_heads_kv = self.model_config.get_num_kv_heads(self.parallel_config)
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
         self.kv_cache_spec = kv_cache_spec
-        self.direct_build: bool = is_torch_equal_or_newer("2.9.0.dev0")
-        self.q_block_size: int = 16 if is_torch_equal_or_newer(
-            "2.9.0.dev0") else 128
-        self.kv_block_size: int = 16 if is_torch_equal_or_newer(
-            "2.9.0.dev0") else 128
-
-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        return False
+        supports_small_blocks = is_torch_equal_or_newer("2.9.0.dev0")
+        self.direct_build: bool = supports_small_blocks
+        self.q_block_size: int = 16 if supports_small_blocks else 128
+        self.kv_block_size: int = self.block_size if supports_small_blocks else 128
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> FlexAttentionMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> FlexAttentionMetadata:
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
@@ -607,15 +632,18 @@ class FlexAttentionMetadataBuilder(
         max_possible_seq_len = self.model_config.max_model_len
         num_gpu_blocks = self.cache_config.num_gpu_blocks
 
-        assert num_gpu_blocks is not None, \
+        assert num_gpu_blocks is not None, (
             "FlexAttention requires num_gpu_blocks to be set"
-        total_cache_tokens = (num_gpu_blocks * block_size)
+        )
+        total_cache_tokens = num_gpu_blocks * block_size
 
         inverse_block_table = physical_to_logical_mapping(
-            block_table_tensor, seq_lens, block_size, num_gpu_blocks)
+            block_table_tensor, seq_lens, block_size, num_gpu_blocks
+        )
 
         offset_tensor = common_attn_metadata.num_computed_tokens_cpu.to(
-            self.device, non_blocking=True)
+            self.device, non_blocking=True
+        )
 
         out = FlexAttentionMetadata(
             causal=common_attn_metadata.causal,
@@ -638,7 +666,10 @@ class FlexAttentionMetadataBuilder(
             total_cache_tokens=total_cache_tokens,
             decode_offset=offset_tensor,
             num_blocks_per_seq=num_blocks_per_seq,
-            direct_build=self.direct_build,
+            # FIXME(Isotr0py): direct build has issue to build bidirectional
+            # attention block mask for encoder-only models, disable it temporarily.
+            # see: https://github.com/vllm-project/vllm/pull/27329#issuecomment-3431484053
+            direct_build=(self.direct_build and common_attn_metadata.causal),
             q_block_size=self.q_block_size,
             kv_block_size=self.kv_block_size,
         )
@@ -649,9 +680,9 @@ class FlexAttentionMetadataBuilder(
 
 
 class FlexAttentionImpl(AttentionImpl):
-    sliding_window: Optional[int]
-    alibi_slopes: Optional[torch.Tensor]
-    logits_soft_cap: Optional[float]
+    sliding_window: int | None
+    alibi_slopes: torch.Tensor | None
+    logits_soft_cap: float | None
 
     def __init__(
         self,
@@ -659,12 +690,12 @@ class FlexAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
         **kwargs,
     ) -> None:
         self.num_heads = num_heads
@@ -673,14 +704,15 @@ class FlexAttentionImpl(AttentionImpl):
         self.num_kv_heads = num_kv_heads
         self.attn_type = attn_type
 
-        if attn_type not in (AttentionType.ENCODER_ONLY,
-                             AttentionType.DECODER):
+        if attn_type not in (AttentionType.ENCODER_ONLY, AttentionType.DECODER):
             raise NotImplementedError(
-                f"FlexAttention does not support {attn_type} attention")
+                f"FlexAttention does not support {attn_type} attention"
+            )
 
         if alibi_slopes is not None:
             raise NotImplementedError(
-                "FlexAttention does not support alibi slopes yet.")
+                "FlexAttention does not support alibi slopes yet."
+            )
         else:
             self.alibi_slopes = None
 
@@ -690,19 +722,19 @@ class FlexAttentionImpl(AttentionImpl):
         self.logits_soft_cap = logits_soft_cap
         if self.logits_soft_cap is not None:
             raise NotImplementedError(
-                "FlexAttention does not support logits soft cap yet.")
+                "FlexAttention does not support logits soft cap yet."
+            )
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError(
-                "FlexAttention does not support kv sharing yet.")
+            raise NotImplementedError("FlexAttention does not support kv sharing yet.")
 
-        FlexAttentionBackend.validate_head_size(head_size)
         if is_quantized_kv_cache(self.kv_cache_dtype):
             raise NotImplementedError(
-                "FlexAttention does not support quantized kv-cache. Yet")
+                "FlexAttention does not support quantized kv-cache. Yet"
+            )
 
     @staticmethod
     def view_as_4d(tensor: torch.Tensor) -> torch.Tensor:
@@ -720,9 +752,9 @@ class FlexAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlexAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FLexAttention.
 
@@ -739,14 +771,14 @@ class FlexAttentionImpl(AttentionImpl):
         assert output is not None, "Output tensor must be provided."
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for FlexAttentionImpl")
+                "fused output quantization is not yet supported for FlexAttentionImpl"
+            )
 
         enable_gqa = self.num_kv_heads != self.num_heads
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
             # query = self.view_as_4d(query).permute(0, 2, 1, 3)
             # return torch.empty_like(query)
 
@@ -759,11 +791,11 @@ class FlexAttentionImpl(AttentionImpl):
                 # in direct block mask building code path.
                 logger.warning_once(
                     "Using direct block mask building with sliding window, "
-                    "which is suboptimal now. Performance may be degraded.")
+                    "which is suboptimal now. Performance may be degraded."
+                )
                 # update mask mod in attention metadata
                 attn_metadata.mask_mod = attn_metadata.get_mask_mod()
-                attn_metadata.block_mask = (
-                    attn_metadata._build_block_mask_direct())
+                attn_metadata.block_mask = attn_metadata._build_block_mask_direct()
             else:
                 attn_metadata.block_mask = attn_metadata.build_block_mask()
 
@@ -776,8 +808,9 @@ class FlexAttentionImpl(AttentionImpl):
             )
 
             query = query[:, :, :num_actual_tokens, :]
-            if ((key_tensor.size(-2) > num_actual_tokens)
-                    or (value_tensor.size(-2) > num_actual_tokens)):
+            if (key_tensor.size(-2) > num_actual_tokens) or (
+                value_tensor.size(-2) > num_actual_tokens
+            ):
                 # In the encoder-only model with torch.compile,
                 # qkv might be padded, which might cause exception.
                 # see: https://github.com/vllm-project/vllm/pull/24872#discussion_r2353252290
@@ -801,8 +834,7 @@ class FlexAttentionImpl(AttentionImpl):
 
             # View out the block_size dim
             key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size)
-            value_cache = value_cache.view(-1, self.num_kv_heads,
-                                           self.head_size)
+            value_cache = value_cache.view(-1, self.num_kv_heads, self.head_size)
             query, key_tensor, value_tensor = map(
                 lambda x: self.view_as_4d(x).permute(0, 2, 1, 3),
                 (query, key_cache, value_cache),
@@ -816,8 +848,9 @@ class FlexAttentionImpl(AttentionImpl):
         assert attn_metadata.block_mask is not None
         block_m, block_n = attn_metadata.block_mask.BLOCK_SIZE
 
-        kernel_options = get_kernel_options(query, block_m, block_n,
-                                            attn_metadata.direct_build)
+        kernel_options = get_kernel_options(
+            query, block_m, block_n, attn_metadata.direct_build
+        )
         out = flex_attention_compiled(
             query,
             key_tensor,
@@ -835,27 +868,59 @@ class FlexAttentionImpl(AttentionImpl):
         return output
 
 
-def get_kernel_options(query, block_m, block_n,
-                       use_direct_build: bool) -> dict[str, Union[int, bool]]:
-    kernel_options: dict[str, Union[int, bool]] = {
+def get_kernel_options(
+    query, block_m, block_n, use_direct_build: bool
+) -> dict[str, int | bool]:
+    kernel_options: dict[str, int | bool] = {
         "FORCE_USE_FLEX_ATTENTION": True,
     }
+
+    def ensure_divisible(candidate: int, block_size: int) -> int:
+        """Pick a kernel block size that divides the logical block."""
+        if block_size <= 0:
+            return candidate
+        candidate = min(candidate, block_size)
+        if candidate <= 0:
+            return block_size
+        if block_size % candidate == 0:
+            return candidate
+
+        candidate = math.gcd(candidate, block_size)
+        if candidate <= 1:
+            return block_size
+        return candidate
+
+    if vllm_is_batch_invariant():
+        kernel_options["BLOCK_M"] = 16
+        kernel_options["BLOCK_N"] = 16
+        kernel_options["IS_DIVISIBLE"] = False
+        return kernel_options
     if use_direct_build:
         kernel_options["BLOCK_M"] = block_m
         kernel_options["BLOCK_N"] = block_n
         return kernel_options
     else:
-        kernel_options["BLOCK_M"] = 64
-        kernel_options["BLOCK_N"] = 64
-        if query.dtype == torch.float32:
-            kernel_options["BLOCK_M"] = 32
-            kernel_options["BLOCK_N"] = 32
-        # if current_platform.is_cuda():
+        preferred_block = 32 if query.dtype == torch.float32 else 64
+        block_lower_bound = 16
+
+        block_m_candidate = ensure_divisible(preferred_block, block_m)
+        block_n_candidate = ensure_divisible(preferred_block, block_n)
+
         if torch.cuda.is_available():
             device_props = torch.cuda.get_device_properties()
             max_shared_memory = device_props.shared_memory_per_block_optin
             if max_shared_memory < 144 * 1024:
-                kernel_options["BLOCK_M"] = kernel_options["BLOCK_M"] // 2
-                kernel_options["BLOCK_N"] = kernel_options["BLOCK_N"] // 2
+                block_m_candidate = ensure_divisible(
+                    max(1, block_m_candidate // 2), block_m
+                )
+                block_n_candidate = ensure_divisible(
+                    max(1, block_n_candidate // 2), block_n
+                )
+
+        block_m_candidate = max(block_m_candidate, block_lower_bound)
+        block_n_candidate = max(block_n_candidate, block_lower_bound)
+
+        kernel_options["BLOCK_M"] = block_m_candidate
+        kernel_options["BLOCK_N"] = block_n_candidate
 
     return kernel_options
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 11f165d6cfc6c83b513deda512030aa6d64f2030..69b5a6fb48564aca7a347f545a7472d869a7607b 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -1,24 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Backend for GatedDeltaNet attention."""
+
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              compute_causal_conv1d_metadata,
-                                              split_decodes_and_prefills)
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    compute_causal_conv1d_metadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
 class GDNAttentionBackend(AttentionBackend):
-
     @staticmethod
     def get_builder_cls() -> type["GDNAttentionMetadataBuilder"]:
         return GDNAttentionMetadataBuilder
@@ -34,55 +35,60 @@ class GDNAttentionMetadata:
     num_spec_decode_tokens: int
     num_actual_tokens: int
 
-    has_initial_state: Optional[torch.Tensor] = None
+    has_initial_state: torch.Tensor | None = None
 
-    spec_query_start_loc: Optional[
-        torch.Tensor] = None  # shape: [num_spec_decodes + 1,]
-    non_spec_query_start_loc: Optional[
-        torch.Tensor] = None  # shape: [batch - num_spec_decodes + 1,]
+    spec_query_start_loc: torch.Tensor | None = None  # shape: [num_spec_decodes + 1,]
+    non_spec_query_start_loc: torch.Tensor | None = (
+        None  # shape: [batch - num_spec_decodes + 1,]
+    )
 
-    spec_state_indices_tensor: Optional[
-        torch.Tensor] = None  # shape: [batch, num_spec]
-    non_spec_state_indices_tensor: Optional[
-        torch.Tensor] = None  # shape: [batch - num_spec_decodes,]
-    spec_sequence_masks: Optional[torch.Tensor] = None  # shape: [batch,]
-    spec_token_masks: Optional[
-        torch.
-        Tensor] = None  # shape: [num_prefill_tokens + num_decode_tokens,]
-    num_accepted_tokens: Optional[torch.Tensor] = None  # shape: [batch,]
+    spec_state_indices_tensor: torch.Tensor | None = None  # shape: [batch, num_spec]
+    non_spec_state_indices_tensor: torch.Tensor | None = (
+        None  # shape: [batch - num_spec_decodes,]
+    )
+    spec_sequence_masks: torch.Tensor | None = None  # shape: [batch,]
+    spec_token_indx: torch.Tensor | None = None
+    non_spec_token_indx: torch.Tensor | None = None
 
-    # The following attributes are for triton implementation of causal_conv1d
-    nums_dict: Optional[dict] = None
-    batch_ptr: Optional[torch.Tensor] = None
-    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+    num_accepted_tokens: torch.Tensor | None = None  # shape: [batch,]
 
+    # The following attributes are for triton implementation of causal_conv1d
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
-class GDNAttentionMetadataBuilder(
-        AttentionMetadataBuilder[GDNAttentionMetadata]):
 
-    cudagraph_support = AttentionCGSupport.UNIFORM_BATCH
+class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]):
+    _cudagraph_support = AttentionCGSupport.UNIFORM_BATCH
 
     reorder_batch_threshold: int = 1
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         assert isinstance(kv_cache_spec, MambaSpec)
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
         self.speculative_config = vllm_config.speculative_config
         self.kv_cache_spec = kv_cache_spec
         if self.speculative_config:
-            self.num_spec = self.speculative_config.num_speculative_tokens  # noqa: E501
+            self.num_spec = self.speculative_config.num_speculative_tokens
         else:
             self.num_spec = 0
         self.use_spec_decode = self.num_spec > 0
         self._init_reorder_batch_threshold(1, self.use_spec_decode)
 
-        self.use_full_cuda_graph = \
+        self.use_full_cuda_graph = (
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        )
         self.decode_cudagraph_max_bs = min(
-            self.vllm_config.scheduler_config.max_num_seqs *
-            (self.num_spec + 1), self.compilation_config.max_capture_size)
+            self.vllm_config.scheduler_config.max_num_seqs * (self.num_spec + 1),
+            self.compilation_config.max_cudagraph_capture_size,
+        )
 
         self.spec_state_indices_tensor = torch.empty(
             (self.decode_cudagraph_max_bs, self.num_spec + 1),
@@ -90,32 +96,37 @@ class GDNAttentionMetadataBuilder(
             device=device,
         )
         self.non_spec_state_indices_tensor = torch.empty(
-            (self.decode_cudagraph_max_bs, ),
+            (self.decode_cudagraph_max_bs,),
             dtype=torch.int32,
             device=device,
         )
         self.spec_sequence_masks = torch.empty(
-            (self.decode_cudagraph_max_bs, ),
+            (self.decode_cudagraph_max_bs,),
             dtype=torch.bool,
             device=device,
         )
-        self.spec_token_masks = torch.empty(
-            (self.decode_cudagraph_max_bs * (self.num_spec + 1), ),
-            dtype=torch.bool,
+        self.spec_token_indx = torch.empty(
+            (self.decode_cudagraph_max_bs * (self.num_spec + 1),),
+            dtype=torch.int32,
+            device=device,
+        )
+        self.non_spec_token_indx = torch.empty(
+            (self.decode_cudagraph_max_bs * (self.num_spec + 1),),
+            dtype=torch.int32,
             device=device,
         )
         self.spec_query_start_loc = torch.empty(
-            (self.decode_cudagraph_max_bs + 1, ),
+            (self.decode_cudagraph_max_bs + 1,),
             dtype=torch.int32,
             device=device,
         )
         self.non_spec_query_start_loc = torch.empty(
-            (self.decode_cudagraph_max_bs + 1, ),
+            (self.decode_cudagraph_max_bs + 1,),
             dtype=torch.int32,
             device=device,
         )
         self.num_accepted_tokens = torch.empty(
-            (self.decode_cudagraph_max_bs, ),
+            (self.decode_cudagraph_max_bs,),
             dtype=torch.int32,
             device=device,
         )
@@ -124,8 +135,8 @@ class GDNAttentionMetadataBuilder(
         self,
         common_prefix_len: int,
         common_attn_metadata: CommonAttentionMetadata,
-        num_accepted_tokens: Optional[torch.Tensor] = None,
-        num_decode_draft_tokens_cpu: Optional[torch.Tensor] = None,
+        num_accepted_tokens: torch.Tensor | None = None,
+        num_decode_draft_tokens_cpu: torch.Tensor | None = None,
         fast_build: bool = False,
     ) -> GDNAttentionMetadata:
         m = common_attn_metadata
@@ -135,9 +146,14 @@ class GDNAttentionMetadataBuilder(
         context_lens_tensor = context_lens.to(query_start_loc.device)
         nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None
 
-        if (not self.use_spec_decode or num_decode_draft_tokens_cpu is None
-                or num_decode_draft_tokens_cpu[num_decode_draft_tokens_cpu >=
-                                               0].sum().item() == 0):
+        if (
+            not self.use_spec_decode
+            or num_decode_draft_tokens_cpu is None
+            or num_decode_draft_tokens_cpu[num_decode_draft_tokens_cpu >= 0]
+            .sum()
+            .item()
+            == 0
+        ):
             spec_sequence_masks = None
             num_spec_decodes = 0
         else:
@@ -147,13 +163,16 @@ class GDNAttentionMetadataBuilder(
                 spec_sequence_masks = None
             else:
                 spec_sequence_masks = spec_sequence_masks.to(
-                    query_start_loc.device, non_blocking=True)
+                    query_start_loc.device, non_blocking=True
+                )
 
         if spec_sequence_masks is None:
             num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-                split_decodes_and_prefills(m, decode_threshold=1))
+                split_decodes_and_prefills(m, decode_threshold=1)
+            )
             num_spec_decode_tokens = 0
-            spec_token_masks = None
+            spec_token_indx = None
+            non_spec_token_indx = None
             spec_state_indices_tensor = None
             non_spec_state_indices_tensor = m.block_table_tensor[:, 0]
             spec_query_start_loc = None
@@ -166,45 +185,63 @@ class GDNAttentionMetadataBuilder(
             num_decodes = (non_spec_query_lens == 1).sum().item()
             num_prefills = non_spec_query_lens.size(0) - num_decodes
             num_decode_tokens = num_decodes
-            num_prefill_tokens = non_spec_query_lens.sum().item(
-            ) - num_decode_tokens
+            num_prefill_tokens = non_spec_query_lens.sum().item() - num_decode_tokens
+            num_spec_decode_tokens = (
+                query_lens.sum().item() - num_prefill_tokens - num_decode_tokens
+            )
 
             if num_prefills == 0 and num_decodes == 0:
-                spec_token_masks = torch.ones(
-                    (min(num_spec_decodes *
-                         (self.num_spec + 1), query_start_loc[-1].item())),
-                    dtype=torch.bool,
-                    device=query_start_loc.device)
-                spec_state_indices_tensor = m.block_table_tensor[:, :self.
-                                                                 num_spec + 1]
+                spec_token_size = min(
+                    num_spec_decodes * (self.num_spec + 1),
+                    query_start_loc[-1].item(),
+                )
+                spec_token_indx = torch.arange(
+                    spec_token_size,
+                    dtype=torch.int32,
+                    device=query_start_loc.device,
+                )
+                non_spec_token_indx = torch.empty(
+                    0, dtype=torch.int32, device=query_start_loc.device
+                )
+                spec_state_indices_tensor = m.block_table_tensor[:, : self.num_spec + 1]
                 non_spec_state_indices_tensor = None
                 spec_query_start_loc = query_start_loc
                 non_spec_query_start_loc = None
             else:
                 spec_token_masks = torch.repeat_interleave(
-                    spec_sequence_masks, query_lens)
+                    spec_sequence_masks, query_lens
+                )
+                index = torch.argsort(spec_token_masks)
+                num_non_spec_tokens = num_prefill_tokens + num_decode_tokens
+                non_spec_token_indx = index[:num_non_spec_tokens]
+                spec_token_indx = index[num_non_spec_tokens:]
+
                 spec_state_indices_tensor = m.block_table_tensor[
-                    spec_sequence_masks, :self.num_spec + 1]
-                non_spec_state_indices_tensor = \
-                    m.block_table_tensor[~spec_sequence_masks, 0]
+                    spec_sequence_masks, : self.num_spec + 1
+                ]
+                non_spec_state_indices_tensor = m.block_table_tensor[
+                    ~spec_sequence_masks, 0
+                ]
 
                 spec_query_start_loc = torch.zeros(
                     num_spec_decodes + 1,
                     dtype=torch.int32,
-                    device=query_start_loc.device)
-                torch.cumsum(query_lens[spec_sequence_masks],
-                             dim=0,
-                             out=spec_query_start_loc[1:])
+                    device=query_start_loc.device,
+                )
+                torch.cumsum(
+                    query_lens[spec_sequence_masks], dim=0, out=spec_query_start_loc[1:]
+                )
                 non_spec_query_start_loc = torch.zeros(
                     query_lens.size(0) - num_spec_decodes + 1,
                     dtype=torch.int32,
-                    device=query_start_loc.device)
-                torch.cumsum(query_lens[~spec_sequence_masks],
-                             dim=0,
-                             out=non_spec_query_start_loc[1:])
+                    device=query_start_loc.device,
+                )
+                torch.cumsum(
+                    query_lens[~spec_sequence_masks],
+                    dim=0,
+                    out=non_spec_query_start_loc[1:],
+                )
 
-            num_spec_decode_tokens = (query_lens.sum().item() -
-                                      num_prefill_tokens - num_decode_tokens)
             assert num_accepted_tokens is not None
             num_accepted_tokens = num_accepted_tokens[spec_sequence_masks]
 
@@ -212,12 +249,14 @@ class GDNAttentionMetadataBuilder(
             has_initial_state = context_lens_tensor > 0
             if spec_sequence_masks is not None:
                 has_initial_state = has_initial_state[~spec_sequence_masks]
-            nums_dict, batch_ptr, token_chunk_offset_ptr = \
+            nums_dict, batch_ptr, token_chunk_offset_ptr = (
                 compute_causal_conv1d_metadata(non_spec_query_start_loc)
+            )
         else:
             has_initial_state = None
-        num_actual_tokens = num_prefill_tokens + num_decode_tokens + \
-            num_spec_decode_tokens
+        num_actual_tokens = (
+            num_prefill_tokens + num_decode_tokens + num_spec_decode_tokens
+        )
 
         # prepare tensors for cudagraph
         #
@@ -226,64 +265,77 @@ class GDNAttentionMetadataBuilder(
         #
         # In above cases, the max possible batch size for n tokens, can be
         # min(n, cudagraph_max_bs).
-        if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
-                and num_spec_decodes <= self.decode_cudagraph_max_bs
-                and num_spec_decode_tokens <= self.decode_cudagraph_max_bs):
-            num_actual_tokens = self.vllm_config.pad_for_cudagraph(
-                m.num_actual_tokens)
+        if (
+            self.use_full_cuda_graph
+            and num_prefills == 0
+            and num_decodes == 0
+            and num_spec_decodes <= self.decode_cudagraph_max_bs
+            and num_spec_decode_tokens <= self.decode_cudagraph_max_bs
+        ):
+            num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens)
             batch_size = min(self.decode_cudagraph_max_bs, num_actual_tokens)
 
             self.spec_state_indices_tensor[:num_spec_decodes].copy_(
-                spec_state_indices_tensor, non_blocking=True)
-            spec_state_indices_tensor = self.spec_state_indices_tensor[:
-                                                                       batch_size]
+                spec_state_indices_tensor, non_blocking=True
+            )
+            spec_state_indices_tensor = self.spec_state_indices_tensor[:batch_size]
             spec_state_indices_tensor[num_spec_decodes:].fill_(PAD_SLOT_ID)
 
             self.spec_sequence_masks[:num_spec_decodes].copy_(
-                spec_sequence_masks, non_blocking=True)
+                spec_sequence_masks, non_blocking=True
+            )
             spec_sequence_masks = self.spec_sequence_masks[:batch_size]
             spec_sequence_masks[num_spec_decodes:].fill_(False)
 
-            assert spec_token_masks is not None
-            self.spec_token_masks[:spec_token_masks.size(0)].copy_(
-                spec_token_masks, non_blocking=True)
-            spec_token_masks = self.spec_token_masks[:num_actual_tokens]
-            spec_token_masks[spec_token_masks.size(0):].fill_(False)
-
-            self.spec_query_start_loc[:num_spec_decodes + 1].copy_(
-                spec_query_start_loc, non_blocking=True)
-            spec_num_query_tokens = spec_query_start_loc[
-                -1]  # type: ignore[index]
-            spec_query_start_loc = self.spec_query_start_loc[:batch_size + 1]
-            spec_query_start_loc[num_spec_decodes +
-                                 1:].fill_(spec_num_query_tokens)
+            assert non_spec_token_indx is not None and spec_token_indx is not None
+            self.non_spec_token_indx[: non_spec_token_indx.size(0)].copy_(
+                non_spec_token_indx, non_blocking=True
+            )
+            non_spec_token_indx = self.non_spec_token_indx[
+                : non_spec_token_indx.size(0)
+            ]
+
+            self.spec_token_indx[: spec_token_indx.size(0)].copy_(
+                spec_token_indx, non_blocking=True
+            )
+            spec_token_indx = self.spec_token_indx[: spec_token_indx.size(0)]
+
+            self.spec_query_start_loc[: num_spec_decodes + 1].copy_(
+                spec_query_start_loc, non_blocking=True
+            )
+            spec_num_query_tokens = spec_query_start_loc[-1]  # type: ignore[index]
+            spec_query_start_loc = self.spec_query_start_loc[: batch_size + 1]
+            spec_query_start_loc[num_spec_decodes + 1 :].fill_(spec_num_query_tokens)
 
             self.num_accepted_tokens[:num_spec_decodes].copy_(
-                num_accepted_tokens, non_blocking=True)
+                num_accepted_tokens, non_blocking=True
+            )
             num_accepted_tokens = self.num_accepted_tokens[:batch_size]
             num_accepted_tokens[num_spec_decodes:].fill_(1)
 
-        if (self.use_full_cuda_graph and num_prefills == 0
-                and num_spec_decodes == 0
-                and num_decodes <= self.decode_cudagraph_max_bs):
-            num_actual_tokens = self.vllm_config.pad_for_cudagraph(
-                m.num_actual_tokens)
+        if (
+            self.use_full_cuda_graph
+            and num_prefills == 0
+            and num_spec_decodes == 0
+            and num_decodes <= self.decode_cudagraph_max_bs
+        ):
+            num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens)
             batch_size = num_actual_tokens
 
             self.non_spec_state_indices_tensor[:num_decodes].copy_(
-                non_spec_state_indices_tensor, non_blocking=True)
-            non_spec_state_indices_tensor = \
-                self.non_spec_state_indices_tensor[:batch_size]
+                non_spec_state_indices_tensor, non_blocking=True
+            )
+            non_spec_state_indices_tensor = self.non_spec_state_indices_tensor[
+                :batch_size
+            ]
             non_spec_state_indices_tensor[num_decodes:].fill_(PAD_SLOT_ID)
 
-            self.non_spec_query_start_loc[:num_decodes + 1].copy_(
-                non_spec_query_start_loc, non_blocking=True)
-            non_spec_num_query_tokens = non_spec_query_start_loc[
-                -1]  # type: ignore[index]
-            non_spec_query_start_loc = \
-                self.non_spec_query_start_loc[:batch_size + 1]
-            non_spec_query_start_loc[num_decodes +
-                                     1:].fill_(non_spec_num_query_tokens)
+            self.non_spec_query_start_loc[: num_decodes + 1].copy_(
+                non_spec_query_start_loc, non_blocking=True
+            )
+            non_spec_num_query_tokens = non_spec_query_start_loc[-1]  # type: ignore[index]
+            non_spec_query_start_loc = self.non_spec_query_start_loc[: batch_size + 1]
+            non_spec_query_start_loc[num_decodes + 1 :].fill_(non_spec_num_query_tokens)
 
         attn_metadata = GDNAttentionMetadata(
             num_prefills=num_prefills,
@@ -299,7 +351,8 @@ class GDNAttentionMetadataBuilder(
             spec_state_indices_tensor=spec_state_indices_tensor,
             non_spec_state_indices_tensor=non_spec_state_indices_tensor,
             spec_sequence_masks=spec_sequence_masks,
-            spec_token_masks=spec_token_masks,
+            spec_token_indx=spec_token_indx,
+            non_spec_token_indx=non_spec_token_indx,
             num_accepted_tokens=num_accepted_tokens,
             nums_dict=nums_dict,
             batch_ptr=batch_ptr,
@@ -308,7 +361,8 @@ class GDNAttentionMetadataBuilder(
         return attn_metadata
 
     def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata):
+        self, common_attn_metadata: CommonAttentionMetadata
+    ):
         """
         This method builds the metadata for full cudagraph capture.
         Currently, only decode is supported for full cudagraphs with Mamba.
@@ -317,16 +371,17 @@ class GDNAttentionMetadataBuilder(
 
         assert (
             m.num_reqs <= self.decode_cudagraph_max_bs
-            and m.num_actual_tokens <= self.decode_cudagraph_max_bs), (
-                f"GDN only supports decode-only full CUDAGraph capture. "
-                f"Make sure batch size ({m.num_reqs}) <= "
-                f"cudagraph capture sizes ({self.decode_cudagraph_max_bs}), "
-                f"and number of tokens ({m.num_actual_tokens}) <= "
-                f"cudagraph capture sizes ({self.decode_cudagraph_max_bs}).")
+            and m.num_actual_tokens <= self.decode_cudagraph_max_bs
+        ), (
+            f"GDN only supports decode-only full CUDAGraph capture. "
+            f"Make sure batch size ({m.num_reqs}) <= "
+            f"cudagraph capture sizes ({self.decode_cudagraph_max_bs}), "
+            f"and number of tokens ({m.num_actual_tokens}) <= "
+            f"cudagraph capture sizes ({self.decode_cudagraph_max_bs})."
+        )
 
         num_accepted_tokens = torch.diff(m.query_start_loc)
         num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu()
         m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
 
-        return self.build(0, m, num_accepted_tokens,
-                          num_decode_draft_tokens_cpu)
+        return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu)
diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py
index 0dc62d66802084d329556b9a7df4057ddff57838..1900c50849ecac0d93532f748388a07ed5b08d2c 100644
--- a/vllm/v1/attention/backends/linear_attn.py
+++ b/vllm/v1/attention/backends/linear_attn.py
@@ -6,14 +6,15 @@ import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              split_decodes_and_prefills)
+from vllm.v1.attention.backends.utils import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
 class LinearAttentionBackend(AttentionBackend):
-
     @staticmethod
     def get_builder_cls() -> type["LinearAttentionMetadataBuilder"]:
         return LinearAttentionMetadataBuilder
@@ -31,20 +32,25 @@ class LinearAttentionMetadata:
     state_indices_tensor: torch.Tensor  # shape: [batch,]
 
 
-class LinearAttentionMetadataBuilder(
-        AttentionMetadataBuilder[LinearAttentionMetadata]):
-
+class LinearAttentionMetadataBuilder(AttentionMetadataBuilder[LinearAttentionMetadata]):
     reorder_batch_threshold: int = 1
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         assert isinstance(kv_cache_spec, MambaSpec)
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> LinearAttentionMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> LinearAttentionMetadata:
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
 
@@ -52,8 +58,9 @@ class LinearAttentionMetadataBuilder(
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold))
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
 
         attn_metadata = LinearAttentionMetadata(
             num_prefills=num_prefills,
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index 7cbfa2c2c9a5460e1512681ba6863061addaae6f..8e949e53330c1aac8dff14db51c764fb3c4b7a85 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -2,20 +2,21 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.backends.utils import PAD_SLOT_ID
-from vllm.v1.attention.backends.mamba_attn import (
-    BaseMambaAttentionMetadataBuilder)
-from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
-                                              split_decodes_and_prefills)
+from vllm.config import VllmConfig
+from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
+from vllm.v1.attention.backends.utils import (
+    CommonAttentionMetadata,
+    split_decodes_and_prefills,
+)
+from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
 class Mamba1AttentionBackend(AttentionBackend):
-
     @staticmethod
     def get_builder_cls() -> type["Mamba1AttentionMetadataBuilder"]:
         return Mamba1AttentionMetadataBuilder
@@ -23,19 +24,33 @@ class Mamba1AttentionBackend(AttentionBackend):
 
 @dataclass
 class Mamba1AttentionMetadata:
-    query_start_loc: torch.Tensor
-    context_lens_tensor: torch.Tensor
+    query_start_loc_p: torch.Tensor
     state_indices_tensor: torch.Tensor
-    has_initial_states: Optional[torch.Tensor]
+    has_initial_states_p: torch.Tensor | None
     num_prefills: int
     num_prefill_tokens: int
     num_decodes: int
     num_decode_tokens: int
     num_padded_decodes: int
 
+    block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
+    block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
+    block_idx_last_computed_token: torch.Tensor  # shape: [batch,]
+    num_computed_tokens_p: torch.Tensor  # shape: [batch,]
+
 
 class Mamba1AttentionMetadataBuilder(
-        BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]):
+    BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
+):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+        assert isinstance(kv_cache_spec, MambaSpec)
 
     def build(
         self,
@@ -43,39 +58,108 @@ class Mamba1AttentionMetadataBuilder(
         common_attn_metadata: CommonAttentionMetadata,
         fast_build: bool = False,
     ) -> Mamba1AttentionMetadata:
-        query_start_loc = common_attn_metadata.query_start_loc
-
-        state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
-        context_lens_tensor = common_attn_metadata.num_computed_tokens_cpu.to(
-            query_start_loc.device)
+        num_reqs = common_attn_metadata.num_reqs
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold))
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
 
-        has_initial_states = None
+        has_initial_states_p = None
+        query_start_loc_p = None
         padded_decodes = num_decodes
+        num_computed_tokens, num_computed_tokens_p = None, None
+        block_idx_first_scheduled_token = None
+        block_idx_first_scheduled_token_p = None
+
+        # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here.
+        # We should consolidate this code
+        if self.vllm_config.cache_config.enable_prefix_caching:
+            # Return a tensor of shape (#requests, #max blocks)
+            state_indices_tensor = common_attn_metadata.block_table_tensor
+            mamba_block_size = self.kv_cache_spec.block_size
+            num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
+                self.device
+            )
+            (
+                block_idx_last_computed_token,
+                block_idx_first_scheduled_token,
+                block_idx_last_scheduled_token,
+            ) = self._compute_prefix_caching_block_indices(
+                common_attn_metadata, mamba_block_size
+            )
+        else:
+            # Always return just a single block per each request:
+            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
+            block_idx_last_scheduled_token = None
+            block_idx_last_computed_token = None
 
         if num_prefills > 0:
-            has_initial_states = context_lens_tensor > 0
-        elif (num_decodes > 0 and num_decodes <= self.decode_cudagraph_max_bs
-              and self.compilation_config.full_cuda_graph):
-            state_indices_for_decode = state_indices_tensor[:num_decodes]
+            query_start_loc_p = (
+                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
+                - num_decode_tokens
+            )
+            has_initial_states_cpu = (
+                common_attn_metadata.num_computed_tokens_cpu[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                > 0
+            )
+            has_initial_states_p = has_initial_states_cpu.to(
+                common_attn_metadata.query_start_loc.device
+            )
+
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                assert num_computed_tokens is not None
+                num_computed_tokens_p = num_computed_tokens[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                assert block_idx_first_scheduled_token is not None
+                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
+                    num_reqs - num_prefills : num_reqs
+                ]
+
+        elif (
+            num_decodes > 0
+            and num_decodes <= self.decode_cudagraph_max_bs
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
             padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes)
             self.state_indices_tensor[:num_decodes].copy_(
-                state_indices_for_decode, non_blocking=True)
+                state_indices_tensor, non_blocking=True
+            )
             state_indices_tensor = self.state_indices_tensor[:padded_decodes]
             state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                self.block_idx_last_scheduled_token[:num_decodes].copy_(
+                    block_idx_last_scheduled_token, non_blocking=True
+                )
+                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
+                    :padded_decodes
+                ]
+                block_idx_last_scheduled_token[num_decodes:] = 0
+
+                self.block_idx_last_computed_token[:num_decodes].copy_(
+                    block_idx_last_computed_token, non_blocking=True
+                )
+                block_idx_last_computed_token = self.block_idx_last_computed_token[
+                    :padded_decodes
+                ]
+                block_idx_last_computed_token[num_decodes:] = 0
+
         return Mamba1AttentionMetadata(
-            query_start_loc=query_start_loc,
-            context_lens_tensor=context_lens_tensor,
-            has_initial_states=has_initial_states,
+            query_start_loc_p=query_start_loc_p,
+            has_initial_states_p=has_initial_states_p,
             state_indices_tensor=state_indices_tensor,
             num_prefills=num_prefills,
             num_prefill_tokens=num_prefill_tokens,
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
             num_padded_decodes=padded_decodes,
+            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
+            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
+            block_idx_last_computed_token=block_idx_last_computed_token,
+            num_computed_tokens_p=num_computed_tokens_p,
         )
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 6f16fda962aea681b850beece5312694616b5aaa..888734e5d2b6b78ea50bee50a79b9df9c32ea876 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -1,109 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
+import itertools
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.mamba_attn import (
-    BaseMambaAttentionMetadataBuilder)
-from vllm.v1.attention.backends.utils import (PAD_SLOT_ID,
-                                              CommonAttentionMetadata,
-                                              compute_causal_conv1d_metadata,
-                                              split_decodes_and_prefills)
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
+from vllm.v1.attention.backends.utils import (
+    PAD_SLOT_ID,
+    CommonAttentionMetadata,
+    compute_causal_conv1d_metadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 
-def _query_start_loc_to_chunk_indices_offsets(
-        query_start_loc: torch.Tensor, chunk_size: int,
-        total_seqlens: int) -> tuple[torch.Tensor, torch.Tensor]:
+def compute_varlen_chunk_metadata(
+    query_start_loc: torch.Tensor,
+    chunk_size: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
-    Args:
-        query_start_loc (torch.Tensor): 1D tensor of cumulative sequence 
-            lengths, shape (num_seqs + 1,).
-            The first element should be 0. Each entry represents the starting
-            index of a sequence in the flattened token array.
-        chunk_size (int): The size of each physical mamba chunk
-            (number of tokens per chunk).
-        total_seqlens (int): The total number of tokens in the batch.
-
-    Returns:
-        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
-            - chunk_indices (torch.Tensor): 1D tensor of indices 
-                indicating the physical chunk for each logical chunk.
-            - chunk_offsets (torch.Tensor): 1D tensor of offsets
-                indicating the starting index of each logical chunk within
-                its physical chunk.
-
-    This function computes the chunk indices and offsets for the given
-    query_start_loc and chunk_size. Both are tensors of integers with length N,
-    where N is the number of logical (pseudo) chunks.
-    A logical chunk is a sequence of tokens that are all part of the same
-    sequence and are all in the same physical mamba chunk.
-    In other words, a logical chunk changes every time we cross a sequence
-    boundary or a physical mamba chunk boundary.
-    Logical chunks are needed to handle batched requests with initial states
-    (see _state_passing_fwd and _chunk_scan_fwd).
-    The chunk_indices tensor contains the index of the physical chunk for each
-    logical chunk.
-    The chunk_offsets tensor contains the offset (AKA starting index) of the
-    logical chunk in the physical chunk.
-
-    Example:
-    query_start_loc = [0, 5, 10]
-    chunk_size = 8
-    total_seqlens = 10
-    -> chunk_indices = [0, 0, 1]
-    -> chunk_offsets = [0, 5, 0]
-
-    In this example, we have 2 sequences, each with 5 tokens. The physical
-    chunk size is 8 tokens.
-    We have three logical chunks:
-    - the first logical chunk starts at token 0 in the first physical chunk
-        and contains all 5 tokens from the first sequence
-    - the second logical chunk starts at token 5 in the first physical chunk
-        and contains first 3 tokens from the second sequence
-    - the third logical chunk starts at token 0 in the second physical chunk
-        and contains the remaining 2 tokens from the second sequence
+    Build chunk-aligned, variable-length metadata used by Mamba2 SSD kernels.
+
+    Given per-sequence cumulative token starts `query_start_loc` of shape [B+1]
+    and a physical `chunk_size`, returns three tensors on the same device:
+      - cu_chunk_seqlens:  (nchunks+1,) int32   exclusive prefix-sum of
+        logical-chunk lengths (each logical chunk never crosses a sequence or
+        physical-chunk boundary).
+      - last_chunk_indices: (B,)       int32   index of the last logical chunk
+        for each sequence (=-1 for empty sequences).
+      - seq_idx_chunks:     (nchunks,) int32   sequence index for each logical
+        chunk in order.
+
+    This is intentionally lightweight and CPU-side; it mirrors the metadata
+    produced by the V1 Mamba2 meta-data builder and is exported so tests
+    (and other callers) can avoid duplicating the logic.
     """
+    assert query_start_loc.ndim == 1, "query_start_loc must be 1-D [B+1]"
+    assert int(query_start_loc[0].item()) == 0, "query_start_loc[0] must be 0"
+    device = query_start_loc.device
+
+    qsl64 = query_start_loc.to(torch.int64)
+    starts = qsl64[:-1].tolist()
+    ends = qsl64[1:].tolist()
+    total = int(qsl64[-1].item())
+
+    chunk_lens: list[int] = []
+    seq_idx_chunks: list[int] = []
+    last_chunk_indices: list[int] = [-1] * len(starts)
+
+    for b, (s, e) in enumerate(zip(starts, ends)):
+        if e <= s:
+            # empty sequence
+            continue
+        pos = s
+        while pos < e:
+            # split at both sequence boundaries and physical chunk boundaries
+            room = chunk_size - (pos % chunk_size)
+            take = min(room, e - pos)
+            chunk_lens.append(int(take))
+            seq_idx_chunks.append(b)
+            last_chunk_indices[b] = len(chunk_lens) - 1
+            pos += take
+
+    # Exclusive prefix sum over logical-chunk lengths
+    if chunk_lens:
+        cu_chunk_seqlens = torch.tensor(
+            [0] + list(itertools.accumulate(chunk_lens)),
+            device=device,
+            dtype=torch.int32,
+        )
+        # Final boundary must equal total tokens
+        assert int(cu_chunk_seqlens[-1].item()) == total
+    else:
+        cu_chunk_seqlens = torch.tensor([0], device=device, dtype=torch.int32)
 
-    cu_seqlens = query_start_loc[1:]  # remove prepended 0
-
-    # outputs will have length expansion of chunks that do not divide
-    # chunk_size
-    N = math.ceil(total_seqlens / chunk_size) + (cu_seqlens[:-1] % chunk_size
-                                                 > 0).sum()
-    chunk_indices = torch.arange(N,
-                                 dtype=torch.int,
-                                 device=query_start_loc.device)
-    chunk_offsets = torch.zeros((N, ),
-                                dtype=torch.int,
-                                device=query_start_loc.device)
-
-    p = 0  # num of insertions
-    for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]):
-
-        # if does not divide chunk_size, then there is one chunk insertion
-        p += (s % chunk_size > 0)
-
-        # get the dimensions
-        # - the + 1 for _e is to shift the boundary by one chunk
-        # - this shifting is not needed if chunk_size divides e
-        _s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size
-                                                             > 0)
-
-        # adjust indices and offsets
-        chunk_indices[_s:_e] -= p
-        chunk_offsets[_s] = s % chunk_size
-
-    return chunk_indices, chunk_offsets
+    last_chunk_indices_t = (
+        torch.tensor(last_chunk_indices, device=device, dtype=torch.int32)
+        if len(starts) > 0
+        else torch.empty((0,), device=device, dtype=torch.int32)
+    )
+    seq_idx_chunks_t = torch.tensor(seq_idx_chunks, device=device, dtype=torch.int32)
+    return cu_chunk_seqlens, last_chunk_indices_t, seq_idx_chunks_t
 
 
 class Mamba2AttentionBackend(AttentionBackend):
-
     @staticmethod
     def get_builder_cls() -> type["Mamba2AttentionMetadataBuilder"]:
         return Mamba2AttentionMetadataBuilder
@@ -123,94 +107,228 @@ class Mamba2AttentionMetadata:
 
     # The following tensors only contain prefill requests and will be None if
     # the batch has no prefill request.
-    has_initial_states_p: Optional[torch.Tensor]
-    seq_idx_p: Optional[torch.Tensor]
-    chunk_indices_p: Optional[torch.Tensor]
-    chunk_offsets_p: Optional[torch.Tensor]
+    has_initial_states_p: torch.Tensor | None
+    seq_idx_p: torch.Tensor | None
+
+    # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for
+    # each chunk, its offests into the varlen sequence dimension. It is defined
+    # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to
+    # cu_chunk_seqlen_p[i+1].
+    cu_chunk_seqlen_p: torch.Tensor | None
+
+    # last_chunk_indices_p is a tensor of shape (batch,) that contains the
+    # index of the last chunk for every sequence in the (prefill) batch.
+    last_chunk_indices_p: torch.Tensor | None
 
     state_indices_tensor: torch.Tensor  # shape: [batch,]
+    block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
+    block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
+    block_idx_last_computed_token: torch.Tensor  # shape: [batch,]
+    num_computed_tokens_p: torch.Tensor  # shape: [batch,]
 
     # The following attributes are for triton implementation of causal_conv1d
-    nums_dict: Optional[dict] = None
-    batch_ptr: Optional[torch.Tensor] = None
-    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
 
 class Mamba2AttentionMetadataBuilder(
-        BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]):
-
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]
+):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.chunk_size = vllm_config.model_config.get_mamba_chunk_size()
         assert self.chunk_size is not None, (
-            "chunk_size needs to be set in the model config for Mamba2 models")
+            "chunk_size needs to be set in the model config for Mamba2 models"
+        )
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> Mamba2AttentionMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> Mamba2AttentionMetadata:
         num_reqs = common_attn_metadata.num_reqs
-        query_start_loc_p = None
         seq_lens = common_attn_metadata.seq_lens
 
+        query_start_loc_p = None
         seq_idx_p = None
-        chunk_indices_p, chunk_offsets_p = None, None
+        cu_chunk_seqlen_p = None
+        last_chunk_indices_p = None
+
         # Need flags to indicate if there are initial states
-        # currently we really only support the FlashAttention backend
         has_initial_states_p = None
         prep_initial_states = False
 
         # for causal_conv1d
         nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None
 
-        state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
+        num_computed_tokens, num_computed_tokens_p = None, None
+        block_idx_first_scheduled_token = None
+        block_idx_first_scheduled_token_p = None
+
+        if self.vllm_config.cache_config.enable_prefix_caching:
+            # Return a tensor of shape (#requests, #max blocks)
+            state_indices_tensor = common_attn_metadata.block_table_tensor
+            # Additional cache-related varaiables:
+            mamba_block_size = self.kv_cache_spec.block_size
+            num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
+                self.device
+            )
+            (
+                block_idx_last_computed_token,
+                block_idx_first_scheduled_token,
+                block_idx_last_scheduled_token,
+            ) = self._compute_prefix_caching_block_indices(
+                common_attn_metadata, mamba_block_size
+            )
+        else:
+            # Always return just a single block per each request:
+            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
+            # Additional cache-related varaiables:
+            block_idx_last_scheduled_token = None
+            block_idx_last_computed_token = None
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold))
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
 
-        # Compute seq_idx, chunk_indices and chunk_offsets for prefill only
+        # Compute seq_idx for prefill only
         if num_prefills > 0:
-            #[batch,]
+            # [batch,]
             has_initial_states_cpu = (
-                common_attn_metadata.
-                num_computed_tokens_cpu[num_reqs - num_prefills:num_reqs] > 0)
+                common_attn_metadata.num_computed_tokens_cpu[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                > 0
+            )
             prep_initial_states = torch.any(has_initial_states_cpu).item()
             has_initial_states_p = has_initial_states_cpu.to(
-                common_attn_metadata.query_start_loc.device)
-
-            query_start_loc_p = common_attn_metadata.query_start_loc[
-                -num_prefills - 1:] - num_decode_tokens
-
-            seq_idx_p = torch.repeat_interleave(torch.arange(
-                num_prefills,
-                dtype=torch.int32,
-                device=query_start_loc_p.device),
-                                                query_start_loc_p.diff(),
-                                                output_size=num_prefill_tokens)
-
-            # We compute metadata for chunked prefill once at the top level
-            # model forward and reuse them in mamba layers. If not needed,
-            # they will be ignored inside mamba kernels.
-            if prep_initial_states:
-                chunk_indices_p, chunk_offsets_p = (
-                    _query_start_loc_to_chunk_indices_offsets(
-                        query_start_loc_p, self.chunk_size,
-                        num_prefill_tokens))
-
-            nums_dict, batch_ptr, token_chunk_offset_ptr = \
+                common_attn_metadata.query_start_loc.device
+            )
+
+            query_start_loc_p = (
+                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
+                - num_decode_tokens
+            )
+
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                assert num_computed_tokens is not None
+                num_computed_tokens_p = num_computed_tokens[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                assert block_idx_first_scheduled_token is not None
+                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
+                    num_reqs - num_prefills : num_reqs
+                ]
+            num_computed_tokens_p_cpu = common_attn_metadata.num_computed_tokens_cpu[
+                num_reqs - num_prefills : num_reqs
+            ]
+            query_start_loc_p_cpu = (
+                common_attn_metadata.query_start_loc_cpu[-num_prefills - 1 :]
+                - num_decode_tokens
+            )
+
+            # The code below carefully constructs the chunks such that:
+            # 1. Chunks contain tokens from a *single* sequence only.
+            # 2. For every sequence, we are guaranteed that we can
+            #    retrieve the mamba state *every* chunk_size tokens.
+            # Constraint (1) dramatically simplifies the mamba2 kernels.
+            # Constraint (2) dramatically simplifies the implementation
+            # of prefix caching for mamba2 (wip). We need to take care
+            # of the interaction with chunked prefill in order to
+            # satisfy constraint (2).
+            # TODO (tdoublep): This code could probably be optimized.
+            cu_chunk_seqlen = []
+            seq_idx = []
+            last_chunk_indices = []
+            seqlen_pos = 0
+            for req_idx in range(num_prefills):
+                this_num_computed = num_computed_tokens_p_cpu[req_idx].item()
+                this_new_tokens = (
+                    query_start_loc_p_cpu[req_idx + 1].item()
+                    - query_start_loc_p_cpu[req_idx].item()
+                )
+
+                # if computed tokens are not chunk-aligned, use the first
+                # chunk to finish it off
+                if this_num_computed % self.chunk_size != 0:
+                    seq_idx.append(req_idx)
+                    cu_chunk_seqlen.append(seqlen_pos)
+                    # how many tokens to finish the chunk?
+                    chunk_len = (
+                        cdiv(this_num_computed, self.chunk_size) * self.chunk_size
+                        - this_num_computed
+                    )
+                    # we can only use at most this_new_tokens
+                    chunk_len = min(chunk_len, this_new_tokens)
+                    seqlen_pos += chunk_len
+                    this_new_tokens -= chunk_len
+
+                n_chunks = cdiv(this_new_tokens, self.chunk_size)
+                for chunk in range(n_chunks):
+                    seq_idx.append(req_idx)
+                    cu_chunk_seqlen.append(seqlen_pos)
+                    chunk_len = min(self.chunk_size, this_new_tokens)
+                    seqlen_pos += chunk_len
+                    this_new_tokens -= chunk_len
+
+                assert this_new_tokens == 0
+                last_chunk_indices.append(len(cu_chunk_seqlen) - 1)
+
+            cu_chunk_seqlen.append(seqlen_pos)
+
+            seq_idx_p = torch.as_tensor(
+                seq_idx, device=query_start_loc_p.device, dtype=torch.int32
+            )
+            cu_chunk_seqlen_p = torch.as_tensor(
+                cu_chunk_seqlen, device=query_start_loc_p.device, dtype=torch.int32
+            )
+            last_chunk_indices_p = torch.as_tensor(
+                last_chunk_indices, device=query_start_loc_p.device, dtype=torch.int32
+            )
+
+            nums_dict, batch_ptr, token_chunk_offset_ptr = (
                 compute_causal_conv1d_metadata(query_start_loc_p)
+            )
 
-        elif num_decodes <= self.decode_cudagraph_max_bs:
+        elif (
+            num_decodes <= self.decode_cudagraph_max_bs
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
             # Pad state tensor for CUDA graph
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes)
-            self.state_indices_tensor[:num_decodes].copy_(state_indices_tensor,
-                                                          non_blocking=True)
+            self.state_indices_tensor[:num_decodes].copy_(
+                state_indices_tensor, non_blocking=True
+            )
             state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
             state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                self.block_idx_last_scheduled_token[:num_decodes].copy_(
+                    block_idx_last_scheduled_token, non_blocking=True
+                )
+                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
+                    :num_input_tokens
+                ]
+                block_idx_last_scheduled_token[num_decodes:] = 0
+
+                self.block_idx_last_computed_token[:num_decodes].copy_(
+                    block_idx_last_computed_token, non_blocking=True
+                )
+                block_idx_last_computed_token = self.block_idx_last_computed_token[
+                    :num_input_tokens
+                ]
+                block_idx_last_computed_token[num_decodes:] = 0
+
         attn_metadata = Mamba2AttentionMetadata(
             num_prefills=num_prefills,
             num_prefill_tokens=num_prefill_tokens,
@@ -222,11 +340,15 @@ class Mamba2AttentionMetadataBuilder(
             chunk_size=self.chunk_size,
             has_initial_states_p=has_initial_states_p,
             seq_idx_p=seq_idx_p,
-            chunk_indices_p=chunk_indices_p,
-            chunk_offsets_p=chunk_offsets_p,
             state_indices_tensor=state_indices_tensor,
+            cu_chunk_seqlen_p=cu_chunk_seqlen_p,
+            last_chunk_indices_p=last_chunk_indices_p,
             nums_dict=nums_dict,
             batch_ptr=batch_ptr,
             token_chunk_offset_ptr=token_chunk_offset_ptr,
+            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
+            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
+            block_idx_last_computed_token=block_idx_last_computed_token,
+            num_computed_tokens_p=num_computed_tokens_p,
         )
         return attn_metadata
diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py
index ef342ce421ae19eebc968ccd978b7ecd8427e622..0d875565fc99a82bfd32dbde01d62958b0f5ba16 100644
--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -7,9 +7,12 @@ from typing import ClassVar, TypeVar
 import torch
 
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 M = TypeVar("M")
@@ -17,36 +20,96 @@ M = TypeVar("M")
 
 class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
     reorder_batch_threshold: int = 1
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
+    _cudagraph_support: ClassVar[AttentionCGSupport] = (
         AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    )
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
         assert isinstance(kv_cache_spec, MambaSpec)
         self.compilation_config = vllm_config.compilation_config
         self.decode_cudagraph_max_bs = min(
             self.vllm_config.scheduler_config.max_num_seqs,
-            self.compilation_config.max_capture_size)
-        self.state_indices_tensor = torch.empty(
-            (self.decode_cudagraph_max_bs, ),
-            dtype=torch.int32,
-            device=device,
+            self.compilation_config.max_cudagraph_capture_size,
         )
 
+        if self.vllm_config.cache_config.enable_prefix_caching:
+            self.state_indices_tensor = torch.empty(
+                (
+                    self.decode_cudagraph_max_bs,
+                    cdiv(
+                        self.vllm_config.model_config.max_model_len,
+                        self.kv_cache_spec.block_size,
+                    ),
+                ),
+                dtype=torch.int32,
+                device=device,
+            )
+            self.block_idx_last_scheduled_token = torch.empty(
+                (self.decode_cudagraph_max_bs,),
+                dtype=torch.int32,
+                device=device,
+            )
+            self.block_idx_last_computed_token = torch.empty(
+                (self.decode_cudagraph_max_bs,),
+                dtype=torch.int32,
+                device=device,
+            )
+        else:
+            self.state_indices_tensor = torch.empty(
+                (self.decode_cudagraph_max_bs,),
+                dtype=torch.int32,
+                device=device,
+            )
+
     def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata) -> M:
+        self, common_attn_metadata: CommonAttentionMetadata
+    ) -> M:
         """
         This method builds the metadata for full cudagraph capture.
         Currently, only decode is supported for full cudagraphs with Mamba.
         """
         m = common_attn_metadata
 
-        assert m.num_reqs == m.num_actual_tokens, \
-            "Mamba only supports decode-only full CUDAGraph capture. " \
+        assert m.num_reqs == m.num_actual_tokens, (
+            "Mamba only supports decode-only full CUDAGraph capture. "
             "Make sure all cudagraph capture sizes <= max_num_seq."
+        )
 
         m.max_query_len = 1  # decode-only
 
         return self.build(0, m)
+
+    def _compute_prefix_caching_block_indices(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+        mamba_block_size: int,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
+            self.device
+        )
+        # Block index of the last computed token
+        block_idx_last_computed_token = cdiv(num_computed_tokens, mamba_block_size) - 1
+        # which is <= block index for the first scheduled token
+        block_idx_first_scheduled_token = (
+            cdiv(num_computed_tokens + 1, mamba_block_size) - 1
+        )
+        # which is <= block index of the last scheduled token
+        block_idx_last_scheduled_token = (
+            cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1
+        )
+        # -1 in case it's non-computed and causes later issues with indexing
+        block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
+
+        return (
+            block_idx_last_computed_token,
+            block_idx_first_scheduled_token,
+            block_idx_last_scheduled_token,
+        )
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
old mode 100755
new mode 100644
index 963f1c5abf2a719312d6c34d8a76110e6951db1c..6e797e87b18ce66265a665bfe8715460f40c077c
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -190,16 +190,20 @@ return curr_o @ W_O
 import functools
 from abc import abstractmethod
 from dataclasses import dataclass, field
-from typing import Generic, Optional, TypeVar, Union
+from enum import Enum
+from typing import ClassVar, Generic, TypeVar
 
 import torch
 from tqdm import tqdm
 
-import vllm.envs as envs
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata,
-                                              MLAAttentionImpl)
+from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionLayer,
+    MLAAttentionImpl,
+)
 from vllm.attention.backends.utils import get_mla_dims
 from vllm.attention.ops.common import cp_lse_ag_out_rs
 from vllm.attention.ops.merge_attn_states import merge_attn_states
@@ -207,21 +211,48 @@ from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearBase,
-                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    LinearBase,
+    UnquantizedLinearMethod,
+)
 from vllm.platforms import current_platform
-from vllm.utils import cdiv, round_down
 from vllm.utils.flashinfer import has_nvidia_artifactory
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              get_per_layer_parameters,
-                                              infer_global_hyperparameters,
-                                              split_decodes_and_prefills)
+from vllm.utils.math_utils import cdiv, round_down
+from vllm.v1.attention.backends.utils import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    get_dcp_local_seq_lens,
+    get_per_layer_parameters,
+    infer_global_hyperparameters,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
+
+class QueryLenSupport(Enum):
+    """Defines the level of query length support for an attention backend's
+    decode pipeline.
+
+    - SINGLE_ONLY: Decode pipeline only supports single-token queries
+                   (query_len=1)
+    - UNIFORM: Decode pipeline supports uniform multi-token queries
+               (all requests must have same query_len > 1)
+    - VARLEN: Decode pipeline supports variable-length queries
+              (mixed query lengths in same batch)
+    """
+
+    SINGLE_ONLY = "single_only"
+    UNIFORM = "uniform"
+    VARLEN = "varlen"
+
+
 try:
     from vllm.vllm_flash_attn import flash_attn_varlen_func
+
     is_vllm_fa = True
 except ImportError:
     # For rocm use upstream flash attention
@@ -231,32 +262,24 @@ except ImportError:
 
 try:
     from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
-    from flashinfer.prefill import (  # noqa: F401
-        cudnn_batch_prefill_with_kv_cache)
+    from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache  # noqa: F401
+
     flashinfer_available = True
 except ImportError:
-    flashinfer_available = False
-
-
-def is_rocm_aiter_fp8bmm_enabled() -> bool:
-    return current_platform.is_rocm() \
-        and envs.VLLM_ROCM_USE_AITER_FP8BMM \
-        and envs.VLLM_ROCM_USE_AITER
+    BatchPrefillWithRaggedKVCacheWrapper = object
 
+    flashinfer_available = False
 
-if is_rocm_aiter_fp8bmm_enabled():
-    from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import (  # noqa: E501 # isort: skip
-        batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant
-        as aiter_triton_fp8_bmm)
 
-    def dynamic_per_batched_tensor_quant(
-            x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn):
-        DTYPE_MAX = torch.finfo(dtype).max
-        min_val, max_val = x.aminmax()
-        amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10)
-        scale = DTYPE_MAX / amax
-        x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX)
-        return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+def dynamic_per_batched_tensor_quant(
+    x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn
+):
+    DTYPE_MAX = torch.finfo(dtype).max
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10)
+    scale = DTYPE_MAX / amax
+    x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
 logger = init_logger(__name__)
@@ -265,17 +288,12 @@ CUDNN_WORKSPACE_SIZE = 12800
 
 
 class MLACommonBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
 
     @staticmethod
     def get_name() -> str:
         return "TRITON_MLA"
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return MLACommonMetadata
-
     @staticmethod
     def get_builder_cls() -> type["MLACommonMetadataBuilder"]:
         return MLACommonMetadataBuilder
@@ -290,29 +308,18 @@ class MLACommonBackend(AttentionBackend):
     ) -> tuple[int, ...]:
         return (num_blocks, block_size, head_size)
 
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
-
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
         return [576]
 
     @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
+    def is_mla(cls) -> bool:
+        return True
 
 
 @dataclass
 class MLACommonPrefillMetadata:
-    """ Prefill Specific Metadata """
+    """Prefill Specific Metadata"""
 
     @dataclass
     class ChunkedContextMetadata:
@@ -326,40 +333,40 @@ class MLACommonPrefillMetadata:
         workspace: torch.Tensor
 
         # for mla DCP
-        cp_chunk_seq_lens: Optional[list[list[int]]] = None
-        origin_context_lens: Optional[list[int]] = None
-        cp_cu_seq_lens: Optional[torch.Tensor] = None
-        chunk_size: Optional[int] = None
-        cu_seq_lens_lst: Optional[list[list[int]]] = None
+        padded_local_chunk_seq_lens: list[list[int]] | None = None
+        local_context_lens_allranks: list[list[int]] | None = None
+        padded_local_cu_seq_lens: torch.Tensor | None = None
+        cu_seq_lens_lst: list[list[int]] | None = None
+        chunk_size: int | None = None
 
     block_table: torch.Tensor
     query_start_loc: torch.Tensor
     max_query_len: int
-    chunked_context: Optional[ChunkedContextMetadata] = None
+    chunked_context: ChunkedContextMetadata | None = None
+    query_seq_lens: torch.Tensor | None = None
 
 
 @dataclass
 class FlashInferPrefillMetadata(MLACommonPrefillMetadata):
-    prefill_main: Optional['BatchPrefillWithRaggedKVCacheWrapper'] = None
-    prefill_chunks: list['BatchPrefillWithRaggedKVCacheWrapper'] = field(
-        default_factory=list)
+    prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
+    prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = field(
+        default_factory=list
+    )
 
 
 @dataclass
 class CudnnPrefillMetadata(MLACommonPrefillMetadata):
-
-    class ChunkedContextMetadata(
-            MLACommonPrefillMetadata.ChunkedContextMetadata):
+    class ChunkedContextMetadata(MLACommonPrefillMetadata.ChunkedContextMetadata):
         seq_lens: torch.Tensor
 
-    query_seq_lens: Optional[torch.Tensor] = None
-    cudnn_workspace: Optional[torch.Tensor] = None
+    cudnn_workspace: torch.Tensor | None = None
 
 
 @dataclass
 class MLACommonDecodeMetadata:
     block_table: torch.Tensor
     seq_lens: torch.Tensor
+    dcp_tot_seq_lens: torch.Tensor | None
 
 
 D = TypeVar("D", bound=MLACommonDecodeMetadata)
@@ -372,6 +379,7 @@ class MLACommonMetadata(Generic[D]):
     NOTE: Please read the comment at the top of the file before trying to
     understand this class
     """
+
     # NOTE(sang): Definition of context_len, query_len, and seq_len.
     # |---------- N-1 iteration --------|
     # |---------------- N iteration ---------------------|
@@ -395,16 +403,21 @@ class MLACommonMetadata(Generic[D]):
     num_prefills: int
 
     # The dimension of the attention heads
-    head_dim: Optional[int] = None
+    head_dim: int | None = None
 
-    decode: Optional[D] = None
-    prefill: Optional[Union[MLACommonPrefillMetadata,
-                            FlashInferPrefillMetadata,
-                            CudnnPrefillMetadata]] = None
+    decode: D | None = None
+    prefill: (
+        MLACommonPrefillMetadata
+        | FlashInferPrefillMetadata
+        | CudnnPrefillMetadata
+        | None
+    ) = None
 
     def __post_init__(self):
-        if self.head_dim is not None:
-            MLACommonBackend.validate_head_size(self.head_dim)
+        if self.head_dim is not None and not MLACommonBackend.supports_head_size(
+            self.head_dim
+        ):
+            raise ValueError(f"Head dimension {self.head_dim} is not supported by MLA.")
 
 
 M = TypeVar("M", bound=MLACommonMetadata)
@@ -414,21 +427,31 @@ A = TypeVar("A")
 def use_flashinfer_prefill() -> bool:
     # For blackwell default to flashinfer prefill if it's available since
     # it is faster than FA2.
-    return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available
-            and not envs.VLLM_USE_CUDNN_PREFILL
-            and current_platform.is_device_capability(100))
+    return (
+        not envs.VLLM_DISABLE_FLASHINFER_PREFILL
+        and flashinfer_available
+        and not envs.VLLM_USE_CUDNN_PREFILL
+        and not envs.VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL
+        and current_platform.is_device_capability(100)
+    )
 
 
 def use_cudnn_prefill() -> bool:
-    return (flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL
-            and current_platform.is_device_capability(100)
-            and has_nvidia_artifactory())
+    return (
+        flashinfer_available
+        and envs.VLLM_USE_CUDNN_PREFILL
+        and current_platform.is_device_capability(100)
+        and has_nvidia_artifactory()
+    )
 
 
-# Currently 394MB, this can be tuned based on GEMM sizes used.
-# Chosen to be the same as sglang:
-#  https://github.com/sgl-project/sglang/blob/766392c6bda2558b61ce6d1c1bfd8081a549e1f1/python/sglang/global_config.py#L37
-FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024
+def use_trtllm_ragged_deepseek_prefill() -> bool:
+    """Check if TRT-LLM ragged DeepSeek prefill should be used."""
+    return (
+        flashinfer_available
+        and envs.VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL
+        and current_platform.is_device_capability(100)
+    )
 
 
 class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
@@ -436,19 +459,34 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
     NOTE: Please read the comment at the top of the file before trying to
     understand this class
     """
+
+    # Defines the level of query length support for this backend.
+    # - SINGLE_ONLY: Only single-token queries (no spec decode support)
+    # - UNIFORM: Supports uniform multi-token queries (spec decode with uniform lengths)
+    # - VARLEN: Supports variable-length queries (spec decode with mixed lengths)
+    # If set to UNIFORM or VARLEN, this will increase `reorder_batch_threshold` when
+    # speculative decoding is enabled.
+    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.SINGLE_ONLY
+
+    # The threshold for reordering the batch into decode and prefill requests.
+    # If > 1, the batch will be reordered such that requests with
+    # query length <= threshold are classified as decode requests.
+    # Use `query_len_support` (above) to set this automatically
+    # when speculative decoding is enabled.
     reorder_batch_threshold: int = 1
 
     @staticmethod
-    def determine_chunked_prefill_workspace_size(
-            vllm_config: VllmConfig) -> int:
+    def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
         scheduler_config = vllm_config.scheduler_config
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
 
         chunked_prefill_workspace_size = min(
             # Try for 8 full length request or at least 4 pages per-request
-            max(8 * model_config.max_model_len,
-                4 * scheduler_config.max_num_seqs * cache_config.block_size),
+            max(
+                8 * model_config.max_model_len,
+                4 * scheduler_config.max_num_seqs * cache_config.block_size,
+            ),
             # For long-context models try not to over-allocate limiting
             # kv-cache space, limiting it to 64k tokens,
             # which would result in the workspace being:
@@ -457,32 +495,38 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             # which would result in up-projected context being
             #   2*(192*128)*(64*1024) = 3gb
             # (assuming 192 QK head dim, 128 heads, and fp16)
-            64 * 1024)
+            64 * 1024,
+        )
 
         # Enforce that we enough for at least 1 page per request
         chunked_prefill_workspace_size = max(
             chunked_prefill_workspace_size,
-            scheduler_config.max_num_seqs * cache_config.block_size)
+            scheduler_config.max_num_seqs * cache_config.block_size,
+        )
 
         return chunked_prefill_workspace_size
 
-    def __init__(self,
-                 kv_cache_spec: AttentionSpec,
-                 layer_names: list[str],
-                 vllm_config: VllmConfig,
-                 device: torch.device,
-                 metadata_cls: Optional[type[M]] = None):
-        self.metadata_cls = metadata_cls \
-            if metadata_cls is not None else MLACommonMetadata
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+        metadata_cls: type[M] | None = None,
+        supports_dcp_with_varlen: bool = False,
+    ):
+        self.metadata_cls = (
+            metadata_cls if metadata_cls is not None else MLACommonMetadata
+        )
         self.kv_cache_spec = kv_cache_spec
         scheduler_config = vllm_config.scheduler_config
         self.model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         self.compilation_config = vllm_config.compilation_config
+        self.vllm_config = vllm_config
         self.device = device
 
-        self.num_heads = self.model_config.get_num_attention_heads(
-            parallel_config)
+        self.num_heads = self.model_config.get_num_attention_heads(parallel_config)
         self.mla_dims = get_mla_dims(self.model_config)
         self.aot_schedule = current_platform.is_cuda()
         try:
@@ -492,57 +536,73 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             # DCP might not be initialized in testing
             self.dcp_world_size = 1
             self.dcp_rank = 0
+        self.dcp_local_block_size = parallel_config.dcp_kv_cache_interleave_size
+        self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size
 
         # Don't try to access the runner on AMD
         if self.aot_schedule:
             self.page_size = self.kv_cache_spec.block_size
 
-        self.chunked_prefill_workspace_size = \
+        self.chunked_prefill_workspace_size = (
             self.determine_chunked_prefill_workspace_size(vllm_config)
+        )
 
         if self.dcp_world_size > 1:
             # Note(hc): The local kvcache is incomplete when DCP is triggered,
             # an additional kvcache allgather across the DCP group is therefore
             # required, so the workspace has to be enlarged by 1/DCP relative
             # to the original TP allocation.
-            assert self.chunked_prefill_workspace_size % \
-                self.dcp_world_size == 0
+            assert self.chunked_prefill_workspace_size % self.dcp_world_size == 0
             self.chunked_prefill_workspace = torch.empty(
-                (self.chunked_prefill_workspace_size +
-                 self.chunked_prefill_workspace_size // self.dcp_world_size,
-                 self.model_config.get_head_size()),
+                (
+                    self.chunked_prefill_workspace_size
+                    + self.chunked_prefill_workspace_size // self.dcp_world_size,
+                    self.model_config.get_head_size(),
+                ),
                 dtype=self.model_config.dtype,
                 device=device,
             )
         else:
             self.chunked_prefill_workspace = torch.empty(
-                (self.chunked_prefill_workspace_size,
-                 self.model_config.get_head_size()),
+                (
+                    self.chunked_prefill_workspace_size,
+                    self.model_config.get_head_size(),
+                ),
                 dtype=self.model_config.dtype,
                 device=device,
             )
 
         self._use_cudnn_prefill = use_cudnn_prefill()
         self._use_fi_prefill = use_flashinfer_prefill()
+        self._use_trtllm_ragged_prefill = use_trtllm_ragged_deepseek_prefill()
         self.prefill_metadata_cls = (
             FlashInferPrefillMetadata
-            if self._use_fi_prefill else CudnnPrefillMetadata
-            if self._use_cudnn_prefill else MLACommonPrefillMetadata)
+            if self._use_fi_prefill
+            else CudnnPrefillMetadata
+            if self._use_cudnn_prefill
+            else MLACommonPrefillMetadata
+        )
 
         if self._use_fi_prefill:
             self._workspace_buffer = torch.empty(
-                FLASHINFER_WORKSPACE_BUFFER_SIZE,
+                envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE,
                 dtype=torch.uint8,
-                device=device)
+                device=device,
+            )
 
-            self._fi_prefill_main: Optional[
-                BatchPrefillWithRaggedKVCacheWrapper] = None
-            self._fi_prefill_chunks: list[
-                BatchPrefillWithRaggedKVCacheWrapper] = []
+            self._fi_prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
+            self._fi_prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
 
             self._global_hyperparameters = infer_global_hyperparameters(
-                get_per_layer_parameters(vllm_config, layer_names,
-                                         MLACommonImpl))
+                get_per_layer_parameters(vllm_config, layer_names, MLACommonImpl)
+            )
+
+        if self._use_trtllm_ragged_prefill:
+            self._workspace_buffer = torch.empty(
+                envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE,
+                dtype=torch.uint8,
+                device=device,
+            )
 
         if self._use_cudnn_prefill:
             self.cudnn_workspace = torch.empty(
@@ -551,6 +611,18 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 device=device,
             )
 
+        supports_spec_decode = self.query_len_support != QueryLenSupport.SINGLE_ONLY
+        self._init_reorder_batch_threshold(
+            self.reorder_batch_threshold, supports_spec_decode, supports_dcp_with_varlen
+        )
+
+        # Validate consistency between query_len_support and reorder_batch_threshold
+        if self.query_len_support == QueryLenSupport.SINGLE_ONLY:
+            assert self.reorder_batch_threshold == 1, (
+                f"reorder_batch_threshold must be 1 when query_len_support is "
+                f"SINGLE_ONLY, got {self.reorder_batch_threshold}"
+            )
+
     def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata):
         qo_indptr = prefill.query_start_loc
 
@@ -561,7 +633,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
 
         if self._fi_prefill_main is None:
             self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper(
-                self._workspace_buffer, "NHD", backend="cutlass")
+                self._workspace_buffer, "NHD", backend="cutlass"
+            )
 
         if has_context:
             num_chunks = chunked_context.cu_seq_lens.shape[0]
@@ -570,7 +643,9 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 for _ in range(len(self._fi_prefill_chunks), num_chunks):
                     self._fi_prefill_chunks.append(
                         BatchPrefillWithRaggedKVCacheWrapper(
-                            self._workspace_buffer, "NHD", backend="cutlass"))
+                            self._workspace_buffer, "NHD", backend="cutlass"
+                        )
+                    )
             assert num_chunks <= len(self._fi_prefill_chunks)
 
         # In MLA, the non-latent num_qo_heads == num_kv_heads
@@ -581,8 +656,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         assert self.kv_cache_spec.num_kv_heads == 1
 
         # Get non-latent head_dim_qk and head_dim_vo
-        head_dim_qk = (self.mla_dims.qk_nope_head_dim +
-                       self.mla_dims.qk_rope_head_dim)
+        head_dim_qk = self.mla_dims.qk_nope_head_dim + self.mla_dims.qk_rope_head_dim
         head_dim_vo = self.mla_dims.v_head_dim
 
         # For main run, qo_indptr == kv_indptr
@@ -618,45 +692,52 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                     causal=False,  # This is context run
                     sm_scale=self._global_hyperparameters.sm_scale,
                     window_left=self._global_hyperparameters.window_left,
-                    logits_soft_cap=self._global_hyperparameters.
-                    logits_soft_cap,
+                    logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
                     q_data_type=self.model_config.dtype,
                 )
 
         prefill.prefill_main = self._fi_prefill_main
         prefill.prefill_chunks = self._fi_prefill_chunks
 
-    def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens_cpu: torch.Tensor,
-                      seq_lens_device: torch.Tensor,
-                      query_start_loc_cpu: torch.Tensor,
-                      query_start_loc_device: torch.Tensor,
-                      num_decode_tokens: int) -> MLACommonDecodeMetadata:
+    def _build_decode(
+        self,
+        block_table_tensor: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        seq_lens_device: torch.Tensor,
+        query_start_loc_cpu: torch.Tensor,
+        query_start_loc_device: torch.Tensor,
+        num_decode_tokens: int,
+        dcp_tot_seq_lens_device: torch.Tensor | None,
+    ) -> MLACommonDecodeMetadata:
         return MLACommonDecodeMetadata(
             block_table=block_table_tensor,
             seq_lens=seq_lens_device,
+            dcp_tot_seq_lens=dcp_tot_seq_lens_device,
         )
 
     def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata) -> M:
+        self, common_attn_metadata: CommonAttentionMetadata
+    ) -> M:
         """
         This method builds the metadata for full cudagraph capture.
         Currently, only decode is supported for full cudagraphs with MLA.
         """
         m = common_attn_metadata
-        assert m.num_reqs <= (m.num_actual_tokens *
-                              self.reorder_batch_threshold), \
-            "MLA only supports decode-only full CUDAGraph capture. " \
+        assert m.num_reqs <= (m.num_actual_tokens * self.reorder_batch_threshold), (
+            "MLA only supports decode-only full CUDAGraph capture. "
             "Make sure all cudagraph capture sizes <= max_num_seq."
+        )
 
         assert m.max_query_len <= self.reorder_batch_threshold  # decode only
 
         return self.build(0, m)
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> M:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> M:
         num_reqs = common_attn_metadata.num_reqs
         num_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
@@ -673,21 +754,20 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
         seq_lens = common_attn_metadata.seq_lens
         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
+        dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens
+        dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu
 
         query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
 
-        num_computed_tokens_cpu = (common_attn_metadata.seq_lens_cpu -
-                                   query_seq_lens_cpu)
-
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=self.reorder_batch_threshold)
+        num_computed_tokens_cpu = common_attn_metadata.seq_lens_cpu - query_seq_lens_cpu
 
-        # Note(hc): update seq_lens of decode reqs under DCP.
-        if self.dcp_world_size > 1:
-            seq_lens[:num_decodes] = seq_lens[:num_decodes] \
-                // self.dcp_world_size + (self.dcp_rank <= \
-                (seq_lens[:num_decodes] - 1) % self.dcp_world_size)
+        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold,
+                require_uniform=(self.query_len_support != QueryLenSupport.VARLEN),
+            )
+        )
 
         assert num_decodes + num_prefills == num_reqs
         assert num_decode_tokens + num_prefill_tokens == num_tokens
@@ -697,14 +777,11 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             reqs_start = num_decodes  # prefill_start
 
             context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
-            # Note(hc): The context lengths in the perspective of dcp rank0.
-            cp_context_lens_cpu = torch.ceil(context_lens_cpu.float() /
-                                             self.dcp_world_size).int()
-            origin_context_lens = context_lens_cpu.tolist()
             max_context_len_cpu = context_lens_cpu.max().item()
             num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
-            prefill_query_start_loc = query_start_loc[
-                reqs_start:] - query_start_loc[reqs_start]
+            prefill_query_start_loc = (
+                query_start_loc[reqs_start:] - query_start_loc[reqs_start]
+            )
 
             chunked_context_metadata = None
             if max_context_len_cpu > 0:
@@ -716,16 +793,16 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 # prefill in the batch, we could probably use a more advanced
                 # algorithm here and allocate more workspace to prefills with
                 # longer context lengths
-                max_context_chunk = (self.chunked_prefill_workspace_size //
-                                     num_prefills_with_context_cpu)
+                max_context_chunk = (
+                    self.chunked_prefill_workspace_size // num_prefills_with_context_cpu
+                )
 
                 if self.aot_schedule:
                     # align max_context_chunk to page_size by rounding down,
                     # currently the `gather_and_maybe_dequant_cache` kernel
                     # cannot handle `context_chunk_starts` that are not aligned
                     # to page_size
-                    max_context_chunk = round_down(max_context_chunk,
-                                                   self.page_size)
+                    max_context_chunk = round_down(max_context_chunk, self.page_size)
 
                 assert max_context_chunk > 0
                 num_chunks = cdiv(max_context_len_cpu, max_context_chunk)
@@ -736,77 +813,102 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 #  [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]]
                 # Note(simon): this is done in CPU because of downstream's
                 # of `to_list`.
-                chunk_starts = \
-                    torch.arange(num_chunks, dtype=torch.int32) \
-                    .unsqueeze(1).expand(-1, num_prefills) \
+                chunk_starts = (
+                    torch.arange(num_chunks, dtype=torch.int32)
+                    .unsqueeze(1)
+                    .expand(-1, num_prefills)
                     * max_context_chunk
-                chunk_ends = torch.min(context_lens_cpu.unsqueeze(0),
-                                       chunk_starts + max_context_chunk)
+                )
+                chunk_ends = torch.min(
+                    context_lens_cpu.unsqueeze(0), chunk_starts + max_context_chunk
+                )
                 chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0)
 
-                cu_seq_lens_cpu = torch.zeros(num_chunks,
-                                              num_prefills + 1,
-                                              dtype=torch.int32,
-                                              pin_memory=True)
-                torch.cumsum(chunk_seq_lens,
-                             dim=1,
-                             out=cu_seq_lens_cpu[:, 1:],
-                             dtype=torch.int32)
+                cu_seq_lens_cpu = torch.zeros(
+                    num_chunks, num_prefills + 1, dtype=torch.int32, pin_memory=True
+                )
+                torch.cumsum(
+                    chunk_seq_lens, dim=1, out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32
+                )
 
                 if self.dcp_world_size > 1:
+                    local_context_lens_allranks = get_dcp_local_seq_lens(
+                        context_lens_cpu,
+                        self.dcp_world_size,
+                        None,
+                        self.dcp_local_block_size,
+                    )
+                    # Note(qcs): The max local context lengths
+                    # padded to `dcp_local_block_size`.
+                    padded_local_context_lens_cpu = (
+                        cdiv(
+                            context_lens_cpu,
+                            self.dcp_virtual_block_size,
+                        )
+                        * self.dcp_local_block_size
+                    )
                     # Note(hc): The above max_context_chunk already enforces
                     # block_size alignment, DCP just need the block_size can
                     # be divisible by dcp_world_size, because DCP use
                     # cp_gather_cache which not require `cp_chunk_starts`
                     # aligned to page_size.
                     assert max_context_chunk % self.dcp_world_size == 0
-                    cp_max_context_chunk = max_context_chunk // \
-                        self.dcp_world_size
-                    cp_chunk_starts = \
-                        torch.arange(num_chunks, dtype=torch.int32) \
-                        .unsqueeze(1).expand(-1, num_prefills) \
-                        * cp_max_context_chunk
-                    cp_chunk_ends = torch.min(
-                        cp_context_lens_cpu.unsqueeze(0),
-                        cp_chunk_starts + cp_max_context_chunk)
-                    cp_chunk_seq_lens = (cp_chunk_ends -
-                                         cp_chunk_starts).clamp(min=0)
-
-                    cp_cu_seq_lens_cpu = torch.zeros(num_chunks,
-                                                     num_prefills + 1,
-                                                     dtype=torch.int32,
-                                                     pin_memory=True)
-                    torch.cumsum(cp_chunk_seq_lens,
-                                 dim=1,
-                                 out=cp_cu_seq_lens_cpu[:, 1:],
-                                 dtype=torch.int32)
-
-                chunked_context_metadata_cls = \
-                    CudnnPrefillMetadata.ChunkedContextMetadata \
-                    if self._use_cudnn_prefill else \
-                        MLACommonPrefillMetadata.ChunkedContextMetadata
+                    padded_local_max_context_chunk_across_ranks = (
+                        cdiv(
+                            max_context_chunk,
+                            self.dcp_virtual_block_size,
+                        )
+                        * self.dcp_local_block_size
+                    )
+                    local_chunk_starts = (
+                        torch.arange(num_chunks, dtype=torch.int32)
+                        .unsqueeze(1)
+                        .expand(-1, num_prefills)
+                        * padded_local_max_context_chunk_across_ranks
+                    )
+                    local_chunk_ends = torch.min(
+                        padded_local_context_lens_cpu.unsqueeze(0),
+                        local_chunk_starts
+                        + padded_local_max_context_chunk_across_ranks,
+                    )
+                    padded_local_chunk_seq_lens = (
+                        local_chunk_ends - local_chunk_starts
+                    ).clamp(min=0)
+
+                    padded_local_cu_chunk_seq_lens_cpu = torch.zeros(
+                        num_chunks, num_prefills + 1, dtype=torch.int32, pin_memory=True
+                    )
+                    torch.cumsum(
+                        padded_local_chunk_seq_lens,
+                        dim=1,
+                        out=padded_local_cu_chunk_seq_lens_cpu[:, 1:],
+                        dtype=torch.int32,
+                    )
+
+                chunked_context_metadata_cls = (
+                    CudnnPrefillMetadata.ChunkedContextMetadata
+                    if self._use_cudnn_prefill
+                    else MLACommonPrefillMetadata.ChunkedContextMetadata
+                )
                 if self.dcp_world_size > 1:
-                    chunked_context_metadata = \
-                        chunked_context_metadata_cls(
-                        cu_seq_lens=cu_seq_lens_cpu \
-                            .to(device, non_blocking=True),
-                        starts=cp_chunk_starts.to(device, non_blocking=True),
-                        seq_tot=cp_chunk_seq_lens.sum(dim=1).tolist(),
+                    chunked_context_metadata = chunked_context_metadata_cls(
+                        cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
+                        starts=local_chunk_starts.to(device, non_blocking=True),
+                        seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
                         max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
                         seq_lens=chunk_seq_lens,
                         workspace=self.chunked_prefill_workspace,
-                        cp_chunk_seq_lens=cp_chunk_seq_lens.tolist(),
-                        origin_context_lens=origin_context_lens,
-                        cp_cu_seq_lens=cp_cu_seq_lens_cpu \
-                            .to(device, non_blocking=True),
-                        chunk_size=max_context_chunk,
+                        padded_local_chunk_seq_lens=padded_local_chunk_seq_lens.tolist(),
+                        local_context_lens_allranks=local_context_lens_allranks.tolist(),
+                        padded_local_cu_seq_lens=padded_local_cu_chunk_seq_lens_cpu.to(
+                            device, non_blocking=True
+                        ),
                         cu_seq_lens_lst=cu_seq_lens_cpu.tolist(),
+                        chunk_size=padded_local_max_context_chunk_across_ranks,
                     )
                 else:
-                    chunked_context_metadata = \
-                        chunked_context_metadata_cls(
-                        cu_seq_lens=cu_seq_lens_cpu \
-                            .to(device, non_blocking=True),
+                    chunked_context_metadata = chunked_context_metadata_cls(
+                        cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
                         starts=chunk_starts.to(device, non_blocking=True),
                         seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
                         max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
@@ -817,8 +919,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 if self._use_cudnn_prefill:
                     chunked_context_metadata.seq_lens = chunk_seq_lens
 
-                assert max(chunked_context_metadata.max_seq_lens) <= \
-                    self.chunked_prefill_workspace_size
+                assert (
+                    max(chunked_context_metadata.max_seq_lens)
+                    <= self.chunked_prefill_workspace_size
+                )
 
             prefill_metadata = self.prefill_metadata_cls(
                 block_table=block_table_tensor[reqs_start:, ...],
@@ -829,19 +933,32 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
 
             if self._use_cudnn_prefill:
                 assert isinstance(prefill_metadata, CudnnPrefillMetadata)
-                prefill_metadata.query_seq_lens = prefill_query_start_loc[1:] \
-                    - prefill_query_start_loc[:-1]
+                prefill_metadata.query_seq_lens = (
+                    prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
+                )
                 prefill_metadata.cudnn_workspace = self.cudnn_workspace
 
+            if self._use_trtllm_ragged_prefill:
+                prefill_metadata.query_seq_lens = (
+                    prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
+                )
+
         decode_metadata = None
         if num_decodes > 0:
+            dcp_tot_seq_lens_device = None
+            if self.dcp_world_size > 1:
+                dcp_tot_seq_lens_device = seq_lens[:num_decodes]
+                seq_lens_cpu = dcp_local_seq_lens_cpu
+                seq_lens = dcp_local_seq_lens
+
             decode_metadata = self._build_decode(
                 block_table_tensor=block_table_tensor[:num_decodes, ...],
                 seq_lens_cpu=seq_lens_cpu[:num_decodes],
                 seq_lens_device=seq_lens[:num_decodes],
-                query_start_loc_cpu=query_start_loc_cpu[:num_decodes + 1],
-                query_start_loc_device=query_start_loc[:num_decodes + 1],
+                query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1],
+                query_start_loc_device=query_start_loc[: num_decodes + 1],
                 num_decode_tokens=num_decode_tokens,
+                dcp_tot_seq_lens_device=dcp_tot_seq_lens_device,
             )
 
         attn_metadata = self.metadata_cls(
@@ -870,9 +987,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
 def reorg_kvcache(
     allgatered_kv_c_normed: torch.Tensor,
     allgatered_k_pe: torch.Tensor,
-    cp_chunk_seq_lens_lst: list[int],
-    origin_context_lens: list[int],
-    cp_world_size: int,
+    padded_local_chunk_seq_lens_lst: list[int],
+    local_context_lens_allranks: list[list[int]],
     sum_seq_len: int,
     max_seq_len: int,
     chunk_size: int,
@@ -880,15 +996,19 @@ def reorg_kvcache(
     toks: int,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
-    reorg kvcache after cp local gather to tp layout for attn kernel.
-
+    reorg and unpad kvcache after cp local gather to tp layout for attn kernel.
+    e.g.
+    allgatered_kv_c_normed = [T0_0, T0_1, T0_2, T0_3, T1_0, T1_1, ...,
+                              T0_4, T0_5, pad, pad, T1_2, pad, ...]
+    -> reorganized_kv_c_normed = [T0_0, T0_1, T0_2, T0_3, T0_4, T0_5,
+                                  T1_0, T1_1, T1_2, ...]
     Args:
-        cp_chunk_seq_lens_lst: chunk context lengths under CP.
-        origin_context_lens: origin full context lengths under CP.
-        cp_world_size: CP size.
+        padded_local_chunk_seq_lens_lst: local chunk context lengths
+            under current CP rank.
+        local_context_lens_allranks: local context lengths on each CP rank.
         sum_seq_len: the sum of cp_chunk_seq_lens_lst.
         max_seq_len: the max value of cp_chunk_seq_lens_lst.
-        chunk_size: equals to max_context_chunk from
+        chunk_size: the local padded max context chunk from
             chunked_context_metadata building.
         chunk_idx: chunk idx of chunked_prefill.
         toks: the number of tokens for local gather cache.
@@ -897,33 +1017,38 @@ def reorg_kvcache(
     k_pe_segments = []
     src_token_idx = 0
     max_seq_len_check = 0
-    for cp_chunk_seq_len, origin_context_len in zip(cp_chunk_seq_lens_lst,
-                                                    origin_context_lens):
-        chunk_context_len = chunk_size
-        if cp_chunk_seq_len != 0:
-            chunk_context_len = min(
-                chunk_context_len, origin_context_len - chunk_size * chunk_idx)
-        cp_target_rank = (chunk_context_len - 1) % cp_world_size
+    for padded_local_chunk_seq_len, local_context_lens in zip(
+        padded_local_chunk_seq_lens_lst, local_context_lens_allranks
+    ):
         cur_seq_len = 0
-        for rank in range(cp_world_size):
-            if rank > cp_target_rank and cp_chunk_seq_len:
-                real_cp_chunk_seq_len = cp_chunk_seq_len - 1
-            else:
-                real_cp_chunk_seq_len = cp_chunk_seq_len
-            if real_cp_chunk_seq_len:
-                kv_c_segment = allgatered_kv_c_normed[rank * toks +
-                                                      src_token_idx:rank *
-                                                      toks + src_token_idx +
-                                                      real_cp_chunk_seq_len]
-                k_pe_segment = allgatered_k_pe[rank * toks +
-                                               src_token_idx:rank * toks +
-                                               src_token_idx +
-                                               real_cp_chunk_seq_len]
+        for rank, local_context_len in enumerate(local_context_lens):
+            # Note(qcs): We split the context into multiple chunks,
+            # depending on the size of the workspace.
+            # local_context in dcp0:   |-----------------|
+            # local_context in dcp1:   |--------------|
+            # n*padded_local_chunk:    |-----|-----|-----|
+            # local_chunk_len in dcp1: |-----|-----|--|
+            # so we need update the last chunk length in dcp1.
+            local_chunk_len = min(
+                max(0, local_context_len - chunk_idx * chunk_size),
+                padded_local_chunk_seq_len,
+            )
+            if local_chunk_len != 0:
+                kv_c_segment = allgatered_kv_c_normed[
+                    rank * toks + src_token_idx : rank * toks
+                    + src_token_idx
+                    + local_chunk_len
+                ]
+                k_pe_segment = allgatered_k_pe[
+                    rank * toks + src_token_idx : rank * toks
+                    + src_token_idx
+                    + local_chunk_len
+                ]
                 kv_c_segments.append(kv_c_segment)
                 k_pe_segments.append(k_pe_segment)
-                cur_seq_len += real_cp_chunk_seq_len
+                cur_seq_len += local_chunk_len
         max_seq_len_check = max(max_seq_len_check, cur_seq_len)
-        src_token_idx += cp_chunk_seq_len
+        src_token_idx += padded_local_chunk_seq_len
     reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0)
     reorganized_k_pe = torch.cat(k_pe_segments, dim=0)
     assert reorganized_kv_c_normed.shape[0] == sum_seq_len
@@ -946,14 +1071,14 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
@@ -961,7 +1086,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
         indexer=None,
-        q_pad_num_heads: Optional[int] = None,
+        q_pad_num_heads: int | None = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported for MLA")
@@ -981,6 +1106,139 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
         self.kv_b_proj = kv_b_proj
         self.indexer = indexer
         self.q_pad_num_heads = q_pad_num_heads
+        self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
+
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        def get_layer_weight(layer):
+            WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
+            for attr in WEIGHT_NAMES:
+                if hasattr(layer, attr):
+                    return getattr(layer, attr)
+            raise AttributeError(
+                f"Layer '{layer}' has no recognized weight attribute: {WEIGHT_NAMES}."
+            )
+
+        def get_and_maybe_dequant_weights(layer: LinearBase):
+            if not isinstance(layer.quant_method, UnquantizedLinearMethod):
+                # NOTE: This should only be used offline, since it's O(N^3)
+                eye = torch.eye(
+                    layer.input_size_per_partition,
+                    dtype=act_dtype,
+                    device=get_layer_weight(layer).device,
+                )
+                dequant_weights = layer.quant_method.apply(layer, eye, bias=None)
+                del eye
+                # standardize to (output, input)
+                return dequant_weights.T
+            return layer.weight
+
+        # we currently do not have quantized bmm's which are needed for
+        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
+        # the bmm's in 16-bit, the extra memory overhead of this is fairly low
+        kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
+        assert kv_b_proj_weight.shape == (
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+        ), (
+            f"{kv_b_proj_weight.shape=}, "
+            f"{self.kv_lora_rank=}, "
+            f"{self.num_heads=}, "
+            f"{self.qk_nope_head_dim=}, "
+            f"{self.v_head_dim=}"
+        )
+        kv_b_proj_weight = kv_b_proj_weight.view(
+            self.kv_lora_rank,
+            self.num_heads,
+            self.qk_nope_head_dim + self.v_head_dim,
+        )
+
+        W_UK, W_UV = kv_b_proj_weight.split(
+            [self.qk_nope_head_dim, self.v_head_dim], dim=-1
+        )
+
+        if self.is_aiter_triton_fp8_bmm_enabled:
+            W_K = W_UK.transpose(0, 1)  # 16 512 128
+            W_V = W_UV.permute(1, 2, 0)  # 16 128 512
+            self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(
+                W_K, dtype=current_platform.fp8_dtype()
+            )
+            self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(
+                W_V, dtype=current_platform.fp8_dtype()
+            )
+
+            # The kernel operates on non-padded inputs. Hence, pre-compiling
+            # triton kernel to avoid runtime compilation for unseen batch sizes
+            # Pre-compile for batch sizes 1 to 1024 to cover most use-cases.
+            # On DS-R1, this step adds roughly 50s to the model loading time.
+            max_batch_size = 1024  # [ToDo] Find the optimal upper limit
+            pre_compilation_list = list(range(1, max_batch_size + 1))
+            if is_global_first_rank():
+                pre_compilation_list = tqdm(
+                    pre_compilation_list,
+                    desc="[Aiter Triton] Pre-compiling fp8 BMM kernel",
+                    total=max_batch_size,
+                )
+
+            for m in pre_compilation_list:
+                x = torch.empty(
+                    (self.W_K.shape[0], m, self.W_K.shape[2]),
+                    dtype=torch.bfloat16,
+                    device=self.W_K.device,
+                )
+                rocm_aiter_ops.triton_fp8_bmm(
+                    x, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True
+                )
+
+                x = torch.empty(
+                    (self.W_V.shape[0], m, self.W_V.shape[2]),
+                    dtype=torch.bfloat16,
+                    device=self.W_V.device,
+                )
+                rocm_aiter_ops.triton_fp8_bmm(
+                    x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True
+                )
+        else:
+            # Convert from (L, N, V) to (N, L, V)
+            self.W_UV = W_UV.transpose(0, 1)
+            # Convert from (L, N, P) to (N, P, L)
+            self.W_UK_T = W_UK.permute(1, 2, 0)
+
+    def _v_up_proj(self, x: torch.Tensor, out: torch.Tensor):
+        # Convert from (B, N, L) to (N, B, L)
+        x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
+        if self.is_aiter_triton_fp8_bmm_enabled:
+            # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V)
+            x = rocm_aiter_ops.triton_fp8_bmm(
+                x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True
+            )
+            # Convert from (B, N, V) to (B, N * V)
+            x = x.reshape(-1, self.num_heads * self.v_head_dim)
+            # Copy result
+            out.copy_(x)
+        else:
+            # Convert from (B, N * V) to (N, B, V)
+            out = out.view(-1, self.num_heads, self.v_head_dim).transpose(0, 1)
+
+            # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
+            torch.bmm(x, self.W_UV, out=out)  # Reuse "out" to make it "hot"
+
+            # Convert from (N, B, V) to (B, N * V)
+            out_new = out.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim)
+
+            # Adjust output buffer shape back to the original (B, N * V)
+            N, B, V = out.shape
+            out.resize_((B, N * V))
+            out.copy_(out_new)  # Copy result
+
+
+class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
+    """
+    NOTE: Please read the comment at the top of the file before trying to
+    understand this class
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
 
     def process_weights_after_loading(self, act_dtype: torch.dtype):
 
@@ -1118,10 +1376,16 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
             self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
             self._pad_v = False
+        elif use_trtllm_ragged_deepseek_prefill():
+            logger.debug_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
+            self._run_prefill_context_chunk = (
+                self._run_prefill_context_chunk_trtllm_ragged
+            )
+            self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
+            self._pad_v = False
         elif use_cudnn_prefill():
             logger.debug_once("Using CUDNN prefill for MLA")
-            self._run_prefill_context_chunk = \
-                self._run_prefill_context_chunk_cudnn
+            self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
             self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
             self._pad_v = False
         else:  # Use FlashAttention
@@ -1136,9 +1400,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             self.flash_attn_varlen_func = flash_attn_varlen_func
             self.vllm_flash_attn_version = get_flash_attn_version()
             if self.vllm_flash_attn_version is not None:
-                self.flash_attn_varlen_func = \
-                    functools.partial(flash_attn_varlen_func,
-                                    fa_version=self.vllm_flash_attn_version)
+                self.flash_attn_varlen_func = functools.partial(
+                    flash_attn_varlen_func, fa_version=self.vllm_flash_attn_version
+                )
 
             # For MLA the v head dim is smaller than qk head dim so we pad out
             # v with 0s to match the qk head dim for attention backends that do
@@ -1146,25 +1410,28 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             # We don't need to pad V if we are on a hopper system with FA3
             self._pad_v = self.vllm_flash_attn_version is None or not (
                 self.vllm_flash_attn_version == 3
-                and current_platform.get_device_capability()[0] == 9)
+                and current_platform.get_device_capability()[0] == 9
+            )
 
-        self.dcp_world_size: Optional[int] = None
+        self.dcp_world_size: int | None = None
 
-        self.chunked_prefill_workspace_size = \
+        self.chunked_prefill_workspace_size = (
             MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
-            get_current_vllm_config())
-
-    def _flash_attn_varlen_diff_headdims(self,
-                                         q,
-                                         k,
-                                         v,
-                                         return_softmax_lse=False,
-                                         softmax_scale=None,
-                                         **kwargs):
+                get_current_vllm_config()
+            )
+        )
+        self.dcp_kv_cache_interleave_size: int = (
+            get_current_vllm_config().parallel_config.dcp_kv_cache_interleave_size
+        )
+
+    def _flash_attn_varlen_diff_headdims(
+        self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
+    ):
         maybe_padded_v = v
         if self._pad_v:
             maybe_padded_v = torch.nn.functional.pad(
-                v, [0, q.shape[-1] - v.shape[-1]], value=0)
+                v, [0, q.shape[-1] - v.shape[-1]], value=0
+            )
 
         if is_vllm_fa:
             kwargs["return_softmax_lse"] = return_softmax_lse
@@ -1172,6 +1439,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             # ROCm leverages the upstream flash_attn, which takes a parameter
             # called "return_attn_probs" instead of return_softmax_lse
             kwargs["return_attn_probs"] = return_softmax_lse
+        if vllm_is_batch_invariant():
+            kwargs["num_splits"] = 1
 
         attn_out = self.flash_attn_varlen_func(
             q=q,
@@ -1192,8 +1461,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             return attn_out, lse
         return attn_out
 
-    def _run_prefill_new_tokens_fa(self, prefill: MLACommonPrefillMetadata, q,
-                                   k, v, return_softmax_lse):
+    def _run_prefill_new_tokens_fa(
+        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
+    ):
         return self._flash_attn_varlen_diff_headdims(
             q=q,
             k=k,
@@ -1207,10 +1477,12 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             return_softmax_lse=return_softmax_lse,
         )
 
-    def _run_prefill_new_tokens_fi(self, prefill: MLACommonPrefillMetadata, q,
-                                   k, v, return_softmax_lse):
+    def _run_prefill_new_tokens_fi(
+        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
+    ):
         assert isinstance(prefill, FlashInferPrefillMetadata)
         assert prefill.prefill_main is not None
+
         ret = prefill.prefill_main.run(
             q=q,
             k=k,
@@ -1219,12 +1491,12 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         )
 
         if isinstance(ret, tuple):
-            # Convert from (q_len, num_heads) to (num_heads, q_len)
             return ret[0], ret[1].transpose(0, 1).contiguous()
         return ret
 
-    def _run_prefill_new_tokens_cudnn(self, prefill: MLACommonPrefillMetadata,
-                                      q, k, v, return_softmax_lse):
+    def _run_prefill_new_tokens_cudnn(
+        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
+    ):
         assert isinstance(prefill, CudnnPrefillMetadata)
         assert prefill.query_seq_lens is not None
         output, lse = cudnn_batch_prefill_with_kv_cache(
@@ -1238,16 +1510,18 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
             actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
             causal=True,
-            return_lse=True,  # do not support False for now
-            is_cuda_graph_compatible=
-            True,  #Indicates actual_seq_lens are on GPU or CPU.
+            # Do not support False for now
+            return_lse=True,
+            # Indicates actual_seq_lens are on GPU or CPU.
+            is_cuda_graph_compatible=True,
         )
         if return_softmax_lse:
             return output, lse
         return output
 
-    def _run_prefill_context_chunk_fa(self, prefill: MLACommonPrefillMetadata,
-                                      chunk_idx: int, q, k, v):
+    def _run_prefill_context_chunk_fa(
+        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
+    ):
         assert prefill.chunked_context is not None
         return self._flash_attn_varlen_diff_headdims(
             q=q,
@@ -1262,9 +1536,11 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             return_softmax_lse=True,
         )
 
-    def _run_prefill_context_chunk_fi(self, prefill: MLACommonPrefillMetadata,
-                                      chunk_idx: int, q, k, v):
+    def _run_prefill_context_chunk_fi(
+        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
+    ):
         assert isinstance(prefill, FlashInferPrefillMetadata)
+
         attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
             q=q,
             k=k,
@@ -1274,9 +1550,12 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         # Convert from (q_len, num_heads) to (num_heads, q_len)
         return attn_out, lse.transpose(0, 1).contiguous()
 
-    def _run_prefill_context_chunk_cudnn(self,
-                                         prefill: MLACommonPrefillMetadata,
-                                         chunk_idx: int, q, k, v):
+        # Convert from (q_len, num_heads) to (num_heads, q_len)
+        return attn_out, lse.transpose(0, 1).contiguous()
+
+    def _run_prefill_context_chunk_cudnn(
+        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
+    ):
         assert isinstance(prefill, CudnnPrefillMetadata)
         assert prefill.chunked_context is not None
         assert prefill.chunked_context.seq_lens[chunk_idx] is not None
@@ -1290,34 +1569,109 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             max_token_per_sequence=prefill.max_query_len,
             max_sequence_kv=prefill.chunked_context.max_seq_lens[chunk_idx],
             actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            actual_seq_lens_kv=prefill.chunked_context.seq_lens[chunk_idx].
-            view(-1, 1, 1, 1),
+            actual_seq_lens_kv=prefill.chunked_context.seq_lens[chunk_idx].view(
+                -1, 1, 1, 1
+            ),
             causal=False,
             return_lse=True,
-            is_cuda_graph_compatible=
-            True,  #Indicates actual_seq_lens are on GPU or CPU.
+            # Indicates actual_seq_lens are on GPU or CPU.
+            is_cuda_graph_compatible=True,
         )
 
-    def process_weights_after_loading(self, act_dtype: torch.dtype):
+    def _run_prefill_new_tokens_trtllm_ragged(
+        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
+    ):
+        """TRT-LLM ragged attention for new tokens (causal)."""
+        from flashinfer.prefill import trtllm_ragged_attention_deepseek
+
+        assert prefill.query_seq_lens is not None
+
+        ret = trtllm_ragged_attention_deepseek(
+            query=q,
+            key=k,
+            value=v,
+            workspace_buffer=self._workspace_buffer,
+            seq_lens=prefill.query_seq_lens,
+            max_q_len=prefill.max_query_len,
+            max_kv_len=prefill.max_query_len,
+            bmm1_scale=self.scale,
+            bmm2_scale=1.0,
+            o_sf_scale=1.0,
+            batch_size=prefill.query_seq_lens.shape[0],
+            window_left=-1,
+            cum_seq_lens_q=prefill.query_start_loc,
+            cum_seq_lens_kv=prefill.query_start_loc,
+            enable_pdl=False,
+            is_causal=True,
+            return_lse=return_softmax_lse,
+        )
 
+        if isinstance(ret, tuple):
+            # Convert from (q_len, num_heads) to (num_heads, q_len)
+            return ret[0], ret[1].transpose(0, 1).contiguous()
+        return ret
+
+    def _run_prefill_context_chunk_trtllm_ragged(
+        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
+    ):
+        """TRT-LLM ragged attention for context chunks (non-causal)."""
+        from flashinfer.prefill import trtllm_ragged_attention_deepseek
+
+        assert prefill.chunked_context is not None
+        assert prefill.chunked_context.seq_lens[chunk_idx] is not None
+
+        out = torch.zeros(
+            q.shape[0],
+            q.shape[1],
+            v.shape[2],
+            device=q.device,
+            dtype=q.dtype,
+        )
+        self._workspace_buffer.fill_(0)
+
+        attn_out, lse = trtllm_ragged_attention_deepseek(
+            query=q,
+            key=k,
+            value=v,
+            workspace_buffer=self._workspace_buffer,
+            seq_lens=prefill.chunked_context.seq_lens[chunk_idx],
+            max_q_len=prefill.max_query_len,
+            max_kv_len=prefill.chunked_context.max_seq_lens[chunk_idx],
+            bmm1_scale=self.scale,
+            bmm2_scale=1.0,
+            o_sf_scale=1.0,
+            batch_size=prefill.chunked_context.seq_lens[chunk_idx].shape[0],
+            window_left=-1,
+            cum_seq_lens_q=prefill.query_start_loc,
+            cum_seq_lens_kv=prefill.chunked_context.cu_seq_lens[chunk_idx],
+            enable_pdl=False,
+            is_causal=False,
+            return_lse=True,
+            out=out,
+        )
+
+        # Convert from (q_len, num_heads) to (num_heads, q_len)
+        return attn_out, lse.transpose(0, 1).contiguous()
+
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
         def get_layer_weight(layer):
             WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
             for attr in WEIGHT_NAMES:
                 if hasattr(layer, attr):
                     return getattr(layer, attr)
             raise AttributeError(
-                f"Layer '{layer}' has no recognized weight attribute:"
-                f" {WEIGHT_NAMES}.")
+                f"Layer '{layer}' has no recognized weight attribute: {WEIGHT_NAMES}."
+            )
 
         def get_and_maybe_dequant_weights(layer: LinearBase):
             if not isinstance(layer.quant_method, UnquantizedLinearMethod):
                 # NOTE: This should only be used offline, since it's O(N^3)
-                eye = torch.eye(layer.input_size_per_partition,
-                                dtype=act_dtype,
-                                device=get_layer_weight(layer).device)
-                dequant_weights = layer.quant_method.apply(layer,
-                                                           eye,
-                                                           bias=None)
+                eye = torch.eye(
+                    layer.input_size_per_partition,
+                    dtype=act_dtype,
+                    device=get_layer_weight(layer).device,
+                )
+                dequant_weights = layer.quant_method.apply(layer, eye, bias=None)
                 del eye
                 # standardize to (output, input)
                 return dequant_weights.T
@@ -1329,12 +1683,14 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
         assert kv_b_proj_weight.shape == (
             self.kv_lora_rank,
-            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
-                f"{kv_b_proj_weight.shape=}, "
-                f"{self.kv_lora_rank=}, "
-                f"{self.num_heads=}, "
-                f"{self.qk_nope_head_dim=}, "
-                f"{self.v_head_dim=}")
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+        ), (
+            f"{kv_b_proj_weight.shape=}, "
+            f"{self.kv_lora_rank=}, "
+            f"{self.num_heads=}, "
+            f"{self.qk_nope_head_dim=}, "
+            f"{self.v_head_dim=}"
+        )
         kv_b_proj_weight = kv_b_proj_weight.view(
             self.kv_lora_rank,
             self.num_heads,
@@ -1342,15 +1698,18 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         )
 
         W_UK, W_UV = kv_b_proj_weight.split(
-            [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+            [self.qk_nope_head_dim, self.v_head_dim], dim=-1
+        )
 
-        if is_rocm_aiter_fp8bmm_enabled():
+        if self.is_aiter_triton_fp8_bmm_enabled:
             W_K = W_UK.transpose(0, 1)  # 16 512 128
             W_V = W_UV.permute(1, 2, 0)  # 16 128 512
             self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(
-                W_K, dtype=current_platform.fp8_dtype())
+                W_K, dtype=current_platform.fp8_dtype()
+            )
             self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(
-                W_V, dtype=current_platform.fp8_dtype())
+                W_V, dtype=current_platform.fp8_dtype()
+            )
 
             # The kernel operates on non-padded inputs. Hence, pre-compiling
             # triton kernel to avoid runtime compilation for unseen batch sizes
@@ -1366,23 +1725,23 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 )
 
             for m in pre_compilation_list:
-                x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]),
-                                dtype=torch.bfloat16,
-                                device=self.W_K.device)
-                aiter_triton_fp8_bmm(x,
-                                     self.W_K,
-                                     self.W_K_scale,
-                                     group_size=128,
-                                     transpose_bm=True)
+                x = torch.empty(
+                    (self.W_K.shape[0], m, self.W_K.shape[2]),
+                    dtype=torch.bfloat16,
+                    device=self.W_K.device,
+                )
+                rocm_aiter_ops.triton_fp8_bmm(
+                    x, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True
+                )
 
-                x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]),
-                                dtype=torch.bfloat16,
-                                device=self.W_V.device)
-                aiter_triton_fp8_bmm(x,
-                                     self.W_V,
-                                     self.W_V_scale,
-                                     group_size=128,
-                                     transpose_bm=True)
+                x = torch.empty(
+                    (self.W_V.shape[0], m, self.W_V.shape[2]),
+                    dtype=torch.bfloat16,
+                    device=self.W_V.device,
+                )
+                rocm_aiter_ops.triton_fp8_bmm(
+                    x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True
+                )
         else:
             # Convert from (L, N, V) to (N, L, V)
             self.W_UV = W_UV.transpose(0, 1)
@@ -1418,18 +1777,15 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 seq_starts=prefill_metadata.chunked_context.starts[i],
             )
 
-            kv_c_normed = workspace[:toks]\
-                [..., :self.kv_lora_rank]
-            k_pe = workspace[:toks]\
-                [..., self.kv_lora_rank:].unsqueeze(1)
+            kv_c_normed = workspace[:toks][..., : self.kv_lora_rank]
+            k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1)
 
-            kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
-                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-            k_nope, v = kv_nope\
-                .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+            kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
+                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim
+            )
+            k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
 
-            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
-                          dim=-1)
+            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
 
             attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
                 prefill=prefill_metadata,
@@ -1470,11 +1826,11 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
         assert prefill_metadata.chunked_context is not None
-        assert prefill_metadata.chunked_context.cp_chunk_seq_lens is not None
-        assert prefill_metadata.chunked_context.origin_context_lens is not None
-        assert prefill_metadata.chunked_context.cp_cu_seq_lens is not None
-        assert prefill_metadata.chunked_context.chunk_size is not None
+        assert prefill_metadata.chunked_context.padded_local_chunk_seq_lens is not None
+        assert prefill_metadata.chunked_context.local_context_lens_allranks is not None
+        assert prefill_metadata.chunked_context.padded_local_cu_seq_lens is not None
         assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None
+        assert prefill_metadata.chunked_context.chunk_size is not None
 
         output = None
         iters = len(prefill_metadata.chunked_context.seq_tot)
@@ -1486,7 +1842,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 src_cache=kv_c_and_k_pe_cache,
                 dst=workspace,
                 block_table=prefill_metadata.block_table,
-                cu_seq_lens=prefill_metadata.chunked_context.cp_cu_seq_lens[i],
+                cu_seq_lens=prefill_metadata.chunked_context.padded_local_cu_seq_lens[
+                    i
+                ],
                 batch_size=attn_metadata.num_prefills,
                 seq_starts=prefill_metadata.chunked_context.starts[i],
             )
@@ -1494,44 +1852,44 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             # |------- N tokens --------|--------- N*dcp_size tokens ----------|
             # |<- use for loca_gather ->|<--------- use for allgather -------->|
             allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
-            assert allgather_offset * (dcp_world_size +
-                                       1) == workspace.shape[0]
+            assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0]
             assert toks <= allgather_offset
             local_gathered_kvcache = workspace[:toks]
             cur_allgather_workspace = workspace[
-                allgather_offset:allgather_offset * (1 + dcp_world_size)]
+                allgather_offset : allgather_offset * (1 + dcp_world_size)
+            ]
             assert toks * dcp_world_size <= cur_allgather_workspace.shape[0]
-            cur_allgather_kvcache = cur_allgather_workspace[:toks *
-                                                            dcp_world_size]
-            cur_allgather_kvcache.copy_(get_dcp_group().all_gather(
-                local_gathered_kvcache, dim=0))
-            assert cur_allgather_kvcache.shape[
-                -1] == self.kv_lora_rank + self.qk_rope_head_dim
-            allgatered_kv_c_normed, allgatered_k_pe = \
-                cur_allgather_kvcache.unsqueeze(
-                1).split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+            cur_allgather_kvcache = cur_allgather_workspace[: toks * dcp_world_size]
+            cur_allgather_kvcache.copy_(
+                get_dcp_group().all_gather(local_gathered_kvcache, dim=0)
+            )
+            assert (
+                cur_allgather_kvcache.shape[-1]
+                == self.kv_lora_rank + self.qk_rope_head_dim
+            )
+            allgatered_kv_c_normed, allgatered_k_pe = cur_allgather_kvcache.unsqueeze(
+                1
+            ).split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
 
             kv_c_normed, k_pe = reorg_kvcache(
                 allgatered_kv_c_normed,
                 allgatered_k_pe,
-                cp_chunk_seq_lens_lst=prefill_metadata.chunked_context.
-                cp_chunk_seq_lens[i],
-                origin_context_lens=prefill_metadata.chunked_context.
-                origin_context_lens,
-                cp_world_size=dcp_world_size,
-                sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i]
-                [-1],
+                padded_local_chunk_seq_lens_lst=prefill_metadata.chunked_context.padded_local_chunk_seq_lens[
+                    i
+                ],
+                local_context_lens_allranks=prefill_metadata.chunked_context.local_context_lens_allranks,
+                sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i][-1],
                 max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i],
                 chunk_size=prefill_metadata.chunked_context.chunk_size,
                 chunk_idx=i,
-                toks=toks)
+                toks=toks,
+            )
 
-            kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
-                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-            k_nope, v = kv_nope\
-                .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
-                          dim=-1)
+            kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
+                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim
+            )
+            k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
 
             attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
                 prefill=prefill_metadata,
@@ -1574,10 +1932,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         assert self.dcp_world_size is not None
 
         has_context = attn_metadata.prefill.chunked_context is not None
-        kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
-            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-        k_nope, v = kv_nope\
-            .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+        kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
+            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim
+        )
+        k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
 
         k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
 
@@ -1592,14 +1950,19 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         if has_context:
             suffix_output, suffix_lse = output
             if self.dcp_world_size > 1:
-                context_output, context_lse = \
+                context_output, context_lse = (
                     self._context_parallel_compute_prefill_context(
-                    q, kv_c_and_k_pe_cache, attn_metadata,
-                    k_scale=None, dcp_world_size=self.dcp_world_size)
+                        q,
+                        kv_c_and_k_pe_cache,
+                        attn_metadata,
+                        k_scale=None,
+                        dcp_world_size=self.dcp_world_size,
+                    )
+                )
             else:
-                context_output, context_lse = \
-                    self._compute_prefill_context(
-                    q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
+                context_output, context_lse = self._compute_prefill_context(
+                    q, kv_c_and_k_pe_cache, attn_metadata, k_scale
+                )
 
             output = torch.empty_like(suffix_output)
             merge_attn_states(
@@ -1612,18 +1975,18 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
 
         # unpad if necessary
         if self._pad_v:
-            output = output[..., :v.shape[-1]]
+            output = output[..., : v.shape[-1]]
 
         return output.flatten(start_dim=-2)
 
     @abstractmethod
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: M,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         raise NotImplementedError
 
     def forward(
@@ -1634,24 +1997,27 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         k_pe: torch.Tensor,  # value in unified attn
         kv_cache: torch.Tensor,
         attn_metadata: M,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert output is not None, "Output tensor must be provided."
 
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for MLACommonImpl")
+                "fused output quantization is not yet supported for MLACommonImpl"
+            )
 
         if attn_metadata is None:
             # During the profile run try to simulate to worse case output size
             # for `self.kv_b_proj(kv_c_normed)` in `_compute_prefill_context`
             # since this can be large
             _ = torch.empty(
-                (self.chunked_prefill_workspace_size, self.num_heads,
-                 self.qk_nope_head_dim + self.v_head_dim),
+                (
+                    self.chunked_prefill_workspace_size,
+                    self.num_heads,
+                    self.qk_nope_head_dim + self.v_head_dim,
+                ),
                 device=k_c_normed.device,
                 dtype=k_c_normed.dtype,
             )
@@ -1675,9 +2041,11 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         k_c_normed = k_c_normed[:num_actual_toks, ...]
         k_pe = k_pe[:num_actual_toks, ...]
 
-        assert attn_metadata.num_decodes is not None and \
-            attn_metadata.num_prefills is not None and \
-            attn_metadata.num_decode_tokens is not None
+        assert (
+            attn_metadata.num_decodes is not None
+            and attn_metadata.num_prefills is not None
+            and attn_metadata.num_decode_tokens is not None
+        )
 
         has_decode = attn_metadata.num_decodes > 0
         has_prefill = attn_metadata.num_prefills > 0
@@ -1705,61 +2073,73 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
 
         if has_prefill:
             output[num_decode_tokens:] = self._forward_prefill(
-                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                attn_metadata, layer._k_scale)
+                prefill_q,
+                prefill_k_c_normed,
+                prefill_k_pe,
+                kv_cache,
+                attn_metadata,
+                layer._k_scale,
+            )
 
         if has_decode:
             assert attn_metadata.decode is not None
+
             decode_q_nope, decode_q_pe = decode_q.split(
-                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+            )
+
             # Convert from (B, N, P) to (N, B, P)
             decode_q_nope = decode_q_nope.transpose(0, 1)
 
-            # Pads the head_dim if necessary (for the underlying kernel)
             if self.q_pad_num_heads is not None:
                 B, N, L = decode_q_pe.shape
-                decode_pe_padded = decode_q_pe.new_empty(
-                    (B, self.q_pad_num_heads, L))
+                decode_pe_padded = decode_q_pe.new_empty((B, self.q_pad_num_heads, L))
                 decode_pe_padded.resize_((B, N, L))
                 decode_pe_padded.copy_(decode_q_pe)
                 decode_q_pe = decode_pe_padded
 
-            if is_rocm_aiter_fp8bmm_enabled():
+            if self.is_aiter_triton_fp8_bmm_enabled:
                 # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L)
-                decode_ql_nope = aiter_triton_fp8_bmm(decode_q_nope,
-                                                      self.W_K,
-                                                      self.W_K_scale,
-                                                      group_size=128,
-                                                      transpose_bm=True)
+                decode_ql_nope = rocm_aiter_ops.triton_fp8_bmm(
+                    decode_q_nope,
+                    self.W_K,
+                    self.W_K_scale,
+                    group_size=128,
+                    transpose_bm=True,
+                )
             else:
                 # Pads the head_dim if necessary (for the underlying kernel)
                 N, B, P = decode_q_nope.shape
                 _, _, L = self.W_UK_T.shape
+
                 if self.q_pad_num_heads is not None:
                     decode_ql_nope = decode_q_nope.new_empty(
-                        (self.q_pad_num_heads, B, L))
+                        (self.q_pad_num_heads, B, L)
+                    )
                     decode_ql_nope.resize_((N, B, L))
-
                 else:
                     decode_ql_nope = decode_q_nope.new_empty((N, B, L))
 
                 # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
                 torch.bmm(decode_q_nope, self.W_UK_T, out=decode_ql_nope)
+
                 # Convert from (N, B, L) to (B, N, L)
                 decode_ql_nope = decode_ql_nope.transpose(0, 1)
 
             if fp8_attention:
                 ql_nope_shape = decode_ql_nope.shape
                 decode_ql_nope, _ = ops.scaled_fp8_quant(
-                    decode_ql_nope.reshape([
-                        ql_nope_shape[0], ql_nope_shape[1] * ql_nope_shape[2]
-                    ]), layer._q_scale)
+                    decode_ql_nope.reshape(
+                        [ql_nope_shape[0], ql_nope_shape[1] * ql_nope_shape[2]]
+                    ),
+                    layer._q_scale,
+                )
                 decode_ql_nope = decode_ql_nope.reshape(ql_nope_shape)
                 q_pe_shape = decode_q_pe.shape
                 decode_q_pe, _ = ops.scaled_fp8_quant(
-                    decode_q_pe.reshape(
-                        [q_pe_shape[0], q_pe_shape[1] * q_pe_shape[2]]),
-                    layer._q_scale)
+                    decode_q_pe.reshape([q_pe_shape[0], q_pe_shape[1] * q_pe_shape[2]]),
+                    layer._q_scale,
+                )
                 decode_q_pe = decode_q_pe.reshape(q_pe_shape)
 
             decode_q = (decode_ql_nope, decode_q_pe)
@@ -1771,10 +2151,11 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 decode_q = get_dcp_group().all_gather(decode_q, dim=1)
 
             # call decode attn
-            attn_out, lse = self._forward_decode(decode_q, kv_cache,
-                                                 attn_metadata, layer)
+            attn_out, lse = self._forward_decode(
+                decode_q, kv_cache, attn_metadata, layer
+            )
 
-            # recorect dcp attn_out with lse.
+            # correct dcp attn_out with lse.
             if self.dcp_world_size > 1:
                 attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group())
 
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index d44e20f2cb6be50a167fed0f2cae307e8ef55357..60cb5022a55ebd3b5d69bccc44cd042298c703ae 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -2,18 +2,26 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
 import vllm._custom_ops as ops
-from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
-                                              is_quantized_kv_cache)
+from vllm.attention.backends.abstract import (
+    AttentionLayer,
+    AttentionType,
+    MultipleOf,
+    is_quantized_kv_cache,
+)
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
-                                                   MLACommonImpl,
-                                                   MLACommonMetadata,
-                                                   MLACommonMetadataBuilder)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.common import (
+    MLACommonBackend,
+    MLACommonImpl,
+    MLACommonMetadata,
+    MLACommonMetadataBuilder,
+)
 from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 logger = init_logger(__name__)
@@ -21,11 +29,19 @@ logger = init_logger(__name__)
 
 class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
     # enable full CUDA Graph support for decode-only capture
-    cudagraph_support: ClassVar[
-        AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    _cudagraph_support: ClassVar[AttentionCGSupport] = (
+        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    )
 
 
 class CutlassMLABackend(MLACommonBackend):
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [128]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+    ]
 
     @staticmethod
     def get_name() -> str:
@@ -39,13 +55,16 @@ class CutlassMLABackend(MLACommonBackend):
     def get_builder_cls() -> type["CutlassMLAMetadataBuilder"]:
         return CutlassMLAMetadataBuilder
 
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability.major == 10
 
-class SM100Workspace:
 
+class SM100Workspace:
     def __init__(self, initial_workspace_size):
-        self._workspace_buf = torch.empty(initial_workspace_size,
-                                          device="cuda",
-                                          dtype=torch.uint8)
+        self._workspace_buf = torch.empty(
+            initial_workspace_size, device="cuda", dtype=torch.uint8
+        )
 
         self._block_size = 128  # Forced to 128
 
@@ -57,8 +76,7 @@ class SM100Workspace:
     def get_buf(self):
         return self._workspace_buf
 
-    def ensure_size(self, attn_metadata: MLACommonMetadata,
-                    num_kv_splits: int):
+    def ensure_size(self, attn_metadata: MLACommonMetadata, num_kv_splits: int):
         batch_size = attn_metadata.num_reqs
         max_seq_len = attn_metadata.max_query_len
 
@@ -66,7 +84,8 @@ class SM100Workspace:
             max_seq_len * self._block_size,
             batch_size,
             self._sm_count,
-            num_kv_splits=num_kv_splits)
+            num_kv_splits=num_kv_splits,
+        )
 
         if self._workspace_buf.shape[0] < workspace_size:
             self._workspace_buf.resize_(workspace_size)
@@ -81,51 +100,56 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
     can_return_lse_for_decode: bool = True
 
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads,
-                         head_size,
-                         scale,
-                         num_kv_heads,
-                         alibi_slopes,
-                         sliding_window,
-                         kv_cache_dtype,
-                         logits_soft_cap,
-                         attn_type,
-                         kv_sharing_target_layer_name,
-                         q_pad_num_heads=MAX_HEADS,
-                         **mla_args)
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            q_pad_num_heads=MAX_HEADS,
+            **mla_args,
+        )
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
                 "CutlassMLAImpl does not support one of the following: "
-                "alibi_slopes, sliding_window, logits_soft_cap")
+                "alibi_slopes, sliding_window, logits_soft_cap"
+            )
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "CutlassMLAImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "CutlassMLAImpl"
+            )
 
         # TODO: Currently, num_kv_splits is limited to 16 to avoid hanging
         #       issues. In case the code hangs, use:
         #       FORCE_NUM_KV_SPLITS=1
         force_num_kv_splits = os.environ.get("FORCE_NUM_KV_SPLITS", None)
         if force_num_kv_splits:
-            logger.warning_once("Forcing num_kv_splits to %d",
-                                int(force_num_kv_splits))
+            logger.debug_once("Forcing num_kv_splits to %d", int(force_num_kv_splits))
             self._num_kv_splits = int(force_num_kv_splits)
         else:
             self._num_kv_splits = -1  # => Auto-detect
@@ -144,14 +168,13 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         sm_scale: float,
         num_kv_splits: int,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        assert (q_nope.ndim == 3
-                ), f"q_nope must be a 3D tensor, but got {q_nope.ndim}"
-        assert (
-            q_pe.ndim == 3), f"q_pe must be a 3D tensor, but got {q_pe.ndim}"
-        assert (
-            kv_c_and_k_pe_cache.ndim == 3
-        ), "kv_c_and_k_pe_cache must be a 3D tensor, but got {}".format(
-            kv_c_and_k_pe_cache.ndim)
+        assert q_nope.ndim == 3, f"q_nope must be a 3D tensor, but got {q_nope.ndim}"
+        assert q_pe.ndim == 3, f"q_pe must be a 3D tensor, but got {q_pe.ndim}"
+        assert kv_c_and_k_pe_cache.ndim == 3, (
+            "kv_c_and_k_pe_cache must be a 3D tensor, but got {}".format(
+                kv_c_and_k_pe_cache.ndim
+            )
+        )
 
         B_q, H, D_q_nope = q_nope.shape
         B_q_2, H_2, D_q_pe = q_pe.shape
@@ -171,28 +194,31 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         assert len(page_table.shape) == 2
         B_block_table, block_num = page_table.shape
         assert B_block_table == B_q
-        assert (block_num
-                > 0), f"block num must be greater than 0, got {block_num}"
+        assert block_num > 0, f"block num must be greater than 0, got {block_num}"
         assert block_num % (128 / PAGE_SIZE) == 0
 
-        assert q_nope.dtype in (
-            torch.float16, torch.bfloat16, torch.float8_e4m3fn), (
-                f"q_nope.dtype needs to be fp16 or bf16 or e4m3 but got "
-                f"{q_nope.dtype}.")
+        assert q_nope.dtype in (torch.float16, torch.bfloat16, torch.float8_e4m3fn), (
+            f"q_nope.dtype needs to be fp16 or bf16 or e4m3 but got {q_nope.dtype}."
+        )
         assert q_nope.dtype == q_pe.dtype == kv_c_and_k_pe_cache.dtype
-        assert (
-            seq_lens.dtype == torch.int32
-        ), f"seq_lens.dtype needs to be int32 but got {seq_lens.dtype}."
-        assert (
-            page_table.dtype == torch.int32
-        ), f"page_table.dtype needs to be int32 but got {page_table.dtype}."
-
-        dtype = (torch.bfloat16 if is_quantized_kv_cache(self.kv_cache_dtype)
-                 else q_nope.dtype)
+        assert seq_lens.dtype == torch.int32, (
+            f"seq_lens.dtype needs to be int32 but got {seq_lens.dtype}."
+        )
+        assert page_table.dtype == torch.int32, (
+            f"page_table.dtype needs to be int32 but got {page_table.dtype}."
+        )
+
+        dtype = (
+            torch.bfloat16
+            if is_quantized_kv_cache(self.kv_cache_dtype)
+            else q_nope.dtype
+        )
         out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype)
-        lse = (torch.empty(
-            (B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device)
-               if self.need_to_return_lse_for_decode else torch.Tensor())
+        lse = (
+            torch.empty((B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device)
+            if self.need_to_return_lse_for_decode
+            else torch.Tensor()
+        )
 
         ops.sm100_cutlass_mla_decode(
             out,
@@ -216,11 +242,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
@@ -228,7 +254,8 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             q_nope, q_pe = q
         else:
             q_nope, q_pe = torch.split(
-                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+            )
 
         # Adjust workspace size (if necessary)
         self._workspace.ensure_size(attn_metadata, self._num_kv_splits)
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index 652b1cdb6b767c1611e692d41c956f5ddfa3335d..12639edc8b9a13f42c25dfc1d1c0933dcde29430 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -2,23 +2,36 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
 from vllm import envs
-from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
-                                              is_quantized_kv_cache)
-from vllm.attention.utils.fa_utils import (flash_attn_supports_mla,
-                                           get_flash_attn_version)
+from vllm.attention.backends.abstract import (
+    AttentionLayer,
+    AttentionType,
+    MultipleOf,
+    is_quantized_kv_cache,
+)
+from vllm.attention.utils.fa_utils import (
+    flash_attn_supports_mla,
+    get_flash_attn_version,
+)
 from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import get_dcp_group
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
-                                                   MLACommonDecodeMetadata,
-                                                   MLACommonImpl,
-                                                   MLACommonMetadata,
-                                                   MLACommonMetadataBuilder)
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.common import (
+    MLACommonBackend,
+    MLACommonDecodeMetadata,
+    MLACommonImpl,
+    MLACommonMetadata,
+    MLACommonMetadataBuilder,
+    QueryLenSupport,
+)
 from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
@@ -27,15 +40,14 @@ logger = init_logger(__name__)
 
 
 class FlashAttnMLABackend(MLACommonBackend):
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto"]
 
     @staticmethod
     def get_name() -> str:
         return "FLASH_ATTN_MLA"
 
-    @staticmethod
-    def get_metadata_cls() -> type["FlashAttnMLAMetadata"]:
-        return FlashAttnMLAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashAttnMLAMetadataBuilder"]:
         return FlashAttnMLAMetadataBuilder
@@ -44,13 +56,33 @@ class FlashAttnMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["FlashAttnMLAImpl"]:
         return FlashAttnMLAImpl
 
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability.major == 9
+
+    @classmethod
+    def supports_combination(
+        cls,
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: CacheDType | None,
+        block_size: int,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        device_capability: DeviceCapability,
+    ) -> str | None:
+        if not flash_attn_supports_mla():
+            return "FlashAttention MLA not supported on this device"
+        return None
+
 
 @dataclass
 class FlashAttnMLADecodeMetadata(MLACommonDecodeMetadata):
     query_start_loc: torch.Tensor
     max_query_len: int
     max_seq_len: int
-    scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
     max_num_splits: int = 0
 
 
@@ -59,33 +91,35 @@ class FlashAttnMLAMetadata(MLACommonMetadata[FlashAttnMLADecodeMetadata]):
     pass
 
 
-class FlashAttnMLAMetadataBuilder(
-        MLACommonMetadataBuilder[FlashAttnMLAMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.UNIFORM_BATCH
+class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]):
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
+    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.VARLEN
+    reorder_batch_threshold: int = 512  # process small prefills with decode pathway
 
-    reorder_batch_threshold: int = 512
-
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
-        super().__init__(kv_cache_spec, layer_names, vllm_config, device,
-                         FlashAttnMLAMetadata)
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        super().__init__(
+            kv_cache_spec,
+            layer_names,
+            vllm_config,
+            device,
+            FlashAttnMLAMetadata,
+            supports_dcp_with_varlen=True,
+        )
         self.max_num_splits = 0  # No upper bound on the number of splits.
-        self.fa_aot_schedule = (get_flash_attn_version() == 3)
+        self.fa_aot_schedule = get_flash_attn_version() == 3
 
-        self.use_full_cuda_graph = \
+        self.use_full_cuda_graph = (
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        )
+        self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.fa_aot_schedule:
-            self.max_cudagraph_size = self.compilation_config.max_capture_size
-
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for "
-                    "full cuda graph.")
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
@@ -94,22 +128,27 @@ class FlashAttnMLAMetadataBuilder(
             # When using cuda graph, we need to set the upper bound of the
             # number of splits so that large enough intermediate buffers are
             # pre-allocated during capture.
-            self.max_num_splits = (
-                envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
+            self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
 
-        # TODO(lucas): Until we add support for the DCP custom masking we need
-        #   to restrict decodes to q_len == 1 when DCP is enabled.
-        self.reorder_batch_threshold = 1 \
-            if get_dcp_group().world_size > 1 else self.reorder_batch_threshold
+        if vllm_is_batch_invariant():
+            self.max_num_splits = 1
 
-    def _schedule_decode(self, num_reqs, cu_query_lens, max_query_len, seqlens,
-                         max_seq_len, causal):
+    def _schedule_decode(
+        self,
+        num_reqs,
+        cu_query_lens,
+        max_query_len,
+        seqlens,
+        max_seq_len,
+        causal,
+        max_num_splits,
+    ):
         if self.fa_aot_schedule:
             return get_scheduler_metadata(
                 batch_size=num_reqs,
                 max_seqlen_q=max_query_len,
                 max_seqlen_k=max_seq_len,
-                num_heads_q=self.num_heads,
+                num_heads_q=self.num_heads * self.dcp_world_size,
                 num_heads_kv=1,
                 headdim=self.mla_dims.qk_rope_head_dim,
                 cache_seqlens=seqlens,
@@ -118,20 +157,36 @@ class FlashAttnMLAMetadataBuilder(
                 page_size=self.page_size,
                 cu_seqlens_q=cu_query_lens,
                 causal=causal,
-                num_splits=self.max_num_splits,
+                num_splits=max_num_splits,
             )
         return None
 
-    def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens_cpu: torch.Tensor,
-                      seq_lens_device: torch.Tensor,
-                      query_start_loc_cpu: torch.Tensor,
-                      query_start_loc_device: torch.Tensor,
-                      num_decode_tokens: int) -> FlashAttnMLADecodeMetadata:
-        query_lens_cpu = (query_start_loc_cpu[1:] - query_start_loc_cpu[:-1])
+    def _build_decode(
+        self,
+        block_table_tensor: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        seq_lens_device: torch.Tensor,
+        query_start_loc_cpu: torch.Tensor,
+        query_start_loc_device: torch.Tensor,
+        num_decode_tokens: int,
+        dcp_tot_seq_lens_device: torch.Tensor | None,
+    ) -> FlashAttnMLADecodeMetadata:
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
         max_query_len = query_lens_cpu.max().item()
         max_seq_len = seq_lens_cpu.max().item()
 
+        # For Flash Attention MLA + full cudagraph
+        max_num_splits = 0
+        if self.use_full_cuda_graph and num_decode_tokens <= self.max_cudagraph_size:
+            # NOTE(woosuk): Setting num_splits > 1 may increase the memory
+            # usage, because the intermediate buffers of size [num_splits,
+            # num_heads, num_tokens, head_size] are allocated. Therefore,
+            # we only set num_splits when using cuda graphs.
+            max_num_splits = self.max_num_splits
+
+        if vllm_is_batch_invariant():
+            max_num_splits = 1
+
         scheduler_metadata = self._schedule_decode(
             num_reqs=seq_lens_cpu.numel(),
             cu_query_lens=query_start_loc_device,
@@ -139,16 +194,16 @@ class FlashAttnMLAMetadataBuilder(
             seqlens=seq_lens_device,
             max_seq_len=max_seq_len,
             causal=True,
+            max_num_splits=max_num_splits,
         )
 
-        # For FA3 + full cudagraph
-        max_num_splits = 0
         if self.use_full_cuda_graph and scheduler_metadata is not None:
             n = scheduler_metadata.shape[0]
             # Ensure the persistent buffer is large enough
-            assert n <= self.scheduler_metadata.shape[0], \
-                f"Scheduler metadata size {n} exceeds buffer size " + \
-                f"{self.scheduler_metadata.shape[0]}"
+            assert n <= self.scheduler_metadata.shape[0], (
+                f"Scheduler metadata size {n} exceeds buffer size "
+                + f"{self.scheduler_metadata.shape[0]}"
+            )
             self.scheduler_metadata[:n] = scheduler_metadata
             # NOTE(woosuk): We should zero out the rest of the scheduler
             # metadata to guarantee the correctness. Otherwise, some thread
@@ -157,14 +212,7 @@ class FlashAttnMLAMetadataBuilder(
             self.scheduler_metadata[n:] = 0
             scheduler_metadata = self.scheduler_metadata[:n]
 
-            if num_decode_tokens <= self.max_cudagraph_size:
-                # NOTE(woosuk): Setting num_splits > 1 may increase the memory
-                # usage, because the intermediate buffers of size [num_splits,
-                # num_heads, num_tokens, head_size] are allocated. Therefore,
-                # we only set num_splits when using cuda graphs.
-                max_num_splits = self.max_num_splits
-
-        return FlashAttnMLADecodeMetadata(
+        metadata = FlashAttnMLADecodeMetadata(
             block_table=block_table_tensor,
             seq_lens=seq_lens_device,
             query_start_loc=query_start_loc_device,
@@ -172,57 +220,72 @@ class FlashAttnMLAMetadataBuilder(
             max_seq_len=max_seq_len,
             scheduler_metadata=scheduler_metadata,
             max_num_splits=max_num_splits,
+            dcp_tot_seq_lens=dcp_tot_seq_lens_device,
         )
+        return metadata
 
 
 class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
     can_return_lse_for_decode: bool = True
 
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
-
-        assert flash_attn_supports_mla(), \
-            "FlashAttnMLA is not supported on this device"
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **mla_args,
+        )
+
+        assert flash_attn_supports_mla(), "FlashAttnMLA is not supported on this device"
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
                 "FlashAttnMLAImpl does not support one of the following: "
-                "alibi_slopes, sliding_window, logits_soft_cap")
+                "alibi_slopes, sliding_window, logits_soft_cap"
+            )
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashAttnMLAImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "FlashAttnMLAImpl"
+            )
 
         if is_quantized_kv_cache(self.kv_cache_dtype):
             raise NotImplementedError(
-                "FlashAttnMLA V1 with FP8 KV cache not yet supported")
+                "FlashAttnMLA V1 with FP8 KV cache not yet supported"
+            )
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: FlashAttnMLAMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
@@ -230,14 +293,14 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
             q_nope, q_pe = q
         else:
             q_nope, q_pe = torch.split(
-                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+            )
 
         if self.kv_cache_dtype.startswith("fp8"):
-            raise NotImplementedError(
-                "FP8 FlashAttention MLA not yet supported")
+            raise NotImplementedError("FP8 FlashAttention MLA not yet supported")
 
-        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
-        k_pe_cache = kv_c_and_k_pe_cache[..., self.kv_lora_rank:]
+        kv_c_cache = kv_c_and_k_pe_cache[..., : self.kv_lora_rank]
+        k_pe_cache = kv_c_and_k_pe_cache[..., self.kv_lora_rank :]
 
         # NOTE(matt): During CUDA graph capture, max_query_len can be 0, but the
         # kernel uses this to calculate grid dimensions. Ensure it's at least 1
@@ -260,6 +323,9 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
             fa_version=3,  # only version 3 is supported
             scheduler_metadata=attn_metadata.decode.scheduler_metadata,
             num_splits=attn_metadata.decode.max_num_splits,
+            cp_world_size=self.dcp_world_size,
+            cp_rank=self.dcp_rank,
+            cp_tot_seqused_k=attn_metadata.decode.dcp_tot_seq_lens,
         )
 
         if self.need_to_return_lse_for_decode:
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index 701248670f72ec796b1992af96504acd673bf8c2..52bb19e039e451e2b3930fb677e563c798db0501 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -1,23 +1,46 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
+from typing import ClassVar
 
 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
 
-from vllm.attention.backends.abstract import AttentionLayer, AttentionType
+from vllm.attention.backends.abstract import (
+    AttentionLayer,
+    AttentionType,
+    MultipleOf,
+)
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
-                                                   MLACommonImpl,
-                                                   MLACommonMetadata)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.common import (
+    MLACommonBackend,
+    MLACommonImpl,
+    MLACommonMetadata,
+    MLACommonMetadataBuilder,
+    QueryLenSupport,
+)
+from vllm.v1.attention.backends.utils import AttentionCGSupport, KVCacheLayoutType
 
 logger = init_logger(__name__)
 
 FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
 
 
+class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
+    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
+
+
 class FlashInferMLABackend(MLACommonBackend):
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [32, 64]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+    ]
 
     @staticmethod
     def get_name() -> str:
@@ -27,6 +50,18 @@ class FlashInferMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["FlashInferMLAImpl"]:
         return FlashInferMLAImpl
 
+    @staticmethod
+    def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
+        return FlashInferMLAMetadataBuilder
+
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability.major == 10
+
+    @classmethod
+    def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
+        return "HND"
+
 
 g_fi_workspace = torch.zeros(
     FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
@@ -36,49 +71,61 @@ g_fi_workspace = torch.zeros(
 
 
 class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
-
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **mla_args,
+        )
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
                 "FlashInferMLAImpl does not support one of the following: "
-                "alibi_slopes, sliding_window, logits_soft_cap")
+                "alibi_slopes, sliding_window, logits_soft_cap"
+            )
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashInferMLAImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "FlashInferMLAImpl"
+            )
 
         self._workspace_buffer = g_fi_workspace
-        self.bmm1_scale: Optional[float] = None
-        self.bmm2_scale: Optional[float] = None
+        self.bmm1_scale: float | None = None
+        self.bmm2_scale: float | None = None
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
@@ -87,11 +134,18 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
             q = torch.cat([q_nope, q_pe], dim=-1)
 
         # trtllm API requires extra dimension q_len_per_request for MTP
-        q = q.unsqueeze(1)
+        if attn_metadata.num_decode_tokens % attn_metadata.num_decodes != 0:
+            logger.warning_once(
+                """FlashInferMLAImpl got a query of uneven length.
+                This usually indicates an issue in batch reordering
+                or incorrect setup in dummy_run."""
+            )
+            q = q.unsqueeze(1)
+        else:
+            q = q.view(attn_metadata.num_decodes, -1, q.shape[-2], q.shape[-1])
 
         if self.bmm1_scale is None:
-            self.bmm1_scale = (layer._q_scale_float * layer._k_scale_float *
-                               self.scale)
+            self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale
         if self.bmm2_scale is None:
             self.bmm2_scale = layer._v_scale_float
 
@@ -109,6 +163,9 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
             bmm2_scale=self.bmm2_scale,
         )
 
+        # Flatten the output for consistent shape
+        o = o.view(-1, o.shape[-2], o.shape[-1])
+
         # TODO: Return LSE pending support from Flashinfer API:
         # https://github.com/flashinfer-ai/flashinfer/pull/1566
         return o, None
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 67c21f83cf5d7e8b11e1e8a3290c0496e1037c19..3aab1f9bb7fb6adb5d10021773d188fd38db552b 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -2,37 +2,54 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionLayer, AttentionType
-from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
-                                         get_mla_metadata,
-                                         is_flashmla_supported)
+from vllm.attention.backends.abstract import AttentionLayer, AttentionType, MultipleOf
+from vllm.attention.ops.flashmla import (
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+    is_flashmla_dense_supported,
+)
 from vllm.config import VllmConfig
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
-                                                   MLACommonDecodeMetadata,
-                                                   MLACommonImpl,
-                                                   MLACommonMetadata,
-                                                   MLACommonMetadataBuilder)
-from vllm.v1.attention.backends.utils import AttentionCGSupport
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.common import (
+    MLACommonBackend,
+    MLACommonDecodeMetadata,
+    MLACommonImpl,
+    MLACommonMetadata,
+    MLACommonMetadataBuilder,
+    QueryLenSupport,
+)
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    reshape_attn_output_for_spec_decode,
+    reshape_query_for_spec_decode,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
 
 
 class FlashMLABackend(MLACommonBackend):
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+    ]
 
     @staticmethod
     def get_name() -> str:
         return "FLASHMLA"
 
-    @staticmethod
-    def get_metadata_cls() -> type["FlashMLAMetadata"]:
-        return FlashMLAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashMLAMetadataBuilder"]:
         return FlashMLAMetadataBuilder
@@ -41,6 +58,31 @@ class FlashMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["FlashMLAImpl"]:
         return FlashMLAImpl
 
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability.major in [9, 10]
+
+    @classmethod
+    def supports_combination(
+        cls,
+        head_size: int,
+        dtype: torch.dtype,
+        kv_cache_dtype: CacheDType | None,
+        block_size: int,
+        use_mla: bool,
+        has_sink: bool,
+        use_sparse: bool,
+        device_capability: DeviceCapability,
+    ) -> str | None:
+        if use_sparse:
+            from vllm.attention.ops.flashmla import is_flashmla_sparse_supported
+
+            return is_flashmla_sparse_supported()[1]
+        else:
+            from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+
+            return is_flashmla_dense_supported()[1]
+
 
 @dataclass
 class FlashMLADecodeMetadata(MLACommonDecodeMetadata):
@@ -54,19 +96,29 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
 
 
 class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.UNIFORM_BATCH
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
+    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
+    reorder_batch_threshold: int = 128  # process small prefills with decode pathway
+    # ^ TODO(matt): tune this
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
-        super().__init__(kv_cache_spec, layer_names, vllm_config, device,
-                         FlashMLAMetadata)
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        super().__init__(
+            kv_cache_spec, layer_names, vllm_config, device, FlashMLAMetadata
+        )
 
         self.num_q_heads = vllm_config.model_config.get_num_attention_heads(
-            vllm_config.parallel_config)
+            vllm_config.parallel_config
+        )
 
         self.cg_buf_tile_scheduler_metadata = None
         self.cg_buf_num_splits = None
+        self.is_fp8_kvcache = vllm_config.cache_config.cache_dtype.startswith("fp8")
 
         device_properties = torch.cuda.get_device_properties(self.device)
         num_sms = device_properties.multi_processor_count
@@ -82,19 +134,28 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
             self.cg_buf_num_splits = torch.empty(
                 (vllm_config.scheduler_config.max_num_seqs + 1),
                 device=self.device,
-                dtype=torch.int32)
-
-    def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens_cpu: torch.Tensor,
-                      seq_lens_device: torch.Tensor,
-                      query_start_loc_cpu: torch.Tensor,
-                      query_start_loc_device: torch.Tensor,
-                      num_decode_tokens: int) -> FlashMLADecodeMetadata:
-        tile_scheduler_metadata, num_splits = \
-            get_mla_metadata(
+                dtype=torch.int32,
+            )
+
+    def _build_decode(
+        self,
+        block_table_tensor: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        seq_lens_device: torch.Tensor,
+        query_start_loc_cpu: torch.Tensor,
+        query_start_loc_device: torch.Tensor,
+        num_decode_tokens: int,
+        dcp_tot_seq_lens_device: torch.Tensor | None,
+    ) -> FlashMLADecodeMetadata:
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        # we use the max but all should be the same due to uniform length requirement
+        max_query_len = query_lens_cpu.max().item()
+        num_q_tokens_per_head_k = max_query_len * self.num_q_heads // 1
+        tile_scheduler_metadata, num_splits = get_mla_metadata(
             seq_lens_device,
-            self.num_q_heads,
-            1, # MQA for the decode path
+            num_q_tokens_per_head_k,
+            1,  # MQA for the decode path
+            is_fp8_kvcache=self.is_fp8_kvcache,
         )
 
         # TODO: we can disambiguate between decode and mixed-prefill decode here
@@ -107,8 +168,9 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
             sm_parts = tile_scheduler_metadata.size(0)
             # Metadata per-SM, upper bound on size (<= #SMs, TileMetadataSize)
             assert sm_parts <= self.cg_buf_tile_scheduler_metadata.size(0)
-            tile_scheduler_metadata_view = \
-                self.cg_buf_tile_scheduler_metadata[:sm_parts]
+            tile_scheduler_metadata_view = self.cg_buf_tile_scheduler_metadata[
+                :sm_parts
+            ]
             tile_scheduler_metadata_view.copy_(tile_scheduler_metadata)
             tile_scheduler_metadata = tile_scheduler_metadata_view
 
@@ -129,54 +191,67 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
             seq_lens=seq_lens_device,
             tile_scheduler_metadata=tile_scheduler_metadata,
             num_splits=num_splits,
+            dcp_tot_seq_lens=dcp_tot_seq_lens_device,
         )
 
 
 class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
-
     can_return_lse_for_decode: bool = True
 
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
-
-        is_supported, reason = is_flashmla_supported()
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **mla_args,
+        )
+
+        is_supported, reason = is_flashmla_dense_supported()
         assert is_supported, reason
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
                 "FlashMLAImpl does not support one of the following: "
-                "alibi_slopes, sliding_window, logits_soft_cap")
+                "alibi_slopes, sliding_window, logits_soft_cap"
+            )
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashMLAImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "FlashMLAImpl"
+            )
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: FlashMLAMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         # TODO: (zyongye) decode function for mla here
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
@@ -184,20 +259,56 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
         if type(q) is tuple:
             q = torch.cat(q, dim=-1)
 
+        # mypy assertion: q is now always a tensor
         assert isinstance(q, torch.Tensor)
+
+        num_decodes = attn_metadata.num_decodes
+        q = reshape_query_for_spec_decode(q, num_decodes)
+
+        tile_scheduler_metadata = attn_metadata.decode.tile_scheduler_metadata
+        num_splits = attn_metadata.decode.num_splits
+        if vllm_is_batch_invariant():
+            device = q.device
+            dtype = torch.int32
+
+            B = q.shape[0]
+            # block_table shape: [batch_size, max_num_blocks_per_seq]
+            # The number of blocks per sequence is in the second dimension
+            topk = attn_metadata.decode.block_table.shape[-1]
+            B_TOPK = 64
+            assert topk % B_TOPK == 0, f"topk ({topk}) must be divisible by {B_TOPK}"
+            end_block_idx = topk // B_TOPK
+
+            # Single partition => num_sm_parts = 1
+            # TileSchedulerMetaDataSize = 8, layout:
+            # [begin_idx, begin_block_idx, end_idx, end_block_idx,
+            #  begin_n_split_idx, _, _, _]
+            tile_scheduler_metadata = torch.zeros((1, 8), dtype=dtype, device=device)
+            tile_scheduler_metadata[0, 0] = 0  # begin_idx
+            tile_scheduler_metadata[0, 1] = 0  # sched_begin_block_idx
+            tile_scheduler_metadata[0, 2] = B - 1  # end_idx
+            tile_scheduler_metadata[0, 3] = end_block_idx
+            tile_scheduler_metadata[0, 4] = 0  # begin_n_split_idx
+            # fields [5..7] stay 0
+
+            # Non-split path ignores num_splits, but the API requires it:
+            # zeros of length B+1
+            num_splits = torch.zeros((B + 1,), dtype=dtype, device=device)
+
         o, lse = flash_mla_with_kvcache(
-            q=q.unsqueeze(1),  # Add seqlen dim of 1 (decode)
+            q=q,
             k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1
             block_table=attn_metadata.decode.block_table,
             cache_seqlens=attn_metadata.decode.seq_lens,
             head_dim_v=self.kv_lora_rank,
-            tile_scheduler_metadata=attn_metadata.decode.
-            tile_scheduler_metadata,
-            num_splits=attn_metadata.decode.num_splits,
+            tile_scheduler_metadata=tile_scheduler_metadata,
+            num_splits=num_splits,
             softmax_scale=self.scale,
             causal=True,
             descale_q=layer._q_scale.reshape(1),
             descale_k=layer._k_scale.reshape(1),
         )
 
+        o = reshape_attn_output_for_spec_decode(o)
+
         return o, lse
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index 36c3c188042ccea066c9656c086a1cd69b73762c..bb8d914d157193c1f27b721739792f9dba0aa1a2 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, ClassVar, Optional
 
@@ -8,21 +7,30 @@ import numpy as np
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionLayer,
+    MultipleOf,
+)
 from vllm.attention.backends.utils import get_mla_dims
-from vllm.attention.ops.flashmla import (flash_mla_sparse_prefill,
-                                         flash_mla_with_kvcache,
-                                         get_mla_metadata)
+from vllm.attention.ops.flashmla import (
+    flash_mla_sparse_prefill,
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+)
 from vllm.config import VllmConfig
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.platforms.interface import DeviceCapability
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 if TYPE_CHECKING:
@@ -32,35 +40,27 @@ logger = init_logger(__name__)
 """
 NOTE: FlashMLA Sparse uses an fp8 cache with the following format
 
-In the "FP8 with scale" format, each token's KV cache is 656 Bytes, 
+In the "FP8 with scale" format, each token's KV cache is 656 Bytes,
 structured as:
--   **First 512 bytes:** The "quantized NoPE" part, containing 512 
+-   **First 512 bytes:** The "quantized NoPE" part, containing 512
     `float8_e4m3` values.
--   **Next 16 bytes:** Scale factors, containing 4 `float32` values. 
-    The first `float32` is the scale for the first 128 `float8_e4m3` values, 
+-   **Next 16 bytes:** Scale factors, containing 4 `float32` values.
+    The first `float32` is the scale for the first 128 `float8_e4m3` values,
     the second for the next 128, and so on.
--   **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This 
+-   **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This
     part is not quantized for accuracy.
 """
 
 
-def _lse2_to_lse(lse_base2: torch.Tensor) -> torch.Tensor:
-    # Convert base-2 LSE to natural-log LSE
-    # Keep FP32 for numerical stability during the merge.
-    return (lse_base2.to(torch.float32) * math.log(2.0))
-
-
 class FlashMLASparseBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto", "fp8_ds_mla"]
 
     @staticmethod
     def get_name() -> str:
-        return "FLASHMLA_SPARSE_VLLM_V1"
-
-    @staticmethod
-    def get_metadata_cls() -> type[AttentionMetadata]:
-        return FlashMLASparseMetadata
+        return "FLASHMLA_SPARSE"
 
     @staticmethod
     def get_builder_cls() -> type["FlashMLASparseMetadataBuilder"]:
@@ -70,6 +70,22 @@ class FlashMLASparseBackend(AttentionBackend):
     def get_impl_cls() -> type["FlashMLASparseImpl"]:
         return FlashMLASparseImpl
 
+    @classmethod
+    def get_supported_head_sizes(cls) -> list[int]:
+        return [576]
+
+    @classmethod
+    def is_mla(cls) -> bool:
+        return True
+
+    @classmethod
+    def is_sparse(cls) -> bool:
+        return True
+
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return capability.major in [9, 10]
+
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -85,43 +101,6 @@ class FlashMLASparseBackend(AttentionBackend):
         else:
             return (num_blocks, block_size, head_size)
 
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.bfloat16]
-
-    @classmethod
-    def get_supported_head_sizes(cls) -> list[int]:
-        return [576]
-
-
-@dataclass
-class MLASparsePrefillMetadata:
-    # NOTE(Chen): not call it "FlashMLASparsePrefillMetadata" because
-    # the kernel is not from flashmla
-    block_table: torch.Tensor
-    has_context: bool = False
-    context_lens: Optional[torch.Tensor] = None
-
-
-@dataclass
-class FlashMLASparseDecodeAndContextMetadata:
-    scheduler_metadata: torch.Tensor = None
-    num_splits: torch.Tensor = None
-    cache_lens: torch.Tensor = None
-    prefill_context_lengths: Optional[torch.Tensor] = None
-    prefill_new_k_start_locs: Optional[torch.Tensor] = None
-    dummy_block_table: torch.Tensor = None
-
-    def filter_prefill_indices(
-            self, indices: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        assert self.prefill_context_lengths is not None
-        prefill_context_lengths = self.prefill_context_lengths.unsqueeze(-1)
-        context_indices = torch.where(indices < prefill_context_lengths,
-                                      indices, -1)
-        new_token_indices = torch.where(indices >= prefill_context_lengths,
-                                        indices - prefill_context_lengths, -1)
-        return context_indices, new_token_indices
-
 
 @dataclass
 class FlashMLASparseMetadata:
@@ -140,12 +119,12 @@ class FlashMLASparseMetadata:
 
     @dataclass
     class FP8KernelMetadata:
-        scheduler_metadata: Optional[torch.Tensor]
+        scheduler_metadata: torch.Tensor | None
         num_splits: torch.Tensor
         dummy_block_table: torch.Tensor
         cache_lens: torch.Tensor
 
-    fp8_extra_metadata: Optional[FP8KernelMetadata] = None
+    fp8_extra_metadata: FP8KernelMetadata | None = None
 
 
 @triton.jit
@@ -194,8 +173,9 @@ def _convert_req_index_to_global_index_kernel(
     base = tl.load(bt_ptr, mask=valid_block, other=0)
 
     # If token == -1 OR block_id OOB, output -1; else base * BLOCK_SIZE + offset
-    out_val = tl.where(is_invalid_tok | (~valid_block), -1,
-                       base * BLOCK_SIZE + inblock_off)
+    out_val = tl.where(
+        is_invalid_tok | (~valid_block), -1, base * BLOCK_SIZE + inblock_off
+    )
 
     # Store results
     out_ptr_ij = out_ptr + token_id * out_stride0 + indice_id * out_stride1
@@ -203,31 +183,30 @@ def _convert_req_index_to_global_index_kernel(
 
 
 def triton_convert_req_index_to_global_index(
-        req_id: torch.Tensor,  # int32 [num_tokens]
-        block_table: torch.
-    Tensor,  # int32 [num_requests, max_num_blocks_per_req]
-        token_indices: torch.Tensor,  # int32 [num_tokens, NUM_TOPK_TOKENS]
-        BLOCK_SIZE: int = 64,
-        NUM_TOPK_TOKENS: int = 2048,
-        BLOCK_N: int = 128,  # tile width along columns
+    req_id: torch.Tensor,  # int32 [num_tokens]
+    block_table: torch.Tensor,  # int32 [num_requests, max_num_blocks_per_req]
+    token_indices: torch.Tensor,  # int32 [num_tokens, NUM_TOPK_TOKENS]
+    BLOCK_SIZE: int = 64,
+    NUM_TOPK_TOKENS: int = 2048,
+    BLOCK_N: int = 128,  # tile width along columns
 ):
     """
     out[token_id, indice_id] =
-        block_table[req_id[token_id], 
+        block_table[req_id[token_id],
             token_indices[token_id, indice_id] // BLOCK_SIZE] * BLOCK_SIZE
         + token_indices[token_id, indice_id] % BLOCK_SIZE
 
     Only when token_indices[token_id, indice_id] == -1 do we output -1.
-    For safety, we also output -1 if the derived block_id would be 
+    For safety, we also output -1 if the derived block_id would be
         out-of-bounds.
     """
     assert req_id.dtype == torch.int32
     assert block_table.dtype == torch.int32
     assert token_indices.dtype == torch.int32
     assert token_indices.shape[1] == NUM_TOPK_TOKENS
-    assert NUM_TOPK_TOKENS % BLOCK_N == 0, \
-        f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible by" \
-        f"BLOCK_N ({BLOCK_N})"
+    assert NUM_TOPK_TOKENS % BLOCK_N == 0, (
+        f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible byBLOCK_N ({BLOCK_N})"
+    )
 
     num_tokens = req_id.shape[0]
     num_requests, max_num_blocks_per_req = block_table.shape
@@ -268,14 +247,16 @@ def triton_convert_req_index_to_global_index(
 
 
 @dataclass
-class FlashMLASparseMetadataBuilder(
-        AttentionMetadataBuilder[FlashMLASparseMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.UNIFORM_BATCH
-
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetadata]):
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
 
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         cache_config = vllm_config.cache_config
         self.kv_cache_spec = kv_cache_spec
         self.model_config = vllm_config.model_config
@@ -285,28 +266,27 @@ class FlashMLASparseMetadataBuilder(
         props = torch.cuda.get_device_properties(device)
         sm_count = props.multi_processor_count
 
-        self.num_heads = self.model_config.get_num_attention_heads(
-            parallel_config)
+        self.num_heads = self.model_config.get_num_attention_heads(parallel_config)
         self.mla_dims = get_mla_dims(self.model_config)
         self.topk_tokens = vllm_config.model_config.hf_config.index_topk
         self.use_fp8_kv_cache = cache_config.cache_dtype == "fp8_ds_mla"
-        self.topk_tokens_tensor = torch.tensor([self.topk_tokens],
-                                               device=device,
-                                               dtype=torch.int32)
+        self.topk_tokens_tensor = torch.tensor(
+            [self.topk_tokens], device=device, dtype=torch.int32
+        )
         self.max_model_len_tensor = torch.tensor(
-            [self.model_config.max_model_len],
-            device=device,
-            dtype=torch.int32)
+            [self.model_config.max_model_len], device=device, dtype=torch.int32
+        )
         # this is ignored by `flash_mla_with_kvcache` if indices not None
-        self.dummy_block_table = torch.empty((1, 1),
-                                             dtype=torch.int32,
-                                             device=self.device)
+        self.dummy_block_table = torch.empty(
+            (1, 1), dtype=torch.int32, device=self.device
+        )
 
         # Equation taken from FlashMLA/csrc/pybind.cpp
         h_q, h_k = self.num_heads, 1
         s_q = 1  # inversely proportional to s_q, so s_q = 1 is the largest
         max_num_sm_parts = int(
-            max((sm_count // 2) / h_k // (cdiv(h_q // h_k, 2 * 64) * s_q), 1))
+            max((sm_count // 2) / h_k // (cdiv(h_q // h_k, 2 * 64) * s_q), 1)
+        )
         if current_platform.is_device_capability(100):
             max_num_sm_parts *= 2
         self.tile_scheduler_metadata_buffer = torch.empty(
@@ -314,34 +294,38 @@ class FlashMLASparseMetadataBuilder(
             # see: FlashMLA/csrc/params.h
             (max_num_sm_parts, 8),
             dtype=torch.int32,
-            device=device)
+            device=device,
+        )
         self.num_splits_buffer = torch.empty(
             # We pack all the tokens into one batch for sparse attention.
             # Otherwise, we can exceed the sm of `get_mla_metadata`.
-            (
-                2, ),
+            (2,),
             dtype=torch.int32,
-            device=device)
+            device=device,
+        )
         self.req_id_per_token_buffer = torch.empty(
-            (vllm_config.scheduler_config.max_num_batched_tokens, ),
+            (vllm_config.scheduler_config.max_num_batched_tokens,),
             dtype=torch.int32,
-            device=device)
-
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> FlashMLASparseMetadata:
+            device=device,
+        )
 
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> FlashMLASparseMetadata:
         num_tokens = common_attn_metadata.num_actual_tokens
-        starts = np.asarray(common_attn_metadata.query_start_loc_cpu,
-                            dtype=np.int32)
+        starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32)
         seg_lengths = np.diff(starts)
         req_id_per_token = np.repeat(
-            np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths)
+            np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths
+        )
         # Zero-fill for cudagraphs
         self.req_id_per_token_buffer.fill_(0)
-        self.req_id_per_token_buffer[:req_id_per_token.shape[0]]\
-            .copy_(torch.from_numpy(req_id_per_token), non_blocking=True)
+        self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_(
+            torch.from_numpy(req_id_per_token), non_blocking=True
+        )
         req_id_per_token = self.req_id_per_token_buffer[:num_tokens]
 
         fp8_extra_metadata = None
@@ -357,8 +341,9 @@ class FlashMLASparseMetadataBuilder(
 
             num_sm_parts = tile_scheduler_metadata.size(0)
             # Copy to persistent buffer for full-CG support
-            tile_scheduler_metadata_buffer = \
-                self.tile_scheduler_metadata_buffer[:num_sm_parts]
+            tile_scheduler_metadata_buffer = self.tile_scheduler_metadata_buffer[
+                :num_sm_parts
+            ]
             tile_scheduler_metadata_buffer.copy_(tile_scheduler_metadata)
             self.num_splits_buffer.copy_(num_splits)
 
@@ -371,7 +356,8 @@ class FlashMLASparseMetadataBuilder(
                 # accidentally mark indices invalid, we will use -1 exclusively
                 # to mark invalid indices
                 cache_lens=self.max_model_len_tensor,
-                dummy_block_table=self.dummy_block_table)
+                dummy_block_table=self.dummy_block_table,
+            )
 
         metadata = FlashMLASparseMetadata(
             num_reqs=common_attn_metadata.num_reqs,
@@ -390,62 +376,79 @@ class FlashMLASparseMetadataBuilder(
 
 
 class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
-
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            topk_indice_buffer: Optional[torch.Tensor] = None,
-            indexer: Optional["Indexer"] = None,
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        topk_indice_buffer: torch.Tensor | None = None,
+        indexer: Optional["Indexer"] = None,
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **mla_args,
+        )
         self.softmax_scale = scale
         assert indexer is not None
         self.topk_indices_buffer = indexer.topk_indices_buffer
-        self.padding = 128 if current_platform.is_device_capability(
-            100) else 64
+        self.padding = 128 if current_platform.is_device_capability(100) else 64
 
     def _forward_bf16_kv(
-            self, q: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor,
-            topk_indices: torch.Tensor,
-            attn_metadata: FlashMLASparseMetadata) -> torch.Tensor:
+        self,
+        q: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        topk_indices: torch.Tensor,
+        attn_metadata: FlashMLASparseMetadata,
+    ) -> torch.Tensor:
         num_tokens = q.shape[0]
         kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view(
-            -1, 1, kv_c_and_k_pe_cache.shape[-1])
+            -1, 1, kv_c_and_k_pe_cache.shape[-1]
+        )
 
         # NOTE(Chen): kernel requires num_local_head to be a multiple of
         # 64 on hopper and 128 on blackwell
         if self.num_heads % self.padding != 0:
             assert self.padding % self.num_heads == 0
-            logger.warning_once(f"padding num_heads to {self.padding} \
-                    due to sparse attn kernel requirement")
+            logger.warning_once(
+                f"padding num_heads to {self.padding} \
+                    due to sparse attn kernel requirement"
+            )
             q_padded = q.new_empty((q.shape[0], self.padding, q.shape[2]))
-            q_padded[:, :self.num_heads, :] = q
+            q_padded[:, : self.num_heads, :] = q
             q = q_padded
 
         topk_indices = topk_indices.view(num_tokens, 1, -1)
-        output = flash_mla_sparse_prefill(q, kv_c_and_k_pe_cache, topk_indices,
-                                          self.softmax_scale)[0]
-        output = output[:, :self.num_heads, :]
+        output = flash_mla_sparse_prefill(
+            q, kv_c_and_k_pe_cache, topk_indices, self.softmax_scale
+        )[0]
+        output = output[:, : self.num_heads, :]
         return output
 
-    def _forward_fp8_kv(self, q: torch.Tensor,
-                        kv_c_and_k_pe_cache: torch.Tensor,
-                        topk_indices: torch.Tensor,
-                        attn_metadata: FlashMLASparseMetadata) -> torch.Tensor:
-
+    def _forward_fp8_kv(
+        self,
+        q: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        topk_indices: torch.Tensor,
+        attn_metadata: FlashMLASparseMetadata,
+    ) -> torch.Tensor:
         assert attn_metadata.fp8_extra_metadata is not None
         extra_metadata = attn_metadata.fp8_extra_metadata
 
@@ -472,9 +475,9 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         k_pe: torch.Tensor,  # value in unified attn
         kv_cache: torch.Tensor,
         attn_metadata: FlashMLASparseMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # NOTE(lucas): for the sparse FlashMLA kernels the kernels want to use
         # MQA 576/512 approach for both prefill and decode
@@ -483,8 +486,8 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
 
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for MLACommonImpl")
+                "fused output quantization is not yet supported for MLACommonImpl"
+            )
 
         if attn_metadata is None:
             # The zero fill is required when used with DP + EP
@@ -500,8 +503,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         k_c_normed = k_c_normed[:num_actual_toks, ...]
         k_pe = k_pe[:num_actual_toks, ...]
 
-        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
-                               dim=-1)
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
         # Convert from (B, N, P) to (N, B, P)
         q_nope = q_nope.transpose(0, 1)
         # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
@@ -534,11 +536,13 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
             )
 
         if self.kv_cache_dtype != "fp8_ds_mla":
-            attn_out = self._forward_bf16_kv(q, kv_cache, topk_indices_global,
-                                             attn_metadata)
+            attn_out = self._forward_bf16_kv(
+                q, kv_cache, topk_indices_global, attn_metadata
+            )
         else:
-            attn_out = self._forward_fp8_kv(q, kv_cache, topk_indices_global,
-                                            attn_metadata)
+            attn_out = self._forward_fp8_kv(
+                q, kv_cache, topk_indices_global, attn_metadata
+            )
 
         self._v_up_proj(attn_out, out=output[:num_actual_toks])
         return output
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index 94b963f34e4a38c6e0423f7ce883fbfa66f8ff83..37aa5dad89a0ebd185e5c08a8fcf222929050813 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -1,28 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    MultipleOf,
+)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              split_decodes_and_prefills)
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    split_decodes_and_prefills,
+)
 
 logger = init_logger(__name__)
 
 
 class DeepseekV32IndexerBackend(AttentionBackend):
-
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return DeepseekV32IndexerMetadata
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -76,7 +77,6 @@ class DeepSeekV32IndexerDecodeMetadata:
 
 @dataclass
 class DeepseekV32IndexerMetadata:
-
     # FIXME (zyongye)
     # hacky way to access the data now, need to be in chunked meta
     seq_lens: torch.Tensor
@@ -98,33 +98,33 @@ class DeepseekV32IndexerMetadata:
     num_prefills: int
     num_prefill_tokens: int
 
-    decode: Optional[DeepSeekV32IndexerDecodeMetadata] = None
-    prefill: Optional[DeepseekV32IndexerPrefillMetadata] = None
+    decode: DeepSeekV32IndexerDecodeMetadata | None = None
+    prefill: DeepseekV32IndexerPrefillMetadata | None = None
 
 
 # TODO (zyongye) optimize this, this is now vibe coded
 def kv_spans_from_batches(
-        start_seq_loc: torch.Tensor, seq_len_per_batch: torch.Tensor,
-        device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
+    start_seq_loc: torch.Tensor, seq_len_per_batch: torch.Tensor, device: torch.device
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Args:
-      start_seq_loc: 1D long tensor [B+1], cumulative counts of 
+      start_seq_loc: 1D long tensor [B+1], cumulative counts of
                      selected tokens per batch.
-            Example: [0, 2, 4, 7] -> 
+            Example: [0, 2, 4, 7] ->
                      batch sizes (selected) [2, 2, 3], N=7 tokens total.
-      seq_len_per_batch: 1D long tensor [B], 
+      seq_len_per_batch: 1D long tensor [B],
                          full sequence length (KV length) of each batch.
                          Example: [5, 9, 4].
 
     Returns:
-      start_tensor: 1D long tensor [N], start offset in the 
+      start_tensor: 1D long tensor [N], start offset in the
                     concatenated KV cache for each token's batch.
-      end_location: 1D long tensor [N], 
+      end_location: 1D long tensor [N],
                     **exclusive** end = start + token's local position.
                     (So the attended KV slice is kv[start:end].)
 
-    Assumes each batch contributes its full `seq_len_per_batch[i]` 
-    keys to the KV cache, andthe selected tokens within a batch 
+    Assumes each batch contributes its full `seq_len_per_batch[i]`
+    keys to the KV cache, andthe selected tokens within a batch
     are the **last** `counts[i]` positions of that sequence.
     """
     q = start_seq_loc.to(dtype=torch.long)
@@ -138,8 +138,10 @@ def kv_spans_from_batches(
     B = L.numel()
 
     if N == 0:
-        return (torch.empty(0, dtype=torch.long, device=device),
-                torch.empty(0, dtype=torch.long, device=device))
+        return (
+            torch.empty(0, dtype=torch.long, device=device),
+            torch.empty(0, dtype=torch.long, device=device),
+        )
 
     # KV start offsets per batch in the concatenated KV cache
     kv_starts_per_batch = torch.cumsum(L, dim=0) - L  # [B]
@@ -155,8 +157,9 @@ def kv_spans_from_batches(
     L_expand = torch.repeat_interleave(L, counts)  # [N]
     m_expand = torch.repeat_interleave(counts, counts)  # [N]
     # position within the selected block: 1..counts[b]
-    pos_within = (torch.arange(N, dtype=torch.long) -
-                  torch.repeat_interleave(q[:-1], counts) + 1)
+    pos_within = (
+        torch.arange(N, dtype=torch.long) - torch.repeat_interleave(q[:-1], counts) + 1
+    )
 
     local_pos = L_expand - m_expand + pos_within  # [N], 1-based
     end_location = start_tensor + local_pos  # exclusive end
@@ -171,9 +174,9 @@ def get_max_prefill_buffer_size(vllm_config: VllmConfig):
     return max_model_len * 2
 
 
-def split_prefill_chunks(seq_lens_cpu: torch.Tensor,
-                         max_prefill_buffer_size: int,
-                         reqs_start: int) -> list[tuple[int, int]]:
+def split_prefill_chunks(
+    seq_lens_cpu: torch.Tensor, max_prefill_buffer_size: int, reqs_start: int
+) -> list[tuple[int, int]]:
     """
     Split the prefill chunks into a list of tuples of (reqs_start, reqs_end)
     such that the total sequence length of each chunk is less than the
@@ -183,7 +186,7 @@ def split_prefill_chunks(seq_lens_cpu: torch.Tensor,
         seq_lens_cpu: The sequence lengths of the prefill requests.
         max_prefill_buffer_size: The maximum prefill buffer size.
         reqs_start: The start index of the prefill requests.
-    
+
     Returns:
         A list of tuples of (reqs_start, reqs_end).
     """
@@ -203,20 +206,22 @@ def split_prefill_chunks(seq_lens_cpu: torch.Tensor,
 
 
 class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
+    _cudagraph_support: ClassVar[AttentionCGSupport] = (
         AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    )
 
     reorder_batch_threshold: int = 1
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         scheduler_config = self.vllm_config.scheduler_config
-        #NOTE(Chen):an estimated max size of flattened_kv. Need to double check.
-        self.max_prefill_buffer_size = get_max_prefill_buffer_size(
-            self.vllm_config)
+        # NOTE(Chen):an estimated max size of flattened_kv. Need to double check.
+        self.max_prefill_buffer_size = get_max_prefill_buffer_size(self.vllm_config)
         self.num_speculative_tokens = (
             self.vllm_config.speculative_config.num_speculative_tokens
-            if self.vllm_config.speculative_config else 0)
+            if self.vllm_config.speculative_config
+            else 0
+        )
         # Now deepgemm fp8_paged_mqa_logits does not support next_n > 2
         self.reorder_batch_threshold += min(self.num_speculative_tokens, 1)
 
@@ -225,31 +230,38 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
         self.num_sms = sm_count
 
         self.decode_lens_buffer = torch.empty(
-            (scheduler_config.max_num_seqs, ),
-            dtype=torch.int32,
-            device=self.device)
+            (scheduler_config.max_num_seqs,), dtype=torch.int32, device=self.device
+        )
 
         # See: DeepGMM/csrc/apis/attention.hpp
-        self.scheduler_metadata_buffer = torch.empty((self.num_sms + 1, 2),
-                                                     dtype=torch.int32,
-                                                     device=self.device)
-
-    def build_one_prefill_chunk(self, reqs_start, reqs_end,
-                                query_start_loc_cpu, seq_lens_cpu,
-                                block_table):
-        prefill_query_start_loc = query_start_loc_cpu[
-            reqs_start:reqs_end + 1] - query_start_loc_cpu[reqs_start]
+        self.scheduler_metadata_buffer = torch.empty(
+            (self.num_sms + 1, 2), dtype=torch.int32, device=self.device
+        )
+
+    def build_one_prefill_chunk(
+        self, reqs_start, reqs_end, query_start_loc_cpu, seq_lens_cpu, block_table
+    ):
+        prefill_query_start_loc = (
+            query_start_loc_cpu[reqs_start : reqs_end + 1]
+            - query_start_loc_cpu[reqs_start]
+        )
         cu_seqlen_ks, cu_seqlen_ke = kv_spans_from_batches(
-            prefill_query_start_loc, seq_lens_cpu[reqs_start:reqs_end],
-            self.device)
+            prefill_query_start_loc, seq_lens_cpu[reqs_start:reqs_end], self.device
+        )
         token_start = query_start_loc_cpu[reqs_start].item()
         token_end = query_start_loc_cpu[reqs_end].item()
         total_seq_lens = seq_lens_cpu[reqs_start:reqs_end].sum()
         assert total_seq_lens <= self.max_prefill_buffer_size
-        cu_seq_lens = torch.cat([
-            torch.zeros(1, dtype=torch.int32),
-            seq_lens_cpu[reqs_start:reqs_end].cumsum(dim=0)
-        ]).to(torch.int32).to(self.device)
+        cu_seq_lens = (
+            torch.cat(
+                [
+                    torch.zeros(1, dtype=torch.int32),
+                    seq_lens_cpu[reqs_start:reqs_end].cumsum(dim=0),
+                ]
+            )
+            .to(torch.int32)
+            .to(self.device)
+        )
         return DeepseekV32IndexerPrefillChunkMetadata(
             cu_seqlen_ks=cu_seqlen_ks,
             cu_seqlen_ke=cu_seqlen_ke,
@@ -261,19 +273,21 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
             num_reqs=reqs_end - reqs_start,
         )
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> DeepseekV32IndexerMetadata:
-
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> DeepseekV32IndexerMetadata:
         num_reqs = common_attn_metadata.num_reqs
         num_tokens = common_attn_metadata.num_actual_tokens
 
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
+        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold)
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
 
         assert num_decodes + num_prefills == num_reqs
         assert num_decode_tokens + num_prefill_tokens == num_tokens
@@ -287,33 +301,39 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
             )
             chunks = [
                 self.build_one_prefill_chunk(
-                    reqs_start, reqs_end, query_start_loc_cpu,
+                    reqs_start,
+                    reqs_end,
+                    query_start_loc_cpu,
                     common_attn_metadata.seq_lens_cpu,
-                    common_attn_metadata.block_table_tensor)
+                    common_attn_metadata.block_table_tensor,
+                )
                 for reqs_start, reqs_end in chunk_seq_ids
             ]
             prefill_metadata = DeepseekV32IndexerPrefillMetadata(
-                chunks=chunks, )
+                chunks=chunks,
+            )
 
         decode_metadata = None
         if num_decodes > 0:
-            torch.diff(common_attn_metadata.query_start_loc[:num_decodes + 1],
-                       out=self.decode_lens_buffer[:num_decodes])
+            torch.diff(
+                common_attn_metadata.query_start_loc[: num_decodes + 1],
+                out=self.decode_lens_buffer[:num_decodes],
+            )
             decode_lens = self.decode_lens_buffer[:num_decodes]
             decode_lens_cpu = torch.diff(
-                common_attn_metadata.query_start_loc_cpu[:num_decodes + 1])
+                common_attn_metadata.query_start_loc_cpu[: num_decodes + 1]
+            )
 
             # Use CPU to avoid GPU sync; breaking async scheduling
-            requires_padding = (decode_lens_cpu.max()
-                                > decode_lens_cpu.min()).item()
+            requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item()
 
             seq_lens = common_attn_metadata.seq_lens[:num_decodes]
 
             self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata(
-                seq_lens, self.kv_cache_spec.block_size, self.num_sms)
+                seq_lens, self.kv_cache_spec.block_size, self.num_sms
+            )
             decode_metadata = DeepSeekV32IndexerDecodeMetadata(
-                block_table=common_attn_metadata.
-                block_table_tensor[:num_decodes, ...],
+                block_table=common_attn_metadata.block_table_tensor[:num_decodes, ...],
                 seq_lens=common_attn_metadata.seq_lens[:num_decodes],
                 decode_lens=decode_lens,
                 requires_padding=requires_padding,
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 79247e569b1c0af4dcfdcfd3402e7d5daeb3ca04..e1864526f02cc9a34676f7b3a01772a2dbb46ca8 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -2,35 +2,26 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
-import vllm.envs as envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.attention.backends.abstract import AttentionLayer
-from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
 from vllm.config import VllmConfig
-from vllm.utils import cdiv
-# yapf conflicts with isort for this docstring
-# yapf: disable
-from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
-                                                   MLACommonDecodeMetadata,
-                                                   MLACommonImpl,
-                                                   MLACommonMetadata,
-                                                   MLACommonMetadataBuilder)
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.mla.common import (
+    MLACommonBackend,
+    MLACommonDecodeMetadata,
+    MLACommonImpl,
+    MLACommonMetadata,
+    MLACommonMetadataBuilder,
+)
 from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
 
-# yapf: enable
-
-
-def is_aiter_mla_enabled() -> bool:
-    return envs.VLLM_ROCM_USE_AITER \
-        and envs.VLLM_ROCM_USE_AITER_MLA
-
 
 class AiterMLABackend(MLACommonBackend):
-
     @staticmethod
     def get_name() -> str:
         return "ROCM_AITER_MLA"
@@ -39,10 +30,6 @@ class AiterMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["AiterMLAImpl"]:
         return AiterMLAImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AiterMLAMetadata"]:
-        return AiterMLAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["AiterMLAMetadataBuilder"]:
         return AiterMLAMetadataBuilder
@@ -51,14 +38,14 @@ class AiterMLABackend(MLACommonBackend):
 @dataclass
 class AiterMLADecodeMetadata(MLACommonDecodeMetadata):
     # The indptr of the paged kv cache, shape: [batch_size + 1]
-    paged_kv_indptr: Optional[torch.Tensor] = None
+    paged_kv_indptr: torch.Tensor | None = None
     # The page indices of the paged kv cache
-    paged_kv_indices: Optional[torch.Tensor] = None
+    paged_kv_indices: torch.Tensor | None = None
     # The number of entries in the last page of each request in
     # the paged kv cache, shape: [batch_size]
-    paged_kv_last_page_len: Optional[torch.Tensor] = None
+    paged_kv_last_page_len: torch.Tensor | None = None
     # The query indptr, shape : [num_decode + 1]
-    qo_indptr: Optional[torch.Tensor] = None
+    qo_indptr: torch.Tensor | None = None
 
 
 class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
@@ -68,19 +55,25 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
 class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
     # TODO(luka, lucas): audit this as part of:
     #  https://github.com/vllm-project/vllm/issues/22945
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
+    _cudagraph_support: ClassVar[AttentionCGSupport] = (
         AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    )
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
-        super().__init__(kv_cache_spec, layer_names, vllm_config, device,
-                         AiterMLAMetadata)
-        assert self.kv_cache_spec.block_size == 1, "AITER MLA" \
-            "only supports block size 1."
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        super().__init__(
+            kv_cache_spec, layer_names, vllm_config, device, AiterMLAMetadata
+        )
 
         self.compilation_config = vllm_config.compilation_config
-        max_num_pages_per_req = cdiv(vllm_config.model_config.max_model_len,
-                                     self.kv_cache_spec.block_size)
+        max_num_pages_per_req = cdiv(
+            vllm_config.model_config.max_model_len, self.kv_cache_spec.block_size
+        )
         max_num_reqs = vllm_config.scheduler_config.max_num_seqs
         max_num_pages = max_num_reqs * max_num_pages_per_req
 
@@ -89,74 +82,106 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         # so we can only use the persistent buffer if a cudagraph is actually
         # being used.
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
-            self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
-                                               dtype=torch.int32,
-                                               device=device)
-            self.paged_kv_indices = torch.zeros(max_num_pages,
-                                                dtype=torch.int32,
-                                                device=device)
-            self.paged_kv_last_page_len = torch.zeros(max_num_reqs,
-                                                      dtype=torch.int32,
-                                                      device=device)
-
-            self.qo_indptr = torch.arange(0,
-                                          max_num_reqs + 1,
-                                          dtype=torch.int32,
-                                          device=device)
-
-    def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens_cpu: torch.Tensor,
-                      seq_lens_device: torch.Tensor,
-                      query_start_loc_cpu: torch.Tensor,
-                      query_start_loc_device: torch.Tensor,
-                      num_decode_tokens: int) -> AiterMLADecodeMetadata:
+            self.block_table_remapping = torch.zeros(
+                [max_num_reqs, max_num_pages_per_req * self.kv_cache_spec.block_size],
+                dtype=torch.int32,
+                device=device,
+            )
+            self.paged_kv_indptr = torch.zeros(
+                max_num_reqs + 1, dtype=torch.int32, device=device
+            )
+            self.paged_kv_indices = torch.zeros(
+                max_num_pages, dtype=torch.int32, device=device
+            )
+            self.paged_kv_last_page_len = torch.zeros(
+                max_num_reqs, dtype=torch.int32, device=device
+            )
+
+            self.qo_indptr = torch.arange(
+                0, max_num_reqs + 1, dtype=torch.int32, device=device
+            )
+
+    def _build_decode(
+        self,
+        block_table_tensor: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        seq_lens_device: torch.Tensor,
+        query_start_loc_cpu: torch.Tensor,
+        query_start_loc_device: torch.Tensor,
+        num_decode_tokens: int,
+        dcp_tot_seq_lens_device: torch.Tensor | None,
+    ) -> AiterMLADecodeMetadata:
         page_size = self.kv_cache_spec.block_size
-        block_table_bounds = (seq_lens_device + page_size - 1) // page_size
         device = self.device
         num_reqs = seq_lens_device.size(0)
+        bs, _ = block_table_tensor.shape
+        block_table_tensor = (
+            block_table_tensor.unsqueeze(-1).expand(-1, -1, page_size) * page_size
+        )
+        block_table_tensor = (
+            block_table_tensor
+            + torch.arange(
+                0,
+                page_size,
+                device=block_table_tensor.device,
+                dtype=block_table_tensor.dtype,
+            )[None, None, :]
+        )
+        block_table_tensor = block_table_tensor.view(bs, -1)
 
-        mask = (torch.arange(block_table_tensor.size(1),
-                             dtype=block_table_tensor.dtype,
-                             device=device).unsqueeze(0)
-                < block_table_bounds.unsqueeze(1))
+        # after remapping, we assume the block size already equals to 1
+
+        max_blk_size_per_req = block_table_tensor.shape[-1]
+        mask = torch.arange(
+            block_table_tensor.size(1), dtype=block_table_tensor.dtype, device=device
+        ).unsqueeze(0) < seq_lens_device.unsqueeze(1)
         paged_kv_indices = block_table_tensor[mask]
 
         paged_kv_last_page_len = seq_lens_device % page_size
-        paged_kv_last_page_len = torch.where(paged_kv_last_page_len == 0,
-                                             page_size, paged_kv_last_page_len)
+        paged_kv_last_page_len = torch.where(
+            paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len
+        )
 
-        paged_kv_indptr = torch.cat([
-            torch.zeros(1, dtype=block_table_bounds.dtype, device=device),
-            block_table_bounds.cumsum(dim=0, dtype=torch.int32)
-        ])
+        paged_kv_indptr = torch.cat(
+            [
+                torch.zeros(1, dtype=seq_lens_device.dtype, device=device),
+                seq_lens_device.cumsum(dim=0, dtype=torch.int32),
+            ]
+        )
 
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
-
             num_actual_pages = paged_kv_indices.size(0)
-
-            self.paged_kv_indices[:num_actual_pages].copy_(paged_kv_indices,
-                                                           non_blocking=True)
+            self.block_table_remapping[:num_reqs, :max_blk_size_per_req].copy_(
+                block_table_tensor, non_blocking=True
+            )
+            block_table_tensor = self.block_table_remapping[
+                :num_reqs, :max_blk_size_per_req
+            ]
+
+            self.paged_kv_indices[:num_actual_pages].copy_(
+                paged_kv_indices, non_blocking=True
+            )
             self.paged_kv_indices[num_actual_pages:].fill_(-1)
             paged_kv_indices = self.paged_kv_indices[:num_actual_pages]
 
-            self.paged_kv_indptr[:1 + num_reqs].copy_(paged_kv_indptr,
-                                                      non_blocking=True)
-            self.paged_kv_indptr[1 + num_reqs:].fill_(paged_kv_indptr[-1])
-            paged_kv_indptr = self.paged_kv_indptr[:1 + num_reqs]
+            self.paged_kv_indptr[: 1 + num_reqs].copy_(
+                paged_kv_indptr, non_blocking=True
+            )
+            self.paged_kv_indptr[1 + num_reqs :].fill_(paged_kv_indptr[-1])
+            paged_kv_indptr = self.paged_kv_indptr[: 1 + num_reqs]
 
             self.paged_kv_last_page_len[:num_reqs].copy_(
-                paged_kv_last_page_len, non_blocking=True)
+                paged_kv_last_page_len, non_blocking=True
+            )
             self.paged_kv_last_page_len[num_reqs:].fill_(1)
             paged_kv_last_page_len = self.paged_kv_last_page_len[:num_reqs]
 
-            qo_indptr = self.qo_indptr[:1 + num_reqs]
+            qo_indptr = self.qo_indptr[: 1 + num_reqs]
 
         else:
-            qo_indptr = torch.arange(0,
-                                     num_reqs + 1,
-                                     step=1,
-                                     dtype=torch.int32,
-                                     device=device)
+            qo_indptr = torch.arange(
+                0, num_reqs + 1, step=1, dtype=torch.int32, device=device
+            )
 
         attn_metadata = AiterMLADecodeMetadata(
             block_table=block_table_tensor,
@@ -164,51 +189,61 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
             paged_kv_indptr=paged_kv_indptr,
             paged_kv_indices=paged_kv_indices,
             paged_kv_last_page_len=paged_kv_last_page_len,
-            qo_indptr=qo_indptr)
+            qo_indptr=qo_indptr,
+            dcp_tot_seq_lens=dcp_tot_seq_lens_device,
+        )
 
         return attn_metadata
 
 
 class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
-
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
-        assert (num_heads == 16 or num_heads == 128), (
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **mla_args,
+        )
+        assert num_heads == 16 or num_heads == 128, (
             f"Aiter MLA only supports 16 or 128 number of heads.\n"
             f"Provided {num_heads} number of heads.\n"
-            "Try adjusting tensor_parallel_size value.")
+            "Try adjusting tensor_parallel_size value."
+        )
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
                 "Aiter MLA does not support one of the following: "
-                "alibi_slopes, sliding_window, logits_soft_cap")
+                "alibi_slopes, sliding_window, logits_soft_cap"
+            )
 
         from aiter import flash_attn_varlen_func
+
         self.flash_attn_varlen_func = flash_attn_varlen_func
 
-    def _flash_attn_varlen_diff_headdims(self,
-                                         q,
-                                         k,
-                                         v,
-                                         return_softmax_lse=False,
-                                         softmax_scale=None,
-                                         **kwargs):
+    def _flash_attn_varlen_diff_headdims(
+        self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
+    ):
         output = self.flash_attn_varlen_func(
             q=q,
             k=k,
@@ -222,11 +257,11 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: AiterMLAMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
@@ -235,21 +270,25 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
 
         assert isinstance(q, torch.Tensor)
         B = q.shape[0]
-        o = torch.zeros(B,
-                        self.num_heads,
-                        self.kv_lora_rank,
-                        dtype=q.dtype,
-                        device=q.device)
+        o = torch.zeros(
+            B, self.num_heads, self.kv_lora_rank, dtype=q.dtype, device=q.device
+        )
 
         kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
 
         # max_seqlen_qo must be 1 except for MTP
         # TODO: Find the best value for MTP
         max_seqlen_qo = 1
-        aiter_mla_decode_fwd(q, kv_buffer, o, self.scale,
-                             attn_metadata.decode.qo_indptr, max_seqlen_qo,
-                             attn_metadata.decode.paged_kv_indptr,
-                             attn_metadata.decode.paged_kv_indices,
-                             attn_metadata.decode.paged_kv_last_page_len)
+        rocm_aiter_ops.mla_decode_fwd(
+            q,
+            kv_buffer,
+            o,
+            self.scale,
+            attn_metadata.decode.qo_indptr,
+            max_seqlen_qo,
+            attn_metadata.decode.paged_kv_indptr,
+            attn_metadata.decode.paged_kv_indices,
+            attn_metadata.decode.paged_kv_last_page_len,
+        )
 
         return o, None
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 9abfedd219361375798f339cf762ff815acc43d4..5cce8682a341d96c72ee5e29089ea2e53d7c718e 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -1,26 +1,34 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
+from typing import ClassVar
 
 import torch
 
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
-                                              is_quantized_kv_cache)
+from vllm.attention.backends.abstract import (
+    AttentionLayer,
+    AttentionType,
+    is_quantized_kv_cache,
+)
 from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
-from vllm.attention.ops.triton_flash_attention import triton_attention
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.triton_utils import HAS_TRITON
-from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
-                                                   MLACommonImpl,
-                                                   MLACommonMetadata)
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.common import (
+    MLACommonBackend,
+    MLACommonImpl,
+    MLACommonMetadata,
+)
 
 logger = init_logger(__name__)
 
 
 class TritonMLABackend(MLACommonBackend):
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto"]
 
     @staticmethod
     def get_name() -> str:
@@ -30,105 +38,82 @@ class TritonMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["TritonMLAImpl"]:
         return TritonMLAImpl
 
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return True
+
 
 class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
     can_return_lse_for_decode: bool = True
 
     def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[list[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        # MLA Specific Arguments
+        **mla_args,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            **mla_args,
+        )
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
                 "TritonMLAImpl does not support one of the following: "
-                "alibi_slopes, sliding_window, logits_soft_cap")
+                "alibi_slopes, sliding_window, logits_soft_cap"
+            )
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "TritonMLAImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "TritonMLAImpl"
+            )
 
         # if is_quantized_kv_cache(self.kv_cache_dtype):
         #     raise NotImplementedError(
-        #         "TritonMLA V1 with FP8 KV cache not yet supported")
-
-        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
-        self.triton_fa_func = triton_attention if HAS_TRITON else None
-
-    def _flash_attn_varlen_diff_headdims_rocm(self,
-                                              q,
-                                              k,
-                                              v,
-                                              softmax_scale=None,
-                                              **kwargs):
-        assert self.triton_fa_func is not None
-
-        # Triton Attention requires a padded V
-        padded_v = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
-                                           value=0)
-        # The output of triton_attention is a tuple of
-        # [output_tensor, encoded_softmax] where encoded_softmax is always None
-        output_tensor, _ = self.triton_fa_func(
+        #         "TritonMLA V1 with FP8 KV cache not yet supported"
+        #     )
+
+    def _flash_attn_varlen_diff_headdims(
+        self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
+    ):
+        return super()._flash_attn_varlen_diff_headdims(
             q,
             k,
-            padded_v,
-            None,  # output
-            kwargs["cu_seqlens_q"],
-            kwargs["cu_seqlens_k"],
-            kwargs["max_seqlen_q"],
-            kwargs["max_seqlen_k"],
-            kwargs["causal"],
-            softmax_scale,
-            None,  # bias
+            v,
+            return_softmax_lse=return_softmax_lse,
+            softmax_scale=softmax_scale,
+            **kwargs,
         )
 
-        return output_tensor
-
-    def _flash_attn_varlen_diff_headdims(self,
-                                         q,
-                                         k,
-                                         v,
-                                         return_softmax_lse=False,
-                                         softmax_scale=None,
-                                         **kwargs):
-        if current_platform.is_rocm() \
-            and self.use_triton_flash_attn \
-            and not return_softmax_lse:
-            return self._flash_attn_varlen_diff_headdims_rocm(
-                q, k, v, softmax_scale=softmax_scale, **kwargs)
-        else:
-            return super()._flash_attn_varlen_diff_headdims(
-                q,
-                k,
-                v,
-                return_softmax_lse=return_softmax_lse,
-                softmax_scale=softmax_scale,
-                **kwargs)
-
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
@@ -141,13 +126,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
         assert isinstance(q, torch.Tensor)
         B = q.shape[0]
         q_num_heads = q.shape[1]
-        o = torch.zeros(B,
-                        q_num_heads,
-                        self.kv_lora_rank,
-                        dtype=q.dtype,
-                        device=q.device)
+        o = torch.zeros(
+            B, q_num_heads, self.kv_lora_rank, dtype=q.dtype, device=q.device
+        )
         lse = torch.zeros(B, q_num_heads, dtype=q.dtype, device=q.device)
-        num_kv_splits = 4  # TODO: heuristic
+
+        # For batch invariance, use only 1 split to ensure deterministic reduction
+        num_kv_splits = 1 if vllm_is_batch_invariant() else 4
 
         # TODO(lucas) Allocate ahead of time
         attn_logits = torch.empty(
@@ -165,13 +150,22 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
 
         # Add a head dim of 1
         kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.unsqueeze(2)
-        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
+        kv_c_cache = kv_c_and_k_pe_cache[..., : self.kv_lora_rank]
         PAGE_SIZE = kv_c_and_k_pe_cache.size(1)
 
         # Run MQA
-        decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o, lse,
-                             attn_metadata.decode.block_table,
-                             attn_metadata.decode.seq_lens, attn_logits,
-                             num_kv_splits, self.scale, PAGE_SIZE)
+        decode_attention_fwd(
+            q,
+            kv_c_and_k_pe_cache,
+            kv_c_cache,
+            o,
+            lse,
+            attn_metadata.decode.block_table,
+            attn_metadata.decode.seq_lens,
+            attn_logits,
+            num_kv_splits,
+            self.scale,
+            PAGE_SIZE,
+        )
 
         return o, lse
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 7ac1a063f56534195c13878a19fa942f5057b355..525026bac5a7eb2849dbfe425a8c132295e12be4 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -2,15 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionLayer,
+    AttentionType,
+)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv, next_power_of_2
+from vllm.utils.math_utils import cdiv, next_power_of_2
 
 logger = init_logger(__name__)
 
@@ -31,7 +34,7 @@ TPU_STR_DTYPE_TO_TORCH_DTYPE = {
 }
 
 try:
-    import tpu_commons  # noqa: F401
+    import tpu_inference  # noqa: F401
 except ImportError:
     # Lazy import torch_xla
     import torch_xla.core.xla_builder as xb
@@ -41,49 +44,62 @@ except ImportError:
     from torch_xla.experimental.custom_kernel import XLA_LIB
 
     @requires_jax
-    def kv_cache_update_op_impl(kv: torch.Tensor, slot_mapping: torch.Tensor,
-                                kv_cache: torch.Tensor,
-                                num_kv_update_slices: torch.Tensor,
-                                page_size: int, num_slices_per_block: int):
+    def kv_cache_update_op_impl(
+        kv: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache: torch.Tensor,
+        num_kv_update_slices: torch.Tensor,
+        page_size: int,
+        num_slices_per_block: int,
+    ):
         from vllm.attention.ops.pallas_kv_cache_update import kv_cache_update
+
         new_kv_cache = xb.call_jax(
             kv_cache_update,
-            (kv, slot_mapping, kv_cache, num_kv_update_slices), {
-                "page_size": page_size,
-                "num_slices_per_block": num_slices_per_block
-            })
+            (kv, slot_mapping, kv_cache, num_kv_update_slices),
+            {"page_size": page_size, "num_slices_per_block": num_slices_per_block},
+        )
         return new_kv_cache
 
-
     XLA_LIB.define(
-        "kv_cache_update_op(Tensor kv, Tensor slot_mapping," \
-        "Tensor kv_cache, Tensor num_kv_update_slices, int page_size," \
-        "int num_slices_per_block)" \
-        "-> Tensor", )
+        "kv_cache_update_op(Tensor kv, Tensor slot_mapping,"
+        "Tensor kv_cache, Tensor num_kv_update_slices, int page_size,"
+        "int num_slices_per_block)"
+        "-> Tensor",
+    )
 
     @impl(XLA_LIB, "kv_cache_update_op", "XLA")
-    def kv_cache_update_op_xla(kv: torch.Tensor, slot_mapping: torch.Tensor,
-                               kv_cache: torch.Tensor,
-                               num_kv_update_slices: torch.Tensor,
-                               page_size: int,
-                               num_slices_per_block: int) -> torch.Tensor:
-        new_kv_cache = kv_cache_update_op_impl(kv, slot_mapping, kv_cache,
-                                               num_kv_update_slices, page_size,
-                                               num_slices_per_block)
+    def kv_cache_update_op_xla(
+        kv: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache: torch.Tensor,
+        num_kv_update_slices: torch.Tensor,
+        page_size: int,
+        num_slices_per_block: int,
+    ) -> torch.Tensor:
+        new_kv_cache = kv_cache_update_op_impl(
+            kv,
+            slot_mapping,
+            kv_cache,
+            num_kv_update_slices,
+            page_size,
+            num_slices_per_block,
+        )
         return new_kv_cache
 
     @impl(XLA_LIB, "kv_cache_update_op", "CompositeExplicitAutograd")
-    def kv_cache_update_op_non_xla(kv: torch.Tensor,
-                                   slot_mapping: torch.Tensor,
-                                   kv_cache: torch.Tensor,
-                                   num_kv_update_slices: torch.Tensor,
-                                   page_size: int,
-                                   num_slices_per_block: int) -> torch.Tensor:
+    def kv_cache_update_op_non_xla(
+        kv: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache: torch.Tensor,
+        num_kv_update_slices: torch.Tensor,
+        page_size: int,
+        num_slices_per_block: int,
+    ) -> torch.Tensor:
         return kv_cache
 
 
 class PallasAttentionBackend(AttentionBackend):
-
     @staticmethod
     def get_name() -> str:
         return "PALLAS"
@@ -92,10 +108,6 @@ class PallasAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["PallasAttentionBackendImpl"]:
         return PallasAttentionBackendImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["PallasMetadata"]:
-        return PallasMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -104,8 +116,9 @@ class PallasAttentionBackend(AttentionBackend):
         head_size: int,
         cache_dtype_str: str = "auto",
     ) -> tuple[int, ...]:
-        padded_head_size = cdiv(
-            head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+        padded_head_size = (
+            cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+        )
         return (num_blocks, block_size, num_kv_heads * 2, padded_head_size)
 
     @staticmethod
@@ -122,10 +135,12 @@ class PallasAttentionBackend(AttentionBackend):
     # we simply make sure that the size is smaller than half of SMEM capacity.
     @staticmethod
     def get_min_page_size(vllm_config: VllmConfig) -> int:
-        max_num_page_per_req = (1024 * 1024 // 2 //
-                                vllm_config.scheduler_config.max_num_seqs // 4)
-        min_page_size = cdiv(vllm_config.model_config.max_model_len,
-                             max_num_page_per_req)
+        max_num_page_per_req = (
+            1024 * 1024 // 2 // vllm_config.scheduler_config.max_num_seqs // 4
+        )
+        min_page_size = cdiv(
+            vllm_config.model_config.max_model_len, max_num_page_per_req
+        )
         min_page_size = 1 << (min_page_size - 1).bit_length()
         return min_page_size
 
@@ -146,8 +161,7 @@ class PallasAttentionBackend(AttentionBackend):
         # handle VREG spills.
         if vllm_config.model_config.max_model_len > 8192:
             return 16
-        page_size = next_power_of_2(
-            vllm_config.model_config.max_model_len) // 16
+        page_size = next_power_of_2(vllm_config.model_config.max_model_len) // 16
         if page_size <= 16:
             return 16
         if page_size >= 256:
@@ -176,19 +190,18 @@ class PallasMetadata:
 
 
 class PallasAttentionBackendImpl(AttentionImpl):
-
     def __init__(
         self,
         num_heads: int,
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
+        kv_sharing_target_layer_name: int | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -203,15 +216,18 @@ class PallasAttentionBackendImpl(AttentionImpl):
             raise NotImplementedError("Alibi slopes is not supported.")
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "PallasAttentionBackendImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "PallasAttentionBackendImpl"
+            )
 
         self.kv_cache_quantized_dtype = None
         if kv_cache_dtype != "auto":
             self.kv_cache_quantized_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE.get(
-                kv_cache_dtype.lower().strip())
+                kv_cache_dtype.lower().strip()
+            )
 
     def forward(
         self,
@@ -221,9 +237,9 @@ class PallasAttentionBackendImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: PallasMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
 
@@ -240,7 +256,8 @@ class PallasAttentionBackendImpl(AttentionImpl):
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
                 "fused output quantization is not yet supported"
-                " for PallasAttentionBackendImpl")
+                " for PallasAttentionBackendImpl"
+            )
 
         # For determine_available_memory case.
         if kv_cache.numel() == 0:
@@ -253,15 +270,18 @@ class PallasAttentionBackendImpl(AttentionImpl):
         key = key.view(-1, self.num_kv_heads, self.head_size)
         value = value.view(-1, self.num_kv_heads, self.head_size)
         if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0:
-            padded_head_size = cdiv(
-                self.head_size,
-                TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+            padded_head_size = (
+                cdiv(self.head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+            )
             query = torch.nn.functional.pad(
-                query, (0, padded_head_size - self.head_size), value=0.0)
+                query, (0, padded_head_size - self.head_size), value=0.0
+            )
             key = torch.nn.functional.pad(
-                key, (0, padded_head_size - self.head_size), value=0.0)
+                key, (0, padded_head_size - self.head_size), value=0.0
+            )
             value = torch.nn.functional.pad(
-                value, (0, padded_head_size - self.head_size), value=0.0)
+                value, (0, padded_head_size - self.head_size), value=0.0
+            )
 
         if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0:
             # Write input keys and values to the KV cache.
@@ -280,9 +300,9 @@ class PallasAttentionBackendImpl(AttentionImpl):
             )
 
         if self.kv_cache_quantized_dtype is not None and (
-                layer._k_scale_float == 0.0 or layer._v_scale_float == 0.0):
-            raise ValueError(
-                "k_scale_float and v_scale_float must be non-zero")
+            layer._k_scale_float == 0.0 or layer._v_scale_float == 0.0
+        ):
+            raise ValueError("k_scale_float and v_scale_float must be non-zero")
         output = torch.ops.xla.ragged_paged_attention(
             query,
             kv_cache,
@@ -305,7 +325,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
         )
 
         if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0:
-            output = output[:, :, :self.head_size]
+            output = output[:, :, : self.head_size]
 
         return output.reshape(num_tokens, hidden_size)
 
@@ -317,11 +337,11 @@ def write_to_kv_cache(
     slot_mapping: torch.Tensor,
     num_slices_per_kv_cache_update_block: int,
     num_kv_update_slices: torch.Tensor,
-    kv_cache_quantized_dtype: Optional[torch.dtype] = None,
+    kv_cache_quantized_dtype: torch.dtype | None = None,
     k_scale: float = 1.0,
     v_scale: float = 1.0,
 ) -> None:
-    """ Write the key and values to the KV cache.
+    """Write the key and values to the KV cache.
 
     Args:
         key: shape = [num_tokens, num_kv_heads, head_size]
@@ -330,8 +350,7 @@ def write_to_kv_cache(
         num_slices_per_kv_cache_update_block: int
     """
     _, page_size, num_combined_kv_heads, head_size = kv_cache.shape
-    head_size = cdiv(head_size,
-                     TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+    head_size = cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
 
     if kv_cache_quantized_dtype is not None:
         dtype_info = torch.finfo(kv_cache_quantized_dtype)
@@ -343,15 +362,19 @@ def write_to_kv_cache(
         value = torch.clamp(value, dtype_info.min, dtype_info.max)
         value = value.to(kv_cache_quantized_dtype)
 
-    kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads,
-                                                  head_size)
+    kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads, head_size)
 
     torch.ops.xla.dynamo_set_buffer_donor_(kv_cache, True)
 
     kv_cache = kv_cache.flatten(0, 1)
     new_kv_cache = torch.ops.xla.kv_cache_update_op(
-        kv, slot_mapping, kv_cache, num_kv_update_slices, page_size,
-        num_slices_per_kv_cache_update_block)
+        kv,
+        slot_mapping,
+        kv_cache,
+        num_kv_update_slices,
+        page_size,
+        num_slices_per_kv_cache_update_block,
+    )
     # NOTE: the in-place copy will be optimized away by XLA compiler.
     kv_cache.copy_(new_kv_cache)
 
@@ -389,15 +412,18 @@ def get_dtype_packing(dtype):
     if 32 % bits != 0:
         raise ValueError(
             f"The bit width must be divisible by 32, but got bits={bits}, "
-            "dtype={dtype}")
+            "dtype={dtype}"
+        )
     return 32 // bits
 
 
-def get_page_size_bytes(block_size: int, num_kv_heads: int, head_size: int,
-                        kv_cache_dtype: torch.dtype) -> int:
+def get_page_size_bytes(
+    block_size: int, num_kv_heads: int, head_size: int, kv_cache_dtype: torch.dtype
+) -> int:
     """Returns the size in bytes of one page of the KV cache."""
-    padded_head_size = cdiv(head_size,
-                            TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+    padded_head_size = (
+        cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+    )
     num_combined_kv_heads = num_kv_heads * 2
 
     # NOTE: for the implicit padding in XLA
@@ -405,5 +431,6 @@ def get_page_size_bytes(block_size: int, num_kv_heads: int, head_size: int,
     num_combined_kv_heads = cdiv(num_combined_kv_heads, packing) * packing
 
     kv_cache_dtype_bits = dtype_bits(kv_cache_dtype)
-    return (block_size * num_combined_kv_heads * padded_head_size *
-            kv_cache_dtype_bits // 8)
+    return (
+        block_size * num_combined_kv_heads * padded_head_size * kv_cache_dtype_bits // 8
+    )
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index ed63c7b1bda6b487415c50100d6819a697f2227a..ea611848b0e81fe963768a6e9bd370dbfa4271c6 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -1,207 +1,214 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with AiterFlashAttention."""
+
 from dataclasses import dataclass
-from typing import Optional
+from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
+from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.utils.math_utils import cdiv
+from vllm.utils.platform_utils import get_cu_count
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    split_decodes_prefills_and_extends,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 _PARTITION_SIZE_ROCM = 256
+_CP_TOKENS_PER_ITER_ROCM = 32 * 1024
 
 if current_platform.is_rocm():
     import aiter
 
     from vllm.triton_utils import tl, triton
-    from vllm.utils import direct_register_custom_op
+
+    def block_size(x, head_dim):
+        return min(65536 // x.element_size(), triton.next_power_of_2(head_dim))
+
+    def num_programs(total_tokens):
+        return min(total_tokens, get_cu_count())
 
     @triton.jit
-    def _vllm_layout_trans_kernel(
-        k_buffer_ptr,
-        v_buffer_ptr,
-        k_values_ptr,
-        v_values_ptr,
-        b_query_lens_loc,
-        b_seq_lens_loc,
-        block_table,
-        block_table_stride_0,
-        k_scale,
-        v_scale,
-        output_dtype: tl.constexpr,
-        E_DIM: tl.constexpr,
+    def cp_mha_gather_cache_kernel(
+        key_cache_ptr,  # [num_blocks, page_size, num_head, head_size]
+        value_cache_ptr,  # [num_blocks, page_size, num_head, head_size]
+        key_ptr,  # [num_tokens, num_heads, head_size]
+        value_ptr,  # [num_tokens, num_heads, head_size]
+        block_table_ptr,  # [num_batches, max_block_num]
+        cu_seqlens_kv_ptr,  # [num_batches + 1]
+        token_to_batch_ptr,  # [max_cum_tokens]
+        seq_start_ptr,  # [num_batches]
+        k_scale_ptr,
+        v_scale_ptr,
+        num_heads,
+        head_size,
+        x,
+        max_block_num,
+        num_tokens,
+        num_programs,
+        DEQUANT: tl.constexpr,
+        PAGE_SIZE: tl.constexpr,
+        CACHE_FORMAT: tl.constexpr,
         BLOCK_SIZE: tl.constexpr,
     ):
-        batch_idx = tl.program_id(0)
-        block_idx = tl.program_id(1)
-
-        batch_query_indexes = tl.load(b_query_lens_loc + batch_idx +
-                                      tl.arange(0, 2))
-        batch_query_start, batch_query_end = tl.split(batch_query_indexes)
-        query_len = batch_query_end - batch_query_start
-
-        if query_len <= 1:
-            return
-
-        batch_token_indexes = tl.load(b_seq_lens_loc + batch_idx +
-                                      tl.arange(0, 2))
-        batch_token_start, batch_token_end = tl.split(batch_token_indexes)
-        seq_len = batch_token_end - batch_token_start
-
-        if block_idx * BLOCK_SIZE < seq_len:
-            block_mask = (block_idx * BLOCK_SIZE +
-                          tl.arange(0, BLOCK_SIZE)[:, None]) < seq_len
-
-            kv_idx = tl.load(block_table + batch_idx * block_table_stride_0 +
-                             block_idx).to(tl.int64)
-
-            kv_buffer_off = kv_idx * BLOCK_SIZE * E_DIM + tl.arange(
-                0, BLOCK_SIZE)[:, None] * E_DIM + tl.arange(0, E_DIM)[None, :]
-            k_vals = tl.load(k_buffer_ptr + kv_buffer_off,
-                             mask=block_mask,
-                             other=0.0)
-            if k_vals.dtype.is_fp8():
-                k_vals = (k_vals.to(tl.float32) *
-                          tl.load(k_scale)).to(output_dtype)
-            else:
-                k_vals = k_vals.to(output_dtype)
-
-            v_vals = tl.load(v_buffer_ptr + kv_buffer_off,
-                             mask=block_mask,
-                             other=0.0)
-            if v_vals.dtype.is_fp8():
-                v_vals = (v_vals.to(tl.float32) *
-                          tl.load(v_scale)).to(output_dtype)
-            else:
-                v_vals = v_vals.to(output_dtype)
-            kv_values_off = batch_token_start * E_DIM + \
-                block_idx * BLOCK_SIZE * E_DIM + \
-                tl.arange(0, BLOCK_SIZE)[:, None] * E_DIM + \
-                tl.arange(0, E_DIM)[None, :]
-            tl.store(k_values_ptr + kv_values_off, k_vals, mask=block_mask)
-            tl.store(v_values_ptr + kv_values_off, v_vals, mask=block_mask)
-
-    def vllm_layout_trans(b_query_lens_loc, b_seq_lens_loc, block_table,
-                          k_cache, v_cache, max_seq_len, k_scale, v_scale,
-                          output_dtype, total_tokens):
-        H_KV = v_cache.shape[2]
-        D = v_cache.shape[3]
-        BLOCK_SIZE = v_cache.shape[1]
-
-        k_values = torch.empty(
-            (total_tokens, H_KV, D),
-            dtype=output_dtype,
-            device=k_cache.device,
+        bid = tl.program_id(0)
+        col_offsets = tl.arange(0, BLOCK_SIZE)
+        if DEQUANT:
+            k_scale = tl.load(k_scale_ptr)
+            v_scale = tl.load(v_scale_ptr)
+
+        for token_id in tl.range(bid, num_tokens, num_programs):
+            key_ptr_offset = key_ptr + token_id * head_size * num_heads
+            value_ptr_offset = value_ptr + token_id * head_size * num_heads
+            batch_idx = tl.load(token_to_batch_ptr + token_id)
+            batch_start = tl.load(seq_start_ptr + batch_idx)
+            token_start = tl.load(cu_seqlens_kv_ptr + batch_idx)
+            batch_offset = token_id - token_start + batch_start
+            block_offset = batch_offset // PAGE_SIZE
+            block_id = tl.load(
+                block_table_ptr + max_block_num * batch_idx + block_offset
+            )
+            slot_id = batch_offset % PAGE_SIZE
+
+            if CACHE_FORMAT == "NHD":
+                # for kv cache layout as
+                # K: [num_blocks, page_size, num_head, head_dim]
+                # V: [num_blocks, page_size, num_head, head_dim]
+                key_cache_ptr_offset = (
+                    key_cache_ptr
+                    + block_id * num_heads * head_size * PAGE_SIZE
+                    + slot_id * num_heads * head_size
+                )
+                value_cache_ptr_offset = (
+                    value_cache_ptr
+                    + block_id * num_heads * head_size * PAGE_SIZE
+                    + slot_id * num_heads * head_size
+                )
+
+                for i in tl.range(0, head_size * num_heads, BLOCK_SIZE):
+                    mask = (col_offsets + i) < head_size * num_heads
+                    k_reg = tl.load(key_cache_ptr_offset + col_offsets + i, mask=mask)
+                    v_reg = tl.load(value_cache_ptr_offset + col_offsets + i, mask=mask)
+                    if DEQUANT:
+                        k_dtype = k_reg.dtype
+                        v_dtype = v_reg.dtype
+                        k_reg = (k_reg.to(tl.float32) * k_scale).to(k_dtype)
+                        v_reg = (v_reg.to(tl.float32) * v_scale).to(v_dtype)
+                    tl.store(key_ptr_offset + col_offsets + i, k_reg, mask=mask)
+                    tl.store(value_ptr_offset + col_offsets + i, v_reg, mask=mask)
+
+    def cp_mha_gather_cache(
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        block_tables: torch.Tensor,
+        k_scales: torch.Tensor,
+        v_scales: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        token_to_batch: torch.Tensor,
+        seq_starts: torch.Tensor,
+        dequant: bool,
+        kv_cache_layout: str,
+        total_tokens: int,
+    ):
+        assert kv_cache_layout in ["v0", "NHD", "HND"], (
+            "kv_cache_layout only support v0, NHD, HND"
+        )
+        head_dim = key.shape[2]
+        x = 0
+        # assert dequant is True, "Currently, we only support "\
+        # "gather cache with dequant"
+        # For k cache layout: [num_blocks, num_heads, page_size, head_dim]
+        assert kv_cache_layout == "NHD", (
+            "ROCM_AITER_FA_BACKEND Only support NHD kv cache layout for now"
+        )
+        assert head_dim == key_cache.shape[3], (
+            "We assume your kv cache layout is [num_blocks, "
+            "page_size, num_heads, head_dim], but got otherwise"
         )
-        v_values = torch.empty(
-            (total_tokens, H_KV, D),
-            dtype=output_dtype,
-            device=v_cache.device,
+        page_size = key_cache.shape[1]
+        num_heads = key_cache.shape[2]
+
+        NUM_PRGMS = num_programs(total_tokens)
+        BLOCK_SIZE = block_size(key_cache, head_dim)
+        grid = lambda meta: (NUM_PRGMS,)
+        cp_mha_gather_cache_kernel[grid](
+            key_cache,
+            value_cache,
+            key,
+            value,
+            block_tables,
+            cu_seqlens_kv,
+            token_to_batch,
+            seq_starts,
+            k_scales,
+            v_scales,
+            num_heads,
+            head_dim,
+            x,
+            block_tables.size(1),
+            total_tokens,
+            NUM_PRGMS,
+            DEQUANT=dequant,
+            PAGE_SIZE=page_size,
+            CACHE_FORMAT=kv_cache_layout,
+            BLOCK_SIZE=BLOCK_SIZE,
         )
 
-        grid = (block_table.shape[0],
-                (max_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
 
-        if output_dtype == torch.float16:
-            output_dtype = tl.float16
-        elif output_dtype == torch.bfloat16:
-            output_dtype = tl.bfloat16
-        else:
-            raise ValueError(f"Unsupported output dtype: {output_dtype}")
-
-        _vllm_layout_trans_kernel[grid](k_cache,
-                                        v_cache,
-                                        k_values,
-                                        v_values,
-                                        b_query_lens_loc,
-                                        b_seq_lens_loc,
-                                        block_table,
-                                        block_table.stride(0),
-                                        k_scale,
-                                        v_scale,
-                                        output_dtype=output_dtype,
-                                        E_DIM=H_KV * D,
-                                        BLOCK_SIZE=BLOCK_SIZE)
-
-        return k_values, v_values
-
-    def flash_attn_varlen_func_impl(
-        q: torch.Tensor,
-        k_cache: torch.Tensor,
-        v_cache: torch.Tensor,
-        out: torch.Tensor,
-        cu_seqlens_q: torch.Tensor,
-        cu_seqlens_k: torch.Tensor,
-        max_seqlen_q: int,
-        max_seqlen_k: int,
-        softmax_scale: float,
-        window_size: Optional[list[int]],  # -1 means infinite context window
-        alibi_slopes: Optional[list[float]],
-        block_table: torch.Tensor,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        total_tokens: int = 0,
-    ) -> torch.Tensor:
-        if total_tokens == 0:
-            total_tokens = int(cu_seqlens_k[-1].item())
-        k, v = vllm_layout_trans(cu_seqlens_q, cu_seqlens_k, block_table,
-                                 k_cache, v_cache, max_seqlen_k, k_scale,
-                                 v_scale, q.dtype, total_tokens)
-
-        output = aiter.flash_attn_varlen_func(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=cu_seqlens_q,
-            max_seqlen_q=max_seqlen_q,
-            min_seqlen_q=1,
-            cu_seqlens_k=cu_seqlens_k,
-            max_seqlen_k=max_seqlen_k,
-            softmax_scale=softmax_scale,
-            causal=True,
-            alibi_slopes=alibi_slopes,
-            window_size=window_size,
-            out=out,
-        )
-        return output
+logger = init_logger(__name__)
 
-    def flash_attn_varlen_func_fake(
-        q: torch.Tensor,
-        k_cache: torch.Tensor,
-        v_cache: torch.Tensor,
-        out: torch.Tensor,
-        cu_seqlens_q: torch.Tensor,
-        cu_seqlens_k: torch.Tensor,
-        max_seqlen_q: int,
-        max_seqlen_k: int,
-        softmax_scale: float,
-        window_size: Optional[list[int]],  # -1 means infinite context window
-        alibi_slopes: Optional[list[float]],
-        block_table: torch.Tensor,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        total_tokens: int = 0,
-    ) -> torch.Tensor:
-        return torch.empty(q.shape[0],
-                           q.shape[1],
-                           v_cache.shape[-2],
-                           dtype=q.dtype,
-                           device=q.device)
 
-    direct_register_custom_op("flash_attn_varlen_func",
-                              flash_attn_varlen_func_impl, ["out"],
-                              flash_attn_varlen_func_fake,
-                              dispatch_key=current_platform.dispatch_key)
+@dataclass
+class AiterFlashAttentionDecodeMetadata:
+    max_query_len: int
+    min_query_len: int
+    max_seq_len: int
+    query_start_loc: torch.Tensor
 
-logger = init_logger(__name__)
+
+@dataclass
+class AiterFlashAttentionPrefillMetadata:
+    max_query_len: int
+    min_query_len: int
+    max_seq_len: int
+    query_start_loc: torch.Tensor
+
+
+@dataclass
+class AiterChunkContextMetadata:
+    workspace: torch.Tensor
+    cu_seq_lens_chunk: torch.Tensor
+    chunk_starts: torch.Tensor
+    token_to_batch: torch.Tensor
+    seq_tot: list[int]
+    max_seq_lens: list[int]
+    seq_lens: torch.Tensor
+    num_chunks: int
+    total_token_per_batch: list[int]
+
+
+@dataclass
+class AiterFlashAttentionChunkPrefillMetadata:
+    max_query_len: int
+    min_query_len: int
+    max_seq_len: int
+    query_start_loc: torch.Tensor
+    chunk_context_metadata: AiterChunkContextMetadata
 
 
 @dataclass
@@ -222,7 +229,18 @@ class AiterFlashAttentionMetadata:
     seq_lens: torch.Tensor
     slot_mapping: torch.Tensor
     block_table: torch.Tensor
-    cu_seq_lens: Optional[torch.Tensor]
+
+    # prefill and deocde split
+    num_decodes: int
+    num_decode_tokens: int
+    num_prefills: int
+    num_prefill_tokens: int
+    num_extends: int
+    num_extend_tokens: int
+
+    decode_metadata: AiterFlashAttentionDecodeMetadata | None
+    prefill_metadata: AiterFlashAttentionPrefillMetadata | None
+    extend_metadata: AiterFlashAttentionChunkPrefillMetadata | None
 
     # For cascade attention.
     use_cascade: bool
@@ -231,11 +249,18 @@ class AiterFlashAttentionMetadata:
 
 
 class AiterFlashAttentionMetadataBuilder(
-        AttentionMetadataBuilder[AiterFlashAttentionMetadata]):
-    cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    AttentionMetadataBuilder[AiterFlashAttentionMetadata]
+):
+    _cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    reorder_batch_threshold: int = 1
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
         self.model_config = vllm_config.model_config
@@ -243,68 +268,172 @@ class AiterFlashAttentionMetadataBuilder(
         self.cache_config = vllm_config.cache_config
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
-            self.parallel_config)
-        self.num_heads_kv = self.model_config.get_num_kv_heads(
-            self.parallel_config)
+            self.parallel_config
+        )
+        self.num_heads_kv = self.model_config.get_num_kv_heads(self.parallel_config)
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
-        self.aot_sliding_window: Optional[tuple[int, int]] = None
+        self.aot_sliding_window: tuple[int, int] | None = None
         self.total_tokens: int = 0
 
+        self.extend_workspace = torch.empty(
+            [2, _CP_TOKENS_PER_ITER_ROCM, self.num_heads_kv, self.headdim],
+            dtype=self.model_config.dtype,
+            device=device,
+        )
+
     def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata):
-        self.total_tokens = self.model_config.max_model_len \
+        self, common_attn_metadata: CommonAttentionMetadata
+    ):
+        self.total_tokens = (
+            self.model_config.max_model_len
             * self.vllm_config.scheduler_config.max_num_partial_prefills
-        res = self.build(common_prefix_len=0,
-                         common_attn_metadata=common_attn_metadata)
+        )
+        res = self.build(common_prefix_len=0, common_attn_metadata=common_attn_metadata)
         self.total_tokens = 0
         return res
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> 'AiterFlashAttentionMetadata':
-
-        num_actual_tokens = common_attn_metadata.num_actual_tokens
-        max_query_len = common_attn_metadata.max_query_len
-        max_seq_len = common_attn_metadata.max_seq_len
-        query_start_loc = common_attn_metadata.query_start_loc
-        seq_lens = common_attn_metadata.seq_lens
-        block_table_tensor = common_attn_metadata.block_table_tensor
-        slot_mapping = common_attn_metadata.slot_mapping
-        if max_query_len > 1:
-            # We pre-compute cumulative seq len needed for prefill attention
-            # here to avoid recomputing it for every layer
-            cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
-                                      dtype=torch.int32,
-                                      device=seq_lens.device)
-            torch.cumsum(seq_lens,
-                         dim=0,
-                         dtype=cu_seq_lens.dtype,
-                         out=cu_seq_lens[1:])
-            num_actual_kv_tokens = int(cu_seq_lens[-1].item())
-        else:
-            cu_seq_lens = None
-            num_actual_kv_tokens = 0
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> "AiterFlashAttentionMetadata":
+        split_ret = split_decodes_prefills_and_extends(
+            common_attn_metadata,
+            decode_threshold=self.reorder_batch_threshold,
+        )
+
+        (
+            num_decodes,
+            num_extends,
+            num_prefills,
+            num_decode_tokens,
+            num_extend_tokens,
+            num_prefill_tokens,
+        ) = split_ret
+
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+
+        seq_lens = common_attn_metadata.seq_lens_cpu
+
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+
+        decode_metadata = None
+        if num_decodes > 0:
+            decode_metadata = AiterFlashAttentionDecodeMetadata(
+                max_query_len=query_lens_cpu[:num_decodes].max().item(),
+                min_query_len=query_lens_cpu[:num_decodes].min().item(),
+                max_seq_len=seq_lens[:num_decodes].max().item(),
+                query_start_loc=common_attn_metadata.query_start_loc[: num_decodes + 1],
+            )
+
+        prefill_metadata = None
+        if num_prefills > 0:
+            query_lens_for_prefill = query_lens_cpu[num_decodes + num_extends :]
+            query_start_loc_device = common_attn_metadata.query_start_loc[
+                num_decodes + num_extends :
+            ]
+            prefill_metadata = AiterFlashAttentionPrefillMetadata(
+                max_query_len=query_lens_for_prefill.max().item(),
+                min_query_len=query_lens_for_prefill.min().item(),
+                max_seq_len=seq_lens[num_decodes + num_extends :].max().item(),
+                query_start_loc=query_start_loc_device - query_start_loc_device[0],
+            )
+
+        extend_metadata = None
+        if num_extends > 0:
+            num_extends_slice = slice(num_decodes, num_decodes + num_extends)
+            query_lens_for_extend = query_lens_cpu[num_extends_slice]
+            seq_lens_for_extend = common_attn_metadata.seq_lens_cpu[num_extends_slice]
+            computed_kv_lens = seq_lens_for_extend - query_lens_for_extend
+
+            # allocate the equal amount of workspace for
+            # each chunk prefill request
+            max_context_chunk = _CP_TOKENS_PER_ITER_ROCM // num_extends
+            num_chunks = cdiv(computed_kv_lens.max().item(), max_context_chunk)
+
+            chunk_starts = (
+                torch.arange(num_chunks, dtype=torch.int32)
+                .unsqueeze(1)
+                .expand(-1, num_extends)
+                * max_context_chunk
+            )
+            chunk_ends = torch.min(
+                computed_kv_lens.unsqueeze(0), chunk_starts + max_context_chunk
+            )
+            chunk_seq_lens = (chunk_ends - chunk_starts).clamp(
+                min=0
+            )  # [num_chunks, num_extends]
+            cu_seq_lens_cpu = torch.zeros(
+                [num_chunks, num_extends + 1], dtype=torch.int32, pin_memory=True
+            )
+            torch.cumsum(
+                chunk_seq_lens, dim=1, out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32
+            )
+            max_cum_tokens = cu_seq_lens_cpu[:, -1].max().item()
+
+            range_idx = torch.arange(max_cum_tokens, dtype=torch.int32)[None, None, :]
+            idx_to_batch_tensor = range_idx == cu_seq_lens_cpu[:, 1:][:, :, None]
+            idx_to_batch_tensor = idx_to_batch_tensor.sum(
+                dim=1
+            )  # [num_chunks, max_cum_tokens]
+            token_to_batch_tensor = torch.cumsum(idx_to_batch_tensor, dim=1)
+
+            chunk_context_metadata = AiterChunkContextMetadata(
+                workspace=self.extend_workspace,
+                cu_seq_lens_chunk=cu_seq_lens_cpu.to(self.device, non_blocking=True),
+                chunk_starts=chunk_starts.to(self.device, non_blocking=True),
+                seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
+                max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
+                seq_lens=chunk_seq_lens,
+                token_to_batch=token_to_batch_tensor.to(self.device, non_blocking=True),
+                num_chunks=num_chunks,
+                total_token_per_batch=cu_seq_lens_cpu[:, -1].tolist(),
+            )
+
+            query_start_loc_device = common_attn_metadata.query_start_loc[
+                num_decodes : num_decodes + num_extends + 1
+            ]
+            seq_lens_device = common_attn_metadata.seq_lens[num_extends_slice]
+            cu_seq_lens = torch.zeros(
+                num_extends + 1, dtype=torch.int32, device=seq_lens_device.device
+            )
+            torch.cumsum(
+                seq_lens_device, dim=0, dtype=cu_seq_lens.dtype, out=cu_seq_lens[1:]
+            )
+            extend_metadata = AiterFlashAttentionChunkPrefillMetadata(
+                max_query_len=query_lens_for_extend.max().item(),
+                min_query_len=query_lens_for_extend.min().item(),
+                max_seq_len=seq_lens[num_extends_slice].max().item(),
+                query_start_loc=query_start_loc_device - query_start_loc_device[0],
+                chunk_context_metadata=chunk_context_metadata,
+            )
 
-        def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
-                     max_seq_len, causal):
-            return None
+        num_actual_kv_tokens = torch.sum(seq_lens).item()
 
         use_cascade = common_prefix_len > 0
 
         attn_metadata = AiterFlashAttentionMetadata(
-            num_actual_tokens=num_actual_tokens,
+            num_actual_tokens=common_attn_metadata.num_actual_tokens,
             num_actual_kv_tokens=num_actual_kv_tokens,
-            max_query_len=max_query_len,
-            query_start_loc=query_start_loc,
-            max_seq_len=max_seq_len,
-            seq_lens=seq_lens,
-            block_table=block_table_tensor,
-            slot_mapping=slot_mapping,
-            cu_seq_lens=cu_seq_lens,
+            max_query_len=common_attn_metadata.max_query_len,
+            query_start_loc=common_attn_metadata.query_start_loc,
+            max_seq_len=common_attn_metadata.max_seq_len,
+            seq_lens=common_attn_metadata.seq_lens,
+            block_table=common_attn_metadata.block_table_tensor,
+            slot_mapping=common_attn_metadata.slot_mapping,
+            num_decodes=num_decodes,
+            num_decode_tokens=num_decode_tokens,
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_extends=num_extends,
+            num_extend_tokens=num_extend_tokens,
+            decode_metadata=decode_metadata,
+            prefill_metadata=prefill_metadata,
+            extend_metadata=extend_metadata,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,
             total_tokens=self.total_tokens,
@@ -316,28 +445,14 @@ class AiterFlashAttentionMetadataBuilder(
 
 
 class AiterFlashAttentionBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
         return [64, 128, 256]
 
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
-
     @staticmethod
     def get_name() -> str:
         return "FLASH_ATTN"
@@ -346,10 +461,6 @@ class AiterFlashAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["AiterFlashAttentionImpl"]:
         return AiterFlashAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return AiterFlashAttentionMetadata
-
     @staticmethod
     def get_builder_cls() -> type["AiterFlashAttentionMetadataBuilder"]:
         return AiterFlashAttentionMetadataBuilder
@@ -364,23 +475,23 @@ class AiterFlashAttentionBackend(AttentionBackend):
     ) -> tuple[int, ...]:
         if block_size % 16 != 0:
             raise ValueError("Block size must be a multiple of 16.")
+
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
 
 class AiterFlashAttentionImpl(AttentionImpl):
-
     def __init__(
         self,
         num_heads: int,
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
+        kv_sharing_target_layer_name: int | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -396,20 +507,124 @@ class AiterFlashAttentionImpl(AttentionImpl):
         self.kv_cache_dtype = kv_cache_dtype
         if logits_soft_cap is None:
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
-            logits_soft_cap = 0.
+            logits_soft_cap = 0.0
         self.logits_soft_cap = logits_soft_cap
         self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        AiterFlashAttentionBackend.validate_head_size(head_size)
-
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashAttentionImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "FlashAttentionImpl"
+            )
+
+    def extend_forward(
+        self,
+        attn_metadata: AiterFlashAttentionMetadata,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        output: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        min_seqlen_q: int,
+        block_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        k_scale: float,
+        v_scale: float,
+    ):
+        out, lse = aiter.flash_attn_varlen_func(
+            q=query,
+            k=key,
+            v=value,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_q,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_q,
+            min_seqlen_q=min_seqlen_q,
+            dropout_p=0.0,
+            softmax_scale=self.scale,
+            causal=True,
+            window_size=self.sliding_window,
+            alibi_slopes=self.alibi_slopes,
+            return_lse=True,
+        )
+        assert attn_metadata.extend_metadata is not None
+        chunk_context_metadata = attn_metadata.extend_metadata.chunk_context_metadata
+        num_chunks = chunk_context_metadata.num_chunks
+        workspace = chunk_context_metadata.workspace
+        cu_seqlens_kv = chunk_context_metadata.cu_seq_lens_chunk
+        max_seqlens = chunk_context_metadata.max_seq_lens
+        chunk_starts = chunk_context_metadata.chunk_starts
+        token_to_batch = chunk_context_metadata.token_to_batch
+        total_token_per_batch = chunk_context_metadata.total_token_per_batch
+        key_fetched, value_fetched = workspace[0], workspace[1]
+        chunked_output = None
+        chunked_lse = None
+        for chunk_idx in range(num_chunks):
+            cp_mha_gather_cache(
+                key_cache=key_cache,
+                value_cache=value_cache,
+                key=key_fetched,
+                value=value_fetched,
+                block_tables=block_table,
+                k_scales=k_scale,
+                v_scales=v_scale,
+                cu_seqlens_kv=cu_seqlens_kv[chunk_idx],
+                token_to_batch=token_to_batch[chunk_idx],
+                seq_starts=chunk_starts[chunk_idx],
+                dequant=False,
+                kv_cache_layout="NHD",
+                total_tokens=total_token_per_batch[chunk_idx],
+            )
+
+            suf_out, suf_lse = aiter.flash_attn_varlen_func(
+                q=query,
+                k=key_fetched,
+                v=value_fetched,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_kv[chunk_idx],
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlens[chunk_idx],
+                min_seqlen_q=min_seqlen_q,
+                dropout_p=0.0,
+                softmax_scale=self.scale,
+                causal=False,
+                window_size=self.sliding_window,
+                alibi_slopes=self.alibi_slopes,
+                return_lse=True,
+            )
+            if chunked_output is None:
+                chunked_output = suf_out
+                chunked_lse = suf_lse
+            else:
+                tmp_output = torch.empty_like(out)
+                tmp_lse = torch.empty_like(lse)
+                merge_attn_states(
+                    output=tmp_output,
+                    output_lse=tmp_lse,
+                    prefix_output=chunked_output,
+                    prefix_lse=chunked_lse,
+                    suffix_output=suf_out,
+                    suffix_lse=suf_lse,
+                )
+                chunked_output = tmp_output
+                chunked_lse = tmp_lse
+
+        merge_attn_states(
+            output=output,
+            prefix_output=chunked_output,
+            prefix_lse=chunked_lse,
+            suffix_output=out,
+            suffix_lse=lse,
+        )
 
     def forward(
         self,
@@ -419,9 +634,9 @@ class AiterFlashAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AiterFlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with AiterFlashAttention.
 
@@ -442,32 +657,33 @@ class AiterFlashAttentionImpl(AttentionImpl):
 
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for FlashAttentionImpl")
+                "fused output quantization is not yet supported for FlashAttentionImpl"
+            )
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         # IMPORTANT!
-        # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
-        # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
-        # in this method. For example, `view` and `slice` (or `[:n]`) operations
-        # are surprisingly slow even in the case they do not invoke any GPU ops.
+        # NOTE(woosuk): With piece-wise CUDA graphs, this method is
+        # executed in eager-mode PyTorch. Thus, we need to be careful
+        # about any CPU overhead in this method. For example, `view`
+        # and `slice` (or `[:n]`) operations are surprisingly slow even
+        # in the case they do not invoke any GPU ops.
         # Minimize the PyTorch ops in this method as much as possible.
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
-
         num_actual_tokens = attn_metadata.num_actual_tokens
         key_cache, value_cache = kv_cache.unbind(0)
         if self.kv_sharing_target_layer_name is None:
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
-            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-            # not padded. However, we don't need to do key[:num_actual_tokens]
-            # and value[:num_actual_tokens] because the reshape_and_cache_flash
-            # op uses the slot_mapping's shape to determine the number of
-            # actual tokens.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping
+            # is not padded. However, we don't need to do
+            # key[:num_actual_tokens] and value[:num_actual_tokens] because
+            # the reshape_and_cache_flash op uses the slot_mapping's shape
+            # to determine the number of actual tokens.
+
             torch.ops._C_cache_ops.reshape_and_cache_flash(
                 key,
                 value,
@@ -483,67 +699,118 @@ class AiterFlashAttentionImpl(AttentionImpl):
             key_cache = key_cache.view(current_platform.fp8_dtype())
             value_cache = value_cache.view(current_platform.fp8_dtype())
 
+        # decode:extend:prefill
+        query = query[:num_actual_tokens]
+        key = key[:num_actual_tokens]
+        value = value[:num_actual_tokens]
+
+        output_actual_tokens = output[:num_actual_tokens]
+
+        num_decodes = attn_metadata.num_decodes
+        num_prefills = attn_metadata.num_prefills
+        num_extends = attn_metadata.num_extends
+
+        num_decode_tokens = attn_metadata.num_decode_tokens
+        num_extend_tokens = attn_metadata.num_extend_tokens
         if not attn_metadata.use_cascade:
-            cu_seqlens_q = attn_metadata.query_start_loc
-            seqused_k = attn_metadata.seq_lens
-            max_seqlen_q = attn_metadata.max_query_len
-            max_seqlen_k = attn_metadata.max_seq_len
-            block_table = attn_metadata.block_table
-
-            if max_seqlen_q > 1:
-                torch.ops.vllm.flash_attn_varlen_func(
-                    query[:num_actual_tokens],
-                    key_cache,
-                    value_cache,
-                    out=output[:num_actual_tokens],
-                    cu_seqlens_q=cu_seqlens_q,
-                    max_seqlen_q=max_seqlen_q,
-                    max_seqlen_k=max_seqlen_k,
+            # calculate for pure prefills
+            if num_prefills > 0:
+                assert attn_metadata.prefill_metadata is not None
+
+                prefill_query = query[num_decode_tokens + num_extend_tokens :]
+                prefill_key = key[num_decode_tokens + num_extend_tokens :]
+                prefill_value = value[num_decode_tokens + num_extend_tokens :]
+
+                aiter.flash_attn_varlen_func(
+                    q=prefill_query,
+                    k=prefill_key,
+                    v=prefill_value,
+                    cu_seqlens_q=attn_metadata.prefill_metadata.query_start_loc,
+                    cu_seqlens_k=attn_metadata.prefill_metadata.query_start_loc,
+                    max_seqlen_q=attn_metadata.prefill_metadata.max_query_len,
+                    max_seqlen_k=attn_metadata.prefill_metadata.max_seq_len,
+                    min_seqlen_q=1,
+                    dropout_p=0.0,
                     softmax_scale=self.scale,
-                    alibi_slopes=self.alibi_slopes,
+                    causal=True,
                     window_size=self.sliding_window,
-                    block_table=block_table,
-                    cu_seqlens_k=attn_metadata.cu_seq_lens,
+                    alibi_slopes=self.alibi_slopes,
+                    out=output_actual_tokens[num_decode_tokens + num_extend_tokens :],
+                )
+
+            # calculate for extends
+            if num_extends > 0:
+                assert attn_metadata.extend_metadata is not None
+                extend_tokens_slice = slice(
+                    num_decode_tokens, num_decode_tokens + num_extend_tokens
+                )
+                extend_querys = query[extend_tokens_slice]
+                extend_keys = key[extend_tokens_slice]
+                extend_values = value[extend_tokens_slice]
+                extend_outputs = output[extend_tokens_slice]
+                self.extend_forward(
+                    attn_metadata=attn_metadata,
+                    query=extend_querys,
+                    key=extend_keys,
+                    value=extend_values,
+                    key_cache=key_cache,
+                    value_cache=value_cache,
+                    output=extend_outputs,
+                    cu_seqlens_q=attn_metadata.extend_metadata.query_start_loc,
+                    max_seqlen_q=attn_metadata.extend_metadata.max_query_len,
+                    max_seqlen_k=attn_metadata.extend_metadata.max_seq_len,
+                    min_seqlen_q=1,
+                    block_table=attn_metadata.block_table[
+                        num_decodes : num_decodes + num_extends
+                    ],
+                    slot_mapping=attn_metadata.slot_mapping[
+                        num_decodes : num_decodes + num_extends
+                    ],
                     k_scale=layer._k_scale,
                     v_scale=layer._v_scale,
-                    total_tokens=attn_metadata.num_actual_kv_tokens,
                 )
 
-            _, num_heads, head_size = query.shape
-            nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
-            num_seqs = seqused_k.shape[0]
-            max_num_partitions = (max_seqlen_k + _PARTITION_SIZE_ROCM -
-                                  1) // _PARTITION_SIZE_ROCM
-
-            workspace_buffer = torch.empty(
-                (num_seqs * num_heads * max_num_partitions * head_size) *
-                nbytes_per_qo_elem + 2 *
-                (num_seqs * num_heads * max_num_partitions) * 4,
-                dtype=torch.uint8,
-                device=output.device,
-            )
+            # calculate for decodes
+            if num_decodes > 0:
+                assert attn_metadata.decode_metadata is not None
+                _, num_heads, head_size = query.shape
+                nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
+                num_seqs = attn_metadata.seq_lens.shape[0]
+                max_num_partitions = (
+                    attn_metadata.max_seq_len + _PARTITION_SIZE_ROCM - 1
+                ) // _PARTITION_SIZE_ROCM
+
+                workspace_buffer = torch.empty(
+                    (num_seqs * num_heads * max_num_partitions * head_size)
+                    * nbytes_per_qo_elem
+                    + 2 * (num_seqs * num_heads * max_num_partitions) * 4,
+                    dtype=torch.uint8,
+                    device=output.device,
+                )
 
-            torch.ops.aiter.paged_attention_v1(
-                output[:num_actual_tokens],
-                workspace_buffer,
-                query[:num_actual_tokens],
-                key_cache,
-                value_cache,
-                self.scale,
-                block_table,
-                cu_seqlens_q,
-                seqused_k,
-                max_seqlen_k,
-                self.alibi_slopes,
-                self.kv_cache_dtype,
-                "NHD",
-                self.logits_soft_cap,
-                layer._k_scale,
-                layer._v_scale,
-                None,
-                _PARTITION_SIZE_ROCM,
-            )
-            return output
+                torch.ops.aiter.paged_attention_v1(
+                    output[:num_decode_tokens],
+                    workspace_buffer,
+                    query[:num_decode_tokens],
+                    key_cache,
+                    value_cache,
+                    self.scale,
+                    attn_metadata.block_table[:num_decodes],
+                    attn_metadata.query_start_loc[:num_decodes],
+                    attn_metadata.seq_lens[:num_decodes],
+                    attn_metadata.max_seq_len,
+                    self.alibi_slopes,
+                    self.kv_cache_dtype,
+                    "NHD",
+                    self.logits_soft_cap,
+                    layer._k_scale,
+                    layer._v_scale,
+                    None,
+                    _PARTITION_SIZE_ROCM,
+                )
         else:
             raise NotImplementedError(
-                "Cascade attention is not implemented for ROCM AITER")
+                "Cascade attention is not implemented for ROCM AITER"
+            )
+
+        return output
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2639c0df04129d8d125006142f2234d21e7983b
--- /dev/null
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -0,0 +1,196 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Attention layer with PagedAttention and Triton prefix prefill."""
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.attention.backends.abstract import AttentionType
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    kFp8StaticTensorSym,
+)
+from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
+from vllm.v1.attention.backends.rocm_attn import (
+    RocmAttentionBackend,
+    RocmAttentionImpl,
+    RocmAttentionMetadataBuilder,
+)
+
+logger = init_logger(__name__)
+
+
+class RocmAiterUnifiedAttentionBackend(RocmAttentionBackend):
+    accept_output_buffer: bool = True
+
+    @staticmethod
+    def get_name() -> str:
+        return "ROCM_AITER_UNIFIED_ATTN"
+
+    @staticmethod
+    def get_impl_cls() -> type["RocmAiterUnifiedAttentionImpl"]:
+        return RocmAiterUnifiedAttentionImpl
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+        cache_dtype_str: str = "auto",
+    ) -> tuple[int, ...]:
+        if block_size % 16 != 0:
+            raise ValueError("Block size must be a multiple of 16.")
+        return (2, num_blocks, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def use_cascade_attention(*args, **kwargs) -> bool:
+        return False
+
+    @staticmethod
+    def get_builder_cls() -> type["RocmAttentionMetadataBuilder"]:
+        return RocmAttentionMetadataBuilder
+
+
+class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl):
+    def fused_output_quant_supported(self, quant_key: QuantKey):
+        return quant_key == kFp8StaticTensorSym
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
+    ) -> None:
+        super().__init__(
+            num_heads,
+            head_size,
+            scale,
+            num_kv_heads,
+            alibi_slopes,
+            sliding_window,
+            kv_cache_dtype,
+            logits_soft_cap,
+            attn_type,
+            kv_sharing_target_layer_name,
+            sinks,
+        )
+        logger.info_once(
+            "Using aiter unified attention for RocmAiterUnifiedAttentionImpl"
+        )
+        from aiter.ops.triton.unified_attention import unified_attention
+
+        self.unified_attention = unified_attention
+
+    def forward(
+        self,
+        layer: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: FlashAttentionMetadata,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Forward pass with FlashAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads, head_size]
+            key: shape = [num_tokens, num_kv_heads, head_size]
+            value: shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+        assert output is not None, "Output tensor must be provided."
+
+        if output_block_scale is not None:
+            raise NotImplementedError(
+                "fused block_scale output quantization is not yet supported"
+                " for RocmAttentionImpl"
+            )
+
+        if attn_metadata is None:
+            # Profiling run.
+            return output.fill_(0)
+
+        assert attn_metadata.use_cascade is False
+
+        # IMPORTANT!
+        # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
+        # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
+        # in this method. For example, `view` and `slice` (or `[:n]`) operations
+        # are surprisingly slow even in the case they do not invoke any GPU ops.
+        # Minimize the PyTorch ops in this method as much as possible.
+        # Whenever making a change in this method, please benchmark the
+        # performance to make sure it does not introduce any overhead.
+
+        num_actual_tokens = attn_metadata.num_actual_tokens
+
+        key_cache, value_cache = kv_cache.unbind(0)
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
+
+        if self.kv_cache_dtype.startswith("fp8"):
+            key_cache = key_cache.view(self.fp8_dtype)
+            value_cache = value_cache.view(self.fp8_dtype)
+            assert layer._q_scale_float == 1.0, (
+                "A non 1.0 q_scale is not currently supported."
+            )
+
+        cu_seqlens_q = attn_metadata.query_start_loc
+        seqused_k = attn_metadata.seq_lens
+        max_seqlen_q = attn_metadata.max_query_len
+        max_seqlen_k = attn_metadata.max_seq_len
+        block_table = attn_metadata.block_table
+
+        descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])
+
+        self.unified_attention(
+            q=query[:num_actual_tokens],
+            k=key_cache,
+            v=value_cache,
+            out=output[:num_actual_tokens],
+            cu_seqlens_q=cu_seqlens_q,
+            max_seqlen_q=max_seqlen_q,
+            seqused_k=seqused_k,
+            max_seqlen_k=max_seqlen_k,
+            softmax_scale=self.scale,
+            causal=True,
+            alibi_slopes=self.alibi_slopes,
+            window_size=self.sliding_window,
+            block_table=block_table,
+            softcap=self.logits_soft_cap,
+            q_descale=None,  # Not supported
+            k_descale=layer._k_scale.expand(descale_shape),
+            v_descale=layer._v_scale.expand(descale_shape),
+            sinks=self.sinks,
+            output_scale=output_scale,
+        )
+
+        return output
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index e973be16d7790f9fbd46ec40e30132f76e3d2f6c..6dfdfc19ccba18c82853a6bb831f9b43d27ac5f8 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -1,28 +1,32 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with PagedAttention and Triton prefix prefill."""
+
 from dataclasses import dataclass
-from functools import cache
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import torch
 
-from vllm import _custom_ops as ops
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType)
-from vllm.attention.ops.chunked_prefill_paged_decode import (
-    chunked_prefill_paged_decode)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+)
+from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym)
+    QuantKey,
+    kFp8StaticTensorSym,
+)
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
@@ -49,30 +53,34 @@ class RocmAttentionMetadata:
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Optional aot scheduling
-    scheduler_metadata: Optional[torch.Tensor] = None
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
+    prefix_scheduler_metadata: torch.Tensor | None = None
 
 
-class RocmAttentionMetadataBuilder(
-        AttentionMetadataBuilder[RocmAttentionMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS
+class RocmAttentionMetadataBuilder(AttentionMetadataBuilder[RocmAttentionMetadata]):
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
         self.block_size = kv_cache_spec.block_size
 
         model_config = vllm_config.model_config
         self.num_heads_q = model_config.get_num_attention_heads(
-            vllm_config.parallel_config)
-        self.num_heads_kv = model_config.get_num_kv_heads(
-            vllm_config.parallel_config)
+            vllm_config.parallel_config
+        )
+        self.num_heads_kv = model_config.get_num_kv_heads(vllm_config.parallel_config)
         self.headdim = model_config.get_head_size()
 
     def build_for_cudagraph_capture(
@@ -83,12 +91,21 @@ class RocmAttentionMetadataBuilder(
         # max_model_len will cause graph capture to be extremely
         # slow, so here we set it to 1.
         attn_metadata.seq_lens.fill_(1)
+
+        # Here we set the query start locs to 0. This is to
+        # cover up an invalid memory access in the prefix_prefil kernel
+        # that we run into during graph capture (#25985)
+        common_attn_metadata.query_start_loc.zero_()
+        common_attn_metadata.query_start_loc_cpu.zero_()
+
         return attn_metadata
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> RocmAttentionMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> RocmAttentionMetadata:
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
 
@@ -101,14 +118,13 @@ class RocmAttentionMetadataBuilder(
         use_cascade = common_prefix_len > 0
 
         if use_cascade:
-            cu_prefix_query_lens = torch.tensor([0, num_actual_tokens],
-                                                dtype=torch.int32,
-                                                device=self.device)
-            prefix_kv_lens = torch.tensor([common_prefix_len],
-                                          dtype=torch.int32,
-                                          device=self.device)
-            suffix_kv_lens = (common_attn_metadata.seq_lens_cpu -
-                              common_prefix_len)
+            cu_prefix_query_lens = torch.tensor(
+                [0, num_actual_tokens], dtype=torch.int32, device=self.device
+            )
+            prefix_kv_lens = torch.tensor(
+                [common_prefix_len], dtype=torch.int32, device=self.device
+            )
+            suffix_kv_lens = common_attn_metadata.seq_lens_cpu - common_prefix_len
             suffix_kv_lens = suffix_kv_lens.to(self.device)
         else:
             cu_prefix_query_lens = None
@@ -135,12 +151,8 @@ class RocmAttentionMetadataBuilder(
 
 
 class RocmAttentionBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -148,14 +160,14 @@ class RocmAttentionBackend(AttentionBackend):
 
     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
+        if not cls.supports_head_size(head_size):
             attn_type = cls.__name__.removesuffix("Backend")
             raise ValueError(
                 f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
+                f"Supported head sizes are: {cls.get_supported_head_sizes()}. "
                 "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
+                "FlexAttention backend which supports all head sizes."
+            )
 
     @staticmethod
     def get_name() -> str:
@@ -165,16 +177,13 @@ class RocmAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["RocmAttentionImpl"]:
         return RocmAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return RocmAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
         block_size: int,
         num_kv_heads: int,
         head_size: int,
+        cache_dtype_str: str = "auto",
     ) -> tuple[int, ...]:
         if block_size % 16 != 0:
             raise ValueError("Block size must be a multiple of 16.")
@@ -189,17 +198,7 @@ class RocmAttentionBackend(AttentionBackend):
         return RocmAttentionMetadataBuilder
 
 
-@cache
-def use_aiter_unified_attention() -> bool:
-    """Check if aiter unified attention should be used."""
-    # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set
-    # to 1 as default
-    return envs.VLLM_ROCM_USE_AITER \
-        and envs.VLLM_USE_AITER_UNIFIED_ATTENTION
-
-
 class RocmAttentionImpl(AttentionImpl):
-
     def fused_output_quant_supported(self, quant_key: QuantKey):
         return quant_key == kFp8StaticTensorSym
 
@@ -209,13 +208,13 @@ class RocmAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -240,37 +239,22 @@ class RocmAttentionImpl(AttentionImpl):
         RocmAttentionBackend.validate_head_size(head_size)
 
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "RocmAttentionImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "RocmAttentionImpl"
+            )
 
         self.fp8_dtype = current_platform.fp8_dtype()
-        self.force_prefill_decode_attn = \
-            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
-
-        if not self.force_prefill_decode_attn:
-            # If not using prefill decode attention, we use the Triton
-            # unified attention implementation.
-            if use_aiter_unified_attention():
-                logger.info_once(
-                    "Using aiter unified attention for RocmAttentionImpl")
-                from aiter.ops.triton.unified_attention import (
-                    unified_attention)
-                self.unified_attention = unified_attention
-            else:
-                logger.info_once(
-                    "Using vllm unified attention for RocmAttentionImpl")
-                from vllm.attention.ops.triton_unified_attention import (
-                    unified_attention)
-                self.unified_attention = unified_attention
 
         self.sinks = sinks
         if sinks is not None:
             assert sinks.shape[0] == num_heads, (
                 "Sinks must have the same number of heads as the number of "
                 f"heads in the layer. Sinks shape: {sinks.shape}, "
-                f"num_heads: {num_heads}.")
+                f"num_heads: {num_heads}."
+            )
 
     def forward(
         self,
@@ -280,9 +264,9 @@ class RocmAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
@@ -301,11 +285,12 @@ class RocmAttentionImpl(AttentionImpl):
         if output_block_scale is not None:
             raise NotImplementedError(
                 "fused block_scale output quantization is not yet supported"
-                " for RocmAttentionImpl")
+                " for RocmAttentionImpl"
+            )
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         assert attn_metadata.use_cascade is False
 
@@ -318,56 +303,32 @@ class RocmAttentionImpl(AttentionImpl):
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        use_prefill_decode_attn = self.force_prefill_decode_attn
         num_actual_tokens = attn_metadata.num_actual_tokens
 
-        if use_prefill_decode_attn:
-            key_cache, value_cache = PagedAttention.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size)
-        else:
-            key_cache, value_cache = kv_cache.unbind(0)
+        key_cache, value_cache = PagedAttention.split_kv_cache(
+            kv_cache, self.num_kv_heads, self.head_size
+        )
 
         if self.kv_sharing_target_layer_name is None:
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
-            if use_prefill_decode_attn:
-                PagedAttention.write_to_paged_cache(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
-            else:
-                ops.reshape_and_cache_flash(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
+            PagedAttention.write_to_paged_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(self.fp8_dtype)
             value_cache = value_cache.view(self.fp8_dtype)
-            num_tokens, num_heads, head_size = query.shape
-            assert layer._q_scale_float == 1.0, \
+            assert layer._q_scale_float == 1.0, (
                 "A non 1.0 q_scale is not currently supported."
-            if current_platform.is_cuda():
-                # Skip Q quantization on ROCm and XPU, enable this on cuda
-                # only, since dequantizing back to f32 in the attention kernel
-                # is not supported.
-                query, _ = ops.scaled_fp8_quant(
-                    query.reshape(
-                        (num_tokens, num_heads * head_size)).contiguous(),
-                    layer._q_scale)
-                query = query.reshape((num_tokens, num_heads, head_size))
+            )
 
         cu_seqlens_q = attn_metadata.query_start_loc
         seqused_k = attn_metadata.seq_lens
@@ -375,52 +336,27 @@ class RocmAttentionImpl(AttentionImpl):
         max_seqlen_k = attn_metadata.max_seq_len
         block_table = attn_metadata.block_table
 
-        if use_prefill_decode_attn:
-            # Compute attention and update output up to `num_actual_tokens`.
-            chunked_prefill_paged_decode(
-                query=query[:num_actual_tokens],
-                key=key[:num_actual_tokens],
-                value=value[:num_actual_tokens],
-                output=output[:num_actual_tokens],
-                kv_cache_dtype=self.kv_cache_dtype,
-                key_cache=key_cache,
-                value_cache=value_cache,
-                block_table=block_table,
-                query_start_loc=cu_seqlens_q,
-                seq_lens=seqused_k,
-                max_seq_len=max_seqlen_k,
-                max_query_len=max_seqlen_q,
-                k_scale=layer._k_scale,
-                v_scale=layer._v_scale,
-                alibi_slopes=self.alibi_slopes,
-                sliding_window=self.sliding_window[0],
-                sm_scale=self.scale,
-                output_scale=output_scale,
-                sinks=self.sinks,
-            )
-
-        else:
-            descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])
-
-            self.unified_attention(
-                q=query[:num_actual_tokens],
-                k=key_cache,
-                v=value_cache,
-                out=output[:num_actual_tokens],
-                cu_seqlens_q=cu_seqlens_q,
-                max_seqlen_q=max_seqlen_q,
-                seqused_k=seqused_k,
-                max_seqlen_k=max_seqlen_k,
-                softmax_scale=self.scale,
-                causal=True,
-                alibi_slopes=self.alibi_slopes,
-                window_size=self.sliding_window,
-                block_table=block_table,
-                softcap=self.logits_soft_cap,
-                q_descale=None,  # Not supported
-                k_descale=layer._k_scale.expand(descale_shape),
-                v_descale=layer._v_scale.expand(descale_shape),
-                sinks=self.sinks,
-                output_scale=output_scale)
+        # Compute attention and update output up to `num_actual_tokens`.
+        chunked_prefill_paged_decode(
+            query=query[:num_actual_tokens],
+            key=key[:num_actual_tokens],
+            value=value[:num_actual_tokens],
+            output=output[:num_actual_tokens],
+            kv_cache_dtype=self.kv_cache_dtype,
+            key_cache=key_cache,
+            value_cache=value_cache,
+            block_table=block_table,
+            query_start_loc=cu_seqlens_q,
+            seq_lens=seqused_k,
+            max_seq_len=max_seqlen_k,
+            max_query_len=max_seqlen_q,
+            k_scale=layer._k_scale,
+            v_scale=layer._v_scale,
+            alibi_slopes=self.alibi_slopes,
+            sliding_window=self.sliding_window[0],
+            sm_scale=self.scale,
+            output_scale=output_scale,
+            sinks=self.sinks,
+        )
 
         return output
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index df7f0d2310ab434518130f5610dbb727b6b77175..de0cb73db091755961ecd53843f8cf9b2cf01e0b 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -1,21 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
-from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
-                                              compute_causal_conv1d_metadata,
-                                              split_decodes_and_prefills)
-from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
+from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
+from vllm.v1.attention.backends.utils import (
+    PAD_SLOT_ID,
+    CommonAttentionMetadata,
+    compute_causal_conv1d_metadata,
+    split_decodes_and_prefills,
+)
 
 
 class ShortConvAttentionBackend(AttentionBackend):
-
     @staticmethod
     def get_builder_cls() -> type["ShortConvAttentionMetadataBuilder"]:
         return ShortConvAttentionMetadataBuilder
@@ -29,32 +28,26 @@ class ShortConvAttentionMetadata:
     num_decode_tokens: int
 
     query_start_loc: torch.Tensor
-    has_initial_states: torch.Tensor
-    state_indices_tensor: torch.Tensor  # shape: [batch,]
+    state_indices_tensor: torch.Tensor
+    has_initial_states_p: torch.Tensor | None
 
     # For causal_conv1d
-    nums_dict: Optional[dict] = None
-    batch_ptr: Optional[torch.Tensor] = None
-    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
 
 class ShortConvAttentionMetadataBuilder(
-        AttentionMetadataBuilder[ShortConvAttentionMetadata]):
-
-    reorder_batch_threshold: int = 1
-
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
-        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
-        assert isinstance(kv_cache_spec, MambaSpec)
-
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> ShortConvAttentionMetadata:
+    BaseMambaAttentionMetadataBuilder[ShortConvAttentionMetadata]
+):
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> ShortConvAttentionMetadata:
         num_reqs = common_attn_metadata.num_reqs
         query_start_loc = common_attn_metadata.query_start_loc
-
         state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
 
         # for causal_conv1d
@@ -62,31 +55,49 @@ class ShortConvAttentionMetadataBuilder(
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold))
-        has_initial_states = None
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
+
+        has_initial_states_p = None
         if num_prefills > 0:
-            #[batch,]
             has_initial_states_cpu = (
-                common_attn_metadata.
-                num_computed_tokens_cpu[num_reqs - num_prefills:num_reqs] > 0)
-            has_initial_states = has_initial_states_cpu.to(
-                query_start_loc.device)
-
-            query_start_loc_p = common_attn_metadata.query_start_loc[
-                -num_prefills - 1:] - num_decode_tokens
-
-            nums_dict, batch_ptr, token_chunk_offset_ptr = \
+                common_attn_metadata.num_computed_tokens_cpu[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                > 0
+            )
+            has_initial_states_p = has_initial_states_cpu.to(query_start_loc.device)
+
+            query_start_loc_p = (
+                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
+                - num_decode_tokens
+            )
+
+            nums_dict, batch_ptr, token_chunk_offset_ptr = (
                 compute_causal_conv1d_metadata(query_start_loc_p)
+            )
+
+        elif (
+            num_decodes > 0
+            and num_decodes <= self.decode_cudagraph_max_bs
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
+            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes)
+            self.state_indices_tensor[:num_decodes].copy_(
+                state_indices_tensor, non_blocking=True
+            )
+            state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
+            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
         attn_metadata = ShortConvAttentionMetadata(
+            query_start_loc=query_start_loc,
+            state_indices_tensor=state_indices_tensor,
+            has_initial_states_p=has_initial_states_p,
             num_prefills=num_prefills,
             num_prefill_tokens=num_prefill_tokens,
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
-            query_start_loc=query_start_loc,
-            has_initial_states=has_initial_states,
-            state_indices_tensor=state_indices_tensor,
             nums_dict=nums_dict,
             batch_ptr=batch_ptr,
             token_chunk_offset_ptr=token_chunk_offset_ptr,
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index 583756129a2912492851c52db66e7af326ac61cd..1bf38ed225a4c2abe5d11ac67fec686970c92edc 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -4,52 +4,39 @@
 
 import ast
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import ClassVar, Optional
 
 import torch
 
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType)
+from vllm import _custom_ops as ops
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder, CommonAttentionMetadata,
-    reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills)
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
-from vllm import _custom_ops as ops
-
 logger = init_logger(__name__)
 
 
 class TreeAttentionBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
         return [32, 64, 96, 128, 160, 192, 224, 256]
 
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
-
     @staticmethod
     def get_name() -> str:
         return "TREE_ATTN"
@@ -58,10 +45,6 @@ class TreeAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["TreeAttentionImpl"]:
         return TreeAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return TreeAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -98,7 +81,7 @@ class TreeAttentionMetadata:
     num_prefills: int = 0
     num_decodes: int = 0
 
-    tree_attn_bias: Optional[torch.Tensor] = None
+    tree_attn_bias: torch.Tensor | None = None
 
     # Cached Prefill/decode metadata.
     _cached_prefill_metadata: Optional["TreeAttentionMetadata"] = None
@@ -114,9 +97,9 @@ class TreeAttentionMetadata:
             # metadata structure
             return self._cached_prefill_metadata
 
-        q_start_loc = self.query_start_loc[self.num_decodes:]
+        q_start_loc = self.query_start_loc[self.num_decodes :]
         q_seqlens = torch.diff(q_start_loc)
-        kv_seqlens = self.seq_lens[self.num_decodes:]
+        kv_seqlens = self.seq_lens[self.num_decodes :]
         # Construct & cache prefill-phase attention metadata structure
         self._cached_prefill_metadata = TreeAttentionMetadata(
             num_actual_tokens=self.num_prefill_tokens,
@@ -124,8 +107,8 @@ class TreeAttentionMetadata:
             query_start_loc=q_start_loc - q_start_loc[0],
             max_seq_len=int(kv_seqlens.max().item()),
             seq_lens=kv_seqlens,
-            block_table=self.block_table[self.num_decodes:],
-            slot_mapping=self.slot_mapping[self.num_decode_tokens:],
+            block_table=self.block_table[self.num_decodes :],
+            slot_mapping=self.slot_mapping[self.num_decode_tokens :],
         )
         return self._cached_prefill_metadata
 
@@ -139,9 +122,9 @@ class TreeAttentionMetadata:
             # metadata structure
             return self._cached_decode_metadata
 
-        q_start_loc = self.query_start_loc[:self.num_decodes + 1]
+        q_start_loc = self.query_start_loc[: self.num_decodes + 1]
         q_seqlens = torch.diff(q_start_loc)
-        kv_seqlens = self.seq_lens[:self.num_decodes]
+        kv_seqlens = self.seq_lens[: self.num_decodes]
         # Construct & cache decode-phase attention metadata structure
         self._cached_decode_metadata = TreeAttentionMetadata(
             num_actual_tokens=self.num_decode_tokens,
@@ -149,16 +132,14 @@ class TreeAttentionMetadata:
             query_start_loc=q_start_loc,
             max_seq_len=int(kv_seqlens.max().item()),
             seq_lens=kv_seqlens,
-            block_table=self.block_table[:self.num_decodes],
-            slot_mapping=self.slot_mapping[:self.num_decode_tokens],
+            block_table=self.block_table[: self.num_decodes],
+            slot_mapping=self.slot_mapping[: self.num_decode_tokens],
             tree_attn_bias=self.tree_attn_bias,
         )
         return self._cached_decode_metadata
 
 
-class TreeAttentionMetadataBuilder(
-        AttentionMetadataBuilder[TreeAttentionMetadata]):
-
+class TreeAttentionMetadataBuilder(AttentionMetadataBuilder[TreeAttentionMetadata]):
     def __init__(
         self,
         kv_cache_spec: AttentionSpec,
@@ -172,10 +153,9 @@ class TreeAttentionMetadataBuilder(
 
         spec_config = vllm_config.speculative_config
         spec_token_tree = (spec := spec_config) and spec.speculative_token_tree
-        tree_choices: list[tuple[int,
-                                 ...]] = (ast.literal_eval(spec_token_tree)
-                                          if spec_token_tree is not None else
-                                          [(0, )])
+        tree_choices: list[tuple[int, ...]] = (
+            ast.literal_eval(spec_token_tree) if spec_token_tree is not None else [(0,)]
+        )
         # Construct the tree attention bias.
         depth_counts = _get_depth_counts(tree_choices)
         self.tree_attn_bias = _prepare_tree_attn_bias(
@@ -185,12 +165,7 @@ class TreeAttentionMetadataBuilder(
             device=device,
         )
 
-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(
-            input_batch,
-            scheduler_output,
-            decode_threshold=self.tree_attn_bias.shape[0])
+        self.reorder_batch_threshold = self.tree_attn_bias.shape[0]
 
     def build(
         self,
@@ -200,8 +175,10 @@ class TreeAttentionMetadataBuilder(
     ) -> TreeAttentionMetadata:
         decode_threshold = self.tree_attn_bias.shape[0]
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=decode_threshold))
+            split_decodes_and_prefills(
+                common_attn_metadata, decode_threshold=decode_threshold
+            )
+        )
 
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         q_start_loc = common_attn_metadata.query_start_loc
@@ -241,8 +218,7 @@ class TreeAttentionMetadataBuilder(
             # Slice the tree attention bias for drafting. Exclude
             # the root level.
             start, end = 1, 1 + common_attn_metadata.max_query_len
-            self.tree_attn_bias = self.tree_attn_bias[start:end,
-                                                      start:end].contiguous()
+            self.tree_attn_bias = self.tree_attn_bias[start:end, start:end].contiguous()
 
         # Build attention bias.
         attn_metadata = self.build(0, common_attn_metadata, fast_build=True)
@@ -268,15 +244,14 @@ def _get_depth_counts(sorted_tree_choices: list[tuple[int, ...]]) -> list[int]:
 def _prepare_tree_attn_bias(
     sorted_tree_choices: list[tuple[int, ...]],
     depth_counts: list[int],
-    dtype: Optional[torch.dtype],
-    device: Optional[torch.device],
+    dtype: torch.dtype | None,
+    device: torch.device | None,
 ) -> torch.Tensor:
     # +1 comes from the additional root node.
     tree_len = len(sorted_tree_choices) + 1
-    tree_attn_mask = torch.full((tree_len, tree_len),
-                                -torch.inf,
-                                device=device,
-                                dtype=dtype)
+    tree_attn_mask = torch.full(
+        (tree_len, tree_len), -torch.inf, device=device, dtype=dtype
+    )
 
     # Set diagonal to all zeros. Each token should
     # attend to itself.
@@ -298,26 +273,26 @@ def _prepare_tree_attn_bias(
             ancestor_idx = []
             for c in range(len(cur_tree_choice) - 1):
                 ancestor_idx.append(
-                    sorted_tree_choices.index(cur_tree_choice[:c + 1]) + 1)
+                    sorted_tree_choices.index(cur_tree_choice[: c + 1]) + 1
+                )
             tree_attn_mask[j + start + 1, ancestor_idx] = mask_val
         start += depth_counts[i]
     return tree_attn_mask
 
 
 class TreeAttentionImpl(AttentionImpl):
-
     def __init__(
         self,
         num_heads: int,
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -338,13 +313,13 @@ class TreeAttentionImpl(AttentionImpl):
         else:
             self.sliding_window = (sliding_window - 1, 0)
 
-        TreeAttentionBackend.validate_head_size(head_size)
-
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "TreeAttentionImpl.")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "TreeAttentionImpl."
+            )
 
     def forward(
         self,
@@ -354,9 +329,9 @@ class TreeAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: TreeAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with TreeAttention.
 
@@ -374,12 +349,12 @@ class TreeAttentionImpl(AttentionImpl):
 
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for TreeAttentionImpl")
+                "fused output quantization is not yet supported for TreeAttentionImpl"
+            )
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         # Cache the input KVs.
         key_cache, value_cache = kv_cache.unbind(0)
@@ -404,8 +379,7 @@ class TreeAttentionImpl(AttentionImpl):
 
         num_actual_tokens = attn_metadata.num_actual_tokens
         num_decode_tokens = attn_metadata.num_decode_tokens
-        descale_shape = (attn_metadata.query_start_loc.shape[0] - 1,
-                         key.shape[1])
+        descale_shape = (attn_metadata.query_start_loc.shape[0] - 1, key.shape[1])
         if prefill_meta := attn_metadata.prefill_metadata:
             unified_attention(
                 q=query[num_decode_tokens:num_actual_tokens],
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 3983c5edc76f4f467ab7c93c987b37b191011486..889c79db18ef590fbdcf04fb3261d28af7580b31 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -1,31 +1,38 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """High-Performance Triton-only Attention layer."""
+
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import torch
 
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash)
+    triton_reshape_and_cache_flash,
+)
 from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.config import VllmConfig
+from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym)
+    QuantKey,
+    kFp8StaticTensorSym,
+)
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.utils import (
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
-if current_platform.is_cuda_alike():
-    from vllm import _custom_ops as ops
-elif current_platform.is_xpu():
-    from vllm._ipex_ops import ipex_ops as ops
-
 logger = init_logger(__name__)
 
 
@@ -50,30 +57,34 @@ class TritonAttentionMetadata:
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Optional aot scheduling
-    scheduler_metadata: Optional[torch.Tensor] = None
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
+    prefix_scheduler_metadata: torch.Tensor | None = None
 
 
-class TritonAttentionMetadataBuilder(
-        AttentionMetadataBuilder[TritonAttentionMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS
+class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMetadata]):
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS
 
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
         self.block_size = kv_cache_spec.block_size
 
         model_config = vllm_config.model_config
         self.num_heads_q = model_config.get_num_attention_heads(
-            vllm_config.parallel_config)
-        self.num_heads_kv = model_config.get_num_kv_heads(
-            vllm_config.parallel_config)
+            vllm_config.parallel_config
+        )
+        self.num_heads_kv = model_config.get_num_kv_heads(vllm_config.parallel_config)
         self.headdim = model_config.get_head_size()
 
     def build_for_cudagraph_capture(
@@ -86,10 +97,12 @@ class TritonAttentionMetadataBuilder(
         attn_metadata.seq_lens.fill_(1)
         return attn_metadata
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> TritonAttentionMetadata:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> TritonAttentionMetadata:
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
 
@@ -102,14 +115,13 @@ class TritonAttentionMetadataBuilder(
         use_cascade = common_prefix_len > 0
 
         if use_cascade:
-            cu_prefix_query_lens = torch.tensor([0, num_actual_tokens],
-                                                dtype=torch.int32,
-                                                device=self.device)
-            prefix_kv_lens = torch.tensor([common_prefix_len],
-                                          dtype=torch.int32,
-                                          device=self.device)
-            suffix_kv_lens = (common_attn_metadata.seq_lens_cpu -
-                              common_prefix_len)
+            cu_prefix_query_lens = torch.tensor(
+                [0, num_actual_tokens], dtype=torch.int32, device=self.device
+            )
+            prefix_kv_lens = torch.tensor(
+                [common_prefix_len], dtype=torch.int32, device=self.device
+            )
+            suffix_kv_lens = common_attn_metadata.seq_lens_cpu - common_prefix_len
             suffix_kv_lens = suffix_kv_lens.to(self.device)
         else:
             cu_prefix_query_lens = None
@@ -136,22 +148,19 @@ class TritonAttentionMetadataBuilder(
 
 
 class TritonAttentionBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16, torch.float32]
-
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        # Triton Attention supports any head size above 32
-        if head_size < 32:
-            raise ValueError(
-                f"Head size {head_size} is not supported by TritonAttention."
-                f"Head sizes need to be larger or equal 32 for this backend. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
+    supported_dtypes: ClassVar[list[torch.dtype]] = [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
+    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+        "fp8_e5m2",
+    ]
 
     @staticmethod
     def get_name() -> str:
@@ -161,10 +170,6 @@ class TritonAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["TritonAttentionImpl"]:
         return TritonAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return TritonAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -185,25 +190,39 @@ class TritonAttentionBackend(AttentionBackend):
     def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]:
         return TritonAttentionMetadataBuilder
 
+    @classmethod
+    def supports_head_size(cls, head_size: int) -> bool:
+        return head_size >= 32
 
-class TritonAttentionImpl(AttentionImpl):
+    @classmethod
+    def supports_sink(cls) -> bool:
+        return True
 
+    @classmethod
+    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
+        return True
+
+
+class TritonAttentionImpl(AttentionImpl):
     def fused_output_quant_supported(self, quant_key: QuantKey):
         return quant_key == kFp8StaticTensorSym
 
+    def supports_quant_query_input(self) -> bool:
+        return current_platform.is_cuda()
+
     def __init__(
         self,
         num_heads: int,
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -225,13 +244,13 @@ class TritonAttentionImpl(AttentionImpl):
 
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        TritonAttentionBackend.validate_head_size(head_size)
-
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "TritonAttentionImpl")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "TritonAttentionImpl"
+            )
 
         self.fp8_dtype = current_platform.fp8_dtype()
 
@@ -240,7 +259,8 @@ class TritonAttentionImpl(AttentionImpl):
             assert sinks.shape[0] == num_heads, (
                 "Sinks must have the same number of heads as the number of "
                 f"heads in the layer. Sinks shape: {sinks.shape}, "
-                f"num_heads: {num_heads}.")
+                f"num_heads: {num_heads}."
+            )
 
     def forward(
         self,
@@ -250,9 +270,9 @@ class TritonAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: TritonAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with Paged Attention impl. in Triton.
 
@@ -271,11 +291,12 @@ class TritonAttentionImpl(AttentionImpl):
         if output_block_scale is not None:
             raise NotImplementedError(
                 "fused block_scale output quantization is not yet supported"
-                " for TritonAttentionImpl")
+                " for TritonAttentionImpl"
+            )
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         assert attn_metadata.use_cascade is False
 
@@ -315,18 +336,9 @@ class TritonAttentionImpl(AttentionImpl):
             if key_cache.dtype != self.fp8_dtype:
                 key_cache = key_cache.view(self.fp8_dtype)
                 value_cache = value_cache.view(self.fp8_dtype)
-            num_tokens, num_heads, head_size = query.shape
-            assert layer._q_scale_float == 1.0, \
+            assert layer._q_scale_float == 1.0, (
                 "A non 1.0 q_scale is not currently supported."
-            if current_platform.is_cuda():
-                # Skip Q quantization on ROCm and XPU, enable this on cuda
-                # only, since dequantizing back to f32 in the attention kernel
-                # is not supported.
-                query, _ = ops.scaled_fp8_quant(
-                    query.reshape(
-                        (num_tokens, num_heads * head_size)).contiguous(),
-                    layer._q_scale)
-                query = query.reshape((num_tokens, num_heads, head_size))
+            )
 
         cu_seqlens_q = attn_metadata.query_start_loc
         seqused_k = attn_metadata.seq_lens
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index f37a829f401cc8d3d8bfb7ac2dee98bdd74c74bb..0dd1896331291f51b24fc35b4e23134f46a57f15 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -4,16 +4,24 @@ import abc
 import enum
 import functools
 from abc import abstractmethod
-from dataclasses import dataclass, fields, make_dataclass
-from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Literal, Optional,
-                    Protocol, TypeVar, Union, get_args)
+from dataclasses import dataclass, field, fields, make_dataclass
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Generic,
+    Literal,
+    Protocol,
+    TypeVar,
+    get_args,
+)
 
 import numpy as np
 import torch
 from typing_extensions import runtime_checkable
 
 from vllm.config import VllmConfig, get_layers_from_vllm_config
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionImpl
@@ -21,18 +29,18 @@ if TYPE_CHECKING:
     from vllm.v1.worker.gpu_input_batch import InputBatch
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata)
-from vllm.attention.layer import Attention
+from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.distributed.kv_transfer.kv_connector.utils import (
-    get_kv_connector_cache_layout)
+    get_kv_connector_cache_layout,
+)
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.ubatch_utils import UBatchSlice
 
 logger = init_logger(__name__)
 KVCacheLayoutType = Literal["NHD", "HND"]
-_KV_CACHE_LAYOUT_OVERRIDE: Union[KVCacheLayoutType, None] = None
+_KV_CACHE_LAYOUT_OVERRIDE: KVCacheLayoutType | None = None
 
 PAD_SLOT_ID = -1
 
@@ -46,7 +54,7 @@ class CommonAttentionMetadata:
     """
     Per-batch attention metadata, shared across layers and backends.
     AttentionMetadataBuilder instances use it to construct per-layer metadata.
-    
+
     For many of the tensors we keep both GPU and CPU versions.
     """
 
@@ -77,11 +85,15 @@ class CommonAttentionMetadata:
     causal: bool = True
 
     # Needed by FastPrefillAttentionBuilder
-    logits_indices_padded: Optional[torch.Tensor] = None
-    num_logits_indices: Optional[int] = None
+    logits_indices_padded: torch.Tensor | None = None
+    num_logits_indices: int | None = None
 
     # Needed by CrossAttentionBuilder
-    encoder_seq_lens: Optional[np.ndarray] = None
+    encoder_seq_lens: np.ndarray | None = None
+
+    dcp_local_seq_lens: torch.Tensor | None = None
+    dcp_local_seq_lens_cpu: torch.Tensor | None = None
+    """Sequence lengths of the local rank in decode context parallelism world"""
 
 
 def slice_query_start_locs(
@@ -89,26 +101,27 @@ def slice_query_start_locs(
     request_slice: slice,
 ) -> torch.Tensor:
     """
-    Creates a new query_start_loc that corresponds to the requests in 
+    Creates a new query_start_loc that corresponds to the requests in
     request_slice.
 
     Note: This function creates a new tensor to hold the new query_start_locs.
     This will break cudagraph compatibility.
     """
-    return query_start_loc[request_slice.start: request_slice.stop + 1] -\
-        query_start_loc[request_slice.start]
+    return (
+        query_start_loc[request_slice.start : request_slice.stop + 1]
+        - query_start_loc[request_slice.start]
+    )
 
 
 def _make_metadata_with_slice(
-        ubatch_slice: UBatchSlice,
-        attn_metadata: CommonAttentionMetadata) -> CommonAttentionMetadata:
+    ubatch_slice: UBatchSlice, attn_metadata: CommonAttentionMetadata
+) -> CommonAttentionMetadata:
     """
-    This function creates a new CommonAttentionMetadata that corresponds to 
+    This function creates a new CommonAttentionMetadata that corresponds to
     the requests included in ubatch_slice
     """
 
-    assert not ubatch_slice.is_empty(), (
-        f"Ubatch slice {ubatch_slice} is empty")
+    assert not ubatch_slice.is_empty(), f"Ubatch slice {ubatch_slice} is empty"
 
     request_slice = ubatch_slice.request_slice
     token_slice = ubatch_slice.token_slice
@@ -119,10 +132,12 @@ def _make_metadata_with_slice(
     last_req = request_slice.stop - 1
     last_tok = token_slice.stop - 1
 
-    assert start_locs[first_req] <= first_tok < start_locs[first_req + 1], \
+    assert start_locs[first_req] <= first_tok < start_locs[first_req + 1], (
         "Token slice start outside of first request"
-    assert start_locs[last_req] <= last_tok < start_locs[last_req+1], \
+    )
+    assert start_locs[last_req] <= last_tok < start_locs[last_req + 1], (
         "Token slice end outside of last request"
+    )
 
     # If the "middle" request has tokens in both ubatches, we have to split it.
     # If ubatch_slice is the first ubatch then we will be splitting the last
@@ -132,12 +147,13 @@ def _make_metadata_with_slice(
     splits_last_request = last_tok < start_locs[last_req + 1] - 1
 
     query_start_loc_cpu = slice_query_start_locs(start_locs, request_slice)
-    query_start_loc = slice_query_start_locs(attn_metadata.query_start_loc,
-                                             request_slice)
+    query_start_loc = slice_query_start_locs(
+        attn_metadata.query_start_loc, request_slice
+    )
 
     assert len(query_start_loc) >= 2, (
-        f"query_start_loc must have at least 2 elements, "
-        f"got {len(query_start_loc)}")
+        f"query_start_loc must have at least 2 elements, got {len(query_start_loc)}"
+    )
 
     if splits_first_request:
         tokens_skipped = first_tok - start_locs[first_req]
@@ -159,14 +175,13 @@ def _make_metadata_with_slice(
         seq_lens_cpu[-1] -= tokens_skipped
 
     max_seq_len = int(seq_lens_cpu.max())
-    num_computed_tokens_cpu = attn_metadata.num_computed_tokens_cpu[
-        request_slice]
+    num_computed_tokens_cpu = attn_metadata.num_computed_tokens_cpu[request_slice]
 
     num_requests = request_slice.stop - request_slice.start
     num_actual_tokens = token_slice.stop - token_slice.start
     max_query_len = int(
-        torch.max(torch.abs(query_start_loc_cpu[1:] -
-                            query_start_loc_cpu[:-1])).item())
+        torch.max(torch.abs(query_start_loc_cpu[1:] - query_start_loc_cpu[:-1])).item()
+    )
 
     # This is to account for the case where we are in a dummy
     # run and query_start_loc_cpu is full of 0s
@@ -196,15 +211,14 @@ def split_attn_metadata(
     common_attn_metadata: CommonAttentionMetadata,
 ) -> list[CommonAttentionMetadata]:
     """
-    Creates a new CommonAttentionMetadata instance that corresponds to the 
+    Creates a new CommonAttentionMetadata instance that corresponds to the
     requests for each UBatchSlice in ubatch_slices.
 
     Note: This function does not modify common_attn_metadata
     """
     results = []
     for ubatch_slice in ubatch_slices:
-        results.append(
-            _make_metadata_with_slice(ubatch_slice, common_attn_metadata))
+        results.append(_make_metadata_with_slice(ubatch_slice, common_attn_metadata))
 
     return results
 
@@ -213,7 +227,7 @@ M = TypeVar("M")
 
 
 class AttentionCGSupport(enum.Enum):
-    """ Constants for the cudagraph support of the attention backend
+    """Constants for the cudagraph support of the attention backend
     Here we do not consider the cascade attention, as currently
     it is never cudagraph supported."""
 
@@ -221,7 +235,7 @@ class AttentionCGSupport(enum.Enum):
     """Cudagraph always supported; supports mixed-prefill-decode"""
     UNIFORM_BATCH = 2
     """Cudagraph supported for batches the only contain query lengths that are
-    the same, this can be used for spec-decode 
+    the same, this can be used for spec-decode
         i.e. "decodes" are 1 + num_speculative_tokens"""
     UNIFORM_SINGLE_TOKEN_DECODE = 1
     """Cudagraph supported for batches the only contain query_len==1 decodes"""
@@ -231,46 +245,73 @@ class AttentionCGSupport(enum.Enum):
 
 class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     # Does this backend/builder support CUDA Graphs for attention (default: no).
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.NEVER
+    # Do not access directly. Call get_cudagraph_support() instead.
+    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER
     # Does this backend/builder reorder the batch?
     # If not, set this to None. Otherwise set it to the query
     # length that will be pulled into the front of the batch.
-    reorder_batch_threshold: Optional[int] = None
+    reorder_batch_threshold: int | None = None
 
     @abstractmethod
-    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
-                 vllm_config: VllmConfig, device: torch.device):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
         self.kv_cache_spec = kv_cache_spec
         self.layer_names = layer_names
         self.vllm_config = vllm_config
         self.device = device
 
+    @classmethod
+    def get_cudagraph_support(
+        cls: type["AttentionMetadataBuilder"],
+        vllm_config: VllmConfig,
+        kv_cache_spec: AttentionSpec,
+    ) -> AttentionCGSupport:
+        """Get the cudagraph support level of this builder class."""
+        return cls._cudagraph_support
+
     def _init_reorder_batch_threshold(
-            self,
-            reorder_batch_threshold: int = 1,
-            supports_spec_as_decode: bool = False) -> None:
+        self,
+        reorder_batch_threshold: int | None = 1,
+        supports_spec_as_decode: bool = False,
+        supports_dcp_with_varlen: bool = False,
+    ) -> None:
         self.reorder_batch_threshold = reorder_batch_threshold
-        if self.reorder_batch_threshold is not None \
-            and supports_spec_as_decode:
+        if self.reorder_batch_threshold is not None and supports_spec_as_decode:
             # If the backend supports spec-as-decode kernels, then we can set
             # the reorder_batch_threshold based on the number of speculative
             # tokens from the config.
             speculative_config = self.vllm_config.speculative_config
-            if (speculative_config is not None
-                    and speculative_config.num_speculative_tokens is not None):
-                self.reorder_batch_threshold = \
-                    1 + speculative_config.num_speculative_tokens
+            if (
+                speculative_config is not None
+                and speculative_config.num_speculative_tokens is not None
+            ):
+                self.reorder_batch_threshold = max(
+                    self.reorder_batch_threshold,
+                    1 + speculative_config.num_speculative_tokens,
+                )
+
+        if (
+            self.vllm_config.parallel_config.decode_context_parallel_size > 1
+            and not supports_dcp_with_varlen
+        ):
+            self.reorder_batch_threshold = 1
 
     @abstractmethod
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> M:
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> M:
         """
         Central method that builds attention metadata.
         Some builders (MLA) require reorder_batch to be called prior to build.
-        
+
         Args:
             common_prefix_len: The length of the common prefix of the batch.
             common_attn_metadata: The common attention metadata.
@@ -280,32 +321,17 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
         """
         raise NotImplementedError
 
-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        """
-        Update the order of requests in the batch based on the attention
-        backend's needs. For example, some attention backends (namely MLA) may
-        want to separate requests based on if the attention computation will be
-        compute-bound or memory-bound.
-
-        Args:
-            input_batch: input batch
-            scheduler_output: scheduler output.
-
-        Returns:
-            True if the batch was modified, False otherwise.
-        """
-        raise NotImplementedError
-
     def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata) -> M:
+        self, common_attn_metadata: CommonAttentionMetadata
+    ) -> M:
         """
         Build attention metadata for CUDA graph capture. Uses build by default.
         Subclasses that override this method should call self.build or
         super().build_for_cudagraph_capture.
         """
-        return self.build(common_prefix_len=0,
-                          common_attn_metadata=common_attn_metadata)
+        return self.build(
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
+        )
 
     def build_for_drafting(
         self,
@@ -314,7 +340,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     ) -> M:
         """
         Build attention metadata for draft model. Uses build by default.
-        
+
         Args:
             common_attn_metadata: The common attention metadata.
             draft_index: The index of the current draft operation.
@@ -323,9 +349,11 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
                 For tree-based attention, this index instead refers to the
                 draft attempt for the i-th level in the tree of tokens.
         """
-        return self.build(common_prefix_len=0,
-                          common_attn_metadata=common_attn_metadata,
-                          fast_build=True)
+        return self.build(
+            common_prefix_len=0,
+            common_attn_metadata=common_attn_metadata,
+            fast_build=True,
+        )
 
     def use_cascade_attention(
         self,
@@ -337,6 +365,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
         use_sliding_window: bool,
         use_local_attention: bool,
         num_sms: int,
+        dcp_world_size: int,
     ) -> bool:
         return False
 
@@ -348,8 +377,11 @@ def get_kv_cache_layout():
 
     if _KV_CACHE_LAYOUT_OVERRIDE is not None:
         cache_layout = _KV_CACHE_LAYOUT_OVERRIDE
-        logger.info_once("`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. " \
-                         "Setting KV cache layout to %s.", cache_layout)
+        logger.info_once(
+            "`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. "
+            "Setting KV cache layout to %s.",
+            cache_layout,
+        )
         return cache_layout
 
     # Format specified by the user.
@@ -359,8 +391,11 @@ def get_kv_cache_layout():
         cache_layout = get_kv_connector_cache_layout()
     else:
         assert is_valid_kv_cache_layout(cache_layout)
-        logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \
-        "detected. Setting KV cache layout to %s.", cache_layout)
+        logger.info_once(
+            "`VLLM_KV_CACHE_LAYOUT` environment variable "
+            "detected. Setting KV cache layout to %s.",
+            cache_layout,
+        )
     return cache_layout
 
 
@@ -379,20 +414,23 @@ class PerLayerParameters:
     """
 
     window_left: int
-    logits_soft_cap: Optional[float]
+    logits_soft_cap: float | None
     sm_scale: float
     has_sinks: bool = False
+    # has same params for all layers
+    has_same_window_lefts: bool | None = field(default=None, compare=False)
+    has_same_all_params: bool | None = field(default=None, compare=False)
 
 
 def get_per_layer_parameters(
-        vllm_config: VllmConfig, layer_names: list[str],
-        cls_: type['AttentionImpl']) -> dict[str, PerLayerParameters]:
+    vllm_config: VllmConfig, layer_names: list[str], cls_: type["AttentionImpl"]
+) -> dict[str, PerLayerParameters]:
     """
     Scan layers in `layer_names` and determine some hyperparameters
     to use during `plan`.
     """
 
-    layers = get_layers_from_vllm_config(vllm_config, Attention, layer_names)
+    layers = get_layers_from_vllm_config(vllm_config, AttentionLayerBase, layer_names)
     per_layer_params: dict[str, PerLayerParameters] = {}
 
     for key, layer in layers.items():
@@ -406,17 +444,18 @@ def get_per_layer_parameters(
         sm_scale = impl.scale
         has_sinks = getattr(impl, "sinks", None) is not None
 
-        per_layer_params[key] = PerLayerParameters(window_left,
-                                                   logits_soft_cap, sm_scale,
-                                                   has_sinks)
+        per_layer_params[key] = PerLayerParameters(
+            window_left, logits_soft_cap, sm_scale, has_sinks
+        )
 
     return per_layer_params
 
 
 def infer_global_hyperparameters(
-        per_layer_params: dict[str, PerLayerParameters]) -> PerLayerParameters:
+    per_layer_params: dict[str, PerLayerParameters],
+) -> PerLayerParameters:
     """
-    Currently, FlashInfer backend other than trtllm-gen 
+    Currently, FlashInfer backend other than trtllm-gen
     only support models in which all layers share
     the same values for the following hyperparameters:
     - `window_left`
@@ -432,18 +471,12 @@ def infer_global_hyperparameters(
     param_sets = list(per_layer_params.values())
     global_params = param_sets[0]
 
-    # trtllm attention doesn't need global hyper params so disable the check
-    if not envs.VLLM_USE_TRTLLM_ATTENTION:
-        for params in param_sets:
-            if params.window_left != global_params.window_left:
-                raise ValueError(
-                    "Window left is not the same for all layers. " \
-                    "One potential fix is to set disable_sliding_window=True")
-            assert params == global_params, (
-                "FlashInfer backend currently only supports models in which all"
-                "layers share the same values "
-                "for the following hyperparameters:"
-                "`window_left`, `logits_soft_cap`, `sm_scale`.")
+    global_params.has_same_window_lefts = all(
+        params.window_left == global_params.window_left for params in param_sets
+    )
+    global_params.has_same_all_params = all(
+        params == global_params for params in param_sets
+    )
 
     return global_params
 
@@ -525,11 +558,10 @@ def make_local_attention_virtual_batches(
     #   new_tokens_in_first_block = [2, 1, 4]
     #   local_blocks = [2, 4, 2]
     q_tokens_in_first_block = np.minimum(
-        attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size),
-        q_seqlens).astype(np.int32)
+        attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens
+    ).astype(np.int32)
     tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size)
-    local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block,
-                            attn_chunk_size)
+    local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size)
 
     # Once we know the number of local blocks we can compute the request spans
     #  for each batch idx, we can figure out the number of "virtual" requests we
@@ -550,14 +582,13 @@ def make_local_attention_virtual_batches(
     rarange = np.repeat(local_blocks, local_blocks) - arange - 1
     # Then we can compute the seqlens_q_local, handling the fact that the
     #  first and last blocks could be partial
-    seqlens_q_local = \
-        np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks)
+    seqlens_q_local = np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks)
     # set the first block since this may be a partial block
     seqlens_q_local[arange == 0] = q_tokens_in_first_block
     # set the remaining blocks
     seqlens_q_local[arange > 0] = np.minimum(
-        seqlens_q_local - attn_chunk_size * (arange - 1),
-        attn_chunk_size)[arange > 0]
+        seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size
+    )[arange > 0]
 
     # convert from q_seqlens to cu_seqlens_q
     cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32)
@@ -569,22 +600,20 @@ def make_local_attention_virtual_batches(
     #  batch
     # For our example this will be:
     #   seqlens_k_local = [4, 2, 4, 4, 4, 1, 4, 1]
-    seqlens_k_local = np.full(cu_num_blocks[-1],
-                              attn_chunk_size,
-                              dtype=np.int32)
+    seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32)
     seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block
     num_computed_tokens_local = seqlens_k_local - seqlens_q_local
 
-    k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - \
-        (rarange * attn_chunk_size + \
-            np.repeat(tokens_in_last_block, local_blocks))
+    k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - (
+        rarange * attn_chunk_size + np.repeat(tokens_in_last_block, local_blocks)
+    )
     # For the example the local attention blocks start at:
     #                           _b0_  _____b1_____  _b2_
     #   k_seqstarts_absolute = [0, 4, 4, 8, 12, 16, 4, 8]
     block_starts = k_seqstarts_absolute // block_size
-    assert attn_chunk_size % block_size == 0, \
-        f"attn_chunk_size {attn_chunk_size} is not " \
-        f"divisible by block_size {block_size}"
+    assert attn_chunk_size % block_size == 0, (
+        f"attn_chunk_size {attn_chunk_size} is not divisible by block_size {block_size}"
+    )
     pages_per_local_batch = attn_chunk_size // block_size
 
     # Create a block_table for the local attention blocks
@@ -605,12 +634,14 @@ def make_local_attention_virtual_batches(
     #     [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4])
     #     [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8])
     #   ]
-    block_indices = (block_starts[:, None] +
-                     np.arange(pages_per_local_batch, dtype=np.int32))
-    block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] -
-                                                   1)
-    batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32),
-                              local_blocks * pages_per_local_batch)
+    block_indices = block_starts[:, None] + np.arange(
+        pages_per_local_batch, dtype=np.int32
+    )
+    block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - 1)
+    batch_indices = np.repeat(
+        np.arange(actual_batch_size, dtype=np.int32),
+        local_blocks * pages_per_local_batch,
+    )
 
     # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance
     # regression when using numpy arrays (batch and block indices) to index into
@@ -618,8 +649,9 @@ def make_local_attention_virtual_batches(
     # tensor first, which recovers perf.
     batch_indices_torch = torch.from_numpy(batch_indices)
     block_indices_torch = torch.from_numpy(block_indices)
-    block_table_local = block_table[batch_indices_torch, block_indices_torch]\
-        .view(virtual_batches, -1)
+    block_table_local = block_table[batch_indices_torch, block_indices_torch].view(
+        virtual_batches, -1
+    )
 
     query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local)
     seq_lens_cpu = torch.from_numpy(seqlens_k_local)
@@ -627,8 +659,7 @@ def make_local_attention_virtual_batches(
 
     return CommonAttentionMetadata(
         query_start_loc_cpu=query_start_loc_cpu,
-        query_start_loc=query_start_loc_cpu.to(device=device,
-                                               non_blocking=True),
+        query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True),
         seq_lens_cpu=seq_lens_cpu,
         seq_lens=seq_lens_cpu.to(device=device, non_blocking=True),
         num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
@@ -668,9 +699,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
 
     # Find how many decode indices belong to each request
     # request_ids: [0, 1, 1, 2]
-    request_ids = torch.bucketize(logits_indices,
-                                  query_start_loc[1:],
-                                  right=True)
+    request_ids = torch.bucketize(logits_indices, query_start_loc[1:], right=True)
 
     # Figure out how many tokens are in each request
     # num_decode_tokens: [1, 2, 1]
@@ -678,9 +707,9 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
 
     # Calculate new query_start_loc with tokens in generation_indices
     # decode_query_start_loc: [0, 1, 3, 4]
-    decode_query_start_loc = torch.empty(num_reqs + 1,
-                                         device=query_start_loc.device,
-                                         dtype=query_start_loc.dtype)
+    decode_query_start_loc = torch.empty(
+        num_reqs + 1, device=query_start_loc.device, dtype=query_start_loc.dtype
+    )
 
     decode_query_start_loc[0] = 0
     decode_query_start_loc[1:] = torch.cumsum(num_decode_tokens, dim=0)
@@ -689,8 +718,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
 
     common_attn_metadata = CommonAttentionMetadata(
         query_start_loc=decode_query_start_loc,
-        query_start_loc_cpu=decode_query_start_loc.to("cpu",
-                                                      non_blocking=True),
+        query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True),
         seq_lens=seq_lens,
         seq_lens_cpu=seq_lens.to("cpu", non_blocking=True),
         num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
@@ -706,22 +734,92 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
 
 
 def subclass_attention_backend(
-        name_prefix: str, attention_backend_cls: type[AttentionBackend],
-        builder_cls: type[AttentionMetadataBuilder[M]]
+    name_prefix: str,
+    attention_backend_cls: type[AttentionBackend],
+    builder_cls: type[AttentionMetadataBuilder[M]],
 ) -> type[AttentionBackend]:
     """
     Return a new subclass where `get_builder_cls` returns `builder_cls`.
     """
     name: str = name_prefix + attention_backend_cls.__name__  # type: ignore
 
-    return type(name, (attention_backend_cls, ),
-                {"get_builder_cls": lambda: builder_cls})
+    return type(
+        name, (attention_backend_cls,), {"get_builder_cls": lambda: builder_cls}
+    )
+
+
+def split_decodes_prefills_and_extends(
+    common_attn_metadata: CommonAttentionMetadata,
+    decode_threshold: int = 1,
+) -> tuple[int, int, int, int, int, int]:
+    """
+    Assuming a reordered batch, finds the boundary between prefill and decode
+    requests.
+
+    Args:
+        common_attn_metadata: CommonAttentionMetadata object containing the
+            batch metadata.
+        decode_threshold: The maximum query length to be considered a decode.
+
+    Returns:
+        num_decodes: The number of decode requests.
+        num_extends: The number of extend requests.
+        num_prefills: The number of prefill requests.
+        num_decode_tokens: The number of tokens in the decode requests.
+        num_extend_tokens: The number of tokens in the extend requests.
+        num_prefill_tokens: The number of tokens in the prefill requests.
+    """
+    max_query_len = common_attn_metadata.max_query_len
+    num_reqs = common_attn_metadata.num_reqs
+    num_tokens = common_attn_metadata.num_actual_tokens
+    query_start_loc = common_attn_metadata.query_start_loc_cpu
+    seq_lens = common_attn_metadata.seq_lens_cpu
+
+    if max_query_len <= decode_threshold:
+        return num_reqs, 0, 0, num_tokens, 0, 0
+
+    query_lens = query_start_loc[1:] - query_start_loc[:-1]
+    is_prefill_or_extend = query_lens > decode_threshold
+    is_prefill = (seq_lens == query_lens) & is_prefill_or_extend
+    first_extend = is_prefill_or_extend.int().argmax(dim=-1).item()
+    first_prefill = is_prefill.int().argmax(dim=-1).item()
+    num_decodes = first_extend
+    num_decode_tokens = query_start_loc[first_extend].item()
+    if not torch.any(is_prefill_or_extend):
+        return (num_decodes, 0, 0, num_decode_tokens, 0, 0)
+
+    num_prefills_or_extends = num_reqs - num_decodes
+    num_prefill_or_extend_tokens = num_tokens - num_decode_tokens
+    if not torch.any(is_prefill):
+        return (
+            num_decodes,
+            num_prefills_or_extends,
+            0,
+            num_decode_tokens,
+            num_prefill_or_extend_tokens,
+            0,
+        )
+
+    num_extends = first_prefill - num_decodes
+    num_prefills = num_reqs - first_prefill
+
+    num_prefill_tokens = num_tokens - query_start_loc[first_prefill]
+    num_extend_tokens = num_prefill_or_extend_tokens - num_prefill_tokens
+    return (
+        num_decodes,
+        num_extends,
+        num_prefills,
+        num_decode_tokens,
+        num_extend_tokens,
+        num_prefill_tokens,
+    )
 
 
 def split_decodes_and_prefills(
-        common_attn_metadata: CommonAttentionMetadata,
-        decode_threshold: int = 1,
-        require_uniform: bool = False) -> tuple[int, int, int, int]:
+    common_attn_metadata: CommonAttentionMetadata,
+    decode_threshold: int = 1,
+    require_uniform: bool = False,
+) -> tuple[int, int, int, int]:
     """
     Assuming a reordered batch, finds the boundary between prefill and decode
     requests.
@@ -745,8 +843,9 @@ def split_decodes_and_prefills(
     num_tokens = common_attn_metadata.num_actual_tokens
     query_start_loc = common_attn_metadata.query_start_loc_cpu
 
-    if max_query_len <= decode_threshold and \
-        (not require_uniform or decode_threshold <= 1):
+    if max_query_len <= decode_threshold and (
+        not require_uniform or decode_threshold <= 1
+    ):
         return num_reqs, 0, num_tokens, 0
 
     query_lens = query_start_loc[1:] - query_start_loc[:-1]
@@ -779,63 +878,66 @@ def reorder_batch_to_split_decodes_and_prefills(
     """
     Reorders the batch to split into prefill and decode requests; places all
     requests with <= decode_threshold tokens at the front of the batch.
-    
+
     Returns:
         True if the batch was modified, False otherwise.
     """
-    # We now want to reorder the batch so that the "decode" requests are at
-    # the front and the "prefill" requests are at the back using the least
-    # amount of swaps possible. (NOTE for now we loosely use "decode" to mean
-    # requests where attention is likely memory-bound and "prefill" to mean
-    # requests where attention is likely compute-bound, TODO(lucas): figure out
-    # a better naming here)
-    decodes = []
-    prefills = []
-    num_decode_tokens = 0
-    num_prefill_tokens = 0
-
-    for i, req_id in enumerate(input_batch.req_ids):
-        num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-        # for now treat 1 scheduled token as "decode" even if it's not,
-        # we should update this to something like < 8 in the future but
-        # currently the TritonMLA._forward_decode only supports
-        # num_tokens = 1
-        if num_tokens <= decode_threshold:
-            decodes.append(i)
-            num_decode_tokens += num_tokens
-        else:
-            prefills.append(i)
-            num_prefill_tokens += num_tokens
-
-    # We hope that this is fairly minimal since decodes
-    # should be around for a number of iterations so hopefully they are
-    # relatively stationary (and new request are generally appended to the
-    # persistent batch so already should be at the back)
-    # To achieve this we loop over the decodes in descending order and
-    # the prefills in ascending order. We swap decodes from the  "back"
-    # i.e. past where the last decode should be in the reodorered with
-    # prefills from the front of the batch.
-    # `decodes` and `prefills` are already in ascending order just based on
-    # the above loop
-    num_decodes = len(decodes)
-    num_prefills = len(prefills)
-    modified_batch = False
-
-    for i in range(1, min(num_decodes, num_prefills) + 1):
-        # If the decode is at the "back" of the batch, i, we can swap it
-        # with the prefill closest to the front of the batch
-        decode_idx = decodes[num_decodes - i]
-        if decode_idx < num_decodes:
-            break
-
-        input_batch.swap_states(prefills[i - 1], decode_idx)
-        modified_batch = True
-
-    return modified_batch
-
-
-def reshape_query_for_spec_decode(query: torch.Tensor,
-                                  batch_size: int) -> torch.Tensor:
+    # We now want to reorder the batch into decode → extend → prefill order
+    # where:
+    #   decode: request with num_scheduled_tokens <= decode_threshold
+    #   extend: non-decode request with existing context
+    #   prefill: non-decode request with no existing context
+    # NOTE for now we loosely use "decode" to mean requests where attention is
+    #  likely memory-bound and "prefill" to mean requests where attention is
+    #  likely compute-bound,
+    num_reqs = len(input_batch.req_ids)
+    num_scheduled_tokens = [
+        scheduler_output.num_scheduled_tokens[id] for id in input_batch.req_ids
+    ]
+    num_scheduled_tokens_np = np.array(num_scheduled_tokens)
+    num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs]
+
+    is_decode = num_scheduled_tokens_np <= decode_threshold
+    is_extend = (~is_decode) & (num_computed_tokens_np > 0)
+    is_prefill = (~is_decode) & (num_computed_tokens_np == 0)
+
+    # Desired order: decode → extend → prefill
+    req_regions = np.zeros(is_decode.shape, dtype=np.int32)  # 0 = decode by default
+    req_regions[is_extend] = 1
+    req_regions[is_prefill] = 2
+
+    num_decodes = int(is_decode.sum())
+    num_extends = int(is_extend.sum())
+
+    target_regions = np.zeros(num_reqs, dtype=np.int32)
+    target_regions[num_decodes : num_decodes + num_extends] = 1
+    target_regions[num_decodes + num_extends :] = 2
+
+    needs_swap = req_regions != target_regions
+
+    if not needs_swap.any():
+        return False
+
+    # Extract indices that need swapping and sort by target region
+    orig_indices = np.where(needs_swap)[0]
+    sorted_order = np.argsort(req_regions[needs_swap], kind="stable")
+    src_indices = orig_indices[sorted_order]
+
+    src_dest_map = {int(src): int(dst) for src, dst in zip(src_indices, orig_indices)}
+
+    for src in src_dest_map:
+        dst = src_dest_map[src]
+        while src != dst:
+            input_batch.swap_states(src, dst)
+            # Mark dst as done by updating its destination to itself
+            next_dst = src_dest_map.get(dst, dst)
+            src_dest_map[dst] = dst
+            dst = next_dst
+
+    return True
+
+
+def reshape_query_for_spec_decode(query: torch.Tensor, batch_size: int) -> torch.Tensor:
     """
     Reshapes the query tensor for the specified batch size, so that
     it has shape (batch_size, seq_len, num_heads, head_dim).
@@ -845,13 +947,13 @@ def reshape_query_for_spec_decode(query: torch.Tensor,
     num_heads = query.shape[1]
     head_dim = query.shape[2]
     assert total_tokens % batch_size == 0, (
-        f"{total_tokens=} is not divisible by {batch_size=}")
+        f"{total_tokens=} is not divisible by {batch_size=}"
+    )
     seq_len = total_tokens // batch_size
     return query.view(batch_size, seq_len, num_heads, head_dim)
 
 
-def reshape_attn_output_for_spec_decode(
-        attn_output: torch.Tensor) -> torch.Tensor:
+def reshape_attn_output_for_spec_decode(attn_output: torch.Tensor) -> torch.Tensor:
     """
     Reshapes the attention output tensor, so that
     the batch_size and seq_len dimensions are combined.
@@ -859,17 +961,9 @@ def reshape_attn_output_for_spec_decode(
     if attn_output.dim() == 3:
         # Already in the correct shape
         return attn_output
-    assert attn_output.dim() == 4, \
-        f"attn_output must be 4D, got {attn_output.dim()}D"
+    assert attn_output.dim() == 4, f"attn_output must be 4D, got {attn_output.dim()}D"
     total_tokens = attn_output.shape[0] * attn_output.shape[1]
-    return attn_output.view(total_tokens, attn_output.shape[2],
-                            attn_output.shape[3])
-
-
-KV_SHARING_FAST_PREFILL_METADATA_FIELDS = [
-    ('logits_indices_padded', Optional[torch.Tensor], None),
-    ('num_logits_indices', int, 0),
-]
+    return attn_output.view(total_tokens, attn_output.shape[2], attn_output.shape[3])
 
 
 def subclass_attention_metadata(
@@ -881,69 +975,64 @@ def subclass_attention_metadata(
     Return a new subclass of `metadata_cls` with additional fields
     """
     name: str = name_prefix + metadata_cls.__name__  # type: ignore
-    Wrapped = make_dataclass(name, fields, bases=(metadata_cls, ))
+    Wrapped = make_dataclass(name, fields, bases=(metadata_cls,))
     return Wrapped
 
 
 @runtime_checkable
 class KVSharingFastPrefillMetadata(Protocol):
-    logits_indices_padded: torch.Tensor
-    num_logits_indices: int
+    logits_indices_padded: torch.Tensor | None = None
+    num_logits_indices: int | None = None
 
 
 def create_fast_prefill_custom_backend(
     prefix: str,
     underlying_attn_backend: AttentionBackend,
 ) -> type[AttentionBackend]:
-
     underlying_builder = underlying_attn_backend.get_builder_cls()
 
     class FastPrefillAttentionBuilder(underlying_builder):  # type: ignore
-
-        def build(self,
-                  common_prefix_len: int,
-                  common_attn_metadata: CommonAttentionMetadata,
-                  fast_build: bool = False) -> AttentionMetadata:
-            new_common_attn_metadata =\
-            make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
-            metadata = super().build(common_prefix_len,
-                                     new_common_attn_metadata, fast_build)
+        def build(
+            self,
+            common_prefix_len: int,
+            common_attn_metadata: CommonAttentionMetadata,
+            fast_build: bool = False,
+        ) -> AttentionMetadata:
+            new_common_attn_metadata = (
+                make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
+            )
+            metadata = super().build(
+                common_prefix_len, new_common_attn_metadata, fast_build
+            )
 
             class KVSharingFastPrefillAttentionMetadata(
-                    metadata.__class__,  #  type: ignore
-                    KVSharingFastPrefillMetadata):
-
+                metadata.__class__,  #  type: ignore
+                KVSharingFastPrefillMetadata,
+            ):
                 def __init__(self, metadata, common_attn_metadata):
                     # Shallow copy all fields in metadata cls
-                    for field in fields(metadata.__class__):
-                        setattr(self, field.name,
-                                getattr(metadata, field.name))
-
-                    # Set additional fields that will be used in model code
-                    assert (common_attn_metadata.logits_indices_padded
-                            is not None
-                            and common_attn_metadata.num_logits_indices
-                            is not None)
-                    self.logits_indices_padded = \
+                    for _field in fields(metadata.__class__):
+                        setattr(self, _field.name, getattr(metadata, _field.name))
+
+                    self.logits_indices_padded = (
                         common_attn_metadata.logits_indices_padded
-                    self.num_logits_indices = \
-                        common_attn_metadata.num_logits_indices
+                    )
+                    self.num_logits_indices = common_attn_metadata.num_logits_indices
 
-            return KVSharingFastPrefillAttentionMetadata(
-                metadata, common_attn_metadata)
+            return KVSharingFastPrefillAttentionMetadata(metadata, common_attn_metadata)
 
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
         attention_backend_cls=underlying_attn_backend,
-        builder_cls=FastPrefillAttentionBuilder)
+        builder_cls=FastPrefillAttentionBuilder,
+    )
 
     return attn_backend
 
 
 def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor):
-
     # Needed for causal_conv1d
-    seqlens = query_start_loc_p.diff().to('cpu')
+    seqlens = query_start_loc_p.diff().to("cpu")
     nums_dict = {}  # type: ignore
     batch_ptr = None
     token_chunk_offset_ptr = None
@@ -951,40 +1040,77 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor):
     for BLOCK_M in [8]:  # cover all BLOCK_M values
         nums = -(-seqlens // BLOCK_M)
         nums_dict[BLOCK_M] = {}
-        nums_dict[BLOCK_M]['nums'] = nums
-        nums_dict[BLOCK_M]['tot'] = nums.sum().item()
+        nums_dict[BLOCK_M]["nums"] = nums
+        nums_dict[BLOCK_M]["tot"] = nums.sum().item()
         mlist = torch.from_numpy(np.repeat(np.arange(len(nums)), nums))
-        nums_dict[BLOCK_M]['mlist'] = mlist
-        mlist_len = len(nums_dict[BLOCK_M]['mlist'])
-        nums_dict[BLOCK_M]['mlist_len'] = mlist_len
+        nums_dict[BLOCK_M]["mlist"] = mlist
+        mlist_len = len(nums_dict[BLOCK_M]["mlist"])
+        nums_dict[BLOCK_M]["mlist_len"] = mlist_len
         MAX_NUM_PROGRAMS = max(1024, mlist_len) * 2
         offsetlist = []  # type: ignore
         for idx, num in enumerate(nums):
             offsetlist.extend(range(num))
         offsetlist = torch.tensor(offsetlist, dtype=torch.int32)
-        nums_dict[BLOCK_M]['offsetlist'] = offsetlist
+        nums_dict[BLOCK_M]["offsetlist"] = offsetlist
 
         if batch_ptr is None:
             # Update default value after class definition
-            batch_ptr = torch.full((MAX_NUM_PROGRAMS, ),
-                                   PAD_SLOT_ID,
-                                   dtype=torch.int32,
-                                   device=device)
-            token_chunk_offset_ptr = torch.full((MAX_NUM_PROGRAMS, ),
-                                                PAD_SLOT_ID,
-                                                dtype=torch.int32,
-                                                device=device)
+            batch_ptr = torch.full(
+                (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=device
+            )
+            token_chunk_offset_ptr = torch.full(
+                (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=device
+            )
         else:
             if batch_ptr.nelement() < MAX_NUM_PROGRAMS:
                 batch_ptr.resize_(MAX_NUM_PROGRAMS).fill_(PAD_SLOT_ID)
                 token_chunk_offset_ptr.resize_(  # type: ignore
-                    MAX_NUM_PROGRAMS).fill_(PAD_SLOT_ID)
+                    MAX_NUM_PROGRAMS
+                ).fill_(PAD_SLOT_ID)
 
         batch_ptr[0:mlist_len].copy_(mlist)
         token_chunk_offset_ptr[  # type: ignore
-            0:mlist_len].copy_(offsetlist)
-        nums_dict[BLOCK_M]['batch_ptr'] = batch_ptr
-        nums_dict[BLOCK_M]['token_chunk_offset_ptr'] = (token_chunk_offset_ptr
-                                                        )  # type: ignore
+            0:mlist_len
+        ].copy_(offsetlist)
+        nums_dict[BLOCK_M]["batch_ptr"] = batch_ptr
+        nums_dict[BLOCK_M]["token_chunk_offset_ptr"] = token_chunk_offset_ptr  # type: ignore
 
     return nums_dict, batch_ptr, token_chunk_offset_ptr
+
+
+def get_dcp_local_seq_lens(
+    seq_lens: torch.Tensor,
+    dcp_world_size: int = 1,
+    dcp_rank: int | None = None,
+    dcp_kv_cache_interleave_size: int = 1,
+) -> torch.Tensor:
+    """While using dcp, kv_cache size stored on each rank may be different,
+    use this function to calculate split decode seq_lens of each dcp rank.
+    Only consider dcp now, we can extend the case of cp based on this.
+    """
+    num_requests = seq_lens.size(0)
+    if dcp_rank is None:
+        rank_offsets = (
+            torch.arange(dcp_world_size, dtype=torch.int32)
+            .unsqueeze(0)
+            .repeat(num_requests, 1)
+        )
+    else:
+        rank_offsets = torch.Tensor([[dcp_rank]]).to(dtype=torch.int32)
+    seq_lens_tiled = (
+        seq_lens.to(torch.int32).unsqueeze(-1).repeat(1, rank_offsets.shape[1])
+    )
+    base = (
+        seq_lens_tiled
+        // dcp_kv_cache_interleave_size
+        // dcp_world_size
+        * dcp_kv_cache_interleave_size
+    )
+    remainder = seq_lens_tiled - base * dcp_world_size
+    remainder = torch.clip(
+        remainder - rank_offsets * dcp_kv_cache_interleave_size,
+        0,
+        dcp_kv_cache_interleave_size,
+    )
+    dcp_local_seq_lens = base + remainder
+    return dcp_local_seq_lens.squeeze(1)
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
index 9d667ee04f75b2f3150b7a56c2ace9839b9bf0e2..d15d79417cc61a1e2b36197904cfaa39f78b6197 100644
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -3,45 +3,46 @@
 """Attention layer with XFormersAttention."""
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import ClassVar, Optional
 
 import torch
 
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder, CommonAttentionMetadata,
-    reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills)
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+    split_decodes_and_prefills,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 try:
     from xformers import ops as xops
     from xformers.ops.fmha.attn_bias import (
-        AttentionBias, PagedBlockDiagonalCausalWithOffsetPaddedKeysMask)
+        AttentionBias,
+        PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
+    )
 
     XFORMERS_AVAILABLE = True
 except ImportError:
     XFORMERS_AVAILABLE = False
 
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
 from vllm import _custom_ops as ops
 
 logger = init_logger(__name__)
 
 
 class XFormersAttentionBackend(AttentionBackend):
-
     accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
+    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
+    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -77,17 +78,6 @@ class XFormersAttentionBackend(AttentionBackend):
             256,
         ]
 
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
-
     @staticmethod
     def get_name() -> str:
         return "XFORMERS"
@@ -96,10 +86,6 @@ class XFormersAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["XFormersAttentionImpl"]:
         return XFormersAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return XFormersAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -153,9 +139,9 @@ class XFormersAttentionMetadata:
             # metadata structure
             return self._cached_prefill_metadata
 
-        q_start_loc = self.query_start_loc[self.num_decodes:]
+        q_start_loc = self.query_start_loc[self.num_decodes :]
         q_seqlens = torch.diff(q_start_loc)
-        kv_seqlens = self.seq_lens[self.num_decodes:]
+        kv_seqlens = self.seq_lens[self.num_decodes :]
         # Construct & cache prefill-phase attention metadata structure
         self._cached_prefill_metadata = XFormersAttentionMetadata(
             num_actual_tokens=self.num_prefill_tokens,
@@ -163,8 +149,8 @@ class XFormersAttentionMetadata:
             query_start_loc=q_start_loc - q_start_loc[0],
             max_seq_len=int(kv_seqlens.max().item()),
             seq_lens=kv_seqlens,
-            block_table=self.block_table[self.num_decodes:],
-            slot_mapping=self.slot_mapping[self.num_decode_tokens:],
+            block_table=self.block_table[self.num_decodes :],
+            slot_mapping=self.slot_mapping[self.num_decode_tokens :],
         )
         return self._cached_prefill_metadata
 
@@ -180,24 +166,24 @@ class XFormersAttentionMetadata:
 
         q_start_loc = self.query_start_loc
         q_seqlens = torch.diff(q_start_loc)
-        decode_kv_seqlens = self.seq_lens[:self.num_decodes]
+        decode_kv_seqlens = self.seq_lens[: self.num_decodes]
         # Construct & cache decode-phase attention metadata structure
         self._cached_decode_metadata = XFormersAttentionMetadata(
             num_actual_tokens=self.num_decode_tokens,
-            max_query_len=int(q_seqlens[:self.num_decodes].max().item()),
-            query_start_loc=q_start_loc[:self.num_decodes + 1],
+            max_query_len=int(q_seqlens[: self.num_decodes].max().item()),
+            query_start_loc=q_start_loc[: self.num_decodes + 1],
             max_seq_len=int(decode_kv_seqlens.max().item()),
             seq_lens=decode_kv_seqlens,
-            block_table=self.block_table[:self.num_decodes],
-            slot_mapping=self.slot_mapping[:self.num_decode_tokens],
+            block_table=self.block_table[: self.num_decodes],
+            slot_mapping=self.slot_mapping[: self.num_decode_tokens],
             attn_bias=self.attn_bias,
         )
         return self._cached_decode_metadata
 
 
 class XFormersAttentionMetadataBuilder(
-        AttentionMetadataBuilder[XFormersAttentionMetadata]):
-
+    AttentionMetadataBuilder[XFormersAttentionMetadata]
+):
     reorder_batch_threshold: int = 1
 
     def __init__(
@@ -214,13 +200,6 @@ class XFormersAttentionMetadataBuilder(
         self._num_decodes = 0
         self._num_decode_tokens = 0
 
-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(
-            input_batch,
-            scheduler_output,
-            decode_threshold=self.reorder_batch_threshold)
-
     def build(
         self,
         common_prefix_len: int,
@@ -229,8 +208,9 @@ class XFormersAttentionMetadataBuilder(
     ) -> XFormersAttentionMetadata:
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold))
+                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
+            )
+        )
 
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         q_start_loc = common_attn_metadata.query_start_loc
@@ -246,14 +226,13 @@ class XFormersAttentionMetadataBuilder(
             # Construct the decoder bias.
             decode_q_seqlens = q_seqlens[:num_decodes]
             decode_kv_seqlens = kv_seqlens[:num_decodes]
-            bias = (
-                PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                    q_seqlen=decode_q_seqlens.tolist(),
-                    kv_seqlen=decode_kv_seqlens.tolist(),
-                    page_size=self.block_size,
-                    block_tables=block_table[:num_decodes],
-                    device=block_table.device,
-                ))
+            bias = PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=decode_q_seqlens.tolist(),
+                kv_seqlen=decode_kv_seqlens.tolist(),
+                page_size=self.block_size,
+                block_tables=block_table[:num_decodes],
+                device=block_table.device,
+            )
 
         return XFormersAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
@@ -272,25 +251,23 @@ class XFormersAttentionMetadataBuilder(
 
 
 class XFormersAttentionImpl(AttentionImpl):
-
     def __init__(
         self,
         num_heads: int,
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported in V0.")
         if alibi_slopes is not None:
-            raise NotImplementedError(
-                "XFormers does not support alibi slopes yet.")
+            raise NotImplementedError("XFormers does not support alibi slopes yet.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -310,13 +287,13 @@ class XFormersAttentionImpl(AttentionImpl):
             logits_soft_cap = 0
         self.logits_soft_cap = logits_soft_cap
 
-        XFormersAttentionBackend.validate_head_size(head_size)
-
         if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "XFormersAttentionImpl.")
+            raise NotImplementedError(
+                "Encoder self-attention and "
+                "encoder/decoder cross-attention "
+                "are not implemented for "
+                "XFormersAttentionImpl."
+            )
 
     def forward(
         self,
@@ -326,9 +303,9 @@ class XFormersAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: XFormersAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with XFormers.
 
@@ -347,11 +324,12 @@ class XFormersAttentionImpl(AttentionImpl):
         if output_scale is not None or output_block_scale is not None:
             raise NotImplementedError(
                 "fused output quantization is not yet supported"
-                " for XFormersAttentionImpl")
+                " for XFormersAttentionImpl"
+            )
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         # Cache the input KVs.
         key_cache, value_cache = kv_cache.unbind(0)
@@ -377,8 +355,7 @@ class XFormersAttentionImpl(AttentionImpl):
         num_actual_tokens = attn_metadata.num_actual_tokens
         num_decode_tokens = attn_metadata.num_decode_tokens
         if prefill_meta := attn_metadata.prefill_metadata:
-            descale_shape = (prefill_meta.query_start_loc.shape[0] - 1,
-                             key.shape[1])
+            descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, key.shape[1])
             unified_attention(
                 q=query[num_decode_tokens:num_actual_tokens],
                 k=key_cache,
@@ -403,36 +380,38 @@ class XFormersAttentionImpl(AttentionImpl):
             # Query for decode. KV is not needed because it is already cached.
             decode_query = query[:num_decode_tokens]
             # Reshape query to [1, B_T, G, H, D].
-            q = decode_query.view(1, -1, self.num_kv_heads,
-                                  self.num_queries_per_kv, self.head_size)
+            q = decode_query.view(
+                1, -1, self.num_kv_heads, self.num_queries_per_kv, self.head_size
+            )
             # Reshape the k and v caches to [1, Bkv_T, G, H, D]
-            cache_k = key_cache.view(1, -1, self.num_kv_heads, 1,
-                                     self.head_size).expand(
-                                         1,
-                                         -1,
-                                         self.num_kv_heads,
-                                         self.num_queries_per_kv,
-                                         self.head_size,
-                                     )
-            cache_v = value_cache.view(1, -1, self.num_kv_heads, 1,
-                                       self.head_size).expand(
-                                           1,
-                                           -1,
-                                           self.num_kv_heads,
-                                           self.num_queries_per_kv,
-                                           self.head_size,
-                                       )
+            cache_k = key_cache.view(
+                1, -1, self.num_kv_heads, 1, self.head_size
+            ).expand(
+                1,
+                -1,
+                self.num_kv_heads,
+                self.num_queries_per_kv,
+                self.head_size,
+            )
+            cache_v = value_cache.view(
+                1, -1, self.num_kv_heads, 1, self.head_size
+            ).expand(
+                1,
+                -1,
+                self.num_kv_heads,
+                self.num_queries_per_kv,
+                self.head_size,
+            )
 
             attn_bias = decode_meta.attn_bias
-            output[:
-                   num_decode_tokens] = xops.memory_efficient_attention_forward(
-                       q,
-                       cache_k,
-                       cache_v,
-                       attn_bias=attn_bias,
-                       p=0.0,
-                       scale=self.scale,
-                   ).view(decode_query.shape)
+            output[:num_decode_tokens] = xops.memory_efficient_attention_forward(
+                q,
+                cache_k,
+                cache_v,
+                attn_bias=attn_bias,
+                p=0.0,
+                scale=self.scale,
+            ).view(decode_query.shape)
 
         # Reshape the output tensor.
         return output
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index 3cc738304821b614adb62b67d2f09fecc41f0405..55710ad5cc693feea4e0cdd21759d803ecacb6e2 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -1,18 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
-from typing import Any, Optional, Union
-
-from vllm.distributed.kv_events import (MEDIUM_GPU, AllBlocksCleared,
-                                        BlockRemoved, BlockStored,
-                                        KVCacheEvent)
+from collections.abc import Iterable, Sequence
+from typing import Any
+
+from vllm.distributed.kv_events import (
+    MEDIUM_GPU,
+    AllBlocksCleared,
+    BlockRemoved,
+    BlockStored,
+    KVCacheEvent,
+)
 from vllm.logger import init_logger
-from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
-                                         ExternalBlockHash,
-                                         FreeKVCacheBlockQueue, KVCacheBlock,
-                                         get_block_hash,
-                                         make_block_hash_with_group_id,
-                                         maybe_convert_block_hash)
+from vllm.v1.core.kv_cache_utils import (
+    BlockHash,
+    BlockHashWithGroupId,
+    ExternalBlockHash,
+    FreeKVCacheBlockQueue,
+    KVCacheBlock,
+    get_block_hash,
+    make_block_hash_with_group_id,
+    maybe_convert_block_hash,
+)
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -20,7 +28,7 @@ logger = init_logger(__name__)
 
 class BlockHashToBlockMap:
     """
-    Cache of blocks that are used for prefix caching. It caches blocks 
+    Cache of blocks that are used for prefix caching. It caches blocks
     from hash directly to a block or multiple blocks
     (i.e. {block_hash: KVCacheBlocks})
     - Mostly block_hash maps to a single KVCacheBlock, and KVCacheBlocks
@@ -42,11 +50,11 @@ class BlockHashToBlockMap:
     """
 
     def __init__(self):
-        self._cache: dict[BlockHashWithGroupId,
-                          Union[KVCacheBlock, dict[int, KVCacheBlock]]] = {}
+        self._cache: dict[
+            BlockHashWithGroupId, KVCacheBlock | dict[int, KVCacheBlock]
+        ] = {}
 
-    def get_one_block(self,
-                      key: BlockHashWithGroupId) -> Optional[KVCacheBlock]:
+    def get_one_block(self, key: BlockHashWithGroupId) -> KVCacheBlock | None:
         """
         Gets any block with the given block hash key.
         """
@@ -77,8 +85,7 @@ class BlockHashToBlockMap:
         else:
             self._unexpected_blocks_type(blocks)
 
-    def pop(self, key: BlockHashWithGroupId,
-            block_id: int) -> Optional[KVCacheBlock]:
+    def pop(self, key: BlockHashWithGroupId, block_id: int) -> KVCacheBlock | None:
         """
         Checks if block_hash exists and pop block_id from the cache
         """
@@ -148,8 +155,7 @@ class BlockPool:
         self.free_block_queue = FreeKVCacheBlockQueue(self.blocks)
 
         # Cache for block lookup
-        self.cached_block_hash_to_block: BlockHashToBlockMap = \
-            BlockHashToBlockMap()
+        self.cached_block_hash_to_block: BlockHashToBlockMap = BlockHashToBlockMap()
 
         # To represent a placeholder block with block_id=0.
         # The ref_cnt of null_block is not maintained, needs special care to
@@ -161,9 +167,9 @@ class BlockPool:
         self.kv_event_queue: list[KVCacheEvent] = []
 
     def get_cached_block(
-            self, block_hash: BlockHash,
-            kv_cache_group_ids: list[int]) -> Optional[list[KVCacheBlock]]:
-        """Get the cached block by the block hash for each group in 
+        self, block_hash: BlockHash, kv_cache_group_ids: list[int]
+    ) -> list[KVCacheBlock] | None:
+        """Get the cached block by the block hash for each group in
         `kv_cache_group_ids`, or None if cache miss for any group.
         If there are duplicated blocks, we return the first block in the cache.
 
@@ -177,9 +183,11 @@ class BlockPool:
         cached_blocks = []
         for group_id in kv_cache_group_ids:
             block_hash_with_group_id = make_block_hash_with_group_id(
-                block_hash, group_id)
+                block_hash, group_id
+            )
             block = self.cached_block_hash_to_block.get_one_block(
-                block_hash_with_group_id)
+                block_hash_with_group_id
+            )
             if not block:
                 return None
             cached_blocks.append(block)
@@ -211,48 +219,52 @@ class BlockPool:
             block_size: Number of tokens in each block.
             kv_cache_group_id: The id of the KV cache group.
         """
-        if num_cached_blocks == num_full_blocks:
+        if num_cached_blocks >= num_full_blocks:
             return
         new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
         assert len(request.block_hashes) >= num_full_blocks
         new_block_hashes = request.block_hashes[num_cached_blocks:]
 
-        new_hashes: Optional[list[ExternalBlockHash]] = (
-            [] if self.enable_kv_cache_events else None)
+        new_hashes: list[ExternalBlockHash] | None = (
+            [] if self.enable_kv_cache_events else None
+        )
         for i, blk in enumerate(new_full_blocks):
             assert blk.block_hash is None
             block_hash = new_block_hashes[i]
 
             # Update and added the full block to the cache.
             block_hash_with_group_id = make_block_hash_with_group_id(
-                block_hash, kv_cache_group_id)
+                block_hash, kv_cache_group_id
+            )
             blk.block_hash = block_hash_with_group_id
-            self.cached_block_hash_to_block.insert(block_hash_with_group_id,
-                                                   blk)
+            self.cached_block_hash_to_block.insert(block_hash_with_group_id, blk)
             if new_hashes is not None:
                 new_hashes.append(maybe_convert_block_hash(block_hash))
 
         if self.enable_kv_cache_events:
             if num_cached_blocks == 0:
-                parent_block_hash: Optional[ExternalBlockHash] = None
+                parent_block_hash: ExternalBlockHash | None = None
             else:
                 parent_block = blocks[num_cached_blocks - 1]
                 assert parent_block.block_hash is not None
                 parent_block_hash = maybe_convert_block_hash(
-                    get_block_hash(parent_block.block_hash))
+                    get_block_hash(parent_block.block_hash)
+                )
 
             self.kv_event_queue.append(
                 BlockStored(
                     block_hashes=new_hashes,
                     parent_block_hash=parent_block_hash,
-                    token_ids=request.
-                    all_token_ids[num_cached_blocks *
-                                  block_size:num_full_blocks * block_size],
+                    token_ids=request.all_token_ids[
+                        num_cached_blocks * block_size : num_full_blocks * block_size
+                    ],
                     block_size=block_size,
-                    lora_id=request.lora_request.id
-                    if request.lora_request else None,
+                    lora_id=request.lora_request.adapter_id
+                    if request.lora_request
+                    else None,
                     medium=MEDIUM_GPU,
-                ))
+                )
+            )
 
     def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
         """Get new blocks from the free block pool.
@@ -266,8 +278,7 @@ class BlockPool:
             A list of new block.
         """
         if num_blocks > self.get_num_free_blocks():
-            raise ValueError(
-                f"Cannot get {num_blocks} free blocks from the pool")
+            raise ValueError(f"Cannot get {num_blocks} free blocks from the pool")
 
         ret: list[KVCacheBlock] = self.free_block_queue.popleft_n(num_blocks)
 
@@ -299,8 +310,7 @@ class BlockPool:
             # The block doesn't have hash, eviction is not needed
             return False
 
-        if self.cached_block_hash_to_block.pop(block_hash,
-                                               block.block_id) is None:
+        if self.cached_block_hash_to_block.pop(block_hash, block.block_id) is None:
             # block not found in cached_block_hash_to_block,
             # eviction is not needed
             return False
@@ -313,13 +323,14 @@ class BlockPool:
             # we disable hybrid kv cache manager when kv cache event is
             # enabled, so there is only one group.
             self.kv_event_queue.append(
-                BlockRemoved(block_hashes=[
-                    maybe_convert_block_hash(get_block_hash(block_hash))
-                ],
-                             medium=MEDIUM_GPU))
+                BlockRemoved(
+                    block_hashes=[maybe_convert_block_hash(get_block_hash(block_hash))],
+                    medium=MEDIUM_GPU,
+                )
+            )
         return True
 
-    def touch(self, blocks: tuple[list[KVCacheBlock], ...]) -> None:
+    def touch(self, blocks: tuple[Sequence[KVCacheBlock], ...]) -> None:
         """Touch a block increases its reference count by 1, and may remove
         the block from the free queue. This is used when a block is hit by
         another request with the same prefix.
@@ -347,10 +358,9 @@ class BlockPool:
         blocks_list = list(ordered_blocks)
         for block in blocks_list:
             block.ref_cnt -= 1
-        self.free_block_queue.append_n([
-            block for block in blocks_list
-            if block.ref_cnt == 0 and not block.is_null
-        ])
+        self.free_block_queue.append_n(
+            [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null]
+        )
 
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
@@ -365,7 +375,9 @@ class BlockPool:
         if num_used_blocks != 1:  # The null block is always marked as used
             logger.warning(
                 "Failed to reset prefix cache because some "
-                "blocks (%d) are not freed yet", num_used_blocks - 1)
+                "blocks (%d) are not freed yet",
+                num_used_blocks - 1,
+            )
             return False
 
         # Remove all hashes so that no new blocks will hit.
@@ -405,7 +417,7 @@ class BlockPool:
 
     def take_events(self) -> list[KVCacheEvent]:
         """Atomically takes all events and clears the queue.
-        
+
         Returns:
             A list of KV cache events.
         """
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index eadea15a2e5e3fd725be8b2626d9b9af89652eda..3959e9a59a53b24f0ca7db073b217de239e94acc 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -33,12 +33,12 @@ class EncoderCacheManager:
     within requests, allowing for fine-grained memory management and enabling
     chunked processing of multimodal inputs.
 
-    Cache is enabled to share embeddings of same multimodal data 
-    item (identified by their hash value) between different requests, 
-    and eviction takes place at allocation time when there's no free 
+    Cache is enabled to share embeddings of same multimodal data
+    item (identified by their hash value) between different requests,
+    and eviction takes place at allocation time when there's no free
     space for new embeddings.
     Oldest cached embeddings with no request referenced will be first evicted.
-    
+
     Args:
         cache_size: Limit the size of the cache, measured by the number of
                     tokens from the input sequence.
@@ -99,27 +99,31 @@ class EncoderCacheManager:
         self.cached[mm_hash].add(request.request_id)
         return True
 
-    def can_allocate(self, request: Request, input_id: int,
-                     encoder_compute_budget: int,
-                     num_tokens_to_schedule: int) -> bool:
-        """Check if there's sufficient cache space for a multimodal input. 
+    def can_allocate(
+        self,
+        request: Request,
+        input_id: int,
+        encoder_compute_budget: int,
+        num_tokens_to_schedule: int,
+    ) -> bool:
+        """Check if there's sufficient cache space for a multimodal input.
         If there is, return True and update EncoderCacheManager state.
 
         If there is not enough free space in `num_free_slots` but there is
         enough reclaimable space in `num_freeable_slots`, entries will be
         evicted from `freeable` (their mm_hash appended to `freed`) until
-        enough space is available, and then this method returns True. 
+        enough space is available, and then this method returns True.
         Older entries are evicted first.
-        
-        Returns False only if the requested number of tokens exceeds both 
+
+        Returns False only if the requested number of tokens exceeds both
         the free and reclaimable capacities combined.
 
         Args:
             request: The request containing the multimodal input.
             input_id: Index of the multimodal input within the request.
-            encoder_compute_budget: Number of encoder tokens allowed to be 
+            encoder_compute_budget: Number of encoder tokens allowed to be
                 computed when this method is invoked.
-            num_tokens_to_schedule: Number of tokens already scheduled to be 
+            num_tokens_to_schedule: Number of tokens already scheduled to be
                 allocated with cache space when this method is invoked.
 
         Returns:
@@ -127,7 +131,7 @@ class EncoderCacheManager:
             input (possibly after reclaiming `freeable` entries); otherwise
             False.
 
-        Note: This method does not allocate physical memory for the encoder 
+        Note: This method does not allocate physical memory for the encoder
         output but only the state of EncoderCacheManager.
         """
         num_tokens = request.get_num_encoder_tokens(input_id)
@@ -202,7 +206,7 @@ class EncoderCacheManager:
 
         When the reference set for the corresponding `mm_hash` becomes empty,
         the entry is appended to `freeable` and `num_freeable_slots` is
-        increased by the number of encoder tokens for that input. 
+        increased by the number of encoder tokens for that input.
 
         The entry is NOT physically freed until capacity is needed (e.g., by
         `can_allocate`).
@@ -221,8 +225,8 @@ class EncoderCacheManager:
     def free(self, request: Request) -> None:
         """Free all encoder input cache reference held by *request*.
 
-        For each cached input ID, `free_encoder_input` is invoked.  
-        The data stays in memory until eviction is triggered by a future 
+        For each cached input ID, `free_encoder_input` is invoked.
+        The data stays in memory until eviction is triggered by a future
         attempt allocation called by 'can_allocate'.
 
         Typically called when a request is finished, cancelled, or aborted.
@@ -236,9 +240,9 @@ class EncoderCacheManager:
 
         Returns:
             List of mm_hash strings that were actually evicted since the last
-            call to be used by the scheduler to notify workers about which 
-            encoder outputs can be removed from their caches. The internal 
-            list is cleared after this call. 
+            call to be used by the scheduler to notify workers about which
+            encoder outputs can be removed from their caches. The internal
+            list is cleared after this call.
         """
         freed = self.freed
         self.freed = []
@@ -250,7 +254,7 @@ def compute_encoder_budget(
     scheduler_config: "SchedulerConfig",
     mm_registry: MultiModalRegistry,
 ) -> tuple[int, int]:
-    """Compute the encoder cache budget based on the model and scheduler 
+    """Compute the encoder cache budget based on the model and scheduler
     configurations.
 
     Returns:
@@ -260,8 +264,9 @@ def compute_encoder_budget(
             from the input sequence.
     """
     if mm_registry.supports_multimodal_inputs(model_config):
-        max_tokens_by_modality = mm_registry \
-            .get_max_tokens_per_item_by_nonzero_modality(model_config)
+        max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
+            model_config
+        )
 
         return compute_mm_encoder_budget(
             scheduler_config,
@@ -271,18 +276,17 @@ def compute_encoder_budget(
     return compute_text_encoder_budget(scheduler_config)
 
 
-def compute_text_encoder_budget(
-        scheduler_config: "SchedulerConfig") -> tuple[int, int]:
-    """Compute the encoder cache budget based on the model and scheduler 
+def compute_text_encoder_budget(scheduler_config: "SchedulerConfig") -> tuple[int, int]:
+    """Compute the encoder cache budget based on the model and scheduler
     configurations for a text-only model.
 
     Args:
         scheduler_config: Scheduler configuration.
 
     Returns:
-        - Compute budget for encoder execution, in unit of number of tokens 
+        - Compute budget for encoder execution, in unit of number of tokens
             in the input sequence.
-        - Space budget for encoder cache size, in unit of number of tokens 
+        - Space budget for encoder cache size, in unit of number of tokens
             in the input sequence.
     """
     # Currently text-only encoder-decoder models are not supported
@@ -293,7 +297,7 @@ def compute_mm_encoder_budget(
     scheduler_config: "SchedulerConfig",
     max_tokens_by_modality: Mapping[str, int],
 ) -> tuple[int, int]:
-    """Compute the encoder cache budget based on the model and scheduler 
+    """Compute the encoder cache budget based on the model and scheduler
     configurations for a multimodal model.
 
     Args:
@@ -312,22 +316,28 @@ def compute_mm_encoder_budget(
         logger.warning(
             "All non-text modalities supported by the model have been "
             "explicitly disabled via limit_mm_per_prompt. Encoder cache will "
-            "not be initialized.")
+            "not be initialized."
+        )
         return 0, 0
 
     max_tokens_per_mm_item = max(max_tokens_by_modality.values())
 
-    if (scheduler_config.disable_chunked_mm_input and max_tokens_per_mm_item
-            > scheduler_config.max_num_batched_tokens):
+    if (
+        scheduler_config.disable_chunked_mm_input
+        and max_tokens_per_mm_item > scheduler_config.max_num_batched_tokens
+    ):
         raise ValueError(
             "Chunked MM input disabled but max_tokens_per_mm_item "
             f"({max_tokens_per_mm_item}) is larger than max_num_batched_tokens"
             f" ({scheduler_config.max_num_batched_tokens}). Please increase "
-            "max_num_batched_tokens.")
+            "max_num_batched_tokens."
+        )
 
-    encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
-                                 max_tokens_per_mm_item)
-    encoder_cache_size = max(scheduler_config.encoder_cache_size,
-                             max_tokens_per_mm_item)
+    encoder_compute_budget = max(
+        scheduler_config.max_num_encoder_input_tokens, max_tokens_per_mm_item
+    )
+    encoder_cache_size = max(
+        scheduler_config.encoder_cache_size, max_tokens_per_mm_item
+    )
 
     return encoder_compute_budget, encoder_cache_size
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 86771060c409928edf6ed110465aef8e0ac8db94..137e5e0cdb6d24db46bd239cedfca6a2d7d704e8 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -1,14 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Optional
+from collections.abc import Sequence
 
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.core.single_type_kv_cache_manager import (
-    CrossAttentionManager, FullAttentionManager, get_manager_for_kv_cache_spec)
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheSpec)
+    CrossAttentionManager,
+    FullAttentionManager,
+    get_manager_for_kv_cache_spec,
+)
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheSpec
 from vllm.v1.request import Request
 
 
@@ -30,8 +32,9 @@ class KVCacheCoordinator(ABC):
         self.max_model_len = max_model_len
         self.enable_caching = enable_caching
 
-        self.block_pool = BlockPool(kv_cache_config.num_blocks, enable_caching,
-                                    enable_kv_cache_events)
+        self.block_pool = BlockPool(
+            kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events
+        )
 
         # Needs special handling for find_longest_cache_hit if eagle is enabled
         self.use_eagle = use_eagle
@@ -41,19 +44,23 @@ class KVCacheCoordinator(ABC):
                 block_pool=self.block_pool,
                 kv_cache_group_id=i,
                 dcp_world_size=dcp_world_size,
-            ) for i, kv_cache_group in enumerate(
-                self.kv_cache_config.kv_cache_groups))
+            )
+            for i, kv_cache_group in enumerate(self.kv_cache_config.kv_cache_groups)
+        )
 
-    def get_num_blocks_to_allocate(self, request_id: str, num_tokens: int,
-                                   new_computed_blocks: tuple[
-                                       list[KVCacheBlock], ...],
-                                   num_encoder_tokens: int) -> int:
+    def get_num_blocks_to_allocate(
+        self,
+        request_id: str,
+        num_tokens: int,
+        new_computed_blocks: tuple[Sequence[KVCacheBlock], ...],
+        num_encoder_tokens: int,
+    ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
 
         Args:
             request_id: The request ID.
-            num_tokens: The total number of tokens that need a slot (including 
+            num_tokens: The total number of tokens that need a slot (including
                 tokens that are already allocated).
             new_computed_blocks: The new computed blocks just hitting the
                 prefix caching.
@@ -69,15 +76,17 @@ class KVCacheCoordinator(ABC):
                 # For cross-attention, we issue a single static allocation
                 # of blocks based on the number of encoder input tokens.
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_encoder_tokens, [])
+                    request_id, num_encoder_tokens, []
+                )
             else:
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_tokens, new_computed_blocks[i])
+                    request_id, num_tokens, new_computed_blocks[i]
+                )
         return num_blocks_to_allocate
 
     def save_new_computed_blocks(
-            self, request_id: str,
-            new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> None:
+        self, request_id: str, new_computed_blocks: tuple[Sequence[KVCacheBlock], ...]
+    ) -> None:
         """
         Add the new computed blocks to the request.
 
@@ -87,21 +96,18 @@ class KVCacheCoordinator(ABC):
                 prefix cache.
         """
         for i, manager in enumerate(self.single_type_managers):
-            manager.save_new_computed_blocks(request_id,
-                                             new_computed_blocks[i])
+            manager.save_new_computed_blocks(request_id, new_computed_blocks[i])
 
     def allocate_new_blocks(
-            self,
-            request_id: str,
-            num_tokens: int,
-            num_encoder_tokens: int = 0) -> tuple[list[KVCacheBlock], ...]:
+        self, request_id: str, num_tokens: int, num_encoder_tokens: int = 0
+    ) -> tuple[list[KVCacheBlock], ...]:
         """
-        Allocate new blocks for the request to give it at least `num_tokens` 
+        Allocate new blocks for the request to give it at least `num_tokens`
         token slots.
 
         Args:
             request_id: The request ID.
-            num_tokens: The total number of tokens that need a slot (including 
+            num_tokens: The total number of tokens that need a slot (including
                 tokens that are already allocated).
             num_encoder_tokens: The number of encoder tokens for allocating
                 blocks for cross-attention.
@@ -111,9 +117,13 @@ class KVCacheCoordinator(ABC):
         """
         return tuple(
             manager.allocate_new_blocks(
-                request_id, num_encoder_tokens if isinstance(
-                    manager, CrossAttentionManager) else num_tokens)
-            for manager in self.single_type_managers)
+                request_id,
+                num_encoder_tokens
+                if isinstance(manager, CrossAttentionManager)
+                else num_tokens,
+            )
+            for manager in self.single_type_managers
+        )
 
     def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
         """
@@ -138,32 +148,26 @@ class KVCacheCoordinator(ABC):
         for manager in self.single_type_managers:
             manager.free(request_id)
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> list[int]:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> list[int]:
         """
-        Get the number of common prefix blocks for all requests in the RUNNING
-        state for each kv cache group.
+        Get the number of common prefix blocks for all requests with allocated
+        KV cache for each kv cache group.
 
         Args:
-            request_id: The request ID.
-            num_running_requests: The total number of requests in the RUNNING
-                state.
+            running_request_id: The request ID of any running request, used to
+                identify the common prefix blocks.
 
         Returns:
-            list[int]: The number of common prefix blocks for all requests in
-                the RUNNING state for each kv cache group.
+            list[int]: The number of common prefix blocks for each kv cache group.
         """
-        num_blocks_per_group = [
-            manager.get_num_common_prefix_blocks(request_id,
-                                                 num_running_requests)
+        return [
+            manager.get_num_common_prefix_blocks(running_request_id)
             for manager in self.single_type_managers
         ]
-        return num_blocks_per_group
 
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
+    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         """
-        Remove the blocks that are no longer needed from `blocks` and replace 
+        Remove the blocks that are no longer needed from `blocks` and replace
         the removed blocks with null_block.
 
         Args:
@@ -179,7 +183,8 @@ class KVCacheCoordinator(ABC):
         """
         return tuple(
             manager.req_to_blocks.get(request_id) or []
-            for manager in self.single_type_managers)
+            for manager in self.single_type_managers
+        )
 
     @abstractmethod
     def find_longest_cache_hit(
@@ -198,19 +203,25 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
     Does not implement any features related to prefix caching.
     """
 
-    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
-                 use_eagle: bool, enable_kv_cache_events: bool,
-                 dcp_world_size: int):
-        super().__init__(kv_cache_config,
-                         max_model_len,
-                         use_eagle,
-                         False,
-                         enable_kv_cache_events,
-                         dcp_world_size=dcp_world_size)
+    def __init__(
+        self,
+        kv_cache_config: KVCacheConfig,
+        max_model_len: int,
+        use_eagle: bool,
+        enable_kv_cache_events: bool,
+        dcp_world_size: int,
+    ):
+        super().__init__(
+            kv_cache_config,
+            max_model_len,
+            use_eagle,
+            False,
+            enable_kv_cache_events,
+            dcp_world_size=dcp_world_size,
+        )
         self.num_single_type_manager = len(self.single_type_managers)
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> list[int]:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> list[int]:
         return [0] * self.num_single_type_manager
 
     def find_longest_cache_hit(
@@ -219,7 +230,8 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
         max_cache_hit_length: int,
     ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
         blocks: tuple[list[KVCacheBlock], ...] = tuple(
-            [] for _ in range(self.num_single_type_manager))
+            [] for _ in range(self.num_single_type_manager)
+        )
         return blocks, 0
 
 
@@ -230,23 +242,31 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
     full attention or all attention layers use sliding window attention.
     """
 
-    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
-                 use_eagle: bool, enable_caching: bool,
-                 enable_kv_cache_events: bool, dcp_world_size: int):
-        super().__init__(kv_cache_config,
-                         max_model_len,
-                         use_eagle,
-                         enable_caching,
-                         enable_kv_cache_events,
-                         dcp_world_size=dcp_world_size)
-        self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[
-            0].kv_cache_spec
+    def __init__(
+        self,
+        kv_cache_config: KVCacheConfig,
+        max_model_len: int,
+        use_eagle: bool,
+        enable_caching: bool,
+        enable_kv_cache_events: bool,
+        dcp_world_size: int,
+    ):
+        super().__init__(
+            kv_cache_config,
+            max_model_len,
+            use_eagle,
+            enable_caching,
+            enable_kv_cache_events,
+            dcp_world_size=dcp_world_size,
+        )
+        self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec
         self.block_size = self.kv_cache_spec.block_size
         self.dcp_world_size = dcp_world_size
         if dcp_world_size > 1:
             self.block_size *= dcp_world_size
         assert len(self.kv_cache_config.kv_cache_groups) == 1, (
-            "UnitaryKVCacheCoordinator assumes only one kv cache group")
+            "UnitaryKVCacheCoordinator assumes only one kv cache group"
+        )
 
     def find_longest_cache_hit(
         self,
@@ -269,31 +289,39 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
     """
     KV cache coordinator for hybrid models with multiple KV cache types, and
     thus multiple kv cache groups.
-    To simplify `find_longest_cache_hit`, it only supports the combination of 
+    To simplify `find_longest_cache_hit`, it only supports the combination of
     two types of KV cache groups, and one of them must be full attention.
     May extend to more general cases in the future.
     """
 
-    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
-                 use_eagle: bool, enable_caching: bool,
-                 enable_kv_cache_events: bool, dcp_world_size: int):
-        super().__init__(kv_cache_config,
-                         max_model_len,
-                         use_eagle,
-                         enable_caching,
-                         enable_kv_cache_events,
-                         dcp_world_size=dcp_world_size)
+    def __init__(
+        self,
+        kv_cache_config: KVCacheConfig,
+        max_model_len: int,
+        use_eagle: bool,
+        enable_caching: bool,
+        enable_kv_cache_events: bool,
+        dcp_world_size: int,
+    ):
+        super().__init__(
+            kv_cache_config,
+            max_model_len,
+            use_eagle,
+            enable_caching,
+            enable_kv_cache_events,
+            dcp_world_size=dcp_world_size,
+        )
         assert dcp_world_size == 1, "DCP not support hybrid attn now."
         self.verify_and_split_kv_cache_groups()
 
     def verify_and_split_kv_cache_groups(self) -> None:
         """
-        Verifies that the model has exactly two types of KV cache groups, and 
+        Verifies that the model has exactly two types of KV cache groups, and
         one of them is full attention. Then, split the kv cache groups into full
         attention groups and other groups.
         """
-        full_attention_spec: Optional[FullAttentionSpec] = None
-        other_spec: Optional[KVCacheSpec] = None
+        full_attention_spec: FullAttentionSpec | None = None
+        other_spec: KVCacheSpec | None = None
         self.full_attention_group_ids: list[int] = []
         self.other_group_ids: list[int] = []
         for i, g in enumerate(self.kv_cache_config.kv_cache_groups):
@@ -303,7 +331,8 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
                 else:
                     assert full_attention_spec == g.kv_cache_spec, (
                         "HybridKVCacheCoordinator assumes exactly one type of "
-                        "full attention groups now.")
+                        "full attention groups now."
+                    )
                 self.full_attention_group_ids.append(i)
             else:
                 if other_spec is None:
@@ -311,19 +340,22 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
                 else:
                     assert other_spec == g.kv_cache_spec, (
                         "HybridKVCacheCoordinator assumes "
-                        "exactly one other type of groups now.")
+                        "exactly one other type of groups now."
+                    )
                 self.other_group_ids.append(i)
 
         assert full_attention_spec is not None, (
             "HybridKVCacheCoordinator assumes exactly one type of full "
-            "attention groups now.")
+            "attention groups now."
+        )
         assert other_spec is not None, (
-            "HybridKVCacheCoordinator assumes exactly one type of other "
-            "groups now.")
+            "HybridKVCacheCoordinator assumes exactly one type of other groups now."
+        )
 
         self.full_attention_manager_cls = FullAttentionManager
         self.other_attention_cls = self.single_type_managers[
-            self.other_group_ids[0]].__class__
+            self.other_group_ids[0]
+        ].__class__
         self.full_attention_spec = full_attention_spec
         self.other_spec = other_spec
         self.full_attention_block_size = self.full_attention_spec.block_size
@@ -334,7 +366,8 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
             divisible = self.other_block_size % self.full_attention_block_size
             assert divisible == 0, (
                 "KVCacheCoordinator assumes the block_size of full "
-                "attention layers is divisible by other layers now.")
+                "attention layers is divisible by other layers now."
+            )
 
         if max(self.full_attention_group_ids) < min(self.other_group_ids):
             self.full_attn_first = True
@@ -347,7 +380,8 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
                 "do not interleave, either full attention group ids "
                 "are before other attention group ids or vice versa."
                 "This is for simplifying merging hit_blocks_full_attn and "
-                "hit_blocks_other_attn to hit_blocks.")
+                "hit_blocks_other_attn to hit_blocks."
+            )
 
     def find_longest_cache_hit(
         self,
@@ -367,29 +401,26 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
                 - The number of tokens of the longest cache hit.
         """
         # First, find the longest cache hit for full attention.
-        hit_blocks_full_attn = (
-            self.full_attention_manager_cls.find_longest_cache_hit(
-                block_hashes=block_hashes,
-                max_length=max_cache_hit_length,
-                kv_cache_group_ids=self.full_attention_group_ids,
-                block_pool=self.block_pool,
-                kv_cache_spec=self.full_attention_spec,
-                use_eagle=self.use_eagle,
-            ))
-        hit_length = len(
-            hit_blocks_full_attn[0]) * self.full_attention_block_size
+        hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit(
+            block_hashes=block_hashes,
+            max_length=max_cache_hit_length,
+            kv_cache_group_ids=self.full_attention_group_ids,
+            block_pool=self.block_pool,
+            kv_cache_spec=self.full_attention_spec,
+            use_eagle=self.use_eagle,
+        )
+        hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size
 
         # Next, find the cache hit for the other attention WITHIN
         # the cache hit of full attention.
-        hit_blocks_other_attn = (
-            self.other_attention_cls.find_longest_cache_hit(
-                block_hashes=block_hashes,
-                max_length=hit_length,
-                kv_cache_group_ids=self.other_group_ids,
-                block_pool=self.block_pool,
-                kv_cache_spec=self.other_spec,
-                use_eagle=self.use_eagle,
-            ))
+        hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit(
+            block_hashes=block_hashes,
+            max_length=hit_length,
+            kv_cache_group_ids=self.other_group_ids,
+            block_pool=self.block_pool,
+            kv_cache_spec=self.other_spec,
+            use_eagle=self.use_eagle,
+        )
         hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
 
         # NOTE: the prefix cache hit length must be a multiple of block_size as
@@ -404,7 +435,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
         # Truncate the full attention cache hit to the length of the
         # cache hit of the other attention.
         for group_hit_blocks in hit_blocks_full_attn:
-            del group_hit_blocks[hit_length // self.full_attention_block_size:]
+            del group_hit_blocks[hit_length // self.full_attention_block_size :]
 
         # Merge the hit blocks of full attention and other attention.
         if self.full_attn_first:
@@ -414,27 +445,36 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
         return hit_blocks, hit_length
 
 
-def get_kv_cache_coordinator(kv_cache_config: KVCacheConfig,
-                             max_model_len: int, use_eagle: bool,
-                             enable_caching: bool,
-                             enable_kv_cache_events: bool,
-                             dcp_world_size: int) -> KVCacheCoordinator:
+def get_kv_cache_coordinator(
+    kv_cache_config: KVCacheConfig,
+    max_model_len: int,
+    use_eagle: bool,
+    enable_caching: bool,
+    enable_kv_cache_events: bool,
+    dcp_world_size: int,
+) -> KVCacheCoordinator:
     if not enable_caching:
-        return KVCacheCoordinatorNoPrefixCache(kv_cache_config,
-                                               max_model_len,
-                                               use_eagle,
-                                               enable_kv_cache_events,
-                                               dcp_world_size=dcp_world_size)
+        return KVCacheCoordinatorNoPrefixCache(
+            kv_cache_config,
+            max_model_len,
+            use_eagle,
+            enable_kv_cache_events,
+            dcp_world_size=dcp_world_size,
+        )
     if len(kv_cache_config.kv_cache_groups) == 1:
-        return UnitaryKVCacheCoordinator(kv_cache_config,
-                                         max_model_len,
-                                         use_eagle,
-                                         enable_caching,
-                                         enable_kv_cache_events,
-                                         dcp_world_size=dcp_world_size)
-    return HybridKVCacheCoordinator(kv_cache_config,
-                                    max_model_len,
-                                    use_eagle,
-                                    enable_caching,
-                                    enable_kv_cache_events,
-                                    dcp_world_size=dcp_world_size)
+        return UnitaryKVCacheCoordinator(
+            kv_cache_config,
+            max_model_len,
+            use_eagle,
+            enable_caching,
+            enable_kv_cache_events,
+            dcp_world_size=dcp_world_size,
+        )
+    return HybridKVCacheCoordinator(
+        kv_cache_config,
+        max_model_len,
+        use_eagle,
+        enable_caching,
+        enable_kv_cache_events,
+        dcp_world_size=dcp_world_size,
+    )
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 401327f727a4a0d0b23b8742df3d23d42d8bfa97..7f405fc248ac28a1d83a5b4e6478bb7de4ce2977 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import itertools
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Literal, Optional, overload
+from typing import Literal, overload
 
 from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
@@ -10,7 +12,7 @@ from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
 from vllm.v1.core.kv_cache_utils import KVCacheBlock
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats
-from vllm.v1.request import Request, RequestStatus
+from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 
@@ -22,40 +24,47 @@ class KVCacheBlocks:
     Scheduler and KVCacheManager, to hide KVCacheManager's internal data
     structure from the Scheduler.
     """
-    blocks: tuple[list[KVCacheBlock], ...]
+
+    blocks: tuple[Sequence[KVCacheBlock], ...]
     """
     `blocks[i][j]` refers to the i-th kv_cache_group
     and the j-th block of tokens.We don't use block of
     tokens as the outer dimension because it assumes all
-    kv_cache_groups have the same number of blocks, which is true for now but 
-    will be broken if we want to give different block_size to different 
+    kv_cache_groups have the same number of blocks, which is true for now but
+    will be broken if we want to give different block_size to different
     kv_cache_groups in the future.
+
+    Each single type KVCacheBlocks could be represented as:
+    - list[KVCacheBlock] for more than one KVCacheBlock
+    - an empty tuple for requests without KVCacheBlock
+      (a precomputed KVCacheBlocks is in KVCacheManager to avoid GC overhead)
     """
 
     def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks":
         """Adds two KVCacheBlocks instances."""
         return KVCacheBlocks(
-            tuple(blk1 + blk2
-                  for blk1, blk2 in zip(self.blocks, other.blocks)))
+            tuple(
+                list(itertools.chain(blk1, blk2))
+                for blk1, blk2 in zip(self.blocks, other.blocks)
+            )
+        )
 
     @overload
     def get_block_ids(
         self,
         allow_none: Literal[False] = False,
-    ) -> tuple[list[int], ...]:
-        ...
+    ) -> tuple[list[int], ...]: ...
 
     @overload
     def get_block_ids(
         self,
         allow_none: Literal[True] = True,
-    ) -> Optional[tuple[list[int], ...]]:
-        ...
+    ) -> tuple[list[int], ...] | None: ...
 
     def get_block_ids(
         self,
         allow_none: bool = False,
-    ) -> Optional[tuple[list[int], ...]]:
+    ) -> tuple[list[int], ...] | None:
         """
         Converts the KVCacheBlocks instance to block_ids.
 
@@ -72,18 +81,16 @@ class KVCacheBlocks:
     def get_unhashed_block_ids(self) -> list[int]:
         """Get block_ids of unhashed blocks from KVCacheBlocks instance."""
         assert len(self.blocks) == 1, "Only one group is supported"
-        return [
-            block.block_id for block in self.blocks[0]
-            if block.block_hash is None
-        ]
+        return [block.block_id for block in self.blocks[0] if block.block_hash is None]
 
     def new_empty(self) -> "KVCacheBlocks":
-        """Creates a new KVCacheBlocks instance with no blocks."""
-        return KVCacheBlocks(tuple([] for _ in range(len(self.blocks))))
+        """
+        Creates a new KVCacheBlocks instance with no blocks.
+        """
+        return KVCacheBlocks(tuple(() for _ in range(len(self.blocks))))
 
 
 class KVCacheManager:
-
     def __init__(
         self,
         kv_cache_config: KVCacheConfig,
@@ -102,14 +109,20 @@ class KVCacheManager:
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
-        self.block_size: Optional[int] = None
+        self.block_size: int | None = None
         if self.enable_caching:
-            assert len(
-                set(g.kv_cache_spec.block_size
-                    for g in kv_cache_config.kv_cache_groups)
-            ) == 1, "Only one block size is supported for now"
+            assert (
+                len(
+                    set(
+                        g.kv_cache_spec.block_size
+                        for g in kv_cache_config.kv_cache_groups
+                    )
+                )
+                == 1
+            ), "Only one block size is supported for now"
             self.block_size = kv_cache_config.kv_cache_groups[
-                0].kv_cache_spec.block_size
+                0
+            ].kv_cache_spec.block_size
 
             if dcp_world_size > 1:
                 assert len(kv_cache_config.kv_cache_groups) == 1
@@ -130,6 +143,15 @@ class KVCacheManager:
         self.block_pool = self.coordinator.block_pool
         self.kv_cache_config = kv_cache_config
 
+        # Pre-constructed KVCacheBlocks with no blocks, callers should use this
+        # via create_kv_cache_blocks instead of creating new ones to avoid GC
+        # overhead.
+        #
+        # We use nested tuples to ensure the empty KVCacheBlocks is immutable.
+        self.empty_kv_cache_blocks = KVCacheBlocks(
+            tuple(() for _ in range(self.num_kv_cache_groups))
+        )
+
     @property
     def usage(self) -> float:
         """Get the KV cache usage.
@@ -139,7 +161,7 @@ class KVCacheManager:
         """
         return self.block_pool.get_usage()
 
-    def make_prefix_cache_stats(self) -> Optional[PrefixCacheStats]:
+    def make_prefix_cache_stats(self) -> PrefixCacheStats | None:
         """Get (and reset) the prefix cache stats.
 
         Returns:
@@ -151,8 +173,7 @@ class KVCacheManager:
         self.prefix_cache_stats = PrefixCacheStats()
         return stats
 
-    def get_computed_blocks(self,
-                            request: Request) -> tuple[KVCacheBlocks, int]:
+    def get_computed_blocks(self, request: Request) -> tuple[KVCacheBlocks, int]:
         """Get the computed (cached) blocks for the request.
         Note that the computed blocks must be full.
 
@@ -164,12 +185,12 @@ class KVCacheManager:
                 - A list of blocks that are computed for the request.
                 - The number of computed tokens.
         """
-        # Prefix caching is disabled or
-        # When the request requires prompt logprobs, we skip prefix caching.
-        if (not self.enable_caching
-                or (request.sampling_params is not None
-                    and request.sampling_params.prompt_logprobs is not None)):
-            return self.create_empty_block_list(), 0
+        # We skip finding the prefix cache hit when prefix caching is
+        # disabled or the request is marked as skipping kv cache read
+        # (which happens when the request requires prompt logprobs
+        # or calls a pooling model with all pooling).
+        if not self.enable_caching or request.skip_reading_prefix_cache:
+            return self.empty_kv_cache_blocks, 0
 
         # NOTE: When all tokens hit the cache, we must recompute the last token
         # to obtain logits. Thus, set max_cache_hit_length to prompt_length - 1.
@@ -179,27 +200,31 @@ class KVCacheManager:
         # could slightly improve performance in the future.
         max_cache_hit_length = request.num_tokens - 1
         computed_blocks, num_new_computed_tokens = (
-            self.coordinator.find_longest_cache_hit(request.block_hashes,
-                                                    max_cache_hit_length))
+            self.coordinator.find_longest_cache_hit(
+                request.block_hashes, max_cache_hit_length
+            )
+        )
 
         if self.log_stats:
             assert self.prefix_cache_stats is not None
-            self.prefix_cache_stats.requests += 1
-            self.prefix_cache_stats.queries += request.num_tokens
-            self.prefix_cache_stats.hits += num_new_computed_tokens
+            self.prefix_cache_stats.record(
+                num_tokens=request.num_tokens,
+                num_hits=num_new_computed_tokens,
+                preempted=request.num_preemptions > 0,
+            )
 
-        return KVCacheBlocks(computed_blocks), num_new_computed_tokens
+        return self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens
 
     def allocate_slots(
         self,
         request: Request,
         num_new_tokens: int,
         num_new_computed_tokens: int = 0,
-        new_computed_blocks: Optional[KVCacheBlocks] = None,
+        new_computed_blocks: KVCacheBlocks | None = None,
         num_lookahead_tokens: int = 0,
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
-    ) -> Optional[KVCacheBlocks]:
+    ) -> KVCacheBlocks | None:
         """Add slots for a request with new tokens to append.
 
         Args:
@@ -209,10 +234,10 @@ class KVCacheManager:
                 already been computed locally (i.e. new_computed_blocks).
             num_new_computed_tokens: The number of new computed tokens just
                 hitting the prefix caching, excluding external tokens.
-            new_computed_blocks: The cached blocks for the above new computed 
+            new_computed_blocks: The cached blocks for the above new computed
                 tokens.
             num_lookahead_tokens: The number of speculative tokens to allocate.
-                This is used by spec decode proposers with kv-cache such 
+                This is used by spec decode proposers with kv-cache such
                 as eagle.
             delay_cache_blocks: Whether to skip caching the blocks. This is
                 used by P/D when allocating blocks used in a KV transfer
@@ -241,8 +266,7 @@ class KVCacheManager:
         if new_computed_blocks is not None:
             new_computed_block_list = new_computed_blocks.blocks
         else:
-            new_computed_block_list = tuple(
-                [] for _ in range(len(self.kv_cache_config.kv_cache_groups)))
+            new_computed_block_list = self.empty_kv_cache_blocks.blocks
 
         # Free the blocks that are skipped during the attention computation
         # (e.g., tokens outside the sliding window).
@@ -250,16 +274,17 @@ class KVCacheManager:
         # insufficient free blocks.
         # Should call this function before allocating new blocks to reduce
         # the number of evicted blocks.
-        self.coordinator.remove_skipped_blocks(request.request_id,
-                                               request.num_computed_tokens)
+        self.coordinator.remove_skipped_blocks(
+            request.request_id, request.num_computed_tokens
+        )
 
         # The number of computed tokens is the number of computed tokens plus
         # the new prefix caching hits
-        num_computed_tokens = (request.num_computed_tokens +
-                               num_new_computed_tokens)
+        num_computed_tokens = request.num_computed_tokens + num_new_computed_tokens
         num_tokens_need_slot = min(
             num_computed_tokens + num_new_tokens + num_lookahead_tokens,
-            self.max_model_len)
+            self.max_model_len,
+        )
 
         num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
             request_id=request.request_id,
@@ -277,31 +302,35 @@ class KVCacheManager:
             self.block_pool.touch(new_computed_block_list)
         else:
             assert not any(new_computed_block_list), (
-                "Computed blocks should be empty when "
-                "prefix caching is disabled")
+                "Computed blocks should be empty when prefix caching is disabled"
+            )
 
-        # Append the new computed blocks to the request blocks until now to
-        # avoid the case where the new blocks cannot be allocated.
-        self.coordinator.save_new_computed_blocks(request.request_id,
-                                                  new_computed_block_list)
+        if new_computed_block_list is not self.empty_kv_cache_blocks.blocks:
+            # Append the new computed blocks to the request blocks until now to
+            # avoid the case where the new blocks cannot be allocated.
+            self.coordinator.save_new_computed_blocks(
+                request.request_id, new_computed_block_list
+            )
 
         new_blocks = self.coordinator.allocate_new_blocks(
-            request.request_id, num_tokens_need_slot, num_encoder_tokens)
+            request.request_id, num_tokens_need_slot, num_encoder_tokens
+        )
 
         # P/D: delay caching blocks if we have to recv from
         # remote. Update state for locally cached blocks.
         if not self.enable_caching or delay_cache_blocks:
-            return KVCacheBlocks(new_blocks)
+            return self.create_kv_cache_blocks(new_blocks)
 
         # NOTE(woosuk): We want to commit (cache) up to num_computed_tokens +
         # num_new_tokens, but must exclude "non-committable" tokens (e.g.,
         # draft tokens that could be rejected). Therefore, we cap the number
         # at `request.num_tokens`, ensuring only "finalized" tokens are cached.
-        num_tokens_to_cache = min(num_computed_tokens + num_new_tokens,
-                                  request.num_tokens)
+        num_tokens_to_cache = min(
+            num_computed_tokens + num_new_tokens, request.num_tokens
+        )
         self.coordinator.cache_blocks(request, num_tokens_to_cache)
 
-        return KVCacheBlocks(new_blocks)
+        return self.create_kv_cache_blocks(new_blocks)
 
     def free(self, request: Request) -> None:
         """Free the blocks allocated for the request.
@@ -329,48 +358,39 @@ class KVCacheManager:
             self.prefix_cache_stats.reset = True
         return True
 
-    def get_num_common_prefix_blocks(
-        self,
-        request: Request,
-        num_running_requests: int,
-    ) -> list[int]:
-        """Calculate the number of common prefix blocks shared by all requests
-        in the RUNNING state for each kv cache group.
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> list[int]:
+        """Calculate the number of common prefix blocks for each kv cache group.
 
-        The function determines this by selecting any request and iterating
-        through its blocks.  A block is considered a common prefix block if its
-        `ref_cnt` equals the total number of requests in the RUNNING state.
+        The function selects a running request and iterates through its blocks.
+        A block is considered a common prefix block if ALL requests with
+        allocated KV cache share it (i.e., ref_cnt equals the number of entries
+        in req_to_blocks).
 
-        NOTE(woosuk): The number of requests in the RUNNING state is **greater
+        NOTE(woosuk): The number of requests with allocated KV cache is **greater
         than or equal to** the number of requests scheduled in the current step.
-        This is because the RUNNING state only indicates that:
+        This is because having allocated KV cache only indicates that:
         1. The request has not yet finished, and
         2. The request holds its blocks unfreed.
 
-        While all scheduled requests must be in the RUNNING state, the inverse
-        is not necessarily true. There may be RUNNING requests that are not
-        scheduled in the current step.
+        While all scheduled requests must have allocated KV cache, the inverse
+        is not necessarily true. There may be requests with allocated KV cache
+        that are not scheduled in the current step.
 
         This can result in an edge case where the number of common prefix blocks
         is 0, even though all scheduled requests share a common prefix. This
-        occurs because there may be unscheduled RUNNING requests that do not
-        share the common prefix. Currently, this case cannot be easily detected,
-        so the function returns 0 in such cases.
+        occurs because there may be unscheduled requests that do not share the
+        common prefix. Currently, this case cannot be easily detected, so the
+        function returns 0 in such cases.
 
         Args:
-            request: Any request in the RUNNING state, used to identify the
-                common prefix blocks.
-            num_running_requests: The total number of requests in the RUNNING
-                state. This can be different from the number of scheduled
-                requests in the current step.
+            running_request_id: The request ID of any running request, used to
+                identify the common prefix blocks.
 
         Returns:
-            list[int]: The number of common prefix blocks for each kv cache 
+            list[int]: The number of common prefix blocks for each kv cache
             group.
         """
-        assert request.status == RequestStatus.RUNNING
-        return self.coordinator.get_num_common_prefix_blocks(
-            request.request_id, num_running_requests)
+        return self.coordinator.get_num_common_prefix_blocks(running_request_id)
 
     def take_events(self) -> list[KVCacheEvent]:
         """Take the KV cache events from the block pool.
@@ -382,7 +402,7 @@ class KVCacheManager:
 
     def get_blocks(self, request_id: str) -> KVCacheBlocks:
         """Get the blocks of a request."""
-        return KVCacheBlocks(self.coordinator.get_blocks(request_id))
+        return self.create_kv_cache_blocks(self.coordinator.get_blocks(request_id))
 
     def get_block_ids(self, request_id: str) -> tuple[list[int], ...]:
         """Get the block ids of a request."""
@@ -393,7 +413,8 @@ class KVCacheManager:
         if self.enable_caching:
             self.coordinator.cache_blocks(request, num_computed_tokens)
 
-    def create_empty_block_list(self) -> KVCacheBlocks:
-        """Creates a new KVCacheBlocks instance with no blocks."""
-        return KVCacheBlocks(tuple([]
-                                   for _ in range(self.num_kv_cache_groups)))
+    def create_kv_cache_blocks(
+        self, blocks: tuple[list[KVCacheBlock], ...]
+    ) -> KVCacheBlocks:
+        # Only create new KVCacheBlocks for non-empty blocks
+        return KVCacheBlocks(blocks) if any(blocks) else self.empty_kv_cache_blocks
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 2ff1bb681d80917fdff0de12f60025750614a63c..6e026215d4022500fb23009374c54b7ee3e92c28 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -4,58 +4,65 @@
 
 import copy
 import os
-from collections import defaultdict, deque
-from collections.abc import Iterable, Sequence
+from collections import defaultdict
+from collections.abc import Callable, Iterable, Sequence
 from dataclasses import dataclass
-from typing import Any, Callable, NewType, Optional, Union
+from typing import Any, NewType, TypeAlias
 
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, cdiv, sha256_cbor
-from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
-                                        FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheSpec,
-                                        KVCacheTensor, SlidingWindowSpec,
-                                        UniformTypeKVCacheSpecs)
-from vllm.v1.metrics.stats import PrefixCacheStats
+from vllm.utils.hashing import sha256_cbor
+from vllm.utils.math_utils import cdiv
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.v1.kv_cache_interface import (
+    ChunkedLocalAttentionSpec,
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+    KVCacheSpec,
+    KVCacheTensor,
+    SlidingWindowSpec,
+    UniformTypeKVCacheSpecs,
+)
 from vllm.v1.request import Request
+from vllm.v1.utils import tensor_data
 
 # BlockHash represents the hash of a single KV-cache block used for
-# prefix caching.  Treating it as a distinct type from ``bytes`` helps
+# prefix caching.  Treating it as a distinct type from `bytes` helps
 # catch accidental misuse when passing around raw byte strings.
 BlockHash = NewType("BlockHash", bytes)
 
-# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID.
+# `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
 # It is represented as raw bytes for compactness and efficiency. The helper
-# functions below pack/unpack the ``BlockHash`` and group id into/from the key.
+# functions below pack/unpack the `BlockHash` and group id into/from the key.
 BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
 
 # ExternalBlockHash is used for reproducible prefix-cache block hashing.
-# It's a union of ``bytes`` and ``int`` to keep backward compatibility
+# It's a union of `bytes` and `int` to keep backward compatibility
 # after we default block hashing to use sha256 bytes.
-ExternalBlockHash = Union[bytes, int]
+ExternalBlockHash: TypeAlias = bytes | int
 
 
-def make_block_hash_with_group_id(block_hash: BlockHash,
-                                  group_id: int) -> BlockHashWithGroupId:
-    """Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``.
+def make_block_hash_with_group_id(
+    block_hash: BlockHash, group_id: int
+) -> BlockHashWithGroupId:
+    """Pack a `BlockHash` and group id into a `BlockHashWithGroupId`.
 
     The group id is encoded using 4 bytes in big-endian order and appended to
     the block hash bytes.  This representation avoids creating tuples while
     still allowing us to recover both components when needed.
     """
-    return BlockHashWithGroupId(block_hash +
-                                group_id.to_bytes(4, "big", signed=False))
+    return BlockHashWithGroupId(block_hash + group_id.to_bytes(4, "big", signed=False))
 
 
 def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
-    """Extract the ``BlockHash`` from a ``BlockHashWithGroupId``."""
+    """Extract the `BlockHash` from a `BlockHashWithGroupId`."""
     return BlockHash(key[:-4])
 
 
 def get_group_id(key: BlockHashWithGroupId) -> int:
-    """Extract the group id from a ``BlockHashWithGroupId``."""
+    """Extract the group id from a `BlockHashWithGroupId`."""
     return int.from_bytes(key[-4:], "big", signed=False)
 
 
@@ -87,7 +94,8 @@ def init_none_hash(hash_fn: Callable[[Any], bytes]):
             "PYTHONHASHSEED is not set. This will lead to non-reproducible "
             "block-hashes when using sha256_cbor as the hash function."
             "Consider setting PYTHONHASHSEED to a fixed value for "
-            "reproducibility.")
+            "reproducibility."
+        )
 
     if hash_seed is None:
         NONE_HASH = BlockHash(os.urandom(32))
@@ -95,104 +103,35 @@ def init_none_hash(hash_fn: Callable[[Any], bytes]):
         NONE_HASH = BlockHash(hash_fn(hash_seed))
 
 
-class PrefixCachingMetrics:
-    """Metrics for prefix caching with a hit rate of the max recent N requests.
-
-    Args:
-        max_recent_requests: The number of the max recent requests to aggregate.
-            Defaults to 1000.
-    """
-
-    def __init__(self, max_recent_requests: int = 1000):
-        self.max_recent_requests = max_recent_requests
-        # The current aggregated values.
-        self.aggregated_requests = 0
-        self.aggregated_query_total = 0
-        self.aggregated_query_hit = 0
-        # A deque of (requests, queries, hits) for the most recent requests.
-        self.query_queue: deque[tuple[int, int, int]] = deque()
-
-    def observe(self, stats: PrefixCacheStats):
-        """Observe the prefix caching for a set of requests.
-
-        This function is called with information gathered when new requests
-        are being scheduled and are looking for computed blocks.
-
-        When there are more than `max_recent_requests` requests, the oldest set
-        of requests are removed from the metrics.
-
-        Args:
-            stats: The prefix cache stats.
-        """
-        # reset_prefix_cache was invoked before the current update.
-        # Reset the metrics before aggregating the current stats.
-        if stats.reset:
-            self.reset()
-
-        # DO NOT appending empty stats to avoid helpful info get kicked out
-        # due to sliding window.
-        if stats.requests == 0:
-            return
-
-        # Update the metrics.
-        self.query_queue.append((stats.requests, stats.queries, stats.hits))
-        self.aggregated_requests += stats.requests
-        self.aggregated_query_total += stats.queries
-        self.aggregated_query_hit += stats.hits
-
-        # Remove the oldest stats until number of requests does not exceed
-        # the limit.
-        # NOTE: We preserve the latest added stats regardless.
-        while len(
-                self.query_queue
-        ) > 1 and self.aggregated_requests > self.max_recent_requests:
-            old_requests, old_queries, old_hits = self.query_queue.popleft()
-            self.aggregated_requests -= old_requests
-            self.aggregated_query_total -= old_queries
-            self.aggregated_query_hit -= old_hits
-
-    def reset(self):
-        """Reset the metrics."""
-        self.aggregated_requests = 0
-        self.aggregated_query_total = 0
-        self.aggregated_query_hit = 0
-        self.query_queue.clear()
-
-    @property
-    def hit_rate(self) -> float:
-        """Calculate the hit rate for the past N requests."""
-        if self.aggregated_query_total == 0:
-            return 0.0
-        return self.aggregated_query_hit / self.aggregated_query_total
-
-
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""
+
     # Block ID, ranging from 0 to num_gpu_blocks - 1.
     block_id: int
     # Reference count.
     ref_cnt: int = 0
     # The hash key (block hash + group id) of the block, only available
     # when the block is full and cached.
-    _block_hash: Optional[BlockHashWithGroupId] = None
+    _block_hash: BlockHashWithGroupId | None = None
 
     # Used to construct a doubly linked list for free blocks.
     # These two attributes should only be manipulated by FreeKVCacheBlockQueue.
-    prev_free_block: Optional["KVCacheBlock"] = None
-    next_free_block: Optional["KVCacheBlock"] = None
+    prev_free_block: "KVCacheBlock | None" = None
+    next_free_block: "KVCacheBlock | None" = None
 
     # Whether the block is a null block that should never be cached.
     is_null: bool = False
 
     @property
-    def block_hash(self) -> Optional[BlockHashWithGroupId]:
+    def block_hash(self) -> BlockHashWithGroupId | None:
         return self._block_hash
 
     @block_hash.setter
     def block_hash(self, block_hash: BlockHashWithGroupId):
         assert self.block_hash is None, (
-            "The block already has a hash. This should not happen.")
+            "The block already has a hash. This should not happen."
+        )
         self._block_hash = block_hash
 
     def reset_hash(self):
@@ -202,15 +141,15 @@ class KVCacheBlock:
     def __repr__(self) -> str:
         # Use block_id instead of KVCacheBlock object to avoid calling __repr__
         # on KVCacheBlock object recursively.
-        prev_block_id = (self.prev_free_block.block_id
-                         if self.prev_free_block else None)
-        next_block_id = (self.next_free_block.block_id
-                         if self.next_free_block else None)
-        return (f"KVCacheBlock(block_id={self.block_id}, "
-                f"ref_cnt={self.ref_cnt}, "
-                f"_block_hash={self._block_hash!r}, "
-                f"prev_free_block={prev_block_id}, "
-                f"next_free_block={next_block_id})")
+        prev_block_id = self.prev_free_block.block_id if self.prev_free_block else None
+        next_block_id = self.next_free_block.block_id if self.next_free_block else None
+        return (
+            f"KVCacheBlock(block_id={self.block_id}, "
+            f"ref_cnt={self.ref_cnt}, "
+            f"_block_hash={self._block_hash!r}, "
+            f"prev_free_block={prev_block_id}, "
+            f"next_free_block={next_block_id})"
+        )
 
 
 class FreeKVCacheBlockQueue:
@@ -271,12 +210,14 @@ class FreeKVCacheBlockQueue:
         Returns:
             The first free block.
         """
-        if (self.fake_free_list_head.next_free_block
-                is self.fake_free_list_tail
-                or self.fake_free_list_head.next_free_block is None):
+        if (
+            self.fake_free_list_head.next_free_block is self.fake_free_list_tail
+            or self.fake_free_list_head.next_free_block is None
+        ):
             assert self.num_free_blocks == 0, (
                 f"num_free_blocks ({self.num_free_blocks}) is out of sync "
-                "with the free list.")
+                "with the free list."
+            )
             raise ValueError("No free blocks available")
 
         first_block: KVCacheBlock = self.fake_free_list_head.next_free_block
@@ -284,8 +225,10 @@ class FreeKVCacheBlockQueue:
         if first_block.next_free_block is None:
             # This should not happen if the block is from the free list.
             # It indicates a bug in the caller's logic.
-            raise RuntimeError("Invalid block found in popleft() "
-                               "which doesn't have a valid next_free_block")
+            raise RuntimeError(
+                "Invalid block found in popleft() "
+                "which doesn't have a valid next_free_block"
+            )
 
         # Connect fake_head and the next block of first_block (i.e. second block
         # or fake tail).
@@ -360,7 +303,8 @@ class FreeKVCacheBlockQueue:
         """
         if self.fake_free_list_tail.prev_free_block is None:
             raise RuntimeError(
-                "prev_free_block of fake_free_list_tail should always exist")
+                "prev_free_block of fake_free_list_tail should always exist"
+            )
         last_block: KVCacheBlock = self.fake_free_list_tail.prev_free_block
 
         # Connect the new block after the last block.
@@ -384,7 +328,8 @@ class FreeKVCacheBlockQueue:
 
         last_block = self.fake_free_list_tail.prev_free_block
         assert last_block is not None, (
-            "prev_free_block of fake_free_list_tail should always exist")
+            "prev_free_block of fake_free_list_tail should always exist"
+        )
         # Add inter-connections between consecutive blocks
         for block in blocks:
             block.prev_free_block = last_block
@@ -406,7 +351,8 @@ class FreeKVCacheBlockQueue:
         ret = []
         if self.fake_free_list_head.next_free_block is None:
             raise RuntimeError(
-                "next_free_block of fake_free_list_head should always exist")
+                "next_free_block of fake_free_list_head should always exist"
+            )
         # Start from the first block
         curr_block: KVCacheBlock = self.fake_free_list_head.next_free_block
         # As long as next_free_block is available, we haven't reached to
@@ -428,16 +374,18 @@ def need_extra_keys(request: Request) -> bool:
     """
 
     # Multimodal requests need to include the MM hash.
-    # LoRA requests need to include the LoRA ID.
+    # LoRA requests need to include the LoRA name.
     # Request with provided cache salt need to include the salt.
-    return bool(request.mm_features) or (request.lora_request
-                                         is not None) or (request.cache_salt
-                                                          is not None)
+    return (
+        bool(request.mm_features)
+        or (request.lora_request is not None)
+        or (request.cache_salt is not None)
+    )
 
 
-def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int,
-                            end_token_idx: int,
-                            start_mm_idx: int) -> tuple[list[Any], int]:
+def _gen_mm_extra_hash_keys(
+    request: Request, start_token_idx: int, end_token_idx: int, start_mm_idx: int
+) -> tuple[list[Any], int]:
     """Generate extra keys related to MultiModal request for block hash
     computation. For multi-modal inputs, the extra keys are
     (mm_hash, start_offset) that indicate a mm input contained in the
@@ -499,26 +447,48 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int,
     return extra_keys, curr_mm_idx
 
 
-def _gen_lora_extra_hash_keys(request: Request) -> list[int]:
+def _gen_lora_extra_hash_keys(request: Request) -> list[str]:
     """Generate extra keys related to LoRA for block hash computation.
 
     Args:
         request: The request object.
 
     Returns:
-        Return LoRA id of the request if it is a LoRA request. Return empty
+        Return LoRA name of the request if it is a LoRA request. Return empty
         list otherwise.
     """
     if not request.lora_request:
         return []
-    return [request.lora_request.lora_int_id]
+    return [request.lora_request.lora_name]
+
+
+def _gen_prompt_embeds_extra_hash_keys(
+    request: Request, start_token_idx: int, end_token_idx: int
+) -> list[bytes]:
+    """Generate extra keys related to prompt embeds for block hash computation.
+
+    Args:
+        request: The request object.
+        start_token_idx: The start token index of the block.
+        end_token_idx: The end token index of the block.
+
+    Returns:
+        Return prompt embeddings data of the request if it has prompt embeds.
+        Return empty list otherwise.
+    """
+    if request.prompt_embeds is None:
+        return []
+    block_prompt_embeds = request.prompt_embeds[start_token_idx:end_token_idx]
+    embeds_bytes = tensor_data(block_prompt_embeds).tobytes()
+    return [embeds_bytes]
 
 
 def generate_block_hash_extra_keys(
-        request: Request, start_token_idx: int, end_token_idx: int,
-        start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]:
+    request: Request, start_token_idx: int, end_token_idx: int, start_mm_idx: int
+) -> tuple[tuple[Any, ...] | None, int]:
     """Generate extra keys for the block hash. The extra keys can come from
-    the multi-modal inputs and request specific metadata (e.g., LoRA ID).
+    the multi-modal inputs, request specific metadata (e.g., LoRA names), and
+    data from prompt embeddings.
 
     Args:
         request: The request object.
@@ -531,12 +501,19 @@ def generate_block_hash_extra_keys(
     """
     mm_extra_keys: list[Any]
     mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys(
-        request, start_token_idx, end_token_idx, start_mm_idx)
-    lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request)
-    cache_salt_keys: list[str] = [request.cache_salt] if (
-        start_token_idx == 0 and request.cache_salt) else []
+        request, start_token_idx, end_token_idx, start_mm_idx
+    )
+    lora_extra_keys: list[str] = _gen_lora_extra_hash_keys(request)
+    cache_salt_keys: list[str] = (
+        [request.cache_salt] if (start_token_idx == 0 and request.cache_salt) else []
+    )
+    prompt_embeds_keys = _gen_prompt_embeds_extra_hash_keys(
+        request, start_token_idx, end_token_idx
+    )
 
-    extra_keys: list[Any] = lora_extra_keys + mm_extra_keys + cache_salt_keys
+    extra_keys: list[Any] = (
+        lora_extra_keys + mm_extra_keys + cache_salt_keys + prompt_embeds_keys
+    )
 
     if not extra_keys:
         return None, new_start_mm_idx
@@ -545,10 +522,11 @@ def generate_block_hash_extra_keys(
 
 
 def hash_block_tokens(
-        hash_function: Callable[[Any], bytes],
-        parent_block_hash: Optional[BlockHash],
-        curr_block_token_ids: Sequence[int],
-        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash:
+    hash_function: Callable[[Any], bytes],
+    parent_block_hash: BlockHash | None,
+    curr_block_token_ids: Sequence[int],
+    extra_keys: tuple[Any, ...] | None = None,
+) -> BlockHash:
     """Computes a hash value corresponding to the contents of a block and
     the contents of the preceding block(s). The hash value is used for
     prefix caching. We use LRU cache for this function to avoid recomputing
@@ -569,8 +547,8 @@ def hash_block_tokens(
 
     curr_block_token_ids_tuple = tuple(curr_block_token_ids)
     return BlockHash(
-        hash_function(
-            (parent_block_hash, curr_block_token_ids_tuple, extra_keys)))
+        hash_function((parent_block_hash, curr_block_token_ids_tuple, extra_keys))
+    )
 
 
 def get_request_block_hasher(
@@ -585,6 +563,10 @@ def get_request_block_hasher(
         start_token_idx = len(request.block_hashes) * block_size
         num_tokens = request.num_tokens
 
+        if start_token_idx + block_size > num_tokens:
+            # Early stop when there no new full blocks created.
+            return []
+
         curr_mm_idx = 0
         if start_token_idx > 0:
             # Set curr_mm_idx = -1 to indicate the last mm input.
@@ -593,8 +575,9 @@ def get_request_block_hasher(
             # last mm input.
             curr_mm_idx = -1
 
-        prev_block_hash_value = (request.block_hashes[-1]
-                                 if request.block_hashes else None)
+        prev_block_hash_value = (
+            request.block_hashes[-1] if request.block_hashes else None
+        )
         new_block_hashes: list[BlockHash] = []
         while True:
             end_token_idx = start_token_idx + block_size
@@ -604,13 +587,14 @@ def get_request_block_hasher(
 
             # MM and LoRA requests need extra keys for block-hash computation.
             extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
-                request, start_token_idx, end_token_idx, curr_mm_idx)
+                request, start_token_idx, end_token_idx, curr_mm_idx
+            )
 
             # Compute the hash of the current block
             block_tokens = request.all_token_ids[start_token_idx:end_token_idx]
-            block_hash = hash_block_tokens(caching_hash_fn,
-                                           prev_block_hash_value, block_tokens,
-                                           extra_keys)
+            block_hash = hash_block_tokens(
+                caching_hash_fn, prev_block_hash_value, block_tokens, extra_keys
+            )
 
             new_block_hashes.append(block_hash)
             start_token_idx += block_size
@@ -621,18 +605,20 @@ def get_request_block_hasher(
     return request_block_hasher
 
 
-def max_memory_usage_bytes(vllm_config: VllmConfig,
-                           kv_cache_specs: Iterable[KVCacheSpec]) -> int:
+def max_memory_usage_bytes(
+    vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
+) -> int:
     """
     Get the maximum memory usage in bytes for the given KV cache specs.
     """
-    return sum(
-        spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs)
+    return sum(spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs)
 
 
-def estimate_max_model_len(vllm_config: VllmConfig,
-                           kv_cache_spec: dict[str, KVCacheSpec],
-                           available_memory: int) -> int:
+def estimate_max_model_len(
+    vllm_config: VllmConfig,
+    kv_cache_spec: dict[str, KVCacheSpec],
+    available_memory: int,
+) -> int:
     """
     Estimates the maximum model length that can fit in the available memory
     using binary search.
@@ -651,8 +637,7 @@ def estimate_max_model_len(vllm_config: VllmConfig,
         # Modify the max_model_len for this calculation
         vllm_config.model_config.max_model_len = model_len
         # Calculate memory needed for the given model length
-        memory_needed = max_memory_usage_bytes(vllm_config,
-                                               kv_cache_spec.values())
+        memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
         return memory_needed <= available_memory
 
     # Binary search for the maximum model length
@@ -675,9 +660,11 @@ def estimate_max_model_len(vllm_config: VllmConfig,
     return result
 
 
-def check_enough_kv_cache_memory(vllm_config: VllmConfig,
-                                 kv_cache_spec: dict[str, KVCacheSpec],
-                                 available_memory: int):
+def check_enough_kv_cache_memory(
+    vllm_config: VllmConfig,
+    kv_cache_spec: dict[str, KVCacheSpec],
+    available_memory: int,
+):
     """
     Checks whether `available_memory` is enough for the KV cache to hold at
     least one request with the model's max_model_len.
@@ -696,36 +683,41 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         return
 
     if available_memory <= 0:
-        raise ValueError("No available memory for the cache blocks. "
-                         "Try increasing `gpu_memory_utilization` when "
-                         "initializing the engine.")
+        raise ValueError(
+            "No available memory for the cache blocks. "
+            "Try increasing `gpu_memory_utilization` when "
+            "initializing the engine."
+        )
 
     max_model_len = vllm_config.model_config.max_model_len
     needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
 
     if needed_memory > available_memory:
         # Estimate the maximum model length that can fit in the available memory
-        estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
-                                                   available_memory)
+        estimated_max_len = estimate_max_model_len(
+            vllm_config, kv_cache_spec, available_memory
+        )
         estimated_msg = ""
         if estimated_max_len > 0:
             estimated_msg = (
                 "Based on the available memory, "
-                f"the estimated maximum model length is {estimated_max_len}.")
+                f"the estimated maximum model length is {estimated_max_len}."
+            )
 
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory/GiB_bytes:.2f} GiB KV "
+            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/GiB_bytes:.2f} GiB). "
+            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
             f"{estimated_msg} "
             f"Try increasing `gpu_memory_utilization` or decreasing "
-            f"`max_model_len` when initializing the engine.")
+            f"`max_model_len` when initializing the engine."
+        )
 
 
 def create_kv_cache_group_specs(
-        kv_cache_spec: dict[str, KVCacheSpec],
-        grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]:
+    kv_cache_spec: dict[str, KVCacheSpec], grouped_layer_names: list[list[str]]
+) -> list[KVCacheGroupSpec]:
     """
     Create KVCacheGroupSpec object for each kv cache group layer.
     The layers in the same group should share the same
@@ -748,7 +740,8 @@ def create_kv_cache_group_specs(
         ]
         merged_layer_spec = layer_specs[0].merge(layer_specs)
         kv_cache_groups.append(
-            KVCacheGroupSpec(layer_names_one_group, merged_layer_spec))
+            KVCacheGroupSpec(layer_names_one_group, merged_layer_spec)
+        )
     return kv_cache_groups
 
 
@@ -778,19 +771,22 @@ def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
 
 def get_max_concurrency_for_kv_cache_config(
-        vllm_config: VllmConfig, kv_cache_config: KVCacheConfig) -> float:
+    vllm_config: VllmConfig, kv_cache_config: KVCacheConfig
+) -> float:
     """
     Get the maximum concurrency for the given KV cache configuration.
     """
     num_layer_per_group = max(
-        len(group.layer_names) for group in kv_cache_config.kv_cache_groups)
+        len(group.layer_names) for group in kv_cache_config.kv_cache_groups
+    )
     max_memory_usage_per_request = num_layer_per_group * max_memory_usage_bytes(
-        vllm_config,
-        (group.kv_cache_spec for group in kv_cache_config.kv_cache_groups))
-    memory_per_block = kv_cache_config.kv_cache_groups[
-        0].kv_cache_spec.page_size_bytes * num_layer_per_group
-    num_block_per_request = cdiv(max_memory_usage_per_request,
-                                 memory_per_block)
+        vllm_config, (group.kv_cache_spec for group in kv_cache_config.kv_cache_groups)
+    )
+    memory_per_block = (
+        kv_cache_config.kv_cache_groups[0].kv_cache_spec.page_size_bytes
+        * num_layer_per_group
+    )
+    num_block_per_request = cdiv(max_memory_usage_per_request, memory_per_block)
     max_concurrency = kv_cache_config.num_blocks / num_block_per_request
     return max_concurrency
 
@@ -800,18 +796,20 @@ def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int:
     Override the number of kv cache blocks if `num_gpu_blocks_override` is set.
     """
     if vllm_config.cache_config.num_gpu_blocks_override is not None:
-        num_gpu_blocks_override = \
-            vllm_config.cache_config.num_gpu_blocks_override
+        num_gpu_blocks_override = vllm_config.cache_config.num_gpu_blocks_override
         logger.info(
-            "Overriding num_gpu_blocks=%d with "
-            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+            "Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
+            num_blocks,
+            num_gpu_blocks_override,
+        )
         num_blocks = num_gpu_blocks_override
 
     return num_blocks
 
 
-def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
-                   available_memory: int, page_size: int) -> int:
+def get_num_blocks(
+    vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int
+) -> int:
     """
     Get the number of kv cache blocks.
 
@@ -837,9 +835,10 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
 
 
 def _get_kv_cache_groups_uniform_spec(
-        kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
+    kv_cache_specs: dict[str, KVCacheSpec],
+) -> list[KVCacheGroupSpec]:
     """
-    Generates the KV cache configuration for a model with the same KV cache 
+    Generates the KV cache configuration for a model with the same KV cache
     spec for all layers.
 
     Args:
@@ -849,12 +848,12 @@ def _get_kv_cache_groups_uniform_spec(
         The generated KVCacheGroupSpecs
     """
 
-    return create_kv_cache_group_specs(kv_cache_specs,
-                                       [list(kv_cache_specs.keys())])
+    return create_kv_cache_group_specs(kv_cache_specs, [list(kv_cache_specs.keys())])
 
 
 def _get_kv_cache_groups_uniform_type(
-        spec: UniformTypeKVCacheSpecs) -> list[KVCacheGroupSpec]:
+    spec: UniformTypeKVCacheSpecs,
+) -> list[KVCacheGroupSpec]:
     """
     Generates the KV cache configuration for a model with one type of KV cache
     but different hidden sizes. All layers are merged into one group.
@@ -869,8 +868,7 @@ def _get_kv_cache_groups_uniform_type(
     return [KVCacheGroupSpec(list(spec.kv_cache_specs.keys()), spec)]
 
 
-def is_kv_cache_page_size_uniform(
-        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+def is_kv_cache_page_size_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
     """
     Whether all layers in the given KVCacheSpec have the same page size.
     Args:
@@ -884,70 +882,69 @@ def is_kv_cache_page_size_uniform(
     return len(page_sizes) == 1
 
 
-def is_kv_cache_type_attention_free(
-        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
-
+def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
     # kv_cache_spec is an empty dict for attention free models
     return not kv_cache_spec
 
 
 def _get_kv_cache_groups_uniform_page_size(
-        kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
+    kv_cache_spec: dict[str, KVCacheSpec],
+) -> list[KVCacheGroupSpec]:
     """
-    Generates the KV cache groups for hybrid models with multiple 
-    attention types but still with a uniform page size (physical memory per 
+    Generates the KV cache groups for hybrid models with multiple
+    attention types but still with a uniform page size (physical memory per
     block per layer) for all layers.
 
     Detailed explanation about kv cache management of hybrid models:
     The layers in the models are repeated with some patterns, e.g., a model
     with 10 full attention layers and 20 sliding window attention layers can be
-    regarded as repeating the pattern (1 * full, 2 * sw) 10 times. 
+    regarded as repeating the pattern (1 * full, 2 * sw) 10 times.
     The KVCacheManager allocates different block tables for each of the 3 layers
-    in the pattern, and repeats each of them 10 times to generate the 
+    in the pattern, and repeats each of them 10 times to generate the
     block_table for the 30 layers in the model.
     Therefore, we can group the layers in the model into 3 kv_cache_groups, each
     of which contains 10 layers in the model.
     The KVCacheManager allocates the block_table for each group based on its
-    kv_cache spec, and the model runner applies the block table to each layer 
+    kv_cache spec, and the model runner applies the block table to each layer
     in the group.
     For example:
-    1. A model only uses full attention. The pattern is 
-    (num_hidden_layers * full), so there is only one group and the block table 
-    is shared by all layers. It is already handled by 
+    1. A model only uses full attention. The pattern is
+    (num_hidden_layers * full), so there is only one group and the block table
+    is shared by all layers. It is already handled by
     `_get_kv_cache_config_uniform_type`.
-    2. A model with 10 full attention layers and 20 sliding window 
-    attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so 
+    2. A model with 10 full attention layers and 20 sliding window
+    attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so
     there are 3 kv_cache_groups, each of which represents 10 layers.
 
     To simplify the implementation, we make the following assumptions:
-    1. Physical memory per block: Must be the same across all KV cache groups. 
+    1. Physical memory per block: Must be the same across all KV cache groups.
     Breaking this assumption is non-trivial due to memory fragmentation concerns
     when allocating blocks of different sizes.
-    2. Tokens per block (block_size): Currently, we directly use 
-    `CacheConfig.block_size` for all layers. It can be extended to vary by KV 
-    cache group, but within each KV cache group, all layers must share the same 
+    2. Tokens per block (block_size): Currently, we directly use
+    `CacheConfig.block_size` for all layers. It can be extended to vary by KV
+    cache group, but within each KV cache group, all layers must share the same
     block size.
-    3. Physical memory per token per layer: This property is decided by model 
-    config. Currently we only support models that have the same physical memory 
-    per token per layer for all layers. Can be relaxed with a simple extension, 
+    3. Physical memory per token per layer: This property is decided by model
+    config. Currently we only support models that have the same physical memory
+    per token per layer for all layers. Can be relaxed with a simple extension,
     but still need to keep physical memory per block the same for all groups.
-    4. Number of layers per group: Currently assumed the same for all layers. 
-    Can be relaxed with a simple extension, but still need to keep physical 
+    4. Number of layers per group: Currently assumed the same for all layers.
+    Can be relaxed with a simple extension, but still need to keep physical
     memory per block the same for all groups.
     5. Attention type within groups: All layers in a group must share the same
-    attention type. One exception is that, when 
-    `--disable-hybrid-kv-cache-manager` is true, the single group for full 
-    attention layers may also include attention layers using sliding window or 
+    attention type. One exception is that, when
+    `--disable-hybrid-kv-cache-manager` is true, the single group for full
+    attention layers may also include attention layers using sliding window or
     LLaMA 4 local attention. See `unify_hybrid_kv_cache_specs` for more details.
-    6. Support for multiple attention types: The design for most components is 
-    general to an arbitrary number of attention types. But 
-    `find_longest_cache_hit` only supports one attention type or two 
+    6. Support for multiple attention types: The design for most components is
+    general to an arbitrary number of attention types. But
+    `find_longest_cache_hit` only supports one attention type or two
     types of full-attention plus exactly one another type. The general
-    implementation of this function is feasible but we don't know how to 
+    implementation of this function is feasible but we don't know how to
     implement it cleanly yet.
 
-    As we assume tokens per block, physical memory per token per layer, and 
-    number of layers per group are the same now, we can ensure that physical 
+    As we assume tokens per block, physical memory per token per layer, and
+    number of layers per group are the same now, we can ensure that physical
     memory per block is the same for all groups.
 
     Args:
@@ -1001,10 +998,12 @@ def _get_kv_cache_groups_uniform_page_size(
     return create_kv_cache_group_specs(kv_cache_spec, grouped_layers)
 
 
-def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
-                                    kv_cache_groups: list[KVCacheGroupSpec],
-                                    kv_cache_specs: dict[str, KVCacheSpec],
-                                    available_memory: int) -> KVCacheConfig:
+def get_kv_cache_config_from_groups(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+    kv_cache_specs: dict[str, KVCacheSpec],
+    available_memory: int,
+) -> KVCacheConfig:
     """
     Generate the KV cache configuration from the KV cache groups and spec
     of each layer.
@@ -1027,19 +1026,22 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
         )
 
     # Determine how model runners should initialize the KV cache tensors.
-    if len(kv_cache_groups) == 1 and \
-        isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs):
+    if len(kv_cache_groups) == 1 and isinstance(
+        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
+    ):
         # Special case: all layers have the same type of KV cache but with
         # different hidden size. Allocate different amount of memory for each
         # layer based on its hidden size.
-        num_blocks = available_memory // kv_cache_groups[
-            0].kv_cache_spec.page_size_bytes
+        num_blocks = (
+            available_memory // kv_cache_groups[0].kv_cache_spec.page_size_bytes
+        )
         num_blocks = may_override_num_blocks(vllm_config, num_blocks)
         per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
         kv_cache_tensors = [
-            KVCacheTensor(size=per_layer_specs[layer_name].page_size_bytes *
-                          num_blocks,
-                          shared_by=[layer_name])
+            KVCacheTensor(
+                size=per_layer_specs[layer_name].page_size_bytes * num_blocks,
+                shared_by=[layer_name],
+            )
             for layer_name in kv_cache_groups[0].layer_names
         ]
     else:
@@ -1055,8 +1057,9 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
 
         page_size = get_uniform_page_size(kv_cache_specs)
         assert group_size > 0, "group_size must be greater than 0"
-        num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
-                                    page_size)
+        num_blocks = get_num_blocks(
+            vllm_config, group_size, available_memory, page_size
+        )
         kv_cache_tensors = []
         for i in range(group_size):
             shared_by = []
@@ -1064,34 +1067,15 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
                 if i < len(kv_cache_groups[j].layer_names):
                     shared_by.append(kv_cache_groups[j].layer_names[i])
             kv_cache_tensors.append(
-                KVCacheTensor(size=page_size * num_blocks,
-                              shared_by=shared_by))
+                KVCacheTensor(size=page_size * num_blocks, shared_by=shared_by)
+            )
 
-    kv_cache_config = KVCacheConfig(
+    return KVCacheConfig(
         num_blocks=num_blocks,
         kv_cache_tensors=kv_cache_tensors,
         kv_cache_groups=kv_cache_groups,
     )
 
-    min_block_size = min(
-        [group.kv_cache_spec.block_size for group in kv_cache_groups])
-
-    # Print the KV cache size and maximum concurrency.
-    num_tokens = num_blocks // len(kv_cache_groups) * min_block_size
-    if vllm_config.parallel_config.decode_context_parallel_size > 1:
-        num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
-        logger.info(
-            "Multiplying the GPU KV cache size by the dcp_world_size %d.",
-            vllm_config.parallel_config.decode_context_parallel_size)
-    num_tokens_str = f"{num_tokens:,}"
-    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
-    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
-    max_concurrency = get_max_concurrency_for_kv_cache_config(
-        vllm_config, kv_cache_config)
-    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
-                max_model_len_str, max_concurrency)
-    return kv_cache_config
-
 
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
@@ -1104,25 +1088,27 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
 
     if is_kv_cache_spec_uniform(
-            kv_cache_spec) or UniformTypeKVCacheSpecs.is_uniform_type(
-                kv_cache_spec):
+        kv_cache_spec
+    ) or UniformTypeKVCacheSpecs.is_uniform_type(kv_cache_spec):
         return
 
     logger.warning(
         "Hybrid KV cache manager is disabled for this hybrid model, "
         "This means we do not enable any optimizations for saving KV cache "
         "memory (e.g., dropping the KV cache outside the sliding window). "
-        "The compute of layers like sliding window is still saved.")
+        "The compute of layers like sliding window is still saved."
+    )
 
     has_full_attention = any(
-        isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values())
+        isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values()
+    )
     has_sliding_window = any(
-        isinstance(spec, SlidingWindowSpec) for spec in kv_cache_spec.values())
+        isinstance(spec, SlidingWindowSpec) for spec in kv_cache_spec.values()
+    )
     has_chunked_local_attention = any(
-        isinstance(spec, ChunkedLocalAttentionSpec)
-        for spec in kv_cache_spec.values())
-    if has_full_attention and (has_sliding_window
-                               or has_chunked_local_attention):
+        isinstance(spec, ChunkedLocalAttentionSpec) for spec in kv_cache_spec.values()
+    )
+    if has_full_attention and (has_sliding_window or has_chunked_local_attention):
         for layer_name, spec in kv_cache_spec.items():
             if isinstance(spec, SlidingWindowSpec):
                 kv_cache_spec[layer_name] = FullAttentionSpec(
@@ -1141,15 +1127,19 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
                     attention_chunk_size=spec.attention_chunk_size,
                 )
 
-    if not (is_kv_cache_spec_uniform(kv_cache_spec)
-            or UniformTypeKVCacheSpecs.is_uniform_type(kv_cache_spec)):
-        raise ValueError("Hybrid KV cache manager is disabled but failed to "
-                         "convert the KV cache specs to one unified type.")
+    if not (
+        is_kv_cache_spec_uniform(kv_cache_spec)
+        or UniformTypeKVCacheSpecs.is_uniform_type(kv_cache_spec)
+    ):
+        raise ValueError(
+            "Hybrid KV cache manager is disabled but failed to "
+            "convert the KV cache specs to one unified type."
+        )
 
 
 def get_kv_cache_groups(
-        vllm_config: VllmConfig,
-        kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
+    vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec]
+) -> list[KVCacheGroupSpec]:
     """
     Split the layers in the model into groups with the same KV cache spec.
 
@@ -1188,14 +1178,14 @@ def get_kv_cache_groups(
 
 
 def generate_scheduler_kv_cache_config(
-        kv_cache_configs: list[KVCacheConfig]) -> KVCacheConfig:
+    kv_cache_configs: list[KVCacheConfig],
+) -> KVCacheConfig:
     """
     Generate the KV cache configuration for the scheduler.
     """
-    assert all([
-        cfg.num_blocks == kv_cache_configs[0].num_blocks
-        for cfg in kv_cache_configs
-    ])
+    assert all(
+        [cfg.num_blocks == kv_cache_configs[0].num_blocks for cfg in kv_cache_configs]
+    )
     # All workers have the same kv_cache_config except layer names, so use
     # an arbitrary one to initialize the scheduler.
     cfg = copy.deepcopy(kv_cache_configs[0])
@@ -1204,15 +1194,57 @@ def generate_scheduler_kv_cache_config(
             # All layers in the UniformTypeKVCacheSpecs have the same type,
             # so use an arbitrary one to initialize the scheduler.
             group.kv_cache_spec = next(
-                iter(group.kv_cache_spec.kv_cache_specs.values()))
+                iter(group.kv_cache_spec.kv_cache_specs.values())
+            )
     return cfg
 
 
-def get_kv_cache_configs(vllm_config: VllmConfig,
-                         kv_cache_specs: list[dict[str, KVCacheSpec]],
-                         available_memory: list[int]) -> list[KVCacheConfig]:
+def _report_kv_cache_config(
+    vllm_config: VllmConfig, kv_cache_config: KVCacheConfig
+) -> None:
     """
-    Generates the KV cache configurations for a model. 
+    Log resolved KV cache configuration.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_config: The resolved KV cache configuration
+    """
+    min_block_size = min(
+        [group.kv_cache_spec.block_size for group in kv_cache_config.kv_cache_groups]
+    )
+
+    # Log the KV cache size and maximum concurrency.
+    num_tokens = (
+        kv_cache_config.num_blocks
+        // len(kv_cache_config.kv_cache_groups)
+        * min_block_size
+    )
+    if vllm_config.parallel_config.decode_context_parallel_size > 1:
+        num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
+        logger.info(
+            "Multiplying the GPU KV cache size by the dcp_world_size %d.",
+            vllm_config.parallel_config.decode_context_parallel_size,
+        )
+    num_tokens_str = f"{num_tokens:,}"
+    logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local")
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config
+    )
+    logger.info(
+        "Maximum concurrency for %s tokens per request: %.2fx",
+        max_model_len_str,
+        max_concurrency,
+    )
+
+
+def get_kv_cache_configs(
+    vllm_config: VllmConfig,
+    kv_cache_specs: list[dict[str, KVCacheSpec]],
+    available_memory: list[int],
+) -> list[KVCacheConfig]:
+    """
+    Generates the KV cache configurations for a model.
     Since we use a shared centralized controller for all workers, we need the
     `kv_cache_config` to be consistent across all workers to make sure
     the KV cache allocation can be applied to all workers. However, different
@@ -1225,13 +1257,14 @@ def get_kv_cache_configs(vllm_config: VllmConfig,
     3. Generate the KV cache configs for each worker based on the KV cache
        grouping strategy. (This is reasonable because the layer ratio of
        different PP stages are similar.)
-    4. Change the num_blocks of each worker to the smallest among all workers.
+    4. Change the num_blocks of each worker to the smallest among all workers
+       and shrink tensor sizes proportionally to avoid allocating unused memory.
 
     Args:
         vllm_config: The global VllmConfig
         kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
         available_memory: Memory available for KV cache in bytes for each
-            worker. 
+            worker.
 
     Returns:
         The generated KVCacheConfigs for each worker.
@@ -1239,9 +1272,11 @@ def get_kv_cache_configs(vllm_config: VllmConfig,
 
     # Check if the available memory is enough for each worker.
     for kv_cache_spec_one_worker, available_memory_one_worker in zip(
-            kv_cache_specs, available_memory):
-        check_enough_kv_cache_memory(vllm_config, kv_cache_spec_one_worker,
-                                     available_memory_one_worker)
+        kv_cache_specs, available_memory
+    ):
+        check_enough_kv_cache_memory(
+            vllm_config, kv_cache_spec_one_worker, available_memory_one_worker
+        )
 
     # Merge the KV cache specs of all workers. Different PP stages may have
     # different layer names, and different TP ranks of the same PP stage should
@@ -1254,38 +1289,52 @@ def get_kv_cache_configs(vllm_config: VllmConfig,
             else:
                 assert merged_kv_cache_specs[layer_name] == layer_spec, (
                     "The KV cache specs for the same layer are different "
-                    "across workers. This is not supported yet.")
-    global_kv_cache_groups = get_kv_cache_groups(vllm_config,
-                                                 merged_kv_cache_specs)
+                    "across workers. This is not supported yet."
+                )
+    global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs)
 
     kv_cache_configs: list[KVCacheConfig] = []
     for kv_cache_spec_one_worker, available_memory_one_worker in zip(
-            kv_cache_specs, available_memory):
+        kv_cache_specs, available_memory
+    ):
         kv_cache_groups_one_worker: list[KVCacheGroupSpec] = []
         for group in global_kv_cache_groups:
             group_layer_names_one_worker = [
-                layer_name for layer_name in group.layer_names
+                layer_name
+                for layer_name in group.layer_names
                 if layer_name in kv_cache_spec_one_worker
             ]
             kv_cache_groups_one_worker.append(
-                KVCacheGroupSpec(group_layer_names_one_worker,
-                                 group.kv_cache_spec))
+                KVCacheGroupSpec(group_layer_names_one_worker, group.kv_cache_spec)
+            )
         assert sum(
-            len(group.layer_names) for group in
-            kv_cache_groups_one_worker) == len(kv_cache_spec_one_worker), (
-                "Some layers are not assigned to any group.")
+            len(group.layer_names) for group in kv_cache_groups_one_worker
+        ) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group."
         kv_cache_configs.append(
-            get_kv_cache_config_from_groups(vllm_config,
-                                            kv_cache_groups_one_worker,
-                                            kv_cache_spec_one_worker,
-                                            available_memory_one_worker))
-
-    # Change the num_blocks of each rank to the smallest among all ranks. We
-    # do not need to shrink the tensor size because it is valid to only use the
-    # first `num_blocks` blocks of the tensor.
-    min_num_blocks = min(kv_cache_config.num_blocks
-                         for kv_cache_config in kv_cache_configs)
+            get_kv_cache_config_from_groups(
+                vllm_config,
+                kv_cache_groups_one_worker,
+                kv_cache_spec_one_worker,
+                available_memory_one_worker,
+            )
+        )
+
+    # Change the num_blocks of each rank to the smallest among all ranks.
+    # We also need to shrink the tensor size proportionally to avoid
+    # allocating unused memory.
+    min_num_blocks = min(
+        kv_cache_config.num_blocks for kv_cache_config in kv_cache_configs
+    )
     for kv_cache_config in kv_cache_configs:
+        num_blocks_old = kv_cache_config.num_blocks
         kv_cache_config.num_blocks = min_num_blocks
 
+        # Shrink tensor size proportionally
+        for tensor in kv_cache_config.kv_cache_tensors:
+            assert tensor.size % num_blocks_old == 0
+            tensor.size = tensor.size // num_blocks_old * min_num_blocks
+
+        if len(kv_cache_config.kv_cache_groups) > 0:
+            _report_kv_cache_config(vllm_config, kv_cache_config)
+
     return kv_cache_configs
diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py
index 74ff6261732c7bea072fc96d374779b1851f6af7..3214f65a0972877b2fdb439dbba2132c2ed7fe05 100644
--- a/vllm/v1/core/sched/async_scheduler.py
+++ b/vllm/v1/core/sched/async_scheduler.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
@@ -12,19 +10,35 @@ logger = init_logger(__name__)
 
 
 class AsyncScheduler(Scheduler):
-
     def _update_after_schedule(
         self,
         scheduler_output: SchedulerOutput,
     ) -> None:
         super()._update_after_schedule(scheduler_output)
+        pending_structured_output_tokens = False
+        spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
         for req_id in scheduler_output.num_scheduled_tokens:
             request = self.requests[req_id]
-            if (request.num_computed_tokens == request.num_tokens +
-                    request.num_output_placeholders):
-                # The request will generate a new token in this scheduling step.
-                # TODO(woosuk): Support speculative decoding.
-                request.num_output_placeholders += 1
+            pending_structured_output_tokens |= (
+                request.use_structured_output and request.num_output_placeholders > 0
+            )
+            cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
+            if (
+                request.num_computed_tokens
+                == request.num_tokens
+                + request.num_output_placeholders
+                + cur_num_spec_tokens
+            ):
+                # The request will generate a new token plus num_spec_tokens
+                # in this scheduling step.
+                request.num_output_placeholders += 1 + cur_num_spec_tokens
+                # Add placeholders for the new tokens in spec_token_ids.
+                # Wwe will update the actual spec token ids in the worker process.
+                request.spec_token_ids = [-1] * self.num_spec_tokens
+
+        scheduler_output.pending_structured_output_tokens = (
+            pending_structured_output_tokens
+        )
 
     def _update_request_with_output(
         self,
@@ -33,7 +47,8 @@ class AsyncScheduler(Scheduler):
     ) -> tuple[list[int], bool]:
         status_before_update = request.status
         new_token_ids, stopped = super()._update_request_with_output(
-            request, new_token_ids)
+            request, new_token_ids
+        )
 
         # Update the number of output placeholders.
         request.num_output_placeholders -= len(new_token_ids)
@@ -42,6 +57,6 @@ class AsyncScheduler(Scheduler):
         # Cache the new tokens. Preempted requests should be skipped.
         if status_before_update == RequestStatus.RUNNING:
             self.kv_cache_manager.cache_blocks(
-                request,
-                request.num_computed_tokens - request.num_output_placeholders)
+                request, request.num_computed_tokens - request.num_output_placeholders
+            )
         return new_token_ids, stopped
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 5b1de3a66ceb460ad8926fd7657024a0f63cb14d..88d99d94028213ff70bd220216505970177ff13c 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -2,18 +2,35 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional
+
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 
 if TYPE_CHECKING:
+    from vllm.config import VllmConfig
     from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
-    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
     from vllm.v1.engine import EngineCoreOutputs
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.metrics.stats import SchedulerStats
     from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
     from vllm.v1.request import Request, RequestStatus
+    from vllm.v1.structured_output import StructuredOutputManager
 
 
 class SchedulerInterface(ABC):
+    @abstractmethod
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        kv_cache_config: "KVCacheConfig",
+        structured_output_manager: "StructuredOutputManager",
+        block_size: int,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        include_finished_set: bool = False,
+        log_stats: bool = False,
+    ) -> None:
+        raise NotImplementedError
 
     @abstractmethod
     def schedule(self) -> "SchedulerOutput":
@@ -41,6 +58,12 @@ class SchedulerInterface(ABC):
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def get_grammar_bitmask(
+        self, scheduler_output: "SchedulerOutput"
+    ) -> "GrammarOutput | None":
+        raise NotImplementedError
+
     @abstractmethod
     def update_from_output(
         self,
@@ -72,7 +95,7 @@ class SchedulerInterface(ABC):
     @abstractmethod
     def add_request(self, request: "Request") -> None:
         """Add a new request to the scheduler's internal queue.
-        
+
         Args:
             request: The new request being added.
         """
@@ -81,7 +104,7 @@ class SchedulerInterface(ABC):
     @abstractmethod
     def finish_requests(
         self,
-        request_ids: Union[str, Iterable[str]],
+        request_ids: str | Iterable[str],
         finished_status: "RequestStatus",
     ) -> None:
         """Finish the requests in the scheduler's internal queue. If the request
@@ -91,7 +114,7 @@ class SchedulerInterface(ABC):
         1. When the request is aborted by the client.
         2. When the frontend process detects a stop string of the request after
            de-tokenizing its generated tokens.
-           
+
         Args:
             request_ids: A single or a list of request IDs.
             finished_status: The finished status of the given requests.
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 209fc2a4404f37dbcf4b74c61263288afb0b05f4..20fdb3446404b68a9a6880c9271c1adc271c2646 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+from typing_extensions import deprecated
 
 from vllm._bc_linter import bc_linter_include
 
@@ -13,35 +14,42 @@ if TYPE_CHECKING:
     import numpy.typing as npt
     import torch
 
-    from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-        KVConnectorMetadata)
+    from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
+    from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
     from vllm.lora.request import LoRARequest
     from vllm.multimodal.inputs import MultiModalFeatureSpec
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.v1.request import Request
+else:
+    ECConnectorMetadata = object
+    KVConnectorMetadata = object
+    LoRARequest = object
+    MultiModalFeatureSpec = object
+    PoolingParams = object
+    SamplingParams = object
+    Request = object
 
 
 @bc_linter_include
 @dataclass
 class NewRequestData:
-
     req_id: str
-    prompt_token_ids: Optional[list[int]]
+    prompt_token_ids: list[int] | None
     mm_features: list[MultiModalFeatureSpec]
-    sampling_params: Optional[SamplingParams]
-    pooling_params: Optional[PoolingParams]
+    sampling_params: SamplingParams | None
+    pooling_params: PoolingParams | None
     block_ids: tuple[list[int], ...]
     num_computed_tokens: int
-    lora_request: Optional[LoRARequest]
-    prompt_embeds: Optional[torch.Tensor] = None
+    lora_request: LoRARequest | None
+    prompt_embeds: "torch.Tensor | None" = None
 
     @classmethod
     def from_request(
         cls,
         request: Request,
         block_ids: tuple[list[int], ...],
-    ) -> NewRequestData:
+    ) -> "NewRequestData":
         return cls(
             req_id=request.request_id,
             prompt_token_ids=request.prompt_token_ids,
@@ -55,72 +63,91 @@ class NewRequestData:
         )
 
     def __repr__(self) -> str:
-        prompt_embeds_shape = (self.prompt_embeds.shape
-                               if self.prompt_embeds else None)
-        return (f"NewRequestData("
-                f"req_id={self.req_id},"
-                f"prompt_token_ids={self.prompt_token_ids},"
-                f"mm_features={self.mm_features},"
-                f"sampling_params={self.sampling_params},"
-                f"block_ids={self.block_ids},"
-                f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request},"
-                f"prompt_embeds_shape={prompt_embeds_shape}"
-                ")")
+        prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None
+        return (
+            f"NewRequestData("
+            f"req_id={self.req_id},"
+            f"prompt_token_ids={self.prompt_token_ids},"
+            f"mm_features={self.mm_features},"
+            f"sampling_params={self.sampling_params},"
+            f"block_ids={self.block_ids},"
+            f"num_computed_tokens={self.num_computed_tokens},"
+            f"lora_request={self.lora_request},"
+            f"prompt_embeds_shape={prompt_embeds_shape}"
+            ")"
+        )
 
     # Version of __repr__ with the prompt data obfuscated
     def anon_repr(self) -> str:
-        prompt_token_ids_len = len(
-            self.prompt_token_ids
-        ) if self.prompt_token_ids is not None else None
-        prompt_embeds_shape = (self.prompt_embeds.shape
-                               if self.prompt_embeds else None)
-        return (f"NewRequestData("
-                f"req_id={self.req_id},"
-                f"prompt_token_ids_len={prompt_token_ids_len},"
-                f"mm_features={self.mm_features},"
-                f"sampling_params={self.sampling_params},"
-                f"block_ids={self.block_ids},"
-                f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request},"
-                f"prompt_embeds_shape={prompt_embeds_shape}"
-                ")")
+        prompt_token_ids_len = (
+            len(self.prompt_token_ids) if self.prompt_token_ids is not None else None
+        )
+        prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None
+        return (
+            f"NewRequestData("
+            f"req_id={self.req_id},"
+            f"prompt_token_ids_len={prompt_token_ids_len},"
+            f"mm_features={self.mm_features},"
+            f"sampling_params={self.sampling_params},"
+            f"block_ids={self.block_ids},"
+            f"num_computed_tokens={self.num_computed_tokens},"
+            f"lora_request={self.lora_request},"
+            f"prompt_embeds_shape={prompt_embeds_shape}"
+            ")"
+        )
 
 
 @bc_linter_include
 @dataclass
 class CachedRequestData:
-
     req_ids: list[str]
-    # If resumed_from_preemption is False, new_block_ids will be appended to
-    # the request's block IDs. If True, new_block_ids will be used as the
+    # For request ids not in resumed_req_ids, new_block_ids will be appended to
+    # the request's block IDs. For those in the set, new_block_ids will be used as the
     # request's block IDs instead of appending to the existing block IDs.
-    resumed_from_preemption: list[bool]
+    resumed_req_ids: set[str]
     # NOTE(woosuk): new_token_ids is only used for pipeline parallelism.
     # When PP is not used, new_token_ids will be empty.
     new_token_ids: list[list[int]]
-    new_block_ids: list[Optional[tuple[list[int], ...]]]
+    # For requests not scheduled in the last step, propagate the token ids to the
+    # connector. Won't contain requests that were scheduled in the prior step.
+    all_token_ids: dict[str, list[int]]
+    new_block_ids: list[tuple[list[int], ...] | None]
     num_computed_tokens: list[int]
+    num_output_tokens: list[int]
 
     @property
     def num_reqs(self) -> int:
         return len(self.req_ids)
 
+    @cached_property
+    @deprecated("use resumed_req_ids field")
+    def resumed_from_preemption(self) -> list[bool]:
+        return [req_id in self.resumed_req_ids for req_id in self.req_ids]
+
+    @cached_property
+    @deprecated("use all_token_ids field")
+    def resumed_req_token_ids(self) -> list[list[int] | None]:
+        return [
+            self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None
+            for req_id in self.req_ids
+        ]
+
     @classmethod
-    def make_empty(cls) -> CachedRequestData:
+    def make_empty(cls) -> "CachedRequestData":
         return cls(
             req_ids=[],
-            resumed_from_preemption=[],
+            resumed_req_ids=set(),
             new_token_ids=[],
+            all_token_ids={},
             new_block_ids=[],
             num_computed_tokens=[],
+            num_output_tokens=[],
         )
 
 
 @bc_linter_include
 @dataclass
 class SchedulerOutput:
-
     # list of the requests that are scheduled for the first time.
     # We cache the request's data in each worker process, so that we don't
     # need to re-send it every scheduling step.
@@ -156,11 +183,20 @@ class SchedulerOutput:
     # freed from the encoder cache.
     free_encoder_mm_hashes: list[str]
 
-    # Dict of request ids to their index within the batch
-    # for filling the next token bitmask
-    structured_output_request_ids: dict[str, int]
-    # the bitmask for the whole batch
-    grammar_bitmask: Optional[npt.NDArray[np.int32]]
+    # Whether the scheduled requests have all the output tokens they
+    # need to perform grammar bitmask computation.
+    pending_structured_output_tokens: bool = False
 
     # KV Cache Connector metadata.
-    kv_connector_metadata: Optional[KVConnectorMetadata] = None
+    kv_connector_metadata: KVConnectorMetadata | None = None
+
+    # EC Cache Connector metadata
+    ec_connector_metadata: ECConnectorMetadata | None = None
+
+
+@dataclass
+class GrammarOutput:
+    # ids of structured output requests.
+    structured_output_request_ids: list[str]
+    # Bitmask ordered as structured_output_request_ids.
+    grammar_bitmask: "npt.NDArray[np.int32]"
diff --git a/vllm/v1/core/sched/request_queue.py b/vllm/v1/core/sched/request_queue.py
index fc2bc30b9a5fd06b4c900965138d9b73ff0fd859..7bc1010db23a26dd19943bc5d0505e3f8a0567d2 100644
--- a/vllm/v1/core/sched/request_queue.py
+++ b/vllm/v1/core/sched/request_queue.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import heapq
 from abc import ABC, abstractmethod
 from collections import deque
@@ -14,6 +12,7 @@ from vllm.v1.request import Request
 
 class SchedulingPolicy(Enum):
     """Enum for scheduling policies."""
+
     FCFS = "fcfs"
     PRIORITY = "priority"
 
@@ -42,7 +41,7 @@ class RequestQueue(ABC):
         pass
 
     @abstractmethod
-    def prepend_requests(self, requests: RequestQueue) -> None:
+    def prepend_requests(self, requests: "RequestQueue") -> None:
         """Prepend all requests from another queue to the front of this
         queue."""
         pass
@@ -111,9 +110,7 @@ class FCFSRequestQueue(deque[Request], RequestQueue):
     def remove_requests(self, requests: Iterable[Request]) -> None:
         """Remove multiple specific requests from the queue."""
         requests_to_remove = set(requests)
-        filtered_requests = [
-            req for req in self if req not in requests_to_remove
-        ]
+        filtered_requests = [req for req in self if req not in requests_to_remove]
         # deque does not support in-place filtering, so we need to clear
         # and extend
         self.clear()
@@ -150,8 +147,7 @@ class PriorityRequestQueue(RequestQueue):
 
     def add_request(self, request: Request) -> None:
         """Add a request to the queue according to priority policy."""
-        heapq.heappush(self._heap,
-                       (request.priority, request.arrival_time, request))
+        heapq.heappush(self._heap, (request.priority, request.arrival_time, request))
 
     def pop_request(self) -> Request:
         """Pop a request from the queue according to priority policy."""
@@ -169,15 +165,15 @@ class PriorityRequestQueue(RequestQueue):
 
     def prepend_request(self, request: Request) -> None:
         """Add a request to the queue according to priority policy.
-        
-        Note: In a priority queue, there is no concept of prepending to the 
+
+        Note: In a priority queue, there is no concept of prepending to the
         front. Requests are ordered by (priority, arrival_time)."""
         self.add_request(request)
 
     def prepend_requests(self, requests: RequestQueue) -> None:
         """Add all requests from another queue according to priority policy.
-        
-        Note: In a priority queue, there is no concept of prepending to the 
+
+        Note: In a priority queue, there is no concept of prepending to the
         front. Requests are ordered by (priority, arrival_time)."""
         for request in requests:
             self.add_request(request)
@@ -190,8 +186,9 @@ class PriorityRequestQueue(RequestQueue):
     def remove_requests(self, requests: Iterable[Request]) -> None:
         """Remove multiple specific requests from the queue."""
         requests_to_remove = set(requests)
-        self._heap = [(p, t, r) for p, t, r in self._heap
-                      if r not in requests_to_remove]
+        self._heap = [
+            (p, t, r) for p, t, r in self._heap if r not in requests_to_remove
+        ]
         heapq.heapify(self._heap)
 
     def __bool__(self) -> bool:
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 2b2cd63c2e20f346ba8ad0df2e32336632a10e98..4323141c435b76ad37c484f96d9a34ec4b6edd51 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1,52 +1,61 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
 import itertools
 import time
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any
 
 from vllm.config import VllmConfig
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorMetadata,
+    ECConnectorRole,
+)
+from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
 from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
-from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
-                                                          KVConnectorRole)
-from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-    KVConnectorStats)
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorBase_V1,
+    KVConnectorRole,
+    SupportsHMA,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
-                                                compute_encoder_budget)
+from vllm.v1.core.encoder_cache_manager import (
+    EncoderCacheManager,
+    compute_encoder_budget,
+)
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
 from vllm.v1.core.sched.interface import SchedulerInterface
-from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
-                                       SchedulerOutput)
-from vllm.v1.core.sched.request_queue import (SchedulingPolicy,
-                                              create_request_queue)
+from vllm.v1.core.sched.output import (
+    CachedRequestData,
+    GrammarOutput,
+    NewRequestData,
+    SchedulerOutput,
+)
+from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_queue
 from vllm.v1.core.sched.utils import check_stop, remove_all
-from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
-                            EngineCoreOutputs)
+from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import PrefixCacheStats, SchedulerStats
 from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import record_function_or_nullcontext
 
 logger = init_logger(__name__)
 
 
 class Scheduler(SchedulerInterface):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
         kv_cache_config: KVCacheConfig,
         structured_output_manager: StructuredOutputManager,
+        block_size: int,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         include_finished_set: bool = False,
         log_stats: bool = False,
@@ -66,61 +75,62 @@ class Scheduler(SchedulerInterface):
         # request ids should be included in the EngineCoreOutputs returned
         # by update_from_outputs(). This is currently used in the multi-engine
         # case to track request lifetimes efficiently.
-        self.finished_req_ids_dict: Optional[dict[int, set[str]]] = (
-            defaultdict(set) if include_finished_set else None)
+        self.finished_req_ids_dict: dict[int, set[str]] | None = (
+            defaultdict(set) if include_finished_set else None
+        )
+        self.prev_step_scheduled_req_ids: set[str] = set()
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
-        self.max_num_scheduled_tokens = \
-            self.scheduler_config.max_num_batched_tokens
-        self.max_model_len = self.scheduler_config.max_model_len
+        self.max_num_scheduled_tokens = self.scheduler_config.max_num_batched_tokens
+        self.max_model_len = vllm_config.model_config.max_model_len
         self.enable_kv_cache_events = (
             self.kv_events_config is not None
-            and self.kv_events_config.enable_kv_cache_events)
+            and self.kv_events_config.enable_kv_cache_events
+        )
 
         # Create KVConnector for the Scheduler. Note that each Worker
         # will have a corresponding KVConnector with Role=WORKER.
         # KV Connector pushes/pull of remote KVs for P/D and offloading.
         self.connector = None
+        self.connector_prefix_cache_stats: PrefixCacheStats | None = None
         if self.vllm_config.kv_transfer_config is not None:
-            assert len(self.kv_cache_config.kv_cache_groups) == 1, (
-                "Multiple KV cache groups are not currently supported "
-                "with KV connectors")
             assert not self.is_encoder_decoder, (
-                "Encoder-decoder models are not currently supported "
-                "with KV connectors")
+                "Encoder-decoder models are not currently supported with KV connectors"
+            )
             self.connector = KVConnectorFactory.create_connector(
-                config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
+                config=self.vllm_config,
+                role=KVConnectorRole.SCHEDULER,
+                kv_cache_config=self.kv_cache_config,
+            )
+            if self.log_stats:
+                self.connector_prefix_cache_stats = PrefixCacheStats()
 
         self.kv_event_publisher = EventPublisherFactory.create(
             self.kv_events_config,
             self.parallel_config.data_parallel_rank,
         )
+        self.ec_connector = None
+        if self.vllm_config.ec_transfer_config is not None:
+            self.ec_connector = ECConnectorFactory.create_connector(
+                config=self.vllm_config, role=ECConnectorRole.SCHEDULER
+            )
 
         num_gpu_blocks = self.cache_config.num_gpu_blocks
         assert num_gpu_blocks is not None and num_gpu_blocks > 0
 
-        self.block_size = self.cache_config.block_size
-
-        self.dcp_world_size = \
-            vllm_config.parallel_config.decode_context_parallel_size
-        # Note(hc): The scheduler’s block_size must be multiplied
-        # by dcp_world_size, since block hashes are computed on the
-        # original full token sequence at a granularity of
-        # original_block_size × dcp_world_size.
-        if self.dcp_world_size > 1:
-            self.block_size *= self.dcp_world_size
+        self.block_size = block_size
+        self.dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size
 
         # req_id -> Request
         self.requests: dict[str, Request] = {}
         # Scheduling policy
-        if self.scheduler_config.policy == "priority":
-            self.policy = SchedulingPolicy.PRIORITY
-        elif self.scheduler_config.policy == "fcfs":
-            self.policy = SchedulingPolicy.FCFS
-        else:
+        try:
+            self.policy = SchedulingPolicy(self.scheduler_config.policy)
+        except ValueError as e:
             raise ValueError(
-                f"Unknown scheduling policy: {self.scheduler_config.policy}")
+                f"Unknown scheduling policy: {self.scheduler_config.policy}"
+            ) from e
         # Priority queues for requests.
         self.waiting = create_request_queue(self.policy)
         self.running: list[Request] = []
@@ -133,6 +143,7 @@ class Scheduler(SchedulerInterface):
 
         # KV Connector: requests in process of async KV loading or recving
         self.finished_recving_kv_req_ids: set[str] = set()
+        self.failed_recving_kv_req_ids: set[str] = set()
 
         # Encoder-related.
         # Calculate encoder cache size if applicable
@@ -152,8 +163,7 @@ class Scheduler(SchedulerInterface):
         # NOTE: For the models without encoder (e.g., text-only models),
         # the encoder cache will not be initialized because cache size is 0
         # for these models.
-        self.encoder_cache_manager = EncoderCacheManager(
-            cache_size=encoder_cache_size)
+        self.encoder_cache_manager = EncoderCacheManager(cache_size=encoder_cache_size)
 
         speculative_config = vllm_config.speculative_config
         self.use_eagle = False
@@ -168,7 +178,7 @@ class Scheduler(SchedulerInterface):
         self.kv_cache_manager = KVCacheManager(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
-            enable_caching=self.cache_config.enable_prefix_caching,
+            enable_caching=bool(self.cache_config.enable_prefix_caching),
             use_eagle=self.use_eagle,
             log_stats=self.log_stats,
             enable_kv_cache_events=self.enable_kv_cache_events,
@@ -210,30 +220,41 @@ class Scheduler(SchedulerInterface):
         while req_index < len(self.running) and token_budget > 0:
             request = self.running[req_index]
 
-            num_new_tokens = (request.num_tokens_with_spec +
-                              request.num_output_placeholders -
-                              request.num_computed_tokens)
-            if (0 < self.scheduler_config.long_prefill_token_threshold <
-                    num_new_tokens):
-                num_new_tokens = (
-                    self.scheduler_config.long_prefill_token_threshold)
+            num_new_tokens = (
+                request.num_tokens_with_spec
+                + request.num_output_placeholders
+                - request.num_computed_tokens
+            )
+            if 0 < self.scheduler_config.long_prefill_token_threshold < num_new_tokens:
+                num_new_tokens = self.scheduler_config.long_prefill_token_threshold
             num_new_tokens = min(num_new_tokens, token_budget)
 
-            # Make sure the input position does not exceed the max model len.
-            # This is necessary when using spec decoding.
+            # Make sure the input position does not exceed the max model len or
+            # request's max_tokens.
+            # This is necessary when using spec decoding and/or async scheduling.
+            max_total_tokens = min(
+                request.num_prompt_tokens + request.max_tokens, self.max_model_len
+            )
             num_new_tokens = min(
-                num_new_tokens,
-                self.max_model_len - 1 - request.num_computed_tokens)
+                num_new_tokens, max_total_tokens - 1 - request.num_computed_tokens
+            )
 
             # Schedule encoder inputs.
             encoder_inputs_to_schedule = None
+            external_load_encoder_input: list[int] = []
             new_encoder_compute_budget = encoder_compute_budget
             if request.has_encoder_inputs:
-                (encoder_inputs_to_schedule, num_new_tokens,
-                 new_encoder_compute_budget
-                 ) = self._try_schedule_encoder_inputs(
-                     request, request.num_computed_tokens, num_new_tokens,
-                     encoder_compute_budget)
+                (
+                    encoder_inputs_to_schedule,
+                    num_new_tokens,
+                    new_encoder_compute_budget,
+                    external_load_encoder_input,
+                ) = self._try_schedule_encoder_inputs(
+                    request,
+                    request.num_computed_tokens,
+                    num_new_tokens,
+                    encoder_compute_budget,
+                )
 
             if num_new_tokens == 0:
                 # The request cannot be scheduled because one of the following
@@ -251,12 +272,19 @@ class Scheduler(SchedulerInterface):
                 req_index += 1
                 continue
 
-            while True:
-                new_blocks = self.kv_cache_manager.allocate_slots(
-                    request,
-                    num_new_tokens,
-                    num_lookahead_tokens=self.num_lookahead_tokens)
-                if new_blocks is None:
+            # Schedule newly needed KV blocks for the request.
+            with record_function_or_nullcontext("schedule: allocate_slots"):
+                while True:
+                    new_blocks = self.kv_cache_manager.allocate_slots(
+                        request,
+                        num_new_tokens,
+                        num_lookahead_tokens=self.num_lookahead_tokens,
+                    )
+
+                    if new_blocks is not None:
+                        # The request can be scheduled.
+                        break
+
                     # The request cannot be scheduled.
                     # Preempt the lowest-priority request.
                     if self.policy == SchedulingPolicy.PRIORITY:
@@ -267,6 +295,26 @@ class Scheduler(SchedulerInterface):
                         self.running.remove(preempted_req)
                         if preempted_req in scheduled_running_reqs:
                             scheduled_running_reqs.remove(preempted_req)
+                            token_budget += num_scheduled_tokens[
+                                preempted_req.request_id
+                            ]
+                            req_to_new_blocks.pop(preempted_req.request_id)
+                            num_scheduled_tokens.pop(preempted_req.request_id)
+                            scheduled_spec_decode_tokens.pop(
+                                preempted_req.request_id, None
+                            )
+                            preempted_encoder_inputs = scheduled_encoder_inputs.pop(
+                                preempted_req.request_id, None
+                            )
+                            if preempted_encoder_inputs:
+                                # Restore encoder compute budget if the preempted
+                                # request had encoder inputs scheduled in this step.
+                                num_tokens_to_restore = sum(
+                                    preempted_req.get_num_encoder_tokens(i)
+                                    for i in preempted_encoder_inputs
+                                )
+                                encoder_compute_budget += num_tokens_to_restore
+                            req_index -= 1
                     else:
                         preempted_req = self.running.pop()
 
@@ -274,23 +322,21 @@ class Scheduler(SchedulerInterface):
                     self.encoder_cache_manager.free(preempted_req)
                     preempted_req.status = RequestStatus.PREEMPTED
                     preempted_req.num_computed_tokens = 0
+                    preempted_req.num_preemptions += 1
                     if self.log_stats:
                         preempted_req.record_event(
-                            EngineCoreEventType.PREEMPTED, scheduled_timestamp)
+                            EngineCoreEventType.PREEMPTED, scheduled_timestamp
+                        )
 
                     self.waiting.prepend_request(preempted_req)
                     preempted_reqs.append(preempted_req)
                     if preempted_req == request:
-                        # No more request to preempt.
-                        can_schedule = False
+                        # No more request to preempt. Cannot schedule this request.
                         break
-                else:
-                    # The request can be scheduled.
-                    can_schedule = True
-                    break
-            if not can_schedule:
+
+            if new_blocks is None:
+                # Cannot schedule this request.
                 break
-            assert new_blocks is not None
 
             # Schedule the request.
             scheduled_running_reqs.append(request)
@@ -301,30 +347,45 @@ class Scheduler(SchedulerInterface):
 
             # Speculative decode related.
             if request.spec_token_ids:
-                num_scheduled_spec_tokens = (num_new_tokens +
-                                             request.num_computed_tokens -
-                                             request.num_tokens)
+                num_scheduled_spec_tokens = (
+                    num_new_tokens
+                    + request.num_computed_tokens
+                    - request.num_tokens
+                    - request.num_output_placeholders
+                )
                 if num_scheduled_spec_tokens > 0:
                     # Trim spec_token_ids list to num_scheduled_spec_tokens.
                     del request.spec_token_ids[num_scheduled_spec_tokens:]
                     scheduled_spec_decode_tokens[request.request_id] = (
-                        request.spec_token_ids)
+                        request.spec_token_ids
+                    )
+                # New spec tokens will be set in `update_draft_token_ids` before the
+                # next step when applicable.
+                request.spec_token_ids = []
 
             # Encoder-related.
             if encoder_inputs_to_schedule:
                 scheduled_encoder_inputs[request.request_id] = (
-                    encoder_inputs_to_schedule)
+                    encoder_inputs_to_schedule
+                )
                 # Allocate the encoder cache.
                 for i in encoder_inputs_to_schedule:
                     self.encoder_cache_manager.allocate(request, i)
                 encoder_compute_budget = new_encoder_compute_budget
+            if external_load_encoder_input:
+                for i in external_load_encoder_input:
+                    self.encoder_cache_manager.allocate(request, i)
+                    if self.ec_connector is not None:
+                        self.ec_connector.update_state_after_alloc(request, i)
 
         # Record the LoRAs in scheduled_running_reqs
         scheduled_loras: set[int] = set()
         if self.lora_config:
             scheduled_loras = set(
-                req.lora_request.lora_int_id for req in scheduled_running_reqs
-                if req.lora_request and req.lora_request.lora_int_id > 0)
+                req.lora_request.lora_int_id
+                for req in scheduled_running_reqs
+                if req.lora_request and req.lora_request.lora_int_id > 0
+            )
             assert len(scheduled_loras) <= self.lora_config.max_loras
 
         # Use a temporary RequestQueue to collect requests that need to be
@@ -347,7 +408,8 @@ class Scheduler(SchedulerInterface):
                     else:
                         logger.debug(
                             "%s is still in WAITING_FOR_REMOTE_KVS state.",
-                            request.request_id)
+                            request.request_id,
+                        )
                         self.waiting.pop_request()
                         skipped_waiting_requests.prepend_request(request)
                         continue
@@ -365,9 +427,14 @@ class Scheduler(SchedulerInterface):
 
                 # Check that adding the request still respects the max_loras
                 # constraint.
-                if (self.lora_config and request.lora_request and
-                    (len(scheduled_loras) == self.lora_config.max_loras and
-                     request.lora_request.lora_int_id not in scheduled_loras)):
+                if (
+                    self.lora_config
+                    and request.lora_request
+                    and (
+                        len(scheduled_loras) == self.lora_config.max_loras
+                        and request.lora_request.lora_int_id not in scheduled_loras
+                    )
+                ):
                     # Scheduling would exceed max_loras, skip.
                     self.waiting.pop_request()
                     skipped_waiting_requests.prepend_request(request)
@@ -379,17 +446,19 @@ class Scheduler(SchedulerInterface):
                 # Get already-cached tokens.
                 if request.num_computed_tokens == 0:
                     # Get locally-cached tokens.
-                    new_computed_blocks, num_new_local_computed_tokens = \
-                        self.kv_cache_manager.get_computed_blocks(
-                            request)
+                    new_computed_blocks, num_new_local_computed_tokens = (
+                        self.kv_cache_manager.get_computed_blocks(request)
+                    )
 
                     # Get externally-cached tokens if using a KVConnector.
                     if self.connector is not None:
-                        num_external_computed_tokens, load_kv_async = (
+                        ext_tokens, load_kv_async = (
                             self.connector.get_num_new_matched_tokens(
-                                request, num_new_local_computed_tokens))
+                                request, num_new_local_computed_tokens
+                            )
+                        )
 
-                        if num_external_computed_tokens is None:
+                        if ext_tokens is None:
                             # The request cannot be scheduled because
                             # the KVConnector couldn't determine
                             # the number of matched tokens.
@@ -397,39 +466,43 @@ class Scheduler(SchedulerInterface):
                             skipped_waiting_requests.prepend_request(request)
                             continue
 
+                        num_external_computed_tokens = ext_tokens
+
                     # Total computed tokens (local + external).
-                    num_computed_tokens = (num_new_local_computed_tokens +
-                                           num_external_computed_tokens)
-                # KVTransfer: WAITING reqs have num_computed_tokens > 0
-                # after async KV recvs are completed.
+                    num_computed_tokens = (
+                        num_new_local_computed_tokens + num_external_computed_tokens
+                    )
                 else:
-                    new_computed_blocks = (
-                        self.kv_cache_manager.create_empty_block_list())
+                    # KVTransfer: WAITING reqs have num_computed_tokens > 0
+                    # after async KV recvs are completed.
+                    new_computed_blocks = self.kv_cache_manager.empty_kv_cache_blocks
                     num_new_local_computed_tokens = 0
                     num_computed_tokens = request.num_computed_tokens
 
                 encoder_inputs_to_schedule = None
+                external_load_encoder_input = []
                 new_encoder_compute_budget = encoder_compute_budget
 
-                # KVTransfer: loading remote KV, do not allocate for new work.
                 if load_kv_async:
+                    # KVTransfer: loading remote KV, do not allocate for new work.
                     assert num_external_computed_tokens > 0
                     num_new_tokens = 0
-                # Number of tokens to be scheduled.
                 else:
+                    # Number of tokens to be scheduled.
                     # We use `request.num_tokens` instead of
                     # `request.num_prompt_tokens` to consider the resumed
                     # requests, which have output tokens.
                     num_new_tokens = request.num_tokens - num_computed_tokens
-                    if (0 < self.scheduler_config.long_prefill_token_threshold
-                            < num_new_tokens):
-                        num_new_tokens = (
-                            self.scheduler_config.long_prefill_token_threshold)
+                    threshold = self.scheduler_config.long_prefill_token_threshold
+                    if 0 < threshold < num_new_tokens:
+                        num_new_tokens = threshold
 
                     # chunked prefill has to be enabled explicitly to allow
                     # pooling requests to be chunked
-                    if not self.scheduler_config.chunked_prefill_enabled and \
-                        num_new_tokens > token_budget:
+                    if (
+                        not self.scheduler_config.enable_chunked_prefill
+                        and num_new_tokens > token_budget
+                    ):
                         self.waiting.pop_request()
                         skipped_waiting_requests.prepend_request(request)
                         continue
@@ -439,11 +512,17 @@ class Scheduler(SchedulerInterface):
 
                     # Schedule encoder inputs.
                     if request.has_encoder_inputs:
-                        (encoder_inputs_to_schedule, num_new_tokens,
-                         new_encoder_compute_budget
-                         ) = self._try_schedule_encoder_inputs(
-                             request, num_computed_tokens, num_new_tokens,
-                             encoder_compute_budget)
+                        (
+                            encoder_inputs_to_schedule,
+                            num_new_tokens,
+                            new_encoder_compute_budget,
+                            external_load_encoder_input,
+                        ) = self._try_schedule_encoder_inputs(
+                            request,
+                            num_computed_tokens,
+                            num_new_tokens,
+                            encoder_compute_budget,
+                        )
                         if num_new_tokens == 0:
                             # The request cannot be scheduled.
                             break
@@ -453,9 +532,9 @@ class Scheduler(SchedulerInterface):
                 # extra block gets allocated which
                 # creates a mismatch between the number
                 # of local and remote blocks.
-                effective_lookahead_tokens = (0 if request.num_computed_tokens
-                                              == 0 else
-                                              self.num_lookahead_tokens)
+                effective_lookahead_tokens = (
+                    0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
+                )
 
                 # Determine if we need to allocate cross-attention blocks.
                 if self.is_encoder_decoder and request.has_encoder_inputs:
@@ -463,8 +542,9 @@ class Scheduler(SchedulerInterface):
                     # always padded to the maximum length. If we support other
                     # encoder-decoder models, this will need to be updated if we
                     # want to only allocate what is needed.
-                    num_encoder_tokens =\
+                    num_encoder_tokens = (
                         self.scheduler_config.max_num_encoder_input_tokens
+                    )
                 else:
                     num_encoder_tokens = 0
 
@@ -492,6 +572,9 @@ class Scheduler(SchedulerInterface):
                         new_computed_blocks + new_blocks,
                         num_external_computed_tokens,
                     )
+                    self._update_connector_prefix_cache_stats(
+                        request, num_external_computed_tokens
+                    )
 
                 # Request was already popped from self.waiting
                 # unless it was re-added above due to new_blocks being None.
@@ -506,20 +589,21 @@ class Scheduler(SchedulerInterface):
                 req_index += 1
                 self.running.append(request)
                 if self.log_stats:
-                    request.record_event(EngineCoreEventType.SCHEDULED,
-                                         scheduled_timestamp)
+                    request.record_event(
+                        EngineCoreEventType.SCHEDULED, scheduled_timestamp
+                    )
                 if request.status == RequestStatus.WAITING:
                     scheduled_new_reqs.append(request)
                 elif request.status == RequestStatus.PREEMPTED:
                     scheduled_resumed_reqs.append(request)
                 else:
-                    raise RuntimeError(
-                        f"Invalid request status: {request.status}")
+                    raise RuntimeError(f"Invalid request status: {request.status}")
 
                 if self.lora_config and request.lora_request:
                     scheduled_loras.add(request.lora_request.lora_int_id)
                 req_to_new_blocks[request.request_id] = (
-                    self.kv_cache_manager.get_blocks(request.request_id))
+                    self.kv_cache_manager.get_blocks(request.request_id)
+                )
                 num_scheduled_tokens[request.request_id] = num_new_tokens
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
@@ -530,12 +614,18 @@ class Scheduler(SchedulerInterface):
                 # Encoder-related.
                 if encoder_inputs_to_schedule:
                     scheduled_encoder_inputs[request.request_id] = (
-                        encoder_inputs_to_schedule)
+                        encoder_inputs_to_schedule
+                    )
                     # Allocate the encoder cache.
                     for i in encoder_inputs_to_schedule:
                         self.encoder_cache_manager.allocate(request, i)
                     encoder_compute_budget = new_encoder_compute_budget
-
+                # Allocate for external load encoder cache
+                if external_load_encoder_input:
+                    for i in external_load_encoder_input:
+                        self.encoder_cache_manager.allocate(request, i)
+                        if self.ec_connector is not None:
+                            self.ec_connector.update_state_after_alloc(request, i)
         # Put back any skipped requests at the head of the waiting queue
         if skipped_waiting_requests:
             self.waiting.prepend_requests(skipped_waiting_requests)
@@ -543,42 +633,48 @@ class Scheduler(SchedulerInterface):
         # Check if the scheduling constraints are satisfied.
         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
         assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
+
         assert token_budget >= 0
         assert len(self.running) <= self.max_num_running_reqs
         # Since some requests in the RUNNING queue may not be scheduled in
         # this step, the total number of scheduled requests can be smaller than
         # len(self.running).
-        assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
-                len(scheduled_running_reqs) <= len(self.running))
+        assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len(
+            scheduled_running_reqs
+        ) <= len(self.running)
 
         # Get the longest common prefix among all requests in the running queue.
         # This can be potentially used for cascade attention.
-        num_common_prefix_blocks = [0] * len(
-            self.kv_cache_config.kv_cache_groups)
-        if self.running:
-            any_request = self.running[0]
-            num_common_prefix_blocks = (
-                self.kv_cache_manager.get_num_common_prefix_blocks(
-                    any_request, len(self.running)))
+        num_common_prefix_blocks = [0] * len(self.kv_cache_config.kv_cache_groups)
+        with record_function_or_nullcontext("schedule: get_num_common_prefix_blocks"):
+            if self.running:
+                any_request = self.running[0]
+                num_common_prefix_blocks = (
+                    self.kv_cache_manager.get_num_common_prefix_blocks(
+                        any_request.request_id
+                    )
+                )
 
         # Construct the scheduler output.
         new_reqs_data = [
             NewRequestData.from_request(
-                req, req_to_new_blocks[req.request_id].get_block_ids())
+                req, req_to_new_blocks[req.request_id].get_block_ids()
+            )
             for req in scheduled_new_reqs
         ]
-        cached_reqs_data = self._make_cached_request_data(
-            scheduled_running_reqs,
-            scheduled_resumed_reqs,
-            num_scheduled_tokens,
-            scheduled_spec_decode_tokens,
-            req_to_new_blocks,
-        )
-        scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
-                              scheduled_resumed_reqs)
-        structured_output_request_ids, grammar_bitmask = (
-            self.get_grammar_bitmask(scheduled_requests,
-                                     scheduled_spec_decode_tokens))
+        with record_function_or_nullcontext("schedule: make_cached_request_data"):
+            cached_reqs_data = self._make_cached_request_data(
+                scheduled_running_reqs,
+                scheduled_resumed_reqs,
+                num_scheduled_tokens,
+                scheduled_spec_decode_tokens,
+                req_to_new_blocks,
+            )
+
+        # Record the request ids that were scheduled in this step.
+        self.prev_step_scheduled_req_ids.clear()
+        self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
+
         scheduler_output = SchedulerOutput(
             scheduled_new_reqs=new_reqs_data,
             scheduled_cached_reqs=cached_reqs_data,
@@ -592,10 +688,7 @@ class Scheduler(SchedulerInterface):
             # It contains the request IDs that are finished in between
             # the previous and the current steps.
             finished_req_ids=self.finished_req_ids,
-            free_encoder_mm_hashes=self.encoder_cache_manager.
-            get_freed_mm_hashes(),
-            structured_output_request_ids=structured_output_request_ids,
-            grammar_bitmask=grammar_bitmask,
+            free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(),
         )
 
         # NOTE(Kuntai): this function is designed for multiple purposes:
@@ -603,27 +696,20 @@ class Scheduler(SchedulerInterface):
         # 2. Wrap up all the KV cache load / save ops into an opaque object
         # 3. Clear the internal states of the connector
         if self.connector is not None:
-            meta = self.connector.build_connector_meta(scheduler_output)
+            meta: KVConnectorMetadata = self.connector.build_connector_meta(
+                scheduler_output
+            )
             scheduler_output.kv_connector_metadata = meta
 
-        # collect KV cache events from KV cache manager
-        events = self.kv_cache_manager.take_events()
-
-        # collect KV cache events from connector
-        if self.connector is not None:
-            connector_events = self.connector.take_events()
-            if connector_events:
-                if events is None:
-                    events = list(connector_events)
-                else:
-                    events.extend(connector_events)
-
-        # publish collected KV cache events
-        if events:
-            batch = KVEventBatch(ts=time.time(), events=events)
-            self.kv_event_publisher.publish(batch)
+        # Build the connector meta for ECConnector
+        if self.ec_connector is not None:
+            ec_meta: ECConnectorMetadata = self.ec_connector.build_connector_meta(
+                scheduler_output
+            )
+            scheduler_output.ec_connector_metadata = ec_meta
 
-        self._update_after_schedule(scheduler_output)
+        with record_function_or_nullcontext("schedule: update_after_schedule"):
+            self._update_after_schedule(scheduler_output)
         return scheduler_output
 
     def _update_after_schedule(
@@ -667,43 +753,51 @@ class Scheduler(SchedulerInterface):
     ) -> CachedRequestData:
         req_ids: list[str] = []
         new_token_ids: list[list[int]] = []
-        new_block_ids: list[Optional[tuple[list[int], ...]]] = []
+        new_block_ids: list[tuple[list[int], ...] | None] = []
+        all_token_ids: dict[str, list[int]] = {}
         num_computed_tokens: list[int] = []
+        num_output_tokens: list[int] = []
+        resumed_req_ids = set()
 
-        use_connector = self.connector is not None
-        for req in itertools.chain(running_reqs, resumed_reqs):
+        num_running_reqs = len(running_reqs)
+        for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)):
             req_id = req.request_id
             req_ids.append(req_id)
-            num_tokens = (num_scheduled_tokens[req_id] -
-                          len(spec_decode_tokens.get(req_id, ())))
+            num_tokens = num_scheduled_tokens[req_id] - len(
+                spec_decode_tokens.get(req_id, ())
+            )
             if self.use_pp:
                 # When using PP, the scheduler sends the sampled tokens back,
                 # because there's no direct communication between the first-
                 # stage worker and the last-stage worker. Otherwise, we don't
                 # need to send the sampled tokens back because the model runner
                 # will cache them.
-                token_ids = req.all_token_ids[req.num_computed_tokens:req.
-                                              num_computed_tokens + num_tokens]
+                token_ids = req.all_token_ids[
+                    req.num_computed_tokens : req.num_computed_tokens + num_tokens
+                ]
                 new_token_ids.append(token_ids)
-            elif use_connector:
-                # When using a KVConnector, we add a placeholder to avoid index
-                # out of bounds errors. TODO: Remove this once the KVConnector
-                # is updated to handle token IDs properly.
-                new_token_ids.append([])
+            scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids
+            if idx >= num_running_reqs:
+                assert not scheduled_in_prev_step
+                resumed_req_ids.add(req_id)
+            if not scheduled_in_prev_step:
+                all_token_ids[req_id] = req.all_token_ids.copy()
             new_block_ids.append(
-                req_to_new_blocks[req_id].get_block_ids(allow_none=True))
+                req_to_new_blocks[req_id].get_block_ids(allow_none=True)
+            )
             num_computed_tokens.append(req.num_computed_tokens)
-        # Because resumed_reqs is usually empty, it is more efficient to do
-        # in-place appending so that we don't need to allocate a new list.
-        resumed_from_preemption = [False] * len(running_reqs)
-        resumed_from_preemption += [True] * len(resumed_reqs)
+            num_output_tokens.append(
+                req.num_output_tokens + req.num_output_placeholders
+            )
 
         return CachedRequestData(
             req_ids=req_ids,
-            resumed_from_preemption=resumed_from_preemption,
+            resumed_req_ids=resumed_req_ids,
             new_token_ids=new_token_ids,
+            all_token_ids=all_token_ids,
             new_block_ids=new_block_ids,
             num_computed_tokens=num_computed_tokens,
+            num_output_tokens=num_output_tokens,
         )
 
     def _try_schedule_encoder_inputs(
@@ -712,7 +806,7 @@ class Scheduler(SchedulerInterface):
         num_computed_tokens: int,
         num_new_tokens: int,
         encoder_compute_budget: int,
-    ) -> tuple[list[int], int, int]:
+    ) -> tuple[list[int], int, int, list[int]]:
         """
         Determine which encoder inputs need to be scheduled in the current step,
         and update `num_new_tokens` and encoder token budget accordingly.
@@ -722,6 +816,7 @@ class Scheduler(SchedulerInterface):
         in this step, i.e.,
         [num_computed_tokens, num_computed_tokens + num_new_tokens).
         - It is not already computed and stored in the encoder cache.
+        - It is not exist on remote encoder cache (via ECConnector)
         - There is sufficient encoder token budget to process it.
         - The encoder cache has space to store it.
 
@@ -733,12 +828,16 @@ class Scheduler(SchedulerInterface):
         blocks and externally cached blocks (via KVConnector).
         """
         if num_new_tokens == 0 or not request.has_encoder_inputs:
-            return [], num_new_tokens, encoder_compute_budget
+            return [], num_new_tokens, encoder_compute_budget, []
         encoder_inputs_to_schedule: list[int] = []
         mm_features = request.mm_features
         assert mm_features is not None
         assert len(mm_features) > 0
+        external_load_encoder_input = []
 
+        # Check remote cache first
+        if self.ec_connector is not None:
+            remote_cache_has_item = self.ec_connector.has_caches(request)
         # NOTE: since scheduler operates on the request level (possibly with
         # multiple encoder inputs per request), we need to create temporary
         # trackers for accounting at the encoder input level.
@@ -758,7 +857,8 @@ class Scheduler(SchedulerInterface):
             if self.is_encoder_decoder and num_computed_tokens > 0:
                 assert start_pos == 0, (
                     "Encoder input should be processed at the beginning of "
-                    "the sequence when encoder-decoder models are used.")
+                    "the sequence when encoder-decoder models are used."
+                )
                 # Encoder input has already been computed
                 # The calculation here is a bit different. We don't turn encoder
                 # output into tokens that get processed by the decoder and
@@ -782,8 +882,7 @@ class Scheduler(SchedulerInterface):
                     # current step.
                     continue
 
-                if self.encoder_cache_manager.check_and_update_cache(
-                        request, i):
+                if self.encoder_cache_manager.check_and_update_cache(request, i):
                     # The encoder input is already computed and cached from a
                     # previous step.
                     continue
@@ -791,16 +890,18 @@ class Scheduler(SchedulerInterface):
             # If no encoder input chunking is allowed, we do not want to
             # partially schedule a multimodal item. If the scheduled range would
             # only cover part of the mm input, roll back to before the mm item.
-            if (self.scheduler_config.disable_chunked_mm_input
-                    and num_computed_tokens < start_pos
-                    and (num_computed_tokens + num_new_tokens)
-                    < (start_pos + num_encoder_tokens)):
+            if (
+                self.scheduler_config.disable_chunked_mm_input
+                and num_computed_tokens < start_pos
+                and (num_computed_tokens + num_new_tokens)
+                < (start_pos + num_encoder_tokens)
+            ):
                 num_new_tokens = start_pos - num_computed_tokens
                 break
 
             if not self.encoder_cache_manager.can_allocate(
-                    request, i, encoder_compute_budget,
-                    num_tokens_to_schedule):
+                request, i, encoder_compute_budget, num_tokens_to_schedule
+            ):
                 # The encoder cache is full or the encoder budget is exhausted.
                 # NOTE(woosuk): We assume that the encoder input tokens should
                 # be processed altogether, as the encoder usually uses
@@ -817,6 +918,12 @@ class Scheduler(SchedulerInterface):
                     num_new_tokens = 0
                 break
 
+            if self.ec_connector is not None and remote_cache_has_item[i]:
+                mm_hashes_to_schedule.add(request.mm_features[i].identifier)
+                external_load_encoder_input.append(i)
+                num_tokens_to_schedule += num_encoder_tokens
+                continue
+
             num_tokens_to_schedule += num_encoder_tokens
             encoder_compute_budget -= num_encoder_tokens
             mm_hashes_to_schedule.add(request.mm_features[i].identifier)
@@ -826,37 +933,33 @@ class Scheduler(SchedulerInterface):
             encoder_inputs_to_schedule,
             num_new_tokens,
             encoder_compute_budget,
+            external_load_encoder_input,
         )
 
     def get_grammar_bitmask(
         self,
-        requests: list[Request],
-        scheduled_spec_decode_tokens: dict[str, list[int]],
-    ):
-        # NOTE: structured_output_request_ids maps
-        # a request's (request that uses structured output)
-        # request_id to its index in the batch.
-        # This will help us determine to slice the grammar bitmask
-        # and only applies valid mask for requests that
-        # uses structured decoding.
-        structured_output_request_ids: dict[str, int] = {}
-        for i, req in enumerate(requests):
-            if req.use_structured_output:
-                # PERF: in case of chunked prefill,
-                # request might not include any new tokens.
-                # Therefore, we might introduce some additional
-                # cycle to fill in the bitmask, which could be a big no-op.
-                structured_output_request_ids[req.request_id] = i
-
+        scheduler_output: SchedulerOutput,
+    ) -> GrammarOutput | None:
+        # Collect list of scheduled request ids that use structured output.
+        # The corresponding rows of the bitmask will be in this order.
+        # PERF: in case of chunked prefill,
+        # request might not include any new tokens.
+        # Therefore, we might introduce some additional
+        # cycle to fill in the bitmask, which could be a big no-op.
+        structured_output_request_ids = [
+            req_id
+            for req_id in scheduler_output.num_scheduled_tokens
+            if (req := self.requests.get(req_id)) and req.use_structured_output
+        ]
         if not structured_output_request_ids:
-            bitmask = None
-        else:
-            bitmask = self.structured_output_manager.grammar_bitmask(
-                self.requests,
-                structured_output_request_ids,
-                scheduled_spec_decode_tokens,
-            )
-        return structured_output_request_ids, bitmask
+            return None
+
+        bitmask = self.structured_output_manager.grammar_bitmask(
+            self.requests,
+            structured_output_request_ids,
+            scheduler_output.scheduled_spec_decode_tokens,
+        )
+        return GrammarOutput(structured_output_request_ids, bitmask)
 
     def update_from_output(
         self,
@@ -872,9 +975,23 @@ class Scheduler(SchedulerInterface):
         kv_connector_output = model_runner_output.kv_connector_output
 
         outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
-        spec_decoding_stats: Optional[SpecDecodingStats] = None
-        kv_connector_stats = (kv_connector_output.kv_connector_stats
-                              if kv_connector_output else None)
+        spec_decoding_stats: SpecDecodingStats | None = None
+        kv_connector_stats: KVConnectorStats | None = (
+            kv_connector_output.kv_connector_stats if kv_connector_output else None
+        )
+        if kv_connector_stats and self.connector:
+            kv_stats = self.connector.get_kv_connector_stats()
+            if kv_stats:
+                kv_connector_stats = kv_connector_stats.aggregate(kv_stats)
+
+        failed_kv_load_req_ids = None
+        if kv_connector_output and kv_connector_output.invalid_block_ids:
+            # These blocks contain externally computed tokens that failed to
+            # load. Identify affected requests and adjust their computed token
+            # count to trigger recomputation of the invalid blocks.
+            failed_kv_load_req_ids = self._handle_invalid_blocks(
+                kv_connector_output.invalid_block_ids
+            )
 
         # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more,
         # the below loop can be a performance bottleneck. We should do our best
@@ -883,6 +1000,9 @@ class Scheduler(SchedulerInterface):
         stopped_preempted_reqs: set[Request] = set()
         for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
             assert num_tokens_scheduled > 0
+            if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
+                # Skip requests that were recovered from KV load failure
+                continue
             request = self.requests.get(req_id)
             if request is None:
                 # The request is already finished. This can happen if the
@@ -891,11 +1011,13 @@ class Scheduler(SchedulerInterface):
                 continue
 
             req_index = model_runner_output.req_id_to_index[req_id]
-            generated_token_ids = sampled_token_ids[
-                req_index] if sampled_token_ids else []
+            generated_token_ids: list[int] = (
+                sampled_token_ids[req_index].tolist() if sampled_token_ids else []
+            )
 
             scheduled_spec_token_ids = (
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id))
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id)
+            )
             if scheduled_spec_token_ids:
                 num_draft_tokens = len(scheduled_spec_token_ids)
                 num_accepted = len(generated_token_ids) - 1
@@ -905,11 +1027,17 @@ class Scheduler(SchedulerInterface):
                 # tokens and rejections. If some tokens are rejected,
                 # num_computed_tokens is decreased by the number of rejected
                 # tokens.
-                request.num_computed_tokens -= num_rejected
+                if request.num_computed_tokens > 0:
+                    request.num_computed_tokens -= num_rejected
+                # If async scheduling, num_output_placeholders also includes
+                # the scheduled spec tokens count and so is similarly adjusted.
+                if request.num_output_placeholders > 0:
+                    request.num_output_placeholders -= num_rejected
                 spec_decoding_stats = self.make_spec_decoding_stats(
                     spec_decoding_stats,
                     num_draft_tokens=num_draft_tokens,
-                    num_accepted_tokens=num_accepted)
+                    num_accepted_tokens=num_accepted,
+                )
 
             stopped = False
             new_logprobs = None
@@ -920,14 +1048,14 @@ class Scheduler(SchedulerInterface):
             # Check for stop and update request status.
             if new_token_ids:
                 new_token_ids, stopped = self._update_request_with_output(
-                    request, new_token_ids)
+                    request, new_token_ids
+                )
 
             # Stop checking for pooler models.
             pooler_output = None
             if pooler_outputs:
                 pooler_output = pooler_outputs[req_index]
-                stopped = check_stop(request, self.max_model_len,
-                                     pooler_output)
+                stopped = check_stop(request, self.max_model_len, pooler_output)
 
             if stopped:
                 kv_transfer_params = self._free_request(request)
@@ -937,28 +1065,27 @@ class Scheduler(SchedulerInterface):
                     stopped_preempted_reqs.add(request)
 
             # Extract sample logprobs if needed.
-            if request.sampling_params is not None \
-                and request.sampling_params.logprobs is not None and logprobs:
+            if (
+                request.sampling_params is not None
+                and request.sampling_params.logprobs is not None
+                and logprobs
+            ):
                 # NOTE: once we support N tokens per step (spec decode),
                 # the outer lists can be of length > 1.
                 new_logprobs = logprobs.slice(req_index, req_index + 1)
 
-            if new_token_ids and self.structured_output_manager.should_advance(
-                    request):
-                # NOTE: structured_output_request
-                # should not be None if use_structured_output, we have
-                # checked above, so safe to ignore type warning
-                request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
-                    req_id, new_token_ids)
+            if new_token_ids and self.structured_output_manager.should_advance(request):
+                struct_output_request = request.structured_output_request
+                assert struct_output_request is not None
+                assert struct_output_request.grammar is not None
+                struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
 
             if num_nans_in_logits is not None and req_id in num_nans_in_logits:
                 request.num_nans_in_logits = num_nans_in_logits[req_id]
 
             # Get prompt logprobs for this request.
             prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
-            if new_token_ids or pooler_output is not None \
-                or kv_transfer_params:
-
+            if new_token_ids or pooler_output is not None or kv_transfer_params:
                 # Add EngineCoreOutput for this Request.
                 outputs[request.client_index].append(
                     EngineCoreOutput(
@@ -973,7 +1100,9 @@ class Scheduler(SchedulerInterface):
                         kv_transfer_params=kv_transfer_params,
                         trace_headers=request.trace_headers,
                         num_cached_tokens=request.num_cached_tokens,
-                    ))
+                        num_nans_in_logits=request.num_nans_in_logits,
+                    )
+                )
             else:
                 # Invariant: EngineCore returns no partial prefill outputs.
                 assert not prompt_logprobs_tensors
@@ -986,9 +1115,25 @@ class Scheduler(SchedulerInterface):
             self.waiting.remove_requests(stopped_preempted_reqs)
 
         # KV Connector: update state for finished KV Transfers.
-        if model_runner_output.kv_connector_output:
-            self._update_from_kv_xfer_finished(
-                model_runner_output.kv_connector_output)
+        if kv_connector_output:
+            self._update_from_kv_xfer_finished(kv_connector_output)
+
+        # collect KV cache events from KV cache manager
+        events = self.kv_cache_manager.take_events()
+
+        # collect KV cache events from connector
+        if self.connector is not None:
+            connector_events = self.connector.take_events()
+            if connector_events:
+                if events is None:
+                    events = list(connector_events)
+                else:
+                    events.extend(connector_events)
+
+        # publish collected KV cache events
+        if events:
+            batch = KVEventBatch(ts=time.time(), events=events)
+            self.kv_event_publisher.publish(batch)
 
         # Create EngineCoreOutputs for all clients that have requests with
         # outputs in this step.
@@ -1007,11 +1152,13 @@ class Scheduler(SchedulerInterface):
                     eco.finished_requests = finished_set
                 else:
                     engine_core_outputs[client_index] = EngineCoreOutputs(
-                        finished_requests=finished_set)
+                        finished_requests=finished_set
+                    )
             finished_req_ids.clear()
 
-        if (stats := self.make_stats(spec_decoding_stats,
-                                     kv_connector_stats)) is not None:
+        if (
+            stats := self.make_stats(spec_decoding_stats, kv_connector_stats)
+        ) is not None:
             # Return stats to only one of the front-ends.
             if (eco := next(iter(engine_core_outputs.values()), None)) is None:
                 # We must return the stats even if there are no request
@@ -1042,8 +1189,9 @@ class Scheduler(SchedulerInterface):
         return new_token_ids, stopped
 
     def _free_encoder_inputs(self, request: Request) -> None:
-        cached_encoder_input_ids = (
-            self.encoder_cache_manager.get_cached_input_ids(request))
+        cached_encoder_input_ids = self.encoder_cache_manager.get_cached_input_ids(
+            request
+        )
         # OPTIMIZATION: Avoid list(set) if the set is empty.
         if not cached_encoder_input_ids:
             return
@@ -1058,21 +1206,19 @@ class Scheduler(SchedulerInterface):
                 # With Whisper, as soon as we've generated a single token,
                 # we know we're done with the encoder input. Cross Attention
                 # KVs have been calculated and cached already.
-                self.encoder_cache_manager.free_encoder_input(
-                    request, input_id)
+                self.encoder_cache_manager.free_encoder_input(request, input_id)
             elif start_pos + num_tokens <= request.num_computed_tokens:
                 # The encoder output is already processed and stored
                 # in the decoder's KV cache.
-                self.encoder_cache_manager.free_encoder_input(
-                    request, input_id)
+                self.encoder_cache_manager.free_encoder_input(request, input_id)
 
     def update_draft_token_ids(
         self,
         draft_token_ids: DraftTokenIds,
     ) -> None:
         for req_id, spec_token_ids in zip(
-                draft_token_ids.req_ids,
-                draft_token_ids.draft_token_ids,
+            draft_token_ids.req_ids,
+            draft_token_ids.draft_token_ids,
         ):
             request = self.requests.get(req_id)
             if request is None or request.is_finished():
@@ -1080,13 +1226,11 @@ class Scheduler(SchedulerInterface):
                 continue
 
             # Add newly generated spec token ids to the request.
-            if not spec_token_ids:
-                # NOTE(woosuk): request.spec_token_ids should be updated.
-                request.spec_token_ids.clear()
-            elif self.structured_output_manager.should_advance(request):
+            if self.structured_output_manager.should_advance(request):
                 metadata = request.structured_output_request
                 request.spec_token_ids = metadata.grammar.validate_tokens(  # type: ignore[union-attr]
-                    spec_token_ids)
+                    spec_token_ids
+                )
             else:
                 request.spec_token_ids = spec_token_ids
 
@@ -1102,7 +1246,7 @@ class Scheduler(SchedulerInterface):
 
     def finish_requests(
         self,
-        request_ids: Union[str, Iterable[str]],
+        request_ids: str | Iterable[str],
         finished_status: RequestStatus,
     ) -> None:
         """Handles the finish signal from outside the scheduler.
@@ -1112,7 +1256,7 @@ class Scheduler(SchedulerInterface):
         """
         assert RequestStatus.is_finished(finished_status)
         if isinstance(request_ids, str):
-            request_ids = (request_ids, )
+            request_ids = (request_ids,)
         else:
             request_ids = set(request_ids)
 
@@ -1123,7 +1267,7 @@ class Scheduler(SchedulerInterface):
         # First pass: collect requests to remove from queues
         for req_id in request_ids:
             request = self.requests.get(req_id)
-            if request is None:
+            if request is None or request.is_finished():
                 # Invalid request ID.
                 continue
 
@@ -1144,7 +1288,7 @@ class Scheduler(SchedulerInterface):
             request.status = finished_status
             self._free_request(request)
 
-    def _free_request(self, request: Request) -> Optional[dict[str, Any]]:
+    def _free_request(self, request: Request) -> dict[str, Any] | None:
         assert request.is_finished()
 
         delay_free_blocks, kv_xfer_params = self._connector_finished(request)
@@ -1175,36 +1319,37 @@ class Scheduler(SchedulerInterface):
 
     def make_stats(
         self,
-        spec_decoding_stats: Optional[SpecDecodingStats] = None,
-        kv_connector_stats: Optional[KVConnectorStats] = None,
-    ) -> Optional[SchedulerStats]:
+        spec_decoding_stats: SpecDecodingStats | None = None,
+        kv_connector_stats: KVConnectorStats | None = None,
+    ) -> SchedulerStats | None:
         if not self.log_stats:
             return None
         prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
         assert prefix_cache_stats is not None
-        return SchedulerStats(num_running_reqs=len(self.running),
-                              num_waiting_reqs=len(self.waiting),
-                              kv_cache_usage=self.kv_cache_manager.usage,
-                              prefix_cache_stats=prefix_cache_stats,
-                              spec_decoding_stats=spec_decoding_stats,
-                              num_corrupted_reqs=sum(req.is_output_corrupted
-                                                     for req in self.running),
-                              kv_connector_stats=kv_connector_stats.data
-                              if kv_connector_stats else None)
+        connector_prefix_cache_stats = self._make_connector_prefix_cache_stats()
+        return SchedulerStats(
+            num_running_reqs=len(self.running),
+            num_waiting_reqs=len(self.waiting),
+            kv_cache_usage=self.kv_cache_manager.usage,
+            prefix_cache_stats=prefix_cache_stats,
+            connector_prefix_cache_stats=connector_prefix_cache_stats,
+            spec_decoding_stats=spec_decoding_stats,
+            kv_connector_stats=kv_connector_stats.data if kv_connector_stats else None,
+        )
 
     def make_spec_decoding_stats(
         self,
-        spec_decoding_stats: Optional[SpecDecodingStats],
+        spec_decoding_stats: SpecDecodingStats | None,
         num_draft_tokens: int,
         num_accepted_tokens: int,
-    ) -> Optional[SpecDecodingStats]:
+    ) -> SpecDecodingStats | None:
         if not self.log_stats:
             return None
         if spec_decoding_stats is None:
             spec_decoding_stats = SpecDecodingStats.new(self.num_spec_tokens)
         spec_decoding_stats.observe_draft(
-            num_draft_tokens=num_draft_tokens,
-            num_accepted_tokens=num_accepted_tokens)
+            num_draft_tokens=num_draft_tokens, num_accepted_tokens=num_accepted_tokens
+        )
         return spec_decoding_stats
 
     def shutdown(self) -> None:
@@ -1217,11 +1362,31 @@ class Scheduler(SchedulerInterface):
     # KV Connector Related Methods
     ########################################################################
 
-    def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
+    def _update_connector_prefix_cache_stats(
+        self, request: Request, num_external_tokens: int
+    ) -> None:
+        if self.connector_prefix_cache_stats is None:
+            return
+
+        self.connector_prefix_cache_stats.record(
+            num_tokens=request.num_tokens,
+            num_hits=num_external_tokens,
+            preempted=request.num_preemptions > 0,
+        )
+
+    def _make_connector_prefix_cache_stats(self) -> PrefixCacheStats | None:
+        if self.connector_prefix_cache_stats is None:
+            return None
+        stats = self.connector_prefix_cache_stats
+        self.connector_prefix_cache_stats = PrefixCacheStats()
+        return stats
+
+    def get_kv_connector(self) -> KVConnectorBase_V1 | None:
         return self.connector
 
     def _connector_finished(
-            self, request: Request) -> tuple[bool, Optional[dict[str, Any]]]:
+        self, request: Request
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Invoke the KV connector request_finished() method if applicable.
 
@@ -1231,8 +1396,17 @@ class Scheduler(SchedulerInterface):
         if self.connector is None:
             return False, None
 
-        (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
-        return self.connector.request_finished(request, block_ids)
+        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)
+
+        if not isinstance(self.connector, SupportsHMA):
+            # NOTE(Kuntai): We should deprecate this code path after we enforce
+            # all connectors to support HMA.
+            # Hybrid memory allocator should be already turned off for this
+            # code path, but let's double-check here.
+            assert len(self.kv_cache_config.kv_cache_groups) == 1
+            return self.connector.request_finished(request, block_ids[0])
+
+        return self.connector.request_finished_all_groups(request, block_ids)
 
     def _update_waiting_for_remote_kv(self, request: Request) -> bool:
         """
@@ -1250,25 +1424,37 @@ class Scheduler(SchedulerInterface):
         if request.request_id not in self.finished_recving_kv_req_ids:
             return False
 
-        # Now that the blocks are ready, actually cache them.
-        (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
-        num_computed_tokens = len(block_ids) * self.block_size
-        # Handle the case where num request tokens less than one block.
-        num_computed_tokens = min(num_computed_tokens, request.num_tokens)
-        if num_computed_tokens == request.num_tokens:
-            num_computed_tokens -= 1
-        # This will cache the blocks iff caching is enabled.
-        self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
+        if request.request_id in self.failed_recving_kv_req_ids:
+            # Request had KV load failures; num_computed_tokens was already
+            # updated in _update_requests_with_invalid_blocks
+            if request.num_computed_tokens:
+                # Cache any valid computed tokens.
+                self.kv_cache_manager.cache_blocks(request, request.num_computed_tokens)
+            else:
+                # No valid computed tokens, release allocated blocks.
+                # There may be a local cache hit on retry.
+                self.kv_cache_manager.free(request)
 
-        # Update the request state for scheduling.
-        request.num_computed_tokens = num_computed_tokens
+            self.failed_recving_kv_req_ids.remove(request.request_id)
+        else:
+            # Now that the blocks are ready, actually cache them.
+            (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id)
+            num_computed_tokens = len(block_ids) * self.block_size
+            # Handle the case where num request tokens less than one block.
+            num_computed_tokens = min(num_computed_tokens, request.num_tokens)
+            if num_computed_tokens == request.num_tokens:
+                num_computed_tokens -= 1
+            # This will cache the blocks iff caching is enabled.
+            self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
+
+            # Update the request state for scheduling.
+            request.num_computed_tokens = num_computed_tokens
 
         # Return that we are ready.
         self.finished_recving_kv_req_ids.remove(request.request_id)
         return True
 
-    def _update_from_kv_xfer_finished(self,
-                                      kv_connector_output: KVConnectorOutput):
+    def _update_from_kv_xfer_finished(self, kv_connector_output: KVConnectorOutput):
         """
         KV Connector: update the scheduler state based on the output.
 
@@ -1283,14 +1469,149 @@ class Scheduler(SchedulerInterface):
             self.connector.update_connector_output(kv_connector_output)
 
         # KV Connector:: update recv and send status from last step.
-        for req_id in (kv_connector_output.finished_recving or ()):
+        for req_id in kv_connector_output.finished_recving or ():
             logger.debug("Finished recving KV transfer for request %s", req_id)
             self.finished_recving_kv_req_ids.add(req_id)
-        for req_id in (kv_connector_output.finished_sending or ()):
+        for req_id in kv_connector_output.finished_sending or ():
             logger.debug("Finished sending KV transfer for request %s", req_id)
-            if req_id not in self.requests:
-                logger.warning(
-                    "Got finished sending KV transfer for request %s,"
-                    "but the request is already freed.", req_id)
+            assert req_id in self.requests
+            self._free_blocks(self.requests[req_id])
+
+    def _update_requests_with_invalid_blocks(
+        self, requests: Iterable[Request], invalid_block_ids: set[int]
+    ) -> tuple[set[str], int]:
+        """
+        Identify and update requests affected by invalid KV cache blocks.
+
+        This method scans the given requests, detects those with invalid blocks
+        and adjusts their `num_computed_tokens` to the longest valid prefix.
+        For observability, it also accumulates the total number of tokens that
+        will need to be recomputed across all affected requests.
+
+        Args:
+            requests: The set of requests to scan for invalid blocks.
+            invalid_block_ids: IDs of invalid blocks.
+
+        Returns:
+            tuple:
+                - affected_req_ids (set[str]): IDs of requests impacted by
+                invalid blocks.
+                - total_affected_tokens (int): Total number of tokens that must
+                be recomputed across all affected requests (for observability).
+        """
+        affected_req_ids: set[str] = set()
+        total_affected_tokens = 0
+        # If a block is invalid and shared by multiple requests in the batch,
+        # these requests must be rescheduled, but only the first will recompute
+        # it. This set tracks blocks already marked for recomputation.
+        marked_invalid_block_ids: set[int] = set()
+        for request in requests:
+            is_affected = False
+            marked_invalid_block = False
+            req_id = request.request_id
+            # TODO (davidb): add support for hybrid memory allocator
+            (req_block_ids,) = self.kv_cache_manager.get_block_ids(req_id)
+            # We iterate only over blocks that may contain externally computed
+            # tokens
+            if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                # Async loading. If num_computed_tokens is set it implies we
+                # already processed some block failures for it in a prior step
+                req_num_computed_tokens = (
+                    request.num_computed_tokens
+                    if req_id in self.failed_recving_kv_req_ids
+                    else len(req_block_ids) * self.block_size
+                )
             else:
-                self._free_blocks(self.requests[req_id])
+                # Sync loading. num_computed_tokens includes new tokens
+                req_num_computed_tokens = request.num_cached_tokens
+
+            req_num_computed_blocks = (
+                req_num_computed_tokens + self.block_size - 1
+            ) // self.block_size
+            for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids):
+                if block_id not in invalid_block_ids:
+                    continue
+
+                is_affected = True
+
+                if block_id in marked_invalid_block_ids:
+                    # This invalid block is shared with a previous request
+                    # and was already marked for recomputation.
+                    # This means this request can still consider this block
+                    # as computed when rescheduled.
+                    # Currently this only applies to sync loading; Async
+                    # loading does not yet support block sharing
+                    continue
+
+                marked_invalid_block_ids.add(block_id)
+
+                if marked_invalid_block:
+                    # This request has already marked an invalid block for
+                    # recomputation and updated its num_computed_tokens.
+                    continue
+
+                marked_invalid_block = True
+                # Truncate the computed tokens at the first failed block
+                request.num_computed_tokens = idx * self.block_size
+                total_affected_tokens += (
+                    req_num_computed_tokens - request.num_computed_tokens
+                )
+
+            if is_affected:
+                if not marked_invalid_block:
+                    # All invalid blocks of this request are shared with
+                    # previous requests and will be recomputed by them.
+                    # Revert to considering only cached tokens as computed.
+                    # Currently this only applies to sync loading; Async
+                    # loading does not yet support block sharing
+                    total_affected_tokens += (
+                        request.num_computed_tokens - request.num_cached_tokens
+                    )
+                    request.num_computed_tokens = request.num_cached_tokens
+
+                affected_req_ids.add(request.request_id)
+
+        return affected_req_ids, total_affected_tokens
+
+    def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
+        total_requests_to_reschedule = 0
+        total_tokens_to_reschedule = 0
+
+        # --- Handle async KV loads (WAITING_FOR_REMOTE_KVS) ---
+        async_load_reqs = (
+            req
+            for req in self.waiting
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+        )
+        async_affected_req_ids, num_tokens_to_reschedule = (
+            self._update_requests_with_invalid_blocks(
+                async_load_reqs, invalid_block_ids
+            )
+        )
+
+        total_requests_to_reschedule += len(async_affected_req_ids)
+        total_tokens_to_reschedule += num_tokens_to_reschedule
+
+        # Mark requests with async KV load failures; they will be rescheduled
+        # once loading completes.
+        self.failed_recving_kv_req_ids |= async_affected_req_ids
+
+        # --- Handle sync KV loads (running requests) ---
+        sync_affected_req_ids, num_tokens_to_reschedule = (
+            self._update_requests_with_invalid_blocks(self.running, invalid_block_ids)
+        )
+
+        total_requests_to_reschedule += len(sync_affected_req_ids)
+        total_tokens_to_reschedule += num_tokens_to_reschedule
+
+        if total_requests_to_reschedule:
+            logger.warning(
+                "Recovered from KV load failure: "
+                "%d request(s) rescheduled (%d tokens affected).",
+                total_requests_to_reschedule,
+                total_tokens_to_reschedule,
+            )
+
+        # Return the IDs of affected running requests to skip in
+        # update_from_output.
+        return sync_affected_req_ids
diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py
index c431843de6baa581d67519a106d1d0e5b26bd034..82166dc9783969a5800dbeb82333ee7367bd3440 100644
--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
-from typing import Optional
 
 import torch
 
@@ -40,14 +39,9 @@ def remove_all(lst: list, items_to_remove: set) -> list:
     return [item for item in lst if item not in items_to_remove]
 
 
-def check_stop(request: Request,
-               max_model_len: int,
-               pooler_output: Optional[torch.Tensor] = None) -> bool:
-    if (request.num_tokens >= max_model_len
-            or request.num_output_tokens >= request.max_tokens):
-        request.status = RequestStatus.FINISHED_LENGTH_CAPPED
-        return True
-
+def check_stop(
+    request: Request, max_model_len: int, pooler_output: torch.Tensor | None = None
+) -> bool:
     if request.pooling_params:
         if pooler_output is not None:
             request.status = RequestStatus.FINISHED_STOPPED
@@ -56,9 +50,12 @@ def check_stop(request: Request,
 
     sampling_params = request.sampling_params
     assert sampling_params is not None
+
+    if request.num_output_tokens < sampling_params.min_tokens:
+        return False
+
     last_token_id = request.output_token_ids[-1]
-    if (not sampling_params.ignore_eos
-            and last_token_id == request.eos_token_id):
+    if not sampling_params.ignore_eos and last_token_id == request.eos_token_id:
         request.status = RequestStatus.FINISHED_STOPPED
         return True
 
@@ -66,4 +63,10 @@ def check_stop(request: Request,
         request.status = RequestStatus.FINISHED_STOPPED
         request.stop_reason = last_token_id
         return True
+    if (
+        request.num_tokens >= max_model_len
+        or request.num_output_tokens >= request.max_tokens
+    ):
+        request.status = RequestStatus.FINISHED_LENGTH_CAPPED
+        return True
     return False
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index e889f7804e844e24e79bb2a9a0570df567fc4e53..14ac83028ee4465977a07890786d0749c13975f3 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -3,20 +3,26 @@
 import itertools
 from abc import ABC, abstractmethod
 from collections import defaultdict
+from collections.abc import Sequence
 
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
-from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
-                                        CrossAttentionSpec, FullAttentionSpec,
-                                        KVCacheSpec, MambaSpec,
-                                        MLAAttentionSpec, SlidingWindowSpec)
+from vllm.v1.kv_cache_interface import (
+    ChunkedLocalAttentionSpec,
+    CrossAttentionSpec,
+    FullAttentionSpec,
+    KVCacheSpec,
+    MambaSpec,
+    MLAAttentionSpec,
+    SlidingWindowSpec,
+)
 from vllm.v1.request import Request
 
 
 class SingleTypeKVCacheManager(ABC):
     """
-    An abstract base class for a manager that handle the kv cache management 
+    An abstract base class for a manager that handle the kv cache management
     logic of one specific type of attention layer.
     """
 
@@ -44,8 +50,7 @@ class SingleTypeKVCacheManager(ABC):
         # Mapping from request ID to blocks to track the blocks allocated
         # for each request, so that we can free the blocks when the request
         # is finished.
-        self.req_to_blocks: defaultdict[str,
-                                        list[KVCacheBlock]] = defaultdict(list)
+        self.req_to_blocks: defaultdict[str, list[KVCacheBlock]] = defaultdict(list)
 
         # {req_id: The number of cached blocks for this given request}
         # This is used to track the number of cached blocks for each request.
@@ -57,14 +62,17 @@ class SingleTypeKVCacheManager(ABC):
         self._null_block = block_pool.null_block
 
     def get_num_blocks_to_allocate(
-            self, request_id: str, num_tokens: int,
-            new_computed_blocks: list[KVCacheBlock]) -> int:
+        self,
+        request_id: str,
+        num_tokens: int,
+        new_computed_blocks: Sequence[KVCacheBlock],
+    ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
 
         Args:
             request_id: The request ID.
-            num_tokens: The total number of tokens that need a slot (including 
+            num_tokens: The total number of tokens that need a slot (including
                 tokens that are already allocated).
             new_computed_blocks: The new computed blocks just hitting the
                 prefix caching.
@@ -74,20 +82,23 @@ class SingleTypeKVCacheManager(ABC):
         """
 
         num_required_blocks = cdiv(num_tokens, self.block_size)
-        num_new_blocks = (num_required_blocks - len(new_computed_blocks) -
-                          len(self.req_to_blocks[request_id]))
+        num_new_blocks = (
+            num_required_blocks
+            - len(new_computed_blocks)
+            - len(self.req_to_blocks[request_id])
+        )
         # If a computed block of a request is an eviction candidate (in the
         # free queue and ref_cnt == 0), it will be changed from a free block
         # to a computed block when the request is allocated, so we also count
         # it as needed to be allocated.
         num_evictable_computed_blocks = sum(
-            blk.ref_cnt == 0 and not blk.is_null
-            for blk in new_computed_blocks)
+            blk.ref_cnt == 0 and not blk.is_null for blk in new_computed_blocks
+        )
         return num_new_blocks + num_evictable_computed_blocks
 
     def save_new_computed_blocks(
-            self, request_id: str,
-            new_computed_blocks: list[KVCacheBlock]) -> None:
+        self, request_id: str, new_computed_blocks: Sequence[KVCacheBlock]
+    ) -> None:
         """
         Add the new computed blocks to the request.
 
@@ -106,15 +117,16 @@ class SingleTypeKVCacheManager(ABC):
             # A running request. Should not have new computed blocks.
             assert len(new_computed_blocks) == 0
 
-    def allocate_new_blocks(self, request_id: str,
-                            num_tokens: int) -> list[KVCacheBlock]:
+    def allocate_new_blocks(
+        self, request_id: str, num_tokens: int
+    ) -> list[KVCacheBlock]:
         """
-        Allocate new blocks for the request to give it at least `num_tokens` 
+        Allocate new blocks for the request to give it at least `num_tokens`
         token slots.
 
         Args:
             request_id: The request ID.
-            num_tokens: The total number of tokens that need a slot (including 
+            num_tokens: The total number of tokens that need a slot (including
                 tokens that are already allocated).
 
         Returns:
@@ -136,12 +148,15 @@ class SingleTypeKVCacheManager(ABC):
 
         Args:
             request: The request.
-            num_tokens: The total number of tokens that need to be cached 
+            num_tokens: The total number of tokens that need to be cached
                 (including tokens that are already cached).
         """
-        num_cached_blocks = self.num_cached_block[request.request_id]
+        num_cached_blocks = self.num_cached_block.get(request.request_id, 0)
         num_full_blocks = num_tokens // self.block_size
 
+        if num_cached_blocks >= num_full_blocks:
+            return
+
         self.block_pool.cache_full_blocks(
             request=request,
             blocks=self.req_to_blocks[request.request_id],
@@ -171,20 +186,17 @@ class SingleTypeKVCacheManager(ABC):
         self.num_cached_block.pop(request_id, None)
 
     @abstractmethod
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
-        Get the number of common prefix blocks for all requests in the RUNNING
-        state.
+        Get the number of common prefix blocks for all requests with allocated
+        KV cache.
 
         Args:
-            request_id: The request ID.
-            num_running_requests: The total number of requests in the RUNNING
-                state.
+            running_request_id: The request ID.
 
         Returns:
-            The number of common prefix blocks for all requests in the RUNNING
-                state.
+            The number of common prefix blocks for all requests with allocated
+            KV cache.
         """
 
         raise NotImplementedError
@@ -202,12 +214,12 @@ class SingleTypeKVCacheManager(ABC):
         dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         """
-        Get the longest cache hit prefix of the blocks that is not longer than 
-        `max_length`. The prefix should be a common prefix hit for all the 
-        kv cache groups in `kv_cache_group_ids`. If no cache hit is found, 
-        return an empty list. 
-        If eagle is enabled, drop the last matched block to force recompute the 
-        last block to get the required hidden states for eagle drafting head. 
+        Get the longest cache hit prefix of the blocks that is not longer than
+        `max_length`. The prefix should be a common prefix hit for all the
+        kv cache groups in `kv_cache_group_ids`. If no cache hit is found,
+        return an empty list.
+        If eagle is enabled, drop the last matched block to force recompute the
+        last block to get the required hidden states for eagle drafting head.
         Need to be customized for each attention type.
 
         Args:
@@ -231,23 +243,56 @@ class SingleTypeKVCacheManager(ABC):
 
         raise NotImplementedError
 
-    @abstractmethod
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
+    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         """
-        Remove the blocks that are no longer needed from `blocks` and free the 
-        blocks. The removed blocks should be replaced by null_block.
-        Need to be customized for each attention type.
+        Remove and free the blocks that are no longer needed for attention computation.
+        The removed blocks should be replaced by null_block.
+
+        This function depends on `get_num_skipped_tokens`, which need to be implemented
+        differently for each attention type.
 
         Args:
             request_id: The request ID.
             num_computed_tokens: The number of tokens that have been computed.
         """
-        raise NotImplementedError
+        # Remove the blocks that will be skipped during attention computation.
+        num_skipped_tokens = self.get_num_skipped_tokens(num_computed_tokens)
+        if num_skipped_tokens <= 0:
+            # This indicates that ALL tokens are inside attention window.
+            # Thus we do not need to free any blocks outside attention window.
+            # A typical case is full attention that we never free any token
+            # before the request is finished.
+            return
+        num_skipped_blocks = num_skipped_tokens // self.block_size
+        blocks = self.req_to_blocks[request_id]
+        removed_blocks: list[KVCacheBlock] = []
+        # Because the block starts from index 0, the num_skipped_block-th block
+        # corresponds to index num_skipped_blocks - 1.
+        for i in range(num_skipped_blocks - 1, -1, -1):
+            if blocks[i] == self._null_block:
+                # If the block is already a null block, the blocks before it
+                # should also have been set to null blocks by the previous calls
+                # to this function.
+                break
+            removed_blocks.append(blocks[i])
+            blocks[i] = self._null_block
+        self.block_pool.free_blocks(removed_blocks)
 
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens that will be skipped for attention computation.
 
-class FullAttentionManager(SingleTypeKVCacheManager):
+        Args:
+            num_computed_tokens: The number of tokens that have been computed.
+
+        Returns:
+            The number of tokens that will be skipped for attention computation.
+        """
+        # The default behavior is to not skip any tokens.
+        return 0
 
+
+class FullAttentionManager(SingleTypeKVCacheManager):
     @classmethod
     def find_longest_cache_hit(
         cls,
@@ -261,10 +306,13 @@ class FullAttentionManager(SingleTypeKVCacheManager):
     ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(
             kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec)
-        ), "FullAttentionManager can only be used for full attention " \
+        ), (
+            "FullAttentionManager can only be used for full attention "
             "and chunked local attention groups"
+        )
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
-            [] for _ in range(len(kv_cache_group_ids)))
+            [] for _ in range(len(kv_cache_group_ids))
+        )
         block_size = kv_cache_spec.block_size
         if dcp_world_size > 1:
             block_size *= dcp_world_size
@@ -274,7 +322,8 @@ class FullAttentionManager(SingleTypeKVCacheManager):
             # in the cached_block_hash_to_id, the following block hashes are
             # not computed yet for sure.
             if cached_block := block_pool.get_cached_block(
-                    block_hash, kv_cache_group_ids):
+                block_hash, kv_cache_group_ids
+            ):
                 for computed, cached in zip(computed_blocks, cached_block):
                     computed.append(cached)
             else:
@@ -284,17 +333,11 @@ class FullAttentionManager(SingleTypeKVCacheManager):
                 computed.pop()
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
-        # No need to remove blocks for full attention.
-        pass
-
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
-        blocks = self.req_to_blocks[request_id]
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
+        blocks = self.req_to_blocks[running_request_id]
         num_common_blocks = 0
         for block in blocks:
-            if block.ref_cnt == num_running_requests:
+            if block.ref_cnt == len(self.req_to_blocks):
                 num_common_blocks += 1
             else:
                 break
@@ -302,9 +345,9 @@ class FullAttentionManager(SingleTypeKVCacheManager):
 
 
 class SlidingWindowManager(SingleTypeKVCacheManager):
-
-    def __init__(self, kv_cache_spec: SlidingWindowSpec, block_pool: BlockPool,
-                 **kwargs) -> None:
+    def __init__(
+        self, kv_cache_spec: SlidingWindowSpec, block_pool: BlockPool, **kwargs
+    ) -> None:
         super().__init__(kv_cache_spec, block_pool, **kwargs)
         self.sliding_window = kv_cache_spec.sliding_window
         self._null_block = block_pool.null_block
@@ -321,13 +364,15 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(kv_cache_spec, SlidingWindowSpec), (
-            "SlidingWindowManager can only be used for sliding window groups")
+            "SlidingWindowManager can only be used for sliding window groups"
+        )
         assert dcp_world_size == 1, "DCP not support sliding window attn now."
 
         # The number of contiguous blocks needed for prefix cache hit.
         # -1 since the input token itself is also included in the window
         sliding_window_contiguous_blocks = cdiv(
-            kv_cache_spec.sliding_window - 1, kv_cache_spec.block_size)
+            kv_cache_spec.sliding_window - 1, kv_cache_spec.block_size
+        )
         if use_eagle:
             # Need to drop the last matched block if eagle is enabled. For
             # sliding window layer, we achieve this by increasing the number of
@@ -341,14 +386,17 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         # sliding_window_contiguous_blocks),
         # which is good for low cache hit rate scenarios.
         max_num_blocks = max_length // kv_cache_spec.block_size
-        computed_blocks = tuple([block_pool.null_block] * max_num_blocks
-                                for _ in range(len(kv_cache_group_ids)))
+        computed_blocks = tuple(
+            [block_pool.null_block] * max_num_blocks
+            for _ in range(len(kv_cache_group_ids))
+        )
         num_contiguous_blocks = 0
         match_found = False
         # Search from right to left and early stop when a match is found.
         for i in range(max_num_blocks - 1, -1, -1):
             if cached_block := block_pool.get_cached_block(
-                    block_hashes[i], kv_cache_group_ids):
+                block_hashes[i], kv_cache_group_ids
+            ):
                 for computed, cached in zip(computed_blocks, cached_block):
                     computed[i] = cached
                 num_contiguous_blocks += 1
@@ -357,7 +405,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
                     # E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3]
                     # when sliding_window_contiguous_blocks=2.
                     for computed in computed_blocks:
-                        del computed[i + num_contiguous_blocks:]
+                        del computed[i + num_contiguous_blocks :]
                     match_found = True
                     break
             else:
@@ -372,39 +420,48 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
                 computed.pop()
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
-        # Remove the blocks that are no longer be in the sliding window and
-        # skipped during the attention computation.
-        last_useful_token = num_computed_tokens - self.sliding_window + 1
-        last_useful_block = last_useful_token // self.block_size
-        blocks = self.req_to_blocks[request_id]
-        removed_blocks: list[KVCacheBlock] = []
-        for i in range(last_useful_block - 1, -1, -1):
-            if blocks[i] == self._null_block:
-                # If the block is already a null block, the blocks before it
-                # should also have been set to null blocks by the previous calls
-                # to this function.
-                break
-            removed_blocks.append(blocks[i])
-            blocks[i] = self._null_block
-        self.block_pool.free_blocks(removed_blocks)
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens that will be skipped for attention computation.
+
+        For sliding window, this corresponds to the tokens that are prior to
+        the current sliding window.
+
+        Example:
+        sliding_window=4, num_computed_tokens=7
+
+        Tokens:   [ 0  1  2  3  4  5  6  7 ]
+                  | ---- computed -----|
+                                         ^ next token to be computed
+                               |-----------| sliding window for next token
+                  |--skipped---|
+
+        The current window contains tokens 4~7. Tokens 0~3 will be skipped for
+        attention computation since they are outside the sliding window.
+        Thus, get_num_skipped_tokens(7) == 4.
+
+        Args:
+            num_computed_tokens: The number of tokens that have been computed.
+
+        Returns:
+            The number of tokens that will be skipped for attention computation.
+        """
+        return num_computed_tokens - self.sliding_window + 1
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
         NOTE(Chen): The prefix blocks are null blocks for sliding window layers.
-        So it's not correct to count ref_cnt like FullAttentionManager. Return 
-        0 here for correctness. Need to support cascade attention + sliding 
+        So it's not correct to count ref_cnt like FullAttentionManager. Return
+        0 here for correctness. Need to support cascade attention + sliding
         window in the future.
         """
         return 0
 
 
 class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
-
-    def __init__(self, kv_cache_spec: ChunkedLocalAttentionSpec,
-                 block_pool: BlockPool, **kwargs) -> None:
+    def __init__(
+        self, kv_cache_spec: ChunkedLocalAttentionSpec, block_pool: BlockPool, **kwargs
+    ) -> None:
         super().__init__(kv_cache_spec, block_pool, **kwargs)
         self.attention_chunk_size = kv_cache_spec.attention_chunk_size
         self._null_block = block_pool.null_block
@@ -425,19 +482,19 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
         prefix of the blocks that is not longer than `max_length`. The prefix
         should be a common prefix hit for all the kv cache groups in
         `kv_cache_group_ids`. If no cache hit is found, return an empty list.
-        note we mark as computed if the whole block is outside of the local 
+        note we mark as computed if the whole block is outside of the local
         window, and set the block as null. Examples:
 
         1. Attention chunk size of 8, block size of 4, max length of 15
-        for next token at 15th (zero-indexed), 8th - 14th tokens are in 
-        the window(needs lookup), 0th - 7th are not in the window, 
-        so they are already marked as computed. We check the complete 
-        block3 (8th - 11th tokens), Assume block 3 is hit, we will return 
+        for next token at 15th (zero-indexed), 8th - 14th tokens are in
+        the window(needs lookup), 0th - 7th are not in the window,
+        so they are already marked as computed. We check the complete
+        block3 (8th - 11th tokens), Assume block 3 is hit, we will return
         [null, null, block 3], otherwise, we return [null, null]
 
         2. Attention chunk size of 8, block size of 4, max length of 16
-        for next token at 16th (zero-indexed), 0th - 15th tokens are not 
-        in the window, so they are already marked as computed. 
+        for next token at 16th (zero-indexed), 0th - 15th tokens are not
+        in the window, so they are already marked as computed.
         we return 4 blocks[null, null, null, null]
 
         Args:
@@ -452,74 +509,91 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
             A list of cached blocks
         """
         assert isinstance(kv_cache_spec, ChunkedLocalAttentionSpec), (
-            "ChunkedLocalAttentionManager can only be used for " +
-            "chunked local attention groups")
-        assert use_eagle is False, ("Hybrid KV cache is not supported for " +
-                                    "eagle + chunked local attention.")
+            "ChunkedLocalAttentionManager can only be used for "
+            + "chunked local attention groups"
+        )
+        assert use_eagle is False, (
+            "Hybrid KV cache is not supported for " + "eagle + chunked local attention."
+        )
         assert dcp_world_size == 1, "DCP not support chunked local attn now."
         max_num_blocks = max_length // kv_cache_spec.block_size
         if max_length > 0:
-            local_attention_start_idx = (max_length //
-                                         kv_cache_spec.attention_chunk_size *
-                                         kv_cache_spec.attention_chunk_size)
+            local_attention_start_idx = (
+                max_length
+                // kv_cache_spec.attention_chunk_size
+                * kv_cache_spec.attention_chunk_size
+            )
         else:
             local_attention_start_idx = 0
         # we marked blocks out of window as computed
         # with null blocks, and blocks inside window based on cache lookup
         # result [null] [null] ... [null] [hit block 1 (1st block contain
         # last window)] [hit block 2] ... [hit block x]
-        local_attention_start_block_idx = (local_attention_start_idx //
-                                           kv_cache_spec.block_size)
+        local_attention_start_block_idx = (
+            local_attention_start_idx // kv_cache_spec.block_size
+        )
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
             [block_pool.null_block] * local_attention_start_block_idx
-            for _ in range(len(kv_cache_group_ids)))
+            for _ in range(len(kv_cache_group_ids))
+        )
         for i in range(local_attention_start_block_idx, max_num_blocks):
             block_hash = block_hashes[i]
             if cached_block := block_pool.get_cached_block(
-                    block_hash, kv_cache_group_ids):
+                block_hash, kv_cache_group_ids
+            ):
                 for computed, cached in zip(computed_blocks, cached_block):
                     computed.append(cached)
             else:
                 break
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
-        # Remove the blocks that are no longer be in the chunked attention
-        # window and skipped during the attention computation.
-
-        # [chunk 0][chunk 1]local_attention_start_idx ... current
-        # we computed previous number of chunks to get the idx of
-        # current chunk window starting offset,
-        # e.g. for computed 1024 tokens, the 1024th token (0 indexed)
-        # is in the second chunk, there are 1 prev chunk, the start idx
-        # is 1024. for 1023, it will be 0.
-        num_cached_block = self.num_cached_block.get(request_id, 0)
-        local_attention_start_idx = (
-            num_computed_tokens
-        ) // self.attention_chunk_size * self.attention_chunk_size
-        first_useful_block_idx = local_attention_start_idx // self.block_size
-        if num_cached_block > 0:
-            # Make sure we don't delete the last cached block
-            first_useful_block_idx = min(first_useful_block_idx,
-                                         num_cached_block - 1)
-        # if block size = 128, 0 -> block 0, 1024 (= 128 * 8) ->
-        # block 8, 372 (= 128 * 2 + 116) -> block 2
-        blocks = self.req_to_blocks[request_id]
-        removed_blocks: list[KVCacheBlock] = []
-        # we need to keep the last block to get the previous hash key
-        for i in range(first_useful_block_idx - 1, -1, -1):
-            if blocks[i] == self._null_block:
-                # If the block is already a null block, the blocks before it
-                # should also have been set to null blocks by the previous calls
-                # to this function.
-                break
-            removed_blocks.append(blocks[i])
-            blocks[i] = self._null_block
-        self.block_pool.free_blocks(removed_blocks)
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens that will be skipped for attention computation.
+
+        For chunked local attention, this corresponds to the tokens that are on
+        the left side of the current chunk.
+
+        Example 1:
+        chunk size = 8, num_computed_tokens = 13
+        Tokens:  [ 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 ] ...
+                 | ----- computed ---------------|
+                                                  ^^ next token to be computed
+                                   |----------------| <-- attention window for
+                                                          next token
+                 |--- skipped -----|
+        Output: get_num_skipped_tokens(13) == 8
+
+        Example 2:
+        chunk size = 8, num_computed_tokens = 8
+        Tokens:  [ 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 ] ...
+                 | --- computed ---|
+                                     ^ next token to be computed
+                                   |--| <-- attention window for next token
+                 | --- skipped ----|
+        Output: get_num_skipped_tokens(8) == 8
+
+        Example 3:
+        chunk size = 8, num_computed_tokens = 7
+        Tokens:  [ 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 ] ...
+                 |---computed---|
+                                 ^ next token to be computed
+                 |-----------------| <-- attention window for next token
+                 no token should be skipped.
+        Output: get_num_skipped_tokens(7) == 0
+
+        Args:
+            num_computed_tokens: The number of tokens that have been computed.
+
+        Returns:
+            The number of tokens that will be skipped for attention computation.
+        """
+        num_skipped_tokens = (
+            num_computed_tokens // self.attention_chunk_size
+        ) * self.attention_chunk_size
+        return num_skipped_tokens
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
         cascade attention is not supported by chunked local attention.
         """
@@ -527,7 +601,6 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
 
 
 class MambaManager(SingleTypeKVCacheManager):
-
     @classmethod
     def find_longest_cache_hit(
         cls,
@@ -539,67 +612,66 @@ class MambaManager(SingleTypeKVCacheManager):
         use_eagle: bool,
         dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
-        assert isinstance(
-            kv_cache_spec,
-            MambaSpec), ("MambaManager can only be used for mamba groups")
+        assert isinstance(kv_cache_spec, MambaSpec), (
+            "MambaManager can only be used for mamba groups"
+        )
         assert dcp_world_size == 1, "DCP not support mamba now."
-        # Prefix caching is not supported for mamba now. Always return empty
-        # list.
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
-            [] for _ in range(len(kv_cache_group_ids)))
-        return computed_blocks
+            [] for _ in range(len(kv_cache_group_ids))
+        )
 
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
-        # Each request will always have 1 block at this moment, so no need to
-        # remove blocks.
-        pass
+        max_num_blocks = max_length // kv_cache_spec.block_size
+        # Search from right to left and early stop when a match is found.
+        for i in range(max_num_blocks - 1, -1, -1):
+            if cached_block := block_pool.get_cached_block(
+                block_hashes[i], kv_cache_group_ids
+            ):
+                for computed, cached in zip(computed_blocks, cached_block):
+                    # the hit length logic later assumes:
+                    #  hit_length = len(hit_blocks_other_attn[0])
+                    #               * self.other_block_size
+                    # so we insert dummy blocks at the beginning:
+                    computed.extend([block_pool.null_block] * i)
+                    computed.append(cached)
+                break  # we just need the last match - early stopping
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
-        return 0
+        return computed_blocks
 
-    def get_num_blocks_to_allocate(
-            self, request_id: str, num_tokens: int,
-            new_computed_blocks: list[KVCacheBlock]) -> int:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
-        Get the number of blocks needed to be allocated for the request.
-
-        Args:
-            request_id: The request ID.
-            num_tokens: The total number of tokens that need a slot (including
-                tokens that are already allocated).
-            new_computed_blocks: The new computed blocks just hitting the
-                prefix caching.
-
-        Returns:
-            The number of blocks
+        cascade attention is not supported by mamba
         """
+        return 0
 
+    def get_num_blocks_to_allocate(
+        self,
+        request_id: str,
+        num_tokens: int,
+        new_computed_blocks: Sequence[KVCacheBlock],
+    ) -> int:
+        # Allocate extra `num_speculative_blocks` blocks for
+        # speculative decoding (MTP/EAGLE) with linear attention.
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if self.kv_cache_spec.num_speculative_blocks > 0:
-            num_tokens += (self.kv_cache_spec.block_size *
-                           self.kv_cache_spec.num_speculative_blocks)
-        num_required_blocks = cdiv(num_tokens, self.block_size)
-        num_new_blocks = (num_required_blocks - len(new_computed_blocks) -
-                          len(self.req_to_blocks[request_id]))
-        # If a computed block of a request is an eviction candidate (in the
-        # free queue and ref_cnt == 0), it will be changed from a free block
-        # to a computed block when the request is allocated, so we also count
-        # it as needed to be allocated.
-        num_evictable_computed_blocks = sum(
-            blk.ref_cnt == 0 and not blk.is_null
-            for blk in new_computed_blocks)
-        return num_new_blocks + num_evictable_computed_blocks
+            num_tokens += (
+                self.kv_cache_spec.block_size
+                * self.kv_cache_spec.num_speculative_blocks
+            )
+        return super().get_num_blocks_to_allocate(
+            request_id, num_tokens, new_computed_blocks
+        )
 
-    def allocate_new_blocks(self, request_id: str,
-                            num_tokens: int) -> list[KVCacheBlock]:
+    def allocate_new_blocks(
+        self, request_id: str, num_tokens: int
+    ) -> list[KVCacheBlock]:
         # Allocate extra `num_speculative_blocks` blocks for
         # speculative decoding (MTP/EAGLE) with linear attention.
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if self.kv_cache_spec.num_speculative_blocks > 0:
-            num_tokens += (self.kv_cache_spec.block_size *
-                           self.kv_cache_spec.num_speculative_blocks)
+            num_tokens += (
+                self.kv_cache_spec.block_size
+                * self.kv_cache_spec.num_speculative_blocks
+            )
         return super().allocate_new_blocks(request_id, num_tokens)
 
 
@@ -607,8 +679,8 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
     """Manager for cross-attention KV cache in encoder-decoder models."""
 
     def save_new_computed_blocks(
-            self, request_id: str,
-            new_computed_blocks: list[KVCacheBlock]) -> None:
+        self, request_id: str, new_computed_blocks: Sequence[KVCacheBlock]
+    ) -> None:
         # We do not cache blocks for cross-attention to be shared between
         # requests, so  `new_computed_blocks` should always be empty.
         assert len(new_computed_blocks) == 0
@@ -618,8 +690,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
         # requests, so this method is not relevant.
         raise ValueError("Should not be called as prefix caching is disabled.")
 
-    def get_num_common_prefix_blocks(self, request_id: str,
-                                     num_running_requests: int) -> int:
+    def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         # Cross-attention blocks contain request-specific encoder states
         # and are not shared between different requests
         return 0
@@ -644,14 +715,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
         # 2. Encoder states are computed once per request, not incrementally
         # 3. No reusable prefix exists between different multimodal inputs
         # Return empty blocks to indicate no cache hits
-        raise NotImplementedError(
-            "CrossAttentionManager does not support caching")
-
-    def remove_skipped_blocks(self, request_id: str,
-                              num_computed_tokens: int) -> None:
-        # Cross-attention blocks represent encoder states which are needed
-        # for the entire decoding process, so no blocks should be skipped
-        pass
+        raise NotImplementedError("CrossAttentionManager does not support caching")
 
 
 spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = {
@@ -664,8 +728,9 @@ spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = {
 }
 
 
-def get_manager_for_kv_cache_spec(kv_cache_spec: KVCacheSpec,
-                                  **kwargs) -> SingleTypeKVCacheManager:
+def get_manager_for_kv_cache_spec(
+    kv_cache_spec: KVCacheSpec, **kwargs
+) -> SingleTypeKVCacheManager:
     manager_class = spec_manager_map[type(kv_cache_spec)]
     manager = manager_class(kv_cache_spec, **kwargs)
     return manager
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index ea4fba8eeea6db6f9d3789d9e2f34a45b152d30d..b480ac78f23cf1575a277fb2d62814bf7f08627f 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -1,12 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+from itertools import product
 
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.forward_context import BatchDescriptor
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
 
 
 class CudagraphDispatcher:
@@ -15,17 +12,17 @@ class CudagraphDispatcher:
     cudagraphs.
 
     The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
-    for FULL cudagraph runtime mode. The keys are initialized depending on 
-    attention support and what cudagraph mode is set in CompilationConfig. The 
+    for FULL cudagraph runtime mode. The keys are initialized depending on
+    attention support and what cudagraph mode is set in CompilationConfig. The
     keys stored in dispatcher are the only source of truth for valid
     cudagraphs that can be dispatched at runtime.
 
-    At runtime, the dispatch method generates the runtime cudagraph mode (FULL, 
+    At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
     PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
-    based on the input key. After dispatching (communicate via forward context),
-    the cudagraph wrappers will trust the dispatch key to do either capturing
-    or replaying (if mode matched), or pass through to the underlying runnable 
-    without cudagraph (if mode no match or mode is NONE).
+    based on the input key. After dispatching (communicated via forward
+    context), the cudagraph wrappers will trust the dispatch key to either
+    capture or replay (if the mode matches), or pass through to the underlying
+    runnable without cudagraph (if the mode does not match or mode is NONE).
     """
 
     def __init__(self, vllm_config: VllmConfig):
@@ -40,81 +37,107 @@ class CudagraphDispatcher:
         }
 
         not_use_piecewise_compilation = (
-            not self.cudagraph_mode.requires_piecewise_compilation())
-
-        assert not_use_piecewise_compilation or \
-            self.compilation_config.is_attention_compiled_piecewise(), \
-            "Compilation level should be CompilationLevel.PIECEWISE when "\
-            "cudagraph_mode piecewise cudagraphs is used, "\
-            "and attention should be in splitting_ops or "\
-            "inductor splitting should be used. " \
-            f"cudagraph_mode={self.cudagraph_mode}, "\
-            f"compilation_level={self.compilation_config.level}, "\
+            not self.cudagraph_mode.requires_piecewise_compilation()
+        )
+
+        assert (
+            not_use_piecewise_compilation
+            or self.compilation_config.is_attention_compiled_piecewise()
+        ), (
+            "Compilation mode should be CompilationMode.VLLM_COMPILE when "
+            "cudagraph_mode piecewise cudagraphs is used, "
+            "and attention should be in splitting_ops or "
+            "inductor splitting should be used. "
+            f"cudagraph_mode={self.cudagraph_mode}, "
+            f"compilation_mode={self.compilation_config.mode}, "
             f"splitting_ops={self.compilation_config.splitting_ops}"
+        )
 
         self.keys_initialized = False
 
-    def add_cudagraph_key(self, runtime_mode: CUDAGraphMode,
-                          batch_descriptor: BatchDescriptor):
-        assert runtime_mode in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
-            f"Invalid cudagraph runtime mode: {runtime_mode}"
+    def add_cudagraph_key(
+        self, runtime_mode: CUDAGraphMode, batch_descriptor: BatchDescriptor
+    ):
+        assert runtime_mode in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], (
+            f"Invalid cudagraph runtime mode for keys: {runtime_mode}"
+        )
         self.cudagraph_keys[runtime_mode].add(batch_descriptor)
 
-    def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode,
-                                  uniform_decode_query_len: int):
+    def initialize_cudagraph_keys(
+        self, cudagraph_mode: CUDAGraphMode, uniform_decode_query_len: int
+    ):
         # This should be called only after attention backend is initialized.
 
-        # Note: we create all valid keys possible for cudagraph but do not
-        # guarantee all keys would be used. For example, we create keys for
-        # piecewise cudagraphs when it is piecewise compilation, which is always
-        # valid, but for attention backend support unified routine, we may not
-        # trigger capturing/replaying the piecewise cudagraphs depending on
-        # CompilationConfig.cudagraph_mode. In addition, if we allow lazy
+        # LoRA activation cases to specialize the cuda graphs on
+        if self.vllm_config.lora_config:
+            if self.compilation_config.cudagraph_specialize_lora:
+                lora_cases = [True, False]
+            else:
+                lora_cases = [True]
+        else:
+            lora_cases = [False]
+
+        # Note: we create all valid keys for cudagraph here but do not
+        # guarantee all keys would be used. For example, if we allow lazy
         # capturing in future PR, some keys may never be triggered.
         if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
-            for bs in self.compilation_config.cudagraph_capture_sizes:
+            for bs, has_lora in product(
+                self.compilation_config.cudagraph_capture_sizes, lora_cases
+            ):
                 self.add_cudagraph_key(
                     cudagraph_mode.mixed_mode(),
-                    BatchDescriptor(num_tokens=bs, uniform_decode=False))
+                    BatchDescriptor(
+                        num_tokens=bs, uniform_decode=False, has_lora=has_lora
+                    ),
+                )
 
         # if decode cudagraph mode is FULL, and we don't already have mixed
         # mode full cudagraphs then add them here.
-        if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL \
-            and cudagraph_mode.separate_routine():
-            max_num_tokens = uniform_decode_query_len * \
-                self.vllm_config.scheduler_config.max_num_seqs
+        if (
+            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+            and cudagraph_mode.separate_routine()
+        ):
+            max_num_tokens = (
+                uniform_decode_query_len
+                * self.vllm_config.scheduler_config.max_num_seqs
+            )
             cudagraph_capture_sizes_for_decode = [
-                x for x in self.compilation_config.cudagraph_capture_sizes
+                x
+                for x in self.compilation_config.cudagraph_capture_sizes
                 if x <= max_num_tokens and x >= uniform_decode_query_len
             ]
-            for bs in cudagraph_capture_sizes_for_decode:
+            for bs, has_lora in product(cudagraph_capture_sizes_for_decode, lora_cases):
                 self.add_cudagraph_key(
                     CUDAGraphMode.FULL,
-                    BatchDescriptor(num_tokens=bs, uniform_decode=True))
+                    BatchDescriptor(
+                        num_tokens=bs, uniform_decode=True, has_lora=has_lora
+                    ),
+                )
         self.keys_initialized = True
 
     def dispatch(
-        self, batch_descriptor: BatchDescriptor
-    ) -> tuple[CUDAGraphMode, Optional[BatchDescriptor]]:
+        self, batch_descriptor: BatchDescriptor, use_cascade_attn: bool = False
+    ) -> tuple[CUDAGraphMode, BatchDescriptor | None]:
         """
-        Given a batch descriptor, dispatch to a cudagraph mode.
-        A new batch descriptor is returned as we might dispatch a uniform batch 
+        Given conditions(e.g.,batch descriptor and if using cascade attention),
+        dispatch to a cudagraph runtime mode and the valid batch descriptor.
+        A new batch descriptor is returned as we might dispatch a uniform batch
         to a graph that supports a more general batch (uniform to non-uniform).
         """
         # if not initialized, just skip dispatching.
         if not self.keys_initialized:
-            logger.warning_once("cudagraph dispatching keys are not "
-                                "initialized. No cudagraph will be used.")
             return CUDAGraphMode.NONE, None
 
-        # check if key exists for full cudagraph
-        if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]:
-            return CUDAGraphMode.FULL, batch_descriptor
-
-        # otherwise, check if non-uniform key exists
         non_uniform_key = batch_descriptor.non_uniform
-        if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]:
-            return CUDAGraphMode.FULL, non_uniform_key
+        # if a batch use cascade attention, bypass checking full cudagraphs
+        if not use_cascade_attn:
+            # check if key exists for full cudagraph
+            if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]:
+                return CUDAGraphMode.FULL, batch_descriptor
+
+            # otherwise, check if non-uniform key exists
+            if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]:
+                return CUDAGraphMode.FULL, non_uniform_key
 
         # also check if non-uniform key exists for more "general"
         # piecewise cudagraph
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 345f5a464c2ccccef37a470c9ba5b3fd53135bc5..3f621d77c0241c4ce29ff651bd402c75bb024b97 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -4,7 +4,7 @@
 import enum
 import time
 from collections.abc import Mapping
-from typing import Any, Optional, Union
+from typing import Any
 
 import msgspec
 import torch
@@ -15,6 +15,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors
+from vllm.v1.serial_utils import UtilityResult
 
 # These are possible values of RequestOutput.finish_reason,
 # so form part of the external API.
@@ -32,6 +33,7 @@ class FinishReason(enum.IntEnum):
     abort - aborted for another reason
 
     """
+
     STOP = 0
     LENGTH = 1
     ABORT = 2
@@ -41,22 +43,22 @@ class FinishReason(enum.IntEnum):
 
 
 class EngineCoreRequest(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
-
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,
+):  # type: ignore[call-arg]
     request_id: str
-    prompt_token_ids: Optional[list[int]]
-    mm_features: Optional[list[MultiModalFeatureSpec]]
-    sampling_params: Optional[SamplingParams]
-    pooling_params: Optional[PoolingParams]
-    eos_token_id: Optional[int]
+    prompt_token_ids: list[int] | None
+    mm_features: list[MultiModalFeatureSpec] | None
+    sampling_params: SamplingParams | None
+    pooling_params: PoolingParams | None
+    eos_token_id: int | None
     arrival_time: float
-    lora_request: Optional[LoRARequest]
-    cache_salt: Optional[str]
-    data_parallel_rank: Optional[int]
-    prompt_embeds: Optional[torch.Tensor] = None
+    lora_request: LoRARequest | None
+    cache_salt: str | None
+    data_parallel_rank: int | None
+    prompt_embeds: torch.Tensor | None = None
 
     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
@@ -68,11 +70,12 @@ class EngineCoreRequest(
     current_wave: int = 0
     priority: int = 0
 
-    trace_headers: Optional[Mapping[str, str]] = None
+    trace_headers: Mapping[str, str] | None = None
 
 
 class EngineCoreEventType(enum.IntEnum):
     """The type of engine core request event."""
+
     QUEUED = 1
     SCHEDULED = 2
     PREEMPTED = 3
@@ -85,70 +88,68 @@ class EngineCoreEvent(msgspec.Struct):
     frontend to calculate intervals between engine core events. These
     timestamps should not be compared with timestamps from other processes.
     """
+
     type: EngineCoreEventType
     timestamp: float
 
     @classmethod
-    def new_event(cls,
-                  event_type: EngineCoreEventType,
-                  timestamp: Optional[float] = None) -> "EngineCoreEvent":
+    def new_event(
+        cls, event_type: EngineCoreEventType, timestamp: float | None = None
+    ) -> "EngineCoreEvent":
         timestamp = time.monotonic() if timestamp is None else timestamp
         return cls(event_type, timestamp)
 
 
 class EngineCoreOutput(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
-
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,
+):  # type: ignore[call-arg]
     request_id: str
     new_token_ids: list[int]
 
-    new_logprobs: Optional[LogprobsLists] = None
-    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None
+    new_logprobs: LogprobsLists | None = None
+    new_prompt_logprobs_tensors: LogprobsTensors | None = None
 
-    pooling_output: Optional[torch.Tensor] = None
+    pooling_output: torch.Tensor | None = None
 
-    finish_reason: Optional[FinishReason] = None
-    stop_reason: Union[int, str, None] = None
-    events: Optional[list[EngineCoreEvent]] = None
-    kv_transfer_params: Optional[dict[str, Any]] = None
+    finish_reason: FinishReason | None = None
+    stop_reason: int | str | None = None
+    events: list[EngineCoreEvent] | None = None
+    kv_transfer_params: dict[str, Any] | None = None
 
-    trace_headers: Optional[Mapping[str, str]] = None
+    trace_headers: Mapping[str, str] | None = None
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
 
+    # The number of NaNs in logits.
+    # A value greater than 0 indicates that the output is corrupted.
+    num_nans_in_logits: int = 0
+
     @property
     def finished(self) -> bool:
         return self.finish_reason is not None
 
 
-class UtilityResult:
-    """Wrapper for special handling when serializing/deserializing."""
-
-    def __init__(self, r: Any = None):
-        self.result = r
-
-
 class UtilityOutput(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
-
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    gc=False,
+):  # type: ignore[call-arg]
     call_id: int
 
     # Non-None implies the call failed, result should be None.
-    failure_message: Optional[str] = None
-    result: Optional[UtilityResult] = None
+    failure_message: str | None = None
+    result: UtilityResult | None = None
 
 
 class EngineCoreOutputs(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
-
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,
+):  # type: ignore[call-arg]
     # NOTE(Nick): We could consider ways to make this more compact,
     # e.g. columnwise layout
 
@@ -156,18 +157,18 @@ class EngineCoreOutputs(
 
     # [num_reqs]
     outputs: list[EngineCoreOutput] = []
-    scheduler_stats: Optional[SchedulerStats] = None
+    scheduler_stats: SchedulerStats | None = None
     timestamp: float = 0.0
 
-    utility_output: Optional[UtilityOutput] = None
-    finished_requests: Optional[set[str]] = None
+    utility_output: UtilityOutput | None = None
+    finished_requests: set[str] | None = None
 
     # In DP case, used to signal that the current wave of requests
     # has finished and the engines are paused.
-    wave_complete: Optional[int] = None
+    wave_complete: int | None = None
     # In DP case, used to signal that a request was received for an
     # "old" wave, so the next wave needs to be started in other engines.
-    start_wave: Optional[int] = None
+    start_wave: int | None = None
 
     def __post_init__(self):
         if self.timestamp == 0.0:
@@ -179,12 +180,13 @@ class EngineCoreRequestType(enum.Enum):
     Request types defined as hex byte strings, so it can be sent over sockets
     without separate encoding step.
     """
-    ADD = b'\x00'
-    ABORT = b'\x01'
-    START_DP_WAVE = b'\x02'
-    UTILITY = b'\x03'
+
+    ADD = b"\x00"
+    ABORT = b"\x01"
+    START_DP_WAVE = b"\x02"
+    UTILITY = b"\x03"
     # Sentinel used within EngineCoreProc.
-    EXECUTOR_FAILED = b'\x04'
+    EXECUTOR_FAILED = b"\x04"
 
 
 class ReconfigureDistributedRequest(msgspec.Struct):
@@ -199,5 +201,6 @@ class ReconfigureRankType(enum.IntEnum):
     """
     Rank type for reconfiguring distributed request.
     """
+
     KEEP_CURRENT_RANK = -1
     SHUTDOWN_CURRENT_RANK = -2
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 757baecea9ce09ee066336fb32c8c05c81da99d5..c160c7cbcab4a03a5052ba7a3cb156e94006991c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -6,43 +6,45 @@ import socket
 import time
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
-from typing import Any, Optional, Union
+from typing import Any, cast
 
 import numpy as np
 import torch
 
 import vllm.envs as envs
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.utils import _validate_truncation_size
-from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
 from vllm.inputs import PromptType
-from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
 from vllm.tracing import init_tracer
-from vllm.transformers_utils.config import (
-    maybe_register_config_serialize_by_value)
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
-                                               init_tokenizer_from_configs)
+from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
+from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv,
-                        deprecate_kwargs)
+from vllm.utils.async_utils import cancel_task_threadsafe
+from vllm.utils.collection_utils import as_list
+from vllm.utils.func_utils import deprecate_kwargs
+from vllm.utils.math_utils import cdiv
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
-from vllm.v1.engine.output_processor import (OutputProcessor,
-                                             RequestOutputCollector)
+from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
-from vllm.v1.executor.abstract import Executor
-from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
+from vllm.v1.executor import Executor
+from vllm.v1.metrics.loggers import (
+    StatLoggerFactory,
+    StatLoggerManager,
+    load_stat_logger_plugin_factories,
+)
 from vllm.v1.metrics.prometheus import shutdown_prometheus
 from vllm.v1.metrics.stats import IterationStats
 
@@ -50,7 +52,6 @@ logger = init_logger(__name__)
 
 
 class AsyncLLM(EngineClient):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -61,8 +62,9 @@ class AsyncLLM(EngineClient):
         use_cached_outputs: bool = False,
         log_requests: bool = True,
         start_engine_loop: bool = True,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
-        client_addresses: Optional[dict[str, str]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
+        aggregate_engine_logging: bool = False,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ) -> None:
@@ -86,13 +88,6 @@ class AsyncLLM(EngineClient):
         Returns:
             None
         """
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
-                "This should not happen. As a workaround, try using "
-                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
-
         # Ensure we can serialize custom transformer configs
         maybe_register_config_serialize_by_value()
 
@@ -101,33 +96,37 @@ class AsyncLLM(EngineClient):
         self.observability_config = vllm_config.observability_config
         self.log_requests = log_requests
 
-        self.log_stats = log_stats or (stat_loggers is not None)
-        if not log_stats and stat_loggers is not None:
+        custom_stat_loggers = list(stat_loggers or [])
+        custom_stat_loggers.extend(load_stat_logger_plugin_factories())
+
+        has_custom_loggers = bool(custom_stat_loggers)
+        self.log_stats = log_stats or has_custom_loggers
+        if not log_stats and has_custom_loggers:
             logger.info(
-                "AsyncLLM created with log_stats=False and non-empty custom "
-                "logger list; enabling logging without default stat loggers")
+                "AsyncLLM created with log_stats=False, "
+                "but custom stat loggers were found; "
+                "enabling logging without default stat loggers."
+            )
 
         if self.model_config.skip_tokenizer_init:
-            self.tokenizer = None
+            tokenizer = None
         else:
-            # Tokenizer (+ ensure liveness if running in another process).
-            self.tokenizer = init_tokenizer_from_configs(
-                model_config=vllm_config.model_config)
+            tokenizer = init_tokenizer_from_configs(self.model_config)
 
-        # Processor (converts Inputs --> EngineCoreRequests).
-        self.processor = Processor(
-            vllm_config=vllm_config,
-            tokenizer=self.tokenizer,
-            mm_registry=mm_registry,
+        self.processor = Processor(self.vllm_config, tokenizer)
+        self.io_processor = get_io_processor(
+            self.vllm_config,
+            self.model_config.io_processor_plugin,
         )
 
         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
-        self.output_processor = OutputProcessor(self.tokenizer,
-                                                log_stats=self.log_stats)
-        if self.observability_config.otlp_traces_endpoint is not None:
-            tracer = init_tracer(
-                "vllm.llm_engine",
-                self.observability_config.otlp_traces_endpoint)
+        stream_interval = self.vllm_config.scheduler_config.stream_interval
+        self.output_processor = OutputProcessor(
+            self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval
+        )
+        endpoint = self.observability_config.otlp_traces_endpoint
+        if endpoint is not None:
+            tracer = init_tracer("vllm.llm_engine", endpoint)
             self.output_processor.tracer = tracer
 
         # EngineCore (starts the engine in background process).
@@ -141,18 +140,19 @@ class AsyncLLM(EngineClient):
         )
 
         # Loggers.
-        self.logger_manager: Optional[StatLoggerManager] = None
+        self.logger_manager: StatLoggerManager | None = None
         if self.log_stats:
             self.logger_manager = StatLoggerManager(
                 vllm_config=vllm_config,
                 engine_idxs=self.engine_core.engine_ranks_managed,
-                custom_stat_loggers=stat_loggers,
+                custom_stat_loggers=custom_stat_loggers,
                 enable_default_loggers=log_stats,
                 client_count=client_count,
+                aggregate_engine_logging=aggregate_engine_logging,
             )
             self.logger_manager.log_engine_initialized()
 
-        self.output_handler: Optional[asyncio.Task] = None
+        self.output_handler: asyncio.Task | None = None
         try:
             # Start output handler eagerly if we are in the asyncio eventloop.
             asyncio.get_running_loop()
@@ -163,7 +163,8 @@ class AsyncLLM(EngineClient):
         if envs.VLLM_TORCH_PROFILER_DIR:
             logger.info(
                 "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s",  # noqa: E501
-                envs.VLLM_TORCH_PROFILER_DIR)
+                envs.VLLM_TORCH_PROFILER_DIR,
+            )
             worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
             self.profiler = torch.profiler.profile(
                 activities=[
@@ -171,38 +172,33 @@ class AsyncLLM(EngineClient):
                 ],
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    envs.VLLM_TORCH_PROFILER_DIR,
-                    worker_name=worker_name,
-                    use_gzip=True))
+                    envs.VLLM_TORCH_PROFILER_DIR, worker_name=worker_name, use_gzip=True
+                ),
+            )
         else:
             self.profiler = None
 
     @classmethod
     @deprecate_kwargs(
         "disable_log_requests",
-        additional_message=("This argument will have no effect. "
-                            "Use `enable_log_requests` instead."),
+        additional_message=(
+            "This argument will have no effect. Use `enable_log_requests` instead."
+        ),
     )
     def from_vllm_config(
-            cls,
-            vllm_config: VllmConfig,
-            start_engine_loop: bool = True,
-            usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-            stat_loggers: Optional[list[StatLoggerFactory]] = None,
-            enable_log_requests: bool = False,
-            disable_log_stats: bool = False,
-            client_addresses: Optional[dict[str, str]] = None,
-            client_count: int = 1,
-            client_index: int = 0,
-            disable_log_requests: bool = True,  # Deprecated, will be removed
+        cls,
+        vllm_config: VllmConfig,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: list[StatLoggerFactory] | None = None,
+        enable_log_requests: bool = False,
+        aggregate_engine_logging: bool = False,
+        disable_log_stats: bool = False,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+        disable_log_requests: bool = True,  # Deprecated, will be removed
     ) -> "AsyncLLM":
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
-                "This should not happen. As a workaround, try using "
-                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
-
         # Create the LLMEngine.
         return cls(
             vllm_config=vllm_config,
@@ -211,6 +207,7 @@ class AsyncLLM(EngineClient):
             stat_loggers=stat_loggers,
             log_requests=enable_log_requests,
             log_stats=not disable_log_stats,
+            aggregate_engine_logging=aggregate_engine_logging,
             usage_context=usage_context,
             client_addresses=client_addresses,
             client_count=client_count,
@@ -223,7 +220,7 @@ class AsyncLLM(EngineClient):
         engine_args: AsyncEngineArgs,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
     ) -> "AsyncLLM":
         """Create an AsyncLLM from the EngineArgs."""
 
@@ -253,7 +250,9 @@ class AsyncLLM(EngineClient):
         if engine_core := getattr(self, "engine_core", None):
             engine_core.shutdown()
 
-        cancel_task_threadsafe(getattr(self, "output_handler", None))
+        handler = getattr(self, "output_handler", None)
+        if handler is not None:
+            cancel_task_threadsafe(handler)
 
     async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return await self.engine_core.get_supported_tasks_async()
@@ -261,14 +260,15 @@ class AsyncLLM(EngineClient):
     async def add_request(
         self,
         request_id: str,
-        prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt: EngineCoreRequest | PromptType,
+        params: SamplingParams | PoolingParams,
+        arrival_time: float | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
+        data_parallel_rank: int | None = None,
+        prompt_text: str | None = None,
     ) -> RequestOutputCollector:
         """Add new request to the AsyncLLM."""
 
@@ -281,33 +281,61 @@ class AsyncLLM(EngineClient):
         queue = RequestOutputCollector(output_kind=params.output_kind)
 
         # Convert Input --> Request.
-        prompt_str, request = self.processor.process_inputs(
-            request_id, prompt, params, arrival_time, lora_request,
-            tokenization_kwargs, trace_headers, priority, data_parallel_rank)
+        if isinstance(prompt, EngineCoreRequest):
+            request = prompt
+        else:
+            assert prompt_text is None
+            logger.warning_once(
+                "Processor has been moved under OpenAIServing and will "
+                "be removed from AsyncLLM in v0.13."
+            )
+            request = self.processor.process_inputs(
+                request_id,
+                prompt,
+                params,
+                arrival_time,
+                lora_request,
+                tokenization_kwargs,
+                trace_headers,
+                priority,
+                data_parallel_rank,
+            )
+            if isinstance(prompt, str):
+                prompt_text = prompt
+            elif isinstance(prompt, Mapping):
+                prompt_text = cast(str | None, prompt.get("prompt"))
 
         if is_pooling or params.n == 1:
-            await self._add_request(request, prompt_str, None, 0, queue)
+            await self._add_request(request, prompt_text, None, 0, queue)
             return queue
 
+        # Get the updated SamplingParams from the request, which
+        # were cloned/updated in processor.process_inputs above.
+        parent_params = request.sampling_params
+        assert parent_params is not None
+
         # Fan out child requests (for n>1).
-        parent_request = ParentRequest(request_id, params)
-        for idx in range(params.n):
-            request_id, params = parent_request.get_child_info(idx)
-            child_request = request if idx == params.n - 1 else copy(request)
+        parent_request = ParentRequest(request_id, parent_params)
+        for idx in range(parent_params.n):
+            request_id, child_params = parent_request.get_child_info(idx)
+            child_request = request if idx == parent_params.n - 1 else copy(request)
             child_request.request_id = request_id
-            child_request.sampling_params = params
-            await self._add_request(child_request, prompt_str, parent_request,
-                                    idx, queue)
+            child_request.sampling_params = child_params
+            await self._add_request(
+                child_request, prompt_text, parent_request, idx, queue
+            )
         return queue
 
-    async def _add_request(self, request: EngineCoreRequest,
-                           prompt: Optional[str],
-                           parent_req: Optional[ParentRequest], index: int,
-                           queue: RequestOutputCollector):
-
+    async def _add_request(
+        self,
+        request: EngineCoreRequest,
+        prompt: str | None,
+        parent_req: ParentRequest | None,
+        index: int,
+        queue: RequestOutputCollector,
+    ):
         # Add the request to OutputProcessor (this process).
-        self.output_processor.add_request(request, prompt, parent_req, index,
-                                          queue)
+        self.output_processor.add_request(request, prompt, parent_req, index, queue)
 
         # Add the EngineCoreRequest to EngineCore (separate process).
         await self.engine_core.add_request_async(request)
@@ -322,13 +350,16 @@ class AsyncLLM(EngineClient):
     # re-multiplexed in the API server anyhow.
     async def generate(
         self,
-        prompt: PromptType,
+        prompt: EngineCoreRequest | PromptType,
         sampling_params: SamplingParams,
         request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        *,
+        prompt_text: str | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
+        data_parallel_rank: int | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -345,12 +376,15 @@ class AsyncLLM(EngineClient):
         returning the RequestOutput back to the caller.
         """
 
-        if (self.vllm_config.cache_config.kv_sharing_fast_prefill
-                and sampling_params.prompt_logprobs):
+        if (
+            self.vllm_config.cache_config.kv_sharing_fast_prefill
+            and sampling_params.prompt_logprobs
+        ):
             raise ValueError(
                 "--kv-sharing-fast-prefill produces incorrect logprobs for "
                 "prompt tokens, please disable it when the requests need "
-                "prompt logprobs")
+                "prompt logprobs"
+            )
 
         try:
             # We start the output_handler on the first call to generate() so
@@ -358,24 +392,26 @@ class AsyncLLM(EngineClient):
             # to handle startup failure gracefully in the OpenAI server.
             self._run_output_handler()
 
-            tokenization_kwargs: dict[str, Any] = {}
-            truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
+            if tokenization_kwargs is None:
+                tokenization_kwargs = {}
+                truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
 
-            _validate_truncation_size(
-                self.model_config.max_model_len,
-                truncate_prompt_tokens,
-                tokenization_kwargs,
-            )
+                _validate_truncation_size(
+                    self.model_config.max_model_len,
+                    truncate_prompt_tokens,
+                    tokenization_kwargs,
+                )
 
             q = await self.add_request(
                 request_id,
                 prompt,
                 sampling_params,
                 lora_request=lora_request,
+                tokenization_kwargs=tokenization_kwargs,
                 trace_headers=trace_headers,
                 priority=priority,
-                tokenization_kwargs=tokenization_kwargs,
                 data_parallel_rank=data_parallel_rank,
+                prompt_text=prompt_text,
             )
 
             # The output_handler task pushes items into the queue.
@@ -389,6 +425,7 @@ class AsyncLLM(EngineClient):
                 # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
                 finished = out.finished
+                assert isinstance(out, RequestOutput)
                 yield out
 
         # If the request is disconnected by the client, generate()
@@ -431,6 +468,7 @@ class AsyncLLM(EngineClient):
         output_processor = self.output_processor
         log_stats = self.log_stats
         logger_manager = self.logger_manager
+        processor = self.processor
 
         async def output_handler():
             try:
@@ -439,23 +477,26 @@ class AsyncLLM(EngineClient):
                     outputs = await engine_core.get_output_async()
                     num_outputs = len(outputs.outputs)
 
-                    iteration_stats = IterationStats() if (
-                        log_stats and num_outputs) else None
+                    iteration_stats = (
+                        IterationStats() if (log_stats and num_outputs) else None
+                    )
 
                     # Split outputs into chunks of at most
                     # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
                     # event loop for too long.
-                    if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
-                        slices = (outputs.outputs, )
+                    if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
+                        slices = (outputs.outputs,)
                     else:
                         slices = np.array_split(
                             outputs.outputs,
-                            cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
+                            cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE),
+                        )
 
                     for i, outputs_slice in enumerate(slices):
                         # 2) Process EngineCoreOutputs.
                         processed_outputs = output_processor.process_outputs(
-                            outputs_slice, outputs.timestamp, iteration_stats)
+                            outputs_slice, outputs.timestamp, iteration_stats
+                        )
                         # NOTE: RequestOutputs are pushed to their queues.
                         assert not processed_outputs.request_outputs
 
@@ -465,7 +506,10 @@ class AsyncLLM(EngineClient):
 
                         # 3) Abort any reqs that finished due to stop strings.
                         await engine_core.abort_requests_async(
-                            processed_outputs.reqs_to_abort)
+                            processed_outputs.reqs_to_abort
+                        )
+
+                    output_processor.update_scheduler_stats(outputs.scheduler_stats)
 
                     # 4) Logging.
                     # TODO(rob): make into a coroutine and launch it in
@@ -475,6 +519,7 @@ class AsyncLLM(EngineClient):
                             engine_idx=outputs.engine_index,
                             scheduler_stats=outputs.scheduler_stats,
                             iteration_stats=iteration_stats,
+                            mm_cache_stats=processor.stat_mm_cache(),
                         )
             except Exception as e:
                 logger.exception("AsyncLLM output_handler failed.")
@@ -482,11 +527,12 @@ class AsyncLLM(EngineClient):
 
         self.output_handler = asyncio.create_task(output_handler())
 
-    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
+    async def abort(self, request_id: str | Iterable[str]) -> None:
         """Abort RequestId in OutputProcessor and EngineCore."""
 
-        request_ids = (request_id, ) if isinstance(
-            request_id, str) else as_list(request_id)
+        request_ids = (
+            (request_id,) if isinstance(request_id, str) else as_list(request_id)
+        )
         all_request_ids = self.output_processor.abort_requests(request_ids)
         await self.engine_core.abort_requests_async(all_request_ids)
 
@@ -498,11 +544,11 @@ class AsyncLLM(EngineClient):
         prompt: PromptType,
         pooling_params: PoolingParams,
         request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        lora_request: LoRARequest | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        truncate_prompt_tokens: Optional[int] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        truncate_prompt_tokens: int | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -525,7 +571,7 @@ class AsyncLLM(EngineClient):
             self._run_output_handler()
 
             if tokenization_kwargs is None:
-                tokenization_kwargs = dict[str, Any]()
+                tokenization_kwargs = {}
             _validate_truncation_size(
                 self.model_config.max_model_len,
                 truncate_prompt_tokens,
@@ -537,9 +583,9 @@ class AsyncLLM(EngineClient):
                 prompt,
                 pooling_params,
                 lora_request=lora_request,
+                tokenization_kwargs=tokenization_kwargs,
                 trace_headers=trace_headers,
                 priority=priority,
-                tokenization_kwargs=tokenization_kwargs,
             )
 
             # The output_handler task pushes items into the queue.
@@ -582,24 +628,24 @@ class AsyncLLM(EngineClient):
                 logger.info("Request %s failed.", request_id)
             raise EngineGenerateError() from e
 
-    async def get_vllm_config(self) -> VllmConfig:
-        return self.vllm_config
-
-    async def get_model_config(self) -> ModelConfig:
-        return self.model_config
+    @property
+    def tokenizer(self) -> AnyTokenizer | None:
+        return self.processor.tokenizer
 
-    async def get_input_preprocessor(self) -> InputPreprocessor:
-        return self.processor.input_preprocessor
+    @tokenizer.setter
+    def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
+        self.processor.tokenizer = tokenizer
 
     async def get_tokenizer(self) -> AnyTokenizer:
         if self.tokenizer is None:
-            raise ValueError("Unable to get tokenizer because "
-                             "skip_tokenizer_init is True")
+            raise ValueError(
+                "Unable to get tokenizer because skip_tokenizer_init is True"
+            )
 
         return self.tokenizer
 
     async def is_tracing_enabled(self) -> bool:
-        return self.observability_config.otlp_traces_endpoint is not None
+        return self.observability_config.otlp_traces_endpoint is not None  # type: ignore
 
     async def do_log_stats(self) -> None:
         if self.logger_manager:
@@ -623,22 +669,25 @@ class AsyncLLM(EngineClient):
         await asyncio.gather(*coros)
 
     async def reset_mm_cache(self) -> None:
-        self.processor.clear_cache()
+        self.processor.clear_mm_cache()
         await self.engine_core.reset_mm_cache_async()
 
-    async def reset_prefix_cache(self,
-                                 device: Optional[Device] = None) -> None:
-        if device == Device.CPU:
-            raise ValueError("Not supported on CPU.")
+    async def reset_prefix_cache(self) -> None:
         await self.engine_core.reset_prefix_cache_async()
 
     async def sleep(self, level: int = 1) -> None:
         await self.reset_prefix_cache()
         await self.engine_core.sleep_async(level)
 
-    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(1, level)
+
+    async def wake_up(self, tags: list[str] | None = None) -> None:
         await self.engine_core.wake_up_async(tags)
 
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(0, 0)
+
     async def is_sleeping(self) -> bool:
         return await self.engine_core.is_sleeping_async()
 
@@ -658,16 +707,19 @@ class AsyncLLM(EngineClient):
         """Prevent an adapter from being evicted."""
         return await self.engine_core.pin_lora_async(lora_id)
 
-    async def collective_rpc(self,
-                             method: str,
-                             timeout: Optional[float] = None,
-                             args: tuple = (),
-                             kwargs: Optional[dict] = None):
+    async def collective_rpc(
+        self,
+        method: str,
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+    ):
         """
         Perform a collective RPC call to the given path.
         """
         return await self.engine_core.collective_rpc_async(
-            method, timeout, args, kwargs)
+            method, timeout, args, kwargs
+        )
 
     async def wait_for_requests_to_drain(self, drain_timeout: int = 300):
         """Wait for all requests to be drained."""
@@ -677,16 +729,17 @@ class AsyncLLM(EngineClient):
                 logger.info("Engines are idle, requests have been drained")
                 return
 
-            logger.info(
-                "Engines are still running, waiting for requests to drain...")
+            logger.info("Engines are still running, waiting for requests to drain...")
             await asyncio.sleep(1)  # Wait 1 second before checking again
 
-        raise TimeoutError(f"Timeout reached after {drain_timeout} seconds "
-                           "waiting for requests to drain.")
+        raise TimeoutError(
+            f"Timeout reached after {drain_timeout} seconds "
+            "waiting for requests to drain."
+        )
 
-    async def scale_elastic_ep(self,
-                               new_data_parallel_size: int,
-                               drain_timeout: int = 300):
+    async def scale_elastic_ep(
+        self, new_data_parallel_size: int, drain_timeout: int = 300
+    ):
         """
         Scale up or down the data parallel size by adding or removing
         engine cores.
@@ -695,22 +748,24 @@ class AsyncLLM(EngineClient):
             drain_timeout:
                 Maximum time to wait for requests to drain (seconds)
         """
-        old_data_parallel_size = \
-            self.vllm_config.parallel_config.data_parallel_size
+        old_data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
         if old_data_parallel_size == new_data_parallel_size:
-            logger.info("Data parallel size is already %s, skipping scale",
-                        new_data_parallel_size)
+            logger.info(
+                "Data parallel size is already %s, skipping scale",
+                new_data_parallel_size,
+            )
             return
         logger.info(
-            "Waiting for requests to drain before "
-            "scaling up to %s engines...", new_data_parallel_size)
+            "Waiting for requests to drain before scaling up to %s engines...",
+            new_data_parallel_size,
+        )
         await self.wait_for_requests_to_drain(drain_timeout)
         logger.info(
-            "Requests have been drained, proceeding with scale "
-            "to %s engines", new_data_parallel_size)
+            "Requests have been drained, proceeding with scale to %s engines",
+            new_data_parallel_size,
+        )
         await self.engine_core.scale_elastic_ep(new_data_parallel_size)
-        self.vllm_config.parallel_config.data_parallel_size = \
-            new_data_parallel_size
+        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
 
         # recreate stat loggers
         if new_data_parallel_size > old_data_parallel_size and self.log_stats:
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index 596edfdbe24f80f4570e38b70eb13a265c738e2c..953342cdd5d05dc1df5ef7ef0c7c014584e9c107 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -4,14 +4,14 @@ import copy
 import multiprocessing
 import time
 import weakref
-from typing import Optional
 
 import msgspec.msgpack
 import zmq
 
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import get_mp_context, make_zmq_socket, set_process_title
+from vllm.utils.network_utils import make_zmq_socket
+from vllm.utils.system_utils import get_mp_context, set_process_title
 from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType
 from vllm.v1.serial_utils import MsgpackDecoder
 from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
@@ -56,7 +56,6 @@ class DPCoordinator:
     """
 
     def __init__(self, parallel_config: ParallelConfig):
-
         dp_size = parallel_config.data_parallel_size
         assert dp_size > 1, "Coordinator only used for data parallel"
 
@@ -68,7 +67,8 @@ class DPCoordinator:
         # either external or hybrid DP LB mode.
         local_only = not (external_lb or hybrid_lb)
         front_publish_address = get_engine_client_zmq_addr(
-            local_only=local_only, host=host)
+            local_only=local_only, host=host
+        )
 
         local_only_eng = dp_size == parallel_config.data_parallel_size_local
         back_publish_address = get_engine_client_zmq_addr(local_only_eng, host)
@@ -84,7 +84,8 @@ class DPCoordinator:
                 "back_output_address": back_output_address,
                 "back_publish_address": back_publish_address,
             },
-            daemon=True)
+            daemon=True,
+        )
         self.proc.start()
 
         self.stats_publish_address = front_publish_address
@@ -104,16 +105,12 @@ class DPCoordinator:
 
 
 class EngineState:
-
     def __init__(self):
         self.request_counts = [0, 0]  # [waiting, running]
 
 
 class DPCoordinatorProc:
-
-    def __init__(self,
-                 engine_count: int,
-                 min_stats_update_interval_ms: int = 100):
+    def __init__(self, engine_count: int, min_stats_update_interval_ms: int = 100):
         set_process_title("DPCoordinator")
         self.ctx = zmq.Context()
 
@@ -131,7 +128,8 @@ class DPCoordinatorProc:
     ):
         coordinator = DPCoordinatorProc(
             engine_count=engine_count,
-            min_stats_update_interval_ms=min_stats_update_interval_ms)
+            min_stats_update_interval_ms=min_stats_update_interval_ms,
+        )
         try:
             coordinator.process_input_socket(
                 front_publish_address,
@@ -141,10 +139,12 @@ class DPCoordinatorProc:
         except KeyboardInterrupt:
             logger.info("DP Coordinator process exiting")
 
-    def process_input_socket(self, front_publish_address: str,
-                             back_output_address: str,
-                             back_publish_address: str):
-
+    def process_input_socket(
+        self,
+        front_publish_address: str,
+        back_output_address: str,
+        back_publish_address: str,
+    ):
         decoder = MsgpackDecoder(EngineCoreOutputs)
 
         # For tracking request wave progression.
@@ -155,31 +155,35 @@ class DPCoordinatorProc:
         stats_changed = False
         last_stats_step = -1
         last_stats_wave = -1
-        last_step_counts: Optional[list[list[int]]] = None
+        last_step_counts: list[list[int]] | None = None
 
-        with make_zmq_socket(
+        with (
+            make_zmq_socket(
                 path=front_publish_address,  # IPC
                 ctx=self.ctx,
                 socket_type=zmq.XPUB,
                 bind=True,
-        ) as publish_front, make_zmq_socket(
+            ) as publish_front,
+            make_zmq_socket(
                 path=back_output_address,  # IPC or TCP
                 ctx=self.ctx,
                 socket_type=zmq.PULL,
                 bind=True,
-        ) as output_back, make_zmq_socket(
+            ) as output_back,
+            make_zmq_socket(
                 path=back_publish_address,  # IPC or TCP
                 ctx=self.ctx,
                 socket_type=zmq.XPUB,
                 bind=True,
-        ) as publish_back:
-
+            ) as publish_back,
+        ):
             # Wait until all engines subscribe.
             for _ in self.engines:
-                if publish_back.recv() != b'\x01':
+                if publish_back.recv() != b"\x01":
                     logger.error(
                         "DP Coordinator received unexpected message while "
-                        "waiting for engines to subscribe")
+                        "waiting for engines to subscribe"
+                    )
                     return
             # Send ready message to engines.
             publish_back.send(b"READY")
@@ -194,15 +198,13 @@ class DPCoordinatorProc:
                 elapsed = int(time.time() * 1000) - last_publish_time
                 # Send at stats_update_interval_ms interval if the stats have
                 # changed, or otherwise every 5 seconds.
-                wait_for = (self.stats_update_interval_ms
-                            if stats_changed else 5000)
+                wait_for = self.stats_update_interval_ms if stats_changed else 5000
 
                 # Wait at least 50ms to ensure we've received all stats for
                 # the current step.
                 min_timeout = 50 if last_step_counts is None else 0
 
-                events = poller.poll(timeout=max(min_timeout, wait_for -
-                                                 elapsed))
+                events = poller.poll(timeout=max(min_timeout, wait_for - elapsed))
                 if not events:
                     # Poller timeout - publish current stats to front-ends.
                     if last_step_counts is not None:
@@ -212,8 +214,7 @@ class DPCoordinatorProc:
                         engine_req_counts_list = self._get_engine_counts()
                         stats_changed = False
 
-                    to_publish = (engine_req_counts_list, current_wave,
-                                  engines_running)
+                    to_publish = (engine_req_counts_list, current_wave, engines_running)
                     publish_front.send(msgspec.msgpack.encode(to_publish))
                     last_publish_time = int(time.time() * 1000)
                     continue
@@ -223,13 +224,16 @@ class DPCoordinatorProc:
 
                 if publish_front in events:
                     buffer = publish_front.recv()
-                    if buffer in (b'\x01', b'\x00'):
+                    if buffer in (b"\x01", b"\x00"):
                         # Ignore subscription messages.
                         continue
 
                     decoded = msgspec.msgpack.decode(buffer)
-                    if isinstance(decoded, (list, tuple)) and len(
-                            decoded) == 2 and decoded[0] == "SCALE_ELASTIC_EP":
+                    if (
+                        isinstance(decoded, (list, tuple))
+                        and len(decoded) == 2
+                        and decoded[0] == "SCALE_ELASTIC_EP"
+                    ):
                         # Handle scale up notification
                         new_engine_count = decoded[1]
                         current_count = len(self.engines)
@@ -248,13 +252,17 @@ class DPCoordinatorProc:
                             # engine
                             engines_running = False
                             logger.info(
-                                "DPCoordinator scaled up from %s to %s "
-                                "engines", current_count, new_engine_count)
+                                "DPCoordinator scaled up from %s to %s engines",
+                                current_count,
+                                new_engine_count,
+                            )
                         else:
                             self.engines = self.engines[:new_engine_count]
                             logger.info(
-                                "DPCoordinator scaled down from %s to %s "
-                                "engines", current_count, new_engine_count)
+                                "DPCoordinator scaled down from %s to %s engines",
+                                current_count,
+                                new_engine_count,
+                            )
                         continue  # Skip normal engine notification processing
 
                     # We received a message on the front-end XPUB socket,
@@ -270,8 +278,9 @@ class DPCoordinatorProc:
 
                         engines_running = True
                         wave_state_changed = True
-                        self._send_start_wave(publish_back, current_wave,
-                                              engine_to_exclude)
+                        self._send_start_wave(
+                            publish_back, current_wave, engine_to_exclude
+                        )
 
                 if output_back in events:
                     # We received a message from one of the engines.
@@ -290,21 +299,28 @@ class DPCoordinatorProc:
                         stats = self.engines[eng_index].request_counts
                         stats_step = scheduler_stats.step_counter
                         stats_wave = scheduler_stats.current_wave
-                        if (stats_wave > last_stats_wave
-                                or stats_wave == last_stats_wave
-                                and stats_step > last_stats_step):
+                        if (
+                            stats_wave > last_stats_wave
+                            or stats_wave == last_stats_wave
+                            and stats_step > last_stats_step
+                        ):
                             if stats_changed:
-                                last_step_counts = self._get_engine_counts(
-                                    do_copy=True)
+                                last_step_counts = self._get_engine_counts(do_copy=True)
                             last_stats_step = stats_step
                             last_stats_wave = stats_wave
                         elif stats_wave != last_stats_wave or (
-                                stats_step != last_stats_step):
+                            stats_step != last_stats_step
+                        ):
                             logger.warning(
                                 "Received stats for out-of-order "
                                 "step (%d, %d) from engine %d (expected "
-                                "> (%d, %d))", stats_wave, stats_step,
-                                eng_index, last_stats_wave, last_stats_step)
+                                "> (%d, %d))",
+                                stats_wave,
+                                stats_step,
+                                eng_index,
+                                last_stats_wave,
+                                last_stats_step,
+                            )
                         stats[0] = scheduler_stats.num_waiting_reqs
                         stats[1] = scheduler_stats.num_running_reqs
                         stats_changed = True
@@ -315,20 +331,24 @@ class DPCoordinatorProc:
                         # (engines_running==False).
                         if current_wave <= wave:
                             new_wave = wave + 1
-                            logger.debug("Moving DP wave from %d to %d.",
-                                         current_wave, new_wave)
+                            logger.debug(
+                                "Moving DP wave from %d to %d.", current_wave, new_wave
+                            )
                             current_wave = new_wave
                             engines_running = False
                             wave_state_changed = True
                     elif (wave := outputs.start_wave) is not None and (
-                            wave > current_wave or
-                        (wave == current_wave and not engines_running)):
+                        wave > current_wave
+                        or (wave == current_wave and not engines_running)
+                    ):
                         # 3. The engine received request for a non-current wave
                         # so we must ensure that other engines progress to the
                         # next wave (race condition handling).
                         logger.debug(
                             "Starting wave %d after notification of "
-                            "stale wave request from engine.", wave)
+                            "stale wave request from engine.",
+                            wave,
+                        )
                         current_wave = wave
                         engines_running = True
                         wave_state_changed = True
@@ -339,16 +359,16 @@ class DPCoordinatorProc:
                     publish_front.send(msgspec.msgpack.encode(message))
 
     @staticmethod
-    def _send_start_wave(socket: zmq.Socket, wave: int,
-                         exclude_engine_index: Optional[int]):
+    def _send_start_wave(
+        socket: zmq.Socket, wave: int, exclude_engine_index: int | None
+    ):
         """Broadcast the START_DP_WAVE message to all the engines.
         It includes the current wave number and index of engine which
         has already received a request with this wave number and so doesn't
         require additional notification.
         """
         wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index))
-        socket.send_multipart(
-            (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
+        socket.send_multipart((EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
 
     def _get_engine_counts(self, do_copy=False) -> list[list[int]]:
         """Return list of [waiting, running] count lists for each engine."""
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index a43042a5510a8632d3abc7e9f3c3c11f5ce42fdd..d49eb752d56a5fb92e39b02a63722382f9f6719d 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,49 +1,62 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import gc
 import os
 import queue
 import signal
 import threading
 import time
 from collections import deque
-from collections.abc import Generator
+from collections.abc import Callable, Generator
 from concurrent.futures import Future
 from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
-from typing import Any, Callable, Optional, TypeVar, Union
+from typing import Any, TypeVar, cast
 
 import msgspec
 import zmq
 
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
+from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import engine_receiver_cache_from_config
 from vllm.tasks import POOLING_TASKS, SupportedTask
-from vllm.transformers_utils.config import (
-    maybe_register_config_serialize_by_value)
-from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket,
-                        resolve_obj_by_qualname, set_process_title)
-from vllm.v1.core.kv_cache_utils import (BlockHash,
-                                         generate_scheduler_kv_cache_config,
-                                         get_kv_cache_configs,
-                                         get_request_block_hasher,
-                                         init_none_hash)
+from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
+from vllm.utils.gc_utils import (
+    freeze_gc_heap,
+    maybe_attach_gc_debug_callback,
+)
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.utils.network_utils import make_zmq_socket
+from vllm.utils.system_utils import decorate_logs, set_process_title
+from vllm.v1.core.kv_cache_utils import (
+    BlockHash,
+    generate_scheduler_kv_cache_config,
+    get_kv_cache_configs,
+    get_request_block_hasher,
+    init_none_hash,
+)
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
-from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
-                            EngineCoreRequestType,
-                            ReconfigureDistributedRequest, ReconfigureRankType,
-                            UtilityOutput, UtilityResult)
-from vllm.v1.engine.utils import (EngineHandshakeMetadata, EngineZmqAddresses,
-                                  get_device_indices)
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.engine import (
+    EngineCoreOutputs,
+    EngineCoreRequest,
+    EngineCoreRequestType,
+    ReconfigureDistributedRequest,
+    ReconfigureRankType,
+    UtilityOutput,
+    UtilityResult,
+)
+from vllm.v1.engine.utils import (
+    EngineHandshakeMetadata,
+    EngineZmqAddresses,
+    get_device_indices,
+)
+from vllm.v1.executor import Executor
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
@@ -57,117 +70,145 @@ logger = init_logger(__name__)
 POLLING_TIMEOUT_S = 2.5
 HANDSHAKE_TIMEOUT_MINS = 5
 
-_R = TypeVar('_R')  # Return type for collective_rpc
+_R = TypeVar("_R")  # Return type for collective_rpc
 
 
 class EngineCore:
     """Inner loop of vLLM's Engine."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: type[Executor],
-                 log_stats: bool,
-                 executor_fail_callback: Optional[Callable] = None):
-
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        executor_fail_callback: Callable | None = None,
+    ):
         # plugins need to be loaded at the engine/scheduler level too
         from vllm.plugins import load_general_plugins
+
         load_general_plugins()
 
         self.vllm_config = vllm_config
-        logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
-                    VLLM_VERSION, vllm_config)
+        if vllm_config.parallel_config.data_parallel_rank == 0:
+            logger.info(
+                "Initializing a V1 LLM engine (v%s) with config: %s",
+                VLLM_VERSION,
+                vllm_config,
+            )
 
         self.log_stats = log_stats
 
         # Setup Model.
         self.model_executor = executor_class(vllm_config)
         if executor_fail_callback is not None:
-            self.model_executor.register_failure_callback(
-                executor_fail_callback)
+            self.model_executor.register_failure_callback(executor_fail_callback)
 
         self.available_gpu_memory_for_kv_cache = -1
 
         # Setup KV Caches and update CacheConfig after profiling.
-        num_gpu_blocks, num_cpu_blocks, kv_cache_config = \
-            self._initialize_kv_caches(vllm_config)
+        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
+            vllm_config
+        )
 
         vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
-        self.collective_rpc("initialize_cache",
-                            args=(num_gpu_blocks, num_cpu_blocks))
+        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))
 
         self.structured_output_manager = StructuredOutputManager(vllm_config)
 
         # Setup scheduler.
-        if isinstance(vllm_config.scheduler_config.scheduler_cls, str):
-            Scheduler = resolve_obj_by_qualname(
-                vllm_config.scheduler_config.scheduler_cls)
-        else:
-            Scheduler = vllm_config.scheduler_config.scheduler_cls
-
-        # This warning can be removed once the V1 Scheduler interface is
-        # finalized and we can maintain support for scheduler classes that
-        # implement it
-        if Scheduler is not V1Scheduler:
-            logger.warning(
-                "Using configured V1 scheduler class %s. "
-                "This scheduler interface is not public and "
-                "compatibility may not be maintained.",
-                vllm_config.scheduler_config.scheduler_cls)
+        Scheduler = vllm_config.scheduler_config.get_scheduler_cls()
 
         if len(kv_cache_config.kv_cache_groups) == 0:
             # Encoder models without KV cache don't support
             # chunked prefill. But do SSM models?
             logger.info("Disabling chunked prefill for model without KVCache")
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+
+        scheduler_block_size = (
+            vllm_config.cache_config.block_size
+            * vllm_config.parallel_config.decode_context_parallel_size
+        )
 
         self.scheduler: SchedulerInterface = Scheduler(
             vllm_config=vllm_config,
             kv_cache_config=kv_cache_config,
             structured_output_manager=self.structured_output_manager,
-            include_finished_set=vllm_config.parallel_config.data_parallel_size
-            > 1,
+            include_finished_set=vllm_config.parallel_config.data_parallel_size > 1,
             log_stats=self.log_stats,
+            block_size=scheduler_block_size,
         )
         self.use_spec_decode = vllm_config.speculative_config is not None
         if self.scheduler.connector is not None:  # type: ignore
-            self.model_executor.init_kv_output_aggregator(
-                self.scheduler.connector.get_finished_count())  # type: ignore
+            self.model_executor.init_kv_output_aggregator(self.scheduler.connector)  # type: ignore
 
         self.mm_registry = mm_registry = MULTIMODAL_REGISTRY
         self.mm_receiver_cache = engine_receiver_cache_from_config(
-            vllm_config, mm_registry)
+            vllm_config, mm_registry
+        )
+
+        # If a KV connector is initialized for scheduler, we want to collect
+        # handshake metadata from all workers so the connector in the scheduler
+        # will have the full context
+        kv_connector = self.scheduler.get_kv_connector()
+        if kv_connector is not None:
+            # Collect and store KV connector xfer metadata from workers
+            # (after KV cache registration)
+            xfer_handshake_metadata = (
+                self.model_executor.get_kv_connector_handshake_metadata()
+            )
+
+            if xfer_handshake_metadata:
+                # xfer_handshake_metadata is list of dicts from workers
+                # Each dict already has structure {tp_rank: metadata}
+                # Merge all worker dicts into a single dict
+                content: dict[int, Any] = {}
+                for worker_dict in xfer_handshake_metadata:
+                    if worker_dict is not None:
+                        content.update(worker_dict)
+                kv_connector.set_xfer_handshake_metadata(content)
 
         # Setup batch queue for pipeline parallelism.
         # Batch queue for scheduled batches. This enables us to asynchronously
         # schedule and execute batches, and is required by pipeline parallelism
         # to eliminate pipeline bubbles.
         self.batch_queue_size = self.model_executor.max_concurrent_batches
-        self.batch_queue: Optional[deque[tuple[Future[ModelRunnerOutput],
-                                               SchedulerOutput]]] = None
+        self.batch_queue: (
+            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]] | None
+        ) = None
         if self.batch_queue_size > 1:
-            logger.info("Batch queue is enabled with size %d",
-                        self.batch_queue_size)
+            logger.info("Batch queue is enabled with size %d", self.batch_queue_size)
             self.batch_queue = deque(maxlen=self.batch_queue_size)
 
-        self.request_block_hasher: Optional[Callable[[Request],
-                                                     list[BlockHash]]] = None
-        if (self.vllm_config.cache_config.enable_prefix_caching
-                or self.scheduler.get_kv_connector() is not None):
+        self.ec_producer = (
+            vllm_config.ec_transfer_config is not None
+            and vllm_config.ec_transfer_config.is_ec_producer
+        )
+        self.is_pooling_model = vllm_config.model_config.runner_type == "pooling"
 
-            block_size = vllm_config.cache_config.block_size
+        self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None
+        if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None:
             caching_hash_fn = get_hash_fn_by_name(
-                vllm_config.cache_config.prefix_caching_hash_algo)
+                vllm_config.cache_config.prefix_caching_hash_algo
+            )
             init_none_hash(caching_hash_fn)
 
             self.request_block_hasher = get_request_block_hasher(
-                block_size, caching_hash_fn)
+                scheduler_block_size, caching_hash_fn
+            )
 
-        self.step_fn = (self.step if self.batch_queue is None else
-                        self.step_with_batch_queue)
+        self.step_fn = (
+            self.step if self.batch_queue is None else self.step_with_batch_queue
+        )
+        self.async_scheduling = vllm_config.scheduler_config.async_scheduling
+
+        # Mark the startup heap as static so that it's ignored by GC.
+        # Reduces pause times of oldest generation collections.
+        freeze_gc_heap()
 
     def _initialize_kv_caches(
-            self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
+        self, vllm_config: VllmConfig
+    ) -> tuple[int, int, KVCacheConfig]:
         start = time.time()
 
         # Get all kv cache needed by the model
@@ -178,28 +219,27 @@ class EngineCore:
             if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
                 dp_group = getattr(self, "dp_group", None)
                 assert dp_group is not None
-                self.available_gpu_memory_for_kv_cache = \
+                self.available_gpu_memory_for_kv_cache = (
                     ParallelConfig.sync_kv_cache_memory_size(dp_group, -1)
-                available_gpu_memory = [
-                    self.available_gpu_memory_for_kv_cache
-                ] * len(kv_cache_specs)
+                )
+                available_gpu_memory = [self.available_gpu_memory_for_kv_cache] * len(
+                    kv_cache_specs
+                )
             else:
                 # Profiles the peak memory usage of the model to determine how
                 # much memory can be allocated for kv cache.
-                available_gpu_memory = (
-                    self.model_executor.determine_available_memory())
-                self.available_gpu_memory_for_kv_cache = \
-                    available_gpu_memory[0]
+                available_gpu_memory = self.model_executor.determine_available_memory()
+                self.available_gpu_memory_for_kv_cache = available_gpu_memory[0]
         else:
             # Attention free models don't need memory for kv cache
             available_gpu_memory = [0] * len(kv_cache_specs)
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
 
-        kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
-                                                available_gpu_memory)
-        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
-            kv_cache_configs)
+        kv_cache_configs = get_kv_cache_configs(
+            vllm_config, kv_cache_specs, available_gpu_memory
+        )
+        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
         num_gpu_blocks = scheduler_kv_cache_config.num_blocks
         num_cpu_blocks = 0
 
@@ -207,8 +247,11 @@ class EngineCore:
         self.model_executor.initialize_from_config(kv_cache_configs)
 
         elapsed = time.time() - start
-        logger.info(("init engine (profile, create kv cache, "
-                     "warmup model) took %.2f seconds"), elapsed)
+        logger.info_once(
+            "init engine (profile, create kv cache, warmup model) took %.2f seconds",
+            elapsed,
+            scope="local",
+        )
         return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
 
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
@@ -223,22 +266,27 @@ class EngineCore:
         # Validate the request_id type.
         if not isinstance(request.request_id, str):
             raise TypeError(
-                f"request_id must be a string, got {type(request.request_id)}")
+                f"request_id must be a string, got {type(request.request_id)}"
+            )
 
         if pooling_params := request.pooling_params:
             supported_pooling_tasks = [
-                task for task in self.get_supported_tasks()
-                if task in POOLING_TASKS
+                task for task in self.get_supported_tasks() if task in POOLING_TASKS
             ]
 
             if pooling_params.task not in supported_pooling_tasks:
-                raise ValueError(f"Unsupported task: {pooling_params.task!r} "
-                                 f"Supported tasks: {supported_pooling_tasks}")
+                raise ValueError(
+                    f"Unsupported task: {pooling_params.task!r} "
+                    f"Supported tasks: {supported_pooling_tasks}"
+                )
 
         if request.kv_transfer_params is not None and (
-                not self.scheduler.get_kv_connector()):
-            logger.warning("Got kv_transfer_params, but no KVConnector found. "
-                           "Disabling KVTransfer for this request.")
+            not self.scheduler.get_kv_connector()
+        ):
+            logger.warning(
+                "Got kv_transfer_params, but no KVConnector found. "
+                "Disabling KVTransfer for this request."
+            )
 
         self.scheduler.add_request(request)
 
@@ -248,27 +296,34 @@ class EngineCore:
         # TODO: The scheduler doesn't really need to know the
         # specific finish reason, TBD whether we propagate that
         # (i.e. client-aborted vs stop criteria met).
-        self.scheduler.finish_requests(request_ids,
-                                       RequestStatus.FINISHED_ABORTED)
+        self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED)
 
-    def execute_model_with_error_logging(
-        self,
-        model_fn: Callable[[SchedulerOutput], ModelRunnerOutput],
-        scheduler_output: SchedulerOutput,
-    ) -> ModelRunnerOutput:
+    @contextmanager
+    def log_error_detail(self, scheduler_output: SchedulerOutput):
         """Execute the model and log detailed info on failure."""
         try:
-            return model_fn(scheduler_output)
+            yield
         except Exception as err:
             # We do not want to catch BaseException here since we're only
             # interested in dumping info when the exception is due to an
             # error from execute_model itself.
 
             # NOTE: This method is exception-free
-            dump_engine_exception(self.vllm_config, scheduler_output,
-                                  self.scheduler.make_stats())
+            dump_engine_exception(
+                self.vllm_config, scheduler_output, self.scheduler.make_stats()
+            )
             raise err
 
+    def _log_err_callback(self, scheduler_output: SchedulerOutput):
+        """Log error details of a future that's not expected to return a result."""
+
+        def callback(f, sched_output=scheduler_output):
+            with self.log_error_detail(sched_output):
+                result = f.result()
+                assert result is None
+
+        return callback
+
     def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         """Schedule, execute, and make output.
 
@@ -281,24 +336,32 @@ class EngineCore:
         if not self.scheduler.has_requests():
             return {}, False
         scheduler_output = self.scheduler.schedule()
-        model_output = self.execute_model_with_error_logging(
-            self.model_executor.execute_model,  # type: ignore
-            scheduler_output)
+        future = self.model_executor.execute_model(scheduler_output, non_block=True)
+        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
+        with self.log_error_detail(scheduler_output):
+            model_output = future.result()
+            if model_output is None:
+                model_output = self.model_executor.sample_tokens(grammar_output)
+
         engine_core_outputs = self.scheduler.update_from_output(
-            scheduler_output, model_output)  # type: ignore
+            scheduler_output, model_output
+        )
 
-        return (engine_core_outputs,
-                scheduler_output.total_num_scheduled_tokens > 0)
+        return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0
 
     def post_step(self, model_executed: bool) -> None:
-        if self.use_spec_decode and model_executed:
+        # When using async scheduling we can't get draft token ids in advance,
+        # so we update draft token ids in the worker process and don't
+        # need to update draft token ids here.
+        if not self.async_scheduling and self.use_spec_decode and model_executed:
             # Take the draft token ids.
             draft_token_ids = self.model_executor.take_draft_token_ids()
             if draft_token_ids is not None:
                 self.scheduler.update_draft_token_ids(draft_token_ids)
 
     def step_with_batch_queue(
-            self) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]:
+        self,
+    ) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
         """Schedule and execute batches with the batch queue.
         Note that if nothing to output in this step, None is returned.
 
@@ -321,19 +384,46 @@ class EngineCore:
         assert len(batch_queue) < self.batch_queue_size
 
         model_executed = False
+        deferred_scheduler_output = None
         if self.scheduler.has_requests():
             scheduler_output = self.scheduler.schedule()
-            future = self.model_executor.execute_model(scheduler_output,
-                                                       non_block=True)
-            batch_queue.appendleft(
-                (future, scheduler_output))  # type: ignore[arg-type]
-
-            model_executed = scheduler_output.total_num_scheduled_tokens > 0
-            if model_executed and len(batch_queue) < self.batch_queue_size \
-                and not batch_queue[-1][0].done():
-                # Don't block on next worker response unless the queue is full
-                # or there are no more requests to schedule.
-                return None, True
+            exec_future = self.model_executor.execute_model(
+                scheduler_output, non_block=True
+            )
+            if not self.ec_producer:
+                model_executed = scheduler_output.total_num_scheduled_tokens > 0
+
+            if self.is_pooling_model or not model_executed:
+                # No sampling required (no requests scheduled).
+                future = cast(Future[ModelRunnerOutput], exec_future)
+            else:
+                exec_future.add_done_callback(self._log_err_callback(scheduler_output))
+
+                if not scheduler_output.pending_structured_output_tokens:
+                    # We aren't waiting for any tokens, get any grammar output
+                    # and sample immediately.
+                    grammar_output = self.scheduler.get_grammar_bitmask(
+                        scheduler_output
+                    )
+                    future = self.model_executor.sample_tokens(
+                        grammar_output, non_block=True
+                    )
+                else:
+                    # We need to defer sampling until we have processed the model output
+                    # from the prior step.
+                    deferred_scheduler_output = scheduler_output
+
+            if not deferred_scheduler_output:
+                # Add this step's future to the queue.
+                batch_queue.appendleft((future, scheduler_output))
+                if (
+                    model_executed
+                    and len(batch_queue) < self.batch_queue_size
+                    and not batch_queue[-1][0].done()
+                ):
+                    # Don't block on next worker response unless the queue is full
+                    # or there are no more requests to schedule.
+                    return None, True
 
         elif not batch_queue:
             # Queue is empty. We should not reach here since this method should
@@ -343,11 +433,24 @@ class EngineCore:
 
         # Block until the next result is available.
         future, scheduler_output = batch_queue.pop()
-        model_output = self.execute_model_with_error_logging(
-            lambda _: future.result(), scheduler_output)
+        with self.log_error_detail(scheduler_output):
+            model_output = future.result()
 
         engine_core_outputs = self.scheduler.update_from_output(
-            scheduler_output, model_output)
+            scheduler_output, model_output
+        )
+
+        # NOTE(nick): We can either handle the deferred tasks here or save
+        # in a field and do it immediately once step_with_batch_queue is
+        # re-called. The latter slightly favors TTFT over TPOT/throughput.
+        if deferred_scheduler_output:
+            # We now have the tokens needed to compute the bitmask for the
+            # deferred request. Get the bitmask and call sample tokens.
+            grammar_output = self.scheduler.get_grammar_bitmask(
+                deferred_scheduler_output
+            )
+            future = self.model_executor.sample_tokens(grammar_output, non_block=True)
+            batch_queue.appendleft((future, deferred_scheduler_output))
 
         return engine_core_outputs, model_executed
 
@@ -363,21 +466,26 @@ class EngineCore:
 
     def reset_mm_cache(self):
         # NOTE: Since this is mainly for debugging, we don't attempt to
-        # re-sync the internal caches (P0 processor, P0 mirror, P1 mirror)
+        # re-sync the internal caches (P0 sender, P1 receiver)
         if self.scheduler.has_unfinished_requests():
-            logger.warning("Resetting the multi-modal cache when requests are "
-                           "in progress may lead to desynced internal caches.")
+            logger.warning(
+                "Resetting the multi-modal cache when requests are "
+                "in progress may lead to desynced internal caches."
+            )
 
+        # The cache either exists in EngineCore or WorkerWrapperBase
         if self.mm_receiver_cache is not None:
             self.mm_receiver_cache.clear_cache()
 
+        self.model_executor.reset_mm_cache()
+
     def reset_prefix_cache(self):
         self.scheduler.reset_prefix_cache()
 
     def sleep(self, level: int = 1):
         self.model_executor.sleep(level)
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+    def wake_up(self, tags: list[str] | None = None):
         self.model_executor.wake_up(tags)
 
     def is_sleeping(self) -> bool:
@@ -401,30 +509,23 @@ class EngineCore:
     def save_sharded_state(
         self,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
-        self.model_executor.save_sharded_state(path=path,
-                                               pattern=pattern,
-                                               max_size=max_size)
-
-    def collective_rpc(self,
-                       method: Union[str, Callable[..., _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
-        return self.model_executor.collective_rpc(method, timeout, args,
-                                                  kwargs)
-
-    def save_tensorized_model(
-        self,
-        tensorizer_config,
-    ) -> None:
-        self.model_executor.save_tensorized_model(
-            tensorizer_config=tensorizer_config, )
+        self.model_executor.save_sharded_state(
+            path=path, pattern=pattern, max_size=max_size
+        )
 
-    def preprocess_add_request(
-            self, request: EngineCoreRequest) -> tuple[Request, int]:
+    def collective_rpc(
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
+        return self.model_executor.collective_rpc(method, timeout, args, kwargs)
+
+    def preprocess_add_request(self, request: EngineCoreRequest) -> tuple[Request, int]:
         """Preprocess the request.
 
         This function could be directly used in input processing thread to allow
@@ -434,12 +535,11 @@ class EngineCore:
         # `mm_receiver_cache` is reset at the end of LLMEngine init,
         # and will only be accessed in the input processing thread afterwards.
         if self.mm_receiver_cache is not None and request.mm_features:
-            request.mm_features = (
-                self.mm_receiver_cache.get_and_update_features(
-                    request.mm_features))
+            request.mm_features = self.mm_receiver_cache.get_and_update_features(
+                request.mm_features
+            )
 
-        req = Request.from_engine_core_request(request,
-                                               self.request_block_hasher)
+        req = Request.from_engine_core_request(request, self.request_block_hasher)
         if req.use_structured_output:
             # Note on thread safety: no race condition.
             # `grammar_init` is only invoked in input processing thread. For
@@ -453,7 +553,7 @@ class EngineCore:
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
 
-    ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD'
+    ENGINE_CORE_DEAD = b"ENGINE_CORE_DEAD"
 
     def __init__(
         self,
@@ -462,41 +562,50 @@ class EngineCoreProc(EngineCore):
         handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
-        client_handshake_address: Optional[str] = None,
+        client_handshake_address: str | None = None,
         engine_index: int = 0,
     ):
         self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
-        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
-                                              bytes]]()
+        self.output_queue = queue.Queue[tuple[int, EngineCoreOutputs] | bytes]()
         executor_fail_callback = lambda: self.input_queue.put_nowait(
-            (EngineCoreRequestType.EXECUTOR_FAILED, b''))
+            (EngineCoreRequestType.EXECUTOR_FAILED, b"")
+        )
 
         self.engine_index = engine_index
         identity = self.engine_index.to_bytes(length=2, byteorder="little")
         self.engines_running = False
 
-        with self._perform_handshakes(handshake_address, identity,
-                                      local_client, vllm_config,
-                                      client_handshake_address) as addresses:
+        with self._perform_handshakes(
+            handshake_address,
+            identity,
+            local_client,
+            vllm_config,
+            client_handshake_address,
+        ) as addresses:
             self.client_count = len(addresses.outputs)
 
             # Set up data parallel environment.
             self.has_coordinator = addresses.coordinator_output is not None
             self.frontend_stats_publish_address = (
-                addresses.frontend_stats_publish_address)
-            logger.debug("Has DP Coordinator: %s, stats publish address: %s",
-                         self.has_coordinator,
-                         self.frontend_stats_publish_address)
+                addresses.frontend_stats_publish_address
+            )
+            logger.debug(
+                "Has DP Coordinator: %s, stats publish address: %s",
+                self.has_coordinator,
+                self.frontend_stats_publish_address,
+            )
             # Only publish request queue stats to coordinator for "internal"
             # and "hybrid" LB modes .
             self.publish_dp_lb_stats = (
                 self.has_coordinator
-                and not vllm_config.parallel_config.data_parallel_external_lb)
+                and not vllm_config.parallel_config.data_parallel_external_lb
+            )
 
             self._init_data_parallel(vllm_config)
 
-            super().__init__(vllm_config, executor_class, log_stats,
-                             executor_fail_callback)
+            super().__init__(
+                vllm_config, executor_class, log_stats, executor_fail_callback
+            )
 
             # Background Threads and Queues for IO. These enable us to
             # overlap ZMQ socket IO with GPU since they release the GIL,
@@ -504,33 +613,43 @@ class EngineCoreProc(EngineCore):
             # model forward pass.
             # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
             ready_event = threading.Event()
-            input_thread = threading.Thread(target=self.process_input_sockets,
-                                            args=(addresses.inputs,
-                                                  addresses.coordinator_input,
-                                                  identity, ready_event),
-                                            daemon=True)
+            input_thread = threading.Thread(
+                target=self.process_input_sockets,
+                args=(
+                    addresses.inputs,
+                    addresses.coordinator_input,
+                    identity,
+                    ready_event,
+                ),
+                daemon=True,
+            )
             input_thread.start()
 
             self.output_thread = threading.Thread(
                 target=self.process_output_sockets,
-                args=(addresses.outputs, addresses.coordinator_output,
-                      self.engine_index),
-                daemon=True)
+                args=(
+                    addresses.outputs,
+                    addresses.coordinator_output,
+                    self.engine_index,
+                ),
+                daemon=True,
+            )
             self.output_thread.start()
 
             # Don't complete handshake until DP coordinator ready message is
             # received.
             while not ready_event.wait(timeout=10):
                 if not input_thread.is_alive():
-                    raise RuntimeError(
-                        "Input socket thread died during startup")
+                    raise RuntimeError("Input socket thread died during startup")
                 assert addresses.coordinator_input is not None
                 logger.info("Waiting for READY message from DP Coordinator...")
 
-        # Mark the startup heap as static so that it's ignored by GC.
-        # Reduces pause times of oldest generation collections.
-        gc.collect()
-        gc.freeze()
+        # If enable, attach GC debugger after static variable freeze.
+        maybe_attach_gc_debug_callback()
+
+        # Enable environment variable cache (e.g. assume no more
+        # environment variable overrides after this point)
+        enable_envs_cache()
 
     @contextmanager
     def _perform_handshakes(
@@ -539,7 +658,7 @@ class EngineCoreProc(EngineCore):
         identity: bytes,
         local_client: bool,
         vllm_config: VllmConfig,
-        client_handshake_address: Optional[str],
+        client_handshake_address: str | None,
     ) -> Generator[EngineZmqAddresses, None, None]:
         """
         Perform startup handshakes.
@@ -566,18 +685,23 @@ class EngineCoreProc(EngineCore):
         input_ctx = zmq.Context()
         is_local = local_client and client_handshake_address is None
         headless = not local_client
-        handshake = self._perform_handshake(input_ctx, handshake_address,
-                                            identity, is_local, headless,
-                                            vllm_config,
-                                            vllm_config.parallel_config)
+        handshake = self._perform_handshake(
+            input_ctx,
+            handshake_address,
+            identity,
+            is_local,
+            headless,
+            vllm_config,
+            vllm_config.parallel_config,
+        )
         if client_handshake_address is None:
             with handshake as addresses:
                 yield addresses
         else:
             assert local_client
             local_handshake = self._perform_handshake(
-                input_ctx, client_handshake_address, identity, True, False,
-                vllm_config)
+                input_ctx, client_handshake_address, identity, True, False, vllm_config
+            )
             with handshake as addresses, local_handshake as client_addresses:
                 addresses.inputs = client_addresses.inputs
                 addresses.outputs = client_addresses.outputs
@@ -595,18 +719,20 @@ class EngineCoreProc(EngineCore):
         local_client: bool,
         headless: bool,
         vllm_config: VllmConfig,
-        parallel_config_to_update: Optional[ParallelConfig] = None,
+        parallel_config_to_update: ParallelConfig | None = None,
     ) -> Generator[EngineZmqAddresses, None, None]:
-        with make_zmq_socket(ctx,
-                             handshake_address,
-                             zmq.DEALER,
-                             identity=identity,
-                             linger=5000,
-                             bind=False) as handshake_socket:
+        with make_zmq_socket(
+            ctx,
+            handshake_address,
+            zmq.DEALER,
+            identity=identity,
+            linger=5000,
+            bind=False,
+        ) as handshake_socket:
             # Register engine with front-end.
-            addresses = self.startup_handshake(handshake_socket, local_client,
-                                               headless,
-                                               parallel_config_to_update)
+            addresses = self.startup_handshake(
+                handshake_socket, local_client, headless, parallel_config_to_update
+            )
             yield addresses
 
             # Send ready message.
@@ -615,40 +741,52 @@ class EngineCoreProc(EngineCore):
             # external LB case for our colocated front-end to use (coordinator
             # only runs with rank 0).
             dp_stats_address = self.frontend_stats_publish_address
-            handshake_socket.send(
-                msgspec.msgpack.encode({
-                    "status": "READY",
-                    "local": local_client,
-                    "headless": headless,
-                    "num_gpu_blocks": num_gpu_blocks,
-                    "dp_stats_address": dp_stats_address,
-                }))
+
+            # Include config hash for DP configuration validation
+            ready_msg = {
+                "status": "READY",
+                "local": local_client,
+                "headless": headless,
+                "num_gpu_blocks": num_gpu_blocks,
+                "dp_stats_address": dp_stats_address,
+            }
+            if vllm_config.parallel_config.data_parallel_size > 1:
+                ready_msg["parallel_config_hash"] = (
+                    vllm_config.parallel_config.compute_hash()
+                )
+
+            handshake_socket.send(msgspec.msgpack.encode(ready_msg))
 
     @staticmethod
     def startup_handshake(
         handshake_socket: zmq.Socket,
         local_client: bool,
         headless: bool,
-        parallel_config: Optional[ParallelConfig] = None,
+        parallel_config: ParallelConfig | None = None,
     ) -> EngineZmqAddresses:
-
         # Send registration message.
         handshake_socket.send(
-            msgspec.msgpack.encode({
-                "status": "HELLO",
-                "local": local_client,
-                "headless": headless,
-            }))
+            msgspec.msgpack.encode(
+                {
+                    "status": "HELLO",
+                    "local": local_client,
+                    "headless": headless,
+                }
+            )
+        )
 
         # Receive initialization message.
-        logger.info("Waiting for init message from front-end.")
+        logger.debug("Waiting for init message from front-end.")
         if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000):
-            raise RuntimeError("Did not receive response from front-end "
-                               f"process within {HANDSHAKE_TIMEOUT_MINS} "
-                               f"minutes")
+            raise RuntimeError(
+                "Did not receive response from front-end "
+                f"process within {HANDSHAKE_TIMEOUT_MINS} "
+                f"minutes"
+            )
         init_bytes = handshake_socket.recv()
         init_message: EngineHandshakeMetadata = msgspec.msgpack.decode(
-            init_bytes, type=EngineHandshakeMetadata)
+            init_bytes, type=EngineHandshakeMetadata
+        )
         logger.debug("Received init message: %s", init_message)
 
         if parallel_config is not None:
@@ -658,10 +796,7 @@ class EngineCoreProc(EngineCore):
         return init_message.addresses
 
     @staticmethod
-    def run_engine_core(*args,
-                        dp_rank: int = 0,
-                        local_dp_rank: int = 0,
-                        **kwargs):
+    def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
         """Launch EngineCore busy loop in background process."""
 
         # Signal handler used for graceful termination.
@@ -682,10 +817,9 @@ class EngineCoreProc(EngineCore):
         signal.signal(signal.SIGTERM, signal_handler)
         signal.signal(signal.SIGINT, signal_handler)
 
-        engine_core: Optional[EngineCoreProc] = None
+        engine_core: EngineCoreProc | None = None
         try:
-            parallel_config: ParallelConfig = kwargs[
-                "vllm_config"].parallel_config
+            parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
             if parallel_config.data_parallel_size > 1 or dp_rank > 0:
                 set_process_title("EngineCore", f"DP{dp_rank}")
                 decorate_logs()
@@ -731,8 +865,11 @@ class EngineCoreProc(EngineCore):
         """Exits when an engine step needs to be performed."""
 
         waited = False
-        while not self.engines_running and not self.scheduler.has_requests() \
-                and not self.batch_queue:
+        while (
+            not self.engines_running
+            and not self.scheduler.has_requests()
+            and not self.batch_queue
+        ):
             if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                 logger.debug("EngineCore waiting for work.")
                 waited = True
@@ -753,15 +890,16 @@ class EngineCoreProc(EngineCore):
         # Step the engine core.
         outputs, model_executed = self.step_fn()
         # Put EngineCoreOutputs into the output queue.
-        for output in (outputs.items() if outputs else ()):
+        for output in outputs.items() if outputs else ():
             self.output_queue.put_nowait(output)
         # Post-step hook.
         self.post_step(model_executed)
 
         return model_executed
 
-    def _handle_client_request(self, request_type: EngineCoreRequestType,
-                               request: Any) -> None:
+    def _handle_client_request(
+        self, request_type: EngineCoreRequestType, request: Any
+    ) -> None:
         """Dispatch request from client."""
 
         if request_type == EngineCoreRequestType.ADD:
@@ -778,29 +916,35 @@ class EngineCoreProc(EngineCore):
                 output.result = UtilityResult(result)
             except BaseException as e:
                 logger.exception("Invocation of %s method failed", method_name)
-                output.failure_message = (f"Call to {method_name} method"
-                                          f" failed: {str(e)}")
+                output.failure_message = (
+                    f"Call to {method_name} method failed: {str(e)}"
+                )
             self.output_queue.put_nowait(
-                (client_idx, EngineCoreOutputs(utility_output=output)))
+                (client_idx, EngineCoreOutputs(utility_output=output))
+            )
         elif request_type == EngineCoreRequestType.EXECUTOR_FAILED:
             raise RuntimeError("Executor failed.")
         else:
-            logger.error("Unrecognized input request type encountered: %s",
-                         request_type)
+            logger.error(
+                "Unrecognized input request type encountered: %s", request_type
+            )
 
     @staticmethod
     def _convert_msgspec_args(method, args):
         """If a provided arg type doesn't match corresponding target method
-         arg type, try converting to msgspec object."""
+        arg type, try converting to msgspec object."""
         if not args:
             return args
         arg_types = signature(method).parameters.values()
         assert len(args) <= len(arg_types)
         return tuple(
-            msgspec.convert(v, type=p.annotation) if isclass(p.annotation)
+            msgspec.convert(v, type=p.annotation)
+            if isclass(p.annotation)
             and issubclass(p.annotation, msgspec.Struct)
-            and not isinstance(v, p.annotation) else v
-            for v, p in zip(args, arg_types))
+            and not isinstance(v, p.annotation)
+            else v
+            for v, p in zip(args, arg_types)
+        )
 
     def _send_engine_dead(self):
         """Send EngineDead status to the EngineCoreClient."""
@@ -811,12 +955,18 @@ class EngineCoreProc(EngineCore):
         # Wait until msg sent by the daemon before shutdown.
         self.output_thread.join(timeout=5.0)
         if self.output_thread.is_alive():
-            logger.fatal("vLLM shutdown signal from EngineCore failed "
-                         "to send. Please report this issue.")
+            logger.fatal(
+                "vLLM shutdown signal from EngineCore failed "
+                "to send. Please report this issue."
+            )
 
-    def process_input_sockets(self, input_addresses: list[str],
-                              coord_input_address: Optional[str],
-                              identity: bytes, ready_event: threading.Event):
+    def process_input_sockets(
+        self,
+        input_addresses: list[str],
+        coord_input_address: str | None,
+        identity: bytes,
+        ready_event: threading.Event,
+    ):
         """Input socket IO thread."""
 
         # Msgpack serialization decoding.
@@ -826,24 +976,26 @@ class EngineCoreProc(EngineCore):
         with ExitStack() as stack, zmq.Context() as ctx:
             input_sockets = [
                 stack.enter_context(
-                    make_zmq_socket(ctx,
-                                    input_address,
-                                    zmq.DEALER,
-                                    identity=identity,
-                                    bind=False))
+                    make_zmq_socket(
+                        ctx, input_address, zmq.DEALER, identity=identity, bind=False
+                    )
+                )
                 for input_address in input_addresses
             ]
             if coord_input_address is None:
                 coord_socket = None
             else:
                 coord_socket = stack.enter_context(
-                    make_zmq_socket(ctx,
-                                    coord_input_address,
-                                    zmq.XSUB,
-                                    identity=identity,
-                                    bind=False))
+                    make_zmq_socket(
+                        ctx,
+                        coord_input_address,
+                        zmq.XSUB,
+                        identity=identity,
+                        bind=False,
+                    )
+                )
                 # Send subscription message to coordinator.
-                coord_socket.send(b'\x01')
+                coord_socket.send(b"\x01")
 
             # Register sockets with poller.
             poller = zmq.Poller()
@@ -851,7 +1003,7 @@ class EngineCoreProc(EngineCore):
                 # Send initial message to each input socket - this is required
                 # before the front-end ROUTER socket can send input messages
                 # back to us.
-                input_socket.send(b'')
+                input_socket.send(b"")
                 poller.register(input_socket, zmq.POLLIN)
 
             if coord_socket is not None:
@@ -864,10 +1016,8 @@ class EngineCoreProc(EngineCore):
             while True:
                 for input_socket, _ in poller.poll():
                     # (RequestType, RequestData)
-                    type_frame, *data_frames = input_socket.recv_multipart(
-                        copy=False)
-                    request_type = EngineCoreRequestType(
-                        bytes(type_frame.buffer))
+                    type_frame, *data_frames = input_socket.recv_multipart(copy=False)
+                    request_type = EngineCoreRequestType(bytes(type_frame.buffer))
 
                     # Deserialize the request data.
                     if request_type == EngineCoreRequestType.ADD:
@@ -879,9 +1029,12 @@ class EngineCoreProc(EngineCore):
                     # Push to input queue for core busy loop.
                     self.input_queue.put_nowait((request_type, request))
 
-    def process_output_sockets(self, output_paths: list[str],
-                               coord_output_path: Optional[str],
-                               engine_index: int):
+    def process_output_sockets(
+        self,
+        output_paths: list[str],
+        coord_output_path: str | None,
+        engine_index: int,
+    ):
         """Output socket IO thread."""
 
         # Msgpack serialization encoding.
@@ -898,13 +1051,19 @@ class EngineCoreProc(EngineCore):
         with ExitStack() as stack, zmq.Context() as ctx:
             sockets = [
                 stack.enter_context(
-                    make_zmq_socket(ctx, output_path, zmq.PUSH, linger=4000))
+                    make_zmq_socket(ctx, output_path, zmq.PUSH, linger=4000)
+                )
                 for output_path in output_paths
             ]
-            coord_socket = stack.enter_context(
-                make_zmq_socket(
-                    ctx, coord_output_path, zmq.PUSH, bind=False,
-                    linger=4000)) if coord_output_path is not None else None
+            coord_socket = (
+                stack.enter_context(
+                    make_zmq_socket(
+                        ctx, coord_output_path, zmq.PUSH, bind=False, linger=4000
+                    )
+                )
+                if coord_output_path is not None
+                else None
+            )
             max_reuse_bufs = len(sockets) + 1
 
             while True:
@@ -930,9 +1089,9 @@ class EngineCoreProc(EngineCore):
 
                 buffer = reuse_buffers.pop() if reuse_buffers else bytearray()
                 buffers = encoder.encode_into(outputs, buffer)
-                tracker = sockets[client_index].send_multipart(buffers,
-                                                               copy=False,
-                                                               track=True)
+                tracker = sockets[client_index].send_multipart(
+                    buffers, copy=False, track=True
+                )
                 if not tracker.done:
                     ref = outputs if len(buffers) > 1 else None
                     pending.appendleft((tracker, ref, buffer))
@@ -952,7 +1111,7 @@ class DPEngineCoreProc(EngineCoreProc):
         handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
-        client_handshake_address: Optional[str] = None,
+        client_handshake_address: str | None = None,
     ):
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
@@ -962,18 +1121,24 @@ class DPEngineCoreProc(EngineCoreProc):
 
         # Initialize the engine.
         dp_rank = vllm_config.parallel_config.data_parallel_rank
-        super().__init__(vllm_config, local_client, handshake_address,
-                         executor_class, log_stats, client_handshake_address,
-                         dp_rank)
+        super().__init__(
+            vllm_config,
+            local_client,
+            handshake_address,
+            executor_class,
+            log_stats,
+            client_handshake_address,
+            dp_rank,
+        )
 
     def _init_data_parallel(self, vllm_config: VllmConfig):
-
         # Configure GPUs and stateless process group for data parallel.
         dp_rank = vllm_config.parallel_config.data_parallel_rank
         dp_size = vllm_config.parallel_config.data_parallel_size
         local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
 
         assert dp_size > 1
+        assert local_dp_rank is not None
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
         if vllm_config.kv_transfer_config is not None:
@@ -982,8 +1147,10 @@ class DPEngineCoreProc(EngineCoreProc):
             vllm_config.kv_transfer_config.engine_id = (
                 f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
             )
-            logger.debug("Setting kv_transfer_config.engine_id to %s",
-                         vllm_config.kv_transfer_config.engine_id)
+            logger.debug(
+                "Setting kv_transfer_config.engine_id to %s",
+                vllm_config.kv_transfer_config.engine_id,
+            )
 
         self.dp_rank = dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
@@ -1001,20 +1168,22 @@ class DPEngineCoreProc(EngineCoreProc):
                 # Request received for an already-completed wave, notify
                 # front-end that we need to start the next one.
                 self.output_queue.put_nowait(
-                    (-1, EngineCoreOutputs(start_wave=self.current_wave)))
+                    (-1, EngineCoreOutputs(start_wave=self.current_wave))
+                )
 
         super().add_request(request, request_wave)
 
-    def _handle_client_request(self, request_type: EngineCoreRequestType,
-                               request: Any) -> None:
+    def _handle_client_request(
+        self, request_type: EngineCoreRequestType, request: Any
+    ) -> None:
         if request_type == EngineCoreRequestType.START_DP_WAVE:
             new_wave, exclude_eng_index = request
             if exclude_eng_index != self.engine_index and (
-                    new_wave >= self.current_wave):
+                new_wave >= self.current_wave
+            ):
                 self.current_wave = new_wave
                 if not self.engines_running:
-                    logger.debug("EngineCore starting idle loop for wave %d.",
-                                 new_wave)
+                    logger.debug("EngineCore starting idle loop for wave %d.", new_wave)
                     self.engines_running = True
         else:
             super()._handle_client_request(request_type, request)
@@ -1027,11 +1196,10 @@ class DPEngineCoreProc(EngineCoreProc):
         counts = self.scheduler.get_request_counts()
         if counts != self.last_counts:
             self.last_counts = counts
-            stats = SchedulerStats(*counts,
-                                   step_counter=self.step_counter,
-                                   current_wave=self.current_wave)
-            self.output_queue.put_nowait(
-                (-1, EngineCoreOutputs(scheduler_stats=stats)))
+            stats = SchedulerStats(
+                *counts, step_counter=self.step_counter, current_wave=self.current_wave
+            )
+            self.output_queue.put_nowait((-1, EngineCoreOutputs(scheduler_stats=stats)))
 
     def run_busy_loop(self):
         """Core busy loop of the EngineCore for data parallel case."""
@@ -1057,58 +1225,65 @@ class DPEngineCoreProc(EngineCoreProc):
 
             # 3) All-reduce operation to determine global unfinished reqs.
             self.engines_running = self._has_global_unfinished_reqs(
-                local_unfinished_reqs)
+                local_unfinished_reqs
+            )
 
             if not self.engines_running:
                 if self.dp_rank == 0 or not self.has_coordinator:
                     # Notify client that we are pausing the loop.
-                    logger.debug("Wave %d finished, pausing engine loop.",
-                                 self.current_wave)
+                    logger.debug(
+                        "Wave %d finished, pausing engine loop.", self.current_wave
+                    )
                     # In the coordinator case, dp rank 0 sends updates to the
                     # coordinator. Otherwise (offline spmd case), each rank
                     # sends the update to its colocated front-end process.
                     client_index = -1 if self.has_coordinator else 0
                     self.output_queue.put_nowait(
-                        (client_index,
-                         EngineCoreOutputs(wave_complete=self.current_wave)))
+                        (
+                            client_index,
+                            EngineCoreOutputs(wave_complete=self.current_wave),
+                        )
+                    )
                 # Increment wave count and reset step counter.
                 self.current_wave += 1
                 self.step_counter = 0
 
     def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
-
         # Optimization - only perform finish-sync all-reduce every 32 steps.
         self.step_counter += 1
         if self.step_counter % 32 != 0:
             return True
 
-        return ParallelConfig.has_unfinished_dp(self.dp_group,
-                                                local_unfinished)
+        return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished)
 
     def reinitialize_distributed(
-            self, reconfig_request: ReconfigureDistributedRequest) -> None:
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
         stateless_destroy_torch_distributed_process_group(self.dp_group)
         self.shutdown()
 
         parallel_config = self.vllm_config.parallel_config
         old_dp_size = parallel_config.data_parallel_size
-        parallel_config.data_parallel_size = \
-            reconfig_request.new_data_parallel_size
+        parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
         if reconfig_request.new_data_parallel_rank != -1:
-            parallel_config.data_parallel_rank = \
-                reconfig_request.new_data_parallel_rank
+            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
         # local rank specifies device visibility, it should not be changed
-        assert reconfig_request.new_data_parallel_rank_local == \
-            ReconfigureRankType.KEEP_CURRENT_RANK
-        parallel_config.data_parallel_master_ip = \
+        assert (
+            reconfig_request.new_data_parallel_rank_local
+            == ReconfigureRankType.KEEP_CURRENT_RANK
+        )
+        parallel_config.data_parallel_master_ip = (
             reconfig_request.new_data_parallel_master_ip
-        parallel_config.data_parallel_master_port = \
+        )
+        parallel_config.data_parallel_master_port = (
             reconfig_request.new_data_parallel_master_port
+        )
         if reconfig_request.new_data_parallel_rank != -2:
             self.dp_rank = parallel_config.data_parallel_rank
             self.dp_group = parallel_config.stateless_init_dp_group()
-        reconfig_request.new_data_parallel_master_port = \
+        reconfig_request.new_data_parallel_master_port = (
             parallel_config.data_parallel_master_port
+        )
 
         self.model_executor.reinitialize_distributed(reconfig_request)
         if reconfig_request.new_data_parallel_size > old_dp_size:
@@ -1117,17 +1292,21 @@ class DPEngineCoreProc(EngineCoreProc):
             # engine-cores to new engine-cores so they can directly
             # use it in _initialize_kv_caches() rather than profiling.
             ParallelConfig.sync_kv_cache_memory_size(
-                self.dp_group, self.available_gpu_memory_for_kv_cache)
+                self.dp_group, self.available_gpu_memory_for_kv_cache
+            )
             # NOTE(yongji): newly joined workers require dummy_run even
             # CUDA graph is not used
             self.model_executor.collective_rpc("compile_or_warm_up_model")
-        if reconfig_request.new_data_parallel_rank == \
-        ReconfigureRankType.SHUTDOWN_CURRENT_RANK:
+        if (
+            reconfig_request.new_data_parallel_rank
+            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
+        ):
             self.shutdown()
             logger.info("DPEngineCoreProc %s shutdown", self.dp_rank)
         else:
-            logger.info("Distributed environment reinitialized for DP rank %s",
-                        self.dp_rank)
+            logger.info(
+                "Distributed environment reinitialized for DP rank %s", self.dp_rank
+            )
 
 
 class DPEngineCoreActor(DPEngineCoreProc):
@@ -1147,8 +1326,7 @@ class DPEngineCoreActor(DPEngineCoreProc):
     ):
         self.addresses = addresses
         vllm_config.parallel_config.data_parallel_rank = dp_rank
-        vllm_config.parallel_config.data_parallel_rank_local = \
-            local_dp_rank
+        vllm_config.parallel_config.data_parallel_rank_local = local_dp_rank
 
         # Set CUDA_VISIBLE_DEVICES as early as possible in actor life cycle
         # NOTE: in MP we set CUDA_VISIBLE_DEVICES at process creation time,
@@ -1169,39 +1347,46 @@ class DPEngineCoreActor(DPEngineCoreProc):
         # of ray.
         self._set_visible_devices(vllm_config, local_dp_rank)
 
-        super().__init__(vllm_config, local_client, "", executor_class,
-                         log_stats)
+        super().__init__(vllm_config, local_client, "", executor_class, log_stats)
 
-    def _set_visible_devices(self, vllm_config: VllmConfig,
-                             local_dp_rank: int):
+    def _set_visible_devices(self, vllm_config: VllmConfig, local_dp_rank: int):
         from vllm.platforms import current_platform
+
         if current_platform.is_xpu():
             pass
         else:
             device_control_env_var = current_platform.device_control_env_var
-            self._set_cuda_visible_devices(vllm_config, local_dp_rank,
-                                           device_control_env_var)
+            self._set_cuda_visible_devices(
+                vllm_config, local_dp_rank, device_control_env_var
+            )
 
-    def _set_cuda_visible_devices(self, vllm_config: VllmConfig,
-                                  local_dp_rank: int,
-                                  device_control_env_var: str):
+    def _set_cuda_visible_devices(
+        self, vllm_config: VllmConfig, local_dp_rank: int, device_control_env_var: str
+    ):
         world_size = vllm_config.parallel_config.world_size
         # Set CUDA_VISIBLE_DEVICES or equivalent.
         try:
-            value = get_device_indices(device_control_env_var, local_dp_rank,
-                                       world_size)
+            value = get_device_indices(
+                device_control_env_var, local_dp_rank, world_size
+            )
             os.environ[device_control_env_var] = value
         except IndexError as e:
             raise Exception(
                 f"Error setting {device_control_env_var}: "
                 f"local range: [{local_dp_rank * world_size}, "
                 f"{(local_dp_rank + 1) * world_size}) "
-                f"base value: \"{os.getenv(device_control_env_var)}\"") from e
+                f'base value: "{os.getenv(device_control_env_var)}"'
+            ) from e
 
     @contextmanager
-    def _perform_handshakes(self, handshake_address: str, identity: bytes,
-                            local_client: bool, vllm_config: VllmConfig,
-                            client_handshake_address: Optional[str]):
+    def _perform_handshakes(
+        self,
+        handshake_address: str,
+        identity: bytes,
+        local_client: bool,
+        vllm_config: VllmConfig,
+        client_handshake_address: str | None,
+    ):
         """
         For Ray, we don't need to actually perform handshake.
         All addresses information is known before the actor creation.
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 76cd6f47f17912731e1de163dfdc2f17e043e6fe..5ec048f51b05fe96213e712de86b5a7d9826b734 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -10,11 +10,11 @@ import weakref
 from abc import ABC, abstractmethod
 
 from collections import defaultdict, deque
-from collections.abc import Awaitable, Sequence
+from collections.abc import Awaitable, Callable, Sequence
 from concurrent.futures import Future
 from dataclasses import dataclass
 from threading import Thread
-from typing import Any, Callable, Optional, TypeVar, Union
+from typing import Any, TypeAlias, TypeVar
 
 import msgspec.msgpack
 import zmq
@@ -24,32 +24,44 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.tasks import SupportedTask
-from vllm.utils import (close_sockets, get_open_port, get_open_zmq_inproc_path,
-                        in_loop, make_zmq_socket)
-from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
-                            EngineCoreRequestType,
-                            ReconfigureDistributedRequest, ReconfigureRankType,
-                            UtilityOutput)
+from vllm.utils.async_utils import in_loop
+from vllm.utils.network_utils import (
+    close_sockets,
+    get_open_port,
+    get_open_zmq_inproc_path,
+    make_zmq_socket,
+)
+from vllm.v1.engine import (
+    EngineCoreOutputs,
+    EngineCoreRequest,
+    EngineCoreRequestType,
+    ReconfigureDistributedRequest,
+    ReconfigureRankType,
+    UtilityOutput,
+)
 from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.engine.exceptions import EngineDeadError
-from vllm.v1.engine.utils import (CoreEngineActorManager,
-                                  CoreEngineProcManager, launch_core_engines)
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.engine.utils import (
+    CoreEngineActorManager,
+    CoreEngineProcManager,
+    launch_core_engines,
+)
+from vllm.v1.executor import Executor
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
 
 logger = init_logger(__name__)
 
-AnyFuture = Union[asyncio.Future[Any], Future[Any]]
+AnyFuture: TypeAlias = asyncio.Future[Any] | Future[Any]
 
-_R = TypeVar('_R')  # Return type for collective_rpc
+_R = TypeVar("_R")  # Return type for collective_rpc
 
 EngineIdentity = bytes
 
 
 class EngineCoreClient(ABC):
     """
-    EngineCoreClient: subclasses handle different methods for pushing 
+    EngineCoreClient: subclasses handle different methods for pushing
         and pulling from the EngineCore for asyncio / multiprocessing.
 
     Subclasses:
@@ -66,16 +78,17 @@ class EngineCoreClient(ABC):
         executor_class: type[Executor],
         log_stats: bool,
     ) -> "EngineCoreClient":
-
         # TODO: support this for debugging purposes.
         if asyncio_mode and not multiprocess_mode:
             raise NotImplementedError(
                 "Running EngineCore in asyncio without multiprocessing "
-                "is not currently supported.")
+                "is not currently supported."
+            )
 
         if multiprocess_mode and asyncio_mode:
             return EngineCoreClient.make_async_mp_client(
-                vllm_config, executor_class, log_stats)
+                vllm_config, executor_class, log_stats
+            )
 
         if multiprocess_mode and not asyncio_mode:
             return SyncMPClient(vllm_config, executor_class, log_stats)
@@ -87,13 +100,19 @@ class EngineCoreClient(ABC):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ) -> "MPClient":
         parallel_config = vllm_config.parallel_config
-        client_args = (vllm_config, executor_class, log_stats,
-                       client_addresses, client_count, client_index)
+        client_args = (
+            vllm_config,
+            executor_class,
+            log_stats,
+            client_addresses,
+            client_count,
+            client_index,
+        )
         if parallel_config.data_parallel_size > 1:
             if parallel_config.data_parallel_external_lb:
                 # External load balancer - client per DP rank.
@@ -103,8 +122,7 @@ class EngineCoreClient(ABC):
         return AsyncMPClient(*client_args)
 
     @abstractmethod
-    def shutdown(self):
-        ...
+    def shutdown(self): ...
 
     def get_output(self) -> EngineCoreOutputs:
         raise NotImplementedError
@@ -127,7 +145,7 @@ class EngineCoreClient(ABC):
     def sleep(self, level: int = 1) -> None:
         raise NotImplementedError
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         raise NotImplementedError
 
     def is_sleeping(self) -> bool:
@@ -154,17 +172,18 @@ class EngineCoreClient(ABC):
     def pin_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
-    def save_sharded_state(self,
-                           path: str,
-                           pattern: Optional[str] = None,
-                           max_size: Optional[int] = None) -> None:
+    def save_sharded_state(
+        self, path: str, pattern: str | None = None, max_size: int | None = None
+    ) -> None:
         raise NotImplementedError
 
-    def collective_rpc(self,
-                       method: Union[str, Callable[..., _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+    def collective_rpc(
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
         raise NotImplementedError
 
     def dp_engines_running(self) -> bool:
@@ -196,7 +215,7 @@ class EngineCoreClient(ABC):
     async def sleep_async(self, level: int = 1) -> None:
         raise NotImplementedError
 
-    async def wake_up_async(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up_async(self, tags: list[str] | None = None) -> None:
         raise NotImplementedError
 
     async def is_sleeping_async(self) -> bool:
@@ -217,24 +236,24 @@ class EngineCoreClient(ABC):
     async def pin_lora_async(self, lora_id: int) -> bool:
         raise NotImplementedError
 
-    async def save_sharded_state_async(self,
-                                       path: str,
-                                       pattern: Optional[str] = None,
-                                       max_size: Optional[int] = None) -> None:
+    async def save_sharded_state_async(
+        self, path: str, pattern: str | None = None, max_size: int | None = None
+    ) -> None:
         raise NotImplementedError
 
     async def collective_rpc_async(
-            self,
-            method: Union[str, Callable[..., _R]],
-            timeout: Optional[float] = None,
-            args: tuple = (),
-            kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
         raise NotImplementedError
 
 
 class InprocClient(EngineCoreClient):
     """
-    InprocClient: client for in-process EngineCore. Intended 
+    InprocClient: client for in-process EngineCore. Intended
     for use in LLMEngine for V0-style add_request() and step()
         EngineCore setup in this process (no busy loop).
 
@@ -275,7 +294,7 @@ class InprocClient(EngineCoreClient):
     def sleep(self, level: int = 1) -> None:
         self.engine_core.sleep(level)
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         self.engine_core.wake_up(tags)
 
     def is_sleeping(self) -> bool:
@@ -296,17 +315,18 @@ class InprocClient(EngineCoreClient):
     def pin_lora(self, lora_id: int) -> bool:
         return self.engine_core.pin_lora(lora_id)
 
-    def save_sharded_state(self,
-                           path: str,
-                           pattern: Optional[str] = None,
-                           max_size: Optional[int] = None) -> None:
+    def save_sharded_state(
+        self, path: str, pattern: str | None = None, max_size: int | None = None
+    ) -> None:
         self.engine_core.save_sharded_state(path, pattern, max_size)
 
-    def collective_rpc(self,
-                       method: Union[str, Callable[..., _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+    def collective_rpc(
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
     def dp_engines_running(self) -> bool:
@@ -321,17 +341,16 @@ class BackgroundResources:
     ctx: zmq.Context
     # If CoreEngineProcManager, it manages local engines;
     # if CoreEngineActorManager, it manages all engines.
-    engine_manager: Optional[Union[CoreEngineProcManager,
-                                   CoreEngineActorManager]] = None
-    coordinator: Optional[DPCoordinator] = None
-    output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
-    input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
-    first_req_send_socket: Optional[zmq.asyncio.Socket] = None
-    first_req_rcv_socket: Optional[zmq.asyncio.Socket] = None
-    stats_update_socket: Optional[zmq.asyncio.Socket] = None
-    output_queue_task: Optional[asyncio.Task] = None
-    stats_update_task: Optional[asyncio.Task] = None
-    shutdown_path: Optional[str] = None
+    engine_manager: CoreEngineProcManager | CoreEngineActorManager | None = None
+    coordinator: DPCoordinator | None = None
+    output_socket: zmq.Socket | zmq.asyncio.Socket | None = None
+    input_socket: zmq.Socket | zmq.asyncio.Socket | None = None
+    first_req_send_socket: zmq.asyncio.Socket | None = None
+    first_req_rcv_socket: zmq.asyncio.Socket | None = None
+    stats_update_socket: zmq.asyncio.Socket | None = None
+    output_queue_task: asyncio.Task | None = None
+    stats_update_task: asyncio.Task | None = None
+    shutdown_path: str | None = None
 
     # Set if any of the engines are dead. Here so that the output
     # processing threads can access it without holding a ref to the client.
@@ -348,12 +367,15 @@ class BackgroundResources:
 
         if isinstance(self.output_socket, zmq.asyncio.Socket):
             # Async case.
-            loop = self.output_queue_task._loop \
-                if self.output_queue_task else None
+            loop = self.output_queue_task._loop if self.output_queue_task else None
 
-            sockets = (self.output_socket, self.input_socket,
-                       self.first_req_send_socket, self.first_req_rcv_socket,
-                       self.stats_update_socket)
+            sockets = (
+                self.output_socket,
+                self.input_socket,
+                self.first_req_send_socket,
+                self.first_req_rcv_socket,
+                self.stats_update_socket,
+            )
 
             tasks = (self.output_queue_task, self.stats_update_task)
 
@@ -364,10 +386,11 @@ class BackgroundResources:
                         with contextlib.suppress(Exception):
                             task.cancel()
 
-            if in_loop(loop):
-                close_sockets_and_tasks()
-            elif loop and not loop.is_closed():
-                loop.call_soon_threadsafe(close_sockets_and_tasks)
+            if loop is not None:
+                if in_loop(loop):
+                    close_sockets_and_tasks()
+                elif not loop.is_closed():
+                    loop.call_soon_threadsafe(close_sockets_and_tasks)
             else:
                 # Loop has been closed, try to clean up directly.
                 del tasks
@@ -388,11 +411,10 @@ class BackgroundResources:
                 with self.ctx.socket(zmq.PAIR) as shutdown_sender:
                     shutdown_sender.connect(self.shutdown_path)
                     # Send shutdown signal.
-                    shutdown_sender.send(b'')
+                    shutdown_sender.send(b"")
 
     def validate_alive(self, frames: Sequence[zmq.Frame]):
-        if len(frames) == 1 and (frames[0].buffer
-                                 == EngineCoreProc.ENGINE_CORE_DEAD):
+        if len(frames) == 1 and (frames[0].buffer == EngineCoreProc.ENGINE_CORE_DEAD):
             self.engine_dead = True
             raise EngineDeadError()
 
@@ -405,7 +427,7 @@ class MPClient(EngineCoreClient):
 
         * pushes EngineCoreRequests via input_socket
         * pulls EngineCoreOutputs via output_socket
-    
+
         * AsyncMPClient subclass for AsyncLLM usage
         * SyncMPClient subclass for LLM usage
     """
@@ -416,7 +438,7 @@ class MPClient(EngineCoreClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
     ):
         self.vllm_config = vllm_config
         # Serialization setup.
@@ -437,35 +459,37 @@ class MPClient(EngineCoreClient):
             # State used for data parallel.
             self.engines_running = False
 
-            self.stats_update_address: Optional[str] = None
+            self.stats_update_address: str | None = None
             if client_addresses:
                 # Engines are managed externally to this client.
                 input_address = client_addresses["input_address"]
                 output_address = client_addresses["output_address"]
-                self.stats_update_address = client_addresses.get(
-                    "stats_update_address")
+                self.stats_update_address = client_addresses.get("stats_update_address")
             else:
                 # Engines are managed by this client.
-                with launch_core_engines(vllm_config, executor_class,
-                                         log_stats) as (engine_manager,
-                                                        coordinator,
-                                                        addresses):
+                with launch_core_engines(vllm_config, executor_class, log_stats) as (
+                    engine_manager,
+                    coordinator,
+                    addresses,
+                ):
                     self.resources.coordinator = coordinator
                     self.resources.engine_manager = engine_manager
 
-                (input_address, ) = addresses.inputs
-                (output_address, ) = addresses.outputs
-                self.stats_update_address = (
-                    addresses.frontend_stats_publish_address)
+                (input_address,) = addresses.inputs
+                (output_address,) = addresses.outputs
+                self.stats_update_address = addresses.frontend_stats_publish_address
                 if coordinator is not None:
                     assert self.stats_update_address == (
-                        coordinator.get_stats_publish_address())
+                        coordinator.get_stats_publish_address()
+                    )
 
             # Create input and output sockets.
             self.input_socket = self.resources.input_socket = make_zmq_socket(
-                self.ctx, input_address, zmq.ROUTER, bind=True)
+                self.ctx, input_address, zmq.ROUTER, bind=True
+            )
             self.resources.output_socket = make_zmq_socket(
-                self.ctx, output_address, zmq.PULL)
+                self.ctx, output_address, zmq.PULL
+            )
 
             parallel_config = vllm_config.parallel_config
             dp_size = parallel_config.data_parallel_size
@@ -474,19 +498,22 @@ class MPClient(EngineCoreClient):
             offline_mode = parallel_config.data_parallel_rank_local is not None
             # Client manages local+remote EngineCores in pure internal LB case.
             # Client manages local EngineCores in hybrid and external LB case.
-            local_engines_only = (parallel_config.data_parallel_hybrid_lb
-                                  or parallel_config.data_parallel_external_lb)
+            local_engines_only = (
+                parallel_config.data_parallel_hybrid_lb
+                or parallel_config.data_parallel_external_lb
+            )
 
             num_ranks = dp_local_size if local_engines_only else dp_size
-            self.engine_ranks_managed = [dp_rank] if offline_mode else list(
-                range(dp_rank, dp_rank + num_ranks))
+            self.engine_ranks_managed = (
+                [dp_rank] if offline_mode else list(range(dp_rank, dp_rank + num_ranks))
+            )
             assert parallel_config.data_parallel_size_local <= len(
-                self.engine_ranks_managed)
+                self.engine_ranks_managed
+            )
 
             # ZMQ identity of each engine that this client will talk to.
             self.core_engines: list[EngineIdentity] = [
-                rank.to_bytes(2, "little")
-                for rank in self.engine_ranks_managed
+                rank.to_bytes(2, "little") for rank in self.engine_ranks_managed
             ]
 
             # Wait for ready messages from each engine on the input socket.
@@ -494,8 +521,10 @@ class MPClient(EngineCoreClient):
             sync_input_socket = zmq.Socket.shadow(self.input_socket)
             while identities:
                 if not sync_input_socket.poll(timeout=600_000):
-                    raise TimeoutError("Timed out waiting for engines to send"
-                                       "initial message on input socket.")
+                    raise TimeoutError(
+                        "Timed out waiting for engines to send"
+                        "initial message on input socket."
+                    )
                 identity, _ = sync_input_socket.recv_multipart()
                 identities.remove(identity)
 
@@ -521,8 +550,9 @@ class MPClient(EngineCoreClient):
 
     def _format_exception(self, e: Exception) -> Exception:
         """If errored, use EngineDeadError so root cause is clear."""
-        return EngineDeadError(
-            suppress_context=True) if self.resources.engine_dead else e
+        return (
+            EngineDeadError(suppress_context=True) if self.resources.engine_dead else e
+        )
 
     def ensure_alive(self):
         if self.resources.engine_dead:
@@ -542,8 +572,11 @@ class MPClient(EngineCoreClient):
     def start_engine_core_monitor(self):
         """Start a monitor thread for engine core processes."""
         engine_manager = self.resources.engine_manager
-        if (engine_manager is None or not hasattr(engine_manager, 'processes')
-                or not engine_manager.processes):
+        if (
+            engine_manager is None
+            or not hasattr(engine_manager, "processes")
+            or not engine_manager.processes
+        ):
             # No engine processes to monitor
             return
 
@@ -560,23 +593,26 @@ class MPClient(EngineCoreClient):
             if not _self or _self.resources.engine_dead:
                 return
             _self.resources.engine_dead = True
-            proc_name = next(proc.name for proc in engine_processes
-                             if proc.sentinel == died[0])
+            proc_name = next(
+                proc.name for proc in engine_processes if proc.sentinel == died[0]
+            )
             logger.error(
-                "Engine core proc %s died unexpectedly, "
-                "shutting down client.", proc_name)
+                "Engine core proc %s died unexpectedly, shutting down client.",
+                proc_name,
+            )
             _self.shutdown()
             # Note: For MPClient, we don't have a failure callback mechanism
             # like MultiprocExecutor, but we set engine_dead flag which will
             # cause subsequent operations to raise EngineDeadError
 
-        Thread(target=monitor_engine_cores,
-               daemon=True,
-               name="MPClientEngineMonitor").start()
+        Thread(
+            target=monitor_engine_cores, daemon=True, name="MPClientEngineMonitor"
+        ).start()
 
 
-def _process_utility_output(output: UtilityOutput,
-                            utility_results: dict[int, AnyFuture]):
+def _process_utility_output(
+    output: UtilityOutput, utility_results: dict[int, AnyFuture]
+):
     """Set the result from a utility method in the waiting future."""
     future = utility_results.pop(output.call_id)
     failure_message = output.failure_message
@@ -591,15 +627,17 @@ def _process_utility_output(output: UtilityOutput,
         # original calling task being cancelled.
         if failure_message is not None:
             logger.error(
-                "Cancelled call to utility method failed "
-                "with error: %s", failure_message)
+                "Cancelled call to utility method failed with error: %s",
+                failure_message,
+            )
 
 
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
-                 log_stats: bool):
+    def __init__(
+        self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool
+    ):
         super().__init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
@@ -608,7 +646,7 @@ class SyncMPClient(MPClient):
         )
 
         self.is_dp = self.vllm_config.parallel_config.data_parallel_size > 1
-        self.outputs_queue = queue.Queue[Union[EngineCoreOutputs, Exception]]()
+        self.outputs_queue = queue.Queue[EngineCoreOutputs | Exception]()
 
         # Ensure that the outputs socket processing thread does not have
         # a ref to the client which prevents gc.
@@ -642,8 +680,7 @@ class SyncMPClient(MPClient):
                     resources.validate_alive(frames)
                     outputs: EngineCoreOutputs = decoder.decode(frames)
                     if outputs.utility_output:
-                        _process_utility_output(outputs.utility_output,
-                                                utility_results)
+                        _process_utility_output(outputs.utility_output, utility_results)
                     else:
                         outputs_queue.put_nowait(outputs)
             except Exception as e:
@@ -654,9 +691,11 @@ class SyncMPClient(MPClient):
                 out_socket.close(linger=0)
 
         # Process outputs from engine in separate thread.
-        self.output_queue_thread = Thread(target=process_outputs_socket,
-                                          name="EngineCoreOutputQueueThread",
-                                          daemon=True)
+        self.output_queue_thread = Thread(
+            target=process_outputs_socket,
+            name="EngineCoreOutputQueueThread",
+            daemon=True,
+        )
         self.output_queue_thread.start()
 
         # The thread takes on responsibility for closing the socket.
@@ -677,8 +716,7 @@ class SyncMPClient(MPClient):
         self.ensure_alive()
         self.free_pending_messages()
         # (Identity, RequestType, SerializedRequest)
-        msg = (self.core_engine, request_type.value,
-               *self.encoder.encode(request))
+        msg = (self.core_engine, request_type.value, *self.encoder.encode(request))
 
         if len(msg) <= 3:
             # No auxiliary buffers => no tensor backing buffers in request.
@@ -692,8 +730,7 @@ class SyncMPClient(MPClient):
         call_id = uuid.uuid1().int >> 64
         future: Future[Any] = Future()
         self.utility_results[call_id] = future
-        self._send_input(EngineCoreRequestType.UTILITY,
-                         (0, call_id, method, args))
+        self._send_input(EngineCoreRequestType.UTILITY, (0, call_id, method, args))
 
         return future.result()
 
@@ -733,7 +770,7 @@ class SyncMPClient(MPClient):
     def sleep(self, level: int = 1) -> None:
         self.call_utility("sleep", level)
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         self.call_utility("wake_up", tags)
 
     def is_sleeping(self) -> bool:
@@ -742,31 +779,33 @@ class SyncMPClient(MPClient):
     def execute_dummy_batch(self) -> None:
         self.call_utility("execute_dummy_batch")
 
-    def collective_rpc(self,
-                       method: Union[str, Callable[..., _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
-        return self.call_utility("collective_rpc", method, timeout, args,
-                                 kwargs)
-
-    def save_sharded_state(self,
-                           path: str,
-                           pattern: Optional[str] = None,
-                           max_size: Optional[int] = None) -> None:
+    def collective_rpc(
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
+        return self.call_utility("collective_rpc", method, timeout, args, kwargs)
+
+    def save_sharded_state(
+        self, path: str, pattern: str | None = None, max_size: int | None = None
+    ) -> None:
         self.call_utility("save_sharded_state", path, pattern, max_size)
 
 
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: type[Executor],
-                 log_stats: bool,
-                 client_addresses: Optional[dict[str, str]] = None,
-                 client_count: int = 1,
-                 client_index: int = 0):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+    ):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
@@ -777,8 +816,7 @@ class AsyncMPClient(MPClient):
 
         self.client_count = client_count
         self.client_index = client_index
-        self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs,
-                                                 Exception]]()
+        self.outputs_queue = asyncio.Queue[EngineCoreOutputs | Exception]()
         try:
             # If we are running in an asyncio event loop, start the queue task.
             # Otherwise, it will be started lazily. If it is not started here,
@@ -799,10 +837,9 @@ class AsyncMPClient(MPClient):
         decoder = self.decoder
         utility_results = self.utility_results
         outputs_queue = self.outputs_queue
-        output_handler: Optional[Callable[[AsyncMPClient, EngineCoreOutputs],
-                                          Awaitable[None]]] = getattr(
-                                              self.__class__,
-                                              "process_engine_outputs", None)
+        output_handler: (
+            Callable[[AsyncMPClient, EngineCoreOutputs], Awaitable[None]] | None
+        ) = getattr(self.__class__, "process_engine_outputs", None)
         _self_ref = weakref.ref(self) if output_handler else None
         output_socket = resources.output_socket
         assert output_socket is not None
@@ -814,8 +851,7 @@ class AsyncMPClient(MPClient):
                     resources.validate_alive(frames)
                     outputs: EngineCoreOutputs = decoder.decode(frames)
                     if outputs.utility_output:
-                        _process_utility_output(outputs.utility_output,
-                                                utility_results)
+                        _process_utility_output(outputs.utility_output, utility_results)
                         continue
 
                     if output_handler is not None:
@@ -834,7 +870,8 @@ class AsyncMPClient(MPClient):
                 outputs_queue.put_nowait(EngineDeadError())
 
         resources.output_queue_task = asyncio.create_task(
-            process_outputs_socket(), name="EngineCoreOutputQueueTask")
+            process_outputs_socket(), name="EngineCoreOutputQueueTask"
+        )
 
     async def get_output_async(self) -> EngineCoreOutputs:
         self._ensure_output_queue_task()
@@ -847,19 +884,21 @@ class AsyncMPClient(MPClient):
             raise self._format_exception(outputs) from None
         return outputs
 
-    def _send_input(self,
-                    request_type: EngineCoreRequestType,
-                    request: Any,
-                    engine: Optional[EngineIdentity] = None) -> Awaitable[Any]:
+    def _send_input(
+        self,
+        request_type: EngineCoreRequestType,
+        request: Any,
+        engine: EngineIdentity | None = None,
+    ) -> Awaitable[Any]:
         if engine is None:
             engine = self.core_engine
 
         message = (request_type.value, *self.encoder.encode(request))
         return self._send_input_message(message, engine, request)
 
-    def _send_input_message(self, message: tuple[bytestr,
-                                                 ...], engine: EngineIdentity,
-                            objects: Any) -> Awaitable[Any]:
+    def _send_input_message(
+        self, message: tuple[bytestr, ...], engine: EngineIdentity, objects: Any
+    ) -> Awaitable[Any]:
         """
         objects is a reference to retain until zmq is finished with the
         buffers, in case they were extracted from tensors in the request.
@@ -867,7 +906,7 @@ class AsyncMPClient(MPClient):
         self.ensure_alive()
         self.free_pending_messages()
 
-        msg = (engine, ) + message
+        msg = (engine,) + message
         if not objects or len(msg) <= 3:
             # No auxiliary buffers => no tensor backing buffers in request.
             return self.input_socket.send_multipart(msg, copy=False)
@@ -883,17 +922,18 @@ class AsyncMPClient(MPClient):
         return future
 
     async def call_utility_async(self, method: str, *args) -> Any:
-        return await self._call_utility_async(method,
-                                              *args,
-                                              engine=self.core_engine)
+        return await self._call_utility_async(method, *args, engine=self.core_engine)
 
-    async def _call_utility_async(self, method: str, *args,
-                                  engine: EngineIdentity) -> Any:
+    async def _call_utility_async(
+        self, method: str, *args, engine: EngineIdentity
+    ) -> Any:
         call_id = uuid.uuid1().int >> 64
         future = asyncio.get_running_loop().create_future()
         self.utility_results[call_id] = future
-        message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode(
-            (self.client_index, call_id, method, args)))
+        message = (
+            EngineCoreRequestType.UTILITY.value,
+            *self.encoder.encode((self.client_index, call_id, method, args)),
+        )
         await self._send_input_message(message, engine, args)
         self._ensure_output_queue_task()
         return await future
@@ -922,7 +962,7 @@ class AsyncMPClient(MPClient):
     async def sleep_async(self, level: int = 1) -> None:
         await self.call_utility_async("sleep", level)
 
-    async def wake_up_async(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up_async(self, tags: list[str] | None = None) -> None:
         await self.call_utility_async("wake_up", tags)
 
     async def is_sleeping_async(self) -> bool:
@@ -943,38 +983,46 @@ class AsyncMPClient(MPClient):
     async def pin_lora_async(self, lora_id: int) -> bool:
         return await self.call_utility_async("pin_lora", lora_id)
 
-    async def save_sharded_state_async(self,
-                                       path: str,
-                                       pattern: Optional[str] = None,
-                                       max_size: Optional[int] = None) -> None:
-        await self.call_utility_async("save_sharded_state", path, pattern,
-                                      max_size)
+    async def save_sharded_state_async(
+        self, path: str, pattern: str | None = None, max_size: int | None = None
+    ) -> None:
+        await self.call_utility_async("save_sharded_state", path, pattern, max_size)
 
     async def collective_rpc_async(
-            self,
-            method: Union[str, Callable[..., _R]],
-            timeout: Optional[float] = None,
-            args: tuple = (),
-            kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
-        return await self.call_utility_async("collective_rpc", method, timeout,
-                                             args, kwargs)
+        self,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
+        return await self.call_utility_async(
+            "collective_rpc", method, timeout, args, kwargs
+        )
 
 
 class DPAsyncMPClient(AsyncMPClient):
     """Asyncio-compatible client for multi-proc, multi-engine (data parallel)
     EngineCore. Assumes external load-balancing by default."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: type[Executor],
-                 log_stats: bool,
-                 client_addresses: Optional[dict[str, str]] = None,
-                 client_count: int = 1,
-                 client_index: int = 0):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+    ):
         self.current_wave = 0
 
-        super().__init__(vllm_config, executor_class, log_stats,
-                         client_addresses, client_count, client_index)
+        super().__init__(
+            vllm_config,
+            executor_class,
+            log_stats,
+            client_addresses,
+            client_count,
+            client_index,
+        )
 
         # List of [waiting, running] pair per engine.
         # Used only by DPLBAsyncMPClient subclass.
@@ -982,10 +1030,8 @@ class DPAsyncMPClient(AsyncMPClient):
 
         self.first_req_sock_addr = get_open_zmq_inproc_path()
         self.first_req_send_socket = self.resources.first_req_send_socket = (
-            make_zmq_socket(self.ctx,
-                            self.first_req_sock_addr,
-                            zmq.PAIR,
-                            bind=True))
+            make_zmq_socket(self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=True)
+        )
         try:
             # If we are running in an asyncio event loop, start the stats task.
             # Otherwise, it will be started lazily.
@@ -1000,29 +1046,28 @@ class DPAsyncMPClient(AsyncMPClient):
             return
 
         assert self.stats_update_address is not None
+        stats_addr: str = self.stats_update_address
         assert len(self.engine_ranks_managed) > 0
         # NOTE: running and waiting counts are all global from
         # the Coordinator include all global EngineCores. This
         # slice includes just the cores managed by this client.
-        count_slice = slice(self.engine_ranks_managed[0],
-                            self.engine_ranks_managed[-1] + 1)
+        count_slice = slice(
+            self.engine_ranks_managed[0], self.engine_ranks_managed[-1] + 1
+        )
 
         async def run_engine_stats_update_task():
-            with (make_zmq_socket(self.ctx,
-                                  self.stats_update_address,
-                                  zmq.XSUB,
-                                  linger=0) as socket,
-                  make_zmq_socket(self.ctx,
-                                  self.first_req_sock_addr,
-                                  zmq.PAIR,
-                                  bind=False,
-                                  linger=0) as first_req_rcv_socket):
+            with (
+                make_zmq_socket(self.ctx, stats_addr, zmq.XSUB, linger=0) as socket,
+                make_zmq_socket(
+                    self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=False, linger=0
+                ) as first_req_rcv_socket,
+            ):
                 assert isinstance(socket, zmq.asyncio.Socket)
                 assert isinstance(first_req_rcv_socket, zmq.asyncio.Socket)
                 self.resources.stats_update_socket = socket
                 self.resources.first_req_rcv_socket = first_req_rcv_socket
                 # Send subscription message.
-                await socket.send(b'\x01')
+                await socket.send(b"\x01")
 
                 poller = zmq.asyncio.Poller()
                 poller.register(socket, zmq.POLLIN)
@@ -1030,23 +1075,27 @@ class DPAsyncMPClient(AsyncMPClient):
 
                 while True:
                     events = await poller.poll()
-                    if not self.engines_running and len(events) == 2 or (
-                            events[0][0] == first_req_rcv_socket):
+                    if (
+                        not self.engines_running
+                        and len(events) == 2
+                        or (events[0][0] == first_req_rcv_socket)
+                    ):
                         # Check if this is a regular request notification or
                         # scale up notification
-                        buf = first_req_rcv_socket.recv(
-                            flags=zmq.NOBLOCK).result()
+                        buf = first_req_rcv_socket.recv(flags=zmq.NOBLOCK).result()
 
                         decoded = msgspec.msgpack.decode(buf)
-                        if isinstance(
-                                decoded,
-                            (list, tuple)) and len(decoded) == 2 and decoded[
-                                0] == "SCALE_ELASTIC_EP":
+                        if (
+                            isinstance(decoded, (list, tuple))
+                            and len(decoded) == 2
+                            and decoded[0] == "SCALE_ELASTIC_EP"
+                        ):
                             # Extract new engine count from the decoded message
                             new_engine_count = decoded[1]
                             # Send scale up notification to coordinator
                             scale_msg = msgspec.msgpack.encode(
-                                ("SCALE_ELASTIC_EP", new_engine_count))
+                                ("SCALE_ELASTIC_EP", new_engine_count)
+                            )
                             await socket.send(scale_msg)
                             continue
 
@@ -1057,14 +1106,14 @@ class DPAsyncMPClient(AsyncMPClient):
                         target_eng_index = decoded[1]
                         self.engines_running = True
                         msg = msgspec.msgpack.encode(
-                            (target_eng_index, self.current_wave))
+                            (target_eng_index, self.current_wave)
+                        )
                         await socket.send(msg)
 
                     buf = None
                     while True:
                         # Drain all stats events (we only care about latest).
-                        future: asyncio.Future[bytes] = socket.recv(
-                            flags=zmq.NOBLOCK)
+                        future: asyncio.Future[bytes] = socket.recv(flags=zmq.NOBLOCK)
                         if isinstance(future.exception(), zmq.Again):
                             break
                         buf = future.result()
@@ -1078,11 +1127,13 @@ class DPAsyncMPClient(AsyncMPClient):
                     if counts is not None:
                         sliced_counts = counts[count_slice]
                         self.lb_engines = sliced_counts
-                        logger.debug("Received counts: %s (%s)", sliced_counts,
-                                     count_slice)
+                        logger.debug(
+                            "Received counts: %s (%s)", sliced_counts, count_slice
+                        )
 
         resources.stats_update_task = asyncio.create_task(
-            run_engine_stats_update_task())
+            run_engine_stats_update_task()
+        )
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
         self._ensure_stats_update_task()
@@ -1091,8 +1142,7 @@ class DPAsyncMPClient(AsyncMPClient):
         request.client_index = self.client_index
 
         chosen_engine = self.get_core_engine_for_request(request)
-        to_await = self._send_input(EngineCoreRequestType.ADD, request,
-                                    chosen_engine)
+        to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine)
         if not self.engines_running:
             # Notify coordinator that we're sending a request
             req_msg = msgspec.msgpack.encode(("FIRST_REQ", chosen_engine))
@@ -1110,29 +1160,36 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
     """Asyncio-compatible client for multi-proc, multi-engine (data parallel)
     EngineCore. Load-balances between multiple engine processes."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: type[Executor],
-                 log_stats: bool,
-                 client_addresses: Optional[dict[str, str]] = None,
-                 client_count: int = 1,
-                 client_index: int = 0):
-
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+    ):
         self.client_count = client_count
 
         # To route aborts to the correct engine.
         self.reqs_in_flight: dict[str, EngineIdentity] = {}
 
-        super().__init__(vllm_config, executor_class, log_stats,
-                         client_addresses, client_count, client_index)
+        super().__init__(
+            vllm_config,
+            executor_class,
+            log_stats,
+            client_addresses,
+            client_count,
+            client_index,
+        )
 
         assert len(self.core_engines) > 1
 
-        self.eng_start_index = (len(self.core_engines) *
-                                self.client_index) // client_count
+        self.eng_start_index = (
+            len(self.core_engines) * self.client_index
+        ) // client_count
 
-    def get_core_engine_for_request(
-            self, request: EngineCoreRequest) -> EngineIdentity:
+    def get_core_engine_for_request(self, request: EngineCoreRequest) -> EngineIdentity:
         # Engines are in rank order.
         if (eng_index := request.data_parallel_rank) is None:
             current_counts = self.lb_engines
@@ -1160,14 +1217,19 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
 
     async def call_utility_async(self, method: str, *args) -> Any:
         # Only the result from the first engine is returned.
-        return (await asyncio.gather(*[
-            self._call_utility_async(method, *args, engine=engine)
-            for engine in self.core_engines
-        ]))[0]
+        return (
+            await asyncio.gather(
+                *[
+                    self._call_utility_async(method, *args, engine=engine)
+                    for engine in self.core_engines
+                ]
+            )
+        )[0]
 
     @staticmethod
-    async def process_engine_outputs(self: "DPLBAsyncMPClient",
-                                     outputs: EngineCoreOutputs):
+    async def process_engine_outputs(
+        self: "DPLBAsyncMPClient", outputs: EngineCoreOutputs
+    ):
         if outputs.finished_requests and self.reqs_in_flight:
             for req_id in outputs.finished_requests:
                 self.reqs_in_flight.pop(req_id, None)
@@ -1189,10 +1251,10 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
         for engine, req_ids in by_engine.items():
             await self._abort_requests(req_ids, engine)
 
-    async def _abort_requests(self, request_ids: list[str],
-                              engine: EngineIdentity) -> None:
-        await self._send_input(EngineCoreRequestType.ABORT, request_ids,
-                               engine)
+    async def _abort_requests(
+        self, request_ids: list[str], engine: EngineIdentity
+    ) -> None:
+        await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine)
 
     async def scale_elastic_ep(self, new_data_parallel_size: int) -> None:
         """Scale elastic EP data parallel size"""
@@ -1200,22 +1262,27 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
 
         assert new_data_parallel_size != cur_data_parallel_size, (
             f"new_data_parallel_size {new_data_parallel_size} must be "
-            f"different from cur_data_parallel_size {cur_data_parallel_size}")
+            f"different from cur_data_parallel_size {cur_data_parallel_size}"
+        )
 
-        assert self.vllm_config.parallel_config.data_parallel_backend == \
-            "ray", "Only ray DP backend supports scaling elastic EP"
+        assert self.vllm_config.parallel_config.data_parallel_backend == "ray", (
+            "Only ray DP backend supports scaling elastic EP"
+        )
 
         scale_up = new_data_parallel_size > cur_data_parallel_size
 
         if scale_up:
-            await self._scale_up_elastic_ep(cur_data_parallel_size,
-                                            new_data_parallel_size)
+            await self._scale_up_elastic_ep(
+                cur_data_parallel_size, new_data_parallel_size
+            )
         else:
-            await self._scale_down_elastic_ep(cur_data_parallel_size,
-                                              new_data_parallel_size)
+            await self._scale_down_elastic_ep(
+                cur_data_parallel_size, new_data_parallel_size
+            )
 
-    async def _scale_up_elastic_ep(self, cur_data_parallel_size: int,
-                                   new_data_parallel_size: int) -> None:
+    async def _scale_up_elastic_ep(
+        self, cur_data_parallel_size: int, new_data_parallel_size: int
+    ) -> None:
         """Scale up the data parallel size by creating new engine cores
         and reconfiguring existing ones."""
         cur_data_parallel_size = len(self.core_engines)
@@ -1223,21 +1290,18 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
         # Phase 1: Send reconfigure messages to all existing engines and wait
         # for them to be sent
         reconfig_futures = []
-        self.vllm_config.parallel_config.data_parallel_master_port = \
-            get_open_port()
+        self.vllm_config.parallel_config.data_parallel_master_port = get_open_port()
         for engine in self.core_engines:
             reconfig_request = ReconfigureDistributedRequest(
                 new_data_parallel_size=new_data_parallel_size,
                 new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK,
-                new_data_parallel_rank_local=\
-                ReconfigureRankType.KEEP_CURRENT_RANK,
-                new_data_parallel_master_ip=self.vllm_config.parallel_config.
-                data_parallel_master_ip,
-                new_data_parallel_master_port=self.vllm_config.parallel_config.
-                data_parallel_master_port)
-            coro = self._call_utility_async("reinitialize_distributed",
-                                            reconfig_request,
-                                            engine=engine)
+                new_data_parallel_rank_local=ReconfigureRankType.KEEP_CURRENT_RANK,
+                new_data_parallel_master_ip=self.vllm_config.parallel_config.data_parallel_master_ip,
+                new_data_parallel_master_port=self.vllm_config.parallel_config.data_parallel_master_port,
+            )
+            coro = self._call_utility_async(
+                "reinitialize_distributed", reconfig_request, engine=engine
+            )
             reconfig_futures.append(asyncio.create_task(coro))
 
         logger.info("All reconfigure messages sent, starting engine creation")
@@ -1245,10 +1309,10 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
         # Phase 2: Create new engines now that reconfig messages have been sent
         # self.resources.engine_manager is guaranteed to be
         # CoreEngineActorManager for RayDPClient
-        assert isinstance(self.resources.engine_manager,
-                          CoreEngineActorManager)
+        assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
         self.resources.engine_manager.scale_up_elastic_ep(
-            self.vllm_config, new_data_parallel_size)
+            self.vllm_config, new_data_parallel_size
+        )
 
         # Create new CoreEngine objects for the new engines
         new_engine_identities = set()
@@ -1263,7 +1327,8 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
             if not sync_input_socket.poll(timeout=600_000):
                 raise TimeoutError(
                     "Timed out waiting for new engines to send initial "
-                    "message on input socket.")
+                    "message on input socket."
+                )
             identity, _ = sync_input_socket.recv_multipart()
             new_engine_identities.discard(identity)
 
@@ -1275,42 +1340,42 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
         # stats_update_task connection
         self._ensure_stats_update_task()
         scale_up_marker = msgspec.msgpack.encode(
-            ("SCALE_ELASTIC_EP", new_data_parallel_size))
+            ("SCALE_ELASTIC_EP", new_data_parallel_size)
+        )
         await self.first_req_send_socket.send(scale_up_marker)
 
         # Update the parallel config
-        self.vllm_config.parallel_config.data_parallel_size = \
-            new_data_parallel_size
+        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         logger.info(
             "[Elastic EP] Scale up completed, new data parallel size: %s",
-            new_data_parallel_size)
+            new_data_parallel_size,
+        )
 
-    async def _scale_down_elastic_ep(self, cur_data_parallel_size: int,
-                                     new_data_parallel_size: int) -> None:
+    async def _scale_down_elastic_ep(
+        self, cur_data_parallel_size: int, new_data_parallel_size: int
+    ) -> None:
         """Scale down the data parallel size by shutting down and
         reconfiguring existing engine cores."""
         cur_data_parallel_size = len(self.core_engines)
 
-        self.vllm_config.parallel_config.data_parallel_master_port = \
-            get_open_port()
+        self.vllm_config.parallel_config.data_parallel_master_port = get_open_port()
 
         reconfig_futures = []
         for cur_dp_rank, engine in enumerate(self.core_engines):
             reconfig_request = ReconfigureDistributedRequest(
                 new_data_parallel_size=new_data_parallel_size,
                 new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK,
-                new_data_parallel_rank_local=\
-                ReconfigureRankType.KEEP_CURRENT_RANK,
-                new_data_parallel_master_ip=self.vllm_config.parallel_config.
-                data_parallel_master_ip,
-                new_data_parallel_master_port=self.vllm_config.parallel_config.
-                data_parallel_master_port)
+                new_data_parallel_rank_local=ReconfigureRankType.KEEP_CURRENT_RANK,
+                new_data_parallel_master_ip=self.vllm_config.parallel_config.data_parallel_master_ip,
+                new_data_parallel_master_port=self.vllm_config.parallel_config.data_parallel_master_port,
+            )
             if cur_dp_rank >= new_data_parallel_size:
-                reconfig_request.new_data_parallel_rank = \
-                ReconfigureRankType.SHUTDOWN_CURRENT_RANK
-            coro = self._call_utility_async("reinitialize_distributed",
-                                            reconfig_request,
-                                            engine=engine)
+                reconfig_request.new_data_parallel_rank = (
+                    ReconfigureRankType.SHUTDOWN_CURRENT_RANK
+                )
+            coro = self._call_utility_async(
+                "reinitialize_distributed", reconfig_request, engine=engine
+            )
             reconfig_futures.append(asyncio.create_task(coro))
 
         for _ in range(new_data_parallel_size, cur_data_parallel_size):
@@ -1318,18 +1383,19 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
 
         await asyncio.gather(*reconfig_futures)
 
-        assert isinstance(self.resources.engine_manager,
-                          CoreEngineActorManager)
+        assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
         self.resources.engine_manager.scale_down_elastic_ep(
-            cur_data_parallel_size, new_data_parallel_size)
+            cur_data_parallel_size, new_data_parallel_size
+        )
 
         self._ensure_stats_update_task()
         scale_down_marker = msgspec.msgpack.encode(
-            ("SCALE_ELASTIC_EP", new_data_parallel_size))
+            ("SCALE_ELASTIC_EP", new_data_parallel_size)
+        )
         await self.first_req_send_socket.send(scale_down_marker)
 
-        self.vllm_config.parallel_config.data_parallel_size = \
-            new_data_parallel_size
+        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         logger.info(
             "[Elastic EP] Scale down completed, new data parallel size: %s",
-            new_data_parallel_size)
+            new_data_parallel_size,
+        )
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 0f993a74c810371b0aa865deecd862966aee9076..b7a24096bf15f5b7730891e8b9d61cd1e66f25c5 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import tokenizers
 from packaging import version
@@ -11,7 +10,10 @@ from transformers import PreTrainedTokenizerFast
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.detokenizer_utils import (
-    AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
+    AnyTokenizer,
+    convert_prompt_ids_to_tokens,
+    detokenize_incrementally,
+)
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest
 
@@ -19,15 +21,13 @@ logger = init_logger(__name__)
 
 # Only tokenizers >= 0.21.1 supports DecodeStream used for
 # FastIncrementalDetokenizer.
-USE_FAST_DETOKENIZER = version.parse(
-    tokenizers.__version__) >= version.parse("0.21.1")
+USE_FAST_DETOKENIZER = version.parse(tokenizers.__version__) >= version.parse("0.21.1")
 
 # Error string from https://github.com/huggingface/tokenizers/blob/909fdde2a4ffedd9295206f705eb612be2a91b12/tokenizers/src/tokenizer/mod.rs#L1042
 INVALID_PREFIX_ERR_MSG = "Invalid prefix encountered"
 
 
 class IncrementalDetokenizer:
-
     def __init__(self):
         self.token_ids: list[int] = []
 
@@ -35,8 +35,7 @@ class IncrementalDetokenizer:
     def output_token_ids(self) -> list[int]:
         return self.token_ids
 
-    def update(self, new_token_ids: list[int],
-               stop_terminated: bool) -> Optional[str]:
+    def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
         self.token_ids.extend(new_token_ids)
         return None
 
@@ -46,18 +45,16 @@ class IncrementalDetokenizer:
     @classmethod
     def from_new_request(
         cls,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         request: EngineCoreRequest,
     ) -> "IncrementalDetokenizer":
-
         assert request.sampling_params is not None
 
         if tokenizer is None:
             # No tokenizer => skipping detokenization.
             return IncrementalDetokenizer()
 
-        if USE_FAST_DETOKENIZER and isinstance(tokenizer,
-                                               PreTrainedTokenizerFast):
+        if USE_FAST_DETOKENIZER and isinstance(tokenizer, PreTrainedTokenizerFast):
             # Fast tokenizer => use tokenizers library DecodeStream.
             return FastIncrementalDetokenizer(tokenizer, request)
 
@@ -66,21 +63,27 @@ class IncrementalDetokenizer:
 
 
 class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
-
     def __init__(self, request: EngineCoreRequest):
         super().__init__()
 
         # Stop strings
         params = request.sampling_params
         assert params is not None
-        self.stop = stop = params.stop
+        stop_list: list[str]
+        if params.stop is None:
+            stop_list = []
+        elif isinstance(params.stop, str):
+            stop_list = [params.stop]
+        else:
+            stop_list = params.stop
+        self.stop = stop_list
         self.min_tokens = params.min_tokens
         self.include_stop_str_in_output = params.include_stop_str_in_output
 
         # Number of chars to hold back when stop strings are to be excluded
         # from streamed output.
-        if stop and not self.include_stop_str_in_output:
-            self.stop_buffer_length = max(len(s) for s in stop) - 1
+        if self.stop and not self.include_stop_str_in_output:
+            self.stop_buffer_length = max(len(s) for s in self.stop) - 1
         else:
             self.stop_buffer_length = 0
         self._last_output_text_offset: int = 0
@@ -88,8 +91,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
         # Generation data
         self.output_text = ""
 
-    def update(self, new_token_ids: list[int],
-               stop_terminated: bool) -> Optional[str]:
+    def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
         """
         Update RequestState for the request_id by:
             1) Detokenize the new token ids incrementally.
@@ -117,8 +119,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
             self.token_ids.append(new_token_id)
             self.output_text += self.decode_next(new_token_id)
             # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014
-            if self.min_tokens and len(
-                    self.output_token_ids) <= self.min_tokens:
+            if self.min_tokens and len(self.output_token_ids) <= self.min_tokens:
                 stop_check_offset = len(self.output_text)
 
         if skipped_stop_token_id is not None:
@@ -152,8 +153,11 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
         # We return the full output text if the sequence is finished.
         buffer_length = 0 if finished else self.stop_buffer_length
         if not delta:
-            return self.output_text[:-buffer_length] if buffer_length else (
-                self.output_text)
+            return (
+                self.output_text[:-buffer_length]
+                if buffer_length
+                else (self.output_text)
+            )
         length = len(self.output_text) - buffer_length
         last_offset = self._last_output_text_offset
         if last_offset < length:
@@ -163,9 +167,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
 
 
 class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
-
-    def __init__(self, tokenizer: PreTrainedTokenizerFast,
-                 request: EngineCoreRequest):
+    def __init__(self, tokenizer: PreTrainedTokenizerFast, request: EngineCoreRequest):
         super().__init__(request)
 
         sampling_params = request.sampling_params
@@ -173,8 +175,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
 
         self.request_id = request.request_id
         self.skip_special_tokens = sampling_params.skip_special_tokens
-        self.stream = DecodeStream(
-            skip_special_tokens=self.skip_special_tokens)
+        self.stream = DecodeStream(skip_special_tokens=self.skip_special_tokens)
 
         self.tokenizer: Tokenizer = tokenizer._tokenizer
 
@@ -185,7 +186,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
         if prompt_len > 4:
             for i in range(4, min(prompt_len + 1, 24)):
                 suffix = prompt_token_ids[-i:]
-                if '�' not in self.tokenizer.decode(suffix):
+                if "�" not in self.tokenizer.decode(suffix):
                     prompt_suffix = suffix
                     break
 
@@ -195,17 +196,18 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
 
         self.spaces_between_special_tokens = (
             sampling_params.skip_special_tokens
-            or sampling_params.spaces_between_special_tokens)
+            or sampling_params.spaces_between_special_tokens
+        )
 
         if not self.spaces_between_special_tokens:
             # Store dict of added token ids so that we can suppress
             # the spaces between them.
-            if (added_token_ids := getattr(self.tokenizer, "added_token_ids",
-                                           None)) is None:
+            if (
+                added_token_ids := getattr(self.tokenizer, "added_token_ids", None)
+            ) is None:
                 self.tokenizer.added_token_ids = added_token_ids = {
                     tid: tok.content
-                    for tid, tok in
-                    self.tokenizer.get_added_tokens_decoder().items()
+                    for tid, tok in self.tokenizer.get_added_tokens_decoder().items()
                 }
 
             if added_token_ids:
@@ -228,13 +230,13 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
 
         return token or ""
 
-    def _protected_step(self, next_token_id: int) -> Optional[str]:
+    def _protected_step(self, next_token_id: int) -> str | None:
         try:
             token = self.stream.step(self.tokenizer, next_token_id)
-        except OverflowError:
+        except (OverflowError, TypeError):
             # Handle rare observed overflow, still to be diagnosed.
             # See https://github.com/vllm-project/vllm/issues/21951.
-            logger.exception("Encountered invalid token id: %d", next_token_id)
+            logger.exception("Encountered invalid token id: %r", next_token_id)
             token = None
         except Exception as e:
             if not str(e).startswith(INVALID_PREFIX_ERR_MSG):
@@ -245,15 +247,15 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
             # See https://github.com/vllm-project/vllm/issues/17448.
             logger.warning(
                 "Encountered invalid prefix detokenization error"
-                " for request %s, resetting decode stream.", self.request_id)
-            self.stream = DecodeStream(
-                skip_special_tokens=self.skip_special_tokens)
+                " for request %s, resetting decode stream.",
+                self.request_id,
+            )
+            self.stream = DecodeStream(skip_special_tokens=self.skip_special_tokens)
             token = self.stream.step(self.tokenizer, next_token_id)
         return token
 
 
 class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
-
     def __init__(self, tokenizer: AnyTokenizer, request: EngineCoreRequest):
         super().__init__(request)
 
@@ -262,7 +264,8 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
         assert params is not None
 
         self.prompt_len = length_from_prompt_token_ids_or_embeds(
-            request.prompt_token_ids, request.prompt_embeds)
+            request.prompt_token_ids, request.prompt_embeds
+        )
 
         # Metadata for incremental detokenization.
         if request.prompt_token_ids is not None:
@@ -271,37 +274,37 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
                     tokenizer=tokenizer,
                     prompt_ids=request.prompt_token_ids,
                     skip_special_tokens=params.skip_special_tokens,
-                ))
+                )
+            )
         else:
             # Prompt embedding requests cannot be detokenized, in general.
             self.tokens = [""] * self.prompt_len
             self.prefix_offset = 0
             self.read_offest = 0
 
-        self.token_ids.extend(request.prompt_token_ids
-                              or [0] * self.prompt_len)
+        self.token_ids.extend(request.prompt_token_ids or [0] * self.prompt_len)
 
         self.skip_special_tokens = params.skip_special_tokens
-        self.spaces_between_special_tokens = (
-            params.spaces_between_special_tokens)
+        self.spaces_between_special_tokens = params.spaces_between_special_tokens
 
     @property
     def output_token_ids(self) -> list[int]:
-        return self.token_ids if not self.prompt_len else (
-            self.token_ids[self.prompt_len:])
+        return (
+            self.token_ids
+            if not self.prompt_len
+            else (self.token_ids[self.prompt_len :])
+        )
 
     def decode_next(self, next_token_id: int) -> str:
-        new_tokens, decoded_text, prefix_offset, read_offset = (
-            detokenize_incrementally(
-                tokenizer=self.tokenizer,
-                all_input_ids=self.token_ids,
-                prev_tokens=self.tokens,
-                prefix_offset=self.prefix_offset,
-                read_offset=self.read_offset,
-                skip_special_tokens=self.skip_special_tokens,
-                spaces_between_special_tokens=self.
-                spaces_between_special_tokens,
-            ))
+        new_tokens, decoded_text, prefix_offset, read_offset = detokenize_incrementally(
+            tokenizer=self.tokenizer,
+            all_input_ids=self.token_ids,
+            prev_tokens=self.tokens,
+            prefix_offset=self.prefix_offset,
+            read_offset=self.read_offset,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+        )
 
         self.tokens.extend(new_tokens)
         self.prefix_offset = prefix_offset
@@ -315,7 +318,7 @@ def check_stop_strings(
     new_char_count: int,
     stop: list[str],
     include_in_output: bool,
-) -> Optional[tuple[str, int]]:
+) -> tuple[str, int] | None:
     """Check if any stop strings are matched and truncate sequence
     output text accordingly.
 
@@ -331,8 +334,7 @@ def check_stop_strings(
     for stop_str in stop:
         stop_string_len = len(stop_str)
         # Avoid searching already-searched text.
-        stop_index = output_text.find(stop_str,
-                                      1 - new_char_count - stop_string_len)
+        stop_index = output_text.find(stop_str, 1 - new_char_count - stop_string_len)
         if stop_index == -1:
             continue
 
diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py
index 692ba9dc840f8bc2ac3a5803ec3214de4e90f7d3..d9f79a019e2df8f51a229a1618de4bb14cf6cb1d 100644
--- a/vllm/v1/engine/exceptions.py
+++ b/vllm/v1/engine/exceptions.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 class EngineGenerateError(Exception):
     """Raised when a AsyncLLM.generate() fails. Recoverable."""
+
     pass
 
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 2738776e3d37c1f378d50dad2ca6ed01a8f1b29c..e403cea87788bc3bb0511fa3ee115d5cae7742ee 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from copy import copy
-from typing import Any, Callable, Optional, Union
+from typing import Any, cast
 
 import torch.nn as nn
 from typing_extensions import TypeVar
@@ -19,22 +19,23 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
 from vllm.tracing import init_tracer
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
-                                               init_tokenizer_from_configs)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor import Executor
 from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
 from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
 from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.utils import record_function_or_nullcontext
 from vllm.v1.worker.worker_base import WorkerBase
 
 logger = init_logger(__name__)
@@ -50,24 +51,13 @@ class LLMEngine:
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
+        aggregate_engine_logging: bool = False,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         use_cached_outputs: bool = False,
         multiprocess_mode: bool = False,
     ) -> None:
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
-                "This should not happen. As a workaround, try using "
-                "LLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
-
-        if stat_loggers is not None:
-            raise NotImplementedError(
-                "Passing StatLoggers to LLMEngine in V1 is not yet supported. "
-                "Set VLLM_USE_V1=0 and file and issue on Github.")
-
         self.vllm_config = vllm_config
         self.observability_config = vllm_config.observability_config
         self.model_config = vllm_config.model_config
@@ -75,39 +65,43 @@ class LLMEngine:
 
         self.log_stats = log_stats
 
-        executor_backend = (
-            self.vllm_config.parallel_config.distributed_executor_backend)
+        executor_backend = self.vllm_config.parallel_config.distributed_executor_backend
         parallel_config = vllm_config.parallel_config
-        self.external_launcher_dp = (parallel_config.data_parallel_size > 1 and
-                                     executor_backend == "external_launcher")
+        self.external_launcher_dp = (
+            parallel_config.data_parallel_size > 1
+            and executor_backend == "external_launcher"
+        )
         # important: init dp group before init the engine_core
         # In the decoupled engine case this is handled in EngineCoreProc.
-        if not multiprocess_mode and parallel_config.data_parallel_size > 1 \
-            and not self.external_launcher_dp:
+        if (
+            not multiprocess_mode
+            and parallel_config.data_parallel_size > 1
+            and not self.external_launcher_dp
+        ):
             self.dp_group = parallel_config.stateless_init_dp_group()
         else:
             self.dp_group = None
         self.should_execute_dummy_batch = False
 
         if self.model_config.skip_tokenizer_init:
-            self.tokenizer = None
+            tokenizer = None
         else:
-            # Tokenizer (+ ensure liveness if running in another process).
-            self.tokenizer = init_tokenizer_from_configs(
-                model_config=vllm_config.model_config)
+            tokenizer = init_tokenizer_from_configs(self.model_config)
 
-        # Processor (convert Inputs --> EngineCoreRequests)
-        self.processor = Processor(vllm_config=vllm_config,
-                                   tokenizer=self.tokenizer,
-                                   mm_registry=mm_registry)
+        self.processor = Processor(self.vllm_config, tokenizer)
+        self.io_processor = get_io_processor(
+            self.vllm_config,
+            self.model_config.io_processor_plugin,
+        )
 
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
-        self.output_processor = OutputProcessor(self.tokenizer,
-                                                log_stats=self.log_stats)
-        if self.observability_config.otlp_traces_endpoint is not None:
-            tracer = init_tracer(
-                "vllm.llm_engine",
-                self.observability_config.otlp_traces_endpoint)
+        stream_interval = self.vllm_config.scheduler_config.stream_interval
+        self.output_processor = OutputProcessor(
+            self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval
+        )
+        endpoint = self.observability_config.otlp_traces_endpoint
+        if endpoint is not None:
+            tracer = init_tracer("vllm.llm_engine", endpoint)
             self.output_processor.tracer = tracer
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
@@ -119,12 +113,13 @@ class LLMEngine:
             log_stats=self.log_stats,
         )
 
-        self.logger_manager: Optional[StatLoggerManager] = None
+        self.logger_manager: StatLoggerManager | None = None
         if self.log_stats:
             self.logger_manager = StatLoggerManager(
                 vllm_config=vllm_config,
                 custom_stat_loggers=stat_loggers,
                 enable_default_loggers=log_stats,
+                aggregate_engine_logging=aggregate_engine_logging,
             )
             self.logger_manager.log_engine_initialized()
 
@@ -145,22 +140,24 @@ class LLMEngine:
         cls,
         vllm_config: VllmConfig,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         disable_log_stats: bool = False,
     ) -> "LLMEngine":
-        return cls(vllm_config=vllm_config,
-                   executor_class=Executor.get_class(vllm_config),
-                   log_stats=(not disable_log_stats),
-                   usage_context=usage_context,
-                   stat_loggers=stat_loggers,
-                   multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=Executor.get_class(vllm_config),
+            log_stats=(not disable_log_stats),
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING,
+        )
 
     @classmethod
     def from_engine_args(
         cls,
         engine_args: EngineArgs,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         enable_multiprocessing: bool = False,
     ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
@@ -174,12 +171,14 @@ class LLMEngine:
             enable_multiprocessing = True
 
         # Create the LLMEngine.
-        return cls(vllm_config=vllm_config,
-                   executor_class=executor_class,
-                   log_stats=not engine_args.disable_log_stats,
-                   usage_context=usage_context,
-                   stat_loggers=stat_loggers,
-                   multiprocess_mode=enable_multiprocessing)
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=not engine_args.disable_log_stats,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            multiprocess_mode=enable_multiprocessing,
+        )
 
     def get_num_unfinished_requests(self) -> int:
         return self.output_processor.get_num_unfinished_requests()
@@ -192,7 +191,8 @@ class LLMEngine:
 
     def has_unfinished_requests_dp(self, has_unfinished: bool) -> bool:
         aggregated_has_unfinished = ParallelConfig.has_unfinished_dp(
-            self.dp_group, has_unfinished)
+            self.dp_group, has_unfinished
+        )
         if not has_unfinished and aggregated_has_unfinished:
             self.should_execute_dummy_batch = True
         return aggregated_has_unfinished
@@ -213,29 +213,48 @@ class LLMEngine:
     def add_request(
         self,
         request_id: str,
-        prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt: EngineCoreRequest | PromptType,
+        params: SamplingParams | PoolingParams,
+        arrival_time: float | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
+        prompt_text: str | None = None,
     ) -> None:
         # Validate the request_id type.
         if not isinstance(request_id, str):
-            raise TypeError(
-                f"request_id must be a string, got {type(request_id)}")
+            raise TypeError(f"request_id must be a string, got {type(request_id)}")
 
         # Process raw inputs into the request.
-        prompt_str, request = self.processor.process_inputs(
-            request_id, prompt, params, arrival_time, lora_request,
-            tokenization_kwargs, trace_headers, priority)
+        if isinstance(prompt, EngineCoreRequest):
+            request = prompt
+        else:
+            assert prompt_text is None
+            logger.warning_once(
+                "Processor has been moved under LLM and will "
+                "be removed from LLMEngine in v0.13."
+            )
+            request = self.processor.process_inputs(
+                request_id,
+                prompt,
+                params,
+                arrival_time,
+                lora_request,
+                tokenization_kwargs,
+                trace_headers,
+                priority,
+            )
+            if isinstance(prompt, str):
+                prompt_text = prompt
+            elif isinstance(prompt, Mapping):
+                prompt_text = cast(str | None, prompt.get("prompt"))
 
         n = params.n if isinstance(params, SamplingParams) else 1
 
         if n == 1:
             # Make a new RequestState and queue.
-            self.output_processor.add_request(request, prompt_str, None, 0)
+            self.output_processor.add_request(request, prompt_text, None, 0)
             # Add the request to EngineCore.
             self.engine_core.add_request(request)
             return
@@ -249,48 +268,48 @@ class LLMEngine:
             child_request.sampling_params = params
 
             # Make a new RequestState and queue.
-            self.output_processor.add_request(child_request, prompt_str,
-                                              parent_req, idx)
+            self.output_processor.add_request(
+                child_request, prompt_text, parent_req, idx
+            )
             # Add the request to EngineCore.
             self.engine_core.add_request(child_request)
 
-    def step(self) -> Union[list[RequestOutput], list[PoolingRequestOutput]]:
-
+    def step(self) -> list[RequestOutput | PoolingRequestOutput]:
         if self.should_execute_dummy_batch:
             self.should_execute_dummy_batch = False
             self.engine_core.execute_dummy_batch()
             return []
 
         # 1) Get EngineCoreOutput from the EngineCore.
-        outputs = self.engine_core.get_output()
+        with record_function_or_nullcontext("llm_engine step: get_output"):
+            outputs = self.engine_core.get_output()
 
         # 2) Process EngineCoreOutputs.
-        iteration_stats = IterationStats() if self.log_stats else None
-        processed_outputs = self.output_processor.process_outputs(
-            outputs.outputs,
-            engine_core_timestamp=outputs.timestamp,
-            iteration_stats=iteration_stats)
+        with record_function_or_nullcontext("llm_engine step: process_outputs"):
+            iteration_stats = IterationStats() if self.log_stats else None
+            processed_outputs = self.output_processor.process_outputs(
+                outputs.outputs,
+                engine_core_timestamp=outputs.timestamp,
+                iteration_stats=iteration_stats,
+            )
+            self.output_processor.update_scheduler_stats(outputs.scheduler_stats)
 
         # 3) Abort any reqs that finished due to stop strings.
-        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
+        with record_function_or_nullcontext("llm_engine step: abort_requests"):
+            self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
 
         # 4) Record stats
-        if self.logger_manager is not None:
-            assert outputs.scheduler_stats is not None
-            self.logger_manager.record(
-                scheduler_stats=outputs.scheduler_stats,
-                iteration_stats=iteration_stats,
-            )
-            self.do_log_stats_with_interval()
+        with record_function_or_nullcontext("llm_engine step: record_stats"):
+            if self.logger_manager is not None and outputs.scheduler_stats is not None:
+                self.logger_manager.record(
+                    scheduler_stats=outputs.scheduler_stats,
+                    iteration_stats=iteration_stats,
+                    mm_cache_stats=self.processor.stat_mm_cache(),
+                )
+                self.do_log_stats_with_interval()
 
         return processed_outputs.request_outputs
 
-    def get_vllm_config(self):
-        return self.vllm_config
-
-    def get_model_config(self):
-        return self.model_config
-
     def start_profile(self):
         self.engine_core.profile(True)
 
@@ -298,18 +317,24 @@ class LLMEngine:
         self.engine_core.profile(False)
 
     def reset_mm_cache(self):
-        self.processor.clear_cache()
+        self.processor.clear_mm_cache()
         self.engine_core.reset_mm_cache()
 
-    def reset_prefix_cache(self, device: Optional[Device] = None):
+    def reset_prefix_cache(self):
         self.engine_core.reset_prefix_cache()
 
     def sleep(self, level: int = 1):
         self.engine_core.sleep(level)
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(1, level)
+
+    def wake_up(self, tags: list[str] | None = None):
         self.engine_core.wake_up(tags)
 
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(0, 0)
+
     def is_sleeping(self) -> bool:
         return self.engine_core.is_sleeping()
 
@@ -317,10 +342,19 @@ class LLMEngine:
         assert self.log_stats, "Stat logging disabled"
         return get_metrics_snapshot()
 
+    @property
+    def tokenizer(self) -> AnyTokenizer | None:
+        return self.processor.tokenizer
+
+    @tokenizer.setter
+    def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
+        self.processor.tokenizer = tokenizer
+
     def get_tokenizer(self) -> AnyTokenizer:
         if self.tokenizer is None:
-            raise ValueError("Unable to get tokenizer because "
-                             "skip_tokenizer_init is True")
+            raise ValueError(
+                "Unable to get tokenizer because skip_tokenizer_init is True"
+            )
 
         return self.tokenizer
 
@@ -354,17 +388,21 @@ class LLMEngine:
         """Prevent an adapter from being evicted."""
         return self.engine_core.pin_lora(lora_id)
 
-    def collective_rpc(self,
-                       method: Union[str, Callable[[WorkerBase], _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+    def collective_rpc(
+        self,
+        method: str | Callable[[WorkerBase], _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+    ) -> list[_R]:
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        return self.collective_rpc("apply_model", args=(func, ))
+        return self.collective_rpc("apply_model", args=(func,))
 
     def __del__(self):
-        if dp_group := getattr(self, "dp_group",
-                               None) and not self.external_launcher_dp:
+        if (
+            dp_group := getattr(self, "dp_group", None)
+            and not self.external_launcher_dp
+        ):
             stateless_destroy_torch_distributed_process_group(dp_group)
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 133122b6fcc0ce932cbdef3eb2eaafc24846a20f..b618d23472651a10da753b7a56d98dc38269aebd 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -2,14 +2,20 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Optional
 
 from vllm.logger import init_logger
-from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.logprobs import (
+    PromptLogprobs,
+    SampleLogprobs,
+    append_logprobs_for_next_position,
+    create_prompt_logprobs,
+    create_sample_logprobs,
+)
 from vllm.transformers_utils.detokenizer_utils import (
-    AnyTokenizer, convert_ids_list_to_tokens)
+    AnyTokenizer,
+    convert_ids_list_to_tokens,
+)
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors
 
@@ -20,22 +26,21 @@ NONES = itertools.repeat(None)
 
 @dataclass
 class LogprobsProcessor:
-
     # Tokenizer for this request,
     # None if detokenization is disabled.
-    tokenizer: Optional[AnyTokenizer]
+    tokenizer: AnyTokenizer | None
 
     # Logprobs for this request
-    logprobs: Optional[SampleLogprobs]
-    prompt_logprobs: Optional[PromptLogprobs]
-    cumulative_logprob: Optional[float]
-    num_logprobs: Optional[int]
-    num_prompt_logprobs: Optional[int]
+    logprobs: SampleLogprobs | None
+    prompt_logprobs: PromptLogprobs | None
+    cumulative_logprob: float | None
+    num_logprobs: int | None
+    num_prompt_logprobs: int | None
 
     @classmethod
     def from_new_request(
         cls,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         request: EngineCoreRequest,
     ) -> "LogprobsProcessor":
         assert request.sampling_params is not None
@@ -43,10 +48,11 @@ class LogprobsProcessor:
         num_prompt_logprobs = request.sampling_params.prompt_logprobs
         return cls(
             tokenizer=tokenizer,
-            cumulative_logprob=(None if num_logprobs is None else 0.),
-            logprobs=(None if num_logprobs is None else []),
-            # NOTE: logprob of first prompt token is None.
-            prompt_logprobs=(None if num_prompt_logprobs is None else [None]),
+            cumulative_logprob=(None if num_logprobs is None else 0.0),
+            logprobs=(None if num_logprobs is None else create_sample_logprobs()),
+            prompt_logprobs=(
+                None if num_prompt_logprobs is None else create_prompt_logprobs()
+            ),
             num_prompt_logprobs=num_prompt_logprobs,
             num_logprobs=num_logprobs,
         )
@@ -66,28 +72,34 @@ class LogprobsProcessor:
         assert self.logprobs is not None
         assert self.cumulative_logprob is not None
 
-        token_ids_lst, logprobs_lst, ranks_lst = logprobs_lists
-
-        for rank, logprobs, token_ids in zip(ranks_lst, logprobs_lst,
-                                             token_ids_lst):
+        token_ids_lst, logprobs_lst, ranks_lst, _ = logprobs_lists
 
+        for rank_np, logprobs_np, token_ids_np in zip(
+            ranks_lst, logprobs_lst, token_ids_lst
+        ):
+            rank = rank_np.tolist()
+            logprobs = logprobs_np.tolist()
+            token_ids = token_ids_np.tolist()
             # Detokenize (non-incrementally).
-            decoded_tokens = NONES if self.tokenizer is None else (
-                convert_ids_list_to_tokens(self.tokenizer, token_ids))
+            decoded_tokens = (
+                NONES
+                if self.tokenizer is None
+                else (convert_ids_list_to_tokens(self.tokenizer, token_ids))
+            )
 
             # Sampler puts the sampled logprob in first.
             sampled_token_logprob = logprobs[0]
             self.cumulative_logprob += sampled_token_logprob
 
-            # Update with the Logprob dictionary for this pos.
-            self.logprobs.append(
-                self._make_logprob_dict(
-                    logprobs,
-                    token_ids,
-                    decoded_tokens,
-                    rank,
-                    self.num_logprobs,
-                ))
+            # Update with the Logprob container for this pos.
+            append_logprobs_for_next_position(
+                self.logprobs,
+                token_ids,
+                logprobs,
+                decoded_tokens,
+                rank,
+                self.num_logprobs,
+            )
 
     def _update_prompt_logprobs(
         self,
@@ -109,9 +121,13 @@ class LogprobsProcessor:
 
         # Detokenize non-incrementally.
         # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
-        decoded_tokens = None if self.tokenizer is None else (
-            convert_ids_list_to_tokens(self.tokenizer,
-                                       token_ids.flatten().tolist()))
+        decoded_tokens = (
+            None
+            if self.tokenizer is None
+            else (
+                convert_ids_list_to_tokens(self.tokenizer, token_ids.flatten().tolist())
+            )
+        )
 
         # Recover shapes.
         num_prompt_tokens, num_logprobs = logprobs.shape
@@ -126,17 +142,21 @@ class LogprobsProcessor:
             # Handle flattening.
             offset = pos * num_logprobs
             offset_end = offset + num_logprobs
-            decoded_tokens_for_pos = NONES \
-            if decoded_tokens is None else decoded_tokens[offset:offset_end]
+            decoded_tokens_for_pos = (
+                NONES if decoded_tokens is None else decoded_tokens[offset:offset_end]
+            )
 
-            # Update with the Logprob dictionary for this pos.
-            self.prompt_logprobs.append(
-                self._make_logprob_dict(prompt_logprobs[pos], token_ids[pos],
-                                        decoded_tokens_for_pos,
-                                        prompt_token_ranks[pos],
-                                        self.num_prompt_logprobs))
+            # Update with the Logprob container for this pos.
+            append_logprobs_for_next_position(
+                self.prompt_logprobs,
+                token_ids[pos],
+                prompt_logprobs[pos],
+                decoded_tokens_for_pos,
+                prompt_token_ranks[pos],
+                self.num_prompt_logprobs,
+            )
 
-    def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]:
+    def pop_prompt_logprobs(self) -> PromptLogprobs | None:
         """Pop and return all request prompt logprobs
 
         The logprobs processor aggregates prompt chunk logprobs
@@ -155,45 +175,6 @@ class LogprobsProcessor:
             self.prompt_logprobs = []
         return plp
 
-    @staticmethod
-    def _make_logprob_dict(
-        logprobs: list[float],
-        logprob_token_ids: list[int],
-        decoded_tokens: Iterable[Optional[str]],
-        rank: int,
-        num_logprobs: int,
-    ) -> dict[int, Logprob]:
-        """Make a Logprob dictionary for a position.
-
-        Args:
-          logprobs: list of log probabilities
-          logprob_token_ids: list of top token ids
-          decoded_tokens: list of decoded top tokens
-          rank: rank of the sampled token
-          num_logprobs: number of logprobs requested
-            by the user (in addition to sampled logprob)
-
-        Returns:
-          dict[token id, Logprob]
-        """
-        if num_logprobs == -1:
-            num_logprobs = len(logprobs)
-        # We do not need a special case for the sampled token
-        # being in the topk, since inserting duplicated data
-        # into a dictionary twice is the same as doing it once.
-        topk_ranks = range(1, num_logprobs + 1)
-        ranks = itertools.chain((rank, ), topk_ranks)
-
-        return {
-            token_id: Logprob(
-                logprob=logprob,
-                rank=rank,
-                decoded_token=token,
-            )
-            for token_id, logprob, rank, token in zip(
-                logprob_token_ids, logprobs, ranks, decoded_tokens)
-        }
-
     def update_from_output(self, output: EngineCoreOutput) -> None:
         if output.new_logprobs is not None:
             self._update_sample_logprobs(output.new_logprobs)
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 38b2d6824b47337ba872e54a279d7928dac00e92..bdbbfe2595f8122135f44e958973ebbb64fd1a45 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -4,23 +4,30 @@
 import asyncio
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Any, Optional, Union, cast
+from typing import Any, cast
 
 import torch
 
-from vllm.outputs import (CompletionOutput, PoolingOutput,
-                          PoolingRequestOutput, RequestOutput)
+from vllm.outputs import (
+    CompletionOutput,
+    PoolingOutput,
+    PoolingRequestOutput,
+    RequestOutput,
+)
 from vllm.sampling_params import RequestOutputKind
-from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
-                          extract_trace_context)
+from vllm.tracing import SpanAttributes, SpanKind, Tracer, extract_trace_context
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
 from vllm.v1.engine.detokenizer import IncrementalDetokenizer
 from vllm.v1.engine.logprobs import LogprobsProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
-from vllm.v1.metrics.stats import (IterationStats, LoRARequestStates,
-                                   RequestStateStats)
+from vllm.v1.metrics.stats import (
+    IterationStats,
+    LoRARequestStates,
+    RequestStateStats,
+    SchedulerStats,
+)
 
 
 class RequestOutputCollector:
@@ -34,22 +41,26 @@ class RequestOutputCollector:
 
     def __init__(self, output_kind: RequestOutputKind):
         self.aggregate = output_kind == RequestOutputKind.DELTA
-        self.output: Optional[Union[RequestOutput, PoolingRequestOutput,
-                                    Exception]] = None
+        self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
         self.ready = asyncio.Event()
 
-    def put(self, output: Union[RequestOutput, PoolingRequestOutput,
-                                Exception]) -> None:
+    def put(self, output: RequestOutput | PoolingRequestOutput | Exception) -> None:
         """Non-blocking put operation."""
         if self.output is None or isinstance(output, Exception):
             self.output = output
             self.ready.set()
-        elif isinstance(self.output, (RequestOutput, PoolingRequestOutput)):
+        elif isinstance(self.output, RequestOutput) and isinstance(
+            output, RequestOutput
+        ):
             # This ensures that request outputs with different request indexes
             # (if n > 1) do not override each other.
             self.output.add(output, aggregate=self.aggregate)
+        elif isinstance(self.output, PoolingRequestOutput) and isinstance(
+            output, PoolingRequestOutput
+        ):
+            self.output = output
 
-    async def get(self) -> Union[RequestOutput, PoolingRequestOutput]:
+    async def get(self) -> RequestOutput | PoolingRequestOutput:
         """Get operation blocks on put event."""
         while (output := self.output) is None:
             await self.ready.wait()
@@ -59,8 +70,7 @@ class RequestOutputCollector:
             raise output
         return output
 
-    def get_nowait(
-            self) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:
+    def get_nowait(self) -> RequestOutput | PoolingRequestOutput | None:
         """Non-blocking get operation."""
         output = self.output
         if output is not None:
@@ -73,31 +83,31 @@ class RequestOutputCollector:
 
 @dataclass
 class OutputProcessorOutput:
-    request_outputs: list[Union[RequestOutput, PoolingRequestOutput]]
+    request_outputs: list[RequestOutput | PoolingRequestOutput]
     reqs_to_abort: list[str]
 
 
 class RequestState:
-
     def __init__(
         self,
         request_id: str,
-        parent_req: Optional[ParentRequest],
+        parent_req: ParentRequest | None,
         request_index: int,
-        lora_name: Optional[str],
+        lora_name: str | None,
         output_kind: RequestOutputKind,
-        prompt: Optional[str],
-        prompt_token_ids: Optional[list[int]],
-        prompt_embeds: Optional[torch.Tensor],
-        logprobs_processor: Optional[LogprobsProcessor],
-        detokenizer: Optional[IncrementalDetokenizer],
-        max_tokens_param: Optional[int],
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_embeds: torch.Tensor | None,
+        logprobs_processor: LogprobsProcessor | None,
+        detokenizer: IncrementalDetokenizer | None,
+        max_tokens_param: int | None,
         arrival_time: float,
-        queue: Optional[RequestOutputCollector],
+        queue: RequestOutputCollector | None,
         log_stats: bool,
-        top_p: Optional[float] = None,
-        n: Optional[int] = None,
-        temperature: Optional[float] = None,
+        stream_interval: int,
+        top_p: float | None = None,
+        n: int | None = None,
+        temperature: float | None = None,
     ):
         self.request_id = request_id
         self.parent_req = parent_req
@@ -108,7 +118,8 @@ class RequestState:
         self.prompt_token_ids = prompt_token_ids
         self.prompt_embeds = prompt_embeds
         self.prompt_len = length_from_prompt_token_ids_or_embeds(
-            self.prompt_token_ids, self.prompt_embeds)
+            self.prompt_token_ids, self.prompt_embeds
+        )
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
         self.max_tokens_param = max_tokens_param
@@ -119,21 +130,24 @@ class RequestState:
         self.queue = queue
         self.num_cached_tokens = 0
 
-        self.stats = RequestStateStats(
-            arrival_time=arrival_time) if log_stats else None
+        self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None
+
+        # Stream Interval
+        self.stream_interval = stream_interval
+        self.sent_tokens_offset = 0  # Offset of sent tokens
 
     @classmethod
     def from_new_request(
         cls,
         tokenizer: AnyTokenizer,
         request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest],
+        prompt: str | None,
+        parent_req: ParentRequest | None,
         request_index: int,
-        queue: Optional[RequestOutputCollector],
+        queue: RequestOutputCollector | None,
         log_stats: bool,
+        stream_interval: int,
     ) -> "RequestState":
-
         if sampling_params := request.sampling_params:
             if not sampling_params.detokenize:
                 tokenizer = None
@@ -164,8 +178,9 @@ class RequestState:
             request_id=request.request_id,
             parent_req=parent_req,
             request_index=request_index,
-            lora_name=(request.lora_request.name
-                       if request.lora_request is not None else None),
+            lora_name=(
+                request.lora_request.name if request.lora_request is not None else None
+            ),
             output_kind=output_kind,
             prompt=prompt,
             prompt_token_ids=request.prompt_token_ids,
@@ -179,17 +194,17 @@ class RequestState:
             arrival_time=request.arrival_time,
             queue=queue,
             log_stats=log_stats,
+            stream_interval=stream_interval,
         )
 
     def make_request_output(
         self,
         new_token_ids: list[int],
-        pooling_output: Optional[torch.Tensor],
-        finish_reason: Optional[FinishReason],
-        stop_reason: Union[int, str, None],
-        kv_transfer_params: Optional[dict[str, Any]] = None,
-    ) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:
-
+        pooling_output: torch.Tensor | None,
+        finish_reason: FinishReason | None,
+        stop_reason: int | str | None,
+        kv_transfer_params: dict[str, Any] | None = None,
+    ) -> RequestOutput | PoolingRequestOutput | None:
         finished = finish_reason is not None
         final_only = self.output_kind == RequestOutputKind.FINAL_ONLY
 
@@ -197,34 +212,57 @@ class RequestState:
             # Only the final output is required in FINAL_ONLY mode.
             return None
 
+        if self.stream_interval > 1:
+            assert self.detokenizer is not None
+
+            # Send output request only when
+            # 1. It has finished, or
+            # 2. It is the first token, or
+            # 3. It has reached the stream interval number of tokens
+            if not (
+                finished
+                or self.sent_tokens_offset == 0
+                or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset
+                >= self.stream_interval
+            ):
+                return None
+
+            if self.output_kind == RequestOutputKind.DELTA:
+                # Send tokens from the offset in DELTA mode, otherwise all
+                # tokens are sent.
+                new_token_ids = self.detokenizer.output_token_ids[
+                    self.sent_tokens_offset :
+                ]
+                self.sent_tokens_offset = len(self.detokenizer.output_token_ids)
+
         request_id = self.request_id
         if pooling_output is not None:
             return self._new_request_output(
-                request_id, [self._new_pooling_output(pooling_output)],
-                finished)
+                request_id, [self._new_pooling_output(pooling_output)], finished
+            )
 
-        output = self._new_completion_output(new_token_ids, finish_reason,
-                                             stop_reason)
+        output = self._new_completion_output(new_token_ids, finish_reason, stop_reason)
 
         if self.parent_req is None:
             outputs = [output]
         else:
             request_id, outputs, finished = self.parent_req.get_outputs(
-                request_id, output)
+                request_id, output
+            )
             if not outputs:
                 return None
 
-        return self._new_request_output(request_id, outputs, finished,
-                                        kv_transfer_params)
+        return self._new_request_output(
+            request_id, outputs, finished, kv_transfer_params
+        )
 
     def _new_request_output(
         self,
         request_id: str,
-        outputs: Union[list[CompletionOutput], list[PoolingOutput]],
+        outputs: list[CompletionOutput] | list[PoolingOutput],
         finished: bool,
-        kv_transfer_params: Optional[dict[str, Any]] = None,
-    ) -> Union[RequestOutput, PoolingRequestOutput]:
-
+        kv_transfer_params: dict[str, Any] | None = None,
+    ) -> RequestOutput | PoolingRequestOutput:
         first_output = outputs[0]
         if isinstance(first_output, PoolingOutput):
             assert len(outputs) == 1
@@ -233,6 +271,7 @@ class RequestState:
             return PoolingRequestOutput(
                 request_id=request_id,
                 outputs=first_output,
+                num_cached_tokens=self.num_cached_tokens,
                 prompt_token_ids=self.prompt_token_ids,
                 finished=finished,
             )
@@ -257,15 +296,15 @@ class RequestState:
             finished=finished,
             kv_transfer_params=kv_transfer_params,
             num_cached_tokens=self.num_cached_tokens,
+            metrics=self.stats,
         )
 
     def _new_completion_output(
         self,
         token_ids: list[int],
-        finish_reason: Optional[FinishReason],
-        stop_reason: Union[int, str, None],
+        finish_reason: FinishReason | None,
+        stop_reason: int | str | None,
     ) -> CompletionOutput:
-
         assert self.detokenizer is not None
         assert self.logprobs_processor is not None
         finished = finish_reason is not None
@@ -279,7 +318,7 @@ class RequestState:
         # Prepare logprobs, based on delta mode
         logprobs = self.logprobs_processor.logprobs
         if delta and logprobs:
-            logprobs = logprobs[-len(token_ids):]
+            logprobs = logprobs[-len(token_ids) :]
 
         return CompletionOutput(
             index=self.request_index,
@@ -288,26 +327,29 @@ class RequestState:
             logprobs=logprobs,
             cumulative_logprob=self.logprobs_processor.cumulative_logprob,
             finish_reason=str(finish_reason) if finished else None,
-            stop_reason=stop_reason if finished else None)
+            stop_reason=stop_reason if finished else None,
+        )
 
     def _new_pooling_output(
         self,
         pooling_output: torch.Tensor,
     ) -> PoolingOutput:
-
         return PoolingOutput(data=pooling_output)
 
 
 class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""
 
-    def __init__(self, tokenizer: AnyTokenizer, log_stats: bool):
+    def __init__(
+        self, tokenizer: AnyTokenizer, log_stats: bool, stream_interval: int = 1
+    ):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
+        self.stream_interval = stream_interval
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
-        self.lora_states = LoRARequestStates()
-        self.tracer: Optional[Tracer] = None
+        self.lora_states = LoRARequestStates(log_stats)
+        self.tracer: Tracer | None = None
 
     def get_num_unfinished_requests(self):
         return len(self.request_states)
@@ -330,19 +372,22 @@ class OutputProcessor:
         for request_id in request_ids:
             req_state = self.request_states.pop(request_id, None)
             if req_state is not None:
-                self.lora_states.abort_request(req_state)
+                self.lora_states.request_finished(request_id, req_state.lora_name)
                 request_ids_to_abort.append(request_id)
                 # Produce final abort output.
                 if req_state.queue is not None and (
-                        request_output := req_state.make_request_output(
-                            new_token_ids=[],
-                            # Set pooling_output is not None to
-                            # correctly enter the abort pooling branch
-                            pooling_output=torch.randn(0, device="cpu")
-                            if req_state.detokenizer is None else None,
-                            finish_reason=FinishReason.ABORT,
-                            stop_reason=None,
-                            kv_transfer_params=None)):
+                    request_output := req_state.make_request_output(
+                        new_token_ids=[],
+                        # Set pooling_output is not None to
+                        # correctly enter the abort pooling branch
+                        pooling_output=torch.randn(0, device="cpu")
+                        if req_state.detokenizer is None
+                        else None,
+                        finish_reason=FinishReason.ABORT,
+                        stop_reason=None,
+                        kv_transfer_params=None,
+                    )
+                ):
                     req_state.queue.put(request_output)
             elif parent := self.parent_requests.get(request_id):
                 # Abort children prior to removing the parent.
@@ -356,32 +401,34 @@ class OutputProcessor:
     def add_request(
         self,
         request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest] = None,
+        prompt: str | None,
+        parent_req: ParentRequest | None = None,
         request_index: int = 0,
-        queue: Optional[RequestOutputCollector] = None,
+        queue: RequestOutputCollector | None = None,
     ) -> None:
         request_id = request.request_id
         if request_id in self.request_states:
             raise ValueError(f"Request id {request_id} already running.")
 
-        req_state = RequestState.from_new_request(tokenizer=self.tokenizer,
-                                                  request=request,
-                                                  prompt=prompt,
-                                                  parent_req=parent_req,
-                                                  request_index=request_index,
-                                                  queue=queue,
-                                                  log_stats=self.log_stats)
+        req_state = RequestState.from_new_request(
+            tokenizer=self.tokenizer,
+            request=request,
+            prompt=prompt,
+            parent_req=parent_req,
+            request_index=request_index,
+            queue=queue,
+            log_stats=self.log_stats,
+            stream_interval=self.stream_interval,
+        )
         self.request_states[request_id] = req_state
-        self.lora_states.add_request(req_state)
         if parent_req:
             self.parent_requests[parent_req.request_id] = parent_req
 
     def process_outputs(
         self,
         engine_core_outputs: list[EngineCoreOutput],
-        engine_core_timestamp: Optional[float] = None,
-        iteration_stats: Optional[IterationStats] = None,
+        engine_core_timestamp: float | None = None,
+        iteration_stats: IterationStats | None = None,
     ) -> OutputProcessorOutput:
         """
         Process the EngineCoreOutputs:
@@ -405,8 +452,7 @@ class OutputProcessor:
         within the loop below.
         """
 
-        request_outputs: Union[list[RequestOutput],
-                               list[PoolingRequestOutput]] = []
+        request_outputs: list[RequestOutput | PoolingRequestOutput] = []
         reqs_to_abort: list[str] = []
         for engine_core_output in engine_core_outputs:
             req_id = engine_core_output.request_id
@@ -416,9 +462,9 @@ class OutputProcessor:
                 continue
 
             # 1) Compute stats for this iteration.
-            self._update_stats_from_output(req_state, engine_core_output,
-                                           engine_core_timestamp,
-                                           iteration_stats)
+            self._update_stats_from_output(
+                req_state, engine_core_output, engine_core_timestamp, iteration_stats
+            )
 
             new_token_ids = engine_core_output.new_token_ids
             pooling_output = engine_core_output.pooling_output
@@ -433,20 +479,24 @@ class OutputProcessor:
                 assert req_state.logprobs_processor is not None
                 # 2) Detokenize the token ids into text and perform stop checks.
                 stop_string = req_state.detokenizer.update(
-                    new_token_ids, finish_reason == FinishReason.STOP)
+                    new_token_ids, finish_reason == FinishReason.STOP
+                )
                 if stop_string:
                     finish_reason = FinishReason.STOP
                     stop_reason = stop_string
 
                 # 3) Compute sample and prompt logprobs for request,
                 # if required.
-                req_state.logprobs_processor.update_from_output(
-                    engine_core_output)
+                req_state.logprobs_processor.update_from_output(engine_core_output)
 
             # 4) Create and handle RequestOutput objects.
             if request_output := req_state.make_request_output(
-                    new_token_ids, pooling_output, finish_reason, stop_reason,
-                    kv_transfer_params):
+                new_token_ids,
+                pooling_output,
+                finish_reason,
+                stop_reason,
+                kv_transfer_params,
+            ):
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
                     req_state.queue.put(request_output)
@@ -467,21 +517,26 @@ class OutputProcessor:
                     reqs_to_abort.append(req_id)
 
                 # Track per-request stats
-                self._update_stats_from_finished(req_state, finish_reason,
-                                                 iteration_stats)
+                self._update_stats_from_finished(
+                    req_state, finish_reason, iteration_stats
+                )
                 if self.tracer:
-                    self.do_tracing(engine_core_output, req_state,
-                                    iteration_stats)
-        self.lora_states.update_iteration_stats(iteration_stats)
+                    self.do_tracing(engine_core_output, req_state, iteration_stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
         )
 
-    def do_tracing(self, engine_core_output: EngineCoreOutput,
-                   req_state: RequestState,
-                   iteration_stats: Optional[IterationStats]) -> None:
+    def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
+        self.lora_states.update_scheduler_stats(scheduler_stats)
+
+    def do_tracing(
+        self,
+        engine_core_output: EngineCoreOutput,
+        req_state: RequestState,
+        iteration_stats: IterationStats | None,
+    ) -> None:
         assert req_state.stats is not None
         assert iteration_stats is not None
         assert self.tracer is not None
@@ -489,75 +544,84 @@ class OutputProcessor:
         arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
         trace_context = extract_trace_context(engine_core_output.trace_headers)
         prompt_length = length_from_prompt_token_ids_or_embeds(
-            req_state.prompt_token_ids, req_state.prompt_embeds)
-        with (self.tracer.start_as_current_span(
-                "llm_request",
-                kind=SpanKind.SERVER,
-                context=trace_context,
-                start_time=arrival_time_nano_seconds) as span):
+            req_state.prompt_token_ids, req_state.prompt_embeds
+        )
+        with self.tracer.start_as_current_span(
+            "llm_request",
+            kind=SpanKind.SERVER,
+            context=trace_context,
+            start_time=arrival_time_nano_seconds,
+        ) as span:
             metrics = req_state.stats
-            e2e_time = iteration_stats.iteration_timestamp - \
-                       metrics.arrival_time
+            e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
             queued_time = metrics.scheduled_ts - metrics.queued_ts
             prefill_time = metrics.first_token_ts - metrics.scheduled_ts
             decode_time = metrics.last_token_ts - metrics.first_token_ts
             inference_time = metrics.last_token_ts - metrics.scheduled_ts
             span.set_attribute(
                 SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
-                metrics.first_token_latency)
+                metrics.first_token_latency,
+            )
             span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
-                               queued_time)
-            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
-                               prompt_length)
-            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
-                               metrics.num_generation_tokens)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, prompt_length)
             span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
-                prefill_time)
+                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                metrics.num_generation_tokens,
+            )
             span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
-                decode_time)
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time
+            )
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time
+            )
             span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
-                inference_time)
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time
+            )
 
             # meta
-            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
-                               req_state.request_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
             if req_state.top_p:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
-                                   req_state.top_p)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
             if req_state.max_tokens_param:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
-                                   req_state.max_tokens_param)
+                span.set_attribute(
+                    SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, req_state.max_tokens_param
+                )
             if req_state.temperature:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
-                                   req_state.temperature)
+                span.set_attribute(
+                    SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, req_state.temperature
+                )
             if req_state.n:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
-                                   req_state.n)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n)
 
-    def _update_stats_from_output(self, req_state: RequestState,
-                                  engine_core_output: EngineCoreOutput,
-                                  engine_core_timestamp: Optional[float],
-                                  iteration_stats: Optional[IterationStats]):
+    def _update_stats_from_output(
+        self,
+        req_state: RequestState,
+        engine_core_output: EngineCoreOutput,
+        engine_core_timestamp: float | None,
+        iteration_stats: IterationStats | None,
+    ):
         if iteration_stats is None:
             return
 
-        lora_stats = self.lora_states.get_stats(req_state)
-
         assert engine_core_timestamp is not None
         assert req_state.stats is not None
-        iteration_stats.update_from_output(engine_core_output,
-                                           engine_core_timestamp,
-                                           req_state.is_prefilling,
-                                           req_state.prompt_len,
-                                           req_state.stats, lora_stats)
-
-    def _update_stats_from_finished(self, req_state: RequestState,
-                                    finish_reason: Optional[FinishReason],
-                                    iteration_stats: Optional[IterationStats]):
+        iteration_stats.update_from_output(
+            engine_core_output,
+            engine_core_timestamp,
+            req_state.is_prefilling,
+            req_state.prompt_len,
+            req_state.stats,
+            self.lora_states,
+            req_state.lora_name,
+        )
+
+    def _update_stats_from_finished(
+        self,
+        req_state: RequestState,
+        finish_reason: FinishReason | None,
+        iteration_stats: IterationStats | None,
+    ):
         if iteration_stats is None:
             return
 
@@ -566,11 +630,13 @@ class OutputProcessor:
         iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
             num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
-                req_state.prompt_token_ids, req_state.prompt_embeds),
+                req_state.prompt_token_ids, req_state.prompt_embeds
+            ),
             max_tokens_param=req_state.max_tokens_param,
-            req_stats=req_state.stats)
-        self.lora_states.finish_request(req_state)
+            req_stats=req_state.stats,
+        )
+        self.lora_states.request_finished(req_state.request_id, req_state.lora_name)
 
         ParentRequest.observe_finished_request(
-            req_state.parent_req, iteration_stats,
-            req_state.stats.num_generation_tokens)
+            req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
+        )
diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py
index 1e9911152c6df347aed12787764621e87b4b2beb..59aacd1963076ae2920ba417ad91121544610008 100644
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import copy
-from typing import Optional
+from typing import Optional, cast
 
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
@@ -29,17 +29,18 @@ class ParentRequest:
     max_num_generation_tokens: int
 
     # To efficiently obtain child sampling params
-    cached_child_sampling_params: Optional[SamplingParams]
+    cached_child_sampling_params: SamplingParams | None
 
-    def __init__(self, request_id: str,
-                 sampling_params: SamplingParams) -> None:
+    def __init__(self, request_id: str, sampling_params: SamplingParams) -> None:
         self.request_id = request_id
         self.sampling_params = sampling_params
 
         self.child_requests = set()
-        self.output_aggregator = [None] * sampling_params.n if (
-            sampling_params.output_kind
-            == RequestOutputKind.FINAL_ONLY) else []
+        self.output_aggregator = (
+            [cast(CompletionOutput, None)] * sampling_params.n
+            if (sampling_params.output_kind == RequestOutputKind.FINAL_ONLY)
+            else []
+        )
         self.max_num_generation_tokens = 0
         self.cached_child_sampling_params = None
 
@@ -49,7 +50,7 @@ class ParentRequest:
     ) -> SamplingParams:
         """Efficiently obtain child `sampling_params`
 
-        If `sampling_params.seed` is not `None` then 
+        If `sampling_params.seed` is not `None` then
         each child request requires a unique clone of
         parent `sampling_params` with a unique seed.
 
@@ -76,10 +77,10 @@ class ParentRequest:
 
     def get_child_info(self, index: int) -> tuple[str, SamplingParams]:
         """Get child request ID and sampling params.
-        
+
         Args:
           index: index within `n` child requests.
-        
+
         Returns:
           (request ID, sampling_params) tuple
         """
@@ -96,12 +97,21 @@ class ParentRequest:
         child_request_id: str,
         completion_output: CompletionOutput,
     ) -> tuple[str, list[CompletionOutput], bool]:
+        already_finished_and_returned: bool = False
         if completion_output.finished():
-            self.child_requests.remove(child_request_id)
+            if child_request_id in self.child_requests:
+                self.child_requests.remove(child_request_id)
+            else:
+                # child request ID is not available in child_requests
+                # which means the request had finished in previous
+                # batch step and returned to the client earlier
+                already_finished_and_returned = True
 
         if self.sampling_params.output_kind != RequestOutputKind.FINAL_ONLY:
-            # If streaming, just return the current output.
-            outputs = [completion_output]
+            # If streaming, just return the current output
+            #
+            # DO NOT output finished and already returned child request to client again
+            outputs = [] if already_finished_and_returned else [completion_output]
         else:
             # If not streaming, aggregate the n final outputs.
             self.output_aggregator[completion_output.index] = completion_output
@@ -111,23 +121,25 @@ class ParentRequest:
         return self.request_id, outputs, finished
 
     def observe_num_generation_tokens(self, num_generation_tokens: int):
-        self.max_num_generation_tokens = max(num_generation_tokens,
-                                             self.max_num_generation_tokens)
+        self.max_num_generation_tokens = max(
+            num_generation_tokens, self.max_num_generation_tokens
+        )
         return self.max_num_generation_tokens
 
     @staticmethod
-    def observe_finished_request(parent_req: Optional['ParentRequest'],
-                                 iteration_stats: IterationStats,
-                                 num_generation_tokens: int):
-
+    def observe_finished_request(
+        parent_req: Optional["ParentRequest"],
+        iteration_stats: IterationStats,
+        num_generation_tokens: int,
+    ):
         n_param = parent_req.n if parent_req is not None else 1
 
         if parent_req is not None:
             num_generation_tokens = parent_req.observe_num_generation_tokens(
-                num_generation_tokens)
+                num_generation_tokens
+            )
 
         # Child requests finished, we can now record to iteration stats
         if parent_req is None or not parent_req.child_requests:
-            iteration_stats.max_num_generation_tokens_iter.append(
-                num_generation_tokens)
+            iteration_stats.max_num_generation_tokens_iter.append(num_generation_tokens)
             iteration_stats.n_params_iter.append(n_param)
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 843ca9ad68e38209b927b1454754b005de388652..4cb911d8e22b74cd4d0808add245ecdfd1d200a4 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -3,7 +3,7 @@
 
 import time
 from collections.abc import Mapping
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, cast
 
 from vllm.config import VllmConfig
 from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
@@ -14,6 +14,7 @@ from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import processor_cache_from_config
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.multimodal.utils import argsort_mm_positions
 from vllm.pooling_params import PoolingParams
@@ -21,48 +22,52 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.structured_output.backend_guidance import (
-    validate_guidance_grammar)
+from vllm.v1.metrics.stats import MultiModalCacheStats
+from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar
 from vllm.v1.structured_output.backend_lm_format_enforcer import (
-    validate_structured_output_request_lm_format_enforcer)
+    validate_structured_output_request_lm_format_enforcer,
+)
 from vllm.v1.structured_output.backend_outlines import (
-    validate_structured_output_request_outlines)
-from vllm.v1.structured_output.backend_xgrammar import (
-    validate_xgrammar_grammar)
+    validate_structured_output_request_outlines,
+)
+from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar
 
 logger = init_logger(__name__)
 
 
 class Processor:
-
     def __init__(
         self,
         vllm_config: VllmConfig,
-        tokenizer: AnyTokenizer,
+        tokenizer: AnyTokenizer | None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
-    ):
-
+    ) -> None:
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
         self.structured_outputs_config = vllm_config.structured_outputs_config
-        self.tokenizer = tokenizer
 
-        self.generation_config_fields = (
-            self.model_config.try_get_generation_config())
+        self.generation_config_fields = self.model_config.try_get_generation_config()
 
         self.mm_registry = mm_registry
-        self.mm_processor_cache = processor_cache_from_config(
-            vllm_config, mm_registry)
+        self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
 
         self.input_preprocessor = InputPreprocessor(
             self.model_config,
-            self.tokenizer,
+            tokenizer,
             mm_registry,
             mm_processor_cache=self.mm_processor_cache,
         )
 
+    @property
+    def tokenizer(self) -> AnyTokenizer | None:
+        return self.input_preprocessor.tokenizer
+
+    @tokenizer.setter
+    def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
+        self.input_preprocessor.tokenizer = tokenizer
+
     def _validate_logprobs(
         self,
         params: SamplingParams,
@@ -79,7 +84,8 @@ class Processor:
             if num_logprobs > max_logprobs:
                 raise ValueError(
                     f"Requested sample logprobs of {num_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}")
+                    f"which is greater than max allowed: {max_logprobs}"
+                )
 
         # Validate prompt logprobs.
         if params.prompt_logprobs:
@@ -89,7 +95,8 @@ class Processor:
             if num_prompt_logprobs > max_logprobs:
                 raise ValueError(
                     f"Requested prompt logprobs of {num_prompt_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}")
+                    f"which is greater than max allowed: {max_logprobs}"
+                )
 
     def _validate_sampling_params(
         self,
@@ -108,8 +115,7 @@ class Processor:
             return
         vocab_size = len(self.tokenizer)
         if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
-            raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id!")
+            raise ValueError("allowed_token_ids contains out-of-vocab token id!")
 
     def _validate_logit_bias(
         self,
@@ -129,7 +135,8 @@ class Processor:
         if invalid_token_ids:
             raise ValueError(
                 f"token_id(s) {invalid_token_ids} in logit_bias contain "
-                f"out-of-vocab token ids. Vocabulary size: {vocab_size}")
+                f"out-of-vocab token ids. Vocabulary size: {vocab_size}"
+            )
 
     def _validate_supported_sampling_params(
         self,
@@ -140,12 +147,30 @@ class Processor:
             raise ValueError("vLLM V1 does not yet support best_of.")
         # Logits processors not supported.
         if params.logits_processors:
-            raise ValueError("vLLM V1 does not support per request "
-                             "user provided logits processors.")
+            raise ValueError(
+                "vLLM V1 does not support per request user provided logits processors."
+            )
+        # Async scheduling + spec decode currently incompatible with some
+        # sampling parameters.
+        if (
+            self.vllm_config.speculative_config is not None
+            and self.vllm_config.scheduler_config.async_scheduling
+            and (
+                params.frequency_penalty != 0.0
+                or params.presence_penalty != 0.0
+                or params.repetition_penalty != 1.0
+                or params.bad_words_token_ids
+                or params.structured_outputs
+            )
+        ):
+            raise ValueError(
+                "async scheduling with spec decoding doesn't yet support "
+                "penalties, bad words or structured outputs in sampling parameters."
+            )
 
     def _validate_params(
         self,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams | PoolingParams,
     ):
         """
         Validate supported SamplingParam.
@@ -167,7 +192,7 @@ class Processor:
         auto-hashed downstream.
         """
 
-        def _validate_single_prompt(single_prompt: Union[dict, str]) -> None:
+        def _validate_single_prompt(single_prompt: dict | str) -> None:
             if not isinstance(single_prompt, dict):
                 return
             mm_data = single_prompt.get("multi_modal_data")
@@ -178,38 +203,44 @@ class Processor:
             for modality, items in mm_data.items():
                 if modality in mm_uuids:
                     data_len = len(items) if isinstance(items, list) else 1
-                    uuid_len = len(mm_uuids[modality]) if isinstance(
-                        mm_uuids[modality], list) else 1
+                    uuid_len = (
+                        len(mm_uuids[modality])
+                        if isinstance(mm_uuids[modality], list)
+                        else 1
+                    )
                     if uuid_len != data_len:
                         raise ValueError(
                             f"multi_modal_uuids for modality '{modality}' "
                             "must have same length as data: got "
                             f"{uuid_len} uuids vs "
-                            f"{data_len} items.")
+                            f"{data_len} items."
+                        )
                 else:
                     raise ValueError(
                         f"multi_modal_uuids for modality '{modality}' must "
-                        "be provided if multi_modal_data is provided.")
+                        "be provided if multi_modal_data is provided."
+                    )
 
         # Handle explicit encoder/decoder prompts or singleton prompt
         if isinstance(prompt, dict) and "encoder_prompt" in prompt:
             enc = prompt.get("encoder_prompt")
             dec = prompt.get("decoder_prompt")
             if enc is not None:
-                _validate_single_prompt(enc)
+                _validate_single_prompt(cast(dict | str, enc))
             if dec is not None:
-                _validate_single_prompt(dec)
+                _validate_single_prompt(cast(dict | str, dec))
         else:
             _validate_single_prompt(prompt)  # type: ignore[arg-type]
 
-    def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
+    def _validate_lora(self, lora_request: LoRARequest | None) -> None:
         if lora_request is None:
             return
 
         # LoRA request passed in while LoRA is not enabled
         if not self.lora_config:
-            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
-                             "not enabled!")
+            raise ValueError(
+                f"Got lora_request {lora_request} but LoRA is not enabled!"
+            )
 
         if self.tokenizer is not None:
             logger.warning_once(
@@ -217,7 +248,8 @@ class Processor:
                 "tokenizers for different LoRAs. By default, vLLM uses base "
                 "model's tokenizer. If you are using a LoRA "
                 "with its own tokenizer, consider specifying `--tokenizer "
-                "[lora_path]` to use the LoRA tokenizer.")
+                "[lora_path]` to use the LoRA tokenizer."
+            )
 
     def _validate_structured_output(self, params: SamplingParams) -> None:
         if not params.structured_outputs or not self.structured_outputs_config:
@@ -235,24 +267,33 @@ class Processor:
             # to a specific backend based on `auto` behavior in a previous
             # request. We remember that it was set as a result of `auto`
             # using the `_backend_was_auto` field set in the params.
-            if (backend != _backend
-                    and not (backend == "auto"
-                             and params.structured_outputs._backend_was_auto)):
+            if backend != _backend and not (
+                backend == "auto" and params.structured_outputs._backend_was_auto
+            ):
                 raise ValueError(
                     "Request-level structured output backend selection is not "
                     f"supported. The request specified '{_backend}', but vLLM "
                     f"was initialised with '{backend}'. This error can be "
-                    "resolved by removing '_backend' from the request.")
+                    "resolved by removing '_backend' from the request."
+                )
         else:
             params.structured_outputs._backend = backend
 
         # Request content validation
-        if (isinstance(params.structured_outputs.choice, list)
-                and not params.structured_outputs.choice):
+        if (
+            isinstance(params.structured_outputs.choice, list)
+            and not params.structured_outputs.choice
+        ):
             # It is invalid for choice to be an empty list
             raise ValueError(
                 f"Choice '{params.structured_outputs.choice}' cannot be an empty list"  # noqa: E501
             )
+        # Reject empty string grammar early to avoid engine-side crashes
+        if (
+            isinstance(params.structured_outputs.grammar, str)
+            and params.structured_outputs.grammar.strip() == ""
+        ):
+            raise ValueError("structured_outputs.grammar cannot be an empty string")
 
         if backend.startswith("xgrammar"):
             # xgrammar with no fallback
@@ -292,7 +333,7 @@ class Processor:
         self,
         request_id: str,
         prompt: PromptType,
-    ) -> Optional[MultiModalUUIDDict]:
+    ) -> MultiModalUUIDDict | None:
         """Build per-item multimodal hash overrides when enabled. In this case,
         multimodal data items are identified by their request id, modality and
         index rather than their content.
@@ -315,36 +356,40 @@ class Processor:
         if not mm_data:
             return None
 
-        mm_uuids: MultiModalUUIDDict = {}
+        mm_uuids: dict[str, list[str | None] | str] = {}
         for modality, data in mm_data.items():
-            n = len(data) if isinstance(data, list) else 1
-            mm_uuids[modality] = [
-                f"{request_id}-{modality}-{i}" for i in range(n)
-            ]
+            # Hash each item for embedding inputs.
+            n = (
+                len(data)
+                if isinstance(data, list) or MultiModalDataParser.is_embeddings(data)
+                else 1
+            )
+            mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)]
         return mm_uuids
 
     def process_inputs(
         self,
         request_id: str,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        params: SamplingParams | PoolingParams,
+        arrival_time: float | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> tuple[Optional[str], EngineCoreRequest]:
-
-        # TODO(woosuk): Support pooling models.
+        data_parallel_rank: int | None = None,
+    ) -> EngineCoreRequest:
         self._validate_lora(lora_request)
         self._validate_params(params)
 
         data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
-        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
-                                                   data_parallel_size):
-            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
-                             f"is out of range [0, {data_parallel_size}).")
+        if data_parallel_rank is not None and not (
+            0 <= data_parallel_rank < data_parallel_size
+        ):
+            raise ValueError(
+                f"data_parallel_rank {data_parallel_rank} "
+                f"is out of range [0, {data_parallel_size})."
+            )
 
         if arrival_time is None:
             arrival_time = time.time()
@@ -357,16 +402,20 @@ class Processor:
         # reused across requests, therefore identifying multimodal data items
         # by their content is no longer necessary, and we create uuids with
         # request id-modality-index as multimodal hash overrides.
-        if (self.model_config.multimodal_config and
-                self.model_config.multimodal_config.mm_processor_cache_gb == 0
-                and not self.cache_config.enable_prefix_caching):
+        if (
+            self.model_config.multimodal_config
+            and self.model_config.multimodal_config.mm_processor_cache_gb == 0
+            and not self.cache_config.enable_prefix_caching
+        ):
             mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
         else:
             # Otherwise, use user-provided uuids as multimodal hash overrides
             # if provided.
             self._validate_multi_modal_uuids(prompt)
             if isinstance(prompt, dict):
-                mm_uuids = prompt.get("multi_modal_uuids")
+                mm_uuids = cast(
+                    MultiModalUUIDDict | None, prompt.get("multi_modal_uuids")
+                )
             else:
                 mm_uuids = None
 
@@ -380,6 +429,7 @@ class Processor:
             mm_uuids=mm_uuids,
         )
         from vllm.platforms import current_platform
+
         current_platform.validate_request(
             prompt=prompt,
             params=params,
@@ -391,16 +441,13 @@ class Processor:
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
         self._validate_model_inputs(encoder_inputs, decoder_inputs)
 
-        # Mypy does not always properly infer the types of some elements of
-        # discriminated unions of TypedDicts, because of how it handles
-        # inheritance of TypedDict. If we explicitly extract the items we want
-        # we can avoid type errors from using `dict.get` later in the method.
-        prompt_str: Optional[str] = None if decoder_inputs[
-            "type"] == "embeds" else decoder_inputs.get("prompt")
-        prompt_token_ids = decoder_inputs[
-            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
-        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
-            "type"] == "embeds" else None
+        # Mypy can be conservative for TypedDict unions; normalize access.
+        if decoder_inputs["type"] == "embeds":
+            prompt_token_ids = None
+            prompt_embeds = decoder_inputs["prompt_embeds"]
+        else:
+            prompt_token_ids = decoder_inputs["prompt_token_ids"]
+            prompt_embeds = None
 
         sampling_params = None
         pooling_params = None
@@ -410,18 +457,19 @@ class Processor:
             # If unset max tokens, then generate up to the max_model_len.
             if sampling_params.max_tokens is None:
                 seq_len = length_from_prompt_token_ids_or_embeds(
-                    prompt_token_ids, prompt_embeds)
-                sampling_params.max_tokens = \
-                    self.model_config.max_model_len - seq_len
+                    prompt_token_ids, prompt_embeds
+                )
+                sampling_params.max_tokens = self.model_config.max_model_len - seq_len
             sampling_params.update_from_generation_config(
-                self.generation_config_fields, eos_token_id)
+                self.generation_config_fields, eos_token_id
+            )
             if self.tokenizer is not None:
                 sampling_params.update_from_tokenizer(self.tokenizer)
         else:
             pooling_params = params.clone()
 
         # Multimodal related.
-        mm_features: Optional[list[MultiModalFeatureSpec]] = None
+        mm_features: list[MultiModalFeatureSpec] | None = None
 
         if decoder_inputs["type"] == "multimodal":
             decoder_mm_inputs = decoder_inputs["mm_kwargs"]
@@ -440,9 +488,11 @@ class Processor:
                         data=decoder_mm_inputs[modality][idx],
                         modality=modality,
                         identifier=decoder_mm_hashes[modality][idx],
-                        mm_position=decoder_mm_positions[modality][idx]))
+                        mm_position=decoder_mm_positions[modality][idx],
+                    )
+                )
 
-        return prompt_str, EngineCoreRequest(
+        return EngineCoreRequest(
             request_id=request_id,
             prompt_token_ids=prompt_token_ids,
             prompt_embeds=prompt_embeds,
@@ -458,8 +508,9 @@ class Processor:
             trace_headers=trace_headers,
         )
 
-    def _validate_model_inputs(self, encoder_inputs: Optional[SingletonInputs],
-                               decoder_inputs: SingletonInputs):
+    def _validate_model_inputs(
+        self, encoder_inputs: SingletonInputs | None, decoder_inputs: SingletonInputs
+    ):
         if encoder_inputs is not None:
             self._validate_model_input(encoder_inputs, prompt_type="encoder")
 
@@ -473,12 +524,17 @@ class Processor:
     ):
         model_config = self.model_config
 
-        prompt_ids = None if prompt_inputs[
-            "type"] == "embeds" else prompt_inputs["prompt_token_ids"]
-        prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[
-            "type"] == "embeds" else None
-        prompt_len = length_from_prompt_token_ids_or_embeds(
-            prompt_ids, prompt_embeds)
+        prompt_ids = (
+            None
+            if prompt_inputs["type"] == "embeds"
+            else prompt_inputs["prompt_token_ids"]
+        )
+        prompt_embeds = (
+            prompt_inputs["prompt_embeds"]
+            if prompt_inputs["type"] == "embeds"
+            else None
+        )
+        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
         if not prompt_ids:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 pass  # Mllama may have empty encoder inputs for text-only data
@@ -487,10 +543,8 @@ class Processor:
             else:
                 raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        if self.model_config.skip_tokenizer_init:
-            tokenizer = None
-        else:
-            tokenizer = self.tokenizer
+        tokenizer = self.tokenizer
+        if tokenizer is not None:
             max_input_id = max(prompt_ids or [], default=0)
 
             # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
@@ -503,10 +557,10 @@ class Processor:
 
             # Here we take the max of the two to determine if a token id is
             # truly out-of-vocabulary.
-            if max_input_id > max(tokenizer.max_token_id,
-                                  self.model_config.get_vocab_size() - 1):
-                raise ValueError(
-                    f"Token id {max_input_id} is out of vocabulary")
+            if max_input_id > max(
+                tokenizer.max_token_id, self.model_config.get_vocab_size() - 1
+            ):
+                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if prompt_len > max_prompt_len:
@@ -526,20 +580,42 @@ class Processor:
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
                     "inputs, the number of image tokens depends on the number "
-                    "of images, and possibly their aspect ratios as well.")
+                    "of images, and possibly their aspect ratios as well."
+                )
             else:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
-                    "number of text tokens.")
+                    "number of text tokens."
+                )
 
             raise ValueError(
                 f"The {prompt_type} prompt (length {prompt_len}) is "
                 f"longer than the maximum model length of {max_prompt_len}. "
-                f"{suggestion}")
+                f"{suggestion}"
+            )
 
             # TODO: Find out how many placeholder tokens are there so we can
             # check that chunked prefill does not truncate them
             # max_batch_len = self.scheduler_config.max_num_batched_tokens
 
-    def clear_cache(self) -> None:
-        self.input_preprocessor.clear_cache()
+        if (
+            prompt_len == max_prompt_len
+            and prompt_type == "decoder"
+            and not model_config.is_multimodal_model
+            and self.model_config.runner_type != "pooling"
+        ):
+            suggestion = (
+                "Make sure that `max_model_len` is no smaller than the "
+                "number of text tokens (prompt + requested output tokens)."
+            )
+            raise ValueError(
+                f"The {prompt_type} prompt (length {prompt_len}) plus the number of "
+                f"requested output tokens (at least 1) is longer than the maximum "
+                f"model length of {max_prompt_len}. {suggestion}"
+            )
+
+    def stat_mm_cache(self) -> MultiModalCacheStats | None:
+        return self.input_preprocessor.stat_mm_cache()
+
+    def clear_mm_cache(self) -> None:
+        self.input_preprocessor.clear_mm_cache()
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 18ef25ceb6f5e1c5a9b85553b2d0c89b618c2ee4..d65cad7af03d6cc3ab2cf0afe1ff7b90228850ac 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -4,24 +4,26 @@
 import contextlib
 import os
 import weakref
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from dataclasses import dataclass
 from enum import Enum, auto
 from multiprocessing import Process, connection
 from multiprocessing.process import BaseProcess
-from typing import TYPE_CHECKING, Callable, Optional, Union
+from typing import TYPE_CHECKING
 from unittest.mock import patch
 
 import msgspec
 import zmq
 
+from vllm import envs
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
-from vllm.utils import get_mp_context, get_open_zmq_ipc_path, zmq_socket_ctx
+from vllm.utils.network_utils import get_open_zmq_ipc_path, zmq_socket_ctx
+from vllm.utils.system_utils import get_mp_context
 from vllm.v1.engine.coordinator import DPCoordinator
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor import Executor
 from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
 
 if TYPE_CHECKING:
@@ -55,13 +57,13 @@ class EngineZmqAddresses:
     # ZMQ output socket addresses for each front-end client (responses)
     outputs: list[str]
     # ZMQ input socket address of DP coordinator if applicable
-    coordinator_input: Optional[str] = None
+    coordinator_input: str | None = None
     # ZMQ output socket address of DP coordinator if applicable
-    coordinator_output: Optional[str] = None
+    coordinator_output: str | None = None
     # ZMQ socket for front-end to connect to DP coordinator.
     # Not used by engine, just relayed to front-end in handshake response.
     # Only required for external DP LB case.
-    frontend_stats_publish_address: Optional[str] = None
+    frontend_stats_publish_address: str | None = None
 
 
 @dataclass
@@ -70,8 +72,10 @@ class EngineHandshakeMetadata:
     including addresses of the front-end ZMQ queues that they should
     connect to.
     """
+
     addresses: EngineZmqAddresses
-    parallel_config: dict[str, Union[int, str, list[int]]]
+    parallel_config: dict[str, int | str | list[int]]
+    parallel_config_hash: str | None = None
 
 
 class CoreEngineProcManager:
@@ -91,7 +95,7 @@ class CoreEngineProcManager:
         handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
-        client_handshake_address: Optional[str] = None,
+        client_handshake_address: str | None = None,
     ):
         context = get_mp_context()
         common_kwargs = {
@@ -103,8 +107,7 @@ class CoreEngineProcManager:
         }
 
         if client_handshake_address:
-            common_kwargs[
-                "client_handshake_address"] = client_handshake_address
+            common_kwargs["client_handshake_address"] = client_handshake_address
 
         self.processes: list[BaseProcess] = []
         local_dp_ranks = []
@@ -115,21 +118,36 @@ class CoreEngineProcManager:
             # Start EngineCore in background process.
             local_dp_ranks.append(local_index)
             self.processes.append(
-                context.Process(target=target_fn,
-                                name=f"EngineCore_DP{global_index}",
-                                kwargs=common_kwargs | {
-                                    "dp_rank": global_index,
-                                    "local_dp_rank": local_index,
-                                }))
+                context.Process(
+                    target=target_fn,
+                    name=f"EngineCore_DP{global_index}",
+                    kwargs=common_kwargs
+                    | {
+                        "dp_rank": global_index,
+                        "local_dp_rank": local_index,
+                    },
+                )
+            )
 
         self._finalizer = weakref.finalize(self, shutdown, self.processes)
 
         data_parallel = vllm_config.parallel_config.data_parallel_size > 1
         try:
             for proc, local_dp_rank in zip(self.processes, local_dp_ranks):
-                with set_device_control_env_var(
-                        vllm_config, local_dp_rank) if (
-                            data_parallel) else contextlib.nullcontext():
+                # Adjust device control in DP for non-CUDA platforms
+                # as well as external and ray launchers
+                # For CUDA platforms, we use torch.cuda.set_device()
+                with (
+                    set_device_control_env_var(vllm_config, local_dp_rank)
+                    if (
+                        data_parallel
+                        and (
+                            not current_platform.is_cuda_alike()
+                            or vllm_config.parallel_config.use_ray
+                        )
+                    )
+                    else contextlib.nullcontext()
+                ):
                     proc.start()
         finally:
             # Kill other procs if not all are running.
@@ -151,27 +169,34 @@ class CoreEngineProcManager:
         """Returns dict of proc name -> exit code for any finished procs."""
         return {
             proc.name: proc.exitcode
-            for proc in self.processes if proc.exitcode is not None
+            for proc in self.processes
+            if proc.exitcode is not None
         }
 
 
 @contextlib.contextmanager
-def set_device_control_env_var(vllm_config: VllmConfig,
-                               local_dp_rank: int) -> Iterator[None]:
+def set_device_control_env_var(
+    vllm_config: VllmConfig, local_dp_rank: int
+) -> Iterator[None]:
     """
     Temporarily set CUDA_VISIBLE_DEVICES or equivalent
     for engine subprocess.
     """
     world_size = vllm_config.parallel_config.world_size
+    local_world_size = vllm_config.parallel_config.local_world_size
     evar = current_platform.device_control_env_var
 
-    value = get_device_indices(evar, local_dp_rank, world_size)
-    with patch.dict(os.environ, values=((evar, value), )):
+    value = get_device_indices(evar, local_dp_rank, world_size, local_world_size)
+    with patch.dict(os.environ, values=((evar, value),)):
         yield
 
 
-def get_device_indices(device_control_env_var: str, local_dp_rank: int,
-                       world_size: int):
+def get_device_indices(
+    device_control_env_var: str,
+    local_dp_rank: int,
+    world_size: int,
+    local_world_size: int | None = None,
+):
     """
     Returns a comma-separated string of device indices for the specified
     data parallel rank.
@@ -179,17 +204,24 @@ def get_device_indices(device_control_env_var: str, local_dp_rank: int,
     For example, if world_size=2 and local_dp_rank=1, and there are 4 devices,
     this will select devices 2 and 3 for local_dp_rank=1.
     """
+    if local_world_size is None:
+        local_world_size = world_size
     try:
         value = ",".join(
             str(current_platform.device_id_to_physical_device_id(i))
-            for i in range(local_dp_rank * world_size, (local_dp_rank + 1) *
-                           world_size))
+            for i in range(
+                local_dp_rank * world_size,
+                local_dp_rank * world_size + local_world_size,
+            )
+        )
     except IndexError as e:
-        raise Exception(f"Error setting {device_control_env_var}: "
-                        f"local range: [{local_dp_rank * world_size}, "
-                        f"{(local_dp_rank + 1) * world_size}) "
-                        "base value: "
-                        f"\"{os.getenv(device_control_env_var)}\"") from e
+        raise Exception(
+            f"Error setting {device_control_env_var}: "
+            f"local range: [{local_dp_rank * world_size}, "
+            f"{(local_dp_rank + 1) * world_size}) "
+            "base value: "
+            f'"{os.getenv(device_control_env_var)}"'
+        ) from e
     return value
 
 
@@ -208,15 +240,14 @@ class CoreEngineActorManager:
         addresses: EngineZmqAddresses,
         executor_class: type[Executor],
         log_stats: bool,
-        placement_groups: Optional[list["PlacementGroup"]] = None,
-        local_dp_ranks: Optional[list[int]] = None,
+        placement_groups: list["PlacementGroup"] | None = None,
+        local_dp_ranks: list[int] | None = None,
     ):
         import copy
 
         import ray
         from ray.runtime_env import RuntimeEnv
-        from ray.util.scheduling_strategies import (
-            PlacementGroupSchedulingStrategy)
+        from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
         from vllm.v1.engine.core import DPEngineCoreActor
 
@@ -225,8 +256,7 @@ class CoreEngineActorManager:
 
         env_vars_list = get_env_vars_to_copy(destination="DPEngineCoreActor")
         self.env_vars_dict = {
-            name: os.environ[name]
-            for name in env_vars_list if name in os.environ
+            name: os.environ[name] for name in env_vars_list if name in os.environ
         }
         runtime_env = RuntimeEnv(env_vars=self.env_vars_dict)
 
@@ -234,37 +264,38 @@ class CoreEngineActorManager:
         self.executor_class = executor_class
         self.log_stats = log_stats
         dp_size = vllm_config.parallel_config.data_parallel_size
-        local_engine_count = \
-            vllm_config.parallel_config.data_parallel_size_local
+        local_engine_count = vllm_config.parallel_config.data_parallel_size_local
         world_size = vllm_config.parallel_config.world_size
 
         if ray.is_initialized():
-            logger.info(
-                "Ray is already initialized. Skipping Ray initialization.")
+            logger.info("Ray is already initialized. Skipping Ray initialization.")
         else:
             ray.init()
 
         if placement_groups is not None:
             assert local_dp_ranks is not None, (
-                "local_dp_ranks must be provided if "
-                "placement_groups is provided")
+                "local_dp_ranks must be provided if placement_groups is provided"
+            )
             assert len(placement_groups) == len(local_dp_ranks), (
-                "placement_groups and local_dp_ranks must "
-                "have the same length")
+                "placement_groups and local_dp_ranks must have the same length"
+            )
             logger.info("Using provided placement groups")
             # TODO(rui): validate passed-in placement groups
             self.created_placement_groups = []
         else:
-            placement_groups, local_dp_ranks = \
+            placement_groups, local_dp_ranks = (
                 CoreEngineActorManager.create_dp_placement_groups(vllm_config)
+            )
             self.created_placement_groups = placement_groups
         assert len(placement_groups) == dp_size, (
-            "Number of placement groups must match data parallel size")
+            "Number of placement groups must match data parallel size"
+        )
 
         self.placement_group_is_local = []
         refs = []
-        for index, local_index, pg in zip(range(dp_size), local_dp_ranks,
-                                          placement_groups):
+        for index, local_index, pg in zip(
+            range(dp_size), local_dp_ranks, placement_groups
+        ):
             dp_vllm_config = copy.deepcopy(vllm_config)
             dp_vllm_config.parallel_config.placement_group = pg
             local_client = index < local_engine_count
@@ -275,24 +306,32 @@ class CoreEngineActorManager:
             # https://github.com/ray-project/ray/blob/master/python/ray/_private/accelerators/intel_gpu.py#L56 # noqa: E501
             if current_platform.is_xpu():
                 device_evar = current_platform.device_control_env_var
-                device_indices = get_device_indices(device_evar, local_index,
-                                                    world_size)
+                device_indices = get_device_indices(
+                    device_evar, local_index, world_size
+                )
                 actor_env_vars = self.env_vars_dict.copy()
                 actor_env_vars[device_evar] = device_indices
                 runtime_env = RuntimeEnv(env_vars=actor_env_vars)
 
-            actor = ray.remote(DPEngineCoreActor).options(
-                scheduling_strategy=PlacementGroupSchedulingStrategy(
-                    placement_group=pg,
-                    placement_group_bundle_index=world_size,
-                ),
-                runtime_env=runtime_env).remote(vllm_config=dp_vllm_config,
-                                                executor_class=executor_class,
-                                                log_stats=log_stats,
-                                                local_client=local_client,
-                                                addresses=addresses,
-                                                dp_rank=index,
-                                                local_dp_rank=local_index)
+            actor = (
+                ray.remote(DPEngineCoreActor)
+                .options(
+                    scheduling_strategy=PlacementGroupSchedulingStrategy(
+                        placement_group=pg,
+                        placement_group_bundle_index=world_size,
+                    ),
+                    runtime_env=runtime_env,
+                )
+                .remote(
+                    vllm_config=dp_vllm_config,
+                    executor_class=executor_class,
+                    log_stats=log_stats,
+                    local_client=local_client,
+                    addresses=addresses,
+                    dp_rank=index,
+                    local_dp_rank=local_index,
+                )
+            )
             if local_client:
                 self.local_engine_actors.append(actor)
             else:
@@ -307,7 +346,7 @@ class CoreEngineActorManager:
 
     @staticmethod
     def create_dp_placement_groups(
-            vllm_config: VllmConfig
+        vllm_config: VllmConfig,
     ) -> tuple[list["PlacementGroup"], list[int]]:
         """
         Create placement groups for data parallel.
@@ -317,69 +356,180 @@ class CoreEngineActorManager:
         from ray._private.state import available_resources_per_node
 
         logger.info("Creating placement groups for data parallel")
-        dp_master_ip = \
-            vllm_config.parallel_config.data_parallel_master_ip
-        num_pg_to_create = vllm_config.parallel_config.data_parallel_size
-        local_engine_count = \
-            vllm_config.parallel_config.data_parallel_size_local
+        dp_master_ip = vllm_config.parallel_config.data_parallel_master_ip
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        dp_size_local = vllm_config.parallel_config.data_parallel_size_local
 
         available_resources = available_resources_per_node()
         world_size = vllm_config.parallel_config.world_size
         placement_groups: list[PlacementGroup] = []
         local_dp_ranks: list[int] = []
-        dp_master_ip_key = f'node:{dp_master_ip}'
-        nodes = sorted(available_resources.values(),
-                       key=lambda x: dp_master_ip_key not in x)
-        assert len(nodes) > 0, (
-            "No nodes with resources found in Ray cluster.")
+
+        dp_master_ip_key = f"node:{dp_master_ip}"
+        nodes = sorted(
+            available_resources.values(), key=lambda x: dp_master_ip_key not in x
+        )
+        assert len(nodes) > 0, "No nodes with resources found in Ray cluster."
         assert dp_master_ip_key in nodes[0], (
-            "The DP master node (ip: %s) is missing or dead", dp_master_ip)
+            "The DP master node (ip: %s) is missing or dead",
+            dp_master_ip,
+        )
         device_str = current_platform.ray_device_key
+        n_node_devices: list[int] = [
+            int(node_resources[device_str])
+            for node_resources in nodes
+            if device_str in node_resources
+        ]
+        assert n_node_devices, f"No {device_str} found in Ray cluster."
+        max_device_per_node = max(n_node_devices)
+
+        pack_strategy = envs.VLLM_RAY_DP_PACK_STRATEGY
+        _supported_pack_strategies = ("strict", "fill", "span")
+        if pack_strategy not in _supported_pack_strategies:
+            raise ValueError(
+                f"{envs.VLLM_RAY_DP_PACK_STRATEGY} is not supported. "
+                "Make sure to set `VLLM_RAY_DP_PACK_STRATEGY` "
+                f"to one of {_supported_pack_strategies}"
+            )
+
+        all2all_backend = vllm_config.parallel_config.all2all_backend
+        if pack_strategy == "fill" and (
+            all2all_backend == "deepep_high_throughput"
+            or all2all_backend == "deepep_low_latency"
+        ):
+            raise ValueError(
+                "DeepEP kernels require EP ranks [0,7] (same for [8,15], ...) "
+                "to be on the same node, but VLLM_RAY_DP_PACK_STRATEGY=fill "
+                "does not guarantee that. "
+                "Please use VLLM_RAY_DP_PACK_STRATEGY=strict instead."
+            )
+
+        if pack_strategy in ("strict", "fill"):
+            placement_strategy = "STRICT_PACK"
+        else:
+            placement_strategy = "PACK"
+            assert world_size > max_device_per_node, (
+                f"World size {world_size} is smaller than the "
+                "maximum number of devices per node "
+                f"{max_device_per_node}. Make sure to set "
+                "`VLLM_RAY_DP_PACK_STRATEGY` to `strict` or `fill`"
+            )
+
+            # if we need multiple nodes per dp group, we require for now that
+            # available nodes are homogenous
+            assert set(n_node_devices) == {max_device_per_node}, (
+                f"Nodes are not homogenous, {nodes}"
+            )
+            assert world_size % max_device_per_node == 0, (
+                f"For multi-node data parallel groups, world_size ({world_size}) must "
+                f"be a multiple of number of devices per node ({max_device_per_node})."
+            )
+            assert len(n_node_devices) * max_device_per_node >= world_size * dp_size, (
+                f"Not enough total available nodes ({len(n_node_devices)}) "
+                f"and devices per node ({max_device_per_node}) "
+                f"to satisfy required world size {world_size} and data parallel size "
+                f"{dp_size}"
+            )
+            assert dp_size_local == 1, (
+                f"data-parallel-size-local {dp_size_local} should be set as the "
+                "default (1) for VLLM_RAY_DP_PACK_STRATEGY=span. "
+                "The actual data-parallel-size-local will be auto determined."
+            )
+
+        # bundles collected for a single DP rank from multiple nodes,
+        # for "span" pack strategy
+        collected_bundles = []
         for node_resources in nodes:
-            if device_str not in node_resources:
-                continue
-            # For now, each DP rank can only be assigned to one node
-            # TODO(rui): support allocating a single DP rank
-            # to multiple nodes
-            available_engine_count = int(
-                node_resources[device_str]) // world_size
-            if dp_master_ip_key in node_resources:
-                assert available_engine_count >= local_engine_count, (
-                    "Not enough resources to allocate DP ranks "
-                    f"on DP master node {dp_master_ip}")
-                for i in range(local_engine_count):
-                    bundles = [{
-                        device_str: 1.0,
-                        "node:" + dp_master_ip: 0.001
-                    }] * world_size + [{
-                        "CPU": 1.0
-                    }]
-                    pg = ray.util.placement_group(
-                        name=f"dp_rank_{len(placement_groups)}",
-                        strategy="STRICT_PACK",
-                        bundles=bundles,
+            node_ip_keys = [
+                key
+                for key in node_resources
+                if key != "node:__internal_head__" and key.startswith("node:")
+            ]
+            assert len(node_ip_keys) == 1, (
+                "Zero or multiple node IP keys found in node resources: %s",
+                node_ip_keys,
+            )
+            node_ip_key = node_ip_keys[0]
+            node_ip = node_ip_key.split(":")[1]
+
+            n_device_on_node = int(node_resources.get(device_str, 0))
+            if pack_strategy == "span" and n_device_on_node != 0:
+                # Strictly speaking,
+                # dp_size_available = n_device_on_node / world_size
+                # and is a fraction, but we use 1 for easier processing
+                dp_size_available = 1
+            else:
+                dp_size_available = n_device_on_node // world_size
+
+            if node_ip == dp_master_ip:
+                if dp_size_available < dp_size_local:
+                    raise ValueError(
+                        "Not enough resources to allocate %s DP ranks "
+                        "on DP master node %s, possible to fit %s DP ranks",
+                        dp_size_local,
+                        dp_master_ip,
+                        dp_size_available,
                     )
-                    placement_groups.append(pg)
-                    local_dp_ranks.append(i)
+                dp_size_to_allocate = dp_size_local
+            elif pack_strategy == "strict":
+                if dp_size_available < dp_size_local:
+                    logger.info(
+                        "Skipping node %s as %s DP ranks could not fit, "
+                        "possible to fit %s DP ranks",
+                        node_ip,
+                        dp_size_local,
+                        dp_size_available,
+                    )
+                    continue
+                dp_size_to_allocate = dp_size_local
             else:
-                for i in range(available_engine_count):
-                    if len(placement_groups) == num_pg_to_create:
-                        break
-                    bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
-                    pg = ray.util.placement_group(
-                        name=f"dp_rank_{len(placement_groups)}",
-                        strategy="STRICT_PACK",
-                        bundles=bundles,
+                # for "pack_strategy" in "fill" and "span"
+                # we always take everything that's available
+                dp_size_to_allocate = dp_size_available
+
+            for i in range(dp_size_to_allocate):
+                device_bundle = [{device_str: 1.0, "node:" + node_ip: 0.001}]
+                if pack_strategy == "span":
+                    collected_bundles += device_bundle * n_device_on_node
+                    assert len(collected_bundles) <= world_size, (
+                        "collected_bundles should be <= world_size, "
+                        f"but got {len(collected_bundles)=} and {world_size=}"
                     )
-                    placement_groups.append(pg)
-                    local_dp_ranks.append(i)
-        if len(placement_groups) < num_pg_to_create:
+
+                    # we only create a placement group if we collected enough devices
+                    if len(collected_bundles) < world_size:
+                        continue
+
+                    bundles = collected_bundles + [{"CPU": 1.0}]
+                    collected_bundles = []
+                else:
+                    bundles = device_bundle * world_size + [{"CPU": 1.0}]
+
+                pg = ray.util.placement_group(
+                    name=f"dp_rank_{len(placement_groups)}",
+                    strategy=placement_strategy,
+                    bundles=bundles,
+                )
+                placement_groups.append(pg)
+                local_dp_ranks.append(i)
+                if len(placement_groups) == dp_size:
+                    break
+
+        if len(placement_groups) < dp_size:
             raise ValueError(
-                f"Not enough resources to allocate {num_pg_to_create} "
+                f"Not enough resources to allocate {dp_size} "
                 "placement groups, only created "
                 f"{len(placement_groups)} placement groups. "
                 "Available resources: "
-                f"{available_resources}")
+                f"{available_resources}"
+            )
+        assert len(placement_groups) == dp_size, (
+            f"Created {len(placement_groups)} DP placement groups, expected {dp_size}"
+        )
+        assert len(local_dp_ranks) == dp_size, (
+            f"local_dp_ranks length {len(local_dp_ranks)} does not match "
+            f"expected {dp_size}"
+        )
         return placement_groups, local_dp_ranks
 
     @staticmethod
@@ -390,8 +540,10 @@ class CoreEngineActorManager:
         Add placement groups for new data parallel size.
         """
         import ray
-        from ray._private.state import (available_resources_per_node,
-                                        total_resources_per_node)
+        from ray._private.state import (
+            available_resources_per_node,
+            total_resources_per_node,
+        )
         from ray.util.state import list_nodes
 
         old_dp_size = old_vllm_config.parallel_config.data_parallel_size
@@ -405,10 +557,10 @@ class CoreEngineActorManager:
 
         nodes = list_nodes()
         nodes = sorted(nodes, key=lambda node: node.node_ip != dp_master_ip)
-        assert nodes[0].node_ip == dp_master_ip, (
-            "The first node must be the head node")
+        assert nodes[0].node_ip == dp_master_ip, "The first node must be the head node"
         assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
-            "There can only be one head node")
+            "There can only be one head node"
+        )
 
         available_resources = available_resources_per_node()
         total_resources = total_resources_per_node()
@@ -446,12 +598,9 @@ class CoreEngineActorManager:
 
                 # Create bundles with node constraint for master node
                 if node_ip == dp_master_ip:
-                    bundles = [{
-                        device_str: 1.0,
-                        "node:" + dp_master_ip: 0.001
-                    }] * world_size + [{
-                        "CPU": 1.0
-                    }]
+                    bundles = [
+                        {device_str: 1.0, "node:" + dp_master_ip: 0.001}
+                    ] * world_size + [{"CPU": 1.0}]
                 else:
                     bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
 
@@ -470,69 +619,76 @@ class CoreEngineActorManager:
 
         return placement_groups, local_dp_ranks
 
-    def scale_up_elastic_ep(self, cur_vllm_config: VllmConfig,
-                            new_data_parallel_size: int) -> None:
+    def scale_up_elastic_ep(
+        self, cur_vllm_config: VllmConfig, new_data_parallel_size: int
+    ) -> None:
         import copy
 
         import ray
         from ray.runtime_env import RuntimeEnv
-        from ray.util.scheduling_strategies import (
-            PlacementGroupSchedulingStrategy)
+        from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
         from vllm.v1.engine.core import DPEngineCoreActor
 
-        cur_data_parallel_size = len(self.local_engine_actors) + \
-            len(self.remote_engine_actors)
+        cur_data_parallel_size = len(self.local_engine_actors) + len(
+            self.remote_engine_actors
+        )
 
         assert new_data_parallel_size > cur_data_parallel_size, (
             f"New data parallel size {new_data_parallel_size} must be greater "
             f"than current data parallel size {cur_data_parallel_size} "
-            "for scale up")
+            "for scale up"
+        )
 
-        placement_groups, local_dp_ranks = \
-            self.add_dp_placement_groups(
-                cur_vllm_config, new_data_parallel_size)
+        placement_groups, local_dp_ranks = self.add_dp_placement_groups(
+            cur_vllm_config, new_data_parallel_size
+        )
 
         world_size = cur_vllm_config.parallel_config.world_size
         dp_master_ip = cur_vllm_config.parallel_config.data_parallel_master_ip
         new_local_engines = 0
 
-        runtime_env = RuntimeEnv(env_vars=self.env_vars_dict
-                                 | {"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": "1"})
-        for i, (pg,
-                local_rank) in enumerate(zip(placement_groups,
-                                             local_dp_ranks)):
+        runtime_env = RuntimeEnv(
+            env_vars=self.env_vars_dict | {"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": "1"}
+        )
+        for i, (pg, local_rank) in enumerate(zip(placement_groups, local_dp_ranks)):
             rank = cur_data_parallel_size + i
             dp_vllm_config = copy.deepcopy(cur_vllm_config)
-            dp_vllm_config.parallel_config.data_parallel_size = \
-                new_data_parallel_size
+            dp_vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
             dp_vllm_config.parallel_config.placement_group = pg
 
             # Check if this placement group is on the head node
             local_client = any(
-                bundle.get("node:" + dp_master_ip, 0) > 0
-                for bundle in pg.bundle_specs)
+                bundle.get("node:" + dp_master_ip, 0) > 0 for bundle in pg.bundle_specs
+            )
 
             if local_client:
                 new_local_engines += 1
                 # Update data_parallel_size_local
                 dp_vllm_config.parallel_config.data_parallel_size_local = (
-                    cur_vllm_config.parallel_config.data_parallel_size_local +
-                    new_local_engines)
-
-            actor = ray.remote(DPEngineCoreActor).options(
-                scheduling_strategy=PlacementGroupSchedulingStrategy(
-                    placement_group=pg,
-                    placement_group_bundle_index=world_size,
-                ),
-                runtime_env=runtime_env).remote(
+                    cur_vllm_config.parallel_config.data_parallel_size_local
+                    + new_local_engines
+                )
+
+            actor = (
+                ray.remote(DPEngineCoreActor)
+                .options(
+                    scheduling_strategy=PlacementGroupSchedulingStrategy(
+                        placement_group=pg,
+                        placement_group_bundle_index=world_size,
+                    ),
+                    runtime_env=runtime_env,
+                )
+                .remote(
                     vllm_config=dp_vllm_config,
                     executor_class=self.executor_class,
                     log_stats=self.log_stats,
                     local_client=local_client,
                     addresses=self.addresses,
                     dp_rank=rank,
-                    local_dp_rank=local_rank)
+                    local_dp_rank=local_rank,
+                )
+            )
 
             if local_client:
                 self.local_engine_actors.append(actor)
@@ -541,37 +697,47 @@ class CoreEngineActorManager:
             self.created_placement_groups.append(pg)
             self.placement_group_is_local.append(local_client)
 
-        ray.get([
-            actor.wait_for_init.remote()
-            for actor in (self.local_engine_actors[-new_local_engines:]
-                          if new_local_engines > 0 else []) +
-            self.remote_engine_actors[-(len(placement_groups) -
-                                        new_local_engines):]
-        ])
+        ray.get(
+            [
+                actor.wait_for_init.remote()
+                for actor in (
+                    self.local_engine_actors[-new_local_engines:]
+                    if new_local_engines > 0
+                    else []
+                )
+                + self.remote_engine_actors[
+                    -(len(placement_groups) - new_local_engines) :
+                ]
+            ]
+        )
 
-        actors = (self.local_engine_actors[-new_local_engines:]
-                  if new_local_engines > 0 else []) + \
-            self.remote_engine_actors[-(len(placement_groups) -
-                                        new_local_engines):]
+        actors = (
+            self.local_engine_actors[-new_local_engines:]
+            if new_local_engines > 0
+            else []
+        ) + self.remote_engine_actors[-(len(placement_groups) - new_local_engines) :]
 
         for actor in actors:
             self.run_refs.append(actor.run.remote())
 
-        cur_vllm_config.parallel_config.data_parallel_size = \
-            new_data_parallel_size
+        cur_vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         # Update old_vllm_config with new data_parallel_size_local if any new
         # local engines were added
         if new_local_engines > 0:
-            cur_vllm_config.parallel_config.data_parallel_size_local += \
+            cur_vllm_config.parallel_config.data_parallel_size_local += (
                 new_local_engines
+            )
 
-    def scale_down_elastic_ep(self, cur_data_parallel_size: int,
-                              new_data_parallel_size: int) -> None:
+    def scale_down_elastic_ep(
+        self, cur_data_parallel_size: int, new_data_parallel_size: int
+    ) -> None:
         import ray
+
         assert cur_data_parallel_size > new_data_parallel_size, (
             f"cur_data_parallel_size {cur_data_parallel_size} must be greater "
             f"than new_data_parallel_size {new_data_parallel_size} "
-            "for scale down")
+            "for scale down"
+        )
         for _ in range(cur_data_parallel_size - new_data_parallel_size):
             pg = self.created_placement_groups.pop()
             is_local = self.placement_group_is_local.pop()
@@ -586,6 +752,7 @@ class CoreEngineActorManager:
 
     def close(self):
         import ray
+
         for actor in self.local_engine_actors + self.remote_engine_actors:
             ray.kill(actor)
         for pg in self.created_placement_groups:
@@ -598,11 +765,13 @@ def launch_core_engines(
     executor_class: type[Executor],
     log_stats: bool,
     num_api_servers: int = 1,
-) -> Iterator[tuple[
-        Optional[Union[CoreEngineProcManager, CoreEngineActorManager]],
-        Optional[DPCoordinator],
+) -> Iterator[
+    tuple[
+        CoreEngineProcManager | CoreEngineActorManager | None,
+        DPCoordinator | None,
         EngineZmqAddresses,
-]]:
+    ]
+]:
     """Launch engine and DP coordinator processes as needed."""
 
     parallel_config = vllm_config.parallel_config
@@ -611,8 +780,10 @@ def launch_core_engines(
     local_start_index = parallel_config.data_parallel_rank_local
     dp_rank = parallel_config.data_parallel_rank
     host = parallel_config.data_parallel_master_ip
-    local_engines_only = (parallel_config.data_parallel_hybrid_lb
-                          or parallel_config.data_parallel_external_lb)
+    local_engines_only = (
+        parallel_config.data_parallel_hybrid_lb
+        or parallel_config.data_parallel_external_lb
+    )
 
     # In offline mode there is an LLM instance per DP rank and
     # one core engine per LLM, see
@@ -621,8 +792,9 @@ def launch_core_engines(
 
     # client_local_only = True for cases where this front-end
     # sends requests only to colocated engines.
-    client_local_only = (offline_mode or local_engines_only
-                         or (local_engine_count == dp_size))
+    client_local_only = (
+        offline_mode or local_engines_only or (local_engine_count == dp_size)
+    )
 
     # Set up input and output addresses.
     addresses = EngineZmqAddresses(
@@ -644,12 +816,13 @@ def launch_core_engines(
         coordinator = DPCoordinator(parallel_config)
 
         addresses.coordinator_input, addresses.coordinator_output = (
-            coordinator.get_engine_socket_addresses())
+            coordinator.get_engine_socket_addresses()
+        )
         addresses.frontend_stats_publish_address = (
-            coordinator.get_stats_publish_address())
+            coordinator.get_stats_publish_address()
+        )
 
-        logger.info("Started DP Coordinator process (PID: %d)",
-                    coordinator.proc.pid)
+        logger.info("Started DP Coordinator process (PID: %d)", coordinator.proc.pid)
     else:
         coordinator = None
 
@@ -675,14 +848,14 @@ def launch_core_engines(
         # Note this also covers the case where we have zero local engines
         # and rank 0 is headless.
         engines_to_handshake = [
-            CoreEngine(index=i, local=(i < local_engine_count))
-            for i in range(dp_size)
+            CoreEngine(index=i, local=(i < local_engine_count)) for i in range(dp_size)
         ]
     else:
         # Rank > 0 handshakes with just the local cores it is managing.
         assert local_engines_only, (
             "Attempting to launch core_engines from dp_rank > 0, but "
-            "found internal DPLB, which is incompatible.")
+            "found internal DPLB, which is incompatible."
+        )
         engines_to_handshake = [
             CoreEngine(index=i, local=True)
             for i in range(dp_rank, dp_rank + local_engine_count)
@@ -695,7 +868,8 @@ def launch_core_engines(
     handshake_local_only = offline_mode or local_engine_count == dp_size
 
     handshake_address = get_engine_client_zmq_addr(
-        handshake_local_only, host, parallel_config.data_parallel_rpc_port)
+        handshake_local_only, host, parallel_config.data_parallel_rpc_port
+    )
 
     if local_engines_only and dp_rank > 0:
         assert not handshake_local_only
@@ -705,9 +879,9 @@ def launch_core_engines(
         local_handshake_address = handshake_address
         client_handshake_address = None
 
-    with zmq_socket_ctx(local_handshake_address, zmq.ROUTER,
-                        bind=True) as handshake_socket:
-
+    with zmq_socket_ctx(
+        local_handshake_address, zmq.ROUTER, bind=True
+    ) as handshake_socket:
         from vllm.v1.engine.core import EngineCoreProc
 
         # Start local engines.
@@ -722,7 +896,8 @@ def launch_core_engines(
                 local_client=True,
                 local_engine_count=local_engine_count,
                 start_index=dp_rank,
-                local_start_index=local_start_index or 0)
+                local_start_index=local_start_index or 0,
+            )
         else:
             local_engine_manager = None
 
@@ -746,8 +921,8 @@ def wait_for_engine_startup(
     core_engines: list[CoreEngine],
     parallel_config: ParallelConfig,
     cache_config: CacheConfig,
-    proc_manager: Optional[CoreEngineProcManager],
-    coord_process: Optional[Process],
+    proc_manager: CoreEngineProcManager | None,
+    coord_process: Process | None,
 ):
     # Wait for engine core process(es) to send ready messages.
     local_count = parallel_config.data_parallel_size_local
@@ -757,8 +932,10 @@ def wait_for_engine_startup(
     poller = zmq.Poller()
     poller.register(handshake_socket, zmq.POLLIN)
 
-    remote_should_be_headless = not parallel_config.data_parallel_hybrid_lb \
+    remote_should_be_headless = (
+        not parallel_config.data_parallel_hybrid_lb
         and not parallel_config.data_parallel_external_lb
+    )
 
     if proc_manager is not None:
         for sentinel in proc_manager.sentinels():
@@ -770,67 +947,80 @@ def wait_for_engine_startup(
         if not events:
             if any(conn_pending):
                 logger.debug(
-                    "Waiting for %d local, %d remote core engine proc(s) "
-                    "to connect.", *conn_pending)
+                    "Waiting for %d local, %d remote core engine proc(s) to connect.",
+                    *conn_pending,
+                )
             if any(start_pending):
                 logger.debug(
-                    "Waiting for %d local, %d remote core engine proc(s) "
-                    "to start.", *start_pending)
+                    "Waiting for %d local, %d remote core engine proc(s) to start.",
+                    *start_pending,
+                )
             continue
         if len(events) > 1 or events[0][0] != handshake_socket:
             # One of the local core processes exited.
             finished = proc_manager.finished_procs() if proc_manager else {}
             if coord_process is not None and coord_process.exitcode is not None:
                 finished[coord_process.name] = coord_process.exitcode
-            raise RuntimeError("Engine core initialization failed. "
-                               "See root cause above. "
-                               f"Failed core proc(s): {finished}")
+            raise RuntimeError(
+                "Engine core initialization failed. "
+                "See root cause above. "
+                f"Failed core proc(s): {finished}"
+            )
 
         # Receive HELLO and READY messages from the input socket.
         eng_identity, ready_msg_bytes = handshake_socket.recv_multipart()
         eng_index = int.from_bytes(eng_identity, "little")
-        engine = next((e for e in core_engines if e.identity == eng_identity),
-                      None)
+        engine = next((e for e in core_engines if e.identity == eng_identity), None)
         if engine is None:
-            raise RuntimeError(f"Message from engine with unexpected data "
-                               f"parallel rank: {eng_index}")
+            raise RuntimeError(
+                f"Message from engine with unexpected data parallel rank: {eng_index}"
+            )
         msg = msgspec.msgpack.decode(ready_msg_bytes)
         status, local, headless = msg["status"], msg["local"], msg["headless"]
         if local != engine.local:
-            raise RuntimeError(f"{status} message from "
-                               f"{'local' if local else 'remote'} "
-                               f"engine {eng_index}, expected it to be "
-                               f"{'local' if engine.local else 'remote'}")
+            raise RuntimeError(
+                f"{status} message from "
+                f"{'local' if local else 'remote'} "
+                f"engine {eng_index}, expected it to be "
+                f"{'local' if engine.local else 'remote'}"
+            )
 
         # Remote engines must be headless iff we aren't in hybrid dp lb mode.
         if not local and headless != remote_should_be_headless:
             if headless:
-                raise RuntimeError(f"Remote engine {eng_index} must not use "
-                                   f"--headless in external or hybrid dp lb "
-                                   f"mode")
+                raise RuntimeError(
+                    f"Remote engine {eng_index} must not use "
+                    f"--headless in external or hybrid dp lb "
+                    f"mode"
+                )
             else:
-                raise RuntimeError(f"Remote engine {eng_index} must use "
-                                   f"--headless unless in external or hybrid "
-                                   f"dp lb mode")
+                raise RuntimeError(
+                    f"Remote engine {eng_index} must use "
+                    f"--headless unless in external or hybrid "
+                    f"dp lb mode"
+                )
 
         if status == "HELLO" and engine.state == CoreEngineState.NEW:
-
-            # Send init message with DP config info.
+            # Send init message with DP config info and config hash.
+            # The config hash ensures all DP workers have compatible configs.
             init_message = msgspec.msgpack.encode(
                 EngineHandshakeMetadata(
                     addresses=addresses,
                     parallel_config={
-                        "data_parallel_master_ip":
-                        parallel_config.data_parallel_master_ip,
-                        "data_parallel_master_port":
-                        parallel_config.data_parallel_master_port,
-                        "_data_parallel_master_port_list":
-                        parallel_config._data_parallel_master_port_list,
-                        "data_parallel_size":
-                        parallel_config.data_parallel_size,
-                    }))
-            handshake_socket.send_multipart((eng_identity, init_message),
-                                            copy=False)
+                        k: getattr(parallel_config, k)
+                        for k in (
+                            "data_parallel_master_ip",
+                            "data_parallel_master_port",
+                            "_data_parallel_master_port_list",
+                            "data_parallel_size",
+                        )
+                    },
+                    parallel_config_hash=parallel_config.compute_hash()
+                    if parallel_config.data_parallel_size > 1
+                    else None,
+                )
+            )
+            handshake_socket.send_multipart((eng_identity, init_message), copy=False)
             conn_pending[0 if local else 1] -= 1
             start_pending[0 if local else 1] += 1
             engine.state = CoreEngineState.CONNECTED
@@ -846,15 +1036,37 @@ def wait_for_engine_startup(
             # one of the engine handshakes, and passed to the local
             # front-end process in the response from the other.
             if addresses.frontend_stats_publish_address is None:
-                addresses.frontend_stats_publish_address = msg.get(
-                    "dp_stats_address")
+                addresses.frontend_stats_publish_address = msg.get("dp_stats_address")
+
+            # Validate config hash consistency across DP workers
+            if parallel_config.data_parallel_size > 1:
+                worker_config_hash = msg.get("parallel_config_hash")
+                expected_hash = parallel_config.compute_hash()
+                if worker_config_hash != expected_hash:
+                    raise RuntimeError(
+                        f"Configuration mismatch detected for engine "
+                        f"{eng_index}. All DP workers must have identical "
+                        f"configurations for parameters that affect collective "
+                        f"communication (e.g., enable_eplb, "
+                        f"eplb_config.log_balancedness). "
+                        f"Worker hash: {worker_config_hash}, "
+                        f"Expected hash: {expected_hash}. "
+                        f"Please ensure all workers are started with the same "
+                        f"command-line arguments."
+                    )
 
             start_pending[0 if local else 1] -= 1
             engine.state = CoreEngineState.READY
         else:
-            raise RuntimeError(f"Unexpected {status} message for "
-                               f"{'local' if local else 'remote'} engine "
-                               f"{eng_index} in {engine.state} state.")
-
-        logger.debug("%s from %s core engine process %s.", status,
-                     "local" if local else "remote", eng_index)
+            raise RuntimeError(
+                f"Unexpected {status} message for "
+                f"{'local' if local else 'remote'} engine "
+                f"{eng_index} in {engine.state} state."
+            )
+
+        logger.debug(
+            "%s from %s core engine process %s.",
+            status,
+            "local" if local else "remote",
+            eng_index,
+        )
diff --git a/vllm/v1/executor/__init__.py b/vllm/v1/executor/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..30d52c73791e55c998b013638c36f4f00006af38 100644
--- a/vllm/v1/executor/__init__.py
+++ b/vllm/v1/executor/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .abstract import Executor
+from .uniproc_executor import UniProcExecutor
+
+__all__ = ["Executor", "UniProcExecutor"]
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 625017d52fff02dc7095229c595a0b8444ee6005..db8303fcec501b21b129b81cf914bb9ca6eacec6 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,80 +1,121 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import time
+from abc import ABC, abstractmethod
+from collections.abc import Callable
 from concurrent.futures import Future
-from typing import Any, Callable, Optional, Union
-
-import torch
-import torch.distributed as dist
+from functools import cached_property
+from typing import TYPE_CHECKING, Literal, TypeVar, overload
 
 from vllm.config import VllmConfig
-from vllm.executor.executor_base import ExecutorBase
-from vllm.executor.uniproc_executor import (  # noqa
-    ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0)
-from vllm.executor.uniproc_executor import (  # noqa
-    UniProcExecutor as UniProcExecutorV0)
-from vllm.utils import resolve_obj_by_qualname
-from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorHandshakeMetadata,
+)
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.tasks import SupportedTask
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
+from vllm.v1.engine import ReconfigureDistributedRequest
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
+from vllm.v1.worker.worker_base import WorkerBase
+
+if TYPE_CHECKING:
+    from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
+
+logger = init_logger(__name__)
+
+_R = TypeVar("_R")
 
 FailureCallback = Callable[[], None]
 
 
-class Executor(ExecutorBase):
+class Executor(ABC):
+    """Abstract base class for vLLM executors."
+
+    An executor is responsible for executing the model on one device,
+    or it can be a distributed executor that can execute the model on multiple devices.
     """
-    Abstract class for v1 executors, mainly define some methods for v1.
-    For methods shared by v0 and v1, define them in ExecutorBase"""
+
+    uses_ray: bool = False  # whether the executor uses Ray for orchestration.
+    supports_pp: bool = False  # whether the executor supports PP
 
     @staticmethod
     def get_class(vllm_config: VllmConfig) -> type["Executor"]:
         executor_class: type[Executor]
         parallel_config = vllm_config.parallel_config
-        distributed_executor_backend = (
-            parallel_config.distributed_executor_backend)
+        distributed_executor_backend = parallel_config.distributed_executor_backend
         # distributed_executor_backend must be set in VllmConfig.__post_init__
         if isinstance(distributed_executor_backend, type):
-            if not issubclass(distributed_executor_backend, ExecutorBase):
+            if not issubclass(distributed_executor_backend, Executor):
                 raise TypeError(
                     "distributed_executor_backend must be a subclass of "
-                    f"ExecutorBase. Got {distributed_executor_backend}.")
+                    f"Executor. Got {distributed_executor_backend}."
+                )
             executor_class = distributed_executor_backend
         elif distributed_executor_backend == "ray":
-            from vllm.v1.executor.ray_distributed_executor import (  # noqa
-                RayDistributedExecutor)
+            from vllm.v1.executor.ray_executor import RayDistributedExecutor
+
             executor_class = RayDistributedExecutor
         elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
+
             executor_class = MultiprocExecutor
         elif distributed_executor_backend == "uni":
+            from vllm.v1.executor.uniproc_executor import UniProcExecutor
+
             executor_class = UniProcExecutor
         elif distributed_executor_backend == "external_launcher":
             # TODO: make v1 scheduling deterministic
             # to support external launcher
             executor_class = ExecutorWithExternalLauncher
         elif isinstance(distributed_executor_backend, str):
-            executor_class = resolve_obj_by_qualname(
-                distributed_executor_backend)
-            if not issubclass(executor_class, ExecutorBase):
+            executor_class = resolve_obj_by_qualname(distributed_executor_backend)
+            if not issubclass(executor_class, Executor):
                 raise TypeError(
                     "distributed_executor_backend must be a subclass of "
-                    f"ExecutorBase. Got {executor_class}.")
+                    f"Executor. Got {executor_class}."
+                )
         else:
-            raise ValueError("Unknown distributed executor backend: "
-                             f"{distributed_executor_backend}")
+            raise ValueError(
+                f"Unknown distributed executor backend: {distributed_executor_backend}"
+            )
         return executor_class
 
-    def initialize_from_config(self,
-                               kv_cache_configs: list[KVCacheConfig]) -> None:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+    ) -> None:
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.observability_config = vllm_config.observability_config
+        self._init_executor()
+        self.is_sleeping = False
+        self.sleeping_tags: set[str] = set()
+        self.kv_output_aggregator: KVOutputAggregator | None = None
+
+    @abstractmethod
+    def _init_executor(self) -> None:
+        raise NotImplementedError
+
+    def initialize_from_config(self, kv_cache_configs: list[KVCacheConfig]) -> None:
         """
         Initialize the KV caches and begin the model execution loop of the
         underlying workers.
         """
-        self.collective_rpc("initialize_from_config",
-                            args=(kv_cache_configs, ))
+        self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
         self.collective_rpc("compile_or_warm_up_model")
 
-    def register_failure_callback(self, callback: FailureCallback):
+    def register_failure_callback(self, callback: FailureCallback):  # noqa: B027
         """
         Register a function to be called if the executor enters a permanent
         failed state.
@@ -87,29 +128,108 @@ class Executor(ExecutorBase):
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
         return self.collective_rpc("get_kv_cache_spec")
 
-    def collective_rpc(self,
-                       method: Union[str, Callable],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict] = None,
-                       non_block: bool = False) -> list[Any]:
+    @overload
+    def collective_rpc(
+        self,
+        method: str | Callable[[WorkerBase], _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+        non_block: Literal[False] = False,
+    ) -> list[_R]:
+        """
+        Execute an RPC call on all workers.
+
+        Args:
+            method: Name of the worker method to execute, or a callable that
+                is serialized and sent to all workers to execute.
+
+                If the method is a callable, it should accept an additional
+                `self` argument, in addition to the arguments passed in `args`
+                and `kwargs`. The `self` argument will be the worker object.
+            timeout: Maximum time in seconds to wait for execution. Raises a
+                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+            non_block: If `True`, returns a list of Futures instead of waiting
+                for the results.
+
+        Returns:
+            A list containing the results from each worker.
+
+        Note:
+            It is recommended to use this API to only pass control messages,
+            and set up data-plane communication to pass data.
+        """
+        pass
+
+    @overload
+    def collective_rpc(
+        self,
+        method: str | Callable[[WorkerBase], _R],
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+        non_block: Literal[True] = True,
+    ) -> Future[list[_R]]:
+        pass
+
+    @abstractmethod
+    def collective_rpc(
+        self, method, timeout=None, args=(), kwargs=None, non_block: bool = False
+    ):
         raise NotImplementedError
 
-    def execute_model(
+    def get_kv_connector_handshake_metadata(
         self,
-        scheduler_output: SchedulerOutput,
-        non_block: bool = False,
-    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
-        output = self.collective_rpc("execute_model",
-                                     args=(scheduler_output, ),
-                                     non_block=non_block)
+    ) -> list[dict[int, KVConnectorHandshakeMetadata]]:
+        return self.collective_rpc("get_kv_connector_handshake_metadata")
+
+    @overload
+    def execute_model(
+        self, scheduler_output: SchedulerOutput, non_block: Literal[False] = False
+    ) -> ModelRunnerOutput | None:
+        pass
+
+    @overload
+    def execute_model(
+        self, scheduler_output: SchedulerOutput, non_block: Literal[True] = True
+    ) -> Future[ModelRunnerOutput | None]:
+        pass
+
+    def execute_model(
+        self, scheduler_output: SchedulerOutput, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        output = self.collective_rpc(  # type: ignore[call-overload]
+            "execute_model", args=(scheduler_output,), non_block=non_block
+        )
+        return output[0]
+
+    @overload
+    def sample_tokens(
+        self, grammar_output: GrammarOutput | None, non_block: Literal[False] = False
+    ) -> ModelRunnerOutput:
+        pass
+
+    @overload
+    def sample_tokens(
+        self, grammar_output: GrammarOutput | None, non_block: Literal[True] = True
+    ) -> Future[ModelRunnerOutput]:
+        pass
+
+    def sample_tokens(
+        self, grammar_output: GrammarOutput | None, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        output = self.collective_rpc(  # type: ignore[call-overload]
+            "sample_tokens", args=(grammar_output,), non_block=non_block
+        )
         return output[0]
 
     def execute_dummy_batch(self) -> None:
         self.collective_rpc("execute_dummy_batch")
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
-        output = self.collective_rpc("take_draft_token_ids")
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
+        output: list[DraftTokenIds] = self.collective_rpc("take_draft_token_ids")
         return output[0]
 
     @property
@@ -117,21 +237,116 @@ class Executor(ExecutorBase):
         return 1
 
     def profile(self, is_start: bool = True):
-        self.collective_rpc("profile", args=(is_start, ))
+        self.collective_rpc("profile", args=(is_start,))
 
+    def save_sharded_state(
+        self,
+        path: str,
+        pattern: str | None = None,
+        max_size: int | None = None,
+    ) -> None:
+        self.collective_rpc(
+            "save_sharded_state",
+            kwargs=dict(path=path, pattern=pattern, max_size=max_size),
+        )
 
-class UniProcExecutor(UniProcExecutorV0, Executor):
-    pass
+    @abstractmethod
+    def check_health(self) -> None:
+        """Checks if the executor is healthy. If not, it should raise an
+        exception."""
+        raise NotImplementedError
 
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+        self.collective_rpc("shutdown")
 
-class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
+    def init_kv_output_aggregator(self, connector: "KVConnectorBase") -> None:
+        """Init KVOutputAggregator"""
+        self.kv_output_aggregator = KVOutputAggregator.from_connector(
+            connector, self.parallel_config.world_size
+        )
 
-    def determine_available_memory(self) -> list[int]:  # in bytes
-        # same as determine_num_available_blocks in v0,
-        # we need to get the min across all ranks.
-        memory = super().determine_available_memory()
-        from vllm.distributed.parallel_state import get_world_group
-        cpu_group = get_world_group().cpu_group
-        memory_tensor = torch.tensor([memory], device="cpu", dtype=torch.int64)
-        dist.all_reduce(memory_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
-        return [memory_tensor.item()]
+    @cached_property  # Avoid unnecessary RPC calls
+    def supported_tasks(self) -> tuple[SupportedTask, ...]:
+        output: list[tuple[SupportedTask, ...]]
+        output = self.collective_rpc("get_supported_tasks")
+        return output[0]
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("add_lora", args=(lora_request,)))
+
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("remove_lora", args=(lora_id,)))
+
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("pin_lora", args=(lora_id,)))
+
+    def list_loras(self) -> set[int]:
+        sets: list[set[int]] = self.collective_rpc("list_loras")
+        for s in sets:
+            assert s == sets[0], "All workers should have the same LORAs."
+        return sets[0]
+
+    def reset_mm_cache(self) -> None:
+        """Reset the multi-modal cache in each worker."""
+        self.collective_rpc("reset_mm_cache")
+
+    def sleep(self, level: int = 1):
+        if self.is_sleeping:
+            logger.warning("Executor is already sleeping.")
+            return
+        time_before_sleep = time.perf_counter()
+        self.collective_rpc("sleep", kwargs=dict(level=level))
+        time_after_sleep = time.perf_counter()
+        self.sleeping_tags = {"weights", "kv_cache"}
+        self.is_sleeping = True
+        logger.info(
+            "It took %.6f seconds to fall asleep.", time_after_sleep - time_before_sleep
+        )
+
+    def wake_up(self, tags: list[str] | None = None):
+        if not self.is_sleeping:
+            logger.warning("Executor is not sleeping.")
+            return
+        if tags:
+            for tag in tags:
+                if tag not in self.sleeping_tags:
+                    logger.warning(
+                        "Tag %s is not in sleeping tags %s", tag, self.sleeping_tags
+                    )
+                    return
+        time_before_wakeup = time.perf_counter()
+        self.collective_rpc("wake_up", kwargs=dict(tags=tags))
+        time_after_wakeup = time.perf_counter()
+        logger.info(
+            "It took %.6f seconds to wake up tags %s.",
+            time_after_wakeup - time_before_wakeup,
+            tags if tags is not None else self.sleeping_tags,
+        )
+        if tags:
+            for tag in tags:
+                self.sleeping_tags.remove(tag)
+        else:
+            self.sleeping_tags.clear()
+        if not self.sleeping_tags:
+            self.is_sleeping = False
+
+    def reinitialize_distributed(
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
+        raise NotImplementedError
+
+
+from vllm.v1.executor.uniproc_executor import (  # noqa: E402
+    ExecutorWithExternalLauncher as _ExecutorWithExternalLauncher,
+)
+from vllm.v1.executor.uniproc_executor import (  # noqa: E402
+    UniProcExecutor as _UniProcExecutor,
+)
+
+# For backwards compatibility.
+UniProcExecutor = _UniProcExecutor
+ExecutorWithExternalLauncher = _ExecutorWithExternalLauncher
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index ef90af263664e7dfbc4e01e7d57e6fc1f2b7bf05..ad2ece50f9815189e98ec208d4ff3af6392b8570 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -9,7 +9,10 @@ import threading
 import time
 import traceback
 import weakref
-from concurrent.futures import Future, ThreadPoolExecutor
+from collections import deque
+from collections.abc import Callable, Sequence
+from concurrent.futures import Future, InvalidStateError
+from contextlib import suppress
 from dataclasses import dataclass
 from enum import Enum, auto
 from functools import cached_property, partial
@@ -17,55 +20,103 @@ from multiprocessing.connection import Connection
 from multiprocessing.process import BaseProcess
 from multiprocessing.synchronize import Lock as LockType
 from threading import Thread
-from typing import Any, Callable, Optional, Union, cast
+from typing import Any, cast
 
 import cloudpickle
 import torch
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel)
-from vllm.distributed.device_communicators.shm_broadcast import (Handle,
-                                                                 MessageQueue)
-from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
-                                             get_pp_group, get_tp_group)
+from vllm.distributed import destroy_distributed_environment, destroy_model_parallel
+from vllm.distributed.device_communicators.shm_broadcast import Handle, MessageQueue
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
+from vllm.distributed.parallel_state import (
+    get_dcp_group,
+    get_dp_group,
+    get_ep_group,
+    get_inner_dp_world_group,
+    get_pp_group,
+    get_tp_group,
+)
+from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import worker_receiver_cache_from_config
-from vllm.utils import (_maybe_force_spawn, decorate_logs,
-                        get_distributed_init_method, get_loopback_ip,
-                        get_mp_context, get_open_port, set_process_title)
-from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.utils.network_utils import (
+    get_distributed_init_method,
+    get_loopback_ip,
+    get_open_port,
+)
+from vllm.utils.system_utils import (
+    _maybe_force_spawn,
+    decorate_logs,
+    get_mp_context,
+    set_process_title,
+)
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.executor.abstract import Executor, FailureCallback
-from vllm.v1.executor.utils import get_and_update_mm_cache
-from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
-                             ModelRunnerOutput)
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
 
-class MultiprocExecutor(Executor):
+class FutureWrapper(Future):
+    def __init__(
+        self,
+        futures_queue: deque[tuple["FutureWrapper", Callable]],
+        aggregate: Callable = lambda x: x,
+    ):
+        self.futures_queue = futures_queue
+        self.aggregate = aggregate
+        super().__init__()
+
+    def result(self, timeout=None):
+        if timeout is not None:
+            raise RuntimeError("timeout not implemented")
+        # Drain any futures ahead of us in the queue.
+        while not self.done():
+            future, get_response = self.futures_queue.pop()
+            future.wait_for_response(get_response)
+        return super().result()
+
+    def wait_for_response(self, get_response: Callable):
+        try:
+            response = self.aggregate(get_response())
+            with suppress(InvalidStateError):
+                self.set_result(response)
+        except Exception as e:
+            with suppress(InvalidStateError):
+                self.set_exception(e)
+
 
+class MultiprocExecutor(Executor):
     supports_pp: bool = True
 
+    def __init__(self, vllm_config: VllmConfig, monitor_workers: bool = True):
+        self.monitor_workers = monitor_workers
+        super().__init__(vllm_config)
+
     def _init_executor(self) -> None:
         # Call self.shutdown at exit to clean up
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
         self.shutdown_event = threading.Event()
-        self.failure_callback: Optional[FailureCallback] = None
-        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
+        self.failure_callback: FailureCallback | None = None
 
         self.world_size = self.parallel_config.world_size
+        assert self.world_size % self.parallel_config.nnodes_within_dp == 0, (
+            f"global world_size ({self.parallel_config.world_size}) must be "
+            f"divisible by nnodes_within_dp "
+            f"({self.parallel_config.nnodes_within_dp}). "
+        )
+        self.local_world_size = self.parallel_config.local_world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
         pp_parallel_size = self.parallel_config.pipeline_parallel_size
         assert self.world_size == tensor_parallel_size * pp_parallel_size, (
             f"world_size ({self.world_size}) must be equal to the "
             f"tensor_parallel_size ({tensor_parallel_size}) x pipeline"
-            f"_parallel_size ({pp_parallel_size}). ")
+            f"_parallel_size ({pp_parallel_size}). "
+        )
 
         # Set multiprocessing envs
         set_multiprocessing_worker_envs()
@@ -74,44 +125,79 @@ class MultiprocExecutor(Executor):
         # Since it only works for single node, we can use the loopback address
         # get_loopback_ip() for communication.
         distributed_init_method = get_distributed_init_method(
-            get_loopback_ip(), get_open_port())
-
+            get_loopback_ip(), get_open_port()
+        )
+        self.rpc_broadcast_mq: MessageQueue | None = None
+        scheduler_output_handle: Handle | None = None
         # Initialize worker and set up message queues for SchedulerOutputs
         # and ModelRunnerOutputs
-        max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
-        self.rpc_broadcast_mq = MessageQueue(self.world_size,
-                                             self.world_size,
-                                             max_chunk_bytes=max_chunk_bytes)
-        scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
-
+        if self.parallel_config.node_rank_within_dp == 0:
+            # For leader node within each dp rank,
+            # each dp will have its own leader multiproc executor.
+            max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
+            self.rpc_broadcast_mq = MessageQueue(
+                self.world_size,
+                self.local_world_size,
+                max_chunk_bytes=max_chunk_bytes,
+                connect_ip=self.parallel_config.master_addr,
+            )
+            scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
         # Create workers
         context = get_mp_context()
         shared_worker_lock = context.Lock()
         unready_workers: list[UnreadyWorkerProcHandle] = []
         success = False
         try:
-            for rank in range(self.world_size):
+            global_start_rank = (
+                self.local_world_size * self.parallel_config.node_rank_within_dp
+            )
+            for local_rank in range(self.local_world_size):
+                global_rank = global_start_rank + local_rank
                 unready_workers.append(
                     WorkerProc.make_worker_process(
                         vllm_config=self.vllm_config,
-                        local_rank=rank,
-                        rank=rank,
+                        local_rank=local_rank,
+                        rank=global_rank,
                         distributed_init_method=distributed_init_method,
                         input_shm_handle=scheduler_output_handle,
                         shared_worker_lock=shared_worker_lock,
-                    ))
+                    )
+                )
 
             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
+
+            # Wait for all local workers to be ready.
             self.workers = WorkerProc.wait_for_ready(unready_workers)
 
+            # Start background thread to monitor worker health if not in headless mode.
+            if self.monitor_workers:
+                self.start_worker_monitor()
+
+            self.response_mqs = []
+            # Only leader node have remote response mqs
+            if self.parallel_config.node_rank_within_dp == 0:
+                for rank in range(self.world_size):
+                    if rank < self.local_world_size:
+                        local_message_queue = self.workers[rank].worker_response_mq
+                        assert local_message_queue is not None
+                        self.response_mqs.append(local_message_queue)
+                    else:
+                        remote_message_queue = self.workers[0].peer_worker_response_mqs[
+                            rank
+                        ]
+                        assert remote_message_queue is not None
+                        self.response_mqs.append(remote_message_queue)
+
             # Ensure message queues are ready. Will deadlock if re-ordered
             # Must be kept consistent with the WorkerProc.
-            self.rpc_broadcast_mq.wait_until_ready()
-            for w in self.workers:
-                w.worker_response_mq.wait_until_ready()
 
-            self.start_worker_monitor()
+            # Wait for all input mqs to be ready.
+            if self.rpc_broadcast_mq is not None:
+                self.rpc_broadcast_mq.wait_until_ready()
+            # Wait for all remote response mqs to be ready.
+            for response_mq in self.response_mqs:
+                response_mq.wait_until_ready()
             success = True
         finally:
             if not success:
@@ -120,22 +206,13 @@ class MultiprocExecutor(Executor):
                 for uw in unready_workers:
                     if uw.death_writer is not None:
                         uw.death_writer.close()
-                self._ensure_worker_termination(
-                    [uw.proc for uw in unready_workers])
-
-        # For pipeline parallel, we use a thread pool for asynchronous
-        # execute_model.
-        if self.max_concurrent_batches > 1:
-            # Note: must use only 1 IO thread to keep dequeue sequence
-            # from the response queue
-            # _async_aggregate_workers_output also assumes a single IO thread
-            self.io_thread_pool = ThreadPoolExecutor(
-                max_workers=1, thread_name_prefix="mp_exec_io")
+                self._ensure_worker_termination([uw.proc for uw in unready_workers])
+
+        self.futures_queue = deque[tuple[FutureWrapper, Callable]]()
 
         self.output_rank = self._get_output_rank()
-        self.has_connector = self.vllm_config.kv_transfer_config is not None
 
-    def start_worker_monitor(self):
+    def start_worker_monitor(self, inline=False) -> None:
         workers = self.workers
         self_ref = weakref.ref(self)
 
@@ -146,23 +223,26 @@ class MultiprocExecutor(Executor):
             sentinels = [h.proc.sentinel for h in workers]
             died = multiprocessing.connection.wait(sentinels)
             _self = self_ref()
-            if not _self or getattr(_self, 'shutting_down', False):
+            if not _self or getattr(_self, "shutting_down", False):
                 return
             _self.is_failed = True
-            proc_name = next(h.proc.name for h in workers
-                             if h.proc.sentinel == died[0])
+            proc_name = next(h.proc.name for h in workers if h.proc.sentinel == died[0])
             logger.error(
-                "Worker proc %s died unexpectedly, "
-                "shutting down executor.", proc_name)
+                "Worker proc %s died unexpectedly, shutting down executor.", proc_name
+            )
             _self.shutdown()
             callback = _self.failure_callback
             if callback is not None:
                 _self.failure_callback = None
                 callback()
 
-        Thread(target=monitor_workers,
-               daemon=True,
-               name="MultiprocWorkerMonitor").start()
+        if not inline:
+            Thread(
+                target=monitor_workers, daemon=True, name="MultiprocWorkerMonitor"
+            ).start()
+            return
+
+        monitor_workers()
 
     def register_failure_callback(self, callback: FailureCallback):
         if self.is_failed:
@@ -170,107 +250,112 @@ class MultiprocExecutor(Executor):
         else:
             self.failure_callback = callback
 
-    def execute_model(
-        self,
-        scheduler_output: SchedulerOutput,
-        non_block: bool = False,
-    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
-
-        if not self.has_connector:
-            # get output only from a single worker (output_rank)
-            (output, ) = self.collective_rpc(
-                "execute_model",
-                args=(scheduler_output, ),
-                unique_reply_rank=self.output_rank,
-                non_block=non_block,
-                timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS)
-            return output
-
-        # get output from all workers
-        outputs = self.collective_rpc(
+    def execute_model(  # type: ignore[override]
+        self, scheduler_output: SchedulerOutput, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        return self.collective_rpc(
             "execute_model",
-            args=(scheduler_output, ),
+            args=(scheduler_output,),
+            unique_reply_rank=self.output_rank,
             non_block=non_block,
-            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS)
+            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
+            kv_output_aggregator=self.kv_output_aggregator,
+        )
 
-        # aggregate all workers output to a single output
-        if non_block:
-            return self.kv_output_aggregator.async_aggregate(
-                outputs, self.output_rank)
-        return self.kv_output_aggregator.aggregate(outputs, self.output_rank)
+    def sample_tokens(  # type: ignore[override]
+        self, grammar_output: GrammarOutput | None, non_block: bool = False
+    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
+        return self.collective_rpc(
+            "sample_tokens",
+            args=(grammar_output,),
+            unique_reply_rank=self.output_rank,
+            non_block=non_block,
+            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
+            kv_output_aggregator=self.kv_output_aggregator,
+        )
 
     def execute_dummy_batch(self) -> None:
-        self.collective_rpc("execute_dummy_batch",
-                            unique_reply_rank=self.output_rank)
+        self.collective_rpc("execute_dummy_batch", unique_reply_rank=self.output_rank)
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         # OPTIMIZATION: Get output only from a single worker (output_rank)
-        outputs = self.collective_rpc("take_draft_token_ids",
-                                      unique_reply_rank=self.output_rank)
-        return outputs[0]
-
-    def collective_rpc(self,
-                       method: Union[str, Callable],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict] = None,
-                       non_block: bool = False,
-                       unique_reply_rank: Optional[int] = None) -> list[Any]:
+        return self.collective_rpc(
+            "take_draft_token_ids", unique_reply_rank=self.output_rank
+        )
+
+    def collective_rpc(  # type: ignore[override]
+        self,
+        method: str | Callable,
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+        non_block: bool = False,
+        unique_reply_rank: int | None = None,
+        kv_output_aggregator: KVOutputAggregator = None,
+    ) -> Any | list[Any] | Future[Any | list[Any]]:
+        """Returns single result if unique_reply_rank and/or kv_output_aggregator
+        is provided, otherwise list."""
+        assert self.rpc_broadcast_mq is not None, (
+            "collective_rpc should not be called on follower node"
+        )
         if self.is_failed:
             raise RuntimeError("Executor failed.")
 
         deadline = None if timeout is None else time.monotonic() + timeout
         kwargs = kwargs or {}
 
-        # NOTE: If the args are heterogeneous, then we pack them into a list,
-        # and unpack them in the method of every worker, because every worker
-        # knows their own rank.
-        try:
-            if isinstance(method, str):
-                send_method = method
-            else:
-                send_method = cloudpickle.dumps(
-                    method, protocol=pickle.HIGHEST_PROTOCOL)
-            self.rpc_broadcast_mq.enqueue(
-                (send_method, args, kwargs, unique_reply_rank))
+        if kv_output_aggregator is not None:
+            output_rank = None
+            aggregate: Callable[[Any], Any] = partial(
+                kv_output_aggregator.aggregate, output_rank=unique_reply_rank or 0
+            )
+        else:
+            output_rank = unique_reply_rank
+            aggregate = lambda x: x
 
-            workers = (self.workers[unique_reply_rank],
-                       ) if unique_reply_rank is not None else self.workers
-            responses = []
+        if isinstance(method, str):
+            send_method = method
+        else:
+            send_method = cloudpickle.dumps(method, protocol=pickle.HIGHEST_PROTOCOL)
+        self.rpc_broadcast_mq.enqueue((send_method, args, kwargs, output_rank))
 
-            def get_response(w: WorkerProcHandle,
-                             dequeue_timeout: Optional[float] = None,
-                             cancel_event: Optional[threading.Event] = None):
-                status, result = w.worker_response_mq.dequeue(
-                    timeout=dequeue_timeout, cancel=cancel_event)
+        response_mqs: Sequence[MessageQueue] = self.response_mqs
+        if output_rank is not None:
+            response_mqs = (response_mqs[output_rank],)
 
+        shutdown_event = self.shutdown_event
+
+        def get_response():
+            responses = []
+            for mq in response_mqs:
+                dequeue_timeout = (
+                    None if deadline is None else (deadline - time.monotonic())
+                )
+                try:
+                    status, result = mq.dequeue(
+                        timeout=dequeue_timeout, cancel=shutdown_event
+                    )
+                except TimeoutError as e:
+                    raise TimeoutError(f"RPC call to {method} timed out.") from e
                 if status != WorkerProc.ResponseStatus.SUCCESS:
                     raise RuntimeError(
                         f"Worker failed with error '{result}', please check the"
-                        " stack trace above for the root cause")
-                return result
-
-            for w in workers:
-                dequeue_timeout = None if deadline is None else (
-                    deadline - time.monotonic())
-
-                if self.io_thread_pool is not None:
-                    # We must consume worker_response_mq from a single thread.
-                    result = self.io_thread_pool.submit(  # type: ignore
-                        get_response, w, dequeue_timeout, self.shutdown_event)
-                    if not non_block:
-                        result = result.result()
-                elif not non_block:
-                    result = get_response(w, dequeue_timeout,
-                                          self.shutdown_event)
-                else:
-                    raise RuntimeError("non_block can only be used when"
-                                       " max_concurrent_batches > 1")
+                        " stack trace above for the root cause"
+                    )
                 responses.append(result)
+            return responses[0] if output_rank is not None else responses
+
+        if non_block:
+            future = FutureWrapper(self.futures_queue, aggregate=aggregate)
+            self.futures_queue.appendleft((future, get_response))
+            return future
+
+        # First drain any pending futures in the queue.
+        while self.futures_queue:
+            future, get_fut_response = self.futures_queue.pop()
+            future.wait_for_response(get_fut_response)
 
-            return responses
-        except TimeoutError as e:
-            raise TimeoutError(f"RPC call to {method} timed out.") from e
+        return aggregate(get_response())
 
     @staticmethod
     def _ensure_worker_termination(worker_procs: list[BaseProcess]):
@@ -302,11 +387,11 @@ class MultiprocExecutor(Executor):
 
     def shutdown(self):
         """Properly shut down the executor and its workers"""
-        if not getattr(self, 'shutting_down', False):
+        if not getattr(self, "shutting_down", False):
             self.shutting_down = True
 
             # Make sure all the worker processes are terminated first.
-            if workers := getattr(self, 'workers', None):
+            if workers := getattr(self, "workers", None):
                 for w in workers:
                     # Close death_writer to signal child processes to exit
                     if w.death_writer is not None:
@@ -316,9 +401,6 @@ class MultiprocExecutor(Executor):
                 self._ensure_worker_termination([w.proc for w in workers])
 
             self.shutdown_event.set()
-            if self.io_thread_pool is not None:
-                self.io_thread_pool.shutdown(wait=False, cancel_futures=True)
-                del self.io_thread_pool
 
         self.rpc_broadcast_mq = None
 
@@ -348,27 +430,37 @@ class MultiprocExecutor(Executor):
 @dataclass
 class UnreadyWorkerProcHandle:
     """WorkerProcess handle before READY."""
+
     proc: BaseProcess
     rank: int
     ready_pipe: Connection
-    death_writer: Optional[Connection] = None
+    death_writer: Connection | None = None
 
 
 @dataclass
 class WorkerProcHandle:
     proc: BaseProcess
     rank: int
-    worker_response_mq: MessageQueue  # The worker process writes to this MQ
-    death_writer: Optional[Connection] = None
+    # The worker process writes to this MQ in single-node mode
+    worker_response_mq: MessageQueue | None
+    # This is only non empty on driver node,
+    # the peer worker process i writes to MQ
+    # `peer_worker_response_mqs[i]`
+    peer_worker_response_mqs: list[MessageQueue | None]
+    death_writer: Connection | None = None
 
     @classmethod
     def from_unready_handle(
-            cls, unready_handle: UnreadyWorkerProcHandle,
-            worker_response_mq: MessageQueue) -> "WorkerProcHandle":
+        cls,
+        unready_handle: UnreadyWorkerProcHandle,
+        worker_response_mq: MessageQueue | None,
+        peer_worker_response_mqs: list[MessageQueue | None],
+    ) -> "WorkerProcHandle":
         return cls(
             proc=unready_handle.proc,
             rank=unready_handle.rank,
             worker_response_mq=worker_response_mq,
+            peer_worker_response_mqs=peer_worker_response_mqs,
             death_writer=unready_handle.death_writer,
         )
 
@@ -378,6 +470,38 @@ class WorkerProc:
 
     READY_STR = "READY"
 
+    def _init_message_queues(
+        self, input_shm_handle: Handle, vllm_config: VllmConfig
+    ) -> None:
+        if vllm_config.parallel_config.nnodes_within_dp == 1:
+            # Initialize MessageQueue for receiving SchedulerOutput
+            self.rpc_broadcast_mq = MessageQueue.create_from_handle(
+                input_shm_handle, self.worker.rank
+            )
+
+            # Initializes a message queue for sending the model output
+            self.worker_response_mq: MessageQueue = MessageQueue(1, 1)
+            self.peer_response_handles = []
+        else:
+            # Initialize remote MessageQueue for receiving SchedulerOutput across nodes
+            self.rpc_broadcast_mq = get_inner_dp_world_group().create_mq_broadcaster(
+                external_writer_handle=input_shm_handle,
+                # Since there is external_writer_handle from executor proc,
+                # where the ready signal from actual writer is sent out of the
+                # create_mq_broadcaster method and after this setup, we make it
+                # non blocking. The handshake will be triggered when
+                # worker.rpc_broadcast_mq.wait_until_ready() is called
+                blocking=False,
+            )
+            # Initializes remote message queue for sending the model output to the
+            # driver worker, exposing peer_response_handles for driver worker
+            # that include handles for all ranks
+            self.worker_response_mq, self.peer_response_handles = (
+                get_inner_dp_world_group().create_single_reader_mq_broadcasters(
+                    reader_rank_in_group=0
+                )
+            )
+
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -388,30 +512,25 @@ class WorkerProc:
         shared_worker_lock: LockType,
     ):
         self.rank = rank
-        wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank)
+        wrapper = WorkerWrapperBase(
+            vllm_config=vllm_config, rpc_rank=local_rank, global_rank=rank
+        )
         # TODO: move `init_worker` to executor level as a collective rpc call
         all_kwargs: list[dict] = [
             {} for _ in range(vllm_config.parallel_config.world_size)
         ]
-        is_driver_worker = (
-            rank % vllm_config.parallel_config.tensor_parallel_size == 0)
-        all_kwargs[rank] = {
+        is_driver_worker = rank % vllm_config.parallel_config.tensor_parallel_size == 0
+        all_kwargs[local_rank] = {
             "vllm_config": vllm_config,
             "local_rank": local_rank,
             "rank": rank,
             "distributed_init_method": distributed_init_method,
             "is_driver_worker": is_driver_worker,
+            "shared_worker_lock": shared_worker_lock,
         }
         wrapper.init_worker(all_kwargs)
         self.worker = wrapper
 
-        # Initialize MessageQueue for receiving SchedulerOutput
-        self.rpc_broadcast_mq = MessageQueue.create_from_handle(
-            input_shm_handle, self.worker.rank)
-
-        # Initializes a message queue for sending the model output
-        self.worker_response_mq = MessageQueue(1, 1)
-
         scheduler_config = vllm_config.scheduler_config
         self.use_async_scheduling = scheduler_config.async_scheduling
         if self.use_async_scheduling:
@@ -419,23 +538,26 @@ class WorkerProc:
             self.async_output_copy_thread = Thread(
                 target=self.async_output_busy_loop,
                 daemon=True,
-                name="WorkerAsyncOutputCopy")
+                name="WorkerAsyncOutputCopy",
+            )
             self.async_output_copy_thread.start()
 
-        # Initialize multimodal receiver cache if needed
-        self.mm_receiver_cache = worker_receiver_cache_from_config(
-            vllm_config, MULTIMODAL_REGISTRY, shared_worker_lock)
-
         # Initialize device
         self.worker.init_device()
 
         # Set process title and log prefix
         self.setup_proc_title_and_log_prefix(
-            enable_ep=vllm_config.parallel_config.enable_expert_parallel)
+            enable_ep=vllm_config.parallel_config.enable_expert_parallel
+        )
 
         # Load model
+        self._init_message_queues(input_shm_handle, vllm_config)
         self.worker.load_model()
 
+        # Enable environment variable cache (e.g. assume no more
+        # environment variable overrides after this point)
+        enable_envs_cache()
+
     @staticmethod
     def make_worker_process(
         vllm_config: VllmConfig,
@@ -463,10 +585,12 @@ class WorkerProc:
             "shared_worker_lock": shared_worker_lock,
         }
         # Run EngineCore busy loop in background process.
-        proc = context.Process(target=WorkerProc.worker_main,
-                               kwargs=process_kwargs,
-                               name=f"VllmWorker-{rank}",
-                               daemon=True)
+        proc = context.Process(
+            target=WorkerProc.worker_main,
+            kwargs=process_kwargs,
+            name=f"VllmWorker-{rank}",
+            daemon=True,
+        )
 
         proc.start()
         writer.close()
@@ -474,18 +598,41 @@ class WorkerProc:
         # death_reader in child will get EOFError
         return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
 
+    @staticmethod
+    def wait_for_response_handle_ready(
+        handles: dict[str, Any], proc_handle: UnreadyWorkerProcHandle
+    ) -> WorkerProcHandle:
+        response_handle = handles["handle"]
+        worker_response_mq: MessageQueue | None = None
+        if len(response_handle.local_reader_ranks) > 0:
+            worker_response_mq = MessageQueue.create_from_handle(response_handle, 0)
+        peer_response_handles = handles["peer_response_handles"]
+        peer_worker_response_mqs = [
+            MessageQueue.create_from_handle(handle, -1)
+            if handle.remote_subscribe_addr is not None
+            else None
+            for handle in peer_response_handles
+        ]
+        return WorkerProcHandle.from_unready_handle(
+            proc_handle,
+            worker_response_mq,
+            peer_worker_response_mqs=peer_worker_response_mqs,
+        )
+
     @staticmethod
     def wait_for_ready(
-        unready_proc_handles: list[UnreadyWorkerProcHandle]
+        unready_proc_handles: list[UnreadyWorkerProcHandle],
     ) -> list[WorkerProcHandle]:
-
-        e = Exception("WorkerProc initialization failed due to "
-                      "an exception in a background process. "
-                      "See stack trace for root cause.")
+        e = Exception(
+            "WorkerProc initialization failed due to "
+            "an exception in a background process. "
+            "See stack trace for root cause."
+        )
 
         pipes = {handle.ready_pipe: handle for handle in unready_proc_handles}
-        ready_proc_handles: list[Optional[WorkerProcHandle]] = (
-            [None] * len(unready_proc_handles))
+        ready_proc_handles: list[WorkerProcHandle | None] = [None] * len(
+            unready_proc_handles
+        )
         while pipes:
             ready = multiprocessing.connection.wait(pipes.keys())
             for pipe in ready:
@@ -497,13 +644,10 @@ class WorkerProc:
                     if response["status"] != "READY":
                         raise e
 
-                    # Extract the message queue handle.
-                    worker_response_mq = MessageQueue.create_from_handle(
-                        response["handle"], 0)
-                    ready_proc_handles[unready_proc_handle.rank] = (
-                        WorkerProcHandle.from_unready_handle(
-                            unready_proc_handle, worker_response_mq))
-
+                    idx = unready_proc_handle.rank % len(ready_proc_handles)
+                    ready_proc_handles[idx] = WorkerProc.wait_for_response_handle_ready(
+                        response, unready_proc_handle
+                    )
                 except EOFError:
                     e.__suppress_context__ = True
                     raise e from None
@@ -523,8 +667,8 @@ class WorkerProc:
 
     @staticmethod
     def worker_main(*args, **kwargs):
-        """ Worker initialization and execution loops.
-        This runs a background process """
+        """Worker initialization and execution loops.
+        This runs a background process"""
 
         # Signal handler used for graceful termination.
         # SystemExit exception is only raised once to allow this and worker
@@ -561,9 +705,9 @@ class WorkerProc:
                 except Exception as e:
                     logger.warning("Death monitoring error: %s", e)
 
-            death_monitor = Thread(target=monitor_parent_death,
-                                   daemon=True,
-                                   name="WorkerDeathMonitor")
+            death_monitor = Thread(
+                target=monitor_parent_death, daemon=True, name="WorkerDeathMonitor"
+            )
             death_monitor.start()
 
         try:
@@ -571,16 +715,18 @@ class WorkerProc:
             worker = WorkerProc(*args, **kwargs)
 
             # Send READY once we know everything is loaded
-            ready_writer.send({
-                "status":
-                WorkerProc.READY_STR,
-                "handle":
-                worker.worker_response_mq.export_handle(),
-            })
+            ready_writer.send(
+                {
+                    "status": WorkerProc.READY_STR,
+                    "handle": worker.worker_response_mq.export_handle(),
+                    "peer_response_handles": worker.peer_response_handles,
+                }
+            )
 
             # Ensure message queues are ready. Will deadlock if re-ordered.
             # Must be kept consistent with the Executor
-            worker.rpc_broadcast_mq.wait_until_ready()
+            if worker.rpc_broadcast_mq is not None:
+                worker.rpc_broadcast_mq.wait_until_ready()
             worker.worker_response_mq.wait_until_ready()
             ready_writer.close()
             ready_writer = None
@@ -649,20 +795,18 @@ class WorkerProc:
             output = self.async_output_queue.get()
             self.enqueue_output(output)
 
-    def worker_busy_loop(self, cancel: Optional[threading.Event] = None):
+    def worker_busy_loop(self, cancel: threading.Event | None = None):
         """Main busy loop for Multiprocessing Workers"""
         while True:
             method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
-                cancel=cancel, indefinite=True)
+                cancel=cancel, indefinite=True
+            )
             try:
                 if isinstance(method, str):
                     func = getattr(self.worker, method)
                 elif isinstance(method, bytes):
                     func = partial(cloudpickle.loads(method), self.worker)
-                # retrieve from shm cache if available
-                if self.mm_receiver_cache is not None \
-                    and func.__name__ == "execute_model":
-                    get_and_update_mm_cache(self.mm_receiver_cache, args)
+
                 output = func(*args, **kwargs)
             except Exception as e:
                 # Notes have been introduced in python 3.11
@@ -686,6 +830,8 @@ class WorkerProc:
         pp_rank = get_pp_group().rank_in_group
         tp_size = get_tp_group().world_size
         tp_rank = get_tp_group().rank_in_group
+        dcp_size = get_dcp_group().world_size
+        dcp_rank = get_dcp_group().rank_in_group
         process_name = "Worker"
         if dp_size > 1:
             process_name += f"_DP{dp_rank}"
@@ -693,6 +839,8 @@ class WorkerProc:
             process_name += f"_PP{pp_rank}"
         if tp_size > 1:
             process_name += f"_TP{tp_rank}"
+        if dcp_size > 1:
+            process_name += f"_DCP{dcp_rank}"
         if enable_ep:
             ep_rank = get_ep_group().rank_in_group
             process_name += f"_EP{ep_rank}"
@@ -701,8 +849,8 @@ class WorkerProc:
 
 
 def set_multiprocessing_worker_envs():
-    """ Set up environment variables that should be used when there are workers
-    in a multiprocessing environment. This should be called by the parent 
+    """Set up environment variables that should be used when there are workers
+    in a multiprocessing environment. This should be called by the parent
     process before worker processes are created"""
 
     _maybe_force_spawn()
@@ -714,13 +862,16 @@ def set_multiprocessing_worker_envs():
     # impact on performance. The contention is amplified when running in a
     # container where CPU limits can cause throttling.
     default_omp_num_threads = 1
-    if "OMP_NUM_THREADS" not in os.environ and (
-            current_parallelism :=
-            torch.get_num_threads()) > default_omp_num_threads:
+    if (
+        "OMP_NUM_THREADS" not in os.environ
+        and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
+    ):
         logger.warning(
             "Reducing Torch parallelism from %d threads to %d to avoid "
             "unnecessary CPU contention. Set OMP_NUM_THREADS in the "
             "external environment to tune this value as needed.",
-            current_parallelism, default_omp_num_threads)
+            current_parallelism,
+            default_omp_num_threads,
+        )
         os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
         torch.set_num_threads(default_omp_num_threads)
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py
index aadb5fd1dddd5d88080999aae150188d5b91c948..9a56c093ad697a573593e5529f2fc343d5400315 100644
--- a/vllm/v1/executor/ray_distributed_executor.py
+++ b/vllm/v1/executor/ray_distributed_executor.py
@@ -1,108 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.executor.ray_executor import (
+    RayDistributedExecutor as _RayDistributedExecutor,
+)
 
-from concurrent.futures import Future
-from typing import Optional, Union
-
-from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
-from vllm.executor.ray_distributed_executor import (  # noqa
-    RayDistributedExecutor as RayDistributedExecutorV0)
-from vllm.logger import init_logger
-from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
-from vllm.v1.executor.abstract import Executor
-from vllm.v1.outputs import ModelRunnerOutput
-
-logger = init_logger(__name__)
-
-
-class FutureWrapper(Future):
-    """A wrapper around Ray output reference to meet the interface
-    of .execute_model(): The top level (core busy loop) expects .result() api 
-    to block and return a single output.
-    
-    If aggregator is provided, the outputs from all workers are aggregated upon 
-    the result() call. If not only the first worker's output is returned.
-    """
-
-    def __init__(self, refs, aggregator: Optional[KVOutputAggregator] = None):
-        super().__init__()
-        self.refs = refs
-        self.aggregator = aggregator
-
-    def result(self, timeout=None):
-        if timeout is not None:
-            raise NotImplementedError("timeout is not supported")
-
-        if self.aggregator is None:
-            return self.refs[0].get()
-
-        outputs = [ref.get() for ref in self.refs]
-        return self.aggregator.aggregate(outputs, output_rank=0)
-
-
-class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
-    """Ray distributed executor using Ray Compiled Graphs."""
-
-    supports_pp: bool = True
-
-    def _init_executor(self) -> None:
-        super()._init_executor()
-
-        # KV connector setup
-        self.has_connector = self.vllm_config.kv_transfer_config is not None
-
-    @property
-    def max_concurrent_batches(self) -> int:
-        """Ray distributed executor supports pipeline parallelism,
-        meaning that it allows PP size batches to be executed concurrently.
-        """
-        if self.scheduler_config.async_scheduling:
-            return 2
-        return self.parallel_config.pipeline_parallel_size
-
-    def execute_model(
-        self,
-        scheduler_output: SchedulerOutput,
-        non_block: bool = False,
-    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
-        """Execute the model on the Ray workers.
-
-        Args:
-            scheduler_output: The scheduler output to execute.
-            non_block: If True, the method will return a Future.
-
-        Returns:
-            The model runner output.
-        """
-        # Build the compiled DAG for the first time.
-        if self.forward_dag is None:  # type: ignore
-            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
-
-        refs = self.forward_dag.execute(scheduler_output)  # type: ignore
-
-        if not self.has_connector:
-            # Get output only from a single worker (output_rank)
-            # When PP is not used, we block here until the result is available.
-            if not non_block:
-                return refs[0].get()
-
-            # When PP is used, we return a FutureWrapper immediately so that
-            # the scheduler can yield to the next batch.
-            return FutureWrapper(refs)
-
-        # Get output from all workers when connector is present
-        if not non_block:
-            # Block and get results from all workers
-            outputs = [ref.get() for ref in refs]
-            return self.kv_output_aggregator.aggregate(outputs)
-
-        # Return a future that will aggregate outputs from all workers
-        return FutureWrapper(refs, self.kv_output_aggregator)
-
-    def reinitialize_distributed(
-            self, reconfig_request: ReconfigureDistributedRequest) -> None:
-        self._run_workers("reinitialize_distributed", reconfig_request)
-        if reconfig_request.new_data_parallel_rank == \
-        ReconfigureRankType.SHUTDOWN_CURRENT_RANK:
-            self.shutdown()
+# For backwards compatibility.
+RayDistributedExecutor = _RayDistributedExecutor
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..406eafcd339b0461cd9762133b9eeb30353cd1fb
--- /dev/null
+++ b/vllm/v1/executor/ray_executor.py
@@ -0,0 +1,626 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from collections import defaultdict
+from collections.abc import Callable
+from concurrent.futures import Future
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import cloudpickle
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.ray.ray_env import get_env_vars_to_copy
+from vllm.utils.network_utils import (
+    get_distributed_init_method,
+    get_ip,
+    get_open_port,
+)
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
+from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor.ray_utils import (
+    FutureWrapper,
+    RayWorkerWrapper,
+    initialize_ray_cluster,
+    ray,
+)
+from vllm.v1.outputs import ModelRunnerOutput
+
+if ray is not None:
+    from ray.actor import ActorHandle
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+else:
+    ActorHandle = None
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+logger = init_logger(__name__)
+
+COMPLETED_NONE_FUTURE: Future[ModelRunnerOutput | None] = Future()
+COMPLETED_NONE_FUTURE.set_result(None)
+
+
+@dataclass
+class RayWorkerMetaData:
+    """
+    Metadata for a Ray worker.
+    The order of ray worker creation can be random,
+    and we need to reset the rank after creating all workers.
+    """
+
+    worker: ActorHandle
+    created_rank: int
+    adjusted_rank: int = -1
+    ip: str = ""
+
+
+class RayDistributedExecutor(Executor):
+    """Ray-based distributed executor"""
+
+    # These env vars are worker-specific, therefore are NOT copied
+    # from the driver to the workers
+    WORKER_SPECIFIC_ENV_VARS = {
+        "VLLM_HOST_IP",
+        "VLLM_HOST_PORT",
+        "LOCAL_RANK",
+        "CUDA_VISIBLE_DEVICES",
+    }
+
+    # These non-vLLM env vars are copied from the driver to workers
+    ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}
+
+    uses_ray: bool = True
+    supports_pp: bool = True
+
+    def _init_executor(self) -> None:
+        self.forward_dag: ray.dag.CompiledDAG | None = None
+
+        # For TPU or XPU, avoid compiling NVIDIA's NCCL
+        if current_platform.is_tpu() or current_platform.is_xpu():
+            os.environ["VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"] = "shm"
+
+        assert self.uses_ray
+        initialize_ray_cluster(self.parallel_config)
+        placement_group = self.parallel_config.placement_group
+
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+
+        # Create the parallel GPU workers.
+        self._init_workers_ray(placement_group)
+
+        # KV connector setup
+        self.has_connector = self.vllm_config.kv_transfer_config is not None
+
+        self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and (
+            self.vllm_config.ec_transfer_config is None
+            or not self.vllm_config.ec_transfer_config.is_ec_producer
+        )
+
+        self.scheduler_output: SchedulerOutput | None = None
+
+    @property
+    def max_concurrent_batches(self) -> int:
+        """Ray distributed executor supports pipeline parallelism,
+        meaning that it allows PP size batches to be executed concurrently.
+        """
+        if self.scheduler_config.async_scheduling:
+            return 2
+        return self.parallel_config.pipeline_parallel_size
+
+    def shutdown(self) -> None:
+        if logger:
+            # Somehow logger can be None here.
+            logger.info(
+                "Shutting down Ray distributed executor. If you see error log "
+                "from logging.cc regarding SIGTERM received, please ignore "
+                "because this is the expected termination process in Ray."
+            )
+        if hasattr(self, "forward_dag") and self.forward_dag is not None:
+            self.forward_dag.teardown()
+            import ray
+
+            for worker in self.workers:
+                ray.kill(worker)
+            self.forward_dag = None
+
+    def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> dict[str, Any]:
+        # If nsight profiling is enabled, we need to set the profiling
+        # configuration for the ray workers as runtime env.
+        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
+        runtime_env.update(
+            {
+                "nsight": {
+                    "t": "cuda,cudnn,cublas",
+                    "o": "'worker_process_%p'",
+                    "cuda-graph-trace": "node",
+                }
+            }
+        )
+
+        return ray_remote_kwargs
+
+    # child class could overwrite this to return actual env vars.
+    def _get_env_vars_to_be_updated(self):
+        return self._env_vars_for_all_workers
+
+    def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs):
+        num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS
+
+        # The driver dummy worker does not actually use any resources.
+        # It holds the resource for the driver worker.
+        self.driver_dummy_worker: RayWorkerWrapper | None = None
+        # The remaining workers are the actual ray actors.
+        self.workers: list[RayWorkerWrapper] = []
+
+        # Used in ray compiled DAG: indexed first by PP rank,
+        # and then TP rank. In other words, the inner list is
+        # the TP group of workers for a PP rank.
+        self.pp_tp_workers: list[list[RayWorkerWrapper]] = []
+
+        if self.parallel_config.ray_workers_use_nsight:
+            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
+                ray_remote_kwargs
+            )
+
+        # Create the workers.
+        bundle_indices: list[int]
+        if envs.VLLM_RAY_BUNDLE_INDICES:
+            # Use the bundle indices specified by the user.
+            bundle_indices = list(map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(",")))
+            assert len(bundle_indices) == self.parallel_config.world_size, (
+                "VLLM_RAY_BUNDLE_INDICES must have the same size"
+                f" as the world size, but got {bundle_indices=} "
+                f"and {self.parallel_config.world_size=}"
+            )
+            assert len(set(bundle_indices)) == len(bundle_indices), (
+                "VLLM_RAY_BUNDLE_INDICES cannot have duplicate values,"
+                f" but got {bundle_indices=}"
+            )
+        else:
+            # use the first N bundles that have GPU resources.
+            bundle_indices = []
+            for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+                if bundle.get(current_platform.ray_device_key, 0):
+                    bundle_indices.append(bundle_id)
+            bundle_indices = bundle_indices[: self.parallel_config.world_size]
+
+        worker_metadata: list[RayWorkerMetaData] = []
+        driver_ip = get_ip()
+        for rank, bundle_id in enumerate(bundle_indices):
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+
+            if current_platform.ray_device_key == "GPU":
+                # NV+AMD GPUs, and Intel XPUs
+                worker = ray.remote(
+                    num_cpus=0,
+                    num_gpus=num_gpus,
+                    scheduling_strategy=scheduling_strategy,
+                    **ray_remote_kwargs,
+                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
+                    vllm_config=self.vllm_config, rpc_rank=rank
+                )
+            else:
+                worker = ray.remote(
+                    num_cpus=0,
+                    num_gpus=0,
+                    resources={current_platform.ray_device_key: num_gpus},
+                    scheduling_strategy=scheduling_strategy,
+                    **ray_remote_kwargs,
+                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
+                    vllm_config=self.vllm_config, rpc_rank=rank
+                )
+            worker_metadata.append(RayWorkerMetaData(worker=worker, created_rank=rank))
+
+        worker_ips = ray.get(
+            [
+                each.worker.get_node_ip.remote()  # type: ignore[attr-defined]
+                for each in worker_metadata
+            ]
+        )
+
+        for each, ip in zip(worker_metadata, worker_ips):
+            each.ip = ip
+
+        logger.debug("workers: %s", worker_metadata)
+        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
+
+        ip_counts: dict[str, int] = {}
+        for ip in worker_ips:
+            ip_counts[ip] = ip_counts.get(ip, 0) + 1
+
+        def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
+            """
+            Sort the workers based on 3 properties:
+            1. If the worker is on the same node as the driver (vllm engine),
+                it should be placed first.
+            2. Then, if the worker is on a node with fewer workers, it should
+                be placed first.
+            3. Finally, if the work is on a node with smaller IP address, it
+                should be placed first.
+            """
+            ip = item.ip
+            return 0 if ip == driver_ip else 1, ip_counts[ip], ip
+
+        # After sorting, the workers on the same node will be
+        # close to each other, and the workers on the driver
+        # node will be placed first.
+        sorted_worker_metadata = sorted(
+            worker_metadata, key=sort_by_driver_then_worker_ip
+        )
+        for i, item in enumerate(sorted_worker_metadata):
+            item.adjusted_rank = i
+        self.workers = [item.worker for item in sorted_worker_metadata]
+        rerank_mapping = {
+            item.created_rank: item.adjusted_rank for item in sorted_worker_metadata
+        }
+        self.collective_rpc("adjust_rank", args=(rerank_mapping,))
+
+        # Get the set of GPU IDs used on each node.
+        worker_node_and_gpu_ids = []
+        for worker in [self.driver_dummy_worker] + self.workers:
+            if worker is None:
+                # driver_dummy_worker can be None when using ray spmd worker.
+                continue
+            worker_node_and_gpu_ids.append(
+                ray.get(worker.get_node_and_gpu_ids.remote())
+            )  # type: ignore[attr-defined]
+
+        node_workers = defaultdict(list)  # node id -> list of worker ranks
+        node_gpus = defaultdict(list)  # node id -> list of gpu ids
+
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+            node_workers[node_id].append(i)
+            # `gpu_ids` can be a list of strings or integers.
+            # convert them to integers for consistency.
+            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
+            # string sorting is not sufficient.
+            # see https://github.com/vllm-project/vllm/issues/5590
+            gpu_ids = [int(x) for x in gpu_ids]
+            node_gpus[node_id].extend(gpu_ids)
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+
+        all_ips = set(worker_ips + [driver_ip])
+        n_ips = len(all_ips)
+        n_nodes = len(node_workers)
+
+        if n_nodes != n_ips:
+            raise RuntimeError(
+                f"Every node should have a unique IP address. Got {n_nodes}"
+                f" nodes with node ids {list(node_workers.keys())} and "
+                f"{n_ips} unique IP addresses {all_ips}. Please check your"
+                " network configuration. If you set `VLLM_HOST_IP`"
+                " environment variable, make sure it is unique for"
+                " each node."
+            )
+
+        # Set environment variables for the driver and workers.
+        all_args_to_update_environment_variables = [
+            {
+                current_platform.device_control_env_var: ",".join(
+                    map(str, node_gpus[node_id])
+                ),
+            }
+            for (node_id, _) in worker_node_and_gpu_ids
+        ]
+
+        # Environment variables to copy from driver to workers
+        env_vars_to_copy = get_env_vars_to_copy(
+            exclude_vars=self.WORKER_SPECIFIC_ENV_VARS,
+            additional_vars=set(current_platform.additional_env_vars).union(
+                self.ADDITIONAL_ENV_VARS
+            ),
+            destination="workers",
+        )
+
+        # Copy existing env vars to each worker's args
+        for args in all_args_to_update_environment_variables:
+            # TODO: refactor platform-specific env vars
+            for name in env_vars_to_copy:
+                if name in os.environ:
+                    args[name] = os.environ[name]
+
+        self._env_vars_for_all_workers = all_args_to_update_environment_variables
+
+        self.collective_rpc(
+            "update_environment_variables", args=(self._get_env_vars_to_be_updated(),)
+        )
+
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port()
+        )
+
+        # Initialize the actual workers inside worker wrapper.
+        all_kwargs = []
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
+            local_rank = node_workers[node_id].index(rank)
+            kwargs = dict(
+                vllm_config=self.vllm_config,
+                local_rank=local_rank,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                is_driver_worker=(not self.parallel_config)
+                or (rank % self.parallel_config.tensor_parallel_size == 0),
+            )
+            all_kwargs.append(kwargs)
+        self.collective_rpc("init_worker", args=(all_kwargs,))
+
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+
+        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
+            self.pp_tp_workers.append([])
+            for tp_rank in range(self.parallel_config.tensor_parallel_size):
+                # PP=2, TP=4
+                # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
+                rank = (pp_rank * self.parallel_config.tensor_parallel_size) + tp_rank
+                assert len(self.pp_tp_workers[pp_rank]) == tp_rank
+                assert pp_rank < len(self.pp_tp_workers)
+                self.pp_tp_workers[pp_rank].append(self.workers[rank])
+
+    def reinitialize_distributed(
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
+        self.collective_rpc("reinitialize_distributed", args=(reconfig_request,))
+        if (
+            reconfig_request.new_data_parallel_rank
+            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
+        ):
+            self.shutdown()
+
+    def execute_model(  # type: ignore[override]
+        self,
+        scheduler_output: SchedulerOutput,
+        non_block: bool = False,
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        if self.scheduler_output is not None:
+            raise RuntimeError(
+                "State error: sample_tokens() must be called "
+                "after execute_model() returns None."
+            )
+
+        if not self.uses_sampler or not scheduler_output.total_num_scheduled_tokens:
+            # Model will not execute, call model runner immediately.
+            return self._execute_dag(scheduler_output, None, non_block)
+
+        # Model will execute, defer to sample_tokens() call.
+        self.scheduler_output = scheduler_output
+        return COMPLETED_NONE_FUTURE if non_block else None
+
+    def sample_tokens(  # type: ignore[override]
+        self,
+        grammar_output: "GrammarOutput | None",
+        non_block: bool = False,
+    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
+        """Execute the model on the Ray workers.
+
+        The scheduler output to use should have been provided in
+        a prior call to execute_model().
+
+        Args:
+            grammar_output: The structured outputs grammar bitmask, if applicable.
+            non_block: If True, the method will return a Future.
+
+        Returns:
+            The model runner output.
+        """
+        scheduler_output = self.scheduler_output
+        if scheduler_output is None:
+            return COMPLETED_NONE_FUTURE if non_block else None  # noqa
+
+        self.scheduler_output = None
+
+        return self._execute_dag(scheduler_output, grammar_output, non_block)
+
+    def _execute_dag(
+        self,
+        scheduler_output: SchedulerOutput,
+        grammar_output: "GrammarOutput | None",
+        non_block: bool = False,
+    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
+        # Build the compiled DAG for the first time.
+        if self.forward_dag is None:  # type: ignore
+            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
+
+        refs = self.forward_dag.execute((scheduler_output, grammar_output))  # type: ignore
+
+        if not self.has_connector:
+            # Get output only from a single worker (output_rank)
+            # When PP is not used, we block here until the result is available.
+            if not non_block:
+                return refs[0].get()
+
+            # When PP is used, we return a FutureWrapper immediately so that
+            # the scheduler can yield to the next batch.
+            return FutureWrapper(refs[0])
+
+        # Get output from all workers when connector is present
+        assert self.kv_output_aggregator is not None
+        if not non_block:
+            # Block and get results from all workers
+            return self.kv_output_aggregator.aggregate(ray.get(refs))
+
+        # Return a future that will aggregate outputs from all workers
+        return FutureWrapper(refs, self.kv_output_aggregator)
+
+    def collective_rpc(  # type: ignore[override]
+        self,
+        method: str | Callable,
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict[str, Any] | None = None,
+        non_block: bool = False,
+    ) -> list[Any] | Future[list[Any]]:
+        """Runs the given method on all workers."""
+        sent_method = method if isinstance(method, str) else cloudpickle.dumps(method)
+        del method
+
+        if kwargs is None:
+            kwargs = {}
+        ray_worker_outputs = [
+            worker.execute_method.remote(  # type: ignore[attr-defined]
+                sent_method, *args, **kwargs
+            )
+            for worker in self.workers
+        ]
+
+        # Get the results of the ray workers.
+        if non_block:
+            return FutureWrapper(ray_worker_outputs)
+
+        return ray.get(ray_worker_outputs, timeout=timeout)
+
+    def _check_ray_cgraph_installation(self):
+        import importlib.metadata
+
+        from packaging import version
+
+        required_version = version.parse("2.43.0")
+        current_version = version.parse(importlib.metadata.version("ray"))
+        if current_version < required_version:
+            raise ValueError(
+                f"Ray version {required_version} is "
+                f"required, but found {current_version}"
+            )
+
+        import importlib.util
+
+        cgraph_spec = importlib.util.find_spec("ray.experimental.compiled_dag_ref")
+        if cgraph_spec is None:
+            raise ValueError(
+                "Ray Compiled Graph is not installed. "
+                "Run `pip install ray[cgraph]` to install it."
+            )
+
+        cupy_spec = importlib.util.find_spec("cupy")
+        if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "nccl":
+            raise ValueError(
+                "cupy is not installed but required since "
+                "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE is set to 'nccl'. "
+                "Run `pip install ray[cgraph]` and check cupy installation."
+            )
+
+    def _compiled_ray_dag(self, enable_asyncio: bool):
+        assert self.parallel_config.use_ray
+        self._check_ray_cgraph_installation()
+        # Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
+        # (it is 10 seconds by default). This is a Ray environment variable to
+        # control the timeout of getting result from a compiled graph execution,
+        # i.e., the distributed execution that includes model forward runs and
+        # intermediate tensor communications, in the case of vllm.
+        # Note: we should set this env var before importing
+        # ray.dag, otherwise it will not take effect.
+        os.environ.setdefault("RAY_CGRAPH_get_timeout", "300")  # noqa: SIM112
+        from ray.dag import InputNode, MultiOutputNode
+
+        logger.info(
+            "RAY_CGRAPH_get_timeout is set to %s",
+            os.environ["RAY_CGRAPH_get_timeout"],  # noqa: SIM112
+        )
+        logger.info(
+            "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE = %s",
+            envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE,
+        )
+        logger.info(
+            "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
+            envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM,
+        )
+
+        channel_type = envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
+        if channel_type not in ("auto", "nccl", "shm"):
+            raise ValueError(
+                "Invalid value for VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: "
+                f"{channel_type}. Valid values are: 'auto', 'nccl', or 'shm'."
+            )
+
+        with InputNode() as input_data:
+            # Example DAG: PP=2, TP=4
+            #
+            # SchedulerOutput -> 0 -> (SchedulerOutput, IntermediateTensors) -> 4 -> ModelRunnerOutput   # noqa: E501
+            # SchedulerOutput -> 1 -> (SchedulerOutput, IntermediateTensors) -> 5 -> ModelRunnerOutput   # noqa: E501
+            # SchedulerOutput -> 2 -> (SchedulerOutput, IntermediateTensors) -> 6 -> ModelRunnerOutput   # noqa: E501
+            # SchedulerOutput -> 3 -> (SchedulerOutput, IntermediateTensors) -> 7 -> ModelRunnerOutput   # noqa: E501
+
+            # All workers in the first TP group will take in the
+            # ExecuteModelRequest as input.
+            outputs = [input_data for _ in self.pp_tp_workers[0]]
+            for pp_rank, tp_group in enumerate(self.pp_tp_workers):
+                # Each PP worker takes in the output of the previous PP worker,
+                # and the TP group executes in SPMD fashion.
+                outputs = [
+                    worker.execute_model_ray.bind(outputs[i])  # type: ignore[attr-defined]
+                    for i, worker in enumerate(tp_group)
+                ]
+
+                last_pp_rank = len(self.pp_tp_workers) - 1
+                if (
+                    pp_rank < last_pp_rank
+                    and envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE != "shm"
+                ):
+                    # Specify how intermediate tensors should be passed
+                    # between pp stages, no need to specify for the last
+                    # pp stage or when using shared memory (the default).
+                    transport = envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
+                    outputs = [
+                        output.with_tensor_transport(transport=transport)
+                        for output in outputs
+                    ]
+
+            forward_dag = MultiOutputNode(outputs)
+
+        if envs.VLLM_USE_RAY_WRAPPED_PP_COMM:
+            from ray.experimental.channel.accelerator_context import (
+                register_accelerator_context,
+            )
+
+            from vllm.distributed.device_communicators.ray_communicator import (
+                RayPPCommunicator,
+            )
+
+            register_accelerator_context(
+                torch_module_name="cuda", communicator_cls=RayPPCommunicator
+            )
+            logger.info(
+                "Using RayPPCommunicator "
+                "(which wraps vLLM _PP GroupCoordinator) "
+                "for Ray Compiled Graph communication."
+            )
+        else:
+            logger.info(
+                "Using Ray's NCCL communicator for Ray Compiled Graph communication."
+            )
+
+        return forward_dag.experimental_compile(
+            enable_asyncio=enable_asyncio,
+            _overlap_gpu_communication=envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM,
+        )
+
+    def __del__(self):
+        self.shutdown()
+
+    def check_health(self) -> None:
+        # Assume that the Ray workers are healthy.
+        # TODO: check the health of the Ray workers
+        return
diff --git a/vllm/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
similarity index 64%
rename from vllm/executor/ray_utils.py
rename to vllm/v1/executor/ray_utils.py
index 0bdeb28569892dd7d8cbedb0000529a8201c6ec3..21910d1160bd490238dcd3f2295345cf6f2d2e96 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -4,22 +4,22 @@
 import os
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
-
-import msgspec
+from concurrent.futures import Future
+from typing import TYPE_CHECKING, Union
 
 import vllm.platforms
 from vllm.config import ParallelConfig
 from vllm.distributed import get_pp_group
-from vllm.executor.msgspec_utils import decode_hook, encode_hook
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.sequence import ExecuteModelRequest, IntermediateTensors
-from vllm.utils import get_ip
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.sequence import IntermediateTensors
+from vllm.utils.network_utils import get_ip
+from vllm.v1.outputs import AsyncModelRunnerOutput
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
     from vllm.v1.outputs import ModelRunnerOutput
 
 logger = init_logger(__name__)
@@ -29,11 +29,13 @@ try:
     import ray
     from ray.util import placement_group_table
     from ray.util.placement_group import PlacementGroup
+
     try:
         from ray._private.state import available_resources_per_node
     except ImportError:
         # Ray 2.9.x doesn't expose `available_resources_per_node`
         from ray._private.state import state as _state
+
         available_resources_per_node = _state._available_resources_per_node
 
     class RayWorkerWrapper(WorkerWrapperBase):
@@ -48,61 +50,20 @@ try:
             # that thread.
             self.compiled_dag_cuda_device_set = False
 
-            self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
-                                                         dec_hook=decode_hook)
-            self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
-
         def get_node_ip(self) -> str:
             return get_ip()
 
-        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
+        def get_node_and_gpu_ids(self) -> tuple[str, list[int]]:
             node_id = ray.get_runtime_context().get_node_id()
             device_key = vllm.platforms.current_platform.ray_device_key
             if not device_key:
-                raise RuntimeError("current platform %s does not support ray.",
-                                   vllm.platforms.current_platform.device_name)
-            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
-            )[device_key]
+                raise RuntimeError(
+                    "current platform %s does not support ray.",
+                    vllm.platforms.current_platform.device_name,
+                )
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids()[device_key]
             return node_id, gpu_ids
 
-        def execute_model_spmd(
-            self, req_or_tuple: Union[bytes,
-                                      Tuple[bytes,
-                                            Optional[IntermediateTensors]]]
-        ) -> bytes:
-            """Execute model in SPMD fashion: used only when SPMD worker and
-            compiled DAG are both enabled.
-
-            Args:
-                req_or_tuple: A request or a tuple containing the
-                    request and intermediate tensors. Intermediate tensors are
-                    None unless if it is provided because it is > 0 pipeline
-                    stage. The request is serialized by msgspec.
-            """
-            if isinstance(req_or_tuple, bytes):
-                serialized_req, intermediate_tensors = req_or_tuple, None
-            else:
-                serialized_req, intermediate_tensors = req_or_tuple
-
-            execute_model_req = self.input_decoder.decode(serialized_req)
-
-            # TODO(swang): This is needed right now because Ray Compiled Graph
-            # executes on a background thread, so we need to reset torch's
-            # current device.
-            if not self.compiled_dag_cuda_device_set:
-                current_platform.set_device(self.worker.device)
-                self.compiled_dag_cuda_device_set = True
-
-            output = self.worker._execute_model_spmd(execute_model_req,
-                                                     intermediate_tensors)
-            # Pipeline model request and output to the next pipeline stage.
-            if isinstance(output, IntermediateTensors):
-                output = serialized_req, output
-            else:
-                output = self.output_encoder.encode(output)
-
-            return output
-
         def setup_device_if_necessary(self):
             # TODO(swang): This is needed right now because Ray CG executes
             # on a background thread, so we need to reset torch's current
@@ -114,37 +75,51 @@ try:
                     # Not needed
                     pass
                 else:
+                    assert self.worker.device is not None
                     current_platform.set_device(self.worker.device)
 
                 self.compiled_dag_cuda_device_set = True
 
         def execute_model_ray(
             self,
-            scheduler_output: Union["SchedulerOutput",
-                                    Tuple["SchedulerOutput",
-                                          "IntermediateTensors"]],
-        ) -> Union["ModelRunnerOutput", Tuple["SchedulerOutput",
-                                              "IntermediateTensors"]]:
+            execute_model_input: tuple["SchedulerOutput", "GrammarOutput"]
+            | tuple["SchedulerOutput", "GrammarOutput", "IntermediateTensors"],
+        ) -> Union[
+            "ModelRunnerOutput",
+            tuple["SchedulerOutput", "GrammarOutput", "IntermediateTensors"],
+        ]:
             # This method is used by Ray Compiled Graph to execute the model,
             # and it needs a special logic of self.setup_device_if_necessary()
             self.setup_device_if_necessary()
             assert self.worker is not None, "Worker is not initialized"
-            if isinstance(scheduler_output, tuple):
-                scheduler_output, intermediate_tensors = scheduler_output
+            if len(execute_model_input) == 3:
+                scheduler_output, grammar_output, intermediate_tensors = (
+                    execute_model_input
+                )
             else:
-                scheduler_output, intermediate_tensors = scheduler_output, None
+                scheduler_output, grammar_output = execute_model_input
+                intermediate_tensors = None
+            assert self.worker.model_runner is not None
             output = self.worker.model_runner.execute_model(
-                scheduler_output, intermediate_tensors)
+                scheduler_output, intermediate_tensors
+            )
             if isinstance(output, IntermediateTensors):
-                output = scheduler_output, output
+                output = scheduler_output, grammar_output, output
             elif not get_pp_group().is_last_rank:
                 # Case where there are no scheduled requests
                 # but may still be finished requests.
                 assert not output or not output.req_ids
-                output = scheduler_output, None
+                output = scheduler_output, grammar_output, None
+            elif output is None:
+                output = self.worker.model_runner.sample_tokens(grammar_output)
+                # Ensure outputs crossing Ray compiled DAG are serializable.
+                # AsyncModelRunnerOutput holds CUDA events and cannot be
+                # pickled.
+                if isinstance(output, AsyncModelRunnerOutput):
+                    output = output.get_output()
             return output
 
-        def override_env_vars(self, vars: Dict[str, str]):
+        def override_env_vars(self, vars: dict[str, str]):
             os.environ.update(vars)
 
     ray_import_err = None
@@ -157,6 +132,28 @@ except ImportError as e:
     RayWorkerWrapper = None  # type: ignore
 
 
+class FutureWrapper(Future):
+    """A wrapper around Ray output reference to meet the interface
+    of .execute_model(): The top level (core busy loop) expects .result() api
+    to block and return a single output.
+
+    If aggregator is provided, the outputs from all workers are aggregated upon
+    the result() call. If not only the first worker's output is returned.
+    """
+
+    def __init__(self, ref_or_refs, aggregator: KVOutputAggregator | None = None):
+        super().__init__()
+        self.ref_or_refs = ref_or_refs
+        self.aggregator = aggregator
+
+    def result(self, timeout=None):
+        outputs = ray.get(self.ref_or_refs, timeout=timeout)
+        if self.aggregator is None:
+            return outputs
+
+        return self.aggregator.aggregate(outputs, output_rank=0)
+
+
 def ray_is_available() -> bool:
     """Returns True if Ray is available."""
     return ray is not None
@@ -165,12 +162,15 @@ def ray_is_available() -> bool:
 def assert_ray_available():
     """Raise an exception if Ray is not available."""
     if ray is None:
-        raise ValueError(f"Failed to import Ray: {ray_import_err}."
-                         "Please install Ray with `pip install ray`.")
+        raise ValueError(
+            f"Failed to import Ray: {ray_import_err}."
+            "Please install Ray with `pip install ray`."
+        )
 
 
-def _verify_bundles(placement_group: "PlacementGroup",
-                    parallel_config: ParallelConfig, device_str: str):
+def _verify_bundles(
+    placement_group: "PlacementGroup", parallel_config: ParallelConfig, device_str: str
+):
     """Verify a given placement group has bundles located in the right place.
 
     There are 2 rules.
@@ -178,14 +178,15 @@ def _verify_bundles(placement_group: "PlacementGroup",
     - Fail if driver node is not included in a placement group.
     """
     assert ray.is_initialized(), (
-        "Ray is not initialized although distributed-executor-backend is ray.")
+        "Ray is not initialized although distributed-executor-backend is ray."
+    )
     pg_data = placement_group_table(placement_group)
     # bundle_idx -> node_id
     bundle_to_node_ids = pg_data["bundles_to_node_id"]
     # bundle_idx -> bundle (e.g., {"GPU": 1})
     bundles = pg_data["bundles"]
     # node_id -> List of bundle (e.g., {"GPU": 1})
-    node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list)
+    node_id_to_bundle: dict[str, list[dict[str, float]]] = defaultdict(list)
 
     for bundle_idx, node_id in bundle_to_node_ids.items():
         node_id_to_bundle[node_id].append(bundles[bundle_idx])
@@ -211,8 +212,13 @@ def _verify_bundles(placement_group: "PlacementGroup",
                 "unless you have fast interconnect across nodes, like "
                 "Infiniband. To resolve this issue, make sure you have more "
                 "than %d GPUs available at each node.",
-                parallel_config.tensor_parallel_size, device_str, len(bundles),
-                device_str, node_id, parallel_config.tensor_parallel_size)
+                parallel_config.tensor_parallel_size,
+                device_str,
+                len(bundles),
+                device_str,
+                node_id,
+                parallel_config.tensor_parallel_size,
+            )
 
 
 def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
@@ -244,16 +250,40 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
             " and make sure the IP addresses used by ray cluster"
             " are the same as VLLM_HOST_IP environment variable"
             " specified in each node if you are running on a multi-node.",
-            int(time.time() - s), placement_group_specs)
+            int(time.time() - s),
+            placement_group_specs,
+        )
 
     try:
         ray.get(pg_ready_ref, timeout=0)
     except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources.") from None
+        # Provide more helpful error message when GPU count is exceeded
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message.
+        # We use >1 here because multi-GPU (tensor parallel) jobs are more
+        # likely to fail due to insufficient cluster resources, and users may
+        # need to adjust tensor_parallel_size to fit available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None
 
 
 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
@@ -268,14 +298,15 @@ def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
         # Exponential backoff for warning print.
         wait_interval *= 2
         logger.info(
-            "Waiting for removing a placement group of specs for "
-            "%d seconds.", int(time.time() - s))
+            "Waiting for removing a placement group of specs for %d seconds.",
+            int(time.time() - s),
+        )
         time.sleep(wait_interval)
 
 
 def initialize_ray_cluster(
     parallel_config: ParallelConfig,
-    ray_address: Optional[str] = None,
+    ray_address: str | None = None,
 ):
     """Initialize the distributed cluster with Ray.
 
@@ -291,6 +322,23 @@ def initialize_ray_cluster(
     assert_ray_available()
     from vllm.platforms import current_platform
 
+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils.torch_utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
     if ray.is_initialized():
         logger.info("Ray is already initialized. Skipping Ray initialization.")
     elif current_platform.is_rocm() or current_platform.is_xpu():
@@ -300,19 +348,21 @@ def initialize_ray_cluster(
         except ConnectionError:
             logger.warning(
                 "No existing RAY instance detected. "
-                "A new instance will be launched with current node resources.")
-            ray.init(address=ray_address,
-                     num_gpus=parallel_config.world_size,
-                     runtime_env=parallel_config.ray_runtime_env)
+                "A new instance will be launched with current node resources."
+            )
+            ray.init(
+                address=ray_address,
+                num_gpus=parallel_config.world_size,
+                runtime_env=parallel_config.ray_runtime_env,
+            )
     else:
-        ray.init(address=ray_address,
-                 runtime_env=parallel_config.ray_runtime_env)
+        ray.init(address=ray_address, runtime_env=parallel_config.ray_runtime_env)
 
     device_str = current_platform.ray_device_key
     if not device_str:
         raise ValueError(
-            f"current platform {current_platform.device_name} does not "
-            "support ray.")
+            f"current platform {current_platform.device_name} does not support ray."
+        )
 
     # Create or get the placement group for worker processes
     if parallel_config.placement_group:
@@ -331,8 +381,8 @@ def initialize_ray_cluster(
             bundle_devices = bundle.get(device_str, 0)
             if bundle_devices > 1:
                 raise ValueError(
-                    "Placement group bundle cannot have more than 1 "
-                    f"{device_str}.")
+                    f"Placement group bundle cannot have more than 1 {device_str}."
+                )
             if bundle_devices:
                 device_bundles += 1
         if parallel_config.world_size > device_bundles:
@@ -340,10 +390,10 @@ def initialize_ray_cluster(
                 f"The number of required {device_str}s exceeds the total "
                 f"number of available {device_str}s in the placement group. "
                 f"Required number of devices: {parallel_config.world_size}. "
-                f"Total number of devices: {device_bundles}.")
+                f"Total number of devices: {device_bundles}."
+            )
     else:
-        logger.info("No current placement group found. "
-                    "Creating a new placement group.")
+        logger.info("No current placement group found. Creating a new placement group.")
         num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
         # Log a warning message and delay resource allocation failure response.
         # Avoid immediate rejection to allow user-initiated placement group
@@ -351,12 +401,14 @@ def initialize_ray_cluster(
         if parallel_config.world_size > num_devices_in_cluster:
             logger.warning(
                 "The number of required %ss exceeds the total "
-                "number of available %ss in the placement group.", device_str,
-                device_str)
+                "number of available %ss in the placement group.",
+                device_str,
+                device_str,
+            )
         # Create a new placement group
-        placement_group_specs: List[Dict[str, float]] = ([{
-            device_str: 1.0
-        } for _ in range(parallel_config.world_size)])
+        placement_group_specs: list[dict[str, float]] = [
+            {device_str: 1.0} for _ in range(parallel_config.world_size)
+        ]
 
         # vLLM engine is also a worker to execute model with an accelerator,
         # so it requires to have the device in a current node. Check if
@@ -369,14 +421,16 @@ def initialize_ray_cluster(
                 f"Current node has no {device_str} available. "
                 f"{current_node_resource=}. vLLM engine cannot start without "
                 f"{device_str}. Make sure you have at least 1 {device_str} "
-                f"available in a node {current_node_id=} {current_ip=}.")
+                f"available in a node {current_node_id=} {current_ip=}."
+            )
         # This way, at least bundle is required to be created in a current
         # node.
         placement_group_specs[0][f"node:{current_ip}"] = 0.001
 
         # By default, Ray packs resources as much as possible.
         current_placement_group = ray.util.placement_group(
-            placement_group_specs, strategy="PACK")
+            placement_group_specs, strategy="PACK"
+        )
         _wait_until_pg_ready(current_placement_group)
 
     assert current_placement_group is not None
@@ -387,6 +441,7 @@ def initialize_ray_cluster(
 
 def get_num_tpu_nodes() -> int:
     from ray._private.accelerators import TPUAcceleratorManager
+
     cluster_resources = ray.cluster_resources()
     total_tpus = int(cluster_resources["TPU"])
     tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators()
diff --git a/vllm/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
similarity index 52%
rename from vllm/executor/uniproc_executor.py
rename to vllm/v1/executor/uniproc_executor.py
index 7a753d608a43ad9ac197a38964bc1df9d2e3f68f..095d3d1dac21b5d4dd4bb94223111f8e6fbcbc5d 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -1,66 +1,57 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
+from collections.abc import Callable
 from concurrent.futures import Future, ThreadPoolExecutor
 from functools import cached_property
 from multiprocessing import Lock
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any
 
 import torch
 import torch.distributed as dist
 
 import vllm.envs as envs
-from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import worker_receiver_cache_from_config
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        run_method)
+from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
-from vllm.v1.executor.utils import get_and_update_mm_cache
-from vllm.v1.outputs import AsyncModelRunnerOutput
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
+from vllm.v1.serial_utils import run_method
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
 
-class UniProcExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
+class UniProcExecutor(Executor):
     def _init_executor(self) -> None:
-        """Initialize the worker and load the model.
-        """
-        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
-                                               rpc_rank=0)
+        """Initialize the worker and load the model."""
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
         distributed_init_method, rank, local_rank = self._distributed_args()
-        is_driver_worker = True
         kwargs = dict(
             vllm_config=self.vllm_config,
             local_rank=local_rank,
             rank=rank,
             distributed_init_method=distributed_init_method,
-            is_driver_worker=is_driver_worker,
+            is_driver_worker=True,
+            shared_worker_lock=Lock(),
         )
-        self.mm_receiver_cache = worker_receiver_cache_from_config(
-            self.vllm_config, MULTIMODAL_REGISTRY, Lock())
 
-        self.async_output_thread: Optional[ThreadPoolExecutor] = None
+        self.async_output_thread: ThreadPoolExecutor | None = None
         if self.max_concurrent_batches > 1:
             self.async_output_thread = ThreadPoolExecutor(
-                max_workers=1, thread_name_prefix="WorkerAsyncOutput")
+                max_workers=1, thread_name_prefix="WorkerAsyncOutput"
+            )
 
-        self.collective_rpc("init_worker", args=([kwargs], ))
-        self.collective_rpc("init_device")
-        self.collective_rpc("load_model")
+        self.driver_worker.init_worker(all_kwargs=[kwargs])
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
 
     def _distributed_args(self) -> tuple[str, int, int]:
         """Return (distributed_init_method, rank, local_rank)."""
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
+        distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
         # set local rank as the device index if specified
-        device_info = self.vllm_config.device_config.device.__str__().split(
-            ":")
+        device_info = self.vllm_config.device_config.device.__str__().split(":")
         local_rank = int(device_info[1]) if len(device_info) > 1 else 0
         return distributed_init_method, 0, local_rank
 
@@ -68,32 +59,60 @@ class UniProcExecutor(ExecutorBase):
     def max_concurrent_batches(self) -> int:
         return 2 if self.scheduler_config.async_scheduling else 1
 
-    def collective_rpc(self,
-                       method: Union[str, Callable],
-                       timeout: Optional[float] = None,
-                       args: Tuple = (),
-                       kwargs: Optional[Dict] = None,
-                       non_block: bool = False) -> List[Any]:
+    def collective_rpc(  # type: ignore[override]
+        self,
+        method: str | Callable,
+        timeout: float | None = None,
+        args: tuple = (),
+        kwargs: dict | None = None,
+        non_block: bool = False,
+        single_value: bool = False,
+    ) -> Any | list[Any] | Future[Any | list[Any]]:
         if kwargs is None:
             kwargs = {}
-        if self.mm_receiver_cache is not None and method == "execute_model":
-            get_and_update_mm_cache(self.mm_receiver_cache, args)
 
         if not non_block:
-            return [run_method(self.driver_worker, method, args, kwargs)]
+            result = run_method(self.driver_worker, method, args, kwargs)
+            return result if single_value else [result]
 
         try:
             result = run_method(self.driver_worker, method, args, kwargs)
             if isinstance(result, AsyncModelRunnerOutput):
                 if (async_thread := self.async_output_thread) is not None:
-                    return [async_thread.submit(result.get_output)]
+                    get_output = result.get_output
+                    if not single_value:
+                        get_output = lambda go=result.get_output: [go()]
+                    return async_thread.submit(get_output)
                 result = result.get_output()
             future = Future[Any]()
-            future.set_result(result)
+            future.set_result(result if single_value else [result])
         except Exception as e:
             future = Future[Any]()
             future.set_exception(e)
-        return [future]
+        return future
+
+    def execute_model(  # type: ignore[override]
+        self, scheduler_output: SchedulerOutput, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        return self.collective_rpc(
+            "execute_model",
+            args=(scheduler_output,),
+            non_block=non_block,
+            single_value=True,
+        )
+
+    def sample_tokens(  # type: ignore[override]
+        self, grammar_output: GrammarOutput | None, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        return self.collective_rpc(
+            "sample_tokens",
+            args=(grammar_output,),
+            non_block=non_block,
+            single_value=True,
+        )
+
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
+        return self.collective_rpc("take_draft_token_ids", single_value=True)
 
     def check_health(self) -> None:
         # UniProcExecutor will always be healthy as long as
@@ -101,21 +120,20 @@ class UniProcExecutor(ExecutorBase):
         return
 
     def reinitialize_distributed(
-            self, reconfig_request: ReconfigureDistributedRequest) -> None:
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
         self.driver_worker.reinitialize_distributed(reconfig_request)
-        if reconfig_request.new_data_parallel_rank == \
-        ReconfigureRankType.SHUTDOWN_CURRENT_RANK:
+        if (
+            reconfig_request.new_data_parallel_rank
+            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
+        ):
             self.shutdown()
-        return
 
     def shutdown(self) -> None:
         if worker := self.driver_worker:
             worker.shutdown()
 
 
-UniProcExecutorAsync = UniProcExecutor
-
-
 class ExecutorWithExternalLauncher(UniProcExecutor):
     """An executor that uses external launchers to launch engines,
     specially designed for torchrun-compatible launchers, for
@@ -132,15 +150,13 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
     deterministic, all the engines will generate the same outputs,
     and they don't need to synchronize the states with each other.
     """
-    uses_ray: bool = False
 
     def _init_executor(self) -> None:
-        """Initialize the worker and load the model.
-        """
-        if envs.VLLM_USE_V1:
-            assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
-            ("To get deterministic execution in V1, "
-            "please set VLLM_ENABLE_V1_MULTIPROCESSING=0")
+        """Initialize the worker and load the model."""
+        assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
+            "To get deterministic execution, "
+            "please set VLLM_ENABLE_V1_MULTIPROCESSING=0"
+        )
         super()._init_executor()
 
     def _distributed_args(self) -> tuple[str, int, int]:
@@ -156,21 +172,12 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
         local_rank = int(os.environ["LOCAL_RANK"])
         return distributed_init_method, rank, local_rank
 
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """
-        Determine the number of available KV blocks.
-        Add an additional all_reduce to get the min across all ranks.
-        Note that even if we have the same `gpu_memory_utilization` and 
-        `swap_space`, the available memory in every rank might still 
-        differ because NCCL can take different amounts of memory in 
-        different ranks. Therefore, it is necessary to test if all ranks 
-        agree on the same KV cache configuration.
-        """
-        a, b = super().determine_num_available_blocks()
+    def determine_available_memory(self) -> list[int]:  # in bytes
+        # we need to get the min across all ranks.
+        memory = super().determine_available_memory()
         from vllm.distributed.parallel_state import get_world_group
+
         cpu_group = get_world_group().cpu_group
-        a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64)
-        b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64)
-        dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
-        dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
-        return a_tensor.item(), b_tensor.item()
+        memory_tensor = torch.tensor([memory], device="cpu", dtype=torch.int64)
+        dist.all_reduce(memory_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+        return [memory_tensor.item()]
diff --git a/vllm/v1/executor/utils.py b/vllm/v1/executor/utils.py
deleted file mode 100644
index 1855bc9963817886bf8de0576d62362e434c3259..0000000000000000000000000000000000000000
--- a/vllm/v1/executor/utils.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.multimodal.cache import ShmObjectStoreReceiverCache
-from vllm.v1.core.sched.output import SchedulerOutput
-
-
-def get_and_update_mm_cache(
-    receiver_cache: ShmObjectStoreReceiverCache,
-    args: tuple[SchedulerOutput],
-) -> None:
-    """
-    For each MultiModalKwargsItem in SchedulerOutput, fetch from shared memory
-    cache as needed.
-
-    Args:
-        receiver_cache: The receiver cache to update.
-        args: According to the collective_rpc call of execute_model method in
-            executor, args is a tuple of only one SchedulerOutput element.
-    """
-    scheduler_output = args[0]
-    for request_data in scheduler_output.scheduled_new_reqs:
-        request_data.mm_features = receiver_cache.get_and_update_features(
-            request_data.mm_features)
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 2818166535404d9666387180d4f5cccc7588a3bf..7f33eb7e699c72959ad25e134a182c3ad25dfb89 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -2,16 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
-from dataclasses import dataclass, fields
+from dataclasses import dataclass, fields, replace
 from math import prod
-from typing import Optional
 
 import torch
 from typing_extensions import Self
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv, get_dtype_size
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import get_dtype_size
 
 logger = init_logger(__name__)
 
@@ -44,13 +44,20 @@ class KVCacheSpec:
         """
         raise NotImplementedError
 
+    def copy_with_new_block_size(self, block_size: int) -> Self:
+        """
+        Create a new KVCacheSpec from self but replacing the block size.
+        """
+        return replace(self, block_size=block_size)
+
     @classmethod
     def merge(cls, specs: list[Self]) -> Self:
         """
         Merge a list of KVCacheSpec objects into a single KVCacheSpec object.
         """
         assert all(spec == specs[0] for spec in specs[1:]), (
-            "All layers in the same KV cache group must be the same.")
+            "All layers in the same KV cache group must be the same."
+        )
         return copy.deepcopy(specs[0])
 
 
@@ -62,14 +69,19 @@ class AttentionSpec(KVCacheSpec):
 
     @property
     def page_size_bytes(self) -> int:
-        return 2 * self.block_size * self.num_kv_heads * self.head_size \
-                * get_dtype_size(self.dtype)
+        return (
+            2
+            * self.block_size
+            * self.num_kv_heads
+            * self.head_size
+            * get_dtype_size(self.dtype)
+        )
 
 
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: Optional[int] = None
-    attention_chunk_size: Optional[int] = None
+    sliding_window: int | None = None
+    attention_chunk_size: int | None = None
     """
     When hybrid allocator is disabled and the model contains both full 
     attention layers and sliding window attention layers, sliding 
@@ -82,8 +94,7 @@ class FullAttentionSpec(AttentionSpec):
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
-        dcp_world_size = \
-            vllm_config.parallel_config.decode_context_parallel_size
+        dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size
         # Note(hc): each dcp rank only need save
         # (max_model_len//dcp_world_size) tokens locally.
         if dcp_world_size > 1:
@@ -91,7 +102,7 @@ class FullAttentionSpec(AttentionSpec):
         return cdiv(max_model_len, self.block_size) * self.page_size_bytes
 
     @classmethod
-    def merge_window_sizes(cls, window_sizes: set[int]) -> Optional[int]:
+    def merge_window_sizes(cls, window_sizes: set[int]) -> int | None:
         if len(window_sizes) == 0:
             return None
         elif len(window_sizes) == 1:
@@ -99,24 +110,30 @@ class FullAttentionSpec(AttentionSpec):
         else:
             raise ValueError(
                 "All attention layers in the same KV cache group must have the "
-                "same window size.")
+                "same window size."
+            )
 
     @classmethod
     def merge(cls, specs: list[Self]) -> Self:
         """
-        Merge a list of FullAttentionSpec objects into a single 
+        Merge a list of FullAttentionSpec objects into a single
         FullAttentionSpec object.
         """
         assert all(isinstance(spec, FullAttentionSpec) for spec in specs), (
-            "All attention layers in the same KV cache group must be "
-            "FullAttentionSpec.")
+            "All attention layers in the same KV cache group must be FullAttentionSpec."
+        )
 
-        sliding_window = set(spec.sliding_window for spec in specs
-                             if spec.sliding_window is not None)
-        attention_chunk_size = set(spec.attention_chunk_size for spec in specs
-                                   if spec.attention_chunk_size is not None)
+        sliding_window = set(
+            spec.sliding_window for spec in specs if spec.sliding_window is not None
+        )
+        attention_chunk_size = set(
+            spec.attention_chunk_size
+            for spec in specs
+            if spec.attention_chunk_size is not None
+        )
         assert not any(isinstance(spec, MLAAttentionSpec) for spec in specs), (
-            "MLAAttentionSpec should be merged in MLAAttentionSpec.merge")
+            "MLAAttentionSpec should be merged in MLAAttentionSpec.merge"
+        )
         merged_spec = cls(
             block_size=specs[0].block_size,
             num_kv_heads=specs[0].num_kv_heads,
@@ -129,19 +146,21 @@ class FullAttentionSpec(AttentionSpec):
             for f in fields(AttentionSpec):
                 assert getattr(spec, f.name) == getattr(merged_spec, f.name), (
                     "All attention layers in the same KV cache group must have "
-                    "the same attention spec.")
-        assert (
-            (merged_spec.sliding_window is not None) +
-            (merged_spec.attention_chunk_size is not None) <= 1
-        ), ("Model with both sliding window layers and chunked local attention "
-            "layers is not supported.")
+                    "the same attention spec."
+                )
+        assert (merged_spec.sliding_window is not None) + (
+            merged_spec.attention_chunk_size is not None
+        ) <= 1, (
+            "Model with both sliding window layers and chunked local attention "
+            "layers is not supported."
+        )
         return merged_spec
 
 
 @dataclass(frozen=True)
 class MLAAttentionSpec(FullAttentionSpec):
     # TODO(Lucas/Chen): less hacky way to do this
-    cache_dtype_str: Optional[str] = None
+    cache_dtype_str: str | None = None
 
     @property
     def page_size_bytes(self) -> int:
@@ -149,18 +168,23 @@ class MLAAttentionSpec(FullAttentionSpec):
             # See `vllm/v1/attention/backends/mla/flashmla_sparse.py`
             #  for details.
             return self.block_size * 656
-        return self.block_size * self.num_kv_heads * self.head_size \
-                * get_dtype_size(self.dtype)
+        return (
+            self.block_size
+            * self.num_kv_heads
+            * self.head_size
+            * get_dtype_size(self.dtype)
+        )
 
     @classmethod
     def merge(cls, specs: list[Self]) -> Self:
         assert all(isinstance(spec, MLAAttentionSpec) for spec in specs), (
-            "All attention layers in the same KV cache group must be "
-            "MLAAttentionSpec.")
+            "All attention layers in the same KV cache group must be MLAAttentionSpec."
+        )
         cache_dtype_str_set = set(spec.cache_dtype_str for spec in specs)
         assert len(cache_dtype_str_set) == 1, (
             "All attention layers in the same KV cache group must use the same "
-            "quantization method.")
+            "quantization method."
+        )
         return cls(
             block_size=specs[0].block_size,
             num_kv_heads=specs[0].num_kv_heads,
@@ -176,15 +200,15 @@ class ChunkedLocalAttentionSpec(AttentionSpec):
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
-        max_num_batched_tokens = (
-            vllm_config.scheduler_config.max_num_batched_tokens)
+        max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
 
         # During chunked prefill, we allocate KV cache for at most
         # `self.attention_chunk_size` computed tokens plus the newly scheduled
         # tokens. And we won't allocate KV cache for more than `max_model_len`
         # tokens.
-        num_tokens = min(self.attention_chunk_size + max_num_batched_tokens,
-                         max_model_len)
+        num_tokens = min(
+            self.attention_chunk_size + max_num_batched_tokens, max_model_len
+        )
 
         return cdiv(num_tokens, self.block_size) * self.page_size_bytes
 
@@ -194,18 +218,19 @@ class SlidingWindowSpec(AttentionSpec):
     sliding_window: int
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
-        assert vllm_config.parallel_config.decode_context_parallel_size == 1, \
+        assert vllm_config.parallel_config.decode_context_parallel_size == 1, (
             "DCP not support sliding window."
+        )
         max_model_len = vllm_config.model_config.max_model_len
-        max_num_batched_tokens = (
-            vllm_config.scheduler_config.max_num_batched_tokens)
+        max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
 
         # During chunked prefill, we allocate KV cache for the last
         # `self.sliding_window-1` computed tokens plus the newly scheduled
         # tokens. And we won't allocate KV cache for more than `max_model_len`
         # tokens.
-        num_tokens = min(self.sliding_window - 1 + max_num_batched_tokens,
-                         max_model_len)
+        num_tokens = min(
+            self.sliding_window - 1 + max_num_batched_tokens, max_model_len
+        )
 
         # +1 here because the sliding window may not start from the beginning
         # of the block. For example, if the block size is 4 and num_token
@@ -218,7 +243,7 @@ class SlidingWindowSpec(AttentionSpec):
 class MambaSpec(KVCacheSpec):
     shapes: tuple[tuple[int, ...], ...]
     dtypes: tuple[torch.dtype]
-    page_size_padded: Optional[int] = None
+    page_size_padded: int | None = None
     mamba_type: str = "mamba2"
     num_speculative_blocks: int = 0
 
@@ -226,22 +251,20 @@ class MambaSpec(KVCacheSpec):
     def page_size_bytes(self) -> int:
         page_size = sum(
             prod(shape) * get_dtype_size(dtype)
-            for (shape, dtype) in zip(self.shapes, self.dtypes))
+            for (shape, dtype) in zip(self.shapes, self.dtypes)
+        )
         if self.page_size_padded is not None:
             assert self.page_size_padded >= page_size
             return self.page_size_padded
         return page_size
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
-        # We allocate 1 block for each request now, so max_memory_usage_bytes is
-        # the same as page_size_bytes.
-        # Need to update this when supporting prefix caching.
-        return self.page_size_bytes
+        max_model_len = vllm_config.model_config.max_model_len
+        return cdiv(max_model_len, self.block_size) * self.page_size_bytes
 
 
 @dataclass(frozen=True)
 class EncoderOnlyAttentionSpec(AttentionSpec):
-
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         # Encoder-only layers do not need KV cache
         return 0
@@ -256,8 +279,7 @@ class CrossAttentionSpec(AttentionSpec):
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         # For cross-attention, we need to cache encoder states
         # Get encoder length (e.g., 1500 for Whisper).
-        max_encoder_len = vllm_config.scheduler_config.\
-            max_num_encoder_input_tokens
+        max_encoder_len = vllm_config.scheduler_config.max_num_encoder_input_tokens
         return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes
 
 
@@ -269,18 +291,18 @@ class UniformTypeKVCacheSpecs(KVCacheSpec):
     sliding window attentions with different window sizes are not the same type
     and should not be merged into one UniformTypeKVCacheSpecs.
     """
+
     kv_cache_specs: dict[str, KVCacheSpec]
 
     @property
     def page_size_bytes(self) -> int:
-        return sum(spec.page_size_bytes
-                   for spec in self.kv_cache_specs.values())
+        return sum(spec.page_size_bytes for spec in self.kv_cache_specs.values())
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_num_pages = max(
-            cdiv(spec.max_memory_usage_bytes(vllm_config),
-                 spec.page_size_bytes)
-            for spec in self.kv_cache_specs.values())
+            cdiv(spec.max_memory_usage_bytes(vllm_config), spec.page_size_bytes)
+            for spec in self.kv_cache_specs.values()
+        )
         return max_num_pages * self.page_size_bytes
 
     @classmethod
@@ -295,35 +317,38 @@ class UniformTypeKVCacheSpecs(KVCacheSpec):
         one_spec = next(iter(kv_cache_specs.values()))
         if isinstance(one_spec, FullAttentionSpec):
             return all(
-                isinstance(spec, FullAttentionSpec)
-                for spec in kv_cache_specs.values())
+                isinstance(spec, FullAttentionSpec) for spec in kv_cache_specs.values()
+            )
         elif isinstance(one_spec, CrossAttentionSpec):
             return all(
-                isinstance(spec, CrossAttentionSpec)
-                for spec in kv_cache_specs.values())
+                isinstance(spec, CrossAttentionSpec) for spec in kv_cache_specs.values()
+            )
         elif isinstance(one_spec, SlidingWindowSpec):
             return all(
                 isinstance(spec, SlidingWindowSpec)
                 and spec.sliding_window == one_spec.sliding_window
-                for spec in kv_cache_specs.values())
+                for spec in kv_cache_specs.values()
+            )
         elif isinstance(one_spec, ChunkedLocalAttentionSpec):
             return all(
                 isinstance(spec, ChunkedLocalAttentionSpec)
                 and spec.attention_chunk_size == one_spec.attention_chunk_size
-                for spec in kv_cache_specs.values())
+                for spec in kv_cache_specs.values()
+            )
         elif isinstance(one_spec, MambaSpec):
             return all(
-                isinstance(spec, MambaSpec) and spec.num_speculative_blocks ==
-                one_spec.num_speculative_blocks
-                for spec in kv_cache_specs.values())
+                isinstance(spec, MambaSpec)
+                and spec.num_speculative_blocks == one_spec.num_speculative_blocks
+                for spec in kv_cache_specs.values()
+            )
         else:
             # NOTE(Chen): Please add new branches for new KV cache spec types.
             raise NotImplementedError(
-                f"Unsupported KV cache spec type: {type(one_spec)}")
+                f"Unsupported KV cache spec type: {type(one_spec)}"
+            )
 
     @classmethod
-    def from_specs(cls, kv_cache_specs: dict[str,
-                                             KVCacheSpec]) -> Optional[Self]:
+    def from_specs(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> Self | None:
         """
         Return a SameTypeKVCacheSpecs object if all layers have the same type
         of KV cache spec. Return None if not.
@@ -340,6 +365,7 @@ class KVCacheTensor:
     """
     A class for specifying how the workers should initialize the KV cache.
     """
+
     size: int  # size of the KV cache tensor in bytes
     shared_by: list[str]  # layer names that share the same KV cache tensor
 
@@ -350,6 +376,7 @@ class KVCacheGroupSpec:
     Represents a group of model layers that share the same KV cache block table.
     These layers are regarded as one layer in the KV cache manager.
     """
+
     # The names of model layers in this group
     layer_names: list[str]
     # The KV cache spec of this manager layer
@@ -361,6 +388,7 @@ class KVCacheConfig:
     """
     The KV cache configuration of a model.
     """
+
     """The number of KV cache blocks"""
     num_blocks: int
     """How should model runner initialize the KV cache tensors for each layer"""
diff --git a/vllm/v1/kv_offload/abstract.py b/vllm/v1/kv_offload/abstract.py
index 9f9c044ea1c538d6c2b8046725c34c88ae0d4896..c1d1cbebc1752a9c55bccc69baa0a3586037b867 100644
--- a/vllm/v1/kv_offload/abstract.py
+++ b/vllm/v1/kv_offload/abstract.py
@@ -30,7 +30,6 @@ The class provides the following primitives:
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Optional
 
 from vllm.v1.core.kv_cache_utils import BlockHash
 
@@ -68,7 +67,6 @@ class OffloadingEvent:
 
 
 class OffloadingManager(ABC):
-
     @abstractmethod
     def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
         """
@@ -122,8 +120,8 @@ class OffloadingManager(ABC):
 
     @abstractmethod
     def prepare_store(
-            self,
-            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        self, block_hashes: Iterable[BlockHash]
+    ) -> PrepareStoreOutput | None:
         """
         Prepare the given blocks to be offloaded.
         The given blocks will be protected from eviction until
@@ -140,9 +138,7 @@ class OffloadingManager(ABC):
         """
         pass
 
-    def complete_store(self,
-                       block_hashes: Iterable[BlockHash],
-                       success: bool = True):
+    def complete_store(self, block_hashes: Iterable[BlockHash], success: bool = True):
         """
         Marks blocks which were previously prepared to be stored, as stored.
         Following this call, the blocks become loadable.
diff --git a/vllm/v1/kv_offload/arc_manager.py b/vllm/v1/kv_offload/arc_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..45b166d6797f8061faf27fcd7f0dc5d23366bbfe
--- /dev/null
+++ b/vllm/v1/kv_offload/arc_manager.py
@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
+from collections.abc import Iterable
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (
+    LoadStoreSpec,
+    OffloadingEvent,
+    OffloadingManager,
+    PrepareStoreOutput,
+)
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+
+
+class ARCOffloadingManager(OffloadingManager):
+    """
+    An OffloadingManager implementing the ARC (Adaptive Replacement Cache)
+    eviction policy with a pluggable backend.
+
+    Data Structures:
+        T1: Recent cache containing blocks accessed once.
+        T2: Frequent cache containing blocks accessed multiple times.
+        B1/B2: Ghost lists tracking recently evicted blocks from T1/T2.
+        target_t1_size: Adaptive target size for the T1 partition.
+
+    Algorithm Flow:
+        1. Cache lookup (lookup):
+           Searches T1 and T2 for block hashes and counts consecutive hits
+           until a miss or non-ready block is encountered.
+
+        2. Cache touch (touch) - Adaptive Learning:
+           For each block_hash (in reverse order):
+           - If in T1: Move to T2 (promotion from recent to frequent).
+           - If in T2: Move to MRU position (end of queue).
+           - If in B1 ghost list: Increase target_t1_size.
+           - If in B2 ghost list: Decrease target_t1_size.
+
+        3. Block eviction (prepare_store) - Adaptive Replacement:
+           Determines eviction source based on adaptive target:
+           - If T1 size > target_t1_size: Evict from T1, add to B1.
+           - Otherwise: Evict from T2, add to B2.
+           Finally, bound each ghost list size.
+
+        4. Block insertion (prepare_store):
+           New blocks are always inserted into T1 and removed from B1/B2 if
+           present. Blocks may later be promoted to T2 during touch operations.
+
+    Adaptive Behavior:
+        The algorithm self-tunes the recency vs. frequency trade-off:
+        - B1 hit: Recent access patterns matter more → increase T1.
+        - B2 hit: Frequent access patterns matter more → decrease T1.
+    """
+
+    def __init__(self, backend: Backend, enable_events: bool = False):
+        self.backend: Backend = backend
+        self.target_t1_size: float = 0.0
+        self.t1: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        self.t2: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        # block_hash -> None (only care about presence)
+        self.b1: OrderedDict[BlockHash, None] = OrderedDict()
+        self.b2: OrderedDict[BlockHash, None] = OrderedDict()
+        self.events: list[OffloadingEvent] | None = [] if enable_events else None
+        self.cache_capacity: int = self.backend.get_num_free_blocks()
+
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        hit_count = 0
+        for block_hash in block_hashes:
+            block = self.t1.get(block_hash) or self.t2.get(block_hash)
+            if block is None or not block.is_ready:
+                break
+            hit_count += 1
+        return hit_count
+
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        blocks = []
+        for block_hash in block_hashes:
+            block = self.t1.get(block_hash) or self.t2.get(block_hash)
+            assert block is not None, f"Block {block_hash!r} not found in cache"
+            assert block.is_ready, f"Block {block_hash!r} is not ready for reading"
+
+            block.ref_cnt += 1
+            blocks.append(block)
+
+        return self.backend.get_load_store_spec(block_hashes, blocks)
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in reversed(list(block_hashes)):
+            if block_hash in self.t1:
+                block = self.t1.pop(block_hash)
+                if not block.is_ready:
+                    # block was just prepared to be stored, not really touched twice
+                    self.t1.move_to_end(block_hash)
+                else:
+                    self.t2[block_hash] = block
+
+            elif block_hash in self.t2:
+                self.t2.move_to_end(block_hash)
+
+            elif block_hash in self.b1:
+                delta = max(1, len(self.b2) / len(self.b1))
+                self.target_t1_size = min(
+                    self.target_t1_size + delta, self.cache_capacity
+                )
+                # move to MRU position (end) to keep it fresh in the ghost list
+                self.b1.move_to_end(block_hash)
+
+            elif block_hash in self.b2:
+                delta = max(1, len(self.b1) / len(self.b2))
+                self.target_t1_size = max(self.target_t1_size - delta, 0)
+                # move to MRU position (end) to keep it fresh in the ghost list
+                self.b2.move_to_end(block_hash)
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in block_hashes:
+            block = self.t1.get(block_hash) or self.t2.get(block_hash)
+            assert block is not None, f"Block {block_hash!r} not found"
+            assert block.ref_cnt > 0, f"Block {block_hash!r} ref_cnt is already 0"
+
+            block.ref_cnt -= 1
+
+    def prepare_store(
+        self, block_hashes: Iterable[BlockHash]
+    ) -> PrepareStoreOutput | None:
+        block_hashes_to_store = []
+        for block_hash in block_hashes:
+            if block_hash not in self.t1 and block_hash not in self.t2:
+                block_hashes_to_store.append(block_hash)
+
+        if not block_hashes_to_store:
+            return PrepareStoreOutput(
+                block_hashes_to_store=[],
+                store_spec=self.backend.get_load_store_spec([], []),
+                block_hashes_evicted=[],
+            )
+
+        num_blocks_to_evict = (
+            len(block_hashes_to_store) - self.backend.get_num_free_blocks()
+        )
+
+        to_evict = []
+        while num_blocks_to_evict > 0:
+            block_to_evict = None
+            if len(self.t1) >= int(self.target_t1_size):
+                # try to evict the least recently used (oldest) block from T1
+                for block_hash, block in self.t1.items():
+                    if block.ref_cnt == 0:
+                        block_to_evict = (block_hash, block)
+                        eviction_t = self.t1
+                        eviction_b = self.b1
+                        break
+            if not block_to_evict:
+                # try to evict the least recently used (oldest) block from T2
+                for block_hash, block in self.t2.items():
+                    if block.ref_cnt == 0:
+                        block_to_evict = (block_hash, block)
+                        eviction_t = self.t2
+                        eviction_b = self.b2
+                        break
+                else:
+                    # cannot evict enough blocks, cache is full of in-use items
+                    return None
+
+            block_hash, block = block_to_evict
+            del eviction_t[block_hash]
+            eviction_b[block_hash] = None
+            to_evict.append(block_hash)
+            self.backend.free(block)
+            num_blocks_to_evict -= 1
+
+        for b in [self.b1, self.b2]:
+            for i in range(len(b) - self.cache_capacity):
+                b.popitem(last=False)
+
+        if to_evict and self.events is not None:
+            self.events.append(
+                OffloadingEvent(
+                    block_hashes=to_evict,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=True,
+                )
+            )
+
+        blocks = self.backend.allocate_blocks(block_hashes_to_store)
+        assert len(blocks) == len(block_hashes_to_store), (
+            "Backend did not allocate the expected number of blocks"
+        )
+
+        for block_hash, block in zip(block_hashes_to_store, blocks):
+            self.t1[block_hash] = block
+
+            self.b1.pop(block_hash, None)
+            self.b2.pop(block_hash, None)
+
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store, blocks)
+
+        return PrepareStoreOutput(
+            block_hashes_to_store=block_hashes_to_store,
+            store_spec=store_spec,
+            block_hashes_evicted=to_evict,
+        )
+
+    def complete_store(self, block_hashes: Iterable[BlockHash], success: bool = True):
+        stored_block_hashes: list[BlockHash] = []
+
+        if success:
+            for block_hash in block_hashes:
+                block = self.t1.get(block_hash) or self.t2.get(block_hash)
+
+                if block is not None and not block.is_ready:
+                    block.ref_cnt = 0
+                    stored_block_hashes.append(block_hash)
+        else:
+            for block_hash in block_hashes:
+                block = self.t1.pop(block_hash, None)
+
+                if block is None:
+                    block = self.t2.pop(block_hash, None)
+
+                if block is not None and not block.is_ready:
+                    self.backend.free(block)
+
+        if stored_block_hashes and self.events is not None:
+            self.events.append(
+                OffloadingEvent(
+                    block_hashes=stored_block_hashes,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=False,
+                )
+            )
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        if self.events is not None:
+            yield from self.events
+            self.events.clear()
diff --git a/vllm/v1/kv_offload/backend.py b/vllm/v1/kv_offload/backend.py
index 87a74200116bb843ed8233d0470dd42f7cf02417..538f7bf0584b585c070cf69d472a978532c4f4d3 100644
--- a/vllm/v1/kv_offload/backend.py
+++ b/vllm/v1/kv_offload/backend.py
@@ -18,6 +18,7 @@ class BlockStatus(ctypes.Structure):
     load_store_spec - backend-specific information on how to actually
         read/write the block.
     """
+
     _fields_ = [("ref_cnt", ctypes.c_int32)]
 
     def __init__(self):
@@ -51,8 +52,7 @@ class Backend(ABC):
         pass
 
     @abstractmethod
-    def allocate_blocks(self,
-                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+    def allocate_blocks(self, block_hashes: list[BlockHash]) -> list[BlockStatus]:
         """
         Allocate space for writing blocks.
         This method assumes there is enough space for allocation.
@@ -80,8 +80,9 @@ class Backend(ABC):
         """
         pass
 
-    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
-                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+    def get_load_store_spec(
+        self, block_hashes: Iterable[BlockHash], blocks: Iterable[BlockStatus]
+    ) -> LoadStoreSpec:
         """
         Get backend-specific information on how to read/write blocks.
 
diff --git a/vllm/v1/kv_offload/backends/cpu.py b/vllm/v1/kv_offload/backends/cpu.py
index eb1123d1d83ac9d3724a8b1ce7a31e4dc7671ea4..736cf37853cdc271f005b03d3e8b547db93615d3 100644
--- a/vllm/v1/kv_offload/backends/cpu.py
+++ b/vllm/v1/kv_offload/backends/cpu.py
@@ -10,8 +10,7 @@ from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
 
 
 class CPUBlockStatus(BlockStatus):
-    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)
-                                       ]  # type: ignore
+    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)]  # type: ignore
 
     def __init__(self, block_id: int):
         super().__init__()
@@ -19,23 +18,24 @@ class CPUBlockStatus(BlockStatus):
 
 
 class CPUBackend(Backend):
-
     def __init__(self, block_size: int, num_blocks: int):
-        super().__init__(block_size=block_size,
-                         medium=CPULoadStoreSpec.medium())
+        super().__init__(block_size=block_size, medium=CPULoadStoreSpec.medium())
 
         self.num_blocks: int = num_blocks
         self.num_allocated_blocks: int = 0
         self.allocated_blocks_free_list: list[int] = []
 
     def get_num_free_blocks(self):
-        return (len(self.allocated_blocks_free_list) + self.num_blocks -
-                self.num_allocated_blocks)
-
-    def allocate_blocks(self,
-                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
-        num_fresh_blocks = min(len(block_hashes),
-                               self.num_blocks - self.num_allocated_blocks)
+        return (
+            len(self.allocated_blocks_free_list)
+            + self.num_blocks
+            - self.num_allocated_blocks
+        )
+
+    def allocate_blocks(self, block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        num_fresh_blocks = min(
+            len(block_hashes), self.num_blocks - self.num_allocated_blocks
+        )
         num_reused_blocks = len(block_hashes) - num_fresh_blocks
         assert len(self.allocated_blocks_free_list) >= num_reused_blocks
 
@@ -56,6 +56,7 @@ class CPUBackend(Backend):
         assert isinstance(block, CPUBlockStatus)
         self.allocated_blocks_free_list.append(block.block_id)
 
-    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
-                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+    def get_load_store_spec(
+        self, block_hashes: Iterable[BlockHash], blocks: Iterable[BlockStatus]
+    ) -> LoadStoreSpec:
         return CPULoadStoreSpec([block.block_id for block in blocks])
diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py
index b85d375fe63e28ad63e5a3f8b801af696c2652fb..4b1bbe6f0cc2a6e7aaa9291199d6cfb70f377b76 100644
--- a/vllm/v1/kv_offload/cpu.py
+++ b/vllm/v1/kv_offload/cpu.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterator
-from typing import Optional
 
 import torch
 
@@ -9,6 +8,7 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.platforms import current_platform
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
 from vllm.v1.kv_offload.backends.cpu import CPUBackend
 from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
@@ -18,46 +18,63 @@ from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 
 class CPUOffloadingSpec(OffloadingSpec):
-
     def __init__(self, vllm_config: VllmConfig):
         super().__init__(vllm_config)
 
         num_cpu_blocks = self.extra_config.get("num_cpu_blocks")
         if not num_cpu_blocks:
-            raise Exception("num_cpu_blocks must be specified "
-                            "in kv_connector_extra_config")
+            raise Exception(
+                "num_cpu_blocks must be specified in kv_connector_extra_config"
+            )
         self.num_cpu_blocks: int = num_cpu_blocks
 
         # scheduler-side
-        self._manager: Optional[OffloadingManager] = None
+        self._manager: OffloadingManager | None = None
 
         # worker-side
-        self._handler: Optional[OffloadingHandler] = None
+        self._handler: OffloadingHandler | None = None
+
+        self.eviction_policy: str = self.extra_config.get("eviction_policy", "lru")
 
     def get_manager(self) -> OffloadingManager:
         if not self._manager:
             kv_events_config = self.vllm_config.kv_events_config
-            enable_events = (kv_events_config is not None
-                             and kv_events_config.enable_kv_cache_events)
-            self._manager = LRUOffloadingManager(CPUBackend(
-                block_size=self.offloaded_block_size,
-                num_blocks=self.num_cpu_blocks),
-                                                 enable_events=enable_events)
+            enable_events = (
+                kv_events_config is not None and kv_events_config.enable_kv_cache_events
+            )
+
+            backend = CPUBackend(
+                block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks
+            )
+
+            if self.eviction_policy == "lru":
+                self._manager = LRUOffloadingManager(
+                    backend=backend, enable_events=enable_events
+                )
+            elif self.eviction_policy == "arc":
+                self._manager = ARCOffloadingManager(
+                    backend=backend, enable_events=enable_events
+                )
+            else:
+                raise ValueError(
+                    f"Unknown eviction policy: {self.eviction_policy}. "
+                    f"Supported policies: lru, arc"
+                )
         return self._manager
 
     def get_handlers(
         self, kv_caches: dict[str, torch.Tensor]
-    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
-                        OffloadingHandler]]:
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
         if not self._handler:
-            if not current_platform.is_cuda():
-                raise Exception("CPU Offloading is currently only supported"
-                                " on CUDA GPUs")
+            if not current_platform.is_cuda_alike():
+                raise Exception(
+                    "CPU Offloading is currently only supported on CUDA-alike GPUs"
+                )
 
             layer_names = list(kv_caches.keys())
-            layers = get_layers_from_vllm_config(self.vllm_config,
-                                                 AttentionLayerBase,
-                                                 layer_names)
+            layers = get_layers_from_vllm_config(
+                self.vllm_config, AttentionLayerBase, layer_names
+            )
             attn_backends = {
                 layer_name: layers[layer_name].get_attn_backend()
                 for layer_name in layer_names
@@ -68,7 +85,8 @@ class CPUOffloadingSpec(OffloadingSpec):
                 gpu_block_size=self.gpu_block_size,
                 cpu_block_size=self.offloaded_block_size,
                 num_cpu_blocks=self.num_cpu_blocks,
-                gpu_caches=kv_caches)
+                gpu_caches=kv_caches,
+            )
 
         assert self._handler is not None
         yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py
index f9bef6cea903820e767d5811747d56c8fc2ec177..b4d40cb48e1d18b0f192d55e7617e76c92e6dfe3 100644
--- a/vllm/v1/kv_offload/factory.py
+++ b/vllm/v1/kv_offload/factory.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
-from typing import TYPE_CHECKING, Callable
+from collections.abc import Callable
+from typing import TYPE_CHECKING
 
 from vllm.logger import init_logger
 from vllm.v1.kv_offload.spec import OffloadingSpec
@@ -16,8 +17,7 @@ class OffloadingSpecFactory:
     _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {}
 
     @classmethod
-    def register_spec(cls, name: str, module_path: str,
-                      class_name: str) -> None:
+    def register_spec(cls, name: str, module_path: str, class_name: str) -> None:
         """Register a spec with a lazy-loading module and class name."""
         if name in cls._registry:
             raise ValueError(f"Connector '{name}' is already registered.")
@@ -51,6 +51,6 @@ class OffloadingSpecFactory:
 
 
 # Register various specs here.
-OffloadingSpecFactory.register_spec("CPUOffloadingSpec",
-                                    "vllm.v1.kv_offload.cpu",
-                                    "CPUOffloadingSpec")
+OffloadingSpecFactory.register_spec(
+    "CPUOffloadingSpec", "vllm.v1.kv_offload.cpu", "CPUOffloadingSpec"
+)
diff --git a/vllm/v1/kv_offload/lru_manager.py b/vllm/v1/kv_offload/lru_manager.py
index 18d3b1d637b322a662ce9da49a6441017f16aae8..0a0111f8879052d176a5625362619a10c3b93217 100644
--- a/vllm/v1/kv_offload/lru_manager.py
+++ b/vllm/v1/kv_offload/lru_manager.py
@@ -2,11 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import OrderedDict
 from collections.abc import Iterable
-from typing import Optional
 
 from vllm.v1.core.kv_cache_utils import BlockHash
-from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
-                                         OffloadingManager, PrepareStoreOutput)
+from vllm.v1.kv_offload.abstract import (
+    LoadStoreSpec,
+    OffloadingEvent,
+    OffloadingManager,
+    PrepareStoreOutput,
+)
 from vllm.v1.kv_offload.backend import Backend, BlockStatus
 
 
@@ -19,8 +22,7 @@ class LRUOffloadingManager(OffloadingManager):
         self.backend: Backend = backend
         # block_hash -> BlockStatus
         self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
-        self.events: Optional[list[OffloadingEvent]] = \
-            [] if enable_events else None
+        self.events: list[OffloadingEvent] | None = [] if enable_events else None
 
     def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
         hit_count = 0
@@ -53,16 +55,16 @@ class LRUOffloadingManager(OffloadingManager):
             block.ref_cnt -= 1
 
     def prepare_store(
-            self,
-            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        self, block_hashes: Iterable[BlockHash]
+    ) -> PrepareStoreOutput | None:
         # filter out blocks that are already stored
         block_hashes_to_store = [
-            block_hash for block_hash in block_hashes
-            if block_hash not in self.blocks
+            block_hash for block_hash in block_hashes if block_hash not in self.blocks
         ]
 
-        num_blocks_to_evict = (len(block_hashes_to_store) -
-                               self.backend.get_num_free_blocks())
+        num_blocks_to_evict = (
+            len(block_hashes_to_store) - self.backend.get_num_free_blocks()
+        )
 
         # build list of blocks to evict
         to_evict = []
@@ -83,10 +85,13 @@ class LRUOffloadingManager(OffloadingManager):
 
         if to_evict and self.events is not None:
             self.events.append(
-                OffloadingEvent(block_hashes=to_evict,
-                                block_size=self.backend.block_size,
-                                medium=self.backend.medium,
-                                removed=True))
+                OffloadingEvent(
+                    block_hashes=to_evict,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=True,
+                )
+            )
 
         blocks = self.backend.allocate_blocks(block_hashes_to_store)
         assert len(blocks) == len(block_hashes_to_store)
@@ -95,16 +100,15 @@ class LRUOffloadingManager(OffloadingManager):
             self.blocks[block_hash] = block
 
         # build store specs for allocated blocks
-        store_spec = self.backend.get_load_store_spec(block_hashes_to_store,
-                                                      blocks)
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store, blocks)
 
-        return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store,
-                                  store_spec=store_spec,
-                                  block_hashes_evicted=to_evict)
+        return PrepareStoreOutput(
+            block_hashes_to_store=block_hashes_to_store,
+            store_spec=store_spec,
+            block_hashes_evicted=to_evict,
+        )
 
-    def complete_store(self,
-                       block_hashes: Iterable[BlockHash],
-                       success: bool = True):
+    def complete_store(self, block_hashes: Iterable[BlockHash], success: bool = True):
         stored_block_hashes: list[BlockHash] = []
         if success:
             for block_hash in block_hashes:
@@ -121,10 +125,13 @@ class LRUOffloadingManager(OffloadingManager):
 
         if stored_block_hashes and self.events is not None:
             self.events.append(
-                OffloadingEvent(block_hashes=stored_block_hashes,
-                                block_size=self.backend.block_size,
-                                medium=self.backend.medium,
-                                removed=False))
+                OffloadingEvent(
+                    block_hashes=stored_block_hashes,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=False,
+                )
+            )
 
     def take_events(self) -> Iterable[OffloadingEvent]:
         if self.events is not None:
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py
index ed23d5e51934ff1ca1aafce06469961ef8019d0b..a3c539a47d4585e62ac30528fb413f19660bb92d 100644
--- a/vllm/v1/kv_offload/spec.py
+++ b/vllm/v1/kv_offload/spec.py
@@ -22,7 +22,8 @@ class OffloadingSpec(ABC):
     def __init__(self, vllm_config: "VllmConfig"):
         logger.warning(
             "Initializing OffloadingSpec. This API is experimental and "
-            "subject to change in the future as we iterate the design.")
+            "subject to change in the future as we iterate the design."
+        )
         self.vllm_config = vllm_config
 
         kv_transfer_config = vllm_config.kv_transfer_config
@@ -31,7 +32,8 @@ class OffloadingSpec(ABC):
 
         self.gpu_block_size = vllm_config.cache_config.block_size
         self.offloaded_block_size = int(
-            self.extra_config.get("block_size", self.gpu_block_size))
+            self.extra_config.get("block_size", self.gpu_block_size)
+        )
 
         assert self.offloaded_block_size % self.gpu_block_size == 0
 
@@ -47,8 +49,7 @@ class OffloadingSpec(ABC):
     @abstractmethod
     def get_handlers(
         self, kv_caches: dict[str, torch.Tensor]
-    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
-                        OffloadingHandler]]:
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
         """
         Get offloading handlers along with their respective src and dst types.
 
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index 556c29247e5e7b7a2c7efd6a7286d469ddee0273..646f9d0d754239eb739f1334803f1c3fd4784af1 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -7,18 +7,23 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.attention import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
-from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
-                                              TransferResult, TransferSpec)
+from vllm.v1.kv_offload.worker.worker import (
+    OffloadingHandler,
+    TransferResult,
+    TransferSpec,
+)
 
 logger = init_logger(__name__)
 
 
-def expand_block_ids(block_ids: np.ndarray,
-                     block_size_factor: int,
-                     output: np.ndarray,
-                     skip_count: int = 0):
+def expand_block_ids(
+    block_ids: np.ndarray,
+    block_size_factor: int,
+    output: np.ndarray,
+    skip_count: int = 0,
+):
     """
     Convert a list of block IDs to a list of matching block ids,
     assuming each block is composed of actual block_size_factor blocks.
@@ -47,10 +52,14 @@ def expand_block_ids(block_ids: np.ndarray,
 
 
 class CpuGpuOffloadingHandler(OffloadingHandler):
-
-    def __init__(self, gpu_block_size: int, cpu_block_size: int,
-                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
-                 attn_backends: dict[str, type[AttentionBackend]]):
+    def __init__(
+        self,
+        gpu_block_size: int,
+        cpu_block_size: int,
+        num_cpu_blocks: int,
+        gpu_caches: dict[str, torch.Tensor],
+        attn_backends: dict[str, type[AttentionBackend]],
+    ):
         assert cpu_block_size % gpu_block_size == 0
         self.block_size_factor = cpu_block_size // gpu_block_size
 
@@ -75,7 +84,8 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
 
             gpu_shape = gpu_tensor.shape
             test_shape = attn_backends[layer_name].get_kv_cache_shape(
-                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
+                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256
+            )
             if test_shape[0] == 1234:
                 # shape is (num_blocks, ...)
                 num_blocks_idx = 0
@@ -94,10 +104,13 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
 
             logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
             self.cpu_tensors.append(
-                torch.zeros(cpu_shape,
-                            dtype=gpu_tensor.dtype,
-                            device="cpu",
-                            pin_memory=pin_memory))
+                torch.zeros(
+                    cpu_shape,
+                    dtype=gpu_tensor.dtype,
+                    device="cpu",
+                    pin_memory=pin_memory,
+                )
+            )
 
     def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
         src_spec, dst_spec = spec
@@ -122,35 +135,36 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
         assert src_blocks.ndim == 1
         assert dst_blocks.ndim == 1
 
-        dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor)
+        dst_sub_blocks_to_skip = -src_blocks.size % dst_block_size_factor
         src_sub_block_count = src_blocks.size * src_block_size_factor
 
         assert (
-            src_sub_block_count == dst_blocks.size * dst_block_size_factor -
-            dst_sub_blocks_to_skip)
+            src_sub_block_count
+            == dst_blocks.size * dst_block_size_factor - dst_sub_blocks_to_skip
+        )
 
         src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64)
         expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0])
-        expand_block_ids(dst_blocks,
-                         dst_block_size_factor,
-                         src_to_dst[:, 1],
-                         skip_count=dst_sub_blocks_to_skip)
+        expand_block_ids(
+            dst_blocks,
+            dst_block_size_factor,
+            src_to_dst[:, 1],
+            skip_count=dst_sub_blocks_to_skip,
+        )
         src_to_dst_tensor = torch.from_numpy(src_to_dst)
 
-        event = self.events_pool.pop() if self.events_pool \
-            else torch.cuda.Event()
+        event = self.events_pool.pop() if self.events_pool else torch.cuda.Event()
         with torch.cuda.stream(stream):
             for src_tensor, dst_tensor, kv_dim in zip(
-                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
+                src_tensors, dst_tensors, self.kv_dim_before_num_blocks
+            ):
                 if kv_dim:
                     src_key_cache = src_tensor[0]
                     dst_key_cache = dst_tensor[0]
-                    ops.swap_blocks(src_key_cache, dst_key_cache,
-                                    src_to_dst_tensor)
+                    ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst_tensor)
                     src_value_cache = src_tensor[1]
                     dst_value_cache = dst_tensor[1]
-                    ops.swap_blocks(src_value_cache, dst_value_cache,
-                                    src_to_dst_tensor)
+                    ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst_tensor)
                 else:
                     ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
             event.record(stream)
diff --git a/vllm/v1/kv_offload/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py
index b7a52a088fb909d04b76323c68da0bea591086e7..58ba082497fa88b1955e4b64457fddf5e80bef9f 100644
--- a/vllm/v1/kv_offload/worker/worker.py
+++ b/vllm/v1/kv_offload/worker/worker.py
@@ -74,12 +74,14 @@ class OffloadingWorker:
 
     def __init__(self):
         self.handlers: set[OffloadingHandler] = set()
-        self.transfer_type_to_handler: dict[TransferType,
-                                            OffloadingHandler] = {}
-
-    def register_handler(self, src_cls: type[LoadStoreSpec],
-                         dst_cls: type[LoadStoreSpec],
-                         handler: OffloadingHandler) -> None:
+        self.transfer_type_to_handler: dict[TransferType, OffloadingHandler] = {}
+
+    def register_handler(
+        self,
+        src_cls: type[LoadStoreSpec],
+        dst_cls: type[LoadStoreSpec],
+        handler: OffloadingHandler,
+    ) -> None:
         """
         Registers a new handler.
 
@@ -113,19 +115,19 @@ class OffloadingWorker:
         try:
             success = handler.transfer_async(job_id, spec)
         except Exception as e:
-            logger.warning("Exception in %r transfer %d: %r",
-                           transfer_type,
-                           job_id,
-                           e,
-                           exc_info=True)
+            logger.warning(
+                "Exception in %r transfer %d: %r",
+                transfer_type,
+                job_id,
+                e,
+                exc_info=True,
+            )
             return False
 
         if not success:
-            logger.warning("Failed to submit %r transfer %d", transfer_type,
-                           job_id)
+            logger.warning("Failed to submit %r transfer %d", transfer_type, job_id)
         else:
-            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id,
-                         spec)
+            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id, spec)
 
         return success
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index d68d111c67ca6bbaf48f8467bebf10046eb696e0..cb36e7973650eea2a61b1f6d7a9088bc98e9073f 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -4,23 +4,34 @@
 import logging
 import time
 from abc import ABC, abstractmethod
-from typing import Callable, Optional, Union
+from collections.abc import Callable
+from typing import TypeAlias
 
-import prometheus_client
+from prometheus_client import Counter, Gauge, Histogram
 
+import vllm.envs as envs
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-    KVConnectorLogging)
+    KVConnectorLogging,
+    KVConnectorPrometheus,
+)
 from vllm.logger import init_logger
-from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
+from vllm.plugins import load_plugins_by_group
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.prometheus import unregister_vllm_metrics
-from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.metrics.stats import (
+    CachingMetrics,
+    IterationStats,
+    MultiModalCacheStats,
+    SchedulerStats,
+)
 from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm
 
 logger = init_logger(__name__)
 
-StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]
+PerEngineStatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]
+AggregateStatLoggerFactory = type["AggregateStatLoggerBase"]
+StatLoggerFactory = AggregateStatLoggerFactory | PerEngineStatLoggerFactory
 
 
 class StatLoggerBase(ABC):
@@ -32,39 +43,73 @@ class StatLoggerBase(ABC):
     """
 
     @abstractmethod
-    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
-        ...
+    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): ...
 
     @abstractmethod
-    def record(self,
-               scheduler_stats: Optional[SchedulerStats],
-               iteration_stats: Optional[IterationStats],
-               engine_idx: int = 0):
-        ...
+    def record(
+        self,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int = 0,
+    ): ...
 
     @abstractmethod
-    def log_engine_initialized(self):
-        ...
+    def log_engine_initialized(self): ...
 
     def log(self):  # noqa
         pass
 
+    def record_sleep_state(self, is_awake: int, level: int):  # noqa
+        pass
 
-class LoggingStatLogger(StatLoggerBase):
 
+def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
+    factories: list[StatLoggerFactory] = []
+
+    for name, plugin_class in load_plugins_by_group("vllm.stat_logger_plugins").items():
+        if not isinstance(plugin_class, type) or not issubclass(
+            plugin_class, StatLoggerBase
+        ):
+            raise TypeError(
+                f"Stat logger plugin {name!r} must be a subclass of "
+                f"StatLoggerBase (got {plugin_class!r})."
+            )
+
+        factories.append(plugin_class)
+
+    return factories
+
+
+class AggregateStatLoggerBase(StatLoggerBase):
+    """Abstract base class for loggers that
+    aggregate across multiple DP engines."""
+
+    @abstractmethod
+    def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): ...
+
+
+class LoggingStatLogger(StatLoggerBase):
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self.engine_index = engine_index
         self.vllm_config = vllm_config
         self._reset(time.monotonic())
+
         self.last_scheduler_stats = SchedulerStats()
-        # Prefix cache metrics. This cannot be reset.
+
+        # Caching metrics. This cannot be reset.
         # TODO: Make the interval configurable.
-        self.prefix_caching_metrics = PrefixCachingMetrics()
+        self.prefix_caching_metrics = CachingMetrics()
+        self.connector_prefix_caching_metrics = CachingMetrics()
+        self.mm_caching_metrics = CachingMetrics()
+
         self.spec_decoding_logging = SpecDecodingLogging()
         kv_tranfer_config = self.vllm_config.kv_transfer_config
-        self.kv_transfer_logging = KVConnectorLogging(kv_tranfer_config)
+        self.kv_connector_logging = KVConnectorLogging(kv_tranfer_config)
         self.last_prompt_throughput: float = 0.0
         self.last_generation_throughput: float = 0.0
+        self.engine_is_idle = False
+        self.aggregated = False
 
     def _reset(self, now):
         self.last_log_time = now
@@ -72,11 +117,15 @@ class LoggingStatLogger(StatLoggerBase):
         # Tracked stats over current local logging interval.
         self.num_prompt_tokens: int = 0
         self.num_generation_tokens: int = 0
+        self.num_corrupted_reqs: int = 0
+        self.num_preemptions: int = 0
 
     def _track_iteration_stats(self, iteration_stats: IterationStats):
         # Save tracked stats for token counters.
         self.num_prompt_tokens += iteration_stats.num_prompt_tokens
         self.num_generation_tokens += iteration_stats.num_generation_tokens
+        self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
+        self.num_preemptions += iteration_stats.num_preempted_reqs
 
     def _get_throughput(self, tracked_stats: int, now: float) -> float:
         # Compute summary metrics for tracked stats
@@ -85,103 +134,262 @@ class LoggingStatLogger(StatLoggerBase):
             return 0.0
         return float(tracked_stats / delta_time)
 
-    def record(self,
-               scheduler_stats: Optional[SchedulerStats],
-               iteration_stats: Optional[IterationStats],
-               engine_idx: int = 0):
+    @property
+    def log_prefix(self):
+        return "Engine {:03d}: ".format(self.engine_index)
+
+    def record(
+        self,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int = 0,
+    ):
         """Log Stats to standard output."""
         if iteration_stats:
             self._track_iteration_stats(iteration_stats)
 
         if scheduler_stats is not None:
-            self.prefix_caching_metrics.observe(
-                scheduler_stats.prefix_cache_stats)
+            self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+
+            if scheduler_stats.connector_prefix_cache_stats is not None:
+                self.connector_prefix_caching_metrics.observe(
+                    scheduler_stats.connector_prefix_cache_stats
+                )
 
             if scheduler_stats.spec_decoding_stats is not None:
-                self.spec_decoding_logging.observe(
-                    scheduler_stats.spec_decoding_stats)
+                self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats)
             if kv_connector_stats := scheduler_stats.kv_connector_stats:
-                self.kv_transfer_logging.observe(kv_connector_stats)
-            self.last_scheduler_stats = scheduler_stats
+                self.kv_connector_logging.observe(kv_connector_stats)
+            if not self.aggregated:
+                self.last_scheduler_stats = scheduler_stats
+        if mm_cache_stats:
+            self.mm_caching_metrics.observe(mm_cache_stats)
 
-    def log(self):
+    def _update_stats(self):
         now = time.monotonic()
         prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
-        generation_throughput = self._get_throughput(
-            self.num_generation_tokens, now)
+        generation_throughput = self._get_throughput(self.num_generation_tokens, now)
 
         self._reset(now)
-
-        scheduler_stats = self.last_scheduler_stats
-
-        log_fn = logger.info
-        if not any(
-            (prompt_throughput, generation_throughput,
-             self.last_prompt_throughput, self.last_generation_throughput)):
-            # Avoid log noise on an idle production system
-            log_fn = logger.debug
+        self.engine_is_idle = not any(
+            (
+                prompt_throughput,
+                generation_throughput,
+                self.last_prompt_throughput,
+                self.last_generation_throughput,
+            )
+        )
         self.last_generation_throughput = generation_throughput
         self.last_prompt_throughput = prompt_throughput
 
+    def aggregate_scheduler_stats(self):
+        # noop for per engine loggers
+        return
+
+    def log(self):
+        self._update_stats()
+        self.aggregate_scheduler_stats()
+        # Avoid log noise on an idle production system
+        log_fn = logger.debug if self.engine_is_idle else logger.info
         # Format and print output.
+        log_parts = [
+            "Avg prompt throughput: %.1f tokens/s",
+            "Avg generation throughput: %.1f tokens/s",
+            "Running: %d reqs",
+            "Waiting: %d reqs",
+        ]
+        log_args = [
+            self.last_prompt_throughput,
+            self.last_generation_throughput,
+            self.last_scheduler_stats.num_running_reqs,
+            self.last_scheduler_stats.num_waiting_reqs,
+        ]
+
+        if self.num_preemptions > 0:
+            log_parts.append("Preemptions: %d")
+            log_args.append(self.num_preemptions)
+
+        log_parts.extend(
+            [
+                "GPU KV cache usage: %.1f%%",
+                "Prefix cache hit rate: %.1f%%",
+            ]
+        )
+        log_args.extend(
+            [
+                self.last_scheduler_stats.kv_cache_usage * 100,
+                self.prefix_caching_metrics.hit_rate * 100,
+            ]
+        )
+
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            log_parts.append("Corrupted: %d reqs")
+            log_args.append(self.num_corrupted_reqs)
+        if not self.connector_prefix_caching_metrics.empty:
+            log_parts.append("External prefix cache hit rate: %.1f%%")
+            log_args.append(self.connector_prefix_caching_metrics.hit_rate * 100)
+        if not self.mm_caching_metrics.empty:
+            log_parts.append("MM cache hit rate: %.1f%%")
+            log_args.append(self.mm_caching_metrics.hit_rate * 100)
+
         log_fn(
-            "Engine %03d: "
-            "Avg prompt throughput: %.1f tokens/s, "
-            "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs, "
-            "GPU KV cache usage: %.1f%%, "
-            "Prefix cache hit rate: %.1f%%",
-            self.engine_index,
-            prompt_throughput,
-            generation_throughput,
-            scheduler_stats.num_running_reqs,
-            scheduler_stats.num_waiting_reqs,
-            scheduler_stats.kv_cache_usage * 100,
-            self.prefix_caching_metrics.hit_rate * 100,
+            self.log_prefix + ", ".join(log_parts),
+            *log_args,
         )
+
         self.spec_decoding_logging.log(log_fn=log_fn)
-        self.kv_transfer_logging.log(log_fn=log_fn)
+        self.kv_connector_logging.log(log_fn=log_fn)
 
     def log_engine_initialized(self):
         if self.vllm_config.cache_config.num_gpu_blocks:
-            logger.info(
+            logger.debug(
                 "Engine %03d: vllm cache_config_info with initialization "
-                "after num_gpu_blocks is: %d", self.engine_index,
-                self.vllm_config.cache_config.num_gpu_blocks)
+                "after num_gpu_blocks is: %d",
+                self.engine_index,
+                self.vllm_config.cache_config.num_gpu_blocks,
+            )
+
+
+class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        engine_indexes: list[int],
+    ):
+        self.engine_indexes = engine_indexes
+        self.last_scheduler_stats_dict: dict[int, SchedulerStats] = {
+            idx: SchedulerStats() for idx in self.engine_indexes
+        }
+        LoggingStatLogger.__init__(self, vllm_config, engine_index=-1)
+        self.aggregated = True
+
+    @property
+    def log_prefix(self):
+        return "{} Engines Aggregated: ".format(len(self.engine_indexes))
+
+    def record(
+        self,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int = 0,
+    ):
+        if engine_idx not in self.engine_indexes:
+            logger.warning("Unexpected engine_idx: %d", engine_idx)
+            return
+        LoggingStatLogger.record(
+            self,
+            scheduler_stats,
+            iteration_stats,
+            mm_cache_stats=mm_cache_stats,
+            engine_idx=engine_idx,
+        )
+        if scheduler_stats is not None:
+            self.last_scheduler_stats_dict[engine_idx] = scheduler_stats
+
+    def aggregate_scheduler_stats(self):
+        self.last_scheduler_stats = SchedulerStats()
+        for last_scheduler_stats in self.last_scheduler_stats_dict.values():
+            self.last_scheduler_stats.num_waiting_reqs += (
+                last_scheduler_stats.num_waiting_reqs
+            )
+            self.last_scheduler_stats.num_running_reqs += (
+                last_scheduler_stats.num_running_reqs
+            )
+            self.last_scheduler_stats.kv_cache_usage += (
+                last_scheduler_stats.kv_cache_usage
+            )
+        self.last_scheduler_stats.kv_cache_usage /= len(self.last_scheduler_stats_dict)
+
+    def log(self):
+        LoggingStatLogger.log(self)
 
+    def log_engine_initialized(self):
+        if self.vllm_config.cache_config.num_gpu_blocks:
+            logger.info(
+                "%d Engines: vllm cache_config_info with initialization "
+                "after num_gpu_blocks is: %d",
+                len(self.engine_indexes),
+                self.vllm_config.cache_config.num_gpu_blocks,
+            )
+
+
+class PerEngineStatLoggerAdapter(AggregateStatLoggerBase):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        engine_indexes: list[int],
+        per_engine_stat_logger_factory: PerEngineStatLoggerFactory,
+    ) -> None:
+        self.per_engine_stat_loggers = {}
+        self.engine_indexes = engine_indexes
+        for engine_index in engine_indexes:
+            self.per_engine_stat_loggers[engine_index] = per_engine_stat_logger_factory(
+                vllm_config, engine_index
+            )
+
+    def record(
+        self,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int = 0,
+    ):
+        if engine_idx not in self.per_engine_stat_loggers:
+            logger.warning("Unexpected engine_idx: %d", engine_idx)
+            return
+        self.per_engine_stat_loggers[engine_idx].record(
+            scheduler_stats,
+            iteration_stats,
+            mm_cache_stats=mm_cache_stats,
+            engine_idx=engine_idx,
+        )
+
+    def log(self):
+        for per_engine_stat_logger in self.per_engine_stat_loggers.values():
+            per_engine_stat_logger.log()
+
+    def log_engine_initialized(self):
+        for per_engine_stat_logger in self.per_engine_stat_loggers.values():
+            per_engine_stat_logger.log_engine_initialized()
 
-class PrometheusStatLogger(StatLoggerBase):
-    _gauge_cls = prometheus_client.Gauge
-    _counter_cls = prometheus_client.Counter
-    _histogram_cls = prometheus_client.Histogram
+
+class PrometheusStatLogger(AggregateStatLoggerBase):
+    _gauge_cls = Gauge
+    _counter_cls = Counter
+    _histogram_cls = Histogram
     _spec_decoding_cls = SpecDecodingProm
+    _kv_connector_cls = KVConnectorPrometheus
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 engine_indexes: Optional[list[int]] = None):
+    def __init__(
+        self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
+    ):
         if engine_indexes is None:
             engine_indexes = [0]
+
         self.engine_indexes = engine_indexes
 
         unregister_vllm_metrics()
         self.vllm_config = vllm_config
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
-        self.show_hidden_metrics = \
-            vllm_config.observability_config.show_hidden_metrics
+        self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics
 
         labelnames = ["model_name", "engine"]
         model_name = vllm_config.model_config.served_model_name
         max_model_len = vllm_config.model_config.max_model_len
 
-        spec_decode_labelvalues: dict[int, list[str]] = {
-            idx: [model_name, str(idx)]
-            for idx in engine_indexes
+        per_engine_labelvalues: dict[int, list[str]] = {
+            idx: [model_name, str(idx)] for idx in engine_indexes
         }
 
         self.spec_decoding_prom = self._spec_decoding_cls(
-            vllm_config.speculative_config, labelnames,
-            spec_decode_labelvalues)
+            vllm_config.speculative_config, labelnames, per_engine_labelvalues
+        )
+        self.kv_connector_prom = self._kv_connector_cls(
+            vllm_config, labelnames, per_engine_labelvalues
+        )
 
         #
         # Scheduler state
@@ -190,21 +398,48 @@ class PrometheusStatLogger(StatLoggerBase):
             name="vllm:num_requests_running",
             documentation="Number of requests in model execution batches.",
             multiprocess_mode="mostrecent",
-            labelnames=labelnames)
-        self.gauge_scheduler_running = make_per_engine(gauge_scheduler_running,
-                                                       engine_indexes,
-                                                       model_name)
+            labelnames=labelnames,
+        )
+        self.gauge_scheduler_running = make_per_engine(
+            gauge_scheduler_running, engine_indexes, model_name
+        )
 
         gauge_scheduler_waiting = self._gauge_cls(
             name="vllm:num_requests_waiting",
             documentation="Number of requests waiting to be processed.",
             multiprocess_mode="mostrecent",
-            labelnames=labelnames)
-        self.gauge_scheduler_waiting = make_per_engine(gauge_scheduler_waiting,
-                                                       engine_indexes,
-                                                       model_name)
+            labelnames=labelnames,
+        )
+        self.gauge_scheduler_waiting = make_per_engine(
+            gauge_scheduler_waiting, engine_indexes, model_name
+        )
+
+        gauge_engine_sleep_state = self._gauge_cls(
+            name="vllm:engine_sleep_state",
+            documentation=(
+                "Engine sleep state; awake = 0 means engine is sleeping; "
+                "awake = 1 means engine is awake; "
+                "weights_offloaded = 1 means sleep level 1; "
+                "discard_all = 1 means sleep level 2."
+            ),
+            labelnames=labelnames + ["sleep_state"],
+            multiprocess_mode="mostrecent",
+        )
+
+        self.gauge_engine_sleep_state = {}
+        sleep_state = ["awake", "weights_offloaded", "discard_all"]
+
+        for s in sleep_state:
+            self.gauge_engine_sleep_state[s] = {
+                idx: gauge_engine_sleep_state.labels(
+                    engine=idx, model_name=model_name, sleep_state=s
+                )
+                for idx in engine_indexes
+            }
+
+        # Setting default values
+        self.record_sleep_state()
 
-        #
         # GPU cache
         #
         # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
@@ -215,11 +450,14 @@ class PrometheusStatLogger(StatLoggerBase):
                 name="vllm:gpu_cache_usage_perc",
                 documentation=(
                     "GPU KV-cache usage. 1 means 100 percent usage."
-                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
+                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."
+                ),
                 multiprocess_mode="mostrecent",
-                labelnames=labelnames)
+                labelnames=labelnames,
+            )
             self.gauge_gpu_cache_usage = make_per_engine(
-                gauge_gpu_cache_usage, engine_indexes, model_name)
+                gauge_gpu_cache_usage, engine_indexes, model_name
+            )
 
         # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
         # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
@@ -231,9 +469,11 @@ class PrometheusStatLogger(StatLoggerBase):
                     "GPU prefix cache queries, in terms of number of queried"
                     "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
                 ),
-                labelnames=labelnames)
+                labelnames=labelnames,
+            )
             self.counter_gpu_prefix_cache_queries = make_per_engine(
-                counter_gpu_prefix_cache_queries, engine_indexes, model_name)
+                counter_gpu_prefix_cache_queries, engine_indexes, model_name
+            )
 
         # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
         # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
@@ -243,33 +483,110 @@ class PrometheusStatLogger(StatLoggerBase):
                 name="vllm:gpu_prefix_cache_hits",
                 documentation=(
                     "GPU prefix cache hits, in terms of number of cached "
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
-                labelnames=labelnames)
+                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
+                ),
+                labelnames=labelnames,
+            )
             self.counter_gpu_prefix_cache_hits = make_per_engine(
-                counter_gpu_prefix_cache_hits, engine_indexes, model_name)
+                counter_gpu_prefix_cache_hits, engine_indexes, model_name
+            )
 
         gauge_kv_cache_usage = self._gauge_cls(
             name="vllm:kv_cache_usage_perc",
             documentation="KV-cache usage. 1 means 100 percent usage.",
-            labelnames=labelnames)
-        self.gauge_kv_cache_usage = make_per_engine(gauge_kv_cache_usage,
-                                                    engine_indexes, model_name)
+            multiprocess_mode="mostrecent",
+            labelnames=labelnames,
+        )
+        self.gauge_kv_cache_usage = make_per_engine(
+            gauge_kv_cache_usage, engine_indexes, model_name
+        )
+
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            counter_corrupted_requests = self._counter_cls(
+                name="vllm:corrupted_requests",
+                documentation=(
+                    "Corrupted requests, in terms of total number of requests "
+                    "with NaNs in logits."
+                ),
+                labelnames=labelnames,
+            )
+            self.counter_corrupted_requests = make_per_engine(
+                counter_corrupted_requests, engine_indexes, model_name
+            )
 
         counter_prefix_cache_queries = self._counter_cls(
             name="vllm:prefix_cache_queries",
             documentation=(
-                "Prefix cache queries, in terms of number of queried tokens."),
-            labelnames=labelnames)
+                "Prefix cache queries, in terms of number of queried tokens."
+            ),
+            labelnames=labelnames,
+        )
         self.counter_prefix_cache_queries = make_per_engine(
-            counter_prefix_cache_queries, engine_indexes, model_name)
+            counter_prefix_cache_queries, engine_indexes, model_name
+        )
 
         counter_prefix_cache_hits = self._counter_cls(
             name="vllm:prefix_cache_hits",
-            documentation=(
-                "Prefix cache hits, in terms of number of cached tokens."),
-            labelnames=labelnames)
+            documentation=("Prefix cache hits, in terms of number of cached tokens."),
+            labelnames=labelnames,
+        )
         self.counter_prefix_cache_hits = make_per_engine(
-            counter_prefix_cache_hits, engine_indexes, model_name)
+            counter_prefix_cache_hits, engine_indexes, model_name
+        )
+
+        #
+        # External - KV connector prefix cache
+        #
+
+        counter_connector_prefix_cache_queries = self._counter_cls(
+            name="vllm:external_prefix_cache_queries",
+            documentation=(
+                "External prefix cache queries from KV connector "
+                "cross-instance cache sharing, in terms of number of queried tokens."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_connector_prefix_cache_queries = make_per_engine(
+            counter_connector_prefix_cache_queries, engine_indexes, model_name
+        )
+
+        counter_connector_prefix_cache_hits = self._counter_cls(
+            name="vllm:external_prefix_cache_hits",
+            documentation=(
+                "External prefix cache hits from KV connector "
+                "cross-instance cache sharing, in terms of number of cached tokens."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_connector_prefix_cache_hits = make_per_engine(
+            counter_connector_prefix_cache_hits, engine_indexes, model_name
+        )
+
+        #
+        # Multi-modal cache
+        #
+
+        counter_mm_cache_queries = self._counter_cls(
+            name="vllm:mm_cache_queries",
+            documentation=(
+                "Multi-modal cache queries, in terms of number of queried items."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_mm_cache_queries = make_per_engine(
+            counter_mm_cache_queries, engine_indexes, model_name
+        )
+
+        counter_mm_cache_hits = self._counter_cls(
+            name="vllm:mm_cache_hits",
+            documentation=(
+                "Multi-modal cache hits, in terms of number of cached items."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_mm_cache_hits = make_per_engine(
+            counter_mm_cache_hits, engine_indexes, model_name
+        )
 
         #
         # Counters
@@ -277,36 +594,41 @@ class PrometheusStatLogger(StatLoggerBase):
         counter_num_preempted_reqs = self._counter_cls(
             name="vllm:num_preemptions",
             documentation="Cumulative number of preemption from the engine.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_num_preempted_reqs = make_per_engine(
-            counter_num_preempted_reqs, engine_indexes, model_name)
+            counter_num_preempted_reqs, engine_indexes, model_name
+        )
 
         counter_prompt_tokens = self._counter_cls(
             name="vllm:prompt_tokens",
             documentation="Number of prefill tokens processed.",
-            labelnames=labelnames)
-        self.counter_prompt_tokens = make_per_engine(counter_prompt_tokens,
-                                                     engine_indexes,
-                                                     model_name)
+            labelnames=labelnames,
+        )
+        self.counter_prompt_tokens = make_per_engine(
+            counter_prompt_tokens, engine_indexes, model_name
+        )
 
         counter_generation_tokens = self._counter_cls(
             name="vllm:generation_tokens",
             documentation="Number of generation tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_generation_tokens = make_per_engine(
-            counter_generation_tokens, engine_indexes, model_name)
+            counter_generation_tokens, engine_indexes, model_name
+        )
 
-        self.counter_request_success: dict[FinishReason, dict[
-            int, prometheus_client.Counter]] = {}
+        self.counter_request_success: dict[FinishReason, dict[int, Counter]] = {}
         counter_request_success_base = self._counter_cls(
             name="vllm:request_success",
             documentation="Count of successfully processed requests.",
-            labelnames=labelnames + ["finished_reason"])
+            labelnames=labelnames + ["finished_reason"],
+        )
         for reason in FinishReason:
             self.counter_request_success[reason] = {
-                idx:
-                counter_request_success_base.labels(model_name, str(idx),
-                                                    str(reason))
+                idx: counter_request_success_base.labels(
+                    model_name, str(idx), str(reason)
+                )
                 for idx in engine_indexes
             }
 
@@ -317,18 +639,21 @@ class PrometheusStatLogger(StatLoggerBase):
             name="vllm:request_prompt_tokens",
             documentation="Number of prefill tokens processed.",
             buckets=build_1_2_5_buckets(max_model_len),
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_num_prompt_tokens_request = make_per_engine(
-            histogram_num_prompt_tokens_request, engine_indexes, model_name)
+            histogram_num_prompt_tokens_request, engine_indexes, model_name
+        )
 
         histogram_num_generation_tokens_request = self._histogram_cls(
             name="vllm:request_generation_tokens",
             documentation="Number of generation tokens processed.",
             buckets=build_1_2_5_buckets(max_model_len),
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_num_generation_tokens_request = make_per_engine(
-            histogram_num_generation_tokens_request, engine_indexes,
-            model_name)
+            histogram_num_generation_tokens_request, engine_indexes, model_name
+        )
 
         # TODO: This metric might be incorrect in case of using multiple
         # api_server counts which uses prometheus mp.
@@ -336,38 +661,42 @@ class PrometheusStatLogger(StatLoggerBase):
         histogram_iteration_tokens = self._histogram_cls(
             name="vllm:iteration_tokens_total",
             documentation="Histogram of number of tokens per engine_step.",
-            buckets=[
-                1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384
-            ],
-            labelnames=labelnames)
+            buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+            labelnames=labelnames,
+        )
         self.histogram_iteration_tokens = make_per_engine(
-            histogram_iteration_tokens, engine_indexes, model_name)
+            histogram_iteration_tokens, engine_indexes, model_name
+        )
 
         histogram_max_num_generation_tokens_request = self._histogram_cls(
             name="vllm:request_max_num_generation_tokens",
-            documentation=
-            "Histogram of maximum number of requested generation tokens.",
+            documentation="Histogram of maximum number of requested generation tokens.",
             buckets=build_1_2_5_buckets(max_model_len),
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_max_num_generation_tokens_request = make_per_engine(
-            histogram_max_num_generation_tokens_request, engine_indexes,
-            model_name)
+            histogram_max_num_generation_tokens_request, engine_indexes, model_name
+        )
 
         histogram_n_request = self._histogram_cls(
             name="vllm:request_params_n",
             documentation="Histogram of the n request parameter.",
             buckets=[1, 2, 5, 10, 20],
-            labelnames=labelnames)
-        self.histogram_n_request = make_per_engine(histogram_n_request,
-                                                   engine_indexes, model_name)
+            labelnames=labelnames,
+        )
+        self.histogram_n_request = make_per_engine(
+            histogram_n_request, engine_indexes, model_name
+        )
 
         histogram_max_tokens_request = self._histogram_cls(
             name="vllm:request_params_max_tokens",
             documentation="Histogram of the max_tokens request parameter.",
             buckets=build_1_2_5_buckets(max_model_len),
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_max_tokens_request = make_per_engine(
-            histogram_max_tokens_request, engine_indexes, model_name)
+            histogram_max_tokens_request, engine_indexes, model_name
+        )
 
         #
         # Histogram of timing intervals
@@ -376,13 +705,34 @@ class PrometheusStatLogger(StatLoggerBase):
             name="vllm:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
             buckets=[
-                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
-                2560.0
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                20.0,
+                40.0,
+                80.0,
+                160.0,
+                640.0,
+                2560.0,
             ],
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_time_to_first_token = make_per_engine(
-            histogram_time_to_first_token, engine_indexes, model_name)
+            histogram_time_to_first_token, engine_indexes, model_name
+        )
 
         # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
         # TODO: in 0.12, only enable if show_hidden_metrics=True
@@ -390,86 +740,167 @@ class PrometheusStatLogger(StatLoggerBase):
             name="vllm:time_per_output_token_seconds",
             documentation=(
                 "Histogram of time per output token in seconds."
-                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."
+            ),
             buckets=[
-                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+                0.01,
+                0.025,
+                0.05,
+                0.075,
+                0.1,
+                0.15,
+                0.2,
+                0.3,
+                0.4,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                20.0,
+                40.0,
+                80.0,
             ],
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_time_per_output_token = make_per_engine(
-            histogram_time_per_output_token, engine_indexes, model_name)
+            histogram_time_per_output_token, engine_indexes, model_name
+        )
 
         histogram_inter_token_latency = self._histogram_cls(
             name="vllm:inter_token_latency_seconds",
             documentation="Histogram of inter-token latency in seconds.",
             buckets=[
-                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+                0.01,
+                0.025,
+                0.05,
+                0.075,
+                0.1,
+                0.15,
+                0.2,
+                0.3,
+                0.4,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                20.0,
+                40.0,
+                80.0,
             ],
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_inter_token_latency = make_per_engine(
-            histogram_inter_token_latency, engine_indexes, model_name)
+            histogram_inter_token_latency, engine_indexes, model_name
+        )
 
         histogram_request_time_per_output_token = self._histogram_cls(
             name="vllm:request_time_per_output_token_seconds",
-            documentation=
-            "Histogram of time_per_output_token_seconds per request.",
+            documentation="Histogram of time_per_output_token_seconds per request.",
             buckets=[
-                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+                0.01,
+                0.025,
+                0.05,
+                0.075,
+                0.1,
+                0.15,
+                0.2,
+                0.3,
+                0.4,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                20.0,
+                40.0,
+                80.0,
             ],
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_request_time_per_output_token = make_per_engine(
-            histogram_request_time_per_output_token, engine_indexes,
-            model_name)
+            histogram_request_time_per_output_token, engine_indexes, model_name
+        )
 
         request_latency_buckets = [
-            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
-            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
+            0.3,
+            0.5,
+            0.8,
+            1.0,
+            1.5,
+            2.0,
+            2.5,
+            5.0,
+            10.0,
+            15.0,
+            20.0,
+            30.0,
+            40.0,
+            50.0,
+            60.0,
+            120.0,
+            240.0,
+            480.0,
+            960.0,
+            1920.0,
+            7680.0,
         ]
         histogram_e2e_time_request = self._histogram_cls(
             name="vllm:e2e_request_latency_seconds",
             documentation="Histogram of e2e request latency in seconds.",
             buckets=request_latency_buckets,
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_e2e_time_request = make_per_engine(
-            histogram_e2e_time_request, engine_indexes, model_name)
+            histogram_e2e_time_request, engine_indexes, model_name
+        )
 
         histogram_queue_time_request = self._histogram_cls(
             name="vllm:request_queue_time_seconds",
-            documentation=
-            "Histogram of time spent in WAITING phase for request.",
+            documentation="Histogram of time spent in WAITING phase for request.",
             buckets=request_latency_buckets,
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_queue_time_request = make_per_engine(
-            histogram_queue_time_request, engine_indexes, model_name)
+            histogram_queue_time_request, engine_indexes, model_name
+        )
 
         histogram_inference_time_request = self._histogram_cls(
             name="vllm:request_inference_time_seconds",
-            documentation=
-            "Histogram of time spent in RUNNING phase for request.",
+            documentation="Histogram of time spent in RUNNING phase for request.",
             buckets=request_latency_buckets,
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_inference_time_request = make_per_engine(
-            histogram_inference_time_request, engine_indexes, model_name)
+            histogram_inference_time_request, engine_indexes, model_name
+        )
 
         histogram_prefill_time_request = self._histogram_cls(
             name="vllm:request_prefill_time_seconds",
-            documentation=
-            "Histogram of time spent in PREFILL phase for request.",
+            documentation="Histogram of time spent in PREFILL phase for request.",
             buckets=request_latency_buckets,
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_prefill_time_request = make_per_engine(
-            histogram_prefill_time_request, engine_indexes, model_name)
+            histogram_prefill_time_request, engine_indexes, model_name
+        )
 
         histogram_decode_time_request = self._histogram_cls(
             name="vllm:request_decode_time_seconds",
-            documentation=
-            "Histogram of time spent in DECODE phase for request.",
+            documentation="Histogram of time spent in DECODE phase for request.",
             buckets=request_latency_buckets,
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_decode_time_request = make_per_engine(
-            histogram_decode_time_request, engine_indexes, model_name)
+            histogram_decode_time_request, engine_indexes, model_name
+        )
 
         #
         # LoRA metrics
@@ -477,26 +908,24 @@ class PrometheusStatLogger(StatLoggerBase):
 
         # TODO: This metric might be incorrect in case of using multiple
         # api_server counts which uses prometheus mp.
-        self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
+        self.gauge_lora_info: Gauge | None = None
         if vllm_config.lora_config is not None:
             if len(self.engine_indexes) > 1:
-                raise NotImplementedError(
-                    "LoRA in DP mode is not supported yet.")
+                raise NotImplementedError("LoRA in DP mode is not supported yet.")
             self.labelname_max_lora = "max_lora"
             self.labelname_waiting_lora_adapters = "waiting_lora_adapters"
             self.labelname_running_lora_adapters = "running_lora_adapters"
             self.max_lora = vllm_config.lora_config.max_loras
-            self.gauge_lora_info = \
-                self._gauge_cls(
-                    name="vllm:lora_requests_info",
-                    documentation="Running stats on lora requests.",
-                    multiprocess_mode="sum",
-                    labelnames=[
-                        self.labelname_max_lora,
-                        self.labelname_waiting_lora_adapters,
-                        self.labelname_running_lora_adapters,
-                    ],
-                )
+            self.gauge_lora_info = self._gauge_cls(
+                name="vllm:lora_requests_info",
+                documentation="Running stats on lora requests.",
+                multiprocess_mode="sum",
+                labelnames=[
+                    self.labelname_max_lora,
+                    self.labelname_waiting_lora_adapters,
+                    self.labelname_running_lora_adapters,
+                ],
+            )
 
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
         metrics_info = config_obj.metrics_info()
@@ -522,54 +951,99 @@ class PrometheusStatLogger(StatLoggerBase):
             metrics_info["engine"] = str(engine_index)
             info_gauge.labels(**metrics_info).set(1)
 
-    def record(self,
-               scheduler_stats: Optional[SchedulerStats],
-               iteration_stats: Optional[IterationStats],
-               engine_idx: int = 0):
+    def record(
+        self,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int = 0,
+    ):
         """Log to prometheus."""
         if scheduler_stats is not None:
             self.gauge_scheduler_running[engine_idx].set(
-                scheduler_stats.num_running_reqs)
+                scheduler_stats.num_running_reqs
+            )
             self.gauge_scheduler_waiting[engine_idx].set(
-                scheduler_stats.num_waiting_reqs)
-
+                scheduler_stats.num_waiting_reqs
+            )
             if self.show_hidden_metrics:
                 self.gauge_gpu_cache_usage[engine_idx].set(
-                    scheduler_stats.kv_cache_usage)
-            self.gauge_kv_cache_usage[engine_idx].set(
-                scheduler_stats.kv_cache_usage)
+                    scheduler_stats.kv_cache_usage
+                )
+            self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
 
             if self.show_hidden_metrics:
                 self.counter_gpu_prefix_cache_queries[engine_idx].inc(
-                    scheduler_stats.prefix_cache_stats.queries)
+                    scheduler_stats.prefix_cache_stats.queries
+                )
                 self.counter_gpu_prefix_cache_hits[engine_idx].inc(
-                    scheduler_stats.prefix_cache_stats.hits)
+                    scheduler_stats.prefix_cache_stats.hits
+                )
 
             self.counter_prefix_cache_queries[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.queries)
+                scheduler_stats.prefix_cache_stats.queries
+            )
             self.counter_prefix_cache_hits[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.hits)
+                scheduler_stats.prefix_cache_stats.hits
+            )
+
+            if scheduler_stats.connector_prefix_cache_stats is not None:
+                self.counter_connector_prefix_cache_queries[engine_idx].inc(
+                    scheduler_stats.connector_prefix_cache_stats.queries
+                )
+                self.counter_connector_prefix_cache_hits[engine_idx].inc(
+                    scheduler_stats.connector_prefix_cache_stats.hits
+                )
 
             if scheduler_stats.spec_decoding_stats is not None:
                 self.spec_decoding_prom.observe(
-                    scheduler_stats.spec_decoding_stats, engine_idx)
+                    scheduler_stats.spec_decoding_stats, engine_idx
+                )
+
+            if scheduler_stats.kv_connector_stats is not None:
+                self.kv_connector_prom.observe(
+                    scheduler_stats.kv_connector_stats, engine_idx
+                )
+
+            if self.gauge_lora_info is not None:
+                running_lora_adapters = ",".join(
+                    scheduler_stats.running_lora_adapters.keys()
+                )
+                waiting_lora_adapters = ",".join(
+                    scheduler_stats.waiting_lora_adapters.keys()
+                )
+                lora_info_labels = {
+                    self.labelname_running_lora_adapters: running_lora_adapters,
+                    self.labelname_waiting_lora_adapters: waiting_lora_adapters,
+                    self.labelname_max_lora: self.max_lora,
+                }
+                self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
+
+        if mm_cache_stats is not None:
+            self.counter_mm_cache_queries[engine_idx].inc(mm_cache_stats.queries)
+            self.counter_mm_cache_hits[engine_idx].inc(mm_cache_stats.hits)
 
         if iteration_stats is None:
             return
-
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            self.counter_corrupted_requests[engine_idx].inc(
+                iteration_stats.num_corrupted_reqs
+            )
         self.counter_num_preempted_reqs[engine_idx].inc(
-            iteration_stats.num_preempted_reqs)
-        self.counter_prompt_tokens[engine_idx].inc(
-            iteration_stats.num_prompt_tokens)
+            iteration_stats.num_preempted_reqs
+        )
+        self.counter_prompt_tokens[engine_idx].inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens[engine_idx].inc(
-            iteration_stats.num_generation_tokens)
+            iteration_stats.num_generation_tokens
+        )
         self.histogram_iteration_tokens[engine_idx].observe(
-            iteration_stats.num_prompt_tokens + \
-            iteration_stats.num_generation_tokens)
+            iteration_stats.num_prompt_tokens + iteration_stats.num_generation_tokens
+        )
 
         for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter:
-            self.histogram_max_num_generation_tokens_request[
-                engine_idx].observe(max_gen_tokens)
+            self.histogram_max_num_generation_tokens_request[engine_idx].observe(
+                max_gen_tokens
+            )
         for n_param in iteration_stats.n_params_iter:
             self.histogram_n_request[engine_idx].observe(n_param)
         for ttft in iteration_stats.time_to_first_tokens_iter:
@@ -579,54 +1053,67 @@ class PrometheusStatLogger(StatLoggerBase):
             self.histogram_time_per_output_token[engine_idx].observe(itl)
 
         for finished_request in iteration_stats.finished_requests:
-            self.counter_request_success[
-                finished_request.finish_reason][engine_idx].inc()
+            self.counter_request_success[finished_request.finish_reason][
+                engine_idx
+            ].inc()
             self.histogram_e2e_time_request[engine_idx].observe(
-                finished_request.e2e_latency)
+                finished_request.e2e_latency
+            )
             self.histogram_queue_time_request[engine_idx].observe(
-                finished_request.queued_time)
+                finished_request.queued_time
+            )
             self.histogram_prefill_time_request[engine_idx].observe(
-                finished_request.prefill_time)
+                finished_request.prefill_time
+            )
             self.histogram_inference_time_request[engine_idx].observe(
-                finished_request.inference_time)
+                finished_request.inference_time
+            )
             self.histogram_decode_time_request[engine_idx].observe(
-                finished_request.decode_time)
+                finished_request.decode_time
+            )
             self.histogram_num_prompt_tokens_request[engine_idx].observe(
-                finished_request.num_prompt_tokens)
+                finished_request.num_prompt_tokens
+            )
             self.histogram_num_generation_tokens_request[engine_idx].observe(
-                finished_request.num_generation_tokens)
+                finished_request.num_generation_tokens
+            )
             self.histogram_request_time_per_output_token[engine_idx].observe(
-                finished_request.mean_time_per_output_token)
+                finished_request.mean_time_per_output_token
+            )
             if finished_request.max_tokens_param:
                 self.histogram_max_tokens_request[engine_idx].observe(
-                    finished_request.max_tokens_param)
-
-        if self.gauge_lora_info is not None:
-            running_lora_adapters = \
-                ",".join(iteration_stats.running_lora_adapters.keys())
-            waiting_lora_adapters = \
-                ",".join(iteration_stats.waiting_lora_adapters.keys())
-            lora_info_labels = {
-                self.labelname_running_lora_adapters: running_lora_adapters,
-                self.labelname_waiting_lora_adapters: waiting_lora_adapters,
-                self.labelname_max_lora: self.max_lora,
-            }
-            self.gauge_lora_info.labels(**lora_info_labels)\
-                                .set_to_current_time()
+                    finished_request.max_tokens_param
+                )
+
+    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        awake = 1
+        discard_all = 0
+        weights_offloaded = 0
+
+        if sleep == 1:
+            awake = 0
+            if level == 1:
+                weights_offloaded = 1
+            elif level == 2:
+                discard_all = 1
+
+        for engine_idx in self.engine_indexes:
+            self.gauge_engine_sleep_state["discard_all"][engine_idx].set(discard_all)
+            self.gauge_engine_sleep_state["weights_offloaded"][engine_idx].set(
+                weights_offloaded
+            )
+            self.gauge_engine_sleep_state["awake"][engine_idx].set(awake)
 
     def log_engine_initialized(self):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)
 
 
-PromMetric = Union[
-    prometheus_client.Gauge,
-    prometheus_client.Counter,
-    prometheus_client.Histogram,
-]
+PromMetric: TypeAlias = Gauge | Counter | Histogram
 
 
-def make_per_engine(metric: PromMetric, engine_idxs: list[int],
-                    model_name: str) -> dict[int, PromMetric]:
+def make_per_engine(
+    metric: PromMetric, engine_idxs: list[int], model_name: str
+) -> dict[int, PromMetric]:
     return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs}
 
 
@@ -673,69 +1160,79 @@ class StatLoggerManager:
     def __init__(
         self,
         vllm_config: VllmConfig,
-        engine_idxs: Optional[list[int]] = None,
-        custom_stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        engine_idxs: list[int] | None = None,
+        custom_stat_loggers: list[StatLoggerFactory] | None = None,
         enable_default_loggers: bool = True,
+        aggregate_engine_logging: bool = False,
         client_count: int = 1,
     ):
-        self.engine_idxs = engine_idxs if engine_idxs else [0]
-
-        factories: list[StatLoggerFactory] = []
+        self.engine_indexes = engine_idxs if engine_idxs else [0]
+        self.stat_loggers: list[AggregateStatLoggerBase] = []
+        stat_logger_factories: list[StatLoggerFactory] = []
         if custom_stat_loggers is not None:
-            factories.extend(custom_stat_loggers)
-
+            stat_logger_factories.extend(custom_stat_loggers)
         if enable_default_loggers and logger.isEnabledFor(logging.INFO):
             if client_count > 1:
                 logger.warning(
                     "AsyncLLM created with api_server_count more than 1; "
-                    "disabling stats logging to avoid incomplete stats.")
+                    "disabling stats logging to avoid incomplete stats."
+                )
             else:
-                factories.append(LoggingStatLogger)
-
-        # engine_idx: StatLogger
-        self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {}
-        prometheus_factory = PrometheusStatLogger
-        for engine_idx in self.engine_idxs:
-            loggers: list[StatLoggerBase] = []
-            for logger_factory in factories:
-                # If we get a custom prometheus logger, use that
-                # instead. This is typically used for the ray case.
-                if (isinstance(logger_factory, type)
-                        and issubclass(logger_factory, PrometheusStatLogger)):
-                    prometheus_factory = logger_factory
-                    continue
-                loggers.append(logger_factory(vllm_config,
-                                              engine_idx))  # type: ignore
-            self.per_engine_logger_dict[engine_idx] = loggers
-
-        # For Prometheus, need to share the metrics between EngineCores.
-        # Each EngineCore's metrics are expressed as a unique label.
-        self.prometheus_logger = prometheus_factory(vllm_config, engine_idxs)
+                default_logger_factory = (
+                    AggregatedLoggingStatLogger
+                    if aggregate_engine_logging
+                    else LoggingStatLogger
+                )
+                stat_logger_factories.append(default_logger_factory)
+        custom_prometheus_logger: bool = False
+        for stat_logger_factory in stat_logger_factories:
+            if isinstance(stat_logger_factory, type) and issubclass(
+                stat_logger_factory, AggregateStatLoggerBase
+            ):
+                global_stat_logger = stat_logger_factory(
+                    vllm_config=vllm_config,
+                    engine_indexes=self.engine_indexes,
+                )
+                if isinstance(global_stat_logger, PrometheusStatLogger):
+                    custom_prometheus_logger = True
+            else:
+                # per engine logger
+                global_stat_logger = PerEngineStatLoggerAdapter(
+                    vllm_config=vllm_config,
+                    engine_indexes=self.engine_indexes,
+                    per_engine_stat_logger_factory=stat_logger_factory,  # type: ignore[arg-type]
+                )
+            self.stat_loggers.append(global_stat_logger)
+        if not custom_prometheus_logger:
+            self.stat_loggers.append(
+                PrometheusStatLogger(vllm_config, self.engine_indexes)
+            )
 
     def record(
         self,
-        scheduler_stats: Optional[SchedulerStats],
-        iteration_stats: Optional[IterationStats],
-        engine_idx: Optional[int] = None,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int | None = None,
     ):
         if engine_idx is None:
             engine_idx = 0
-
-        per_engine_loggers = self.per_engine_logger_dict[engine_idx]
-        for logger in per_engine_loggers:
-            logger.record(scheduler_stats, iteration_stats, engine_idx)
-
-        self.prometheus_logger.record(scheduler_stats, iteration_stats,
-                                      engine_idx)
+        for logger in self.stat_loggers:
+            logger.record(
+                scheduler_stats,
+                iteration_stats,
+                mm_cache_stats=mm_cache_stats,
+                engine_idx=engine_idx,
+            )
+
+    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        for logger in self.stat_loggers:
+            logger.record_sleep_state(sleep, level)
 
     def log(self):
-        for per_engine_loggers in self.per_engine_logger_dict.values():
-            for logger in per_engine_loggers:
-                logger.log()
+        for logger in self.stat_loggers:
+            logger.log()
 
     def log_engine_initialized(self):
-        self.prometheus_logger.log_engine_initialized()
-
-        for per_engine_loggers in self.per_engine_logger_dict.values():
-            for logger in per_engine_loggers:
-                logger.log_engine_initialized()
+        for agg_logger in self.stat_loggers:
+            agg_logger.log_engine_initialized()
diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py
index a43cf9ce255e6c00ca82cca0de12ba01a197929b..1eacb785aa843e3969015302f7b8d1d3ab49b5a1 100644
--- a/vllm/v1/metrics/prometheus.py
+++ b/vllm/v1/metrics/prometheus.py
@@ -3,7 +3,6 @@
 
 import os
 import tempfile
-from typing import Optional
 
 from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
 
@@ -12,13 +11,11 @@ from vllm.logger import init_logger
 logger = init_logger(__name__)
 
 # Global temporary directory for prometheus multiprocessing
-_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None
+_prometheus_multiproc_dir: tempfile.TemporaryDirectory | None = None
 
 
 def setup_multiprocess_prometheus():
-    """Set up prometheus multiprocessing directory if not already configured.
-    
-    """
+    """Set up prometheus multiprocessing directory if not already configured."""
     global _prometheus_multiproc_dir
 
     if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
@@ -27,19 +24,22 @@ def setup_multiprocess_prometheus():
         # cleaned up upon exit.
         _prometheus_multiproc_dir = tempfile.TemporaryDirectory()
         os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
-        logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s",
-                     _prometheus_multiproc_dir.name)
+        logger.debug(
+            "Created PROMETHEUS_MULTIPROC_DIR at %s", _prometheus_multiproc_dir.name
+        )
     else:
-        logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. "
-                       "This directory must be wiped between vLLM runs or "
-                       "you will find inaccurate metrics. Unset the variable "
-                       "and vLLM will properly handle cleanup.")
+        logger.warning(
+            "Found PROMETHEUS_MULTIPROC_DIR was set by user. "
+            "This directory must be wiped between vLLM runs or "
+            "you will find inaccurate metrics. Unset the variable "
+            "and vLLM will properly handle cleanup."
+        )
 
 
 def get_prometheus_registry() -> CollectorRegistry:
-    """Get the appropriate prometheus registry based on multiprocessing 
+    """Get the appropriate prometheus registry based on multiprocessing
     configuration.
-    
+
     Returns:
         Registry: A prometheus registry
     """
@@ -54,11 +54,11 @@ def get_prometheus_registry() -> CollectorRegistry:
 
 def unregister_vllm_metrics():
     """Unregister any existing vLLM collectors from the prometheus registry.
-    
+
     This is useful for testing and CI/CD where metrics may be registered
     multiple times across test runs.
-    
-    Also, in case of multiprocess, we need to unregister the metrics from the 
+
+    Also, in case of multiprocess, we need to unregister the metrics from the
     global registry.
     """
     registry = REGISTRY
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index 609185753860915337d514d889a04e2f14858834..a319ffb1d2573b86250000c561f59bbd337b087c 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
-from typing import Optional, Union
 
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
 from vllm.v1.metrics.loggers import PrometheusStatLogger
 from vllm.v1.spec_decode.metrics import SpecDecodingProm
 
@@ -15,11 +15,9 @@ import regex as re
 
 
 class RayPrometheusMetric:
-
     def __init__(self):
         if ray_metrics is None:
-            raise ImportError(
-                "RayPrometheusMetric requires Ray to be installed.")
+            raise ImportError("RayPrometheusMetric requires Ray to be installed.")
 
         self.metric: Metric = None
 
@@ -38,15 +36,14 @@ class RayPrometheusMetric:
                     f"Expected {len(self.metric._tag_keys)}, got {len(labels)}"
                 )
 
-            self.metric.set_default_tags(
-                dict(zip(self.metric._tag_keys, labels)))
+            self.metric.set_default_tags(dict(zip(self.metric._tag_keys, labels)))
 
         return self
 
     @staticmethod
     def _get_sanitized_opentelemetry_name(name: str) -> str:
         """
-        For compatibility with Ray + OpenTelemetry, the metric name must be 
+        For compatibility with Ray + OpenTelemetry, the metric name must be
         sanitized. In particular, this replaces disallowed character (e.g., ':')
         with '_' in the metric name.
         Allowed characters: a-z, A-Z, 0-9, _
@@ -63,23 +60,24 @@ class RayGaugeWrapper(RayPrometheusMetric):
     """Wraps around ray.util.metrics.Gauge to provide same API as
     prometheus_client.Gauge"""
 
-    def __init__(self,
-                 name: str,
-                 documentation: Optional[str] = "",
-                 labelnames: Optional[list[str]] = None,
-                 multiprocess_mode: Optional[str] = ""):
-
+    def __init__(
+        self,
+        name: str,
+        documentation: str | None = "",
+        labelnames: list[str] | None = None,
+        multiprocess_mode: str | None = "",
+    ):
         # All Ray metrics are keyed by WorkerId, so multiprocess modes like
         # "mostrecent", "all", "sum" do not apply. This logic can be manually
         # implemented at the observability layer (Prometheus/Grafana).
         del multiprocess_mode
         labelnames_tuple = tuple(labelnames) if labelnames else None
         name = self._get_sanitized_opentelemetry_name(name)
-        self.metric = ray_metrics.Gauge(name=name,
-                                        description=documentation,
-                                        tag_keys=labelnames_tuple)
+        self.metric = ray_metrics.Gauge(
+            name=name, description=documentation, tag_keys=labelnames_tuple
+        )
 
-    def set(self, value: Union[int, float]):
+    def set(self, value: int | float):
         return self.metric.set(value)
 
     def set_to_current_time(self):
@@ -91,17 +89,19 @@ class RayCounterWrapper(RayPrometheusMetric):
     """Wraps around ray.util.metrics.Counter to provide same API as
     prometheus_client.Counter"""
 
-    def __init__(self,
-                 name: str,
-                 documentation: Optional[str] = "",
-                 labelnames: Optional[list[str]] = None):
+    def __init__(
+        self,
+        name: str,
+        documentation: str | None = "",
+        labelnames: list[str] | None = None,
+    ):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         name = self._get_sanitized_opentelemetry_name(name)
-        self.metric = ray_metrics.Counter(name=name,
-                                          description=documentation,
-                                          tag_keys=labelnames_tuple)
+        self.metric = ray_metrics.Counter(
+            name=name, description=documentation, tag_keys=labelnames_tuple
+        )
 
-    def inc(self, value: Union[int, float] = 1.0):
+    def inc(self, value: int | float = 1.0):
         if value == 0:
             return
         return self.metric.inc(value)
@@ -111,20 +111,24 @@ class RayHistogramWrapper(RayPrometheusMetric):
     """Wraps around ray.util.metrics.Histogram to provide same API as
     prometheus_client.Histogram"""
 
-    def __init__(self,
-                 name: str,
-                 documentation: Optional[str] = "",
-                 labelnames: Optional[list[str]] = None,
-                 buckets: Optional[list[float]] = None):
+    def __init__(
+        self,
+        name: str,
+        documentation: str | None = "",
+        labelnames: list[str] | None = None,
+        buckets: list[float] | None = None,
+    ):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         name = self._get_sanitized_opentelemetry_name(name)
         boundaries = buckets if buckets else []
-        self.metric = ray_metrics.Histogram(name=name,
-                                            description=documentation,
-                                            tag_keys=labelnames_tuple,
-                                            boundaries=boundaries)
-
-    def observe(self, value: Union[int, float]):
+        self.metric = ray_metrics.Histogram(
+            name=name,
+            description=documentation,
+            tag_keys=labelnames_tuple,
+            boundaries=boundaries,
+        )
+
+    def observe(self, value: int | float):
         return self.metric.observe(value)
 
 
@@ -138,6 +142,18 @@ class RaySpecDecodingProm(SpecDecodingProm):
     _counter_cls = RayCounterWrapper
 
 
+class RayKVConnectorPrometheus(KVConnectorPrometheus):
+    """
+    RayKVConnectorPrometheus is used by RayMetrics to log Ray
+    metrics. Provides the same metrics as KV connectors but
+    uses Ray's util.metrics library.
+    """
+
+    _gauge_cls = RayGaugeWrapper
+    _counter_cls = RayCounterWrapper
+    _histogram_cls = RayHistogramWrapper
+
+
 class RayPrometheusStatLogger(PrometheusStatLogger):
     """RayPrometheusStatLogger uses Ray metrics instead."""
 
@@ -145,6 +161,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
     _counter_cls = RayCounterWrapper
     _histogram_cls = RayHistogramWrapper
     _spec_decoding_cls = RaySpecDecodingProm
+    _kv_connector_cls = RayKVConnectorPrometheus
 
     @staticmethod
     def _unregister_vllm_metrics():
diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py
index 4d6e59984154115a1ad363cab0991ee957eaafed..48c88e5b61cb9f923358230730f198b57262d112 100644
--- a/vllm/v1/metrics/reader.py
+++ b/vllm/v1/metrics/reader.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 from prometheus_client import REGISTRY
 from prometheus_client import Metric as PromMetric
@@ -17,6 +16,7 @@ class Metric:
     in some cases a single vLLM instance may have multiple
     metrics with the same name but different sets of labels.
     """
+
     name: str
     labels: dict[str, str]
 
@@ -24,6 +24,7 @@ class Metric:
 @dataclass
 class Counter(Metric):
     """A monotonically increasing integer counter."""
+
     value: int
 
 
@@ -34,12 +35,14 @@ class Vector(Metric):
     This type - which doesn't exist in Prometheus - models one very
     specific metric, vllm:spec_decode_num_accepted_tokens_per_pos.
     """
+
     values: list[int]
 
 
 @dataclass
 class Gauge(Metric):
     """A numerical value that can go up or down."""
+
     value: float
 
 
@@ -58,6 +61,7 @@ class Histogram(Metric):
     The sum property is the total sum of all observed
     values.
     """
+
     count: int
     sum: float
     buckets: dict[str, int]
@@ -87,7 +91,8 @@ def get_metrics_snapshot() -> list[Metric]:
             samples = _get_samples(metric)
             for s in samples:
                 collected.append(
-                    Gauge(name=metric.name, labels=s.labels, value=s.value))
+                    Gauge(name=metric.name, labels=s.labels, value=s.value)
+                )
         elif metric.type == "counter":
             samples = _get_samples(metric, "_total")
             if metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
@@ -99,16 +104,15 @@ def get_metrics_snapshot() -> list[Metric]:
                 # accepted tokens using a Counter labeled with 'position'.
                 # We convert these into a vector of integer values.
                 #
-                for labels, values in _digest_num_accepted_by_pos_samples(
-                        samples):
+                for labels, values in _digest_num_accepted_by_pos_samples(samples):
                     collected.append(
-                        Vector(name=metric.name, labels=labels, values=values))
+                        Vector(name=metric.name, labels=labels, values=values)
+                    )
             else:
                 for s in samples:
                     collected.append(
-                        Counter(name=metric.name,
-                                labels=s.labels,
-                                value=int(s.value)))
+                        Counter(name=metric.name, labels=s.labels, value=int(s.value))
+                    )
 
         elif metric.type == "histogram":
             #
@@ -122,21 +126,24 @@ def get_metrics_snapshot() -> list[Metric]:
             count_samples = _get_samples(metric, "_count")
             sum_samples = _get_samples(metric, "_sum")
             for labels, buckets, count_value, sum_value in _digest_histogram(
-                    bucket_samples, count_samples, sum_samples):
+                bucket_samples, count_samples, sum_samples
+            ):
                 collected.append(
-                    Histogram(name=metric.name,
-                              labels=labels,
-                              buckets=buckets,
-                              count=count_value,
-                              sum=sum_value))
+                    Histogram(
+                        name=metric.name,
+                        labels=labels,
+                        buckets=buckets,
+                        count=count_value,
+                        sum=sum_value,
+                    )
+                )
         else:
             raise AssertionError(f"Unknown metric type {metric.type}")
 
     return collected
 
 
-def _get_samples(metric: PromMetric,
-                 suffix: Optional[str] = None) -> list[Sample]:
+def _get_samples(metric: PromMetric, suffix: str | None = None) -> list[Sample]:
     name = (metric.name + suffix) if suffix is not None else metric.name
     return [s for s in metric.samples if s.name == name]
 
@@ -148,8 +155,7 @@ def _strip_label(labels: dict[str, str], key_to_remove: str) -> dict[str, str]:
 
 
 def _digest_histogram(
-    bucket_samples: list[Sample], count_samples: list[Sample],
-    sum_samples: list[Sample]
+    bucket_samples: list[Sample], count_samples: list[Sample], sum_samples: list[Sample]
 ) -> list[tuple[dict[str, str], dict[str, int], int, float]]:
     #
     # In the case of DP, we have an indigestable
@@ -192,20 +198,25 @@ def _digest_histogram(
         labels_key = frozenset(s.labels.items())
         sums_by_labels[labels_key] = s.value
 
-    assert set(buckets_by_labels.keys()) == set(
-        counts_by_labels.keys()) == set(sums_by_labels.keys())
+    assert (
+        set(buckets_by_labels.keys())
+        == set(counts_by_labels.keys())
+        == set(sums_by_labels.keys())
+    )
 
     output = []
     label_keys = list(buckets_by_labels.keys())
     for k in label_keys:
         labels = dict(k)
-        output.append((labels, buckets_by_labels[k], counts_by_labels[k],
-                       sums_by_labels[k]))
+        output.append(
+            (labels, buckets_by_labels[k], counts_by_labels[k], sums_by_labels[k])
+        )
     return output
 
 
 def _digest_num_accepted_by_pos_samples(
-        samples: list[Sample]) -> list[tuple[dict[str, str], list[int]]]:
+    samples: list[Sample],
+) -> list[tuple[dict[str, str], list[int]]]:
     #
     # In the case of DP, we have an indigestable
     # per-position-per-engine count as a list of
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 296c39e8cdb5c7e91add77ee4b75ceb61aba08be..4e9db98db0bc2cd269f7cae421e17878d085a500 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -2,28 +2,152 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
+from collections import defaultdict, deque
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
+import vllm.envs as envs
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
 if TYPE_CHECKING:
     from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason
-    from vllm.v1.engine.output_processor import RequestState
 
 
 @dataclass
-class PrefixCacheStats:
-    """Stores prefix cache hit statistics."""
-    # Whether reset_prefix_cache was invoked.
+class BaseCacheStats:
+    """Stores cache hit statistics."""
+
     reset: bool = False
-    # The number of requests in this update.
+    """Whether the cache was reset."""
+
     requests: int = 0
-    # The number of queries in these requests. Note that "queries" here
-    # means the number of tokens that were queried from the cache.
+    """The number of requests in this update."""
+
     queries: int = 0
-    # The number of hits in these requests.
+    """The number of queries in these requests."""
+
     hits: int = 0
+    """The number of hits in these requests."""
+
+
+class CachingMetrics:
+    """Metrics for caching with a hit rate of the most recent N requests.
+    Args:
+        interval: The number of the most recent requests to aggregate.
+            Defaults to 1000.
+    """
+
+    def __init__(self, max_recent_requests: int = 1000) -> None:
+        super().__init__()
+
+        self.max_recent_requests = max_recent_requests
+        # The current aggregated values.
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+
+        # A deque of (requests, queries, hits) for the most recent requests.
+        self.query_queue = deque[tuple[int, int, int]]()
+
+    def observe(self, stats: BaseCacheStats):
+        """Observe the prefix caching for a set of requests.
+
+        This function is called with information gathered when new requests
+        are being scheduled and are looking for computed blocks.
+
+        When there are more than `max_recent_requests` requests, the oldest set
+        of requests are removed from the metrics.
+
+        Args:
+            stats: The prefix cache stats.
+        """
+        # reset_prefix_cache was invoked before the current update.
+        # Reset the metrics before aggregating the current stats.
+        if stats.reset:
+            self.reset()
+
+        # DO NOT appending empty stats to avoid helpful info get kicked out
+        # due to sliding window.
+        if stats.requests == 0:
+            return
+
+        # Update the metrics.
+        self.query_queue.append((stats.requests, stats.queries, stats.hits))
+        self.aggregated_requests += stats.requests
+        self.aggregated_query_total += stats.queries
+        self.aggregated_query_hit += stats.hits
+
+        # Remove the oldest stats until number of requests does not exceed
+        # the limit.
+        # NOTE: We preserve the latest added stats regardless.
+        while (
+            len(self.query_queue) > 1
+            and self.aggregated_requests > self.max_recent_requests
+        ):
+            old_requests, old_queries, old_hits = self.query_queue.popleft()
+            self.aggregated_requests -= old_requests
+            self.aggregated_query_total -= old_queries
+            self.aggregated_query_hit -= old_hits
+
+    def reset(self):
+        """Reset the metrics."""
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        self.query_queue.clear()
+
+    @property
+    def empty(self) -> bool:
+        """Return true if no requests have been observed."""
+        return self.aggregated_requests == 0
+
+    @property
+    def hit_rate(self) -> float:
+        """Calculate the hit rate for the past N requests."""
+        if self.aggregated_query_total == 0:
+            return 0.0
+        return self.aggregated_query_hit / self.aggregated_query_total
+
+
+@dataclass
+class PrefixCacheStats(BaseCacheStats):
+    """
+    Stores prefix cache hit statistics.
+    - `reset`: Whether `reset_prefix_cache` was invoked.
+    - `queries`: Refers to the number of tokens that were queried.
+    """
+
+    preempted_requests: int = 0
+    """The number of previously preempted requests in this update."""
+
+    preempted_queries: int = 0
+    """The `queries` number for preempted requests."""
+
+    preempted_hits: int = 0
+    """The `hits` number for preempted requests."""
+
+    def record(self, num_tokens: int, num_hits: int, preempted: bool) -> None:
+        """Aggregate request information into the stats."""
+        if preempted:
+            # Previously preempted request
+            self.preempted_requests += 1
+            self.preempted_queries += num_tokens
+            self.preempted_hits += num_hits
+        else:
+            # New request
+            self.requests += 1
+            self.queries += num_tokens
+            self.hits += num_hits
+
+
+@dataclass
+class MultiModalCacheStats(BaseCacheStats):
+    """
+    Stores multi-modal cache hit statistics.
+    - `reset`: Whether `reset_mm_cache` was invoked.
+    - `queries`: Refers to the number of multi-modal data items
+      that were queried.
+    """
 
 
 @dataclass
@@ -39,19 +163,14 @@ class SchedulerStats:
 
     kv_cache_usage: float = 0.0
 
-    prefix_cache_stats: PrefixCacheStats = field(
-        default_factory=PrefixCacheStats)
+    prefix_cache_stats: PrefixCacheStats = field(default_factory=PrefixCacheStats)
+    connector_prefix_cache_stats: PrefixCacheStats | None = None
 
-    spec_decoding_stats: Optional[SpecDecodingStats] = None
-    kv_connector_stats: Optional[dict[str, Any]] = None
+    spec_decoding_stats: SpecDecodingStats | None = None
+    kv_connector_stats: dict[str, Any] | None = None
 
-    num_corrupted_reqs: int = 0
-
-
-@dataclass
-class LoRAStats:
-    waiting_requests: set[str] = field(default_factory=set)
-    running_requests: set[str] = field(default_factory=set)
+    waiting_lora_adapters: dict[str, int] = field(default_factory=dict)
+    running_lora_adapters: dict[str, int] = field(default_factory=dict)
 
 
 @dataclass
@@ -72,6 +191,9 @@ class RequestStateStats:
     # first token latency
     first_token_latency: float = 0.0
 
+    # Track if this request is corrupted (NaNs in logits)
+    is_corrupted: bool = False
+
 
 @dataclass
 class FinishedRequestStats:
@@ -81,12 +203,13 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
-    max_tokens_param: Optional[int] = None
+    max_tokens_param: int | None = None
     queued_time: float = 0.0
     prefill_time: float = 0.0
     inference_time: float = 0.0
     decode_time: float = 0.0
     mean_time_per_output_token: float = 0.0
+    is_corrupted: bool = False
 
 
 class IterationStats:
@@ -102,17 +225,26 @@ class IterationStats:
         self.n_params_iter: list[int] = []
         self.time_to_first_tokens_iter: list[float] = []
         self.inter_token_latencies_iter: list[float] = []
-        self.waiting_lora_adapters: dict[str, int] = {}
-        self.running_lora_adapters: dict[str, int] = {}
+        self.num_corrupted_reqs: int = 0
+
+    def __repr__(self) -> str:
+        field_to_value_str = ", ".join(f"{k}={v}" for k, v in vars(self).items())
+        return f"{self.__class__.__name__}({field_to_value_str})"
 
     def _time_since(self, start: float) -> float:
         """Calculate an interval relative to this iteration's timestamp."""
         return self.iteration_timestamp - start
 
-    def update_from_output(self, output: "EngineCoreOutput",
-                           engine_core_timestamp: float, is_prefilling: bool,
-                           prompt_len: int, req_stats: RequestStateStats,
-                           lora_stats: Optional[LoRAStats]):
+    def update_from_output(
+        self,
+        output: "EngineCoreOutput",
+        engine_core_timestamp: float,
+        is_prefilling: bool,
+        prompt_len: int,
+        req_stats: RequestStateStats,
+        lora_states: "LoRARequestStates",
+        lora_name: str | None,
+    ):
         num_new_generation_tokens = len(output.new_token_ids)
 
         self.num_generation_tokens += num_new_generation_tokens
@@ -125,10 +257,25 @@ class IterationStats:
 
         req_stats.num_generation_tokens += num_new_generation_tokens
 
+        # Track if this request is corrupted (only check once per request)
+        # Early exit if already marked as corrupted to avoid redundant checks
+        if (
+            envs.VLLM_COMPUTE_NANS_IN_LOGITS
+            and not req_stats.is_corrupted
+            and output.num_nans_in_logits > 0
+        ):
+            req_stats.is_corrupted = True
+
         # Process request-level engine core events
         if output.events is not None:
-            self.update_from_events(output.request_id, output.events,
-                                    is_prefilling, req_stats, lora_stats)
+            self.update_from_events(
+                output.request_id,
+                output.events,
+                is_prefilling,
+                req_stats,
+                lora_states,
+                lora_name,
+            )
 
         # Process the batch-level "new tokens" engine core event
         if is_prefilling:
@@ -139,28 +286,37 @@ class IterationStats:
 
         req_stats.last_token_ts = engine_core_timestamp
 
-    def update_from_events(self, req_id: str, events: list["EngineCoreEvent"],
-                           is_prefilling: bool, req_stats: RequestStateStats,
-                           lora_stats: Optional[LoRAStats]):
+    def update_from_events(
+        self,
+        req_id: str,
+        events: list["EngineCoreEvent"],
+        is_prefilling: bool,
+        req_stats: RequestStateStats,
+        lora_states: "LoRARequestStates",
+        lora_name: str | None,
+    ):
         # Avoid circular dependency
         from vllm.v1.engine import EngineCoreEventType
+
         for event in events:
             if event.type == EngineCoreEventType.QUEUED:
                 req_stats.queued_ts = event.timestamp
-                if lora_stats is not None:
-                    lora_stats.waiting_requests.add(req_id)
+                lora_states.request_waiting(req_id, lora_name)
             elif event.type == EngineCoreEventType.SCHEDULED:
                 if req_stats.scheduled_ts == 0.0:  # ignore preemptions
                     req_stats.scheduled_ts = event.timestamp
-                LoRARequestStates.scheduled_request(lora_stats, req_id)
+                lora_states.request_running(req_id, lora_name)
             elif event.type == EngineCoreEventType.PREEMPTED:
                 self.num_preempted_reqs += 1
-                LoRARequestStates.preempted_request(lora_stats, req_id)
-
-    def update_from_finished_request(self, finish_reason: "FinishReason",
-                                     num_prompt_tokens: int,
-                                     max_tokens_param: Optional[int],
-                                     req_stats: RequestStateStats):
+                lora_states.request_waiting(req_id, lora_name)
+
+    def update_from_finished_request(
+        self,
+        finish_reason: "FinishReason",
+        num_prompt_tokens: int,
+        max_tokens_param: int | None,
+        req_stats: RequestStateStats,
+    ):
         e2e_latency = self._time_since(req_stats.arrival_time)
 
         # Queued interval is from first QUEUED event to first SCHEDULED
@@ -179,79 +335,86 @@ class IterationStats:
         inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
 
         # Do not count the token generated by the prefill phase
-        mean_time_per_output_token = (decode_time /
-                                      (req_stats.num_generation_tokens - 1)
-                                      if req_stats.num_generation_tokens -
-                                      1 > 0 else 0)
-
-        finished_req = \
-            FinishedRequestStats(finish_reason=finish_reason,
-                                 e2e_latency=e2e_latency,
-                                 num_prompt_tokens=num_prompt_tokens,
-                                 num_generation_tokens=req_stats.num_generation_tokens,
-                                 max_tokens_param=max_tokens_param,
-                                 queued_time=queued_time,
-                                 prefill_time=prefill_time,
-                                 inference_time=inference_time,
-                                 decode_time=decode_time,
-                                 mean_time_per_output_token=mean_time_per_output_token)
+        mean_time_per_output_token = (
+            decode_time / (req_stats.num_generation_tokens - 1)
+            if req_stats.num_generation_tokens - 1 > 0
+            else 0
+        )
+
+        finished_req = FinishedRequestStats(
+            finish_reason=finish_reason,
+            e2e_latency=e2e_latency,
+            num_prompt_tokens=num_prompt_tokens,
+            num_generation_tokens=req_stats.num_generation_tokens,
+            max_tokens_param=max_tokens_param,
+            queued_time=queued_time,
+            prefill_time=prefill_time,
+            inference_time=inference_time,
+            decode_time=decode_time,
+            mean_time_per_output_token=mean_time_per_output_token,
+            is_corrupted=req_stats.is_corrupted,
+        )
         self.finished_requests.append(finished_req)
 
+        # Count corrupted requests when they finish (only once per request)
+        if req_stats.is_corrupted:
+            self.num_corrupted_reqs += 1
 
-class LoRARequestStates:
-    """Per-LoRA request state stats."""
+
+class LoRAStats:
+    """Tracks waiting and running request IDs for a single LoRA."""
 
     def __init__(self):
-        self.lora_name_to_stats: dict[str, LoRAStats] = {}
+        self.waiting: set[str] = set()
+        self.running: set[str] = set()
 
-    def get_stats(self, req_state: 'RequestState') -> Optional[LoRAStats]:
-        if req_state.lora_name is None:
-            return None
-        if req_state.lora_name not in self.lora_name_to_stats:
-            self.lora_name_to_stats[req_state.lora_name] = LoRAStats()
-        return self.lora_name_to_stats[req_state.lora_name]
+    def update(self, req_id: str, waiting: bool, running: bool):
+        assert not (waiting and running)
+        if waiting:
+            self.waiting.add(req_id)
+        else:
+            self.waiting.discard(req_id)
 
-    def add_request(self, req_state: 'RequestState'):
-        if (lora_stats := self.get_stats(req_state)) is not None:
-            lora_stats.waiting_requests.add(req_state.request_id)
+        if running:
+            self.running.add(req_id)
+        else:
+            self.running.discard(req_id)
 
-    def finish_request(self, req_state: 'RequestState'):
-        if req_state.lora_name is None:
-            return
-        lora_stats = self.lora_name_to_stats[req_state.lora_name]
-        lora_stats.running_requests.remove(req_state.request_id)
+    @property
+    def empty(self) -> bool:
+        return not (self.waiting or self.running)
 
-    def abort_request(self, req_state: 'RequestState'):
-        if req_state.lora_name is None:
-            return
-        lora_stats = self.lora_name_to_stats[req_state.lora_name]
-        lora_stats.waiting_requests.discard(req_state.request_id)
-        lora_stats.running_requests.discard(req_state.request_id)
-
-    # Break the pattern for this lifecycle methods so we can
-    # call this from IterationStats.update_from_events()
-    @staticmethod
-    def scheduled_request(lora_stats: Optional[LoRAStats], request_id: str):
-        if lora_stats is None:
-            return
-        lora_stats.waiting_requests.remove(request_id)
-        lora_stats.running_requests.add(request_id)
 
-    @staticmethod
-    def preempted_request(lora_stats: Optional[LoRAStats], request_id: str):
-        if lora_stats is None:
+class LoRARequestStates:
+    """A per-LoRA count of running and waiting requests."""
+
+    def __init__(self, log_stats: bool = False):
+        self.log_stats = log_stats
+        self.requests: defaultdict[str, LoRAStats] = defaultdict(LoRAStats)
+
+    def _request_update(
+        self, req_id: str, lora_name: str | None, waiting: bool, running: bool
+    ):
+        if not self.log_stats or lora_name is None:
             return
-        lora_stats.running_requests.remove(request_id)
-        lora_stats.waiting_requests.add(request_id)
 
-    def update_iteration_stats(self,
-                               iteration_stats: Optional[IterationStats]):
-        if iteration_stats is None:
+        lora_stats = self.requests[lora_name]
+        lora_stats.update(req_id, waiting, running)
+        if lora_stats.empty:
+            del self.requests[lora_name]
+
+    def request_waiting(self, req_id: str, lora_name: str | None):
+        self._request_update(req_id, lora_name, waiting=True, running=False)
+
+    def request_running(self, req_id: str, lora_name: str | None):
+        self._request_update(req_id, lora_name, waiting=False, running=True)
+
+    def request_finished(self, req_id: str, lora_name: str | None):
+        self._request_update(req_id, lora_name, waiting=False, running=False)
+
+    def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
+        if not self.log_stats or scheduler_stats is None:
             return
-        for lora_name, stats in self.lora_name_to_stats.items():
-            if stats.waiting_requests:
-                iteration_stats.waiting_lora_adapters[lora_name] = \
-                    len(stats.waiting_requests)
-            if stats.running_requests:
-                iteration_stats.running_lora_adapters[lora_name] = \
-                    len(stats.running_requests)
+        for lora_name, stats in self.requests.items():
+            scheduler_stats.waiting_lora_adapters[lora_name] = len(stats.waiting)
+            scheduler_stats.running_lora_adapters[lora_name] = len(stats.running)
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index 01f3676abd9260a197b08aa60421cae39d45d580..c0b2835c3124c367f3fb1abcb88dad3341b9162b 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -2,62 +2,95 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, NamedTuple, Optional, Union
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, NamedTuple
 
+import numpy as np
 import torch
 
+from vllm.v1.core.sched.output import SchedulerOutput
+
 if TYPE_CHECKING:
-    from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-        KVConnectorStats)
+    from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
+else:
+    KVConnectorStats = object
 
 
 class LogprobsLists(NamedTuple):
-
-    # [num_reqs, max_num_logprobs + 1]
-    logprob_token_ids: list[list[int]]
-    # [num_reqs, max_num_logprobs + 1]
-    logprobs: list[list[float]]
+    # [num_reqs x num_generated_tokens, max_num_logprobs + 1]
+    logprob_token_ids: np.ndarray
+    # [num_reqs x num_generated_tokens, max_num_logprobs + 1]
+    logprobs: np.ndarray
+    # [num_reqs x num_generated_tokens]
+    sampled_token_ranks: np.ndarray
     # [num_reqs]
-    sampled_token_ranks: list[int]
-
-    def slice(self, start: int, end: int):
+    # Used for slicing the logprobs in cases like speculative
+    # decoding where the number of generated tokens may be
+    # different for each request.
+    cu_num_generated_tokens: list[int] | None = None
+
+    def slice(self, start_req_idx: int, end_req_idx: int):
+        if self.cu_num_generated_tokens:
+            start = self.cu_num_generated_tokens[start_req_idx]
+            end = self.cu_num_generated_tokens[end_req_idx]
+            # Recompute cumulative array starting from 0
+            cu_num_offset = self.cu_num_generated_tokens[start_req_idx]
+            sliced_cu_num_generated_tokens = [
+                cu_num - cu_num_offset
+                for cu_num in self.cu_num_generated_tokens[
+                    start_req_idx : end_req_idx + 1
+                ]
+            ]
+        else:
+            start = start_req_idx
+            end = end_req_idx
+            sliced_cu_num_generated_tokens = None
         return LogprobsLists(
             self.logprob_token_ids[start:end],
             self.logprobs[start:end],
             self.sampled_token_ranks[start:end],
+            sliced_cu_num_generated_tokens,
         )
 
 
 class LogprobsTensors(NamedTuple):
-
-    # [num_reqs, max_num_logprobs + 1]
+    # [num_reqs x num_generated_tokens, max_num_logprobs + 1]
     logprob_token_ids: torch.Tensor
-    # [num_reqs, max_num_logprobs + 1]
+    # [num_reqs x num_generated_tokens, max_num_logprobs + 1]
     logprobs: torch.Tensor
-    # [num_reqs]
+    # [num_reqs x num_generated_tokens]
     selected_token_ranks: torch.Tensor
 
-    def tolists(self):
+    def tolists(self, cu_num_generated_tokens: list[int] | None = None):
         return LogprobsLists(
-            self.logprob_token_ids.tolist(),
-            self.logprobs.tolist(),
-            self.selected_token_ranks.tolist(),
+            self.logprob_token_ids.cpu().numpy(),
+            self.logprobs.cpu().numpy(),
+            self.selected_token_ranks.cpu().numpy(),
+            cu_num_generated_tokens,
+        )
+
+    def to_cpu_nonblocking(self) -> "LogprobsTensors":
+        if self.logprob_token_ids.device.type == "cpu":
+            return self
+        return LogprobsTensors(
+            self.logprob_token_ids.to("cpu", non_blocking=True),
+            self.logprobs.to("cpu", non_blocking=True),
+            self.selected_token_ranks.to("cpu", non_blocking=True),
         )
 
     @staticmethod
-    def empty_cpu(num_positions: int,
-                  num_tokens_per_position: int) -> "LogprobsTensors":
+    def empty_cpu(
+        num_positions: int, num_tokens_per_position: int
+    ) -> "LogprobsTensors":
         """Create empty LogprobsTensors on CPU."""
 
         logprob_token_ids = torch.empty(
-            (num_positions, num_tokens_per_position),
-            dtype=torch.int32,
-            device="cpu")
+            (num_positions, num_tokens_per_position), dtype=torch.int32, device="cpu"
+        )
         logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
-        selected_token_ranks = torch.empty(num_positions,
-                                           dtype=torch.int32,
-                                           device="cpu")
+        selected_token_ranks = torch.empty(
+            num_positions, dtype=torch.int32, device="cpu"
+        )
         return LogprobsTensors(
             logprob_token_ids=logprob_token_ids,
             logprobs=logprobs,
@@ -67,37 +100,55 @@ class LogprobsTensors(NamedTuple):
 
 # [num_reqs, <dynamic>]
 # The shape of each element depends on the pooler used
-PoolerOutput = Union[torch.Tensor, list[torch.Tensor]]
+PoolerOutput = torch.Tensor | list[torch.Tensor]
 
 
 @dataclass
 class SamplerOutput:
-
     # [num_reqs, max_num_generated_tokens]
     # Different requests can have different number of generated tokens.
     # All requests are padded to max_num_generated_tokens.
     # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
     sampled_token_ids: torch.Tensor
-    logprobs_tensors: Optional[LogprobsTensors]
+    logprobs_tensors: LogprobsTensors | None
 
 
 @dataclass
 class KVConnectorOutput:
     # [req_ids]
-    finished_sending: Optional[set[str]] = None
-    finished_recving: Optional[set[str]] = None
-    kv_connector_stats: Optional["KVConnectorStats"] = None
+    finished_sending: set[str] | None = None
+    finished_recving: set[str] | None = None
+    kv_connector_stats: KVConnectorStats | None = None
+    # IDs of externally computed KV blocks that failed to load.
+    # Requests referencing these blocks should be rescheduled to recompute them
+    invalid_block_ids: set[int] = field(default_factory=set)
+    # Configuration describing how many finished sending/receiving
+    # notifications should be expected for each request. This allows
+    # handshake-based connectors like Nixl to update the KVOutputAggregator.
+    # It captures a static setup info and should almost always remain constant
+    # for a given connector after discovery. Default value entails no change.
+    expected_finished_count: int = 0
 
     def is_empty(self):
-        return (not self.finished_sending and not self.finished_recving
-                and not self.kv_connector_stats)
+        return (
+            not self.finished_sending
+            and not self.finished_recving
+            and not self.kv_connector_stats
+            and not self.invalid_block_ids
+        )
+
+
+@dataclass
+class ECConnectorOutput:
+    # [mm_hash]
+    finished_sending: set[str] | None = None
+    finished_recving: set[str] | None = None
 
 
 # ModelRunnerOutput is serialized and sent to the scheduler process.
 # This is expensive for torch.Tensor so prefer to use list instead.
 @dataclass
 class ModelRunnerOutput:
-
     # [num_reqs]
     req_ids: list[str]
     # req_id -> index
@@ -107,35 +158,36 @@ class ModelRunnerOutput:
     # num_generated_tokens is the number of tokens
     # generated in the current step. It can be different for
     # each request due to speculative/jump decoding.
-    sampled_token_ids: list[list[int]]
+    sampled_token_ids: list[np.ndarray]
 
     # [num_reqs, max_num_logprobs + 1]
     # [num_reqs, max_num_logprobs + 1]
     # [num_reqs]
-    logprobs: Optional[LogprobsLists]
+    logprobs: LogprobsLists | None
 
     # req_id -> (token_ids, logprobs, ranks)
     # [prompt_len, num_prompt_logprobs]
     # [prompt_len, num_prompt_logprobs]
     # [prompt_len]
-    prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]
+    prompt_logprobs_dict: dict[str, LogprobsTensors | None]
 
     # [num_reqs, hidden_size]
-    pooler_output: list[Optional[torch.Tensor]]
+    pooler_output: list[torch.Tensor | None]
+
+    kv_connector_output: KVConnectorOutput | None = None
 
-    kv_connector_output: Optional[KVConnectorOutput] = None
+    ec_connector_output: ECConnectorOutput | None = None
 
     # req_id -> num_nans_in_logits
-    num_nans_in_logits: Optional[dict[str, int]] = None
+    num_nans_in_logits: dict[str, int] | None = None
 
 
 # ModelRunnerOutput wrapper for async scheduling.
 class AsyncModelRunnerOutput(ABC):
-
     @abstractmethod
     def get_output(self) -> ModelRunnerOutput:
         """Get the ModelRunnerOutput for this async output.
-        
+
         This is a blocking call that waits until the results are ready, which
         might involve copying device tensors to the host.
         This method should only be called once per AsyncModelRunnerOutput.
@@ -145,17 +197,53 @@ class AsyncModelRunnerOutput(ABC):
 
 @dataclass
 class DraftTokenIds:
-
     # [num_reqs]
     req_ids: list[str]
     # num_reqs x num_draft_tokens
     draft_token_ids: list[list[int]]
 
 
-EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(req_ids=[],
-                                              req_id_to_index={},
-                                              sampled_token_ids=[],
-                                              logprobs=None,
-                                              prompt_logprobs_dict={},
-                                              pooler_output=[],
-                                              num_nans_in_logits=None)
+def make_empty_encoder_model_runner_output(
+    scheduler_output: "SchedulerOutput",
+) -> ModelRunnerOutput:
+    """
+    Create a ModelRunnerOutput stub that contains the correct
+    per-request bookkeeping but no generated data yet.
+    """
+    if not scheduler_output.num_scheduled_tokens:
+        return EMPTY_MODEL_RUNNER_OUTPUT
+
+    # Convert to list so we get a deterministic, indexable sequence
+    req_ids: list[str] = list(scheduler_output.num_scheduled_tokens.keys())
+
+    # Give every request its own contiguous index
+    req_id_to_index: dict[str, int] = {rid: idx for idx, rid in enumerate(req_ids)}
+
+    # No tokens generated yet ⇒ one empty list per request
+    sampled_token_ids: list[list[int]] = [np.array([0]) for _ in req_ids]
+
+    # Pooler outputs are not available yet ⇒ use None placeholders
+    pooler_output: list[torch.Tensor | None] = [None for _ in req_ids]
+
+    return ModelRunnerOutput(
+        req_ids=req_ids,
+        req_id_to_index=req_id_to_index,
+        sampled_token_ids=sampled_token_ids,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=pooler_output,
+        kv_connector_output=None,
+        ec_connector_output=None,
+        num_nans_in_logits=None,
+    )
+
+
+EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
+    req_ids=[],
+    req_id_to_index={},
+    sampled_token_ids=[],
+    logprobs=None,
+    prompt_logprobs_dict={},
+    pooler_output=[],
+    num_nans_in_logits=None,
+)
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index 46506d272e90a507beea90b5629b38fee6a3f177..9883ab8fb9964c1a8caae0d1c6524a2640bed87d 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
 from vllm.pooling_params import PoolingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 pin_memory = is_pin_memory_available()
 
@@ -29,49 +28,55 @@ class PoolingCursor:
         )
 
     def is_partial_prefill(self):
-        return not torch.all(
-            self.prompt_lens_cpu == self.num_scheduled_tokens_cpu)
+        return not torch.all(self.prompt_lens_cpu == self.num_scheduled_tokens_cpu)
 
 
 @dataclass
 class PoolingMetadata:
     """Tensors for pooling."""
+
     prompt_lens: torch.Tensor  # CPU Tensor
-    prompt_token_ids: Optional[torch.Tensor]
+    prompt_token_ids: torch.Tensor | None
     pooling_params: list[PoolingParams]
-    pooling_cursor: Optional[PoolingCursor] = None
+    pooling_cursor: PoolingCursor | None = None
 
     def __getitem__(self, indices: slice):
         return PoolingMetadata(
             prompt_lens=self.prompt_lens[indices],
-            prompt_token_ids=None if self.prompt_token_ids is None else
-            self.prompt_token_ids[indices],
+            prompt_token_ids=None
+            if self.prompt_token_ids is None
+            else self.prompt_token_ids[indices],
             pooling_params=self.pooling_params[indices],
             pooling_cursor=None
-            if self.pooling_cursor is None else self.pooling_cursor[indices],
+            if self.pooling_cursor is None
+            else self.pooling_cursor[indices],
         )
 
-    def build_pooling_cursor(self, num_scheduled_tokens: list[int],
-                             device: torch.device):
-        self.pooling_cursor = build_pooling_cursor(num_scheduled_tokens,
-                                                   self.prompt_lens, device)
+    def build_pooling_cursor(
+        self, num_scheduled_tokens: list[int], device: torch.device
+    ):
+        self.pooling_cursor = build_pooling_cursor(
+            num_scheduled_tokens, self.prompt_lens, device
+        )
 
 
-def build_pooling_cursor(num_scheduled_tokens: list[int],
-                         prompt_lens: torch.Tensor, device: torch.device):
+def build_pooling_cursor(
+    num_scheduled_tokens: list[int], prompt_lens: torch.Tensor, device: torch.device
+):
     assert len(prompt_lens) == len(num_scheduled_tokens)
 
     n_seq = len(num_scheduled_tokens)
     index = list(range(n_seq))
     num_scheduled_tokens = torch.tensor(num_scheduled_tokens, device="cpu")
-    cumsum = torch.zeros(n_seq + 1,
-                         dtype=torch.int64,
-                         pin_memory=pin_memory,
-                         device="cpu")
+    cumsum = torch.zeros(
+        n_seq + 1, dtype=torch.int64, pin_memory=pin_memory, device="cpu"
+    )
     torch.cumsum(num_scheduled_tokens, dim=0, out=cumsum[1:])
     cumsum = cumsum.to(device, non_blocking=True)
-    return PoolingCursor(index=index,
-                         first_token_indices_gpu=cumsum[:n_seq],
-                         last_token_indices_gpu=cumsum[1:] - 1,
-                         prompt_lens_cpu=prompt_lens,
-                         num_scheduled_tokens_cpu=num_scheduled_tokens)
+    return PoolingCursor(
+        index=index,
+        first_token_indices_gpu=cumsum[:n_seq],
+        last_token_indices_gpu=cumsum[1:] - 1,
+        prompt_lens_cpu=prompt_lens,
+        num_scheduled_tokens_cpu=num_scheduled_tokens,
+    )
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index ff10fa00c1cf681aad5f07ca4f5e7f3417888a3d..3d92906fbf4b125ce95bd2249e1cc00f8682130e 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -3,9 +3,9 @@
 
 import enum
 import time
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from functools import partial
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 
@@ -13,8 +13,12 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
-                            EngineCoreRequest, FinishReason)
+from vllm.v1.engine import (
+    EngineCoreEvent,
+    EngineCoreEventType,
+    EngineCoreRequest,
+    FinishReason,
+)
 from vllm.v1.structured_output.request import StructuredOutputRequest
 from vllm.v1.utils import ConstantList
 
@@ -24,25 +28,22 @@ if TYPE_CHECKING:
 
 
 class Request:
-
     def __init__(
         self,
         request_id: str,
-        prompt_token_ids: Optional[list[int]],
-        sampling_params: Optional[SamplingParams],
-        pooling_params: Optional[PoolingParams],
-        eos_token_id: Optional[int],
+        prompt_token_ids: list[int] | None,
+        sampling_params: SamplingParams | None,
+        pooling_params: PoolingParams | None,
+        eos_token_id: int | None,
         client_index: int = 0,
-        arrival_time: Optional[float] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        mm_features: Optional[list[MultiModalFeatureSpec]] = None,
+        arrival_time: float | None = None,
+        prompt_embeds: torch.Tensor | None = None,
+        mm_features: list[MultiModalFeatureSpec] | None = None,
         lora_request: Optional["LoRARequest"] = None,
-        structured_output_request: Optional["StructuredOutputRequest"] = None,
-        cache_salt: Optional[str] = None,
+        cache_salt: str | None = None,
         priority: int = 0,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        block_hasher: Optional[Callable[["Request"],
-                                        list["BlockHash"]]] = None,
+        trace_headers: Mapping[str, str] | None = None,
+        block_hasher: Callable[["Request"], list["BlockHash"]] | None = None,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -52,17 +53,17 @@ class Request:
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
-        self.structured_output_request = structured_output_request
-        self.arrival_time = arrival_time if arrival_time is not None else \
-            time.time()
+        self.structured_output_request = StructuredOutputRequest.from_sampling_params(
+            sampling_params
+        )
+        self.arrival_time = arrival_time if arrival_time is not None else time.time()
 
         self.status = RequestStatus.WAITING
-        self.use_structured_output = False
         self.events: list[EngineCoreEvent] = []
-        self.stop_reason: Union[int, str, None] = None
+        self.stop_reason: int | str | None = None
 
         # P/D: Connector-specific KV transfer parameters.
-        self.kv_transfer_params: Optional[dict[str, Any]] = None
+        self.kv_transfer_params: dict[str, Any] | None = None
 
         if pooling_params is not None:
             # Pooling models.
@@ -71,29 +72,31 @@ class Request:
             # Generative models.
             assert sampling_params.max_tokens is not None
             self.max_tokens = sampling_params.max_tokens
-            if sampling_params.structured_outputs is not None:
+            if self.structured_output_request is not None:
                 self.status = RequestStatus.WAITING_FOR_FSM
-                self.use_structured_output = True
 
             if sampling_params.extra_args is not None:
-                self.kv_transfer_params = \
-                    sampling_params.extra_args.get("kv_transfer_params")
+                self.kv_transfer_params = sampling_params.extra_args.get(
+                    "kv_transfer_params"
+                )
         else:
-            raise ValueError(
-                "sampling_params and pooling_params can't both be unset")
+            raise ValueError("sampling_params and pooling_params can't both be unset")
 
         self.prompt_token_ids = prompt_token_ids
         self.prompt_embeds = prompt_embeds
         self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
-            prompt_token_ids, prompt_embeds)
+            prompt_token_ids, prompt_embeds
+        )
         self._output_token_ids: list[int] = []
-        self._all_token_ids: list[int] = self.prompt_token_ids.copy(
-        ) if self.prompt_token_ids is not None else [0
-                                                     ] * self.num_prompt_tokens
+        self._all_token_ids: list[int] = (
+            self.prompt_token_ids.copy()
+            if self.prompt_token_ids is not None
+            else [0] * self.num_prompt_tokens
+        )
         self.num_output_placeholders = 0  # Used in async scheduling.
         self.spec_token_ids: list[int] = []
         self.num_computed_tokens = 0
-        self.cache_salt: Optional[str] = cache_salt
+        self.cache_salt: str | None = cache_salt
 
         # Multi-modal related
         self.mm_features = mm_features or []
@@ -115,17 +118,22 @@ class Request:
         # indicates that the output is corrupted
         self.num_nans_in_logits = 0
 
+        # The number of requests being preempted by the scheduler
+        self.num_preemptions = 0
+
         self.block_hashes: list[BlockHash] = []
-        self.get_hash_new_full_blocks: Optional[Callable[
-            [], list[BlockHash]]] = None
+        self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
         if block_hasher is not None:
             self.get_hash_new_full_blocks = partial(block_hasher, self)
             self.block_hashes = self.get_hash_new_full_blocks()
 
+        self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache()
+
     @classmethod
     def from_engine_core_request(
-        cls, request: EngineCoreRequest,
-        block_hasher: Optional[Callable[["Request"], list["BlockHash"]]]
+        cls,
+        request: EngineCoreRequest,
+        block_hasher: Callable[["Request"], list["BlockHash"]] | None,
     ) -> "Request":
         return cls(
             request_id=request.request_id,
@@ -138,9 +146,6 @@ class Request:
             eos_token_id=request.eos_token_id,
             arrival_time=request.arrival_time,
             lora_request=request.lora_request,
-            structured_output_request=StructuredOutputRequest(
-                sampling_params=request.sampling_params) \
-                    if request.sampling_params else None,
             cache_salt=request.cache_salt,
             priority=request.priority,
             trace_headers=request.trace_headers,
@@ -149,7 +154,7 @@ class Request:
 
     def append_output_token_ids(
         self,
-        token_ids: Union[int, list[int]],
+        token_ids: int | list[int],
     ) -> None:
         if isinstance(token_ids, int):
             self._output_token_ids.append(token_ids)
@@ -162,8 +167,8 @@ class Request:
             self.block_hashes.extend(self.get_hash_new_full_blocks())
 
     @property
-    def is_output_corrupted(self) -> bool:
-        return self.num_nans_in_logits > 0
+    def use_structured_output(self) -> bool:
+        return self.structured_output_request is not None
 
     @property
     def num_tokens(self) -> int:
@@ -177,10 +182,23 @@ class Request:
     def num_output_tokens(self) -> int:
         return len(self._output_token_ids)
 
+    def get_skip_reading_prefix_cache(self) -> bool:
+        if (
+            self.sampling_params is not None
+            and self.sampling_params.skip_reading_prefix_cache is not None
+        ):
+            return self.sampling_params.skip_reading_prefix_cache
+        elif (
+            self.pooling_params is not None
+            and self.pooling_params.skip_reading_prefix_cache is not None
+        ):
+            return self.pooling_params.skip_reading_prefix_cache
+        return False
+
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[FinishReason, None]:
+    def get_finished_reason(self) -> FinishReason | None:
         return RequestStatus.get_finished_reason(self.status)
 
     def get_num_encoder_tokens(self, input_id: int) -> int:
@@ -191,11 +209,11 @@ class Request:
     def record_event(
         self,
         event_type: EngineCoreEventType,
-        timestamp: Optional[float] = None,
+        timestamp: float | None = None,
     ) -> None:
         self.events.append(EngineCoreEvent.new_event(event_type, timestamp))
 
-    def take_events(self) -> Optional[list[EngineCoreEvent]]:
+    def take_events(self) -> list[EngineCoreEvent] | None:
         if not self.events:
             return None
         events, self.events = self.events, []
@@ -204,6 +222,7 @@ class Request:
 
 class RequestStatus(enum.IntEnum):
     """Status of a request."""
+
     WAITING = enum.auto()
     WAITING_FOR_FSM = enum.auto()
     WAITING_FOR_REMOTE_KVS = enum.auto()
@@ -224,8 +243,7 @@ class RequestStatus(enum.IntEnum):
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(
-            status: "RequestStatus") -> Union[FinishReason, None]:
+    def get_finished_reason(status: "RequestStatus") -> FinishReason | None:
         return _FINISHED_REASON_MAP.get(status)
 
 
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index 10cad5b53071677daa77b1bc4384a2d687ff1a97..8b174af4c7794737972c185992e325cf72eab3eb 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -5,23 +5,27 @@ import inspect
 import itertools
 from abc import abstractmethod
 from collections.abc import Sequence
-from functools import partial
-from typing import TYPE_CHECKING, Optional, Union
+from functools import lru_cache, partial
+from typing import TYPE_CHECKING
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.logits_process import LogitsProcessor as RequestLogitsProcessor
 from vllm.sampling_params import SamplingParams
-from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor,
-                                                     MinPLogitsProcessor,
-                                                     MinTokensLogitsProcessor,
-                                                     process_dict_updates)
-from vllm.v1.sample.logits_processor.interface import (BatchUpdate,
-                                                       LogitsProcessor,
-                                                       MoveDirectionality)
-from vllm.v1.sample.logits_processor.state import (BatchUpdateBuilder,
-                                                   LogitsProcessors)
+from vllm.utils.torch_utils import guard_cuda_initialization
+from vllm.v1.sample.logits_processor.builtin import (
+    LogitBiasLogitsProcessor,
+    MinPLogitsProcessor,
+    MinTokensLogitsProcessor,
+    process_dict_updates,
+)
+from vllm.v1.sample.logits_processor.interface import (
+    BatchUpdate,
+    LogitsProcessor,
+    MoveDirectionality,
+)
+from vllm.v1.sample.logits_processor.state import BatchUpdateBuilder, LogitsProcessors
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -30,10 +34,17 @@ logger = init_logger(__name__)
 
 # Error message when the user tries to initialize vLLM with a pooling model
 # and custom logitsproces
-STR_POOLING_REJECTS_LOGITSPROCS = ("Pooling models do not support custom"
-                                   " logits processors.")
+STR_POOLING_REJECTS_LOGITSPROCS = (
+    "Pooling models do not support custom logits processors."
+)
 
-LOGITSPROCS_GROUP = 'vllm.logits_processors'
+# Error message when the user tries to initialize vLLM with a speculative
+# decoding enabled and custom logitsproces
+STR_SPEC_DEC_REJECTS_LOGITSPROCS = (
+    "Custom logits processors are not supported when speculative decoding is enabled."
+)
+
+LOGITSPROCS_GROUP = "vllm.logits_processors"
 
 BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [
     MinTokensLogitsProcessor,
@@ -45,36 +56,35 @@ BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [
 def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]:
     """Load all installed logit processor plugins"""
 
-    import sys
-
-    if sys.version_info < (3, 10):
-        from importlib_metadata import entry_points
-    else:
-        from importlib.metadata import entry_points
+    from importlib.metadata import entry_points
 
     installed_logitsprocs_plugins = entry_points(group=LOGITSPROCS_GROUP)
     if len(installed_logitsprocs_plugins) == 0:
-        logger.debug("No logitsprocs plugins installed (group %s).",
-                     LOGITSPROCS_GROUP)
+        logger.debug("No logitsprocs plugins installed (group %s).", LOGITSPROCS_GROUP)
         return []
 
     # Load logitsprocs plugins
-    logger.debug("Loading installed logitsprocs plugins (group %s):",
-                 LOGITSPROCS_GROUP)
+    logger.debug("Loading installed logitsprocs plugins (group %s):", LOGITSPROCS_GROUP)
     classes: list[type[LogitsProcessor]] = []
     for entrypoint in installed_logitsprocs_plugins:
         try:
-            logger.debug("- Loading logitproc plugin entrypoint=%s target=%s",
-                         entrypoint.name, entrypoint.value)
-            classes.append(entrypoint.load())
+            logger.debug(
+                "- Loading logitproc plugin entrypoint=%s target=%s",
+                entrypoint.name,
+                entrypoint.value,
+            )
+            with guard_cuda_initialization():
+                classes.append(entrypoint.load())
         except Exception as e:
+            logger.error("Failed to load LogitsProcessor plugin %s: %s", entrypoint, e)
             raise RuntimeError(
-                f"Failed to load LogitsProcessor plugin {entrypoint}") from e
+                f"Failed to load LogitsProcessor plugin {entrypoint}"
+            ) from e
     return classes
 
 
 def _load_logitsprocs_by_fqcns(
-    logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]]
+    logits_processors: Sequence[str | type[LogitsProcessor]] | None,
 ) -> list[type[LogitsProcessor]]:
     """Load logit processor types, identifying them by fully-qualified class
     names (FQCNs).
@@ -99,13 +109,14 @@ def _load_logitsprocs_by_fqcns(
 
     logger.debug(
         "%s additional custom logits processors specified, checking whether "
-        "they need to be loaded.", len(logits_processors))
+        "they need to be loaded.",
+        len(logits_processors),
+    )
 
     classes: list[type[LogitsProcessor]] = []
     for ldx, logitproc in enumerate(logits_processors):
         if isinstance(logitproc, type):
-            logger.debug(" - Already-loaded logit processor: %s",
-                         logitproc.__name__)
+            logger.debug(" - Already-loaded logit processor: %s", logitproc.__name__)
             if not issubclass(logitproc, LogitsProcessor):
                 raise ValueError(
                     f"{logitproc.__name__} is not a subclass of LogitsProcessor"
@@ -118,8 +129,15 @@ def _load_logitsprocs_by_fqcns(
 
         try:
             # Load module
-            module = importlib.import_module(module_path)
+            with guard_cuda_initialization():
+                module = importlib.import_module(module_path)
         except Exception as e:
+            logger.error(
+                "Failed to load %sth LogitsProcessor plugin %s: %s",
+                ldx,
+                logitproc,
+                e,
+            )
             raise RuntimeError(
                 f"Failed to load {ldx}th LogitsProcessor plugin {logitproc}"
             ) from e
@@ -131,15 +149,14 @@ def _load_logitsprocs_by_fqcns(
         if not isinstance(obj, type):
             raise ValueError("Loaded logit processor must be a type.")
         if not issubclass(obj, LogitsProcessor):
-            raise ValueError(
-                f"{obj.__name__} must be a subclass of LogitsProcessor")
+            raise ValueError(f"{obj.__name__} must be a subclass of LogitsProcessor")
         classes.append(obj)
 
     return classes
 
 
 def _load_custom_logitsprocs(
-    logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]],
+    logits_processors: Sequence[str | type[LogitsProcessor]] | None,
 ) -> list[type[LogitsProcessor]]:
     """Load all custom logits processors.
 
@@ -155,13 +172,13 @@ def _load_custom_logitsprocs(
       A list of all loaded logitproc types
     """
     from vllm.platforms import current_platform
+
     if current_platform.is_tpu():
         # No logitsprocs specified by caller
         # TODO(andy) - vLLM V1 on TPU does not support custom logitsprocs
         return []
 
-    return (_load_logitsprocs_plugins() +
-            _load_logitsprocs_by_fqcns(logits_processors))
+    return _load_logitsprocs_plugins() + _load_logitsprocs_by_fqcns(logits_processors)
 
 
 def build_logitsprocs(
@@ -169,28 +186,58 @@ def build_logitsprocs(
     device: torch.device,
     is_pin_memory: bool,
     is_pooling_model: bool,
-    custom_logitsprocs: Sequence[Union[str, type[LogitsProcessor]]] = (),
+    custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = (),
 ) -> LogitsProcessors:
     if is_pooling_model:
         if custom_logitsprocs:
             raise ValueError(STR_POOLING_REJECTS_LOGITSPROCS)
-        logger.debug("Skipping logits processor loading because pooling models"
-                     " do not support logits processors.")
+        logger.debug(
+            "Skipping logits processor loading because pooling models"
+            " do not support logits processors."
+        )
+        return LogitsProcessors()
+
+    # Check if speculative decoding is enabled.
+    if vllm_config.speculative_config:
+        if custom_logitsprocs:
+            raise ValueError(STR_SPEC_DEC_REJECTS_LOGITSPROCS)
+        logger.warning(
+            "min_p, logit_bias, and min_tokens parameters won't currently work "
+            "with speculative decoding enabled."
+        )
         return LogitsProcessors()
+
     custom_logitsprocs_classes = _load_custom_logitsprocs(custom_logitsprocs)
     return LogitsProcessors(
-        ctor(vllm_config, device, is_pin_memory) for ctor in itertools.chain(
-            BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes))
+        ctor(vllm_config, device, is_pin_memory)
+        for ctor in itertools.chain(
+            BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes
+        )
+    )
+
+
+cached_load_custom_logitsprocs = lru_cache(_load_custom_logitsprocs)
+
+
+def validate_logits_processors_parameters(
+    logits_processors: Sequence[str | type[LogitsProcessor]] | None,
+    sampling_params: SamplingParams,
+):
+    logits_processors = (
+        tuple(logits_processors) if logits_processors is not None else None
+    )
+    for logits_procs in cached_load_custom_logitsprocs(logits_processors):
+        logits_procs.validate_params(sampling_params)
 
 
 class AdapterLogitsProcessor(LogitsProcessor):
     """Wrapper for per-request logits processors
-    
+
     To wrap a specific per-request logits processor,
     * Subclass `AdapterLogitsProcessor`
     * Implement `self.is_argmax_invariant()` base-class method
     * Implement `self.new_req_logits_processor(params)`
-    
+
     `self.__init__(vllm_config, device, is_pin_memory)` does not need to be
     overridden in general. However, to implement custom constructor behavior -
     especially any logic which operates on or stores `vllm_config`, `device`,
@@ -199,8 +246,9 @@ class AdapterLogitsProcessor(LogitsProcessor):
     `super().__init__(vllm_config, device, is_pin_memory)`
     """
 
-    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
-                 is_pin_memory: bool):
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ):
         """Subclass must invoke
         `super().__init__(vllm_config, device, is_pin_memory)`.
 
@@ -225,7 +273,7 @@ class AdapterLogitsProcessor(LogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """Consume request info; return a per-request logits processor.
 
         Return None if logits processor does not need to be applied to request
@@ -236,16 +284,16 @@ class AdapterLogitsProcessor(LogitsProcessor):
         Returns:
           None if logits processor should not be applied to request; otherwise
           returns a `RequestLogitsProcessor` instance
-        
+
         """
         raise NotImplementedError
 
     def _new_state(
         self,
         params: SamplingParams,
-        prompt_ids: Optional[list[int]],
+        prompt_ids: list[int] | None,
         output_ids: list[int],
-    ) -> Optional[partial[torch.Tensor]]:
+    ) -> partial[torch.Tensor] | None:
         """Return state representation for new request
 
         Returns None if logits processor is not applicable to request
@@ -257,15 +305,18 @@ class AdapterLogitsProcessor(LogitsProcessor):
 
         Returns:
           logits processor partial[Tensor] or None
-        
+
         """
         if req_lp := self.new_req_logits_processor(params):
-            args = [prompt_ids, output_ids] if (len(
-                inspect.signature(req_lp).parameters) == 3) else [output_ids]
+            args = (
+                [prompt_ids, output_ids]
+                if (len(inspect.signature(req_lp).parameters) == 3)
+                else [output_ids]
+            )
             return partial(req_lp, *args)
         return None
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         process_dict_updates(
             self.req_info,
             batch_update,
@@ -286,9 +337,16 @@ class AdapterLogitsProcessor(LogitsProcessor):
 
 
 __all__ = [
-    "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor",
-    "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder",
-    "MoveDirectionality", "LogitsProcessors", "build_logitsprocs",
-    "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP",
-    "AdapterLogitsProcessor"
+    "LogitsProcessor",
+    "LogitBiasLogitsProcessor",
+    "MinPLogitsProcessor",
+    "MinTokensLogitsProcessor",
+    "BatchUpdate",
+    "BatchUpdateBuilder",
+    "MoveDirectionality",
+    "LogitsProcessors",
+    "build_logitsprocs",
+    "STR_POOLING_REJECTS_LOGITSPROCS",
+    "LOGITSPROCS_GROUP",
+    "AdapterLogitsProcessor",
 ]
diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index fc655d993cb4cdd02ad5cbba617d1a200a4d54ec..4ee7dc2880c8c885ac4f96239ccd43e9ad62fe63 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -1,14 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING, TypeVar
 
 import torch
 
 from vllm import SamplingParams
-from vllm.v1.sample.logits_processor.interface import (BatchUpdate,
-                                                       LogitsProcessor,
-                                                       MoveDirectionality)
+from vllm.v1.sample.logits_processor.interface import (
+    BatchUpdate,
+    LogitsProcessor,
+    MoveDirectionality,
+)
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -17,25 +19,24 @@ T = TypeVar("T")
 
 
 class MinPLogitsProcessor(LogitsProcessor):
-
-    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
-                 is_pin_memory: bool):
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ):
         max_num_reqs = vllm_config.scheduler_config.max_num_seqs
         self.min_p_count: int = 0
 
-        self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ),
-                                            dtype=torch.float32,
-                                            device="cpu",
-                                            pin_memory=is_pin_memory)
+        self.min_p_cpu_tensor = torch.zeros(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=is_pin_memory
+        )
         self.min_p_cpu = self.min_p_cpu_tensor.numpy()
 
         self.use_double_tensor = torch.device(device).type != "cpu"
 
         if self.use_double_tensor:
             # Pre-allocated device tensor
-            self.min_p_device: torch.Tensor = torch.empty((max_num_reqs, ),
-                                                          dtype=torch.float32,
-                                                          device=device)
+            self.min_p_device: torch.Tensor = torch.empty(
+                (max_num_reqs,), dtype=torch.float32, device=device
+            )
         else:
             self.min_p_device = self.min_p_cpu_tensor
         # Current slice of the device tensor
@@ -48,7 +49,7 @@ class MinPLogitsProcessor(LogitsProcessor):
     def get_min_p_by_index(self, index: int) -> float:
         return float(self.min_p_cpu[index])
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         if not batch_update:
             return
 
@@ -93,8 +94,7 @@ class MinPLogitsProcessor(LogitsProcessor):
         if self.min_p_count and (needs_update or self.min_p.shape[0] != size):
             self.min_p = self.min_p_device[:size]
             if self.use_double_tensor:
-                self.min_p.copy_(self.min_p_cpu_tensor[:size],
-                                 non_blocking=True)
+                self.min_p.copy_(self.min_p_cpu_tensor[:size], non_blocking=True)
             self.min_p.unsqueeze_(1)
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
@@ -104,38 +104,37 @@ class MinPLogitsProcessor(LogitsProcessor):
         # Convert logits to probability distribution
         probability_values = torch.nn.functional.softmax(logits, dim=-1)
         # Calculate maximum probabilities per sequence
-        max_probabilities = torch.amax(probability_values,
-                                       dim=-1,
-                                       keepdim=True)
+        max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
         # Adjust min_p
         adjusted_min_p = max_probabilities.mul_(self.min_p)
         # Identify valid tokens using threshold comparison
         invalid_token_mask = probability_values < adjusted_min_p
         # Apply mask using boolean indexing
-        logits[invalid_token_mask] = -float('inf')
+        logits[invalid_token_mask] = -float("inf")
         return logits
 
 
 class LogitBiasLogitsProcessor(LogitsProcessor):
-
     def __init__(self, _, device: torch.device, is_pin_memory: bool):
         self.device = device
         self.pin_memory = is_pin_memory
         self.biases: dict[int, dict[int, float]] = {}
 
         self.bias_tensor: torch.Tensor = torch.tensor(())
-        self.logits_slice = (self._device_tensor([], torch.int32),
-                             self._device_tensor([], torch.int32))
+        self.logits_slice = (
+            self._device_tensor([], torch.int32),
+            self._device_tensor([], torch.int32),
+        )
 
     def is_argmax_invariant(self) -> bool:
         """Logit bias can rebalance token probabilities and change the
         outcome of argmax in greedy sampling."""
         return False
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         needs_update = process_dict_updates(
-            self.biases, batch_update,
-            lambda params, _, __: params.logit_bias or None)
+            self.biases, batch_update, lambda params, _, __: params.logit_bias or None
+        )
 
         # Update tensors if needed.
         if needs_update:
@@ -148,15 +147,15 @@ class LogitBiasLogitsProcessor(LogitsProcessor):
                 biases.extend(lb.values())
 
             self.bias_tensor = self._device_tensor(biases, torch.float32)
-            self.logits_slice = (self._device_tensor(reqs, torch.int32),
-                                 self._device_tensor(tok_ids, torch.int32))
+            self.logits_slice = (
+                self._device_tensor(reqs, torch.int32),
+                self._device_tensor(tok_ids, torch.int32),
+            )
 
     def _device_tensor(self, data: list, dtype: torch.dtype) -> torch.Tensor:
-        return (torch.tensor(data,
-                             device="cpu",
-                             dtype=dtype,
-                             pin_memory=self.pin_memory).to(device=self.device,
-                                                            non_blocking=True))
+        return torch.tensor(
+            data, device="cpu", dtype=dtype, pin_memory=self.pin_memory
+        ).to(device=self.device, non_blocking=True)
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
         if self.biases:
@@ -165,20 +164,19 @@ class LogitBiasLogitsProcessor(LogitsProcessor):
 
 
 class MinTokensLogitsProcessor(LogitsProcessor):
-
-    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
-                 is_pin_memory: bool):
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ):
         # index -> (min_toks, output_token_ids, stop_token_ids)
         self.device = device
         self.pin_memory = is_pin_memory
         self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {}
 
         # (req_idx_tensor,eos_tok_id_tensor)
-        self.logits_slice: tuple[torch.Tensor,
-                                 torch.Tensor] = (self._device_tensor(
-                                     [], torch.int32),
-                                                  self._device_tensor(
-                                                      [], torch.int32))
+        self.logits_slice: tuple[torch.Tensor, torch.Tensor] = (
+            self._device_tensor([], torch.int32),
+            self._device_tensor([], torch.int32),
+        )
 
     def is_argmax_invariant(self) -> bool:
         """By censoring stop tokens, min-tokens can change the outcome
@@ -187,22 +185,24 @@ class MinTokensLogitsProcessor(LogitsProcessor):
 
     @staticmethod
     def add_request(
-        params: SamplingParams, _: Optional[list[int]],
-        output_tok_ids: list[int]
-    ) -> Optional[tuple[int, Sequence[int], set[int]]]:
+        params: SamplingParams, _: list[int] | None, output_tok_ids: list[int]
+    ) -> tuple[int, Sequence[int], set[int]] | None:
         min_tokens = params.min_tokens
         if not min_tokens or len(output_tok_ids) >= min_tokens:
             return None
         return min_tokens, output_tok_ids, params.all_stop_token_ids
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
-        needs_update = process_dict_updates(self.min_toks, batch_update,
-                                            self.add_request)
+    def update_state(self, batch_update: BatchUpdate | None):
+        needs_update = process_dict_updates(
+            self.min_toks, batch_update, self.add_request
+        )
         if self.min_toks:
             # Check for any requests that have attained their min tokens.
-            to_remove = tuple(index for index, (min_toks, out_tok_ids,
-                                                _) in self.min_toks.items()
-                              if len(out_tok_ids) >= min_toks)
+            to_remove = tuple(
+                index
+                for index, (min_toks, out_tok_ids, _) in self.min_toks.items()
+                if len(out_tok_ids) >= min_toks
+            )
             if to_remove:
                 needs_update = True
                 for index in to_remove:
@@ -216,15 +216,15 @@ class MinTokensLogitsProcessor(LogitsProcessor):
                 reqs.extend([req] * len(stop_tok_ids))
                 tok_ids.extend(stop_tok_ids)
 
-            self.logits_slice = (self._device_tensor(reqs, torch.int32),
-                                 self._device_tensor(tok_ids, torch.int32))
+            self.logits_slice = (
+                self._device_tensor(reqs, torch.int32),
+                self._device_tensor(tok_ids, torch.int32),
+            )
 
     def _device_tensor(self, data: list, dtype: torch.dtype) -> torch.Tensor:
-        return (torch.tensor(data,
-                             device="cpu",
-                             dtype=dtype,
-                             pin_memory=self.pin_memory).to(device=self.device,
-                                                            non_blocking=True))
+        return torch.tensor(
+            data, device="cpu", dtype=dtype, pin_memory=self.pin_memory
+        ).to(device=self.device, non_blocking=True)
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
         if self.min_toks:
@@ -234,9 +234,9 @@ class MinTokensLogitsProcessor(LogitsProcessor):
 
 
 def process_dict_updates(
-    req_entries: dict[int, T], batch_update: Optional[BatchUpdate],
-    new_state: Callable[[SamplingParams, Optional[list[int]], list[int]],
-                        Optional[T]]
+    req_entries: dict[int, T],
+    batch_update: BatchUpdate | None,
+    new_state: Callable[[SamplingParams, list[int] | None, list[int]], T | None],
 ) -> bool:
     """Utility function to update dict state for sparse LogitsProcessors."""
 
@@ -246,8 +246,7 @@ def process_dict_updates(
 
     updated = False
     for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
-        if (state := new_state(params, prompt_tok_ids,
-                               output_tok_ids)) is not None:
+        if (state := new_state(params, prompt_tok_ids, output_tok_ids)) is not None:
             req_entries[index] = state
             updated = True
         elif req_entries.pop(index, None) is not None:
diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py
index a84afc2f347a0e70bb07c0d73e116f431447a403..0cbfb187878a2e8a800ea57ef5f5c7575b0828f6 100644
--- a/vllm/v1/sample/logits_processor/interface.py
+++ b/vllm/v1/sample/logits_processor/interface.py
@@ -26,7 +26,7 @@ RemovedRequest = int
 
 # (index, params, prompt_tok_ids, output_tok_ids) tuples for new
 # requests added to the batch.
-AddedRequest = tuple[int, SamplingParams, Optional[list[int]], list[int]]
+AddedRequest = tuple[int, SamplingParams, list[int] | None, list[int]]
 
 # (index 1, index 2, directionality) tuples representing
 # one-way moves or two-way swaps of requests in batch
@@ -36,6 +36,7 @@ MovedRequest = tuple[int, int, MoveDirectionality]
 @dataclass(frozen=True)
 class BatchUpdate:
     """Persistent batch state change info for logitsprocs"""
+
     batch_size: int  # Current num reqs in batch
 
     # Metadata for requests added to, removed from, and moved
@@ -57,10 +58,18 @@ class BatchUpdate:
 
 
 class LogitsProcessor(ABC):
+    @classmethod
+    def validate_params(cls, sampling_params: SamplingParams):
+        """Validate sampling params for this logits processor.
+
+        Raise ValueError for invalid ones.
+        """
+        return None
 
     @abstractmethod
-    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
-                 is_pin_memory: bool) -> None:
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ) -> None:
         raise NotImplementedError
 
     @abstractmethod
diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py
index 0a1196559d3e36d97dff749d25b5a25b68216528..c15219da5cf7939cf2e74a8e995e32cdd2893a8c 100644
--- a/vllm/v1/sample/logits_processor/state.py
+++ b/vllm/v1/sample/logits_processor/state.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterator
 from itertools import chain
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
-from vllm.v1.sample.logits_processor.interface import (AddedRequest,
-                                                       BatchUpdate,
-                                                       MovedRequest,
-                                                       RemovedRequest)
+from vllm.v1.sample.logits_processor.interface import (
+    AddedRequest,
+    BatchUpdate,
+    MovedRequest,
+    RemovedRequest,
+)
 
 if TYPE_CHECKING:
     from vllm.v1.sample.logits_processor.interface import LogitsProcessor
@@ -41,9 +43,9 @@ class BatchUpdateBuilder:
 
     def __init__(
         self,
-        removed: Optional[list[RemovedRequest]] = None,
-        added: Optional[list[AddedRequest]] = None,
-        moved: Optional[list[MovedRequest]] = None,
+        removed: list[RemovedRequest] | None = None,
+        added: list[AddedRequest] | None = None,
+        moved: list[MovedRequest] | None = None,
     ) -> None:
         self._removed = removed or []
         self.added = added or []
@@ -81,22 +83,23 @@ class BatchUpdateBuilder:
           index: request index
         """
         if self._is_removed_sorted:
-            raise RuntimeError("Cannot register new removed request after"
-                               " self.removed has been read.")
+            raise RuntimeError(
+                "Cannot register new removed request after self.removed has been read."
+            )
         self._removed.append(index)
         self.batch_changed = True
 
     def has_removed(self) -> bool:
         return bool(self._removed)
 
-    def peek_removed(self) -> Optional[int]:
+    def peek_removed(self) -> int | None:
         """Return lowest removed request index"""
         if self.has_removed():
             self._ensure_removed_sorted()
             return self._removed[-1]
         return None
 
-    def pop_removed(self) -> Optional[int]:
+    def pop_removed(self) -> int | None:
         """Pop lowest removed request index"""
         if self.has_removed():
             self._ensure_removed_sorted()
@@ -113,10 +116,10 @@ class BatchUpdateBuilder:
         self.batch_changed = False
         return batch_changed
 
-    def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]:
+    def get_and_reset(self, batch_size: int) -> BatchUpdate | None:
         """Generate a logitsprocs batch update data structure and reset
         internal batch update builder state.
-        
+
         Args:
           batch_size: current persistent batch size
 
@@ -145,15 +148,16 @@ class BatchUpdateBuilder:
 class LogitsProcessors:
     """Encapsulates initialized logitsproc objects."""
 
-    def __init__(
-            self,
-            logitsprocs: Optional[Iterator["LogitsProcessor"]] = None) -> None:
+    def __init__(self, logitsprocs: Iterator["LogitsProcessor"] | None = None) -> None:
         self.argmax_invariant: list[LogitsProcessor] = []
         self.non_argmax_invariant: list[LogitsProcessor] = []
         if logitsprocs:
             for logitproc in logitsprocs:
-                (self.argmax_invariant if logitproc.is_argmax_invariant() else
-                 self.non_argmax_invariant).append(logitproc)
+                (
+                    self.argmax_invariant
+                    if logitproc.is_argmax_invariant()
+                    else self.non_argmax_invariant
+                ).append(logitproc)
 
     @property
     def all(self) -> Iterator["LogitsProcessor"]:
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index 9d6a87cea3d07f474d6643b0f9860340891e6212..b1101b1b2318731f5286013c0b1f99391424f22a 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -11,21 +10,20 @@ from vllm.v1.sample.logits_processor import LogitsProcessors
 
 @dataclass
 class SamplingMetadata:
-
-    temperature: Optional[torch.Tensor]
+    temperature: torch.Tensor | None
     all_greedy: bool
     all_random: bool
 
-    top_p: Optional[torch.Tensor]
-    top_k: Optional[torch.Tensor]
+    top_p: torch.Tensor | None
+    top_k: torch.Tensor | None
 
     generators: dict[int, torch.Generator]
 
     # None means no logprobs, 0 means sampled token logprobs only
-    max_num_logprobs: Optional[int]
+    max_num_logprobs: int | None
 
     no_penalties: bool
-    prompt_token_ids: Optional[torch.Tensor]
+    prompt_token_ids: torch.Tensor | None
     frequency_penalties: torch.Tensor
     presence_penalties: torch.Tensor
     repetition_penalties: torch.Tensor
@@ -34,10 +32,13 @@ class SamplingMetadata:
 
     # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size,
     # vocab size).
-    allowed_token_ids_mask: Optional[torch.Tensor]
+    allowed_token_ids_mask: torch.Tensor | None
 
     # req_index -> bad_words_token_ids
     bad_words_token_ids: dict[int, list[list[int]]]
 
     # Loaded logits processors
     logitsprocs: LogitsProcessors
+
+    # Speculative token ids
+    spec_token_ids: list[list[int]] | None = None
diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py
index 1b699565f26f2c654aa223425756c6c58d05ae8c..8e2c798dd35ff1cb50d7a4bec09a0de308bf2b97 100644
--- a/vllm/v1/sample/ops/bad_words.py
+++ b/vllm/v1/sample/ops/bad_words.py
@@ -17,10 +17,7 @@ def _apply_bad_words_single_batch(
 
         prefix_length = len(bad_word_ids) - 1
         last_token_id = bad_word_ids[-1]
-        if prefix_length > 0:
-            actual_prefix = past_tokens_ids[-prefix_length:]
-        else:
-            actual_prefix = []
+        actual_prefix = past_tokens_ids[-prefix_length:] if prefix_length > 0 else []
         expected_prefix = bad_word_ids[:prefix_length]
 
         assert len(actual_prefix) == len(expected_prefix)
@@ -35,5 +32,21 @@ def apply_bad_words(
     past_tokens_ids: list[list[int]],
 ) -> None:
     for i, bad_words_ids in bad_words_token_ids.items():
-        _apply_bad_words_single_batch(logits[i], bad_words_ids,
-                                      past_tokens_ids[i])
+        _apply_bad_words_single_batch(logits[i], bad_words_ids, past_tokens_ids[i])
+
+
+def apply_bad_words_with_drafts(
+    logits: torch.Tensor,
+    bad_words_token_ids: dict[int, list[list[int]]],
+    past_tokens_ids: list[list[int]],
+    num_draft_tokens: list[int],
+) -> None:
+    start_idx = 0
+    for i, bad_words_ids in bad_words_token_ids.items():
+        for draft_idx in range(num_draft_tokens[i]):
+            _apply_bad_words_single_batch(
+                logits[start_idx + draft_idx],
+                bad_words_ids,
+                past_tokens_ids[start_idx + draft_idx],
+            )
+        start_idx += num_draft_tokens[i]
diff --git a/vllm/v1/sample/ops/logprobs.py b/vllm/v1/sample/ops/logprobs.py
index 82875b7c8452281cbf43513dc08dfefb2c549f14..cf36d46e13fdac4afab61000e476f79f4a6b712f 100644
--- a/vllm/v1/sample/ops/logprobs.py
+++ b/vllm/v1/sample/ops/logprobs.py
@@ -8,8 +8,7 @@ from vllm.platforms import current_platform
 
 
 @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
-def batched_count_greater_than(x: torch.Tensor,
-                               values: torch.Tensor) -> torch.Tensor:
+def batched_count_greater_than(x: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
     """
     Counts elements in each row of x that are greater than the corresponding
     value in values.  Use torch.compile to generate an optimized kernel for
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
index 5d54f6679a1a98d0be34052d301477b05a1e0752..241d9de957ea24fbf71c65627b6472a5ed832a53 100644
--- a/vllm/v1/sample/ops/penalties.py
+++ b/vllm/v1/sample/ops/penalties.py
@@ -4,7 +4,8 @@
 import torch
 
 from vllm.model_executor.layers.utils import apply_penalties
-from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.torch_utils import make_tensor_with_pad
 
 
 def apply_all_penalties(
@@ -19,15 +20,28 @@ def apply_all_penalties(
     Applies presence, frequency and repetition penalties to the logits.
     """
     _, vocab_size = logits.shape
-    output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size,
-                                          logits.device)
-    return apply_penalties(logits, prompt_token_ids, output_tokens_t,
-                           presence_penalties, frequency_penalties,
-                           repetition_penalties)
+    output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, logits.device)
 
+    # In the async scheduling case, rows that won't have penalties applied may contain
+    # -1 placeholder token ids. We must replace these with valid token ids so that the
+    # scatter done in apply_penalties is valid.
+    # NOTE(nick): The penalties implementation is currently quite inefficient and
+    # will be reworked anyhow.
+    output_tokens_t.masked_fill_(output_tokens_t == -1, vocab_size)
 
-def _convert_to_tensors(output_token_ids: list[list[int]], vocab_size: int,
-                        device: torch.device) -> torch.Tensor:
+    return apply_penalties(
+        logits,
+        prompt_token_ids,
+        output_tokens_t,
+        presence_penalties,
+        frequency_penalties,
+        repetition_penalties,
+    )
+
+
+def _convert_to_tensors(
+    output_token_ids: list[list[int]], vocab_size: int, device: torch.device
+) -> torch.Tensor:
     """
     Convert the different list data structures to tensors.
     """
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 5bcf1b585441f3d3d343b0c64f1f375c963a81f3..02ea658b7f20e3937a41e78c4d7a81e44945ab8c 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -1,25 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 import torch.nn as nn
 from packaging import version
 
 from vllm import envs
-from vllm.config import LogprobsMode
+from vllm.config.model import LogprobsMode
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
+from vllm.platforms import CpuArchEnum, current_platform
 
 logger = init_logger(__name__)
 
-try:
-    import flashinfer.sampling
-    is_flashinfer_available = True
-except ImportError:
-    is_flashinfer_available = False
-
 
 class TopKTopPSampler(nn.Module):
     """
@@ -34,42 +27,34 @@ class TopKTopPSampler(nn.Module):
         self.logprobs_mode = logprobs_mode
         # flashinfer optimization does not apply if intermediate
         # logprobs/logits after top_k/top_p need to be returned
-        if logprobs_mode not in ("processed_logits", "processed_logprobs"
-                                 ) and current_platform.is_cuda():
-            if is_flashinfer_available:
-                flashinfer_version = flashinfer.__version__
-                if version.parse(flashinfer_version) < version.parse("0.2.3"):
-                    logger.warning_once(
-                        "FlashInfer version >= 0.2.3 required. "
-                        "Falling back to default sampling implementation.")
-                    self.forward = self.forward_native
-                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
-                    # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
-                    # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
-                    # default it is unused). For backward compatibility, we set
-                    # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
-                    # interpret it differently in V0 and V1 samplers: In V0,
-                    # None means False, while in V1, None means True. This is
-                    # why we use the condition
-                    # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
-                    logger.info_once(
-                        "Using FlashInfer for top-p & top-k sampling.")
-                    self.forward = self.forward_cuda
-                else:
-                    logger.warning_once(
-                        "FlashInfer is available, but it is not enabled. "
-                        "Falling back to the PyTorch-native implementation of "
-                        "top-p & top-k sampling. For the best performance, "
-                        "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
-                    self.forward = self.forward_native
+        if (
+            logprobs_mode not in ("processed_logits", "processed_logprobs")
+            and current_platform.is_cuda()
+        ):
+            if envs.VLLM_USE_FLASHINFER_SAMPLER:
+                # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
+                logger.info_once(
+                    "Using FlashInfer for top-p & top-k sampling.",
+                    scope="global",
+                )
+                self.forward = self.forward_cuda
             else:
-                logger.warning_once(
-                    "FlashInfer is not available. Falling back to the PyTorch-"
-                    "native implementation of top-p & top-k sampling. For the "
-                    "best performance, please install FlashInfer.")
+                logger.debug_once(
+                    "FlashInfer top-p/top-k sampling is available but disabled "
+                    "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
+                    "after verifying accuracy for your workloads."
+                )
                 self.forward = self.forward_native
+
         elif current_platform.is_cpu():
-            self.forward = self.forward_cpu
+            arch = current_platform.get_cpu_architecture()
+            # Fall back to native implementation for POWERPC and RISCV.
+            # On PowerPC argmax produces incorrect output with torch.compile.
+            # PR: https://github.com/vllm-project/vllm/pull/26987
+            if arch in (CpuArchEnum.RISCV, CpuArchEnum.POWERPC):
+                self.forward = self.forward_native
+            else:
+                self.forward = self.forward_cpu
         else:
             self.forward = self.forward_native
 
@@ -79,9 +64,9 @@ class TopKTopPSampler(nn.Module):
         self,
         logits: torch.Tensor,
         generators: dict[int, torch.Generator],
-        k: Optional[torch.Tensor],
-        p: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """
         PyTorch-native implementation of top-k and top-p sampling.
 
@@ -100,22 +85,24 @@ class TopKTopPSampler(nn.Module):
         self,
         logits: torch.Tensor,
         generators: dict[int, torch.Generator],
-        k: Optional[torch.Tensor],
-        p: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """More optimized implementation for top-k and top-p sampling."""
         # We prefer `random_sample` over `flashinfer_sample` when sorting is
         # not needed. This is because `random_sample` does not require
         # CPU-GPU synchronization while `flashinfer_sample` does.
         if (k is None and p is None) or generators:
             if generators:
-                logger.debug_once("FlashInfer 0.2.3+ does not support "
-                                  "per-request generators. Falling back to "
-                                  "PyTorch-native implementation.")
+                logger.debug_once(
+                    "FlashInfer 0.2.3+ does not support "
+                    "per-request generators. Falling back to "
+                    "PyTorch-native implementation."
+                )
             return self.forward_native(logits, generators, k, p)
-        assert self.logprobs_mode not in (
-            "processed_logits", "processed_logprobs"
-        ), "FlashInfer does not support returning logits/logprobs"
+        assert self.logprobs_mode not in ("processed_logits", "processed_logprobs"), (
+            "FlashInfer does not support returning logits/logprobs"
+        )
         # flashinfer sampling functions expect contiguous logits.
         # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
         # because of slicing operation in logits_processor.
@@ -125,9 +112,9 @@ class TopKTopPSampler(nn.Module):
         self,
         logits: torch.Tensor,
         generators: dict[int, torch.Generator],
-        k: Optional[torch.Tensor],
-        p: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """
         PyTorch-native implementation of top-k and top-p sampling for CPU.
 
@@ -140,15 +127,6 @@ class TopKTopPSampler(nn.Module):
         elif self.logprobs_mode == "processed_logprobs":
             logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
-        # Note: this is a workaround for
-        # https://github.com/pytorch/pytorch/pull/151218
-        @torch.compile(dynamic=True)
-        def compiled_random_sample(logits: torch.Tensor) -> torch.Tensor:
-            probs = logits.softmax(dim=-1, dtype=torch.float32)
-            q = torch.empty_like(probs)
-            q.exponential_()
-            return probs.div(q).argmax(dim=-1).view(-1)
-
         if len(generators) != logits.shape[0]:
             return compiled_random_sample(logits), logits_to_return
         else:
@@ -161,10 +139,20 @@ class TopKTopPSampler(nn.Module):
             return probs.div_(q).argmax(dim=-1).view(-1), logits_to_return
 
 
+# Note: this is a workaround for
+# https://github.com/pytorch/pytorch/pull/151218
+@torch.compile(dynamic=True)
+def compiled_random_sample(logits: torch.Tensor) -> torch.Tensor:
+    probs = logits.softmax(dim=-1, dtype=torch.float32)
+    q = torch.empty_like(probs)
+    q.exponential_()
+    return probs.div(q).argmax(dim=-1).view(-1)
+
+
 def apply_top_k_top_p(
     logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
 ) -> torch.Tensor:
     """Apply top-k and top-p masks to the logits.
 
@@ -255,8 +243,8 @@ def random_sample(
 
 def flashinfer_sample(
     logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
     generators: dict[int, torch.Generator],
 ) -> torch.Tensor:
     """Sample from the logits using FlashInfer.
@@ -273,20 +261,30 @@ def flashinfer_sample(
     does not. Call this function at the end of the forward pass to minimize
     the synchronization overhead.
     """
+    import flashinfer
+
+    if version.parse(flashinfer.__version__) < version.parse("0.2.3"):
+        raise ImportError(
+            "FlashInfer version >= 0.2.3 required for top-k and top-p sampling. "
+        )
+
     assert not (k is None and p is None)
     if k is None:
         # Top-p only.
         probs = logits.softmax(dim=-1, dtype=torch.float32)
         next_token_ids = flashinfer.sampling.top_p_sampling_from_probs(
-            probs, p, deterministic=True)
+            probs, p, deterministic=True
+        )
     elif p is None:
         # Top-k only.
         probs = logits.softmax(dim=-1, dtype=torch.float32)
         next_token_ids = flashinfer.sampling.top_k_sampling_from_probs(
-            probs, k, deterministic=True)
+            probs, k, deterministic=True
+        )
     else:
         # Both top-k and top-p.
         next_token_ids = flashinfer.sampling.top_k_top_p_sampling_from_logits(
-            logits, k, p, deterministic=True)
+            logits, k, p, deterministic=True
+        )
 
     return next_token_ids.view(-1)
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 8f0b38ecb34d7fed58720948abbe3bc3c8c1f713..f31a0cddda9ae2e24ad0022d23110f244bce31b6 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -1,20 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
+from dataclasses import replace
+
+import numpy as np
 import torch
 import torch.nn as nn
 
 from vllm.logger import init_logger
 from vllm.triton_utils import tl, triton
+from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.ops.bad_words import apply_bad_words_with_drafts
+from vllm.v1.sample.ops.penalties import apply_all_penalties
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
+from vllm.v1.sample.sampler import Sampler
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 
 logger = init_logger(__name__)
 
 PLACEHOLDER_TOKEN_ID: tl.constexpr = -1
-GREEDY_TEMPERATURE: tl.constexpr = -1
+GREEDY_TEMPERATURE: tl.constexpr = 0
 # Maximum number of speculative draft tokens allowed per request in a single
 # step. This value is chosen to be large enough to handle typical use cases.
 MAX_SPEC_LEN = 128
@@ -43,18 +49,23 @@ class RejectionSampler(nn.Module):
         output tokens = accepted tokens + recovered tokens + bonus tokens
     """
 
+    def __init__(self, sampler: Sampler):
+        super().__init__()
+        self.sampler = sampler
+        logprobs_mode = self.sampler.logprobs_mode
+        self.is_processed_logprobs_mode = logprobs_mode.startswith("processed")
+        self.is_logits_logprobs_mode = logprobs_mode.endswith("logits")
+
     def forward(
         self,
         metadata: SpecDecodeMetadata,
         # [num_tokens, vocab_size]
-        draft_probs: Optional[torch.Tensor],
-        # [num_tokens, vocab_size]
-        target_logits: torch.Tensor,
-        # [batch_size, 1]
-        bonus_token_ids: torch.Tensor,
+        draft_probs: torch.Tensor | None,
+        # [num_tokens + batch_size, vocab_size]
+        logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
-    ) -> torch.Tensor:
-        '''
+    ) -> SamplerOutput:
+        """
         Args:
             metadata:
                 Metadata for spec decoding.
@@ -62,35 +73,65 @@ class RejectionSampler(nn.Module):
                 Probability distribution for the draft tokens. Shape is
                 [num_tokens, vocab_size]. Can be None if probabilities are
                 not provided, which is the case for ngram spec decode.
-            target_logits (torch.Tensor):
+            logits (torch.Tensor):
                 Target model's logits probability distribution.
-                Shape is [num_tokens, vocab_size]. Here, probabilities from
-                different requests are flattened into a single tensor because
-                this is the shape of the output logits.
-                NOTE: `target_logits` can be updated in place to save memory.
-            bonus_token_ids (torch.Tensor):
-                A tensor containing bonus tokens. Shape is [batch_size, 1].
-                Bonus tokens are added to the end of the sequence if all
-                proposed tokens are accepted. We generate the bonus tokens
-                outside of the rejection sampler with the default sampling
-                strategy. It allows for more flexibility in the sampling
-                process such as top_p, top_k sampling.
+                Shape is [num_tokens + batch_size, vocab_size]. Here,
+                probabilities from different requests are flattened into a
+                single tensor because this is the shape of the output logits.
+                NOTE: `logits` can be updated in place to save memory.
             sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
                 Additional metadata needed for sampling, such as temperature,
                 top-k/top-p parameters, or other relevant information.
         Returns:
-            output_token_ids (torch.Tensor):
-                A tensor containing the final output token IDs.
-        '''
+            SamplerOutput:
+                Contains the final output token IDs and their logprobs if
+                requested.
+        """
         assert metadata.max_spec_len <= MAX_SPEC_LEN
+
+        bonus_logits_indices = metadata.bonus_logits_indices
+        target_logits_indices = metadata.target_logits_indices
+
+        # When indexing with a tensor (bonus_logits_indices), PyTorch
+        # creates a new tensor with separate storage from the original
+        # logits tensor. This means any in-place operations on bonus_logits
+        # won't affect the original logits tensor.
+        assert logits is not None
+        bonus_logits = logits[bonus_logits_indices]
+        bonus_sampler_output = self.sampler(
+            logits=bonus_logits,
+            sampling_metadata=replace(
+                sampling_metadata,
+                max_num_logprobs=-1,
+            ),
+            predict_bonus_token=True,
+            # Override the logprobs mode to return logits because they are
+            # needed later to compute the accepted token logprobs.
+            logprobs_mode_override="processed_logits"
+            if self.is_processed_logprobs_mode
+            else "raw_logits",
+        )
+        bonus_token_ids = bonus_sampler_output.sampled_token_ids
+
+        # Just like `bonus_logits`, `target_logits` is a new tensor with
+        # separate storage from the original `logits` tensor. Therefore,
+        # it is safe to update `target_logits` in place.
+        raw_target_logits = logits[target_logits_indices]
+        # Use float32 for the target_logits.
+        raw_target_logits = raw_target_logits.to(torch.float32)
+        target_logits = self.apply_logits_processors(
+            raw_target_logits, sampling_metadata, metadata
+        )
         # [num_tokens, vocab_size]
         # NOTE(woosuk): `target_logits` can be updated in place inside the
-        # `compute_probs` function.
-        target_probs = compute_probs(
+        # `apply_sampling_constraints` function.
+        target_logits = apply_sampling_constraints(
             target_logits,
             metadata.cu_num_draft_tokens,
             sampling_metadata,
         )
+        # Compute probability distribution from target logits.
+        target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
 
         output_token_ids = rejection_sample(
             metadata.draft_token_ids,
@@ -102,34 +143,174 @@ class RejectionSampler(nn.Module):
             bonus_token_ids,
             sampling_metadata,
         )
-        return output_token_ids
+
+        logprobs_tensors = None
+        if sampling_metadata.max_num_logprobs:
+            logprobs_tensors = self._get_logprobs_tensors(
+                sampling_metadata.max_num_logprobs,
+                metadata,
+                logits,
+                target_logits if self.is_processed_logprobs_mode else raw_target_logits,
+                bonus_sampler_output.logprobs_tensors.logprobs,
+                output_token_ids,
+            )
+
+        return SamplerOutput(
+            sampled_token_ids=output_token_ids,
+            logprobs_tensors=logprobs_tensors,
+        )
+
+    def _get_logprobs_tensors(
+        self,
+        max_num_logprobs: int,
+        metadata: SpecDecodeMetadata,
+        logits: torch.Tensor,
+        target_logits: torch.Tensor,
+        bonus_logits: torch.Tensor,
+        sampled_token_ids: torch.Tensor,
+    ) -> LogprobsTensors:
+        cu_num_sampled_tokens = torch.zeros_like(metadata.cu_num_sampled_tokens)
+        cu_num_sampled_tokens[1:] = metadata.cu_num_sampled_tokens[:-1]
+
+        # Collect target and bonus logits.
+        bonus_logits_indices = metadata.bonus_logits_indices
+        target_logits_indices = metadata.target_logits_indices
+        final_logits = torch.zeros_like(logits, dtype=torch.float32)
+        final_logits[target_logits_indices] = target_logits.to(torch.float32)
+        final_logits[bonus_logits_indices] = bonus_logits.to(torch.float32)
+
+        # Compute accepted token indices.
+        accepted_mask = sampled_token_ids != PLACEHOLDER_TOKEN_ID
+        num_accepted_tokens = accepted_mask.sum(dim=-1)
+        accepted_logit_indices = accepted_mask.nonzero(as_tuple=True)[1]
+        accepted_logit_indices += cu_num_sampled_tokens.repeat_interleave(
+            num_accepted_tokens
+        )
+
+        # Compute logprobs for accepted tokens.
+        accepted_logits = final_logits[accepted_logit_indices]
+        accepted_logprobs = (
+            accepted_logits
+            if self.is_logits_logprobs_mode
+            else self.sampler.compute_logprobs(accepted_logits)
+        )
+        accepted_tokens = sampled_token_ids[accepted_mask]
+        return self.sampler.gather_logprobs(
+            accepted_logprobs,
+            max_num_logprobs,
+            accepted_tokens.to(torch.int64),
+        )
 
     @staticmethod
     def parse_output(
         output_token_ids: torch.Tensor,
         vocab_size: int,
-    ) -> list[list[int]]:
+    ) -> list[np.ndarray]:
         """Parse the output of the rejection sampler.
-
         Args:
             output_token_ids: The sampled token IDs in shape
                 [batch_size, max_spec_len + 1]. The rejected tokens are
                 replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
                 and will be filtered out in this function.
             vocab_size: The size of the vocabulary.
-
         Returns:
             A list of lists of token IDs.
         """
         output_token_ids_np = output_token_ids.cpu().numpy()
         # Create mask for valid tokens.
-        valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) &
-                      (output_token_ids_np < vocab_size))
-        outputs = [
-            row[valid_mask[i]].tolist()
-            for i, row in enumerate(output_token_ids_np)
-        ]
-        return outputs
+        valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (
+            output_token_ids_np < vocab_size
+        )
+        return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)]
+
+    def apply_logits_processors(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+        metadata: SpecDecodeMetadata,
+    ) -> torch.Tensor:
+        has_penalties = not sampling_metadata.no_penalties
+        any_penalties_or_bad_words = (
+            sampling_metadata.bad_words_token_ids or has_penalties
+        )
+
+        output_token_ids = sampling_metadata.output_token_ids
+        if any_penalties_or_bad_words:
+            output_token_ids = self._combine_outputs_with_spec_tokens(
+                output_token_ids,
+                sampling_metadata.spec_token_ids,
+            )
+
+        # Calculate indices of target logits.
+        if sampling_metadata.allowed_token_ids_mask is not None or has_penalties:
+            num_requests = len(sampling_metadata.output_token_ids)
+            num_draft_tokens = torch.tensor(metadata.num_draft_tokens, device="cpu")
+            original_indices = torch.arange(num_requests, device="cpu")
+            repeat_indices_cpu = original_indices.repeat_interleave(num_draft_tokens)
+            repeat_indices = repeat_indices_cpu.to(
+                device=logits.device, non_blocking=True
+            )
+            logits = self.apply_penalties(
+                logits, sampling_metadata, metadata, repeat_indices, output_token_ids
+            )
+
+            # Apply allowed token ids.
+            if sampling_metadata.allowed_token_ids_mask is not None:
+                token_mask = sampling_metadata.allowed_token_ids_mask[repeat_indices]
+                logits.masked_fill_(token_mask, float("-inf"))
+
+        # Apply bad words exclusion.
+        if bad_words_token_ids := sampling_metadata.bad_words_token_ids:
+            apply_bad_words_with_drafts(
+                logits, bad_words_token_ids, output_token_ids, metadata.num_draft_tokens
+            )
+
+        return logits
+
+    @staticmethod
+    def apply_penalties(
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+        metadata: SpecDecodeMetadata,
+        repeat_indices: torch.Tensor,
+        output_token_ids: list[list[int]],
+    ) -> torch.Tensor:
+        if sampling_metadata.no_penalties:
+            return logits
+
+        assert sampling_metadata.prompt_token_ids is not None
+
+        prompt_token_ids = sampling_metadata.prompt_token_ids[repeat_indices]
+        presence_penalties = sampling_metadata.presence_penalties[repeat_indices]
+        frequency_penalties = sampling_metadata.frequency_penalties[repeat_indices]
+        repetition_penalties = sampling_metadata.repetition_penalties[repeat_indices]
+
+        logits = apply_all_penalties(
+            logits,
+            prompt_token_ids,
+            presence_penalties,
+            frequency_penalties,
+            repetition_penalties,
+            output_token_ids,
+        )
+        return logits
+
+    @staticmethod
+    def _combine_outputs_with_spec_tokens(
+        output_token_ids: list[list[int]],
+        spec_token_ids: list[list[int]] | None = None,
+    ) -> list[list[int]]:
+        if spec_token_ids is None:
+            return output_token_ids
+
+        result = []
+        for out, spec in zip(output_token_ids, spec_token_ids):
+            if len(spec) == 0:
+                continue
+            result.append(out)
+            for i in range(len(spec) - 1):
+                result.append([*result[-1], spec[i]])
+        return result
 
 
 def rejection_sample(
@@ -141,7 +322,7 @@ def rejection_sample(
     # [batch_size]
     cu_num_draft_tokens: torch.Tensor,
     # [num_tokens, vocab_size]
-    draft_probs: Optional[torch.Tensor],
+    draft_probs: torch.Tensor | None,
     # [num_tokens, vocab_size]
     target_probs: torch.Tensor,
     # [batch_size, 1]
@@ -164,12 +345,12 @@ def rejection_sample(
     assert target_probs.shape == (num_tokens, vocab_size)
 
     # Create output buffer.
-    output_token_ids = torch.empty(
+    output_token_ids = torch.full(
         (batch_size, max_spec_len + 1),
+        PLACEHOLDER_TOKEN_ID,
         dtype=torch.int32,  # Consistent with SamplerOutput.sampled_token_ids.
         device=device,
     )
-    output_token_ids.fill_(PLACEHOLDER_TOKEN_ID)
 
     if sampling_metadata.all_greedy:
         is_greedy = None
@@ -178,7 +359,7 @@ def rejection_sample(
     if not sampling_metadata.all_random:
         # Rejection sampling for greedy sampling requests.
         target_argmax = target_probs.argmax(dim=-1)
-        rejection_greedy_sample_kernel[(batch_size, )](
+        rejection_greedy_sample_kernel[(batch_size,)](
             output_token_ids,
             cu_num_draft_tokens,
             draft_token_ids,
@@ -186,7 +367,6 @@ def rejection_sample(
             bonus_token_ids,
             is_greedy,
             max_spec_len,
-            num_warps=1,
         )
         if sampling_metadata.all_greedy:
             return output_token_ids
@@ -214,7 +394,7 @@ def rejection_sample(
     )
 
     # Rejection sampling for random sampling requests.
-    rejection_random_sample_kernel[(batch_size, )](
+    rejection_random_sample_kernel[(batch_size,)](
         output_token_ids,
         cu_num_draft_tokens,
         draft_token_ids,
@@ -227,32 +407,30 @@ def rejection_sample(
         max_spec_len,
         vocab_size,
         NO_DRAFT_PROBS=draft_probs is None,
-        num_warps=1,
     )
     return output_token_ids
 
 
-def compute_probs(
+def apply_sampling_constraints(
     logits: torch.Tensor,  # [num_tokens, vocab_size]
     cu_num_draft_tokens: torch.Tensor,  # [batch_size]
     sampling_metadata: SamplingMetadata,
 ) -> torch.Tensor:
-    """Compute probability distribution from logits based on sampling metadata.
+    """Process logits based on sampling metadata.
 
-    This function applies temperature scaling to the logits and converts
-    them to probabilities using softmax. For greedy decoding, it returns
+    This function applies temperature scaling to the logits,
+    as well as top-k and top-p. For greedy decoding, it returns
     the original logits.
 
     Args:
-        logits: Input logits tensor to be converted to probabilities.
+        logits: Input logits tensor to be processed.
         cu_num_draft_tokens: Cumulative number of draft tokens.
         sampling_metadata: Metadata containing sampling parameters such as
             temperature and whether greedy sampling is used.
 
     Returns:
-        torch.Tensor: Probability distribution (softmax of scaled logits)
-            if non-greedy sampling is used, otherwise returns the
-            original logits.
+        torch.Tensor: Processed logits if non-greedy sampling is used,
+        otherwise returns the original logits.
     """
     assert logits.ndim == 2
     assert cu_num_draft_tokens.ndim == 1
@@ -288,9 +466,7 @@ def compute_probs(
 
     # NOTE(woosuk): `apply_top_k_top_p` uses sorting to calculate the mask,
     # which is slow for large vocab sizes. This may cause performance issues.
-    logits = apply_top_k_top_p(logits, top_k, top_p)
-    output_prob = logits.softmax(dim=-1, dtype=torch.float32)
-    return output_prob
+    return apply_top_k_top_p(logits, top_k, top_p)
 
 
 def expand_batch_to_tokens(
@@ -322,14 +498,13 @@ def expand_batch_to_tokens(
     batch_size = x.shape[0]
     assert cu_num_tokens.shape[0] == batch_size
     expanded_x = x.new_empty(num_tokens)
-    expand_kernel[(batch_size, )](
+    expand_kernel[(batch_size,)](
         expanded_x,
         x,
         cu_num_tokens,
         replace_from,
         replace_to,
         MAX_NUM_TOKENS=MAX_SPEC_LEN,  # To avoid recompilation.
-        num_warps=1,
     )
     return expanded_x
 
@@ -371,7 +546,7 @@ def generate_uniform_probs(
     # https://github.com/pytorch/pytorch/issues/16706. Using float64
     # mitigates the issue.
     uniform_probs = torch.rand(
-        (num_tokens, ),
+        (num_tokens,),
         dtype=torch.float64,
         device=device,
     )
@@ -397,7 +572,7 @@ def sample_recovered_tokens(
     # [num_tokens]
     draft_token_ids: torch.Tensor,
     # [num_tokens, vocab_size]
-    draft_probs: Optional[torch.Tensor],
+    draft_probs: torch.Tensor | None,
     # [num_tokens, vocab_size]
     target_probs: torch.Tensor,
     sampling_metadata: SamplingMetadata,
@@ -447,18 +622,12 @@ def rejection_greedy_sample_kernel(
     req_idx = tl.program_id(0)
     # FIXME(woosuk): Because is_greedy_ptr is not None at profiling run,
     # re-compilation may happen during runtime when is_greedy_ptr is None.
-    if is_greedy_ptr is None:
-        is_greedy = True
-    else:
-        is_greedy = tl.load(is_greedy_ptr + req_idx)
+    is_greedy = True if is_greedy_ptr is None else tl.load(is_greedy_ptr + req_idx)
     if not is_greedy:
         # Early exit for non-greedy sampling requests.
         return
 
-    if req_idx == 0:
-        start_idx = 0
-    else:
-        start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1)
+    start_idx = 0 if req_idx == 0 else tl.load(cu_num_draft_tokens_ptr + req_idx - 1)
     end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx)
     num_draft_tokens = end_idx - start_idx
 
@@ -467,8 +636,10 @@ def rejection_greedy_sample_kernel(
         if not rejected:
             draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos)
             target_argmax_id = tl.load(target_argmax_ptr + start_idx + pos)
-            tl.store(output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos,
-                     target_argmax_id)
+            tl.store(
+                output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos,
+                target_argmax_id,
+            )
             if draft_token_id != target_argmax_id:
                 # Reject.
                 rejected = True
@@ -477,8 +648,9 @@ def rejection_greedy_sample_kernel(
         # If all tokens are accepted, append the bonus token.
         bonus_token_id = tl.load(bonus_token_ids_ptr + req_idx)
         tl.store(
-            output_token_ids_ptr + req_idx * (max_spec_len + 1) +
-            num_draft_tokens, bonus_token_id)
+            output_token_ids_ptr + req_idx * (max_spec_len + 1) + num_draft_tokens,
+            bonus_token_id,
+        )
 
 
 # NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation.
@@ -503,10 +675,7 @@ def rejection_random_sample_kernel(
         # Early exit for greedy sampling requests.
         return
 
-    if req_idx == 0:
-        start_idx = 0
-    else:
-        start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1)
+    start_idx = 0 if req_idx == 0 else tl.load(cu_num_draft_tokens_ptr + req_idx - 1)
     end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx)
     num_draft_tokens = end_idx - start_idx
 
@@ -517,12 +686,12 @@ def rejection_random_sample_kernel(
             if NO_DRAFT_PROBS:
                 draft_prob = 1
             else:
-                draft_prob = tl.load(draft_probs_ptr +
-                                     (start_idx + pos) * vocab_size +
-                                     draft_token_id)
-            target_prob = tl.load(target_probs_ptr +
-                                  (start_idx + pos) * vocab_size +
-                                  draft_token_id)
+                draft_prob = tl.load(
+                    draft_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id
+                )
+            target_prob = tl.load(
+                target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id
+            )
             uniform_prob = tl.load(uniform_probs_ptr + start_idx + pos)
             # NOTE(woosuk): While the draft probability should never be 0,
             # we check it to avoid NaNs. If it happens to be 0, we reject.
@@ -533,15 +702,17 @@ def rejection_random_sample_kernel(
                 # Reject. Use recovered token.
                 rejected = True
                 token_id = tl.load(recovered_token_ids_ptr + start_idx + pos)
-            tl.store(output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos,
-                     token_id)
+            tl.store(
+                output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos, token_id
+            )
 
     if not rejected:
         # If all tokens are accepted, append the bonus token.
         bonus_token_id = tl.load(bonus_token_ids_ptr + req_idx)
         tl.store(
-            output_token_ids_ptr + req_idx * (max_spec_len + 1) +
-            num_draft_tokens, bonus_token_id)
+            output_token_ids_ptr + req_idx * (max_spec_len + 1) + num_draft_tokens,
+            bonus_token_id,
+        )
 
 
 # NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation.
@@ -565,9 +736,7 @@ def expand_kernel(
     src_val = tl.load(input_ptr + req_idx)
     src_val = tl.where(src_val == replace_from, replace_to, src_val)
     offset = tl.arange(0, MAX_NUM_TOKENS)
-    tl.store(output_ptr + start_idx + offset,
-             src_val,
-             mask=offset < num_tokens)
+    tl.store(output_ptr + start_idx + offset, src_val, mask=offset < num_tokens)
 
 
 @triton.jit
@@ -583,10 +752,7 @@ def sample_recovered_tokens_kernel(
     NO_DRAFT_PROBS: tl.constexpr,
 ):
     req_idx = tl.program_id(0)
-    if req_idx == 0:
-        start_idx = 0
-    else:
-        start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1)
+    start_idx = 0 if req_idx == 0 else tl.load(cu_num_draft_tokens_ptr + req_idx - 1)
     end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx)
     num_draft_tokens = end_idx - start_idx
 
@@ -598,26 +764,30 @@ def sample_recovered_tokens_kernel(
     vocab_offset = tl.arange(0, PADDED_VOCAB_SIZE)
     if NO_DRAFT_PROBS:
         draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos)
-        prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size +
-                       vocab_offset,
-                       mask=((vocab_offset < vocab_size) &
-                             (vocab_offset != draft_token_id)),
-                       other=0)
+        prob = tl.load(
+            target_probs_ptr + (start_idx + pos) * vocab_size + vocab_offset,
+            mask=((vocab_offset < vocab_size) & (vocab_offset != draft_token_id)),
+            other=0,
+        )
     else:
-        draft_prob = tl.load(draft_probs_ptr + (start_idx + pos) * vocab_size +
-                             vocab_offset,
-                             mask=vocab_offset < vocab_size,
-                             other=0)
-        target_prob = tl.load(target_probs_ptr +
-                              (start_idx + pos) * vocab_size + vocab_offset,
-                              mask=vocab_offset < vocab_size,
-                              other=0)
+        draft_prob = tl.load(
+            draft_probs_ptr + (start_idx + pos) * vocab_size + vocab_offset,
+            mask=vocab_offset < vocab_size,
+            other=0,
+        )
+        target_prob = tl.load(
+            target_probs_ptr + (start_idx + pos) * vocab_size + vocab_offset,
+            mask=vocab_offset < vocab_size,
+            other=0,
+        )
         prob = tl.maximum(target_prob - draft_prob, 0)
         # NOTE(woosuk): We don't need `prob = prob / tl.sum(prob)` here because
         # `tl.argmax` will select the maximum value.
 
-    q = tl.load(q_ptr + req_idx * vocab_size + vocab_offset,
-                mask=vocab_offset < vocab_size,
-                other=float("-inf"))
+    q = tl.load(
+        q_ptr + req_idx * vocab_size + vocab_offset,
+        mask=vocab_offset < vocab_size,
+        other=float("-inf"),
+    )
     recovered_id = tl.argmax(prob / q, axis=-1)
     tl.store(output_token_ids_ptr + start_idx + pos, recovered_id)
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 83ea766b1b4ada4c4df54a1dd4729e713f97c970..39c63fe31ad2c08be48b123e47bf0f1b8fe7decb 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -2,13 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that samples the next tokens from the model's outputs."""
 
-from typing import Optional
-
 import torch
 import torch.nn as nn
 
-from vllm.config import LogprobsMode
-from vllm.utils import is_pin_memory_available
+from vllm.config.model import LogprobsMode
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words
@@ -24,39 +22,39 @@ class Sampler(nn.Module):
     A layer that samples the next tokens from the model's outputs
     with the following steps in order:
 
-    1. If logprobs are requested:  
+    1. If logprobs are requested:
         a) If `logprobs_mode` is `raw_logprobs`, compute logprobs
-           as the final logprobs to return.  
+           as the final logprobs to return.
         b) If `logprobs_mode` is `raw_logits`, clone the logits
-           as the final logprobs to return.  
-    2. Convert logits to float32.  
-    3. Apply allowed token ids whitelist.  
-    4. Apply bad words exclusion.  
+           as the final logprobs to return.
+    2. Convert logits to float32.
+    3. Apply allowed token ids whitelist.
+    4. Apply bad words exclusion.
     5. Apply logit processors which are not argmax-invariant,
-       i.e. that can impact greedy sampling.  
-        a) Min tokens processor  
-        b) Logit bias processor  
-    6. Apply penalties  
-        a) Repetition penalty  
-        b) Frequency penalty  
-        c) Presence penalty  
-    7. Sample the next tokens. `sample` method performs the following steps:  
+       i.e. that can impact greedy sampling.
+        a) Min tokens processor
+        b) Logit bias processor
+    6. Apply penalties
+        a) Repetition penalty
+        b) Frequency penalty
+        c) Presence penalty
+    7. Sample the next tokens. `sample` method performs the following steps:
         a) If not `all_random`, perform greedy sampling. If `all_greedy`,
-           return the greedily sampled tokens and final logprobs if requested.  
-        b) Apply temperature.  
+           return the greedily sampled tokens and final logprobs if requested.
+        b) Apply temperature.
         c) Apply logit processors which are argmax-invariant, by default
-           the min_p processor.  
-        d) Apply top_k and/or top_p.  
-        e) Sample the next tokens with the probability distribution.  
+           the min_p processor.
+        d) Apply top_k and/or top_p.
+        e) Sample the next tokens with the probability distribution.
         f) If `all_random` or temperature >= epsilon (1e-5), return the
            randomly sampled tokens and final logprobs if requested. Else,
-           return the greedily sampled tokens and logprobs if requested.  
+           return the greedily sampled tokens and logprobs if requested.
     8. Gather the logprobs of the top `max_num_logprobs` and sampled token
        (if requested). Note that if the sampled token is within the top
        `max_num_logprobs`, the logprob will be eventually merged in
        `LogprobsProcessor` during output processing. Therefore, the
        final output may contain either `max_num_logprobs + 1` or
-       `max_num_logprobs` logprobs.  
+       `max_num_logprobs` logprobs.
     9. Return the final `SamplerOutput`.
     """
 
@@ -70,32 +68,27 @@ class Sampler(nn.Module):
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
+        predict_bonus_token: bool = False,
+        logprobs_mode_override: LogprobsMode | None = None,
     ) -> SamplerOutput:
+        logprobs_mode = logprobs_mode_override or self.logprobs_mode
         # NOTE(woosuk): Use the original logits (before any penalties or
         # temperature scaling) for the top-k logprobs.
         # This is different from the V0 sampler, which uses the logits that
         # is used for sampling (after penalties and temperature scaling).
         num_logprobs = sampling_metadata.max_num_logprobs
         if num_logprobs is not None:
-            if self.logprobs_mode == "raw_logprobs":
+            if logprobs_mode == "raw_logprobs":
                 raw_logprobs = self.compute_logprobs(logits)
-            elif self.logprobs_mode == "raw_logits":
+            elif logprobs_mode == "raw_logits":
                 raw_logprobs = logits.clone()
 
         # Use float32 for the logits.
         logits = logits.to(torch.float32)
-        # Apply allowed token ids.
-        logits = self.apply_allowed_token_ids(logits, sampling_metadata)
-        # Apply bad words exclusion.
-        logits = self.apply_bad_words(logits, sampling_metadata)
-
-        # Apply logits processors which can impact greedy sampling
-        for processor in sampling_metadata.logitsprocs.non_argmax_invariant:
-            logits = processor.apply(logits)
-
-        # Apply penalties (e.g., min_tokens, freq_penalties).
-        logits = self.apply_penalties(logits, sampling_metadata)
 
+        logits = self.apply_logits_processors(
+            logits, sampling_metadata, predict_bonus_token
+        )
         # Sample the next token.
         sampled, processed_logprobs = self.sample(logits, sampling_metadata)
         if processed_logprobs is not None:
@@ -106,10 +99,18 @@ class Sampler(nn.Module):
         # return int32 (while PyTorch argmax and topk return int64).
         sampled = sampled.long()
 
-        # Gather the logprobs of the topk and sampled token (if requested).
-        # Get logprobs and rank tensors (if requested)
-        logprobs_tensors = None if num_logprobs is None else \
-            self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled)
+        if num_logprobs is None:
+            logprobs_tensors = None
+        elif num_logprobs == -1:
+            # Return the full unsorted and unranked logprobs.
+            logprobs_tensors = LogprobsTensors(
+                torch.empty(0), raw_logprobs, torch.empty(0)
+            )
+        else:
+            # Gather the logprobs and ranks of the topk and sampled token.
+            logprobs_tensors = self.gather_logprobs(
+                raw_logprobs, num_logprobs, token_ids=sampled
+            )
 
         # Use int32 to reduce the tensor size.
         sampled = sampled.to(torch.int32)
@@ -124,8 +125,8 @@ class Sampler(nn.Module):
         )
         return sampler_output
 
+    @staticmethod
     def apply_temperature(
-        self,
         logits: torch.Tensor,
         temp: torch.Tensor,
         all_random: bool,
@@ -136,22 +137,24 @@ class Sampler(nn.Module):
             temp = torch.where(temp < _SAMPLING_EPS, 1.0, temp)
         return logits.div_(temp.unsqueeze(dim=1))
 
-    def greedy_sample(self, logits: torch.Tensor) -> torch.Tensor:
+    @staticmethod
+    def greedy_sample(logits: torch.Tensor) -> torch.Tensor:
         return logits.argmax(dim=-1).view(-1)
 
     def sample(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        logprobs_mode_override: LogprobsMode | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """Sample logits based on sampling metadata.
 
         The various logits processing functions called in this method
         may update the logits tensor in-place.
         """
 
-        assert not (sampling_metadata.all_greedy
-                    and sampling_metadata.all_random)
+        logprobs_mode = logprobs_mode_override or self.logprobs_mode
+        assert not (sampling_metadata.all_greedy and sampling_metadata.all_random)
         if sampling_metadata.all_random:
             greedy_sampled = None
         else:
@@ -159,17 +162,18 @@ class Sampler(nn.Module):
             if sampling_metadata.all_greedy:
                 processed_logprobs = None
                 if sampling_metadata.max_num_logprobs is not None:
-                    if self.logprobs_mode == "processed_logits":
+                    if logprobs_mode == "processed_logits":
                         processed_logprobs = logits
-                    elif self.logprobs_mode == "processed_logprobs":
+                    elif logprobs_mode == "processed_logprobs":
                         processed_logprobs = self.compute_logprobs(logits)
                 return greedy_sampled, processed_logprobs
 
         assert sampling_metadata.temperature is not None
 
         # Apply temperature.
-        logits = self.apply_temperature(logits, sampling_metadata.temperature,
-                                        sampling_metadata.all_random)
+        logits = self.apply_temperature(
+            logits, sampling_metadata.temperature, sampling_metadata.all_random
+        )
 
         # Apply logits processors that only apply to random sampling
         # (argmax invariant)
@@ -195,11 +199,12 @@ class Sampler(nn.Module):
         )
         return sampled, processed_logprobs
 
-    def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
+    @staticmethod
+    def compute_logprobs(logits: torch.Tensor) -> torch.Tensor:
         return logits.log_softmax(dim=-1, dtype=torch.float32)
 
+    @staticmethod
     def gather_logprobs(
-        self,
         logprobs: torch.Tensor,
         num_logprobs: int,
         token_ids: torch.Tensor,
@@ -224,9 +229,7 @@ class Sampler(nn.Module):
         """
         assert token_ids.dtype == torch.int64
         # Find the topK values.
-        topk_logprobs, topk_indices = torch.topk(logprobs,
-                                                 num_logprobs,
-                                                 dim=-1)
+        topk_logprobs, topk_indices = torch.topk(logprobs, num_logprobs, dim=-1)
 
         # Get with the logprob of the prompt or sampled token.
         token_ids = token_ids.unsqueeze(-1)
@@ -244,42 +247,70 @@ class Sampler(nn.Module):
 
         return LogprobsTensors(indices, logprobs, token_ranks)
 
-    def apply_penalties(
+    @staticmethod
+    def _combine_outputs_with_spec_tokens(
+        output_token_ids: list[list[int]],
+        spec_token_ids: list[list[int]] | None = None,
+    ) -> list[list[int]]:
+        if spec_token_ids is None:
+            return output_token_ids
+
+        return [
+            [*out, *spec] if spec else out
+            for out, spec in zip(output_token_ids, spec_token_ids)
+        ]
+
+    def apply_logits_processors(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
+        predict_bonus_token: bool,
     ) -> torch.Tensor:
-        if not sampling_metadata.no_penalties:
-            assert sampling_metadata.prompt_token_ids is not None
-            logits = apply_all_penalties(
-                logits,
-                sampling_metadata.prompt_token_ids,
-                sampling_metadata.presence_penalties,
-                sampling_metadata.frequency_penalties,
-                sampling_metadata.repetition_penalties,
-                sampling_metadata.output_token_ids,
+        bad_words_token_ids = sampling_metadata.bad_words_token_ids
+        any_penalties_or_bad_words = (
+            bool(bad_words_token_ids) or not sampling_metadata.no_penalties
+        )
+
+        output_token_ids = sampling_metadata.output_token_ids
+        if predict_bonus_token and any_penalties_or_bad_words:
+            # Combine base outputs with spec tokens when speculative decoding
+            # is enabled.
+            output_token_ids = self._combine_outputs_with_spec_tokens(
+                output_token_ids,
+                sampling_metadata.spec_token_ids,
             )
-        return logits
 
-    def apply_allowed_token_ids(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> torch.Tensor:
+        # Apply allowed token ids.
         if sampling_metadata.allowed_token_ids_mask is not None:
-            logits.masked_fill_(sampling_metadata.allowed_token_ids_mask,
-                                float("-inf"))
+            logits.masked_fill_(sampling_metadata.allowed_token_ids_mask, float("-inf"))
+
+        # Apply bad words exclusion.
+        if bad_words_token_ids:
+            apply_bad_words(logits, bad_words_token_ids, output_token_ids)
+
+        # Apply logits processors which can impact greedy sampling.
+        for processor in sampling_metadata.logitsprocs.non_argmax_invariant:
+            logits = processor.apply(logits)
+
+        # Apply penalties (e.g., freq_penalties).
+        logits = self.apply_penalties(logits, sampling_metadata, output_token_ids)
         return logits
 
-    def apply_bad_words(
-        self,
+    @staticmethod
+    def apply_penalties(
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
+        output_token_ids: list[list[int]],
     ) -> torch.Tensor:
-        if sampling_metadata.bad_words_token_ids:
-            apply_bad_words(
-                logits,
-                sampling_metadata.bad_words_token_ids,
-                sampling_metadata.output_token_ids,
-            )
-        return logits
+        if sampling_metadata.no_penalties:
+            return logits
+
+        assert sampling_metadata.prompt_token_ids is not None
+        return apply_all_penalties(
+            logits,
+            sampling_metadata.prompt_token_ids,
+            sampling_metadata.presence_penalties,
+            sampling_metadata.frequency_penalties,
+            sampling_metadata.repetition_penalties,
+            output_token_ids,
+        )
diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py
index 6491c84f607625cacaff9cd08341775e33a98e60..0c1a22e84eceaade70d16dc4b0bc145bc84fac7c 100644
--- a/vllm/v1/sample/tpu/metadata.py
+++ b/vllm/v1/sample/tpu/metadata.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
-from typing import Optional
 
 import torch
 
@@ -31,6 +30,7 @@ class TPUSupportedSamplingMetadata:
     top_p: torch.Tensor = None
 
     all_greedy: bool = True
+    all_random: bool = False
 
     # Whether logprobs are to be gathered in this batch of request. To balance
     # out compile time and runtime, a fixed `max_number_logprobs` value is used
@@ -48,15 +48,13 @@ class TPUSupportedSamplingMetadata:
 
     min_tokens = None  # impl is not vectorized
 
-    logit_bias: list[Optional[dict[int, float]]] = field(
-        default_factory=lambda: list())
+    logit_bias: list[dict[int, float] | None] = field(default_factory=lambda: list())
 
     allowed_token_ids_mask = None
     bad_words_token_ids = None
 
     # Generator not supported by xla
-    _generators: dict[int,
-                      torch.Generator] = field(default_factory=lambda: dict())
+    _generators: dict[int, torch.Generator] = field(default_factory=lambda: dict())
 
     @property
     def generators(self) -> dict[int, torch.Generator]:
@@ -69,13 +67,13 @@ class TPUSupportedSamplingMetadata:
         input_batch: InputBatch,
         padded_num_reqs: int,
         xla_device: torch.device,
-        generate_params_if_all_greedy: bool = False
+        generate_params_if_all_greedy: bool = False,
     ) -> "TPUSupportedSamplingMetadata":
         """
         Copy sampling tensors slices from `input_batch` to on device tensors.
 
-        `InputBatch._make_sampling_metadata` causes recompilation on XLA as it 
-        slices dynamic shapes on device tensors. This impl moves the dynamic 
+        `InputBatch._make_sampling_metadata` causes recompilation on XLA as it
+        slices dynamic shapes on device tensors. This impl moves the dynamic
         ops to CPU and produces tensors of fixed `padded_num_reqs` size.
 
         Args:
@@ -87,11 +85,11 @@ class TPUSupportedSamplingMetadata:
                 we want to pre-compile a graph with sampling parameters, even if
                 they are not strictly needed for greedy decoding.
         """
-        needs_logprobs = input_batch.max_num_logprobs>0 if \
-            input_batch.max_num_logprobs else False
+        needs_logprobs = (
+            input_batch.max_num_logprobs > 0 if input_batch.max_num_logprobs else False
+        )
         # Early return to avoid unnecessary cpu to tpu copy
-        if (input_batch.all_greedy is True
-                and generate_params_if_all_greedy is False):
+        if input_batch.all_greedy is True and generate_params_if_all_greedy is False:
             return cls(all_greedy=True, logprobs=needs_logprobs)
 
         num_reqs = input_batch.num_reqs
@@ -100,25 +98,23 @@ class TPUSupportedSamplingMetadata:
             # Pad value is the default one.
             cpu_tensor[num_reqs:padded_num_reqs] = fill_val
 
-        fill_slice(input_batch.temperature_cpu_tensor,
-                   DEFAULT_SAMPLING_PARAMS["temperature"])
-        fill_slice(input_batch.min_p_cpu_tensor,
-                   DEFAULT_SAMPLING_PARAMS["min_p"])
-        fill_slice(input_batch.top_k_cpu_tensor,
-                   DEFAULT_SAMPLING_PARAMS["top_k"])
-        fill_slice(input_batch.top_p_cpu_tensor,
-                   DEFAULT_SAMPLING_PARAMS["top_p"])
+        fill_slice(
+            input_batch.temperature_cpu_tensor, DEFAULT_SAMPLING_PARAMS["temperature"]
+        )
+        fill_slice(input_batch.min_p_cpu_tensor, DEFAULT_SAMPLING_PARAMS["min_p"])
+        fill_slice(input_batch.top_k_cpu_tensor, DEFAULT_SAMPLING_PARAMS["top_k"])
+        fill_slice(input_batch.top_p_cpu_tensor, DEFAULT_SAMPLING_PARAMS["top_p"])
 
         # Slice persistent device tensors to a fixed pre-compiled padded shape.
         return cls(
-            temperature=input_batch.temperature_cpu_tensor[:padded_num_reqs].
-            to(xla_device),
+            temperature=input_batch.temperature_cpu_tensor[:padded_num_reqs].to(
+                xla_device
+            ),
             all_greedy=input_batch.all_greedy,
+            all_random=input_batch.all_random,
             # TODO enable more and avoid returning None values
-            top_p=input_batch.top_p_cpu_tensor[:padded_num_reqs].to(
-                xla_device),
-            top_k=input_batch.top_k_cpu_tensor[:padded_num_reqs].to(
-                xla_device),
-            min_p=input_batch.min_p_cpu_tensor[:padded_num_reqs].to(
-                xla_device),
-            logprobs=needs_logprobs)
+            top_p=input_batch.top_p_cpu_tensor[:padded_num_reqs].to(xla_device),
+            top_k=input_batch.top_k_cpu_tensor[:padded_num_reqs].to(xla_device),
+            min_p=input_batch.min_p_cpu_tensor[:padded_num_reqs].to(xla_device),
+            logprobs=needs_logprobs,
+        )
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index 17b83a4ba074c706b7f17fb326575ed2facee28e..8f0463c76ce156ed0697073ca416d8218e8fef34 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampler layer implementing TPU supported operations."""
 
-from typing import Optional
-
 import torch
 import torch.nn as nn
 
@@ -14,7 +12,6 @@ _SAMPLING_EPS = 1e-5
 
 
 class Sampler(nn.Module):
-
     def __init__(self):
         # TODO(houseroad): Add support for logprobs_mode.
         super().__init__()
@@ -35,14 +32,19 @@ class Sampler(nn.Module):
             # [num_requests, 1], where each row represents one generated
             # token per request.
             sampled_token_ids=sampled.unsqueeze(-1),
-            logprobs_tensors=None)
+            logprobs_tensors=None,
+        )
         return sampler_output
 
     def apply_temperature(
         self,
         logits: torch.Tensor,
         temp: torch.Tensor,
+        all_random: bool = False,
     ) -> torch.Tensor:
+        # Avoid division by zero for greedy sampling (temperature ~ 0.0).
+        if not all_random:
+            temp = torch.where(temp < _SAMPLING_EPS, 1.0, temp)
         return logits.div_(temp.unsqueeze(dim=1))
 
     def greedy_sample(self, logits: torch.Tensor) -> torch.Tensor:
@@ -58,7 +60,9 @@ class Sampler(nn.Module):
         assert sampling_metadata.temperature is not None
 
         # Apply temperature.
-        logits = self.apply_temperature(logits, sampling_metadata.temperature)
+        logits = self.apply_temperature(
+            logits, sampling_metadata.temperature, sampling_metadata.all_random
+        )
 
         # Apply min_p.
         if sampling_metadata.min_p is not None:
@@ -73,11 +77,13 @@ class Sampler(nn.Module):
 
         # Random sample.
         probs = logits.softmax(dim=-1, dtype=torch.float32)
-        random_sampled = self.random_sample(probs,
-                                            sampling_metadata.generators)
+        random_sampled = self.random_sample(probs, sampling_metadata.generators)
 
-        sampled = torch.where(sampling_metadata.temperature < _SAMPLING_EPS,
-                              greedy_sampled, random_sampled)
+        sampled = torch.where(
+            sampling_metadata.temperature < _SAMPLING_EPS,
+            greedy_sampled,
+            random_sampled,
+        )
         return sampled
 
     def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
@@ -107,9 +113,7 @@ class Sampler(nn.Module):
           Sampled token rank tensor, (num tokens)
         """
         # Find the topK values.
-        topk_logprobs, topk_indices = torch.topk(logprobs,
-                                                 num_logprobs,
-                                                 dim=-1)
+        topk_logprobs, topk_indices = torch.topk(logprobs, num_logprobs, dim=-1)
 
         # Get with the logprob of the prompt or sampled token.
         token_ids = token_ids.unsqueeze(-1)
@@ -138,9 +142,7 @@ class Sampler(nn.Module):
         # Convert logits to probability distribution
         probability_values = torch.nn.functional.softmax(logits, dim=-1)
         # Calculate maximum probabilities per sequence
-        max_probabilities = torch.amax(probability_values,
-                                       dim=-1,
-                                       keepdim=True)
+        max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
         # Reshape min_p for broadcasting
         adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
         # Identify valid tokens using threshold comparison
@@ -168,8 +170,8 @@ class Sampler(nn.Module):
 
 def apply_top_k_top_p(
     logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
 ) -> torch.Tensor:
     """
     Apply top-k and top-p optimized for TPU.
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 876838084b9aa0c61cf5ddb88ccbdb973e2c41ed..0a6806390451daa30bddc4c8a129ac7e72d6663d 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -4,10 +4,11 @@
 import dataclasses
 import importlib
 import pickle
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
+from functools import partial
 from inspect import isclass
 from types import FunctionType
-from typing import Any, Callable, Optional, Union
+from typing import Any, TypeAlias, get_type_hints
 
 import cloudpickle
 import msgspec
@@ -15,19 +16,25 @@ import numpy as np
 import torch
 import zmq
 from msgspec import msgpack
+from pydantic import GetCoreSchemaHandler
+from pydantic_core import core_schema
 
 from vllm import envs
 from vllm.logger import init_logger
-# yapf: disable
-from vllm.multimodal.inputs import (BaseMultiModalField,
-                                    MultiModalBatchedField,
-                                    MultiModalFieldConfig, MultiModalFieldElem,
-                                    MultiModalFlatField, MultiModalKwargs,
-                                    MultiModalKwargsItem,
-                                    MultiModalKwargsItems,
-                                    MultiModalSharedField, NestedTensors)
-# yapf: enable
-from vllm.v1.engine import UtilityResult
+from vllm.multimodal.inputs import (
+    BaseMultiModalField,
+    MultiModalBatchedField,
+    MultiModalFieldConfig,
+    MultiModalFieldElem,
+    MultiModalFlatField,
+    MultiModalKwargs,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalSharedField,
+    NestedTensors,
+)
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.utils import tensor_data
 
 logger = init_logger(__name__)
 
@@ -44,15 +51,17 @@ MMF_CLASS_TO_FACTORY: dict[type[BaseMultiModalField], str] = {
     MultiModalBatchedField: "batched",
 }
 
-bytestr = Union[bytes, bytearray, memoryview, zmq.Frame]
+bytestr: TypeAlias = bytes | bytearray | memoryview | zmq.Frame
 
 
 def _log_insecure_serialization_warning():
-    logger.warning_once("Allowing insecure serialization using pickle due to "
-                        "VLLM_ALLOW_INSECURE_SERIALIZATION=1")
+    logger.warning_once(
+        "Allowing insecure serialization using pickle due to "
+        "VLLM_ALLOW_INSECURE_SERIALIZATION=1"
+    )
 
 
-def _typestr(val: Any) -> Optional[tuple[str, str]]:
+def _typestr(val: Any) -> tuple[str, str] | None:
     if val is None:
         return None
     t = type(val)
@@ -72,8 +81,8 @@ def _encode_type_info_recursive(obj: Any) -> Any:
 
 
 def _decode_type_info_recursive(
-        type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any],
-                                                        Any]) -> Any:
+    type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any], Any]
+) -> Any:
     """Recursively decode type information for nested structures of
     lists/dicts."""
     if type_info is None:
@@ -85,8 +94,9 @@ def _decode_type_info_recursive(
             for k in type_info
         }
     if isinstance(type_info, list) and (
-            # Exclude serialized tensors/numpy arrays.
-            len(type_info) != 2 or not isinstance(type_info[0], str)):
+        # Exclude serialized tensors/numpy arrays.
+        len(type_info) != 2 or not isinstance(type_info[0], str)
+    ):
         assert isinstance(data, list)
         return [
             _decode_type_info_recursive(ti, d, convert_fn)
@@ -95,31 +105,38 @@ def _decode_type_info_recursive(
     return convert_fn(type_info, data)
 
 
+class UtilityResult:
+    """Wrapper for special handling when serializing/deserializing."""
+
+    def __init__(self, r: Any = None):
+        self.result = r
+
+
 class MsgpackEncoder:
     """Encoder with custom torch tensor and numpy array serialization.
 
     Note that unlike vanilla `msgspec` Encoders, this interface is generally
     not thread-safe when encoding tensors / numpy arrays.
 
-    By default, arrays below 256B are serialized inline Larger will get sent 
+    By default, arrays below 256B are serialized inline Larger will get sent
     via dedicated messages. Note that this is a per-tensor limit.
     """
 
-    def __init__(self, size_threshold: Optional[int] = None):
+    def __init__(self, size_threshold: int | None = None):
         if size_threshold is None:
             size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD
         self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
         # This is used as a local stash of buffers that we can then access from
         # our custom `msgspec` hook, `enc_hook`. We don't have a way to
         # pass custom data to the hook otherwise.
-        self.aux_buffers: Optional[list[bytestr]] = None
+        self.aux_buffers: list[bytestr] | None = None
         self.size_threshold = size_threshold
         if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
             _log_insecure_serialization_warning()
 
     def encode(self, obj: Any) -> Sequence[bytestr]:
         try:
-            self.aux_buffers = bufs = [b'']
+            self.aux_buffers = bufs = [b""]
             bufs[0] = self.encoder.encode(obj)
             # This `bufs` list allows us to collect direct pointers to backing
             # buffers of tensors and np arrays, and return them along with the
@@ -143,14 +160,15 @@ class MsgpackEncoder:
             return self._encode_tensor(obj)
 
         # Fall back to pickle for object or void kind ndarrays.
-        if isinstance(obj, np.ndarray) and obj.dtype.kind not in ('O', 'V'):
+        if isinstance(obj, np.ndarray) and obj.dtype.kind not in ("O", "V"):
             return self._encode_ndarray(obj)
 
         if isinstance(obj, slice):
             # We are assuming only int-based values will be used here.
             return tuple(
                 int(v) if v is not None else None
-                for v in (obj.start, obj.stop, obj.step))
+                for v in (obj.start, obj.stop, obj.step)
+            )
 
         if isinstance(obj, MultiModalKwargsItem):
             return self._encode_mm_item(obj)
@@ -171,21 +189,24 @@ class MsgpackEncoder:
             return _encode_type_info_recursive(result), result
 
         if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
-            raise TypeError(f"Object of type {type(obj)} is not serializable"
-                            "Set VLLM_ALLOW_INSECURE_SERIALIZATION=1 to allow "
-                            "fallback to pickle-based serialization.")
+            raise TypeError(
+                f"Object of type {type(obj)} is not serializable"
+                "Set VLLM_ALLOW_INSECURE_SERIALIZATION=1 to allow "
+                "fallback to pickle-based serialization."
+            )
 
         if isinstance(obj, FunctionType):
             # `pickle` is generally faster than cloudpickle, but can have
             # problems serializing methods.
             return msgpack.Ext(CUSTOM_TYPE_CLOUDPICKLE, cloudpickle.dumps(obj))
 
-        return msgpack.Ext(CUSTOM_TYPE_PICKLE,
-                           pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL))
+        return msgpack.Ext(
+            CUSTOM_TYPE_PICKLE, pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+        )
 
     def _encode_ndarray(
         self, obj: np.ndarray
-    ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
+    ) -> tuple[str, tuple[int, ...], int | memoryview]:
         assert self.aux_buffers is not None
         # If the array is non-contiguous, we need to copy it first
         arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
@@ -205,17 +226,17 @@ class MsgpackEncoder:
 
     def _encode_tensor(
         self, obj: torch.Tensor
-    ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
+    ) -> tuple[str, tuple[int, ...], int | memoryview]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
+        arr_data = tensor_data(obj)
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
-            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
+            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr_data)
         else:
             # Otherwise encode index of backing buffer to avoid copy.
             data = len(self.aux_buffers)
-            self.aux_buffers.append(arr.data)
+            self.aux_buffers.append(arr_data)
         dtype = str(obj.dtype).removeprefix("torch.")
         return dtype, obj.shape, data
 
@@ -225,27 +246,22 @@ class MsgpackEncoder:
             for modality, itemlist in items.items()
         }
 
-    def _encode_mm_item(self,
-                        item: MultiModalKwargsItem) -> list[dict[str, Any]]:
+    def _encode_mm_item(self, item: MultiModalKwargsItem) -> list[dict[str, Any]]:
         return [self._encode_mm_field_elem(elem) for elem in item.values()]
 
-    def _encode_mm_field_elem(self,
-                              elem: MultiModalFieldElem) -> dict[str, Any]:
+    def _encode_mm_field_elem(self, elem: MultiModalFieldElem) -> dict[str, Any]:
         return {
-            "modality":
-            elem.modality,
-            "key":
-            elem.key,
-            "data": (None if elem.data is None else
-                     self._encode_nested_tensors(elem.data)),
-            "field":
-            self._encode_mm_field(elem.field),
+            "modality": elem.modality,
+            "key": elem.key,
+            "data": (
+                None if elem.data is None else self._encode_nested_tensors(elem.data)
+            ),
+            "field": self._encode_mm_field(elem.field),
         }
 
     def _encode_mm_kwargs(self, kw: MultiModalKwargs) -> dict[str, Any]:
         return {
-            modality: self._encode_nested_tensors(data)
-            for modality, data in kw.items()
+            modality: self._encode_nested_tensors(data) for modality, data in kw.items()
         }
 
     def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
@@ -264,8 +280,7 @@ class MsgpackEncoder:
             raise TypeError(f"Unsupported field type: {field.__class__}")
         # We just need to copy all of the field values in order
         # which will be then used to reconstruct the field.
-        field_values = (getattr(field, f.name)
-                        for f in dataclasses.fields(field))
+        field_values = (getattr(field, f.name) for f in dataclasses.fields(field))
         return name, *field_values
 
 
@@ -276,19 +291,19 @@ class MsgpackDecoder:
     not thread-safe when encoding tensors / numpy arrays.
     """
 
-    def __init__(self, t: Optional[Any] = None):
-        args = () if t is None else (t, )
-        self.decoder = msgpack.Decoder(*args,
-                                       ext_hook=self.ext_hook,
-                                       dec_hook=self.dec_hook)
+    def __init__(self, t: Any | None = None, share_mem: bool = True):
+        self.share_mem = share_mem
+        self.pin_tensors = is_pin_memory_available()
+        args = () if t is None else (t,)
+        self.decoder = msgpack.Decoder(
+            *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook
+        )
         self.aux_buffers: Sequence[bytestr] = ()
         if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
             _log_insecure_serialization_warning()
 
-    def decode(self, bufs: Union[bytestr, Sequence[bytestr]]) -> Any:
-        if isinstance(bufs, (bytes, bytearray, memoryview, zmq.Frame)):
-            # TODO - This check can become `isinstance(bufs, bytestr)`
-            # as of Python 3.10.
+    def decode(self, bufs: bytestr | Sequence[bytestr]) -> Any:
+        if isinstance(bufs, bytestr):  # type: ignore
             return self.decoder.decode(bufs)
 
         self.aux_buffers = bufs
@@ -320,11 +335,14 @@ class MsgpackDecoder:
         result_type, result = obj
         if result_type is not None:
             if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
-                raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must "
-                                "be set to use custom utility result types")
+                raise TypeError(
+                    "VLLM_ALLOW_INSECURE_SERIALIZATION must "
+                    "be set to use custom utility result types"
+                )
             # Use recursive decoding to handle nested structures
-            result = _decode_type_info_recursive(result_type, result,
-                                                 self._convert_result)
+            result = _decode_type_info_recursive(
+                result_type, result, self._convert_result
+            )
         return UtilityResult(result)
 
     def _convert_result(self, result_type: Sequence[str], result: Any) -> Any:
@@ -340,37 +358,47 @@ class MsgpackDecoder:
         # zero-copy decode. We assume the ndarray will not be kept around,
         # as it now locks the whole received message buffer in memory.
         buffer = self.aux_buffers[data] if isinstance(data, int) else data
-        return np.frombuffer(buffer, dtype=dtype).reshape(shape)
+        arr = np.frombuffer(buffer, dtype=dtype)
+        if not self.share_mem:
+            arr = arr.copy()
+        return arr.reshape(shape)
 
     def _decode_tensor(self, arr: Any) -> torch.Tensor:
         dtype, shape, data = arr
-        # Copy from inline representation, to decouple the memory storage
-        # of the message from the original buffer. And also make Torch
-        # not complain about a readonly memoryview.
-        buffer = self.aux_buffers[data] if isinstance(data, int) \
-            else bytearray(data)
+        is_aux = isinstance(data, int)
+        buffer = self.aux_buffers[data] if is_aux else data
+        buffer = buffer if isinstance(buffer, memoryview) else memoryview(buffer)
         torch_dtype = getattr(torch, dtype)
         assert isinstance(torch_dtype, torch.dtype)
-        if not buffer:  # torch.frombuffer doesn't like empty buffers
+        if not buffer.nbytes:  # torch.frombuffer doesn't like empty buffers
             assert 0 in shape
             return torch.empty(shape, dtype=torch_dtype)
         # Create uint8 array
         arr = torch.frombuffer(buffer, dtype=torch.uint8)
+        # Clone ensures tensor is backed by pytorch-owned memory for safe
+        # future async CPU->GPU transfer.
+        # Pin larger tensors for more efficient CPU->GPU transfer.
+        if not is_aux:
+            arr = arr.clone()
+        elif not self.share_mem:
+            arr = arr.pin_memory() if self.pin_tensors else arr.clone()
         # Convert back to proper shape & type
         return arr.view(torch_dtype).view(shape)
 
     def _decode_mm_items(self, obj: dict[str, Any]) -> MultiModalKwargsItems:
-        return MultiModalKwargsItems({
-            modality: [self._decode_mm_item(item) for item in itemlist]
-            for modality, itemlist in obj.items()
-        })
+        return MultiModalKwargsItems(
+            {
+                modality: [self._decode_mm_item(item) for item in itemlist]
+                for modality, itemlist in obj.items()
+            }
+        )
 
     def _decode_mm_item(self, obj: list[Any]) -> MultiModalKwargsItem:
         return MultiModalKwargsItem.from_elems(
-            [self._decode_mm_field_elem(v) for v in obj])
+            [self._decode_mm_field_elem(v) for v in obj]
+        )
 
-    def _decode_mm_field_elem(self, obj: dict[str,
-                                              Any]) -> MultiModalFieldElem:
+    def _decode_mm_field_elem(self, obj: dict[str, Any]) -> MultiModalFieldElem:
         if obj["data"] is not None:
             obj["data"] = self._decode_nested_tensors(obj["data"])
 
@@ -387,10 +415,12 @@ class MsgpackDecoder:
         return MultiModalFieldElem(**obj)
 
     def _decode_mm_kwargs(self, obj: dict[str, Any]) -> MultiModalKwargs:
-        return MultiModalKwargs({
-            modality: self._decode_nested_tensors(data)
-            for modality, data in obj.items()
-        })
+        return MultiModalKwargs(
+            {
+                modality: self._decode_nested_tensors(data)
+                for modality, data in obj.items()
+            }
+        )
 
     def _decode_nested_tensors(self, obj: Any) -> NestedTensors:
         if isinstance(obj, (int, float)):
@@ -419,5 +449,84 @@ class MsgpackDecoder:
             if code == CUSTOM_TYPE_CLOUDPICKLE:
                 return cloudpickle.loads(data)
 
-        raise NotImplementedError(
-            f"Extension type code {code} is not supported")
+        raise NotImplementedError(f"Extension type code {code} is not supported")
+
+
+def run_method(
+    obj: Any,
+    method: str | bytes | Callable,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any],
+) -> Any:
+    """
+    Run a method of an object with the given arguments and keyword arguments.
+    If the method is string, it will be converted to a method using getattr.
+    If the method is serialized bytes and will be deserialized using
+    cloudpickle.
+    If the method is a callable, it will be called directly.
+    """
+    if isinstance(method, bytes):
+        func = partial(cloudpickle.loads(method), obj)
+    elif isinstance(method, str):
+        try:
+            func = getattr(obj, method)
+        except AttributeError:
+            raise NotImplementedError(
+                f"Method {method!r} is not implemented."
+            ) from None
+    else:
+        func = partial(method, obj)  # type: ignore
+    return func(*args, **kwargs)
+
+
+class PydanticMsgspecMixin:
+    @classmethod
+    def __get_pydantic_core_schema__(
+        cls, source_type: Any, handler: GetCoreSchemaHandler
+    ) -> core_schema.CoreSchema:
+        """
+        Make msgspec.Struct compatible with Pydantic, respecting defaults.
+        Handle JSON=>msgspec.Struct. Used when exposing msgspec.Struct to the
+        API as input or in `/docs`. Note this is cached by Pydantic and not
+        called on every validation.
+        """
+        msgspec_fields = {f.name: f for f in msgspec.structs.fields(source_type)}
+        type_hints = get_type_hints(source_type)
+
+        # Build the Pydantic typed_dict_field for each msgspec field
+        fields = {}
+        for name, hint in type_hints.items():
+            msgspec_field = msgspec_fields[name]
+
+            # typed_dict_field using the handler to get the schema
+            field_schema = handler(hint)
+
+            # Add default value to the schema.
+            if msgspec_field.default_factory is not msgspec.NODEFAULT:
+                wrapped_schema = core_schema.with_default_schema(
+                    schema=field_schema,
+                    default_factory=msgspec_field.default_factory,
+                )
+                fields[name] = core_schema.typed_dict_field(wrapped_schema)
+            elif msgspec_field.default is not msgspec.NODEFAULT:
+                wrapped_schema = core_schema.with_default_schema(
+                    schema=field_schema,
+                    default=msgspec_field.default,
+                )
+                fields[name] = core_schema.typed_dict_field(wrapped_schema)
+            else:
+                # No default, so Pydantic will treat it as required
+                fields[name] = core_schema.typed_dict_field(field_schema)
+        return core_schema.no_info_after_validator_function(
+            cls._validate_msgspec,
+            core_schema.typed_dict_schema(fields),
+        )
+
+    @classmethod
+    def _validate_msgspec(cls, value: Any) -> Any:
+        """Validate and convert input to msgspec.Struct instance."""
+        if isinstance(value, cls):
+            return value
+        if isinstance(value, dict):
+            return cls(**value)
+        return msgspec.convert(value, type=cls)
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 81c3118f5c5a50a6587779b7eb9ea8d6c13d8ed6..7306f996db198ceaf64f9df401ae1336e01a6378 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -3,32 +3,41 @@
 import ast
 from dataclasses import replace
 from importlib.util import find_spec
-from typing import Optional
 
 import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.attention.layer import Attention
-from vllm.config import (CompilationLevel, VllmConfig,
-                         get_layers_from_vllm_config)
+from vllm.config import (
+    CompilationMode,
+    CUDAGraphMode,
+    VllmConfig,
+    get_layers_from_vllm_config,
+)
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
-from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata,
-                                                  TreeAttentionMetadataBuilder)
+from vllm.v1.attention.backends.tree_attn import (
+    TreeAttentionMetadata,
+    TreeAttentionMetadataBuilder,
+)
 from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.v1.attention.backends.utils import (
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.sampler import _SAMPLING_EPS
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -40,7 +49,6 @@ PADDING_SLOT_ID = -1
 
 
 class EagleProposer:
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -58,80 +66,111 @@ class EagleProposer:
         self.dtype = vllm_config.model_config.dtype
         self.max_model_len = vllm_config.model_config.max_model_len
         self.block_size = vllm_config.cache_config.block_size
-        self.num_speculative_tokens = (
-            self.speculative_config.num_speculative_tokens)
-        self.max_num_tokens = (
-            vllm_config.scheduler_config.max_num_batched_tokens)
+        self.num_speculative_tokens = self.speculative_config.num_speculative_tokens
+        self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
         self.token_arange_np = np.arange(self.max_num_tokens)
         # We need to get the hidden size from the draft model config because
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
 
-        self.is_multimodal_model = vllm_config.model_config \
-            .is_multimodal_model
+        # Multi-modal data support
+        self.mm_registry = MULTIMODAL_REGISTRY
+        self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
+            vllm_config.model_config
+        )
 
-        self.attn_metadata_builder: Optional[AttentionMetadataBuilder] = None
-        self.draft_indexer_metadata_builder: Optional[
-            AttentionMetadataBuilder] = None
+        self.attn_metadata_builder: AttentionMetadataBuilder | None = None
+        self.draft_indexer_metadata_builder: AttentionMetadataBuilder | None = None
         self.attn_layer_names: list[str] = []
         self.indexer_layer_names: list[str] = []
 
-        self.use_cuda_graph = (self.vllm_config.compilation_config.level
-                               == CompilationLevel.PIECEWISE and
-                               not self.vllm_config.model_config.enforce_eager
-                               and not self.speculative_config.enforce_eager)
-        self.cudagraph_batch_sizes = list(
-            reversed(
-                self.vllm_config.compilation_config.cudagraph_capture_sizes))
+        self.use_cuda_graph = False
+
+        compilation_config = self.vllm_config.compilation_config
+        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            cudagraph_mode = compilation_config.cudagraph_mode
+            if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode(
+                CUDAGraphMode.PIECEWISE
+            ):
+                logger.warning(
+                    "Currently the eagle proposer only supports cudagraph_mode "
+                    "PIECEWISE, if you want the drafter to use cuda graphs, "
+                    "please set compilation_config.cudagraph_mode to PIECEWISE "
+                    "or FULL_AND_PIECEWISE"
+                )
+            self.use_cuda_graph = (
+                cudagraph_mode.has_mode(CUDAGraphMode.PIECEWISE)
+                and not self.speculative_config.enforce_eager
+            )
 
+        self.cudagraph_batch_sizes = (
+            (sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes))
+            if self.use_cuda_graph
+            else []
+        )
+
+        self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
         # persistent buffers for cuda graph
-        self.input_ids = torch.zeros(self.max_num_tokens,
-                                     dtype=torch.int32,
-                                     device=device)
-        self.positions = torch.zeros(self.max_num_tokens,
-                                     dtype=torch.int64,
-                                     device=device)
+        self.input_ids = torch.zeros(
+            self.max_num_tokens, dtype=torch.int32, device=device
+        )
+        self.uses_mrope = self.vllm_config.model_config.uses_mrope
+        if self.uses_mrope:
+            # M-RoPE need (3, max_num_tokens)
+            self.mrope_positions = torch.zeros(
+                (3, self.max_num_tokens), dtype=torch.int64, device=device
+            )
+        else:
+            # RoPE need (max_num_tokens,)
+            self.positions = torch.zeros(
+                self.max_num_tokens, dtype=torch.int64, device=device
+            )
         self.hidden_states = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=device)
+            (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device
+        )
 
         # We need +1 here because the arange is used to set query_start_loc,
         # which has one more element than batch_size.
         max_batch_size = vllm_config.scheduler_config.max_num_seqs
         max_num_slots_for_arange = max(max_batch_size + 1, self.max_num_tokens)
-        self.arange = torch.arange(max_num_slots_for_arange,
-                                   device=device,
-                                   dtype=torch.int32)
+        self.arange = torch.arange(
+            max_num_slots_for_arange, device=device, dtype=torch.int32
+        )
 
         self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=device)
+            (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device
+        )
 
         self.backup_next_token_ids = CpuGpuBuffer(
             max_batch_size,
             dtype=torch.int32,
             pin_memory=is_pin_memory_available(),
             device=device,
-            with_numpy=True)
+            with_numpy=True,
+        )
 
         # Determine allowed attention backends once during initialization.
-        self.allowed_attn_types: Optional[tuple] = None
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
+        self.allowed_attn_types: tuple | None = None
         if current_platform.is_rocm():
             rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata]
-            # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend
-            # if find_spec("vllm.v1.attention.backends.rocm_aiter_fa"):
+
+            # ROCM_AITER_FA is an optional backend
+            # if find_spec(
+            #     AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False)
+            # ):
             #     from vllm.v1.attention.backends.rocm_aiter_fa import (
-            #         AiterFlashAttentionMetadata)
+            #         AiterFlashAttentionMetadata,
+            #     )
+
             #     rocm_types.append(AiterFlashAttentionMetadata)
             self.allowed_attn_types = tuple(rocm_types)
 
         # Parse the speculative token tree.
         spec_token_tree = self.speculative_config.speculative_token_tree
-        self.tree_choices: list[tuple[int,
-                                      ...]] = ast.literal_eval(spec_token_tree)
+        self.tree_choices: list[tuple[int, ...]] = ast.literal_eval(spec_token_tree)
         tree_depth = len(self.tree_choices[-1])
         # Precompute per-level properties of the tree.
         num_drafts_per_level = [0] * tree_depth
@@ -140,10 +179,12 @@ class EagleProposer:
         self.cu_drafts_per_level = [num_drafts_per_level[0]]
         self.child_drafts_per_level = [num_drafts_per_level[0]]
         for level in range(1, tree_depth):
-            self.cu_drafts_per_level.append(self.cu_drafts_per_level[-1] +
-                                            num_drafts_per_level[level])
-            self.child_drafts_per_level.append(num_drafts_per_level[level] //
-                                               num_drafts_per_level[level - 1])
+            self.cu_drafts_per_level.append(
+                self.cu_drafts_per_level[-1] + num_drafts_per_level[level]
+            )
+            self.child_drafts_per_level.append(
+                num_drafts_per_level[level] // num_drafts_per_level[level - 1]
+            )
         # Precompute draft position offsets in flattened tree.
         self.tree_draft_pos_offsets = torch.arange(
             1,
@@ -152,20 +193,31 @@ class EagleProposer:
             dtype=torch.int32,
         ).repeat(max_batch_size, 1)
 
+    def _get_positions(self, num_tokens: int):
+        if self.uses_mrope:
+            return self.mrope_positions[:, :num_tokens]
+        return self.positions[:num_tokens]
+
+    def _set_positions(self, num_tokens: int, positions: torch.Tensor):
+        if self.uses_mrope:
+            self.mrope_positions[:, :num_tokens] = positions
+        else:
+            self.positions[:num_tokens] = positions
+
     def propose(
         self,
         # [num_tokens]
         target_token_ids: torch.Tensor,
-        # [num_tokens]
+        # [num_tokens] or [3, num_tokens] when M-RoPE is enabled
         target_positions: torch.Tensor,
         # [num_tokens, hidden_size]
         target_hidden_states: torch.Tensor,
         # [batch_size]
         next_token_ids: torch.Tensor,
-        last_token_indices: Optional[torch.Tensor],
+        last_token_indices: torch.Tensor | None,
         common_attn_metadata: CommonAttentionMetadata,
         sampling_metadata: SamplingMetadata,
-        mm_embeds: Optional[list[torch.Tensor]] = None,
+        mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None,
     ) -> torch.Tensor:
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
@@ -176,30 +228,34 @@ class EagleProposer:
         if self.method == "eagle3":
             assert isinstance(self.model, Eagle3LlamaForCausalLM)
             target_hidden_states = self.model.combine_hidden_states(
-                target_hidden_states)
+                target_hidden_states
+            )
             assert target_hidden_states.shape[-1] == self.hidden_size
         # Shift the input ids by one token.
         # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3]
-        self.input_ids[:num_tokens - 1] = target_token_ids[1:]
+        self.input_ids[: num_tokens - 1] = target_token_ids[1:]
         # Replace the last token with the next token.
         # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
         self.input_ids[last_token_indices] = next_token_ids
 
         assert self.runner is not None
 
-        # FIXME: need to consider multiple kv_cache_groups
-        ubatch_id = dbo_current_ubatch_id()
-        attn_metadata_builder = \
-            self.runner.attn_groups[0][0].metadata_builders[ubatch_id]
+        if self.attn_metadata_builder is None:
+            attn_metadata_builder = self._get_attention_metadata_builder()
+        else:
+            attn_metadata_builder = self.attn_metadata_builder
+
         attn_metadata = attn_metadata_builder.build_for_drafting(
-            common_attn_metadata=common_attn_metadata, draft_index=0)
+            common_attn_metadata=common_attn_metadata, draft_index=0
+        )
         # FIXME: support hybrid kv for draft model (remove separate indexer)
         if self.draft_indexer_metadata_builder:
             draft_indexer_metadata = (
                 self.draft_indexer_metadata_builder.build_for_drafting(
                     common_attn_metadata=common_attn_metadata,
                     draft_index=0,
-                ))
+                )
+            )
         else:
             draft_indexer_metadata = None
         # At this moment, we assume all eagle layers belong to the same KV
@@ -207,37 +263,45 @@ class EagleProposer:
         per_layer_attn_metadata = {}
         for layer_name in self.attn_layer_names:
             per_layer_attn_metadata[layer_name] = attn_metadata
+
         for layer_name in self.indexer_layer_names:
             assert draft_indexer_metadata is not None
             per_layer_attn_metadata[layer_name] = draft_indexer_metadata
 
-        if self.use_cuda_graph and \
-                num_tokens <= self.cudagraph_batch_sizes[-1]:
+        cudagraph_runtime_mode = CUDAGraphMode.NONE
+        if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
+            cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
             num_input_tokens = num_tokens
         # copy inputs to buffer for cudagraph
-        self.positions[:num_tokens] = target_positions
+        self._set_positions(num_tokens, target_positions)
         self.hidden_states[:num_tokens] = target_hidden_states
-        if self.is_multimodal_model:
-            input_ids = self.input_ids[:num_tokens]
-            inputs_embeds = self.model.get_input_embeddings(
-                input_ids,
-                multimodal_embeddings=mm_embeds or None,
+
+        if self.supports_mm_inputs:
+            mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
+
+            self.inputs_embeds[:num_tokens] = self.model.embed_input_ids(
+                self.input_ids[:num_tokens],
+                multimodal_embeddings=mm_embeds,
+                is_multimodal=is_mm_embed,
             )
-            self.inputs_embeds[:num_tokens] = inputs_embeds
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+
             input_ids = None
+            inputs_embeds = self.inputs_embeds[:num_input_tokens]
         else:
-            inputs_embeds = None
             input_ids = self.input_ids[:num_input_tokens]
+            inputs_embeds = None
 
-        with set_forward_context(per_layer_attn_metadata,
-                                 self.vllm_config,
-                                 num_tokens=num_input_tokens):
+        with set_forward_context(
+            per_layer_attn_metadata,
+            self.vllm_config,
+            num_tokens=num_input_tokens,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+        ):
             ret_hidden_states = self.model(
                 input_ids=input_ids,
-                positions=self.positions[:num_input_tokens],
+                positions=self._get_positions(num_input_tokens),
                 hidden_states=self.hidden_states[:num_input_tokens],
                 inputs_embeds=inputs_embeds,
             )
@@ -254,8 +318,16 @@ class EagleProposer:
             draft_token_ids = logits.argmax(dim=-1)
             return draft_token_ids.view(-1, 1)
 
-        positions = target_positions[last_token_indices]
-        if self.method in ("deepseek_mtp", "ernie_mtp", "longcat_flash_mtp"):
+        if self.uses_mrope:
+            positions = target_positions[:, last_token_indices]
+        else:
+            positions = target_positions[last_token_indices]
+        if self.method in (
+            "deepseek_mtp",
+            "ernie_mtp",
+            "longcat_flash_mtp",
+            "pangu_ultra_moe_mtp",
+        ):
             hidden_states = self.hidden_states[last_token_indices]
         else:
             hidden_states = hidden_states[last_token_indices]
@@ -274,99 +346,130 @@ class EagleProposer:
 
         draft_token_ids = logits.argmax(dim=-1)
 
-        if self.allowed_attn_types is not None and \
-            not isinstance(attn_metadata, self.allowed_attn_types):
+        if self.allowed_attn_types is not None and not isinstance(
+            attn_metadata, self.allowed_attn_types
+        ):
             raise ValueError(
                 f"Unsupported attention metadata type for speculative "
                 "decoding with num_speculative_tokens > 1: "
                 f"{type(attn_metadata)}. Supported types are: "
-                f"{self.allowed_attn_types}")
+                f"{self.allowed_attn_types}"
+            )
 
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]
 
-        if self.use_cuda_graph and \
-                batch_size <= self.cudagraph_batch_sizes[-1]:
+        if self.use_cuda_graph and batch_size <= self.cudagraph_batch_sizes[-1]:
             input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size)
+            cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
             input_batch_size = batch_size
+            cudagraph_runtime_mode = CUDAGraphMode.NONE
 
         common_attn_metadata.num_actual_tokens = batch_size
         common_attn_metadata.max_query_len = 1
-        common_attn_metadata.query_start_loc = self.arange[:batch_size + 1]
+        common_attn_metadata.query_start_loc = self.arange[: batch_size + 1]
         common_attn_metadata.query_start_loc_cpu = torch.from_numpy(
-            self.token_arange_np[:batch_size + 1]).clone()
+            self.token_arange_np[: batch_size + 1]
+        ).clone()
         for token_index in range(self.num_speculative_tokens - 1):
             # Update the inputs.
             # cast to int32 is crucial when eagle model is compiled.
             # tensor.argmax() returns int64 by default.
             input_ids = draft_token_ids_list[-1].int()
-            positions += 1
-
-            # NOTE(woosuk): We should handle the case where the draft model
-            # generates tokens beyond the max model length. Since it is complex
-            # to remove such requests from the batch, we keep them in the batch
-            # but adjust the position ids and slot mappings to avoid the
-            # out-of-range access during the model execution. The draft tokens
-            # generated with this adjustment should be ignored.
-            exceeds_max_model_len = positions >= self.max_model_len
-            # Mask out the position ids that exceed the max model length.
-            # Otherwise, we may get out-of-range error in RoPE.
-            clamped_positions = torch.where(exceeds_max_model_len, 0,
-                                            positions)
-
+            if self.uses_mrope:
+                positions += 1
+                # NOTE(woosuk): We should handle the case where the draft model
+                # generates tokens beyond the max model length.
+                # Since it is complex to remove such requests from the batch,
+                # we keep them in the batch but adjust the position ids
+                # and slot mappings to avoid the
+                # out-of-range access during the model execution.
+                # The draft tokens generated with this adjustment
+                # should be ignored.
+                exceeds_max_model_len = positions[0] >= self.max_model_len
+                # Mask out the position ids that exceed the max model length.
+                # Otherwise, we may get out-of-range error in RoPE.
+                clamped_positions = torch.where(
+                    exceeds_max_model_len.unsqueeze(0),
+                    torch.zeros_like(positions),
+                    positions,
+                )
+            else:
+                positions += 1
+                exceeds_max_model_len = positions >= self.max_model_len
+                clamped_positions = torch.where(exceeds_max_model_len, 0, positions)
+            # For data integrity when async scheduling, we shouldn't use in place
+            # operations in case they are modified in next step's `prepare_input`
+            # of main model.
             # Increment the sequence lengths.
             common_attn_metadata.seq_lens += 1
-            common_attn_metadata.seq_lens_cpu += 1
+            # This is an out-of-place operation to avoid modifying the original tensor.
+            common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1
             # For the requests that exceed the max model length, we set the
             # sequence length to 1 to minimize their overheads in attention.
-            common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len,
-                                                       1)
 
-            common_attn_metadata.num_computed_tokens_cpu = \
+            common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
+
+            common_attn_metadata.num_computed_tokens_cpu = (
                 common_attn_metadata.seq_lens_cpu - 1
+            )
 
             # Compute the slot mapping.
-            block_numbers = clamped_positions // self.block_size
+            if self.uses_mrope:
+                # all dimensions of positions are the same
+                block_numbers = clamped_positions[0] // self.block_size
+            else:
+                block_numbers = clamped_positions // self.block_size
             block_ids = common_attn_metadata.block_table_tensor.gather(
-                dim=1, index=block_numbers.view(-1, 1))
+                dim=1, index=block_numbers.view(-1, 1)
+            )
             block_ids = block_ids.view(-1)
-            common_attn_metadata.slot_mapping = (
-                block_ids * self.block_size +
-                clamped_positions % self.block_size)
+            if self.uses_mrope:
+                common_attn_metadata.slot_mapping = (
+                    block_ids * self.block_size + clamped_positions[0] % self.block_size
+                )
+            else:
+                common_attn_metadata.slot_mapping = (
+                    block_ids * self.block_size + clamped_positions % self.block_size
+                )
             # Mask out the slot mappings that exceed the max model length.
             # Otherwise, the KV cache will be inadvertently updated with the
             # padding tokens.
             common_attn_metadata.slot_mapping.masked_fill_(
-                exceeds_max_model_len, PADDING_SLOT_ID)
+                exceeds_max_model_len, PADDING_SLOT_ID
+            )
 
             # Rebuild attention metadata
             attn_metadata = attn_metadata_builder.build_for_drafting(  # type: ignore
-                common_attn_metadata=common_attn_metadata,
-                draft_index=token_index + 1)
+                common_attn_metadata=common_attn_metadata, draft_index=token_index + 1
+            )
             for layer_name in self.attn_layer_names:
                 per_layer_attn_metadata[layer_name] = attn_metadata
 
             # copy inputs to buffer for cudagraph
             self.input_ids[:batch_size] = input_ids
-            self.positions[:batch_size] = clamped_positions
+            self._set_positions(batch_size, clamped_positions)
             self.hidden_states[:batch_size] = hidden_states
-            if self.is_multimodal_model:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
-                self.inputs_embeds[:batch_size] = inputs_embeds
-                inputs_embeds = self.inputs_embeds[:input_batch_size]
+            if self.supports_mm_inputs:
+                self.inputs_embeds[:batch_size] = self.model.embed_input_ids(input_ids)
+
                 input_ids = None
+                inputs_embeds = self.inputs_embeds[:input_batch_size]
             else:
-                inputs_embeds = None
                 input_ids = self.input_ids[:input_batch_size]
+                inputs_embeds = None
 
             # Run the model.
-            with set_forward_context(per_layer_attn_metadata,
-                                     self.vllm_config,
-                                     num_tokens=input_batch_size):
+            with set_forward_context(
+                per_layer_attn_metadata,
+                self.vllm_config,
+                num_tokens=input_batch_size,
+                cudagraph_runtime_mode=cudagraph_runtime_mode,
+            ):
                 ret_hidden_states = self.model(
                     input_ids=input_ids,
-                    positions=self.positions[:input_batch_size],
+                    positions=self._get_positions(input_batch_size),
                     hidden_states=self.hidden_states[:input_batch_size],
                     inputs_embeds=inputs_embeds,
                 )
@@ -385,10 +488,12 @@ class EagleProposer:
         return draft_token_ids
 
     def prepare_next_token_ids_cpu(
-            self, sampled_token_ids: list[list[int]],
-            requests: dict[str,
-                           CachedRequestState], gpu_input_batch: InputBatch,
-            num_scheduled_tokens: dict[str, int]) -> torch.Tensor:
+        self,
+        sampled_token_ids: list[np.ndarray],
+        requests: dict[str, CachedRequestState],
+        gpu_input_batch: InputBatch,
+        num_scheduled_tokens: dict[str, int],
+    ) -> torch.Tensor:
         """
         This function is used to prepare the inputs for speculative decoding.
         It calculates the next token ids for each request based on the sampled
@@ -399,7 +504,7 @@ class EagleProposer:
         req_ids = gpu_input_batch.req_ids
         next_token_ids: list[int] = []
         for i, token_ids in enumerate(sampled_token_ids):
-            if token_ids:
+            if token_ids.shape[0] > 0:
                 # Common case.
                 next_token_id = token_ids[-1]
             else:
@@ -407,23 +512,22 @@ class EagleProposer:
                 # Get the next token id from the request state.
                 req_id = req_ids[i]
                 req_state = requests[req_id]
-                seq_len = (req_state.num_computed_tokens +
-                           num_scheduled_tokens[req_id])
+                seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id]
                 next_token_id = req_state.get_token_id(seq_len)
             next_token_ids.append(next_token_id)
-        next_token_ids = torch.tensor(next_token_ids,
-                                      dtype=torch.int32,
-                                      device=self.input_ids.device)
-        return next_token_ids
-
-    def prepare_next_token_ids_padded(self,
-                               common_attn_metadata: CommonAttentionMetadata,
-                               sampled_token_ids: torch.Tensor,
-                               requests: dict[str, CachedRequestState],
-                               gpu_input_batch: InputBatch,
-                               discard_request_indices: torch.Tensor,
-                               num_discarded_requests: int) -> \
-                                tuple[torch.Tensor, torch.Tensor]:
+        return torch.tensor(
+            next_token_ids, dtype=torch.int32, device=self.input_ids.device
+        )
+
+    def prepare_next_token_ids_padded(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+        sampled_token_ids: torch.Tensor,
+        requests: dict[str, CachedRequestState],
+        gpu_input_batch: InputBatch,
+        discard_request_indices: torch.Tensor,
+        num_discarded_requests: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         This function is used to prepare the inputs for speculative decoding.
         It calculates the next token ids and the number of valid sampled tokens
@@ -437,30 +541,30 @@ class EagleProposer:
 
         # Precompute get_token_id for when there is no valid next token
         num_reqs = gpu_input_batch.num_reqs
-        self.backup_next_token_ids.np[:num_reqs] = np.array([
-            requests[gpu_input_batch.req_ids[i]].get_token_id(
-                common_attn_metadata.seq_lens_cpu[i].item())
-            for i in range(num_reqs)
-        ])
+        self.backup_next_token_ids.np[:num_reqs] = np.array(
+            [
+                requests[gpu_input_batch.req_ids[i]].get_token_id(
+                    common_attn_metadata.seq_lens_cpu[i].item()
+                )
+                for i in range(num_reqs)
+            ]
+        )
         self.backup_next_token_ids.copy_to_gpu(num_reqs)
 
         # Mask out the sampled tokens indices that should not be sampled.
-        discard_sampled_tokens_req_indices = \
-            discard_request_indices[:num_discarded_requests]
+        discard_sampled_tokens_req_indices = discard_request_indices[
+            :num_discarded_requests
+        ]
 
         valid_sampled_token_ids_gpu = sampled_token_ids.clone()
         valid_sampled_token_ids_gpu.index_fill_(
-            0, discard_sampled_tokens_req_indices, -1)
+            0, discard_sampled_tokens_req_indices, -1
+        )
 
         # Generate a mask for all valid tokens within those requests
-        max_gen_len = sampled_token_ids.shape[-1]
-        if max_gen_len == 1:
-            valid_mask = torch.ones_like(valid_sampled_token_ids_gpu,
-                                         dtype=torch.bool)
-        else:
-            valid_mask = (
-                (valid_sampled_token_ids_gpu != -1) &
-                (valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size))
+        valid_mask = (valid_sampled_token_ids_gpu != -1) & (
+            valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size
+        )
 
         # Count the number of valid tokens in each request
         valid_sampled_tokens_count = valid_mask.sum(dim=1)
@@ -472,22 +576,25 @@ class EagleProposer:
         # Get last valid token from each row
         # (assume undefined state where there is no valid token)
         selected_tokens = torch.gather(
-            valid_sampled_token_ids_gpu, 1,
-            last_valid_indices_safe.unsqueeze(1)).squeeze(1)
+            valid_sampled_token_ids_gpu, 1, last_valid_indices_safe.unsqueeze(1)
+        ).squeeze(1)
 
         # Use last token if valid, pre-computed backup if not
         batch_size = valid_sampled_token_ids_gpu.shape[0]
         next_token_ids = torch.where(
-            last_valid_indices != -1, selected_tokens,
-            self.backup_next_token_ids.gpu[:batch_size])
+            last_valid_indices != -1,
+            selected_tokens,
+            self.backup_next_token_ids.gpu[:batch_size],
+        )
 
         return next_token_ids, valid_sampled_tokens_count
 
-    def prepare_inputs_padded(self,
-                                common_attn_metadata: CommonAttentionMetadata,
-                                spec_decode_metadata: SpecDecodeMetadata,
-                                valid_sampled_tokens_count: torch.Tensor) -> \
-                    tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]:
+    def prepare_inputs_padded(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+        spec_decode_metadata: SpecDecodeMetadata,
+        valid_sampled_tokens_count: torch.Tensor,
+    ) -> tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]:
         """
         This function is used to prepare the inputs for speculative decoding
         It updates the common_attn_metadata for speculative decoding,
@@ -496,21 +603,23 @@ class EagleProposer:
         used as padding and filtered out later by `token_indices_to_sample`.
         No blocking CPU operations should be introduced in this function.
         """
-        num_draft_tokens_gpu = torch.cat([
-            spec_decode_metadata.cu_num_draft_tokens[0:1],
-            spec_decode_metadata.cu_num_draft_tokens[1:] -
-            spec_decode_metadata.cu_num_draft_tokens[:-1]
-        ])
+        num_draft_tokens_gpu = torch.cat(
+            [
+                spec_decode_metadata.cu_num_draft_tokens[0:1],
+                spec_decode_metadata.cu_num_draft_tokens[1:]
+                - spec_decode_metadata.cu_num_draft_tokens[:-1],
+            ]
+        )
 
         num_rejected_tokens_gpu = torch.where(
             num_draft_tokens_gpu > 0,
             num_draft_tokens_gpu + 1 - valid_sampled_tokens_count,
-            torch.zeros_like(num_draft_tokens_gpu))
+            torch.zeros_like(num_draft_tokens_gpu),
+        )
 
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
 
-        new_query_len_per_req = (query_start_loc_cpu[1:] -
-                                 query_start_loc_cpu[:-1])
+        new_query_len_per_req = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
 
         total_num_tokens = query_start_loc_cpu[-1].item()
         token_indices = self.arange[:total_num_tokens]
@@ -520,8 +629,7 @@ class EagleProposer:
             seq_lens=common_attn_metadata.seq_lens,
             query_start_loc_cpu=query_start_loc_cpu,
             seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
-            num_computed_tokens_cpu=common_attn_metadata.
-            num_computed_tokens_cpu,
+            num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
             num_reqs=common_attn_metadata.num_reqs,
             num_actual_tokens=total_num_tokens,
             max_query_len=new_query_len_per_req.max().item(),
@@ -529,10 +637,12 @@ class EagleProposer:
             block_table_tensor=common_attn_metadata.block_table_tensor,
             slot_mapping=common_attn_metadata.slot_mapping[token_indices],
             causal=True,
+            dcp_local_seq_lens=common_attn_metadata.dcp_local_seq_lens,
         )
 
-        token_indices_to_sample = common_attn_metadata.query_start_loc[1:] - 1 \
-            - num_rejected_tokens_gpu
+        token_indices_to_sample = (
+            common_attn_metadata.query_start_loc[1:] - 1 - num_rejected_tokens_gpu
+        )
 
         return spec_common_attn_metadata, token_indices, token_indices_to_sample
 
@@ -547,10 +657,10 @@ class EagleProposer:
         hidden_states: torch.Tensor,
         common_attn_metadata: CommonAttentionMetadata,
     ) -> list[torch.Tensor]:
-        tree_attn_metadata_builder = \
-            self.runner.attn_groups[0][0].get_metadata_builder()
-        assert isinstance(tree_attn_metadata_builder,
-                          TreeAttentionMetadataBuilder)
+        tree_attn_metadata_builder = self.runner.attn_groups[0][
+            0
+        ].get_metadata_builder()
+        assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder)
 
         total_num_drafts = self.cu_drafts_per_level[0]
         level_num_drafts = total_num_drafts
@@ -559,31 +669,31 @@ class EagleProposer:
         if num_children == 1:
             draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1)
         else:
-            draft_token_ids = torch.topk(logits, num_children,
-                                         dim=-1).indices.view(batch_size, -1)
+            draft_token_ids = torch.topk(logits, num_children, dim=-1).indices.view(
+                batch_size, -1
+            )
         draft_token_ids_list = [draft_token_ids]
         draft_hidden_states = hidden_states.view(batch_size, 1, -1)
 
         # Initialize empty tensors for concatenation with the level outputs.
-        tree_input_ids = torch.empty(0,
-                                     device=self.input_ids.device,
-                                     dtype=self.input_ids.dtype)
-        tree_positions = torch.empty(0,
-                                     device=self.positions.device,
-                                     dtype=self.positions.dtype)
-        tree_hidden_states = torch.empty(0,
-                                         device=self.hidden_states.device,
-                                         dtype=self.hidden_states.dtype)
+        tree_input_ids = torch.empty(
+            0, device=self.input_ids.device, dtype=self.input_ids.dtype
+        )
+        tree_positions = torch.empty(
+            0, device=self.positions.device, dtype=self.positions.dtype
+        )
+        tree_hidden_states = torch.empty(
+            0, device=self.hidden_states.device, dtype=self.hidden_states.dtype
+        )
         # Precompute the draft token positions.
         flattened_draft_positions = (
-            positions.view(batch_size, -1) +
-            self.tree_draft_pos_offsets[:batch_size, :])
+            positions.view(batch_size, -1) + self.tree_draft_pos_offsets[:batch_size, :]
+        )
         tree_depth = len(self.cu_drafts_per_level)
         for level in range(tree_depth - 1):
             # Get draft positions for RoPE.
             draft_positions = positions + (level + 1)
-            exceeds_max_model_len = (positions +
-                                     total_num_drafts) >= self.max_model_len
+            exceeds_max_model_len = (positions + total_num_drafts) >= self.max_model_len
             # Mask out the position ids that exceed the max model length.
             # Otherwise, we may get out-of-range error in RoPE.
             draft_positions = torch.where(
@@ -595,27 +705,28 @@ class EagleProposer:
             if level_num_drafts > 1:
                 # Repeat the positions for each draft at this level.
                 draft_positions = draft_positions.repeat_interleave(
-                    level_num_drafts, dim=1)
+                    level_num_drafts, dim=1
+                )
 
             if num_children > 1:
                 # Repeat draft hidden states for each child.
                 draft_hidden_states = draft_hidden_states.repeat_interleave(
-                    num_children, dim=1)
+                    num_children, dim=1
+                )
 
             # Concatenate the draft tokens, positions, and hidden states.
-            tree_input_ids = torch.cat([tree_input_ids, draft_token_ids],
-                                       dim=1)
-            tree_positions = torch.cat([tree_positions, draft_positions],
-                                       dim=1)
+            tree_input_ids = torch.cat([tree_input_ids, draft_token_ids], dim=1)
+            tree_positions = torch.cat([tree_positions, draft_positions], dim=1)
             tree_hidden_states = torch.cat(
-                [tree_hidden_states, draft_hidden_states], dim=1)
+                [tree_hidden_states, draft_hidden_states], dim=1
+            )
 
             # Build new attention metadata for the next level of drafts.
             # This is necessary to support tree attention.
             query_len = total_num_drafts
             common_attn_metadata = replace(
                 common_attn_metadata,
-                query_start_loc=query_len * self.arange[:batch_size + 1],
+                query_start_loc=query_len * self.arange[: batch_size + 1],
                 seq_lens=common_attn_metadata.seq_lens + level_num_drafts,
                 num_actual_tokens=batch_size * query_len,
                 max_query_len=query_len,
@@ -631,20 +742,20 @@ class EagleProposer:
                 per_layer_attn_metadata[layer_name] = attn_metadata
 
             # Consider max model length.
-            attn_metadata.max_seq_len = min(attn_metadata.max_seq_len,
-                                            self.max_model_len)
+            attn_metadata.max_seq_len = min(
+                attn_metadata.max_seq_len, self.max_model_len
+            )
             # For the requests that exceed the max model length, we set the
             # sequence length to 1 to minimize their overheads in attention.
             attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
 
             # Compute the slot mapping.
-            query_positions = flattened_draft_positions[:, level:level +
-                                                        query_len]
+            query_positions = flattened_draft_positions[:, level : level + query_len]
             block_numbers = query_positions // self.block_size
-            block_ids = attn_metadata.block_table.gather(dim=1,
-                                                         index=block_numbers)
-            slot_mapping = (block_ids * self.block_size +
-                            query_positions % self.block_size)
+            block_ids = attn_metadata.block_table.gather(dim=1, index=block_numbers)
+            slot_mapping = (
+                block_ids * self.block_size + query_positions % self.block_size
+            )
             # Mask out the slot mappings that exceed the max model length.
             # Otherwise, the KV cache will be inadvertently updated with the
             # padding tokens.
@@ -656,19 +767,21 @@ class EagleProposer:
             input_ids = tree_input_ids.view(-1)
             self.input_ids[:num_tokens] = input_ids
             self.positions[:num_tokens] = tree_positions.view(-1)
-            self.hidden_states[:num_tokens] = tree_hidden_states.view(
-                num_tokens, -1)
+            self.hidden_states[:num_tokens] = tree_hidden_states.view(num_tokens, -1)
 
-            if self.use_cuda_graph and \
-                    num_tokens <= self.cudagraph_batch_sizes[-1]:
-                num_input_tokens = self.vllm_config.pad_for_cudagraph(
-                    num_tokens)
+            if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+                num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
+                cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
             else:
                 num_input_tokens = num_tokens
+                cudagraph_runtime_mode = CUDAGraphMode.NONE
             # Run the model.
-            with set_forward_context(per_layer_attn_metadata,
-                                     self.vllm_config,
-                                     num_tokens=num_input_tokens):
+            with set_forward_context(
+                per_layer_attn_metadata,
+                self.vllm_config,
+                num_tokens=num_input_tokens,
+                cudagraph_runtime_mode=cudagraph_runtime_mode,
+            ):
                 last_hidden_states, hidden_states = self.model(
                     input_ids=self.input_ids[:num_input_tokens],
                     positions=self.positions[:num_input_tokens],
@@ -678,28 +791,29 @@ class EagleProposer:
 
             # Get the output hidden states for the draft tokens.
             draft_hidden_states = hidden_states[:num_tokens].view(
-                batch_size, query_len, -1)[:, -level_num_drafts:]
+                batch_size, query_len, -1
+            )[:, -level_num_drafts:]
             draft_last_hidden_states = last_hidden_states[:num_tokens].view(
-                batch_size, query_len, -1)[:, -level_num_drafts:]
+                batch_size, query_len, -1
+            )[:, -level_num_drafts:]
 
             # Get the output logits for the draft tokens.
             logits = self.model.compute_logits(
-                draft_last_hidden_states.reshape(batch_size * level_num_drafts,
-                                                 -1))
+                draft_last_hidden_states.reshape(batch_size * level_num_drafts, -1)
+            )
 
             # Sample a draft token for each child at the next tree level.
             num_children = self.child_drafts_per_level[level + 1]
             if num_children == 1:
                 draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1)
             else:
-                draft_token_ids = torch.topk(logits, num_children,
-                                             dim=-1).indices.view(
-                                                 batch_size, -1)
+                draft_token_ids = torch.topk(logits, num_children, dim=-1).indices.view(
+                    batch_size, -1
+                )
             draft_token_ids_list.append(draft_token_ids)
 
             # Update the # drafts counters for the next tree level.
-            level_num_drafts = self.cu_drafts_per_level[level +
-                                                        1] - total_num_drafts
+            level_num_drafts = self.cu_drafts_per_level[level + 1] - total_num_drafts
             total_num_drafts = self.cu_drafts_per_level[level + 1]
         return draft_token_ids_list
 
@@ -735,17 +849,14 @@ class EagleProposer:
             n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
             for i, n in enumerate(num_draft_tokens)
         ]
-        num_rejected_tokens = torch.tensor(num_rejected_tokens,
-                                           dtype=torch.int32)
+        num_rejected_tokens = torch.tensor(num_rejected_tokens, dtype=torch.int32)
 
         device = common_attn_metadata.query_start_loc.device
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
-        new_seq_lens_cpu = common_attn_metadata.seq_lens_cpu \
-            - num_rejected_tokens
+        new_seq_lens_cpu = common_attn_metadata.seq_lens_cpu - num_rejected_tokens
 
         # [0, q1, q1 + q2, q1 + q2 + q3] -> [q1, q2, q3]
-        new_query_len_per_req = (query_start_loc_cpu[1:] -
-                                 query_start_loc_cpu[:-1])
+        new_query_len_per_req = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
         # [q1, q2, q3] -> [q1 - n1, q2 - n2, q3 - n3]
         new_num_tokens_per_req = new_query_len_per_req - num_rejected_tokens
         new_num_tokens_per_req_np = new_num_tokens_per_req.numpy()
@@ -755,7 +866,8 @@ class EagleProposer:
         new_query_start_loc_cpu = torch.zeros(
             query_start_loc_cpu.shape,
             dtype=torch.int32,
-            pin_memory=is_pin_memory_available())
+            pin_memory=is_pin_memory_available(),
+        )
         new_query_start_loc_np = new_query_start_loc_cpu.numpy()
         np.cumsum(new_num_tokens_per_req_np, out=new_query_start_loc_np[1:])
 
@@ -765,36 +877,36 @@ class EagleProposer:
         # [0, 2, 6, 9] ->
         # [0, 0, 2, 2, 2, 2, 6, 6, 6]
         #  _r1_  ____r2____  ___r3__
-        new_query_start_locs_expanded = np.repeat(new_query_start_loc_np[:-1],
-                                                  new_num_tokens_per_req_np)
+        new_query_start_locs_expanded = np.repeat(
+            new_query_start_loc_np[:-1], new_num_tokens_per_req_np
+        )
         # [0, 1, 2, 3, 4, 5, 6, 7, 8] ->
         # [0, 1, 0, 1, 2, 3, 0, 1, 2]
         #  _r1_  ____r2____  ___r3__
-        token_offests = self.token_arange_np[:total_num_tokens] \
-            - new_query_start_locs_expanded
+        token_offests = (
+            self.token_arange_np[:total_num_tokens] - new_query_start_locs_expanded
+        )
 
         # Expand starting positions to match token pattern
         # [0, q1, q1 + q2] ->
         # [0, 0, q1, q1, q1, q1, q1 + q2, q1 + q2, q1 + q2]
         #  _r1_  _____r2_______  ___________r3____________
         old_query_start_locs_expanded = np.repeat(
-            query_start_loc_cpu[:-1].numpy(), new_num_tokens_per_req_np)
+            query_start_loc_cpu[:-1].numpy(), new_num_tokens_per_req_np
+        )
         # Final token indices are:
         # [0, 1,                                // req 1
         #  q1 + 0, q1 + 1, q1 + 2, q1 + 3,       // req 2
         #  q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3
         token_indices_np = token_offests + old_query_start_locs_expanded
-        token_indices = torch.from_numpy(token_indices_np).to(
-            device, non_blocking=True)
+        token_indices = torch.from_numpy(token_indices_np).to(device, non_blocking=True)
 
         spec_common_attn_metadata = CommonAttentionMetadata(
-            query_start_loc=new_query_start_loc_cpu.to(device,
-                                                       non_blocking=True),
+            query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True),
             seq_lens=new_seq_lens_cpu.to(device, non_blocking=True),
             query_start_loc_cpu=new_query_start_loc_cpu,
             seq_lens_cpu=new_seq_lens_cpu,
-            num_computed_tokens_cpu=common_attn_metadata.
-            num_computed_tokens_cpu,
+            num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
             num_reqs=common_attn_metadata.num_reqs,
             num_actual_tokens=total_num_tokens,
             max_query_len=new_query_len_per_req.max().item(),
@@ -802,114 +914,207 @@ class EagleProposer:
             block_table_tensor=common_attn_metadata.block_table_tensor,
             slot_mapping=common_attn_metadata.slot_mapping[token_indices],
             causal=True,
+            dcp_local_seq_lens=common_attn_metadata.dcp_local_seq_lens,
         )
 
         return spec_common_attn_metadata, token_indices
 
+    def get_model_name(self, model: nn.Module) -> str:
+        if hasattr(model, "module"):  # multi-GPU
+            model = model.module
+        return model.__class__.__name__
+
     def load_model(self, target_model: nn.Module) -> None:
-        draft_model_config = \
-            self.vllm_config.speculative_config.draft_model_config
+        draft_model_config = self.vllm_config.speculative_config.draft_model_config
         target_attn_layer_names = set(
-            get_layers_from_vllm_config(self.vllm_config, Attention).keys())
+            get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys()
+        )
         # FIXME: support hybrid kv for draft model
         target_indexer_layer_names = set(
-            get_layers_from_vllm_config(self.vllm_config,
-                                        DeepseekV32IndexerCache).keys())
+            get_layers_from_vllm_config(
+                self.vllm_config, DeepseekV32IndexerCache
+            ).keys()
+        )
 
         from vllm.compilation.backends import set_model_tag
+
         with set_model_tag("eagle_head"):
-            self.model = get_model(vllm_config=self.vllm_config,
-                                   model_config=draft_model_config)
+            self.model = get_model(
+                vllm_config=self.vllm_config, model_config=draft_model_config
+            )
 
         draft_attn_layer_names = (
-            get_layers_from_vllm_config(self.vllm_config, Attention).keys() -
-            target_attn_layer_names)
-        indexer_layers = get_layers_from_vllm_config(self.vllm_config,
-                                                     DeepseekV32IndexerCache)
-        draft_indexer_layer_names = (indexer_layers.keys() -
-                                     target_indexer_layer_names)
-        self.attn_layer_names = list(draft_attn_layer_names)
+            get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys()
+            - target_attn_layer_names
+        )
+        indexer_layers = get_layers_from_vllm_config(
+            self.vllm_config, DeepseekV32IndexerCache
+        )
+        draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names
+        self.attn_layer_names = list(draft_attn_layer_names - draft_indexer_layer_names)
         self.indexer_layer_names = list(draft_indexer_layer_names)
 
         if self.indexer_layer_names:
             first_layer = self.indexer_layer_names[0]
             self.draft_indexer_metadata_builder = (
-                indexer_layers[first_layer].get_attn_backend().get_builder_cls(
-                )(
-                    indexer_layers[first_layer].get_kv_cache_spec(),
+                indexer_layers[first_layer]
+                .get_attn_backend()
+                .get_builder_cls()(
+                    indexer_layers[first_layer].get_kv_cache_spec(self.vllm_config),
                     self.indexer_layer_names,
                     self.vllm_config,
                     self.device,
-                ))
+                )
+            )
         else:
             self.draft_indexer_metadata_builder = None
 
+        if self.supports_mm_inputs:
+            # Even if the target model is multimodal, we can also use
+            # text-only draft models
+            try:
+                dummy_input_ids = torch.tensor([[1]], device=self.input_ids.device)
+                self.model.embed_input_ids(dummy_input_ids, multimodal_embeddings=None)
+            except (NotImplementedError, AttributeError, TypeError):
+                logger.warning(
+                    "Draft model does not support multimodal inputs, "
+                    "falling back to text-only mode"
+                )
+                self.supports_mm_inputs = False
+
         if supports_multimodal(target_model):
             # handle multimodality
-            self.model.config.image_token_index = (
-                target_model.config.image_token_index)
+            if (
+                self.get_model_name(target_model)
+                == "Qwen2_5_VLForConditionalGeneration"
+            ):
+                self.model.config.image_token_index = target_model.config.image_token_id
+            else:
+                self.model.config.image_token_index = (
+                    target_model.config.image_token_index
+                )
             target_language_model = target_model.get_language_model()
         else:
             target_language_model = target_model
+
         # share embed_tokens with the target model if needed
         if get_pp_group().world_size == 1:
-            if hasattr(target_language_model.model, 'embed_tokens'):
+            if hasattr(target_language_model.model, "embed_tokens"):
                 target_embed_tokens = target_language_model.model.embed_tokens
-            elif hasattr(target_language_model.model, 'embedding'):
+            elif hasattr(target_language_model.model, "embedding"):
                 target_embed_tokens = target_language_model.model.embedding
             else:
                 raise AttributeError(
-                    "Target model does not have 'embed_tokens' or 'embedding' "
-                    "attribute")
+                    "Target model does not have 'embed_tokens' or 'embedding' attribute"
+                )
 
-            # Check if shapes match and we found the embedding
-            eagle_shape = self.model.model.embed_tokens.weight.shape
-            target_shape = target_embed_tokens.weight.shape
-            if eagle_shape == target_shape:
-                logger.info(
-                    "Assuming the EAGLE head shares the same vocab embedding"
-                    " with the target model.")
-                del self.model.model.embed_tokens
-                self.model.model.embed_tokens = target_embed_tokens
+            share_embeddings = False
+            if hasattr(self.model, "has_own_embed_tokens"):
+                # EAGLE model
+                if not self.model.has_own_embed_tokens:
+                    share_embeddings = True
+                    logger.info(
+                        "Detected EAGLE model without its own embed_tokens in the"
+                        " checkpoint. Sharing target model embedding weights with the"
+                        " draft model."
+                    )
+                elif (
+                    isinstance(target_embed_tokens.weight, torch.Tensor)
+                    and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
+                    and torch.equal(
+                        target_embed_tokens.weight, self.model.model.embed_tokens.weight
+                    )
+                ):
+                    share_embeddings = True
+                    logger.info(
+                        "Detected EAGLE model with embed_tokens identical to the target"
+                        " model. Sharing target model embedding weights with the draft"
+                        " model."
+                    )
+                else:
+                    logger.info(
+                        "Detected EAGLE model with distinct embed_tokens weights. "
+                        "Keeping separate embedding weights from the target model."
+                    )
             else:
+                # MTP model
+                share_embeddings = True
                 logger.info(
-                    "The EAGLE head's vocab embedding will be loaded separately"
-                    " from the target model.")
+                    "Detected MTP model. "
+                    "Sharing target model embedding weights with the draft model."
+                )
+
+            if share_embeddings:
+                if hasattr(self.model.model, "embed_tokens"):
+                    del self.model.model.embed_tokens
+                self.model.model.embed_tokens = target_embed_tokens
         else:
             logger.info(
-                "The EAGLE head's vocab embedding will be loaded separately"
-                " from the target model.")
+                "The draft model's vocab embedding will be loaded separately"
+                " from the target model."
+            )
 
         # share lm_head with the target model if needed
-        # some model definition do not define lm_head explicitly
-        # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM
-        if self.vllm_config.speculative_config.method != "eagle3":
-            if hasattr(target_language_model, "lm_head"):
+        share_lm_head = False
+        if hasattr(self.model, "has_own_lm_head"):
+            # EAGLE model
+            if not self.model.has_own_lm_head:
+                share_lm_head = True
                 logger.info(
-                    "Loading EAGLE LM head weights from the target model.")
-                self.model.lm_head = target_language_model.lm_head
-        else:
-            if (hasattr(self.model, "lm_head")
-                    and hasattr(target_language_model, "lm_head")
-                    and self.model.lm_head.weight.shape
-                    == target_language_model.lm_head.weight.shape):
-                logger.info("Assuming the EAGLE head shares the same lm_head"
-                            " with the target model.")
-                del self.model.lm_head
-                self.model.lm_head = target_language_model.lm_head
+                    "Detected EAGLE model without its own lm_head in the checkpoint. "
+                    "Sharing target model lm_head weights with the draft model."
+                )
+            elif (
+                hasattr(target_language_model, "lm_head")
+                and isinstance(target_language_model.lm_head.weight, torch.Tensor)
+                and isinstance(self.model.lm_head.weight, torch.Tensor)
+                and torch.equal(
+                    target_language_model.lm_head.weight, self.model.lm_head.weight
+                )
+            ):
+                share_lm_head = True
+                logger.info(
+                    "Detected EAGLE model with lm_head identical to the target model. "
+                    "Sharing target model lm_head weights with the draft model."
+                )
             else:
                 logger.info(
-                    "The EAGLE head's lm_head will be loaded separately"
-                    " from the target model.")
+                    "Detected EAGLE model with distinct lm_head weights. "
+                    "Keeping separate lm_head weights from the target model."
+                )
+        else:
+            # MTP model
+            share_lm_head = True
+            logger.info(
+                "Detected MTP model. "
+                "Sharing target model lm_head weights with the draft model."
+            )
+
+        if share_lm_head and hasattr(target_language_model, "lm_head"):
+            if hasattr(self.model, "lm_head"):
+                del self.model.lm_head
+            self.model.lm_head = target_language_model.lm_head
 
     @torch.inference_mode()
     def dummy_run(
         self,
         num_tokens: int,
+        use_cudagraphs=True,
     ) -> None:
-        with set_forward_context(None, self.vllm_config,
-                                 num_tokens=num_tokens):
-            if self.is_multimodal_model:
+        # Determine if CUDA graphs should be used for this run.
+        cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
+        if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
+            num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
+
+        with set_forward_context(
+            None,
+            self.vllm_config,
+            num_tokens=num_tokens,
+            cudagraph_runtime_mode=(
+                CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
+            ),
+        ):
+            if self.supports_mm_inputs:
                 input_ids = None
                 inputs_embeds = self.inputs_embeds[:num_tokens]
             else:
@@ -918,13 +1123,12 @@ class EagleProposer:
 
             self.model(
                 input_ids=input_ids,
-                positions=self.positions[:num_tokens],
+                positions=self._get_positions(num_tokens),
                 hidden_states=self.hidden_states[:num_tokens],
                 inputs_embeds=inputs_embeds,
             )
 
-    def _get_attention_metadata_builder(
-            self) -> list[AttentionMetadataBuilder]:
+    def _get_attention_metadata_builder(self) -> AttentionMetadataBuilder:
         """Find and return the attention metadata builders for EAGLE layers.
 
         Returns:
@@ -945,11 +1149,11 @@ class EagleProposer:
                 break
 
         assert builder is not None, (
-            "Failed to find attention metadata builder for EAGLE layers.")
+            "Failed to find attention metadata builder for EAGLE layers."
+        )
         return builder
 
-    def validate_same_kv_cache_group(self,
-                                     kv_cache_config: KVCacheConfig) -> None:
+    def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Validate that all eagle layers belong to the same KVCacheGroup.
         Need this assumption to ensure all eagle layers can use the
@@ -960,12 +1164,17 @@ class EagleProposer:
         for id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
             for layer_name in kv_cache_group.layer_names:
                 kv_cache_groups[layer_name] = id
-        assert len(
-            set([
-                kv_cache_groups[layer_name]
-                for layer_name in self.attn_layer_names
-            ])
-        ) == 1, "All eagle layers should belong to the same kv cache group"
+        assert (
+            len(
+                set(
+                    [
+                        kv_cache_groups[layer_name]
+                        for layer_name in self.attn_layer_names
+                    ]
+                )
+            )
+            == 1
+        ), "All eagle layers should belong to the same kv cache group"
 
 
 # NOTE(woosuk): Currently, the below code is not used and we always use argmax
@@ -985,8 +1194,15 @@ def compute_probs_and_sample_next_token(
         next_token_ids = logits.argmax(dim=-1)
         return next_token_ids, probs
 
-    is_greedy = sampling_metadata.temperature == -1
-    temperature = torch.where(is_greedy, 1.0, sampling_metadata.temperature)
+    assert sampling_metadata.temperature is not None
+
+    # Use epsilon comparison to detect greedy sampling (temperature ~ 0.0)
+    # consistent with sampler.py's _SAMPLING_EPS threshold
+    temperature = sampling_metadata.temperature
+    # Avoid division by zero if there are greedy requests.
+    if not sampling_metadata.all_random:
+        is_greedy = temperature < _SAMPLING_EPS
+        temperature = torch.where(is_greedy, 1.0, temperature)
     logits.div_(temperature.view(-1, 1))
     probs = logits.softmax(dim=-1, dtype=torch.float32)
 
diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py
index 70b29c05c2a50aedd5ea1c58bc36711376b75235..12b903ccaca97bb27bc6cea8bc9d0ccf0f9fa13e 100644
--- a/vllm/v1/spec_decode/medusa.py
+++ b/vllm/v1/spec_decode/medusa.py
@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.v1.sample.metadata import SamplingMetadata
 
 # Initialize logger
@@ -27,10 +28,9 @@ class MedusaProposer:
         # Save config parameters
         self.vllm_config = vllm_config
         self.device = device
-        self.max_num_tokens = (
-            vllm_config.scheduler_config.max_num_batched_tokens)
-        self.hidden_size = vllm_config.speculative_config.\
-            draft_model_config.get_hidden_size(
+        self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+        self.hidden_size = (
+            vllm_config.speculative_config.draft_model_config.get_hidden_size()
         )
         self.dtype = vllm_config.model_config.dtype
 
@@ -51,16 +51,23 @@ class MedusaProposer:
 
     def load_model(self, target_model: nn.Module) -> None:
         from vllm.compilation.backends import set_model_tag
+
         with set_model_tag("medusa_head"):
-            self.model = get_model(vllm_config=self.vllm_config,
-                                   model_config=self.vllm_config.
-                                   speculative_config.draft_model_config)
+            self.model = get_model(
+                vllm_config=self.vllm_config,
+                model_config=self.vllm_config.speculative_config.draft_model_config,
+            )
+        assert not (
+            is_mixture_of_experts(self.model)
+            and self.vllm_config.parallel_config.enable_eplb
+        ), "EPLB for Medusa is not supported"
 
     @torch.inference_mode()
     def dummy_run(self, num_tokens: int) -> None:
-        hidden_states = torch.zeros((self.max_num_tokens, self.hidden_size),
-                                    dtype=self.dtype,
-                                    device=self.device)
-        with set_forward_context(None, self.vllm_config,
-                                 num_tokens=num_tokens):
+        hidden_states = torch.zeros(
+            (self.max_num_tokens, self.hidden_size),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        with set_forward_context(None, self.vllm_config, num_tokens=num_tokens):
             self.model(hidden_states)
diff --git a/vllm/v1/spec_decode/metadata.py b/vllm/v1/spec_decode/metadata.py
index b1efb40612d541f04ceaa682a3e2ad6fbaf8d042..6955ae79d01da0f468172b6a66839b5117bab633 100644
--- a/vllm/v1/spec_decode/metadata.py
+++ b/vllm/v1/spec_decode/metadata.py
@@ -8,13 +8,14 @@ import torch
 
 @dataclass
 class SpecDecodeMetadata:
-
     # [num_tokens]
     draft_token_ids: torch.Tensor
     # [batch_size]
     num_draft_tokens: list[int]
     # [batch_size]
     cu_num_draft_tokens: torch.Tensor
+    # [batch_size]
+    cu_num_sampled_tokens: torch.Tensor
     # [num_tokens]
     target_logits_indices: torch.Tensor
     # [batch_size]
@@ -33,29 +34,32 @@ class SpecDecodeMetadata:
     ) -> "SpecDecodeMetadata":
         batch_size = len(draft_token_ids)
         num_draft_tokens = [len(ids) for ids in draft_token_ids]
+        num_sampled_tokens = [len(ids) + 1 for ids in draft_token_ids]
         flattened_draft_token_ids = sum(draft_token_ids, [])
         num_tokens = len(flattened_draft_token_ids)
 
-        draft_token_ids_tensor = torch.tensor(flattened_draft_token_ids,
-                                              dtype=torch.int32,
-                                              device=device)
+        draft_token_ids_tensor = torch.tensor(
+            flattened_draft_token_ids, dtype=torch.int32, device=device
+        )
         cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32)
-        cu_num_draft_tokens_tensor = torch.from_numpy(cu_num_draft_tokens).to(
-            device)
+        cu_num_draft_tokens_tensor = torch.from_numpy(cu_num_draft_tokens).to(device)
+        cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32)
+        cu_num_sampled_tokens_tensor = torch.from_numpy(cu_num_sampled_tokens).to(
+            device
+        )
 
-        target_logits_indices = torch.zeros(num_tokens,
-                                            dtype=torch.int32,
-                                            device=device)
-        bonus_logits_indices = torch.zeros(batch_size,
-                                           dtype=torch.int32,
-                                           device=device)
-        logits_indices = torch.zeros(num_tokens + batch_size,
-                                     dtype=torch.int32,
-                                     device=device)
+        target_logits_indices = torch.zeros(
+            num_tokens, dtype=torch.int32, device=device
+        )
+        bonus_logits_indices = torch.zeros(batch_size, dtype=torch.int32, device=device)
+        logits_indices = torch.zeros(
+            num_tokens + batch_size, dtype=torch.int32, device=device
+        )
         return cls(
             draft_token_ids=draft_token_ids_tensor,
             num_draft_tokens=num_draft_tokens,
             cu_num_draft_tokens=cu_num_draft_tokens_tensor,
+            cu_num_sampled_tokens=cu_num_sampled_tokens_tensor,
             target_logits_indices=target_logits_indices,
             bonus_logits_indices=bonus_logits_indices,
             logits_indices=logits_indices,
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index 282e6f65e7abe53afe964ab4a613a4aacd2c9cb6..79d856a143ba9fed340ed17c2a13e8475af84e51 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -3,7 +3,6 @@
 
 import time
 from dataclasses import dataclass, field
-from typing import Optional
 
 import numpy as np
 import prometheus_client
@@ -31,8 +30,10 @@ class SpecDecodingStats:
 
     @classmethod
     def new(cls, num_spec_tokens: int) -> "SpecDecodingStats":
-        return cls(num_spec_tokens=num_spec_tokens,
-                   num_accepted_tokens_per_pos=[0] * num_spec_tokens)
+        return cls(
+            num_spec_tokens=num_spec_tokens,
+            num_accepted_tokens_per_pos=[0] * num_spec_tokens,
+        )
 
     def observe_draft(self, num_draft_tokens: int, num_accepted_tokens: int):
         self.num_drafts += 1
@@ -64,10 +65,10 @@ class SpecDecodingLogging:
     def observe(self, spec_decoding_stats: SpecDecodingStats):
         self.num_drafts.append(spec_decoding_stats.num_drafts)
         self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)
-        self.num_accepted_tokens.append(
-            spec_decoding_stats.num_accepted_tokens)
+        self.num_accepted_tokens.append(spec_decoding_stats.num_accepted_tokens)
         self.accepted_tokens_per_pos_lists.append(
-            spec_decoding_stats.num_accepted_tokens_per_pos)
+            spec_decoding_stats.num_accepted_tokens_per_pos
+        )
 
     def log(self, log_fn=logger.info):
         if not self.num_drafts:
@@ -83,8 +84,11 @@ class SpecDecodingLogging:
             draft_throughput = num_draft_tokens / elapsed_time
             accepted_throughput = num_accepted_tokens / elapsed_time
 
-        draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens *
-                                 100 if num_draft_tokens > 0 else float("nan"))
+        draft_acceptance_rate = (
+            num_accepted_tokens / num_draft_tokens * 100
+            if num_draft_tokens > 0
+            else float("nan")
+        )
 
         # Conventionally, mean acceptance length includes the bonus token
         mean_acceptance_length = 1 + (num_accepted_tokens / num_drafts)
@@ -138,7 +142,7 @@ class SpecDecodingProm:
 
     def __init__(
         self,
-        speculative_config: Optional[SpeculativeConfig],
+        speculative_config: SpeculativeConfig | None,
         labelnames: list[str],
         per_engine_labelvalues: dict[int, list[str]],
     ):
@@ -149,27 +153,36 @@ class SpecDecodingProm:
         counter_drafts = self._counter_cls(
             name="vllm:spec_decode_num_drafts",
             documentation="Number of spec decoding drafts.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_spec_decode_num_drafts = make_per_engine(
-            counter_drafts, per_engine_labelvalues)
+            counter_drafts, per_engine_labelvalues
+        )
 
         counter_draft_tokens = self._counter_cls(
             name="vllm:spec_decode_num_draft_tokens",
             documentation="Number of draft tokens.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_spec_decode_num_draft_tokens = make_per_engine(
-            counter_draft_tokens, per_engine_labelvalues)
+            counter_draft_tokens, per_engine_labelvalues
+        )
 
         counter_accepted_tokens = self._counter_cls(
             name="vllm:spec_decode_num_accepted_tokens",
             documentation="Number of accepted tokens.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_spec_decode_num_accepted_tokens = make_per_engine(
-            counter_accepted_tokens, per_engine_labelvalues)
+            counter_accepted_tokens, per_engine_labelvalues
+        )
 
         assert speculative_config is not None
-        num_spec_tokens = (speculative_config.num_speculative_tokens
-                           if self.spec_decoding_enabled else 0)
+        num_spec_tokens = (
+            speculative_config.num_speculative_tokens
+            if self.spec_decoding_enabled
+            else 0
+        )
         pos_labelnames = labelnames + ["position"]
         base_counter = self._counter_cls(
             name="vllm:spec_decode_num_accepted_tokens_per_pos",
@@ -177,33 +190,33 @@ class SpecDecodingProm:
             labelnames=pos_labelnames,
         )
         self.counter_spec_decode_num_accepted_tokens_per_pos: dict[
-            int, list[prometheus_client.Counter]] = {
-                idx: [
-                    base_counter.labels(*lv, str(pos))
-                    for pos in range(num_spec_tokens)
-                ]
-                for idx, lv in per_engine_labelvalues.items()
-            }
-
-    def observe(self,
-                spec_decoding_stats: SpecDecodingStats,
-                engine_idx: int = 0):
+            int, list[prometheus_client.Counter]
+        ] = {
+            idx: [base_counter.labels(*lv, str(pos)) for pos in range(num_spec_tokens)]
+            for idx, lv in per_engine_labelvalues.items()
+        }
+
+    def observe(self, spec_decoding_stats: SpecDecodingStats, engine_idx: int = 0):
         if not self.spec_decoding_enabled:
             return
         self.counter_spec_decode_num_drafts[engine_idx].inc(
-            spec_decoding_stats.num_drafts)
+            spec_decoding_stats.num_drafts
+        )
         self.counter_spec_decode_num_draft_tokens[engine_idx].inc(
-            spec_decoding_stats.num_draft_tokens)
+            spec_decoding_stats.num_draft_tokens
+        )
         self.counter_spec_decode_num_accepted_tokens[engine_idx].inc(
-            spec_decoding_stats.num_accepted_tokens)
+            spec_decoding_stats.num_accepted_tokens
+        )
         for pos, counter in enumerate(
-                self.
-                counter_spec_decode_num_accepted_tokens_per_pos[engine_idx]):
+            self.counter_spec_decode_num_accepted_tokens_per_pos[engine_idx]
+        ):
             counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos])
 
 
-def make_per_engine(counter: prometheus_client.Counter,
-                    per_engine_labelvalues: dict[int, list[str]]):
+def make_per_engine(
+    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[str]]
+):
     """Create a counter for each label value."""
     return {
         idx: counter.labels(*labelvalues)
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index aed050a3540c4e9fecdbbbecbb9f112c0e0238a2..378937dba9882a6c8cc95a1774f20ba45ec7cb37 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -9,7 +9,6 @@ from vllm.config import VllmConfig
 
 
 class NgramProposer:
-
     def __init__(self, vllm_config: VllmConfig):
         assert vllm_config.speculative_config is not None
         assert vllm_config.speculative_config.prompt_lookup_min is not None
@@ -28,8 +27,7 @@ class NgramProposer:
 
         # Pre-allocate buffers for numba batch propose.
         max_num_seqs = vllm_config.scheduler_config.max_num_seqs
-        self.valid_ngram_draft = np.zeros((max_num_seqs, self.k),
-                                          dtype=np.int32)
+        self.valid_ngram_draft = np.zeros((max_num_seqs, self.k), dtype=np.int32)
         self.valid_ngram_num_drafts = np.zeros((max_num_seqs), dtype=np.int32)
 
         # Threshold of total number of tokens in the batch to enable
@@ -55,9 +53,13 @@ class NgramProposer:
 
         # Trigger Numba JIT compilation for N-gram proposer.
         # This usually takes less than 1 second.
-        self.propose([[]] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32),
-                     np.zeros((1024, self.max_model_len), dtype=np.int32),
-                     set())
+        self.propose(
+            [np.array([])] * 1024,
+            [""] * 1024,
+            np.zeros(1024, dtype=np.int32),
+            np.zeros((1024, self.max_model_len), dtype=np.int32),
+            set(),
+        )
 
     def batch_propose(
         self,
@@ -67,20 +69,20 @@ class NgramProposer:
         token_ids_cpu: np.ndarray,
     ) -> list[list[int]]:
         """Batch version of ngram proposer using numba for acceleration.
-        
+
         Args:
-            valid_ngram_requests: 
+            valid_ngram_requests:
                 Set of indices of requests that need ngram proposals.
-            num_tokens_no_spec: 
-                Numpy array of shape (batch_size,) representing the number 
+            num_tokens_no_spec:
+                Numpy array of shape (batch_size,) representing the number
                 of tokens without speculative tokens for each request.
-            token_ids_cpu: 
-                Numpy array of shape (batch_size, max_model_len) 
+            token_ids_cpu:
+                Numpy array of shape (batch_size, max_model_len)
                 representing the token IDs for each request.
 
         Returns:
-            list[list[int]]: 
-                A list where each element is a list of proposed 
+            list[list[int]]:
+                A list where each element is a list of proposed
                 token IDs for the corresponding request.
         """
         draft_token_ids: list[list[int]] = []
@@ -96,26 +98,32 @@ class NgramProposer:
             total_tokens = np.sum(num_tokens_no_spec)
             if total_tokens >= self.num_tokens_threshold:
                 final_num_threads = max(
-                    1, min(self.num_numba_thread_available,
-                           num_ngram_requests))
+                    1, min(self.num_numba_thread_available, num_ngram_requests)
+                )
                 set_num_threads(final_num_threads)
             else:
                 set_num_threads(1)
 
-            batch_propose_numba(valid_ngram_requests, num_tokens_no_spec,
-                                token_ids_cpu, self.min_n, self.max_n,
-                                self.max_model_len, self.k,
-                                self.valid_ngram_draft,
-                                self.valid_ngram_num_drafts)
+            batch_propose_numba(
+                valid_ngram_requests,
+                num_tokens_no_spec,
+                token_ids_cpu,
+                self.min_n,
+                self.max_n,
+                self.max_model_len,
+                self.k,
+                self.valid_ngram_draft,
+                self.valid_ngram_num_drafts,
+            )
 
             # Restore original number of threads.
             set_num_threads(original_num_numba_threads)
 
         for i in range(num_requests):
-            if i in valid_ngram_requests and \
-                self.valid_ngram_num_drafts[i] > 0:
-                draft_token_ids.append(self.valid_ngram_draft[
-                    i, :self.valid_ngram_num_drafts[i]].tolist())
+            if i in valid_ngram_requests and self.valid_ngram_num_drafts[i] > 0:
+                draft_token_ids.append(
+                    self.valid_ngram_draft[i, : self.valid_ngram_num_drafts[i]].tolist()
+                )
             else:
                 draft_token_ids.append([])
 
@@ -123,17 +131,16 @@ class NgramProposer:
 
     def propose(
         self,
-        sampled_token_ids: list[list[int]],
+        sampled_token_ids: list[np.ndarray],
         req_ids: list[str],
         num_tokens_no_spec: np.ndarray,
         token_ids_cpu: np.ndarray,
         spec_decode_unsupported_reqs: set,
     ) -> list[list[int]]:
-
         # find which requests need ngram proposals
         valid_ngram_requests = []
         for i, sampled_ids in enumerate(sampled_token_ids):
-            num_sampled_ids = len(sampled_ids)
+            num_sampled_ids = sampled_ids.shape[0]
             if not num_sampled_ids:
                 # Skip speculative decoding.
                 continue
@@ -166,12 +173,17 @@ class NgramProposer:
 
 
 @njit(parallel=True)
-def batch_propose_numba(valid_ngram_requests: list,
-                        num_tokens_no_spec: np.ndarray,
-                        token_ids_cpu: np.ndarray, min_n: int, max_n: int,
-                        max_model_len: int, k: int,
-                        valid_ngram_draft: np.ndarray,
-                        valid_ngram_num_drafts: np.ndarray):
+def batch_propose_numba(
+    valid_ngram_requests: list,
+    num_tokens_no_spec: np.ndarray,
+    token_ids_cpu: np.ndarray,
+    min_n: int,
+    max_n: int,
+    max_model_len: int,
+    k: int,
+    valid_ngram_draft: np.ndarray,
+    valid_ngram_num_drafts: np.ndarray,
+):
     for i in prange(len(valid_ngram_requests)):
         idx = valid_ngram_requests[i]
         num_tokens = num_tokens_no_spec[idx]
@@ -181,19 +193,22 @@ def batch_propose_numba(valid_ngram_requests: list,
             min_ngram=min_n,
             max_ngram=max_n,
             max_model_len=max_model_len,
-            k=k)
+            k=k,
+        )
 
         valid_ngram_num_drafts[i] = drafter_output.shape[0]
         if len(drafter_output):
-            valid_ngram_draft[i, :drafter_output.shape[0]] = drafter_output
+            valid_ngram_draft[i, : drafter_output.shape[0]] = drafter_output
 
 
 @jit(nopython=True)
-def _find_longest_matched_ngram_and_propose_tokens(origin_tokens: np.ndarray,
-                                                   min_ngram: int,
-                                                   max_ngram: int,
-                                                   max_model_len: int,
-                                                   k: int) -> np.ndarray:
+def _find_longest_matched_ngram_and_propose_tokens(
+    origin_tokens: np.ndarray,
+    min_ngram: int,
+    max_ngram: int,
+    max_model_len: int,
+    k: int,
+) -> np.ndarray:
     """
     Find the longest n-gram which matches the suffix of the given tokens
     whose length is within [min_ngram, max_ngram] (inclusive).
@@ -203,12 +218,12 @@ def _find_longest_matched_ngram_and_propose_tokens(origin_tokens: np.ndarray,
     # Do not generate draft tokens is context is shorter than minimum n-gram
     total_token = origin_tokens.shape[0]
     if total_token < min_ngram:
-        return np.empty((0, ), dtype=origin_tokens.dtype)
+        return np.empty((0,), dtype=origin_tokens.dtype)
 
     # Do not generate draft tokens beyond the max model length.
     k = min(k, max_model_len - total_token)
     if k <= 0:
-        return np.empty((0, ), dtype=origin_tokens.dtype)
+        return np.empty((0,), dtype=origin_tokens.dtype)
 
     # Flip tokens, and the goal become to find longest ngram
     # on the rightmost position which matches the prefix with
@@ -265,7 +280,7 @@ def _find_longest_matched_ngram_and_propose_tokens(origin_tokens: np.ndarray,
 
     if longest_ngram < min_ngram:
         # No valid ngram is found
-        return np.empty((0, ), dtype=origin_tokens.dtype)
+        return np.empty((0,), dtype=origin_tokens.dtype)
 
     # Flip the position back, so in origin_tokens,
     # origin_tokens[total_token-1-position:total_token-1-position+longest_ngram]
@@ -273,4 +288,4 @@ def _find_longest_matched_ngram_and_propose_tokens(origin_tokens: np.ndarray,
     # total_token-1-position+longest_ngram
     start_position = total_token - 1 - position + longest_ngram
     k = min(k, total_token - start_position)
-    return origin_tokens[start_position:start_position + k]
+    return origin_tokens[start_position : start_position + k]
diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d76e0ffe778d419507969ee4e3a185703fcae395
--- /dev/null
+++ b/vllm/v1/spec_decode/suffix_decoding.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import numpy as np
+
+from vllm.config import VllmConfig
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+
+class SuffixDecodingProposer:
+    """
+    Speculative decoding proposer for Suffix Decoding (https://arxiv.org/pdf/2411.04975).
+    This class imports and uses the official implementation from Arctic Inference
+    (https://github.com/snowflakedb/ArcticInference).
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        config = vllm_config.speculative_config
+        self.num_speculative_tokens = config.num_speculative_tokens
+        self.max_tree_depth = config.suffix_decoding_max_tree_depth
+        self.max_spec_factor = config.suffix_decoding_max_spec_factor
+        self.min_token_prob = config.suffix_decoding_min_token_prob
+        self.max_model_len = vllm_config.model_config.max_model_len
+
+        # Lazy import to avoid error when Suffix Decoding is not used.
+        from arctic_inference.suffix_decoding import SuffixDecodingCache
+
+        # Initialize and empty cache. This object will take care of caching request
+        # outputs, evicting old requests, and manages the per-prompt suffix trees.
+        self.suffix_cache = SuffixDecodingCache(
+            max_tree_depth=config.suffix_decoding_max_tree_depth,
+            max_cached_requests=config.suffix_decoding_max_cached_requests,
+        )
+
+    def propose(
+        self,
+        input_batch: InputBatch,
+        sampled_token_ids: list[np.ndarray],
+    ) -> list[list[int]]:
+        """
+        Propose speculative tokens for each request in the input batch. Suffix Decoding
+        will speculate a dynamic number of tokens for each request every decoding step,
+        so each entry in the returned list may have different lengths.
+        """
+        draft_token_ids: list[np.ndarray] = []
+        for i, sampled_ids in enumerate(sampled_token_ids):
+            if sampled_ids.shape[0] == 0:
+                # Skip speculative decoding for partial prefills.
+                draft_token_ids.append([])
+                continue
+
+            # Skip requests that require sampling parameters that are not
+            # supported with speculative decoding.
+            req_id = input_batch.req_ids[i]
+            if req_id in input_batch.spec_decode_unsupported_reqs:
+                draft_token_ids.append([])
+                continue
+
+            num_tokens = input_batch.num_tokens_no_spec[i]
+            if num_tokens >= self.max_model_len:
+                # Skip requests that have already reached the max model length.
+                draft_token_ids.append([])
+                continue
+
+            index = input_batch.req_id_to_index[req_id]
+            if req_id not in self.suffix_cache.active_requests:
+                if req_id in self.suffix_cache.cached_requests:
+                    # Reset the suffix cache for this request.
+                    self.suffix_cache.evict_cached_response(req_id)
+                num_prompt_tokens = input_batch.num_prompt_tokens[index]
+                prompt_token_ids = input_batch.token_ids_cpu[index, :num_prompt_tokens]
+                # Start a new request, this will build the suffix tree for that prompt.
+                self.suffix_cache.start_request(req_id, prompt_token_ids)
+
+            # Append the newly sampled ids to the suffix cache for this request.
+            self.suffix_cache.add_active_response(req_id, sampled_ids.tolist())
+
+            # Suffix decoding only uses the most recent tokens up to max_tree_depth, so
+            # we extract the pattern from the end of the input.
+            start = max(0, num_tokens - self.max_tree_depth)
+            pattern = input_batch.token_ids_cpu[i, start:num_tokens]
+            draft = self.suffix_cache.speculate(
+                req_id,
+                pattern,
+                max_spec_tokens=min(
+                    self.num_speculative_tokens, self.max_model_len - num_tokens - 1
+                ),
+                max_spec_factor=self.max_spec_factor,
+                min_token_prob=self.min_token_prob,
+            )
+
+            draft_token_ids.append(draft.token_ids)
+
+        # Stop requests that were not seen in the input batch.
+        for req_id in (
+            self.suffix_cache.active_requests - input_batch.req_id_to_index.keys()
+        ):
+            self.suffix_cache.stop_request(req_id)
+
+        return draft_token_ids
+
+    def load_model(self, *args, **kwargs):
+        # No model to load.
+        pass
diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py
index 1116179dc5b61e923104e065e5da50970ce2568a..1901c6fc9f14f63cae93068a02627892e88429c9 100644
--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -7,8 +7,10 @@ _SAMPLING_EPS = 1e-5
 
 def is_spec_decode_unsupported(sampling_params: SamplingParams) -> bool:
     """True if request is incompatible with speculative decoding"""
-    return (sampling_params.frequency_penalty != 0.0
-            or sampling_params.presence_penalty != 0.0
-            or sampling_params.repetition_penalty != 1.0
-            or sampling_params.min_p > _SAMPLING_EPS
-            or sampling_params.logprobs is not None)
+    return (
+        sampling_params.frequency_penalty != 0.0
+        or sampling_params.presence_penalty != 0.0
+        or sampling_params.repetition_penalty != 1.0
+        or sampling_params.min_p > _SAMPLING_EPS
+        or sampling_params.logprobs is not None
+    )
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 13c33d3edf1415893f60fc8fba880d2b8e314a86..029129cf1a475b1dd29f267e4941a58bba61bee5 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -1,19 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import multiprocessing
 from concurrent.futures import Future, ThreadPoolExecutor
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
-from vllm.utils import LazyLoader
+from vllm.utils.import_utils import LazyLoader
 from vllm.v1.structured_output.backend_guidance import GuidanceBackend
-from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
-                                                     StructuredOutputGrammar)
+from vllm.v1.structured_output.backend_types import (
+    StructuredOutputBackend,
+    StructuredOutputGrammar,
+)
 from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
 
 if TYPE_CHECKING:
@@ -26,6 +26,9 @@ if TYPE_CHECKING:
 else:
     torch = LazyLoader("torch", globals(), "torch")
 
+    ReasoningParser = object
+    Request = object
+
 logger = init_logger(__name__)
 
 
@@ -33,11 +36,11 @@ class StructuredOutputManager:
     """Engine-level manager for structured output requests."""
 
     def __init__(self, vllm_config: VllmConfig):
-        self.backend: Optional[StructuredOutputBackend] = None
-        self.reasoner: Optional[ReasoningParser] = None
+        self.backend: StructuredOutputBackend | None = None
+        self.reasoner: ReasoningParser | None = None
         self.vllm_config = vllm_config
 
-        self._grammar_bitmask: Optional[torch.Tensor] = None
+        self._grammar_bitmask: torch.Tensor | None = None
         self._full_mask = torch.tensor(-1, dtype=torch.int32)
 
         max_batch_size = self.vllm_config.scheduler_config.max_num_seqs
@@ -48,8 +51,7 @@ class StructuredOutputManager:
             # - at least 1 CPU
             # - at most half the number of CPUs or 8, whichever is less
             max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
-            self.executor_for_fillmask = ThreadPoolExecutor(
-                max_workers=max_workers)
+            self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
 
         if not self.vllm_config.model_config.skip_tokenizer_init:
             # The default max_workers if not specified is the number of
@@ -60,21 +62,39 @@ class StructuredOutputManager:
             max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
             self.executor = ThreadPoolExecutor(max_workers=max_workers)
             self.tokenizer = init_tokenizer_from_configs(
-                model_config=self.vllm_config.model_config)
-            reasoning_parser = \
-                    self.vllm_config.structured_outputs_config.reasoning_parser
+                model_config=self.vllm_config.model_config
+            )
+            reasoning_parser = (
+                self.vllm_config.structured_outputs_config.reasoning_parser
+            )
+            reasoning_parser_plugin = (
+                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+            )
+            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+            reasoning_parser = (
+                self.vllm_config.structured_outputs_config.reasoning_parser
+            )
             if reasoning_parser:
                 reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser)
+                    reasoning_parser
+                )
                 self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
+        self.enable_in_reasoning = (
+            self.vllm_config.structured_outputs_config.enable_in_reasoning
+        )
+
     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return
 
         if TYPE_CHECKING:
-            assert request.sampling_params is not None and \
-                request.sampling_params.structured_outputs is not None
+            assert (
+                request.sampling_params is not None
+                and request.sampling_params.structured_outputs is not None
+            )
 
         # Initialize the backend the first time it is needed.
         #
@@ -98,8 +118,7 @@ class StructuredOutputManager:
                     vocab_size=vocab_size,
                 )
             elif backend == "outlines":
-                from vllm.v1.structured_output.backend_outlines import (
-                    OutlinesBackend)
+                from vllm.v1.structured_output.backend_outlines import OutlinesBackend
 
                 self.backend = OutlinesBackend(
                     self.vllm_config,
@@ -108,15 +127,16 @@ class StructuredOutputManager:
                 )
             elif backend == "lm-format-enforcer":
                 from vllm.v1.structured_output.backend_lm_format_enforcer import (  # noqa: E501
-                    LMFormatEnforcerBackend)
+                    LMFormatEnforcerBackend,
+                )
+
                 self.backend = LMFormatEnforcerBackend(
                     self.vllm_config,
                     tokenizer=self.tokenizer,
                     vocab_size=vocab_size,
                 )
             else:
-                raise ValueError(
-                    f"Unsupported structured output backend: {backend}")
+                raise ValueError(f"Unsupported structured output backend: {backend}")
 
         grammar = self.executor.submit(self._async_create_grammar, request)
         request.structured_output_request.grammar = grammar  # type: ignore[assignment]
@@ -160,17 +180,18 @@ class StructuredOutputManager:
     def grammar_bitmask(
         self,
         requests: dict[str, Request],
-        structured_output_request_ids: dict[str, int],
+        structured_output_request_ids: list[str],
         scheduled_spec_decode_tokens: dict[str, list[int]],
-    ) -> Optional[npt.NDArray[np.int32]]:
+    ) -> "npt.NDArray[np.int32] | None":
         # Prepare the structured output bitmask for this batch.
         if not structured_output_request_ids:
             return None
 
         max_num_spec_tokens = 0
         if self.vllm_config.speculative_config is not None:
-            max_num_spec_tokens = \
+            max_num_spec_tokens = (
                 self.vllm_config.speculative_config.num_speculative_tokens
+            )
 
         if self._grammar_bitmask is None:
             assert self.backend is not None
@@ -179,25 +200,25 @@ class StructuredOutputManager:
             # Allocate a bitmask for each token needing to be checked:
             # one for each speculative position, and one more for the
             # bonus token / non-speculative token.
-            self._grammar_bitmask = \
-                self.backend.allocate_token_bitmask(
-                    max_batch_size * (1 + max_num_spec_tokens))
+            self._grammar_bitmask = self.backend.allocate_token_bitmask(
+                max_batch_size * (1 + max_num_spec_tokens)
+            )
 
         # Generate a batched bitmask for all structured output requests.
         # When speculative decoding is enabled, we need to include multiple
         # masks for each request, one for each possible bonus token position.
         # These are stored inline in the tensor and unpacked by the gpu runner.
         cumulative_index = 0
-        ordered_seq = sorted(structured_output_request_ids.items(),
-                             key=lambda x: x[1])
 
         # Optimized parallel filling of bitmasks for
         # non-spec, large-batch-size cases
-        if len(ordered_seq) > self.fill_bitmask_parallel_threshold and \
-                max_num_spec_tokens == 0:
+        if (
+            len(structured_output_request_ids) > self.fill_bitmask_parallel_threshold
+            and max_num_spec_tokens == 0
+        ):
             promises = []
             batch = []
-            for req_id, _ in ordered_seq:
+            for req_id in structured_output_request_ids:
                 request = requests[req_id]
                 structured_output_request = request.structured_output_request
                 if TYPE_CHECKING:
@@ -205,8 +226,9 @@ class StructuredOutputManager:
                     assert structured_output_request.grammar is not None
 
                 apply_bitmask = self.should_fill_bitmask(request)
-                batch.append((structured_output_request.grammar,
-                              cumulative_index, apply_bitmask))
+                batch.append(
+                    (structured_output_request.grammar, cumulative_index, apply_bitmask)
+                )
                 if len(batch) == self.fill_bitmask_parallel_batch_size:
                     promises.append(self._async_submit_fill_bitmask(batch))
                     batch = []
@@ -220,7 +242,7 @@ class StructuredOutputManager:
                 promise.result()
         else:
             # Fallback to serial filling of bitmasks for small-batch-size cases
-            for req_id, _ in ordered_seq:
+            for req_id in structured_output_request_ids:
                 request = requests[req_id]
                 structured_output_request = request.structured_output_request
 
@@ -232,18 +254,29 @@ class StructuredOutputManager:
                 state_advancements = 0
                 req_tokens = scheduled_spec_decode_tokens.get(req_id, [])
                 for i, token in enumerate(req_tokens + [None]):
-                    self._fill_bitmasks([(structured_output_request.grammar,
-                                          cumulative_index, apply_bitmask)])
-
-                    if apply_bitmask and token is not None and \
-                        not structured_output_request.grammar.is_terminated():
-                        assert structured_output_request.grammar.accept_tokens(
-                            req_id, [token])
+                    self._fill_bitmasks(
+                        [
+                            (
+                                structured_output_request.grammar,
+                                cumulative_index,
+                                apply_bitmask,
+                            )
+                        ]
+                    )
+
+                    if (
+                        apply_bitmask
+                        and token is not None
+                        and not structured_output_request.grammar.is_terminated()
+                    ):
+                        accepted = structured_output_request.grammar.accept_tokens(
+                            req_id, [token]
+                        )
+                        assert accepted, (token, req_id, scheduled_spec_decode_tokens)
                         state_advancements += 1
                     cumulative_index += 1
                 if state_advancements > 0:
-                    structured_output_request.grammar.rollback(
-                        state_advancements)
+                    structured_output_request.grammar.rollback(state_advancements)
 
         bitmask_tensor = self._grammar_bitmask
         if cumulative_index < bitmask_tensor.shape[0]:
@@ -255,11 +288,18 @@ class StructuredOutputManager:
         return bitmask_tensor.numpy()
 
     def should_fill_bitmask(self, request: Request) -> bool:
+        # NOTE (Hanchen) if enable_in_reasoning is True, it means that
+        # the model needs to be constrained in reasoning. So we should always
+        # enable the bitmask filling.
+
         if self.reasoner is not None:
+            if self.enable_in_reasoning:
+                return True
             assert request.structured_output_request is not None
             if request.structured_output_request.reasoning_ended is None:
-                request.structured_output_request.reasoning_ended = \
+                request.structured_output_request.reasoning_ended = (
                     self.reasoner.is_reasoning_end(request.prompt_token_ids)
+                )
             return request.structured_output_request.reasoning_ended
         return True
 
@@ -274,22 +314,25 @@ class StructuredOutputManager:
             assert request.structured_output_request.grammar is not None
         # by default, we should always advance
         # for cases that don't use thinking mode.
-        if self.reasoner is not None:
-            structured_req = request.structured_output_request
-
-            if structured_req.reasoning_ended:
-                return True
+        if self.reasoner is None:
+            return True
 
-            # Check if reasoning ends in *this* step
-            if self.reasoner.is_reasoning_end(request.all_token_ids):
-                # Reasoning just ended, so we shouldn't advance til
-                # next pass
-                structured_req.reasoning_ended = True
+        # if the model needs structured in reasoning, we should advance
+        if self.enable_in_reasoning:
+            return True
 
-            return False
-        else:
+        structured_req = request.structured_output_request
+        if structured_req.reasoning_ended:
             return True
 
+        # Check if reasoning ends in *this* step
+        if self.reasoner.is_reasoning_end(request.all_token_ids):
+            # Reasoning just ended, so we shouldn't advance til
+            # next pass
+            structured_req.reasoning_ended = True
+
+        return False
+
     def clear_backend(self) -> None:
         if self.backend is not None:
             self.backend.destroy()
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index e06ab6377de3a3863275148bb715f0946ada73d8..2962a439dcb3e940043414abe9b4d37c79ad4985 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -1,22 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import copy
 import json
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 import torch
 
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.utils import LazyLoader
-from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
-                                                     StructuredOutputGrammar,
-                                                     StructuredOutputOptions)
+from vllm.utils.import_utils import LazyLoader
+from vllm.v1.structured_output.backend_types import (
+    StructuredOutputBackend,
+    StructuredOutputGrammar,
+    StructuredOutputOptions,
+)
 from vllm.v1.structured_output.request import get_structured_output_key
 
 if TYPE_CHECKING:
@@ -26,8 +26,7 @@ if TYPE_CHECKING:
 else:
     llguidance = LazyLoader("llguidance", globals(), "llguidance")
     llguidance_hf = LazyLoader("llguidance.hf", globals(), "llguidance.hf")
-    llguidance_torch = LazyLoader("llguidance.torch", globals(),
-                                  "llguidance.torch")
+    llguidance_torch = LazyLoader("llguidance.torch", globals(), "llguidance.torch")
 
 logger = init_logger(__name__)
 
@@ -36,16 +35,18 @@ def _walk_json_for_additional_properties(data: object):
     if isinstance(data, dict):
         for value in data.values():
             _walk_json_for_additional_properties(value)
-        if 'additionalProperties' not in data and \
-            ('properties' in data or 'patternProperties' in data):
-            data['additionalProperties'] = False
+        if "additionalProperties" not in data and (
+            "properties" in data or "patternProperties" in data
+        ):
+            data["additionalProperties"] = False
     elif isinstance(data, list):
         for item in data:
             _walk_json_for_additional_properties(item)
 
 
 def process_for_additional_properties(
-        guide_json: Union[str, dict[str, Any]]) -> dict[str, Any]:
+    guide_json: str | dict[str, Any],
+) -> dict[str, Any]:
     if isinstance(guide_json, str):
         guide_json_obj = json.loads(guide_json)
     else:
@@ -57,21 +58,27 @@ def process_for_additional_properties(
 
 @dataclass
 class GuidanceBackend(StructuredOutputBackend):
-
     def __post_init__(self):
-        self.disable_any_whitespace = \
+        self.disable_any_whitespace = (
             self.vllm_config.structured_outputs_config.disable_any_whitespace
-        self.disable_additional_properties = \
+        )
+        self.disable_additional_properties = (
             self.vllm_config.structured_outputs_config.disable_additional_properties
+        )
 
         self.ll_tokenizer = llguidance_hf.from_tokenizer(
-            self.tokenizer, self.vocab_size)
+            self.tokenizer, self.vocab_size
+        )
 
-    def compile_grammar(self, request_type: StructuredOutputOptions,
-                        grammar_spec: str) -> StructuredOutputGrammar:
+    def compile_grammar(
+        self, request_type: StructuredOutputOptions, grammar_spec: str
+    ) -> StructuredOutputGrammar:
         self.serialized_grammar = serialize_guidance_grammar(
-            request_type, grammar_spec, self.disable_any_whitespace,
-            self.disable_additional_properties)
+            request_type,
+            grammar_spec,
+            self.disable_any_whitespace,
+            self.disable_additional_properties,
+        )
 
         ll_matcher = llguidance.LLMatcher(
             self.ll_tokenizer,
@@ -90,7 +97,8 @@ class GuidanceBackend(StructuredOutputBackend):
 
     def allocate_token_bitmask(self, max_num_seqs: int):
         return llguidance_torch.allocate_token_bitmask(
-            max_num_seqs, self.ll_tokenizer.vocab_size)
+            max_num_seqs, self.ll_tokenizer.vocab_size
+        )
 
     def destroy(self):
         pass
@@ -103,6 +111,7 @@ class GuidanceGrammar(StructuredOutputGrammar):
     vocab_size: int
     printed_error: bool = False
     terminated: bool = False
+    rollback_lag: int = 0
 
     def check_error(self):
         if not self.printed_error:
@@ -119,6 +128,8 @@ class GuidanceGrammar(StructuredOutputGrammar):
         """
 
         if self.ll_tokenizer.eos_token in tokens:
+            if self.ll_matcher.is_stopped() and not self.terminated:
+                self.rollback_lag = 1
             self.terminated = True
 
         if self.ll_matcher.is_stopped():
@@ -155,8 +166,11 @@ class GuidanceGrammar(StructuredOutputGrammar):
         return tokens[:num_tokens]
 
     def rollback(self, num_tokens: int) -> None:
-        self.ll_matcher.rollback(num_tokens)
-        self.check_error()
+        if num_tokens > 0:
+            self.ll_matcher.rollback(num_tokens - self.rollback_lag)
+            self.terminated = False
+            self.rollback_lag = 0
+            self.check_error()
 
     def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
         # this will automatically return [EOS] mask if the matcher is stopped
@@ -174,19 +188,21 @@ class GuidanceGrammar(StructuredOutputGrammar):
 
 def serialize_guidance_grammar(
     request_type: StructuredOutputOptions,
-    grammar_spec: Union[str, dict[str, Any]],
+    grammar_spec: str | dict[str, Any],
     disable_any_whitespace: bool = False,
     disable_additional_properties: bool = False,
 ) -> str:
-
-    def _process_schema(grammar_spec: Union[str, dict[str, Any]], ) -> str:
+    def _process_schema(
+        grammar_spec: str | dict[str, Any],
+    ) -> str:
         if disable_additional_properties:
             grammar_spec = process_for_additional_properties(grammar_spec)
         return llguidance.LLMatcher.grammar_from_json_schema(
             grammar_spec,
             defaults={
                 "whitespace_flexible": not disable_any_whitespace,
-            })
+            },
+        )
 
     if request_type == StructuredOutputOptions.JSON:
         return _process_schema(grammar_spec)
@@ -195,7 +211,8 @@ def serialize_guidance_grammar(
             '{"type": "object"}',
             defaults={
                 "whitespace_flexible": not disable_any_whitespace,
-            })
+            },
+        )
     else:
         if request_type == StructuredOutputOptions.REGEX:
             tp = "regex"
@@ -215,30 +232,33 @@ def serialize_guidance_grammar(
                 trig = next((t for t in triggers if begin.startswith(t)), None)
                 if trig is None:
                     raise ValueError(
-                        f"Trigger {begin} not found in triggers {triggers}")
+                        f"Trigger {begin} not found in triggers {triggers}"
+                    )
                 tags.append(
                     llguidance.StructTag(
                         trigger=trig,
                         begin=s["begin"],
                         grammar=_process_schema(s["schema"]),
                         end=s["end"],
-                    ))
+                    )
+                )
             if not tags:
-                raise ValueError(
-                    "No structural tags found in the grammar spec.")
+                raise ValueError("No structural tags found in the grammar spec.")
             return llguidance.StructTag.to_grammar(tags)
         else:
-            logger.error("Validation should have already occurred. "
-                         "Please file an issue.")
-            raise ValueError("grammar is not of valid supported types. "
-                             f"({request_type!s})")
+            logger.error(
+                "Validation should have already occurred. Please file an issue."
+            )
+            raise ValueError(
+                f"grammar is not of valid supported types. ({request_type!s})"
+            )
         return llguidance.grammar_from(tp, grammar_spec)
 
 
 def validate_guidance_grammar(
-        sampling_params: SamplingParams,
-        tokenizer: Optional[llguidance.LLTokenizer] = None) -> None:
-    tp, grm = get_structured_output_key(sampling_params)
+    sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None
+) -> None:
+    tp, grm = get_structured_output_key(sampling_params.structured_outputs)
     guidance_grm = serialize_guidance_grammar(tp, grm)
     err = llguidance.LLMatcher.validate_grammar(guidance_grm, tokenizer)
     if err:
diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py
index 465b2428f89384b05e82ec56e2e764f56c4f0cf7..150c57feda0f0d681f623ba87d673c84f011efb9 100644
--- a/vllm/v1/structured_output/backend_lm_format_enforcer.py
+++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import ast
 import json
 from dataclasses import dataclass, field
@@ -12,27 +10,32 @@ import torch
 from transformers import PreTrainedTokenizerBase
 
 from vllm.sampling_params import SamplingParams
-from vllm.utils import LazyLoader
-from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
-                                                     StructuredOutputGrammar,
-                                                     StructuredOutputOptions)
+from vllm.utils.import_utils import LazyLoader
+from vllm.v1.structured_output.backend_types import (
+    StructuredOutputBackend,
+    StructuredOutputGrammar,
+    StructuredOutputOptions,
+)
 
 if TYPE_CHECKING:
     import lmformatenforcer
     import lmformatenforcer.integrations.vllm as lmfe_vllm
 else:
-    lmformatenforcer = LazyLoader("lmformatenforcer", globals(),
-                                  "lmformatenforcer")
-    lmfe_vllm = LazyLoader("lmformatenforcer.integrations.vllm", globals(),
-                           "lmformatenforcer.integrations.vllm")
+    lmformatenforcer = LazyLoader("lmformatenforcer", globals(), "lmformatenforcer")
+    lmfe_vllm = LazyLoader(
+        "lmformatenforcer.integrations.vllm",
+        globals(),
+        "lmformatenforcer.integrations.vllm",
+    )
 
 
 @lru_cache
 def _cached_build_vllm_token_enforcer_tokenizer_data(
-        tokenizer: PreTrainedTokenizerBase,
-        vocab_size: int) -> lmfe_vllm.TokenEnforcerTokenizerData:
+    tokenizer: PreTrainedTokenizerBase, vocab_size: int
+) -> "lmfe_vllm.TokenEnforcerTokenizerData":
     return lmfe_vllm.build_vllm_token_enforcer_tokenizer_data(
-        tokenizer, use_bitmask=True, vocab_size=vocab_size)
+        tokenizer, use_bitmask=True, vocab_size=vocab_size
+    )
 
 
 @dataclass
@@ -44,7 +47,8 @@ class LMFormatEnforcerGrammar(StructuredOutputGrammar):
         original_len = len(self.current_tokens_prefix)
         for token in tokens:
             if not self.token_enforcer.get_allowed_tokens(
-                    self.current_tokens_prefix).is_token_allowed(token):
+                self.current_tokens_prefix
+            ).is_token_allowed(token):
                 # Rollback partial updates to ensure atomicity.
                 del self.current_tokens_prefix[original_len:]
                 return False
@@ -56,8 +60,8 @@ class LMFormatEnforcerGrammar(StructuredOutputGrammar):
             prefix = tokens[:prefix_length]
             next_token = tokens[prefix_length]
             if not self.token_enforcer.get_allowed_tokens(
-                    self.current_tokens_prefix +
-                    prefix).is_token_allowed(next_token):
+                self.current_tokens_prefix + prefix
+            ).is_token_allowed(next_token):
                 break
         else:
             return tokens
@@ -69,14 +73,16 @@ class LMFormatEnforcerGrammar(StructuredOutputGrammar):
 
     def fill_bitmask(self, bitmask: torch.Tensor, batch_index: int) -> None:
         allowed_tokens = self.token_enforcer.get_allowed_tokens(
-            self.current_tokens_prefix)
+            self.current_tokens_prefix
+        )
         bitmask[batch_index] = allowed_tokens.allowed_tokens
 
     def is_terminated(self) -> bool:
         # We are considered terminated if the prefix ends with eos_token_id
-        return_value = len(
-            self.current_tokens_prefix) > 0 and self.current_tokens_prefix[
-                -1] == self.token_enforcer.eos_token_id
+        return_value = (
+            len(self.current_tokens_prefix) > 0
+            and self.current_tokens_prefix[-1] == self.token_enforcer.eos_token_id
+        )
         return return_value
 
     def reset(self):
@@ -85,18 +91,18 @@ class LMFormatEnforcerGrammar(StructuredOutputGrammar):
 
 @dataclass
 class LMFormatEnforcerBackend(StructuredOutputBackend):
-
     def __post_init__(self):
         self.tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
-            self.tokenizer, self.vocab_size)
+            self.tokenizer, self.vocab_size
+        )
 
-    def compile_grammar(self, request_type: StructuredOutputOptions,
-                        grammar_spec: str) -> StructuredOutputGrammar:
+    def compile_grammar(
+        self, request_type: StructuredOutputOptions, grammar_spec: str
+    ) -> StructuredOutputGrammar:
         character_level_parser: lmformatenforcer.CharacterLevelParser
         if request_type == StructuredOutputOptions.JSON:
             spec_dict = json.loads(grammar_spec)
-            character_level_parser = lmformatenforcer.JsonSchemaParser(
-                spec_dict)
+            character_level_parser = lmformatenforcer.JsonSchemaParser(spec_dict)
         elif request_type == StructuredOutputOptions.JSON_OBJECT:
             character_level_parser = lmformatenforcer.JsonSchemaParser(None)
         elif request_type == StructuredOutputOptions.REGEX:
@@ -104,14 +110,17 @@ class LMFormatEnforcerBackend(StructuredOutputBackend):
         elif request_type == StructuredOutputOptions.CHOICE:
             choices = ast.literal_eval(grammar_spec)
             character_level_parser = lmformatenforcer.UnionParser(
-                [lmformatenforcer.StringParser(choice) for choice in choices])
+                [lmformatenforcer.StringParser(choice) for choice in choices]
+            )
         else:
             raise ValueError(
-                "Invalid request type for LM Format Enforcer backend"
-                f"({request_type!s})")
+                f"Invalid request type for LM Format Enforcer backend({request_type!s})"
+            )
         max_rollback_tokens = (
             self.vllm_config.speculative_config.num_speculative_tokens
-            if self.vllm_config.speculative_config is not None else 0)
+            if self.vllm_config.speculative_config is not None
+            else 0
+        )
 
         if max_rollback_tokens > 0:
             raise ValueError(
@@ -136,8 +145,7 @@ class LMFormatEnforcerBackend(StructuredOutputBackend):
         pass
 
 
-def validate_structured_output_request_lm_format_enforcer(
-        params: SamplingParams):
+def validate_structured_output_request_lm_format_enforcer(params: SamplingParams):
     if params.structured_outputs is None:
         return
 
@@ -163,5 +171,7 @@ def validate_structured_output_request_lm_format_enforcer(
     elif so_params.choice:
         return
     elif so_params.grammar:
-        raise ValueError("LM Format Enforcer structured outputs backend "
-                         "does not support grammar specifications")
+        raise ValueError(
+            "LM Format Enforcer structured outputs backend "
+            "does not support grammar specifications"
+        )
diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py
index e5e638a6ad7649f108c9d32cc3bb601fed9f72c6..34916079f821aa684f479ea14a2e2e8aa05e0225 100644
--- a/vllm/v1/structured_output/backend_outlines.py
+++ b/vllm/v1/structured_output/backend_outlines.py
@@ -14,21 +14,24 @@ import torch
 from regex import escape as regex_escape
 
 from vllm.sampling_params import SamplingParams
-from vllm.utils import LazyLoader
-from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
-                                                     StructuredOutputGrammar,
-                                                     StructuredOutputOptions)
-from vllm.v1.structured_output.utils import (OutlinesVocabulary,
-                                             get_outlines_cache,
-                                             get_outlines_vocabulary)
+from vllm.utils.import_utils import LazyLoader
+from vllm.v1.structured_output.backend_types import (
+    StructuredOutputBackend,
+    StructuredOutputGrammar,
+    StructuredOutputOptions,
+)
+from vllm.v1.structured_output.utils import (
+    OutlinesVocabulary,
+    get_outlines_cache,
+    get_outlines_vocabulary,
+)
 
 if TYPE_CHECKING:
     import outlines_core as oc
     import outlines_core.json_schema as json_schema
 else:
     oc = LazyLoader("oc", globals(), "outlines_core")
-    json_schema = LazyLoader("json_schema", globals(),
-                             "outlines_core.json_schema")
+    json_schema = LazyLoader("json_schema", globals(), "outlines_core.json_schema")
 
 # Python 3.11+ sre_parse and sre_constants
 # are deprecated, so we must import them from re
@@ -46,13 +49,13 @@ else:
 
 @dataclass
 class OutlinesBackend(StructuredOutputBackend):
-
     def __post_init__(self):
         self.vocabulary = get_outlines_vocabulary(self.tokenizer)
         self.cache = get_outlines_cache()
 
-    def _compile_index(self, regex_string: str,
-                       vocabulary: OutlinesVocabulary) -> oc.Index:
+    def _compile_index(
+        self, regex_string: str, vocabulary: OutlinesVocabulary
+    ) -> oc.Index:
         cache_key = f"{vocabulary._hash}_{regex_string}"
         if cache_key in self.cache:
             return self.cache[cache_key]
@@ -62,8 +65,9 @@ class OutlinesBackend(StructuredOutputBackend):
 
         return index
 
-    def compile_grammar(self, request_type: StructuredOutputOptions,
-                        grammar_spec: str) -> StructuredOutputGrammar:
+    def compile_grammar(
+        self, request_type: StructuredOutputOptions, grammar_spec: str
+    ) -> StructuredOutputGrammar:
         if request_type == StructuredOutputOptions.JSON:
             regex = json_schema.build_regex_from_schema(grammar_spec)
         elif request_type == StructuredOutputOptions.REGEX:
@@ -79,10 +83,13 @@ class OutlinesBackend(StructuredOutputBackend):
         index = self._compile_index(regex, self.vocabulary)
         max_rollback_tokens = (
             self.vllm_config.speculative_config.num_speculative_tokens
-            if self.vllm_config.speculative_config is not None else 0)
-        return OutlinesGrammar(vocab_size=self.vocab_size,
-                               guide=oc.Guide(
-                                   index, max_rollback=max_rollback_tokens))
+            if self.vllm_config.speculative_config is not None
+            else 0
+        )
+        return OutlinesGrammar(
+            vocab_size=self.vocab_size,
+            guide=oc.Guide(index, max_rollback=max_rollback_tokens),
+        )
 
     def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:
         return torch.full(
@@ -98,20 +105,15 @@ class OutlinesBackend(StructuredOutputBackend):
 
 @dataclass
 class OutlinesGrammar(StructuredOutputGrammar):
-
     vocab_size: int
     guide: oc.Guide = field(hash=False)
-    num_processed_tokens: int = field(default_factory=lambda: 0,
-                                      repr=False,
-                                      hash=False,
-                                      init=False)
+    num_processed_tokens: int = field(
+        default_factory=lambda: 0, repr=False, hash=False, init=False
+    )
 
     # outlines_core signals done on DFA accept; vLLM expects done after EOS.
     # We delay the finished flag by one step so EOS can still be emitted.
-    _prev_finished: bool = field(default=False,
-                                 init=False,
-                                 repr=False,
-                                 hash=False)
+    _prev_finished: bool = field(default=False, init=False, repr=False, hash=False)
 
     def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
         """Accepts a list of tokens and advances the FSM.
@@ -142,8 +144,7 @@ class OutlinesGrammar(StructuredOutputGrammar):
 
     def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
         mask = bitmask[idx]
-        self.guide.write_mask_into(mask.data_ptr(), mask.numel(),
-                                   mask.element_size())
+        self.guide.write_mask_into(mask.data_ptr(), mask.numel(), mask.element_size())
 
     def is_terminated(self) -> bool:
         curr = self.guide.is_finished()
@@ -187,8 +188,10 @@ def validate_structured_output_request_outlines(params: SamplingParams):
         regex = "(" + "|".join(choices) + ")"
         validate_regex_is_buildable(regex)
     elif so_params.grammar:
-        raise ValueError("Outlines structured outputs backend "
-                         "does not support grammar specifications")
+        raise ValueError(
+            "Outlines structured outputs backend "
+            "does not support grammar specifications"
+        )
 
 
 def _prefix_needs_context(parsed) -> bool:
@@ -196,7 +199,7 @@ def _prefix_needs_context(parsed) -> bool:
 
     def subpattern_consumes(parsed) -> bool:
         """Return True if subpattern can consume at least one character."""
-        tokens = parsed.data if hasattr(parsed, 'data') else parsed
+        tokens = parsed.data if hasattr(parsed, "data") else parsed
         for ttype, tval in tokens:
             # literal, character class, or dot always consumes
             if ttype in (sre_parse.LITERAL, sre_parse.IN, sre_parse.ANY):
@@ -212,17 +215,18 @@ def _prefix_needs_context(parsed) -> bool:
                 if any(subpattern_consumes(br) for br in branches):
                     return True
             # grouped subpattern: recurse into its contents
-            elif ttype == sre_parse.SUBPATTERN and subpattern_consumes(
-                    tval[3]):
+            elif ttype == sre_parse.SUBPATTERN and subpattern_consumes(tval[3]):
                 return True
         # No consumers, return False
         return False
 
-    tokens = parsed.data if hasattr(parsed, 'data') else parsed
+    tokens = parsed.data if hasattr(parsed, "data") else parsed
     for ttype, tval in tokens:
         # Direct anchors or look-around
-        if ttype == sre_parse.AT or ttype in (sre_constants.ASSERT,
-                                              sre_constants.ASSERT_NOT):
+        if ttype == sre_parse.AT or ttype in (
+            sre_constants.ASSERT,
+            sre_constants.ASSERT_NOT,
+        ):
             return True
 
         # Nested subpattern: check
@@ -261,9 +265,8 @@ def _prefix_needs_context(parsed) -> bool:
 
 def _check_unsupported(parsed) -> None:
     """Check for regex features unsupported by regex-automata"""
-    tokens = parsed.data if hasattr(parsed, 'data') else parsed
+    tokens = parsed.data if hasattr(parsed, "data") else parsed
     for ttype, tval in tokens:
-
         # backreference
         if ttype in (sre_parse.GROUPREF, sre_parse.GROUPREF_EXISTS):
             raise ValueError("Backreferences are unsupported.")
@@ -274,8 +277,7 @@ def _check_unsupported(parsed) -> None:
 
         # unicode word boundaries
         elif ttype == sre_parse.AT:
-            if tval in (sre_constants.AT_BOUNDARY,
-                        sre_constants.AT_NON_BOUNDARY):
+            if tval in (sre_constants.AT_BOUNDARY, sre_constants.AT_NON_BOUNDARY):
                 raise ValueError("Unicode word boundaries are unsupported.")
 
         elif ttype == sre_parse.BRANCH:
@@ -308,7 +310,8 @@ def validate_regex_is_buildable(pattern: str) -> None:
         raise ValueError(
             f"Regex uses unsupported feature for structured outputs: {e}. "
             "Only basic matching constructs are supported—lookarounds, "
-            "backreferences, and unicode boundaries are not.") from e
+            "backreferences, and unicode boundaries are not."
+        ) from e
 
     if _prefix_needs_context(parsed):
         raise ValueError(
@@ -317,4 +320,5 @@ def validate_regex_is_buildable(pattern: str) -> None:
             "in a way which requires context before any token is matched."
             "structured outputs needs regexes that can match without needing "
             "that context. Try rewriting the pattern without using these "
-            f"constructs. Pattern:\n{pattern}")
+            f"constructs. Pattern:\n{pattern}"
+        )
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 9a53aa7a1ad1088bb635d8f559126e2b96f1efe9..7dc9589b63b86b69644f6c40e35ec82be1e3410c 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import enum
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -13,6 +11,9 @@ if TYPE_CHECKING:
 
     from vllm.config import VllmConfig
     from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    VllmConfig = object
+    AnyTokenizer = object
 
 
 class StructuredOutputOptions(enum.Enum):
@@ -69,7 +70,7 @@ class StructuredOutputGrammar(ABC):
         """
 
     @abstractmethod
-    def fill_bitmask(self, bitmask: torch.Tensor, batch_index: int) -> None:
+    def fill_bitmask(self, bitmask: "torch.Tensor", batch_index: int) -> None:
         """
         Fills the bitmask for a specific batch index.
 
@@ -103,8 +104,9 @@ class StructuredOutputBackend(ABC):
     vocab_size: int
 
     @abstractmethod
-    def compile_grammar(self, request_type: StructuredOutputOptions,
-                        grammar_spec: str) -> StructuredOutputGrammar:
+    def compile_grammar(
+        self, request_type: StructuredOutputOptions, grammar_spec: str
+    ) -> StructuredOutputGrammar:
         """
         Compiles a grammar specification into a structured output grammar.
 
@@ -118,7 +120,7 @@ class StructuredOutputBackend(ABC):
         """
 
     @abstractmethod
-    def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:
+    def allocate_token_bitmask(self, max_num_seqs: int) -> "torch.Tensor":
         """
         Allocates a token bitmask for the specified maximum number of sequences.
 
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index a853e6540719e6843edf0b869de0707b603d06c7..c9f2dc07da78698afedf7f0952a9bf0e934b5198 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import json
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
@@ -13,13 +11,17 @@ import vllm.envs
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-from vllm.utils import LazyLoader
-from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
-                                                     StructuredOutputGrammar,
-                                                     StructuredOutputOptions)
-from vllm.v1.structured_output.utils import (choice_as_grammar,
-                                             convert_lark_to_ebnf,
-                                             grammar_is_likely_lark)
+from vllm.utils.import_utils import LazyLoader
+from vllm.v1.structured_output.backend_types import (
+    StructuredOutputBackend,
+    StructuredOutputGrammar,
+    StructuredOutputOptions,
+)
+from vllm.v1.structured_output.utils import (
+    choice_as_grammar,
+    convert_lark_to_ebnf,
+    grammar_is_likely_lark,
+)
 
 if TYPE_CHECKING:
     import xgrammar as xgr
@@ -31,40 +33,25 @@ logger = init_logger(__name__)
 
 @dataclass
 class XgrammarBackend(StructuredOutputBackend):
-
     def __post_init__(self):
-        self.disable_any_whitespace = \
+        self.disable_any_whitespace = (
             self.vllm_config.structured_outputs_config.disable_any_whitespace
+        )
 
         if isinstance(self.tokenizer, MistralTokenizer):
             # NOTE: ideally, xgrammar should handle this accordingly.
             # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
-            try:
-                if self.tokenizer.is_tekken:
-                    encoded_vocab = self.tokenizer._vocab
-                else:
-                    encoded_vocab = [
-                        token for token, _ in sorted(
-                            self.tokenizer.get_vocab().items(),
-                            key=lambda x: x[1],
-                        )
-                    ]
-                stop_token_ids = None
-                if (hasattr(
-                        self.tokenizer,
-                        "eos_token_id",
-                ) and self.tokenizer.eos_token_id is not None):
-                    stop_token_ids = [self.tokenizer.eos_token_id]
-            except AttributeError as e:
-                raise ValueError(
-                    f"Cannot get the vocabulary of the tokenizer "
-                    f"{type(self.tokenizer)}. The tokenizer should have a "
-                    "get_vocab method.") from e
+            stop_token_ids = [self.tokenizer.eos_token_id]
+
+            # not self.tokenizer.vocab_size as self.tokenizer.vocab
+            # collapses all decoded errors into a single token.
+            self.vocab_size = len(self.tokenizer.vocab)
             tokenizer_info = xgr.TokenizerInfo(  # type: ignore
-                encoded_vocab=encoded_vocab,
+                encoded_vocab=self.tokenizer.vocab,
                 # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
                 vocab_type=xgr.VocabType.RAW
-                if self.tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK,
+                if self.tokenizer.is_tekken
+                else xgr.VocabType.BYTE_FALLBACK,
                 vocab_size=self.vocab_size,
                 stop_token_ids=stop_token_ids,
                 add_prefix_space=True,
@@ -83,40 +70,47 @@ class XgrammarBackend(StructuredOutputBackend):
 
         self.num_speculative_tokens = 0
         if self.vllm_config.speculative_config is not None:
-            self.num_speculative_tokens = \
+            self.num_speculative_tokens = (
                 self.vllm_config.speculative_config.num_speculative_tokens
+            )
 
-    def compile_grammar(self, request_type: StructuredOutputOptions,
-                        grammar_spec: str) -> StructuredOutputGrammar:
+    def compile_grammar(
+        self, request_type: StructuredOutputOptions, grammar_spec: str
+    ) -> StructuredOutputGrammar:
         if request_type == StructuredOutputOptions.JSON:
             ctx = self.compiler.compile_json_schema(
-                grammar_spec, any_whitespace=not self.disable_any_whitespace)
+                grammar_spec, any_whitespace=not self.disable_any_whitespace
+            )
         elif request_type == StructuredOutputOptions.JSON_OBJECT:
             ctx = self.compiler.compile_json_schema(
-                '{"type": "object"}',
-                any_whitespace=not self.disable_any_whitespace)
+                '{"type": "object"}', any_whitespace=not self.disable_any_whitespace
+            )
         elif request_type == StructuredOutputOptions.GRAMMAR:
             ctx = self.compiler.compile_grammar(grammar_spec)
         elif request_type == StructuredOutputOptions.REGEX:
             ctx = self.compiler.compile_regex(grammar_spec)
         elif request_type == StructuredOutputOptions.STRUCTURAL_TAG:
             s_tag = json.loads(grammar_spec)
-            tags = [
-                xgr.StructuralTagItem(
-                    begin=s["begin"],
-                    schema=json.dumps(s["schema"]),
-                    end=s["end"],
-                ) for s in s_tag["structures"]
-            ]
-            structural_tag = xgr.StructuralTag.from_legacy_structural_tag(
-                tags, s_tag["triggers"])
-            ctx = self.compiler.compile_structural_tag(structural_tag)
+            if "structures" in s_tag:
+                # Falling back to deprecated method of compiling structural tag
+                tags = [
+                    xgr.StructuralTagItem(
+                        begin=s["begin"],
+                        schema=json.dumps(s["schema"]),
+                        end=s["end"],
+                    )
+                    for s in s_tag["structures"]
+                ]
+                ctx = self.compiler.compile_structural_tag(tags, s_tag["triggers"])
+            else:
+                ctx = self.compiler.compile_structural_tag(grammar_spec)
         else:
             logger.error(
                 "Validation should have already occurred. Please file an issue."
             )
             raise ValueError(
-                f"grammar is not of valid supported types. ({request_type!s})")
+                f"grammar is not of valid supported types. ({request_type!s})"
+            )
 
         return XgrammarGrammar(
             matcher=xgr.GrammarMatcher(
@@ -146,10 +140,9 @@ class XgrammarGrammar(StructuredOutputGrammar):
     vocab_size: int
     matcher: xgr.GrammarMatcher = field(hash=False)
     ctx: xgr.CompiledGrammar = field(hash=False)
-    num_processed_tokens: int = field(default_factory=lambda: 0,
-                                      repr=False,
-                                      hash=False,
-                                      init=False)
+    num_processed_tokens: int = field(
+        default_factory=lambda: 0, repr=False, hash=False, init=False
+    )
     _is_terminated: bool = field(default=False, repr=False, hash=False)
 
     def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
@@ -164,7 +157,10 @@ class XgrammarGrammar(StructuredOutputGrammar):
             if not self.matcher.accept_token(token):
                 logger.error(
                     "Failed to advance FSM for request %s "
-                    "for tokens %s. Please file an issue.", request_id, token)
+                    "for tokens %s. Please file an issue.",
+                    request_id,
+                    token,
+                )
                 return False
             self.num_processed_tokens += 1
         self._is_terminated = self.matcher.is_terminated()
@@ -203,6 +199,25 @@ class XgrammarGrammar(StructuredOutputGrammar):
         self.matcher.reset()
 
 
+# cf https://github.com/mlc-ai/xgrammar/blob/a32ac892676d2eedc0327416105b9b06edfb94b2/cpp/json_schema_converter.cc
+STRING_SUPPORTED_FORMATS = {
+    "email",
+    "date",
+    "time",
+    "date-time",
+    "duration",
+    "ipv4",
+    "ipv6",
+    "hostname",
+    "uuid",
+    "uri",
+    "uri-reference",
+    "uri-template",
+    "json-pointer",
+    "relative-json-pointer",
+}
+
+
 def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
     """Check if JSON schema contains features unsupported by xgrammar."""
 
@@ -216,18 +231,29 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
 
         # Check for array unsupported keywords
         if obj.get("type") == "array" and any(
-                key in obj for key in ("uniqueItems", "contains",
-                                       "minContains", "maxContains")):
+            key in obj
+            for key in ("uniqueItems", "contains", "minContains", "maxContains")
+        ):
             return True
 
         # Unsupported keywords for strings
-        if obj.get("type") == "string" and "format" in obj:
+        if (
+            obj.get("type") == "string"
+            and "format" in obj
+            and obj["format"] not in STRING_SUPPORTED_FORMATS
+        ):
             return True
 
         # Unsupported keywords for objects
         if obj.get("type") == "object" and any(
-                key in obj for key in ("minProperties", "maxProperties",
-                                       "propertyNames", "patternProperties")):
+            key in obj
+            for key in (
+                "minProperties",
+                "maxProperties",
+                "propertyNames",
+                "patternProperties",
+            )
+        ):
             return True
 
         # Recursively check all nested objects and arrays
@@ -259,16 +285,18 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
         try:
             xgr.Grammar.from_regex(so_params.regex)
         except Exception as err:
-            raise ValueError("Failed to transform regex into a grammar: "
-                             f"{err}") from err
+            raise ValueError(
+                f"Failed to transform regex into a grammar: {err}"
+            ) from err
 
     if so_params.choice:
         choice_grammar = choice_as_grammar(so_params.choice)
         try:
             xgr.Grammar.from_ebnf(choice_grammar)
         except Exception as err:
-            raise ValueError("Failed to transform choices into a grammar: "
-                             "{err}") from err
+            raise ValueError(
+                "Failed to transform choices into a grammar: {err}"
+            ) from err
         so_params.choice = None
         so_params.grammar = choice_grammar
         return
@@ -285,12 +313,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
         try:
             xgr.Grammar.from_json_schema(schema)
         except Exception as err:
-            raise ValueError("Failed to transform json schema into a grammar: "
-                             f"{err}") from err
+            raise ValueError(
+                f"Failed to transform json schema into a grammar: {err}"
+            ) from err
 
         if has_xgrammar_unsupported_json_features(schema):
-            raise ValueError("The provided JSON schema contains features not "
-                             "supported by xgrammar.")
+            raise ValueError(
+                "The provided JSON schema contains features not supported by xgrammar."
+            )
         return
 
     if so_params.grammar:
@@ -300,7 +330,8 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
                 so_params.grammar = convert_lark_to_ebnf(so_params.grammar)
             except ValueError as e:
                 raise ValueError(
-                    "Failed to convert the grammar from Lark to EBNF. ") from e
+                    "Failed to convert the grammar from Lark to EBNF. "
+                ) from e
 
         # Test parsing EBNF grammar, possibly already converted from Lark
         try:
@@ -313,15 +344,19 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
     if so_params.structural_tag:
         try:
             s_tag = json.loads(so_params.structural_tag)
-            tags = [
-                xgr.StructuralTagItem(
-                    begin=s["begin"],
-                    schema=json.dumps(s["schema"]),
-                    end=s["end"],
-                ) for s in s_tag["structures"]
-            ]
-            structural_tag = xgr.StructuralTag.from_legacy_structural_tag(
-                tags, s_tag["triggers"])
-            xgr.Grammar.from_structural_tag(structural_tag)
+
+            # Using the deprecated method of compiling structural tag
+            if "structures" in s_tag:
+                tags = [
+                    xgr.StructuralTagItem(
+                        begin=s["begin"],
+                        schema=json.dumps(s["schema"]),
+                        end=s["end"],
+                    )
+                    for s in s_tag["structures"]
+                ]
+                xgr.Grammar.from_structural_tag(tags, s_tag["triggers"])
+            else:
+                xgr.Grammar.from_structural_tag(so_params.structural_tag)
         except Exception as e:
             raise ValueError("Invalid structural tag specification.") from e
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index 99974ef46ecd516d68404ac8a846a8a2dc93f137..94ae36a1abb4ffd2223ae64a493f3a8a6bcbdd3b 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -1,27 +1,39 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import dataclasses
 import functools
 import json
 from concurrent.futures import Future
 from concurrent.futures._base import TimeoutError
-from typing import Optional, Union, cast
+from typing import cast
 
-from vllm.sampling_params import SamplingParams
-from vllm.v1.structured_output.backend_types import (StructuredOutputGrammar,
-                                                     StructuredOutputKey,
-                                                     StructuredOutputOptions)
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.v1.structured_output.backend_types import (
+    StructuredOutputGrammar,
+    StructuredOutputKey,
+    StructuredOutputOptions,
+)
 
 
 @dataclasses.dataclass
 class StructuredOutputRequest:
+    params: StructuredOutputsParams
+    _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None
+    reasoning_ended: bool | None = None
 
-    sampling_params: SamplingParams
-    _grammar: Optional[Union[Future[StructuredOutputGrammar],
-                             StructuredOutputGrammar]] = None
-    reasoning_ended: Optional[bool] = None
+    @staticmethod
+    def from_sampling_params(
+        sampling_params: SamplingParams | None,
+    ) -> "StructuredOutputRequest | None":
+        if sampling_params is None:
+            return None
+        params = sampling_params.structured_outputs
+        if params:
+            if params.all_constraints_none():
+                return None
+            else:
+                return StructuredOutputRequest(params=params)
+        return None
 
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports
@@ -41,46 +53,42 @@ class StructuredOutputRequest:
         return self._check_grammar_completion()
 
     @property
-    def grammar(self) -> Optional[StructuredOutputGrammar]:
+    def grammar(self) -> StructuredOutputGrammar | None:
         completed = self._check_grammar_completion()
-        return cast(Optional[StructuredOutputGrammar],
-                    self._grammar) if completed else None
+        return (
+            cast(StructuredOutputGrammar | None, self._grammar) if completed else None
+        )
 
     @grammar.setter
     def grammar(
-        self, grammar: Union[StructuredOutputGrammar,
-                             Future[StructuredOutputGrammar]]
+        self, grammar: StructuredOutputGrammar | Future[StructuredOutputGrammar]
     ) -> None:
         self._grammar = grammar
 
     @functools.cached_property
     def structured_output_key(self) -> StructuredOutputKey:
-        return get_structured_output_key(self.sampling_params)
+        return get_structured_output_key(self.params)
 
 
-def get_structured_output_key(
-        sampling_params: SamplingParams) -> StructuredOutputKey:
-    params = sampling_params.structured_outputs
-    assert params is not None, "params can't be None."
+def get_structured_output_key(params: StructuredOutputsParams) -> StructuredOutputKey:
     if params.json is not None:
         if not isinstance(params.json, str):
             json_str = json.dumps(params.json)
         else:
             json_str = params.json
-        return (StructuredOutputOptions.JSON, json_str)
-    elif params.json_object:
-        return (StructuredOutputOptions.JSON_OBJECT, "")
-    elif params.regex is not None:
-        return (StructuredOutputOptions.REGEX, params.regex)
-    elif params.choice is not None:
+        return StructuredOutputOptions.JSON, json_str
+    if params.json_object:
+        return StructuredOutputOptions.JSON_OBJECT, ""
+    if params.regex is not None:
+        return StructuredOutputOptions.REGEX, params.regex
+    if params.choice is not None:
         if not isinstance(params.choice, str):
             json_str = json.dumps(params.choice)
         else:
             json_str = params.choice
-        return (StructuredOutputOptions.CHOICE, json_str)
-    elif params.grammar is not None:
-        return (StructuredOutputOptions.GRAMMAR, params.grammar)
-    elif params.structural_tag is not None:
-        return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag)
-    else:
-        raise ValueError("No valid structured output parameter found")
+        return StructuredOutputOptions.CHOICE, json_str
+    if params.grammar is not None:
+        return StructuredOutputOptions.GRAMMAR, params.grammar
+    if params.structural_tag is not None:
+        return StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag
+    raise ValueError("No valid structured output parameter found")
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index b9b09bea1e80f62cf30deedcb8df9b10fe7a5e20..d2d14fcfc43621bcdc7e0e4162d5e22b5b784a02 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 from __future__ import annotations
 
 import hashlib
@@ -16,7 +15,8 @@ from diskcache import Cache
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import LazyLoader
+from vllm.utils.import_utils import LazyLoader
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 
 if TYPE_CHECKING:
     import outlines_core as oc
@@ -25,7 +25,6 @@ if TYPE_CHECKING:
     import xgrammar as xgr
 
     from vllm.transformers_utils.tokenizer import AnyTokenizer
-    from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.worker.gpu_input_batch import InputBatch
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
@@ -37,6 +36,10 @@ else:
         "transformers.models.gpt2.tokenization_gpt2",
     )
 
+    AnyTokenizer = object
+    SchedulerOutput = object
+    InputBatch = object
+
 logger = init_logger(__name__)
 
 CACHE = None
@@ -44,9 +47,9 @@ CACHE = None
 
 def apply_grammar_bitmask(
     scheduler_output: SchedulerOutput,
+    grammar_output: GrammarOutput,
     input_batch: InputBatch,
     logits: torch.Tensor,
-    device: torch.device,
 ) -> None:
     """
     Apply grammar bitmask to output logits of the model with xgrammar function.
@@ -55,11 +58,10 @@ def apply_grammar_bitmask(
         scheduler_output (SchedulerOutput): The result of engine scheduling.
         input_batch (InputBatch): The input of model runner.
         logits (torch.Tensor): The output logits of model forward.
-        device (torch.device): The device that model runner running on.
     """
-    grammar_bitmask = scheduler_output.grammar_bitmask
-    if grammar_bitmask is None:
-        return
+    # Serialization of np.ndarray is much more efficient than a tensor,
+    # so we receive it in that format.
+    grammar_bitmask = grammar_output.grammar_bitmask
 
     # We receive the structured output bitmask from the scheduler,
     # compacted to contain bitmasks only for structured output requests.
@@ -76,45 +78,52 @@ def apply_grammar_bitmask(
     for req_id, batch_index in seq:
         logit_index = batch_index + cumulative_offset
         cumulative_offset += len(
-            scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
-        if req_id in scheduler_output.structured_output_request_ids:
+            scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
+        )
+        if req_id in grammar_output.structured_output_request_ids:
             struct_out_req_batch_indices[req_id] = logit_index
 
     out_indices = []
 
     # Reorder the bitmask to match the order of the requests in the batch.
-    sorted_bitmask = np.full(shape=(logits.shape[0], grammar_bitmask.shape[1]),
-                             fill_value=-1,
-                             dtype=grammar_bitmask.dtype)
+    sorted_bitmask = np.full(
+        shape=(logits.shape[0], grammar_bitmask.shape[1]),
+        fill_value=-1,
+        dtype=grammar_bitmask.dtype,
+    )
     cumulative_index = 0
-    seq = sorted(scheduler_output.structured_output_request_ids.items(),
-                 key=lambda x: x[1])
-    for req_id, _ in seq:
+    for req_id in grammar_output.structured_output_request_ids:
         num_spec_tokens = len(
-            scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
+            scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
+        )
         if req_id in struct_out_req_batch_indices:
             logit_index = struct_out_req_batch_indices[req_id]
             for i in range(1 + num_spec_tokens):
-                sorted_bitmask[logit_index + i] = \
-                    grammar_bitmask[cumulative_index + i]
+                sorted_bitmask[logit_index + i] = grammar_bitmask[cumulative_index + i]
                 out_indices.append(logit_index + i)
         cumulative_index += 1 + num_spec_tokens
-    grammar_bitmask = sorted_bitmask
+
+    # Copy async to device as tensor.
+    grammar_bitmask = torch.from_numpy(sorted_bitmask).to(
+        logits.device, non_blocking=True
+    )
 
     # If the length of out indices and the logits have the same shape
     # we don't need to pass indices to the kernel,
     # since the bitmask is already aligned with the logits.
     skip_out_indices = len(out_indices) == logits.shape[0]
 
-    # Serialization of np.ndarray is much more efficient than a tensor,
-    # so we receive it in that format.
-    grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
+    index_tensor = None
+    if not skip_out_indices:
+        # xgrammar expects a python list of indices but it will actually work with
+        # a tensor. If we copy the tensor ourselves here we can do it in a non_blocking
+        # manner and there should be no cpu sync within xgrammar.
+        index_tensor = torch.tensor(
+            out_indices, dtype=torch.int32, device="cpu", pin_memory=True
+        )
+        index_tensor = index_tensor.to(logits.device, non_blocking=True)
 
-    xgr.apply_token_bitmask_inplace(
-        logits,
-        grammar_bitmask.to(device, non_blocking=True),
-        indices=out_indices if not skip_out_indices else None,
-    )
+    xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
 
 
 class OutlinesVocabulary:
@@ -128,8 +137,7 @@ class OutlinesVocabulary:
         self.inner = vocabulary
         # Have to do abs(hash()) because python hashes can
         # be negative, and we are using hash as a cache key.
-        hex_str = hashlib.sha256(
-            vocabulary.__repr__().encode('utf-8')).hexdigest()
+        hex_str = hashlib.sha256(vocabulary.__repr__().encode("utf-8")).hexdigest()
         hash_int = int(hex_str, 16)
         self._hash = hash_int
 
@@ -165,16 +173,18 @@ def get_outlines_cache():
 
     cache_dir = get_outlines_cache_path()
     if envs.VLLM_V1_USE_OUTLINES_CACHE:
-        logger.warning("Enabling outlines cache. This is an unbounded on-disk "
-                       "cache. It may consume a lot of disk space and should "
-                       "not be used with untrusted clients.")
+        logger.warning(
+            "Enabling outlines cache. This is an unbounded on-disk "
+            "cache. It may consume a lot of disk space and should "
+            "not be used with untrusted clients."
+        )
         cache = Cache(cache_dir, eviction_policy="none", cull_limit=0)
         outlines_version = importlib.metadata.version("outlines_core")
 
-        cached_version = cache.get('__version__', None)
+        cached_version = cache.get("__version__", None)
         if cached_version != outlines_version:
             cache.clear()
-        cache.set('__version__', outlines_version)
+        cache.set("__version__", outlines_version)
         return cache
     else:
         return LRUCache(maxsize=128)
@@ -194,19 +204,17 @@ def _reduced_vocabulary(
         A Dict of token string -> equivalent token ids
     """
 
-    unicode_to_bytes = {
-        v: k
-        for k, v in tokenization_gpt2.bytes_to_unicode().items()
-    }
+    unicode_to_bytes = {v: k for k, v in tokenization_gpt2.bytes_to_unicode().items()}
 
     def convert_token_to_string(token: str) -> str:
-
         string = tokenizer.convert_tokens_to_string([token])
 
         # A hack to handle missing spaces to HF's Llama tokenizers
-        if (type(token) is str
-                and token.startswith(file_utils.SPIECE_UNDERLINE)
-                or token == "<0x20>"):
+        if (
+            type(token) is str
+            and token.startswith(file_utils.SPIECE_UNDERLINE)
+            or token == "<0x20>"
+        ):
             return " " + string
 
         return string
@@ -226,8 +234,7 @@ def _reduced_vocabulary(
                 # by this point.
                 token_bytes = bytes(token_str)  # type: ignore[arg-type]
 
-            elif "\ufffd" in token_str and not re_replacement_seq.match(
-                    token_str):
+            elif "\ufffd" in token_str and not re_replacement_seq.match(token_str):
                 # Handle tokens with invalid UTF-8 sequences.
                 if re_llama_byte_token.match(token):
                     # Llama-like tokenizers use <0xXX> for incomplete sequences.
@@ -238,12 +245,13 @@ def _reduced_vocabulary(
                     if None in byte_vals:
                         raise RuntimeError(
                             f"Cannot convert token `{token}`"
-                            f" ({token_idx}) to bytes: {token_str}")
+                            f" ({token_idx}) to bytes: {token_str}"
+                        )
                     # safe to ignore, since if None in byte_vals,
                     # an error is thrown.
                     token_bytes = bytes(byte_vals)  # type: ignore[arg-type]
             else:
-                token_bytes = token_str.encode('utf-8')
+                token_bytes = token_str.encode("utf-8")
 
             if token_idx != eos_token_id:
                 vocabulary.setdefault(token_bytes, []).append(token_idx)
@@ -254,16 +262,18 @@ def _reduced_vocabulary(
 
 
 def get_outlines_vocabulary(tokenizer: AnyTokenizer) -> oc.Vocabulary:
-    """Get the `Vocabulary` object for a given tokenizer.
-    """
+    """Get the `Vocabulary` object for a given tokenizer."""
     if hasattr(tokenizer, "_outlines_vocabulary"):
         return tokenizer._outlines_vocabulary  # type: ignore
 
     try:
-        if hasattr(
+        if (
+            hasattr(
                 tokenizer,
                 "eos_token_id",
-        ) and tokenizer.eos_token_id is not None:
+            )
+            and tokenizer.eos_token_id is not None
+        ):
             eos_token_id = tokenizer.eos_token_id
         else:
             raise ValueError(
@@ -272,17 +282,18 @@ def get_outlines_vocabulary(tokenizer: AnyTokenizer) -> oc.Vocabulary:
 
         reduced_vocab = _reduced_vocabulary(
             tokenizer,
-            eos_token_id  #type: ignore
+            eos_token_id,  # type: ignore
         )
-        vocabulary = OutlinesVocabulary(
-            oc.Vocabulary(eos_token_id, reduced_vocab))
+        vocabulary = OutlinesVocabulary(oc.Vocabulary(eos_token_id, reduced_vocab))
         tokenizer._outlines_vocabulary = vocabulary  # type: ignore
 
         return vocabulary
     except AttributeError as e:
-        raise ValueError(f"Cannot get the vocabulary of the tokenizer "
-                         f"({type(tokenizer)}). The tokenizer should have a "
-                         "get_vocab method.") from e
+        raise ValueError(
+            f"Cannot get the vocabulary of the tokenizer "
+            f"({type(tokenizer)}). The tokenizer should have a "
+            "get_vocab method."
+        ) from e
 
 
 def grammar_is_likely_lark(grammar_str: str) -> bool:
@@ -304,14 +315,14 @@ def grammar_is_likely_lark(grammar_str: str) -> bool:
     if not grammar_str or not isinstance(grammar_str, str):
         return False
 
-    for line in grammar_str.split('\n'):
+    for line in grammar_str.split("\n"):
         # Remove both comment styles
-        line = re.sub(r'(#|//).*$', '', line).strip()
+        line = re.sub(r"(#|//).*$", "", line).strip()
         if not line:
             continue
 
         # Look for EBNF rule definition
-        if '::=' in line:
+        if "::=" in line:
             return False
 
     return True
@@ -348,40 +359,41 @@ def convert_lark_to_ebnf(grammar_str: str) -> str:
 
     def clean_line(line: str) -> str:
         """Remove comments and whitespace from line."""
-        return re.sub(r'(#|//).*$', '', line).strip()
+        return re.sub(r"(#|//).*$", "", line).strip()
 
     def check_quotes(text: str, rule_name: str, line_num: int) -> None:
         """Validate quote matching in text."""
         if text.count("'") % 2 != 0 or text.count('"') % 2 != 0:
-            raise ValueError(
-                f"Mismatched quotes in {rule_name} on line {line_num}")
+            raise ValueError(f"Mismatched quotes in {rule_name} on line {line_num}")
 
     def extract_references(text: str) -> set[str]:
         """Extract rule references from text."""
         # Remove quoted strings and special characters
-        text = re.sub(r'"[^"]*"', '', text)
-        text = re.sub(r'[+*?()|\[\]{}]', ' ', text)
-        return set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', text))
+        text = re.sub(r'"[^"]*"', "", text)
+        text = re.sub(r"[+*?()|\[\]{}]", " ", text)
+        return set(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", text))
 
     # First pass: Find root rule and validate rule definitions
-    lines = [clean_line(line) for line in grammar_str.split('\n')]
+    lines = [clean_line(line) for line in grammar_str.split("\n")]
     first_rule = None
 
     for line_num, line in enumerate(lines, 1):
-        if not line or line.startswith('|'):
+        if not line or line.startswith("|"):
             continue
 
-        if ':' in line:
+        if ":" in line:
             try:
-                name = line.split(':', 1)[0].strip().strip('?')
+                name = line.split(":", 1)[0].strip().strip("?")
                 defined_rules.add(name)
                 if first_rule is None:
                     first_rule = name
-                if name == 'start':
-                    first_rule = 'start'
+                if name == "start":
+                    first_rule = "start"
             except IndexError as e:
-                raise ValueError(f"Invalid rule format on line {line_num}. "
-                                 "Expected 'rule_name: definition'") from e
+                raise ValueError(
+                    f"Invalid rule format on line {line_num}. "
+                    "Expected 'rule_name: definition'"
+                ) from e
 
     if not defined_rules:
         raise ValueError("No valid rules found in grammar")
@@ -398,29 +410,33 @@ def convert_lark_to_ebnf(grammar_str: str) -> str:
             continue
 
         try:
-            if ':' in line and not line.startswith('|'):
+            if ":" in line and not line.startswith("|"):
                 # Save previous rule if exists
                 if current_rule:
                     output_lines.append(
-                        f"{current_rule} ::= {' | '.join(current_definition)}")
+                        f"{current_rule} ::= {' | '.join(current_definition)}"
+                    )
 
                 # Process new rule
-                name, definition = line.split(':', 1)
-                current_rule = name.strip().strip('?')
+                name, definition = line.split(":", 1)
+                current_rule = name.strip().strip("?")
 
                 check_quotes(definition, f"rule '{current_rule}'", line_num)
                 definition = re.sub(r"'([^']*)'", r'"\1"', definition)
                 referenced_rules.update(extract_references(definition))
                 current_definition = [definition.strip()]
 
-            elif line.startswith('|'):
+            elif line.startswith("|"):
                 if not current_rule:
-                    raise ValueError(f"Alternative '|' on line {line_num} "
-                                     "without a preceding rule definition")
+                    raise ValueError(
+                        f"Alternative '|' on line {line_num} "
+                        "without a preceding rule definition"
+                    )
 
                 alt_def = line[1:].strip()
-                check_quotes(alt_def, f"alternative for rule '{current_rule}'",
-                             line_num)
+                check_quotes(
+                    alt_def, f"alternative for rule '{current_rule}'", line_num
+                )
                 alt_def = re.sub(r"'([^']*)'", r'"\1"', alt_def)
                 referenced_rules.update(extract_references(alt_def))
                 current_definition.append(alt_def)
@@ -430,25 +446,24 @@ def convert_lark_to_ebnf(grammar_str: str) -> str:
 
     # Add final rule if exists
     if current_rule:
-        output_lines.append(
-            f"{current_rule} ::= {' | '.join(current_definition)}")
+        output_lines.append(f"{current_rule} ::= {' | '.join(current_definition)}")
 
     # Validate all rules are defined
-    undefined_rules = referenced_rules - defined_rules - {'root'}
+    undefined_rules = referenced_rules - defined_rules - {"root"}
     if undefined_rules:
-        raise ValueError("Referenced rules are not defined: "
-                         f"{', '.join(sorted(undefined_rules))}")
+        raise ValueError(
+            f"Referenced rules are not defined: {', '.join(sorted(undefined_rules))}"
+        )
 
-    return '\n'.join(output_lines)
+    return "\n".join(output_lines)
 
 
 def choice_as_grammar(choice: list[str]) -> str:
-
     def escape_ebnf_string(s: str) -> str:
         """Escape special characters in a EBNF string."""
         # Escape double quotes and backslashes
-        return re.sub(r'(["\\])', r'\\\1', s)
+        return re.sub(r'(["\\])', r"\\\1", s)
 
     escaped_choices = (escape_ebnf_string(c) for c in choice)
-    grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
+    grammar = "root ::= " + " | ".join(f'"{c}"' for c in escaped_choices)
     return grammar
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index ec4417290f61103ddc821255c79d309de7d44606..29099d1e9b17ef4e1d16cc81f8fc249269fbe805 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -5,29 +5,34 @@ import contextlib
 import multiprocessing
 import time
 import weakref
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from contextlib import AbstractContextManager
 from multiprocessing import connection
 from multiprocessing.process import BaseProcess
-from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar,
-                    Union, overload)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generic,
+    Optional,
+    TypeVar,
+    Union,
+    overload,
+)
 
 import torch
 from torch.autograd.profiler import record_function
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
-                                  usage_message)
-from vllm.utils import (get_open_port, get_open_zmq_ipc_path, get_tcp_uri,
-                        kill_process_tree)
+from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message
+from vllm.utils.network_utils import get_open_port, get_open_zmq_ipc_path, get_tcp_uri
+from vllm.utils.system_utils import kill_process_tree
 
 if TYPE_CHECKING:
     import numpy as np
 
     from vllm.v1.engine.coordinator import DPCoordinator
-    from vllm.v1.engine.utils import (CoreEngineActorManager,
-                                      CoreEngineProcManager)
+    from vllm.v1.engine.utils import CoreEngineActorManager, CoreEngineProcManager
 
 logger = init_logger(__name__)
 
@@ -35,7 +40,6 @@ T = TypeVar("T")
 
 
 class ConstantList(Generic[T], Sequence):
-
     def __init__(self, x: list[T]) -> None:
         self._x = x
 
@@ -57,33 +61,25 @@ class ConstantList(Generic[T], Sequence):
     def clear(self):
         raise TypeError("Cannot clear a constant list")
 
-    def index(self,
-              item: T,
-              start: int = 0,
-              stop: Optional[int] = None) -> int:
-        return self._x.index(item, start,
-                             stop if stop is not None else len(self._x))
+    def index(self, item: T, start: int = 0, stop: int | None = None) -> int:
+        return self._x.index(item, start, stop if stop is not None else len(self._x))
 
     @overload
-    def __getitem__(self, item: int) -> T:
-        ...
+    def __getitem__(self, item: int) -> T: ...
 
     @overload
-    def __getitem__(self, s: slice, /) -> list[T]:
-        ...
+    def __getitem__(self, s: slice, /) -> list[T]: ...
 
-    def __getitem__(self, item: Union[int, slice]) -> Union[T, list[T]]:
+    def __getitem__(self, item: int | slice) -> T | list[T]:
         return self._x[item]
 
     @overload
-    def __setitem__(self, item: int, value: T):
-        ...
+    def __setitem__(self, item: int, value: T): ...
 
     @overload
-    def __setitem__(self, s: slice, value: T, /):
-        ...
+    def __setitem__(self, s: slice, value: T, /): ...
 
-    def __setitem__(self, item: Union[int, slice], value: Union[T, list[T]]):
+    def __setitem__(self, item: int | slice, value: T | list[T]):
         raise TypeError("Cannot set item in a constant list")
 
     def __delitem__(self, item):
@@ -101,23 +97,23 @@ class ConstantList(Generic[T], Sequence):
     def __repr__(self):
         return f"ConstantList({self._x})"
 
+    def copy(self) -> list[T]:
+        return self._x.copy()
+
 
 class CpuGpuBuffer:
     """Buffer to easily copy tensors between CPU and GPU."""
 
     def __init__(
         self,
-        *size: Union[int, torch.SymInt],
+        *size: int | torch.SymInt,
         dtype: torch.dtype,
         device: torch.device,
         pin_memory: bool,
         with_numpy: bool = True,
     ) -> None:
-        self.cpu = torch.zeros(*size,
-                               dtype=dtype,
-                               device="cpu",
-                               pin_memory=pin_memory)
-        self.gpu = self.cpu.to(device)
+        self.cpu = torch.zeros(*size, dtype=dtype, device="cpu", pin_memory=pin_memory)
+        self.gpu = torch.zeros_like(self.cpu, device=device)
         self.np: np.ndarray
         # To keep type hints simple (avoiding generics and subclasses), we
         # only conditionally create the numpy array attribute. This can cause
@@ -126,15 +122,16 @@ class CpuGpuBuffer:
             if dtype == torch.bfloat16:
                 raise ValueError(
                     "Bfloat16 torch tensors cannot be directly cast to a "
-                    "numpy array, so call CpuGpuBuffer with with_numpy=False")
+                    "numpy array, so call CpuGpuBuffer with with_numpy=False"
+                )
             self.np = self.cpu.numpy()
 
-    def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor:
+    def copy_to_gpu(self, n: int | None = None) -> torch.Tensor:
         if n is None:
             return self.gpu.copy_(self.cpu, non_blocking=True)
         return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
 
-    def copy_to_cpu(self, n: Optional[int] = None) -> torch.Tensor:
+    def copy_to_cpu(self, n: int | None = None) -> torch.Tensor:
         """NOTE: Because this method is non-blocking, explicit synchronization
         is needed to ensure the data is copied to CPU."""
         if n is None:
@@ -142,9 +139,7 @@ class CpuGpuBuffer:
         return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True)
 
 
-def get_engine_client_zmq_addr(local_only: bool,
-                               host: str,
-                               port: int = 0) -> str:
+def get_engine_client_zmq_addr(local_only: bool, host: str, port: int = 0) -> str:
     """Assign a new ZMQ socket address.
 
     If local_only is True, participants are colocated and so a unique IPC
@@ -153,8 +148,11 @@ def get_engine_client_zmq_addr(local_only: bool,
     Otherwise, the provided host and port will be used to construct a TCP
     address (port == 0 means assign an available port)."""
 
-    return get_open_zmq_ipc_path() if local_only else (get_tcp_uri(
-        host, port or get_open_port()))
+    return (
+        get_open_zmq_ipc_path()
+        if local_only
+        else (get_tcp_uri(host, port or get_open_port()))
+    )
 
 
 class APIServerProcessManager:
@@ -173,7 +171,7 @@ class APIServerProcessManager:
         num_servers: int,
         input_addresses: list[str],
         output_addresses: list[str],
-        stats_update_address: Optional[str] = None,
+        stats_update_address: str | None = None,
     ):
         """Initialize and start API server worker processes.
 
@@ -195,21 +193,23 @@ class APIServerProcessManager:
         spawn_context = multiprocessing.get_context("spawn")
         self.processes: list[BaseProcess] = []
 
-        for i, in_addr, out_addr in zip(range(num_servers), input_addresses,
-                                        output_addresses):
+        for i, in_addr, out_addr in zip(
+            range(num_servers), input_addresses, output_addresses
+        ):
             client_config = {
                 "input_address": in_addr,
                 "output_address": out_addr,
                 "client_count": num_servers,
-                "client_index": i
+                "client_index": i,
             }
             if stats_update_address is not None:
                 client_config["stats_update_address"] = stats_update_address
 
-            proc = spawn_context.Process(target=target_server_fn,
-                                         name=f"ApiServer_{i}",
-                                         args=(listen_address, sock, args,
-                                               client_config))
+            proc = spawn_context.Process(
+                target=target_server_fn,
+                name=f"ApiServer_{i}",
+                args=(listen_address, sock, args, client_config),
+            )
             self.processes.append(proc)
             proc.start()
 
@@ -224,10 +224,11 @@ class APIServerProcessManager:
 
 
 def wait_for_completion_or_failure(
-        api_server_manager: APIServerProcessManager,
-        engine_manager: Optional[Union["CoreEngineProcManager",
-                                       "CoreEngineActorManager"]] = None,
-        coordinator: Optional["DPCoordinator"] = None) -> None:
+    api_server_manager: APIServerProcessManager,
+    engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"]
+    | None = None,
+    coordinator: Optional["DPCoordinator"] = None,
+) -> None:
     """Wait for all processes to complete or detect if any fail.
 
     Raises an exception if any process exits with a non-zero status.
@@ -240,16 +241,14 @@ def wait_for_completion_or_failure(
         coordinator: The coordinator for data parallel.
     """
 
-    from vllm.v1.engine.utils import (CoreEngineActorManager,
-                                      CoreEngineProcManager)
+    from vllm.v1.engine.utils import CoreEngineActorManager, CoreEngineProcManager
 
     try:
         logger.info("Waiting for API servers to complete ...")
         # Create a mapping of sentinels to their corresponding processes
         # for efficient lookup
         sentinel_to_proc: dict[Any, BaseProcess] = {
-            proc.sentinel: proc
-            for proc in api_server_manager.processes
+            proc.sentinel: proc for proc in api_server_manager.processes
         }
 
         if coordinator:
@@ -265,8 +264,7 @@ def wait_for_completion_or_failure(
         # Check if any process terminates
         while sentinel_to_proc or actor_run_refs:
             # Wait for any process to terminate
-            ready_sentinels: list[Any] = connection.wait(sentinel_to_proc,
-                                                         timeout=5)
+            ready_sentinels: list[Any] = connection.wait(sentinel_to_proc, timeout=5)
 
             # Process any terminated processes
             for sentinel in ready_sentinels:
@@ -276,17 +274,18 @@ def wait_for_completion_or_failure(
                 if proc.exitcode != 0:
                     raise RuntimeError(
                         f"Process {proc.name} (PID: {proc.pid}) "
-                        f"died with exit code {proc.exitcode}")
+                        f"died with exit code {proc.exitcode}"
+                    )
 
             if actor_run_refs:
                 import ray
+
                 _, actor_run_refs = ray.wait(actor_run_refs, timeout=5)
 
     except KeyboardInterrupt:
         logger.info("Received KeyboardInterrupt, shutting down API servers...")
     except Exception as e:
-        logger.exception("Exception occurred while running API servers: %s",
-                         str(e))
+        logger.exception("Exception occurred while running API servers: %s", str(e))
         raise
     finally:
         logger.info("Terminating remaining processes ...")
@@ -319,8 +318,9 @@ def shutdown(procs: list[BaseProcess]):
             kill_process_tree(pid)
 
 
-def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
-               length: int) -> torch.Tensor:
+def copy_slice(
+    from_tensor: torch.Tensor, to_tensor: torch.Tensor, length: int
+) -> torch.Tensor:
     """
     Copy the first length elements of a tensor into another tensor in a
     non-blocking manner.
@@ -333,8 +333,8 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
 
 
 def report_usage_stats(
-        vllm_config,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None:
+    vllm_config, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT
+) -> None:
     """Report usage statistics if enabled."""
 
     if not is_usage_stats_enabled():
@@ -342,37 +342,41 @@ def report_usage_stats(
 
     from vllm.model_executor.model_loader import get_architecture_class_name
 
+    parallel_config = vllm_config.parallel_config
+
+    # Prepare KV connector string if applicable
+    kv_connector = None
+    if vllm_config.kv_transfer_config is not None:
+        kv_connector = vllm_config.kv_transfer_config.kv_connector
+
     usage_message.report_usage(
         get_architecture_class_name(vllm_config.model_config),
         usage_context,
         extra_kvs={
             # Common configuration
-            "dtype":
-            str(vllm_config.model_config.dtype),
-            "tensor_parallel_size":
-            vllm_config.parallel_config.tensor_parallel_size,
-            "block_size":
-            vllm_config.cache_config.block_size,
-            "gpu_memory_utilization":
-            vllm_config.cache_config.gpu_memory_utilization,
-            "kv_cache_memory_bytes":
-            vllm_config.cache_config.kv_cache_memory_bytes,
+            "dtype": str(vllm_config.model_config.dtype),
+            "block_size": vllm_config.cache_config.block_size,
+            "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
+            "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
             # Quantization
-            "quantization":
-            vllm_config.model_config.quantization,
-            "kv_cache_dtype":
-            str(vllm_config.cache_config.cache_dtype),
-
+            "quantization": vllm_config.model_config.quantization,
+            "kv_cache_dtype": str(vllm_config.cache_config.cache_dtype),
             # Feature flags
-            "enable_lora":
-            bool(vllm_config.lora_config),
-            "enable_prefix_caching":
-            vllm_config.cache_config.enable_prefix_caching,
-            "enforce_eager":
-            vllm_config.model_config.enforce_eager,
-            "disable_custom_all_reduce":
-            vllm_config.parallel_config.disable_custom_all_reduce,
-        })
+            "enable_lora": bool(vllm_config.lora_config),
+            "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
+            "enforce_eager": vllm_config.model_config.enforce_eager,
+            "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
+            # Distributed parallelism settings
+            "tensor_parallel_size": parallel_config.tensor_parallel_size,
+            "data_parallel_size": parallel_config.data_parallel_size,
+            "pipeline_parallel_size": parallel_config.pipeline_parallel_size,
+            "enable_expert_parallel": parallel_config.enable_expert_parallel,
+            # All2All backend for MoE expert parallel
+            "all2all_backend": parallel_config.all2all_backend,
+            # KV connector used
+            "kv_connector": kv_connector,
+        },
+    )
 
 
 _PROFILER_FUNC = None
@@ -390,7 +394,21 @@ def record_function_or_nullcontext(name: str) -> AbstractContextManager:
         func = record_function
     elif envs.VLLM_NVTX_SCOPES_FOR_PROFILING:
         import nvtx
+
         func = nvtx.annotate
 
     _PROFILER_FUNC = func
     return func(name)
+
+
+def tensor_data(tensor: torch.Tensor) -> memoryview:
+    """Get the raw data of a tensor as a uint8 memoryview, useful for
+    serializing and hashing.
+
+    Args:
+        tensor: The input tensor.
+
+    Returns:
+        A memoryview of the tensor data as uint8.
+    """
+    return tensor.flatten().contiguous().view(torch.uint8).numpy().data
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 82b6d1b514d50d0f7bbecc26ac9ded77cd711bac..9f6c19e464308152b1a1838436ae882cb3316de7 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -1,20 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Union
 
 import numpy as np
 import torch
 
 from vllm.distributed import get_dcp_group
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.utils import CpuGpuBuffer
 
 logger = init_logger(__name__)
 
 
 class BlockTable:
-
     def __init__(
         self,
         block_size: int,
@@ -23,21 +21,65 @@ class BlockTable:
         max_num_batched_tokens: int,
         pin_memory: bool,
         device: torch.device,
+        kernel_block_size: int,
+        dcp_kv_cache_interleave_size: int,
     ):
-        self.block_size = block_size
+        """
+        Args:
+            block_size: Block size used for KV cache memory allocation
+            max_num_reqs: Maximum number of concurrent requests supported.
+            max_num_blocks_per_req: Maximum number of blocks per request.
+            max_num_batched_tokens: Maximum number of tokens in a batch.
+            pin_memory: Whether to pin memory for faster GPU transfers.
+            device: Target device for the block table.
+            kernel_block_size: The block_size of underlying attention kernel.
+                Will be the same as `block_size` if `block_size` is supported
+                by the attention kernel.
+        """
         self.max_num_reqs = max_num_reqs
-        self.max_num_blocks_per_req = max_num_blocks_per_req
         self.max_num_batched_tokens = max_num_batched_tokens
         self.pin_memory = pin_memory
         self.device = device
 
-        self.block_table = self._make_buffer(max_num_reqs,
-                                             max_num_blocks_per_req,
-                                             dtype=torch.int32)
+        if kernel_block_size == block_size:
+            # Standard case: allocation and computation use same block size
+            # No block splitting needed, direct mapping
+            self.block_size = block_size
+            self.blocks_per_kv_block = 1
+            self.use_hybrid_blocks = False
+        else:
+            # Hybrid case: allocation block size differs from kernel block size
+            # Memory blocks are subdivided to match kernel requirements
+            # Example: 32-token memory blocks with 16-token kernel blocks
+            # → Each memory block corresponds to 2 kernel blocks
+            if block_size % kernel_block_size != 0:
+                raise ValueError(
+                    f"kernel_block_size {kernel_block_size} must divide "
+                    f"kv_manager_block_size size {block_size} evenly"
+                )
+
+            self.block_size = kernel_block_size
+            self.blocks_per_kv_block = block_size // kernel_block_size
+            self.use_hybrid_blocks = True
+
+        self.max_num_blocks_per_req = max_num_blocks_per_req * self.blocks_per_kv_block
+
+        self.block_table = self._make_buffer(
+            self.max_num_reqs, self.max_num_blocks_per_req, dtype=torch.int32
+        )
         self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32)
 
-        self.slot_mapping = self._make_buffer(self.max_num_batched_tokens,
-                                              dtype=torch.int64)
+        self.slot_mapping = self._make_buffer(
+            self.max_num_batched_tokens, dtype=torch.int64
+        )
+
+        if self.use_hybrid_blocks:
+            self._kernel_block_arange = np.arange(0, self.blocks_per_kv_block).reshape(
+                1, -1
+            )
+        else:
+            self._kernel_block_arange = None
+
         try:
             self.dcp_world_size = get_dcp_group().world_size
             self.dcp_rank = get_dcp_group().rank_in_group
@@ -45,6 +87,7 @@ class BlockTable:
             # DCP might not be initialized in testing
             self.dcp_world_size = 1
             self.dcp_rank = 0
+        self.dcp_kv_cache_interleave_size = dcp_kv_cache_interleave_size
 
     def append_row(
         self,
@@ -53,10 +96,16 @@ class BlockTable:
     ) -> None:
         if not block_ids:
             return
+
+        if self.use_hybrid_blocks:
+            block_ids = self.map_to_kernel_blocks(
+                np.array(block_ids), self.blocks_per_kv_block, self._kernel_block_arange
+            )
+
         num_blocks = len(block_ids)
         start = self.num_blocks_per_row[row_idx]
         self.num_blocks_per_row[row_idx] += num_blocks
-        self.block_table.np[row_idx, start:start + num_blocks] = block_ids
+        self.block_table.np[row_idx, start : start + num_blocks] = block_ids
 
     def add_row(self, block_ids: list[int], row_idx: int) -> None:
         self.num_blocks_per_row[row_idx] = 0
@@ -73,8 +122,9 @@ class BlockTable:
         self.num_blocks_per_row[src_tgt] = self.num_blocks_per_row[tgt_src]
         self.block_table.np[src_tgt] = self.block_table.np[tgt_src]
 
-    def compute_slot_mapping(self, req_indices: np.ndarray,
-                             positions: np.ndarray) -> None:
+    def compute_slot_mapping(
+        self, req_indices: np.ndarray, positions: np.ndarray
+    ) -> None:
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
         # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
         # where K is the max_num_blocks_per_req and the block size is 2.
@@ -89,28 +139,46 @@ class BlockTable:
             # Use a "virtual block" which equals to world_size * block_size
             # for block_table_indices calculation.
             virtual_block_size = self.block_size * self.dcp_world_size
-            block_table_indices = (req_indices * self.max_num_blocks_per_req +
-                                   positions // virtual_block_size)
+            block_table_indices = (
+                req_indices * self.max_num_blocks_per_req
+                + positions // virtual_block_size
+            )
+
             block_numbers = self.block_table.np.ravel()[block_table_indices]
             # Use virtual_block_size for mask calculation, which marks local
             # tokens.
             virtual_block_offsets = positions % virtual_block_size
-            mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
+            mask = (
+                virtual_block_offsets
+                // self.dcp_kv_cache_interleave_size
+                % self.dcp_world_size
+                == self.dcp_rank
+            )
             # Calculate local block_offsets
-            block_offsets = virtual_block_offsets // self.dcp_world_size
+            block_offsets = (
+                virtual_block_offsets
+                // (self.dcp_world_size * self.dcp_kv_cache_interleave_size)
+                * self.dcp_kv_cache_interleave_size
+                + virtual_block_offsets % self.dcp_kv_cache_interleave_size
+            )
             # Calculate slot_mapping
             slot_mapping = block_numbers * self.block_size + block_offsets
             # Write final slots, use -1 for not-local
-            self.slot_mapping.np[:req_indices.shape[0]] = np.where(
-                mask, slot_mapping, -1)
+            self.slot_mapping.np[: req_indices.shape[0]] = np.where(
+                mask, slot_mapping, -1
+            )
         else:
-            block_table_indices = (req_indices * self.max_num_blocks_per_req +
-                                   positions // self.block_size)
+            block_table_indices = (
+                req_indices * self.max_num_blocks_per_req + positions // self.block_size
+            )
+
             block_numbers = self.block_table.np.ravel()[block_table_indices]
             block_offsets = positions % self.block_size
-            np.add(block_numbers * self.block_size,
-                   block_offsets,
-                   out=self.slot_mapping.np[:req_indices.shape[0]])
+            np.add(
+                block_numbers * self.block_size,
+                block_offsets,
+                out=self.slot_mapping.np[: req_indices.shape[0]],
+            )
 
     def commit_block_table(self, num_reqs: int) -> None:
         self.block_table.copy_to_gpu(num_reqs)
@@ -122,6 +190,36 @@ class BlockTable:
         self.block_table.gpu.fill_(0)
         self.block_table.cpu.fill_(0)
 
+    @staticmethod
+    def map_to_kernel_blocks(
+        kv_manager_block_ids: np.ndarray,
+        blocks_per_kv_block: int,
+        kernel_block_arange: np.ndarray,
+    ) -> np.ndarray:
+        """Convert kv_manager_block_id IDs to kernel block IDs.
+
+        Example:
+            # kv_manager_block_ids: 32 tokens,
+            # Kernel block size: 16 tokens
+            # blocks_per_kv_block = 2
+            >>> kv_manager_block_ids = np.array([0, 1, 2])
+            >>> Result: [0, 1, 2, 3, 4, 5]
+
+            # Each kv_manager_block_id maps to 2 kernel block id:
+            # kv_manager_block_id 0 → kernel block id [0, 1]
+            # kv_manager_block_id 1 → kernel block id [2, 3]
+            # kv_manager_block_id 2 → kernel block id [4, 5]
+        """
+        if blocks_per_kv_block == 1:
+            return kv_manager_block_ids
+
+        kernel_block_ids = (
+            kv_manager_block_ids.reshape(-1, 1) * blocks_per_kv_block
+            + kernel_block_arange
+        )
+
+        return kernel_block_ids.reshape(-1)
+
     def get_device_tensor(self, num_reqs: int) -> torch.Tensor:
         """Returns the device tensor of the block table."""
         return self.block_table.gpu[:num_reqs]
@@ -134,25 +232,29 @@ class BlockTable:
         """Returns the numpy array of the block table."""
         return self.block_table.np
 
-    def _make_buffer(self, *size: Union[int, torch.SymInt],
-                     dtype: torch.dtype) -> CpuGpuBuffer:
-        return CpuGpuBuffer(*size,
-                            dtype=dtype,
-                            device=self.device,
-                            pin_memory=self.pin_memory)
+    def _make_buffer(
+        self, *size: int | torch.SymInt, dtype: torch.dtype
+    ) -> CpuGpuBuffer:
+        return CpuGpuBuffer(
+            *size, dtype=dtype, device=self.device, pin_memory=self.pin_memory
+        )
 
 
 class MultiGroupBlockTable:
     """The BlockTables for each KV cache group."""
 
-    def __init__(self,
-                 max_num_reqs: int,
-                 max_model_len: int,
-                 max_num_batched_tokens: int,
-                 pin_memory: bool,
-                 device: torch.device,
-                 block_sizes: list[int],
-                 num_speculative_tokens: int = 0) -> None:
+    def __init__(
+        self,
+        max_num_reqs: int,
+        max_model_len: int,
+        max_num_batched_tokens: int,
+        pin_memory: bool,
+        device: torch.device,
+        block_sizes: list[int],
+        kernel_block_sizes: list[int],
+        num_speculative_tokens: int = 0,
+        dcp_kv_cache_interleave_size: int = 1,
+    ) -> None:
         # Note(hc): each dcp rank only store
         # (max_model_len//dcp_world_size) tokens in kvcache,
         # so the block_size which used for calc max_num_blocks_per_req
@@ -163,16 +265,30 @@ class MultiGroupBlockTable:
             # DCP might not be initialized in testing
             dcp_world_size = 1
 
+        if len(kernel_block_sizes) != len(block_sizes):
+            raise ValueError(
+                f"kernel_block_sizes length ({len(kernel_block_sizes)}) "
+                f"must match block_sizes length ({len(block_sizes)})"
+            )
+
         self.block_tables = [
             BlockTable(
-                block_size, max_num_reqs,
-                max(cdiv(max_model_len, block_size * dcp_world_size),
-                    1 + num_speculative_tokens), max_num_batched_tokens,
-                pin_memory, device) for block_size in block_sizes
+                block_size,
+                max_num_reqs,
+                max(
+                    cdiv(max_model_len, block_size * dcp_world_size),
+                    1 + num_speculative_tokens,
+                ),
+                max_num_batched_tokens,
+                pin_memory,
+                device,
+                kernel_block_size,
+                dcp_kv_cache_interleave_size,
+            )
+            for block_size, kernel_block_size in zip(block_sizes, kernel_block_sizes)
         ]
 
-    def append_row(self, block_ids: tuple[list[int], ...],
-                   row_idx: int) -> None:
+    def append_row(self, block_ids: tuple[list[int], ...], row_idx: int) -> None:
         for i, block_table in enumerate(self.block_tables):
             block_table.append_row(block_ids[i], row_idx)
 
@@ -188,8 +304,9 @@ class MultiGroupBlockTable:
         for block_table in self.block_tables:
             block_table.swap_row(src, tgt)
 
-    def compute_slot_mapping(self, req_indices: np.ndarray,
-                             positions: np.ndarray) -> None:
+    def compute_slot_mapping(
+        self, req_indices: np.ndarray, positions: np.ndarray
+    ) -> None:
         for block_table in self.block_tables:
             block_table.compute_slot_mapping(req_indices, positions)
 
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 6a97f7ebc3fcdd59a596b957674f5ddceb12f852..40f011fed1ada7c1a3de86eccf59d8a4aad53560 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -9,18 +9,13 @@ import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
-from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1
 from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-
 logger = init_logger(__name__)
 
 
 class CPUModelRunner(GPUModelRunner):
-
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
@@ -33,55 +28,9 @@ class CPUModelRunner(GPUModelRunner):
 
         self._postprocess_tensors()
 
-    def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
-        """
-        Update the order of requests in the batch based on the attention
-        backend's needs. For example, some attention backends (namely MLA) may
-        want to separate requests based on if the attention computation will be
-        compute-bound or memory-bound.
-
-        Args:
-            scheduler_output: The scheduler output.
-        """
-        # Attention free models have zero kv_cache_groups, however models
-        # like Mamba are also attention free but use the kv_cache for
-        # keeping its internal state. This is why we check the number
-        # of kv_cache groups instead of solely checking
-        # for self.model_config.is_attention_free.
-        if len(self.kv_cache_config.kv_cache_groups) == 0:
-            return
-
-        if len(self.kv_cache_config.kv_cache_groups) > 1:
-            raise ValueError("Multiple KVCacheGroups is not"
-                             "currently supported with CPU model runner.")
-
-        # Guard against encoder-only / pooling models where `attn_groups`
-        # may be empty or lack the expected metadata_builder.
-        # Without this check, accessing `attn_groups[0][0]` would trigger
-        # an AssertionError on CPU backend.
-        if not hasattr(self, "attn_groups") or not self.attn_groups:
-            return
-        if not self.attn_groups[0]:
-            return
-
-        mb = getattr(self.attn_groups[0][0], "metadata_builders", None)
-        if isinstance(mb, list):
-            if not isinstance(mb[0], TorchSDPAMetadataBuilderV1):
-                return
-            mb[0].reorder_batch(self.input_batch, scheduler_output)
-            return
-        elif not isinstance(mb, TorchSDPAMetadataBuilderV1):
-            # Encoder-only / rerank models do not benefit from reordering,
-            # so we safely skip here.
-            return
-
-        # Safe path for decoder/attention-heavy models
-        mb.reorder_batch(self.input_batch, scheduler_output)
-
     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
-        def replace_tensor(obj: Any, cpu_attr_name: str,
-                           device_attr_name) -> None:
+        def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None:
             cpu_tensor = getattr(obj, cpu_attr_name, None)
             device_tensor = getattr(obj, device_attr_name, None)
             if cpu_tensor is not None and device_tensor is not None:
@@ -107,8 +56,7 @@ class CPUModelRunner(GPUModelRunner):
         self.model = get_model(vllm_config=self.vllm_config)
 
         if self.lora_config:
-            self.model = self.load_lora_model(self.model, self.vllm_config,
-                                              self.device)
+            self.model = self.load_lora_model(self.model, self.vllm_config, self.device)
 
     def get_model(self) -> nn.Module:
         return self.model
@@ -117,7 +65,13 @@ class CPUModelRunner(GPUModelRunner):
         logger.info("Warming up model for the compilation...")
         # Only generate graph for the generic shape
         with _set_global_compilation_settings(self.vllm_config):
-            self._dummy_run(max(16, self.max_num_reqs))
+            self._dummy_run(
+                min(
+                    max(16, self.max_num_reqs),
+                    self.scheduler_config.max_num_batched_tokens,
+                )
+            )
+
         logger.info("Warming up done.")
 
     def _init_device_properties(self) -> None:
@@ -126,26 +80,19 @@ class CPUModelRunner(GPUModelRunner):
     def _sync_device(self) -> None:
         pass
 
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
-        return sampled_token_ids.tolist()
-
-    def get_dp_padding(self,
-                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
+    def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]:
         # Note: For CPU backend, dp padding is not required for now.
         return 0, None
 
 
 @contextmanager
 def _torch_cuda_wrapper():
-
     class _EventPlaceholder:
-
         def __init__(self, *args, **kwargs) -> None:
             self.record = lambda: None
             self.synchronize = lambda: None
 
     class _StreamPlaceholder:
-
         def __init__(self, *args, **kwargs) -> None:
             pass
 
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index c6a686d6b75e159cdd62a5358af94fa9768a2f77..4420a057d1e5802d317d653238878b73baed6494 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import platform
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -13,28 +13,52 @@ from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
 from vllm.v1.worker.cpu_model_runner import CPUModelRunner
-from vllm.v1.worker.gpu_worker import (Worker,
-                                       init_worker_distributed_environment)
+from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 
 logger = init_logger(__name__)
 
 
 class CPUWorker(Worker):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 local_rank: int,
-                 rank: int,
-                 distributed_init_method: str,
-                 is_driver_worker: bool = False):
-        super().__init__(vllm_config,
-                         local_rank,
-                         rank,
-                         distributed_init_method,
-                         is_driver_worker=is_driver_worker)
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        is_driver_worker: bool = False,
+    ):
+        super().__init__(
+            vllm_config,
+            local_rank,
+            rank,
+            distributed_init_method,
+            is_driver_worker=is_driver_worker,
+        )
 
         self.parallel_config.disable_custom_all_reduce = True
 
+        if envs.VLLM_TORCH_PROFILER_DIR:
+            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+            )
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                ],
+                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
+                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
+                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False
+                ),
+            )
+        else:
+            self.profiler = None
+
     def init_device(self):
         # Setup OpenMP threads affinity.
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
@@ -43,48 +67,55 @@ class CPUWorker(Worker):
             if cpu_arch in (CpuArchEnum.POWERPC, CpuArchEnum.S390X):
                 # For S390X/POWERPC SMT-8/4/2
                 self.local_omp_cpuid = self._get_autobind_cpu_ids(
-                    lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4])
-            elif current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                    lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4]
+                )
+            elif cpu_arch == CpuArchEnum.X86:
                 # For x86 SMT-2, use 1 CPU per core
                 self.local_omp_cpuid = self._get_autobind_cpu_ids(
-                    lambda cpus: cpus[-1:])
+                    lambda cpus: cpus[-1:]
+                )
             else:
-                self.local_omp_cpuid = "all"
+                self.local_omp_cpuid = "nobind"
+        elif omp_cpuids == "nobind":
+            self.local_omp_cpuid = "nobind"
         else:
             local_dp_rank = self.parallel_config.data_parallel_rank_local
             omp_cpuids = omp_cpuids.split("|")
             if local_dp_rank is not None:
                 world_size = self.parallel_config.world_size
-                omp_cpuids = omp_cpuids[local_dp_rank *
-                                        world_size:(local_dp_rank + 1) *
-                                        world_size]
+                omp_cpuids = omp_cpuids[
+                    local_dp_rank * world_size : (local_dp_rank + 1) * world_size
+                ]
             self.local_omp_cpuid = omp_cpuids[self.rank]
 
-        if self.local_omp_cpuid != "all":
+        if self.local_omp_cpuid != "nobind":
             ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
             if ret:
                 logger.info(ret)
 
         # Note: unique identifier for creating allreduce shared memory
-        os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(
-            ":")[-1]
+        os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(":")[-1]
         # Initialize the distributed environment.
-        init_worker_distributed_environment(self.vllm_config, self.rank,
-                                            self.distributed_init_method,
-                                            self.local_rank,
-                                            current_platform.dist_backend)
+        init_worker_distributed_environment(
+            self.vllm_config,
+            self.rank,
+            self.distributed_init_method,
+            self.local_rank,
+            current_platform.dist_backend,
+        )
         # Set random seed.
         set_random_seed(self.model_config.seed)
 
         # Construct the model runner
         self.model_runner: CPUModelRunner = CPUModelRunner(
-            self.vllm_config, torch.device("cpu"))
+            self.vllm_config, torch.device("cpu")
+        )
 
     def sleep(self, level: int = 1) -> None:
         logger.warning("sleep mode is not supported on CPU, ignore it.")
         pass
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         logger.warning("sleep mode is not supported on CPU, ignore it.")
         pass
 
@@ -98,31 +129,31 @@ class CPUWorker(Worker):
         self.model_runner.warming_up_model()
 
     def _get_autobind_cpu_ids(
-        self, cpu_selector: Callable[[list[LogicalCPUInfo]],
-                                     list[LogicalCPUInfo]]
+        self, cpu_selector: Callable[[list[LogicalCPUInfo]], list[LogicalCPUInfo]]
     ) -> str:
         """
-        Return CPU ids to bind based on NUMA nodes. 
-        Currently for rank N, only CPU ids on the N-th node in available NUMA 
+        Return CPU ids to bind based on NUMA nodes.
+        Currently for rank N, only CPU ids on the N-th node in available NUMA
         node list will be selected.
         Args:
-            cpu_selector: a callable object to select CPUs from a CPU list 
+            cpu_selector: a callable object to select CPUs from a CPU list
             of a physical core. The input is a LogicalCPUInfo list, sorted by
-            the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be 
+            the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be
             returned.
         """
 
-        allowed_numa_nodes, logical_cpu_list = \
+        allowed_numa_nodes, logical_cpu_list = (
             CpuPlatform.get_allowed_cpu_core_node_list()
+        )
         assert len(allowed_numa_nodes) >= self.parallel_config.world_size, (
             f"No enough allowed NUMA nodes to bind threads of "
             f"{self.parallel_config.world_size} CPUWorkers. "
             f"Allowed NUMA nodes are {allowed_numa_nodes}. "
-            "Please try to bind threads manually.")
+            "Please try to bind threads manually."
+        )
 
-        # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]``
-        selected_numa_node = allowed_numa_nodes[
-            self.local_rank]  # type: ignore
+        # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
+        selected_numa_node = allowed_numa_nodes[self.local_rank]  # type: ignore
         logical_cpu_list = [
             x for x in logical_cpu_list if x.numa_node == selected_numa_node
         ]
@@ -142,15 +173,34 @@ class CPUWorker(Worker):
         # Reserve CPUs for other processes
         reserve_cpu_num = envs.VLLM_CPU_NUM_OF_RESERVED_CPU
         if reserve_cpu_num is None:
-            need_reserve = (self.parallel_config.world_size > 1 or
-                            self.parallel_config.data_parallel_size_local > 1)
+            need_reserve = (
+                self.parallel_config.world_size > 1
+                or self.parallel_config.data_parallel_size_local > 1
+            )
             reserve_cpu_num = 1 if need_reserve else 0
         assert len(logical_cpu_list) > reserve_cpu_num, (
             f"VLLM_CPU_NUM_OF_RESERVED_CPU ({reserve_cpu_num}) "
-            f"should less than {len(logical_cpu_list)}.")
+            f"should less than {len(logical_cpu_list)}."
+        )
         if reserve_cpu_num != 0:
             logical_cpu_list = logical_cpu_list[:-reserve_cpu_num]
 
-        logger.info("auto thread-binding list (id, physical core): %s",
-                    [(x.id, x.physical_core) for x in logical_cpu_list])
+        logger.info(
+            "auto thread-binding list (id, physical core): %s",
+            [(x.id, x.physical_core) for x in logical_cpu_list],
+        )
         return ",".join([str(x.id) for x in logical_cpu_list])
+
+    def profile(self, is_start: bool = True):
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        if is_start:
+            self.profiler.start()
+        else:
+            self.profiler.stop()
+            if self.local_rank == 0:
+                logger.info(
+                    self.profiler.key_averages().table(
+                        sort_by="self_cpu_time_total", row_limit=50
+                    )
+                )
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..464fbf11a21ad95b515cd5dd5131edc0624ea717
--- /dev/null
+++ b/vllm/v1/worker/dp_utils.py
@@ -0,0 +1,230 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from vllm.config import ParallelConfig
+from vllm.distributed.parallel_state import get_dp_group
+from vllm.logger import init_logger
+from vllm.v1.worker.ubatch_utils import (
+    UBatchSlices,
+    check_ubatch_thresholds,
+    create_ubatch_slices,
+    is_second_ubatch_empty,
+)
+
+logger = init_logger(__name__)
+
+
+def _get_device_and_group(parallel_config: ParallelConfig):
+    # Use the actual device assigned to the DP group, not just the device type
+    device = get_dp_group().device
+    group = get_dp_group().device_group
+
+    # Transfering this tensor from GPU to CPU will introduce a GPU sync
+    # point that could adversely affect performance of vllm with asynch
+    # scheduling. This environment variable exists to quickly disable
+    # this optimization if we run into this case.
+    if parallel_config.disable_nccl_for_dp_synchronization:
+        logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.")
+        device = "cpu"
+        group = get_dp_group().cpu_group
+    return device, group
+
+
+def _run_ar(
+    should_ubatch: bool,
+    should_dp_pad: bool,
+    orig_num_tokens_per_ubatch: int,
+    padded_num_tokens_per_ubatch: int,
+    parallel_config: ParallelConfig,
+) -> torch.Tensor:
+    dp_size = parallel_config.data_parallel_size
+    dp_rank = parallel_config.data_parallel_rank
+    device, group = _get_device_and_group(parallel_config)
+    tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
+    tensor[0][dp_rank] = orig_num_tokens_per_ubatch
+    tensor[1][dp_rank] = padded_num_tokens_per_ubatch
+    tensor[2][dp_rank] = 1 if should_ubatch else 0
+    tensor[3][dp_rank] = 1 if should_dp_pad else 0
+    dist.all_reduce(tensor, group=group)
+    return tensor
+
+
+def _post_process_ubatch(tensor: torch.Tensor) -> bool:
+    orig_num_tokens_tensor = tensor[0, :]
+    padded_num_tokens_tensor = tensor[1, :]
+
+    # First determine if we are going to be ubatching.
+    should_ubatch: bool = bool(torch.all(tensor[2] == 1).item())
+    if not should_ubatch:
+        return False
+    # If the DP ranks are planning to ubatch, make sure that
+    # there are no "empty" second ubatches
+    orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
+    padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
+    if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
+        logger.debug(
+            "Aborting ubatching %s %s", orig_min_num_tokens, padded_max_num_tokens
+        )
+        should_ubatch = False
+    return should_ubatch
+
+
+def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch.Tensor:
+    num_tokens_across_dp = tensor[1, :]
+    if should_dp_pad:
+        # If DP padding is enabled, ensure that each rank is processing the same number
+        # of tokens
+        max_num_tokens = int(num_tokens_across_dp.max().item())
+        return torch.tensor(
+            [max_num_tokens] * len(num_tokens_across_dp),
+            device="cpu",
+            dtype=torch.int32,
+        )
+    else:
+        return num_tokens_across_dp.cpu()
+
+
+def _synchronize_dp_ranks(
+    num_tokens_unpadded: int,
+    num_tokens_padded: int,
+    should_attempt_ubatching: bool,
+    should_attempt_dp_padding: bool,
+    parallel_config: ParallelConfig,
+) -> tuple[bool, torch.Tensor | None]:
+    """
+    1. Decides if each DP rank is going to microbatch. Either all ranks
+    run with microbatching or none of them do.
+
+    2. Determines the total number of tokens that each rank will run.
+    When running microbatched or if should_attempt_dp_padding is True, all
+    ranks will be padded out so that the run with the same number of tokens
+
+    Returns: tuple[
+        should_ubatch: Are all DP ranks going to microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including any DP padding.
+    ]
+
+    """
+    assert num_tokens_padded >= num_tokens_unpadded
+
+    # Coordinate between the DP ranks via an All Reduce
+    # to determine the total number of tokens that each rank
+    # will run and if we are using ubatching or not.
+    tensor = _run_ar(
+        should_ubatch=should_attempt_ubatching,
+        should_dp_pad=should_attempt_dp_padding,
+        orig_num_tokens_per_ubatch=num_tokens_unpadded,
+        padded_num_tokens_per_ubatch=num_tokens_padded,
+        parallel_config=parallel_config,
+    )
+
+    should_dp_pad = bool(torch.all(tensor[3] == 1).item())
+
+    # DP ranks should all have the same value for should_attempt_dp_padding.
+    assert should_attempt_dp_padding == should_dp_pad
+
+    # Check conditions for microbatching
+    should_ubatch = _post_process_ubatch(tensor)
+
+    if should_ubatch and not should_dp_pad:
+        logger.debug_once(
+            "Microbatching has been triggered and requires DP padding. "
+            "Enabling DP padding even though it has been explicitly "
+            "disabled.",
+            scope="global",
+        )
+        should_dp_pad = True
+
+    # Pad all DP ranks up to the maximum token count across ranks if
+    # should_dp_pad is True
+    num_tokens_after_padding = _post_process_dp_padding(
+        tensor,
+        should_dp_pad,
+    )
+
+    return should_ubatch, num_tokens_after_padding
+
+
+def coordinate_batch_across_dp(
+    num_tokens_unpadded: int,
+    allow_microbatching: bool,
+    allow_dp_padding: bool,
+    parallel_config: ParallelConfig,
+    num_tokens_padded: int | None = None,
+    uniform_decode: bool | None = None,
+    num_scheduled_tokens_per_request: np.ndarray | None = None,
+) -> tuple[UBatchSlices | None, torch.Tensor | None]:
+    """
+    Coordinates amongst all DP ranks to determine if and how the full batch
+    should be split into microbatches.
+
+    Args:
+        num_tokens_unpadded: Number of tokens without accounting for padding
+        allow_microbatching: If microbatching should be attempted
+        allow_dp_padding: If all DP ranks should be padded up to the same value
+        parallel_config: The parallel config
+        num_tokens_padded: Number of tokens including any non-DP padding (CUDA graphs,
+            TP, etc)
+        uniform_decode: Only used if allow_microbatching is True. True if the batch
+            only contains single token decodes
+        num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
+            number of tokens per request.
+
+    Returns: tuple[
+        ubatch_slices: if this is set then all DP ranks have agreed to
+        microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding. Will be
+        padded up to the max value across all DP ranks when allow_dp_padding
+        is True.
+    ]
+
+    """
+    if parallel_config.data_parallel_size == 1:
+        # Early exit.
+        return None, None
+
+    # If the caller has explicitly enabled microbatching.
+    should_attempt_ubatching = False
+    if allow_microbatching:
+        # Check preconditions for microbatching
+        assert uniform_decode is not None
+        should_attempt_ubatching = check_ubatch_thresholds(
+            parallel_config,
+            num_tokens_unpadded,
+            uniform_decode=uniform_decode,
+        )
+
+    if num_tokens_padded is None:
+        num_tokens_padded = num_tokens_unpadded
+
+    (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
+        num_tokens_unpadded,
+        num_tokens_padded,
+        should_attempt_ubatching,
+        allow_dp_padding,
+        parallel_config,
+    )
+
+    # Don't microbatch unless every other DP worker is also microbatching
+    if not should_ubatch:
+        return (None, num_tokens_after_padding)
+
+    # This doesn't actually pad the ubatch slices. It just initializes the
+    # split point to the padded value so that padding can be applied
+    # to the second ubatch in pad_out_ubatch_slice after attention
+    # metadata creation
+    assert num_tokens_after_padding is not None
+    token_split_point = int(num_tokens_after_padding[0].item()) // 2
+
+    assert num_scheduled_tokens_per_request is not None
+    ubatch_slices = create_ubatch_slices(
+        num_scheduled_tokens_per_request, token_split_point
+    )
+
+    return (ubatch_slices, num_tokens_after_padding)
diff --git a/vllm/v1/worker/ec_connector_model_runner_mixin.py b/vllm/v1/worker/ec_connector_model_runner_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..00bc909df2975d9ea3163ad65743e2e6855b215d
--- /dev/null
+++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Define EC connector functionality mixin for model runners.
+"""
+
+from collections.abc import Generator
+from contextlib import AbstractContextManager, contextmanager, nullcontext
+from typing import (
+    TYPE_CHECKING,  # noqa: UP035
+)
+
+import torch
+
+from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
+from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorBase
+from vllm.logger import init_logger
+from vllm.v1.outputs import ECConnectorOutput
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+logger = init_logger(__name__)
+
+
+# Defined as a EC connector functionality mixin for ModelRunner (GPU, TPU)
+class ECConnectorModelRunnerMixin:
+    @staticmethod
+    def maybe_save_ec_to_connector(
+        encoder_cache: dict[str, torch.Tensor],
+        mm_hash: str,
+    ):
+        if not has_ec_transfer():
+            logger.debug("Not have ec transfer please check")
+            return
+        connector = get_ec_transfer()
+        connector.save_caches(encoder_cache=encoder_cache, mm_hash=mm_hash)
+
+    @staticmethod
+    def get_finished_ec_transfers(
+        scheduler_output: "SchedulerOutput",
+    ) -> tuple[set[str] | None, set[str] | None]:
+        if has_ec_transfer():
+            return get_ec_transfer().get_finished(scheduler_output.finished_req_ids)
+        return None, None
+
+    @staticmethod
+    def maybe_get_ec_connector_output(
+        scheduler_output: "SchedulerOutput",
+        encoder_cache: dict[str, torch.Tensor],
+        **kwargs,
+    ) -> AbstractContextManager[ECConnectorOutput | None]:
+        return (
+            ECConnectorModelRunnerMixin._get_ec_connector_output(
+                scheduler_output, encoder_cache, **kwargs
+            )
+            if has_ec_transfer()
+            else nullcontext()
+        )
+
+    # This context manager must be used within an active forward context.
+    # It encapsulates the entire EC conector lifecycle within execute_model
+    @staticmethod
+    @contextmanager
+    def _get_ec_connector_output(
+        scheduler_output: "SchedulerOutput",
+        encoder_cache: dict[str, torch.Tensor],
+        **kwargs,
+    ) -> Generator[ECConnectorOutput, None, None]:
+        output = ECConnectorOutput()
+
+        ec_connector = get_ec_transfer()
+        assert isinstance(ec_connector, ECConnectorBase)
+        assert scheduler_output.ec_connector_metadata is not None
+        ec_connector.bind_connector_metadata(scheduler_output.ec_connector_metadata)
+
+        if not ec_connector.is_producer:
+            ec_connector.start_load_caches(encoder_cache, **kwargs)
+
+        try:
+            yield output
+        finally:
+            output.finished_sending, output.finished_recving = (
+                ec_connector.get_finished(scheduler_output.finished_req_ids)
+            )
+
+            ec_connector.clear_connector_metadata()
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 67fb9864b19c959db7cdd7903c2dca07f4b2aa0b..7cf6afa3fc37140f02ee5e2b1574e8e4362a77d4 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -3,22 +3,24 @@
 # Datastructures defining a GPU input batch
 
 from dataclasses import dataclass
-from typing import Optional, cast
+from typing import cast
 
 import numpy as np
 import torch
-from typing_extensions import deprecated
 
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
+from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values
+from vllm.utils import length_from_prompt_token_ids_or_embeds
+from vllm.utils.collection_utils import swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.pool.metadata import PoolingMetadata
-from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
-                                             LogitsProcessors,
-                                             MoveDirectionality)
+from vllm.v1.sample.logits_processor import (
+    BatchUpdateBuilder,
+    LogitsProcessors,
+    MoveDirectionality,
+)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice
@@ -27,57 +29,49 @@ from vllm.v1.worker.block_table import MultiGroupBlockTable
 
 @dataclass
 class CachedRequestState:
-
     req_id: str
-    prompt_token_ids: Optional[list[int]]
+    prompt_token_ids: list[int] | None
     mm_features: list[MultiModalFeatureSpec]
-    sampling_params: Optional[SamplingParams]
-    pooling_params: Optional[PoolingParams]
-    generator: Optional[torch.Generator]
+    sampling_params: SamplingParams | None
+    pooling_params: PoolingParams | None
+    generator: torch.Generator | None
 
     block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     output_token_ids: list[int]
 
-    mrope_positions: Optional[torch.Tensor] = None
-    mrope_position_delta: Optional[int] = None
+    mrope_positions: torch.Tensor | None = None
+    mrope_position_delta: int | None = None
+
+    lora_request: LoRARequest | None = None
+    prompt_embeds: torch.Tensor | None = None
 
-    lora_request: Optional[LoRARequest] = None
-    prompt_embeds: Optional[torch.Tensor] = None
+    # Used when both async_scheduling and spec_decode are enabled.
+    prev_num_draft_len: int = 0
 
     def __post_init__(self):
         self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
-            self.prompt_token_ids, self.prompt_embeds)
+            self.prompt_token_ids, self.prompt_embeds
+        )
 
     @property
     def num_tokens(self) -> int:
         return self.num_prompt_tokens + len(self.output_token_ids)
 
-    # Temporary back-compatibility for plugins that define model runner
-    @property
-    @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
-                "removed in v0.13. Please use `mm_kwargs` instead.")
-    def mm_inputs(self) -> list[MultiModalKwargsItems]:
-        return [
-            MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features
-            if f.data is not None
-        ]
-
     def get_token_id(self, idx: int) -> int:
         if idx < self.num_prompt_tokens:
             if self.prompt_token_ids is None:
                 raise ValueError(
                     f"Tried to access token index {idx}, but that token was "
-                    "provided via prompt_embeds, and its ID is unknown.")
+                    "provided via prompt_embeds, and its ID is unknown."
+                )
             return self.prompt_token_ids[idx]
-        elif idx - self.num_prompt_tokens < len(self.output_token_ids):
+        if idx - self.num_prompt_tokens < len(self.output_token_ids):
             return self.output_token_ids[idx - self.num_prompt_tokens]
-        else:
-            return -1
+        return -1
 
 
 class InputBatch:
-
     def __init__(
         self,
         max_num_reqs: int,
@@ -87,10 +81,13 @@ class InputBatch:
         pin_memory: bool,
         vocab_size: int,
         block_sizes: list[int],  # The block_size of each kv cache group
-        logitsprocs: Optional[LogitsProcessors] = None,
+        kernel_block_sizes: list[int],
+        logitsprocs: LogitsProcessors | None = None,
+        logitsprocs_need_output_token_ids: bool = False,
         is_spec_decode: bool = False,
         is_pooling_model: bool = False,
         num_speculative_tokens: int = 0,
+        dcp_kv_cache_interleave_size: int = 1,
     ):
         self.is_pooling_model = is_pooling_model
         self.is_spec_decode = is_spec_decode
@@ -101,7 +98,7 @@ class InputBatch:
         self.pin_memory = pin_memory
         self.vocab_size = vocab_size
 
-        self._req_ids: list[Optional[str]] = []
+        self._req_ids: list[str | None] = []
         self.req_id_to_index: dict[str, int] = {}
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
@@ -115,10 +112,10 @@ class InputBatch:
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
-        self.is_token_ids = torch.zeros((max_num_reqs, max_model_len),
-                                        device="cpu",
-                                        dtype=bool,
-                                        pin_memory=False)
+        self.is_token_ids_tensor = torch.zeros(
+            (max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False
+        )
+        self.is_token_ids = self.is_token_ids_tensor.numpy()
         # Store prompt embeddings per request to avoid OOM from large upfront
         # allocation if max_model_len is big.
         # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
@@ -127,13 +124,12 @@ class InputBatch:
         self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_computed_tokens_cpu_tensor = torch.zeros(
-            (max_num_reqs, ),
+            (max_num_reqs,),
             device="cpu",
             dtype=torch.int32,
             pin_memory=pin_memory,
         )
-        self.num_computed_tokens_cpu = \
-            self.num_computed_tokens_cpu_tensor.numpy()
+        self.num_computed_tokens_cpu = self.num_computed_tokens_cpu_tensor.numpy()
 
         # Block table.
         self.block_table = MultiGroupBlockTable(
@@ -143,38 +139,33 @@ class InputBatch:
             pin_memory=pin_memory,
             device=device,
             block_sizes=block_sizes,
+            kernel_block_sizes=kernel_block_sizes,
             num_speculative_tokens=num_speculative_tokens,
+            dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
         )
 
         # Sampling-related.
-        self.temperature = torch.empty((max_num_reqs, ),
-                                       dtype=torch.float32,
-                                       device=device)
-        self.temperature_cpu_tensor = torch.empty((max_num_reqs, ),
-                                                  dtype=torch.float32,
-                                                  device="cpu",
-                                                  pin_memory=pin_memory)
+        self.temperature = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device=device
+        )
+        self.temperature_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=pin_memory
+        )
         self.temperature_cpu = self.temperature_cpu_tensor.numpy()
         self.greedy_reqs: set[str] = set()
         self.random_reqs: set[str] = set()
 
-        self.top_p = torch.empty((max_num_reqs, ),
-                                 dtype=torch.float32,
-                                 device=device)
-        self.top_p_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.float32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
+        self.top_p = torch.empty((max_num_reqs,), dtype=torch.float32, device=device)
+        self.top_p_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=pin_memory
+        )
         self.top_p_cpu = self.top_p_cpu_tensor.numpy()
         self.top_p_reqs: set[str] = set()
 
-        self.top_k = torch.empty((max_num_reqs, ),
-                                 dtype=torch.int32,
-                                 device=device)
-        self.top_k_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.int32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
+        self.top_k = torch.empty((max_num_reqs,), dtype=torch.int32, device=device)
+        self.top_k_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.int32, device="cpu", pin_memory=pin_memory
+        )
         self.top_k_cpu = self.top_k_cpu_tensor.numpy()
         self.top_k_reqs: set[str] = set()
 
@@ -182,54 +173,43 @@ class InputBatch:
         self.spec_decode_unsupported_reqs: set[str] = set()
 
         # Frequency penalty related data structures
-        self.frequency_penalties = torch.empty((max_num_reqs, ),
-                                               dtype=torch.float,
-                                               device=device)
+        self.frequency_penalties = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device=device
+        )
         self.frequency_penalties_cpu_tensor = torch.empty(
-            (max_num_reqs, ),
-            dtype=torch.float,
-            device="cpu",
-            pin_memory=pin_memory)
-        self.frequency_penalties_cpu = \
-            self.frequency_penalties_cpu_tensor.numpy()
+            (max_num_reqs,), dtype=torch.float, device="cpu", pin_memory=pin_memory
+        )
+        self.frequency_penalties_cpu = self.frequency_penalties_cpu_tensor.numpy()
         self.frequency_penalties_reqs: set[str] = set()
 
         # Presence penalty related data structures
-        self.presence_penalties = torch.empty((max_num_reqs, ),
-                                              dtype=torch.float,
-                                              device=device)
-        self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ),
-                                                         dtype=torch.float,
-                                                         device="cpu",
-                                                         pin_memory=pin_memory)
-        self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy(
+        self.presence_penalties = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device=device
         )
+        self.presence_penalties_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device="cpu", pin_memory=pin_memory
+        )
+        self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy()
         self.presence_penalties_reqs: set[str] = set()
 
         # Repetition penalty related data structures
-        self.repetition_penalties = torch.empty((max_num_reqs, ),
-                                                dtype=torch.float,
-                                                device=device)
+        self.repetition_penalties = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device=device
+        )
         self.repetition_penalties_cpu_tensor = torch.empty(
-            (max_num_reqs, ),
-            dtype=torch.float,
-            device="cpu",
-            pin_memory=pin_memory)
-        self.repetition_penalties_cpu = \
-            self.repetition_penalties_cpu_tensor.numpy()
+            (max_num_reqs,), dtype=torch.float, device="cpu", pin_memory=pin_memory
+        )
+        self.repetition_penalties_cpu = self.repetition_penalties_cpu_tensor.numpy()
         self.repetition_penalties_reqs: set[str] = set()
 
         # Speculative decoding
-        self.num_accepted_tokens_cpu_tensor = torch.ones((max_num_reqs, ),
-                                                         dtype=torch.int64,
-                                                         device="cpu",
-                                                         pin_memory=pin_memory)
-        self.num_accepted_tokens_cpu = \
-            self.num_accepted_tokens_cpu_tensor.numpy()
+        self.num_accepted_tokens_cpu_tensor = torch.ones(
+            (max_num_reqs,), dtype=torch.int64, device="cpu", pin_memory=pin_memory
+        )
+        self.num_accepted_tokens_cpu = self.num_accepted_tokens_cpu_tensor.numpy()
 
         # lora related
-        self.request_lora_mapping = np.zeros((self.max_num_reqs, ),
-                                             dtype=np.int32)
+        self.request_lora_mapping = np.zeros((self.max_num_reqs,), dtype=np.int64)
         self.lora_id_to_request_ids: dict[int, set[str]] = {}
         self.lora_id_to_lora_request: dict[int, LoRARequest] = {}
 
@@ -255,20 +235,23 @@ class InputBatch:
         self.has_allowed_token_ids: set[str] = set()
         # NOTE(lufang): In the mask tensor, if the corresponding token allowed,
         # the value is False. Since we use masked_fill_ to set -inf.
-        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
-        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
+        self.allowed_token_ids_mask: torch.Tensor | None = None
+        self.allowed_token_ids_mask_cpu_tensor: torch.Tensor | None = None
 
         # req_index -> bad_words_token_ids
         self.bad_words_token_ids: dict[int, list[list[int]]] = {}
 
-        self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
-                                                          dtype=bool)
+        self.logits_processing_needs_token_ids = np.zeros(max_num_reqs, dtype=bool)
 
-        self.req_output_token_ids: list[Optional[list[int]]] = []
+        self.req_output_token_ids: list[list[int] | None] = []
 
         # Store provided logitsprocs. If none are provided, initialize empty
         # data structure
         self.logitsprocs = logitsprocs or LogitsProcessors()
+        self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids
+
+        # Store last speculative tokens for sampler.
+        self.spec_token_ids: list[list[int] | None] = []
 
         # This is updated each time the batch constituents change.
         self.sampling_metadata = self._make_sampling_metadata()
@@ -276,9 +259,13 @@ class InputBatch:
         self.pooling_params: dict[str, PoolingParams] = {}
 
         # Cached reference to the GPU tensor of previously sampled tokens
-        self.prev_sampled_token_ids: Optional[torch.Tensor] = None
-        self.prev_sampled_token_ids_invalid_indices: Optional[set[int]] = None
-        self.prev_req_id_to_index: Optional[dict[str, int]] = None
+        self.prev_sampled_token_ids: torch.Tensor | None = None
+        self.prev_req_id_to_index: dict[str, int] | None = None
+        # These are used to update output_token_ids with real sampled
+        # ids from prior step, if required by current sampling params
+        # (e.g. penalties).
+        self.sampled_token_ids_cpu: torch.Tensor | None = None
+        self.async_copy_ready_event: torch.cuda.Event | None = None
 
     @property
     def req_ids(self) -> list[str]:
@@ -302,8 +289,13 @@ class InputBatch:
             # Detailed added request metadata is only required for non-pooling
             # models, to support logitsprocs.
             self.batch_update_builder.added.append(
-                (new_req_index, request.sampling_params,
-                 request.prompt_token_ids, request.output_token_ids))
+                (
+                    new_req_index,
+                    request.sampling_params,
+                    request.prompt_token_ids,
+                    request.output_token_ids,
+                )
+            )
 
         return new_req_index
 
@@ -317,28 +309,29 @@ class InputBatch:
         if req_index == len(self._req_ids):
             self._req_ids.append(req_id)
             self.req_output_token_ids.append(request.output_token_ids)
+            self.spec_token_ids.append([])
         else:
             self._req_ids[req_index] = req_id
             self.req_output_token_ids[req_index] = request.output_token_ids
+            self.spec_token_ids[req_index] = []
 
         self.req_id_to_index[req_id] = req_index
 
         # Copy the prompt token ids and output token ids.
         num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
-            request.prompt_token_ids, request.prompt_embeds)
+            request.prompt_token_ids, request.prompt_embeds
+        )
         self.num_prompt_tokens[req_index] = num_prompt_tokens
         start_idx = num_prompt_tokens
         end_idx = start_idx + len(request.output_token_ids)
         if request.prompt_token_ids is not None:
-            self.token_ids_cpu[
-                req_index, :num_prompt_tokens] = request.prompt_token_ids
+            self.token_ids_cpu[req_index, :num_prompt_tokens] = request.prompt_token_ids
             self.is_token_ids[req_index, :num_prompt_tokens] = True
         else:
             self.is_token_ids[req_index, :num_prompt_tokens] = False
         if request.prompt_embeds is not None:
             self.req_prompt_embeds[req_index] = request.prompt_embeds
-        self.token_ids_cpu[req_index,
-                           start_idx:end_idx] = request.output_token_ids
+        self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids
         self.is_token_ids[req_index, start_idx:end_idx] = True
         # Number of token ids in prompt (token_ids_cpu or prompt_embeds).
         # NOTE(woosuk): This may include spec decode tokens.
@@ -350,8 +343,7 @@ class InputBatch:
         self.block_table.add_row(request.block_ids, req_index)
 
         if sampling_params := request.sampling_params:
-            if (self.is_spec_decode
-                    and is_spec_decode_unsupported(sampling_params)):
+            if self.is_spec_decode and is_spec_decode_unsupported(sampling_params):
                 self.spec_decode_unsupported_reqs.add(req_id)
             if sampling_params.sampling_type == SamplingType.GREEDY:
                 # Should avoid division by zero later when apply_temperature.
@@ -370,16 +362,15 @@ class InputBatch:
             else:
                 top_k = self.vocab_size
             self.top_k_cpu[req_index] = top_k
-            self.frequency_penalties_cpu[
-                req_index] = sampling_params.frequency_penalty
+            self.frequency_penalties_cpu[req_index] = sampling_params.frequency_penalty
             if sampling_params.frequency_penalty != 0.0:
                 self.frequency_penalties_reqs.add(req_id)
-            self.presence_penalties_cpu[
-                req_index] = sampling_params.presence_penalty
+            self.presence_penalties_cpu[req_index] = sampling_params.presence_penalty
             if sampling_params.presence_penalty != 0.0:
                 self.presence_penalties_reqs.add(req_id)
-            self.repetition_penalties_cpu[
-                req_index] = sampling_params.repetition_penalty
+            self.repetition_penalties_cpu[req_index] = (
+                sampling_params.repetition_penalty
+            )
             if sampling_params.repetition_penalty != 1.0:
                 self.repetition_penalties_reqs.add(req_id)
 
@@ -389,13 +380,17 @@ class InputBatch:
                 self.generators[req_index] = request.generator
 
             if sampling_params.logprobs is not None:
-                self.num_logprobs[req_id] = (self.vocab_size
-                                             if sampling_params.logprobs == -1
-                                             else sampling_params.logprobs)
+                self.num_logprobs[req_id] = (
+                    self.vocab_size
+                    if sampling_params.logprobs == -1
+                    else sampling_params.logprobs
+                )
             if sampling_params.prompt_logprobs is not None:
                 self.num_prompt_logprobs[req_id] = (
-                    self.vocab_size if sampling_params.prompt_logprobs == -1
-                    else sampling_params.prompt_logprobs)
+                    self.vocab_size
+                    if sampling_params.prompt_logprobs == -1
+                    else sampling_params.prompt_logprobs
+                )
 
             if sampling_params.allowed_token_ids:
                 self.has_allowed_token_ids.add(req_id)
@@ -406,24 +401,29 @@ class InputBatch:
                         self.max_num_reqs,
                         self.vocab_size,
                         dtype=torch.bool,
-                        device=self.device)
+                        device=self.device,
+                    )
                     self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
                         self.max_num_reqs,
                         self.vocab_size,
                         dtype=torch.bool,
-                        device="cpu")
+                        device="cpu",
+                    )
                 self.allowed_token_ids_mask_cpu_tensor[req_index] = True
                 # False means we don't fill with -inf.
                 self.allowed_token_ids_mask_cpu_tensor[req_index][
-                    sampling_params.allowed_token_ids] = False
+                    sampling_params.allowed_token_ids
+                ] = False
 
             if sampling_params.bad_words_token_ids:
-                self.bad_words_token_ids[
-                    req_index] = sampling_params.bad_words_token_ids
+                self.bad_words_token_ids[req_index] = (
+                    sampling_params.bad_words_token_ids
+                )
         elif pooling_params := request.pooling_params:
             self.pooling_params[req_id] = pooling_params
             self.logits_processing_needs_token_ids[req_index] = (
-                pooling_params.requires_token_ids)
+                pooling_params.requires_token_ids
+            )
         else:
             raise NotImplementedError("Unrecognized request type")
 
@@ -445,7 +445,7 @@ class InputBatch:
 
         return req_index
 
-    def remove_request(self, req_id: str) -> Optional[int]:
+    def remove_request(self, req_id: str) -> int | None:
         """This method must always be followed by a call to condense().
 
         Args:
@@ -462,6 +462,7 @@ class InputBatch:
         self.batch_update_builder.removed_append(req_index)
         self._req_ids[req_index] = None
         self.req_output_token_ids[req_index] = None
+        self.spec_token_ids[req_index] = None
 
         # LoRA
         lora_id = self.request_lora_mapping[req_index]
@@ -500,21 +501,36 @@ class InputBatch:
     def swap_states(self, i1: int, i2: int) -> None:
         old_id_i1 = self._req_ids[i1]
         old_id_i2 = self._req_ids[i2]
-        self._req_ids[i1], self._req_ids[i2] =\
-            self._req_ids[i2], self._req_ids[i1] # noqa
-        self.req_output_token_ids[i1], self.req_output_token_ids[i2] =\
-            self.req_output_token_ids[i2], self.req_output_token_ids[i1]
+        self._req_ids[i1], self._req_ids[i2] = self._req_ids[i2], self._req_ids[i1]  # noqa
+        self.req_output_token_ids[i1], self.req_output_token_ids[i2] = (
+            self.req_output_token_ids[i2],
+            self.req_output_token_ids[i1],
+        )
+        self.spec_token_ids[i1], self.spec_token_ids[i2] = (
+            self.spec_token_ids[i2],
+            self.spec_token_ids[i1],
+        )
         assert old_id_i1 is not None and old_id_i2 is not None
-        self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] =\
-            self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1]
-        self.num_tokens[i1], self.num_tokens[i2] =\
-            self.num_tokens[i2], self.num_tokens[i1]
-        self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] =\
-            self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1]
-        self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] =\
-            self.num_prompt_tokens[i2], self.num_prompt_tokens[i1]
-        self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] =\
-            self.num_computed_tokens_cpu[i2], self.num_computed_tokens_cpu[i1]
+        self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] = (
+            self.req_id_to_index[old_id_i2],
+            self.req_id_to_index[old_id_i1],
+        )
+        self.num_tokens[i1], self.num_tokens[i2] = (
+            self.num_tokens[i2],
+            self.num_tokens[i1],
+        )
+        self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] = (
+            self.num_tokens_no_spec[i2],
+            self.num_tokens_no_spec[i1],
+        )
+        self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] = (
+            self.num_prompt_tokens[i2],
+            self.num_prompt_tokens[i1],
+        )
+        self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] = (
+            self.num_computed_tokens_cpu[i2],
+            self.num_computed_tokens_cpu[i1],
+        )
 
         # NOTE: the following is unsafe
         # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
@@ -541,8 +557,10 @@ class InputBatch:
 
         self.block_table.swap_row(i1, i2)
 
-        self.request_lora_mapping[i1], self.request_lora_mapping[i2] = \
-            self.request_lora_mapping[i2], self.request_lora_mapping[i1]
+        self.request_lora_mapping[i1], self.request_lora_mapping[i2] = (
+            self.request_lora_mapping[i2],
+            self.request_lora_mapping[i1],
+        )
 
         if self.is_pooling_model:
             # Sampling and logits parameters don't apply to pooling models.
@@ -550,32 +568,42 @@ class InputBatch:
 
         # For autoregressive models, track detailed request reordering info
         # to support logitsprocs.
-        self.batch_update_builder.moved.append(
-            (i1, i2, MoveDirectionality.SWAP))
-
-        self.temperature_cpu[i1], self.temperature_cpu[i2] = \
-            self.temperature_cpu[i2], self.temperature_cpu[i1]
-        self.top_p_cpu[i1], self.top_p_cpu[i2] = \
-            self.top_p_cpu[i2], self.top_p_cpu[i1]
-        self.top_k_cpu[i1], self.top_k_cpu[i2] = \
-            self.top_k_cpu[i2], self.top_k_cpu[i1]
-        self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] = \
-            self.frequency_penalties_cpu[i2], self.frequency_penalties_cpu[i1]
-        self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] = \
-            self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1]
-        self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] = \
-            self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
-        self.num_accepted_tokens_cpu[i1], self.num_accepted_tokens_cpu[i2] =\
-            self.num_accepted_tokens_cpu[i2], self.num_accepted_tokens_cpu[i1]
+        self.batch_update_builder.moved.append((i1, i2, MoveDirectionality.SWAP))
+
+        self.temperature_cpu[i1], self.temperature_cpu[i2] = (
+            self.temperature_cpu[i2],
+            self.temperature_cpu[i1],
+        )
+        self.top_p_cpu[i1], self.top_p_cpu[i2] = self.top_p_cpu[i2], self.top_p_cpu[i1]
+        self.top_k_cpu[i1], self.top_k_cpu[i2] = self.top_k_cpu[i2], self.top_k_cpu[i1]
+        self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] = (
+            self.frequency_penalties_cpu[i2],
+            self.frequency_penalties_cpu[i1],
+        )
+        self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] = (
+            self.presence_penalties_cpu[i2],
+            self.presence_penalties_cpu[i1],
+        )
+        self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] = (
+            self.repetition_penalties_cpu[i2],
+            self.repetition_penalties_cpu[i1],
+        )
+        self.num_accepted_tokens_cpu[i1], self.num_accepted_tokens_cpu[i2] = (
+            self.num_accepted_tokens_cpu[i2],
+            self.num_accepted_tokens_cpu[i1],
+        )
 
         swap_dict_values(self.generators, i1, i2)
         swap_dict_values(self.bad_words_token_ids, i1, i2)
 
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[i1], \
-                self.allowed_token_ids_mask_cpu_tensor[i2] =\
-                self.allowed_token_ids_mask_cpu_tensor[i2], \
-                    self.allowed_token_ids_mask_cpu_tensor[i1]
+            (
+                self.allowed_token_ids_mask_cpu_tensor[i1],
+                self.allowed_token_ids_mask_cpu_tensor[i2],
+            ) = (
+                self.allowed_token_ids_mask_cpu_tensor[i2],
+                self.allowed_token_ids_mask_cpu_tensor[i1],
+            )
 
     def condense(self) -> None:
         """Slide non-empty requests down into lower, empty indices.
@@ -597,6 +625,7 @@ class InputBatch:
             # The batched states are empty.
             self._req_ids.clear()
             self.req_output_token_ids.clear()
+            self.spec_token_ids.clear()
             return
 
         # NOTE(woosuk): This function assumes that the empty_req_indices
@@ -625,25 +654,34 @@ class InputBatch:
             self.req_output_token_ids[last_req_index] = None
             self.req_id_to_index[req_id] = empty_index
 
+            spec_token_ids = self.spec_token_ids[last_req_index]
+            self.spec_token_ids[empty_index] = spec_token_ids
+            self.spec_token_ids[last_req_index] = None
+
             num_tokens = self.num_tokens[last_req_index]
             self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
-                last_req_index, :num_tokens]
+                last_req_index, :num_tokens
+            ]
             self.is_token_ids[empty_index, :num_tokens] = self.is_token_ids[
-                last_req_index, :num_tokens]
+                last_req_index, :num_tokens
+            ]
             if last_req_index in self.req_prompt_embeds:
-                self.req_prompt_embeds[
-                    empty_index] = self.req_prompt_embeds.pop(last_req_index)
+                self.req_prompt_embeds[empty_index] = self.req_prompt_embeds.pop(
+                    last_req_index
+                )
             self.num_tokens[empty_index] = num_tokens
             self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
-                last_req_index]
-            self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[
-                last_req_index]
-            self.num_computed_tokens_cpu[
-                empty_index] = self.num_computed_tokens_cpu[last_req_index]
+                last_req_index
+            ]
+            self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[last_req_index]
+            self.num_computed_tokens_cpu[empty_index] = self.num_computed_tokens_cpu[
+                last_req_index
+            ]
             self.block_table.move_row(last_req_index, empty_index)
 
             self.request_lora_mapping[empty_index] = self.request_lora_mapping[
-                last_req_index]
+                last_req_index
+            ]
 
             if self.is_pooling_model:
                 last_req_index -= 1
@@ -653,33 +691,35 @@ class InputBatch:
             # Autoregressive models require detailed tracking of condense
             # operations to support logitsprocs
             self.batch_update_builder.moved.append(
-                (last_req_index, empty_index,
-                 MoveDirectionality.UNIDIRECTIONAL))
+                (last_req_index, empty_index, MoveDirectionality.UNIDIRECTIONAL)
+            )
 
-            self.temperature_cpu[empty_index] = self.temperature_cpu[
-                last_req_index]
+            self.temperature_cpu[empty_index] = self.temperature_cpu[last_req_index]
             self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
             self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
-            self.frequency_penalties_cpu[
-                empty_index] = self.frequency_penalties_cpu[last_req_index]
-            self.presence_penalties_cpu[
-                empty_index] = self.presence_penalties_cpu[last_req_index]
-            self.repetition_penalties_cpu[
-                empty_index] = self.repetition_penalties_cpu[last_req_index]
-            self.num_accepted_tokens_cpu[
-                empty_index] = self.num_accepted_tokens_cpu[last_req_index]
+            self.frequency_penalties_cpu[empty_index] = self.frequency_penalties_cpu[
+                last_req_index
+            ]
+            self.presence_penalties_cpu[empty_index] = self.presence_penalties_cpu[
+                last_req_index
+            ]
+            self.repetition_penalties_cpu[empty_index] = self.repetition_penalties_cpu[
+                last_req_index
+            ]
+            self.num_accepted_tokens_cpu[empty_index] = self.num_accepted_tokens_cpu[
+                last_req_index
+            ]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
                 self.generators[empty_index] = generator
 
             # TODO convert these to LogitsProcessors
             if self.allowed_token_ids_mask_cpu_tensor is not None:
-                self.allowed_token_ids_mask_cpu_tensor[
-                    empty_index] = self.allowed_token_ids_mask_cpu_tensor[
-                        last_req_index]
+                self.allowed_token_ids_mask_cpu_tensor[empty_index] = (
+                    self.allowed_token_ids_mask_cpu_tensor[last_req_index]
+                )
 
-            bad_words_token_ids = self.bad_words_token_ids.pop(
-                last_req_index, None)
+            bad_words_token_ids = self.bad_words_token_ids.pop(last_req_index, None)
             if bad_words_token_ids is not None:
                 self.bad_words_token_ids[empty_index] = bad_words_token_ids
 
@@ -689,6 +729,7 @@ class InputBatch:
         # Trim lists to the batch size.
         del self._req_ids[num_reqs:]
         del self.req_output_token_ids[num_reqs:]
+        del self.spec_token_ids[num_reqs:]
 
     def refresh_metadata(self):
         """Apply any batch updates to sampling metadata."""
@@ -711,8 +752,9 @@ class InputBatch:
     def _make_sampling_metadata(self) -> SamplingMetadata:
         num_reqs = self.num_reqs
         if not self.all_greedy:
-            temperature = copy_slice(self.temperature_cpu_tensor,
-                                     self.temperature, num_reqs)
+            temperature = copy_slice(
+                self.temperature_cpu_tensor, self.temperature, num_reqs
+            )
         else:
             temperature = None
         if not self.no_top_p:
@@ -724,30 +766,51 @@ class InputBatch:
             # Since syncing these tensors is expensive only copy them
             # if necessary i.e. if there are requests which require
             # penalties to be applied during sampling.
-            copy_slice(self.frequency_penalties_cpu_tensor,
-                       self.frequency_penalties, num_reqs)
-            copy_slice(self.presence_penalties_cpu_tensor,
-                       self.presence_penalties, num_reqs)
-            copy_slice(self.repetition_penalties_cpu_tensor,
-                       self.repetition_penalties, num_reqs)
+            copy_slice(
+                self.frequency_penalties_cpu_tensor, self.frequency_penalties, num_reqs
+            )
+            copy_slice(
+                self.presence_penalties_cpu_tensor, self.presence_penalties, num_reqs
+            )
+            copy_slice(
+                self.repetition_penalties_cpu_tensor,
+                self.repetition_penalties,
+                num_reqs,
+            )
 
         needs_prompt_token_ids = (
             not self.no_penalties
-            or self.logits_processing_needs_token_ids[:num_reqs].any())
-        if needs_prompt_token_ids:
-            # The prompt tokens are used only for applying penalties or
-            # step pooling during the sampling/pooling process.
-            # Hence copy these tensors only when there are requests which
-            # need penalties/step_pooler to be applied.
-            prompt_token_ids = self._make_prompt_token_ids_tensor()
-        else:
-            prompt_token_ids = None
+            or self.logits_processing_needs_token_ids[:num_reqs].any()
+        )
+        # The prompt tokens are used only for applying penalties or
+        # step pooling during the sampling/pooling process.
+        # Hence copy these tensors only when there are requests which
+        # need penalties/step_pooler to be applied.
+        prompt_token_ids = (
+            self._make_prompt_token_ids_tensor() if needs_prompt_token_ids else None
+        )
 
-        allowed_token_ids_mask: Optional[torch.Tensor] = None
+        # Only set output_token_ids if required by the current requests'
+        # sampling parameters.
+        needs_output_token_ids = (
+            not self.no_penalties
+            or bool(self.bad_words_token_ids)
+            or self.logitsprocs_need_output_token_ids
+        )
+        output_token_ids = (
+            cast(list[list[int]], self.req_output_token_ids)
+            if needs_output_token_ids
+            else []
+        )
+
+        allowed_token_ids_mask: torch.Tensor | None = None
         if not self.no_allowed_token_ids:
             assert self.allowed_token_ids_mask is not None
-            copy_slice(self.allowed_token_ids_mask_cpu_tensor,
-                       self.allowed_token_ids_mask, num_reqs)
+            copy_slice(
+                self.allowed_token_ids_mask_cpu_tensor,
+                self.allowed_token_ids_mask,
+                num_reqs,
+            )
             allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs]
 
         return SamplingMetadata(
@@ -762,7 +825,8 @@ class InputBatch:
             frequency_penalties=self.frequency_penalties[:num_reqs],
             presence_penalties=self.presence_penalties[:num_reqs],
             repetition_penalties=self.repetition_penalties[:num_reqs],
-            output_token_ids=cast(list[list[int]], self.req_output_token_ids),
+            output_token_ids=output_token_ids,
+            spec_token_ids=cast(list[list[int]], self.spec_token_ids),
             no_penalties=self.no_penalties,
             allowed_token_ids_mask=allowed_token_ids_mask,
             bad_words_token_ids=self.bad_words_token_ids,
@@ -777,8 +841,7 @@ class InputBatch:
         pooling_params = self.get_pooling_params()
 
         return PoolingMetadata(
-            prompt_lens=torch.from_numpy(
-                self.num_prompt_tokens[:self.num_reqs]),
+            prompt_lens=torch.from_numpy(self.num_prompt_tokens[: self.num_reqs]),
             prompt_token_ids=self.sampling_metadata.prompt_token_ids,
             pooling_params=pooling_params,
         )
@@ -797,33 +860,80 @@ class InputBatch:
         # Use the value of vocab_size as a pad since we don't have a
         # token_id of this value.
         for i in range(num_reqs):
-            prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size
-        return prompt_token_ids_cpu_tensor.to(device=self.device,
-                                              non_blocking=True)
+            prompt_token_ids[i, self.num_prompt_tokens[i] :] = self.vocab_size
+        return prompt_token_ids_cpu_tensor.to(device=self.device, non_blocking=True)
 
     def make_lora_inputs(
-        self, num_scheduled_tokens: np.ndarray
+        self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
     ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
         """
         Given the num_scheduled_tokens for each request in the batch, return
         datastructures used to activate the current LoRAs.
         Returns:
-            1. prompt_lora_mapping: A tuple of size self.num_reqs where,
-               prompt_lora_mapping[i] is the LoRA id to use for the ith prompt.
+            1. prompt_lora_mapping: A tuple of size np.sum(num_sampled_tokens)
+               where, prompt_lora_mapping[i] is the LoRA id to use for the ith
+               sampled token.
             2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens)
                where, token_lora_mapping[i] is the LoRA id to use for ith token.
             3. lora_requests: Set of relevant LoRA requests.
         """
 
-        req_lora_mapping = self.request_lora_mapping[:self.num_reqs]
-        prompt_lora_mapping = tuple(req_lora_mapping)
-        token_lora_mapping = tuple(
-            req_lora_mapping.repeat(num_scheduled_tokens))
+        req_lora_mapping = self.request_lora_mapping[: self.num_reqs]
+        prompt_lora_mapping = tuple(req_lora_mapping.repeat(num_sampled_tokens))
+        token_lora_mapping = tuple(req_lora_mapping.repeat(num_scheduled_tokens))
+
         active_lora_requests: set[LoRARequest] = set(
-            self.lora_id_to_lora_request.values())
+            self.lora_id_to_lora_request.values()
+        )
 
         return prompt_lora_mapping, token_lora_mapping, active_lora_requests
 
+    def set_async_sampled_token_ids(
+        self,
+        sampled_token_ids_cpu: torch.Tensor,
+        async_copy_ready_event: torch.cuda.Event,
+    ) -> None:
+        """
+        In async scheduling case, store ref to sampled_token_ids_cpu
+        tensor and corresponding copy-ready event. Used to repair
+        output_token_ids prior to sampling, if needed by logits processors.
+        """
+        if self.sampling_metadata.output_token_ids:
+            self.sampled_token_ids_cpu = sampled_token_ids_cpu
+            self.async_copy_ready_event = async_copy_ready_event
+        else:
+            self.sampled_token_ids_cpu = None
+            self.async_copy_ready_event = None
+
+    def update_async_output_token_ids(self) -> None:
+        """
+        In async scheduling case, update output_token_ids in sampling metadata
+        from prior steps sampled token ids once they've finished copying to CPU.
+        This is called right before they are needed by the logits processors.
+        """
+        output_token_ids = self.sampling_metadata.output_token_ids
+        if self.sampled_token_ids_cpu is None or not output_token_ids:
+            # Output token ids not needed or not async scheduling.
+            return
+
+        assert self.prev_req_id_to_index is not None
+        sampled_token_ids = None
+        for index, req_id in enumerate(self.req_ids):
+            prev_index = self.prev_req_id_to_index.get(req_id)
+            if prev_index is None:
+                continue
+            req_output_token_ids = output_token_ids[index]
+            if not req_output_token_ids or req_output_token_ids[-1] != -1:
+                # Final output id is not a placeholder, some tokens must have
+                # been discarded after a kv-load failure.
+                continue
+            if sampled_token_ids is None:
+                assert self.async_copy_ready_event is not None
+                self.async_copy_ready_event.synchronize()
+                sampled_token_ids = self.sampled_token_ids_cpu.squeeze(-1).tolist()
+            # Replace placeholder token id with actual sampled id.
+            req_output_token_ids[-1] = sampled_token_ids[prev_index]
+
     @property
     def num_reqs(self) -> int:
         return len(self.req_id_to_index)
@@ -846,12 +956,14 @@ class InputBatch:
 
     @property
     def no_penalties(self) -> bool:
-        return (len(self.presence_penalties_reqs) == 0
-                and len(self.frequency_penalties_reqs) == 0
-                and len(self.repetition_penalties_reqs) == 0)
+        return (
+            len(self.presence_penalties_reqs) == 0
+            and len(self.frequency_penalties_reqs) == 0
+            and len(self.repetition_penalties_reqs) == 0
+        )
 
     @property
-    def max_num_logprobs(self) -> Optional[int]:
+    def max_num_logprobs(self) -> int | None:
         return max(self.num_logprobs.values()) if self.num_logprobs else None
 
     @property
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a438c7777b3c09e9710c3ab6e0dce440d81bb7cb..15629469391b4af0cfd02ae63c27ade4c2b753c8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -7,87 +7,123 @@ import time
 from collections import defaultdict
 from collections.abc import Iterator
 from contextlib import contextmanager
-from copy import deepcopy
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast
+from copy import copy, deepcopy
+from functools import reduce
+from itertools import product
+from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast
 
 import numpy as np
 import torch
 import torch.distributed
 import torch.nn as nn
 from tqdm import tqdm
-from typing_extensions import TypeAlias
 
 import vllm.envs as envs
 from vllm.attention import Attention, AttentionType
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionMetadata,
+    MultipleOf,
+)
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
-from vllm.config import (CompilationLevel, CUDAGraphMode, VllmConfig,
-                         get_layers_from_vllm_config, update_config)
+from vllm.config import (
+    CompilationMode,
+    CUDAGraphMode,
+    VllmConfig,
+    get_layers_from_vllm_config,
+    update_config,
+)
+from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
 from vllm.distributed.eplb.eplb_state import EplbState
-from vllm.distributed.kv_transfer import (get_kv_transfer_group,
-                                          has_kv_transfer_group)
+from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
 from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.distributed.parallel_state import (
-    get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
-    prepare_communication_buffer_for_model)
-from vllm.forward_context import (BatchDescriptor, DPMetadata,
-                                  set_forward_context)
+    get_dcp_group,
+    get_pp_group,
+    get_tp_group,
+    graph_capture,
+    is_global_first_rank,
+    prepare_communication_buffer_for_model,
+)
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
-from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.model_executor.models.interfaces import (SupportsMultiModal,
-                                                   is_mixture_of_experts,
-                                                   supports_eagle3,
-                                                   supports_mrope,
-                                                   supports_multimodal_pruning,
-                                                   supports_transcription)
-# yapf: enable
+from vllm.model_executor.models.interfaces import (
+    SupportsMultiModal,
+    is_mixture_of_experts,
+    supports_eagle3,
+    supports_mrope,
+    supports_multimodal_pruning,
+    supports_transcription,
+)
 from vllm.model_executor.models.interfaces_base import (
-    VllmModelForPooling, is_pooling_model, is_text_generation_model)
+    VllmModelForPooling,
+    is_pooling_model,
+    is_text_generation_model,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem,
-                                    PlaceholderRange)
+from vllm.multimodal.inputs import (
+    BatchedTensorInputs,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, cdiv, check_use_alibi, get_dtype_size,
-                        is_pin_memory_available,
-                        length_from_prompt_token_ids_or_embeds, round_up,
-                        supports_dynamo)
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.utils.jsontree import json_map_leaves
-from vllm.v1.attention.backends.flash_attn import AttentionMetadata
+from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.torch_utils import (
+    get_dtype_size,
+    kv_cache_dtype_str_to_dtype,
+    supports_dynamo,
+)
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
-    AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
+    AttentionCGSupport,
+    AttentionMetadataBuilder,
+    CommonAttentionMetadata,
     create_fast_prefill_custom_backend,
-    reorder_batch_to_split_decodes_and_prefills, split_attn_metadata)
+    get_dcp_local_seq_lens,
+    reorder_batch_to_split_decodes_and_prefills,
+    split_attn_metadata,
+)
 from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.v1.kv_cache_interface import (AttentionSpec,
-                                        ChunkedLocalAttentionSpec,
-                                        CrossAttentionSpec,
-                                        EncoderOnlyAttentionSpec,
-                                        FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheSpec,
-                                        MambaSpec, MLAAttentionSpec,
-                                        SlidingWindowSpec,
-                                        UniformTypeKVCacheSpecs)
-# yapf: enable
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, LogprobsLists, LogprobsTensors,
-                             ModelRunnerOutput, PoolerOutput, SamplerOutput)
+from vllm.v1.kv_cache_interface import (
+    AttentionSpec,
+    ChunkedLocalAttentionSpec,
+    CrossAttentionSpec,
+    EncoderOnlyAttentionSpec,
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+    KVCacheSpec,
+    MambaSpec,
+    SlidingWindowSpec,
+    UniformTypeKVCacheSpecs,
+)
+from vllm.v1.outputs import (
+    EMPTY_MODEL_RUNNER_OUTPUT,
+    AsyncModelRunnerOutput,
+    DraftTokenIds,
+    ECConnectorOutput,
+    KVConnectorOutput,
+    LogprobsLists,
+    LogprobsTensors,
+    ModelRunnerOutput,
+    PoolerOutput,
+    SamplerOutput,
+    make_empty_encoder_model_runner_output,
+)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -97,84 +133,129 @@ from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.spec_decode.medusa import MedusaProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
+from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
+from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper
-from vllm.v1.worker.kv_connector_model_runner_mixin import (
-    KVConnectorModelRunnerMixin)
+from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.ubatch_splitting import (check_ubatch_thresholds,
-                                             ubatch_split)
-from vllm.v1.worker.ubatch_utils import UBatchSlice, UBatchSlices
+from vllm.v1.worker.ubatch_utils import (
+    UBatchSlice,
+    UBatchSlices,
+    check_ubatch_thresholds,
+)
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 
-from .utils import (AttentionGroup, MultiModalBudget,
-                    add_kv_sharing_layers_to_kv_cache_groups, bind_kv_cache,
-                    gather_mm_placeholders, sanity_check_mm_encoder_outputs,
-                    scatter_mm_placeholders)
+from .utils import (
+    AttentionGroup,
+    MultiModalBudget,
+    add_kv_sharing_layers_to_kv_cache_groups,
+    bind_kv_cache,
+    gather_mm_placeholders,
+    sanity_check_mm_encoder_outputs,
+    scatter_mm_placeholders,
+)
 
 if TYPE_CHECKING:
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 
 logger = init_logger(__name__)
 
 AttnMetadataDict: TypeAlias = dict[str, AttentionMetadata]
 # list when ubatching is enabled
-PerLayerAttnMetadata: TypeAlias = Union[list[AttnMetadataDict],
-                                        AttnMetadataDict]
+PerLayerAttnMetadata: TypeAlias = list[AttnMetadataDict] | AttnMetadataDict
 
 
 # Wrapper for ModelRunnerOutput to support overlapped execution.
 class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
-
     def __init__(
         self,
         model_runner_output: ModelRunnerOutput,
         sampled_token_ids: torch.Tensor,
+        logprobs_tensors: torch.Tensor | None,
         invalid_req_indices: list[int],
         async_output_copy_stream: torch.cuda.Stream,
+        vocab_size: int,
     ):
         self._model_runner_output = model_runner_output
         self._invalid_req_indices = invalid_req_indices
 
         # Event on the copy stream so we can synchronize the non-blocking copy.
-        self._async_copy_ready_event = torch.cuda.Event()
+        self.async_copy_ready_event = torch.cuda.Event()
 
         # Keep a reference to the device tensor to avoid it being
         # deallocated until we finish copying it to the host.
         self._sampled_token_ids = sampled_token_ids
+        self.vocab_size = vocab_size
+        self._logprobs_tensors = logprobs_tensors
 
         # Initiate the copy on a separate stream, but do not synchronize it.
         default_stream = torch.cuda.current_stream()
         with torch.cuda.stream(async_output_copy_stream):
             async_output_copy_stream.wait_stream(default_stream)
-            self._sampled_token_ids_cpu = self._sampled_token_ids.to(
-                'cpu', non_blocking=True)
-            self._async_copy_ready_event.record()
+            self.sampled_token_ids_cpu = self._sampled_token_ids.to(
+                "cpu", non_blocking=True
+            )
+            self._logprobs_tensors_cpu = (
+                self._logprobs_tensors.to_cpu_nonblocking()
+                if self._logprobs_tensors
+                else None
+            )
+            self.async_copy_ready_event.record()
 
     def get_output(self) -> ModelRunnerOutput:
         """Copy the device tensors to the host and return a ModelRunnerOutput.
-        
+
         This function blocks until the copy is finished.
         """
-        self._async_copy_ready_event.synchronize()
+        self.async_copy_ready_event.synchronize()
 
-        # Release the device tensor once the copy has completed
+        # Release the device tensors once the copy has completed.
+        del self._logprobs_tensors
         del self._sampled_token_ids
-
-        valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
+        max_gen_len = self.sampled_token_ids_cpu.shape[-1]
+        if max_gen_len == 1:
+            valid_sampled_token_ids: list[np.ndarray] = [
+                row for row in self.sampled_token_ids_cpu.numpy()
+            ]
+        else:
+            valid_sampled_token_ids = RejectionSampler.parse_output(
+                self.sampled_token_ids_cpu,
+                self.vocab_size,
+            )
         for i in self._invalid_req_indices:
-            valid_sampled_token_ids[i].clear()
+            valid_sampled_token_ids[i] = np.array([])
 
         output = self._model_runner_output
         output.sampled_token_ids = valid_sampled_token_ids
+        if self._logprobs_tensors_cpu:
+            # NOTE(nick): this will need to be updated to use cu_num_accepted_tokens
+            # for async sched + spec decode + logprobs compatibility.
+            output.logprobs = self._logprobs_tensors_cpu.tolists()
         return output
 
 
-class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
+class ExecuteModelState(NamedTuple):
+    """Ephemeral cached state transferred between execute_model() and
+    sample_tokens(), after execute_model() returns None."""
 
+    scheduler_output: "SchedulerOutput"
+    logits: torch.Tensor
+    spec_decode_metadata: SpecDecodeMetadata | None
+    spec_decode_common_attn_metadata: CommonAttentionMetadata | None
+    hidden_states: torch.Tensor
+    sample_hidden_states: torch.Tensor
+    aux_hidden_states: list[torch.Tensor] | None
+    ec_connector_output: ECConnectorOutput | None
+
+
+class GPUModelRunner(
+    LoRAModelRunnerMixin, KVConnectorModelRunnerMixin, ECConnectorModelRunnerMixin
+):
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -192,8 +273,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.observability_config = vllm_config.observability_config
 
         from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
-        set_cpu_offload_max_bytes(
-            int(self.cache_config.cpu_offload_gb * 1024**3))
+
+        set_cpu_offload_max_bytes(int(self.cache_config.cpu_offload_gb * 1024**3))
 
         model_config = self.model_config
         cache_config = self.cache_config
@@ -202,20 +283,23 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.device = device
         self.pin_memory = is_pin_memory_available()
         self.dtype = self.model_config.dtype
-        if cache_config.cache_dtype == "auto":
-            self.kv_cache_dtype = self.dtype
-        else:
-            self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
-                cache_config.cache_dtype]
+        self.kv_cache_dtype = kv_cache_dtype_str_to_dtype(
+            cache_config.cache_dtype, self.model_config
+        )
 
-        self.is_pooling_model = (model_config.runner_type == 'pooling')
+        self.is_pooling_model = model_config.runner_type == "pooling"
         self.enable_prompt_embeds = model_config.enable_prompt_embeds
         self.is_multimodal_raw_input_only_model = (
-            model_config.is_multimodal_raw_input_only_model)
+            model_config.is_multimodal_raw_input_only_model
+        )
         # This will be overridden in load_model()
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
+
+        # Always set to false after the first forward pass
+        self.calculate_kv_scales = self.cache_config.calculate_kv_scales
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
+        self.dcp_rank = 0 if self.dcp_world_size <= 1 else get_dcp_group().rank_in_group
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
 
@@ -224,16 +308,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # TODO: Support overlapping mirco-batches
         # https://github.com/vllm-project/vllm/issues/18019
         self.broadcast_pp_output = (
-            self.parallel_config.distributed_executor_backend
-            == "external_launcher" and len(get_pp_group().ranks) > 0)
+            self.parallel_config.distributed_executor_backend == "external_launcher"
+            and len(get_pp_group().ranks) > 0
+        )
 
         # Model-related.
-        self.num_query_heads = model_config.get_num_attention_heads(
-            parallel_config)
+        self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
         self.hidden_size = model_config.get_hidden_size()
         self.attention_chunk_size = model_config.attention_chunk_size
         # Only relevant for models using ALiBi (e.g, MPT)
-        self.use_alibi = check_use_alibi(model_config)
+        self.use_alibi = model_config.uses_alibi
 
         self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
 
@@ -241,20 +325,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            model_config)
+            model_config
+        )
 
         if self.model_config.is_encoder_decoder:
             # Maximum length of the encoder input, only for encoder-decoder
             # models.
-            self.max_encoder_len = scheduler_config.\
-                            max_num_encoder_input_tokens
+            self.max_encoder_len = scheduler_config.max_num_encoder_input_tokens
         else:
             self.max_encoder_len = 0
 
         # Sampler
         self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
 
-        self.eplb_state: Optional[EplbState] = None
+        self.eplb_state: EplbState | None = None
         """
         State of the expert parallelism load balancer.
 
@@ -278,21 +362,31 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # the last PP rank. This is not ideal if there are many
         # layers in the draft model.
         if self.speculative_config and get_pp_group().is_last_rank:
+            self.drafter: (
+                NgramProposer | SuffixDecodingProposer | EagleProposer | MedusaProposer
+            )
             if self.speculative_config.method == "ngram":
                 self.drafter = NgramProposer(self.vllm_config)
+            elif self.speculative_config.method == "suffix":
+                self.drafter = SuffixDecodingProposer(self.vllm_config)
             elif self.speculative_config.use_eagle():
-                self.drafter = EagleProposer(self.vllm_config, self.device,
-                                             self)  # type: ignore
+                self.drafter = EagleProposer(self.vllm_config, self.device, self)
                 if self.speculative_config.method == "eagle3":
                     self.use_aux_hidden_state_outputs = True
             elif self.speculative_config.method == "medusa":
                 self.drafter = MedusaProposer(
-                    vllm_config=self.vllm_config,
-                    device=self.device)  # type: ignore
+                    vllm_config=self.vllm_config, device=self.device
+                )
             else:
-                raise ValueError("Unknown speculative decoding method: "
-                                 f"{self.speculative_config.method}")
-            self.rejection_sampler = RejectionSampler()
+                raise ValueError(
+                    "Unknown speculative decoding method: "
+                    f"{self.speculative_config.method}"
+                )
+            self.rejection_sampler = RejectionSampler(self.sampler)
+
+        self.num_spec_tokens = 0
+        if self.speculative_config:
+            self.num_spec_tokens = self.speculative_config.num_speculative_tokens
 
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
@@ -307,6 +401,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # solution, we initialize the input batch here, and re-initialize it
         # in `initialize_kv_cache` if the block_sizes here is different from
         # the block_sizes in the kv cache config.
+        custom_logitsprocs = model_config.logits_processors
         self.input_batch = InputBatch(
             max_num_reqs=self.max_num_reqs,
             # We need to use the encoder length for encoder-decoer
@@ -317,55 +412,78 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
             block_sizes=[self.cache_config.block_size],
+            kernel_block_sizes=[self.cache_config.block_size],
             is_spec_decode=bool(self.vllm_config.speculative_config),
             logitsprocs=build_logitsprocs(
-                self.vllm_config, self.device, self.pin_memory,
+                self.vllm_config,
+                self.device,
+                self.pin_memory,
                 self.is_pooling_model,
-                self.vllm_config.model_config.logits_processors),
+                custom_logitsprocs,
+            ),
+            # We currently don't know whether a particular custom logits processor
+            # uses output token ids so we set this conservatively.
+            logitsprocs_need_output_token_ids=bool(custom_logitsprocs),
             is_pooling_model=self.is_pooling_model,
+            dcp_kv_cache_interleave_size=self.parallel_config.dcp_kv_cache_interleave_size,
         )
 
         self.use_async_scheduling = self.scheduler_config.async_scheduling
-        self.async_output_copy_stream = torch.cuda.Stream() if \
-            self.use_async_scheduling else None
+        # Separate cuda stream for overlapping transfer of sampled token ids from
+        # GPU to CPU when async scheduling is enabled.
+        self.async_output_copy_stream: torch.cuda.Stream | None = None
+        # cuda event to synchronize use of reused CPU tensors between steps
+        # when async scheduling is enabled.
+        self.prepare_inputs_event: torch.cuda.Event | None = None
+        if self.use_async_scheduling:
+            self.async_output_copy_stream = torch.cuda.Stream()
+            self.prepare_inputs_event = torch.cuda.Event()
 
-        # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
-        # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
-        # The batch sizes in the config are in descending order.
-        if self.compilation_config.cudagraph_capture_sizes and \
-                self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
-            self.cudagraph_batch_sizes = list(
-                reversed(self.compilation_config.cudagraph_capture_sizes))
+        if (
+            self.compilation_config.cudagraph_capture_sizes
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            self.cudagraph_batch_sizes = sorted(
+                self.compilation_config.cudagraph_capture_sizes
+            )
 
         # Cache the device properties.
         self._init_device_properties()
 
         # Persistent buffers for CUDA graphs.
-        self.input_ids = self._make_buffer(self.max_num_tokens,
-                                           dtype=torch.int32)
-        self.positions = self._make_buffer(self.max_num_tokens,
-                                           dtype=torch.int64)
-        self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
-                                                 dtype=torch.int32)
+        self.input_ids = self._make_buffer(self.max_num_tokens, dtype=torch.int32)
+        self.positions = self._make_buffer(self.max_num_tokens, dtype=torch.int64)
+        self.query_start_loc = self._make_buffer(
+            self.max_num_reqs + 1, dtype=torch.int32
+        )
         self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
+        if self.dcp_world_size > 1:
+            self.dcp_local_seq_lens = self._make_buffer(
+                self.max_num_reqs, dtype=torch.int32
+            )
         # Because inputs_embeds may be bfloat16 and we don't need a numpy
         # version of this tensor, avoid a RuntimeError by not creating a
         # numpy buffer.
-        self.inputs_embeds = self._make_buffer(self.max_num_tokens,
-                                               self.hidden_size,
-                                               dtype=self.dtype,
-                                               numpy=False)
-        self.is_token_ids = self._make_buffer(self.max_num_tokens,
-                                              dtype=torch.bool)
-        self.discard_request_indices = self._make_buffer(self.max_num_reqs,
-                                                         dtype=torch.int64)
+        self.inputs_embeds = self._make_buffer(
+            self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False
+        )
+        self.is_token_ids = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
+        self.discard_request_indices = self._make_buffer(
+            self.max_num_reqs, dtype=torch.int64
+        )
         self.num_discarded_requests = 0
 
-        self.num_decode_draft_tokens = self._make_buffer(self.max_num_reqs,
-                                                         dtype=torch.int32)
-        self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
-                                                     dtype=torch.int64)
+        self.num_decode_draft_tokens = self._make_buffer(
+            self.max_num_reqs, dtype=torch.int32
+        )
+        self.num_accepted_tokens = self._make_buffer(
+            self.max_num_reqs, dtype=torch.int64
+        )
+
+        # Only relevant for multimodal models
+        if self.supports_mm_inputs:
+            self.is_mm_embed = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
@@ -380,25 +498,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # 1D-RoPE.
             # See page 5 of https://arxiv.org/abs/2409.12191
             self.mrope_positions = self._make_buffer(
-                (3, self.max_num_tokens + 1), dtype=torch.int64)
-
-        # CUDA event to synchronize use of reused CPU tensors between steps
-        # when async scheduling is enabled.
-        self.prepare_inputs_event: Optional[torch.cuda.Event] = None
-        if self.use_async_scheduling:
-            self.prepare_inputs_event = torch.cuda.Event()
-            # Start in a completed state.
-            self.prepare_inputs_event.record(torch.cuda.default_stream())
+                (3, self.max_num_tokens + 1), dtype=torch.int64
+            )
 
         # None in the first PP rank. The rest are set after load_model.
-        self.intermediate_tensors: Optional[IntermediateTensors] = None
+        self.intermediate_tensors: IntermediateTensors | None = None
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         # Keep in int64 to avoid overflow with long context
-        self.arange_np = np.arange(max(self.max_num_reqs + 1,
-                                       self.max_model_len,
-                                       self.max_num_tokens),
-                                   dtype=np.int64)
+        self.arange_np = np.arange(
+            max(self.max_num_reqs + 1, self.max_model_len, self.max_num_tokens),
+            dtype=np.int64,
+        )
 
         # Layer pairings for cross-layer KV sharing.
         # If an Attention layer `layer_name` is in the keys of this dict, it
@@ -410,21 +521,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.kv_sharing_fast_prefill_logits_indices = None
         if self.cache_config.kv_sharing_fast_prefill:
             self.kv_sharing_fast_prefill_logits_indices = torch.zeros(
-                self.max_num_tokens, dtype=torch.int32, device=self.device)
+                self.max_num_tokens, dtype=torch.int32, device=self.device
+            )
 
-        self.uniform_decode_query_len = 1 if not self.speculative_config else \
-            1 + self.speculative_config.num_speculative_tokens
+        self.uniform_decode_query_len = 1 + self.num_spec_tokens
 
         # Cudagraph dispatcher for runtime cudagraph dispatching.
         self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config)
 
-        self.mm_budget = MultiModalBudget(
-            self.model_config,
-            self.scheduler_config,
-            self.mm_registry,
-        ) if self.supports_mm_inputs else None
+        self.mm_budget = (
+            MultiModalBudget(
+                self.model_config,
+                self.scheduler_config,
+                self.mm_registry,
+            )
+            if self.supports_mm_inputs
+            else None
+        )
 
-        self.reorder_batch_threshold: Optional[int] = None
+        self.reorder_batch_threshold: int | None = None
 
         # Attention layers that are only in the KVCacheConfig of the runner
         # (e.g., KV sharing, encoder-only attention), but not in the
@@ -432,24 +547,57 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.runner_only_attn_layers: set[str] = set()
 
         # Cached outputs.
-        self._draft_token_ids: Optional[Union[list[list[int]],
-                                              torch.Tensor]] = None
+        self._draft_token_ids: list[list[int]] | torch.Tensor | None = None
         self.transfer_event = torch.cuda.Event()
         self.sampled_token_ids_pinned_cpu = torch.empty(
-            (self.max_model_len, 1),
+            (self.max_num_reqs, 1),
             dtype=torch.int64,
             device="cpu",
-            pin_memory=self.pin_memory)
-
-    def _make_buffer(self,
-                     *size: Union[int, torch.SymInt],
-                     dtype: torch.dtype,
-                     numpy: bool = True) -> CpuGpuBuffer:
-        return CpuGpuBuffer(*size,
-                            dtype=dtype,
-                            device=self.device,
-                            pin_memory=self.pin_memory,
-                            with_numpy=numpy)
+            pin_memory=self.pin_memory,
+        )
+
+        # Pre-allocated tensor for copying valid sampled token counts to CPU,
+        # with dedicated stream for overlapping and event for coordination.
+        self.valid_sampled_token_count_event: torch.cuda.Event | None = None
+        self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None
+        if self.use_async_scheduling and self.num_spec_tokens:
+            self.valid_sampled_token_count_event = torch.cuda.Event()
+            self.valid_sampled_token_count_copy_stream = torch.cuda.Stream()
+        self.valid_sampled_token_count_cpu = torch.empty(
+            self.max_num_reqs,
+            dtype=torch.int64,
+            device="cpu",
+            pin_memory=self.pin_memory,
+        )
+
+        # Ephemeral state transferred between execute_model() and sample_tokens().
+        self.execute_model_state: ExecuteModelState | None = None
+        self.kv_connector_output: KVConnectorOutput | None = None
+
+    def reset_mm_cache(self) -> None:
+        if self.mm_budget:
+            self.mm_budget.reset_cache()
+
+    def _get_positions(self, num_tokens: Any):
+        if isinstance(num_tokens, int):
+            if self.uses_mrope:
+                return self.mrope_positions.gpu[:, :num_tokens]
+            return self.positions.gpu[:num_tokens]
+        else:
+            if self.uses_mrope:
+                return self.mrope_positions.gpu[:, num_tokens]
+            return self.positions.gpu[num_tokens]
+
+    def _make_buffer(
+        self, *size: int | torch.SymInt, dtype: torch.dtype, numpy: bool = True
+    ) -> CpuGpuBuffer:
+        return CpuGpuBuffer(
+            *size,
+            dtype=dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            with_numpy=numpy,
+        )
 
     def _init_model_kwargs(self, num_tokens: int):
         model_kwargs = dict[str, Any]()
@@ -462,9 +610,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         token_type_id_requests = dict[int, Any]()
         for i, param in enumerate(pooling_params):
-            if param.extra_kwargs is not None and \
-            (token_types := param.extra_kwargs.get(
-                "compressed_token_type_ids")) is not None:
+            if (
+                param.extra_kwargs is not None
+                and (token_types := param.extra_kwargs.get("compressed_token_type_ids"))
+                is not None
+            ):
                 token_type_id_requests[i] = token_types
 
         if len(token_type_id_requests) == 0:
@@ -479,7 +629,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             token_type_ids.append(ids)
 
         model_kwargs["token_type_ids"] = torch.concat(token_type_ids).to(
-            device=self.device)
+            device=self.device
+        )
         return model_kwargs
 
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
@@ -501,21 +652,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             return
 
         if self.reorder_batch_threshold is not None:
-            # NOTE(lucas): currently no backend supports the custom masking
-            #  required for DCP with q_len > 1, so we assert here. Remove this
-            #  assert once the custom mask is support is added to FA3.
-            if self.dcp_world_size > 1:
-                assert self.reorder_batch_threshold == 1, \
-                    "DCP not support reorder_batch_threshold > 1 now."
             reorder_batch_to_split_decodes_and_prefills(
                 self.input_batch,
                 scheduler_output,
-                decode_threshold=self.reorder_batch_threshold)
+                decode_threshold=self.reorder_batch_threshold,
+            )
 
     # Note: used for model runner override.
     def _init_device_properties(self) -> None:
-        """Initialize attributes from torch.cuda.get_device_properties
-        """
+        """Initialize attributes from torch.cuda.get_device_properties"""
         self.device_properties = torch.cuda.get_device_properties(self.device)
         self.num_sms = self.device_properties.multi_processor_count
 
@@ -571,8 +716,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             sampling_params = new_req_data.sampling_params
             pooling_params = new_req_data.pooling_params
 
-            if sampling_params and \
-                sampling_params.sampling_type == SamplingType.RANDOM_SEED:
+            if (
+                sampling_params
+                and sampling_params.sampling_type == SamplingType.RANDOM_SEED
+            ):
                 generator = torch.Generator(device=self.device)
                 generator.manual_seed(sampling_params.seed)
             else:
@@ -611,11 +758,42 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Update the states of the running/resumed requests.
         is_last_rank = get_pp_group().is_last_rank
         req_data = scheduler_output.scheduled_cached_reqs
+
+        # Wait until valid_sampled_tokens_count is copied to cpu,
+        # then use it to update actual num_computed_tokens of each request.
+        valid_sampled_token_count = self._get_valid_sampled_token_count()
+
         for i, req_id in enumerate(req_data.req_ids):
             req_state = self.requests[req_id]
             num_computed_tokens = req_data.num_computed_tokens[i]
             new_block_ids = req_data.new_block_ids[i]
-            resumed_from_preemption = req_data.resumed_from_preemption[i]
+            resumed_from_preemption = req_id in req_data.resumed_req_ids
+            num_output_tokens = req_data.num_output_tokens[i]
+            req_index = self.input_batch.req_id_to_index.get(req_id)
+
+            # prev_num_draft_len is used in async scheduling mode with
+            # spec decode. it indicates if need to update num_computed_tokens
+            # of the request. for example:
+            # fist step: num_computed_tokens = 0, spec_tokens = [],
+            # prev_num_draft_len = 0.
+            # second step: num_computed_tokens = 100(prompt lenth),
+            # spec_tokens = [a,b], prev_num_draft_len = 0.
+            # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
+            # prev_num_draft_len = 2.
+            # num_computed_tokens in first step and second step does't contain
+            # the spec tokens length, but in third step it contains the
+            # spec tokens length. we only need to update num_computed_tokens
+            # when prev_num_draft_len > 0.
+            if req_state.prev_num_draft_len:
+                if req_index is None:
+                    req_state.prev_num_draft_len = 0
+                else:
+                    assert self.input_batch.prev_req_id_to_index is not None
+                    prev_req_index = self.input_batch.prev_req_id_to_index[req_id]
+                    num_accepted = valid_sampled_token_count[prev_req_index] - 1
+                    num_rejected = req_state.prev_num_draft_len - num_accepted
+                    num_computed_tokens -= num_rejected
+                    req_state.output_token_ids.extend([-1] * num_accepted)
 
             # Update the cached states.
             req_state.num_computed_tokens = num_computed_tokens
@@ -627,42 +805,57 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 new_token_ids = req_data.new_token_ids[i]
                 # Add the sampled token(s) from the previous step (if any).
                 # This doesn't include "unverified" tokens like spec tokens.
-                num_new_tokens = (num_computed_tokens + len(new_token_ids) -
-                                  req_state.num_tokens)
+                num_new_tokens = (
+                    num_computed_tokens + len(new_token_ids) - req_state.num_tokens
+                )
                 if num_new_tokens == 1:
                     # Avoid slicing list in most common case.
                     req_state.output_token_ids.append(new_token_ids[-1])
                 elif num_new_tokens > 0:
-                    req_state.output_token_ids.extend(
-                        new_token_ids[-num_new_tokens:])
+                    req_state.output_token_ids.extend(new_token_ids[-num_new_tokens:])
+            elif num_output_tokens < len(req_state.output_token_ids):
+                # Some output tokens were discarded due to a sync-KV-load
+                # failure. Align the cached state.
+                del req_state.output_token_ids[num_output_tokens:]
+                if req_index is not None:
+                    end_idx = (
+                        self.input_batch.num_prompt_tokens[req_index]
+                        + num_output_tokens
+                    )
+                    self.input_batch.num_tokens[req_index] = end_idx
+                    self.input_batch.num_tokens_no_spec[req_index] = end_idx
 
             # Update the block IDs.
             if not resumed_from_preemption:
                 if new_block_ids is not None:
                     # Append the new blocks to the existing block IDs.
-                    for block_ids, new_ids in zip(req_state.block_ids,
-                                                  new_block_ids):
+                    for block_ids, new_ids in zip(req_state.block_ids, new_block_ids):
                         block_ids.extend(new_ids)
             else:
+                assert req_index is None
                 assert new_block_ids is not None
                 # The request is resumed from preemption.
                 # Replace the existing block IDs with the new ones.
                 req_state.block_ids = new_block_ids
 
-            req_index = self.input_batch.req_id_to_index.get(req_id)
             if req_index is None:
                 # The request is not in the persistent batch.
                 # The request was either preempted and resumed later, or was not
                 # scheduled in the previous step and needs to be added again.
+
+                if self.use_async_scheduling and num_output_tokens > 0:
+                    # We must recover the output token ids for resumed requests in the
+                    # async scheduling case, so that correct input_ids are obtained.
+                    resumed_token_ids = req_data.all_token_ids[req_id]
+                    req_state.output_token_ids = resumed_token_ids[-num_output_tokens:]
+
                 reqs_to_add.append(req_state)
                 continue
 
             # Update the persistent batch.
-            self.input_batch.num_computed_tokens_cpu[req_index] = (
-                num_computed_tokens)
+            self.input_batch.num_computed_tokens_cpu[req_index] = num_computed_tokens
             if new_block_ids is not None:
-                self.input_batch.block_table.append_row(
-                    new_block_ids, req_index)
+                self.input_batch.block_table.append_row(new_block_ids, req_index)
 
             # For the last rank, we don't need to update the token_ids_cpu
             # because the sampled tokens are already cached.
@@ -671,24 +864,44 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 start_token_index = num_computed_tokens
                 end_token_index = num_computed_tokens + len(new_token_ids)
                 self.input_batch.token_ids_cpu[
-                    req_index,
-                    start_token_index:end_token_index] = new_token_ids
-                self.input_batch.num_tokens_no_spec[
-                    req_index] = end_token_index
+                    req_index, start_token_index:end_token_index
+                ] = new_token_ids
+                self.input_batch.num_tokens_no_spec[req_index] = end_token_index
                 self.input_batch.num_tokens[req_index] = end_token_index
 
             # Add spec_token_ids to token_ids_cpu.
-            spec_token_ids = (
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
-            if spec_token_ids:
-                num_spec_tokens = len(spec_token_ids)
+            spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
+                req_id, []
+            )
+            num_spec_tokens = len(spec_token_ids)
+            # For async scheduling, token_ids_cpu assigned from
+            # spec_token_ids are placeholders and will be overwritten in
+            # _prepare_input_ids.
+            if num_spec_tokens:
                 start_index = self.input_batch.num_tokens_no_spec[req_index]
                 end_token_index = start_index + num_spec_tokens
                 self.input_batch.token_ids_cpu[
-                    req_index, start_index:end_token_index] = spec_token_ids
+                    req_index, start_index:end_token_index
+                ] = spec_token_ids
                 # NOTE(woosuk): `num_tokens` here may include spec tokens.
                 self.input_batch.num_tokens[req_index] += num_spec_tokens
 
+            # When speculative decoding is used with structured output,
+            # the scheduler can drop draft tokens that do not
+            # conform to the schema. This can result in
+            # scheduler_output.scheduled_spec_decode_tokens being empty,
+            # even when speculative decoding is enabled.
+            self.input_batch.spec_token_ids[req_index] = spec_token_ids
+
+            # there are no draft tokens with async scheduling,
+            # we clear the spec_decoding info in scheduler_output and
+            # use normal sampling but rejection_sampling.
+            if self.use_async_scheduling:
+                req_state.prev_num_draft_len = num_spec_tokens
+                if num_spec_tokens and self._draft_token_ids is None:
+                    scheduler_output.total_num_scheduled_tokens -= num_spec_tokens
+                    scheduler_output.num_scheduled_tokens[req_id] -= num_spec_tokens
+                    scheduler_output.scheduled_spec_decode_tokens.pop(req_id, None)
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
         for request in reqs_to_add:
@@ -702,7 +915,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.input_batch.refresh_metadata()
 
     def _update_states_after_model_execute(
-            self, output_token_ids: torch.Tensor) -> None:
+        self, output_token_ids: torch.Tensor
+    ) -> None:
         """Update the cached states after model execution.
 
         This is used for MTP/EAGLE for hybrid models, as in linear attention,
@@ -715,61 +929,39 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             return
 
         # Find the number of accepted tokens for each sequence.
-        num_accepted_tokens = (torch.cat(
-            [
-                output_token_ids,
-                torch.full((output_token_ids.size(0), 1),
-                           -1,
-                           device=output_token_ids.device),
-            ],
-            dim=1) == -1).int().argmax(-1).cpu().numpy()
+        num_accepted_tokens = (
+            (
+                torch.cat(
+                    [
+                        output_token_ids,
+                        torch.full(
+                            (output_token_ids.size(0), 1),
+                            -1,
+                            device=output_token_ids.device,
+                        ),
+                    ],
+                    dim=1,
+                )
+                == -1
+            )
+            .int()
+            .argmax(-1)
+            .cpu()
+            .numpy()
+        )
         for i, num_tokens in enumerate(num_accepted_tokens):
             self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
 
     def _init_mrope_positions(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        for mm_feature in req_state.mm_features:
-            mm_item = mm_feature.data
-            if mm_item is None:
-                continue
-            mm_input = mm_item.get_data()
-            if (t := mm_input.get("image_grid_thw")) is not None:
-                image_grid_thw.append(t.tolist())
-            if (t := mm_input.get("video_grid_thw")) is not None:
-                video_grid_thw.append(t.tolist())
-            if (t := mm_input.get("second_per_grid_ts")) is not None:
-                second_per_grid_ts.append(t)
-            if (t := mm_input.get("audio_feature_lengths")) is not None:
-                audio_feature_lengths.append(t)
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        if supports_mrope(self.model):
-            req_state.mrope_positions, req_state.mrope_position_delta = \
-                self.model.get_mrope_input_positions(
-                    req_state.prompt_token_ids,
-                    hf_config=self.model_config.hf_config,
-                    image_grid_thw=image_grid_thw,
-                    video_grid_thw=video_grid_thw,
-                    second_per_grid_ts=second_per_grid_ts,
-                    audio_feature_lengths=audio_feature_lengths,
-                    use_audio_in_video=use_audio_in_video,
-                )
-        else:
-            req_state.mrope_positions, req_state.mrope_position_delta = \
-                MRotaryEmbedding.get_input_positions_tensor(
-                    req_state.prompt_token_ids,
-                    hf_config=self.model_config.hf_config,
-                    image_grid_thw=image_grid_thw,
-                    video_grid_thw=video_grid_thw,
-                    second_per_grid_ts=second_per_grid_ts,
-                    audio_feature_lengths=audio_feature_lengths,
-                    use_audio_in_video=use_audio_in_video,
-                )
+        model = self.get_model()
+        assert supports_mrope(model), "M-RoPE support is not implemented."
+
+        req_state.mrope_positions, req_state.mrope_position_delta = (
+            model.get_mrope_input_positions(
+                req_state.prompt_token_ids,
+                req_state.mm_features,
+            )
+        )
 
     def _extract_mm_kwargs(
         self,
@@ -788,10 +980,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model = cast(SupportsMultiModal, self.model)
         mm_kwargs_combined: BatchedTensorInputs = {}
         for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
-                mm_kwargs,
-                device=self.device,
-                pin_memory=self.pin_memory,
-                merge_by_field_config=model.merge_by_field_config,
+            mm_kwargs,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             mm_kwargs_combined.update(mm_kwargs_group)
 
@@ -810,7 +1003,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _get_cumsum_and_arange(
         self,
         num_tokens: np.ndarray,
-        cumsum_dtype: Optional[np.dtype] = None,
+        cumsum_dtype: np.dtype | None = None,
     ) -> tuple[np.ndarray, np.ndarray]:
         """Get the cumulative sum and batched arange of the given array.
         # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
@@ -827,10 +1020,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return cu_num_tokens, arange
 
-    def _prepare_input_ids(self, total_num_scheduled_tokens: int,
-                           cu_num_tokens: np.ndarray) -> None:
+    def _prepare_input_ids(
+        self,
+        scheduler_output: "SchedulerOutput",
+        total_num_scheduled_tokens: int,
+        cu_num_tokens: np.ndarray,
+    ) -> None:
         """Prepare the input IDs for the current batch.
-        
+
         Carefully handles the `prev_sampled_token_ids` which can be cached
         from the previous engine iteration, in which case those tokens on the
         GPU need to be copied into the corresponding slots into input_ids."""
@@ -848,21 +1045,43 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # on the GPU from prev_sampled_token_ids.
         prev_req_id_to_index = self.input_batch.prev_req_id_to_index
         assert prev_req_id_to_index is not None
-        flattened_indices = []
-        prev_common_req_indices = []
+        sample_flattened_indices: list[int] = []
+        spec_flattened_indices: list[int] = []
+        prev_common_req_indices: list[int] = []
+        prev_draft_token_indices: list[int] = []
         indices_match = True
         max_flattened_index = -1
+        total_num_spec_tokens = 0
+        scheduled_spec_tokens = scheduler_output.scheduled_spec_decode_tokens
+
         for req_id, cur_index in self.input_batch.req_id_to_index.items():
             if (prev_index := prev_req_id_to_index.get(req_id)) is not None:
                 prev_common_req_indices.append(prev_index)
                 # We need to compute the flattened input_ids index of the
                 # last token in each common request.
+                draft_len = len(scheduled_spec_tokens.get(req_id, ()))
+                total_num_spec_tokens += draft_len
                 flattened_index = cu_num_tokens[cur_index].item() - 1
-                flattened_indices.append(flattened_index)
-                indices_match &= (prev_index == flattened_index)
+                # example: cu_num_tokens = [2, 5, 8], draft_tokens = [1, 2, 2]
+                # sample_flattened_indices = [0, 2, 5]
+                # spec_flattened_indices = [1,   3, 4,    6, 7]
+                sample_flattened_indices.append(flattened_index - draft_len)
+                spec_flattened_indices.extend(
+                    range(flattened_index - draft_len + 1, flattened_index + 1)
+                )
+                start = prev_index * self.num_spec_tokens
+                # prev_draft_token_indices is used to find which draft_tokens_id
+                # should be copied to input_ids
+                # example: prev draft_tokens_id [[1,2], [3,4], [5, 6]]
+                # flatten draft_tokens_id [1,2,3,4,5,6]
+                # draft_len of each request [1, 2, 1]
+                # then prev_draft_token_indices is [0,   2, 3,   4]
+                prev_draft_token_indices.extend(range(start, start + draft_len))
+                indices_match &= prev_index == flattened_index
                 max_flattened_index = max(max_flattened_index, flattened_index)
-        num_commmon_tokens = len(flattened_indices)
-        if num_commmon_tokens < total_num_scheduled_tokens:
+        num_commmon_tokens = len(sample_flattened_indices)
+        total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
+        if num_commmon_tokens < total_without_spec:
             # If not all requests are decodes from the last iteration,
             # We need to copy the input_ids_cpu to the GPU first.
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
@@ -871,7 +1090,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
         if num_commmon_tokens == 0:
             # No requests in common with the previous iteration
-            # So input_ids_cpu will have all the input ids.
+            # So input_ids.cpu will have all the input ids.
             return
         if indices_match and max_flattened_index == (num_commmon_tokens - 1):
             # Common-case optimization: the batch is unchanged
@@ -879,57 +1098,83 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # The indices are both the same permutation of 0..N-1 so
             # we can copy directly using a single slice.
             self.input_ids.gpu[:num_commmon_tokens].copy_(
-                self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
-                                                        0],
-                non_blocking=True)
+                self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0],
+                non_blocking=True,
+            )
             if self.enable_prompt_embeds:
                 self.is_token_ids.gpu[:num_commmon_tokens] = True
             return
-        # Upload the index tensors asynchronously
-        # so the scatter can be non-blocking.
-        input_ids_index_tensor = torch.tensor(flattened_indices,
-                                              dtype=torch.int64,
-                                              pin_memory=self.pin_memory).to(
-                                                  self.device,
-                                                  non_blocking=True)
+        # Upload the index tensors asynchronously so the scatter can be non-blocking.
+        sampled_tokens_index_tensor = torch.tensor(
+            sample_flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory
+        ).to(self.device, non_blocking=True)
         prev_common_req_indices_tensor = torch.tensor(
-            prev_common_req_indices,
-            dtype=torch.int64,
-            pin_memory=self.pin_memory).to(self.device, non_blocking=True)
+            prev_common_req_indices, dtype=torch.int64, pin_memory=self.pin_memory
+        ).to(self.device, non_blocking=True)
         self.input_ids.gpu.scatter_(
             dim=0,
-            index=input_ids_index_tensor,
+            index=sampled_tokens_index_tensor,
             src=self.input_batch.prev_sampled_token_ids[
-                prev_common_req_indices_tensor, 0])
+                prev_common_req_indices_tensor, 0
+            ],
+        )
+
+        # Scatter the draft tokens after the sampled tokens are scattered.
+        if self._draft_token_ids is None or not spec_flattened_indices:
+            return
+
+        assert isinstance(self._draft_token_ids, torch.Tensor)
+        draft_tokens_index_tensor = torch.tensor(
+            spec_flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory
+        ).to(self.device, non_blocking=True)
+        prev_draft_token_indices_tensor = torch.tensor(
+            prev_draft_token_indices, dtype=torch.int64, pin_memory=self.pin_memory
+        ).to(self.device, non_blocking=True)
+
+        # because input_ids dtype is torch.int32,
+        # so convert draft_token_ids to torch.int32 here.
+        draft_token_ids = self._draft_token_ids.to(dtype=torch.int32)
+        self._draft_token_ids = None
+
+        self.input_ids.gpu.scatter_(
+            dim=0,
+            index=draft_tokens_index_tensor,
+            src=draft_token_ids.flatten()[prev_draft_token_indices_tensor],
+        )
 
     def _get_encoder_seq_lens(
         self,
-        scheduler_output: "SchedulerOutput",
+        scheduled_encoder_inputs: dict[str, list[int]],
         kv_cache_spec: KVCacheSpec,
         num_reqs: int,
-    ) -> Optional[np.ndarray]:
+    ) -> np.ndarray | None:
         if not isinstance(kv_cache_spec, CrossAttentionSpec):
             return None
 
         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
         encoder_seq_lens = np.zeros(num_reqs, dtype=np.int32)
-        for req_id in scheduler_output.scheduled_encoder_inputs:
+        for req_id in scheduled_encoder_inputs:
             req_index = self.input_batch.req_id_to_index[req_id]
             encoder_seq_lens[req_index] = self.max_encoder_len
 
         return encoder_seq_lens
 
     def _prepare_inputs(
-        self, scheduler_output: "SchedulerOutput"
-    ) -> tuple[PerLayerAttnMetadata, torch.Tensor,
-               Optional[SpecDecodeMetadata], np.ndarray,
-               Optional[CommonAttentionMetadata], int, Optional[UBatchSlices],
-               Optional[torch.Tensor]]:
+        self,
+        scheduler_output: "SchedulerOutput",
+        num_scheduled_tokens: np.ndarray,
+        max_num_scheduled_tokens: int,
+    ) -> tuple[
+        torch.Tensor,
+        SpecDecodeMetadata | None,
+        UBatchSlices | None,
+        torch.Tensor | None,
+    ]:
         """
         :return: tuple[
-            attn_metadata: layer-to-attention_metadata mapping,
-            logits_indices, spec_decode_metadata
+            logits_indices, spec_decode_metadata,
+            ubatch_slices, num_tokens_across_dp,
         ]
         """
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -941,27 +1186,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # This way, we can overlap the copy with the following CPU operations.
         self.input_batch.block_table.commit_block_table(num_reqs)
 
-        # Get the number of scheduled tokens for each request.
-        req_ids = self.input_batch.req_ids
-        tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
-        num_scheduled_tokens = np.array(tokens, dtype=np.int32)
-        max_num_scheduled_tokens = max(tokens)
-
         # Get request indices.
         # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
-        req_indices = np.repeat(self.arange_np[:num_reqs],
-                                num_scheduled_tokens)
+        req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens)
 
         # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
         # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        cu_num_tokens, arange = self._get_cumsum_and_arange(
-            num_scheduled_tokens)
+        cu_num_tokens, arange = self._get_cumsum_and_arange(num_scheduled_tokens)
 
         # Get positions.
         positions_np = self.positions.np[:total_num_scheduled_tokens]
-        np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
-               arange,
-               out=positions_np)
+        np.add(
+            self.input_batch.num_computed_tokens_cpu[req_indices],
+            arange,
+            out=positions_np,
+        )
 
         # Calculate M-RoPE positions.
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -972,24 +1211,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
         # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
         # where M is the max_model_len.
-        token_indices = (positions_np +
-                         req_indices * self.input_batch.token_ids_cpu.shape[1])
+        token_indices = (
+            positions_np + req_indices * self.input_batch.token_ids_cpu.shape[1]
+        )
         token_indices_tensor = torch.from_numpy(token_indices)
 
         # NOTE(woosuk): We use torch.index_select instead of np.take here
         # because torch.index_select is much faster than np.take for large
         # tensors.
-        torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
-                           0,
-                           token_indices_tensor,
-                           out=self.input_ids.cpu[:total_num_scheduled_tokens])
+        torch.index_select(
+            self.input_batch.token_ids_cpu_tensor.flatten(),
+            0,
+            token_indices_tensor,
+            out=self.input_ids.cpu[:total_num_scheduled_tokens],
+        )
         if self.enable_prompt_embeds:
-            is_token_ids = self.input_batch.is_token_ids.flatten()
+            is_token_ids = self.input_batch.is_token_ids_tensor.flatten()
             torch.index_select(
                 is_token_ids,
                 0,
                 token_indices_tensor,
-                out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
+                out=self.is_token_ids.cpu[:total_num_scheduled_tokens],
+            )
 
         # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
         # the InputBatch, we need to fill in the prompt embeds into the expected
@@ -1023,52 +1266,54 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 actual_num_sched = actual_end - start_pos
 
                 if actual_num_sched > 0:
-                    self.inputs_embeds.cpu[output_idx:output_idx +
-                                           actual_num_sched].copy_(
-                                               req_embeds[start_pos:actual_end]
-                                           )
+                    self.inputs_embeds.cpu[
+                        output_idx : output_idx + actual_num_sched
+                    ].copy_(req_embeds[start_pos:actual_end])
 
                 output_idx += num_sched
 
-        self.input_batch.block_table.compute_slot_mapping(
-            req_indices, positions_np)
-        self.input_batch.block_table.commit_slot_mapping(
-            total_num_scheduled_tokens)
+        self.input_batch.block_table.compute_slot_mapping(req_indices, positions_np)
+        self.input_batch.block_table.commit_slot_mapping(total_num_scheduled_tokens)
 
         # Prepare the attention metadata.
         self.query_start_loc.np[0] = 0
-        self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
+        self.query_start_loc.np[1 : num_reqs + 1] = cu_num_tokens
         # Note: pad query_start_loc to be non-decreasing, as kernels
         # like FlashAttention requires that
-        self.query_start_loc.np[num_reqs + 1:].fill(cu_num_tokens[-1])
+        self.query_start_loc.np[num_reqs + 1 :].fill(cu_num_tokens[-1])
         self.query_start_loc.copy_to_gpu()
-        query_start_loc = self.query_start_loc.gpu[:num_reqs + 1]
+        query_start_loc = self.query_start_loc.gpu[: num_reqs + 1]
 
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
-        num_tokens_padded = num_tokens_unpadded + self.get_local_padding(
-            num_tokens_unpadded)
-        uniform_decode = \
-            (max_num_scheduled_tokens == self.uniform_decode_query_len) and \
-            (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
-        ubatch_slices, num_tokens_after_padding = \
-            ubatch_split(num_scheduled_tokens,
-                         num_tokens_unpadded,
-                         num_tokens_padded,
-                         uniform_decode=uniform_decode,
-                         vllm_config=self.vllm_config)
+        num_tokens_padded = self._get_num_input_tokens(num_tokens_unpadded)
+        uniform_decode = (
+            max_num_scheduled_tokens == self.uniform_decode_query_len
+        ) and (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
+
+        # Disable DP padding when running eager to avoid excessive padding when
+        # running prefills. This lets us set enforce_eager on the prefiller in
+        # a P/D setup and still use CUDA graphs (enabled by this padding) on the
+        # decoder.
+        allow_dp_padding = self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+
+        ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
+            num_tokens_unpadded=num_tokens_unpadded,
+            parallel_config=self.parallel_config,
+            allow_microbatching=True,
+            allow_dp_padding=allow_dp_padding,
+            num_tokens_padded=num_tokens_padded,
+            uniform_decode=uniform_decode,
+            num_scheduled_tokens_per_request=num_scheduled_tokens,
+        )
 
         self.seq_lens.np[:num_reqs] = (
-            self.input_batch.num_computed_tokens_cpu[:num_reqs] +
-            num_scheduled_tokens)
+            self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens
+        )
         # Fill unused with 0 for full cuda graph mode.
         self.seq_lens.np[num_reqs:].fill(0)
         self.seq_lens.copy_to_gpu()
-        seq_lens = self.seq_lens.gpu[:num_reqs]
-        max_seq_len = self.seq_lens.np[:num_reqs].max().item()
 
-        num_tokens = [
-            self.requests[r].num_tokens for r in self.input_batch.req_ids
-        ]
+        num_tokens = [self.requests[r].num_tokens for r in self.input_batch.req_ids]
         num_tokens_np = np.array(num_tokens, dtype=np.int32)
 
         # Record the index of requests that should not be sampled,
@@ -1076,25 +1321,30 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np
         discard_request_indices = np.nonzero(discard_requests_mask)[0]
         self.num_discarded_requests = len(discard_request_indices)
-        self.discard_request_indices.np[:self.num_discarded_requests] = (
-            discard_request_indices)
+        self.discard_request_indices.np[: self.num_discarded_requests] = (
+            discard_request_indices
+        )
 
         self.discard_request_indices.copy_to_gpu(self.num_discarded_requests)
 
         # Copy the tensors to the GPU.
-        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
+        self._prepare_input_ids(
+            scheduler_output,
+            total_num_scheduled_tokens,
+            cu_num_tokens,
+        )
 
         if self.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
                 self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+                non_blocking=True,
+            )
         else:
             # Common case (1D positions)
             self.positions.copy_to_gpu(total_num_scheduled_tokens)
 
-        use_spec_decode = len(
-            scheduler_output.scheduled_spec_decode_tokens) > 0
+        use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
         if not use_spec_decode:
             # NOTE(woosuk): Due to chunked prefills, the batch may contain
             # partial requests. While we should not sample any token
@@ -1104,6 +1354,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             logits_indices = query_start_loc[1:] - 1
             num_draft_tokens = None
             spec_decode_metadata = None
+            num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
         else:
             # Get the number of draft tokens for each request.
             # Iterate over the dictionary rather than all requests since not all
@@ -1112,53 +1363,128 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # For chunked prefills, use -1 as mask rather than 0, as guided
             # decoding may rollback speculative tokens.
             num_decode_draft_tokens = np.full(num_reqs, -1, dtype=np.int32)
-            for req_id, draft_token_ids in (
-                    scheduler_output.scheduled_spec_decode_tokens.items()):
+            for (
+                req_id,
+                draft_token_ids,
+            ) in scheduler_output.scheduled_spec_decode_tokens.items():
                 req_idx = self.input_batch.req_id_to_index[req_id]
                 num_draft_tokens[req_idx] = len(draft_token_ids)
-                num_decode_draft_tokens[req_idx] = (len(draft_token_ids) if (
-                    self.input_batch.num_computed_tokens_cpu[req_idx]
-                    >= self.input_batch.num_prompt_tokens[req_idx]) else -1)
+                num_decode_draft_tokens[req_idx] = (
+                    len(draft_token_ids)
+                    if (
+                        self.input_batch.num_computed_tokens_cpu[req_idx]
+                        >= self.input_batch.num_prompt_tokens[req_idx]
+                    )
+                    else -1
+                )
             spec_decode_metadata = self._calc_spec_decode_metadata(
-                num_draft_tokens, cu_num_tokens)
+                num_draft_tokens, cu_num_tokens
+            )
             logits_indices = spec_decode_metadata.logits_indices
-
+            num_sampled_tokens = num_draft_tokens + 1
             # For DECODE only cuda graph of some attention backends (e.g., GDN).
-            self.num_decode_draft_tokens.np[:
-                                            num_reqs] = num_decode_draft_tokens
+            self.num_decode_draft_tokens.np[:num_reqs] = num_decode_draft_tokens
             self.num_decode_draft_tokens.np[num_reqs:].fill(-1)
             self.num_decode_draft_tokens.copy_to_gpu()
 
+        # Hot-Swap lora model
+        if self.lora_config:
+            assert (
+                np.sum(num_sampled_tokens)
+                <= self.vllm_config.scheduler_config.max_num_batched_tokens
+            )
+            self.set_active_loras(
+                self.input_batch, num_scheduled_tokens, num_sampled_tokens
+            )
+
+        return (
+            logits_indices,
+            spec_decode_metadata,
+            ubatch_slices,
+            num_tokens_across_dp,
+        )
+
+    def _build_attention_metadata(
+        self,
+        total_num_scheduled_tokens: int,
+        max_num_scheduled_tokens: int,
+        num_reqs: int,
+        ubatch_slices: UBatchSlices | None = None,
+        logits_indices: torch.Tensor | None = None,
+        use_spec_decode: bool = False,
+        for_cudagraph_capture: bool = False,
+        scheduled_encoder_inputs: dict[str, list[int]] | None = None,
+        cascade_attn_prefix_lens: list[list[int]] | None = None,
+    ) -> tuple[PerLayerAttnMetadata, CommonAttentionMetadata | None]:
+        """
+        :return: tuple[attn_metadata, spec_decode_common_attn_metadata]
+        """
         logits_indices_padded = None
-        if self.cache_config.kv_sharing_fast_prefill:
-            logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
-                logits_indices)
+        num_logits_indices = None
+        if logits_indices is not None:
+            num_logits_indices = logits_indices.size(0)
+            if self.cache_config.kv_sharing_fast_prefill:
+                logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
+                    logits_indices
+                )
+
+        # update seq_lens of decode reqs under DCP.
+        if self.dcp_world_size > 1:
+            self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens(
+                self.seq_lens.cpu[:num_reqs],
+                self.dcp_world_size,
+                self.dcp_rank,
+                self.parallel_config.dcp_kv_cache_interleave_size,
+            )
+            self.dcp_local_seq_lens.copy_to_gpu(num_reqs)
 
         attn_metadata: PerLayerAttnMetadata = {}
         if ubatch_slices is not None:
             attn_metadata = [dict() for _ in range(len(ubatch_slices))]
 
-        # Used in the below loop.
-        query_start_loc_cpu = self.query_start_loc.cpu[:num_reqs + 1]
+        # Used in the below loop
+        query_start_loc = self.query_start_loc.gpu[: num_reqs + 1]
+        query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs + 1]
+        seq_lens = self.seq_lens.gpu[:num_reqs]
         seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
-        num_computed_tokens_cpu = (
-            self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs])
+        num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[
+            :num_reqs
+        ]
+
+        dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None
+        if self.dcp_world_size > 1:
+            dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs]
+            dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs]
+
         spec_decode_common_attn_metadata = None
+
+        if for_cudagraph_capture:
+            # For some attention backends (e.g. FA) with sliding window models we need
+            # to make sure the backend see a max_seq_len that is larger to the sliding
+            # window size when capturing to make sure the correct kernel is selected.
+            max_seq_len = self.max_model_len
+        else:
+            max_seq_len = self.seq_lens.np[:num_reqs].max().item()
+
         if use_spec_decode:
             self.num_accepted_tokens.np[:num_reqs] = (
-                self.input_batch.num_accepted_tokens_cpu[:num_reqs])
+                self.input_batch.num_accepted_tokens_cpu[:num_reqs]
+            )
             self.num_accepted_tokens.np[num_reqs:].fill(1)
             self.num_accepted_tokens.copy_to_gpu()
 
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
-        for kv_cache_group_id, kv_cache_group_spec in enumerate(
-                self.kv_cache_config.kv_cache_groups):
+        for kv_cache_gid, kv_cache_group in enumerate(
+            self.kv_cache_config.kv_cache_groups
+        ):
             encoder_seq_lens = self._get_encoder_seq_lens(
-                scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs)
+                scheduled_encoder_inputs or {},
+                kv_cache_group.kv_cache_spec,
+                num_reqs,
+            )
 
-            if isinstance(kv_cache_group_spec.kv_cache_spec,
-                          EncoderOnlyAttentionSpec):
+            if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec):
                 # Encoder-only layers do not have KV cache, so we need to
                 # create a dummy block table and slot mapping for them.
                 blk_table_tensor = torch.zeros(
@@ -1167,24 +1493,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     device=self.device,
                 )
                 slot_mapping = torch.zeros(
-                    (total_num_scheduled_tokens, ),
+                    (total_num_scheduled_tokens,),
                     dtype=torch.int64,
                     device=self.device,
                 )
-                num_common_prefix_blocks = 0
             else:
-                blk_table = self.input_batch.block_table[kv_cache_group_id]
+                blk_table = self.input_batch.block_table[kv_cache_gid]
                 blk_table_tensor = blk_table.get_device_tensor(num_reqs)
-                slot_mapping = blk_table.slot_mapping.gpu[:
-                                                          total_num_scheduled_tokens]
+                slot_mapping = blk_table.slot_mapping.gpu[:total_num_scheduled_tokens]
 
                 # Fill unused with -1. Needed for reshape_and_cache in full cuda
                 # graph mode.
-                blk_table.slot_mapping.gpu[total_num_scheduled_tokens:].fill_(
-                    -1)
-                num_common_prefix_blocks = (
-                    scheduler_output.
-                    num_common_prefix_blocks[kv_cache_group_id])
+                blk_table.slot_mapping.gpu[total_num_scheduled_tokens:].fill_(-1)
 
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
@@ -1199,71 +1519,107 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 block_table_tensor=blk_table_tensor,
                 slot_mapping=slot_mapping,
                 logits_indices_padded=logits_indices_padded,
-                num_logits_indices=logits_indices.size(0),
+                num_logits_indices=num_logits_indices,
                 causal=True,
                 encoder_seq_lens=encoder_seq_lens,
+                dcp_local_seq_lens=dcp_local_seq_lens,
+                dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu,
             )
 
-            if (self.speculative_config
-                    and spec_decode_common_attn_metadata is None):
+            if self.speculative_config and spec_decode_common_attn_metadata is None:
                 if isinstance(self.drafter, EagleProposer):
-                    if (self.drafter.attn_layer_names[0]
-                            in kv_cache_group_spec.layer_names):
+                    if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names:
                         spec_decode_common_attn_metadata = common_attn_metadata
                 else:
                     spec_decode_common_attn_metadata = common_attn_metadata
 
-            for attn_group in self.attn_groups[kv_cache_group_id]:
-                # Prepare for cascade attention if enabled & beneficial.
-                common_prefix_len = 0
+            for attn_gid, attn_group in enumerate(self.attn_groups[kv_cache_gid]):
+                cascade_attn_prefix_len = (
+                    cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
+                    if cascade_attn_prefix_lens
+                    else 0
+                )
                 builder = attn_group.get_metadata_builder()
-                if self.cascade_attn_enabled:
-                    common_prefix_len = self._compute_cascade_attn_prefix_len(
-                        num_scheduled_tokens,
-                        num_common_prefix_blocks,
-                        attn_group.kv_cache_spec,
-                        builder,
-                    )
 
                 extra_attn_metadata_args = {}
-                if use_spec_decode and isinstance(builder,
-                                                  GDNAttentionMetadataBuilder):
+                if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
                     extra_attn_metadata_args = dict(
-                        num_accepted_tokens=self.num_accepted_tokens.
-                        gpu[:num_reqs],
-                        num_decode_draft_tokens_cpu=self.
-                        num_decode_draft_tokens.cpu[:num_reqs],
+                        num_accepted_tokens=self.num_accepted_tokens.gpu[:num_reqs],
+                        num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[
+                            :num_reqs
+                        ],
                     )
 
                 if ubatch_slices is not None:
                     common_attn_metadata_list = split_attn_metadata(
-                        ubatch_slices, common_attn_metadata)
+                        ubatch_slices, common_attn_metadata
+                    )
                     for ubid, common_attn_metadata in enumerate(
-                            common_attn_metadata_list):
-                        attn_metadata_i = (attn_group.get_metadata_builder(
-                            ubatch_id=ubid).build(
-                                common_prefix_len=common_prefix_len,
-                                common_attn_metadata=common_attn_metadata))
-                        for layer_name in kv_cache_group_spec.layer_names:
+                        common_attn_metadata_list
+                    ):
+                        builder = attn_group.get_metadata_builder(ubatch_id=ubid)
+                        if for_cudagraph_capture:
+                            attn_metadata_i = builder.build_for_cudagraph_capture(
+                                common_attn_metadata
+                            )
+                        else:
+                            attn_metadata_i = builder.build(
+                                common_prefix_len=cascade_attn_prefix_len,
+                                common_attn_metadata=common_attn_metadata,
+                            )
+                        for layer_name in kv_cache_group.layer_names:
                             assert type(attn_metadata) is list
                             attn_metadata[ubid][layer_name] = attn_metadata_i
                 else:
                     assert isinstance(attn_metadata, dict)
-                    attn_metadata_i = builder.build(
-                        common_prefix_len=common_prefix_len,
-                        common_attn_metadata=common_attn_metadata,
-                        **extra_attn_metadata_args)
+                    if for_cudagraph_capture:
+                        attn_metadata_i = builder.build_for_cudagraph_capture(
+                            common_attn_metadata
+                        )
+                    else:
+                        attn_metadata_i = builder.build(
+                            common_prefix_len=cascade_attn_prefix_len,
+                            common_attn_metadata=common_attn_metadata,
+                            **extra_attn_metadata_args,
+                        )
                     for layer_name in attn_group.layer_names:
                         attn_metadata[layer_name] = attn_metadata_i
 
-        # Hot-Swap lora model
-        if self.lora_config:
-            self.set_active_loras(self.input_batch, num_scheduled_tokens)
+        return attn_metadata, spec_decode_common_attn_metadata
+
+    def _compute_cascade_attn_prefix_lens(
+        self,
+        num_scheduled_tokens: np.ndarray,
+        num_common_prefix_blocks: list[int],
+    ) -> list[list[int]] | None:
+        """
+        :return: Optional[cascade_attn_prefix_lens]
+            cascade_attn_prefix_lens is 2D: ``[kv_cache_group_id][attn_group_idx]``,
+            None if we should not use cascade attention
+        """
+
+        use_cascade_attn = False
+        num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups)
+        cascade_attn_prefix_lens: list[list[int]] = [
+            [] for _ in range(num_kv_cache_groups)
+        ]
+
+        for kv_cache_gid in range(num_kv_cache_groups):
+            for attn_group in self.attn_groups[kv_cache_gid]:
+                if isinstance(attn_group.kv_cache_spec, EncoderOnlyAttentionSpec):
+                    cascade_attn_prefix_len = 0
+                else:
+                    # 0 if cascade attention should not be used
+                    cascade_attn_prefix_len = self._compute_cascade_attn_prefix_len(
+                        num_scheduled_tokens,
+                        num_common_prefix_blocks[kv_cache_gid],
+                        attn_group.kv_cache_spec,
+                        attn_group.get_metadata_builder(),
+                    )
+                cascade_attn_prefix_lens[kv_cache_gid].append(cascade_attn_prefix_len)
+                use_cascade_attn |= cascade_attn_prefix_len > 0
 
-        return (attn_metadata, logits_indices, spec_decode_metadata,
-                num_scheduled_tokens, spec_decode_common_attn_metadata,
-                max_num_scheduled_tokens, ubatch_slices,
-                num_tokens_after_padding)
+        return cascade_attn_prefix_lens if use_cascade_attn else None
 
     def _compute_cascade_attn_prefix_len(
         self,
@@ -1289,6 +1645,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         Returns:
             int: Length of common prefix in tokens.
         """
+
         common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size
         if common_prefix_len == 0:
             # Common case.
@@ -1335,18 +1692,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # this case.
         num_reqs = len(num_scheduled_tokens)
         common_prefix_len = min(
-            common_prefix_len,
-            self.input_batch.num_computed_tokens_cpu[:num_reqs].min())
+            common_prefix_len, self.input_batch.num_computed_tokens_cpu[:num_reqs].min()
+        )
         # common_prefix_len should be a multiple of the block size.
-        common_prefix_len = (common_prefix_len // kv_cache_spec.block_size *
-                             kv_cache_spec.block_size)
-        use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or
-                              (isinstance(kv_cache_spec, FullAttentionSpec)
-                               and kv_cache_spec.sliding_window is not None))
-        use_local_attention = (
-            isinstance(kv_cache_spec, ChunkedLocalAttentionSpec)
-            or (isinstance(kv_cache_spec, FullAttentionSpec)
-                and kv_cache_spec.attention_chunk_size is not None))
+        common_prefix_len = (
+            common_prefix_len // kv_cache_spec.block_size * kv_cache_spec.block_size
+        )
+        use_sliding_window = isinstance(kv_cache_spec, SlidingWindowSpec) or (
+            isinstance(kv_cache_spec, FullAttentionSpec)
+            and kv_cache_spec.sliding_window is not None
+        )
+        use_local_attention = isinstance(kv_cache_spec, ChunkedLocalAttentionSpec) or (
+            isinstance(kv_cache_spec, FullAttentionSpec)
+            and kv_cache_spec.attention_chunk_size is not None
+        )
         assert isinstance(kv_cache_spec, AttentionSpec)
         use_cascade = attn_metadata_builder.use_cascade_attention(
             common_prefix_len=common_prefix_len,
@@ -1357,6 +1716,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             use_sliding_window=use_sliding_window,
             use_local_attention=use_local_attention,
             num_sms=self.num_sms,
+            dcp_world_size=self.dcp_world_size,
         )
         return common_prefix_len if use_cascade else 0
 
@@ -1366,18 +1726,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             req = self.requests[req_id]
             assert req.mrope_positions is not None
 
-            num_computed_tokens = \
-                self.input_batch.num_computed_tokens_cpu[index]
-            num_scheduled_tokens = \
-                scheduler_output.num_scheduled_tokens[req_id]
+            num_computed_tokens = self.input_batch.num_computed_tokens_cpu[index]
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
-                req.prompt_token_ids, req.prompt_embeds)
+                req.prompt_token_ids, req.prompt_embeds
+            )
 
             if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
-                prompt_part_len = max(0,
-                                      num_prompt_tokens - num_computed_tokens)
-                completion_part_len = max(
-                    0, num_scheduled_tokens - prompt_part_len)
+                prompt_part_len = max(0, num_prompt_tokens - num_computed_tokens)
+                completion_part_len = max(0, num_scheduled_tokens - prompt_part_len)
             else:
                 prompt_part_len = num_scheduled_tokens
                 completion_part_len = 0
@@ -1391,8 +1748,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 src_start = num_computed_tokens
                 src_end = num_computed_tokens + prompt_part_len
 
-                self.mrope_positions.cpu[:, dst_start:dst_end] = (
-                    req.mrope_positions[:, src_start:src_end])
+                self.mrope_positions.cpu[:, dst_start:dst_end] = req.mrope_positions[
+                    :, src_start:src_end
+                ]
                 mrope_pos_ptr += prompt_part_len
 
             if completion_part_len > 0:
@@ -1432,10 +1790,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11]
         # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
         cu_num_sampled_tokens, arange = self._get_cumsum_and_arange(
-            num_sampled_tokens, cumsum_dtype=np.int32)
+            num_sampled_tokens, cumsum_dtype=np.int32
+        )
         # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
         logits_indices = np.repeat(
-            cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
+            cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens
+        )
         # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
         logits_indices += arange
 
@@ -1446,37 +1806,46 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # cu_num_draft_tokens: [3, 3, 5, 5, 6]
         # arange: [0, 1, 2, 0, 1, 0]
         cu_num_draft_tokens, arange = self._get_cumsum_and_arange(
-            num_draft_tokens, cumsum_dtype=np.int32)
+            num_draft_tokens, cumsum_dtype=np.int32
+        )
         # [0, 0, 0, 5, 5, 9]
         target_logits_indices = np.repeat(
-            cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
+            cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens
+        )
         # [0, 1, 2, 5, 6, 9]
         target_logits_indices += arange
 
         # TODO: Optimize the CPU -> GPU copy.
         cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
-            self.device, non_blocking=True)
-        logits_indices = torch.from_numpy(logits_indices).to(self.device,
-                                                             non_blocking=True)
+            self.device, non_blocking=True
+        )
+        cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
+            self.device, non_blocking=True
+        )
+        logits_indices = torch.from_numpy(logits_indices).to(
+            self.device, non_blocking=True
+        )
         target_logits_indices = torch.from_numpy(target_logits_indices).to(
-            self.device, non_blocking=True)
+            self.device, non_blocking=True
+        )
         bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to(
-            self.device, non_blocking=True)
+            self.device, non_blocking=True
+        )
 
         # Compute the draft token ids.
         # draft_token_indices:      [  1,   2,   3, 105, 106, 208]
         draft_token_ids = self.input_ids.gpu[logits_indices]
         draft_token_ids = draft_token_ids[target_logits_indices + 1]
 
-        metadata = SpecDecodeMetadata(
+        return SpecDecodeMetadata(
             draft_token_ids=draft_token_ids,
             num_draft_tokens=num_draft_tokens.tolist(),
             cu_num_draft_tokens=cu_num_draft_tokens,
+            cu_num_sampled_tokens=cu_num_sampled_tokens,
             target_logits_indices=target_logits_indices,
             bonus_logits_indices=bonus_logits_indices,
             logits_indices=logits_indices,
         )
-        return metadata
 
     def _prepare_kv_sharing_fast_prefill(
         self,
@@ -1485,23 +1854,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert self.kv_sharing_fast_prefill_logits_indices is not None
         num_logits = logits_indices.shape[0]
         assert num_logits > 0
-        self.kv_sharing_fast_prefill_logits_indices[:num_logits].copy_(
-            logits_indices)
+        self.kv_sharing_fast_prefill_logits_indices[:num_logits].copy_(logits_indices)
         # There might have leftover indices in logits_indices[num_logits:]
         # from previous iterations, whose values may be greater than the
         # batch size in the current iteration. To ensure indices are always
         # valid, we fill the padded indices with the last index.
         self.kv_sharing_fast_prefill_logits_indices[num_logits:].fill_(
-            logits_indices[-1].item())
-        if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                and num_logits <= self.cudagraph_batch_sizes[-1]):
+            logits_indices[-1].item()
+        )
+        if (
+            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and num_logits <= self.cudagraph_batch_sizes[-1]
+        ):
             # Use piecewise CUDA graphs.
             # Add padding to the batch size.
             num_logits_padded = self.vllm_config.pad_for_cudagraph(num_logits)
         else:
             num_logits_padded = num_logits
-        logits_indices_padded = (
-            self.kv_sharing_fast_prefill_logits_indices[:num_logits_padded])
+        logits_indices_padded = self.kv_sharing_fast_prefill_logits_indices[
+            :num_logits_padded
+        ]
         return logits_indices_padded
 
     def _batch_mm_kwargs_from_scheduler(
@@ -1540,7 +1912,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
         # Batch the multi-modal inputs using the helper method.
         mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
-            scheduler_output)
+            scheduler_output
+        )
 
         if not mm_kwargs:
             return
@@ -1555,28 +1928,44 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model = cast(SupportsMultiModal, self.model)
         encoder_outputs = []
         for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
-                mm_kwargs,
-                device=self.device,
-                pin_memory=self.pin_memory,
-                merge_by_field_config=model.merge_by_field_config,
+            mm_kwargs,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
+            curr_group_outputs = []
+
+            # EVS-related change.
             # (ekhvedchenia): Temporary hack to limit peak memory usage when
-            # processing multimodal data.This solves the issue with scheduler
+            # processing multimodal data. This solves the issue with scheduler
             # putting too many video samples into a single batch. Scheduler
             # uses pruned vision tokens count to compare it versus compute
             # budget which is incorrect (Either input media size or non-pruned
             # output vision tokens count should be considered)
-            curr_group_outputs = []
-
-            if self.is_multimodal_pruning_enabled and modality == "video":
-                micro_batch_size = 1
-                for i in range(0, num_items, micro_batch_size):
-                    micro_batch_mm_inputs = dict(
-                        (k, v[i:i + micro_batch_size])
-                        for k, v in mm_kwargs_group.items())
+            # TODO(ywang96): Fix memory profiling to take EVS into account and
+            # remove this hack.
+            if (
+                self.is_multimodal_pruning_enabled
+                and modality == "video"
+                and num_items > 1
+            ):
+                for video_mm_kwargs_item in filter(
+                    lambda item: item.modality == "video", mm_kwargs
+                ):
+                    _, _, micro_batch_mm_inputs = next(
+                        group_mm_kwargs_by_modality(
+                            [video_mm_kwargs_item],
+                            device=self.device,
+                            pin_memory=self.pin_memory,
+                            merge_by_field_config=model.merge_by_field_config,
+                            multimodal_cpu_fields=model.multimodal_cpu_fields,
+                        )
+                    )
 
-                    micro_batch_outputs = model.get_multimodal_embeddings(
-                        **micro_batch_mm_inputs)
+                    micro_batch_outputs = model.embed_multimodal(
+                        **micro_batch_mm_inputs
+                    )
 
                     curr_group_outputs.extend(micro_batch_outputs)
             else:
@@ -1587,8 +1976,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # 2. A list or tuple (length: num_items) of tensors,
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
-                curr_group_outputs = model.get_multimodal_embeddings(
-                    **mm_kwargs_group)
+                curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
 
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
@@ -1602,22 +1990,30 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 output,
                 is_embed=pos_info.is_embed,
             )
+            logger.debug("Finish execute for mm hash %s", mm_hash)
+            self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash)
 
     def _gather_mm_embeddings(
         self,
         scheduler_output: "SchedulerOutput",
         shift_computed_tokens: int = 0,
-    ) -> list[torch.Tensor]:
+    ) -> tuple[list[torch.Tensor], torch.Tensor]:
+        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+
+        mm_embeds = list[torch.Tensor]()
+        is_mm_embed = self.is_mm_embed.cpu
+        is_mm_embed[:total_num_scheduled_tokens] = False
+
+        req_start_idx = 0
         should_sync_mrope_positions = False
-        mm_embeds: list[torch.Tensor] = []
+
         for req_id in self.input_batch.req_ids:
             mm_embeds_req: list[torch.Tensor] = []
 
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
-                req_id]
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             req_state = self.requests[req_id]
-            num_computed_tokens = \
-                req_state.num_computed_tokens + shift_computed_tokens
+            num_computed_tokens = req_state.num_computed_tokens + shift_computed_tokens
+
             for mm_feature in req_state.mm_features:
                 pos_info = mm_feature.mm_position
                 start_pos = pos_info.offset
@@ -1644,12 +2040,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
                 mm_hash = mm_feature.identifier
                 encoder_output = self.encoder_cache.get(mm_hash, None)
-                assert encoder_output is not None,\
-                    f"Encoder cache miss for {mm_hash}."
+                assert encoder_output is not None, f"Encoder cache miss for {mm_hash}."
 
                 if (is_embed := pos_info.is_embed) is not None:
                     is_embed = is_embed[start_idx:end_idx]
 
+                req_start_pos = req_start_idx + start_pos - num_computed_tokens
+                is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = (
+                    True if is_embed is None else is_embed
+                )
+
                 mm_embeds_item = gather_mm_placeholders(
                     encoder_output[start_idx:end_idx],
                     is_embed=is_embed,
@@ -1657,6 +2057,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 mm_embeds_req.append(mm_embeds_item)
 
             if self.is_multimodal_pruning_enabled and self.uses_mrope:
+                assert req_state.mrope_positions is not None
                 should_sync_mrope_positions = True
                 mm_embeds_req, new_mrope_positions, new_delta = (
                     self.model.recompute_mrope_positions(
@@ -1664,19 +2065,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         multimodal_embeddings=mm_embeds_req,
                         mrope_positions=req_state.mrope_positions,
                         num_computed_tokens=req_state.num_computed_tokens,
-                    ))
-                assert req_state.mrope_positions is not None
+                    )
+                )
                 req_state.mrope_positions.copy_(new_mrope_positions)
                 req_state.mrope_position_delta = new_delta
 
             mm_embeds.extend(mm_embeds_req)
+            req_start_idx += num_scheduled_tokens
+
+        is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens)
 
         if should_sync_mrope_positions:
             self._calc_mrope_positions(scheduler_output)
-            self.mrope_positions.copy_to_gpu(
-                scheduler_output.total_num_scheduled_tokens)
+            self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens)
 
-        return mm_embeds
+        return mm_embeds, is_mm_embed
 
     def _extract_encoder_inputs(
         self,
@@ -1697,10 +2100,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model = cast(SupportsMultiModal, self.model)
         encoder_features = {}
         for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
-                mm_kwargs,
-                device=self.device,
-                pin_memory=self.pin_memory,
-                merge_by_field_config=model.merge_by_field_config,
+            mm_kwargs,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             # Add the grouped features to encoder_features dict
             # This allows the model to receive them as kwargs (e.g.,
@@ -1737,21 +2141,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         supported_tasks = list(model.pooler.get_supported_tasks())
 
-        if (self.scheduler_config.chunked_prefill_enabled
-                and "encode" in supported_tasks):
-            supported_tasks.remove("encode")
-
-            logger.debug_once("Chunked prefill is not supported with "
-                              "encode task which using ALL pooling. "
-                              "Please turn off chunked prefill by "
-                              "`--no-enable-chunked-prefill` before using it.")
+        if self.scheduler_config.enable_chunked_prefill:
+            if "token_embed" in supported_tasks:
+                supported_tasks.remove("token_embed")
+            if "token_classify" in supported_tasks:
+                supported_tasks.remove("token_classify")
+
+            logger.debug_once(
+                "Chunked prefill is not supported with "
+                "token_embed and token_classify tasks "
+                "which using ALL pooling. "
+                "Please turn off chunked prefill by "
+                "`--no-enable-chunked-prefill` before using it."
+            )
 
         if "score" in supported_tasks:
             num_labels = getattr(self.model_config.hf_config, "num_labels", 0)
             if num_labels != 1:
                 supported_tasks.remove("score")
-                logger.debug_once(
-                    "Score API is only enabled for num_labels == 1.")
+                logger.debug_once("Score API is only enabled for num_labels == 1.")
 
         return supported_tasks
 
@@ -1766,9 +2174,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         return tuple(tasks)
 
     def sync_and_slice_intermediate_tensors(
-            self, num_tokens: int, intermediate_tensors: IntermediateTensors,
-            sync_self: bool) -> IntermediateTensors:
-
+        self,
+        num_tokens: int,
+        intermediate_tensors: IntermediateTensors,
+        sync_self: bool,
+    ) -> IntermediateTensors:
         assert self.intermediate_tensors is not None
 
         tp = self.vllm_config.parallel_config.tensor_parallel_size
@@ -1780,21 +2190,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             assert intermediate_tensors is not None
             for k, v in intermediate_tensors.items():
                 is_scattered = k == "residual" and is_rs
-                copy_len = num_tokens // tp if is_scattered else \
-                    num_tokens
+                copy_len = num_tokens // tp if is_scattered else num_tokens
                 self.intermediate_tensors[k][:copy_len].copy_(
-                    v[:copy_len], non_blocking=True)
-
-        return IntermediateTensors({
-            k:
-            v[:num_tokens //
-              tp] if k == "residual" and is_rs else v[:num_tokens]
-            for k, v in self.intermediate_tensors.items()
-        })
-
-    def eplb_step(self,
-                  is_dummy: bool = False,
-                  is_profile: bool = False) -> None:
+                    v[:copy_len], non_blocking=True
+                )
+
+        return IntermediateTensors(
+            {
+                k: v[: num_tokens // tp]
+                if k == "residual" and is_rs
+                else v[:num_tokens]
+                for k, v in self.intermediate_tensors.items()
+            }
+        )
+
+    def eplb_step(self, is_dummy: bool = False, is_profile: bool = False) -> None:
         """
         Step for the EPLB (Expert Parallelism Load Balancing) state.
         """
@@ -1805,80 +2215,23 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model = self.get_model()
         assert is_mixture_of_experts(model)
         self.eplb_state.step(
-            model,
             is_dummy,
             is_profile,
             log_stats=self.parallel_config.eplb_config.log_balancedness,
         )
 
-    def get_dp_padding(self,
-                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
-        """
-        Determines the total number of tokens that each rank will run.
-        All ranks will be padded out so that they run with the same number
-        of tokens
-
-        Returns: tuple[
-            num_pad_tokens: The number of tokens that will be added to the batch
-            num_tokens_after_padding: A tensor containing the total number of
-            tokens for each DP rank including padding.
-        ]
-        """
-        dp_size = self.vllm_config.parallel_config.data_parallel_size
-        dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-
-        # For DP: Don't pad when setting enforce_eager.
-        # This lets us set enforce_eager on the prefiller in a P/D setup and
-        # still use CUDA graphs (enabled by this padding) on the decoder.
-        #
-        # TODO(tms) : There are many cases where padding is enabled for
-        # prefills, causing unnecessary and excessive padding of activations.
-
-        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
-            # Early exit.
-            return 0, None
-
-        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
-            num_tokens, dp_size, dp_rank)
-        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
-                                                dp_size,
-                                                device="cpu",
-                                                dtype=torch.int32)
-        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
-
-    def get_local_padding(self, num_tokens_unpadded: int) -> int:
-
-        num_tokens_padded = num_tokens_unpadded
-
-        if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]):
-            # Use piecewise CUDA graphs.
-            # Add padding to the batch size.
-            num_tokens_padded = self.vllm_config.pad_for_cudagraph(
-                num_tokens_unpadded)
-        else:
-            # Eager mode.
-            # Pad tokens to multiple of tensor_parallel_size when
-            # enabled collective fusion for SP
-            tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-            if self.vllm_config.compilation_config.pass_config. \
-                enable_sequence_parallelism and tp_size > 1:
-                num_tokens_padded = round_up(num_tokens_unpadded, tp_size)
-
-        num_pad_tokens = num_tokens_padded - num_tokens_unpadded
-        return num_pad_tokens
-
     # This is where the second ubatch is adjusted to account for the padding.
     # Should be called after attention metadata creation. This just pads
     # the second ubatch slice out to the total number of tokens
     # (num_tokens + padding)
-    def pad_out_ubatch_slice(self, ubatch_slices: UBatchSlices,
-                             num_total_tokens: int):
-        padded_second_ubatch_slice = slice(ubatch_slices[1].token_slice.start,
-                                           num_total_tokens)
-        ubatch_slices[1] = UBatchSlice(padded_second_ubatch_slice,
-                                       padded_second_ubatch_slice)
+    @staticmethod
+    def pad_out_ubatch_slice(ubatch_slices: UBatchSlices, num_total_tokens: int):
+        padded_second_ubatch_slice = slice(
+            ubatch_slices[1].token_slice.start, num_total_tokens
+        )
+        ubatch_slices[1] = UBatchSlice(
+            padded_second_ubatch_slice, padded_second_ubatch_slice
+        )
 
     def _pool(
         self,
@@ -1886,16 +2239,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_scheduled_tokens: int,
         num_scheduled_tokens_np: np.ndarray,
     ) -> ModelRunnerOutput:
-        assert self.input_batch.num_reqs ==\
-            len(self.input_batch.pooling_params), \
-        "Either all or none of the requests in" \
-        " a batch must be pooling request"
+        assert self.input_batch.num_reqs == len(self.input_batch.pooling_params), (
+            "Either all or none of the requests in a batch must be pooling request"
+        )
 
         hidden_states = hidden_states[:num_scheduled_tokens]
         pooling_metadata = self.input_batch.get_pooling_metadata()
-        pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(),
-                                              device=hidden_states.device)
-        seq_lens_cpu = self.seq_lens.cpu[:self.input_batch.num_reqs]
+        pooling_metadata.build_pooling_cursor(
+            num_scheduled_tokens_np.tolist(), device=hidden_states.device
+        )
+        seq_lens_cpu = self.seq_lens.cpu[: self.input_batch.num_reqs]
 
         model = cast(VllmModelForPooling, self.model)
         raw_pooler_output: PoolerOutput = model.pooler(
@@ -1908,10 +2261,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         )
         self._sync_device()
 
-        pooler_output: list[Optional[torch.Tensor]] = []
+        pooler_output: list[torch.Tensor | None] = []
         for raw_output, seq_len, prompt_len in zip(
-                raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-
+            raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens
+        ):
             output = raw_output if seq_len == prompt_len else None
             pooler_output.append(output)
 
@@ -1925,11 +2278,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         )
 
     def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int:
-        if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH
-                and hasattr(self, "cudagraph_batch_sizes")
-                and self.cudagraph_batch_sizes
-                and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
+        if (
+            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and hasattr(self, "cudagraph_batch_sizes")
+            and self.cudagraph_batch_sizes
+            and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]
+        ):
             # Use CUDA graphs.
             # Add padding to the batch size.
             return self.vllm_config.pad_for_cudagraph(num_scheduled_tokens)
@@ -1938,51 +2292,57 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Pad tokens to multiple of tensor_parallel_size when
         # enabled collective fusion for SP
         tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-        if (self.compilation_config.pass_config.enable_sequence_parallelism
-                and tp_size > 1):
+        if (
+            self.compilation_config.pass_config.enable_sequence_parallelism
+            and tp_size > 1
+        ):
             return round_up(num_scheduled_tokens, tp_size)
         return num_scheduled_tokens
 
     def _preprocess(
         self,
         scheduler_output: "SchedulerOutput",
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        ubatch_slices: Optional[UBatchSlices] = None,
-        num_tokens_after_padding: Optional[torch.Tensor] = None,
-    ) -> tuple[int, int, Optional[torch.Tensor], Optional[torch.Tensor],
-               Optional[torch.Tensor], torch.Tensor,
-               Optional[IntermediateTensors], dict[str, Any]]:
-
+        num_input_tokens: int,  # Padded
+        intermediate_tensors: IntermediateTensors | None = None,
+    ) -> tuple[
+        torch.Tensor | None,
+        torch.Tensor | None,
+        torch.Tensor,
+        IntermediateTensors | None,
+        dict[str, Any],
+        ECConnectorOutput | None,
+    ]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        if ubatch_slices:
-            assert num_tokens_after_padding is not None
-            num_input_tokens = int(num_tokens_after_padding[0].item() * 2)
-            self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
-        elif ubatch_slices is None:
-            num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens)
-            num_pad, num_tokens_after_padding = self.get_dp_padding(
-                num_input_tokens)
-            num_input_tokens += num_pad
+        is_first_rank = get_pp_group().is_first_rank
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
-        if (self.supports_mm_inputs and get_pp_group().is_first_rank
-                and not self.model_config.is_encoder_decoder):
+        ec_connector_output = None
+
+        if (
+            self.supports_mm_inputs
+            and is_first_rank
+            and not self.model_config.is_encoder_decoder
+        ):
             # Run the multimodal encoder if any.
-            self._execute_mm_encoder(scheduler_output)
-            mm_embeds = self._gather_mm_embeddings(scheduler_output)
+            with self.maybe_get_ec_connector_output(
+                scheduler_output,
+                encoder_cache=self.encoder_cache,
+            ) as ec_connector_output:
+                self._execute_mm_encoder(scheduler_output)
+                mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output)
 
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            inputs_embeds_scheduled = self.model.get_input_embeddings(
-                input_ids=self.input_ids.gpu[:num_scheduled_tokens],
-                multimodal_embeddings=mm_embeds or None,
+            inputs_embeds_scheduled = self.model.embed_input_ids(
+                self.input_ids.gpu[:num_scheduled_tokens],
+                multimodal_embeddings=mm_embeds,
+                is_multimodal=is_mm_embed,
             )
 
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(
-                inputs_embeds_scheduled)
+            self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(inputs_embeds_scheduled)
 
             input_ids = None
             inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
@@ -1990,7 +2350,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 **self._init_model_kwargs(num_scheduled_tokens),
                 **self._extract_mm_kwargs(scheduler_output),
             }
-        elif self.enable_prompt_embeds and get_pp_group().is_first_rank:
+        elif self.enable_prompt_embeds and is_first_rank:
             # Get the input embeddings for the tokens that are not input embeds,
             # then put them into the appropriate positions.
             # TODO(qthequartermasterman): Since even when prompt embeds are
@@ -2003,14 +2363,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # If a batch only has token ids, then including the embedding layer
             # in the CUDA graph will be more performant (like in the else case
             # below).
-            token_ids_idx = self.is_token_ids.gpu[:num_scheduled_tokens] \
-                .nonzero(as_tuple=False) \
+            token_ids_idx = (
+                self.is_token_ids.gpu[:num_scheduled_tokens]
+                .nonzero(as_tuple=False)
                 .squeeze(1)
+            )
             # Some tokens ids may need to become embeds
             if token_ids_idx.numel() > 0:
                 token_ids = self.input_ids.gpu[token_ids_idx]
-                tokens_to_embeds = self.model.get_input_embeddings(
-                    input_ids=token_ids)
+                tokens_to_embeds = self.model.embed_input_ids(input_ids=token_ids)
                 self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds
 
             inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
@@ -2029,87 +2390,78 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         else:
             positions = self.positions.gpu[:num_input_tokens]
 
-        if get_pp_group().is_first_rank:
+        if is_first_rank:
             intermediate_tensors = None
         else:
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                num_input_tokens, intermediate_tensors, True)
+                num_input_tokens, intermediate_tensors, True
+            )
 
-        if (self.model_config.is_encoder_decoder
-                and scheduler_output.scheduled_encoder_inputs):
+        if (
+            self.model_config.is_encoder_decoder
+            and scheduler_output.scheduled_encoder_inputs
+        ):
             encoder_inputs = self._extract_encoder_inputs(scheduler_output)
             model_kwargs.update(encoder_inputs)
 
         return (
-            num_scheduled_tokens,
-            num_input_tokens,
-            num_tokens_after_padding,
             input_ids,
             inputs_embeds,
             positions,
             intermediate_tensors,
             model_kwargs,
+            ec_connector_output,
         )
 
     def _sample(
-            self, logits: Optional[torch.Tensor],
-            spec_decode_metadata: Optional[SpecDecodeMetadata]
+        self,
+        logits: torch.Tensor | None,
+        spec_decode_metadata: SpecDecodeMetadata | None,
     ) -> SamplerOutput:
         # Sample the next token and get logprobs if needed.
         sampling_metadata = self.input_batch.sampling_metadata
         if spec_decode_metadata is None:
-            sampler_output = self.sampler(
+            # Update output token ids with tokens sampled in last step
+            # if async scheduling and required by current sampling params.
+            self.input_batch.update_async_output_token_ids()
+            return self.sampler(
                 logits=logits,
                 sampling_metadata=sampling_metadata,
             )
-        else:
-            # When indexing with a tensor (bonus_logits_indices), PyTorch
-            # creates a new tensor with separate storage from the original
-            # logits tensor. This means any in-place operations on bonus_logits
-            # won't affect the original logits tensor.
-            assert logits is not None
-            bonus_logits = logits[spec_decode_metadata.bonus_logits_indices]
-            sampler_output = self.sampler(
-                logits=bonus_logits,
-                sampling_metadata=sampling_metadata,
-            )
-            bonus_token_ids = sampler_output.sampled_token_ids
-
-            # Just like `bonus_logits`, `target_logits` is a new tensor with
-            # separate storage from the original `logits` tensor. Therefore,
-            # it is safe to update `target_logits` in place.
-            target_logits = logits[spec_decode_metadata.target_logits_indices]
-            output_token_ids = self.rejection_sampler(
-                spec_decode_metadata,
-                None,  # draft_probs
-                target_logits,
-                bonus_token_ids,
-                sampling_metadata,
-            )
-            sampler_output.sampled_token_ids = output_token_ids
-            self._update_states_after_model_execute(output_token_ids)
 
+        sampler_output = self.rejection_sampler(
+            spec_decode_metadata,
+            None,  # draft_probs
+            logits,
+            sampling_metadata,
+        )
+        self._update_states_after_model_execute(sampler_output.sampled_token_ids)
         return sampler_output
 
     def _bookkeeping_sync(
-        self, scheduler_output: "SchedulerOutput",
-        sampler_output: SamplerOutput, logits: Optional[torch.Tensor],
-        hidden_states: torch.Tensor, num_scheduled_tokens: int
+        self,
+        scheduler_output: "SchedulerOutput",
+        sampler_output: SamplerOutput,
+        logits: torch.Tensor | None,
+        hidden_states: torch.Tensor,
+        num_scheduled_tokens: int,
+        spec_decode_metadata: SpecDecodeMetadata | None,
     ) -> tuple[
-            dict[str, int],
-            Optional[LogprobsLists],
-            list[list[int]],
-            dict[str, Optional[LogprobsTensors]],
-            list[str],
-            dict[str, int],
-            list[int],
+        dict[str, int],
+        LogprobsLists | None,
+        list[np.ndarray],
+        dict[str, LogprobsTensors | None],
+        list[str],
+        dict[str, int],
+        list[int],
     ]:
         num_nans_in_logits = {}
         if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
             num_nans_in_logits = self._get_nans_in_logits(logits)
 
-        discard_sampled_tokens_req_indices = \
-            self.discard_request_indices.np[:self.num_discarded_requests]
+        discard_sampled_tokens_req_indices = self.discard_request_indices.np[
+            : self.num_discarded_requests
+        ]
         for i in discard_sampled_tokens_req_indices:
             gen = self.input_batch.generators.get(int(i))
             if gen is not None:
@@ -2118,24 +2470,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Copy some objects so they don't get modified after returning.
         # This is important when using async scheduling.
         req_ids_output_copy = self.input_batch.req_ids.copy()
-        req_id_to_index_output_copy = \
-            self.input_batch.req_id_to_index.copy()
-
-        # NOTE: GPU -> CPU Sync happens here.
-        # Move as many CPU operations as possible before this sync point.
-        logprobs_tensors = sampler_output.logprobs_tensors
-        logprobs_lists = logprobs_tensors.tolists() \
-            if logprobs_tensors is not None else None
-
-        # Compute prompt logprobs if needed.
-        prompt_logprobs_dict = self._get_prompt_logprobs_dict(
-            hidden_states[:num_scheduled_tokens],
-            scheduler_output.num_scheduled_tokens,
-        )
+        req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy()
 
         num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
         sampled_token_ids = sampler_output.sampled_token_ids
         invalid_req_indices = []
+        valid_sampled_token_ids: list[np.ndarray]
         if not self.use_async_scheduling:
             # Get the valid generated tokens.
             max_gen_len = sampled_token_ids.shape[-1]
@@ -2150,20 +2490,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 )
             # Mask out the sampled tokens that should not be sampled.
             for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[int(i)].clear()
+                valid_sampled_token_ids[int(i)] = np.array([])
         else:
             valid_sampled_token_ids = []
             invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
             invalid_req_indices_set = set(invalid_req_indices)
-            assert sampled_token_ids.shape[-1] == 1
 
             # Cache the sampled tokens on the GPU and avoid CPU sync.
             # These will be copied into input_ids in the next step
             # when preparing inputs.
-            self.input_batch.prev_sampled_token_ids = \
-                sampled_token_ids
-            self.input_batch.prev_sampled_token_ids_invalid_indices = \
-                invalid_req_indices_set
+            # With spec decoding, this is done in propose_draft_token_ids().
+            if self.input_batch.prev_sampled_token_ids is None:
+                assert sampled_token_ids.shape[-1] == 1
+                self.input_batch.prev_sampled_token_ids = sampled_token_ids
             self.input_batch.prev_req_id_to_index = {
                 req_id: i
                 for i, req_id in enumerate(self.input_batch.req_ids)
@@ -2176,24 +2515,40 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # the sampled tokens back, because there's no direct communication
         # between the first-stage worker and the last-stage worker.
         req_ids = self.input_batch.req_ids
+        logprobs_tensors = sampler_output.logprobs_tensors
+        cu_num_accepted_tokens = (
+            [0] if spec_decode_metadata and logprobs_tensors else None
+        )
         for req_idx in range(num_sampled_tokens):
+            sampled_ids: np.ndarray | None
             if self.use_async_scheduling:
-                sampled_ids = [-1] if \
-                    req_idx not in invalid_req_indices_set else None
+                sampled_ids = (
+                    np.array([-1]) if req_idx not in invalid_req_indices_set else None
+                )
             else:
                 sampled_ids = valid_sampled_token_ids[req_idx]
-            if not sampled_ids:
+
+            num_sampled_ids: int = (
+                sampled_ids.shape[0] if sampled_ids is not None else 0
+            )
+
+            if cu_num_accepted_tokens is not None:
+                cu_num_accepted_tokens.append(
+                    cu_num_accepted_tokens[-1] + num_sampled_ids
+                )
+
+            if sampled_ids is None or num_sampled_ids == 0:
                 continue
 
             start_idx = self.input_batch.num_tokens_no_spec[req_idx]
-            end_idx = start_idx + len(sampled_ids)
+            end_idx = start_idx + num_sampled_ids
             assert end_idx <= self.max_model_len, (
                 "Sampled token IDs exceed the max model length. "
                 f"Total number of tokens: {end_idx} > max_model_len: "
-                f"{self.max_model_len}")
+                f"{self.max_model_len}"
+            )
 
-            self.input_batch.token_ids_cpu[req_idx,
-                                           start_idx:end_idx] = sampled_ids
+            self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
             self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
             self.input_batch.num_tokens[req_idx] = end_idx
@@ -2202,6 +2557,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             req_state = self.requests[req_id]
             req_state.output_token_ids.extend(sampled_ids)
 
+        logprobs_lists = (
+            logprobs_tensors.tolists(cu_num_accepted_tokens)
+            if not self.use_async_scheduling and logprobs_tensors is not None
+            else None
+        )
+
+        # Compute prompt logprobs if needed.
+        prompt_logprobs_dict = self._get_prompt_logprobs_dict(
+            hidden_states[:num_scheduled_tokens],
+            scheduler_output.num_scheduled_tokens,
+        )
+
         return (
             num_nans_in_logits,
             logprobs_lists,
@@ -2227,64 +2594,197 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         finally:
             self.prepare_inputs_event.record()
 
+    def _model_forward(
+        self,
+        input_ids: torch.Tensor | None = None,
+        positions: torch.Tensor | None = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **model_kwargs: dict[str, Any],
+    ) -> Any:
+        """Helper method to call the model forward pass.
+
+        This method can be overridden by subclasses for model execution.
+        Motivation: We can inspect only this method versus
+        the whole execute_model, which has additional logic.
+
+        Args:
+            input_ids: Input token IDs
+            positions: Token positions
+            intermediate_tensors: Tensors from previous pipeline stages
+            inputs_embeds: Input embeddings (alternative to input_ids)
+            **model_kwargs: Additional model arguments
+
+        Returns:
+            Model output tensor
+        """
+        return self.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **model_kwargs,
+        )
+
     @torch.inference_mode()
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
-        with record_function_or_nullcontext("Preprocess"):
+        intermediate_tensors: IntermediateTensors | None = None,
+    ) -> ModelRunnerOutput | IntermediateTensors | None:
+        if self.execute_model_state is not None:
+            raise RuntimeError(
+                "State error: sample_tokens() must be called "
+                "after execute_model() returns None."
+            )
+
+        # self._draft_token_ids is None when `input_fits_in_drafter=False`
+        # and there is no draft tokens scheduled. so it need to update the
+        # spec_decoding info in scheduler_output with async_scheduling.
+        # use deepcopy to avoid the modification has influence on the
+        # scheduler_output in engine core process.
+        # TODO(Ronald1995): deepcopy is expensive when there is a large
+        # number of requests, optimize it later.
+        if (
+            self.use_async_scheduling
+            and self.num_spec_tokens
+            and self._draft_token_ids is None
+        ):
+            scheduler_output = deepcopy(scheduler_output)
+
+        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+        with record_function_or_nullcontext("gpu_model_runner: preprocess"):
             with self.synchronize_input_prep():
                 # Update persistent batch states.
                 self._update_states(scheduler_output)
 
-                if not scheduler_output.total_num_scheduled_tokens:
+                if has_ec_transfer() and get_ec_transfer().is_producer:
+                    with self.maybe_get_ec_connector_output(
+                        scheduler_output,
+                        encoder_cache=self.encoder_cache,
+                    ) as ec_connector_output:
+                        self._execute_mm_encoder(scheduler_output)
+                        return make_empty_encoder_model_runner_output(scheduler_output)
+
+                if not num_scheduled_tokens:
+                    if (
+                        self.parallel_config.distributed_executor_backend
+                        == "external_launcher"
+                        and self.parallel_config.data_parallel_size > 1
+                    ):
+                        # this is a corner case when both external launcher
+                        # and DP are enabled, num_scheduled_tokens could be
+                        # 0, and has_unfinished_requests in the outer loop
+                        # returns True. before returning early here we call
+                        # dummy run to ensure coordinate_batch_across_dp
+                        # is called into to avoid out of sync issues.
+                        self._dummy_run(1)
                     if not has_kv_transfer_group():
                         # Return empty ModelRunnerOutput if no work to do.
                         return EMPTY_MODEL_RUNNER_OUTPUT
                     return self.kv_connector_no_forward(
-                        scheduler_output, self.vllm_config)
+                        scheduler_output, self.vllm_config
+                    )
                 if self.cache_config.kv_sharing_fast_prefill:
                     assert not self.input_batch.num_prompt_logprobs, (
                         "--kv-sharing-fast-prefill produces incorrect "
                         "logprobs for prompt tokens, tokens, please disable "
-                        "it when the requests need prompt logprobs")
+                        "it when the requests need prompt logprobs"
+                    )
 
-                # Prepare the decoder inputs.
-                (attn_metadata, logits_indices, spec_decode_metadata,
-                 num_scheduled_tokens_np, spec_decode_common_attn_metadata,
-                 max_query_len, ubatch_slices, num_tokens_after_padding
-                 ) = self._prepare_inputs(scheduler_output)
+                num_reqs = self.input_batch.num_reqs
+                req_ids = self.input_batch.req_ids
+                tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
+                num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
+                max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
 
-            (
-                num_scheduled_tokens,
-                num_input_tokens,
-                num_tokens_across_dp,
-                input_ids,
-                inputs_embeds,
-                positions,
-                intermediate_tensors,
-                model_kwargs,
-            ) = self._preprocess(scheduler_output, intermediate_tensors,
-                                 ubatch_slices, num_tokens_after_padding)
-
-            uniform_decode = (max_query_len
-                              == self.uniform_decode_query_len) and (
-                                  num_scheduled_tokens
-                                  == self.input_batch.num_reqs * max_query_len)
-            batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
-                                               uniform_decode=uniform_decode)
-            cudagraph_runtime_mode, batch_descriptor = \
-                self.cudagraph_dispatcher.dispatch(batch_descriptor)
-
-        # This is currently to get around the assert in the DPMetadata
-        # where it wants `num_tokens_across_dp` to align with `num_tokens`
-        if ubatch_slices is not None:
-            num_input_tokens = ubatch_slices[0].num_tokens
+                (
+                    logits_indices,
+                    spec_decode_metadata,
+                    ubatch_slices,
+                    num_tokens_across_dp,
+                ) = self._prepare_inputs(
+                    scheduler_output, num_scheduled_tokens_np, max_num_scheduled_tokens
+                )
+
+                cascade_attn_prefix_lens = None
+                # Disable cascade attention when using microbatching (DBO)
+                if self.cascade_attn_enabled and ubatch_slices is None:
+                    # Pre-compute cascade attention prefix lengths
+                    # NOTE: Must be AFTER _prepare_inputs uses self.input_batch state
+                    cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
+                        num_scheduled_tokens_np,
+                        scheduler_output.num_common_prefix_blocks,
+                    )
+
+                # TODO(lucas): move cudagraph dispatching here:
+                #   https://github.com/vllm-project/vllm/issues/23789
+
+                total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
+                attn_metadata, spec_decode_common_attn_metadata = (
+                    self._build_attention_metadata(
+                        total_num_scheduled_tokens=total_num_scheduled_tokens,
+                        max_num_scheduled_tokens=max_num_scheduled_tokens,
+                        num_reqs=num_reqs,
+                        ubatch_slices=ubatch_slices,
+                        logits_indices=logits_indices,
+                        use_spec_decode=use_spec_decode,
+                        scheduled_encoder_inputs=scheduler_output.scheduled_encoder_inputs,
+                        cascade_attn_prefix_lens=cascade_attn_prefix_lens,
+                    )
+                )
+
+                dp_rank = self.parallel_config.data_parallel_rank
+                if ubatch_slices:
+                    assert num_tokens_across_dp is not None
+                    num_input_tokens = int(num_tokens_across_dp[dp_rank].item())
+                    self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
+                elif num_tokens_across_dp is not None:
+                    num_input_tokens = int(num_tokens_across_dp[dp_rank].item())
+                else:
+                    num_input_tokens = self._get_num_input_tokens(
+                        scheduler_output.total_num_scheduled_tokens
+                    )
+
+                (
+                    input_ids,
+                    inputs_embeds,
+                    positions,
+                    intermediate_tensors,
+                    model_kwargs,
+                    ec_connector_output,
+                ) = self._preprocess(
+                    scheduler_output, num_input_tokens, intermediate_tensors
+                )
+
+            uniform_decode = (
+                max_num_scheduled_tokens == self.uniform_decode_query_len
+            ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
+            batch_descriptor = BatchDescriptor(
+                num_tokens=num_input_tokens,
+                uniform_decode=uniform_decode,
+                has_lora=len(self.input_batch.lora_id_to_lora_request) > 0,
+            )
+            cudagraph_runtime_mode, batch_descriptor = (
+                self.cudagraph_dispatcher.dispatch(
+                    batch_descriptor,
+                    use_cascade_attn=cascade_attn_prefix_lens is not None,
+                )
+            )
+
+        # Set cudagraph mode to none if calc_kv_scales is true.
+        # KV scales calculation involves dynamic operations that are incompatible
+        # with CUDA graph capture.
+        if self.calculate_kv_scales:
+            cudagraph_runtime_mode = CUDAGraphMode.NONE
+            # Mark KV scales as calculated after the first forward pass
+            self.calculate_kv_scales = False
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
-        with (set_forward_context(
+        with (
+            set_forward_context(
                 attn_metadata,
                 self.vllm_config,
                 num_tokens=num_input_tokens,
@@ -2292,10 +2792,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 cudagraph_runtime_mode=cudagraph_runtime_mode,
                 batch_descriptor=batch_descriptor,
                 ubatch_slices=ubatch_slices,
-        ), record_function_or_nullcontext("Forward"),
-              self.maybe_get_kv_connector_output(scheduler_output) as
-              kv_connector_output):
-            model_output = self.model(
+            ),
+            record_function_or_nullcontext("gpu_model_runner: forward"),
+            self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
+        ):
+            model_output = self._model_forward(
                 input_ids=input_ids,
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
@@ -2303,7 +2804,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 **model_kwargs,
             )
 
-        with record_function_or_nullcontext("Postprocess"):
+        with record_function_or_nullcontext("gpu_model_runner: postprocess"):
             if self.use_aux_hidden_state_outputs:
                 # True when EAGLE 3 is used.
                 hidden_states, aux_hidden_states = model_output
@@ -2318,12 +2819,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     # Return the intermediate tensors.
                     assert isinstance(hidden_states, IntermediateTensors)
                     hidden_states.kv_connector_output = kv_connector_output
+                    self.kv_connector_output = kv_connector_output
                     return hidden_states
 
                 if self.is_pooling_model:
                     # Return the pooling output.
-                    output = self._pool(hidden_states, num_scheduled_tokens,
-                                        num_scheduled_tokens_np)
+                    output = self._pool(
+                        hidden_states, num_scheduled_tokens, num_scheduled_tokens_np
+                    )
                     output.kv_connector_output = kv_connector_output
                     return output
 
@@ -2333,42 +2836,96 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # Rare case.
                 assert not self.is_pooling_model
 
+                sample_hidden_states = hidden_states[logits_indices]
                 if not get_pp_group().is_last_rank:
                     all_gather_tensors = {
-                        "residual":
-                        not is_residual_scattered_for_sp(
-                            self.vllm_config, num_input_tokens)
+                        "residual": not is_residual_scattered_for_sp(
+                            self.vllm_config, num_input_tokens
+                        )
                     }
                     get_pp_group().send_tensor_dict(
                         hidden_states.tensors,
                         all_gather_group=get_tp_group(),
-                        all_gather_tensors=all_gather_tensors)
+                        all_gather_tensors=all_gather_tensors,
+                    )
                     logits = None
                 else:
-                    sample_hidden_states = hidden_states[logits_indices]
                     logits = self.model.compute_logits(sample_hidden_states)
 
                 model_output_broadcast_data = {}
                 if logits is not None:
                     model_output_broadcast_data["logits"] = logits.contiguous()
 
-                model_output_broadcast_data = get_pp_group(
-                ).broadcast_tensor_dict(model_output_broadcast_data,
-                                        src=len(get_pp_group().ranks) - 1)
+                model_output_broadcast_data = get_pp_group().broadcast_tensor_dict(
+                    model_output_broadcast_data, src=len(get_pp_group().ranks) - 1
+                )
                 assert model_output_broadcast_data is not None
                 logits = model_output_broadcast_data["logits"]
 
-            # Apply structured output bitmasks if present
-            if scheduler_output.grammar_bitmask is not None:
-                apply_grammar_bitmask(scheduler_output, self.input_batch,
-                                      logits, self.device)
+        self.execute_model_state = ExecuteModelState(
+            scheduler_output,
+            logits,
+            spec_decode_metadata,
+            spec_decode_common_attn_metadata,
+            hidden_states,
+            sample_hidden_states,
+            aux_hidden_states,
+            ec_connector_output,
+        )
+        self.kv_connector_output = kv_connector_output
+        return None
+
+    @torch.inference_mode
+    def sample_tokens(
+        self, grammar_output: "GrammarOutput | None"
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
+        kv_connector_output = self.kv_connector_output
+        self.kv_connector_output = None
+
+        if self.execute_model_state is None:
+            # Nothing to do (PP non-final rank case), output isn't used.
+            if not kv_connector_output:
+                return None  # noqa
+
+            # In case of PP with kv transfer, we need to pass through the
+            # kv_connector_output
+            if kv_connector_output.is_empty():
+                return EMPTY_MODEL_RUNNER_OUTPUT
+
+            output = copy(EMPTY_MODEL_RUNNER_OUTPUT)
+            output.kv_connector_output = kv_connector_output
+            return output
+
+        # Unpack ephemeral state.
+        (
+            scheduler_output,
+            logits,
+            spec_decode_metadata,
+            spec_decode_common_attn_metadata,
+            hidden_states,
+            sample_hidden_states,
+            aux_hidden_states,
+            ec_connector_output,
+        ) = self.execute_model_state
+        # Clear ephemeral state.
+        self.execute_model_state = None
+
+        # Apply structured output bitmasks if present.
+        if grammar_output is not None:
+            apply_grammar_bitmask(
+                scheduler_output, grammar_output, self.input_batch, logits
+            )
 
-        with record_function_or_nullcontext("Sample"):
+        with record_function_or_nullcontext("gpu_model_runner: sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
 
-        def propose_draft_token_ids(sampled_token_ids):
+        self.input_batch.prev_sampled_token_ids = None
+
+        def propose_draft_token_ids(
+            sampled_token_ids: torch.Tensor | list[np.ndarray],
+        ) -> None:
             assert spec_decode_common_attn_metadata is not None
-            with record_function_or_nullcontext("Draft"):
+            with record_function_or_nullcontext("gpu_model_runner: draft"):
                 self._draft_token_ids = self.propose_draft_token_ids(
                     scheduler_output,
                     sampled_token_ids,
@@ -2380,28 +2937,48 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     spec_decode_common_attn_metadata,
                 )
 
-        use_padded_batch_for_eagle = self.speculative_config and \
-            self.speculative_config.use_eagle() and \
-            not self.speculative_config.disable_padded_drafter_batch
+        use_padded_batch_for_eagle = (
+            self.speculative_config
+            and self.speculative_config.use_eagle()
+            and not self.speculative_config.disable_padded_drafter_batch
+        )
         effective_drafter_max_model_len = self.max_model_len
         if effective_drafter_max_model_len is None:
             effective_drafter_max_model_len = self.model_config.max_model_len
-        if (self.speculative_config
-                and self.speculative_config.draft_model_config is not None
-                and self.speculative_config.draft_model_config.max_model_len
-                is not None):
+        if (
+            self.speculative_config
+            and self.speculative_config.draft_model_config is not None
+            and self.speculative_config.draft_model_config.max_model_len is not None
+        ):
             effective_drafter_max_model_len = (
-                self.speculative_config.draft_model_config.max_model_len)
+                self.speculative_config.draft_model_config.max_model_len
+            )
         input_fits_in_drafter = spec_decode_common_attn_metadata and (
-            spec_decode_common_attn_metadata.seq_lens.max() +
-            self.speculative_config.num_speculative_tokens
-            <= effective_drafter_max_model_len)
-        if use_padded_batch_for_eagle and input_fits_in_drafter:
-            # EAGLE speculative decoding can use the GPU sampled tokens
-            # as inputs, and does not need to wait for bookkeeping to finish.
-            propose_draft_token_ids(sampler_output.sampled_token_ids)
-
-        with record_function_or_nullcontext("Bookkeep"):
+            spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens
+            <= effective_drafter_max_model_len
+        )
+        if use_padded_batch_for_eagle:
+            sampled_token_ids = sampler_output.sampled_token_ids
+            if input_fits_in_drafter:
+                # EAGLE speculative decoding can use the GPU sampled tokens
+                # as inputs, and does not need to wait for bookkeeping to finish.
+                propose_draft_token_ids(sampled_token_ids)
+            elif self.valid_sampled_token_count_event is not None:
+                next_token_ids, valid_sampled_tokens_count = (
+                    self.drafter.prepare_next_token_ids_padded(
+                        spec_decode_common_attn_metadata,
+                        sampled_token_ids,
+                        self.requests,
+                        self.input_batch,
+                        self.discard_request_indices.gpu,
+                        self.num_discarded_requests,
+                    )
+                )
+                self._copy_valid_sampled_token_count(
+                    next_token_ids, valid_sampled_tokens_count
+                )
+
+        with record_function_or_nullcontext("gpu_model_runner: bookkeep"):
             (
                 num_nans_in_logits,
                 logprobs_lists,
@@ -2410,41 +2987,67 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 req_ids_output_copy,
                 req_id_to_index_output_copy,
                 invalid_req_indices,
-            ) = self._bookkeeping_sync(scheduler_output, sampler_output,
-                                       logits, hidden_states,
-                                       num_scheduled_tokens)
+            ) = self._bookkeeping_sync(
+                scheduler_output,
+                sampler_output,
+                logits,
+                hidden_states,
+                scheduler_output.total_num_scheduled_tokens,
+                spec_decode_metadata,
+            )
 
-        if (self.speculative_config and not use_padded_batch_for_eagle
-                and input_fits_in_drafter):
+        if (
+            self.speculative_config
+            and not use_padded_batch_for_eagle
+            and input_fits_in_drafter
+        ):
             # ngram and other speculative decoding methods use the sampled
             # tokens on the CPU, so they are run after bookkeeping.
             propose_draft_token_ids(valid_sampled_token_ids)
 
-        with record_function_or_nullcontext("EPLB"):
+        with record_function_or_nullcontext("gpu_model_runner: eplb"):
             self.eplb_step()
-
-        output = ModelRunnerOutput(
-            req_ids=req_ids_output_copy,
-            req_id_to_index=req_id_to_index_output_copy,
-            sampled_token_ids=valid_sampled_token_ids,
-            logprobs=logprobs_lists,
-            prompt_logprobs_dict=prompt_logprobs_dict,
-            pooler_output=[],
-            kv_connector_output=kv_connector_output,
-            num_nans_in_logits=num_nans_in_logits,
-        )
+        with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"):
+            output = ModelRunnerOutput(
+                req_ids=req_ids_output_copy,
+                req_id_to_index=req_id_to_index_output_copy,
+                sampled_token_ids=valid_sampled_token_ids,
+                logprobs=logprobs_lists,
+                prompt_logprobs_dict=prompt_logprobs_dict,
+                pooler_output=[],
+                kv_connector_output=kv_connector_output,
+                ec_connector_output=ec_connector_output
+                if self.supports_mm_inputs
+                else None,
+                num_nans_in_logits=num_nans_in_logits,
+            )
 
         if not self.use_async_scheduling:
             return output
+        with record_function_or_nullcontext(
+            "gpu_model_runner: AsyncGPUModelRunnerOutput"
+        ):
+            async_output = AsyncGPUModelRunnerOutput(
+                model_runner_output=output,
+                sampled_token_ids=sampler_output.sampled_token_ids,
+                logprobs_tensors=sampler_output.logprobs_tensors,
+                invalid_req_indices=invalid_req_indices,
+                async_output_copy_stream=self.async_output_copy_stream,
+                vocab_size=self.input_batch.vocab_size,
+            )
+        with record_function_or_nullcontext(
+            "gpu_model_runner: set_async_sampled_token_ids"
+        ):
+            # Save ref of sampled_token_ids CPU tensor if the batch contains
+            # any requests with sampling params that require output ids.
+            self.input_batch.set_async_sampled_token_ids(
+                async_output.sampled_token_ids_cpu,
+                async_output.async_copy_ready_event,
+            )
 
-        return AsyncGPUModelRunnerOutput(
-            model_runner_output=output,
-            sampled_token_ids=sampler_output.sampled_token_ids,
-            invalid_req_indices=invalid_req_indices,
-            async_output_copy_stream=self.async_output_copy_stream,
-        )
+        return async_output
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         if self._draft_token_ids is None:
             return None
         req_ids = self.input_batch.req_ids
@@ -2455,26 +3058,63 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self._draft_token_ids = None
         return DraftTokenIds(req_ids, draft_token_ids)
 
+    def _copy_valid_sampled_token_count(
+        self, next_token_ids: torch.Tensor, valid_sampled_tokens_count: torch.Tensor
+    ) -> None:
+        if self.valid_sampled_token_count_event is None:
+            return
+
+        default_stream = torch.cuda.current_stream()
+        # Initialize a new stream to overlap the copy operation with
+        # prepare_input of draft model.
+        with torch.cuda.stream(self.valid_sampled_token_count_copy_stream):
+            self.valid_sampled_token_count_copy_stream.wait_stream(default_stream)  # type: ignore
+            counts = valid_sampled_tokens_count
+            counts_cpu = self.valid_sampled_token_count_cpu
+            counts_cpu[: counts.shape[0]].copy_(counts, non_blocking=True)
+            self.valid_sampled_token_count_event.record()
+
+        self.input_batch.prev_sampled_token_ids = next_token_ids.unsqueeze(1)
+
+    def _get_valid_sampled_token_count(self) -> list[int]:
+        # Wait until valid_sampled_tokens_count is copied to cpu,
+        prev_sampled_token_ids = self.input_batch.prev_sampled_token_ids
+        if (
+            self.valid_sampled_token_count_event is None
+            or prev_sampled_token_ids is None
+        ):
+            return []
+
+        counts_cpu = self.valid_sampled_token_count_cpu
+        self.valid_sampled_token_count_event.synchronize()
+        return counts_cpu[: prev_sampled_token_ids.shape[0]].tolist()
+
     def propose_draft_token_ids(
         self,
         scheduler_output: "SchedulerOutput",
-        sampled_token_ids: Union[torch.Tensor, list[list[int]]],
+        sampled_token_ids: torch.Tensor | list[np.ndarray],
         sampling_metadata: SamplingMetadata,
         hidden_states: torch.Tensor,
         sample_hidden_states: torch.Tensor,
-        aux_hidden_states: Optional[list[torch.Tensor]],
-        spec_decode_metadata: Optional[SpecDecodeMetadata],
+        aux_hidden_states: list[torch.Tensor] | None,
+        spec_decode_metadata: SpecDecodeMetadata | None,
         common_attn_metadata: CommonAttentionMetadata,
-    ) -> Union[list[list[int]], torch.Tensor]:
+    ) -> torch.Tensor | list[list[int]]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if self.speculative_config.method == "ngram":
             assert isinstance(sampled_token_ids, list)
             assert isinstance(self.drafter, NgramProposer)
             draft_token_ids = self.drafter.propose(
-                sampled_token_ids, self.input_batch.req_ids,
+                sampled_token_ids,
+                self.input_batch.req_ids,
                 self.input_batch.num_tokens_no_spec,
                 self.input_batch.token_ids_cpu,
-                self.input_batch.spec_decode_unsupported_reqs)
+                self.input_batch.spec_decode_unsupported_reqs,
+            )
+        elif self.speculative_config.method == "suffix":
+            assert isinstance(sampled_token_ids, list)
+            assert isinstance(self.drafter, SuffixDecodingProposer)
+            draft_token_ids = self.drafter.propose(self.input_batch, sampled_token_ids)
         elif self.speculative_config.method == "medusa":
             assert isinstance(sampled_token_ids, list)
             assert isinstance(self.drafter, MedusaProposer)
@@ -2485,11 +3125,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             else:
                 indices = []
                 offset = 0
-                assert spec_decode_metadata is not None
+                assert spec_decode_metadata is not None, (
+                    "No spec decode metadata for medusa"
+                )
                 for num_draft, tokens in zip(
-                        spec_decode_metadata.num_draft_tokens,
-                        sampled_token_ids):
-                    indices.append(offset + len(tokens) - 1)
+                    spec_decode_metadata.num_draft_tokens, sampled_token_ids
+                ):
+                    indices.append(offset + tokens.shape[0] - 1)
                     offset += num_draft + 1
                 indices = torch.tensor(indices, device=self.device)
                 hidden_states = sample_hidden_states[indices]
@@ -2505,72 +3147,85 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # When padded-batch is disabled, the sampled_token_ids should be
                 # the cpu-side list[list[int]] of valid sampled tokens for each
                 # request, with invalid requests having empty lists.
-                assert isinstance(sampled_token_ids, list), \
-                    "sampled_token_ids should be a python list when" \
+                assert isinstance(sampled_token_ids, list), (
+                    "sampled_token_ids should be a python list when"
                     "padded-batch is disabled."
+                )
                 next_token_ids = self.drafter.prepare_next_token_ids_cpu(
-                    sampled_token_ids, self.requests, self.input_batch,
-                    scheduler_output.num_scheduled_tokens)
+                    sampled_token_ids,
+                    self.requests,
+                    self.input_batch,
+                    scheduler_output.num_scheduled_tokens,
+                )
             else:
                 # When using padded-batch, the sampled_token_ids should be
                 # the gpu tensor of sampled tokens for each request, of shape
                 # (num_reqs, num_spec_tokens + 1) with rejected tokens having
                 # value -1.
-                assert isinstance(sampled_token_ids, torch.Tensor), \
-                    "sampled_token_ids should be a torch.Tensor when" \
+                assert isinstance(sampled_token_ids, torch.Tensor), (
+                    "sampled_token_ids should be a torch.Tensor when"
                     "padded-batch is enabled."
-                next_token_ids, valid_sampled_tokens_count = \
+                )
+                next_token_ids, valid_sampled_tokens_count = (
                     self.drafter.prepare_next_token_ids_padded(
                         common_attn_metadata,
                         sampled_token_ids,
                         self.requests,
                         self.input_batch,
                         self.discard_request_indices.gpu,
-                        self.num_discarded_requests
+                        self.num_discarded_requests,
                     )
+                )
+                self._copy_valid_sampled_token_count(
+                    next_token_ids, valid_sampled_tokens_count
+                )
 
             if spec_decode_metadata is None:
                 token_indices_to_sample = None
                 # input_ids can be None for multimodal models.
                 target_token_ids = self.input_ids.gpu[:num_scheduled_tokens]
-                # TODO(woosuk): Support M-RoPE.
-                target_positions = self.positions.gpu[:num_scheduled_tokens]
+                target_positions = self._get_positions(num_scheduled_tokens)
                 if self.use_aux_hidden_state_outputs:
                     assert aux_hidden_states is not None
                     target_hidden_states = torch.cat(
-                        [h[:num_scheduled_tokens] for h in aux_hidden_states],
-                        dim=-1)
+                        [h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1
+                    )
                 else:
                     target_hidden_states = hidden_states[:num_scheduled_tokens]
             else:
                 if self.speculative_config.disable_padded_drafter_batch:
                     token_indices_to_sample = None
-                    common_attn_metadata, token_indices =\
-                        self.drafter.prepare_inputs(
-                            common_attn_metadata,
-                            sampled_token_ids,
-                            spec_decode_metadata.num_draft_tokens)
+                    common_attn_metadata, token_indices = self.drafter.prepare_inputs(
+                        common_attn_metadata,
+                        sampled_token_ids,
+                        spec_decode_metadata.num_draft_tokens,
+                    )
                 else:
-                    common_attn_metadata, token_indices, \
-                        token_indices_to_sample =\
+                    common_attn_metadata, token_indices, token_indices_to_sample = (
                         self.drafter.prepare_inputs_padded(
                             common_attn_metadata,
                             spec_decode_metadata,
-                            valid_sampled_tokens_count)
+                            valid_sampled_tokens_count,
+                        )
+                    )
 
                 target_token_ids = self.input_ids.gpu[token_indices]
-                # TODO(woosuk): Support M-RoPE.
-                target_positions = self.positions.gpu[token_indices]
+                target_positions = self._get_positions(token_indices)
                 if self.use_aux_hidden_state_outputs:
                     assert aux_hidden_states is not None
                     target_hidden_states = torch.cat(
-                        [h[token_indices] for h in aux_hidden_states], dim=-1)
+                        [h[token_indices] for h in aux_hidden_states], dim=-1
+                    )
                 else:
                     target_hidden_states = hidden_states[token_indices]
-            mm_embeds = None
+
             if self.supports_mm_inputs:
-                mm_embeds = self._gather_mm_embeddings(scheduler_output,
-                                                       shift_computed_tokens=1)
+                mm_embed_inputs = self._gather_mm_embeddings(
+                    scheduler_output,
+                    shift_computed_tokens=1,
+                )
+            else:
+                mm_embed_inputs = None
 
             draft_token_ids = self.drafter.propose(
                 target_token_ids=target_token_ids,
@@ -2580,16 +3235,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 last_token_indices=token_indices_to_sample,
                 sampling_metadata=sampling_metadata,
                 common_attn_metadata=common_attn_metadata,
-                mm_embeds=mm_embeds,
+                mm_embed_inputs=mm_embed_inputs,
             )
+
         return draft_token_ids
 
     def update_config(self, overrides: dict[str, Any]) -> None:
         allowed_config_names = {"load_config", "model_config"}
         for config_name, config_overrides in overrides.items():
-            assert config_name in allowed_config_names, \
-                f"Config `{config_name}` not supported. " \
+            assert config_name in allowed_config_names, (
+                f"Config `{config_name}` not supported. "
                 f"Allowed configs: {allowed_config_names}"
+            )
             config = getattr(self, config_name)
             new_config = update_config(config, config_overrides)
             setattr(self, config_name, new_config)
@@ -2599,120 +3256,184 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         Args:
             eep_scale_up: the model loading is for elastic EP scale up.
         """
-        logger.info("Starting to load model %s...", self.model_config.model)
-        if eep_scale_up:
-            from vllm.distributed.parallel_state import get_ep_group
-            num_local_physical_experts = torch.empty(1,
-                                                     dtype=torch.int32,
-                                                     device="cpu")
-            torch.distributed.broadcast(num_local_physical_experts,
-                                        group=get_ep_group().cpu_group,
-                                        group_src=0)
-            num_local_physical_experts = int(num_local_physical_experts.item())
-            new_ep_size = get_ep_group().world_size
-            global_expert_load, old_global_expert_indices = (
-                EplbState.recv_state())
-            num_logical_experts = global_expert_load.shape[1]
-            self.parallel_config.eplb_config.num_redundant_experts = (
-                num_local_physical_experts * new_ep_size - num_logical_experts)
-            assert old_global_expert_indices.shape[
-                1] % num_local_physical_experts == 0
-            old_ep_size = old_global_expert_indices.shape[
-                1] // num_local_physical_experts
-            rank_mapping = {
-                old_ep_rank: old_ep_rank
-                for old_ep_rank in range(old_ep_size)
-            }
-        else:
-            global_expert_load = None
-            old_global_expert_indices = None
-            rank_mapping = None
+        logger.info_once(
+            "Starting to load model %s...",
+            self.model_config.model,
+            scope="global",
+        )
+        global_expert_loads, old_global_expert_indices_per_model, rank_mapping = (
+            EplbState.get_eep_state(self.parallel_config)
+            if eep_scale_up
+            else (None, None, None)
+        )
 
+        if self.parallel_config.enable_eplb:
+            self.eplb_state = EplbState(self.parallel_config, self.device)
+            eplb_models = 0
         with DeviceMemoryProfiler() as m:
             time_before_load = time.perf_counter()
             model_loader = get_model_loader(self.load_config)
-            logger.info("Loading model from scratch...")
             self.model = model_loader.load_model(
-                vllm_config=self.vllm_config, model_config=self.model_config)
+                vllm_config=self.vllm_config, model_config=self.model_config
+            )
             if self.lora_config:
-                self.model = self.load_lora_model(self.model, self.vllm_config,
-                                                  self.device)
+                self.model = self.load_lora_model(
+                    self.model, self.vllm_config, self.device
+                )
             if hasattr(self, "drafter"):
-                logger.info("Loading drafter model...")
+                logger.info_once("Loading drafter model...")
                 self.drafter.load_model(self.model)
+                if (
+                    hasattr(self.drafter, "model")
+                    and is_mixture_of_experts(self.drafter.model)
+                    and self.parallel_config.enable_eplb
+                ):
+                    logger.info_once(
+                        "EPLB is enabled for drafter model %s.",
+                        self.vllm_config.speculative_config.draft_model_config.model,
+                    )
+
+                    global_expert_load = (
+                        global_expert_loads[eplb_models]
+                        if global_expert_loads
+                        else None
+                    )
+                    old_global_expert_indices = (
+                        old_global_expert_indices_per_model[eplb_models]
+                        if old_global_expert_indices_per_model
+                        else None
+                    )
+                    if self.eplb_state is None:
+                        self.eplb_state = EplbState(self.parallel_config, self.device)
+                    self.eplb_state.add_model(
+                        self.drafter.model,
+                        self.vllm_config.speculative_config.draft_model_config,
+                        global_expert_load,
+                        old_global_expert_indices,
+                        rank_mapping,
+                    )
+                    eplb_models += 1
+
             if self.use_aux_hidden_state_outputs:
-                if supports_eagle3(self.model):
-                    self.model.set_aux_hidden_state_layers(
-                        self.model.get_eagle3_aux_hidden_state_layers())
-                else:
+                if not supports_eagle3(self.get_model()):
                     raise RuntimeError(
                         "Model does not support EAGLE3 interface but "
-                        "aux_hidden_state_outputs was requested")
+                        "aux_hidden_state_outputs was requested"
+                    )
+
+                # Try to get auxiliary layers from speculative config,
+                # otherwise use model's default layers
+                aux_layers = self._get_eagle3_aux_layers_from_config()
+                if aux_layers:
+                    logger.info(
+                        "Using auxiliary layers from speculative config: %s",
+                        aux_layers,
+                    )
+                else:
+                    aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
+
+                self.model.set_aux_hidden_state_layers(aux_layers)
             time_after_load = time.perf_counter()
         self.model_memory_usage = m.consumed_memory
-        logger.info("Model loading took %.4f GiB and %.6f seconds",
-                    self.model_memory_usage / GiB_bytes,
-                    time_after_load - time_before_load)
+        logger.info_once(
+            "Model loading took %.4f GiB memory and %.6f seconds",
+            self.model_memory_usage / GiB_bytes,
+            time_after_load - time_before_load,
+            scope="local",
+        )
         prepare_communication_buffer_for_model(self.model)
+        self.is_multimodal_pruning_enabled = (
+            supports_multimodal_pruning(self.get_model())
+            and self.model_config.multimodal_config.is_multimodal_pruning_enabled()
+        )
 
-        self.is_multimodal_pruning_enabled = (supports_multimodal_pruning(
-            self.model) and self.model_config.multimodal_config.
-                                              is_multimodal_pruning_enabled())
-
-        if is_mixture_of_experts(
-                self.model) and self.parallel_config.enable_eplb:
-            logger.info("EPLB is enabled for model %s.",
-                        self.model_config.model)
-            self.eplb_state = EplbState.build(
+        if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb:
+            logger.info_once("EPLB is enabled for model %s.", self.model_config.model)
+            global_expert_load = (
+                global_expert_loads[eplb_models] if global_expert_loads else None
+            )
+            old_global_expert_indices = (
+                old_global_expert_indices_per_model[eplb_models]
+                if old_global_expert_indices_per_model
+                else None
+            )
+            assert self.eplb_state is not None
+            self.eplb_state.add_model(
                 self.model,
-                self.device,
-                self.parallel_config,
+                self.model_config,
                 global_expert_load,
                 old_global_expert_indices,
                 rank_mapping,
             )
 
         if (
-            self.vllm_config.compilation_config.level == \
-                CompilationLevel.DYNAMO_AS_IS and supports_dynamo()
+            self.vllm_config.compilation_config.mode
+            == CompilationMode.STOCK_TORCH_COMPILE
+            and supports_dynamo()
         ):
-            backend = self.vllm_config.compilation_config.init_backend(
-                self.vllm_config)
-            compilation_counter.dynamo_as_is_count += 1
+            backend = self.vllm_config.compilation_config.init_backend(self.vllm_config)
+            compilation_counter.stock_torch_compile_count += 1
             self.model.compile(fullgraph=True, backend=backend)
             return
-        # for other compilation levels, cudagraph behavior is controlled by
+        # for other compilation modes, cudagraph behavior is controlled by
         # CudagraphWraper and CudagraphDispatcher of vllm.
 
         # wrap the model with full cudagraph wrapper if needed.
-        if self.compilation_config.cudagraph_mode.has_full_cudagraphs() \
-            and not self.parallel_config.enable_dbo:
-            self.model = CUDAGraphWrapper(self.model,
-                                          self.vllm_config,
-                                          runtime_mode=CUDAGraphMode.FULL)
+        if (
+            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+            and not self.parallel_config.enable_dbo
+        ):
+            self.model = CUDAGraphWrapper(
+                self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
+            )
         elif self.parallel_config.enable_dbo:
             if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
-                self.model = UBatchWrapper(self.model, self.vllm_config,
-                                           CUDAGraphMode.FULL, self.device)
+                self.model = UBatchWrapper(
+                    self.model, self.vllm_config, CUDAGraphMode.FULL, self.device
+                )
             else:
-                self.model = UBatchWrapper(self.model, self.vllm_config,
-                                           CUDAGraphMode.NONE, self.device)
+                self.model = UBatchWrapper(
+                    self.model, self.vllm_config, CUDAGraphMode.NONE, self.device
+                )
+
+    def _get_eagle3_aux_layers_from_config(self) -> tuple[int, ...] | None:
+        """Extract Eagle3 auxiliary layer indices from speculative config.
+
+        These indices specify which hidden states from the base model should
+        be used as auxiliary inputs for the Eagle3 drafter model during
+        speculative decoding.
+
+        Returns:
+            Tuple of layer indices if found in draft model config,
+            None otherwise.
+        """
+        if not (self.speculative_config and self.speculative_config.draft_model_config):
+            return None
+
+        hf_config = self.speculative_config.draft_model_config.hf_config
+        if not hasattr(hf_config, "eagle_aux_hidden_state_layer_ids"):
+            return None
+
+        layer_ids = hf_config.eagle_aux_hidden_state_layer_ids
+        if layer_ids and isinstance(layer_ids, (list, tuple)):
+            return tuple(layer_ids)
+
+        return None
 
     def reload_weights(self) -> None:
-        assert getattr(self, "model", None) is not None, \
+        assert getattr(self, "model", None) is not None, (
             "Cannot reload weights before model is loaded."
+        )
         model_loader = get_model_loader(self.load_config)
         logger.info("Reloading weights inplace...")
-        model = self.get_model()
-        model_loader.load_weights(model, model_config=self.model_config)
+        model_loader.load_weights(self.get_model(), model_config=self.model_config)
 
     def save_tensorized_model(
         self,
         tensorizer_config: "TensorizerConfig",
     ) -> None:
-        model = self.get_model()
         TensorizerLoader.save_model(
-            model,
+            self.get_model(),
             tensorizer_config=tensorizer_config,
             model_config=self.model_config,
         )
@@ -2721,13 +3442,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self,
         hidden_states: torch.Tensor,
         num_scheduled_tokens: dict[str, int],
-    ) -> dict[str, Optional[LogprobsTensors]]:
+    ) -> dict[str, LogprobsTensors | None]:
         num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
         if not num_prompt_logprobs_dict:
             return {}
 
         in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
-        prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
+        prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
 
         # Since prompt logprobs are a rare feature, prioritize simple,
         # maintainable loop over optimal performance.
@@ -2743,7 +3464,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
             num_prompt_tokens = len(request.prompt_token_ids)
             prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
-                self.device, non_blocking=True)
+                self.device, non_blocking=True
+            )
 
             # Set up target LogprobsTensors object.
             logprobs_tensors = in_progress_dict.get(req_id)
@@ -2751,7 +3473,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # Create empty logprobs CPU tensors for the entire prompt.
                 # If chunked, we'll copy in slice by slice.
                 logprobs_tensors = LogprobsTensors.empty_cpu(
-                    num_prompt_tokens - 1, num_prompt_logprobs + 1)
+                    num_prompt_tokens - 1, num_prompt_logprobs + 1
+                )
                 in_progress_dict[req_id] = logprobs_tensors
 
             # Determine number of logits to retrieve.
@@ -2781,27 +3504,29 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # then there is prompt logprob generated for each index.
             req_idx = self.input_batch.req_id_to_index[req_id]
             offset = self.query_start_loc.np[req_idx].item()
-            prompt_hidden_states = hidden_states[offset:offset + num_logits]
+            prompt_hidden_states = hidden_states[offset : offset + num_logits]
             logits = self.model.compute_logits(prompt_hidden_states)
 
             # Get the "target" tokens for each index. For prompt at index i,
             # the token at prompt index i+1 is the "sampled" token we want
             # to gather the logprob for.
-            tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits]
+            tgt_token_ids = prompt_token_ids[start_tok : start_tok + num_logits]
 
             # Compute prompt logprobs.
             logprobs = self.sampler.compute_logprobs(logits)
             token_ids, logprobs, ranks = self.sampler.gather_logprobs(
-                logprobs, num_prompt_logprobs, tgt_token_ids)
+                logprobs, num_prompt_logprobs, tgt_token_ids
+            )
 
             # Transfer GPU->CPU async.
             chunk_slice = slice(start_idx, start_idx + num_logits)
             logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
-                token_ids, non_blocking=True)
-            logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
-                                                         non_blocking=True)
+                token_ids, non_blocking=True
+            )
+            logprobs_tensors.logprobs[chunk_slice].copy_(logprobs, non_blocking=True)
             logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
-                ranks, non_blocking=True)
+                ranks, non_blocking=True
+            )
 
         # Remove requests that have completed prefill from the batch
         # num_prompt_logprobs_dict.
@@ -2817,7 +3542,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def _get_nans_in_logits(
         self,
-        logits: Optional[torch.Tensor],
+        logits: torch.Tensor | None,
     ) -> dict[str, int]:
         try:
             if logits is None:
@@ -2829,8 +3554,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 req_index = self.input_batch.req_id_to_index[req_id]
                 num_nans_in_logits[req_id] = (
                     int(num_nans_for_index[req_index])
-                    if num_nans_for_index is not None
-                    and req_index < logits.shape[0] else 0)
+                    if num_nans_for_index is not None and req_index < logits.shape[0]
+                    else 0
+                )
             return num_nans_in_logits
         except IndexError:
             return {}
@@ -2856,11 +3582,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     self.input_ids.gpu,
                     low=0,
                     high=self.model_config.get_vocab_size(),
-                    dtype=input_ids.dtype)
+                    dtype=input_ids.dtype,
+                )
 
             logger.debug_once("Randomizing dummy data for DP Rank")
-            input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
-                            non_blocking=True)
+            input_ids.copy_(rand_input_ids()[: input_ids.size(0)], non_blocking=True)
             yield
             input_ids.fill_(0)
 
@@ -2885,19 +3611,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dummy_mm_items = [dummy_mm_item] * max_items_per_batch
 
         model = cast(SupportsMultiModal, self.model)
-        return next(mm_kwargs_group
-                    for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
-                        dummy_mm_items,
-                        device=self.device,
-                        pin_memory=self.pin_memory,
-                        merge_by_field_config=model.merge_by_field_config,
-                    ))
+        return next(
+            mm_kwargs_group
+            for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
+                dummy_mm_items,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                merge_by_field_config=model.merge_by_field_config,
+                multimodal_cpu_fields=model.multimodal_cpu_fields,
+            )
+        )
 
     @torch.inference_mode()
     def _dummy_run(
         self,
         num_tokens: int,
-        cudagraph_runtime_mode: Optional[CUDAGraphMode] = None,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
         force_attention: bool = False,
         uniform_decode: bool = False,
         allow_microbatching: bool = True,
@@ -2905,6 +3634,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         is_profile: bool = False,
         create_mixed_batch: bool = False,
         remove_lora: bool = True,
+        activate_lora: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Run a dummy forward pass to warm up/profile run or capture the
@@ -2927,10 +3657,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             create_mixed_batch: If True, create a mixed batch with both decode
                 (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
+            activate_lora: If False, dummy_run is performed without LoRAs.
         """
-        assert cudagraph_runtime_mode is None or cudagraph_runtime_mode in {
-            CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
-        }
+        assert (
+            cudagraph_runtime_mode is None
+            or cudagraph_runtime_mode.valid_runtime_modes()
+        )
 
         # If cudagraph_mode.decode_mode() == FULL and
         # cudagraph_mode.separate_routine(). This means that we are using
@@ -2945,8 +3677,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # When setting max_query_len = 1, we switch to and capture the optimized
         # routine of FA2 for pure decode, i.e., Flashdecode + an optimization
         # for GQA/MQA.
-        max_query_len = self.uniform_decode_query_len if uniform_decode else \
-                                                                num_tokens
+        max_query_len = self.uniform_decode_query_len if uniform_decode else num_tokens
 
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
@@ -2957,19 +3688,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             assert not uniform_decode
             # Create mixed batch:
             # first half decode tokens, second half one prefill
-            num_decode_tokens = num_tokens // 2
+            num_decode_tokens = min(max_num_reqs - 1, num_tokens // 2)
             num_prefill_tokens = num_tokens - num_decode_tokens
             num_reqs = num_decode_tokens + 1
 
             # Create decode requests (1 token each) followed by prefill request
-            num_scheduled_tokens_list = [1] * num_decode_tokens + [
-                num_prefill_tokens
-            ]
+            num_scheduled_tokens_list = [1] * num_decode_tokens + [num_prefill_tokens]
             # Note: Overriding max_query_len to be the prefill tokens
             max_query_len = num_prefill_tokens
         elif uniform_decode:
             assert not create_mixed_batch
-            num_reqs = cdiv(num_tokens, max_query_len)
+            num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len))
             num_scheduled_tokens_list = [max_query_len] * num_reqs
             if num_tokens % max_query_len != 0:
                 num_scheduled_tokens_list[-1] = num_tokens % max_query_len
@@ -2981,129 +3710,86 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         assert sum(num_scheduled_tokens_list) == num_tokens
         assert len(num_scheduled_tokens_list) == num_reqs
-        num_scheduled_tokens = np.array(num_scheduled_tokens_list,
-                                        dtype=np.int32)
+        num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
         total_num_scheduled_tokens = int(num_scheduled_tokens.sum())
+        num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
 
-        ubatch_slices = None
-        num_tokens_after_padding = None
+        # Disable DP padding when running eager
+        allow_dp_padding = self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
 
         # We currently only microbatch if the number of tokens is
         # over a certain threshold.
-        if self.parallel_config.enable_dbo and allow_microbatching:
-            ubatch_slices, ubatch_num_tokens_after_padding = ubatch_split(
-                num_scheduled_tokens,
-                total_num_scheduled_tokens,
-                total_num_scheduled_tokens,
-                uniform_decode=uniform_decode,
-                vllm_config=self.vllm_config,
-            )
-            # Currently when DBO is enabled `ubatch_split` returns
-            # the num_tokens_after_padding for a single ubatch, but we have 2
-            # TODO(sage,lucas): this is cruft that should be addressed in the
-            # padding refactor.
-            if ubatch_num_tokens_after_padding is not None:
-                num_tokens_after_padding = ubatch_num_tokens_after_padding * 2
-
-        # If we failed to microbatch, currently need to resynchronize
-        # TODO(lucas,sage): we should be able to avoid this second sync by
-        #  refactoring `get_dp_padding_ubatch` and `get_dp_padding` into
-        #  a single `coordinate_batch_across_dp` function.
-        if num_tokens_after_padding is None:
-            num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
-            num_tokens_after_padding = num_tokens + num_pad
-        else:
-            num_tokens_across_dp = num_tokens_after_padding
-            num_tokens_after_padding = int(num_tokens_after_padding[0].item())
+        ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
+            num_tokens_unpadded=total_num_scheduled_tokens,
+            parallel_config=self.vllm_config.parallel_config,
+            allow_microbatching=allow_microbatching,
+            allow_dp_padding=allow_dp_padding,
+            num_tokens_padded=total_num_scheduled_tokens,
+            uniform_decode=uniform_decode,
+            num_scheduled_tokens_per_request=num_scheduled_tokens,
+        )
+        num_tokens_after_padding = num_tokens
+        if num_tokens_across_dp is not None:
+            dp_rank = self.parallel_config.data_parallel_rank
+            num_tokens_after_padding = int(num_tokens_across_dp[dp_rank])
 
-        attn_metadata: Optional[PerLayerAttnMetadata] = None
+        attn_metadata: PerLayerAttnMetadata | None = None
 
         # If force_attention is True, we always capture attention. Otherwise,
         # it only happens for cudagraph_runtime_mode=FULL.
         if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
-            attn_metadata = {}
-            if ubatch_slices is not None:
-                attn_metadata = [dict() for _ in range(len(ubatch_slices))]
-
             if create_mixed_batch:
                 # In the mixed batch mode (used for FI warmup), we use
                 # shorter sequence lengths to run faster.
                 # TODO(luka) better system for describing dummy batches
                 seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
             else:
-                seq_lens = max_query_len
+                seq_lens = max_query_len  # type: ignore[assignment]
             self.seq_lens.np[:num_reqs] = seq_lens
             self.seq_lens.np[num_reqs:] = 0
             self.seq_lens.copy_to_gpu()
 
-            cum_num_tokens, _ = self._get_cumsum_and_arange(
-                num_scheduled_tokens)
-            self.query_start_loc.np[1:num_reqs + 1] = cum_num_tokens
+            cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
+            self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens
             self.query_start_loc.copy_to_gpu()
 
-            for kv_cache_group_id, kv_cache_group_spec in enumerate(
-                    self.kv_cache_config.kv_cache_groups):
-                common_attn_metadata = CommonAttentionMetadata(
-                    query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
-                    query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
-                                                                 1],
-                    seq_lens=self.seq_lens.gpu[:num_reqs],
-                    seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    num_computed_tokens_cpu=self.input_batch.
-                    num_computed_tokens_cpu_tensor[:num_reqs],
-                    num_reqs=num_reqs,
-                    num_actual_tokens=num_tokens,
-                    max_query_len=max_query_len,
-                    max_seq_len=self.max_model_len,
-                    block_table_tensor=self.input_batch.
-                    block_table[kv_cache_group_id].get_device_tensor(num_reqs),
-                    slot_mapping=self.input_batch.block_table[
-                        kv_cache_group_id].slot_mapping.gpu[:num_tokens],
-                    causal=True)
-                for attn_group in self.attn_groups[kv_cache_group_id]:
-                    if ubatch_slices is not None:
-                        common_attn_metadata_list = split_attn_metadata(
-                            ubatch_slices, common_attn_metadata)
-                        for ubid, common_attn_metadata in enumerate(
-                                common_attn_metadata_list):
-                            assert common_attn_metadata.max_query_len == 1
-                            attn_metadata_i = (attn_group\
-                                               .get_metadata_builder(ubatch_id=ubid)\
-                                               .build_for_cudagraph_capture(common_attn_metadata))
-                            for layer_name in attn_group.layer_names:
-                                assert type(attn_metadata) is list
-                                attn_metadata[ubid][
-                                    layer_name] = attn_metadata_i
-                    else:
-                        assert type(attn_metadata) is dict
-                        attn_metadata_i = attn_group.get_metadata_builder()\
-                            .build_for_cudagraph_capture(common_attn_metadata)
-                        for layer_name in attn_group.layer_names:
-                            attn_metadata[layer_name] = attn_metadata_i
-
-        with self.maybe_dummy_run_with_lora(self.lora_config,
-                                            num_scheduled_tokens, remove_lora):
-            model_kwargs = self._init_model_kwargs(num_tokens)
-            if (self.supports_mm_inputs
-                    and not self.model_config.is_encoder_decoder):
+            attn_metadata, _ = self._build_attention_metadata(
+                total_num_scheduled_tokens=num_tokens,
+                max_num_scheduled_tokens=max_query_len,
+                num_reqs=num_reqs,
+                ubatch_slices=ubatch_slices,
+                for_cudagraph_capture=True,
+            )
+
+        with self.maybe_dummy_run_with_lora(
+            self.lora_config,
+            num_scheduled_tokens,
+            num_sampled_tokens,
+            activate_lora,
+            remove_lora,
+        ):
+            # Make sure padding doesn't exceed max_num_tokens
+            assert num_tokens_after_padding <= self.max_num_tokens
+            model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
+            if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
                 model_kwargs = {
                     **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
                 }
             elif self.enable_prompt_embeds:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
-                model_kwargs = self._init_model_kwargs(num_tokens)
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
+                model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
             else:
-                input_ids = self.input_ids.gpu[:num_tokens]
+                input_ids = self.input_ids.gpu[:num_tokens_after_padding]
                 inputs_embeds = None
 
             if self.uses_mrope:
-                positions = self.mrope_positions.gpu[:, :num_tokens]
+                positions = self.mrope_positions.gpu[:, :num_tokens_after_padding]
             else:
-                positions = self.positions.gpu[:num_tokens]
+                positions = self.positions.gpu[:num_tokens_after_padding]
 
             if get_pp_group().is_first_rank:
                 intermediate_tensors = None
@@ -3113,23 +3799,36 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         self.model.make_empty_intermediate_tensors(
                             batch_size=self.max_num_tokens,
                             dtype=self.model_config.dtype,
-                            device=self.device))
+                            device=self.device,
+                        )
+                    )
 
                 intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                    num_tokens, None, False)
+                    num_tokens_after_padding, None, False
+                )
 
             # filter out the valid batch descriptor
-            _cg_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
-                BatchDescriptor(num_tokens=num_tokens_after_padding,
-                                uniform_decode=uniform_decode)) \
-                if not is_profile else (CUDAGraphMode.NONE, None)
+            _cg_mode, batch_descriptor = (
+                self.cudagraph_dispatcher.dispatch(
+                    BatchDescriptor(
+                        num_tokens=num_tokens_after_padding,
+                        uniform_decode=uniform_decode,
+                        has_lora=activate_lora and self.lora_config is not None,
+                    )
+                )
+                if not is_profile
+                else (CUDAGraphMode.NONE, None)
+            )
             if cudagraph_runtime_mode is not None:
                 # we allow forcing NONE when the dispatcher disagrees to support
                 # warm ups for cudagraph capture
-                assert cudagraph_runtime_mode == CUDAGraphMode.NONE or \
-                    cudagraph_runtime_mode == _cg_mode, (
+                assert (
+                    cudagraph_runtime_mode == CUDAGraphMode.NONE
+                    or cudagraph_runtime_mode == _cg_mode
+                ), (
                     f"Cudagraph runtime mode mismatch at dummy_run. "
-                    f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.")
+                    f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}."
+                )
             else:
                 cudagraph_runtime_mode = _cg_mode
 
@@ -3141,14 +3840,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 if num_tokens_across_dp is not None:
                     num_tokens_across_dp[:] = num_tokens_after_padding
 
-            with self.maybe_randomize_inputs(input_ids), set_forward_context(
+            with (
+                self.maybe_randomize_inputs(input_ids),
+                set_forward_context(
                     attn_metadata,
                     self.vllm_config,
                     num_tokens=num_tokens_after_padding,
                     num_tokens_across_dp=num_tokens_across_dp,
                     cudagraph_runtime_mode=cudagraph_runtime_mode,
                     batch_descriptor=batch_descriptor,
-                    ubatch_slices=ubatch_slices):
+                    ubatch_slices=ubatch_slices,
+                ),
+            ):
                 outputs = self.model(
                     input_ids=input_ids,
                     positions=positions,
@@ -3164,7 +3867,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
             if self.speculative_config and self.speculative_config.use_eagle():
                 assert isinstance(self.drafter, EagleProposer)
-                self.drafter.dummy_run(num_tokens)
+                use_cudagraphs = (
+                    cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+                    and not self.speculative_config.enforce_eager
+                )
+
+                # Note(gnovack) - We need to disable cudagraphs for one of the two
+                # lora cases when cudagraph_specialize_lora is enabled. This is a
+                # short term mitigation for issue mentioned in
+                # https://github.com/vllm-project/vllm/issues/28334
+                if self.compilation_config.cudagraph_specialize_lora and activate_lora:
+                    use_cudagraphs = False
+
+                self.drafter.dummy_run(
+                    num_tokens,
+                    use_cudagraphs=use_cudagraphs,
+                )
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real
@@ -3177,7 +3895,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.eplb_step(is_dummy=True, is_profile=is_profile)
 
         logit_indices = np.cumsum(num_scheduled_tokens) - 1
-        return hidden_states, hidden_states[logit_indices]
+        logit_indices_device = torch.from_numpy(logit_indices).to(
+            self.device, non_blocking=True
+        )
+        return hidden_states, hidden_states[logit_indices_device]
 
     @torch.inference_mode()
     def _dummy_sampler_run(
@@ -3192,8 +3913,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         logits = self.model.compute_logits(hidden_states)
         num_reqs = logits.size(0)
 
-        dummy_tensors = lambda v: torch.full(
-            (num_reqs, ), v, device=self.device)
+        dummy_tensors = lambda v: torch.full((num_reqs,), v, device=self.device)
 
         dummy_metadata = SamplingMetadata(
             temperature=dummy_tensors(0.5),
@@ -3209,47 +3929,46 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             presence_penalties=dummy_tensors(0.1),
             repetition_penalties=dummy_tensors(0.1),
             output_token_ids=[[] for _ in range(num_reqs)],
+            spec_token_ids=[[] for _ in range(num_reqs)],
             allowed_token_ids_mask=None,
             bad_words_token_ids={},
             logitsprocs=LogitsProcessors(),
         )
         try:
-            sampler_output = self.sampler(logits=logits,
-                                          sampling_metadata=dummy_metadata)
+            sampler_output = self.sampler(
+                logits=logits, sampling_metadata=dummy_metadata
+            )
         except RuntimeError as e:
-            if 'out of memory' in str(e):
+            if "out of memory" in str(e):
                 raise RuntimeError(
                     "CUDA out of memory occurred when warming up sampler with "
                     f"{num_reqs} dummy requests. Please try lowering "
                     "`max_num_seqs` or `gpu_memory_utilization` when "
-                    "initializing the engine.") from e
+                    "initializing the engine."
+                ) from e
             else:
                 raise e
         if self.speculative_config:
             draft_token_ids = [[0] for _ in range(num_reqs)]
             dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy(
-                draft_token_ids, self.device)
+                draft_token_ids, self.device
+            )
 
             num_tokens = sum(len(ids) for ids in draft_token_ids)
             # draft_probs = torch.randn(
             #     num_tokens, logits.shape[-1], device=self.device,
             #     dtype=logits.dtype)
             draft_probs = None
-            target_logits = torch.randn(num_tokens,
-                                        logits.shape[-1],
-                                        device=self.device,
-                                        dtype=logits.dtype)
-            # NOTE(woosuk): Here, we should use int32 because the sampler uses
-            # int32 for bonus_token_ids. If the dtype mismatches, re-compilation
-            # will occur at runtime.
-            bonus_token_ids = torch.zeros(num_reqs,
-                                          device=self.device,
-                                          dtype=torch.int32)
+            logits = torch.randn(
+                num_tokens + num_reqs,
+                logits.shape[-1],
+                device=self.device,
+                dtype=logits.dtype,
+            )
             self.rejection_sampler(
                 dummy_spec_decode_metadata,
                 draft_probs,
-                target_logits,
-                bonus_token_ids,
+                logits,
                 dummy_metadata,
             )
         return sampler_output
@@ -3274,9 +3993,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_scheduled_tokens_list,
             device="cpu",
         )
-        dummy_token_ids = torch.zeros((num_reqs, req_num_tokens),
-                                      dtype=torch.int32,
-                                      device=self.device)
+        dummy_token_ids = torch.zeros(
+            (num_reqs, req_num_tokens), dtype=torch.int32, device=self.device
+        )
 
         model = cast(VllmModelForPooling, self.get_model())
         dummy_pooling_params = PoolingParams(task=task)
@@ -3290,19 +4009,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             pooling_params=[dummy_pooling_params] * num_reqs,
         )
 
-        dummy_metadata.build_pooling_cursor(num_scheduled_tokens_list,
-                                            device=hidden_states.device)
+        dummy_metadata.build_pooling_cursor(
+            num_scheduled_tokens_list, device=hidden_states.device
+        )
 
         try:
-            return model.pooler(hidden_states=hidden_states,
-                                pooling_metadata=dummy_metadata)
+            return model.pooler(
+                hidden_states=hidden_states, pooling_metadata=dummy_metadata
+            )
         except RuntimeError as e:
-            if 'out of memory' in str(e):
+            if "out of memory" in str(e):
                 raise RuntimeError(
                     "CUDA out of memory occurred when warming up pooler "
                     f"({task=}) with {num_reqs} dummy requests. Please try "
                     "lowering `max_num_seqs` or `gpu_memory_utilization` when "
-                    "initializing the engine.") from e
+                    "initializing the engine."
+                ) from e
             else:
                 raise e
 
@@ -3312,8 +4034,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         hidden_states: torch.Tensor,
     ) -> PoolerOutput:
         # Find the task that has the largest output for subsequent steps
+        supported_pooling_tasks = self.get_supported_pooling_tasks()
+
+        if not supported_pooling_tasks:
+            if self.scheduler_config.enable_chunked_prefill:
+                raise RuntimeError(
+                    f"Model {self.model_config.model} does not support "
+                    "any pooling tasks with chunked prefill enabled. "
+                    "Please add --no-enable-chunked-prefill to your "
+                    "config or CLI args. See "
+                    "https://docs.vllm.ai/en/latest/models/pooling_models.html "
+                    "to learn more."
+                )
+            else:
+                raise RuntimeError(
+                    f"Model {self.model_config.model} does not support "
+                    "any pooling tasks. See "
+                    "https://docs.vllm.ai/en/latest/models/pooling_models.html "
+                    "to learn more."
+                )
+
         output_size = dict[PoolingTask, float]()
-        for task in self.get_supported_pooling_tasks():
+        for task in supported_pooling_tasks:
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
             output_size[task] = sum(o.nbytes for o in output)
@@ -3328,7 +4070,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             if self.model_config.multimodal_config.skip_mm_profiling:
                 logger.info(
                     "Skipping memory profiling for multimodal encoder and "
-                    "encoder cache.")
+                    "encoder cache."
+                )
             else:
                 mm_budget = self.mm_budget
                 assert mm_budget is not None
@@ -3338,8 +4081,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     # modality with the max possible input tokens even when
                     # it supports multiple.
                     dummy_modality = mm_budget.get_modality_with_max_tokens()
-                    max_mm_items_per_batch = mm_budget \
-                        .max_items_per_batch_by_modality[dummy_modality]
+                    max_mm_items_per_batch = mm_budget.max_items_per_batch_by_modality[
+                        dummy_modality
+                    ]
 
                     logger.info(
                         "Encoder cache will be initialized with a budget of "
@@ -3357,9 +4101,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     )
 
                     # Run multimodal encoder.
-                    dummy_encoder_outputs = \
-                        self.model.get_multimodal_embeddings(
-                        **batched_dummy_mm_inputs)
+                    dummy_encoder_outputs = self.model.embed_multimodal(
+                        **batched_dummy_mm_inputs
+                    )
 
                     sanity_check_mm_encoder_outputs(
                         dummy_encoder_outputs,
@@ -3376,7 +4120,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         expanded_outputs = []
                         for output in dummy_encoder_outputs:
                             expanded = output.new_zeros(
-                                (encoder_budget, encoder_output_shape[-1]))
+                                (encoder_budget, encoder_output_shape[-1])
+                            )
                             num_tokens = output.shape[0]
                             expanded[:num_tokens].copy_(output)
                             expanded_outputs.append(expanded)
@@ -3384,12 +4129,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         dummy_encoder_outputs = expanded_outputs
 
                     # Cache the dummy encoder outputs.
-                    self.encoder_cache["tmp"] = dict(
-                        enumerate(dummy_encoder_outputs))
+                    self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
         # Add `is_profile` here to pre-allocate communication buffers
-        hidden_states, last_hidden_states \
-            = self._dummy_run(self.max_num_tokens, is_profile=True)
+        hidden_states, last_hidden_states = self._dummy_run(
+            self.max_num_tokens, is_profile=True
+        )
         if get_pp_group().is_last_rank:
             if self.is_pooling_model:
                 output = self._dummy_pooler_run(hidden_states)
@@ -3406,15 +4151,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             logger.warning(
                 "Skipping CUDA graph capture. To turn on CUDA graph capture, "
-                "ensure `cudagraph_mode` was not manually set to `NONE`")
+                "ensure `cudagraph_mode` was not manually set to `NONE`"
+            )
             return 0
-        else:
-            self.initialize_cudagraph_capture()
 
         compilation_counter.num_gpu_runner_capture_triggers += 1
 
         start_time = time.perf_counter()
-        start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         @contextmanager
         def freeze_gc():
@@ -3437,33 +4180,55 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # can reuse the memory pool allocated for the large shapes.
         set_cudagraph_capturing_enabled(True)
         with freeze_gc(), graph_capture(device=self.device):
+            start_free_gpu_memory = torch.cuda.mem_get_info()[0]
             cudagraph_mode = self.compilation_config.cudagraph_mode
             assert cudagraph_mode is not None
+
+            if self.lora_config:
+                if self.compilation_config.cudagraph_specialize_lora:
+                    lora_cases = [True, False]
+                else:
+                    lora_cases = [True]
+            else:
+                lora_cases = [False]
+
             if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
                 cudagraph_runtime_mode = cudagraph_mode.mixed_mode()
-
-                compilation_cases = list(reversed(self.cudagraph_batch_sizes))
+                # make sure we capture the largest batch size first
+                compilation_cases = list(
+                    product(reversed(self.cudagraph_batch_sizes), lora_cases)
+                )
                 self._capture_cudagraphs(
                     compilation_cases,
                     cudagraph_runtime_mode=cudagraph_runtime_mode,
-                    uniform_decode=False)
+                    uniform_decode=False,
+                )
 
             # Capture full cudagraph for uniform decode batches if we
             # don't already have full mixed prefill-decode cudagraphs.
-            if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \
-                cudagraph_mode.separate_routine():
-                max_num_tokens = self.scheduler_config.max_num_seqs * \
-                        self.uniform_decode_query_len
+            if (
+                cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+                and cudagraph_mode.separate_routine()
+            ):
+                max_num_tokens = (
+                    self.scheduler_config.max_num_seqs * self.uniform_decode_query_len
+                )
                 decode_cudagraph_batch_sizes = [
-                    x for x in self.cudagraph_batch_sizes if
-                    x <= max_num_tokens and x >= self.uniform_decode_query_len
+                    x
+                    for x in self.cudagraph_batch_sizes
+                    if max_num_tokens >= x >= self.uniform_decode_query_len
                 ]
                 compilation_cases_decode = list(
-                    reversed(decode_cudagraph_batch_sizes))
+                    product(reversed(decode_cudagraph_batch_sizes), lora_cases)
+                )
                 self._capture_cudagraphs(
                     compilation_cases=compilation_cases_decode,
                     cudagraph_runtime_mode=CUDAGraphMode.FULL,
-                    uniform_decode=True)
+                    uniform_decode=True,
+                )
+
+            torch.cuda.synchronize()
+            end_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         # Disable cudagraph capturing globally, so any unexpected cudagraph
         # capturing will be detected and raise an error after here.
@@ -3473,20 +4238,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         set_cudagraph_capturing_enabled(False)
 
         end_time = time.perf_counter()
-        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time
         cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes 5~20 seconds.
-        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
-                    elapsed_time, cuda_graph_size / (1 << 30))
+        logger.info_once(
+            "Graph capturing finished in %.0f secs, took %.2f GiB",
+            elapsed_time,
+            cuda_graph_size / (1 << 30),
+            scope="local",
+        )
         return cuda_graph_size
 
-    def _capture_cudagraphs(self, compilation_cases: list[int],
-                            cudagraph_runtime_mode: CUDAGraphMode,
-                            uniform_decode: bool):
-        assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \
-            cudagraph_runtime_mode in [CUDAGraphMode.FULL,
-                                        CUDAGraphMode.PIECEWISE]
+    def _capture_cudagraphs(
+        self,
+        compilation_cases: list[tuple[int, bool]],
+        cudagraph_runtime_mode: CUDAGraphMode,
+        uniform_decode: bool,
+    ):
+        assert (
+            cudagraph_runtime_mode != CUDAGraphMode.NONE
+            and cudagraph_runtime_mode.valid_runtime_modes()
+        ), f"Invalid cudagraph runtime mode: {cudagraph_runtime_mode}"
 
         # Only rank 0 should print progress bar during capture
         if is_global_first_rank():
@@ -3495,22 +4267,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 disable=not self.load_config.use_tqdm_on_load,
                 desc="Capturing CUDA graphs ({}, {})".format(
                     "decode" if uniform_decode else "mixed prefill-decode",
-                    cudagraph_runtime_mode.name))
+                    cudagraph_runtime_mode.name,
+                ),
+            )
 
         # We skip EPLB here since we don't want to record dummy metrics
-        for num_tokens in compilation_cases:
+        for num_tokens, activate_lora in compilation_cases:
             # We currently only capture ubatched graphs when its a FULL
             # cudagraph, a uniform decode batch, and the number of tokens
             # is above the threshold. Otherwise we just capture a non-ubatched
             # version of the graph
-            allow_microbatching = self.parallel_config.enable_dbo \
-                and cudagraph_runtime_mode == CUDAGraphMode.FULL \
-                and uniform_decode \
+            allow_microbatching = (
+                self.parallel_config.enable_dbo
+                and cudagraph_runtime_mode == CUDAGraphMode.FULL
+                and uniform_decode
                 and check_ubatch_thresholds(
                     config=self.vllm_config.parallel_config,
                     num_tokens=num_tokens,
                     uniform_decode=uniform_decode,
                 )
+            )
 
             for _ in range(self.compilation_config.cudagraph_num_of_warmups):
                 # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
@@ -3518,29 +4294,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # if we want to warm up attention or not. This is
                 # different from the case where `FULL` implies capture
                 # attention while `PIECEWISE` implies no attention.
-                force_attention = (
-                    cudagraph_runtime_mode == CUDAGraphMode.FULL)
-                self._dummy_run(num_tokens,
-                                cudagraph_runtime_mode=CUDAGraphMode.NONE,
-                                force_attention=force_attention,
-                                uniform_decode=uniform_decode,
-                                allow_microbatching=allow_microbatching,
-                                skip_eplb=True,
-                                remove_lora=False)
-            self._dummy_run(num_tokens,
-                            cudagraph_runtime_mode=cudagraph_runtime_mode,
-                            uniform_decode=uniform_decode,
-                            allow_microbatching=allow_microbatching,
-                            skip_eplb=True,
-                            remove_lora=False)
+                force_attention = cudagraph_runtime_mode == CUDAGraphMode.FULL
+                self._dummy_run(
+                    num_tokens,
+                    cudagraph_runtime_mode=CUDAGraphMode.NONE,
+                    force_attention=force_attention,
+                    uniform_decode=uniform_decode,
+                    allow_microbatching=allow_microbatching,
+                    skip_eplb=True,
+                    remove_lora=False,
+                    activate_lora=activate_lora,
+                )
+            self._dummy_run(
+                num_tokens,
+                cudagraph_runtime_mode=cudagraph_runtime_mode,
+                uniform_decode=uniform_decode,
+                allow_microbatching=allow_microbatching,
+                skip_eplb=True,
+                remove_lora=False,
+                activate_lora=activate_lora,
+            )
         self.maybe_remove_all_loras(self.lora_config)
 
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize the attention backends and attention metadata builders.
         """
-        assert len(self.attn_groups) == 0, \
-            "Attention backends are already initialized"
+        assert len(self.attn_groups) == 0, "Attention backends are already initialized"
 
         class AttentionGroupKey(NamedTuple):
             attn_backend: type[AttentionBackend]
@@ -3548,10 +4328,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         def get_attn_backends_for_group(
             kv_cache_group_spec: KVCacheGroupSpec,
-        ) -> dict[AttentionGroupKey, list[str]]:
+        ) -> tuple[dict[AttentionGroupKey, list[str]], set[type[AttentionBackend]]]:
             layers = get_layers_from_vllm_config(
-                self.vllm_config, AttentionLayerBase,
-                kv_cache_group_spec.layer_names)
+                self.vllm_config, AttentionLayerBase, kv_cache_group_spec.layer_names
+            )
             attn_backends = {}
             attn_backend_layers = defaultdict(list)
             # Dedupe based on full class name; this is a bit safer than
@@ -3571,157 +4351,316 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 full_cls_name = attn_backend.full_cls_name()
                 layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec
                 if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs):
-                    layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[
-                        layer_name]
+                    layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[layer_name]
                 key = (full_cls_name, layer_kv_cache_spec)
-                attn_backends[key] = AttentionGroupKey(attn_backend,
-                                                       layer_kv_cache_spec)
+                attn_backends[key] = AttentionGroupKey(
+                    attn_backend, layer_kv_cache_spec
+                )
                 attn_backend_layers[key].append(layer_name)
-            return {
-                attn_backends[k]: v
-                for k, v in attn_backend_layers.items()
-            }
+            return (
+                {attn_backends[k]: v for k, v in attn_backend_layers.items()},
+                set(group_key.attn_backend for group_key in attn_backends.values()),
+            )
 
         def create_attn_groups(
             attn_backends_map: dict[AttentionGroupKey, list[str]],
+            kv_cache_group_id: int,
         ) -> list[AttentionGroup]:
             attn_groups: list[AttentionGroup] = []
-            for (attn_backend,
-                 kv_cache_spec), layer_names in attn_backends_map.items():
-                attn_group = AttentionGroup.create_with_metadata_builders(
+            for (attn_backend, kv_cache_spec), layer_names in attn_backends_map.items():
+                attn_group = AttentionGroup(
                     attn_backend,
                     layer_names,
                     kv_cache_spec,
-                    self.vllm_config,
-                    self.device,
-                    num_metadata_builders=1
-                    if not self.parallel_config.enable_dbo else 2,
+                    kv_cache_group_id,
                 )
 
                 attn_groups.append(attn_group)
             return attn_groups
 
+        attention_backend_maps = []
+        attention_backend_list = []
         for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
             attn_backends = get_attn_backends_for_group(kv_cache_group_spec)
-            self.attn_groups.append(create_attn_groups(attn_backends))
+            attention_backend_maps.append(attn_backends[0])
+            attention_backend_list.append(attn_backends[1])
 
+        # Resolve cudagraph_mode before actually initialize metadata_builders
+        self._check_and_update_cudagraph_mode(
+            attention_backend_list, kv_cache_config.kv_cache_groups
+        )
+
+        for i, attn_backend_map in enumerate(attention_backend_maps):
+            self.attn_groups.append(create_attn_groups(attn_backend_map, i))
+
+    def initialize_metadata_builders(
+        self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int]
+    ) -> None:
+        """
+        Create the metadata builders for all KV cache groups and attn groups.
+        """
+        for kv_cache_group_id in range(len(kv_cache_config.kv_cache_groups)):
+            for attn_group in self.attn_groups[kv_cache_group_id]:
+                attn_group.create_metadata_builders(
+                    self.vllm_config,
+                    self.device,
+                    kernel_block_sizes[kv_cache_group_id]
+                    if kv_cache_group_id < len(kernel_block_sizes)
+                    else None,
+                    num_metadata_builders=1
+                    if not self.parallel_config.enable_dbo
+                    else 2,
+                )
         # Calculate reorder batch threshold (if needed)
+        # Note (tdoublep): do this *after* constructing builders,
+        # because some of them change the threshold at init time.
         self.calculate_reorder_batch_threshold()
 
-    def initialize_cudagraph_capture(self) -> None:
+    def _check_and_update_cudagraph_mode(
+        self,
+        attention_backends: list[set[type[AttentionBackend]]],
+        kv_cache_groups: list[KVCacheGroupSpec],
+    ) -> None:
+        """
+        Resolve the cudagraph_mode when there are multiple attention
+        groups with potential conflicting CUDA graph support.
+        Then initialize the cudagraph_dispatcher based on the resolved
+        cudagraph_mode.
+        """
         min_cg_support = AttentionCGSupport.ALWAYS
-        min_cg_builder_name = None
+        min_cg_backend_name = None
+
+        for attn_backend_set, kv_cache_group in zip(
+            attention_backends, kv_cache_groups
+        ):
+            for attn_backend in attn_backend_set:
+                builder_cls = attn_backend.get_builder_cls()
 
-        for attn_group in self._attn_group_iterator():
-            builder = attn_group.get_metadata_builder()
-            if builder.cudagraph_support.value < min_cg_support.value:
-                min_cg_support = builder.cudagraph_support
-                min_cg_builder_name = builder.__class__.__name__
+                cg_support = builder_cls.get_cudagraph_support(
+                    self.vllm_config, kv_cache_group.kv_cache_spec
+                )
+                if cg_support.value < min_cg_support.value:
+                    min_cg_support = cg_support
+                    min_cg_backend_name = attn_backend.__name__
         # Flexible resolve the cudagraph mode
         cudagraph_mode = self.compilation_config.cudagraph_mode
         # check cudagraph for mixed batch is supported
-        if cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL \
-            and min_cg_support != AttentionCGSupport.ALWAYS:
-            msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
-                   f"with {min_cg_builder_name} backend (support: "
-                   f"{min_cg_support})")
+        if (
+            cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL
+            and min_cg_support != AttentionCGSupport.ALWAYS
+        ):
+            msg = (
+                f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
+                f"with {min_cg_backend_name} backend (support: "
+                f"{min_cg_support})"
+            )
             if min_cg_support == AttentionCGSupport.NEVER:
                 # if not supported any full cudagraphs, just raise it.
-                msg += "; please try cudagraph_mode=PIECEWISE, and "\
-                    "make sure compilation level is piecewise"
+                msg += (
+                    "; please try cudagraph_mode=PIECEWISE, and "
+                    "make sure compilation mode is VLLM_COMPILE"
+                )
                 raise ValueError(msg)
 
             # attempt to resolve the full cudagraph related mode
             if self.compilation_config.splitting_ops_contain_attention():
                 msg += "; setting cudagraph_mode=FULL_AND_PIECEWISE"
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                cudagraph_mode = self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.FULL_AND_PIECEWISE
+                )
             else:
                 msg += "; setting cudagraph_mode=FULL_DECODE_ONLY"
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                cudagraph_mode = self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.FULL_DECODE_ONLY
+                )
             logger.warning(msg)
 
         # check that if we are doing decode full-cudagraphs it is supported
-        if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
-                and min_cg_support == AttentionCGSupport.NEVER):
-            msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
-                   f"with {min_cg_builder_name} backend (support: "
-                   f"{min_cg_support})")
-            if (self.compilation_config.level == CompilationLevel.PIECEWISE and
-                (self.compilation_config.splitting_ops_contain_attention()
-                 or self.compilation_config.use_inductor_graph_partition)):
-                msg += "; setting cudagraph_mode=PIECEWISE because "\
+        if (
+            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+            and min_cg_support == AttentionCGSupport.NEVER
+        ):
+            msg = (
+                f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
+                f"with {min_cg_backend_name} backend (support: "
+                f"{min_cg_support})"
+            )
+            if self.compilation_config.mode == CompilationMode.VLLM_COMPILE and (
+                self.compilation_config.splitting_ops_contain_attention()
+                or self.compilation_config.use_inductor_graph_partition
+            ):
+                msg += (
+                    "; setting cudagraph_mode=PIECEWISE because "
                     "attention is compiled piecewise"
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                )
+                cudagraph_mode = self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.PIECEWISE
+                )
             else:
-                msg += "; setting cudagraph_mode=NONE because "\
+                msg += (
+                    "; setting cudagraph_mode=NONE because "
                     "attention is not compiled piecewise"
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                )
+                cudagraph_mode = self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.NONE
+                )
             logger.warning(msg)
 
         # check that if we are doing spec-decode + decode full-cudagraphs it is
         # supported
-        if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
-                and self.uniform_decode_query_len > 1 and min_cg_support.value
-                < AttentionCGSupport.UNIFORM_BATCH.value):
-            msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported"
-                   f" with spec-decode for attention backend "
-                   f"{min_cg_builder_name} (support: {min_cg_support})")
+        if (
+            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+            and self.uniform_decode_query_len > 1
+            and min_cg_support.value < AttentionCGSupport.UNIFORM_BATCH.value
+        ):
+            msg = (
+                f"CUDAGraphMode.{cudagraph_mode.name} is not supported"
+                f" with spec-decode for attention backend "
+                f"{min_cg_backend_name} (support: {min_cg_support})"
+            )
             if self.compilation_config.splitting_ops_contain_attention():
                 msg += "; setting cudagraph_mode=PIECEWISE"
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                cudagraph_mode = self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.PIECEWISE
+                )
             else:
                 msg += "; setting cudagraph_mode=NONE"
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                cudagraph_mode = self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.NONE
+                )
             logger.warning(msg)
 
         # double check that we can support full cudagraph if they are requested
         # even after automatic downgrades
-        if cudagraph_mode.has_full_cudagraphs() \
-            and min_cg_support == AttentionCGSupport.NEVER:
-            raise ValueError(f"CUDAGraphMode.{cudagraph_mode.name} is not "
-                             f"supported with {min_cg_builder_name} backend ("
-                             f"support:{min_cg_support}) "
-                             "; please try cudagraph_mode=PIECEWISE, "
-                             "and make sure compilation level is piecewise")
-
-        # Trigger cudagraph dispatching keys initialization here (after
-        # initializing attn backends).
+        if (
+            cudagraph_mode.has_full_cudagraphs()
+            and min_cg_support == AttentionCGSupport.NEVER
+        ):
+            raise ValueError(
+                f"CUDAGraphMode.{cudagraph_mode.name} is not "
+                f"supported with {min_cg_backend_name} backend ("
+                f"support:{min_cg_support}) "
+                "; please try cudagraph_mode=PIECEWISE, "
+                "and make sure compilation mode is VLLM_COMPILE"
+            )
+
+        # if we have dedicated decode cudagraphs, and spec-decode is enabled,
+        # we need to adjust the cudagraph sizes to be a multiple of the uniform
+        # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
+        # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
+        # Will be removed in the near future when we have seperate cudagraph capture
+        # sizes for decode and mixed prefill-decode.
+        if (
+            cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+            and cudagraph_mode.separate_routine()
+            and self.uniform_decode_query_len > 1
+        ):
+            self.compilation_config.adjust_cudagraph_sizes_for_spec_decode(
+                self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size
+            )
+            self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes
+
+        # Trigger cudagraph dispatching keys initialization after
+        # resolved cudagraph mode.
         self.cudagraph_dispatcher.initialize_cudagraph_keys(
-            self.compilation_config.cudagraph_mode,
-            self.uniform_decode_query_len)
+            self.compilation_config.cudagraph_mode, self.uniform_decode_query_len
+        )
 
     def calculate_reorder_batch_threshold(self) -> None:
         """
-        Check that if any backends reorder batches; that the reordering
-        is compatible (e.g., decode threshold is the same)
+        Choose the minimum reorder batch threshold from all attention groups.
+        Backends should be able to support lower threshold then what they request
+        just may have a performance penalty due to that backend treating decodes
+        as prefills.
         """
-        for group in self._attn_group_iterator():
-            attn_metadata_builder_i = group.get_metadata_builder()
-
-            # check that if any backends reorder batches; that the reordering
-            # is compatible (e.g., decode threshold is the same)
-            reorder_batch_threshold_i = (
-                attn_metadata_builder_i.reorder_batch_threshold)
-            if reorder_batch_threshold_i is not None:
-                if self.reorder_batch_threshold is not None:
-                    if reorder_batch_threshold_i != \
-                        self.reorder_batch_threshold:
-                        raise ValueError(
-                            f"Attention backend reorders decodes with "
-                            f"threshold {reorder_batch_threshold_i} but other "
-                            f"backend uses threshold "
-                            f"{self.reorder_batch_threshold}")
-                else:
-                    self.reorder_batch_threshold = reorder_batch_threshold_i
+        min_none_high = lambda a, b: a if b is None else b if a is None else min(a, b)
+
+        reorder_batch_thresholds = [
+            group.get_metadata_builder().reorder_batch_threshold
+            for group in self._attn_group_iterator()
+        ]
+        # If there are no attention groups (attention-free model) or no backend
+        # reports a threshold, leave reordering disabled.
+        if len(reorder_batch_thresholds) == 0:
+            self.reorder_batch_threshold = None
+            return
+        self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds)
 
-    def may_reinitialize_input_batch(self,
-                                     kv_cache_config: KVCacheConfig) -> None:
+    @staticmethod
+    def select_common_block_size(
+        kv_manager_block_size: int, attn_groups: list[AttentionGroup]
+    ) -> int:
+        """
+        Select a block size that is supported by all backends and is a factor of
+        kv_manager_block_size.
+
+        If kv_manager_block_size is supported by all backends, return it directly.
+        Otherwise, return the max supported size.
+
+        Args:
+            kv_manager_block_size: Block size of KV cache
+            attn_groups: List of attention groups
+
+        Returns:
+            The selected block size
+
+        Raises:
+            ValueError: If no valid block size found
+        """
+
+        def block_size_is_supported(
+            backends: list[type[AttentionBackend]], block_size: int
+        ) -> bool:
+            """
+            Check if the block size is supported by all backends.
+            """
+            for backend in backends:
+                is_supported = False
+                for supported_size in backend.supported_kernel_block_sizes:
+                    if isinstance(supported_size, int):
+                        if block_size == supported_size:
+                            is_supported = True
+                    elif isinstance(supported_size, MultipleOf):
+                        if block_size % supported_size.base == 0:
+                            is_supported = True
+                    else:
+                        raise ValueError(f"Unknown supported size: {supported_size}")
+                if not is_supported:
+                    return False
+            return True
+
+        backends = [group.backend for group in attn_groups]
+
+        # Case 1: if the block_size of kv cache manager is supported by all backends,
+        # return it directly
+        if block_size_is_supported(backends, kv_manager_block_size):
+            return kv_manager_block_size
+
+        # Case 2: otherwise, the block_size must be an `int`-format supported size of
+        # at least one backend. Iterate over all `int`-format supported sizes in
+        # descending order and return the first one that is supported by all backends.
+        # Simple proof:
+        # If the supported size b is in MultipleOf(x_i) format for all attention
+        # backends i, and b a factor of kv_manager_block_size, then
+        # kv_manager_block_size also satisfies MultipleOf(x_i) for all i. We will
+        # return kv_manager_block_size in case 1.
+        all_int_supported_sizes = set(
+            supported_size
+            for backend in backends
+            for supported_size in backend.supported_kernel_block_sizes
+            if isinstance(supported_size, int)
+        )
+
+        for supported_size in sorted(all_int_supported_sizes, reverse=True):
+            if kv_manager_block_size % supported_size != 0:
+                continue
+            if block_size_is_supported(backends, supported_size):
+                return supported_size
+        raise ValueError(f"No common block size for {kv_manager_block_size}. ")
+
+    def may_reinitialize_input_batch(
+        self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int]
+    ) -> None:
         """
         Re-initialize the input batch if the block sizes are different from
         `[self.cache_config.block_size]`. This usually happens when there
@@ -3729,16 +4668,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         Args:
             kv_cache_config: The KV cache configuration.
+            kernel_block_sizes: The kernel block sizes for each KV cache group.
         """
         block_sizes = [
             kv_cache_group.kv_cache_spec.block_size
             for kv_cache_group in kv_cache_config.kv_cache_groups
+            if not isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec)
         ]
-        if block_sizes != [self.cache_config.block_size]:
+
+        if block_sizes != [self.cache_config.block_size] or kernel_block_sizes != [
+            self.cache_config.block_size
+        ]:
             assert self.cache_config.cpu_offload_gb == 0, (
                 "Cannot re-initialize the input batch when CPU weight "
                 "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                "for more details.")
+                "for more details."
+            )
             self.input_batch = InputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max(self.max_model_len, self.max_encoder_len),
@@ -3747,16 +4692,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 pin_memory=self.pin_memory,
                 vocab_size=self.model_config.get_vocab_size(),
                 block_sizes=block_sizes,
+                kernel_block_sizes=kernel_block_sizes,
                 is_spec_decode=bool(self.vllm_config.speculative_config),
                 logitsprocs=self.input_batch.logitsprocs,
+                logitsprocs_need_output_token_ids=self.input_batch.logitsprocs_need_output_token_ids,
                 is_pooling_model=self.is_pooling_model,
-                num_speculative_tokens=(
-                    self.vllm_config.speculative_config.num_speculative_tokens
-                    if self.vllm_config.speculative_config else 0),
+                num_speculative_tokens=self.num_spec_tokens,
             )
 
     def _allocate_kv_cache_tensors(
-            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
+        self, kv_cache_config: KVCacheConfig
+    ) -> dict[str, torch.Tensor]:
         """
         Initializes the KV cache buffer with the correct size. The buffer needs
         to be reshaped to the desired shape before being used by the models.
@@ -3766,12 +4712,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         Returns:
             dict[str, torch.Tensor]: A map between layer names to their
             corresponding memory buffer for KV cache.
-         """
+        """
         kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
         for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-            tensor = torch.zeros(kv_cache_tensor.size,
-                                 dtype=torch.int8,
-                                 device=self.device)
+            tensor = torch.zeros(
+                kv_cache_tensor.size, dtype=torch.int8, device=self.device
+            )
             for layer_name in kv_cache_tensor.shared_by:
                 kv_cache_raw_tensors[layer_name] = tensor
 
@@ -3781,8 +4727,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 if layer_name in self.runner_only_attn_layers:
                     continue
                 layer_names.add(layer_name)
-        assert layer_names == set(kv_cache_raw_tensors.keys(
-        )), "Some layers are not correctly initialized"
+        assert layer_names == set(kv_cache_raw_tensors.keys()), (
+            "Some layers are not correctly initialized"
+        )
         return kv_cache_raw_tensors
 
     def _attn_group_iterator(self) -> Iterator[AttentionGroup]:
@@ -3794,10 +4741,54 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         for attn_groups in self.attn_groups:
             yield from attn_groups
 
+    def _prepare_kernel_block_sizes(self, kv_cache_config: KVCacheConfig) -> list[int]:
+        """
+        Generate kernel_block_sizes that matches each block_size.
+
+        For attention backends that support virtual block splitting,
+        use the supported block sizes from the backend.
+        For other backends (like Mamba), use the same block size (no splitting).
+
+        Args:
+            kv_cache_config: The KV cache configuration.
+
+        Returns:
+            list[int]: List of kernel block sizes for each cache group.
+        """
+        kernel_block_sizes = []
+        for kv_cache_gid, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
+            kv_cache_spec = kv_cache_group.kv_cache_spec
+            if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
+                # All layers in the UniformTypeKVCacheSpecs have the same type,
+                # Pick an arbitrary one to dispatch.
+                kv_cache_spec = next(iter(kv_cache_spec.kv_cache_specs.values()))
+            if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
+                continue
+            elif isinstance(kv_cache_spec, AttentionSpec):
+                # This is an attention backend that supports virtual
+                # block splitting. Get the supported block sizes from
+                # all backends in the group.
+                attn_groups = self.attn_groups[kv_cache_gid]
+                kv_manager_block_size = kv_cache_group.kv_cache_spec.block_size
+                selected_kernel_size = self.select_common_block_size(
+                    kv_manager_block_size, attn_groups
+                )
+                kernel_block_sizes.append(selected_kernel_size)
+            elif isinstance(kv_cache_spec, MambaSpec):
+                # This is likely Mamba or other non-attention cache,
+                # no splitting.
+                kernel_block_sizes.append(kv_cache_spec.block_size)
+            else:
+                raise NotImplementedError(
+                    f"unknown kv cache spec {kv_cache_group.kv_cache_spec}"
+                )
+        return kernel_block_sizes
+
     def _reshape_kv_cache_tensors(
         self,
         kv_cache_config: KVCacheConfig,
         kv_cache_raw_tensors: dict[str, torch.Tensor],
+        kernel_block_sizes: list[int],
     ) -> dict[str, torch.Tensor]:
         """
         Reshape the KV cache tensors to the desired shape and dtype.
@@ -3806,6 +4797,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             kv_cache_config: The KV cache config
             kv_cache_raw_tensors: The KV cache buffer of each layer, with
                 correct size but uninitialized shape.
+            kernel_block_sizes: The kernel block sizes for each KV cache group.
         Returns:
             Dict[str, torch.Tensor]: A map between layer names to their
             corresponding memory buffer for KV cache.
@@ -3815,55 +4807,65 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         for group in self._kv_cache_spec_attn_group_iterator():
             kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
+            if group.kv_cache_group_id == len(kernel_block_sizes):
+                # There may be a last group for layers without kv cache.
+                continue
+            kernel_block_size = kernel_block_sizes[group.kv_cache_group_id]
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
                     continue
                 raw_tensor = kv_cache_raw_tensors[layer_name]
                 assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
-                num_blocks = (raw_tensor.numel() //
-                              kv_cache_spec.page_size_bytes)
+                num_blocks = raw_tensor.numel() // kv_cache_spec.page_size_bytes
                 if isinstance(kv_cache_spec, AttentionSpec):
                     has_attn = True
+                    num_blocks_per_kv_block = (
+                        kv_cache_spec.block_size // kernel_block_size
+                    )
+                    kernel_num_blocks = num_blocks * num_blocks_per_kv_block
+
                     kv_cache_shape = attn_backend.get_kv_cache_shape(
-                        num_blocks,
-                        kv_cache_spec.block_size,
+                        kernel_num_blocks,
+                        kernel_block_size,
                         kv_cache_spec.num_kv_heads,
                         kv_cache_spec.head_size,
-                        cache_dtype_str=self.cache_config.cache_dtype)
+                        cache_dtype_str=self.cache_config.cache_dtype,
+                    )
                     dtype = kv_cache_spec.dtype
                     try:
-                        kv_cache_stride_order = \
-                            attn_backend.get_kv_cache_stride_order()
-                        assert len(kv_cache_stride_order) == len(
-                            kv_cache_shape)
+                        kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
+                        assert len(kv_cache_stride_order) == len(kv_cache_shape)
                     except (AttributeError, NotImplementedError):
-                        kv_cache_stride_order = tuple(
-                            range(len(kv_cache_shape)))
+                        kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
                     # The allocation respects the backend-defined stride order
                     # to ensure the semantic remains consistent for each
                     # backend. We first obtain the generic kv cache shape and
                     # then permute it according to the stride order which could
                     # result in a non-contiguous tensor.
-                    kv_cache_shape = tuple(kv_cache_shape[i]
-                                           for i in kv_cache_stride_order)
+                    kv_cache_shape = tuple(
+                        kv_cache_shape[i] for i in kv_cache_stride_order
+                    )
                     # Maintain original KV shape view.
                     inv_order = [
                         kv_cache_stride_order.index(i)
                         for i in range(len(kv_cache_stride_order))
                     ]
-                    kv_caches[layer_name] = kv_cache_raw_tensors[
-                        layer_name].view(dtype).view(kv_cache_shape).permute(
-                            *inv_order)
+                    kv_caches[layer_name] = (
+                        kv_cache_raw_tensors[layer_name]
+                        .view(dtype)
+                        .view(kv_cache_shape)
+                        .permute(*inv_order)
+                    )
                 elif isinstance(kv_cache_spec, MambaSpec):
                     has_mamba = True
                     raw_tensor = kv_cache_raw_tensors[layer_name]
                     state_tensors = []
                     storage_offset_bytes = 0
-                    for (shape, dtype) in zip(kv_cache_spec.shapes,
-                                              kv_cache_spec.dtypes):
+                    for shape, dtype in zip(kv_cache_spec.shapes, kv_cache_spec.dtypes):
                         dtype_size = get_dtype_size(dtype)
                         num_element_per_page = (
-                            kv_cache_spec.page_size_bytes // dtype_size)
+                            kv_cache_spec.page_size_bytes // dtype_size
+                        )
                         target_shape = (num_blocks, *shape)
                         stride = torch.empty(target_shape).stride()
                         target_stride = (num_element_per_page, *stride[1:])
@@ -3887,7 +4889,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         return kv_caches
 
     def _update_hybrid_attention_mamba_layout(
-            self, kv_caches: dict[str, torch.Tensor]) -> None:
+        self, kv_caches: dict[str, torch.Tensor]
+    ) -> None:
         """
         Update the layout of attention layers from (2, num_blocks, ...) to
         (num_blocks, 2, ...).
@@ -3900,24 +4903,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             kv_cache_spec = group.kv_cache_spec
             for layer_name in group.layer_names:
                 kv_cache = kv_caches[layer_name]
-                if (isinstance(kv_cache_spec, AttentionSpec)
-                        and kv_cache.shape[0] == 2):
-                    assert kv_cache.shape[1] != 2, \
-                        "Fail to determine whether the layout is " \
-                        "(2, num_blocks, ...) or (num_blocks, 2, ...) for " \
+                if isinstance(kv_cache_spec, AttentionSpec) and kv_cache.shape[0] == 2:
+                    assert kv_cache.shape[1] != 2, (
+                        "Fail to determine whether the layout is "
+                        "(2, num_blocks, ...) or (num_blocks, 2, ...) for "
                         f"a tensor of shape {kv_cache.shape}"
+                    )
                     hidden_size = kv_cache.shape[2:].numel()
-                    kv_cache.as_strided_(size=kv_cache.shape,
-                                         stride=(hidden_size, 2 * hidden_size,
-                                                 *kv_cache.stride()[2:]))
+                    kv_cache.as_strided_(
+                        size=kv_cache.shape,
+                        stride=(hidden_size, 2 * hidden_size, *kv_cache.stride()[2:]),
+                    )
 
     def initialize_kv_cache_tensors(
-            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
+        self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int]
+    ) -> dict[str, torch.Tensor]:
         """
         Initialize the memory buffer for KV cache.
 
         Args:
             kv_cache_config: The KV cache config
+            kernel_block_sizes: The kernel block sizes for each KV cache group.
+
         Returns:
             Dict[str, torch.Tensor]: A map between layer names to their
             corresponding memory buffer for KV cache.
@@ -3925,25 +4932,29 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Initialize the memory buffer for KV cache
         kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config)
         # Change the memory buffer to the desired shape
-        kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
-                                                   kv_cache_raw_tensors)
+        kv_caches = self._reshape_kv_cache_tensors(
+            kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes
+        )
 
         # Set up cross-layer KV cache sharing
-        for layer_name, target_layer_name in self.shared_kv_cache_layers.items(
-        ):
-            logger.debug("%s reuses KV cache of %s", layer_name,
-                         target_layer_name)
+        for layer_name, target_layer_name in self.shared_kv_cache_layers.items():
+            logger.debug("%s reuses KV cache of %s", layer_name, target_layer_name)
             kv_caches[layer_name] = kv_caches[target_layer_name]
 
-        num_attn_module = 2 \
-            if self.model_config.hf_config.model_type == "longcat_flash" else 1
-        bind_kv_cache(kv_caches,
-                      self.compilation_config.static_forward_context,
-                      self.kv_caches, num_attn_module)
+        num_attn_module = (
+            2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
+        )
+        bind_kv_cache(
+            kv_caches,
+            self.compilation_config.static_forward_context,
+            self.kv_caches,
+            num_attn_module,
+        )
         return kv_caches
 
     def maybe_add_kv_sharing_layers_to_kv_cache_groups(
-            self, kv_cache_config: KVCacheConfig) -> None:
+        self, kv_cache_config: KVCacheConfig
+    ) -> None:
         """
         Add layers that re-use KV cache to KV cache group of its target layer.
         Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()`
@@ -3962,12 +4973,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # In You Only Cache Once (https://arxiv.org/abs/2405.05254) or other
             # similar KV sharing setups, only the layers that generate KV caches
             # are involved in the prefill phase, enabling prefill to early exit.
-            attn_layers = get_layers_from_vllm_config(self.vllm_config,
-                                                      Attention)
+            attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
             for layer_name in reversed(attn_layers):
                 if layer_name in self.shared_kv_cache_layers:
-                    self.kv_sharing_fast_prefill_eligible_layers.add(
-                        layer_name)
+                    self.kv_sharing_fast_prefill_eligible_layers.add(layer_name)
                 else:
                     break
 
@@ -3980,11 +4989,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         """
         kv_cache_config = deepcopy(kv_cache_config)
         self.kv_cache_config = kv_cache_config
-        self.may_reinitialize_input_batch(kv_cache_config)
         self.may_add_encoder_only_layers_to_kv_cache_config()
         self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
         self.initialize_attn_backend(kv_cache_config)
-        kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
+        # The kernel block size for all KV cache groups. For example, if
+        # kv_cache_manager uses block_size 256 for a given group, but the attention
+        # backends for that group only supports block_size 64, we will return
+        # kernel_block_size 64 and split the 256-token-block to 4 blocks with 64
+        # tokens each.
+        kernel_block_sizes = self._prepare_kernel_block_sizes(kv_cache_config)
+
+        # create metadata builders
+        self.initialize_metadata_builders(kv_cache_config, kernel_block_sizes)
+
+        # Reinitialize need to after initialize_attn_backend
+        self.may_reinitialize_input_batch(kv_cache_config, kernel_block_sizes)
+        kv_caches = self.initialize_kv_cache_tensors(
+            kv_cache_config, kernel_block_sizes
+        )
 
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
@@ -3993,30 +5015,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.drafter.validate_same_kv_cache_group(kv_cache_config)
 
         if has_kv_transfer_group():
-            get_kv_transfer_group().register_kv_caches(kv_caches)
-            if self.device.type == 'xpu':
-                get_kv_transfer_group().set_host_xfer_buffer_ops(
-                    copy_kv_blocks)
+            kv_transfer_group = get_kv_transfer_group()
+            kv_transfer_group.register_kv_caches(kv_caches)
+            kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
 
         if self.dcp_world_size > 1:
-            layer_names = self.attn_groups[0][0].layer_names
-            layers = get_layers_from_vllm_config(self.vllm_config,
-                                                 AttentionLayerBase,
-                                                 layer_names)
+            layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
             for layer in layers.values():
                 assert layer.impl.need_to_return_lse_for_decode, (
                     "DCP requires attention impls to return"
                     " the softmax lse for decode, but the impl "
                     f"{layer.impl.__class__.__name__} "
-                    "does not return the softmax lse for decode.")
+                    "does not return the softmax lse for decode."
+                )
 
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """
         Add encoder-only layers to the KV cache config.
         """
         block_size = self.vllm_config.cache_config.block_size
-        encoder_only_attn_specs: dict[AttentionSpec,
-                                      list[str]] = defaultdict(list)
+        encoder_only_attn_specs: dict[AttentionSpec, list[str]] = defaultdict(list)
         attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
         for layer_name, attn_module in attn_layers.items():
             if attn_module.attn_type == AttentionType.ENCODER_ONLY:
@@ -4024,16 +5042,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     block_size=block_size,
                     num_kv_heads=attn_module.num_kv_heads,
                     head_size=attn_module.head_size,
-                    dtype=self.kv_cache_dtype)
+                    dtype=self.kv_cache_dtype,
+                )
                 encoder_only_attn_specs[attn_spec].append(layer_name)
                 self.runner_only_attn_layers.add(layer_name)
         if len(encoder_only_attn_specs) > 0:
-            assert len(
-                encoder_only_attn_specs
-            ) == 1, "Only support one encoder-only attention spec now"
+            assert len(encoder_only_attn_specs) == 1, (
+                "Only support one encoder-only attention spec now"
+            )
             spec, layer_names = encoder_only_attn_specs.popitem()
             self.kv_cache_config.kv_cache_groups.append(
-                KVCacheGroupSpec(layer_names=layer_names, kv_cache_spec=spec))
+                KVCacheGroupSpec(layer_names=layer_names, kv_cache_spec=spec)
+            )
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         """
@@ -4043,15 +5063,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
-
-        block_size = self.vllm_config.cache_config.block_size
-        use_mla = self.vllm_config.model_config.use_mla
-        cache_dtype_str = self.vllm_config.cache_config.cache_dtype
+        if has_ec_transfer() and get_ec_transfer().is_producer:
+            return {}
         kv_cache_spec: dict[str, KVCacheSpec] = {}
-        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+        attn_layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
         for layer_name, attn_module in attn_layers.items():
-            if (kv_tgt_layer :=
-                    attn_module.kv_sharing_target_layer_name) is not None:
+            if isinstance(attn_module, Attention) and (
+                kv_tgt_layer := attn_module.kv_sharing_target_layer_name
+            ):
                 # The layer doesn't need its own KV cache and will use that of
                 # the target layer. We skip creating a KVCacheSpec for it, so
                 # that KV cache management logic will act as this layer does
@@ -4061,90 +5080,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # or enable more requests to be processed simultaneously.
                 self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
                 continue
-
-            # TODO(lucas): move the attention specs into the model layers like
-            # the attention backends
-            if attn_module.attn_type == AttentionType.DECODER:
-                if attn_module.sliding_window is not None:
-                    assert not use_mla, "MLA is not supported for sliding" \
-                        "window"
-                    kv_cache_spec[layer_name] = SlidingWindowSpec(
-                        block_size=block_size,
-                        num_kv_heads=attn_module.num_kv_heads,
-                        head_size=attn_module.head_size,
-                        dtype=self.kv_cache_dtype,
-                        sliding_window=attn_module.sliding_window)
-                elif use_mla:
-                    kv_cache_spec[layer_name] = MLAAttentionSpec(
-                        block_size=block_size,
-                        num_kv_heads=attn_module.num_kv_heads,
-                        head_size=attn_module.head_size,
-                        dtype=self.kv_cache_dtype,
-                        cache_dtype_str=cache_dtype_str)
-                elif self.attention_chunk_size is not None \
-                        and isinstance(attn_module, ChunkedLocalAttention):
-                    kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec(
-                        block_size=block_size,
-                        num_kv_heads=attn_module.num_kv_heads,
-                        head_size=attn_module.head_size,
-                        dtype=self.kv_cache_dtype,
-                        attention_chunk_size=self.attention_chunk_size)
-                else:
-                    kv_cache_spec[layer_name] = FullAttentionSpec(
-                        block_size=block_size,
-                        num_kv_heads=attn_module.num_kv_heads,
-                        head_size=attn_module.head_size,
-                        dtype=self.kv_cache_dtype)
-            elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
-                kv_cache_spec[layer_name] = CrossAttentionSpec(
-                    block_size=block_size,
-                    num_kv_heads=attn_module.num_kv_heads,
-                    head_size=attn_module.head_size,
-                    dtype=self.kv_cache_dtype)
-            elif attn_module.attn_type in (AttentionType.ENCODER,
-                                           AttentionType.ENCODER_ONLY):
-                # encoder-only attention does not need KV cache.
-                continue
-            else:
-                raise ValueError(
-                    f"Unknown attention type: {attn_module.attn_type}")
-
-        mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
-        if len(mamba_layers) > 0:
-            if (self.vllm_config.speculative_config is not None
-                    and self.vllm_config.model_config.hf_config.model_type
-                    not in ["qwen3_next"]):
-                raise NotImplementedError(
-                    "Mamba with speculative decoding is not supported yet.")
-            if self.vllm_config.cache_config.enable_prefix_caching:
-                raise NotImplementedError(
-                    "Prefix caching is not supported for Mamba yet.")
-            max_model_len = self.vllm_config.model_config.max_model_len
-
-            page_size_padded = (
-                self.vllm_config.cache_config.mamba_page_size_padded)
-
-            # Set block_size to max_model_len, so that mamba model will always
-            # have only one block in the KV cache.
-            for layer_name, mamba_module in mamba_layers.items():
-                kv_cache_spec[layer_name] = MambaSpec(
-                    shapes=mamba_module.get_state_shape(),
-                    dtypes=mamba_module.get_state_dtype(),
-                    block_size=max_model_len,
-                    page_size_padded=page_size_padded,
-                    mamba_type=mamba_module.mamba_type,
-                    num_speculative_blocks=(
-                        self.speculative_config.num_speculative_tokens
-                        if self.speculative_config else 0),
-                )
-        ds_indexer_layers = get_layers_from_vllm_config(
-            self.vllm_config, DeepseekV32IndexerCache)
-        for layer_name, ds_indexer_module in ds_indexer_layers.items():
-            kv_cache_spec[layer_name] = ds_indexer_module.get_kv_cache_spec()
+            # Skip modules that don't need KV cache (eg encoder-only attention)
+            if spec := attn_module.get_kv_cache_spec(self.vllm_config):
+                kv_cache_spec[layer_name] = spec
 
         return kv_cache_spec
 
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
+    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
         # This is a short term mitigation for issue mentioned in
         # https://github.com/vllm-project/vllm/issues/22754.
         # `tolist` would trigger a cuda wise stream sync, which
@@ -4153,8 +5095,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # this is in the critical path of every single model
         # forward loop, this has caused perf issue for a disagg
         # setup.
-        pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
+        pinned = self.sampled_token_ids_pinned_cpu[: sampled_token_ids.shape[0]]
         pinned.copy_(sampled_token_ids, non_blocking=True)
         self.transfer_event.record()
         self.transfer_event.synchronize()
-        return pinned.tolist()
+        return [row for row in pinned.numpy()]
diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index 39be8c74102e434f199fa4194a3ebeb27ba61c6f..9de123263755b461bb00063639d99c2e4f29a0f4 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import threading
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 
@@ -11,14 +12,17 @@ import vllm.envs as envs
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import get_ep_group
-from vllm.distributed.device_communicators.pynccl_allocator import (
-    set_graph_pool_id)
-from vllm.forward_context import (create_forward_context, get_forward_context,
-                                  override_forward_context)
+from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
+from vllm.forward_context import (
+    DPMetadata,
+    create_forward_context,
+    get_forward_context,
+    override_forward_context,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import has_deep_gemm
+from vllm.utils.import_utils import has_deep_gemm
 from vllm.v1.worker.ubatching import UBatchContext, make_ubatch_contexts
 
 logger = init_logger(__name__)
@@ -29,8 +33,8 @@ class UbatchMetadata:
     context: UBatchContext
     input_ids: torch.Tensor
     positions: torch.Tensor
-    inputs_embeds: Optional[torch.Tensor]
-    intermediate_tensors: Optional[IntermediateTensors]
+    inputs_embeds: torch.Tensor | None
+    intermediate_tensors: IntermediateTensors | None
     num_tokens: int
 
 
@@ -38,31 +42,35 @@ class UbatchMetadata:
 class CUDAGraphMetaData:
     cudagraph: torch.cuda.CUDAGraph
     ubatch_metadata: UbatchMetadata
-    outputs: Optional[Any] = None
+    outputs: Any | None = None
 
 
 class SMControlContextManager:
-
-    def __init__(self, comm_sms: int, set_comm_sms: Callable[[int], None],
-                 set_compute_sms: Callable[[int], None]):
+    def __init__(
+        self,
+        comm_sms: int,
+        set_comm_sms: Callable[[int], None],
+        set_compute_sms: Callable[[int], None],
+    ):
         """
-        Context manager for controlling SM (Streaming Multiprocessor) 
+        Context manager for controlling SM (Streaming Multiprocessor)
         allocation. Upon entering the context, it sets the number of SMs
         allocated for communication and computation to comm_sms and
         total_sms - comm_sms respectively. Upon exiting, it restores the
         allocation to use all available SMs (i.e. total_sms).
 
         Args:
-            comm_sms (int): The number of SMs to allocate for communication. 
+            comm_sms (int): The number of SMs to allocate for communication.
                 (The remainder will be used for computation.)
-            set_comm_sms (Callable[[int], None]): 
+            set_comm_sms (Callable[[int], None]):
                 A function that sets the number of SMs for communication.
-            set_compute_sms (Callable[[int], None]): 
+            set_compute_sms (Callable[[int], None]):
                 A function that sets the number of SMs for computation.
         """
 
-        assert current_platform.is_cuda(), \
+        assert current_platform.is_cuda(), (
             "SM control is currently only supported on CUDA"
+        )
 
         props = torch.cuda.get_device_properties(torch.cuda.current_device())
         total_sms = props.multi_processor_count
@@ -84,9 +92,13 @@ class SMControlContextManager:
 
 
 class UBatchWrapper:
-
-    def __init__(self, runnable: Callable, vllm_config: VllmConfig,
-                 runtime_mode: CUDAGraphMode, device: torch.cuda.device):
+    def __init__(
+        self,
+        runnable: Callable,
+        vllm_config: VllmConfig,
+        runtime_mode: CUDAGraphMode,
+        device: torch.cuda.device,
+    ):
         self.runnable = runnable
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
@@ -100,7 +112,8 @@ class UBatchWrapper:
         self.graph_pool = None
         if runtime_mode is not CUDAGraphMode.NONE:
             self.cudagraph_wrapper = CUDAGraphWrapper(
-                runnable, vllm_config, runtime_mode=runtime_mode)
+                runnable, vllm_config, runtime_mode=runtime_mode
+            )
             self.graph_pool = current_platform.get_global_graph_pool()
 
         self.sm_control = self._create_sm_control_context(vllm_config)
@@ -114,8 +127,7 @@ class UBatchWrapper:
         if vllm_config.parallel_config.enable_expert_parallel:
             # Currently only DeepEP highthroughput supports SM control so this
             # only affects that case.
-            all2all_manager = get_ep_group(
-            ).device_communicator.all2all_manager
+            all2all_manager = get_ep_group().device_communicator.all2all_manager
 
             if all2all_manager.max_sms_used() is not None:
                 comm_sms = min(comm_sms, all2all_manager.max_sms_used())
@@ -127,18 +139,23 @@ class UBatchWrapper:
         set_compute_sms = lambda sms: None
         if has_deep_gemm() and comm_sms > 0:
             import deep_gemm as dg
+
             set_compute_sms = lambda sms: dg.set_num_sms(sms)
 
-        return SMControlContextManager(comm_sms=comm_sms,
-                                       set_comm_sms=set_comm_sms,
-                                       set_compute_sms=set_compute_sms)
+        return SMControlContextManager(
+            comm_sms=comm_sms,
+            set_comm_sms=set_comm_sms,
+            set_compute_sms=set_compute_sms,
+        )
 
     def __getattr__(self, key: str):
         # allow accessing the attributes of the runnable.
         if hasattr(self.runnable, key):
             return getattr(self.runnable, key)
-        raise AttributeError(f"Attribute {key} not exists in the runnable of "
-                             f"cudagraph wrapper: {self.runnable}")
+        raise AttributeError(
+            f"Attribute {key} not exists in the runnable of "
+            f"cudagraph wrapper: {self.runnable}"
+        )
 
     def unwrap(self) -> Callable:
         # in case we need to access the original runnable.
@@ -153,14 +170,14 @@ class UBatchWrapper:
         the graph capture.
 
         The flow is as follows:
-        1. The main thread starts up each ubatch thread. Each thread will 
+        1. The main thread starts up each ubatch thread. Each thread will
         initialize its cuda context (torch.cuda.current_blas_handle())
         before going to sleep upon entering the ubatch_context.
 
-        2. The main thread starts the graph capture and wakes up the first 
+        2. The main thread starts the graph capture and wakes up the first
         ubatch thread.
 
-        3. Each ubatch thread runs the model to completion and returns the 
+        3. Each ubatch thread runs the model to completion and returns the
         completed output tensors back to the main thread.
 
         4. The main thread stores the captured cudagraph along with its metadata
@@ -187,36 +204,38 @@ class UBatchWrapper:
 
         results: list[tuple[int, torch.Tensor]] = []
         compute_stream = ubatch_metadata[0].context.compute_stream
-        num_tokens = ubatch_metadata[0].num_tokens + \
-            ubatch_metadata[1].num_tokens
+        num_tokens = ubatch_metadata[0].num_tokens + ubatch_metadata[1].num_tokens
 
         # Ubatches will manually manage the forward context, so we override
         # it to None here so we can have it restored correctly later
         with override_forward_context(None):
             ubatch_threads = []
             for metadata in ubatch_metadata:
-                thread = threading.Thread(target=_capture_ubatch_thread,
-                                          args=(
-                                              results,
-                                              metadata,
-                                          ))
+                thread = threading.Thread(
+                    target=_capture_ubatch_thread,
+                    args=(
+                        results,
+                        metadata,
+                    ),
+                )
                 ubatch_threads.append(thread)
                 thread.start()
             self.ready_barrier.wait()  # Wait for both threads to be ready
 
             # Capture the cudagraph
-            cudagraph_metadata = \
-                CUDAGraphMetaData(
-                            cudagraph=torch.cuda.CUDAGraph(),
-                            ubatch_metadata=ubatch_metadata,
-                        )
+            cudagraph_metadata = CUDAGraphMetaData(
+                cudagraph=torch.cuda.CUDAGraph(),
+                ubatch_metadata=ubatch_metadata,
+            )
             if self.graph_pool is not None:
                 set_graph_pool_id(self.graph_pool)
             else:
                 set_graph_pool_id(current_platform.graph_pool_handle())
-            with torch.cuda.graph(cudagraph_metadata.cudagraph,
-                                  stream=compute_stream,
-                                  pool=self.graph_pool):
+            with torch.cuda.graph(
+                cudagraph_metadata.cudagraph,
+                stream=compute_stream,
+                pool=self.graph_pool,
+            ):
                 ubatch_metadata[0].context.cpu_wait_event.set()
                 for thread in ubatch_threads:
                     thread.join()
@@ -227,7 +246,6 @@ class UBatchWrapper:
         return cudagraph_metadata.outputs
 
     def _run_ubatches(self, ubatch_metadata, model) -> torch.Tensor:
-
         @torch.inference_mode()
         def _ubatch_thread(results, model, ubatch_metadata):
             with ubatch_metadata.context:
@@ -247,12 +265,14 @@ class UBatchWrapper:
         with override_forward_context(None):
             ubatch_threads = []
             for metadata in ubatch_metadata:
-                thread = threading.Thread(target=_ubatch_thread,
-                                          args=(
-                                              results,
-                                              model,
-                                              metadata,
-                                          ))
+                thread = threading.Thread(
+                    target=_ubatch_thread,
+                    args=(
+                        results,
+                        model,
+                        metadata,
+                    ),
+                )
                 ubatch_threads.append(thread)
                 thread.start()
             self.ready_barrier.wait()  # Wait for both threads to be ready
@@ -263,11 +283,19 @@ class UBatchWrapper:
         result = torch.cat(sorted_results, dim=0)
         return result
 
-    def _make_ubatch_metadata(self, ubatch_slices, attn_metadata, input_ids,
-                              positions, inputs_embeds, intermediate_tensors,
-                              compute_stream, dp_metadata, batch_descriptor,
-                              cudagraph_runtime_mode) -> list[UbatchMetadata]:
-
+    def _make_ubatch_metadata(
+        self,
+        ubatch_slices,
+        attn_metadata,
+        input_ids,
+        positions,
+        inputs_embeds,
+        intermediate_tensors,
+        compute_stream,
+        dp_metadata,
+        batch_descriptor,
+        cudagraph_runtime_mode,
+    ) -> list[UbatchMetadata]:
         # Create one forward context per ubatch
         forward_contexts = []
         for i, ubatch_slice in enumerate(ubatch_slices):
@@ -277,22 +305,32 @@ class UBatchWrapper:
                     self.vllm_config,
                     dp_metadata=dp_metadata,
                     batch_descriptor=batch_descriptor,
-                    cudagraph_runtime_mode=cudagraph_runtime_mode))
+                    cudagraph_runtime_mode=cudagraph_runtime_mode,
+                )
+            )
 
         ubatch_ctxs = make_ubatch_contexts(
             num_micro_batches=len(ubatch_slices),
             comm_stream=self.comm_stream,
             compute_stream=compute_stream,
             forward_contexts=forward_contexts,
-            ready_barrier=self.ready_barrier)
+            ready_barrier=self.ready_barrier,
+        )
 
         ubatch_metadata: list[UbatchMetadata] = []
         for i, ubatch_slice in enumerate(ubatch_slices):
-            sliced_input_ids, sliced_positions, sliced_inputs_embeds, \
-            sliced_intermediate_tensors = \
-                self._slice_model_inputs(
-                    ubatch_slice.token_slice, input_ids, positions,
-                    inputs_embeds, intermediate_tensors)
+            (
+                sliced_input_ids,
+                sliced_positions,
+                sliced_inputs_embeds,
+                sliced_intermediate_tensors,
+            ) = self._slice_model_inputs(
+                ubatch_slice.token_slice,
+                input_ids,
+                positions,
+                inputs_embeds,
+                intermediate_tensors,
+            )
             ubatch_metadata.append(
                 UbatchMetadata(
                     context=ubatch_ctxs[i],
@@ -300,13 +338,21 @@ class UBatchWrapper:
                     positions=sliced_positions,
                     inputs_embeds=sliced_inputs_embeds,
                     intermediate_tensors=sliced_intermediate_tensors,
-                    num_tokens=ubatch_slice.token_slice.stop -
-                    ubatch_slice.token_slice.start))
+                    num_tokens=ubatch_slice.token_slice.stop
+                    - ubatch_slice.token_slice.start,
+                )
+            )
 
         return ubatch_metadata
 
-    def _slice_model_inputs(self, tokens_slice: slice, input_ids, positions,
-                            inputs_embeds, intermediate_tensors):
+    def _slice_model_inputs(
+        self,
+        tokens_slice: slice,
+        input_ids,
+        positions,
+        inputs_embeds,
+        intermediate_tensors,
+    ):
         sliced_input_ids = input_ids[tokens_slice]
         # if we are using mrope. Mrope adds an additional dimension to the
         # positions tensor
@@ -314,13 +360,17 @@ class UBatchWrapper:
             sliced_positions = positions[:, tokens_slice]
         else:
             sliced_positions = positions[tokens_slice]
-        sliced_inputs_embeds = inputs_embeds[
-            tokens_slice] if inputs_embeds else None
-        sliced_intermediate_tensors = intermediate_tensors[
-            tokens_slice] if intermediate_tensors else None
-
-        return (sliced_input_ids, sliced_positions, sliced_inputs_embeds,
-                sliced_intermediate_tensors)
+        sliced_inputs_embeds = inputs_embeds[tokens_slice] if inputs_embeds else None
+        sliced_intermediate_tensors = (
+            intermediate_tensors[tokens_slice] if intermediate_tensors else None
+        )
+
+        return (
+            sliced_input_ids,
+            sliced_positions,
+            sliced_inputs_embeds,
+            sliced_intermediate_tensors,
+        )
 
     def __call__(self, *args, **kwargs):
         forward_context = get_forward_context()
@@ -330,7 +380,6 @@ class UBatchWrapper:
 
         # If there's no ubatching, just run the runnable object
         if ubatch_slices is None:
-
             # This is to account for the case where ubatching was aborted.
             # When we capture full graphs we only capture one graph per shape,
             # meaning that if we have a ubatched  cudagraph for the current
@@ -342,29 +391,43 @@ class UBatchWrapper:
                 if batch_descriptor.num_tokens in self.cudagraphs:
                     cudagraph_runtime_mode = CUDAGraphMode.NONE
 
-            if cudagraph_runtime_mode in (CUDAGraphMode.NONE,
-                                          CUDAGraphMode.PIECEWISE):
+            if cudagraph_runtime_mode in (CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE):
                 return self.runnable(*args, **kwargs)
             else:
                 assert self.cudagraph_wrapper is not None
                 return self.cudagraph_wrapper(*args, **kwargs)
 
         attn_metadata = forward_context.attn_metadata
-        num_tokens = (ubatch_slices[0].token_slice.stop -
-                      ubatch_slices[0].token_slice.start) * 2
-        input_ids = kwargs['input_ids']
-        positions = kwargs['positions']
-        intermediate_tensors = kwargs['intermediate_tensors']
-        inputs_embeds = kwargs['inputs_embeds']
+        num_tokens = (
+            ubatch_slices[0].token_slice.stop - ubatch_slices[0].token_slice.start
+        ) * 2
+        input_ids = kwargs["input_ids"]
+        positions = kwargs["positions"]
+        intermediate_tensors = kwargs["intermediate_tensors"]
+        inputs_embeds = kwargs["inputs_embeds"]
         compute_stream = torch.cuda.current_stream()
 
         dp_metadata = forward_context.dp_metadata
 
         # We shouldn't be here unless we are running with multiple DP ranks
         assert dp_metadata is not None
-
-        if num_tokens not in self.cudagraphs \
-            and cudagraph_runtime_mode is CUDAGraphMode.FULL:
+        num_tokens_per_ubatch = (
+            ubatch_slices[0].token_slice.stop - ubatch_slices[0].token_slice.start
+        )
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        ubatch_num_tokens_across_dp = torch.tensor(
+            [num_tokens_per_ubatch] * dp_size, device="cpu", dtype=torch.int32
+        )
+        ubatch_dp_metadata = DPMetadata.make(
+            self.vllm_config.parallel_config,
+            num_tokens_per_ubatch,
+            ubatch_num_tokens_across_dp,
+        )
+
+        if (
+            num_tokens not in self.cudagraphs
+            and cudagraph_runtime_mode is CUDAGraphMode.FULL
+        ):
             ubatch_metadata = self._make_ubatch_metadata(
                 ubatch_slices=ubatch_slices,
                 attn_metadata=attn_metadata,
@@ -373,13 +436,16 @@ class UBatchWrapper:
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
                 compute_stream=compute_stream,
-                dp_metadata=dp_metadata,
+                dp_metadata=ubatch_dp_metadata,
                 batch_descriptor=batch_descriptor,
-                cudagraph_runtime_mode=CUDAGraphMode.NONE)
+                cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            )
             with self.sm_control:
                 return self._capture_ubatches(ubatch_metadata, self.model)
-        elif num_tokens in self.cudagraphs \
-            and cudagraph_runtime_mode is CUDAGraphMode.FULL:
+        elif (
+            num_tokens in self.cudagraphs
+            and cudagraph_runtime_mode is CUDAGraphMode.FULL
+        ):
             cudagraph_metadata = self.cudagraphs[num_tokens]
             cudagraph_metadata.cudagraph.replay()
             return cudagraph_metadata.outputs
@@ -394,6 +460,7 @@ class UBatchWrapper:
                 compute_stream=compute_stream,
                 dp_metadata=dp_metadata,
                 batch_descriptor=batch_descriptor,
-                cudagraph_runtime_mode=CUDAGraphMode.NONE)
+                cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            )
             with self.sm_control:
                 return self._run_ubatches(ubatch_metadata, self.model)
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 8c75e8914857bafac7ee0985c239c3c69b729982..52f84ddbf159fe0879fb9ee115b3416705e8a21a 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
-import copy
+
 import gc
 import os
 from contextlib import AbstractContextManager, nullcontext
-from typing import TYPE_CHECKING, Any, Optional, Union
+from types import NoneType
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.distributed
@@ -13,23 +14,40 @@ import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.distributed import (ensure_model_parallel_initialized,
-                              init_distributed_environment,
-                              set_custom_all_reduce)
-from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
-from vllm.distributed.parallel_state import get_pp_group, get_tp_group
+from vllm.distributed import (
+    ensure_model_parallel_initialized,
+    init_distributed_environment,
+    set_custom_all_reduce,
+)
+from vllm.distributed.ec_transfer import ensure_ec_transfer_initialized
+from vllm.distributed.kv_transfer import (
+    ensure_kv_transfer_initialized,
+    get_kv_transfer_group,
+    has_kv_transfer_group,
+)
+from vllm.distributed.parallel_state import (
+    get_pp_group,
+    get_tp_group,
+)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
 from vllm.platforms import current_platform
+from vllm.profiler.gpu_profiler import CudaProfilerWrapper
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
+from vllm.v1.core.sched.output import GrammarOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, ModelRunnerOutput)
+from vllm.v1.outputs import (
+    AsyncModelRunnerOutput,
+    DraftTokenIds,
+    ModelRunnerOutput,
+)
 from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
@@ -43,7 +61,6 @@ if TYPE_CHECKING:
 
 
 class Worker(WorkerBase):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -52,16 +69,18 @@ class Worker(WorkerBase):
         distributed_init_method: str,
         is_driver_worker: bool = False,
     ):
-
-        super().__init__(vllm_config=vllm_config,
-                         local_rank=local_rank,
-                         rank=rank,
-                         distributed_init_method=distributed_init_method,
-                         is_driver_worker=is_driver_worker)
+        super().__init__(
+            vllm_config=vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=is_driver_worker,
+        )
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
-            from vllm.utils import init_cached_hf_modules
+            from vllm.utils.import_utils import init_cached_hf_modules
+
             init_cached_hf_modules()
 
         # Buffers saved before sleep
@@ -71,8 +90,11 @@ class Worker(WorkerBase):
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
         if envs.VLLM_TORCH_PROFILER_DIR:
             torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
-            logger.info("Profiling enabled. Traces will be saved to: %s",
-                        torch_profiler_trace_dir)
+            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+            )
             logger.debug(
                 "Profiler config: record_shapes=%s,"
                 "profile_memory=%s,with_stack=%s,with_flops=%s",
@@ -91,7 +113,11 @@ class Worker(WorkerBase):
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir, use_gzip=True))
+                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+                ),
+            )
+        elif envs.VLLM_TORCH_CUDA_PROFILE:
+            self.profiler = CudaProfilerWrapper()
         else:
             self.profiler = None
 
@@ -104,22 +130,22 @@ class Worker(WorkerBase):
         if level == 2:
             model = self.model_runner.model
             self._sleep_saved_buffers = {
-                name: buffer.cpu().clone()
-                for name, buffer in model.named_buffers()
+                name: buffer.cpu().clone() for name, buffer in model.named_buffers()
             }
 
         allocator = CuMemAllocator.get_instance()
-        allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
+        allocator.sleep(offload_tags=("weights",) if level == 1 else tuple())
         free_bytes_after_sleep, total = torch.cuda.mem_get_info()
         freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
         used_bytes = total - free_bytes_after_sleep
         assert freed_bytes >= 0, "Memory usage increased after sleeping."
         logger.info(
-            "Sleep mode freed %.2f GiB memory, "
-            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
-            used_bytes / GiB_bytes)
+            "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
+            freed_bytes / GiB_bytes,
+            used_bytes / GiB_bytes,
+        )
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         from vllm.device_allocator.cumem import CuMemAllocator
 
         allocator = CuMemAllocator.get_instance()
@@ -133,23 +159,21 @@ class Worker(WorkerBase):
                     buffer.data.copy_(self._sleep_saved_buffers[name].data)
             self._sleep_saved_buffers = {}
 
-    def _maybe_get_memory_pool_context(self,
-                                       tag: str) -> AbstractContextManager:
+    def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
         if self.vllm_config.model_config.enable_sleep_mode:
             from vllm.device_allocator.cumem import CuMemAllocator
 
             allocator = CuMemAllocator.get_instance()
             if tag == "weights":
                 assert allocator.get_current_usage() == 0, (
-                    "Sleep mode can only be "
-                    "used for one instance per process.")
+                    "Sleep mode can only be used for one instance per process."
+                )
             context = allocator.use_memory_pool(tag=tag)
         else:
             context = nullcontext()
         return context
 
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
@@ -157,6 +181,37 @@ class Worker(WorkerBase):
         if self.device_config.device.type == "cuda":
             # This env var set by Ray causes exceptions with graph building.
             os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            if (
+                self.parallel_config.data_parallel_size > 1
+                and self.parallel_config.data_parallel_size_local > 0
+                and self.parallel_config.distributed_executor_backend
+                not in ["ray", "external_launcher"]
+                and self.vllm_config.parallel_config.data_parallel_backend != "ray"
+                and self.vllm_config.parallel_config.nnodes_within_dp == 1
+            ):
+                # Use local DP rank if available, otherwise use global DP rank.
+                dp_local_rank = self.parallel_config.data_parallel_rank_local
+                if dp_local_rank is None:
+                    dp_local_rank = self.parallel_config.data_parallel_rank
+
+                tp_pp_world_size = (
+                    self.parallel_config.pipeline_parallel_size
+                    * self.parallel_config.tensor_parallel_size
+                )
+
+                # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK
+                self.local_rank += dp_local_rank * tp_pp_world_size
+                assert self.local_rank < torch.cuda.device_count(), (
+                    f"DP adjusted local rank {self.local_rank} is out of bounds. "
+                )
+                visible_device_count = (
+                    torch.cuda.device_count() if torch.cuda.is_available() else 0
+                )
+                assert self.parallel_config.local_world_size <= visible_device_count, (
+                    f"local_world_size ({self.parallel_config.local_world_size}) must "
+                    f"be less than or equal to the number of visible devices "
+                    f"({visible_device_count})."
+                )
             self.device = torch.device(f"cuda:{self.local_rank}")
             current_platform.set_device(self.device)
 
@@ -166,10 +221,13 @@ class Worker(WorkerBase):
             # memory snapshot
             # This ensures NCCL buffers are allocated before we measure
             # available memory
-            init_worker_distributed_environment(self.vllm_config, self.rank,
-                                                self.distributed_init_method,
-                                                self.local_rank,
-                                                current_platform.dist_backend)
+            init_worker_distributed_environment(
+                self.vllm_config,
+                self.rank,
+                self.distributed_init_method,
+                self.local_rank,
+                current_platform.dist_backend,
+            )
 
             # Set random seed.
             set_random_seed(self.model_config.seed)
@@ -180,8 +238,10 @@ class Worker(WorkerBase):
 
             # take current memory snapshot
             self.init_snapshot = MemorySnapshot()
-            self.requested_memory = (self.init_snapshot.total_memory *
-                                     self.cache_config.gpu_memory_utilization)
+            self.requested_memory = (
+                self.init_snapshot.total_memory
+                * self.cache_config.gpu_memory_utilization
+            )
             if self.init_snapshot.free_memory < self.requested_memory:
                 GiB = lambda b: round(b / GiB_bytes, 2)
                 raise ValueError(
@@ -194,12 +254,12 @@ class Worker(WorkerBase):
                     f"utilization or reduce GPU memory used by other processes."
                 )
         else:
-            raise RuntimeError(
-                f"Not support device type: {self.device_config.device}")
+            raise RuntimeError(f"Not support device type: {self.device_config.device}")
 
         # Construct the model runner
         self.model_runner: GPUModelRunner = GPUModelRunner(
-            self.vllm_config, self.device)
+            self.vllm_config, self.device
+        )
 
         if self.rank == 0:
             # If usage stat is enabled, collect relevant info.
@@ -238,16 +298,17 @@ class Worker(WorkerBase):
             self.model_runner.profile_run()
 
             msg = (
-                f"Initial free memory {GiB(self.init_snapshot.free_memory)} "
-                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for "
+                f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
+                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
                 "KV Cache as specified by kv_cache_memory_bytes config and "
-                "skipped memory profiling. This does does not respect the "
+                "skipped memory profiling. This does not respect the "
                 "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
                 "config when you want manual control of KV cache memory "
                 "size. If OOM'ed, check the difference of initial free "
                 "memory between the current run and the previous run "
                 "where kv_cache_memory_bytes is suggested and update it "
-                "correspondingly.")
+                "correspondingly."
+            )
             logger.info(msg)
             return kv_cache_memory_bytes
 
@@ -257,8 +318,8 @@ class Worker(WorkerBase):
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         with memory_profiling(
-                self.init_snapshot,
-                weights_memory=int(self.model_runner.model_memory_usage),
+            self.init_snapshot,
+            weights_memory=int(self.model_runner.model_memory_usage),
         ) as profile_result:
             self.model_runner.profile_run()
 
@@ -275,15 +336,15 @@ class Worker(WorkerBase):
             "This happens when other processes sharing the same container "
             "release GPU memory while vLLM is profiling during initialization. "
             "To fix this, ensure consistent GPU memory allocation or "
-            "isolate vLLM in its own container.")
-        self.available_kv_cache_memory_bytes = self.requested_memory \
-            - profile_result.non_kv_cache_memory
+            "isolate vLLM in its own container."
+        )
+        self.available_kv_cache_memory_bytes = (
+            self.requested_memory - profile_result.non_kv_cache_memory
+        )
 
-        unrequested_memory = self.init_snapshot.free_memory \
-            - self.requested_memory
+        unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
         logger.debug(
-            "Initial free memory: %.2f GiB; "
-            "Requested memory: %.2f (util), %.2f GiB",
+            "Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
             GiB(self.init_snapshot.free_memory),
             self.cache_config.gpu_memory_utilization,
             GiB(self.requested_memory),
@@ -295,18 +356,43 @@ class Worker(WorkerBase):
             GiB(free_gpu_memory - unrequested_memory),
         )
         logger.debug(profile_result)
-        logger.info("Available KV cache memory: %.2f GiB",
-                    GiB(self.available_kv_cache_memory_bytes))
+        logger.info_once(
+            "Available KV cache memory: %.2f GiB",
+            GiB(self.available_kv_cache_memory_bytes),
+            scope="local",
+        )
         gc.collect()
 
         return int(self.available_kv_cache_memory_bytes)
 
+    def get_kv_connector_handshake_metadata(self) -> dict | None:
+        """Get KV connector metadata from this worker if available."""
+
+        if not has_kv_transfer_group():
+            return None
+
+        connector = get_kv_transfer_group()
+        # Return None for connectors that don't need to exchange handshake
+        # metadata across workers.
+        if (metadata := connector.get_handshake_metadata()) is None:
+            return None
+
+        tp_rank = get_tp_group().rank_in_group
+        return {tp_rank: metadata}
+
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return self.model_runner.get_kv_cache_spec()
 
     def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""
 
+        # Init kv cache connector here, because it requires
+        # `kv_cache_config`.
+        # NOTE(Kuntai): This need to be done before `initialize_kv_cache`,
+        # because `initialize_kv_cache` will inject kv cache groups not
+        # related to kv cache connector (e.g. kv cache sharing layers).
+        ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
+
         if self.vllm_config.model_config.enable_sleep_mode:
             from vllm.device_allocator.cumem import CuMemAllocator
 
@@ -324,15 +410,14 @@ class Worker(WorkerBase):
         warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
         if not self.model_config.enforce_eager:
             warmup_sizes = [
-                x for x in warmup_sizes if x not in
-                self.vllm_config.compilation_config.cudagraph_capture_sizes
+                x
+                for x in warmup_sizes
+                if x not in self.vllm_config.compilation_config.cudagraph_capture_sizes
             ]
         # We skip EPLB here since we don't want to record dummy metrics
         for size in sorted(warmup_sizes, reverse=True):
             logger.info("Compile and warming up model for size %d", size)
-            self.model_runner._dummy_run(size,
-                                         skip_eplb=True,
-                                         remove_lora=False)
+            self.model_runner._dummy_run(size, skip_eplb=True, remove_lora=False)
         self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config)
 
         # Warmup and tune the kernels used during model execution before
@@ -343,8 +428,9 @@ class Worker(WorkerBase):
         if not self.model_config.enforce_eager:
             cuda_graph_memory_bytes = self.model_runner.capture_model()
 
-        if (self.cache_config.kv_cache_memory_bytes is None
-                and hasattr(self, "peak_activation_memory")):
+        if self.cache_config.kv_cache_memory_bytes is None and hasattr(
+            self, "peak_activation_memory"
+        ):
             # Suggests optimal kv cache memory size if we rely on
             # memory_profiling to guess the kv cache memory size which
             # provides peak_activation_memory and a few other memory
@@ -358,16 +444,22 @@ class Worker(WorkerBase):
             # slightly underestimate the memory consumption.
             # So leave a small buffer (=150MiB) to avoid OOM.
             redundancy_buffer_memory = 150 * (1 << 20)
-            non_kv_cache_memory = (self.model_runner.model_memory_usage +
-                                   self.peak_activation_memory +
-                                   self.non_torch_memory +
-                                   cuda_graph_memory_bytes)
+            non_kv_cache_memory = (
+                self.model_runner.model_memory_usage
+                + self.peak_activation_memory
+                + self.non_torch_memory
+                + cuda_graph_memory_bytes
+            )
             kv_cache_memory_bytes_to_gpu_limit = (
-                self.init_snapshot.free_memory - non_kv_cache_memory -
-                redundancy_buffer_memory)
+                self.init_snapshot.free_memory
+                - non_kv_cache_memory
+                - redundancy_buffer_memory
+            )
             kv_cache_memory_bytes_to_requested_limit = (
-                int(self.requested_memory) - non_kv_cache_memory -
-                redundancy_buffer_memory)
+                int(self.requested_memory)
+                - non_kv_cache_memory
+                - redundancy_buffer_memory
+            )
 
             msg = (
                 f"Free memory on device "
@@ -388,7 +480,8 @@ class Worker(WorkerBase):
                 f"{kv_cache_memory_bytes_to_gpu_limit}` "
                 f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
                 f"utilize gpu memory. Current kv cache memory in use is "
-                f"{GiB(self.available_kv_cache_memory_bytes)} GiB.")
+                f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
+            )
 
             logger.debug(msg)
 
@@ -398,81 +491,97 @@ class Worker(WorkerBase):
         # NOTE: This is called after `capture_model` on purpose to prevent
         # memory buffers from being cleared by `torch.cuda.empty_cache`.
         if get_pp_group().is_last_rank:
-            max_num_reqs = min(self.scheduler_config.max_num_seqs,
-                               self.scheduler_config.max_num_batched_tokens)
+            max_num_reqs = min(
+                self.scheduler_config.max_num_seqs,
+                self.scheduler_config.max_num_batched_tokens,
+            )
 
             # We skip EPLB here since we don't want to record dummy metrics
-            hidden_states, last_hidden_states = \
-                self.model_runner._dummy_run(
-                    num_tokens=max_num_reqs,
-                    skip_eplb=True,
-                )
+            hidden_states, last_hidden_states = self.model_runner._dummy_run(
+                num_tokens=max_num_reqs,
+                skip_eplb=True,
+            )
             if self.model_runner.is_pooling_model:
                 self.model_runner._dummy_pooler_run(hidden_states)
             else:
-                self.model_runner._dummy_sampler_run(
-                    hidden_states=last_hidden_states)
+                self.model_runner._dummy_sampler_run(hidden_states=last_hidden_states)
 
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
+    def reset_mm_cache(self) -> None:
+        self.model_runner.reset_mm_cache()
+
     def get_model(self) -> nn.Module:
         return self.model_runner.get_model()
 
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.model_runner.get_supported_tasks()
 
+    def annotate_profile(self, scheduler_output):
+        # add trace annotation so that we can easily distinguish
+        # new/cached request numbers in each iteration
+        if not self.profiler:
+            return nullcontext()
+
+        num_new = len(scheduler_output.scheduled_new_reqs)
+        num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
+
+        return torch.profiler.record_function(
+            f"execute_new_{num_new}_cached_{num_cached}"
+        )
+
+    @torch.inference_mode()
+    def sample_tokens(
+        self, grammar_output: "GrammarOutput | None"
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput:
+        return self.model_runner.sample_tokens(grammar_output)
+
     @torch.inference_mode()
     def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
+        self, scheduler_output: "SchedulerOutput"
+    ) -> ModelRunnerOutput | None:
         intermediate_tensors = None
         forward_pass = scheduler_output.total_num_scheduled_tokens > 0
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        num_input_tokens = self.model_runner._get_num_input_tokens(
-            num_scheduled_tokens)
+        num_input_tokens = self.model_runner._get_num_input_tokens(num_scheduled_tokens)
         all_gather_tensors = {
-            "residual":
-            not is_residual_scattered_for_sp(self.vllm_config,
-                                             num_input_tokens)
+            "residual": not is_residual_scattered_for_sp(
+                self.vllm_config, num_input_tokens
+            )
         }
         if forward_pass and not get_pp_group().is_first_rank:
             intermediate_tensors = IntermediateTensors(
                 get_pp_group().recv_tensor_dict(
                     all_gather_group=get_tp_group(),
-                    all_gather_tensors=all_gather_tensors))
+                    all_gather_tensors=all_gather_tensors,
+                )
+            )
 
-        output = self.model_runner.execute_model(scheduler_output,
-                                                 intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
-            return output
+        with self.annotate_profile(scheduler_output):
+            output = self.model_runner.execute_model(
+                scheduler_output, intermediate_tensors
+            )
+            if isinstance(output, (ModelRunnerOutput, NoneType)):
+                return output
 
         assert isinstance(output, IntermediateTensors)
         parallel_config = self.vllm_config.parallel_config
-        assert parallel_config.distributed_executor_backend != (
-            "external_launcher") and not get_pp_group().is_last_rank
-
-        get_pp_group().send_tensor_dict(output.tensors,
-                                        all_gather_group=get_tp_group(),
-                                        all_gather_tensors=all_gather_tensors)
-
-        kv_connector_output = output.kv_connector_output
-        if not kv_connector_output:
-            return None
+        assert (
+            parallel_config.distributed_executor_backend != "external_launcher"
+            and not get_pp_group().is_last_rank
+        )
 
-        # In case of PP with kv transfer, we need to pass through the
-        # kv_connector_output
-        if (not kv_connector_output.finished_sending
-                and not kv_connector_output.finished_recving):
-            return EMPTY_MODEL_RUNNER_OUTPUT
+        get_pp_group().send_tensor_dict(
+            output.tensors,
+            all_gather_group=get_tp_group(),
+            all_gather_tensors=all_gather_tensors,
+        )
 
-        output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
-        output.kv_connector_output = kv_connector_output
-        return output
+        return None
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         return self.model_runner.take_draft_token_ids()
 
     def profile(self, is_start: bool = True):
@@ -482,10 +591,19 @@ class Worker(WorkerBase):
             self.profiler.start()
         else:
             self.profiler.stop()
-            # only print profiler results on rank 0
-            if self.local_rank == 0:
-                print(self.profiler.key_averages().table(
-                    sort_by="self_cuda_time_total"))
+            if isinstance(self.profiler, torch.profiler.profile):
+                rank = self.local_rank
+                profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
+                profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
+                sort_key = "self_cuda_time_total"
+                table = self.profiler.key_averages().table(sort_by=sort_key)
+
+                with open(profiler_out_file, "w") as f:
+                    print(table, file=f)
+
+                # only print profiler results on rank 0
+                if rank == 0:
+                    print(table)
 
     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(1, uniform_decode=True)
@@ -506,68 +624,77 @@ class Worker(WorkerBase):
         # worker will always be healthy as long as it's running.
         return
 
-    def _eplb_before_scale_down(self, old_ep_size: int,
-                                new_ep_size: int) -> None:
+    def _eplb_before_scale_down(self, old_ep_size: int, new_ep_size: int) -> None:
         from vllm.distributed.parallel_state import get_ep_group
+
         if get_ep_group().rank == 0:
-            logger.info("[Elastic EP] Starting expert resharding "
-                        "before scaling down...")
+            logger.info(
+                "[Elastic EP] Starting expert resharding before scaling down..."
+            )
         rank_mapping = {
             old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1
             for old_ep_rank in range(old_ep_size)
         }
         assert self.model_runner.eplb_state is not None
-        self.model_runner.eplb_state.rearrange(self.model_runner.model,
-                                               execute_shuffle=True,
-                                               global_expert_load=None,
-                                               rank_mapping=rank_mapping)
+        self.model_runner.eplb_state.rearrange(
+            execute_shuffle=True,
+            global_expert_load=None,
+            rank_mapping=rank_mapping,
+        )
         torch.cuda.synchronize()
         if get_ep_group().rank == 0:
             logger.info("[Elastic EP] Expert resharding completed!")
 
     def _eplb_after_scale_up(
-            self, old_ep_size: int, new_ep_size: int,
-            global_expert_load: Optional[torch.Tensor]) -> None:
+        self,
+        old_ep_size: int,
+        new_ep_size: int,
+        global_expert_loads: list[torch.Tensor] | None,
+    ) -> None:
         from vllm.distributed.parallel_state import get_ep_group
+
         if get_ep_group().rank == 0:
-            logger.info("[Elastic EP] Starting expert resharding "
-                        "after scaling up...")
-        rank_mapping = {
-            old_ep_rank: old_ep_rank
-            for old_ep_rank in range(old_ep_size)
-        }
+            logger.info("[Elastic EP] Starting expert resharding after scaling up...")
+        rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
         assert self.model_runner.eplb_state is not None
         self.model_runner.eplb_state.rearrange(
-            self.model_runner.model,
             execute_shuffle=True,
-            global_expert_load=global_expert_load,
-            rank_mapping=rank_mapping)
+            global_expert_loads=global_expert_loads,
+            rank_mapping=rank_mapping,
+        )
         if get_ep_group().rank == 0:
             logger.info("[Elastic EP] Expert resharding completed!")
 
     def _reconfigure_parallel_config(
-            self, reconfig_request: ReconfigureDistributedRequest) -> None:
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
         """
         Update parallel config with provided reconfig_request
         """
         parallel_config = self.vllm_config.parallel_config
-        parallel_config.data_parallel_size = \
-            reconfig_request.new_data_parallel_size
-        if reconfig_request.new_data_parallel_rank != \
-        ReconfigureRankType.KEEP_CURRENT_RANK:
-            parallel_config.data_parallel_rank = \
-                reconfig_request.new_data_parallel_rank
-        if reconfig_request.new_data_parallel_rank_local != \
-        ReconfigureRankType.KEEP_CURRENT_RANK:
-            parallel_config.data_parallel_rank_local = \
+        parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
+        if (
+            reconfig_request.new_data_parallel_rank
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
+        if (
+            reconfig_request.new_data_parallel_rank_local
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            parallel_config.data_parallel_rank_local = (
                 reconfig_request.new_data_parallel_rank_local
-        parallel_config.data_parallel_master_ip = \
+            )
+        parallel_config.data_parallel_master_ip = (
             reconfig_request.new_data_parallel_master_ip
-        parallel_config.data_parallel_master_port = \
+        )
+        parallel_config.data_parallel_master_port = (
             reconfig_request.new_data_parallel_master_port
+        )
 
-    def _reconfigure_moe(self, old_ep_size: int,
-                         new_ep_size: int) -> Optional[torch.Tensor]:
+    def _reconfigure_moe(
+        self, old_ep_size: int, new_ep_size: int
+    ) -> torch.Tensor | None:
         """
         Reconfigure MoE modules with provided reconfig_request
 
@@ -575,75 +702,121 @@ class Worker(WorkerBase):
         otherwise None
         """
         from vllm.distributed.parallel_state import (
-            get_dp_group, get_ep_group, prepare_communication_buffer_for_model)
+            get_dp_group,
+            get_ep_group,
+            prepare_communication_buffer_for_model,
+        )
         from vllm.model_executor.layers.fused_moe.layer import (
-            FusedMoEParallelConfig)
+            FusedMoE,
+            FusedMoEParallelConfig,
+        )
 
         parallel_config = self.vllm_config.parallel_config
-        moe_modules = [
-            module for module in self.model_runner.model.modules()
-            if (module.__class__.__name__ == "FusedMoE"
-                or module.__class__.__name__ == "SharedFusedMoE")
-        ]
-        num_local_experts = moe_modules[0].moe_config.num_local_experts
-        assert all(module.moe_config.num_local_experts == num_local_experts
-                   for module in moe_modules), (
-                       "All MoE modules must have the same number of experts")
-        for module in moe_modules:
-            module.moe_config.num_experts = num_local_experts * new_ep_size
-            module.global_num_experts = module.moe_config.num_experts
-            module.moe_parallel_config = FusedMoEParallelConfig.make(
-                tp_size_=get_tp_group().world_size,
-                dp_size_=get_dp_group().world_size,
-                vllm_parallel_config=parallel_config,
-            )
-            module.moe_config.moe_parallel_config = module.moe_parallel_config
+
+        def get_moe_modules(model: torch.nn.Module) -> list[FusedMoE]:
+            return [
+                module
+                for module in model.modules()
+                if (
+                    module.__class__.__name__ == "FusedMoE"
+                    or module.__class__.__name__ == "SharedFusedMoE"
+                )
+            ]
+
+        def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int):
+            assert all(
+                module.moe_config.num_local_experts == num_local_experts
+                for module in moe_modules
+            ), "All MoE modules must have the same number of experts"
+            for module in moe_modules:
+                module.moe_config.num_experts = num_local_experts * new_ep_size
+                module.global_num_experts = module.moe_config.num_experts
+                module.moe_parallel_config = FusedMoEParallelConfig.make(
+                    tp_size_=get_tp_group().world_size,
+                    dp_size_=get_dp_group().world_size,
+                    vllm_parallel_config=parallel_config,
+                )
+                module.moe_config.moe_parallel_config = module.moe_parallel_config
+            return moe_modules
+
+        model_moe_modules = get_moe_modules(self.model_runner.model)
+        num_local_experts = model_moe_modules[0].moe_config.num_local_experts
+
+        update_moe_modules(model_moe_modules, num_local_experts)
+        drafter_model = None
+        if hasattr(self.model_runner, "drafter") and hasattr(
+            self.model_runner.drafter, "model"
+        ):
+            drafter_model = self.model_runner.drafter.model
+        if drafter_model is not None and is_mixture_of_experts(drafter_model):
+            drafter_moe_modules = get_moe_modules(drafter_model)
+            # Check if drafter and model have matching configs
+            assert (
+                drafter_moe_modules[0].moe_config.num_local_experts == num_local_experts
+            ), "Drafter and model configs should be the same"
+            update_moe_modules(drafter_moe_modules, num_local_experts)
+
         if new_ep_size < old_ep_size:
             num_local_physical_experts = num_local_experts
             assert self.model_runner.eplb_state is not None
-            new_physical_experts = \
+            new_physical_experts = (
                 self.model_runner.eplb_state.physical_to_logical_map.shape[1]
+            )
             parallel_config.eplb_config.num_redundant_experts = (
-                new_physical_experts -
-                self.model_runner.eplb_state.logical_replica_count.shape[1])
-            global_expert_load = None
+                new_physical_experts
+                - self.model_runner.eplb_state.logical_replica_count.shape[1]
+            )
+            global_expert_loads = None
         else:
-            num_local_physical_experts = torch.tensor([num_local_experts],
-                                                      dtype=torch.int32,
-                                                      device="cpu")
-            torch.distributed.broadcast(num_local_physical_experts,
-                                        group=get_ep_group().cpu_group,
-                                        group_src=0)
+            num_local_physical_experts = torch.tensor(
+                [num_local_experts], dtype=torch.int32, device="cpu"
+            )
+            torch.distributed.broadcast(
+                num_local_physical_experts, group=get_ep_group().cpu_group, group_src=0
+            )
             num_local_physical_experts = num_local_physical_experts.item()
             new_physical_experts = num_local_physical_experts * new_ep_size
             assert self.model_runner.eplb_state is not None
-            global_expert_load = self.model_runner.eplb_state.rearrange(
-                self.model_runner.model, execute_shuffle=False)
+            global_expert_loads = self.model_runner.eplb_state.rearrange(
+                execute_shuffle=False
+            )
             parallel_config.eplb_config.num_redundant_experts = (
-                new_physical_experts - global_expert_load.shape[1])
+                new_physical_experts - global_expert_loads[0].shape[1]
+            )
         prepare_communication_buffer_for_model(self.model_runner.model)
+        if drafter_model is not None:
+            prepare_communication_buffer_for_model(drafter_model)
         self.model_runner.model.update_physical_experts_metadata(
             num_physical_experts=new_physical_experts,
-            num_local_physical_experts=num_local_physical_experts)
-        return global_expert_load
+            num_local_physical_experts=num_local_physical_experts,
+        )
+        return global_expert_loads
 
     def reinitialize_distributed(
-            self, reconfig_request: ReconfigureDistributedRequest) -> None:
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
         from vllm.config import set_current_vllm_config
         from vllm.distributed.parallel_state import (
-            cleanup_dist_env_and_memory, get_ep_group)
+            cleanup_dist_env_and_memory,
+            get_ep_group,
+        )
 
         old_ep_size = get_ep_group().world_size
         old_ep_rank = get_ep_group().rank
-        new_ep_size = reconfig_request.new_data_parallel_size * get_tp_group(
-        ).world_size * get_pp_group().world_size
+        new_ep_size = (
+            reconfig_request.new_data_parallel_size
+            * get_tp_group().world_size
+            * get_pp_group().world_size
+        )
         if new_ep_size < old_ep_size:
             self._eplb_before_scale_down(old_ep_size, new_ep_size)
 
         cleanup_dist_env_and_memory()
 
-        if reconfig_request.new_data_parallel_rank == \
-        ReconfigureRankType.SHUTDOWN_CURRENT_RANK:
+        if (
+            reconfig_request.new_data_parallel_rank
+            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
+        ):
             assert old_ep_rank >= new_ep_size
             # shutdown
             return
@@ -651,24 +824,27 @@ class Worker(WorkerBase):
         self._reconfigure_parallel_config(reconfig_request)
 
         with set_current_vllm_config(self.vllm_config):
-            init_worker_distributed_environment(self.vllm_config, self.rank,
-                                                self.distributed_init_method,
-                                                self.local_rank)
+            init_worker_distributed_environment(
+                self.vllm_config,
+                self.rank,
+                self.distributed_init_method,
+                self.local_rank,
+            )
 
-        global_expert_load = self._reconfigure_moe(old_ep_size, new_ep_size)
+        global_expert_loads = self._reconfigure_moe(old_ep_size, new_ep_size)
 
         if new_ep_size > old_ep_size:
-            assert global_expert_load is not None
-            self._eplb_after_scale_up(old_ep_size, new_ep_size,
-                                      global_expert_load)
+            assert global_expert_loads is not None
+            self._eplb_after_scale_up(old_ep_size, new_ep_size, global_expert_loads)
 
     def save_sharded_state(
         self,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
         from vllm.model_executor.model_loader import ShardedStateLoader
+
         ShardedStateLoader.save_model(
             self.model_runner.model,
             path,
@@ -681,7 +857,8 @@ class Worker(WorkerBase):
         tensorizer_config: "TensorizerConfig",
     ) -> None:
         self.model_runner.save_tensorized_model(
-            tensorizer_config=tensorizer_config, )
+            tensorizer_config=tensorizer_config,
+        )
 
     def shutdown(self) -> None:
         if runner := getattr(self, "model_runner", None):
@@ -691,20 +868,27 @@ class Worker(WorkerBase):
 def init_worker_distributed_environment(
     vllm_config: VllmConfig,
     rank: int,
-    distributed_init_method: Optional[str] = None,
+    distributed_init_method: str | None = None,
     local_rank: int = -1,
     backend: str = "nccl",
 ) -> None:
     """Initialize the distributed environment."""
     parallel_config = vllm_config.parallel_config
+    from vllm.model_executor.layers.batch_invariant import init_batch_invariance
+
+    init_batch_invariance()
     set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
 
-    init_distributed_environment(parallel_config.world_size, rank,
-                                 distributed_init_method, local_rank, backend)
+    init_distributed_environment(
+        parallel_config.world_size, rank, distributed_init_method, local_rank, backend
+    )
 
     ensure_model_parallel_initialized(
         parallel_config.tensor_parallel_size,
         parallel_config.pipeline_parallel_size,
-        parallel_config.decode_context_parallel_size)
+        parallel_config.decode_context_parallel_size,
+    )
 
-    ensure_kv_transfer_initialized(vllm_config)
+    # Init ec connector here before KV caches caches init
+    # NOTE: We do not init KV caches for Encoder-only instance in EPD disagg mode
+    ensure_ec_transfer_initialized(vllm_config)
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index 7eaff924ecc1f16e358426aa533497218410f9d6..db037a9fccd5c7753e8b14724a401949f4ad39e7 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -3,22 +3,29 @@
 """
 Define KV connector functionality mixin for model runners.
 """
+
 import copy
+from collections.abc import Generator
 from contextlib import AbstractContextManager, contextmanager, nullcontext
-from typing import Generator  # noqa: UP035
-from typing import TYPE_CHECKING, Optional
+from typing import (
+    TYPE_CHECKING,  # noqa: UP035
+)
 
 from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown,
-                                          get_kv_transfer_group,
-                                          has_kv_transfer_group)
+from vllm.distributed.kv_transfer import (
+    ensure_kv_transfer_shutdown,
+    get_kv_transfer_group,
+    has_kv_transfer_group,
+)
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
-from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
-    KVConnectorStats)
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import init_logger
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput,
-                             ModelRunnerOutput)
+from vllm.v1.outputs import (
+    EMPTY_MODEL_RUNNER_OUTPUT,
+    KVConnectorOutput,
+    ModelRunnerOutput,
+)
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -28,7 +35,6 @@ logger = init_logger(__name__)
 
 # Defined as a kv connector functionality mixin for ModelRunner (GPU, TPU)
 class KVConnectorModelRunnerMixin:
-
     @staticmethod
     def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
         # Update KVConnector with the KVConnector metadata forward().
@@ -36,8 +42,7 @@ class KVConnectorModelRunnerMixin:
             kv_connector = get_kv_transfer_group()
             assert isinstance(kv_connector, KVConnectorBase)
             assert scheduler_output.kv_connector_metadata is not None
-            kv_connector.bind_connector_metadata(
-                scheduler_output.kv_connector_metadata)
+            kv_connector.bind_connector_metadata(scheduler_output.kv_connector_metadata)
 
             # Background KV cache transfers happen here.
             # These transfers are designed to be async and the requests
@@ -59,24 +64,27 @@ class KVConnectorModelRunnerMixin:
     @staticmethod
     def get_finished_kv_transfers(
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_finished(
-                scheduler_output.finished_req_ids)
+                scheduler_output.finished_req_ids
+            )
         return None, None
 
     @staticmethod
-    def kv_connector_no_forward(scheduler_output: "SchedulerOutput",
-                                vllm_config: VllmConfig) -> ModelRunnerOutput:
+    def kv_connector_no_forward(
+        scheduler_output: "SchedulerOutput", vllm_config: VllmConfig
+    ) -> ModelRunnerOutput:
         # KV send/recv even if no work to do.
-        with set_forward_context(
-                None, vllm_config
-        ), KVConnectorModelRunnerMixin._get_kv_connector_output(
-                scheduler_output, wait_for_save=False) as kv_connector_output:
+        with (
+            set_forward_context(None, vllm_config),
+            KVConnectorModelRunnerMixin._get_kv_connector_output(
+                scheduler_output, wait_for_save=False
+            ) as kv_connector_output,
+        ):
             pass
 
-        if (not kv_connector_output.finished_sending
-                and not kv_connector_output.finished_recving):
+        if kv_connector_output.is_empty():
             return EMPTY_MODEL_RUNNER_OUTPUT
 
         output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
@@ -85,18 +93,20 @@ class KVConnectorModelRunnerMixin:
 
     @staticmethod
     def maybe_get_kv_connector_output(
-        scheduler_output: "SchedulerOutput"
-    ) -> AbstractContextManager[Optional[KVConnectorOutput]]:
-        return KVConnectorModelRunnerMixin._get_kv_connector_output(
-            scheduler_output) if has_kv_transfer_group() else nullcontext()
+        scheduler_output: "SchedulerOutput",
+    ) -> AbstractContextManager[KVConnectorOutput | None]:
+        return (
+            KVConnectorModelRunnerMixin._get_kv_connector_output(scheduler_output)
+            if has_kv_transfer_group()
+            else nullcontext()
+        )
 
     # This context manager must be used within an active forward context.
     # It encapsulates the entire KV connector lifecycle within execute_model
     @staticmethod
     @contextmanager
     def _get_kv_connector_output(
-        scheduler_output: "SchedulerOutput",
-        wait_for_save: bool = True
+        scheduler_output: "SchedulerOutput", wait_for_save: bool = True
     ) -> Generator[KVConnectorOutput, None, None]:
         output = KVConnectorOutput()
 
@@ -104,8 +114,7 @@ class KVConnectorModelRunnerMixin:
         kv_connector = get_kv_transfer_group()
         assert isinstance(kv_connector, KVConnectorBase)
         assert scheduler_output.kv_connector_metadata is not None
-        kv_connector.bind_connector_metadata(
-            scheduler_output.kv_connector_metadata)
+        kv_connector.bind_connector_metadata(scheduler_output.kv_connector_metadata)
 
         # Background KV cache transfers happen here.
         # These transfers are designed to be async and the requests
@@ -119,14 +128,17 @@ class KVConnectorModelRunnerMixin:
                 kv_connector.wait_for_save()
 
             output.finished_sending, output.finished_recving = (
-                kv_connector.get_finished(scheduler_output.finished_req_ids))
+                kv_connector.get_finished(scheduler_output.finished_req_ids)
+            )
+            output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors()
 
-            output.kv_connector_stats = KVConnectorModelRunnerMixin.\
-            get_kv_connector_stats()
+            output.kv_connector_stats = (
+                KVConnectorModelRunnerMixin.get_kv_connector_stats()
+            )
             kv_connector.clear_connector_metadata()
 
     @staticmethod
-    def get_kv_connector_stats() -> Optional[KVConnectorStats]:
+    def get_kv_connector_stats() -> KVConnectorStats | None:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_kv_connector_stats()
         return None
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index e416f50322f48108ba0906b0ac4f725eda1d0493..37abe56494609aedb2f67c7168061b6d32bb9fa2 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -5,7 +5,6 @@ Define LoRA functionality mixin for model runners.
 """
 
 from contextlib import contextmanager
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -21,27 +20,24 @@ from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.v1.worker.gpu_input_batch import InputBatch as GPUInputBatch
 from vllm.v1.worker.tpu_input_batch import InputBatch as TPUInputBatch
 
-InputBatch = Union[TPUInputBatch, GPUInputBatch]
+InputBatch = TPUInputBatch | GPUInputBatch
 
 logger = init_logger(__name__)
 
 
 # Defined as a mixin for GPUModelRunner
 class LoRAModelRunnerMixin:
-
-    LORA_WARMUP_RANK = 8
-
-    def load_lora_model(self, model: nn.Module, vllm_config: VllmConfig,
-                        device: torch.device) -> nn.Module:
-
+    def load_lora_model(
+        self, model: nn.Module, vllm_config: VllmConfig, device: torch.device
+    ) -> nn.Module:
         if not supports_lora(model):
-            raise ValueError(
-                f"{model.__class__.__name__} does not support LoRA yet.")
+            raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.")
 
         if supports_multimodal(model):
-            logger.warning("Regarding multimodal models, vLLM currently "
-                           "only supports adding LoRA to language model.")
-
+            logger.warning(
+                "Regarding multimodal models, vLLM currently "
+                "only supports adding LoRA to language model."
+            )
         # Add LoRA Manager to the Model Runner
         self.lora_manager = LRUCacheWorkerLoRAManager(
             vllm_config,
@@ -51,41 +47,50 @@ class LoRAModelRunnerMixin:
         )
         return self.lora_manager.create_lora_manager(model)
 
-    def _set_active_loras(self, prompt_lora_mapping: tuple[int, ...],
-                          token_lora_mapping: tuple[int, ...],
-                          lora_requests: set[LoRARequest]) -> None:
+    def _set_active_loras(
+        self,
+        prompt_lora_mapping: tuple[int, ...],
+        token_lora_mapping: tuple[int, ...],
+        lora_requests: set[LoRARequest],
+    ) -> None:
         self._ensure_lora_enabled()
 
         # Set is_prefill to True, so we always use the SGMV kernels on
         # non-cuda platforms.
         # On cuda platforms we use the same kernels for prefill and
         # decode and this flag is generally ignored.
-        lora_mapping = LoRAMapping(token_lora_mapping,
-                                   prompt_lora_mapping,
-                                   is_prefill=True)
+        lora_mapping = LoRAMapping(
+            token_lora_mapping, prompt_lora_mapping, is_prefill=True
+        )
         self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
 
     def _ensure_lora_enabled(self) -> None:
         if not hasattr(self, "lora_manager"):
-            raise RuntimeError(
-                "LoRA is not enabled. Use --enable-lora to enable LoRA.")
-
-    def set_active_loras(self, input_batch: InputBatch,
-                         num_scheduled_tokens: np.ndarray) -> None:
-
-        prompt_lora_mapping: tuple[int, ...]  # of size input_batch.num_reqs
-        token_lora_mapping: tuple[int,
-                                  ...]  # of size np.sum(num_scheduled_tokens)
+            raise RuntimeError("LoRA is not enabled. Use --enable-lora to enable LoRA.")
+
+    def set_active_loras(
+        self,
+        input_batch: InputBatch,
+        num_scheduled_tokens: np.ndarray,
+        num_sampled_tokens: np.ndarray | None = None,
+    ) -> None:
+        if num_sampled_tokens is None:
+            num_sampled_tokens = np.ones_like(num_scheduled_tokens, dtype=np.int32)
+
+        prompt_lora_mapping: tuple[int, ...]  # of size np.sum(num_sampled_tokens)
+        token_lora_mapping: tuple[int, ...]  # of size np.sum(num_scheduled_tokens)
         lora_requests: set[LoRARequest]
-        prompt_lora_mapping, token_lora_mapping, lora_requests = \
-                            input_batch.make_lora_inputs(num_scheduled_tokens)
-        return self._set_active_loras(prompt_lora_mapping, token_lora_mapping,
-                                      lora_requests)
+        prompt_lora_mapping, token_lora_mapping, lora_requests = (
+            input_batch.make_lora_inputs(num_scheduled_tokens, num_sampled_tokens)
+        )
+        return self._set_active_loras(
+            prompt_lora_mapping, token_lora_mapping, lora_requests
+        )
 
     @contextmanager
-    def maybe_setup_dummy_loras(self,
-                                lora_config: Optional[LoRAConfig],
-                                remove_lora: bool = True):
+    def maybe_setup_dummy_loras(
+        self, lora_config: LoRAConfig | None, remove_lora: bool = True
+    ):
         if lora_config is None:
             yield
         else:
@@ -93,12 +98,16 @@ class LoRAModelRunnerMixin:
             assert self.lora_manager is not None, "LoRA is not enabled"
 
             num_loras = lora_config.max_loras
-
+            lora_warmup_rank = (
+                lora_config.max_lora_rank if lora_config.max_lora_rank < 8 else 8
+            )
             # Make dummy lora requests
             lora_requests: set[LoRARequest] = {
-                LoRARequest(lora_name=f"warmup_{lora_id}",
-                            lora_int_id=lora_id,
-                            lora_path="/not/a/real/path")
+                LoRARequest(
+                    lora_name=f"warmup_{lora_id}",
+                    lora_int_id=lora_id,
+                    lora_path="/not/a/real/path",
+                )
                 for lora_id in range(1, num_loras + 1)
             }
 
@@ -106,8 +115,7 @@ class LoRAModelRunnerMixin:
                 # Add the dummy LoRAs here so _set_active_loras doesn't try to
                 # load from disk.
                 for lr in lora_requests:
-                    self.lora_manager.add_dummy_lora(
-                        lr, rank=self.LORA_WARMUP_RANK)
+                    self.lora_manager.add_dummy_lora(lr, rank=lora_warmup_rank)
 
                 yield
 
@@ -116,8 +124,16 @@ class LoRAModelRunnerMixin:
                 self.lora_manager.remove_all_adapters()
 
     @contextmanager
-    def maybe_select_dummy_loras(self, lora_config: Optional[LoRAConfig],
-                                 num_scheduled_tokens: np.ndarray):
+    def maybe_select_dummy_loras(
+        self,
+        lora_config: LoRAConfig | None,
+        num_scheduled_tokens: np.ndarray,
+        num_sampled_tokens: np.ndarray | None = None,
+        activate_lora: bool = True,
+    ):
+        if num_sampled_tokens is None:
+            num_sampled_tokens = np.ones_like(num_scheduled_tokens, dtype=np.int32)
+
         if lora_config is None:
             yield
         else:
@@ -129,39 +145,53 @@ class LoRAModelRunnerMixin:
 
             # Make prompt lora mapping
             # Assign LoRA IDs cyclically to simulate a worst-case scenario.
-            prompt_lora_mapping = (np.arange(num_reqs, dtype=np.int32) %
-                                   num_loras) + 1
+            if activate_lora:
+                prompt_lora_mapping = (
+                    np.arange(num_reqs, dtype=np.int32) % num_loras
+                ) + 1
+            else:
+                prompt_lora_mapping = np.zeros(num_reqs, dtype=np.int32)
+
+            # Make sample lora mapping
+            sample_lora_mapping = np.repeat(prompt_lora_mapping, num_sampled_tokens)
 
             # Make token lora mapping
-            token_lora_mapping = np.repeat(prompt_lora_mapping,
-                                           num_scheduled_tokens)
+            token_lora_mapping = np.repeat(prompt_lora_mapping, num_scheduled_tokens)
 
             # Make dummy lora requests
             lora_requests: set[LoRARequest] = {
-                LoRARequest(lora_name=f"warmup_{lora_id}",
-                            lora_int_id=lora_id,
-                            lora_path="/not/a/real/path")
+                LoRARequest(
+                    lora_name=f"warmup_{lora_id}",
+                    lora_int_id=lora_id,
+                    lora_path="/not/a/real/path",
+                )
                 for lora_id in range(1, num_loras + 1)
             }
 
-            self._set_active_loras(tuple(prompt_lora_mapping),
-                                   tuple(token_lora_mapping), lora_requests)
+            self._set_active_loras(
+                tuple(sample_lora_mapping), tuple(token_lora_mapping), lora_requests
+            )
 
             yield
 
     @contextmanager
-    def maybe_dummy_run_with_lora(self,
-                                  lora_config: Optional[LoRAConfig],
-                                  num_scheduled_tokens: np.ndarray,
-                                  remove_lora: bool = True):
+    def maybe_dummy_run_with_lora(
+        self,
+        lora_config: LoRAConfig | None,
+        num_scheduled_tokens: np.ndarray,
+        num_sampled_tokens: np.ndarray,
+        activate_lora: bool = True,
+        remove_lora: bool = True,
+    ):
         with (
-                self.maybe_setup_dummy_loras(lora_config, remove_lora),
-                self.maybe_select_dummy_loras(lora_config,
-                                              num_scheduled_tokens),
+            self.maybe_setup_dummy_loras(lora_config, remove_lora),
+            self.maybe_select_dummy_loras(
+                lora_config, num_scheduled_tokens, num_sampled_tokens, activate_lora
+            ),
         ):
             yield
 
-    def maybe_remove_all_loras(self, lora_config: Optional[LoRAConfig]):
+    def maybe_remove_all_loras(self, lora_config: LoRAConfig | None):
         if lora_config is None:
             return
         self.lora_manager.remove_all_adapters()
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index 4cd0ac352de0fc7af5e8196d1c73d1176883c8de..6bf4f9193184967dd680d4f70f86d447f0a30cb6 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -2,14 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Datastructures defining a TPU input batch
 
-from typing import Optional, cast
+from typing import cast
 
 import numpy as np
 import torch
 
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingType
-from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values
+from vllm.utils import length_from_prompt_token_ids_or_embeds
+from vllm.utils.collection_utils import swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState
@@ -18,16 +19,16 @@ _SAMPLING_EPS = 1e-5
 
 
 class InputBatch:
-
     def __init__(
-            self,
-            max_num_reqs: int,
-            max_model_len: int,
-            max_num_batched_tokens: int,
-            device: torch.device,
-            pin_memory: bool,
-            vocab_size: int,
-            block_sizes: list[int],  # The block_size of each kv cache group
+        self,
+        max_num_reqs: int,
+        max_model_len: int,
+        max_num_batched_tokens: int,
+        device: torch.device,
+        pin_memory: bool,
+        vocab_size: int,
+        block_sizes: list[int],  # The block_size of each kv cache group
+        kernel_block_sizes: list[int],
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -36,7 +37,7 @@ class InputBatch:
         self.pin_memory = pin_memory
         self.vocab_size = vocab_size
 
-        self._req_ids: list[Optional[str]] = []
+        self._req_ids: list[str | None] = []
         self.req_id_to_index: dict[str, int] = {}
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
@@ -54,13 +55,12 @@ class InputBatch:
         self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_computed_tokens_cpu_tensor = torch.zeros(
-            (max_num_reqs, ),
+            (max_num_reqs,),
             device="cpu",
             dtype=torch.int32,
             pin_memory=pin_memory,
         )
-        self.num_computed_tokens_cpu = \
-            self.num_computed_tokens_cpu_tensor.numpy()
+        self.num_computed_tokens_cpu = self.num_computed_tokens_cpu_tensor.numpy()
 
         # Block table.
         self.block_table = MultiGroupBlockTable(
@@ -70,94 +70,76 @@ class InputBatch:
             pin_memory=pin_memory,
             device=device,
             block_sizes=block_sizes,
+            kernel_block_sizes=kernel_block_sizes,
         )
 
         # Sampling-related.
-        self.temperature = torch.empty((max_num_reqs, ),
-                                       dtype=torch.float32,
-                                       device=device)
-        self.temperature_cpu_tensor = torch.empty((max_num_reqs, ),
-                                                  dtype=torch.float32,
-                                                  device="cpu",
-                                                  pin_memory=pin_memory)
+        self.temperature = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device=device
+        )
+        self.temperature_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=pin_memory
+        )
         self.temperature_cpu = self.temperature_cpu_tensor.numpy()
         self.greedy_reqs: set[str] = set()
         self.random_reqs: set[str] = set()
 
-        self.top_p = torch.empty((max_num_reqs, ),
-                                 dtype=torch.float32,
-                                 device=device)
-        self.top_p_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.float32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
+        self.top_p = torch.empty((max_num_reqs,), dtype=torch.float32, device=device)
+        self.top_p_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=pin_memory
+        )
         self.top_p_cpu = self.top_p_cpu_tensor.numpy()
         self.top_p_reqs: set[str] = set()
 
-        self.top_k = torch.empty((max_num_reqs, ),
-                                 dtype=torch.int32,
-                                 device=device)
-        self.top_k_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.int32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
+        self.top_k = torch.empty((max_num_reqs,), dtype=torch.int32, device=device)
+        self.top_k_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.int32, device="cpu", pin_memory=pin_memory
+        )
         self.top_k_cpu = self.top_k_cpu_tensor.numpy()
         self.top_k_reqs: set[str] = set()
 
-        self.min_p = torch.empty((max_num_reqs, ),
-                                 dtype=torch.float32,
-                                 device=device)
-        self.min_p_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.float32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
+        self.min_p = torch.empty((max_num_reqs,), dtype=torch.float32, device=device)
+        self.min_p_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=pin_memory
+        )
         self.min_p_cpu = self.min_p_cpu_tensor.numpy()
         self.min_p_reqs: set[str] = set()
 
         # Frequency penalty related data structures
-        self.frequency_penalties = torch.empty((max_num_reqs, ),
-                                               dtype=torch.float,
-                                               device=device)
+        self.frequency_penalties = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device=device
+        )
         self.frequency_penalties_cpu_tensor = torch.empty(
-            (max_num_reqs, ),
-            dtype=torch.float,
-            device="cpu",
-            pin_memory=pin_memory)
-        self.frequency_penalties_cpu = \
-            self.frequency_penalties_cpu_tensor.numpy()
+            (max_num_reqs,), dtype=torch.float, device="cpu", pin_memory=pin_memory
+        )
+        self.frequency_penalties_cpu = self.frequency_penalties_cpu_tensor.numpy()
         self.frequency_penalties_reqs: set[str] = set()
 
         # Presence penalty related data structures
-        self.presence_penalties = torch.empty((max_num_reqs, ),
-                                              dtype=torch.float,
-                                              device=device)
-        self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ),
-                                                         dtype=torch.float,
-                                                         device="cpu",
-                                                         pin_memory=pin_memory)
-        self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy(
+        self.presence_penalties = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device=device
         )
+        self.presence_penalties_cpu_tensor = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device="cpu", pin_memory=pin_memory
+        )
+        self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy()
         self.presence_penalties_reqs: set[str] = set()
 
         # Repetition penalty related data structures
-        self.repetition_penalties = torch.empty((max_num_reqs, ),
-                                                dtype=torch.float,
-                                                device=device)
+        self.repetition_penalties = torch.empty(
+            (max_num_reqs,), dtype=torch.float, device=device
+        )
         self.repetition_penalties_cpu_tensor = torch.empty(
-            (max_num_reqs, ),
-            dtype=torch.float,
-            device="cpu",
-            pin_memory=pin_memory)
-        self.repetition_penalties_cpu = \
-            self.repetition_penalties_cpu_tensor.numpy()
+            (max_num_reqs,), dtype=torch.float, device="cpu", pin_memory=pin_memory
+        )
+        self.repetition_penalties_cpu = self.repetition_penalties_cpu_tensor.numpy()
         self.repetition_penalties_reqs: set[str] = set()
 
         # req_index -> (min_tokens, stop_token_ids)
         self.min_tokens: dict[int, tuple[int, set[int]]] = {}
 
         # lora related
-        self.request_lora_mapping = np.zeros((self.max_num_reqs, ),
-                                             dtype=np.int32)
+        self.request_lora_mapping = np.zeros((self.max_num_reqs,), dtype=np.int64)
         self.lora_id_to_request_ids: dict[int, set[str]] = {}
         self.lora_id_to_lora_request: dict[int, LoRARequest] = {}
 
@@ -174,18 +156,17 @@ class InputBatch:
         # To accumulate prompt logprobs tensor chunks across prefill steps.
         self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
 
-        self.logit_bias: list[Optional[dict[int,
-                                            float]]] = [None] * max_num_reqs
+        self.logit_bias: list[dict[int, float] | None] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
         # NOTE(lufang): In the mask tensor, if the corresponding token allowed,
         # the value is False. Since we use masked_fill_ to set -inf.
-        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
-        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
+        self.allowed_token_ids_mask: torch.Tensor | None = None
+        self.allowed_token_ids_mask_cpu_tensor: torch.Tensor | None = None
 
         # req_index -> bad_words_token_ids
         self.bad_words_token_ids: dict[int, list[list[int]]] = {}
 
-        self.req_output_token_ids: list[Optional[list[int]]] = []
+        self.req_output_token_ids: list[list[int] | None] = []
 
     @property
     def req_ids(self) -> list[str]:
@@ -196,7 +177,7 @@ class InputBatch:
     def add_request(
         self,
         request: "CachedRequestState",
-        req_index: Optional[int] = None,
+        req_index: int | None = None,
     ) -> None:
         if req_index is None:
             req_index = self.num_reqs
@@ -214,15 +195,14 @@ class InputBatch:
 
         # Copy the prompt token ids and output token ids.
         num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
-            request.prompt_token_ids, request.prompt_embeds)
+            request.prompt_token_ids, request.prompt_embeds
+        )
         # TODO: copy prompt_embeds
         self.num_prompt_tokens[req_index] = num_prompt_tokens
-        self.token_ids_cpu[
-            req_index, :num_prompt_tokens] = request.prompt_token_ids
+        self.token_ids_cpu[req_index, :num_prompt_tokens] = request.prompt_token_ids
         start_idx = num_prompt_tokens
         end_idx = start_idx + len(request.output_token_ids)
-        self.token_ids_cpu[req_index,
-                           start_idx:end_idx] = request.output_token_ids
+        self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids
         # Number of token ids in token_ids_cpu.
         # NOTE(woosuk): This may include spec decode tokens.
         self.num_tokens[req_index] = request.num_tokens
@@ -235,8 +215,8 @@ class InputBatch:
         sampling_params = request.sampling_params
         assert sampling_params is not None, "pooling requests not supported yet"
         if sampling_params.sampling_type == SamplingType.GREEDY:
-            # Avoid later division by zero.
-            self.temperature_cpu[req_index] = -1.0
+            # Should avoid division by zero later when apply_temperature.
+            self.temperature_cpu[req_index] = 0.0
             self.greedy_reqs.add(req_id)
         else:
             self.temperature_cpu[req_index] = sampling_params.temperature
@@ -252,23 +232,22 @@ class InputBatch:
             top_k = self.vocab_size
         self.top_k_cpu[req_index] = top_k
         self.min_p_cpu[req_index] = sampling_params.min_p
-        self.frequency_penalties_cpu[
-            req_index] = sampling_params.frequency_penalty
+        self.frequency_penalties_cpu[req_index] = sampling_params.frequency_penalty
         if sampling_params.min_p > _SAMPLING_EPS:
             self.min_p_reqs.add(req_id)
         if sampling_params.frequency_penalty != 0.0:
             self.frequency_penalties_reqs.add(req_id)
-        self.presence_penalties_cpu[
-            req_index] = sampling_params.presence_penalty
+        self.presence_penalties_cpu[req_index] = sampling_params.presence_penalty
         if sampling_params.presence_penalty != 0.0:
             self.presence_penalties_reqs.add(req_id)
-        self.repetition_penalties_cpu[
-            req_index] = sampling_params.repetition_penalty
+        self.repetition_penalties_cpu[req_index] = sampling_params.repetition_penalty
         if sampling_params.repetition_penalty != 1.0:
             self.repetition_penalties_reqs.add(req_id)
         if sampling_params.min_tokens:
-            self.min_tokens[req_index] = (sampling_params.min_tokens,
-                                          sampling_params.all_stop_token_ids)
+            self.min_tokens[req_index] = (
+                sampling_params.min_tokens,
+                sampling_params.all_stop_token_ids,
+            )
 
         # NOTE(woosuk): self.generators should not include the requests that
         # do not have their own generator.
@@ -287,23 +266,23 @@ class InputBatch:
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
                 # False means we don't fill with -inf.
-                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
-                                                          self.vocab_size,
-                                                          dtype=torch.bool,
-                                                          device=self.device)
-                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
+                self.allowed_token_ids_mask = torch.zeros(
                     self.max_num_reqs,
                     self.vocab_size,
                     dtype=torch.bool,
-                    device="cpu")
+                    device=self.device,
+                )
+                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
+                    self.max_num_reqs, self.vocab_size, dtype=torch.bool, device="cpu"
+                )
             self.allowed_token_ids_mask_cpu_tensor[req_index] = True
             # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = False
+                sampling_params.allowed_token_ids
+            ] = False
 
         if sampling_params.bad_words_token_ids:
-            self.bad_words_token_ids[
-                req_index] = sampling_params.bad_words_token_ids
+            self.bad_words_token_ids[req_index] = sampling_params.bad_words_token_ids
 
         # Add request lora ID
         if request.lora_request:
@@ -318,7 +297,7 @@ class InputBatch:
             # No LoRA
             self.request_lora_mapping[req_index] = 0
 
-    def remove_request(self, req_id: str) -> Optional[int]:
+    def remove_request(self, req_id: str) -> int | None:
         """This method must always be followed by a call to condense()."""
 
         req_index = self.req_id_to_index.pop(req_id, None)
@@ -361,35 +340,51 @@ class InputBatch:
     def swap_states(self, i1: int, i2: int) -> None:
         old_id_i1 = self._req_ids[i1]
         old_id_i2 = self._req_ids[i2]
-        self._req_ids[i1], self._req_ids[i2] =\
-            self._req_ids[i2], self._req_ids[i1] # noqa
-        self.req_output_token_ids[i1], self.req_output_token_ids[i2] =\
-            self.req_output_token_ids[i2], self.req_output_token_ids[i1]
+        self._req_ids[i1], self._req_ids[i2] = self._req_ids[i2], self._req_ids[i1]  # noqa
+        self.req_output_token_ids[i1], self.req_output_token_ids[i2] = (
+            self.req_output_token_ids[i2],
+            self.req_output_token_ids[i1],
+        )
         assert old_id_i1 is not None and old_id_i2 is not None
-        self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] =\
-            self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1]
-        self.num_tokens[i1], self.num_tokens[i2] =\
-            self.num_tokens[i2], self.num_tokens[i1]
-        self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] =\
-            self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1]
-        self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] =\
-            self.num_prompt_tokens[i2], self.num_prompt_tokens[i1]
-        self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] =\
-            self.num_computed_tokens_cpu[i2], self.num_computed_tokens_cpu[i1]
-        self.temperature_cpu[i1], self.temperature_cpu[i2] =\
-            self.temperature_cpu[i2], self.temperature_cpu[i1]
-        self.top_p_cpu[i1], self.top_p_cpu[i2] =\
-            self.top_p_cpu[i2], self.top_p_cpu[i1]
-        self.top_k_cpu[i1], self.top_k_cpu[i2] =\
-            self.top_k_cpu[i2], self.top_k_cpu[i1]
-        self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] =\
-            self.frequency_penalties_cpu[i2], self.frequency_penalties_cpu[i1]
-        self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] =\
-            self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1]
-        self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] =\
-            self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
-        self.min_p_cpu[i1], self.min_p_cpu[i2] =\
-            self.min_p_cpu[i2], self.min_p_cpu[i1]
+        self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] = (
+            self.req_id_to_index[old_id_i2],
+            self.req_id_to_index[old_id_i1],
+        )
+        self.num_tokens[i1], self.num_tokens[i2] = (
+            self.num_tokens[i2],
+            self.num_tokens[i1],
+        )
+        self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] = (
+            self.num_tokens_no_spec[i2],
+            self.num_tokens_no_spec[i1],
+        )
+        self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] = (
+            self.num_prompt_tokens[i2],
+            self.num_prompt_tokens[i1],
+        )
+        self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] = (
+            self.num_computed_tokens_cpu[i2],
+            self.num_computed_tokens_cpu[i1],
+        )
+        self.temperature_cpu[i1], self.temperature_cpu[i2] = (
+            self.temperature_cpu[i2],
+            self.temperature_cpu[i1],
+        )
+        self.top_p_cpu[i1], self.top_p_cpu[i2] = self.top_p_cpu[i2], self.top_p_cpu[i1]
+        self.top_k_cpu[i1], self.top_k_cpu[i2] = self.top_k_cpu[i2], self.top_k_cpu[i1]
+        self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] = (
+            self.frequency_penalties_cpu[i2],
+            self.frequency_penalties_cpu[i1],
+        )
+        self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] = (
+            self.presence_penalties_cpu[i2],
+            self.presence_penalties_cpu[i1],
+        )
+        self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] = (
+            self.repetition_penalties_cpu[i2],
+            self.repetition_penalties_cpu[i1],
+        )
+        self.min_p_cpu[i1], self.min_p_cpu[i2] = self.min_p_cpu[i2], self.min_p_cpu[i1]
 
         # NOTE: the following is unsafe
         # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
@@ -404,21 +399,28 @@ class InputBatch:
         swap_dict_values(self.min_tokens, i1, i2)
         swap_dict_values(self.bad_words_token_ids, i1, i2)
 
-        self.request_lora_mapping[i1], self.request_lora_mapping[i2] =\
-            self.request_lora_mapping[i2], self.request_lora_mapping[i1]
-        self.logit_bias[i1], self.logit_bias[i2] =\
-            self.logit_bias[i2], self.logit_bias[i1]
+        self.request_lora_mapping[i1], self.request_lora_mapping[i2] = (
+            self.request_lora_mapping[i2],
+            self.request_lora_mapping[i1],
+        )
+        self.logit_bias[i1], self.logit_bias[i2] = (
+            self.logit_bias[i2],
+            self.logit_bias[i1],
+        )
 
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[i1], \
-                self.allowed_token_ids_mask_cpu_tensor[i2] =\
-                self.allowed_token_ids_mask_cpu_tensor[i2], \
-                    self.allowed_token_ids_mask_cpu_tensor[i1]
+            (
+                self.allowed_token_ids_mask_cpu_tensor[i1],
+                self.allowed_token_ids_mask_cpu_tensor[i2],
+            ) = (
+                self.allowed_token_ids_mask_cpu_tensor[i2],
+                self.allowed_token_ids_mask_cpu_tensor[i1],
+            )
         self.block_table.swap_row(i1, i2)
 
     def condense(self, empty_req_indices: list[int]) -> None:
         """Move non-empty requests down into lower, empty indices.
-        
+
         Args:
           empty_req_indices: empty batch indices, sorted descending.
         """
@@ -454,25 +456,29 @@ class InputBatch:
 
             num_tokens = self.num_tokens[last_req_index]
             self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
-                last_req_index, :num_tokens]
+                last_req_index, :num_tokens
+            ]
             self.num_tokens[empty_index] = num_tokens
             self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
-                last_req_index]
-            self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[
-                last_req_index]
-            self.num_computed_tokens_cpu[
-                empty_index] = self.num_computed_tokens_cpu[last_req_index]
+                last_req_index
+            ]
+            self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[last_req_index]
+            self.num_computed_tokens_cpu[empty_index] = self.num_computed_tokens_cpu[
+                last_req_index
+            ]
             self.block_table.move_row(last_req_index, empty_index)
-            self.temperature_cpu[empty_index] = self.temperature_cpu[
-                last_req_index]
+            self.temperature_cpu[empty_index] = self.temperature_cpu[last_req_index]
             self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
             self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
-            self.frequency_penalties_cpu[
-                empty_index] = self.frequency_penalties_cpu[last_req_index]
-            self.presence_penalties_cpu[
-                empty_index] = self.presence_penalties_cpu[last_req_index]
-            self.repetition_penalties_cpu[
-                empty_index] = self.repetition_penalties_cpu[last_req_index]
+            self.frequency_penalties_cpu[empty_index] = self.frequency_penalties_cpu[
+                last_req_index
+            ]
+            self.presence_penalties_cpu[empty_index] = self.presence_penalties_cpu[
+                last_req_index
+            ]
+            self.repetition_penalties_cpu[empty_index] = self.repetition_penalties_cpu[
+                last_req_index
+            ]
             self.min_p_cpu[empty_index] = self.min_p_cpu[last_req_index]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
@@ -483,28 +489,28 @@ class InputBatch:
                 self.min_tokens[empty_index] = min_token
 
             self.request_lora_mapping[empty_index] = self.request_lora_mapping[
-                last_req_index]
+                last_req_index
+            ]
 
             self.logit_bias[empty_index] = self.logit_bias[last_req_index]
 
             if self.allowed_token_ids_mask_cpu_tensor is not None:
-                self.allowed_token_ids_mask_cpu_tensor[
-                    empty_index] = self.allowed_token_ids_mask_cpu_tensor[
-                        last_req_index]
+                self.allowed_token_ids_mask_cpu_tensor[empty_index] = (
+                    self.allowed_token_ids_mask_cpu_tensor[last_req_index]
+                )
 
-            bad_words_token_ids = self.bad_words_token_ids.pop(
-                last_req_index, None)
+            bad_words_token_ids = self.bad_words_token_ids.pop(last_req_index, None)
             if bad_words_token_ids is not None:
                 self.bad_words_token_ids[empty_index] = bad_words_token_ids
             # Decrement last_req_index since it is now empty.
             last_req_index -= 1
 
         # Trim lists to the batch size.
-        del self._req_ids[self.num_reqs:]
-        del self.req_output_token_ids[self.num_reqs:]
+        del self._req_ids[self.num_reqs :]
+        del self.req_output_token_ids[self.num_reqs :]
 
     def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
-        max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max()
+        max_prompt_len = self.num_prompt_tokens[: self.num_reqs].max()
         prompt_token_ids_cpu_tensor = torch.empty(
             (self.num_reqs, max_prompt_len),
             device="cpu",
@@ -512,17 +518,15 @@ class InputBatch:
             pin_memory=self.pin_memory,
         )
         prompt_token_ids = prompt_token_ids_cpu_tensor.numpy()
-        prompt_token_ids[:] = self.token_ids_cpu[:self.
-                                                 num_reqs, :max_prompt_len]
+        prompt_token_ids[:] = self.token_ids_cpu[: self.num_reqs, :max_prompt_len]
         # Use the value of vocab_size as a pad since we don't have a
         # token_id of this value.
         for i in range(self.num_reqs):
-            prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size
-        return prompt_token_ids_cpu_tensor.to(device=self.device,
-                                              non_blocking=True)
+            prompt_token_ids[i, self.num_prompt_tokens[i] :] = self.vocab_size
+        return prompt_token_ids_cpu_tensor.to(device=self.device, non_blocking=True)
 
     def make_lora_inputs(
-        self, num_scheduled_tokens: np.ndarray
+        self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
     ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
         """
         Given the num_scheduled_tokens for each request in the batch, return
@@ -535,12 +539,12 @@ class InputBatch:
             3. lora_requests: Set of relevant LoRA requests.
         """
 
-        req_lora_mapping = self.request_lora_mapping[:self.num_reqs]
+        req_lora_mapping = self.request_lora_mapping[: self.num_reqs]
         prompt_lora_mapping = tuple(req_lora_mapping)
-        token_lora_mapping = tuple(
-            req_lora_mapping.repeat(num_scheduled_tokens))
+        token_lora_mapping = tuple(req_lora_mapping.repeat(num_scheduled_tokens))
         active_lora_requests: set[LoRARequest] = set(
-            self.lora_id_to_lora_request.values())
+            self.lora_id_to_lora_request.values()
+        )
 
         return prompt_lora_mapping, token_lora_mapping, active_lora_requests
 
@@ -570,12 +574,14 @@ class InputBatch:
 
     @property
     def no_penalties(self) -> bool:
-        return (len(self.presence_penalties_reqs) == 0
-                and len(self.frequency_penalties_reqs) == 0
-                and len(self.repetition_penalties_reqs) == 0)
+        return (
+            len(self.presence_penalties_reqs) == 0
+            and len(self.frequency_penalties_reqs) == 0
+            and len(self.repetition_penalties_reqs) == 0
+        )
 
     @property
-    def max_num_logprobs(self) -> Optional[int]:
+    def max_num_logprobs(self) -> int | None:
         return max(self.num_logprobs.values()) if self.num_logprobs else None
 
     @property
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 39f2974c8088899fa650234fc395a4e5d64861ae..e9eb7cad38f88cfc8bf51aafcc1777dd3de832de 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -3,12 +3,13 @@
 import bisect
 import gc
 import time
-from typing import TYPE_CHECKING, Any, Optional, cast
+from typing import TYPE_CHECKING, Any, cast
 from unittest.mock import patch
 
 import numpy as np
 import torch
 import torch.nn as nn
+
 # TPU XLA related
 import torch_xla
 import torch_xla.core.xla_model as xm
@@ -18,51 +19,80 @@ import torch_xla.runtime as xr
 import vllm.envs as envs
 from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import MLAAttention
 from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
-from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import (ParallelConfig, VllmConfig,
-                         get_layers_from_vllm_config, update_config)
-from vllm.distributed.kv_transfer import (get_kv_transfer_group,
-                                          has_kv_transfer_group)
+from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
+from vllm.config import (
+    ParallelConfig,
+    VllmConfig,
+    get_layers_from_vllm_config,
+    update_config,
+)
+from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
 from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.model_executor.model_loader.tpu import TPUModelLoader
-from vllm.model_executor.models.interfaces import (SupportsMultiModal,
-                                                   supports_transcription)
+from vllm.model_executor.models.interfaces import (
+    SupportsMultiModal,
+    supports_transcription,
+)
 from vllm.model_executor.models.interfaces_base import (
-    is_pooling_model, is_text_generation_model)
+    is_pooling_model,
+    is_text_generation_model,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem,
-                                    PlaceholderRange)
+from vllm.multimodal.inputs import (
+    BatchedTensorInputs,
+    MultiModalKwargsItem,
+    PlaceholderRange,
+)
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import (LayerBlockType, cdiv, is_pin_memory_available,
-                        prev_power_of_2)
-from vllm.v1.attention.backends.pallas import (TPU_STR_DTYPE_TO_TORCH_DTYPE,
-                                               PallasAttentionBackend,
-                                               PallasMetadata,
-                                               get_page_size_bytes)
-from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
-                                        KVCacheConfig, KVCacheSpec,
-                                        SlidingWindowSpec)
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsLists,
-                             LogprobsTensors, ModelRunnerOutput)
+from vllm.utils.math_utils import cdiv, prev_power_of_2
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.attention.backends.pallas import (
+    TPU_STR_DTYPE_TO_TORCH_DTYPE,
+    PallasAttentionBackend,
+    PallasMetadata,
+    get_page_size_bytes,
+)
+from vllm.v1.kv_cache_interface import (
+    AttentionSpec,
+    FullAttentionSpec,
+    KVCacheConfig,
+    KVCacheSpec,
+    MLAAttentionSpec,
+    SlidingWindowSpec,
+)
+from vllm.v1.outputs import (
+    EMPTY_MODEL_RUNNER_OUTPUT,
+    LogprobsLists,
+    LogprobsTensors,
+    ModelRunnerOutput,
+)
 from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
 from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler
 from vllm.v1.worker.kv_connector_model_runner_mixin import (
-    KVConnectorModelRunnerMixin, KVConnectorOutput)
+    KVConnectorModelRunnerMixin,
+    KVConnectorOutput,
+)
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch
 
-from .utils import (MultiModalBudget, add_kv_sharing_layers_to_kv_cache_groups,
-                    bind_kv_cache, sanity_check_mm_encoder_outputs)
+from .utils import (
+    MultiModalBudget,
+    add_kv_sharing_layers_to_kv_cache_groups,
+    bind_kv_cache,
+    sanity_check_mm_encoder_outputs,
+)
 
 if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 
 logger = init_logger(__name__)
 
@@ -107,12 +137,11 @@ MIN_NUM_SEQS = 8
 # branch predictions are included as subgraph inputs to facilitate
 # pre-compilation.
 class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
-
     def __init__(
         self,
         vllm_config: VllmConfig,
         device: torch.device,
-        original_parallel_config: Optional[ParallelConfig] = None,
+        original_parallel_config: ParallelConfig | None = None,
     ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
@@ -139,7 +168,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_devices = xr.global_runtime_device_count()
             mesh_shape = (num_devices, 1)
             device_ids = np.array(range(num_devices))
-            self.mesh = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+            self.mesh = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
 
         self.enforce_eager = model_config.enforce_eager
 
@@ -155,8 +184,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             else:
                 self.kv_cache_dtype = model_dtype
         else:
-            self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[
-                cache_config.cache_dtype]
+            self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
         self._hidden_states_dtype = self.dtype
 
         self.sliding_window = model_config.get_sliding_window()
@@ -164,25 +192,28 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.max_model_len = model_config.max_model_len
         self.most_model_len = envs.VLLM_TPU_MOST_MODEL_LEN
         self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
-        self.num_blocks_per_most_len_req = cdiv(
-            self.most_model_len,
-            self.block_size) if self.most_model_len is not None else None
+        self.num_blocks_per_most_len_req = (
+            cdiv(self.most_model_len, self.block_size)
+            if self.most_model_len is not None
+            else None
+        )
         # InputBatch needs to work with sampling tensors greater than padding
         # to avoid dynamic shapes. Also, avoid suboptimal alignment.
         self.max_num_reqs = max(scheduler_config.max_num_seqs, MIN_NUM_SEQS)
         self.num_tokens_paddings = _get_token_paddings(
             min_token_size=16,
             max_token_size=scheduler_config.max_num_batched_tokens,
-            padding_gap=envs.VLLM_TPU_BUCKET_PADDING_GAP)
+            padding_gap=envs.VLLM_TPU_BUCKET_PADDING_GAP,
+        )
         # In case `max_num_tokens < max(num_tokens_paddings)` use the actual
         # padded max value to pre-allocate data structures and pre-compile.
         self.max_num_tokens = self.num_tokens_paddings[-1]
 
         # Model-related.
         self.num_attn_layers = model_config.get_num_layers_by_block_type(
-            parallel_config, LayerBlockType.attention)
-        self.num_query_heads = model_config.get_num_attention_heads(
-            parallel_config)
+            parallel_config, "attention"
+        )
+        self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
         self.head_size = model_config.get_head_size()
         self.hidden_size = model_config.get_hidden_size()
@@ -195,17 +226,21 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            model_config)
+            model_config
+        )
         # TODO: Support M-RoPE (e.g, Qwen2-VL)
         assert not self.uses_mrope, "TPU does not support M-RoPE yet."
 
-        self._num_slices_per_kv_cache_update_block = \
-            _get_num_slices_per_kv_cache_update_block(get_page_size_bytes(
-                block_size=self.block_size,
-                num_kv_heads=self.num_kv_heads,
-                head_size=self.head_size,
-                kv_cache_dtype=self.kv_cache_dtype,
-            ))
+        self._num_slices_per_kv_cache_update_block = (
+            _get_num_slices_per_kv_cache_update_block(
+                get_page_size_bytes(
+                    block_size=self.block_size,
+                    num_kv_heads=self.num_kv_heads,
+                    head_size=self.head_size,
+                    kv_cache_dtype=self.kv_cache_dtype,
+                )
+            )
+        )
 
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
@@ -225,50 +260,74 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
             block_sizes=[self.block_size],
+            kernel_block_sizes=[self.cache_config.block_size],
         )
 
         # Cached torch/numpy tensor
         # The pytorch tensor and numpy array share the same buffer.
         # Sometimes the numpy op is faster so we create both.
-        self.input_ids_cpu = torch.zeros(self.max_num_tokens,
-                                         dtype=torch.int32,
-                                         device="cpu")
+        self.input_ids_cpu = torch.zeros(
+            self.max_num_tokens, dtype=torch.int32, device="cpu"
+        )
 
-        self.positions_cpu = torch.zeros(self.max_num_tokens,
-                                         dtype=torch.int32,
-                                         device="cpu")
+        self.positions_cpu = torch.zeros(
+            self.max_num_tokens, dtype=torch.int32, device="cpu"
+        )
         self.positions_np = self.positions_cpu.numpy()
         self.block_table_cpu = torch.zeros(
             (self.max_num_reqs, self.max_num_blocks_per_req),
             dtype=torch.int32,
-            device="cpu")
+            device="cpu",
+        )
         # adjust num_reqs to avoid SMEM OOM.
-        self.num_reqs_most_model_len = min(
-            PallasAttentionBackend.get_max_num_seqs(self.most_model_len,
-                                                    self.block_size),
-            self.max_num_reqs) if self.most_model_len is not None else None
+        self.num_reqs_most_model_len = (
+            min(
+                PallasAttentionBackend.get_max_num_seqs(
+                    self.most_model_len, self.block_size
+                ),
+                self.max_num_reqs,
+            )
+            if self.most_model_len is not None
+            else None
+        )
         self.num_reqs_max_model_len = min(
-            PallasAttentionBackend.get_max_num_seqs(self.max_model_len,
-                                                    self.block_size),
-            self.max_num_reqs)
-        self.query_start_loc_cpu = torch.zeros(self.max_num_tokens + 1,
-                                               dtype=torch.int32,
-                                               device="cpu",
-                                               pin_memory=self.pin_memory)
+            PallasAttentionBackend.get_max_num_seqs(
+                self.max_model_len, self.block_size
+            ),
+            self.max_num_reqs,
+        )
+        self.query_start_loc_cpu = torch.zeros(
+            self.max_num_tokens + 1,
+            dtype=torch.int32,
+            device="cpu",
+            pin_memory=self.pin_memory,
+        )
         self.query_start_loc_np = self.query_start_loc_cpu.numpy()
 
-        self.seq_lens_cpu = torch.zeros(self.max_num_tokens,
-                                        dtype=torch.int32,
-                                        device="cpu",
-                                        pin_memory=self.pin_memory)
+        self.seq_lens_cpu = torch.zeros(
+            self.max_num_tokens,
+            dtype=torch.int32,
+            device="cpu",
+            pin_memory=self.pin_memory,
+        )
         self.seq_lens_np = self.seq_lens_cpu.numpy()
 
+        # Only relevant for multimodal models
+        if self.supports_mm_inputs:
+            self.is_mm_embed_cpu = torch.zeros(
+                self.max_num_tokens,
+                dtype=torch.bool,
+                device="cpu",
+                pin_memory=self.pin_memory,
+            )
+
         # Range tensor with values [0 .. self.max_num_tokens - 1].
         # Used to initialize positions / context_lens / seq_lens
         # Keep in int64 to avoid overflow with long context
         self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64)
         self.num_reqs_paddings = _get_req_paddings(
-            min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)
+            min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs
+        )
 
         # Layer pairings for cross-layer KV sharing.
         # If an Attention layer `layer_name` is in the keys of this dict, it
@@ -281,30 +340,47 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             (self.max_num_reqs, cdiv(self.vocab_size, 32)),
             dtype=torch.int32,
             device="cpu",
-            pin_memory=self.pin_memory)
+            pin_memory=self.pin_memory,
+        )
         self.require_structured_out_cpu = torch.zeros(
             (self.max_num_reqs, 1),
             dtype=torch.bool,
             device="cpu",
-            pin_memory=self.pin_memory)
+            pin_memory=self.pin_memory,
+        )
         self.structured_decode_arange = torch.arange(
-            0, 32, device="cpu", pin_memory=self.pin_memory)
+            0, 32, device="cpu", pin_memory=self.pin_memory
+        )
 
-        self.mm_budget = (MultiModalBudget(
-            self.model_config,
-            self.scheduler_config,
-            self.mm_registry,
-        ) if self.supports_mm_inputs else None)
+        self.mm_budget = (
+            MultiModalBudget(
+                self.model_config,
+                self.scheduler_config,
+                self.mm_registry,
+            )
+            if self.supports_mm_inputs
+            else None
+        )
 
         if not self.use_spmd:
             self.sample_from_logits_func = torch.compile(
                 self.sample_from_logits,
                 backend="openxla",
                 fullgraph=True,
-                dynamic=False)
+                dynamic=False,
+            )
         else:
             self.sample_from_logits_func = self.sample_from_logits
 
+        # For passing scheduler_output between successive
+        # execute_model() and sample_tokens() calls.
+        self.scheduler_output: SchedulerOutput | None = None
+        self.mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None
+
+    def reset_mm_cache(self) -> None:
+        if self.mm_budget:
+            self.mm_budget.reset_cache()
+
     def _update_num_xla_graphs(self, case_str):
         check_comp = self.check_recompilation and not self.enforce_eager
         if not check_comp:
@@ -315,8 +391,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if new_compiled_graphs == 0:
             return
 
-        logger.info("Add new %d compiled XLA graphs due to %s",
-                    new_compiled_graphs, case_str)
+        logger.info(
+            "Add new %d compiled XLA graphs due to %s", new_compiled_graphs, case_str
+        )
         self.num_xla_graphs += new_compiled_graphs
 
     def _verify_num_xla_graphs(self, case_str):
@@ -328,7 +405,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert self.num_xla_graphs == curr_cached_graph, (
             "Recompilation after warm up is detected during {}."
             " num_xla_graphs = {} curr_cached_graph = {}".format(
-                case_str, self.num_xla_graphs, curr_cached_graph))
+                case_str, self.num_xla_graphs, curr_cached_graph
+            )
+        )
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
         """Update the cached states and the persistent batch with the scheduler
@@ -381,8 +460,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         req_ids_to_add: list[str] = []
         # Add new requests to the cached states.
         for new_req_data in scheduler_output.scheduled_new_reqs:
-            assert new_req_data.sampling_params is not None,\
+            assert new_req_data.sampling_params is not None, (
                 "Pooling is not supported in TPU yet"
+            )
             req_id = new_req_data.req_id
             sampling_params = new_req_data.sampling_params
 
@@ -408,15 +488,14 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             req_state = self.requests[req_id]
             num_computed_tokens = req_data.num_computed_tokens[i]
             new_block_ids = req_data.new_block_ids[i]
-            resumed_from_preemption = req_data.resumed_from_preemption[i]
+            resumed_from_preemption = req_id in req_data.resumed_req_ids
 
             # Update the cached states.
             req_state.num_computed_tokens = num_computed_tokens
             if not resumed_from_preemption:
                 if new_block_ids is not None:
                     # Append the new blocks to the existing block IDs.
-                    for block_ids, new_ids in zip(req_state.block_ids,
-                                                  new_block_ids):
+                    for block_ids, new_ids in zip(req_state.block_ids, new_block_ids):
                         block_ids.extend(new_ids)
             else:
                 assert new_block_ids is not None
@@ -433,23 +512,17 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 continue
 
             # Update the persistent batch.
-            self.input_batch.num_computed_tokens_cpu[req_index] = (
-                num_computed_tokens)
+            self.input_batch.num_computed_tokens_cpu[req_index] = num_computed_tokens
             if new_block_ids is not None:
-                self.input_batch.block_table.append_row(
-                    new_block_ids, req_index)
+                self.input_batch.block_table.append_row(new_block_ids, req_index)
 
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
         removed_req_indices = sorted(removed_req_indices, reverse=True)
         for req_id in req_ids_to_add:
             req_state = self.requests[req_id]
-            if removed_req_indices:
-                # Fill the empty index.
-                req_index = removed_req_indices.pop()
-            else:
-                # Append to the end.
-                req_index = None
+            # Fill the empty index or append to the end
+            req_index = removed_req_indices.pop() if removed_req_indices else None
             self.input_batch.add_request(req_state, req_index)
 
         # Condense the batched states if there are empty indices.
@@ -502,56 +575,77 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             format. Layers that do not need KV cache are not included.
         """
 
-        layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+        layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
         block_size = self.vllm_config.cache_config.block_size
+        cache_dtype_str = self.vllm_config.cache_config.cache_dtype
+
         kv_cache_spec: dict[str, KVCacheSpec] = {}
         for layer_name, attn_module in layers.items():
-            if (kv_tgt_layer :=
-                    attn_module.kv_sharing_target_layer_name) is not None:
-                # The layer doesn't need its own KV cache and will use that of
-                # the target layer. We skip creating a KVCacheSpec for it, so
-                # that KV cache management logic will act as this layer does
-                # not exist, and doesn't allocate KV cache for the layer. This
-                # enables the memory saving of cross-layer kv sharing, allowing
-                # a given amount of memory to accommodate longer context lengths
-                # or enable more requests to be processed simultaneously.
-                self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
-                continue
+            # Classic Attention path
+            if isinstance(attn_module, Attention):
+                if (
+                    kv_tgt_layer := attn_module.kv_sharing_target_layer_name
+                ) is not None:
+                    # The layer doesn't need its own KV cache and will use that of
+                    # the target layer. We skip creating a KVCacheSpec for it, so
+                    # that KV cache management logic will act as this layer does
+                    # not exist, and doesn't allocate KV cache for the layer. This
+                    # enables the memory saving of cross-layer kv sharing, allowing
+                    # a given amount of memory to accommodate longer context lengths
+                    # or enable more requests to be processed simultaneously.
+                    self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
+                    continue
 
-            if attn_module.attn_type == AttentionType.DECODER:
-                if isinstance(attn_module, ChunkedLocalAttention):
-                    logger.warning_once(
-                        "Using irope in Pallas is not supported yet, it "
-                        "will fall back to global attention for long context.")
-                if attn_module.sliding_window is not None:
-                    kv_cache_spec[layer_name] = SlidingWindowSpec(
-                        block_size=block_size,
-                        num_kv_heads=attn_module.num_kv_heads,
-                        head_size=attn_module.head_size,
-                        dtype=self.kv_cache_dtype,
-                        sliding_window=attn_module.sliding_window,
-                    )
+                if attn_module.attn_type == AttentionType.DECODER:
+                    if isinstance(attn_module, ChunkedLocalAttention):
+                        logger.warning_once(
+                            "Using irope in Pallas is not supported yet, it "
+                            "will fall back to global attention for long context."
+                        )
+                    if attn_module.sliding_window is not None:
+                        kv_cache_spec[layer_name] = SlidingWindowSpec(
+                            block_size=block_size,
+                            num_kv_heads=attn_module.num_kv_heads,
+                            head_size=attn_module.head_size,
+                            dtype=self.kv_cache_dtype,
+                            sliding_window=attn_module.sliding_window,
+                        )
+                    else:
+                        kv_cache_spec[layer_name] = FullAttentionSpec(
+                            block_size=block_size,
+                            num_kv_heads=attn_module.num_kv_heads,
+                            head_size=attn_module.head_size,
+                            dtype=self.kv_cache_dtype,
+                        )
+                elif attn_module.attn_type in (
+                    AttentionType.ENCODER,
+                    AttentionType.ENCODER_ONLY,
+                ):
+                    # encoder-only attention does not need KV cache.
+                    continue
+                elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
+                    raise NotImplementedError
                 else:
-                    kv_cache_spec[layer_name] = FullAttentionSpec(
-                        block_size=block_size,
-                        num_kv_heads=attn_module.num_kv_heads,
-                        head_size=attn_module.head_size,
-                        dtype=self.kv_cache_dtype,
-                    )
-            elif attn_module.attn_type in (AttentionType.ENCODER,
-                                           AttentionType.ENCODER_ONLY):
-                # encoder-only attention does not need KV cache.
-                continue
-            elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
-                raise NotImplementedError
+                    raise ValueError(f"Unknown attention type: {attn_module.attn_type}")
+            # MLAAttention path
+            elif isinstance(attn_module, MLAAttention):
+                if layer_name in kv_cache_spec:
+                    continue
+                kv_cache_spec[layer_name] = MLAAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=attn_module.head_size,
+                    dtype=self.kv_cache_dtype,
+                    cache_dtype_str=cache_dtype_str,
+                )
             else:
-                raise ValueError(
-                    f"Unknown attention type: {attn_module.attn_type}")
+                continue
 
         return kv_cache_spec
 
-    def _get_slot_mapping_metadata(self, num_reqs,
-                                   num_scheduled_tokens_per_req) -> np.ndarray:
+    def _get_slot_mapping_metadata(
+        self, num_reqs, num_scheduled_tokens_per_req
+    ) -> np.ndarray:
         """
         Computes metadata for mapping slots to blocks in the key-value (KV)
         cache for a batch of requests.
@@ -576,14 +670,16 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 - slice_len (int): The length of the slice.
         """
         slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs]
-        slices_end = self.input_batch.num_computed_tokens_cpu[:num_reqs] + \
-            num_scheduled_tokens_per_req
+        slices_end = (
+            self.input_batch.num_computed_tokens_cpu[:num_reqs]
+            + num_scheduled_tokens_per_req
+        )
         local_block_start_idx = slices_start // self.block_size
         local_block_end_idx = (slices_end - 1) // self.block_size
         no_repeat_req_indices = self.arange_np[:num_reqs]
         global_block_start_idx = (
-            no_repeat_req_indices * self.max_num_blocks_per_req +
-            local_block_start_idx)
+            no_repeat_req_indices * self.max_num_blocks_per_req + local_block_start_idx
+        )
         block_lens = local_block_end_idx - local_block_start_idx + 1
         global_block_start_idx = np.repeat(global_block_start_idx, block_lens)
         slice_arange = np.concatenate([self.arange_np[:n] for n in block_lens])
@@ -591,30 +687,31 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
         block_numbers = block_table_cpu.flatten()[global_block_indices].numpy()
         total_block_len = np.sum(block_lens)
-        slot_mapping_slices = np.repeat(np.array([[0, self.block_size]],
-                                                 dtype=np.int32),
-                                        total_block_len,
-                                        axis=0)
+        slot_mapping_slices = np.repeat(
+            np.array([[0, self.block_size]], dtype=np.int32), total_block_len, axis=0
+        )
         cu_block_lens = np.zeros(len(block_lens) + 1, dtype=np.int32)
         np.cumsum(block_lens, out=cu_block_lens[1:])
         for req_idx in range(num_reqs):
-            slot_mapping_slices[cu_block_lens[req_idx]][
-                0] = slices_start[req_idx] % self.block_size
-            slot_mapping_slices[
-                cu_block_lens[req_idx + 1] -
-                1][1] = (slices_end[req_idx] - 1) % self.block_size + 1
+            slot_mapping_slices[cu_block_lens[req_idx]][0] = (
+                slices_start[req_idx] % self.block_size
+            )
+            slot_mapping_slices[cu_block_lens[req_idx + 1] - 1][1] = (
+                slices_end[req_idx] - 1
+            ) % self.block_size + 1
         slice_lens = slot_mapping_slices[:, 1] - slot_mapping_slices[:, 0]
         cu_slices_lens = np.zeros(len(slice_lens) + 1, dtype=np.int32)
         np.cumsum(slice_lens, out=cu_slices_lens[1:])
-        kv_cache_start_indices = slot_mapping_slices[:, 0] + \
-            (block_numbers * self.block_size)
+        kv_cache_start_indices = slot_mapping_slices[:, 0] + (
+            block_numbers * self.block_size
+        )
         new_kv_start_indices = cu_slices_lens[:-1]
         slot_mapping_metadata = np.stack(
-            [kv_cache_start_indices, new_kv_start_indices, slice_lens], axis=1)
+            [kv_cache_start_indices, new_kv_start_indices, slice_lens], axis=1
+        )
         return slot_mapping_metadata
 
-    def _prepare_inputs(self, scheduler_output: "SchedulerOutput",
-                        start_index: int):
+    def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int):
         assert scheduler_output.total_num_scheduled_tokens > 0
         num_reqs = self.input_batch.num_reqs
         assert num_reqs > 0
@@ -636,22 +733,24 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_scheduled_tokens_per_req.append(num_tokens)
         if use_max_model_len:
             if len(num_scheduled_tokens_per_req) > self.num_reqs_max_model_len:
-                num_scheduled_tokens_per_req = \
-                    num_scheduled_tokens_per_req[:self.num_reqs_max_model_len]
+                num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[
+                    : self.num_reqs_max_model_len
+                ]
                 end_index = start_index + self.num_reqs_max_model_len
             else:
                 end_index = num_reqs
         else:
-            if len(num_scheduled_tokens_per_req
-                   ) > self.num_reqs_most_model_len:
-                num_scheduled_tokens_per_req = \
-                    num_scheduled_tokens_per_req[:self.num_reqs_most_model_len]
+            if len(num_scheduled_tokens_per_req) > self.num_reqs_most_model_len:
+                num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[
+                    : self.num_reqs_most_model_len
+                ]
                 end_index = start_index + self.num_reqs_most_model_len
             else:
                 end_index = num_reqs
         max_num_scheduled_tokens_all_reqs = max(num_scheduled_tokens_per_req)
-        num_scheduled_tokens_per_req = np.array(num_scheduled_tokens_per_req,
-                                                dtype=np.int32)
+        num_scheduled_tokens_per_req = np.array(
+            num_scheduled_tokens_per_req, dtype=np.int32
+        )
         total_num_scheduled_tokens = sum(num_scheduled_tokens_per_req)
         assert max_num_scheduled_tokens_all_reqs > 0
 
@@ -660,121 +759,130 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Get request indices.
         # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
         # For each scheduled token, what are the corresponding req index.
-        req_indices = np.repeat(self.arange_np[:num_reqs],
-                                num_scheduled_tokens_per_req)
+        req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens_per_req)
 
         # Get batched arange.
         # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
         # For each scheduled token, what is its position in corresponding req.
         arange = np.concatenate(
-            [self.arange_np[:n] for n in num_scheduled_tokens_per_req])
+            [self.arange_np[:n] for n in num_scheduled_tokens_per_req]
+        )
 
         # Get positions.
         positions_np = self.positions_np[:total_num_scheduled_tokens]
-        np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
-               arange,
-               out=positions_np)
+        np.add(
+            self.input_batch.num_computed_tokens_cpu[req_indices],
+            arange,
+            out=positions_np,
+        )
 
         # Get token indices.
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
         # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
         # where M is the max_model_len.
-        token_indices = (positions_np +
-                         req_indices * self.input_batch.token_ids_cpu.shape[1])
+        token_indices = (
+            positions_np + req_indices * self.input_batch.token_ids_cpu.shape[1]
+        )
 
         # NOTE(woosuk): We use torch.index_select instead of np.take here
         # because torch.index_select is much faster than np.take for large
         # tensors.
-        torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
-                           0,
-                           torch.from_numpy(token_indices),
-                           out=self.input_ids_cpu[:total_num_scheduled_tokens])
+        torch.index_select(
+            self.input_batch.token_ids_cpu_tensor.flatten(),
+            0,
+            torch.from_numpy(token_indices),
+            out=self.input_ids_cpu[:total_num_scheduled_tokens],
+        )
 
         # Prepare the attention metadata.
         self.query_start_loc_np[0] = 0
-        np.cumsum(num_scheduled_tokens_per_req,
-                  out=self.query_start_loc_np[1:num_reqs + 1])
-        self.query_start_loc_np[num_reqs + 1:] = 1
+        np.cumsum(
+            num_scheduled_tokens_per_req, out=self.query_start_loc_np[1 : num_reqs + 1]
+        )
+        self.query_start_loc_np[num_reqs + 1 :] = 1
 
         self.seq_lens_np[:num_reqs] = (
-            self.input_batch.num_computed_tokens_cpu[:num_reqs] +
-            num_scheduled_tokens_per_req)
+            self.input_batch.num_computed_tokens_cpu[:num_reqs]
+            + num_scheduled_tokens_per_req
+        )
 
         # Do the padding and copy the tensors to the TPU.
         padded_total_num_scheduled_tokens = _get_padded_token_len(
-            self.num_tokens_paddings, total_num_scheduled_tokens)
+            self.num_tokens_paddings, total_num_scheduled_tokens
+        )
         # Zero out to avoid spurious values from prev iteration (last cp chunk)
         self.input_ids_cpu[
-            total_num_scheduled_tokens:padded_total_num_scheduled_tokens] = 0
-        self.input_ids = self.input_ids_cpu[:
-                                            padded_total_num_scheduled_tokens].to(
-                                                self.device)
-        self.position_ids = self.positions_cpu[:
-                                               padded_total_num_scheduled_tokens].to(
-                                                   self.device)
+            total_num_scheduled_tokens:padded_total_num_scheduled_tokens
+        ] = 0
+        self.input_ids = self.input_ids_cpu[:padded_total_num_scheduled_tokens].to(
+            self.device
+        )
+        self.position_ids = self.positions_cpu[:padded_total_num_scheduled_tokens].to(
+            self.device
+        )
         if use_max_model_len:
-            block_tables = self.block_table_cpu[:self.num_reqs_max_model_len, :
-                                                self.max_num_blocks_per_req]
-            block_tables[:num_reqs, :self.max_num_blocks_per_req] = (
-                self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs])
-            query_start_loc = self.query_start_loc_cpu[:self.
-                                                       num_reqs_max_model_len +
-                                                       1].to(self.device)
-            seq_lens = self.seq_lens_cpu[:self.num_reqs_max_model_len].to(
-                self.device)
+            block_tables = self.block_table_cpu[
+                : self.num_reqs_max_model_len, : self.max_num_blocks_per_req
+            ]
+            block_tables[:num_reqs, : self.max_num_blocks_per_req] = (
+                self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs]
+            )
+            query_start_loc = self.query_start_loc_cpu[
+                : self.num_reqs_max_model_len + 1
+            ].to(self.device)
+            seq_lens = self.seq_lens_cpu[: self.num_reqs_max_model_len].to(self.device)
         else:
-            block_tables = self.block_table_cpu[:self.
-                                                num_reqs_most_model_len, :self.
-                                                num_blocks_per_most_len_req]
-            block_tables[:num_reqs, :self.num_blocks_per_most_len_req] = (
-                self.input_batch.block_table[0].get_cpu_tensor()
-                [:num_reqs, :self.num_blocks_per_most_len_req])
-            query_start_loc = self.query_start_loc_cpu[:self.
-                                                       num_reqs_most_model_len +
-                                                       1].to(self.device)
-            seq_lens = self.seq_lens_cpu[:self.num_reqs_most_model_len].to(
-                self.device)
+            block_tables = self.block_table_cpu[
+                : self.num_reqs_most_model_len, : self.num_blocks_per_most_len_req
+            ]
+            block_tables[:num_reqs, : self.num_blocks_per_most_len_req] = (
+                self.input_batch.block_table[0].get_cpu_tensor()[
+                    :num_reqs, : self.num_blocks_per_most_len_req
+                ]
+            )
+            query_start_loc = self.query_start_loc_cpu[
+                : self.num_reqs_most_model_len + 1
+            ].to(self.device)
+            seq_lens = self.seq_lens_cpu[: self.num_reqs_most_model_len].to(self.device)
         block_tables = block_tables.to(self.device)
 
         # Calculate the slot mapping
         slot_mapping_metadata = self._get_slot_mapping_metadata(
-            num_reqs, num_scheduled_tokens_per_req)
+            num_reqs, num_scheduled_tokens_per_req
+        )
         num_kv_update_slices = slot_mapping_metadata.shape[0]
         padded_num_slices = _get_padded_num_kv_cache_update_slices(
-            padded_total_num_scheduled_tokens, self.max_num_reqs,
-            self.block_size)
+            padded_total_num_scheduled_tokens, self.max_num_reqs, self.block_size
+        )
         slot_mapping_metadata = np.pad(
             slot_mapping_metadata,
             [[0, padded_num_slices - len(slot_mapping_metadata)], [0, 0]],
-            constant_values=0)
+            constant_values=0,
+        )
         slot_mapping_metadata = np.transpose(slot_mapping_metadata)
-        slot_mapping_metadata = torch.tensor(slot_mapping_metadata,
-                                             device=self.device)
+        slot_mapping_metadata = torch.tensor(slot_mapping_metadata, device=self.device)
 
         if self.lora_config is not None:
             # We need to respect padding when activating LoRA adapters
             padded_num_scheduled_tokens_per_req = np.copy(
                 num_scheduled_tokens_per_req
             )  # Copying to avoid accidental state corruption bugs
-            padded_num_scheduled_tokens_per_req[-1] += \
+            padded_num_scheduled_tokens_per_req[-1] += (
                 padded_total_num_scheduled_tokens - total_num_scheduled_tokens
+            )
 
-            self.set_active_loras(self.input_batch,
-                                  padded_num_scheduled_tokens_per_req)
+            self.set_active_loras(self.input_batch, padded_num_scheduled_tokens_per_req)
 
         attn_metadata = PallasMetadata(
             slot_mapping=slot_mapping_metadata,
             block_tables=block_tables,
             context_lens=seq_lens,
             query_start_loc=query_start_loc,
-            num_seqs=torch.tensor([num_reqs],
-                                  dtype=torch.int32,
-                                  device=self.device),
-            num_kv_update_slices=torch.tensor([num_kv_update_slices],
-                                              dtype=torch.int32,
-                                              device=self.device),
-            num_slices_per_kv_cache_update_block=self.
-            _num_slices_per_kv_cache_update_block,
+            num_seqs=torch.tensor([num_reqs], dtype=torch.int32, device=self.device),
+            num_kv_update_slices=torch.tensor(
+                [num_kv_update_slices], dtype=torch.int32, device=self.device
+            ),
+            num_slices_per_kv_cache_update_block=self._num_slices_per_kv_cache_update_block,
         )
         # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial
         # request in the batch. While we should not sample any token from this
@@ -782,10 +890,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # token from the partial request.
         # TODO: Support prompt logprobs.
         padded_num_reqs = _get_padded_num_reqs_with_upper_limit(
-            num_reqs, self.max_num_reqs)
+            num_reqs, self.max_num_reqs
+        )
         # Indices at which we sample (positions of last token in the sequence).
         # Padded to avoid recompiling when `num_reqs` varies.
-        logits_indices = self.query_start_loc_cpu[1:padded_num_reqs + 1] - 1
+        logits_indices = self.query_start_loc_cpu[1 : padded_num_reqs + 1] - 1
         logits_indices = logits_indices.to(self.device)
 
         if self.lora_config is not None:
@@ -793,20 +902,23 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             padded_num_scheduled_tokens_per_req = np.copy(
                 num_scheduled_tokens_per_req
             )  # Copying to avoid accidental state corruption bugs
-            padded_num_scheduled_tokens_per_req[-1] += \
+            padded_num_scheduled_tokens_per_req[-1] += (
                 padded_total_num_scheduled_tokens - total_num_scheduled_tokens
+            )
 
-            self.set_active_loras(self.input_batch,
-                                  padded_num_scheduled_tokens_per_req)
+            self.set_active_loras(self.input_batch, padded_num_scheduled_tokens_per_req)
 
-        layer_names = get_layers_from_vllm_config(self.vllm_config,
-                                                  Attention).keys()
+        layer_names = get_layers_from_vllm_config(self.vllm_config, Attention).keys()
         per_layer_attn_metadata = {
-            layer_name: attn_metadata
-            for layer_name in layer_names
+            layer_name: attn_metadata for layer_name in layer_names
         }
-        return per_layer_attn_metadata, logits_indices, padded_num_reqs,\
-            num_reqs, end_index
+        return (
+            per_layer_attn_metadata,
+            logits_indices,
+            padded_num_reqs,
+            num_reqs,
+            end_index,
+        )
 
     def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
         scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
@@ -836,10 +948,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model = cast(SupportsMultiModal, self.model)
         encoder_outputs = []
         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
-                mm_kwargs,
-                device=self.device,
-                pin_memory=self.pin_memory,
-                merge_by_field_config=model.merge_by_field_config,
+            mm_kwargs,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             # Run the encoder.
             # `curr_group_outputs` is either of the following:
@@ -849,8 +962,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # (feature_size, hidden_size) in case the feature size is dynamic
             # depending on the input multimodal items.
             torch_xla.sync(wait=False)
-            curr_group_outputs = model.get_multimodal_embeddings(
-                **mm_kwargs_group)
+            curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
             torch_xla.sync(wait=False)
 
             sanity_check_mm_encoder_outputs(
@@ -870,20 +982,30 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # assume to only have whole mm items to process. Hence we avoid the
         # intrinsic dynamism that `scatter_mm_placeholders` introduces.
         for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
-            assert pos_info.is_embed is None, "Expected all positions to be"\
-                " contiguous and embeddings."
+            assert pos_info.is_embed is None, (
+                "Expected all positions to be contiguous and embeddings."
+            )
             self.encoder_cache[mm_hash] = output
 
     def _gather_mm_embeddings(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> list[torch.Tensor]:
-        mm_embeds: list[torch.Tensor] = []
+    ) -> tuple[list[torch.Tensor], torch.Tensor]:
+        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+        padded_total_num_scheduled_tokens = _get_padded_token_len(
+            self.num_tokens_paddings, total_num_scheduled_tokens
+        )
+
+        is_mm_embed = self.is_mm_embed_cpu
+        is_mm_embed[:padded_total_num_scheduled_tokens] = False
+        mm_embeds = list[torch.Tensor]()
+        req_start_idx = 0
+
         for req_id in self.input_batch.req_ids:
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
-                req_id]
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             req_state = self.requests[req_id]
             num_computed_tokens = req_state.num_computed_tokens
+
             # TODO unroll loop and assume/enforce --disable_chunked_mm_input
             # NOTE (NickLucche) here we diverge from logic in other runners, as
             # we assume to only have whole mm items to process. Hence we avoid
@@ -904,26 +1026,51 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     # The encoder output is already processed and stored
                     # in the decoder's KV cache.
                     continue
+
+                start_idx = max(num_computed_tokens - start_pos, 0)
+                end_idx = min(
+                    num_computed_tokens - start_pos + num_scheduled_tokens,
+                    num_encoder_tokens,
+                )
+                assert start_idx < end_idx
+
                 mm_hash = mm_feature.identifier
                 encoder_output = self.encoder_cache.get(mm_hash, None)
-                assert encoder_output is not None,\
-                      f"Encoder cache miss for {mm_hash}."
-                assert pos_info.is_embed is None, "Expected all positions to"\
-                " be contiguous and embeddings."
-                encoder_output = self.encoder_cache[mm_hash]
+                assert encoder_output is not None, f"Encoder cache miss for {mm_hash}."
+
+                assert pos_info.is_embed is None, (
+                    "Expected all positions to be contiguous and embeddings."
+                )
+
+                req_start_pos = req_start_idx + start_pos - num_computed_tokens
+                is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = True
+
+                # Only whole mm items are processed
                 mm_embeds.append(encoder_output)
-        return mm_embeds
 
-    def _get_model_inputs(self, input_ids: torch.Tensor,
-                          mm_embeds: list[torch.Tensor]):
+            req_start_idx += num_scheduled_tokens
+
+        is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens].to(self.device)
+
+        return mm_embeds, is_mm_embed
+
+    def _get_model_inputs(
+        self,
+        input_ids: torch.Tensor,
+        mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None,
+    ):
         if self.supports_mm_inputs:
+            mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
+
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            inputs_embeds = self.model.get_input_embeddings(
-                input_ids=input_ids,
+            inputs_embeds = self.model.embed_input_ids(
+                input_ids,
                 multimodal_embeddings=mm_embeds,
+                is_multimodal=is_mm_embed,
             )
+
             return None, inputs_embeds
         else:
             # For text-only models, we use token ids as input.
@@ -936,8 +1083,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> ModelRunnerOutput:
+        intermediate_tensors: IntermediateTensors | None = None,
+    ) -> ModelRunnerOutput | None:
+        if self.scheduler_output is not None:
+            raise RuntimeError(
+                "State error: sample_tokens() must be called "
+                "after execute_model() returns None."
+            )
         # Update cached state
         self._update_states(scheduler_output)
         if not scheduler_output.total_num_scheduled_tokens:
@@ -945,16 +1097,32 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # Return empty ModelRunnerOutput if there's no work to do.
                 return EMPTY_MODEL_RUNNER_OUTPUT
 
-            return self.kv_connector_no_forward(scheduler_output,
-                                                self.vllm_config)
+            return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
 
+        mm_embed_inputs = None
         if self.supports_mm_inputs:
             # Run the multimodal encoder if any.
             self._execute_mm_encoder(scheduler_output)
-            mm_embeds = self._gather_mm_embeddings(scheduler_output)
-        else:
-            mm_embeds = []
+            mm_embed_inputs = self._gather_mm_embeddings(scheduler_output)
+
         torch_xla.sync(wait=False)
+
+        self.scheduler_output = scheduler_output
+        self.mm_embed_inputs = mm_embed_inputs
+        return None
+
+    @torch.no_grad()
+    def sample_tokens(
+        self, grammar_output: "GrammarOutput | None"
+    ) -> ModelRunnerOutput:
+        if self.scheduler_output is None:
+            # Nothing to do (PP non-final rank case), output isn't used.
+            return None  # noqa
+        scheduler_output = self.scheduler_output
+        mm_embed_inputs = self.mm_embed_inputs
+        self.scheduler_output = None
+        self.mm_embed_inputs = None
+
         # Prepare inputs, the requests might be split into multiple
         # executions, combine the result of each execution.
         start_index = 0
@@ -967,41 +1135,48 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.maybe_setup_kv_connector(scheduler_output)
 
         while start_index < self.input_batch.num_reqs:
-            attn_metadata, logits_indices, padded_num_reqs, num_reqs,\
-                end_index = self._prepare_inputs(scheduler_output, start_index)
+            attn_metadata, logits_indices, padded_num_reqs, num_reqs, end_index = (
+                self._prepare_inputs(scheduler_output, start_index)
+            )
             input_ids, inputs_embeds = self._get_model_inputs(
-                self.input_ids, mm_embeds)
+                self.input_ids, mm_embed_inputs
+            )
             torch_xla.sync(wait=False)
             # Run the decoder
             with set_forward_context(
-                    attn_metadata,
-                    self.vllm_config,
-                    num_tokens=scheduler_output.total_num_scheduled_tokens):
+                attn_metadata,
+                self.vllm_config,
+                num_tokens=scheduler_output.total_num_scheduled_tokens,
+            ):
                 hidden_states = self.model(
                     input_ids=input_ids,
                     positions=self.position_ids,
                     inputs_embeds=inputs_embeds,
                 )
-            hidden_states = self.select_hidden_states(hidden_states,
-                                                      logits_indices)
+            hidden_states = self.select_hidden_states(hidden_states, logits_indices)
             logits = self.compute_logits(hidden_states)
-            tpu_sampling_metadata = TPUSupportedSamplingMetadata.\
-                from_input_batch(self.input_batch, padded_num_reqs, self.device)
-            if scheduler_output.grammar_bitmask is not None:
-                require_struct_decoding, grammar_bitmask_padded, arange = \
-                    self.prepare_structured_decoding_input(logits,
-                                                           scheduler_output)
-                logits = self.structured_decode(require_struct_decoding,
-                                                grammar_bitmask_padded, logits,
-                                                arange)
+            tpu_sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
+                self.input_batch, padded_num_reqs, self.device
+            )
+            if grammar_output is not None:
+                require_struct_decoding, grammar_bitmask_padded, arange = (
+                    self.prepare_structured_decoding_input(logits, grammar_output)
+                )
+                logits = self.structured_decode(
+                    require_struct_decoding, grammar_bitmask_padded, logits, arange
+                )
             selected_token_ids = self.sample_from_logits_func(
-                logits, tpu_sampling_metadata)
+                logits, tpu_sampling_metadata
+            )
             # NOTE (NickLucche) Use the original logits (before any penalties or
             # temperature scaling) for the top-k logprobs. We can't enforce it
             # due to recompilations outside torch.compiled code, so just make
             # sure `sample_from_logits` does not modify the logits in-place.
-            logprobs = self.gather_logprobs(logits, selected_token_ids) \
-                if tpu_sampling_metadata.logprobs else None
+            logprobs = (
+                self.gather_logprobs(logits, selected_token_ids)
+                if tpu_sampling_metadata.logprobs
+                else None
+            )
 
             # Remove padding on cpu and keep dynamic op outside of xla graph.
             selected_token_ids = selected_token_ids.cpu()[:num_reqs]
@@ -1017,8 +1192,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # should be called right after each single forward pass,
         # instead of the forwards of the entire input batch.
         self.maybe_wait_for_kv_save()
-        finished_sending, finished_recving = (
-            self.get_finished_kv_transfers(scheduler_output))
+        finished_sending, finished_recving = self.get_finished_kv_transfers(
+            scheduler_output
+        )
 
         selected_token_ids = torch.cat(combined_selected_tokens, dim=0)
         if tpu_sampling_metadata.logprobs:
@@ -1029,16 +1205,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     result.extend(input_list)
                 return result
 
-            logprobs_lists = LogprobsLists(logprob_token_ids=concat_lists(
-                [lp.logprob_token_ids for lp in combined_logprobs]),
-                                           logprobs=concat_lists([
-                                               lp.logprobs
-                                               for lp in combined_logprobs
-                                           ]),
-                                           sampled_token_ranks=concat_lists([
-                                               lp.sampled_token_ranks
-                                               for lp in combined_logprobs
-                                           ]))
+            logprobs_lists = LogprobsLists(
+                logprob_token_ids=concat_lists(
+                    [lp.logprob_token_ids for lp in combined_logprobs]
+                ),
+                logprobs=concat_lists([lp.logprobs for lp in combined_logprobs]),
+                sampled_token_ranks=concat_lists(
+                    [lp.sampled_token_ranks for lp in combined_logprobs]
+                ),
+            )
         else:
             logprobs_lists = None
 
@@ -1050,8 +1225,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         for i, req_id in zip(range(num_reqs), self.input_batch.req_ids):
             assert req_id is not None
             req_state = self.requests[req_id]
-            seq_len = (req_state.num_computed_tokens +
-                       scheduler_output.num_scheduled_tokens[req_id])
+            seq_len = (
+                req_state.num_computed_tokens
+                + scheduler_output.num_scheduled_tokens[req_id]
+            )
             if seq_len >= req_state.num_tokens:
                 request_seq_lens.append((i, req_state, seq_len))
             else:
@@ -1067,23 +1244,25 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 discard_sampled_tokens_req_indices.append(i)
 
         assert all(
-            req_id is not None for req_id in
-            self.input_batch.req_ids[:num_reqs]), "req_ids contains None"
+            req_id is not None for req_id in self.input_batch.req_ids[:num_reqs]
+        ), "req_ids contains None"
         req_ids = cast(list[str], self.input_batch.req_ids[:num_reqs])
 
-        prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
+        prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
         for req_id in self.input_batch.req_ids[:num_reqs]:
             prompt_logprobs_dict[req_id] = None
 
         max_gen_len = selected_token_ids.shape[-1]
         if max_gen_len == 1:
-            valid_sampled_token_ids = selected_token_ids.tolist()
+            valid_sampled_token_ids: list[np.ndarray] = [
+                row for row in selected_token_ids.numpy()
+            ]
 
             # Mask out the sampled tokens that should not be sampled.
             # TODO: Keep in sync with gpu_model_runner.py, in particular
             #       the "else" case here
             for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[i].clear()
+                valid_sampled_token_ids[i] = np.array([])
 
             # Append sampled tokens
             for i, req_state, seq_len in request_seq_lens:
@@ -1096,22 +1275,24 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             valid_mask = selected_token_ids != INVALID_TOKEN_ID
             gen_lens = valid_mask.sum(dim=1).tolist()
             valid_sampled_token_ids = [
-                seq.tolist()
-                for seq in selected_token_ids[valid_mask].split(gen_lens)
+                seq.numpy() for seq in selected_token_ids[valid_mask].split(gen_lens)
             ]
             self.input_batch.num_tokens[:num_reqs] += gen_lens
             for i, req_state, seq_len in request_seq_lens:
                 target_slice = slice(seq_len - gen_lens[i] + 1, seq_len + 1)
-                self.input_batch.token_ids_cpu[
-                    i, target_slice] = valid_sampled_token_ids[i]
+                self.input_batch.token_ids_cpu[i, target_slice] = (
+                    valid_sampled_token_ids[i]
+                )
                 req_state.output_token_ids.extend(valid_sampled_token_ids[i])
 
-        kv_connector_output = None if (
-            finished_sending is None
-            and finished_recving is None) else KVConnectorOutput(
+        kv_connector_output = (
+            None
+            if (finished_sending is None and finished_recving is None)
+            else KVConnectorOutput(
                 finished_sending=finished_sending,
                 finished_recving=finished_recving,
             )
+        )
 
         model_runner_output = ModelRunnerOutput(
             req_ids=req_ids,
@@ -1134,9 +1315,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # https://github.com/vllm-project/vllm/pull/20095#discussion_r2201497754
         allowed_config_names = {"load_config", "model_config"}
         for config_name, config_overrides in overrides.items():
-            assert config_name in allowed_config_names, \
-                f"Config `{config_name}` not supported. " \
+            assert config_name in allowed_config_names, (
+                f"Config `{config_name}` not supported. "
                 f"Allowed configs: {allowed_config_names}"
+            )
             config = getattr(self, config_name)
             new_config = update_config(config, config_overrides)
             setattr(self, config_name, new_config)
@@ -1155,30 +1337,34 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # the embedding weights.
         xm_tp_rank = xr.global_ordinal()
         with patch(
-                "vllm.model_executor.layers.vocab_parallel_embedding."
-                "get_tensor_model_parallel_rank",
-                return_value=xm_tp_rank):
+            "vllm.model_executor.layers.vocab_parallel_embedding."
+            "get_tensor_model_parallel_rank",
+            return_value=xm_tp_rank,
+        ):
             try:
                 if self.use_spmd:
                     tpu_loader = TPUModelLoader(
-                        load_config=self.vllm_config.load_config)
+                        load_config=self.vllm_config.load_config
+                    )
                     model = tpu_loader.load_model(
                         vllm_config=self.vllm_config,
                         model_config=self.vllm_config.model_config,
-                        mesh=self.mesh)
+                        mesh=self.mesh,
+                    )
                 else:
                     model_loader = get_model_loader(self.load_config)
                     logger.info("Loading model from scratch...")
                     model = model_loader.load_model(
-                        vllm_config=self.vllm_config,
-                        model_config=self.model_config)
+                        vllm_config=self.vllm_config, model_config=self.model_config
+                    )
             except RuntimeError as e:
                 raise RuntimeError(
                     f"Unable to load model, a likely reason is the model is "
                     "too large for the current device's HBM memory. "
                     "Consider switching to a smaller model "
                     "or sharding the weights on more chips. "
-                    f"See the detailed error: {e}") from e
+                    f"See the detailed error: {e}"
+                ) from e
         if self.lora_config is not None:
             model = self.load_lora_model(model, self.vllm_config, self.device)
             replace_set_lora(model)
@@ -1192,44 +1378,43 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.sampler = TPUSampler()
 
     def reload_weights(self) -> None:
-        assert getattr(self, "model", None) is not None, \
+        assert getattr(self, "model", None) is not None, (
             "Cannot reload weights before model is loaded."
+        )
         model_loader = get_model_loader(self.load_config)
         logger.info("Reloading weights inplace...")
         model_loader.load_weights(self.model, model_config=self.model_config)
 
     @torch.no_grad()
-    def _dummy_run(self, num_tokens: int, num_reqs: int,
-                   num_blocks: int) -> None:
+    def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None:
         if self.supports_mm_inputs:
             input_ids = None
-            inputs_embeds = torch.zeros((num_tokens, self.hidden_size),
-                                        dtype=self.dtype,
-                                        device=self.device)
+            inputs_embeds = torch.zeros(
+                (num_tokens, self.hidden_size), dtype=self.dtype, device=self.device
+            )
         else:
-            input_ids = torch.zeros((num_tokens),
-                                    dtype=torch.int32).to(self.device)
+            input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device)
             inputs_embeds = None
         actual_num_reqs = min(num_tokens, num_reqs)
-        position_ids = torch.zeros(num_tokens,
-                                   dtype=torch.int32).to(self.device)
+        position_ids = torch.zeros(num_tokens, dtype=torch.int32).to(self.device)
         padded_num_slices = _get_padded_num_kv_cache_update_slices(
-            num_tokens, self.max_num_reqs, self.block_size)
-        num_kv_update_slices = torch.tensor([padded_num_slices],
-                                            dtype=torch.int32).to(self.device)
-        slot_mapping = torch.zeros((3, padded_num_slices),
-                                   dtype=torch.int32).to(self.device)
-        block_tables = torch.zeros((num_reqs, num_blocks),
-                                   dtype=torch.int32).to(self.device)
+            num_tokens, self.max_num_reqs, self.block_size
+        )
+        num_kv_update_slices = torch.tensor([padded_num_slices], dtype=torch.int32).to(
+            self.device
+        )
+        slot_mapping = torch.zeros((3, padded_num_slices), dtype=torch.int32).to(
+            self.device
+        )
+        block_tables = torch.zeros((num_reqs, num_blocks), dtype=torch.int32).to(
+            self.device
+        )
         query_lens = [1] * num_reqs
-        query_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
-                                                    dtype=torch.int32),
-                                       dim=0,
-                                       dtype=torch.int32).to(self.device)
-        context_lens = torch.ones((num_reqs, ),
-                                  dtype=torch.int32).to(self.device)
-        num_seqs = torch.tensor([actual_num_reqs],
-                                dtype=torch.int32).to(self.device)
+        query_start_loc = torch.cumsum(
+            torch.tensor([0] + query_lens, dtype=torch.int32), dim=0, dtype=torch.int32
+        ).to(self.device)
+        context_lens = torch.ones((num_reqs,), dtype=torch.int32).to(self.device)
+        num_seqs = torch.tensor([actual_num_reqs], dtype=torch.int32).to(self.device)
         attn_metadata = PallasMetadata(
             slot_mapping=slot_mapping,
             block_tables=block_tables,
@@ -1237,8 +1422,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             query_start_loc=query_start_loc,
             num_seqs=num_seqs,
             num_kv_update_slices=num_kv_update_slices,
-            num_slices_per_kv_cache_update_block=self.
-            _num_slices_per_kv_cache_update_block,
+            num_slices_per_kv_cache_update_block=self._num_slices_per_kv_cache_update_block,
         )
 
         if self.supports_mm_inputs:
@@ -1251,27 +1435,29 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         torch._dynamo.mark_dynamic(attn_metadata.context_lens, 0)
         torch._dynamo.mark_dynamic(attn_metadata.query_start_loc, 0)
 
-        layer_names = get_layers_from_vllm_config(self.vllm_config,
-                                                  Attention).keys()
+        layer_names = get_layers_from_vllm_config(self.vllm_config, Attention).keys()
         per_layer_attn_metadata = {
-            layer_name: attn_metadata
-            for layer_name in layer_names
+            layer_name: attn_metadata for layer_name in layer_names
         }
 
-        with self.maybe_select_dummy_loras(
-                self.lora_config,
-                np.array([num_tokens], dtype=np.int32)), set_forward_context(
-                    per_layer_attn_metadata, self.vllm_config, 0):
-            out = self.model(input_ids=input_ids,
-                             positions=position_ids,
-                             inputs_embeds=inputs_embeds)
+        with (
+            self.maybe_select_dummy_loras(
+                self.lora_config, np.array([num_tokens], dtype=np.int32)
+            ),
+            set_forward_context(per_layer_attn_metadata, self.vllm_config, 0),
+        ):
+            out = self.model(
+                input_ids=input_ids, positions=position_ids, inputs_embeds=inputs_embeds
+            )
         self._hidden_states_dtype = out.dtype
 
-    def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping,
-                          lora_requests) -> None:
+    def _set_active_loras(
+        self, prompt_lora_mapping, token_lora_mapping, lora_requests
+    ) -> None:
         torch_xla.sync(wait=False)  # Captures input updates
-        super()._set_active_loras(prompt_lora_mapping, token_lora_mapping,
-                                  lora_requests)
+        super()._set_active_loras(
+            prompt_lora_mapping, token_lora_mapping, lora_requests
+        )
         torch_xla.sync(wait=False)  # Captures metadata updates
 
     def _precompile_mm_encoder(self) -> None:
@@ -1288,8 +1474,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         for mode, max_items_per_seq in max_items_per_seq_by_modality.items():
             logger.info(
-                "Compiling Multimodal %s Encoder with different input"
-                " shapes.", mode)
+                "Compiling Multimodal %s Encoder with different input shapes.", mode
+            )
             start = time.perf_counter()
             # No padding for MM encoder just yet.
             for num_items in range(1, max_items_per_seq + 1):
@@ -1300,13 +1486,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 )
                 # Run multimodal encoder.
                 torch_xla.sync(wait=False)
-                mm_embeds = self.model.get_multimodal_embeddings(
-                    **batched_dummy_mm_inputs)
+                mm_embeds = self.model.embed_multimodal(**batched_dummy_mm_inputs)
                 torch_xla.sync(wait=False)
                 num_patches = mm_embeds[0].shape[0]
                 items_size = num_patches * num_items
 
-                # NOTE (NickLucche) pre-compile `get_input_embeddings` when mm
+                # NOTE (NickLucche) pre-compile `embed_input_ids` when mm
                 # embeddings are present. We assume `--disable-mm-chunked`,
                 # hence only whole items can be scheduled. This implies we just
                 # need to compile when `num_items` fit the (padded) `input_ids`
@@ -1315,47 +1500,61 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         # XLA Workaround: if torch.zeros(..device) is used, XLA
                         # compiles a scalar+expansion op, which won't match
                         # the graph generated at runtime. CPU->TPU must be used
-                        placeholders_ids = torch.zeros(num_tokens,
-                                                       dtype=torch.int32,
-                                                       device="cpu")
+                        placeholders_ids = torch.zeros(
+                            num_tokens, dtype=torch.int32, device="cpu"
+                        )
                         # Align placeholders and actual num mm_embeddings.
-                        placeholders_ids[:items_size] = \
-                            hf_config.image_token_index
+                        placeholders_ids[:items_size] = hf_config.image_token_index
 
                         placeholders_ids = placeholders_ids.to(self.device)
+
+                        mm_mask = torch.tensor([False] * num_tokens)
+                        mm_mask[:items_size] = True
+                        mm_mask = mm_mask.to(self.device)
                         # Assign outputs or the graph will be cut short.
-                        a, b = self._get_model_inputs(placeholders_ids,
-                                                      [mm_embeds])
+                        a, b = self._get_model_inputs(
+                            placeholders_ids,
+                            mm_embed_inputs=([mm_embeds], mm_mask),
+                        )
                         assert a is None
                         torch_xla.sync(wait=False)
 
-            # Pre-compile `get_input_embeddings` when mm_embeddings are not
+            # Pre-compile `embed_input_ids` when mm_embeddings are not
             # present. Chunk is only made of text, no mm_placeholders.
             for num_tokens in self.num_tokens_paddings:
-                placeholders_ids = torch.zeros(num_tokens,
-                                               dtype=torch.int32,
-                                               device="cpu")
+                placeholders_ids = torch.zeros(
+                    num_tokens, dtype=torch.int32, device="cpu"
+                )
                 placeholders_ids = placeholders_ids.to(self.device)
-                a, b = self._get_model_inputs(placeholders_ids, [])
+                a, b = self._get_model_inputs(
+                    placeholders_ids,
+                    mm_embed_inputs=None,
+                )
                 assert a is None
                 torch_xla.sync(wait=False)
 
             xm.wait_device_ops()
             end = time.perf_counter()
             logger.info(
-                "Multimodal %s Encoder compilation finished in in %.2f "
-                "[secs].", mode, end - start)
+                "Multimodal %s Encoder compilation finished in in %.2f [secs].",
+                mode,
+                end - start,
+            )
 
     def _precompile_backbone(self) -> None:
         logger.info("Compiling the model with different input shapes.")
         start = time.perf_counter()
         for num_tokens in self.num_tokens_paddings:
             logger.info("  -- num_tokens: %d", num_tokens)
-            self._dummy_run(num_tokens, self.num_reqs_max_model_len,
-                            self.max_num_blocks_per_req)
+            self._dummy_run(
+                num_tokens, self.num_reqs_max_model_len, self.max_num_blocks_per_req
+            )
             if self.most_model_len is not None:
-                self._dummy_run(num_tokens, self.num_reqs_most_model_len,
-                                self.num_blocks_per_most_len_req)
+                self._dummy_run(
+                    num_tokens,
+                    self.num_reqs_most_model_len,
+                    self.num_blocks_per_most_len_req,
+                )
         xm.wait_device_ops()
         end = time.perf_counter()
         logger.info("Compilation finished in %.2f [secs].", end - start)
@@ -1364,23 +1563,19 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _precompile_select_hidden_states(self) -> None:
         # Compile hidden state selection function for bucketed
         # n_tokens x max_num_reqs. Graph is really small so this is fine.
-        logger.info(
-            "Compiling select_hidden_states with different input shapes.")
+        logger.info("Compiling select_hidden_states with different input shapes.")
         start = time.perf_counter()
         hsize = self.model_config.get_hidden_size()
         for num_tokens in self.num_tokens_paddings:
-            dummy_hidden = torch.zeros((num_tokens, hsize),
-                                       device=self.device,
-                                       dtype=self._hidden_states_dtype)
+            dummy_hidden = torch.zeros(
+                (num_tokens, hsize), device=self.device, dtype=self._hidden_states_dtype
+            )
             torch._dynamo.mark_dynamic(dummy_hidden, 0)
             for num_reqs in self.num_reqs_paddings:
-                indices = torch.zeros(num_reqs,
-                                      dtype=torch.int32,
-                                      device=self.device)
+                indices = torch.zeros(num_reqs, dtype=torch.int32, device=self.device)
                 torch._dynamo.mark_dynamic(indices, 0)
                 self.select_hidden_states(dummy_hidden, indices)
-                logger.info("  -- num_tokens: %d, num_seqs: %d", num_tokens,
-                            num_reqs)
+                logger.info("  -- num_tokens: %d, num_seqs: %d", num_tokens, num_reqs)
                 # Requests can't be more than tokens. But do compile for the
                 # next bigger value in case num_tokens uses bucketed padding.
                 if num_reqs >= min(num_tokens, self.max_num_reqs):
@@ -1395,9 +1590,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         start = time.perf_counter()
         hsize = self.model_config.get_hidden_size()
         for num_reqs in self.num_reqs_paddings:
-            dummy_hidden = torch.zeros((num_reqs, hsize),
-                                       device=self.device,
-                                       dtype=self._hidden_states_dtype)
+            dummy_hidden = torch.zeros(
+                (num_reqs, hsize), device=self.device, dtype=self._hidden_states_dtype
+            )
             torch._dynamo.mark_dynamic(dummy_hidden, 0)
             self.compute_logits(dummy_hidden)
             logger.info("  -- num_seqs: %d", num_reqs)
@@ -1407,23 +1602,28 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self._update_num_xla_graphs("compute_logits")
 
     def _precompile_structured_decoding(self) -> None:
-        logger.info(
-            "Compiling structured_decoding with different input shapes.")
+        logger.info("Compiling structured_decoding with different input shapes.")
         start = time.perf_counter()
         for num_reqs in self.num_reqs_paddings:
-            dummy_logits = torch.zeros((num_reqs, self.vocab_size),
-                                       device=self.device,
-                                       dtype=self._hidden_states_dtype)
-            dummy_require_struct_decoding = \
-                self.require_structured_out_cpu[:num_reqs].to(self.device)
-            dummy_grammar_bitmask = \
-                self.grammar_bitmask_cpu[:num_reqs].to(self.device)
+            dummy_logits = torch.zeros(
+                (num_reqs, self.vocab_size),
+                device=self.device,
+                dtype=self._hidden_states_dtype,
+            )
+            dummy_require_struct_decoding = self.require_structured_out_cpu[
+                :num_reqs
+            ].to(self.device)
+            dummy_grammar_bitmask = self.grammar_bitmask_cpu[:num_reqs].to(self.device)
             # The first dimension of the above 3 dummy tensors cannot be
             # mark_dynamic because some operations in structured_decode require
             # them to be static.
             arange = self.structured_decode_arange.to(self.device)
-            self.structured_decode(dummy_require_struct_decoding,
-                                   dummy_grammar_bitmask, dummy_logits, arange)
+            self.structured_decode(
+                dummy_require_struct_decoding,
+                dummy_grammar_bitmask,
+                dummy_logits,
+                arange,
+            )
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1431,30 +1631,29 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self._update_num_xla_graphs("structured_decoding")
 
     def _precompile_sample_from_logits(self) -> None:
-        logger.info(
-            "Compiling sample_from_logits with different input shapes.")
+        logger.info("Compiling sample_from_logits with different input shapes.")
         start = time.perf_counter()
         for num_reqs in self.num_reqs_paddings:
-            dummy_logits = torch.zeros((num_reqs, self.vocab_size),
-                                       device=self.device,
-                                       dtype=self._hidden_states_dtype)
+            dummy_logits = torch.zeros(
+                (num_reqs, self.vocab_size),
+                device=self.device,
+                dtype=self._hidden_states_dtype,
+            )
             # The first dimension of dummy_logits cannot be mark_dynamic
             # because some operations in the sampler require it to be static.
             for all_greedy in [False, True]:
                 generate_params_if_all_greedy = not all_greedy
-                sampling_metadata = (
-                    TPUSupportedSamplingMetadata.from_input_batch(
-                        self.input_batch,
-                        num_reqs,
-                        self.device,
-                        generate_params_if_all_greedy,
-                    ))
+                sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
+                    self.input_batch,
+                    num_reqs,
+                    self.device,
+                    generate_params_if_all_greedy,
+                )
                 sampling_metadata.all_greedy = all_greedy
                 with self.maybe_select_dummy_loras(
-                        self.lora_config, np.array([num_reqs],
-                                                   dtype=np.int32)):
-                    self.sample_from_logits_func(dummy_logits,
-                                                 sampling_metadata)
+                    self.lora_config, np.array([num_reqs], dtype=np.int32)
+                ):
+                    self.sample_from_logits_func(dummy_logits, sampling_metadata)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1465,13 +1664,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         logger.info("Compiling gather_logprobs with different input shapes.")
         start = time.perf_counter()
         for num_reqs in self.num_reqs_paddings:
-            dummy_logits = torch.zeros((num_reqs, self.vocab_size),
-                                       device=self.device,
-                                       dtype=self._hidden_states_dtype)
-            dummy_tokens = torch.zeros((num_reqs, 1),
-                                       dtype=torch.int64).to(self.device)
+            dummy_logits = torch.zeros(
+                (num_reqs, self.vocab_size),
+                device=self.device,
+                dtype=self._hidden_states_dtype,
+            )
+            dummy_tokens = torch.zeros((num_reqs, 1), dtype=torch.int64).to(self.device)
             with self.maybe_select_dummy_loras(
-                    self.lora_config, np.array([num_reqs], dtype=np.int32)):
+                self.lora_config, np.array([num_reqs], dtype=np.int32)
+            ):
                 self.gather_logprobs(dummy_logits, dummy_tokens)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
@@ -1501,7 +1702,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             if self.model_config.multimodal_config.skip_mm_profiling:
                 logger.info(
                     "Skipping memory profiling for multimodal encoder and "
-                    "encoder cache.")
+                    "encoder cache."
+                )
             else:
                 mm_budget = self.mm_budget
                 assert mm_budget is not None
@@ -1512,8 +1714,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     # modality with the max possible input tokens even when
                     # it supports multiple.
                     dummy_modality = mm_budget.get_modality_with_max_tokens()
-                    max_mm_items_per_batch = mm_budget \
-                        .max_items_per_batch_by_modality[dummy_modality]
+                    max_mm_items_per_batch = mm_budget.max_items_per_batch_by_modality[
+                        dummy_modality
+                    ]
 
                     logger.info(
                         "Encoder cache will be initialized with a budget of "
@@ -1535,15 +1738,16 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     # impact of recompilation until it's fixed.
                     start = time.perf_counter()
                     torch_xla.sync(wait=False)
-                    dummy_encoder_outputs = \
-                        self.model.get_multimodal_embeddings(
-                        **batched_dummy_mm_inputs)
+                    dummy_encoder_outputs = self.model.embed_multimodal(
+                        **batched_dummy_mm_inputs
+                    )
                     torch_xla.sync(wait=False)
                     xm.wait_device_ops()
                     end = time.perf_counter()
                     logger.info(
                         "Multimodal Encoder profiling finished in %.2f [secs].",
-                        end - start)
+                        end - start,
+                    )
 
                     sanity_check_mm_encoder_outputs(
                         dummy_encoder_outputs,
@@ -1551,15 +1755,18 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     )
 
                     # Cache the dummy encoder outputs.
-                    self.encoder_cache["tmp"] = dict(
-                        enumerate(dummy_encoder_outputs))
+                    self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
         # Trigger compilation for general shape.
-        self._dummy_run(num_tokens, self.num_reqs_max_model_len,
-                        self.max_num_blocks_per_req)
+        self._dummy_run(
+            num_tokens, self.num_reqs_max_model_len, self.max_num_blocks_per_req
+        )
         if self.most_model_len is not None:
-            self._dummy_run(num_tokens, self.num_reqs_most_model_len,
-                            self.num_blocks_per_most_len_req)
+            self._dummy_run(
+                num_tokens,
+                self.num_reqs_most_model_len,
+                self.num_blocks_per_most_len_req,
+            )
 
         torch_xla.sync(wait=False)
         xm.wait_device_ops()
@@ -1584,10 +1791,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             kv_cache_config.kv_cache_groups,
         )
 
-        for layer_name, target_layer_name in self.shared_kv_cache_layers.items(
-        ):
-            logger.debug("%s reuses KV cache of %s", layer_name,
-                         target_layer_name)
+        for layer_name, target_layer_name in self.shared_kv_cache_layers.items():
+            logger.debug("%s reuses KV cache of %s", layer_name, target_layer_name)
             kv_caches[layer_name] = kv_caches[target_layer_name]
 
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
@@ -1599,11 +1804,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         """
         if len(kv_cache_config.kv_cache_groups) > 1:
             raise NotImplementedError(
-                "Hybrid models with more than one KV cache type are not "
-                "supported yet.")
+                "Hybrid models with more than one KV cache type are not supported yet."
+            )
 
-        if kv_cache_config.kv_cache_groups[
-                0].kv_cache_spec.block_size != self.block_size:
+        if (
+            kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+            != self.block_size
+        ):
             self.input_batch = InputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=self.max_model_len,
@@ -1614,16 +1821,21 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 block_sizes=[
                     kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
                 ],
+                kernel_block_sizes=[
+                    kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+                ],
             )
         # Verify dtype compatibility between block_table_cpu and input_batch
-        assert self.block_table_cpu.dtype == self.input_batch.block_table[
-            0].get_cpu_tensor().dtype
+        assert (
+            self.block_table_cpu.dtype
+            == self.input_batch.block_table[0].get_cpu_tensor().dtype
+        )
 
         kv_cache_sizes = {}
         for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
             assert len(kv_cache_tensor.shared_by) == 1, (
-                "KV cache tensor shared by multiple layers is not supported in "
-                "TPU.")
+                "KV cache tensor shared by multiple layers is not supported in TPU."
+            )
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
         kv_caches: dict[str, torch.Tensor] = {}
@@ -1637,19 +1849,23 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     if self.use_spmd:
                         num_kv_heads = kv_cache_spec.num_kv_heads
                         assert self.original_parallel_config is not None
-                        tp_size = \
-                            self.original_parallel_config.tensor_parallel_size
+                        tp_size = self.original_parallel_config.tensor_parallel_size
                         # TODO: Handle kv cache duplication under SPMD mode.
                         assert num_kv_heads % tp_size == 0, (
                             f"num_kv_heads {num_kv_heads} must be divisible by "
-                            f"tp_size {tp_size} under SPMD mode")
+                            f"tp_size {tp_size} under SPMD mode"
+                        )
                     kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape(
-                        num_blocks, kv_cache_spec.block_size,
-                        kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
+                        num_blocks,
+                        kv_cache_spec.block_size,
+                        kv_cache_spec.num_kv_heads,
+                        kv_cache_spec.head_size,
+                    )
                     dtype = kv_cache_spec.dtype
 
-                    tpu_kv_cache = torch.zeros(kv_cache_shape,
-                                               dtype=dtype).to(self.device)
+                    tpu_kv_cache = torch.zeros(kv_cache_shape, dtype=dtype).to(
+                        self.device
+                    )
 
                     kv_caches[layer_name] = tpu_kv_cache
                 else:
@@ -1661,19 +1877,19 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         bind_kv_cache(
             kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
-            self.kv_caches)
+            self.kv_caches,
+        )
 
         if self.use_spmd:
             # Shard KV Cache
             for cache in self.kv_caches:
-                xs.mark_sharding(cache, self.mesh, (None, 'x', None, None))
+                xs.mark_sharding(cache, self.mesh, (None, "x", None, None))
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
             get_kv_transfer_group().set_host_xfer_buffer_ops(copy_kv_blocks)
 
     def reset_dynamo_cache(self):
-
         # NOTE: We check `is_multimodal_model` instead of `supports_mm_inputs`
         # since the compiled model object of the language backbone of a
         # multimodal model needs to be extracted via `get_language_model`.
@@ -1681,41 +1897,43 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             compiled_model = self.model.get_language_model().model
         else:
             compiled_model = self.model.model
-        if isinstance(compiled_model, TorchCompileWrapperWithCustomDispatcher):
+        if isinstance(compiled_model, TorchCompileWithNoGuardsWrapper):
             logger.info("Clear dynamo cache and cached dynamo bytecode.")
             torch._dynamo.eval_frame.remove_from_cache(
-                compiled_model.original_code_object)
-            compiled_model.compiled_codes.clear()
+                compiled_model.original_code_object()
+            )
+            # Reset the wrapper to re-initialize.
+            compiled_model.compiled = False
+            TorchCompileWithNoGuardsWrapper.__init__(compiled_model)
 
     @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
     def select_hidden_states(self, hidden_states, indices_do_sample):
         return hidden_states[indices_do_sample]
 
     @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def compute_logits(self,
-                       sample_hidden_states: torch.Tensor) -> torch.Tensor:
+    def compute_logits(self, sample_hidden_states: torch.Tensor) -> torch.Tensor:
         return self.model.compute_logits(sample_hidden_states)
 
     # TODO: Under SPMD mode, sample_from_logits has correctness issue.
     #       Re-enable the torch.compile once the issue is fixed in torchxla.
     # @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
     def sample_from_logits(
-            self, logits: torch.Tensor,
-            sampling_metadata: TPUSupportedSamplingMetadata) -> torch.Tensor:
+        self, logits: torch.Tensor, sampling_metadata: TPUSupportedSamplingMetadata
+    ) -> torch.Tensor:
         """
-        Sample with xla-friendly function. This function is to be traced 
+        Sample with xla-friendly function. This function is to be traced
         separately from `forward` for lighter compilation overhead.
         """
         if sampling_metadata.all_greedy:
             out_tokens = torch.argmax(logits, dim=-1, keepdim=True)
         else:
-            out_tokens = self.sampler(logits,
-                                      sampling_metadata).sampled_token_ids
+            out_tokens = self.sampler(logits, sampling_metadata).sampled_token_ids
         return out_tokens
 
     @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def gather_logprobs(self, logits: torch.Tensor,
-                        sampled_tokens: torch.Tensor) -> LogprobsTensors:
+    def gather_logprobs(
+        self, logits: torch.Tensor, sampled_tokens: torch.Tensor
+    ) -> LogprobsTensors:
         """
         Gather the top_logprobs with corresponding tokens. Use a fixed number
         of logprobs as an alternative to having multiple pre-compiled graphs.
@@ -1725,66 +1943,74 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         return self.sampler.gather_logprobs(
             logprobs,
             self.model_config.max_logprobs,
-            token_ids=sampled_tokens.squeeze(-1))
+            token_ids=sampled_tokens.squeeze(-1),
+        )
 
     @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
-    def structured_decode(self, require_struct_decoding: torch.Tensor,
-                          grammar_bitmask: torch.Tensor, logits: torch.Tensor,
-                          arange: torch.Tensor) -> torch.Tensor:
+    def structured_decode(
+        self,
+        require_struct_decoding: torch.Tensor,
+        grammar_bitmask: torch.Tensor,
+        logits: torch.Tensor,
+        arange: torch.Tensor,
+    ) -> torch.Tensor:
         return torch.where(
             require_struct_decoding,
             self.apply_grammar_bitmask(logits, grammar_bitmask, arange),
-            logits)
+            logits,
+        )
 
-    def apply_grammar_bitmask(self, logits: torch.Tensor,
-                              grammar_bitmask: torch.Tensor,
-                              arange: torch.Tensor):
-        assert (logits.shape[0] == grammar_bitmask.shape[0])
+    def apply_grammar_bitmask(
+        self, logits: torch.Tensor, grammar_bitmask: torch.Tensor, arange: torch.Tensor
+    ):
+        assert logits.shape[0] == grammar_bitmask.shape[0]
         logits_cloned = logits.clone()
         for i in range(logits.shape[0]):
-            unpacked_bitmask = (torch.bitwise_right_shift(
-                grammar_bitmask[i][:, None], arange[None, :]) & 1) == 0
-            unpacked_bitmask = unpacked_bitmask.reshape(-1)[:self.vocab_size]
+            unpacked_bitmask = (
+                torch.bitwise_right_shift(grammar_bitmask[i][:, None], arange[None, :])
+                & 1
+            ) == 0
+            unpacked_bitmask = unpacked_bitmask.reshape(-1)[: self.vocab_size]
             logits_cloned[i] = logits_cloned[i].masked_fill(
-                unpacked_bitmask, -float("inf"))
+                unpacked_bitmask, -float("inf")
+            )
         return logits_cloned
 
-    def get_multimodal_embeddings(self, *args, **kwargs):
-        return self.model.get_multimodal_embeddings(*args, **kwargs)
+    def embed_multimodal(self, *args, **kwargs):
+        return self.model.embed_multimodal(*args, **kwargs)
 
-    def get_input_embeddings(self, *args, **kwargs):
-        return self.model.get_input_embeddings(*args, **kwargs)
+    def embed_input_ids(self, *args, **kwargs):
+        return self.model.embed_input_ids(*args, **kwargs)
 
     def prepare_structured_decoding_input(
-        self, logits: torch.Tensor, scheduler_output: "SchedulerOutput"
+        self, logits: torch.Tensor, grammar_output: "GrammarOutput"
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        grammar_bitmask = scheduler_output.grammar_bitmask
-        assert grammar_bitmask is not None
+        grammar_bitmask = grammar_output.grammar_bitmask
         num_reqs, _ = logits.shape
 
         # Reset pre-allocated tensors
         self.grammar_bitmask_cpu.zero_()
         self.require_structured_out_cpu.zero_()
 
-        sorted_struct_requests = sorted(
-            scheduler_output.structured_output_request_ids.items(),
-            key=lambda item: item[1])
         cumulative_mask_idx = 0
-        for req_id, _ in sorted_struct_requests:
+        for req_id in grammar_output.structured_output_request_ids:
             if req_id not in self.input_batch.req_id_to_index:
                 continue
             batch_index = self.input_batch.req_id_to_index[req_id]
             self.grammar_bitmask_cpu[batch_index] = torch.from_numpy(
-                grammar_bitmask[cumulative_mask_idx])
+                grammar_bitmask[cumulative_mask_idx]
+            )
             # It's not guaranteed that all requests in this batch require
             # structured output, so create a bool tensor to represent
             # the requests that need structured output.
             self.require_structured_out_cpu[batch_index] = True
             cumulative_mask_idx += 1
 
-        return self.require_structured_out_cpu[:num_reqs].to(logits.device), \
-            self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \
-            self.structured_decode_arange.to(logits.device)
+        return (
+            self.require_structured_out_cpu[:num_reqs].to(logits.device),
+            self.grammar_bitmask_cpu[:num_reqs].to(logits.device),
+            self.structured_decode_arange.to(logits.device),
+        )
 
     def _get_mm_dummy_batch(
         self,
@@ -1807,13 +2033,16 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dummy_mm_items = [dummy_mm_item] * max_items_per_batch
 
         model = cast(SupportsMultiModal, self.model)
-        return next(grouped_mm_kwargs
-                    for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality(
-                        dummy_mm_items,
-                        device=self.device,
-                        pin_memory=self.pin_memory,
-                        merge_by_field_config=model.merge_by_field_config,
-                    ))
+        return next(
+            grouped_mm_kwargs
+            for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality(
+                dummy_mm_items,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                merge_by_field_config=model.merge_by_field_config,
+                multimodal_cpu_fields=model.multimodal_cpu_fields,
+            )
+        )
 
 
 def _get_req_paddings(min_req_size: int, max_req_size: int) -> list[int]:
@@ -1834,9 +2063,10 @@ def _get_padded_num_reqs_with_upper_limit(x: int, upper_limit: int) -> int:
     return min(res, upper_limit)
 
 
-def _get_token_paddings(min_token_size: int, max_token_size: int,
-                        padding_gap: int) -> list[int]:
-    """Generate a list of padding size, starting from min_token_size, 
+def _get_token_paddings(
+    min_token_size: int, max_token_size: int, padding_gap: int
+) -> list[int]:
+    """Generate a list of padding size, starting from min_token_size,
     ending with a number that can cover max_token_size
 
     If padding_gap == 0 then:
@@ -1874,15 +2104,15 @@ def _get_token_paddings(min_token_size: int, max_token_size: int,
 
 
 def _get_padded_token_len(paddings: list[int], x: int) -> int:
-    """Return the first element in paddings list greater or equal to x.
-    """
+    """Return the first element in paddings list greater or equal to x."""
     index = bisect.bisect_left(paddings, x)
     assert index < len(paddings)
     return paddings[index]
 
 
-def _get_padded_num_kv_cache_update_slices(num_tokens: int, max_num_reqs: int,
-                                           page_size: int) -> int:
+def _get_padded_num_kv_cache_update_slices(
+    num_tokens: int, max_num_reqs: int, page_size: int
+) -> int:
     """Calculates the padded number of KV cache update slices to avoid
     recompilation."""
     # NOTE(chengjiyao): let's say R_i is the token num for i-th request,
@@ -1918,19 +2148,17 @@ def _get_num_slices_per_kv_cache_update_block(page_size_bytes: int) -> int:
 
 
 def replace_set_lora(model):
-
     def _tpu_set_lora(
         self,
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
     ):
         # TODO: The integer index leads to a recompilation, but converting it
         # to a tensor doesn't seem to work anymore. This might be fixed with a
         # later release of torch_xla.
-        self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias)
+        self._original_set_lora(index, lora_a, lora_b, embeddings_tensor)
         torch_xla.sync(wait=False)
 
     def _tpu_reset_lora(self, index: int):
@@ -1942,5 +2170,4 @@ def replace_set_lora(model):
             module._original_set_lora = module.set_lora
             module._original_reset_lora = module.reset_lora
             module.set_lora = _tpu_set_lora.__get__(module, module.__class__)
-            module.reset_lora = _tpu_reset_lora.__get__(
-                module, module.__class__)
+            module.reset_lora = _tpu_reset_lora.__get__(module, module.__class__)
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index d4f0a65f2a16482ecaca5fe570ad4dcb6680a210..a716a9c3aa82214f94a5c89582fb9a262439a97c 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -3,28 +3,31 @@
 """A TPU worker class."""
 
 import os
-from typing import Any, Callable, Optional, TypeVar
+from collections.abc import Callable
+from typing import Any, TypeVar
 
 import torch
-import torch.distributed
 import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.distributed import (ensure_model_parallel_initialized,
-                              init_distributed_environment)
-from vllm.distributed.kv_transfer import (ensure_kv_transfer_initialized,
-                                          has_kv_transfer_group)
+from vllm.distributed import (
+    ensure_model_parallel_initialized,
+    init_distributed_environment,
+)
+from vllm.distributed.kv_transfer import (
+    ensure_kv_transfer_initialized,
+)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
-from vllm.platforms.tpu import USE_TPU_COMMONS
+from vllm.platforms.tpu import USE_TPU_INFERENCE
 from vllm.tasks import SupportedTask
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
-from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig,
-                                        KVCacheSpec)
+from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
+from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.utils import bind_kv_cache
@@ -33,8 +36,8 @@ logger = init_logger(__name__)
 
 _R = TypeVar("_R")
 
-if not USE_TPU_COMMONS:
-    logger.info("tpu_commons not found, using vLLM's TPUWorker.")
+if not USE_TPU_INFERENCE:
+    logger.info("tpu_inference not found, using vLLM's TPUWorker.")
     import torch_xla.core.xla_model as xm
     import torch_xla.debug.profiler as xp
     import torch_xla.runtime as xr
@@ -44,7 +47,6 @@ if not USE_TPU_COMMONS:
 
 
 class TPUWorker:
-
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -82,12 +84,12 @@ class TPUWorker:
         if self.cache_config.cache_dtype == "auto":
             self.cache_dtype = self.model_config.dtype
         else:
-            self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
-                self.cache_config.cache_dtype]
+            self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_config.cache_dtype]
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
-            from vllm.utils import init_cached_hf_modules
+            from vllm.utils.import_utils import init_cached_hf_modules
+
             init_cached_hf_modules()
 
         # Delay profiler initialization to the start of the profiling.
@@ -100,14 +102,14 @@ class TPUWorker:
             # For TPU, we can only have 1 active profiler session for 1 profiler
             # server. So we only profile on rank0.
             self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
-            logger.info("Profiling enabled. Traces will be saved to: %s",
-                        self.profile_dir)
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s", self.profile_dir
+            )
 
         if self.model_config.seed is None:
             self.model_config.seed = 0
 
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
@@ -118,9 +120,10 @@ class TPUWorker:
         # `xla_tpu_force_1d_allreduce_at_chunk_count` is a temporary solution to
         # fix this. It will be removed after the bug in XLA compiler is fixed.
         os.environ["LIBTPU_INIT_ARGS"] = (
-            os.environ.get("LIBTPU_INIT_ARGS", "") +
-            " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
-            " --xla_jf_conv_input_fusion=False")
+            os.environ.get("LIBTPU_INIT_ARGS", "")
+            + " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
+            " --xla_jf_conv_input_fusion=False"
+        )
         # --xla_jf_conv_input_fusion=False is used to improve the perf of
         # quantized matmul.
         torch.set_grad_enabled(False)
@@ -128,8 +131,8 @@ class TPUWorker:
 
         # Initialize the distributed environment.
         self._init_tpu_worker_distributed_environment(
-            self.vllm_config, self.rank, self.distributed_init_method,
-            self.local_rank)
+            self.vllm_config, self.rank, self.distributed_init_method, self.local_rank
+        )
 
         # Device initialization should happen after initializing
         # the distributed runtime.
@@ -158,14 +161,15 @@ class TPUWorker:
         # cache during development is recommended.We can disable it by
         # `export VLLM_XLA_CACHE_PATH=`
         if envs.VLLM_XLA_CACHE_PATH:
-            per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH,
-                                         f"tp{world_size}_rank{rank}")
+            per_rank_path = os.path.join(
+                envs.VLLM_XLA_CACHE_PATH, f"tp{world_size}_rank{rank}"
+            )
             xr.initialize_cache(per_rank_path, readonly=False)
 
         # Init ModelRunner here, so that we have access to self.device.
-        self.model_runner = \
-            TPUModelRunner(self.vllm_config, self.device,
-                           self.original_parallel_config)
+        self.model_runner = TPUModelRunner(
+            self.vllm_config, self.device, self.original_parallel_config
+        )
 
         if rank == 0:
             # If usage stat is enabled, collect relevant info.
@@ -178,19 +182,21 @@ class TPUWorker:
             if isinstance(layer_spec, AttentionSpec):
                 dtype = layer_spec.dtype
 
-                # Use an empty tensor instead of `None`` to force Dynamo to pass
-                # it by reference, rather by specializing on the value ``None``.
+                # Use an empty tensor instead of `None` to force Dynamo to pass
+                # it by reference, rather by specializing on the value `None`.
                 tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
                 kv_caches[layer_name] = tpu_kv_cache
             else:
                 raise NotImplementedError(
-                    f"Unsupported KV cache spec '{type(layer_spec)}'")
+                    f"Unsupported KV cache spec '{type(layer_spec)}'"
+                )
 
         runner_kv_caches: list[torch.Tensor] = []
         bind_kv_cache(
             kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
-            runner_kv_caches)
+            runner_kv_caches,
+        )
 
         # `max_num_tokens >= max_num_batched_tokens` due to padding.
         with self.model_runner.maybe_setup_dummy_loras(self.lora_config):
@@ -215,6 +221,7 @@ class TPUWorker:
             # TODO: use xm.get_memory_info for SPMD once it's supported in
             # PyTorch/XLA.
             import tpu_info
+
             chip_type, _ = tpu_info.device.get_local_chips()
             device_usage = tpu_info.metrics.get_chip_usage(chip_type)
             total_memory_size = device_usage[0].total_memory
@@ -231,30 +238,29 @@ class TPUWorker:
         profiled = current_mem * 1.02
 
         # Calculate the TPU KV cache size based on profiling.
-        usable_memory_size = int(total_memory_size *
-                                 self.cache_config.gpu_memory_utilization)
+        usable_memory_size = int(
+            total_memory_size * self.cache_config.gpu_memory_utilization
+        )
         tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0)
         head_size = self.model_config.get_head_size()
         if head_size > 0:
-            padded_head_size = cdiv(
-                head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+            padded_head_size = (
+                cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+            )
             if padded_head_size != head_size:
-                logger.warning_once("head size is padded to %d",
-                                    padded_head_size)
+                logger.warning_once("head size is padded to %d", padded_head_size)
             # We adjust the usable memory size for the KV cache to prevent OOM
             # errors, even after padding the head_size.
-            tpu_kv_cache_bytes = (tpu_kv_cache_bytes * head_size //
-                                  padded_head_size)
+            tpu_kv_cache_bytes = tpu_kv_cache_bytes * head_size // padded_head_size
         return int(tpu_kv_cache_bytes)
 
+    def sample_tokens(self, grammar_output: "GrammarOutput") -> ModelRunnerOutput:
+        return self.model_runner.sample_tokens(grammar_output)
+
     def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> Optional[ModelRunnerOutput]:
-        output = self.model_runner.execute_model(scheduler_output)
-        # every worker's output is needed when kv_transfer_group is set up
-        return output if self.is_driver_worker or has_kv_transfer_group(
-        ) else None
+        self, scheduler_output: "SchedulerOutput"
+    ) -> ModelRunnerOutput | None:
+        return self.model_runner.execute_model(scheduler_output)
 
     def profile(self, is_start: bool = True):
         if self.rank < 1:
@@ -287,6 +293,9 @@ class TPUWorker:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
+    def reset_mm_cache(self) -> None:
+        self.model_runner.reset_mm_cache()
+
     def get_model(self) -> nn.Module:
         return self.model_runner.get_model()
 
@@ -308,7 +317,7 @@ class TPUWorker:
         self,
         vllm_config: VllmConfig,
         rank: int,
-        distributed_init_method: Optional[str] = None,
+        distributed_init_method: str | None = None,
         local_rank: int = -1,
     ) -> None:
         """Initialize the distributed environment."""
@@ -327,8 +336,8 @@ class TPUWorker:
             backend=current_platform.dist_backend,
         )
         ensure_model_parallel_initialized(
-            parallel_config.tensor_parallel_size,
-            parallel_config.pipeline_parallel_size)
+            parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size
+        )
 
         ensure_kv_transfer_initialized(vllm_config)
 
@@ -340,7 +349,7 @@ class TPUWorker:
         return fn(self.get_model())
 
 
-if USE_TPU_COMMONS:
-    from tpu_commons.worker import TPUWorker as TPUCommonsWorker
+if USE_TPU_INFERENCE:
+    from tpu_inference.worker import TPUWorker as TpuInferenceWorker
 
-    TPUWorker = TPUCommonsWorker  # type: ignore
+    TPUWorker = TpuInferenceWorker  # type: ignore
diff --git a/vllm/v1/worker/ubatch_splitting.py b/vllm/v1/worker/ubatch_splitting.py
deleted file mode 100644
index 7767750aa6048871c2bf68333507fa6593e549dc..0000000000000000000000000000000000000000
--- a/vllm/v1/worker/ubatch_splitting.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Optional
-
-import numpy as np
-import torch
-
-from vllm.config import ParallelConfig, VllmConfig
-from vllm.forward_context import DPMetadata
-from vllm.logger import init_logger
-from vllm.utils import round_up
-from vllm.v1.worker.ubatch_utils import (UBatchSlice, UBatchSlices,
-                                         is_second_ubatch_empty)
-
-logger = init_logger(__name__)
-
-
-def should_ubatch_with_num_tokens(
-    should_ubatch: bool,
-    orig_num_tokens_per_ubatch: int,
-    padded_num_tokens_per_ubatch: int,
-    vllm_config: VllmConfig,
-) -> tuple[bool, Optional[torch.Tensor]]:
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    dp_rank = vllm_config.parallel_config.data_parallel_rank
-    return DPMetadata.should_ubatch_across_dp(should_ubatch,
-                                              orig_num_tokens_per_ubatch,
-                                              padded_num_tokens_per_ubatch,
-                                              dp_size, dp_rank)
-
-
-def check_ubatch_thresholds(config: ParallelConfig, num_tokens: int,
-                            uniform_decode: bool) -> bool:
-    if not config.enable_dbo:
-        return False
-    if uniform_decode:
-        return num_tokens >= config.dbo_decode_token_threshold
-    else:
-        return num_tokens >= config.dbo_prefill_token_threshold
-
-
-def get_dp_padding_ubatch(
-        num_tokens_unpadded: int, num_tokens_padded: int,
-        should_attempt_ubatching: bool,
-        vllm_config: VllmConfig) -> tuple[bool, Optional[torch.Tensor]]:
-    """
-    1. Decides if each DP rank is going to microbatch. Either all ranks
-    run with microbatching or none of them do. If this function decides
-    not to run with microbatching. It will "abort" meaning that no padding
-    information will be returned to the caller. It will return (False, None)
-
-    2. Determines the total number of tokens that each rank will run.
-    All ranks will be padded out so that the run with the same number
-    of tokens
-
-    Returns: tuple[
-        should_ubatch: Are all DP ranks going to microbatch
-        num_tokens_after_padding: A tensor containing the total number of
-        tokens per-microbatch for each DP rank including padding. Will be
-        None if should_ubatch if False
-    ]
-
-    """
-    assert num_tokens_padded >= num_tokens_unpadded
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    if dp_size == 1:
-        # Early exit.
-        return False, None
-
-    # If this DP rank doesn't want to attempt microbatching
-    if not should_attempt_ubatching:
-        (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
-            False, 0, 0, vllm_config)
-        assert should_ubatch is False
-        assert num_tokens_across_dp is None
-        return should_ubatch, num_tokens_across_dp
-
-    # Round up to the next multiple of two for even divisibility
-    num_tokens_padded = round_up(num_tokens_padded, 2)
-    num_tokens_per_ubatch = num_tokens_padded // 2
-    should_ubatch = True
-
-    # Sanity Check that the existing padding isn't giving us an empty second
-    # ubatch. Abort if so
-    if is_second_ubatch_empty(num_tokens_unpadded, num_tokens_padded):
-        logger.debug(
-            "Empty second µbatch detected: unpadded tokens: %s, padded "
-            "tokens: %s", num_tokens_unpadded, num_tokens_padded)
-        should_ubatch = False
-
-    # Note that we compute the number of padded tokens per ubatch
-    (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
-        should_ubatch, num_tokens_unpadded // 2, num_tokens_per_ubatch,
-        vllm_config)
-    if not should_ubatch:
-        assert num_tokens_across_dp is None
-        return should_ubatch, num_tokens_across_dp
-
-    assert num_tokens_across_dp is not None
-
-    max_tokens_across_dp_cpu = int(torch.max(num_tokens_across_dp).item())
-    num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
-                                            dp_size,
-                                            device="cpu",
-                                            dtype=torch.int32)
-    return should_ubatch, num_tokens_after_padding
-
-def create_ubatch_slices(num_scheduled_tokens: np.ndarray, split_point: int) \
-    -> UBatchSlices:
-    # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
-    # in cu_num_tokens directly (i.e. query_start_loc)
-    cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
-    np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
-
-    first_ubatch_token_slice = slice(0, split_point)
-    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
-
-    # Determine request slices using exclusive stop semantics
-    # First ubatch includes requests whose tokens overlap [0, split_point)
-    first_ubatch_req_stop = int(
-        np.searchsorted(cu_num_tokens, split_point, side="left"))
-    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
-
-    # Second ubatch starts at the request that contains the split_point
-    # or the request starting exactly at split_point (if on boundary)
-    second_ubatch_req_start = int(
-        np.searchsorted(cu_num_tokens, split_point, side="right") - 1)
-    second_ubatch_req_slice = slice(second_ubatch_req_start,
-                                    len(cu_num_tokens) - 1)
-
-    return [
-        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
-        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice)
-    ]
-
-
-def ubatch_split(
-    num_scheduled_tokens_per_request: np.ndarray,
-    num_tokens_unpadded: int,
-    num_tokens_padded: int,
-    uniform_decode: bool,
-    vllm_config: VllmConfig,
-) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
-    """
-    Coordinates amongst all DP ranks to determine if and how the full batch
-    should be split into microbatches.
-
-    Returns: tuple[
-        ubatch_slices: if this is set then all DP ranks have agreed to 
-        microbatch
-        num_tokens_after_padding: A tensor containing the total number of
-        tokens per-microbatch for each DP rank including padding. Will be
-        None if ubatch_slices is None
-    ]
-
-    """
-    parallel_config = vllm_config.parallel_config
-    # Don't bother with the should_ubatch handshaking unless microbatching
-    # is enabled
-    if not parallel_config.enable_dbo:
-        return (None, None)
-
-    # Check preconditions for microbatching
-    should_attempt_ubatching = check_ubatch_thresholds(
-        parallel_config,
-        num_tokens_unpadded,
-        uniform_decode=uniform_decode,
-    )
-
-    # Don't microbatch unless every other DP worker is also microbatching
-    should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
-        num_tokens_unpadded,
-        num_tokens_padded,
-        should_attempt_ubatching,
-        vllm_config,
-    )
-
-    if not should_ubatch:
-        return (None, None)
-
-    # This doesn't actually pad the ubatch slices. It just initializes the
-    # split point to the padded value so that padding can be applied
-    # to the second ubatch in pad_out_ubatch_slice after attention
-    # metadata creation
-    assert num_tokens_after_padding is not None
-    token_split_point = int(num_tokens_after_padding[0].item())
-
-    ubatch_slices = create_ubatch_slices(num_scheduled_tokens_per_request,
-                                         token_split_point)
-
-    return (ubatch_slices, num_tokens_after_padding)
diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py
index 33d58aa948434165dee7c3c68e16f5a0557b1a60..33a1921d2d98eea5f20fa9128fc1c6d62a272f7b 100644
--- a/vllm/v1/worker/ubatch_utils.py
+++ b/vllm/v1/worker/ubatch_utils.py
@@ -1,8 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
+from typing import TypeAlias
 
-from typing_extensions import TypeAlias
+import numpy as np
+
+from vllm.config import ParallelConfig
 
 
 @dataclass
@@ -11,8 +14,10 @@ class UBatchSlice:
     token_slice: slice
 
     def is_empty(self) -> bool:
-        return self.request_slice.start == self.request_slice.stop \
+        return (
+            self.request_slice.start == self.request_slice.stop
             or self.token_slice.start == self.token_slice.stop
+        )
 
     @property
     def num_tokens(self) -> int:
@@ -22,6 +27,47 @@ class UBatchSlice:
 UBatchSlices: TypeAlias = list[UBatchSlice]
 
 
-def is_second_ubatch_empty(orig_num_tokens_per_ubatch: int,
-                           padded_num_tokens_per_ubatch: int) -> bool:
-    return padded_num_tokens_per_ubatch >= 2 * orig_num_tokens_per_ubatch
+def is_second_ubatch_empty(orig_num_tokens: int, padded_num_tokens: int) -> bool:
+    return (padded_num_tokens // 2) >= orig_num_tokens
+
+
+def check_ubatch_thresholds(
+    config: ParallelConfig, num_tokens: int, uniform_decode: bool
+) -> bool:
+    if not config.enable_dbo:
+        return False
+    if uniform_decode:
+        return num_tokens >= config.dbo_decode_token_threshold
+    else:
+        return num_tokens >= config.dbo_prefill_token_threshold
+
+
+def create_ubatch_slices(
+    num_scheduled_tokens: np.ndarray, split_point: int
+) -> UBatchSlices:
+    # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
+    # in cu_num_tokens directly (i.e. query_start_loc)
+    cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
+    np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
+
+    first_ubatch_token_slice = slice(0, split_point)
+    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
+
+    # Determine request slices using exclusive stop semantics
+    # First ubatch includes requests whose tokens overlap [0, split_point)
+    first_ubatch_req_stop = int(
+        np.searchsorted(cu_num_tokens, split_point, side="left")
+    )
+    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
+
+    # Second ubatch starts at the request that contains the split_point
+    # or the request starting exactly at split_point (if on boundary)
+    second_ubatch_req_start = int(
+        np.searchsorted(cu_num_tokens, split_point, side="right") - 1
+    )
+    second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
+
+    return [
+        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
+        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
+    ]
diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index c26cb07123a53cec90d57fd181b7ec8fb368e3f6..9f16b1e6d03ee21b1ef8ae09a797399cee752865 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -7,10 +7,10 @@ import torch
 
 from vllm import forward_context
 from vllm.forward_context import ForwardContext
-from vllm.utils import current_stream
+from vllm.utils.torch_utils import current_stream
 
 _THREAD_ID_TO_CONTEXT: dict = {}
-_CURRENT_CONTEXTS: list[Optional['UBatchContext']] = [None, None]
+_CURRENT_CONTEXTS: list[Optional["UBatchContext"]] = [None, None]
 
 
 class UBatchContext:
@@ -18,17 +18,19 @@ class UBatchContext:
     Context manager for micro-batching synchronization using threading events.
     """
 
-    def __init__(self,
-                 id: int,
-                 comm_stream: torch.cuda.Stream,
-                 compute_stream: torch.cuda.Stream,
-                 forward_context: ForwardContext,
-                 ready_barrier: threading.Barrier,
-                 cpu_wait_event: threading.Event,
-                 cpu_signal_event: threading.Event,
-                 gpu_comm_done_event: torch.cuda.Event,
-                 gpu_compute_done_event: torch.cuda.Event,
-                 schedule: str = "default"):
+    def __init__(
+        self,
+        id: int,
+        comm_stream: torch.cuda.Stream,
+        compute_stream: torch.cuda.Stream,
+        forward_context: ForwardContext,
+        ready_barrier: threading.Barrier,
+        cpu_wait_event: threading.Event,
+        cpu_signal_event: threading.Event,
+        gpu_comm_done_event: torch.cuda.Event,
+        gpu_compute_done_event: torch.cuda.Event,
+        schedule: str = "default",
+    ):
         self.id = id
         self.comm_stream = comm_stream
         self.compute_stream = compute_stream
@@ -151,7 +153,6 @@ def dbo_current_ubatch_id() -> int:
 
 
 def _register_ubatch_function(func):
-
     def wrapper(*args, **kwargs):
         if len(_THREAD_ID_TO_CONTEXT) > 0:
             ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()]
@@ -161,20 +162,20 @@ def _register_ubatch_function(func):
     return wrapper
 
 
-dbo_maybe_run_recv_hook = _register_ubatch_function(
-    UBatchContext.maybe_run_recv_hook)
+dbo_maybe_run_recv_hook = _register_ubatch_function(UBatchContext.maybe_run_recv_hook)
 dbo_yield = _register_ubatch_function(UBatchContext.yield_)
 dbo_yield_and_switch_from_compute_to_comm = _register_ubatch_function(
-    UBatchContext.yield_and_switch_from_compute_to_comm)
+    UBatchContext.yield_and_switch_from_compute_to_comm
+)
 dbo_yield_and_switch_from_comm_to_compute = _register_ubatch_function(
-    UBatchContext.yield_and_switch_from_comm_to_compute)
+    UBatchContext.yield_and_switch_from_comm_to_compute
+)
 dbo_switch_to_comm = _register_ubatch_function(UBatchContext.switch_to_comm)
-dbo_switch_to_compute = _register_ubatch_function(
-    UBatchContext.switch_to_compute)
-dbo_switch_to_comm_sync = _register_ubatch_function(
-    UBatchContext.switch_to_comm_sync)
+dbo_switch_to_compute = _register_ubatch_function(UBatchContext.switch_to_compute)
+dbo_switch_to_comm_sync = _register_ubatch_function(UBatchContext.switch_to_comm_sync)
 dbo_switch_to_compute_sync = _register_ubatch_function(
-    UBatchContext.switch_to_compute_sync)
+    UBatchContext.switch_to_compute_sync
+)
 
 
 def dbo_register_recv_hook(recv_hook):
@@ -184,6 +185,15 @@ def dbo_register_recv_hook(recv_hook):
         next_ctx.recv_hook = recv_hook
 
 
+def dbo_get_previous_event(func, *args, **kwargs):
+    if len(_THREAD_ID_TO_CONTEXT) > 0:
+        ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()]
+        ctx = _CURRENT_CONTEXTS[ctx_idx]
+        # execute callable on the ubatch compute stream to record/wait events there
+        with torch.cuda.stream(ctx.compute_stream):
+            return func(*args, **kwargs)
+
+
 def make_ubatch_contexts(
     num_micro_batches: int,
     compute_stream: torch.cuda.Stream,
@@ -197,28 +207,25 @@ def make_ubatch_contexts(
     Create a context manager for micro-batching synchronization.
     """
     cpu_events = [threading.Event() for _ in range(num_micro_batches)]
-    gpu_comm_done_events = [
-        torch.cuda.Event() for _ in range(num_micro_batches)
-    ]
-    gpu_compute_done_events = [
-        torch.cuda.Event() for _ in range(num_micro_batches)
-    ]
+    gpu_comm_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)]
+    gpu_compute_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)]
 
     assert len(forward_contexts) == 2
 
     ctxs = []
     for i in range(num_micro_batches):
-        ctx = UBatchContext(id=i,
-                            compute_stream=compute_stream,
-                            comm_stream=comm_stream,
-                            forward_context=forward_contexts[i],
-                            ready_barrier=ready_barrier,
-                            cpu_wait_event=cpu_events[i],
-                            cpu_signal_event=cpu_events[(i + 1) %
-                                                        num_micro_batches],
-                            gpu_comm_done_event=gpu_comm_done_events[i],
-                            gpu_compute_done_event=gpu_compute_done_events[i],
-                            schedule=schedule)
+        ctx = UBatchContext(
+            id=i,
+            compute_stream=compute_stream,
+            comm_stream=comm_stream,
+            forward_context=forward_contexts[i],
+            ready_barrier=ready_barrier,
+            cpu_wait_event=cpu_events[i],
+            cpu_signal_event=cpu_events[(i + 1) % num_micro_batches],
+            gpu_comm_done_event=gpu_comm_done_events[i],
+            gpu_compute_done_event=gpu_compute_done_events[i],
+            schedule=schedule,
+        )
         ctxs.append(ctx)
 
     return ctxs
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 3e0dbda59435dbb1824fb7cfa45b27f170a4beda..095407a8b95963d4661d0273037d071f688fac1a 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import defaultdict
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -35,18 +35,18 @@ class MultiModalBudget:
         self.model_config = model_config
         self.scheduler_config = scheduler_config
         self.mm_registry = mm_registry
-        self.cache = cache = processor_only_cache_from_config(
-            model_config, mm_registry)
+        self.cache = cache = processor_only_cache_from_config(model_config, mm_registry)
 
         self.max_model_len = model_config.max_model_len
         self.max_num_reqs = scheduler_config.max_num_seqs
 
-        self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config,
-                                                              cache=cache)
+        self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache)
 
-        max_tokens_by_modality = mm_registry \
-            .get_max_tokens_per_item_by_nonzero_modality(model_config,
-                                                         cache=cache)
+        max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
+            model_config,
+            cache=cache,
+            profiler_limits=self.mm_limits,
+        )
 
         encoder_compute_budget, encoder_cache_size = compute_mm_encoder_budget(
             scheduler_config,
@@ -126,36 +126,47 @@ class MultiModalBudget:
 
         return max_items_per_prompt, max_items_per_batch
 
+    def reset_cache(self) -> None:
+        if self.cache is not None:
+            self.cache.clear_cache()
+
 
 @dataclass
 class AttentionGroup:
     backend: type[AttentionBackend]
+    layer_names: list[str]
+    kv_cache_spec: KVCacheSpec
+    kv_cache_group_id: int
     # When ubatching is enabled we will have a metadata builder for each ubatch
     # so that if they use internal persistant buffers for cudagraphs, and they
     # won't have to worry about conflicting with the other ubatches.
-    metadata_builders: list[AttentionMetadataBuilder]
-    layer_names: list[str]
-    kv_cache_spec: KVCacheSpec
+    metadata_builders: list[AttentionMetadataBuilder] = field(
+        default_factory=lambda: []
+    )
 
-    @staticmethod
-    def create_with_metadata_builders(
-        backend: type[AttentionBackend],
-        layer_names: list[str],
-        kv_cache_spec: KVCacheSpec,
-        vllm_config: VllmConfig,
-        device: torch.device,
+    def create_metadata_builders(
+        self,
+        vllm_config,
+        device,
+        kernel_block_size: int | None,
         num_metadata_builders: int = 1,
-    ) -> 'AttentionGroup':
-        metadata_builders = [
-            backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config,
-                                      device)
+    ):
+        kv_cache_spec_builder = (
+            self.kv_cache_spec.copy_with_new_block_size(kernel_block_size)
+            if kernel_block_size is not None
+            else self.kv_cache_spec
+        )
+        self.metadata_builders = [
+            self.backend.get_builder_cls()(
+                kv_cache_spec_builder,
+                self.layer_names,
+                vllm_config,
+                device,
+            )
             for _ in range(num_metadata_builders)
         ]
-        return AttentionGroup(backend, metadata_builders, layer_names,
-                              kv_cache_spec)
 
-    def get_metadata_builder(self,
-                             ubatch_id: int = 0) -> AttentionMetadataBuilder:
+    def get_metadata_builder(self, ubatch_id: int = 0) -> AttentionMetadataBuilder:
         assert len(self.metadata_builders) > ubatch_id
         return self.metadata_builders[ubatch_id]
 
@@ -166,30 +177,33 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
     """
     Perform sanity checks for the result of
-    [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][].
+    [`vllm.model_executor.models.SupportsMultiModal.embed_multimodal`][].
     """
     assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
         "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
         f"or a single 3D tensor, but got {type(mm_embeddings)} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `get_multimodal_embeddings` method.")
+        "of the model's `embed_multimodal` method."
+    )
 
     assert len(mm_embeddings) == expected_num_items, (
         "Expected number of multimodal embeddings to match number of "
         f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `get_multimodal_embeddings` method.")
+        "of the model's `embed_multimodal` method."
+    )
 
     assert all(e.ndim == 2 for e in mm_embeddings), (
         "Expected multimodal embeddings to be a sequence of 2D tensors, "
         f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `get_multimodal_embeddings` method.")
+        "of the model's `embed_multimodal` method."
+    )
 
 
 def scatter_mm_placeholders(
     embeds: torch.Tensor,
-    is_embed: Optional[torch.Tensor],
+    is_embed: torch.Tensor | None,
 ) -> torch.Tensor:
     """
     Scatter the multimodal embeddings into a contiguous tensor that represents
@@ -217,7 +231,7 @@ def scatter_mm_placeholders(
 
 def gather_mm_placeholders(
     placeholders: torch.Tensor,
-    is_embed: Optional[torch.Tensor],
+    is_embed: torch.Tensor | None,
 ) -> torch.Tensor:
     """
     Reconstructs the embeddings from the placeholder tokens.
@@ -234,7 +248,7 @@ def gather_mm_placeholders(
 def add_kv_sharing_layers_to_kv_cache_groups(
     shared_kv_cache_layers: dict[str, str],
     kv_cache_groups: list[KVCacheGroupSpec],
-    runner_only_attn_layers: Optional[set[str]] = None,
+    runner_only_attn_layers: set[str] | None = None,
 ) -> None:
     """
     Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches`
@@ -266,7 +280,7 @@ def bind_kv_cache(
     kv_caches: dict[str, torch.Tensor],
     forward_context: dict[str, "Attention"],
     runner_kv_caches: list[torch.Tensor],
-    num_attn_module: Optional[int] = 1,
+    num_attn_module: int | None = 1,
 ) -> None:
     """
     Bind the allocated KV cache to both ModelRunner and forward context so
@@ -290,8 +304,7 @@ def bind_kv_cache(
     # Convert kv_caches dict to a list of tensors in the order of layer_index.
     index2name = defaultdict(list)
     for layer_name in kv_caches:
-        index2name[extract_layer_index(layer_name,
-                                       num_attn_module)].append(layer_name)
+        index2name[extract_layer_index(layer_name, num_attn_module)].append(layer_name)
 
     for layer_index in sorted(index2name.keys()):
         layer_names = index2name[layer_index]
@@ -319,16 +332,20 @@ def bind_kv_cache(
         forward_context[layer_name].kv_cache = [kv_cache]
 
 
-def is_residual_scattered_for_sp(vllm_config: VllmConfig,
-                                 num_input_tokens: int) -> bool:
+def is_residual_scattered_for_sp(
+    vllm_config: VllmConfig, num_input_tokens: int
+) -> bool:
     """Check if the residual tensor is scattered for sequence parallelism.
 
     The residual tensor is scattered across tensor parallel ranks when sequence
-    parallelism and tensor parallelism is enabled, and the number of
-    input tokens is one of the compilation sizes.
+    parallelism and tensor parallelism is enabled.
+
+    This follows the same logic as SequenceParallelismPass.is_applicable():
+    - In full-graph compilation mode (no splitting ops or using inductor graph
+      partition), SP is always applied
+    - Otherwise, SP is only applied for specific shapes in compile_sizes
     """
-    if not vllm_config.compilation_config.pass_config.\
-        enable_sequence_parallelism:
+    if not vllm_config.compilation_config.pass_config.enable_sequence_parallelism:
         return False
 
     tp = vllm_config.parallel_config.tensor_parallel_size
@@ -340,5 +357,10 @@ def is_residual_scattered_for_sp(vllm_config: VllmConfig,
     # to be a multiple of tensor_parallel_size (tp) earlier.
     assert num_input_tokens % tp == 0
 
-    # Currently, SP is only enabled for static size fx graphs.
-    return (num_input_tokens in vllm_config.compilation_config.compile_sizes)
+    if (
+        not vllm_config.compilation_config.splitting_ops
+        or vllm_config.compilation_config.use_inductor_graph_partition
+    ):
+        return True
+
+    return num_input_tokens in vllm_config.compilation_config.compile_sizes
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index 038ce4b54f960d9c8495549a5c5dde4fae8856df..16f321c080779ab8bec8fc1d09b201a14b09eefc 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -1,23 +1,41 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
+import os
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import torch
 import torch.nn as nn
 
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.cache import worker_receiver_cache_from_config
+from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.system_utils import update_environment_variables
 from vllm.v1.kv_cache_interface import KVCacheSpec
-from vllm.worker.worker_base import WorkerBase as WorkerBaseV0
+from vllm.v1.serial_utils import run_method
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
+    from vllm.v1.outputs import AsyncModelRunnerOutput, ModelRunnerOutput
+else:
+    SchedulerOutput = object
+    GrammarOutput = object
+    AsyncModelRunnerOutput = object
+    ModelRunnerOutput = object
 
 logger = init_logger(__name__)
 
+_R = TypeVar("_R")
 
-class WorkerBase(WorkerBaseV0):
-    """
-    Abstract class for v1 worker, mainly define some methods for v1.
-    For methods shared by v0 and v1, define them in v0 WorkerBase
+
+class WorkerBase:
+    """Worker interface that allows vLLM to cleanly separate implementations for
+    different hardware. Also abstracts control plane communication, e.g., to
+    communicate request metadata to other workers.
     """
 
     def __init__(
@@ -27,10 +45,10 @@ class WorkerBase(WorkerBaseV0):
         rank: int,
         distributed_init_method: str,
         is_driver_worker: bool = False,
-    ):
+    ) -> None:
         """
         Initialize common worker components.
-        
+
         Args:
             vllm_config: Complete vLLM configuration
             local_rank: Local device index
@@ -39,8 +57,22 @@ class WorkerBase(WorkerBaseV0):
             is_driver_worker: Whether this worker handles driver
                 responsibilities
         """
-        # Configuration storage
-        super().__init__(vllm_config=vllm_config)
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.observability_config = vllm_config.observability_config
+        self.kv_transfer_config = vllm_config.kv_transfer_config
+        self.compilation_config = vllm_config.compilation_config
+
+        from vllm.platforms import current_platform
+
+        self.current_platform = current_platform
 
         self.parallel_config.rank = rank
         self.local_rank = local_rank
@@ -49,8 +81,8 @@ class WorkerBase(WorkerBaseV0):
         self.is_driver_worker = is_driver_worker
 
         # Device and model state
-        self.device: Optional[torch.device] = None
-        self.model_runner: Optional[nn.Module] = None
+        self.device: torch.device | None = None
+        self.model_runner: nn.Module | None = None
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         """Get specifications for KV cache implementation."""
@@ -63,3 +95,281 @@ class WorkerBase(WorkerBaseV0):
     def check_health(self) -> None:
         """Basic health check (override for device-specific checks)."""
         return
+
+    def init_device(self) -> None:
+        """Initialize device state, such as loading the model or other on-device
+        memory allocations.
+        """
+        raise NotImplementedError
+
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+        """Initialize the KV cache with the given size in blocks."""
+        raise NotImplementedError
+
+    def reset_mm_cache(self) -> None:
+        reset_fn = getattr(self.model_runner, "reset_mm_cache", None)
+        if callable(reset_fn):
+            reset_fn()
+
+    def get_model(self) -> nn.Module:
+        raise NotImplementedError
+
+    def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R:
+        """Apply a function on the model inside this worker."""
+        return fn(self.get_model())
+
+    def load_model(self) -> None:
+        """Load model onto target device."""
+        raise NotImplementedError
+
+    def execute_model(
+        self, scheduler_output: SchedulerOutput
+    ) -> ModelRunnerOutput | None:
+        """If this method returns None, sample_tokens should be called immediately after
+        to obtain the ModelRunnerOutput.
+
+        Note that this design may be changed in future if/when structured outputs
+        parallelism is re-architected.
+        """
+        raise NotImplementedError
+
+    def sample_tokens(
+        self, grammar_output: GrammarOutput
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput:
+        """Should be called immediately after execute_model iff it returned None."""
+        raise NotImplementedError
+
+    def get_cache_block_size_bytes(self) -> int:
+        """Return the size of a single cache block, in bytes. Used in
+        speculative decoding.
+        """
+        raise NotImplementedError
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError
+
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError
+
+    def list_loras(self) -> set[int]:
+        raise NotImplementedError
+
+    @property
+    def vocab_size(self) -> int:
+        """Get vocabulary size from model configuration."""
+        return self.model_config.get_vocab_size()
+
+    def shutdown(self) -> None:
+        """Clean up resources held by the worker."""
+        return
+
+
+class WorkerWrapperBase:
+    """
+    This class represents one process in an executor/engine. It is responsible
+    for lazily initializing the worker and handling the worker's lifecycle.
+    We first instantiate the WorkerWrapper, which remembers the worker module
+    and class name. Then, when we call `update_environment_variables`, and the
+    real initialization happens in `init_worker`.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        rpc_rank: int = 0,
+        global_rank: int | None = None,
+    ) -> None:
+        """
+        Initialize the worker wrapper with the given vllm_config and rpc_rank.
+        Note: rpc_rank is the rank of the worker in the executor. In most cases,
+        it is also the rank of the worker in the distributed group. However,
+        when multiple executors work together, they can be different.
+        e.g. in the case of SPMD-style offline inference with TP=2,
+        users can launch 2 engines/executors, each with only 1 worker.
+        All workers have rpc_rank=0, but they have different ranks in the TP
+        group.
+        """
+        self.rpc_rank = rpc_rank
+        self.global_rank = self.rpc_rank if global_rank is None else global_rank
+        self.worker: WorkerBase | None = None
+
+        # do not store this `vllm_config`, `init_worker` will set the final
+        # one.
+        # TODO: investigate if we can remove this field in `WorkerWrapperBase`,
+        # `init_cached_hf_modules` should be unnecessary now.
+        self.vllm_config: VllmConfig | None = None
+
+        # `model_config` can be None in tests
+        model_config = vllm_config.model_config
+        if model_config and model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils.import_utils import init_cached_hf_modules
+
+            init_cached_hf_modules()
+
+    def shutdown(self) -> None:
+        if self.worker is not None:
+            self.worker.shutdown()
+
+    def adjust_rank(self, rank_mapping: dict[int, int]) -> None:
+        """
+        Adjust the rpc_rank based on the given mapping.
+        It is only used during the initialization of the executor,
+        to adjust the rpc_rank of workers after we create all workers.
+        """
+        if self.rpc_rank in rank_mapping:
+            self.rpc_rank = rank_mapping[self.rpc_rank]
+
+    def update_environment_variables(
+        self,
+        envs_list: list[dict[str, str]],
+    ) -> None:
+        envs = envs_list[self.rpc_rank]
+        key = "CUDA_VISIBLE_DEVICES"
+        if key in envs and key in os.environ:
+            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
+            # suppress the warning in `update_environment_variables`
+            del os.environ[key]
+        update_environment_variables(envs)
+
+    def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None:
+        """
+        Here we inject some common logic before initializing the worker.
+        Arguments are passed to the worker class constructor.
+        """
+        kwargs = all_kwargs[self.rpc_rank]
+        self.vllm_config = kwargs.get("vllm_config")
+        assert self.vllm_config is not None, (
+            "vllm_config is required to initialize the worker"
+        )
+        self.vllm_config.enable_trace_function_call_for_thread()
+
+        from vllm.plugins import load_general_plugins
+
+        load_general_plugins()
+
+        if isinstance(self.vllm_config.parallel_config.worker_cls, str):
+            worker_class = resolve_obj_by_qualname(
+                self.vllm_config.parallel_config.worker_cls
+            )
+        else:
+            raise ValueError(
+                "passing worker_cls is no longer supported. Please pass keep the class in a separate module and pass the qualified name of the class as a string."  # noqa: E501
+            )
+        if self.vllm_config.parallel_config.worker_extension_cls:
+            worker_extension_cls = resolve_obj_by_qualname(
+                self.vllm_config.parallel_config.worker_extension_cls
+            )
+            extended_calls = []
+            if worker_extension_cls not in worker_class.__bases__:
+                # check any conflicts between worker and worker_extension_cls
+                for attr in dir(worker_extension_cls):
+                    if attr.startswith("__"):
+                        continue
+                    assert not hasattr(worker_class, attr), (
+                        f"Worker class {worker_class} already has an attribute"
+                        f" {attr}, which conflicts with the worker"
+                        f" extension class {worker_extension_cls}."
+                    )
+                    if callable(getattr(worker_extension_cls, attr)):
+                        extended_calls.append(attr)
+                # dynamically inherit the worker extension class
+                worker_class.__bases__ = worker_class.__bases__ + (
+                    worker_extension_cls,
+                )
+                logger.info(
+                    "Injected %s into %s for extended collective_rpc calls %s",
+                    worker_extension_cls,
+                    worker_class,
+                    extended_calls,
+                )
+
+        shared_worker_lock = kwargs.pop("shared_worker_lock", None)
+        if shared_worker_lock is None:
+            msg = (
+                "Missing `shared_worker_lock` argument from executor. "
+                "This argument is needed for mm_processor_cache_type='shm'."
+            )
+
+            mm_config = self.vllm_config.model_config.multimodal_config
+            if mm_config and mm_config.mm_processor_cache_type == "shm":
+                raise ValueError(msg)
+            else:
+                logger.warning_once(msg)
+
+            self.mm_receiver_cache = None
+        else:
+            self.mm_receiver_cache = worker_receiver_cache_from_config(
+                self.vllm_config,
+                MULTIMODAL_REGISTRY,
+                shared_worker_lock,
+            )
+
+        with set_current_vllm_config(self.vllm_config):
+            # To make vLLM config available during worker initialization
+            self.worker = worker_class(**kwargs)
+            assert self.worker is not None
+
+    def initialize_from_config(self, kv_cache_configs: list[Any]) -> None:
+        kv_cache_config = kv_cache_configs[self.global_rank]
+        with set_current_vllm_config(self.vllm_config):
+            self.worker.initialize_from_config(kv_cache_config)  # type: ignore
+
+    def init_device(self):
+        with set_current_vllm_config(self.vllm_config):
+            # To make vLLM config available during device initialization
+            self.worker.init_device()  # type: ignore
+
+    def execute_method(self, method: str | bytes, *args, **kwargs):
+        try:
+            # method resolution order:
+            # if a method is defined in this class, it will be called directly.
+            # otherwise, since we define `__getattr__` and redirect attribute
+            # query to `self.worker`, the method will be called on the worker.
+            return run_method(self, method, args, kwargs)
+        except Exception as e:
+            # if the driver worker also execute methods,
+            # exceptions in the rest worker may cause deadlock in rpc like ray
+            # see https://github.com/vllm-project/vllm/issues/3455
+            # print the error and inform the user to solve the error
+            msg = (
+                f"Error executing method {method!r}. "
+                "This might cause deadlock in distributed execution."
+            )
+            logger.exception(msg)
+            raise e
+
+    def __getattr__(self, attr: str):
+        return getattr(self.worker, attr)
+
+    def _apply_mm_cache(self, scheduler_output: SchedulerOutput) -> None:
+        mm_cache = self.mm_receiver_cache
+        if mm_cache is None:
+            return
+
+        for req_data in scheduler_output.scheduled_new_reqs:
+            req_data.mm_features = mm_cache.get_and_update_features(
+                req_data.mm_features
+            )
+
+    def execute_model(
+        self,
+        scheduler_output: SchedulerOutput,
+        *args,
+        **kwargs,
+    ) -> ModelRunnerOutput | None:
+        self._apply_mm_cache(scheduler_output)
+
+        assert self.worker is not None
+        return self.worker.execute_model(scheduler_output, *args, **kwargs)
+
+    def reset_mm_cache(self) -> None:
+        mm_receiver_cache = self.mm_receiver_cache
+        if mm_receiver_cache is not None:
+            mm_receiver_cache.clear_cache()
+
+        assert self.worker is not None
+        self.worker.reset_mm_cache()
diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py
index 7becdd392498f9be8aaf3c251dc41a03ffc5bcbb..4f82c18da73aac825157928544f9e9c16f2123a0 100644
--- a/vllm/v1/worker/xpu_model_runner.py
+++ b/vllm/v1/worker/xpu_model_runner.py
@@ -37,9 +37,7 @@ class XPUModelRunner(GPUModelRunner):
 
 @contextmanager
 def _torch_cuda_wrapper():
-
     class _EventPlaceholder:
-
         def __init__(self, *args, **kwargs) -> None:
             self.record = lambda: None
             self.synchronize = lambda: None
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index 7355206f30f57419a9423bed77852149a6ac527c..26c6f8d06bdcdc6cf4a45304db4cf974c3a40909 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -11,8 +11,7 @@ from vllm.distributed import get_world_group
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
-from vllm.v1.worker.gpu_worker import (Worker,
-                                       init_worker_distributed_environment)
+from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 from vllm.v1.worker.xpu_model_runner import XPUModelRunner
 
 logger = init_logger(__name__)
@@ -29,8 +28,9 @@ class XPUWorker(Worker):
         distributed_init_method: str,
         is_driver_worker: bool = False,
     ):
-        super().__init__(vllm_config, local_rank, rank,
-                         distributed_init_method, is_driver_worker)
+        super().__init__(
+            vllm_config, local_rank, rank, distributed_init_method, is_driver_worker
+        )
         device_config = self.device_config
         assert device_config.device_type == "xpu"
         assert current_platform.is_xpu()
@@ -39,8 +39,11 @@ class XPUWorker(Worker):
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
         if envs.VLLM_TORCH_PROFILER_DIR:
             torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
-            logger.info("Profiling enabled. Traces will be saved to: %s",
-                        torch_profiler_trace_dir)
+            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+            )
             logger.debug(
                 "Profiler config: record_shapes=%s,"
                 "profile_memory=%s,with_stack=%s,with_flops=%s",
@@ -59,7 +62,9 @@ class XPUWorker(Worker):
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir, use_gzip=True))
+                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+                ),
+            )
         else:
             self.profiler = None
 
@@ -75,8 +80,7 @@ class XPUWorker(Worker):
             # and we don't have any API to get it. so we mark it as 128MB.
             used_memory = torch.xpu.memory_allocated()
             non_torch_allocations = 128 * 1024 * 1024
-            free_gpu_memory = total_gpu_memory - (used_memory +
-                                                  non_torch_allocations)
+            free_gpu_memory = total_gpu_memory - (used_memory + non_torch_allocations)
             return free_gpu_memory, total_gpu_memory
 
     @torch.inference_mode()
@@ -97,10 +101,12 @@ class XPUWorker(Worker):
 
         free_gpu_memory, total_gpu_memory = torch.xpu.mem_get_info()
         current_allocated_bytes = torch.xpu.memory_allocated()
-        msg = ("Before memory profiling run, "
-               f"total GPU memory: {total_gpu_memory / 1024**2:.2f} MB, "
-               f"model load takes {current_allocated_bytes / 1024**2:.2f} MB, "
-               f"free gpu memory is {free_gpu_memory / 1024**2:.2f} MB.")
+        msg = (
+            "Before memory profiling run, "
+            f"total GPU memory: {total_gpu_memory / 1024**2:.2f} MB, "
+            f"model load takes {current_allocated_bytes / 1024**2:.2f} MB, "
+            f"free gpu memory is {free_gpu_memory / 1024**2:.2f} MB."
+        )
         logger.info(msg)
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
@@ -113,67 +119,71 @@ class XPUWorker(Worker):
             "Error in memory profiling. "
             f"Initial free memory {self.init_gpu_memory}, current free memory"
             f" {free_gpu_memory}. This happens when the GPU memory was "
-            "not properly cleaned up before initializing the vLLM instance.")
+            "not properly cleaned up before initializing the vLLM instance."
+        )
 
         # Get the peak memory allocation recorded by torch
         peak_memory = torch.xpu.memory_stats()["allocated_bytes.all.peak"]
 
         torch.xpu.empty_cache()
-        torch_allocated_bytes = torch.xpu.memory_stats(
-        )["allocated_bytes.all.current"]
-        total_allocated_bytes = self.xpu_get_mem_info(
-        )[1] - self.xpu_get_mem_info()[0]
+        torch_allocated_bytes = torch.xpu.memory_stats()["allocated_bytes.all.current"]
+        total_allocated_bytes = self.xpu_get_mem_info()[1] - self.xpu_get_mem_info()[0]
 
         non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
         if non_torch_allocations > 0:
             peak_memory += non_torch_allocations
         available_kv_cache_memory = (
-            total_gpu_memory * self.cache_config.gpu_memory_utilization -
-            peak_memory)
-
-        msg = ("After memory profiling run, "
-               f"peak memory usage is {peak_memory / 1024**2:.2f} MB,"
-               f"torch mem is {torch_allocated_bytes / 1024**2:.2f} MB, "
-               f"non-torch mem is {non_torch_allocations / 1024**2:.2f} MB, "
-               f"free gpu memory is {free_gpu_memory / 1024**2:.2f} MB.")
+            total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory
+        )
+
+        msg = (
+            "After memory profiling run, "
+            f"peak memory usage is {peak_memory / 1024**2:.2f} MB,"
+            f"torch mem is {torch_allocated_bytes / 1024**2:.2f} MB, "
+            f"non-torch mem is {non_torch_allocations / 1024**2:.2f} MB, "
+            f"free gpu memory is {free_gpu_memory / 1024**2:.2f} MB."
+        )
         logger.info(msg)
 
         return int(available_kv_cache_memory)
 
     def init_device(self):
-        if self.device_config.device.type == "xpu" and current_platform.is_xpu(
-        ):
+        if self.device_config.device.type == "xpu" and current_platform.is_xpu():
             self.device = torch.device(f"xpu:{self.local_rank}")
             current_platform.set_device(self.device)
             current_platform.check_if_supports_dtype(self.model_config.dtype)
             torch.xpu.empty_cache()
             self.init_gpu_memory = torch.xpu.get_device_properties(
-                self.local_rank).total_memory
+                self.local_rank
+            ).total_memory
         else:
-            raise RuntimeError(
-                f"Not support device type: {self.device_config.device}")
+            raise RuntimeError(f"Not support device type: {self.device_config.device}")
 
-        ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "pidfd")
         ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi")
-        ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE",
-                                         str(self.parallel_config.world_size))
-        os.environ["CCL_ZE_IPC_EXCHANGE"] = ENV_CCL_ZE_IPC_EXCHANGE
+        ENV_LOCAL_WORLD_SIZE = os.getenv(
+            "LOCAL_WORLD_SIZE", str(self.parallel_config.world_size)
+        )
         os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT
         os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE
         os.environ["LOCAL_RANK"] = str(self.local_rank)
 
-        init_worker_distributed_environment(self.vllm_config, self.rank,
-                                            self.distributed_init_method,
-                                            self.local_rank,
-                                            current_platform.dist_backend)
+        init_worker_distributed_environment(
+            self.vllm_config,
+            self.rank,
+            self.distributed_init_method,
+            self.local_rank,
+            current_platform.dist_backend,
+        )
 
         # global all_reduce needed for overall oneccl warm up
-        torch.distributed.all_reduce(torch.zeros(1).xpu(),
-                                     group=get_world_group().device_group)
+        torch.distributed.all_reduce(
+            torch.zeros(1).xpu(), group=get_world_group().device_group
+        )
 
         # Set random seed.
         set_random_seed(self.model_config.seed)
 
         # Construct the model runner
         self.model_runner = XPUModelRunner(  # type: ignore
-            self.vllm_config, self.device)
+            self.vllm_config, self.device
+        )
diff --git a/vllm/version.py b/vllm/version.py
index 6c88b1b5a3bf4180bca6a93ce549e6ee9906ea81..63095f8bce1ea4fe4800deaac7b2c7f05b38ff11 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -6,9 +6,7 @@ try:
 except Exception as e:
     import warnings
 
-    warnings.warn(f"Failed to read commit hash:\n{e}",
-                  RuntimeWarning,
-                  stacklevel=2)
+    warnings.warn(f"Failed to read commit hash:\n{e}", RuntimeWarning, stacklevel=2)
 
     __version__ = "dev"
     __version_tuple__ = (0, 0, __version__)
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
deleted file mode 100644
index 20fabef4f19b986e21214d815c79e029d243ce1e..0000000000000000000000000000000000000000
--- a/vllm/worker/worker_base.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar,
-                    Union)
-
-import cloudpickle
-import torch.nn as nn
-
-from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (enable_trace_function_call_for_thread,
-                        resolve_obj_by_qualname, run_method,
-                        update_environment_variables,
-                        warn_for_unimplemented_methods)
-from vllm.v1.outputs import SamplerOutput
-
-logger = init_logger(__name__)
-
-_R = TypeVar("_R")
-
-
-@warn_for_unimplemented_methods
-class WorkerBase:
-    """Worker interface that allows vLLM to cleanly separate implementations for
-    different hardware. Also abstracts control plane communication, e.g., to
-    communicate request metadata to other workers.
-    """
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ) -> None:
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config
-        self.observability_config = vllm_config.observability_config
-        self.kv_transfer_config = vllm_config.kv_transfer_config
-        self.compilation_config = vllm_config.compilation_config
-        from vllm.platforms import current_platform
-        self.current_platform = current_platform
-
-    def init_device(self) -> None:
-        """Initialize device state, such as loading the model or other on-device
-        memory allocations.
-        """
-        raise NotImplementedError
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache with the given size in blocks.
-        """
-        raise NotImplementedError
-
-    def get_model(self) -> nn.Module:
-        raise NotImplementedError
-
-    def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R:
-        """Apply a function on the model inside this worker."""
-        return fn(self.get_model())
-
-    def load_model(self) -> None:
-        """Load model onto target device."""
-        raise NotImplementedError
-
-    def execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> Optional[List[SamplerOutput]]:
-        raise NotImplementedError
-
-    def start_worker_execution_loop(self) -> None:
-        """Execute model loop in parallel worker.
-
-        You can stop the loop by executing a driver worker with an empty output.
-        See `stop_remote_worker_execution_loop` for more details.
-        """
-        with self.current_platform.inference_mode():
-            while True:
-                output = self.execute_model(execute_model_req=None)
-                if output is None:
-                    return None
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available blocks for the GPU KV cache and
-        swappable CPU KV cache.
-
-        The implementation may run profiling or other heuristics to determine
-        the size of caches.
-
-        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
-        are blocks that are "active" on the device and can be appended to.
-        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
-        appended to.
-        """
-        raise NotImplementedError
-
-    def get_cache_block_size_bytes(self) -> int:
-        """Return the size of a single cache block, in bytes. Used in
-        speculative decoding.
-        """
-        raise NotImplementedError
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError
-
-    def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError
-
-    def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError
-
-    def list_loras(self) -> Set[int]:
-        raise NotImplementedError
-
-    @property
-    def vocab_size(self) -> int:
-        """Get vocabulary size from model configuration."""
-        return self.model_config.get_vocab_size()
-
-    def shutdown(self) -> None:
-        """Clean up resources held by the worker."""
-        return
-
-
-class WorkerWrapperBase:
-    """
-    This class represents one process in an executor/engine. It is responsible
-    for lazily initializing the worker and handling the worker's lifecycle.
-    We first instantiate the WorkerWrapper, which remembers the worker module
-    and class name. Then, when we call `update_environment_variables`, and the
-    real initialization happens in `init_worker`.
-    """
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        rpc_rank: int = 0,
-    ) -> None:
-        """
-        Initialize the worker wrapper with the given vllm_config and rpc_rank.
-        Note: rpc_rank is the rank of the worker in the executor. In most cases,
-        it is also the rank of the worker in the distributed group. However,
-        when multiple executors work together, they can be different.
-        e.g. in the case of SPMD-style offline inference with TP=2,
-        users can launch 2 engines/executors, each with only 1 worker.
-        All workers have rpc_rank=0, but they have different ranks in the TP
-        group.
-        """
-        self.rpc_rank = rpc_rank
-        self.worker: Optional[WorkerBase] = None
-        self.vllm_config: Optional[VllmConfig] = None
-        # do not store this `vllm_config`, `init_worker` will set the final
-        # one. TODO: investigate if we can remove this field in
-        # `WorkerWrapperBase`, `init_cached_hf_modules` should be
-        # unnecessary now.
-        if vllm_config.model_config is not None:
-            # it can be None in tests
-            trust_remote_code = vllm_config.model_config.trust_remote_code
-            if trust_remote_code:
-                # note: lazy import to avoid importing torch before initializing
-                from vllm.utils import init_cached_hf_modules
-                init_cached_hf_modules()
-
-    def shutdown(self) -> None:
-        if self.worker is not None:
-            self.worker.shutdown()
-
-    def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
-        """
-        Adjust the rpc_rank based on the given mapping.
-        It is only used during the initialization of the executor,
-        to adjust the rpc_rank of workers after we create all workers.
-        """
-        if self.rpc_rank in rank_mapping:
-            self.rpc_rank = rank_mapping[self.rpc_rank]
-
-    def update_environment_variables(self, envs_list: List[Dict[str,
-                                                                str]]) -> None:
-        envs = envs_list[self.rpc_rank]
-        key = 'CUDA_VISIBLE_DEVICES'
-        if key in envs and key in os.environ:
-            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
-            # suppress the warning in `update_environment_variables`
-            del os.environ[key]
-        update_environment_variables(envs)
-
-    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
-        """
-        Here we inject some common logic before initializing the worker.
-        Arguments are passed to the worker class constructor.
-        """
-        kwargs = all_kwargs[self.rpc_rank]
-        self.vllm_config = kwargs.get("vllm_config")
-        assert self.vllm_config is not None, (
-            "vllm_config is required to initialize the worker")
-        enable_trace_function_call_for_thread(self.vllm_config)
-
-        from vllm.plugins import load_general_plugins
-        load_general_plugins()
-
-        if isinstance(self.vllm_config.parallel_config.worker_cls, str):
-            worker_class = resolve_obj_by_qualname(
-                self.vllm_config.parallel_config.worker_cls)
-        else:
-            logger.warning(
-                "passing worker_cls as a class object is strongly deprecated,"
-                " as the serialization of class objects can be tricky and"
-                " error-prone. To be safe, please keep the class in a separate"
-                " module and pass the qualified name of the class as a string."
-            )
-            assert isinstance(self.vllm_config.parallel_config.worker_cls,
-                              bytes)
-            worker_class = cloudpickle.loads(
-                self.vllm_config.parallel_config.worker_cls)
-        if self.vllm_config.parallel_config.worker_extension_cls:
-            worker_extension_cls = resolve_obj_by_qualname(
-                self.vllm_config.parallel_config.worker_extension_cls)
-            extended_calls = []
-            if worker_extension_cls not in worker_class.__bases__:
-                # check any conflicts between worker and worker_extension_cls
-                for attr in dir(worker_extension_cls):
-                    if attr.startswith("__"):
-                        continue
-                    assert not hasattr(worker_class, attr), (
-                        f"Worker class {worker_class} already has an attribute"
-                        f" {attr}, which conflicts with the worker"
-                        f" extension class {worker_extension_cls}.")
-                    if callable(getattr(worker_extension_cls, attr)):
-                        extended_calls.append(attr)
-                # dynamically inherit the worker extension class
-                worker_class.__bases__ = worker_class.__bases__ + (
-                    worker_extension_cls, )
-                logger.info(
-                    "Injected %s into %s for extended collective_rpc calls %s",
-                    worker_extension_cls, worker_class, extended_calls)
-        with set_current_vllm_config(self.vllm_config):
-            # To make vLLM config available during worker initialization
-            self.worker = worker_class(**kwargs)
-            assert self.worker is not None
-
-    def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
-        kv_cache_config = kv_cache_configs[self.rpc_rank]
-        with set_current_vllm_config(self.vllm_config):
-            self.worker.initialize_from_config(kv_cache_config)  # type: ignore
-
-    def init_device(self):
-        with set_current_vllm_config(self.vllm_config):
-            # To make vLLM config available during device initialization
-            self.worker.init_device()  # type: ignore
-
-    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
-        try:
-            # method resolution order:
-            # if a method is defined in this class, it will be called directly.
-            # otherwise, since we define `__getattr__` and redirect attribute
-            # query to `self.worker`, the method will be called on the worker.
-            return run_method(self, method, args, kwargs)
-        except Exception as e:
-            # if the driver worker also execute methods,
-            # exceptions in the rest worker may cause deadlock in rpc like ray
-            # see https://github.com/vllm-project/vllm/issues/3455
-            # print the error and inform the user to solve the error
-            msg = (f"Error executing method {method!r}. "
-                   "This might cause deadlock in distributed execution.")
-            logger.exception(msg)
-            raise e
-
-    def __getattr__(self, attr):
-        return getattr(self.worker, attr)